diff --git a/sft/revise_Full_smoe_tcmoe/added_tokens.json b/sft/revise_Full_smoe_tcmoe/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/added_tokens.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..da3b0c65c0ef1d3a1c68ffdd7565996d4dd85a33 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/generation_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/latest b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/latest new file mode 100644 index 0000000000000000000000000000000000000000..41750eb6d66126b6023d0560fd5c7875c0706774 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/latest @@ -0,0 +1 @@ +global_step13312 \ No newline at end of file diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ce76324e7a50d7b0f29d45e5e09dccbbcece39d --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f30277f19ed754260b559eda45afcb365204da9ca97b7ae36f0b7879ba04a62 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d2e946962cae6a014fcac3339c81444e3eeb9cd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584ad467c59cf09afa5aa5d0183d5c5a98f35cc3810c78c982f2b83e74dcddc7 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e867bf72a1a38ab3dd8f62c96de1356bf4d51a74 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521acac92247f8756e0667b77ba18afdd57d10be41c4a9064b547b6220d442ef +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_0.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9231f69f5fd461899867106a669ce247e70c72c2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f23d807f0e704f4ca79670a6631cbff43189cf7f8ff4e1fc0a4330e636a798 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_1.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..19fe2dcc766f192ea5de79cec4dcff17172a10f7 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37f92f6aea5386e84d2d64a1a25d6ef96a10b3bbbfe63627981604c8934076 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_2.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfe492519c6b79b07a8d68b98c5f3d0c073667aa --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667ebf727735115f00a6bdbe090344e9846c726d11bb555cdc201c415f27ad85 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_3.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d42ad13e30851fdbd1d8801738a4106a9ce8b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d306f8c511cba8a225e3b723c5fa79d8a6ecc922f834da914ff0780c78b1fc +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer.model b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/trainer_state.json b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6292de1d68a65d273e0d1635271815c138547c78 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/trainer_state.json @@ -0,0 +1,226337 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003607395160078, + "eval_steps": 500, + "global_step": 13312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + }, + { + "auxiliary_loss_clip": 0.06559211, + "auxiliary_loss_mlp": 0.01288822, + "balance_loss_clip": 0.06314486, + "balance_loss_mlp": 0.01265338, + "epoch": 0.20015030813166992, + "flos": 28119186495360.0, + "grad_norm": 1.7739956363125764, + "language_loss": 0.6938678, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.77234817, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23474121, + "step": 3329, + "time_per_iteration": 2.790318489074707 + }, + { + "auxiliary_loss_clip": 0.06562928, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 0.06310034, + "balance_loss_mlp": 0.01263396, + "epoch": 0.2002104313843379, + "flos": 26074250801280.0, + "grad_norm": 1.6222638892170962, + "language_loss": 0.81793886, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.896456, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.25415039, + "step": 3330, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.06561245, + "auxiliary_loss_mlp": 0.01293061, + "balance_loss_clip": 0.06310615, + "balance_loss_mlp": 0.01268874, + "epoch": 0.20027055463700585, + "flos": 22973332826880.0, + "grad_norm": 3.6220429921180877, + "language_loss": 0.7808395, + "learning_rate": 3.703502390349417e-06, + "loss": 0.85938263, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.24194336, + "step": 3331, + "time_per_iteration": 4.07051157951355 + }, + { + "auxiliary_loss_clip": 0.06564473, + "auxiliary_loss_mlp": 0.01290798, + "balance_loss_clip": 0.06310149, + "balance_loss_mlp": 0.01266014, + "epoch": 0.20033067788967382, + "flos": 17171433216000.0, + "grad_norm": 1.7477664730796658, + "language_loss": 0.79863441, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24780273, + "step": 3332, + "time_per_iteration": 2.5321452617645264 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01253551, + "epoch": 0.2003908011423418, + "flos": 60842476085760.0, + "grad_norm": 0.9021189232739572, + "language_loss": 0.61913729, + "learning_rate": 3.703094147020776e-06, + "loss": 0.69584543, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08105469, + "step": 3333, + "time_per_iteration": 4.713933706283569 + }, + { + "auxiliary_loss_clip": 0.06552575, + "auxiliary_loss_mlp": 0.0128469, + "balance_loss_clip": 0.06299093, + "balance_loss_mlp": 0.0126123, + "epoch": 0.20045092439500978, + "flos": 24212987256960.0, + "grad_norm": 1.8847951547254278, + "language_loss": 0.82181144, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.90018404, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.23461914, + "step": 3334, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.0654801, + "auxiliary_loss_mlp": 0.01282898, + "balance_loss_clip": 0.06293298, + "balance_loss_mlp": 0.01256874, + "epoch": 0.20051104764767774, + "flos": 29395290251520.0, + "grad_norm": 2.256626356817437, + "language_loss": 0.7536357, + "learning_rate": 3.702685645366134e-06, + "loss": 0.83194482, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26049805, + "step": 3335, + "time_per_iteration": 2.5860390663146973 + }, + { + "auxiliary_loss_clip": 0.06552432, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06300009, + "balance_loss_mlp": 0.0125632, + "epoch": 0.2005711709003457, + "flos": 23520575842560.0, + "grad_norm": 6.047041669068293, + "language_loss": 0.80452931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.88285786, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.24108887, + "step": 3336, + "time_per_iteration": 2.662705898284912 + }, + { + "auxiliary_loss_clip": 0.06555694, + "auxiliary_loss_mlp": 0.01283807, + "balance_loss_clip": 0.06297083, + "balance_loss_mlp": 0.01258045, + "epoch": 0.20063129415301367, + "flos": 22529106806400.0, + "grad_norm": 1.88296777376126, + "language_loss": 0.78839928, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.86679429, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25756836, + "step": 3337, + "time_per_iteration": 2.541239023208618 + }, + { + "auxiliary_loss_clip": 0.06548997, + "auxiliary_loss_mlp": 0.01282446, + "balance_loss_clip": 0.06296889, + "balance_loss_mlp": 0.01258389, + "epoch": 0.20069141740568164, + "flos": 25965405947520.0, + "grad_norm": 2.093788516709133, + "language_loss": 0.69608915, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77440357, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.24072266, + "step": 3338, + "time_per_iteration": 4.011674165725708 + }, + { + "auxiliary_loss_clip": 0.06553162, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06298589, + "balance_loss_mlp": 0.01261703, + "epoch": 0.2007515406583496, + "flos": 24797560066560.0, + "grad_norm": 2.5614555335728375, + "language_loss": 0.70278549, + "learning_rate": 3.701867867326735e-06, + "loss": 0.78117526, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3339, + "time_per_iteration": 4.021097183227539 + }, + { + "auxiliary_loss_clip": 0.06558233, + "auxiliary_loss_mlp": 0.01288707, + "balance_loss_clip": 0.06300814, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2008116639110176, + "flos": 37934746606080.0, + "grad_norm": 2.4782874615073265, + "language_loss": 0.67773008, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.75619948, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.24291992, + "step": 3340, + "time_per_iteration": 2.741156816482544 + }, + { + "auxiliary_loss_clip": 0.06555235, + "auxiliary_loss_mlp": 0.01284766, + "balance_loss_clip": 0.06297287, + "balance_loss_mlp": 0.01258122, + "epoch": 0.20087178716368556, + "flos": 20746779408000.0, + "grad_norm": 2.067820693237163, + "language_loss": 0.74698186, + "learning_rate": 3.701458591066019e-06, + "loss": 0.82538182, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26623535, + "step": 3341, + "time_per_iteration": 2.564480781555176 + }, + { + "auxiliary_loss_clip": 0.06547385, + "auxiliary_loss_mlp": 0.01280207, + "balance_loss_clip": 0.06298249, + "balance_loss_mlp": 0.01256532, + "epoch": 0.20093191041635353, + "flos": 23849122901760.0, + "grad_norm": 1.820842392943319, + "language_loss": 0.7265389, + "learning_rate": 3.70125385615256e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.23657227, + "step": 3342, + "time_per_iteration": 2.5828449726104736 + }, + { + "auxiliary_loss_clip": 0.065575, + "auxiliary_loss_mlp": 0.01288338, + "balance_loss_clip": 0.06302083, + "balance_loss_mlp": 0.01264174, + "epoch": 0.2009920336690215, + "flos": 21797395027200.0, + "grad_norm": 1.987813203177408, + "language_loss": 0.73357129, + "learning_rate": 3.701049056727384e-06, + "loss": 0.81202972, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.24169922, + "step": 3343, + "time_per_iteration": 2.547868490219116 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.012954, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01269865, + "epoch": 0.20105215692168946, + "flos": 26366390461440.0, + "grad_norm": 2.115251797604865, + "language_loss": 0.81433517, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.89283836, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25524902, + "step": 3344, + "time_per_iteration": 2.6067302227020264 + }, + { + "auxiliary_loss_clip": 0.06556335, + "auxiliary_loss_mlp": 0.01281302, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01258426, + "epoch": 0.20111228017435742, + "flos": 18813288044160.0, + "grad_norm": 4.0042293338609385, + "language_loss": 0.84618676, + "learning_rate": 3.700639264372948e-06, + "loss": 0.92456311, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.2286377, + "step": 3345, + "time_per_iteration": 2.554713726043701 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01295407, + "balance_loss_clip": 0.0629687, + "balance_loss_mlp": 0.01272697, + "epoch": 0.20117240342702541, + "flos": 19981301633280.0, + "grad_norm": 2.1108086187654025, + "language_loss": 0.68437809, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.76276147, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.22705078, + "step": 3346, + "time_per_iteration": 2.5748066902160645 + }, + { + "auxiliary_loss_clip": 0.06553109, + "auxiliary_loss_mlp": 0.01283392, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01258739, + "epoch": 0.20123252667969338, + "flos": 23148368006400.0, + "grad_norm": 1.9426154174848713, + "language_loss": 0.73952061, + "learning_rate": 3.70022921406487e-06, + "loss": 0.81788564, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24682617, + "step": 3347, + "time_per_iteration": 2.5353236198425293 + }, + { + "auxiliary_loss_clip": 0.06546339, + "auxiliary_loss_mlp": 0.01287781, + "balance_loss_clip": 0.0629671, + "balance_loss_mlp": 0.01263487, + "epoch": 0.20129264993236134, + "flos": 23228352328320.0, + "grad_norm": 1.557023243146552, + "language_loss": 0.87284029, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95118147, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.24316406, + "step": 3348, + "time_per_iteration": 2.5943105220794678 + }, + { + "auxiliary_loss_clip": 0.06550047, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01269034, + "epoch": 0.2013527731850293, + "flos": 21877882473600.0, + "grad_norm": 1.6966939322149492, + "language_loss": 0.71502012, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7934612, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25061035, + "step": 3349, + "time_per_iteration": 2.5671966075897217 + }, + { + "auxiliary_loss_clip": 0.06552055, + "auxiliary_loss_mlp": 0.01290022, + "balance_loss_clip": 0.06301533, + "balance_loss_mlp": 0.01263486, + "epoch": 0.20141289643769728, + "flos": 18046636312320.0, + "grad_norm": 1.7460886195435679, + "language_loss": 0.72473693, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80315775, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26501465, + "step": 3350, + "time_per_iteration": 2.558486223220825 + }, + { + "auxiliary_loss_clip": 0.06561922, + "auxiliary_loss_mlp": 0.01282894, + "balance_loss_clip": 0.0630732, + "balance_loss_mlp": 0.01256728, + "epoch": 0.20147301969036524, + "flos": 23958219317760.0, + "grad_norm": 2.4285458765514623, + "language_loss": 0.76773715, + "learning_rate": 3.69940833983661e-06, + "loss": 0.84618533, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26135254, + "step": 3351, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.0657143, + "auxiliary_loss_mlp": 0.01289916, + "balance_loss_clip": 0.06311074, + "balance_loss_mlp": 0.01260638, + "epoch": 0.2015331429430332, + "flos": 25594749411840.0, + "grad_norm": 1.6280311670130643, + "language_loss": 0.81367022, + "learning_rate": 3.699202960155748e-06, + "loss": 0.89228368, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.29248047, + "step": 3352, + "time_per_iteration": 2.603740692138672 + }, + { + "auxiliary_loss_clip": 0.06557955, + "auxiliary_loss_mlp": 0.01286544, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01258458, + "epoch": 0.2015932661957012, + "flos": 26732351168640.0, + "grad_norm": 2.001275007108419, + "language_loss": 0.81670761, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.89515263, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28063965, + "step": 3353, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.06555627, + "auxiliary_loss_mlp": 0.01278407, + "balance_loss_clip": 0.0630668, + "balance_loss_mlp": 0.01253206, + "epoch": 0.20165338944836916, + "flos": 15638632876800.0, + "grad_norm": 1.8574199324884482, + "language_loss": 0.9049592, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.98329961, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.2520752, + "step": 3354, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.06439115, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.06305242, + "balance_loss_mlp": 0.01268129, + "epoch": 0.20171351270103713, + "flos": 57929926089600.0, + "grad_norm": 0.8202677442032412, + "language_loss": 0.55840385, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.63554633, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07012939, + "step": 3355, + "time_per_iteration": 3.118603229522705 + }, + { + "auxiliary_loss_clip": 0.06557105, + "auxiliary_loss_mlp": 0.01281149, + "balance_loss_clip": 0.06309459, + "balance_loss_mlp": 0.01257474, + "epoch": 0.2017736359537051, + "flos": 20820768163200.0, + "grad_norm": 1.5861142309185163, + "language_loss": 0.84845644, + "learning_rate": 3.698380797170751e-06, + "loss": 0.92683893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.23669434, + "step": 3356, + "time_per_iteration": 2.5407068729400635 + }, + { + "auxiliary_loss_clip": 0.06578876, + "auxiliary_loss_mlp": 0.01283859, + "balance_loss_clip": 0.06314196, + "balance_loss_mlp": 0.01255344, + "epoch": 0.20183375920637306, + "flos": 17097696023040.0, + "grad_norm": 3.7689574240726147, + "language_loss": 0.71072245, + "learning_rate": 3.698175095398085e-06, + "loss": 0.78934979, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.28515625, + "step": 3357, + "time_per_iteration": 2.4921233654022217 + }, + { + "auxiliary_loss_clip": 0.065685, + "auxiliary_loss_mlp": 0.01288812, + "balance_loss_clip": 0.0631017, + "balance_loss_mlp": 0.01263206, + "epoch": 0.20189388245904102, + "flos": 18667323031680.0, + "grad_norm": 2.064581487792546, + "language_loss": 0.72707927, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.80565238, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25585938, + "step": 3358, + "time_per_iteration": 2.531280040740967 + }, + { + "auxiliary_loss_clip": 0.06550319, + "auxiliary_loss_mlp": 0.0128707, + "balance_loss_clip": 0.06304348, + "balance_loss_mlp": 0.01263633, + "epoch": 0.20195400571170902, + "flos": 16802705324160.0, + "grad_norm": 1.761827203655194, + "language_loss": 0.83542818, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91380209, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.234375, + "step": 3359, + "time_per_iteration": 2.5004122257232666 + }, + { + "auxiliary_loss_clip": 0.06415485, + "auxiliary_loss_mlp": 0.01275385, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01269109, + "epoch": 0.20201412896437698, + "flos": 67192792669440.0, + "grad_norm": 0.7763137973079639, + "language_loss": 0.58718604, + "learning_rate": 3.697557603741482e-06, + "loss": 0.66409475, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.06274414, + "step": 3360, + "time_per_iteration": 3.202280282974243 + }, + { + "auxiliary_loss_clip": 0.06567518, + "auxiliary_loss_mlp": 0.01281863, + "balance_loss_clip": 0.06312253, + "balance_loss_mlp": 0.01257055, + "epoch": 0.20207425221704495, + "flos": 21331477998720.0, + "grad_norm": 2.7701451368403767, + "language_loss": 0.63371557, + "learning_rate": 3.697351644435763e-06, + "loss": 0.71220934, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24841309, + "step": 3361, + "time_per_iteration": 2.591505527496338 + }, + { + "auxiliary_loss_clip": 0.06556661, + "auxiliary_loss_mlp": 0.01280295, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01257049, + "epoch": 0.2021343754697129, + "flos": 22533509145600.0, + "grad_norm": 1.837331842396403, + "language_loss": 0.76495373, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23254395, + "step": 3362, + "time_per_iteration": 2.5748798847198486 + }, + { + "auxiliary_loss_clip": 0.06552652, + "auxiliary_loss_mlp": 0.01281781, + "balance_loss_clip": 0.06300291, + "balance_loss_mlp": 0.01257379, + "epoch": 0.20219449872238088, + "flos": 19068852597120.0, + "grad_norm": 1.6506097934595576, + "language_loss": 0.77716577, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85551012, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.24365234, + "step": 3363, + "time_per_iteration": 2.5682361125946045 + }, + { + "auxiliary_loss_clip": 0.06556462, + "auxiliary_loss_mlp": 0.01285372, + "balance_loss_clip": 0.06303493, + "balance_loss_mlp": 0.01262198, + "epoch": 0.20225462197504884, + "flos": 24723864800640.0, + "grad_norm": 1.5662342973814338, + "language_loss": 0.75767177, + "learning_rate": 3.696733380367391e-06, + "loss": 0.83609009, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23181152, + "step": 3364, + "time_per_iteration": 2.620352029800415 + }, + { + "auxiliary_loss_clip": 0.06564072, + "auxiliary_loss_mlp": 0.01282858, + "balance_loss_clip": 0.06306748, + "balance_loss_mlp": 0.01259374, + "epoch": 0.2023147452277168, + "flos": 22024895662080.0, + "grad_norm": 2.684464985384485, + "language_loss": 0.72232616, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.80079544, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23474121, + "step": 3365, + "time_per_iteration": 2.6884727478027344 + }, + { + "auxiliary_loss_clip": 0.06551654, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.0629961, + "balance_loss_mlp": 0.01256336, + "epoch": 0.2023748684803848, + "flos": 17750555510400.0, + "grad_norm": 1.8865204005259733, + "language_loss": 0.86329257, + "learning_rate": 3.696320882607286e-06, + "loss": 0.94160658, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.23425293, + "step": 3366, + "time_per_iteration": 2.541398525238037 + }, + { + "auxiliary_loss_clip": 0.06552443, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01254698, + "epoch": 0.20243499173305277, + "flos": 31146912328320.0, + "grad_norm": 1.6069123477498997, + "language_loss": 0.69763649, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77593338, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.22558594, + "step": 3367, + "time_per_iteration": 2.674370527267456 + }, + { + "auxiliary_loss_clip": 0.06562914, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 0.06300482, + "balance_loss_mlp": 0.01257777, + "epoch": 0.20249511498572073, + "flos": 33847726256640.0, + "grad_norm": 1.76028679400595, + "language_loss": 0.69152057, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.27819824, + "step": 3368, + "time_per_iteration": 2.6662635803222656 + }, + { + "auxiliary_loss_clip": 0.06551345, + "auxiliary_loss_mlp": 0.0128738, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263657, + "epoch": 0.2025552382383887, + "flos": 21222088093440.0, + "grad_norm": 1.819755421756695, + "language_loss": 0.78064144, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.8590287, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23718262, + "step": 3369, + "time_per_iteration": 2.5846660137176514 + }, + { + "auxiliary_loss_clip": 0.06560668, + "auxiliary_loss_mlp": 0.01282514, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01257492, + "epoch": 0.20261536149105666, + "flos": 14652614355840.0, + "grad_norm": 3.2010156823618687, + "language_loss": 0.66533637, + "learning_rate": 3.695495115253795e-06, + "loss": 0.74376816, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25024414, + "step": 3370, + "time_per_iteration": 3.953664541244507 + }, + { + "auxiliary_loss_clip": 0.06420556, + "auxiliary_loss_mlp": 0.01256354, + "balance_loss_clip": 0.06284036, + "balance_loss_mlp": 0.01249797, + "epoch": 0.20267548474372463, + "flos": 66803380018560.0, + "grad_norm": 0.6606134365812599, + "language_loss": 0.58273321, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.65950233, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.06567383, + "step": 3371, + "time_per_iteration": 3.2517025470733643 + }, + { + "auxiliary_loss_clip": 0.06555597, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 0.06300298, + "balance_loss_mlp": 0.01257944, + "epoch": 0.2027356079963926, + "flos": 24687667036800.0, + "grad_norm": 1.6416079718190109, + "language_loss": 0.92020303, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99859619, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.25769043, + "step": 3372, + "time_per_iteration": 4.108370065689087 + }, + { + "auxiliary_loss_clip": 0.06555616, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 0.06298956, + "balance_loss_mlp": 0.01258672, + "epoch": 0.20279573124906058, + "flos": 26399443697280.0, + "grad_norm": 1.769817073167301, + "language_loss": 0.79293168, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87131846, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.24414062, + "step": 3373, + "time_per_iteration": 2.6076717376708984 + }, + { + "auxiliary_loss_clip": 0.06543471, + "auxiliary_loss_mlp": 0.01280674, + "balance_loss_clip": 0.06296648, + "balance_loss_mlp": 0.01256343, + "epoch": 0.20285585450172855, + "flos": 33808006621440.0, + "grad_norm": 3.4143342380796255, + "language_loss": 0.72364163, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.8018831, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.24328613, + "step": 3374, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.06419748, + "auxiliary_loss_mlp": 0.01258876, + "balance_loss_clip": 0.06284177, + "balance_loss_mlp": 0.01252266, + "epoch": 0.20291597775439651, + "flos": 71185768410240.0, + "grad_norm": 1.0120800133799934, + "language_loss": 0.62520474, + "learning_rate": 3.694461459520516e-06, + "loss": 0.70199096, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06622314, + "step": 3375, + "time_per_iteration": 3.159513473510742 + }, + { + "auxiliary_loss_clip": 0.06548455, + "auxiliary_loss_mlp": 0.01283408, + "balance_loss_clip": 0.06294296, + "balance_loss_mlp": 0.0125891, + "epoch": 0.20297610100706448, + "flos": 19499368475520.0, + "grad_norm": 1.6178559610323104, + "language_loss": 0.82908762, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.90740621, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24499512, + "step": 3376, + "time_per_iteration": 2.5366275310516357 + }, + { + "auxiliary_loss_clip": 0.06553418, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.0125854, + "epoch": 0.20303622425973245, + "flos": 25050944413440.0, + "grad_norm": 2.015544075965587, + "language_loss": 0.82464767, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90302449, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25720215, + "step": 3377, + "time_per_iteration": 2.579468250274658 + }, + { + "auxiliary_loss_clip": 0.06554671, + "auxiliary_loss_mlp": 0.01287763, + "balance_loss_clip": 0.06300091, + "balance_loss_mlp": 0.01261453, + "epoch": 0.2030963475124004, + "flos": 21986266129920.0, + "grad_norm": 1.7361857812490578, + "language_loss": 0.7745406, + "learning_rate": 3.69384049496805e-06, + "loss": 0.85296494, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.26306152, + "step": 3378, + "time_per_iteration": 3.999164342880249 + }, + { + "auxiliary_loss_clip": 0.06557525, + "auxiliary_loss_mlp": 0.01285912, + "balance_loss_clip": 0.06298093, + "balance_loss_mlp": 0.01259423, + "epoch": 0.2031564707650684, + "flos": 19506496072320.0, + "grad_norm": 1.7814270376711854, + "language_loss": 0.80552137, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.88395572, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.26525879, + "step": 3379, + "time_per_iteration": 3.94376277923584 + }, + { + "auxiliary_loss_clip": 0.06547987, + "auxiliary_loss_mlp": 0.01283987, + "balance_loss_clip": 0.06298195, + "balance_loss_mlp": 0.01259799, + "epoch": 0.20321659401773637, + "flos": 22753630621440.0, + "grad_norm": 1.8399421212903948, + "language_loss": 0.87578034, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.95410013, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24206543, + "step": 3380, + "time_per_iteration": 2.5826356410980225 + }, + { + "auxiliary_loss_clip": 0.06554954, + "auxiliary_loss_mlp": 0.01300173, + "balance_loss_clip": 0.06299303, + "balance_loss_mlp": 0.01274817, + "epoch": 0.20327671727040433, + "flos": 22462455283200.0, + "grad_norm": 2.147675917051705, + "language_loss": 0.75801265, + "learning_rate": 3.693218952340186e-06, + "loss": 0.83656389, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.2532959, + "step": 3381, + "time_per_iteration": 2.580035924911499 + }, + { + "auxiliary_loss_clip": 0.06559204, + "auxiliary_loss_mlp": 0.0128659, + "balance_loss_clip": 0.06297147, + "balance_loss_mlp": 0.01260198, + "epoch": 0.2033368405230723, + "flos": 19540807119360.0, + "grad_norm": 1.8225171591496117, + "language_loss": 0.79701936, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.87547731, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.26391602, + "step": 3382, + "time_per_iteration": 2.743842601776123 + }, + { + "auxiliary_loss_clip": 0.06551235, + "auxiliary_loss_mlp": 0.01283934, + "balance_loss_clip": 0.06293041, + "balance_loss_mlp": 0.01258745, + "epoch": 0.20339696377574026, + "flos": 13814489491200.0, + "grad_norm": 1.712325191768153, + "language_loss": 0.80308962, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8814413, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25195312, + "step": 3383, + "time_per_iteration": 2.6428067684173584 + }, + { + "auxiliary_loss_clip": 0.06548008, + "auxiliary_loss_mlp": 0.01285433, + "balance_loss_clip": 0.06295451, + "balance_loss_mlp": 0.01259541, + "epoch": 0.20345708702840823, + "flos": 20345627185920.0, + "grad_norm": 1.7809184522678074, + "language_loss": 0.75199848, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.83033288, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.25891113, + "step": 3384, + "time_per_iteration": 2.5601112842559814 + }, + { + "auxiliary_loss_clip": 0.06573269, + "auxiliary_loss_mlp": 0.01282943, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01256229, + "epoch": 0.2035172102810762, + "flos": 20339254275840.0, + "grad_norm": 2.5841350087074852, + "language_loss": 0.77226508, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.85082722, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.26745605, + "step": 3385, + "time_per_iteration": 2.527583122253418 + }, + { + "auxiliary_loss_clip": 0.06553946, + "auxiliary_loss_mlp": 0.01288968, + "balance_loss_clip": 0.06300423, + "balance_loss_mlp": 0.01263934, + "epoch": 0.2035773335337442, + "flos": 23337658379520.0, + "grad_norm": 1.6683994830989402, + "language_loss": 0.70000219, + "learning_rate": 3.692181763924639e-06, + "loss": 0.7784313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25048828, + "step": 3386, + "time_per_iteration": 2.583940029144287 + }, + { + "auxiliary_loss_clip": 0.06550556, + "auxiliary_loss_mlp": 0.01289862, + "balance_loss_clip": 0.0629431, + "balance_loss_mlp": 0.01265495, + "epoch": 0.20363745678641215, + "flos": 28337924378880.0, + "grad_norm": 1.2744067098921972, + "language_loss": 0.81998229, + "learning_rate": 3.691974133706947e-06, + "loss": 0.89838648, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.24365234, + "step": 3387, + "time_per_iteration": 2.624765634536743 + }, + { + "auxiliary_loss_clip": 0.06543861, + "auxiliary_loss_mlp": 0.01285642, + "balance_loss_clip": 0.06297304, + "balance_loss_mlp": 0.01261705, + "epoch": 0.20369758003908012, + "flos": 18921503992320.0, + "grad_norm": 2.338231566069276, + "language_loss": 0.80333674, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.88163185, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23925781, + "step": 3388, + "time_per_iteration": 2.565795421600342 + }, + { + "auxiliary_loss_clip": 0.06553982, + "auxiliary_loss_mlp": 0.01281213, + "balance_loss_clip": 0.06297579, + "balance_loss_mlp": 0.0125693, + "epoch": 0.20375770329174808, + "flos": 19212218133120.0, + "grad_norm": 1.8814817968190891, + "language_loss": 0.72894287, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.80729485, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.24279785, + "step": 3389, + "time_per_iteration": 2.5263590812683105 + }, + { + "auxiliary_loss_clip": 0.06544612, + "auxiliary_loss_mlp": 0.01286594, + "balance_loss_clip": 0.06296231, + "balance_loss_mlp": 0.01262204, + "epoch": 0.20381782654441605, + "flos": 19397106167040.0, + "grad_norm": 2.5524619095037626, + "language_loss": 0.88214552, + "learning_rate": 3.691350858126404e-06, + "loss": 0.96045768, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3390, + "time_per_iteration": 2.5450997352600098 + }, + { + "auxiliary_loss_clip": 0.06546676, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06297011, + "balance_loss_mlp": 0.01260683, + "epoch": 0.203877949797084, + "flos": 24834764079360.0, + "grad_norm": 2.430374095532116, + "language_loss": 0.71690643, + "learning_rate": 3.691142971316662e-06, + "loss": 0.79521036, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 3391, + "time_per_iteration": 2.5983424186706543 + }, + { + "auxiliary_loss_clip": 0.06548478, + "auxiliary_loss_mlp": 0.01287319, + "balance_loss_clip": 0.06300271, + "balance_loss_mlp": 0.01263799, + "epoch": 0.20393807304975198, + "flos": 18009432299520.0, + "grad_norm": 3.271459971820983, + "language_loss": 0.87029123, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.94864917, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2355957, + "step": 3392, + "time_per_iteration": 2.5094432830810547 + }, + { + "auxiliary_loss_clip": 0.06555735, + "auxiliary_loss_mlp": 0.01288889, + "balance_loss_clip": 0.06302007, + "balance_loss_mlp": 0.0126432, + "epoch": 0.20399819630241997, + "flos": 24213867724800.0, + "grad_norm": 1.4298747009925739, + "language_loss": 0.8143822, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8928284, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 3393, + "time_per_iteration": 2.674898147583008 + }, + { + "auxiliary_loss_clip": 0.06555712, + "auxiliary_loss_mlp": 0.01283361, + "balance_loss_clip": 0.0630876, + "balance_loss_mlp": 0.01260747, + "epoch": 0.20405831955508794, + "flos": 20783396442240.0, + "grad_norm": 2.2973425083766377, + "language_loss": 0.87181509, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.9502058, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.22619629, + "step": 3394, + "time_per_iteration": 2.5489470958709717 + }, + { + "auxiliary_loss_clip": 0.06548424, + "auxiliary_loss_mlp": 0.0128548, + "balance_loss_clip": 0.06299029, + "balance_loss_mlp": 0.01262448, + "epoch": 0.2041184428077559, + "flos": 15492332448000.0, + "grad_norm": 2.1306464149991027, + "language_loss": 0.8456347, + "learning_rate": 3.69031078287345e-06, + "loss": 0.92397374, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23034668, + "step": 3395, + "time_per_iteration": 2.5297558307647705 + }, + { + "auxiliary_loss_clip": 0.06554371, + "auxiliary_loss_mlp": 0.01288203, + "balance_loss_clip": 0.06299008, + "balance_loss_mlp": 0.0126448, + "epoch": 0.20417856606042387, + "flos": 15592582258560.0, + "grad_norm": 1.9297262637725432, + "language_loss": 0.84104818, + "learning_rate": 3.690102575501033e-06, + "loss": 0.91947389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23730469, + "step": 3396, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0654766, + "auxiliary_loss_mlp": 0.01296047, + "balance_loss_clip": 0.06301443, + "balance_loss_mlp": 0.01272706, + "epoch": 0.20423868931309183, + "flos": 24286137471360.0, + "grad_norm": 2.084884773893835, + "language_loss": 0.7751056, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.85354269, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2331543, + "step": 3397, + "time_per_iteration": 2.5621836185455322 + }, + { + "auxiliary_loss_clip": 0.06547033, + "auxiliary_loss_mlp": 0.01291146, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01268067, + "epoch": 0.2042988125657598, + "flos": 18619176061440.0, + "grad_norm": 3.401004534017878, + "language_loss": 0.88746947, + "learning_rate": 3.689685968497518e-06, + "loss": 0.96585131, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23083496, + "step": 3398, + "time_per_iteration": 2.4821889400482178 + }, + { + "auxiliary_loss_clip": 0.06555858, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 0.06305312, + "balance_loss_mlp": 0.01263361, + "epoch": 0.2043589358184278, + "flos": 17855836565760.0, + "grad_norm": 2.044777021305177, + "language_loss": 0.79053116, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8689605, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23706055, + "step": 3399, + "time_per_iteration": 2.5007028579711914 + }, + { + "auxiliary_loss_clip": 0.06554085, + "auxiliary_loss_mlp": 0.01288353, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01264678, + "epoch": 0.20441905907109575, + "flos": 21441832225920.0, + "grad_norm": 3.4484144890832327, + "language_loss": 0.77263522, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85105962, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23669434, + "step": 3400, + "time_per_iteration": 2.524930715560913 + }, + { + "auxiliary_loss_clip": 0.06546277, + "auxiliary_loss_mlp": 0.0128369, + "balance_loss_clip": 0.0630067, + "balance_loss_mlp": 0.01262423, + "epoch": 0.20447918232376372, + "flos": 27714847818240.0, + "grad_norm": 1.566944783994086, + "language_loss": 0.7976017, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87590134, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21264648, + "step": 3401, + "time_per_iteration": 2.5868172645568848 + }, + { + "auxiliary_loss_clip": 0.06547564, + "auxiliary_loss_mlp": 0.01287222, + "balance_loss_clip": 0.06297088, + "balance_loss_mlp": 0.01263833, + "epoch": 0.20453930557643168, + "flos": 30533017789440.0, + "grad_norm": 1.6743436404675067, + "language_loss": 0.69998658, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7783345, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23400879, + "step": 3402, + "time_per_iteration": 2.664961099624634 + }, + { + "auxiliary_loss_clip": 0.06561718, + "auxiliary_loss_mlp": 0.01282309, + "balance_loss_clip": 0.06309628, + "balance_loss_mlp": 0.01259981, + "epoch": 0.20459942882909965, + "flos": 18993480249600.0, + "grad_norm": 2.0207590642868736, + "language_loss": 0.82498461, + "learning_rate": 3.688643329848496e-06, + "loss": 0.90342486, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2232666, + "step": 3403, + "time_per_iteration": 2.527240514755249 + }, + { + "auxiliary_loss_clip": 0.0655287, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 0.06304024, + "balance_loss_mlp": 0.01260256, + "epoch": 0.20465955208176762, + "flos": 20345207915520.0, + "grad_norm": 1.870475930372837, + "language_loss": 0.83792305, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.91628289, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.22900391, + "step": 3404, + "time_per_iteration": 2.5108580589294434 + }, + { + "auxiliary_loss_clip": 0.06555478, + "auxiliary_loss_mlp": 0.01280254, + "balance_loss_clip": 0.06302839, + "balance_loss_mlp": 0.0125671, + "epoch": 0.20471967533443558, + "flos": 21257615024640.0, + "grad_norm": 1.9668153962924477, + "language_loss": 0.86568373, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.94404107, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2355957, + "step": 3405, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.06551084, + "auxiliary_loss_mlp": 0.0128024, + "balance_loss_clip": 0.06302287, + "balance_loss_mlp": 0.01257256, + "epoch": 0.20477979858710357, + "flos": 14506775124480.0, + "grad_norm": 2.695451734790842, + "language_loss": 0.85318458, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93149781, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.22973633, + "step": 3406, + "time_per_iteration": 2.490360975265503 + }, + { + "auxiliary_loss_clip": 0.06551544, + "auxiliary_loss_mlp": 0.01279954, + "balance_loss_clip": 0.06302837, + "balance_loss_mlp": 0.01256768, + "epoch": 0.20483992183977154, + "flos": 11405018609280.0, + "grad_norm": 8.923539759508978, + "language_loss": 0.69000643, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.76832145, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23193359, + "step": 3407, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06549555, + "auxiliary_loss_mlp": 0.01280964, + "balance_loss_clip": 0.06303824, + "balance_loss_mlp": 0.01258374, + "epoch": 0.2049000450924395, + "flos": 19065917704320.0, + "grad_norm": 2.112423962078429, + "language_loss": 0.85367447, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93197966, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.22583008, + "step": 3408, + "time_per_iteration": 2.5491087436676025 + }, + { + "auxiliary_loss_clip": 0.06564584, + "auxiliary_loss_mlp": 0.0128728, + "balance_loss_clip": 0.06310433, + "balance_loss_mlp": 0.0126314, + "epoch": 0.20496016834510747, + "flos": 14579799557760.0, + "grad_norm": 2.4221013711544876, + "language_loss": 0.65169537, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.730214, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3409, + "time_per_iteration": 2.5570828914642334 + }, + { + "auxiliary_loss_clip": 0.06553619, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06302843, + "balance_loss_mlp": 0.01259029, + "epoch": 0.20502029159777543, + "flos": 22133069683200.0, + "grad_norm": 1.5677004994493864, + "language_loss": 0.81331646, + "learning_rate": 3.687180946553745e-06, + "loss": 0.89167136, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.22851562, + "step": 3410, + "time_per_iteration": 3.9941341876983643 + }, + { + "auxiliary_loss_clip": 0.06562116, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 0.06316169, + "balance_loss_mlp": 0.01256252, + "epoch": 0.2050804148504434, + "flos": 25373873249280.0, + "grad_norm": 2.231323409005704, + "language_loss": 0.76898587, + "learning_rate": 3.686971778678803e-06, + "loss": 0.84738749, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21801758, + "step": 3411, + "time_per_iteration": 2.557502031326294 + }, + { + "auxiliary_loss_clip": 0.06566584, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 0.06318649, + "balance_loss_mlp": 0.01260567, + "epoch": 0.2051405381031114, + "flos": 23626443876480.0, + "grad_norm": 1.9814328821552187, + "language_loss": 0.73997778, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81847459, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.22521973, + "step": 3412, + "time_per_iteration": 4.038960695266724 + }, + { + "auxiliary_loss_clip": 0.06568237, + "auxiliary_loss_mlp": 0.01280941, + "balance_loss_clip": 0.06316938, + "balance_loss_mlp": 0.01257183, + "epoch": 0.20520066135577936, + "flos": 19570338483840.0, + "grad_norm": 2.4438525241528963, + "language_loss": 0.79063112, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.86912292, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23754883, + "step": 3413, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.0655475, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06315412, + "balance_loss_mlp": 0.01259423, + "epoch": 0.20526078460844732, + "flos": 17682184978560.0, + "grad_norm": 1.8594099787920526, + "language_loss": 0.85324407, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.93161035, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2244873, + "step": 3414, + "time_per_iteration": 2.51891827583313 + }, + { + "auxiliary_loss_clip": 0.06556672, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 0.0631127, + "balance_loss_mlp": 0.01261451, + "epoch": 0.2053209078611153, + "flos": 21505632710400.0, + "grad_norm": 1.8989416463636506, + "language_loss": 0.8139196, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.89232612, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22521973, + "step": 3415, + "time_per_iteration": 2.534064769744873 + }, + { + "auxiliary_loss_clip": 0.06545444, + "auxiliary_loss_mlp": 0.01280017, + "balance_loss_clip": 0.06300274, + "balance_loss_mlp": 0.01259048, + "epoch": 0.20538103111378325, + "flos": 25670163686400.0, + "grad_norm": 1.9272907146050138, + "language_loss": 0.73450923, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.81276381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.20959473, + "step": 3416, + "time_per_iteration": 2.5862622261047363 + }, + { + "auxiliary_loss_clip": 0.06555279, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06309061, + "balance_loss_mlp": 0.01256342, + "epoch": 0.20544115436645122, + "flos": 23155663311360.0, + "grad_norm": 3.21470343355828, + "language_loss": 0.79731691, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87565553, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.22253418, + "step": 3417, + "time_per_iteration": 2.5488288402557373 + }, + { + "auxiliary_loss_clip": 0.06553051, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01258248, + "epoch": 0.20550127761911918, + "flos": 19396435334400.0, + "grad_norm": 3.2012221600430744, + "language_loss": 0.88593423, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96428442, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23681641, + "step": 3418, + "time_per_iteration": 5.385840177536011 + }, + { + "auxiliary_loss_clip": 0.06553373, + "auxiliary_loss_mlp": 0.01284895, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.0125998, + "epoch": 0.20556140087178718, + "flos": 22899721415040.0, + "grad_norm": 2.325256215928591, + "language_loss": 0.63040721, + "learning_rate": 3.685296133421035e-06, + "loss": 0.70878994, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24926758, + "step": 3419, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.06563735, + "auxiliary_loss_mlp": 0.01291649, + "balance_loss_clip": 0.06310479, + "balance_loss_mlp": 0.01265554, + "epoch": 0.20562152412445514, + "flos": 19795365423360.0, + "grad_norm": 1.7732270709951168, + "language_loss": 0.86988509, + "learning_rate": 3.685086390100674e-06, + "loss": 0.948439, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26098633, + "step": 3420, + "time_per_iteration": 2.5364928245544434 + }, + { + "auxiliary_loss_clip": 0.06546585, + "auxiliary_loss_mlp": 0.01284653, + "balance_loss_clip": 0.0630153, + "balance_loss_mlp": 0.01261109, + "epoch": 0.2056816473771231, + "flos": 31509728507520.0, + "grad_norm": 10.333340616962191, + "language_loss": 0.71886712, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79717946, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.2355957, + "step": 3421, + "time_per_iteration": 2.6350786685943604 + }, + { + "auxiliary_loss_clip": 0.06544094, + "auxiliary_loss_mlp": 0.01288814, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0126564, + "epoch": 0.20574177062979107, + "flos": 23265095143680.0, + "grad_norm": 2.122387036588777, + "language_loss": 0.72175372, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.8000828, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23168945, + "step": 3422, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.06409879, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.01263078, + "epoch": 0.20580189388245904, + "flos": 70331124291840.0, + "grad_norm": 0.7131964126658911, + "language_loss": 0.551377, + "learning_rate": 3.684456776779548e-06, + "loss": 0.62817442, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06799316, + "step": 3423, + "time_per_iteration": 3.2106337547302246 + }, + { + "auxiliary_loss_clip": 0.06548166, + "auxiliary_loss_mlp": 0.0128448, + "balance_loss_clip": 0.06301543, + "balance_loss_mlp": 0.01261091, + "epoch": 0.205862017135127, + "flos": 30745802033280.0, + "grad_norm": 1.8660135712145316, + "language_loss": 0.72238076, + "learning_rate": 3.684246777912353e-06, + "loss": 0.80070728, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23400879, + "step": 3424, + "time_per_iteration": 2.614389181137085 + }, + { + "auxiliary_loss_clip": 0.06544662, + "auxiliary_loss_mlp": 0.01287262, + "balance_loss_clip": 0.06303795, + "balance_loss_mlp": 0.01263229, + "epoch": 0.20592214038779497, + "flos": 21330932947200.0, + "grad_norm": 1.6926765615616197, + "language_loss": 0.75646138, + "learning_rate": 3.684036715178351e-06, + "loss": 0.83478063, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.24023438, + "step": 3425, + "time_per_iteration": 2.5351436138153076 + }, + { + "auxiliary_loss_clip": 0.06546403, + "auxiliary_loss_mlp": 0.01289796, + "balance_loss_clip": 0.06304145, + "balance_loss_mlp": 0.01266813, + "epoch": 0.20598226364046296, + "flos": 22898002406400.0, + "grad_norm": 1.848184132977354, + "language_loss": 0.88618112, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9645431, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22998047, + "step": 3426, + "time_per_iteration": 2.604752779006958 + }, + { + "auxiliary_loss_clip": 0.06551787, + "auxiliary_loss_mlp": 0.01284615, + "balance_loss_clip": 0.06311674, + "balance_loss_mlp": 0.01261226, + "epoch": 0.20604238689313092, + "flos": 23885362592640.0, + "grad_norm": 1.5517486951437824, + "language_loss": 0.77144063, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8498047, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.23376465, + "step": 3427, + "time_per_iteration": 2.5526115894317627 + }, + { + "auxiliary_loss_clip": 0.06556956, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 0.06309945, + "balance_loss_mlp": 0.01264661, + "epoch": 0.2061025101457989, + "flos": 22498024141440.0, + "grad_norm": 1.8896972045039995, + "language_loss": 0.74443614, + "learning_rate": 3.683406143855174e-06, + "loss": 0.822878, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3428, + "time_per_iteration": 2.5644474029541016 + }, + { + "auxiliary_loss_clip": 0.06552382, + "auxiliary_loss_mlp": 0.01283805, + "balance_loss_clip": 0.06304047, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20616263339846685, + "flos": 22784713286400.0, + "grad_norm": 1.96097325322206, + "language_loss": 0.74164659, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.82000846, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3429, + "time_per_iteration": 2.5337913036346436 + }, + { + "auxiliary_loss_clip": 0.06551956, + "auxiliary_loss_mlp": 0.01286455, + "balance_loss_clip": 0.06304303, + "balance_loss_mlp": 0.01263126, + "epoch": 0.20622275665113482, + "flos": 20887755102720.0, + "grad_norm": 2.9642283368918863, + "language_loss": 0.86220586, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.94058996, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.23327637, + "step": 3430, + "time_per_iteration": 2.5939443111419678 + }, + { + "auxiliary_loss_clip": 0.06546243, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06300765, + "balance_loss_mlp": 0.01257607, + "epoch": 0.20628287990380278, + "flos": 19360489132800.0, + "grad_norm": 1.6588894263331828, + "language_loss": 0.70011377, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77838504, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.23278809, + "step": 3431, + "time_per_iteration": 2.565840482711792 + }, + { + "auxiliary_loss_clip": 0.06410907, + "auxiliary_loss_mlp": 0.0126731, + "balance_loss_clip": 0.06280327, + "balance_loss_mlp": 0.01261215, + "epoch": 0.20634300315647078, + "flos": 71536970799360.0, + "grad_norm": 0.791675242165557, + "language_loss": 0.60400987, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.68079197, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0609436, + "step": 3432, + "time_per_iteration": 3.305082082748413 + }, + { + "auxiliary_loss_clip": 0.06552991, + "auxiliary_loss_mlp": 0.01280414, + "balance_loss_clip": 0.06308176, + "balance_loss_mlp": 0.01257561, + "epoch": 0.20640312640913874, + "flos": 21730072671360.0, + "grad_norm": 1.5897016059046762, + "language_loss": 0.72477019, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80310422, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.22875977, + "step": 3433, + "time_per_iteration": 2.564393997192383 + }, + { + "auxiliary_loss_clip": 0.06561184, + "auxiliary_loss_mlp": 0.01281531, + "balance_loss_clip": 0.06312474, + "balance_loss_mlp": 0.01258512, + "epoch": 0.2064632496618067, + "flos": 20560256219520.0, + "grad_norm": 1.7877531320590552, + "language_loss": 0.87141019, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.94983733, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23010254, + "step": 3434, + "time_per_iteration": 2.5466108322143555 + }, + { + "auxiliary_loss_clip": 0.06556005, + "auxiliary_loss_mlp": 0.01283316, + "balance_loss_clip": 0.06305495, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20652337291447467, + "flos": 29830669666560.0, + "grad_norm": 1.6526860814470912, + "language_loss": 0.6970489, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77544212, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2388916, + "step": 3435, + "time_per_iteration": 2.613896369934082 + }, + { + "auxiliary_loss_clip": 0.06545977, + "auxiliary_loss_mlp": 0.01289312, + "balance_loss_clip": 0.0630382, + "balance_loss_mlp": 0.01264325, + "epoch": 0.20658349616714264, + "flos": 26220844719360.0, + "grad_norm": 1.7674379542335852, + "language_loss": 0.89957321, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97792608, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.24975586, + "step": 3436, + "time_per_iteration": 2.590360641479492 + }, + { + "auxiliary_loss_clip": 0.06548543, + "auxiliary_loss_mlp": 0.01277538, + "balance_loss_clip": 0.06303848, + "balance_loss_mlp": 0.01254209, + "epoch": 0.2066436194198106, + "flos": 26001477930240.0, + "grad_norm": 1.7140409089026185, + "language_loss": 0.77244872, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.8507095, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.23339844, + "step": 3437, + "time_per_iteration": 2.6068568229675293 + }, + { + "auxiliary_loss_clip": 0.06548648, + "auxiliary_loss_mlp": 0.01280201, + "balance_loss_clip": 0.06300757, + "balance_loss_mlp": 0.01257682, + "epoch": 0.20670374267247857, + "flos": 21367466127360.0, + "grad_norm": 2.0146667208247355, + "language_loss": 0.78725338, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86554188, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.22521973, + "step": 3438, + "time_per_iteration": 2.567963123321533 + }, + { + "auxiliary_loss_clip": 0.06407821, + "auxiliary_loss_mlp": 0.01263014, + "balance_loss_clip": 0.06278364, + "balance_loss_mlp": 0.01257164, + "epoch": 0.20676386592514656, + "flos": 66403108264320.0, + "grad_norm": 0.8029327028802032, + "language_loss": 0.66817588, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.74488425, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05844116, + "step": 3439, + "time_per_iteration": 3.1231849193573 + }, + { + "auxiliary_loss_clip": 0.06557775, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 0.06302103, + "balance_loss_mlp": 0.01260423, + "epoch": 0.20682398917781453, + "flos": 17280278069760.0, + "grad_norm": 1.9287299109512155, + "language_loss": 0.8404541, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.91886795, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23168945, + "step": 3440, + "time_per_iteration": 2.496563196182251 + }, + { + "auxiliary_loss_clip": 0.06545421, + "auxiliary_loss_mlp": 0.01282262, + "balance_loss_clip": 0.06298509, + "balance_loss_mlp": 0.0126028, + "epoch": 0.2068841124304825, + "flos": 18083127565440.0, + "grad_norm": 3.100665935871663, + "language_loss": 0.85299611, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93127292, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2199707, + "step": 3441, + "time_per_iteration": 2.528823137283325 + }, + { + "auxiliary_loss_clip": 0.06546343, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06300771, + "balance_loss_mlp": 0.01258958, + "epoch": 0.20694423568315046, + "flos": 27354798823680.0, + "grad_norm": 1.6487564578537555, + "language_loss": 0.86298448, + "learning_rate": 3.680455884806959e-06, + "loss": 0.94127464, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.23693848, + "step": 3442, + "time_per_iteration": 2.5904433727264404 + }, + { + "auxiliary_loss_clip": 0.06553168, + "auxiliary_loss_mlp": 0.0128107, + "balance_loss_clip": 0.06302296, + "balance_loss_mlp": 0.01256298, + "epoch": 0.20700435893581842, + "flos": 20236027645440.0, + "grad_norm": 1.991917549605425, + "language_loss": 0.74110967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81945205, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24755859, + "step": 3443, + "time_per_iteration": 2.546297311782837 + }, + { + "auxiliary_loss_clip": 0.06540793, + "auxiliary_loss_mlp": 0.01282, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2070644821884864, + "flos": 20637347575680.0, + "grad_norm": 5.522598582225395, + "language_loss": 0.86263227, + "learning_rate": 3.680033399147797e-06, + "loss": 0.94086015, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22814941, + "step": 3444, + "time_per_iteration": 2.5644776821136475 + }, + { + "auxiliary_loss_clip": 0.06396829, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06267206, + "balance_loss_mlp": 0.01264399, + "epoch": 0.20712460544115438, + "flos": 65960098128000.0, + "grad_norm": 0.6752802627643808, + "language_loss": 0.56895542, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.64562953, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06185913, + "step": 3445, + "time_per_iteration": 3.133159637451172 + }, + { + "auxiliary_loss_clip": 0.06550106, + "auxiliary_loss_mlp": 0.0128273, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01259412, + "epoch": 0.20718472869382235, + "flos": 19431542995200.0, + "grad_norm": 1.845349461285762, + "language_loss": 0.78388685, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.86221522, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23327637, + "step": 3446, + "time_per_iteration": 2.5563149452209473 + }, + { + "auxiliary_loss_clip": 0.06562304, + "auxiliary_loss_mlp": 0.0128875, + "balance_loss_clip": 0.06302087, + "balance_loss_mlp": 0.01263215, + "epoch": 0.2072448519464903, + "flos": 24506007384960.0, + "grad_norm": 2.528724295630225, + "language_loss": 0.63215572, + "learning_rate": 3.679399192876334e-06, + "loss": 0.7106663, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 3447, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.06550243, + "auxiliary_loss_mlp": 0.01285454, + "balance_loss_clip": 0.06302016, + "balance_loss_mlp": 0.01261624, + "epoch": 0.20730497519915828, + "flos": 23082345388800.0, + "grad_norm": 1.7246458475869415, + "language_loss": 0.87330115, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95165813, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.23840332, + "step": 3448, + "time_per_iteration": 2.5367424488067627 + }, + { + "auxiliary_loss_clip": 0.06547908, + "auxiliary_loss_mlp": 0.01287375, + "balance_loss_clip": 0.06301224, + "balance_loss_mlp": 0.0126407, + "epoch": 0.20736509845182624, + "flos": 21075368394240.0, + "grad_norm": 2.238353970842136, + "language_loss": 0.75934261, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.83769548, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23291016, + "step": 3449, + "time_per_iteration": 3.94480562210083 + }, + { + "auxiliary_loss_clip": 0.06557415, + "auxiliary_loss_mlp": 0.01291462, + "balance_loss_clip": 0.06305711, + "balance_loss_mlp": 0.01267262, + "epoch": 0.2074252217044942, + "flos": 17638021077120.0, + "grad_norm": 1.9890451191355467, + "language_loss": 0.77508813, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.8535769, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24243164, + "step": 3450, + "time_per_iteration": 2.545430898666382 + }, + { + "auxiliary_loss_clip": 0.06561074, + "auxiliary_loss_mlp": 0.01294493, + "balance_loss_clip": 0.06309673, + "balance_loss_mlp": 0.01270579, + "epoch": 0.20748534495716217, + "flos": 23553209808000.0, + "grad_norm": 2.274256725147599, + "language_loss": 0.823879, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90243471, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23913574, + "step": 3451, + "time_per_iteration": 4.003388404846191 + }, + { + "auxiliary_loss_clip": 0.0640305, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 0.06273949, + "balance_loss_mlp": 0.01248494, + "epoch": 0.20754546820983016, + "flos": 52268666757120.0, + "grad_norm": 0.7675919354914552, + "language_loss": 0.56549037, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.64206523, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05941772, + "step": 3452, + "time_per_iteration": 3.0660083293914795 + }, + { + "auxiliary_loss_clip": 0.06557937, + "auxiliary_loss_mlp": 0.01287582, + "balance_loss_clip": 0.06309802, + "balance_loss_mlp": 0.01264956, + "epoch": 0.20760559146249813, + "flos": 20418609692160.0, + "grad_norm": 1.8872949255610445, + "language_loss": 0.88967919, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9681344, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.22619629, + "step": 3453, + "time_per_iteration": 2.581430673599243 + }, + { + "auxiliary_loss_clip": 0.06554953, + "auxiliary_loss_mlp": 0.01287205, + "balance_loss_clip": 0.06307904, + "balance_loss_mlp": 0.01263256, + "epoch": 0.2076657147151661, + "flos": 23192825397120.0, + "grad_norm": 1.4776896143180385, + "language_loss": 0.80720532, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88562691, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23962402, + "step": 3454, + "time_per_iteration": 2.5793018341064453 + }, + { + "auxiliary_loss_clip": 0.06549348, + "auxiliary_loss_mlp": 0.01286388, + "balance_loss_clip": 0.06301847, + "balance_loss_mlp": 0.01263476, + "epoch": 0.20772583796783406, + "flos": 18298595139840.0, + "grad_norm": 4.241833159654324, + "language_loss": 0.78446364, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.86282104, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.22912598, + "step": 3455, + "time_per_iteration": 2.5377535820007324 + }, + { + "auxiliary_loss_clip": 0.0654678, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01256547, + "epoch": 0.20778596122050202, + "flos": 17608531639680.0, + "grad_norm": 1.6321737814924744, + "language_loss": 0.81251496, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.89077407, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22595215, + "step": 3456, + "time_per_iteration": 2.5125768184661865 + }, + { + "auxiliary_loss_clip": 0.06554688, + "auxiliary_loss_mlp": 0.01282924, + "balance_loss_clip": 0.06304802, + "balance_loss_mlp": 0.01259893, + "epoch": 0.20784608447317, + "flos": 23812380086400.0, + "grad_norm": 2.3276439316102695, + "language_loss": 0.79071975, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.86909586, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.23022461, + "step": 3457, + "time_per_iteration": 5.41590428352356 + }, + { + "auxiliary_loss_clip": 0.06553855, + "auxiliary_loss_mlp": 0.01279092, + "balance_loss_clip": 0.0630386, + "balance_loss_mlp": 0.01255739, + "epoch": 0.20790620772583795, + "flos": 17645022892800.0, + "grad_norm": 1.9963286729709264, + "language_loss": 0.84664595, + "learning_rate": 3.677068867939333e-06, + "loss": 0.9249754, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23364258, + "step": 3458, + "time_per_iteration": 2.610107183456421 + }, + { + "auxiliary_loss_clip": 0.06541788, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06299603, + "balance_loss_mlp": 0.01254289, + "epoch": 0.20796633097850595, + "flos": 27680997968640.0, + "grad_norm": 1.7522329071194311, + "language_loss": 0.76853168, + "learning_rate": 3.676856638489272e-06, + "loss": 0.8467201, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.2277832, + "step": 3459, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.06543219, + "auxiliary_loss_mlp": 0.01279579, + "balance_loss_clip": 0.06299554, + "balance_loss_mlp": 0.01257024, + "epoch": 0.2080264542311739, + "flos": 19251770060160.0, + "grad_norm": 1.8057193688460893, + "language_loss": 0.77803749, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.85626543, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22570801, + "step": 3460, + "time_per_iteration": 2.5500359535217285 + }, + { + "auxiliary_loss_clip": 0.06544735, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06297737, + "balance_loss_mlp": 0.01255315, + "epoch": 0.20808657748384188, + "flos": 27533146239360.0, + "grad_norm": 1.865214089074118, + "language_loss": 0.76152873, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.8397454, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21618652, + "step": 3461, + "time_per_iteration": 2.575975179672241 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.01279751, + "balance_loss_clip": 0.06301013, + "balance_loss_mlp": 0.01256183, + "epoch": 0.20814670073650984, + "flos": 26914262382720.0, + "grad_norm": 2.229402903272821, + "language_loss": 0.89438462, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97273135, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23571777, + "step": 3462, + "time_per_iteration": 2.5732173919677734 + }, + { + "auxiliary_loss_clip": 0.06402825, + "auxiliary_loss_mlp": 0.01283843, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01278395, + "epoch": 0.2082068239891778, + "flos": 70195850674560.0, + "grad_norm": 0.9150130859854356, + "language_loss": 0.59001637, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.66688299, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.05456543, + "step": 3463, + "time_per_iteration": 3.269202709197998 + }, + { + "auxiliary_loss_clip": 0.06550549, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01257929, + "epoch": 0.20826694724184577, + "flos": 24614978019840.0, + "grad_norm": 2.6522237220698663, + "language_loss": 0.66949397, + "learning_rate": 3.675794537601429e-06, + "loss": 0.74782729, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.2487793, + "step": 3464, + "time_per_iteration": 2.5638158321380615 + }, + { + "auxiliary_loss_clip": 0.06556059, + "auxiliary_loss_mlp": 0.01287892, + "balance_loss_clip": 0.06307128, + "balance_loss_mlp": 0.01263299, + "epoch": 0.20832707049451377, + "flos": 12897218845440.0, + "grad_norm": 2.2476817474527913, + "language_loss": 0.84321886, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.9216584, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.24609375, + "step": 3465, + "time_per_iteration": 2.5794646739959717 + }, + { + "auxiliary_loss_clip": 0.06542073, + "auxiliary_loss_mlp": 0.01282156, + "balance_loss_clip": 0.06295872, + "balance_loss_mlp": 0.01258326, + "epoch": 0.20838719374718173, + "flos": 22205129794560.0, + "grad_norm": 3.281235222185926, + "language_loss": 0.82741451, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.90565681, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.23828125, + "step": 3466, + "time_per_iteration": 2.540011405944824 + }, + { + "auxiliary_loss_clip": 0.06540319, + "auxiliary_loss_mlp": 0.01287937, + "balance_loss_clip": 0.06300111, + "balance_loss_mlp": 0.01267243, + "epoch": 0.2084473169998497, + "flos": 15164036951040.0, + "grad_norm": 2.490655035944783, + "language_loss": 0.82892549, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90720803, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.20690918, + "step": 3467, + "time_per_iteration": 2.54622745513916 + }, + { + "auxiliary_loss_clip": 0.06540733, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 0.06303266, + "balance_loss_mlp": 0.01268167, + "epoch": 0.20850744025251766, + "flos": 17462482773120.0, + "grad_norm": 1.8114532422505003, + "language_loss": 0.82299387, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90129268, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2097168, + "step": 3468, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06553383, + "auxiliary_loss_mlp": 0.01280357, + "balance_loss_clip": 0.06302625, + "balance_loss_mlp": 0.01257158, + "epoch": 0.20856756350518563, + "flos": 25705439055360.0, + "grad_norm": 1.667306072143411, + "language_loss": 0.9042781, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98261553, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23217773, + "step": 3469, + "time_per_iteration": 2.6107866764068604 + }, + { + "auxiliary_loss_clip": 0.0655106, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 0.06308927, + "balance_loss_mlp": 0.01259872, + "epoch": 0.2086276867578536, + "flos": 37898213425920.0, + "grad_norm": 1.9476878714472061, + "language_loss": 0.77294397, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85127008, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21679688, + "step": 3470, + "time_per_iteration": 2.7083425521850586 + }, + { + "auxiliary_loss_clip": 0.06547298, + "auxiliary_loss_mlp": 0.01289218, + "balance_loss_clip": 0.06307482, + "balance_loss_mlp": 0.01266283, + "epoch": 0.20868781001052156, + "flos": 25564169871360.0, + "grad_norm": 1.8036684586339249, + "language_loss": 0.76289082, + "learning_rate": 3.674304927640011e-06, + "loss": 0.84125602, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.22937012, + "step": 3471, + "time_per_iteration": 2.589884042739868 + }, + { + "auxiliary_loss_clip": 0.06554438, + "auxiliary_loss_mlp": 0.01280867, + "balance_loss_clip": 0.06303854, + "balance_loss_mlp": 0.01259028, + "epoch": 0.20874793326318955, + "flos": 27536961600000.0, + "grad_norm": 1.6381609540737498, + "language_loss": 0.76341867, + "learning_rate": 3.67409187219312e-06, + "loss": 0.84177172, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.21813965, + "step": 3472, + "time_per_iteration": 2.610260009765625 + }, + { + "auxiliary_loss_clip": 0.06544036, + "auxiliary_loss_mlp": 0.01279562, + "balance_loss_clip": 0.06302247, + "balance_loss_mlp": 0.01259022, + "epoch": 0.20880805651585752, + "flos": 18554243546880.0, + "grad_norm": 2.073955911698539, + "language_loss": 0.85418117, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93241715, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.20532227, + "step": 3473, + "time_per_iteration": 2.5741372108459473 + }, + { + "auxiliary_loss_clip": 0.06431094, + "auxiliary_loss_mlp": 0.01255526, + "balance_loss_clip": 0.06305239, + "balance_loss_mlp": 0.01250132, + "epoch": 0.20886817976852548, + "flos": 65966596819200.0, + "grad_norm": 0.8661888314681573, + "language_loss": 0.63746876, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.71433502, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.05401611, + "step": 3474, + "time_per_iteration": 3.061617612838745 + }, + { + "auxiliary_loss_clip": 0.06545534, + "auxiliary_loss_mlp": 0.01278543, + "balance_loss_clip": 0.06299987, + "balance_loss_mlp": 0.01255751, + "epoch": 0.20892830302119345, + "flos": 36548120914560.0, + "grad_norm": 1.9594452651536962, + "language_loss": 0.70746702, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.78570777, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22790527, + "step": 3475, + "time_per_iteration": 2.7295854091644287 + }, + { + "auxiliary_loss_clip": 0.06544538, + "auxiliary_loss_mlp": 0.01277403, + "balance_loss_clip": 0.06299123, + "balance_loss_mlp": 0.01255754, + "epoch": 0.2089884262738614, + "flos": 20962582398720.0, + "grad_norm": 1.6086426160627472, + "language_loss": 0.70801485, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21643066, + "step": 3476, + "time_per_iteration": 2.6065874099731445 + }, + { + "auxiliary_loss_clip": 0.06538086, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06299278, + "balance_loss_mlp": 0.0125523, + "epoch": 0.20904854952652938, + "flos": 22790666926080.0, + "grad_norm": 1.9785394209574967, + "language_loss": 0.90003526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9781692, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.20080566, + "step": 3477, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.06542666, + "auxiliary_loss_mlp": 0.01278801, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257594, + "epoch": 0.20910867277919734, + "flos": 27309838308480.0, + "grad_norm": 2.554960999675803, + "language_loss": 0.69433093, + "learning_rate": 3.672812206678344e-06, + "loss": 0.77254558, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.21203613, + "step": 3478, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.0654031, + "auxiliary_loss_mlp": 0.01282288, + "balance_loss_clip": 0.06298592, + "balance_loss_mlp": 0.01260461, + "epoch": 0.20916879603186533, + "flos": 14324444640000.0, + "grad_norm": 1.9959140715838508, + "language_loss": 0.85550553, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93373156, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21813965, + "step": 3479, + "time_per_iteration": 2.5808637142181396 + }, + { + "auxiliary_loss_clip": 0.06542581, + "auxiliary_loss_mlp": 0.01279649, + "balance_loss_clip": 0.06299447, + "balance_loss_mlp": 0.01258072, + "epoch": 0.2092289192845333, + "flos": 22279537820160.0, + "grad_norm": 2.3833241848820372, + "language_loss": 0.75129831, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.82952058, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21569824, + "step": 3480, + "time_per_iteration": 2.519789218902588 + }, + { + "auxiliary_loss_clip": 0.06546038, + "auxiliary_loss_mlp": 0.01278892, + "balance_loss_clip": 0.06306421, + "balance_loss_mlp": 0.01258495, + "epoch": 0.20928904253720126, + "flos": 14836118797440.0, + "grad_norm": 2.1621149118450163, + "language_loss": 0.7689389, + "learning_rate": 3.67217151746346e-06, + "loss": 0.84718817, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20410156, + "step": 3481, + "time_per_iteration": 2.541019916534424 + }, + { + "auxiliary_loss_clip": 0.06542054, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.06299154, + "balance_loss_mlp": 0.01257718, + "epoch": 0.20934916578986923, + "flos": 23266017538560.0, + "grad_norm": 1.9029543431357738, + "language_loss": 0.85756385, + "learning_rate": 3.671957827563209e-06, + "loss": 0.93578184, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.22021484, + "step": 3482, + "time_per_iteration": 2.57550048828125 + }, + { + "auxiliary_loss_clip": 0.06538534, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 0.0629866, + "balance_loss_mlp": 0.01260237, + "epoch": 0.2094092890425372, + "flos": 32022492768000.0, + "grad_norm": 2.0122422455266076, + "language_loss": 0.71876764, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.79696846, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.21325684, + "step": 3483, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.06543796, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125567, + "epoch": 0.20946941229520516, + "flos": 20016744710400.0, + "grad_norm": 1.623254768822543, + "language_loss": 0.75620067, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.83441281, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21728516, + "step": 3484, + "time_per_iteration": 2.537745714187622 + }, + { + "auxiliary_loss_clip": 0.06537648, + "auxiliary_loss_mlp": 0.01274667, + "balance_loss_clip": 0.0629506, + "balance_loss_mlp": 0.01252733, + "epoch": 0.20952953554787315, + "flos": 30748401509760.0, + "grad_norm": 1.6710062021876058, + "language_loss": 0.71473777, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.79286093, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21936035, + "step": 3485, + "time_per_iteration": 2.6310439109802246 + }, + { + "auxiliary_loss_clip": 0.0654947, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06304678, + "balance_loss_mlp": 0.01258517, + "epoch": 0.20958965880054112, + "flos": 27055950837120.0, + "grad_norm": 1.7793136829828902, + "language_loss": 0.83105123, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90936482, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23376465, + "step": 3486, + "time_per_iteration": 2.5819222927093506 + }, + { + "auxiliary_loss_clip": 0.06539689, + "auxiliary_loss_mlp": 0.01279221, + "balance_loss_clip": 0.06297638, + "balance_loss_mlp": 0.01257978, + "epoch": 0.20964978205320908, + "flos": 34212680714880.0, + "grad_norm": 2.582218695391969, + "language_loss": 0.87821579, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.95640486, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21240234, + "step": 3487, + "time_per_iteration": 2.639369487762451 + }, + { + "auxiliary_loss_clip": 0.06538714, + "auxiliary_loss_mlp": 0.01279661, + "balance_loss_clip": 0.06298582, + "balance_loss_mlp": 0.01258227, + "epoch": 0.20970990530587705, + "flos": 23484168443520.0, + "grad_norm": 2.287931950731532, + "language_loss": 0.72719586, + "learning_rate": 3.670674357028504e-06, + "loss": 0.80537963, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21411133, + "step": 3488, + "time_per_iteration": 3.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.06540683, + "auxiliary_loss_mlp": 0.01275293, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01255123, + "epoch": 0.209770028558545, + "flos": 18557346147840.0, + "grad_norm": 2.67396224290917, + "language_loss": 0.81189376, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89005351, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20178223, + "step": 3489, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.0654545, + "auxiliary_loss_mlp": 0.01278304, + "balance_loss_clip": 0.06303608, + "balance_loss_mlp": 0.0125724, + "epoch": 0.20983015181121298, + "flos": 21623533804800.0, + "grad_norm": 2.0567102060198743, + "language_loss": 0.73407692, + "learning_rate": 3.670246026613266e-06, + "loss": 0.81231445, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21057129, + "step": 3490, + "time_per_iteration": 2.5622947216033936 + }, + { + "auxiliary_loss_clip": 0.06534347, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06300151, + "balance_loss_mlp": 0.01260128, + "epoch": 0.20989027506388094, + "flos": 16619787861120.0, + "grad_norm": 1.7677892351641744, + "language_loss": 0.71503973, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.7931931, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20849609, + "step": 3491, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.06542461, + "auxiliary_loss_mlp": 0.01283797, + "balance_loss_clip": 0.0629908, + "balance_loss_mlp": 0.01260957, + "epoch": 0.20995039831654894, + "flos": 23222692177920.0, + "grad_norm": 2.702657778988086, + "language_loss": 0.80329478, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88155735, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22839355, + "step": 3492, + "time_per_iteration": 2.5376975536346436 + }, + { + "auxiliary_loss_clip": 0.06546506, + "auxiliary_loss_mlp": 0.01283519, + "balance_loss_clip": 0.06307527, + "balance_loss_mlp": 0.01262741, + "epoch": 0.2100105215692169, + "flos": 18152881689600.0, + "grad_norm": 1.9319737068083613, + "language_loss": 0.87613726, + "learning_rate": 3.669603055991502e-06, + "loss": 0.95443749, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20800781, + "step": 3493, + "time_per_iteration": 2.5462660789489746 + }, + { + "auxiliary_loss_clip": 0.06538918, + "auxiliary_loss_mlp": 0.01283808, + "balance_loss_clip": 0.06303683, + "balance_loss_mlp": 0.01262673, + "epoch": 0.21007064482188487, + "flos": 15967179936000.0, + "grad_norm": 1.7380368048158776, + "language_loss": 0.69753766, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.77576494, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.21130371, + "step": 3494, + "time_per_iteration": 2.523575782775879 + }, + { + "auxiliary_loss_clip": 0.0654956, + "auxiliary_loss_mlp": 0.0128408, + "balance_loss_clip": 0.06306064, + "balance_loss_mlp": 0.01262598, + "epoch": 0.21013076807455283, + "flos": 32242614243840.0, + "grad_norm": 1.6795437076377473, + "language_loss": 0.79639518, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.87473154, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21472168, + "step": 3495, + "time_per_iteration": 2.679564952850342 + }, + { + "auxiliary_loss_clip": 0.06543255, + "auxiliary_loss_mlp": 0.01280683, + "balance_loss_clip": 0.06300748, + "balance_loss_mlp": 0.01258832, + "epoch": 0.2101908913272208, + "flos": 23703493305600.0, + "grad_norm": 2.110842443067005, + "language_loss": 0.77733672, + "learning_rate": 3.668959515566116e-06, + "loss": 0.85557616, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21862793, + "step": 3496, + "time_per_iteration": 2.5728261470794678 + }, + { + "auxiliary_loss_clip": 0.06546371, + "auxiliary_loss_mlp": 0.01280297, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257993, + "epoch": 0.21025101457988876, + "flos": 20381992657920.0, + "grad_norm": 2.1840810602746643, + "language_loss": 0.82214069, + "learning_rate": 3.668744875505915e-06, + "loss": 0.90040743, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22302246, + "step": 3497, + "time_per_iteration": 5.435751438140869 + }, + { + "auxiliary_loss_clip": 0.06554863, + "auxiliary_loss_mlp": 0.01281759, + "balance_loss_clip": 0.06307989, + "balance_loss_mlp": 0.01259205, + "epoch": 0.21031113783255675, + "flos": 25782740046720.0, + "grad_norm": 1.9653925911520136, + "language_loss": 0.68009126, + "learning_rate": 3.668530172166741e-06, + "loss": 0.75845742, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3498, + "time_per_iteration": 2.6047511100769043 + }, + { + "auxiliary_loss_clip": 0.06550896, + "auxiliary_loss_mlp": 0.01291723, + "balance_loss_clip": 0.06304521, + "balance_loss_mlp": 0.01269789, + "epoch": 0.21037126108522472, + "flos": 22024769880960.0, + "grad_norm": 1.5964372308761317, + "language_loss": 0.81248403, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.89091027, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21948242, + "step": 3499, + "time_per_iteration": 2.5279107093811035 + }, + { + "auxiliary_loss_clip": 0.06537838, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 0.06300277, + "balance_loss_mlp": 0.01257911, + "epoch": 0.21043138433789269, + "flos": 25340861940480.0, + "grad_norm": 2.3111316875342274, + "language_loss": 0.78733355, + "learning_rate": 3.668100575684043e-06, + "loss": 0.86549306, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20214844, + "step": 3500, + "time_per_iteration": 2.5789358615875244 + }, + { + "auxiliary_loss_clip": 0.06548081, + "auxiliary_loss_mlp": 0.01281815, + "balance_loss_clip": 0.06307902, + "balance_loss_mlp": 0.01259809, + "epoch": 0.21049150759056065, + "flos": 25563708673920.0, + "grad_norm": 1.5222387073827752, + "language_loss": 0.74519855, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82349753, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.22021484, + "step": 3501, + "time_per_iteration": 2.5740344524383545 + }, + { + "auxiliary_loss_clip": 0.06532234, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06293183, + "balance_loss_mlp": 0.01258521, + "epoch": 0.21055163084322862, + "flos": 24501982389120.0, + "grad_norm": 1.5726278305934103, + "language_loss": 0.75732303, + "learning_rate": 3.667670726183183e-06, + "loss": 0.83544195, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.21142578, + "step": 3502, + "time_per_iteration": 2.564650535583496 + }, + { + "auxiliary_loss_clip": 0.06532737, + "auxiliary_loss_mlp": 0.01282141, + "balance_loss_clip": 0.06294994, + "balance_loss_mlp": 0.01260731, + "epoch": 0.21061175409589658, + "flos": 25746123012480.0, + "grad_norm": 2.0578640076956165, + "language_loss": 0.78642297, + "learning_rate": 3.667455706571316e-06, + "loss": 0.86457181, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.21411133, + "step": 3503, + "time_per_iteration": 2.5651087760925293 + }, + { + "auxiliary_loss_clip": 0.06548393, + "auxiliary_loss_mlp": 0.01287579, + "balance_loss_clip": 0.06300595, + "balance_loss_mlp": 0.01262426, + "epoch": 0.21067187734856455, + "flos": 18995115404160.0, + "grad_norm": 2.3829290271278363, + "language_loss": 0.79109055, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.86945021, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.25134277, + "step": 3504, + "time_per_iteration": 2.5907576084136963 + }, + { + "auxiliary_loss_clip": 0.06540846, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 0.06295908, + "balance_loss_mlp": 0.012561, + "epoch": 0.21073200060123254, + "flos": 24688337869440.0, + "grad_norm": 2.6276986020802386, + "language_loss": 0.77414715, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.85233212, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.21557617, + "step": 3505, + "time_per_iteration": 2.564504861831665 + }, + { + "auxiliary_loss_clip": 0.06529057, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06294015, + "balance_loss_mlp": 0.01257186, + "epoch": 0.2107921238539005, + "flos": 28557039605760.0, + "grad_norm": 2.0513581673642434, + "language_loss": 0.64351165, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.721578, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.20397949, + "step": 3506, + "time_per_iteration": 2.641390323638916 + }, + { + "auxiliary_loss_clip": 0.06535215, + "auxiliary_loss_mlp": 0.01278768, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01257656, + "epoch": 0.21085224710656847, + "flos": 25893094273920.0, + "grad_norm": 2.3889311598286436, + "language_loss": 0.82716179, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90530163, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21105957, + "step": 3507, + "time_per_iteration": 2.5718142986297607 + }, + { + "auxiliary_loss_clip": 0.06534198, + "auxiliary_loss_mlp": 0.01280018, + "balance_loss_clip": 0.06294642, + "balance_loss_mlp": 0.0125769, + "epoch": 0.21091237035923643, + "flos": 14981664539520.0, + "grad_norm": 1.9856074738329712, + "language_loss": 0.76547742, + "learning_rate": 3.666379660223824e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22338867, + "step": 3508, + "time_per_iteration": 2.5104117393493652 + }, + { + "auxiliary_loss_clip": 0.06543706, + "auxiliary_loss_mlp": 0.01282498, + "balance_loss_clip": 0.06299506, + "balance_loss_mlp": 0.01261159, + "epoch": 0.2109724936119044, + "flos": 16368080595840.0, + "grad_norm": 2.529935640705384, + "language_loss": 0.86242574, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.94068778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.21325684, + "step": 3509, + "time_per_iteration": 2.508370876312256 + }, + { + "auxiliary_loss_clip": 0.06541994, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 0.06295836, + "balance_loss_mlp": 0.01258679, + "epoch": 0.21103261686457236, + "flos": 31510315486080.0, + "grad_norm": 1.7053981088389916, + "language_loss": 0.68853724, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.76676404, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22009277, + "step": 3510, + "time_per_iteration": 2.6452746391296387 + }, + { + "auxiliary_loss_clip": 0.06542882, + "auxiliary_loss_mlp": 0.01284418, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01263259, + "epoch": 0.21109274011724033, + "flos": 27351360806400.0, + "grad_norm": 1.7932280077203222, + "language_loss": 0.7352736, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.8135466, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.21154785, + "step": 3511, + "time_per_iteration": 2.6538095474243164 + }, + { + "auxiliary_loss_clip": 0.06553793, + "auxiliary_loss_mlp": 0.01288613, + "balance_loss_clip": 0.06308056, + "balance_loss_mlp": 0.01265546, + "epoch": 0.21115286336990832, + "flos": 17825927857920.0, + "grad_norm": 2.4490749473958577, + "language_loss": 0.70309734, + "learning_rate": 3.665517685689794e-06, + "loss": 0.78152132, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.23071289, + "step": 3512, + "time_per_iteration": 2.5178020000457764 + }, + { + "auxiliary_loss_clip": 0.06542063, + "auxiliary_loss_mlp": 0.01280138, + "balance_loss_clip": 0.06299283, + "balance_loss_mlp": 0.01257739, + "epoch": 0.2112129866225763, + "flos": 27205228085760.0, + "grad_norm": 1.580176351931222, + "language_loss": 0.73930323, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81752527, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22412109, + "step": 3513, + "time_per_iteration": 2.62662410736084 + }, + { + "auxiliary_loss_clip": 0.06537203, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 0.06301522, + "balance_loss_mlp": 0.01260303, + "epoch": 0.21127310987524425, + "flos": 23737846279680.0, + "grad_norm": 1.7494748899805272, + "language_loss": 0.75353736, + "learning_rate": 3.665086319450502e-06, + "loss": 0.8317222, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20983887, + "step": 3514, + "time_per_iteration": 2.584502696990967 + }, + { + "auxiliary_loss_clip": 0.06546184, + "auxiliary_loss_mlp": 0.01281455, + "balance_loss_clip": 0.06301809, + "balance_loss_mlp": 0.01261309, + "epoch": 0.21133323312791222, + "flos": 18338356702080.0, + "grad_norm": 1.6761924057980855, + "language_loss": 0.77322358, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.85149997, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20141602, + "step": 3515, + "time_per_iteration": 2.552231550216675 + }, + { + "auxiliary_loss_clip": 0.06544478, + "auxiliary_loss_mlp": 0.0128088, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.01260865, + "epoch": 0.21139335638058018, + "flos": 17936994844800.0, + "grad_norm": 2.0687526262765212, + "language_loss": 0.69083852, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.76909214, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19995117, + "step": 3516, + "time_per_iteration": 2.535282611846924 + }, + { + "auxiliary_loss_clip": 0.0654862, + "auxiliary_loss_mlp": 0.01279905, + "balance_loss_clip": 0.0630609, + "balance_loss_mlp": 0.01257756, + "epoch": 0.21145347963324815, + "flos": 24579073745280.0, + "grad_norm": 1.818548989117399, + "language_loss": 0.85523438, + "learning_rate": 3.664438796560225e-06, + "loss": 0.93351966, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.22155762, + "step": 3517, + "time_per_iteration": 2.5862202644348145 + }, + { + "auxiliary_loss_clip": 0.06554718, + "auxiliary_loss_mlp": 0.01280908, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01260368, + "epoch": 0.21151360288591614, + "flos": 35854787105280.0, + "grad_norm": 2.178791897783965, + "language_loss": 0.6333189, + "learning_rate": 3.664222829354512e-06, + "loss": 0.71167523, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.20532227, + "step": 3518, + "time_per_iteration": 2.6618587970733643 + }, + { + "auxiliary_loss_clip": 0.0654604, + "auxiliary_loss_mlp": 0.0129195, + "balance_loss_clip": 0.06306089, + "balance_loss_mlp": 0.01271625, + "epoch": 0.2115737261385841, + "flos": 24647989328640.0, + "grad_norm": 1.8588369306942552, + "language_loss": 0.90024757, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97862744, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20336914, + "step": 3519, + "time_per_iteration": 2.5962281227111816 + }, + { + "auxiliary_loss_clip": 0.06553498, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01268945, + "epoch": 0.21163384939125207, + "flos": 25233652241280.0, + "grad_norm": 1.74321759448714, + "language_loss": 0.81933582, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.89777905, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.21862793, + "step": 3520, + "time_per_iteration": 2.6036746501922607 + }, + { + "auxiliary_loss_clip": 0.06544603, + "auxiliary_loss_mlp": 0.0127827, + "balance_loss_clip": 0.0630887, + "balance_loss_mlp": 0.01257576, + "epoch": 0.21169397264392004, + "flos": 26074670071680.0, + "grad_norm": 1.5989262406015683, + "language_loss": 0.76731956, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.84554833, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20690918, + "step": 3521, + "time_per_iteration": 2.613945960998535 + }, + { + "auxiliary_loss_clip": 0.06548078, + "auxiliary_loss_mlp": 0.01281462, + "balance_loss_clip": 0.06310651, + "balance_loss_mlp": 0.01261364, + "epoch": 0.211754095896588, + "flos": 23114266594560.0, + "grad_norm": 2.104686387571933, + "language_loss": 0.75886559, + "learning_rate": 3.663358329538626e-06, + "loss": 0.83716094, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.20092773, + "step": 3522, + "time_per_iteration": 2.530388355255127 + }, + { + "auxiliary_loss_clip": 0.06550008, + "auxiliary_loss_mlp": 0.01276271, + "balance_loss_clip": 0.06309568, + "balance_loss_mlp": 0.01255994, + "epoch": 0.21181421914925597, + "flos": 27928806019200.0, + "grad_norm": 2.55069435165465, + "language_loss": 0.71218652, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79044926, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.20288086, + "step": 3523, + "time_per_iteration": 2.6448264122009277 + }, + { + "auxiliary_loss_clip": 0.06544726, + "auxiliary_loss_mlp": 0.01276969, + "balance_loss_clip": 0.06308427, + "balance_loss_mlp": 0.01256191, + "epoch": 0.21187434240192393, + "flos": 17134313057280.0, + "grad_norm": 2.0846198886990566, + "language_loss": 0.77930927, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8575263, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20788574, + "step": 3524, + "time_per_iteration": 2.527096748352051 + }, + { + "auxiliary_loss_clip": 0.06557429, + "auxiliary_loss_mlp": 0.01277075, + "balance_loss_clip": 0.0631334, + "balance_loss_mlp": 0.01255045, + "epoch": 0.21193446565459192, + "flos": 22354071626880.0, + "grad_norm": 2.138137470282545, + "language_loss": 0.82111794, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89946306, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22033691, + "step": 3525, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.06547971, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 0.06308704, + "balance_loss_mlp": 0.01254519, + "epoch": 0.2119945889072599, + "flos": 27206779386240.0, + "grad_norm": 1.7514877674009408, + "language_loss": 0.75671291, + "learning_rate": 3.662492820527356e-06, + "loss": 0.83494115, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20324707, + "step": 3526, + "time_per_iteration": 2.56286883354187 + }, + { + "auxiliary_loss_clip": 0.06556675, + "auxiliary_loss_mlp": 0.01279028, + "balance_loss_clip": 0.0631361, + "balance_loss_mlp": 0.01258107, + "epoch": 0.21205471215992786, + "flos": 20997480424320.0, + "grad_norm": 1.9989732630407808, + "language_loss": 0.77276337, + "learning_rate": 3.662276285649284e-06, + "loss": 0.85112035, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.20910645, + "step": 3527, + "time_per_iteration": 2.7162973880767822 + }, + { + "auxiliary_loss_clip": 0.06551696, + "auxiliary_loss_mlp": 0.01279873, + "balance_loss_clip": 0.06314081, + "balance_loss_mlp": 0.01258224, + "epoch": 0.21211483541259582, + "flos": 20784025347840.0, + "grad_norm": 2.0427089539116783, + "language_loss": 0.78184944, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86016512, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21643066, + "step": 3528, + "time_per_iteration": 3.990530490875244 + }, + { + "auxiliary_loss_clip": 0.06551792, + "auxiliary_loss_mlp": 0.01277875, + "balance_loss_clip": 0.06313196, + "balance_loss_mlp": 0.01257025, + "epoch": 0.21217495866526379, + "flos": 18996079726080.0, + "grad_norm": 1.942993331862389, + "language_loss": 0.82054245, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89883912, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20861816, + "step": 3529, + "time_per_iteration": 2.564383029937744 + }, + { + "auxiliary_loss_clip": 0.06555474, + "auxiliary_loss_mlp": 0.01278138, + "balance_loss_clip": 0.06313926, + "balance_loss_mlp": 0.01257134, + "epoch": 0.21223508191793175, + "flos": 20673503412480.0, + "grad_norm": 2.2777790477523236, + "language_loss": 0.77694297, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.85527909, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21008301, + "step": 3530, + "time_per_iteration": 2.576662540435791 + }, + { + "auxiliary_loss_clip": 0.06550869, + "auxiliary_loss_mlp": 0.01274157, + "balance_loss_clip": 0.06314521, + "balance_loss_mlp": 0.01254106, + "epoch": 0.21229520517059972, + "flos": 21622904899200.0, + "grad_norm": 2.3150689342230644, + "language_loss": 0.83926791, + "learning_rate": 3.661409515882308e-06, + "loss": 0.91751814, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20043945, + "step": 3531, + "time_per_iteration": 4.092180252075195 + }, + { + "auxiliary_loss_clip": 0.06553733, + "auxiliary_loss_mlp": 0.01280648, + "balance_loss_clip": 0.06313696, + "balance_loss_mlp": 0.0125888, + "epoch": 0.2123553284232677, + "flos": 13996232997120.0, + "grad_norm": 2.2553338764718145, + "language_loss": 0.74256229, + "learning_rate": 3.661192665917977e-06, + "loss": 0.82090604, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21777344, + "step": 3532, + "time_per_iteration": 2.5215070247650146 + }, + { + "auxiliary_loss_clip": 0.06549011, + "auxiliary_loss_mlp": 0.01276957, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.01255714, + "epoch": 0.21241545167593567, + "flos": 18302745916800.0, + "grad_norm": 1.8963653738624293, + "language_loss": 0.74378759, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82204729, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21252441, + "step": 3533, + "time_per_iteration": 2.5286645889282227 + }, + { + "auxiliary_loss_clip": 0.06554842, + "auxiliary_loss_mlp": 0.01279741, + "balance_loss_clip": 0.06312128, + "balance_loss_mlp": 0.01257341, + "epoch": 0.21247557492860364, + "flos": 34721461906560.0, + "grad_norm": 1.8118406193913599, + "language_loss": 0.71620667, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79455251, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22399902, + "step": 3534, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.06548804, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01262586, + "epoch": 0.2125356981812716, + "flos": 22060254885120.0, + "grad_norm": 2.3502862502903046, + "language_loss": 0.72866982, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.80699402, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21032715, + "step": 3535, + "time_per_iteration": 2.5843448638916016 + }, + { + "auxiliary_loss_clip": 0.06546953, + "auxiliary_loss_mlp": 0.01279722, + "balance_loss_clip": 0.06307133, + "balance_loss_mlp": 0.01257621, + "epoch": 0.21259582143393957, + "flos": 28555865648640.0, + "grad_norm": 2.199655139190772, + "language_loss": 0.70759106, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7858578, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22106934, + "step": 3536, + "time_per_iteration": 4.056318998336792 + }, + { + "auxiliary_loss_clip": 0.06557733, + "auxiliary_loss_mlp": 0.01286072, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.0126415, + "epoch": 0.21265594468660753, + "flos": 20127140864640.0, + "grad_norm": 2.2134041941920897, + "language_loss": 0.8820163, + "learning_rate": 3.660107471371981e-06, + "loss": 0.96045434, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.21911621, + "step": 3537, + "time_per_iteration": 2.6233468055725098 + }, + { + "auxiliary_loss_clip": 0.06541121, + "auxiliary_loss_mlp": 0.01278147, + "balance_loss_clip": 0.06304413, + "balance_loss_mlp": 0.01256094, + "epoch": 0.21271606793927553, + "flos": 23082890440320.0, + "grad_norm": 1.7848498720134809, + "language_loss": 0.81086004, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88905263, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22058105, + "step": 3538, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.06545715, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 0.06305592, + "balance_loss_mlp": 0.01263981, + "epoch": 0.2127761911919435, + "flos": 26394118963200.0, + "grad_norm": 2.023826748108625, + "language_loss": 0.87817419, + "learning_rate": 3.659672952835863e-06, + "loss": 0.95646858, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.19763184, + "step": 3539, + "time_per_iteration": 2.6115527153015137 + }, + { + "auxiliary_loss_clip": 0.06554011, + "auxiliary_loss_mlp": 0.01284638, + "balance_loss_clip": 0.06309317, + "balance_loss_mlp": 0.01264277, + "epoch": 0.21283631444461146, + "flos": 20234182855680.0, + "grad_norm": 3.1687626880856667, + "language_loss": 0.59144789, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66983438, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20361328, + "step": 3540, + "time_per_iteration": 2.525139570236206 + }, + { + "auxiliary_loss_clip": 0.06543202, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.0630211, + "balance_loss_mlp": 0.01256557, + "epoch": 0.21289643769727942, + "flos": 13522140195840.0, + "grad_norm": 1.940296770056649, + "language_loss": 0.7721082, + "learning_rate": 3.659238182559888e-06, + "loss": 0.85032547, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21972656, + "step": 3541, + "time_per_iteration": 2.563164234161377 + }, + { + "auxiliary_loss_clip": 0.06542824, + "auxiliary_loss_mlp": 0.01283205, + "balance_loss_clip": 0.06305471, + "balance_loss_mlp": 0.01262486, + "epoch": 0.2129565609499474, + "flos": 24833967465600.0, + "grad_norm": 1.7979798329536472, + "language_loss": 0.69596064, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.77422094, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20703125, + "step": 3542, + "time_per_iteration": 2.6213386058807373 + }, + { + "auxiliary_loss_clip": 0.06542216, + "auxiliary_loss_mlp": 0.01284362, + "balance_loss_clip": 0.0630642, + "balance_loss_mlp": 0.01264692, + "epoch": 0.21301668420261535, + "flos": 23665953876480.0, + "grad_norm": 1.8238030340304547, + "language_loss": 0.77012485, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84839058, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19677734, + "step": 3543, + "time_per_iteration": 2.5654232501983643 + }, + { + "auxiliary_loss_clip": 0.0654586, + "auxiliary_loss_mlp": 0.01282767, + "balance_loss_clip": 0.0630815, + "balance_loss_mlp": 0.01261488, + "epoch": 0.21307680745528332, + "flos": 16368416012160.0, + "grad_norm": 2.0315626098903468, + "language_loss": 0.67305464, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75134087, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2130127, + "step": 3544, + "time_per_iteration": 2.513288736343384 + }, + { + "auxiliary_loss_clip": 0.06542834, + "auxiliary_loss_mlp": 0.01284, + "balance_loss_clip": 0.06304078, + "balance_loss_mlp": 0.01264223, + "epoch": 0.2131369307079513, + "flos": 19105092288000.0, + "grad_norm": 1.7034786511890583, + "language_loss": 0.71322483, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.19763184, + "step": 3545, + "time_per_iteration": 2.5347442626953125 + }, + { + "auxiliary_loss_clip": 0.06549121, + "auxiliary_loss_mlp": 0.01288311, + "balance_loss_clip": 0.06306408, + "balance_loss_mlp": 0.01268224, + "epoch": 0.21319705396061928, + "flos": 30380050961280.0, + "grad_norm": 2.304335172733059, + "language_loss": 0.73178399, + "learning_rate": 3.658150155940946e-06, + "loss": 0.81015837, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.20092773, + "step": 3546, + "time_per_iteration": 2.6647720336914062 + }, + { + "auxiliary_loss_clip": 0.0655164, + "auxiliary_loss_mlp": 0.01278696, + "balance_loss_clip": 0.06310475, + "balance_loss_mlp": 0.01258609, + "epoch": 0.21325717721328724, + "flos": 21761616533760.0, + "grad_norm": 1.9338253687785023, + "language_loss": 0.81206107, + "learning_rate": 3.657932361952479e-06, + "loss": 0.89036447, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20092773, + "step": 3547, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06547703, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 0.06302875, + "balance_loss_mlp": 0.01259127, + "epoch": 0.2133173004659552, + "flos": 28738447695360.0, + "grad_norm": 3.206018032759459, + "language_loss": 0.74960929, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22521973, + "step": 3548, + "time_per_iteration": 2.605151414871216 + }, + { + "auxiliary_loss_clip": 0.06554648, + "auxiliary_loss_mlp": 0.01281207, + "balance_loss_clip": 0.06309359, + "balance_loss_mlp": 0.01259236, + "epoch": 0.21337742371862317, + "flos": 16842760375680.0, + "grad_norm": 2.056331081084102, + "language_loss": 0.74889886, + "learning_rate": 3.657496585376922e-06, + "loss": 0.82725745, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21972656, + "step": 3549, + "time_per_iteration": 2.518305540084839 + }, + { + "auxiliary_loss_clip": 0.06547625, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01261278, + "epoch": 0.21343754697129114, + "flos": 24431683213440.0, + "grad_norm": 1.7052192349692608, + "language_loss": 0.8095907, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88787764, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19787598, + "step": 3550, + "time_per_iteration": 2.621840715408325 + }, + { + "auxiliary_loss_clip": 0.06544942, + "auxiliary_loss_mlp": 0.01278049, + "balance_loss_clip": 0.06309815, + "balance_loss_mlp": 0.01258653, + "epoch": 0.21349767022395913, + "flos": 19283271995520.0, + "grad_norm": 1.8011583081598594, + "language_loss": 0.88582718, + "learning_rate": 3.657060557391621e-06, + "loss": 0.96405709, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.19384766, + "step": 3551, + "time_per_iteration": 2.5354909896850586 + }, + { + "auxiliary_loss_clip": 0.06541884, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06304973, + "balance_loss_mlp": 0.01256635, + "epoch": 0.2135577934766271, + "flos": 17353260576000.0, + "grad_norm": 1.8291964059748265, + "language_loss": 0.83669794, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91488564, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20275879, + "step": 3552, + "time_per_iteration": 2.5428099632263184 + }, + { + "auxiliary_loss_clip": 0.06543534, + "auxiliary_loss_mlp": 0.01282655, + "balance_loss_clip": 0.06305505, + "balance_loss_mlp": 0.01261329, + "epoch": 0.21361791672929506, + "flos": 24063416519040.0, + "grad_norm": 1.71251087169846, + "language_loss": 0.77181637, + "learning_rate": 3.656624278062713e-06, + "loss": 0.85007823, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21325684, + "step": 3553, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.06546006, + "auxiliary_loss_mlp": 0.01280965, + "balance_loss_clip": 0.06308904, + "balance_loss_mlp": 0.01260556, + "epoch": 0.21367803998196302, + "flos": 22168596614400.0, + "grad_norm": 1.6386548216082337, + "language_loss": 0.72918522, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.80745488, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20397949, + "step": 3554, + "time_per_iteration": 2.610447883605957 + }, + { + "auxiliary_loss_clip": 0.06543835, + "auxiliary_loss_mlp": 0.01296522, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01276483, + "epoch": 0.213738163234631, + "flos": 20893205617920.0, + "grad_norm": 2.167468133085416, + "language_loss": 0.6838634, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.76226699, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20043945, + "step": 3555, + "time_per_iteration": 2.6348068714141846 + }, + { + "auxiliary_loss_clip": 0.06544648, + "auxiliary_loss_mlp": 0.01283651, + "balance_loss_clip": 0.06303324, + "balance_loss_mlp": 0.01262861, + "epoch": 0.21379828648729896, + "flos": 28410739176960.0, + "grad_norm": 1.8068010568670265, + "language_loss": 0.6581043, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73638725, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.20800781, + "step": 3556, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.06542179, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.01269905, + "epoch": 0.21385840973996692, + "flos": 25486030339200.0, + "grad_norm": 1.6965425102308196, + "language_loss": 0.73263884, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20617676, + "step": 3557, + "time_per_iteration": 2.5850143432617188 + }, + { + "auxiliary_loss_clip": 0.06555384, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06310774, + "balance_loss_mlp": 0.01260814, + "epoch": 0.2139185329926349, + "flos": 28081772847360.0, + "grad_norm": 1.6861756161591135, + "language_loss": 0.67894918, + "learning_rate": 3.655532480546528e-06, + "loss": 0.75732636, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.21508789, + "step": 3558, + "time_per_iteration": 2.6937482357025146 + }, + { + "auxiliary_loss_clip": 0.06554736, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06306359, + "balance_loss_mlp": 0.0125905, + "epoch": 0.21397865624530288, + "flos": 19614628166400.0, + "grad_norm": 2.1418574307637575, + "language_loss": 0.81358159, + "learning_rate": 3.655313932676286e-06, + "loss": 0.89191854, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.19909668, + "step": 3559, + "time_per_iteration": 2.5145814418792725 + }, + { + "auxiliary_loss_clip": 0.06551723, + "auxiliary_loss_mlp": 0.01281472, + "balance_loss_clip": 0.06314635, + "balance_loss_mlp": 0.01262899, + "epoch": 0.21403877949797084, + "flos": 24688463650560.0, + "grad_norm": 1.6715073288493136, + "language_loss": 0.68710625, + "learning_rate": 3.655095322036373e-06, + "loss": 0.7654382, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.18554688, + "step": 3560, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06554615, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 0.0631121, + "balance_loss_mlp": 0.01259313, + "epoch": 0.2140989027506388, + "flos": 19866628920960.0, + "grad_norm": 1.9885830979576231, + "language_loss": 0.73618603, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81452787, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.20263672, + "step": 3561, + "time_per_iteration": 2.5286123752593994 + }, + { + "auxiliary_loss_clip": 0.06553814, + "auxiliary_loss_mlp": 0.01282143, + "balance_loss_clip": 0.06311779, + "balance_loss_mlp": 0.01262402, + "epoch": 0.21415902600330677, + "flos": 19141331978880.0, + "grad_norm": 2.350872095274855, + "language_loss": 0.78756285, + "learning_rate": 3.654657912480698e-06, + "loss": 0.86592233, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.19763184, + "step": 3562, + "time_per_iteration": 2.608041286468506 + }, + { + "auxiliary_loss_clip": 0.06546983, + "auxiliary_loss_mlp": 0.01281911, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01261788, + "epoch": 0.21421914925597474, + "flos": 22279076622720.0, + "grad_norm": 1.5018972458321598, + "language_loss": 0.85257983, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.93086874, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20117188, + "step": 3563, + "time_per_iteration": 2.5593912601470947 + }, + { + "auxiliary_loss_clip": 0.06548097, + "auxiliary_loss_mlp": 0.01281509, + "balance_loss_clip": 0.06308593, + "balance_loss_mlp": 0.01262531, + "epoch": 0.2142792725086427, + "flos": 33883504750080.0, + "grad_norm": 1.9248219523503745, + "language_loss": 0.76925778, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.84755385, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.18981934, + "step": 3564, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.06542072, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305549, + "balance_loss_mlp": 0.01261181, + "epoch": 0.2143393957613107, + "flos": 19865538817920.0, + "grad_norm": 1.690691453330226, + "language_loss": 0.89139843, + "learning_rate": 3.654001327581981e-06, + "loss": 0.9696207, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.18969727, + "step": 3565, + "time_per_iteration": 2.660306215286255 + }, + { + "auxiliary_loss_clip": 0.06436334, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 0.06303974, + "balance_loss_mlp": 0.01279924, + "epoch": 0.21439951901397866, + "flos": 68549300017920.0, + "grad_norm": 0.8225285981700966, + "language_loss": 0.52211988, + "learning_rate": 3.653782340498215e-06, + "loss": 0.59934968, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.06738281, + "step": 3566, + "time_per_iteration": 3.0845720767974854 + }, + { + "auxiliary_loss_clip": 0.06539588, + "auxiliary_loss_mlp": 0.01284533, + "balance_loss_clip": 0.06306818, + "balance_loss_mlp": 0.0126478, + "epoch": 0.21445964226664663, + "flos": 19689161973120.0, + "grad_norm": 1.8060006281631265, + "language_loss": 0.68295264, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.76119387, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19775391, + "step": 3567, + "time_per_iteration": 2.5250415802001953 + }, + { + "auxiliary_loss_clip": 0.06543978, + "auxiliary_loss_mlp": 0.01283364, + "balance_loss_clip": 0.06310168, + "balance_loss_mlp": 0.012641, + "epoch": 0.2145197655193146, + "flos": 31116039298560.0, + "grad_norm": 2.0548954423707753, + "language_loss": 0.75150776, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.82978123, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19250488, + "step": 3568, + "time_per_iteration": 4.018412113189697 + }, + { + "auxiliary_loss_clip": 0.06538366, + "auxiliary_loss_mlp": 0.01282205, + "balance_loss_clip": 0.063043, + "balance_loss_mlp": 0.01261773, + "epoch": 0.21457988877198256, + "flos": 20127015083520.0, + "grad_norm": 2.3975687399079284, + "language_loss": 0.78487438, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.86308008, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20446777, + "step": 3569, + "time_per_iteration": 2.6051042079925537 + }, + { + "auxiliary_loss_clip": 0.06554128, + "auxiliary_loss_mlp": 0.01283223, + "balance_loss_clip": 0.06309038, + "balance_loss_mlp": 0.01262183, + "epoch": 0.21464001202465052, + "flos": 18593963182080.0, + "grad_norm": 2.5916710851503173, + "language_loss": 0.7048617, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.78323519, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21032715, + "step": 3570, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.06548594, + "auxiliary_loss_mlp": 0.01293921, + "balance_loss_clip": 0.06305287, + "balance_loss_mlp": 0.01274621, + "epoch": 0.21470013527731852, + "flos": 21841600855680.0, + "grad_norm": 3.519297534980699, + "language_loss": 0.79412138, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.87254649, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.19311523, + "step": 3571, + "time_per_iteration": 3.984830141067505 + }, + { + "auxiliary_loss_clip": 0.06547887, + "auxiliary_loss_mlp": 0.01283536, + "balance_loss_clip": 0.06306981, + "balance_loss_mlp": 0.01263413, + "epoch": 0.21476025852998648, + "flos": 17608992837120.0, + "grad_norm": 2.1137138833129114, + "language_loss": 0.83417559, + "learning_rate": 3.652467101342991e-06, + "loss": 0.91248989, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20117188, + "step": 3572, + "time_per_iteration": 2.550900459289551 + }, + { + "auxiliary_loss_clip": 0.06544446, + "auxiliary_loss_mlp": 0.01290796, + "balance_loss_clip": 0.06300403, + "balance_loss_mlp": 0.01271114, + "epoch": 0.21482038178265445, + "flos": 24835267203840.0, + "grad_norm": 5.91831897424108, + "language_loss": 0.6534397, + "learning_rate": 3.652247675452598e-06, + "loss": 0.73179209, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.19677734, + "step": 3573, + "time_per_iteration": 2.574037551879883 + }, + { + "auxiliary_loss_clip": 0.06536618, + "auxiliary_loss_mlp": 0.01287357, + "balance_loss_clip": 0.06305118, + "balance_loss_mlp": 0.0126814, + "epoch": 0.2148805050353224, + "flos": 23264927435520.0, + "grad_norm": 1.8228372560216166, + "language_loss": 0.76129293, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83953267, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.1920166, + "step": 3574, + "time_per_iteration": 2.610541343688965 + }, + { + "auxiliary_loss_clip": 0.06537417, + "auxiliary_loss_mlp": 0.01280783, + "balance_loss_clip": 0.06298707, + "balance_loss_mlp": 0.0126066, + "epoch": 0.21494062828799038, + "flos": 21326907951360.0, + "grad_norm": 2.0935140233911644, + "language_loss": 0.72909325, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.8072753, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.20117188, + "step": 3575, + "time_per_iteration": 2.581932306289673 + }, + { + "auxiliary_loss_clip": 0.06537387, + "auxiliary_loss_mlp": 0.01288909, + "balance_loss_clip": 0.06302074, + "balance_loss_mlp": 0.01269657, + "epoch": 0.21500075154065834, + "flos": 18849276172800.0, + "grad_norm": 2.2103119968131986, + "language_loss": 0.6923548, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.77061772, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.19262695, + "step": 3576, + "time_per_iteration": 5.394233703613281 + }, + { + "auxiliary_loss_clip": 0.06547244, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 0.06304461, + "balance_loss_mlp": 0.0126069, + "epoch": 0.2150608747933263, + "flos": 18447872388480.0, + "grad_norm": 1.9274083971527407, + "language_loss": 0.89371777, + "learning_rate": 3.651369345440292e-06, + "loss": 0.97201031, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21337891, + "step": 3577, + "time_per_iteration": 2.5629777908325195 + }, + { + "auxiliary_loss_clip": 0.06425267, + "auxiliary_loss_mlp": 0.01303124, + "balance_loss_clip": 0.06298774, + "balance_loss_mlp": 0.01297548, + "epoch": 0.2151209980459943, + "flos": 66617443808640.0, + "grad_norm": 0.7978427219987446, + "language_loss": 0.56304139, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.64032531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.05581665, + "step": 3578, + "time_per_iteration": 3.0982370376586914 + }, + { + "auxiliary_loss_clip": 0.06546376, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 0.06309081, + "balance_loss_mlp": 0.0126729, + "epoch": 0.21518112129866226, + "flos": 21581633963520.0, + "grad_norm": 1.7619248126111737, + "language_loss": 0.89097106, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.19555664, + "step": 3579, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.06544919, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06304899, + "balance_loss_mlp": 0.01260498, + "epoch": 0.21524124455133023, + "flos": 20053822942080.0, + "grad_norm": 1.8548300822509616, + "language_loss": 0.78671825, + "learning_rate": 3.650709940390972e-06, + "loss": 0.86497748, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20507812, + "step": 3580, + "time_per_iteration": 2.538740634918213 + }, + { + "auxiliary_loss_clip": 0.06547832, + "auxiliary_loss_mlp": 0.01284221, + "balance_loss_clip": 0.06311843, + "balance_loss_mlp": 0.01265279, + "epoch": 0.2153013678039982, + "flos": 23958680515200.0, + "grad_norm": 2.0040984242528905, + "language_loss": 0.73520374, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.81352425, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.18933105, + "step": 3581, + "time_per_iteration": 2.5783493518829346 + }, + { + "auxiliary_loss_clip": 0.06544261, + "auxiliary_loss_mlp": 0.01283002, + "balance_loss_clip": 0.06307264, + "balance_loss_mlp": 0.01262438, + "epoch": 0.21536149105666616, + "flos": 20601107884800.0, + "grad_norm": 2.9043222851567574, + "language_loss": 0.71477044, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.79304302, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20556641, + "step": 3582, + "time_per_iteration": 2.5253281593322754 + }, + { + "auxiliary_loss_clip": 0.06553562, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06315581, + "balance_loss_mlp": 0.01262209, + "epoch": 0.21542161430933413, + "flos": 12865046077440.0, + "grad_norm": 2.5916269023447795, + "language_loss": 0.85900396, + "learning_rate": 3.650049971985889e-06, + "loss": 0.93736756, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20568848, + "step": 3583, + "time_per_iteration": 2.580411434173584 + }, + { + "auxiliary_loss_clip": 0.0655268, + "auxiliary_loss_mlp": 0.01295505, + "balance_loss_clip": 0.06312086, + "balance_loss_mlp": 0.01275561, + "epoch": 0.21548173756200212, + "flos": 26111077470720.0, + "grad_norm": 2.720923149453336, + "language_loss": 0.83510441, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.91358626, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19934082, + "step": 3584, + "time_per_iteration": 2.587843179702759 + }, + { + "auxiliary_loss_clip": 0.06549002, + "auxiliary_loss_mlp": 0.01288111, + "balance_loss_clip": 0.06314336, + "balance_loss_mlp": 0.01267667, + "epoch": 0.21554186081467008, + "flos": 22170315623040.0, + "grad_norm": 2.7712372256622357, + "language_loss": 0.91010725, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9884783, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20446777, + "step": 3585, + "time_per_iteration": 2.5638017654418945 + }, + { + "auxiliary_loss_clip": 0.06548285, + "auxiliary_loss_mlp": 0.0129374, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.012745, + "epoch": 0.21560198406733805, + "flos": 22973458608000.0, + "grad_norm": 2.0799258962001548, + "language_loss": 0.75285476, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83127499, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.19238281, + "step": 3586, + "time_per_iteration": 2.5816385746002197 + }, + { + "auxiliary_loss_clip": 0.06560329, + "auxiliary_loss_mlp": 0.01301548, + "balance_loss_clip": 0.06317623, + "balance_loss_mlp": 0.012817, + "epoch": 0.215662107320006, + "flos": 22790708853120.0, + "grad_norm": 1.7819627104594034, + "language_loss": 0.83628035, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.91489911, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.19848633, + "step": 3587, + "time_per_iteration": 2.5768468379974365 + }, + { + "auxiliary_loss_clip": 0.06549525, + "auxiliary_loss_mlp": 0.01284104, + "balance_loss_clip": 0.06311873, + "balance_loss_mlp": 0.01265114, + "epoch": 0.21572223057267398, + "flos": 30891850899840.0, + "grad_norm": 2.819752743062096, + "language_loss": 0.764575, + "learning_rate": 3.648948773354224e-06, + "loss": 0.8429113, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.18981934, + "step": 3588, + "time_per_iteration": 2.6578357219696045 + }, + { + "auxiliary_loss_clip": 0.06557232, + "auxiliary_loss_mlp": 0.01294163, + "balance_loss_clip": 0.06316121, + "balance_loss_mlp": 0.01274494, + "epoch": 0.21578235382534194, + "flos": 26918413159680.0, + "grad_norm": 3.674353356251158, + "language_loss": 0.8181411, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89665502, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.19689941, + "step": 3589, + "time_per_iteration": 2.6730964183807373 + }, + { + "auxiliary_loss_clip": 0.06560542, + "auxiliary_loss_mlp": 0.01287343, + "balance_loss_clip": 0.06321919, + "balance_loss_mlp": 0.01267959, + "epoch": 0.2158424770780099, + "flos": 24432605608320.0, + "grad_norm": 2.119721317496626, + "language_loss": 0.73323047, + "learning_rate": 3.648507856144961e-06, + "loss": 0.81170928, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.19384766, + "step": 3590, + "time_per_iteration": 2.5885848999023438 + }, + { + "auxiliary_loss_clip": 0.06554762, + "auxiliary_loss_mlp": 0.0128494, + "balance_loss_clip": 0.06310897, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2159026003306779, + "flos": 23956542236160.0, + "grad_norm": 2.0666561712978813, + "language_loss": 0.84929311, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92769015, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20544434, + "step": 3591, + "time_per_iteration": 2.5598154067993164 + }, + { + "auxiliary_loss_clip": 0.0656037, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01272776, + "epoch": 0.21596272358334587, + "flos": 30048191665920.0, + "grad_norm": 1.8943006547331833, + "language_loss": 0.69118065, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.76972699, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.21496582, + "step": 3592, + "time_per_iteration": 2.623124599456787 + }, + { + "auxiliary_loss_clip": 0.06558264, + "auxiliary_loss_mlp": 0.01284651, + "balance_loss_clip": 0.06314576, + "balance_loss_mlp": 0.01264218, + "epoch": 0.21602284683601383, + "flos": 20382495782400.0, + "grad_norm": 3.2836833125469753, + "language_loss": 0.84947151, + "learning_rate": 3.647846011515108e-06, + "loss": 0.92790061, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2043457, + "step": 3593, + "time_per_iteration": 2.5159051418304443 + }, + { + "auxiliary_loss_clip": 0.06551524, + "auxiliary_loss_mlp": 0.01289729, + "balance_loss_clip": 0.06309479, + "balance_loss_mlp": 0.01267615, + "epoch": 0.2160829700886818, + "flos": 20783648004480.0, + "grad_norm": 2.6962087820066567, + "language_loss": 0.76424301, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.84265554, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.22119141, + "step": 3594, + "time_per_iteration": 2.530874490737915 + }, + { + "auxiliary_loss_clip": 0.06549954, + "auxiliary_loss_mlp": 0.01280574, + "balance_loss_clip": 0.06313863, + "balance_loss_mlp": 0.01260189, + "epoch": 0.21614309334134976, + "flos": 22316322562560.0, + "grad_norm": 1.5622924015328905, + "language_loss": 0.80828846, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.88659382, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20385742, + "step": 3595, + "time_per_iteration": 2.5720436573028564 + }, + { + "auxiliary_loss_clip": 0.0655812, + "auxiliary_loss_mlp": 0.01282788, + "balance_loss_clip": 0.06310599, + "balance_loss_mlp": 0.01261962, + "epoch": 0.21620321659401773, + "flos": 19615592488320.0, + "grad_norm": 2.071968351759389, + "language_loss": 0.79120421, + "learning_rate": 3.647183604506897e-06, + "loss": 0.86961329, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.20825195, + "step": 3596, + "time_per_iteration": 2.529978036880493 + }, + { + "auxiliary_loss_clip": 0.06547653, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 0.06309111, + "balance_loss_mlp": 0.01258615, + "epoch": 0.2162633398466857, + "flos": 18850701692160.0, + "grad_norm": 1.8098333997433065, + "language_loss": 0.83728772, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.91556245, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.2121582, + "step": 3597, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.06559294, + "auxiliary_loss_mlp": 0.01284022, + "balance_loss_clip": 0.06315802, + "balance_loss_mlp": 0.01262421, + "epoch": 0.21632346309935369, + "flos": 18774490803840.0, + "grad_norm": 2.0845397374343655, + "language_loss": 0.81213892, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.89057213, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21606445, + "step": 3598, + "time_per_iteration": 2.517596960067749 + }, + { + "auxiliary_loss_clip": 0.06554621, + "auxiliary_loss_mlp": 0.01287936, + "balance_loss_clip": 0.06312433, + "balance_loss_mlp": 0.01265072, + "epoch": 0.21638358635202165, + "flos": 26331576289920.0, + "grad_norm": 1.6266226591192001, + "language_loss": 0.82318664, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.90161228, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22851562, + "step": 3599, + "time_per_iteration": 2.567528486251831 + }, + { + "auxiliary_loss_clip": 0.06553015, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06314674, + "balance_loss_mlp": 0.01263107, + "epoch": 0.21644370960468962, + "flos": 20747156751360.0, + "grad_norm": 2.0891036476830585, + "language_loss": 0.76652539, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.84490293, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21618652, + "step": 3600, + "time_per_iteration": 2.5642178058624268 + }, + { + "auxiliary_loss_clip": 0.06555548, + "auxiliary_loss_mlp": 0.01287253, + "balance_loss_clip": 0.06316924, + "balance_loss_mlp": 0.01267512, + "epoch": 0.21650383285735758, + "flos": 23959183639680.0, + "grad_norm": 1.8375873098897355, + "language_loss": 0.80812716, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.88655519, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.19726562, + "step": 3601, + "time_per_iteration": 2.536790132522583 + }, + { + "auxiliary_loss_clip": 0.06558496, + "auxiliary_loss_mlp": 0.01286287, + "balance_loss_clip": 0.06317312, + "balance_loss_mlp": 0.01265783, + "epoch": 0.21656395611002555, + "flos": 23702864400000.0, + "grad_norm": 1.8593805820505158, + "language_loss": 0.84205902, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2052002, + "step": 3602, + "time_per_iteration": 2.5919816493988037 + }, + { + "auxiliary_loss_clip": 0.06553967, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06313825, + "balance_loss_mlp": 0.01265371, + "epoch": 0.2166240793626935, + "flos": 20672035966080.0, + "grad_norm": 1.6537912100509087, + "language_loss": 0.75107038, + "learning_rate": 3.645635802397693e-06, + "loss": 0.82946962, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.20581055, + "step": 3603, + "time_per_iteration": 2.5602827072143555 + }, + { + "auxiliary_loss_clip": 0.06545025, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06314509, + "balance_loss_mlp": 0.0125996, + "epoch": 0.2166842026153615, + "flos": 21586916770560.0, + "grad_norm": 1.9607230977514314, + "language_loss": 0.75016356, + "learning_rate": 3.645414438132855e-06, + "loss": 0.82841063, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.1973877, + "step": 3604, + "time_per_iteration": 2.7099287509918213 + }, + { + "auxiliary_loss_clip": 0.06550605, + "auxiliary_loss_mlp": 0.01283396, + "balance_loss_clip": 0.06315283, + "balance_loss_mlp": 0.01263881, + "epoch": 0.21674432586802947, + "flos": 25637068523520.0, + "grad_norm": 1.5948705207891358, + "language_loss": 0.80732697, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.88566697, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19506836, + "step": 3605, + "time_per_iteration": 2.601269483566284 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01314575, + "balance_loss_clip": 0.0633797, + "balance_loss_mlp": 0.01307596, + "epoch": 0.21680444912069743, + "flos": 56435126376960.0, + "grad_norm": 0.68181157035555, + "language_loss": 0.58316016, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.66095698, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.06994629, + "step": 3606, + "time_per_iteration": 3.2531886100769043 + }, + { + "auxiliary_loss_clip": 0.06547002, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 0.06303971, + "balance_loss_mlp": 0.01264502, + "epoch": 0.2168645723733654, + "flos": 23885823790080.0, + "grad_norm": 1.8693102201830953, + "language_loss": 0.73682618, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22155762, + "step": 3607, + "time_per_iteration": 4.0285868644714355 + }, + { + "auxiliary_loss_clip": 0.06548688, + "auxiliary_loss_mlp": 0.01281672, + "balance_loss_clip": 0.06306184, + "balance_loss_mlp": 0.01259595, + "epoch": 0.21692469562603336, + "flos": 16951814864640.0, + "grad_norm": 1.845726065350227, + "language_loss": 0.78116572, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85946935, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22070312, + "step": 3608, + "time_per_iteration": 2.4997665882110596 + }, + { + "auxiliary_loss_clip": 0.06549841, + "auxiliary_loss_mlp": 0.01279583, + "balance_loss_clip": 0.06307275, + "balance_loss_mlp": 0.01260248, + "epoch": 0.21698481887870133, + "flos": 25126065198720.0, + "grad_norm": 2.052249511327834, + "language_loss": 0.74638152, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.82467568, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.19335938, + "step": 3609, + "time_per_iteration": 2.5834193229675293 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 0.06301089, + "balance_loss_mlp": 0.01258221, + "epoch": 0.2170449421313693, + "flos": 17900461664640.0, + "grad_norm": 2.066668805909691, + "language_loss": 0.8888129, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96701467, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21374512, + "step": 3610, + "time_per_iteration": 3.978980302810669 + }, + { + "auxiliary_loss_clip": 0.06540407, + "auxiliary_loss_mlp": 0.01284961, + "balance_loss_clip": 0.06302356, + "balance_loss_mlp": 0.01264457, + "epoch": 0.2171050653840373, + "flos": 22645121184000.0, + "grad_norm": 2.4524698956279978, + "language_loss": 0.78034103, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.85859472, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20507812, + "step": 3611, + "time_per_iteration": 2.537783622741699 + }, + { + "auxiliary_loss_clip": 0.06539893, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06301216, + "balance_loss_mlp": 0.0125619, + "epoch": 0.21716518863670525, + "flos": 19506034874880.0, + "grad_norm": 1.9372172398113192, + "language_loss": 0.63866782, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71684164, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21313477, + "step": 3612, + "time_per_iteration": 2.5200283527374268 + }, + { + "auxiliary_loss_clip": 0.06543254, + "auxiliary_loss_mlp": 0.01280194, + "balance_loss_clip": 0.06303414, + "balance_loss_mlp": 0.01259761, + "epoch": 0.21722531188937322, + "flos": 19798132608000.0, + "grad_norm": 1.7866878621114652, + "language_loss": 0.76463711, + "learning_rate": 3.643419353014776e-06, + "loss": 0.84287155, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2043457, + "step": 3613, + "time_per_iteration": 2.536395311355591 + }, + { + "auxiliary_loss_clip": 0.06540725, + "auxiliary_loss_mlp": 0.01277778, + "balance_loss_clip": 0.06303174, + "balance_loss_mlp": 0.01256165, + "epoch": 0.21728543514204118, + "flos": 13339474295040.0, + "grad_norm": 1.8023674067133515, + "language_loss": 0.72213733, + "learning_rate": 3.643197365185261e-06, + "loss": 0.80032235, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21582031, + "step": 3614, + "time_per_iteration": 2.5000360012054443 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 0.06304483, + "balance_loss_mlp": 0.01256973, + "epoch": 0.21734555839470915, + "flos": 15237312946560.0, + "grad_norm": 2.7303590898197463, + "language_loss": 0.73928845, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81749594, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.20849609, + "step": 3615, + "time_per_iteration": 3.924616813659668 + }, + { + "auxiliary_loss_clip": 0.0654763, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_clip": 0.06301322, + "balance_loss_mlp": 0.0125694, + "epoch": 0.2174056816473771, + "flos": 19980043822080.0, + "grad_norm": 2.1391350951981467, + "language_loss": 0.913239, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.99150848, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22387695, + "step": 3616, + "time_per_iteration": 3.9379403591156006 + }, + { + "auxiliary_loss_clip": 0.06540038, + "auxiliary_loss_mlp": 0.01284656, + "balance_loss_clip": 0.06298746, + "balance_loss_mlp": 0.01263163, + "epoch": 0.21746580490004508, + "flos": 16692309169920.0, + "grad_norm": 2.057861674488091, + "language_loss": 0.81572813, + "learning_rate": 3.642531027869148e-06, + "loss": 0.89397502, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21496582, + "step": 3617, + "time_per_iteration": 2.5517330169677734 + }, + { + "auxiliary_loss_clip": 0.06543958, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06300673, + "balance_loss_mlp": 0.01258881, + "epoch": 0.21752592815271307, + "flos": 25778840832000.0, + "grad_norm": 1.7475820668036919, + "language_loss": 0.76030993, + "learning_rate": 3.642308790849329e-06, + "loss": 0.83855915, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2208252, + "step": 3618, + "time_per_iteration": 2.5874650478363037 + }, + { + "auxiliary_loss_clip": 0.06542084, + "auxiliary_loss_mlp": 0.01277743, + "balance_loss_clip": 0.06299525, + "balance_loss_mlp": 0.01255928, + "epoch": 0.21758605140538104, + "flos": 11259430940160.0, + "grad_norm": 1.9309868599682727, + "language_loss": 0.69592559, + "learning_rate": 3.642086491552996e-06, + "loss": 0.77412391, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21826172, + "step": 3619, + "time_per_iteration": 2.5259079933166504 + }, + { + "auxiliary_loss_clip": 0.06549741, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06307657, + "balance_loss_mlp": 0.01264906, + "epoch": 0.217646174658049, + "flos": 19248290115840.0, + "grad_norm": 1.6696593228851853, + "language_loss": 0.78744078, + "learning_rate": 3.641864129988579e-06, + "loss": 0.86581242, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22521973, + "step": 3620, + "time_per_iteration": 2.5225844383239746 + }, + { + "auxiliary_loss_clip": 0.06542689, + "auxiliary_loss_mlp": 0.01283495, + "balance_loss_clip": 0.06306273, + "balance_loss_mlp": 0.01263349, + "epoch": 0.21770629791071697, + "flos": 21951619666560.0, + "grad_norm": 1.6751510482296663, + "language_loss": 0.80184436, + "learning_rate": 3.641641706164509e-06, + "loss": 0.88010621, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20141602, + "step": 3621, + "time_per_iteration": 2.5528457164764404 + }, + { + "auxiliary_loss_clip": 0.0654473, + "auxiliary_loss_mlp": 0.01278712, + "balance_loss_clip": 0.06305254, + "balance_loss_mlp": 0.012594, + "epoch": 0.21776642116338493, + "flos": 24943776641280.0, + "grad_norm": 1.5217586163816694, + "language_loss": 0.87951142, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95774585, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.19299316, + "step": 3622, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.06559718, + "auxiliary_loss_mlp": 0.01277107, + "balance_loss_clip": 0.06313318, + "balance_loss_mlp": 0.01254445, + "epoch": 0.2178265444160529, + "flos": 17827017960960.0, + "grad_norm": 3.34018590012949, + "language_loss": 0.77879506, + "learning_rate": 3.641196671771152e-06, + "loss": 0.85716331, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22668457, + "step": 3623, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.0655373, + "auxiliary_loss_mlp": 0.01283267, + "balance_loss_clip": 0.06310436, + "balance_loss_mlp": 0.0126132, + "epoch": 0.2178866676687209, + "flos": 17718760085760.0, + "grad_norm": 2.118806527220675, + "language_loss": 0.85078007, + "learning_rate": 3.640974061218741e-06, + "loss": 0.92914999, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21936035, + "step": 3624, + "time_per_iteration": 2.4991443157196045 + }, + { + "auxiliary_loss_clip": 0.06544428, + "auxiliary_loss_mlp": 0.01281962, + "balance_loss_clip": 0.06301346, + "balance_loss_mlp": 0.01259014, + "epoch": 0.21794679092138886, + "flos": 16951437521280.0, + "grad_norm": 2.3785715622769357, + "language_loss": 0.7814458, + "learning_rate": 3.640751388440429e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22961426, + "step": 3625, + "time_per_iteration": 2.5113301277160645 + }, + { + "auxiliary_loss_clip": 0.06435797, + "auxiliary_loss_mlp": 0.01281105, + "balance_loss_clip": 0.0630773, + "balance_loss_mlp": 0.01275631, + "epoch": 0.21800691417405682, + "flos": 63737737413120.0, + "grad_norm": 0.7732492376258139, + "language_loss": 0.60674119, + "learning_rate": 3.64052865344466e-06, + "loss": 0.68391013, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.05477905, + "step": 3626, + "time_per_iteration": 3.230576992034912 + }, + { + "auxiliary_loss_clip": 0.06551459, + "auxiliary_loss_mlp": 0.01275255, + "balance_loss_clip": 0.06306285, + "balance_loss_mlp": 0.01252271, + "epoch": 0.21806703742672479, + "flos": 21622821045120.0, + "grad_norm": 2.0426080259896664, + "language_loss": 0.91217983, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.99044704, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22961426, + "step": 3627, + "time_per_iteration": 2.571704149246216 + }, + { + "auxiliary_loss_clip": 0.06549745, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06307864, + "balance_loss_mlp": 0.01257313, + "epoch": 0.21812716067939275, + "flos": 19361034184320.0, + "grad_norm": 1.8240036323551578, + "language_loss": 0.74830574, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82659948, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.2232666, + "step": 3628, + "time_per_iteration": 2.5547990798950195 + }, + { + "auxiliary_loss_clip": 0.06543273, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06304347, + "balance_loss_mlp": 0.01257039, + "epoch": 0.21818728393206072, + "flos": 23554467619200.0, + "grad_norm": 1.7805187473711719, + "language_loss": 0.77940357, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.85763204, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2253418, + "step": 3629, + "time_per_iteration": 2.5777294635772705 + }, + { + "auxiliary_loss_clip": 0.06540327, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.06302765, + "balance_loss_mlp": 0.01257822, + "epoch": 0.21824740718472868, + "flos": 30233289335040.0, + "grad_norm": 1.6105707802077895, + "language_loss": 0.72294879, + "learning_rate": 3.63963709145597e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20703125, + "step": 3630, + "time_per_iteration": 2.6015560626983643 + }, + { + "auxiliary_loss_clip": 0.06535304, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06303381, + "balance_loss_mlp": 0.01259364, + "epoch": 0.21830753043739667, + "flos": 26140860397440.0, + "grad_norm": 1.9295675894773927, + "language_loss": 0.77031553, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8484655, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.203125, + "step": 3631, + "time_per_iteration": 2.5712599754333496 + }, + { + "auxiliary_loss_clip": 0.06546577, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 0.06304416, + "balance_loss_mlp": 0.01265274, + "epoch": 0.21836765369006464, + "flos": 21726299237760.0, + "grad_norm": 24.58992261392957, + "language_loss": 0.76358086, + "learning_rate": 3.639190937376594e-06, + "loss": 0.84191024, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21081543, + "step": 3632, + "time_per_iteration": 2.5312108993530273 + }, + { + "auxiliary_loss_clip": 0.06541382, + "auxiliary_loss_mlp": 0.01277975, + "balance_loss_clip": 0.06306228, + "balance_loss_mlp": 0.01258008, + "epoch": 0.2184277769427326, + "flos": 19943678350080.0, + "grad_norm": 2.014902514553352, + "language_loss": 0.8455261, + "learning_rate": 3.638967767095249e-06, + "loss": 0.9237197, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19958496, + "step": 3633, + "time_per_iteration": 2.5392541885375977 + }, + { + "auxiliary_loss_clip": 0.06536385, + "auxiliary_loss_mlp": 0.01279679, + "balance_loss_clip": 0.06300621, + "balance_loss_mlp": 0.0125821, + "epoch": 0.21848790019540057, + "flos": 20346591507840.0, + "grad_norm": 2.269088705731375, + "language_loss": 0.82069844, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.89885902, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.21484375, + "step": 3634, + "time_per_iteration": 2.5536303520202637 + }, + { + "auxiliary_loss_clip": 0.06544928, + "auxiliary_loss_mlp": 0.01275115, + "balance_loss_clip": 0.063034, + "balance_loss_mlp": 0.01254063, + "epoch": 0.21854802344806853, + "flos": 15456302392320.0, + "grad_norm": 2.1744892406337133, + "language_loss": 0.75276726, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83096772, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21044922, + "step": 3635, + "time_per_iteration": 2.5158851146698 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 0.06301719, + "balance_loss_mlp": 0.01259018, + "epoch": 0.2186081467007365, + "flos": 16325384140800.0, + "grad_norm": 1.9753193728837781, + "language_loss": 0.88470638, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.96285218, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19836426, + "step": 3636, + "time_per_iteration": 2.5056772232055664 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01285934, + "balance_loss_clip": 0.06300446, + "balance_loss_mlp": 0.01264798, + "epoch": 0.2186682699534045, + "flos": 21695677770240.0, + "grad_norm": 1.933426681732421, + "language_loss": 0.76219505, + "learning_rate": 3.638074464556311e-06, + "loss": 0.84042412, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21118164, + "step": 3637, + "time_per_iteration": 2.5159406661987305 + }, + { + "auxiliary_loss_clip": 0.06547473, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.06303671, + "balance_loss_mlp": 0.0125726, + "epoch": 0.21872839320607246, + "flos": 17743427913600.0, + "grad_norm": 3.0066644559057867, + "language_loss": 0.90341294, + "learning_rate": 3.63785098361053e-06, + "loss": 0.98168921, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22888184, + "step": 3638, + "time_per_iteration": 2.475271224975586 + }, + { + "auxiliary_loss_clip": 0.06535378, + "auxiliary_loss_mlp": 0.01286586, + "balance_loss_clip": 0.06297417, + "balance_loss_mlp": 0.01264318, + "epoch": 0.21878851645874042, + "flos": 18656757417600.0, + "grad_norm": 3.417327747399998, + "language_loss": 0.90034223, + "learning_rate": 3.637627440557275e-06, + "loss": 0.97856188, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22265625, + "step": 3639, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.06531254, + "auxiliary_loss_mlp": 0.01281993, + "balance_loss_clip": 0.06296734, + "balance_loss_mlp": 0.01262264, + "epoch": 0.2188486397114084, + "flos": 25564463360640.0, + "grad_norm": 1.6695470201966474, + "language_loss": 0.7997371, + "learning_rate": 3.637403835405024e-06, + "loss": 0.87786961, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.19726562, + "step": 3640, + "time_per_iteration": 2.5905494689941406 + }, + { + "auxiliary_loss_clip": 0.06541579, + "auxiliary_loss_mlp": 0.01284166, + "balance_loss_clip": 0.06302525, + "balance_loss_mlp": 0.01260074, + "epoch": 0.21890876296407635, + "flos": 17897400990720.0, + "grad_norm": 8.732271245188107, + "language_loss": 0.72940969, + "learning_rate": 3.637180168162255e-06, + "loss": 0.80766714, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.24084473, + "step": 3641, + "time_per_iteration": 2.5452075004577637 + }, + { + "auxiliary_loss_clip": 0.06541288, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.06304857, + "balance_loss_mlp": 0.01259619, + "epoch": 0.21896888621674432, + "flos": 17754915922560.0, + "grad_norm": 1.8801395061290727, + "language_loss": 0.81693721, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89515489, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20874023, + "step": 3642, + "time_per_iteration": 2.5234179496765137 + }, + { + "auxiliary_loss_clip": 0.06550857, + "auxiliary_loss_mlp": 0.01284985, + "balance_loss_clip": 0.06311135, + "balance_loss_mlp": 0.01262204, + "epoch": 0.21902900946941228, + "flos": 23082890440320.0, + "grad_norm": 1.5963488152753738, + "language_loss": 0.71952182, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.79788017, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2277832, + "step": 3643, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.06535246, + "auxiliary_loss_mlp": 0.01285725, + "balance_loss_clip": 0.06298445, + "balance_loss_mlp": 0.01264506, + "epoch": 0.21908913272208028, + "flos": 48189501492480.0, + "grad_norm": 1.9271022520918928, + "language_loss": 0.69055694, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.76876664, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.21228027, + "step": 3644, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.06548485, + "auxiliary_loss_mlp": 0.01283418, + "balance_loss_clip": 0.06302129, + "balance_loss_mlp": 0.01261531, + "epoch": 0.21914925597474824, + "flos": 22243298129280.0, + "grad_norm": 2.4423330778710937, + "language_loss": 0.78728521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.86560422, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21911621, + "step": 3645, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06531754, + "auxiliary_loss_mlp": 0.01275201, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01254936, + "epoch": 0.2192093792274162, + "flos": 22131853799040.0, + "grad_norm": 1.5020846701532837, + "language_loss": 0.82847381, + "learning_rate": 3.636060900887582e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20263672, + "step": 3646, + "time_per_iteration": 2.569216012954712 + }, + { + "auxiliary_loss_clip": 0.06536786, + "auxiliary_loss_mlp": 0.01283667, + "balance_loss_clip": 0.06302559, + "balance_loss_mlp": 0.01263449, + "epoch": 0.21926950248008417, + "flos": 15674914494720.0, + "grad_norm": 1.6949719683005162, + "language_loss": 0.83080441, + "learning_rate": 3.635836861279901e-06, + "loss": 0.90900892, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20227051, + "step": 3647, + "time_per_iteration": 3.9349160194396973 + }, + { + "auxiliary_loss_clip": 0.06534994, + "auxiliary_loss_mlp": 0.01281644, + "balance_loss_clip": 0.06301765, + "balance_loss_mlp": 0.01261105, + "epoch": 0.21932962573275214, + "flos": 30270199858560.0, + "grad_norm": 1.587891801710132, + "language_loss": 0.7257458, + "learning_rate": 3.635612759641123e-06, + "loss": 0.80391216, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20532227, + "step": 3648, + "time_per_iteration": 2.6465656757354736 + }, + { + "auxiliary_loss_clip": 0.06545104, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 0.06304809, + "balance_loss_mlp": 0.01263434, + "epoch": 0.2193897489854201, + "flos": 10784751160320.0, + "grad_norm": 3.088861131276654, + "language_loss": 0.74724281, + "learning_rate": 3.635388595979745e-06, + "loss": 0.8255477, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21960449, + "step": 3649, + "time_per_iteration": 2.510040283203125 + }, + { + "auxiliary_loss_clip": 0.06531087, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 0.06299826, + "balance_loss_mlp": 0.01274752, + "epoch": 0.21944987223808807, + "flos": 19138984064640.0, + "grad_norm": 4.303407628828735, + "language_loss": 0.86915123, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94741207, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20251465, + "step": 3650, + "time_per_iteration": 3.93752384185791 + }, + { + "auxiliary_loss_clip": 0.06543732, + "auxiliary_loss_mlp": 0.01294843, + "balance_loss_clip": 0.06307691, + "balance_loss_mlp": 0.01273422, + "epoch": 0.21950999549075606, + "flos": 22717726346880.0, + "grad_norm": 2.457938069648898, + "language_loss": 0.8456791, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.92406487, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2142334, + "step": 3651, + "time_per_iteration": 2.7058322429656982 + }, + { + "auxiliary_loss_clip": 0.06539044, + "auxiliary_loss_mlp": 0.01290725, + "balance_loss_clip": 0.06304742, + "balance_loss_mlp": 0.01270257, + "epoch": 0.21957011874342403, + "flos": 10565929422720.0, + "grad_norm": 1.8310150193660448, + "language_loss": 0.74885792, + "learning_rate": 3.634715732945027e-06, + "loss": 0.82715559, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20458984, + "step": 3652, + "time_per_iteration": 2.512620210647583 + }, + { + "auxiliary_loss_clip": 0.06458014, + "auxiliary_loss_mlp": 0.01487979, + "balance_loss_clip": 0.06335165, + "balance_loss_mlp": 0.01477775, + "epoch": 0.219630241996092, + "flos": 65765105677440.0, + "grad_norm": 0.8085744951241601, + "language_loss": 0.51588702, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.59534693, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.10205078, + "step": 3653, + "time_per_iteration": 3.156705617904663 + }, + { + "auxiliary_loss_clip": 0.06532414, + "auxiliary_loss_mlp": 0.01292976, + "balance_loss_clip": 0.06300488, + "balance_loss_mlp": 0.01271685, + "epoch": 0.21969036524875996, + "flos": 23703367524480.0, + "grad_norm": 2.2498105533123467, + "language_loss": 0.7598449, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.83809876, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21289062, + "step": 3654, + "time_per_iteration": 2.5549349784851074 + }, + { + "auxiliary_loss_clip": 0.06539033, + "auxiliary_loss_mlp": 0.01287688, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265277, + "epoch": 0.21975048850142792, + "flos": 19646130101760.0, + "grad_norm": 1.856190016757107, + "language_loss": 0.72937429, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80764157, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.22412109, + "step": 3655, + "time_per_iteration": 5.397899866104126 + }, + { + "auxiliary_loss_clip": 0.06537225, + "auxiliary_loss_mlp": 0.01285968, + "balance_loss_clip": 0.06301227, + "balance_loss_mlp": 0.01265667, + "epoch": 0.21981061175409589, + "flos": 22453944094080.0, + "grad_norm": 1.6446350088012902, + "language_loss": 0.81351042, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.89174235, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20300293, + "step": 3656, + "time_per_iteration": 2.53308367729187 + }, + { + "auxiliary_loss_clip": 0.06536204, + "auxiliary_loss_mlp": 0.01286139, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01265027, + "epoch": 0.21987073500676388, + "flos": 18157032466560.0, + "grad_norm": 2.081609460517537, + "language_loss": 0.86280632, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94102979, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21105957, + "step": 3657, + "time_per_iteration": 2.5165464878082275 + }, + { + "auxiliary_loss_clip": 0.06534712, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 0.0629717, + "balance_loss_mlp": 0.01263439, + "epoch": 0.21993085825943184, + "flos": 25632666184320.0, + "grad_norm": 1.606816904846988, + "language_loss": 0.80728716, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.88547069, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20202637, + "step": 3658, + "time_per_iteration": 2.5528533458709717 + }, + { + "auxiliary_loss_clip": 0.06407537, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.0628604, + "balance_loss_mlp": 0.01250839, + "epoch": 0.2199909815120998, + "flos": 70946429621760.0, + "grad_norm": 0.7593962827668586, + "language_loss": 0.58126092, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06103516, + "step": 3659, + "time_per_iteration": 3.237276077270508 + }, + { + "auxiliary_loss_clip": 0.06524363, + "auxiliary_loss_mlp": 0.01284023, + "balance_loss_clip": 0.06293888, + "balance_loss_mlp": 0.01264091, + "epoch": 0.22005110476476777, + "flos": 21549964320000.0, + "grad_norm": 2.05919214646248, + "language_loss": 0.75117528, + "learning_rate": 3.632918704645772e-06, + "loss": 0.82925916, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19946289, + "step": 3660, + "time_per_iteration": 2.5259556770324707 + }, + { + "auxiliary_loss_clip": 0.06528022, + "auxiliary_loss_mlp": 0.01287991, + "balance_loss_clip": 0.06292684, + "balance_loss_mlp": 0.01267976, + "epoch": 0.22011122801743574, + "flos": 22061051498880.0, + "grad_norm": 2.4805712407940645, + "language_loss": 0.81579179, + "learning_rate": 3.632693797376893e-06, + "loss": 0.89395189, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.20019531, + "step": 3661, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.06527096, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.06295218, + "balance_loss_mlp": 0.01264039, + "epoch": 0.2201713512701037, + "flos": 26694811739520.0, + "grad_norm": 2.4209612671003993, + "language_loss": 0.73935246, + "learning_rate": 3.632468828196102e-06, + "loss": 0.81745958, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.19567871, + "step": 3662, + "time_per_iteration": 2.594336986541748 + }, + { + "auxiliary_loss_clip": 0.06524752, + "auxiliary_loss_mlp": 0.01286026, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01266976, + "epoch": 0.22023147452277167, + "flos": 22168470833280.0, + "grad_norm": 1.5979135918213576, + "language_loss": 0.79490995, + "learning_rate": 3.632243797111929e-06, + "loss": 0.87301779, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19042969, + "step": 3663, + "time_per_iteration": 2.6437172889709473 + }, + { + "auxiliary_loss_clip": 0.06536885, + "auxiliary_loss_mlp": 0.01285417, + "balance_loss_clip": 0.06298422, + "balance_loss_mlp": 0.01264627, + "epoch": 0.22029159777543966, + "flos": 22528981025280.0, + "grad_norm": 1.9228872111745317, + "language_loss": 0.81154871, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8897717, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20800781, + "step": 3664, + "time_per_iteration": 2.551218271255493 + }, + { + "auxiliary_loss_clip": 0.06543128, + "auxiliary_loss_mlp": 0.01279618, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257457, + "epoch": 0.22035172102810763, + "flos": 13047502343040.0, + "grad_norm": 2.388837963421245, + "language_loss": 0.77563322, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.22167969, + "step": 3665, + "time_per_iteration": 2.5317838191986084 + }, + { + "auxiliary_loss_clip": 0.06533245, + "auxiliary_loss_mlp": 0.0128412, + "balance_loss_clip": 0.06298798, + "balance_loss_mlp": 0.01263616, + "epoch": 0.2204118442807756, + "flos": 12170538311040.0, + "grad_norm": 5.328131395204355, + "language_loss": 0.98459631, + "learning_rate": 3.631568332524466e-06, + "loss": 1.06277001, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.20507812, + "step": 3666, + "time_per_iteration": 2.500293254852295 + }, + { + "auxiliary_loss_clip": 0.06531642, + "auxiliary_loss_mlp": 0.01281342, + "balance_loss_clip": 0.06297208, + "balance_loss_mlp": 0.01260767, + "epoch": 0.22047196753344356, + "flos": 40117345758720.0, + "grad_norm": 2.0087807452217143, + "language_loss": 0.81544572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20568848, + "step": 3667, + "time_per_iteration": 2.7539899349212646 + }, + { + "auxiliary_loss_clip": 0.06542197, + "auxiliary_loss_mlp": 0.0128155, + "balance_loss_clip": 0.06300189, + "balance_loss_mlp": 0.01258363, + "epoch": 0.22053209078611152, + "flos": 20706892064640.0, + "grad_norm": 2.631241235852179, + "language_loss": 0.77648765, + "learning_rate": 3.631117713439087e-06, + "loss": 0.85472512, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.23168945, + "step": 3668, + "time_per_iteration": 2.524740695953369 + }, + { + "auxiliary_loss_clip": 0.06534266, + "auxiliary_loss_mlp": 0.01279226, + "balance_loss_clip": 0.06300663, + "balance_loss_mlp": 0.01258758, + "epoch": 0.2205922140387795, + "flos": 24723026259840.0, + "grad_norm": 2.1996350177899386, + "language_loss": 0.72024125, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7983762, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.20471191, + "step": 3669, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.06540591, + "auxiliary_loss_mlp": 0.01281842, + "balance_loss_clip": 0.06304247, + "balance_loss_mlp": 0.01261398, + "epoch": 0.22065233729144745, + "flos": 23484000735360.0, + "grad_norm": 1.708018932230371, + "language_loss": 0.85830641, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.93653071, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20422363, + "step": 3670, + "time_per_iteration": 2.6102726459503174 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01279884, + "balance_loss_clip": 0.06300244, + "balance_loss_mlp": 0.01259678, + "epoch": 0.22071246054411545, + "flos": 35234268094080.0, + "grad_norm": 1.8596418583208814, + "language_loss": 0.77398729, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85218084, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20202637, + "step": 3671, + "time_per_iteration": 2.6463472843170166 + }, + { + "auxiliary_loss_clip": 0.06536315, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06302021, + "balance_loss_mlp": 0.01256934, + "epoch": 0.2207725837967834, + "flos": 18156151998720.0, + "grad_norm": 3.3605951725525807, + "language_loss": 0.81071377, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.88883519, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18896484, + "step": 3672, + "time_per_iteration": 2.522409200668335 + }, + { + "auxiliary_loss_clip": 0.06541845, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.01262086, + "epoch": 0.22083270704945138, + "flos": 20484967726080.0, + "grad_norm": 2.0276751679318905, + "language_loss": 0.74039209, + "learning_rate": 3.629990083462682e-06, + "loss": 0.8186394, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20800781, + "step": 3673, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.06537451, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258154, + "epoch": 0.22089283030211934, + "flos": 34133451079680.0, + "grad_norm": 2.1113123853963223, + "language_loss": 0.77576697, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.85393184, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2088623, + "step": 3674, + "time_per_iteration": 2.6212525367736816 + }, + { + "auxiliary_loss_clip": 0.06539989, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06303889, + "balance_loss_mlp": 0.01255349, + "epoch": 0.2209529535547873, + "flos": 18083043711360.0, + "grad_norm": 2.9913121905850213, + "language_loss": 0.7632584, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.84143209, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22033691, + "step": 3675, + "time_per_iteration": 2.529346466064453 + }, + { + "auxiliary_loss_clip": 0.06540923, + "auxiliary_loss_mlp": 0.01279311, + "balance_loss_clip": 0.06303286, + "balance_loss_mlp": 0.01258592, + "epoch": 0.22101307680745527, + "flos": 27242725587840.0, + "grad_norm": 1.8493496269427605, + "language_loss": 0.8074736, + "learning_rate": 3.629312763695772e-06, + "loss": 0.88567591, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20727539, + "step": 3676, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.06539683, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06299066, + "balance_loss_mlp": 0.01260637, + "epoch": 0.22107320006012326, + "flos": 16548566290560.0, + "grad_norm": 2.695197102889201, + "language_loss": 0.76204234, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.84025168, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2064209, + "step": 3677, + "time_per_iteration": 2.5165653228759766 + }, + { + "auxiliary_loss_clip": 0.0653778, + "auxiliary_loss_mlp": 0.01277642, + "balance_loss_clip": 0.06301221, + "balance_loss_mlp": 0.01257889, + "epoch": 0.22113332331279123, + "flos": 22061009571840.0, + "grad_norm": 1.9269573452829223, + "language_loss": 0.84673274, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92488694, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.19750977, + "step": 3678, + "time_per_iteration": 2.5460638999938965 + }, + { + "auxiliary_loss_clip": 0.06537814, + "auxiliary_loss_mlp": 0.01282989, + "balance_loss_clip": 0.06304095, + "balance_loss_mlp": 0.01262354, + "epoch": 0.2211934465654592, + "flos": 26619690954240.0, + "grad_norm": 2.1729831488916327, + "language_loss": 0.89362311, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.9718312, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20629883, + "step": 3679, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.06542142, + "auxiliary_loss_mlp": 0.01291632, + "balance_loss_clip": 0.06301068, + "balance_loss_mlp": 0.01269817, + "epoch": 0.22125356981812716, + "flos": 16365564973440.0, + "grad_norm": 3.197923457760992, + "language_loss": 0.87311327, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.95145106, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21801758, + "step": 3680, + "time_per_iteration": 2.507798433303833 + }, + { + "auxiliary_loss_clip": 0.06534758, + "auxiliary_loss_mlp": 0.01279239, + "balance_loss_clip": 0.06302372, + "balance_loss_mlp": 0.01258211, + "epoch": 0.22131369307079513, + "flos": 21657257873280.0, + "grad_norm": 1.8058433539562604, + "language_loss": 0.81643963, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89457959, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.21032715, + "step": 3681, + "time_per_iteration": 2.536559820175171 + }, + { + "auxiliary_loss_clip": 0.06530598, + "auxiliary_loss_mlp": 0.01283453, + "balance_loss_clip": 0.06302136, + "balance_loss_mlp": 0.01264344, + "epoch": 0.2213738163234631, + "flos": 19615592488320.0, + "grad_norm": 3.0843961282743138, + "language_loss": 0.80613208, + "learning_rate": 3.62795645623335e-06, + "loss": 0.88427258, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.19116211, + "step": 3682, + "time_per_iteration": 2.5523715019226074 + }, + { + "auxiliary_loss_clip": 0.06540116, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06302039, + "balance_loss_mlp": 0.01261933, + "epoch": 0.22143393957613106, + "flos": 23630217310080.0, + "grad_norm": 1.560467578099588, + "language_loss": 0.78323001, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86147785, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22729492, + "step": 3683, + "time_per_iteration": 2.563915491104126 + }, + { + "auxiliary_loss_clip": 0.06546305, + "auxiliary_loss_mlp": 0.01292128, + "balance_loss_clip": 0.06304266, + "balance_loss_mlp": 0.01270801, + "epoch": 0.22149406282879905, + "flos": 26185108152960.0, + "grad_norm": 2.3659446396904276, + "language_loss": 0.73827177, + "learning_rate": 3.627503859796234e-06, + "loss": 0.81665611, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21337891, + "step": 3684, + "time_per_iteration": 2.5829403400421143 + }, + { + "auxiliary_loss_clip": 0.06539842, + "auxiliary_loss_mlp": 0.01288295, + "balance_loss_clip": 0.06303138, + "balance_loss_mlp": 0.01266396, + "epoch": 0.221554186081467, + "flos": 14544104918400.0, + "grad_norm": 1.9346272357304948, + "language_loss": 0.81055164, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.88883299, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21899414, + "step": 3685, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.06531791, + "auxiliary_loss_mlp": 0.0128599, + "balance_loss_clip": 0.06302623, + "balance_loss_mlp": 0.01266607, + "epoch": 0.22161430933413498, + "flos": 22245059064960.0, + "grad_norm": 1.5947500054188823, + "language_loss": 0.87523818, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.95341599, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 3686, + "time_per_iteration": 4.0018064975738525 + }, + { + "auxiliary_loss_clip": 0.06530964, + "auxiliary_loss_mlp": 0.01294037, + "balance_loss_clip": 0.06297237, + "balance_loss_mlp": 0.01272198, + "epoch": 0.22167443258680294, + "flos": 23483162194560.0, + "grad_norm": 2.0272053301197186, + "language_loss": 0.78420949, + "learning_rate": 3.626824502298707e-06, + "loss": 0.86245942, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21826172, + "step": 3687, + "time_per_iteration": 2.543321132659912 + }, + { + "auxiliary_loss_clip": 0.06551681, + "auxiliary_loss_mlp": 0.01283958, + "balance_loss_clip": 0.0630649, + "balance_loss_mlp": 0.01261177, + "epoch": 0.2217345558394709, + "flos": 23227723422720.0, + "grad_norm": 1.7957197826329643, + "language_loss": 0.85492283, + "learning_rate": 3.626597926409383e-06, + "loss": 0.93327922, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.2277832, + "step": 3688, + "time_per_iteration": 2.5456702709198 + }, + { + "auxiliary_loss_clip": 0.06557921, + "auxiliary_loss_mlp": 0.01283081, + "balance_loss_clip": 0.0631456, + "balance_loss_mlp": 0.01260812, + "epoch": 0.22179467909213887, + "flos": 20017247834880.0, + "grad_norm": 1.8193279444648072, + "language_loss": 0.81821239, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89662236, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.22265625, + "step": 3689, + "time_per_iteration": 4.073091506958008 + }, + { + "auxiliary_loss_clip": 0.06540284, + "auxiliary_loss_mlp": 0.0128456, + "balance_loss_clip": 0.06304172, + "balance_loss_mlp": 0.01263269, + "epoch": 0.22185480234480687, + "flos": 19689203900160.0, + "grad_norm": 2.302195520769192, + "language_loss": 0.70934272, + "learning_rate": 3.626144589597061e-06, + "loss": 0.7875911, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2130127, + "step": 3690, + "time_per_iteration": 2.5177161693573 + }, + { + "auxiliary_loss_clip": 0.06548303, + "auxiliary_loss_mlp": 0.01286756, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01264416, + "epoch": 0.22191492559747483, + "flos": 21987817430400.0, + "grad_norm": 2.3084892961245576, + "language_loss": 0.7285862, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.80693686, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.22338867, + "step": 3691, + "time_per_iteration": 2.545271873474121 + }, + { + "auxiliary_loss_clip": 0.0654895, + "auxiliary_loss_mlp": 0.01283693, + "balance_loss_clip": 0.06313456, + "balance_loss_mlp": 0.01261771, + "epoch": 0.2219750488501428, + "flos": 23228813525760.0, + "grad_norm": 2.0680633952732195, + "language_loss": 0.71962094, + "learning_rate": 3.625691006130477e-06, + "loss": 0.79794735, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21936035, + "step": 3692, + "time_per_iteration": 2.543306350708008 + }, + { + "auxiliary_loss_clip": 0.06558576, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_clip": 0.06317012, + "balance_loss_mlp": 0.01258394, + "epoch": 0.22203517210281076, + "flos": 22459939660800.0, + "grad_norm": 1.9780142392305156, + "language_loss": 0.87528688, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95367974, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.22338867, + "step": 3693, + "time_per_iteration": 2.571045398712158 + }, + { + "auxiliary_loss_clip": 0.06534213, + "auxiliary_loss_mlp": 0.01282043, + "balance_loss_clip": 0.06303744, + "balance_loss_mlp": 0.01261122, + "epoch": 0.22209529535547873, + "flos": 17569985961600.0, + "grad_norm": 2.4004359049860824, + "language_loss": 0.86418116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.94234371, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20922852, + "step": 3694, + "time_per_iteration": 4.03299617767334 + }, + { + "auxiliary_loss_clip": 0.06554222, + "auxiliary_loss_mlp": 0.0127962, + "balance_loss_clip": 0.06307386, + "balance_loss_mlp": 0.0125815, + "epoch": 0.2221554186081467, + "flos": 21475178951040.0, + "grad_norm": 1.7692850214061204, + "language_loss": 0.69924927, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.77758765, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21472168, + "step": 3695, + "time_per_iteration": 3.989173412322998 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01283487, + "balance_loss_clip": 0.0630603, + "balance_loss_mlp": 0.01262781, + "epoch": 0.22221554186081466, + "flos": 27680956041600.0, + "grad_norm": 1.7088419756312998, + "language_loss": 0.72215462, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.80035925, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20715332, + "step": 3696, + "time_per_iteration": 2.6339590549468994 + }, + { + "auxiliary_loss_clip": 0.06543445, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06307454, + "balance_loss_mlp": 0.01257825, + "epoch": 0.22227566511348265, + "flos": 25966202561280.0, + "grad_norm": 1.8417969407055101, + "language_loss": 0.88068652, + "learning_rate": 3.624555968803217e-06, + "loss": 0.95891678, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21740723, + "step": 3697, + "time_per_iteration": 2.5599191188812256 + }, + { + "auxiliary_loss_clip": 0.06533489, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01265042, + "epoch": 0.22233578836615062, + "flos": 39213240203520.0, + "grad_norm": 2.5935528152985867, + "language_loss": 0.6687606, + "learning_rate": 3.624328776493346e-06, + "loss": 0.74694455, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.1986084, + "step": 3698, + "time_per_iteration": 2.812140703201294 + }, + { + "auxiliary_loss_clip": 0.06546268, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 0.06307642, + "balance_loss_mlp": 0.01260216, + "epoch": 0.22239591161881858, + "flos": 36292682142720.0, + "grad_norm": 1.853195446284453, + "language_loss": 0.82990527, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.90819019, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22009277, + "step": 3699, + "time_per_iteration": 2.667423725128174 + }, + { + "auxiliary_loss_clip": 0.06537004, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 0.06302205, + "balance_loss_mlp": 0.01260014, + "epoch": 0.22245603487148655, + "flos": 19725779007360.0, + "grad_norm": 1.45021308141165, + "language_loss": 0.80335897, + "learning_rate": 3.62387420709809e-06, + "loss": 0.88154227, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21313477, + "step": 3700, + "time_per_iteration": 2.5526716709136963 + }, + { + "auxiliary_loss_clip": 0.06548695, + "auxiliary_loss_mlp": 0.01279557, + "balance_loss_clip": 0.06306358, + "balance_loss_mlp": 0.01257885, + "epoch": 0.2225161581241545, + "flos": 46290950081280.0, + "grad_norm": 3.047641549556173, + "language_loss": 0.73186177, + "learning_rate": 3.623646830029943e-06, + "loss": 0.81014431, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21655273, + "step": 3701, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.06535295, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06300849, + "balance_loss_mlp": 0.01259734, + "epoch": 0.22257628137682248, + "flos": 23702990181120.0, + "grad_norm": 4.404280219854046, + "language_loss": 0.80455184, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.88270885, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20666504, + "step": 3702, + "time_per_iteration": 2.5657999515533447 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01274253, + "balance_loss_clip": 0.06297488, + "balance_loss_mlp": 0.01253331, + "epoch": 0.22263640462949044, + "flos": 19359986008320.0, + "grad_norm": 3.4101413472023405, + "language_loss": 0.78629804, + "learning_rate": 3.623191891195723e-06, + "loss": 0.86428618, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20910645, + "step": 3703, + "time_per_iteration": 2.550189971923828 + }, + { + "auxiliary_loss_clip": 0.06541737, + "auxiliary_loss_mlp": 0.01279602, + "balance_loss_clip": 0.06300878, + "balance_loss_mlp": 0.01257084, + "epoch": 0.22269652788215843, + "flos": 20782138631040.0, + "grad_norm": 2.0986231414271828, + "language_loss": 0.75210625, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.83031964, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.22509766, + "step": 3704, + "time_per_iteration": 2.5540754795074463 + }, + { + "auxiliary_loss_clip": 0.06527826, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06299336, + "balance_loss_mlp": 0.01268682, + "epoch": 0.2227566511348264, + "flos": 47969631578880.0, + "grad_norm": 1.891044771341396, + "language_loss": 0.65108556, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.72925317, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20239258, + "step": 3705, + "time_per_iteration": 2.8109097480773926 + }, + { + "auxiliary_loss_clip": 0.06438605, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.0631493, + "balance_loss_mlp": 0.012611, + "epoch": 0.22281677438749437, + "flos": 66235676607360.0, + "grad_norm": 1.322453387614222, + "language_loss": 0.65218806, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.72923827, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.05322266, + "step": 3706, + "time_per_iteration": 3.059636354446411 + }, + { + "auxiliary_loss_clip": 0.06534128, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 0.06297205, + "balance_loss_mlp": 0.01266274, + "epoch": 0.22287689764016233, + "flos": 21878050181760.0, + "grad_norm": 2.374246987916323, + "language_loss": 0.80905002, + "learning_rate": 3.622281274977141e-06, + "loss": 0.88725626, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20202637, + "step": 3707, + "time_per_iteration": 2.5891129970550537 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01280313, + "balance_loss_clip": 0.06298505, + "balance_loss_mlp": 0.01257854, + "epoch": 0.2229370208928303, + "flos": 27679824011520.0, + "grad_norm": 1.802742500055583, + "language_loss": 0.79219007, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.87031698, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2244873, + "step": 3708, + "time_per_iteration": 2.5907180309295654 + }, + { + "auxiliary_loss_clip": 0.06539932, + "auxiliary_loss_mlp": 0.01293698, + "balance_loss_clip": 0.06300655, + "balance_loss_mlp": 0.01271525, + "epoch": 0.22299714414549826, + "flos": 30162612816000.0, + "grad_norm": 1.9019649120082793, + "language_loss": 0.81583631, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.89417267, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.22167969, + "step": 3709, + "time_per_iteration": 2.658768892288208 + }, + { + "auxiliary_loss_clip": 0.06540084, + "auxiliary_loss_mlp": 0.01295766, + "balance_loss_clip": 0.0630019, + "balance_loss_mlp": 0.01274464, + "epoch": 0.22305726739816625, + "flos": 23148871130880.0, + "grad_norm": 2.9556041497723236, + "language_loss": 0.69413233, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.77249086, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 3710, + "time_per_iteration": 2.603476047515869 + }, + { + "auxiliary_loss_clip": 0.06536471, + "auxiliary_loss_mlp": 0.01286054, + "balance_loss_clip": 0.06297636, + "balance_loss_mlp": 0.01264429, + "epoch": 0.22311739065083422, + "flos": 19178116721280.0, + "grad_norm": 2.184897161331363, + "language_loss": 0.91282266, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.99104792, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.21606445, + "step": 3711, + "time_per_iteration": 2.6093854904174805 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01298084, + "balance_loss_clip": 0.06302293, + "balance_loss_mlp": 0.01275911, + "epoch": 0.22317751390350218, + "flos": 13621467611520.0, + "grad_norm": 2.3638705243519142, + "language_loss": 0.89271343, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.97108901, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.22192383, + "step": 3712, + "time_per_iteration": 2.5170199871063232 + }, + { + "auxiliary_loss_clip": 0.06530519, + "auxiliary_loss_mlp": 0.01292247, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01271481, + "epoch": 0.22323763715617015, + "flos": 11032643064960.0, + "grad_norm": 2.927785991832361, + "language_loss": 0.74880064, + "learning_rate": 3.620913505310117e-06, + "loss": 0.82702827, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2076416, + "step": 3713, + "time_per_iteration": 2.521813154220581 + }, + { + "auxiliary_loss_clip": 0.06534518, + "auxiliary_loss_mlp": 0.0130023, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.01277556, + "epoch": 0.22329776040883811, + "flos": 41360647841280.0, + "grad_norm": 2.458794372685298, + "language_loss": 0.62675929, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.70510674, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22668457, + "step": 3714, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.06529912, + "auxiliary_loss_mlp": 0.01289936, + "balance_loss_clip": 0.06295826, + "balance_loss_mlp": 0.01267906, + "epoch": 0.22335788366150608, + "flos": 25126568323200.0, + "grad_norm": 1.757427072944695, + "language_loss": 0.79499549, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.87319398, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22009277, + "step": 3715, + "time_per_iteration": 2.571711301803589 + }, + { + "auxiliary_loss_clip": 0.06527971, + "auxiliary_loss_mlp": 0.01294287, + "balance_loss_clip": 0.06293058, + "balance_loss_mlp": 0.0127302, + "epoch": 0.22341800691417404, + "flos": 16989144658560.0, + "grad_norm": 1.5961840175356918, + "language_loss": 0.77329421, + "learning_rate": 3.620228790579645e-06, + "loss": 0.85151684, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21276855, + "step": 3716, + "time_per_iteration": 2.502037286758423 + }, + { + "auxiliary_loss_clip": 0.06529684, + "auxiliary_loss_mlp": 0.0129404, + "balance_loss_clip": 0.06297298, + "balance_loss_mlp": 0.01273977, + "epoch": 0.22347813016684204, + "flos": 14141904520320.0, + "grad_norm": 2.4369226344025665, + "language_loss": 0.80004126, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.87827849, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20068359, + "step": 3717, + "time_per_iteration": 2.5208563804626465 + }, + { + "auxiliary_loss_clip": 0.065373, + "auxiliary_loss_mlp": 0.01297317, + "balance_loss_clip": 0.06298472, + "balance_loss_mlp": 0.01275215, + "epoch": 0.22353825341951, + "flos": 23589323717760.0, + "grad_norm": 2.564573329936102, + "language_loss": 0.68781847, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.76616466, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22106934, + "step": 3718, + "time_per_iteration": 2.6491305828094482 + }, + { + "auxiliary_loss_clip": 0.06536659, + "auxiliary_loss_mlp": 0.01296292, + "balance_loss_clip": 0.06298986, + "balance_loss_mlp": 0.01271187, + "epoch": 0.22359837667217797, + "flos": 29831759769600.0, + "grad_norm": 1.515297493499622, + "language_loss": 0.80957985, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88790929, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25085449, + "step": 3719, + "time_per_iteration": 2.6334550380706787 + }, + { + "auxiliary_loss_clip": 0.06540611, + "auxiliary_loss_mlp": 0.01300766, + "balance_loss_clip": 0.06299402, + "balance_loss_mlp": 0.01276793, + "epoch": 0.22365849992484593, + "flos": 17608867056000.0, + "grad_norm": 2.352033480486632, + "language_loss": 0.87360144, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.95201522, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.23962402, + "step": 3720, + "time_per_iteration": 2.5415003299713135 + }, + { + "auxiliary_loss_clip": 0.06526608, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 0.06295964, + "balance_loss_mlp": 0.01271672, + "epoch": 0.2237186231775139, + "flos": 22717558638720.0, + "grad_norm": 1.8478771577440833, + "language_loss": 0.75151736, + "learning_rate": 3.619086370692945e-06, + "loss": 0.8297134, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21325684, + "step": 3721, + "time_per_iteration": 2.548450469970703 + }, + { + "auxiliary_loss_clip": 0.06540586, + "auxiliary_loss_mlp": 0.0129148, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01269105, + "epoch": 0.22377874643018186, + "flos": 13376720234880.0, + "grad_norm": 2.2094798322640736, + "language_loss": 0.79352558, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.87184626, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.22375488, + "step": 3722, + "time_per_iteration": 2.519277572631836 + }, + { + "auxiliary_loss_clip": 0.06531984, + "auxiliary_loss_mlp": 0.01288897, + "balance_loss_clip": 0.06299505, + "balance_loss_mlp": 0.01267964, + "epoch": 0.22383886968284986, + "flos": 17900797080960.0, + "grad_norm": 2.2930078409484196, + "language_loss": 0.83410442, + "learning_rate": 3.618628972906178e-06, + "loss": 0.91231328, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20922852, + "step": 3723, + "time_per_iteration": 2.5086076259613037 + }, + { + "auxiliary_loss_clip": 0.06544059, + "auxiliary_loss_mlp": 0.01285781, + "balance_loss_clip": 0.06305651, + "balance_loss_mlp": 0.01263834, + "epoch": 0.22389899293551782, + "flos": 23886033425280.0, + "grad_norm": 4.429276920778782, + "language_loss": 0.84606177, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.92436016, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.21960449, + "step": 3724, + "time_per_iteration": 2.574178695678711 + }, + { + "auxiliary_loss_clip": 0.06534179, + "auxiliary_loss_mlp": 0.01287846, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01267211, + "epoch": 0.2239591161881858, + "flos": 27279929600640.0, + "grad_norm": 1.978846940821608, + "language_loss": 0.79885381, + "learning_rate": 3.618171329605121e-06, + "loss": 0.87707412, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.20617676, + "step": 3725, + "time_per_iteration": 2.589184522628784 + }, + { + "auxiliary_loss_clip": 0.06541407, + "auxiliary_loss_mlp": 0.01289084, + "balance_loss_clip": 0.06307919, + "balance_loss_mlp": 0.01267197, + "epoch": 0.22401923944085375, + "flos": 22243423910400.0, + "grad_norm": 1.7178260071510263, + "language_loss": 0.78001326, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.85831815, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21875, + "step": 3726, + "time_per_iteration": 3.980494976043701 + }, + { + "auxiliary_loss_clip": 0.06552388, + "auxiliary_loss_mlp": 0.01297244, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.01272175, + "epoch": 0.22407936269352172, + "flos": 12057920023680.0, + "grad_norm": 3.478702992871699, + "language_loss": 0.73437679, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.81287301, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25097656, + "step": 3727, + "time_per_iteration": 2.4799015522003174 + }, + { + "auxiliary_loss_clip": 0.06549139, + "auxiliary_loss_mlp": 0.01296668, + "balance_loss_clip": 0.06309944, + "balance_loss_mlp": 0.0127341, + "epoch": 0.22413948594618968, + "flos": 19359482883840.0, + "grad_norm": 2.179866459674304, + "language_loss": 0.8799302, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.95838827, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23254395, + "step": 3728, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.0653842, + "auxiliary_loss_mlp": 0.01294185, + "balance_loss_clip": 0.06303863, + "balance_loss_mlp": 0.0126989, + "epoch": 0.22419960919885765, + "flos": 24176789493120.0, + "grad_norm": 1.9160734665449493, + "language_loss": 0.80446088, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.88278687, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24328613, + "step": 3729, + "time_per_iteration": 4.021615266799927 + }, + { + "auxiliary_loss_clip": 0.06533324, + "auxiliary_loss_mlp": 0.01292111, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01271237, + "epoch": 0.22425973245152564, + "flos": 27386007269760.0, + "grad_norm": 1.6841051152750983, + "language_loss": 0.87170112, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.94995546, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2088623, + "step": 3730, + "time_per_iteration": 2.598576307296753 + }, + { + "auxiliary_loss_clip": 0.0653019, + "auxiliary_loss_mlp": 0.01298076, + "balance_loss_clip": 0.06300467, + "balance_loss_mlp": 0.01276403, + "epoch": 0.2243198557041936, + "flos": 13740794225280.0, + "grad_norm": 2.088554635044429, + "language_loss": 0.73449922, + "learning_rate": 3.616796927310559e-06, + "loss": 0.81278187, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21655273, + "step": 3731, + "time_per_iteration": 2.5361716747283936 + }, + { + "auxiliary_loss_clip": 0.06541456, + "auxiliary_loss_mlp": 0.01292681, + "balance_loss_clip": 0.06301124, + "balance_loss_mlp": 0.01267933, + "epoch": 0.22437997895686157, + "flos": 19535775874560.0, + "grad_norm": 5.172507402775724, + "language_loss": 0.75803339, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.83637482, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.24755859, + "step": 3732, + "time_per_iteration": 2.5423076152801514 + }, + { + "auxiliary_loss_clip": 0.06533462, + "auxiliary_loss_mlp": 0.01296517, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01273664, + "epoch": 0.22444010220952954, + "flos": 23703032108160.0, + "grad_norm": 1.6752991374876018, + "language_loss": 0.89338291, + "learning_rate": 3.616338302646873e-06, + "loss": 0.97168273, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2286377, + "step": 3733, + "time_per_iteration": 4.021088123321533 + }, + { + "auxiliary_loss_clip": 0.065323, + "auxiliary_loss_mlp": 0.01294952, + "balance_loss_clip": 0.06298727, + "balance_loss_mlp": 0.01270193, + "epoch": 0.2245002254621975, + "flos": 22389514704000.0, + "grad_norm": 1.4651206016819107, + "language_loss": 0.85422146, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.93249398, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.24780273, + "step": 3734, + "time_per_iteration": 2.5562949180603027 + }, + { + "auxiliary_loss_clip": 0.06539299, + "auxiliary_loss_mlp": 0.01283537, + "balance_loss_clip": 0.06303868, + "balance_loss_mlp": 0.01261113, + "epoch": 0.22456034871486547, + "flos": 26949453897600.0, + "grad_norm": 1.579737554219585, + "language_loss": 0.77332962, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.85155803, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22436523, + "step": 3735, + "time_per_iteration": 4.016703367233276 + }, + { + "auxiliary_loss_clip": 0.06526705, + "auxiliary_loss_mlp": 0.01290552, + "balance_loss_clip": 0.06298478, + "balance_loss_mlp": 0.01269559, + "epoch": 0.22462047196753343, + "flos": 28990700012160.0, + "grad_norm": 1.885472064442235, + "language_loss": 0.84907603, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92724866, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.21008301, + "step": 3736, + "time_per_iteration": 2.6118290424346924 + }, + { + "auxiliary_loss_clip": 0.06536424, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.0630133, + "balance_loss_mlp": 0.01261922, + "epoch": 0.22468059522020142, + "flos": 20017541324160.0, + "grad_norm": 1.5290746464045628, + "language_loss": 0.87103891, + "learning_rate": 3.615420317888586e-06, + "loss": 0.94926155, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23913574, + "step": 3737, + "time_per_iteration": 2.5211808681488037 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288351, + "balance_loss_clip": 0.06294889, + "balance_loss_mlp": 0.01263949, + "epoch": 0.2247407184728694, + "flos": 29321846547840.0, + "grad_norm": 1.8581473098744326, + "language_loss": 0.80131769, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87954295, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24389648, + "step": 3738, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.06530435, + "auxiliary_loss_mlp": 0.01285051, + "balance_loss_clip": 0.06293893, + "balance_loss_mlp": 0.01263295, + "epoch": 0.22480084172553735, + "flos": 22317035322240.0, + "grad_norm": 1.7432458267253939, + "language_loss": 0.77190316, + "learning_rate": 3.614960957933224e-06, + "loss": 0.85005802, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.21728516, + "step": 3739, + "time_per_iteration": 2.540266275405884 + }, + { + "auxiliary_loss_clip": 0.06531328, + "auxiliary_loss_mlp": 0.01283134, + "balance_loss_clip": 0.06295189, + "balance_loss_mlp": 0.01260091, + "epoch": 0.22486096497820532, + "flos": 25598019720960.0, + "grad_norm": 4.441094103460663, + "language_loss": 0.74799633, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.82614094, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.23022461, + "step": 3740, + "time_per_iteration": 2.640592575073242 + }, + { + "auxiliary_loss_clip": 0.06520827, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06289122, + "balance_loss_mlp": 0.01256681, + "epoch": 0.22492108823087328, + "flos": 17645651798400.0, + "grad_norm": 2.0040821388775285, + "language_loss": 0.75983584, + "learning_rate": 3.614501353019939e-06, + "loss": 0.83783156, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.22070312, + "step": 3741, + "time_per_iteration": 2.513965129852295 + }, + { + "auxiliary_loss_clip": 0.06526901, + "auxiliary_loss_mlp": 0.01283674, + "balance_loss_clip": 0.06296658, + "balance_loss_mlp": 0.0126224, + "epoch": 0.22498121148354125, + "flos": 16040246296320.0, + "grad_norm": 1.702368757801579, + "language_loss": 0.87747514, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.95558089, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21435547, + "step": 3742, + "time_per_iteration": 2.5164167881011963 + }, + { + "auxiliary_loss_clip": 0.0652426, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01259389, + "epoch": 0.22504133473620924, + "flos": 24030489064320.0, + "grad_norm": 1.7109022824395175, + "language_loss": 0.82010657, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.22473145, + "step": 3743, + "time_per_iteration": 2.5486276149749756 + }, + { + "auxiliary_loss_clip": 0.06524298, + "auxiliary_loss_mlp": 0.0127565, + "balance_loss_clip": 0.06291372, + "balance_loss_mlp": 0.01254562, + "epoch": 0.2251014579888772, + "flos": 16769610161280.0, + "grad_norm": 2.126207867209009, + "language_loss": 0.64185399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7198534, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2109375, + "step": 3744, + "time_per_iteration": 2.535020351409912 + }, + { + "auxiliary_loss_clip": 0.06527244, + "auxiliary_loss_mlp": 0.01277496, + "balance_loss_clip": 0.06293654, + "balance_loss_mlp": 0.01256372, + "epoch": 0.22516158124154517, + "flos": 13996191070080.0, + "grad_norm": 3.1643825534304684, + "language_loss": 0.76886272, + "learning_rate": 3.613581408598489e-06, + "loss": 0.84691012, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21105957, + "step": 3745, + "time_per_iteration": 2.5233495235443115 + }, + { + "auxiliary_loss_clip": 0.06522205, + "auxiliary_loss_mlp": 0.01281406, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01260675, + "epoch": 0.22522170449421314, + "flos": 14394869596800.0, + "grad_norm": 1.6969236990578618, + "language_loss": 0.80721819, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88525426, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20739746, + "step": 3746, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.06533524, + "auxiliary_loss_mlp": 0.01280566, + "balance_loss_clip": 0.06296681, + "balance_loss_mlp": 0.0125881, + "epoch": 0.2252818277468811, + "flos": 23812338159360.0, + "grad_norm": 2.077776202364112, + "language_loss": 0.86226261, + "learning_rate": 3.613121069229862e-06, + "loss": 0.94040346, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21765137, + "step": 3747, + "time_per_iteration": 2.5834550857543945 + }, + { + "auxiliary_loss_clip": 0.06530412, + "auxiliary_loss_mlp": 0.01275087, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01255095, + "epoch": 0.22534195099954907, + "flos": 24725038757760.0, + "grad_norm": 1.8595393434505574, + "language_loss": 0.76982796, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.84788299, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.1998291, + "step": 3748, + "time_per_iteration": 2.5877788066864014 + }, + { + "auxiliary_loss_clip": 0.0652978, + "auxiliary_loss_mlp": 0.0128313, + "balance_loss_clip": 0.06296694, + "balance_loss_mlp": 0.01261768, + "epoch": 0.22540207425221703, + "flos": 21038625578880.0, + "grad_norm": 1.5282192474331018, + "language_loss": 0.80547005, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.88359916, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.21374512, + "step": 3749, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06526259, + "auxiliary_loss_mlp": 0.01273546, + "balance_loss_clip": 0.06298405, + "balance_loss_mlp": 0.01253698, + "epoch": 0.22546219750488503, + "flos": 19396351480320.0, + "grad_norm": 1.5225090015602234, + "language_loss": 0.80070651, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.87870455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19848633, + "step": 3750, + "time_per_iteration": 2.524614095687866 + }, + { + "auxiliary_loss_clip": 0.06532078, + "auxiliary_loss_mlp": 0.01279372, + "balance_loss_clip": 0.06297495, + "balance_loss_mlp": 0.01258117, + "epoch": 0.225522320757553, + "flos": 25199760464640.0, + "grad_norm": 5.336084937176506, + "language_loss": 0.8300491, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.90816361, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21264648, + "step": 3751, + "time_per_iteration": 2.5638771057128906 + }, + { + "auxiliary_loss_clip": 0.06527963, + "auxiliary_loss_mlp": 0.01280546, + "balance_loss_clip": 0.06296829, + "balance_loss_mlp": 0.01260149, + "epoch": 0.22558244401022096, + "flos": 17168456396160.0, + "grad_norm": 1.7246902184661286, + "language_loss": 0.8427825, + "learning_rate": 3.611969150491165e-06, + "loss": 0.92086762, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20385742, + "step": 3752, + "time_per_iteration": 2.5650362968444824 + }, + { + "auxiliary_loss_clip": 0.06527157, + "auxiliary_loss_mlp": 0.01275092, + "balance_loss_clip": 0.06298538, + "balance_loss_mlp": 0.01254839, + "epoch": 0.22564256726288892, + "flos": 15236306697600.0, + "grad_norm": 1.7312534305272433, + "language_loss": 0.78620666, + "learning_rate": 3.611738583330375e-06, + "loss": 0.8642292, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20251465, + "step": 3753, + "time_per_iteration": 2.510344982147217 + }, + { + "auxiliary_loss_clip": 0.06525348, + "auxiliary_loss_mlp": 0.01279816, + "balance_loss_clip": 0.06296748, + "balance_loss_mlp": 0.01257869, + "epoch": 0.2257026905155569, + "flos": 34577215902720.0, + "grad_norm": 1.9706921359503449, + "language_loss": 0.79448152, + "learning_rate": 3.611507955052295e-06, + "loss": 0.8725332, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21948242, + "step": 3754, + "time_per_iteration": 2.6429665088653564 + }, + { + "auxiliary_loss_clip": 0.06526577, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06299241, + "balance_loss_mlp": 0.01259835, + "epoch": 0.22576281376822485, + "flos": 19944642672000.0, + "grad_norm": 1.7667035857085684, + "language_loss": 0.70640147, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.78447914, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.21374512, + "step": 3755, + "time_per_iteration": 2.5482447147369385 + }, + { + "auxiliary_loss_clip": 0.06530152, + "auxiliary_loss_mlp": 0.01282078, + "balance_loss_clip": 0.06295566, + "balance_loss_mlp": 0.01261085, + "epoch": 0.22582293702089282, + "flos": 24607892350080.0, + "grad_norm": 2.6955819116528588, + "language_loss": 0.77899122, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.85711348, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21008301, + "step": 3756, + "time_per_iteration": 2.573639392852783 + }, + { + "auxiliary_loss_clip": 0.06536651, + "auxiliary_loss_mlp": 0.01278842, + "balance_loss_clip": 0.0629873, + "balance_loss_mlp": 0.01255394, + "epoch": 0.2258830602735608, + "flos": 23041451796480.0, + "grad_norm": 2.9460656412940405, + "language_loss": 0.82867002, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.90682495, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23461914, + "step": 3757, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06538612, + "auxiliary_loss_mlp": 0.01279229, + "balance_loss_clip": 0.06302969, + "balance_loss_mlp": 0.01257164, + "epoch": 0.22594318352622877, + "flos": 22164068494080.0, + "grad_norm": 3.099441845199118, + "language_loss": 0.73941171, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.81759018, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2208252, + "step": 3758, + "time_per_iteration": 2.506148099899292 + }, + { + "auxiliary_loss_clip": 0.06531477, + "auxiliary_loss_mlp": 0.01288595, + "balance_loss_clip": 0.06296086, + "balance_loss_mlp": 0.01266816, + "epoch": 0.22600330677889674, + "flos": 20600478979200.0, + "grad_norm": 2.4125098710516117, + "language_loss": 0.77881908, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.85701978, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21777344, + "step": 3759, + "time_per_iteration": 2.5171775817871094 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288917, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01266267, + "epoch": 0.2260634300315647, + "flos": 35667970427520.0, + "grad_norm": 1.6851914496917324, + "language_loss": 0.7921207, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.87035167, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.22644043, + "step": 3760, + "time_per_iteration": 2.6410677433013916 + }, + { + "auxiliary_loss_clip": 0.06433272, + "auxiliary_loss_mlp": 0.01258557, + "balance_loss_clip": 0.06311189, + "balance_loss_mlp": 0.01252156, + "epoch": 0.22612355328423267, + "flos": 72107707685760.0, + "grad_norm": 0.875668320300708, + "language_loss": 0.60230321, + "learning_rate": 3.609891846556569e-06, + "loss": 0.67922151, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06408691, + "step": 3761, + "time_per_iteration": 3.1083786487579346 + }, + { + "auxiliary_loss_clip": 0.06545433, + "auxiliary_loss_mlp": 0.01288291, + "balance_loss_clip": 0.06303856, + "balance_loss_mlp": 0.01267012, + "epoch": 0.22618367653690064, + "flos": 22790373436800.0, + "grad_norm": 3.0022983434583783, + "language_loss": 0.77876461, + "learning_rate": 3.609660729655211e-06, + "loss": 0.8571018, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21289062, + "step": 3762, + "time_per_iteration": 2.5256128311157227 + }, + { + "auxiliary_loss_clip": 0.06531228, + "auxiliary_loss_mlp": 0.01280361, + "balance_loss_clip": 0.06294668, + "balance_loss_mlp": 0.01258343, + "epoch": 0.22624379978956863, + "flos": 20454388185600.0, + "grad_norm": 1.959767281760525, + "language_loss": 0.79828411, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.87639999, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22033691, + "step": 3763, + "time_per_iteration": 2.528965950012207 + }, + { + "auxiliary_loss_clip": 0.06540731, + "auxiliary_loss_mlp": 0.01291635, + "balance_loss_clip": 0.06300753, + "balance_loss_mlp": 0.01268949, + "epoch": 0.2263039230422366, + "flos": 17500189910400.0, + "grad_norm": 1.5800574189561347, + "language_loss": 0.91907668, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99740022, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22705078, + "step": 3764, + "time_per_iteration": 2.5012450218200684 + }, + { + "auxiliary_loss_clip": 0.06527007, + "auxiliary_loss_mlp": 0.01291683, + "balance_loss_clip": 0.06295396, + "balance_loss_mlp": 0.01271001, + "epoch": 0.22636404629490456, + "flos": 28337295473280.0, + "grad_norm": 3.379650672619254, + "language_loss": 0.75542498, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.83361191, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20690918, + "step": 3765, + "time_per_iteration": 2.6149775981903076 + }, + { + "auxiliary_loss_clip": 0.06519896, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01268256, + "epoch": 0.22642416954757252, + "flos": 17494152416640.0, + "grad_norm": 2.1325205607667526, + "language_loss": 0.90732884, + "learning_rate": 3.608735651752494e-06, + "loss": 0.98543364, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22338867, + "step": 3766, + "time_per_iteration": 3.925321340560913 + }, + { + "auxiliary_loss_clip": 0.06520344, + "auxiliary_loss_mlp": 0.01279841, + "balance_loss_clip": 0.0629393, + "balance_loss_mlp": 0.0125756, + "epoch": 0.2264842928002405, + "flos": 24390621912960.0, + "grad_norm": 1.5335844294501488, + "language_loss": 0.74866152, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22290039, + "step": 3767, + "time_per_iteration": 2.585827589035034 + }, + { + "auxiliary_loss_clip": 0.06526411, + "auxiliary_loss_mlp": 0.01285323, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01262816, + "epoch": 0.22654441605290845, + "flos": 19836971775360.0, + "grad_norm": 1.5156609478299474, + "language_loss": 0.72064531, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.79876268, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.22521973, + "step": 3768, + "time_per_iteration": 3.9932377338409424 + }, + { + "auxiliary_loss_clip": 0.06525982, + "auxiliary_loss_mlp": 0.01291355, + "balance_loss_clip": 0.06294759, + "balance_loss_mlp": 0.01268347, + "epoch": 0.22660453930557642, + "flos": 27462050449920.0, + "grad_norm": 1.8227506475765343, + "language_loss": 0.78781188, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.86598527, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.22998047, + "step": 3769, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.06531481, + "auxiliary_loss_mlp": 0.01287446, + "balance_loss_clip": 0.06292526, + "balance_loss_mlp": 0.01265428, + "epoch": 0.2266646625582444, + "flos": 23995004060160.0, + "grad_norm": 2.604534401291856, + "language_loss": 0.69374454, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22021484, + "step": 3770, + "time_per_iteration": 2.6160407066345215 + }, + { + "auxiliary_loss_clip": 0.065291, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06292273, + "balance_loss_mlp": 0.01269077, + "epoch": 0.22672478581091238, + "flos": 26034698874240.0, + "grad_norm": 1.4830972618629188, + "language_loss": 0.8083868, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88657784, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20922852, + "step": 3771, + "time_per_iteration": 2.576948642730713 + }, + { + "auxiliary_loss_clip": 0.06521479, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012613, + "epoch": 0.22678490906358034, + "flos": 23848577850240.0, + "grad_norm": 1.5694676435300003, + "language_loss": 0.79189658, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86994874, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22436523, + "step": 3772, + "time_per_iteration": 4.012827396392822 + }, + { + "auxiliary_loss_clip": 0.06410234, + "auxiliary_loss_mlp": 0.01286376, + "balance_loss_clip": 0.06287075, + "balance_loss_mlp": 0.01280571, + "epoch": 0.2268450323162483, + "flos": 65070163912320.0, + "grad_norm": 0.6415690360853892, + "language_loss": 0.53899318, + "learning_rate": 3.607114417129261e-06, + "loss": 0.61595929, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.0579834, + "step": 3773, + "time_per_iteration": 3.249551773071289 + }, + { + "auxiliary_loss_clip": 0.06526346, + "auxiliary_loss_mlp": 0.01287624, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01266238, + "epoch": 0.22690515556891627, + "flos": 22532251334400.0, + "grad_norm": 1.8359701531623327, + "language_loss": 0.70997107, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.78811073, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21386719, + "step": 3774, + "time_per_iteration": 2.558279275894165 + }, + { + "auxiliary_loss_clip": 0.06521672, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06291246, + "balance_loss_mlp": 0.01266857, + "epoch": 0.22696527882158424, + "flos": 18229344140160.0, + "grad_norm": 2.047907778931267, + "language_loss": 0.75449002, + "learning_rate": 3.606650658627658e-06, + "loss": 0.83258545, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21008301, + "step": 3775, + "time_per_iteration": 3.928666353225708 + }, + { + "auxiliary_loss_clip": 0.06524701, + "auxiliary_loss_mlp": 0.01286732, + "balance_loss_clip": 0.06292307, + "balance_loss_mlp": 0.01266168, + "epoch": 0.22702540207425223, + "flos": 17024923152000.0, + "grad_norm": 2.031895062113734, + "language_loss": 0.82818532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.90629965, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20581055, + "step": 3776, + "time_per_iteration": 2.5941483974456787 + }, + { + "auxiliary_loss_clip": 0.06528914, + "auxiliary_loss_mlp": 0.01279846, + "balance_loss_clip": 0.06293055, + "balance_loss_mlp": 0.01259222, + "epoch": 0.2270855253269202, + "flos": 21332316539520.0, + "grad_norm": 1.645158938946052, + "language_loss": 0.83362442, + "learning_rate": 3.606186656428641e-06, + "loss": 0.91171205, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20617676, + "step": 3777, + "time_per_iteration": 2.5177228450775146 + }, + { + "auxiliary_loss_clip": 0.06532624, + "auxiliary_loss_mlp": 0.01278936, + "balance_loss_clip": 0.06296799, + "balance_loss_mlp": 0.01257002, + "epoch": 0.22714564857958816, + "flos": 23557276730880.0, + "grad_norm": 1.8837878269403912, + "language_loss": 0.73246169, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.81057739, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21948242, + "step": 3778, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.06530988, + "auxiliary_loss_mlp": 0.01275867, + "balance_loss_clip": 0.06293572, + "balance_loss_mlp": 0.01255673, + "epoch": 0.22720577183225613, + "flos": 25996237050240.0, + "grad_norm": 2.9659284448048555, + "language_loss": 0.65779513, + "learning_rate": 3.605722410602591e-06, + "loss": 0.73586369, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20178223, + "step": 3779, + "time_per_iteration": 2.543818950653076 + }, + { + "auxiliary_loss_clip": 0.06525169, + "auxiliary_loss_mlp": 0.01276701, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.01255982, + "epoch": 0.2272658950849241, + "flos": 20820432746880.0, + "grad_norm": 1.7825989229768946, + "language_loss": 0.70823693, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7862556, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20703125, + "step": 3780, + "time_per_iteration": 2.558850049972534 + }, + { + "auxiliary_loss_clip": 0.06528573, + "auxiliary_loss_mlp": 0.01280577, + "balance_loss_clip": 0.06296494, + "balance_loss_mlp": 0.01257927, + "epoch": 0.22732601833759206, + "flos": 23915187446400.0, + "grad_norm": 1.6463040629853982, + "language_loss": 0.89639765, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97448915, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2265625, + "step": 3781, + "time_per_iteration": 2.527230739593506 + }, + { + "auxiliary_loss_clip": 0.06532317, + "auxiliary_loss_mlp": 0.01280346, + "balance_loss_clip": 0.06296034, + "balance_loss_mlp": 0.01257672, + "epoch": 0.22738614159026002, + "flos": 15929850142080.0, + "grad_norm": 2.4692396393453016, + "language_loss": 0.75309098, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.83121765, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2265625, + "step": 3782, + "time_per_iteration": 2.4901020526885986 + }, + { + "auxiliary_loss_clip": 0.06532567, + "auxiliary_loss_mlp": 0.01278379, + "balance_loss_clip": 0.06300219, + "balance_loss_mlp": 0.01257959, + "epoch": 0.22744626484292801, + "flos": 24212148716160.0, + "grad_norm": 1.7681967435875452, + "language_loss": 0.8314634, + "learning_rate": 3.604793188351095e-06, + "loss": 0.90957284, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20422363, + "step": 3783, + "time_per_iteration": 2.559361696243286 + }, + { + "auxiliary_loss_clip": 0.06539755, + "auxiliary_loss_mlp": 0.0128451, + "balance_loss_clip": 0.06305835, + "balance_loss_mlp": 0.01262266, + "epoch": 0.22750638809559598, + "flos": 24798734023680.0, + "grad_norm": 1.794476113807414, + "language_loss": 0.76757884, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8458215, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22229004, + "step": 3784, + "time_per_iteration": 2.6693339347839355 + }, + { + "auxiliary_loss_clip": 0.06533188, + "auxiliary_loss_mlp": 0.012806, + "balance_loss_clip": 0.06299379, + "balance_loss_mlp": 0.01257998, + "epoch": 0.22756651134826394, + "flos": 22243004640000.0, + "grad_norm": 1.5985438146538498, + "language_loss": 0.71667248, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79481035, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22583008, + "step": 3785, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.06421004, + "auxiliary_loss_mlp": 0.0127133, + "balance_loss_clip": 0.0629871, + "balance_loss_mlp": 0.01265915, + "epoch": 0.2276266346009319, + "flos": 62728225021440.0, + "grad_norm": 1.545506426452605, + "language_loss": 0.63058448, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.70750785, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05422974, + "step": 3786, + "time_per_iteration": 3.1247661113739014 + }, + { + "auxiliary_loss_clip": 0.06538717, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06302891, + "balance_loss_mlp": 0.01254299, + "epoch": 0.22768675785359987, + "flos": 18618085958400.0, + "grad_norm": 2.466113986800572, + "language_loss": 0.8751514, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.95331335, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23156738, + "step": 3787, + "time_per_iteration": 2.488539457321167 + }, + { + "auxiliary_loss_clip": 0.06537791, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305036, + "balance_loss_mlp": 0.01259488, + "epoch": 0.22774688110626784, + "flos": 26877477640320.0, + "grad_norm": 2.053207704033697, + "language_loss": 0.73054254, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.80872202, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20678711, + "step": 3788, + "time_per_iteration": 2.5763657093048096 + }, + { + "auxiliary_loss_clip": 0.06534025, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 0.06303776, + "balance_loss_mlp": 0.01260971, + "epoch": 0.2278070043589358, + "flos": 15557977722240.0, + "grad_norm": 4.57361945380841, + "language_loss": 0.68007839, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.75824702, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21850586, + "step": 3789, + "time_per_iteration": 2.4907443523406982 + }, + { + "auxiliary_loss_clip": 0.0653897, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 0.06308074, + "balance_loss_mlp": 0.0126115, + "epoch": 0.2278671276116038, + "flos": 22422987210240.0, + "grad_norm": 2.4388022002275243, + "language_loss": 0.76775718, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.84598166, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22338867, + "step": 3790, + "time_per_iteration": 2.5787651538848877 + }, + { + "auxiliary_loss_clip": 0.06540109, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06309578, + "balance_loss_mlp": 0.01259252, + "epoch": 0.22792725086427176, + "flos": 20637641064960.0, + "grad_norm": 1.9300771626575046, + "language_loss": 0.91910696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.99733061, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.23010254, + "step": 3791, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.06538808, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 0.06302848, + "balance_loss_mlp": 0.01256893, + "epoch": 0.22798737411693973, + "flos": 31436662147200.0, + "grad_norm": 1.9637481556258098, + "language_loss": 0.83064067, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.9088037, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20617676, + "step": 3792, + "time_per_iteration": 2.6190388202667236 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 0.06289717, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2280474973696077, + "flos": 52412074220160.0, + "grad_norm": 1.1033671526650368, + "language_loss": 0.65792358, + "learning_rate": 3.602465874182981e-06, + "loss": 0.73471832, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05432129, + "step": 3793, + "time_per_iteration": 2.9110665321350098 + }, + { + "auxiliary_loss_clip": 0.0654863, + "auxiliary_loss_mlp": 0.01287304, + "balance_loss_clip": 0.06306019, + "balance_loss_mlp": 0.01261889, + "epoch": 0.22810762062227566, + "flos": 26403300984960.0, + "grad_norm": 1.9908643306499119, + "language_loss": 0.78207439, + "learning_rate": 3.602232808409293e-06, + "loss": 0.8604337, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.25415039, + "step": 3794, + "time_per_iteration": 2.5911734104156494 + }, + { + "auxiliary_loss_clip": 0.06544799, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 0.06310074, + "balance_loss_mlp": 0.01262412, + "epoch": 0.22816774387494362, + "flos": 25637445866880.0, + "grad_norm": 3.443157636284035, + "language_loss": 0.81285226, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89115357, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22912598, + "step": 3795, + "time_per_iteration": 2.6825528144836426 + }, + { + "auxiliary_loss_clip": 0.06536914, + "auxiliary_loss_mlp": 0.0128896, + "balance_loss_clip": 0.06306744, + "balance_loss_mlp": 0.01267586, + "epoch": 0.22822786712761162, + "flos": 22457507892480.0, + "grad_norm": 1.703568435651106, + "language_loss": 0.77948368, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.85774243, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21362305, + "step": 3796, + "time_per_iteration": 2.5418922901153564 + }, + { + "auxiliary_loss_clip": 0.06535624, + "auxiliary_loss_mlp": 0.01278994, + "balance_loss_clip": 0.06302401, + "balance_loss_mlp": 0.01258692, + "epoch": 0.22828799038027958, + "flos": 12207323053440.0, + "grad_norm": 2.5041816771456076, + "language_loss": 0.96305406, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.04120016, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20324707, + "step": 3797, + "time_per_iteration": 2.5794107913970947 + }, + { + "auxiliary_loss_clip": 0.06537494, + "auxiliary_loss_mlp": 0.01281478, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01260057, + "epoch": 0.22834811363294755, + "flos": 22091379477120.0, + "grad_norm": 1.517581709018558, + "language_loss": 0.82277977, + "learning_rate": 3.601299937834666e-06, + "loss": 0.90096951, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2142334, + "step": 3798, + "time_per_iteration": 2.618784189224243 + }, + { + "auxiliary_loss_clip": 0.06536907, + "auxiliary_loss_mlp": 0.01279844, + "balance_loss_clip": 0.06300005, + "balance_loss_mlp": 0.01257146, + "epoch": 0.2284082368856155, + "flos": 24867104555520.0, + "grad_norm": 1.8603662335211264, + "language_loss": 0.79381669, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.87198418, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22705078, + "step": 3799, + "time_per_iteration": 2.591053009033203 + }, + { + "auxiliary_loss_clip": 0.06534393, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06300979, + "balance_loss_mlp": 0.01258646, + "epoch": 0.22846836013828348, + "flos": 23299280409600.0, + "grad_norm": 1.5152328596048934, + "language_loss": 0.75782096, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.83597749, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22619629, + "step": 3800, + "time_per_iteration": 2.5370395183563232 + }, + { + "auxiliary_loss_clip": 0.06535068, + "auxiliary_loss_mlp": 0.01279113, + "balance_loss_clip": 0.06302812, + "balance_loss_mlp": 0.01258001, + "epoch": 0.22852848339095144, + "flos": 27423462844800.0, + "grad_norm": 1.9420817073182375, + "language_loss": 0.64685607, + "learning_rate": 3.600599647297484e-06, + "loss": 0.72499788, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21105957, + "step": 3801, + "time_per_iteration": 2.6190593242645264 + }, + { + "auxiliary_loss_clip": 0.06524718, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06296816, + "balance_loss_mlp": 0.01257835, + "epoch": 0.2285886066436194, + "flos": 26328054418560.0, + "grad_norm": 1.6808395254049295, + "language_loss": 0.81957126, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89760411, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20727539, + "step": 3802, + "time_per_iteration": 2.554079055786133 + }, + { + "auxiliary_loss_clip": 0.06534229, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06299631, + "balance_loss_mlp": 0.0126415, + "epoch": 0.2286487298962874, + "flos": 29724298508160.0, + "grad_norm": 1.6760491170738747, + "language_loss": 0.79838073, + "learning_rate": 3.600132483450114e-06, + "loss": 0.87659228, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22790527, + "step": 3803, + "time_per_iteration": 2.6287641525268555 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01279074, + "balance_loss_clip": 0.06296768, + "balance_loss_mlp": 0.012559, + "epoch": 0.22870885314895537, + "flos": 21293435445120.0, + "grad_norm": 1.7238152987334623, + "language_loss": 0.86273003, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94087803, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.23168945, + "step": 3804, + "time_per_iteration": 2.511462450027466 + }, + { + "auxiliary_loss_clip": 0.06539486, + "auxiliary_loss_mlp": 0.01279472, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01257537, + "epoch": 0.22876897640162333, + "flos": 14944754016000.0, + "grad_norm": 1.89266353651555, + "language_loss": 0.76854289, + "learning_rate": 3.59966507689401e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21923828, + "step": 3805, + "time_per_iteration": 3.929358959197998 + }, + { + "auxiliary_loss_clip": 0.0654166, + "auxiliary_loss_mlp": 0.01280204, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257542, + "epoch": 0.2288290996542913, + "flos": 18119786526720.0, + "grad_norm": 2.0123502787071073, + "language_loss": 0.79403114, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.87224978, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.22680664, + "step": 3806, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.06540429, + "auxiliary_loss_mlp": 0.01282432, + "balance_loss_clip": 0.06303287, + "balance_loss_mlp": 0.01259878, + "epoch": 0.22888922290695926, + "flos": 39864296828160.0, + "grad_norm": 1.8839046523975558, + "language_loss": 0.70310783, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.78133643, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22546387, + "step": 3807, + "time_per_iteration": 4.134840488433838 + }, + { + "auxiliary_loss_clip": 0.06550615, + "auxiliary_loss_mlp": 0.01290274, + "balance_loss_clip": 0.06307966, + "balance_loss_mlp": 0.01265121, + "epoch": 0.22894934615962723, + "flos": 23410431250560.0, + "grad_norm": 2.1946772997431103, + "language_loss": 0.65960705, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.73801601, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25183105, + "step": 3808, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.06539108, + "auxiliary_loss_mlp": 0.01281064, + "balance_loss_clip": 0.06300798, + "balance_loss_mlp": 0.01259154, + "epoch": 0.22900946941229522, + "flos": 18848898829440.0, + "grad_norm": 1.7761632941249064, + "language_loss": 0.75198555, + "learning_rate": 3.598729535939222e-06, + "loss": 0.83018732, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21899414, + "step": 3809, + "time_per_iteration": 2.490895986557007 + }, + { + "auxiliary_loss_clip": 0.06533305, + "auxiliary_loss_mlp": 0.0127892, + "balance_loss_clip": 0.06299955, + "balance_loss_mlp": 0.01257331, + "epoch": 0.22906959266496318, + "flos": 22935961105920.0, + "grad_norm": 1.4656596651362013, + "language_loss": 0.82576305, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.90388525, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21606445, + "step": 3810, + "time_per_iteration": 2.5684924125671387 + }, + { + "auxiliary_loss_clip": 0.06535805, + "auxiliary_loss_mlp": 0.01278794, + "balance_loss_clip": 0.06303711, + "balance_loss_mlp": 0.01259041, + "epoch": 0.22912971591763115, + "flos": 19360614913920.0, + "grad_norm": 1.8664104481323773, + "language_loss": 0.79914212, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8772881, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19750977, + "step": 3811, + "time_per_iteration": 3.9766526222229004 + }, + { + "auxiliary_loss_clip": 0.0653518, + "auxiliary_loss_mlp": 0.01280553, + "balance_loss_clip": 0.06300636, + "balance_loss_mlp": 0.01258976, + "epoch": 0.22918983917029911, + "flos": 19938940594560.0, + "grad_norm": 1.7476175457386653, + "language_loss": 0.83391893, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.91207623, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21569824, + "step": 3812, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.0655017, + "auxiliary_loss_mlp": 0.01288002, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01264673, + "epoch": 0.22924996242296708, + "flos": 16696501873920.0, + "grad_norm": 2.3839142545709886, + "language_loss": 0.8400377, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.91841948, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2331543, + "step": 3813, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06538843, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301966, + "balance_loss_mlp": 0.01255456, + "epoch": 0.22931008567563504, + "flos": 33044457490560.0, + "grad_norm": 1.6858267943586043, + "language_loss": 0.70580167, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78395313, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20861816, + "step": 3814, + "time_per_iteration": 2.6764509677886963 + }, + { + "auxiliary_loss_clip": 0.06536946, + "auxiliary_loss_mlp": 0.01276372, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01256786, + "epoch": 0.229370208928303, + "flos": 23337322963200.0, + "grad_norm": 2.8831118113675114, + "language_loss": 0.67954975, + "learning_rate": 3.597324405965139e-06, + "loss": 0.75768292, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.19604492, + "step": 3815, + "time_per_iteration": 3.9759562015533447 + }, + { + "auxiliary_loss_clip": 0.06547147, + "auxiliary_loss_mlp": 0.01282792, + "balance_loss_clip": 0.06311129, + "balance_loss_mlp": 0.01259952, + "epoch": 0.229430332180971, + "flos": 28624068472320.0, + "grad_norm": 1.7261339214380451, + "language_loss": 0.83511633, + "learning_rate": 3.597090005586848e-06, + "loss": 0.91341567, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.22839355, + "step": 3816, + "time_per_iteration": 2.6059420108795166 + }, + { + "auxiliary_loss_clip": 0.06539545, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06303526, + "balance_loss_mlp": 0.01253302, + "epoch": 0.22949045543363897, + "flos": 17243912597760.0, + "grad_norm": 2.759151157832335, + "language_loss": 0.87850988, + "learning_rate": 3.596855544646742e-06, + "loss": 0.95666116, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22290039, + "step": 3817, + "time_per_iteration": 2.4830808639526367 + }, + { + "auxiliary_loss_clip": 0.06543944, + "auxiliary_loss_mlp": 0.01278311, + "balance_loss_clip": 0.06306894, + "balance_loss_mlp": 0.01256412, + "epoch": 0.22955057868630693, + "flos": 27496654986240.0, + "grad_norm": 1.6534336608142677, + "language_loss": 0.75343978, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.83166242, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.21899414, + "step": 3818, + "time_per_iteration": 2.634387969970703 + }, + { + "auxiliary_loss_clip": 0.06541272, + "auxiliary_loss_mlp": 0.01278617, + "balance_loss_clip": 0.06305389, + "balance_loss_mlp": 0.0125524, + "epoch": 0.2296107019389749, + "flos": 23483036413440.0, + "grad_norm": 1.7338201278327374, + "language_loss": 0.75486314, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83306205, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.23376465, + "step": 3819, + "time_per_iteration": 2.593780279159546 + }, + { + "auxiliary_loss_clip": 0.06542156, + "auxiliary_loss_mlp": 0.01283095, + "balance_loss_clip": 0.06305272, + "balance_loss_mlp": 0.01263009, + "epoch": 0.22967082519164286, + "flos": 31293212757120.0, + "grad_norm": 1.753994919034331, + "language_loss": 0.8208195, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.89907205, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20092773, + "step": 3820, + "time_per_iteration": 2.6047699451446533 + }, + { + "auxiliary_loss_clip": 0.06548945, + "auxiliary_loss_mlp": 0.0128207, + "balance_loss_clip": 0.06306617, + "balance_loss_mlp": 0.0125892, + "epoch": 0.22973094844431083, + "flos": 14647415402880.0, + "grad_norm": 4.329935521611207, + "language_loss": 0.70069146, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77900159, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23156738, + "step": 3821, + "time_per_iteration": 2.479454517364502 + }, + { + "auxiliary_loss_clip": 0.06540461, + "auxiliary_loss_mlp": 0.01284444, + "balance_loss_clip": 0.06305948, + "balance_loss_mlp": 0.0126177, + "epoch": 0.2297910716969788, + "flos": 22831057393920.0, + "grad_norm": 2.1026243527938897, + "language_loss": 0.83607674, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.91432583, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22668457, + "step": 3822, + "time_per_iteration": 2.6070644855499268 + }, + { + "auxiliary_loss_clip": 0.06532617, + "auxiliary_loss_mlp": 0.01279894, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01256637, + "epoch": 0.2298511949496468, + "flos": 23045644500480.0, + "grad_norm": 1.4679532921797136, + "language_loss": 0.66860032, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74672538, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23266602, + "step": 3823, + "time_per_iteration": 2.5421886444091797 + }, + { + "auxiliary_loss_clip": 0.06414426, + "auxiliary_loss_mlp": 0.01282472, + "balance_loss_clip": 0.062925, + "balance_loss_mlp": 0.01277524, + "epoch": 0.22991131820231475, + "flos": 66910296228480.0, + "grad_norm": 0.7674542175482253, + "language_loss": 0.56982124, + "learning_rate": 3.595212623082357e-06, + "loss": 0.64679027, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.04943848, + "step": 3824, + "time_per_iteration": 3.2466728687286377 + }, + { + "auxiliary_loss_clip": 0.06530097, + "auxiliary_loss_mlp": 0.0127961, + "balance_loss_clip": 0.06299412, + "balance_loss_mlp": 0.01258975, + "epoch": 0.22997144145498272, + "flos": 17891782767360.0, + "grad_norm": 2.0818696062092643, + "language_loss": 0.73658061, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81467766, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2064209, + "step": 3825, + "time_per_iteration": 2.4705512523651123 + }, + { + "auxiliary_loss_clip": 0.06534772, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06299614, + "balance_loss_mlp": 0.01257432, + "epoch": 0.23003156470765068, + "flos": 24683055062400.0, + "grad_norm": 2.356013632504241, + "language_loss": 0.88289648, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96104205, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22351074, + "step": 3826, + "time_per_iteration": 2.5636119842529297 + }, + { + "auxiliary_loss_clip": 0.06540347, + "auxiliary_loss_mlp": 0.01282145, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0125897, + "epoch": 0.23009168796031865, + "flos": 15819412060800.0, + "grad_norm": 2.476820030154751, + "language_loss": 0.81866372, + "learning_rate": 3.594507606303083e-06, + "loss": 0.89688861, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.23181152, + "step": 3827, + "time_per_iteration": 2.4817094802856445 + }, + { + "auxiliary_loss_clip": 0.06527712, + "auxiliary_loss_mlp": 0.01278643, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2301518112129866, + "flos": 16217755171200.0, + "grad_norm": 1.7308897820243296, + "language_loss": 0.87303799, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95110154, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21716309, + "step": 3828, + "time_per_iteration": 2.517916202545166 + }, + { + "auxiliary_loss_clip": 0.06537049, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06300969, + "balance_loss_mlp": 0.01260686, + "epoch": 0.2302119344656546, + "flos": 20601820644480.0, + "grad_norm": 2.1621841127041668, + "language_loss": 0.71223086, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79042029, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21191406, + "step": 3829, + "time_per_iteration": 2.5232293605804443 + }, + { + "auxiliary_loss_clip": 0.06527743, + "auxiliary_loss_mlp": 0.01278561, + "balance_loss_clip": 0.06299868, + "balance_loss_mlp": 0.01258629, + "epoch": 0.23027205771832257, + "flos": 26804117790720.0, + "grad_norm": 1.5730479724984117, + "language_loss": 0.84944689, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19934082, + "step": 3830, + "time_per_iteration": 2.6153595447540283 + }, + { + "auxiliary_loss_clip": 0.0653088, + "auxiliary_loss_mlp": 0.01278488, + "balance_loss_clip": 0.06299009, + "balance_loss_mlp": 0.01256863, + "epoch": 0.23033218097099054, + "flos": 43883365916160.0, + "grad_norm": 2.1076872960056834, + "language_loss": 0.67121679, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.74931049, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21630859, + "step": 3831, + "time_per_iteration": 2.7302401065826416 + }, + { + "auxiliary_loss_clip": 0.06528492, + "auxiliary_loss_mlp": 0.0127826, + "balance_loss_clip": 0.06295311, + "balance_loss_mlp": 0.01255944, + "epoch": 0.2303923042236585, + "flos": 26074837779840.0, + "grad_norm": 2.0679638399971525, + "language_loss": 0.7580992, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.83616674, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2232666, + "step": 3832, + "time_per_iteration": 2.5789363384246826 + }, + { + "auxiliary_loss_clip": 0.06538022, + "auxiliary_loss_mlp": 0.01277154, + "balance_loss_clip": 0.06301656, + "balance_loss_mlp": 0.01254731, + "epoch": 0.23045242747632647, + "flos": 18302284719360.0, + "grad_norm": 1.9809188001289737, + "language_loss": 0.88229948, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96045125, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 3833, + "time_per_iteration": 2.4890406131744385 + }, + { + "auxiliary_loss_clip": 0.06526786, + "auxiliary_loss_mlp": 0.01275622, + "balance_loss_clip": 0.06291149, + "balance_loss_mlp": 0.01253295, + "epoch": 0.23051255072899443, + "flos": 25527636691200.0, + "grad_norm": 1.751792699614105, + "language_loss": 0.75447762, + "learning_rate": 3.592860451331624e-06, + "loss": 0.83250165, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2232666, + "step": 3834, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.06528607, + "auxiliary_loss_mlp": 0.0128462, + "balance_loss_clip": 0.06295913, + "balance_loss_mlp": 0.01262089, + "epoch": 0.2305726739816624, + "flos": 21221584968960.0, + "grad_norm": 2.065687600185831, + "language_loss": 0.86859775, + "learning_rate": 3.592624901801432e-06, + "loss": 0.94673002, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2253418, + "step": 3835, + "time_per_iteration": 2.5243782997131348 + }, + { + "auxiliary_loss_clip": 0.06531255, + "auxiliary_loss_mlp": 0.01277066, + "balance_loss_clip": 0.06292518, + "balance_loss_mlp": 0.01255489, + "epoch": 0.2306327972343304, + "flos": 23337826087680.0, + "grad_norm": 2.699164056519065, + "language_loss": 0.8346436, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.91272676, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21594238, + "step": 3836, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06530184, + "auxiliary_loss_mlp": 0.01278505, + "balance_loss_clip": 0.0629724, + "balance_loss_mlp": 0.01257918, + "epoch": 0.23069292048699835, + "flos": 20672832579840.0, + "grad_norm": 1.5308621387149557, + "language_loss": 0.80123997, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.87932694, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20581055, + "step": 3837, + "time_per_iteration": 2.5265891551971436 + }, + { + "auxiliary_loss_clip": 0.06398934, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06276935, + "balance_loss_mlp": 0.01257871, + "epoch": 0.23075304373966632, + "flos": 70472854673280.0, + "grad_norm": 0.8661269137999401, + "language_loss": 0.65425092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.73087507, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05606079, + "step": 3838, + "time_per_iteration": 3.0690691471099854 + }, + { + "auxiliary_loss_clip": 0.06529964, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 0.0629662, + "balance_loss_mlp": 0.01260592, + "epoch": 0.23081316699233428, + "flos": 16623603221760.0, + "grad_norm": 1.9712307402798914, + "language_loss": 0.76919234, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84731126, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21337891, + "step": 3839, + "time_per_iteration": 2.507899522781372 + }, + { + "auxiliary_loss_clip": 0.06539556, + "auxiliary_loss_mlp": 0.01283771, + "balance_loss_clip": 0.06303147, + "balance_loss_mlp": 0.01261873, + "epoch": 0.23087329024500225, + "flos": 13303192677120.0, + "grad_norm": 1.9535711626830803, + "language_loss": 0.6973604, + "learning_rate": 3.591446248441752e-06, + "loss": 0.77559364, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21899414, + "step": 3840, + "time_per_iteration": 2.507403612136841 + }, + { + "auxiliary_loss_clip": 0.06524121, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01261994, + "epoch": 0.23093341349767021, + "flos": 17791574883840.0, + "grad_norm": 2.1010490795203967, + "language_loss": 0.79679501, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87487352, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21740723, + "step": 3841, + "time_per_iteration": 2.542506456375122 + }, + { + "auxiliary_loss_clip": 0.06525128, + "auxiliary_loss_mlp": 0.0128577, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.0126591, + "epoch": 0.23099353675033818, + "flos": 23994920206080.0, + "grad_norm": 2.202794692504719, + "language_loss": 0.83472121, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91283023, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.19873047, + "step": 3842, + "time_per_iteration": 2.5885045528411865 + }, + { + "auxiliary_loss_clip": 0.06525495, + "auxiliary_loss_mlp": 0.01277864, + "balance_loss_clip": 0.06294134, + "balance_loss_mlp": 0.01256251, + "epoch": 0.23105366000300617, + "flos": 36004567478400.0, + "grad_norm": 1.5198018897685672, + "language_loss": 0.66582537, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.74385899, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.21630859, + "step": 3843, + "time_per_iteration": 2.7418570518493652 + }, + { + "auxiliary_loss_clip": 0.06517389, + "auxiliary_loss_mlp": 0.01282302, + "balance_loss_clip": 0.06289946, + "balance_loss_mlp": 0.01261667, + "epoch": 0.23111378325567414, + "flos": 31252822289280.0, + "grad_norm": 2.0273673860648613, + "language_loss": 0.77953953, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85753644, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2064209, + "step": 3844, + "time_per_iteration": 2.697105884552002 + }, + { + "auxiliary_loss_clip": 0.0652685, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.0629425, + "balance_loss_mlp": 0.01258618, + "epoch": 0.2311739065083421, + "flos": 19214230631040.0, + "grad_norm": 1.5733936305181, + "language_loss": 0.78526026, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86331779, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20275879, + "step": 3845, + "time_per_iteration": 3.9081645011901855 + }, + { + "auxiliary_loss_clip": 0.06512116, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06288872, + "balance_loss_mlp": 0.01256323, + "epoch": 0.23123402976101007, + "flos": 23365638443520.0, + "grad_norm": 2.144369954512039, + "language_loss": 0.7696318, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84750825, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.1920166, + "step": 3846, + "time_per_iteration": 2.5204334259033203 + }, + { + "auxiliary_loss_clip": 0.06530652, + "auxiliary_loss_mlp": 0.01280785, + "balance_loss_clip": 0.06296441, + "balance_loss_mlp": 0.01258946, + "epoch": 0.23129415301367803, + "flos": 13740458808960.0, + "grad_norm": 2.058546116129278, + "language_loss": 0.70736533, + "learning_rate": 3.589793599381304e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21862793, + "step": 3847, + "time_per_iteration": 3.955061197280884 + }, + { + "auxiliary_loss_clip": 0.06395237, + "auxiliary_loss_mlp": 0.01270099, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01264553, + "epoch": 0.231354276266346, + "flos": 69756907461120.0, + "grad_norm": 0.7764718422559022, + "language_loss": 0.60909712, + "learning_rate": 3.589557265446198e-06, + "loss": 0.68575048, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.05557251, + "step": 3848, + "time_per_iteration": 3.0406246185302734 + }, + { + "auxiliary_loss_clip": 0.0652846, + "auxiliary_loss_mlp": 0.0128118, + "balance_loss_clip": 0.06295802, + "balance_loss_mlp": 0.01259925, + "epoch": 0.231414399519014, + "flos": 18840597275520.0, + "grad_norm": 2.051565204924659, + "language_loss": 0.79345453, + "learning_rate": 3.589320871234923e-06, + "loss": 0.87155092, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21252441, + "step": 3849, + "time_per_iteration": 2.508357048034668 + }, + { + "auxiliary_loss_clip": 0.06525768, + "auxiliary_loss_mlp": 0.01279584, + "balance_loss_clip": 0.06294318, + "balance_loss_mlp": 0.01257995, + "epoch": 0.23147452277168196, + "flos": 36143949945600.0, + "grad_norm": 1.9799304996672493, + "language_loss": 0.72033536, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7983889, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.21594238, + "step": 3850, + "time_per_iteration": 2.6283209323883057 + }, + { + "auxiliary_loss_clip": 0.06522007, + "auxiliary_loss_mlp": 0.012814, + "balance_loss_clip": 0.06293751, + "balance_loss_mlp": 0.01260562, + "epoch": 0.23153464602434992, + "flos": 20819091081600.0, + "grad_norm": 2.1585980033328216, + "language_loss": 0.76770389, + "learning_rate": 3.588847902019718e-06, + "loss": 0.84573799, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20825195, + "step": 3851, + "time_per_iteration": 3.9542527198791504 + }, + { + "auxiliary_loss_clip": 0.06522575, + "auxiliary_loss_mlp": 0.01285563, + "balance_loss_clip": 0.06294242, + "balance_loss_mlp": 0.01264367, + "epoch": 0.2315947692770179, + "flos": 19945606993920.0, + "grad_norm": 4.396515099862161, + "language_loss": 0.70780337, + "learning_rate": 3.588611327033723e-06, + "loss": 0.78588474, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21191406, + "step": 3852, + "time_per_iteration": 2.5292365550994873 + }, + { + "auxiliary_loss_clip": 0.06530476, + "auxiliary_loss_mlp": 0.01287483, + "balance_loss_clip": 0.0629744, + "balance_loss_mlp": 0.01267027, + "epoch": 0.23165489252968585, + "flos": 12859805197440.0, + "grad_norm": 2.0519661349019906, + "language_loss": 0.68142366, + "learning_rate": 3.588374691807428e-06, + "loss": 0.75960326, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20471191, + "step": 3853, + "time_per_iteration": 2.524214267730713 + }, + { + "auxiliary_loss_clip": 0.06532255, + "auxiliary_loss_mlp": 0.0127975, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.01258579, + "epoch": 0.23171501578235382, + "flos": 30636202492800.0, + "grad_norm": 2.067759569090495, + "language_loss": 0.80620718, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88432729, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21166992, + "step": 3854, + "time_per_iteration": 3.9913628101348877 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.0128392, + "balance_loss_clip": 0.06299743, + "balance_loss_mlp": 0.0126201, + "epoch": 0.23177513903502178, + "flos": 23849709880320.0, + "grad_norm": 1.9679065377847755, + "language_loss": 0.66096866, + "learning_rate": 3.587901240669831e-06, + "loss": 0.73921382, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.21899414, + "step": 3855, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.06526054, + "auxiliary_loss_mlp": 0.0129156, + "balance_loss_clip": 0.06295231, + "balance_loss_mlp": 0.0126972, + "epoch": 0.23183526228768978, + "flos": 29578040006400.0, + "grad_norm": 1.903884891832667, + "language_loss": 0.71179903, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.78997517, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21838379, + "step": 3856, + "time_per_iteration": 2.602130174636841 + }, + { + "auxiliary_loss_clip": 0.06526691, + "auxiliary_loss_mlp": 0.01281572, + "balance_loss_clip": 0.06295416, + "balance_loss_mlp": 0.01261032, + "epoch": 0.23189538554035774, + "flos": 34467155164800.0, + "grad_norm": 1.5724941960823864, + "language_loss": 0.77830631, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85638893, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20532227, + "step": 3857, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.06534412, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06299518, + "balance_loss_mlp": 0.01259813, + "epoch": 0.2319555087930257, + "flos": 18009558080640.0, + "grad_norm": 2.2572913357008804, + "language_loss": 0.91563249, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99379921, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2244873, + "step": 3858, + "time_per_iteration": 2.532270908355713 + }, + { + "auxiliary_loss_clip": 0.06524485, + "auxiliary_loss_mlp": 0.01281992, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01261833, + "epoch": 0.23201563204569367, + "flos": 23149709671680.0, + "grad_norm": 2.204043049012761, + "language_loss": 0.77328205, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85134679, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20153809, + "step": 3859, + "time_per_iteration": 2.539982318878174 + }, + { + "auxiliary_loss_clip": 0.06526206, + "auxiliary_loss_mlp": 0.01282174, + "balance_loss_clip": 0.0629694, + "balance_loss_mlp": 0.01261098, + "epoch": 0.23207575529836164, + "flos": 20674300026240.0, + "grad_norm": 1.845949683873727, + "language_loss": 0.84980345, + "learning_rate": 3.58671655924898e-06, + "loss": 0.9278872, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21057129, + "step": 3860, + "time_per_iteration": 2.5464277267456055 + }, + { + "auxiliary_loss_clip": 0.06522566, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 0.06296555, + "balance_loss_mlp": 0.01254927, + "epoch": 0.2321358785510296, + "flos": 16477805917440.0, + "grad_norm": 2.2860023761203423, + "language_loss": 0.83316106, + "learning_rate": 3.586479442423508e-06, + "loss": 0.91114187, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.20581055, + "step": 3861, + "time_per_iteration": 2.611527681350708 + }, + { + "auxiliary_loss_clip": 0.06526297, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06296666, + "balance_loss_mlp": 0.01261198, + "epoch": 0.2321960018036976, + "flos": 21622737191040.0, + "grad_norm": 1.932164160561112, + "language_loss": 0.86100018, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93908012, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2052002, + "step": 3862, + "time_per_iteration": 2.599078893661499 + }, + { + "auxiliary_loss_clip": 0.06517789, + "auxiliary_loss_mlp": 0.01277863, + "balance_loss_clip": 0.0629621, + "balance_loss_mlp": 0.0125898, + "epoch": 0.23225612505636556, + "flos": 22277734957440.0, + "grad_norm": 1.8279700206037066, + "language_loss": 0.75524014, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.83319664, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18884277, + "step": 3863, + "time_per_iteration": 2.5592801570892334 + }, + { + "auxiliary_loss_clip": 0.06518993, + "auxiliary_loss_mlp": 0.01279608, + "balance_loss_clip": 0.06295245, + "balance_loss_mlp": 0.01260237, + "epoch": 0.23231624830903352, + "flos": 17057431336320.0, + "grad_norm": 1.8656538002376628, + "language_loss": 0.7504397, + "learning_rate": 3.58576773102631e-06, + "loss": 0.82842577, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.19372559, + "step": 3864, + "time_per_iteration": 2.549480438232422 + }, + { + "auxiliary_loss_clip": 0.06521947, + "auxiliary_loss_mlp": 0.01276148, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255572, + "epoch": 0.2323763715617015, + "flos": 34648353619200.0, + "grad_norm": 2.1960138476201023, + "language_loss": 0.70505309, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.78303403, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20568848, + "step": 3865, + "time_per_iteration": 2.6358752250671387 + }, + { + "auxiliary_loss_clip": 0.06539118, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06299968, + "balance_loss_mlp": 0.01256464, + "epoch": 0.23243649481436945, + "flos": 25557922742400.0, + "grad_norm": 1.8533317501805489, + "language_loss": 0.95648015, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.03467083, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23510742, + "step": 3866, + "time_per_iteration": 2.5805771350860596 + }, + { + "auxiliary_loss_clip": 0.06523386, + "auxiliary_loss_mlp": 0.0128215, + "balance_loss_clip": 0.06294955, + "balance_loss_mlp": 0.01260561, + "epoch": 0.23249661806703742, + "flos": 20489411992320.0, + "grad_norm": 3.3036871554572285, + "language_loss": 0.74161094, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.81966627, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21569824, + "step": 3867, + "time_per_iteration": 2.485872268676758 + }, + { + "auxiliary_loss_clip": 0.06527717, + "auxiliary_loss_mlp": 0.01278812, + "balance_loss_clip": 0.06298171, + "balance_loss_mlp": 0.01257271, + "epoch": 0.23255674131970538, + "flos": 20382956979840.0, + "grad_norm": 1.7596317335066716, + "language_loss": 0.82912898, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90719432, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2154541, + "step": 3868, + "time_per_iteration": 2.5404841899871826 + }, + { + "auxiliary_loss_clip": 0.06518516, + "auxiliary_loss_mlp": 0.01279395, + "balance_loss_clip": 0.0629604, + "balance_loss_mlp": 0.01260321, + "epoch": 0.23261686457237338, + "flos": 17061833675520.0, + "grad_norm": 1.6597028261056146, + "language_loss": 0.73686016, + "learning_rate": 3.58458034283495e-06, + "loss": 0.81483924, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.1907959, + "step": 3869, + "time_per_iteration": 2.4850685596466064 + }, + { + "auxiliary_loss_clip": 0.06524374, + "auxiliary_loss_mlp": 0.01289937, + "balance_loss_clip": 0.06296247, + "balance_loss_mlp": 0.01268241, + "epoch": 0.23267698782504134, + "flos": 29177726325120.0, + "grad_norm": 1.8030595092782438, + "language_loss": 0.8079325, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.88607562, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21716309, + "step": 3870, + "time_per_iteration": 2.5915870666503906 + }, + { + "auxiliary_loss_clip": 0.06532744, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.0126178, + "epoch": 0.2327371110777093, + "flos": 21180355960320.0, + "grad_norm": 1.9640097574691695, + "language_loss": 0.71693742, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.79509664, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21411133, + "step": 3871, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.065286, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 0.06295659, + "balance_loss_mlp": 0.01260034, + "epoch": 0.23279723433037727, + "flos": 24869997521280.0, + "grad_norm": 2.5352867939179484, + "language_loss": 0.69289309, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.77098656, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20715332, + "step": 3872, + "time_per_iteration": 2.5636072158813477 + }, + { + "auxiliary_loss_clip": 0.06535204, + "auxiliary_loss_mlp": 0.01285984, + "balance_loss_clip": 0.06299452, + "balance_loss_mlp": 0.01263894, + "epoch": 0.23285735758304524, + "flos": 38809823921280.0, + "grad_norm": 2.0709139139802497, + "language_loss": 0.78303361, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.86124545, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.22094727, + "step": 3873, + "time_per_iteration": 2.671551465988159 + }, + { + "auxiliary_loss_clip": 0.06419215, + "auxiliary_loss_mlp": 0.01286246, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01280601, + "epoch": 0.2329174808357132, + "flos": 53962274280960.0, + "grad_norm": 0.8377063316545934, + "language_loss": 0.60286367, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.67991829, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05636597, + "step": 3874, + "time_per_iteration": 3.087822675704956 + }, + { + "auxiliary_loss_clip": 0.06525364, + "auxiliary_loss_mlp": 0.01281697, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.012608, + "epoch": 0.23297760408838117, + "flos": 21222549290880.0, + "grad_norm": 2.3064833177652773, + "language_loss": 0.81324208, + "learning_rate": 3.583153494218927e-06, + "loss": 0.89131272, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.20898438, + "step": 3875, + "time_per_iteration": 2.560511589050293 + }, + { + "auxiliary_loss_clip": 0.06520373, + "auxiliary_loss_mlp": 0.01275593, + "balance_loss_clip": 0.06294609, + "balance_loss_mlp": 0.01255983, + "epoch": 0.23303772734104916, + "flos": 28410613395840.0, + "grad_norm": 2.285945976693144, + "language_loss": 0.62077069, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.69873035, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19628906, + "step": 3876, + "time_per_iteration": 2.63901948928833 + }, + { + "auxiliary_loss_clip": 0.06525883, + "auxiliary_loss_mlp": 0.01277799, + "balance_loss_clip": 0.06296121, + "balance_loss_mlp": 0.01258034, + "epoch": 0.23309785059371713, + "flos": 24321328986240.0, + "grad_norm": 1.9984006432494335, + "language_loss": 0.71087664, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.78891349, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19787598, + "step": 3877, + "time_per_iteration": 2.533858299255371 + }, + { + "auxiliary_loss_clip": 0.06524412, + "auxiliary_loss_mlp": 0.01274037, + "balance_loss_clip": 0.06297307, + "balance_loss_mlp": 0.01253485, + "epoch": 0.2331579738463851, + "flos": 15997633695360.0, + "grad_norm": 2.4085120625047143, + "language_loss": 0.81286502, + "learning_rate": 3.582439259339073e-06, + "loss": 0.89084947, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20556641, + "step": 3878, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06534204, + "auxiliary_loss_mlp": 0.01280932, + "balance_loss_clip": 0.06299698, + "balance_loss_mlp": 0.0126013, + "epoch": 0.23321809709905306, + "flos": 36435418773120.0, + "grad_norm": 2.3738521781051207, + "language_loss": 0.75046253, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82861388, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20788574, + "step": 3879, + "time_per_iteration": 2.6389944553375244 + }, + { + "auxiliary_loss_clip": 0.06528227, + "auxiliary_loss_mlp": 0.01279465, + "balance_loss_clip": 0.06299725, + "balance_loss_mlp": 0.01257972, + "epoch": 0.23327822035172102, + "flos": 21331184509440.0, + "grad_norm": 4.081669167605711, + "language_loss": 0.90526301, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.98333991, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.21496582, + "step": 3880, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.06530303, + "auxiliary_loss_mlp": 0.01278258, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125841, + "epoch": 0.233338343604389, + "flos": 19177907086080.0, + "grad_norm": 1.8856968798779488, + "language_loss": 0.72716117, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80524671, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.19848633, + "step": 3881, + "time_per_iteration": 2.528083324432373 + }, + { + "auxiliary_loss_clip": 0.0653114, + "auxiliary_loss_mlp": 0.01278184, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0125805, + "epoch": 0.23339846685705698, + "flos": 26915939464320.0, + "grad_norm": 1.6578041146422486, + "language_loss": 0.68699455, + "learning_rate": 3.581486106120537e-06, + "loss": 0.76508778, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20129395, + "step": 3882, + "time_per_iteration": 2.575275182723999 + }, + { + "auxiliary_loss_clip": 0.06529698, + "auxiliary_loss_mlp": 0.0127867, + "balance_loss_clip": 0.0629693, + "balance_loss_mlp": 0.01258226, + "epoch": 0.23345859010972494, + "flos": 32351375243520.0, + "grad_norm": 2.0584115637368767, + "language_loss": 0.77458596, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.8526696, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 3883, + "time_per_iteration": 2.626533269882202 + }, + { + "auxiliary_loss_clip": 0.06405331, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01257663, + "epoch": 0.2335187133623929, + "flos": 58505805273600.0, + "grad_norm": 0.7704933603606158, + "language_loss": 0.59193355, + "learning_rate": 3.58100916965445e-06, + "loss": 0.66861278, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.04925537, + "step": 3884, + "time_per_iteration": 4.6365087032318115 + }, + { + "auxiliary_loss_clip": 0.06533933, + "auxiliary_loss_mlp": 0.01280044, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01260017, + "epoch": 0.23357883661506088, + "flos": 24509822745600.0, + "grad_norm": 1.6610169782824564, + "language_loss": 0.80755335, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.88569313, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.20031738, + "step": 3885, + "time_per_iteration": 2.6180286407470703 + }, + { + "auxiliary_loss_clip": 0.06523974, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 0.06296945, + "balance_loss_mlp": 0.01265687, + "epoch": 0.23363895986772884, + "flos": 18953760614400.0, + "grad_norm": 2.3207575064623613, + "language_loss": 0.88500953, + "learning_rate": 3.580531993380261e-06, + "loss": 0.96311754, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21130371, + "step": 3886, + "time_per_iteration": 2.5116477012634277 + }, + { + "auxiliary_loss_clip": 0.06532702, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 0.06302926, + "balance_loss_mlp": 0.01262518, + "epoch": 0.2336990831203968, + "flos": 31694993884800.0, + "grad_norm": 1.8877154320423692, + "language_loss": 0.74203557, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.82019114, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20336914, + "step": 3887, + "time_per_iteration": 4.024793863296509 + }, + { + "auxiliary_loss_clip": 0.06531121, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01261206, + "epoch": 0.23375920637306477, + "flos": 27717237659520.0, + "grad_norm": 1.8176198265631485, + "language_loss": 0.84478307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.92290735, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20092773, + "step": 3888, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01283639, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01263934, + "epoch": 0.23381932962573276, + "flos": 17681346437760.0, + "grad_norm": 2.056965631559896, + "language_loss": 0.88319886, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.96128076, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19689941, + "step": 3889, + "time_per_iteration": 2.524937152862549 + }, + { + "auxiliary_loss_clip": 0.06524722, + "auxiliary_loss_mlp": 0.01282198, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01262708, + "epoch": 0.23387945287840073, + "flos": 14395833918720.0, + "grad_norm": 2.5361674913720487, + "language_loss": 0.7777229, + "learning_rate": 3.579576921697125e-06, + "loss": 0.85579211, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19470215, + "step": 3890, + "time_per_iteration": 4.02982497215271 + }, + { + "auxiliary_loss_clip": 0.06526545, + "auxiliary_loss_mlp": 0.01284178, + "balance_loss_clip": 0.06297928, + "balance_loss_mlp": 0.01264008, + "epoch": 0.2339395761310687, + "flos": 46108451888640.0, + "grad_norm": 1.897831891943022, + "language_loss": 0.74213481, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82024205, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20166016, + "step": 3891, + "time_per_iteration": 2.7951042652130127 + }, + { + "auxiliary_loss_clip": 0.06524959, + "auxiliary_loss_mlp": 0.01281513, + "balance_loss_clip": 0.06301059, + "balance_loss_mlp": 0.01262821, + "epoch": 0.23399969938373666, + "flos": 22388508455040.0, + "grad_norm": 1.6273389699862264, + "language_loss": 0.82863498, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.90669972, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18688965, + "step": 3892, + "time_per_iteration": 2.530782461166382 + }, + { + "auxiliary_loss_clip": 0.06531358, + "auxiliary_loss_mlp": 0.01281181, + "balance_loss_clip": 0.06301633, + "balance_loss_mlp": 0.01260951, + "epoch": 0.23405982263640462, + "flos": 43518746874240.0, + "grad_norm": 1.4575042253356143, + "language_loss": 0.65593249, + "learning_rate": 3.578859988977082e-06, + "loss": 0.7340579, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20227051, + "step": 3893, + "time_per_iteration": 4.212572813034058 + }, + { + "auxiliary_loss_clip": 0.06519544, + "auxiliary_loss_mlp": 0.01283369, + "balance_loss_clip": 0.06297972, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2341199458890726, + "flos": 22571216282880.0, + "grad_norm": 2.0084649252152564, + "language_loss": 0.79620147, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.87423062, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.20117188, + "step": 3894, + "time_per_iteration": 2.580109119415283 + }, + { + "auxiliary_loss_clip": 0.06524212, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 0.06300013, + "balance_loss_mlp": 0.01257763, + "epoch": 0.23418006914174055, + "flos": 25641764352000.0, + "grad_norm": 1.5130292757453454, + "language_loss": 0.82681906, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.90482563, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18664551, + "step": 3895, + "time_per_iteration": 2.583759069442749 + }, + { + "auxiliary_loss_clip": 0.06520028, + "auxiliary_loss_mlp": 0.01278233, + "balance_loss_clip": 0.06295593, + "balance_loss_mlp": 0.0125885, + "epoch": 0.23424019239440855, + "flos": 13549826770560.0, + "grad_norm": 2.4592405022159496, + "language_loss": 0.81334293, + "learning_rate": 3.578142517422292e-06, + "loss": 0.89132559, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.19372559, + "step": 3896, + "time_per_iteration": 2.536252021789551 + }, + { + "auxiliary_loss_clip": 0.06530771, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06299435, + "balance_loss_mlp": 0.01264253, + "epoch": 0.2343003156470765, + "flos": 22426131738240.0, + "grad_norm": 3.0940729647414598, + "language_loss": 0.83988011, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91805482, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 3897, + "time_per_iteration": 2.572230577468872 + }, + { + "auxiliary_loss_clip": 0.06528857, + "auxiliary_loss_mlp": 0.01279177, + "balance_loss_clip": 0.06296414, + "balance_loss_mlp": 0.01258626, + "epoch": 0.23436043889974448, + "flos": 14795644475520.0, + "grad_norm": 2.317273344502078, + "language_loss": 0.79819012, + "learning_rate": 3.577663903820705e-06, + "loss": 0.87627041, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20544434, + "step": 3898, + "time_per_iteration": 2.5207583904266357 + }, + { + "auxiliary_loss_clip": 0.0651897, + "auxiliary_loss_mlp": 0.01278878, + "balance_loss_clip": 0.06297988, + "balance_loss_mlp": 0.0126021, + "epoch": 0.23442056215241244, + "flos": 22972242723840.0, + "grad_norm": 1.88849810547605, + "language_loss": 0.7476474, + "learning_rate": 3.577424507277614e-06, + "loss": 0.82562584, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18676758, + "step": 3899, + "time_per_iteration": 2.535256862640381 + }, + { + "auxiliary_loss_clip": 0.06525272, + "auxiliary_loss_mlp": 0.01280019, + "balance_loss_clip": 0.06296974, + "balance_loss_mlp": 0.01259515, + "epoch": 0.2344806854050804, + "flos": 23077901122560.0, + "grad_norm": 1.7218865416029, + "language_loss": 0.75599915, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.83405209, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20507812, + "step": 3900, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.06524841, + "auxiliary_loss_mlp": 0.01281356, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01260959, + "epoch": 0.23454080865774837, + "flos": 16332805226880.0, + "grad_norm": 2.155964713283421, + "language_loss": 0.67468774, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75274968, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20410156, + "step": 3901, + "time_per_iteration": 2.536736249923706 + }, + { + "auxiliary_loss_clip": 0.06415819, + "auxiliary_loss_mlp": 0.01256149, + "balance_loss_clip": 0.06299057, + "balance_loss_mlp": 0.01251181, + "epoch": 0.23460093191041637, + "flos": 67779545685120.0, + "grad_norm": 0.7514179301091559, + "language_loss": 0.58278525, + "learning_rate": 3.576705958788091e-06, + "loss": 0.65950489, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.0496521, + "step": 3902, + "time_per_iteration": 3.134718894958496 + }, + { + "auxiliary_loss_clip": 0.06519462, + "auxiliary_loss_mlp": 0.01278211, + "balance_loss_clip": 0.06292997, + "balance_loss_mlp": 0.01258375, + "epoch": 0.23466105516308433, + "flos": 20082725400960.0, + "grad_norm": 4.781089560028637, + "language_loss": 0.80931306, + "learning_rate": 3.576466323035108e-06, + "loss": 0.88728976, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19836426, + "step": 3903, + "time_per_iteration": 2.525059938430786 + }, + { + "auxiliary_loss_clip": 0.06522641, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06295069, + "balance_loss_mlp": 0.01258955, + "epoch": 0.2347211784157523, + "flos": 24542708273280.0, + "grad_norm": 1.8578223556950417, + "language_loss": 0.82988703, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90790236, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.19909668, + "step": 3904, + "time_per_iteration": 2.5903875827789307 + }, + { + "auxiliary_loss_clip": 0.0652332, + "auxiliary_loss_mlp": 0.01285911, + "balance_loss_clip": 0.06295672, + "balance_loss_mlp": 0.01265562, + "epoch": 0.23478130166842026, + "flos": 23811751180800.0, + "grad_norm": 1.985666710181995, + "language_loss": 0.7223646, + "learning_rate": 3.57598687219895e-06, + "loss": 0.80045688, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20361328, + "step": 3905, + "time_per_iteration": 2.5441884994506836 + }, + { + "auxiliary_loss_clip": 0.06517074, + "auxiliary_loss_mlp": 0.01274876, + "balance_loss_clip": 0.06294023, + "balance_loss_mlp": 0.01255564, + "epoch": 0.23484142492108823, + "flos": 24099823918080.0, + "grad_norm": 2.433861192511871, + "language_loss": 0.71703601, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.19311523, + "step": 3906, + "time_per_iteration": 2.698309898376465 + }, + { + "auxiliary_loss_clip": 0.06533175, + "auxiliary_loss_mlp": 0.01285298, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01264341, + "epoch": 0.2349015481737562, + "flos": 29103486007680.0, + "grad_norm": 2.7858195598302014, + "language_loss": 0.74089986, + "learning_rate": 3.575507182316473e-06, + "loss": 0.81908458, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20959473, + "step": 3907, + "time_per_iteration": 2.578900098800659 + }, + { + "auxiliary_loss_clip": 0.06524273, + "auxiliary_loss_mlp": 0.01280946, + "balance_loss_clip": 0.06294693, + "balance_loss_mlp": 0.01260418, + "epoch": 0.23496167142642416, + "flos": 18922258679040.0, + "grad_norm": 2.1308722973133385, + "language_loss": 0.73705935, + "learning_rate": 3.575267247755601e-06, + "loss": 0.81511152, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2052002, + "step": 3908, + "time_per_iteration": 2.599888801574707 + }, + { + "auxiliary_loss_clip": 0.06415461, + "auxiliary_loss_mlp": 0.01265268, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01259901, + "epoch": 0.23502179467909215, + "flos": 55884906541440.0, + "grad_norm": 1.2475277524680826, + "language_loss": 0.73364127, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.81044865, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05374146, + "step": 3909, + "time_per_iteration": 2.9221227169036865 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 0.06297419, + "balance_loss_mlp": 0.01265013, + "epoch": 0.23508191793176011, + "flos": 23408083336320.0, + "grad_norm": 1.6005271399570604, + "language_loss": 0.88581395, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9639076, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20288086, + "step": 3910, + "time_per_iteration": 2.571974277496338 + }, + { + "auxiliary_loss_clip": 0.06520193, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01258658, + "epoch": 0.23514204118442808, + "flos": 20053864869120.0, + "grad_norm": 1.9643755437340527, + "language_loss": 0.76589572, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.84388608, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2019043, + "step": 3911, + "time_per_iteration": 2.5159506797790527 + }, + { + "auxiliary_loss_clip": 0.06514487, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 0.06293596, + "balance_loss_mlp": 0.01272568, + "epoch": 0.23520216443709605, + "flos": 21587126405760.0, + "grad_norm": 1.5390832092388007, + "language_loss": 0.82200038, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.90005672, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 3912, + "time_per_iteration": 2.53330135345459 + }, + { + "auxiliary_loss_clip": 0.06515642, + "auxiliary_loss_mlp": 0.01288785, + "balance_loss_clip": 0.06294793, + "balance_loss_mlp": 0.01269604, + "epoch": 0.235262287689764, + "flos": 23192573834880.0, + "grad_norm": 1.8330232089961167, + "language_loss": 0.72023201, + "learning_rate": 3.574066679118909e-06, + "loss": 0.79827625, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19177246, + "step": 3913, + "time_per_iteration": 2.5643818378448486 + }, + { + "auxiliary_loss_clip": 0.06528541, + "auxiliary_loss_mlp": 0.01277731, + "balance_loss_clip": 0.0629672, + "balance_loss_mlp": 0.01257238, + "epoch": 0.23532241094243198, + "flos": 23191903002240.0, + "grad_norm": 1.784539383466316, + "language_loss": 0.76976919, + "learning_rate": 3.57382638628884e-06, + "loss": 0.84783185, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20483398, + "step": 3914, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.06525879, + "auxiliary_loss_mlp": 0.01279953, + "balance_loss_clip": 0.06294835, + "balance_loss_mlp": 0.01259759, + "epoch": 0.23538253419509997, + "flos": 17025007006080.0, + "grad_norm": 2.4875564397369745, + "language_loss": 0.90170735, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.97976559, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2019043, + "step": 3915, + "time_per_iteration": 2.563430070877075 + }, + { + "auxiliary_loss_clip": 0.06418007, + "auxiliary_loss_mlp": 0.01258116, + "balance_loss_clip": 0.06301998, + "balance_loss_mlp": 0.0125336, + "epoch": 0.23544265744776793, + "flos": 63465276263040.0, + "grad_norm": 0.7933859009920101, + "language_loss": 0.59378946, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6705507, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04748535, + "step": 3916, + "time_per_iteration": 3.0965490341186523 + }, + { + "auxiliary_loss_clip": 0.06410776, + "auxiliary_loss_mlp": 0.01260488, + "balance_loss_clip": 0.06295535, + "balance_loss_mlp": 0.01255756, + "epoch": 0.2355027807004359, + "flos": 70537395116160.0, + "grad_norm": 0.7426668339088592, + "language_loss": 0.49443412, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.57114673, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04724121, + "step": 3917, + "time_per_iteration": 3.180136203765869 + }, + { + "auxiliary_loss_clip": 0.06525698, + "auxiliary_loss_mlp": 0.01279416, + "balance_loss_clip": 0.06297344, + "balance_loss_mlp": 0.01259687, + "epoch": 0.23556290395310386, + "flos": 21440742122880.0, + "grad_norm": 2.189382839240281, + "language_loss": 0.77017808, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.84822929, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19714355, + "step": 3918, + "time_per_iteration": 2.546833038330078 + }, + { + "auxiliary_loss_clip": 0.0652653, + "auxiliary_loss_mlp": 0.01274201, + "balance_loss_clip": 0.06294574, + "balance_loss_mlp": 0.01254353, + "epoch": 0.23562302720577183, + "flos": 18192223981440.0, + "grad_norm": 2.402769767514051, + "language_loss": 0.70165813, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.77966547, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.1986084, + "step": 3919, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.06516096, + "auxiliary_loss_mlp": 0.01279326, + "balance_loss_clip": 0.06294449, + "balance_loss_mlp": 0.0125999, + "epoch": 0.2356831504584398, + "flos": 33739091038080.0, + "grad_norm": 1.6359966895302622, + "language_loss": 0.71094656, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.78890085, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19335938, + "step": 3920, + "time_per_iteration": 2.672703504562378 + }, + { + "auxiliary_loss_clip": 0.065192, + "auxiliary_loss_mlp": 0.0127625, + "balance_loss_clip": 0.06295229, + "balance_loss_mlp": 0.0125707, + "epoch": 0.23574327371110776, + "flos": 24939122739840.0, + "grad_norm": 1.9300596293530992, + "language_loss": 0.77833009, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85628462, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.19189453, + "step": 3921, + "time_per_iteration": 2.5823934078216553 + }, + { + "auxiliary_loss_clip": 0.06519832, + "auxiliary_loss_mlp": 0.01273471, + "balance_loss_clip": 0.06293498, + "balance_loss_mlp": 0.01254898, + "epoch": 0.23580339696377575, + "flos": 17827940355840.0, + "grad_norm": 2.282195745019935, + "language_loss": 0.76750088, + "learning_rate": 3.571901895946612e-06, + "loss": 0.84543383, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18566895, + "step": 3922, + "time_per_iteration": 2.5005834102630615 + }, + { + "auxiliary_loss_clip": 0.06518443, + "auxiliary_loss_mlp": 0.01276376, + "balance_loss_clip": 0.06292558, + "balance_loss_mlp": 0.01257255, + "epoch": 0.23586352021644372, + "flos": 26293827225600.0, + "grad_norm": 2.0102031772622277, + "language_loss": 0.80626559, + "learning_rate": 3.571661066327956e-06, + "loss": 0.88421381, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19128418, + "step": 3923, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0652013, + "auxiliary_loss_mlp": 0.01275781, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01256326, + "epoch": 0.23592364346911168, + "flos": 14251965258240.0, + "grad_norm": 1.780788070615976, + "language_loss": 0.7507394, + "learning_rate": 3.571420177111754e-06, + "loss": 0.82869852, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3924, + "time_per_iteration": 3.9297289848327637 + }, + { + "auxiliary_loss_clip": 0.06516001, + "auxiliary_loss_mlp": 0.01276934, + "balance_loss_clip": 0.06293369, + "balance_loss_mlp": 0.01258039, + "epoch": 0.23598376672177965, + "flos": 18593837400960.0, + "grad_norm": 1.7528516859224217, + "language_loss": 0.83231425, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.91024363, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 3925, + "time_per_iteration": 2.5267770290374756 + }, + { + "auxiliary_loss_clip": 0.06520985, + "auxiliary_loss_mlp": 0.01279855, + "balance_loss_clip": 0.06293195, + "balance_loss_mlp": 0.01259673, + "epoch": 0.2360438899744476, + "flos": 22682325196800.0, + "grad_norm": 1.753261892654821, + "language_loss": 0.60038519, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20178223, + "step": 3926, + "time_per_iteration": 4.023118257522583 + }, + { + "auxiliary_loss_clip": 0.06514051, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06293727, + "balance_loss_mlp": 0.01257735, + "epoch": 0.23610401322711558, + "flos": 29577872298240.0, + "grad_norm": 1.9607796947198142, + "language_loss": 0.72402066, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80192792, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1895752, + "step": 3927, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.06515504, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01256373, + "epoch": 0.23616413647978354, + "flos": 17864347754880.0, + "grad_norm": 2.08357001670468, + "language_loss": 0.75570691, + "learning_rate": 3.570456024454221e-06, + "loss": 0.83361489, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18920898, + "step": 3928, + "time_per_iteration": 2.601884365081787 + }, + { + "auxiliary_loss_clip": 0.06522287, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06293722, + "balance_loss_mlp": 0.01260338, + "epoch": 0.23622425973245154, + "flos": 11039393318400.0, + "grad_norm": 3.3378461006384788, + "language_loss": 0.82518888, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.903216, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20080566, + "step": 3929, + "time_per_iteration": 3.9035136699676514 + }, + { + "auxiliary_loss_clip": 0.0652993, + "auxiliary_loss_mlp": 0.01281554, + "balance_loss_clip": 0.06295136, + "balance_loss_mlp": 0.01261228, + "epoch": 0.2362843829851195, + "flos": 23410766666880.0, + "grad_norm": 2.0127268398029607, + "language_loss": 0.7229315, + "learning_rate": 3.569973590777789e-06, + "loss": 0.80104637, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.203125, + "step": 3930, + "time_per_iteration": 2.5537455081939697 + }, + { + "auxiliary_loss_clip": 0.06516138, + "auxiliary_loss_mlp": 0.01275778, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01257312, + "epoch": 0.23634450623778747, + "flos": 39539103932160.0, + "grad_norm": 1.8975533795335693, + "language_loss": 0.74476141, + "learning_rate": 3.569732284634665e-06, + "loss": 0.82268059, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.18444824, + "step": 3931, + "time_per_iteration": 2.6975677013397217 + }, + { + "auxiliary_loss_clip": 0.06517775, + "auxiliary_loss_mlp": 0.01279269, + "balance_loss_clip": 0.06291172, + "balance_loss_mlp": 0.01260208, + "epoch": 0.23640462949045543, + "flos": 24214077360000.0, + "grad_norm": 2.102820580807434, + "language_loss": 0.8105433, + "learning_rate": 3.569490918967136e-06, + "loss": 0.88851368, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19055176, + "step": 3932, + "time_per_iteration": 2.539280652999878 + }, + { + "auxiliary_loss_clip": 0.06510118, + "auxiliary_loss_mlp": 0.01272436, + "balance_loss_clip": 0.06289183, + "balance_loss_mlp": 0.01254949, + "epoch": 0.2364647527431234, + "flos": 26184898517760.0, + "grad_norm": 1.6370407311570319, + "language_loss": 0.85819322, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.93601882, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.17480469, + "step": 3933, + "time_per_iteration": 4.0140979290008545 + }, + { + "auxiliary_loss_clip": 0.06528582, + "auxiliary_loss_mlp": 0.01277532, + "balance_loss_clip": 0.06296912, + "balance_loss_mlp": 0.01257314, + "epoch": 0.23652487599579136, + "flos": 22643444102400.0, + "grad_norm": 3.233125821654351, + "language_loss": 0.83709848, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.91515964, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.20214844, + "step": 3934, + "time_per_iteration": 2.542692184448242 + }, + { + "auxiliary_loss_clip": 0.06519171, + "auxiliary_loss_mlp": 0.01281493, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01262896, + "epoch": 0.23658499924845935, + "flos": 21768702203520.0, + "grad_norm": 1.7174434370199074, + "language_loss": 0.7898351, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.86784172, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.18615723, + "step": 3935, + "time_per_iteration": 2.5311288833618164 + }, + { + "auxiliary_loss_clip": 0.0651848, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06296465, + "balance_loss_mlp": 0.01258533, + "epoch": 0.23664512250112732, + "flos": 21805486945920.0, + "grad_norm": 1.7511193987533888, + "language_loss": 0.80239666, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.88034987, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1829834, + "step": 3936, + "time_per_iteration": 2.5497477054595947 + }, + { + "auxiliary_loss_clip": 0.06513149, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06288509, + "balance_loss_mlp": 0.01256593, + "epoch": 0.23670524575379528, + "flos": 22644450351360.0, + "grad_norm": 1.4782770271817958, + "language_loss": 0.79820013, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8760916, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19396973, + "step": 3937, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.06515164, + "auxiliary_loss_mlp": 0.0127913, + "balance_loss_clip": 0.06294726, + "balance_loss_mlp": 0.01261487, + "epoch": 0.23676536900646325, + "flos": 16730225942400.0, + "grad_norm": 2.2850190898814686, + "language_loss": 0.85810506, + "learning_rate": 3.568041475462147e-06, + "loss": 0.93604803, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.1763916, + "step": 3938, + "time_per_iteration": 2.568195343017578 + }, + { + "auxiliary_loss_clip": 0.06509314, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06288411, + "balance_loss_mlp": 0.01259393, + "epoch": 0.23682549225913122, + "flos": 11138720734080.0, + "grad_norm": 3.1023600205020876, + "language_loss": 0.94564033, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.02351999, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19287109, + "step": 3939, + "time_per_iteration": 2.4615180492401123 + }, + { + "auxiliary_loss_clip": 0.0652378, + "auxiliary_loss_mlp": 0.01277473, + "balance_loss_clip": 0.06294175, + "balance_loss_mlp": 0.0125803, + "epoch": 0.23688561551179918, + "flos": 22564843372800.0, + "grad_norm": 5.475058210638743, + "language_loss": 0.82803464, + "learning_rate": 3.567557851847088e-06, + "loss": 0.90604717, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19445801, + "step": 3940, + "time_per_iteration": 2.573552131652832 + }, + { + "auxiliary_loss_clip": 0.06531326, + "auxiliary_loss_mlp": 0.01276996, + "balance_loss_clip": 0.06295921, + "balance_loss_mlp": 0.0125679, + "epoch": 0.23694573876446715, + "flos": 18520771040640.0, + "grad_norm": 2.098492916494123, + "language_loss": 0.8946867, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.97276992, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2019043, + "step": 3941, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.06529268, + "auxiliary_loss_mlp": 0.01286958, + "balance_loss_clip": 0.06297106, + "balance_loss_mlp": 0.01267503, + "epoch": 0.23700586201713514, + "flos": 15340246087680.0, + "grad_norm": 1.8886698836060631, + "language_loss": 0.84989077, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.92805308, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19458008, + "step": 3942, + "time_per_iteration": 2.56052827835083 + }, + { + "auxiliary_loss_clip": 0.06538361, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 0.06307331, + "balance_loss_mlp": 0.01265492, + "epoch": 0.2370659852698031, + "flos": 23953775051520.0, + "grad_norm": 2.0845511028002197, + "language_loss": 0.81156456, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.88980681, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20373535, + "step": 3943, + "time_per_iteration": 2.539264678955078 + }, + { + "auxiliary_loss_clip": 0.06543057, + "auxiliary_loss_mlp": 0.01292355, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01271494, + "epoch": 0.23712610852247107, + "flos": 15336514581120.0, + "grad_norm": 2.5863771047568926, + "language_loss": 0.682428, + "learning_rate": 3.566589891386959e-06, + "loss": 0.76078212, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20861816, + "step": 3944, + "time_per_iteration": 2.520453929901123 + }, + { + "auxiliary_loss_clip": 0.06529288, + "auxiliary_loss_mlp": 0.01297026, + "balance_loss_clip": 0.06299931, + "balance_loss_mlp": 0.01276963, + "epoch": 0.23718623177513903, + "flos": 19688658848640.0, + "grad_norm": 1.6926271274644824, + "language_loss": 0.76068223, + "learning_rate": 3.566347752735866e-06, + "loss": 0.83894539, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.20043945, + "step": 3945, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.06535566, + "auxiliary_loss_mlp": 0.01288141, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.0126859, + "epoch": 0.237246355027807, + "flos": 24980351748480.0, + "grad_norm": 1.7408538946114391, + "language_loss": 0.63962567, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.71786278, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19555664, + "step": 3946, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.06535441, + "auxiliary_loss_mlp": 0.01289697, + "balance_loss_clip": 0.06306995, + "balance_loss_mlp": 0.01269324, + "epoch": 0.23730647828047496, + "flos": 15382816761600.0, + "grad_norm": 3.1254224655104252, + "language_loss": 0.77114201, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.84939343, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20385742, + "step": 3947, + "time_per_iteration": 2.495837926864624 + }, + { + "auxiliary_loss_clip": 0.06540522, + "auxiliary_loss_mlp": 0.01290208, + "balance_loss_clip": 0.06311937, + "balance_loss_mlp": 0.01270431, + "epoch": 0.23736660153314296, + "flos": 28158738422400.0, + "grad_norm": 1.595292591120463, + "language_loss": 0.80941439, + "learning_rate": 3.565620980442944e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19775391, + "step": 3948, + "time_per_iteration": 2.6460211277008057 + }, + { + "auxiliary_loss_clip": 0.06542704, + "auxiliary_loss_mlp": 0.01297731, + "balance_loss_clip": 0.06312679, + "balance_loss_mlp": 0.01277025, + "epoch": 0.23742672478581092, + "flos": 22092385726080.0, + "grad_norm": 1.753357741589714, + "language_loss": 0.80419362, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.88259804, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.20715332, + "step": 3949, + "time_per_iteration": 2.5428664684295654 + }, + { + "auxiliary_loss_clip": 0.06549721, + "auxiliary_loss_mlp": 0.01294419, + "balance_loss_clip": 0.06317213, + "balance_loss_mlp": 0.012732, + "epoch": 0.2374868480384789, + "flos": 19543238887680.0, + "grad_norm": 1.6923054699564082, + "language_loss": 0.73375976, + "learning_rate": 3.565136168723163e-06, + "loss": 0.81220114, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2121582, + "step": 3950, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.06527583, + "auxiliary_loss_mlp": 0.01288007, + "balance_loss_clip": 0.06302388, + "balance_loss_mlp": 0.01268957, + "epoch": 0.23754697129114685, + "flos": 19427769561600.0, + "grad_norm": 1.893051910973559, + "language_loss": 0.73254943, + "learning_rate": 3.564893673833495e-06, + "loss": 0.8107053, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.1907959, + "step": 3951, + "time_per_iteration": 2.501091957092285 + }, + { + "auxiliary_loss_clip": 0.06543966, + "auxiliary_loss_mlp": 0.01301622, + "balance_loss_clip": 0.06315006, + "balance_loss_mlp": 0.01280332, + "epoch": 0.23760709454381482, + "flos": 19507208832000.0, + "grad_norm": 1.727887568846887, + "language_loss": 0.7427932, + "learning_rate": 3.564651119602903e-06, + "loss": 0.82124901, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2130127, + "step": 3952, + "time_per_iteration": 2.5467019081115723 + }, + { + "auxiliary_loss_clip": 0.06536686, + "auxiliary_loss_mlp": 0.01292988, + "balance_loss_clip": 0.0630881, + "balance_loss_mlp": 0.01273379, + "epoch": 0.23766721779648278, + "flos": 27644045518080.0, + "grad_norm": 3.105577179216311, + "language_loss": 0.71633041, + "learning_rate": 3.564408506040583e-06, + "loss": 0.79462719, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.19604492, + "step": 3953, + "time_per_iteration": 2.599946975708008 + }, + { + "auxiliary_loss_clip": 0.06537458, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06305911, + "balance_loss_mlp": 0.01272673, + "epoch": 0.23772734104915075, + "flos": 23411102083200.0, + "grad_norm": 6.547469437533346, + "language_loss": 0.82534778, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.90365064, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20166016, + "step": 3954, + "time_per_iteration": 2.595163583755493 + }, + { + "auxiliary_loss_clip": 0.06538694, + "auxiliary_loss_mlp": 0.01291334, + "balance_loss_clip": 0.0630859, + "balance_loss_mlp": 0.01271486, + "epoch": 0.23778746430181874, + "flos": 15710902623360.0, + "grad_norm": 2.2065720754909606, + "language_loss": 0.66202033, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.74032056, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.19848633, + "step": 3955, + "time_per_iteration": 2.5345511436462402 + }, + { + "auxiliary_loss_clip": 0.06527859, + "auxiliary_loss_mlp": 0.01285762, + "balance_loss_clip": 0.06301668, + "balance_loss_mlp": 0.01266081, + "epoch": 0.2378475875544867, + "flos": 19432381536000.0, + "grad_norm": 1.4478942147045952, + "language_loss": 0.84203303, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.92016923, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19689941, + "step": 3956, + "time_per_iteration": 2.5458483695983887 + }, + { + "auxiliary_loss_clip": 0.06526335, + "auxiliary_loss_mlp": 0.01287929, + "balance_loss_clip": 0.06303546, + "balance_loss_mlp": 0.01268438, + "epoch": 0.23790771080715467, + "flos": 22274338867200.0, + "grad_norm": 2.194064451149358, + "language_loss": 0.8561964, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.93433905, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.19494629, + "step": 3957, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.06532466, + "auxiliary_loss_mlp": 0.01283677, + "balance_loss_clip": 0.0630599, + "balance_loss_mlp": 0.01264008, + "epoch": 0.23796783405982264, + "flos": 20053445598720.0, + "grad_norm": 2.4454692262909856, + "language_loss": 0.7073434, + "learning_rate": 3.563194548575151e-06, + "loss": 0.78550482, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19665527, + "step": 3958, + "time_per_iteration": 2.556201219558716 + }, + { + "auxiliary_loss_clip": 0.06533751, + "auxiliary_loss_mlp": 0.01277914, + "balance_loss_clip": 0.06301822, + "balance_loss_mlp": 0.01257303, + "epoch": 0.2380279573124906, + "flos": 14251084790400.0, + "grad_norm": 4.548053192599961, + "language_loss": 0.66760004, + "learning_rate": 3.562951579215745e-06, + "loss": 0.74571669, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.20617676, + "step": 3959, + "time_per_iteration": 2.491999626159668 + }, + { + "auxiliary_loss_clip": 0.06529753, + "auxiliary_loss_mlp": 0.01278003, + "balance_loss_clip": 0.06303047, + "balance_loss_mlp": 0.01259228, + "epoch": 0.23808808056515857, + "flos": 21185638767360.0, + "grad_norm": 1.7806564555446132, + "language_loss": 0.72341377, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.80149138, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18774414, + "step": 3960, + "time_per_iteration": 2.523761034011841 + }, + { + "auxiliary_loss_clip": 0.0652384, + "auxiliary_loss_mlp": 0.0127522, + "balance_loss_clip": 0.06296217, + "balance_loss_mlp": 0.01255169, + "epoch": 0.23814820381782653, + "flos": 22534850810880.0, + "grad_norm": 1.610971251516654, + "language_loss": 0.7476449, + "learning_rate": 3.562465462704307e-06, + "loss": 0.82563543, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20031738, + "step": 3961, + "time_per_iteration": 2.5350120067596436 + }, + { + "auxiliary_loss_clip": 0.06528293, + "auxiliary_loss_mlp": 0.01283237, + "balance_loss_clip": 0.06297825, + "balance_loss_mlp": 0.01261505, + "epoch": 0.23820832707049452, + "flos": 22309991579520.0, + "grad_norm": 2.008938617955162, + "language_loss": 0.66267157, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.74078679, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.21728516, + "step": 3962, + "time_per_iteration": 2.554936170578003 + }, + { + "auxiliary_loss_clip": 0.06522447, + "auxiliary_loss_mlp": 0.01279056, + "balance_loss_clip": 0.0629696, + "balance_loss_mlp": 0.0126009, + "epoch": 0.2382684503231625, + "flos": 24871297259520.0, + "grad_norm": 1.868964177707197, + "language_loss": 0.75134146, + "learning_rate": 3.561979109197483e-06, + "loss": 0.82935649, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18969727, + "step": 3963, + "time_per_iteration": 3.9841935634613037 + }, + { + "auxiliary_loss_clip": 0.0652955, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.01257428, + "epoch": 0.23832857357583045, + "flos": 21878050181760.0, + "grad_norm": 2.083636930734351, + "language_loss": 0.77508426, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.85316432, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.21032715, + "step": 3964, + "time_per_iteration": 2.546093463897705 + }, + { + "auxiliary_loss_clip": 0.06513681, + "auxiliary_loss_mlp": 0.01275741, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01256275, + "epoch": 0.23838869682849842, + "flos": 21294441694080.0, + "grad_norm": 2.0070777911568207, + "language_loss": 0.72507781, + "learning_rate": 3.561492518769045e-06, + "loss": 0.80297208, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3965, + "time_per_iteration": 2.605717182159424 + }, + { + "auxiliary_loss_clip": 0.06518564, + "auxiliary_loss_mlp": 0.012776, + "balance_loss_clip": 0.06293208, + "balance_loss_mlp": 0.01258181, + "epoch": 0.23844882008116638, + "flos": 16186211308800.0, + "grad_norm": 2.069567415104782, + "language_loss": 0.79030257, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8682642, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.19396973, + "step": 3966, + "time_per_iteration": 3.980722427368164 + }, + { + "auxiliary_loss_clip": 0.06517511, + "auxiliary_loss_mlp": 0.01283232, + "balance_loss_clip": 0.06290257, + "balance_loss_mlp": 0.01264647, + "epoch": 0.23850894333383435, + "flos": 21076165008000.0, + "grad_norm": 3.0015774693629433, + "language_loss": 0.69417417, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77218163, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.18579102, + "step": 3967, + "time_per_iteration": 2.542595386505127 + }, + { + "auxiliary_loss_clip": 0.06523537, + "auxiliary_loss_mlp": 0.01278611, + "balance_loss_clip": 0.0629587, + "balance_loss_mlp": 0.01257821, + "epoch": 0.23856906658650234, + "flos": 17207295563520.0, + "grad_norm": 1.9959497275253817, + "language_loss": 0.68410718, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.76212859, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.20800781, + "step": 3968, + "time_per_iteration": 2.5275728702545166 + }, + { + "auxiliary_loss_clip": 0.06526159, + "auxiliary_loss_mlp": 0.01279655, + "balance_loss_clip": 0.0629804, + "balance_loss_mlp": 0.01261392, + "epoch": 0.2386291898391703, + "flos": 29501451774720.0, + "grad_norm": 2.0078802263631994, + "language_loss": 0.77147222, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.84953034, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.18261719, + "step": 3969, + "time_per_iteration": 4.006864547729492 + }, + { + "auxiliary_loss_clip": 0.06514208, + "auxiliary_loss_mlp": 0.01292793, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01274602, + "epoch": 0.23868931309183827, + "flos": 21148854024960.0, + "grad_norm": 1.9717404660495825, + "language_loss": 0.76892555, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.84699559, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.18188477, + "step": 3970, + "time_per_iteration": 2.558915615081787 + }, + { + "auxiliary_loss_clip": 0.06523073, + "auxiliary_loss_mlp": 0.0128602, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01265969, + "epoch": 0.23874943634450624, + "flos": 25665342076800.0, + "grad_norm": 2.212795121423013, + "language_loss": 0.85452002, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.93261099, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20043945, + "step": 3971, + "time_per_iteration": 2.5621652603149414 + }, + { + "auxiliary_loss_clip": 0.06391954, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06279661, + "balance_loss_mlp": 0.01251122, + "epoch": 0.2388095595971742, + "flos": 59006871889920.0, + "grad_norm": 0.7183517633018239, + "language_loss": 0.62744105, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.70391893, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04696655, + "step": 3972, + "time_per_iteration": 4.643376350402832 + }, + { + "auxiliary_loss_clip": 0.06515118, + "auxiliary_loss_mlp": 0.01277926, + "balance_loss_clip": 0.06290536, + "balance_loss_mlp": 0.01258399, + "epoch": 0.23886968284984217, + "flos": 16805975633280.0, + "grad_norm": 3.0192177240020976, + "language_loss": 0.81866533, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.89659578, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19543457, + "step": 3973, + "time_per_iteration": 2.5597283840179443 + }, + { + "auxiliary_loss_clip": 0.06517763, + "auxiliary_loss_mlp": 0.01283675, + "balance_loss_clip": 0.06291795, + "balance_loss_mlp": 0.01265162, + "epoch": 0.23892980610251013, + "flos": 22389221214720.0, + "grad_norm": 1.829209898292947, + "language_loss": 0.79696077, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8749752, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.18518066, + "step": 3974, + "time_per_iteration": 2.5331227779388428 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01291591, + "balance_loss_clip": 0.06296244, + "balance_loss_mlp": 0.01272279, + "epoch": 0.23898992935517813, + "flos": 12828135553920.0, + "grad_norm": 6.773745042238101, + "language_loss": 0.85156423, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.92972875, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19311523, + "step": 3975, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.06513388, + "auxiliary_loss_mlp": 0.01278416, + "balance_loss_clip": 0.06290747, + "balance_loss_mlp": 0.01260117, + "epoch": 0.2390500526078461, + "flos": 22352142983040.0, + "grad_norm": 3.375355565005516, + "language_loss": 0.84191501, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.91983294, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1829834, + "step": 3976, + "time_per_iteration": 2.5339527130126953 + }, + { + "auxiliary_loss_clip": 0.06511909, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06290296, + "balance_loss_mlp": 0.01264111, + "epoch": 0.23911017586051406, + "flos": 22641263896320.0, + "grad_norm": 3.0704844059493497, + "language_loss": 0.74960983, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.82755029, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18029785, + "step": 3977, + "time_per_iteration": 2.5528597831726074 + }, + { + "auxiliary_loss_clip": 0.06524444, + "auxiliary_loss_mlp": 0.01281803, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01261478, + "epoch": 0.23917029911318202, + "flos": 23658993987840.0, + "grad_norm": 3.246082679368102, + "language_loss": 0.7235828, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80164528, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.203125, + "step": 3978, + "time_per_iteration": 2.548459768295288 + }, + { + "auxiliary_loss_clip": 0.06536747, + "auxiliary_loss_mlp": 0.01279264, + "balance_loss_clip": 0.06306014, + "balance_loss_mlp": 0.0125994, + "epoch": 0.23923042236585, + "flos": 22790163801600.0, + "grad_norm": 2.3394422136849875, + "language_loss": 0.79264927, + "learning_rate": 3.558079758168997e-06, + "loss": 0.87080932, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.1932373, + "step": 3979, + "time_per_iteration": 2.5696120262145996 + }, + { + "auxiliary_loss_clip": 0.06521225, + "auxiliary_loss_mlp": 0.01282521, + "balance_loss_clip": 0.06295727, + "balance_loss_mlp": 0.01263185, + "epoch": 0.23929054561851795, + "flos": 28155300405120.0, + "grad_norm": 1.7900268576070866, + "language_loss": 0.81971824, + "learning_rate": 3.557835546134977e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.1932373, + "step": 3980, + "time_per_iteration": 2.587286949157715 + }, + { + "auxiliary_loss_clip": 0.06519361, + "auxiliary_loss_mlp": 0.01281001, + "balance_loss_clip": 0.06296664, + "balance_loss_mlp": 0.01261891, + "epoch": 0.23935066887118592, + "flos": 21692491315200.0, + "grad_norm": 1.7930077111492302, + "language_loss": 0.84270984, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92071348, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19091797, + "step": 3981, + "time_per_iteration": 2.550725221633911 + }, + { + "auxiliary_loss_clip": 0.06535558, + "auxiliary_loss_mlp": 0.01280601, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01260669, + "epoch": 0.2394107921238539, + "flos": 32130121737600.0, + "grad_norm": 2.0248039039910393, + "language_loss": 0.77712274, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.85528433, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.19934082, + "step": 3982, + "time_per_iteration": 2.594698667526245 + }, + { + "auxiliary_loss_clip": 0.06530322, + "auxiliary_loss_mlp": 0.01280321, + "balance_loss_clip": 0.06304529, + "balance_loss_mlp": 0.01261307, + "epoch": 0.23947091537652188, + "flos": 17024839297920.0, + "grad_norm": 1.9623565914246572, + "language_loss": 0.7809152, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.85902166, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19006348, + "step": 3983, + "time_per_iteration": 2.537132740020752 + }, + { + "auxiliary_loss_clip": 0.06527262, + "auxiliary_loss_mlp": 0.01280803, + "balance_loss_clip": 0.0630171, + "balance_loss_mlp": 0.01261956, + "epoch": 0.23953103862918984, + "flos": 20599640438400.0, + "grad_norm": 2.137172968887566, + "language_loss": 0.73945713, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81753772, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18835449, + "step": 3984, + "time_per_iteration": 2.538221836090088 + }, + { + "auxiliary_loss_clip": 0.06531888, + "auxiliary_loss_mlp": 0.01281613, + "balance_loss_clip": 0.06302323, + "balance_loss_mlp": 0.01262587, + "epoch": 0.2395911618818578, + "flos": 20710707425280.0, + "grad_norm": 1.9765684717262704, + "language_loss": 0.7965889, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.87472391, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19030762, + "step": 3985, + "time_per_iteration": 2.551649570465088 + }, + { + "auxiliary_loss_clip": 0.06532246, + "auxiliary_loss_mlp": 0.0127953, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01259265, + "epoch": 0.23965128513452577, + "flos": 27060982081920.0, + "grad_norm": 1.916737509209056, + "language_loss": 0.73610401, + "learning_rate": 3.556369033716254e-06, + "loss": 0.8142218, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20263672, + "step": 3986, + "time_per_iteration": 2.710397481918335 + }, + { + "auxiliary_loss_clip": 0.06540911, + "auxiliary_loss_mlp": 0.01281338, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.01261, + "epoch": 0.23971140838719374, + "flos": 23150254723200.0, + "grad_norm": 1.785192597796332, + "language_loss": 0.88325328, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96147585, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20336914, + "step": 3987, + "time_per_iteration": 2.6331911087036133 + }, + { + "auxiliary_loss_clip": 0.06529854, + "auxiliary_loss_mlp": 0.01278502, + "balance_loss_clip": 0.06312454, + "balance_loss_mlp": 0.0126043, + "epoch": 0.23977153163986173, + "flos": 18039341007360.0, + "grad_norm": 2.2552133940915224, + "language_loss": 0.84056735, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.91865093, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18078613, + "step": 3988, + "time_per_iteration": 2.5413994789123535 + }, + { + "auxiliary_loss_clip": 0.06533512, + "auxiliary_loss_mlp": 0.01288032, + "balance_loss_clip": 0.06306052, + "balance_loss_mlp": 0.01267052, + "epoch": 0.2398316548925297, + "flos": 18119157621120.0, + "grad_norm": 1.6232739060807335, + "language_loss": 0.85473406, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.93294942, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2097168, + "step": 3989, + "time_per_iteration": 2.528348207473755 + }, + { + "auxiliary_loss_clip": 0.06527147, + "auxiliary_loss_mlp": 0.01286562, + "balance_loss_clip": 0.06305796, + "balance_loss_mlp": 0.01266642, + "epoch": 0.23989177814519766, + "flos": 12572612928000.0, + "grad_norm": 2.695913709141839, + "language_loss": 0.8517406, + "learning_rate": 3.555390178293477e-06, + "loss": 0.92987764, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19909668, + "step": 3990, + "time_per_iteration": 2.52915358543396 + }, + { + "auxiliary_loss_clip": 0.06527729, + "auxiliary_loss_mlp": 0.01283435, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.01264064, + "epoch": 0.23995190139786562, + "flos": 25271569013760.0, + "grad_norm": 1.4267230320219149, + "language_loss": 0.76345301, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.84156466, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.19372559, + "step": 3991, + "time_per_iteration": 2.556820869445801 + }, + { + "auxiliary_loss_clip": 0.06413993, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06298733, + "balance_loss_mlp": 0.01262789, + "epoch": 0.2400120246505336, + "flos": 61978107271680.0, + "grad_norm": 0.8724678757997124, + "language_loss": 0.6358996, + "learning_rate": 3.554900396661656e-06, + "loss": 0.71272099, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.05368042, + "step": 3992, + "time_per_iteration": 3.0817418098449707 + }, + { + "auxiliary_loss_clip": 0.06411353, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 0.06297012, + "balance_loss_mlp": 0.01259121, + "epoch": 0.24007214790320155, + "flos": 66727923816960.0, + "grad_norm": 0.7394753945990321, + "language_loss": 0.62864375, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.70539963, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.05117798, + "step": 3993, + "time_per_iteration": 3.2552971839904785 + }, + { + "auxiliary_loss_clip": 0.0652933, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 0.062997, + "balance_loss_mlp": 0.0125886, + "epoch": 0.24013227115586952, + "flos": 25815667501440.0, + "grad_norm": 1.8775036450716396, + "language_loss": 0.77610862, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.85420227, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.21154785, + "step": 3994, + "time_per_iteration": 2.6225738525390625 + }, + { + "auxiliary_loss_clip": 0.06526788, + "auxiliary_loss_mlp": 0.01288387, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.01266822, + "epoch": 0.2401923944085375, + "flos": 25564672995840.0, + "grad_norm": 1.626402048760673, + "language_loss": 0.78733414, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.86548591, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21557617, + "step": 3995, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.06395802, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01260685, + "epoch": 0.24025251766120548, + "flos": 54961457892480.0, + "grad_norm": 0.8928130340410044, + "language_loss": 0.63566971, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.71228325, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.04858398, + "step": 3996, + "time_per_iteration": 3.232227087020874 + }, + { + "auxiliary_loss_clip": 0.06522241, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06290409, + "balance_loss_mlp": 0.0126328, + "epoch": 0.24031264091387344, + "flos": 20637305648640.0, + "grad_norm": 2.8724335092069864, + "language_loss": 0.71121502, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.78926873, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19848633, + "step": 3997, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.06510898, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 0.06285729, + "balance_loss_mlp": 0.01265473, + "epoch": 0.2403727641665414, + "flos": 20892492858240.0, + "grad_norm": 1.7909711234465908, + "language_loss": 0.87516266, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.9531287, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20227051, + "step": 3998, + "time_per_iteration": 2.563215970993042 + }, + { + "auxiliary_loss_clip": 0.06526193, + "auxiliary_loss_mlp": 0.01279159, + "balance_loss_clip": 0.06292593, + "balance_loss_mlp": 0.01258762, + "epoch": 0.24043288741920937, + "flos": 22826613127680.0, + "grad_norm": 1.593528116777893, + "language_loss": 0.76414531, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.84219879, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.20410156, + "step": 3999, + "time_per_iteration": 2.5577592849731445 + }, + { + "auxiliary_loss_clip": 0.06511137, + "auxiliary_loss_mlp": 0.01275527, + "balance_loss_clip": 0.0628795, + "balance_loss_mlp": 0.01256716, + "epoch": 0.24049301067187734, + "flos": 27966261594240.0, + "grad_norm": 2.3407253335254086, + "language_loss": 0.73292184, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81078851, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.18823242, + "step": 4000, + "time_per_iteration": 2.583524703979492 + }, + { + "auxiliary_loss_clip": 0.06528921, + "auxiliary_loss_mlp": 0.01283655, + "balance_loss_clip": 0.06293923, + "balance_loss_mlp": 0.01261935, + "epoch": 0.24055313392454533, + "flos": 27458360870400.0, + "grad_norm": 2.671051655318694, + "language_loss": 0.67159665, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74972242, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21728516, + "step": 4001, + "time_per_iteration": 2.6188552379608154 + }, + { + "auxiliary_loss_clip": 0.06522354, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.06293849, + "balance_loss_mlp": 0.01257703, + "epoch": 0.2406132571772133, + "flos": 25563666746880.0, + "grad_norm": 5.034242823707272, + "language_loss": 0.83152658, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.90954471, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21740723, + "step": 4002, + "time_per_iteration": 3.9769785404205322 + }, + { + "auxiliary_loss_clip": 0.06519094, + "auxiliary_loss_mlp": 0.01282536, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01262032, + "epoch": 0.24067338042988126, + "flos": 24798482461440.0, + "grad_norm": 2.0463487498067323, + "language_loss": 0.83599687, + "learning_rate": 3.552202383898897e-06, + "loss": 0.91401321, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20483398, + "step": 4003, + "time_per_iteration": 2.581669569015503 + }, + { + "auxiliary_loss_clip": 0.06526292, + "auxiliary_loss_mlp": 0.01281725, + "balance_loss_clip": 0.06295015, + "balance_loss_mlp": 0.01261412, + "epoch": 0.24073350368254923, + "flos": 21184171320960.0, + "grad_norm": 2.0670244348036646, + "language_loss": 0.87907362, + "learning_rate": 3.551956756667215e-06, + "loss": 0.9571538, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20300293, + "step": 4004, + "time_per_iteration": 2.514268636703491 + }, + { + "auxiliary_loss_clip": 0.06526911, + "auxiliary_loss_mlp": 0.01282868, + "balance_loss_clip": 0.06294513, + "balance_loss_mlp": 0.01261815, + "epoch": 0.2407936269352172, + "flos": 22501252523520.0, + "grad_norm": 3.538522770409821, + "language_loss": 0.78168321, + "learning_rate": 3.551711070585177e-06, + "loss": 0.85978097, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21057129, + "step": 4005, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.0651572, + "auxiliary_loss_mlp": 0.01283457, + "balance_loss_clip": 0.06293365, + "balance_loss_mlp": 0.01263084, + "epoch": 0.24085375018788516, + "flos": 18556968804480.0, + "grad_norm": 2.371719422478697, + "language_loss": 0.79360878, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.87160051, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.20373535, + "step": 4006, + "time_per_iteration": 4.034858465194702 + }, + { + "auxiliary_loss_clip": 0.0653493, + "auxiliary_loss_mlp": 0.01283621, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01260709, + "epoch": 0.24091387344055312, + "flos": 24177418398720.0, + "grad_norm": 1.8737477168573817, + "language_loss": 0.71813238, + "learning_rate": 3.551219521907302e-06, + "loss": 0.79631788, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22912598, + "step": 4007, + "time_per_iteration": 2.5730202198028564 + }, + { + "auxiliary_loss_clip": 0.06518448, + "auxiliary_loss_mlp": 0.01300708, + "balance_loss_clip": 0.06295364, + "balance_loss_mlp": 0.01278773, + "epoch": 0.24097399669322112, + "flos": 11041112327040.0, + "grad_norm": 6.473369852788927, + "language_loss": 0.76978099, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84797251, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21936035, + "step": 4008, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.06518552, + "auxiliary_loss_mlp": 0.01286303, + "balance_loss_clip": 0.062894, + "balance_loss_mlp": 0.01264928, + "epoch": 0.24103411994588908, + "flos": 17170762383360.0, + "grad_norm": 2.1979472110907556, + "language_loss": 0.75080305, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.82885164, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21398926, + "step": 4009, + "time_per_iteration": 3.957920551300049 + }, + { + "auxiliary_loss_clip": 0.06521554, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 0.06293823, + "balance_loss_mlp": 0.01279869, + "epoch": 0.24109424319855705, + "flos": 20674258099200.0, + "grad_norm": 1.5898496231384156, + "language_loss": 0.80111217, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8793391, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21264648, + "step": 4010, + "time_per_iteration": 2.5475916862487793 + }, + { + "auxiliary_loss_clip": 0.06527252, + "auxiliary_loss_mlp": 0.01291864, + "balance_loss_clip": 0.06297424, + "balance_loss_mlp": 0.01268964, + "epoch": 0.241154366451225, + "flos": 28188982546560.0, + "grad_norm": 2.0856120841249366, + "language_loss": 0.70933908, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78753024, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.22912598, + "step": 4011, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.06528456, + "auxiliary_loss_mlp": 0.0128714, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01265766, + "epoch": 0.24121448970389298, + "flos": 21696222821760.0, + "grad_norm": 1.7418824634594252, + "language_loss": 0.694484, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.21362305, + "step": 4012, + "time_per_iteration": 3.988281726837158 + }, + { + "auxiliary_loss_clip": 0.06528036, + "auxiliary_loss_mlp": 0.01287792, + "balance_loss_clip": 0.06296879, + "balance_loss_mlp": 0.01264391, + "epoch": 0.24127461295656094, + "flos": 39685530142080.0, + "grad_norm": 1.5971840931497265, + "language_loss": 0.74512959, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23388672, + "step": 4013, + "time_per_iteration": 2.7159719467163086 + }, + { + "auxiliary_loss_clip": 0.06531674, + "auxiliary_loss_mlp": 0.01283711, + "balance_loss_clip": 0.0630402, + "balance_loss_mlp": 0.01263231, + "epoch": 0.2413347362092289, + "flos": 19141960884480.0, + "grad_norm": 1.667652232266074, + "language_loss": 0.89031768, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.96847153, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20483398, + "step": 4014, + "time_per_iteration": 2.5638303756713867 + }, + { + "auxiliary_loss_clip": 0.06538786, + "auxiliary_loss_mlp": 0.01289681, + "balance_loss_clip": 0.06304225, + "balance_loss_mlp": 0.01268831, + "epoch": 0.2413948594618969, + "flos": 26946099734400.0, + "grad_norm": 1.9521080560444544, + "language_loss": 0.95043075, + "learning_rate": 3.549250975045952e-06, + "loss": 1.02871537, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20849609, + "step": 4015, + "time_per_iteration": 2.5697052478790283 + }, + { + "auxiliary_loss_clip": 0.0653477, + "auxiliary_loss_mlp": 0.01278309, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01257781, + "epoch": 0.24145498271456486, + "flos": 25235077760640.0, + "grad_norm": 1.8045004389175856, + "language_loss": 0.83243644, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.91056728, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2052002, + "step": 4016, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.06532364, + "auxiliary_loss_mlp": 0.01285254, + "balance_loss_clip": 0.06311545, + "balance_loss_mlp": 0.0126463, + "epoch": 0.24151510596723283, + "flos": 40671339027840.0, + "grad_norm": 2.079467312298135, + "language_loss": 0.69439638, + "learning_rate": 3.54875825066639e-06, + "loss": 0.77257252, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20617676, + "step": 4017, + "time_per_iteration": 2.6893186569213867 + }, + { + "auxiliary_loss_clip": 0.06536807, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06305309, + "balance_loss_mlp": 0.01266286, + "epoch": 0.2415752292199008, + "flos": 18151917367680.0, + "grad_norm": 1.6840714927615923, + "language_loss": 0.84970623, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.92796361, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2265625, + "step": 4018, + "time_per_iteration": 2.521129608154297 + }, + { + "auxiliary_loss_clip": 0.06448493, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 0.06334345, + "balance_loss_mlp": 0.01253335, + "epoch": 0.24163535247256876, + "flos": 67307213819520.0, + "grad_norm": 1.2396896293086193, + "language_loss": 0.6054306, + "learning_rate": 3.548265291370558e-06, + "loss": 0.68249303, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04425049, + "step": 4019, + "time_per_iteration": 3.2191333770751953 + }, + { + "auxiliary_loss_clip": 0.06539527, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06310145, + "balance_loss_mlp": 0.01253983, + "epoch": 0.24169547572523672, + "flos": 24935810503680.0, + "grad_norm": 1.839335570686334, + "language_loss": 0.73635018, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81447685, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19140625, + "step": 4020, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.06547633, + "auxiliary_loss_mlp": 0.01279706, + "balance_loss_clip": 0.06321433, + "balance_loss_mlp": 0.01259094, + "epoch": 0.24175559897790472, + "flos": 18733303722240.0, + "grad_norm": 1.757855043925666, + "language_loss": 0.81927264, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.89754599, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.20617676, + "step": 4021, + "time_per_iteration": 2.516295909881592 + }, + { + "auxiliary_loss_clip": 0.06542306, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 0.06314138, + "balance_loss_mlp": 0.01255201, + "epoch": 0.24181572223057268, + "flos": 23045937989760.0, + "grad_norm": 1.9677245364232816, + "language_loss": 0.76831293, + "learning_rate": 3.547525412122378e-06, + "loss": 0.84652191, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23388672, + "step": 4022, + "time_per_iteration": 2.560833692550659 + }, + { + "auxiliary_loss_clip": 0.0655847, + "auxiliary_loss_mlp": 0.01279281, + "balance_loss_clip": 0.06321847, + "balance_loss_mlp": 0.01257477, + "epoch": 0.24187584548324065, + "flos": 20382411928320.0, + "grad_norm": 1.7589452517035808, + "language_loss": 0.75334597, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.83172357, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21789551, + "step": 4023, + "time_per_iteration": 2.5414137840270996 + }, + { + "auxiliary_loss_clip": 0.06554291, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06325305, + "balance_loss_mlp": 0.01258466, + "epoch": 0.2419359687359086, + "flos": 21403915453440.0, + "grad_norm": 1.837159559636974, + "language_loss": 0.82581335, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.90414816, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20751953, + "step": 4024, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.06544912, + "auxiliary_loss_mlp": 0.01281053, + "balance_loss_clip": 0.06319256, + "balance_loss_mlp": 0.01260394, + "epoch": 0.24199609198857658, + "flos": 18375309152640.0, + "grad_norm": 1.8763334718563411, + "language_loss": 0.86724782, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.94550753, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20654297, + "step": 4025, + "time_per_iteration": 2.507725715637207 + }, + { + "auxiliary_loss_clip": 0.0654591, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06312732, + "balance_loss_mlp": 0.01261905, + "epoch": 0.24205621524124454, + "flos": 19469962892160.0, + "grad_norm": 2.105058685916829, + "language_loss": 0.72386706, + "learning_rate": 3.546538084949365e-06, + "loss": 0.80214572, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.20068359, + "step": 4026, + "time_per_iteration": 2.573822498321533 + }, + { + "auxiliary_loss_clip": 0.06536272, + "auxiliary_loss_mlp": 0.01278576, + "balance_loss_clip": 0.06314979, + "balance_loss_mlp": 0.01258191, + "epoch": 0.2421163384939125, + "flos": 14981706466560.0, + "grad_norm": 5.331027510747572, + "language_loss": 0.64474452, + "learning_rate": 3.546291106520509e-06, + "loss": 0.722893, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20397949, + "step": 4027, + "time_per_iteration": 2.5038652420043945 + }, + { + "auxiliary_loss_clip": 0.06553975, + "auxiliary_loss_mlp": 0.01291382, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01271069, + "epoch": 0.2421764617465805, + "flos": 18668161572480.0, + "grad_norm": 2.149571528027882, + "language_loss": 0.70816404, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.78661758, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.203125, + "step": 4028, + "time_per_iteration": 2.5707366466522217 + }, + { + "auxiliary_loss_clip": 0.06448589, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 0.06335288, + "balance_loss_mlp": 0.01254865, + "epoch": 0.24223658499924847, + "flos": 64368025424640.0, + "grad_norm": 0.8397041896242922, + "language_loss": 0.55315495, + "learning_rate": 3.545796973765623e-06, + "loss": 0.63025129, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.06170654, + "step": 4029, + "time_per_iteration": 3.149601936340332 + }, + { + "auxiliary_loss_clip": 0.06557409, + "auxiliary_loss_mlp": 0.01307587, + "balance_loss_clip": 0.06331506, + "balance_loss_mlp": 0.01284615, + "epoch": 0.24229670825191643, + "flos": 25782278849280.0, + "grad_norm": 2.2612571716693664, + "language_loss": 0.75111073, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82976073, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.22998047, + "step": 4030, + "time_per_iteration": 2.5939297676086426 + }, + { + "auxiliary_loss_clip": 0.0654521, + "auxiliary_loss_mlp": 0.0130894, + "balance_loss_clip": 0.06321512, + "balance_loss_mlp": 0.01287733, + "epoch": 0.2423568315045844, + "flos": 20673251850240.0, + "grad_norm": 1.8607136485921192, + "language_loss": 0.77126729, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.84980875, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2121582, + "step": 4031, + "time_per_iteration": 2.5886638164520264 + }, + { + "auxiliary_loss_clip": 0.06556953, + "auxiliary_loss_mlp": 0.01312472, + "balance_loss_clip": 0.06323709, + "balance_loss_mlp": 0.01290252, + "epoch": 0.24241695475725236, + "flos": 22422987210240.0, + "grad_norm": 1.956173023936914, + "language_loss": 0.66108859, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.22216797, + "step": 4032, + "time_per_iteration": 2.5665037631988525 + }, + { + "auxiliary_loss_clip": 0.06539695, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 0.06316876, + "balance_loss_mlp": 0.0128751, + "epoch": 0.24247707800992033, + "flos": 17134732327680.0, + "grad_norm": 3.4494454498841725, + "language_loss": 0.81464761, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.89313877, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21911621, + "step": 4033, + "time_per_iteration": 2.5237317085266113 + }, + { + "auxiliary_loss_clip": 0.06538171, + "auxiliary_loss_mlp": 0.01328283, + "balance_loss_clip": 0.06318024, + "balance_loss_mlp": 0.01305359, + "epoch": 0.2425372012625883, + "flos": 31621885597440.0, + "grad_norm": 1.909836856098088, + "language_loss": 0.69935066, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7780152, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22900391, + "step": 4034, + "time_per_iteration": 2.713991641998291 + }, + { + "auxiliary_loss_clip": 0.06546839, + "auxiliary_loss_mlp": 0.01319063, + "balance_loss_clip": 0.06324256, + "balance_loss_mlp": 0.01298273, + "epoch": 0.24259732451525629, + "flos": 16331589342720.0, + "grad_norm": 2.1729941621503532, + "language_loss": 0.96340013, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04205918, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.20776367, + "step": 4035, + "time_per_iteration": 2.532848596572876 + }, + { + "auxiliary_loss_clip": 0.06537193, + "auxiliary_loss_mlp": 0.01327475, + "balance_loss_clip": 0.06319901, + "balance_loss_mlp": 0.01307447, + "epoch": 0.24265744776792425, + "flos": 22863230161920.0, + "grad_norm": 1.6992215283488847, + "language_loss": 0.78653824, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20019531, + "step": 4036, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.06539825, + "auxiliary_loss_mlp": 0.01304693, + "balance_loss_clip": 0.06315397, + "balance_loss_mlp": 0.01282806, + "epoch": 0.24271757102059222, + "flos": 21878008254720.0, + "grad_norm": 1.624872867937933, + "language_loss": 0.74970233, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82814753, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.21887207, + "step": 4037, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06526245, + "auxiliary_loss_mlp": 0.01308805, + "balance_loss_clip": 0.06302498, + "balance_loss_mlp": 0.01287539, + "epoch": 0.24277769427326018, + "flos": 19214649901440.0, + "grad_norm": 4.15075765155633, + "language_loss": 0.76952362, + "learning_rate": 3.543570475921171e-06, + "loss": 0.84787416, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.21276855, + "step": 4038, + "time_per_iteration": 2.514899492263794 + }, + { + "auxiliary_loss_clip": 0.06539176, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 0.06314565, + "balance_loss_mlp": 0.01272992, + "epoch": 0.24283781752592815, + "flos": 19505909093760.0, + "grad_norm": 2.116114626089979, + "language_loss": 0.72802031, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.22167969, + "step": 4039, + "time_per_iteration": 2.603787422180176 + }, + { + "auxiliary_loss_clip": 0.06537706, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 0.06312682, + "balance_loss_mlp": 0.01269372, + "epoch": 0.2428979407785961, + "flos": 19908444908160.0, + "grad_norm": 1.7691638050154863, + "language_loss": 0.78818536, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86647218, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.21606445, + "step": 4040, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.06530759, + "auxiliary_loss_mlp": 0.01283615, + "balance_loss_clip": 0.06313588, + "balance_loss_mlp": 0.01265162, + "epoch": 0.2429580640312641, + "flos": 24722523135360.0, + "grad_norm": 1.6907745152184719, + "language_loss": 0.81039703, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8885408, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18444824, + "step": 4041, + "time_per_iteration": 2.5693795680999756 + }, + { + "auxiliary_loss_clip": 0.06532191, + "auxiliary_loss_mlp": 0.01286793, + "balance_loss_clip": 0.06311769, + "balance_loss_mlp": 0.01267529, + "epoch": 0.24301818728393207, + "flos": 25637529720960.0, + "grad_norm": 3.2457124561568, + "language_loss": 0.77433085, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8525207, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19262695, + "step": 4042, + "time_per_iteration": 3.9626972675323486 + }, + { + "auxiliary_loss_clip": 0.0653407, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06316316, + "balance_loss_mlp": 0.01260652, + "epoch": 0.24307831053660003, + "flos": 26148700753920.0, + "grad_norm": 1.8532279658121147, + "language_loss": 0.82188201, + "learning_rate": 3.542331483604246e-06, + "loss": 0.90002131, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19226074, + "step": 4043, + "time_per_iteration": 2.598202705383301 + }, + { + "auxiliary_loss_clip": 0.06538229, + "auxiliary_loss_mlp": 0.0127841, + "balance_loss_clip": 0.06309159, + "balance_loss_mlp": 0.01256594, + "epoch": 0.243138433789268, + "flos": 14977136419200.0, + "grad_norm": 2.775508644952731, + "language_loss": 0.73897892, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.81714529, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21801758, + "step": 4044, + "time_per_iteration": 2.483752489089966 + }, + { + "auxiliary_loss_clip": 0.06534028, + "auxiliary_loss_mlp": 0.01284645, + "balance_loss_clip": 0.0631184, + "balance_loss_mlp": 0.01263629, + "epoch": 0.24319855704193596, + "flos": 25198670361600.0, + "grad_norm": 2.3685654829247227, + "language_loss": 0.83778739, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.91597402, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.21020508, + "step": 4045, + "time_per_iteration": 2.60435152053833 + }, + { + "auxiliary_loss_clip": 0.06529962, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 0.06307946, + "balance_loss_mlp": 0.01260323, + "epoch": 0.24325868029460393, + "flos": 22133740515840.0, + "grad_norm": 1.834350653864789, + "language_loss": 0.87040859, + "learning_rate": 3.541587386314541e-06, + "loss": 0.94850671, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19519043, + "step": 4046, + "time_per_iteration": 3.990011692047119 + }, + { + "auxiliary_loss_clip": 0.0652798, + "auxiliary_loss_mlp": 0.01281438, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01260922, + "epoch": 0.2433188035472719, + "flos": 23588107833600.0, + "grad_norm": 2.274532821816236, + "language_loss": 0.72945291, + "learning_rate": 3.5413392369578e-06, + "loss": 0.80754709, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.20495605, + "step": 4047, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.06530058, + "auxiliary_loss_mlp": 0.01284969, + "balance_loss_clip": 0.06306041, + "balance_loss_mlp": 0.01263666, + "epoch": 0.2433789267999399, + "flos": 24469809621120.0, + "grad_norm": 3.993347012147321, + "language_loss": 0.74453223, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.21325684, + "step": 4048, + "time_per_iteration": 4.027734279632568 + }, + { + "auxiliary_loss_clip": 0.06529407, + "auxiliary_loss_mlp": 0.01275879, + "balance_loss_clip": 0.06309648, + "balance_loss_mlp": 0.0125671, + "epoch": 0.24343905005260785, + "flos": 16733622032640.0, + "grad_norm": 2.185429514920852, + "language_loss": 0.73832756, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.81638038, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19165039, + "step": 4049, + "time_per_iteration": 2.5527403354644775 + }, + { + "auxiliary_loss_clip": 0.06525055, + "auxiliary_loss_mlp": 0.01275563, + "balance_loss_clip": 0.06306046, + "balance_loss_mlp": 0.01256084, + "epoch": 0.24349917330527582, + "flos": 20049294821760.0, + "grad_norm": 1.6558681415401064, + "language_loss": 0.74824917, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82625538, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19482422, + "step": 4050, + "time_per_iteration": 2.517671585083008 + }, + { + "auxiliary_loss_clip": 0.06520879, + "auxiliary_loss_mlp": 0.0127856, + "balance_loss_clip": 0.06303313, + "balance_loss_mlp": 0.01258187, + "epoch": 0.24355929655794378, + "flos": 17426285009280.0, + "grad_norm": 2.447710360159803, + "language_loss": 0.75780261, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.83579695, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20361328, + "step": 4051, + "time_per_iteration": 3.961841583251953 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01277824, + "balance_loss_clip": 0.06310124, + "balance_loss_mlp": 0.01257343, + "epoch": 0.24361941981061175, + "flos": 25417995223680.0, + "grad_norm": 2.289221862828171, + "language_loss": 0.71344352, + "learning_rate": 3.540097613646296e-06, + "loss": 0.79154545, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20471191, + "step": 4052, + "time_per_iteration": 2.5851869583129883 + }, + { + "auxiliary_loss_clip": 0.06524909, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.06306259, + "balance_loss_mlp": 0.01258583, + "epoch": 0.2436795430632797, + "flos": 22827493595520.0, + "grad_norm": 1.7731467261886882, + "language_loss": 0.82073057, + "learning_rate": 3.539849113744351e-06, + "loss": 0.89876068, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.6217734813690186 + }, + { + "auxiliary_loss_clip": 0.06533736, + "auxiliary_loss_mlp": 0.01278722, + "balance_loss_clip": 0.06309207, + "balance_loss_mlp": 0.01260126, + "epoch": 0.2437396663159477, + "flos": 15163030702080.0, + "grad_norm": 1.5690390746940162, + "language_loss": 0.78588867, + "learning_rate": 3.539600555451172e-06, + "loss": 0.86401325, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.18615723, + "step": 4054, + "time_per_iteration": 2.513720750808716 + }, + { + "auxiliary_loss_clip": 0.06529565, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06307493, + "balance_loss_mlp": 0.01263111, + "epoch": 0.24379978956861567, + "flos": 22097710460160.0, + "grad_norm": 1.7039269278884617, + "language_loss": 0.84417951, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92229491, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1887207, + "step": 4055, + "time_per_iteration": 2.557584524154663 + }, + { + "auxiliary_loss_clip": 0.06542832, + "auxiliary_loss_mlp": 0.01280691, + "balance_loss_clip": 0.06312343, + "balance_loss_mlp": 0.01259508, + "epoch": 0.24385991282128364, + "flos": 31475878657920.0, + "grad_norm": 2.786051029634521, + "language_loss": 0.56684959, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.6450848, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21179199, + "step": 4056, + "time_per_iteration": 2.6548893451690674 + }, + { + "auxiliary_loss_clip": 0.06533613, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 0.06307291, + "balance_loss_mlp": 0.01262321, + "epoch": 0.2439200360739516, + "flos": 23845055978880.0, + "grad_norm": 2.215401064957846, + "language_loss": 0.80586845, + "learning_rate": 3.538854530318506e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.21520996, + "step": 4057, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.06533922, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 0.06311886, + "balance_loss_mlp": 0.01261009, + "epoch": 0.24398015932661957, + "flos": 19175684952960.0, + "grad_norm": 1.7331406857586058, + "language_loss": 0.79934907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87748623, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18786621, + "step": 4058, + "time_per_iteration": 2.5552098751068115 + }, + { + "auxiliary_loss_clip": 0.06541391, + "auxiliary_loss_mlp": 0.01280168, + "balance_loss_clip": 0.06312001, + "balance_loss_mlp": 0.01259772, + "epoch": 0.24404028257928753, + "flos": 25269095318400.0, + "grad_norm": 1.7324044437804977, + "language_loss": 0.86104828, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93926388, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20410156, + "step": 4059, + "time_per_iteration": 2.575345754623413 + }, + { + "auxiliary_loss_clip": 0.06538763, + "auxiliary_loss_mlp": 0.01274337, + "balance_loss_clip": 0.06318676, + "balance_loss_mlp": 0.01255621, + "epoch": 0.2441004058319555, + "flos": 26474606409600.0, + "grad_norm": 1.5285193147278118, + "language_loss": 0.74698234, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.8251133, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18713379, + "step": 4060, + "time_per_iteration": 2.6277999877929688 + }, + { + "auxiliary_loss_clip": 0.06560756, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 0.06327853, + "balance_loss_mlp": 0.01259469, + "epoch": 0.2441605290846235, + "flos": 26767752318720.0, + "grad_norm": 1.6858410849727605, + "language_loss": 0.73894358, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.81735957, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.21374512, + "step": 4061, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.06538899, + "auxiliary_loss_mlp": 0.01273593, + "balance_loss_clip": 0.0631846, + "balance_loss_mlp": 0.01254103, + "epoch": 0.24422065233729146, + "flos": 21112236990720.0, + "grad_norm": 1.7809128746808311, + "language_loss": 0.76782405, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84594905, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19494629, + "step": 4062, + "time_per_iteration": 2.5655109882354736 + }, + { + "auxiliary_loss_clip": 0.06538436, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 0.06319936, + "balance_loss_mlp": 0.01258019, + "epoch": 0.24428077558995942, + "flos": 25269891932160.0, + "grad_norm": 1.624722619478305, + "language_loss": 0.84975201, + "learning_rate": 3.537360904763011e-06, + "loss": 0.92791933, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.20263672, + "step": 4063, + "time_per_iteration": 2.569420576095581 + }, + { + "auxiliary_loss_clip": 0.06559969, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06327148, + "balance_loss_mlp": 0.01254459, + "epoch": 0.24434089884262739, + "flos": 20491508344320.0, + "grad_norm": 2.099790248638241, + "language_loss": 0.68837494, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.76673138, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2121582, + "step": 4064, + "time_per_iteration": 2.560065984725952 + }, + { + "auxiliary_loss_clip": 0.06547809, + "auxiliary_loss_mlp": 0.01276127, + "balance_loss_clip": 0.06317605, + "balance_loss_mlp": 0.01255349, + "epoch": 0.24440102209529535, + "flos": 23628456374400.0, + "grad_norm": 1.7607893449036869, + "language_loss": 0.70700729, + "learning_rate": 3.536862563102088e-06, + "loss": 0.78524667, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20788574, + "step": 4065, + "time_per_iteration": 2.5619614124298096 + }, + { + "auxiliary_loss_clip": 0.06554856, + "auxiliary_loss_mlp": 0.0127847, + "balance_loss_clip": 0.06322616, + "balance_loss_mlp": 0.01256726, + "epoch": 0.24446114534796332, + "flos": 20560382000640.0, + "grad_norm": 2.0639555504298372, + "language_loss": 0.84639663, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.92472994, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21765137, + "step": 4066, + "time_per_iteration": 2.5640382766723633 + }, + { + "auxiliary_loss_clip": 0.0647334, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06356817, + "balance_loss_mlp": 0.01260456, + "epoch": 0.24452126860063128, + "flos": 60406719327360.0, + "grad_norm": 0.7224646734980834, + "language_loss": 0.52123713, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.59863508, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.05990601, + "step": 4067, + "time_per_iteration": 3.067857503890991 + }, + { + "auxiliary_loss_clip": 0.06549152, + "auxiliary_loss_mlp": 0.01275932, + "balance_loss_clip": 0.063198, + "balance_loss_mlp": 0.01255106, + "epoch": 0.24458139185329927, + "flos": 15126958719360.0, + "grad_norm": 4.582785635832698, + "language_loss": 0.72625411, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.80450499, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20825195, + "step": 4068, + "time_per_iteration": 2.5490705966949463 + }, + { + "auxiliary_loss_clip": 0.06542531, + "auxiliary_loss_mlp": 0.0127677, + "balance_loss_clip": 0.06318012, + "balance_loss_mlp": 0.01256111, + "epoch": 0.24464151510596724, + "flos": 28005771594240.0, + "grad_norm": 1.4744908303961997, + "language_loss": 0.7839663, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86215931, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.20654297, + "step": 4069, + "time_per_iteration": 2.6064302921295166 + }, + { + "auxiliary_loss_clip": 0.06535528, + "auxiliary_loss_mlp": 0.01277448, + "balance_loss_clip": 0.06312935, + "balance_loss_mlp": 0.01257493, + "epoch": 0.2447016383586352, + "flos": 19799138856960.0, + "grad_norm": 1.9167348410225946, + "language_loss": 0.80741036, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88554007, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19946289, + "step": 4070, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.06543916, + "auxiliary_loss_mlp": 0.0127809, + "balance_loss_clip": 0.06317008, + "balance_loss_mlp": 0.01258825, + "epoch": 0.24476176161130317, + "flos": 26074460436480.0, + "grad_norm": 1.476613235331205, + "language_loss": 0.8444066, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.92262667, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19262695, + "step": 4071, + "time_per_iteration": 2.6165285110473633 + }, + { + "auxiliary_loss_clip": 0.06545337, + "auxiliary_loss_mlp": 0.01275719, + "balance_loss_clip": 0.06310376, + "balance_loss_mlp": 0.01254679, + "epoch": 0.24482188486397113, + "flos": 18849527735040.0, + "grad_norm": 2.1913275656577857, + "language_loss": 0.8027429, + "learning_rate": 3.535116532028798e-06, + "loss": 0.88095343, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21032715, + "step": 4072, + "time_per_iteration": 2.580077648162842 + }, + { + "auxiliary_loss_clip": 0.06531823, + "auxiliary_loss_mlp": 0.01275557, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01257031, + "epoch": 0.2448820081166391, + "flos": 21258202003200.0, + "grad_norm": 1.4781582217057618, + "language_loss": 0.7076053, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18505859, + "step": 4073, + "time_per_iteration": 2.5430707931518555 + }, + { + "auxiliary_loss_clip": 0.06525481, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 0.06303517, + "balance_loss_mlp": 0.01260921, + "epoch": 0.2449421313693071, + "flos": 23957254995840.0, + "grad_norm": 2.412576467354098, + "language_loss": 0.67577648, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.75382745, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.18676758, + "step": 4074, + "time_per_iteration": 2.5616037845611572 + }, + { + "auxiliary_loss_clip": 0.06435025, + "auxiliary_loss_mlp": 0.01257107, + "balance_loss_clip": 0.06320108, + "balance_loss_mlp": 0.01251907, + "epoch": 0.24500225462197506, + "flos": 60705902730240.0, + "grad_norm": 0.8764237694402175, + "language_loss": 0.68656927, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.76349056, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.05203247, + "step": 4075, + "time_per_iteration": 3.2623581886291504 + }, + { + "auxiliary_loss_clip": 0.06527948, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06305515, + "balance_loss_mlp": 0.01257414, + "epoch": 0.24506237787464302, + "flos": 26291018113920.0, + "grad_norm": 2.301278269127432, + "language_loss": 0.79781568, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.87586164, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19226074, + "step": 4076, + "time_per_iteration": 2.6342012882232666 + }, + { + "auxiliary_loss_clip": 0.06535772, + "auxiliary_loss_mlp": 0.01280909, + "balance_loss_clip": 0.06304428, + "balance_loss_mlp": 0.01258462, + "epoch": 0.245122501127311, + "flos": 20557530961920.0, + "grad_norm": 1.9232761502629154, + "language_loss": 0.82461953, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90278631, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 4077, + "time_per_iteration": 2.5863101482391357 + }, + { + "auxiliary_loss_clip": 0.06532669, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06305817, + "balance_loss_mlp": 0.01257774, + "epoch": 0.24518262437997895, + "flos": 29140312677120.0, + "grad_norm": 2.8377644839815357, + "language_loss": 0.63268852, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71080685, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21398926, + "step": 4078, + "time_per_iteration": 2.6045711040496826 + }, + { + "auxiliary_loss_clip": 0.06522519, + "auxiliary_loss_mlp": 0.01277179, + "balance_loss_clip": 0.06301752, + "balance_loss_mlp": 0.01258249, + "epoch": 0.24524274763264692, + "flos": 23483623392000.0, + "grad_norm": 1.4700896000405594, + "language_loss": 0.75762683, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.8356238, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18933105, + "step": 4079, + "time_per_iteration": 2.6327531337738037 + }, + { + "auxiliary_loss_clip": 0.06521107, + "auxiliary_loss_mlp": 0.01276139, + "balance_loss_clip": 0.06297373, + "balance_loss_mlp": 0.01256171, + "epoch": 0.24530287088531488, + "flos": 17206792439040.0, + "grad_norm": 1.743597814486786, + "language_loss": 0.75652814, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.1998291, + "step": 4080, + "time_per_iteration": 2.5027806758880615 + }, + { + "auxiliary_loss_clip": 0.06517033, + "auxiliary_loss_mlp": 0.01282693, + "balance_loss_clip": 0.06296979, + "balance_loss_mlp": 0.01262129, + "epoch": 0.24536299413798288, + "flos": 14872903539840.0, + "grad_norm": 1.7999885027482954, + "language_loss": 0.83532149, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91331875, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20544434, + "step": 4081, + "time_per_iteration": 3.9672679901123047 + }, + { + "auxiliary_loss_clip": 0.06524678, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06300613, + "balance_loss_mlp": 0.01257458, + "epoch": 0.24542311739065084, + "flos": 35270759347200.0, + "grad_norm": 2.0934334924975797, + "language_loss": 0.7376107, + "learning_rate": 3.532617254729267e-06, + "loss": 0.81562507, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19311523, + "step": 4082, + "time_per_iteration": 2.687596559524536 + }, + { + "auxiliary_loss_clip": 0.06520141, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06301866, + "balance_loss_mlp": 0.01254334, + "epoch": 0.2454832406433188, + "flos": 21508903019520.0, + "grad_norm": 4.081398895882933, + "language_loss": 0.72681344, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.8047362, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.17810059, + "step": 4083, + "time_per_iteration": 2.5715560913085938 + }, + { + "auxiliary_loss_clip": 0.06531677, + "auxiliary_loss_mlp": 0.01285124, + "balance_loss_clip": 0.06304878, + "balance_loss_mlp": 0.01263404, + "epoch": 0.24554336389598677, + "flos": 14761878480000.0, + "grad_norm": 2.078496591548884, + "language_loss": 0.75461411, + "learning_rate": 3.532116701561919e-06, + "loss": 0.83278215, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21704102, + "step": 4084, + "time_per_iteration": 2.527059316635132 + }, + { + "auxiliary_loss_clip": 0.06521569, + "auxiliary_loss_mlp": 0.01278312, + "balance_loss_clip": 0.06299873, + "balance_loss_mlp": 0.01259238, + "epoch": 0.24560348714865474, + "flos": 14981790320640.0, + "grad_norm": 1.9240939687866982, + "language_loss": 0.85311353, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93111229, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19055176, + "step": 4085, + "time_per_iteration": 4.107008695602417 + }, + { + "auxiliary_loss_clip": 0.06523392, + "auxiliary_loss_mlp": 0.01277742, + "balance_loss_clip": 0.06299591, + "balance_loss_mlp": 0.0125725, + "epoch": 0.2456636104013227, + "flos": 22682073634560.0, + "grad_norm": 1.671481131781836, + "language_loss": 0.79073685, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.86874819, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20495605, + "step": 4086, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.06519614, + "auxiliary_loss_mlp": 0.01278477, + "balance_loss_clip": 0.06300113, + "balance_loss_mlp": 0.01260107, + "epoch": 0.2457237336539907, + "flos": 27425307634560.0, + "grad_norm": 1.6115503736345718, + "language_loss": 0.75352013, + "learning_rate": 3.531365436099496e-06, + "loss": 0.83150113, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18359375, + "step": 4087, + "time_per_iteration": 4.046957015991211 + }, + { + "auxiliary_loss_clip": 0.06525059, + "auxiliary_loss_mlp": 0.01276774, + "balance_loss_clip": 0.06299827, + "balance_loss_mlp": 0.0125633, + "epoch": 0.24578385690665866, + "flos": 20418609692160.0, + "grad_norm": 2.7081304915573914, + "language_loss": 0.79987848, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.87789685, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20458984, + "step": 4088, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.06519316, + "auxiliary_loss_mlp": 0.01276403, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01258152, + "epoch": 0.24584398015932662, + "flos": 23922273116160.0, + "grad_norm": 2.802199957042034, + "language_loss": 0.77758735, + "learning_rate": 3.5308643020944e-06, + "loss": 0.85554451, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18249512, + "step": 4089, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.06525148, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01261021, + "epoch": 0.2459041034119946, + "flos": 41505313115520.0, + "grad_norm": 1.8031915906993192, + "language_loss": 0.81701422, + "learning_rate": 3.530613648011309e-06, + "loss": 0.89507812, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20214844, + "step": 4090, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.065328, + "auxiliary_loss_mlp": 0.01279305, + "balance_loss_clip": 0.06309135, + "balance_loss_mlp": 0.01258861, + "epoch": 0.24596422666466256, + "flos": 19942755955200.0, + "grad_norm": 2.438516046551743, + "language_loss": 0.73629344, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.8144145, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.20446777, + "step": 4091, + "time_per_iteration": 3.961276054382324 + }, + { + "auxiliary_loss_clip": 0.06539448, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 0.06316313, + "balance_loss_mlp": 0.0126148, + "epoch": 0.24602434991733052, + "flos": 21550970568960.0, + "grad_norm": 2.2480658521871897, + "language_loss": 0.77723873, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85543197, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18408203, + "step": 4092, + "time_per_iteration": 2.5494375228881836 + }, + { + "auxiliary_loss_clip": 0.06537454, + "auxiliary_loss_mlp": 0.01278374, + "balance_loss_clip": 0.06307742, + "balance_loss_mlp": 0.01258907, + "epoch": 0.24608447316999849, + "flos": 23191735294080.0, + "grad_norm": 2.380112015735871, + "language_loss": 0.82381165, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.90196991, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.19470215, + "step": 4093, + "time_per_iteration": 2.5551040172576904 + }, + { + "auxiliary_loss_clip": 0.06532703, + "auxiliary_loss_mlp": 0.01285, + "balance_loss_clip": 0.06305315, + "balance_loss_mlp": 0.01264412, + "epoch": 0.24614459642266648, + "flos": 19647345985920.0, + "grad_norm": 21.11973952887688, + "language_loss": 0.87671578, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95489287, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20581055, + "step": 4094, + "time_per_iteration": 2.534127712249756 + }, + { + "auxiliary_loss_clip": 0.06404499, + "auxiliary_loss_mlp": 0.01293713, + "balance_loss_clip": 0.06291573, + "balance_loss_mlp": 0.01289332, + "epoch": 0.24620471967533444, + "flos": 61757231109120.0, + "grad_norm": 0.7533459551406883, + "language_loss": 0.57023478, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.64721692, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04388428, + "step": 4095, + "time_per_iteration": 3.238482713699341 + }, + { + "auxiliary_loss_clip": 0.06404348, + "auxiliary_loss_mlp": 0.01286038, + "balance_loss_clip": 0.06290346, + "balance_loss_mlp": 0.01281767, + "epoch": 0.2462648429280024, + "flos": 69174431003520.0, + "grad_norm": 0.6365745764429788, + "language_loss": 0.56240451, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.63930833, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04275513, + "step": 4096, + "time_per_iteration": 3.3192596435546875 + }, + { + "auxiliary_loss_clip": 0.06545975, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06318395, + "balance_loss_mlp": 0.01262143, + "epoch": 0.24632496618067037, + "flos": 29467140727680.0, + "grad_norm": 1.505356285132213, + "language_loss": 0.78075927, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85903859, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19812012, + "step": 4097, + "time_per_iteration": 2.617108106613159 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01279842, + "balance_loss_clip": 0.06315026, + "balance_loss_mlp": 0.01259993, + "epoch": 0.24638508943333834, + "flos": 24323341484160.0, + "grad_norm": 2.0372573834811267, + "language_loss": 0.77321315, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.85148549, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.1986084, + "step": 4098, + "time_per_iteration": 2.6069419384002686 + }, + { + "auxiliary_loss_clip": 0.06542017, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.0631687, + "balance_loss_mlp": 0.01257341, + "epoch": 0.2464452126860063, + "flos": 26620236005760.0, + "grad_norm": 2.17921698337753, + "language_loss": 0.69183016, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.1817627, + "step": 4099, + "time_per_iteration": 2.655956506729126 + }, + { + "auxiliary_loss_clip": 0.06525709, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01256062, + "epoch": 0.24650533593867427, + "flos": 31220481813120.0, + "grad_norm": 2.2743270797915076, + "language_loss": 0.67268491, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.75068748, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18493652, + "step": 4100, + "time_per_iteration": 2.6497559547424316 + }, + { + "auxiliary_loss_clip": 0.0641202, + "auxiliary_loss_mlp": 0.01258309, + "balance_loss_clip": 0.06296985, + "balance_loss_mlp": 0.01253758, + "epoch": 0.24656545919134226, + "flos": 68513269962240.0, + "grad_norm": 0.6889590379062642, + "language_loss": 0.61607081, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.69277412, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.04559326, + "step": 4101, + "time_per_iteration": 3.2961082458496094 + }, + { + "auxiliary_loss_clip": 0.06538613, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06317261, + "balance_loss_mlp": 0.01259, + "epoch": 0.24662558244401023, + "flos": 20090398049280.0, + "grad_norm": 1.6193028382456236, + "language_loss": 0.73591036, + "learning_rate": 3.527601274535012e-06, + "loss": 0.81407589, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18945312, + "step": 4102, + "time_per_iteration": 2.542275905609131 + }, + { + "auxiliary_loss_clip": 0.0654332, + "auxiliary_loss_mlp": 0.01273749, + "balance_loss_clip": 0.06317908, + "balance_loss_mlp": 0.01255152, + "epoch": 0.2466857056966782, + "flos": 30709310780160.0, + "grad_norm": 2.0137613654817854, + "language_loss": 0.76325667, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.84142733, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18603516, + "step": 4103, + "time_per_iteration": 2.6544189453125 + }, + { + "auxiliary_loss_clip": 0.06542745, + "auxiliary_loss_mlp": 0.01273413, + "balance_loss_clip": 0.06315098, + "balance_loss_mlp": 0.01253159, + "epoch": 0.24674582894934616, + "flos": 22535102373120.0, + "grad_norm": 2.0816413841430697, + "language_loss": 0.79265451, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87081611, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20275879, + "step": 4104, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.06525403, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06306183, + "balance_loss_mlp": 0.01257251, + "epoch": 0.24680595220201412, + "flos": 20710581644160.0, + "grad_norm": 1.7450607123984514, + "language_loss": 0.83681756, + "learning_rate": 3.526846877170133e-06, + "loss": 0.9148404, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19641113, + "step": 4105, + "time_per_iteration": 2.553579330444336 + }, + { + "auxiliary_loss_clip": 0.06533727, + "auxiliary_loss_mlp": 0.01273598, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01255371, + "epoch": 0.2468660754546821, + "flos": 21836946954240.0, + "grad_norm": 1.9208859898797113, + "language_loss": 0.77469373, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85276699, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18212891, + "step": 4106, + "time_per_iteration": 2.5389256477355957 + }, + { + "auxiliary_loss_clip": 0.06534247, + "auxiliary_loss_mlp": 0.01276275, + "balance_loss_clip": 0.06310344, + "balance_loss_mlp": 0.01257463, + "epoch": 0.24692619870735008, + "flos": 15273049512960.0, + "grad_norm": 2.4615103155960485, + "language_loss": 0.73436344, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.81246865, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18798828, + "step": 4107, + "time_per_iteration": 2.5545566082000732 + }, + { + "auxiliary_loss_clip": 0.06538644, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06314194, + "balance_loss_mlp": 0.01256745, + "epoch": 0.24698632196001805, + "flos": 29687933036160.0, + "grad_norm": 2.1377324014009504, + "language_loss": 0.66432422, + "learning_rate": 3.526091958721587e-06, + "loss": 0.7424612, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18322754, + "step": 4108, + "time_per_iteration": 2.6196486949920654 + }, + { + "auxiliary_loss_clip": 0.06540007, + "auxiliary_loss_mlp": 0.01277779, + "balance_loss_clip": 0.06313555, + "balance_loss_mlp": 0.01259623, + "epoch": 0.247046445212686, + "flos": 39174736452480.0, + "grad_norm": 2.010829594577025, + "language_loss": 0.73608756, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18151855, + "step": 4109, + "time_per_iteration": 2.764406442642212 + }, + { + "auxiliary_loss_clip": 0.06534623, + "auxiliary_loss_mlp": 0.01277352, + "balance_loss_clip": 0.06311052, + "balance_loss_mlp": 0.01259077, + "epoch": 0.24710656846535398, + "flos": 23004834762240.0, + "grad_norm": 1.68605601916547, + "language_loss": 0.79419786, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.87231761, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.18249512, + "step": 4110, + "time_per_iteration": 2.5460774898529053 + }, + { + "auxiliary_loss_clip": 0.06540776, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 0.06313831, + "balance_loss_mlp": 0.01257032, + "epoch": 0.24716669171802194, + "flos": 26440085727360.0, + "grad_norm": 2.6454329848736604, + "language_loss": 0.81789577, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.89607012, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.19628906, + "step": 4111, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.06537174, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06311068, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2472268149706899, + "flos": 23336358641280.0, + "grad_norm": 1.983709335436533, + "language_loss": 0.75390071, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.83201408, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18115234, + "step": 4112, + "time_per_iteration": 2.5546083450317383 + }, + { + "auxiliary_loss_clip": 0.06533875, + "auxiliary_loss_mlp": 0.01274467, + "balance_loss_clip": 0.06308994, + "balance_loss_mlp": 0.01255548, + "epoch": 0.24728693822335787, + "flos": 23775469562880.0, + "grad_norm": 2.380234182887367, + "language_loss": 0.83472633, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91280973, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.18920898, + "step": 4113, + "time_per_iteration": 2.6223254203796387 + }, + { + "auxiliary_loss_clip": 0.06540644, + "auxiliary_loss_mlp": 0.01276865, + "balance_loss_clip": 0.06315883, + "balance_loss_mlp": 0.01257279, + "epoch": 0.24734706147602586, + "flos": 19323494755200.0, + "grad_norm": 2.0367731486494636, + "language_loss": 0.87924093, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.95741606, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19580078, + "step": 4114, + "time_per_iteration": 2.5495545864105225 + }, + { + "auxiliary_loss_clip": 0.06532501, + "auxiliary_loss_mlp": 0.01273212, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01255474, + "epoch": 0.24740718472869383, + "flos": 28044275345280.0, + "grad_norm": 1.9170399047542779, + "language_loss": 0.75640035, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.17736816, + "step": 4115, + "time_per_iteration": 2.6333982944488525 + }, + { + "auxiliary_loss_clip": 0.0642873, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06315603, + "balance_loss_mlp": 0.01259151, + "epoch": 0.2474673079813618, + "flos": 68129265899520.0, + "grad_norm": 0.63897767002188, + "language_loss": 0.58004332, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.65697974, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.05752563, + "step": 4116, + "time_per_iteration": 3.251235246658325 + }, + { + "auxiliary_loss_clip": 0.06532618, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 0.063094, + "balance_loss_mlp": 0.01257022, + "epoch": 0.24752743123402976, + "flos": 29470075620480.0, + "grad_norm": 1.407143363910891, + "language_loss": 0.8425988, + "learning_rate": 3.523824079451235e-06, + "loss": 0.92068678, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19152832, + "step": 4117, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.06425081, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0631275, + "balance_loss_mlp": 0.01262089, + "epoch": 0.24758755448669773, + "flos": 58367946908160.0, + "grad_norm": 0.8764773034828885, + "language_loss": 0.63508207, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.71200383, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.05001831, + "step": 4118, + "time_per_iteration": 3.052507162094116 + }, + { + "auxiliary_loss_clip": 0.0652981, + "auxiliary_loss_mlp": 0.01277419, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01258203, + "epoch": 0.2476476777393657, + "flos": 20490502095360.0, + "grad_norm": 1.7262960547494681, + "language_loss": 0.80051601, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87858826, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.19238281, + "step": 4119, + "time_per_iteration": 2.554318428039551 + }, + { + "auxiliary_loss_clip": 0.06530587, + "auxiliary_loss_mlp": 0.01282865, + "balance_loss_clip": 0.06310613, + "balance_loss_mlp": 0.01265198, + "epoch": 0.24770780099203366, + "flos": 20492179176960.0, + "grad_norm": 2.4192345138137386, + "language_loss": 0.74556476, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.8236993, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.17663574, + "step": 4120, + "time_per_iteration": 3.996234655380249 + }, + { + "auxiliary_loss_clip": 0.06531808, + "auxiliary_loss_mlp": 0.01276043, + "balance_loss_clip": 0.06307146, + "balance_loss_mlp": 0.01256362, + "epoch": 0.24776792424470165, + "flos": 15157915603200.0, + "grad_norm": 2.13486110959629, + "language_loss": 0.89734054, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97541904, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19689941, + "step": 4121, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.06540959, + "auxiliary_loss_mlp": 0.01278306, + "balance_loss_clip": 0.06314932, + "balance_loss_mlp": 0.01258744, + "epoch": 0.2478280474973696, + "flos": 21731833607040.0, + "grad_norm": 2.0829104418917646, + "language_loss": 0.69792116, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.77611381, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19580078, + "step": 4122, + "time_per_iteration": 2.5857455730438232 + }, + { + "auxiliary_loss_clip": 0.06535036, + "auxiliary_loss_mlp": 0.01273779, + "balance_loss_clip": 0.0630946, + "balance_loss_mlp": 0.01254729, + "epoch": 0.24788817075003758, + "flos": 20418400056960.0, + "grad_norm": 2.5894895086667264, + "language_loss": 0.80832231, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88641047, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.533696174621582 + }, + { + "auxiliary_loss_clip": 0.06528741, + "auxiliary_loss_mlp": 0.01276684, + "balance_loss_clip": 0.06306656, + "balance_loss_mlp": 0.01259625, + "epoch": 0.24794829400270554, + "flos": 22599867179520.0, + "grad_norm": 2.45373622595604, + "language_loss": 0.75091624, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.82897043, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1706543, + "step": 4124, + "time_per_iteration": 2.5478947162628174 + }, + { + "auxiliary_loss_clip": 0.06523614, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01258708, + "epoch": 0.2480084172553735, + "flos": 39685362433920.0, + "grad_norm": 1.4066224864196382, + "language_loss": 0.74510413, + "learning_rate": 3.521804257268357e-06, + "loss": 0.82310236, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.17504883, + "step": 4125, + "time_per_iteration": 4.164500951766968 + }, + { + "auxiliary_loss_clip": 0.06546921, + "auxiliary_loss_mlp": 0.01279637, + "balance_loss_clip": 0.06313127, + "balance_loss_mlp": 0.01260599, + "epoch": 0.24806854050804147, + "flos": 22060129104000.0, + "grad_norm": 1.9518521214536066, + "language_loss": 0.69807184, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.77633739, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.19030762, + "step": 4126, + "time_per_iteration": 2.520550489425659 + }, + { + "auxiliary_loss_clip": 0.06526291, + "auxiliary_loss_mlp": 0.0127589, + "balance_loss_clip": 0.06304894, + "balance_loss_mlp": 0.01257281, + "epoch": 0.24812866376070947, + "flos": 15492164739840.0, + "grad_norm": 2.6036079521490834, + "language_loss": 0.81805199, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.89607382, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18615723, + "step": 4127, + "time_per_iteration": 4.052755832672119 + }, + { + "auxiliary_loss_clip": 0.06533966, + "auxiliary_loss_mlp": 0.012739, + "balance_loss_clip": 0.06306454, + "balance_loss_mlp": 0.01255494, + "epoch": 0.24818878701337743, + "flos": 14762758947840.0, + "grad_norm": 2.4130643839940746, + "language_loss": 0.85122234, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.92930102, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.18395996, + "step": 4128, + "time_per_iteration": 2.5801029205322266 + }, + { + "auxiliary_loss_clip": 0.06541854, + "auxiliary_loss_mlp": 0.01278965, + "balance_loss_clip": 0.06316209, + "balance_loss_mlp": 0.01260821, + "epoch": 0.2482489102660454, + "flos": 27096886356480.0, + "grad_norm": 2.0112959815575713, + "language_loss": 0.66149813, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.73970628, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18151855, + "step": 4129, + "time_per_iteration": 2.5865726470947266 + }, + { + "auxiliary_loss_clip": 0.06528358, + "auxiliary_loss_mlp": 0.01276243, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.01257444, + "epoch": 0.24830903351871336, + "flos": 26474522555520.0, + "grad_norm": 1.7021812681223303, + "language_loss": 0.75761282, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83565885, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18811035, + "step": 4130, + "time_per_iteration": 2.6659512519836426 + }, + { + "auxiliary_loss_clip": 0.06526491, + "auxiliary_loss_mlp": 0.01274514, + "balance_loss_clip": 0.06302534, + "balance_loss_mlp": 0.01255, + "epoch": 0.24836915677138133, + "flos": 10232225337600.0, + "grad_norm": 2.0871707802719004, + "language_loss": 0.77625716, + "learning_rate": 3.520286966670535e-06, + "loss": 0.85426718, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.19519043, + "step": 4131, + "time_per_iteration": 3.906522274017334 + }, + { + "auxiliary_loss_clip": 0.06519566, + "auxiliary_loss_mlp": 0.01270892, + "balance_loss_clip": 0.0630278, + "balance_loss_mlp": 0.01253582, + "epoch": 0.2484292800240493, + "flos": 30088162863360.0, + "grad_norm": 1.7622390062278706, + "language_loss": 0.84475207, + "learning_rate": 3.520033883075255e-06, + "loss": 0.92265671, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.17297363, + "step": 4132, + "time_per_iteration": 2.6436057090759277 + }, + { + "auxiliary_loss_clip": 0.06525066, + "auxiliary_loss_mlp": 0.01275924, + "balance_loss_clip": 0.06302708, + "balance_loss_mlp": 0.01256779, + "epoch": 0.24848940327671726, + "flos": 13447899878400.0, + "grad_norm": 1.545647189211169, + "language_loss": 0.71393758, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19152832, + "step": 4133, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.06542444, + "auxiliary_loss_mlp": 0.01275489, + "balance_loss_clip": 0.06309851, + "balance_loss_mlp": 0.01255116, + "epoch": 0.24854952652938525, + "flos": 19975683409920.0, + "grad_norm": 2.3352452144714513, + "language_loss": 0.6286931, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.70687246, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20373535, + "step": 4134, + "time_per_iteration": 2.571525812149048 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01258883, + "epoch": 0.24860964978205322, + "flos": 18156026217600.0, + "grad_norm": 1.960513817978903, + "language_loss": 0.79140246, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86942399, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18383789, + "step": 4135, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06524552, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06303368, + "balance_loss_mlp": 0.01256294, + "epoch": 0.24866977303472118, + "flos": 11733397960320.0, + "grad_norm": 2.2852251503119234, + "language_loss": 0.8410641, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.9190594, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18676758, + "step": 4136, + "time_per_iteration": 2.497654676437378 + }, + { + "auxiliary_loss_clip": 0.06524116, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06297501, + "balance_loss_mlp": 0.01254521, + "epoch": 0.24872989628738915, + "flos": 34832109623040.0, + "grad_norm": 1.7046352309858128, + "language_loss": 0.71601558, + "learning_rate": 3.518767600693314e-06, + "loss": 0.79399109, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18908691, + "step": 4137, + "time_per_iteration": 2.732480764389038 + }, + { + "auxiliary_loss_clip": 0.06525281, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 0.06299166, + "balance_loss_mlp": 0.01255549, + "epoch": 0.2487900195400571, + "flos": 13704512607360.0, + "grad_norm": 2.5230361612400296, + "language_loss": 0.67583597, + "learning_rate": 3.518514171403042e-06, + "loss": 0.7538265, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.18212891, + "step": 4138, + "time_per_iteration": 2.531855583190918 + }, + { + "auxiliary_loss_clip": 0.06519014, + "auxiliary_loss_mlp": 0.01272692, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.01254501, + "epoch": 0.24885014279272508, + "flos": 25344845009280.0, + "grad_norm": 1.9341473695701388, + "language_loss": 0.83479851, + "learning_rate": 3.51826068453056e-06, + "loss": 0.91271555, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.18188477, + "step": 4139, + "time_per_iteration": 2.6051557064056396 + }, + { + "auxiliary_loss_clip": 0.06528804, + "auxiliary_loss_mlp": 0.01275882, + "balance_loss_clip": 0.06300579, + "balance_loss_mlp": 0.01255711, + "epoch": 0.24891026604539307, + "flos": 20637724919040.0, + "grad_norm": 1.6977646822397727, + "language_loss": 0.79297662, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20178223, + "step": 4140, + "time_per_iteration": 2.5448291301727295 + }, + { + "auxiliary_loss_clip": 0.0641291, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06303305, + "balance_loss_mlp": 0.0126555, + "epoch": 0.24897038929806103, + "flos": 66979086030720.0, + "grad_norm": 0.8107945435966392, + "language_loss": 0.60717231, + "learning_rate": 3.51775353807742e-06, + "loss": 0.68400407, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.04705811, + "step": 4141, + "time_per_iteration": 3.2685940265655518 + }, + { + "auxiliary_loss_clip": 0.06525983, + "auxiliary_loss_mlp": 0.01275717, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01256894, + "epoch": 0.249030512550729, + "flos": 36401359288320.0, + "grad_norm": 1.7802793710753735, + "language_loss": 0.72871864, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.80673563, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18823242, + "step": 4142, + "time_per_iteration": 2.6564056873321533 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01276725, + "balance_loss_clip": 0.06302793, + "balance_loss_mlp": 0.0125789, + "epoch": 0.24909063580339696, + "flos": 20160361808640.0, + "grad_norm": 1.9535741137498925, + "language_loss": 0.81280798, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8908, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18823242, + "step": 4143, + "time_per_iteration": 2.5795881748199463 + }, + { + "auxiliary_loss_clip": 0.06522508, + "auxiliary_loss_mlp": 0.01275624, + "balance_loss_clip": 0.06301625, + "balance_loss_mlp": 0.01257039, + "epoch": 0.24915075905606493, + "flos": 26403887963520.0, + "grad_norm": 1.964912825826696, + "language_loss": 0.59448719, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.67246854, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18579102, + "step": 4144, + "time_per_iteration": 2.5888898372650146 + }, + { + "auxiliary_loss_clip": 0.06520054, + "auxiliary_loss_mlp": 0.01279478, + "balance_loss_clip": 0.06300642, + "balance_loss_mlp": 0.01260608, + "epoch": 0.2492108823087329, + "flos": 27534655612800.0, + "grad_norm": 2.2926576094039253, + "language_loss": 0.79198605, + "learning_rate": 3.516738554607708e-06, + "loss": 0.86998141, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18859863, + "step": 4145, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.06539698, + "auxiliary_loss_mlp": 0.01282889, + "balance_loss_clip": 0.06307465, + "balance_loss_mlp": 0.01262587, + "epoch": 0.24927100556140086, + "flos": 16697088852480.0, + "grad_norm": 2.388513156986414, + "language_loss": 0.65914291, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.73736882, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20300293, + "step": 4146, + "time_per_iteration": 2.550225019454956 + }, + { + "auxiliary_loss_clip": 0.06418058, + "auxiliary_loss_mlp": 0.01257626, + "balance_loss_clip": 0.06307501, + "balance_loss_mlp": 0.01252389, + "epoch": 0.24933112881406885, + "flos": 62791899724800.0, + "grad_norm": 0.9255702942051489, + "language_loss": 0.67495543, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.75171226, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05239868, + "step": 4147, + "time_per_iteration": 3.2676596641540527 + }, + { + "auxiliary_loss_clip": 0.06525366, + "auxiliary_loss_mlp": 0.01281982, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01261764, + "epoch": 0.24939125206673682, + "flos": 26659242881280.0, + "grad_norm": 1.678024692441642, + "language_loss": 0.89250457, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.97057807, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.20214844, + "step": 4148, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06529681, + "auxiliary_loss_mlp": 0.01281757, + "balance_loss_clip": 0.06300169, + "balance_loss_mlp": 0.0125968, + "epoch": 0.24945137531940478, + "flos": 20710623571200.0, + "grad_norm": 1.8952521518004763, + "language_loss": 0.68350649, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.76162088, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22070312, + "step": 4149, + "time_per_iteration": 2.52567720413208 + }, + { + "auxiliary_loss_clip": 0.06528307, + "auxiliary_loss_mlp": 0.0128627, + "balance_loss_clip": 0.06306647, + "balance_loss_mlp": 0.01266398, + "epoch": 0.24951149857207275, + "flos": 23775385708800.0, + "grad_norm": 1.639238516163445, + "language_loss": 0.71759897, + "learning_rate": 3.515468531258095e-06, + "loss": 0.79574472, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1986084, + "step": 4150, + "time_per_iteration": 2.580000877380371 + }, + { + "auxiliary_loss_clip": 0.06529218, + "auxiliary_loss_mlp": 0.01284871, + "balance_loss_clip": 0.06303831, + "balance_loss_mlp": 0.01264129, + "epoch": 0.2495716218247407, + "flos": 15669589760640.0, + "grad_norm": 1.939767404293352, + "language_loss": 0.73002028, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80816114, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20739746, + "step": 4151, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.06534886, + "auxiliary_loss_mlp": 0.01281273, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01261055, + "epoch": 0.24963174507740868, + "flos": 24057924076800.0, + "grad_norm": 4.265592628376469, + "language_loss": 0.64070994, + "learning_rate": 3.514960119583781e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20227051, + "step": 4152, + "time_per_iteration": 2.5687365531921387 + }, + { + "auxiliary_loss_clip": 0.06516105, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 0.06296911, + "balance_loss_mlp": 0.01259979, + "epoch": 0.24969186833007664, + "flos": 21806073924480.0, + "grad_norm": 2.335025994250793, + "language_loss": 0.7798419, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85780108, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19812012, + "step": 4153, + "time_per_iteration": 2.5565860271453857 + }, + { + "auxiliary_loss_clip": 0.06523906, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06304043, + "balance_loss_mlp": 0.01257806, + "epoch": 0.24975199158274464, + "flos": 19944307255680.0, + "grad_norm": 2.3946475317027978, + "language_loss": 0.77287221, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19152832, + "step": 4154, + "time_per_iteration": 2.5327064990997314 + }, + { + "auxiliary_loss_clip": 0.06533594, + "auxiliary_loss_mlp": 0.0128089, + "balance_loss_clip": 0.06299926, + "balance_loss_mlp": 0.01258145, + "epoch": 0.2498121148354126, + "flos": 25345515841920.0, + "grad_norm": 1.7912237432514402, + "language_loss": 0.71052945, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78867429, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22766113, + "step": 4155, + "time_per_iteration": 2.566044330596924 + }, + { + "auxiliary_loss_clip": 0.06528749, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06300025, + "balance_loss_mlp": 0.01257809, + "epoch": 0.24987223808808057, + "flos": 20565119756160.0, + "grad_norm": 1.6974291352944781, + "language_loss": 0.75592315, + "learning_rate": 3.513942606943036e-06, + "loss": 0.83399028, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20141602, + "step": 4156, + "time_per_iteration": 2.5388355255126953 + }, + { + "auxiliary_loss_clip": 0.06524897, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.0125842, + "epoch": 0.24993236134074853, + "flos": 19754052560640.0, + "grad_norm": 3.125892113983293, + "language_loss": 0.77757698, + "learning_rate": 3.513688085236591e-06, + "loss": 0.85561097, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.20068359, + "step": 4157, + "time_per_iteration": 2.5327329635620117 + }, + { + "auxiliary_loss_clip": 0.06527505, + "auxiliary_loss_mlp": 0.012775, + "balance_loss_clip": 0.06301083, + "balance_loss_mlp": 0.01257068, + "epoch": 0.2499924845934165, + "flos": 18776209812480.0, + "grad_norm": 1.8891569690037928, + "language_loss": 0.82203197, + "learning_rate": 3.513433506130942e-06, + "loss": 0.90008199, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20422363, + "step": 4158, + "time_per_iteration": 2.5894827842712402 + }, + { + "auxiliary_loss_clip": 0.06518973, + "auxiliary_loss_mlp": 0.01272913, + "balance_loss_clip": 0.06295922, + "balance_loss_mlp": 0.012544, + "epoch": 0.25005260784608446, + "flos": 16877658401280.0, + "grad_norm": 2.206587551308884, + "language_loss": 0.75718945, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.83510834, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18505859, + "step": 4159, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.06529576, + "auxiliary_loss_mlp": 0.01278956, + "balance_loss_clip": 0.06300279, + "balance_loss_mlp": 0.01258142, + "epoch": 0.2501127310987524, + "flos": 22131057185280.0, + "grad_norm": 2.1699031495969354, + "language_loss": 0.71598893, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7940743, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.20812988, + "step": 4160, + "time_per_iteration": 3.9746532440185547 + }, + { + "auxiliary_loss_clip": 0.06424317, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06313459, + "balance_loss_mlp": 0.01263326, + "epoch": 0.2501728543514204, + "flos": 69480071170560.0, + "grad_norm": 0.7438462037708533, + "language_loss": 0.56844532, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.64536446, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.04278564, + "step": 4161, + "time_per_iteration": 3.233760356903076 + }, + { + "auxiliary_loss_clip": 0.06530809, + "auxiliary_loss_mlp": 0.01282686, + "balance_loss_clip": 0.06298731, + "balance_loss_mlp": 0.01261848, + "epoch": 0.25023297760408836, + "flos": 16295601214080.0, + "grad_norm": 2.49700797922569, + "language_loss": 0.8179751, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.89611006, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20849609, + "step": 4162, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.0652239, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 0.06294353, + "balance_loss_mlp": 0.01260358, + "epoch": 0.2502931008567563, + "flos": 12242598422400.0, + "grad_norm": 2.2503072324763616, + "language_loss": 0.88019562, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.95822597, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.203125, + "step": 4163, + "time_per_iteration": 2.531467914581299 + }, + { + "auxiliary_loss_clip": 0.06520548, + "auxiliary_loss_mlp": 0.01277405, + "balance_loss_clip": 0.06293885, + "balance_loss_mlp": 0.01257092, + "epoch": 0.25035322410942434, + "flos": 23188003787520.0, + "grad_norm": 1.6365124228332002, + "language_loss": 0.83867121, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91665077, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20300293, + "step": 4164, + "time_per_iteration": 4.068189382553101 + }, + { + "auxiliary_loss_clip": 0.06509531, + "auxiliary_loss_mlp": 0.01280667, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01262106, + "epoch": 0.2504133473620923, + "flos": 20922904690560.0, + "grad_norm": 1.788160941639295, + "language_loss": 0.7460506, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18579102, + "step": 4165, + "time_per_iteration": 2.568701982498169 + }, + { + "auxiliary_loss_clip": 0.06526586, + "auxiliary_loss_mlp": 0.01278077, + "balance_loss_clip": 0.06293961, + "balance_loss_mlp": 0.01257883, + "epoch": 0.2504734706147603, + "flos": 20782725609600.0, + "grad_norm": 1.8100288551258081, + "language_loss": 0.74429101, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.82233763, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2019043, + "step": 4166, + "time_per_iteration": 3.989368438720703 + }, + { + "auxiliary_loss_clip": 0.065147, + "auxiliary_loss_mlp": 0.01277163, + "balance_loss_clip": 0.06293219, + "balance_loss_mlp": 0.0125778, + "epoch": 0.25053359386742824, + "flos": 24355681960320.0, + "grad_norm": 1.5960764456675967, + "language_loss": 0.82469785, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.90261644, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19384766, + "step": 4167, + "time_per_iteration": 2.554733991622925 + }, + { + "auxiliary_loss_clip": 0.06513357, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 0.06292276, + "balance_loss_mlp": 0.01260614, + "epoch": 0.2505937171200962, + "flos": 21220578720000.0, + "grad_norm": 1.9887592956808484, + "language_loss": 0.80394876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.88188636, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19799805, + "step": 4168, + "time_per_iteration": 2.576969623565674 + }, + { + "auxiliary_loss_clip": 0.06531397, + "auxiliary_loss_mlp": 0.01277594, + "balance_loss_clip": 0.06300385, + "balance_loss_mlp": 0.01256196, + "epoch": 0.25065384037276417, + "flos": 41436816802560.0, + "grad_norm": 4.930314721126017, + "language_loss": 0.69985271, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7779426, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21386719, + "step": 4169, + "time_per_iteration": 2.709149122238159 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01277868, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.0125827, + "epoch": 0.25071396362543213, + "flos": 26109274608000.0, + "grad_norm": 1.904216953279787, + "language_loss": 0.77927327, + "learning_rate": 3.510374083241361e-06, + "loss": 0.85716957, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19592285, + "step": 4170, + "time_per_iteration": 4.016170024871826 + }, + { + "auxiliary_loss_clip": 0.0651409, + "auxiliary_loss_mlp": 0.01278168, + "balance_loss_clip": 0.06291165, + "balance_loss_mlp": 0.01258975, + "epoch": 0.2507740868781001, + "flos": 19105008433920.0, + "grad_norm": 2.5077494433812966, + "language_loss": 0.76900339, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1920166, + "step": 4171, + "time_per_iteration": 2.5651609897613525 + }, + { + "auxiliary_loss_clip": 0.06406491, + "auxiliary_loss_mlp": 0.01262132, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01257083, + "epoch": 0.25083421013076806, + "flos": 64361652514560.0, + "grad_norm": 0.8214086964760371, + "language_loss": 0.6006844, + "learning_rate": 3.509863377145458e-06, + "loss": 0.67737067, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05047607, + "step": 4172, + "time_per_iteration": 3.1837103366851807 + }, + { + "auxiliary_loss_clip": 0.06520402, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.012603, + "epoch": 0.25089433338343603, + "flos": 24286430960640.0, + "grad_norm": 1.3489665028935822, + "language_loss": 0.79424238, + "learning_rate": 3.509607938211409e-06, + "loss": 0.87225777, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20849609, + "step": 4173, + "time_per_iteration": 2.6214826107025146 + }, + { + "auxiliary_loss_clip": 0.06513289, + "auxiliary_loss_mlp": 0.01273745, + "balance_loss_clip": 0.06291197, + "balance_loss_mlp": 0.01254398, + "epoch": 0.250954456636104, + "flos": 14726896600320.0, + "grad_norm": 1.8312177549547823, + "language_loss": 0.83930022, + "learning_rate": 3.509352442032875e-06, + "loss": 0.91717052, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19360352, + "step": 4174, + "time_per_iteration": 2.5973377227783203 + }, + { + "auxiliary_loss_clip": 0.06519122, + "auxiliary_loss_mlp": 0.0127901, + "balance_loss_clip": 0.0629285, + "balance_loss_mlp": 0.01259341, + "epoch": 0.25101457988877196, + "flos": 22280208652800.0, + "grad_norm": 2.088546315652338, + "language_loss": 0.71558678, + "learning_rate": 3.509096888619545e-06, + "loss": 0.79356813, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19665527, + "step": 4175, + "time_per_iteration": 2.6718719005584717 + }, + { + "auxiliary_loss_clip": 0.06522886, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06295571, + "balance_loss_mlp": 0.01256502, + "epoch": 0.2510747031414399, + "flos": 25195441979520.0, + "grad_norm": 1.9595604726907228, + "language_loss": 0.81335604, + "learning_rate": 3.50884127798111e-06, + "loss": 0.891361, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2109375, + "step": 4176, + "time_per_iteration": 2.5455691814422607 + }, + { + "auxiliary_loss_clip": 0.06515132, + "auxiliary_loss_mlp": 0.01279504, + "balance_loss_clip": 0.06292217, + "balance_loss_mlp": 0.01257319, + "epoch": 0.25113482639410795, + "flos": 20710455863040.0, + "grad_norm": 1.8805810902271358, + "language_loss": 0.83346581, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.91141224, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.22167969, + "step": 4177, + "time_per_iteration": 2.5471949577331543 + }, + { + "auxiliary_loss_clip": 0.06520942, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 0.06300486, + "balance_loss_mlp": 0.01256375, + "epoch": 0.2511949496467759, + "flos": 21513347285760.0, + "grad_norm": 2.081094632338002, + "language_loss": 0.83410418, + "learning_rate": 3.508329885067698e-06, + "loss": 0.91207987, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20251465, + "step": 4178, + "time_per_iteration": 2.5352370738983154 + }, + { + "auxiliary_loss_clip": 0.06514454, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01255949, + "epoch": 0.2512550728994439, + "flos": 20707898313600.0, + "grad_norm": 2.160080340734635, + "language_loss": 0.75744665, + "learning_rate": 3.508074102812112e-06, + "loss": 0.83533603, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.18554688, + "step": 4179, + "time_per_iteration": 2.560995578765869 + }, + { + "auxiliary_loss_clip": 0.0652363, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06298499, + "balance_loss_mlp": 0.01261053, + "epoch": 0.25131519615211184, + "flos": 18484531349760.0, + "grad_norm": 2.0850842878171347, + "language_loss": 0.70515448, + "learning_rate": 3.507818263370206e-06, + "loss": 0.78321338, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2121582, + "step": 4180, + "time_per_iteration": 2.510233163833618 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01275296, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.0125565, + "epoch": 0.2513753194047798, + "flos": 20491131000960.0, + "grad_norm": 1.8144815234901748, + "language_loss": 0.86591852, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.94378912, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19628906, + "step": 4181, + "time_per_iteration": 2.546736240386963 + }, + { + "auxiliary_loss_clip": 0.06519435, + "auxiliary_loss_mlp": 0.01276165, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01256555, + "epoch": 0.25143544265744777, + "flos": 37679182053120.0, + "grad_norm": 1.8572714108551465, + "language_loss": 0.68626046, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76421642, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19616699, + "step": 4182, + "time_per_iteration": 2.6632721424102783 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.0630056, + "balance_loss_mlp": 0.012679, + "epoch": 0.25149556591011574, + "flos": 69386502487680.0, + "grad_norm": 0.837431587640593, + "language_loss": 0.70118701, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.77799207, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.03881836, + "step": 4183, + "time_per_iteration": 3.194293737411499 + }, + { + "auxiliary_loss_clip": 0.0651418, + "auxiliary_loss_mlp": 0.01278526, + "balance_loss_clip": 0.06292195, + "balance_loss_mlp": 0.01258725, + "epoch": 0.2515556891627837, + "flos": 13995478310400.0, + "grad_norm": 2.4106350957321805, + "language_loss": 0.74627292, + "learning_rate": 3.506794333933431e-06, + "loss": 0.82419991, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.19799805, + "step": 4184, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.0652144, + "auxiliary_loss_mlp": 0.01279322, + "balance_loss_clip": 0.06299628, + "balance_loss_mlp": 0.01258496, + "epoch": 0.25161581241545167, + "flos": 22170022133760.0, + "grad_norm": 2.9216799071507964, + "language_loss": 0.83484751, + "learning_rate": 3.506538208705484e-06, + "loss": 0.91285515, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.20837402, + "step": 4185, + "time_per_iteration": 2.5535552501678467 + }, + { + "auxiliary_loss_clip": 0.06393237, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06284703, + "balance_loss_mlp": 0.01258632, + "epoch": 0.25167593566811963, + "flos": 69375936873600.0, + "grad_norm": 0.7619629684954553, + "language_loss": 0.61517715, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.69173163, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.03567505, + "step": 4186, + "time_per_iteration": 3.0749270915985107 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06296861, + "balance_loss_mlp": 0.01256946, + "epoch": 0.2517360589207876, + "flos": 13266533715840.0, + "grad_norm": 1.9855339768496567, + "language_loss": 0.79795682, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.87589443, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.1940918, + "step": 4187, + "time_per_iteration": 2.507354974746704 + }, + { + "auxiliary_loss_clip": 0.06517795, + "auxiliary_loss_mlp": 0.01276527, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01257001, + "epoch": 0.25179618217345556, + "flos": 20383208542080.0, + "grad_norm": 1.642205422551737, + "language_loss": 0.80147833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87942159, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4188, + "time_per_iteration": 2.5763680934906006 + }, + { + "auxiliary_loss_clip": 0.06512115, + "auxiliary_loss_mlp": 0.01281194, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261405, + "epoch": 0.25185630542612353, + "flos": 27670767770880.0, + "grad_norm": 1.9118309511671905, + "language_loss": 0.75198257, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.8299157, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 4189, + "time_per_iteration": 2.5764901638031006 + }, + { + "auxiliary_loss_clip": 0.06511948, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06296545, + "balance_loss_mlp": 0.01255253, + "epoch": 0.25191642867879155, + "flos": 21002805158400.0, + "grad_norm": 1.9652552730181423, + "language_loss": 0.84938216, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92722976, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17565918, + "step": 4190, + "time_per_iteration": 2.592289447784424 + }, + { + "auxiliary_loss_clip": 0.06519347, + "auxiliary_loss_mlp": 0.01277887, + "balance_loss_clip": 0.0629743, + "balance_loss_mlp": 0.01256513, + "epoch": 0.2519765519314595, + "flos": 21112027355520.0, + "grad_norm": 3.618444667756858, + "language_loss": 0.7581113, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.83608365, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21386719, + "step": 4191, + "time_per_iteration": 2.526263952255249 + }, + { + "auxiliary_loss_clip": 0.06391463, + "auxiliary_loss_mlp": 0.01256383, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252372, + "epoch": 0.2520366751841275, + "flos": 62765932158720.0, + "grad_norm": 0.7119135795788611, + "language_loss": 0.56952, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.64599848, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0401001, + "step": 4192, + "time_per_iteration": 3.271810531616211 + }, + { + "auxiliary_loss_clip": 0.06513695, + "auxiliary_loss_mlp": 0.01277171, + "balance_loss_clip": 0.06298056, + "balance_loss_mlp": 0.01257835, + "epoch": 0.25209679843679544, + "flos": 22236254386560.0, + "grad_norm": 1.9003966807864532, + "language_loss": 0.77017993, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84808856, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19335938, + "step": 4193, + "time_per_iteration": 2.57377028465271 + }, + { + "auxiliary_loss_clip": 0.06516427, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06290127, + "balance_loss_mlp": 0.01254573, + "epoch": 0.2521569216894634, + "flos": 12171502632960.0, + "grad_norm": 10.029516736128722, + "language_loss": 0.84954166, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.92744958, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19787598, + "step": 4194, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.06517825, + "auxiliary_loss_mlp": 0.01277837, + "balance_loss_clip": 0.06293463, + "balance_loss_mlp": 0.01258668, + "epoch": 0.2522170449421314, + "flos": 23707182885120.0, + "grad_norm": 1.454284137617771, + "language_loss": 0.88584, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.96379662, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19165039, + "step": 4195, + "time_per_iteration": 2.576735734939575 + }, + { + "auxiliary_loss_clip": 0.06516481, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01258258, + "epoch": 0.25227716819479934, + "flos": 20961073025280.0, + "grad_norm": 2.023401186655312, + "language_loss": 0.86073804, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93870831, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.22290039, + "step": 4196, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.06519768, + "auxiliary_loss_mlp": 0.01277786, + "balance_loss_clip": 0.06297043, + "balance_loss_mlp": 0.01258486, + "epoch": 0.2523372914474673, + "flos": 23338077649920.0, + "grad_norm": 1.7735111095668046, + "language_loss": 0.8382597, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.91623521, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19299316, + "step": 4197, + "time_per_iteration": 2.606966018676758 + }, + { + "auxiliary_loss_clip": 0.06523669, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06298, + "balance_loss_mlp": 0.01260898, + "epoch": 0.25239741470013527, + "flos": 36978217522560.0, + "grad_norm": 2.239450775339409, + "language_loss": 0.72922301, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.80727994, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.21130371, + "step": 4198, + "time_per_iteration": 2.6708526611328125 + }, + { + "auxiliary_loss_clip": 0.06527208, + "auxiliary_loss_mlp": 0.012804, + "balance_loss_clip": 0.06297485, + "balance_loss_mlp": 0.01258967, + "epoch": 0.25245753795280323, + "flos": 18521777289600.0, + "grad_norm": 2.0891954597653055, + "language_loss": 0.77475321, + "learning_rate": 3.50294646148888e-06, + "loss": 0.85282922, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.21447754, + "step": 4199, + "time_per_iteration": 3.9535269737243652 + }, + { + "auxiliary_loss_clip": 0.06522667, + "auxiliary_loss_mlp": 0.01277202, + "balance_loss_clip": 0.06296766, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2525176612054712, + "flos": 32353387741440.0, + "grad_norm": 1.7804914051128766, + "language_loss": 0.74169135, + "learning_rate": 3.502689480360739e-06, + "loss": 0.81969011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19714355, + "step": 4200, + "time_per_iteration": 2.637592315673828 + }, + { + "auxiliary_loss_clip": 0.06517747, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06294595, + "balance_loss_mlp": 0.01255602, + "epoch": 0.25257778445813917, + "flos": 45268440307200.0, + "grad_norm": 1.5897560976370495, + "language_loss": 0.82704282, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.90497398, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19775391, + "step": 4201, + "time_per_iteration": 2.740555763244629 + }, + { + "auxiliary_loss_clip": 0.06520839, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.06295383, + "balance_loss_mlp": 0.01259048, + "epoch": 0.25263790771080713, + "flos": 23374526976000.0, + "grad_norm": 1.712909977397354, + "language_loss": 0.75193971, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20361328, + "step": 4202, + "time_per_iteration": 2.55350661277771 + }, + { + "auxiliary_loss_clip": 0.06512797, + "auxiliary_loss_mlp": 0.01277812, + "balance_loss_clip": 0.06294158, + "balance_loss_mlp": 0.01258226, + "epoch": 0.25269803096347515, + "flos": 18520938748800.0, + "grad_norm": 3.10045167794265, + "language_loss": 0.73924601, + "learning_rate": 3.501918195122491e-06, + "loss": 0.81715208, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19592285, + "step": 4203, + "time_per_iteration": 2.539475917816162 + }, + { + "auxiliary_loss_clip": 0.06523657, + "auxiliary_loss_mlp": 0.01272979, + "balance_loss_clip": 0.0629805, + "balance_loss_mlp": 0.01252964, + "epoch": 0.2527581542161431, + "flos": 24617870985600.0, + "grad_norm": 1.4931409888350198, + "language_loss": 0.78306639, + "learning_rate": 3.501660986124297e-06, + "loss": 0.86103272, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20007324, + "step": 4204, + "time_per_iteration": 4.058368682861328 + }, + { + "auxiliary_loss_clip": 0.0651952, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06294288, + "balance_loss_mlp": 0.01258427, + "epoch": 0.2528182774688111, + "flos": 12646266266880.0, + "grad_norm": 2.5678524165435928, + "language_loss": 0.72629768, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.80427349, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19616699, + "step": 4205, + "time_per_iteration": 2.503054618835449 + }, + { + "auxiliary_loss_clip": 0.06508891, + "auxiliary_loss_mlp": 0.01281235, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01264164, + "epoch": 0.25287840072147905, + "flos": 46947331440000.0, + "grad_norm": 1.3326329418173375, + "language_loss": 0.76355231, + "learning_rate": 3.50114639730826e-06, + "loss": 0.84145361, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.17077637, + "step": 4206, + "time_per_iteration": 4.097341537475586 + }, + { + "auxiliary_loss_clip": 0.06516857, + "auxiliary_loss_mlp": 0.01278993, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.0126042, + "epoch": 0.252938523974147, + "flos": 18885641644800.0, + "grad_norm": 1.8849973173990275, + "language_loss": 0.79775047, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.875709, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18579102, + "step": 4207, + "time_per_iteration": 2.545203447341919 + }, + { + "auxiliary_loss_clip": 0.06511112, + "auxiliary_loss_mlp": 0.01280475, + "balance_loss_clip": 0.06293532, + "balance_loss_mlp": 0.01261628, + "epoch": 0.252998647226815, + "flos": 21441245247360.0, + "grad_norm": 1.449056492648579, + "language_loss": 0.76862776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84654361, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18859863, + "step": 4208, + "time_per_iteration": 2.540531873703003 + }, + { + "auxiliary_loss_clip": 0.06512551, + "auxiliary_loss_mlp": 0.01282266, + "balance_loss_clip": 0.06295963, + "balance_loss_mlp": 0.01264098, + "epoch": 0.25305877047948294, + "flos": 25448365128960.0, + "grad_norm": 1.8025422596027827, + "language_loss": 0.70108622, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77903438, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.1817627, + "step": 4209, + "time_per_iteration": 2.586179256439209 + }, + { + "auxiliary_loss_clip": 0.06401253, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06294125, + "balance_loss_mlp": 0.01251663, + "epoch": 0.2531188937321509, + "flos": 60205213457280.0, + "grad_norm": 0.7328516672129679, + "language_loss": 0.55096745, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.62754166, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.0451355, + "step": 4210, + "time_per_iteration": 4.676252841949463 + }, + { + "auxiliary_loss_clip": 0.06515378, + "auxiliary_loss_mlp": 0.01285614, + "balance_loss_clip": 0.06294395, + "balance_loss_mlp": 0.01265861, + "epoch": 0.25317901698481887, + "flos": 19688449213440.0, + "grad_norm": 2.0935195986224837, + "language_loss": 0.81166065, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.88967055, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19763184, + "step": 4211, + "time_per_iteration": 2.5251474380493164 + }, + { + "auxiliary_loss_clip": 0.06513076, + "auxiliary_loss_mlp": 0.01275756, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01258554, + "epoch": 0.25323914023748684, + "flos": 24431012380800.0, + "grad_norm": 1.7184165713115493, + "language_loss": 0.78543985, + "learning_rate": 3.499601265005622e-06, + "loss": 0.86332822, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4212, + "time_per_iteration": 2.609750986099243 + }, + { + "auxiliary_loss_clip": 0.06514729, + "auxiliary_loss_mlp": 0.01278491, + "balance_loss_clip": 0.06293602, + "balance_loss_mlp": 0.0125912, + "epoch": 0.2532992634901548, + "flos": 25454528403840.0, + "grad_norm": 1.862422609084939, + "language_loss": 0.53407073, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.61200291, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19384766, + "step": 4213, + "time_per_iteration": 2.5825159549713135 + }, + { + "auxiliary_loss_clip": 0.06517738, + "auxiliary_loss_mlp": 0.01278881, + "balance_loss_clip": 0.06296406, + "balance_loss_mlp": 0.01259832, + "epoch": 0.25335938674282277, + "flos": 18886605966720.0, + "grad_norm": 2.428420926128805, + "language_loss": 0.65041012, + "learning_rate": 3.499085765880308e-06, + "loss": 0.72837627, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19030762, + "step": 4214, + "time_per_iteration": 2.567539930343628 + }, + { + "auxiliary_loss_clip": 0.06391697, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01253702, + "epoch": 0.25341950999549073, + "flos": 53079692025600.0, + "grad_norm": 0.8253897319773601, + "language_loss": 0.57886475, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.65535849, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.03970337, + "step": 4215, + "time_per_iteration": 2.941021680831909 + }, + { + "auxiliary_loss_clip": 0.06512114, + "auxiliary_loss_mlp": 0.01274398, + "balance_loss_clip": 0.0629489, + "balance_loss_mlp": 0.0125604, + "epoch": 0.2534796332481587, + "flos": 39029609980800.0, + "grad_norm": 1.6071125602920209, + "language_loss": 0.84078032, + "learning_rate": 3.498570039373066e-06, + "loss": 0.9186455, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18359375, + "step": 4216, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.06509562, + "auxiliary_loss_mlp": 0.0127764, + "balance_loss_clip": 0.06290903, + "balance_loss_mlp": 0.01259294, + "epoch": 0.2535397565008267, + "flos": 23593809911040.0, + "grad_norm": 1.7865601815504963, + "language_loss": 0.81036615, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88823819, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.18371582, + "step": 4217, + "time_per_iteration": 2.5606398582458496 + }, + { + "auxiliary_loss_clip": 0.06514265, + "auxiliary_loss_mlp": 0.01279769, + "balance_loss_clip": 0.06294704, + "balance_loss_mlp": 0.01260255, + "epoch": 0.2535998797534947, + "flos": 19287422772480.0, + "grad_norm": 2.529157470409933, + "language_loss": 0.761132, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.83907235, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19519043, + "step": 4218, + "time_per_iteration": 2.623429298400879 + }, + { + "auxiliary_loss_clip": 0.06516235, + "auxiliary_loss_mlp": 0.01282224, + "balance_loss_clip": 0.06296211, + "balance_loss_mlp": 0.01262757, + "epoch": 0.25366000300616265, + "flos": 24031201824000.0, + "grad_norm": 1.721807278316132, + "language_loss": 0.75063616, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.82862079, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19482422, + "step": 4219, + "time_per_iteration": 2.564220428466797 + }, + { + "auxiliary_loss_clip": 0.06520407, + "auxiliary_loss_mlp": 0.0127968, + "balance_loss_clip": 0.06298073, + "balance_loss_mlp": 0.01259713, + "epoch": 0.2537201262588306, + "flos": 16294888454400.0, + "grad_norm": 1.6804083546431516, + "language_loss": 0.81834626, + "learning_rate": 3.497537904525736e-06, + "loss": 0.89634717, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19970703, + "step": 4220, + "time_per_iteration": 2.576335906982422 + }, + { + "auxiliary_loss_clip": 0.0652357, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.06301299, + "balance_loss_mlp": 0.01256936, + "epoch": 0.2537802495114986, + "flos": 23301376761600.0, + "grad_norm": 2.4535775533256796, + "language_loss": 0.71752739, + "learning_rate": 3.497279728822468e-06, + "loss": 0.79551834, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18579102, + "step": 4221, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.06528511, + "auxiliary_loss_mlp": 0.01279389, + "balance_loss_clip": 0.0630452, + "balance_loss_mlp": 0.01259148, + "epoch": 0.25384037276416654, + "flos": 17644855184640.0, + "grad_norm": 1.5017476973585115, + "language_loss": 0.62507772, + "learning_rate": 3.497021496342202e-06, + "loss": 0.70315671, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20239258, + "step": 4222, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.06520825, + "auxiliary_loss_mlp": 0.01278393, + "balance_loss_clip": 0.06297866, + "balance_loss_mlp": 0.0125864, + "epoch": 0.2539004960168345, + "flos": 21513473066880.0, + "grad_norm": 1.6064438591236823, + "language_loss": 0.75066334, + "learning_rate": 3.496763207094731e-06, + "loss": 0.82865554, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19763184, + "step": 4223, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.06514867, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06297616, + "balance_loss_mlp": 0.01260101, + "epoch": 0.2539606192695025, + "flos": 23957632339200.0, + "grad_norm": 1.753259760034452, + "language_loss": 0.80341679, + "learning_rate": 3.49650486108985e-06, + "loss": 0.88134897, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18261719, + "step": 4224, + "time_per_iteration": 2.6002583503723145 + }, + { + "auxiliary_loss_clip": 0.06515887, + "auxiliary_loss_mlp": 0.01281311, + "balance_loss_clip": 0.0629767, + "balance_loss_mlp": 0.01261999, + "epoch": 0.25402074252217044, + "flos": 24176537930880.0, + "grad_norm": 1.4707313275482783, + "language_loss": 0.78211224, + "learning_rate": 3.496246458337354e-06, + "loss": 0.8600843, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19299316, + "step": 4225, + "time_per_iteration": 2.5527138710021973 + }, + { + "auxiliary_loss_clip": 0.06521728, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01263013, + "epoch": 0.2540808657748384, + "flos": 22309320746880.0, + "grad_norm": 1.6188569007516582, + "language_loss": 0.85543132, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.93347526, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.1965332, + "step": 4226, + "time_per_iteration": 2.5676872730255127 + }, + { + "auxiliary_loss_clip": 0.06515788, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 0.06296097, + "balance_loss_mlp": 0.01258883, + "epoch": 0.25414098902750637, + "flos": 27606883432320.0, + "grad_norm": 1.6805883261517605, + "language_loss": 0.71414381, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79207766, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18713379, + "step": 4227, + "time_per_iteration": 2.5918691158294678 + }, + { + "auxiliary_loss_clip": 0.06387169, + "auxiliary_loss_mlp": 0.01261576, + "balance_loss_clip": 0.06279954, + "balance_loss_mlp": 0.01257166, + "epoch": 0.25420111228017434, + "flos": 58188760951680.0, + "grad_norm": 0.9697801274632529, + "language_loss": 0.61857057, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.69505799, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04418945, + "step": 4228, + "time_per_iteration": 3.01169490814209 + }, + { + "auxiliary_loss_clip": 0.06514917, + "auxiliary_loss_mlp": 0.01279347, + "balance_loss_clip": 0.0629469, + "balance_loss_mlp": 0.01258235, + "epoch": 0.2542612355328423, + "flos": 11467645136640.0, + "grad_norm": 2.3876652287650577, + "language_loss": 0.8721081, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.95005071, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21130371, + "step": 4229, + "time_per_iteration": 2.5960769653320312 + }, + { + "auxiliary_loss_clip": 0.06519967, + "auxiliary_loss_mlp": 0.01277589, + "balance_loss_clip": 0.06299049, + "balance_loss_mlp": 0.01257836, + "epoch": 0.2543213587855103, + "flos": 22972452359040.0, + "grad_norm": 2.100172466954555, + "language_loss": 0.78119314, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85916877, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19775391, + "step": 4230, + "time_per_iteration": 2.5483899116516113 + }, + { + "auxiliary_loss_clip": 0.06511904, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 0.06292608, + "balance_loss_mlp": 0.01257622, + "epoch": 0.2543814820381783, + "flos": 18257953109760.0, + "grad_norm": 2.00545114565419, + "language_loss": 0.75687885, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.83477509, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4231, + "time_per_iteration": 2.566326379776001 + }, + { + "auxiliary_loss_clip": 0.06520282, + "auxiliary_loss_mlp": 0.01278584, + "balance_loss_clip": 0.06300422, + "balance_loss_mlp": 0.01259761, + "epoch": 0.25444160529084625, + "flos": 15638129752320.0, + "grad_norm": 1.7887257039808522, + "language_loss": 0.74637282, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82436144, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18823242, + "step": 4232, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0652221, + "auxiliary_loss_mlp": 0.01293975, + "balance_loss_clip": 0.06303085, + "balance_loss_mlp": 0.01272947, + "epoch": 0.2545017285435142, + "flos": 24607431152640.0, + "grad_norm": 1.8617746927090988, + "language_loss": 0.87183899, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.95000088, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21032715, + "step": 4233, + "time_per_iteration": 2.6281485557556152 + }, + { + "auxiliary_loss_clip": 0.06505871, + "auxiliary_loss_mlp": 0.01278753, + "balance_loss_clip": 0.06294682, + "balance_loss_mlp": 0.01260442, + "epoch": 0.2545618517961822, + "flos": 24685654538880.0, + "grad_norm": 1.601433299567329, + "language_loss": 0.75604707, + "learning_rate": 3.493918281539737e-06, + "loss": 0.8338933, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18322754, + "step": 4234, + "time_per_iteration": 2.596642017364502 + }, + { + "auxiliary_loss_clip": 0.06514844, + "auxiliary_loss_mlp": 0.01287463, + "balance_loss_clip": 0.06292339, + "balance_loss_mlp": 0.01268938, + "epoch": 0.25462197504885015, + "flos": 23921937699840.0, + "grad_norm": 1.4560099290474922, + "language_loss": 0.75372213, + "learning_rate": 3.493659311850379e-06, + "loss": 0.83174521, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18518066, + "step": 4235, + "time_per_iteration": 2.592942953109741 + }, + { + "auxiliary_loss_clip": 0.06532556, + "auxiliary_loss_mlp": 0.01283911, + "balance_loss_clip": 0.06299181, + "balance_loss_mlp": 0.01261797, + "epoch": 0.2546820983015181, + "flos": 24796134547200.0, + "grad_norm": 1.9414760170646592, + "language_loss": 0.65519691, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.73336154, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22131348, + "step": 4236, + "time_per_iteration": 2.5583407878875732 + }, + { + "auxiliary_loss_clip": 0.06512251, + "auxiliary_loss_mlp": 0.01281938, + "balance_loss_clip": 0.06294776, + "balance_loss_mlp": 0.01262984, + "epoch": 0.2547422215541861, + "flos": 18740095902720.0, + "grad_norm": 1.5016735811799797, + "language_loss": 0.678509, + "learning_rate": 3.493141202562354e-06, + "loss": 0.75645095, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18945312, + "step": 4237, + "time_per_iteration": 2.5650389194488525 + }, + { + "auxiliary_loss_clip": 0.0651492, + "auxiliary_loss_mlp": 0.01282053, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01261394, + "epoch": 0.25480234480685404, + "flos": 21038751360000.0, + "grad_norm": 2.061881611294133, + "language_loss": 0.75628269, + "learning_rate": 3.492882062983333e-06, + "loss": 0.83425242, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20654297, + "step": 4238, + "time_per_iteration": 2.529883861541748 + }, + { + "auxiliary_loss_clip": 0.06513957, + "auxiliary_loss_mlp": 0.0127785, + "balance_loss_clip": 0.06292559, + "balance_loss_mlp": 0.01258287, + "epoch": 0.254862468059522, + "flos": 25089112748160.0, + "grad_norm": 1.8905919191970875, + "language_loss": 0.81253731, + "learning_rate": 3.492622866794074e-06, + "loss": 0.89045537, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19555664, + "step": 4239, + "time_per_iteration": 4.02100944519043 + }, + { + "auxiliary_loss_clip": 0.06508629, + "auxiliary_loss_mlp": 0.01294237, + "balance_loss_clip": 0.06291452, + "balance_loss_mlp": 0.01273471, + "epoch": 0.25492259131219, + "flos": 20564658558720.0, + "grad_norm": 1.7183169382614727, + "language_loss": 0.7800405, + "learning_rate": 3.492363614004407e-06, + "loss": 0.85806918, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2076416, + "step": 4240, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.06515411, + "auxiliary_loss_mlp": 0.01282684, + "balance_loss_clip": 0.06290809, + "balance_loss_mlp": 0.01262037, + "epoch": 0.25498271456485794, + "flos": 25048889988480.0, + "grad_norm": 1.7684080721058644, + "language_loss": 0.83764112, + "learning_rate": 3.492104304624162e-06, + "loss": 0.915622, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.20629883, + "step": 4241, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.06511963, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 0.06292334, + "balance_loss_mlp": 0.01262676, + "epoch": 0.2550428378175259, + "flos": 26185820912640.0, + "grad_norm": 1.7847215082139707, + "language_loss": 0.73873413, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.81667781, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4242, + "time_per_iteration": 2.6289515495300293 + }, + { + "auxiliary_loss_clip": 0.06517772, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 0.06296564, + "balance_loss_mlp": 0.01260398, + "epoch": 0.2551029610701939, + "flos": 15272420607360.0, + "grad_norm": 2.4567533637161896, + "language_loss": 0.72771823, + "learning_rate": 3.491585516131273e-06, + "loss": 0.80569565, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19567871, + "step": 4243, + "time_per_iteration": 3.9432499408721924 + }, + { + "auxiliary_loss_clip": 0.06515735, + "auxiliary_loss_mlp": 0.0127996, + "balance_loss_clip": 0.06295779, + "balance_loss_mlp": 0.01260195, + "epoch": 0.2551630843228619, + "flos": 18117774028800.0, + "grad_norm": 1.7474968125895491, + "language_loss": 0.82239074, + "learning_rate": 3.491326037038301e-06, + "loss": 0.90034771, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4244, + "time_per_iteration": 2.6024672985076904 + }, + { + "auxiliary_loss_clip": 0.06397872, + "auxiliary_loss_mlp": 0.01258297, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01253388, + "epoch": 0.25522320757552985, + "flos": 70543055266560.0, + "grad_norm": 0.6771353060664416, + "language_loss": 0.57579219, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.65235388, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04904175, + "step": 4245, + "time_per_iteration": 4.687421083450317 + }, + { + "auxiliary_loss_clip": 0.06516664, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 0.06290803, + "balance_loss_mlp": 0.01256628, + "epoch": 0.2552833308281978, + "flos": 22899679488000.0, + "grad_norm": 2.827648139992037, + "language_loss": 0.65781415, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20593262, + "step": 4246, + "time_per_iteration": 2.542945384979248 + }, + { + "auxiliary_loss_clip": 0.06504452, + "auxiliary_loss_mlp": 0.01278422, + "balance_loss_clip": 0.06290503, + "balance_loss_mlp": 0.01258455, + "epoch": 0.2553434540808658, + "flos": 22060003322880.0, + "grad_norm": 2.2137811054544003, + "language_loss": 0.82470047, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.90252924, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19970703, + "step": 4247, + "time_per_iteration": 2.5786685943603516 + }, + { + "auxiliary_loss_clip": 0.06521233, + "auxiliary_loss_mlp": 0.01279993, + "balance_loss_clip": 0.062906, + "balance_loss_mlp": 0.01257271, + "epoch": 0.25540357733353375, + "flos": 16549656393600.0, + "grad_norm": 2.135954108256579, + "language_loss": 0.83991635, + "learning_rate": 3.490287555252514e-06, + "loss": 0.91792852, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.22729492, + "step": 4248, + "time_per_iteration": 2.5408127307891846 + }, + { + "auxiliary_loss_clip": 0.06511332, + "auxiliary_loss_mlp": 0.01273979, + "balance_loss_clip": 0.062884, + "balance_loss_mlp": 0.01253773, + "epoch": 0.2554637005862017, + "flos": 17570531013120.0, + "grad_norm": 2.3193810219262585, + "language_loss": 0.84631854, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92417163, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.20202637, + "step": 4249, + "time_per_iteration": 4.003984212875366 + }, + { + "auxiliary_loss_clip": 0.06380495, + "auxiliary_loss_mlp": 0.01253384, + "balance_loss_clip": 0.06276014, + "balance_loss_mlp": 0.01249388, + "epoch": 0.2555238238388697, + "flos": 72263441698560.0, + "grad_norm": 0.7365466774710785, + "language_loss": 0.56168175, + "learning_rate": 3.489767975249115e-06, + "loss": 0.63802058, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03994751, + "step": 4250, + "time_per_iteration": 3.169614553451538 + }, + { + "auxiliary_loss_clip": 0.06511974, + "auxiliary_loss_mlp": 0.01277356, + "balance_loss_clip": 0.06289789, + "balance_loss_mlp": 0.01255433, + "epoch": 0.25558394709153764, + "flos": 24396323990400.0, + "grad_norm": 2.4378887831258527, + "language_loss": 0.81129342, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.88918668, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21936035, + "step": 4251, + "time_per_iteration": 2.576631784439087 + }, + { + "auxiliary_loss_clip": 0.06382731, + "auxiliary_loss_mlp": 0.01258719, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01254794, + "epoch": 0.2556440703442056, + "flos": 69251857776000.0, + "grad_norm": 0.7756464213587903, + "language_loss": 0.66132653, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.73774105, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03921509, + "step": 4252, + "time_per_iteration": 3.2080140113830566 + }, + { + "auxiliary_loss_clip": 0.06505658, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 0.06288829, + "balance_loss_mlp": 0.01255922, + "epoch": 0.2557041935968736, + "flos": 24870919916160.0, + "grad_norm": 1.8769862610793295, + "language_loss": 0.74028432, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.81808746, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18737793, + "step": 4253, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.06509089, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.01261746, + "epoch": 0.25576431684954154, + "flos": 22498694974080.0, + "grad_norm": 4.507455095580577, + "language_loss": 0.742535, + "learning_rate": 3.488728137415357e-06, + "loss": 0.82045132, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20800781, + "step": 4254, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.0651402, + "auxiliary_loss_mlp": 0.0127796, + "balance_loss_clip": 0.06292839, + "balance_loss_mlp": 0.01257253, + "epoch": 0.2558244401022095, + "flos": 19832569436160.0, + "grad_norm": 1.7853658258569405, + "language_loss": 0.81599152, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20703125, + "step": 4255, + "time_per_iteration": 2.5198400020599365 + }, + { + "auxiliary_loss_clip": 0.06507239, + "auxiliary_loss_mlp": 0.01282593, + "balance_loss_clip": 0.06290218, + "balance_loss_mlp": 0.01262304, + "epoch": 0.2558845633548775, + "flos": 23226968736000.0, + "grad_norm": 1.3889535500711463, + "language_loss": 0.85781598, + "learning_rate": 3.488207879742721e-06, + "loss": 0.93571424, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20275879, + "step": 4256, + "time_per_iteration": 2.6466193199157715 + }, + { + "auxiliary_loss_clip": 0.06518268, + "auxiliary_loss_mlp": 0.01279996, + "balance_loss_clip": 0.06292354, + "balance_loss_mlp": 0.01259432, + "epoch": 0.2559446866075455, + "flos": 16843682770560.0, + "grad_norm": 2.0395659723156814, + "language_loss": 0.75505483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.83303738, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20556641, + "step": 4257, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.06380453, + "auxiliary_loss_mlp": 0.01254162, + "balance_loss_clip": 0.06277193, + "balance_loss_mlp": 0.01249772, + "epoch": 0.25600480986021346, + "flos": 57612741258240.0, + "grad_norm": 0.7838298602570629, + "language_loss": 0.65205377, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.72839993, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04397583, + "step": 4258, + "time_per_iteration": 3.1310055255889893 + }, + { + "auxiliary_loss_clip": 0.06504042, + "auxiliary_loss_mlp": 0.01278745, + "balance_loss_clip": 0.06291071, + "balance_loss_mlp": 0.01257192, + "epoch": 0.2560649331128814, + "flos": 27827088762240.0, + "grad_norm": 1.6413095395992356, + "language_loss": 0.76769841, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2154541, + "step": 4259, + "time_per_iteration": 2.6200387477874756 + }, + { + "auxiliary_loss_clip": 0.06386054, + "auxiliary_loss_mlp": 0.01255029, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01250824, + "epoch": 0.2561250563655494, + "flos": 70972187552640.0, + "grad_norm": 0.7732791072218576, + "language_loss": 0.58378285, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.66019368, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04208374, + "step": 4260, + "time_per_iteration": 3.2671031951904297 + }, + { + "auxiliary_loss_clip": 0.06510498, + "auxiliary_loss_mlp": 0.01277826, + "balance_loss_clip": 0.06290598, + "balance_loss_mlp": 0.0125824, + "epoch": 0.25618517961821735, + "flos": 27018998386560.0, + "grad_norm": 1.6762593333812295, + "language_loss": 0.77063274, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84851599, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19580078, + "step": 4261, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.06510883, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261534, + "epoch": 0.2562453028708853, + "flos": 23073708418560.0, + "grad_norm": 1.5026397479094624, + "language_loss": 0.83196223, + "learning_rate": 3.486645752648842e-06, + "loss": 0.90988725, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20080566, + "step": 4262, + "time_per_iteration": 2.606386661529541 + }, + { + "auxiliary_loss_clip": 0.06520962, + "auxiliary_loss_mlp": 0.01278022, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.0125778, + "epoch": 0.2563054261235533, + "flos": 15126120178560.0, + "grad_norm": 2.976746783245639, + "language_loss": 0.7460134, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.82400322, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20239258, + "step": 4263, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.06511976, + "auxiliary_loss_mlp": 0.01275308, + "balance_loss_clip": 0.0629802, + "balance_loss_mlp": 0.01256008, + "epoch": 0.25636554937622125, + "flos": 27862238350080.0, + "grad_norm": 1.7189236473805392, + "language_loss": 0.83209884, + "learning_rate": 3.486124592522163e-06, + "loss": 0.90997171, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19299316, + "step": 4264, + "time_per_iteration": 2.5768978595733643 + }, + { + "auxiliary_loss_clip": 0.06522107, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06300539, + "balance_loss_mlp": 0.01255403, + "epoch": 0.2564256726288892, + "flos": 28912979750400.0, + "grad_norm": 2.7518222985569247, + "language_loss": 0.75264466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20202637, + "step": 4265, + "time_per_iteration": 2.6022770404815674 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01276084, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01256701, + "epoch": 0.2564857958815572, + "flos": 18520812967680.0, + "grad_norm": 2.7205564726060754, + "language_loss": 0.82059085, + "learning_rate": 3.485603206979513e-06, + "loss": 0.89849925, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19396973, + "step": 4266, + "time_per_iteration": 2.5768039226531982 + }, + { + "auxiliary_loss_clip": 0.06513181, + "auxiliary_loss_mlp": 0.01282165, + "balance_loss_clip": 0.06295994, + "balance_loss_mlp": 0.01263199, + "epoch": 0.25654591913422514, + "flos": 25814745106560.0, + "grad_norm": 2.256505464235654, + "language_loss": 0.79590619, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8738597, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.1895752, + "step": 4267, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.06512932, + "auxiliary_loss_mlp": 0.01282882, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01263439, + "epoch": 0.2566060423868931, + "flos": 19105805047680.0, + "grad_norm": 1.7450924080459818, + "language_loss": 0.79543281, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.87339091, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19421387, + "step": 4268, + "time_per_iteration": 2.532245635986328 + }, + { + "auxiliary_loss_clip": 0.06515032, + "auxiliary_loss_mlp": 0.01281336, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01261166, + "epoch": 0.25666616563956113, + "flos": 23849584099200.0, + "grad_norm": 1.6329297187056233, + "language_loss": 0.69106698, + "learning_rate": 3.484820706183595e-06, + "loss": 0.76903057, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.20153809, + "step": 4269, + "time_per_iteration": 2.7064032554626465 + }, + { + "auxiliary_loss_clip": 0.06520134, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01259016, + "epoch": 0.2567262888922291, + "flos": 14608366600320.0, + "grad_norm": 2.976489070793836, + "language_loss": 0.79361498, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8716023, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19580078, + "step": 4270, + "time_per_iteration": 2.5247366428375244 + }, + { + "auxiliary_loss_clip": 0.06528008, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.0630113, + "balance_loss_mlp": 0.0125899, + "epoch": 0.25678641214489706, + "flos": 32930791027200.0, + "grad_norm": 2.0785991894062104, + "language_loss": 0.68438745, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.76248461, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22717285, + "step": 4271, + "time_per_iteration": 2.6327364444732666 + }, + { + "auxiliary_loss_clip": 0.06521121, + "auxiliary_loss_mlp": 0.01277495, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01256395, + "epoch": 0.256846535397565, + "flos": 24106029120000.0, + "grad_norm": 1.3298745054932861, + "language_loss": 0.87827712, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.9562633, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2109375, + "step": 4272, + "time_per_iteration": 2.5886576175689697 + }, + { + "auxiliary_loss_clip": 0.06520741, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 0.06299604, + "balance_loss_mlp": 0.01256204, + "epoch": 0.256906658650233, + "flos": 19724437342080.0, + "grad_norm": 1.6471317846086577, + "language_loss": 0.8228811, + "learning_rate": 3.483776583571541e-06, + "loss": 0.90087312, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.22253418, + "step": 4273, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.06513067, + "auxiliary_loss_mlp": 0.0127658, + "balance_loss_clip": 0.06299708, + "balance_loss_mlp": 0.01257638, + "epoch": 0.25696678190290095, + "flos": 22932019964160.0, + "grad_norm": 1.4706338186359442, + "language_loss": 0.77439249, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.85228896, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18933105, + "step": 4274, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06508841, + "auxiliary_loss_mlp": 0.01274973, + "balance_loss_clip": 0.06295496, + "balance_loss_mlp": 0.0125435, + "epoch": 0.2570269051555689, + "flos": 27315163042560.0, + "grad_norm": 1.5809391622925344, + "language_loss": 0.84101403, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91885215, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20629883, + "step": 4275, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.0652002, + "auxiliary_loss_mlp": 0.01273541, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01252965, + "epoch": 0.2570870284082369, + "flos": 27570811449600.0, + "grad_norm": 2.3295240533415016, + "language_loss": 0.78590673, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.86384231, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4276, + "time_per_iteration": 2.631866216659546 + }, + { + "auxiliary_loss_clip": 0.06515533, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06298599, + "balance_loss_mlp": 0.01260237, + "epoch": 0.25714715166090485, + "flos": 28738405768320.0, + "grad_norm": 1.6396366021430353, + "language_loss": 0.79803967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8759945, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19714355, + "step": 4277, + "time_per_iteration": 2.5990161895751953 + }, + { + "auxiliary_loss_clip": 0.06513472, + "auxiliary_loss_mlp": 0.01272259, + "balance_loss_clip": 0.06296529, + "balance_loss_mlp": 0.01254377, + "epoch": 0.2572072749135728, + "flos": 20121606495360.0, + "grad_norm": 1.9596681746733369, + "language_loss": 0.78998482, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17883301, + "step": 4278, + "time_per_iteration": 4.02304744720459 + }, + { + "auxiliary_loss_clip": 0.06522302, + "auxiliary_loss_mlp": 0.01278536, + "balance_loss_clip": 0.06301469, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2572673981662408, + "flos": 26037969183360.0, + "grad_norm": 2.3063853220673067, + "language_loss": 0.75400203, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83201039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21618652, + "step": 4279, + "time_per_iteration": 2.5523123741149902 + }, + { + "auxiliary_loss_clip": 0.06516609, + "auxiliary_loss_mlp": 0.0128394, + "balance_loss_clip": 0.06297271, + "balance_loss_mlp": 0.01262721, + "epoch": 0.25732752141890874, + "flos": 16112054845440.0, + "grad_norm": 3.423283610494841, + "language_loss": 0.85997081, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.9379763, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2121582, + "step": 4280, + "time_per_iteration": 2.5104546546936035 + }, + { + "auxiliary_loss_clip": 0.06517641, + "auxiliary_loss_mlp": 0.01282108, + "balance_loss_clip": 0.06295675, + "balance_loss_mlp": 0.0126133, + "epoch": 0.2573876446715767, + "flos": 22530322690560.0, + "grad_norm": 2.5830483171875955, + "language_loss": 0.78735828, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.86535579, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20788574, + "step": 4281, + "time_per_iteration": 2.511723279953003 + }, + { + "auxiliary_loss_clip": 0.06512952, + "auxiliary_loss_mlp": 0.01285256, + "balance_loss_clip": 0.06294534, + "balance_loss_mlp": 0.01264048, + "epoch": 0.2574477679242447, + "flos": 23957548485120.0, + "grad_norm": 1.8266556980022217, + "language_loss": 0.87782013, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.9558022, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.21203613, + "step": 4282, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06509817, + "auxiliary_loss_mlp": 0.0128236, + "balance_loss_clip": 0.06291438, + "balance_loss_mlp": 0.01262905, + "epoch": 0.2575078911769127, + "flos": 21988278627840.0, + "grad_norm": 1.3881538001933933, + "language_loss": 0.71042287, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.78834462, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19470215, + "step": 4283, + "time_per_iteration": 3.9826109409332275 + }, + { + "auxiliary_loss_clip": 0.06500088, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 0.06290558, + "balance_loss_mlp": 0.01271051, + "epoch": 0.25756801442958066, + "flos": 21951997009920.0, + "grad_norm": 1.9398744879334104, + "language_loss": 0.80991805, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.88781703, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18762207, + "step": 4284, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.06508928, + "auxiliary_loss_mlp": 0.01294414, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01274923, + "epoch": 0.2576281376822486, + "flos": 35270675493120.0, + "grad_norm": 2.158245566426343, + "language_loss": 0.70814562, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.78617907, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19494629, + "step": 4285, + "time_per_iteration": 4.088344097137451 + }, + { + "auxiliary_loss_clip": 0.06504595, + "auxiliary_loss_mlp": 0.0128171, + "balance_loss_clip": 0.06288387, + "balance_loss_mlp": 0.01262505, + "epoch": 0.2576882609349166, + "flos": 14136705567360.0, + "grad_norm": 1.771877130646751, + "language_loss": 0.58818436, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.66604745, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.1920166, + "step": 4286, + "time_per_iteration": 2.5344176292419434 + }, + { + "auxiliary_loss_clip": 0.0650837, + "auxiliary_loss_mlp": 0.01278621, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01259118, + "epoch": 0.25774838418758456, + "flos": 23265053216640.0, + "grad_norm": 2.057811055203196, + "language_loss": 0.6464054, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72427529, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19494629, + "step": 4287, + "time_per_iteration": 2.5958328247070312 + }, + { + "auxiliary_loss_clip": 0.0650748, + "auxiliary_loss_mlp": 0.01286721, + "balance_loss_clip": 0.06287187, + "balance_loss_mlp": 0.01265824, + "epoch": 0.2578085074402525, + "flos": 22608378368640.0, + "grad_norm": 1.9946373780944937, + "language_loss": 0.7222265, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.80016851, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2088623, + "step": 4288, + "time_per_iteration": 2.5767109394073486 + }, + { + "auxiliary_loss_clip": 0.06504134, + "auxiliary_loss_mlp": 0.01288175, + "balance_loss_clip": 0.06288374, + "balance_loss_mlp": 0.01268851, + "epoch": 0.2578686306929205, + "flos": 24578780256000.0, + "grad_norm": 1.4737569046844996, + "language_loss": 0.77657092, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.85449398, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1932373, + "step": 4289, + "time_per_iteration": 3.9734480381011963 + }, + { + "auxiliary_loss_clip": 0.0651005, + "auxiliary_loss_mlp": 0.01285951, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012641, + "epoch": 0.25792875394558845, + "flos": 18119828453760.0, + "grad_norm": 2.192134211179858, + "language_loss": 0.8580482, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.93600821, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.21838379, + "step": 4290, + "time_per_iteration": 2.5564229488372803 + }, + { + "auxiliary_loss_clip": 0.0651224, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 0.06293762, + "balance_loss_mlp": 0.01263573, + "epoch": 0.2579888771982564, + "flos": 17718760085760.0, + "grad_norm": 2.0247866667145344, + "language_loss": 0.73390263, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81186378, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.20300293, + "step": 4291, + "time_per_iteration": 2.497671365737915 + }, + { + "auxiliary_loss_clip": 0.06508101, + "auxiliary_loss_mlp": 0.01275245, + "balance_loss_clip": 0.06287237, + "balance_loss_mlp": 0.01255647, + "epoch": 0.2580490004509244, + "flos": 16440350342400.0, + "grad_norm": 2.23272675200871, + "language_loss": 0.82139969, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.8992331, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19604492, + "step": 4292, + "time_per_iteration": 2.5467498302459717 + }, + { + "auxiliary_loss_clip": 0.06505652, + "auxiliary_loss_mlp": 0.01282583, + "balance_loss_clip": 0.06289525, + "balance_loss_mlp": 0.01262532, + "epoch": 0.25810912370359235, + "flos": 33842946574080.0, + "grad_norm": 1.9023591833174374, + "language_loss": 0.67644775, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.7543301, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20043945, + "step": 4293, + "time_per_iteration": 2.626880168914795 + }, + { + "auxiliary_loss_clip": 0.06507371, + "auxiliary_loss_mlp": 0.01275889, + "balance_loss_clip": 0.06295517, + "balance_loss_mlp": 0.01257244, + "epoch": 0.2581692469562603, + "flos": 25199257340160.0, + "grad_norm": 2.9603548878770387, + "language_loss": 0.76158464, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83941722, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18640137, + "step": 4294, + "time_per_iteration": 2.5711581707000732 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06293358, + "balance_loss_mlp": 0.01257866, + "epoch": 0.2582293702089283, + "flos": 34940619060480.0, + "grad_norm": 2.382767918587226, + "language_loss": 0.81769538, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8955487, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1920166, + "step": 4295, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.06519823, + "auxiliary_loss_mlp": 0.01276702, + "balance_loss_clip": 0.06295969, + "balance_loss_mlp": 0.01256496, + "epoch": 0.2582894934615963, + "flos": 26841028314240.0, + "grad_norm": 1.964012337767824, + "language_loss": 0.72949934, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80746454, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.20214844, + "step": 4296, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06514452, + "auxiliary_loss_mlp": 0.01277621, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.0125732, + "epoch": 0.25834961671426426, + "flos": 23522252924160.0, + "grad_norm": 1.7245670135783875, + "language_loss": 0.87440747, + "learning_rate": 3.477492965085067e-06, + "loss": 0.95232815, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20288086, + "step": 4297, + "time_per_iteration": 2.5871896743774414 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01260558, + "epoch": 0.25840973996693223, + "flos": 22456837059840.0, + "grad_norm": 2.9037965134923076, + "language_loss": 0.84894854, + "learning_rate": 3.477230446361943e-06, + "loss": 0.9268465, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.18469238, + "step": 4298, + "time_per_iteration": 2.5290613174438477 + }, + { + "auxiliary_loss_clip": 0.06510766, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 0.06292143, + "balance_loss_mlp": 0.01256158, + "epoch": 0.2584698632196002, + "flos": 11295544849920.0, + "grad_norm": 2.12928453409433, + "language_loss": 0.83727312, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.91514087, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.1986084, + "step": 4299, + "time_per_iteration": 2.5314571857452393 + }, + { + "auxiliary_loss_clip": 0.06506392, + "auxiliary_loss_mlp": 0.01272204, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01253214, + "epoch": 0.25852998647226816, + "flos": 17935569325440.0, + "grad_norm": 2.08690605682093, + "language_loss": 0.83303946, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91082543, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18981934, + "step": 4300, + "time_per_iteration": 2.494170904159546 + }, + { + "auxiliary_loss_clip": 0.06507458, + "auxiliary_loss_mlp": 0.01272704, + "balance_loss_clip": 0.06287713, + "balance_loss_mlp": 0.01254012, + "epoch": 0.2585901097249361, + "flos": 33264620893440.0, + "grad_norm": 3.3706811216639307, + "language_loss": 0.67941749, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.75721914, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18688965, + "step": 4301, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.06512292, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 0.06289004, + "balance_loss_mlp": 0.01257009, + "epoch": 0.2586502329776041, + "flos": 18447033847680.0, + "grad_norm": 2.7819934823512282, + "language_loss": 0.83073664, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.90861952, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18994141, + "step": 4302, + "time_per_iteration": 2.5102365016937256 + }, + { + "auxiliary_loss_clip": 0.06508462, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01257358, + "epoch": 0.25871035623027205, + "flos": 17973989222400.0, + "grad_norm": 1.7107484291097332, + "language_loss": 0.91874599, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99659652, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.19238281, + "step": 4303, + "time_per_iteration": 2.5386602878570557 + }, + { + "auxiliary_loss_clip": 0.06508803, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.0629281, + "balance_loss_mlp": 0.01258569, + "epoch": 0.25877047948294, + "flos": 27784392307200.0, + "grad_norm": 1.7938003883067368, + "language_loss": 0.67601281, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75387681, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19018555, + "step": 4304, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06507856, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 0.06286401, + "balance_loss_mlp": 0.01257477, + "epoch": 0.258830602735608, + "flos": 27133209901440.0, + "grad_norm": 2.1929382614593242, + "language_loss": 0.73436916, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1940918, + "step": 4305, + "time_per_iteration": 2.5877888202667236 + }, + { + "auxiliary_loss_clip": 0.06515621, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01255992, + "epoch": 0.25889072598827595, + "flos": 17896730158080.0, + "grad_norm": 1.8662067033328453, + "language_loss": 0.76418924, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.84211433, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20898438, + "step": 4306, + "time_per_iteration": 2.482933282852173 + }, + { + "auxiliary_loss_clip": 0.06403579, + "auxiliary_loss_mlp": 0.01258203, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01253566, + "epoch": 0.2589508492409439, + "flos": 53951582885760.0, + "grad_norm": 0.8023409981232837, + "language_loss": 0.56592381, + "learning_rate": 3.474865258296403e-06, + "loss": 0.64254159, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.04629517, + "step": 4307, + "time_per_iteration": 3.1265084743499756 + }, + { + "auxiliary_loss_clip": 0.06500413, + "auxiliary_loss_mlp": 0.0127407, + "balance_loss_clip": 0.06289256, + "balance_loss_mlp": 0.01256105, + "epoch": 0.2590109724936119, + "flos": 22132063434240.0, + "grad_norm": 1.735104377472534, + "language_loss": 0.71851504, + "learning_rate": 3.474602179854327e-06, + "loss": 0.79625988, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17956543, + "step": 4308, + "time_per_iteration": 2.5442304611206055 + }, + { + "auxiliary_loss_clip": 0.06513858, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01258993, + "epoch": 0.2590710957462799, + "flos": 13478395564800.0, + "grad_norm": 2.8033587428294657, + "language_loss": 0.84278727, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.92071199, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19628906, + "step": 4309, + "time_per_iteration": 2.546034336090088 + }, + { + "auxiliary_loss_clip": 0.06504438, + "auxiliary_loss_mlp": 0.01276588, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.01258814, + "epoch": 0.25913121899894787, + "flos": 22313219961600.0, + "grad_norm": 1.5400127324827177, + "language_loss": 0.84972912, + "learning_rate": 3.474075855228966e-06, + "loss": 0.92753935, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.17785645, + "step": 4310, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.06511362, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.06293052, + "balance_loss_mlp": 0.01254533, + "epoch": 0.25919134225161583, + "flos": 25818770102400.0, + "grad_norm": 1.8118221315599161, + "language_loss": 0.78088975, + "learning_rate": 3.473812609065639e-06, + "loss": 0.85874081, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19213867, + "step": 4311, + "time_per_iteration": 2.6044604778289795 + }, + { + "auxiliary_loss_clip": 0.06511068, + "auxiliary_loss_mlp": 0.01275144, + "balance_loss_clip": 0.06293963, + "balance_loss_mlp": 0.01256666, + "epoch": 0.2592514655042838, + "flos": 31220314104960.0, + "grad_norm": 4.381167674093932, + "language_loss": 0.73062587, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.80848801, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18469238, + "step": 4312, + "time_per_iteration": 2.587942600250244 + }, + { + "auxiliary_loss_clip": 0.06508243, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 0.06291987, + "balance_loss_mlp": 0.012569, + "epoch": 0.25931158875695176, + "flos": 18480296718720.0, + "grad_norm": 1.7543304647253515, + "language_loss": 0.70305753, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78089976, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.19091797, + "step": 4313, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.06508952, + "auxiliary_loss_mlp": 0.01278616, + "balance_loss_clip": 0.06293979, + "balance_loss_mlp": 0.0125971, + "epoch": 0.2593717120096197, + "flos": 19213895214720.0, + "grad_norm": 1.751562510714179, + "language_loss": 0.81158572, + "learning_rate": 3.473022535292867e-06, + "loss": 0.8894614, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.18908691, + "step": 4314, + "time_per_iteration": 2.5584335327148438 + }, + { + "auxiliary_loss_clip": 0.06515148, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 0.06292658, + "balance_loss_mlp": 0.01257359, + "epoch": 0.2594318352622877, + "flos": 31256050671360.0, + "grad_norm": 1.9178095473181331, + "language_loss": 0.67283171, + "learning_rate": 3.472759065640968e-06, + "loss": 0.7507664, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20959473, + "step": 4315, + "time_per_iteration": 2.6295278072357178 + }, + { + "auxiliary_loss_clip": 0.06506292, + "auxiliary_loss_mlp": 0.01277654, + "balance_loss_clip": 0.06292329, + "balance_loss_mlp": 0.01259463, + "epoch": 0.25949195851495566, + "flos": 22243759326720.0, + "grad_norm": 1.412764147956583, + "language_loss": 0.80242419, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.88026369, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18212891, + "step": 4316, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.06510989, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06290686, + "balance_loss_mlp": 0.01256781, + "epoch": 0.2595520817676236, + "flos": 28083449928960.0, + "grad_norm": 1.6660208675023864, + "language_loss": 0.78127223, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.85915792, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20812988, + "step": 4317, + "time_per_iteration": 2.6210665702819824 + }, + { + "auxiliary_loss_clip": 0.06507257, + "auxiliary_loss_mlp": 0.01281581, + "balance_loss_clip": 0.06291957, + "balance_loss_mlp": 0.01262054, + "epoch": 0.2596122050202916, + "flos": 20196727280640.0, + "grad_norm": 2.4040812102587377, + "language_loss": 0.78420109, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.86208946, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19519043, + "step": 4318, + "time_per_iteration": 3.9600155353546143 + }, + { + "auxiliary_loss_clip": 0.06505568, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 0.06290057, + "balance_loss_mlp": 0.01256637, + "epoch": 0.25967232827295955, + "flos": 22534431540480.0, + "grad_norm": 2.66294558684285, + "language_loss": 0.77022719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.84805143, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20227051, + "step": 4319, + "time_per_iteration": 2.544752836227417 + }, + { + "auxiliary_loss_clip": 0.0650554, + "auxiliary_loss_mlp": 0.01280509, + "balance_loss_clip": 0.06290743, + "balance_loss_mlp": 0.01261555, + "epoch": 0.2597324515256275, + "flos": 21074445999360.0, + "grad_norm": 1.7925219732685136, + "language_loss": 0.77426791, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.85212845, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.18945312, + "step": 4320, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.06508496, + "auxiliary_loss_mlp": 0.01273671, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01254634, + "epoch": 0.2597925747782955, + "flos": 22055810618880.0, + "grad_norm": 1.593385908573569, + "language_loss": 0.71533716, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79315877, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19042969, + "step": 4321, + "time_per_iteration": 2.5314829349517822 + }, + { + "auxiliary_loss_clip": 0.0650996, + "auxiliary_loss_mlp": 0.01274348, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01254011, + "epoch": 0.2598526980309635, + "flos": 19543071179520.0, + "grad_norm": 2.282331155451991, + "language_loss": 0.75262189, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.83046496, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20336914, + "step": 4322, + "time_per_iteration": 2.525724411010742 + }, + { + "auxiliary_loss_clip": 0.06509394, + "auxiliary_loss_mlp": 0.01275417, + "balance_loss_clip": 0.06289983, + "balance_loss_mlp": 0.0125533, + "epoch": 0.25991282128363147, + "flos": 24501521191680.0, + "grad_norm": 2.623736611083137, + "language_loss": 0.7442928, + "learning_rate": 3.470649298767278e-06, + "loss": 0.82214087, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4323, + "time_per_iteration": 3.957674026489258 + }, + { + "auxiliary_loss_clip": 0.06515582, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01258893, + "epoch": 0.25997294453629943, + "flos": 24207410960640.0, + "grad_norm": 1.7976461796423409, + "language_loss": 0.68052149, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.75847143, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20495605, + "step": 4324, + "time_per_iteration": 4.001135349273682 + }, + { + "auxiliary_loss_clip": 0.06505544, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01254608, + "epoch": 0.2600330677889674, + "flos": 31439597040000.0, + "grad_norm": 1.7946989584541546, + "language_loss": 0.71402133, + "learning_rate": 3.470121299177082e-06, + "loss": 0.79180264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1796875, + "step": 4325, + "time_per_iteration": 2.6213603019714355 + }, + { + "auxiliary_loss_clip": 0.06501837, + "auxiliary_loss_mlp": 0.01274613, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01255004, + "epoch": 0.26009319104163536, + "flos": 32274116179200.0, + "grad_norm": 1.826124228611905, + "language_loss": 0.73262805, + "learning_rate": 3.469857215756257e-06, + "loss": 0.81039256, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4326, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.06500994, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06288173, + "balance_loss_mlp": 0.01258051, + "epoch": 0.26015331429430333, + "flos": 26293994933760.0, + "grad_norm": 1.858424121782002, + "language_loss": 0.8722446, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.95002341, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18835449, + "step": 4327, + "time_per_iteration": 2.5950510501861572 + }, + { + "auxiliary_loss_clip": 0.06508228, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06287643, + "balance_loss_mlp": 0.01254271, + "epoch": 0.2602134375469713, + "flos": 21148728243840.0, + "grad_norm": 1.765295937421399, + "language_loss": 0.8100785, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.88790172, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19824219, + "step": 4328, + "time_per_iteration": 3.923682928085327 + }, + { + "auxiliary_loss_clip": 0.06502862, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01258704, + "epoch": 0.26027356079963926, + "flos": 25928411569920.0, + "grad_norm": 1.3948699622732248, + "language_loss": 0.88172936, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.95952845, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18347168, + "step": 4329, + "time_per_iteration": 2.5685267448425293 + }, + { + "auxiliary_loss_clip": 0.06502585, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.0125327, + "epoch": 0.2603336840523072, + "flos": 26366390461440.0, + "grad_norm": 1.8811175805050973, + "language_loss": 0.77705932, + "learning_rate": 3.468800324801802e-06, + "loss": 0.85479975, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18188477, + "step": 4330, + "time_per_iteration": 2.6185224056243896 + }, + { + "auxiliary_loss_clip": 0.06508863, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 0.06289242, + "balance_loss_mlp": 0.0125826, + "epoch": 0.2603938073049752, + "flos": 23520408134400.0, + "grad_norm": 1.5596482888270802, + "language_loss": 0.76200908, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.8398701, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18981934, + "step": 4331, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.06507871, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06292268, + "balance_loss_mlp": 0.01254527, + "epoch": 0.26045393055764315, + "flos": 25381336262400.0, + "grad_norm": 1.426884348550376, + "language_loss": 0.69540298, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.77320385, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.17700195, + "step": 4332, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.06511752, + "auxiliary_loss_mlp": 0.01275479, + "balance_loss_clip": 0.0629351, + "balance_loss_mlp": 0.0125693, + "epoch": 0.2605140538103111, + "flos": 27642494217600.0, + "grad_norm": 1.8844860211449586, + "language_loss": 0.79951644, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.87738872, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.1854248, + "step": 4333, + "time_per_iteration": 2.5523998737335205 + }, + { + "auxiliary_loss_clip": 0.06501235, + "auxiliary_loss_mlp": 0.01272154, + "balance_loss_clip": 0.06290703, + "balance_loss_mlp": 0.01254714, + "epoch": 0.2605741770629791, + "flos": 13774602147840.0, + "grad_norm": 1.6726919145500945, + "language_loss": 0.81128466, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8890186, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.17443848, + "step": 4334, + "time_per_iteration": 2.522210121154785 + }, + { + "auxiliary_loss_clip": 0.06510483, + "auxiliary_loss_mlp": 0.01278802, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01259859, + "epoch": 0.26063430031564705, + "flos": 26038933505280.0, + "grad_norm": 1.7438742011205015, + "language_loss": 0.80170292, + "learning_rate": 3.46747795800024e-06, + "loss": 0.87959582, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18945312, + "step": 4335, + "time_per_iteration": 2.582817792892456 + }, + { + "auxiliary_loss_clip": 0.06403506, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 0.06297, + "balance_loss_mlp": 0.01252544, + "epoch": 0.26069442356831507, + "flos": 62463143030400.0, + "grad_norm": 0.8284851894367303, + "language_loss": 0.60816151, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6847688, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04672241, + "step": 4336, + "time_per_iteration": 3.2036406993865967 + }, + { + "auxiliary_loss_clip": 0.0651319, + "auxiliary_loss_mlp": 0.0127574, + "balance_loss_clip": 0.06294517, + "balance_loss_mlp": 0.01257405, + "epoch": 0.26075454682098304, + "flos": 13631530101120.0, + "grad_norm": 1.8662385080657846, + "language_loss": 0.78028893, + "learning_rate": 3.46694862168102e-06, + "loss": 0.85817826, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18322754, + "step": 4337, + "time_per_iteration": 2.4899747371673584 + }, + { + "auxiliary_loss_clip": 0.06515083, + "auxiliary_loss_mlp": 0.01276173, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01256289, + "epoch": 0.260814670073651, + "flos": 12130776748800.0, + "grad_norm": 2.165940638299647, + "language_loss": 0.74851859, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.82643116, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19897461, + "step": 4338, + "time_per_iteration": 2.5323259830474854 + }, + { + "auxiliary_loss_clip": 0.06522977, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 0.0629933, + "balance_loss_mlp": 0.01255039, + "epoch": 0.26087479332631897, + "flos": 15127964968320.0, + "grad_norm": 2.9662822483112388, + "language_loss": 0.81419933, + "learning_rate": 3.466419062854447e-06, + "loss": 0.89217252, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19287109, + "step": 4339, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.06514673, + "auxiliary_loss_mlp": 0.0127648, + "balance_loss_clip": 0.06300991, + "balance_loss_mlp": 0.01259278, + "epoch": 0.26093491657898693, + "flos": 24687834744960.0, + "grad_norm": 1.5467473582016638, + "language_loss": 0.77106607, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.84897768, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4340, + "time_per_iteration": 2.570777416229248 + }, + { + "auxiliary_loss_clip": 0.06513949, + "auxiliary_loss_mlp": 0.01274956, + "balance_loss_clip": 0.062961, + "balance_loss_mlp": 0.01255788, + "epoch": 0.2609950398316549, + "flos": 25122669108480.0, + "grad_norm": 1.4533527138525517, + "language_loss": 0.82740015, + "learning_rate": 3.465889281600845e-06, + "loss": 0.90528917, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19177246, + "step": 4341, + "time_per_iteration": 2.5946342945098877 + }, + { + "auxiliary_loss_clip": 0.06519589, + "auxiliary_loss_mlp": 0.01282035, + "balance_loss_clip": 0.06303687, + "balance_loss_mlp": 0.01261794, + "epoch": 0.26105516308432286, + "flos": 28556159137920.0, + "grad_norm": 1.7858700463590271, + "language_loss": 0.77163744, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84965372, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20251465, + "step": 4342, + "time_per_iteration": 2.5742342472076416 + }, + { + "auxiliary_loss_clip": 0.06521034, + "auxiliary_loss_mlp": 0.01277248, + "balance_loss_clip": 0.06303718, + "balance_loss_mlp": 0.01258115, + "epoch": 0.2611152863369908, + "flos": 39539984400000.0, + "grad_norm": 1.7100835603344944, + "language_loss": 0.66681403, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.74479687, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19140625, + "step": 4343, + "time_per_iteration": 2.662271738052368 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.0630408, + "balance_loss_mlp": 0.01261917, + "epoch": 0.2611754095896588, + "flos": 13740416881920.0, + "grad_norm": 1.8127929734390111, + "language_loss": 0.74220115, + "learning_rate": 3.465094192845553e-06, + "loss": 0.82024956, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18200684, + "step": 4344, + "time_per_iteration": 2.5201361179351807 + }, + { + "auxiliary_loss_clip": 0.06524797, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06307752, + "balance_loss_mlp": 0.01264484, + "epoch": 0.26123553284232676, + "flos": 21513011869440.0, + "grad_norm": 2.1854473316742338, + "language_loss": 0.8696478, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94774491, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20422363, + "step": 4345, + "time_per_iteration": 2.510000228881836 + }, + { + "auxiliary_loss_clip": 0.06521724, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 0.06307776, + "balance_loss_mlp": 0.01258293, + "epoch": 0.2612956560949947, + "flos": 21145751424000.0, + "grad_norm": 2.0739898036059095, + "language_loss": 0.76897335, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.18371582, + "step": 4346, + "time_per_iteration": 2.5322000980377197 + }, + { + "auxiliary_loss_clip": 0.06522055, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06305227, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2613557793476627, + "flos": 25126023271680.0, + "grad_norm": 1.5562871556893731, + "language_loss": 0.76140273, + "learning_rate": 3.464298604081606e-06, + "loss": 0.83937496, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.18249512, + "step": 4347, + "time_per_iteration": 2.557077169418335 + }, + { + "auxiliary_loss_clip": 0.06522661, + "auxiliary_loss_mlp": 0.01286127, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01267208, + "epoch": 0.26141590260033065, + "flos": 26074879706880.0, + "grad_norm": 1.3369896368920637, + "language_loss": 0.7377249, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81581283, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.18920898, + "step": 4348, + "time_per_iteration": 2.5915603637695312 + }, + { + "auxiliary_loss_clip": 0.06527912, + "auxiliary_loss_mlp": 0.01280562, + "balance_loss_clip": 0.06309946, + "balance_loss_mlp": 0.01260881, + "epoch": 0.2614760258529987, + "flos": 25708415875200.0, + "grad_norm": 1.876318754691465, + "language_loss": 0.9123491, + "learning_rate": 3.463767933923799e-06, + "loss": 0.99043381, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19689941, + "step": 4349, + "time_per_iteration": 2.594332218170166 + }, + { + "auxiliary_loss_clip": 0.06524529, + "auxiliary_loss_mlp": 0.01276126, + "balance_loss_clip": 0.0631379, + "balance_loss_mlp": 0.01256695, + "epoch": 0.26153614910566664, + "flos": 17462902043520.0, + "grad_norm": 1.601755901803269, + "language_loss": 0.80459869, + "learning_rate": 3.463502515580524e-06, + "loss": 0.8826052, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19433594, + "step": 4350, + "time_per_iteration": 2.509274482727051 + }, + { + "auxiliary_loss_clip": 0.06520928, + "auxiliary_loss_mlp": 0.01277683, + "balance_loss_clip": 0.0631097, + "balance_loss_mlp": 0.01259063, + "epoch": 0.2615962723583346, + "flos": 17718676231680.0, + "grad_norm": 1.8928977658247819, + "language_loss": 0.62482548, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.7028116, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18615723, + "step": 4351, + "time_per_iteration": 2.522862672805786 + }, + { + "auxiliary_loss_clip": 0.06526107, + "auxiliary_loss_mlp": 0.01278827, + "balance_loss_clip": 0.06308405, + "balance_loss_mlp": 0.01259396, + "epoch": 0.26165639561100257, + "flos": 23264340456960.0, + "grad_norm": 2.4783042039829546, + "language_loss": 0.84264326, + "learning_rate": 3.462971512415555e-06, + "loss": 0.92069256, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19445801, + "step": 4352, + "time_per_iteration": 2.5326311588287354 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01261209, + "balance_loss_clip": 0.06294002, + "balance_loss_mlp": 0.01256817, + "epoch": 0.26171651886367053, + "flos": 66756155443200.0, + "grad_norm": 0.7669563885543124, + "language_loss": 0.7057451, + "learning_rate": 3.462705927613996e-06, + "loss": 0.78234154, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04397583, + "step": 4353, + "time_per_iteration": 3.093543529510498 + }, + { + "auxiliary_loss_clip": 0.06517833, + "auxiliary_loss_mlp": 0.01279039, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01259619, + "epoch": 0.2617766421163385, + "flos": 22356713030400.0, + "grad_norm": 1.943198757771125, + "language_loss": 0.77770078, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8556695, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19433594, + "step": 4354, + "time_per_iteration": 2.5782573223114014 + }, + { + "auxiliary_loss_clip": 0.06522856, + "auxiliary_loss_mlp": 0.01279183, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01259907, + "epoch": 0.26183676536900646, + "flos": 26074208874240.0, + "grad_norm": 2.16382169558429, + "language_loss": 0.68941987, + "learning_rate": 3.462174591623085e-06, + "loss": 0.7674402, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19274902, + "step": 4355, + "time_per_iteration": 2.608482599258423 + }, + { + "auxiliary_loss_clip": 0.06517249, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06301509, + "balance_loss_mlp": 0.01260889, + "epoch": 0.26189688862167443, + "flos": 21002847085440.0, + "grad_norm": 2.1598133279644554, + "language_loss": 0.68533909, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.76333642, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.21594238, + "step": 4356, + "time_per_iteration": 2.526376247406006 + }, + { + "auxiliary_loss_clip": 0.06398848, + "auxiliary_loss_mlp": 0.01254107, + "balance_loss_clip": 0.06295048, + "balance_loss_mlp": 0.01249723, + "epoch": 0.2619570118743424, + "flos": 65817780768000.0, + "grad_norm": 0.6753767209108164, + "language_loss": 0.5316326, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.60816211, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04391479, + "step": 4357, + "time_per_iteration": 4.58653450012207 + }, + { + "auxiliary_loss_clip": 0.065238, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01261183, + "epoch": 0.26201713512701036, + "flos": 28774310042880.0, + "grad_norm": 1.9589657113609436, + "language_loss": 0.85308599, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.93112528, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18933105, + "step": 4358, + "time_per_iteration": 2.65427303314209 + }, + { + "auxiliary_loss_clip": 0.0652793, + "auxiliary_loss_mlp": 0.0127535, + "balance_loss_clip": 0.06300082, + "balance_loss_mlp": 0.01254917, + "epoch": 0.2620772583796783, + "flos": 26439750311040.0, + "grad_norm": 2.2013035586341663, + "language_loss": 0.68206531, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7600981, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20422363, + "step": 4359, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.06517753, + "auxiliary_loss_mlp": 0.01278599, + "balance_loss_clip": 0.06299832, + "balance_loss_mlp": 0.01258763, + "epoch": 0.2621373816323463, + "flos": 20162667795840.0, + "grad_norm": 1.9413360196767273, + "language_loss": 0.7857362, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.86369967, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19848633, + "step": 4360, + "time_per_iteration": 2.5442395210266113 + }, + { + "auxiliary_loss_clip": 0.06513859, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06305451, + "balance_loss_mlp": 0.01262839, + "epoch": 0.26219750488501425, + "flos": 28628764300800.0, + "grad_norm": 1.9016418571028826, + "language_loss": 0.68632245, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.76428491, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.19519043, + "step": 4361, + "time_per_iteration": 2.5506739616394043 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01277506, + "balance_loss_clip": 0.06302515, + "balance_loss_mlp": 0.01256298, + "epoch": 0.2622576281376823, + "flos": 15046806689280.0, + "grad_norm": 1.72568625675014, + "language_loss": 0.84433615, + "learning_rate": 3.46031316964119e-06, + "loss": 0.92233592, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21179199, + "step": 4362, + "time_per_iteration": 3.9455041885375977 + }, + { + "auxiliary_loss_clip": 0.06516212, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01254914, + "epoch": 0.26231775139035024, + "flos": 26403426766080.0, + "grad_norm": 1.7310155723144771, + "language_loss": 0.65182602, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.72972858, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19140625, + "step": 4363, + "time_per_iteration": 2.5710229873657227 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01270336, + "balance_loss_clip": 0.06313097, + "balance_loss_mlp": 0.01263804, + "epoch": 0.2623778746430182, + "flos": 65430380615040.0, + "grad_norm": 0.9022976396731897, + "language_loss": 0.61189461, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.68877506, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06542969, + "step": 4364, + "time_per_iteration": 4.728578805923462 + }, + { + "auxiliary_loss_clip": 0.06528256, + "auxiliary_loss_mlp": 0.01280703, + "balance_loss_clip": 0.06308191, + "balance_loss_mlp": 0.01260402, + "epoch": 0.26243799789568617, + "flos": 12609104181120.0, + "grad_norm": 2.531531320883944, + "language_loss": 0.72247571, + "learning_rate": 3.459514586533184e-06, + "loss": 0.80056524, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20300293, + "step": 4365, + "time_per_iteration": 2.5567469596862793 + }, + { + "auxiliary_loss_clip": 0.06519997, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06307054, + "balance_loss_mlp": 0.01257146, + "epoch": 0.26249812114835414, + "flos": 28631783047680.0, + "grad_norm": 1.7351756990107399, + "language_loss": 0.78023124, + "learning_rate": 3.459248281460509e-06, + "loss": 0.85819209, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18945312, + "step": 4366, + "time_per_iteration": 2.6212668418884277 + }, + { + "auxiliary_loss_clip": 0.06522524, + "auxiliary_loss_mlp": 0.01276459, + "balance_loss_clip": 0.06305946, + "balance_loss_mlp": 0.01258351, + "epoch": 0.2625582444010221, + "flos": 14470661214720.0, + "grad_norm": 1.579355851615032, + "language_loss": 0.77007079, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.84806067, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18103027, + "step": 4367, + "time_per_iteration": 2.602072238922119 + }, + { + "auxiliary_loss_clip": 0.06517363, + "auxiliary_loss_mlp": 0.01271186, + "balance_loss_clip": 0.06304537, + "balance_loss_mlp": 0.01253471, + "epoch": 0.26261836765369007, + "flos": 16617984998400.0, + "grad_norm": 1.5269013949985815, + "language_loss": 0.70157337, + "learning_rate": 3.458715505320736e-06, + "loss": 0.77945888, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.17700195, + "step": 4368, + "time_per_iteration": 4.012764930725098 + }, + { + "auxiliary_loss_clip": 0.06516206, + "auxiliary_loss_mlp": 0.01278713, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01256635, + "epoch": 0.26267849090635803, + "flos": 20525861318400.0, + "grad_norm": 1.916794033771568, + "language_loss": 0.79240829, + "learning_rate": 3.458449034273841e-06, + "loss": 0.87035751, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22070312, + "step": 4369, + "time_per_iteration": 2.51906418800354 + }, + { + "auxiliary_loss_clip": 0.06514631, + "auxiliary_loss_mlp": 0.01276005, + "balance_loss_clip": 0.06301987, + "balance_loss_mlp": 0.01256883, + "epoch": 0.262738614159026, + "flos": 21330220187520.0, + "grad_norm": 3.2285566965587873, + "language_loss": 0.83905816, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.91696453, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19116211, + "step": 4370, + "time_per_iteration": 2.562302589416504 + }, + { + "auxiliary_loss_clip": 0.06520583, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01253972, + "epoch": 0.26279873741169396, + "flos": 17609454034560.0, + "grad_norm": 1.7096089610285066, + "language_loss": 0.71678042, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.79473758, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21179199, + "step": 4371, + "time_per_iteration": 2.4965152740478516 + }, + { + "auxiliary_loss_clip": 0.06398421, + "auxiliary_loss_mlp": 0.01256739, + "balance_loss_clip": 0.0629326, + "balance_loss_mlp": 0.01252516, + "epoch": 0.2628588606643619, + "flos": 60969139931520.0, + "grad_norm": 0.666639264120038, + "language_loss": 0.56056166, + "learning_rate": 3.457649289346384e-06, + "loss": 0.63711321, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04226685, + "step": 4372, + "time_per_iteration": 3.2867443561553955 + }, + { + "auxiliary_loss_clip": 0.06512036, + "auxiliary_loss_mlp": 0.01277679, + "balance_loss_clip": 0.06298684, + "balance_loss_mlp": 0.01259178, + "epoch": 0.2629189839170299, + "flos": 27023652288000.0, + "grad_norm": 1.5439358769508327, + "language_loss": 0.78190762, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85980475, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18505859, + "step": 4373, + "time_per_iteration": 2.577479362487793 + }, + { + "auxiliary_loss_clip": 0.06510606, + "auxiliary_loss_mlp": 0.01278833, + "balance_loss_clip": 0.06297645, + "balance_loss_mlp": 0.01260427, + "epoch": 0.26297910716969786, + "flos": 17025635911680.0, + "grad_norm": 2.1443132622279664, + "language_loss": 0.723768, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.80166239, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18383789, + "step": 4374, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.06517059, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.0630156, + "balance_loss_mlp": 0.01258505, + "epoch": 0.2630392304223659, + "flos": 24903889297920.0, + "grad_norm": 2.1190930293084933, + "language_loss": 0.81199759, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.88995719, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20373535, + "step": 4375, + "time_per_iteration": 2.591381311416626 + }, + { + "auxiliary_loss_clip": 0.0651055, + "auxiliary_loss_mlp": 0.01275326, + "balance_loss_clip": 0.0629838, + "balance_loss_mlp": 0.01257289, + "epoch": 0.26309935367503384, + "flos": 32862336641280.0, + "grad_norm": 1.9139045559413268, + "language_loss": 0.66626596, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74412477, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18041992, + "step": 4376, + "time_per_iteration": 2.643944025039673 + }, + { + "auxiliary_loss_clip": 0.06515232, + "auxiliary_loss_mlp": 0.01276237, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2631594769277018, + "flos": 15893400816000.0, + "grad_norm": 1.6251454157029055, + "language_loss": 0.70145154, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77936625, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.1875, + "step": 4377, + "time_per_iteration": 2.5593788623809814 + }, + { + "auxiliary_loss_clip": 0.06513406, + "auxiliary_loss_mlp": 0.01274994, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255408, + "epoch": 0.2632196001803698, + "flos": 50816242811520.0, + "grad_norm": 1.6666327452584295, + "language_loss": 0.80235565, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.88023967, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4378, + "time_per_iteration": 2.794290065765381 + }, + { + "auxiliary_loss_clip": 0.0651051, + "auxiliary_loss_mlp": 0.01272396, + "balance_loss_clip": 0.06297652, + "balance_loss_mlp": 0.0125492, + "epoch": 0.26327972343303774, + "flos": 13737733551360.0, + "grad_norm": 2.7188396998417548, + "language_loss": 0.77230549, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17480469, + "step": 4379, + "time_per_iteration": 2.542442560195923 + }, + { + "auxiliary_loss_clip": 0.06519607, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06299821, + "balance_loss_mlp": 0.01255084, + "epoch": 0.2633398466857057, + "flos": 23775846906240.0, + "grad_norm": 1.9724368576120554, + "language_loss": 0.78418016, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.86212587, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19897461, + "step": 4380, + "time_per_iteration": 2.529573440551758 + }, + { + "auxiliary_loss_clip": 0.06516172, + "auxiliary_loss_mlp": 0.012759, + "balance_loss_clip": 0.06296928, + "balance_loss_mlp": 0.01257518, + "epoch": 0.26339996993837367, + "flos": 27607680046080.0, + "grad_norm": 1.9046534185934374, + "language_loss": 0.6460917, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.72401243, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18383789, + "step": 4381, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.06511073, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06295128, + "balance_loss_mlp": 0.01255394, + "epoch": 0.26346009319104163, + "flos": 16951982572800.0, + "grad_norm": 1.8115834165165374, + "language_loss": 0.8293367, + "learning_rate": 3.454979881632595e-06, + "loss": 0.90718591, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18444824, + "step": 4382, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06526808, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06304507, + "balance_loss_mlp": 0.0126196, + "epoch": 0.2635202164437096, + "flos": 37241245088640.0, + "grad_norm": 2.8611377763647363, + "language_loss": 0.70728219, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78537577, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4383, + "time_per_iteration": 2.7256851196289062 + }, + { + "auxiliary_loss_clip": 0.06511825, + "auxiliary_loss_mlp": 0.01278143, + "balance_loss_clip": 0.0629648, + "balance_loss_mlp": 0.01260214, + "epoch": 0.26358033969637756, + "flos": 21002721304320.0, + "grad_norm": 1.8636489890531567, + "language_loss": 0.69725919, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.77515888, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 4384, + "time_per_iteration": 2.526306629180908 + }, + { + "auxiliary_loss_clip": 0.06514609, + "auxiliary_loss_mlp": 0.01274952, + "balance_loss_clip": 0.06301568, + "balance_loss_mlp": 0.01256355, + "epoch": 0.26364046294904553, + "flos": 27753561204480.0, + "grad_norm": 2.704228439938978, + "language_loss": 0.70769042, + "learning_rate": 3.45417798298451e-06, + "loss": 0.785586, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 4385, + "time_per_iteration": 2.6091294288635254 + }, + { + "auxiliary_loss_clip": 0.06510788, + "auxiliary_loss_mlp": 0.01275036, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01255903, + "epoch": 0.2637005862017135, + "flos": 22899679488000.0, + "grad_norm": 1.8400483569046413, + "language_loss": 0.85200071, + "learning_rate": 3.453910573136482e-06, + "loss": 0.92985892, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19116211, + "step": 4386, + "time_per_iteration": 2.5284476280212402 + }, + { + "auxiliary_loss_clip": 0.06516191, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 0.06302508, + "balance_loss_mlp": 0.01255759, + "epoch": 0.26376070945438146, + "flos": 15054143921280.0, + "grad_norm": 1.9881194524454247, + "language_loss": 0.77597183, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.85388696, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19567871, + "step": 4387, + "time_per_iteration": 2.522135019302368 + }, + { + "auxiliary_loss_clip": 0.0651316, + "auxiliary_loss_mlp": 0.01278261, + "balance_loss_clip": 0.06301039, + "balance_loss_mlp": 0.01259378, + "epoch": 0.2638208327070494, + "flos": 21148141265280.0, + "grad_norm": 2.1303107819849316, + "language_loss": 0.76193964, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83985388, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1887207, + "step": 4388, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.06516623, + "auxiliary_loss_mlp": 0.01271478, + "balance_loss_clip": 0.06302176, + "balance_loss_mlp": 0.01253681, + "epoch": 0.26388095595971744, + "flos": 21732001315200.0, + "grad_norm": 2.125202232596161, + "language_loss": 0.86967361, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94755471, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.17785645, + "step": 4389, + "time_per_iteration": 2.570643901824951 + }, + { + "auxiliary_loss_clip": 0.06416489, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.0630957, + "balance_loss_mlp": 0.01263464, + "epoch": 0.2639410792123854, + "flos": 65536542138240.0, + "grad_norm": 0.8199197454978128, + "language_loss": 0.60138249, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6782288, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04666138, + "step": 4390, + "time_per_iteration": 3.174226999282837 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01274153, + "balance_loss_clip": 0.06302064, + "balance_loss_mlp": 0.01255008, + "epoch": 0.2640012024650534, + "flos": 23954907081600.0, + "grad_norm": 1.739207981028, + "language_loss": 0.77995527, + "learning_rate": 3.4525726971127e-06, + "loss": 0.85793746, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19152832, + "step": 4391, + "time_per_iteration": 2.5869362354278564 + }, + { + "auxiliary_loss_clip": 0.06415629, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06309642, + "balance_loss_mlp": 0.0126082, + "epoch": 0.26406132571772134, + "flos": 56462420880000.0, + "grad_norm": 0.8885893091984226, + "language_loss": 0.58835375, + "learning_rate": 3.45230495662224e-06, + "loss": 0.66516447, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04620361, + "step": 4392, + "time_per_iteration": 3.1856343746185303 + }, + { + "auxiliary_loss_clip": 0.0652501, + "auxiliary_loss_mlp": 0.0127481, + "balance_loss_clip": 0.06303259, + "balance_loss_mlp": 0.01256631, + "epoch": 0.2641214489703893, + "flos": 22097039627520.0, + "grad_norm": 1.7095674260711007, + "language_loss": 0.69284153, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77083969, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.1817627, + "step": 4393, + "time_per_iteration": 2.5519895553588867 + }, + { + "auxiliary_loss_clip": 0.06526117, + "auxiliary_loss_mlp": 0.01277548, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01255959, + "epoch": 0.26418157222305727, + "flos": 16550327226240.0, + "grad_norm": 2.304177456685855, + "language_loss": 0.84805501, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.92609167, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.21594238, + "step": 4394, + "time_per_iteration": 2.5253031253814697 + }, + { + "auxiliary_loss_clip": 0.06528334, + "auxiliary_loss_mlp": 0.01280976, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01260066, + "epoch": 0.26424169547572524, + "flos": 18008006780160.0, + "grad_norm": 1.9555526734650441, + "language_loss": 0.70342916, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.78152227, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.20910645, + "step": 4395, + "time_per_iteration": 2.5117664337158203 + }, + { + "auxiliary_loss_clip": 0.06512758, + "auxiliary_loss_mlp": 0.01272399, + "balance_loss_clip": 0.06300145, + "balance_loss_mlp": 0.01253171, + "epoch": 0.2643018187283932, + "flos": 16988893096320.0, + "grad_norm": 1.791387622967983, + "language_loss": 0.87312353, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19238281, + "step": 4396, + "time_per_iteration": 2.566774368286133 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01257464, + "balance_loss_clip": 0.06300922, + "balance_loss_mlp": 0.01252997, + "epoch": 0.26436194198106117, + "flos": 59682135144960.0, + "grad_norm": 0.7723405564107855, + "language_loss": 0.54990101, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.62652469, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04473877, + "step": 4397, + "time_per_iteration": 4.373678684234619 + }, + { + "auxiliary_loss_clip": 0.06510547, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 0.06297219, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26442206523372913, + "flos": 32928694675200.0, + "grad_norm": 2.4292177107300224, + "language_loss": 0.78606653, + "learning_rate": 3.450697357532435e-06, + "loss": 0.86391467, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1862793, + "step": 4398, + "time_per_iteration": 2.6890292167663574 + }, + { + "auxiliary_loss_clip": 0.06511252, + "auxiliary_loss_mlp": 0.01279415, + "balance_loss_clip": 0.06294377, + "balance_loss_mlp": 0.01259244, + "epoch": 0.2644821884863971, + "flos": 21037409694720.0, + "grad_norm": 1.6698754866149341, + "language_loss": 0.67733896, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.75524557, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20178223, + "step": 4399, + "time_per_iteration": 2.5403761863708496 + }, + { + "auxiliary_loss_clip": 0.06507229, + "auxiliary_loss_mlp": 0.01274507, + "balance_loss_clip": 0.06301808, + "balance_loss_mlp": 0.01256841, + "epoch": 0.26454231173906506, + "flos": 20783019098880.0, + "grad_norm": 1.5093240378212085, + "language_loss": 0.8695311, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.94734848, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.17675781, + "step": 4400, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.06514899, + "auxiliary_loss_mlp": 0.01275157, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01255404, + "epoch": 0.264602434991733, + "flos": 16624399835520.0, + "grad_norm": 2.9592381962347076, + "language_loss": 0.77008456, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.84798515, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19750977, + "step": 4401, + "time_per_iteration": 4.000045537948608 + }, + { + "auxiliary_loss_clip": 0.06515318, + "auxiliary_loss_mlp": 0.01277892, + "balance_loss_clip": 0.0629567, + "balance_loss_mlp": 0.01257149, + "epoch": 0.26466255824440105, + "flos": 19068726816000.0, + "grad_norm": 1.7667226788610035, + "language_loss": 0.88791883, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96585095, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20739746, + "step": 4402, + "time_per_iteration": 2.504951000213623 + }, + { + "auxiliary_loss_clip": 0.06514971, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 0.06299384, + "balance_loss_mlp": 0.01261203, + "epoch": 0.264722681497069, + "flos": 22645246965120.0, + "grad_norm": 2.1016866817380944, + "language_loss": 0.78604829, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.86399865, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18884277, + "step": 4403, + "time_per_iteration": 3.9830996990203857 + }, + { + "auxiliary_loss_clip": 0.06513863, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 0.0629956, + "balance_loss_mlp": 0.01254322, + "epoch": 0.264782804749737, + "flos": 22498862682240.0, + "grad_norm": 2.2718142403423887, + "language_loss": 0.88776851, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.96563816, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18774414, + "step": 4404, + "time_per_iteration": 2.5655670166015625 + }, + { + "auxiliary_loss_clip": 0.06512003, + "auxiliary_loss_mlp": 0.01272083, + "balance_loss_clip": 0.06294957, + "balance_loss_mlp": 0.01253666, + "epoch": 0.26484292800240494, + "flos": 16805891779200.0, + "grad_norm": 1.6853243703943699, + "language_loss": 0.77144921, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84929001, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18408203, + "step": 4405, + "time_per_iteration": 2.5151660442352295 + }, + { + "auxiliary_loss_clip": 0.06518488, + "auxiliary_loss_mlp": 0.01280263, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.0126113, + "epoch": 0.2649030512550729, + "flos": 20455939486080.0, + "grad_norm": 1.6552463254663874, + "language_loss": 0.70570582, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78369337, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19152832, + "step": 4406, + "time_per_iteration": 2.5817081928253174 + }, + { + "auxiliary_loss_clip": 0.06515051, + "auxiliary_loss_mlp": 0.01282775, + "balance_loss_clip": 0.06304015, + "balance_loss_mlp": 0.01264071, + "epoch": 0.2649631745077409, + "flos": 22422190596480.0, + "grad_norm": 1.6043271976664373, + "language_loss": 0.84213567, + "learning_rate": 3.448282246369912e-06, + "loss": 0.92011392, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18701172, + "step": 4407, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.06506669, + "auxiliary_loss_mlp": 0.01274017, + "balance_loss_clip": 0.06294346, + "balance_loss_mlp": 0.01255384, + "epoch": 0.26502329776040884, + "flos": 35124794334720.0, + "grad_norm": 1.8863485028384246, + "language_loss": 0.76080608, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83861291, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18615723, + "step": 4408, + "time_per_iteration": 4.144388675689697 + }, + { + "auxiliary_loss_clip": 0.06504838, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06293095, + "balance_loss_mlp": 0.01259765, + "epoch": 0.2650834210130768, + "flos": 38696073603840.0, + "grad_norm": 1.6572856868324277, + "language_loss": 0.71237993, + "learning_rate": 3.447744950630084e-06, + "loss": 0.79021394, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18786621, + "step": 4409, + "time_per_iteration": 2.6830790042877197 + }, + { + "auxiliary_loss_clip": 0.06513892, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06296389, + "balance_loss_mlp": 0.01258513, + "epoch": 0.26514354426574477, + "flos": 24723655165440.0, + "grad_norm": 1.9985850932403133, + "language_loss": 0.74335337, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.82127184, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19445801, + "step": 4410, + "time_per_iteration": 2.5640783309936523 + }, + { + "auxiliary_loss_clip": 0.06510055, + "auxiliary_loss_mlp": 0.01275315, + "balance_loss_clip": 0.06293881, + "balance_loss_mlp": 0.01256873, + "epoch": 0.26520366751841273, + "flos": 20346381872640.0, + "grad_norm": 1.7362440314024254, + "language_loss": 0.74604267, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.82389635, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18457031, + "step": 4411, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.06503807, + "auxiliary_loss_mlp": 0.0127974, + "balance_loss_clip": 0.06292095, + "balance_loss_mlp": 0.01260941, + "epoch": 0.2652637907710807, + "flos": 22350046631040.0, + "grad_norm": 1.9068391403977176, + "language_loss": 0.83043784, + "learning_rate": 3.446938595306071e-06, + "loss": 0.90827328, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18798828, + "step": 4412, + "time_per_iteration": 2.570462942123413 + }, + { + "auxiliary_loss_clip": 0.06509882, + "auxiliary_loss_mlp": 0.01280008, + "balance_loss_clip": 0.0629638, + "balance_loss_mlp": 0.01260327, + "epoch": 0.26532391402374866, + "flos": 19360279497600.0, + "grad_norm": 1.6015505507863077, + "language_loss": 0.75010121, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.82800013, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19677734, + "step": 4413, + "time_per_iteration": 2.5575060844421387 + }, + { + "auxiliary_loss_clip": 0.06392879, + "auxiliary_loss_mlp": 0.01259819, + "balance_loss_clip": 0.06288524, + "balance_loss_mlp": 0.01255307, + "epoch": 0.26538403727641663, + "flos": 44804479121280.0, + "grad_norm": 0.9088609657061584, + "language_loss": 0.57055008, + "learning_rate": 3.446400750732793e-06, + "loss": 0.64707708, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04522705, + "step": 4414, + "time_per_iteration": 3.090242624282837 + }, + { + "auxiliary_loss_clip": 0.06501576, + "auxiliary_loss_mlp": 0.01278206, + "balance_loss_clip": 0.06294522, + "balance_loss_mlp": 0.01260587, + "epoch": 0.26544416052908465, + "flos": 28189359889920.0, + "grad_norm": 1.5322949912702364, + "language_loss": 0.74997067, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.82776845, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17626953, + "step": 4415, + "time_per_iteration": 2.6143665313720703 + }, + { + "auxiliary_loss_clip": 0.06505995, + "auxiliary_loss_mlp": 0.01278176, + "balance_loss_clip": 0.06289595, + "balance_loss_mlp": 0.0125791, + "epoch": 0.2655042837817526, + "flos": 17570824502400.0, + "grad_norm": 4.108925661978825, + "language_loss": 0.87716872, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95501041, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20263672, + "step": 4416, + "time_per_iteration": 2.4974279403686523 + }, + { + "auxiliary_loss_clip": 0.06510112, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0629703, + "balance_loss_mlp": 0.0126094, + "epoch": 0.2655644070344206, + "flos": 23411437499520.0, + "grad_norm": 1.4955026126411677, + "language_loss": 0.77089638, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84879971, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19274902, + "step": 4417, + "time_per_iteration": 2.576826572418213 + }, + { + "auxiliary_loss_clip": 0.0650158, + "auxiliary_loss_mlp": 0.01274734, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.01255946, + "epoch": 0.26562453028708854, + "flos": 26475612658560.0, + "grad_norm": 1.3751463134954343, + "language_loss": 0.80062425, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87838733, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 4418, + "time_per_iteration": 2.573490619659424 + }, + { + "auxiliary_loss_clip": 0.06510676, + "auxiliary_loss_mlp": 0.01274316, + "balance_loss_clip": 0.06295326, + "balance_loss_mlp": 0.01254945, + "epoch": 0.2656846535397565, + "flos": 19213475944320.0, + "grad_norm": 2.092556142181657, + "language_loss": 0.67613918, + "learning_rate": 3.445055179644071e-06, + "loss": 0.7539891, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19372559, + "step": 4419, + "time_per_iteration": 2.5705552101135254 + }, + { + "auxiliary_loss_clip": 0.06507199, + "auxiliary_loss_mlp": 0.01281966, + "balance_loss_clip": 0.06293494, + "balance_loss_mlp": 0.01262153, + "epoch": 0.2657447767924245, + "flos": 30558566085120.0, + "grad_norm": 1.8356097714997412, + "language_loss": 0.79905182, + "learning_rate": 3.444785900995585e-06, + "loss": 0.87694353, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19799805, + "step": 4420, + "time_per_iteration": 2.5966663360595703 + }, + { + "auxiliary_loss_clip": 0.06514539, + "auxiliary_loss_mlp": 0.01276693, + "balance_loss_clip": 0.06294198, + "balance_loss_mlp": 0.01256367, + "epoch": 0.26580490004509244, + "flos": 20928984111360.0, + "grad_norm": 2.015825119850129, + "language_loss": 0.81966692, + "learning_rate": 3.444516567560673e-06, + "loss": 0.89757919, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.20324707, + "step": 4421, + "time_per_iteration": 2.5285565853118896 + }, + { + "auxiliary_loss_clip": 0.06503608, + "auxiliary_loss_mlp": 0.01277509, + "balance_loss_clip": 0.06293386, + "balance_loss_mlp": 0.01259341, + "epoch": 0.2658650232977604, + "flos": 43955845297920.0, + "grad_norm": 1.6494646012937118, + "language_loss": 0.66448712, + "learning_rate": 3.444247179349548e-06, + "loss": 0.74229831, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1817627, + "step": 4422, + "time_per_iteration": 2.715272903442383 + }, + { + "auxiliary_loss_clip": 0.0650918, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 0.06296968, + "balance_loss_mlp": 0.01257011, + "epoch": 0.26592514655042837, + "flos": 29724256581120.0, + "grad_norm": 6.571308072686312, + "language_loss": 0.75332773, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18029785, + "step": 4423, + "time_per_iteration": 2.5891942977905273 + }, + { + "auxiliary_loss_clip": 0.06514621, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 0.06297594, + "balance_loss_mlp": 0.01257619, + "epoch": 0.26598526980309634, + "flos": 46687616110080.0, + "grad_norm": 1.5716819541281883, + "language_loss": 0.78054529, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85846502, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19726562, + "step": 4424, + "time_per_iteration": 2.731308698654175 + }, + { + "auxiliary_loss_clip": 0.06513417, + "auxiliary_loss_mlp": 0.01282972, + "balance_loss_clip": 0.06298374, + "balance_loss_mlp": 0.01263147, + "epoch": 0.2660453930557643, + "flos": 11514115025280.0, + "grad_norm": 1.8953438163908696, + "language_loss": 0.7980895, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.87605333, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19824219, + "step": 4425, + "time_per_iteration": 2.536639928817749 + }, + { + "auxiliary_loss_clip": 0.0650531, + "auxiliary_loss_mlp": 0.01275945, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01257837, + "epoch": 0.26610551630843227, + "flos": 24798692096640.0, + "grad_norm": 1.624984400061838, + "language_loss": 0.81150436, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.88931698, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4426, + "time_per_iteration": 2.55570912361145 + }, + { + "auxiliary_loss_clip": 0.06512492, + "auxiliary_loss_mlp": 0.01281328, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01262302, + "epoch": 0.26616563956110023, + "flos": 27643793955840.0, + "grad_norm": 1.6446869519549492, + "language_loss": 0.77695107, + "learning_rate": 3.442899417008333e-06, + "loss": 0.85488927, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19042969, + "step": 4427, + "time_per_iteration": 2.609236001968384 + }, + { + "auxiliary_loss_clip": 0.06512281, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06306126, + "balance_loss_mlp": 0.01257588, + "epoch": 0.26622576281376825, + "flos": 28369887511680.0, + "grad_norm": 1.5754757805335664, + "language_loss": 0.77615106, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.85402417, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17443848, + "step": 4428, + "time_per_iteration": 2.5886542797088623 + }, + { + "auxiliary_loss_clip": 0.06507164, + "auxiliary_loss_mlp": 0.01273818, + "balance_loss_clip": 0.06292614, + "balance_loss_mlp": 0.0125627, + "epoch": 0.2662858860664362, + "flos": 18047265217920.0, + "grad_norm": 1.9210496781424948, + "language_loss": 0.83184117, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.90965092, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.17541504, + "step": 4429, + "time_per_iteration": 2.5387768745422363 + }, + { + "auxiliary_loss_clip": 0.06512052, + "auxiliary_loss_mlp": 0.01276801, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01256762, + "epoch": 0.2663460093191042, + "flos": 22752163175040.0, + "grad_norm": 1.799497911690532, + "language_loss": 0.73120302, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80909157, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.20043945, + "step": 4430, + "time_per_iteration": 2.6026084423065186 + }, + { + "auxiliary_loss_clip": 0.06508531, + "auxiliary_loss_mlp": 0.0127429, + "balance_loss_clip": 0.06296858, + "balance_loss_mlp": 0.012548, + "epoch": 0.26640613257177215, + "flos": 16514422951680.0, + "grad_norm": 2.040164300856009, + "language_loss": 0.83262235, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91045058, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19482422, + "step": 4431, + "time_per_iteration": 2.5464959144592285 + }, + { + "auxiliary_loss_clip": 0.0651544, + "auxiliary_loss_mlp": 0.01281122, + "balance_loss_clip": 0.06296271, + "balance_loss_mlp": 0.01261488, + "epoch": 0.2664662558244401, + "flos": 23082638878080.0, + "grad_norm": 2.4012085548553537, + "language_loss": 0.76319212, + "learning_rate": 3.44155028679496e-06, + "loss": 0.84115773, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19641113, + "step": 4432, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.06513382, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 0.0629918, + "balance_loss_mlp": 0.01259011, + "epoch": 0.2665263790771081, + "flos": 23776098468480.0, + "grad_norm": 1.7645797084145118, + "language_loss": 0.8352288, + "learning_rate": 3.441280296720154e-06, + "loss": 0.91315603, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.20324707, + "step": 4433, + "time_per_iteration": 2.5431323051452637 + }, + { + "auxiliary_loss_clip": 0.06506403, + "auxiliary_loss_mlp": 0.01279917, + "balance_loss_clip": 0.06294529, + "balance_loss_mlp": 0.01260248, + "epoch": 0.26658650232977604, + "flos": 28008748414080.0, + "grad_norm": 2.0130085710694097, + "language_loss": 0.77006185, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84792507, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.19677734, + "step": 4434, + "time_per_iteration": 2.626286268234253 + }, + { + "auxiliary_loss_clip": 0.06505096, + "auxiliary_loss_mlp": 0.01274565, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255563, + "epoch": 0.266646625582444, + "flos": 22170147914880.0, + "grad_norm": 1.9216331890087734, + "language_loss": 0.82643783, + "learning_rate": 3.440740152620301e-06, + "loss": 0.90423441, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.18994141, + "step": 4435, + "time_per_iteration": 2.519731283187866 + }, + { + "auxiliary_loss_clip": 0.06515168, + "auxiliary_loss_mlp": 0.01287569, + "balance_loss_clip": 0.06296054, + "balance_loss_mlp": 0.01267065, + "epoch": 0.266706748835112, + "flos": 27860687049600.0, + "grad_norm": 2.5550616111147257, + "language_loss": 0.88173652, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95976388, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2052002, + "step": 4436, + "time_per_iteration": 2.5790481567382812 + }, + { + "auxiliary_loss_clip": 0.0650726, + "auxiliary_loss_mlp": 0.01276794, + "balance_loss_clip": 0.0629128, + "balance_loss_mlp": 0.01258507, + "epoch": 0.26676687208777994, + "flos": 25819231299840.0, + "grad_norm": 5.920609689832761, + "language_loss": 0.79025435, + "learning_rate": 3.440199789988407e-06, + "loss": 0.86809486, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1829834, + "step": 4437, + "time_per_iteration": 3.9761762619018555 + }, + { + "auxiliary_loss_clip": 0.06508271, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06295269, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2668269953404479, + "flos": 36073399207680.0, + "grad_norm": 3.5501154130665333, + "language_loss": 0.64866304, + "learning_rate": 3.439929526748556e-06, + "loss": 0.72648954, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18322754, + "step": 4438, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.0650841, + "auxiliary_loss_mlp": 0.01282243, + "balance_loss_clip": 0.0629243, + "balance_loss_mlp": 0.01263015, + "epoch": 0.26688711859311587, + "flos": 26576994499200.0, + "grad_norm": 1.9779853569110368, + "language_loss": 0.76120412, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83911061, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1920166, + "step": 4439, + "time_per_iteration": 2.5468099117279053 + }, + { + "auxiliary_loss_clip": 0.06509372, + "auxiliary_loss_mlp": 0.01279302, + "balance_loss_clip": 0.06293344, + "balance_loss_mlp": 0.01259156, + "epoch": 0.26694724184578383, + "flos": 26768968202880.0, + "grad_norm": 1.7452542153948158, + "language_loss": 0.71747917, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.79536593, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20141602, + "step": 4440, + "time_per_iteration": 2.5845727920532227 + }, + { + "auxiliary_loss_clip": 0.06513558, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06297302, + "balance_loss_mlp": 0.01258003, + "epoch": 0.2670073650984518, + "flos": 20965894634880.0, + "grad_norm": 2.018310090260772, + "language_loss": 0.67180222, + "learning_rate": 3.439118409456376e-06, + "loss": 0.74972624, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.20837402, + "step": 4441, + "time_per_iteration": 4.018662691116333 + }, + { + "auxiliary_loss_clip": 0.06511593, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06295494, + "balance_loss_mlp": 0.01260692, + "epoch": 0.2670674883511198, + "flos": 28373577091200.0, + "grad_norm": 1.7028334543675463, + "language_loss": 0.77360296, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.8515327, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20690918, + "step": 4442, + "time_per_iteration": 2.613529682159424 + }, + { + "auxiliary_loss_clip": 0.06397913, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06295023, + "balance_loss_mlp": 0.01259818, + "epoch": 0.2671276116037878, + "flos": 58989010970880.0, + "grad_norm": 0.9159689493293411, + "language_loss": 0.61561328, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.6922372, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04653931, + "step": 4443, + "time_per_iteration": 4.460381031036377 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01278926, + "balance_loss_clip": 0.06294855, + "balance_loss_mlp": 0.0126021, + "epoch": 0.26718773485645575, + "flos": 43955132538240.0, + "grad_norm": 8.593795125602613, + "language_loss": 0.76795793, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.845855, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.18725586, + "step": 4444, + "time_per_iteration": 2.7442104816436768 + }, + { + "auxiliary_loss_clip": 0.06512623, + "auxiliary_loss_mlp": 0.0127732, + "balance_loss_clip": 0.06297334, + "balance_loss_mlp": 0.01256255, + "epoch": 0.2672478581091237, + "flos": 25235329322880.0, + "grad_norm": 2.0392997213265867, + "language_loss": 0.81111336, + "learning_rate": 3.438036155780158e-06, + "loss": 0.88901269, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21057129, + "step": 4445, + "time_per_iteration": 2.5493359565734863 + }, + { + "auxiliary_loss_clip": 0.06511448, + "auxiliary_loss_mlp": 0.01275318, + "balance_loss_clip": 0.0629541, + "balance_loss_mlp": 0.01256054, + "epoch": 0.2673079813617917, + "flos": 15273594564480.0, + "grad_norm": 1.8279407549944744, + "language_loss": 0.89906365, + "learning_rate": 3.43776545600926e-06, + "loss": 0.97693127, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19262695, + "step": 4446, + "time_per_iteration": 2.536916971206665 + }, + { + "auxiliary_loss_clip": 0.06512347, + "auxiliary_loss_mlp": 0.01275408, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256894, + "epoch": 0.26736810461445965, + "flos": 25819944059520.0, + "grad_norm": 1.8969857257431861, + "language_loss": 0.68977708, + "learning_rate": 3.437494701718153e-06, + "loss": 0.76765466, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18518066, + "step": 4447, + "time_per_iteration": 4.071701526641846 + }, + { + "auxiliary_loss_clip": 0.06511723, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 0.06295793, + "balance_loss_mlp": 0.01259116, + "epoch": 0.2674282278671276, + "flos": 24318981072000.0, + "grad_norm": 1.8615578685879888, + "language_loss": 0.83522677, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.91313618, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2010498, + "step": 4448, + "time_per_iteration": 2.581207036972046 + }, + { + "auxiliary_loss_clip": 0.06506026, + "auxiliary_loss_mlp": 0.0127612, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2674883511197956, + "flos": 22821330320640.0, + "grad_norm": 1.5806903023960923, + "language_loss": 0.84385109, + "learning_rate": 3.436953029616378e-06, + "loss": 0.92167258, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19262695, + "step": 4449, + "time_per_iteration": 2.556368827819824 + }, + { + "auxiliary_loss_clip": 0.06523807, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 0.06298804, + "balance_loss_mlp": 0.01256679, + "epoch": 0.26754847437246354, + "flos": 25376514652800.0, + "grad_norm": 2.5106466446094275, + "language_loss": 0.84170121, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.91972435, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 4450, + "time_per_iteration": 2.540792465209961 + }, + { + "auxiliary_loss_clip": 0.06503032, + "auxiliary_loss_mlp": 0.01274274, + "balance_loss_clip": 0.06293193, + "balance_loss_mlp": 0.01255248, + "epoch": 0.2676085976251315, + "flos": 20236698478080.0, + "grad_norm": 1.7838817445044992, + "language_loss": 0.81239712, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8901701, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19042969, + "step": 4451, + "time_per_iteration": 2.552764892578125 + }, + { + "auxiliary_loss_clip": 0.06515267, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06304526, + "balance_loss_mlp": 0.01257324, + "epoch": 0.26766872087779947, + "flos": 28045784718720.0, + "grad_norm": 1.859886698365648, + "language_loss": 0.87156057, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94947314, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 4452, + "time_per_iteration": 2.580838918685913 + }, + { + "auxiliary_loss_clip": 0.06515863, + "auxiliary_loss_mlp": 0.01278142, + "balance_loss_clip": 0.06301846, + "balance_loss_mlp": 0.01258377, + "epoch": 0.26772884413046744, + "flos": 18329803585920.0, + "grad_norm": 2.0572254627861577, + "language_loss": 0.84003425, + "learning_rate": 3.435869031622194e-06, + "loss": 0.91797435, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19775391, + "step": 4453, + "time_per_iteration": 2.5120368003845215 + }, + { + "auxiliary_loss_clip": 0.06513035, + "auxiliary_loss_mlp": 0.01281566, + "balance_loss_clip": 0.06298169, + "balance_loss_mlp": 0.01261992, + "epoch": 0.2677889673831354, + "flos": 22134075932160.0, + "grad_norm": 1.66096029715733, + "language_loss": 0.79950684, + "learning_rate": 3.435597895977208e-06, + "loss": 0.87745285, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19580078, + "step": 4454, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.06518991, + "auxiliary_loss_mlp": 0.0127963, + "balance_loss_clip": 0.0630191, + "balance_loss_mlp": 0.01259949, + "epoch": 0.2678490906358034, + "flos": 23736001489920.0, + "grad_norm": 1.4726826789128313, + "language_loss": 0.72626883, + "learning_rate": 3.435326705894206e-06, + "loss": 0.80425501, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.19689941, + "step": 4455, + "time_per_iteration": 2.600341558456421 + }, + { + "auxiliary_loss_clip": 0.0650526, + "auxiliary_loss_mlp": 0.01280807, + "balance_loss_clip": 0.06295176, + "balance_loss_mlp": 0.01262675, + "epoch": 0.2679092138884714, + "flos": 21769414963200.0, + "grad_norm": 1.6724393178855028, + "language_loss": 0.74066579, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81852639, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18139648, + "step": 4456, + "time_per_iteration": 2.5469894409179688 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01278452, + "balance_loss_clip": 0.06300029, + "balance_loss_mlp": 0.01258127, + "epoch": 0.26796933714113935, + "flos": 19866670848000.0, + "grad_norm": 2.417277333537857, + "language_loss": 0.71260488, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.79059041, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20324707, + "step": 4457, + "time_per_iteration": 2.592397451400757 + }, + { + "auxiliary_loss_clip": 0.06517951, + "auxiliary_loss_mlp": 0.01279854, + "balance_loss_clip": 0.06301091, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2680294603938073, + "flos": 20054116431360.0, + "grad_norm": 2.0107664890053143, + "language_loss": 0.79466271, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87264079, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20666504, + "step": 4458, + "time_per_iteration": 2.5134661197662354 + }, + { + "auxiliary_loss_clip": 0.06383923, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06281242, + "balance_loss_mlp": 0.01258718, + "epoch": 0.2680895836464753, + "flos": 72134918334720.0, + "grad_norm": 0.8734266993254428, + "language_loss": 0.5870322, + "learning_rate": 3.434241401387739e-06, + "loss": 0.66350281, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.04437256, + "step": 4459, + "time_per_iteration": 3.2277050018310547 + }, + { + "auxiliary_loss_clip": 0.06506394, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06292672, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26814970689914325, + "flos": 20455310580480.0, + "grad_norm": 1.8403982609946155, + "language_loss": 0.85477257, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.93258202, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.18920898, + "step": 4460, + "time_per_iteration": 2.513317346572876 + }, + { + "auxiliary_loss_clip": 0.06504844, + "auxiliary_loss_mlp": 0.01281285, + "balance_loss_clip": 0.06292892, + "balance_loss_mlp": 0.01261866, + "epoch": 0.2682098301518112, + "flos": 17572459656960.0, + "grad_norm": 1.8133404743184358, + "language_loss": 0.69389015, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 4461, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.06506921, + "auxiliary_loss_mlp": 0.01281085, + "balance_loss_clip": 0.06293105, + "balance_loss_mlp": 0.01260152, + "epoch": 0.2682699534044792, + "flos": 18339237169920.0, + "grad_norm": 1.6584506269914416, + "language_loss": 0.67031932, + "learning_rate": 3.43342685191282e-06, + "loss": 0.74819934, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.20935059, + "step": 4462, + "time_per_iteration": 2.5427775382995605 + }, + { + "auxiliary_loss_clip": 0.06508102, + "auxiliary_loss_mlp": 0.01282385, + "balance_loss_clip": 0.0629629, + "balance_loss_mlp": 0.01263287, + "epoch": 0.26833007665714714, + "flos": 25308311829120.0, + "grad_norm": 1.7808644454945033, + "language_loss": 0.69747704, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.77538192, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19116211, + "step": 4463, + "time_per_iteration": 2.6194493770599365 + }, + { + "auxiliary_loss_clip": 0.06508362, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06291216, + "balance_loss_mlp": 0.0126092, + "epoch": 0.2683901999098151, + "flos": 16104046780800.0, + "grad_norm": 2.9245690778148465, + "language_loss": 0.78600121, + "learning_rate": 3.432883547133931e-06, + "loss": 0.86389446, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20056152, + "step": 4464, + "time_per_iteration": 2.463418483734131 + }, + { + "auxiliary_loss_clip": 0.06508331, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06294504, + "balance_loss_mlp": 0.01262154, + "epoch": 0.2684503231624831, + "flos": 27315414604800.0, + "grad_norm": 1.7531136867378412, + "language_loss": 0.71091688, + "learning_rate": 3.432611813236704e-06, + "loss": 0.78881842, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19665527, + "step": 4465, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.06379254, + "auxiliary_loss_mlp": 0.01259677, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01255094, + "epoch": 0.26851044641515104, + "flos": 71879060292480.0, + "grad_norm": 0.6551429372657154, + "language_loss": 0.52683848, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.60322779, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.04577637, + "step": 4466, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.06507096, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_clip": 0.06291512, + "balance_loss_mlp": 0.01263105, + "epoch": 0.268570569667819, + "flos": 18739676632320.0, + "grad_norm": 10.994589827837663, + "language_loss": 0.74195564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81986099, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20324707, + "step": 4467, + "time_per_iteration": 2.4971463680267334 + }, + { + "auxiliary_loss_clip": 0.06517448, + "auxiliary_loss_mlp": 0.01283031, + "balance_loss_clip": 0.06297839, + "balance_loss_mlp": 0.01264005, + "epoch": 0.268630692920487, + "flos": 18182832324480.0, + "grad_norm": 2.2391086352503504, + "language_loss": 0.81577581, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.89378059, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19042969, + "step": 4468, + "time_per_iteration": 2.547626256942749 + }, + { + "auxiliary_loss_clip": 0.06377872, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06275174, + "balance_loss_mlp": 0.01259552, + "epoch": 0.268690816173155, + "flos": 68754229176960.0, + "grad_norm": 0.8279608156690638, + "language_loss": 0.59413958, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.67056012, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.0461731, + "step": 4469, + "time_per_iteration": 3.2565419673919678 + }, + { + "auxiliary_loss_clip": 0.06507242, + "auxiliary_loss_mlp": 0.01284548, + "balance_loss_clip": 0.06292132, + "balance_loss_mlp": 0.01263304, + "epoch": 0.26875093942582295, + "flos": 23300160877440.0, + "grad_norm": 1.9707129205098373, + "language_loss": 0.8163017, + "learning_rate": 3.431252329084972e-06, + "loss": 0.89421958, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21240234, + "step": 4470, + "time_per_iteration": 2.542893171310425 + }, + { + "auxiliary_loss_clip": 0.06497125, + "auxiliary_loss_mlp": 0.0128145, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.012619, + "epoch": 0.2688110626784909, + "flos": 21549880465920.0, + "grad_norm": 1.5945085425671264, + "language_loss": 0.83326346, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.91104919, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19555664, + "step": 4471, + "time_per_iteration": 2.5213489532470703 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01284463, + "balance_loss_clip": 0.06289607, + "balance_loss_mlp": 0.01264365, + "epoch": 0.2688711859311589, + "flos": 28407804284160.0, + "grad_norm": 1.9607526414443455, + "language_loss": 0.70046443, + "learning_rate": 3.43070815543947e-06, + "loss": 0.77828562, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.20092773, + "step": 4472, + "time_per_iteration": 2.6251678466796875 + }, + { + "auxiliary_loss_clip": 0.06504884, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 0.06293008, + "balance_loss_mlp": 0.01263112, + "epoch": 0.26893130918382685, + "flos": 26002148762880.0, + "grad_norm": 1.9293915951077794, + "language_loss": 0.68364072, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.76151299, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.19213867, + "step": 4473, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.06499921, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01259467, + "epoch": 0.2689914324364948, + "flos": 20345878748160.0, + "grad_norm": 1.608174101079712, + "language_loss": 0.83682281, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91461158, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.19470215, + "step": 4474, + "time_per_iteration": 2.554151773452759 + }, + { + "auxiliary_loss_clip": 0.06502855, + "auxiliary_loss_mlp": 0.01275806, + "balance_loss_clip": 0.06296148, + "balance_loss_mlp": 0.01256482, + "epoch": 0.2690515556891628, + "flos": 19470759505920.0, + "grad_norm": 1.847749203594977, + "language_loss": 0.70725596, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78504252, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.19348145, + "step": 4475, + "time_per_iteration": 2.5116677284240723 + }, + { + "auxiliary_loss_clip": 0.06503256, + "auxiliary_loss_mlp": 0.01277275, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01257188, + "epoch": 0.26911167894183075, + "flos": 18151875440640.0, + "grad_norm": 2.2814450019498236, + "language_loss": 0.73125452, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.80905986, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20092773, + "step": 4476, + "time_per_iteration": 3.923501968383789 + }, + { + "auxiliary_loss_clip": 0.0650249, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06291398, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2691718021944987, + "flos": 19981385487360.0, + "grad_norm": 1.4862356596427981, + "language_loss": 0.80676347, + "learning_rate": 3.429346772085922e-06, + "loss": 0.88453096, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18762207, + "step": 4477, + "time_per_iteration": 2.562681198120117 + }, + { + "auxiliary_loss_clip": 0.06506729, + "auxiliary_loss_mlp": 0.01275723, + "balance_loss_clip": 0.06289821, + "balance_loss_mlp": 0.01254873, + "epoch": 0.2692319254471667, + "flos": 37455622560000.0, + "grad_norm": 1.8507584096301994, + "language_loss": 0.65612036, + "learning_rate": 3.429074332770984e-06, + "loss": 0.73394483, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20861816, + "step": 4478, + "time_per_iteration": 2.6743321418762207 + }, + { + "auxiliary_loss_clip": 0.06505084, + "auxiliary_loss_mlp": 0.01278495, + "balance_loss_clip": 0.06291381, + "balance_loss_mlp": 0.01259242, + "epoch": 0.26929204869983464, + "flos": 22134411348480.0, + "grad_norm": 2.2415663972983864, + "language_loss": 0.81841063, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89624637, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19250488, + "step": 4479, + "time_per_iteration": 2.563365936279297 + }, + { + "auxiliary_loss_clip": 0.06510025, + "auxiliary_loss_mlp": 0.01277354, + "balance_loss_clip": 0.06295313, + "balance_loss_mlp": 0.01258305, + "epoch": 0.2693521719525026, + "flos": 19799055002880.0, + "grad_norm": 1.97047433874797, + "language_loss": 0.81362212, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.89149588, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.19055176, + "step": 4480, + "time_per_iteration": 2.505098342895508 + }, + { + "auxiliary_loss_clip": 0.06504171, + "auxiliary_loss_mlp": 0.01276381, + "balance_loss_clip": 0.06296593, + "balance_loss_mlp": 0.01257677, + "epoch": 0.2694122952051706, + "flos": 21000415317120.0, + "grad_norm": 1.6210366032838512, + "language_loss": 0.7826978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.86050338, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18713379, + "step": 4481, + "time_per_iteration": 4.100890874862671 + }, + { + "auxiliary_loss_clip": 0.06511036, + "auxiliary_loss_mlp": 0.01275006, + "balance_loss_clip": 0.06298155, + "balance_loss_mlp": 0.01254192, + "epoch": 0.2694724184578386, + "flos": 25856519166720.0, + "grad_norm": 1.8924674974759383, + "language_loss": 0.74293458, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.820795, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.20788574, + "step": 4482, + "time_per_iteration": 4.145740747451782 + }, + { + "auxiliary_loss_clip": 0.06511661, + "auxiliary_loss_mlp": 0.01276613, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01256836, + "epoch": 0.26953254171050656, + "flos": 21733594542720.0, + "grad_norm": 2.48131981073459, + "language_loss": 0.72700799, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.80489069, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19763184, + "step": 4483, + "time_per_iteration": 2.5375680923461914 + }, + { + "auxiliary_loss_clip": 0.06523035, + "auxiliary_loss_mlp": 0.01278438, + "balance_loss_clip": 0.0630566, + "balance_loss_mlp": 0.01257994, + "epoch": 0.2695926649631745, + "flos": 19689078119040.0, + "grad_norm": 2.054691934345778, + "language_loss": 0.87485874, + "learning_rate": 3.427438559239605e-06, + "loss": 0.95287347, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20446777, + "step": 4484, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.06515766, + "auxiliary_loss_mlp": 0.01278738, + "balance_loss_clip": 0.06300886, + "balance_loss_mlp": 0.01259474, + "epoch": 0.2696527882158425, + "flos": 32894257847040.0, + "grad_norm": 2.0183728032076966, + "language_loss": 0.66971946, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74766451, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19262695, + "step": 4485, + "time_per_iteration": 2.623896598815918 + }, + { + "auxiliary_loss_clip": 0.06514997, + "auxiliary_loss_mlp": 0.01282999, + "balance_loss_clip": 0.06301111, + "balance_loss_mlp": 0.01262877, + "epoch": 0.26971291146851045, + "flos": 12128806177920.0, + "grad_norm": 3.3281733059389498, + "language_loss": 0.74281263, + "learning_rate": 3.426892868256604e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2010498, + "step": 4486, + "time_per_iteration": 2.525820016860962 + }, + { + "auxiliary_loss_clip": 0.06519947, + "auxiliary_loss_mlp": 0.01289409, + "balance_loss_clip": 0.06302445, + "balance_loss_mlp": 0.01268846, + "epoch": 0.2697730347211784, + "flos": 22640467282560.0, + "grad_norm": 2.8316541967285183, + "language_loss": 0.84592897, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.92402256, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20556641, + "step": 4487, + "time_per_iteration": 3.936244249343872 + }, + { + "auxiliary_loss_clip": 0.06520635, + "auxiliary_loss_mlp": 0.01285695, + "balance_loss_clip": 0.06303369, + "balance_loss_mlp": 0.01264845, + "epoch": 0.2698331579738464, + "flos": 23519695374720.0, + "grad_norm": 2.431656191901387, + "language_loss": 0.73194599, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.81000936, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20861816, + "step": 4488, + "time_per_iteration": 2.522861957550049 + }, + { + "auxiliary_loss_clip": 0.06516892, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 0.06303044, + "balance_loss_mlp": 0.01258681, + "epoch": 0.26989328122651435, + "flos": 24647360423040.0, + "grad_norm": 1.6427618857215789, + "language_loss": 0.84162384, + "learning_rate": 3.426073925998578e-06, + "loss": 0.91957808, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.1986084, + "step": 4489, + "time_per_iteration": 2.558133602142334 + }, + { + "auxiliary_loss_clip": 0.06523076, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01265821, + "epoch": 0.2699534044791823, + "flos": 10775904554880.0, + "grad_norm": 2.0847356564254014, + "language_loss": 0.90199494, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.98009604, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.21228027, + "step": 4490, + "time_per_iteration": 2.461840867996216 + }, + { + "auxiliary_loss_clip": 0.06505966, + "auxiliary_loss_mlp": 0.01275421, + "balance_loss_clip": 0.06297465, + "balance_loss_mlp": 0.01256288, + "epoch": 0.2700135277318503, + "flos": 36180021928320.0, + "grad_norm": 2.13129158363681, + "language_loss": 0.73836827, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.81618214, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19128418, + "step": 4491, + "time_per_iteration": 2.6479640007019043 + }, + { + "auxiliary_loss_clip": 0.06516409, + "auxiliary_loss_mlp": 0.01284517, + "balance_loss_clip": 0.06303698, + "balance_loss_mlp": 0.01264788, + "epoch": 0.27007365098451824, + "flos": 17424020949120.0, + "grad_norm": 2.8438546283757793, + "language_loss": 0.74296927, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.82097852, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 4492, + "time_per_iteration": 2.462226629257202 + }, + { + "auxiliary_loss_clip": 0.06510016, + "auxiliary_loss_mlp": 0.01279369, + "balance_loss_clip": 0.06300159, + "balance_loss_mlp": 0.01259926, + "epoch": 0.2701337742371862, + "flos": 23192448053760.0, + "grad_norm": 1.7359009481863723, + "language_loss": 0.88954818, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.96744204, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19445801, + "step": 4493, + "time_per_iteration": 2.5385639667510986 + }, + { + "auxiliary_loss_clip": 0.06509903, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 0.06296834, + "balance_loss_mlp": 0.01265201, + "epoch": 0.2701938974898542, + "flos": 24396365917440.0, + "grad_norm": 1.3961943163888275, + "language_loss": 0.71571529, + "learning_rate": 3.424707940835998e-06, + "loss": 0.79365045, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1842041, + "step": 4494, + "time_per_iteration": 2.542644500732422 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01282381, + "balance_loss_clip": 0.0629191, + "balance_loss_mlp": 0.01263713, + "epoch": 0.2702540207425222, + "flos": 26221641333120.0, + "grad_norm": 2.6689304552375366, + "language_loss": 0.8697859, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94760156, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.18652344, + "step": 4495, + "time_per_iteration": 2.6052844524383545 + }, + { + "auxiliary_loss_clip": 0.06507061, + "auxiliary_loss_mlp": 0.01284126, + "balance_loss_clip": 0.06293719, + "balance_loss_mlp": 0.01263944, + "epoch": 0.27031414399519016, + "flos": 22932439234560.0, + "grad_norm": 1.7866659337876034, + "language_loss": 0.76608586, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84399772, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20166016, + "step": 4496, + "time_per_iteration": 2.5191855430603027 + }, + { + "auxiliary_loss_clip": 0.06445029, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06340651, + "balance_loss_mlp": 0.01257498, + "epoch": 0.2703742672478581, + "flos": 63037904912640.0, + "grad_norm": 0.6591771406427821, + "language_loss": 0.49976462, + "learning_rate": 3.423887701354754e-06, + "loss": 0.57683551, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.0456543, + "step": 4497, + "time_per_iteration": 3.2403736114501953 + }, + { + "auxiliary_loss_clip": 0.06506558, + "auxiliary_loss_mlp": 0.01283587, + "balance_loss_clip": 0.06295481, + "balance_loss_mlp": 0.01266039, + "epoch": 0.2704343905005261, + "flos": 18846341280000.0, + "grad_norm": 2.8639988273107657, + "language_loss": 0.72431815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80221957, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17553711, + "step": 4498, + "time_per_iteration": 2.509298086166382 + }, + { + "auxiliary_loss_clip": 0.06432115, + "auxiliary_loss_mlp": 0.01259251, + "balance_loss_clip": 0.06327531, + "balance_loss_mlp": 0.01254679, + "epoch": 0.27049451375319405, + "flos": 71253635817600.0, + "grad_norm": 0.9422572009255263, + "language_loss": 0.5900467, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.66696036, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04577637, + "step": 4499, + "time_per_iteration": 3.2116270065307617 + }, + { + "auxiliary_loss_clip": 0.06502165, + "auxiliary_loss_mlp": 0.01281307, + "balance_loss_clip": 0.06292122, + "balance_loss_mlp": 0.01261422, + "epoch": 0.270554637005862, + "flos": 24285257003520.0, + "grad_norm": 2.589715304320551, + "language_loss": 0.73975158, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19897461, + "step": 4500, + "time_per_iteration": 2.537710189819336 + }, + { + "auxiliary_loss_clip": 0.06501484, + "auxiliary_loss_mlp": 0.01276741, + "balance_loss_clip": 0.06289591, + "balance_loss_mlp": 0.01257965, + "epoch": 0.27061476025853, + "flos": 17636889047040.0, + "grad_norm": 2.788947169536346, + "language_loss": 0.81470346, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.89248574, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18774414, + "step": 4501, + "time_per_iteration": 2.5423648357391357 + }, + { + "auxiliary_loss_clip": 0.06510358, + "auxiliary_loss_mlp": 0.01287368, + "balance_loss_clip": 0.06294559, + "balance_loss_mlp": 0.01267579, + "epoch": 0.27067488351119795, + "flos": 22716594316800.0, + "grad_norm": 1.5278818221734496, + "language_loss": 0.7303015, + "learning_rate": 3.422519555811735e-06, + "loss": 0.8082788, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.19775391, + "step": 4502, + "time_per_iteration": 2.5804011821746826 + }, + { + "auxiliary_loss_clip": 0.06507368, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06289332, + "balance_loss_mlp": 0.01258576, + "epoch": 0.2707350067638659, + "flos": 41729333806080.0, + "grad_norm": 1.6949775973694576, + "language_loss": 0.69090897, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.76876605, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19763184, + "step": 4503, + "time_per_iteration": 2.740292549133301 + }, + { + "auxiliary_loss_clip": 0.06502387, + "auxiliary_loss_mlp": 0.0128307, + "balance_loss_clip": 0.06290283, + "balance_loss_mlp": 0.01263746, + "epoch": 0.2707951300165339, + "flos": 20199159048960.0, + "grad_norm": 1.9752400870870641, + "language_loss": 0.69172543, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.76958001, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1932373, + "step": 4504, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.06502561, + "auxiliary_loss_mlp": 0.0128216, + "balance_loss_clip": 0.06291538, + "balance_loss_mlp": 0.01263492, + "epoch": 0.27085525326920185, + "flos": 21440364779520.0, + "grad_norm": 2.9855030089462993, + "language_loss": 0.76122642, + "learning_rate": 3.421698021097902e-06, + "loss": 0.8390736, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18652344, + "step": 4505, + "time_per_iteration": 2.527165651321411 + }, + { + "auxiliary_loss_clip": 0.06505956, + "auxiliary_loss_mlp": 0.0128432, + "balance_loss_clip": 0.06289993, + "balance_loss_mlp": 0.01264459, + "epoch": 0.2709153765218698, + "flos": 17680885240320.0, + "grad_norm": 2.0693026918396487, + "language_loss": 0.73959178, + "learning_rate": 3.42142406835758e-06, + "loss": 0.81749451, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1986084, + "step": 4506, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.0650361, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 0.06290495, + "balance_loss_mlp": 0.01258595, + "epoch": 0.2709754997745378, + "flos": 24462136972800.0, + "grad_norm": 1.8128724600792683, + "language_loss": 0.81647539, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89429414, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1965332, + "step": 4507, + "time_per_iteration": 2.684535503387451 + }, + { + "auxiliary_loss_clip": 0.06395597, + "auxiliary_loss_mlp": 0.01254395, + "balance_loss_clip": 0.0629042, + "balance_loss_mlp": 0.01250205, + "epoch": 0.2710356230272058, + "flos": 65229602232960.0, + "grad_norm": 0.712447813073055, + "language_loss": 0.50718415, + "learning_rate": 3.420876001185698e-06, + "loss": 0.58368409, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04193115, + "step": 4508, + "time_per_iteration": 3.111752986907959 + }, + { + "auxiliary_loss_clip": 0.0649793, + "auxiliary_loss_mlp": 0.01272465, + "balance_loss_clip": 0.06289998, + "balance_loss_mlp": 0.01255263, + "epoch": 0.27109574627987376, + "flos": 25491606635520.0, + "grad_norm": 2.0258218163980213, + "language_loss": 0.75015354, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.82785749, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.171875, + "step": 4509, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.06495094, + "auxiliary_loss_mlp": 0.01275639, + "balance_loss_clip": 0.06289092, + "balance_loss_mlp": 0.01256947, + "epoch": 0.2711558695325417, + "flos": 19688910410880.0, + "grad_norm": 2.3712253737099767, + "language_loss": 0.71864915, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.79635644, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18688965, + "step": 4510, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06499062, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.0629103, + "balance_loss_mlp": 0.012608, + "epoch": 0.2712159927852097, + "flos": 18593627765760.0, + "grad_norm": 2.5496745820614515, + "language_loss": 0.71357799, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.791363, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.1862793, + "step": 4511, + "time_per_iteration": 2.483739137649536 + }, + { + "auxiliary_loss_clip": 0.06505338, + "auxiliary_loss_mlp": 0.01274141, + "balance_loss_clip": 0.06292383, + "balance_loss_mlp": 0.01254817, + "epoch": 0.27127611603787766, + "flos": 25637403939840.0, + "grad_norm": 1.9202075405224084, + "language_loss": 0.81604505, + "learning_rate": 3.419779220367979e-06, + "loss": 0.89383984, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1932373, + "step": 4512, + "time_per_iteration": 2.593388795852661 + }, + { + "auxiliary_loss_clip": 0.06503928, + "auxiliary_loss_mlp": 0.01273233, + "balance_loss_clip": 0.06296667, + "balance_loss_mlp": 0.01255554, + "epoch": 0.2713362392905456, + "flos": 23155663311360.0, + "grad_norm": 1.8072498717910284, + "language_loss": 0.809147, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88691866, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.17663574, + "step": 4513, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.06501831, + "auxiliary_loss_mlp": 0.01278947, + "balance_loss_clip": 0.0628939, + "balance_loss_mlp": 0.01261018, + "epoch": 0.2713963625432136, + "flos": 18371409937920.0, + "grad_norm": 3.81368034370299, + "language_loss": 0.88867396, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.96648169, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17932129, + "step": 4514, + "time_per_iteration": 2.54484224319458 + }, + { + "auxiliary_loss_clip": 0.06502509, + "auxiliary_loss_mlp": 0.01277056, + "balance_loss_clip": 0.06292502, + "balance_loss_mlp": 0.01258709, + "epoch": 0.27145648579588155, + "flos": 22498275703680.0, + "grad_norm": 1.610354502574947, + "language_loss": 0.92402363, + "learning_rate": 3.418956069417517e-06, + "loss": 1.00181937, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18347168, + "step": 4515, + "time_per_iteration": 2.5121350288391113 + }, + { + "auxiliary_loss_clip": 0.06511631, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06296228, + "balance_loss_mlp": 0.01259669, + "epoch": 0.2715166090485495, + "flos": 19244265120000.0, + "grad_norm": 2.423654901761582, + "language_loss": 0.73979908, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.81772685, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21435547, + "step": 4516, + "time_per_iteration": 3.917318344116211 + }, + { + "auxiliary_loss_clip": 0.06498563, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289151, + "balance_loss_mlp": 0.01253627, + "epoch": 0.2715767323012175, + "flos": 17714902798080.0, + "grad_norm": 1.854313921742246, + "language_loss": 0.76927733, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84699214, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19287109, + "step": 4517, + "time_per_iteration": 2.576723098754883 + }, + { + "auxiliary_loss_clip": 0.06500702, + "auxiliary_loss_mlp": 0.01276287, + "balance_loss_clip": 0.06291518, + "balance_loss_mlp": 0.01256701, + "epoch": 0.27163685555388545, + "flos": 22389430849920.0, + "grad_norm": 2.0334929641517956, + "language_loss": 0.7833634, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.86113334, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19592285, + "step": 4518, + "time_per_iteration": 2.5335004329681396 + }, + { + "auxiliary_loss_clip": 0.06502728, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06292961, + "balance_loss_mlp": 0.0125925, + "epoch": 0.2716969788065534, + "flos": 22353358867200.0, + "grad_norm": 1.6261203259974584, + "language_loss": 0.68873644, + "learning_rate": 3.41785778156811e-06, + "loss": 0.76653063, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17443848, + "step": 4519, + "time_per_iteration": 2.60939359664917 + }, + { + "auxiliary_loss_clip": 0.06500532, + "auxiliary_loss_mlp": 0.0127723, + "balance_loss_clip": 0.06291862, + "balance_loss_mlp": 0.01260302, + "epoch": 0.2717571020592214, + "flos": 25235497031040.0, + "grad_norm": 1.9620818548787327, + "language_loss": 0.75925875, + "learning_rate": 3.417583075166451e-06, + "loss": 0.83703637, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16931152, + "step": 4520, + "time_per_iteration": 3.988518238067627 + }, + { + "auxiliary_loss_clip": 0.06503896, + "auxiliary_loss_mlp": 0.012736, + "balance_loss_clip": 0.06291716, + "balance_loss_mlp": 0.01253942, + "epoch": 0.2718172253118894, + "flos": 20195343688320.0, + "grad_norm": 3.05783023991908, + "language_loss": 0.76690799, + "learning_rate": 3.4173083150099e-06, + "loss": 0.84468293, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1965332, + "step": 4521, + "time_per_iteration": 3.9463987350463867 + }, + { + "auxiliary_loss_clip": 0.0650706, + "auxiliary_loss_mlp": 0.0127528, + "balance_loss_clip": 0.06291709, + "balance_loss_mlp": 0.01255432, + "epoch": 0.27187734856455736, + "flos": 14324318858880.0, + "grad_norm": 2.0792585055499435, + "language_loss": 0.74927616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.82709956, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19824219, + "step": 4522, + "time_per_iteration": 2.576792001724243 + }, + { + "auxiliary_loss_clip": 0.06503602, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06291734, + "balance_loss_mlp": 0.01258884, + "epoch": 0.27193747181722533, + "flos": 21114375269760.0, + "grad_norm": 1.7974712998396492, + "language_loss": 0.73055947, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80836433, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17993164, + "step": 4523, + "time_per_iteration": 2.5116758346557617 + }, + { + "auxiliary_loss_clip": 0.06493908, + "auxiliary_loss_mlp": 0.01278011, + "balance_loss_clip": 0.06286807, + "balance_loss_mlp": 0.01259665, + "epoch": 0.2719975950698933, + "flos": 19688910410880.0, + "grad_norm": 1.3231652709358832, + "language_loss": 0.74779463, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82551384, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.18334961, + "step": 4524, + "time_per_iteration": 2.5318901538848877 + }, + { + "auxiliary_loss_clip": 0.06503987, + "auxiliary_loss_mlp": 0.01277059, + "balance_loss_clip": 0.06291917, + "balance_loss_mlp": 0.01258248, + "epoch": 0.27205771832256126, + "flos": 24761488083840.0, + "grad_norm": 2.222226091972884, + "language_loss": 0.76783192, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.84564239, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18811035, + "step": 4525, + "time_per_iteration": 2.594209909439087 + }, + { + "auxiliary_loss_clip": 0.06492639, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.0628486, + "balance_loss_mlp": 0.01254712, + "epoch": 0.2721178415752292, + "flos": 21760903774080.0, + "grad_norm": 1.8877793172534498, + "language_loss": 0.82166058, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.89930463, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17041016, + "step": 4526, + "time_per_iteration": 3.9739785194396973 + }, + { + "auxiliary_loss_clip": 0.06510428, + "auxiliary_loss_mlp": 0.01273954, + "balance_loss_clip": 0.06292043, + "balance_loss_mlp": 0.01254189, + "epoch": 0.2721779648278972, + "flos": 12681667416960.0, + "grad_norm": 2.608637418907724, + "language_loss": 0.77407986, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.8519237, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19763184, + "step": 4527, + "time_per_iteration": 2.5017969608306885 + }, + { + "auxiliary_loss_clip": 0.06502572, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.0629287, + "balance_loss_mlp": 0.01260194, + "epoch": 0.27223808808056515, + "flos": 16258774544640.0, + "grad_norm": 2.1231016049423608, + "language_loss": 0.82676923, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90457952, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18249512, + "step": 4528, + "time_per_iteration": 2.5011186599731445 + }, + { + "auxiliary_loss_clip": 0.06500327, + "auxiliary_loss_mlp": 0.012781, + "balance_loss_clip": 0.06293638, + "balance_loss_mlp": 0.01260064, + "epoch": 0.2722982113332331, + "flos": 27753225788160.0, + "grad_norm": 1.6573852241711216, + "language_loss": 0.77553773, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85332191, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18041992, + "step": 4529, + "time_per_iteration": 2.5810396671295166 + }, + { + "auxiliary_loss_clip": 0.06499013, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01259055, + "epoch": 0.2723583345859011, + "flos": 21732756001920.0, + "grad_norm": 2.1115027178358354, + "language_loss": 0.82665265, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.90441489, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18164062, + "step": 4530, + "time_per_iteration": 2.586454391479492 + }, + { + "auxiliary_loss_clip": 0.06502904, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06295159, + "balance_loss_mlp": 0.0126379, + "epoch": 0.27241845783856905, + "flos": 17352925159680.0, + "grad_norm": 2.154635693147181, + "language_loss": 0.92694783, + "learning_rate": 3.4145577592184838e-06, + "loss": 1.0048002, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18530273, + "step": 4531, + "time_per_iteration": 2.5160703659057617 + }, + { + "auxiliary_loss_clip": 0.06501545, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01257928, + "epoch": 0.272478581091237, + "flos": 24761278448640.0, + "grad_norm": 1.903467624841223, + "language_loss": 0.76781744, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84559143, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17919922, + "step": 4532, + "time_per_iteration": 2.568319082260132 + }, + { + "auxiliary_loss_clip": 0.06500092, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.0125448, + "epoch": 0.272538704343905, + "flos": 17895723909120.0, + "grad_norm": 2.5230523304945685, + "language_loss": 0.89717656, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97489792, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17565918, + "step": 4533, + "time_per_iteration": 2.538637399673462 + }, + { + "auxiliary_loss_clip": 0.06497633, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06294405, + "balance_loss_mlp": 0.01255559, + "epoch": 0.272598827596573, + "flos": 22939021779840.0, + "grad_norm": 1.9282389689502992, + "language_loss": 0.72213519, + "learning_rate": 3.413731546022929e-06, + "loss": 0.79983306, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16589355, + "step": 4534, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.06500763, + "auxiliary_loss_mlp": 0.01275564, + "balance_loss_clip": 0.06290451, + "balance_loss_mlp": 0.01255847, + "epoch": 0.27265895084924097, + "flos": 24244447265280.0, + "grad_norm": 1.8514773269853142, + "language_loss": 0.91784394, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99560714, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.19702148, + "step": 4535, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.06506651, + "auxiliary_loss_mlp": 0.01276542, + "balance_loss_clip": 0.06297188, + "balance_loss_mlp": 0.01258768, + "epoch": 0.27271907410190893, + "flos": 27019962708480.0, + "grad_norm": 1.7799258806344853, + "language_loss": 0.73195565, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80978757, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.17773438, + "step": 4536, + "time_per_iteration": 2.5590782165527344 + }, + { + "auxiliary_loss_clip": 0.06502935, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.0629502, + "balance_loss_mlp": 0.01257351, + "epoch": 0.2727791973545769, + "flos": 34460027568000.0, + "grad_norm": 1.8462150885541477, + "language_loss": 0.72167033, + "learning_rate": 3.41290485034781e-06, + "loss": 0.79945225, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17907715, + "step": 4537, + "time_per_iteration": 2.680515766143799 + }, + { + "auxiliary_loss_clip": 0.06501988, + "auxiliary_loss_mlp": 0.01276469, + "balance_loss_clip": 0.06293489, + "balance_loss_mlp": 0.0125829, + "epoch": 0.27283932060724486, + "flos": 15045842367360.0, + "grad_norm": 2.3888098238231503, + "language_loss": 0.78421736, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8620019, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.1817627, + "step": 4538, + "time_per_iteration": 2.4626059532165527 + }, + { + "auxiliary_loss_clip": 0.06506806, + "auxiliary_loss_mlp": 0.01275863, + "balance_loss_clip": 0.06298484, + "balance_loss_mlp": 0.01258566, + "epoch": 0.2728994438599128, + "flos": 21658767246720.0, + "grad_norm": 1.6357140094020364, + "language_loss": 0.90640903, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9842357, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17297363, + "step": 4539, + "time_per_iteration": 2.5629584789276123 + }, + { + "auxiliary_loss_clip": 0.06501281, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06294584, + "balance_loss_mlp": 0.01253778, + "epoch": 0.2729595671125808, + "flos": 17493313875840.0, + "grad_norm": 1.7229738452441967, + "language_loss": 0.88610893, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.96385098, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.19140625, + "step": 4540, + "time_per_iteration": 2.4959304332733154 + }, + { + "auxiliary_loss_clip": 0.06504017, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06294081, + "balance_loss_mlp": 0.0125744, + "epoch": 0.27301969036524876, + "flos": 19324249441920.0, + "grad_norm": 2.2191409784662, + "language_loss": 0.8242712, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.17053223, + "step": 4541, + "time_per_iteration": 2.550239086151123 + }, + { + "auxiliary_loss_clip": 0.06500127, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06291916, + "balance_loss_mlp": 0.01260431, + "epoch": 0.2730798136179167, + "flos": 21071427252480.0, + "grad_norm": 2.3060281935178795, + "language_loss": 0.80131608, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18664551, + "step": 4542, + "time_per_iteration": 2.519717216491699 + }, + { + "auxiliary_loss_clip": 0.06509651, + "auxiliary_loss_mlp": 0.01276731, + "balance_loss_clip": 0.06301565, + "balance_loss_mlp": 0.01258599, + "epoch": 0.2731399368705847, + "flos": 19177739377920.0, + "grad_norm": 1.9524817452008785, + "language_loss": 0.89606124, + "learning_rate": 3.411250012687582e-06, + "loss": 0.97392499, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18139648, + "step": 4543, + "time_per_iteration": 2.5182156562805176 + }, + { + "auxiliary_loss_clip": 0.06509942, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06297313, + "balance_loss_mlp": 0.012604, + "epoch": 0.27320006012325265, + "flos": 18294989414400.0, + "grad_norm": 2.101118642115193, + "language_loss": 0.64112943, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7190212, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.18823242, + "step": 4544, + "time_per_iteration": 2.482348918914795 + }, + { + "auxiliary_loss_clip": 0.06504791, + "auxiliary_loss_mlp": 0.01282982, + "balance_loss_clip": 0.06296986, + "balance_loss_mlp": 0.01264231, + "epoch": 0.2732601833759206, + "flos": 34869607125120.0, + "grad_norm": 1.6845842729353224, + "language_loss": 0.70290005, + "learning_rate": 3.410697971904651e-06, + "loss": 0.78077781, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.1875, + "step": 4545, + "time_per_iteration": 2.6779940128326416 + }, + { + "auxiliary_loss_clip": 0.06375119, + "auxiliary_loss_mlp": 0.01256033, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01252296, + "epoch": 0.2733203066285886, + "flos": 53929514534400.0, + "grad_norm": 0.7176798913576009, + "language_loss": 0.61676908, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6930806, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03729248, + "step": 4546, + "time_per_iteration": 3.1508243083953857 + }, + { + "auxiliary_loss_clip": 0.06510071, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06301852, + "balance_loss_mlp": 0.01258843, + "epoch": 0.2733804298812566, + "flos": 20665411493760.0, + "grad_norm": 1.9095347334938924, + "language_loss": 0.65170372, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72958136, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.1887207, + "step": 4547, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.06498976, + "auxiliary_loss_mlp": 0.0127425, + "balance_loss_clip": 0.06296893, + "balance_loss_mlp": 0.01257799, + "epoch": 0.27344055313392457, + "flos": 25891333338240.0, + "grad_norm": 2.438857151480637, + "language_loss": 0.78365928, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.86139154, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.16455078, + "step": 4548, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.0650417, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06295689, + "balance_loss_mlp": 0.01259785, + "epoch": 0.27350067638659253, + "flos": 22936380376320.0, + "grad_norm": 2.3129649243249157, + "language_loss": 0.83350241, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91131258, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17053223, + "step": 4549, + "time_per_iteration": 2.560349941253662 + }, + { + "auxiliary_loss_clip": 0.06503863, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06292209, + "balance_loss_mlp": 0.0125707, + "epoch": 0.2735607996392605, + "flos": 16579313539200.0, + "grad_norm": 2.1355332193902568, + "language_loss": 0.71687186, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.79468852, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.20727539, + "step": 4550, + "time_per_iteration": 2.4829771518707275 + }, + { + "auxiliary_loss_clip": 0.06503724, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06298332, + "balance_loss_mlp": 0.01253435, + "epoch": 0.27362092289192846, + "flos": 19651245200640.0, + "grad_norm": 2.4590448673698546, + "language_loss": 0.79561722, + "learning_rate": 3.409040566039563e-06, + "loss": 0.87337267, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.18371582, + "step": 4551, + "time_per_iteration": 2.5074269771575928 + }, + { + "auxiliary_loss_clip": 0.06500211, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 0.06290769, + "balance_loss_mlp": 0.01263565, + "epoch": 0.27368104614459643, + "flos": 17644855184640.0, + "grad_norm": 2.2858009613836465, + "language_loss": 0.71362597, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.79144663, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.18286133, + "step": 4552, + "time_per_iteration": 2.478208541870117 + }, + { + "auxiliary_loss_clip": 0.0650662, + "auxiliary_loss_mlp": 0.01277463, + "balance_loss_clip": 0.06295393, + "balance_loss_mlp": 0.01258759, + "epoch": 0.2737411693972644, + "flos": 21586455573120.0, + "grad_norm": 1.8660820035104149, + "language_loss": 0.71756262, + "learning_rate": 3.408487669858431e-06, + "loss": 0.79540348, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18701172, + "step": 4553, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.0650337, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06293483, + "balance_loss_mlp": 0.01255738, + "epoch": 0.27380129264993236, + "flos": 25491145438080.0, + "grad_norm": 1.7561499880950933, + "language_loss": 0.60065031, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.67843306, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.19177246, + "step": 4554, + "time_per_iteration": 2.5836522579193115 + }, + { + "auxiliary_loss_clip": 0.06509934, + "auxiliary_loss_mlp": 0.01281174, + "balance_loss_clip": 0.06291255, + "balance_loss_mlp": 0.01261838, + "epoch": 0.2738614159026003, + "flos": 18667155323520.0, + "grad_norm": 1.5632450212680145, + "language_loss": 0.74850649, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.82641757, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1932373, + "step": 4555, + "time_per_iteration": 3.9590039253234863 + }, + { + "auxiliary_loss_clip": 0.06511028, + "auxiliary_loss_mlp": 0.01279514, + "balance_loss_clip": 0.0629926, + "balance_loss_mlp": 0.0125982, + "epoch": 0.2739215391552683, + "flos": 23483874954240.0, + "grad_norm": 6.994475758797384, + "language_loss": 0.7822473, + "learning_rate": 3.407657925038002e-06, + "loss": 0.86015272, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19677734, + "step": 4556, + "time_per_iteration": 2.5688674449920654 + }, + { + "auxiliary_loss_clip": 0.06517123, + "auxiliary_loss_mlp": 0.01280796, + "balance_loss_clip": 0.06293104, + "balance_loss_mlp": 0.01260125, + "epoch": 0.27398166240793626, + "flos": 17134313057280.0, + "grad_norm": 1.8677949115203087, + "language_loss": 0.83077759, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.90875673, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.20690918, + "step": 4557, + "time_per_iteration": 2.490562915802002 + }, + { + "auxiliary_loss_clip": 0.06504503, + "auxiliary_loss_mlp": 0.01276773, + "balance_loss_clip": 0.06292793, + "balance_loss_mlp": 0.01256292, + "epoch": 0.2740417856606042, + "flos": 23411563280640.0, + "grad_norm": 1.9738441909854203, + "language_loss": 0.73066616, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.80847895, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.20483398, + "step": 4558, + "time_per_iteration": 2.5761232376098633 + }, + { + "auxiliary_loss_clip": 0.06504066, + "auxiliary_loss_mlp": 0.01276845, + "balance_loss_clip": 0.06292865, + "balance_loss_mlp": 0.01256651, + "epoch": 0.2741019089132722, + "flos": 12784307068800.0, + "grad_norm": 2.149984670873407, + "language_loss": 0.68751299, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76532209, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.2019043, + "step": 4559, + "time_per_iteration": 2.4976439476013184 + }, + { + "auxiliary_loss_clip": 0.06501673, + "auxiliary_loss_mlp": 0.01278249, + "balance_loss_clip": 0.0629222, + "balance_loss_mlp": 0.01259676, + "epoch": 0.27416203216594015, + "flos": 20637850700160.0, + "grad_norm": 1.7403202614473876, + "language_loss": 0.72741163, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.80521083, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18566895, + "step": 4560, + "time_per_iteration": 4.005557537078857 + }, + { + "auxiliary_loss_clip": 0.06501405, + "auxiliary_loss_mlp": 0.01278052, + "balance_loss_clip": 0.06289977, + "balance_loss_mlp": 0.01259718, + "epoch": 0.27422215541860817, + "flos": 26548762872960.0, + "grad_norm": 1.7791790627265829, + "language_loss": 0.82245278, + "learning_rate": 3.406273949573303e-06, + "loss": 0.90024734, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18334961, + "step": 4561, + "time_per_iteration": 4.059048652648926 + }, + { + "auxiliary_loss_clip": 0.06510133, + "auxiliary_loss_mlp": 0.01276094, + "balance_loss_clip": 0.06296331, + "balance_loss_mlp": 0.012564, + "epoch": 0.27428227867127614, + "flos": 23337868014720.0, + "grad_norm": 1.9098162884662422, + "language_loss": 0.75760031, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83546257, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19702148, + "step": 4562, + "time_per_iteration": 2.558397054672241 + }, + { + "auxiliary_loss_clip": 0.06506505, + "auxiliary_loss_mlp": 0.01277189, + "balance_loss_clip": 0.06293164, + "balance_loss_mlp": 0.01258092, + "epoch": 0.2743424019239441, + "flos": 23041074453120.0, + "grad_norm": 1.577834756327151, + "language_loss": 0.75198597, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.8298229, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19091797, + "step": 4563, + "time_per_iteration": 2.5698354244232178 + }, + { + "auxiliary_loss_clip": 0.06524341, + "auxiliary_loss_mlp": 0.01283879, + "balance_loss_clip": 0.06305183, + "balance_loss_mlp": 0.01262302, + "epoch": 0.27440252517661207, + "flos": 21987565868160.0, + "grad_norm": 2.0193615345580085, + "language_loss": 0.6348893, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.71297145, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21569824, + "step": 4564, + "time_per_iteration": 2.545741558074951 + }, + { + "auxiliary_loss_clip": 0.06513885, + "auxiliary_loss_mlp": 0.01280066, + "balance_loss_clip": 0.06299828, + "balance_loss_mlp": 0.01260647, + "epoch": 0.27446264842928003, + "flos": 40196952737280.0, + "grad_norm": 2.2005709679787153, + "language_loss": 0.7878077, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.86574721, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 4565, + "time_per_iteration": 2.7061169147491455 + }, + { + "auxiliary_loss_clip": 0.0650921, + "auxiliary_loss_mlp": 0.01277346, + "balance_loss_clip": 0.06296623, + "balance_loss_mlp": 0.01257903, + "epoch": 0.274522771681948, + "flos": 13484684620800.0, + "grad_norm": 1.9604173340299715, + "language_loss": 0.69729757, + "learning_rate": 3.404888640957477e-06, + "loss": 0.77516317, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19458008, + "step": 4566, + "time_per_iteration": 3.9156126976013184 + }, + { + "auxiliary_loss_clip": 0.06511474, + "auxiliary_loss_mlp": 0.0128161, + "balance_loss_clip": 0.06300822, + "balance_loss_mlp": 0.0126318, + "epoch": 0.27458289493461596, + "flos": 28629812476800.0, + "grad_norm": 1.605297231279352, + "language_loss": 0.61699307, + "learning_rate": 3.404611419371723e-06, + "loss": 0.69492388, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18432617, + "step": 4567, + "time_per_iteration": 2.5721306800842285 + }, + { + "auxiliary_loss_clip": 0.06514515, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06299441, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2746430181872839, + "flos": 20125883053440.0, + "grad_norm": 1.9422441687055725, + "language_loss": 0.83055782, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.90845764, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19970703, + "step": 4568, + "time_per_iteration": 2.5616700649261475 + }, + { + "auxiliary_loss_clip": 0.06521738, + "auxiliary_loss_mlp": 0.01275653, + "balance_loss_clip": 0.06304733, + "balance_loss_mlp": 0.01255709, + "epoch": 0.2747031414399519, + "flos": 20199662173440.0, + "grad_norm": 2.1285143693034367, + "language_loss": 0.6896143, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.76758814, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19934082, + "step": 4569, + "time_per_iteration": 2.531096935272217 + }, + { + "auxiliary_loss_clip": 0.06517979, + "auxiliary_loss_mlp": 0.01281496, + "balance_loss_clip": 0.06303072, + "balance_loss_mlp": 0.0126216, + "epoch": 0.27476326469261986, + "flos": 13521385509120.0, + "grad_norm": 2.4613635331126926, + "language_loss": 0.71897286, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.79696763, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19360352, + "step": 4570, + "time_per_iteration": 2.5235774517059326 + }, + { + "auxiliary_loss_clip": 0.06414898, + "auxiliary_loss_mlp": 0.01257276, + "balance_loss_clip": 0.06312878, + "balance_loss_mlp": 0.01253897, + "epoch": 0.2748233879452878, + "flos": 65955486153600.0, + "grad_norm": 0.6977768363268191, + "language_loss": 0.5577414, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.63446319, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03387451, + "step": 4571, + "time_per_iteration": 3.234433889389038 + }, + { + "auxiliary_loss_clip": 0.06526154, + "auxiliary_loss_mlp": 0.01279423, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01260326, + "epoch": 0.2748835111979558, + "flos": 17389961464320.0, + "grad_norm": 2.165338105639142, + "language_loss": 0.78105313, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.85910892, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19104004, + "step": 4572, + "time_per_iteration": 2.562450647354126 + }, + { + "auxiliary_loss_clip": 0.06506811, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01261711, + "epoch": 0.27494363445062375, + "flos": 23594480743680.0, + "grad_norm": 2.0912194071895014, + "language_loss": 0.81855798, + "learning_rate": 3.402946971702147e-06, + "loss": 0.89641118, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.16809082, + "step": 4573, + "time_per_iteration": 2.575467824935913 + }, + { + "auxiliary_loss_clip": 0.06512269, + "auxiliary_loss_mlp": 0.01277933, + "balance_loss_clip": 0.06303579, + "balance_loss_mlp": 0.01258585, + "epoch": 0.2750037577032918, + "flos": 17170175404800.0, + "grad_norm": 1.5550185346959569, + "language_loss": 0.79688454, + "learning_rate": 3.402669377496223e-06, + "loss": 0.87478662, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19360352, + "step": 4574, + "time_per_iteration": 2.522381067276001 + }, + { + "auxiliary_loss_clip": 0.06514049, + "auxiliary_loss_mlp": 0.012813, + "balance_loss_clip": 0.06300252, + "balance_loss_mlp": 0.01263383, + "epoch": 0.27506388095595974, + "flos": 24497663904000.0, + "grad_norm": 1.9638366231768782, + "language_loss": 0.75217533, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83012879, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.17907715, + "step": 4575, + "time_per_iteration": 2.564023971557617 + }, + { + "auxiliary_loss_clip": 0.06513455, + "auxiliary_loss_mlp": 0.01285217, + "balance_loss_clip": 0.06304657, + "balance_loss_mlp": 0.01267562, + "epoch": 0.2751240042086277, + "flos": 38774003500800.0, + "grad_norm": 1.5894976166299741, + "language_loss": 0.71788073, + "learning_rate": 3.402114029526814e-06, + "loss": 0.79586744, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17663574, + "step": 4576, + "time_per_iteration": 2.6856141090393066 + }, + { + "auxiliary_loss_clip": 0.06515673, + "auxiliary_loss_mlp": 0.01294199, + "balance_loss_clip": 0.06304252, + "balance_loss_mlp": 0.0127447, + "epoch": 0.27518412746129567, + "flos": 26914388163840.0, + "grad_norm": 1.693116107866749, + "language_loss": 0.73358452, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81168324, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19726562, + "step": 4577, + "time_per_iteration": 2.5795719623565674 + }, + { + "auxiliary_loss_clip": 0.06517484, + "auxiliary_loss_mlp": 0.01279945, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01260514, + "epoch": 0.27524425071396363, + "flos": 24907578877440.0, + "grad_norm": 1.9498672791378742, + "language_loss": 0.76234132, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84031564, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19433594, + "step": 4578, + "time_per_iteration": 2.5547378063201904 + }, + { + "auxiliary_loss_clip": 0.06518476, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01265255, + "epoch": 0.2753043739666316, + "flos": 26295504307200.0, + "grad_norm": 1.3718100748583155, + "language_loss": 0.66504484, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.74309289, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21069336, + "step": 4579, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.06516613, + "auxiliary_loss_mlp": 0.01291851, + "balance_loss_clip": 0.06301446, + "balance_loss_mlp": 0.01271753, + "epoch": 0.27536449721929956, + "flos": 24213616162560.0, + "grad_norm": 3.1986582184359853, + "language_loss": 0.80722374, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88530838, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2010498, + "step": 4580, + "time_per_iteration": 2.571364164352417 + }, + { + "auxiliary_loss_clip": 0.06513728, + "auxiliary_loss_mlp": 0.01285107, + "balance_loss_clip": 0.06304168, + "balance_loss_mlp": 0.01264305, + "epoch": 0.27542462047196753, + "flos": 19543448522880.0, + "grad_norm": 1.580662182314359, + "language_loss": 0.68234229, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76033062, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.20788574, + "step": 4581, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.06515522, + "auxiliary_loss_mlp": 0.01276377, + "balance_loss_clip": 0.06298342, + "balance_loss_mlp": 0.01258448, + "epoch": 0.2754847437246355, + "flos": 14324360785920.0, + "grad_norm": 1.5474830525473977, + "language_loss": 0.78408682, + "learning_rate": 3.400446709916392e-06, + "loss": 0.86200583, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17919922, + "step": 4582, + "time_per_iteration": 2.511134624481201 + }, + { + "auxiliary_loss_clip": 0.06505451, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06298563, + "balance_loss_mlp": 0.01266605, + "epoch": 0.27554486697730346, + "flos": 18843951438720.0, + "grad_norm": 1.627014419094476, + "language_loss": 0.84829235, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92618936, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17663574, + "step": 4583, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.065156, + "auxiliary_loss_mlp": 0.01295136, + "balance_loss_clip": 0.0629985, + "balance_loss_mlp": 0.01274799, + "epoch": 0.2756049902299714, + "flos": 22388801944320.0, + "grad_norm": 2.5216327683147104, + "language_loss": 0.67592049, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.75402784, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20349121, + "step": 4584, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.06508277, + "auxiliary_loss_mlp": 0.01286302, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01268385, + "epoch": 0.2756651134826394, + "flos": 19580107484160.0, + "grad_norm": 1.7056038485870715, + "language_loss": 0.77640843, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8543542, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17919922, + "step": 4585, + "time_per_iteration": 2.5581910610198975 + }, + { + "auxiliary_loss_clip": 0.06520131, + "auxiliary_loss_mlp": 0.01290999, + "balance_loss_clip": 0.06302814, + "balance_loss_mlp": 0.01271151, + "epoch": 0.27572523673530736, + "flos": 23593306786560.0, + "grad_norm": 1.6012607614221503, + "language_loss": 0.72652835, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8046397, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.1986084, + "step": 4586, + "time_per_iteration": 2.5581955909729004 + }, + { + "auxiliary_loss_clip": 0.06512299, + "auxiliary_loss_mlp": 0.01283131, + "balance_loss_clip": 0.06300563, + "balance_loss_mlp": 0.01264475, + "epoch": 0.2757853599879754, + "flos": 22826696981760.0, + "grad_norm": 1.4211606049909042, + "language_loss": 0.8102116, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88816595, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18664551, + "step": 4587, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.0651072, + "auxiliary_loss_mlp": 0.01292397, + "balance_loss_clip": 0.06300361, + "balance_loss_mlp": 0.01273037, + "epoch": 0.27584548324064334, + "flos": 18557639637120.0, + "grad_norm": 2.3677019636161716, + "language_loss": 0.83699477, + "learning_rate": 3.398777478523316e-06, + "loss": 0.91502589, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.19348145, + "step": 4588, + "time_per_iteration": 2.5100526809692383 + }, + { + "auxiliary_loss_clip": 0.06502403, + "auxiliary_loss_mlp": 0.01287014, + "balance_loss_clip": 0.06294176, + "balance_loss_mlp": 0.0126856, + "epoch": 0.2759056064933113, + "flos": 23776811228160.0, + "grad_norm": 1.8520309888563375, + "language_loss": 0.76066566, + "learning_rate": 3.398499087583342e-06, + "loss": 0.83855987, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.18457031, + "step": 4589, + "time_per_iteration": 2.5906028747558594 + }, + { + "auxiliary_loss_clip": 0.06503198, + "auxiliary_loss_mlp": 0.01281135, + "balance_loss_clip": 0.06293473, + "balance_loss_mlp": 0.01261703, + "epoch": 0.27596572974597927, + "flos": 24289114291200.0, + "grad_norm": 1.7619688929899446, + "language_loss": 0.88857687, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96642017, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19421387, + "step": 4590, + "time_per_iteration": 2.5526933670043945 + }, + { + "auxiliary_loss_clip": 0.0650104, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06291595, + "balance_loss_mlp": 0.01261041, + "epoch": 0.27602585299864724, + "flos": 35049296206080.0, + "grad_norm": 1.573202994920717, + "language_loss": 0.71835011, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79615998, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.18908691, + "step": 4591, + "time_per_iteration": 2.659573554992676 + }, + { + "auxiliary_loss_clip": 0.06502488, + "auxiliary_loss_mlp": 0.01277501, + "balance_loss_clip": 0.06290874, + "balance_loss_mlp": 0.01258964, + "epoch": 0.2760859762513152, + "flos": 24315123784320.0, + "grad_norm": 2.0980893762293866, + "language_loss": 0.80327255, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.8810724, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.18530273, + "step": 4592, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06302959, + "balance_loss_mlp": 0.0126841, + "epoch": 0.27614609950398317, + "flos": 71279435675520.0, + "grad_norm": 0.6848268802880488, + "language_loss": 0.6162945, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.69306767, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03717041, + "step": 4593, + "time_per_iteration": 3.127192735671997 + }, + { + "auxiliary_loss_clip": 0.06506699, + "auxiliary_loss_mlp": 0.01276217, + "balance_loss_clip": 0.0629646, + "balance_loss_mlp": 0.01256881, + "epoch": 0.27620622275665113, + "flos": 29681811688320.0, + "grad_norm": 2.6081053554454363, + "language_loss": 0.77380788, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.85163713, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1932373, + "step": 4594, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06503148, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06295307, + "balance_loss_mlp": 0.01255138, + "epoch": 0.2762663460093191, + "flos": 15383571448320.0, + "grad_norm": 1.4453472339612206, + "language_loss": 0.9229176, + "learning_rate": 3.3968276286573866e-06, + "loss": 1.00068069, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18029785, + "step": 4595, + "time_per_iteration": 3.9466536045074463 + }, + { + "auxiliary_loss_clip": 0.06509015, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.06294905, + "balance_loss_mlp": 0.01261592, + "epoch": 0.27632646926198706, + "flos": 20710330081920.0, + "grad_norm": 1.8151181533722092, + "language_loss": 0.69491673, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.77282476, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2019043, + "step": 4596, + "time_per_iteration": 2.552893877029419 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.0629788, + "balance_loss_mlp": 0.0125382, + "epoch": 0.276386592514655, + "flos": 32820981851520.0, + "grad_norm": 1.6734752779014743, + "language_loss": 0.64091378, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.71881258, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.18652344, + "step": 4597, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.06500123, + "auxiliary_loss_mlp": 0.01279427, + "balance_loss_clip": 0.0629456, + "balance_loss_mlp": 0.01260616, + "epoch": 0.276446715767323, + "flos": 18557639637120.0, + "grad_norm": 1.8925825739150304, + "language_loss": 0.86690855, + "learning_rate": 3.395991183985887e-06, + "loss": 0.94470406, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18835449, + "step": 4598, + "time_per_iteration": 2.5411598682403564 + }, + { + "auxiliary_loss_clip": 0.0650408, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.06291056, + "balance_loss_mlp": 0.01256554, + "epoch": 0.27650683901999096, + "flos": 22826110003200.0, + "grad_norm": 2.378506410601605, + "language_loss": 0.79588032, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8736738, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18725586, + "step": 4599, + "time_per_iteration": 2.515411138534546 + }, + { + "auxiliary_loss_clip": 0.06518425, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06301137, + "balance_loss_mlp": 0.01259756, + "epoch": 0.276566962272659, + "flos": 21368011178880.0, + "grad_norm": 2.1602669865212487, + "language_loss": 0.80043805, + "learning_rate": 3.395433289506639e-06, + "loss": 0.87841463, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.19482422, + "step": 4600, + "time_per_iteration": 5.317862033843994 + }, + { + "auxiliary_loss_clip": 0.06511359, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06296661, + "balance_loss_mlp": 0.01258843, + "epoch": 0.27662708552532694, + "flos": 17716076755200.0, + "grad_norm": 12.932121146702709, + "language_loss": 0.73461431, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.81249541, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.17907715, + "step": 4601, + "time_per_iteration": 2.5192854404449463 + }, + { + "auxiliary_loss_clip": 0.0650773, + "auxiliary_loss_mlp": 0.01282643, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01263676, + "epoch": 0.2766872087779949, + "flos": 21259292106240.0, + "grad_norm": 1.833059055741047, + "language_loss": 0.8051585, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.88306224, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18981934, + "step": 4602, + "time_per_iteration": 2.635265350341797 + }, + { + "auxiliary_loss_clip": 0.06517955, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06297721, + "balance_loss_mlp": 0.01259749, + "epoch": 0.2767473320306629, + "flos": 12936728845440.0, + "grad_norm": 2.082735068257359, + "language_loss": 0.7691201, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.8470962, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.19921875, + "step": 4603, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.06506386, + "auxiliary_loss_mlp": 0.01276601, + "balance_loss_clip": 0.06300791, + "balance_loss_mlp": 0.01259017, + "epoch": 0.27680745528333084, + "flos": 15018239646720.0, + "grad_norm": 1.5173997695974415, + "language_loss": 0.81704807, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89487797, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17578125, + "step": 4604, + "time_per_iteration": 2.5022366046905518 + }, + { + "auxiliary_loss_clip": 0.06510165, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 0.06295862, + "balance_loss_mlp": 0.01261367, + "epoch": 0.2768675785359988, + "flos": 22644408424320.0, + "grad_norm": 1.8407701121062605, + "language_loss": 0.70736969, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.78526795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.18310547, + "step": 4605, + "time_per_iteration": 4.068409442901611 + }, + { + "auxiliary_loss_clip": 0.06402105, + "auxiliary_loss_mlp": 0.01269906, + "balance_loss_clip": 0.0629937, + "balance_loss_mlp": 0.01266097, + "epoch": 0.27692770178866677, + "flos": 66150772093440.0, + "grad_norm": 0.7075303746126435, + "language_loss": 0.57218695, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.64890707, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.0380249, + "step": 4606, + "time_per_iteration": 3.269275426864624 + }, + { + "auxiliary_loss_clip": 0.06516754, + "auxiliary_loss_mlp": 0.01286288, + "balance_loss_clip": 0.06299627, + "balance_loss_mlp": 0.01266118, + "epoch": 0.27698782504133473, + "flos": 26471545735680.0, + "grad_norm": 1.9632725808751148, + "language_loss": 0.69427574, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77230614, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20153809, + "step": 4607, + "time_per_iteration": 2.566908836364746 + }, + { + "auxiliary_loss_clip": 0.06512889, + "auxiliary_loss_mlp": 0.01276778, + "balance_loss_clip": 0.06304939, + "balance_loss_mlp": 0.01258849, + "epoch": 0.2770479482940027, + "flos": 25891878389760.0, + "grad_norm": 1.6636880421304368, + "language_loss": 0.70338356, + "learning_rate": 3.393199595837555e-06, + "loss": 0.78128028, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17919922, + "step": 4608, + "time_per_iteration": 2.709989309310913 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 0.06298438, + "balance_loss_mlp": 0.01260781, + "epoch": 0.27710807154667066, + "flos": 22863942921600.0, + "grad_norm": 1.8326330841759049, + "language_loss": 0.73323762, + "learning_rate": 3.392920146281499e-06, + "loss": 0.81118023, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.18725586, + "step": 4609, + "time_per_iteration": 2.530625581741333 + }, + { + "auxiliary_loss_clip": 0.06522895, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.0125749, + "epoch": 0.27716819479933863, + "flos": 17716621806720.0, + "grad_norm": 2.1915868475112714, + "language_loss": 0.84688777, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92488557, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19396973, + "step": 4610, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.06521606, + "auxiliary_loss_mlp": 0.01280928, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260054, + "epoch": 0.2772283180520066, + "flos": 19652125668480.0, + "grad_norm": 1.9738462991775114, + "language_loss": 0.69718874, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.77521408, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20874023, + "step": 4611, + "time_per_iteration": 2.5499660968780518 + }, + { + "auxiliary_loss_clip": 0.0651576, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06309414, + "balance_loss_mlp": 0.01254997, + "epoch": 0.27728844130467456, + "flos": 21038960995200.0, + "grad_norm": 1.8677227151172762, + "language_loss": 0.74507141, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82296044, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18151855, + "step": 4612, + "time_per_iteration": 2.567218065261841 + }, + { + "auxiliary_loss_clip": 0.06522087, + "auxiliary_loss_mlp": 0.01282319, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01263067, + "epoch": 0.2773485645573425, + "flos": 18995157331200.0, + "grad_norm": 2.3882423035535063, + "language_loss": 0.67084455, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.19250488, + "step": 4613, + "time_per_iteration": 2.5458126068115234 + }, + { + "auxiliary_loss_clip": 0.06515062, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06304698, + "balance_loss_mlp": 0.0125577, + "epoch": 0.27740868781001055, + "flos": 21474508118400.0, + "grad_norm": 1.6100748666203144, + "language_loss": 0.79936564, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87727129, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19750977, + "step": 4614, + "time_per_iteration": 2.5586962699890137 + }, + { + "auxiliary_loss_clip": 0.06520429, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01257884, + "epoch": 0.2774688110626785, + "flos": 19833827247360.0, + "grad_norm": 2.249482091575283, + "language_loss": 0.81082475, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.88881981, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21179199, + "step": 4615, + "time_per_iteration": 2.5192136764526367 + }, + { + "auxiliary_loss_clip": 0.0652476, + "auxiliary_loss_mlp": 0.0127518, + "balance_loss_clip": 0.06306368, + "balance_loss_mlp": 0.01256655, + "epoch": 0.2775289343153465, + "flos": 18220916805120.0, + "grad_norm": 2.6879454427381715, + "language_loss": 0.64382082, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.72182024, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.18518066, + "step": 4616, + "time_per_iteration": 2.528766393661499 + }, + { + "auxiliary_loss_clip": 0.06523173, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 0.06308753, + "balance_loss_mlp": 0.0126377, + "epoch": 0.27758905756801444, + "flos": 16478141333760.0, + "grad_norm": 2.0768832102625296, + "language_loss": 0.82857239, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.90664852, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.20678711, + "step": 4617, + "time_per_iteration": 2.5130555629730225 + }, + { + "auxiliary_loss_clip": 0.06522305, + "auxiliary_loss_mlp": 0.01278739, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01260059, + "epoch": 0.2776491808206824, + "flos": 18733219868160.0, + "grad_norm": 2.583119020836192, + "language_loss": 0.77338278, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85139322, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18676758, + "step": 4618, + "time_per_iteration": 2.5491156578063965 + }, + { + "auxiliary_loss_clip": 0.06524394, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 0.06309742, + "balance_loss_mlp": 0.01260191, + "epoch": 0.27770930407335037, + "flos": 28045742791680.0, + "grad_norm": 1.764934716544716, + "language_loss": 0.85733759, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93535626, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.17297363, + "step": 4619, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.06514929, + "auxiliary_loss_mlp": 0.01285121, + "balance_loss_clip": 0.06308962, + "balance_loss_mlp": 0.01266798, + "epoch": 0.27776942732601834, + "flos": 23556522044160.0, + "grad_norm": 1.4813387132666624, + "language_loss": 0.77092409, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84892452, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18322754, + "step": 4620, + "time_per_iteration": 2.690934658050537 + }, + { + "auxiliary_loss_clip": 0.0651743, + "auxiliary_loss_mlp": 0.01277569, + "balance_loss_clip": 0.06309397, + "balance_loss_mlp": 0.0125821, + "epoch": 0.2778295505786863, + "flos": 23914474686720.0, + "grad_norm": 1.8907472710416175, + "language_loss": 0.78585863, + "learning_rate": 3.389562634707122e-06, + "loss": 0.86380863, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.19360352, + "step": 4621, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.06522836, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.0630835, + "balance_loss_mlp": 0.01259701, + "epoch": 0.27788967383135427, + "flos": 25561276905600.0, + "grad_norm": 2.170367430288875, + "language_loss": 0.88217753, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96019584, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.1932373, + "step": 4622, + "time_per_iteration": 2.6036407947540283 + }, + { + "auxiliary_loss_clip": 0.06512653, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06299745, + "balance_loss_mlp": 0.01254919, + "epoch": 0.27794979708402223, + "flos": 16258103712000.0, + "grad_norm": 2.5896700244630018, + "language_loss": 0.81515396, + "learning_rate": 3.389002311256369e-06, + "loss": 0.89301395, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18432617, + "step": 4623, + "time_per_iteration": 2.539655923843384 + }, + { + "auxiliary_loss_clip": 0.06518189, + "auxiliary_loss_mlp": 0.01278229, + "balance_loss_clip": 0.06306686, + "balance_loss_mlp": 0.01258941, + "epoch": 0.2780099203366902, + "flos": 20673880755840.0, + "grad_norm": 1.9609752985345037, + "language_loss": 0.82099682, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.89896095, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.19274902, + "step": 4624, + "time_per_iteration": 2.5662107467651367 + }, + { + "auxiliary_loss_clip": 0.06512089, + "auxiliary_loss_mlp": 0.01276338, + "balance_loss_clip": 0.06303106, + "balance_loss_mlp": 0.01258004, + "epoch": 0.27807004358935816, + "flos": 17743805256960.0, + "grad_norm": 3.013190567677447, + "language_loss": 0.77269506, + "learning_rate": 3.388441777121191e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.18322754, + "step": 4625, + "time_per_iteration": 2.5685927867889404 + }, + { + "auxiliary_loss_clip": 0.06507699, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06299223, + "balance_loss_mlp": 0.01253658, + "epoch": 0.2781301668420261, + "flos": 16732699637760.0, + "grad_norm": 1.9769276375727096, + "language_loss": 0.70884871, + "learning_rate": 3.388161431073511e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17883301, + "step": 4626, + "time_per_iteration": 2.527975559234619 + }, + { + "auxiliary_loss_clip": 0.06520554, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06304689, + "balance_loss_mlp": 0.01254798, + "epoch": 0.27819029009469415, + "flos": 13849848714240.0, + "grad_norm": 2.4481240639566013, + "language_loss": 0.93016249, + "learning_rate": 3.38788103238661e-06, + "loss": 1.00810015, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.18432617, + "step": 4627, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.06514014, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06298277, + "balance_loss_mlp": 0.01258364, + "epoch": 0.2782504133473621, + "flos": 27096634794240.0, + "grad_norm": 1.6603793888564844, + "language_loss": 0.85558021, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93348801, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1842041, + "step": 4628, + "time_per_iteration": 2.56680965423584 + }, + { + "auxiliary_loss_clip": 0.06511193, + "auxiliary_loss_mlp": 0.01275379, + "balance_loss_clip": 0.06301076, + "balance_loss_mlp": 0.01257569, + "epoch": 0.2783105366000301, + "flos": 21075116832000.0, + "grad_norm": 1.7183700627805243, + "language_loss": 0.79370463, + "learning_rate": 3.387320077137679e-06, + "loss": 0.87157035, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17810059, + "step": 4629, + "time_per_iteration": 2.579024076461792 + }, + { + "auxiliary_loss_clip": 0.06504764, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06300465, + "balance_loss_mlp": 0.01259699, + "epoch": 0.27837065985269804, + "flos": 26508456259200.0, + "grad_norm": 2.4632649346037856, + "language_loss": 0.84664094, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.92446071, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17529297, + "step": 4630, + "time_per_iteration": 2.568190336227417 + }, + { + "auxiliary_loss_clip": 0.06516108, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 0.06302783, + "balance_loss_mlp": 0.01253395, + "epoch": 0.278430783105366, + "flos": 20228271143040.0, + "grad_norm": 1.8872458968592738, + "language_loss": 0.80858278, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8864556, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17773438, + "step": 4631, + "time_per_iteration": 2.5658912658691406 + }, + { + "auxiliary_loss_clip": 0.06512441, + "auxiliary_loss_mlp": 0.01275522, + "balance_loss_clip": 0.06299636, + "balance_loss_mlp": 0.01256866, + "epoch": 0.278490906358034, + "flos": 25599906437760.0, + "grad_norm": 2.407277572133289, + "language_loss": 0.715128, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.79300761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18652344, + "step": 4632, + "time_per_iteration": 2.620729446411133 + }, + { + "auxiliary_loss_clip": 0.06502309, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296511, + "balance_loss_mlp": 0.01253502, + "epoch": 0.27855102961070194, + "flos": 16175645694720.0, + "grad_norm": 1.8302171024684264, + "language_loss": 0.82394838, + "learning_rate": 3.386197535437145e-06, + "loss": 0.9016794, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17297363, + "step": 4633, + "time_per_iteration": 2.513705015182495 + }, + { + "auxiliary_loss_clip": 0.06511516, + "auxiliary_loss_mlp": 0.01278904, + "balance_loss_clip": 0.06299913, + "balance_loss_mlp": 0.012597, + "epoch": 0.2786111528633699, + "flos": 22933864753920.0, + "grad_norm": 1.5843012688553681, + "language_loss": 0.8872478, + "learning_rate": 3.385916768573529e-06, + "loss": 0.96515197, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19213867, + "step": 4634, + "time_per_iteration": 2.5471088886260986 + }, + { + "auxiliary_loss_clip": 0.06514788, + "auxiliary_loss_mlp": 0.01276007, + "balance_loss_clip": 0.06301814, + "balance_loss_mlp": 0.01256588, + "epoch": 0.27867127611603787, + "flos": 23410934375040.0, + "grad_norm": 1.5369483246730489, + "language_loss": 0.77466059, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85256851, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19433594, + "step": 4635, + "time_per_iteration": 3.9016311168670654 + }, + { + "auxiliary_loss_clip": 0.06508552, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06295648, + "balance_loss_mlp": 0.01254859, + "epoch": 0.27873139936870583, + "flos": 19835210839680.0, + "grad_norm": 1.7801998538005617, + "language_loss": 0.66571766, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74353385, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18188477, + "step": 4636, + "time_per_iteration": 2.5264599323272705 + }, + { + "auxiliary_loss_clip": 0.06519878, + "auxiliary_loss_mlp": 0.01275894, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01256392, + "epoch": 0.2787915226213738, + "flos": 17712638737920.0, + "grad_norm": 2.933733922484583, + "language_loss": 0.83255613, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.91051382, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19506836, + "step": 4637, + "time_per_iteration": 2.5344014167785645 + }, + { + "auxiliary_loss_clip": 0.06505676, + "auxiliary_loss_mlp": 0.01276787, + "balance_loss_clip": 0.06297021, + "balance_loss_mlp": 0.01258918, + "epoch": 0.27885164587404176, + "flos": 22097039627520.0, + "grad_norm": 1.4932909871395708, + "language_loss": 0.76038569, + "learning_rate": 3.384793175684533e-06, + "loss": 0.83821034, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17871094, + "step": 4638, + "time_per_iteration": 2.544187068939209 + }, + { + "auxiliary_loss_clip": 0.06510019, + "auxiliary_loss_mlp": 0.01280274, + "balance_loss_clip": 0.06297282, + "balance_loss_mlp": 0.01262511, + "epoch": 0.27891176912670973, + "flos": 19213601725440.0, + "grad_norm": 2.235877812045319, + "language_loss": 0.72492748, + "learning_rate": 3.38451214615691e-06, + "loss": 0.8028304, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17749023, + "step": 4639, + "time_per_iteration": 4.002680063247681 + }, + { + "auxiliary_loss_clip": 0.06515414, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.01254813, + "epoch": 0.27897189237937775, + "flos": 27607428483840.0, + "grad_norm": 1.8877142592522154, + "language_loss": 0.66217673, + "learning_rate": 3.384231064128447e-06, + "loss": 0.74006808, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.18896484, + "step": 4640, + "time_per_iteration": 4.054874420166016 + }, + { + "auxiliary_loss_clip": 0.0651349, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06301108, + "balance_loss_mlp": 0.01254654, + "epoch": 0.2790320156320457, + "flos": 21184506737280.0, + "grad_norm": 2.077527470737851, + "language_loss": 0.72818768, + "learning_rate": 3.383949929609804e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.1796875, + "step": 4641, + "time_per_iteration": 2.566758155822754 + }, + { + "auxiliary_loss_clip": 0.06517549, + "auxiliary_loss_mlp": 0.01276062, + "balance_loss_clip": 0.06298883, + "balance_loss_mlp": 0.01256488, + "epoch": 0.2790921388847137, + "flos": 22790541144960.0, + "grad_norm": 1.8548696214163785, + "language_loss": 0.75277239, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8307085, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19567871, + "step": 4642, + "time_per_iteration": 2.5531389713287354 + }, + { + "auxiliary_loss_clip": 0.0651103, + "auxiliary_loss_mlp": 0.01281312, + "balance_loss_clip": 0.06296819, + "balance_loss_mlp": 0.01261631, + "epoch": 0.27915226213738165, + "flos": 23406783598080.0, + "grad_norm": 1.8301300365045747, + "language_loss": 0.85787475, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93579817, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19689941, + "step": 4643, + "time_per_iteration": 2.561692714691162 + }, + { + "auxiliary_loss_clip": 0.06505755, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06292956, + "balance_loss_mlp": 0.01262572, + "epoch": 0.2792123853900496, + "flos": 22754469162240.0, + "grad_norm": 2.128449816262669, + "language_loss": 0.83027583, + "learning_rate": 3.383106211219407e-06, + "loss": 0.9081434, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1842041, + "step": 4644, + "time_per_iteration": 2.5298962593078613 + }, + { + "auxiliary_loss_clip": 0.06505448, + "auxiliary_loss_mlp": 0.01273805, + "balance_loss_clip": 0.0629155, + "balance_loss_mlp": 0.01256174, + "epoch": 0.2792725086427176, + "flos": 15054772826880.0, + "grad_norm": 1.7497246062339578, + "language_loss": 0.79546082, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.87325335, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.17626953, + "step": 4645, + "time_per_iteration": 3.9172677993774414 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01254208, + "balance_loss_clip": 0.0631457, + "balance_loss_mlp": 0.0125017, + "epoch": 0.27933263189538554, + "flos": 62562805862400.0, + "grad_norm": 0.7707831229317741, + "language_loss": 0.62136066, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.6980933, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04037476, + "step": 4646, + "time_per_iteration": 3.1527390480041504 + }, + { + "auxiliary_loss_clip": 0.06500821, + "auxiliary_loss_mlp": 0.01275319, + "balance_loss_clip": 0.0629313, + "balance_loss_mlp": 0.01257581, + "epoch": 0.2793927551480535, + "flos": 25125268584960.0, + "grad_norm": 1.6018723981737446, + "language_loss": 0.89582062, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.97358203, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17736816, + "step": 4647, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06509704, + "auxiliary_loss_mlp": 0.01277108, + "balance_loss_clip": 0.06292088, + "balance_loss_mlp": 0.01258142, + "epoch": 0.27945287840072147, + "flos": 21330974874240.0, + "grad_norm": 1.6381839497334347, + "language_loss": 0.87525821, + "learning_rate": 3.381980519149988e-06, + "loss": 0.95312631, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.1895752, + "step": 4648, + "time_per_iteration": 2.5516953468322754 + }, + { + "auxiliary_loss_clip": 0.06507549, + "auxiliary_loss_mlp": 0.01274847, + "balance_loss_clip": 0.06291072, + "balance_loss_mlp": 0.01256643, + "epoch": 0.27951300165338944, + "flos": 27457354621440.0, + "grad_norm": 2.652634800411286, + "language_loss": 0.73020303, + "learning_rate": 3.38169896509385e-06, + "loss": 0.80802703, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18212891, + "step": 4649, + "time_per_iteration": 2.5767719745635986 + }, + { + "auxiliary_loss_clip": 0.06508242, + "auxiliary_loss_mlp": 0.01277361, + "balance_loss_clip": 0.0629622, + "balance_loss_mlp": 0.01259003, + "epoch": 0.2795731249060574, + "flos": 15164456221440.0, + "grad_norm": 2.110277953429804, + "language_loss": 0.81314564, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8910017, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18347168, + "step": 4650, + "time_per_iteration": 2.663588285446167 + }, + { + "auxiliary_loss_clip": 0.06406052, + "auxiliary_loss_mlp": 0.01252705, + "balance_loss_clip": 0.06303374, + "balance_loss_mlp": 0.01248944, + "epoch": 0.27963324815872537, + "flos": 60140951775360.0, + "grad_norm": 0.800089640521837, + "language_loss": 0.5874877, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.66407531, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03753662, + "step": 4651, + "time_per_iteration": 3.205563545227051 + }, + { + "auxiliary_loss_clip": 0.06513405, + "auxiliary_loss_mlp": 0.01276159, + "balance_loss_clip": 0.06293929, + "balance_loss_mlp": 0.01257205, + "epoch": 0.27969337141139333, + "flos": 21773020688640.0, + "grad_norm": 1.70848848544609, + "language_loss": 0.74928713, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82718277, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18945312, + "step": 4652, + "time_per_iteration": 2.620284080505371 + }, + { + "auxiliary_loss_clip": 0.06513481, + "auxiliary_loss_mlp": 0.01277362, + "balance_loss_clip": 0.06297033, + "balance_loss_mlp": 0.01259517, + "epoch": 0.27975349466406135, + "flos": 39859559072640.0, + "grad_norm": 2.257859492249039, + "language_loss": 0.81193566, + "learning_rate": 3.380572225034461e-06, + "loss": 0.88984406, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.17834473, + "step": 4653, + "time_per_iteration": 2.6902103424072266 + }, + { + "auxiliary_loss_clip": 0.06505801, + "auxiliary_loss_mlp": 0.01275903, + "balance_loss_clip": 0.06293398, + "balance_loss_mlp": 0.01257939, + "epoch": 0.2798136179167293, + "flos": 21586204010880.0, + "grad_norm": 2.2005279612587647, + "language_loss": 0.78939915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.86721623, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17956543, + "step": 4654, + "time_per_iteration": 2.5862321853637695 + }, + { + "auxiliary_loss_clip": 0.06514826, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06294681, + "balance_loss_mlp": 0.01256457, + "epoch": 0.2798737411693973, + "flos": 21543130212480.0, + "grad_norm": 2.786817882874951, + "language_loss": 0.81491858, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.89283288, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20153809, + "step": 4655, + "time_per_iteration": 2.5335962772369385 + }, + { + "auxiliary_loss_clip": 0.06503223, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06287771, + "balance_loss_mlp": 0.0125778, + "epoch": 0.27993386442206525, + "flos": 26988586554240.0, + "grad_norm": 1.7572759264995625, + "language_loss": 0.82015479, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.89795309, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4656, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0650457, + "auxiliary_loss_mlp": 0.01280726, + "balance_loss_clip": 0.06291523, + "balance_loss_mlp": 0.01261319, + "epoch": 0.2799939876747332, + "flos": 24356268938880.0, + "grad_norm": 1.602501989097996, + "language_loss": 0.83292782, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.91078079, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19396973, + "step": 4657, + "time_per_iteration": 2.546698808670044 + }, + { + "auxiliary_loss_clip": 0.06501682, + "auxiliary_loss_mlp": 0.01283943, + "balance_loss_clip": 0.06287715, + "balance_loss_mlp": 0.01265847, + "epoch": 0.2800541109274012, + "flos": 33665479626240.0, + "grad_norm": 2.056920585114217, + "language_loss": 0.64474404, + "learning_rate": 3.379162622133105e-06, + "loss": 0.72260022, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18103027, + "step": 4658, + "time_per_iteration": 2.633352041244507 + }, + { + "auxiliary_loss_clip": 0.0650496, + "auxiliary_loss_mlp": 0.01278289, + "balance_loss_clip": 0.06292152, + "balance_loss_mlp": 0.01258298, + "epoch": 0.28011423418006914, + "flos": 21620515057920.0, + "grad_norm": 1.9139831777919125, + "language_loss": 0.78200769, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.85984015, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19995117, + "step": 4659, + "time_per_iteration": 2.5146000385284424 + }, + { + "auxiliary_loss_clip": 0.06512548, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 0.06298335, + "balance_loss_mlp": 0.01260582, + "epoch": 0.2801743574327371, + "flos": 23119130131200.0, + "grad_norm": 1.8180566150817747, + "language_loss": 0.79711032, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.87503254, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.1907959, + "step": 4660, + "time_per_iteration": 2.5558273792266846 + }, + { + "auxiliary_loss_clip": 0.06502102, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06293646, + "balance_loss_mlp": 0.01257732, + "epoch": 0.2802344806854051, + "flos": 12646433975040.0, + "grad_norm": 2.0195446081970685, + "language_loss": 0.8127892, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.89057004, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18237305, + "step": 4661, + "time_per_iteration": 2.475562572479248 + }, + { + "auxiliary_loss_clip": 0.06508808, + "auxiliary_loss_mlp": 0.01277709, + "balance_loss_clip": 0.06296618, + "balance_loss_mlp": 0.01258898, + "epoch": 0.28029460393807304, + "flos": 37276772019840.0, + "grad_norm": 2.0240330571158904, + "language_loss": 0.79226935, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87013447, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18823242, + "step": 4662, + "time_per_iteration": 2.6644277572631836 + }, + { + "auxiliary_loss_clip": 0.06515819, + "auxiliary_loss_mlp": 0.01277387, + "balance_loss_clip": 0.06296565, + "balance_loss_mlp": 0.01258349, + "epoch": 0.280354727190741, + "flos": 20747450240640.0, + "grad_norm": 1.722651872041065, + "language_loss": 0.70744783, + "learning_rate": 3.377751711782227e-06, + "loss": 0.78537989, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.19042969, + "step": 4663, + "time_per_iteration": 2.5365068912506104 + }, + { + "auxiliary_loss_clip": 0.06510712, + "auxiliary_loss_mlp": 0.01280818, + "balance_loss_clip": 0.06293653, + "balance_loss_mlp": 0.01259312, + "epoch": 0.28041485044340897, + "flos": 21477526865280.0, + "grad_norm": 1.8007469711633386, + "language_loss": 0.77919745, + "learning_rate": 3.377469372935791e-06, + "loss": 0.85711277, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.21520996, + "step": 4664, + "time_per_iteration": 2.578552484512329 + }, + { + "auxiliary_loss_clip": 0.06500383, + "auxiliary_loss_mlp": 0.01277041, + "balance_loss_clip": 0.06293675, + "balance_loss_mlp": 0.01259374, + "epoch": 0.28047497369607693, + "flos": 14799669471360.0, + "grad_norm": 1.9758280924180103, + "language_loss": 0.80386382, + "learning_rate": 3.377186981855578e-06, + "loss": 0.88163805, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17675781, + "step": 4665, + "time_per_iteration": 2.5088212490081787 + }, + { + "auxiliary_loss_clip": 0.06506059, + "auxiliary_loss_mlp": 0.01274647, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01257397, + "epoch": 0.2805350969487449, + "flos": 23076559457280.0, + "grad_norm": 2.052054159073397, + "language_loss": 0.81109238, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.88889945, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17236328, + "step": 4666, + "time_per_iteration": 2.5765438079833984 + }, + { + "auxiliary_loss_clip": 0.06505027, + "auxiliary_loss_mlp": 0.01282246, + "balance_loss_clip": 0.0629367, + "balance_loss_mlp": 0.01263149, + "epoch": 0.2805952202014129, + "flos": 20485177361280.0, + "grad_norm": 2.1346617464039395, + "language_loss": 0.84940714, + "learning_rate": 3.376622043036658e-06, + "loss": 0.92727995, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19104004, + "step": 4667, + "time_per_iteration": 2.536466360092163 + }, + { + "auxiliary_loss_clip": 0.06510031, + "auxiliary_loss_mlp": 0.01284991, + "balance_loss_clip": 0.0629562, + "balance_loss_mlp": 0.0126581, + "epoch": 0.2806553434540809, + "flos": 27424678728960.0, + "grad_norm": 1.8168022919289022, + "language_loss": 0.80077279, + "learning_rate": 3.376339495319373e-06, + "loss": 0.87872303, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.19177246, + "step": 4668, + "time_per_iteration": 2.620793581008911 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01279574, + "balance_loss_clip": 0.06290744, + "balance_loss_mlp": 0.0126124, + "epoch": 0.28071546670674885, + "flos": 26512187765760.0, + "grad_norm": 1.3575587104794173, + "language_loss": 0.76748574, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.84536183, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18334961, + "step": 4669, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01281258, + "balance_loss_clip": 0.06298456, + "balance_loss_mlp": 0.01263376, + "epoch": 0.2807755899594168, + "flos": 20564993975040.0, + "grad_norm": 1.8976620486576934, + "language_loss": 0.79953671, + "learning_rate": 3.375774243322725e-06, + "loss": 0.87746012, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17883301, + "step": 4670, + "time_per_iteration": 2.630960702896118 + }, + { + "auxiliary_loss_clip": 0.06512859, + "auxiliary_loss_mlp": 0.0128758, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.0126859, + "epoch": 0.2808357132120848, + "flos": 24319693831680.0, + "grad_norm": 2.1242803821214915, + "language_loss": 0.79548872, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.87349308, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 4671, + "time_per_iteration": 2.5943963527679443 + }, + { + "auxiliary_loss_clip": 0.06499608, + "auxiliary_loss_mlp": 0.01282791, + "balance_loss_clip": 0.06293108, + "balance_loss_mlp": 0.01265124, + "epoch": 0.28089583646475275, + "flos": 26439624529920.0, + "grad_norm": 1.773606658736433, + "language_loss": 0.75789028, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83571434, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17663574, + "step": 4672, + "time_per_iteration": 2.5819919109344482 + }, + { + "auxiliary_loss_clip": 0.06515782, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06299746, + "balance_loss_mlp": 0.01260412, + "epoch": 0.2809559597174207, + "flos": 23118417371520.0, + "grad_norm": 2.723902952009536, + "language_loss": 0.76012361, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.83808959, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20410156, + "step": 4673, + "time_per_iteration": 2.579460859298706 + }, + { + "auxiliary_loss_clip": 0.06510463, + "auxiliary_loss_mlp": 0.01285315, + "balance_loss_clip": 0.06297876, + "balance_loss_mlp": 0.0126704, + "epoch": 0.2810160829700887, + "flos": 20929864579200.0, + "grad_norm": 1.8153863613356214, + "language_loss": 0.72824192, + "learning_rate": 3.374643113381237e-06, + "loss": 0.80619967, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18261719, + "step": 4674, + "time_per_iteration": 4.0586278438568115 + }, + { + "auxiliary_loss_clip": 0.06522093, + "auxiliary_loss_mlp": 0.01283708, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.0126405, + "epoch": 0.28107620622275664, + "flos": 14361145528320.0, + "grad_norm": 1.8954321480679195, + "language_loss": 0.77875817, + "learning_rate": 3.374360200552541e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1965332, + "step": 4675, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.06512761, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06296991, + "balance_loss_mlp": 0.01269531, + "epoch": 0.2811363294754246, + "flos": 20924707553280.0, + "grad_norm": 3.9789590396078784, + "language_loss": 0.70705891, + "learning_rate": 3.374077235607968e-06, + "loss": 0.78507614, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19433594, + "step": 4676, + "time_per_iteration": 2.519028425216675 + }, + { + "auxiliary_loss_clip": 0.06504105, + "auxiliary_loss_mlp": 0.01278874, + "balance_loss_clip": 0.0629884, + "balance_loss_mlp": 0.01260611, + "epoch": 0.28119645272809257, + "flos": 20601107884800.0, + "grad_norm": 1.5779309471284284, + "language_loss": 0.70529211, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.78312188, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18286133, + "step": 4677, + "time_per_iteration": 2.5834195613861084 + }, + { + "auxiliary_loss_clip": 0.06516379, + "auxiliary_loss_mlp": 0.01281791, + "balance_loss_clip": 0.06302937, + "balance_loss_mlp": 0.0126193, + "epoch": 0.28125657598076054, + "flos": 25344383811840.0, + "grad_norm": 1.5021857900224345, + "language_loss": 0.64105308, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.71903479, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1986084, + "step": 4678, + "time_per_iteration": 2.618948221206665 + }, + { + "auxiliary_loss_clip": 0.06517099, + "auxiliary_loss_mlp": 0.01278079, + "balance_loss_clip": 0.06306246, + "balance_loss_mlp": 0.01259947, + "epoch": 0.2813166992334285, + "flos": 24834051319680.0, + "grad_norm": 1.437486997447774, + "language_loss": 0.71167207, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18139648, + "step": 4679, + "time_per_iteration": 5.466668128967285 + }, + { + "auxiliary_loss_clip": 0.06520079, + "auxiliary_loss_mlp": 0.0127734, + "balance_loss_clip": 0.06306013, + "balance_loss_mlp": 0.01257491, + "epoch": 0.2813768224860965, + "flos": 21766941267840.0, + "grad_norm": 1.8819388160659554, + "language_loss": 0.75122017, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.82919437, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19848633, + "step": 4680, + "time_per_iteration": 2.5146636962890625 + }, + { + "auxiliary_loss_clip": 0.06519224, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 0.06307293, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2814369457387645, + "flos": 24323760754560.0, + "grad_norm": 2.4475033368931984, + "language_loss": 0.77670574, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.8546586, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18103027, + "step": 4681, + "time_per_iteration": 2.576263189315796 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06309941, + "balance_loss_mlp": 0.01259208, + "epoch": 0.28149706899143245, + "flos": 18521274165120.0, + "grad_norm": 2.513172937911882, + "language_loss": 0.7420646, + "learning_rate": 3.372378352108146e-06, + "loss": 0.82008791, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18383789, + "step": 4682, + "time_per_iteration": 2.5019047260284424 + }, + { + "auxiliary_loss_clip": 0.06516165, + "auxiliary_loss_mlp": 0.01280522, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01262879, + "epoch": 0.2815571922441004, + "flos": 24870165229440.0, + "grad_norm": 1.4634735151261165, + "language_loss": 0.81619561, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89416242, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17626953, + "step": 4683, + "time_per_iteration": 2.6108040809631348 + }, + { + "auxiliary_loss_clip": 0.06511167, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 0.06297079, + "balance_loss_mlp": 0.01258771, + "epoch": 0.2816173154967684, + "flos": 19907774075520.0, + "grad_norm": 1.6126473409715323, + "language_loss": 0.76514447, + "learning_rate": 3.371811641167852e-06, + "loss": 0.8430298, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18579102, + "step": 4684, + "time_per_iteration": 3.9593515396118164 + }, + { + "auxiliary_loss_clip": 0.06509569, + "auxiliary_loss_mlp": 0.0127644, + "balance_loss_clip": 0.06298888, + "balance_loss_mlp": 0.01257474, + "epoch": 0.28167743874943635, + "flos": 17496709966080.0, + "grad_norm": 1.741664239740996, + "language_loss": 0.76634955, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.84420967, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4685, + "time_per_iteration": 2.533033847808838 + }, + { + "auxiliary_loss_clip": 0.06512235, + "auxiliary_loss_mlp": 0.01277016, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.01258002, + "epoch": 0.2817375620021043, + "flos": 25309276151040.0, + "grad_norm": 1.5379443905684582, + "language_loss": 0.76075816, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.8386507, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19006348, + "step": 4686, + "time_per_iteration": 2.5632452964782715 + }, + { + "auxiliary_loss_clip": 0.0651376, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 0.06298173, + "balance_loss_mlp": 0.01262705, + "epoch": 0.2817976852547723, + "flos": 18698447623680.0, + "grad_norm": 3.4763910689128945, + "language_loss": 0.63974833, + "learning_rate": 3.370961184640025e-06, + "loss": 0.71771336, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.20043945, + "step": 4687, + "time_per_iteration": 2.5520877838134766 + }, + { + "auxiliary_loss_clip": 0.0651626, + "auxiliary_loss_mlp": 0.01278308, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01258889, + "epoch": 0.28185780850744024, + "flos": 22748012398080.0, + "grad_norm": 2.5451270798344208, + "language_loss": 0.76514482, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.84309042, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1940918, + "step": 4688, + "time_per_iteration": 2.5427582263946533 + }, + { + "auxiliary_loss_clip": 0.06506021, + "auxiliary_loss_mlp": 0.01276039, + "balance_loss_clip": 0.06297493, + "balance_loss_mlp": 0.01258622, + "epoch": 0.2819179317601082, + "flos": 14938297251840.0, + "grad_norm": 2.0673048339937394, + "language_loss": 0.79160047, + "learning_rate": 3.37039395366863e-06, + "loss": 0.86942106, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17407227, + "step": 4689, + "time_per_iteration": 2.514857769012451 + }, + { + "auxiliary_loss_clip": 0.06505655, + "auxiliary_loss_mlp": 0.01279731, + "balance_loss_clip": 0.06295724, + "balance_loss_mlp": 0.0126098, + "epoch": 0.2819780550127762, + "flos": 23151428680320.0, + "grad_norm": 2.0480677905828664, + "language_loss": 0.78403682, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86189067, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18762207, + "step": 4690, + "time_per_iteration": 2.5567362308502197 + }, + { + "auxiliary_loss_clip": 0.06514366, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06301816, + "balance_loss_mlp": 0.01256981, + "epoch": 0.28203817826544414, + "flos": 21622779118080.0, + "grad_norm": 2.5530247222146976, + "language_loss": 0.87619591, + "learning_rate": 3.369826514835332e-06, + "loss": 0.95409369, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18432617, + "step": 4691, + "time_per_iteration": 2.5987935066223145 + }, + { + "auxiliary_loss_clip": 0.0651565, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.0629878, + "balance_loss_mlp": 0.01258787, + "epoch": 0.2820983015181121, + "flos": 24034010935680.0, + "grad_norm": 1.7719901211447804, + "language_loss": 0.82443225, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.90235984, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18322754, + "step": 4692, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.06515577, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06304249, + "balance_loss_mlp": 0.01259921, + "epoch": 0.2821584247707801, + "flos": 30015725408640.0, + "grad_norm": 1.5203777397001885, + "language_loss": 0.74437934, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82232404, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.1895752, + "step": 4693, + "time_per_iteration": 2.6104559898376465 + }, + { + "auxiliary_loss_clip": 0.06512225, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06298921, + "balance_loss_mlp": 0.01255593, + "epoch": 0.2822185480234481, + "flos": 21403034985600.0, + "grad_norm": 1.7641787467317929, + "language_loss": 0.77641487, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.85428035, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 4694, + "time_per_iteration": 2.5619184970855713 + }, + { + "auxiliary_loss_clip": 0.06513312, + "auxiliary_loss_mlp": 0.01274888, + "balance_loss_clip": 0.0630666, + "balance_loss_mlp": 0.01255898, + "epoch": 0.28227867127611606, + "flos": 27459996024960.0, + "grad_norm": 2.064814820064932, + "language_loss": 0.67270994, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.75059193, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18969727, + "step": 4695, + "time_per_iteration": 2.5849459171295166 + }, + { + "auxiliary_loss_clip": 0.06524754, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06312457, + "balance_loss_mlp": 0.01255914, + "epoch": 0.282338794528784, + "flos": 22599028638720.0, + "grad_norm": 2.3022925444863747, + "language_loss": 0.75992346, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.83794391, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.21362305, + "step": 4696, + "time_per_iteration": 2.5599312782287598 + }, + { + "auxiliary_loss_clip": 0.06528555, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06319815, + "balance_loss_mlp": 0.01257915, + "epoch": 0.282398917781452, + "flos": 42020592998400.0, + "grad_norm": 1.6923608864022255, + "language_loss": 0.62607121, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70412022, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.1842041, + "step": 4697, + "time_per_iteration": 2.719783067703247 + }, + { + "auxiliary_loss_clip": 0.0651894, + "auxiliary_loss_mlp": 0.01278397, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.01260564, + "epoch": 0.28245904103411995, + "flos": 23231916126720.0, + "grad_norm": 1.330125700327103, + "language_loss": 0.73835146, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.81632483, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17834473, + "step": 4698, + "time_per_iteration": 2.671154260635376 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01274177, + "balance_loss_clip": 0.06314629, + "balance_loss_mlp": 0.01255699, + "epoch": 0.2825191642867879, + "flos": 25381713605760.0, + "grad_norm": 1.8806904568543696, + "language_loss": 0.75498992, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.83293265, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.18481445, + "step": 4699, + "time_per_iteration": 2.749073028564453 + }, + { + "auxiliary_loss_clip": 0.06532586, + "auxiliary_loss_mlp": 0.0127858, + "balance_loss_clip": 0.06318063, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2825792875394559, + "flos": 17242277443200.0, + "grad_norm": 2.5468251061801697, + "language_loss": 0.80103695, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.87914866, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.20617676, + "step": 4700, + "time_per_iteration": 2.539794683456421 + }, + { + "auxiliary_loss_clip": 0.06516679, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06314512, + "balance_loss_mlp": 0.01257006, + "epoch": 0.28263941079212385, + "flos": 26731177211520.0, + "grad_norm": 2.1068022199140213, + "language_loss": 0.8243857, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.90229392, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17114258, + "step": 4701, + "time_per_iteration": 2.5763485431671143 + }, + { + "auxiliary_loss_clip": 0.06520683, + "auxiliary_loss_mlp": 0.01274057, + "balance_loss_clip": 0.06312392, + "balance_loss_mlp": 0.01256116, + "epoch": 0.2826995340447918, + "flos": 25928411569920.0, + "grad_norm": 2.2990609650841276, + "language_loss": 0.73153478, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.80948216, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17956543, + "step": 4702, + "time_per_iteration": 2.5968289375305176 + }, + { + "auxiliary_loss_clip": 0.06520355, + "auxiliary_loss_mlp": 0.01275823, + "balance_loss_clip": 0.06316096, + "balance_loss_mlp": 0.01258848, + "epoch": 0.2827596572974598, + "flos": 22385783197440.0, + "grad_norm": 1.6603391807745085, + "language_loss": 0.78883457, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86679637, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1697998, + "step": 4703, + "time_per_iteration": 2.56088924407959 + }, + { + "auxiliary_loss_clip": 0.06518066, + "auxiliary_loss_mlp": 0.01281519, + "balance_loss_clip": 0.06307587, + "balance_loss_mlp": 0.01261885, + "epoch": 0.28281978055012774, + "flos": 33555544669440.0, + "grad_norm": 1.530922589206002, + "language_loss": 0.69937778, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.19628906, + "step": 4704, + "time_per_iteration": 2.725234031677246 + }, + { + "auxiliary_loss_clip": 0.0652602, + "auxiliary_loss_mlp": 0.01283133, + "balance_loss_clip": 0.06319317, + "balance_loss_mlp": 0.01264119, + "epoch": 0.2828799038027957, + "flos": 23447635263360.0, + "grad_norm": 1.9265232828394878, + "language_loss": 0.70927215, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.78736377, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.19006348, + "step": 4705, + "time_per_iteration": 2.5391383171081543 + }, + { + "auxiliary_loss_clip": 0.06482799, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06378852, + "balance_loss_mlp": 0.01263947, + "epoch": 0.2829400270554637, + "flos": 69892055297280.0, + "grad_norm": 0.9159756060868983, + "language_loss": 0.59201139, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.66952819, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04928589, + "step": 4706, + "time_per_iteration": 3.219618797302246 + }, + { + "auxiliary_loss_clip": 0.06512764, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 0.06312177, + "balance_loss_mlp": 0.01260547, + "epoch": 0.2830001503081317, + "flos": 24795715276800.0, + "grad_norm": 1.373077415158703, + "language_loss": 0.82380199, + "learning_rate": 3.365279531475407e-06, + "loss": 0.90170658, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.17138672, + "step": 4707, + "time_per_iteration": 2.5680840015411377 + }, + { + "auxiliary_loss_clip": 0.06518079, + "auxiliary_loss_mlp": 0.01276357, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01257391, + "epoch": 0.28306027356079966, + "flos": 27676218286080.0, + "grad_norm": 1.5569970524845527, + "language_loss": 0.81077999, + "learning_rate": 3.36499490449902e-06, + "loss": 0.88872433, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18969727, + "step": 4708, + "time_per_iteration": 2.643389940261841 + }, + { + "auxiliary_loss_clip": 0.06443536, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 0.06339511, + "balance_loss_mlp": 0.01264025, + "epoch": 0.2831203968134676, + "flos": 60543837734400.0, + "grad_norm": 0.8586282544888121, + "language_loss": 0.62812036, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.7052421, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.04608154, + "step": 4709, + "time_per_iteration": 3.0554397106170654 + }, + { + "auxiliary_loss_clip": 0.06507774, + "auxiliary_loss_mlp": 0.01270408, + "balance_loss_clip": 0.06301016, + "balance_loss_mlp": 0.01253015, + "epoch": 0.2831805200661356, + "flos": 22061386915200.0, + "grad_norm": 1.4201642822404892, + "language_loss": 0.74412584, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.82190764, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1739502, + "step": 4710, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.06514937, + "auxiliary_loss_mlp": 0.01275331, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01256627, + "epoch": 0.28324064331880355, + "flos": 22607120557440.0, + "grad_norm": 1.9767009095982746, + "language_loss": 0.8018595, + "learning_rate": 3.364140713048579e-06, + "loss": 0.87976217, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18713379, + "step": 4711, + "time_per_iteration": 2.610027313232422 + }, + { + "auxiliary_loss_clip": 0.06509729, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06300638, + "balance_loss_mlp": 0.01260385, + "epoch": 0.2833007665714715, + "flos": 30411133626240.0, + "grad_norm": 1.982526263820073, + "language_loss": 0.70604694, + "learning_rate": 3.363855879093996e-06, + "loss": 0.78392917, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4712, + "time_per_iteration": 2.602795124053955 + }, + { + "auxiliary_loss_clip": 0.06508194, + "auxiliary_loss_mlp": 0.01282495, + "balance_loss_clip": 0.06299947, + "balance_loss_mlp": 0.01262992, + "epoch": 0.2833608898241395, + "flos": 23556144700800.0, + "grad_norm": 1.7823239687069516, + "language_loss": 0.8193841, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.89729095, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19494629, + "step": 4713, + "time_per_iteration": 2.6088523864746094 + }, + { + "auxiliary_loss_clip": 0.06512519, + "auxiliary_loss_mlp": 0.01275048, + "balance_loss_clip": 0.06304006, + "balance_loss_mlp": 0.01255236, + "epoch": 0.28342101307680745, + "flos": 20272980096000.0, + "grad_norm": 2.6212370689858493, + "language_loss": 0.75431275, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.83218849, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19799805, + "step": 4714, + "time_per_iteration": 3.986696243286133 + }, + { + "auxiliary_loss_clip": 0.06505996, + "auxiliary_loss_mlp": 0.01276776, + "balance_loss_clip": 0.06297115, + "balance_loss_mlp": 0.01259324, + "epoch": 0.2834811363294754, + "flos": 30854982303360.0, + "grad_norm": 1.3268888753773178, + "language_loss": 0.78198218, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.85980994, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17468262, + "step": 4715, + "time_per_iteration": 2.652470111846924 + }, + { + "auxiliary_loss_clip": 0.06506517, + "auxiliary_loss_mlp": 0.01277278, + "balance_loss_clip": 0.06300199, + "balance_loss_mlp": 0.01260088, + "epoch": 0.2835412595821434, + "flos": 22717642492800.0, + "grad_norm": 1.6173599581374518, + "language_loss": 0.74551272, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.82335067, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17175293, + "step": 4716, + "time_per_iteration": 2.597083806991577 + }, + { + "auxiliary_loss_clip": 0.06516325, + "auxiliary_loss_mlp": 0.01281584, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.0126189, + "epoch": 0.28360138283481134, + "flos": 18083630689920.0, + "grad_norm": 2.1150039301458112, + "language_loss": 0.75477433, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.83275348, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.19702148, + "step": 4717, + "time_per_iteration": 2.5648136138916016 + }, + { + "auxiliary_loss_clip": 0.06514253, + "auxiliary_loss_mlp": 0.01277656, + "balance_loss_clip": 0.06302426, + "balance_loss_mlp": 0.01258606, + "epoch": 0.2836615060874793, + "flos": 17859987342720.0, + "grad_norm": 1.540618458402471, + "language_loss": 0.67445159, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75237072, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19042969, + "step": 4718, + "time_per_iteration": 3.962265968322754 + }, + { + "auxiliary_loss_clip": 0.06507722, + "auxiliary_loss_mlp": 0.01278787, + "balance_loss_clip": 0.06295013, + "balance_loss_mlp": 0.01258772, + "epoch": 0.2837216293401473, + "flos": 25747590458880.0, + "grad_norm": 1.8038295919740834, + "language_loss": 0.73164374, + "learning_rate": 3.361860593925566e-06, + "loss": 0.8095088, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20007324, + "step": 4719, + "time_per_iteration": 4.095008134841919 + }, + { + "auxiliary_loss_clip": 0.0650832, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 0.06301163, + "balance_loss_mlp": 0.01259386, + "epoch": 0.2837817525928153, + "flos": 20929906506240.0, + "grad_norm": 1.8981156672354917, + "language_loss": 0.80600828, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.88386989, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18444824, + "step": 4720, + "time_per_iteration": 2.53869366645813 + }, + { + "auxiliary_loss_clip": 0.06515027, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06304276, + "balance_loss_mlp": 0.01261687, + "epoch": 0.28384187584548326, + "flos": 18922719876480.0, + "grad_norm": 1.7940545446838874, + "language_loss": 0.7966662, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.87462288, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18945312, + "step": 4721, + "time_per_iteration": 2.5736734867095947 + }, + { + "auxiliary_loss_clip": 0.06507237, + "auxiliary_loss_mlp": 0.01272866, + "balance_loss_clip": 0.06298702, + "balance_loss_mlp": 0.01254996, + "epoch": 0.2839019990981512, + "flos": 27351235025280.0, + "grad_norm": 1.8504915753410351, + "language_loss": 0.83238685, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17871094, + "step": 4722, + "time_per_iteration": 2.5798823833465576 + }, + { + "auxiliary_loss_clip": 0.06511718, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01255547, + "epoch": 0.2839621223508192, + "flos": 18120247724160.0, + "grad_norm": 1.9056364243243222, + "language_loss": 0.71157932, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18225098, + "step": 4723, + "time_per_iteration": 2.5472381114959717 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01279225, + "balance_loss_clip": 0.06299602, + "balance_loss_mlp": 0.01259937, + "epoch": 0.28402224560348716, + "flos": 26365384212480.0, + "grad_norm": 1.5487216964387416, + "language_loss": 0.7882036, + "learning_rate": 3.360433840760998e-06, + "loss": 0.86608005, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.19299316, + "step": 4724, + "time_per_iteration": 4.039300203323364 + }, + { + "auxiliary_loss_clip": 0.0650482, + "auxiliary_loss_mlp": 0.01275588, + "balance_loss_clip": 0.06294143, + "balance_loss_mlp": 0.0125754, + "epoch": 0.2840823688561551, + "flos": 24067609223040.0, + "grad_norm": 1.5786087270385247, + "language_loss": 0.92781484, + "learning_rate": 3.36014833532143e-06, + "loss": 1.00561893, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18066406, + "step": 4725, + "time_per_iteration": 2.5839502811431885 + }, + { + "auxiliary_loss_clip": 0.06504668, + "auxiliary_loss_mlp": 0.01283756, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01263097, + "epoch": 0.2841424921088231, + "flos": 29467392289920.0, + "grad_norm": 1.5513315701194426, + "language_loss": 0.89446843, + "learning_rate": 3.3598627783049e-06, + "loss": 0.97235262, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20666504, + "step": 4726, + "time_per_iteration": 2.617002010345459 + }, + { + "auxiliary_loss_clip": 0.06507252, + "auxiliary_loss_mlp": 0.01284138, + "balance_loss_clip": 0.0629679, + "balance_loss_mlp": 0.01264409, + "epoch": 0.28420261536149105, + "flos": 48110439565440.0, + "grad_norm": 2.259876030173266, + "language_loss": 0.79337573, + "learning_rate": 3.359577169722238e-06, + "loss": 0.87128961, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19763184, + "step": 4727, + "time_per_iteration": 2.774508476257324 + }, + { + "auxiliary_loss_clip": 0.06499238, + "auxiliary_loss_mlp": 0.01275292, + "balance_loss_clip": 0.06294493, + "balance_loss_mlp": 0.01257483, + "epoch": 0.284262738614159, + "flos": 25673224360320.0, + "grad_norm": 2.051338722061539, + "language_loss": 0.67073631, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.74848163, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17810059, + "step": 4728, + "time_per_iteration": 2.614614725112915 + }, + { + "auxiliary_loss_clip": 0.06494898, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06287634, + "balance_loss_mlp": 0.01255702, + "epoch": 0.284322861866827, + "flos": 19725066247680.0, + "grad_norm": 2.0236031999203132, + "language_loss": 0.76682353, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84451514, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.18554688, + "step": 4729, + "time_per_iteration": 2.542400360107422 + }, + { + "auxiliary_loss_clip": 0.06505589, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.0125414, + "epoch": 0.28438298511949495, + "flos": 23922105408000.0, + "grad_norm": 1.7626205541686495, + "language_loss": 0.67443657, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.75222254, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1887207, + "step": 4730, + "time_per_iteration": 2.6005139350891113 + }, + { + "auxiliary_loss_clip": 0.06503962, + "auxiliary_loss_mlp": 0.01275972, + "balance_loss_clip": 0.06292562, + "balance_loss_mlp": 0.01256219, + "epoch": 0.2844431083721629, + "flos": 26074460436480.0, + "grad_norm": 1.9951841893982447, + "language_loss": 0.74777246, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.82557184, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.1973877, + "step": 4731, + "time_per_iteration": 2.571259021759033 + }, + { + "auxiliary_loss_clip": 0.06501718, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06291741, + "balance_loss_mlp": 0.01257384, + "epoch": 0.2845032316248309, + "flos": 25817260728960.0, + "grad_norm": 1.5216025808612688, + "language_loss": 0.8435545, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.92132688, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18139648, + "step": 4732, + "time_per_iteration": 2.604717254638672 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277146, + "balance_loss_clip": 0.06295733, + "balance_loss_mlp": 0.01256082, + "epoch": 0.2845633548774989, + "flos": 19828418659200.0, + "grad_norm": 1.722472955192697, + "language_loss": 0.79522747, + "learning_rate": 3.357862435944109e-06, + "loss": 0.87308168, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.21069336, + "step": 4733, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06511072, + "auxiliary_loss_mlp": 0.01275761, + "balance_loss_clip": 0.06296709, + "balance_loss_mlp": 0.01256878, + "epoch": 0.28462347813016686, + "flos": 23189093890560.0, + "grad_norm": 2.336729990473161, + "language_loss": 0.72093451, + "learning_rate": 3.357576466701875e-06, + "loss": 0.79880273, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.1887207, + "step": 4734, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.06501292, + "auxiliary_loss_mlp": 0.01274129, + "balance_loss_clip": 0.06292972, + "balance_loss_mlp": 0.01256283, + "epoch": 0.2846836013828348, + "flos": 18666316782720.0, + "grad_norm": 1.7839237241912007, + "language_loss": 0.74739748, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.1784668, + "step": 4735, + "time_per_iteration": 2.5192623138427734 + }, + { + "auxiliary_loss_clip": 0.06500865, + "auxiliary_loss_mlp": 0.01274478, + "balance_loss_clip": 0.06291883, + "balance_loss_mlp": 0.01256096, + "epoch": 0.2847437246355028, + "flos": 14178731189760.0, + "grad_norm": 1.8549790130823454, + "language_loss": 0.81047934, + "learning_rate": 3.357004373789946e-06, + "loss": 0.88823277, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18383789, + "step": 4736, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.06503595, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01256285, + "epoch": 0.28480384788817076, + "flos": 29286068054400.0, + "grad_norm": 3.1700593253391895, + "language_loss": 0.60580242, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.68358433, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18310547, + "step": 4737, + "time_per_iteration": 2.591672897338867 + }, + { + "auxiliary_loss_clip": 0.06501776, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06295541, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2848639711408387, + "flos": 22607875244160.0, + "grad_norm": 1.8212806326874897, + "language_loss": 0.86685491, + "learning_rate": 3.356432075047052e-06, + "loss": 0.94461757, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.1763916, + "step": 4738, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.06504256, + "auxiliary_loss_mlp": 0.01280924, + "balance_loss_clip": 0.06291994, + "balance_loss_mlp": 0.01260575, + "epoch": 0.2849240943935067, + "flos": 17604632424960.0, + "grad_norm": 2.187311269731562, + "language_loss": 0.90640962, + "learning_rate": 3.356145848516118e-06, + "loss": 0.98426139, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20336914, + "step": 4739, + "time_per_iteration": 2.491391897201538 + }, + { + "auxiliary_loss_clip": 0.06502014, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06294325, + "balance_loss_mlp": 0.01254363, + "epoch": 0.28498421764617465, + "flos": 24869368615680.0, + "grad_norm": 1.2838984451042732, + "language_loss": 0.72652215, + "learning_rate": 3.355859570559998e-06, + "loss": 0.80426115, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17529297, + "step": 4740, + "time_per_iteration": 2.628420352935791 + }, + { + "auxiliary_loss_clip": 0.06497836, + "auxiliary_loss_mlp": 0.01273023, + "balance_loss_clip": 0.06293581, + "balance_loss_mlp": 0.01254069, + "epoch": 0.2850443408988426, + "flos": 22788947917440.0, + "grad_norm": 1.7372555552312992, + "language_loss": 0.77982342, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.85753202, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1895752, + "step": 4741, + "time_per_iteration": 2.5205776691436768 + }, + { + "auxiliary_loss_clip": 0.06505083, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06290049, + "balance_loss_mlp": 0.01260278, + "epoch": 0.2851044641515106, + "flos": 18850114713600.0, + "grad_norm": 2.3624012556043246, + "language_loss": 0.7702412, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.84808373, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18896484, + "step": 4742, + "time_per_iteration": 2.5852768421173096 + }, + { + "auxiliary_loss_clip": 0.06507465, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06292667, + "balance_loss_mlp": 0.01260252, + "epoch": 0.28516458740417855, + "flos": 18886564039680.0, + "grad_norm": 2.066213096861692, + "language_loss": 0.57976151, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65764809, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.20959473, + "step": 4743, + "time_per_iteration": 2.562298059463501 + }, + { + "auxiliary_loss_clip": 0.06507643, + "auxiliary_loss_mlp": 0.01278324, + "balance_loss_clip": 0.06297275, + "balance_loss_mlp": 0.01259787, + "epoch": 0.2852247106568465, + "flos": 25306592820480.0, + "grad_norm": 1.602300087654556, + "language_loss": 0.75013685, + "learning_rate": 3.354713944700797e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1854248, + "step": 4744, + "time_per_iteration": 2.610302209854126 + }, + { + "auxiliary_loss_clip": 0.06500175, + "auxiliary_loss_mlp": 0.01276557, + "balance_loss_clip": 0.06292172, + "balance_loss_mlp": 0.01258794, + "epoch": 0.2852848339095145, + "flos": 11660080037760.0, + "grad_norm": 2.2644691376510844, + "language_loss": 0.78515136, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86291873, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17749023, + "step": 4745, + "time_per_iteration": 2.5170419216156006 + }, + { + "auxiliary_loss_clip": 0.06491117, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 0.06290857, + "balance_loss_mlp": 0.01254836, + "epoch": 0.2853449571621825, + "flos": 12938280145920.0, + "grad_norm": 1.7221704990089022, + "language_loss": 0.83220983, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.9098506, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18127441, + "step": 4746, + "time_per_iteration": 2.6257071495056152 + }, + { + "auxiliary_loss_clip": 0.06514393, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.06295399, + "balance_loss_mlp": 0.01257943, + "epoch": 0.28540508041485046, + "flos": 20016660856320.0, + "grad_norm": 1.8084134515670756, + "language_loss": 0.80507863, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.88300824, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20617676, + "step": 4747, + "time_per_iteration": 2.5699074268341064 + }, + { + "auxiliary_loss_clip": 0.06375369, + "auxiliary_loss_mlp": 0.0127529, + "balance_loss_clip": 0.0627491, + "balance_loss_mlp": 0.01269043, + "epoch": 0.28546520366751843, + "flos": 68160264710400.0, + "grad_norm": 0.7514031277524565, + "language_loss": 0.60153103, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.67803764, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.06237793, + "step": 4748, + "time_per_iteration": 3.1155877113342285 + }, + { + "auxiliary_loss_clip": 0.06492989, + "auxiliary_loss_mlp": 0.01272874, + "balance_loss_clip": 0.06285426, + "balance_loss_mlp": 0.01255791, + "epoch": 0.2855253269201864, + "flos": 13254961852800.0, + "grad_norm": 2.1744647780903352, + "language_loss": 0.80643219, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.88409078, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17089844, + "step": 4749, + "time_per_iteration": 2.5422439575195312 + }, + { + "auxiliary_loss_clip": 0.06506198, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 0.06295547, + "balance_loss_mlp": 0.0126011, + "epoch": 0.28558545017285436, + "flos": 28628345030400.0, + "grad_norm": 1.9900791940744995, + "language_loss": 0.70889151, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78674042, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18579102, + "step": 4750, + "time_per_iteration": 2.6223177909851074 + }, + { + "auxiliary_loss_clip": 0.06498066, + "auxiliary_loss_mlp": 0.01278692, + "balance_loss_clip": 0.06294224, + "balance_loss_mlp": 0.01261562, + "epoch": 0.2856455734255223, + "flos": 34138901594880.0, + "grad_norm": 1.523200352045364, + "language_loss": 0.82438904, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.90215659, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17138672, + "step": 4751, + "time_per_iteration": 2.710822582244873 + }, + { + "auxiliary_loss_clip": 0.06498431, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01258424, + "epoch": 0.2857056966781903, + "flos": 39795590880000.0, + "grad_norm": 1.6833478059847915, + "language_loss": 0.80598158, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88373208, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1817627, + "step": 4752, + "time_per_iteration": 2.685669422149658 + }, + { + "auxiliary_loss_clip": 0.0649987, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06292621, + "balance_loss_mlp": 0.01254223, + "epoch": 0.28576581993085826, + "flos": 21878846795520.0, + "grad_norm": 1.793038640961372, + "language_loss": 0.79062063, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86834359, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18200684, + "step": 4753, + "time_per_iteration": 2.612639904022217 + }, + { + "auxiliary_loss_clip": 0.06511062, + "auxiliary_loss_mlp": 0.01278051, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01257523, + "epoch": 0.2858259431835262, + "flos": 19096455317760.0, + "grad_norm": 2.5775982542053963, + "language_loss": 0.89774185, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.97563303, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.20532227, + "step": 4754, + "time_per_iteration": 3.914802312850952 + }, + { + "auxiliary_loss_clip": 0.06494384, + "auxiliary_loss_mlp": 0.01278048, + "balance_loss_clip": 0.06293342, + "balance_loss_mlp": 0.01259988, + "epoch": 0.2858860664361942, + "flos": 20339673546240.0, + "grad_norm": 1.9874166310668562, + "language_loss": 0.82672411, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.90444839, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18066406, + "step": 4755, + "time_per_iteration": 2.673158884048462 + }, + { + "auxiliary_loss_clip": 0.06498866, + "auxiliary_loss_mlp": 0.01274185, + "balance_loss_clip": 0.06291682, + "balance_loss_mlp": 0.0125721, + "epoch": 0.28594618968886215, + "flos": 24468551809920.0, + "grad_norm": 1.6562500913369433, + "language_loss": 0.83843541, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91616589, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.16992188, + "step": 4756, + "time_per_iteration": 2.6029391288757324 + }, + { + "auxiliary_loss_clip": 0.06377822, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01256002, + "epoch": 0.2860063129415301, + "flos": 71676170830080.0, + "grad_norm": 1.4612509113917642, + "language_loss": 0.6086607, + "learning_rate": 3.350984987779142e-06, + "loss": 0.68506116, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.06207275, + "step": 4757, + "time_per_iteration": 3.326833963394165 + }, + { + "auxiliary_loss_clip": 0.0650306, + "auxiliary_loss_mlp": 0.01277184, + "balance_loss_clip": 0.06298901, + "balance_loss_mlp": 0.01260459, + "epoch": 0.2860664361941981, + "flos": 20564993975040.0, + "grad_norm": 2.5468639815388996, + "language_loss": 0.66759324, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.74539566, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1673584, + "step": 4758, + "time_per_iteration": 5.454218626022339 + }, + { + "auxiliary_loss_clip": 0.06503905, + "auxiliary_loss_mlp": 0.01277556, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01258817, + "epoch": 0.2861265594468661, + "flos": 36005992997760.0, + "grad_norm": 1.4420872105733484, + "language_loss": 0.63405287, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.71186751, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.1875, + "step": 4759, + "time_per_iteration": 2.745704174041748 + }, + { + "auxiliary_loss_clip": 0.06510226, + "auxiliary_loss_mlp": 0.01276918, + "balance_loss_clip": 0.06302258, + "balance_loss_mlp": 0.01257892, + "epoch": 0.28618668269953407, + "flos": 20053571379840.0, + "grad_norm": 2.14199936751817, + "language_loss": 0.74684435, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82471573, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19030762, + "step": 4760, + "time_per_iteration": 2.541759490966797 + }, + { + "auxiliary_loss_clip": 0.06496474, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01260482, + "epoch": 0.28624680595220203, + "flos": 24978632739840.0, + "grad_norm": 1.8333731861449165, + "language_loss": 0.72652757, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80425525, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.15795898, + "step": 4761, + "time_per_iteration": 2.57940673828125 + }, + { + "auxiliary_loss_clip": 0.06509258, + "auxiliary_loss_mlp": 0.01273154, + "balance_loss_clip": 0.06299996, + "balance_loss_mlp": 0.01256095, + "epoch": 0.28630692920487, + "flos": 22498862682240.0, + "grad_norm": 1.9183655494362113, + "language_loss": 0.74669504, + "learning_rate": 3.349548466945793e-06, + "loss": 0.82451922, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1706543, + "step": 4762, + "time_per_iteration": 2.5321590900421143 + }, + { + "auxiliary_loss_clip": 0.06505883, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 0.06301434, + "balance_loss_mlp": 0.0125694, + "epoch": 0.28636705245753796, + "flos": 21255979870080.0, + "grad_norm": 2.6303759088840413, + "language_loss": 0.76297629, + "learning_rate": 3.349261009210496e-06, + "loss": 0.84077883, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17443848, + "step": 4763, + "time_per_iteration": 3.979782819747925 + }, + { + "auxiliary_loss_clip": 0.06506684, + "auxiliary_loss_mlp": 0.01275654, + "balance_loss_clip": 0.06298703, + "balance_loss_mlp": 0.012572, + "epoch": 0.28642717571020593, + "flos": 24102339540480.0, + "grad_norm": 1.7484925103151405, + "language_loss": 0.77499843, + "learning_rate": 3.348973500311086e-06, + "loss": 0.85282177, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18444824, + "step": 4764, + "time_per_iteration": 2.6036336421966553 + }, + { + "auxiliary_loss_clip": 0.0651267, + "auxiliary_loss_mlp": 0.01277486, + "balance_loss_clip": 0.06302905, + "balance_loss_mlp": 0.01257829, + "epoch": 0.2864872989628739, + "flos": 22607959098240.0, + "grad_norm": 5.154577786286556, + "language_loss": 0.71671587, + "learning_rate": 3.348685940258466e-06, + "loss": 0.79461741, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1965332, + "step": 4765, + "time_per_iteration": 2.5488131046295166 + }, + { + "auxiliary_loss_clip": 0.0651048, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01255684, + "epoch": 0.28654742221554186, + "flos": 32753449860480.0, + "grad_norm": 1.504395922922802, + "language_loss": 0.7630865, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.84091872, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17053223, + "step": 4766, + "time_per_iteration": 2.659499406814575 + }, + { + "auxiliary_loss_clip": 0.0650377, + "auxiliary_loss_mlp": 0.01271145, + "balance_loss_clip": 0.0630042, + "balance_loss_mlp": 0.01254277, + "epoch": 0.2866075454682098, + "flos": 26989257386880.0, + "grad_norm": 2.0841406955827075, + "language_loss": 0.78443938, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86218858, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.16870117, + "step": 4767, + "time_per_iteration": 2.5891125202178955 + }, + { + "auxiliary_loss_clip": 0.06511022, + "auxiliary_loss_mlp": 0.01279425, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01261746, + "epoch": 0.2866676687208778, + "flos": 23259812336640.0, + "grad_norm": 2.0448044221544737, + "language_loss": 0.65430236, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.73220682, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17675781, + "step": 4768, + "time_per_iteration": 2.572230815887451 + }, + { + "auxiliary_loss_clip": 0.0651636, + "auxiliary_loss_mlp": 0.01271508, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01253782, + "epoch": 0.28672779197354575, + "flos": 21586120156800.0, + "grad_norm": 1.6016626643500549, + "language_loss": 0.71173406, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.78961271, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17724609, + "step": 4769, + "time_per_iteration": 2.5180304050445557 + }, + { + "auxiliary_loss_clip": 0.06513099, + "auxiliary_loss_mlp": 0.01273812, + "balance_loss_clip": 0.06304821, + "balance_loss_mlp": 0.01256562, + "epoch": 0.2867879152262137, + "flos": 19871785946880.0, + "grad_norm": 1.7128041826885096, + "language_loss": 0.75347042, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83133948, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17248535, + "step": 4770, + "time_per_iteration": 2.575993537902832 + }, + { + "auxiliary_loss_clip": 0.06514675, + "auxiliary_loss_mlp": 0.01275884, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257967, + "epoch": 0.2868480384788817, + "flos": 28219687868160.0, + "grad_norm": 4.606069071133779, + "language_loss": 0.68064034, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17907715, + "step": 4771, + "time_per_iteration": 2.5533907413482666 + }, + { + "auxiliary_loss_clip": 0.06411134, + "auxiliary_loss_mlp": 0.0125763, + "balance_loss_clip": 0.06311508, + "balance_loss_mlp": 0.01253345, + "epoch": 0.2869081617315497, + "flos": 65442218768640.0, + "grad_norm": 0.7478629548239109, + "language_loss": 0.56696546, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.64365304, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.04290771, + "step": 4772, + "time_per_iteration": 3.1295437812805176 + }, + { + "auxiliary_loss_clip": 0.06515288, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06305212, + "balance_loss_mlp": 0.01256165, + "epoch": 0.28696828498421767, + "flos": 18666610272000.0, + "grad_norm": 3.729070810615603, + "language_loss": 0.84013474, + "learning_rate": 3.346383619630856e-06, + "loss": 0.91803479, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1854248, + "step": 4773, + "time_per_iteration": 2.5181708335876465 + }, + { + "auxiliary_loss_clip": 0.06518447, + "auxiliary_loss_mlp": 0.01274166, + "balance_loss_clip": 0.06306095, + "balance_loss_mlp": 0.01254985, + "epoch": 0.28702840823688563, + "flos": 23666540855040.0, + "grad_norm": 2.856350636496585, + "language_loss": 0.78241181, + "learning_rate": 3.34609559969027e-06, + "loss": 0.86033797, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19177246, + "step": 4774, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06519175, + "auxiliary_loss_mlp": 0.01275468, + "balance_loss_clip": 0.06307949, + "balance_loss_mlp": 0.01255703, + "epoch": 0.2870885314895536, + "flos": 13809248611200.0, + "grad_norm": 1.8762920881530476, + "language_loss": 0.74056339, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.81850982, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.19763184, + "step": 4775, + "time_per_iteration": 2.505293369293213 + }, + { + "auxiliary_loss_clip": 0.06520346, + "auxiliary_loss_mlp": 0.01275844, + "balance_loss_clip": 0.06309157, + "balance_loss_mlp": 0.01258142, + "epoch": 0.28714865474222157, + "flos": 17792790768000.0, + "grad_norm": 1.8823617406689648, + "language_loss": 0.88338864, + "learning_rate": 3.34551940668778e-06, + "loss": 0.96135056, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17687988, + "step": 4776, + "time_per_iteration": 2.5638997554779053 + }, + { + "auxiliary_loss_clip": 0.06511634, + "auxiliary_loss_mlp": 0.01275769, + "balance_loss_clip": 0.06302971, + "balance_loss_mlp": 0.01258269, + "epoch": 0.28720877799488953, + "flos": 16002958429440.0, + "grad_norm": 2.648093963017482, + "language_loss": 0.74451852, + "learning_rate": 3.345231233647726e-06, + "loss": 0.82239252, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17492676, + "step": 4777, + "time_per_iteration": 2.5142223834991455 + }, + { + "auxiliary_loss_clip": 0.06527238, + "auxiliary_loss_mlp": 0.01280106, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01259924, + "epoch": 0.2872689012475575, + "flos": 20929445308800.0, + "grad_norm": 2.200879096052639, + "language_loss": 0.80539143, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.88346487, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20202637, + "step": 4778, + "time_per_iteration": 2.563994884490967 + }, + { + "auxiliary_loss_clip": 0.06511427, + "auxiliary_loss_mlp": 0.01281129, + "balance_loss_clip": 0.06304548, + "balance_loss_mlp": 0.01263223, + "epoch": 0.28732902450022546, + "flos": 21331603779840.0, + "grad_norm": 1.7996465112645923, + "language_loss": 0.73886508, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.8167907, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17895508, + "step": 4779, + "time_per_iteration": 2.5394158363342285 + }, + { + "auxiliary_loss_clip": 0.06518923, + "auxiliary_loss_mlp": 0.01275383, + "balance_loss_clip": 0.06307982, + "balance_loss_mlp": 0.01255379, + "epoch": 0.2873891477528934, + "flos": 20856714364800.0, + "grad_norm": 1.509851280453794, + "language_loss": 0.76844704, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.84639007, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19995117, + "step": 4780, + "time_per_iteration": 2.5928425788879395 + }, + { + "auxiliary_loss_clip": 0.06507713, + "auxiliary_loss_mlp": 0.01271777, + "balance_loss_clip": 0.06302975, + "balance_loss_mlp": 0.01254014, + "epoch": 0.2874492710055614, + "flos": 17425698030720.0, + "grad_norm": 1.6471362454858889, + "language_loss": 0.81874287, + "learning_rate": 3.344078031483784e-06, + "loss": 0.89653778, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17773438, + "step": 4781, + "time_per_iteration": 2.6121537685394287 + }, + { + "auxiliary_loss_clip": 0.06521222, + "auxiliary_loss_mlp": 0.0127902, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01257002, + "epoch": 0.28750939425822936, + "flos": 13411827895680.0, + "grad_norm": 2.0671181517724966, + "language_loss": 0.86987036, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.94787276, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22009277, + "step": 4782, + "time_per_iteration": 2.554326057434082 + }, + { + "auxiliary_loss_clip": 0.06525762, + "auxiliary_loss_mlp": 0.01282396, + "balance_loss_clip": 0.06310341, + "balance_loss_mlp": 0.01262238, + "epoch": 0.2875695175108973, + "flos": 21876205392000.0, + "grad_norm": 1.4282255381090248, + "language_loss": 0.71525908, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20153809, + "step": 4783, + "time_per_iteration": 2.5632100105285645 + }, + { + "auxiliary_loss_clip": 0.06514136, + "auxiliary_loss_mlp": 0.01279499, + "balance_loss_clip": 0.06305264, + "balance_loss_mlp": 0.01259186, + "epoch": 0.2876296407635653, + "flos": 26251885457280.0, + "grad_norm": 1.5568964680804804, + "language_loss": 0.77152872, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84946513, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.203125, + "step": 4784, + "time_per_iteration": 2.589073657989502 + }, + { + "auxiliary_loss_clip": 0.06506136, + "auxiliary_loss_mlp": 0.01278073, + "balance_loss_clip": 0.06301259, + "balance_loss_mlp": 0.01257914, + "epoch": 0.28768976401623325, + "flos": 25380581575680.0, + "grad_norm": 1.5725877671574655, + "language_loss": 0.76106405, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.83890617, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.20153809, + "step": 4785, + "time_per_iteration": 2.6051061153411865 + }, + { + "auxiliary_loss_clip": 0.06513079, + "auxiliary_loss_mlp": 0.0127873, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01259394, + "epoch": 0.28774988726890127, + "flos": 30672232548480.0, + "grad_norm": 2.246179731229797, + "language_loss": 0.83339965, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91131771, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19348145, + "step": 4786, + "time_per_iteration": 2.6064071655273438 + }, + { + "auxiliary_loss_clip": 0.06512371, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06303188, + "balance_loss_mlp": 0.01258934, + "epoch": 0.28781001052156924, + "flos": 20601820644480.0, + "grad_norm": 2.4876341958211037, + "language_loss": 0.80607671, + "learning_rate": 3.342346699429516e-06, + "loss": 0.88398409, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19433594, + "step": 4787, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.06516974, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.0630367, + "balance_loss_mlp": 0.01260191, + "epoch": 0.2878701337742372, + "flos": 26549643340800.0, + "grad_norm": 1.713934654291453, + "language_loss": 0.84188497, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.91985947, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.20288086, + "step": 4788, + "time_per_iteration": 2.610520362854004 + }, + { + "auxiliary_loss_clip": 0.06528202, + "auxiliary_loss_mlp": 0.01278372, + "balance_loss_clip": 0.06311956, + "balance_loss_mlp": 0.01257594, + "epoch": 0.28793025702690517, + "flos": 28154294156160.0, + "grad_norm": 1.8819133496848792, + "language_loss": 0.73887986, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.81694555, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2076416, + "step": 4789, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.06504419, + "auxiliary_loss_mlp": 0.0127649, + "balance_loss_clip": 0.06300576, + "balance_loss_mlp": 0.01259014, + "epoch": 0.28799038027957313, + "flos": 23812254305280.0, + "grad_norm": 1.6484379512289788, + "language_loss": 0.84411776, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92192692, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17492676, + "step": 4790, + "time_per_iteration": 2.5587222576141357 + }, + { + "auxiliary_loss_clip": 0.06518544, + "auxiliary_loss_mlp": 0.01278217, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01259728, + "epoch": 0.2880505035322411, + "flos": 22350340120320.0, + "grad_norm": 1.9872780385985664, + "language_loss": 0.78222489, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.86019248, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18481445, + "step": 4791, + "time_per_iteration": 2.624457359313965 + }, + { + "auxiliary_loss_clip": 0.06518695, + "auxiliary_loss_mlp": 0.01277015, + "balance_loss_clip": 0.06302316, + "balance_loss_mlp": 0.01257286, + "epoch": 0.28811062678490906, + "flos": 18010061205120.0, + "grad_norm": 3.7561845310327002, + "language_loss": 0.71278274, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.79073977, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19726562, + "step": 4792, + "time_per_iteration": 2.5208675861358643 + }, + { + "auxiliary_loss_clip": 0.06512474, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06301394, + "balance_loss_mlp": 0.01258391, + "epoch": 0.28817075003757703, + "flos": 22097416970880.0, + "grad_norm": 1.8001054572072859, + "language_loss": 0.80413318, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.88202471, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18286133, + "step": 4793, + "time_per_iteration": 4.170284271240234 + }, + { + "auxiliary_loss_clip": 0.06499149, + "auxiliary_loss_mlp": 0.01283104, + "balance_loss_clip": 0.06297339, + "balance_loss_mlp": 0.01264484, + "epoch": 0.288230873290245, + "flos": 41692842552960.0, + "grad_norm": 1.6709200510021447, + "language_loss": 0.78107667, + "learning_rate": 3.340324496161797e-06, + "loss": 0.85889918, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.18603516, + "step": 4794, + "time_per_iteration": 2.8557510375976562 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 0.06298079, + "balance_loss_mlp": 0.01260882, + "epoch": 0.28829099654291296, + "flos": 18630328654080.0, + "grad_norm": 2.1208293695579608, + "language_loss": 0.83245766, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91035557, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18652344, + "step": 4795, + "time_per_iteration": 2.535163164138794 + }, + { + "auxiliary_loss_clip": 0.06498718, + "auxiliary_loss_mlp": 0.0128311, + "balance_loss_clip": 0.06297053, + "balance_loss_mlp": 0.01266099, + "epoch": 0.2883511197955809, + "flos": 24680707148160.0, + "grad_norm": 2.078774389913416, + "language_loss": 0.75219119, + "learning_rate": 3.339746266208074e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17004395, + "step": 4796, + "time_per_iteration": 2.567488670349121 + }, + { + "auxiliary_loss_clip": 0.06509424, + "auxiliary_loss_mlp": 0.01276979, + "balance_loss_clip": 0.06296358, + "balance_loss_mlp": 0.01257798, + "epoch": 0.2884112430482489, + "flos": 23118794714880.0, + "grad_norm": 2.1968759883463513, + "language_loss": 0.73290622, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.81077027, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.19189453, + "step": 4797, + "time_per_iteration": 3.975389242172241 + }, + { + "auxiliary_loss_clip": 0.06507025, + "auxiliary_loss_mlp": 0.01273799, + "balance_loss_clip": 0.0629791, + "balance_loss_mlp": 0.0125556, + "epoch": 0.28847136630091685, + "flos": 16879000066560.0, + "grad_norm": 2.2937655739300373, + "language_loss": 0.74862409, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.82643229, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.18212891, + "step": 4798, + "time_per_iteration": 3.9849729537963867 + }, + { + "auxiliary_loss_clip": 0.06517179, + "auxiliary_loss_mlp": 0.01285883, + "balance_loss_clip": 0.06306559, + "balance_loss_mlp": 0.01266381, + "epoch": 0.2885314895535849, + "flos": 25663161870720.0, + "grad_norm": 2.626807334731923, + "language_loss": 0.65891635, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.736947, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19494629, + "step": 4799, + "time_per_iteration": 2.6063008308410645 + }, + { + "auxiliary_loss_clip": 0.06513311, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06300591, + "balance_loss_mlp": 0.01260013, + "epoch": 0.28859161280625284, + "flos": 21113872145280.0, + "grad_norm": 1.5942901452973643, + "language_loss": 0.82659006, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.9045099, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18664551, + "step": 4800, + "time_per_iteration": 2.5522704124450684 + }, + { + "auxiliary_loss_clip": 0.06498213, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 0.06294428, + "balance_loss_mlp": 0.01260609, + "epoch": 0.2886517360589208, + "flos": 26476870469760.0, + "grad_norm": 1.7957021177556654, + "language_loss": 0.91005886, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98781872, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17175293, + "step": 4801, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.06509861, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01260722, + "epoch": 0.28871185931158877, + "flos": 25272365627520.0, + "grad_norm": 1.8432796050129874, + "language_loss": 0.74294543, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82083023, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17895508, + "step": 4802, + "time_per_iteration": 2.5519795417785645 + }, + { + "auxiliary_loss_clip": 0.0639186, + "auxiliary_loss_mlp": 0.01290861, + "balance_loss_clip": 0.06293292, + "balance_loss_mlp": 0.01286456, + "epoch": 0.28877198256425674, + "flos": 66683676061440.0, + "grad_norm": 0.7742675136744124, + "language_loss": 0.62925327, + "learning_rate": 3.337720861641558e-06, + "loss": 0.70608056, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04412842, + "step": 4803, + "time_per_iteration": 4.557742595672607 + }, + { + "auxiliary_loss_clip": 0.06504417, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 0.06297504, + "balance_loss_mlp": 0.01256721, + "epoch": 0.2888321058169247, + "flos": 20309261713920.0, + "grad_norm": 2.312081796144873, + "language_loss": 0.71418971, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.79197359, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17248535, + "step": 4804, + "time_per_iteration": 2.5679221153259277 + }, + { + "auxiliary_loss_clip": 0.06511839, + "auxiliary_loss_mlp": 0.01276786, + "balance_loss_clip": 0.06299883, + "balance_loss_mlp": 0.01258892, + "epoch": 0.28889222906959267, + "flos": 25523192424960.0, + "grad_norm": 2.035708939634364, + "language_loss": 0.68254268, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76042891, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17907715, + "step": 4805, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06510667, + "auxiliary_loss_mlp": 0.01276264, + "balance_loss_clip": 0.06300112, + "balance_loss_mlp": 0.01258955, + "epoch": 0.28895235232226063, + "flos": 32679544959360.0, + "grad_norm": 1.67836402891337, + "language_loss": 0.69622278, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.77409214, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1730957, + "step": 4806, + "time_per_iteration": 2.6661036014556885 + }, + { + "auxiliary_loss_clip": 0.06499489, + "auxiliary_loss_mlp": 0.01273073, + "balance_loss_clip": 0.06297253, + "balance_loss_mlp": 0.01256133, + "epoch": 0.2890124755749286, + "flos": 29722202156160.0, + "grad_norm": 1.5048672267596763, + "language_loss": 0.71718901, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7949146, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16931152, + "step": 4807, + "time_per_iteration": 2.6082210540771484 + }, + { + "auxiliary_loss_clip": 0.06506096, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06298453, + "balance_loss_mlp": 0.01255769, + "epoch": 0.28907259882759656, + "flos": 22681067385600.0, + "grad_norm": 1.6103433555287536, + "language_loss": 0.8189373, + "learning_rate": 3.336272622079382e-06, + "loss": 0.89672995, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17407227, + "step": 4808, + "time_per_iteration": 2.575005292892456 + }, + { + "auxiliary_loss_clip": 0.0649471, + "auxiliary_loss_mlp": 0.01279377, + "balance_loss_clip": 0.06293811, + "balance_loss_mlp": 0.01261543, + "epoch": 0.2891327220802645, + "flos": 22572809510400.0, + "grad_norm": 1.6658984409983257, + "language_loss": 0.79128641, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86902726, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17834473, + "step": 4809, + "time_per_iteration": 2.563202142715454 + }, + { + "auxiliary_loss_clip": 0.06509645, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.06294866, + "balance_loss_mlp": 0.01256411, + "epoch": 0.2891928453329325, + "flos": 21659228444160.0, + "grad_norm": 1.9154470794900575, + "language_loss": 0.79370517, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8715474, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18151855, + "step": 4810, + "time_per_iteration": 2.555290460586548 + }, + { + "auxiliary_loss_clip": 0.06499892, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06295595, + "balance_loss_mlp": 0.01259259, + "epoch": 0.28925296858560046, + "flos": 23228855452800.0, + "grad_norm": 1.5886971021791327, + "language_loss": 0.77595514, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.85371131, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.16467285, + "step": 4811, + "time_per_iteration": 2.5522642135620117 + }, + { + "auxiliary_loss_clip": 0.06509165, + "auxiliary_loss_mlp": 0.01277164, + "balance_loss_clip": 0.06302579, + "balance_loss_mlp": 0.01259497, + "epoch": 0.2893130918382685, + "flos": 28629267425280.0, + "grad_norm": 1.704164513062304, + "language_loss": 0.78002596, + "learning_rate": 3.335113118275117e-06, + "loss": 0.85788929, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17675781, + "step": 4812, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.06384769, + "auxiliary_loss_mlp": 0.01270413, + "balance_loss_clip": 0.06288065, + "balance_loss_mlp": 0.01266965, + "epoch": 0.28937321509093644, + "flos": 72323328240000.0, + "grad_norm": 0.7614773045430072, + "language_loss": 0.60086656, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.67741829, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03457642, + "step": 4813, + "time_per_iteration": 3.3377795219421387 + }, + { + "auxiliary_loss_clip": 0.06503347, + "auxiliary_loss_mlp": 0.01279669, + "balance_loss_clip": 0.0629978, + "balance_loss_mlp": 0.01262253, + "epoch": 0.2894333383436044, + "flos": 16221905948160.0, + "grad_norm": 2.095142654160917, + "language_loss": 0.83059847, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.90842861, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.17407227, + "step": 4814, + "time_per_iteration": 2.519822120666504 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.0128276, + "balance_loss_clip": 0.06297985, + "balance_loss_mlp": 0.01264389, + "epoch": 0.2894934615962724, + "flos": 24835434912000.0, + "grad_norm": 1.4921373382431753, + "language_loss": 0.72583377, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.80376399, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18371582, + "step": 4815, + "time_per_iteration": 2.613424301147461 + }, + { + "auxiliary_loss_clip": 0.06496876, + "auxiliary_loss_mlp": 0.01270189, + "balance_loss_clip": 0.06299625, + "balance_loss_mlp": 0.01253858, + "epoch": 0.28955358484894034, + "flos": 20456400683520.0, + "grad_norm": 1.478095248571898, + "language_loss": 0.71455014, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79222083, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16345215, + "step": 4816, + "time_per_iteration": 2.523789644241333 + }, + { + "auxiliary_loss_clip": 0.0651416, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06301913, + "balance_loss_mlp": 0.01256007, + "epoch": 0.2896137081016083, + "flos": 22571803261440.0, + "grad_norm": 2.1886400582799643, + "language_loss": 0.75928313, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.83716327, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.17834473, + "step": 4817, + "time_per_iteration": 2.5829625129699707 + }, + { + "auxiliary_loss_clip": 0.0650699, + "auxiliary_loss_mlp": 0.0127444, + "balance_loss_clip": 0.06299114, + "balance_loss_mlp": 0.01255486, + "epoch": 0.28967383135427627, + "flos": 26695231009920.0, + "grad_norm": 2.009148210409016, + "language_loss": 0.77384543, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85165972, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18933105, + "step": 4818, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.06511898, + "auxiliary_loss_mlp": 0.01274642, + "balance_loss_clip": 0.063049, + "balance_loss_mlp": 0.01257833, + "epoch": 0.28973395460694423, + "flos": 15563428237440.0, + "grad_norm": 1.8180363278883531, + "language_loss": 0.80166686, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.87953222, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16833496, + "step": 4819, + "time_per_iteration": 2.58598256111145 + }, + { + "auxiliary_loss_clip": 0.06512412, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 0.06301294, + "balance_loss_mlp": 0.01256543, + "epoch": 0.2897940778596122, + "flos": 18703395014400.0, + "grad_norm": 1.8889731698350438, + "language_loss": 0.79784238, + "learning_rate": 3.332791681244776e-06, + "loss": 0.87571859, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18664551, + "step": 4820, + "time_per_iteration": 2.514738082885742 + }, + { + "auxiliary_loss_clip": 0.06519003, + "auxiliary_loss_mlp": 0.01272112, + "balance_loss_clip": 0.06309246, + "balance_loss_mlp": 0.01254612, + "epoch": 0.28985420111228016, + "flos": 18776209812480.0, + "grad_norm": 1.948801074603747, + "language_loss": 0.73537958, + "learning_rate": 3.332501274072231e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17492676, + "step": 4821, + "time_per_iteration": 2.6552352905273438 + }, + { + "auxiliary_loss_clip": 0.06509826, + "auxiliary_loss_mlp": 0.01279091, + "balance_loss_clip": 0.06303322, + "balance_loss_mlp": 0.01260733, + "epoch": 0.28991432436494813, + "flos": 23075511281280.0, + "grad_norm": 1.9415887628712303, + "language_loss": 0.7256397, + "learning_rate": 3.332210816371104e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18347168, + "step": 4822, + "time_per_iteration": 2.5311806201934814 + }, + { + "auxiliary_loss_clip": 0.06508678, + "auxiliary_loss_mlp": 0.0127532, + "balance_loss_clip": 0.06304502, + "balance_loss_mlp": 0.01258237, + "epoch": 0.2899744476176161, + "flos": 17608992837120.0, + "grad_norm": 1.6868082855094653, + "language_loss": 0.66498971, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17077637, + "step": 4823, + "time_per_iteration": 2.5582497119903564 + }, + { + "auxiliary_loss_clip": 0.06507877, + "auxiliary_loss_mlp": 0.0127093, + "balance_loss_clip": 0.06303018, + "balance_loss_mlp": 0.01253728, + "epoch": 0.29003457087028406, + "flos": 22315861365120.0, + "grad_norm": 2.007628710478466, + "language_loss": 0.81589168, + "learning_rate": 3.331629749427164e-06, + "loss": 0.89367974, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.171875, + "step": 4824, + "time_per_iteration": 2.5258595943450928 + }, + { + "auxiliary_loss_clip": 0.06510833, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301483, + "balance_loss_mlp": 0.01258376, + "epoch": 0.2900946941229521, + "flos": 21951493885440.0, + "grad_norm": 1.837693758429887, + "language_loss": 0.73192668, + "learning_rate": 3.331339140206385e-06, + "loss": 0.80979806, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.17932129, + "step": 4825, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.0651435, + "auxiliary_loss_mlp": 0.01275324, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01257049, + "epoch": 0.29015481737562004, + "flos": 17938126874880.0, + "grad_norm": 2.202818652908599, + "language_loss": 0.7426061, + "learning_rate": 3.331048480501092e-06, + "loss": 0.82050288, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18273926, + "step": 4826, + "time_per_iteration": 2.497711420059204 + }, + { + "auxiliary_loss_clip": 0.06516986, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01262141, + "epoch": 0.290214940628288, + "flos": 22790079947520.0, + "grad_norm": 1.934932602801083, + "language_loss": 0.69077051, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.76872945, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.16748047, + "step": 4827, + "time_per_iteration": 2.5729641914367676 + }, + { + "auxiliary_loss_clip": 0.06517433, + "auxiliary_loss_mlp": 0.0127379, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01255646, + "epoch": 0.290275063880956, + "flos": 20011881173760.0, + "grad_norm": 1.8047855406998587, + "language_loss": 0.80766201, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88557422, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.18151855, + "step": 4828, + "time_per_iteration": 2.5190348625183105 + }, + { + "auxiliary_loss_clip": 0.0651058, + "auxiliary_loss_mlp": 0.01278642, + "balance_loss_clip": 0.06308287, + "balance_loss_mlp": 0.01260809, + "epoch": 0.29033518713362394, + "flos": 22060003322880.0, + "grad_norm": 1.646725141321262, + "language_loss": 0.80908686, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8869791, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17822266, + "step": 4829, + "time_per_iteration": 2.564837694168091 + }, + { + "auxiliary_loss_clip": 0.06503877, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.0126059, + "epoch": 0.2903953103862919, + "flos": 25637194304640.0, + "grad_norm": 1.4271698228137566, + "language_loss": 0.82616186, + "learning_rate": 3.329885337055249e-06, + "loss": 0.90397674, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 4830, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.0652103, + "auxiliary_loss_mlp": 0.01280335, + "balance_loss_clip": 0.06313583, + "balance_loss_mlp": 0.01262036, + "epoch": 0.29045543363895987, + "flos": 16951437521280.0, + "grad_norm": 2.247105417787089, + "language_loss": 0.79901475, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.87702841, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18310547, + "step": 4831, + "time_per_iteration": 2.5306637287139893 + }, + { + "auxiliary_loss_clip": 0.06507042, + "auxiliary_loss_mlp": 0.01277723, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01261392, + "epoch": 0.29051555689162784, + "flos": 26402630152320.0, + "grad_norm": 2.3059080747570775, + "language_loss": 0.75331926, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.83116686, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16333008, + "step": 4832, + "time_per_iteration": 2.5603439807891846 + }, + { + "auxiliary_loss_clip": 0.06503655, + "auxiliary_loss_mlp": 0.01283448, + "balance_loss_clip": 0.06302731, + "balance_loss_mlp": 0.01267271, + "epoch": 0.2905756801442958, + "flos": 21109931003520.0, + "grad_norm": 1.626645949157208, + "language_loss": 0.76312864, + "learning_rate": 3.329012449923736e-06, + "loss": 0.8409996, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16162109, + "step": 4833, + "time_per_iteration": 4.029958963394165 + }, + { + "auxiliary_loss_clip": 0.06504881, + "auxiliary_loss_mlp": 0.01280243, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01263363, + "epoch": 0.29063580339696377, + "flos": 15711573456000.0, + "grad_norm": 1.645904053352059, + "language_loss": 0.65383506, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.73168635, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.16882324, + "step": 4834, + "time_per_iteration": 2.5233187675476074 + }, + { + "auxiliary_loss_clip": 0.06499655, + "auxiliary_loss_mlp": 0.01274915, + "balance_loss_clip": 0.06299647, + "balance_loss_mlp": 0.01258893, + "epoch": 0.29069592664963173, + "flos": 24651972397440.0, + "grad_norm": 1.808411103531711, + "language_loss": 0.71914709, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.79689276, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16027832, + "step": 4835, + "time_per_iteration": 2.555670738220215 + }, + { + "auxiliary_loss_clip": 0.06500543, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06299368, + "balance_loss_mlp": 0.01259536, + "epoch": 0.2907560499022997, + "flos": 24980854872960.0, + "grad_norm": 1.750724607078226, + "language_loss": 0.80319953, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88096082, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16052246, + "step": 4836, + "time_per_iteration": 3.9953579902648926 + }, + { + "auxiliary_loss_clip": 0.0650623, + "auxiliary_loss_mlp": 0.01276306, + "balance_loss_clip": 0.06305872, + "balance_loss_mlp": 0.01260236, + "epoch": 0.29081617315496766, + "flos": 18662836838400.0, + "grad_norm": 1.8282626295265978, + "language_loss": 0.81337535, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.89120078, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16064453, + "step": 4837, + "time_per_iteration": 3.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.06508449, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.06305645, + "balance_loss_mlp": 0.01257362, + "epoch": 0.2908762964076356, + "flos": 35339087952000.0, + "grad_norm": 1.819350457328488, + "language_loss": 0.67809796, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75593495, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17895508, + "step": 4838, + "time_per_iteration": 2.6575772762298584 + }, + { + "auxiliary_loss_clip": 0.06511781, + "auxiliary_loss_mlp": 0.01274117, + "balance_loss_clip": 0.06305051, + "balance_loss_mlp": 0.01256688, + "epoch": 0.29093641966030365, + "flos": 23083058148480.0, + "grad_norm": 2.3112745331966185, + "language_loss": 0.71775508, + "learning_rate": 3.327265315259095e-06, + "loss": 0.79561406, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17419434, + "step": 4839, + "time_per_iteration": 2.6057844161987305 + }, + { + "auxiliary_loss_clip": 0.06504601, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258071, + "epoch": 0.2909965429129716, + "flos": 35964260864640.0, + "grad_norm": 1.8988017352340443, + "language_loss": 0.75792682, + "learning_rate": 3.326973949928776e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16503906, + "step": 4840, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.06503059, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06299757, + "balance_loss_mlp": 0.01255417, + "epoch": 0.2910566661656396, + "flos": 30887616268800.0, + "grad_norm": 1.8129671702232821, + "language_loss": 0.60949063, + "learning_rate": 3.326682534279471e-06, + "loss": 0.68724, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16479492, + "step": 4841, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.06506652, + "auxiliary_loss_mlp": 0.01272342, + "balance_loss_clip": 0.06303366, + "balance_loss_mlp": 0.01255021, + "epoch": 0.29111678941830754, + "flos": 30018366812160.0, + "grad_norm": 1.3487344136639734, + "language_loss": 0.71762401, + "learning_rate": 3.326391068322232e-06, + "loss": 0.79541385, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17333984, + "step": 4842, + "time_per_iteration": 4.036385774612427 + }, + { + "auxiliary_loss_clip": 0.06507391, + "auxiliary_loss_mlp": 0.01271836, + "balance_loss_clip": 0.06304808, + "balance_loss_mlp": 0.01256423, + "epoch": 0.2911769126709755, + "flos": 22864110629760.0, + "grad_norm": 1.4808705717301018, + "language_loss": 0.74052906, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.81832135, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.1541748, + "step": 4843, + "time_per_iteration": 2.565093755722046 + }, + { + "auxiliary_loss_clip": 0.06510359, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.06305443, + "balance_loss_mlp": 0.01256742, + "epoch": 0.2912370359236435, + "flos": 21656545113600.0, + "grad_norm": 3.6041214714298806, + "language_loss": 0.5879783, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.66580796, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.15856934, + "step": 4844, + "time_per_iteration": 2.636667490005493 + }, + { + "auxiliary_loss_clip": 0.06518383, + "auxiliary_loss_mlp": 0.01278792, + "balance_loss_clip": 0.06309091, + "balance_loss_mlp": 0.01261566, + "epoch": 0.29129715917631144, + "flos": 22899972977280.0, + "grad_norm": 1.9195914149996331, + "language_loss": 0.86846137, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.94643313, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.17224121, + "step": 4845, + "time_per_iteration": 2.549297571182251 + }, + { + "auxiliary_loss_clip": 0.06508736, + "auxiliary_loss_mlp": 0.01273322, + "balance_loss_clip": 0.06304652, + "balance_loss_mlp": 0.01256144, + "epoch": 0.2913572824289794, + "flos": 22681067385600.0, + "grad_norm": 1.8711717874469986, + "language_loss": 0.67698014, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75480074, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17175293, + "step": 4846, + "time_per_iteration": 2.607025146484375 + }, + { + "auxiliary_loss_clip": 0.06502484, + "auxiliary_loss_mlp": 0.01275425, + "balance_loss_clip": 0.06301165, + "balance_loss_mlp": 0.01258771, + "epoch": 0.29141740568164737, + "flos": 23113260345600.0, + "grad_norm": 4.990917175371688, + "language_loss": 0.708718, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.78649712, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16674805, + "step": 4847, + "time_per_iteration": 2.5293991565704346 + }, + { + "auxiliary_loss_clip": 0.06504785, + "auxiliary_loss_mlp": 0.01278673, + "balance_loss_clip": 0.06301495, + "balance_loss_mlp": 0.01261877, + "epoch": 0.29147752893431533, + "flos": 23593851838080.0, + "grad_norm": 1.4565796817402286, + "language_loss": 0.74258435, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82041889, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16796875, + "step": 4848, + "time_per_iteration": 2.585296630859375 + }, + { + "auxiliary_loss_clip": 0.06502895, + "auxiliary_loss_mlp": 0.01276049, + "balance_loss_clip": 0.06298006, + "balance_loss_mlp": 0.01259729, + "epoch": 0.2915376521869833, + "flos": 20597753721600.0, + "grad_norm": 2.1223800155182624, + "language_loss": 0.77561575, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.85340518, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.16333008, + "step": 4849, + "time_per_iteration": 2.4936819076538086 + }, + { + "auxiliary_loss_clip": 0.06514408, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01257723, + "epoch": 0.29159777543965126, + "flos": 20817414000000.0, + "grad_norm": 1.652469266745217, + "language_loss": 0.79415965, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87204546, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16442871, + "step": 4850, + "time_per_iteration": 2.55340313911438 + }, + { + "auxiliary_loss_clip": 0.06494947, + "auxiliary_loss_mlp": 0.0127524, + "balance_loss_clip": 0.06295137, + "balance_loss_mlp": 0.01258479, + "epoch": 0.29165789869231923, + "flos": 24251155591680.0, + "grad_norm": 1.7747423674847125, + "language_loss": 0.76365012, + "learning_rate": 3.323765612674296e-06, + "loss": 0.84135199, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16748047, + "step": 4851, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.06499958, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06300404, + "balance_loss_mlp": 0.01256929, + "epoch": 0.29171802194498725, + "flos": 28957562922240.0, + "grad_norm": 1.3481127708223366, + "language_loss": 0.7781775, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.85590267, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.15612793, + "step": 4852, + "time_per_iteration": 2.6266329288482666 + }, + { + "auxiliary_loss_clip": 0.06501517, + "auxiliary_loss_mlp": 0.0127959, + "balance_loss_clip": 0.06297216, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2917781451976552, + "flos": 22604269518720.0, + "grad_norm": 1.5006442804531215, + "language_loss": 0.78676021, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.86457133, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17883301, + "step": 4853, + "time_per_iteration": 2.5417568683624268 + }, + { + "auxiliary_loss_clip": 0.06501997, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06296347, + "balance_loss_mlp": 0.01253818, + "epoch": 0.2918382684503232, + "flos": 21579956881920.0, + "grad_norm": 4.190137743849971, + "language_loss": 0.88580358, + "learning_rate": 3.322889556841445e-06, + "loss": 0.96353114, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.16943359, + "step": 4854, + "time_per_iteration": 2.537247896194458 + }, + { + "auxiliary_loss_clip": 0.06492339, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01255517, + "epoch": 0.29189839170299114, + "flos": 24360503569920.0, + "grad_norm": 1.79615422427109, + "language_loss": 0.86863208, + "learning_rate": 3.322597437887519e-06, + "loss": 0.94629866, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18798828, + "step": 4855, + "time_per_iteration": 2.5408217906951904 + }, + { + "auxiliary_loss_clip": 0.06394155, + "auxiliary_loss_mlp": 0.01254999, + "balance_loss_clip": 0.0629582, + "balance_loss_mlp": 0.01250765, + "epoch": 0.2919585149556591, + "flos": 71338693311360.0, + "grad_norm": 0.8469602753394808, + "language_loss": 0.60232264, + "learning_rate": 3.322305268780566e-06, + "loss": 0.67881417, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.04238892, + "step": 4856, + "time_per_iteration": 3.245720863342285 + }, + { + "auxiliary_loss_clip": 0.06496054, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 0.06293447, + "balance_loss_mlp": 0.01254966, + "epoch": 0.2920186382083271, + "flos": 15638716730880.0, + "grad_norm": 1.9340338412348166, + "language_loss": 0.69134986, + "learning_rate": 3.322013049531664e-06, + "loss": 0.76902497, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.16479492, + "step": 4857, + "time_per_iteration": 2.492515802383423 + }, + { + "auxiliary_loss_clip": 0.0649875, + "auxiliary_loss_mlp": 0.01275648, + "balance_loss_clip": 0.06298544, + "balance_loss_mlp": 0.01258863, + "epoch": 0.29207876146099504, + "flos": 28373535164160.0, + "grad_norm": 2.0544380804392346, + "language_loss": 0.84425288, + "learning_rate": 3.321720780151895e-06, + "loss": 0.92199689, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16772461, + "step": 4858, + "time_per_iteration": 2.596036434173584 + }, + { + "auxiliary_loss_clip": 0.06500848, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06300872, + "balance_loss_mlp": 0.01257879, + "epoch": 0.292138884713663, + "flos": 21877295495040.0, + "grad_norm": 1.6880642207641439, + "language_loss": 0.781169, + "learning_rate": 3.321428460652342e-06, + "loss": 0.85892725, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17102051, + "step": 4859, + "time_per_iteration": 2.5885818004608154 + }, + { + "auxiliary_loss_clip": 0.06508546, + "auxiliary_loss_mlp": 0.01274065, + "balance_loss_clip": 0.06301034, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29219900796633097, + "flos": 20998277038080.0, + "grad_norm": 2.276956308498861, + "language_loss": 0.68823123, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.76605731, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17224121, + "step": 4860, + "time_per_iteration": 2.6006133556365967 + }, + { + "auxiliary_loss_clip": 0.06497137, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01256743, + "epoch": 0.29225913121899894, + "flos": 35012930734080.0, + "grad_norm": 1.9621079535677741, + "language_loss": 0.75927335, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83697826, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16625977, + "step": 4861, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06498605, + "auxiliary_loss_mlp": 0.01278705, + "balance_loss_clip": 0.06298269, + "balance_loss_mlp": 0.0126229, + "epoch": 0.2923192544716669, + "flos": 13520588895360.0, + "grad_norm": 2.4944662876521027, + "language_loss": 0.91953582, + "learning_rate": 3.320551201545832e-06, + "loss": 0.99730897, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16418457, + "step": 4862, + "time_per_iteration": 2.523393392562866 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01258325, + "epoch": 0.29237937772433487, + "flos": 19469543621760.0, + "grad_norm": 2.367835349845546, + "language_loss": 0.74302417, + "learning_rate": 3.320258681678008e-06, + "loss": 0.82076436, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16809082, + "step": 4863, + "time_per_iteration": 2.5615665912628174 + }, + { + "auxiliary_loss_clip": 0.06495367, + "auxiliary_loss_mlp": 0.01274458, + "balance_loss_clip": 0.06298485, + "balance_loss_mlp": 0.01257041, + "epoch": 0.29243950097700283, + "flos": 20856965927040.0, + "grad_norm": 1.6096808438714836, + "language_loss": 0.78180861, + "learning_rate": 3.319966111745842e-06, + "loss": 0.85950685, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.17419434, + "step": 4864, + "time_per_iteration": 2.543239116668701 + }, + { + "auxiliary_loss_clip": 0.06506015, + "auxiliary_loss_mlp": 0.01278091, + "balance_loss_clip": 0.06299396, + "balance_loss_mlp": 0.01260127, + "epoch": 0.29249962422967085, + "flos": 23590581528960.0, + "grad_norm": 1.7200803595236853, + "language_loss": 0.82166076, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8995018, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1796875, + "step": 4865, + "time_per_iteration": 2.6162562370300293 + }, + { + "auxiliary_loss_clip": 0.06504746, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06300808, + "balance_loss_mlp": 0.01258783, + "epoch": 0.2925597474823388, + "flos": 22279915163520.0, + "grad_norm": 1.8207973709117147, + "language_loss": 0.85861242, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.93643779, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18994141, + "step": 4866, + "time_per_iteration": 2.5991125106811523 + }, + { + "auxiliary_loss_clip": 0.06498669, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06298468, + "balance_loss_mlp": 0.0126005, + "epoch": 0.2926198707350068, + "flos": 34464136417920.0, + "grad_norm": 1.677629799943763, + "language_loss": 0.76065934, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17578125, + "step": 4867, + "time_per_iteration": 2.652083396911621 + }, + { + "auxiliary_loss_clip": 0.06508122, + "auxiliary_loss_mlp": 0.01277995, + "balance_loss_clip": 0.06302974, + "balance_loss_mlp": 0.01260483, + "epoch": 0.29267999398767475, + "flos": 20710413936000.0, + "grad_norm": 2.5581846543962197, + "language_loss": 0.73412025, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81198144, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.1751709, + "step": 4868, + "time_per_iteration": 2.5104074478149414 + }, + { + "auxiliary_loss_clip": 0.06504919, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 0.06304781, + "balance_loss_mlp": 0.01256558, + "epoch": 0.2927401172403427, + "flos": 18374470611840.0, + "grad_norm": 1.376823387605754, + "language_loss": 0.74768585, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.17260742, + "step": 4869, + "time_per_iteration": 2.517545461654663 + }, + { + "auxiliary_loss_clip": 0.06509744, + "auxiliary_loss_mlp": 0.01275578, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01258627, + "epoch": 0.2928002404930107, + "flos": 26111203251840.0, + "grad_norm": 1.453461002371515, + "language_loss": 0.76538026, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84323347, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.16931152, + "step": 4870, + "time_per_iteration": 2.571554183959961 + }, + { + "auxiliary_loss_clip": 0.06512202, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06304315, + "balance_loss_mlp": 0.01255967, + "epoch": 0.29286036374567864, + "flos": 21331142582400.0, + "grad_norm": 3.1299518178223726, + "language_loss": 0.67793286, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.75579637, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18188477, + "step": 4871, + "time_per_iteration": 2.5867390632629395 + }, + { + "auxiliary_loss_clip": 0.06504084, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 0.0630291, + "balance_loss_mlp": 0.01256973, + "epoch": 0.2929204869983466, + "flos": 29577117611520.0, + "grad_norm": 1.7840080197301964, + "language_loss": 0.78071094, + "learning_rate": 3.317623751303933e-06, + "loss": 0.85847723, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.15588379, + "step": 4872, + "time_per_iteration": 2.598357915878296 + }, + { + "auxiliary_loss_clip": 0.06511893, + "auxiliary_loss_mlp": 0.01279899, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260313, + "epoch": 0.2929806102510146, + "flos": 19063569790080.0, + "grad_norm": 1.7763964443019538, + "language_loss": 0.72879624, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.19580078, + "step": 4873, + "time_per_iteration": 3.9404540061950684 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01274077, + "balance_loss_clip": 0.06303495, + "balance_loss_mlp": 0.01256386, + "epoch": 0.29304073350368254, + "flos": 21950613417600.0, + "grad_norm": 1.85182595241139, + "language_loss": 0.79023468, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86808634, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17675781, + "step": 4874, + "time_per_iteration": 2.523942470550537 + }, + { + "auxiliary_loss_clip": 0.06517696, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 0.06305568, + "balance_loss_mlp": 0.01255315, + "epoch": 0.2931008567563505, + "flos": 15456302392320.0, + "grad_norm": 2.3441988108556377, + "language_loss": 0.7791701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.85707539, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17504883, + "step": 4875, + "time_per_iteration": 2.4990556240081787 + }, + { + "auxiliary_loss_clip": 0.06506883, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 0.06301031, + "balance_loss_mlp": 0.01263252, + "epoch": 0.29316098000901847, + "flos": 16988893096320.0, + "grad_norm": 1.859745338516673, + "language_loss": 0.70031023, + "learning_rate": 3.316451371581431e-06, + "loss": 0.77818477, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17321777, + "step": 4876, + "time_per_iteration": 5.4681243896484375 + }, + { + "auxiliary_loss_clip": 0.06504045, + "auxiliary_loss_mlp": 0.01275518, + "balance_loss_clip": 0.06302452, + "balance_loss_mlp": 0.01259174, + "epoch": 0.29322110326168643, + "flos": 16362462372480.0, + "grad_norm": 1.8247622937841679, + "language_loss": 0.82480925, + "learning_rate": 3.316158151823096e-06, + "loss": 0.90260488, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16345215, + "step": 4877, + "time_per_iteration": 2.5517635345458984 + }, + { + "auxiliary_loss_clip": 0.06509132, + "auxiliary_loss_mlp": 0.01278665, + "balance_loss_clip": 0.06299806, + "balance_loss_mlp": 0.0126064, + "epoch": 0.29328122651435445, + "flos": 13996023361920.0, + "grad_norm": 2.6416558700601334, + "language_loss": 0.6810987, + "learning_rate": 3.315864882155911e-06, + "loss": 0.75897658, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18017578, + "step": 4878, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.0649902, + "auxiliary_loss_mlp": 0.01275226, + "balance_loss_clip": 0.06298085, + "balance_loss_mlp": 0.01257697, + "epoch": 0.2933413497670224, + "flos": 25271569013760.0, + "grad_norm": 1.8820124674491874, + "language_loss": 0.74030542, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.81804794, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17510986, + "step": 4879, + "time_per_iteration": 2.6044318675994873 + }, + { + "auxiliary_loss_clip": 0.06501681, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.0629803, + "balance_loss_mlp": 0.01259187, + "epoch": 0.2934014730196904, + "flos": 32131840746240.0, + "grad_norm": 2.9151820016542183, + "language_loss": 0.67178017, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.7495712, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18237305, + "step": 4880, + "time_per_iteration": 2.603761672973633 + }, + { + "auxiliary_loss_clip": 0.06503071, + "auxiliary_loss_mlp": 0.01271949, + "balance_loss_clip": 0.0629775, + "balance_loss_mlp": 0.01255367, + "epoch": 0.29346159627235835, + "flos": 24359329612800.0, + "grad_norm": 2.6105900749093633, + "language_loss": 0.71260536, + "learning_rate": 3.314984773812481e-06, + "loss": 0.79035556, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.16577148, + "step": 4881, + "time_per_iteration": 2.593226432800293 + }, + { + "auxiliary_loss_clip": 0.06502824, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298223, + "balance_loss_mlp": 0.01256603, + "epoch": 0.2935217195250263, + "flos": 22753253278080.0, + "grad_norm": 1.6618295774620153, + "language_loss": 0.83893931, + "learning_rate": 3.314691304621127e-06, + "loss": 0.91672039, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18688965, + "step": 4882, + "time_per_iteration": 3.9488399028778076 + }, + { + "auxiliary_loss_clip": 0.06502259, + "auxiliary_loss_mlp": 0.01273532, + "balance_loss_clip": 0.06293593, + "balance_loss_mlp": 0.01255961, + "epoch": 0.2935818427776943, + "flos": 21731959388160.0, + "grad_norm": 4.210124979545191, + "language_loss": 0.72920972, + "learning_rate": 3.314397785576548e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17565918, + "step": 4883, + "time_per_iteration": 2.557283878326416 + }, + { + "auxiliary_loss_clip": 0.06496279, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 0.06292833, + "balance_loss_mlp": 0.01257103, + "epoch": 0.29364196603036224, + "flos": 23811667326720.0, + "grad_norm": 2.0649535872154217, + "language_loss": 0.93051624, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.00822163, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.17150879, + "step": 4884, + "time_per_iteration": 2.5359458923339844 + }, + { + "auxiliary_loss_clip": 0.06506841, + "auxiliary_loss_mlp": 0.01273123, + "balance_loss_clip": 0.06302871, + "balance_loss_mlp": 0.01255409, + "epoch": 0.2937020892830302, + "flos": 23475615327360.0, + "grad_norm": 2.6201562161688017, + "language_loss": 0.73813069, + "learning_rate": 3.313810597972234e-06, + "loss": 0.81593031, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17712402, + "step": 4885, + "time_per_iteration": 2.547731637954712 + }, + { + "auxiliary_loss_clip": 0.06506574, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01253936, + "epoch": 0.2937622125356982, + "flos": 24278422896000.0, + "grad_norm": 2.0067568315745907, + "language_loss": 0.8568837, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93466175, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.1730957, + "step": 4886, + "time_per_iteration": 2.5345749855041504 + }, + { + "auxiliary_loss_clip": 0.06516494, + "auxiliary_loss_mlp": 0.01282352, + "balance_loss_clip": 0.06309356, + "balance_loss_mlp": 0.01266223, + "epoch": 0.29382233578836614, + "flos": 20667843262080.0, + "grad_norm": 2.2972144011917863, + "language_loss": 0.7819618, + "learning_rate": 3.313223211088603e-06, + "loss": 0.85995024, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16137695, + "step": 4887, + "time_per_iteration": 2.5718464851379395 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01281343, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263962, + "epoch": 0.2938824590410341, + "flos": 16550662642560.0, + "grad_norm": 2.5346543108244366, + "language_loss": 0.80135798, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.87925565, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.1739502, + "step": 4888, + "time_per_iteration": 2.5823678970336914 + }, + { + "auxiliary_loss_clip": 0.06512221, + "auxiliary_loss_mlp": 0.01274662, + "balance_loss_clip": 0.06308408, + "balance_loss_mlp": 0.01257878, + "epoch": 0.29394258229370207, + "flos": 37934620824960.0, + "grad_norm": 1.521834171262281, + "language_loss": 0.55984998, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63771886, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16784668, + "step": 4889, + "time_per_iteration": 2.6925320625305176 + }, + { + "auxiliary_loss_clip": 0.06519246, + "auxiliary_loss_mlp": 0.01278013, + "balance_loss_clip": 0.06313413, + "balance_loss_mlp": 0.0126056, + "epoch": 0.29400270554637004, + "flos": 20050384924800.0, + "grad_norm": 1.7589662768394465, + "language_loss": 0.85257453, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93054712, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17443848, + "step": 4890, + "time_per_iteration": 2.546391010284424 + }, + { + "auxiliary_loss_clip": 0.06513973, + "auxiliary_loss_mlp": 0.01284253, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01266288, + "epoch": 0.294062828799038, + "flos": 15271498212480.0, + "grad_norm": 1.9077501912209676, + "language_loss": 0.73679662, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.81477886, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.17956543, + "step": 4891, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.06519526, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_clip": 0.06312989, + "balance_loss_mlp": 0.01267468, + "epoch": 0.294122952051706, + "flos": 22753714475520.0, + "grad_norm": 1.802215562222595, + "language_loss": 0.77636111, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.85441071, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17974854, + "step": 4892, + "time_per_iteration": 2.556626796722412 + }, + { + "auxiliary_loss_clip": 0.06508264, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06305899, + "balance_loss_mlp": 0.01257096, + "epoch": 0.294183075304374, + "flos": 24979848624000.0, + "grad_norm": 1.857019535889917, + "language_loss": 0.78546309, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.86329335, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.17675781, + "step": 4893, + "time_per_iteration": 2.5583088397979736 + }, + { + "auxiliary_loss_clip": 0.06512541, + "auxiliary_loss_mlp": 0.01279131, + "balance_loss_clip": 0.06308632, + "balance_loss_mlp": 0.01262764, + "epoch": 0.29424319855704195, + "flos": 30960347212800.0, + "grad_norm": 7.778949224672863, + "language_loss": 0.85594332, + "learning_rate": 3.311165788957864e-06, + "loss": 0.93386006, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16357422, + "step": 4894, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.06515005, + "auxiliary_loss_mlp": 0.01277674, + "balance_loss_clip": 0.06308285, + "balance_loss_mlp": 0.01260639, + "epoch": 0.2943033218097099, + "flos": 15236977530240.0, + "grad_norm": 2.7328127009682617, + "language_loss": 0.91485763, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99278444, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17028809, + "step": 4895, + "time_per_iteration": 2.499884605407715 + }, + { + "auxiliary_loss_clip": 0.06521617, + "auxiliary_loss_mlp": 0.01275591, + "balance_loss_clip": 0.06309959, + "balance_loss_mlp": 0.01257519, + "epoch": 0.2943634450623779, + "flos": 21732336731520.0, + "grad_norm": 1.9156960384195119, + "language_loss": 0.86768568, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.94565773, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18078613, + "step": 4896, + "time_per_iteration": 2.5482704639434814 + }, + { + "auxiliary_loss_clip": 0.06512056, + "auxiliary_loss_mlp": 0.01275376, + "balance_loss_clip": 0.06306215, + "balance_loss_mlp": 0.01257996, + "epoch": 0.29442356831504585, + "flos": 22608797639040.0, + "grad_norm": 2.0283086901116354, + "language_loss": 0.73915696, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.81703126, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.17382812, + "step": 4897, + "time_per_iteration": 2.5434658527374268 + }, + { + "auxiliary_loss_clip": 0.0652054, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06307404, + "balance_loss_mlp": 0.01262378, + "epoch": 0.2944836915677138, + "flos": 20017625178240.0, + "grad_norm": 1.9321922101744466, + "language_loss": 0.74697995, + "learning_rate": 3.309989025093813e-06, + "loss": 0.82499176, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18261719, + "step": 4898, + "time_per_iteration": 2.5770161151885986 + }, + { + "auxiliary_loss_clip": 0.06516017, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 0.06305353, + "balance_loss_mlp": 0.01259586, + "epoch": 0.2945438148203818, + "flos": 20051768517120.0, + "grad_norm": 2.462097706840479, + "language_loss": 0.71617198, + "learning_rate": 3.309694709912618e-06, + "loss": 0.79411781, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4899, + "time_per_iteration": 2.5297536849975586 + }, + { + "auxiliary_loss_clip": 0.06510775, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06304912, + "balance_loss_mlp": 0.01259727, + "epoch": 0.29460393807304974, + "flos": 23740487683200.0, + "grad_norm": 9.70716698994663, + "language_loss": 0.79828262, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.87617099, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18322754, + "step": 4900, + "time_per_iteration": 2.589350461959839 + }, + { + "auxiliary_loss_clip": 0.06501958, + "auxiliary_loss_mlp": 0.01277561, + "balance_loss_clip": 0.06297968, + "balance_loss_mlp": 0.01260025, + "epoch": 0.2946640613257177, + "flos": 14981412977280.0, + "grad_norm": 1.6788003410312407, + "language_loss": 0.81419849, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89199364, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.1751709, + "step": 4901, + "time_per_iteration": 2.4958457946777344 + }, + { + "auxiliary_loss_clip": 0.06498285, + "auxiliary_loss_mlp": 0.01276891, + "balance_loss_clip": 0.0630265, + "balance_loss_mlp": 0.01261095, + "epoch": 0.2947241845783857, + "flos": 24250862102400.0, + "grad_norm": 2.051988062923015, + "language_loss": 0.58211619, + "learning_rate": 3.308811466431157e-06, + "loss": 0.659868, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.15783691, + "step": 4902, + "time_per_iteration": 2.5867393016815186 + }, + { + "auxiliary_loss_clip": 0.06509895, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 0.06304582, + "balance_loss_mlp": 0.01261825, + "epoch": 0.29478430783105364, + "flos": 19944600744960.0, + "grad_norm": 1.670035021285574, + "language_loss": 0.75883406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.83671534, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.16418457, + "step": 4903, + "time_per_iteration": 2.5120930671691895 + }, + { + "auxiliary_loss_clip": 0.06499215, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01255612, + "epoch": 0.2948444310837216, + "flos": 27388774454400.0, + "grad_norm": 1.8166217426315454, + "language_loss": 0.6305517, + "learning_rate": 3.3082223892736e-06, + "loss": 0.7082777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17773438, + "step": 4904, + "time_per_iteration": 2.610600709915161 + }, + { + "auxiliary_loss_clip": 0.06509106, + "auxiliary_loss_mlp": 0.01272684, + "balance_loss_clip": 0.06301488, + "balance_loss_mlp": 0.01255983, + "epoch": 0.2949045543363896, + "flos": 23412401821440.0, + "grad_norm": 1.721115639485294, + "language_loss": 0.73724848, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.8150664, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.16711426, + "step": 4905, + "time_per_iteration": 2.5330429077148438 + }, + { + "auxiliary_loss_clip": 0.06501255, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06297939, + "balance_loss_mlp": 0.01252508, + "epoch": 0.2949646775890576, + "flos": 23958303171840.0, + "grad_norm": 1.607284793713989, + "language_loss": 0.81930244, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.89701641, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17614746, + "step": 4906, + "time_per_iteration": 2.5717568397521973 + }, + { + "auxiliary_loss_clip": 0.06499709, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06300811, + "balance_loss_mlp": 0.01254051, + "epoch": 0.29502480084172555, + "flos": 22791002342400.0, + "grad_norm": 1.8767623479937394, + "language_loss": 0.88041449, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95811397, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 4907, + "time_per_iteration": 2.532233238220215 + }, + { + "auxiliary_loss_clip": 0.06504819, + "auxiliary_loss_mlp": 0.01277393, + "balance_loss_clip": 0.06294614, + "balance_loss_mlp": 0.01257592, + "epoch": 0.2950849240943935, + "flos": 19652838428160.0, + "grad_norm": 2.2863974346720837, + "language_loss": 0.82530308, + "learning_rate": 3.307043639752782e-06, + "loss": 0.90312517, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.19812012, + "step": 4908, + "time_per_iteration": 2.6338536739349365 + }, + { + "auxiliary_loss_clip": 0.06393203, + "auxiliary_loss_mlp": 0.01256311, + "balance_loss_clip": 0.06296152, + "balance_loss_mlp": 0.01251251, + "epoch": 0.2951450473470615, + "flos": 71021062010880.0, + "grad_norm": 0.749349843123412, + "language_loss": 0.57384133, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.65033644, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.05059814, + "step": 4909, + "time_per_iteration": 3.0084846019744873 + }, + { + "auxiliary_loss_clip": 0.06500423, + "auxiliary_loss_mlp": 0.01279147, + "balance_loss_clip": 0.06298146, + "balance_loss_mlp": 0.0126278, + "epoch": 0.29520517059972945, + "flos": 22972955483520.0, + "grad_norm": 1.5167904233162786, + "language_loss": 0.87274551, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.9505412, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16381836, + "step": 4910, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.06494174, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06294993, + "balance_loss_mlp": 0.01255017, + "epoch": 0.2952652938523974, + "flos": 20491969541760.0, + "grad_norm": 1.9871602841434197, + "language_loss": 0.72998595, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.80764621, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.16821289, + "step": 4911, + "time_per_iteration": 2.5274527072906494 + }, + { + "auxiliary_loss_clip": 0.06493053, + "auxiliary_loss_mlp": 0.01276167, + "balance_loss_clip": 0.06295265, + "balance_loss_mlp": 0.01260122, + "epoch": 0.2953254171050654, + "flos": 19652754574080.0, + "grad_norm": 1.8153147203758204, + "language_loss": 0.90350848, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.98120075, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16040039, + "step": 4912, + "time_per_iteration": 4.015045881271362 + }, + { + "auxiliary_loss_clip": 0.06500725, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06298609, + "balance_loss_mlp": 0.01256474, + "epoch": 0.29538554035773334, + "flos": 22754678797440.0, + "grad_norm": 1.456675217678442, + "language_loss": 0.83491737, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.91266304, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17370605, + "step": 4913, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01271149, + "balance_loss_clip": 0.06297807, + "balance_loss_mlp": 0.01255163, + "epoch": 0.2954456636104013, + "flos": 21878343671040.0, + "grad_norm": 1.7751266266229593, + "language_loss": 0.77296054, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85066384, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.15991211, + "step": 4914, + "time_per_iteration": 2.5379679203033447 + }, + { + "auxiliary_loss_clip": 0.06494316, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01259563, + "epoch": 0.2955057868630693, + "flos": 40452056092800.0, + "grad_norm": 1.8412710776020966, + "language_loss": 0.81848276, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89618844, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16674805, + "step": 4915, + "time_per_iteration": 4.123507261276245 + }, + { + "auxiliary_loss_clip": 0.06504083, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01260707, + "epoch": 0.29556591011573724, + "flos": 22571006647680.0, + "grad_norm": 1.7265680083109098, + "language_loss": 0.85337454, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.93119645, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1739502, + "step": 4916, + "time_per_iteration": 3.964902400970459 + }, + { + "auxiliary_loss_clip": 0.06496175, + "auxiliary_loss_mlp": 0.01273483, + "balance_loss_clip": 0.06292706, + "balance_loss_mlp": 0.01257187, + "epoch": 0.2956260333684052, + "flos": 22095572181120.0, + "grad_norm": 2.6877460244099254, + "language_loss": 0.71410239, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.79179895, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16296387, + "step": 4917, + "time_per_iteration": 2.510061502456665 + }, + { + "auxiliary_loss_clip": 0.06495264, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01255771, + "epoch": 0.2956861566210732, + "flos": 16441063102080.0, + "grad_norm": 1.9904514264943383, + "language_loss": 0.9154985, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.99318182, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.1730957, + "step": 4918, + "time_per_iteration": 2.5177812576293945 + }, + { + "auxiliary_loss_clip": 0.06500694, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.0629639, + "balance_loss_mlp": 0.01252887, + "epoch": 0.2957462798737412, + "flos": 25819189372800.0, + "grad_norm": 2.9632565132584587, + "language_loss": 0.73171133, + "learning_rate": 3.303797991757425e-06, + "loss": 0.80942631, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.17919922, + "step": 4919, + "time_per_iteration": 2.548271656036377 + }, + { + "auxiliary_loss_clip": 0.06494663, + "auxiliary_loss_mlp": 0.01276246, + "balance_loss_clip": 0.062939, + "balance_loss_mlp": 0.01259104, + "epoch": 0.29580640312640916, + "flos": 16696459946880.0, + "grad_norm": 2.067015346809242, + "language_loss": 0.76653767, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84424675, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17138672, + "step": 4920, + "time_per_iteration": 2.5283315181732178 + }, + { + "auxiliary_loss_clip": 0.06505087, + "auxiliary_loss_mlp": 0.01280613, + "balance_loss_clip": 0.06298134, + "balance_loss_mlp": 0.01262886, + "epoch": 0.2958665263790771, + "flos": 23951427137280.0, + "grad_norm": 2.1683803944953786, + "language_loss": 0.69314063, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.77099764, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17724609, + "step": 4921, + "time_per_iteration": 3.9904286861419678 + }, + { + "auxiliary_loss_clip": 0.06507339, + "auxiliary_loss_mlp": 0.01279047, + "balance_loss_clip": 0.06297763, + "balance_loss_mlp": 0.01261023, + "epoch": 0.2959266496317451, + "flos": 18484279787520.0, + "grad_norm": 1.8551497184563221, + "language_loss": 0.75478184, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.83264565, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18017578, + "step": 4922, + "time_per_iteration": 2.5025644302368164 + }, + { + "auxiliary_loss_clip": 0.06508595, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06298192, + "balance_loss_mlp": 0.01258051, + "epoch": 0.29598677288441305, + "flos": 25964525479680.0, + "grad_norm": 1.7877276864194063, + "language_loss": 0.77317607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85103309, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19067383, + "step": 4923, + "time_per_iteration": 2.57328462600708 + }, + { + "auxiliary_loss_clip": 0.06498858, + "auxiliary_loss_mlp": 0.01279587, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01262016, + "epoch": 0.296046896137081, + "flos": 25163101503360.0, + "grad_norm": 2.2992847921393174, + "language_loss": 0.8687042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94648862, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17565918, + "step": 4924, + "time_per_iteration": 2.569819450378418 + }, + { + "auxiliary_loss_clip": 0.06495638, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 0.06293976, + "balance_loss_mlp": 0.01256891, + "epoch": 0.296107019389749, + "flos": 21767402465280.0, + "grad_norm": 1.4490170840920502, + "language_loss": 0.823627, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.90132689, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17468262, + "step": 4925, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.06496158, + "auxiliary_loss_mlp": 0.01278426, + "balance_loss_clip": 0.06294197, + "balance_loss_mlp": 0.01261415, + "epoch": 0.29616714264241695, + "flos": 17964555638400.0, + "grad_norm": 3.115838377994743, + "language_loss": 0.87332439, + "learning_rate": 3.301729463727452e-06, + "loss": 0.95107025, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17016602, + "step": 4926, + "time_per_iteration": 2.480851411819458 + }, + { + "auxiliary_loss_clip": 0.06502646, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06295682, + "balance_loss_mlp": 0.0125995, + "epoch": 0.2962272658950849, + "flos": 15018155792640.0, + "grad_norm": 2.5897634799766296, + "language_loss": 0.86097062, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.93876898, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17236328, + "step": 4927, + "time_per_iteration": 2.524277687072754 + }, + { + "auxiliary_loss_clip": 0.06496821, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06295302, + "balance_loss_mlp": 0.01256545, + "epoch": 0.2962873891477529, + "flos": 14726183840640.0, + "grad_norm": 1.628327768422068, + "language_loss": 0.80864251, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.88634396, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16772461, + "step": 4928, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06510531, + "auxiliary_loss_mlp": 0.01280378, + "balance_loss_clip": 0.0629655, + "balance_loss_mlp": 0.012609, + "epoch": 0.29634751240042084, + "flos": 26730967576320.0, + "grad_norm": 3.186979474193142, + "language_loss": 0.72557974, + "learning_rate": 3.300842211064773e-06, + "loss": 0.80348885, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19482422, + "step": 4929, + "time_per_iteration": 2.5845630168914795 + }, + { + "auxiliary_loss_clip": 0.06503193, + "auxiliary_loss_mlp": 0.01287506, + "balance_loss_clip": 0.06293295, + "balance_loss_mlp": 0.01268456, + "epoch": 0.2964076356530888, + "flos": 14575984197120.0, + "grad_norm": 2.811052251549286, + "language_loss": 0.73200721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.80991417, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19042969, + "step": 4930, + "time_per_iteration": 2.488785982131958 + }, + { + "auxiliary_loss_clip": 0.06387739, + "auxiliary_loss_mlp": 0.01269345, + "balance_loss_clip": 0.06290003, + "balance_loss_mlp": 0.0126519, + "epoch": 0.29646775890575683, + "flos": 63124387925760.0, + "grad_norm": 0.773484435694784, + "language_loss": 0.60626972, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.68284053, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.04156494, + "step": 4931, + "time_per_iteration": 3.1399567127227783 + }, + { + "auxiliary_loss_clip": 0.06390411, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06293079, + "balance_loss_mlp": 0.0126054, + "epoch": 0.2965278821584248, + "flos": 63087728964480.0, + "grad_norm": 0.7260178151779769, + "language_loss": 0.52335358, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.59990156, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.03839111, + "step": 4932, + "time_per_iteration": 3.0242393016815186 + }, + { + "auxiliary_loss_clip": 0.06496995, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06294326, + "balance_loss_mlp": 0.01260368, + "epoch": 0.29658800541109276, + "flos": 23775469562880.0, + "grad_norm": 1.6744964780290639, + "language_loss": 0.82042706, + "learning_rate": 3.299658516973972e-06, + "loss": 0.89817077, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17028809, + "step": 4933, + "time_per_iteration": 2.5955240726470947 + }, + { + "auxiliary_loss_clip": 0.06493178, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06293809, + "balance_loss_mlp": 0.01256377, + "epoch": 0.2966481286637607, + "flos": 23995465257600.0, + "grad_norm": 1.8381459517159284, + "language_loss": 0.75639498, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83405566, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.16503906, + "step": 4934, + "time_per_iteration": 2.5714681148529053 + }, + { + "auxiliary_loss_clip": 0.06508597, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06299804, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2967082519164287, + "flos": 17170846237440.0, + "grad_norm": 1.723450067314057, + "language_loss": 0.63127494, + "learning_rate": 3.299066374184594e-06, + "loss": 0.70916504, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.18713379, + "step": 4935, + "time_per_iteration": 2.513557195663452 + }, + { + "auxiliary_loss_clip": 0.06500618, + "auxiliary_loss_mlp": 0.01281806, + "balance_loss_clip": 0.06298316, + "balance_loss_mlp": 0.01263424, + "epoch": 0.29676837516909665, + "flos": 29395416032640.0, + "grad_norm": 1.6887254989691298, + "language_loss": 0.80239189, + "learning_rate": 3.2987702288932e-06, + "loss": 0.88021612, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.18383789, + "step": 4936, + "time_per_iteration": 2.6222426891326904 + }, + { + "auxiliary_loss_clip": 0.06510909, + "auxiliary_loss_mlp": 0.0128109, + "balance_loss_clip": 0.06301413, + "balance_loss_mlp": 0.01261444, + "epoch": 0.2968284984217646, + "flos": 34759839876480.0, + "grad_norm": 1.4826285887608224, + "language_loss": 0.74831104, + "learning_rate": 3.298474034352309e-06, + "loss": 0.826231, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19665527, + "step": 4937, + "time_per_iteration": 2.7231242656707764 + }, + { + "auxiliary_loss_clip": 0.06501779, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06297591, + "balance_loss_mlp": 0.01256768, + "epoch": 0.2968886216744326, + "flos": 21550635152640.0, + "grad_norm": 1.507706154697653, + "language_loss": 0.78372371, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.86148536, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17614746, + "step": 4938, + "time_per_iteration": 2.564958095550537 + }, + { + "auxiliary_loss_clip": 0.06506119, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.06296918, + "balance_loss_mlp": 0.01260643, + "epoch": 0.29694874492710055, + "flos": 12792357060480.0, + "grad_norm": 3.019574533594622, + "language_loss": 0.76788878, + "learning_rate": 3.297881497566964e-06, + "loss": 0.84574002, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18359375, + "step": 4939, + "time_per_iteration": 2.514143943786621 + }, + { + "auxiliary_loss_clip": 0.06509334, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06296703, + "balance_loss_mlp": 0.01254259, + "epoch": 0.2970088681797685, + "flos": 24576600049920.0, + "grad_norm": 1.687046897883716, + "language_loss": 0.78335512, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86116844, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17736816, + "step": 4940, + "time_per_iteration": 2.570279359817505 + }, + { + "auxiliary_loss_clip": 0.06508817, + "auxiliary_loss_mlp": 0.01275865, + "balance_loss_clip": 0.06300067, + "balance_loss_mlp": 0.01257113, + "epoch": 0.2970689914324365, + "flos": 23665870022400.0, + "grad_norm": 1.5281741947741105, + "language_loss": 0.75415564, + "learning_rate": 3.297288763918435e-06, + "loss": 0.8320024, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.1875, + "step": 4941, + "time_per_iteration": 2.549976348876953 + }, + { + "auxiliary_loss_clip": 0.06509985, + "auxiliary_loss_mlp": 0.01274098, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01254667, + "epoch": 0.29712911468510445, + "flos": 39678654107520.0, + "grad_norm": 2.245999939669129, + "language_loss": 0.74959898, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.82743979, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19445801, + "step": 4942, + "time_per_iteration": 2.7199416160583496 + }, + { + "auxiliary_loss_clip": 0.0651295, + "auxiliary_loss_mlp": 0.01282177, + "balance_loss_clip": 0.06299168, + "balance_loss_mlp": 0.01261744, + "epoch": 0.2971892379377724, + "flos": 26402420517120.0, + "grad_norm": 1.727137408051059, + "language_loss": 0.70931113, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.78726244, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2043457, + "step": 4943, + "time_per_iteration": 2.5410006046295166 + }, + { + "auxiliary_loss_clip": 0.06508674, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06296329, + "balance_loss_mlp": 0.01255599, + "epoch": 0.2972493611904404, + "flos": 17608992837120.0, + "grad_norm": 2.280832061666768, + "language_loss": 0.8012532, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.87908292, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.18725586, + "step": 4944, + "time_per_iteration": 2.5628697872161865 + }, + { + "auxiliary_loss_clip": 0.06495067, + "auxiliary_loss_mlp": 0.01272551, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01255194, + "epoch": 0.2973094844431084, + "flos": 20419070889600.0, + "grad_norm": 2.0196449856406704, + "language_loss": 0.83490258, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.91257876, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17346191, + "step": 4945, + "time_per_iteration": 2.5184381008148193 + }, + { + "auxiliary_loss_clip": 0.06494735, + "auxiliary_loss_mlp": 0.01274271, + "balance_loss_clip": 0.0629338, + "balance_loss_mlp": 0.01257081, + "epoch": 0.29736960769577636, + "flos": 17499225588480.0, + "grad_norm": 1.8481246337269472, + "language_loss": 0.67665654, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.75434661, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.171875, + "step": 4946, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.06500807, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06294695, + "balance_loss_mlp": 0.01255462, + "epoch": 0.2974297309484443, + "flos": 26111119397760.0, + "grad_norm": 1.9041348906467674, + "language_loss": 0.74493206, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82266927, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17443848, + "step": 4947, + "time_per_iteration": 2.55096435546875 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.01255396, + "epoch": 0.2974898542011123, + "flos": 25673559776640.0, + "grad_norm": 5.5840313105791894, + "language_loss": 0.73332673, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.81115007, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18896484, + "step": 4948, + "time_per_iteration": 2.604213237762451 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06292598, + "balance_loss_mlp": 0.01258687, + "epoch": 0.29754997745378026, + "flos": 18667323031680.0, + "grad_norm": 1.916403484704169, + "language_loss": 0.84057009, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91826856, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.1661377, + "step": 4949, + "time_per_iteration": 2.4725756645202637 + }, + { + "auxiliary_loss_clip": 0.06495193, + "auxiliary_loss_mlp": 0.01276752, + "balance_loss_clip": 0.06291104, + "balance_loss_mlp": 0.01258692, + "epoch": 0.2976101007064482, + "flos": 22281382609920.0, + "grad_norm": 2.0864257908602464, + "language_loss": 0.71227181, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.78999126, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18078613, + "step": 4950, + "time_per_iteration": 2.5644164085388184 + }, + { + "auxiliary_loss_clip": 0.06486266, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06290439, + "balance_loss_mlp": 0.01256308, + "epoch": 0.2976702239591162, + "flos": 21952290499200.0, + "grad_norm": 2.1576156011429597, + "language_loss": 0.83112931, + "learning_rate": 3.294322145875789e-06, + "loss": 0.9087199, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.16467285, + "step": 4951, + "time_per_iteration": 2.5149009227752686 + }, + { + "auxiliary_loss_clip": 0.06493516, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 0.06287138, + "balance_loss_mlp": 0.01257248, + "epoch": 0.29773034721178415, + "flos": 24642874229760.0, + "grad_norm": 2.538162384222029, + "language_loss": 0.73777694, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.81545866, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.17407227, + "step": 4952, + "time_per_iteration": 3.9977774620056152 + }, + { + "auxiliary_loss_clip": 0.06494328, + "auxiliary_loss_mlp": 0.01279914, + "balance_loss_clip": 0.06291338, + "balance_loss_mlp": 0.01261472, + "epoch": 0.2977904704644521, + "flos": 20563694236800.0, + "grad_norm": 1.830993802630573, + "language_loss": 0.8420608, + "learning_rate": 3.293728232937228e-06, + "loss": 0.91980314, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.18444824, + "step": 4953, + "time_per_iteration": 2.556278944015503 + }, + { + "auxiliary_loss_clip": 0.0649702, + "auxiliary_loss_mlp": 0.01271138, + "balance_loss_clip": 0.06289494, + "balance_loss_mlp": 0.01254246, + "epoch": 0.2978505937171201, + "flos": 18922426387200.0, + "grad_norm": 2.0824874332629113, + "language_loss": 0.74276727, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.82044888, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.16894531, + "step": 4954, + "time_per_iteration": 3.9108667373657227 + }, + { + "auxiliary_loss_clip": 0.06489201, + "auxiliary_loss_mlp": 0.01275174, + "balance_loss_clip": 0.06286507, + "balance_loss_mlp": 0.01259164, + "epoch": 0.29791071696978805, + "flos": 19323788244480.0, + "grad_norm": 1.865430683209025, + "language_loss": 0.75582623, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83346999, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.16003418, + "step": 4955, + "time_per_iteration": 4.034101724624634 + }, + { + "auxiliary_loss_clip": 0.06493168, + "auxiliary_loss_mlp": 0.01273359, + "balance_loss_clip": 0.06285557, + "balance_loss_mlp": 0.0125593, + "epoch": 0.297970840222456, + "flos": 18812742992640.0, + "grad_norm": 1.8893942834003292, + "language_loss": 0.72569048, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80335575, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17419434, + "step": 4956, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01272155, + "balance_loss_clip": 0.06287451, + "balance_loss_mlp": 0.01253141, + "epoch": 0.298030963475124, + "flos": 22858702041600.0, + "grad_norm": 1.7093127439145954, + "language_loss": 0.79588521, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.87359571, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19006348, + "step": 4957, + "time_per_iteration": 2.5350780487060547 + }, + { + "auxiliary_loss_clip": 0.0648672, + "auxiliary_loss_mlp": 0.01278155, + "balance_loss_clip": 0.06281397, + "balance_loss_mlp": 0.01261084, + "epoch": 0.298091086727792, + "flos": 21874402529280.0, + "grad_norm": 1.5033412482034976, + "language_loss": 0.70601791, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.78366661, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.17077637, + "step": 4958, + "time_per_iteration": 2.52998948097229 + }, + { + "auxiliary_loss_clip": 0.06484255, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06283475, + "balance_loss_mlp": 0.01256954, + "epoch": 0.29815120998045996, + "flos": 21180775230720.0, + "grad_norm": 1.4471916983062794, + "language_loss": 0.78955591, + "learning_rate": 3.291945317082743e-06, + "loss": 0.86715317, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18505859, + "step": 4959, + "time_per_iteration": 2.5247116088867188 + }, + { + "auxiliary_loss_clip": 0.06484501, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_clip": 0.06281502, + "balance_loss_mlp": 0.01258183, + "epoch": 0.29821133323312793, + "flos": 19901526946560.0, + "grad_norm": 1.8097637226237389, + "language_loss": 0.79637736, + "learning_rate": 3.291647992907147e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17578125, + "step": 4960, + "time_per_iteration": 2.544517755508423 + }, + { + "auxiliary_loss_clip": 0.06493803, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06284714, + "balance_loss_mlp": 0.01254483, + "epoch": 0.2982714564857959, + "flos": 12755781953280.0, + "grad_norm": 2.226713674353186, + "language_loss": 0.74493575, + "learning_rate": 3.291350619752129e-06, + "loss": 0.82260078, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.18225098, + "step": 4961, + "time_per_iteration": 3.9662065505981445 + }, + { + "auxiliary_loss_clip": 0.06486452, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.062804, + "balance_loss_mlp": 0.01256756, + "epoch": 0.29833157973846386, + "flos": 22278238081920.0, + "grad_norm": 2.8000667311611167, + "language_loss": 0.62968349, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70729387, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.17810059, + "step": 4962, + "time_per_iteration": 2.533984661102295 + }, + { + "auxiliary_loss_clip": 0.06485053, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06281514, + "balance_loss_mlp": 0.01259596, + "epoch": 0.2983917029911318, + "flos": 15377659735680.0, + "grad_norm": 1.6706058401186525, + "language_loss": 0.83686638, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.91448379, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17102051, + "step": 4963, + "time_per_iteration": 2.524486780166626 + }, + { + "auxiliary_loss_clip": 0.0648464, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 0.06283776, + "balance_loss_mlp": 0.01259572, + "epoch": 0.2984518262437998, + "flos": 15383068323840.0, + "grad_norm": 2.213795741630968, + "language_loss": 0.66932309, + "learning_rate": 3.290458206523322e-06, + "loss": 0.74693739, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17224121, + "step": 4964, + "time_per_iteration": 2.5100491046905518 + }, + { + "auxiliary_loss_clip": 0.06485043, + "auxiliary_loss_mlp": 0.01273472, + "balance_loss_clip": 0.06283367, + "balance_loss_mlp": 0.01257701, + "epoch": 0.29851194949646775, + "flos": 18113413616640.0, + "grad_norm": 1.8232440195867097, + "language_loss": 0.72163451, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79921961, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15771484, + "step": 4965, + "time_per_iteration": 2.5180373191833496 + }, + { + "auxiliary_loss_clip": 0.06490128, + "auxiliary_loss_mlp": 0.01278877, + "balance_loss_clip": 0.06284484, + "balance_loss_mlp": 0.01261139, + "epoch": 0.2985720727491357, + "flos": 22024811808000.0, + "grad_norm": 1.7919900337102326, + "language_loss": 0.66928089, + "learning_rate": 3.289863019680461e-06, + "loss": 0.74697095, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17724609, + "step": 4966, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06492805, + "auxiliary_loss_mlp": 0.01279859, + "balance_loss_clip": 0.06288783, + "balance_loss_mlp": 0.01262026, + "epoch": 0.2986321960018037, + "flos": 13046202604800.0, + "grad_norm": 2.9983208236286862, + "language_loss": 0.74761832, + "learning_rate": 3.289565352885785e-06, + "loss": 0.82534492, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17822266, + "step": 4967, + "time_per_iteration": 2.5119001865386963 + }, + { + "auxiliary_loss_clip": 0.06492577, + "auxiliary_loss_mlp": 0.01276602, + "balance_loss_clip": 0.06288804, + "balance_loss_mlp": 0.01260294, + "epoch": 0.29869231925447165, + "flos": 14470241944320.0, + "grad_norm": 1.9901449284839132, + "language_loss": 0.72232509, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80001682, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16308594, + "step": 4968, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.06497695, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 0.06290321, + "balance_loss_mlp": 0.01261007, + "epoch": 0.2987524425071396, + "flos": 31658376850560.0, + "grad_norm": 1.780098836704026, + "language_loss": 0.76775402, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84551913, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.17810059, + "step": 4969, + "time_per_iteration": 2.677133321762085 + }, + { + "auxiliary_loss_clip": 0.0649517, + "auxiliary_loss_mlp": 0.01279823, + "balance_loss_clip": 0.06290856, + "balance_loss_mlp": 0.0126355, + "epoch": 0.2988125657598076, + "flos": 21439735873920.0, + "grad_norm": 1.6530964666677603, + "language_loss": 0.702811, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.78056097, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.16271973, + "step": 4970, + "time_per_iteration": 2.542041301727295 + }, + { + "auxiliary_loss_clip": 0.06501894, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06289935, + "balance_loss_mlp": 0.01260336, + "epoch": 0.2988726890124756, + "flos": 18082750222080.0, + "grad_norm": 2.836679638175962, + "language_loss": 0.84790057, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.92571044, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.18737793, + "step": 4971, + "time_per_iteration": 2.5460052490234375 + }, + { + "auxiliary_loss_clip": 0.06490934, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06292243, + "balance_loss_mlp": 0.01257691, + "epoch": 0.29893281226514357, + "flos": 21760987628160.0, + "grad_norm": 1.7104631490326472, + "language_loss": 0.79530191, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16314697, + "step": 4972, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.0650093, + "auxiliary_loss_mlp": 0.01282709, + "balance_loss_clip": 0.06297094, + "balance_loss_mlp": 0.01266234, + "epoch": 0.29899293551781153, + "flos": 16842341105280.0, + "grad_norm": 1.7682293865220609, + "language_loss": 0.85643351, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93426991, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16467285, + "step": 4973, + "time_per_iteration": 2.546552896499634 + }, + { + "auxiliary_loss_clip": 0.06486042, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291717, + "balance_loss_mlp": 0.01263539, + "epoch": 0.2990530587704795, + "flos": 11734068792960.0, + "grad_norm": 1.5403026658154284, + "language_loss": 0.78163445, + "learning_rate": 3.287480316742863e-06, + "loss": 0.85930026, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17004395, + "step": 4974, + "time_per_iteration": 2.519416093826294 + }, + { + "auxiliary_loss_clip": 0.06492939, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06288281, + "balance_loss_mlp": 0.01257001, + "epoch": 0.29911318202314746, + "flos": 28047713362560.0, + "grad_norm": 1.767842246111843, + "language_loss": 0.73036933, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80804002, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17126465, + "step": 4975, + "time_per_iteration": 2.6099252700805664 + }, + { + "auxiliary_loss_clip": 0.0649198, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 0.06288506, + "balance_loss_mlp": 0.0126163, + "epoch": 0.2991733052758154, + "flos": 18739425070080.0, + "grad_norm": 3.7568061887968374, + "language_loss": 0.76564699, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84335011, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16711426, + "step": 4976, + "time_per_iteration": 2.4865057468414307 + }, + { + "auxiliary_loss_clip": 0.0649081, + "auxiliary_loss_mlp": 0.01274025, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257574, + "epoch": 0.2992334285284834, + "flos": 15564476413440.0, + "grad_norm": 2.0027584051633256, + "language_loss": 0.86547983, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.94312823, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 4977, + "time_per_iteration": 2.5564377307891846 + }, + { + "auxiliary_loss_clip": 0.06492308, + "auxiliary_loss_mlp": 0.01273791, + "balance_loss_clip": 0.06289831, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29929355178115136, + "flos": 21803809864320.0, + "grad_norm": 1.498415139231663, + "language_loss": 0.69035208, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.76801312, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.16943359, + "step": 4978, + "time_per_iteration": 2.519927978515625 + }, + { + "auxiliary_loss_clip": 0.06498158, + "auxiliary_loss_mlp": 0.01273756, + "balance_loss_clip": 0.06295491, + "balance_loss_mlp": 0.0125634, + "epoch": 0.2993536750338193, + "flos": 21184884080640.0, + "grad_norm": 2.2981139003330924, + "language_loss": 0.76821494, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.84593409, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17407227, + "step": 4979, + "time_per_iteration": 2.5783658027648926 + }, + { + "auxiliary_loss_clip": 0.06495501, + "auxiliary_loss_mlp": 0.01275001, + "balance_loss_clip": 0.06288472, + "balance_loss_mlp": 0.0125762, + "epoch": 0.2994137982864873, + "flos": 32129954029440.0, + "grad_norm": 1.9038495469030372, + "language_loss": 0.69286489, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77056986, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17382812, + "step": 4980, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.06490306, + "auxiliary_loss_mlp": 0.01274236, + "balance_loss_clip": 0.06288646, + "balance_loss_mlp": 0.01257177, + "epoch": 0.29947392153915525, + "flos": 21111733866240.0, + "grad_norm": 1.7308746684442236, + "language_loss": 0.74001658, + "learning_rate": 3.285392888352555e-06, + "loss": 0.817662, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17053223, + "step": 4981, + "time_per_iteration": 2.580580711364746 + }, + { + "auxiliary_loss_clip": 0.06490904, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.0125635, + "epoch": 0.2995340447918232, + "flos": 21548916144000.0, + "grad_norm": 1.9422940804684126, + "language_loss": 0.86877131, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.94642013, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17626953, + "step": 4982, + "time_per_iteration": 2.4962990283966064 + }, + { + "auxiliary_loss_clip": 0.06497963, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06287588, + "balance_loss_mlp": 0.01257241, + "epoch": 0.2995941680444912, + "flos": 16730393650560.0, + "grad_norm": 2.5640920256819886, + "language_loss": 0.87797368, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95569938, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17382812, + "step": 4983, + "time_per_iteration": 2.5295448303222656 + }, + { + "auxiliary_loss_clip": 0.0649021, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 0.06287163, + "balance_loss_mlp": 0.012569, + "epoch": 0.2996542912971592, + "flos": 20929864579200.0, + "grad_norm": 2.1931631477553943, + "language_loss": 0.78985476, + "learning_rate": 3.284497544825668e-06, + "loss": 0.86749053, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16467285, + "step": 4984, + "time_per_iteration": 2.510861873626709 + }, + { + "auxiliary_loss_clip": 0.06490169, + "auxiliary_loss_mlp": 0.01276988, + "balance_loss_clip": 0.06284384, + "balance_loss_mlp": 0.01259702, + "epoch": 0.29971441454982717, + "flos": 25086429417600.0, + "grad_norm": 1.6549542244227224, + "language_loss": 0.78558743, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86325896, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17285156, + "step": 4985, + "time_per_iteration": 2.6011219024658203 + }, + { + "auxiliary_loss_clip": 0.06501257, + "auxiliary_loss_mlp": 0.01278562, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.0125968, + "epoch": 0.29977453780249513, + "flos": 52567445617920.0, + "grad_norm": 2.1128232330624757, + "language_loss": 0.71929544, + "learning_rate": 3.283900405580837e-06, + "loss": 0.79709363, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1887207, + "step": 4986, + "time_per_iteration": 2.8261890411376953 + }, + { + "auxiliary_loss_clip": 0.06496918, + "auxiliary_loss_mlp": 0.01277715, + "balance_loss_clip": 0.06288348, + "balance_loss_mlp": 0.0125981, + "epoch": 0.2998346610551631, + "flos": 22243759326720.0, + "grad_norm": 2.0495005677193703, + "language_loss": 0.73353851, + "learning_rate": 3.283601762924312e-06, + "loss": 0.81128478, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17907715, + "step": 4987, + "time_per_iteration": 2.5969009399414062 + }, + { + "auxiliary_loss_clip": 0.06487568, + "auxiliary_loss_mlp": 0.01277048, + "balance_loss_clip": 0.06283796, + "balance_loss_mlp": 0.01260561, + "epoch": 0.29989478430783106, + "flos": 16878832358400.0, + "grad_norm": 1.677350703029162, + "language_loss": 0.80982405, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88747025, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16479492, + "step": 4988, + "time_per_iteration": 2.4802756309509277 + }, + { + "auxiliary_loss_clip": 0.06489251, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06285515, + "balance_loss_mlp": 0.0125759, + "epoch": 0.29995490756049903, + "flos": 23775637271040.0, + "grad_norm": 1.830625198484136, + "language_loss": 0.7097913, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7874254, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16577148, + "step": 4989, + "time_per_iteration": 2.5968902111053467 + }, + { + "auxiliary_loss_clip": 0.06498987, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01264948, + "epoch": 0.300015030813167, + "flos": 14470577360640.0, + "grad_norm": 2.8004651200920576, + "language_loss": 0.85787904, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93570256, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18408203, + "step": 4990, + "time_per_iteration": 2.4837355613708496 + }, + { + "auxiliary_loss_clip": 0.06499861, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06287368, + "balance_loss_mlp": 0.01260204, + "epoch": 0.30007515406583496, + "flos": 25199005777920.0, + "grad_norm": 1.6608247288012334, + "language_loss": 0.67339301, + "learning_rate": 3.28240670566841e-06, + "loss": 0.75117278, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17919922, + "step": 4991, + "time_per_iteration": 4.060553312301636 + }, + { + "auxiliary_loss_clip": 0.0649571, + "auxiliary_loss_mlp": 0.01277369, + "balance_loss_clip": 0.06284688, + "balance_loss_mlp": 0.01259022, + "epoch": 0.3001352773185029, + "flos": 19397315802240.0, + "grad_norm": 1.7545259775845383, + "language_loss": 0.79479051, + "learning_rate": 3.28210781975363e-06, + "loss": 0.87252128, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18347168, + "step": 4992, + "time_per_iteration": 2.5394246578216553 + }, + { + "auxiliary_loss_clip": 0.06496455, + "auxiliary_loss_mlp": 0.01272727, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01255061, + "epoch": 0.3001954005711709, + "flos": 21550341663360.0, + "grad_norm": 1.8174225064451806, + "language_loss": 0.83191693, + "learning_rate": 3.281808885221193e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17675781, + "step": 4993, + "time_per_iteration": 2.536900520324707 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.0127659, + "balance_loss_clip": 0.06290129, + "balance_loss_mlp": 0.01257051, + "epoch": 0.30025552382383885, + "flos": 17390087245440.0, + "grad_norm": 2.3964724385856955, + "language_loss": 0.8713994, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.94919133, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.1953125, + "step": 4994, + "time_per_iteration": 5.451568603515625 + }, + { + "auxiliary_loss_clip": 0.06500117, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06293428, + "balance_loss_mlp": 0.01255696, + "epoch": 0.3003156470765068, + "flos": 29541003701760.0, + "grad_norm": 1.492375768993242, + "language_loss": 0.81277597, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.89050424, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17016602, + "step": 4995, + "time_per_iteration": 2.6498701572418213 + }, + { + "auxiliary_loss_clip": 0.06495272, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01257818, + "epoch": 0.3003757703291748, + "flos": 43655278302720.0, + "grad_norm": 1.561088997277918, + "language_loss": 0.67591625, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.75363255, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.18530273, + "step": 4996, + "time_per_iteration": 2.6940386295318604 + }, + { + "auxiliary_loss_clip": 0.06490915, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 0.06287466, + "balance_loss_mlp": 0.0125985, + "epoch": 0.30043589358184275, + "flos": 22534934664960.0, + "grad_norm": 1.8202769971321224, + "language_loss": 0.76585484, + "learning_rate": 3.280612661141615e-06, + "loss": 0.84354323, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18054199, + "step": 4997, + "time_per_iteration": 2.551025629043579 + }, + { + "auxiliary_loss_clip": 0.06488951, + "auxiliary_loss_mlp": 0.01282226, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01264785, + "epoch": 0.30049601683451077, + "flos": 21002176252800.0, + "grad_norm": 1.7136041248753544, + "language_loss": 0.78929758, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.86700928, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17443848, + "step": 4998, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.06495959, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06296599, + "balance_loss_mlp": 0.0126104, + "epoch": 0.30055614008717874, + "flos": 23922985875840.0, + "grad_norm": 1.6408959445510187, + "language_loss": 0.73985869, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81759465, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.16589355, + "step": 4999, + "time_per_iteration": 2.565272331237793 + }, + { + "auxiliary_loss_clip": 0.06497648, + "auxiliary_loss_mlp": 0.01276599, + "balance_loss_clip": 0.06290608, + "balance_loss_mlp": 0.01258837, + "epoch": 0.3006162633398467, + "flos": 19175475317760.0, + "grad_norm": 1.6585129963537202, + "language_loss": 0.76246512, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84020758, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.1776123, + "step": 5000, + "time_per_iteration": 3.978001117706299 + }, + { + "auxiliary_loss_clip": 0.06488875, + "auxiliary_loss_mlp": 0.01280464, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.0126244, + "epoch": 0.30067638659251467, + "flos": 14683697020800.0, + "grad_norm": 1.838860389970219, + "language_loss": 0.81972182, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.89741528, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.18041992, + "step": 5001, + "time_per_iteration": 2.4995031356811523 + }, + { + "auxiliary_loss_clip": 0.06495227, + "auxiliary_loss_mlp": 0.01279132, + "balance_loss_clip": 0.06291329, + "balance_loss_mlp": 0.01261322, + "epoch": 0.30073650984518263, + "flos": 23374778538240.0, + "grad_norm": 1.6002838962292127, + "language_loss": 0.81160742, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.88935101, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17797852, + "step": 5002, + "time_per_iteration": 2.549882650375366 + }, + { + "auxiliary_loss_clip": 0.06502556, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01255728, + "epoch": 0.3007966330978506, + "flos": 22973332826880.0, + "grad_norm": 1.7018817575326768, + "language_loss": 0.71524274, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.79300046, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17504883, + "step": 5003, + "time_per_iteration": 2.537760019302368 + }, + { + "auxiliary_loss_clip": 0.06502316, + "auxiliary_loss_mlp": 0.01275597, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01257441, + "epoch": 0.30085675635051856, + "flos": 27825830951040.0, + "grad_norm": 1.9954765529899763, + "language_loss": 0.706792, + "learning_rate": 3.27851739984233e-06, + "loss": 0.78457117, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18151855, + "step": 5004, + "time_per_iteration": 2.6357674598693848 + }, + { + "auxiliary_loss_clip": 0.06504735, + "auxiliary_loss_mlp": 0.01282861, + "balance_loss_clip": 0.06296123, + "balance_loss_mlp": 0.01263513, + "epoch": 0.3009168796031865, + "flos": 10886216855040.0, + "grad_norm": 2.7451882694975662, + "language_loss": 0.81914413, + "learning_rate": 3.278217882782715e-06, + "loss": 0.89702016, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19335938, + "step": 5005, + "time_per_iteration": 2.4386463165283203 + }, + { + "auxiliary_loss_clip": 0.06497307, + "auxiliary_loss_mlp": 0.01278667, + "balance_loss_clip": 0.06293161, + "balance_loss_mlp": 0.01261179, + "epoch": 0.3009770028558545, + "flos": 23812170451200.0, + "grad_norm": 3.689468326241579, + "language_loss": 0.74513727, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.82289702, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17492676, + "step": 5006, + "time_per_iteration": 2.6309902667999268 + }, + { + "auxiliary_loss_clip": 0.06490835, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06288077, + "balance_loss_mlp": 0.01255247, + "epoch": 0.30103712610852246, + "flos": 26475319169280.0, + "grad_norm": 1.9837745378518294, + "language_loss": 0.71514297, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.79279143, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.18762207, + "step": 5007, + "time_per_iteration": 2.5425140857696533 + }, + { + "auxiliary_loss_clip": 0.06499007, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01258961, + "epoch": 0.3010972493611904, + "flos": 22863020526720.0, + "grad_norm": 2.135948160193648, + "language_loss": 0.76715112, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.84491682, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18579102, + "step": 5008, + "time_per_iteration": 2.560136556625366 + }, + { + "auxiliary_loss_clip": 0.06498778, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.06291865, + "balance_loss_mlp": 0.01258959, + "epoch": 0.3011573726138584, + "flos": 24059307669120.0, + "grad_norm": 1.8647165617813573, + "language_loss": 0.85181898, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.92957842, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18200684, + "step": 5009, + "time_per_iteration": 2.5235841274261475 + }, + { + "auxiliary_loss_clip": 0.06506295, + "auxiliary_loss_mlp": 0.01281474, + "balance_loss_clip": 0.06291408, + "balance_loss_mlp": 0.0126041, + "epoch": 0.30121749586652635, + "flos": 20264762396160.0, + "grad_norm": 1.8315766872525614, + "language_loss": 0.84202898, + "learning_rate": 3.276719570659604e-06, + "loss": 0.91990662, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21069336, + "step": 5010, + "time_per_iteration": 2.5768747329711914 + }, + { + "auxiliary_loss_clip": 0.06499103, + "auxiliary_loss_mlp": 0.01276454, + "balance_loss_clip": 0.06292678, + "balance_loss_mlp": 0.01258728, + "epoch": 0.3012776191191944, + "flos": 26950334365440.0, + "grad_norm": 2.3479091749479593, + "language_loss": 0.85299456, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.93075019, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17724609, + "step": 5011, + "time_per_iteration": 2.5496773719787598 + }, + { + "auxiliary_loss_clip": 0.06498772, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06287067, + "balance_loss_mlp": 0.01258472, + "epoch": 0.30133774237186234, + "flos": 20418525838080.0, + "grad_norm": 2.2969937551574615, + "language_loss": 0.73043567, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18017578, + "step": 5012, + "time_per_iteration": 2.5352632999420166 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.01275987, + "balance_loss_clip": 0.06294451, + "balance_loss_mlp": 0.01257581, + "epoch": 0.3013978656245303, + "flos": 19798635732480.0, + "grad_norm": 2.0714365992737247, + "language_loss": 0.88282806, + "learning_rate": 3.275820002334819e-06, + "loss": 0.96061397, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.1842041, + "step": 5013, + "time_per_iteration": 2.5217273235321045 + }, + { + "auxiliary_loss_clip": 0.06510235, + "auxiliary_loss_mlp": 0.01281959, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01261956, + "epoch": 0.30145798887719827, + "flos": 16254623767680.0, + "grad_norm": 2.0397198762739253, + "language_loss": 0.8413021, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.91922402, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 5014, + "time_per_iteration": 2.543929100036621 + }, + { + "auxiliary_loss_clip": 0.06496109, + "auxiliary_loss_mlp": 0.01278136, + "balance_loss_clip": 0.06295025, + "balance_loss_mlp": 0.01260934, + "epoch": 0.30151811212986623, + "flos": 24578654474880.0, + "grad_norm": 1.6793816963153507, + "language_loss": 0.68929201, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76703441, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17199707, + "step": 5015, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.06498226, + "auxiliary_loss_mlp": 0.01282599, + "balance_loss_clip": 0.06293575, + "balance_loss_mlp": 0.01262989, + "epoch": 0.3015782353825342, + "flos": 21878595233280.0, + "grad_norm": 2.19954780338382, + "language_loss": 0.75070626, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.82851446, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.19604492, + "step": 5016, + "time_per_iteration": 2.6430094242095947 + }, + { + "auxiliary_loss_clip": 0.06498955, + "auxiliary_loss_mlp": 0.01278069, + "balance_loss_clip": 0.06290609, + "balance_loss_mlp": 0.01260009, + "epoch": 0.30163835863520216, + "flos": 28777244935680.0, + "grad_norm": 1.487936670829871, + "language_loss": 0.657938, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.73570824, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18041992, + "step": 5017, + "time_per_iteration": 2.62882661819458 + }, + { + "auxiliary_loss_clip": 0.06504996, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06297189, + "balance_loss_mlp": 0.01258019, + "epoch": 0.30169848188787013, + "flos": 22972829702400.0, + "grad_norm": 1.7163502989136974, + "language_loss": 0.68538272, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76318979, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17675781, + "step": 5018, + "time_per_iteration": 2.5743629932403564 + }, + { + "auxiliary_loss_clip": 0.06490742, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01263102, + "epoch": 0.3017586051405381, + "flos": 21841726636800.0, + "grad_norm": 1.8632302123292983, + "language_loss": 0.79424834, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.87196445, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 5019, + "time_per_iteration": 2.490190029144287 + }, + { + "auxiliary_loss_clip": 0.06497257, + "auxiliary_loss_mlp": 0.01272585, + "balance_loss_clip": 0.06291286, + "balance_loss_mlp": 0.01255932, + "epoch": 0.30181872839320606, + "flos": 22166374481280.0, + "grad_norm": 1.9171916392208899, + "language_loss": 0.70839167, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78609014, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.16650391, + "step": 5020, + "time_per_iteration": 2.5635480880737305 + }, + { + "auxiliary_loss_clip": 0.06504546, + "auxiliary_loss_mlp": 0.01281398, + "balance_loss_clip": 0.06293903, + "balance_loss_mlp": 0.01263063, + "epoch": 0.301878851645874, + "flos": 18120080016000.0, + "grad_norm": 1.792157390717078, + "language_loss": 0.78276378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86062324, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18347168, + "step": 5021, + "time_per_iteration": 2.4956390857696533 + }, + { + "auxiliary_loss_clip": 0.06497782, + "auxiliary_loss_mlp": 0.01276425, + "balance_loss_clip": 0.06289995, + "balance_loss_mlp": 0.01258758, + "epoch": 0.301938974898542, + "flos": 17607860807040.0, + "grad_norm": 2.1405998927344774, + "language_loss": 0.77019519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.84793723, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17663574, + "step": 5022, + "time_per_iteration": 2.5157957077026367 + }, + { + "auxiliary_loss_clip": 0.06495966, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.0628897, + "balance_loss_mlp": 0.01258766, + "epoch": 0.30199909815120995, + "flos": 11185861455360.0, + "grad_norm": 1.768248661027107, + "language_loss": 0.70051187, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.77823544, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17626953, + "step": 5023, + "time_per_iteration": 2.466554641723633 + }, + { + "auxiliary_loss_clip": 0.06500031, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.0628899, + "balance_loss_mlp": 0.0125586, + "epoch": 0.302059221403878, + "flos": 21914247945600.0, + "grad_norm": 1.9915350532209553, + "language_loss": 0.72159773, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7993241, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.16748047, + "step": 5024, + "time_per_iteration": 2.550529956817627 + }, + { + "auxiliary_loss_clip": 0.06490807, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 0.06288145, + "balance_loss_mlp": 0.01259068, + "epoch": 0.30211934465654594, + "flos": 26403678328320.0, + "grad_norm": 1.894121412902458, + "language_loss": 0.74805325, + "learning_rate": 3.272217377978061e-06, + "loss": 0.8257302, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17822266, + "step": 5025, + "time_per_iteration": 2.566805124282837 + }, + { + "auxiliary_loss_clip": 0.06489006, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06288895, + "balance_loss_mlp": 0.01260649, + "epoch": 0.3021794679092139, + "flos": 23406573962880.0, + "grad_norm": 1.5421556017832176, + "language_loss": 0.67708206, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.75474703, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16845703, + "step": 5026, + "time_per_iteration": 2.5388495922088623 + }, + { + "auxiliary_loss_clip": 0.06496219, + "auxiliary_loss_mlp": 0.01276315, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.0125829, + "epoch": 0.30223959116188187, + "flos": 20266271769600.0, + "grad_norm": 1.7822947119811494, + "language_loss": 0.851165, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.92889023, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.18017578, + "step": 5027, + "time_per_iteration": 2.4944281578063965 + }, + { + "auxiliary_loss_clip": 0.06486274, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06286463, + "balance_loss_mlp": 0.012555, + "epoch": 0.30229971441454984, + "flos": 26695105228800.0, + "grad_norm": 1.4959542036115716, + "language_loss": 0.79103637, + "learning_rate": 3.271315635661351e-06, + "loss": 0.86862409, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17004395, + "step": 5028, + "time_per_iteration": 2.559110403060913 + }, + { + "auxiliary_loss_clip": 0.06488896, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06286621, + "balance_loss_mlp": 0.01255114, + "epoch": 0.3023598376672178, + "flos": 34353111358080.0, + "grad_norm": 2.034560710438702, + "language_loss": 0.777421, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.8550368, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17553711, + "step": 5029, + "time_per_iteration": 2.616746187210083 + }, + { + "auxiliary_loss_clip": 0.06491397, + "auxiliary_loss_mlp": 0.012793, + "balance_loss_clip": 0.06285096, + "balance_loss_mlp": 0.0126112, + "epoch": 0.30241996091988577, + "flos": 23118794714880.0, + "grad_norm": 1.8709670039612754, + "language_loss": 0.83096594, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.90867293, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.1817627, + "step": 5030, + "time_per_iteration": 2.56754994392395 + }, + { + "auxiliary_loss_clip": 0.06496526, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06289787, + "balance_loss_mlp": 0.01252817, + "epoch": 0.30248008417255373, + "flos": 19395932209920.0, + "grad_norm": 1.6009792224367259, + "language_loss": 0.70107001, + "learning_rate": 3.270413459468905e-06, + "loss": 0.77873379, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17028809, + "step": 5031, + "time_per_iteration": 3.9598355293273926 + }, + { + "auxiliary_loss_clip": 0.06489968, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06286315, + "balance_loss_mlp": 0.01254843, + "epoch": 0.3025402074252217, + "flos": 23776601592960.0, + "grad_norm": 1.6577801639127376, + "language_loss": 0.83241403, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.91004276, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.18066406, + "step": 5032, + "time_per_iteration": 2.5589263439178467 + }, + { + "auxiliary_loss_clip": 0.064991, + "auxiliary_loss_mlp": 0.01275787, + "balance_loss_clip": 0.06290475, + "balance_loss_mlp": 0.01257846, + "epoch": 0.30260033067788966, + "flos": 26001184440960.0, + "grad_norm": 2.284722647008976, + "language_loss": 0.73521686, + "learning_rate": 3.269811767783906e-06, + "loss": 0.81296575, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17956543, + "step": 5033, + "time_per_iteration": 4.029735088348389 + }, + { + "auxiliary_loss_clip": 0.06487451, + "auxiliary_loss_mlp": 0.01273985, + "balance_loss_clip": 0.06287168, + "balance_loss_mlp": 0.01257201, + "epoch": 0.3026604539305576, + "flos": 25381629751680.0, + "grad_norm": 1.972268943863271, + "language_loss": 0.74434245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16784668, + "step": 5034, + "time_per_iteration": 4.0717785358428955 + }, + { + "auxiliary_loss_clip": 0.06489293, + "auxiliary_loss_mlp": 0.01272883, + "balance_loss_clip": 0.06285236, + "balance_loss_mlp": 0.01253785, + "epoch": 0.3027205771832256, + "flos": 25819944059520.0, + "grad_norm": 2.1341895685230434, + "language_loss": 0.72872615, + "learning_rate": 3.269209883493352e-06, + "loss": 0.80634785, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.19104004, + "step": 5035, + "time_per_iteration": 2.552910804748535 + }, + { + "auxiliary_loss_clip": 0.06487517, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06287874, + "balance_loss_mlp": 0.01255545, + "epoch": 0.30278070043589356, + "flos": 27351905857920.0, + "grad_norm": 2.3429469920607384, + "language_loss": 0.87837774, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95597875, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17041016, + "step": 5036, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.06487815, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06288295, + "balance_loss_mlp": 0.0125574, + "epoch": 0.3028408236885616, + "flos": 24792444967680.0, + "grad_norm": 1.4626052772561229, + "language_loss": 0.77969307, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85730845, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.17980957, + "step": 5037, + "time_per_iteration": 2.556859016418457 + }, + { + "auxiliary_loss_clip": 0.06492691, + "auxiliary_loss_mlp": 0.01276846, + "balance_loss_clip": 0.06287664, + "balance_loss_mlp": 0.01258381, + "epoch": 0.30290094694122954, + "flos": 12937399678080.0, + "grad_norm": 2.1717737457337236, + "language_loss": 0.78095227, + "learning_rate": 3.268306696121816e-06, + "loss": 0.85864764, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18469238, + "step": 5038, + "time_per_iteration": 2.534095525741577 + }, + { + "auxiliary_loss_clip": 0.06487858, + "auxiliary_loss_mlp": 0.01274285, + "balance_loss_clip": 0.06289861, + "balance_loss_mlp": 0.01257631, + "epoch": 0.3029610701938975, + "flos": 25922709492480.0, + "grad_norm": 1.6864855803341283, + "language_loss": 0.74257523, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82019669, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16650391, + "step": 5039, + "time_per_iteration": 3.9620656967163086 + }, + { + "auxiliary_loss_clip": 0.06482661, + "auxiliary_loss_mlp": 0.01275025, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.0125923, + "epoch": 0.3030211934465655, + "flos": 21987440087040.0, + "grad_norm": 1.8054159725903498, + "language_loss": 0.80141723, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87899411, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.15795898, + "step": 5040, + "time_per_iteration": 2.5038623809814453 + }, + { + "auxiliary_loss_clip": 0.06493679, + "auxiliary_loss_mlp": 0.01273287, + "balance_loss_clip": 0.06295684, + "balance_loss_mlp": 0.01256705, + "epoch": 0.30308131669923344, + "flos": 20997606205440.0, + "grad_norm": 1.5545793881611087, + "language_loss": 0.82498085, + "learning_rate": 3.267403075901438e-06, + "loss": 0.90265048, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 5041, + "time_per_iteration": 2.5619800090789795 + }, + { + "auxiliary_loss_clip": 0.06388037, + "auxiliary_loss_mlp": 0.01273694, + "balance_loss_clip": 0.062912, + "balance_loss_mlp": 0.012703, + "epoch": 0.3031414399519014, + "flos": 60568281198720.0, + "grad_norm": 0.7609258494567089, + "language_loss": 0.59132683, + "learning_rate": 3.267101773025978e-06, + "loss": 0.66794419, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.0340271, + "step": 5042, + "time_per_iteration": 3.2389016151428223 + }, + { + "auxiliary_loss_clip": 0.06493344, + "auxiliary_loss_mlp": 0.01274817, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.0125808, + "epoch": 0.30320156320456937, + "flos": 21914038310400.0, + "grad_norm": 1.8743682054895758, + "language_loss": 0.71638298, + "learning_rate": 3.266800422101892e-06, + "loss": 0.79406464, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.1673584, + "step": 5043, + "time_per_iteration": 2.5684726238250732 + }, + { + "auxiliary_loss_clip": 0.06492111, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06289819, + "balance_loss_mlp": 0.01258121, + "epoch": 0.30326168645723733, + "flos": 21659186517120.0, + "grad_norm": 1.7052050019212173, + "language_loss": 0.70087332, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7785424, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16699219, + "step": 5044, + "time_per_iteration": 2.517548084259033 + }, + { + "auxiliary_loss_clip": 0.06487354, + "auxiliary_loss_mlp": 0.01273722, + "balance_loss_clip": 0.06289065, + "balance_loss_mlp": 0.01257641, + "epoch": 0.3033218097099053, + "flos": 21877672838400.0, + "grad_norm": 1.4072868323237386, + "language_loss": 0.77798641, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.85559714, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.16088867, + "step": 5045, + "time_per_iteration": 2.5525407791137695 + }, + { + "auxiliary_loss_clip": 0.06487602, + "auxiliary_loss_mlp": 0.01277286, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01260096, + "epoch": 0.30338193296257326, + "flos": 27097137918720.0, + "grad_norm": 1.6677605508610576, + "language_loss": 0.72664404, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80429292, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.5747427940368652 + }, + { + "auxiliary_loss_clip": 0.06495762, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125897, + "epoch": 0.30344205621524123, + "flos": 19540052432640.0, + "grad_norm": 1.932306391246397, + "language_loss": 0.81483316, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89255798, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.1776123, + "step": 5047, + "time_per_iteration": 2.5763392448425293 + }, + { + "auxiliary_loss_clip": 0.0648682, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06287121, + "balance_loss_mlp": 0.01255568, + "epoch": 0.3035021794679092, + "flos": 23917116090240.0, + "grad_norm": 1.635585540948891, + "language_loss": 0.72204739, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7996307, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.15942383, + "step": 5048, + "time_per_iteration": 2.5134665966033936 + }, + { + "auxiliary_loss_clip": 0.06488065, + "auxiliary_loss_mlp": 0.01279017, + "balance_loss_clip": 0.0628863, + "balance_loss_mlp": 0.0126296, + "epoch": 0.30356230272057716, + "flos": 16149133077120.0, + "grad_norm": 2.0386560470204804, + "language_loss": 0.75622666, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83389747, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16052246, + "step": 5049, + "time_per_iteration": 2.516463279724121 + }, + { + "auxiliary_loss_clip": 0.06494351, + "auxiliary_loss_mlp": 0.01274287, + "balance_loss_clip": 0.06289351, + "balance_loss_mlp": 0.01257597, + "epoch": 0.3036224259732452, + "flos": 28922539115520.0, + "grad_norm": 1.525083803020086, + "language_loss": 0.82698894, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.90467536, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.16687012, + "step": 5050, + "time_per_iteration": 2.558199405670166 + }, + { + "auxiliary_loss_clip": 0.0649763, + "auxiliary_loss_mlp": 0.01273759, + "balance_loss_clip": 0.06295735, + "balance_loss_mlp": 0.01256617, + "epoch": 0.30368254922591315, + "flos": 21111943501440.0, + "grad_norm": 2.311701267026144, + "language_loss": 0.74346399, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82117784, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17150879, + "step": 5051, + "time_per_iteration": 2.530457019805908 + }, + { + "auxiliary_loss_clip": 0.06494159, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06292571, + "balance_loss_mlp": 0.01260339, + "epoch": 0.3037426724785811, + "flos": 23008859758080.0, + "grad_norm": 1.7255753861859113, + "language_loss": 0.76444, + "learning_rate": 3.264086103483033e-06, + "loss": 0.84215784, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17297363, + "step": 5052, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.06501957, + "auxiliary_loss_mlp": 0.01280226, + "balance_loss_clip": 0.06295583, + "balance_loss_mlp": 0.01262332, + "epoch": 0.3038027957312491, + "flos": 15638129752320.0, + "grad_norm": 1.9820354931454651, + "language_loss": 0.83096367, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.90878546, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17871094, + "step": 5053, + "time_per_iteration": 2.5384886264801025 + }, + { + "auxiliary_loss_clip": 0.06489826, + "auxiliary_loss_mlp": 0.0127909, + "balance_loss_clip": 0.06288566, + "balance_loss_mlp": 0.01262174, + "epoch": 0.30386291898391704, + "flos": 12718955283840.0, + "grad_norm": 1.6755872357210637, + "language_loss": 0.7197504, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.79743958, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16906738, + "step": 5054, + "time_per_iteration": 2.4787559509277344 + }, + { + "auxiliary_loss_clip": 0.06500221, + "auxiliary_loss_mlp": 0.01282757, + "balance_loss_clip": 0.06298432, + "balance_loss_mlp": 0.01265805, + "epoch": 0.303923042236585, + "flos": 26366642023680.0, + "grad_norm": 1.8480883425842163, + "language_loss": 0.70137346, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.77920318, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16943359, + "step": 5055, + "time_per_iteration": 2.5929152965545654 + }, + { + "auxiliary_loss_clip": 0.06494389, + "auxiliary_loss_mlp": 0.01279452, + "balance_loss_clip": 0.0629337, + "balance_loss_mlp": 0.01262488, + "epoch": 0.30398316548925297, + "flos": 19725359736960.0, + "grad_norm": 2.1405790356583516, + "language_loss": 0.68347496, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.7612133, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16955566, + "step": 5056, + "time_per_iteration": 2.531677007675171 + }, + { + "auxiliary_loss_clip": 0.06490116, + "auxiliary_loss_mlp": 0.01281162, + "balance_loss_clip": 0.06292629, + "balance_loss_mlp": 0.01264377, + "epoch": 0.30404328874192094, + "flos": 24246124346880.0, + "grad_norm": 1.6503197514246037, + "language_loss": 0.83083463, + "learning_rate": 3.262576470461507e-06, + "loss": 0.9085474, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16796875, + "step": 5057, + "time_per_iteration": 2.5836069583892822 + }, + { + "auxiliary_loss_clip": 0.06484263, + "auxiliary_loss_mlp": 0.01272995, + "balance_loss_clip": 0.06286788, + "balance_loss_mlp": 0.01256603, + "epoch": 0.3041034119945889, + "flos": 24505881603840.0, + "grad_norm": 1.6860023663091837, + "language_loss": 0.89784855, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.97542113, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 5058, + "time_per_iteration": 2.589932918548584 + }, + { + "auxiliary_loss_clip": 0.06495658, + "auxiliary_loss_mlp": 0.01274369, + "balance_loss_clip": 0.06294262, + "balance_loss_mlp": 0.01256524, + "epoch": 0.30416353524725687, + "flos": 28295689121280.0, + "grad_norm": 2.5117349508823392, + "language_loss": 0.71471179, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.79241204, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17858887, + "step": 5059, + "time_per_iteration": 2.5827505588531494 + }, + { + "auxiliary_loss_clip": 0.06486548, + "auxiliary_loss_mlp": 0.01273567, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01257367, + "epoch": 0.30422365849992483, + "flos": 23667295541760.0, + "grad_norm": 1.868956784724377, + "language_loss": 0.73344606, + "learning_rate": 3.26167011603268e-06, + "loss": 0.8110472, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16174316, + "step": 5060, + "time_per_iteration": 2.624408006668091 + }, + { + "auxiliary_loss_clip": 0.06490071, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06289257, + "balance_loss_mlp": 0.01257451, + "epoch": 0.3042837817525928, + "flos": 23004750908160.0, + "grad_norm": 1.75217091558972, + "language_loss": 0.7751621, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.85279948, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.16210938, + "step": 5061, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.06496524, + "auxiliary_loss_mlp": 0.01274148, + "balance_loss_clip": 0.06292392, + "balance_loss_mlp": 0.01256362, + "epoch": 0.30434390500526076, + "flos": 22087438335360.0, + "grad_norm": 2.647933932315435, + "language_loss": 0.8275395, + "learning_rate": 3.261065640514415e-06, + "loss": 0.90524626, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17773438, + "step": 5062, + "time_per_iteration": 2.5313212871551514 + }, + { + "auxiliary_loss_clip": 0.06485732, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06286077, + "balance_loss_mlp": 0.01253689, + "epoch": 0.3044040282579287, + "flos": 25490516532480.0, + "grad_norm": 1.803893214603413, + "language_loss": 0.74348861, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.82104707, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16394043, + "step": 5063, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.0649004, + "auxiliary_loss_mlp": 0.01274813, + "balance_loss_clip": 0.06291289, + "balance_loss_mlp": 0.01256753, + "epoch": 0.30446415151059675, + "flos": 21952080864000.0, + "grad_norm": 1.6090072895521823, + "language_loss": 0.84824491, + "learning_rate": 3.26046097371721e-06, + "loss": 0.92589343, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.18066406, + "step": 5064, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.06490266, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 0.06290541, + "balance_loss_mlp": 0.0125644, + "epoch": 0.3045242747632647, + "flos": 16440979248000.0, + "grad_norm": 2.1763674367183965, + "language_loss": 0.76565492, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17578125, + "step": 5065, + "time_per_iteration": 2.50644588470459 + }, + { + "auxiliary_loss_clip": 0.06490786, + "auxiliary_loss_mlp": 0.01279051, + "balance_loss_clip": 0.06288782, + "balance_loss_mlp": 0.01260586, + "epoch": 0.3045843980159327, + "flos": 31548399966720.0, + "grad_norm": 1.8114152917186497, + "language_loss": 0.62859941, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.70629776, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.18469238, + "step": 5066, + "time_per_iteration": 2.6319751739501953 + }, + { + "auxiliary_loss_clip": 0.06499436, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 0.0629437, + "balance_loss_mlp": 0.01255602, + "epoch": 0.30464452126860064, + "flos": 17858645677440.0, + "grad_norm": 2.0549933694905653, + "language_loss": 0.82941914, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.90714514, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17565918, + "step": 5067, + "time_per_iteration": 2.483863592147827 + }, + { + "auxiliary_loss_clip": 0.06485019, + "auxiliary_loss_mlp": 0.0127176, + "balance_loss_clip": 0.06289113, + "balance_loss_mlp": 0.01255643, + "epoch": 0.3047046445212686, + "flos": 20637682992000.0, + "grad_norm": 1.9234738451458053, + "language_loss": 0.63749218, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71506, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.16113281, + "step": 5068, + "time_per_iteration": 2.5133988857269287 + }, + { + "auxiliary_loss_clip": 0.06487909, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 0.06291264, + "balance_loss_mlp": 0.01256884, + "epoch": 0.3047647677739366, + "flos": 21293896642560.0, + "grad_norm": 1.767828765686575, + "language_loss": 0.75521863, + "learning_rate": 3.258948470480793e-06, + "loss": 0.8328287, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.1619873, + "step": 5069, + "time_per_iteration": 2.5039985179901123 + }, + { + "auxiliary_loss_clip": 0.06492448, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01255047, + "epoch": 0.30482489102660454, + "flos": 21002218179840.0, + "grad_norm": 2.053197356954631, + "language_loss": 0.76551294, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84314346, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.15551758, + "step": 5070, + "time_per_iteration": 2.56703519821167 + }, + { + "auxiliary_loss_clip": 0.06501058, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 0.06296416, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3048850142792725, + "flos": 26298732689280.0, + "grad_norm": 1.581704774716999, + "language_loss": 0.82567108, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.90344059, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.18139648, + "step": 5071, + "time_per_iteration": 3.9534900188446045 + }, + { + "auxiliary_loss_clip": 0.06502657, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06296133, + "balance_loss_mlp": 0.01253374, + "epoch": 0.30494513753194047, + "flos": 22352813815680.0, + "grad_norm": 1.6603887086526505, + "language_loss": 0.76386344, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.84159869, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17492676, + "step": 5072, + "time_per_iteration": 3.9736859798431396 + }, + { + "auxiliary_loss_clip": 0.06492919, + "auxiliary_loss_mlp": 0.01277102, + "balance_loss_clip": 0.06293403, + "balance_loss_mlp": 0.01260544, + "epoch": 0.30500526078460843, + "flos": 19543909720320.0, + "grad_norm": 1.870095200943675, + "language_loss": 0.71741343, + "learning_rate": 3.257737608512723e-06, + "loss": 0.79511362, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16564941, + "step": 5073, + "time_per_iteration": 3.961787700653076 + }, + { + "auxiliary_loss_clip": 0.064973, + "auxiliary_loss_mlp": 0.01276358, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259752, + "epoch": 0.3050653840372764, + "flos": 14470577360640.0, + "grad_norm": 2.0196062448027843, + "language_loss": 0.76699424, + "learning_rate": 3.257434773758163e-06, + "loss": 0.84473085, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16601562, + "step": 5074, + "time_per_iteration": 2.498986005783081 + }, + { + "auxiliary_loss_clip": 0.06498405, + "auxiliary_loss_mlp": 0.01271199, + "balance_loss_clip": 0.06298129, + "balance_loss_mlp": 0.01254534, + "epoch": 0.30512550728994436, + "flos": 24250736321280.0, + "grad_norm": 2.0830863268570496, + "language_loss": 0.75075227, + "learning_rate": 3.25713189132155e-06, + "loss": 0.8284483, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16662598, + "step": 5075, + "time_per_iteration": 2.586857557296753 + }, + { + "auxiliary_loss_clip": 0.06500411, + "auxiliary_loss_mlp": 0.01274386, + "balance_loss_clip": 0.06294686, + "balance_loss_mlp": 0.01256004, + "epoch": 0.30518563054261233, + "flos": 16365774608640.0, + "grad_norm": 1.8100237719305525, + "language_loss": 0.75655556, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.8343035, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.18371582, + "step": 5076, + "time_per_iteration": 2.4945309162139893 + }, + { + "auxiliary_loss_clip": 0.06496741, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06296699, + "balance_loss_mlp": 0.01252712, + "epoch": 0.30524575379528035, + "flos": 21585952448640.0, + "grad_norm": 4.173383760279569, + "language_loss": 0.79782987, + "learning_rate": 3.25652598344811e-06, + "loss": 0.87550437, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17993164, + "step": 5077, + "time_per_iteration": 2.534932851791382 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01270916, + "balance_loss_clip": 0.06295882, + "balance_loss_mlp": 0.01254012, + "epoch": 0.3053058770479483, + "flos": 16550872277760.0, + "grad_norm": 2.5701417949840146, + "language_loss": 0.7555238, + "learning_rate": 3.256222958034259e-06, + "loss": 0.83312857, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16894531, + "step": 5078, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.06495726, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06297612, + "balance_loss_mlp": 0.01262487, + "epoch": 0.3053660003006163, + "flos": 12317844988800.0, + "grad_norm": 1.8416681282179364, + "language_loss": 0.67517591, + "learning_rate": 3.255919884984307e-06, + "loss": 0.75292945, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.17126465, + "step": 5079, + "time_per_iteration": 3.8981266021728516 + }, + { + "auxiliary_loss_clip": 0.06496017, + "auxiliary_loss_mlp": 0.01271448, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.01253757, + "epoch": 0.30542612355328425, + "flos": 23118962423040.0, + "grad_norm": 1.7235884914338329, + "language_loss": 0.8044346, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.88210917, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17687988, + "step": 5080, + "time_per_iteration": 2.562946081161499 + }, + { + "auxiliary_loss_clip": 0.06497588, + "auxiliary_loss_mlp": 0.01276495, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.01259377, + "epoch": 0.3054862468059522, + "flos": 24396365917440.0, + "grad_norm": 2.5665035909877725, + "language_loss": 0.81653202, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89427292, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17114258, + "step": 5081, + "time_per_iteration": 2.6026763916015625 + }, + { + "auxiliary_loss_clip": 0.06490453, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01253962, + "epoch": 0.3055463700586202, + "flos": 29393529315840.0, + "grad_norm": 1.580638075296793, + "language_loss": 0.72516012, + "learning_rate": 3.255010380132783e-06, + "loss": 0.80277044, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16619873, + "step": 5082, + "time_per_iteration": 2.650310516357422 + }, + { + "auxiliary_loss_clip": 0.06499462, + "auxiliary_loss_mlp": 0.01274957, + "balance_loss_clip": 0.06293429, + "balance_loss_mlp": 0.01257159, + "epoch": 0.30560649331128814, + "flos": 25598606699520.0, + "grad_norm": 2.3807589086926533, + "language_loss": 0.73733467, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.81507885, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.595439910888672 + }, + { + "auxiliary_loss_clip": 0.06488115, + "auxiliary_loss_mlp": 0.01272372, + "balance_loss_clip": 0.0628676, + "balance_loss_mlp": 0.01254729, + "epoch": 0.3056666165639561, + "flos": 19133156206080.0, + "grad_norm": 1.8141392710911106, + "language_loss": 0.71165347, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78925836, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17626953, + "step": 5084, + "time_per_iteration": 2.499873161315918 + }, + { + "auxiliary_loss_clip": 0.06505337, + "auxiliary_loss_mlp": 0.01276239, + "balance_loss_clip": 0.063004, + "balance_loss_mlp": 0.01260194, + "epoch": 0.30572673981662407, + "flos": 15529368752640.0, + "grad_norm": 2.0821129981034567, + "language_loss": 0.79337353, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.87118936, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.16027832, + "step": 5085, + "time_per_iteration": 2.479790449142456 + }, + { + "auxiliary_loss_clip": 0.06486039, + "auxiliary_loss_mlp": 0.01278912, + "balance_loss_clip": 0.06289506, + "balance_loss_mlp": 0.01260602, + "epoch": 0.30578686306929204, + "flos": 21512886088320.0, + "grad_norm": 2.123366644532801, + "language_loss": 0.78524947, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.86289901, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.18310547, + "step": 5086, + "time_per_iteration": 2.5372772216796875 + }, + { + "auxiliary_loss_clip": 0.06487311, + "auxiliary_loss_mlp": 0.01277834, + "balance_loss_clip": 0.06289313, + "balance_loss_mlp": 0.01259797, + "epoch": 0.30584698632196, + "flos": 20959689432960.0, + "grad_norm": 1.7535206397091907, + "language_loss": 0.77160186, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8492533, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18041992, + "step": 5087, + "time_per_iteration": 2.4971578121185303 + }, + { + "auxiliary_loss_clip": 0.06492934, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06288779, + "balance_loss_mlp": 0.01258154, + "epoch": 0.30590710957462797, + "flos": 24688044380160.0, + "grad_norm": 1.802467786704899, + "language_loss": 0.7266196, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.80432141, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.19091797, + "step": 5088, + "time_per_iteration": 2.5416259765625 + }, + { + "auxiliary_loss_clip": 0.06501624, + "auxiliary_loss_mlp": 0.0127311, + "balance_loss_clip": 0.06292014, + "balance_loss_mlp": 0.01253893, + "epoch": 0.30596723282729593, + "flos": 17091700456320.0, + "grad_norm": 2.3226252492467037, + "language_loss": 0.79702371, + "learning_rate": 3.252886537028521e-06, + "loss": 0.874771, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19226074, + "step": 5089, + "time_per_iteration": 2.4745559692382812 + }, + { + "auxiliary_loss_clip": 0.06491631, + "auxiliary_loss_mlp": 0.01275196, + "balance_loss_clip": 0.06291364, + "balance_loss_mlp": 0.01256981, + "epoch": 0.30602735607996395, + "flos": 22863775213440.0, + "grad_norm": 6.857787253608019, + "language_loss": 0.77299303, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.85066134, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 5090, + "time_per_iteration": 2.5330631732940674 + }, + { + "auxiliary_loss_clip": 0.06500913, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.06295903, + "balance_loss_mlp": 0.01260773, + "epoch": 0.3060874793326319, + "flos": 29869173417600.0, + "grad_norm": 1.854909004407163, + "language_loss": 0.76970392, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.84750324, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18237305, + "step": 5091, + "time_per_iteration": 2.561894178390503 + }, + { + "auxiliary_loss_clip": 0.06491988, + "auxiliary_loss_mlp": 0.01272552, + "balance_loss_clip": 0.06287533, + "balance_loss_mlp": 0.01254551, + "epoch": 0.3061476025852999, + "flos": 20454765528960.0, + "grad_norm": 1.7300285931862276, + "language_loss": 0.72878456, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18005371, + "step": 5092, + "time_per_iteration": 2.5661561489105225 + }, + { + "auxiliary_loss_clip": 0.06495406, + "auxiliary_loss_mlp": 0.01276172, + "balance_loss_clip": 0.06294402, + "balance_loss_mlp": 0.01258696, + "epoch": 0.30620772583796785, + "flos": 19397651218560.0, + "grad_norm": 1.8286917674158676, + "language_loss": 0.83293521, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.91065109, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.17468262, + "step": 5093, + "time_per_iteration": 2.49686336517334 + }, + { + "auxiliary_loss_clip": 0.06495437, + "auxiliary_loss_mlp": 0.01277069, + "balance_loss_clip": 0.06295857, + "balance_loss_mlp": 0.01259652, + "epoch": 0.3062678490906358, + "flos": 24031411459200.0, + "grad_norm": 1.7386581048181018, + "language_loss": 0.74963737, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82736242, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17419434, + "step": 5094, + "time_per_iteration": 2.5497004985809326 + }, + { + "auxiliary_loss_clip": 0.06491575, + "auxiliary_loss_mlp": 0.01272234, + "balance_loss_clip": 0.06293601, + "balance_loss_mlp": 0.01255735, + "epoch": 0.3063279723433038, + "flos": 19760593178880.0, + "grad_norm": 1.8971341227661025, + "language_loss": 0.76389223, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84153032, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16503906, + "step": 5095, + "time_per_iteration": 2.493479013442993 + }, + { + "auxiliary_loss_clip": 0.06485657, + "auxiliary_loss_mlp": 0.0128124, + "balance_loss_clip": 0.06288686, + "balance_loss_mlp": 0.01262727, + "epoch": 0.30638809559597174, + "flos": 22455663102720.0, + "grad_norm": 1.6310889817091494, + "language_loss": 0.81246006, + "learning_rate": 3.250760365955042e-06, + "loss": 0.89012897, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.18518066, + "step": 5096, + "time_per_iteration": 2.606100559234619 + }, + { + "auxiliary_loss_clip": 0.06500001, + "auxiliary_loss_mlp": 0.01286183, + "balance_loss_clip": 0.06297529, + "balance_loss_mlp": 0.01269947, + "epoch": 0.3064482188486397, + "flos": 17170846237440.0, + "grad_norm": 2.1701963694762862, + "language_loss": 0.81871414, + "learning_rate": 3.250456437422258e-06, + "loss": 0.89657605, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.16235352, + "step": 5097, + "time_per_iteration": 2.506908893585205 + }, + { + "auxiliary_loss_clip": 0.06498241, + "auxiliary_loss_mlp": 0.01288982, + "balance_loss_clip": 0.06297113, + "balance_loss_mlp": 0.01269647, + "epoch": 0.3065083421013077, + "flos": 23775176073600.0, + "grad_norm": 2.1266024193404385, + "language_loss": 0.7855283, + "learning_rate": 3.250152461472041e-06, + "loss": 0.86340058, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.19335938, + "step": 5098, + "time_per_iteration": 2.546875238418579 + }, + { + "auxiliary_loss_clip": 0.06494713, + "auxiliary_loss_mlp": 0.01291897, + "balance_loss_clip": 0.06296527, + "balance_loss_mlp": 0.0127367, + "epoch": 0.30656846535397564, + "flos": 26438953697280.0, + "grad_norm": 1.8261556885246946, + "language_loss": 0.84430897, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92217511, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.18225098, + "step": 5099, + "time_per_iteration": 2.5726583003997803 + }, + { + "auxiliary_loss_clip": 0.06498358, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06295489, + "balance_loss_mlp": 0.01268434, + "epoch": 0.3066285886066436, + "flos": 26659117100160.0, + "grad_norm": 1.588615118025773, + "language_loss": 0.86241573, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.94027227, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.18859863, + "step": 5100, + "time_per_iteration": 2.5711421966552734 + }, + { + "auxiliary_loss_clip": 0.06496789, + "auxiliary_loss_mlp": 0.01283562, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01264345, + "epoch": 0.30668871185931157, + "flos": 15055443659520.0, + "grad_norm": 1.7244173580954059, + "language_loss": 0.79369497, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87149858, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.19226074, + "step": 5101, + "time_per_iteration": 2.539132833480835 + }, + { + "auxiliary_loss_clip": 0.0650195, + "auxiliary_loss_mlp": 0.01287055, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01268172, + "epoch": 0.30674883511197953, + "flos": 20087966280960.0, + "grad_norm": 1.7739241542858428, + "language_loss": 0.80435872, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.88224876, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.1887207, + "step": 5102, + "time_per_iteration": 2.5558016300201416 + }, + { + "auxiliary_loss_clip": 0.06503183, + "auxiliary_loss_mlp": 0.01284648, + "balance_loss_clip": 0.06301928, + "balance_loss_mlp": 0.01265253, + "epoch": 0.30680895836464755, + "flos": 22900518028800.0, + "grad_norm": 1.6865927559982214, + "language_loss": 0.89335668, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.97123504, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19396973, + "step": 5103, + "time_per_iteration": 2.542555570602417 + }, + { + "auxiliary_loss_clip": 0.06501935, + "auxiliary_loss_mlp": 0.01286618, + "balance_loss_clip": 0.06302223, + "balance_loss_mlp": 0.0126876, + "epoch": 0.3068690816173155, + "flos": 23702948254080.0, + "grad_norm": 2.119732369805114, + "language_loss": 0.74448419, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82236969, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17871094, + "step": 5104, + "time_per_iteration": 2.560253143310547 + }, + { + "auxiliary_loss_clip": 0.06502049, + "auxiliary_loss_mlp": 0.01274873, + "balance_loss_clip": 0.06295487, + "balance_loss_mlp": 0.01257552, + "epoch": 0.3069292048699835, + "flos": 23557947563520.0, + "grad_norm": 1.7334515387821061, + "language_loss": 0.72909176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80686092, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17321777, + "step": 5105, + "time_per_iteration": 2.5751454830169678 + }, + { + "auxiliary_loss_clip": 0.06498945, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 0.06297372, + "balance_loss_mlp": 0.01263907, + "epoch": 0.30698932812265145, + "flos": 24537970517760.0, + "grad_norm": 2.0977567017321608, + "language_loss": 0.87578112, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.95359075, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18103027, + "step": 5106, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.06503764, + "auxiliary_loss_mlp": 0.01279082, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01261189, + "epoch": 0.3070494513753194, + "flos": 21002805158400.0, + "grad_norm": 2.310425767564757, + "language_loss": 0.72092319, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.79875165, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17883301, + "step": 5107, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.06493405, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06294269, + "balance_loss_mlp": 0.01256735, + "epoch": 0.3071095746279874, + "flos": 19031942073600.0, + "grad_norm": 1.99593781887154, + "language_loss": 0.72653455, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80422449, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.18847656, + "step": 5108, + "time_per_iteration": 2.497788190841675 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 0.06297708, + "balance_loss_mlp": 0.01259533, + "epoch": 0.30716969788065535, + "flos": 21221962312320.0, + "grad_norm": 1.48656392648579, + "language_loss": 0.86441541, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94217712, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17578125, + "step": 5109, + "time_per_iteration": 2.563480854034424 + }, + { + "auxiliary_loss_clip": 0.06501789, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.063005, + "balance_loss_mlp": 0.01260541, + "epoch": 0.3072298211333233, + "flos": 25779385883520.0, + "grad_norm": 1.8235353484155168, + "language_loss": 0.67904091, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.75684446, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18029785, + "step": 5110, + "time_per_iteration": 3.9785540103912354 + }, + { + "auxiliary_loss_clip": 0.06493396, + "auxiliary_loss_mlp": 0.01273369, + "balance_loss_clip": 0.06295427, + "balance_loss_mlp": 0.01256727, + "epoch": 0.3072899443859913, + "flos": 25856099896320.0, + "grad_norm": 1.4123986071879864, + "language_loss": 0.76984161, + "learning_rate": 3.246196464379919e-06, + "loss": 0.84750926, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16638184, + "step": 5111, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.06498265, + "auxiliary_loss_mlp": 0.01277301, + "balance_loss_clip": 0.06293567, + "balance_loss_mlp": 0.01258585, + "epoch": 0.30735006763865924, + "flos": 25930130578560.0, + "grad_norm": 2.349951455822933, + "language_loss": 0.67755288, + "learning_rate": 3.245891825796765e-06, + "loss": 0.75530857, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 5112, + "time_per_iteration": 3.963136672973633 + }, + { + "auxiliary_loss_clip": 0.0650286, + "auxiliary_loss_mlp": 0.01277737, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01257614, + "epoch": 0.3074101908913272, + "flos": 30924442938240.0, + "grad_norm": 2.270303220058131, + "language_loss": 0.79939896, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.87720484, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.20117188, + "step": 5113, + "time_per_iteration": 4.084795236587524 + }, + { + "auxiliary_loss_clip": 0.06502695, + "auxiliary_loss_mlp": 0.01276516, + "balance_loss_clip": 0.06297943, + "balance_loss_mlp": 0.01258599, + "epoch": 0.30747031414399517, + "flos": 18406182182400.0, + "grad_norm": 2.072714063381377, + "language_loss": 0.77269047, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.85048258, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17919922, + "step": 5114, + "time_per_iteration": 2.4906773567199707 + }, + { + "auxiliary_loss_clip": 0.06498024, + "auxiliary_loss_mlp": 0.01283612, + "balance_loss_clip": 0.06298083, + "balance_loss_mlp": 0.01265087, + "epoch": 0.30753043739666314, + "flos": 22638957909120.0, + "grad_norm": 1.8131309248321845, + "language_loss": 0.62640405, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70422041, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.18530273, + "step": 5115, + "time_per_iteration": 2.5328574180603027 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06295817, + "balance_loss_mlp": 0.0125513, + "epoch": 0.3075905606493311, + "flos": 27351360806400.0, + "grad_norm": 1.7894066300170501, + "language_loss": 0.83589995, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91363406, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.19213867, + "step": 5116, + "time_per_iteration": 2.562014102935791 + }, + { + "auxiliary_loss_clip": 0.06500115, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06298394, + "balance_loss_mlp": 0.0125512, + "epoch": 0.3076506839019991, + "flos": 22097333116800.0, + "grad_norm": 1.8649453582041782, + "language_loss": 0.76016742, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.18322754, + "step": 5117, + "time_per_iteration": 2.5509209632873535 + }, + { + "auxiliary_loss_clip": 0.06498168, + "auxiliary_loss_mlp": 0.01274202, + "balance_loss_clip": 0.0629583, + "balance_loss_mlp": 0.01256142, + "epoch": 0.3077108071546671, + "flos": 21296160702720.0, + "grad_norm": 2.167097847201453, + "language_loss": 0.72108531, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79880905, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.18054199, + "step": 5118, + "time_per_iteration": 2.5190913677215576 + }, + { + "auxiliary_loss_clip": 0.06502286, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01258198, + "epoch": 0.30777093040733505, + "flos": 21436884835200.0, + "grad_norm": 2.760855389686565, + "language_loss": 0.74956095, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82734126, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17553711, + "step": 5119, + "time_per_iteration": 3.973721981048584 + }, + { + "auxiliary_loss_clip": 0.06494488, + "auxiliary_loss_mlp": 0.01279388, + "balance_loss_clip": 0.06289928, + "balance_loss_mlp": 0.01259814, + "epoch": 0.307831053660003, + "flos": 23156040654720.0, + "grad_norm": 1.7924264386276263, + "language_loss": 0.80264926, + "learning_rate": 3.243453017305926e-06, + "loss": 0.88038802, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 5120, + "time_per_iteration": 2.54705548286438 + }, + { + "auxiliary_loss_clip": 0.06492078, + "auxiliary_loss_mlp": 0.01273208, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01255445, + "epoch": 0.307891176912671, + "flos": 17025510130560.0, + "grad_norm": 1.642273509687288, + "language_loss": 0.80521786, + "learning_rate": 3.24314795393977e-06, + "loss": 0.88287073, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 5121, + "time_per_iteration": 2.515054702758789 + }, + { + "auxiliary_loss_clip": 0.06496292, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06298114, + "balance_loss_mlp": 0.01256875, + "epoch": 0.30795130016533895, + "flos": 27711745217280.0, + "grad_norm": 1.3913461280715187, + "language_loss": 0.82847351, + "learning_rate": 3.242842843433319e-06, + "loss": 0.90618169, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17651367, + "step": 5122, + "time_per_iteration": 2.5832252502441406 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.0632116, + "balance_loss_mlp": 0.01249526, + "epoch": 0.3080114234180069, + "flos": 69080973373440.0, + "grad_norm": 0.7221499072225652, + "language_loss": 0.58650029, + "learning_rate": 3.242537685798143e-06, + "loss": 0.66319263, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.03341675, + "step": 5123, + "time_per_iteration": 3.3316402435302734 + }, + { + "auxiliary_loss_clip": 0.06503562, + "auxiliary_loss_mlp": 0.01279925, + "balance_loss_clip": 0.06296872, + "balance_loss_mlp": 0.01260744, + "epoch": 0.3080715466706749, + "flos": 24066938390400.0, + "grad_norm": 1.6584153298959496, + "language_loss": 0.83586073, + "learning_rate": 3.242232481045813e-06, + "loss": 0.91369557, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1920166, + "step": 5124, + "time_per_iteration": 2.589906930923462 + }, + { + "auxiliary_loss_clip": 0.06498908, + "auxiliary_loss_mlp": 0.01271737, + "balance_loss_clip": 0.06294107, + "balance_loss_mlp": 0.01253629, + "epoch": 0.30813166992334284, + "flos": 25855806407040.0, + "grad_norm": 2.061271988083176, + "language_loss": 0.79248756, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.87019402, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.1809082, + "step": 5125, + "time_per_iteration": 2.550884485244751 + }, + { + "auxiliary_loss_clip": 0.06501068, + "auxiliary_loss_mlp": 0.012774, + "balance_loss_clip": 0.06292764, + "balance_loss_mlp": 0.01258374, + "epoch": 0.3081917931760108, + "flos": 20455981413120.0, + "grad_norm": 2.085029494567846, + "language_loss": 0.64930958, + "learning_rate": 3.241621930235989e-06, + "loss": 0.72709423, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.19018555, + "step": 5126, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.06490224, + "auxiliary_loss_mlp": 0.01277045, + "balance_loss_clip": 0.06294391, + "balance_loss_mlp": 0.01259533, + "epoch": 0.3082519164286788, + "flos": 22173208588800.0, + "grad_norm": 1.5681866965441809, + "language_loss": 0.87117672, + "learning_rate": 3.241316584201646e-06, + "loss": 0.94884944, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.17504883, + "step": 5127, + "time_per_iteration": 2.567615270614624 + }, + { + "auxiliary_loss_clip": 0.0649047, + "auxiliary_loss_mlp": 0.01273562, + "balance_loss_clip": 0.06291968, + "balance_loss_mlp": 0.0125593, + "epoch": 0.30831203968134674, + "flos": 28921029742080.0, + "grad_norm": 1.4544126326452276, + "language_loss": 0.69282925, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.77046961, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.1763916, + "step": 5128, + "time_per_iteration": 2.6129322052001953 + }, + { + "auxiliary_loss_clip": 0.06499469, + "auxiliary_loss_mlp": 0.01276178, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01257843, + "epoch": 0.3083721629340147, + "flos": 25675069150080.0, + "grad_norm": 2.0282558045061396, + "language_loss": 0.7195785, + "learning_rate": 3.240705750931993e-06, + "loss": 0.79733503, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 5129, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.06388761, + "auxiliary_loss_mlp": 0.01275431, + "balance_loss_clip": 0.06292662, + "balance_loss_mlp": 0.01271816, + "epoch": 0.3084322861866827, + "flos": 68233666487040.0, + "grad_norm": 0.8077979927321801, + "language_loss": 0.58935201, + "learning_rate": 3.240400263719846e-06, + "loss": 0.66599393, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03607178, + "step": 5130, + "time_per_iteration": 3.2353098392486572 + }, + { + "auxiliary_loss_clip": 0.06498231, + "auxiliary_loss_mlp": 0.012758, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01258443, + "epoch": 0.3084924094393507, + "flos": 20301630992640.0, + "grad_norm": 2.071340626605126, + "language_loss": 0.73298538, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.81072569, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17370605, + "step": 5131, + "time_per_iteration": 2.523510456085205 + }, + { + "auxiliary_loss_clip": 0.06487547, + "auxiliary_loss_mlp": 0.01274811, + "balance_loss_clip": 0.06290068, + "balance_loss_mlp": 0.01257728, + "epoch": 0.30855253269201866, + "flos": 23956374528000.0, + "grad_norm": 1.6208223340220833, + "language_loss": 0.71358359, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.79120713, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17077637, + "step": 5132, + "time_per_iteration": 2.581470012664795 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01273323, + "balance_loss_clip": 0.06290212, + "balance_loss_mlp": 0.01255262, + "epoch": 0.3086126559446866, + "flos": 19288009751040.0, + "grad_norm": 1.7801590489825803, + "language_loss": 0.90374929, + "learning_rate": 3.239483519913136e-06, + "loss": 0.98135513, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.18066406, + "step": 5133, + "time_per_iteration": 2.5197763442993164 + }, + { + "auxiliary_loss_clip": 0.06499831, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06295495, + "balance_loss_mlp": 0.01257105, + "epoch": 0.3086727791973546, + "flos": 33768328913280.0, + "grad_norm": 1.8524807236065886, + "language_loss": 0.67443442, + "learning_rate": 3.239177844626102e-06, + "loss": 0.75218379, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 5134, + "time_per_iteration": 2.664303779602051 + }, + { + "auxiliary_loss_clip": 0.06498815, + "auxiliary_loss_mlp": 0.01275704, + "balance_loss_clip": 0.06293166, + "balance_loss_mlp": 0.01257167, + "epoch": 0.30873290245002255, + "flos": 16039659317760.0, + "grad_norm": 1.8927812104332384, + "language_loss": 0.83517784, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91292304, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18518066, + "step": 5135, + "time_per_iteration": 2.505138397216797 + }, + { + "auxiliary_loss_clip": 0.06377634, + "auxiliary_loss_mlp": 0.01258895, + "balance_loss_clip": 0.06282344, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3087930257026905, + "flos": 65070415474560.0, + "grad_norm": 0.6863645266912056, + "language_loss": 0.55337238, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.62973773, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.0329895, + "step": 5136, + "time_per_iteration": 3.179166555404663 + }, + { + "auxiliary_loss_clip": 0.06488921, + "auxiliary_loss_mlp": 0.01274465, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.01257085, + "epoch": 0.3088531489553585, + "flos": 74754001733760.0, + "grad_norm": 1.8635236180899502, + "language_loss": 0.76610464, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.8437385, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.1739502, + "step": 5137, + "time_per_iteration": 2.9993999004364014 + }, + { + "auxiliary_loss_clip": 0.06489644, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01255458, + "epoch": 0.30891327220802645, + "flos": 21148686316800.0, + "grad_norm": 1.7480087539569926, + "language_loss": 0.80450445, + "learning_rate": 3.237954673696424e-06, + "loss": 0.882128, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17248535, + "step": 5138, + "time_per_iteration": 2.531916856765747 + }, + { + "auxiliary_loss_clip": 0.06496161, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06294001, + "balance_loss_mlp": 0.01258896, + "epoch": 0.3089733954606944, + "flos": 25671295716480.0, + "grad_norm": 1.629930216805369, + "language_loss": 0.81626344, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.89398789, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.1739502, + "step": 5139, + "time_per_iteration": 2.585380792617798 + }, + { + "auxiliary_loss_clip": 0.06501773, + "auxiliary_loss_mlp": 0.01277306, + "balance_loss_clip": 0.06292425, + "balance_loss_mlp": 0.01258817, + "epoch": 0.3090335187133624, + "flos": 19433429712000.0, + "grad_norm": 2.0033599705043854, + "language_loss": 0.77724934, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.85504013, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18481445, + "step": 5140, + "time_per_iteration": 2.504387617111206 + }, + { + "auxiliary_loss_clip": 0.06482549, + "auxiliary_loss_mlp": 0.01272919, + "balance_loss_clip": 0.06290817, + "balance_loss_mlp": 0.0125741, + "epoch": 0.30909364196603034, + "flos": 20017541324160.0, + "grad_norm": 1.9132937458234096, + "language_loss": 0.78916645, + "learning_rate": 3.237036802553252e-06, + "loss": 0.86672109, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 5141, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.06494773, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 0.06291379, + "balance_loss_mlp": 0.01260543, + "epoch": 0.3091537652186983, + "flos": 19682830990080.0, + "grad_norm": 2.2087235088394728, + "language_loss": 0.8789897, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.95671201, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16906738, + "step": 5142, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06498981, + "auxiliary_loss_mlp": 0.01276818, + "balance_loss_clip": 0.06294474, + "balance_loss_mlp": 0.01259438, + "epoch": 0.3092138884713663, + "flos": 17025845546880.0, + "grad_norm": 2.3473661014686984, + "language_loss": 0.7985431, + "learning_rate": 3.23642465389567e-06, + "loss": 0.87630117, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.17382812, + "step": 5143, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.06489455, + "auxiliary_loss_mlp": 0.01277055, + "balance_loss_clip": 0.06291586, + "balance_loss_mlp": 0.01260378, + "epoch": 0.3092740117240343, + "flos": 25017052636800.0, + "grad_norm": 1.6187717199492768, + "language_loss": 0.72479737, + "learning_rate": 3.236118509233055e-06, + "loss": 0.8024624, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16662598, + "step": 5144, + "time_per_iteration": 2.547358989715576 + }, + { + "auxiliary_loss_clip": 0.06496169, + "auxiliary_loss_mlp": 0.01272398, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01256138, + "epoch": 0.30933413497670226, + "flos": 25597013472000.0, + "grad_norm": 2.2714150562550466, + "language_loss": 0.74676621, + "learning_rate": 3.235812317696702e-06, + "loss": 0.82445192, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16271973, + "step": 5145, + "time_per_iteration": 2.6273365020751953 + }, + { + "auxiliary_loss_clip": 0.06490701, + "auxiliary_loss_mlp": 0.01273039, + "balance_loss_clip": 0.06289125, + "balance_loss_mlp": 0.01256296, + "epoch": 0.3093942582293702, + "flos": 24396617479680.0, + "grad_norm": 1.731689317121935, + "language_loss": 0.76830649, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.84594393, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.16729736, + "step": 5146, + "time_per_iteration": 2.5352702140808105 + }, + { + "auxiliary_loss_clip": 0.06485911, + "auxiliary_loss_mlp": 0.01273533, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.0125707, + "epoch": 0.3094543814820382, + "flos": 19652586865920.0, + "grad_norm": 1.8011449994622988, + "language_loss": 0.66675043, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.74434483, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16467285, + "step": 5147, + "time_per_iteration": 2.545940637588501 + }, + { + "auxiliary_loss_clip": 0.06492072, + "auxiliary_loss_mlp": 0.01271267, + "balance_loss_clip": 0.0628895, + "balance_loss_mlp": 0.01253731, + "epoch": 0.30951450473470615, + "flos": 25670499102720.0, + "grad_norm": 1.8580519203508368, + "language_loss": 0.74971956, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.82735288, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17529297, + "step": 5148, + "time_per_iteration": 2.5673537254333496 + }, + { + "auxiliary_loss_clip": 0.06501722, + "auxiliary_loss_mlp": 0.01278545, + "balance_loss_clip": 0.06290632, + "balance_loss_mlp": 0.01260342, + "epoch": 0.3095746279873741, + "flos": 12025202204160.0, + "grad_norm": 2.1335435485893166, + "language_loss": 0.73367, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.18212891, + "step": 5149, + "time_per_iteration": 2.682609796524048 + }, + { + "auxiliary_loss_clip": 0.06497431, + "auxiliary_loss_mlp": 0.01277143, + "balance_loss_clip": 0.06292653, + "balance_loss_mlp": 0.01258534, + "epoch": 0.3096347512400421, + "flos": 23629798039680.0, + "grad_norm": 1.913638713978071, + "language_loss": 0.85296845, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.93071413, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.18591309, + "step": 5150, + "time_per_iteration": 3.9813008308410645 + }, + { + "auxiliary_loss_clip": 0.06483387, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06285527, + "balance_loss_mlp": 0.01256815, + "epoch": 0.30969487449271005, + "flos": 22536024768000.0, + "grad_norm": 1.8960829077128427, + "language_loss": 0.79181123, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86938894, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.17565918, + "step": 5151, + "time_per_iteration": 2.5336477756500244 + }, + { + "auxiliary_loss_clip": 0.06493182, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.06291731, + "balance_loss_mlp": 0.01257426, + "epoch": 0.309754997745378, + "flos": 15273301075200.0, + "grad_norm": 2.079664023782487, + "language_loss": 0.67843604, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.75611162, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16931152, + "step": 5152, + "time_per_iteration": 5.332815647125244 + }, + { + "auxiliary_loss_clip": 0.06492282, + "auxiliary_loss_mlp": 0.01278303, + "balance_loss_clip": 0.06293005, + "balance_loss_mlp": 0.01261888, + "epoch": 0.309815120998046, + "flos": 26986532129280.0, + "grad_norm": 1.9990242894688834, + "language_loss": 0.83170605, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.90941191, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16394043, + "step": 5153, + "time_per_iteration": 2.5944862365722656 + }, + { + "auxiliary_loss_clip": 0.06488585, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.0125709, + "epoch": 0.30987524425071394, + "flos": 21149692565760.0, + "grad_norm": 1.7708804151784365, + "language_loss": 0.74136615, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.81899732, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.529526948928833 + }, + { + "auxiliary_loss_clip": 0.0648791, + "auxiliary_loss_mlp": 0.01284436, + "balance_loss_clip": 0.06292189, + "balance_loss_mlp": 0.01267544, + "epoch": 0.3099353675033819, + "flos": 15273720345600.0, + "grad_norm": 2.7515131151360763, + "language_loss": 0.76419097, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84191442, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16882324, + "step": 5155, + "time_per_iteration": 2.5338993072509766 + }, + { + "auxiliary_loss_clip": 0.06490543, + "auxiliary_loss_mlp": 0.01273122, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01256373, + "epoch": 0.30999549075604993, + "flos": 15419182233600.0, + "grad_norm": 1.684257178792462, + "language_loss": 0.79886794, + "learning_rate": 3.232441120452094e-06, + "loss": 0.87650466, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1673584, + "step": 5156, + "time_per_iteration": 2.5190272331237793 + }, + { + "auxiliary_loss_clip": 0.06493768, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 0.06290715, + "balance_loss_mlp": 0.01264821, + "epoch": 0.3100556140087179, + "flos": 23191106388480.0, + "grad_norm": 2.1803769191775197, + "language_loss": 0.74967813, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82743037, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16625977, + "step": 5157, + "time_per_iteration": 2.59045147895813 + }, + { + "auxiliary_loss_clip": 0.06486322, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06289537, + "balance_loss_mlp": 0.01258921, + "epoch": 0.31011573726138586, + "flos": 25749770664960.0, + "grad_norm": 2.4337865277632065, + "language_loss": 0.69860423, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7762109, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1541748, + "step": 5158, + "time_per_iteration": 4.041999578475952 + }, + { + "auxiliary_loss_clip": 0.06488799, + "auxiliary_loss_mlp": 0.0127365, + "balance_loss_clip": 0.0629247, + "balance_loss_mlp": 0.0125795, + "epoch": 0.3101758605140538, + "flos": 20017541324160.0, + "grad_norm": 2.0387737109261477, + "language_loss": 0.84883308, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92645758, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.15673828, + "step": 5159, + "time_per_iteration": 2.5081369876861572 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127455, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01257002, + "epoch": 0.3102359837667218, + "flos": 19141751249280.0, + "grad_norm": 1.926707434190644, + "language_loss": 0.85498118, + "learning_rate": 3.231213827702462e-06, + "loss": 0.93264508, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17529297, + "step": 5160, + "time_per_iteration": 2.5466468334198 + }, + { + "auxiliary_loss_clip": 0.06486624, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.06291263, + "balance_loss_mlp": 0.01253945, + "epoch": 0.31029610701938976, + "flos": 22270649287680.0, + "grad_norm": 1.6869427612303989, + "language_loss": 0.75787026, + "learning_rate": 3.230906887766584e-06, + "loss": 0.83543712, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.16113281, + "step": 5161, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.06491208, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 0.06289751, + "balance_loss_mlp": 0.01256915, + "epoch": 0.3103562302720577, + "flos": 20810244476160.0, + "grad_norm": 2.463900279304932, + "language_loss": 0.8222912, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.89995265, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.18029785, + "step": 5162, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.06485277, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06289959, + "balance_loss_mlp": 0.01253594, + "epoch": 0.3104163535247257, + "flos": 22350382047360.0, + "grad_norm": 1.4717884967200954, + "language_loss": 0.83087295, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.90841573, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.15423584, + "step": 5163, + "time_per_iteration": 2.542052745819092 + }, + { + "auxiliary_loss_clip": 0.06490193, + "auxiliary_loss_mlp": 0.01271791, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125559, + "epoch": 0.31047647677739365, + "flos": 21695803551360.0, + "grad_norm": 1.756895513371669, + "language_loss": 0.76630449, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84392428, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16186523, + "step": 5164, + "time_per_iteration": 2.5616652965545654 + }, + { + "auxiliary_loss_clip": 0.06486434, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.0628885, + "balance_loss_mlp": 0.01258331, + "epoch": 0.3105366000300616, + "flos": 18923390709120.0, + "grad_norm": 1.866784827400394, + "language_loss": 0.75307393, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83068419, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16271973, + "step": 5165, + "time_per_iteration": 2.5190699100494385 + }, + { + "auxiliary_loss_clip": 0.06483215, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.062862, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3105967232827296, + "flos": 18266380444800.0, + "grad_norm": 1.5432274368627708, + "language_loss": 0.76476973, + "learning_rate": 3.229371488178348e-06, + "loss": 0.84231985, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.16699219, + "step": 5166, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.06486712, + "auxiliary_loss_mlp": 0.01273485, + "balance_loss_clip": 0.06287863, + "balance_loss_mlp": 0.01256796, + "epoch": 0.31065684653539755, + "flos": 17677279514880.0, + "grad_norm": 2.119255684006569, + "language_loss": 0.74129677, + "learning_rate": 3.229064268360444e-06, + "loss": 0.81889874, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.16687012, + "step": 5167, + "time_per_iteration": 2.5039737224578857 + }, + { + "auxiliary_loss_clip": 0.06378125, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06284033, + "balance_loss_mlp": 0.01258356, + "epoch": 0.3107169697880655, + "flos": 68551522151040.0, + "grad_norm": 0.7172817016896729, + "language_loss": 0.53065968, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.60705864, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.03417969, + "step": 5168, + "time_per_iteration": 3.211498737335205 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127061, + "balance_loss_clip": 0.06290184, + "balance_loss_mlp": 0.01254052, + "epoch": 0.3107770930407335, + "flos": 13193844698880.0, + "grad_norm": 1.7226101243088363, + "language_loss": 0.79536855, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87299311, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16552734, + "step": 5169, + "time_per_iteration": 2.526906728744507 + }, + { + "auxiliary_loss_clip": 0.06491011, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.01254328, + "epoch": 0.3108372162934015, + "flos": 31589587048320.0, + "grad_norm": 1.7384868970357352, + "language_loss": 0.6439994, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.17077637, + "step": 5170, + "time_per_iteration": 2.659008264541626 + }, + { + "auxiliary_loss_clip": 0.06488822, + "auxiliary_loss_mlp": 0.01276189, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01258927, + "epoch": 0.31089733954606946, + "flos": 28737231811200.0, + "grad_norm": 2.2754975952460086, + "language_loss": 0.77238673, + "learning_rate": 3.22783492314295e-06, + "loss": 0.8500368, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17260742, + "step": 5171, + "time_per_iteration": 2.5726847648620605 + }, + { + "auxiliary_loss_clip": 0.06489364, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 0.06290348, + "balance_loss_mlp": 0.01258294, + "epoch": 0.3109574627987374, + "flos": 19689455462400.0, + "grad_norm": 1.774750718996553, + "language_loss": 0.84023309, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.91787583, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16625977, + "step": 5172, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.06485899, + "auxiliary_loss_mlp": 0.01271683, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3110175860514054, + "flos": 14689231390080.0, + "grad_norm": 2.444929493076507, + "language_loss": 0.8466565, + "learning_rate": 3.227219971129842e-06, + "loss": 0.92423236, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17199707, + "step": 5173, + "time_per_iteration": 2.477851629257202 + }, + { + "auxiliary_loss_clip": 0.06478094, + "auxiliary_loss_mlp": 0.01270979, + "balance_loss_clip": 0.06285643, + "balance_loss_mlp": 0.01255279, + "epoch": 0.31107770930407336, + "flos": 25746835772160.0, + "grad_norm": 1.6684709759498597, + "language_loss": 0.83928138, + "learning_rate": 3.226912425313001e-06, + "loss": 0.91677213, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 5174, + "time_per_iteration": 2.6188318729400635 + }, + { + "auxiliary_loss_clip": 0.06483682, + "auxiliary_loss_mlp": 0.0127308, + "balance_loss_clip": 0.06284115, + "balance_loss_mlp": 0.01256057, + "epoch": 0.3111378325567413, + "flos": 19214272558080.0, + "grad_norm": 2.0188284806938945, + "language_loss": 0.85820258, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.93577021, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 5175, + "time_per_iteration": 2.489356756210327 + }, + { + "auxiliary_loss_clip": 0.06477995, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3111979558094093, + "flos": 23703199816320.0, + "grad_norm": 1.907748003287586, + "language_loss": 0.84357607, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92110729, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17773438, + "step": 5176, + "time_per_iteration": 2.599229574203491 + }, + { + "auxiliary_loss_clip": 0.06476277, + "auxiliary_loss_mlp": 0.01273206, + "balance_loss_clip": 0.06279132, + "balance_loss_mlp": 0.01255468, + "epoch": 0.31125807906207725, + "flos": 21039422192640.0, + "grad_norm": 2.9714078029027977, + "language_loss": 0.80720133, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.88469613, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.17736816, + "step": 5177, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.06482373, + "auxiliary_loss_mlp": 0.01272639, + "balance_loss_clip": 0.06283157, + "balance_loss_mlp": 0.01255353, + "epoch": 0.3113182023147452, + "flos": 23083435491840.0, + "grad_norm": 1.9531801027744504, + "language_loss": 0.81037831, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.88792837, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17285156, + "step": 5178, + "time_per_iteration": 2.6086864471435547 + }, + { + "auxiliary_loss_clip": 0.06483644, + "auxiliary_loss_mlp": 0.01276661, + "balance_loss_clip": 0.06283852, + "balance_loss_mlp": 0.01259316, + "epoch": 0.3113783255674132, + "flos": 11843919895680.0, + "grad_norm": 1.9055325557306373, + "language_loss": 0.81524587, + "learning_rate": 3.225373998592471e-06, + "loss": 0.89284897, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.17346191, + "step": 5179, + "time_per_iteration": 2.4582295417785645 + }, + { + "auxiliary_loss_clip": 0.06482498, + "auxiliary_loss_mlp": 0.01272412, + "balance_loss_clip": 0.06285708, + "balance_loss_mlp": 0.01255926, + "epoch": 0.31143844882008115, + "flos": 16295098089600.0, + "grad_norm": 1.625598326664227, + "language_loss": 0.78714401, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.86469316, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.16491699, + "step": 5180, + "time_per_iteration": 2.4980807304382324 + }, + { + "auxiliary_loss_clip": 0.06486566, + "auxiliary_loss_mlp": 0.01274849, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3114985720727491, + "flos": 23223824208000.0, + "grad_norm": 4.8505374097148595, + "language_loss": 0.83649975, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91411394, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.17102051, + "step": 5181, + "time_per_iteration": 2.519810438156128 + }, + { + "auxiliary_loss_clip": 0.0648061, + "auxiliary_loss_mlp": 0.01273344, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01258348, + "epoch": 0.3115586953254171, + "flos": 30052468224000.0, + "grad_norm": 1.6592506395593873, + "language_loss": 0.74442661, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.82196611, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.15002441, + "step": 5182, + "time_per_iteration": 2.6227729320526123 + }, + { + "auxiliary_loss_clip": 0.06490366, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06288615, + "balance_loss_mlp": 0.01254362, + "epoch": 0.3116188185780851, + "flos": 25673433995520.0, + "grad_norm": 2.0195817263542852, + "language_loss": 0.70974112, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.78735352, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16503906, + "step": 5183, + "time_per_iteration": 2.5801775455474854 + }, + { + "auxiliary_loss_clip": 0.06369011, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.0126376, + "epoch": 0.31167894183075306, + "flos": 69528568285440.0, + "grad_norm": 0.9410725627351464, + "language_loss": 0.59133947, + "learning_rate": 3.223834410214408e-06, + "loss": 0.66769892, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.03182983, + "step": 5184, + "time_per_iteration": 3.1446807384490967 + }, + { + "auxiliary_loss_clip": 0.06488199, + "auxiliary_loss_mlp": 0.01277241, + "balance_loss_clip": 0.06288702, + "balance_loss_mlp": 0.01260206, + "epoch": 0.31173906508342103, + "flos": 14945215213440.0, + "grad_norm": 2.5697318046341424, + "language_loss": 0.69689488, + "learning_rate": 3.223526353268311e-06, + "loss": 0.77454925, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17041016, + "step": 5185, + "time_per_iteration": 2.51505446434021 + }, + { + "auxiliary_loss_clip": 0.06492566, + "auxiliary_loss_mlp": 0.01273506, + "balance_loss_clip": 0.06291321, + "balance_loss_mlp": 0.01256507, + "epoch": 0.311799188336089, + "flos": 16180886574720.0, + "grad_norm": 2.500262239817252, + "language_loss": 0.63946617, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.71712691, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.17004395, + "step": 5186, + "time_per_iteration": 2.505030870437622 + }, + { + "auxiliary_loss_clip": 0.06492127, + "auxiliary_loss_mlp": 0.01277284, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01258592, + "epoch": 0.31185931158875696, + "flos": 25016633366400.0, + "grad_norm": 2.1681671670490603, + "language_loss": 0.86641979, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.94411391, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18688965, + "step": 5187, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.06487665, + "auxiliary_loss_mlp": 0.01281669, + "balance_loss_clip": 0.06287494, + "balance_loss_mlp": 0.01264527, + "epoch": 0.3119194348414249, + "flos": 37242041702400.0, + "grad_norm": 1.4465041932602023, + "language_loss": 0.6305244, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70821768, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 5188, + "time_per_iteration": 2.7036139965057373 + }, + { + "auxiliary_loss_clip": 0.06486794, + "auxiliary_loss_mlp": 0.01278194, + "balance_loss_clip": 0.06286722, + "balance_loss_mlp": 0.01261397, + "epoch": 0.3119795580940929, + "flos": 15018155792640.0, + "grad_norm": 2.1005201528303683, + "language_loss": 0.83722234, + "learning_rate": 3.222293661638346e-06, + "loss": 0.91487223, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 5189, + "time_per_iteration": 3.933061361312866 + }, + { + "auxiliary_loss_clip": 0.06481164, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01262602, + "epoch": 0.31203968134676086, + "flos": 16003755043200.0, + "grad_norm": 2.4405990352060862, + "language_loss": 0.79429829, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87189662, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.16064453, + "step": 5190, + "time_per_iteration": 2.479335308074951 + }, + { + "auxiliary_loss_clip": 0.0648755, + "auxiliary_loss_mlp": 0.01275874, + "balance_loss_clip": 0.06287287, + "balance_loss_mlp": 0.01259292, + "epoch": 0.3120998045994288, + "flos": 23843378897280.0, + "grad_norm": 1.451249914697294, + "language_loss": 0.75502658, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.83266091, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16589355, + "step": 5191, + "time_per_iteration": 3.997621536254883 + }, + { + "auxiliary_loss_clip": 0.06364973, + "auxiliary_loss_mlp": 0.01267778, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01264178, + "epoch": 0.3121599278520968, + "flos": 69203081900160.0, + "grad_norm": 0.8286054534369729, + "language_loss": 0.63964236, + "learning_rate": 3.221368656205247e-06, + "loss": 0.71596992, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.03594971, + "step": 5192, + "time_per_iteration": 4.631687879562378 + }, + { + "auxiliary_loss_clip": 0.06487048, + "auxiliary_loss_mlp": 0.01274026, + "balance_loss_clip": 0.06284614, + "balance_loss_mlp": 0.01254916, + "epoch": 0.31222005110476475, + "flos": 23813302481280.0, + "grad_norm": 1.6272414578256373, + "language_loss": 0.80280936, + "learning_rate": 3.221060228416446e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.19116211, + "step": 5193, + "time_per_iteration": 2.5469777584075928 + }, + { + "auxiliary_loss_clip": 0.06487141, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06286725, + "balance_loss_mlp": 0.01255244, + "epoch": 0.3122801743574327, + "flos": 25232771773440.0, + "grad_norm": 1.8740192083695482, + "language_loss": 0.72266662, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.80028057, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.19006348, + "step": 5194, + "time_per_iteration": 2.5416929721832275 + }, + { + "auxiliary_loss_clip": 0.06483766, + "auxiliary_loss_mlp": 0.01273792, + "balance_loss_clip": 0.06285778, + "balance_loss_mlp": 0.01257604, + "epoch": 0.3123402976101007, + "flos": 22973165118720.0, + "grad_norm": 1.4810805631902553, + "language_loss": 0.77076054, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.8483361, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16186523, + "step": 5195, + "time_per_iteration": 2.5890305042266846 + }, + { + "auxiliary_loss_clip": 0.06489303, + "auxiliary_loss_mlp": 0.01273064, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256268, + "epoch": 0.3124004208627687, + "flos": 25199131559040.0, + "grad_norm": 1.3828607146804377, + "language_loss": 0.78218812, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85981178, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16796875, + "step": 5196, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.06360652, + "auxiliary_loss_mlp": 0.0126022, + "balance_loss_clip": 0.06268834, + "balance_loss_mlp": 0.01256831, + "epoch": 0.31246054411543667, + "flos": 67506398974080.0, + "grad_norm": 0.7576873975695796, + "language_loss": 0.54860902, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.62481773, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.03396606, + "step": 5197, + "time_per_iteration": 4.588749170303345 + }, + { + "auxiliary_loss_clip": 0.06482677, + "auxiliary_loss_mlp": 0.0127766, + "balance_loss_clip": 0.06286696, + "balance_loss_mlp": 0.01261424, + "epoch": 0.31252066736810463, + "flos": 17864347754880.0, + "grad_norm": 1.7824095594325715, + "language_loss": 0.67078102, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74838442, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.16247559, + "step": 5198, + "time_per_iteration": 2.5304651260375977 + }, + { + "auxiliary_loss_clip": 0.06490927, + "auxiliary_loss_mlp": 0.01280145, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261596, + "epoch": 0.3125807906207726, + "flos": 18480338645760.0, + "grad_norm": 2.4146329055675264, + "language_loss": 0.70401263, + "learning_rate": 3.219208689735857e-06, + "loss": 0.78172338, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 2.5358517169952393 + }, + { + "auxiliary_loss_clip": 0.06486207, + "auxiliary_loss_mlp": 0.01275953, + "balance_loss_clip": 0.06286721, + "balance_loss_mlp": 0.01258751, + "epoch": 0.31264091387344056, + "flos": 18951454627200.0, + "grad_norm": 1.7917967449154466, + "language_loss": 0.79258394, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.87020558, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.17211914, + "step": 5200, + "time_per_iteration": 2.5519278049468994 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 0.06284697, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3127010371261085, + "flos": 21474591972480.0, + "grad_norm": 1.8808343302197998, + "language_loss": 0.83758473, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.91515636, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.16711426, + "step": 5201, + "time_per_iteration": 2.509331226348877 + }, + { + "auxiliary_loss_clip": 0.06487838, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 0.06288306, + "balance_loss_mlp": 0.01262006, + "epoch": 0.3127611603787765, + "flos": 15340623431040.0, + "grad_norm": 2.173524859167814, + "language_loss": 0.69690537, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17224121, + "step": 5202, + "time_per_iteration": 2.52652907371521 + }, + { + "auxiliary_loss_clip": 0.06486704, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06286184, + "balance_loss_mlp": 0.01261257, + "epoch": 0.31282128363144446, + "flos": 17608741274880.0, + "grad_norm": 2.6038382996561604, + "language_loss": 0.83874559, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.91639626, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.17077637, + "step": 5203, + "time_per_iteration": 2.502721071243286 + }, + { + "auxiliary_loss_clip": 0.06488604, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 0.06287186, + "balance_loss_mlp": 0.01256604, + "epoch": 0.3128814068841124, + "flos": 26763349979520.0, + "grad_norm": 2.412675439541041, + "language_loss": 0.61310971, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69073772, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17602539, + "step": 5204, + "time_per_iteration": 2.62591814994812 + }, + { + "auxiliary_loss_clip": 0.06482827, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 0.0628654, + "balance_loss_mlp": 0.01259553, + "epoch": 0.3129415301367804, + "flos": 22278783133440.0, + "grad_norm": 1.7324044566720012, + "language_loss": 0.66418731, + "learning_rate": 3.217355486684887e-06, + "loss": 0.74176717, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.15582275, + "step": 5205, + "time_per_iteration": 2.512777328491211 + }, + { + "auxiliary_loss_clip": 0.06487758, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 0.06287788, + "balance_loss_mlp": 0.01260021, + "epoch": 0.31300165338944835, + "flos": 26471461881600.0, + "grad_norm": 1.8344199627772577, + "language_loss": 0.77298087, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.85063475, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17614746, + "step": 5206, + "time_per_iteration": 2.5712244510650635 + }, + { + "auxiliary_loss_clip": 0.06485735, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06288184, + "balance_loss_mlp": 0.01255488, + "epoch": 0.3130617766421163, + "flos": 21951116542080.0, + "grad_norm": 2.0121384013718226, + "language_loss": 0.83184564, + "learning_rate": 3.216737382911672e-06, + "loss": 0.90941995, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16210938, + "step": 5207, + "time_per_iteration": 2.5004825592041016 + }, + { + "auxiliary_loss_clip": 0.06481713, + "auxiliary_loss_mlp": 0.01271341, + "balance_loss_clip": 0.06286129, + "balance_loss_mlp": 0.0125489, + "epoch": 0.3131218998947843, + "flos": 23299154628480.0, + "grad_norm": 2.0890442442793478, + "language_loss": 0.71795774, + "learning_rate": 3.216428261810999e-06, + "loss": 0.79548824, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 5208, + "time_per_iteration": 2.5763585567474365 + }, + { + "auxiliary_loss_clip": 0.06485837, + "auxiliary_loss_mlp": 0.01275661, + "balance_loss_clip": 0.06287587, + "balance_loss_mlp": 0.0125927, + "epoch": 0.3131820231474523, + "flos": 21145583715840.0, + "grad_norm": 1.890905451265213, + "language_loss": 0.74832964, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82594466, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.1640625, + "step": 5209, + "time_per_iteration": 2.510582685470581 + }, + { + "auxiliary_loss_clip": 0.06483819, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06284019, + "balance_loss_mlp": 0.01255678, + "epoch": 0.31324214640012027, + "flos": 23915816352000.0, + "grad_norm": 1.8368712630160764, + "language_loss": 0.77846575, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.85602105, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16027832, + "step": 5210, + "time_per_iteration": 2.5457394123077393 + }, + { + "auxiliary_loss_clip": 0.06472643, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01262963, + "epoch": 0.31330226965278823, + "flos": 22243507764480.0, + "grad_norm": 1.7690758446531836, + "language_loss": 0.79563594, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.87314838, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15643311, + "step": 5211, + "time_per_iteration": 2.5383517742156982 + }, + { + "auxiliary_loss_clip": 0.0648172, + "auxiliary_loss_mlp": 0.01270065, + "balance_loss_clip": 0.06285914, + "balance_loss_mlp": 0.01254699, + "epoch": 0.3133623929054562, + "flos": 19759838492160.0, + "grad_norm": 1.6892345584465767, + "language_loss": 0.79993588, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.87745374, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.15368652, + "step": 5212, + "time_per_iteration": 2.5550856590270996 + }, + { + "auxiliary_loss_clip": 0.06489062, + "auxiliary_loss_mlp": 0.01276168, + "balance_loss_clip": 0.06287421, + "balance_loss_mlp": 0.01258919, + "epoch": 0.31342251615812416, + "flos": 27169617300480.0, + "grad_norm": 2.030797991853156, + "language_loss": 0.71651685, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.79416913, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.17248535, + "step": 5213, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.06486979, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06285015, + "balance_loss_mlp": 0.01258763, + "epoch": 0.31348263941079213, + "flos": 20235985718400.0, + "grad_norm": 2.164105834219518, + "language_loss": 0.77949297, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.85711956, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16918945, + "step": 5214, + "time_per_iteration": 2.539149761199951 + }, + { + "auxiliary_loss_clip": 0.06478322, + "auxiliary_loss_mlp": 0.0127674, + "balance_loss_clip": 0.06285194, + "balance_loss_mlp": 0.01261267, + "epoch": 0.3135427626634601, + "flos": 24614474895360.0, + "grad_norm": 1.5354860146289633, + "language_loss": 0.82935429, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90690494, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.15472412, + "step": 5215, + "time_per_iteration": 2.541269302368164 + }, + { + "auxiliary_loss_clip": 0.06486098, + "auxiliary_loss_mlp": 0.01273565, + "balance_loss_clip": 0.06288007, + "balance_loss_mlp": 0.01257186, + "epoch": 0.31360288591612806, + "flos": 20966230051200.0, + "grad_norm": 1.8278899125375987, + "language_loss": 0.79790628, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87550294, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16369629, + "step": 5216, + "time_per_iteration": 2.5465261936187744 + }, + { + "auxiliary_loss_clip": 0.06489767, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.06286536, + "balance_loss_mlp": 0.01258722, + "epoch": 0.313663009168796, + "flos": 26987957648640.0, + "grad_norm": 1.8964979694160957, + "language_loss": 0.68953168, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76720947, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.19299316, + "step": 5217, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.06480299, + "auxiliary_loss_mlp": 0.01275451, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01259298, + "epoch": 0.313723132421464, + "flos": 18046762093440.0, + "grad_norm": 1.6389262097165689, + "language_loss": 0.80772746, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.88528496, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16149902, + "step": 5218, + "time_per_iteration": 2.5255727767944336 + }, + { + "auxiliary_loss_clip": 0.06485314, + "auxiliary_loss_mlp": 0.0127641, + "balance_loss_clip": 0.06285116, + "balance_loss_mlp": 0.01259363, + "epoch": 0.31378325567413196, + "flos": 22494963467520.0, + "grad_norm": 2.253901481236794, + "language_loss": 0.70057523, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.17047119, + "step": 5219, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.06483484, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06284904, + "balance_loss_mlp": 0.01259181, + "epoch": 0.3138433789267999, + "flos": 22425838248960.0, + "grad_norm": 1.9320324134388631, + "language_loss": 0.80156839, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.17736816, + "step": 5220, + "time_per_iteration": 2.5364530086517334 + }, + { + "auxiliary_loss_clip": 0.06484166, + "auxiliary_loss_mlp": 0.01276534, + "balance_loss_clip": 0.06287254, + "balance_loss_mlp": 0.01260751, + "epoch": 0.3139035021794679, + "flos": 13010927235840.0, + "grad_norm": 1.8390249578816682, + "language_loss": 0.73235905, + "learning_rate": 3.212405494206986e-06, + "loss": 0.80996603, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.15771484, + "step": 5221, + "time_per_iteration": 2.477369546890259 + }, + { + "auxiliary_loss_clip": 0.06480553, + "auxiliary_loss_mlp": 0.0127616, + "balance_loss_clip": 0.0628504, + "balance_loss_mlp": 0.0125996, + "epoch": 0.31396362543213585, + "flos": 16951605229440.0, + "grad_norm": 1.9354629264259422, + "language_loss": 0.81906354, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.89663064, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16223145, + "step": 5222, + "time_per_iteration": 2.5057129859924316 + }, + { + "auxiliary_loss_clip": 0.06490297, + "auxiliary_loss_mlp": 0.01284294, + "balance_loss_clip": 0.06288279, + "balance_loss_mlp": 0.01266555, + "epoch": 0.31402374868480387, + "flos": 20162877431040.0, + "grad_norm": 1.9084075298763516, + "language_loss": 0.70490289, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.78264874, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17749023, + "step": 5223, + "time_per_iteration": 2.4747233390808105 + }, + { + "auxiliary_loss_clip": 0.06484593, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06287414, + "balance_loss_mlp": 0.01259718, + "epoch": 0.31408387193747184, + "flos": 21257363462400.0, + "grad_norm": 1.5262001080385015, + "language_loss": 0.80608702, + "learning_rate": 3.211476058893379e-06, + "loss": 0.88369542, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.1652832, + "step": 5224, + "time_per_iteration": 2.576864004135132 + }, + { + "auxiliary_loss_clip": 0.06497495, + "auxiliary_loss_mlp": 0.01279621, + "balance_loss_clip": 0.06291461, + "balance_loss_mlp": 0.01261632, + "epoch": 0.3141439951901398, + "flos": 27490617492480.0, + "grad_norm": 2.962077450034062, + "language_loss": 0.58624607, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66401726, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17993164, + "step": 5225, + "time_per_iteration": 2.558159828186035 + }, + { + "auxiliary_loss_clip": 0.06482717, + "auxiliary_loss_mlp": 0.0128044, + "balance_loss_clip": 0.06289019, + "balance_loss_mlp": 0.0126505, + "epoch": 0.31420411844280777, + "flos": 17857010522880.0, + "grad_norm": 1.7568792542410607, + "language_loss": 0.81975454, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.89738619, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.15380859, + "step": 5226, + "time_per_iteration": 2.5197925567626953 + }, + { + "auxiliary_loss_clip": 0.06493273, + "auxiliary_loss_mlp": 0.01283534, + "balance_loss_clip": 0.0629416, + "balance_loss_mlp": 0.01265998, + "epoch": 0.31426424169547573, + "flos": 21623491877760.0, + "grad_norm": 1.9094319640845634, + "language_loss": 0.74358761, + "learning_rate": 3.210546210126141e-06, + "loss": 0.8213557, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17529297, + "step": 5227, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.06493893, + "auxiliary_loss_mlp": 0.01287677, + "balance_loss_clip": 0.0629607, + "balance_loss_mlp": 0.01270392, + "epoch": 0.3143243649481437, + "flos": 30928677569280.0, + "grad_norm": 1.9492252245216757, + "language_loss": 0.68802202, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76583767, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.17297363, + "step": 5228, + "time_per_iteration": 2.724705934524536 + }, + { + "auxiliary_loss_clip": 0.06488988, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01265044, + "epoch": 0.31438448820081166, + "flos": 22828206355200.0, + "grad_norm": 1.7089427628420442, + "language_loss": 0.80276144, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88046199, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16015625, + "step": 5229, + "time_per_iteration": 4.091265678405762 + }, + { + "auxiliary_loss_clip": 0.06481495, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.01259428, + "epoch": 0.3144446114534796, + "flos": 23298399941760.0, + "grad_norm": 1.658320923858175, + "language_loss": 0.70112014, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7786932, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.16381836, + "step": 5230, + "time_per_iteration": 2.5652499198913574 + }, + { + "auxiliary_loss_clip": 0.06489812, + "auxiliary_loss_mlp": 0.01281571, + "balance_loss_clip": 0.06291179, + "balance_loss_mlp": 0.01264572, + "epoch": 0.3145047347061476, + "flos": 31363679640960.0, + "grad_norm": 2.930398163442548, + "language_loss": 0.80236816, + "learning_rate": 3.209305769168239e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.17004395, + "step": 5231, + "time_per_iteration": 5.461926698684692 + }, + { + "auxiliary_loss_clip": 0.06483024, + "auxiliary_loss_mlp": 0.01279077, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01262912, + "epoch": 0.31456485795881556, + "flos": 10894182992640.0, + "grad_norm": 3.377505802107346, + "language_loss": 0.85102671, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92864776, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16149902, + "step": 5232, + "time_per_iteration": 2.549555778503418 + }, + { + "auxiliary_loss_clip": 0.06479923, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01269779, + "epoch": 0.3146249812114835, + "flos": 17098157220480.0, + "grad_norm": 1.5771176865385883, + "language_loss": 0.80666757, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.88433212, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5233, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.06496342, + "auxiliary_loss_mlp": 0.01276742, + "balance_loss_clip": 0.06294576, + "balance_loss_mlp": 0.01260768, + "epoch": 0.3146851044641515, + "flos": 55303283352960.0, + "grad_norm": 1.6501859452394316, + "language_loss": 0.71124518, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.78897607, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15966797, + "step": 5234, + "time_per_iteration": 2.8301026821136475 + }, + { + "auxiliary_loss_clip": 0.06491733, + "auxiliary_loss_mlp": 0.01276589, + "balance_loss_clip": 0.06292239, + "balance_loss_mlp": 0.01259566, + "epoch": 0.31474522771681945, + "flos": 27023149163520.0, + "grad_norm": 1.9231261360365097, + "language_loss": 0.73437119, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8120544, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17004395, + "step": 5235, + "time_per_iteration": 2.543799638748169 + }, + { + "auxiliary_loss_clip": 0.0648193, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.0628682, + "balance_loss_mlp": 0.01259308, + "epoch": 0.3148053509694875, + "flos": 21258369711360.0, + "grad_norm": 1.9283939280374622, + "language_loss": 0.79554284, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.87311482, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.15942383, + "step": 5236, + "time_per_iteration": 2.5356431007385254 + }, + { + "auxiliary_loss_clip": 0.06493077, + "auxiliary_loss_mlp": 0.01277667, + "balance_loss_clip": 0.06288847, + "balance_loss_mlp": 0.01260942, + "epoch": 0.31486547422215544, + "flos": 31256721504000.0, + "grad_norm": 2.880510555000243, + "language_loss": 0.76337612, + "learning_rate": 3.207443732256881e-06, + "loss": 0.84108353, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16723633, + "step": 5237, + "time_per_iteration": 4.129598379135132 + }, + { + "auxiliary_loss_clip": 0.0648271, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06291585, + "balance_loss_mlp": 0.01262843, + "epoch": 0.3149255974748234, + "flos": 19834749642240.0, + "grad_norm": 1.6736027402410734, + "language_loss": 0.7951014, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.87270594, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.14916992, + "step": 5238, + "time_per_iteration": 2.504612445831299 + }, + { + "auxiliary_loss_clip": 0.06376656, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06282751, + "balance_loss_mlp": 0.01263604, + "epoch": 0.31498572072749137, + "flos": 67701867350400.0, + "grad_norm": 0.8276402478045692, + "language_loss": 0.68007928, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.75652325, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.04141235, + "step": 5239, + "time_per_iteration": 3.174287796020508 + }, + { + "auxiliary_loss_clip": 0.06498836, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 0.06292844, + "balance_loss_mlp": 0.01256376, + "epoch": 0.31504584398015933, + "flos": 19799432346240.0, + "grad_norm": 2.176171670908613, + "language_loss": 0.82951081, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.9072417, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17883301, + "step": 5240, + "time_per_iteration": 2.509793996810913 + }, + { + "auxiliary_loss_clip": 0.06485248, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06288239, + "balance_loss_mlp": 0.01262125, + "epoch": 0.3151059672328273, + "flos": 26622751628160.0, + "grad_norm": 1.8077188253124041, + "language_loss": 0.81193888, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88957721, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.16455078, + "step": 5241, + "time_per_iteration": 2.571192502975464 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01277268, + "balance_loss_clip": 0.06291743, + "balance_loss_mlp": 0.01260912, + "epoch": 0.31516609048549526, + "flos": 24210890904960.0, + "grad_norm": 1.4478120037649602, + "language_loss": 0.74484038, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82243454, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16357422, + "step": 5242, + "time_per_iteration": 2.526357650756836 + }, + { + "auxiliary_loss_clip": 0.06487267, + "auxiliary_loss_mlp": 0.01275494, + "balance_loss_clip": 0.06292535, + "balance_loss_mlp": 0.01259163, + "epoch": 0.31522621373816323, + "flos": 25965950999040.0, + "grad_norm": 1.6442244241642663, + "language_loss": 0.73668325, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81431091, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.16320801, + "step": 5243, + "time_per_iteration": 2.606276273727417 + }, + { + "auxiliary_loss_clip": 0.06485401, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.0628818, + "balance_loss_mlp": 0.0125713, + "epoch": 0.3152863369908312, + "flos": 21915379975680.0, + "grad_norm": 1.7357669101009914, + "language_loss": 0.64914608, + "learning_rate": 3.205269272758513e-06, + "loss": 0.72673857, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16711426, + "step": 5244, + "time_per_iteration": 2.5950305461883545 + }, + { + "auxiliary_loss_clip": 0.06492754, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 0.06292984, + "balance_loss_mlp": 0.01257743, + "epoch": 0.31534646024349916, + "flos": 16285203308160.0, + "grad_norm": 2.8540583379791005, + "language_loss": 0.91357732, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.99124765, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16540527, + "step": 5245, + "time_per_iteration": 2.510085105895996 + }, + { + "auxiliary_loss_clip": 0.06488977, + "auxiliary_loss_mlp": 0.01277309, + "balance_loss_clip": 0.06291293, + "balance_loss_mlp": 0.01260596, + "epoch": 0.3154065834961671, + "flos": 24724116362880.0, + "grad_norm": 1.9445780779956967, + "language_loss": 0.75699973, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.83466256, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.1673584, + "step": 5246, + "time_per_iteration": 2.543600559234619 + }, + { + "auxiliary_loss_clip": 0.06488622, + "auxiliary_loss_mlp": 0.01279725, + "balance_loss_clip": 0.06290317, + "balance_loss_mlp": 0.01262833, + "epoch": 0.3154667067488351, + "flos": 35379813836160.0, + "grad_norm": 1.6152414177037249, + "language_loss": 0.61608225, + "learning_rate": 3.204336675750321e-06, + "loss": 0.69376576, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16882324, + "step": 5247, + "time_per_iteration": 2.6849827766418457 + }, + { + "auxiliary_loss_clip": 0.06491058, + "auxiliary_loss_mlp": 0.01281873, + "balance_loss_clip": 0.06290263, + "balance_loss_mlp": 0.0126417, + "epoch": 0.31552683000150306, + "flos": 17462105429760.0, + "grad_norm": 2.6938697298202667, + "language_loss": 0.82848823, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.90621758, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.17687988, + "step": 5248, + "time_per_iteration": 2.4956586360931396 + }, + { + "auxiliary_loss_clip": 0.06488842, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06291078, + "balance_loss_mlp": 0.01264121, + "epoch": 0.3155869532541711, + "flos": 18411674624640.0, + "grad_norm": 4.654519722073602, + "language_loss": 0.85721719, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.93492711, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.18029785, + "step": 5249, + "time_per_iteration": 2.568054437637329 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 0.06290483, + "balance_loss_mlp": 0.01261198, + "epoch": 0.31564707650683904, + "flos": 21586162083840.0, + "grad_norm": 1.7795262086342007, + "language_loss": 0.86067384, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.93837023, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1887207, + "step": 5250, + "time_per_iteration": 2.508528709411621 + }, + { + "auxiliary_loss_clip": 0.06486481, + "auxiliary_loss_mlp": 0.01279989, + "balance_loss_clip": 0.06289366, + "balance_loss_mlp": 0.01262334, + "epoch": 0.315707199759507, + "flos": 21037032351360.0, + "grad_norm": 2.1261014211455063, + "language_loss": 0.6942147, + "learning_rate": 3.203092573767835e-06, + "loss": 0.77187943, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1763916, + "step": 5251, + "time_per_iteration": 2.526685953140259 + }, + { + "auxiliary_loss_clip": 0.06487083, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06288725, + "balance_loss_mlp": 0.01255586, + "epoch": 0.31576732301217497, + "flos": 26835326236800.0, + "grad_norm": 2.019211823887184, + "language_loss": 0.78895354, + "learning_rate": 3.202781434189246e-06, + "loss": 0.86655623, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17602539, + "step": 5252, + "time_per_iteration": 2.570160150527954 + }, + { + "auxiliary_loss_clip": 0.06486022, + "auxiliary_loss_mlp": 0.01277329, + "balance_loss_clip": 0.06289184, + "balance_loss_mlp": 0.01261664, + "epoch": 0.31582744626484294, + "flos": 22717810200960.0, + "grad_norm": 1.5436537660689573, + "language_loss": 0.74377203, + "learning_rate": 3.202470249001066e-06, + "loss": 0.82140553, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.15661621, + "step": 5253, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.06489179, + "auxiliary_loss_mlp": 0.01281773, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01264309, + "epoch": 0.3158875695175109, + "flos": 23958806296320.0, + "grad_norm": 1.6773864910066614, + "language_loss": 0.73971915, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 5254, + "time_per_iteration": 2.588543653488159 + }, + { + "auxiliary_loss_clip": 0.06491473, + "auxiliary_loss_mlp": 0.01275265, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01257408, + "epoch": 0.31594769277017887, + "flos": 13267036840320.0, + "grad_norm": 2.7381317978754933, + "language_loss": 0.78115344, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85882092, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17858887, + "step": 5255, + "time_per_iteration": 2.5159435272216797 + }, + { + "auxiliary_loss_clip": 0.0648552, + "auxiliary_loss_mlp": 0.01275031, + "balance_loss_clip": 0.06288838, + "balance_loss_mlp": 0.01255921, + "epoch": 0.31600781602284683, + "flos": 23375072027520.0, + "grad_norm": 2.9601180138118286, + "language_loss": 0.78838313, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.86598861, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.19104004, + "step": 5256, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.06480406, + "auxiliary_loss_mlp": 0.01272902, + "balance_loss_clip": 0.06291319, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3160679392755148, + "flos": 19834707715200.0, + "grad_norm": 1.443888473305352, + "language_loss": 0.71476674, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.79229981, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15533447, + "step": 5257, + "time_per_iteration": 2.515044927597046 + }, + { + "auxiliary_loss_clip": 0.06490695, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 0.06291541, + "balance_loss_mlp": 0.01257787, + "epoch": 0.31612806252818276, + "flos": 20199368684160.0, + "grad_norm": 3.1125237193001967, + "language_loss": 0.77181315, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.84947205, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17419434, + "step": 5258, + "time_per_iteration": 2.544926166534424 + }, + { + "auxiliary_loss_clip": 0.06484105, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 0.06286652, + "balance_loss_mlp": 0.01258624, + "epoch": 0.31618818578085073, + "flos": 24241596226560.0, + "grad_norm": 2.554871248122792, + "language_loss": 0.73012489, + "learning_rate": 3.200602180731467e-06, + "loss": 0.80772901, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.17675781, + "step": 5259, + "time_per_iteration": 2.5244109630584717 + }, + { + "auxiliary_loss_clip": 0.06490766, + "auxiliary_loss_mlp": 0.01272581, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01256106, + "epoch": 0.3162483090335187, + "flos": 25088735404800.0, + "grad_norm": 2.502439629336286, + "language_loss": 0.66774327, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74537671, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16455078, + "step": 5260, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.06481651, + "auxiliary_loss_mlp": 0.01272837, + "balance_loss_clip": 0.06285223, + "balance_loss_mlp": 0.01256386, + "epoch": 0.31630843228618666, + "flos": 26330653895040.0, + "grad_norm": 2.0766337978972023, + "language_loss": 0.72817439, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80571926, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16455078, + "step": 5261, + "time_per_iteration": 2.559112548828125 + }, + { + "auxiliary_loss_clip": 0.06366719, + "auxiliary_loss_mlp": 0.01254616, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01250792, + "epoch": 0.3163685555388547, + "flos": 66780053856000.0, + "grad_norm": 0.7132570662369885, + "language_loss": 0.50697625, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.58318961, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03817749, + "step": 5262, + "time_per_iteration": 3.1381468772888184 + }, + { + "auxiliary_loss_clip": 0.06487425, + "auxiliary_loss_mlp": 0.01273056, + "balance_loss_clip": 0.06289163, + "balance_loss_mlp": 0.01256224, + "epoch": 0.31642867879152264, + "flos": 26002987303680.0, + "grad_norm": 1.713052875923359, + "language_loss": 0.85966682, + "learning_rate": 3.19935589118856e-06, + "loss": 0.9372716, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.16833496, + "step": 5263, + "time_per_iteration": 2.5385844707489014 + }, + { + "auxiliary_loss_clip": 0.0647549, + "auxiliary_loss_mlp": 0.01273956, + "balance_loss_clip": 0.06283621, + "balance_loss_mlp": 0.01257695, + "epoch": 0.3164888020441906, + "flos": 25781943432960.0, + "grad_norm": 1.4697461293234868, + "language_loss": 0.82077682, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.89827132, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.16247559, + "step": 5264, + "time_per_iteration": 2.558708429336548 + }, + { + "auxiliary_loss_clip": 0.06488511, + "auxiliary_loss_mlp": 0.01271533, + "balance_loss_clip": 0.06288397, + "balance_loss_mlp": 0.01254117, + "epoch": 0.3165489252968586, + "flos": 19762437968640.0, + "grad_norm": 1.8601211050375244, + "language_loss": 0.80259931, + "learning_rate": 3.19873247349167e-06, + "loss": 0.88019973, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17419434, + "step": 5265, + "time_per_iteration": 2.492342948913574 + }, + { + "auxiliary_loss_clip": 0.06481829, + "auxiliary_loss_mlp": 0.01275233, + "balance_loss_clip": 0.06283312, + "balance_loss_mlp": 0.01257148, + "epoch": 0.31660904854952654, + "flos": 23190393628800.0, + "grad_norm": 2.032053662698869, + "language_loss": 0.75410831, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.83167893, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1809082, + "step": 5266, + "time_per_iteration": 2.5563931465148926 + }, + { + "auxiliary_loss_clip": 0.06488708, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 0.06287502, + "balance_loss_mlp": 0.01258308, + "epoch": 0.3166691718021945, + "flos": 20414081571840.0, + "grad_norm": 2.020882594632444, + "language_loss": 0.79489279, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.87254804, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.18518066, + "step": 5267, + "time_per_iteration": 2.509413242340088 + }, + { + "auxiliary_loss_clip": 0.06371635, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01251359, + "epoch": 0.31672929505486247, + "flos": 70165816185600.0, + "grad_norm": 1.145238273522293, + "language_loss": 0.57623893, + "learning_rate": 3.197797006055478e-06, + "loss": 0.65250397, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03518677, + "step": 5268, + "time_per_iteration": 4.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.06486145, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06287054, + "balance_loss_mlp": 0.01253884, + "epoch": 0.31678941830753043, + "flos": 14360977820160.0, + "grad_norm": 2.2953322915245784, + "language_loss": 0.73492396, + "learning_rate": 3.197485092719815e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.17651367, + "step": 5269, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.06490922, + "auxiliary_loss_mlp": 0.01279355, + "balance_loss_clip": 0.06295022, + "balance_loss_mlp": 0.01261652, + "epoch": 0.3168495415601984, + "flos": 22754385308160.0, + "grad_norm": 1.8930521062253438, + "language_loss": 0.80391312, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.88161588, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.17700195, + "step": 5270, + "time_per_iteration": 4.030852794647217 + }, + { + "auxiliary_loss_clip": 0.0648749, + "auxiliary_loss_mlp": 0.01275027, + "balance_loss_clip": 0.06288311, + "balance_loss_mlp": 0.01257742, + "epoch": 0.31690966481286637, + "flos": 20120558319360.0, + "grad_norm": 2.0275703030815744, + "language_loss": 0.79860884, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87623405, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17285156, + "step": 5271, + "time_per_iteration": 3.963491201400757 + }, + { + "auxiliary_loss_clip": 0.06485552, + "auxiliary_loss_mlp": 0.01274595, + "balance_loss_clip": 0.06286864, + "balance_loss_mlp": 0.01256344, + "epoch": 0.31696978806553433, + "flos": 21185345278080.0, + "grad_norm": 2.0532864997035616, + "language_loss": 0.7348994, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.18237305, + "step": 5272, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01275072, + "balance_loss_clip": 0.06294467, + "balance_loss_mlp": 0.01255629, + "epoch": 0.3170299113182023, + "flos": 43007030789760.0, + "grad_norm": 2.3636013379780083, + "language_loss": 0.69916022, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.19458008, + "step": 5273, + "time_per_iteration": 2.8313193321228027 + }, + { + "auxiliary_loss_clip": 0.0648469, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06288255, + "balance_loss_mlp": 0.01255954, + "epoch": 0.31709003457087026, + "flos": 24466707020160.0, + "grad_norm": 3.373298123766896, + "language_loss": 0.68486917, + "learning_rate": 3.195924845146795e-06, + "loss": 0.76244098, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 5274, + "time_per_iteration": 2.5647053718566895 + }, + { + "auxiliary_loss_clip": 0.06486842, + "auxiliary_loss_mlp": 0.01272159, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.01256114, + "epoch": 0.3171501578235382, + "flos": 24142394592000.0, + "grad_norm": 1.437173314012816, + "language_loss": 0.8105545, + "learning_rate": 3.195612659536081e-06, + "loss": 0.88814449, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.16052246, + "step": 5275, + "time_per_iteration": 2.545689821243286 + }, + { + "auxiliary_loss_clip": 0.06496362, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.0629561, + "balance_loss_mlp": 0.01254296, + "epoch": 0.31721028107620625, + "flos": 18885641644800.0, + "grad_norm": 1.7797970991839078, + "language_loss": 0.73459136, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81228, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18212891, + "step": 5276, + "time_per_iteration": 3.978994131088257 + }, + { + "auxiliary_loss_clip": 0.06480486, + "auxiliary_loss_mlp": 0.01276369, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01259811, + "epoch": 0.3172704043288742, + "flos": 23154405500160.0, + "grad_norm": 1.4192945576637652, + "language_loss": 0.78409082, + "learning_rate": 3.194988152313236e-06, + "loss": 0.86165935, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.16552734, + "step": 5277, + "time_per_iteration": 2.6181840896606445 + }, + { + "auxiliary_loss_clip": 0.06493685, + "auxiliary_loss_mlp": 0.01273951, + "balance_loss_clip": 0.06294833, + "balance_loss_mlp": 0.01256653, + "epoch": 0.3173305275815422, + "flos": 17864347754880.0, + "grad_norm": 1.9934204528772321, + "language_loss": 0.79709554, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87477195, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17297363, + "step": 5278, + "time_per_iteration": 2.4955894947052 + }, + { + "auxiliary_loss_clip": 0.06380783, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01260886, + "epoch": 0.31739065083421014, + "flos": 59988083529600.0, + "grad_norm": 0.841903886868049, + "language_loss": 0.62797457, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.7044335, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.04226685, + "step": 5279, + "time_per_iteration": 2.920987367630005 + }, + { + "auxiliary_loss_clip": 0.06489395, + "auxiliary_loss_mlp": 0.01285376, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01265265, + "epoch": 0.3174507740868781, + "flos": 23807013425280.0, + "grad_norm": 2.0709232065681475, + "language_loss": 0.81487882, + "learning_rate": 3.194051051653053e-06, + "loss": 0.89262652, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.2010498, + "step": 5280, + "time_per_iteration": 2.537612199783325 + }, + { + "auxiliary_loss_clip": 0.06483282, + "auxiliary_loss_mlp": 0.01281645, + "balance_loss_clip": 0.06291374, + "balance_loss_mlp": 0.01264276, + "epoch": 0.31751089733954607, + "flos": 27646728848640.0, + "grad_norm": 1.437826441265799, + "language_loss": 0.78464299, + "learning_rate": 3.19373859419346e-06, + "loss": 0.86229229, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.17358398, + "step": 5281, + "time_per_iteration": 2.6482186317443848 + }, + { + "auxiliary_loss_clip": 0.06485789, + "auxiliary_loss_mlp": 0.01283007, + "balance_loss_clip": 0.06290175, + "balance_loss_mlp": 0.01265424, + "epoch": 0.31757102059221404, + "flos": 23776098468480.0, + "grad_norm": 1.5338111796323235, + "language_loss": 0.78882301, + "learning_rate": 3.193426091467179e-06, + "loss": 0.86651099, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17590332, + "step": 5282, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.06494205, + "auxiliary_loss_mlp": 0.01276135, + "balance_loss_clip": 0.0629286, + "balance_loss_mlp": 0.01258373, + "epoch": 0.317631143844882, + "flos": 25271485159680.0, + "grad_norm": 2.0006947857157753, + "language_loss": 0.67952389, + "learning_rate": 3.193113543486061e-06, + "loss": 0.7572273, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1776123, + "step": 5283, + "time_per_iteration": 2.565925359725952 + }, + { + "auxiliary_loss_clip": 0.06373101, + "auxiliary_loss_mlp": 0.01271528, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01267352, + "epoch": 0.31769126709754997, + "flos": 55841832743040.0, + "grad_norm": 0.7241871595116953, + "language_loss": 0.52631503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.60276127, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04177856, + "step": 5284, + "time_per_iteration": 3.1037213802337646 + }, + { + "auxiliary_loss_clip": 0.0649649, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 0.06291351, + "balance_loss_mlp": 0.01257225, + "epoch": 0.31775139035021793, + "flos": 16696124530560.0, + "grad_norm": 2.2460762000689294, + "language_loss": 0.70842284, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.78613091, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.17102051, + "step": 5285, + "time_per_iteration": 2.5407655239105225 + }, + { + "auxiliary_loss_clip": 0.06366412, + "auxiliary_loss_mlp": 0.01262401, + "balance_loss_clip": 0.06274283, + "balance_loss_mlp": 0.01258384, + "epoch": 0.3178115136028859, + "flos": 64246141261440.0, + "grad_norm": 1.0137073922687154, + "language_loss": 0.60545647, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.68174458, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04016113, + "step": 5286, + "time_per_iteration": 3.1833202838897705 + }, + { + "auxiliary_loss_clip": 0.06498363, + "auxiliary_loss_mlp": 0.01284909, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01267051, + "epoch": 0.31787163685555386, + "flos": 18703395014400.0, + "grad_norm": 1.7319286904547555, + "language_loss": 0.72404122, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80187392, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17871094, + "step": 5287, + "time_per_iteration": 2.50571608543396 + }, + { + "auxiliary_loss_clip": 0.06495041, + "auxiliary_loss_mlp": 0.01276683, + "balance_loss_clip": 0.06292516, + "balance_loss_mlp": 0.012578, + "epoch": 0.31793176010822183, + "flos": 21331184509440.0, + "grad_norm": 1.978321388726588, + "language_loss": 0.76231503, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84003228, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.18884277, + "step": 5288, + "time_per_iteration": 2.5568416118621826 + }, + { + "auxiliary_loss_clip": 0.06485806, + "auxiliary_loss_mlp": 0.01283528, + "balance_loss_clip": 0.06293501, + "balance_loss_mlp": 0.01267816, + "epoch": 0.31799188336088985, + "flos": 20964846458880.0, + "grad_norm": 1.7076221862053031, + "language_loss": 0.88265222, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.96034551, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.15710449, + "step": 5289, + "time_per_iteration": 2.5359349250793457 + }, + { + "auxiliary_loss_clip": 0.06488061, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 0.06295781, + "balance_loss_mlp": 0.01269724, + "epoch": 0.3180520066135578, + "flos": 22498485338880.0, + "grad_norm": 1.4069348748047803, + "language_loss": 0.68210149, + "learning_rate": 3.190924441478572e-06, + "loss": 0.75984859, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16906738, + "step": 5290, + "time_per_iteration": 2.5393311977386475 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.0128386, + "balance_loss_clip": 0.06290419, + "balance_loss_mlp": 0.01265788, + "epoch": 0.3181121298662258, + "flos": 27242725587840.0, + "grad_norm": 3.4346413288346, + "language_loss": 0.79944348, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.87722754, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18066406, + "step": 5291, + "time_per_iteration": 2.564091444015503 + }, + { + "auxiliary_loss_clip": 0.06485635, + "auxiliary_loss_mlp": 0.01278435, + "balance_loss_clip": 0.06287642, + "balance_loss_mlp": 0.01259361, + "epoch": 0.31817225311889374, + "flos": 23185991289600.0, + "grad_norm": 2.0451390273410004, + "language_loss": 0.79931051, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.87695122, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.19067383, + "step": 5292, + "time_per_iteration": 2.743156671524048 + }, + { + "auxiliary_loss_clip": 0.06476898, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06287324, + "balance_loss_mlp": 0.01258044, + "epoch": 0.3182323763715617, + "flos": 23265598268160.0, + "grad_norm": 1.819133879513315, + "language_loss": 0.75602406, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8335436, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17004395, + "step": 5293, + "time_per_iteration": 2.523386001586914 + }, + { + "auxiliary_loss_clip": 0.06482453, + "auxiliary_loss_mlp": 0.01276012, + "balance_loss_clip": 0.06290737, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3182924996242297, + "flos": 29023292050560.0, + "grad_norm": 2.0524562129349526, + "language_loss": 0.75145984, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82904446, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15808105, + "step": 5294, + "time_per_iteration": 2.607849597930908 + }, + { + "auxiliary_loss_clip": 0.06489888, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259201, + "epoch": 0.31835262287689764, + "flos": 20455478288640.0, + "grad_norm": 2.029675905915872, + "language_loss": 0.76497674, + "learning_rate": 3.189359442151152e-06, + "loss": 0.84265351, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.18591309, + "step": 5295, + "time_per_iteration": 2.4980461597442627 + }, + { + "auxiliary_loss_clip": 0.06494178, + "auxiliary_loss_mlp": 0.01278535, + "balance_loss_clip": 0.06293284, + "balance_loss_mlp": 0.01261166, + "epoch": 0.3184127461295656, + "flos": 25126568323200.0, + "grad_norm": 2.03182891885516, + "language_loss": 0.70142519, + "learning_rate": 3.189046306936296e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17358398, + "step": 5296, + "time_per_iteration": 2.610671043395996 + }, + { + "auxiliary_loss_clip": 0.06483515, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 0.0628704, + "balance_loss_mlp": 0.01258371, + "epoch": 0.31847286938223357, + "flos": 25557377690880.0, + "grad_norm": 1.5251920176335134, + "language_loss": 0.77957898, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85716307, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 5297, + "time_per_iteration": 2.539649486541748 + }, + { + "auxiliary_loss_clip": 0.06479752, + "auxiliary_loss_mlp": 0.01272766, + "balance_loss_clip": 0.06283344, + "balance_loss_mlp": 0.01255516, + "epoch": 0.31853299263490154, + "flos": 27789926676480.0, + "grad_norm": 1.8177911904554251, + "language_loss": 0.80074358, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.87826872, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17248535, + "step": 5298, + "time_per_iteration": 2.6127634048461914 + }, + { + "auxiliary_loss_clip": 0.06487016, + "auxiliary_loss_mlp": 0.0127216, + "balance_loss_clip": 0.06284906, + "balance_loss_mlp": 0.01254815, + "epoch": 0.3185931158875695, + "flos": 22712653175040.0, + "grad_norm": 1.6158824069779534, + "language_loss": 0.74615932, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.82375109, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.17346191, + "step": 5299, + "time_per_iteration": 2.570178508758545 + }, + { + "auxiliary_loss_clip": 0.06491919, + "auxiliary_loss_mlp": 0.01275355, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.01258249, + "epoch": 0.31865323914023747, + "flos": 24578402912640.0, + "grad_norm": 1.9760141697724851, + "language_loss": 0.78568625, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86335897, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17102051, + "step": 5300, + "time_per_iteration": 2.7260777950286865 + }, + { + "auxiliary_loss_clip": 0.06483838, + "auxiliary_loss_mlp": 0.01272854, + "balance_loss_clip": 0.06287212, + "balance_loss_mlp": 0.01254495, + "epoch": 0.31871336239290543, + "flos": 18192391689600.0, + "grad_norm": 2.1538981188283195, + "language_loss": 0.84250915, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92007607, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.18347168, + "step": 5301, + "time_per_iteration": 2.485152244567871 + }, + { + "auxiliary_loss_clip": 0.06484723, + "auxiliary_loss_mlp": 0.01274861, + "balance_loss_clip": 0.06291914, + "balance_loss_mlp": 0.01256777, + "epoch": 0.31877348564557345, + "flos": 21831789928320.0, + "grad_norm": 2.0482094969798696, + "language_loss": 0.7812382, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85883403, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.18066406, + "step": 5302, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01275814, + "balance_loss_clip": 0.06290714, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188336088982414, + "flos": 22021331863680.0, + "grad_norm": 1.6144767194600491, + "language_loss": 0.79736584, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8749572, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17651367, + "step": 5303, + "time_per_iteration": 2.5235095024108887 + }, + { + "auxiliary_loss_clip": 0.06497993, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290174, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188937321509094, + "flos": 20054116431360.0, + "grad_norm": 1.7320090718032515, + "language_loss": 0.73529422, + "learning_rate": 3.186539603020047e-06, + "loss": 0.81304312, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18737793, + "step": 5304, + "time_per_iteration": 2.5141329765319824 + }, + { + "auxiliary_loss_clip": 0.06481734, + "auxiliary_loss_mlp": 0.01278154, + "balance_loss_clip": 0.06290816, + "balance_loss_mlp": 0.01260928, + "epoch": 0.31895385540357735, + "flos": 25855135574400.0, + "grad_norm": 1.8091269764667626, + "language_loss": 0.72548914, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80308801, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.17236328, + "step": 5305, + "time_per_iteration": 2.5648975372314453 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06292576, + "balance_loss_mlp": 0.01254603, + "epoch": 0.3190139786562453, + "flos": 23484545786880.0, + "grad_norm": 2.116447005947582, + "language_loss": 0.64815247, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.72573221, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.16113281, + "step": 5306, + "time_per_iteration": 2.5745668411254883 + }, + { + "auxiliary_loss_clip": 0.06483987, + "auxiliary_loss_mlp": 0.01282676, + "balance_loss_clip": 0.0628574, + "balance_loss_mlp": 0.01264413, + "epoch": 0.3190741019089133, + "flos": 29103150591360.0, + "grad_norm": 2.0084949709877726, + "language_loss": 0.79260421, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.87027091, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18273926, + "step": 5307, + "time_per_iteration": 2.557509183883667 + }, + { + "auxiliary_loss_clip": 0.06481419, + "auxiliary_loss_mlp": 0.01278653, + "balance_loss_clip": 0.06289747, + "balance_loss_mlp": 0.01260736, + "epoch": 0.31913422516158124, + "flos": 17135361233280.0, + "grad_norm": 3.9021838038471097, + "language_loss": 0.78660965, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86421037, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17907715, + "step": 5308, + "time_per_iteration": 3.906280994415283 + }, + { + "auxiliary_loss_clip": 0.06493698, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06287338, + "balance_loss_mlp": 0.01259408, + "epoch": 0.3191943484142492, + "flos": 16075228176000.0, + "grad_norm": 3.1945469837170215, + "language_loss": 0.74758154, + "learning_rate": 3.184971450390961e-06, + "loss": 0.82530349, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.19091797, + "step": 5309, + "time_per_iteration": 2.4796438217163086 + }, + { + "auxiliary_loss_clip": 0.06480245, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06283399, + "balance_loss_mlp": 0.01257954, + "epoch": 0.3192544716669172, + "flos": 22972787775360.0, + "grad_norm": 1.6995242114780418, + "language_loss": 0.83242565, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90997577, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.16809082, + "step": 5310, + "time_per_iteration": 5.470219373703003 + }, + { + "auxiliary_loss_clip": 0.06475915, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06281388, + "balance_loss_mlp": 0.01255868, + "epoch": 0.31931459491958514, + "flos": 26877645348480.0, + "grad_norm": 1.407923936832892, + "language_loss": 0.78906345, + "learning_rate": 3.184343874716412e-06, + "loss": 0.86654651, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.1652832, + "step": 5311, + "time_per_iteration": 2.546112298965454 + }, + { + "auxiliary_loss_clip": 0.06477334, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.06282097, + "balance_loss_mlp": 0.01255254, + "epoch": 0.3193747181722531, + "flos": 21843194083200.0, + "grad_norm": 1.8192899238067177, + "language_loss": 0.84889889, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92639416, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16943359, + "step": 5312, + "time_per_iteration": 2.5534987449645996 + }, + { + "auxiliary_loss_clip": 0.06489489, + "auxiliary_loss_mlp": 0.01274677, + "balance_loss_clip": 0.06284228, + "balance_loss_mlp": 0.012567, + "epoch": 0.31943484142492107, + "flos": 18329593950720.0, + "grad_norm": 3.1557419136729536, + "language_loss": 0.79280984, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.87045145, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17980957, + "step": 5313, + "time_per_iteration": 2.47098445892334 + }, + { + "auxiliary_loss_clip": 0.06477478, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 0.06281047, + "balance_loss_mlp": 0.01256618, + "epoch": 0.31949496467758903, + "flos": 21622150212480.0, + "grad_norm": 2.7721598847405584, + "language_loss": 0.86245549, + "learning_rate": 3.183402174406057e-06, + "loss": 0.93997484, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17834473, + "step": 5314, + "time_per_iteration": 2.531196117401123 + }, + { + "auxiliary_loss_clip": 0.0647811, + "auxiliary_loss_mlp": 0.0127239, + "balance_loss_clip": 0.06281686, + "balance_loss_mlp": 0.01255188, + "epoch": 0.31955508793025705, + "flos": 21766312362240.0, + "grad_norm": 1.712027342879292, + "language_loss": 0.80238831, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8798933, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17199707, + "step": 5315, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.06485026, + "auxiliary_loss_mlp": 0.01283831, + "balance_loss_clip": 0.06286455, + "balance_loss_mlp": 0.01265854, + "epoch": 0.319615211182925, + "flos": 17169881915520.0, + "grad_norm": 2.687676993792702, + "language_loss": 0.67569852, + "learning_rate": 3.18277414980567e-06, + "loss": 0.75338709, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17980957, + "step": 5316, + "time_per_iteration": 3.943110942840576 + }, + { + "auxiliary_loss_clip": 0.0648303, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01255566, + "epoch": 0.319675334435593, + "flos": 28120653941760.0, + "grad_norm": 1.5692381446514811, + "language_loss": 0.69637752, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.77392983, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16650391, + "step": 5317, + "time_per_iteration": 2.642251491546631 + }, + { + "auxiliary_loss_clip": 0.06377298, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06285109, + "balance_loss_mlp": 0.01288716, + "epoch": 0.31973545768826095, + "flos": 69524235072000.0, + "grad_norm": 0.7198160842036254, + "language_loss": 0.5281924, + "learning_rate": 3.182145945801628e-06, + "loss": 0.60489094, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.03839111, + "step": 5318, + "time_per_iteration": 3.2718679904937744 + }, + { + "auxiliary_loss_clip": 0.06479475, + "auxiliary_loss_mlp": 0.01271921, + "balance_loss_clip": 0.0628712, + "balance_loss_mlp": 0.01254969, + "epoch": 0.3197955809409289, + "flos": 13704344899200.0, + "grad_norm": 1.5995609143402318, + "language_loss": 0.84504628, + "learning_rate": 3.181831776553012e-06, + "loss": 0.92256021, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.16955566, + "step": 5319, + "time_per_iteration": 2.5372629165649414 + }, + { + "auxiliary_loss_clip": 0.06480815, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 0.06286162, + "balance_loss_mlp": 0.01261199, + "epoch": 0.3198557041935969, + "flos": 33226368704640.0, + "grad_norm": 1.6136244255626262, + "language_loss": 0.64208525, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71968812, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.18273926, + "step": 5320, + "time_per_iteration": 2.675477981567383 + }, + { + "auxiliary_loss_clip": 0.0648189, + "auxiliary_loss_mlp": 0.01271878, + "balance_loss_clip": 0.06280586, + "balance_loss_mlp": 0.01254402, + "epoch": 0.31991582744626484, + "flos": 23738726747520.0, + "grad_norm": 1.9696222638037655, + "language_loss": 0.71059012, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.78812778, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.17480469, + "step": 5321, + "time_per_iteration": 2.6383230686187744 + }, + { + "auxiliary_loss_clip": 0.06491005, + "auxiliary_loss_mlp": 0.01288903, + "balance_loss_clip": 0.06286187, + "balance_loss_mlp": 0.01270318, + "epoch": 0.3199759506989328, + "flos": 18556633388160.0, + "grad_norm": 2.30981924299517, + "language_loss": 0.86988461, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94768369, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.18591309, + "step": 5322, + "time_per_iteration": 2.4862442016601562 + }, + { + "auxiliary_loss_clip": 0.0648296, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06285054, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3200360739516008, + "flos": 22425418978560.0, + "grad_norm": 1.6041292280722281, + "language_loss": 0.83380175, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.91136217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16369629, + "step": 5323, + "time_per_iteration": 2.5262420177459717 + }, + { + "auxiliary_loss_clip": 0.06476378, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01258529, + "epoch": 0.32009619720426874, + "flos": 20601569082240.0, + "grad_norm": 1.775654796490425, + "language_loss": 0.78471839, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.86226195, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.19433594, + "step": 5324, + "time_per_iteration": 2.492380380630493 + }, + { + "auxiliary_loss_clip": 0.06478705, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06283212, + "balance_loss_mlp": 0.01256042, + "epoch": 0.3201563204569367, + "flos": 18153049397760.0, + "grad_norm": 1.7224742254360714, + "language_loss": 0.80742848, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.88495719, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.18139648, + "step": 5325, + "time_per_iteration": 2.4962642192840576 + }, + { + "auxiliary_loss_clip": 0.06478769, + "auxiliary_loss_mlp": 0.01277308, + "balance_loss_clip": 0.06280222, + "balance_loss_mlp": 0.01259701, + "epoch": 0.32021644370960467, + "flos": 31691975137920.0, + "grad_norm": 1.8321318923341703, + "language_loss": 0.75898254, + "learning_rate": 3.179631337655037e-06, + "loss": 0.83654332, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17602539, + "step": 5326, + "time_per_iteration": 2.5752692222595215 + }, + { + "auxiliary_loss_clip": 0.06472234, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.01260918, + "epoch": 0.32027656696227264, + "flos": 26872488322560.0, + "grad_norm": 1.458996564995821, + "language_loss": 0.81400204, + "learning_rate": 3.179316810218701e-06, + "loss": 0.89150548, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.171875, + "step": 5327, + "time_per_iteration": 2.5635383129119873 + }, + { + "auxiliary_loss_clip": 0.06486546, + "auxiliary_loss_mlp": 0.01273421, + "balance_loss_clip": 0.062847, + "balance_loss_mlp": 0.01256207, + "epoch": 0.32033669021494066, + "flos": 24176705639040.0, + "grad_norm": 1.3787000535244864, + "language_loss": 0.77910948, + "learning_rate": 3.179002238062554e-06, + "loss": 0.85670912, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17211914, + "step": 5328, + "time_per_iteration": 2.514646053314209 + }, + { + "auxiliary_loss_clip": 0.06484267, + "auxiliary_loss_mlp": 0.01278516, + "balance_loss_clip": 0.06287045, + "balance_loss_mlp": 0.0125992, + "epoch": 0.3203968134676086, + "flos": 24467419779840.0, + "grad_norm": 1.5501370939230803, + "language_loss": 0.74267161, + "learning_rate": 3.178687621198524e-06, + "loss": 0.82029939, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.18591309, + "step": 5329, + "time_per_iteration": 2.5436654090881348 + }, + { + "auxiliary_loss_clip": 0.06471072, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06282842, + "balance_loss_mlp": 0.01262434, + "epoch": 0.3204569367202766, + "flos": 18010606256640.0, + "grad_norm": 1.7046636031855489, + "language_loss": 0.71222955, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.78972626, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16162109, + "step": 5330, + "time_per_iteration": 2.479647397994995 + }, + { + "auxiliary_loss_clip": 0.06485157, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.0125791, + "epoch": 0.32051705997294455, + "flos": 30597237544320.0, + "grad_norm": 1.705143811074938, + "language_loss": 0.80496192, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.88258511, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.19250488, + "step": 5331, + "time_per_iteration": 2.5741958618164062 + }, + { + "auxiliary_loss_clip": 0.06384323, + "auxiliary_loss_mlp": 0.0125803, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.01253741, + "epoch": 0.3205771832256125, + "flos": 68436723657600.0, + "grad_norm": 0.7949538218297083, + "language_loss": 0.5776577, + "learning_rate": 3.177743502478447e-06, + "loss": 0.65408123, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04293823, + "step": 5332, + "time_per_iteration": 3.084747314453125 + }, + { + "auxiliary_loss_clip": 0.06488422, + "auxiliary_loss_mlp": 0.01272523, + "balance_loss_clip": 0.06286052, + "balance_loss_mlp": 0.01255154, + "epoch": 0.3206373064782805, + "flos": 30451524094080.0, + "grad_norm": 1.5377704746044631, + "language_loss": 0.73702615, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81463563, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17358398, + "step": 5333, + "time_per_iteration": 2.6130683422088623 + }, + { + "auxiliary_loss_clip": 0.06480561, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 0.06284031, + "balance_loss_mlp": 0.01256246, + "epoch": 0.32069742973094845, + "flos": 22061051498880.0, + "grad_norm": 1.6882238799892797, + "language_loss": 0.70957875, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.78712052, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17382812, + "step": 5334, + "time_per_iteration": 2.5501654148101807 + }, + { + "auxiliary_loss_clip": 0.06476508, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06281763, + "balance_loss_mlp": 0.01257947, + "epoch": 0.3207575529836164, + "flos": 22060464520320.0, + "grad_norm": 1.723674002448169, + "language_loss": 0.77349097, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.85101908, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.18347168, + "step": 5335, + "time_per_iteration": 2.5194711685180664 + }, + { + "auxiliary_loss_clip": 0.06479798, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06285612, + "balance_loss_mlp": 0.0125889, + "epoch": 0.3208176762362844, + "flos": 34065961015680.0, + "grad_norm": 1.52521333905674, + "language_loss": 0.68891776, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.76647282, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.16809082, + "step": 5336, + "time_per_iteration": 2.6550848484039307 + }, + { + "auxiliary_loss_clip": 0.06481949, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06286713, + "balance_loss_mlp": 0.01268343, + "epoch": 0.32087779948895234, + "flos": 21805151529600.0, + "grad_norm": 1.6666772631518172, + "language_loss": 0.79367507, + "learning_rate": 3.176169078234487e-06, + "loss": 0.87135273, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17480469, + "step": 5337, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.06473362, + "auxiliary_loss_mlp": 0.01277197, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01260865, + "epoch": 0.3209379227416203, + "flos": 21440532487680.0, + "grad_norm": 1.6244255970978692, + "language_loss": 0.75145769, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.82896328, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16320801, + "step": 5338, + "time_per_iteration": 2.526841402053833 + }, + { + "auxiliary_loss_clip": 0.06482957, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 0.06285477, + "balance_loss_mlp": 0.01260216, + "epoch": 0.3209980459942883, + "flos": 25856267604480.0, + "grad_norm": 1.7965894601451369, + "language_loss": 0.63241929, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.7100262, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17504883, + "step": 5339, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.06482022, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06283947, + "balance_loss_mlp": 0.01255151, + "epoch": 0.32105816924695624, + "flos": 19105218069120.0, + "grad_norm": 2.418138513897033, + "language_loss": 0.81912339, + "learning_rate": 3.175223888387192e-06, + "loss": 0.89666009, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16491699, + "step": 5340, + "time_per_iteration": 2.5764145851135254 + }, + { + "auxiliary_loss_clip": 0.06475554, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06281976, + "balance_loss_mlp": 0.01254774, + "epoch": 0.3211182924996242, + "flos": 16587531239040.0, + "grad_norm": 1.7719401771551753, + "language_loss": 0.76604897, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.84352368, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.17150879, + "step": 5341, + "time_per_iteration": 2.505668878555298 + }, + { + "auxiliary_loss_clip": 0.06474154, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.0628191, + "balance_loss_mlp": 0.01255969, + "epoch": 0.3211784157522922, + "flos": 22678425982080.0, + "grad_norm": 1.4764530250267398, + "language_loss": 0.79422891, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.87169659, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16662598, + "step": 5342, + "time_per_iteration": 2.5391595363616943 + }, + { + "auxiliary_loss_clip": 0.06483465, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01260141, + "epoch": 0.3212385390049602, + "flos": 20565119756160.0, + "grad_norm": 2.45787142613039, + "language_loss": 0.75074786, + "learning_rate": 3.174278297458438e-06, + "loss": 0.82835722, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17321777, + "step": 5343, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.06479985, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01255043, + "epoch": 0.32129866225762815, + "flos": 24798188972160.0, + "grad_norm": 1.5494427093400844, + "language_loss": 0.82596725, + "learning_rate": 3.173963011408748e-06, + "loss": 0.9034878, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5344, + "time_per_iteration": 2.5672519207000732 + }, + { + "auxiliary_loss_clip": 0.06478736, + "auxiliary_loss_mlp": 0.01273821, + "balance_loss_clip": 0.06282513, + "balance_loss_mlp": 0.0125731, + "epoch": 0.3213587855102961, + "flos": 18372374259840.0, + "grad_norm": 1.9111940233558649, + "language_loss": 0.80321491, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8807404, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.16516113, + "step": 5345, + "time_per_iteration": 2.479442834854126 + }, + { + "auxiliary_loss_clip": 0.06478975, + "auxiliary_loss_mlp": 0.01271046, + "balance_loss_clip": 0.06283471, + "balance_loss_mlp": 0.01254321, + "epoch": 0.3214189087629641, + "flos": 27023274944640.0, + "grad_norm": 1.7019036305222461, + "language_loss": 0.83604348, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.9135437, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.16723633, + "step": 5346, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.0648382, + "auxiliary_loss_mlp": 0.01272196, + "balance_loss_clip": 0.06285056, + "balance_loss_mlp": 0.0125528, + "epoch": 0.32147903201563205, + "flos": 23154866697600.0, + "grad_norm": 1.4545038816344273, + "language_loss": 0.81656283, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.89412296, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16918945, + "step": 5347, + "time_per_iteration": 2.5096054077148438 + }, + { + "auxiliary_loss_clip": 0.06480029, + "auxiliary_loss_mlp": 0.01274054, + "balance_loss_clip": 0.06286772, + "balance_loss_mlp": 0.01256673, + "epoch": 0.3215391552683, + "flos": 16586231500800.0, + "grad_norm": 2.536962878441814, + "language_loss": 0.80386555, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.88140643, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.1739502, + "step": 5348, + "time_per_iteration": 3.9639015197753906 + }, + { + "auxiliary_loss_clip": 0.06474565, + "auxiliary_loss_mlp": 0.01276371, + "balance_loss_clip": 0.06280862, + "balance_loss_mlp": 0.01259431, + "epoch": 0.321599278520968, + "flos": 17827604939520.0, + "grad_norm": 2.026618804026968, + "language_loss": 0.85758352, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93509287, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.16943359, + "step": 5349, + "time_per_iteration": 3.8848202228546143 + }, + { + "auxiliary_loss_clip": 0.06481349, + "auxiliary_loss_mlp": 0.01274724, + "balance_loss_clip": 0.06286412, + "balance_loss_mlp": 0.01257022, + "epoch": 0.32165940177363594, + "flos": 16257097463040.0, + "grad_norm": 1.7607877661370477, + "language_loss": 0.8123306, + "learning_rate": 3.172070360676475e-06, + "loss": 0.88989133, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17700195, + "step": 5350, + "time_per_iteration": 3.9589500427246094 + }, + { + "auxiliary_loss_clip": 0.06471309, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06282239, + "balance_loss_mlp": 0.01255055, + "epoch": 0.3217195250263039, + "flos": 27607302702720.0, + "grad_norm": 1.8529018663543275, + "language_loss": 0.80116528, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.87858802, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.562232732772827 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06284767, + "balance_loss_mlp": 0.01256668, + "epoch": 0.3217796482789719, + "flos": 21477023740800.0, + "grad_norm": 2.0321110975992562, + "language_loss": 0.7641573, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84167361, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.18249512, + "step": 5352, + "time_per_iteration": 2.5320773124694824 + }, + { + "auxiliary_loss_clip": 0.0648407, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06291708, + "balance_loss_mlp": 0.0126133, + "epoch": 0.32183977153163984, + "flos": 21222046166400.0, + "grad_norm": 1.9188598206640457, + "language_loss": 0.82159722, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.89922154, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.17028809, + "step": 5353, + "time_per_iteration": 2.5061802864074707 + }, + { + "auxiliary_loss_clip": 0.06480308, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125837, + "epoch": 0.3218998947843078, + "flos": 24615103800960.0, + "grad_norm": 1.8505936463490174, + "language_loss": 0.74125177, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.81881344, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.17480469, + "step": 5354, + "time_per_iteration": 2.5725185871124268 + }, + { + "auxiliary_loss_clip": 0.06479903, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259951, + "epoch": 0.3219600180369758, + "flos": 22276686781440.0, + "grad_norm": 2.612968571970558, + "language_loss": 0.83769405, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.91526389, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5355, + "time_per_iteration": 3.985846757888794 + }, + { + "auxiliary_loss_clip": 0.0647967, + "auxiliary_loss_mlp": 0.01272253, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01255528, + "epoch": 0.3220201412896438, + "flos": 14944376672640.0, + "grad_norm": 1.8959584470465125, + "language_loss": 0.71344721, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.79096651, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.1673584, + "step": 5356, + "time_per_iteration": 2.5644400119781494 + }, + { + "auxiliary_loss_clip": 0.06494904, + "auxiliary_loss_mlp": 0.01280986, + "balance_loss_clip": 0.0629259, + "balance_loss_mlp": 0.01263367, + "epoch": 0.32208026454231176, + "flos": 22672807758720.0, + "grad_norm": 2.5335154176231525, + "language_loss": 0.67879629, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.7565552, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.546654224395752 + }, + { + "auxiliary_loss_clip": 0.06384487, + "auxiliary_loss_mlp": 0.01261366, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257649, + "epoch": 0.3221403877949797, + "flos": 64626273308160.0, + "grad_norm": 0.6824166316331671, + "language_loss": 0.58314437, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.65960288, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.03707886, + "step": 5358, + "time_per_iteration": 3.2290756702423096 + }, + { + "auxiliary_loss_clip": 0.06481851, + "auxiliary_loss_mlp": 0.01282518, + "balance_loss_clip": 0.06287378, + "balance_loss_mlp": 0.01264839, + "epoch": 0.3222005110476477, + "flos": 20163212847360.0, + "grad_norm": 1.9186908993809755, + "language_loss": 0.84190667, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.91955042, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.17675781, + "step": 5359, + "time_per_iteration": 2.531033754348755 + }, + { + "auxiliary_loss_clip": 0.06480163, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 0.06287846, + "balance_loss_mlp": 0.01260051, + "epoch": 0.32226063430031565, + "flos": 22680731969280.0, + "grad_norm": 1.6695480137557102, + "language_loss": 0.79997146, + "learning_rate": 3.168912388464595e-06, + "loss": 0.87754452, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17077637, + "step": 5360, + "time_per_iteration": 2.544461727142334 + }, + { + "auxiliary_loss_clip": 0.06382456, + "auxiliary_loss_mlp": 0.01256795, + "balance_loss_clip": 0.06292457, + "balance_loss_mlp": 0.01253353, + "epoch": 0.3223207575529836, + "flos": 63847798151040.0, + "grad_norm": 0.6356253914940931, + "language_loss": 0.56731617, + "learning_rate": 3.168596347256737e-06, + "loss": 0.64370871, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03451538, + "step": 5361, + "time_per_iteration": 3.0336568355560303 + }, + { + "auxiliary_loss_clip": 0.06478466, + "auxiliary_loss_mlp": 0.01277797, + "balance_loss_clip": 0.06288562, + "balance_loss_mlp": 0.01261346, + "epoch": 0.3223808808056516, + "flos": 26877393786240.0, + "grad_norm": 2.167930910708006, + "language_loss": 0.71792114, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79548371, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.16442871, + "step": 5362, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.06483887, + "auxiliary_loss_mlp": 0.01279203, + "balance_loss_clip": 0.06293412, + "balance_loss_mlp": 0.01262692, + "epoch": 0.32244100405831955, + "flos": 26768716640640.0, + "grad_norm": 1.5327886568658977, + "language_loss": 0.73854291, + "learning_rate": 3.167964131913135e-06, + "loss": 0.81617379, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.16503906, + "step": 5363, + "time_per_iteration": 2.583064556121826 + }, + { + "auxiliary_loss_clip": 0.06489229, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06291971, + "balance_loss_mlp": 0.01258717, + "epoch": 0.3225011273109875, + "flos": 23809403266560.0, + "grad_norm": 2.354374584633167, + "language_loss": 0.76664144, + "learning_rate": 3.167647957801365e-06, + "loss": 0.84428835, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16748047, + "step": 5364, + "time_per_iteration": 2.5177268981933594 + }, + { + "auxiliary_loss_clip": 0.06479897, + "auxiliary_loss_mlp": 0.01275674, + "balance_loss_clip": 0.06290577, + "balance_loss_mlp": 0.01259473, + "epoch": 0.3225612505636555, + "flos": 17280194215680.0, + "grad_norm": 2.1891061142162327, + "language_loss": 0.7715044, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.84906018, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.1619873, + "step": 5365, + "time_per_iteration": 2.5122928619384766 + }, + { + "auxiliary_loss_clip": 0.06484331, + "auxiliary_loss_mlp": 0.01277663, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.01260711, + "epoch": 0.32262137381632344, + "flos": 23372724113280.0, + "grad_norm": 2.314444268247813, + "language_loss": 0.77153468, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.84915465, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.16955566, + "step": 5366, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.06481092, + "auxiliary_loss_mlp": 0.01280366, + "balance_loss_clip": 0.0629226, + "balance_loss_mlp": 0.0126388, + "epoch": 0.3226814970689914, + "flos": 23265598268160.0, + "grad_norm": 1.8642315088319754, + "language_loss": 0.72423649, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80185115, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.16491699, + "step": 5367, + "time_per_iteration": 2.544145345687866 + }, + { + "auxiliary_loss_clip": 0.06480073, + "auxiliary_loss_mlp": 0.01278287, + "balance_loss_clip": 0.06290721, + "balance_loss_mlp": 0.01262248, + "epoch": 0.32274162032165943, + "flos": 16400127582720.0, + "grad_norm": 1.9542840286813894, + "language_loss": 0.74559301, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82317662, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16033936, + "step": 5368, + "time_per_iteration": 2.4653942584991455 + }, + { + "auxiliary_loss_clip": 0.06481207, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01255874, + "epoch": 0.3228017435743274, + "flos": 27862489912320.0, + "grad_norm": 2.016369988637382, + "language_loss": 0.79033995, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86786628, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.15533447, + "step": 5369, + "time_per_iteration": 2.6923141479492188 + }, + { + "auxiliary_loss_clip": 0.06471382, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.0628759, + "balance_loss_mlp": 0.01264567, + "epoch": 0.32286186682699536, + "flos": 19614712020480.0, + "grad_norm": 1.8619928029866217, + "language_loss": 0.83607441, + "learning_rate": 3.16574998372661e-06, + "loss": 0.91358972, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15576172, + "step": 5370, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.06481104, + "auxiliary_loss_mlp": 0.01278081, + "balance_loss_clip": 0.062904, + "balance_loss_mlp": 0.01262703, + "epoch": 0.3229219900796633, + "flos": 24140885218560.0, + "grad_norm": 2.7780356443351146, + "language_loss": 0.83346975, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.91106164, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15368652, + "step": 5371, + "time_per_iteration": 2.554034948348999 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 0.0629211, + "balance_loss_mlp": 0.01260434, + "epoch": 0.3229821133323313, + "flos": 17754454725120.0, + "grad_norm": 2.279534384310274, + "language_loss": 0.89153087, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96917808, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17663574, + "step": 5372, + "time_per_iteration": 2.468693971633911 + }, + { + "auxiliary_loss_clip": 0.06478924, + "auxiliary_loss_mlp": 0.01278448, + "balance_loss_clip": 0.06288313, + "balance_loss_mlp": 0.01261843, + "epoch": 0.32304223658499925, + "flos": 22352562253440.0, + "grad_norm": 1.986067660558338, + "language_loss": 0.730793, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.80836678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16601562, + "step": 5373, + "time_per_iteration": 2.5757906436920166 + }, + { + "auxiliary_loss_clip": 0.06476311, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06293686, + "balance_loss_mlp": 0.01258227, + "epoch": 0.3231023598376672, + "flos": 18484154006400.0, + "grad_norm": 2.1970042176000963, + "language_loss": 0.82592154, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.90342778, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.1607666, + "step": 5374, + "time_per_iteration": 2.4853713512420654 + }, + { + "auxiliary_loss_clip": 0.06474404, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06291121, + "balance_loss_mlp": 0.0125544, + "epoch": 0.3231624830903352, + "flos": 27643710101760.0, + "grad_norm": 1.9120740622639463, + "language_loss": 0.88405079, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.96150708, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15783691, + "step": 5375, + "time_per_iteration": 2.58644700050354 + }, + { + "auxiliary_loss_clip": 0.06483716, + "auxiliary_loss_mlp": 0.01275166, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01258799, + "epoch": 0.32322260634300315, + "flos": 21732965637120.0, + "grad_norm": 2.2884949024183983, + "language_loss": 0.76224899, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.83983773, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.16345215, + "step": 5376, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.0647772, + "auxiliary_loss_mlp": 0.01272254, + "balance_loss_clip": 0.06289793, + "balance_loss_mlp": 0.01256649, + "epoch": 0.3232827295956711, + "flos": 22644198789120.0, + "grad_norm": 1.5259481118475857, + "language_loss": 0.67275858, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.75025833, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.15594482, + "step": 5377, + "time_per_iteration": 2.592737913131714 + }, + { + "auxiliary_loss_clip": 0.06482306, + "auxiliary_loss_mlp": 0.01279693, + "balance_loss_clip": 0.06294581, + "balance_loss_mlp": 0.01262158, + "epoch": 0.3233428528483391, + "flos": 26329731500160.0, + "grad_norm": 1.747214931760967, + "language_loss": 0.73022175, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.80784178, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17541504, + "step": 5378, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.06476232, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 0.06289409, + "balance_loss_mlp": 0.01256598, + "epoch": 0.32340297610100704, + "flos": 28592818099200.0, + "grad_norm": 2.0362074337070832, + "language_loss": 0.82332939, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90081334, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5379, + "time_per_iteration": 2.661787986755371 + }, + { + "auxiliary_loss_clip": 0.06481552, + "auxiliary_loss_mlp": 0.01276474, + "balance_loss_clip": 0.06288823, + "balance_loss_mlp": 0.01260548, + "epoch": 0.323463099353675, + "flos": 30781664380800.0, + "grad_norm": 1.6212615798097256, + "language_loss": 0.78942055, + "learning_rate": 3.162583158454388e-06, + "loss": 0.86700082, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15942383, + "step": 5380, + "time_per_iteration": 2.593618631362915 + }, + { + "auxiliary_loss_clip": 0.06489569, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 0.06298643, + "balance_loss_mlp": 0.01255368, + "epoch": 0.32352322260634303, + "flos": 25235664739200.0, + "grad_norm": 1.685322069138263, + "language_loss": 0.77853882, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.85615522, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16699219, + "step": 5381, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01269308, + "balance_loss_clip": 0.06292967, + "balance_loss_mlp": 0.01255438, + "epoch": 0.323583345859011, + "flos": 23337071400960.0, + "grad_norm": 1.9004028984655497, + "language_loss": 0.72391021, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80136859, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.13848877, + "step": 5382, + "time_per_iteration": 2.5095293521881104 + }, + { + "auxiliary_loss_clip": 0.06488711, + "auxiliary_loss_mlp": 0.01277606, + "balance_loss_clip": 0.06295708, + "balance_loss_mlp": 0.01262157, + "epoch": 0.32364346911167896, + "flos": 26213675195520.0, + "grad_norm": 2.3447859303702883, + "language_loss": 0.71528596, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.15441895, + "step": 5383, + "time_per_iteration": 2.5806562900543213 + }, + { + "auxiliary_loss_clip": 0.06476977, + "auxiliary_loss_mlp": 0.01276799, + "balance_loss_clip": 0.06292375, + "balance_loss_mlp": 0.01261564, + "epoch": 0.3237035923643469, + "flos": 23702487056640.0, + "grad_norm": 1.948915226701978, + "language_loss": 0.78857487, + "learning_rate": 3.161315193285283e-06, + "loss": 0.86611259, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.15234375, + "step": 5384, + "time_per_iteration": 2.548797369003296 + }, + { + "auxiliary_loss_clip": 0.06481218, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06288576, + "balance_loss_mlp": 0.0125793, + "epoch": 0.3237637156170149, + "flos": 14433960326400.0, + "grad_norm": 1.885180362402172, + "language_loss": 0.75034815, + "learning_rate": 3.16099809186998e-06, + "loss": 0.82790792, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16821289, + "step": 5385, + "time_per_iteration": 2.577547073364258 + }, + { + "auxiliary_loss_clip": 0.06486371, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01255032, + "epoch": 0.32382383886968286, + "flos": 31070449877760.0, + "grad_norm": 1.8174179211363362, + "language_loss": 0.72224641, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.79981083, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.15032959, + "step": 5386, + "time_per_iteration": 2.585822820663452 + }, + { + "auxiliary_loss_clip": 0.06485418, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.06292341, + "balance_loss_mlp": 0.01256803, + "epoch": 0.3238839621223508, + "flos": 23263418062080.0, + "grad_norm": 3.182973165751226, + "language_loss": 0.95573068, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.03331804, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16503906, + "step": 5387, + "time_per_iteration": 4.075104236602783 + }, + { + "auxiliary_loss_clip": 0.06490889, + "auxiliary_loss_mlp": 0.01270509, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.0125376, + "epoch": 0.3239440853750188, + "flos": 22971026839680.0, + "grad_norm": 2.142304582151843, + "language_loss": 0.78141761, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.85903162, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5388, + "time_per_iteration": 2.623976707458496 + }, + { + "auxiliary_loss_clip": 0.06478786, + "auxiliary_loss_mlp": 0.01276501, + "balance_loss_clip": 0.06289905, + "balance_loss_mlp": 0.01259704, + "epoch": 0.32400420862768675, + "flos": 36255394275840.0, + "grad_norm": 1.9954909505528162, + "language_loss": 0.71735168, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79490453, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16796875, + "step": 5389, + "time_per_iteration": 4.133269309997559 + }, + { + "auxiliary_loss_clip": 0.06479806, + "auxiliary_loss_mlp": 0.01273464, + "balance_loss_clip": 0.06294239, + "balance_loss_mlp": 0.01257872, + "epoch": 0.3240643318803547, + "flos": 21622946826240.0, + "grad_norm": 1.7464997421167434, + "language_loss": 0.81443554, + "learning_rate": 3.159411924656557e-06, + "loss": 0.89196825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15588379, + "step": 5390, + "time_per_iteration": 3.9378364086151123 + }, + { + "auxiliary_loss_clip": 0.06491944, + "auxiliary_loss_mlp": 0.01278594, + "balance_loss_clip": 0.06301276, + "balance_loss_mlp": 0.01261296, + "epoch": 0.3241244551330227, + "flos": 23302466864640.0, + "grad_norm": 1.9807661160762629, + "language_loss": 0.73182476, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80953014, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.1730957, + "step": 5391, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.06482222, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.06294864, + "balance_loss_mlp": 0.0126241, + "epoch": 0.32418457838569065, + "flos": 14101891395840.0, + "grad_norm": 1.5457442510257688, + "language_loss": 0.77541089, + "learning_rate": 3.158777149931855e-06, + "loss": 0.85302216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16491699, + "step": 5392, + "time_per_iteration": 2.486161470413208 + }, + { + "auxiliary_loss_clip": 0.06490408, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.0629712, + "balance_loss_mlp": 0.01261411, + "epoch": 0.3242447016383586, + "flos": 29760454344960.0, + "grad_norm": 1.849936210081937, + "language_loss": 0.63213563, + "learning_rate": 3.158459696652067e-06, + "loss": 0.70982158, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.16760254, + "step": 5393, + "time_per_iteration": 2.5853707790374756 + }, + { + "auxiliary_loss_clip": 0.06489256, + "auxiliary_loss_mlp": 0.01282677, + "balance_loss_clip": 0.06301466, + "balance_loss_mlp": 0.01266011, + "epoch": 0.3243048248910266, + "flos": 24357820239360.0, + "grad_norm": 1.7023503315224988, + "language_loss": 0.82889545, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90661478, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16674805, + "step": 5394, + "time_per_iteration": 3.946955680847168 + }, + { + "auxiliary_loss_clip": 0.06480435, + "auxiliary_loss_mlp": 0.01285084, + "balance_loss_clip": 0.06298714, + "balance_loss_mlp": 0.01269825, + "epoch": 0.3243649481436946, + "flos": 24359958518400.0, + "grad_norm": 2.1573093021253333, + "language_loss": 0.82280314, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90045834, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15270996, + "step": 5395, + "time_per_iteration": 2.537313222885132 + }, + { + "auxiliary_loss_clip": 0.06480338, + "auxiliary_loss_mlp": 0.01292267, + "balance_loss_clip": 0.06300412, + "balance_loss_mlp": 0.01276424, + "epoch": 0.32442507139636256, + "flos": 22931097569280.0, + "grad_norm": 1.7302006802896392, + "language_loss": 0.839818, + "learning_rate": 3.157507073287417e-06, + "loss": 0.91754401, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15844727, + "step": 5396, + "time_per_iteration": 2.6440067291259766 + }, + { + "auxiliary_loss_clip": 0.06491997, + "auxiliary_loss_mlp": 0.01291538, + "balance_loss_clip": 0.06299315, + "balance_loss_mlp": 0.01274121, + "epoch": 0.32448519464903053, + "flos": 22206723022080.0, + "grad_norm": 1.8684779143202024, + "language_loss": 0.76113403, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.83896935, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.17419434, + "step": 5397, + "time_per_iteration": 2.506601095199585 + }, + { + "auxiliary_loss_clip": 0.06473789, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06290997, + "balance_loss_mlp": 0.0127387, + "epoch": 0.3245453179016985, + "flos": 18843574095360.0, + "grad_norm": 2.304762567896747, + "language_loss": 0.67975587, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.75739866, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1661377, + "step": 5398, + "time_per_iteration": 2.50168514251709 + }, + { + "auxiliary_loss_clip": 0.06478744, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06293125, + "balance_loss_mlp": 0.01272189, + "epoch": 0.32460544115436646, + "flos": 21184716372480.0, + "grad_norm": 1.3685049489713428, + "language_loss": 0.73232323, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16748047, + "step": 5399, + "time_per_iteration": 2.5114216804504395 + }, + { + "auxiliary_loss_clip": 0.0648094, + "auxiliary_loss_mlp": 0.01289931, + "balance_loss_clip": 0.06293677, + "balance_loss_mlp": 0.01273241, + "epoch": 0.3246655644070344, + "flos": 21987607795200.0, + "grad_norm": 2.072173153822147, + "language_loss": 0.71044981, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.78815848, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16687012, + "step": 5400, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.06480449, + "auxiliary_loss_mlp": 0.01279651, + "balance_loss_clip": 0.06289301, + "balance_loss_mlp": 0.01263355, + "epoch": 0.3247256876597024, + "flos": 32167745020800.0, + "grad_norm": 2.104371315429844, + "language_loss": 0.80626661, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88386756, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16296387, + "step": 5401, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.06483636, + "auxiliary_loss_mlp": 0.01281263, + "balance_loss_clip": 0.06294005, + "balance_loss_mlp": 0.01264073, + "epoch": 0.32478581091237035, + "flos": 21004104896640.0, + "grad_norm": 1.4796090680940444, + "language_loss": 0.87935805, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.95700705, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.17175293, + "step": 5402, + "time_per_iteration": 2.5548956394195557 + }, + { + "auxiliary_loss_clip": 0.06474966, + "auxiliary_loss_mlp": 0.0127368, + "balance_loss_clip": 0.06291528, + "balance_loss_mlp": 0.01258767, + "epoch": 0.3248459341650383, + "flos": 17929741466880.0, + "grad_norm": 2.584856005153906, + "language_loss": 0.85243386, + "learning_rate": 3.155282749751332e-06, + "loss": 0.92992032, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14904785, + "step": 5403, + "time_per_iteration": 2.479205369949341 + }, + { + "auxiliary_loss_clip": 0.06468324, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01262667, + "epoch": 0.3249060574177063, + "flos": 24542582492160.0, + "grad_norm": 2.1052258035485214, + "language_loss": 0.8828373, + "learning_rate": 3.154964813916007e-06, + "loss": 0.96029389, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14672852, + "step": 5404, + "time_per_iteration": 2.5845093727111816 + }, + { + "auxiliary_loss_clip": 0.06473936, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01259368, + "epoch": 0.32496618067037425, + "flos": 26001939127680.0, + "grad_norm": 1.6833557203411496, + "language_loss": 0.72900558, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80650264, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1640625, + "step": 5405, + "time_per_iteration": 2.542433500289917 + }, + { + "auxiliary_loss_clip": 0.06474283, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06290333, + "balance_loss_mlp": 0.01258264, + "epoch": 0.3250263039230422, + "flos": 19579939776000.0, + "grad_norm": 1.7320098663924197, + "language_loss": 0.83355331, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.91103297, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15435791, + "step": 5406, + "time_per_iteration": 2.591207265853882 + }, + { + "auxiliary_loss_clip": 0.06474167, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06290454, + "balance_loss_mlp": 0.01254949, + "epoch": 0.3250864271757102, + "flos": 16769232817920.0, + "grad_norm": 2.13827452533593, + "language_loss": 0.87879711, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.95623994, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15161133, + "step": 5407, + "time_per_iteration": 2.4856173992156982 + }, + { + "auxiliary_loss_clip": 0.06469748, + "auxiliary_loss_mlp": 0.01276836, + "balance_loss_clip": 0.06284758, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3251465504283782, + "flos": 27827004908160.0, + "grad_norm": 2.430972813034592, + "language_loss": 0.69975567, + "learning_rate": 3.153692632731479e-06, + "loss": 0.77722144, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15649414, + "step": 5408, + "time_per_iteration": 2.5838799476623535 + }, + { + "auxiliary_loss_clip": 0.06481153, + "auxiliary_loss_mlp": 0.01282988, + "balance_loss_clip": 0.06286341, + "balance_loss_mlp": 0.01267396, + "epoch": 0.32520667368104617, + "flos": 19069271867520.0, + "grad_norm": 3.909403651515765, + "language_loss": 0.78053123, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85817266, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.15588379, + "step": 5409, + "time_per_iteration": 2.5178377628326416 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01272582, + "balance_loss_clip": 0.06286227, + "balance_loss_mlp": 0.01256202, + "epoch": 0.32526679693371413, + "flos": 29388917341440.0, + "grad_norm": 1.8050072916987376, + "language_loss": 0.83473468, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.91219985, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16381836, + "step": 5410, + "time_per_iteration": 2.5948092937469482 + }, + { + "auxiliary_loss_clip": 0.06466505, + "auxiliary_loss_mlp": 0.01275621, + "balance_loss_clip": 0.06286819, + "balance_loss_mlp": 0.01261274, + "epoch": 0.3253269201863821, + "flos": 20710833206400.0, + "grad_norm": 1.580323990141508, + "language_loss": 0.72005814, + "learning_rate": 3.152738037445405e-06, + "loss": 0.79747939, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14349365, + "step": 5411, + "time_per_iteration": 2.515542507171631 + }, + { + "auxiliary_loss_clip": 0.06472497, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06287136, + "balance_loss_mlp": 0.01261632, + "epoch": 0.32538704343905006, + "flos": 29101515436800.0, + "grad_norm": 1.470162471805647, + "language_loss": 0.83496881, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.91246504, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15490723, + "step": 5412, + "time_per_iteration": 2.55008602142334 + }, + { + "auxiliary_loss_clip": 0.06476887, + "auxiliary_loss_mlp": 0.01277617, + "balance_loss_clip": 0.06287435, + "balance_loss_mlp": 0.01260904, + "epoch": 0.325447166691718, + "flos": 24682216521600.0, + "grad_norm": 1.5504273053971407, + "language_loss": 0.8129071, + "learning_rate": 3.152101422008203e-06, + "loss": 0.89045215, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16711426, + "step": 5413, + "time_per_iteration": 2.54195499420166 + }, + { + "auxiliary_loss_clip": 0.06477104, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.0628976, + "balance_loss_mlp": 0.01261643, + "epoch": 0.325507289944386, + "flos": 21549503122560.0, + "grad_norm": 1.5527044192655586, + "language_loss": 0.76985061, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84740174, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16363525, + "step": 5414, + "time_per_iteration": 2.5435919761657715 + }, + { + "auxiliary_loss_clip": 0.063807, + "auxiliary_loss_mlp": 0.01284661, + "balance_loss_clip": 0.06291388, + "balance_loss_mlp": 0.01280793, + "epoch": 0.32556741319705396, + "flos": 71537893194240.0, + "grad_norm": 0.9015335749308697, + "language_loss": 0.64095414, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.71760774, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.03863525, + "step": 5415, + "time_per_iteration": 3.0875957012176514 + }, + { + "auxiliary_loss_clip": 0.0647157, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06285933, + "balance_loss_mlp": 0.01258845, + "epoch": 0.3256275364497219, + "flos": 23739187944960.0, + "grad_norm": 1.4815485577141352, + "language_loss": 0.74123245, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16235352, + "step": 5416, + "time_per_iteration": 2.5792665481567383 + }, + { + "auxiliary_loss_clip": 0.06381539, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01262769, + "epoch": 0.3256876597023899, + "flos": 67308136214400.0, + "grad_norm": 0.7704887993649999, + "language_loss": 0.57850802, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.65498912, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.03793335, + "step": 5417, + "time_per_iteration": 3.2770884037017822 + }, + { + "auxiliary_loss_clip": 0.06373264, + "auxiliary_loss_mlp": 0.01258837, + "balance_loss_clip": 0.06284805, + "balance_loss_mlp": 0.01254933, + "epoch": 0.32574778295505785, + "flos": 71304633826560.0, + "grad_norm": 0.8775074523137479, + "language_loss": 0.63674986, + "learning_rate": 3.150509119089975e-06, + "loss": 0.71307087, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03900146, + "step": 5418, + "time_per_iteration": 3.315948724746704 + }, + { + "auxiliary_loss_clip": 0.06476019, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 0.06290952, + "balance_loss_mlp": 0.01258111, + "epoch": 0.3258079062077258, + "flos": 20782515974400.0, + "grad_norm": 1.8847025208507953, + "language_loss": 0.6957128, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.77320766, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.15344238, + "step": 5419, + "time_per_iteration": 2.5722780227661133 + }, + { + "auxiliary_loss_clip": 0.06480842, + "auxiliary_loss_mlp": 0.01275789, + "balance_loss_clip": 0.06291591, + "balance_loss_mlp": 0.01260006, + "epoch": 0.3258680294603938, + "flos": 22241788755840.0, + "grad_norm": 2.023173952709465, + "language_loss": 0.77398664, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.85155296, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.15783691, + "step": 5420, + "time_per_iteration": 2.5199873447418213 + }, + { + "auxiliary_loss_clip": 0.06478356, + "auxiliary_loss_mlp": 0.0127343, + "balance_loss_clip": 0.06290038, + "balance_loss_mlp": 0.0125798, + "epoch": 0.3259281527130618, + "flos": 26987328743040.0, + "grad_norm": 1.5124533627457746, + "language_loss": 0.80826706, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.88578492, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15441895, + "step": 5421, + "time_per_iteration": 2.6014363765716553 + }, + { + "auxiliary_loss_clip": 0.06476312, + "auxiliary_loss_mlp": 0.0127337, + "balance_loss_clip": 0.06293876, + "balance_loss_mlp": 0.01258982, + "epoch": 0.32598827596572977, + "flos": 26221557479040.0, + "grad_norm": 1.4846059645471, + "language_loss": 0.76098251, + "learning_rate": 3.149234491389381e-06, + "loss": 0.8384794, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1439209, + "step": 5422, + "time_per_iteration": 2.5738978385925293 + }, + { + "auxiliary_loss_clip": 0.06480287, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06288645, + "balance_loss_mlp": 0.01255095, + "epoch": 0.32604839921839773, + "flos": 17645567944320.0, + "grad_norm": 2.282982793788361, + "language_loss": 0.63826233, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.71577179, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 5423, + "time_per_iteration": 2.5513644218444824 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01273816, + "balance_loss_clip": 0.06290927, + "balance_loss_mlp": 0.01258748, + "epoch": 0.3261085224710657, + "flos": 23629420696320.0, + "grad_norm": 1.6690467832946037, + "language_loss": 0.75170749, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82914186, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1505127, + "step": 5424, + "time_per_iteration": 2.546074151992798 + }, + { + "auxiliary_loss_clip": 0.06470636, + "auxiliary_loss_mlp": 0.01274311, + "balance_loss_clip": 0.06288706, + "balance_loss_mlp": 0.01258945, + "epoch": 0.32616864572373366, + "flos": 23267526912000.0, + "grad_norm": 1.6415169459291201, + "language_loss": 0.7718606, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.84931004, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15368652, + "step": 5425, + "time_per_iteration": 2.5883710384368896 + }, + { + "auxiliary_loss_clip": 0.06476015, + "auxiliary_loss_mlp": 0.01273254, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32622876897640163, + "flos": 25600535343360.0, + "grad_norm": 2.4681515054731924, + "language_loss": 0.78599709, + "learning_rate": 3.147959166423428e-06, + "loss": 0.86348987, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.16650391, + "step": 5426, + "time_per_iteration": 2.569566488265991 + }, + { + "auxiliary_loss_clip": 0.06473041, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06286261, + "balance_loss_mlp": 0.0125749, + "epoch": 0.3262888922290696, + "flos": 22425544759680.0, + "grad_norm": 1.6671872965592953, + "language_loss": 0.74719262, + "learning_rate": 3.147640226324893e-06, + "loss": 0.82465363, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5427, + "time_per_iteration": 3.941770315170288 + }, + { + "auxiliary_loss_clip": 0.06474692, + "auxiliary_loss_mlp": 0.0127251, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256154, + "epoch": 0.32634901548173756, + "flos": 19724982393600.0, + "grad_norm": 2.0508761677602965, + "language_loss": 0.79472262, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.87219465, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16357422, + "step": 5428, + "time_per_iteration": 3.9950850009918213 + }, + { + "auxiliary_loss_clip": 0.06475013, + "auxiliary_loss_mlp": 0.01275116, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01259309, + "epoch": 0.3264091387344055, + "flos": 16148336463360.0, + "grad_norm": 1.5445825374219135, + "language_loss": 0.71770716, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79520845, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15808105, + "step": 5429, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06468233, + "auxiliary_loss_mlp": 0.01269844, + "balance_loss_clip": 0.06283497, + "balance_loss_mlp": 0.01254466, + "epoch": 0.3264692619870735, + "flos": 16404655703040.0, + "grad_norm": 1.5791835311639297, + "language_loss": 0.78689212, + "learning_rate": 3.146683144965881e-06, + "loss": 0.86427283, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5430, + "time_per_iteration": 2.4873790740966797 + }, + { + "auxiliary_loss_clip": 0.06468185, + "auxiliary_loss_mlp": 0.0127668, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01259561, + "epoch": 0.32652938523974145, + "flos": 22388843871360.0, + "grad_norm": 1.9481749952405665, + "language_loss": 0.84556186, + "learning_rate": 3.146364030865399e-06, + "loss": 0.92301053, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17126465, + "step": 5431, + "time_per_iteration": 2.522075653076172 + }, + { + "auxiliary_loss_clip": 0.06468672, + "auxiliary_loss_mlp": 0.01274085, + "balance_loss_clip": 0.06286903, + "balance_loss_mlp": 0.01259327, + "epoch": 0.3265895084924094, + "flos": 21914499507840.0, + "grad_norm": 1.6266920997971765, + "language_loss": 0.71123517, + "learning_rate": 3.146044873294678e-06, + "loss": 0.78866279, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14758301, + "step": 5432, + "time_per_iteration": 2.513209104537964 + }, + { + "auxiliary_loss_clip": 0.06469099, + "auxiliary_loss_mlp": 0.01272277, + "balance_loss_clip": 0.06282821, + "balance_loss_mlp": 0.01257424, + "epoch": 0.3266496317450774, + "flos": 16072083648000.0, + "grad_norm": 1.3982751613904698, + "language_loss": 0.84207368, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91948748, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.14855957, + "step": 5433, + "time_per_iteration": 2.5324172973632812 + }, + { + "auxiliary_loss_clip": 0.06463822, + "auxiliary_loss_mlp": 0.01279207, + "balance_loss_clip": 0.06283711, + "balance_loss_mlp": 0.01264049, + "epoch": 0.3267097549977454, + "flos": 22534766956800.0, + "grad_norm": 1.4562075652627795, + "language_loss": 0.85916972, + "learning_rate": 3.145406427790931e-06, + "loss": 0.93660003, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15155029, + "step": 5434, + "time_per_iteration": 3.9434614181518555 + }, + { + "auxiliary_loss_clip": 0.06468898, + "auxiliary_loss_mlp": 0.01277076, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.0126134, + "epoch": 0.32676987825041337, + "flos": 27277581686400.0, + "grad_norm": 1.6909362765146225, + "language_loss": 0.88470823, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.96216792, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1574707, + "step": 5435, + "time_per_iteration": 2.5430006980895996 + }, + { + "auxiliary_loss_clip": 0.06469613, + "auxiliary_loss_mlp": 0.01271625, + "balance_loss_clip": 0.06283396, + "balance_loss_mlp": 0.01256306, + "epoch": 0.32683000150308134, + "flos": 11512731432960.0, + "grad_norm": 2.3091497119382733, + "language_loss": 0.77129918, + "learning_rate": 3.144767808551479e-06, + "loss": 0.84871155, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 5436, + "time_per_iteration": 2.486003875732422 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01277236, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01261977, + "epoch": 0.3268901247557493, + "flos": 25637362012800.0, + "grad_norm": 1.5303988762112921, + "language_loss": 0.72448635, + "learning_rate": 3.144448433811134e-06, + "loss": 0.80190074, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15270996, + "step": 5437, + "time_per_iteration": 2.545548915863037 + }, + { + "auxiliary_loss_clip": 0.06472606, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06282267, + "balance_loss_mlp": 0.01258253, + "epoch": 0.32695024800841727, + "flos": 24867356117760.0, + "grad_norm": 1.604360978002023, + "language_loss": 0.64194709, + "learning_rate": 3.144129015673189e-06, + "loss": 0.71942323, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16760254, + "step": 5438, + "time_per_iteration": 2.5657694339752197 + }, + { + "auxiliary_loss_clip": 0.06462848, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01257246, + "epoch": 0.32701037126108523, + "flos": 28846663643520.0, + "grad_norm": 1.637174889107761, + "language_loss": 0.74795192, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.82531083, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15795898, + "step": 5439, + "time_per_iteration": 2.5655689239501953 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01257087, + "epoch": 0.3270704945137532, + "flos": 27972592577280.0, + "grad_norm": 1.745503595629167, + "language_loss": 0.74950606, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1541748, + "step": 5440, + "time_per_iteration": 2.601821184158325 + }, + { + "auxiliary_loss_clip": 0.06460315, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06277528, + "balance_loss_mlp": 0.01254947, + "epoch": 0.32713061776642116, + "flos": 23696575344000.0, + "grad_norm": 1.95462638600934, + "language_loss": 0.84695202, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.92425048, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.14599609, + "step": 5441, + "time_per_iteration": 2.5020570755004883 + }, + { + "auxiliary_loss_clip": 0.06466734, + "auxiliary_loss_mlp": 0.01272021, + "balance_loss_clip": 0.06280614, + "balance_loss_mlp": 0.01256798, + "epoch": 0.3271907410190891, + "flos": 22462203720960.0, + "grad_norm": 1.9620532707625304, + "language_loss": 0.86928713, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.9466747, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15222168, + "step": 5442, + "time_per_iteration": 2.5388059616088867 + }, + { + "auxiliary_loss_clip": 0.06470812, + "auxiliary_loss_mlp": 0.0126936, + "balance_loss_clip": 0.06282146, + "balance_loss_mlp": 0.01254399, + "epoch": 0.3272508642717571, + "flos": 22826696981760.0, + "grad_norm": 1.5979656279548642, + "language_loss": 0.77388418, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.85128593, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.1496582, + "step": 5443, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.0646731, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06280384, + "balance_loss_mlp": 0.01255518, + "epoch": 0.32731098752442506, + "flos": 11806086977280.0, + "grad_norm": 2.2200780771744073, + "language_loss": 0.82818562, + "learning_rate": 3.142211596174343e-06, + "loss": 0.90556955, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15551758, + "step": 5444, + "time_per_iteration": 2.5514841079711914 + }, + { + "auxiliary_loss_clip": 0.06468201, + "auxiliary_loss_mlp": 0.01274937, + "balance_loss_clip": 0.06282412, + "balance_loss_mlp": 0.01258295, + "epoch": 0.327371110777093, + "flos": 21033300844800.0, + "grad_norm": 2.365977713323657, + "language_loss": 0.59248179, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66991317, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16638184, + "step": 5445, + "time_per_iteration": 2.5325539112091064 + }, + { + "auxiliary_loss_clip": 0.06469189, + "auxiliary_loss_mlp": 0.01278146, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01261278, + "epoch": 0.327431234029761, + "flos": 19068055983360.0, + "grad_norm": 2.7570820492615886, + "language_loss": 0.89260846, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.97008175, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.16870117, + "step": 5446, + "time_per_iteration": 2.576833724975586 + }, + { + "auxiliary_loss_clip": 0.06476346, + "auxiliary_loss_mlp": 0.01274903, + "balance_loss_clip": 0.06282137, + "balance_loss_mlp": 0.01257403, + "epoch": 0.32749135728242895, + "flos": 25856435312640.0, + "grad_norm": 1.9641165872810087, + "language_loss": 0.79404771, + "learning_rate": 3.141252301538802e-06, + "loss": 0.87156022, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 5447, + "time_per_iteration": 2.5539090633392334 + }, + { + "auxiliary_loss_clip": 0.06462374, + "auxiliary_loss_mlp": 0.01278273, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263277, + "epoch": 0.327551480535097, + "flos": 20126721594240.0, + "grad_norm": 1.953936246680755, + "language_loss": 0.73150277, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80890924, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.14990234, + "step": 5448, + "time_per_iteration": 2.633612871170044 + }, + { + "auxiliary_loss_clip": 0.06464307, + "auxiliary_loss_mlp": 0.01272265, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01256291, + "epoch": 0.32761160378776494, + "flos": 28811094785280.0, + "grad_norm": 1.3623614976773524, + "language_loss": 0.67002481, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.74739063, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15966797, + "step": 5449, + "time_per_iteration": 2.5777859687805176 + }, + { + "auxiliary_loss_clip": 0.0647198, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3276717270404329, + "flos": 26944171090560.0, + "grad_norm": 1.378619651715801, + "language_loss": 0.65736711, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.73478758, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15576172, + "step": 5450, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.06468028, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06280884, + "balance_loss_mlp": 0.01258509, + "epoch": 0.32773185029310087, + "flos": 25345557768960.0, + "grad_norm": 7.041147023955008, + "language_loss": 0.77832162, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.85575354, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16662598, + "step": 5451, + "time_per_iteration": 2.572112560272217 + }, + { + "auxiliary_loss_clip": 0.06472664, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06283467, + "balance_loss_mlp": 0.01262042, + "epoch": 0.32779197354576883, + "flos": 26398227813120.0, + "grad_norm": 1.9495025825112327, + "language_loss": 0.70696288, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78447533, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16540527, + "step": 5452, + "time_per_iteration": 2.6081676483154297 + }, + { + "auxiliary_loss_clip": 0.0646618, + "auxiliary_loss_mlp": 0.01272924, + "balance_loss_clip": 0.06283787, + "balance_loss_mlp": 0.01256938, + "epoch": 0.3278520967984368, + "flos": 24906237212160.0, + "grad_norm": 1.6132254933408041, + "language_loss": 0.7924304, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.86982143, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15979004, + "step": 5453, + "time_per_iteration": 2.5893869400024414 + }, + { + "auxiliary_loss_clip": 0.06469721, + "auxiliary_loss_mlp": 0.01274795, + "balance_loss_clip": 0.06282013, + "balance_loss_mlp": 0.01259309, + "epoch": 0.32791222005110476, + "flos": 29760831688320.0, + "grad_norm": 2.0442879632543476, + "language_loss": 0.758448, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.83589315, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.15478516, + "step": 5454, + "time_per_iteration": 2.590080499649048 + }, + { + "auxiliary_loss_clip": 0.06461332, + "auxiliary_loss_mlp": 0.01271865, + "balance_loss_clip": 0.06280516, + "balance_loss_mlp": 0.01257536, + "epoch": 0.32797234330377273, + "flos": 16513584410880.0, + "grad_norm": 2.183253633037468, + "language_loss": 0.77119774, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.8485297, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14318848, + "step": 5455, + "time_per_iteration": 2.4873318672180176 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01285817, + "balance_loss_clip": 0.06290287, + "balance_loss_mlp": 0.01268377, + "epoch": 0.3280324665564407, + "flos": 26585086417920.0, + "grad_norm": 1.6915080932551223, + "language_loss": 0.74407738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.82175708, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.17443848, + "step": 5456, + "time_per_iteration": 2.593258857727051 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01277637, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.01261306, + "epoch": 0.32809258980910866, + "flos": 22936631938560.0, + "grad_norm": 1.4862092693082851, + "language_loss": 0.78666067, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.8641206, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16345215, + "step": 5457, + "time_per_iteration": 2.523540496826172 + }, + { + "auxiliary_loss_clip": 0.06473868, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06281006, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3281527130617766, + "flos": 22790457290880.0, + "grad_norm": 2.0769759307730644, + "language_loss": 0.78958774, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.86707151, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.15795898, + "step": 5458, + "time_per_iteration": 2.552680015563965 + }, + { + "auxiliary_loss_clip": 0.06469774, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06284518, + "balance_loss_mlp": 0.01258215, + "epoch": 0.3282128363144446, + "flos": 21256902264960.0, + "grad_norm": 1.5512978296749391, + "language_loss": 0.73655844, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.8140012, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.1628418, + "step": 5459, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.0647283, + "auxiliary_loss_mlp": 0.01274033, + "balance_loss_clip": 0.0628351, + "balance_loss_mlp": 0.01257761, + "epoch": 0.32827295956711255, + "flos": 30850328401920.0, + "grad_norm": 2.2277675097031993, + "language_loss": 0.84476066, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.92222929, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.16271973, + "step": 5460, + "time_per_iteration": 2.6067721843719482 + }, + { + "auxiliary_loss_clip": 0.06469227, + "auxiliary_loss_mlp": 0.01276293, + "balance_loss_clip": 0.06282166, + "balance_loss_mlp": 0.01260319, + "epoch": 0.3283330828197806, + "flos": 25921032410880.0, + "grad_norm": 2.3722751928185297, + "language_loss": 0.78114808, + "learning_rate": 3.136770448642288e-06, + "loss": 0.8586033, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15991211, + "step": 5461, + "time_per_iteration": 2.550417184829712 + }, + { + "auxiliary_loss_clip": 0.06469681, + "auxiliary_loss_mlp": 0.01279493, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01261361, + "epoch": 0.32839320607244854, + "flos": 38591295672960.0, + "grad_norm": 1.5965953358146812, + "language_loss": 0.62925887, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.70675063, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.18115234, + "step": 5462, + "time_per_iteration": 2.7004194259643555 + }, + { + "auxiliary_loss_clip": 0.06467308, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 0.06284478, + "balance_loss_mlp": 0.01265077, + "epoch": 0.3284533293251165, + "flos": 26658068924160.0, + "grad_norm": 1.3126719376538145, + "language_loss": 0.78502059, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.86250222, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15783691, + "step": 5463, + "time_per_iteration": 2.6072070598602295 + }, + { + "auxiliary_loss_clip": 0.0647091, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06283993, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32851345257778447, + "flos": 15309498839040.0, + "grad_norm": 1.727782559794916, + "language_loss": 0.70068884, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77812445, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.16064453, + "step": 5464, + "time_per_iteration": 2.519319534301758 + }, + { + "auxiliary_loss_clip": 0.06466094, + "auxiliary_loss_mlp": 0.01275271, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01257938, + "epoch": 0.32857357583045244, + "flos": 23520491988480.0, + "grad_norm": 1.6619431416557902, + "language_loss": 0.72759986, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.80501354, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.17333984, + "step": 5465, + "time_per_iteration": 2.573444366455078 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01265509, + "epoch": 0.3286336990831204, + "flos": 21001379639040.0, + "grad_norm": 1.5232981833560715, + "language_loss": 0.82967317, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.90722907, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16271973, + "step": 5466, + "time_per_iteration": 4.012515306472778 + }, + { + "auxiliary_loss_clip": 0.0647275, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.06285034, + "balance_loss_mlp": 0.01254932, + "epoch": 0.32869382233578837, + "flos": 23665450752000.0, + "grad_norm": 1.6606265994221874, + "language_loss": 0.79192597, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86936402, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5467, + "time_per_iteration": 4.000247955322266 + }, + { + "auxiliary_loss_clip": 0.06467809, + "auxiliary_loss_mlp": 0.01271951, + "balance_loss_clip": 0.06279044, + "balance_loss_mlp": 0.01255333, + "epoch": 0.32875394558845633, + "flos": 25343335635840.0, + "grad_norm": 1.5510134892276737, + "language_loss": 0.74865687, + "learning_rate": 3.134526351787587e-06, + "loss": 0.82605445, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.16601562, + "step": 5468, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.06474267, + "auxiliary_loss_mlp": 0.01276703, + "balance_loss_clip": 0.0628129, + "balance_loss_mlp": 0.01259108, + "epoch": 0.3288140688411243, + "flos": 14908430471040.0, + "grad_norm": 1.672146103500693, + "language_loss": 0.78728724, + "learning_rate": 3.134205594339942e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.17614746, + "step": 5469, + "time_per_iteration": 3.955373525619507 + }, + { + "auxiliary_loss_clip": 0.06466976, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01257224, + "epoch": 0.32887419209379226, + "flos": 18557220366720.0, + "grad_norm": 1.6018901390748483, + "language_loss": 0.82183433, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89923656, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16015625, + "step": 5470, + "time_per_iteration": 2.5481319427490234 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01271427, + "balance_loss_clip": 0.06279681, + "balance_loss_mlp": 0.01254869, + "epoch": 0.3289343153464602, + "flos": 48116560913280.0, + "grad_norm": 1.6166643495117736, + "language_loss": 0.68441176, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76180226, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.16564941, + "step": 5471, + "time_per_iteration": 2.780454158782959 + }, + { + "auxiliary_loss_clip": 0.06479289, + "auxiliary_loss_mlp": 0.012789, + "balance_loss_clip": 0.06285035, + "balance_loss_mlp": 0.01260637, + "epoch": 0.3289944385991282, + "flos": 27607763900160.0, + "grad_norm": 1.5078842371471577, + "language_loss": 0.65564525, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.73322713, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.18273926, + "step": 5472, + "time_per_iteration": 2.580644369125366 + }, + { + "auxiliary_loss_clip": 0.06472386, + "auxiliary_loss_mlp": 0.01277133, + "balance_loss_clip": 0.06281875, + "balance_loss_mlp": 0.01259144, + "epoch": 0.32905456185179616, + "flos": 20126470032000.0, + "grad_norm": 1.614198879205061, + "language_loss": 0.88538003, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96287525, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17993164, + "step": 5473, + "time_per_iteration": 4.021254062652588 + }, + { + "auxiliary_loss_clip": 0.06475069, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285396, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3291146851044642, + "flos": 23186075143680.0, + "grad_norm": 1.7643015597930078, + "language_loss": 0.78719336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86464679, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16552734, + "step": 5474, + "time_per_iteration": 2.5416688919067383 + }, + { + "auxiliary_loss_clip": 0.06379573, + "auxiliary_loss_mlp": 0.0134405, + "balance_loss_clip": 0.06291323, + "balance_loss_mlp": 0.01340224, + "epoch": 0.32917480835713214, + "flos": 67641630664320.0, + "grad_norm": 0.8577160187921843, + "language_loss": 0.60258645, + "learning_rate": 3.132280146886911e-06, + "loss": 0.67982268, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03820801, + "step": 5475, + "time_per_iteration": 3.1267805099487305 + }, + { + "auxiliary_loss_clip": 0.06479369, + "auxiliary_loss_mlp": 0.01279647, + "balance_loss_clip": 0.06284596, + "balance_loss_mlp": 0.01261599, + "epoch": 0.3292349316098001, + "flos": 27971963671680.0, + "grad_norm": 3.252822648856248, + "language_loss": 0.7712574, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84884757, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.18041992, + "step": 5476, + "time_per_iteration": 2.5819692611694336 + }, + { + "auxiliary_loss_clip": 0.06469015, + "auxiliary_loss_mlp": 0.01275163, + "balance_loss_clip": 0.06282525, + "balance_loss_mlp": 0.01258956, + "epoch": 0.3292950548624681, + "flos": 20269416297600.0, + "grad_norm": 1.7333439092472165, + "language_loss": 0.7556808, + "learning_rate": 3.131637987449997e-06, + "loss": 0.83312255, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1619873, + "step": 5477, + "time_per_iteration": 2.532106637954712 + }, + { + "auxiliary_loss_clip": 0.06470291, + "auxiliary_loss_mlp": 0.01275718, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01259541, + "epoch": 0.32935517811513604, + "flos": 20819174935680.0, + "grad_norm": 2.104456143380591, + "language_loss": 0.75728148, + "learning_rate": 3.131316843357713e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16174316, + "step": 5478, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.06470281, + "auxiliary_loss_mlp": 0.01278094, + "balance_loss_clip": 0.06287058, + "balance_loss_mlp": 0.01261631, + "epoch": 0.329415301367804, + "flos": 18447704680320.0, + "grad_norm": 2.368560120299576, + "language_loss": 0.80772918, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8852129, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16455078, + "step": 5479, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.06362775, + "auxiliary_loss_mlp": 0.01272199, + "balance_loss_clip": 0.06275004, + "balance_loss_mlp": 0.01268579, + "epoch": 0.32947542462047197, + "flos": 66344967930240.0, + "grad_norm": 0.7366188072531391, + "language_loss": 0.56333017, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.63967991, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.03616333, + "step": 5480, + "time_per_iteration": 3.2369706630706787 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.01278618, + "balance_loss_clip": 0.06290235, + "balance_loss_mlp": 0.01262179, + "epoch": 0.32953554787313993, + "flos": 23228268474240.0, + "grad_norm": 1.631877255513098, + "language_loss": 0.7736274, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.85118574, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16442871, + "step": 5481, + "time_per_iteration": 2.5206968784332275 + }, + { + "auxiliary_loss_clip": 0.06479073, + "auxiliary_loss_mlp": 0.01277292, + "balance_loss_clip": 0.0628771, + "balance_loss_mlp": 0.01260686, + "epoch": 0.3295956711258079, + "flos": 27015686150400.0, + "grad_norm": 1.3752047504599005, + "language_loss": 0.78639877, + "learning_rate": 3.130031838113899e-06, + "loss": 0.86396235, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.16601562, + "step": 5482, + "time_per_iteration": 2.604720115661621 + }, + { + "auxiliary_loss_clip": 0.06475698, + "auxiliary_loss_mlp": 0.01274916, + "balance_loss_clip": 0.06286834, + "balance_loss_mlp": 0.01258274, + "epoch": 0.32965579437847586, + "flos": 19177697450880.0, + "grad_norm": 2.0027782692889358, + "language_loss": 0.74399549, + "learning_rate": 3.129710479645185e-06, + "loss": 0.82150161, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16662598, + "step": 5483, + "time_per_iteration": 2.5124409198760986 + }, + { + "auxiliary_loss_clip": 0.06472629, + "auxiliary_loss_mlp": 0.01273838, + "balance_loss_clip": 0.06286867, + "balance_loss_mlp": 0.01258472, + "epoch": 0.32971591763114383, + "flos": 30490447115520.0, + "grad_norm": 1.7640387903996015, + "language_loss": 0.7588225, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83628714, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15368652, + "step": 5484, + "time_per_iteration": 2.64021635055542 + }, + { + "auxiliary_loss_clip": 0.06469439, + "auxiliary_loss_mlp": 0.01274788, + "balance_loss_clip": 0.06284587, + "balance_loss_mlp": 0.01259232, + "epoch": 0.3297760408838118, + "flos": 16295140016640.0, + "grad_norm": 1.7787654746377481, + "language_loss": 0.72680974, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15563965, + "step": 5485, + "time_per_iteration": 2.516080379486084 + }, + { + "auxiliary_loss_clip": 0.06466281, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06281459, + "balance_loss_mlp": 0.0125991, + "epoch": 0.32983616413647976, + "flos": 29538194590080.0, + "grad_norm": 2.336444213272706, + "language_loss": 0.80720758, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8846184, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5486, + "time_per_iteration": 2.633730173110962 + }, + { + "auxiliary_loss_clip": 0.06467714, + "auxiliary_loss_mlp": 0.01276658, + "balance_loss_clip": 0.06283799, + "balance_loss_mlp": 0.01261828, + "epoch": 0.3298962873891478, + "flos": 20637682992000.0, + "grad_norm": 1.9361428819205904, + "language_loss": 0.84726417, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.92470789, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14819336, + "step": 5487, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.06473765, + "auxiliary_loss_mlp": 0.01275689, + "balance_loss_clip": 0.06283425, + "balance_loss_mlp": 0.01258845, + "epoch": 0.32995641064181574, + "flos": 14981329123200.0, + "grad_norm": 2.0510786453666707, + "language_loss": 0.74805683, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.82555139, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16833496, + "step": 5488, + "time_per_iteration": 2.5195999145507812 + }, + { + "auxiliary_loss_clip": 0.06475645, + "auxiliary_loss_mlp": 0.01276585, + "balance_loss_clip": 0.06288432, + "balance_loss_mlp": 0.012611, + "epoch": 0.3300165338944837, + "flos": 18667448812800.0, + "grad_norm": 2.2567239989743912, + "language_loss": 0.73048651, + "learning_rate": 3.127781429646098e-06, + "loss": 0.80800879, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.15466309, + "step": 5489, + "time_per_iteration": 2.489529609680176 + }, + { + "auxiliary_loss_clip": 0.06468415, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 0.06282636, + "balance_loss_mlp": 0.01260987, + "epoch": 0.3300766571471517, + "flos": 25589215042560.0, + "grad_norm": 2.1838257682132256, + "language_loss": 0.89381063, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97125351, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.14898682, + "step": 5490, + "time_per_iteration": 2.583505153656006 + }, + { + "auxiliary_loss_clip": 0.06470391, + "auxiliary_loss_mlp": 0.01273693, + "balance_loss_clip": 0.06285221, + "balance_loss_mlp": 0.01258339, + "epoch": 0.33013678039981964, + "flos": 11368150012800.0, + "grad_norm": 1.8708534793530802, + "language_loss": 0.82974613, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90718699, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15344238, + "step": 5491, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.06473103, + "auxiliary_loss_mlp": 0.01274646, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.01258589, + "epoch": 0.3301969036524876, + "flos": 24827175285120.0, + "grad_norm": 1.8609460693795263, + "language_loss": 0.77910721, + "learning_rate": 3.126816327146554e-06, + "loss": 0.85658479, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16052246, + "step": 5492, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.06478797, + "auxiliary_loss_mlp": 0.01277822, + "balance_loss_clip": 0.06287751, + "balance_loss_mlp": 0.01261324, + "epoch": 0.33025702690515557, + "flos": 15966634884480.0, + "grad_norm": 2.4722908606070875, + "language_loss": 0.75614154, + "learning_rate": 3.12649454083913e-06, + "loss": 0.83370769, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16503906, + "step": 5493, + "time_per_iteration": 2.489143133163452 + }, + { + "auxiliary_loss_clip": 0.06366986, + "auxiliary_loss_mlp": 0.01258616, + "balance_loss_clip": 0.06280049, + "balance_loss_mlp": 0.0125515, + "epoch": 0.33031715015782354, + "flos": 59435794540800.0, + "grad_norm": 0.7878547289977352, + "language_loss": 0.54030049, + "learning_rate": 3.12617271181492e-06, + "loss": 0.61655653, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.03475952, + "step": 5494, + "time_per_iteration": 3.0869832038879395 + }, + { + "auxiliary_loss_clip": 0.06482484, + "auxiliary_loss_mlp": 0.01281394, + "balance_loss_clip": 0.0629174, + "balance_loss_mlp": 0.01264753, + "epoch": 0.3303772734104915, + "flos": 23190896753280.0, + "grad_norm": 1.4215593277180028, + "language_loss": 0.87367666, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9513154, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16625977, + "step": 5495, + "time_per_iteration": 2.5188820362091064 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01260038, + "epoch": 0.33043739666315947, + "flos": 33080068275840.0, + "grad_norm": 2.0083800771900995, + "language_loss": 0.74168754, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.81923461, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17443848, + "step": 5496, + "time_per_iteration": 2.6151347160339355 + }, + { + "auxiliary_loss_clip": 0.06470463, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.0628539, + "balance_loss_mlp": 0.01256434, + "epoch": 0.33049751991582743, + "flos": 24901625237760.0, + "grad_norm": 1.9468549986980455, + "language_loss": 0.72676557, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80419219, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15759277, + "step": 5497, + "time_per_iteration": 2.51874041557312 + }, + { + "auxiliary_loss_clip": 0.06472345, + "auxiliary_loss_mlp": 0.0127459, + "balance_loss_clip": 0.06286049, + "balance_loss_mlp": 0.01259343, + "epoch": 0.3305576431684954, + "flos": 29468272757760.0, + "grad_norm": 1.8137955115189202, + "language_loss": 0.80825889, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88572824, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 5498, + "time_per_iteration": 2.6010656356811523 + }, + { + "auxiliary_loss_clip": 0.06476308, + "auxiliary_loss_mlp": 0.0127559, + "balance_loss_clip": 0.0628619, + "balance_loss_mlp": 0.01258281, + "epoch": 0.33061776642116336, + "flos": 22637951660160.0, + "grad_norm": 1.8227647554707032, + "language_loss": 0.76843095, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84594989, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.1730957, + "step": 5499, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06475572, + "auxiliary_loss_mlp": 0.01277032, + "balance_loss_clip": 0.06287447, + "balance_loss_mlp": 0.01260832, + "epoch": 0.3306778896738313, + "flos": 25783536660480.0, + "grad_norm": 1.5377855738322084, + "language_loss": 0.79203349, + "learning_rate": 3.124240841300681e-06, + "loss": 0.86955953, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16186523, + "step": 5500, + "time_per_iteration": 2.5970370769500732 + }, + { + "auxiliary_loss_clip": 0.0648918, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298861, + "balance_loss_mlp": 0.01257544, + "epoch": 0.33073801292649935, + "flos": 36949566625920.0, + "grad_norm": 1.9211086255091194, + "language_loss": 0.66916561, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7468102, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17724609, + "step": 5501, + "time_per_iteration": 2.687847375869751 + }, + { + "auxiliary_loss_clip": 0.06481969, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06291866, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3307981361791673, + "flos": 12972465411840.0, + "grad_norm": 2.0893698607967957, + "language_loss": 0.77978551, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.85733795, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.17504883, + "step": 5502, + "time_per_iteration": 2.500303268432617 + }, + { + "auxiliary_loss_clip": 0.06481159, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06290131, + "balance_loss_mlp": 0.01256424, + "epoch": 0.3308582594318353, + "flos": 25381420116480.0, + "grad_norm": 1.7450780858535315, + "language_loss": 0.72841054, + "learning_rate": 3.123274330355824e-06, + "loss": 0.80596423, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.17773438, + "step": 5503, + "time_per_iteration": 2.5851874351501465 + }, + { + "auxiliary_loss_clip": 0.06475106, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.06287622, + "balance_loss_mlp": 0.01257769, + "epoch": 0.33091838268450324, + "flos": 26475738439680.0, + "grad_norm": 1.4901464435255347, + "language_loss": 0.7565586, + "learning_rate": 3.12295207483523e-06, + "loss": 0.83405411, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16674805, + "step": 5504, + "time_per_iteration": 2.5670559406280518 + }, + { + "auxiliary_loss_clip": 0.06476955, + "auxiliary_loss_mlp": 0.01276594, + "balance_loss_clip": 0.06289346, + "balance_loss_mlp": 0.01261025, + "epoch": 0.3309785059371712, + "flos": 24977836126080.0, + "grad_norm": 1.5646403370775293, + "language_loss": 0.70214427, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.77967972, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15545654, + "step": 5505, + "time_per_iteration": 2.628267288208008 + }, + { + "auxiliary_loss_clip": 0.06474259, + "auxiliary_loss_mlp": 0.01275018, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01258543, + "epoch": 0.3310386291898392, + "flos": 20452585322880.0, + "grad_norm": 1.7982072656373813, + "language_loss": 0.8240785, + "learning_rate": 3.122307436058899e-06, + "loss": 0.90157127, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.16467285, + "step": 5506, + "time_per_iteration": 4.10949444770813 + }, + { + "auxiliary_loss_clip": 0.06476486, + "auxiliary_loss_mlp": 0.01275135, + "balance_loss_clip": 0.0628888, + "balance_loss_mlp": 0.01258428, + "epoch": 0.33109875244250714, + "flos": 23188926182400.0, + "grad_norm": 1.740251919086934, + "language_loss": 0.79860532, + "learning_rate": 3.121985052827606e-06, + "loss": 0.87612152, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16705322, + "step": 5507, + "time_per_iteration": 4.12217903137207 + }, + { + "auxiliary_loss_clip": 0.06468768, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06281893, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3311588756951751, + "flos": 24174902776320.0, + "grad_norm": 1.6433149866128014, + "language_loss": 0.71967649, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.79713166, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.1628418, + "step": 5508, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.06468692, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 0.06284875, + "balance_loss_mlp": 0.01256788, + "epoch": 0.33121899894784307, + "flos": 28152994417920.0, + "grad_norm": 1.6757523088462936, + "language_loss": 0.71588784, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79329687, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15429688, + "step": 5509, + "time_per_iteration": 3.976996660232544 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06286702, + "balance_loss_mlp": 0.01264396, + "epoch": 0.33127912220051103, + "flos": 29574979332480.0, + "grad_norm": 1.5753317257606638, + "language_loss": 0.73806137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.81557631, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15460205, + "step": 5510, + "time_per_iteration": 2.576838731765747 + }, + { + "auxiliary_loss_clip": 0.06473264, + "auxiliary_loss_mlp": 0.01276647, + "balance_loss_clip": 0.06286872, + "balance_loss_mlp": 0.01261019, + "epoch": 0.331339245453179, + "flos": 14434086107520.0, + "grad_norm": 2.529546935928515, + "language_loss": 0.88507652, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96257567, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15612793, + "step": 5511, + "time_per_iteration": 2.550442695617676 + }, + { + "auxiliary_loss_clip": 0.06464168, + "auxiliary_loss_mlp": 0.01275515, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01260494, + "epoch": 0.33139936870584696, + "flos": 20893499107200.0, + "grad_norm": 1.6341387009287651, + "language_loss": 0.73559558, + "learning_rate": 3.12037249872891e-06, + "loss": 0.81299245, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15026855, + "step": 5512, + "time_per_iteration": 2.5596871376037598 + }, + { + "auxiliary_loss_clip": 0.06468001, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06286225, + "balance_loss_mlp": 0.01262438, + "epoch": 0.33145949195851493, + "flos": 36293352975360.0, + "grad_norm": 1.8738374179289, + "language_loss": 0.72677827, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.80424166, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15905762, + "step": 5513, + "time_per_iteration": 4.148774147033691 + }, + { + "auxiliary_loss_clip": 0.06472933, + "auxiliary_loss_mlp": 0.01275876, + "balance_loss_clip": 0.06284368, + "balance_loss_mlp": 0.0125958, + "epoch": 0.33151961521118295, + "flos": 14284431515520.0, + "grad_norm": 1.8311253656567958, + "language_loss": 0.69026303, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.7677511, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16296387, + "step": 5514, + "time_per_iteration": 2.486818313598633 + }, + { + "auxiliary_loss_clip": 0.06477968, + "auxiliary_loss_mlp": 0.0127816, + "balance_loss_clip": 0.06291951, + "balance_loss_mlp": 0.01261089, + "epoch": 0.3315797384638509, + "flos": 20780126133120.0, + "grad_norm": 1.9656560392088134, + "language_loss": 0.66393441, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.74149573, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.17053223, + "step": 5515, + "time_per_iteration": 2.531658411026001 + }, + { + "auxiliary_loss_clip": 0.06473279, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01258885, + "epoch": 0.3316398617165189, + "flos": 24686115736320.0, + "grad_norm": 3.8914339391091732, + "language_loss": 0.69369388, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.77117789, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16235352, + "step": 5516, + "time_per_iteration": 2.5392425060272217 + }, + { + "auxiliary_loss_clip": 0.06476592, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.0125959, + "epoch": 0.33169998496918685, + "flos": 18593879328000.0, + "grad_norm": 2.757231582138207, + "language_loss": 0.80914545, + "learning_rate": 3.118758882514359e-06, + "loss": 0.88666099, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.15368652, + "step": 5517, + "time_per_iteration": 2.4851818084716797 + }, + { + "auxiliary_loss_clip": 0.06465174, + "auxiliary_loss_mlp": 0.01279818, + "balance_loss_clip": 0.06284687, + "balance_loss_mlp": 0.01264142, + "epoch": 0.3317601082218548, + "flos": 20199871808640.0, + "grad_norm": 1.6705032998917397, + "language_loss": 0.74656814, + "learning_rate": 3.118436031952143e-06, + "loss": 0.82401806, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15686035, + "step": 5518, + "time_per_iteration": 2.518036127090454 + }, + { + "auxiliary_loss_clip": 0.06372921, + "auxiliary_loss_mlp": 0.01283465, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01279764, + "epoch": 0.3318202314745228, + "flos": 68995119265920.0, + "grad_norm": 0.7149144856696655, + "language_loss": 0.54263318, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.61919701, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03692627, + "step": 5519, + "time_per_iteration": 3.246586322784424 + }, + { + "auxiliary_loss_clip": 0.06472577, + "auxiliary_loss_mlp": 0.01276695, + "balance_loss_clip": 0.06288108, + "balance_loss_mlp": 0.0125966, + "epoch": 0.33188035472719074, + "flos": 21505381148160.0, + "grad_norm": 2.182658812554146, + "language_loss": 0.79452467, + "learning_rate": 3.117790203606336e-06, + "loss": 0.87201744, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.17028809, + "step": 5520, + "time_per_iteration": 2.517853260040283 + }, + { + "auxiliary_loss_clip": 0.06465811, + "auxiliary_loss_mlp": 0.01271287, + "balance_loss_clip": 0.06283027, + "balance_loss_mlp": 0.01256279, + "epoch": 0.3319404779798587, + "flos": 28877033548800.0, + "grad_norm": 1.8300903967069966, + "language_loss": 0.77067709, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.84804809, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15002441, + "step": 5521, + "time_per_iteration": 2.555697441101074 + }, + { + "auxiliary_loss_clip": 0.06478226, + "auxiliary_loss_mlp": 0.01278256, + "balance_loss_clip": 0.06288885, + "balance_loss_mlp": 0.01261542, + "epoch": 0.33200060123252667, + "flos": 23083770908160.0, + "grad_norm": 1.9119948906690396, + "language_loss": 0.70441258, + "learning_rate": 3.117144205713664e-06, + "loss": 0.78197736, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16699219, + "step": 5522, + "time_per_iteration": 2.5673933029174805 + }, + { + "auxiliary_loss_clip": 0.06474358, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06290573, + "balance_loss_mlp": 0.01255255, + "epoch": 0.33206072448519464, + "flos": 21148895952000.0, + "grad_norm": 1.6906348218339255, + "language_loss": 0.74640656, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82386148, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 5523, + "time_per_iteration": 2.516275405883789 + }, + { + "auxiliary_loss_clip": 0.06473421, + "auxiliary_loss_mlp": 0.01271212, + "balance_loss_clip": 0.06292297, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3321208477378626, + "flos": 13084161304320.0, + "grad_norm": 2.1726495268835024, + "language_loss": 0.82172406, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8991704, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15454102, + "step": 5524, + "time_per_iteration": 2.557941198348999 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289522, + "balance_loss_mlp": 0.01257251, + "epoch": 0.33218097099053057, + "flos": 21221836531200.0, + "grad_norm": 1.6566666481357326, + "language_loss": 0.83100772, + "learning_rate": 3.116174891188636e-06, + "loss": 0.90847051, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15649414, + "step": 5525, + "time_per_iteration": 2.527944564819336 + }, + { + "auxiliary_loss_clip": 0.06379532, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06292765, + "balance_loss_mlp": 0.01261484, + "epoch": 0.33224109424319853, + "flos": 64369954068480.0, + "grad_norm": 0.7407224947932968, + "language_loss": 0.52533764, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.60178727, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03945923, + "step": 5526, + "time_per_iteration": 3.1679162979125977 + }, + { + "auxiliary_loss_clip": 0.0647909, + "auxiliary_loss_mlp": 0.01274604, + "balance_loss_clip": 0.06291543, + "balance_loss_mlp": 0.01258391, + "epoch": 0.33230121749586655, + "flos": 17351457713280.0, + "grad_norm": 1.970764365513445, + "language_loss": 0.79041827, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86795521, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 5527, + "time_per_iteration": 2.5327274799346924 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01278538, + "balance_loss_clip": 0.06294803, + "balance_loss_mlp": 0.01263458, + "epoch": 0.3323613407485345, + "flos": 21003517918080.0, + "grad_norm": 1.6591522480418575, + "language_loss": 0.72383821, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80139363, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15063477, + "step": 5528, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.06477713, + "auxiliary_loss_mlp": 0.01274869, + "balance_loss_clip": 0.06292165, + "balance_loss_mlp": 0.01259396, + "epoch": 0.3324214640012025, + "flos": 13157688862080.0, + "grad_norm": 1.8543805866880412, + "language_loss": 0.8336091, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.91113496, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.15466309, + "step": 5529, + "time_per_iteration": 2.5001087188720703 + }, + { + "auxiliary_loss_clip": 0.06479646, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254587, + "epoch": 0.33248158725387045, + "flos": 22280124798720.0, + "grad_norm": 1.7380748666321508, + "language_loss": 0.70133483, + "learning_rate": 3.114558520634423e-06, + "loss": 0.77883273, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.15551758, + "step": 5530, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01275357, + "balance_loss_clip": 0.06291899, + "balance_loss_mlp": 0.01258751, + "epoch": 0.3325417105065384, + "flos": 20747324459520.0, + "grad_norm": 2.7342028000668552, + "language_loss": 0.77694213, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.85449082, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16589355, + "step": 5531, + "time_per_iteration": 2.5307323932647705 + }, + { + "auxiliary_loss_clip": 0.06477839, + "auxiliary_loss_mlp": 0.01280766, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01263552, + "epoch": 0.3326018337592064, + "flos": 24797476212480.0, + "grad_norm": 1.9473942094883194, + "language_loss": 0.73779702, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.81538308, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17211914, + "step": 5532, + "time_per_iteration": 2.5989890098571777 + }, + { + "auxiliary_loss_clip": 0.06472681, + "auxiliary_loss_mlp": 0.01278728, + "balance_loss_clip": 0.06288014, + "balance_loss_mlp": 0.01263147, + "epoch": 0.33266195701187434, + "flos": 14506942832640.0, + "grad_norm": 1.825417572799306, + "language_loss": 0.66042602, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.73794013, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15576172, + "step": 5533, + "time_per_iteration": 2.47566294670105 + }, + { + "auxiliary_loss_clip": 0.06474279, + "auxiliary_loss_mlp": 0.012755, + "balance_loss_clip": 0.06289338, + "balance_loss_mlp": 0.01258954, + "epoch": 0.3327220802645423, + "flos": 15309792328320.0, + "grad_norm": 1.6677538876536442, + "language_loss": 0.71568084, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79317868, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16552734, + "step": 5534, + "time_per_iteration": 2.5140762329101562 + }, + { + "auxiliary_loss_clip": 0.06474573, + "auxiliary_loss_mlp": 0.01273002, + "balance_loss_clip": 0.06290095, + "balance_loss_mlp": 0.01257088, + "epoch": 0.3327822035172103, + "flos": 23484336151680.0, + "grad_norm": 1.635346823223845, + "language_loss": 0.67885029, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.75632608, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15917969, + "step": 5535, + "time_per_iteration": 2.522270917892456 + }, + { + "auxiliary_loss_clip": 0.0647034, + "auxiliary_loss_mlp": 0.01273438, + "balance_loss_clip": 0.06284929, + "balance_loss_mlp": 0.01257547, + "epoch": 0.33284232676987824, + "flos": 25381587824640.0, + "grad_norm": 2.3715726564419155, + "language_loss": 0.72782886, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80526668, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5536, + "time_per_iteration": 2.5831825733184814 + }, + { + "auxiliary_loss_clip": 0.06470598, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01255464, + "epoch": 0.3329024500225462, + "flos": 23700851902080.0, + "grad_norm": 1.6831469867631554, + "language_loss": 0.81958938, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89700401, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15405273, + "step": 5537, + "time_per_iteration": 2.520211935043335 + }, + { + "auxiliary_loss_clip": 0.06473641, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06284811, + "balance_loss_mlp": 0.01253799, + "epoch": 0.33296257327521417, + "flos": 31731317429760.0, + "grad_norm": 1.8576028267218818, + "language_loss": 0.71933794, + "learning_rate": 3.111970130648789e-06, + "loss": 0.79677737, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16491699, + "step": 5538, + "time_per_iteration": 2.6061229705810547 + }, + { + "auxiliary_loss_clip": 0.06466128, + "auxiliary_loss_mlp": 0.01271828, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01256784, + "epoch": 0.33302269652788213, + "flos": 22750863436800.0, + "grad_norm": 1.8542539639588682, + "language_loss": 0.75063813, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82801771, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15039062, + "step": 5539, + "time_per_iteration": 2.5176634788513184 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06284824, + "balance_loss_mlp": 0.01255739, + "epoch": 0.33308281978055015, + "flos": 11478546167040.0, + "grad_norm": 1.8040392528519402, + "language_loss": 0.71489209, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79235446, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16699219, + "step": 5540, + "time_per_iteration": 2.536752939224243 + }, + { + "auxiliary_loss_clip": 0.06462967, + "auxiliary_loss_mlp": 0.01271775, + "balance_loss_clip": 0.06280267, + "balance_loss_mlp": 0.01256576, + "epoch": 0.3331429430332181, + "flos": 38222274291840.0, + "grad_norm": 3.095851444688792, + "language_loss": 0.60970843, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.68705589, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15197754, + "step": 5541, + "time_per_iteration": 2.6592354774475098 + }, + { + "auxiliary_loss_clip": 0.06472225, + "auxiliary_loss_mlp": 0.01276024, + "balance_loss_clip": 0.06284402, + "balance_loss_mlp": 0.01259872, + "epoch": 0.3332030662858861, + "flos": 22535270081280.0, + "grad_norm": 1.770287690308821, + "language_loss": 0.69711685, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77459931, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16149902, + "step": 5542, + "time_per_iteration": 2.5427184104919434 + }, + { + "auxiliary_loss_clip": 0.06473213, + "auxiliary_loss_mlp": 0.01276881, + "balance_loss_clip": 0.06286451, + "balance_loss_mlp": 0.01261658, + "epoch": 0.33326318953855405, + "flos": 16003293845760.0, + "grad_norm": 1.6729265705607443, + "language_loss": 0.75927889, + "learning_rate": 3.110351016113414e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15222168, + "step": 5543, + "time_per_iteration": 2.4745616912841797 + }, + { + "auxiliary_loss_clip": 0.06475509, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06287046, + "balance_loss_mlp": 0.01260281, + "epoch": 0.333323312791222, + "flos": 25600661124480.0, + "grad_norm": 1.7242995092969657, + "language_loss": 0.75332278, + "learning_rate": 3.110027066843348e-06, + "loss": 0.83084685, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.16601562, + "step": 5544, + "time_per_iteration": 2.565572738647461 + }, + { + "auxiliary_loss_clip": 0.06467521, + "auxiliary_loss_mlp": 0.01270286, + "balance_loss_clip": 0.06283619, + "balance_loss_mlp": 0.01254848, + "epoch": 0.33338343604389, + "flos": 25126652177280.0, + "grad_norm": 1.4364166263140996, + "language_loss": 0.71556139, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.1541748, + "step": 5545, + "time_per_iteration": 3.9951117038726807 + }, + { + "auxiliary_loss_clip": 0.0646642, + "auxiliary_loss_mlp": 0.01275763, + "balance_loss_clip": 0.0628425, + "balance_loss_mlp": 0.01260063, + "epoch": 0.33344355929655795, + "flos": 16953114602880.0, + "grad_norm": 1.5928525652704049, + "language_loss": 0.69892073, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.77634251, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15722656, + "step": 5546, + "time_per_iteration": 4.069552659988403 + }, + { + "auxiliary_loss_clip": 0.06469481, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 0.06280591, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3335036825492259, + "flos": 27896675178240.0, + "grad_norm": 1.5973320112543803, + "language_loss": 0.65030676, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.72773933, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16455078, + "step": 5547, + "time_per_iteration": 2.578320026397705 + }, + { + "auxiliary_loss_clip": 0.06468174, + "auxiliary_loss_mlp": 0.01274769, + "balance_loss_clip": 0.06284153, + "balance_loss_mlp": 0.01258736, + "epoch": 0.3335638058018939, + "flos": 16184995424640.0, + "grad_norm": 1.9789366990729325, + "language_loss": 0.85645819, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.9338876, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.16040039, + "step": 5548, + "time_per_iteration": 3.917346477508545 + }, + { + "auxiliary_loss_clip": 0.06473708, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06283803, + "balance_loss_mlp": 0.01259264, + "epoch": 0.33362392905456184, + "flos": 39905651617920.0, + "grad_norm": 1.927393858225298, + "language_loss": 0.74956143, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.82705271, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16149902, + "step": 5549, + "time_per_iteration": 2.662152051925659 + }, + { + "auxiliary_loss_clip": 0.0647629, + "auxiliary_loss_mlp": 0.01276829, + "balance_loss_clip": 0.06287523, + "balance_loss_mlp": 0.0125946, + "epoch": 0.3336840523072298, + "flos": 44280954339840.0, + "grad_norm": 3.284743863263659, + "language_loss": 0.68874133, + "learning_rate": 3.108082487713921e-06, + "loss": 0.76627254, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.17370605, + "step": 5550, + "time_per_iteration": 2.703099250793457 + }, + { + "auxiliary_loss_clip": 0.06476407, + "auxiliary_loss_mlp": 0.01275354, + "balance_loss_clip": 0.06290508, + "balance_loss_mlp": 0.01259488, + "epoch": 0.33374417555989777, + "flos": 15091054444800.0, + "grad_norm": 2.6465919002896436, + "language_loss": 0.60992151, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6874392, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5551, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.06471356, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06287605, + "balance_loss_mlp": 0.01259985, + "epoch": 0.33380429881256574, + "flos": 15854226232320.0, + "grad_norm": 1.6170207033712265, + "language_loss": 0.71155131, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.78901786, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15307617, + "step": 5552, + "time_per_iteration": 4.0786826610565186 + }, + { + "auxiliary_loss_clip": 0.06476602, + "auxiliary_loss_mlp": 0.01270143, + "balance_loss_clip": 0.06291272, + "balance_loss_mlp": 0.01255182, + "epoch": 0.33386442206523376, + "flos": 13485439307520.0, + "grad_norm": 2.244029622012826, + "language_loss": 0.83864999, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91611743, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.1496582, + "step": 5553, + "time_per_iteration": 2.603986978530884 + }, + { + "auxiliary_loss_clip": 0.06474789, + "auxiliary_loss_mlp": 0.0127187, + "balance_loss_clip": 0.06288507, + "balance_loss_mlp": 0.01255562, + "epoch": 0.3339245453179017, + "flos": 16696250311680.0, + "grad_norm": 2.098616423404285, + "language_loss": 0.81424135, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89170802, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16320801, + "step": 5554, + "time_per_iteration": 2.4884121417999268 + }, + { + "auxiliary_loss_clip": 0.06477922, + "auxiliary_loss_mlp": 0.01277907, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01261647, + "epoch": 0.3339846685705697, + "flos": 24617954839680.0, + "grad_norm": 1.4369599322997015, + "language_loss": 0.81866252, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89622086, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.16259766, + "step": 5555, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.06478396, + "auxiliary_loss_mlp": 0.01271619, + "balance_loss_clip": 0.06292441, + "balance_loss_mlp": 0.01256325, + "epoch": 0.33404479182323765, + "flos": 30961311534720.0, + "grad_norm": 1.7387044564853729, + "language_loss": 0.74836755, + "learning_rate": 3.106136395915099e-06, + "loss": 0.82586771, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.1529541, + "step": 5556, + "time_per_iteration": 2.5936899185180664 + }, + { + "auxiliary_loss_clip": 0.06476042, + "auxiliary_loss_mlp": 0.01275785, + "balance_loss_clip": 0.06293188, + "balance_loss_mlp": 0.01260562, + "epoch": 0.3341049150759056, + "flos": 23519988864000.0, + "grad_norm": 1.3815052276914728, + "language_loss": 0.82545519, + "learning_rate": 3.105811900403391e-06, + "loss": 0.90297353, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15222168, + "step": 5557, + "time_per_iteration": 2.5862598419189453 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01279505, + "balance_loss_clip": 0.0629133, + "balance_loss_mlp": 0.01264067, + "epoch": 0.3341650383285736, + "flos": 24034052862720.0, + "grad_norm": 2.760917503655681, + "language_loss": 0.80188966, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.87946206, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.15429688, + "step": 5558, + "time_per_iteration": 2.596344232559204 + }, + { + "auxiliary_loss_clip": 0.06475051, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06287208, + "balance_loss_mlp": 0.01267646, + "epoch": 0.33422516158124155, + "flos": 24909255959040.0, + "grad_norm": 1.7423955567809428, + "language_loss": 0.81954122, + "learning_rate": 3.105162783594788e-06, + "loss": 0.8971197, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1517334, + "step": 5559, + "time_per_iteration": 2.587005376815796 + }, + { + "auxiliary_loss_clip": 0.06467593, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 0.06286522, + "balance_loss_mlp": 0.01265224, + "epoch": 0.3342852848339095, + "flos": 18339404878080.0, + "grad_norm": 2.1220335034517093, + "language_loss": 0.72058392, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.79805756, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.14550781, + "step": 5560, + "time_per_iteration": 2.536546230316162 + }, + { + "auxiliary_loss_clip": 0.06481705, + "auxiliary_loss_mlp": 0.01285397, + "balance_loss_clip": 0.06292065, + "balance_loss_mlp": 0.01269458, + "epoch": 0.3343454080865775, + "flos": 30054690357120.0, + "grad_norm": 1.596178779859494, + "language_loss": 0.75386882, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.83153981, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.15930176, + "step": 5561, + "time_per_iteration": 2.672700881958008 + }, + { + "auxiliary_loss_clip": 0.06477022, + "auxiliary_loss_mlp": 0.01277798, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01262551, + "epoch": 0.33440553133924544, + "flos": 16404362213760.0, + "grad_norm": 1.6462526862455489, + "language_loss": 0.70108986, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77863806, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15246582, + "step": 5562, + "time_per_iteration": 2.501317024230957 + }, + { + "auxiliary_loss_clip": 0.06472157, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 0.06287345, + "balance_loss_mlp": 0.01265396, + "epoch": 0.3344656545919134, + "flos": 24248723823360.0, + "grad_norm": 1.5361546803562123, + "language_loss": 0.65648419, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.7340101, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15026855, + "step": 5563, + "time_per_iteration": 2.5564165115356445 + }, + { + "auxiliary_loss_clip": 0.06477885, + "auxiliary_loss_mlp": 0.01282181, + "balance_loss_clip": 0.06290222, + "balance_loss_mlp": 0.01264752, + "epoch": 0.3345257778445814, + "flos": 52130431048320.0, + "grad_norm": 1.3531042812140452, + "language_loss": 0.74246049, + "learning_rate": 3.103539258400766e-06, + "loss": 0.82006115, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.17431641, + "step": 5564, + "time_per_iteration": 2.810534715652466 + }, + { + "auxiliary_loss_clip": 0.06356741, + "auxiliary_loss_mlp": 0.01295627, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01291562, + "epoch": 0.33458590109724934, + "flos": 68066528319360.0, + "grad_norm": 0.78222915395806, + "language_loss": 0.55275309, + "learning_rate": 3.103214427773745e-06, + "loss": 0.62927675, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04064941, + "step": 5565, + "time_per_iteration": 3.1279821395874023 + }, + { + "auxiliary_loss_clip": 0.06471252, + "auxiliary_loss_mlp": 0.01279791, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01264163, + "epoch": 0.3346460243499173, + "flos": 37423869062400.0, + "grad_norm": 1.705115292174207, + "language_loss": 0.65565574, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73316622, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15625, + "step": 5566, + "time_per_iteration": 2.712435245513916 + }, + { + "auxiliary_loss_clip": 0.0647177, + "auxiliary_loss_mlp": 0.01282122, + "balance_loss_clip": 0.06289912, + "balance_loss_mlp": 0.01266529, + "epoch": 0.3347061476025853, + "flos": 18703269233280.0, + "grad_norm": 1.6655571733561654, + "language_loss": 0.77372861, + "learning_rate": 3.102564641030016e-06, + "loss": 0.85126758, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.15588379, + "step": 5567, + "time_per_iteration": 2.4871251583099365 + }, + { + "auxiliary_loss_clip": 0.06471208, + "auxiliary_loss_mlp": 0.01275703, + "balance_loss_clip": 0.06285998, + "balance_loss_mlp": 0.01259491, + "epoch": 0.3347662708552533, + "flos": 13922957001600.0, + "grad_norm": 1.6558873666299474, + "language_loss": 0.77099127, + "learning_rate": 3.102239684937949e-06, + "loss": 0.84846038, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16223145, + "step": 5568, + "time_per_iteration": 2.5343427658081055 + }, + { + "auxiliary_loss_clip": 0.06472506, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06286565, + "balance_loss_mlp": 0.01262136, + "epoch": 0.33482639410792125, + "flos": 19755645788160.0, + "grad_norm": 1.9310298365294178, + "language_loss": 0.71334505, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7908479, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15643311, + "step": 5569, + "time_per_iteration": 2.5091118812561035 + }, + { + "auxiliary_loss_clip": 0.06473939, + "auxiliary_loss_mlp": 0.01271857, + "balance_loss_clip": 0.06285448, + "balance_loss_mlp": 0.01256479, + "epoch": 0.3348865173605892, + "flos": 16107820214400.0, + "grad_norm": 1.931700529164995, + "language_loss": 0.90211284, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97957081, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15380859, + "step": 5570, + "time_per_iteration": 2.6067447662353516 + }, + { + "auxiliary_loss_clip": 0.06465288, + "auxiliary_loss_mlp": 0.01272678, + "balance_loss_clip": 0.06282274, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3349466406132572, + "flos": 25015836752640.0, + "grad_norm": 1.5216158426421846, + "language_loss": 0.79890078, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87628049, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15405273, + "step": 5571, + "time_per_iteration": 2.5423781871795654 + }, + { + "auxiliary_loss_clip": 0.06342317, + "auxiliary_loss_mlp": 0.01254883, + "balance_loss_clip": 0.06257176, + "balance_loss_mlp": 0.01251411, + "epoch": 0.33500676386592515, + "flos": 54340058413440.0, + "grad_norm": 0.8278358272998855, + "language_loss": 0.55695772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.63292974, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.03482056, + "step": 5572, + "time_per_iteration": 3.1027615070343018 + }, + { + "auxiliary_loss_clip": 0.06472763, + "auxiliary_loss_mlp": 0.0127696, + "balance_loss_clip": 0.06287524, + "balance_loss_mlp": 0.01261677, + "epoch": 0.3350668871185931, + "flos": 26804620915200.0, + "grad_norm": 1.9863197052332227, + "language_loss": 0.78856999, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.86606717, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15283203, + "step": 5573, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.06473139, + "auxiliary_loss_mlp": 0.01274748, + "balance_loss_clip": 0.06286675, + "balance_loss_mlp": 0.01257999, + "epoch": 0.3351270103712611, + "flos": 33518885708160.0, + "grad_norm": 2.2174625445936256, + "language_loss": 0.72959399, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.80707288, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16723633, + "step": 5574, + "time_per_iteration": 2.660301923751831 + }, + { + "auxiliary_loss_clip": 0.06465638, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284496, + "balance_loss_mlp": 0.01256042, + "epoch": 0.33518713362392905, + "flos": 26513613285120.0, + "grad_norm": 1.6818935039401424, + "language_loss": 0.88364851, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.96102208, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15661621, + "step": 5575, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.0648465, + "auxiliary_loss_mlp": 0.01276363, + "balance_loss_clip": 0.06290504, + "balance_loss_mlp": 0.01259316, + "epoch": 0.335247256876597, + "flos": 17237078490240.0, + "grad_norm": 1.9893319880263207, + "language_loss": 0.83043218, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.90804225, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17053223, + "step": 5576, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.06478332, + "auxiliary_loss_mlp": 0.01275534, + "balance_loss_clip": 0.06288211, + "balance_loss_mlp": 0.01259095, + "epoch": 0.335307380129265, + "flos": 25636397690880.0, + "grad_norm": 2.0001339744496622, + "language_loss": 0.73279572, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.16442871, + "step": 5577, + "time_per_iteration": 2.575026750564575 + }, + { + "auxiliary_loss_clip": 0.06475031, + "auxiliary_loss_mlp": 0.01274987, + "balance_loss_clip": 0.0628825, + "balance_loss_mlp": 0.01257689, + "epoch": 0.33536750338193294, + "flos": 19685765882880.0, + "grad_norm": 1.6019428598408136, + "language_loss": 0.82233781, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.89983797, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17297363, + "step": 5578, + "time_per_iteration": 2.544978380203247 + }, + { + "auxiliary_loss_clip": 0.06461956, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01262907, + "epoch": 0.3354276266346009, + "flos": 18338482483200.0, + "grad_norm": 1.788420802177993, + "language_loss": 0.72050315, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.79790771, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15582275, + "step": 5579, + "time_per_iteration": 2.50080943107605 + }, + { + "auxiliary_loss_clip": 0.06478497, + "auxiliary_loss_mlp": 0.01282646, + "balance_loss_clip": 0.06290549, + "balance_loss_mlp": 0.01266898, + "epoch": 0.3354877498872689, + "flos": 17864389681920.0, + "grad_norm": 2.052679713623706, + "language_loss": 0.81401342, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.89162487, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15734863, + "step": 5580, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.06473458, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 0.06284851, + "balance_loss_mlp": 0.01263691, + "epoch": 0.3355478731399369, + "flos": 24724703341440.0, + "grad_norm": 1.6024353673136869, + "language_loss": 0.78190315, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.85943961, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.16485596, + "step": 5581, + "time_per_iteration": 2.539208173751831 + }, + { + "auxiliary_loss_clip": 0.06482114, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06289735, + "balance_loss_mlp": 0.01259084, + "epoch": 0.33560799639260486, + "flos": 16879628972160.0, + "grad_norm": 2.359779356701633, + "language_loss": 0.74923486, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.8268224, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.17565918, + "step": 5582, + "time_per_iteration": 2.5489563941955566 + }, + { + "auxiliary_loss_clip": 0.06478906, + "auxiliary_loss_mlp": 0.01276582, + "balance_loss_clip": 0.06287926, + "balance_loss_mlp": 0.01260191, + "epoch": 0.3356681196452728, + "flos": 18339530659200.0, + "grad_norm": 1.5985505462491367, + "language_loss": 0.82591236, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90346718, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.16369629, + "step": 5583, + "time_per_iteration": 2.4985439777374268 + }, + { + "auxiliary_loss_clip": 0.06466989, + "auxiliary_loss_mlp": 0.01276424, + "balance_loss_clip": 0.06282677, + "balance_loss_mlp": 0.01260664, + "epoch": 0.3357282428979408, + "flos": 34759127116800.0, + "grad_norm": 1.8261350586664176, + "language_loss": 0.77844834, + "learning_rate": 3.097034711451581e-06, + "loss": 0.85588253, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15771484, + "step": 5584, + "time_per_iteration": 2.649090051651001 + }, + { + "auxiliary_loss_clip": 0.06475179, + "auxiliary_loss_mlp": 0.01274752, + "balance_loss_clip": 0.06285385, + "balance_loss_mlp": 0.01259427, + "epoch": 0.33578836615060875, + "flos": 21586539427200.0, + "grad_norm": 1.6814695059799305, + "language_loss": 0.76339197, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84089124, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.15313721, + "step": 5585, + "time_per_iteration": 5.408076763153076 + }, + { + "auxiliary_loss_clip": 0.06463687, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.0126054, + "epoch": 0.3358484894032767, + "flos": 24536377290240.0, + "grad_norm": 1.7085225722674646, + "language_loss": 0.78121984, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.85862964, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16760254, + "step": 5586, + "time_per_iteration": 2.5785536766052246 + }, + { + "auxiliary_loss_clip": 0.06482486, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06290784, + "balance_loss_mlp": 0.01254902, + "epoch": 0.3359086126559447, + "flos": 22462161793920.0, + "grad_norm": 1.9607494340110725, + "language_loss": 0.81952178, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.89705908, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.16357422, + "step": 5587, + "time_per_iteration": 3.9456732273101807 + }, + { + "auxiliary_loss_clip": 0.06460288, + "auxiliary_loss_mlp": 0.01274939, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01259692, + "epoch": 0.33596873590861265, + "flos": 16549069415040.0, + "grad_norm": 1.7386991231776667, + "language_loss": 0.67118108, + "learning_rate": 3.095731802118677e-06, + "loss": 0.74853337, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 5588, + "time_per_iteration": 2.6328773498535156 + }, + { + "auxiliary_loss_clip": 0.06471635, + "auxiliary_loss_mlp": 0.01272286, + "balance_loss_clip": 0.0628484, + "balance_loss_mlp": 0.01255215, + "epoch": 0.3360288591612806, + "flos": 31183864778880.0, + "grad_norm": 2.547244730124186, + "language_loss": 0.70319438, + "learning_rate": 3.095405970878919e-06, + "loss": 0.78063357, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17077637, + "step": 5589, + "time_per_iteration": 2.631972074508667 + }, + { + "auxiliary_loss_clip": 0.06473772, + "auxiliary_loss_mlp": 0.01270331, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3360889824139486, + "flos": 23703828721920.0, + "grad_norm": 1.7722032929069027, + "language_loss": 0.67818141, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.75562239, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15258789, + "step": 5590, + "time_per_iteration": 2.582160711288452 + }, + { + "auxiliary_loss_clip": 0.0646477, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 0.06283349, + "balance_loss_mlp": 0.01257972, + "epoch": 0.33614910566661654, + "flos": 19324207514880.0, + "grad_norm": 1.8733623292805037, + "language_loss": 0.73821473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.81559563, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15344238, + "step": 5591, + "time_per_iteration": 2.5325355529785156 + }, + { + "auxiliary_loss_clip": 0.06462986, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06280106, + "balance_loss_mlp": 0.01254945, + "epoch": 0.3362092289192845, + "flos": 16477889771520.0, + "grad_norm": 3.0838875929044036, + "language_loss": 0.70195794, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.77929366, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15637207, + "step": 5592, + "time_per_iteration": 3.919609546661377 + }, + { + "auxiliary_loss_clip": 0.06466913, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06283789, + "balance_loss_mlp": 0.01257014, + "epoch": 0.33626935217195253, + "flos": 24250484759040.0, + "grad_norm": 2.017741256836838, + "language_loss": 0.76621854, + "learning_rate": 3.094102230664423e-06, + "loss": 0.8436048, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14697266, + "step": 5593, + "time_per_iteration": 2.582902431488037 + }, + { + "auxiliary_loss_clip": 0.06476289, + "auxiliary_loss_mlp": 0.01272909, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3363294754246205, + "flos": 19724814685440.0, + "grad_norm": 3.212319882003512, + "language_loss": 0.72710228, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80459422, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.17456055, + "step": 5594, + "time_per_iteration": 2.495196580886841 + }, + { + "auxiliary_loss_clip": 0.06477273, + "auxiliary_loss_mlp": 0.01272377, + "balance_loss_clip": 0.06289684, + "balance_loss_mlp": 0.01256379, + "epoch": 0.33638959867728846, + "flos": 22602005458560.0, + "grad_norm": 1.7565144487218112, + "language_loss": 0.8009572, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.87845373, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16003418, + "step": 5595, + "time_per_iteration": 2.5639891624450684 + }, + { + "auxiliary_loss_clip": 0.06468762, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06285411, + "balance_loss_mlp": 0.01256691, + "epoch": 0.3364497219299564, + "flos": 21000834587520.0, + "grad_norm": 1.6187307873664143, + "language_loss": 0.81718135, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.89458185, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.14587402, + "step": 5596, + "time_per_iteration": 2.579089403152466 + }, + { + "auxiliary_loss_clip": 0.06470582, + "auxiliary_loss_mlp": 0.01270351, + "balance_loss_clip": 0.06285384, + "balance_loss_mlp": 0.01256034, + "epoch": 0.3365098451826244, + "flos": 25235664739200.0, + "grad_norm": 1.5539796133352632, + "language_loss": 0.76225436, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.83966368, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.14318848, + "step": 5597, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.06473622, + "auxiliary_loss_mlp": 0.01271725, + "balance_loss_clip": 0.06290761, + "balance_loss_mlp": 0.01257206, + "epoch": 0.33656996843529235, + "flos": 24578612547840.0, + "grad_norm": 1.67554812607641, + "language_loss": 0.78886169, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86631513, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14520264, + "step": 5598, + "time_per_iteration": 2.54971981048584 + }, + { + "auxiliary_loss_clip": 0.06487022, + "auxiliary_loss_mlp": 0.01275679, + "balance_loss_clip": 0.0629402, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3366300916879603, + "flos": 44101223331840.0, + "grad_norm": 1.966389459711274, + "language_loss": 0.64792764, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.7255547, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.16326904, + "step": 5599, + "time_per_iteration": 2.741544723510742 + }, + { + "auxiliary_loss_clip": 0.06483869, + "auxiliary_loss_mlp": 0.01276046, + "balance_loss_clip": 0.06290758, + "balance_loss_mlp": 0.01259118, + "epoch": 0.3366902149406283, + "flos": 13884746739840.0, + "grad_norm": 2.857086104177812, + "language_loss": 0.82787466, + "learning_rate": 3.091819088459249e-06, + "loss": 0.90547383, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.16906738, + "step": 5600, + "time_per_iteration": 2.4761526584625244 + }, + { + "auxiliary_loss_clip": 0.06480727, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289887, + "balance_loss_mlp": 0.01257255, + "epoch": 0.33675033819329625, + "flos": 16258648763520.0, + "grad_norm": 2.1921833677853853, + "language_loss": 0.83268821, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.91022456, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15649414, + "step": 5601, + "time_per_iteration": 2.5205788612365723 + }, + { + "auxiliary_loss_clip": 0.06469133, + "auxiliary_loss_mlp": 0.01269312, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01255382, + "epoch": 0.3368104614459642, + "flos": 17061498259200.0, + "grad_norm": 1.6270640398275205, + "language_loss": 0.83791035, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91529477, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.1394043, + "step": 5602, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.06479525, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06294133, + "balance_loss_mlp": 0.01258645, + "epoch": 0.3368705846986322, + "flos": 17864473536000.0, + "grad_norm": 2.666791314538914, + "language_loss": 0.69934028, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77687562, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15380859, + "step": 5603, + "time_per_iteration": 2.5512561798095703 + }, + { + "auxiliary_loss_clip": 0.0648806, + "auxiliary_loss_mlp": 0.01271029, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01255376, + "epoch": 0.33693070795130015, + "flos": 22936086887040.0, + "grad_norm": 1.5393691582180518, + "language_loss": 0.83336604, + "learning_rate": 3.090513524656898e-06, + "loss": 0.91095686, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.15661621, + "step": 5604, + "time_per_iteration": 2.542419910430908 + }, + { + "auxiliary_loss_clip": 0.06487563, + "auxiliary_loss_mlp": 0.01271201, + "balance_loss_clip": 0.06296179, + "balance_loss_mlp": 0.01255, + "epoch": 0.3369908312039681, + "flos": 22023889413120.0, + "grad_norm": 1.7290560496085086, + "language_loss": 0.74166059, + "learning_rate": 3.090187030294409e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.1619873, + "step": 5605, + "time_per_iteration": 2.551250696182251 + }, + { + "auxiliary_loss_clip": 0.0648852, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 0.06295876, + "balance_loss_mlp": 0.01253347, + "epoch": 0.33705095445663613, + "flos": 11806799736960.0, + "grad_norm": 2.683910051705504, + "language_loss": 0.84068418, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91825807, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.15515137, + "step": 5606, + "time_per_iteration": 2.4841489791870117 + }, + { + "auxiliary_loss_clip": 0.0647673, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254721, + "epoch": 0.3371110777093041, + "flos": 25053460035840.0, + "grad_norm": 1.669780314791874, + "language_loss": 0.68210214, + "learning_rate": 3.089533917561809e-06, + "loss": 0.7595638, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.14709473, + "step": 5607, + "time_per_iteration": 2.6018009185791016 + }, + { + "auxiliary_loss_clip": 0.0648887, + "auxiliary_loss_mlp": 0.01274582, + "balance_loss_clip": 0.06295381, + "balance_loss_mlp": 0.01258131, + "epoch": 0.33717120096197206, + "flos": 26586386156160.0, + "grad_norm": 1.643709475435958, + "language_loss": 0.71566343, + "learning_rate": 3.089207299216464e-06, + "loss": 0.79329789, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5608, + "time_per_iteration": 2.5980639457702637 + }, + { + "auxiliary_loss_clip": 0.06479236, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 0.06291037, + "balance_loss_mlp": 0.01258712, + "epoch": 0.33723132421464, + "flos": 15163911169920.0, + "grad_norm": 1.8781248289320855, + "language_loss": 0.79662472, + "learning_rate": 3.088880639568269e-06, + "loss": 0.87416643, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16223145, + "step": 5609, + "time_per_iteration": 2.6196935176849365 + }, + { + "auxiliary_loss_clip": 0.06480544, + "auxiliary_loss_mlp": 0.01274048, + "balance_loss_clip": 0.06290779, + "balance_loss_mlp": 0.01256262, + "epoch": 0.337291447467308, + "flos": 23442058967040.0, + "grad_norm": 1.7293742366408622, + "language_loss": 0.83075953, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.90830547, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17785645, + "step": 5610, + "time_per_iteration": 2.53485369682312 + }, + { + "auxiliary_loss_clip": 0.06471263, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06288794, + "balance_loss_mlp": 0.01254097, + "epoch": 0.33735157071997596, + "flos": 17243870670720.0, + "grad_norm": 1.916021570377688, + "language_loss": 0.82657987, + "learning_rate": 3.088227196412879e-06, + "loss": 0.90398765, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1541748, + "step": 5611, + "time_per_iteration": 2.5164084434509277 + }, + { + "auxiliary_loss_clip": 0.06478009, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.0629037, + "balance_loss_mlp": 0.01260005, + "epoch": 0.3374116939726439, + "flos": 28265025726720.0, + "grad_norm": 3.0042840390827106, + "language_loss": 0.79815799, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.87571925, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.18084717, + "step": 5612, + "time_per_iteration": 2.582742929458618 + }, + { + "auxiliary_loss_clip": 0.06476334, + "auxiliary_loss_mlp": 0.0127707, + "balance_loss_clip": 0.06288031, + "balance_loss_mlp": 0.01261597, + "epoch": 0.3374718172253119, + "flos": 35928314663040.0, + "grad_norm": 2.3711016444568003, + "language_loss": 0.69757682, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7751109, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15466309, + "step": 5613, + "time_per_iteration": 2.6553308963775635 + }, + { + "auxiliary_loss_clip": 0.06477948, + "auxiliary_loss_mlp": 0.01274833, + "balance_loss_clip": 0.06288674, + "balance_loss_mlp": 0.01259181, + "epoch": 0.33753194047797985, + "flos": 18192517470720.0, + "grad_norm": 1.7341744507496721, + "language_loss": 0.80043244, + "learning_rate": 3.087246722218144e-06, + "loss": 0.87796032, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15673828, + "step": 5614, + "time_per_iteration": 2.5162055492401123 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01274123, + "balance_loss_clip": 0.06289384, + "balance_loss_mlp": 0.01257684, + "epoch": 0.3375920637306478, + "flos": 23155621384320.0, + "grad_norm": 1.8737965791301845, + "language_loss": 0.91138643, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98892087, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16430664, + "step": 5615, + "time_per_iteration": 2.5491819381713867 + }, + { + "auxiliary_loss_clip": 0.0646698, + "auxiliary_loss_mlp": 0.01277747, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01261857, + "epoch": 0.3376521869833158, + "flos": 23118878568960.0, + "grad_norm": 1.8899714235087088, + "language_loss": 0.81227732, + "learning_rate": 3.086592866591809e-06, + "loss": 0.88972461, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.15881348, + "step": 5616, + "time_per_iteration": 2.551891803741455 + }, + { + "auxiliary_loss_clip": 0.0647929, + "auxiliary_loss_mlp": 0.01281624, + "balance_loss_clip": 0.06285349, + "balance_loss_mlp": 0.01263576, + "epoch": 0.33771231023598375, + "flos": 19279498561920.0, + "grad_norm": 1.7280186066143421, + "language_loss": 0.84097004, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91857922, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.18054199, + "step": 5617, + "time_per_iteration": 2.532703161239624 + }, + { + "auxiliary_loss_clip": 0.06466082, + "auxiliary_loss_mlp": 0.01273548, + "balance_loss_clip": 0.06279126, + "balance_loss_mlp": 0.01257073, + "epoch": 0.3377724334886517, + "flos": 18156026217600.0, + "grad_norm": 1.631465963150073, + "language_loss": 0.80857313, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8859694, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.16467285, + "step": 5618, + "time_per_iteration": 2.5592081546783447 + }, + { + "auxiliary_loss_clip": 0.06473768, + "auxiliary_loss_mlp": 0.01275311, + "balance_loss_clip": 0.06286047, + "balance_loss_mlp": 0.01258514, + "epoch": 0.3378325567413197, + "flos": 25783159317120.0, + "grad_norm": 2.0305417192076267, + "language_loss": 0.71181929, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7893101, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16809082, + "step": 5619, + "time_per_iteration": 2.5726358890533447 + }, + { + "auxiliary_loss_clip": 0.06476114, + "auxiliary_loss_mlp": 0.01271613, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01256688, + "epoch": 0.3378926799939877, + "flos": 21322254049920.0, + "grad_norm": 2.6280659122339496, + "language_loss": 0.70615005, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78362733, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.14929199, + "step": 5620, + "time_per_iteration": 2.604161500930786 + }, + { + "auxiliary_loss_clip": 0.06467394, + "auxiliary_loss_mlp": 0.0127348, + "balance_loss_clip": 0.0628472, + "balance_loss_mlp": 0.01258054, + "epoch": 0.33795280324665566, + "flos": 24906991898880.0, + "grad_norm": 2.3940060195146384, + "language_loss": 0.6847257, + "learning_rate": 3.084957506678058e-06, + "loss": 0.76213443, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1541748, + "step": 5621, + "time_per_iteration": 2.559730052947998 + }, + { + "auxiliary_loss_clip": 0.06469798, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06287812, + "balance_loss_mlp": 0.0125914, + "epoch": 0.33801292649932363, + "flos": 24760859178240.0, + "grad_norm": 1.8671152624425502, + "language_loss": 0.82685888, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.90429658, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.1484375, + "step": 5622, + "time_per_iteration": 2.5722928047180176 + }, + { + "auxiliary_loss_clip": 0.06466316, + "auxiliary_loss_mlp": 0.01274625, + "balance_loss_clip": 0.06284748, + "balance_loss_mlp": 0.01260564, + "epoch": 0.3380730497519916, + "flos": 26731177211520.0, + "grad_norm": 1.4865849557607265, + "language_loss": 0.74114043, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.81854987, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14056396, + "step": 5623, + "time_per_iteration": 2.5830907821655273 + }, + { + "auxiliary_loss_clip": 0.06389539, + "auxiliary_loss_mlp": 0.01273334, + "balance_loss_clip": 0.06299451, + "balance_loss_mlp": 0.01265943, + "epoch": 0.33813317300465956, + "flos": 70056845550720.0, + "grad_norm": 0.7132848624035326, + "language_loss": 0.54856884, + "learning_rate": 3.083975796930215e-06, + "loss": 0.62519753, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.07373047, + "step": 5624, + "time_per_iteration": 4.680114030838013 + }, + { + "auxiliary_loss_clip": 0.06475174, + "auxiliary_loss_mlp": 0.01272775, + "balance_loss_clip": 0.06285602, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3381932962573275, + "flos": 24104142403200.0, + "grad_norm": 3.6042241236842267, + "language_loss": 0.73496938, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81244886, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16687012, + "step": 5625, + "time_per_iteration": 4.002846956253052 + }, + { + "auxiliary_loss_clip": 0.06480759, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06288841, + "balance_loss_mlp": 0.01257021, + "epoch": 0.3382534195099955, + "flos": 19283775120000.0, + "grad_norm": 1.9831743515273117, + "language_loss": 0.7176404, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.79519677, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17858887, + "step": 5626, + "time_per_iteration": 2.4999427795410156 + }, + { + "auxiliary_loss_clip": 0.06468458, + "auxiliary_loss_mlp": 0.01272986, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01257739, + "epoch": 0.33831354276266346, + "flos": 25232897554560.0, + "grad_norm": 2.987617225478933, + "language_loss": 0.81275499, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.8901695, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15246582, + "step": 5627, + "time_per_iteration": 3.951984405517578 + }, + { + "auxiliary_loss_clip": 0.06478465, + "auxiliary_loss_mlp": 0.01272976, + "balance_loss_clip": 0.06288861, + "balance_loss_mlp": 0.0125668, + "epoch": 0.3383736660153314, + "flos": 23118627006720.0, + "grad_norm": 1.844905449272807, + "language_loss": 0.80405974, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.88157415, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16296387, + "step": 5628, + "time_per_iteration": 2.5670697689056396 + }, + { + "auxiliary_loss_clip": 0.06477988, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06288996, + "balance_loss_mlp": 0.0125457, + "epoch": 0.3384337892679994, + "flos": 23483874954240.0, + "grad_norm": 2.662319374226008, + "language_loss": 0.77757806, + "learning_rate": 3.082338792093254e-06, + "loss": 0.85506529, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16174316, + "step": 5629, + "time_per_iteration": 2.5463128089904785 + }, + { + "auxiliary_loss_clip": 0.06482605, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06291752, + "balance_loss_mlp": 0.01262758, + "epoch": 0.33849391252066735, + "flos": 19431626849280.0, + "grad_norm": 1.826421419331283, + "language_loss": 0.85789764, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.9355278, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.17663574, + "step": 5630, + "time_per_iteration": 2.5818262100219727 + }, + { + "auxiliary_loss_clip": 0.06476109, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06290477, + "balance_loss_mlp": 0.01260073, + "epoch": 0.3385540357733353, + "flos": 21070462930560.0, + "grad_norm": 2.179516256809373, + "language_loss": 0.72520673, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80271661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.14807129, + "step": 5631, + "time_per_iteration": 3.9340498447418213 + }, + { + "auxiliary_loss_clip": 0.06388511, + "auxiliary_loss_mlp": 0.01280567, + "balance_loss_clip": 0.06298131, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3386141590260033, + "flos": 69224772908160.0, + "grad_norm": 0.8339652565495183, + "language_loss": 0.56105018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.63774097, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.08361816, + "step": 5632, + "time_per_iteration": 3.215395450592041 + }, + { + "auxiliary_loss_clip": 0.06477562, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 0.06290288, + "balance_loss_mlp": 0.01256573, + "epoch": 0.3386742822786713, + "flos": 25526420807040.0, + "grad_norm": 3.459768837753136, + "language_loss": 0.81030583, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.88779831, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15112305, + "step": 5633, + "time_per_iteration": 2.6278936862945557 + }, + { + "auxiliary_loss_clip": 0.06473435, + "auxiliary_loss_mlp": 0.01274796, + "balance_loss_clip": 0.06287597, + "balance_loss_mlp": 0.01258942, + "epoch": 0.33873440553133927, + "flos": 23629881893760.0, + "grad_norm": 2.634738846372382, + "language_loss": 0.59410667, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.67158902, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5634, + "time_per_iteration": 2.565622091293335 + }, + { + "auxiliary_loss_clip": 0.06475686, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.01255216, + "epoch": 0.33879452878400723, + "flos": 17094006443520.0, + "grad_norm": 1.81394172090833, + "language_loss": 0.92877531, + "learning_rate": 3.080373032026589e-06, + "loss": 1.00623596, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15161133, + "step": 5635, + "time_per_iteration": 2.539051055908203 + }, + { + "auxiliary_loss_clip": 0.06470082, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 0.0629005, + "balance_loss_mlp": 0.01257457, + "epoch": 0.3388546520366752, + "flos": 15747477730560.0, + "grad_norm": 1.8703432540182672, + "language_loss": 0.75823128, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.83566296, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15618896, + "step": 5636, + "time_per_iteration": 2.4998726844787598 + }, + { + "auxiliary_loss_clip": 0.064714, + "auxiliary_loss_mlp": 0.01275037, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.01258848, + "epoch": 0.33891477528934316, + "flos": 22425251270400.0, + "grad_norm": 1.6981405891584176, + "language_loss": 0.83775222, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91521657, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1619873, + "step": 5637, + "time_per_iteration": 2.551074981689453 + }, + { + "auxiliary_loss_clip": 0.06474115, + "auxiliary_loss_mlp": 0.01272331, + "balance_loss_clip": 0.06286962, + "balance_loss_mlp": 0.01254736, + "epoch": 0.3389748985420111, + "flos": 17280571559040.0, + "grad_norm": 1.787045955061502, + "language_loss": 0.70609659, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78356105, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.17590332, + "step": 5638, + "time_per_iteration": 2.5479955673217773 + }, + { + "auxiliary_loss_clip": 0.06478329, + "auxiliary_loss_mlp": 0.01289332, + "balance_loss_clip": 0.06293231, + "balance_loss_mlp": 0.01272404, + "epoch": 0.3390350217946791, + "flos": 27752261466240.0, + "grad_norm": 1.7018866339003167, + "language_loss": 0.81276166, + "learning_rate": 3.079061705792765e-06, + "loss": 0.89043832, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16931152, + "step": 5639, + "time_per_iteration": 2.614819288253784 + }, + { + "auxiliary_loss_clip": 0.06487049, + "auxiliary_loss_mlp": 0.01288743, + "balance_loss_clip": 0.06296147, + "balance_loss_mlp": 0.01270635, + "epoch": 0.33909514504734706, + "flos": 20346088383360.0, + "grad_norm": 6.449374256721531, + "language_loss": 0.68149316, + "learning_rate": 3.078733771907907e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.18103027, + "step": 5640, + "time_per_iteration": 2.496300220489502 + }, + { + "auxiliary_loss_clip": 0.06471096, + "auxiliary_loss_mlp": 0.01277542, + "balance_loss_clip": 0.06286727, + "balance_loss_mlp": 0.0125978, + "epoch": 0.339155268300015, + "flos": 14835322183680.0, + "grad_norm": 1.7549267997867504, + "language_loss": 0.70165765, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77914405, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.1776123, + "step": 5641, + "time_per_iteration": 2.524548053741455 + }, + { + "auxiliary_loss_clip": 0.0647646, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01264611, + "epoch": 0.339215391552683, + "flos": 26075173196160.0, + "grad_norm": 2.2643311920206592, + "language_loss": 0.88204467, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95961982, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16430664, + "step": 5642, + "time_per_iteration": 2.551790237426758 + }, + { + "auxiliary_loss_clip": 0.06466684, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06289211, + "balance_loss_mlp": 0.01258195, + "epoch": 0.33927551480535095, + "flos": 14579967265920.0, + "grad_norm": 2.023061860440481, + "language_loss": 0.84285331, + "learning_rate": 3.077749724868924e-06, + "loss": 0.92024505, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1428833, + "step": 5643, + "time_per_iteration": 2.542921304702759 + }, + { + "auxiliary_loss_clip": 0.06468654, + "auxiliary_loss_mlp": 0.01272873, + "balance_loss_clip": 0.06285787, + "balance_loss_mlp": 0.01256708, + "epoch": 0.3393356380580189, + "flos": 23812380086400.0, + "grad_norm": 6.736940029896959, + "language_loss": 0.77634799, + "learning_rate": 3.077421627435922e-06, + "loss": 0.85376322, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.16162109, + "step": 5644, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.06472027, + "auxiliary_loss_mlp": 0.01274584, + "balance_loss_clip": 0.06288091, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3393957613106869, + "flos": 17353637919360.0, + "grad_norm": 2.9654561398927752, + "language_loss": 0.6324017, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.70986784, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15856934, + "step": 5645, + "time_per_iteration": 2.51273775100708 + }, + { + "auxiliary_loss_clip": 0.06466414, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256284, + "epoch": 0.3394558845633549, + "flos": 28440647884800.0, + "grad_norm": 2.089100449350665, + "language_loss": 0.77295536, + "learning_rate": 3.076765310014552e-06, + "loss": 0.8503449, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16247559, + "step": 5646, + "time_per_iteration": 2.5461859703063965 + }, + { + "auxiliary_loss_clip": 0.06477356, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01257568, + "epoch": 0.33951600781602287, + "flos": 22092804996480.0, + "grad_norm": 2.533529984962848, + "language_loss": 0.79702288, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87454283, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17077637, + "step": 5647, + "time_per_iteration": 2.5699684619903564 + }, + { + "auxiliary_loss_clip": 0.0647471, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06288452, + "balance_loss_mlp": 0.01256067, + "epoch": 0.33957613106869083, + "flos": 23885027176320.0, + "grad_norm": 2.1454269075726535, + "language_loss": 0.78001738, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.85749137, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16625977, + "step": 5648, + "time_per_iteration": 2.5294926166534424 + }, + { + "auxiliary_loss_clip": 0.063921, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0630298, + "balance_loss_mlp": 0.01254759, + "epoch": 0.3396362543213588, + "flos": 71264411066880.0, + "grad_norm": 0.7604552176896413, + "language_loss": 0.56109136, + "learning_rate": 3.075780527680754e-06, + "loss": 0.63763207, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.07196045, + "step": 5649, + "time_per_iteration": 3.2003703117370605 + }, + { + "auxiliary_loss_clip": 0.06473398, + "auxiliary_loss_mlp": 0.01280094, + "balance_loss_clip": 0.06287606, + "balance_loss_mlp": 0.01263274, + "epoch": 0.33969637757402676, + "flos": 25928746986240.0, + "grad_norm": 1.4812234353432667, + "language_loss": 0.85783911, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.93537402, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.16821289, + "step": 5650, + "time_per_iteration": 2.551633834838867 + }, + { + "auxiliary_loss_clip": 0.06475022, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06292272, + "balance_loss_mlp": 0.01261841, + "epoch": 0.33975650082669473, + "flos": 35270382003840.0, + "grad_norm": 3.382903843955623, + "language_loss": 0.71404934, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.79157567, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15759277, + "step": 5651, + "time_per_iteration": 2.665083885192871 + }, + { + "auxiliary_loss_clip": 0.06471914, + "auxiliary_loss_mlp": 0.01278706, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261922, + "epoch": 0.3398166240793627, + "flos": 16651373650560.0, + "grad_norm": 4.478617872089092, + "language_loss": 0.81850624, + "learning_rate": 3.074795378203616e-06, + "loss": 0.89601243, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16772461, + "step": 5652, + "time_per_iteration": 2.5136160850524902 + }, + { + "auxiliary_loss_clip": 0.06483054, + "auxiliary_loss_mlp": 0.01281024, + "balance_loss_clip": 0.06293614, + "balance_loss_mlp": 0.0126344, + "epoch": 0.33987674733203066, + "flos": 24069244377600.0, + "grad_norm": 3.0225456344203088, + "language_loss": 0.77707815, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.85471892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.17590332, + "step": 5653, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.06478614, + "auxiliary_loss_mlp": 0.01275428, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01259788, + "epoch": 0.3399368705846986, + "flos": 13253955603840.0, + "grad_norm": 4.6454995512067745, + "language_loss": 0.86809218, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.94563264, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15625, + "step": 5654, + "time_per_iteration": 2.4661965370178223 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01267584, + "epoch": 0.3399969938373666, + "flos": 27019585365120.0, + "grad_norm": 2.782601809339298, + "language_loss": 0.65974486, + "learning_rate": 3.073809861919351e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16369629, + "step": 5655, + "time_per_iteration": 2.555647611618042 + }, + { + "auxiliary_loss_clip": 0.06478781, + "auxiliary_loss_mlp": 0.01275484, + "balance_loss_clip": 0.06293027, + "balance_loss_mlp": 0.01259558, + "epoch": 0.34005711709003456, + "flos": 28557920073600.0, + "grad_norm": 1.4106761603755547, + "language_loss": 0.76612461, + "learning_rate": 3.073481275036697e-06, + "loss": 0.84366733, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15917969, + "step": 5656, + "time_per_iteration": 2.644866466522217 + }, + { + "auxiliary_loss_clip": 0.06484362, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06293096, + "balance_loss_mlp": 0.01260436, + "epoch": 0.3401172403427025, + "flos": 21623533804800.0, + "grad_norm": 1.950261924987131, + "language_loss": 0.83422613, + "learning_rate": 3.073152647447525e-06, + "loss": 0.9118408, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16674805, + "step": 5657, + "time_per_iteration": 2.701688051223755 + }, + { + "auxiliary_loss_clip": 0.06477939, + "auxiliary_loss_mlp": 0.01276671, + "balance_loss_clip": 0.06292981, + "balance_loss_mlp": 0.01259851, + "epoch": 0.3401773635953705, + "flos": 25893010419840.0, + "grad_norm": 5.064784702806917, + "language_loss": 0.86277437, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.94032043, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.16833496, + "step": 5658, + "time_per_iteration": 2.622107744216919 + }, + { + "auxiliary_loss_clip": 0.06400045, + "auxiliary_loss_mlp": 0.01275632, + "balance_loss_clip": 0.06310016, + "balance_loss_mlp": 0.01268671, + "epoch": 0.3402374868480385, + "flos": 65527737459840.0, + "grad_norm": 0.8082747939523138, + "language_loss": 0.59960568, + "learning_rate": 3.072495270199477e-06, + "loss": 0.67636251, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.06970215, + "step": 5659, + "time_per_iteration": 3.1002566814422607 + }, + { + "auxiliary_loss_clip": 0.0647618, + "auxiliary_loss_mlp": 0.01281423, + "balance_loss_clip": 0.06294397, + "balance_loss_mlp": 0.01264591, + "epoch": 0.34029761010070647, + "flos": 24067357660800.0, + "grad_norm": 2.7764582815625514, + "language_loss": 0.68693221, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76450825, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16821289, + "step": 5660, + "time_per_iteration": 2.620135545730591 + }, + { + "auxiliary_loss_clip": 0.06473149, + "auxiliary_loss_mlp": 0.01278369, + "balance_loss_clip": 0.06289428, + "balance_loss_mlp": 0.01262157, + "epoch": 0.34035773335337444, + "flos": 27607093067520.0, + "grad_norm": 2.0682817387265477, + "language_loss": 0.6727913, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16210938, + "step": 5661, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06469939, + "auxiliary_loss_mlp": 0.01280149, + "balance_loss_clip": 0.06289508, + "balance_loss_mlp": 0.01264175, + "epoch": 0.3404178566060424, + "flos": 20818923373440.0, + "grad_norm": 1.802665197928241, + "language_loss": 0.79380333, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87130427, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15966797, + "step": 5662, + "time_per_iteration": 2.552755832672119 + }, + { + "auxiliary_loss_clip": 0.06474, + "auxiliary_loss_mlp": 0.01278156, + "balance_loss_clip": 0.06290844, + "balance_loss_mlp": 0.01260454, + "epoch": 0.34047797985871037, + "flos": 26840818679040.0, + "grad_norm": 2.1558050020889894, + "language_loss": 0.73809367, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.8156153, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.17700195, + "step": 5663, + "time_per_iteration": 2.5490622520446777 + }, + { + "auxiliary_loss_clip": 0.06470126, + "auxiliary_loss_mlp": 0.01281986, + "balance_loss_clip": 0.06290488, + "balance_loss_mlp": 0.01265714, + "epoch": 0.34053810311137833, + "flos": 19688742702720.0, + "grad_norm": 1.852400144955729, + "language_loss": 0.86839676, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.94591784, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16259766, + "step": 5664, + "time_per_iteration": 5.419060707092285 + }, + { + "auxiliary_loss_clip": 0.06483276, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06295361, + "balance_loss_mlp": 0.01257423, + "epoch": 0.3405982263640463, + "flos": 21732169023360.0, + "grad_norm": 1.8640809787797845, + "language_loss": 0.69509971, + "learning_rate": 3.070522162795235e-06, + "loss": 0.77267611, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16943359, + "step": 5665, + "time_per_iteration": 2.547194719314575 + }, + { + "auxiliary_loss_clip": 0.06482168, + "auxiliary_loss_mlp": 0.01274659, + "balance_loss_clip": 0.0629427, + "balance_loss_mlp": 0.01257648, + "epoch": 0.34065834961671426, + "flos": 18047600634240.0, + "grad_norm": 2.6257214905883237, + "language_loss": 0.73526829, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.81283653, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.17016602, + "step": 5666, + "time_per_iteration": 2.527994155883789 + }, + { + "auxiliary_loss_clip": 0.06482688, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255373, + "epoch": 0.3407184728693822, + "flos": 21403705818240.0, + "grad_norm": 1.661941695135435, + "language_loss": 0.74005675, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.81760579, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.1685791, + "step": 5667, + "time_per_iteration": 4.029574155807495 + }, + { + "auxiliary_loss_clip": 0.06378959, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06290369, + "balance_loss_mlp": 0.01260898, + "epoch": 0.3407785961220502, + "flos": 68709352515840.0, + "grad_norm": 0.8062084259911544, + "language_loss": 0.63318539, + "learning_rate": 3.069535060901597e-06, + "loss": 0.70965815, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.07397461, + "step": 5668, + "time_per_iteration": 3.3641560077667236 + }, + { + "auxiliary_loss_clip": 0.06472414, + "auxiliary_loss_mlp": 0.01272754, + "balance_loss_clip": 0.0628752, + "balance_loss_mlp": 0.01256863, + "epoch": 0.34083871937471816, + "flos": 14069634773760.0, + "grad_norm": 2.007810831329869, + "language_loss": 0.73127198, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.80872369, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15893555, + "step": 5669, + "time_per_iteration": 2.4918038845062256 + }, + { + "auxiliary_loss_clip": 0.06479842, + "auxiliary_loss_mlp": 0.0127954, + "balance_loss_clip": 0.06292197, + "balance_loss_mlp": 0.01263423, + "epoch": 0.3408988426273861, + "flos": 17089981447680.0, + "grad_norm": 2.0642744441347287, + "language_loss": 0.80626565, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.88385952, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5670, + "time_per_iteration": 2.5270040035247803 + }, + { + "auxiliary_loss_clip": 0.06481062, + "auxiliary_loss_mlp": 0.01275164, + "balance_loss_clip": 0.06291522, + "balance_loss_mlp": 0.0125838, + "epoch": 0.3409589658800541, + "flos": 24031411459200.0, + "grad_norm": 1.863009265742361, + "language_loss": 0.77916187, + "learning_rate": 3.068547593996078e-06, + "loss": 0.85672414, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16784668, + "step": 5671, + "time_per_iteration": 4.039815664291382 + }, + { + "auxiliary_loss_clip": 0.06473973, + "auxiliary_loss_mlp": 0.01276984, + "balance_loss_clip": 0.06289308, + "balance_loss_mlp": 0.01260712, + "epoch": 0.34101908913272205, + "flos": 21148350900480.0, + "grad_norm": 1.9142883162018633, + "language_loss": 0.74626315, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.82377267, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16259766, + "step": 5672, + "time_per_iteration": 2.564887762069702 + }, + { + "auxiliary_loss_clip": 0.06475951, + "auxiliary_loss_mlp": 0.01275656, + "balance_loss_clip": 0.06287946, + "balance_loss_mlp": 0.01259265, + "epoch": 0.3410792123853901, + "flos": 15706835700480.0, + "grad_norm": 1.714309741158987, + "language_loss": 0.73791027, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81542635, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16394043, + "step": 5673, + "time_per_iteration": 2.540194511413574 + }, + { + "auxiliary_loss_clip": 0.06466323, + "auxiliary_loss_mlp": 0.01283225, + "balance_loss_clip": 0.06284231, + "balance_loss_mlp": 0.01266584, + "epoch": 0.34113933563805804, + "flos": 23042122629120.0, + "grad_norm": 1.8379615104267257, + "language_loss": 0.7978701, + "learning_rate": 3.067559762415682e-06, + "loss": 0.87536556, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16638184, + "step": 5674, + "time_per_iteration": 2.5462148189544678 + }, + { + "auxiliary_loss_clip": 0.06364837, + "auxiliary_loss_mlp": 0.01262017, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255442, + "epoch": 0.341199458890726, + "flos": 69631878769920.0, + "grad_norm": 0.7752872762952348, + "language_loss": 0.56147063, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.63773918, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.06585693, + "step": 5675, + "time_per_iteration": 3.370281457901001 + }, + { + "auxiliary_loss_clip": 0.0645988, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06281768, + "balance_loss_mlp": 0.01257398, + "epoch": 0.34125958214339397, + "flos": 22352939596800.0, + "grad_norm": 2.600205708544321, + "language_loss": 0.79689062, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.87422335, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16003418, + "step": 5676, + "time_per_iteration": 2.5312321186065674 + }, + { + "auxiliary_loss_clip": 0.06470488, + "auxiliary_loss_mlp": 0.01271752, + "balance_loss_clip": 0.06286064, + "balance_loss_mlp": 0.01255051, + "epoch": 0.34131970539606193, + "flos": 21878427525120.0, + "grad_norm": 2.203551534393157, + "language_loss": 0.8601976, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.93761992, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.16699219, + "step": 5677, + "time_per_iteration": 2.555037260055542 + }, + { + "auxiliary_loss_clip": 0.06463757, + "auxiliary_loss_mlp": 0.01274207, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3413798286487299, + "flos": 24942560757120.0, + "grad_norm": 2.786164717546535, + "language_loss": 0.80252033, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87989998, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16955566, + "step": 5678, + "time_per_iteration": 2.6321489810943604 + }, + { + "auxiliary_loss_clip": 0.06467854, + "auxiliary_loss_mlp": 0.01270663, + "balance_loss_clip": 0.06282793, + "balance_loss_mlp": 0.01255404, + "epoch": 0.34143995190139786, + "flos": 25381420116480.0, + "grad_norm": 1.8772848902338297, + "language_loss": 0.75927806, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.83666325, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15246582, + "step": 5679, + "time_per_iteration": 2.5981781482696533 + }, + { + "auxiliary_loss_clip": 0.06365222, + "auxiliary_loss_mlp": 0.01260685, + "balance_loss_clip": 0.06278291, + "balance_loss_mlp": 0.01253538, + "epoch": 0.34150007515406583, + "flos": 67804785763200.0, + "grad_norm": 0.7019635675964923, + "language_loss": 0.59521842, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.67147756, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.0713501, + "step": 5680, + "time_per_iteration": 3.2768852710723877 + }, + { + "auxiliary_loss_clip": 0.06464119, + "auxiliary_loss_mlp": 0.01271493, + "balance_loss_clip": 0.06282759, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3415601984067338, + "flos": 20308548954240.0, + "grad_norm": 1.756785442101194, + "language_loss": 0.72804415, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80540025, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15881348, + "step": 5681, + "time_per_iteration": 2.540839195251465 + }, + { + "auxiliary_loss_clip": 0.06462204, + "auxiliary_loss_mlp": 0.01272244, + "balance_loss_clip": 0.06283034, + "balance_loss_mlp": 0.01256806, + "epoch": 0.34162032165940176, + "flos": 26038346526720.0, + "grad_norm": 5.204332383129175, + "language_loss": 0.71220171, + "learning_rate": 3.064923764577233e-06, + "loss": 0.78954625, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15454102, + "step": 5682, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.06466864, + "auxiliary_loss_mlp": 0.0127503, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01258711, + "epoch": 0.3416804449120697, + "flos": 28810843223040.0, + "grad_norm": 1.4703350638010875, + "language_loss": 0.83879244, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.91621137, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.16320801, + "step": 5683, + "time_per_iteration": 2.595921277999878 + }, + { + "auxiliary_loss_clip": 0.06468399, + "auxiliary_loss_mlp": 0.01274924, + "balance_loss_clip": 0.06284815, + "balance_loss_mlp": 0.01258354, + "epoch": 0.3417405681647377, + "flos": 22608210660480.0, + "grad_norm": 1.8188343464074745, + "language_loss": 0.71334541, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.79077864, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 5684, + "time_per_iteration": 2.5821194648742676 + }, + { + "auxiliary_loss_clip": 0.06462076, + "auxiliary_loss_mlp": 0.01268234, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01253816, + "epoch": 0.34180069141740566, + "flos": 24722942405760.0, + "grad_norm": 1.4943065575919134, + "language_loss": 0.75352108, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.8308242, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.144104, + "step": 5685, + "time_per_iteration": 2.545419216156006 + }, + { + "auxiliary_loss_clip": 0.06457227, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06281762, + "balance_loss_mlp": 0.0125501, + "epoch": 0.3418608146700737, + "flos": 30526644879360.0, + "grad_norm": 1.8907916568784255, + "language_loss": 0.70833004, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7856074, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.1550293, + "step": 5686, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.06467415, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06284459, + "balance_loss_mlp": 0.01253407, + "epoch": 0.34192093792274164, + "flos": 15127755333120.0, + "grad_norm": 2.1973050683231303, + "language_loss": 0.77864039, + "learning_rate": 3.06327495310661e-06, + "loss": 0.85600907, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.16052246, + "step": 5687, + "time_per_iteration": 2.501957654953003 + }, + { + "auxiliary_loss_clip": 0.06462508, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01257435, + "epoch": 0.3419810611754096, + "flos": 13192754595840.0, + "grad_norm": 1.8198375176693335, + "language_loss": 0.87159389, + "learning_rate": 3.062945069803981e-06, + "loss": 0.94895893, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.16552734, + "step": 5688, + "time_per_iteration": 2.514558792114258 + }, + { + "auxiliary_loss_clip": 0.06470017, + "auxiliary_loss_mlp": 0.01272882, + "balance_loss_clip": 0.06283651, + "balance_loss_mlp": 0.01255025, + "epoch": 0.34204118442807757, + "flos": 19542274565760.0, + "grad_norm": 1.9150705307332732, + "language_loss": 0.80177575, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87920475, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.17858887, + "step": 5689, + "time_per_iteration": 2.4941842555999756 + }, + { + "auxiliary_loss_clip": 0.06471369, + "auxiliary_loss_mlp": 0.01270545, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01254476, + "epoch": 0.34210130768074554, + "flos": 15200192787840.0, + "grad_norm": 1.8413075326603192, + "language_loss": 0.74004579, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81746483, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.1607666, + "step": 5690, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.06470567, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06288044, + "balance_loss_mlp": 0.01254854, + "epoch": 0.3421614309334135, + "flos": 24943147735680.0, + "grad_norm": 2.8439157619722666, + "language_loss": 0.76563686, + "learning_rate": 3.061955178104237e-06, + "loss": 0.84305, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15893555, + "step": 5691, + "time_per_iteration": 2.5346477031707764 + }, + { + "auxiliary_loss_clip": 0.06465675, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06286939, + "balance_loss_mlp": 0.01254395, + "epoch": 0.34222155418608147, + "flos": 21915170340480.0, + "grad_norm": 1.7269103068173344, + "language_loss": 0.6888957, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1472168, + "step": 5692, + "time_per_iteration": 2.544475793838501 + }, + { + "auxiliary_loss_clip": 0.06469652, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06286649, + "balance_loss_mlp": 0.01259069, + "epoch": 0.34228167743874943, + "flos": 18119954234880.0, + "grad_norm": 2.5543870280075494, + "language_loss": 0.72691154, + "learning_rate": 3.06129504893632e-06, + "loss": 0.80436993, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.17126465, + "step": 5693, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.06469734, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06291726, + "balance_loss_mlp": 0.01253049, + "epoch": 0.3423418006914174, + "flos": 21295070599680.0, + "grad_norm": 1.6526919771326485, + "language_loss": 0.76433146, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.84170949, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15008545, + "step": 5694, + "time_per_iteration": 2.5759999752044678 + }, + { + "auxiliary_loss_clip": 0.06469683, + "auxiliary_loss_mlp": 0.01269733, + "balance_loss_clip": 0.06292015, + "balance_loss_mlp": 0.01254498, + "epoch": 0.34240192394408536, + "flos": 19828754075520.0, + "grad_norm": 1.7073290043069882, + "language_loss": 0.80359411, + "learning_rate": 3.060634758790747e-06, + "loss": 0.88098824, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15222168, + "step": 5695, + "time_per_iteration": 2.53019118309021 + }, + { + "auxiliary_loss_clip": 0.06473886, + "auxiliary_loss_mlp": 0.01274215, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01257335, + "epoch": 0.3424620471967533, + "flos": 24542498638080.0, + "grad_norm": 2.150928833794339, + "language_loss": 0.74189723, + "learning_rate": 3.060304553382635e-06, + "loss": 0.81937826, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16882324, + "step": 5696, + "time_per_iteration": 2.6046504974365234 + }, + { + "auxiliary_loss_clip": 0.06472932, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.062935, + "balance_loss_mlp": 0.0125786, + "epoch": 0.3425221704494213, + "flos": 25856057969280.0, + "grad_norm": 1.9268953245740004, + "language_loss": 0.71419311, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.79166162, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.16052246, + "step": 5697, + "time_per_iteration": 2.565295696258545 + }, + { + "auxiliary_loss_clip": 0.06469944, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06292768, + "balance_loss_mlp": 0.01254293, + "epoch": 0.34258229370208926, + "flos": 21546442448640.0, + "grad_norm": 1.77565898086167, + "language_loss": 0.82456839, + "learning_rate": 3.05964402195837e-06, + "loss": 0.90196872, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15795898, + "step": 5698, + "time_per_iteration": 2.636547327041626 + }, + { + "auxiliary_loss_clip": 0.06476933, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06293021, + "balance_loss_mlp": 0.01260573, + "epoch": 0.3426424169547573, + "flos": 23658407009280.0, + "grad_norm": 1.9460205950694964, + "language_loss": 0.69722092, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.77476966, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.17358398, + "step": 5699, + "time_per_iteration": 2.523766040802002 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.0127405, + "balance_loss_clip": 0.06289239, + "balance_loss_mlp": 0.01257719, + "epoch": 0.34270254020742524, + "flos": 24651846616320.0, + "grad_norm": 2.105384484263751, + "language_loss": 0.72511256, + "learning_rate": 3.058983329806877e-06, + "loss": 0.80255234, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 5700, + "time_per_iteration": 2.57511568069458 + }, + { + "auxiliary_loss_clip": 0.06467311, + "auxiliary_loss_mlp": 0.01271093, + "balance_loss_clip": 0.06288276, + "balance_loss_mlp": 0.01254273, + "epoch": 0.3427626634600932, + "flos": 21003182501760.0, + "grad_norm": 2.114283139984186, + "language_loss": 0.82378924, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.90117323, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.16821289, + "step": 5701, + "time_per_iteration": 2.496392250061035 + }, + { + "auxiliary_loss_clip": 0.06469429, + "auxiliary_loss_mlp": 0.0127326, + "balance_loss_clip": 0.06287375, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3428227867127612, + "flos": 21440155144320.0, + "grad_norm": 1.6330699344557849, + "language_loss": 0.71898985, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.79641676, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16821289, + "step": 5702, + "time_per_iteration": 2.566856861114502 + }, + { + "auxiliary_loss_clip": 0.06377172, + "auxiliary_loss_mlp": 0.01259818, + "balance_loss_clip": 0.06290582, + "balance_loss_mlp": 0.01252552, + "epoch": 0.34288290996542914, + "flos": 55750219902720.0, + "grad_norm": 0.7671857510805999, + "language_loss": 0.56708395, + "learning_rate": 3.057991990435309e-06, + "loss": 0.64345384, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.07244873, + "step": 5703, + "time_per_iteration": 4.447732925415039 + }, + { + "auxiliary_loss_clip": 0.06465772, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06283242, + "balance_loss_mlp": 0.01255207, + "epoch": 0.3429430332180971, + "flos": 20162961285120.0, + "grad_norm": 1.88810633796735, + "language_loss": 0.74954486, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82692933, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.17468262, + "step": 5704, + "time_per_iteration": 4.062070608139038 + }, + { + "auxiliary_loss_clip": 0.06463447, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259716, + "epoch": 0.34300315647076507, + "flos": 17971347818880.0, + "grad_norm": 2.0890845856962565, + "language_loss": 0.73438597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.81177545, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15795898, + "step": 5705, + "time_per_iteration": 2.5125277042388916 + }, + { + "auxiliary_loss_clip": 0.06466857, + "auxiliary_loss_mlp": 0.01271633, + "balance_loss_clip": 0.0628458, + "balance_loss_mlp": 0.01255194, + "epoch": 0.34306327972343303, + "flos": 22092679215360.0, + "grad_norm": 2.3658652894382075, + "language_loss": 0.80144984, + "learning_rate": 3.057000289991289e-06, + "loss": 0.87883472, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16430664, + "step": 5706, + "time_per_iteration": 2.524531364440918 + }, + { + "auxiliary_loss_clip": 0.06468605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06282079, + "balance_loss_mlp": 0.0125493, + "epoch": 0.343123402976101, + "flos": 18448669002240.0, + "grad_norm": 1.9272208577124825, + "language_loss": 0.83210528, + "learning_rate": 3.056669642996787e-06, + "loss": 0.90951264, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17199707, + "step": 5707, + "time_per_iteration": 4.017935514450073 + }, + { + "auxiliary_loss_clip": 0.06464301, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06283538, + "balance_loss_mlp": 0.01259544, + "epoch": 0.34318352622876896, + "flos": 17169127228800.0, + "grad_norm": 1.5274992455100316, + "language_loss": 0.74774885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16442871, + "step": 5708, + "time_per_iteration": 2.6189568042755127 + }, + { + "auxiliary_loss_clip": 0.06460952, + "auxiliary_loss_mlp": 0.01273078, + "balance_loss_clip": 0.06282704, + "balance_loss_mlp": 0.01256365, + "epoch": 0.34324364948143693, + "flos": 26695482572160.0, + "grad_norm": 1.5717787719434457, + "language_loss": 0.80904007, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88638043, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.16711426, + "step": 5709, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06471742, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 0.06286193, + "balance_loss_mlp": 0.01260685, + "epoch": 0.3433037727341049, + "flos": 21257950440960.0, + "grad_norm": 2.571520261591023, + "language_loss": 0.79460347, + "learning_rate": 3.055677461649329e-06, + "loss": 0.87212193, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.1940918, + "step": 5710, + "time_per_iteration": 2.5515291690826416 + }, + { + "auxiliary_loss_clip": 0.06468266, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 0.06282788, + "balance_loss_mlp": 0.01254334, + "epoch": 0.34336389598677286, + "flos": 20635377004800.0, + "grad_norm": 1.916674758610419, + "language_loss": 0.70532334, + "learning_rate": 3.055346654453996e-06, + "loss": 0.78272408, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.17468262, + "step": 5711, + "time_per_iteration": 3.958890914916992 + }, + { + "auxiliary_loss_clip": 0.06467056, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 0.0628437, + "balance_loss_mlp": 0.01256909, + "epoch": 0.3434240192394409, + "flos": 14543895283200.0, + "grad_norm": 2.810027228242578, + "language_loss": 0.67786914, + "learning_rate": 3.055015807239812e-06, + "loss": 0.75527865, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16992188, + "step": 5712, + "time_per_iteration": 2.4752726554870605 + }, + { + "auxiliary_loss_clip": 0.06366295, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01254685, + "epoch": 0.34348414249210885, + "flos": 58067799183360.0, + "grad_norm": 0.8383081559544242, + "language_loss": 0.58214718, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.65843868, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.08172607, + "step": 5713, + "time_per_iteration": 3.11580491065979 + }, + { + "auxiliary_loss_clip": 0.06465655, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06281169, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3435442657447768, + "flos": 20710749352320.0, + "grad_norm": 1.8141637433077298, + "language_loss": 0.81045675, + "learning_rate": 3.054353992805076e-06, + "loss": 0.88785917, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.17272949, + "step": 5714, + "time_per_iteration": 2.510929822921753 + }, + { + "auxiliary_loss_clip": 0.0646632, + "auxiliary_loss_mlp": 0.01276019, + "balance_loss_clip": 0.06283875, + "balance_loss_mlp": 0.01260045, + "epoch": 0.3436043889974448, + "flos": 22936967354880.0, + "grad_norm": 2.602776673257047, + "language_loss": 0.72001171, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79743505, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15991211, + "step": 5715, + "time_per_iteration": 2.5680224895477295 + }, + { + "auxiliary_loss_clip": 0.06365244, + "auxiliary_loss_mlp": 0.01259148, + "balance_loss_clip": 0.06280053, + "balance_loss_mlp": 0.01251191, + "epoch": 0.34366451225011274, + "flos": 58423514964480.0, + "grad_norm": 0.8879413605742031, + "language_loss": 0.65628481, + "learning_rate": 3.053692018445505e-06, + "loss": 0.73252875, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.07952881, + "step": 5716, + "time_per_iteration": 3.184952735900879 + }, + { + "auxiliary_loss_clip": 0.06463662, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.0628469, + "balance_loss_mlp": 0.01264509, + "epoch": 0.3437246355027807, + "flos": 15601722353280.0, + "grad_norm": 1.9800950186090778, + "language_loss": 0.74289393, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15252686, + "step": 5717, + "time_per_iteration": 2.5220494270324707 + }, + { + "auxiliary_loss_clip": 0.06466433, + "auxiliary_loss_mlp": 0.01278824, + "balance_loss_clip": 0.0628383, + "balance_loss_mlp": 0.01262946, + "epoch": 0.34378475875544867, + "flos": 27679572449280.0, + "grad_norm": 1.8348085520910409, + "language_loss": 0.75694019, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.83439279, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15869141, + "step": 5718, + "time_per_iteration": 2.5983147621154785 + }, + { + "auxiliary_loss_clip": 0.06468937, + "auxiliary_loss_mlp": 0.01273829, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01256829, + "epoch": 0.34384488200811664, + "flos": 31439638967040.0, + "grad_norm": 1.8816683210791167, + "language_loss": 0.6437763, + "learning_rate": 3.052698757266734e-06, + "loss": 0.72120392, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.17004395, + "step": 5719, + "time_per_iteration": 2.7075517177581787 + }, + { + "auxiliary_loss_clip": 0.06472047, + "auxiliary_loss_mlp": 0.0127673, + "balance_loss_clip": 0.06285335, + "balance_loss_mlp": 0.012596, + "epoch": 0.3439050052607846, + "flos": 24906866117760.0, + "grad_norm": 1.6709560385881974, + "language_loss": 0.73730874, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.81479651, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.17150879, + "step": 5720, + "time_per_iteration": 2.5936295986175537 + }, + { + "auxiliary_loss_clip": 0.06469208, + "auxiliary_loss_mlp": 0.01280833, + "balance_loss_clip": 0.06286804, + "balance_loss_mlp": 0.01264072, + "epoch": 0.34396512851345257, + "flos": 18155900436480.0, + "grad_norm": 1.8909667336437188, + "language_loss": 0.74550021, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82300061, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16748047, + "step": 5721, + "time_per_iteration": 2.5109763145446777 + }, + { + "auxiliary_loss_clip": 0.06468637, + "auxiliary_loss_mlp": 0.01276688, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.01260208, + "epoch": 0.34402525176612053, + "flos": 16039994734080.0, + "grad_norm": 3.7669546448597497, + "language_loss": 0.80102623, + "learning_rate": 3.051705136821992e-06, + "loss": 0.87847948, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 5722, + "time_per_iteration": 2.5231471061706543 + }, + { + "auxiliary_loss_clip": 0.06467631, + "auxiliary_loss_mlp": 0.01281232, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01265806, + "epoch": 0.3440853750187885, + "flos": 21185009861760.0, + "grad_norm": 1.9591310013999468, + "language_loss": 0.82034022, + "learning_rate": 3.051373850228801e-06, + "loss": 0.89782888, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.1541748, + "step": 5723, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.06471531, + "auxiliary_loss_mlp": 0.01281521, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.0126588, + "epoch": 0.34414549827145646, + "flos": 12682883301120.0, + "grad_norm": 1.867182825140108, + "language_loss": 0.8172524, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8947829, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15661621, + "step": 5724, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.06476942, + "auxiliary_loss_mlp": 0.01281282, + "balance_loss_clip": 0.06292838, + "balance_loss_mlp": 0.01265237, + "epoch": 0.3442056215241244, + "flos": 31292458070400.0, + "grad_norm": 1.852126712281853, + "language_loss": 0.69186389, + "learning_rate": 3.05071115745038e-06, + "loss": 0.76944625, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.16040039, + "step": 5725, + "time_per_iteration": 2.6253697872161865 + }, + { + "auxiliary_loss_clip": 0.06482734, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06293113, + "balance_loss_mlp": 0.01266462, + "epoch": 0.34426574477679245, + "flos": 23373939997440.0, + "grad_norm": 1.5373453518160676, + "language_loss": 0.69532049, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.77299035, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.17785645, + "step": 5726, + "time_per_iteration": 2.5495173931121826 + }, + { + "auxiliary_loss_clip": 0.06477433, + "auxiliary_loss_mlp": 0.01281684, + "balance_loss_clip": 0.06292193, + "balance_loss_mlp": 0.01265948, + "epoch": 0.3443258680294604, + "flos": 24542372856960.0, + "grad_norm": 3.3735616171284453, + "language_loss": 0.73631704, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81390822, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15734863, + "step": 5727, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.06474276, + "auxiliary_loss_mlp": 0.01274594, + "balance_loss_clip": 0.06292102, + "balance_loss_mlp": 0.01259049, + "epoch": 0.3443859912821284, + "flos": 20236363061760.0, + "grad_norm": 1.756953821036591, + "language_loss": 0.88303459, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.96052337, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15551758, + "step": 5728, + "time_per_iteration": 2.5943620204925537 + }, + { + "auxiliary_loss_clip": 0.06472028, + "auxiliary_loss_mlp": 0.01275786, + "balance_loss_clip": 0.06289984, + "balance_loss_mlp": 0.01259382, + "epoch": 0.34444611453479634, + "flos": 24323425338240.0, + "grad_norm": 1.9801243778486481, + "language_loss": 0.70532095, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.78279907, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.1640625, + "step": 5729, + "time_per_iteration": 2.5122504234313965 + }, + { + "auxiliary_loss_clip": 0.06472413, + "auxiliary_loss_mlp": 0.01278834, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01263123, + "epoch": 0.3445062377874643, + "flos": 16989186585600.0, + "grad_norm": 2.065738946159642, + "language_loss": 0.74902749, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.82653993, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15710449, + "step": 5730, + "time_per_iteration": 2.4971024990081787 + }, + { + "auxiliary_loss_clip": 0.06477457, + "auxiliary_loss_mlp": 0.01272788, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.01256921, + "epoch": 0.3445663610401323, + "flos": 20308884370560.0, + "grad_norm": 2.25692333978076, + "language_loss": 0.79881716, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87631959, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.15869141, + "step": 5731, + "time_per_iteration": 2.5055606365203857 + }, + { + "auxiliary_loss_clip": 0.0647382, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06289574, + "balance_loss_mlp": 0.01256532, + "epoch": 0.34462648429280024, + "flos": 15893568524160.0, + "grad_norm": 2.0529883798711586, + "language_loss": 0.78536034, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.86281943, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15545654, + "step": 5732, + "time_per_iteration": 2.58428692817688 + }, + { + "auxiliary_loss_clip": 0.06393671, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06309536, + "balance_loss_mlp": 0.01263571, + "epoch": 0.3446866075454682, + "flos": 59330681193600.0, + "grad_norm": 0.7296400398421587, + "language_loss": 0.53166986, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.60830045, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.05813599, + "step": 5733, + "time_per_iteration": 3.1921679973602295 + }, + { + "auxiliary_loss_clip": 0.06473544, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 0.06292105, + "balance_loss_mlp": 0.01259248, + "epoch": 0.34474673079813617, + "flos": 22349962776960.0, + "grad_norm": 1.6143563972241732, + "language_loss": 0.83787543, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91536903, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16564941, + "step": 5734, + "time_per_iteration": 2.5630810260772705 + }, + { + "auxiliary_loss_clip": 0.06472072, + "auxiliary_loss_mlp": 0.01278915, + "balance_loss_clip": 0.0628967, + "balance_loss_mlp": 0.01262834, + "epoch": 0.34480685405080413, + "flos": 27677098753920.0, + "grad_norm": 1.7144738343554842, + "language_loss": 0.93389094, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.01140082, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.1607666, + "step": 5735, + "time_per_iteration": 2.5621798038482666 + }, + { + "auxiliary_loss_clip": 0.06471383, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01259273, + "epoch": 0.3448669773034721, + "flos": 22462664918400.0, + "grad_norm": 1.7840822264419087, + "language_loss": 0.77095437, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84843659, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.17578125, + "step": 5736, + "time_per_iteration": 2.5377349853515625 + }, + { + "auxiliary_loss_clip": 0.06471781, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01255326, + "epoch": 0.34492710055614006, + "flos": 24943105808640.0, + "grad_norm": 1.6287034776462515, + "language_loss": 0.79113513, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.86855936, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15319824, + "step": 5737, + "time_per_iteration": 2.5471904277801514 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06285917, + "balance_loss_mlp": 0.0125976, + "epoch": 0.34498722380880803, + "flos": 20127057010560.0, + "grad_norm": 2.191814396638409, + "language_loss": 0.72072059, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.79821849, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16723633, + "step": 5738, + "time_per_iteration": 2.540442943572998 + }, + { + "auxiliary_loss_clip": 0.06471272, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01255821, + "epoch": 0.34504734706147605, + "flos": 28445511421440.0, + "grad_norm": 1.9413212194180998, + "language_loss": 0.82238245, + "learning_rate": 3.046067851209389e-06, + "loss": 0.89982325, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16992188, + "step": 5739, + "time_per_iteration": 2.57327938079834 + }, + { + "auxiliary_loss_clip": 0.06469989, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06284826, + "balance_loss_mlp": 0.01261862, + "epoch": 0.345107470314144, + "flos": 22681067385600.0, + "grad_norm": 1.914547064909644, + "language_loss": 0.83564734, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.91313767, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.171875, + "step": 5740, + "time_per_iteration": 2.5514895915985107 + }, + { + "auxiliary_loss_clip": 0.06466584, + "auxiliary_loss_mlp": 0.01275646, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01258921, + "epoch": 0.345167593566812, + "flos": 20636886378240.0, + "grad_norm": 2.1474795597791734, + "language_loss": 0.76802379, + "learning_rate": 3.045403886269181e-06, + "loss": 0.84544611, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16723633, + "step": 5741, + "time_per_iteration": 2.511997699737549 + }, + { + "auxiliary_loss_clip": 0.06466299, + "auxiliary_loss_mlp": 0.0127053, + "balance_loss_clip": 0.06279384, + "balance_loss_mlp": 0.01254544, + "epoch": 0.34522771681947995, + "flos": 26221683260160.0, + "grad_norm": 1.6006732343467382, + "language_loss": 0.77803171, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85540009, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15966797, + "step": 5742, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06281982, + "balance_loss_mlp": 0.01256074, + "epoch": 0.3452878400721479, + "flos": 19068349472640.0, + "grad_norm": 2.2544306863162538, + "language_loss": 0.76459014, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.84196126, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16821289, + "step": 5743, + "time_per_iteration": 3.996267557144165 + }, + { + "auxiliary_loss_clip": 0.06462429, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06281956, + "balance_loss_mlp": 0.01255118, + "epoch": 0.3453479633248159, + "flos": 27937442989440.0, + "grad_norm": 1.578255214465821, + "language_loss": 0.7080915, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.78541422, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14727783, + "step": 5744, + "time_per_iteration": 2.5594234466552734 + }, + { + "auxiliary_loss_clip": 0.06462625, + "auxiliary_loss_mlp": 0.01272389, + "balance_loss_clip": 0.0628416, + "balance_loss_mlp": 0.01256523, + "epoch": 0.34540808657748384, + "flos": 19611609419520.0, + "grad_norm": 1.8945383960499247, + "language_loss": 0.79877782, + "learning_rate": 3.044075480787665e-06, + "loss": 0.87612802, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15881348, + "step": 5745, + "time_per_iteration": 2.5577902793884277 + }, + { + "auxiliary_loss_clip": 0.0646376, + "auxiliary_loss_mlp": 0.0127446, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01258343, + "epoch": 0.3454682098301518, + "flos": 20417771151360.0, + "grad_norm": 2.2215207406176063, + "language_loss": 0.90027881, + "learning_rate": 3.043743280407182e-06, + "loss": 0.97766101, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16113281, + "step": 5746, + "time_per_iteration": 4.126953840255737 + }, + { + "auxiliary_loss_clip": 0.06469168, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01254648, + "epoch": 0.34552833308281977, + "flos": 21331603779840.0, + "grad_norm": 1.8420175913064167, + "language_loss": 0.65233189, + "learning_rate": 3.043411040447849e-06, + "loss": 0.72973943, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16931152, + "step": 5747, + "time_per_iteration": 2.6445960998535156 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01274425, + "balance_loss_clip": 0.06279166, + "balance_loss_mlp": 0.01259166, + "epoch": 0.34558845633548774, + "flos": 36251914331520.0, + "grad_norm": 1.6152983170909512, + "language_loss": 0.72912234, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15246582, + "step": 5748, + "time_per_iteration": 2.668628692626953 + }, + { + "auxiliary_loss_clip": 0.0646018, + "auxiliary_loss_mlp": 0.01271906, + "balance_loss_clip": 0.06281725, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3456485795881557, + "flos": 22456292008320.0, + "grad_norm": 2.139365243179929, + "language_loss": 0.75935584, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83667672, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.14648438, + "step": 5749, + "time_per_iteration": 2.533357620239258 + }, + { + "auxiliary_loss_clip": 0.06372777, + "auxiliary_loss_mlp": 0.01259534, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.0125392, + "epoch": 0.34570870284082367, + "flos": 62023277422080.0, + "grad_norm": 0.8741398929973155, + "language_loss": 0.62861037, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.70493352, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.05612183, + "step": 5750, + "time_per_iteration": 4.42021369934082 + }, + { + "auxiliary_loss_clip": 0.06455849, + "auxiliary_loss_mlp": 0.0126761, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01253383, + "epoch": 0.34576882609349163, + "flos": 22788528647040.0, + "grad_norm": 2.5604939014714043, + "language_loss": 0.80745482, + "learning_rate": 3.042081685074012e-06, + "loss": 0.88468945, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14227295, + "step": 5751, + "time_per_iteration": 2.610229730606079 + }, + { + "auxiliary_loss_clip": 0.06461278, + "auxiliary_loss_mlp": 0.01273124, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.01258199, + "epoch": 0.34582894934615965, + "flos": 12353665409280.0, + "grad_norm": 2.333174149642167, + "language_loss": 0.85112172, + "learning_rate": 3.041749247409439e-06, + "loss": 0.92846578, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14904785, + "step": 5752, + "time_per_iteration": 2.49895977973938 + }, + { + "auxiliary_loss_clip": 0.06379203, + "auxiliary_loss_mlp": 0.01260282, + "balance_loss_clip": 0.06296635, + "balance_loss_mlp": 0.01254092, + "epoch": 0.3458890725988276, + "flos": 70186459017600.0, + "grad_norm": 0.7233537791569425, + "language_loss": 0.63163221, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.70802706, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06185913, + "step": 5753, + "time_per_iteration": 3.0605263710021973 + }, + { + "auxiliary_loss_clip": 0.06463367, + "auxiliary_loss_mlp": 0.01274407, + "balance_loss_clip": 0.06282756, + "balance_loss_mlp": 0.01258498, + "epoch": 0.3459491958514956, + "flos": 17098324928640.0, + "grad_norm": 2.0282181813946116, + "language_loss": 0.71483171, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.79220951, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15899658, + "step": 5754, + "time_per_iteration": 2.499213457107544 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01271896, + "balance_loss_clip": 0.06282809, + "balance_loss_mlp": 0.01255898, + "epoch": 0.34600931910416355, + "flos": 16655985624960.0, + "grad_norm": 2.0834630321372534, + "language_loss": 0.7328862, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.81031251, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15979004, + "step": 5755, + "time_per_iteration": 2.540292263031006 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.01272619, + "balance_loss_clip": 0.06280342, + "balance_loss_mlp": 0.01257801, + "epoch": 0.3460694423568315, + "flos": 38555517179520.0, + "grad_norm": 1.432388080922509, + "language_loss": 0.7255426, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80286932, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14813232, + "step": 5756, + "time_per_iteration": 2.679203510284424 + }, + { + "auxiliary_loss_clip": 0.06371044, + "auxiliary_loss_mlp": 0.01257585, + "balance_loss_clip": 0.06288835, + "balance_loss_mlp": 0.01251058, + "epoch": 0.3461295656094995, + "flos": 72103332545280.0, + "grad_norm": 0.6902951700774806, + "language_loss": 0.62318385, + "learning_rate": 3.040086466790207e-06, + "loss": 0.69947016, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.06536865, + "step": 5757, + "time_per_iteration": 3.209688901901245 + }, + { + "auxiliary_loss_clip": 0.06363717, + "auxiliary_loss_mlp": 0.01259824, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01253244, + "epoch": 0.34618968886216744, + "flos": 65477913408000.0, + "grad_norm": 0.8114970964410039, + "language_loss": 0.59130025, + "learning_rate": 3.039753792295362e-06, + "loss": 0.66753566, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06591797, + "step": 5758, + "time_per_iteration": 3.139495372772217 + }, + { + "auxiliary_loss_clip": 0.06467785, + "auxiliary_loss_mlp": 0.01274731, + "balance_loss_clip": 0.06288655, + "balance_loss_mlp": 0.01259747, + "epoch": 0.3462498121148354, + "flos": 23478508293120.0, + "grad_norm": 1.7665020183034759, + "language_loss": 0.72321635, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.80064148, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5759, + "time_per_iteration": 2.575479745864868 + }, + { + "auxiliary_loss_clip": 0.06456805, + "auxiliary_loss_mlp": 0.01274415, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01258632, + "epoch": 0.3463099353675034, + "flos": 24177711888000.0, + "grad_norm": 1.8760422141660649, + "language_loss": 0.83568478, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.91299695, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15771484, + "step": 5760, + "time_per_iteration": 2.5610272884368896 + }, + { + "auxiliary_loss_clip": 0.06358143, + "auxiliary_loss_mlp": 0.01257449, + "balance_loss_clip": 0.06276596, + "balance_loss_mlp": 0.0125125, + "epoch": 0.34637005862017134, + "flos": 63716773893120.0, + "grad_norm": 0.8043642187655193, + "language_loss": 0.56576806, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.64192402, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.06195068, + "step": 5761, + "time_per_iteration": 3.2343695163726807 + }, + { + "auxiliary_loss_clip": 0.06453449, + "auxiliary_loss_mlp": 0.01270941, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01256457, + "epoch": 0.3464301818728393, + "flos": 13149513089280.0, + "grad_norm": 1.936786863895872, + "language_loss": 0.9549523, + "learning_rate": 3.038422700166474e-06, + "loss": 1.03219616, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14477539, + "step": 5762, + "time_per_iteration": 2.496039390563965 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01276759, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01260928, + "epoch": 0.34649030512550727, + "flos": 29322936650880.0, + "grad_norm": 1.870020160295256, + "language_loss": 0.69913763, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77657849, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.15820312, + "step": 5763, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.06466965, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06278971, + "balance_loss_mlp": 0.01253922, + "epoch": 0.34655042837817523, + "flos": 23737385082240.0, + "grad_norm": 1.7922805842181977, + "language_loss": 0.83863467, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.9160139, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17028809, + "step": 5764, + "time_per_iteration": 2.634692668914795 + }, + { + "auxiliary_loss_clip": 0.06459094, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06279744, + "balance_loss_mlp": 0.01259263, + "epoch": 0.34661055163084326, + "flos": 22060716082560.0, + "grad_norm": 2.9007104109569943, + "language_loss": 0.67647815, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.75381392, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15234375, + "step": 5765, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.06460512, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06280708, + "balance_loss_mlp": 0.01262233, + "epoch": 0.3466706748835112, + "flos": 21805738508160.0, + "grad_norm": 3.5961884004183426, + "language_loss": 0.77947313, + "learning_rate": 3.03709097800413e-06, + "loss": 0.85684741, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.14709473, + "step": 5766, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06460432, + "auxiliary_loss_mlp": 0.01274096, + "balance_loss_clip": 0.06278767, + "balance_loss_mlp": 0.01260614, + "epoch": 0.3467307981361792, + "flos": 19467405342720.0, + "grad_norm": 1.5497773141022704, + "language_loss": 0.73886019, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.8162055, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.13500977, + "step": 5767, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.06461183, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06279645, + "balance_loss_mlp": 0.01260107, + "epoch": 0.34679092138884715, + "flos": 24834470590080.0, + "grad_norm": 2.0350854996297696, + "language_loss": 0.78955162, + "learning_rate": 3.036424880912893e-06, + "loss": 0.86692369, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15917969, + "step": 5768, + "time_per_iteration": 2.5747995376586914 + }, + { + "auxiliary_loss_clip": 0.06369781, + "auxiliary_loss_mlp": 0.01257254, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.01251723, + "epoch": 0.3468510446415151, + "flos": 63253791757440.0, + "grad_norm": 0.7431238132649503, + "language_loss": 0.57319033, + "learning_rate": 3.036091773408956e-06, + "loss": 0.64946061, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.05535889, + "step": 5769, + "time_per_iteration": 3.176074981689453 + }, + { + "auxiliary_loss_clip": 0.06479758, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 0.06285711, + "balance_loss_mlp": 0.01260212, + "epoch": 0.3469111678941831, + "flos": 12123984568320.0, + "grad_norm": 2.4016361546378158, + "language_loss": 0.85419703, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.93176699, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5770, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.06372644, + "auxiliary_loss_mlp": 0.01258777, + "balance_loss_clip": 0.0629043, + "balance_loss_mlp": 0.01253087, + "epoch": 0.34697129114685105, + "flos": 65951964282240.0, + "grad_norm": 0.7493725348793998, + "language_loss": 0.59862447, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.67493868, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.05685425, + "step": 5771, + "time_per_iteration": 2.938957691192627 + }, + { + "auxiliary_loss_clip": 0.0646434, + "auxiliary_loss_mlp": 0.012787, + "balance_loss_clip": 0.06282143, + "balance_loss_mlp": 0.01263572, + "epoch": 0.347031414399519, + "flos": 34461914284800.0, + "grad_norm": 1.9396999801577832, + "language_loss": 0.72527683, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80270731, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15136719, + "step": 5772, + "time_per_iteration": 2.6529078483581543 + }, + { + "auxiliary_loss_clip": 0.06462972, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06281382, + "balance_loss_mlp": 0.01256246, + "epoch": 0.347091537652187, + "flos": 26951592176640.0, + "grad_norm": 1.5709710398058576, + "language_loss": 0.76695967, + "learning_rate": 3.034758950632507e-06, + "loss": 0.84431112, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15924072, + "step": 5773, + "time_per_iteration": 2.5785317420959473 + }, + { + "auxiliary_loss_clip": 0.06466497, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.06280655, + "balance_loss_mlp": 0.01255366, + "epoch": 0.34715166090485494, + "flos": 21148602462720.0, + "grad_norm": 2.4326309651076463, + "language_loss": 0.70796078, + "learning_rate": 3.034425646811396e-06, + "loss": 0.78533834, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15893555, + "step": 5774, + "time_per_iteration": 2.5585873126983643 + }, + { + "auxiliary_loss_clip": 0.06458526, + "auxiliary_loss_mlp": 0.01271942, + "balance_loss_clip": 0.06278332, + "balance_loss_mlp": 0.01256707, + "epoch": 0.3472117841575229, + "flos": 23484881203200.0, + "grad_norm": 2.2084812675777474, + "language_loss": 0.76485682, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.84216148, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15228271, + "step": 5775, + "time_per_iteration": 2.5899477005004883 + }, + { + "auxiliary_loss_clip": 0.06472419, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281743, + "balance_loss_mlp": 0.01255163, + "epoch": 0.34727190741019087, + "flos": 17498428974720.0, + "grad_norm": 2.2070819655775282, + "language_loss": 0.7869916, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.86442757, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16009521, + "step": 5776, + "time_per_iteration": 2.5874037742614746 + }, + { + "auxiliary_loss_clip": 0.0636313, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06280468, + "balance_loss_mlp": 0.0125983, + "epoch": 0.34733203066285884, + "flos": 65287350495360.0, + "grad_norm": 0.8333293277096808, + "language_loss": 0.63448966, + "learning_rate": 3.033425500045478e-06, + "loss": 0.710774, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.05477905, + "step": 5777, + "time_per_iteration": 3.168325185775757 + }, + { + "auxiliary_loss_clip": 0.0646584, + "auxiliary_loss_mlp": 0.01270867, + "balance_loss_clip": 0.06279471, + "balance_loss_mlp": 0.01255048, + "epoch": 0.3473921539155268, + "flos": 28666429511040.0, + "grad_norm": 3.258496862714712, + "language_loss": 0.65075529, + "learning_rate": 3.033092039398119e-06, + "loss": 0.72812235, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15808105, + "step": 5778, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.06467149, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 0.06278305, + "balance_loss_mlp": 0.0125633, + "epoch": 0.3474522771681948, + "flos": 40845284104320.0, + "grad_norm": 1.7195764072446118, + "language_loss": 0.722601, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.79998595, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.15008545, + "step": 5779, + "time_per_iteration": 2.6901330947875977 + }, + { + "auxiliary_loss_clip": 0.06474127, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06282836, + "balance_loss_mlp": 0.01259092, + "epoch": 0.3475124004208628, + "flos": 24615564998400.0, + "grad_norm": 2.601451729132101, + "language_loss": 0.62399209, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.70149052, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.1663208, + "step": 5780, + "time_per_iteration": 2.5493476390838623 + }, + { + "auxiliary_loss_clip": 0.0647147, + "auxiliary_loss_mlp": 0.01271785, + "balance_loss_clip": 0.06285025, + "balance_loss_mlp": 0.01256264, + "epoch": 0.34757252367353075, + "flos": 22717977909120.0, + "grad_norm": 3.4183593986527043, + "language_loss": 0.72164977, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.79908228, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.15515137, + "step": 5781, + "time_per_iteration": 2.610198974609375 + }, + { + "auxiliary_loss_clip": 0.06471756, + "auxiliary_loss_mlp": 0.01273476, + "balance_loss_clip": 0.06282213, + "balance_loss_mlp": 0.01257228, + "epoch": 0.3476326469261987, + "flos": 19834246517760.0, + "grad_norm": 2.4264406265191325, + "language_loss": 0.77686667, + "learning_rate": 3.031757805185612e-06, + "loss": 0.85431898, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16235352, + "step": 5782, + "time_per_iteration": 3.918602705001831 + }, + { + "auxiliary_loss_clip": 0.06470296, + "auxiliary_loss_mlp": 0.01277549, + "balance_loss_clip": 0.0628626, + "balance_loss_mlp": 0.01262695, + "epoch": 0.3476927701788667, + "flos": 19944265328640.0, + "grad_norm": 2.639685157679876, + "language_loss": 0.63410383, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7115823, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14855957, + "step": 5783, + "time_per_iteration": 4.021190881729126 + }, + { + "auxiliary_loss_clip": 0.06469369, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 0.06290524, + "balance_loss_mlp": 0.01264121, + "epoch": 0.34775289343153465, + "flos": 20740448424960.0, + "grad_norm": 1.686879732071426, + "language_loss": 0.89054763, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9680202, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 5784, + "time_per_iteration": 2.553847074508667 + }, + { + "auxiliary_loss_clip": 0.06470798, + "auxiliary_loss_mlp": 0.01275566, + "balance_loss_clip": 0.06289466, + "balance_loss_mlp": 0.01260903, + "epoch": 0.3478130166842026, + "flos": 19360992257280.0, + "grad_norm": 1.643062521609265, + "language_loss": 0.82068878, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89815247, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.14672852, + "step": 5785, + "time_per_iteration": 2.5452024936676025 + }, + { + "auxiliary_loss_clip": 0.06472684, + "auxiliary_loss_mlp": 0.01281071, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01267004, + "epoch": 0.3478731399368706, + "flos": 22057194211200.0, + "grad_norm": 1.6654216237849466, + "language_loss": 0.80731958, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.88485718, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.14074707, + "step": 5786, + "time_per_iteration": 4.040801286697388 + }, + { + "auxiliary_loss_clip": 0.06468868, + "auxiliary_loss_mlp": 0.01275893, + "balance_loss_clip": 0.06289011, + "balance_loss_mlp": 0.01260515, + "epoch": 0.34793326318953854, + "flos": 18047390999040.0, + "grad_norm": 1.5833193798509506, + "language_loss": 0.75743961, + "learning_rate": 3.030089132216836e-06, + "loss": 0.83488721, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15368652, + "step": 5787, + "time_per_iteration": 2.5231845378875732 + }, + { + "auxiliary_loss_clip": 0.06470607, + "auxiliary_loss_mlp": 0.01273428, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01259111, + "epoch": 0.3479933864422065, + "flos": 29322349672320.0, + "grad_norm": 1.5447805606313796, + "language_loss": 0.81661141, + "learning_rate": 3.029755280389203e-06, + "loss": 0.89405167, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14312744, + "step": 5788, + "time_per_iteration": 2.5828304290771484 + }, + { + "auxiliary_loss_clip": 0.064804, + "auxiliary_loss_mlp": 0.01277805, + "balance_loss_clip": 0.06290662, + "balance_loss_mlp": 0.01261831, + "epoch": 0.3480535096948745, + "flos": 20126931229440.0, + "grad_norm": 1.9688082680528027, + "language_loss": 0.85984367, + "learning_rate": 3.029421389513147e-06, + "loss": 0.93742573, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.15979004, + "step": 5789, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127695, + "balance_loss_clip": 0.06292568, + "balance_loss_mlp": 0.0126178, + "epoch": 0.34811363294754244, + "flos": 18554453182080.0, + "grad_norm": 1.6869236803506542, + "language_loss": 0.84773821, + "learning_rate": 3.029087459601328e-06, + "loss": 0.92530012, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15185547, + "step": 5790, + "time_per_iteration": 3.942929983139038 + }, + { + "auxiliary_loss_clip": 0.06469919, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.0628828, + "balance_loss_mlp": 0.01259465, + "epoch": 0.3481737562002104, + "flos": 26877603421440.0, + "grad_norm": 1.9257745343225423, + "language_loss": 0.81410027, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.89154327, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14904785, + "step": 5791, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.06478444, + "auxiliary_loss_mlp": 0.01278573, + "balance_loss_clip": 0.0629065, + "balance_loss_mlp": 0.01263356, + "epoch": 0.3482338794528784, + "flos": 28915495372800.0, + "grad_norm": 1.656722788090249, + "language_loss": 0.78119808, + "learning_rate": 3.028419482721056e-06, + "loss": 0.85876822, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.15216064, + "step": 5792, + "time_per_iteration": 2.5784294605255127 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01255989, + "epoch": 0.3482940027055464, + "flos": 22207393854720.0, + "grad_norm": 1.5928062225109956, + "language_loss": 0.82187879, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89930081, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.14575195, + "step": 5793, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.06472721, + "auxiliary_loss_mlp": 0.01275633, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01259438, + "epoch": 0.34835412595821436, + "flos": 20308884370560.0, + "grad_norm": 1.8552979095996294, + "language_loss": 0.7616328, + "learning_rate": 3.027751349849706e-06, + "loss": 0.83911633, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.1618042, + "step": 5794, + "time_per_iteration": 2.548841953277588 + }, + { + "auxiliary_loss_clip": 0.06468202, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 0.06286102, + "balance_loss_mlp": 0.01262271, + "epoch": 0.3484142492108823, + "flos": 20456065267200.0, + "grad_norm": 2.5979910850639336, + "language_loss": 0.57406038, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65151387, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.14868164, + "step": 5795, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.06465806, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 0.06285395, + "balance_loss_mlp": 0.01257469, + "epoch": 0.3484743724635503, + "flos": 24359832737280.0, + "grad_norm": 1.8988060542741243, + "language_loss": 0.83093596, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90830439, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.13580322, + "step": 5796, + "time_per_iteration": 2.5901992321014404 + }, + { + "auxiliary_loss_clip": 0.06459932, + "auxiliary_loss_mlp": 0.01272067, + "balance_loss_clip": 0.06285086, + "balance_loss_mlp": 0.01258692, + "epoch": 0.34853449571621825, + "flos": 24359916591360.0, + "grad_norm": 1.6441838604480552, + "language_loss": 0.83544898, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.91276896, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13378906, + "step": 5797, + "time_per_iteration": 2.5595455169677734 + }, + { + "auxiliary_loss_clip": 0.06466283, + "auxiliary_loss_mlp": 0.01269705, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01256055, + "epoch": 0.3485946189688862, + "flos": 27274395231360.0, + "grad_norm": 1.5517160717894904, + "language_loss": 0.73727238, + "learning_rate": 3.026414616539167e-06, + "loss": 0.81463224, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13647461, + "step": 5798, + "time_per_iteration": 2.716830015182495 + }, + { + "auxiliary_loss_clip": 0.06466942, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.0125618, + "epoch": 0.3486547422215542, + "flos": 20162835504000.0, + "grad_norm": 1.8098383323780278, + "language_loss": 0.76806593, + "learning_rate": 3.026080335875485e-06, + "loss": 0.84544736, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15014648, + "step": 5799, + "time_per_iteration": 2.550356149673462 + }, + { + "auxiliary_loss_clip": 0.06464861, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06284796, + "balance_loss_mlp": 0.01253735, + "epoch": 0.34871486547422215, + "flos": 20236614624000.0, + "grad_norm": 2.6888551620055363, + "language_loss": 0.75880742, + "learning_rate": 3.025746016302734e-06, + "loss": 0.83612871, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 5800, + "time_per_iteration": 2.559406042098999 + }, + { + "auxiliary_loss_clip": 0.06468332, + "auxiliary_loss_mlp": 0.01272895, + "balance_loss_clip": 0.06284243, + "balance_loss_mlp": 0.01258375, + "epoch": 0.3487749887268901, + "flos": 44063096924160.0, + "grad_norm": 1.6752863637060063, + "language_loss": 0.67620414, + "learning_rate": 3.025411657833591e-06, + "loss": 0.75361645, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14538574, + "step": 5801, + "time_per_iteration": 2.7286293506622314 + }, + { + "auxiliary_loss_clip": 0.064619, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01253406, + "epoch": 0.3488351119795581, + "flos": 23301921813120.0, + "grad_norm": 1.7427843167651098, + "language_loss": 0.76900619, + "learning_rate": 3.025077260480735e-06, + "loss": 0.84630978, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15075684, + "step": 5802, + "time_per_iteration": 2.5632455348968506 + }, + { + "auxiliary_loss_clip": 0.0645422, + "auxiliary_loss_mlp": 0.01273067, + "balance_loss_clip": 0.06281535, + "balance_loss_mlp": 0.01260109, + "epoch": 0.34889523523222604, + "flos": 19940449968000.0, + "grad_norm": 1.7168444943641856, + "language_loss": 0.79347479, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87074769, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12957764, + "step": 5803, + "time_per_iteration": 2.5202274322509766 + }, + { + "auxiliary_loss_clip": 0.06462935, + "auxiliary_loss_mlp": 0.01269017, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01255212, + "epoch": 0.348955358484894, + "flos": 30454123570560.0, + "grad_norm": 2.672940484210586, + "language_loss": 0.67680007, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.75411958, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.13812256, + "step": 5804, + "time_per_iteration": 2.636371374130249 + }, + { + "auxiliary_loss_clip": 0.06455779, + "auxiliary_loss_mlp": 0.01267233, + "balance_loss_clip": 0.06282568, + "balance_loss_mlp": 0.01253989, + "epoch": 0.349015481737562, + "flos": 18005071887360.0, + "grad_norm": 1.776416664420285, + "language_loss": 0.76608741, + "learning_rate": 3.024073835246702e-06, + "loss": 0.84331751, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 5805, + "time_per_iteration": 2.4746642112731934 + }, + { + "auxiliary_loss_clip": 0.06461459, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06281143, + "balance_loss_mlp": 0.0125568, + "epoch": 0.34907560499023, + "flos": 27205815064320.0, + "grad_norm": 2.094620432718779, + "language_loss": 0.67626035, + "learning_rate": 3.023739282485814e-06, + "loss": 0.7535736, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14178467, + "step": 5806, + "time_per_iteration": 2.6109619140625 + }, + { + "auxiliary_loss_clip": 0.06461781, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254596, + "epoch": 0.34913572824289796, + "flos": 30234714854400.0, + "grad_norm": 1.7462714312606824, + "language_loss": 0.71972066, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7970227, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1383667, + "step": 5807, + "time_per_iteration": 2.6023621559143066 + }, + { + "auxiliary_loss_clip": 0.06464535, + "auxiliary_loss_mlp": 0.01272433, + "balance_loss_clip": 0.06279333, + "balance_loss_mlp": 0.01257425, + "epoch": 0.3491958514955659, + "flos": 29979779207040.0, + "grad_norm": 2.0002365662223727, + "language_loss": 0.74799109, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15002441, + "step": 5808, + "time_per_iteration": 2.661327362060547 + }, + { + "auxiliary_loss_clip": 0.0645329, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06278954, + "balance_loss_mlp": 0.0125828, + "epoch": 0.3492559747482339, + "flos": 22789786458240.0, + "grad_norm": 1.539446612060682, + "language_loss": 0.84555626, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.92281115, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.13922119, + "step": 5809, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.06454454, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.0125755, + "epoch": 0.34931609800090185, + "flos": 26075257050240.0, + "grad_norm": 1.9706347482771516, + "language_loss": 0.80724359, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.88449275, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.12921143, + "step": 5810, + "time_per_iteration": 2.583709955215454 + }, + { + "auxiliary_loss_clip": 0.06460047, + "auxiliary_loss_mlp": 0.01274437, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3493762212535698, + "flos": 29249744509440.0, + "grad_norm": 1.580057936247994, + "language_loss": 0.75975537, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.83710015, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.1395874, + "step": 5811, + "time_per_iteration": 2.6304807662963867 + }, + { + "auxiliary_loss_clip": 0.06459605, + "auxiliary_loss_mlp": 0.01268711, + "balance_loss_clip": 0.06280548, + "balance_loss_mlp": 0.01254746, + "epoch": 0.3494363445062378, + "flos": 27133461463680.0, + "grad_norm": 1.6291603050336358, + "language_loss": 0.80527401, + "learning_rate": 3.021731151138386e-06, + "loss": 0.88255721, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.1395874, + "step": 5812, + "time_per_iteration": 2.657989025115967 + }, + { + "auxiliary_loss_clip": 0.06462281, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.0628228, + "balance_loss_mlp": 0.01257179, + "epoch": 0.34949646775890575, + "flos": 12281102173440.0, + "grad_norm": 2.0118644405033463, + "language_loss": 0.701132, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7784636, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.137146, + "step": 5813, + "time_per_iteration": 2.47231388092041 + }, + { + "auxiliary_loss_clip": 0.06457584, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06281666, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3495565910115737, + "flos": 17171265507840.0, + "grad_norm": 1.9224367307793844, + "language_loss": 0.76310062, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.8403852, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.13482666, + "step": 5814, + "time_per_iteration": 2.4967095851898193 + }, + { + "auxiliary_loss_clip": 0.06471042, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.06288652, + "balance_loss_mlp": 0.01257076, + "epoch": 0.3496167142642417, + "flos": 26472342349440.0, + "grad_norm": 1.8186936331307002, + "language_loss": 0.85099685, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92842519, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1472168, + "step": 5815, + "time_per_iteration": 2.597399950027466 + }, + { + "auxiliary_loss_clip": 0.06466906, + "auxiliary_loss_mlp": 0.01275707, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01261712, + "epoch": 0.34967683751690964, + "flos": 17419618609920.0, + "grad_norm": 2.3640337842934565, + "language_loss": 0.78006089, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85748702, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 5816, + "time_per_iteration": 2.5164036750793457 + }, + { + "auxiliary_loss_clip": 0.0646984, + "auxiliary_loss_mlp": 0.01273456, + "balance_loss_clip": 0.06286636, + "balance_loss_mlp": 0.01258692, + "epoch": 0.3497369607695776, + "flos": 22606365870720.0, + "grad_norm": 1.8515414586733512, + "language_loss": 0.59787703, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6753099, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.14764404, + "step": 5817, + "time_per_iteration": 2.542877674102783 + }, + { + "auxiliary_loss_clip": 0.06358884, + "auxiliary_loss_mlp": 0.01261904, + "balance_loss_clip": 0.06277611, + "balance_loss_mlp": 0.01257669, + "epoch": 0.34979708402224563, + "flos": 68548461477120.0, + "grad_norm": 0.858700346008579, + "language_loss": 0.59824663, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.67445457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04238892, + "step": 5818, + "time_per_iteration": 3.1992976665496826 + }, + { + "auxiliary_loss_clip": 0.06459703, + "auxiliary_loss_mlp": 0.01271152, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01257109, + "epoch": 0.3498572072749136, + "flos": 18995660455680.0, + "grad_norm": 1.926998914600137, + "language_loss": 0.83806789, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91537642, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14031982, + "step": 5819, + "time_per_iteration": 2.5241613388061523 + }, + { + "auxiliary_loss_clip": 0.06466879, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.0125493, + "epoch": 0.34991733052758156, + "flos": 27826334075520.0, + "grad_norm": 2.092302610514248, + "language_loss": 0.71273863, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.79009914, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14245605, + "step": 5820, + "time_per_iteration": 2.569838762283325 + }, + { + "auxiliary_loss_clip": 0.06470378, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 0.06288413, + "balance_loss_mlp": 0.01256292, + "epoch": 0.3499774537802495, + "flos": 33592706755200.0, + "grad_norm": 2.4345068466865083, + "language_loss": 0.70581877, + "learning_rate": 3.018716339744759e-06, + "loss": 0.78322828, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14294434, + "step": 5821, + "time_per_iteration": 2.6535534858703613 + }, + { + "auxiliary_loss_clip": 0.06479154, + "auxiliary_loss_mlp": 0.0127118, + "balance_loss_clip": 0.06291604, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3500375770329175, + "flos": 23483413756800.0, + "grad_norm": 1.9533795991074365, + "language_loss": 0.74227631, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.81977963, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16650391, + "step": 5822, + "time_per_iteration": 5.406672716140747 + }, + { + "auxiliary_loss_clip": 0.06470097, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 0.06285684, + "balance_loss_mlp": 0.01257588, + "epoch": 0.35009770028558546, + "flos": 19032067854720.0, + "grad_norm": 2.646032233627204, + "language_loss": 0.7905609, + "learning_rate": 3.018045956403094e-06, + "loss": 0.86799276, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15496826, + "step": 5823, + "time_per_iteration": 2.5048515796661377 + }, + { + "auxiliary_loss_clip": 0.06353101, + "auxiliary_loss_mlp": 0.01254576, + "balance_loss_clip": 0.06271273, + "balance_loss_mlp": 0.01249748, + "epoch": 0.3501578235382534, + "flos": 68371749216000.0, + "grad_norm": 0.6915411290730273, + "language_loss": 0.58945203, + "learning_rate": 3.017710706819298e-06, + "loss": 0.66552877, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.04821777, + "step": 5824, + "time_per_iteration": 3.209726333618164 + }, + { + "auxiliary_loss_clip": 0.06465952, + "auxiliary_loss_mlp": 0.01274281, + "balance_loss_clip": 0.06284555, + "balance_loss_mlp": 0.01258045, + "epoch": 0.3502179467909214, + "flos": 21257153827200.0, + "grad_norm": 3.0621504018438164, + "language_loss": 0.85168576, + "learning_rate": 3.017375418643811e-06, + "loss": 0.92908812, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16223145, + "step": 5825, + "time_per_iteration": 2.513498067855835 + }, + { + "auxiliary_loss_clip": 0.06462917, + "auxiliary_loss_mlp": 0.01268842, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01254275, + "epoch": 0.35027807004358935, + "flos": 11946978817920.0, + "grad_norm": 2.498923152973308, + "language_loss": 0.83643848, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.91375613, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14556885, + "step": 5826, + "time_per_iteration": 3.9313511848449707 + }, + { + "auxiliary_loss_clip": 0.06470059, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284411, + "balance_loss_mlp": 0.01254956, + "epoch": 0.3503381932962573, + "flos": 21477401084160.0, + "grad_norm": 2.100708343809493, + "language_loss": 0.81216669, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88958883, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.17211914, + "step": 5827, + "time_per_iteration": 2.556704044342041 + }, + { + "auxiliary_loss_clip": 0.06462219, + "auxiliary_loss_mlp": 0.01272255, + "balance_loss_clip": 0.06283772, + "balance_loss_mlp": 0.01257473, + "epoch": 0.3503983165489253, + "flos": 21257405389440.0, + "grad_norm": 2.0166313071454858, + "language_loss": 0.71145403, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.78879881, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.14794922, + "step": 5828, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.06467165, + "auxiliary_loss_mlp": 0.01274622, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01257539, + "epoch": 0.35045843980159325, + "flos": 27822644496000.0, + "grad_norm": 1.678964319221545, + "language_loss": 0.79897165, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8763895, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.17077637, + "step": 5829, + "time_per_iteration": 4.086450099945068 + }, + { + "auxiliary_loss_clip": 0.06475446, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 0.06286699, + "balance_loss_mlp": 0.01257988, + "epoch": 0.3505185630542612, + "flos": 25928201934720.0, + "grad_norm": 1.7428196933402165, + "language_loss": 0.72440839, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.80191517, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.17248535, + "step": 5830, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.06461293, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06283247, + "balance_loss_mlp": 0.01259633, + "epoch": 0.35057868630692923, + "flos": 20527999597440.0, + "grad_norm": 2.5118715805025884, + "language_loss": 0.88613749, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.96348894, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14221191, + "step": 5831, + "time_per_iteration": 2.577260732650757 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01258672, + "epoch": 0.3506388095595972, + "flos": 20454849383040.0, + "grad_norm": 2.013142681723478, + "language_loss": 0.78719735, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.86459637, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14868164, + "step": 5832, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.06470136, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06284127, + "balance_loss_mlp": 0.01258536, + "epoch": 0.35069893281226516, + "flos": 23115901749120.0, + "grad_norm": 3.869403317005625, + "language_loss": 0.71628016, + "learning_rate": 3.014691725465008e-06, + "loss": 0.79373109, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.1640625, + "step": 5833, + "time_per_iteration": 2.559213161468506 + }, + { + "auxiliary_loss_clip": 0.06462866, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.01256291, + "epoch": 0.35075905606493313, + "flos": 27279426476160.0, + "grad_norm": 2.081089463640026, + "language_loss": 0.80963689, + "learning_rate": 3.014356090536606e-06, + "loss": 0.88697743, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14892578, + "step": 5834, + "time_per_iteration": 2.6462955474853516 + }, + { + "auxiliary_loss_clip": 0.06469317, + "auxiliary_loss_mlp": 0.0127505, + "balance_loss_clip": 0.06288308, + "balance_loss_mlp": 0.01258634, + "epoch": 0.3508191793176011, + "flos": 19133491622400.0, + "grad_norm": 2.5340357013843566, + "language_loss": 0.84608614, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.92352986, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.1640625, + "step": 5835, + "time_per_iteration": 2.5068061351776123 + }, + { + "auxiliary_loss_clip": 0.06463549, + "auxiliary_loss_mlp": 0.01274357, + "balance_loss_clip": 0.0628426, + "balance_loss_mlp": 0.01259122, + "epoch": 0.35087930257026906, + "flos": 25564798776960.0, + "grad_norm": 1.6798272602016127, + "language_loss": 0.77162683, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.84900588, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15234375, + "step": 5836, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.06462973, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.06285001, + "balance_loss_mlp": 0.01268767, + "epoch": 0.350939425822937, + "flos": 18010061205120.0, + "grad_norm": 1.7914903677000888, + "language_loss": 0.7777887, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.85525942, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15325928, + "step": 5837, + "time_per_iteration": 2.4906866550445557 + }, + { + "auxiliary_loss_clip": 0.06464779, + "auxiliary_loss_mlp": 0.0127724, + "balance_loss_clip": 0.0628402, + "balance_loss_mlp": 0.01261575, + "epoch": 0.350999549075605, + "flos": 22279747455360.0, + "grad_norm": 2.3774474075228995, + "language_loss": 0.68712002, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7645402, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15649414, + "step": 5838, + "time_per_iteration": 2.616330862045288 + }, + { + "auxiliary_loss_clip": 0.06463079, + "auxiliary_loss_mlp": 0.01275242, + "balance_loss_clip": 0.0628327, + "balance_loss_mlp": 0.01259554, + "epoch": 0.35105967232827295, + "flos": 14397511000320.0, + "grad_norm": 2.135026117356547, + "language_loss": 0.83941519, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.91679841, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15686035, + "step": 5839, + "time_per_iteration": 2.475210428237915 + }, + { + "auxiliary_loss_clip": 0.06472797, + "auxiliary_loss_mlp": 0.01274732, + "balance_loss_clip": 0.06285894, + "balance_loss_mlp": 0.01258376, + "epoch": 0.3511197955809409, + "flos": 25089322383360.0, + "grad_norm": 2.313381638226651, + "language_loss": 0.58970249, + "learning_rate": 3.012341473657572e-06, + "loss": 0.6671778, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.5654497146606445 + }, + { + "auxiliary_loss_clip": 0.06465258, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06280696, + "balance_loss_mlp": 0.0126174, + "epoch": 0.3511799188336089, + "flos": 25891123703040.0, + "grad_norm": 2.5798747861510254, + "language_loss": 0.87567091, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.9531014, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.16040039, + "step": 5841, + "time_per_iteration": 2.5275204181671143 + }, + { + "auxiliary_loss_clip": 0.06473795, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.0628502, + "balance_loss_mlp": 0.01261038, + "epoch": 0.35124004208627685, + "flos": 20089852997760.0, + "grad_norm": 1.7442007932185601, + "language_loss": 0.7546367, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.83215564, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.17077637, + "step": 5842, + "time_per_iteration": 2.5876784324645996 + }, + { + "auxiliary_loss_clip": 0.06465417, + "auxiliary_loss_mlp": 0.01280375, + "balance_loss_clip": 0.06280544, + "balance_loss_mlp": 0.01265105, + "epoch": 0.3513001653389448, + "flos": 17788891553280.0, + "grad_norm": 2.704982383226077, + "language_loss": 0.68951106, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.76696897, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15258789, + "step": 5843, + "time_per_iteration": 2.4710304737091064 + }, + { + "auxiliary_loss_clip": 0.06466319, + "auxiliary_loss_mlp": 0.01285229, + "balance_loss_clip": 0.0628369, + "balance_loss_mlp": 0.01268892, + "epoch": 0.3513602885916128, + "flos": 29394745200000.0, + "grad_norm": 2.1140022916881525, + "language_loss": 0.66181982, + "learning_rate": 3.010997627806655e-06, + "loss": 0.7393353, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.16326904, + "step": 5844, + "time_per_iteration": 2.585793972015381 + }, + { + "auxiliary_loss_clip": 0.06472903, + "auxiliary_loss_mlp": 0.01282408, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01265761, + "epoch": 0.3514204118442808, + "flos": 16185372768000.0, + "grad_norm": 2.0590361589883206, + "language_loss": 0.75743866, + "learning_rate": 3.010661570469245e-06, + "loss": 0.83499175, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.1663208, + "step": 5845, + "time_per_iteration": 2.50748348236084 + }, + { + "auxiliary_loss_clip": 0.06463686, + "auxiliary_loss_mlp": 0.01285129, + "balance_loss_clip": 0.06284383, + "balance_loss_mlp": 0.01270102, + "epoch": 0.35148053509694877, + "flos": 23840234369280.0, + "grad_norm": 5.020955850717412, + "language_loss": 0.73988718, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.8173753, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15032959, + "step": 5846, + "time_per_iteration": 2.626898765563965 + }, + { + "auxiliary_loss_clip": 0.06470932, + "auxiliary_loss_mlp": 0.01280544, + "balance_loss_clip": 0.06285631, + "balance_loss_mlp": 0.01265482, + "epoch": 0.35154065834961673, + "flos": 20996809591680.0, + "grad_norm": 1.7410870567887373, + "language_loss": 0.75501883, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.8325336, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1506958, + "step": 5847, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.06472816, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284919, + "balance_loss_mlp": 0.01257316, + "epoch": 0.3516007816022847, + "flos": 33263866206720.0, + "grad_norm": 1.8955744454716683, + "language_loss": 0.72774404, + "learning_rate": 3.009653168561666e-06, + "loss": 0.80519378, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1484375, + "step": 5848, + "time_per_iteration": 2.6645965576171875 + }, + { + "auxiliary_loss_clip": 0.06467354, + "auxiliary_loss_mlp": 0.01280776, + "balance_loss_clip": 0.06280826, + "balance_loss_mlp": 0.01265124, + "epoch": 0.35166090485495266, + "flos": 11731427389440.0, + "grad_norm": 2.1922530808110983, + "language_loss": 0.90064394, + "learning_rate": 3.009316958003178e-06, + "loss": 0.97812521, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15655518, + "step": 5849, + "time_per_iteration": 2.4567575454711914 + }, + { + "auxiliary_loss_clip": 0.06464183, + "auxiliary_loss_mlp": 0.01272929, + "balance_loss_clip": 0.06281896, + "balance_loss_mlp": 0.01257461, + "epoch": 0.3517210281076206, + "flos": 22645121184000.0, + "grad_norm": 2.4964624006606946, + "language_loss": 0.75405449, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.83142555, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15472412, + "step": 5850, + "time_per_iteration": 2.5980029106140137 + }, + { + "auxiliary_loss_clip": 0.06463099, + "auxiliary_loss_mlp": 0.01274678, + "balance_loss_clip": 0.06282984, + "balance_loss_mlp": 0.01259842, + "epoch": 0.3517811513602886, + "flos": 21328836595200.0, + "grad_norm": 2.0250770904548303, + "language_loss": 0.76385641, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.14825439, + "step": 5851, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.06463097, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06281513, + "balance_loss_mlp": 0.01258933, + "epoch": 0.35184127461295656, + "flos": 21039254484480.0, + "grad_norm": 1.95256002439052, + "language_loss": 0.88133335, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.95871449, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.1607666, + "step": 5852, + "time_per_iteration": 2.571439266204834 + }, + { + "auxiliary_loss_clip": 0.06461711, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01258087, + "epoch": 0.3519013978656245, + "flos": 22461784450560.0, + "grad_norm": 2.1690150127965038, + "language_loss": 0.68480182, + "learning_rate": 3.007971733162737e-06, + "loss": 0.76214981, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5853, + "time_per_iteration": 2.5121214389801025 + }, + { + "auxiliary_loss_clip": 0.06466305, + "auxiliary_loss_mlp": 0.0127272, + "balance_loss_clip": 0.06282477, + "balance_loss_mlp": 0.01256972, + "epoch": 0.3519615211182925, + "flos": 13120317141120.0, + "grad_norm": 2.1084516189193403, + "language_loss": 0.81284809, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.89023829, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15734863, + "step": 5854, + "time_per_iteration": 2.644672155380249 + }, + { + "auxiliary_loss_clip": 0.06456967, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01253481, + "epoch": 0.35202164437096045, + "flos": 19141122343680.0, + "grad_norm": 1.5283351736697255, + "language_loss": 0.73366165, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.13970947, + "step": 5855, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.06458069, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01256717, + "epoch": 0.3520817676236284, + "flos": 26549475632640.0, + "grad_norm": 1.8023400431296785, + "language_loss": 0.71055883, + "learning_rate": 3.006962413152691e-06, + "loss": 0.78785008, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14337158, + "step": 5856, + "time_per_iteration": 2.5643463134765625 + }, + { + "auxiliary_loss_clip": 0.064651, + "auxiliary_loss_mlp": 0.01271649, + "balance_loss_clip": 0.062787, + "balance_loss_mlp": 0.01255663, + "epoch": 0.3521418908762964, + "flos": 44903653557120.0, + "grad_norm": 1.9243906825553334, + "language_loss": 0.61456323, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.69193071, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16003418, + "step": 5857, + "time_per_iteration": 2.723026752471924 + }, + { + "auxiliary_loss_clip": 0.06463988, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 0.06281644, + "balance_loss_mlp": 0.01253569, + "epoch": 0.3522020141289644, + "flos": 20192576503680.0, + "grad_norm": 1.9490734994800325, + "language_loss": 0.73682863, + "learning_rate": 3.006289342204152e-06, + "loss": 0.8141619, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15765381, + "step": 5858, + "time_per_iteration": 2.5245583057403564 + }, + { + "auxiliary_loss_clip": 0.0646653, + "auxiliary_loss_mlp": 0.01270245, + "balance_loss_clip": 0.06283493, + "balance_loss_mlp": 0.01255368, + "epoch": 0.35226213738163237, + "flos": 27571398428160.0, + "grad_norm": 1.5191641480211209, + "language_loss": 0.76385832, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.8412261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.14880371, + "step": 5859, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.06474233, + "auxiliary_loss_mlp": 0.01272168, + "balance_loss_clip": 0.06283402, + "balance_loss_mlp": 0.01256862, + "epoch": 0.35232226063430033, + "flos": 22972955483520.0, + "grad_norm": 2.0210321352313305, + "language_loss": 0.72436023, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.80182427, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.15307617, + "step": 5860, + "time_per_iteration": 2.557419776916504 + }, + { + "auxiliary_loss_clip": 0.06468037, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06280215, + "balance_loss_mlp": 0.01257304, + "epoch": 0.3523823838869683, + "flos": 19173714382080.0, + "grad_norm": 2.1675794505809076, + "language_loss": 0.66646308, + "learning_rate": 3.005279449623811e-06, + "loss": 0.74387354, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.15704346, + "step": 5861, + "time_per_iteration": 5.330287218093872 + }, + { + "auxiliary_loss_clip": 0.06464717, + "auxiliary_loss_mlp": 0.01272322, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01257331, + "epoch": 0.35244250713963626, + "flos": 17936743282560.0, + "grad_norm": 1.8073030876467324, + "language_loss": 0.67339319, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.7507636, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.15002441, + "step": 5862, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.06465253, + "auxiliary_loss_mlp": 0.01277428, + "balance_loss_clip": 0.06279148, + "balance_loss_mlp": 0.01260775, + "epoch": 0.35250263039230423, + "flos": 21438687697920.0, + "grad_norm": 2.06594301339393, + "language_loss": 0.76956195, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.8469888, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16650391, + "step": 5863, + "time_per_iteration": 2.5614800453186035 + }, + { + "auxiliary_loss_clip": 0.06466909, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 0.06283094, + "balance_loss_mlp": 0.01255846, + "epoch": 0.3525627536449722, + "flos": 27424133677440.0, + "grad_norm": 1.7204880099735786, + "language_loss": 0.75455201, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83192563, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.14605713, + "step": 5864, + "time_per_iteration": 2.590428113937378 + }, + { + "auxiliary_loss_clip": 0.06465425, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06283714, + "balance_loss_mlp": 0.01257306, + "epoch": 0.35262287689764016, + "flos": 24796637671680.0, + "grad_norm": 2.274548371802061, + "language_loss": 0.79325253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.87062526, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14550781, + "step": 5865, + "time_per_iteration": 4.090251922607422 + }, + { + "auxiliary_loss_clip": 0.06479216, + "auxiliary_loss_mlp": 0.01273849, + "balance_loss_clip": 0.06290671, + "balance_loss_mlp": 0.01257935, + "epoch": 0.3526830001503081, + "flos": 17827353377280.0, + "grad_norm": 3.6346687905375155, + "language_loss": 0.81561065, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.89314139, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15917969, + "step": 5866, + "time_per_iteration": 2.5417611598968506 + }, + { + "auxiliary_loss_clip": 0.06481875, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06287797, + "balance_loss_mlp": 0.01255226, + "epoch": 0.3527431234029761, + "flos": 18084091887360.0, + "grad_norm": 2.1275369997353692, + "language_loss": 0.84947896, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9270227, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17272949, + "step": 5867, + "time_per_iteration": 2.487138509750366 + }, + { + "auxiliary_loss_clip": 0.06472977, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 0.06285943, + "balance_loss_mlp": 0.01254431, + "epoch": 0.35280324665564405, + "flos": 19433429712000.0, + "grad_norm": 2.157782607866355, + "language_loss": 0.74828005, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82571352, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15942383, + "step": 5868, + "time_per_iteration": 2.536522150039673 + }, + { + "auxiliary_loss_clip": 0.06471637, + "auxiliary_loss_mlp": 0.01277122, + "balance_loss_clip": 0.06284134, + "balance_loss_mlp": 0.01260277, + "epoch": 0.352863369908312, + "flos": 21509951195520.0, + "grad_norm": 2.023756469283546, + "language_loss": 0.6153, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.69278765, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16845703, + "step": 5869, + "time_per_iteration": 3.977250099182129 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06282457, + "balance_loss_mlp": 0.01259985, + "epoch": 0.35292349316098, + "flos": 22316029073280.0, + "grad_norm": 3.8155591266042173, + "language_loss": 0.75253737, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.1541748, + "step": 5870, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.06466261, + "auxiliary_loss_mlp": 0.01271259, + "balance_loss_clip": 0.06282211, + "balance_loss_mlp": 0.01255964, + "epoch": 0.352983616413648, + "flos": 33118152756480.0, + "grad_norm": 1.8217533687724534, + "language_loss": 0.72204906, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79942429, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.1529541, + "step": 5871, + "time_per_iteration": 2.660351037979126 + }, + { + "auxiliary_loss_clip": 0.06463222, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.0628562, + "balance_loss_mlp": 0.01257389, + "epoch": 0.35304373966631597, + "flos": 18702388765440.0, + "grad_norm": 1.8432981727531608, + "language_loss": 0.73899144, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.81633162, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.13415527, + "step": 5872, + "time_per_iteration": 2.501868724822998 + }, + { + "auxiliary_loss_clip": 0.06467956, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255163, + "epoch": 0.35310386291898394, + "flos": 23371214739840.0, + "grad_norm": 1.6596154000518588, + "language_loss": 0.83059716, + "learning_rate": 3.001236451924089e-06, + "loss": 0.90797222, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.14404297, + "step": 5873, + "time_per_iteration": 2.6044130325317383 + }, + { + "auxiliary_loss_clip": 0.06475792, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 0.06285458, + "balance_loss_mlp": 0.0125879, + "epoch": 0.3531639861716519, + "flos": 24468803372160.0, + "grad_norm": 2.6977932070351183, + "language_loss": 0.65726781, + "learning_rate": 3.000899288359104e-06, + "loss": 0.73477674, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16308594, + "step": 5874, + "time_per_iteration": 2.558915138244629 + }, + { + "auxiliary_loss_clip": 0.06370112, + "auxiliary_loss_mlp": 0.01273024, + "balance_loss_clip": 0.06287491, + "balance_loss_mlp": 0.01268941, + "epoch": 0.35322410942431987, + "flos": 70331040437760.0, + "grad_norm": 0.7490717453474699, + "language_loss": 0.616135, + "learning_rate": 3.000562086839917e-06, + "loss": 0.69256639, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.04083252, + "step": 5875, + "time_per_iteration": 3.1286721229553223 + }, + { + "auxiliary_loss_clip": 0.06475496, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.06289661, + "balance_loss_mlp": 0.01262086, + "epoch": 0.35328423267698783, + "flos": 19825735328640.0, + "grad_norm": 2.073373185113386, + "language_loss": 0.8042345, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.88176548, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15509033, + "step": 5876, + "time_per_iteration": 2.5174875259399414 + }, + { + "auxiliary_loss_clip": 0.063563, + "auxiliary_loss_mlp": 0.01261292, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.01257364, + "epoch": 0.3533443559296558, + "flos": 60843398480640.0, + "grad_norm": 0.6578323239794136, + "language_loss": 0.56720114, + "learning_rate": 2.999887569990088e-06, + "loss": 0.64337707, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.03924561, + "step": 5877, + "time_per_iteration": 3.239800214767456 + }, + { + "auxiliary_loss_clip": 0.0647119, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06286252, + "balance_loss_mlp": 0.01259301, + "epoch": 0.35340447918232376, + "flos": 24762997457280.0, + "grad_norm": 1.7728898292153, + "language_loss": 0.72425848, + "learning_rate": 2.999550254685024e-06, + "loss": 0.80172646, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16308594, + "step": 5878, + "time_per_iteration": 2.576354742050171 + }, + { + "auxiliary_loss_clip": 0.06470102, + "auxiliary_loss_mlp": 0.01272441, + "balance_loss_clip": 0.06286008, + "balance_loss_mlp": 0.01256789, + "epoch": 0.3534646024349917, + "flos": 21802342417920.0, + "grad_norm": 2.4353464978664494, + "language_loss": 0.78682542, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.86425084, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.15649414, + "step": 5879, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.06481053, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 0.0628894, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3535247256876597, + "flos": 20018463719040.0, + "grad_norm": 2.0590866059314035, + "language_loss": 0.63551295, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.71304053, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.17980957, + "step": 5880, + "time_per_iteration": 2.5576937198638916 + }, + { + "auxiliary_loss_clip": 0.06481048, + "auxiliary_loss_mlp": 0.01274855, + "balance_loss_clip": 0.06292346, + "balance_loss_mlp": 0.01258035, + "epoch": 0.35358484894032766, + "flos": 18193984917120.0, + "grad_norm": 2.6506562916801273, + "language_loss": 0.66346908, + "learning_rate": 2.998538081402727e-06, + "loss": 0.74102807, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16821289, + "step": 5881, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06465093, + "auxiliary_loss_mlp": 0.01272514, + "balance_loss_clip": 0.06285467, + "balance_loss_mlp": 0.0125818, + "epoch": 0.3536449721929956, + "flos": 22826990471040.0, + "grad_norm": 1.7415962616346485, + "language_loss": 0.75838578, + "learning_rate": 2.998200614562239e-06, + "loss": 0.8357619, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14337158, + "step": 5882, + "time_per_iteration": 2.546163558959961 + }, + { + "auxiliary_loss_clip": 0.06472618, + "auxiliary_loss_mlp": 0.01271877, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01256189, + "epoch": 0.3537050954456636, + "flos": 26439540675840.0, + "grad_norm": 2.210270342508568, + "language_loss": 0.70790988, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78535485, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.15692139, + "step": 5883, + "time_per_iteration": 2.5813896656036377 + }, + { + "auxiliary_loss_clip": 0.06481725, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01258105, + "epoch": 0.3537652186983316, + "flos": 17202096610560.0, + "grad_norm": 3.5308447991949348, + "language_loss": 0.7912811, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.86884505, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.16564941, + "step": 5884, + "time_per_iteration": 2.564178228378296 + }, + { + "auxiliary_loss_clip": 0.06469014, + "auxiliary_loss_mlp": 0.01273424, + "balance_loss_clip": 0.06285414, + "balance_loss_mlp": 0.01258142, + "epoch": 0.3538253419509996, + "flos": 19542861544320.0, + "grad_norm": 3.0890260502514173, + "language_loss": 0.76079619, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83822054, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15283203, + "step": 5885, + "time_per_iteration": 2.5860350131988525 + }, + { + "auxiliary_loss_clip": 0.06473316, + "auxiliary_loss_mlp": 0.01274145, + "balance_loss_clip": 0.06285691, + "balance_loss_mlp": 0.01257766, + "epoch": 0.35388546520366754, + "flos": 12133166590080.0, + "grad_norm": 4.983567417880078, + "language_loss": 0.83563066, + "learning_rate": 2.996850368809606e-06, + "loss": 0.91310525, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16357422, + "step": 5886, + "time_per_iteration": 2.549227714538574 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.0125851, + "epoch": 0.3539455884563355, + "flos": 19683501822720.0, + "grad_norm": 3.219387216821374, + "language_loss": 0.78429639, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.86168945, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16088867, + "step": 5887, + "time_per_iteration": 2.523743152618408 + }, + { + "auxiliary_loss_clip": 0.0646676, + "auxiliary_loss_mlp": 0.0127383, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01258631, + "epoch": 0.35400571170900347, + "flos": 18077006217600.0, + "grad_norm": 1.8956957640615841, + "language_loss": 0.66116667, + "learning_rate": 2.996175019078089e-06, + "loss": 0.7385726, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15197754, + "step": 5888, + "time_per_iteration": 2.5279300212860107 + }, + { + "auxiliary_loss_clip": 0.06467725, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01256185, + "epoch": 0.35406583496167143, + "flos": 26075298977280.0, + "grad_norm": 2.3097601077816443, + "language_loss": 0.76721621, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.84461069, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15527344, + "step": 5889, + "time_per_iteration": 2.564761161804199 + }, + { + "auxiliary_loss_clip": 0.06465457, + "auxiliary_loss_mlp": 0.01270164, + "balance_loss_clip": 0.06283142, + "balance_loss_mlp": 0.01254357, + "epoch": 0.3541259582143394, + "flos": 19798635732480.0, + "grad_norm": 2.1640548649274116, + "language_loss": 0.81408846, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.89144462, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15808105, + "step": 5890, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.06466024, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06285816, + "balance_loss_mlp": 0.01256094, + "epoch": 0.35418608146700736, + "flos": 24028518493440.0, + "grad_norm": 1.6495661544524922, + "language_loss": 0.80017459, + "learning_rate": 2.99516171119991e-06, + "loss": 0.87753654, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.14074707, + "step": 5891, + "time_per_iteration": 2.553158760070801 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01282427, + "balance_loss_clip": 0.06289162, + "balance_loss_mlp": 0.01265928, + "epoch": 0.35424620471967533, + "flos": 12390701713920.0, + "grad_norm": 1.7694155250203176, + "language_loss": 0.73450041, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81204116, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16516113, + "step": 5892, + "time_per_iteration": 2.529136896133423 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01274401, + "balance_loss_clip": 0.06286078, + "balance_loss_mlp": 0.01259059, + "epoch": 0.3543063279723433, + "flos": 19678219015680.0, + "grad_norm": 3.019670501918518, + "language_loss": 0.67408991, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.75154132, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15344238, + "step": 5893, + "time_per_iteration": 2.507456064224243 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.01274247, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01257545, + "epoch": 0.35436645122501126, + "flos": 21915841173120.0, + "grad_norm": 1.8801549379271045, + "language_loss": 0.70079887, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.77824062, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16711426, + "step": 5894, + "time_per_iteration": 2.5596466064453125 + }, + { + "auxiliary_loss_clip": 0.0646911, + "auxiliary_loss_mlp": 0.0127714, + "balance_loss_clip": 0.06291118, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3544265744776792, + "flos": 21724915645440.0, + "grad_norm": 1.8040348457355686, + "language_loss": 0.74516678, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.82262927, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14569092, + "step": 5895, + "time_per_iteration": 2.602884531021118 + }, + { + "auxiliary_loss_clip": 0.06476314, + "auxiliary_loss_mlp": 0.01274747, + "balance_loss_clip": 0.06292941, + "balance_loss_mlp": 0.01259643, + "epoch": 0.3544866977303472, + "flos": 21219278981760.0, + "grad_norm": 1.7647167527567422, + "language_loss": 0.83600783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.91351843, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.15100098, + "step": 5896, + "time_per_iteration": 2.5642035007476807 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01278933, + "balance_loss_clip": 0.06292751, + "balance_loss_mlp": 0.01261576, + "epoch": 0.35454682098301515, + "flos": 29318534311680.0, + "grad_norm": 1.8515152904238923, + "language_loss": 0.70294917, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7804631, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.17346191, + "step": 5897, + "time_per_iteration": 2.613032341003418 + }, + { + "auxiliary_loss_clip": 0.06475735, + "auxiliary_loss_mlp": 0.01274261, + "balance_loss_clip": 0.06293957, + "balance_loss_mlp": 0.01259205, + "epoch": 0.3546069442356832, + "flos": 24323509192320.0, + "grad_norm": 1.6960731630978507, + "language_loss": 0.81964374, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89714372, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15063477, + "step": 5898, + "time_per_iteration": 2.6033098697662354 + }, + { + "auxiliary_loss_clip": 0.06471986, + "auxiliary_loss_mlp": 0.01279895, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.01265173, + "epoch": 0.35466706748835114, + "flos": 22863984848640.0, + "grad_norm": 1.4933011631381068, + "language_loss": 0.74405515, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.82157397, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14727783, + "step": 5899, + "time_per_iteration": 2.5492894649505615 + }, + { + "auxiliary_loss_clip": 0.0647797, + "auxiliary_loss_mlp": 0.01272872, + "balance_loss_clip": 0.06294148, + "balance_loss_mlp": 0.01257196, + "epoch": 0.3547271907410191, + "flos": 28337714743680.0, + "grad_norm": 3.4583325446366673, + "language_loss": 0.80211669, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.87962508, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15698242, + "step": 5900, + "time_per_iteration": 5.435121774673462 + }, + { + "auxiliary_loss_clip": 0.06478105, + "auxiliary_loss_mlp": 0.01279951, + "balance_loss_clip": 0.06296446, + "balance_loss_mlp": 0.01263607, + "epoch": 0.35478731399368707, + "flos": 23520911258880.0, + "grad_norm": 2.0942596894242533, + "language_loss": 0.8216058, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89918637, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16357422, + "step": 5901, + "time_per_iteration": 2.603769540786743 + }, + { + "auxiliary_loss_clip": 0.06480999, + "auxiliary_loss_mlp": 0.01277169, + "balance_loss_clip": 0.06295676, + "balance_loss_mlp": 0.01261899, + "epoch": 0.35484743724635504, + "flos": 18630202872960.0, + "grad_norm": 2.2545917554681663, + "language_loss": 0.75979805, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.83737969, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.152771, + "step": 5902, + "time_per_iteration": 2.5356359481811523 + }, + { + "auxiliary_loss_clip": 0.06482422, + "auxiliary_loss_mlp": 0.01280542, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265331, + "epoch": 0.354907560499023, + "flos": 17390296880640.0, + "grad_norm": 1.6908684001073404, + "language_loss": 0.70729327, + "learning_rate": 2.991105086850381e-06, + "loss": 0.78492296, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15209961, + "step": 5903, + "time_per_iteration": 2.52494478225708 + }, + { + "auxiliary_loss_clip": 0.06482972, + "auxiliary_loss_mlp": 0.01276075, + "balance_loss_clip": 0.06297173, + "balance_loss_mlp": 0.0125929, + "epoch": 0.35496768375169097, + "flos": 19214607974400.0, + "grad_norm": 2.9744492269587153, + "language_loss": 0.75001359, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.82760406, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16784668, + "step": 5904, + "time_per_iteration": 2.5316994190216064 + }, + { + "auxiliary_loss_clip": 0.0648163, + "auxiliary_loss_mlp": 0.01277137, + "balance_loss_clip": 0.06297497, + "balance_loss_mlp": 0.01261902, + "epoch": 0.35502780700435893, + "flos": 18338692118400.0, + "grad_norm": 2.2144866791488536, + "language_loss": 0.78981996, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.86740756, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15246582, + "step": 5905, + "time_per_iteration": 3.9867374897003174 + }, + { + "auxiliary_loss_clip": 0.06472038, + "auxiliary_loss_mlp": 0.01276232, + "balance_loss_clip": 0.06301226, + "balance_loss_mlp": 0.01262249, + "epoch": 0.3550879302570269, + "flos": 15453660988800.0, + "grad_norm": 1.8340819850757704, + "language_loss": 0.72531646, + "learning_rate": 2.990090084284356e-06, + "loss": 0.80279917, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13977051, + "step": 5906, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.06491787, + "auxiliary_loss_mlp": 0.01272032, + "balance_loss_clip": 0.06306198, + "balance_loss_mlp": 0.01256046, + "epoch": 0.35514805350969486, + "flos": 21985343735040.0, + "grad_norm": 1.9483914182465616, + "language_loss": 0.75052631, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.82816458, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15991211, + "step": 5907, + "time_per_iteration": 2.526137113571167 + }, + { + "auxiliary_loss_clip": 0.06486456, + "auxiliary_loss_mlp": 0.01280245, + "balance_loss_clip": 0.06305459, + "balance_loss_mlp": 0.01264271, + "epoch": 0.3552081767623628, + "flos": 29869718469120.0, + "grad_norm": 2.2786495725258424, + "language_loss": 0.76563632, + "learning_rate": 2.989413228164047e-06, + "loss": 0.84330332, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15966797, + "step": 5908, + "time_per_iteration": 4.063998222351074 + }, + { + "auxiliary_loss_clip": 0.06491728, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06310974, + "balance_loss_mlp": 0.0126146, + "epoch": 0.3552683000150308, + "flos": 26439456821760.0, + "grad_norm": 2.352503484530038, + "language_loss": 0.68572766, + "learning_rate": 2.989074743819502e-06, + "loss": 0.76341379, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15429688, + "step": 5909, + "time_per_iteration": 2.6902143955230713 + }, + { + "auxiliary_loss_clip": 0.0648414, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06310885, + "balance_loss_mlp": 0.01268061, + "epoch": 0.35532842326769876, + "flos": 19791088865280.0, + "grad_norm": 1.9680680199916993, + "language_loss": 0.79103023, + "learning_rate": 2.988736221969144e-06, + "loss": 0.86869311, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14086914, + "step": 5910, + "time_per_iteration": 2.535050630569458 + }, + { + "auxiliary_loss_clip": 0.06495271, + "auxiliary_loss_mlp": 0.01274944, + "balance_loss_clip": 0.06310071, + "balance_loss_mlp": 0.0125841, + "epoch": 0.3553885465203668, + "flos": 17245170408960.0, + "grad_norm": 1.607302447744311, + "language_loss": 0.7130779, + "learning_rate": 2.98839766262581e-06, + "loss": 0.79078007, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1652832, + "step": 5911, + "time_per_iteration": 2.572942018508911 + }, + { + "auxiliary_loss_clip": 0.06485709, + "auxiliary_loss_mlp": 0.01272785, + "balance_loss_clip": 0.06309631, + "balance_loss_mlp": 0.01258313, + "epoch": 0.35544866977303474, + "flos": 14938800376320.0, + "grad_norm": 2.1423891041027514, + "language_loss": 0.87973344, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.95731837, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14477539, + "step": 5912, + "time_per_iteration": 2.4826059341430664 + }, + { + "auxiliary_loss_clip": 0.0648666, + "auxiliary_loss_mlp": 0.01278679, + "balance_loss_clip": 0.0630875, + "balance_loss_mlp": 0.0126441, + "epoch": 0.3555087930257027, + "flos": 19762228333440.0, + "grad_norm": 2.0928412919366477, + "language_loss": 0.77506435, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.8527177, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14251709, + "step": 5913, + "time_per_iteration": 2.577362060546875 + }, + { + "auxiliary_loss_clip": 0.06486008, + "auxiliary_loss_mlp": 0.01273445, + "balance_loss_clip": 0.06311025, + "balance_loss_mlp": 0.01258789, + "epoch": 0.3555689162783707, + "flos": 21074445999360.0, + "grad_norm": 5.920108951080063, + "language_loss": 0.82525283, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.90284735, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14660645, + "step": 5914, + "time_per_iteration": 2.521756649017334 + }, + { + "auxiliary_loss_clip": 0.06490604, + "auxiliary_loss_mlp": 0.01268632, + "balance_loss_clip": 0.06310836, + "balance_loss_mlp": 0.01254118, + "epoch": 0.35562903953103864, + "flos": 33077426872320.0, + "grad_norm": 3.2692214801304686, + "language_loss": 0.7113682, + "learning_rate": 2.98704305057949e-06, + "loss": 0.78896052, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14508057, + "step": 5915, + "time_per_iteration": 2.6931562423706055 + }, + { + "auxiliary_loss_clip": 0.06477264, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06297429, + "balance_loss_mlp": 0.01254814, + "epoch": 0.3556891627837066, + "flos": 20564029653120.0, + "grad_norm": 4.458093980019367, + "language_loss": 0.76718718, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84465492, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14697266, + "step": 5916, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.06487325, + "auxiliary_loss_mlp": 0.01272059, + "balance_loss_clip": 0.06307879, + "balance_loss_mlp": 0.01256651, + "epoch": 0.35574928603637457, + "flos": 20709449614080.0, + "grad_norm": 1.674174142445476, + "language_loss": 0.88208687, + "learning_rate": 2.986365519932332e-06, + "loss": 0.95968074, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.1539917, + "step": 5917, + "time_per_iteration": 2.6043195724487305 + }, + { + "auxiliary_loss_clip": 0.0649041, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.0631107, + "balance_loss_mlp": 0.01254289, + "epoch": 0.35580940928904253, + "flos": 15199899298560.0, + "grad_norm": 3.6980401889874086, + "language_loss": 0.75538862, + "learning_rate": 2.98602669849771e-06, + "loss": 0.83299077, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15515137, + "step": 5918, + "time_per_iteration": 2.5186190605163574 + }, + { + "auxiliary_loss_clip": 0.06461592, + "auxiliary_loss_mlp": 0.01285001, + "balance_loss_clip": 0.06381316, + "balance_loss_mlp": 0.01279086, + "epoch": 0.3558695325417105, + "flos": 58656145426560.0, + "grad_norm": 0.8458689331650495, + "language_loss": 0.63255095, + "learning_rate": 2.985687839672857e-06, + "loss": 0.71001691, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.05911255, + "step": 5919, + "time_per_iteration": 2.9552297592163086 + }, + { + "auxiliary_loss_clip": 0.06485933, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 0.06302524, + "balance_loss_mlp": 0.01255998, + "epoch": 0.35592965579437846, + "flos": 22024811808000.0, + "grad_norm": 2.2679396062128188, + "language_loss": 0.74402696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.82160461, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.1583252, + "step": 5920, + "time_per_iteration": 2.54848313331604 + }, + { + "auxiliary_loss_clip": 0.06483243, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01262638, + "epoch": 0.35598977904704643, + "flos": 23374401194880.0, + "grad_norm": 3.1552684799501733, + "language_loss": 0.77735227, + "learning_rate": 2.985010009903857e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15332031, + "step": 5921, + "time_per_iteration": 2.6517810821533203 + }, + { + "auxiliary_loss_clip": 0.06490617, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 0.06309058, + "balance_loss_mlp": 0.01261329, + "epoch": 0.3560499022997144, + "flos": 17791113686400.0, + "grad_norm": 2.349487021583332, + "language_loss": 0.6770314, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.75470436, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15332031, + "step": 5922, + "time_per_iteration": 2.525566577911377 + }, + { + "auxiliary_loss_clip": 0.06484485, + "auxiliary_loss_mlp": 0.0127389, + "balance_loss_clip": 0.06306913, + "balance_loss_mlp": 0.01258524, + "epoch": 0.35611002555238236, + "flos": 20746695553920.0, + "grad_norm": 2.231194122260979, + "language_loss": 0.79304701, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.87063074, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15380859, + "step": 5923, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.06479051, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 0.06301268, + "balance_loss_mlp": 0.01257579, + "epoch": 0.3561701488050504, + "flos": 19468034248320.0, + "grad_norm": 1.61778925366919, + "language_loss": 0.8543126, + "learning_rate": 2.983992985144908e-06, + "loss": 0.93183035, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15148926, + "step": 5924, + "time_per_iteration": 2.524949312210083 + }, + { + "auxiliary_loss_clip": 0.06478724, + "auxiliary_loss_mlp": 0.01271843, + "balance_loss_clip": 0.06301951, + "balance_loss_mlp": 0.01255797, + "epoch": 0.35623027205771834, + "flos": 30783006046080.0, + "grad_norm": 1.9504196686726267, + "language_loss": 0.77609557, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.85360122, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.16033936, + "step": 5925, + "time_per_iteration": 2.6268069744110107 + }, + { + "auxiliary_loss_clip": 0.06472521, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06292735, + "balance_loss_mlp": 0.01258291, + "epoch": 0.3562903953103863, + "flos": 16986461328000.0, + "grad_norm": 1.8072288436418724, + "language_loss": 0.76488966, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.15441895, + "step": 5926, + "time_per_iteration": 2.492009401321411 + }, + { + "auxiliary_loss_clip": 0.064781, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 0.06293385, + "balance_loss_mlp": 0.01255478, + "epoch": 0.3563505185630543, + "flos": 23846271863040.0, + "grad_norm": 2.038892178711472, + "language_loss": 0.69665909, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.77415526, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16046143, + "step": 5927, + "time_per_iteration": 2.555192708969116 + }, + { + "auxiliary_loss_clip": 0.06471409, + "auxiliary_loss_mlp": 0.01273845, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.0125889, + "epoch": 0.35641064181572224, + "flos": 22280040944640.0, + "grad_norm": 1.7768317666214009, + "language_loss": 0.79454333, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.87199581, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.1494751, + "step": 5928, + "time_per_iteration": 2.5192928314208984 + }, + { + "auxiliary_loss_clip": 0.06473258, + "auxiliary_loss_mlp": 0.01271381, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3564707650683902, + "flos": 23007643873920.0, + "grad_norm": 1.230692465633979, + "language_loss": 0.8197661, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89721251, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1494751, + "step": 5929, + "time_per_iteration": 2.6044368743896484 + }, + { + "auxiliary_loss_clip": 0.0646459, + "auxiliary_loss_mlp": 0.01268428, + "balance_loss_clip": 0.06289564, + "balance_loss_mlp": 0.01253765, + "epoch": 0.35653088832105817, + "flos": 14689566806400.0, + "grad_norm": 1.5209281639747478, + "language_loss": 0.70385516, + "learning_rate": 2.981957928520201e-06, + "loss": 0.78118533, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14685059, + "step": 5930, + "time_per_iteration": 2.498253107070923 + }, + { + "auxiliary_loss_clip": 0.06473252, + "auxiliary_loss_mlp": 0.01273096, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01256943, + "epoch": 0.35659101157372614, + "flos": 23483791100160.0, + "grad_norm": 2.174064041384607, + "language_loss": 0.68760598, + "learning_rate": 2.981618622015244e-06, + "loss": 0.76506943, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16162109, + "step": 5931, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.06463969, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06288578, + "balance_loss_mlp": 0.01253788, + "epoch": 0.3566511348263941, + "flos": 26585966885760.0, + "grad_norm": 1.5444695234240167, + "language_loss": 0.68331707, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76064122, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06465189, + "auxiliary_loss_mlp": 0.01272147, + "balance_loss_clip": 0.06290227, + "balance_loss_mlp": 0.01257854, + "epoch": 0.35671125807906207, + "flos": 13119981724800.0, + "grad_norm": 2.4744838507658917, + "language_loss": 0.79635656, + "learning_rate": 2.980939897348969e-06, + "loss": 0.87372994, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14294434, + "step": 5933, + "time_per_iteration": 2.573812961578369 + }, + { + "auxiliary_loss_clip": 0.06470121, + "auxiliary_loss_mlp": 0.01270309, + "balance_loss_clip": 0.06288668, + "balance_loss_mlp": 0.01255372, + "epoch": 0.35677138133173003, + "flos": 33009014413440.0, + "grad_norm": 1.4096936090904761, + "language_loss": 0.69970256, + "learning_rate": 2.980600479213388e-06, + "loss": 0.77710688, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14929199, + "step": 5934, + "time_per_iteration": 2.6381173133850098 + }, + { + "auxiliary_loss_clip": 0.06481285, + "auxiliary_loss_mlp": 0.01277705, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01260741, + "epoch": 0.356831504584398, + "flos": 20784234983040.0, + "grad_norm": 2.103415594097178, + "language_loss": 0.72006869, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.79765862, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16967773, + "step": 5935, + "time_per_iteration": 2.620471954345703 + }, + { + "auxiliary_loss_clip": 0.06467808, + "auxiliary_loss_mlp": 0.01275583, + "balance_loss_clip": 0.06287988, + "balance_loss_mlp": 0.01261004, + "epoch": 0.35689162783706596, + "flos": 12170244821760.0, + "grad_norm": 2.011082803426264, + "language_loss": 0.78423738, + "learning_rate": 2.979921531401692e-06, + "loss": 0.86167133, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14569092, + "step": 5936, + "time_per_iteration": 2.4827091693878174 + }, + { + "auxiliary_loss_clip": 0.06466486, + "auxiliary_loss_mlp": 0.01273239, + "balance_loss_clip": 0.06289199, + "balance_loss_mlp": 0.01258147, + "epoch": 0.356951751089734, + "flos": 23848200506880.0, + "grad_norm": 1.8250890312079233, + "language_loss": 0.64893055, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.72632784, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15100098, + "step": 5937, + "time_per_iteration": 2.5968148708343506 + }, + { + "auxiliary_loss_clip": 0.06470716, + "auxiliary_loss_mlp": 0.01277052, + "balance_loss_clip": 0.06291182, + "balance_loss_mlp": 0.01261644, + "epoch": 0.35701187434240195, + "flos": 11725851093120.0, + "grad_norm": 3.2825373138133633, + "language_loss": 0.79029787, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.86777556, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15429688, + "step": 5938, + "time_per_iteration": 2.4724228382110596 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.01275118, + "balance_loss_clip": 0.06289655, + "balance_loss_mlp": 0.01259835, + "epoch": 0.3570719975950699, + "flos": 24905650233600.0, + "grad_norm": 2.3707612213619624, + "language_loss": 0.80684471, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88429582, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 5939, + "time_per_iteration": 4.067660331726074 + }, + { + "auxiliary_loss_clip": 0.06474897, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.01263357, + "epoch": 0.3571321208477379, + "flos": 26002022981760.0, + "grad_norm": 1.7209958005115653, + "language_loss": 0.79509544, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.8726303, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15228271, + "step": 5940, + "time_per_iteration": 3.961956262588501 + }, + { + "auxiliary_loss_clip": 0.06472583, + "auxiliary_loss_mlp": 0.01274024, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01258223, + "epoch": 0.35719224410040584, + "flos": 14506900905600.0, + "grad_norm": 2.455654522420387, + "language_loss": 0.72918689, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.80665296, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15802002, + "step": 5941, + "time_per_iteration": 2.529376745223999 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06291723, + "balance_loss_mlp": 0.01258577, + "epoch": 0.3572523673530738, + "flos": 31183445508480.0, + "grad_norm": 1.9522398224767823, + "language_loss": 0.64961332, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.72705185, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15667725, + "step": 5942, + "time_per_iteration": 2.6694955825805664 + }, + { + "auxiliary_loss_clip": 0.06470639, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06291504, + "balance_loss_mlp": 0.01257124, + "epoch": 0.3573124906057418, + "flos": 15857496541440.0, + "grad_norm": 1.9232266262089555, + "language_loss": 0.7463761, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.82381314, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.1595459, + "step": 5943, + "time_per_iteration": 2.5988807678222656 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01259877, + "balance_loss_clip": 0.06336363, + "balance_loss_mlp": 0.01254631, + "epoch": 0.35737261385840974, + "flos": 60839163849600.0, + "grad_norm": 0.8122274991603828, + "language_loss": 0.60684133, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.68360829, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.05249023, + "step": 5944, + "time_per_iteration": 3.2639529705047607 + }, + { + "auxiliary_loss_clip": 0.06467592, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06290887, + "balance_loss_mlp": 0.01259464, + "epoch": 0.3574327371110777, + "flos": 18849779297280.0, + "grad_norm": 1.8477550360079977, + "language_loss": 0.7280755, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80549395, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14782715, + "step": 5945, + "time_per_iteration": 3.974971294403076 + }, + { + "auxiliary_loss_clip": 0.06464474, + "auxiliary_loss_mlp": 0.01274521, + "balance_loss_clip": 0.06288721, + "balance_loss_mlp": 0.01259619, + "epoch": 0.35749286036374567, + "flos": 23556354336000.0, + "grad_norm": 1.6530257311602492, + "language_loss": 0.8152287, + "learning_rate": 2.976524564880326e-06, + "loss": 0.89261866, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14892578, + "step": 5946, + "time_per_iteration": 2.567702531814575 + }, + { + "auxiliary_loss_clip": 0.06472433, + "auxiliary_loss_mlp": 0.01275229, + "balance_loss_clip": 0.06292298, + "balance_loss_mlp": 0.01260036, + "epoch": 0.35755298361641363, + "flos": 21111817720320.0, + "grad_norm": 1.4004407917222146, + "language_loss": 0.69023073, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76770723, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.15209961, + "step": 5947, + "time_per_iteration": 2.531938076019287 + }, + { + "auxiliary_loss_clip": 0.06458312, + "auxiliary_loss_mlp": 0.01270008, + "balance_loss_clip": 0.06284653, + "balance_loss_mlp": 0.01256109, + "epoch": 0.3576131068690816, + "flos": 19251099227520.0, + "grad_norm": 2.059659188145791, + "language_loss": 0.75891036, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83619356, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13916016, + "step": 5948, + "time_per_iteration": 3.9236361980438232 + }, + { + "auxiliary_loss_clip": 0.06466205, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 0.06287337, + "balance_loss_mlp": 0.01261462, + "epoch": 0.35767323012174956, + "flos": 28661733682560.0, + "grad_norm": 1.6908098548641093, + "language_loss": 0.71228039, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.78970701, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15002441, + "step": 5949, + "time_per_iteration": 2.56809663772583 + }, + { + "auxiliary_loss_clip": 0.06464282, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06286816, + "balance_loss_mlp": 0.01266995, + "epoch": 0.35773335337441753, + "flos": 17089897593600.0, + "grad_norm": 1.7763817610233048, + "language_loss": 0.77821207, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.85567343, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1484375, + "step": 5950, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.06465182, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.0628643, + "balance_loss_mlp": 0.01261331, + "epoch": 0.35779347662708555, + "flos": 15894155502720.0, + "grad_norm": 2.1549260339424725, + "language_loss": 0.73109937, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.80851334, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14892578, + "step": 5951, + "time_per_iteration": 2.5201168060302734 + }, + { + "auxiliary_loss_clip": 0.06470691, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 0.06287189, + "balance_loss_mlp": 0.01262181, + "epoch": 0.3578535998797535, + "flos": 28666555292160.0, + "grad_norm": 1.9784791605149854, + "language_loss": 0.7026071, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.78009284, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15698242, + "step": 5952, + "time_per_iteration": 2.5931434631347656 + }, + { + "auxiliary_loss_clip": 0.0646029, + "auxiliary_loss_mlp": 0.01277333, + "balance_loss_clip": 0.06284408, + "balance_loss_mlp": 0.01263069, + "epoch": 0.3579137231324215, + "flos": 37861554464640.0, + "grad_norm": 1.6267089711440414, + "language_loss": 0.69578886, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77316511, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14276123, + "step": 5953, + "time_per_iteration": 2.668464422225952 + }, + { + "auxiliary_loss_clip": 0.0645823, + "auxiliary_loss_mlp": 0.01275685, + "balance_loss_clip": 0.06282876, + "balance_loss_mlp": 0.01261117, + "epoch": 0.35797384638508944, + "flos": 22353526575360.0, + "grad_norm": 1.5719996722989455, + "language_loss": 0.67333478, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.75067389, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14562988, + "step": 5954, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.06459846, + "auxiliary_loss_mlp": 0.01278708, + "balance_loss_clip": 0.06287006, + "balance_loss_mlp": 0.0126414, + "epoch": 0.3580339696377574, + "flos": 13594829212800.0, + "grad_norm": 1.8066455981447187, + "language_loss": 0.75335681, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.83074236, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14556885, + "step": 5955, + "time_per_iteration": 2.5049943923950195 + }, + { + "auxiliary_loss_clip": 0.06458074, + "auxiliary_loss_mlp": 0.01270596, + "balance_loss_clip": 0.06286005, + "balance_loss_mlp": 0.01256595, + "epoch": 0.3580940928904254, + "flos": 23774882584320.0, + "grad_norm": 1.7018331496498176, + "language_loss": 0.76155579, + "learning_rate": 2.973123895369182e-06, + "loss": 0.83884245, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14007568, + "step": 5956, + "time_per_iteration": 2.565455675125122 + }, + { + "auxiliary_loss_clip": 0.06456999, + "auxiliary_loss_mlp": 0.01278066, + "balance_loss_clip": 0.06286499, + "balance_loss_mlp": 0.01263415, + "epoch": 0.35815421614309334, + "flos": 19469962892160.0, + "grad_norm": 1.5319401259692025, + "language_loss": 0.73558611, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81293678, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14642334, + "step": 5957, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.064648, + "auxiliary_loss_mlp": 0.01274688, + "balance_loss_clip": 0.06291045, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3582143393957613, + "flos": 23374988173440.0, + "grad_norm": 2.1285308943055727, + "language_loss": 0.71748459, + "learning_rate": 2.972443318242726e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14477539, + "step": 5958, + "time_per_iteration": 2.566181182861328 + }, + { + "auxiliary_loss_clip": 0.06459813, + "auxiliary_loss_mlp": 0.01267621, + "balance_loss_clip": 0.06289116, + "balance_loss_mlp": 0.0125415, + "epoch": 0.35827446264842927, + "flos": 26330528113920.0, + "grad_norm": 1.6357791647016078, + "language_loss": 0.88725436, + "learning_rate": 2.972102974360324e-06, + "loss": 0.96452874, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13452148, + "step": 5959, + "time_per_iteration": 2.6218011379241943 + }, + { + "auxiliary_loss_clip": 0.06463417, + "auxiliary_loss_mlp": 0.01271505, + "balance_loss_clip": 0.06288788, + "balance_loss_mlp": 0.0125816, + "epoch": 0.35833458590109724, + "flos": 30454626695040.0, + "grad_norm": 1.5143701220572547, + "language_loss": 0.58769095, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66504014, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.13348389, + "step": 5960, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.06462947, + "auxiliary_loss_mlp": 0.01269103, + "balance_loss_clip": 0.06286879, + "balance_loss_mlp": 0.01253469, + "epoch": 0.3583947091537652, + "flos": 14835154475520.0, + "grad_norm": 2.541265940729937, + "language_loss": 0.76686686, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.84418738, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15631104, + "step": 5961, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.06464821, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06287968, + "balance_loss_mlp": 0.01261324, + "epoch": 0.35845483240643317, + "flos": 34249213895040.0, + "grad_norm": 1.6475679018941416, + "language_loss": 0.70478481, + "learning_rate": 2.971081721591294e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14099121, + "step": 5962, + "time_per_iteration": 2.6199357509613037 + }, + { + "auxiliary_loss_clip": 0.06464063, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01255207, + "epoch": 0.35851495565910113, + "flos": 20966481613440.0, + "grad_norm": 1.6496872805273144, + "language_loss": 0.75120842, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.82854319, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14221191, + "step": 5963, + "time_per_iteration": 2.5526950359344482 + }, + { + "auxiliary_loss_clip": 0.06467253, + "auxiliary_loss_mlp": 0.01271151, + "balance_loss_clip": 0.06291784, + "balance_loss_mlp": 0.01256322, + "epoch": 0.35857507891176915, + "flos": 22316448343680.0, + "grad_norm": 1.675466861885377, + "language_loss": 0.78945208, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.86683613, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14831543, + "step": 5964, + "time_per_iteration": 2.5257983207702637 + }, + { + "auxiliary_loss_clip": 0.0647264, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06292663, + "balance_loss_mlp": 0.0125726, + "epoch": 0.3586352021644371, + "flos": 23374610830080.0, + "grad_norm": 3.2898914726182684, + "language_loss": 0.667786, + "learning_rate": 2.970060137410626e-06, + "loss": 0.74523282, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.14764404, + "step": 5965, + "time_per_iteration": 2.5664315223693848 + }, + { + "auxiliary_loss_clip": 0.06463271, + "auxiliary_loss_mlp": 0.01271526, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.01256773, + "epoch": 0.3586953254171051, + "flos": 27855655804800.0, + "grad_norm": 1.5935311272675807, + "language_loss": 0.79428947, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87163734, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14746094, + "step": 5966, + "time_per_iteration": 2.576537609100342 + }, + { + "auxiliary_loss_clip": 0.06467331, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01254717, + "epoch": 0.35875544866977305, + "flos": 19506621853440.0, + "grad_norm": 2.077713447457672, + "language_loss": 0.91477883, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.99213958, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14044189, + "step": 5967, + "time_per_iteration": 2.553084135055542 + }, + { + "auxiliary_loss_clip": 0.06466691, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06288824, + "balance_loss_mlp": 0.01257261, + "epoch": 0.358815571922441, + "flos": 21477652646400.0, + "grad_norm": 1.8463229992001005, + "language_loss": 0.80835712, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.88575101, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15435791, + "step": 5968, + "time_per_iteration": 2.526298761367798 + }, + { + "auxiliary_loss_clip": 0.06467028, + "auxiliary_loss_mlp": 0.0127428, + "balance_loss_clip": 0.06287041, + "balance_loss_mlp": 0.012587, + "epoch": 0.358875695175109, + "flos": 21841894344960.0, + "grad_norm": 1.8179824378655614, + "language_loss": 0.84621, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.92362314, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15588379, + "step": 5969, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.0646342, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06288599, + "balance_loss_mlp": 0.01258664, + "epoch": 0.35893581842777694, + "flos": 32019264385920.0, + "grad_norm": 1.8505987075691241, + "language_loss": 0.72233456, + "learning_rate": 2.968356761586202e-06, + "loss": 0.79968911, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13366699, + "step": 5970, + "time_per_iteration": 2.581071615219116 + }, + { + "auxiliary_loss_clip": 0.06468321, + "auxiliary_loss_mlp": 0.01272468, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01258056, + "epoch": 0.3589959416804449, + "flos": 20492137249920.0, + "grad_norm": 1.5610077365233734, + "language_loss": 0.79753757, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.87494546, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14422607, + "step": 5971, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.0646906, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06288019, + "balance_loss_mlp": 0.01255006, + "epoch": 0.3590560649331129, + "flos": 16186295162880.0, + "grad_norm": 1.6291573791515084, + "language_loss": 0.78869599, + "learning_rate": 2.967675154124696e-06, + "loss": 0.86608684, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15026855, + "step": 5972, + "time_per_iteration": 2.4778740406036377 + }, + { + "auxiliary_loss_clip": 0.06465904, + "auxiliary_loss_mlp": 0.01274602, + "balance_loss_clip": 0.06286226, + "balance_loss_mlp": 0.01260201, + "epoch": 0.35911618818578084, + "flos": 20381531460480.0, + "grad_norm": 2.0141455740295875, + "language_loss": 0.81742013, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.89482516, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1439209, + "step": 5973, + "time_per_iteration": 2.532027006149292 + }, + { + "auxiliary_loss_clip": 0.06404248, + "auxiliary_loss_mlp": 0.01258065, + "balance_loss_clip": 0.06324309, + "balance_loss_mlp": 0.01254096, + "epoch": 0.3591763114384488, + "flos": 41250991645440.0, + "grad_norm": 0.9082562918021452, + "language_loss": 0.56514442, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.64176756, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03967285, + "step": 5974, + "time_per_iteration": 3.0029375553131104 + }, + { + "auxiliary_loss_clip": 0.06464389, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06286667, + "balance_loss_mlp": 0.01257781, + "epoch": 0.35923643469111677, + "flos": 18701047100160.0, + "grad_norm": 1.9591615340661908, + "language_loss": 0.69342583, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77078998, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.14227295, + "step": 5975, + "time_per_iteration": 2.5330698490142822 + }, + { + "auxiliary_loss_clip": 0.06462636, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06286036, + "balance_loss_mlp": 0.0125325, + "epoch": 0.35929655794378473, + "flos": 25017010709760.0, + "grad_norm": 1.597565036747504, + "language_loss": 0.8049522, + "learning_rate": 2.96631149897303e-06, + "loss": 0.88225687, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14575195, + "step": 5976, + "time_per_iteration": 2.5599968433380127 + }, + { + "auxiliary_loss_clip": 0.0646351, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06286681, + "balance_loss_mlp": 0.01253489, + "epoch": 0.35935668119645275, + "flos": 14980825998720.0, + "grad_norm": 1.8019140268476472, + "language_loss": 0.79171205, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.86903155, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1496582, + "step": 5977, + "time_per_iteration": 2.4876949787139893 + }, + { + "auxiliary_loss_clip": 0.06459211, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 0.0628271, + "balance_loss_mlp": 0.0125324, + "epoch": 0.3594168044491207, + "flos": 21184422883200.0, + "grad_norm": 1.897291031169604, + "language_loss": 0.80843097, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13342285, + "step": 5978, + "time_per_iteration": 2.5270771980285645 + }, + { + "auxiliary_loss_clip": 0.06458849, + "auxiliary_loss_mlp": 0.01272545, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01257703, + "epoch": 0.3594769277017887, + "flos": 27679446668160.0, + "grad_norm": 1.6570486295636508, + "language_loss": 0.67797875, + "learning_rate": 2.965288372816436e-06, + "loss": 0.75529265, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14819336, + "step": 5979, + "time_per_iteration": 5.427239179611206 + }, + { + "auxiliary_loss_clip": 0.06460471, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06282781, + "balance_loss_mlp": 0.01256323, + "epoch": 0.35953705095445665, + "flos": 23008901685120.0, + "grad_norm": 2.1534655116077928, + "language_loss": 0.67667198, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.75397921, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.13928223, + "step": 5980, + "time_per_iteration": 2.538149833679199 + }, + { + "auxiliary_loss_clip": 0.0647162, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06285568, + "balance_loss_mlp": 0.01257146, + "epoch": 0.3595971742071246, + "flos": 25520005969920.0, + "grad_norm": 2.2162969460708597, + "language_loss": 0.71122372, + "learning_rate": 2.964606105671327e-06, + "loss": 0.78867209, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16064453, + "step": 5981, + "time_per_iteration": 2.5711326599121094 + }, + { + "auxiliary_loss_clip": 0.06464566, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 0.06283125, + "balance_loss_mlp": 0.01256709, + "epoch": 0.3596572974597926, + "flos": 29870431228800.0, + "grad_norm": 2.0278025655936958, + "language_loss": 0.71914935, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.7965194, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.1572876, + "step": 5982, + "time_per_iteration": 2.6292126178741455 + }, + { + "auxiliary_loss_clip": 0.06458835, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 0.06286852, + "balance_loss_mlp": 0.0125428, + "epoch": 0.35971742071246054, + "flos": 23119255912320.0, + "grad_norm": 1.6791573126106523, + "language_loss": 0.7649492, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.84221637, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13598633, + "step": 5983, + "time_per_iteration": 2.540801763534546 + }, + { + "auxiliary_loss_clip": 0.06468493, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01257206, + "epoch": 0.3597775439651285, + "flos": 16730645212800.0, + "grad_norm": 1.651729152091261, + "language_loss": 0.77260226, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85001981, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16052246, + "step": 5984, + "time_per_iteration": 2.5278737545013428 + }, + { + "auxiliary_loss_clip": 0.06458455, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.0628411, + "balance_loss_mlp": 0.01256155, + "epoch": 0.3598376672177965, + "flos": 19725653226240.0, + "grad_norm": 2.0268922239891163, + "language_loss": 0.87093443, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.94822395, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.14355469, + "step": 5985, + "time_per_iteration": 3.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.06461216, + "auxiliary_loss_mlp": 0.01272807, + "balance_loss_clip": 0.06284203, + "balance_loss_mlp": 0.01258109, + "epoch": 0.35989779047046444, + "flos": 17317314374400.0, + "grad_norm": 1.4939910635791536, + "language_loss": 0.72980917, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80714941, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14709473, + "step": 5986, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.06469383, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06283881, + "balance_loss_mlp": 0.01254761, + "epoch": 0.3599579137231324, + "flos": 22717894055040.0, + "grad_norm": 2.903112824764454, + "language_loss": 0.73792106, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.81531143, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.14892578, + "step": 5987, + "time_per_iteration": 3.961486339569092 + }, + { + "auxiliary_loss_clip": 0.06467381, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06286356, + "balance_loss_mlp": 0.01255347, + "epoch": 0.36001803697580037, + "flos": 20966230051200.0, + "grad_norm": 1.8945086710394061, + "language_loss": 0.69721663, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.77459043, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.14648438, + "step": 5988, + "time_per_iteration": 2.5483100414276123 + }, + { + "auxiliary_loss_clip": 0.0647547, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_clip": 0.06292704, + "balance_loss_mlp": 0.01258209, + "epoch": 0.36007816022846834, + "flos": 20491843760640.0, + "grad_norm": 1.7927951606002523, + "language_loss": 0.7305057, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80799592, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15344238, + "step": 5989, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.06467338, + "auxiliary_loss_mlp": 0.01268061, + "balance_loss_clip": 0.06289014, + "balance_loss_mlp": 0.01254173, + "epoch": 0.36013828348113636, + "flos": 28008706487040.0, + "grad_norm": 1.4999082498201763, + "language_loss": 0.80117184, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87852585, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.13897705, + "step": 5990, + "time_per_iteration": 2.6733410358428955 + }, + { + "auxiliary_loss_clip": 0.06464024, + "auxiliary_loss_mlp": 0.01270971, + "balance_loss_clip": 0.0628631, + "balance_loss_mlp": 0.01255938, + "epoch": 0.3601984067338043, + "flos": 20088050135040.0, + "grad_norm": 1.799909646769202, + "language_loss": 0.84338784, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92073774, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15032959, + "step": 5991, + "time_per_iteration": 2.518554925918579 + }, + { + "auxiliary_loss_clip": 0.06474696, + "auxiliary_loss_mlp": 0.01276578, + "balance_loss_clip": 0.06292041, + "balance_loss_mlp": 0.01261367, + "epoch": 0.3602585299864723, + "flos": 18622362516480.0, + "grad_norm": 1.891276760716041, + "language_loss": 0.76406145, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.84157419, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1519165, + "step": 5992, + "time_per_iteration": 2.5224106311798096 + }, + { + "auxiliary_loss_clip": 0.06471405, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06294376, + "balance_loss_mlp": 0.01258496, + "epoch": 0.36031865323914025, + "flos": 19579059308160.0, + "grad_norm": 2.086772991356176, + "language_loss": 0.78120929, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8586548, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14648438, + "step": 5993, + "time_per_iteration": 2.5155129432678223 + }, + { + "auxiliary_loss_clip": 0.06474859, + "auxiliary_loss_mlp": 0.01271898, + "balance_loss_clip": 0.06293729, + "balance_loss_mlp": 0.01257807, + "epoch": 0.3603787764918082, + "flos": 17495871425280.0, + "grad_norm": 1.6487847999674183, + "language_loss": 0.74534261, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.82281017, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14086914, + "step": 5994, + "time_per_iteration": 2.647794723510742 + }, + { + "auxiliary_loss_clip": 0.06474246, + "auxiliary_loss_mlp": 0.01268785, + "balance_loss_clip": 0.06290799, + "balance_loss_mlp": 0.01254415, + "epoch": 0.3604388997444762, + "flos": 15528823701120.0, + "grad_norm": 1.8873654318884407, + "language_loss": 0.69500113, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14361572, + "step": 5995, + "time_per_iteration": 2.501981019973755 + }, + { + "auxiliary_loss_clip": 0.06479774, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06295834, + "balance_loss_mlp": 0.01261688, + "epoch": 0.36049902299714415, + "flos": 17316559687680.0, + "grad_norm": 1.8201062799427143, + "language_loss": 0.8309989, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.90856004, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14642334, + "step": 5996, + "time_per_iteration": 2.551095962524414 + }, + { + "auxiliary_loss_clip": 0.06472808, + "auxiliary_loss_mlp": 0.01275418, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01260493, + "epoch": 0.3605591462498121, + "flos": 17061749821440.0, + "grad_norm": 2.2503529028172804, + "language_loss": 0.73762429, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81510657, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14910889, + "step": 5997, + "time_per_iteration": 2.493100881576538 + }, + { + "auxiliary_loss_clip": 0.06465439, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06288476, + "balance_loss_mlp": 0.0125944, + "epoch": 0.3606192695024801, + "flos": 16842508813440.0, + "grad_norm": 2.0075843423569326, + "language_loss": 0.69582814, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.77322465, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14758301, + "step": 5998, + "time_per_iteration": 2.54227352142334 + }, + { + "auxiliary_loss_clip": 0.06468997, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06292363, + "balance_loss_mlp": 0.01258243, + "epoch": 0.36067939275514804, + "flos": 12134424401280.0, + "grad_norm": 2.607888629955908, + "language_loss": 0.77566224, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.8530767, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 5999, + "time_per_iteration": 2.456887722015381 + }, + { + "auxiliary_loss_clip": 0.06474666, + "auxiliary_loss_mlp": 0.01272087, + "balance_loss_clip": 0.06294585, + "balance_loss_mlp": 0.01257776, + "epoch": 0.360739516007816, + "flos": 18047390999040.0, + "grad_norm": 1.725953097254869, + "language_loss": 0.78777629, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.86524385, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14300537, + "step": 6000, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.06471578, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06294253, + "balance_loss_mlp": 0.01257854, + "epoch": 0.360799639260484, + "flos": 18555417504000.0, + "grad_norm": 1.7389483603698193, + "language_loss": 0.78602117, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86345226, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13671875, + "step": 6001, + "time_per_iteration": 2.4887304306030273 + }, + { + "auxiliary_loss_clip": 0.06462014, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.012549, + "epoch": 0.36085976251315194, + "flos": 19688029943040.0, + "grad_norm": 2.5640130860082206, + "language_loss": 0.83264118, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.90995204, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14178467, + "step": 6002, + "time_per_iteration": 2.523263931274414 + }, + { + "auxiliary_loss_clip": 0.06462792, + "auxiliary_loss_mlp": 0.01274754, + "balance_loss_clip": 0.06293326, + "balance_loss_mlp": 0.01262034, + "epoch": 0.3609198857658199, + "flos": 24204476067840.0, + "grad_norm": 2.058215255218527, + "language_loss": 0.91365647, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.991032, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.12713623, + "step": 6003, + "time_per_iteration": 2.5147922039031982 + }, + { + "auxiliary_loss_clip": 0.06424739, + "auxiliary_loss_mlp": 0.01257394, + "balance_loss_clip": 0.06345953, + "balance_loss_mlp": 0.01254351, + "epoch": 0.3609800090184879, + "flos": 57134288044800.0, + "grad_norm": 0.8495896975763515, + "language_loss": 0.53457719, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.61139846, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03041077, + "step": 6004, + "time_per_iteration": 3.1006038188934326 + }, + { + "auxiliary_loss_clip": 0.06473242, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06291834, + "balance_loss_mlp": 0.0125549, + "epoch": 0.3610401322711559, + "flos": 20817120510720.0, + "grad_norm": 1.7032625156204924, + "language_loss": 0.78291458, + "learning_rate": 2.956407517225883e-06, + "loss": 0.86035228, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15026855, + "step": 6005, + "time_per_iteration": 2.507681369781494 + }, + { + "auxiliary_loss_clip": 0.06466124, + "auxiliary_loss_mlp": 0.01274708, + "balance_loss_clip": 0.06289654, + "balance_loss_mlp": 0.01260373, + "epoch": 0.36110025552382385, + "flos": 13704302972160.0, + "grad_norm": 1.9788670063291258, + "language_loss": 0.79365236, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87106061, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14349365, + "step": 6006, + "time_per_iteration": 2.6221675872802734 + }, + { + "auxiliary_loss_clip": 0.06467897, + "auxiliary_loss_mlp": 0.01276481, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01260317, + "epoch": 0.3611603787764918, + "flos": 22461490961280.0, + "grad_norm": 1.8947484153914913, + "language_loss": 0.84532005, + "learning_rate": 2.955723356106876e-06, + "loss": 0.92276382, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.16162109, + "step": 6007, + "time_per_iteration": 2.5697944164276123 + }, + { + "auxiliary_loss_clip": 0.06477423, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06289505, + "balance_loss_mlp": 0.0126018, + "epoch": 0.3612205020291598, + "flos": 20892954055680.0, + "grad_norm": 2.2451481952848953, + "language_loss": 0.73192191, + "learning_rate": 2.955381221179198e-06, + "loss": 0.80945194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.1541748, + "step": 6008, + "time_per_iteration": 2.5410661697387695 + }, + { + "auxiliary_loss_clip": 0.06468849, + "auxiliary_loss_mlp": 0.01276747, + "balance_loss_clip": 0.06288531, + "balance_loss_mlp": 0.01262036, + "epoch": 0.36128062528182775, + "flos": 15747393876480.0, + "grad_norm": 2.0636796050179194, + "language_loss": 0.83194089, + "learning_rate": 2.955039050023368e-06, + "loss": 0.90939683, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1472168, + "step": 6009, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.06467466, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.012553, + "epoch": 0.3613407485344957, + "flos": 16770239066880.0, + "grad_norm": 1.996577445690206, + "language_loss": 0.7613554, + "learning_rate": 2.954696842652362e-06, + "loss": 0.83873594, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15258789, + "step": 6010, + "time_per_iteration": 2.501328468322754 + }, + { + "auxiliary_loss_clip": 0.064712, + "auxiliary_loss_mlp": 0.0127317, + "balance_loss_clip": 0.06292284, + "balance_loss_mlp": 0.01258734, + "epoch": 0.3614008717871637, + "flos": 20376625996800.0, + "grad_norm": 1.7565456089129825, + "language_loss": 0.8353886, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.91283226, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14440918, + "step": 6011, + "time_per_iteration": 2.5080785751342773 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01273027, + "balance_loss_clip": 0.06288376, + "balance_loss_mlp": 0.0125784, + "epoch": 0.36146099503983165, + "flos": 22782071882880.0, + "grad_norm": 2.5852128775447536, + "language_loss": 0.62982023, + "learning_rate": 2.954012319316727e-06, + "loss": 0.70728415, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15185547, + "step": 6012, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.06468817, + "auxiliary_loss_mlp": 0.01279391, + "balance_loss_clip": 0.06292222, + "balance_loss_mlp": 0.01264728, + "epoch": 0.3615211182924996, + "flos": 23002277212800.0, + "grad_norm": 2.060645495819417, + "language_loss": 0.83850408, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.91598618, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14648438, + "step": 6013, + "time_per_iteration": 2.511187791824341 + }, + { + "auxiliary_loss_clip": 0.06469796, + "auxiliary_loss_mlp": 0.01276155, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01259501, + "epoch": 0.3615812415451676, + "flos": 16652631461760.0, + "grad_norm": 1.9072870373759168, + "language_loss": 0.92107058, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.99853015, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.16638184, + "step": 6014, + "time_per_iteration": 2.498011350631714 + }, + { + "auxiliary_loss_clip": 0.06466013, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01260097, + "epoch": 0.36164136479783554, + "flos": 21325733994240.0, + "grad_norm": 8.045361949377702, + "language_loss": 0.73973721, + "learning_rate": 2.95298526302391e-06, + "loss": 0.81715214, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15393066, + "step": 6015, + "time_per_iteration": 2.5139665603637695 + }, + { + "auxiliary_loss_clip": 0.0646963, + "auxiliary_loss_mlp": 0.01277804, + "balance_loss_clip": 0.06291166, + "balance_loss_mlp": 0.01262151, + "epoch": 0.3617014880505035, + "flos": 24176286368640.0, + "grad_norm": 1.9455925595590893, + "language_loss": 0.65181047, + "learning_rate": 2.9526428386344e-06, + "loss": 0.72928476, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15637207, + "step": 6016, + "time_per_iteration": 2.5485315322875977 + }, + { + "auxiliary_loss_clip": 0.06469464, + "auxiliary_loss_mlp": 0.01276058, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.01259261, + "epoch": 0.3617616113031715, + "flos": 39023278997760.0, + "grad_norm": 1.6846943976812254, + "language_loss": 0.72102833, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.79848349, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16784668, + "step": 6017, + "time_per_iteration": 2.6685996055603027 + }, + { + "auxiliary_loss_clip": 0.06470844, + "auxiliary_loss_mlp": 0.01272479, + "balance_loss_clip": 0.06287402, + "balance_loss_mlp": 0.01256886, + "epoch": 0.3618217345558395, + "flos": 12135807993600.0, + "grad_norm": 2.3155685522099962, + "language_loss": 0.74387789, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15600586, + "step": 6018, + "time_per_iteration": 3.93249249458313 + }, + { + "auxiliary_loss_clip": 0.06458628, + "auxiliary_loss_mlp": 0.01273986, + "balance_loss_clip": 0.06287278, + "balance_loss_mlp": 0.0125856, + "epoch": 0.36188185780850746, + "flos": 24941722216320.0, + "grad_norm": 2.406612181934337, + "language_loss": 0.69554305, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.77286923, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1541748, + "step": 6019, + "time_per_iteration": 4.000872373580933 + }, + { + "auxiliary_loss_clip": 0.06472806, + "auxiliary_loss_mlp": 0.01271681, + "balance_loss_clip": 0.0628852, + "balance_loss_mlp": 0.01255815, + "epoch": 0.3619419810611754, + "flos": 20965014167040.0, + "grad_norm": 2.953778610066193, + "language_loss": 0.76874363, + "learning_rate": 2.95127277996311e-06, + "loss": 0.84618843, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15869141, + "step": 6020, + "time_per_iteration": 2.5465614795684814 + }, + { + "auxiliary_loss_clip": 0.06471147, + "auxiliary_loss_mlp": 0.01273965, + "balance_loss_clip": 0.06288891, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3620021043138434, + "flos": 22535521643520.0, + "grad_norm": 2.2311166939070097, + "language_loss": 0.74090236, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81835353, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16687012, + "step": 6021, + "time_per_iteration": 2.57817006111145 + }, + { + "auxiliary_loss_clip": 0.06467178, + "auxiliary_loss_mlp": 0.01270658, + "balance_loss_clip": 0.0628859, + "balance_loss_mlp": 0.01255685, + "epoch": 0.36206222756651135, + "flos": 15602183550720.0, + "grad_norm": 5.238961551513005, + "language_loss": 0.81591839, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.89329672, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1496582, + "step": 6022, + "time_per_iteration": 2.5385305881500244 + }, + { + "auxiliary_loss_clip": 0.06457289, + "auxiliary_loss_mlp": 0.0127544, + "balance_loss_clip": 0.06285636, + "balance_loss_mlp": 0.01260349, + "epoch": 0.3621223508191793, + "flos": 23594019546240.0, + "grad_norm": 2.318322058767841, + "language_loss": 0.81707698, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89440429, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15075684, + "step": 6023, + "time_per_iteration": 2.604048013687134 + }, + { + "auxiliary_loss_clip": 0.0647051, + "auxiliary_loss_mlp": 0.01276448, + "balance_loss_clip": 0.06288643, + "balance_loss_mlp": 0.01259795, + "epoch": 0.3621824740718473, + "flos": 22316490270720.0, + "grad_norm": 2.4056275848880038, + "language_loss": 0.80008531, + "learning_rate": 2.9499021441341e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16650391, + "step": 6024, + "time_per_iteration": 3.9998557567596436 + }, + { + "auxiliary_loss_clip": 0.06462081, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06288754, + "balance_loss_mlp": 0.01258599, + "epoch": 0.36224259732451525, + "flos": 16769232817920.0, + "grad_norm": 2.2201652107227354, + "language_loss": 0.75149572, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.82885349, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15112305, + "step": 6025, + "time_per_iteration": 2.5139317512512207 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01274175, + "balance_loss_clip": 0.06290425, + "balance_loss_mlp": 0.01260198, + "epoch": 0.3623027205771832, + "flos": 23156585706240.0, + "grad_norm": 1.704945166995659, + "language_loss": 0.72471905, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.80212557, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13989258, + "step": 6026, + "time_per_iteration": 3.974848985671997 + }, + { + "auxiliary_loss_clip": 0.06476888, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01260233, + "epoch": 0.3623628438298512, + "flos": 28556829970560.0, + "grad_norm": 1.945563554904942, + "language_loss": 0.79502189, + "learning_rate": 2.948873789002833e-06, + "loss": 0.87255979, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16662598, + "step": 6027, + "time_per_iteration": 2.614713430404663 + }, + { + "auxiliary_loss_clip": 0.06469107, + "auxiliary_loss_mlp": 0.01272818, + "balance_loss_clip": 0.06288799, + "balance_loss_mlp": 0.01256427, + "epoch": 0.36242296708251914, + "flos": 25492193614080.0, + "grad_norm": 4.95803648299326, + "language_loss": 0.68042505, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.75784421, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16381836, + "step": 6028, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.06467344, + "auxiliary_loss_mlp": 0.01275782, + "balance_loss_clip": 0.0629041, + "balance_loss_mlp": 0.01260273, + "epoch": 0.3624830903351871, + "flos": 16296062411520.0, + "grad_norm": 2.2968183263714983, + "language_loss": 0.85463655, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.93206775, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1550293, + "step": 6029, + "time_per_iteration": 2.519960403442383 + }, + { + "auxiliary_loss_clip": 0.06462874, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.0628645, + "balance_loss_mlp": 0.01255107, + "epoch": 0.36254321358785513, + "flos": 18302200865280.0, + "grad_norm": 1.7460468862336926, + "language_loss": 0.72888201, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.80621189, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15008545, + "step": 6030, + "time_per_iteration": 2.480053663253784 + }, + { + "auxiliary_loss_clip": 0.06476077, + "auxiliary_loss_mlp": 0.01274605, + "balance_loss_clip": 0.06291036, + "balance_loss_mlp": 0.01257558, + "epoch": 0.3626033368405231, + "flos": 14870387917440.0, + "grad_norm": 3.30241855147188, + "language_loss": 0.75249928, + "learning_rate": 2.94750214514905e-06, + "loss": 0.83000606, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.17041016, + "step": 6031, + "time_per_iteration": 2.4887540340423584 + }, + { + "auxiliary_loss_clip": 0.06465365, + "auxiliary_loss_mlp": 0.01279599, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01264245, + "epoch": 0.36266346009319106, + "flos": 22312632983040.0, + "grad_norm": 2.377019393957944, + "language_loss": 0.73490477, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81235439, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15344238, + "step": 6032, + "time_per_iteration": 2.5194106101989746 + }, + { + "auxiliary_loss_clip": 0.06471337, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06290144, + "balance_loss_mlp": 0.01258776, + "epoch": 0.362723583345859, + "flos": 18228044401920.0, + "grad_norm": 1.8908046818451942, + "language_loss": 0.78089464, + "learning_rate": 2.946816107593884e-06, + "loss": 0.85834849, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15270996, + "step": 6033, + "time_per_iteration": 2.6062612533569336 + }, + { + "auxiliary_loss_clip": 0.06434236, + "auxiliary_loss_mlp": 0.01267532, + "balance_loss_clip": 0.06350702, + "balance_loss_mlp": 0.01264055, + "epoch": 0.362783706598527, + "flos": 68519307456000.0, + "grad_norm": 0.7613876705351186, + "language_loss": 0.64809752, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.72511524, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.03485107, + "step": 6034, + "time_per_iteration": 3.216454267501831 + }, + { + "auxiliary_loss_clip": 0.06466131, + "auxiliary_loss_mlp": 0.01276184, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01260222, + "epoch": 0.36284382985119495, + "flos": 26583535117440.0, + "grad_norm": 2.053623051898619, + "language_loss": 0.89456552, + "learning_rate": 2.946129926425273e-06, + "loss": 0.97198874, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.15966797, + "step": 6035, + "time_per_iteration": 2.5606629848480225 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01272395, + "balance_loss_clip": 0.06295764, + "balance_loss_mlp": 0.0125592, + "epoch": 0.3629039531038629, + "flos": 20162919358080.0, + "grad_norm": 1.7740824971358589, + "language_loss": 0.73855877, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.81607592, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.16455078, + "step": 6036, + "time_per_iteration": 2.5144500732421875 + }, + { + "auxiliary_loss_clip": 0.06482191, + "auxiliary_loss_mlp": 0.01272832, + "balance_loss_clip": 0.06296846, + "balance_loss_mlp": 0.01256823, + "epoch": 0.3629640763565309, + "flos": 18631838027520.0, + "grad_norm": 1.8050884717083873, + "language_loss": 0.76438695, + "learning_rate": 2.945443601747297e-06, + "loss": 0.84193718, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16015625, + "step": 6037, + "time_per_iteration": 2.5286643505096436 + }, + { + "auxiliary_loss_clip": 0.06467965, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06292737, + "balance_loss_mlp": 0.01262546, + "epoch": 0.36302419960919885, + "flos": 19577256445440.0, + "grad_norm": 1.633141884703147, + "language_loss": 0.78871524, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86617458, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1541748, + "step": 6038, + "time_per_iteration": 2.5062947273254395 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 0.06318134, + "balance_loss_mlp": 0.01257723, + "epoch": 0.3630843228618668, + "flos": 63817805589120.0, + "grad_norm": 0.8140528620617334, + "language_loss": 0.63225597, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.70887518, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.03512573, + "step": 6039, + "time_per_iteration": 3.269761323928833 + }, + { + "auxiliary_loss_clip": 0.06467007, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06289599, + "balance_loss_mlp": 0.01253932, + "epoch": 0.3631444461145348, + "flos": 21841600855680.0, + "grad_norm": 2.592040544468795, + "language_loss": 0.71409321, + "learning_rate": 2.944413845878002e-06, + "loss": 0.79146034, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15783691, + "step": 6040, + "time_per_iteration": 2.5549709796905518 + }, + { + "auxiliary_loss_clip": 0.06477243, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 0.06293249, + "balance_loss_mlp": 0.01260277, + "epoch": 0.36320456936720275, + "flos": 21727850538240.0, + "grad_norm": 1.6745525965006305, + "language_loss": 0.81387192, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89140832, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.16113281, + "step": 6041, + "time_per_iteration": 2.529555320739746 + }, + { + "auxiliary_loss_clip": 0.06473525, + "auxiliary_loss_mlp": 0.01278326, + "balance_loss_clip": 0.0629223, + "balance_loss_mlp": 0.01261291, + "epoch": 0.3632646926198707, + "flos": 17024713516800.0, + "grad_norm": 3.0330286867158547, + "language_loss": 0.8477391, + "learning_rate": 2.943727162882107e-06, + "loss": 0.92525762, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.17016602, + "step": 6042, + "time_per_iteration": 2.52242112159729 + }, + { + "auxiliary_loss_clip": 0.06469671, + "auxiliary_loss_mlp": 0.01277961, + "balance_loss_clip": 0.06290909, + "balance_loss_mlp": 0.01261892, + "epoch": 0.36332481587253873, + "flos": 23337868014720.0, + "grad_norm": 1.7311470578574424, + "language_loss": 0.78563523, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.86311156, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.16088867, + "step": 6043, + "time_per_iteration": 2.5507187843322754 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.0127573, + "balance_loss_clip": 0.06289753, + "balance_loss_mlp": 0.01258755, + "epoch": 0.3633849391252067, + "flos": 10748134126080.0, + "grad_norm": 2.0752100798218245, + "language_loss": 0.66141021, + "learning_rate": 2.943040336741298e-06, + "loss": 0.73882145, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16967773, + "step": 6044, + "time_per_iteration": 2.5431315898895264 + }, + { + "auxiliary_loss_clip": 0.06470387, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06293066, + "balance_loss_mlp": 0.01258794, + "epoch": 0.36344506237787466, + "flos": 25856351458560.0, + "grad_norm": 1.7019744870222642, + "language_loss": 0.81317604, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89061964, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15185547, + "step": 6045, + "time_per_iteration": 2.578608274459839 + }, + { + "auxiliary_loss_clip": 0.06471765, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 0.06291001, + "balance_loss_mlp": 0.01260977, + "epoch": 0.3635051856305426, + "flos": 30161900056320.0, + "grad_norm": 1.9031490691130954, + "language_loss": 0.64869618, + "learning_rate": 2.942353367559755e-06, + "loss": 0.72618413, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16064453, + "step": 6046, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.06469898, + "auxiliary_loss_mlp": 0.01279877, + "balance_loss_clip": 0.06291277, + "balance_loss_mlp": 0.01264082, + "epoch": 0.3635653088832106, + "flos": 22204626670080.0, + "grad_norm": 1.4883910134219482, + "language_loss": 0.77790976, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.85540754, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15783691, + "step": 6047, + "time_per_iteration": 2.59384822845459 + }, + { + "auxiliary_loss_clip": 0.06482202, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 0.0629375, + "balance_loss_mlp": 0.01259794, + "epoch": 0.36362543213587856, + "flos": 24793409289600.0, + "grad_norm": 2.402065763679051, + "language_loss": 0.79315472, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87075114, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.1763916, + "step": 6048, + "time_per_iteration": 2.586355447769165 + }, + { + "auxiliary_loss_clip": 0.06388409, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06308184, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3636855553885465, + "flos": 62547320056320.0, + "grad_norm": 0.756250652706744, + "language_loss": 0.52505761, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.6017015, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.03759766, + "step": 6049, + "time_per_iteration": 3.1991608142852783 + }, + { + "auxiliary_loss_clip": 0.06471006, + "auxiliary_loss_mlp": 0.01281005, + "balance_loss_clip": 0.06289691, + "balance_loss_mlp": 0.01264518, + "epoch": 0.3637456786412145, + "flos": 24067441514880.0, + "grad_norm": 1.9518715754512581, + "language_loss": 0.8677333, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.94525343, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16467285, + "step": 6050, + "time_per_iteration": 2.619880437850952 + }, + { + "auxiliary_loss_clip": 0.06465575, + "auxiliary_loss_mlp": 0.01288294, + "balance_loss_clip": 0.06288004, + "balance_loss_mlp": 0.01271784, + "epoch": 0.36380580189388245, + "flos": 16697214633600.0, + "grad_norm": 2.0514222430242937, + "language_loss": 0.78671187, + "learning_rate": 2.940635319486546e-06, + "loss": 0.86425054, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.16491699, + "step": 6051, + "time_per_iteration": 2.5192694664001465 + }, + { + "auxiliary_loss_clip": 0.064697, + "auxiliary_loss_mlp": 0.0128748, + "balance_loss_clip": 0.06289212, + "balance_loss_mlp": 0.01271315, + "epoch": 0.3638659251465504, + "flos": 25120279267200.0, + "grad_norm": 2.1218426019343943, + "language_loss": 0.82423818, + "learning_rate": 2.940291602812822e-06, + "loss": 0.90180993, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16174316, + "step": 6052, + "time_per_iteration": 2.6190178394317627 + }, + { + "auxiliary_loss_clip": 0.06462704, + "auxiliary_loss_mlp": 0.01293914, + "balance_loss_clip": 0.06289209, + "balance_loss_mlp": 0.0127831, + "epoch": 0.3639260483992184, + "flos": 23009698298880.0, + "grad_norm": 1.6976848198598335, + "language_loss": 0.72702307, + "learning_rate": 2.939947850483145e-06, + "loss": 0.80458927, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.15588379, + "step": 6053, + "time_per_iteration": 2.5632545948028564 + }, + { + "auxiliary_loss_clip": 0.0637124, + "auxiliary_loss_mlp": 0.0126271, + "balance_loss_clip": 0.06291765, + "balance_loss_mlp": 0.01258046, + "epoch": 0.36398617165188635, + "flos": 70735043698560.0, + "grad_norm": 0.7367280535398725, + "language_loss": 0.61109686, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.68743634, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.04656982, + "step": 6054, + "time_per_iteration": 3.1670703887939453 + }, + { + "auxiliary_loss_clip": 0.06468257, + "auxiliary_loss_mlp": 0.01284514, + "balance_loss_clip": 0.06288631, + "balance_loss_mlp": 0.01267062, + "epoch": 0.3640462949045543, + "flos": 22241788755840.0, + "grad_norm": 2.4941401517388795, + "language_loss": 0.76399368, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.84152138, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.17456055, + "step": 6055, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.06463572, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01264023, + "epoch": 0.3641064181572223, + "flos": 21549964320000.0, + "grad_norm": 1.5003458585655993, + "language_loss": 0.75247842, + "learning_rate": 2.938916379688765e-06, + "loss": 0.82992232, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16784668, + "step": 6056, + "time_per_iteration": 2.548563241958618 + }, + { + "auxiliary_loss_clip": 0.06463505, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 0.06286436, + "balance_loss_mlp": 0.01271805, + "epoch": 0.3641665414098903, + "flos": 22279873236480.0, + "grad_norm": 1.8427248639079936, + "language_loss": 0.80231911, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.87983549, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16320801, + "step": 6057, + "time_per_iteration": 2.590890645980835 + }, + { + "auxiliary_loss_clip": 0.06463237, + "auxiliary_loss_mlp": 0.01288366, + "balance_loss_clip": 0.06286855, + "balance_loss_mlp": 0.01271259, + "epoch": 0.36422666466255826, + "flos": 28337211619200.0, + "grad_norm": 2.0267495677395106, + "language_loss": 0.80895132, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.88646734, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.17114258, + "step": 6058, + "time_per_iteration": 3.9912350177764893 + }, + { + "auxiliary_loss_clip": 0.06462751, + "auxiliary_loss_mlp": 0.01284352, + "balance_loss_clip": 0.06282878, + "balance_loss_mlp": 0.01267794, + "epoch": 0.36428678791522623, + "flos": 24177376471680.0, + "grad_norm": 1.829086801108262, + "language_loss": 0.84467566, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.9221468, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16552734, + "step": 6059, + "time_per_iteration": 3.9484288692474365 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01290231, + "balance_loss_clip": 0.06284287, + "balance_loss_mlp": 0.01274006, + "epoch": 0.3643469111678942, + "flos": 22535018519040.0, + "grad_norm": 1.8662633122766634, + "language_loss": 0.88296366, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96050501, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16223145, + "step": 6060, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.06469811, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.06287585, + "balance_loss_mlp": 0.01260611, + "epoch": 0.36440703442056216, + "flos": 19432549244160.0, + "grad_norm": 2.050716636944588, + "language_loss": 0.66968513, + "learning_rate": 2.937196549795971e-06, + "loss": 0.74716496, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.17578125, + "step": 6061, + "time_per_iteration": 2.4934303760528564 + }, + { + "auxiliary_loss_clip": 0.06472699, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06290831, + "balance_loss_mlp": 0.01259283, + "epoch": 0.3644671576732301, + "flos": 18046300896000.0, + "grad_norm": 2.6099029342135838, + "language_loss": 0.76223081, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.83971971, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16918945, + "step": 6062, + "time_per_iteration": 2.5342442989349365 + }, + { + "auxiliary_loss_clip": 0.06462175, + "auxiliary_loss_mlp": 0.01277866, + "balance_loss_clip": 0.06284274, + "balance_loss_mlp": 0.01261844, + "epoch": 0.3645272809258981, + "flos": 21549125779200.0, + "grad_norm": 1.679264330509425, + "language_loss": 0.7250427, + "learning_rate": 2.936508368977432e-06, + "loss": 0.80244315, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16027832, + "step": 6063, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.06463223, + "auxiliary_loss_mlp": 0.01278838, + "balance_loss_clip": 0.0628884, + "balance_loss_mlp": 0.0126256, + "epoch": 0.36458740417856605, + "flos": 22753379059200.0, + "grad_norm": 1.9927269992491163, + "language_loss": 0.67982519, + "learning_rate": 2.936164225292901e-06, + "loss": 0.75724578, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.16265869, + "step": 6064, + "time_per_iteration": 4.001475095748901 + }, + { + "auxiliary_loss_clip": 0.06469691, + "auxiliary_loss_mlp": 0.01281677, + "balance_loss_clip": 0.06288914, + "balance_loss_mlp": 0.01265131, + "epoch": 0.364647527431234, + "flos": 26147862213120.0, + "grad_norm": 2.2981357468080725, + "language_loss": 0.75006247, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.82757616, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16540527, + "step": 6065, + "time_per_iteration": 2.557175397872925 + }, + { + "auxiliary_loss_clip": 0.06475934, + "auxiliary_loss_mlp": 0.01274844, + "balance_loss_clip": 0.06292161, + "balance_loss_mlp": 0.01257487, + "epoch": 0.364707650683902, + "flos": 31037941693440.0, + "grad_norm": 1.8804228270875918, + "language_loss": 0.75913531, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.8366431, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.17370605, + "step": 6066, + "time_per_iteration": 4.028696537017822 + }, + { + "auxiliary_loss_clip": 0.06465262, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06290717, + "balance_loss_mlp": 0.01260124, + "epoch": 0.36476777393656995, + "flos": 19578933527040.0, + "grad_norm": 2.1324188585544293, + "language_loss": 0.77645338, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.85385728, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15014648, + "step": 6067, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.06463823, + "auxiliary_loss_mlp": 0.01273764, + "balance_loss_clip": 0.06289702, + "balance_loss_mlp": 0.01258684, + "epoch": 0.3648278971892379, + "flos": 17754622433280.0, + "grad_norm": 1.930394247385299, + "language_loss": 0.71678597, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7941618, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15063477, + "step": 6068, + "time_per_iteration": 2.4845492839813232 + }, + { + "auxiliary_loss_clip": 0.06473656, + "auxiliary_loss_mlp": 0.0127485, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.01258005, + "epoch": 0.3648880204419059, + "flos": 17936952917760.0, + "grad_norm": 1.8532098574136342, + "language_loss": 0.73989958, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.8173846, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16845703, + "step": 6069, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.06469753, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 0.06287999, + "balance_loss_mlp": 0.01261684, + "epoch": 0.3649481436945739, + "flos": 22644911548800.0, + "grad_norm": 1.9157179359535086, + "language_loss": 0.66736126, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.74483597, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.16027832, + "step": 6070, + "time_per_iteration": 2.516735076904297 + }, + { + "auxiliary_loss_clip": 0.06467332, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06291667, + "balance_loss_mlp": 0.01259169, + "epoch": 0.36500826694724187, + "flos": 21586036302720.0, + "grad_norm": 1.8858284323375742, + "language_loss": 0.7453323, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.82274926, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.1519165, + "step": 6071, + "time_per_iteration": 2.566274642944336 + }, + { + "auxiliary_loss_clip": 0.06468312, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.0629068, + "balance_loss_mlp": 0.0125703, + "epoch": 0.36506839019990983, + "flos": 13777746675840.0, + "grad_norm": 1.7184690359068113, + "language_loss": 0.88681865, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.96422982, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15771484, + "step": 6072, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.06471045, + "auxiliary_loss_mlp": 0.01276068, + "balance_loss_clip": 0.06292107, + "balance_loss_mlp": 0.01260285, + "epoch": 0.3651285134525778, + "flos": 17280739267200.0, + "grad_norm": 2.591250971390436, + "language_loss": 0.72601849, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.80348963, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15771484, + "step": 6073, + "time_per_iteration": 2.5448079109191895 + }, + { + "auxiliary_loss_clip": 0.06476631, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06296042, + "balance_loss_mlp": 0.0125422, + "epoch": 0.36518863670524576, + "flos": 21914415653760.0, + "grad_norm": 2.188049192517554, + "language_loss": 0.66876209, + "learning_rate": 2.932720838132236e-06, + "loss": 0.74623442, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16394043, + "step": 6074, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.06466351, + "auxiliary_loss_mlp": 0.01270864, + "balance_loss_clip": 0.06289779, + "balance_loss_mlp": 0.01255319, + "epoch": 0.3652487599579137, + "flos": 27128933343360.0, + "grad_norm": 1.455377552522792, + "language_loss": 0.73552799, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.81290013, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15551758, + "step": 6075, + "time_per_iteration": 2.5611414909362793 + }, + { + "auxiliary_loss_clip": 0.06476435, + "auxiliary_loss_mlp": 0.01270879, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01255107, + "epoch": 0.3653088832105817, + "flos": 19761683281920.0, + "grad_norm": 3.551310730384351, + "language_loss": 0.89872956, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97620273, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 6076, + "time_per_iteration": 2.491070508956909 + }, + { + "auxiliary_loss_clip": 0.06471214, + "auxiliary_loss_mlp": 0.01269524, + "balance_loss_clip": 0.06294619, + "balance_loss_mlp": 0.01253782, + "epoch": 0.36536900646324966, + "flos": 13119981724800.0, + "grad_norm": 1.9522812947590364, + "language_loss": 0.69894624, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7763536, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15740967, + "step": 6077, + "time_per_iteration": 2.5298445224761963 + }, + { + "auxiliary_loss_clip": 0.06367216, + "auxiliary_loss_mlp": 0.01255974, + "balance_loss_clip": 0.06288684, + "balance_loss_mlp": 0.0125196, + "epoch": 0.3654291297159176, + "flos": 71122848393600.0, + "grad_norm": 0.715882721223993, + "language_loss": 0.61670828, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.69294018, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.04013062, + "step": 6078, + "time_per_iteration": 3.245680093765259 + }, + { + "auxiliary_loss_clip": 0.06468864, + "auxiliary_loss_mlp": 0.01269715, + "balance_loss_clip": 0.0628942, + "balance_loss_mlp": 0.01254217, + "epoch": 0.3654892529685856, + "flos": 23623299348480.0, + "grad_norm": 2.6954686860737427, + "language_loss": 0.78565228, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86303806, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1550293, + "step": 6079, + "time_per_iteration": 2.5243916511535645 + }, + { + "auxiliary_loss_clip": 0.06474455, + "auxiliary_loss_mlp": 0.0127227, + "balance_loss_clip": 0.06293908, + "balance_loss_mlp": 0.01255557, + "epoch": 0.36554937622125355, + "flos": 43480788174720.0, + "grad_norm": 2.827080544182906, + "language_loss": 0.62854588, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.70601308, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16711426, + "step": 6080, + "time_per_iteration": 2.755979299545288 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06292675, + "balance_loss_mlp": 0.012568, + "epoch": 0.3656094994739215, + "flos": 23301334834560.0, + "grad_norm": 2.0380719718304046, + "language_loss": 0.68215913, + "learning_rate": 2.930308361895352e-06, + "loss": 0.75963295, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16674805, + "step": 6081, + "time_per_iteration": 2.5318713188171387 + }, + { + "auxiliary_loss_clip": 0.06476995, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 0.06289314, + "balance_loss_mlp": 0.01267021, + "epoch": 0.3656696227265895, + "flos": 24578947964160.0, + "grad_norm": 1.6214502004720641, + "language_loss": 0.75242162, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.83002377, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 6082, + "time_per_iteration": 2.614473819732666 + }, + { + "auxiliary_loss_clip": 0.06474194, + "auxiliary_loss_mlp": 0.0127049, + "balance_loss_clip": 0.06295186, + "balance_loss_mlp": 0.01255851, + "epoch": 0.3657297459792575, + "flos": 27935849761920.0, + "grad_norm": 4.519769037138984, + "language_loss": 0.83192384, + "learning_rate": 2.929618765277987e-06, + "loss": 0.90937066, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.14660645, + "step": 6083, + "time_per_iteration": 2.569382429122925 + }, + { + "auxiliary_loss_clip": 0.06373743, + "auxiliary_loss_mlp": 0.01258609, + "balance_loss_clip": 0.06293802, + "balance_loss_mlp": 0.01254855, + "epoch": 0.36578986923192547, + "flos": 67410566231040.0, + "grad_norm": 0.7897440828264927, + "language_loss": 0.59315842, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.66948193, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03747559, + "step": 6084, + "time_per_iteration": 3.2453150749206543 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06292025, + "balance_loss_mlp": 0.01258801, + "epoch": 0.36584999248459343, + "flos": 20233302387840.0, + "grad_norm": 1.9605927592145687, + "language_loss": 0.73469806, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81214333, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15734863, + "step": 6085, + "time_per_iteration": 2.5149080753326416 + }, + { + "auxiliary_loss_clip": 0.06475443, + "auxiliary_loss_mlp": 0.01272781, + "balance_loss_clip": 0.06296027, + "balance_loss_mlp": 0.01256974, + "epoch": 0.3659101157372614, + "flos": 19068475253760.0, + "grad_norm": 1.7755618246241633, + "language_loss": 0.78367889, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86116111, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15802002, + "step": 6086, + "time_per_iteration": 2.6959855556488037 + }, + { + "auxiliary_loss_clip": 0.06460601, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.0628686, + "balance_loss_mlp": 0.01262449, + "epoch": 0.36597023898992936, + "flos": 30818658758400.0, + "grad_norm": 2.7333963743808387, + "language_loss": 0.77419388, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.85157609, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15185547, + "step": 6087, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.06470397, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06288096, + "balance_loss_mlp": 0.0126543, + "epoch": 0.36603036224259733, + "flos": 20528041524480.0, + "grad_norm": 2.0856395013908005, + "language_loss": 0.70779794, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.78531569, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15948486, + "step": 6088, + "time_per_iteration": 2.5904111862182617 + }, + { + "auxiliary_loss_clip": 0.064822, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.06290494, + "balance_loss_mlp": 0.01258835, + "epoch": 0.3660904854952653, + "flos": 38339043356160.0, + "grad_norm": 1.5018444157956148, + "language_loss": 0.8073988, + "learning_rate": 2.92754912981472e-06, + "loss": 0.88499188, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.18273926, + "step": 6089, + "time_per_iteration": 2.695387125015259 + }, + { + "auxiliary_loss_clip": 0.06466638, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06289521, + "balance_loss_mlp": 0.01254065, + "epoch": 0.36615060874793326, + "flos": 21842062053120.0, + "grad_norm": 1.783943984741075, + "language_loss": 0.71745276, + "learning_rate": 2.927204067389884e-06, + "loss": 0.79480195, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14208984, + "step": 6090, + "time_per_iteration": 2.5730583667755127 + }, + { + "auxiliary_loss_clip": 0.06467035, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06292006, + "balance_loss_mlp": 0.01254585, + "epoch": 0.3662107320006012, + "flos": 16587153895680.0, + "grad_norm": 1.8168526275922985, + "language_loss": 0.74269617, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82006675, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.1541748, + "step": 6091, + "time_per_iteration": 2.5094668865203857 + }, + { + "auxiliary_loss_clip": 0.06470925, + "auxiliary_loss_mlp": 0.01271934, + "balance_loss_clip": 0.062924, + "balance_loss_mlp": 0.01256699, + "epoch": 0.3662708552532692, + "flos": 20964469115520.0, + "grad_norm": 2.9410218249320796, + "language_loss": 0.72888803, + "learning_rate": 2.926513837074284e-06, + "loss": 0.80631661, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15234375, + "step": 6092, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.06472248, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01260833, + "epoch": 0.36633097850593715, + "flos": 21908252378880.0, + "grad_norm": 2.382181592286333, + "language_loss": 0.78829455, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.86578685, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.16174316, + "step": 6093, + "time_per_iteration": 2.519925355911255 + }, + { + "auxiliary_loss_clip": 0.06470528, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06288898, + "balance_loss_mlp": 0.0125743, + "epoch": 0.3663911017586051, + "flos": 32862462422400.0, + "grad_norm": 1.6789792555665461, + "language_loss": 0.74561131, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82304573, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15478516, + "step": 6094, + "time_per_iteration": 2.6374077796936035 + }, + { + "auxiliary_loss_clip": 0.06470601, + "auxiliary_loss_mlp": 0.01277645, + "balance_loss_clip": 0.06289363, + "balance_loss_mlp": 0.01261421, + "epoch": 0.3664512250112731, + "flos": 27279132986880.0, + "grad_norm": 1.6273421100585188, + "language_loss": 0.7975142, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87499666, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16223145, + "step": 6095, + "time_per_iteration": 2.565009117126465 + }, + { + "auxiliary_loss_clip": 0.06480707, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06295107, + "balance_loss_mlp": 0.01258552, + "epoch": 0.3665113482639411, + "flos": 17790065510400.0, + "grad_norm": 2.4875649346087725, + "language_loss": 0.73963505, + "learning_rate": 2.925132954945834e-06, + "loss": 0.81719339, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16577148, + "step": 6096, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06474067, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 0.06288943, + "balance_loss_mlp": 0.01255901, + "epoch": 0.36657147151660907, + "flos": 27861944860800.0, + "grad_norm": 1.9533584433338151, + "language_loss": 0.67592847, + "learning_rate": 2.924787646678155e-06, + "loss": 0.75338453, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15649414, + "step": 6097, + "time_per_iteration": 4.085919618606567 + }, + { + "auxiliary_loss_clip": 0.06474558, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06292384, + "balance_loss_mlp": 0.01257204, + "epoch": 0.36663159476927704, + "flos": 25381000846080.0, + "grad_norm": 1.4284875999183062, + "language_loss": 0.77924675, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85672289, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15856934, + "step": 6098, + "time_per_iteration": 4.075935363769531 + }, + { + "auxiliary_loss_clip": 0.06469452, + "auxiliary_loss_mlp": 0.01270135, + "balance_loss_clip": 0.06291129, + "balance_loss_mlp": 0.01254751, + "epoch": 0.366691718021945, + "flos": 21362979934080.0, + "grad_norm": 2.6338542151665862, + "language_loss": 0.73907244, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15386963, + "step": 6099, + "time_per_iteration": 2.5343947410583496 + }, + { + "auxiliary_loss_clip": 0.06462912, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06286579, + "balance_loss_mlp": 0.01256695, + "epoch": 0.36675184127461297, + "flos": 16806017560320.0, + "grad_norm": 1.7024924966611934, + "language_loss": 0.84795189, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.92529464, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14660645, + "step": 6100, + "time_per_iteration": 2.5503897666931152 + }, + { + "auxiliary_loss_clip": 0.06478457, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06293124, + "balance_loss_mlp": 0.0125216, + "epoch": 0.36681196452728093, + "flos": 21912696645120.0, + "grad_norm": 2.268106387872694, + "language_loss": 0.712331, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.78979969, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.16235352, + "step": 6101, + "time_per_iteration": 2.5698294639587402 + }, + { + "auxiliary_loss_clip": 0.06474541, + "auxiliary_loss_mlp": 0.01273553, + "balance_loss_clip": 0.0629383, + "balance_loss_mlp": 0.01257137, + "epoch": 0.3668720877799489, + "flos": 17718215034240.0, + "grad_norm": 2.179497141372214, + "language_loss": 0.76701671, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.84449768, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16418457, + "step": 6102, + "time_per_iteration": 2.653047561645508 + }, + { + "auxiliary_loss_clip": 0.06477299, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.06290299, + "balance_loss_mlp": 0.01262099, + "epoch": 0.36693221103261686, + "flos": 47055882804480.0, + "grad_norm": 1.641444039565929, + "language_loss": 0.70188046, + "learning_rate": 2.922715061101625e-06, + "loss": 0.77944791, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17333984, + "step": 6103, + "time_per_iteration": 2.7502424716949463 + }, + { + "auxiliary_loss_clip": 0.06472746, + "auxiliary_loss_mlp": 0.01272056, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.01255581, + "epoch": 0.3669923342852848, + "flos": 15966383322240.0, + "grad_norm": 1.6662921664183201, + "language_loss": 0.71920598, + "learning_rate": 2.922369507632716e-06, + "loss": 0.79665399, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.16467285, + "step": 6104, + "time_per_iteration": 3.993805408477783 + }, + { + "auxiliary_loss_clip": 0.0647142, + "auxiliary_loss_mlp": 0.01272456, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01256494, + "epoch": 0.3670524575379528, + "flos": 19980630800640.0, + "grad_norm": 1.7978052174853272, + "language_loss": 0.81448174, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.89192045, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15966797, + "step": 6105, + "time_per_iteration": 3.907820463180542 + }, + { + "auxiliary_loss_clip": 0.06477002, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06288886, + "balance_loss_mlp": 0.01254896, + "epoch": 0.36711258079062076, + "flos": 25710092956800.0, + "grad_norm": 1.7139492182529468, + "language_loss": 0.81421959, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89171767, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17919922, + "step": 6106, + "time_per_iteration": 2.5623860359191895 + }, + { + "auxiliary_loss_clip": 0.06422871, + "auxiliary_loss_mlp": 0.01259281, + "balance_loss_clip": 0.06342293, + "balance_loss_mlp": 0.01254903, + "epoch": 0.3671727040432887, + "flos": 60793014648960.0, + "grad_norm": 0.6928078159632836, + "language_loss": 0.59215379, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.66897523, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04385376, + "step": 6107, + "time_per_iteration": 3.2451207637786865 + }, + { + "auxiliary_loss_clip": 0.06468046, + "auxiliary_loss_mlp": 0.01273048, + "balance_loss_clip": 0.06291793, + "balance_loss_mlp": 0.01257396, + "epoch": 0.3672328272959567, + "flos": 18667281104640.0, + "grad_norm": 1.5826982165866754, + "language_loss": 0.74750638, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82491726, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15631104, + "step": 6108, + "time_per_iteration": 2.5317509174346924 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06289458, + "balance_loss_mlp": 0.01260482, + "epoch": 0.3672929505486247, + "flos": 15054395483520.0, + "grad_norm": 2.0251921146130547, + "language_loss": 0.74524188, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.82272649, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.15490723, + "step": 6109, + "time_per_iteration": 2.530214309692383 + }, + { + "auxiliary_loss_clip": 0.06464404, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 0.06286883, + "balance_loss_mlp": 0.0125503, + "epoch": 0.3673530738012927, + "flos": 20594693047680.0, + "grad_norm": 1.6431777634434088, + "language_loss": 0.53560948, + "learning_rate": 2.920295452774744e-06, + "loss": 0.61295497, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15112305, + "step": 6110, + "time_per_iteration": 2.5247035026550293 + }, + { + "auxiliary_loss_clip": 0.06459565, + "auxiliary_loss_mlp": 0.01275062, + "balance_loss_clip": 0.06284792, + "balance_loss_mlp": 0.01258957, + "epoch": 0.36741319705396064, + "flos": 21696348602880.0, + "grad_norm": 1.814369900920369, + "language_loss": 0.80767608, + "learning_rate": 2.919949654746672e-06, + "loss": 0.8850224, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.16088867, + "step": 6111, + "time_per_iteration": 2.6213719844818115 + }, + { + "auxiliary_loss_clip": 0.06459287, + "auxiliary_loss_mlp": 0.01273038, + "balance_loss_clip": 0.06284556, + "balance_loss_mlp": 0.01256861, + "epoch": 0.3674733203066286, + "flos": 29870011958400.0, + "grad_norm": 1.7131296557309772, + "language_loss": 0.72860467, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80592787, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.16174316, + "step": 6112, + "time_per_iteration": 2.656101703643799 + }, + { + "auxiliary_loss_clip": 0.06459092, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06283998, + "balance_loss_mlp": 0.01257866, + "epoch": 0.36753344355929657, + "flos": 18262439303040.0, + "grad_norm": 1.5099687925303509, + "language_loss": 0.85667342, + "learning_rate": 2.919257954049892e-06, + "loss": 0.93399429, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15124512, + "step": 6113, + "time_per_iteration": 2.5230536460876465 + }, + { + "auxiliary_loss_clip": 0.06460717, + "auxiliary_loss_mlp": 0.01276985, + "balance_loss_clip": 0.06281444, + "balance_loss_mlp": 0.01260439, + "epoch": 0.36759356681196453, + "flos": 25308144120960.0, + "grad_norm": 1.9025835930032806, + "language_loss": 0.78706479, + "learning_rate": 2.918912051407413e-06, + "loss": 0.86444181, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.16540527, + "step": 6114, + "time_per_iteration": 2.6091229915618896 + }, + { + "auxiliary_loss_clip": 0.06466475, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.0628548, + "balance_loss_mlp": 0.01255725, + "epoch": 0.3676536900646325, + "flos": 21039338338560.0, + "grad_norm": 1.6305517572579116, + "language_loss": 0.67626929, + "learning_rate": 2.918566113919698e-06, + "loss": 0.75366318, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.17199707, + "step": 6115, + "time_per_iteration": 2.5226221084594727 + }, + { + "auxiliary_loss_clip": 0.06454025, + "auxiliary_loss_mlp": 0.01272139, + "balance_loss_clip": 0.06280309, + "balance_loss_mlp": 0.01257077, + "epoch": 0.36771381331730046, + "flos": 16293882205440.0, + "grad_norm": 2.2835896682412105, + "language_loss": 0.76996851, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.84723008, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15063477, + "step": 6116, + "time_per_iteration": 2.504951238632202 + }, + { + "auxiliary_loss_clip": 0.06459618, + "auxiliary_loss_mlp": 0.01274615, + "balance_loss_clip": 0.06282905, + "balance_loss_mlp": 0.01259153, + "epoch": 0.36777393656996843, + "flos": 22316574124800.0, + "grad_norm": 1.8264539284878285, + "language_loss": 0.62890095, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.70624328, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15454102, + "step": 6117, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.06458353, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06285255, + "balance_loss_mlp": 0.01254749, + "epoch": 0.3678340598226364, + "flos": 26841405657600.0, + "grad_norm": 1.7359331247938332, + "language_loss": 0.73532575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.81261057, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6118, + "time_per_iteration": 2.6261374950408936 + }, + { + "auxiliary_loss_clip": 0.06469986, + "auxiliary_loss_mlp": 0.01276003, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01259707, + "epoch": 0.36789418307530436, + "flos": 21768073297920.0, + "grad_norm": 1.5781425493049515, + "language_loss": 0.73047614, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.80793607, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 6119, + "time_per_iteration": 2.5320048332214355 + }, + { + "auxiliary_loss_clip": 0.06466002, + "auxiliary_loss_mlp": 0.0127303, + "balance_loss_clip": 0.06290065, + "balance_loss_mlp": 0.0125789, + "epoch": 0.3679543063279723, + "flos": 15929598579840.0, + "grad_norm": 2.0565678381587307, + "language_loss": 0.8018201, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.87921047, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15136719, + "step": 6120, + "time_per_iteration": 2.5085418224334717 + }, + { + "auxiliary_loss_clip": 0.06467941, + "auxiliary_loss_mlp": 0.01276389, + "balance_loss_clip": 0.0629365, + "balance_loss_mlp": 0.01260868, + "epoch": 0.3680144295806403, + "flos": 24281693205120.0, + "grad_norm": 2.0719591239633703, + "language_loss": 0.64803445, + "learning_rate": 2.916489757978126e-06, + "loss": 0.72547781, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15515137, + "step": 6121, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.06466727, + "auxiliary_loss_mlp": 0.01268749, + "balance_loss_clip": 0.06293779, + "balance_loss_mlp": 0.01254527, + "epoch": 0.36807455283330826, + "flos": 26111329032960.0, + "grad_norm": 1.9648479350594452, + "language_loss": 0.71416938, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.79152405, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14221191, + "step": 6122, + "time_per_iteration": 2.5836074352264404 + }, + { + "auxiliary_loss_clip": 0.06461313, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.0125831, + "epoch": 0.3681346760859763, + "flos": 24651972397440.0, + "grad_norm": 1.8972357597085572, + "language_loss": 0.69858962, + "learning_rate": 2.915797361163875e-06, + "loss": 0.77593577, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15002441, + "step": 6123, + "time_per_iteration": 2.5574307441711426 + }, + { + "auxiliary_loss_clip": 0.06474412, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06293641, + "balance_loss_mlp": 0.01256094, + "epoch": 0.36819479933864424, + "flos": 23885152957440.0, + "grad_norm": 2.796866262853862, + "language_loss": 0.74766016, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.8251307, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.16540527, + "step": 6124, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.06470435, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.01258116, + "epoch": 0.3682549225913122, + "flos": 25560606072960.0, + "grad_norm": 3.2532876436035236, + "language_loss": 0.74467599, + "learning_rate": 2.915104825441114e-06, + "loss": 0.82212794, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16625977, + "step": 6125, + "time_per_iteration": 2.5822880268096924 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296605, + "balance_loss_mlp": 0.01253967, + "epoch": 0.36831504584398017, + "flos": 16952317989120.0, + "grad_norm": 1.938795434914092, + "language_loss": 0.7843706, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.86184579, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16809082, + "step": 6126, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.06483818, + "auxiliary_loss_mlp": 0.01275366, + "balance_loss_clip": 0.06301596, + "balance_loss_mlp": 0.01257413, + "epoch": 0.36837516909664814, + "flos": 19871198968320.0, + "grad_norm": 2.3034543329783173, + "language_loss": 0.66139042, + "learning_rate": 2.914412150914888e-06, + "loss": 0.73898232, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.17980957, + "step": 6127, + "time_per_iteration": 2.5208253860473633 + }, + { + "auxiliary_loss_clip": 0.06475674, + "auxiliary_loss_mlp": 0.01272228, + "balance_loss_clip": 0.06294744, + "balance_loss_mlp": 0.01256409, + "epoch": 0.3684352923493161, + "flos": 37634976224640.0, + "grad_norm": 1.7597572196634643, + "language_loss": 0.70472896, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.78220791, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.15808105, + "step": 6128, + "time_per_iteration": 2.6984474658966064 + }, + { + "auxiliary_loss_clip": 0.06467833, + "auxiliary_loss_mlp": 0.01270944, + "balance_loss_clip": 0.06293194, + "balance_loss_mlp": 0.01255613, + "epoch": 0.36849541560198407, + "flos": 14470786995840.0, + "grad_norm": 1.6868142680460214, + "language_loss": 0.7591843, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.83657211, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15332031, + "step": 6129, + "time_per_iteration": 2.49924898147583 + }, + { + "auxiliary_loss_clip": 0.06473218, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 0.06296876, + "balance_loss_mlp": 0.01255844, + "epoch": 0.36855553885465203, + "flos": 25777037969280.0, + "grad_norm": 1.6502765336301308, + "language_loss": 0.85087365, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.92831397, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.1496582, + "step": 6130, + "time_per_iteration": 2.604851484298706 + }, + { + "auxiliary_loss_clip": 0.06391466, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06313837, + "balance_loss_mlp": 0.01261091, + "epoch": 0.36861566210732, + "flos": 65071715212800.0, + "grad_norm": 0.7916436629428728, + "language_loss": 0.60275888, + "learning_rate": 2.913026385872321e-06, + "loss": 0.67931175, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02740479, + "step": 6131, + "time_per_iteration": 3.228571891784668 + }, + { + "auxiliary_loss_clip": 0.0647023, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06296837, + "balance_loss_mlp": 0.01255332, + "epoch": 0.36867578535998796, + "flos": 30962108148480.0, + "grad_norm": 1.7580055354180455, + "language_loss": 0.73204952, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.8094579, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.152771, + "step": 6132, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.06478602, + "auxiliary_loss_mlp": 0.01273616, + "balance_loss_clip": 0.0629575, + "balance_loss_mlp": 0.0125738, + "epoch": 0.3687359086126559, + "flos": 28845154270080.0, + "grad_norm": 1.8077518075699008, + "language_loss": 0.7455107, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.82303286, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16235352, + "step": 6133, + "time_per_iteration": 2.6024398803710938 + }, + { + "auxiliary_loss_clip": 0.06463782, + "auxiliary_loss_mlp": 0.0127464, + "balance_loss_clip": 0.06292324, + "balance_loss_mlp": 0.01258618, + "epoch": 0.3687960318653239, + "flos": 21403076912640.0, + "grad_norm": 1.7721182564640174, + "language_loss": 0.7199074, + "learning_rate": 2.911986698512874e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.16027832, + "step": 6134, + "time_per_iteration": 2.646097421646118 + }, + { + "auxiliary_loss_clip": 0.0646476, + "auxiliary_loss_mlp": 0.0126875, + "balance_loss_clip": 0.06289706, + "balance_loss_mlp": 0.01252288, + "epoch": 0.36885615511799186, + "flos": 20272183482240.0, + "grad_norm": 4.124945820193244, + "language_loss": 0.7570188, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83435392, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.16455078, + "step": 6135, + "time_per_iteration": 2.6019539833068848 + }, + { + "auxiliary_loss_clip": 0.06382909, + "auxiliary_loss_mlp": 0.01256883, + "balance_loss_clip": 0.06304377, + "balance_loss_mlp": 0.0125392, + "epoch": 0.3689162783706599, + "flos": 63106317371520.0, + "grad_norm": 0.7816734524389999, + "language_loss": 0.58664352, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.66304147, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.02960205, + "step": 6136, + "time_per_iteration": 3.139789342880249 + }, + { + "auxiliary_loss_clip": 0.06465235, + "auxiliary_loss_mlp": 0.01270986, + "balance_loss_clip": 0.06292487, + "balance_loss_mlp": 0.012563, + "epoch": 0.36897640162332784, + "flos": 10966536593280.0, + "grad_norm": 2.7370945268269806, + "language_loss": 0.79547632, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8728385, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14678955, + "step": 6137, + "time_per_iteration": 3.937328577041626 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271273, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01255764, + "epoch": 0.3690365248759958, + "flos": 20710581644160.0, + "grad_norm": 1.9257362559650297, + "language_loss": 0.74479491, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.82222939, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15515137, + "step": 6138, + "time_per_iteration": 4.004723072052002 + }, + { + "auxiliary_loss_clip": 0.06475753, + "auxiliary_loss_mlp": 0.01270871, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01255827, + "epoch": 0.3690966481286638, + "flos": 31833495884160.0, + "grad_norm": 1.986271481109943, + "language_loss": 0.65762347, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.73508972, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1505127, + "step": 6139, + "time_per_iteration": 2.621832847595215 + }, + { + "auxiliary_loss_clip": 0.06460394, + "auxiliary_loss_mlp": 0.01271698, + "balance_loss_clip": 0.06290884, + "balance_loss_mlp": 0.0125626, + "epoch": 0.36915677138133174, + "flos": 13119897870720.0, + "grad_norm": 1.9334180469367421, + "language_loss": 0.72060692, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7979278, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15429688, + "step": 6140, + "time_per_iteration": 2.542410135269165 + }, + { + "auxiliary_loss_clip": 0.06370358, + "auxiliary_loss_mlp": 0.01255246, + "balance_loss_clip": 0.06292184, + "balance_loss_mlp": 0.01252388, + "epoch": 0.3692168946339997, + "flos": 68707926996480.0, + "grad_norm": 0.7297912869343693, + "language_loss": 0.59210759, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.66836369, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02853394, + "step": 6141, + "time_per_iteration": 3.242342710494995 + }, + { + "auxiliary_loss_clip": 0.06465677, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06289662, + "balance_loss_mlp": 0.01256336, + "epoch": 0.36927701788666767, + "flos": 22024392537600.0, + "grad_norm": 1.6449420117919953, + "language_loss": 0.75489783, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.16149902, + "step": 6142, + "time_per_iteration": 2.552541732788086 + }, + { + "auxiliary_loss_clip": 0.06459697, + "auxiliary_loss_mlp": 0.01271426, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01256883, + "epoch": 0.36933714113933563, + "flos": 21842103980160.0, + "grad_norm": 2.1834908331499694, + "language_loss": 0.77180201, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84911323, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14544678, + "step": 6143, + "time_per_iteration": 3.990859031677246 + }, + { + "auxiliary_loss_clip": 0.06461622, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06289461, + "balance_loss_mlp": 0.01251565, + "epoch": 0.3693972643920036, + "flos": 23697749301120.0, + "grad_norm": 1.9416354027972629, + "language_loss": 0.82307315, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.9003436, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13867188, + "step": 6144, + "time_per_iteration": 2.5504705905914307 + }, + { + "auxiliary_loss_clip": 0.06462898, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06287374, + "balance_loss_mlp": 0.01255586, + "epoch": 0.36945738764467156, + "flos": 22863355943040.0, + "grad_norm": 2.172105123479451, + "language_loss": 0.78995448, + "learning_rate": 2.908171851365593e-06, + "loss": 0.86728209, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14282227, + "step": 6145, + "time_per_iteration": 3.9733781814575195 + }, + { + "auxiliary_loss_clip": 0.06468924, + "auxiliary_loss_mlp": 0.01271457, + "balance_loss_clip": 0.06291068, + "balance_loss_mlp": 0.01256067, + "epoch": 0.36951751089733953, + "flos": 16621213380480.0, + "grad_norm": 1.6722610276638135, + "language_loss": 0.77129662, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8487004, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15380859, + "step": 6146, + "time_per_iteration": 2.5411174297332764 + }, + { + "auxiliary_loss_clip": 0.06466483, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 0.06289164, + "balance_loss_mlp": 0.01260419, + "epoch": 0.3695776341500075, + "flos": 18920204254080.0, + "grad_norm": 1.6293394058894772, + "language_loss": 0.81346822, + "learning_rate": 2.907477794586761e-06, + "loss": 0.89089251, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1550293, + "step": 6147, + "time_per_iteration": 2.5456924438476562 + }, + { + "auxiliary_loss_clip": 0.06463629, + "auxiliary_loss_mlp": 0.01275917, + "balance_loss_clip": 0.06286413, + "balance_loss_mlp": 0.01261684, + "epoch": 0.36963775740267546, + "flos": 20813892128640.0, + "grad_norm": 1.8090658573318705, + "language_loss": 0.83484954, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14227295, + "step": 6148, + "time_per_iteration": 2.6318178176879883 + }, + { + "auxiliary_loss_clip": 0.06458767, + "auxiliary_loss_mlp": 0.01266964, + "balance_loss_clip": 0.06284354, + "balance_loss_mlp": 0.01252814, + "epoch": 0.3696978806553435, + "flos": 26068087526400.0, + "grad_norm": 2.191330684134815, + "language_loss": 0.74277508, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.82003242, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14147949, + "step": 6149, + "time_per_iteration": 2.6037940979003906 + }, + { + "auxiliary_loss_clip": 0.06464496, + "auxiliary_loss_mlp": 0.01271867, + "balance_loss_clip": 0.06287233, + "balance_loss_mlp": 0.01256203, + "epoch": 0.36975800390801145, + "flos": 26841237949440.0, + "grad_norm": 2.856714094904378, + "language_loss": 0.71066409, + "learning_rate": 2.906436451364054e-06, + "loss": 0.78802776, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15667725, + "step": 6150, + "time_per_iteration": 2.612860918045044 + }, + { + "auxiliary_loss_clip": 0.06457143, + "auxiliary_loss_mlp": 0.01270306, + "balance_loss_clip": 0.06283612, + "balance_loss_mlp": 0.01256341, + "epoch": 0.3698181271606794, + "flos": 21149063660160.0, + "grad_norm": 1.8423166255946122, + "language_loss": 0.81970799, + "learning_rate": 2.906089268194611e-06, + "loss": 0.89698249, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1395874, + "step": 6151, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.0635625, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06277541, + "balance_loss_mlp": 0.01262752, + "epoch": 0.3698782504133474, + "flos": 66761605958400.0, + "grad_norm": 0.7660918799950965, + "language_loss": 0.63089043, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.70711315, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03274536, + "step": 6152, + "time_per_iteration": 3.27481746673584 + }, + { + "auxiliary_loss_clip": 0.06456928, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289765, + "balance_loss_mlp": 0.01256709, + "epoch": 0.36993837366601534, + "flos": 24317597479680.0, + "grad_norm": 2.4460843976292455, + "language_loss": 0.7067228, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.78398836, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.12921143, + "step": 6153, + "time_per_iteration": 2.561366319656372 + }, + { + "auxiliary_loss_clip": 0.06461591, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.0125796, + "epoch": 0.3699984969186833, + "flos": 24355472325120.0, + "grad_norm": 1.7390512131477307, + "language_loss": 0.72820848, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.80554867, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14459229, + "step": 6154, + "time_per_iteration": 2.6359784603118896 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06290819, + "balance_loss_mlp": 0.01256468, + "epoch": 0.37005862017135127, + "flos": 19835378547840.0, + "grad_norm": 1.7720975153034155, + "language_loss": 0.68251342, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.75985944, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1361084, + "step": 6155, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.06462097, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06290478, + "balance_loss_mlp": 0.01261551, + "epoch": 0.37011874342401924, + "flos": 19579981703040.0, + "grad_norm": 1.763175663447542, + "language_loss": 0.68228447, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.75965828, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13745117, + "step": 6156, + "time_per_iteration": 2.5805797576904297 + }, + { + "auxiliary_loss_clip": 0.06460856, + "auxiliary_loss_mlp": 0.01276122, + "balance_loss_clip": 0.06292138, + "balance_loss_mlp": 0.01263051, + "epoch": 0.3701788666766872, + "flos": 20380315576320.0, + "grad_norm": 2.4756712581972673, + "language_loss": 0.82280111, + "learning_rate": 2.904005448099916e-06, + "loss": 0.9001708, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13061523, + "step": 6157, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.06472905, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 0.06294029, + "balance_loss_mlp": 0.0126136, + "epoch": 0.37023898992935517, + "flos": 15346325508480.0, + "grad_norm": 2.1879647979069055, + "language_loss": 0.77007514, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15142822, + "step": 6158, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.06472066, + "auxiliary_loss_mlp": 0.01273585, + "balance_loss_clip": 0.0629342, + "balance_loss_mlp": 0.0125872, + "epoch": 0.37029911318202313, + "flos": 19580149411200.0, + "grad_norm": 1.9796058392103062, + "language_loss": 0.68833315, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.76578963, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14880371, + "step": 6159, + "time_per_iteration": 2.4941582679748535 + }, + { + "auxiliary_loss_clip": 0.06464109, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 0.06292266, + "balance_loss_mlp": 0.01261986, + "epoch": 0.3703592364346911, + "flos": 26220509303040.0, + "grad_norm": 1.9367461088396363, + "language_loss": 0.71322787, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.79061961, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13079834, + "step": 6160, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.06465742, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06295532, + "balance_loss_mlp": 0.0125958, + "epoch": 0.37041935968735906, + "flos": 20054619555840.0, + "grad_norm": 1.6534007301448785, + "language_loss": 0.78978807, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86717302, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1317749, + "step": 6161, + "time_per_iteration": 2.5337588787078857 + }, + { + "auxiliary_loss_clip": 0.06465232, + "auxiliary_loss_mlp": 0.01270423, + "balance_loss_clip": 0.06291839, + "balance_loss_mlp": 0.01255837, + "epoch": 0.3704794829400271, + "flos": 24140633656320.0, + "grad_norm": 1.7631614273732186, + "language_loss": 0.79746109, + "learning_rate": 2.902267988534295e-06, + "loss": 0.87481761, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14587402, + "step": 6162, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.06466715, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 0.06292939, + "balance_loss_mlp": 0.01260717, + "epoch": 0.37053960619269505, + "flos": 14872232707200.0, + "grad_norm": 1.8866019587111915, + "language_loss": 0.80318987, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.88060015, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13580322, + "step": 6163, + "time_per_iteration": 2.501971483230591 + }, + { + "auxiliary_loss_clip": 0.06466764, + "auxiliary_loss_mlp": 0.01273928, + "balance_loss_clip": 0.0629348, + "balance_loss_mlp": 0.01260315, + "epoch": 0.370599729445363, + "flos": 21367969251840.0, + "grad_norm": 1.81392406825425, + "language_loss": 0.68857837, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.76598537, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13598633, + "step": 6164, + "time_per_iteration": 2.557870388031006 + }, + { + "auxiliary_loss_clip": 0.06463528, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.0126064, + "epoch": 0.370659852698031, + "flos": 26835535872000.0, + "grad_norm": 2.3609289004256984, + "language_loss": 0.83364576, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.91103643, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14916992, + "step": 6165, + "time_per_iteration": 2.5597267150878906 + }, + { + "auxiliary_loss_clip": 0.06475651, + "auxiliary_loss_mlp": 0.01276631, + "balance_loss_clip": 0.06294797, + "balance_loss_mlp": 0.01261086, + "epoch": 0.37071997595069894, + "flos": 19105050360960.0, + "grad_norm": 1.8212520052796557, + "language_loss": 0.69703627, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.77455908, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15551758, + "step": 6166, + "time_per_iteration": 2.7443737983703613 + }, + { + "auxiliary_loss_clip": 0.06351966, + "auxiliary_loss_mlp": 0.01259396, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01255936, + "epoch": 0.3707800992033669, + "flos": 52193839461120.0, + "grad_norm": 0.7767712005900987, + "language_loss": 0.55992532, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.6360389, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.03469849, + "step": 6167, + "time_per_iteration": 3.122786045074463 + }, + { + "auxiliary_loss_clip": 0.06470326, + "auxiliary_loss_mlp": 0.01270542, + "balance_loss_clip": 0.06298738, + "balance_loss_mlp": 0.01256553, + "epoch": 0.3708402224560349, + "flos": 19908025637760.0, + "grad_norm": 1.887650816435161, + "language_loss": 0.75851792, + "learning_rate": 2.900181908135584e-06, + "loss": 0.83592659, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13983154, + "step": 6168, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.06462339, + "auxiliary_loss_mlp": 0.01269774, + "balance_loss_clip": 0.0628986, + "balance_loss_mlp": 0.01255833, + "epoch": 0.37090034570870284, + "flos": 20013222839040.0, + "grad_norm": 1.688087532093935, + "language_loss": 0.74697542, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82429659, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13946533, + "step": 6169, + "time_per_iteration": 2.571059226989746 + }, + { + "auxiliary_loss_clip": 0.06462043, + "auxiliary_loss_mlp": 0.01269285, + "balance_loss_clip": 0.06291892, + "balance_loss_mlp": 0.0125526, + "epoch": 0.3709604689613708, + "flos": 24141681832320.0, + "grad_norm": 1.6120375976718775, + "language_loss": 0.79462636, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87193966, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14007568, + "step": 6170, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.06461793, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06289523, + "balance_loss_mlp": 0.01254183, + "epoch": 0.37102059221403877, + "flos": 23882469626880.0, + "grad_norm": 1.7170622011660002, + "language_loss": 0.76363444, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.84094131, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14703369, + "step": 6171, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.06464403, + "auxiliary_loss_mlp": 0.01269741, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.0125568, + "epoch": 0.37108071546670673, + "flos": 14506439708160.0, + "grad_norm": 2.2434941236901222, + "language_loss": 0.80974334, + "learning_rate": 2.898790504994232e-06, + "loss": 0.88708472, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.140625, + "step": 6172, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.06468061, + "auxiliary_loss_mlp": 0.01272991, + "balance_loss_clip": 0.06291698, + "balance_loss_mlp": 0.01258352, + "epoch": 0.3711408387193747, + "flos": 34570172160000.0, + "grad_norm": 1.701200983183655, + "language_loss": 0.59536189, + "learning_rate": 2.89844256897035e-06, + "loss": 0.67277241, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14648438, + "step": 6173, + "time_per_iteration": 2.68860125541687 + }, + { + "auxiliary_loss_clip": 0.06465948, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06291407, + "balance_loss_mlp": 0.01252825, + "epoch": 0.37120096197204266, + "flos": 17316350052480.0, + "grad_norm": 3.482738270256764, + "language_loss": 0.81161231, + "learning_rate": 2.898094598877435e-06, + "loss": 0.88894391, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1439209, + "step": 6174, + "time_per_iteration": 2.498631238937378 + }, + { + "auxiliary_loss_clip": 0.06459825, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06290745, + "balance_loss_mlp": 0.01253826, + "epoch": 0.37126108522471063, + "flos": 30671855205120.0, + "grad_norm": 1.7762050826086826, + "language_loss": 0.79733562, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.87460476, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13275146, + "step": 6175, + "time_per_iteration": 2.6155989170074463 + }, + { + "auxiliary_loss_clip": 0.06469794, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06296568, + "balance_loss_mlp": 0.01253926, + "epoch": 0.37132120847737865, + "flos": 25162682232960.0, + "grad_norm": 2.183025760433602, + "language_loss": 0.8886646, + "learning_rate": 2.89739855653729e-06, + "loss": 0.96605068, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14880371, + "step": 6176, + "time_per_iteration": 3.9855380058288574 + }, + { + "auxiliary_loss_clip": 0.06463525, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.01252331, + "epoch": 0.3713813317300466, + "flos": 21219572471040.0, + "grad_norm": 1.8377156327305517, + "language_loss": 0.73693877, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.8142367, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13952637, + "step": 6177, + "time_per_iteration": 2.584007501602173 + }, + { + "auxiliary_loss_clip": 0.06460603, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3714414549827146, + "flos": 21623114534400.0, + "grad_norm": 3.348536242845292, + "language_loss": 0.75657964, + "learning_rate": 2.896702378079374e-06, + "loss": 0.83389515, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14227295, + "step": 6178, + "time_per_iteration": 4.047810077667236 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01253796, + "epoch": 0.37150157823538255, + "flos": 19978073251200.0, + "grad_norm": 1.677068577007521, + "language_loss": 0.7243154, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14465332, + "step": 6179, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.06464912, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06289817, + "balance_loss_mlp": 0.01254506, + "epoch": 0.3715617014880505, + "flos": 24867020701440.0, + "grad_norm": 1.5744290711880986, + "language_loss": 0.70164317, + "learning_rate": 2.896006063609283e-06, + "loss": 0.77898097, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14361572, + "step": 6180, + "time_per_iteration": 2.564251661300659 + }, + { + "auxiliary_loss_clip": 0.06459807, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.0628929, + "balance_loss_mlp": 0.01255173, + "epoch": 0.3716218247407185, + "flos": 20455352507520.0, + "grad_norm": 1.6669585833251956, + "language_loss": 0.78357702, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.86087286, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6181, + "time_per_iteration": 2.5857934951782227 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 0.06286183, + "balance_loss_mlp": 0.01254195, + "epoch": 0.37168194799338644, + "flos": 24140256312960.0, + "grad_norm": 1.7806049549646892, + "language_loss": 0.78926349, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.86651719, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14520264, + "step": 6182, + "time_per_iteration": 2.572563409805298 + }, + { + "auxiliary_loss_clip": 0.0637676, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 0.06297279, + "balance_loss_mlp": 0.01253508, + "epoch": 0.3717420712460544, + "flos": 67429601107200.0, + "grad_norm": 0.7782169453066291, + "language_loss": 0.57265592, + "learning_rate": 2.894961337112362e-06, + "loss": 0.64899027, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.03170776, + "step": 6183, + "time_per_iteration": 4.616533279418945 + }, + { + "auxiliary_loss_clip": 0.06460768, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01258059, + "epoch": 0.37180219449872237, + "flos": 22382512888320.0, + "grad_norm": 2.288371354177028, + "language_loss": 0.77116179, + "learning_rate": 2.894613027055066e-06, + "loss": 0.84849966, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.1496582, + "step": 6184, + "time_per_iteration": 2.5182292461395264 + }, + { + "auxiliary_loss_clip": 0.06457444, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.0628842, + "balance_loss_mlp": 0.01255739, + "epoch": 0.37186231775139034, + "flos": 21876037683840.0, + "grad_norm": 2.2342830987852023, + "language_loss": 0.72608167, + "learning_rate": 2.894264683073954e-06, + "loss": 0.80335367, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14007568, + "step": 6185, + "time_per_iteration": 3.928272247314453 + }, + { + "auxiliary_loss_clip": 0.06453837, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06286646, + "balance_loss_mlp": 0.01253075, + "epoch": 0.3719224410040583, + "flos": 22421142420480.0, + "grad_norm": 1.6056881027286982, + "language_loss": 0.77329034, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85050094, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14160156, + "step": 6186, + "time_per_iteration": 2.549499988555908 + }, + { + "auxiliary_loss_clip": 0.0646092, + "auxiliary_loss_mlp": 0.01274226, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01258121, + "epoch": 0.37198256425672627, + "flos": 25157525207040.0, + "grad_norm": 1.8763954627941488, + "language_loss": 0.84227252, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.91962403, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.16101074, + "step": 6187, + "time_per_iteration": 2.542978048324585 + }, + { + "auxiliary_loss_clip": 0.06456143, + "auxiliary_loss_mlp": 0.01269651, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255919, + "epoch": 0.37204268750939423, + "flos": 21144032415360.0, + "grad_norm": 2.100791898470326, + "language_loss": 0.84696567, + "learning_rate": 2.893219447719824e-06, + "loss": 0.9242236, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13726807, + "step": 6188, + "time_per_iteration": 2.626126766204834 + }, + { + "auxiliary_loss_clip": 0.06458837, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06288396, + "balance_loss_mlp": 0.01256232, + "epoch": 0.37210281076206225, + "flos": 21513221504640.0, + "grad_norm": 2.2586863759616564, + "language_loss": 0.66390121, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.74118853, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13653564, + "step": 6189, + "time_per_iteration": 2.5793135166168213 + }, + { + "auxiliary_loss_clip": 0.06460261, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.0628726, + "balance_loss_mlp": 0.01255926, + "epoch": 0.3721629340147302, + "flos": 17353595992320.0, + "grad_norm": 2.971940637043147, + "language_loss": 0.84218514, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.91950166, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15466309, + "step": 6190, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.06464738, + "auxiliary_loss_mlp": 0.01270544, + "balance_loss_clip": 0.06287063, + "balance_loss_mlp": 0.01255905, + "epoch": 0.3722230572673982, + "flos": 16437457376640.0, + "grad_norm": 2.7368484374177076, + "language_loss": 0.89274895, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.97010183, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.14648438, + "step": 6191, + "time_per_iteration": 2.4786319732666016 + }, + { + "auxiliary_loss_clip": 0.06463645, + "auxiliary_loss_mlp": 0.01271285, + "balance_loss_clip": 0.06286322, + "balance_loss_mlp": 0.01254465, + "epoch": 0.37228318052006615, + "flos": 22681360874880.0, + "grad_norm": 2.1321020045013577, + "language_loss": 0.74374199, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82109123, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.16790771, + "step": 6192, + "time_per_iteration": 2.6107547283172607 + }, + { + "auxiliary_loss_clip": 0.06461145, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06288278, + "balance_loss_mlp": 0.0125493, + "epoch": 0.3723433037727341, + "flos": 25272617189760.0, + "grad_norm": 2.3785606336548124, + "language_loss": 0.79934001, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.87664223, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14154053, + "step": 6193, + "time_per_iteration": 2.5584514141082764 + }, + { + "auxiliary_loss_clip": 0.06469596, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 0.06293128, + "balance_loss_mlp": 0.01255594, + "epoch": 0.3724034270254021, + "flos": 10529228534400.0, + "grad_norm": 1.7620775512614164, + "language_loss": 0.84889179, + "learning_rate": 2.891128062852194e-06, + "loss": 0.92628884, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14526367, + "step": 6194, + "time_per_iteration": 2.5419061183929443 + }, + { + "auxiliary_loss_clip": 0.06460975, + "auxiliary_loss_mlp": 0.01266847, + "balance_loss_clip": 0.06288271, + "balance_loss_mlp": 0.01253317, + "epoch": 0.37246355027807004, + "flos": 20272393117440.0, + "grad_norm": 2.226391461709797, + "language_loss": 0.78030515, + "learning_rate": 2.890779380359646e-06, + "loss": 0.85758334, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13543701, + "step": 6195, + "time_per_iteration": 2.51361346244812 + }, + { + "auxiliary_loss_clip": 0.06459115, + "auxiliary_loss_mlp": 0.01274112, + "balance_loss_clip": 0.06288831, + "balance_loss_mlp": 0.01258955, + "epoch": 0.372523673530738, + "flos": 19506705707520.0, + "grad_norm": 1.8216220923823887, + "language_loss": 0.79924363, + "learning_rate": 2.890430664088655e-06, + "loss": 0.87657595, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15155029, + "step": 6196, + "time_per_iteration": 2.6005568504333496 + }, + { + "auxiliary_loss_clip": 0.06458211, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01256888, + "epoch": 0.372583796783406, + "flos": 16769945577600.0, + "grad_norm": 2.2795878215352396, + "language_loss": 0.84059894, + "learning_rate": 2.890081914052443e-06, + "loss": 0.91788948, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13952637, + "step": 6197, + "time_per_iteration": 2.538058042526245 + }, + { + "auxiliary_loss_clip": 0.06456813, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06289704, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37264392003607394, + "flos": 22644576132480.0, + "grad_norm": 1.7143100919816474, + "language_loss": 0.64964151, + "learning_rate": 2.889733130264237e-06, + "loss": 0.72691035, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14971924, + "step": 6198, + "time_per_iteration": 2.5891072750091553 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 0.0628581, + "balance_loss_mlp": 0.01258367, + "epoch": 0.3727040432887419, + "flos": 19979037573120.0, + "grad_norm": 1.4303592099178044, + "language_loss": 0.74534631, + "learning_rate": 2.889384312737261e-06, + "loss": 0.82261217, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13977051, + "step": 6199, + "time_per_iteration": 2.5612289905548096 + }, + { + "auxiliary_loss_clip": 0.06453978, + "auxiliary_loss_mlp": 0.01269323, + "balance_loss_clip": 0.06284302, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37276416654140987, + "flos": 63911906853120.0, + "grad_norm": 1.6001689252403943, + "language_loss": 0.81250614, + "learning_rate": 2.889035461484742e-06, + "loss": 0.88973916, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14227295, + "step": 6200, + "time_per_iteration": 2.9802377223968506 + }, + { + "auxiliary_loss_clip": 0.06452343, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.0125907, + "epoch": 0.37282428979407783, + "flos": 39795381244800.0, + "grad_norm": 2.0282879733455776, + "language_loss": 0.61128068, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68853581, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.6786048412323 + }, + { + "auxiliary_loss_clip": 0.06460309, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.06284842, + "balance_loss_mlp": 0.01257166, + "epoch": 0.37288441304674586, + "flos": 22715336505600.0, + "grad_norm": 1.562126243298772, + "language_loss": 0.73424393, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.81156611, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14746094, + "step": 6202, + "time_per_iteration": 2.5774593353271484 + }, + { + "auxiliary_loss_clip": 0.06450565, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01253697, + "epoch": 0.3729445362994138, + "flos": 18776209812480.0, + "grad_norm": 3.8476229642649895, + "language_loss": 0.73690808, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.81410116, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1505127, + "step": 6203, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.06448745, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01253402, + "epoch": 0.3730046595520818, + "flos": 22462874553600.0, + "grad_norm": 1.6222639611717555, + "language_loss": 0.82113981, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.89829516, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13391113, + "step": 6204, + "time_per_iteration": 2.5474419593811035 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01267649, + "balance_loss_clip": 0.06282973, + "balance_loss_mlp": 0.01253094, + "epoch": 0.37306478280474975, + "flos": 24323257630080.0, + "grad_norm": 1.5013454609640156, + "language_loss": 0.75699729, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14562988, + "step": 6205, + "time_per_iteration": 2.5284838676452637 + }, + { + "auxiliary_loss_clip": 0.06453846, + "auxiliary_loss_mlp": 0.01269403, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3731249060574177, + "flos": 15820627944960.0, + "grad_norm": 2.409990557003708, + "language_loss": 0.78042793, + "learning_rate": 2.886941646474128e-06, + "loss": 0.85766041, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14868164, + "step": 6206, + "time_per_iteration": 2.5130996704101562 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01253085, + "epoch": 0.3731850293100857, + "flos": 19834120736640.0, + "grad_norm": 3.8358433201526334, + "language_loss": 0.93966329, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01690984, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15734863, + "step": 6207, + "time_per_iteration": 2.4994020462036133 + }, + { + "auxiliary_loss_clip": 0.06459471, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06283936, + "balance_loss_mlp": 0.01254561, + "epoch": 0.37324515256275365, + "flos": 19068349472640.0, + "grad_norm": 2.1400449567396826, + "language_loss": 0.82643408, + "learning_rate": 2.886243438932759e-06, + "loss": 0.90372002, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14575195, + "step": 6208, + "time_per_iteration": 2.5359628200531006 + }, + { + "auxiliary_loss_clip": 0.06460227, + "auxiliary_loss_mlp": 0.01272188, + "balance_loss_clip": 0.06285752, + "balance_loss_mlp": 0.01255904, + "epoch": 0.3733052758154216, + "flos": 20710623571200.0, + "grad_norm": 2.148305950788212, + "language_loss": 0.73528939, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.81261349, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1628418, + "step": 6209, + "time_per_iteration": 2.499209403991699 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01273959, + "balance_loss_clip": 0.06285547, + "balance_loss_mlp": 0.01258593, + "epoch": 0.3733653990680896, + "flos": 20199704100480.0, + "grad_norm": 2.014449395888949, + "language_loss": 0.71212471, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.78942245, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.15368652, + "step": 6210, + "time_per_iteration": 2.5324270725250244 + }, + { + "auxiliary_loss_clip": 0.06468424, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.06295058, + "balance_loss_mlp": 0.01253631, + "epoch": 0.37342552232075754, + "flos": 20345920675200.0, + "grad_norm": 1.543701660359285, + "language_loss": 0.7823801, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.85975003, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.1494751, + "step": 6211, + "time_per_iteration": 2.5388078689575195 + }, + { + "auxiliary_loss_clip": 0.06464606, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06291494, + "balance_loss_mlp": 0.0125347, + "epoch": 0.3734856455734255, + "flos": 35526701243520.0, + "grad_norm": 1.6765525733287814, + "language_loss": 0.73612988, + "learning_rate": 2.884846620678668e-06, + "loss": 0.81346345, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15283203, + "step": 6212, + "time_per_iteration": 2.663950204849243 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06294222, + "balance_loss_mlp": 0.01256345, + "epoch": 0.37354576882609347, + "flos": 21148686316800.0, + "grad_norm": 1.865900947954382, + "language_loss": 0.82430422, + "learning_rate": 2.884497332198356e-06, + "loss": 0.90180945, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16455078, + "step": 6213, + "time_per_iteration": 2.541431427001953 + }, + { + "auxiliary_loss_clip": 0.06467836, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.01255623, + "epoch": 0.37360589207876144, + "flos": 21513179577600.0, + "grad_norm": 2.345206885791162, + "language_loss": 0.7896657, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.86705506, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15466309, + "step": 6214, + "time_per_iteration": 2.545792579650879 + }, + { + "auxiliary_loss_clip": 0.06466322, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06297071, + "balance_loss_mlp": 0.01255981, + "epoch": 0.37366601533142946, + "flos": 38444953317120.0, + "grad_norm": 1.6116656191599898, + "language_loss": 0.85112274, + "learning_rate": 2.883798654630296e-06, + "loss": 0.92849338, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14758301, + "step": 6215, + "time_per_iteration": 2.70700740814209 + }, + { + "auxiliary_loss_clip": 0.06472297, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06296762, + "balance_loss_mlp": 0.01254044, + "epoch": 0.3737261385840974, + "flos": 18446908066560.0, + "grad_norm": 1.6510257786225762, + "language_loss": 0.6833967, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.76082057, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16040039, + "step": 6216, + "time_per_iteration": 3.941821575164795 + }, + { + "auxiliary_loss_clip": 0.06466141, + "auxiliary_loss_mlp": 0.01276294, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01260224, + "epoch": 0.3737862618367654, + "flos": 22936506157440.0, + "grad_norm": 2.1208446300989983, + "language_loss": 0.6621505, + "learning_rate": 2.883099843007303e-06, + "loss": 0.73957485, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.1607666, + "step": 6217, + "time_per_iteration": 4.067852258682251 + }, + { + "auxiliary_loss_clip": 0.06468368, + "auxiliary_loss_mlp": 0.01272371, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.0125772, + "epoch": 0.37384638508943335, + "flos": 15414360624000.0, + "grad_norm": 1.5564133784357135, + "language_loss": 0.80760753, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.1463623, + "step": 6218, + "time_per_iteration": 2.5253372192382812 + }, + { + "auxiliary_loss_clip": 0.06465785, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06298652, + "balance_loss_mlp": 0.01256661, + "epoch": 0.3739065083421013, + "flos": 24287856480000.0, + "grad_norm": 2.4835018506755566, + "language_loss": 0.79185957, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86923778, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.15380859, + "step": 6219, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.06464131, + "auxiliary_loss_mlp": 0.01274727, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260177, + "epoch": 0.3739666315947693, + "flos": 23009488663680.0, + "grad_norm": 2.098390778414135, + "language_loss": 0.77614415, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.85353279, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14538574, + "step": 6220, + "time_per_iteration": 2.5899298191070557 + }, + { + "auxiliary_loss_clip": 0.06466513, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.0125541, + "epoch": 0.37402675484743725, + "flos": 19397231948160.0, + "grad_norm": 1.5821121915867322, + "language_loss": 0.83564717, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91301888, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15240479, + "step": 6221, + "time_per_iteration": 2.540102481842041 + }, + { + "auxiliary_loss_clip": 0.06464627, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06293692, + "balance_loss_mlp": 0.01262647, + "epoch": 0.3740868781001052, + "flos": 17131420091520.0, + "grad_norm": 1.6401420513761291, + "language_loss": 0.76738596, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84480345, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14477539, + "step": 6222, + "time_per_iteration": 4.020254850387573 + }, + { + "auxiliary_loss_clip": 0.06466988, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06296736, + "balance_loss_mlp": 0.01263467, + "epoch": 0.3741470013527732, + "flos": 20049001332480.0, + "grad_norm": 1.799306271558528, + "language_loss": 0.70768011, + "learning_rate": 2.881002604868789e-06, + "loss": 0.785128, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14349365, + "step": 6223, + "time_per_iteration": 2.6146726608276367 + }, + { + "auxiliary_loss_clip": 0.0646846, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01258954, + "epoch": 0.37420712460544114, + "flos": 36905151162240.0, + "grad_norm": 1.9191598081110601, + "language_loss": 0.69292819, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77033412, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1317749, + "step": 6224, + "time_per_iteration": 4.144296407699585 + }, + { + "auxiliary_loss_clip": 0.06463895, + "auxiliary_loss_mlp": 0.01274949, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260126, + "epoch": 0.3742672478581091, + "flos": 22207896979200.0, + "grad_norm": 1.811742579086715, + "language_loss": 0.70166373, + "learning_rate": 2.880303258086228e-06, + "loss": 0.77905214, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14819336, + "step": 6225, + "time_per_iteration": 2.562023162841797 + }, + { + "auxiliary_loss_clip": 0.06462345, + "auxiliary_loss_mlp": 0.0127698, + "balance_loss_clip": 0.06296264, + "balance_loss_mlp": 0.01262257, + "epoch": 0.3743273711107771, + "flos": 24688547504640.0, + "grad_norm": 2.0306145345851614, + "language_loss": 0.79386592, + "learning_rate": 2.879953534616536e-06, + "loss": 0.87125921, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14715576, + "step": 6226, + "time_per_iteration": 2.5372707843780518 + }, + { + "auxiliary_loss_clip": 0.06464548, + "auxiliary_loss_mlp": 0.01273743, + "balance_loss_clip": 0.0629389, + "balance_loss_mlp": 0.01259021, + "epoch": 0.37438749436344504, + "flos": 24466078114560.0, + "grad_norm": 1.6346435650910545, + "language_loss": 0.68240035, + "learning_rate": 2.879603777778917e-06, + "loss": 0.75978327, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14733887, + "step": 6227, + "time_per_iteration": 2.5752079486846924 + }, + { + "auxiliary_loss_clip": 0.06464467, + "auxiliary_loss_mlp": 0.01270066, + "balance_loss_clip": 0.06297411, + "balance_loss_mlp": 0.0125588, + "epoch": 0.374447617616113, + "flos": 21805193456640.0, + "grad_norm": 1.6298548281431393, + "language_loss": 0.83520573, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91255105, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14190674, + "step": 6228, + "time_per_iteration": 2.605607748031616 + }, + { + "auxiliary_loss_clip": 0.06458256, + "auxiliary_loss_mlp": 0.01270458, + "balance_loss_clip": 0.06288552, + "balance_loss_mlp": 0.01256033, + "epoch": 0.374507740868781, + "flos": 17974073076480.0, + "grad_norm": 1.5343038876343353, + "language_loss": 0.75450277, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83178985, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14428711, + "step": 6229, + "time_per_iteration": 2.607506036758423 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.012714, + "balance_loss_clip": 0.06293011, + "balance_loss_mlp": 0.01256249, + "epoch": 0.374567864121449, + "flos": 16111132450560.0, + "grad_norm": 3.0205318355467083, + "language_loss": 0.84065855, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.91801792, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15142822, + "step": 6230, + "time_per_iteration": 2.4964523315429688 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01275239, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01259569, + "epoch": 0.37462798737411696, + "flos": 25779847080960.0, + "grad_norm": 1.7178487844900587, + "language_loss": 0.73793018, + "learning_rate": 2.878204417014456e-06, + "loss": 0.81532168, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15667725, + "step": 6231, + "time_per_iteration": 2.589771270751953 + }, + { + "auxiliary_loss_clip": 0.06465879, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01255298, + "epoch": 0.3746881106267849, + "flos": 16660136401920.0, + "grad_norm": 1.8762806294571872, + "language_loss": 0.74086344, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.81822443, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14929199, + "step": 6232, + "time_per_iteration": 2.483219861984253 + }, + { + "auxiliary_loss_clip": 0.06463014, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.0125605, + "epoch": 0.3747482338794529, + "flos": 26185317788160.0, + "grad_norm": 1.743409558247901, + "language_loss": 0.77404612, + "learning_rate": 2.877504536769561e-06, + "loss": 0.85138428, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14758301, + "step": 6233, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.06463634, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06292734, + "balance_loss_mlp": 0.01255432, + "epoch": 0.37480835713212085, + "flos": 12025956890880.0, + "grad_norm": 1.7958128584553208, + "language_loss": 0.69650698, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.77383471, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13690186, + "step": 6234, + "time_per_iteration": 2.524226188659668 + }, + { + "auxiliary_loss_clip": 0.06464471, + "auxiliary_loss_mlp": 0.01267248, + "balance_loss_clip": 0.06295948, + "balance_loss_mlp": 0.0125311, + "epoch": 0.3748684803847888, + "flos": 19684801560960.0, + "grad_norm": 2.1537876510353597, + "language_loss": 0.83551729, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91283447, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14135742, + "step": 6235, + "time_per_iteration": 2.5380606651306152 + }, + { + "auxiliary_loss_clip": 0.06462481, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06289958, + "balance_loss_mlp": 0.0125222, + "epoch": 0.3749286036374568, + "flos": 20527328764800.0, + "grad_norm": 1.8434440291752416, + "language_loss": 0.78213942, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8594358, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14941406, + "step": 6236, + "time_per_iteration": 2.507180690765381 + }, + { + "auxiliary_loss_clip": 0.06465082, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 0.06288011, + "balance_loss_mlp": 0.0125616, + "epoch": 0.37498872689012475, + "flos": 20710958987520.0, + "grad_norm": 1.9437086154972172, + "language_loss": 0.73305297, + "learning_rate": 2.876104377085234e-06, + "loss": 0.81043607, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.17077637, + "step": 6237, + "time_per_iteration": 2.5545706748962402 + }, + { + "auxiliary_loss_clip": 0.06460923, + "auxiliary_loss_mlp": 0.01271336, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01256548, + "epoch": 0.3750488501427927, + "flos": 21580418079360.0, + "grad_norm": 2.5847168840400787, + "language_loss": 0.93616223, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01348472, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14788818, + "step": 6238, + "time_per_iteration": 2.544524669647217 + }, + { + "auxiliary_loss_clip": 0.06457306, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06286643, + "balance_loss_mlp": 0.01257127, + "epoch": 0.3751089733954607, + "flos": 15929221236480.0, + "grad_norm": 2.2437121352489093, + "language_loss": 0.71661341, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79390883, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15106201, + "step": 6239, + "time_per_iteration": 2.519807815551758 + }, + { + "auxiliary_loss_clip": 0.06461261, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 0.06287319, + "balance_loss_mlp": 0.01256485, + "epoch": 0.37516909664812864, + "flos": 36293688391680.0, + "grad_norm": 1.5212724151961043, + "language_loss": 0.65758455, + "learning_rate": 2.875053908444895e-06, + "loss": 0.73491299, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15118408, + "step": 6240, + "time_per_iteration": 2.6838748455047607 + }, + { + "auxiliary_loss_clip": 0.06461462, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06288624, + "balance_loss_mlp": 0.01251258, + "epoch": 0.3752292199007966, + "flos": 13520882384640.0, + "grad_norm": 2.454894337240739, + "language_loss": 0.76209545, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.83936143, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13867188, + "step": 6241, + "time_per_iteration": 2.498286008834839 + }, + { + "auxiliary_loss_clip": 0.06461808, + "auxiliary_loss_mlp": 0.01268507, + "balance_loss_clip": 0.06289176, + "balance_loss_mlp": 0.01253206, + "epoch": 0.3752893431534646, + "flos": 27205353866880.0, + "grad_norm": 2.0832931967812853, + "language_loss": 0.84671998, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92402315, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15313721, + "step": 6242, + "time_per_iteration": 2.6289877891540527 + }, + { + "auxiliary_loss_clip": 0.06457841, + "auxiliary_loss_mlp": 0.01272178, + "balance_loss_clip": 0.06285247, + "balance_loss_mlp": 0.01257379, + "epoch": 0.3753494664061326, + "flos": 30015431919360.0, + "grad_norm": 2.6434313807577112, + "language_loss": 0.68551457, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.76281476, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14813232, + "step": 6243, + "time_per_iteration": 2.7211153507232666 + }, + { + "auxiliary_loss_clip": 0.0645824, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01254482, + "epoch": 0.37540958965880056, + "flos": 24468803372160.0, + "grad_norm": 1.7478523324296555, + "language_loss": 0.8397631, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.91704839, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15808105, + "step": 6244, + "time_per_iteration": 2.5738887786865234 + }, + { + "auxiliary_loss_clip": 0.0645659, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01252842, + "epoch": 0.3754697129114685, + "flos": 16513961754240.0, + "grad_norm": 3.8447339818169257, + "language_loss": 0.83823436, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.91546631, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13751221, + "step": 6245, + "time_per_iteration": 2.5320816040039062 + }, + { + "auxiliary_loss_clip": 0.06453504, + "auxiliary_loss_mlp": 0.0127263, + "balance_loss_clip": 0.06282875, + "balance_loss_mlp": 0.01257633, + "epoch": 0.3755298361641365, + "flos": 19396980385920.0, + "grad_norm": 2.4621620681348295, + "language_loss": 0.64685225, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.72411358, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14990234, + "step": 6246, + "time_per_iteration": 2.58577561378479 + }, + { + "auxiliary_loss_clip": 0.06466524, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01262428, + "epoch": 0.37558995941680445, + "flos": 14725638789120.0, + "grad_norm": 2.3474335464279648, + "language_loss": 0.75348055, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.83092844, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.1583252, + "step": 6247, + "time_per_iteration": 2.47930908203125 + }, + { + "auxiliary_loss_clip": 0.06456453, + "auxiliary_loss_mlp": 0.012715, + "balance_loss_clip": 0.06282347, + "balance_loss_mlp": 0.01255503, + "epoch": 0.3756500826694724, + "flos": 21696432456960.0, + "grad_norm": 3.5646784592424017, + "language_loss": 0.55380279, + "learning_rate": 2.872251199697598e-06, + "loss": 0.6310823, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.16003418, + "step": 6248, + "time_per_iteration": 2.5266313552856445 + }, + { + "auxiliary_loss_clip": 0.06453443, + "auxiliary_loss_mlp": 0.01268535, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01253109, + "epoch": 0.3757102059221404, + "flos": 26512942452480.0, + "grad_norm": 1.7302245846967215, + "language_loss": 0.84781861, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92503834, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.15429688, + "step": 6249, + "time_per_iteration": 2.5590078830718994 + }, + { + "auxiliary_loss_clip": 0.06456596, + "auxiliary_loss_mlp": 0.01267858, + "balance_loss_clip": 0.0628508, + "balance_loss_mlp": 0.01253481, + "epoch": 0.37577032917480835, + "flos": 37346526144000.0, + "grad_norm": 1.6299752789251518, + "language_loss": 0.68482721, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.76207179, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14361572, + "step": 6250, + "time_per_iteration": 2.6926450729370117 + }, + { + "auxiliary_loss_clip": 0.06454285, + "auxiliary_loss_mlp": 0.01268088, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01254099, + "epoch": 0.3758304524274763, + "flos": 21915128413440.0, + "grad_norm": 2.0147801854845895, + "language_loss": 0.78550422, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.862728, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13995361, + "step": 6251, + "time_per_iteration": 2.5072193145751953 + }, + { + "auxiliary_loss_clip": 0.06455163, + "auxiliary_loss_mlp": 0.01271265, + "balance_loss_clip": 0.06285167, + "balance_loss_mlp": 0.01257139, + "epoch": 0.3758905756801443, + "flos": 36577233008640.0, + "grad_norm": 2.2428429985343543, + "language_loss": 0.58560276, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.66286701, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14111328, + "step": 6252, + "time_per_iteration": 2.684899091720581 + }, + { + "auxiliary_loss_clip": 0.06456266, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.01255649, + "epoch": 0.37595069893281224, + "flos": 24534616354560.0, + "grad_norm": 1.5871699178816958, + "language_loss": 0.8998009, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.97707891, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15869141, + "step": 6253, + "time_per_iteration": 2.539088010787964 + }, + { + "auxiliary_loss_clip": 0.0645566, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06288448, + "balance_loss_mlp": 0.01255523, + "epoch": 0.3760108221854802, + "flos": 16440518050560.0, + "grad_norm": 2.3821241740713086, + "language_loss": 0.77027023, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.84752858, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.14648438, + "step": 6254, + "time_per_iteration": 2.545330047607422 + }, + { + "auxiliary_loss_clip": 0.06454843, + "auxiliary_loss_mlp": 0.01270718, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01254386, + "epoch": 0.37607094543814823, + "flos": 13776824280960.0, + "grad_norm": 2.2494955117694007, + "language_loss": 0.62504637, + "learning_rate": 2.869797092829169e-06, + "loss": 0.70230198, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.16333008, + "step": 6255, + "time_per_iteration": 3.937791109085083 + }, + { + "auxiliary_loss_clip": 0.06456207, + "auxiliary_loss_mlp": 0.0127009, + "balance_loss_clip": 0.06282066, + "balance_loss_mlp": 0.01253758, + "epoch": 0.3761310686908162, + "flos": 19862855487360.0, + "grad_norm": 2.2501042164391634, + "language_loss": 0.74801397, + "learning_rate": 2.869446374096135e-06, + "loss": 0.82527697, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16345215, + "step": 6256, + "time_per_iteration": 2.52768611907959 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01270671, + "balance_loss_clip": 0.06281887, + "balance_loss_mlp": 0.01254637, + "epoch": 0.37619119194348416, + "flos": 12755823880320.0, + "grad_norm": 1.8167076240371511, + "language_loss": 0.70818299, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.78545058, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16040039, + "step": 6257, + "time_per_iteration": 4.052328824996948 + }, + { + "auxiliary_loss_clip": 0.06452011, + "auxiliary_loss_mlp": 0.01268418, + "balance_loss_clip": 0.0628053, + "balance_loss_mlp": 0.01253743, + "epoch": 0.3762513151961521, + "flos": 17536387674240.0, + "grad_norm": 1.6926603581335775, + "language_loss": 0.85114312, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92834735, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14672852, + "step": 6258, + "time_per_iteration": 2.50252366065979 + }, + { + "auxiliary_loss_clip": 0.06455131, + "auxiliary_loss_mlp": 0.0127104, + "balance_loss_clip": 0.06282814, + "balance_loss_mlp": 0.01256503, + "epoch": 0.3763114384488201, + "flos": 23623215494400.0, + "grad_norm": 1.3678719492617617, + "language_loss": 0.81156051, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8888222, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14532471, + "step": 6259, + "time_per_iteration": 2.5430314540863037 + }, + { + "auxiliary_loss_clip": 0.06458686, + "auxiliary_loss_mlp": 0.01274293, + "balance_loss_clip": 0.06282908, + "balance_loss_mlp": 0.0125696, + "epoch": 0.37637156170148806, + "flos": 25413383249280.0, + "grad_norm": 1.809326583941318, + "language_loss": 0.71774137, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.79507113, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.17321777, + "step": 6260, + "time_per_iteration": 2.566267490386963 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.0127871, + "balance_loss_clip": 0.06279852, + "balance_loss_mlp": 0.01262128, + "epoch": 0.376431684954156, + "flos": 23447677190400.0, + "grad_norm": 1.8475234283885087, + "language_loss": 0.78925788, + "learning_rate": 2.867692286154594e-06, + "loss": 0.86660182, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.16589355, + "step": 6261, + "time_per_iteration": 2.5848124027252197 + }, + { + "auxiliary_loss_clip": 0.06455033, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06278862, + "balance_loss_mlp": 0.01257607, + "epoch": 0.376491808206824, + "flos": 34213099985280.0, + "grad_norm": 2.1653724604475255, + "language_loss": 0.80626601, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88354641, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15405273, + "step": 6262, + "time_per_iteration": 4.146479368209839 + }, + { + "auxiliary_loss_clip": 0.06453078, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 0.06282018, + "balance_loss_mlp": 0.01253799, + "epoch": 0.37655193145949195, + "flos": 35193793772160.0, + "grad_norm": 1.6953841761456194, + "language_loss": 0.81274903, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88996559, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14794922, + "step": 6263, + "time_per_iteration": 2.6529650688171387 + }, + { + "auxiliary_loss_clip": 0.06460523, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 0.06286405, + "balance_loss_mlp": 0.01261172, + "epoch": 0.3766120547121599, + "flos": 16767136465920.0, + "grad_norm": 1.8888627452248796, + "language_loss": 0.79794824, + "learning_rate": 2.866639438447501e-06, + "loss": 0.87531358, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14831543, + "step": 6264, + "time_per_iteration": 3.9715349674224854 + }, + { + "auxiliary_loss_clip": 0.06455237, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06284397, + "balance_loss_mlp": 0.0125396, + "epoch": 0.3766721779648279, + "flos": 23557150949760.0, + "grad_norm": 1.690336708132248, + "language_loss": 0.7363869, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.81363189, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6265, + "time_per_iteration": 2.5544657707214355 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06283864, + "balance_loss_mlp": 0.01262486, + "epoch": 0.37673230121749585, + "flos": 29136329608320.0, + "grad_norm": 1.6256668529315172, + "language_loss": 0.6925773, + "learning_rate": 2.865937375638654e-06, + "loss": 0.76985407, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1361084, + "step": 6266, + "time_per_iteration": 2.5735552310943604 + }, + { + "auxiliary_loss_clip": 0.06456051, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06279004, + "balance_loss_mlp": 0.01258825, + "epoch": 0.3767924244701638, + "flos": 28154210302080.0, + "grad_norm": 2.361518747365002, + "language_loss": 0.63358176, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.7108832, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15270996, + "step": 6267, + "time_per_iteration": 2.6408746242523193 + }, + { + "auxiliary_loss_clip": 0.0637848, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.0630175, + "balance_loss_mlp": 0.01263043, + "epoch": 0.37685254772283183, + "flos": 60815460343680.0, + "grad_norm": 0.7019670976586264, + "language_loss": 0.58932841, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.66576976, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02612305, + "step": 6268, + "time_per_iteration": 3.3041250705718994 + }, + { + "auxiliary_loss_clip": 0.06448595, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277184, + "balance_loss_mlp": 0.01256756, + "epoch": 0.3769126709754998, + "flos": 26039939754240.0, + "grad_norm": 1.4401012750228117, + "language_loss": 0.65166855, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72888005, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15795898, + "step": 6269, + "time_per_iteration": 2.654707670211792 + }, + { + "auxiliary_loss_clip": 0.06454687, + "auxiliary_loss_mlp": 0.01276662, + "balance_loss_clip": 0.06286559, + "balance_loss_mlp": 0.01261296, + "epoch": 0.37697279422816776, + "flos": 23585508357120.0, + "grad_norm": 1.4576669810179597, + "language_loss": 0.71144199, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.78875554, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15362549, + "step": 6270, + "time_per_iteration": 2.5369231700897217 + }, + { + "auxiliary_loss_clip": 0.06374384, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01266305, + "epoch": 0.3770329174808357, + "flos": 64766242753920.0, + "grad_norm": 0.6950430831807741, + "language_loss": 0.56232381, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.63876635, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03561401, + "step": 6271, + "time_per_iteration": 3.1599924564361572 + }, + { + "auxiliary_loss_clip": 0.06448443, + "auxiliary_loss_mlp": 0.01272708, + "balance_loss_clip": 0.06279441, + "balance_loss_mlp": 0.0125696, + "epoch": 0.3770930407335037, + "flos": 21841768563840.0, + "grad_norm": 1.6801171250404496, + "language_loss": 0.80461442, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.88182592, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1574707, + "step": 6272, + "time_per_iteration": 2.524846076965332 + }, + { + "auxiliary_loss_clip": 0.06450769, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06283743, + "balance_loss_mlp": 0.01258329, + "epoch": 0.37715316398617166, + "flos": 22754594943360.0, + "grad_norm": 1.6672783573066894, + "language_loss": 0.74972034, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82696146, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15026855, + "step": 6273, + "time_per_iteration": 2.5571129322052 + }, + { + "auxiliary_loss_clip": 0.06449255, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257148, + "epoch": 0.3772132872388396, + "flos": 18920246181120.0, + "grad_norm": 1.32773283576084, + "language_loss": 0.72241038, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79962015, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14569092, + "step": 6274, + "time_per_iteration": 2.4966516494750977 + }, + { + "auxiliary_loss_clip": 0.06454083, + "auxiliary_loss_mlp": 0.01271444, + "balance_loss_clip": 0.06282286, + "balance_loss_mlp": 0.01257467, + "epoch": 0.3772734104915076, + "flos": 17351709275520.0, + "grad_norm": 1.8983068498635614, + "language_loss": 0.84638643, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.92364168, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13983154, + "step": 6275, + "time_per_iteration": 2.534308910369873 + }, + { + "auxiliary_loss_clip": 0.06448515, + "auxiliary_loss_mlp": 0.01272502, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01258865, + "epoch": 0.37733353374417555, + "flos": 32350452848640.0, + "grad_norm": 1.3669254528099, + "language_loss": 0.75387293, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.83108306, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13641357, + "step": 6276, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.06453335, + "auxiliary_loss_mlp": 0.0127286, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.0125803, + "epoch": 0.3773936569968435, + "flos": 23366225422080.0, + "grad_norm": 1.9054341571687776, + "language_loss": 0.86016738, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93742937, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1484375, + "step": 6277, + "time_per_iteration": 2.6153500080108643 + }, + { + "auxiliary_loss_clip": 0.06448077, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.0628462, + "balance_loss_mlp": 0.01257488, + "epoch": 0.3774537802495115, + "flos": 21472579474560.0, + "grad_norm": 1.5956300393708251, + "language_loss": 0.78636366, + "learning_rate": 2.861722244253818e-06, + "loss": 0.86356354, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14428711, + "step": 6278, + "time_per_iteration": 2.564234495162964 + }, + { + "auxiliary_loss_clip": 0.06459187, + "auxiliary_loss_mlp": 0.01270608, + "balance_loss_clip": 0.06284142, + "balance_loss_mlp": 0.01255075, + "epoch": 0.37751390350217945, + "flos": 24980812945920.0, + "grad_norm": 1.8067410295121689, + "language_loss": 0.8371948, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.91449273, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.15527344, + "step": 6279, + "time_per_iteration": 2.6134567260742188 + }, + { + "auxiliary_loss_clip": 0.06454675, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.06282948, + "balance_loss_mlp": 0.01257117, + "epoch": 0.3775740267548474, + "flos": 27826585637760.0, + "grad_norm": 1.84994794715845, + "language_loss": 0.74995327, + "learning_rate": 2.861019264262269e-06, + "loss": 0.82721412, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1428833, + "step": 6280, + "time_per_iteration": 2.6029937267303467 + }, + { + "auxiliary_loss_clip": 0.06448464, + "auxiliary_loss_mlp": 0.01272763, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01259156, + "epoch": 0.3776341500075154, + "flos": 22571845188480.0, + "grad_norm": 1.3018494364650444, + "language_loss": 0.76205039, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.83926266, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13592529, + "step": 6281, + "time_per_iteration": 2.524489641189575 + }, + { + "auxiliary_loss_clip": 0.06448536, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.0125718, + "epoch": 0.3776942732601834, + "flos": 23084148251520.0, + "grad_norm": 1.5306913056637732, + "language_loss": 0.84658033, + "learning_rate": 2.860316153670974e-06, + "loss": 0.92377913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14160156, + "step": 6282, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.06449918, + "auxiliary_loss_mlp": 0.01269426, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.0125555, + "epoch": 0.37775439651285136, + "flos": 21730617722880.0, + "grad_norm": 1.840636786741823, + "language_loss": 0.70143461, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.77862805, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13879395, + "step": 6283, + "time_per_iteration": 2.555816411972046 + }, + { + "auxiliary_loss_clip": 0.06452499, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 0.06285429, + "balance_loss_mlp": 0.01259957, + "epoch": 0.37781451976551933, + "flos": 23994542862720.0, + "grad_norm": 1.743481736886233, + "language_loss": 0.76856482, + "learning_rate": 2.859612912586581e-06, + "loss": 0.8458361, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14648438, + "step": 6284, + "time_per_iteration": 2.560770034790039 + }, + { + "auxiliary_loss_clip": 0.06464045, + "auxiliary_loss_mlp": 0.01271283, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01254725, + "epoch": 0.3778746430181873, + "flos": 13731821838720.0, + "grad_norm": 2.746966655353194, + "language_loss": 0.85536617, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.93271947, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.16564941, + "step": 6285, + "time_per_iteration": 2.5006392002105713 + }, + { + "auxiliary_loss_clip": 0.06451872, + "auxiliary_loss_mlp": 0.01271139, + "balance_loss_clip": 0.06279811, + "balance_loss_mlp": 0.01256065, + "epoch": 0.37793476627085526, + "flos": 19466021750400.0, + "grad_norm": 1.7632018529100697, + "language_loss": 0.84913701, + "learning_rate": 2.858909541115758e-06, + "loss": 0.9263671, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1506958, + "step": 6286, + "time_per_iteration": 2.566092014312744 + }, + { + "auxiliary_loss_clip": 0.06452557, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06281806, + "balance_loss_mlp": 0.01254182, + "epoch": 0.3779948895235232, + "flos": 10711600945920.0, + "grad_norm": 1.9010574176879877, + "language_loss": 0.823708, + "learning_rate": 2.858557806518775e-06, + "loss": 0.90092808, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15258789, + "step": 6287, + "time_per_iteration": 2.4892444610595703 + }, + { + "auxiliary_loss_clip": 0.06454234, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01258408, + "epoch": 0.3780550127761912, + "flos": 22316616051840.0, + "grad_norm": 2.1030531862013584, + "language_loss": 0.7330361, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15679932, + "step": 6288, + "time_per_iteration": 2.5415592193603516 + }, + { + "auxiliary_loss_clip": 0.06453485, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01254359, + "epoch": 0.37811513602885916, + "flos": 28958401463040.0, + "grad_norm": 1.6277535048544236, + "language_loss": 0.75782627, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83505249, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14770508, + "step": 6289, + "time_per_iteration": 2.5579047203063965 + }, + { + "auxiliary_loss_clip": 0.06454412, + "auxiliary_loss_mlp": 0.01273518, + "balance_loss_clip": 0.06284275, + "balance_loss_mlp": 0.01257925, + "epoch": 0.3781752592815271, + "flos": 23119717109760.0, + "grad_norm": 1.945372772068441, + "language_loss": 0.74155736, + "learning_rate": 2.857502407441593e-06, + "loss": 0.81883669, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15588379, + "step": 6290, + "time_per_iteration": 2.5697786808013916 + }, + { + "auxiliary_loss_clip": 0.06458094, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06281058, + "balance_loss_mlp": 0.0125653, + "epoch": 0.3782353825341951, + "flos": 19762102552320.0, + "grad_norm": 2.4066647483264596, + "language_loss": 0.80529308, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.88260764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16833496, + "step": 6291, + "time_per_iteration": 2.4970998764038086 + }, + { + "auxiliary_loss_clip": 0.06456125, + "auxiliary_loss_mlp": 0.01270776, + "balance_loss_clip": 0.06283687, + "balance_loss_mlp": 0.01254933, + "epoch": 0.37829550578686305, + "flos": 22056774940800.0, + "grad_norm": 1.7419894192909393, + "language_loss": 0.76369846, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.84096742, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1583252, + "step": 6292, + "time_per_iteration": 2.572916030883789 + }, + { + "auxiliary_loss_clip": 0.06452248, + "auxiliary_loss_mlp": 0.0127064, + "balance_loss_clip": 0.06281239, + "balance_loss_mlp": 0.01255631, + "epoch": 0.378355629039531, + "flos": 16475667638400.0, + "grad_norm": 1.682972265329385, + "language_loss": 0.70006013, + "learning_rate": 2.856446715715224e-06, + "loss": 0.77728903, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.15014648, + "step": 6293, + "time_per_iteration": 2.5161240100860596 + }, + { + "auxiliary_loss_clip": 0.06449296, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06281447, + "balance_loss_mlp": 0.01255934, + "epoch": 0.378415752292199, + "flos": 19981050071040.0, + "grad_norm": 1.9898859900525039, + "language_loss": 0.7173214, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.79452682, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.15332031, + "step": 6294, + "time_per_iteration": 3.9304022789001465 + }, + { + "auxiliary_loss_clip": 0.06465693, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 0.06285857, + "balance_loss_mlp": 0.01264068, + "epoch": 0.378475875544867, + "flos": 14652614355840.0, + "grad_norm": 2.57033704665896, + "language_loss": 0.83215445, + "learning_rate": 2.855742758826011e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.15655518, + "step": 6295, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.06459963, + "auxiliary_loss_mlp": 0.01268811, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01253255, + "epoch": 0.37853599879753497, + "flos": 26658194705280.0, + "grad_norm": 1.6154959379599871, + "language_loss": 0.71442378, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.79171151, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15563965, + "step": 6296, + "time_per_iteration": 4.0578773021698 + }, + { + "auxiliary_loss_clip": 0.06454356, + "auxiliary_loss_mlp": 0.01274534, + "balance_loss_clip": 0.06287888, + "balance_loss_mlp": 0.01260455, + "epoch": 0.37859612205020293, + "flos": 17317817498880.0, + "grad_norm": 1.7695984237012152, + "language_loss": 0.77514613, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85243499, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14074707, + "step": 6297, + "time_per_iteration": 2.54968523979187 + }, + { + "auxiliary_loss_clip": 0.06462398, + "auxiliary_loss_mlp": 0.01275228, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01259481, + "epoch": 0.3786562453028709, + "flos": 18225780341760.0, + "grad_norm": 1.977165612519376, + "language_loss": 0.80132794, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87870419, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1574707, + "step": 6298, + "time_per_iteration": 2.5013349056243896 + }, + { + "auxiliary_loss_clip": 0.06454945, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06285203, + "balance_loss_mlp": 0.01255711, + "epoch": 0.37871636855553886, + "flos": 21221207625600.0, + "grad_norm": 1.480969598733767, + "language_loss": 0.8501091, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92736673, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15087891, + "step": 6299, + "time_per_iteration": 2.5749709606170654 + }, + { + "auxiliary_loss_clip": 0.06460874, + "auxiliary_loss_mlp": 0.01272586, + "balance_loss_clip": 0.06288288, + "balance_loss_mlp": 0.01256844, + "epoch": 0.3787764918082068, + "flos": 20957886570240.0, + "grad_norm": 2.4357425027716895, + "language_loss": 0.77022231, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.84755683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15740967, + "step": 6300, + "time_per_iteration": 2.521772623062134 + }, + { + "auxiliary_loss_clip": 0.06472084, + "auxiliary_loss_mlp": 0.01275415, + "balance_loss_clip": 0.06293886, + "balance_loss_mlp": 0.01258177, + "epoch": 0.3788366150608748, + "flos": 17313205524480.0, + "grad_norm": 1.8143586204861406, + "language_loss": 0.83141446, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.90888953, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 3.982780933380127 + }, + { + "auxiliary_loss_clip": 0.0646001, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06287184, + "balance_loss_mlp": 0.0125428, + "epoch": 0.37889673831354276, + "flos": 24317094355200.0, + "grad_norm": 1.8203378599779103, + "language_loss": 0.68096328, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.75826812, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.16186523, + "step": 6302, + "time_per_iteration": 2.5983002185821533 + }, + { + "auxiliary_loss_clip": 0.06455475, + "auxiliary_loss_mlp": 0.01270441, + "balance_loss_clip": 0.06284864, + "balance_loss_mlp": 0.01255718, + "epoch": 0.3789568615662107, + "flos": 26690157838080.0, + "grad_norm": 2.521279180058548, + "language_loss": 0.68357861, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76083779, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1472168, + "step": 6303, + "time_per_iteration": 2.5610175132751465 + }, + { + "auxiliary_loss_clip": 0.06458124, + "auxiliary_loss_mlp": 0.01272095, + "balance_loss_clip": 0.06285581, + "balance_loss_mlp": 0.01257265, + "epoch": 0.3790169848188787, + "flos": 23591713559040.0, + "grad_norm": 1.604251878296904, + "language_loss": 0.78095663, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.85825884, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14807129, + "step": 6304, + "time_per_iteration": 3.994072437286377 + }, + { + "auxiliary_loss_clip": 0.06468576, + "auxiliary_loss_mlp": 0.01269708, + "balance_loss_clip": 0.06292479, + "balance_loss_mlp": 0.01253806, + "epoch": 0.37907710807154665, + "flos": 18442547654400.0, + "grad_norm": 1.8924180649319282, + "language_loss": 0.80524492, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.88262779, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15881348, + "step": 6305, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.06370047, + "auxiliary_loss_mlp": 0.01262008, + "balance_loss_clip": 0.06291789, + "balance_loss_mlp": 0.01258527, + "epoch": 0.3791372313242146, + "flos": 50123690887680.0, + "grad_norm": 0.9538902579511545, + "language_loss": 0.64400995, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.72033048, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03491211, + "step": 6306, + "time_per_iteration": 3.106515645980835 + }, + { + "auxiliary_loss_clip": 0.06464424, + "auxiliary_loss_mlp": 0.01273174, + "balance_loss_clip": 0.06292081, + "balance_loss_mlp": 0.01257683, + "epoch": 0.3791973545768826, + "flos": 24323467265280.0, + "grad_norm": 1.5167178412192643, + "language_loss": 0.73534656, + "learning_rate": 2.851516295441817e-06, + "loss": 0.8127225, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15484619, + "step": 6307, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.06462627, + "auxiliary_loss_mlp": 0.01270499, + "balance_loss_clip": 0.06287986, + "balance_loss_mlp": 0.0125505, + "epoch": 0.3792574778295506, + "flos": 21586329792000.0, + "grad_norm": 1.8539993286062635, + "language_loss": 0.78603798, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86336923, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15441895, + "step": 6308, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.06459265, + "auxiliary_loss_mlp": 0.01272841, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01257028, + "epoch": 0.37931760108221857, + "flos": 22279202403840.0, + "grad_norm": 4.0253147283534, + "language_loss": 0.73503512, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.81235617, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.15814209, + "step": 6309, + "time_per_iteration": 2.539158344268799 + }, + { + "auxiliary_loss_clip": 0.06457806, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.0125963, + "epoch": 0.37937772433488653, + "flos": 19689161973120.0, + "grad_norm": 1.3654110952225158, + "language_loss": 0.79184294, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.86916614, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14886475, + "step": 6310, + "time_per_iteration": 2.4997847080230713 + }, + { + "auxiliary_loss_clip": 0.06457442, + "auxiliary_loss_mlp": 0.01268809, + "balance_loss_clip": 0.06285986, + "balance_loss_mlp": 0.01253586, + "epoch": 0.3794378475875545, + "flos": 19105469631360.0, + "grad_norm": 1.8573579951480166, + "language_loss": 0.76741791, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.84468043, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15222168, + "step": 6311, + "time_per_iteration": 2.5216546058654785 + }, + { + "auxiliary_loss_clip": 0.06457929, + "auxiliary_loss_mlp": 0.01276784, + "balance_loss_clip": 0.06287444, + "balance_loss_mlp": 0.01261746, + "epoch": 0.37949797084022246, + "flos": 20345920675200.0, + "grad_norm": 1.4012846072012495, + "language_loss": 0.71063423, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78798139, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15032959, + "step": 6312, + "time_per_iteration": 2.4909064769744873 + }, + { + "auxiliary_loss_clip": 0.06361144, + "auxiliary_loss_mlp": 0.01254908, + "balance_loss_clip": 0.06283364, + "balance_loss_mlp": 0.01251185, + "epoch": 0.37955809409289043, + "flos": 63991121760000.0, + "grad_norm": 0.7457914665340521, + "language_loss": 0.55941355, + "learning_rate": 2.849401318669608e-06, + "loss": 0.63557404, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03713989, + "step": 6313, + "time_per_iteration": 3.1312170028686523 + }, + { + "auxiliary_loss_clip": 0.06457204, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06285529, + "balance_loss_mlp": 0.01258211, + "epoch": 0.3796182173455584, + "flos": 31548777310080.0, + "grad_norm": 1.7202421351204062, + "language_loss": 0.71222353, + "learning_rate": 2.849048709730083e-06, + "loss": 0.78952008, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14233398, + "step": 6314, + "time_per_iteration": 2.5876691341400146 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.01270992, + "balance_loss_clip": 0.06290812, + "balance_loss_mlp": 0.01254922, + "epoch": 0.37967834059822636, + "flos": 12135766066560.0, + "grad_norm": 2.8019471516683985, + "language_loss": 0.74203241, + "learning_rate": 2.848696068594545e-06, + "loss": 0.81939626, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.16064453, + "step": 6315, + "time_per_iteration": 2.5312654972076416 + }, + { + "auxiliary_loss_clip": 0.06455735, + "auxiliary_loss_mlp": 0.01269414, + "balance_loss_clip": 0.0628659, + "balance_loss_mlp": 0.01253512, + "epoch": 0.3797384638508943, + "flos": 39357989331840.0, + "grad_norm": 5.544256779510487, + "language_loss": 0.7095021, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.78675354, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15905762, + "step": 6316, + "time_per_iteration": 2.642946481704712 + }, + { + "auxiliary_loss_clip": 0.06458603, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06288237, + "balance_loss_mlp": 0.01255991, + "epoch": 0.3797985871035623, + "flos": 34061852165760.0, + "grad_norm": 2.4477129072331656, + "language_loss": 0.65612113, + "learning_rate": 2.847990689788923e-06, + "loss": 0.7334165, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1496582, + "step": 6317, + "time_per_iteration": 2.634066104888916 + }, + { + "auxiliary_loss_clip": 0.0645286, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06285463, + "balance_loss_mlp": 0.0125702, + "epoch": 0.37985871035623026, + "flos": 23228939306880.0, + "grad_norm": 1.9893651635894969, + "language_loss": 0.86348939, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.94072783, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13964844, + "step": 6318, + "time_per_iteration": 2.50665545463562 + }, + { + "auxiliary_loss_clip": 0.06460046, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.06287004, + "balance_loss_mlp": 0.01257675, + "epoch": 0.3799188336088982, + "flos": 18121002410880.0, + "grad_norm": 2.356531700065532, + "language_loss": 0.76647675, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.84380764, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6319, + "time_per_iteration": 2.50382137298584 + }, + { + "auxiliary_loss_clip": 0.06453398, + "auxiliary_loss_mlp": 0.01272745, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01258082, + "epoch": 0.3799789568615662, + "flos": 21878385598080.0, + "grad_norm": 6.804259628026359, + "language_loss": 0.6451484, + "learning_rate": 2.846932380444744e-06, + "loss": 0.72240984, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14660645, + "step": 6320, + "time_per_iteration": 2.516150712966919 + }, + { + "auxiliary_loss_clip": 0.06456275, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06285265, + "balance_loss_mlp": 0.01252846, + "epoch": 0.3800390801142342, + "flos": 32971181495040.0, + "grad_norm": 1.7343317020382172, + "language_loss": 0.71855223, + "learning_rate": 2.846579546413992e-06, + "loss": 0.79579961, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.15612793, + "step": 6321, + "time_per_iteration": 2.6204988956451416 + }, + { + "auxiliary_loss_clip": 0.06458073, + "auxiliary_loss_mlp": 0.01268703, + "balance_loss_clip": 0.06285845, + "balance_loss_mlp": 0.01253784, + "epoch": 0.38009920336690217, + "flos": 26914430090880.0, + "grad_norm": 1.8398392312515923, + "language_loss": 0.75578612, + "learning_rate": 2.846226680280859e-06, + "loss": 0.83305389, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14923096, + "step": 6322, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01271033, + "balance_loss_clip": 0.06285781, + "balance_loss_mlp": 0.01256823, + "epoch": 0.38015932661957014, + "flos": 22494963467520.0, + "grad_norm": 1.8201003599281902, + "language_loss": 0.85709381, + "learning_rate": 2.845873782058725e-06, + "loss": 0.93435031, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14215088, + "step": 6323, + "time_per_iteration": 2.4927124977111816 + }, + { + "auxiliary_loss_clip": 0.06458908, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06286593, + "balance_loss_mlp": 0.01254596, + "epoch": 0.3802194498722381, + "flos": 21987440087040.0, + "grad_norm": 2.2452863694907426, + "language_loss": 0.73932886, + "learning_rate": 2.845520851760973e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.16027832, + "step": 6324, + "time_per_iteration": 2.4913861751556396 + }, + { + "auxiliary_loss_clip": 0.06464465, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06288414, + "balance_loss_mlp": 0.01257724, + "epoch": 0.38027957312490607, + "flos": 21331310290560.0, + "grad_norm": 1.7884051563809298, + "language_loss": 0.84122628, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.91860014, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15203857, + "step": 6325, + "time_per_iteration": 2.6119046211242676 + }, + { + "auxiliary_loss_clip": 0.06455745, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06285073, + "balance_loss_mlp": 0.01252712, + "epoch": 0.38033969637757403, + "flos": 16696921144320.0, + "grad_norm": 2.2200302984742915, + "language_loss": 0.79868543, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.87591028, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14019775, + "step": 6326, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06286497, + "balance_loss_mlp": 0.01255242, + "epoch": 0.380399819630242, + "flos": 36219741563520.0, + "grad_norm": 3.3742704435112025, + "language_loss": 0.73389304, + "learning_rate": 2.844461868547842e-06, + "loss": 0.81115204, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14978027, + "step": 6327, + "time_per_iteration": 2.649383783340454 + }, + { + "auxiliary_loss_clip": 0.06459647, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06290785, + "balance_loss_mlp": 0.01255145, + "epoch": 0.38045994288290996, + "flos": 21295364088960.0, + "grad_norm": 1.4936601975654378, + "language_loss": 0.83229524, + "learning_rate": 2.844108810081459e-06, + "loss": 0.90958202, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13867188, + "step": 6328, + "time_per_iteration": 2.527261972427368 + }, + { + "auxiliary_loss_clip": 0.06452741, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06281206, + "balance_loss_mlp": 0.01253755, + "epoch": 0.38052006613557793, + "flos": 20929151819520.0, + "grad_norm": 1.5056942690240434, + "language_loss": 0.61757982, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69479483, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.15008545, + "step": 6329, + "time_per_iteration": 2.54025936126709 + }, + { + "auxiliary_loss_clip": 0.0645529, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06283917, + "balance_loss_mlp": 0.01254037, + "epoch": 0.3805801893882459, + "flos": 20996138759040.0, + "grad_norm": 2.0488191193117316, + "language_loss": 0.56127822, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.63851297, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14160156, + "step": 6330, + "time_per_iteration": 2.4913628101348877 + }, + { + "auxiliary_loss_clip": 0.06449446, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06282543, + "balance_loss_mlp": 0.01255781, + "epoch": 0.38064031264091386, + "flos": 25565972734080.0, + "grad_norm": 1.4483276491856993, + "language_loss": 0.65912807, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73631942, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13903809, + "step": 6331, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.0645493, + "auxiliary_loss_mlp": 0.01269934, + "balance_loss_clip": 0.06284193, + "balance_loss_mlp": 0.01254312, + "epoch": 0.3807004358935818, + "flos": 15091264080000.0, + "grad_norm": 1.528944840420101, + "language_loss": 0.7597304, + "learning_rate": 2.842696256262919e-06, + "loss": 0.83697909, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15618896, + "step": 6332, + "time_per_iteration": 2.4808928966522217 + }, + { + "auxiliary_loss_clip": 0.06456427, + "auxiliary_loss_mlp": 0.01273089, + "balance_loss_clip": 0.06283183, + "balance_loss_mlp": 0.01257943, + "epoch": 0.3807605591462498, + "flos": 16405033046400.0, + "grad_norm": 2.2042220893600226, + "language_loss": 0.82397389, + "learning_rate": 2.842343037886987e-06, + "loss": 0.90126908, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15142822, + "step": 6333, + "time_per_iteration": 2.5033013820648193 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254655, + "epoch": 0.3808206823989178, + "flos": 29064353351040.0, + "grad_norm": 1.4831969327294916, + "language_loss": 0.86723578, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.9444741, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.14538574, + "step": 6334, + "time_per_iteration": 4.024240493774414 + }, + { + "auxiliary_loss_clip": 0.06455058, + "auxiliary_loss_mlp": 0.01270467, + "balance_loss_clip": 0.06282362, + "balance_loss_mlp": 0.01255155, + "epoch": 0.3808808056515858, + "flos": 15711321893760.0, + "grad_norm": 2.3448311359770795, + "language_loss": 0.79450226, + "learning_rate": 2.841636505323321e-06, + "loss": 0.87175757, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15301514, + "step": 6335, + "time_per_iteration": 2.4698357582092285 + }, + { + "auxiliary_loss_clip": 0.06453745, + "auxiliary_loss_mlp": 0.0127096, + "balance_loss_clip": 0.06281872, + "balance_loss_mlp": 0.0125517, + "epoch": 0.38094092890425374, + "flos": 20710917060480.0, + "grad_norm": 1.9128487431319638, + "language_loss": 0.72795898, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15802002, + "step": 6336, + "time_per_iteration": 3.9780919551849365 + }, + { + "auxiliary_loss_clip": 0.06449959, + "auxiliary_loss_mlp": 0.01267203, + "balance_loss_clip": 0.06281384, + "balance_loss_mlp": 0.01252826, + "epoch": 0.3810010521569217, + "flos": 20674258099200.0, + "grad_norm": 2.2277206975915362, + "language_loss": 0.69756234, + "learning_rate": 2.840929845099894e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14373779, + "step": 6337, + "time_per_iteration": 2.5475378036499023 + }, + { + "auxiliary_loss_clip": 0.06454941, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06282912, + "balance_loss_mlp": 0.012579, + "epoch": 0.38106117540958967, + "flos": 31834963330560.0, + "grad_norm": 1.987280020069696, + "language_loss": 0.64026022, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71754032, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1517334, + "step": 6338, + "time_per_iteration": 2.5795555114746094 + }, + { + "auxiliary_loss_clip": 0.06456137, + "auxiliary_loss_mlp": 0.01271603, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01255772, + "epoch": 0.38112129866225763, + "flos": 16907231692800.0, + "grad_norm": 1.6550535893348008, + "language_loss": 0.69685936, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.77413678, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15856934, + "step": 6339, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06281533, + "balance_loss_mlp": 0.01253913, + "epoch": 0.3811814219149256, + "flos": 20893624888320.0, + "grad_norm": 2.252585455539085, + "language_loss": 0.68345773, + "learning_rate": 2.839869615637177e-06, + "loss": 0.76065207, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13519287, + "step": 6340, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.06456652, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06282599, + "balance_loss_mlp": 0.01260083, + "epoch": 0.38124154516759357, + "flos": 16696418019840.0, + "grad_norm": 2.4997436549257754, + "language_loss": 0.89721388, + "learning_rate": 2.839516142102522e-06, + "loss": 0.97453463, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15332031, + "step": 6341, + "time_per_iteration": 4.08266806602478 + }, + { + "auxiliary_loss_clip": 0.06461132, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06284279, + "balance_loss_mlp": 0.01255427, + "epoch": 0.38130166842026153, + "flos": 19687946088960.0, + "grad_norm": 1.4891162994718032, + "language_loss": 0.75298452, + "learning_rate": 2.83916263673333e-06, + "loss": 0.83032143, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.17138672, + "step": 6342, + "time_per_iteration": 2.496697425842285 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.0125646, + "epoch": 0.3813617916729295, + "flos": 22204668597120.0, + "grad_norm": 1.7145643847071266, + "language_loss": 0.83785719, + "learning_rate": 2.838809099543007e-06, + "loss": 0.91510159, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14599609, + "step": 6343, + "time_per_iteration": 4.049302339553833 + }, + { + "auxiliary_loss_clip": 0.0645491, + "auxiliary_loss_mlp": 0.01269585, + "balance_loss_clip": 0.06281073, + "balance_loss_mlp": 0.01254905, + "epoch": 0.38142191492559746, + "flos": 19102576665600.0, + "grad_norm": 1.619462393744454, + "language_loss": 0.77529186, + "learning_rate": 2.838455530544959e-06, + "loss": 0.8525368, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14678955, + "step": 6344, + "time_per_iteration": 2.579394817352295 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01271203, + "balance_loss_clip": 0.06285504, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3814820381782654, + "flos": 24104645527680.0, + "grad_norm": 1.8871239884396722, + "language_loss": 0.74166036, + "learning_rate": 2.838101929752593e-06, + "loss": 0.81893921, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14587402, + "step": 6345, + "time_per_iteration": 2.5367093086242676 + }, + { + "auxiliary_loss_clip": 0.06457509, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01257765, + "epoch": 0.3815421614309334, + "flos": 15783927056640.0, + "grad_norm": 1.7118462514914357, + "language_loss": 0.69868183, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7759757, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14111328, + "step": 6346, + "time_per_iteration": 2.5815930366516113 + }, + { + "auxiliary_loss_clip": 0.06466204, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06290555, + "balance_loss_mlp": 0.01257236, + "epoch": 0.38160228468360136, + "flos": 19905593869440.0, + "grad_norm": 1.781545419456976, + "language_loss": 0.7611326, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.83852088, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15374756, + "step": 6347, + "time_per_iteration": 2.5027284622192383 + }, + { + "auxiliary_loss_clip": 0.06456521, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.012553, + "epoch": 0.3816624079362694, + "flos": 19287045429120.0, + "grad_norm": 1.488288802844173, + "language_loss": 0.75192666, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13702393, + "step": 6348, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01270391, + "balance_loss_clip": 0.06286097, + "balance_loss_mlp": 0.01256599, + "epoch": 0.38172253118893734, + "flos": 21183752050560.0, + "grad_norm": 1.729316797973715, + "language_loss": 0.88237411, + "learning_rate": 2.836687208908142e-06, + "loss": 0.95967764, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6349, + "time_per_iteration": 2.525542974472046 + }, + { + "auxiliary_loss_clip": 0.06453095, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06281723, + "balance_loss_mlp": 0.01255149, + "epoch": 0.3817826544416053, + "flos": 17534836373760.0, + "grad_norm": 1.7576595366031973, + "language_loss": 0.76939785, + "learning_rate": 2.836333449345341e-06, + "loss": 0.84662628, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14593506, + "step": 6350, + "time_per_iteration": 2.532376289367676 + }, + { + "auxiliary_loss_clip": 0.06458531, + "auxiliary_loss_mlp": 0.01273484, + "balance_loss_clip": 0.06286063, + "balance_loss_mlp": 0.01258231, + "epoch": 0.38184277769427327, + "flos": 16332176321280.0, + "grad_norm": 2.21296257119241, + "language_loss": 0.77054518, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.84786528, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15264893, + "step": 6351, + "time_per_iteration": 2.4930031299591064 + }, + { + "auxiliary_loss_clip": 0.06457832, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.0628476, + "balance_loss_mlp": 0.012577, + "epoch": 0.38190290094694124, + "flos": 30450937115520.0, + "grad_norm": 2.2550067272061254, + "language_loss": 0.74895489, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.82626581, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15563965, + "step": 6352, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.01270341, + "balance_loss_clip": 0.06283389, + "balance_loss_mlp": 0.0125659, + "epoch": 0.3819630241996092, + "flos": 14215138588800.0, + "grad_norm": 2.0554991668998777, + "language_loss": 0.63961715, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.71684647, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6353, + "time_per_iteration": 2.476759433746338 + }, + { + "auxiliary_loss_clip": 0.06456264, + "auxiliary_loss_mlp": 0.01279815, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01266112, + "epoch": 0.38202314745227717, + "flos": 25016717220480.0, + "grad_norm": 1.720129608989886, + "language_loss": 0.83556378, + "learning_rate": 2.834918094089816e-06, + "loss": 0.91292459, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13702393, + "step": 6354, + "time_per_iteration": 2.5726418495178223 + }, + { + "auxiliary_loss_clip": 0.06456912, + "auxiliary_loss_mlp": 0.01271961, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125911, + "epoch": 0.38208327070494513, + "flos": 20820935871360.0, + "grad_norm": 1.6482101436629937, + "language_loss": 0.81480742, + "learning_rate": 2.834564176091943e-06, + "loss": 0.89209616, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.12854004, + "step": 6355, + "time_per_iteration": 2.5225114822387695 + }, + { + "auxiliary_loss_clip": 0.06459523, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06289364, + "balance_loss_mlp": 0.01259179, + "epoch": 0.3821433939576131, + "flos": 22644282643200.0, + "grad_norm": 1.8808367718392982, + "language_loss": 0.75647783, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.83380532, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 6356, + "time_per_iteration": 2.5584537982940674 + }, + { + "auxiliary_loss_clip": 0.0646046, + "auxiliary_loss_mlp": 0.01272045, + "balance_loss_clip": 0.06287301, + "balance_loss_mlp": 0.01257645, + "epoch": 0.38220351721028106, + "flos": 26877100296960.0, + "grad_norm": 1.8976132208861074, + "language_loss": 0.82161039, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89893544, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14398193, + "step": 6357, + "time_per_iteration": 2.546190023422241 + }, + { + "auxiliary_loss_clip": 0.06463508, + "auxiliary_loss_mlp": 0.01275628, + "balance_loss_clip": 0.0629019, + "balance_loss_mlp": 0.01260035, + "epoch": 0.38226364046294903, + "flos": 23374149632640.0, + "grad_norm": 1.7334885634957151, + "language_loss": 0.78531659, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.86270791, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15612793, + "step": 6358, + "time_per_iteration": 2.5330071449279785 + }, + { + "auxiliary_loss_clip": 0.06462916, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01256086, + "epoch": 0.382323763715617, + "flos": 19652335303680.0, + "grad_norm": 1.9007754709735623, + "language_loss": 0.79191673, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.86925954, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15270996, + "step": 6359, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.06457044, + "auxiliary_loss_mlp": 0.01275796, + "balance_loss_clip": 0.06287733, + "balance_loss_mlp": 0.01261884, + "epoch": 0.38238388696828496, + "flos": 54136527575040.0, + "grad_norm": 1.6591220194179586, + "language_loss": 0.70001733, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.77734572, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13903809, + "step": 6360, + "time_per_iteration": 2.8067054748535156 + }, + { + "auxiliary_loss_clip": 0.06461466, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01262923, + "epoch": 0.382444010220953, + "flos": 24943105808640.0, + "grad_norm": 1.5737902616354833, + "language_loss": 0.79093289, + "learning_rate": 2.83244000399261e-06, + "loss": 0.86832535, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14849854, + "step": 6361, + "time_per_iteration": 2.558579683303833 + }, + { + "auxiliary_loss_clip": 0.0645285, + "auxiliary_loss_mlp": 0.01272146, + "balance_loss_clip": 0.06286099, + "balance_loss_mlp": 0.01257996, + "epoch": 0.38250413347362094, + "flos": 42346750216320.0, + "grad_norm": 1.4645255919949542, + "language_loss": 0.65580732, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73305726, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14154053, + "step": 6362, + "time_per_iteration": 2.709390878677368 + }, + { + "auxiliary_loss_clip": 0.06459438, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06287294, + "balance_loss_mlp": 0.01255415, + "epoch": 0.3825642567262889, + "flos": 16294720746240.0, + "grad_norm": 1.6166481183320216, + "language_loss": 0.8211807, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.89848268, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15332031, + "step": 6363, + "time_per_iteration": 2.468846559524536 + }, + { + "auxiliary_loss_clip": 0.06453779, + "auxiliary_loss_mlp": 0.01274743, + "balance_loss_clip": 0.06286556, + "balance_loss_mlp": 0.01259401, + "epoch": 0.3826243799789569, + "flos": 45664267795200.0, + "grad_norm": 1.6258867054195516, + "language_loss": 0.59107661, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.6683619, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15356445, + "step": 6364, + "time_per_iteration": 2.745589256286621 + }, + { + "auxiliary_loss_clip": 0.06465845, + "auxiliary_loss_mlp": 0.0127531, + "balance_loss_clip": 0.06290866, + "balance_loss_mlp": 0.01261058, + "epoch": 0.38268450323162484, + "flos": 25308647245440.0, + "grad_norm": 2.2940920681906873, + "language_loss": 0.6951021, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.77251363, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14257812, + "step": 6365, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.06461614, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06285347, + "balance_loss_mlp": 0.0125451, + "epoch": 0.3827446264842928, + "flos": 21842607104640.0, + "grad_norm": 2.2040506714686208, + "language_loss": 0.73211187, + "learning_rate": 2.830668992382758e-06, + "loss": 0.8094269, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15374756, + "step": 6366, + "time_per_iteration": 2.527252435684204 + }, + { + "auxiliary_loss_clip": 0.06455328, + "auxiliary_loss_mlp": 0.01270912, + "balance_loss_clip": 0.06284537, + "balance_loss_mlp": 0.0125703, + "epoch": 0.38280474973696077, + "flos": 25740924059520.0, + "grad_norm": 2.537372436592335, + "language_loss": 0.69208872, + "learning_rate": 2.830314695509902e-06, + "loss": 0.76935112, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13885498, + "step": 6367, + "time_per_iteration": 2.563174247741699 + }, + { + "auxiliary_loss_clip": 0.06445135, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06281811, + "balance_loss_mlp": 0.01253482, + "epoch": 0.38286487298962874, + "flos": 24902212216320.0, + "grad_norm": 2.529219827632029, + "language_loss": 0.64519894, + "learning_rate": 2.82996036715143e-06, + "loss": 0.72232389, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13897705, + "step": 6368, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.0644632, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01255111, + "epoch": 0.3829249962422967, + "flos": 28550457060480.0, + "grad_norm": 1.3073196657605344, + "language_loss": 0.68441451, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76156569, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13677979, + "step": 6369, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.06452611, + "auxiliary_loss_mlp": 0.01268713, + "balance_loss_clip": 0.0628352, + "balance_loss_mlp": 0.01254724, + "epoch": 0.38298511949496467, + "flos": 21477736500480.0, + "grad_norm": 1.6896603918496267, + "language_loss": 0.79100078, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86821401, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13995361, + "step": 6370, + "time_per_iteration": 2.5265746116638184 + }, + { + "auxiliary_loss_clip": 0.06451623, + "auxiliary_loss_mlp": 0.0127085, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01256265, + "epoch": 0.38304524274763263, + "flos": 31687027747200.0, + "grad_norm": 2.908092380852583, + "language_loss": 0.651667, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.72889173, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.06459577, + "auxiliary_loss_mlp": 0.01272301, + "balance_loss_clip": 0.06283382, + "balance_loss_mlp": 0.01257543, + "epoch": 0.3831053660003006, + "flos": 25082865619200.0, + "grad_norm": 2.362243450203488, + "language_loss": 0.73142469, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.80874348, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14746094, + "step": 6372, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01266707, + "balance_loss_clip": 0.06282556, + "balance_loss_mlp": 0.01252485, + "epoch": 0.38316548925296856, + "flos": 23265849830400.0, + "grad_norm": 1.5439174716844835, + "language_loss": 0.85255867, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92977273, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14221191, + "step": 6373, + "time_per_iteration": 4.056765794754028 + }, + { + "auxiliary_loss_clip": 0.0645606, + "auxiliary_loss_mlp": 0.01272183, + "balance_loss_clip": 0.06281903, + "balance_loss_mlp": 0.01257431, + "epoch": 0.3832256125056366, + "flos": 34432131358080.0, + "grad_norm": 8.29118461423438, + "language_loss": 0.75127506, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.82855743, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14758301, + "step": 6374, + "time_per_iteration": 2.739825963973999 + }, + { + "auxiliary_loss_clip": 0.06457414, + "auxiliary_loss_mlp": 0.01272454, + "balance_loss_clip": 0.0628335, + "balance_loss_mlp": 0.01258042, + "epoch": 0.38328573575830455, + "flos": 21769289182080.0, + "grad_norm": 1.9434329018980874, + "language_loss": 0.76033717, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.83763582, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14416504, + "step": 6375, + "time_per_iteration": 2.521092176437378 + }, + { + "auxiliary_loss_clip": 0.06457017, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06283681, + "balance_loss_mlp": 0.01252541, + "epoch": 0.3833458590109725, + "flos": 17385056000640.0, + "grad_norm": 2.081333613596134, + "language_loss": 0.73067588, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.80791855, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1472168, + "step": 6376, + "time_per_iteration": 3.913828134536743 + }, + { + "auxiliary_loss_clip": 0.06451094, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06282462, + "balance_loss_mlp": 0.01251294, + "epoch": 0.3834059822636405, + "flos": 29432326556160.0, + "grad_norm": 1.6469866452188906, + "language_loss": 0.68444526, + "learning_rate": 2.826769997289796e-06, + "loss": 0.76161826, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14916992, + "step": 6377, + "time_per_iteration": 2.552703857421875 + }, + { + "auxiliary_loss_clip": 0.0646103, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01253413, + "epoch": 0.38346610551630844, + "flos": 21477191448960.0, + "grad_norm": 1.937210921117629, + "language_loss": 0.73608565, + "learning_rate": 2.826415354814344e-06, + "loss": 0.8133859, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15582275, + "step": 6378, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.06455162, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283469, + "balance_loss_mlp": 0.01257661, + "epoch": 0.3835262287689764, + "flos": 27568253900160.0, + "grad_norm": 1.6187724503548255, + "language_loss": 0.69142127, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76869053, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14099121, + "step": 6379, + "time_per_iteration": 2.540184736251831 + }, + { + "auxiliary_loss_clip": 0.06449591, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01258209, + "epoch": 0.3835863520216444, + "flos": 15529201044480.0, + "grad_norm": 1.7677581121541173, + "language_loss": 0.8420229, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.91923743, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13659668, + "step": 6380, + "time_per_iteration": 3.9425628185272217 + }, + { + "auxiliary_loss_clip": 0.06454644, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06286694, + "balance_loss_mlp": 0.01255786, + "epoch": 0.38364647527431234, + "flos": 21910851855360.0, + "grad_norm": 1.4264464063638025, + "language_loss": 0.81255281, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.88980293, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14569092, + "step": 6381, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.06363897, + "auxiliary_loss_mlp": 0.0126892, + "balance_loss_clip": 0.06286111, + "balance_loss_mlp": 0.01265082, + "epoch": 0.3837065985269803, + "flos": 65553076120320.0, + "grad_norm": 0.8198763586735168, + "language_loss": 0.60085058, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.67717874, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 6382, + "time_per_iteration": 3.1118690967559814 + }, + { + "auxiliary_loss_clip": 0.06458844, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06285119, + "balance_loss_mlp": 0.01257375, + "epoch": 0.38376672177964827, + "flos": 28264103331840.0, + "grad_norm": 2.361672223919581, + "language_loss": 0.67004663, + "learning_rate": 2.824641672639794e-06, + "loss": 0.74736154, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15264893, + "step": 6383, + "time_per_iteration": 3.949587345123291 + }, + { + "auxiliary_loss_clip": 0.06458098, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285569, + "balance_loss_mlp": 0.01255919, + "epoch": 0.38382684503231623, + "flos": 20637641064960.0, + "grad_norm": 1.580160930907899, + "language_loss": 0.75169957, + "learning_rate": 2.824286842339587e-06, + "loss": 0.82898319, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14355469, + "step": 6384, + "time_per_iteration": 2.5578341484069824 + }, + { + "auxiliary_loss_clip": 0.0645394, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06286485, + "balance_loss_mlp": 0.01259819, + "epoch": 0.3838869682849842, + "flos": 19611274003200.0, + "grad_norm": 1.4416039952500834, + "language_loss": 0.76348937, + "learning_rate": 2.823931980782341e-06, + "loss": 0.84075809, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6385, + "time_per_iteration": 2.5225770473480225 + }, + { + "auxiliary_loss_clip": 0.06357871, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06280675, + "balance_loss_mlp": 0.01261296, + "epoch": 0.38394709153765216, + "flos": 56572202856960.0, + "grad_norm": 1.1093406194632214, + "language_loss": 0.67841589, + "learning_rate": 2.82357708798151e-06, + "loss": 0.75464916, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.04168701, + "step": 6386, + "time_per_iteration": 3.0481390953063965 + }, + { + "auxiliary_loss_clip": 0.06453113, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06286535, + "balance_loss_mlp": 0.01254777, + "epoch": 0.3840072147903202, + "flos": 15894323210880.0, + "grad_norm": 1.5665063027995272, + "language_loss": 0.72740716, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.80462623, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6387, + "time_per_iteration": 2.514692783355713 + }, + { + "auxiliary_loss_clip": 0.06447147, + "auxiliary_loss_mlp": 0.01275854, + "balance_loss_clip": 0.06283197, + "balance_loss_mlp": 0.0126187, + "epoch": 0.38406733804298815, + "flos": 28225180310400.0, + "grad_norm": 2.2869557055676095, + "language_loss": 0.81707162, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89430165, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13989258, + "step": 6388, + "time_per_iteration": 2.6592257022857666 + }, + { + "auxiliary_loss_clip": 0.06454118, + "auxiliary_loss_mlp": 0.01267752, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01253888, + "epoch": 0.3841274612956561, + "flos": 18229511848320.0, + "grad_norm": 1.6912658906890043, + "language_loss": 0.76762819, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84484684, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.13873291, + "step": 6389, + "time_per_iteration": 2.5315403938293457 + }, + { + "auxiliary_loss_clip": 0.06454799, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06281878, + "balance_loss_mlp": 0.01254847, + "epoch": 0.3841875845483241, + "flos": 19799138856960.0, + "grad_norm": 1.6723623276481432, + "language_loss": 0.76991975, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.84717548, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15905762, + "step": 6390, + "time_per_iteration": 2.5315029621124268 + }, + { + "auxiliary_loss_clip": 0.0646126, + "auxiliary_loss_mlp": 0.01271779, + "balance_loss_clip": 0.06286746, + "balance_loss_mlp": 0.01255572, + "epoch": 0.38424770780099204, + "flos": 29906670919680.0, + "grad_norm": 1.876202489708209, + "language_loss": 0.70321602, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1618042, + "step": 6391, + "time_per_iteration": 2.6110270023345947 + }, + { + "auxiliary_loss_clip": 0.06455616, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.06284156, + "balance_loss_mlp": 0.01258499, + "epoch": 0.38430783105366, + "flos": 20820013476480.0, + "grad_norm": 1.8135855175826887, + "language_loss": 0.83923954, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.91652524, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14459229, + "step": 6392, + "time_per_iteration": 2.5735576152801514 + }, + { + "auxiliary_loss_clip": 0.06461488, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01255162, + "epoch": 0.384367954306328, + "flos": 11003153627520.0, + "grad_norm": 1.9242234625767662, + "language_loss": 0.61454862, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.69185179, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13677979, + "step": 6393, + "time_per_iteration": 2.4626450538635254 + }, + { + "auxiliary_loss_clip": 0.06467697, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06290497, + "balance_loss_mlp": 0.01256071, + "epoch": 0.38442807755899594, + "flos": 25345096571520.0, + "grad_norm": 2.1306446802295325, + "language_loss": 0.71410203, + "learning_rate": 2.820736822421029e-06, + "loss": 0.79149896, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15905762, + "step": 6394, + "time_per_iteration": 2.5997071266174316 + }, + { + "auxiliary_loss_clip": 0.06463788, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0628664, + "balance_loss_mlp": 0.01254082, + "epoch": 0.3844882008116639, + "flos": 21076206935040.0, + "grad_norm": 1.9216116882295546, + "language_loss": 0.82087183, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.89820337, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1529541, + "step": 6395, + "time_per_iteration": 2.517411470413208 + }, + { + "auxiliary_loss_clip": 0.06460339, + "auxiliary_loss_mlp": 0.01275993, + "balance_loss_clip": 0.06287727, + "balance_loss_mlp": 0.01261831, + "epoch": 0.38454832406433187, + "flos": 17968287144960.0, + "grad_norm": 2.112818402600052, + "language_loss": 0.70801687, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.78538024, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14160156, + "step": 6396, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.06365301, + "auxiliary_loss_mlp": 0.01257609, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01253767, + "epoch": 0.38460844731699984, + "flos": 67946641925760.0, + "grad_norm": 0.873922952794391, + "language_loss": 0.59863293, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.67486203, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0383606, + "step": 6397, + "time_per_iteration": 3.206678628921509 + }, + { + "auxiliary_loss_clip": 0.06450997, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06284742, + "balance_loss_mlp": 0.0126187, + "epoch": 0.3846685705696678, + "flos": 25856267604480.0, + "grad_norm": 1.772406293141946, + "language_loss": 0.85227352, + "learning_rate": 2.819315942271794e-06, + "loss": 0.92954701, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14477539, + "step": 6398, + "time_per_iteration": 2.5761947631835938 + }, + { + "auxiliary_loss_clip": 0.06453151, + "auxiliary_loss_mlp": 0.01277177, + "balance_loss_clip": 0.06285614, + "balance_loss_mlp": 0.01262467, + "epoch": 0.38472869382233577, + "flos": 16295852776320.0, + "grad_norm": 2.386881726324987, + "language_loss": 0.80489028, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.88219357, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14715576, + "step": 6399, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.06455526, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06283697, + "balance_loss_mlp": 0.01263592, + "epoch": 0.38478881707500373, + "flos": 19358979759360.0, + "grad_norm": 1.8772073039605681, + "language_loss": 0.67565721, + "learning_rate": 2.818605315732038e-06, + "loss": 0.75300437, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15588379, + "step": 6400, + "time_per_iteration": 2.5162830352783203 + }, + { + "auxiliary_loss_clip": 0.06460319, + "auxiliary_loss_mlp": 0.01269914, + "balance_loss_clip": 0.06288355, + "balance_loss_mlp": 0.01255454, + "epoch": 0.38484894032767175, + "flos": 24867356117760.0, + "grad_norm": 1.6933093627789975, + "language_loss": 0.7382642, + "learning_rate": 2.81824995589303e-06, + "loss": 0.81556654, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14459229, + "step": 6401, + "time_per_iteration": 2.5274739265441895 + }, + { + "auxiliary_loss_clip": 0.06457724, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 0.06285743, + "balance_loss_mlp": 0.01262296, + "epoch": 0.3849090635803397, + "flos": 14507068613760.0, + "grad_norm": 1.836175131611194, + "language_loss": 0.72368169, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.80103827, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15637207, + "step": 6402, + "time_per_iteration": 2.509624481201172 + }, + { + "auxiliary_loss_clip": 0.06455728, + "auxiliary_loss_mlp": 0.01275333, + "balance_loss_clip": 0.06288305, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3849691868330077, + "flos": 18521903070720.0, + "grad_norm": 1.8063322577059318, + "language_loss": 0.83321881, + "learning_rate": 2.817539143144128e-06, + "loss": 0.91052943, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14147949, + "step": 6403, + "time_per_iteration": 2.469576835632324 + }, + { + "auxiliary_loss_clip": 0.06451748, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06283461, + "balance_loss_mlp": 0.01259813, + "epoch": 0.38502931008567565, + "flos": 21622821045120.0, + "grad_norm": 1.901744090638215, + "language_loss": 0.83685166, + "learning_rate": 2.817183690261189e-06, + "loss": 0.91411054, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14331055, + "step": 6404, + "time_per_iteration": 2.53399920463562 + }, + { + "auxiliary_loss_clip": 0.06460617, + "auxiliary_loss_mlp": 0.01279935, + "balance_loss_clip": 0.06287636, + "balance_loss_mlp": 0.01265844, + "epoch": 0.3850894333383436, + "flos": 25423152249600.0, + "grad_norm": 1.4804001380923333, + "language_loss": 0.70053053, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77793604, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14105225, + "step": 6405, + "time_per_iteration": 2.577394485473633 + }, + { + "auxiliary_loss_clip": 0.06446706, + "auxiliary_loss_mlp": 0.01276604, + "balance_loss_clip": 0.06280848, + "balance_loss_mlp": 0.01263628, + "epoch": 0.3851495565910116, + "flos": 20233721658240.0, + "grad_norm": 1.9002503642999313, + "language_loss": 0.7926501, + "learning_rate": 2.816472691545729e-06, + "loss": 0.86988324, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12976074, + "step": 6406, + "time_per_iteration": 2.491785764694214 + }, + { + "auxiliary_loss_clip": 0.06454885, + "auxiliary_loss_mlp": 0.01271692, + "balance_loss_clip": 0.06282916, + "balance_loss_mlp": 0.01256516, + "epoch": 0.38520967984367954, + "flos": 16514045608320.0, + "grad_norm": 2.2453520034380463, + "language_loss": 0.84628403, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.92354977, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1517334, + "step": 6407, + "time_per_iteration": 2.461927890777588 + }, + { + "auxiliary_loss_clip": 0.06351051, + "auxiliary_loss_mlp": 0.01274061, + "balance_loss_clip": 0.06273395, + "balance_loss_mlp": 0.01270625, + "epoch": 0.3852698030963475, + "flos": 61333088140800.0, + "grad_norm": 0.7518927461814024, + "language_loss": 0.64829391, + "learning_rate": 2.815761568987365e-06, + "loss": 0.72454506, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03445435, + "step": 6408, + "time_per_iteration": 3.195535659790039 + }, + { + "auxiliary_loss_clip": 0.06454469, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06283102, + "balance_loss_mlp": 0.01256383, + "epoch": 0.3853299263490155, + "flos": 22899595633920.0, + "grad_norm": 1.3862214198415879, + "language_loss": 0.73785079, + "learning_rate": 2.8154059613008e-06, + "loss": 0.8151083, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14904785, + "step": 6409, + "time_per_iteration": 2.5463829040527344 + }, + { + "auxiliary_loss_clip": 0.06465833, + "auxiliary_loss_mlp": 0.01272782, + "balance_loss_clip": 0.06287792, + "balance_loss_mlp": 0.01257667, + "epoch": 0.38539004960168344, + "flos": 20053655233920.0, + "grad_norm": 2.2638026574615076, + "language_loss": 0.70597708, + "learning_rate": 2.81505032269396e-06, + "loss": 0.78336322, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15100098, + "step": 6410, + "time_per_iteration": 2.4989383220672607 + }, + { + "auxiliary_loss_clip": 0.06347367, + "auxiliary_loss_mlp": 0.01259072, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01255689, + "epoch": 0.3854501728543514, + "flos": 68752971365760.0, + "grad_norm": 0.6472142759451909, + "language_loss": 0.6009953, + "learning_rate": 2.81469465318033e-06, + "loss": 0.67705965, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03390503, + "step": 6411, + "time_per_iteration": 3.221977472305298 + }, + { + "auxiliary_loss_clip": 0.06456396, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06285078, + "balance_loss_mlp": 0.01257266, + "epoch": 0.38551029610701937, + "flos": 20491214855040.0, + "grad_norm": 1.7976443608036217, + "language_loss": 0.78197634, + "learning_rate": 2.814338952773397e-06, + "loss": 0.85925543, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.14245605, + "step": 6412, + "time_per_iteration": 2.5103437900543213 + }, + { + "auxiliary_loss_clip": 0.06460511, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 0.06287103, + "balance_loss_mlp": 0.01255267, + "epoch": 0.38557041935968733, + "flos": 23477627825280.0, + "grad_norm": 1.8586112834781277, + "language_loss": 0.78031844, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.85764652, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.17041016, + "step": 6413, + "time_per_iteration": 3.933619499206543 + }, + { + "auxiliary_loss_clip": 0.06342902, + "auxiliary_loss_mlp": 0.01258937, + "balance_loss_clip": 0.06265719, + "balance_loss_mlp": 0.01255421, + "epoch": 0.38563054261235535, + "flos": 63984623068800.0, + "grad_norm": 0.7920557210391271, + "language_loss": 0.61310911, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6891275, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03527832, + "step": 6414, + "time_per_iteration": 3.063016891479492 + }, + { + "auxiliary_loss_clip": 0.06460327, + "auxiliary_loss_mlp": 0.0126994, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01255552, + "epoch": 0.3856906658650233, + "flos": 23994584789760.0, + "grad_norm": 1.981122511442252, + "language_loss": 0.78303337, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.86033607, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14379883, + "step": 6415, + "time_per_iteration": 3.915883779525757 + }, + { + "auxiliary_loss_clip": 0.06448652, + "auxiliary_loss_mlp": 0.0126708, + "balance_loss_clip": 0.06285002, + "balance_loss_mlp": 0.01253842, + "epoch": 0.3857507891176913, + "flos": 25014075816960.0, + "grad_norm": 1.7132059772930233, + "language_loss": 0.8030045, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.88016176, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13244629, + "step": 6416, + "time_per_iteration": 2.5699849128723145 + }, + { + "auxiliary_loss_clip": 0.06451176, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01256353, + "epoch": 0.38581091237035925, + "flos": 21542082036480.0, + "grad_norm": 1.7425936217489657, + "language_loss": 0.79650658, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.87372106, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13909912, + "step": 6417, + "time_per_iteration": 2.490114450454712 + }, + { + "auxiliary_loss_clip": 0.06448381, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01252602, + "epoch": 0.3858710356230272, + "flos": 17389584120960.0, + "grad_norm": 1.6880082960892822, + "language_loss": 0.80518526, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.88233447, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13922119, + "step": 6418, + "time_per_iteration": 2.5246312618255615 + }, + { + "auxiliary_loss_clip": 0.06443715, + "auxiliary_loss_mlp": 0.01268216, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01254662, + "epoch": 0.3859311588756952, + "flos": 20345836821120.0, + "grad_norm": 1.685120659988575, + "language_loss": 0.79909503, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.87621439, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13568115, + "step": 6419, + "time_per_iteration": 3.9288835525512695 + }, + { + "auxiliary_loss_clip": 0.06446663, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06280138, + "balance_loss_mlp": 0.01254745, + "epoch": 0.38599128212836314, + "flos": 26328054418560.0, + "grad_norm": 1.9252922162684358, + "language_loss": 0.67831242, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.75548029, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15362549, + "step": 6420, + "time_per_iteration": 2.5568132400512695 + }, + { + "auxiliary_loss_clip": 0.06447464, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06282772, + "balance_loss_mlp": 0.01260267, + "epoch": 0.3860514053810311, + "flos": 13559050719360.0, + "grad_norm": 1.8138727093850848, + "language_loss": 0.81903851, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89625287, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13690186, + "step": 6421, + "time_per_iteration": 2.6095190048217773 + }, + { + "auxiliary_loss_clip": 0.06448883, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01254654, + "epoch": 0.3861115286336991, + "flos": 20959689432960.0, + "grad_norm": 1.9472147710185277, + "language_loss": 0.72463268, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.80182374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15576172, + "step": 6422, + "time_per_iteration": 3.9032654762268066 + }, + { + "auxiliary_loss_clip": 0.06443937, + "auxiliary_loss_mlp": 0.01268443, + "balance_loss_clip": 0.06280221, + "balance_loss_mlp": 0.01254925, + "epoch": 0.38617165188636704, + "flos": 16368290231040.0, + "grad_norm": 1.6312257254810183, + "language_loss": 0.66935605, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.74647987, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13531494, + "step": 6423, + "time_per_iteration": 2.4858603477478027 + }, + { + "auxiliary_loss_clip": 0.06452656, + "auxiliary_loss_mlp": 0.01269446, + "balance_loss_clip": 0.06281117, + "balance_loss_mlp": 0.01254771, + "epoch": 0.386231775139035, + "flos": 34795828005120.0, + "grad_norm": 1.7836916741722195, + "language_loss": 0.69448572, + "learning_rate": 2.810068143123449e-06, + "loss": 0.77170676, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14685059, + "step": 6424, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.06446116, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 0.0628031, + "balance_loss_mlp": 0.0125616, + "epoch": 0.38629189839170297, + "flos": 21732672147840.0, + "grad_norm": 1.4876753960050375, + "language_loss": 0.72829968, + "learning_rate": 2.809712042331429e-06, + "loss": 0.80545902, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13677979, + "step": 6425, + "time_per_iteration": 2.520872116088867 + }, + { + "auxiliary_loss_clip": 0.06454374, + "auxiliary_loss_mlp": 0.01269159, + "balance_loss_clip": 0.06279134, + "balance_loss_mlp": 0.01254383, + "epoch": 0.38635202164437094, + "flos": 27930315392640.0, + "grad_norm": 3.253764220801107, + "language_loss": 0.8113848, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88862014, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14752197, + "step": 6426, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.06458677, + "auxiliary_loss_mlp": 0.01277199, + "balance_loss_clip": 0.06288534, + "balance_loss_mlp": 0.01261797, + "epoch": 0.38641214489703896, + "flos": 23593390640640.0, + "grad_norm": 1.9966810796758758, + "language_loss": 0.75299263, + "learning_rate": 2.80899974864781e-06, + "loss": 0.83035141, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15393066, + "step": 6427, + "time_per_iteration": 2.538494825363159 + }, + { + "auxiliary_loss_clip": 0.06449243, + "auxiliary_loss_mlp": 0.01269948, + "balance_loss_clip": 0.0627961, + "balance_loss_mlp": 0.01255512, + "epoch": 0.3864722681497069, + "flos": 12646224339840.0, + "grad_norm": 1.7399599530073546, + "language_loss": 0.70451963, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78171146, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 6428, + "time_per_iteration": 2.501620292663574 + }, + { + "auxiliary_loss_clip": 0.06450263, + "auxiliary_loss_mlp": 0.01273584, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.01259517, + "epoch": 0.3865323914023749, + "flos": 17604003519360.0, + "grad_norm": 1.9791686977360912, + "language_loss": 0.84605539, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.92329377, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14074707, + "step": 6429, + "time_per_iteration": 2.4769797325134277 + }, + { + "auxiliary_loss_clip": 0.06453393, + "auxiliary_loss_mlp": 0.01272687, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01258679, + "epoch": 0.38659251465504285, + "flos": 18484908693120.0, + "grad_norm": 1.8799663311521415, + "language_loss": 0.81149292, + "learning_rate": 2.807931078076015e-06, + "loss": 0.88875371, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13995361, + "step": 6430, + "time_per_iteration": 2.552243232727051 + }, + { + "auxiliary_loss_clip": 0.06342202, + "auxiliary_loss_mlp": 0.0126596, + "balance_loss_clip": 0.06266356, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3866526379077108, + "flos": 64186533480960.0, + "grad_norm": 0.7018569193916078, + "language_loss": 0.58841789, + "learning_rate": 2.807574793260416e-06, + "loss": 0.66449958, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03408813, + "step": 6431, + "time_per_iteration": 3.1865365505218506 + }, + { + "auxiliary_loss_clip": 0.06457522, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06283836, + "balance_loss_mlp": 0.01253464, + "epoch": 0.3867127611603788, + "flos": 14392857098880.0, + "grad_norm": 1.8389423140015868, + "language_loss": 0.79719216, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.87445116, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14910889, + "step": 6432, + "time_per_iteration": 2.5060834884643555 + }, + { + "auxiliary_loss_clip": 0.06456694, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 0.06279335, + "balance_loss_mlp": 0.01259217, + "epoch": 0.38677288441304675, + "flos": 20016870491520.0, + "grad_norm": 2.041684818915054, + "language_loss": 0.80982423, + "learning_rate": 2.806862131772779e-06, + "loss": 0.88713682, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15350342, + "step": 6433, + "time_per_iteration": 2.4978644847869873 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01268045, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01251725, + "epoch": 0.3868330076657147, + "flos": 22243465837440.0, + "grad_norm": 1.5518308416482827, + "language_loss": 0.71316475, + "learning_rate": 2.806505755127765e-06, + "loss": 0.79036534, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.16308594, + "step": 6434, + "time_per_iteration": 2.5623676776885986 + }, + { + "auxiliary_loss_clip": 0.06457677, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01254547, + "epoch": 0.3868931309183827, + "flos": 16733076981120.0, + "grad_norm": 1.5292505515468358, + "language_loss": 0.77740347, + "learning_rate": 2.806149347899972e-06, + "loss": 0.85467923, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15350342, + "step": 6435, + "time_per_iteration": 2.4930777549743652 + }, + { + "auxiliary_loss_clip": 0.06446007, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 0.0627854, + "balance_loss_mlp": 0.01257594, + "epoch": 0.38695325417105064, + "flos": 22681360874880.0, + "grad_norm": 2.334489182765127, + "language_loss": 0.79902756, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87621707, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15362549, + "step": 6436, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.06446151, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628051, + "balance_loss_mlp": 0.01255312, + "epoch": 0.3870133774237186, + "flos": 23118668933760.0, + "grad_norm": 1.736913277816888, + "language_loss": 0.77232099, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.84947503, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13934326, + "step": 6437, + "time_per_iteration": 2.6555299758911133 + }, + { + "auxiliary_loss_clip": 0.064465, + "auxiliary_loss_mlp": 0.01272869, + "balance_loss_clip": 0.06279578, + "balance_loss_mlp": 0.01259422, + "epoch": 0.3870735006763866, + "flos": 17681430291840.0, + "grad_norm": 2.573442514460841, + "language_loss": 0.81961322, + "learning_rate": 2.805079942855074e-06, + "loss": 0.89680696, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13452148, + "step": 6438, + "time_per_iteration": 2.55658221244812 + }, + { + "auxiliary_loss_clip": 0.06449786, + "auxiliary_loss_mlp": 0.01268651, + "balance_loss_clip": 0.06278464, + "balance_loss_mlp": 0.01253869, + "epoch": 0.38713362392905454, + "flos": 23302676499840.0, + "grad_norm": 1.3535213690135137, + "language_loss": 0.75684851, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83403289, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14782715, + "step": 6439, + "time_per_iteration": 2.5023999214172363 + }, + { + "auxiliary_loss_clip": 0.06452194, + "auxiliary_loss_mlp": 0.01275332, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.0126083, + "epoch": 0.38719374718172256, + "flos": 21037283913600.0, + "grad_norm": 2.8624272787557556, + "language_loss": 0.74227071, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81954598, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1449585, + "step": 6440, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.06454886, + "auxiliary_loss_mlp": 0.01272767, + "balance_loss_clip": 0.06279822, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3872538704343905, + "flos": 19615885977600.0, + "grad_norm": 1.8472167429080706, + "language_loss": 0.82205182, + "learning_rate": 2.804010263051774e-06, + "loss": 0.89932835, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15368652, + "step": 6441, + "time_per_iteration": 2.4829154014587402 + }, + { + "auxiliary_loss_clip": 0.06449816, + "auxiliary_loss_mlp": 0.01273448, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01258833, + "epoch": 0.3873139936870585, + "flos": 17535800695680.0, + "grad_norm": 2.061540845511299, + "language_loss": 0.80687004, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8841027, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14593506, + "step": 6442, + "time_per_iteration": 2.5348403453826904 + }, + { + "auxiliary_loss_clip": 0.0645024, + "auxiliary_loss_mlp": 0.01274941, + "balance_loss_clip": 0.0628161, + "balance_loss_mlp": 0.01260302, + "epoch": 0.38737411693972645, + "flos": 17792539205760.0, + "grad_norm": 1.5850563005203315, + "language_loss": 0.84242606, + "learning_rate": 2.803296990719624e-06, + "loss": 0.91967785, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14642334, + "step": 6443, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.06346577, + "auxiliary_loss_mlp": 0.01257136, + "balance_loss_clip": 0.06270638, + "balance_loss_mlp": 0.01253804, + "epoch": 0.3874342401923944, + "flos": 58320554624640.0, + "grad_norm": 0.7460963165264183, + "language_loss": 0.5025984, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.57863545, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03338623, + "step": 6444, + "time_per_iteration": 3.146993398666382 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01267857, + "balance_loss_clip": 0.0627708, + "balance_loss_mlp": 0.01254088, + "epoch": 0.3874943634450624, + "flos": 17717628055680.0, + "grad_norm": 1.4103476418524727, + "language_loss": 0.79081571, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86789179, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 6445, + "time_per_iteration": 2.4769954681396484 + }, + { + "auxiliary_loss_clip": 0.06442489, + "auxiliary_loss_mlp": 0.01275349, + "balance_loss_clip": 0.06277544, + "balance_loss_mlp": 0.01261497, + "epoch": 0.38755448669773035, + "flos": 19250889592320.0, + "grad_norm": 1.890349589911811, + "language_loss": 0.81530821, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.89248657, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13861084, + "step": 6446, + "time_per_iteration": 2.5224525928497314 + }, + { + "auxiliary_loss_clip": 0.06442682, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01262489, + "epoch": 0.3876146099503983, + "flos": 20600437052160.0, + "grad_norm": 2.019397578580159, + "language_loss": 0.77555805, + "learning_rate": 2.801870080630306e-06, + "loss": 0.85275555, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14575195, + "step": 6447, + "time_per_iteration": 2.4808783531188965 + }, + { + "auxiliary_loss_clip": 0.06441282, + "auxiliary_loss_mlp": 0.01273458, + "balance_loss_clip": 0.06277911, + "balance_loss_mlp": 0.01259355, + "epoch": 0.3876747332030663, + "flos": 19287129283200.0, + "grad_norm": 1.5926200346390118, + "language_loss": 0.76299512, + "learning_rate": 2.801513277056671e-06, + "loss": 0.84014249, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14099121, + "step": 6448, + "time_per_iteration": 2.532101631164551 + }, + { + "auxiliary_loss_clip": 0.06445228, + "auxiliary_loss_mlp": 0.01276025, + "balance_loss_clip": 0.06280892, + "balance_loss_mlp": 0.01262363, + "epoch": 0.38773485645573424, + "flos": 18950699940480.0, + "grad_norm": 1.5288018173805344, + "language_loss": 0.76734072, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.84455323, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13647461, + "step": 6449, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.06448871, + "auxiliary_loss_mlp": 0.01273884, + "balance_loss_clip": 0.0627744, + "balance_loss_mlp": 0.01258673, + "epoch": 0.3877949797084022, + "flos": 23077272216960.0, + "grad_norm": 1.7542495709483765, + "language_loss": 0.78832948, + "learning_rate": 2.800799578742542e-06, + "loss": 0.86555696, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15209961, + "step": 6450, + "time_per_iteration": 2.5662050247192383 + }, + { + "auxiliary_loss_clip": 0.06452119, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.06276712, + "balance_loss_mlp": 0.01261317, + "epoch": 0.3878551029610702, + "flos": 29103150591360.0, + "grad_norm": 2.1638461576043095, + "language_loss": 0.78188771, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8591727, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.15063477, + "step": 6451, + "time_per_iteration": 2.5734686851501465 + }, + { + "auxiliary_loss_clip": 0.06442447, + "auxiliary_loss_mlp": 0.01277813, + "balance_loss_clip": 0.06278168, + "balance_loss_mlp": 0.01263967, + "epoch": 0.38791522621373814, + "flos": 21002763231360.0, + "grad_norm": 1.7745661107883532, + "language_loss": 0.76657486, + "learning_rate": 2.800085758962812e-06, + "loss": 0.84377748, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13842773, + "step": 6452, + "time_per_iteration": 4.083965301513672 + }, + { + "auxiliary_loss_clip": 0.06445795, + "auxiliary_loss_mlp": 0.01272941, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01258457, + "epoch": 0.3879753494664061, + "flos": 15492248593920.0, + "grad_norm": 1.5775897118958155, + "language_loss": 0.80075014, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87793756, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14483643, + "step": 6453, + "time_per_iteration": 2.5186924934387207 + }, + { + "auxiliary_loss_clip": 0.06452494, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06277925, + "balance_loss_mlp": 0.01258472, + "epoch": 0.3880354727190741, + "flos": 22060422593280.0, + "grad_norm": 1.7271767654368522, + "language_loss": 0.71748114, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.79473794, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14697266, + "step": 6454, + "time_per_iteration": 2.516023635864258 + }, + { + "auxiliary_loss_clip": 0.0645522, + "auxiliary_loss_mlp": 0.01280556, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01263986, + "epoch": 0.3880955959717421, + "flos": 20346675361920.0, + "grad_norm": 2.0562500360548452, + "language_loss": 0.77941358, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.85677135, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.16577148, + "step": 6455, + "time_per_iteration": 3.9251530170440674 + }, + { + "auxiliary_loss_clip": 0.0644723, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 0.062791, + "balance_loss_mlp": 0.01257804, + "epoch": 0.38815571922441006, + "flos": 23082009972480.0, + "grad_norm": 1.5355571660803105, + "language_loss": 0.76081556, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14196777, + "step": 6456, + "time_per_iteration": 2.5377979278564453 + }, + { + "auxiliary_loss_clip": 0.064498, + "auxiliary_loss_mlp": 0.01279611, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01264811, + "epoch": 0.388215842477078, + "flos": 20783186807040.0, + "grad_norm": 2.2521174172947838, + "language_loss": 0.60975528, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.68704933, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14801025, + "step": 6457, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.06447765, + "auxiliary_loss_mlp": 0.01274853, + "balance_loss_clip": 0.06275971, + "balance_loss_mlp": 0.01259308, + "epoch": 0.388275965729746, + "flos": 20454304331520.0, + "grad_norm": 3.4499577756661384, + "language_loss": 0.80527538, + "learning_rate": 2.797943571912841e-06, + "loss": 0.88250154, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15551758, + "step": 6458, + "time_per_iteration": 2.5349881649017334 + }, + { + "auxiliary_loss_clip": 0.06448271, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06278434, + "balance_loss_mlp": 0.0125938, + "epoch": 0.38833608898241395, + "flos": 27899945487360.0, + "grad_norm": 3.532155031934189, + "language_loss": 0.8156774, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89290321, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14941406, + "step": 6459, + "time_per_iteration": 4.015187978744507 + }, + { + "auxiliary_loss_clip": 0.0644253, + "auxiliary_loss_mlp": 0.01277266, + "balance_loss_clip": 0.06278129, + "balance_loss_mlp": 0.01263789, + "epoch": 0.3883962122350819, + "flos": 18082079389440.0, + "grad_norm": 1.6405749509561738, + "language_loss": 0.62564123, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.7028392, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13470459, + "step": 6460, + "time_per_iteration": 2.497053861618042 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01273315, + "balance_loss_clip": 0.06277992, + "balance_loss_mlp": 0.01259374, + "epoch": 0.3884563354877499, + "flos": 23628875644800.0, + "grad_norm": 1.560750838950793, + "language_loss": 0.86785483, + "learning_rate": 2.796872069720717e-06, + "loss": 0.94503951, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1394043, + "step": 6461, + "time_per_iteration": 2.5308427810668945 + }, + { + "auxiliary_loss_clip": 0.06442384, + "auxiliary_loss_mlp": 0.01273139, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01258369, + "epoch": 0.38851645874041785, + "flos": 27460834565760.0, + "grad_norm": 2.5738865735247285, + "language_loss": 0.71770304, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.79485828, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14782715, + "step": 6462, + "time_per_iteration": 3.942819833755493 + }, + { + "auxiliary_loss_clip": 0.06442184, + "auxiliary_loss_mlp": 0.01271045, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256036, + "epoch": 0.3885765819930858, + "flos": 25235035833600.0, + "grad_norm": 2.2250707690072886, + "language_loss": 0.76693827, + "learning_rate": 2.796157583816052e-06, + "loss": 0.84407055, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15014648, + "step": 6463, + "time_per_iteration": 2.577254056930542 + }, + { + "auxiliary_loss_clip": 0.06458563, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01259441, + "epoch": 0.3886367052457538, + "flos": 16952317989120.0, + "grad_norm": 2.5235079856597196, + "language_loss": 0.70838499, + "learning_rate": 2.795800295571382e-06, + "loss": 0.78572428, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15930176, + "step": 6464, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.06442419, + "auxiliary_loss_mlp": 0.01270994, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01255699, + "epoch": 0.38869682849842174, + "flos": 27160141789440.0, + "grad_norm": 1.8571499226781363, + "language_loss": 0.69473737, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77187151, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.15301514, + "step": 6465, + "time_per_iteration": 2.6060595512390137 + }, + { + "auxiliary_loss_clip": 0.06446355, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06276145, + "balance_loss_mlp": 0.01257271, + "epoch": 0.3887569517510897, + "flos": 21069037411200.0, + "grad_norm": 2.3078416168388243, + "language_loss": 0.78628361, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.86347771, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.15771484, + "step": 6466, + "time_per_iteration": 2.503218650817871 + }, + { + "auxiliary_loss_clip": 0.06447446, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01255, + "epoch": 0.38881707500375773, + "flos": 29505141354240.0, + "grad_norm": 1.7748655394270907, + "language_loss": 0.695912, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77307892, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1427002, + "step": 6467, + "time_per_iteration": 2.6156952381134033 + }, + { + "auxiliary_loss_clip": 0.0644877, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277345, + "balance_loss_mlp": 0.01255403, + "epoch": 0.3888771982564257, + "flos": 17493146167680.0, + "grad_norm": 2.2278384059050285, + "language_loss": 0.83988351, + "learning_rate": 2.794370840959936e-06, + "loss": 0.91706932, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14404297, + "step": 6468, + "time_per_iteration": 2.446979522705078 + }, + { + "auxiliary_loss_clip": 0.0644114, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06273733, + "balance_loss_mlp": 0.01254628, + "epoch": 0.38893732150909366, + "flos": 21948517065600.0, + "grad_norm": 2.4269891965149837, + "language_loss": 0.84667963, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92377871, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14141846, + "step": 6469, + "time_per_iteration": 2.6123251914978027 + }, + { + "auxiliary_loss_clip": 0.06445388, + "auxiliary_loss_mlp": 0.01267071, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01252575, + "epoch": 0.3889974447617616, + "flos": 24282657527040.0, + "grad_norm": 1.7885497899924685, + "language_loss": 0.75114912, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82827377, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 6470, + "time_per_iteration": 2.5293121337890625 + }, + { + "auxiliary_loss_clip": 0.06447375, + "auxiliary_loss_mlp": 0.01272376, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01257785, + "epoch": 0.3890575680144296, + "flos": 25674356390400.0, + "grad_norm": 2.975621998510204, + "language_loss": 0.75126278, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.8284604, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14575195, + "step": 6471, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.0644885, + "auxiliary_loss_mlp": 0.01268799, + "balance_loss_clip": 0.06277963, + "balance_loss_mlp": 0.01254291, + "epoch": 0.38911769126709755, + "flos": 22861636934400.0, + "grad_norm": 1.6871762941495017, + "language_loss": 0.68158531, + "learning_rate": 2.792940904386562e-06, + "loss": 0.75876176, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1451416, + "step": 6472, + "time_per_iteration": 2.5192203521728516 + }, + { + "auxiliary_loss_clip": 0.06449802, + "auxiliary_loss_mlp": 0.01271384, + "balance_loss_clip": 0.06278318, + "balance_loss_mlp": 0.01256739, + "epoch": 0.3891778145197655, + "flos": 25454612257920.0, + "grad_norm": 1.6537492711017865, + "language_loss": 0.76761287, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84482473, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14654541, + "step": 6473, + "time_per_iteration": 2.588179349899292 + }, + { + "auxiliary_loss_clip": 0.06451473, + "auxiliary_loss_mlp": 0.01269072, + "balance_loss_clip": 0.0627984, + "balance_loss_mlp": 0.01254803, + "epoch": 0.3892379377724335, + "flos": 14033227374720.0, + "grad_norm": 1.8453216957475485, + "language_loss": 0.71886337, + "learning_rate": 2.792225755635257e-06, + "loss": 0.79606879, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1427002, + "step": 6474, + "time_per_iteration": 2.5054657459259033 + }, + { + "auxiliary_loss_clip": 0.06452703, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.06280853, + "balance_loss_mlp": 0.01252945, + "epoch": 0.38929806102510145, + "flos": 20163715971840.0, + "grad_norm": 1.4152146042292184, + "language_loss": 0.68943882, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76663172, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1362915, + "step": 6475, + "time_per_iteration": 2.5646328926086426 + }, + { + "auxiliary_loss_clip": 0.06459899, + "auxiliary_loss_mlp": 0.01272247, + "balance_loss_clip": 0.06281739, + "balance_loss_mlp": 0.01257107, + "epoch": 0.3893581842777694, + "flos": 22170525258240.0, + "grad_norm": 1.7897820076570896, + "language_loss": 0.75474584, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15142822, + "step": 6476, + "time_per_iteration": 2.515145778656006 + }, + { + "auxiliary_loss_clip": 0.06356712, + "auxiliary_loss_mlp": 0.01262119, + "balance_loss_clip": 0.06275933, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3894183075304374, + "flos": 67322936459520.0, + "grad_norm": 0.7612569916112396, + "language_loss": 0.58157814, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.65776634, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.0276947, + "step": 6477, + "time_per_iteration": 3.147226572036743 + }, + { + "auxiliary_loss_clip": 0.06461065, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 0.06287047, + "balance_loss_mlp": 0.01258711, + "epoch": 0.38947843078310534, + "flos": 18552734173440.0, + "grad_norm": 2.207057593016708, + "language_loss": 0.77832031, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.85566759, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14953613, + "step": 6478, + "time_per_iteration": 2.5238850116729736 + }, + { + "auxiliary_loss_clip": 0.06450923, + "auxiliary_loss_mlp": 0.01273895, + "balance_loss_clip": 0.06281843, + "balance_loss_mlp": 0.01260162, + "epoch": 0.3895385540357733, + "flos": 14610253317120.0, + "grad_norm": 2.187508322407885, + "language_loss": 0.83306336, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6479, + "time_per_iteration": 2.5355920791625977 + }, + { + "auxiliary_loss_clip": 0.06451993, + "auxiliary_loss_mlp": 0.0126931, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.01254414, + "epoch": 0.38959867728844133, + "flos": 19981469341440.0, + "grad_norm": 1.7759645272954405, + "language_loss": 0.80297941, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8801924, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14892578, + "step": 6480, + "time_per_iteration": 2.51645565032959 + }, + { + "auxiliary_loss_clip": 0.06447603, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06278986, + "balance_loss_mlp": 0.01256924, + "epoch": 0.3896588005411093, + "flos": 22678342128000.0, + "grad_norm": 1.6438066173178132, + "language_loss": 0.83259583, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.90978175, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.140625, + "step": 6481, + "time_per_iteration": 2.542642116546631 + }, + { + "auxiliary_loss_clip": 0.06446713, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.0628217, + "balance_loss_mlp": 0.01257204, + "epoch": 0.38971892379377726, + "flos": 21002343960960.0, + "grad_norm": 1.5951406272778517, + "language_loss": 0.75640547, + "learning_rate": 2.789363960063863e-06, + "loss": 0.83357906, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13458252, + "step": 6482, + "time_per_iteration": 2.5500056743621826 + }, + { + "auxiliary_loss_clip": 0.06452929, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06281099, + "balance_loss_mlp": 0.01254853, + "epoch": 0.3897790470464452, + "flos": 22535060446080.0, + "grad_norm": 1.9197222218969183, + "language_loss": 0.78993875, + "learning_rate": 2.78900610077756e-06, + "loss": 0.86715591, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6483, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.06452915, + "auxiliary_loss_mlp": 0.01271475, + "balance_loss_clip": 0.06281908, + "balance_loss_mlp": 0.01256157, + "epoch": 0.3898391702991132, + "flos": 26216484307200.0, + "grad_norm": 1.4915682478636534, + "language_loss": 0.80430162, + "learning_rate": 2.788648211572067e-06, + "loss": 0.88154554, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6484, + "time_per_iteration": 2.582933187484741 + }, + { + "auxiliary_loss_clip": 0.06455952, + "auxiliary_loss_mlp": 0.01270999, + "balance_loss_clip": 0.06285131, + "balance_loss_mlp": 0.01255347, + "epoch": 0.38989929355178116, + "flos": 21071301471360.0, + "grad_norm": 1.959559170578303, + "language_loss": 0.7792083, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8564778, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15637207, + "step": 6485, + "time_per_iteration": 2.532944917678833 + }, + { + "auxiliary_loss_clip": 0.06453831, + "auxiliary_loss_mlp": 0.01268339, + "balance_loss_clip": 0.06280229, + "balance_loss_mlp": 0.01253444, + "epoch": 0.3899594168044491, + "flos": 25491229292160.0, + "grad_norm": 2.289645436499478, + "language_loss": 0.84979439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.92701602, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14898682, + "step": 6486, + "time_per_iteration": 2.5743820667266846 + }, + { + "auxiliary_loss_clip": 0.06453397, + "auxiliary_loss_mlp": 0.01267827, + "balance_loss_clip": 0.06278502, + "balance_loss_mlp": 0.01253141, + "epoch": 0.3900195400571171, + "flos": 31147415452800.0, + "grad_norm": 1.9273192838933928, + "language_loss": 0.85622168, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.93343389, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14672852, + "step": 6487, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.06449067, + "auxiliary_loss_mlp": 0.01273707, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01259121, + "epoch": 0.39007966330978505, + "flos": 20236111499520.0, + "grad_norm": 1.468779525903349, + "language_loss": 0.73436427, + "learning_rate": 2.787216355829633e-06, + "loss": 0.81159198, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14569092, + "step": 6488, + "time_per_iteration": 2.54925274848938 + }, + { + "auxiliary_loss_clip": 0.06455337, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06281433, + "balance_loss_mlp": 0.01255072, + "epoch": 0.390139786562453, + "flos": 22535353935360.0, + "grad_norm": 1.7339556546984902, + "language_loss": 0.68455738, + "learning_rate": 2.786858317231779e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 2.529337167739868 + }, + { + "auxiliary_loss_clip": 0.06445001, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01256079, + "epoch": 0.390199909815121, + "flos": 26440211508480.0, + "grad_norm": 1.5752653046558913, + "language_loss": 0.81221771, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88936543, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13690186, + "step": 6490, + "time_per_iteration": 2.580287218093872 + }, + { + "auxiliary_loss_clip": 0.06445351, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255784, + "epoch": 0.39026003306778895, + "flos": 17280278069760.0, + "grad_norm": 1.8612382479767444, + "language_loss": 0.89715946, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.97431856, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14782715, + "step": 6491, + "time_per_iteration": 2.476393461227417 + }, + { + "auxiliary_loss_clip": 0.06446734, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01256325, + "epoch": 0.3903201563204569, + "flos": 24539354110080.0, + "grad_norm": 1.7715634168525083, + "language_loss": 0.78570807, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.86288601, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 3.918022871017456 + }, + { + "auxiliary_loss_clip": 0.06448489, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06278895, + "balance_loss_mlp": 0.01255528, + "epoch": 0.39038027957312493, + "flos": 23774547168000.0, + "grad_norm": 1.9649032306705667, + "language_loss": 0.74995399, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.82713962, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14544678, + "step": 6493, + "time_per_iteration": 2.5337636470794678 + }, + { + "auxiliary_loss_clip": 0.06457585, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06280027, + "balance_loss_mlp": 0.0125341, + "epoch": 0.3904404028257929, + "flos": 14105832537600.0, + "grad_norm": 2.4323863844033498, + "language_loss": 0.76480663, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.84207416, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15771484, + "step": 6494, + "time_per_iteration": 3.9828202724456787 + }, + { + "auxiliary_loss_clip": 0.06461826, + "auxiliary_loss_mlp": 0.01272307, + "balance_loss_clip": 0.06279928, + "balance_loss_mlp": 0.01255582, + "epoch": 0.39050052607846086, + "flos": 16915742881920.0, + "grad_norm": 1.9306711407360488, + "language_loss": 0.74818373, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.82552505, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.16723633, + "step": 6495, + "time_per_iteration": 2.5104000568389893 + }, + { + "auxiliary_loss_clip": 0.06450078, + "auxiliary_loss_mlp": 0.01273142, + "balance_loss_clip": 0.06281738, + "balance_loss_mlp": 0.01257358, + "epoch": 0.39056064933112883, + "flos": 25921912878720.0, + "grad_norm": 2.748187950361319, + "language_loss": 0.68202364, + "learning_rate": 2.784351212350352e-06, + "loss": 0.75925589, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15783691, + "step": 6496, + "time_per_iteration": 2.550957202911377 + }, + { + "auxiliary_loss_clip": 0.0637021, + "auxiliary_loss_mlp": 0.01254222, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01251394, + "epoch": 0.3906207725837968, + "flos": 60046125281280.0, + "grad_norm": 0.6447698339715318, + "language_loss": 0.53706288, + "learning_rate": 2.783992935430775e-06, + "loss": 0.61330724, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02824402, + "step": 6497, + "time_per_iteration": 3.2988505363464355 + }, + { + "auxiliary_loss_clip": 0.06453034, + "auxiliary_loss_mlp": 0.01276113, + "balance_loss_clip": 0.06281406, + "balance_loss_mlp": 0.01261265, + "epoch": 0.39068089583646476, + "flos": 21074949123840.0, + "grad_norm": 2.0090604178847795, + "language_loss": 0.68947327, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.76676476, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14837646, + "step": 6498, + "time_per_iteration": 3.9722609519958496 + }, + { + "auxiliary_loss_clip": 0.06365327, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01252178, + "epoch": 0.3907410190891327, + "flos": 70468269897600.0, + "grad_norm": 0.719858085665683, + "language_loss": 0.51721394, + "learning_rate": 2.783276292417936e-06, + "loss": 0.59341711, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02807617, + "step": 6499, + "time_per_iteration": 3.209885835647583 + }, + { + "auxiliary_loss_clip": 0.06452541, + "auxiliary_loss_mlp": 0.01273785, + "balance_loss_clip": 0.06277416, + "balance_loss_mlp": 0.0125681, + "epoch": 0.3908011423418007, + "flos": 27969531903360.0, + "grad_norm": 1.5964691032272669, + "language_loss": 0.7347858, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81204903, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16992188, + "step": 6500, + "time_per_iteration": 2.5915534496307373 + }, + { + "auxiliary_loss_clip": 0.06456988, + "auxiliary_loss_mlp": 0.01269402, + "balance_loss_clip": 0.06284038, + "balance_loss_mlp": 0.01254728, + "epoch": 0.39086126559446865, + "flos": 24468971080320.0, + "grad_norm": 2.170342944486325, + "language_loss": 0.68858671, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7658506, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14691162, + "step": 6501, + "time_per_iteration": 3.948155164718628 + }, + { + "auxiliary_loss_clip": 0.06445958, + "auxiliary_loss_mlp": 0.01271431, + "balance_loss_clip": 0.06277448, + "balance_loss_mlp": 0.01256327, + "epoch": 0.3909213888471366, + "flos": 16946406276480.0, + "grad_norm": 1.631531331045391, + "language_loss": 0.78994954, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86712337, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15100098, + "step": 6502, + "time_per_iteration": 2.505021810531616 + }, + { + "auxiliary_loss_clip": 0.06451446, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.01259133, + "epoch": 0.3909815120998046, + "flos": 29286109981440.0, + "grad_norm": 4.8026818588998115, + "language_loss": 0.80286908, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.88011116, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13623047, + "step": 6503, + "time_per_iteration": 2.6041667461395264 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06278107, + "balance_loss_mlp": 0.01253574, + "epoch": 0.39104163535247255, + "flos": 18956947069440.0, + "grad_norm": 1.8714653526076386, + "language_loss": 0.71717298, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.79429626, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14379883, + "step": 6504, + "time_per_iteration": 2.499645471572876 + }, + { + "auxiliary_loss_clip": 0.06449269, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06279607, + "balance_loss_mlp": 0.0125379, + "epoch": 0.3911017586051405, + "flos": 26330611968000.0, + "grad_norm": 1.7094242767760466, + "language_loss": 0.83403468, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.91120219, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.137146, + "step": 6505, + "time_per_iteration": 2.5698060989379883 + }, + { + "auxiliary_loss_clip": 0.06447234, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06281015, + "balance_loss_mlp": 0.01253022, + "epoch": 0.3911618818578085, + "flos": 21842313615360.0, + "grad_norm": 2.3254017668705083, + "language_loss": 0.71427596, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.7914232, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14465332, + "step": 6506, + "time_per_iteration": 2.4988996982574463 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 0.0628104, + "balance_loss_mlp": 0.01258149, + "epoch": 0.3912220051104765, + "flos": 16364768359680.0, + "grad_norm": 2.639532414168514, + "language_loss": 0.75588799, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.83303547, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13348389, + "step": 6507, + "time_per_iteration": 2.506723403930664 + }, + { + "auxiliary_loss_clip": 0.06355534, + "auxiliary_loss_mlp": 0.01255368, + "balance_loss_clip": 0.0627788, + "balance_loss_mlp": 0.01252429, + "epoch": 0.39128212836314447, + "flos": 71071179552000.0, + "grad_norm": 0.751869236178363, + "language_loss": 0.56649405, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.64260316, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02935791, + "step": 6508, + "time_per_iteration": 3.282604455947876 + }, + { + "auxiliary_loss_clip": 0.06448714, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01254294, + "epoch": 0.39134225161581243, + "flos": 20336948288640.0, + "grad_norm": 1.8618605672003898, + "language_loss": 0.76758552, + "learning_rate": 2.779691297413471e-06, + "loss": 0.84475839, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14276123, + "step": 6509, + "time_per_iteration": 2.5330445766448975 + }, + { + "auxiliary_loss_clip": 0.0644654, + "auxiliary_loss_mlp": 0.01272023, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01256073, + "epoch": 0.3914023748684804, + "flos": 17023916903040.0, + "grad_norm": 3.0317271524647427, + "language_loss": 0.83418059, + "learning_rate": 2.779332635075825e-06, + "loss": 0.91136616, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1595459, + "step": 6510, + "time_per_iteration": 2.484149217605591 + }, + { + "auxiliary_loss_clip": 0.06450167, + "auxiliary_loss_mlp": 0.01268149, + "balance_loss_clip": 0.06277542, + "balance_loss_mlp": 0.01254463, + "epoch": 0.39146249812114836, + "flos": 18411045719040.0, + "grad_norm": 1.8343195842354416, + "language_loss": 0.77659726, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.85378045, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13684082, + "step": 6511, + "time_per_iteration": 2.493088722229004 + }, + { + "auxiliary_loss_clip": 0.06343137, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06266295, + "balance_loss_mlp": 0.01258513, + "epoch": 0.3915226213738163, + "flos": 67659659291520.0, + "grad_norm": 0.7080449531762238, + "language_loss": 0.57720256, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.65324628, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02726746, + "step": 6512, + "time_per_iteration": 3.217658042907715 + }, + { + "auxiliary_loss_clip": 0.06445479, + "auxiliary_loss_mlp": 0.01273045, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257452, + "epoch": 0.3915827446264843, + "flos": 26366516242560.0, + "grad_norm": 1.5252758876056967, + "language_loss": 0.69950658, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.77669179, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15600586, + "step": 6513, + "time_per_iteration": 2.560802936553955 + }, + { + "auxiliary_loss_clip": 0.06451759, + "auxiliary_loss_mlp": 0.01273121, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01258208, + "epoch": 0.39164286787915226, + "flos": 21950236074240.0, + "grad_norm": 2.7587511630204777, + "language_loss": 0.76322639, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.8404752, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14916992, + "step": 6514, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.0644438, + "auxiliary_loss_mlp": 0.01269565, + "balance_loss_clip": 0.06276566, + "balance_loss_mlp": 0.0125619, + "epoch": 0.3917029911318202, + "flos": 16405536170880.0, + "grad_norm": 1.811906351936664, + "language_loss": 0.782359, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.8594985, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13378906, + "step": 6515, + "time_per_iteration": 2.5104947090148926 + }, + { + "auxiliary_loss_clip": 0.06443886, + "auxiliary_loss_mlp": 0.01270163, + "balance_loss_clip": 0.06277545, + "balance_loss_mlp": 0.0125705, + "epoch": 0.3917631143844882, + "flos": 26218580659200.0, + "grad_norm": 1.4298617884300358, + "language_loss": 0.79790455, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.87504506, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13122559, + "step": 6516, + "time_per_iteration": 2.5912764072418213 + }, + { + "auxiliary_loss_clip": 0.06446922, + "auxiliary_loss_mlp": 0.0126951, + "balance_loss_clip": 0.06278265, + "balance_loss_mlp": 0.0125511, + "epoch": 0.39182323763715615, + "flos": 18553740422400.0, + "grad_norm": 1.8457537699229483, + "language_loss": 0.70234001, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7795043, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14404297, + "step": 6517, + "time_per_iteration": 2.630155324935913 + }, + { + "auxiliary_loss_clip": 0.06449963, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06279542, + "balance_loss_mlp": 0.01254905, + "epoch": 0.3918833608898241, + "flos": 34322112547200.0, + "grad_norm": 1.6944592538331644, + "language_loss": 0.72209281, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79928982, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1484375, + "step": 6518, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.06453219, + "auxiliary_loss_mlp": 0.0127268, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.0125751, + "epoch": 0.3919434841424921, + "flos": 36948434595840.0, + "grad_norm": 1.7409198797741048, + "language_loss": 0.62180024, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.69905925, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15179443, + "step": 6519, + "time_per_iteration": 2.6407580375671387 + }, + { + "auxiliary_loss_clip": 0.06457552, + "auxiliary_loss_mlp": 0.01269986, + "balance_loss_clip": 0.06280086, + "balance_loss_mlp": 0.01253535, + "epoch": 0.3920036073951601, + "flos": 23514915692160.0, + "grad_norm": 2.3243103288051485, + "language_loss": 0.6728406, + "learning_rate": 2.775744388563563e-06, + "loss": 0.75011599, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16442871, + "step": 6520, + "time_per_iteration": 2.557736396789551 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01272672, + "balance_loss_clip": 0.06281003, + "balance_loss_mlp": 0.0125845, + "epoch": 0.39206373064782807, + "flos": 18412051968000.0, + "grad_norm": 5.792319014223258, + "language_loss": 0.79119205, + "learning_rate": 2.775385401898104e-06, + "loss": 0.86843884, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14233398, + "step": 6521, + "time_per_iteration": 2.487144947052002 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06282392, + "balance_loss_mlp": 0.01255297, + "epoch": 0.39212385390049603, + "flos": 12318012696960.0, + "grad_norm": 2.63137671789129, + "language_loss": 0.70893902, + "learning_rate": 2.775026385829952e-06, + "loss": 0.78623831, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.16473389, + "step": 6522, + "time_per_iteration": 2.501777410507202 + }, + { + "auxiliary_loss_clip": 0.06455532, + "auxiliary_loss_mlp": 0.01272148, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.01257693, + "epoch": 0.392183977153164, + "flos": 19725275882880.0, + "grad_norm": 2.1277990565539087, + "language_loss": 0.77424598, + "learning_rate": 2.774667340372722e-06, + "loss": 0.8515228, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14453125, + "step": 6523, + "time_per_iteration": 2.494900941848755 + }, + { + "auxiliary_loss_clip": 0.0645543, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06282179, + "balance_loss_mlp": 0.01258769, + "epoch": 0.39224410040583196, + "flos": 33153092709120.0, + "grad_norm": 2.7826558407508855, + "language_loss": 0.62314886, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70043033, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13964844, + "step": 6524, + "time_per_iteration": 2.6380085945129395 + }, + { + "auxiliary_loss_clip": 0.06452876, + "auxiliary_loss_mlp": 0.01268165, + "balance_loss_clip": 0.06281661, + "balance_loss_mlp": 0.01252895, + "epoch": 0.39230422365849993, + "flos": 27789884749440.0, + "grad_norm": 1.7105729654368218, + "language_loss": 0.74638754, + "learning_rate": 2.773949161345489e-06, + "loss": 0.82359803, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15264893, + "step": 6525, + "time_per_iteration": 2.5430080890655518 + }, + { + "auxiliary_loss_clip": 0.06454577, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06280737, + "balance_loss_mlp": 0.01253863, + "epoch": 0.3923643469111679, + "flos": 17937497969280.0, + "grad_norm": 2.1060109606385673, + "language_loss": 0.8182255, + "learning_rate": 2.773590027802719e-06, + "loss": 0.89545369, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14367676, + "step": 6526, + "time_per_iteration": 2.4994354248046875 + }, + { + "auxiliary_loss_clip": 0.06454204, + "auxiliary_loss_mlp": 0.01269978, + "balance_loss_clip": 0.06281518, + "balance_loss_mlp": 0.01255482, + "epoch": 0.39242447016383586, + "flos": 24066141776640.0, + "grad_norm": 1.5927090967738864, + "language_loss": 0.70157206, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77881384, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14501953, + "step": 6527, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.06452368, + "auxiliary_loss_mlp": 0.01268854, + "balance_loss_clip": 0.06281934, + "balance_loss_mlp": 0.01254245, + "epoch": 0.3924845934165038, + "flos": 10667562825600.0, + "grad_norm": 3.256824520755738, + "language_loss": 0.82039493, + "learning_rate": 2.772871672726965e-06, + "loss": 0.89760715, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6528, + "time_per_iteration": 2.498852014541626 + }, + { + "auxiliary_loss_clip": 0.06450985, + "auxiliary_loss_mlp": 0.0127277, + "balance_loss_clip": 0.06284485, + "balance_loss_mlp": 0.01258048, + "epoch": 0.3925447166691718, + "flos": 31253493121920.0, + "grad_norm": 1.712128770360143, + "language_loss": 0.68666142, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14733887, + "step": 6529, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.06454393, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01252213, + "epoch": 0.39260483992183975, + "flos": 29421215890560.0, + "grad_norm": 2.512935177473184, + "language_loss": 0.80622673, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8834424, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14959717, + "step": 6530, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.06449011, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.0125252, + "epoch": 0.3926649631745077, + "flos": 22864571827200.0, + "grad_norm": 1.8446830755174628, + "language_loss": 0.76176864, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.83893287, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14892578, + "step": 6531, + "time_per_iteration": 3.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.06348795, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01253434, + "epoch": 0.3927250864271757, + "flos": 63911892124800.0, + "grad_norm": 0.7987882767963658, + "language_loss": 0.6030035, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.67905223, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.02648926, + "step": 6532, + "time_per_iteration": 3.023615598678589 + }, + { + "auxiliary_loss_clip": 0.06346735, + "auxiliary_loss_mlp": 0.01258162, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3927852096798437, + "flos": 68931486489600.0, + "grad_norm": 0.7618686105615924, + "language_loss": 0.55496854, + "learning_rate": 2.771075272396981e-06, + "loss": 0.63101745, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02720642, + "step": 6533, + "time_per_iteration": 3.2504148483276367 + }, + { + "auxiliary_loss_clip": 0.06452841, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06277935, + "balance_loss_mlp": 0.01254557, + "epoch": 0.39284533293251167, + "flos": 29723711529600.0, + "grad_norm": 1.823371664681604, + "language_loss": 0.76552856, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.84275657, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.15405273, + "step": 6534, + "time_per_iteration": 4.098775148391724 + }, + { + "auxiliary_loss_clip": 0.06459314, + "auxiliary_loss_mlp": 0.01269352, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.01253974, + "epoch": 0.39290545618517964, + "flos": 18558016980480.0, + "grad_norm": 2.2164588420846267, + "language_loss": 0.78656316, + "learning_rate": 2.770356507494851e-06, + "loss": 0.86384982, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15380859, + "step": 6535, + "time_per_iteration": 2.4923341274261475 + }, + { + "auxiliary_loss_clip": 0.06449763, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01253592, + "epoch": 0.3929655794378476, + "flos": 26256581285760.0, + "grad_norm": 2.2738959430224326, + "language_loss": 0.69076276, + "learning_rate": 2.769997081218978e-06, + "loss": 0.76792771, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1315918, + "step": 6536, + "time_per_iteration": 2.5980727672576904 + }, + { + "auxiliary_loss_clip": 0.06448898, + "auxiliary_loss_mlp": 0.0127095, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01257265, + "epoch": 0.39302570269051557, + "flos": 29285564929920.0, + "grad_norm": 1.8741537429596062, + "language_loss": 0.69716197, + "learning_rate": 2.769637625744738e-06, + "loss": 0.77436042, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13684082, + "step": 6537, + "time_per_iteration": 4.096014499664307 + }, + { + "auxiliary_loss_clip": 0.064602, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06288625, + "balance_loss_mlp": 0.01255432, + "epoch": 0.39308582594318353, + "flos": 17353134794880.0, + "grad_norm": 1.7942703591990323, + "language_loss": 0.79606509, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8733629, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14129639, + "step": 6538, + "time_per_iteration": 2.578815221786499 + }, + { + "auxiliary_loss_clip": 0.06359898, + "auxiliary_loss_mlp": 0.01255927, + "balance_loss_clip": 0.06283404, + "balance_loss_mlp": 0.0125297, + "epoch": 0.3931459491958515, + "flos": 61023884175360.0, + "grad_norm": 0.7947880980854773, + "language_loss": 0.61826062, + "learning_rate": 2.768918627255683e-06, + "loss": 0.69441885, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02955627, + "step": 6539, + "time_per_iteration": 2.9553403854370117 + }, + { + "auxiliary_loss_clip": 0.06458268, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06289513, + "balance_loss_mlp": 0.01257339, + "epoch": 0.39320607244851946, + "flos": 39024662590080.0, + "grad_norm": 2.4294685123961295, + "language_loss": 0.68263721, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.75994635, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15307617, + "step": 6540, + "time_per_iteration": 2.732541799545288 + }, + { + "auxiliary_loss_clip": 0.06455955, + "auxiliary_loss_mlp": 0.01271651, + "balance_loss_clip": 0.06287128, + "balance_loss_mlp": 0.0125613, + "epoch": 0.3932661957011874, + "flos": 24686451152640.0, + "grad_norm": 1.7600019176005988, + "language_loss": 0.72681171, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.80408776, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15527344, + "step": 6541, + "time_per_iteration": 4.03834342956543 + }, + { + "auxiliary_loss_clip": 0.06358681, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06282184, + "balance_loss_mlp": 0.01259297, + "epoch": 0.3933263189538554, + "flos": 70115614790400.0, + "grad_norm": 0.7938144397826515, + "language_loss": 0.60408866, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6802969, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02844238, + "step": 6542, + "time_per_iteration": 3.0015151500701904 + }, + { + "auxiliary_loss_clip": 0.06453243, + "auxiliary_loss_mlp": 0.01279318, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.01265305, + "epoch": 0.39338644220652336, + "flos": 22935583762560.0, + "grad_norm": 1.4413337304531033, + "language_loss": 0.82278919, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90011483, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14013672, + "step": 6543, + "time_per_iteration": 2.6289048194885254 + }, + { + "auxiliary_loss_clip": 0.06454003, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06284549, + "balance_loss_mlp": 0.01255768, + "epoch": 0.3934465654591913, + "flos": 30856282041600.0, + "grad_norm": 1.7408174737933344, + "language_loss": 0.69224536, + "learning_rate": 2.767120621015908e-06, + "loss": 0.76948798, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14489746, + "step": 6544, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01274712, + "balance_loss_clip": 0.06291823, + "balance_loss_mlp": 0.01258524, + "epoch": 0.3935066887118593, + "flos": 29243329672320.0, + "grad_norm": 2.0329338261061887, + "language_loss": 0.75462705, + "learning_rate": 2.76676093244553e-06, + "loss": 0.83203781, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1619873, + "step": 6545, + "time_per_iteration": 2.606234312057495 + }, + { + "auxiliary_loss_clip": 0.06446254, + "auxiliary_loss_mlp": 0.01275344, + "balance_loss_clip": 0.06285709, + "balance_loss_mlp": 0.01262309, + "epoch": 0.3935668119645273, + "flos": 19141290051840.0, + "grad_norm": 1.4467327313094591, + "language_loss": 0.75122333, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82843935, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13043213, + "step": 6546, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.06461848, + "auxiliary_loss_mlp": 0.01270617, + "balance_loss_clip": 0.06285486, + "balance_loss_mlp": 0.01254822, + "epoch": 0.3936269352171953, + "flos": 18522196560000.0, + "grad_norm": 2.187625212538507, + "language_loss": 0.82285661, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90018129, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15783691, + "step": 6547, + "time_per_iteration": 2.536921501159668 + }, + { + "auxiliary_loss_clip": 0.06454909, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01259685, + "epoch": 0.39368705846986324, + "flos": 15638255533440.0, + "grad_norm": 1.8611217813328955, + "language_loss": 0.84309554, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.92037535, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1340332, + "step": 6548, + "time_per_iteration": 2.586596727371216 + }, + { + "auxiliary_loss_clip": 0.06451154, + "auxiliary_loss_mlp": 0.01275141, + "balance_loss_clip": 0.06285168, + "balance_loss_mlp": 0.01261325, + "epoch": 0.3937471817225312, + "flos": 21332442320640.0, + "grad_norm": 1.5541020214417252, + "language_loss": 0.7306931, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.8079561, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13842773, + "step": 6549, + "time_per_iteration": 2.5176355838775635 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01258398, + "epoch": 0.39380730497519917, + "flos": 20782893317760.0, + "grad_norm": 1.443831260247086, + "language_loss": 0.77958995, + "learning_rate": 2.764962053731699e-06, + "loss": 0.85687554, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.16204834, + "step": 6550, + "time_per_iteration": 2.5665266513824463 + }, + { + "auxiliary_loss_clip": 0.06449334, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.0628082, + "balance_loss_mlp": 0.01254455, + "epoch": 0.39386742822786713, + "flos": 21615106469760.0, + "grad_norm": 1.5479702434138036, + "language_loss": 0.81395853, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.89113748, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14129639, + "step": 6551, + "time_per_iteration": 2.509472370147705 + }, + { + "auxiliary_loss_clip": 0.06452134, + "auxiliary_loss_mlp": 0.01274621, + "balance_loss_clip": 0.06282679, + "balance_loss_mlp": 0.01259791, + "epoch": 0.3939275514805351, + "flos": 12418304434560.0, + "grad_norm": 2.3772322810911892, + "language_loss": 0.80163503, + "learning_rate": 2.764242299098596e-06, + "loss": 0.87890255, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14825439, + "step": 6552, + "time_per_iteration": 2.512632369995117 + }, + { + "auxiliary_loss_clip": 0.06458388, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06285821, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39398767473320306, + "flos": 18558016980480.0, + "grad_norm": 1.9836463121020687, + "language_loss": 0.71468151, + "learning_rate": 2.763882378305003e-06, + "loss": 0.79198349, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14996338, + "step": 6553, + "time_per_iteration": 2.4973459243774414 + }, + { + "auxiliary_loss_clip": 0.06447914, + "auxiliary_loss_mlp": 0.01269169, + "balance_loss_clip": 0.06280744, + "balance_loss_mlp": 0.0125422, + "epoch": 0.39404779798587103, + "flos": 29315599418880.0, + "grad_norm": 1.8230931816174483, + "language_loss": 0.64176017, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71893102, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14941406, + "step": 6554, + "time_per_iteration": 2.6340816020965576 + }, + { + "auxiliary_loss_clip": 0.06448209, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281387, + "balance_loss_mlp": 0.0125561, + "epoch": 0.394107921238539, + "flos": 34905679107840.0, + "grad_norm": 1.8577413865682035, + "language_loss": 0.79801202, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8751896, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13934326, + "step": 6555, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.06451041, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06280783, + "balance_loss_mlp": 0.01252748, + "epoch": 0.39416804449120696, + "flos": 25088232280320.0, + "grad_norm": 1.8326733466575391, + "language_loss": 0.72028196, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.79746938, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1496582, + "step": 6556, + "time_per_iteration": 2.572880744934082 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01268731, + "balance_loss_clip": 0.06281175, + "balance_loss_mlp": 0.01254348, + "epoch": 0.3942281677438749, + "flos": 32314842063360.0, + "grad_norm": 2.2262653228658666, + "language_loss": 0.83903825, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.91621351, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14373779, + "step": 6557, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01272636, + "balance_loss_clip": 0.06281336, + "balance_loss_mlp": 0.01258671, + "epoch": 0.3942882909965429, + "flos": 24943608933120.0, + "grad_norm": 2.1784611950300605, + "language_loss": 0.80248392, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.87968874, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1395874, + "step": 6558, + "time_per_iteration": 2.5902092456817627 + }, + { + "auxiliary_loss_clip": 0.06445447, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01253816, + "epoch": 0.39434841424921085, + "flos": 11879614535040.0, + "grad_norm": 2.1357186014692546, + "language_loss": 0.71689725, + "learning_rate": 2.761722245724792e-06, + "loss": 0.79402852, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13873291, + "step": 6559, + "time_per_iteration": 2.4894917011260986 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628094, + "balance_loss_mlp": 0.01254622, + "epoch": 0.3944085375018789, + "flos": 16367032419840.0, + "grad_norm": 2.0841749511208705, + "language_loss": 0.81285572, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.89011705, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14630127, + "step": 6560, + "time_per_iteration": 2.522434711456299 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06282307, + "balance_loss_mlp": 0.01254078, + "epoch": 0.39446866075454684, + "flos": 10637821825920.0, + "grad_norm": 3.641985825462619, + "language_loss": 0.83127379, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.90848899, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15386963, + "step": 6561, + "time_per_iteration": 2.4804983139038086 + }, + { + "auxiliary_loss_clip": 0.06450383, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 0.06283262, + "balance_loss_mlp": 0.01257102, + "epoch": 0.3945287840072148, + "flos": 18193481792640.0, + "grad_norm": 2.043086634933395, + "language_loss": 0.80616236, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.88336933, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13208008, + "step": 6562, + "time_per_iteration": 2.5335006713867188 + }, + { + "auxiliary_loss_clip": 0.06448314, + "auxiliary_loss_mlp": 0.01268686, + "balance_loss_clip": 0.06283693, + "balance_loss_mlp": 0.01254476, + "epoch": 0.39458890725988277, + "flos": 23046650749440.0, + "grad_norm": 1.5717146465742573, + "language_loss": 0.81509531, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.89226532, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14215088, + "step": 6563, + "time_per_iteration": 2.5315918922424316 + }, + { + "auxiliary_loss_clip": 0.06453238, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.0628344, + "balance_loss_mlp": 0.0125608, + "epoch": 0.39464903051255074, + "flos": 17163718640640.0, + "grad_norm": 1.8608988788141587, + "language_loss": 0.70080984, + "learning_rate": 2.759921340790127e-06, + "loss": 0.77804577, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14257812, + "step": 6564, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.06449583, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06281252, + "balance_loss_mlp": 0.01254648, + "epoch": 0.3947091537652187, + "flos": 15894616700160.0, + "grad_norm": 2.288586168499947, + "language_loss": 0.83967394, + "learning_rate": 2.759561073299676e-06, + "loss": 0.91686368, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14746094, + "step": 6565, + "time_per_iteration": 2.5438666343688965 + }, + { + "auxiliary_loss_clip": 0.06447474, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06280743, + "balance_loss_mlp": 0.01255229, + "epoch": 0.39476927701788667, + "flos": 18550386259200.0, + "grad_norm": 2.0020652066074285, + "language_loss": 0.83519006, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.91235834, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14129639, + "step": 6566, + "time_per_iteration": 2.550548791885376 + }, + { + "auxiliary_loss_clip": 0.06459671, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06282969, + "balance_loss_mlp": 0.01255072, + "epoch": 0.39482940027055463, + "flos": 22282682348160.0, + "grad_norm": 1.770017298907609, + "language_loss": 0.77499187, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.85229909, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.15979004, + "step": 6567, + "time_per_iteration": 2.535980463027954 + }, + { + "auxiliary_loss_clip": 0.0644526, + "auxiliary_loss_mlp": 0.01270792, + "balance_loss_clip": 0.06283294, + "balance_loss_mlp": 0.01257851, + "epoch": 0.3948895235232226, + "flos": 14763010510080.0, + "grad_norm": 1.9280900707618294, + "language_loss": 0.80259991, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87976044, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.12945557, + "step": 6568, + "time_per_iteration": 2.56528639793396 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06283959, + "balance_loss_mlp": 0.01258356, + "epoch": 0.39494964677589056, + "flos": 22572474094080.0, + "grad_norm": 2.8189067544408166, + "language_loss": 0.84836519, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.9256081, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1451416, + "step": 6569, + "time_per_iteration": 2.512678623199463 + }, + { + "auxiliary_loss_clip": 0.06448043, + "auxiliary_loss_mlp": 0.01269688, + "balance_loss_clip": 0.06284526, + "balance_loss_mlp": 0.01255538, + "epoch": 0.3950097700285585, + "flos": 22969307831040.0, + "grad_norm": 1.7602858722639216, + "language_loss": 0.74665594, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.82383323, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.14147949, + "step": 6570, + "time_per_iteration": 2.611072063446045 + }, + { + "auxiliary_loss_clip": 0.06447589, + "auxiliary_loss_mlp": 0.01270515, + "balance_loss_clip": 0.06279834, + "balance_loss_mlp": 0.01256305, + "epoch": 0.3950698932812265, + "flos": 20601569082240.0, + "grad_norm": 1.9769080404363342, + "language_loss": 0.80472994, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88191104, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14196777, + "step": 6571, + "time_per_iteration": 4.037761688232422 + }, + { + "auxiliary_loss_clip": 0.06446905, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01257022, + "epoch": 0.39513001653389446, + "flos": 20381992657920.0, + "grad_norm": 1.599556952476494, + "language_loss": 0.78081018, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8579852, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13574219, + "step": 6572, + "time_per_iteration": 2.542388439178467 + }, + { + "auxiliary_loss_clip": 0.06450671, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06281148, + "balance_loss_mlp": 0.01253991, + "epoch": 0.3951901397865625, + "flos": 26469994435200.0, + "grad_norm": 1.9679034095416588, + "language_loss": 0.74861181, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.8258028, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14434814, + "step": 6573, + "time_per_iteration": 3.9954564571380615 + }, + { + "auxiliary_loss_clip": 0.06447303, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.0125492, + "epoch": 0.39525026303923044, + "flos": 43848845233920.0, + "grad_norm": 1.4348738267970096, + "language_loss": 0.67874503, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.75589502, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.12799072, + "step": 6574, + "time_per_iteration": 2.75056791305542 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01270185, + "balance_loss_clip": 0.06284595, + "balance_loss_mlp": 0.01255832, + "epoch": 0.3953103862918984, + "flos": 18046636312320.0, + "grad_norm": 3.0759560063082736, + "language_loss": 0.72770178, + "learning_rate": 2.755956816505072e-06, + "loss": 0.80492353, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14355469, + "step": 6575, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.06452627, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.0628259, + "balance_loss_mlp": 0.01256015, + "epoch": 0.3953705095445664, + "flos": 16980549615360.0, + "grad_norm": 2.3956956088423382, + "language_loss": 0.73929548, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.816526, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1439209, + "step": 6576, + "time_per_iteration": 2.4877238273620605 + }, + { + "auxiliary_loss_clip": 0.06453596, + "auxiliary_loss_mlp": 0.01269813, + "balance_loss_clip": 0.06286615, + "balance_loss_mlp": 0.0125704, + "epoch": 0.39543063279723434, + "flos": 17415300124800.0, + "grad_norm": 2.3089155525157397, + "language_loss": 0.8424108, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.91964483, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12786865, + "step": 6577, + "time_per_iteration": 3.9026546478271484 + }, + { + "auxiliary_loss_clip": 0.06447916, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06283568, + "balance_loss_mlp": 0.01255788, + "epoch": 0.3954907560499023, + "flos": 22790876561280.0, + "grad_norm": 2.6090797034217603, + "language_loss": 0.90399998, + "learning_rate": 2.75487497985853e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1338501, + "step": 6578, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.06451896, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06281315, + "balance_loss_mlp": 0.01254284, + "epoch": 0.39555087930257027, + "flos": 21950823052800.0, + "grad_norm": 1.8247592517251146, + "language_loss": 0.78543842, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.86265075, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15063477, + "step": 6579, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.06456701, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01258492, + "epoch": 0.39561100255523823, + "flos": 20409553451520.0, + "grad_norm": 2.1653293739232753, + "language_loss": 0.68659246, + "learning_rate": 2.754153612280037e-06, + "loss": 0.76389658, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15216064, + "step": 6580, + "time_per_iteration": 4.038321495056152 + }, + { + "auxiliary_loss_clip": 0.06448758, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3956711258079062, + "flos": 27972005598720.0, + "grad_norm": 1.867170796056586, + "language_loss": 0.58577931, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.6629765, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14318848, + "step": 6581, + "time_per_iteration": 2.618917942047119 + }, + { + "auxiliary_loss_clip": 0.0645448, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 0.06288571, + "balance_loss_mlp": 0.01256413, + "epoch": 0.39573124906057416, + "flos": 14433457201920.0, + "grad_norm": 2.002939068333409, + "language_loss": 0.69910431, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.77636254, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14929199, + "step": 6582, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.06451949, + "auxiliary_loss_mlp": 0.01273006, + "balance_loss_clip": 0.06283893, + "balance_loss_mlp": 0.01258546, + "epoch": 0.39579137231324213, + "flos": 18739592778240.0, + "grad_norm": 2.2302551557868457, + "language_loss": 0.76587689, + "learning_rate": 2.753071346464642e-06, + "loss": 0.84312642, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14453125, + "step": 6583, + "time_per_iteration": 2.5276317596435547 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.0127002, + "balance_loss_clip": 0.06284047, + "balance_loss_mlp": 0.01256562, + "epoch": 0.3958514955659101, + "flos": 17682268832640.0, + "grad_norm": 1.926047340176765, + "language_loss": 0.66262352, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.73984963, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13458252, + "step": 6584, + "time_per_iteration": 2.501209259033203 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01270923, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39591161881857806, + "flos": 29315850981120.0, + "grad_norm": 1.992954295318491, + "language_loss": 0.72398281, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.8012588, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 6585, + "time_per_iteration": 2.617694616317749 + }, + { + "auxiliary_loss_clip": 0.06457305, + "auxiliary_loss_mlp": 0.0127182, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01257336, + "epoch": 0.3959717420712461, + "flos": 25778295780480.0, + "grad_norm": 1.6889684303793513, + "language_loss": 0.73472714, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.81201839, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14477539, + "step": 6586, + "time_per_iteration": 2.565883159637451 + }, + { + "auxiliary_loss_clip": 0.06454571, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06286268, + "balance_loss_mlp": 0.01252969, + "epoch": 0.39603186532391405, + "flos": 20930199995520.0, + "grad_norm": 1.6150585752618039, + "language_loss": 0.71662915, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79384637, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14160156, + "step": 6587, + "time_per_iteration": 2.5788414478302 + }, + { + "auxiliary_loss_clip": 0.06362241, + "auxiliary_loss_mlp": 0.01254401, + "balance_loss_clip": 0.06286076, + "balance_loss_mlp": 0.01251419, + "epoch": 0.396091988576582, + "flos": 54897336720000.0, + "grad_norm": 0.8108180128275717, + "language_loss": 0.60705078, + "learning_rate": 2.751266999157285e-06, + "loss": 0.68321717, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.02980042, + "step": 6588, + "time_per_iteration": 2.973475217819214 + }, + { + "auxiliary_loss_clip": 0.06457016, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06285909, + "balance_loss_mlp": 0.01251873, + "epoch": 0.39615211182925, + "flos": 20708946489600.0, + "grad_norm": 1.752385405351709, + "language_loss": 0.81335068, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.89058518, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14575195, + "step": 6589, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.06456019, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06286196, + "balance_loss_mlp": 0.01254431, + "epoch": 0.39621223508191794, + "flos": 21000331463040.0, + "grad_norm": 1.8508577793480634, + "language_loss": 0.71167219, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7889303, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15368652, + "step": 6590, + "time_per_iteration": 2.5155017375946045 + }, + { + "auxiliary_loss_clip": 0.06451933, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06285245, + "balance_loss_mlp": 0.01253284, + "epoch": 0.3962723583345859, + "flos": 23375742860160.0, + "grad_norm": 1.6853348593397999, + "language_loss": 0.75984478, + "learning_rate": 2.750184048805956e-06, + "loss": 0.83702791, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13098145, + "step": 6591, + "time_per_iteration": 2.569958448410034 + }, + { + "auxiliary_loss_clip": 0.06454425, + "auxiliary_loss_mlp": 0.01268025, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01254215, + "epoch": 0.39633248158725387, + "flos": 25122040202880.0, + "grad_norm": 1.5542594066551045, + "language_loss": 0.78422546, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8614499, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13806152, + "step": 6592, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06448938, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.062861, + "balance_loss_mlp": 0.01256615, + "epoch": 0.39639260483992184, + "flos": 39797309888640.0, + "grad_norm": 1.716432087396327, + "language_loss": 0.69405383, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.77124685, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13751221, + "step": 6593, + "time_per_iteration": 2.742421865463257 + }, + { + "auxiliary_loss_clip": 0.06455009, + "auxiliary_loss_mlp": 0.01268833, + "balance_loss_clip": 0.06285039, + "balance_loss_mlp": 0.01253896, + "epoch": 0.3964527280925898, + "flos": 17352673597440.0, + "grad_norm": 2.6756229463225134, + "language_loss": 0.78082192, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.85806036, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14929199, + "step": 6594, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.06345355, + "auxiliary_loss_mlp": 0.01253278, + "balance_loss_clip": 0.06269702, + "balance_loss_mlp": 0.0125056, + "epoch": 0.39651285134525777, + "flos": 71739845533440.0, + "grad_norm": 0.9367359782969226, + "language_loss": 0.6293599, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.70534623, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.195411205291748 + }, + { + "auxiliary_loss_clip": 0.06455558, + "auxiliary_loss_mlp": 0.01273293, + "balance_loss_clip": 0.0628309, + "balance_loss_mlp": 0.0125714, + "epoch": 0.39657297459792573, + "flos": 25782823900800.0, + "grad_norm": 2.0629727816625656, + "language_loss": 0.63503623, + "learning_rate": 2.748378562795223e-06, + "loss": 0.71232474, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16149902, + "step": 6596, + "time_per_iteration": 2.564436197280884 + }, + { + "auxiliary_loss_clip": 0.06445512, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.0628349, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3966330978505937, + "flos": 20272267336320.0, + "grad_norm": 3.0845696935228646, + "language_loss": 0.79033494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.86749279, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.14202881, + "step": 6597, + "time_per_iteration": 2.5187220573425293 + }, + { + "auxiliary_loss_clip": 0.0645806, + "auxiliary_loss_mlp": 0.01272047, + "balance_loss_clip": 0.06285266, + "balance_loss_mlp": 0.01257259, + "epoch": 0.39669322110326166, + "flos": 20637431429760.0, + "grad_norm": 1.9127598273467419, + "language_loss": 0.67675543, + "learning_rate": 2.747656169644941e-06, + "loss": 0.75405657, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14776611, + "step": 6598, + "time_per_iteration": 2.5287654399871826 + }, + { + "auxiliary_loss_clip": 0.06448894, + "auxiliary_loss_mlp": 0.01270917, + "balance_loss_clip": 0.06280929, + "balance_loss_mlp": 0.01257643, + "epoch": 0.3967533443559297, + "flos": 21732546366720.0, + "grad_norm": 1.6941457063111416, + "language_loss": 0.79130334, + "learning_rate": 2.747294930536157e-06, + "loss": 0.86850142, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13269043, + "step": 6599, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.06447926, + "auxiliary_loss_mlp": 0.01270436, + "balance_loss_clip": 0.06279482, + "balance_loss_mlp": 0.01254289, + "epoch": 0.39681346760859765, + "flos": 25491271219200.0, + "grad_norm": 1.7355689440790156, + "language_loss": 0.72895992, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.80614352, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.16149902, + "step": 6600, + "time_per_iteration": 2.6141197681427 + }, + { + "auxiliary_loss_clip": 0.06448444, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06280382, + "balance_loss_mlp": 0.01261045, + "epoch": 0.3968735908612656, + "flos": 20965894634880.0, + "grad_norm": 1.918502465070546, + "language_loss": 0.85902363, + "learning_rate": 2.746572367319791e-06, + "loss": 0.9362576, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13909912, + "step": 6601, + "time_per_iteration": 2.539337396621704 + }, + { + "auxiliary_loss_clip": 0.06455625, + "auxiliary_loss_mlp": 0.01273924, + "balance_loss_clip": 0.06281834, + "balance_loss_mlp": 0.0125773, + "epoch": 0.3969337141139336, + "flos": 10711684800000.0, + "grad_norm": 2.4177834123100412, + "language_loss": 0.70406669, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.78136218, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16192627, + "step": 6602, + "time_per_iteration": 2.5344958305358887 + }, + { + "auxiliary_loss_clip": 0.06450728, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.01257583, + "epoch": 0.39699383736660154, + "flos": 17597924098560.0, + "grad_norm": 4.3880896635048865, + "language_loss": 0.84332073, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.92054927, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14550781, + "step": 6603, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.06445679, + "auxiliary_loss_mlp": 0.01276756, + "balance_loss_clip": 0.06278397, + "balance_loss_mlp": 0.01263017, + "epoch": 0.3970539606192695, + "flos": 17791826446080.0, + "grad_norm": 1.5258003920697418, + "language_loss": 0.7302916, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13751221, + "step": 6604, + "time_per_iteration": 2.525475025177002 + }, + { + "auxiliary_loss_clip": 0.06437713, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.0125609, + "epoch": 0.3971140838719375, + "flos": 24796260328320.0, + "grad_norm": 1.5312177971095886, + "language_loss": 0.82809514, + "learning_rate": 2.745126901275491e-06, + "loss": 0.90516913, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.13598633, + "step": 6605, + "time_per_iteration": 2.5601069927215576 + }, + { + "auxiliary_loss_clip": 0.06439412, + "auxiliary_loss_mlp": 0.01269635, + "balance_loss_clip": 0.06274941, + "balance_loss_mlp": 0.01256337, + "epoch": 0.39717420712460544, + "flos": 24250484759040.0, + "grad_norm": 1.721474173213711, + "language_loss": 0.74617773, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.82326818, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13293457, + "step": 6606, + "time_per_iteration": 2.570338726043701 + }, + { + "auxiliary_loss_clip": 0.06450282, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06279129, + "balance_loss_mlp": 0.01255343, + "epoch": 0.3972343303772734, + "flos": 25891752608640.0, + "grad_norm": 1.7826498780228273, + "language_loss": 0.74625784, + "learning_rate": 2.744403998666805e-06, + "loss": 0.8234452, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13122559, + "step": 6607, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.0628166, + "balance_loss_mlp": 0.01257366, + "epoch": 0.39729445362994137, + "flos": 45634107525120.0, + "grad_norm": 2.013518755058626, + "language_loss": 0.68503535, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76226741, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1427002, + "step": 6608, + "time_per_iteration": 2.814741611480713 + }, + { + "auxiliary_loss_clip": 0.06453016, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06280445, + "balance_loss_mlp": 0.01256496, + "epoch": 0.39735457688260933, + "flos": 20200249152000.0, + "grad_norm": 2.238404873213265, + "language_loss": 0.74168068, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.818919, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14318848, + "step": 6609, + "time_per_iteration": 2.549020767211914 + }, + { + "auxiliary_loss_clip": 0.06450722, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06281993, + "balance_loss_mlp": 0.0125424, + "epoch": 0.3974147001352773, + "flos": 23337868014720.0, + "grad_norm": 1.4758458837885644, + "language_loss": 0.71468556, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.79187685, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14154053, + "step": 6610, + "time_per_iteration": 3.985957622528076 + }, + { + "auxiliary_loss_clip": 0.06440872, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256559, + "epoch": 0.39747482338794526, + "flos": 21694965010560.0, + "grad_norm": 1.555692262156073, + "language_loss": 0.7854501, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.86256385, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13934326, + "step": 6611, + "time_per_iteration": 2.5972208976745605 + }, + { + "auxiliary_loss_clip": 0.06447503, + "auxiliary_loss_mlp": 0.01268941, + "balance_loss_clip": 0.06280762, + "balance_loss_mlp": 0.01255065, + "epoch": 0.3975349466406133, + "flos": 30995957998080.0, + "grad_norm": 2.19308398220208, + "language_loss": 0.79606485, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.87322932, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13891602, + "step": 6612, + "time_per_iteration": 2.6106274127960205 + }, + { + "auxiliary_loss_clip": 0.0634682, + "auxiliary_loss_mlp": 0.01253265, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01250469, + "epoch": 0.39759506989328125, + "flos": 63703426366080.0, + "grad_norm": 0.8245936024085626, + "language_loss": 0.6463905, + "learning_rate": 2.742234613810459e-06, + "loss": 0.72239137, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02796936, + "step": 6613, + "time_per_iteration": 4.473678112030029 + }, + { + "auxiliary_loss_clip": 0.06450668, + "auxiliary_loss_mlp": 0.01269678, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01255367, + "epoch": 0.3976551931459492, + "flos": 23702570910720.0, + "grad_norm": 2.448614415916545, + "language_loss": 0.72596258, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80316603, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14312744, + "step": 6614, + "time_per_iteration": 2.5691444873809814 + }, + { + "auxiliary_loss_clip": 0.06449673, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256051, + "epoch": 0.3977153163986172, + "flos": 15675166056960.0, + "grad_norm": 2.2284862441621995, + "language_loss": 0.81666011, + "learning_rate": 2.741511260213862e-06, + "loss": 0.89385748, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14013672, + "step": 6615, + "time_per_iteration": 2.55078387260437 + }, + { + "auxiliary_loss_clip": 0.06452717, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01255679, + "epoch": 0.39777543965128515, + "flos": 14070012117120.0, + "grad_norm": 1.96274897748641, + "language_loss": 0.67687142, + "learning_rate": 2.741149541231434e-06, + "loss": 0.75409389, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13842773, + "step": 6616, + "time_per_iteration": 2.533982992172241 + }, + { + "auxiliary_loss_clip": 0.06455097, + "auxiliary_loss_mlp": 0.0126897, + "balance_loss_clip": 0.06281532, + "balance_loss_mlp": 0.01253986, + "epoch": 0.3978355629039531, + "flos": 23374149632640.0, + "grad_norm": 2.1811174101900552, + "language_loss": 0.8396368, + "learning_rate": 2.740787794144541e-06, + "loss": 0.91687751, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14978027, + "step": 6617, + "time_per_iteration": 3.9742090702056885 + }, + { + "auxiliary_loss_clip": 0.06446042, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06283504, + "balance_loss_mlp": 0.01255556, + "epoch": 0.3978956861566211, + "flos": 19068852597120.0, + "grad_norm": 1.7253210008214133, + "language_loss": 0.73000187, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80714333, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12536621, + "step": 6618, + "time_per_iteration": 2.562913179397583 + }, + { + "auxiliary_loss_clip": 0.06454587, + "auxiliary_loss_mlp": 0.01274299, + "balance_loss_clip": 0.06285769, + "balance_loss_mlp": 0.01258576, + "epoch": 0.39795580940928904, + "flos": 30235679176320.0, + "grad_norm": 1.6365941861062427, + "language_loss": 0.65343797, + "learning_rate": 2.740064215712231e-06, + "loss": 0.73072684, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15710449, + "step": 6619, + "time_per_iteration": 2.598667860031128 + }, + { + "auxiliary_loss_clip": 0.06341819, + "auxiliary_loss_mlp": 0.01254465, + "balance_loss_clip": 0.06266081, + "balance_loss_mlp": 0.01251738, + "epoch": 0.398015932661957, + "flos": 69867261688320.0, + "grad_norm": 0.7579483566665592, + "language_loss": 0.582268, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.65823084, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02731323, + "step": 6620, + "time_per_iteration": 4.528149604797363 + }, + { + "auxiliary_loss_clip": 0.06446633, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06280729, + "balance_loss_mlp": 0.01256858, + "epoch": 0.39807605591462497, + "flos": 20164093315200.0, + "grad_norm": 1.5024608902652035, + "language_loss": 0.79499102, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87215811, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13232422, + "step": 6621, + "time_per_iteration": 2.559305191040039 + }, + { + "auxiliary_loss_clip": 0.06445563, + "auxiliary_loss_mlp": 0.01270989, + "balance_loss_clip": 0.06279579, + "balance_loss_mlp": 0.01257435, + "epoch": 0.39813617916729294, + "flos": 21148057411200.0, + "grad_norm": 1.7591122738615637, + "language_loss": 0.78347874, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86064428, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13568115, + "step": 6622, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.06444648, + "auxiliary_loss_mlp": 0.01270694, + "balance_loss_clip": 0.06278688, + "balance_loss_mlp": 0.01255948, + "epoch": 0.3981963024199609, + "flos": 18994318790400.0, + "grad_norm": 9.51473607747463, + "language_loss": 0.75430334, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83145678, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14733887, + "step": 6623, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.06449074, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01260551, + "epoch": 0.39825642567262887, + "flos": 16579648955520.0, + "grad_norm": 1.7143371951380526, + "language_loss": 0.79926246, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.87649894, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6624, + "time_per_iteration": 2.509500026702881 + }, + { + "auxiliary_loss_clip": 0.06454292, + "auxiliary_loss_mlp": 0.01269994, + "balance_loss_clip": 0.06280515, + "balance_loss_mlp": 0.01254234, + "epoch": 0.39831654892529683, + "flos": 22206303751680.0, + "grad_norm": 2.195062259081814, + "language_loss": 0.84314877, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.92039162, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15759277, + "step": 6625, + "time_per_iteration": 2.5617175102233887 + }, + { + "auxiliary_loss_clip": 0.06446299, + "auxiliary_loss_mlp": 0.01272387, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01258517, + "epoch": 0.39837667217796485, + "flos": 10492485719040.0, + "grad_norm": 1.8250293636172175, + "language_loss": 0.8709324, + "learning_rate": 2.737530807925321e-06, + "loss": 0.94811928, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13867188, + "step": 6626, + "time_per_iteration": 2.72031307220459 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01271086, + "balance_loss_clip": 0.0627908, + "balance_loss_mlp": 0.01256531, + "epoch": 0.3984367954306328, + "flos": 17970676986240.0, + "grad_norm": 2.760632977827581, + "language_loss": 0.84402627, + "learning_rate": 2.737168780548417e-06, + "loss": 0.9212113, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14575195, + "step": 6627, + "time_per_iteration": 2.6228654384613037 + }, + { + "auxiliary_loss_clip": 0.06445234, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.0627917, + "balance_loss_mlp": 0.01255443, + "epoch": 0.3984969186833008, + "flos": 22717684419840.0, + "grad_norm": 3.2429830324928095, + "language_loss": 0.83402491, + "learning_rate": 2.736806725217998e-06, + "loss": 0.91116416, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13250732, + "step": 6628, + "time_per_iteration": 2.6287484169006348 + }, + { + "auxiliary_loss_clip": 0.06449139, + "auxiliary_loss_mlp": 0.01271852, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01256981, + "epoch": 0.39855704193596875, + "flos": 23412779164800.0, + "grad_norm": 1.5731823007903518, + "language_loss": 0.71793973, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.79514968, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14868164, + "step": 6629, + "time_per_iteration": 2.5752875804901123 + }, + { + "auxiliary_loss_clip": 0.06441505, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06280406, + "balance_loss_mlp": 0.01254834, + "epoch": 0.3986171651886367, + "flos": 21258369711360.0, + "grad_norm": 2.035566678796665, + "language_loss": 0.80905473, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.88615453, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1362915, + "step": 6630, + "time_per_iteration": 2.5329513549804688 + }, + { + "auxiliary_loss_clip": 0.06445715, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06276714, + "balance_loss_mlp": 0.01255693, + "epoch": 0.3986772884413047, + "flos": 12463642293120.0, + "grad_norm": 2.1251751047068783, + "language_loss": 0.75146663, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.82862258, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14190674, + "step": 6631, + "time_per_iteration": 2.5500082969665527 + }, + { + "auxiliary_loss_clip": 0.06448178, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279311, + "balance_loss_mlp": 0.0125505, + "epoch": 0.39873741169397264, + "flos": 19652209522560.0, + "grad_norm": 1.6915315525927903, + "language_loss": 0.71496904, + "learning_rate": 2.735358224635783e-06, + "loss": 0.79214191, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.140625, + "step": 6632, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.06444843, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255955, + "epoch": 0.3987975349466406, + "flos": 21690436890240.0, + "grad_norm": 1.8116978167005697, + "language_loss": 0.75623924, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83338219, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13494873, + "step": 6633, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.06449188, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06280442, + "balance_loss_mlp": 0.0125846, + "epoch": 0.3988576581993086, + "flos": 23920721815680.0, + "grad_norm": 1.9002609831735993, + "language_loss": 0.81678545, + "learning_rate": 2.7346338069806e-06, + "loss": 0.89400202, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14001465, + "step": 6634, + "time_per_iteration": 2.539128065109253 + }, + { + "auxiliary_loss_clip": 0.06453361, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06283009, + "balance_loss_mlp": 0.01255449, + "epoch": 0.39891778145197654, + "flos": 18155690801280.0, + "grad_norm": 1.9946050359209588, + "language_loss": 0.7547667, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.83199799, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14306641, + "step": 6635, + "time_per_iteration": 2.5426242351531982 + }, + { + "auxiliary_loss_clip": 0.06468328, + "auxiliary_loss_mlp": 0.01272826, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01256053, + "epoch": 0.3989779047046445, + "flos": 22600831501440.0, + "grad_norm": 1.9740114535883675, + "language_loss": 0.66474432, + "learning_rate": 2.733909277895868e-06, + "loss": 0.74215585, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.16760254, + "step": 6636, + "time_per_iteration": 2.5290956497192383 + }, + { + "auxiliary_loss_clip": 0.06452767, + "auxiliary_loss_mlp": 0.01270258, + "balance_loss_clip": 0.06285115, + "balance_loss_mlp": 0.01255012, + "epoch": 0.39903802795731247, + "flos": 18083043711360.0, + "grad_norm": 1.6936131920640751, + "language_loss": 0.82211542, + "learning_rate": 2.733546971601763e-06, + "loss": 0.89934564, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.15246582, + "step": 6637, + "time_per_iteration": 2.516279458999634 + }, + { + "auxiliary_loss_clip": 0.06353697, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 0.06278069, + "balance_loss_mlp": 0.01250418, + "epoch": 0.39909815120998043, + "flos": 70463238652800.0, + "grad_norm": 0.7262189478909644, + "language_loss": 0.531524, + "learning_rate": 2.733184637491484e-06, + "loss": 0.60758889, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.0236969, + "step": 6638, + "time_per_iteration": 3.2179603576660156 + }, + { + "auxiliary_loss_clip": 0.06449973, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06279011, + "balance_loss_mlp": 0.0126304, + "epoch": 0.39915827446264845, + "flos": 18554788598400.0, + "grad_norm": 1.4980640352775056, + "language_loss": 0.75670731, + "learning_rate": 2.732822275578769e-06, + "loss": 0.83398449, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14715576, + "step": 6639, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.06442601, + "auxiliary_loss_mlp": 0.01272751, + "balance_loss_clip": 0.0627881, + "balance_loss_mlp": 0.01258249, + "epoch": 0.3992183977153164, + "flos": 29904826129920.0, + "grad_norm": 2.014095124557279, + "language_loss": 0.76376802, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84092152, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1451416, + "step": 6640, + "time_per_iteration": 2.642223834991455 + }, + { + "auxiliary_loss_clip": 0.06449724, + "auxiliary_loss_mlp": 0.01270265, + "balance_loss_clip": 0.06280393, + "balance_loss_mlp": 0.01255757, + "epoch": 0.3992785209679844, + "flos": 22571677480320.0, + "grad_norm": 2.238528881986372, + "language_loss": 0.8211664, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.89836633, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14501953, + "step": 6641, + "time_per_iteration": 2.530189275741577 + }, + { + "auxiliary_loss_clip": 0.06456075, + "auxiliary_loss_mlp": 0.01270045, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01254971, + "epoch": 0.39933864422065235, + "flos": 19688784629760.0, + "grad_norm": 1.8306704082742173, + "language_loss": 0.77208257, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.84934378, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15081787, + "step": 6642, + "time_per_iteration": 2.5495219230651855 + }, + { + "auxiliary_loss_clip": 0.06453043, + "auxiliary_loss_mlp": 0.01270555, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01255564, + "epoch": 0.3993987674733203, + "flos": 23045015594880.0, + "grad_norm": 2.242078242091602, + "language_loss": 0.72883618, + "learning_rate": 2.731372550178393e-06, + "loss": 0.80607212, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.14984131, + "step": 6643, + "time_per_iteration": 2.521857500076294 + }, + { + "auxiliary_loss_clip": 0.06456347, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01259317, + "epoch": 0.3994588907259883, + "flos": 19396896531840.0, + "grad_norm": 1.7649027305896348, + "language_loss": 0.66785717, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74516022, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14642334, + "step": 6644, + "time_per_iteration": 2.571690320968628 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.0127806, + "balance_loss_clip": 0.06282313, + "balance_loss_mlp": 0.01263737, + "epoch": 0.39951901397865625, + "flos": 13739326778880.0, + "grad_norm": 1.9095077452421072, + "language_loss": 0.78757256, + "learning_rate": 2.730647521020907e-06, + "loss": 0.86489946, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14312744, + "step": 6645, + "time_per_iteration": 2.499361753463745 + }, + { + "auxiliary_loss_clip": 0.06458238, + "auxiliary_loss_mlp": 0.01274341, + "balance_loss_clip": 0.06283879, + "balance_loss_mlp": 0.01259321, + "epoch": 0.3995791372313242, + "flos": 23593181005440.0, + "grad_norm": 1.5926569767996783, + "language_loss": 0.7044934, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78181922, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15026855, + "step": 6646, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.06456489, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06284152, + "balance_loss_mlp": 0.01257103, + "epoch": 0.3996392604839922, + "flos": 21361428633600.0, + "grad_norm": 2.2667385155288917, + "language_loss": 0.72035694, + "learning_rate": 2.729922381038513e-06, + "loss": 0.79763949, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14660645, + "step": 6647, + "time_per_iteration": 2.58251953125 + }, + { + "auxiliary_loss_clip": 0.06449988, + "auxiliary_loss_mlp": 0.01272061, + "balance_loss_clip": 0.06284988, + "balance_loss_mlp": 0.01257195, + "epoch": 0.39969938373666014, + "flos": 26039604337920.0, + "grad_norm": 1.4692875023338006, + "language_loss": 0.74830031, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.82552081, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14849854, + "step": 6648, + "time_per_iteration": 2.7020201683044434 + }, + { + "auxiliary_loss_clip": 0.06453955, + "auxiliary_loss_mlp": 0.0126884, + "balance_loss_clip": 0.06283584, + "balance_loss_mlp": 0.0125472, + "epoch": 0.3997595069893281, + "flos": 20121858057600.0, + "grad_norm": 2.0106261298514907, + "language_loss": 0.65986454, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.73709244, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14117432, + "step": 6649, + "time_per_iteration": 3.9323928356170654 + }, + { + "auxiliary_loss_clip": 0.06463098, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06290667, + "balance_loss_mlp": 0.01260774, + "epoch": 0.39981963024199607, + "flos": 27791016779520.0, + "grad_norm": 1.831691866077207, + "language_loss": 0.75774682, + "learning_rate": 2.728834463508826e-06, + "loss": 0.83514905, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16357422, + "step": 6650, + "time_per_iteration": 2.6374714374542236 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01257782, + "epoch": 0.39987975349466404, + "flos": 21950864979840.0, + "grad_norm": 1.4608995971033776, + "language_loss": 0.7199676, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79724216, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14831543, + "step": 6651, + "time_per_iteration": 2.5789706707000732 + }, + { + "auxiliary_loss_clip": 0.06457064, + "auxiliary_loss_mlp": 0.01269592, + "balance_loss_clip": 0.06283179, + "balance_loss_mlp": 0.01255245, + "epoch": 0.39993987674733206, + "flos": 20710707425280.0, + "grad_norm": 1.930350074981486, + "language_loss": 0.73724478, + "learning_rate": 2.728109046945403e-06, + "loss": 0.8145113, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14331055, + "step": 6652, + "time_per_iteration": 3.9592838287353516 + }, + { + "auxiliary_loss_clip": 0.06347093, + "auxiliary_loss_mlp": 0.01255075, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01252878, + "epoch": 0.4, + "flos": 61543566397440.0, + "grad_norm": 0.8159851457251004, + "language_loss": 0.60542929, + "learning_rate": 2.727746297241862e-06, + "loss": 0.68145096, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.02201843, + "step": 6653, + "time_per_iteration": 3.0700466632843018 + }, + { + "auxiliary_loss_clip": 0.06454087, + "auxiliary_loss_mlp": 0.01272182, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.01257698, + "epoch": 0.400060123252668, + "flos": 14507655592320.0, + "grad_norm": 1.9278074838902122, + "language_loss": 0.66929328, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.74655592, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14477539, + "step": 6654, + "time_per_iteration": 2.5292413234710693 + }, + { + "auxiliary_loss_clip": 0.06457023, + "auxiliary_loss_mlp": 0.01271182, + "balance_loss_clip": 0.06287654, + "balance_loss_mlp": 0.01257396, + "epoch": 0.40012024650533595, + "flos": 19098383961600.0, + "grad_norm": 1.998304088554008, + "language_loss": 0.90550762, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98278964, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13775635, + "step": 6655, + "time_per_iteration": 2.529496192932129 + }, + { + "auxiliary_loss_clip": 0.06450539, + "auxiliary_loss_mlp": 0.012675, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01254899, + "epoch": 0.4001803697580039, + "flos": 29358673217280.0, + "grad_norm": 1.6559902316252946, + "language_loss": 0.73729336, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.81447375, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1260376, + "step": 6656, + "time_per_iteration": 4.062687158584595 + }, + { + "auxiliary_loss_clip": 0.0645894, + "auxiliary_loss_mlp": 0.01271003, + "balance_loss_clip": 0.06288408, + "balance_loss_mlp": 0.01255696, + "epoch": 0.4002404930106719, + "flos": 20925839583360.0, + "grad_norm": 1.4738199157728433, + "language_loss": 0.73207194, + "learning_rate": 2.726295022603144e-06, + "loss": 0.80937135, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15307617, + "step": 6657, + "time_per_iteration": 2.5996904373168945 + }, + { + "auxiliary_loss_clip": 0.06458808, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06288153, + "balance_loss_mlp": 0.01256506, + "epoch": 0.40030061626333985, + "flos": 28413799850880.0, + "grad_norm": 1.489557881553797, + "language_loss": 0.79247761, + "learning_rate": 2.725932135056117e-06, + "loss": 0.86978424, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15350342, + "step": 6658, + "time_per_iteration": 2.7172279357910156 + }, + { + "auxiliary_loss_clip": 0.06459276, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06289512, + "balance_loss_mlp": 0.01264084, + "epoch": 0.4003607395160078, + "flos": 25928746986240.0, + "grad_norm": 2.1209995886317956, + "language_loss": 0.77640641, + "learning_rate": 2.72556921998167e-06, + "loss": 0.85378039, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14050293, + "step": 6659, + "time_per_iteration": 4.3210484981536865 + }, + { + "auxiliary_loss_clip": 0.06450686, + "auxiliary_loss_mlp": 0.01279792, + "balance_loss_clip": 0.06291049, + "balance_loss_mlp": 0.01267442, + "epoch": 0.4004208627686758, + "flos": 20773501660800.0, + "grad_norm": 1.7380110296153854, + "language_loss": 0.73432875, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.81163359, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.12359619, + "step": 6660, + "time_per_iteration": 2.668088436126709 + }, + { + "auxiliary_loss_clip": 0.06457424, + "auxiliary_loss_mlp": 0.01270844, + "balance_loss_clip": 0.06287603, + "balance_loss_mlp": 0.01258077, + "epoch": 0.40048098602134374, + "flos": 24688170161280.0, + "grad_norm": 2.131845423391088, + "language_loss": 0.71318859, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.79047126, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.12786865, + "step": 6661, + "time_per_iteration": 2.5673065185546875 + }, + { + "auxiliary_loss_clip": 0.06462744, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_clip": 0.06291083, + "balance_loss_mlp": 0.01257889, + "epoch": 0.4005411092740117, + "flos": 23192448053760.0, + "grad_norm": 1.7831816831822005, + "language_loss": 0.75751495, + "learning_rate": 2.724480309731437e-06, + "loss": 0.83486485, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14361572, + "step": 6662, + "time_per_iteration": 2.5870559215545654 + }, + { + "auxiliary_loss_clip": 0.06461672, + "auxiliary_loss_mlp": 0.01271183, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4006012325266797, + "flos": 17526786382080.0, + "grad_norm": 2.241735466255753, + "language_loss": 0.66247231, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.73980081, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15014648, + "step": 6663, + "time_per_iteration": 2.5879623889923096 + }, + { + "auxiliary_loss_clip": 0.06461117, + "auxiliary_loss_mlp": 0.01271573, + "balance_loss_clip": 0.06290103, + "balance_loss_mlp": 0.01257316, + "epoch": 0.40066135577934764, + "flos": 19862016946560.0, + "grad_norm": 2.129058070747091, + "language_loss": 0.86377645, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.94110334, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14251709, + "step": 6664, + "time_per_iteration": 2.580240249633789 + }, + { + "auxiliary_loss_clip": 0.06459028, + "auxiliary_loss_mlp": 0.01272821, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01259064, + "epoch": 0.40072147903201566, + "flos": 18155816582400.0, + "grad_norm": 1.9805392577959038, + "language_loss": 0.84895325, + "learning_rate": 2.723391152229917e-06, + "loss": 0.92627168, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13751221, + "step": 6665, + "time_per_iteration": 2.50386381149292 + }, + { + "auxiliary_loss_clip": 0.06457423, + "auxiliary_loss_mlp": 0.01268968, + "balance_loss_clip": 0.06286919, + "balance_loss_mlp": 0.0125458, + "epoch": 0.4007816022846836, + "flos": 18667239177600.0, + "grad_norm": 1.826402815553393, + "language_loss": 0.78598213, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.86324608, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14404297, + "step": 6666, + "time_per_iteration": 2.5133461952209473 + }, + { + "auxiliary_loss_clip": 0.06465514, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06295928, + "balance_loss_mlp": 0.01253834, + "epoch": 0.4008417255373516, + "flos": 25710344519040.0, + "grad_norm": 1.8943268651740763, + "language_loss": 0.74139559, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.81873906, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14990234, + "step": 6667, + "time_per_iteration": 2.635195732116699 + }, + { + "auxiliary_loss_clip": 0.06460091, + "auxiliary_loss_mlp": 0.01273802, + "balance_loss_clip": 0.06287248, + "balance_loss_mlp": 0.01258519, + "epoch": 0.40090184879001955, + "flos": 22865536149120.0, + "grad_norm": 1.4912552700664468, + "language_loss": 0.75818384, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.83552277, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15270996, + "step": 6668, + "time_per_iteration": 2.567748546600342 + }, + { + "auxiliary_loss_clip": 0.06454465, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0628936, + "balance_loss_mlp": 0.01258572, + "epoch": 0.4009619720426875, + "flos": 29067581733120.0, + "grad_norm": 1.8066450616757106, + "language_loss": 0.82171971, + "learning_rate": 2.721938558257248e-06, + "loss": 0.89899051, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14050293, + "step": 6669, + "time_per_iteration": 2.614875555038452 + }, + { + "auxiliary_loss_clip": 0.06349576, + "auxiliary_loss_mlp": 0.01259788, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01257549, + "epoch": 0.4010220952953555, + "flos": 66080347136640.0, + "grad_norm": 0.6837113267664942, + "language_loss": 0.53268963, + "learning_rate": 2.721575341289695e-06, + "loss": 0.60878325, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.02243042, + "step": 6670, + "time_per_iteration": 3.2985219955444336 + }, + { + "auxiliary_loss_clip": 0.06453651, + "auxiliary_loss_mlp": 0.01274966, + "balance_loss_clip": 0.06286684, + "balance_loss_mlp": 0.01260405, + "epoch": 0.40108221854802345, + "flos": 29650519388160.0, + "grad_norm": 1.6370315093264123, + "language_loss": 0.88528681, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.96257305, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14556885, + "step": 6671, + "time_per_iteration": 2.6268246173858643 + }, + { + "auxiliary_loss_clip": 0.06460971, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06289764, + "balance_loss_mlp": 0.01256861, + "epoch": 0.4011423418006914, + "flos": 19934286693120.0, + "grad_norm": 1.7015153377224497, + "language_loss": 0.78868973, + "learning_rate": 2.720848825281736e-06, + "loss": 0.86602008, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.1519165, + "step": 6672, + "time_per_iteration": 2.4949698448181152 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06290099, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4012024650533594, + "flos": 20090523830400.0, + "grad_norm": 2.076088840896174, + "language_loss": 0.63474464, + "learning_rate": 2.72048552626888e-06, + "loss": 0.71204633, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1418457, + "step": 6673, + "time_per_iteration": 2.644050121307373 + }, + { + "auxiliary_loss_clip": 0.06458048, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01259827, + "epoch": 0.40126258830602735, + "flos": 21703224637440.0, + "grad_norm": 1.4478595936596839, + "language_loss": 0.80581552, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.88313353, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.13903809, + "step": 6674, + "time_per_iteration": 2.559034824371338 + }, + { + "auxiliary_loss_clip": 0.0646532, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_clip": 0.06289816, + "balance_loss_mlp": 0.01258269, + "epoch": 0.4013227115586953, + "flos": 12025160277120.0, + "grad_norm": 2.4455561687367195, + "language_loss": 0.82561237, + "learning_rate": 2.719758846294294e-06, + "loss": 0.90299457, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14624023, + "step": 6675, + "time_per_iteration": 2.5448951721191406 + }, + { + "auxiliary_loss_clip": 0.06465134, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01254106, + "epoch": 0.4013828348113633, + "flos": 25454612257920.0, + "grad_norm": 1.6408733853472015, + "language_loss": 0.93777156, + "learning_rate": 2.71939546536012e-06, + "loss": 1.01511002, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14581299, + "step": 6676, + "time_per_iteration": 2.5721349716186523 + }, + { + "auxiliary_loss_clip": 0.06469207, + "auxiliary_loss_mlp": 0.01274451, + "balance_loss_clip": 0.06291738, + "balance_loss_mlp": 0.01258274, + "epoch": 0.40144295806403124, + "flos": 18588009542400.0, + "grad_norm": 2.5026106137632222, + "language_loss": 0.80060673, + "learning_rate": 2.719032057146399e-06, + "loss": 0.87804335, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16186523, + "step": 6677, + "time_per_iteration": 2.5438191890716553 + }, + { + "auxiliary_loss_clip": 0.06455022, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.0628567, + "balance_loss_mlp": 0.01256934, + "epoch": 0.4015030813166992, + "flos": 22936925427840.0, + "grad_norm": 1.8567640541952835, + "language_loss": 0.83925951, + "learning_rate": 2.71866862166691e-06, + "loss": 0.9165169, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13793945, + "step": 6678, + "time_per_iteration": 2.5458457469940186 + }, + { + "auxiliary_loss_clip": 0.06455562, + "auxiliary_loss_mlp": 0.0127344, + "balance_loss_clip": 0.06287661, + "balance_loss_mlp": 0.01258325, + "epoch": 0.4015632045693672, + "flos": 20601359447040.0, + "grad_norm": 2.2595275456436767, + "language_loss": 0.6400671, + "learning_rate": 2.718305158935434e-06, + "loss": 0.7173571, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15124512, + "step": 6679, + "time_per_iteration": 2.553312063217163 + }, + { + "auxiliary_loss_clip": 0.0645475, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.06285992, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4016233278220352, + "flos": 23445371203200.0, + "grad_norm": 1.525723625053638, + "language_loss": 0.78686285, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.86411297, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14001465, + "step": 6680, + "time_per_iteration": 2.5376389026641846 + }, + { + "auxiliary_loss_clip": 0.0646753, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06289258, + "balance_loss_mlp": 0.01258008, + "epoch": 0.40168345107470316, + "flos": 21436968689280.0, + "grad_norm": 1.5038657697958466, + "language_loss": 0.76059246, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.83800501, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15710449, + "step": 6681, + "time_per_iteration": 2.532668352127075 + }, + { + "auxiliary_loss_clip": 0.06461542, + "auxiliary_loss_mlp": 0.01268459, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01254285, + "epoch": 0.4017435743273711, + "flos": 22863900994560.0, + "grad_norm": 2.212326324471445, + "language_loss": 0.6446861, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.72198606, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1416626, + "step": 6682, + "time_per_iteration": 2.585963010787964 + }, + { + "auxiliary_loss_clip": 0.06452938, + "auxiliary_loss_mlp": 0.01271302, + "balance_loss_clip": 0.06279296, + "balance_loss_mlp": 0.01257288, + "epoch": 0.4018036975800391, + "flos": 28630022112000.0, + "grad_norm": 1.839007150843812, + "language_loss": 0.73340857, + "learning_rate": 2.716851035765337e-06, + "loss": 0.81065094, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14013672, + "step": 6683, + "time_per_iteration": 2.5977652072906494 + }, + { + "auxiliary_loss_clip": 0.06452199, + "auxiliary_loss_mlp": 0.01270902, + "balance_loss_clip": 0.0628196, + "balance_loss_mlp": 0.01257252, + "epoch": 0.40186382083270705, + "flos": 26658446267520.0, + "grad_norm": 1.545951486041889, + "language_loss": 0.73326242, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.81049347, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13671875, + "step": 6684, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06341122, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06265609, + "balance_loss_mlp": 0.01260683, + "epoch": 0.401923944085375, + "flos": 59277167562240.0, + "grad_norm": 0.7966859396902427, + "language_loss": 0.60515714, + "learning_rate": 2.716123811026767e-06, + "loss": 0.68120408, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.02891541, + "step": 6685, + "time_per_iteration": 3.2738587856292725 + }, + { + "auxiliary_loss_clip": 0.06456321, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 0.06278493, + "balance_loss_mlp": 0.01255291, + "epoch": 0.401984067338043, + "flos": 16988473825920.0, + "grad_norm": 1.7615677724791905, + "language_loss": 0.70125616, + "learning_rate": 2.715760157917357e-06, + "loss": 0.77851576, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14343262, + "step": 6686, + "time_per_iteration": 2.565185070037842 + }, + { + "auxiliary_loss_clip": 0.06450202, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125554, + "epoch": 0.40204419059071095, + "flos": 24979387426560.0, + "grad_norm": 1.3440220766592053, + "language_loss": 0.74867636, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.82586932, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13549805, + "step": 6687, + "time_per_iteration": 2.6009433269500732 + }, + { + "auxiliary_loss_clip": 0.06451625, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06281097, + "balance_loss_mlp": 0.01261164, + "epoch": 0.4021043138433789, + "flos": 23484252297600.0, + "grad_norm": 1.7565801002117698, + "language_loss": 0.71198428, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.78925073, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13873291, + "step": 6688, + "time_per_iteration": 3.9550609588623047 + }, + { + "auxiliary_loss_clip": 0.06455014, + "auxiliary_loss_mlp": 0.01278979, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263506, + "epoch": 0.4021644370960469, + "flos": 26003155011840.0, + "grad_norm": 1.6503070586239919, + "language_loss": 0.64854121, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.7258811, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15478516, + "step": 6689, + "time_per_iteration": 2.552058458328247 + }, + { + "auxiliary_loss_clip": 0.06450799, + "auxiliary_loss_mlp": 0.01267992, + "balance_loss_clip": 0.06276366, + "balance_loss_mlp": 0.0125417, + "epoch": 0.40222456034871484, + "flos": 13592816714880.0, + "grad_norm": 1.9543405887805447, + "language_loss": 0.73594153, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.81312943, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13824463, + "step": 6690, + "time_per_iteration": 2.5484251976013184 + }, + { + "auxiliary_loss_clip": 0.06448495, + "auxiliary_loss_mlp": 0.0127057, + "balance_loss_clip": 0.06278096, + "balance_loss_mlp": 0.01256682, + "epoch": 0.4022846836013828, + "flos": 24284586170880.0, + "grad_norm": 1.722227920192768, + "language_loss": 0.74861401, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.82580471, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13903809, + "step": 6691, + "time_per_iteration": 3.9708051681518555 + }, + { + "auxiliary_loss_clip": 0.06451076, + "auxiliary_loss_mlp": 0.01277672, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01262151, + "epoch": 0.40234480685405083, + "flos": 20156881864320.0, + "grad_norm": 1.7761891830354823, + "language_loss": 0.72677463, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.80406213, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15515137, + "step": 6692, + "time_per_iteration": 2.5179357528686523 + }, + { + "auxiliary_loss_clip": 0.06447224, + "auxiliary_loss_mlp": 0.01270814, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.0125664, + "epoch": 0.4024049301067188, + "flos": 22936925427840.0, + "grad_norm": 1.7625804596819372, + "language_loss": 0.8401857, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.91736615, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1418457, + "step": 6693, + "time_per_iteration": 2.707941770553589 + }, + { + "auxiliary_loss_clip": 0.06452498, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06281643, + "balance_loss_mlp": 0.01252865, + "epoch": 0.40246505335938676, + "flos": 36037285297920.0, + "grad_norm": 1.8844808694168769, + "language_loss": 0.70966387, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.78685182, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13433838, + "step": 6694, + "time_per_iteration": 2.637481927871704 + }, + { + "auxiliary_loss_clip": 0.06444509, + "auxiliary_loss_mlp": 0.01272964, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01258552, + "epoch": 0.4025251766120547, + "flos": 20600478979200.0, + "grad_norm": 1.9746374404018712, + "language_loss": 0.68475246, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.76192719, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14428711, + "step": 6695, + "time_per_iteration": 3.9740405082702637 + }, + { + "auxiliary_loss_clip": 0.06446315, + "auxiliary_loss_mlp": 0.01270396, + "balance_loss_clip": 0.06276862, + "balance_loss_mlp": 0.01256484, + "epoch": 0.4025852998647227, + "flos": 64537582890240.0, + "grad_norm": 2.0865884556399363, + "language_loss": 0.79765463, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.87482178, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13897705, + "step": 6696, + "time_per_iteration": 3.0413708686828613 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.01269123, + "balance_loss_clip": 0.06281278, + "balance_loss_mlp": 0.01254473, + "epoch": 0.40264542311739066, + "flos": 20892534785280.0, + "grad_norm": 1.7976365729577468, + "language_loss": 0.71608603, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.79331958, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14660645, + "step": 6697, + "time_per_iteration": 2.5200350284576416 + }, + { + "auxiliary_loss_clip": 0.06445032, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06276169, + "balance_loss_mlp": 0.0125658, + "epoch": 0.4027055463700586, + "flos": 26257419826560.0, + "grad_norm": 1.9918981514977272, + "language_loss": 0.61230171, + "learning_rate": 2.711394207496984e-06, + "loss": 0.68945277, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13464355, + "step": 6698, + "time_per_iteration": 2.576472520828247 + }, + { + "auxiliary_loss_clip": 0.06449181, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06276856, + "balance_loss_mlp": 0.01252849, + "epoch": 0.4027656696227266, + "flos": 20637682992000.0, + "grad_norm": 2.0070875825685266, + "language_loss": 0.77479243, + "learning_rate": 2.711030202621491e-06, + "loss": 0.85195273, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14001465, + "step": 6699, + "time_per_iteration": 3.937375545501709 + }, + { + "auxiliary_loss_clip": 0.0644554, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01253977, + "epoch": 0.40282579287539455, + "flos": 22352855742720.0, + "grad_norm": 1.735185416550665, + "language_loss": 0.80698907, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.88412201, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13793945, + "step": 6700, + "time_per_iteration": 2.535510540008545 + }, + { + "auxiliary_loss_clip": 0.06459837, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01253157, + "epoch": 0.4028859161280625, + "flos": 29282126912640.0, + "grad_norm": 1.7653471156752092, + "language_loss": 0.74938649, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.82666814, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.1517334, + "step": 6701, + "time_per_iteration": 2.6509363651275635 + }, + { + "auxiliary_loss_clip": 0.06451308, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01257329, + "epoch": 0.4029460393807305, + "flos": 28630022112000.0, + "grad_norm": 1.48917022125432, + "language_loss": 0.66283298, + "learning_rate": 2.709938026276208e-06, + "loss": 0.74005556, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13641357, + "step": 6702, + "time_per_iteration": 2.6183536052703857 + }, + { + "auxiliary_loss_clip": 0.06460792, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06286055, + "balance_loss_mlp": 0.01259117, + "epoch": 0.40300616263339845, + "flos": 22608588003840.0, + "grad_norm": 1.5996325972429297, + "language_loss": 0.66632348, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.74367112, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.14849854, + "step": 6703, + "time_per_iteration": 2.583040237426758 + }, + { + "auxiliary_loss_clip": 0.06456298, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.012584, + "epoch": 0.4030662858860664, + "flos": 25527385128960.0, + "grad_norm": 1.7345540067512994, + "language_loss": 0.82398093, + "learning_rate": 2.709209774085071e-06, + "loss": 0.90127754, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14959717, + "step": 6704, + "time_per_iteration": 2.564052104949951 + }, + { + "auxiliary_loss_clip": 0.06457714, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01258332, + "epoch": 0.40312640913873443, + "flos": 23593474494720.0, + "grad_norm": 1.6434462448941187, + "language_loss": 0.73919153, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.81649286, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 6705, + "time_per_iteration": 2.609738349914551 + }, + { + "auxiliary_loss_clip": 0.06450006, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06282469, + "balance_loss_mlp": 0.01256481, + "epoch": 0.4031865323914024, + "flos": 20017205907840.0, + "grad_norm": 1.6242014521871173, + "language_loss": 0.66795284, + "learning_rate": 2.708481414320713e-06, + "loss": 0.74515378, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1361084, + "step": 6706, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.06452154, + "auxiliary_loss_mlp": 0.01268976, + "balance_loss_clip": 0.06282388, + "balance_loss_mlp": 0.0125513, + "epoch": 0.40324665564407036, + "flos": 21877840546560.0, + "grad_norm": 1.6449246324910813, + "language_loss": 0.71481538, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.79202664, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13842773, + "step": 6707, + "time_per_iteration": 2.5762581825256348 + }, + { + "auxiliary_loss_clip": 0.0644149, + "auxiliary_loss_mlp": 0.01271296, + "balance_loss_clip": 0.06278867, + "balance_loss_mlp": 0.01258379, + "epoch": 0.4033067788967383, + "flos": 23885572227840.0, + "grad_norm": 1.6148090336243837, + "language_loss": 0.80062628, + "learning_rate": 2.707752947093611e-06, + "loss": 0.87775409, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12908936, + "step": 6708, + "time_per_iteration": 2.5509586334228516 + }, + { + "auxiliary_loss_clip": 0.06459241, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.0628079, + "balance_loss_mlp": 0.01256133, + "epoch": 0.4033669021494063, + "flos": 17425530322560.0, + "grad_norm": 2.5431099630067435, + "language_loss": 0.8334195, + "learning_rate": 2.70738867321606e-06, + "loss": 0.91072428, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15100098, + "step": 6709, + "time_per_iteration": 2.5844790935516357 + }, + { + "auxiliary_loss_clip": 0.06454608, + "auxiliary_loss_mlp": 0.01274744, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01259211, + "epoch": 0.40342702540207426, + "flos": 29607277881600.0, + "grad_norm": 1.5307534200842645, + "language_loss": 0.71642667, + "learning_rate": 2.70702437251426e-06, + "loss": 0.79372019, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15527344, + "step": 6710, + "time_per_iteration": 2.5950214862823486 + }, + { + "auxiliary_loss_clip": 0.06448973, + "auxiliary_loss_mlp": 0.01270551, + "balance_loss_clip": 0.06280518, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4034871486547422, + "flos": 11288249544960.0, + "grad_norm": 5.632076524924719, + "language_loss": 0.85771239, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.93490767, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1451416, + "step": 6711, + "time_per_iteration": 2.530691146850586 + }, + { + "auxiliary_loss_clip": 0.06457499, + "auxiliary_loss_mlp": 0.01273198, + "balance_loss_clip": 0.0628542, + "balance_loss_mlp": 0.01258732, + "epoch": 0.4035472719074102, + "flos": 15557097254400.0, + "grad_norm": 2.360012043566648, + "language_loss": 0.76516247, + "learning_rate": 2.706295690693168e-06, + "loss": 0.84246945, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14471436, + "step": 6712, + "time_per_iteration": 2.485973358154297 + }, + { + "auxiliary_loss_clip": 0.06453355, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06282951, + "balance_loss_mlp": 0.01256249, + "epoch": 0.40360739516007815, + "flos": 24680162096640.0, + "grad_norm": 2.2673991582834803, + "language_loss": 0.80280489, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.88004464, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14379883, + "step": 6713, + "time_per_iteration": 2.604844093322754 + }, + { + "auxiliary_loss_clip": 0.06452335, + "auxiliary_loss_mlp": 0.01272867, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01258824, + "epoch": 0.4036675184127461, + "flos": 17308635477120.0, + "grad_norm": 2.487123438751718, + "language_loss": 0.88458717, + "learning_rate": 2.705566901740865e-06, + "loss": 0.9618392, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14038086, + "step": 6714, + "time_per_iteration": 2.4827568531036377 + }, + { + "auxiliary_loss_clip": 0.06454237, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06281483, + "balance_loss_mlp": 0.01254011, + "epoch": 0.4037276416654141, + "flos": 19869983084160.0, + "grad_norm": 1.5212273970247687, + "language_loss": 0.69752967, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.77475452, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14233398, + "step": 6715, + "time_per_iteration": 2.5602893829345703 + }, + { + "auxiliary_loss_clip": 0.06458366, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.0125541, + "epoch": 0.40378776491808205, + "flos": 18302158938240.0, + "grad_norm": 1.8718399277124913, + "language_loss": 0.78095776, + "learning_rate": 2.704838005767892e-06, + "loss": 0.85823905, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14367676, + "step": 6716, + "time_per_iteration": 2.4911210536956787 + }, + { + "auxiliary_loss_clip": 0.06449929, + "auxiliary_loss_mlp": 0.01275524, + "balance_loss_clip": 0.0628348, + "balance_loss_mlp": 0.01262185, + "epoch": 0.40384788817075, + "flos": 15054772826880.0, + "grad_norm": 1.8985450182353327, + "language_loss": 0.76491797, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.8421725, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 6717, + "time_per_iteration": 2.5457956790924072 + }, + { + "auxiliary_loss_clip": 0.0634857, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01254165, + "epoch": 0.40390801142341803, + "flos": 61948659761280.0, + "grad_norm": 0.8842261639057883, + "language_loss": 0.60140264, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.67745787, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02790833, + "step": 6718, + "time_per_iteration": 2.9733822345733643 + }, + { + "auxiliary_loss_clip": 0.06457312, + "auxiliary_loss_mlp": 0.0127584, + "balance_loss_clip": 0.06279647, + "balance_loss_mlp": 0.01260366, + "epoch": 0.403968134676086, + "flos": 22743945475200.0, + "grad_norm": 1.799198719667369, + "language_loss": 0.75286412, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.83019567, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15490723, + "step": 6719, + "time_per_iteration": 2.5417115688323975 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01269672, + "balance_loss_clip": 0.06282561, + "balance_loss_mlp": 0.01254592, + "epoch": 0.40402825792875396, + "flos": 19789244075520.0, + "grad_norm": 2.1951890128687257, + "language_loss": 0.81351668, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.89075512, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15100098, + "step": 6720, + "time_per_iteration": 2.4906880855560303 + }, + { + "auxiliary_loss_clip": 0.06453006, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01254742, + "epoch": 0.40408838118142193, + "flos": 19615298999040.0, + "grad_norm": 1.8273574705972042, + "language_loss": 0.77227581, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.84950233, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14904785, + "step": 6721, + "time_per_iteration": 2.5645196437835693 + }, + { + "auxiliary_loss_clip": 0.06447627, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06279851, + "balance_loss_mlp": 0.01251931, + "epoch": 0.4041485044340899, + "flos": 24432982951680.0, + "grad_norm": 1.7503779333013576, + "language_loss": 0.72784024, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.80496466, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.12884521, + "step": 6722, + "time_per_iteration": 2.5520758628845215 + }, + { + "auxiliary_loss_clip": 0.06450947, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01256972, + "epoch": 0.40420862768675786, + "flos": 16765207822080.0, + "grad_norm": 1.6533819858806273, + "language_loss": 0.65986466, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.73707551, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13165283, + "step": 6723, + "time_per_iteration": 2.5385141372680664 + }, + { + "auxiliary_loss_clip": 0.06456833, + "auxiliary_loss_mlp": 0.01276273, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01262039, + "epoch": 0.4042687509394258, + "flos": 22498066068480.0, + "grad_norm": 1.4281101192387737, + "language_loss": 0.74082482, + "learning_rate": 2.701921353880734e-06, + "loss": 0.81815588, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14227295, + "step": 6724, + "time_per_iteration": 2.5705087184906006 + }, + { + "auxiliary_loss_clip": 0.06445859, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06280795, + "balance_loss_mlp": 0.01256226, + "epoch": 0.4043288741920938, + "flos": 30343978978560.0, + "grad_norm": 1.716107680872733, + "language_loss": 0.75255632, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.8297112, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13409424, + "step": 6725, + "time_per_iteration": 2.6433653831481934 + }, + { + "auxiliary_loss_clip": 0.06451583, + "auxiliary_loss_mlp": 0.01271794, + "balance_loss_clip": 0.06282748, + "balance_loss_mlp": 0.01257054, + "epoch": 0.40438899744476176, + "flos": 46357978947840.0, + "grad_norm": 1.593616701788039, + "language_loss": 0.77198207, + "learning_rate": 2.701191924463126e-06, + "loss": 0.84921581, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14733887, + "step": 6726, + "time_per_iteration": 2.8469409942626953 + }, + { + "auxiliary_loss_clip": 0.06452948, + "auxiliary_loss_mlp": 0.0127047, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01256058, + "epoch": 0.4044491206974297, + "flos": 13338468046080.0, + "grad_norm": 2.072990787427281, + "language_loss": 0.82297921, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.90021348, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14404297, + "step": 6727, + "time_per_iteration": 2.5381619930267334 + }, + { + "auxiliary_loss_clip": 0.06453642, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06281026, + "balance_loss_mlp": 0.01252413, + "epoch": 0.4045092439500977, + "flos": 12098603980800.0, + "grad_norm": 2.0199249210029055, + "language_loss": 0.86119437, + "learning_rate": 2.700462388688447e-06, + "loss": 0.93839324, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13830566, + "step": 6728, + "time_per_iteration": 3.903547763824463 + }, + { + "auxiliary_loss_clip": 0.06450571, + "auxiliary_loss_mlp": 0.01275259, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01260567, + "epoch": 0.40456936720276565, + "flos": 21186225745920.0, + "grad_norm": 1.6307737524107195, + "language_loss": 0.82346553, + "learning_rate": 2.700097580951786e-06, + "loss": 0.90072381, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14697266, + "step": 6729, + "time_per_iteration": 2.5673158168792725 + }, + { + "auxiliary_loss_clip": 0.06454299, + "auxiliary_loss_mlp": 0.01268394, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4046294904554336, + "flos": 23922147335040.0, + "grad_norm": 1.7857320211804986, + "language_loss": 0.73840159, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.81562853, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14630127, + "step": 6730, + "time_per_iteration": 4.11122727394104 + }, + { + "auxiliary_loss_clip": 0.0645189, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06282154, + "balance_loss_mlp": 0.01254767, + "epoch": 0.4046896137081016, + "flos": 38080376202240.0, + "grad_norm": 1.7383158082611918, + "language_loss": 0.67290312, + "learning_rate": 2.699367885848985e-06, + "loss": 0.75010884, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13922119, + "step": 6731, + "time_per_iteration": 2.8046634197235107 + }, + { + "auxiliary_loss_clip": 0.06450266, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.0628126, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4047497369607696, + "flos": 23623047786240.0, + "grad_norm": 1.7716081402001673, + "language_loss": 0.74489558, + "learning_rate": 2.699002998510517e-06, + "loss": 0.8220998, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13977051, + "step": 6732, + "time_per_iteration": 2.608191728591919 + }, + { + "auxiliary_loss_clip": 0.06450449, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.06283008, + "balance_loss_mlp": 0.01255978, + "epoch": 0.40480986021343757, + "flos": 12828596751360.0, + "grad_norm": 1.6538752037468725, + "language_loss": 0.77253687, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.84973502, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13391113, + "step": 6733, + "time_per_iteration": 2.525399923324585 + }, + { + "auxiliary_loss_clip": 0.06457898, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.06280859, + "balance_loss_mlp": 0.01255176, + "epoch": 0.40486998346610553, + "flos": 23775511489920.0, + "grad_norm": 4.637374264151728, + "language_loss": 0.76891112, + "learning_rate": 2.698273144328627e-06, + "loss": 0.84619832, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15661621, + "step": 6734, + "time_per_iteration": 4.040409564971924 + }, + { + "auxiliary_loss_clip": 0.06455547, + "auxiliary_loss_mlp": 0.01267949, + "balance_loss_clip": 0.0627891, + "balance_loss_mlp": 0.0125421, + "epoch": 0.4049301067187735, + "flos": 22863439797120.0, + "grad_norm": 2.24732512167567, + "language_loss": 0.64935613, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.72659111, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.13745117, + "step": 6735, + "time_per_iteration": 2.5326993465423584 + }, + { + "auxiliary_loss_clip": 0.06448689, + "auxiliary_loss_mlp": 0.01271873, + "balance_loss_clip": 0.06279301, + "balance_loss_mlp": 0.01258849, + "epoch": 0.40499022997144146, + "flos": 22790624999040.0, + "grad_norm": 1.962844708798157, + "language_loss": 0.83769405, + "learning_rate": 2.697543184232387e-06, + "loss": 0.91489971, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13024902, + "step": 6736, + "time_per_iteration": 2.5863215923309326 + }, + { + "auxiliary_loss_clip": 0.06454039, + "auxiliary_loss_mlp": 0.01271412, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01256832, + "epoch": 0.4050503532241094, + "flos": 23046021843840.0, + "grad_norm": 1.714368942149708, + "language_loss": 0.75428641, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.83154088, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14569092, + "step": 6737, + "time_per_iteration": 2.6163716316223145 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01271121, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01257644, + "epoch": 0.4051104764767774, + "flos": 16652254118400.0, + "grad_norm": 4.810644037565116, + "language_loss": 0.72306561, + "learning_rate": 2.696813118332519e-06, + "loss": 0.80026174, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13470459, + "step": 6738, + "time_per_iteration": 4.0618274211883545 + }, + { + "auxiliary_loss_clip": 0.06449332, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.06280854, + "balance_loss_mlp": 0.01257399, + "epoch": 0.40517059972944536, + "flos": 16363929818880.0, + "grad_norm": 1.8147061411614016, + "language_loss": 0.75123262, + "learning_rate": 2.696448045740828e-06, + "loss": 0.82842994, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13000488, + "step": 6739, + "time_per_iteration": 2.489001512527466 + }, + { + "auxiliary_loss_clip": 0.06454495, + "auxiliary_loss_mlp": 0.0126968, + "balance_loss_clip": 0.06282163, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4052307229821133, + "flos": 28810885150080.0, + "grad_norm": 1.87280601387568, + "language_loss": 0.74278009, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.82002187, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14257812, + "step": 6740, + "time_per_iteration": 2.616560220718384 + }, + { + "auxiliary_loss_clip": 0.0644789, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01257076, + "epoch": 0.4052908462347813, + "flos": 21404334723840.0, + "grad_norm": 1.6527814212000655, + "language_loss": 0.77083528, + "learning_rate": 2.695717821343153e-06, + "loss": 0.84802353, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1385498, + "step": 6741, + "time_per_iteration": 2.5236477851867676 + }, + { + "auxiliary_loss_clip": 0.06449165, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 0.06278783, + "balance_loss_mlp": 0.01259606, + "epoch": 0.40535096948744925, + "flos": 22425628613760.0, + "grad_norm": 1.6285650306233073, + "language_loss": 0.7166388, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.79387373, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1472168, + "step": 6742, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.06454468, + "auxiliary_loss_mlp": 0.01273335, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01258016, + "epoch": 0.4054110927401172, + "flos": 17015028370560.0, + "grad_norm": 2.751799665484638, + "language_loss": 0.73206228, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.80934024, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15332031, + "step": 6743, + "time_per_iteration": 2.519907236099243 + }, + { + "auxiliary_loss_clip": 0.0645441, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06280394, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4054712159927852, + "flos": 21621018182400.0, + "grad_norm": 2.0068914143371623, + "language_loss": 0.7128458, + "learning_rate": 2.694622286918588e-06, + "loss": 0.79011208, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14733887, + "step": 6744, + "time_per_iteration": 2.641242742538452 + }, + { + "auxiliary_loss_clip": 0.06447047, + "auxiliary_loss_mlp": 0.01269556, + "balance_loss_clip": 0.06280165, + "balance_loss_mlp": 0.01255722, + "epoch": 0.4055313392454532, + "flos": 25819734424320.0, + "grad_norm": 1.5431481906112547, + "language_loss": 0.80460721, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.88177323, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13830566, + "step": 6745, + "time_per_iteration": 2.563445806503296 + }, + { + "auxiliary_loss_clip": 0.06450857, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06282623, + "balance_loss_mlp": 0.01255009, + "epoch": 0.40559146249812117, + "flos": 14142323790720.0, + "grad_norm": 1.9690336991849304, + "language_loss": 0.67176485, + "learning_rate": 2.693891798911731e-06, + "loss": 0.74896801, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14465332, + "step": 6746, + "time_per_iteration": 2.532186508178711 + }, + { + "auxiliary_loss_clip": 0.064533, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06283557, + "balance_loss_mlp": 0.01253272, + "epoch": 0.40565158575078913, + "flos": 41365259815680.0, + "grad_norm": 1.4380414737187444, + "language_loss": 0.57222033, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.64941883, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1328125, + "step": 6747, + "time_per_iteration": 2.7487149238586426 + }, + { + "auxiliary_loss_clip": 0.06454123, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06282702, + "balance_loss_mlp": 0.01255319, + "epoch": 0.4057117090034571, + "flos": 28551421382400.0, + "grad_norm": 2.093705794925994, + "language_loss": 0.84795344, + "learning_rate": 2.693161205655089e-06, + "loss": 0.92518532, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.13739014, + "step": 6748, + "time_per_iteration": 2.5967648029327393 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.01269749, + "balance_loss_clip": 0.06281549, + "balance_loss_mlp": 0.01254794, + "epoch": 0.40577183225612506, + "flos": 18009851569920.0, + "grad_norm": 1.9056349360303495, + "language_loss": 0.81943792, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.89667493, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14953613, + "step": 6749, + "time_per_iteration": 2.546419143676758 + }, + { + "auxiliary_loss_clip": 0.06450339, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01256762, + "epoch": 0.40583195550879303, + "flos": 19542819617280.0, + "grad_norm": 1.7354001752331154, + "language_loss": 0.75251377, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.82972294, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13824463, + "step": 6750, + "time_per_iteration": 2.633349895477295 + }, + { + "auxiliary_loss_clip": 0.06461279, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06282868, + "balance_loss_mlp": 0.01256441, + "epoch": 0.405892078761461, + "flos": 22315987146240.0, + "grad_norm": 2.3215315740209026, + "language_loss": 0.73715317, + "learning_rate": 2.692065118669195e-06, + "loss": 0.81447506, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.14459229, + "step": 6751, + "time_per_iteration": 2.579233169555664 + }, + { + "auxiliary_loss_clip": 0.06456044, + "auxiliary_loss_mlp": 0.01276434, + "balance_loss_clip": 0.06282923, + "balance_loss_mlp": 0.01261622, + "epoch": 0.40595220201412896, + "flos": 25491564708480.0, + "grad_norm": 1.5288716905414277, + "language_loss": 0.66520017, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.74252492, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14788818, + "step": 6752, + "time_per_iteration": 2.5768818855285645 + }, + { + "auxiliary_loss_clip": 0.06457777, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281942, + "balance_loss_mlp": 0.01259025, + "epoch": 0.4060123252667969, + "flos": 49867092887040.0, + "grad_norm": 1.7025851849816316, + "language_loss": 0.71210098, + "learning_rate": 2.691334262772948e-06, + "loss": 0.78942096, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15197754, + "step": 6753, + "time_per_iteration": 2.807713031768799 + }, + { + "auxiliary_loss_clip": 0.06455305, + "auxiliary_loss_mlp": 0.01268505, + "balance_loss_clip": 0.06281379, + "balance_loss_mlp": 0.01254736, + "epoch": 0.4060724485194649, + "flos": 21140720179200.0, + "grad_norm": 2.0551663576230657, + "language_loss": 0.72102135, + "learning_rate": 2.690968795494699e-06, + "loss": 0.7982595, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13763428, + "step": 6754, + "time_per_iteration": 2.5342867374420166 + }, + { + "auxiliary_loss_clip": 0.0645773, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06283537, + "balance_loss_mlp": 0.0125931, + "epoch": 0.40613257177213286, + "flos": 21763796739840.0, + "grad_norm": 1.762365568083109, + "language_loss": 0.83186102, + "learning_rate": 2.690603302014844e-06, + "loss": 0.90917671, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14520264, + "step": 6755, + "time_per_iteration": 2.6024997234344482 + }, + { + "auxiliary_loss_clip": 0.06461492, + "auxiliary_loss_mlp": 0.01268966, + "balance_loss_clip": 0.06283044, + "balance_loss_mlp": 0.01254047, + "epoch": 0.4061926950248008, + "flos": 25561863884160.0, + "grad_norm": 1.6099502444653784, + "language_loss": 0.71436989, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.79167449, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14923096, + "step": 6756, + "time_per_iteration": 2.5427916049957275 + }, + { + "auxiliary_loss_clip": 0.06455702, + "auxiliary_loss_mlp": 0.01272698, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01257726, + "epoch": 0.4062528182774688, + "flos": 23702528983680.0, + "grad_norm": 1.686471122095966, + "language_loss": 0.79134113, + "learning_rate": 2.689872236505755e-06, + "loss": 0.86862516, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14990234, + "step": 6757, + "time_per_iteration": 2.573546886444092 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01275677, + "balance_loss_clip": 0.0627944, + "balance_loss_mlp": 0.01260561, + "epoch": 0.4063129415301368, + "flos": 21732504439680.0, + "grad_norm": 1.6631673854083442, + "language_loss": 0.78665155, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.86392242, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15100098, + "step": 6758, + "time_per_iteration": 2.5283167362213135 + }, + { + "auxiliary_loss_clip": 0.06450847, + "auxiliary_loss_mlp": 0.01276876, + "balance_loss_clip": 0.06280972, + "balance_loss_mlp": 0.0126331, + "epoch": 0.40637306478280477, + "flos": 12792650549760.0, + "grad_norm": 2.0123521464099183, + "language_loss": 0.89116049, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.96843767, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13568115, + "step": 6759, + "time_per_iteration": 2.5211679935455322 + }, + { + "auxiliary_loss_clip": 0.06457647, + "auxiliary_loss_mlp": 0.01273439, + "balance_loss_clip": 0.06284226, + "balance_loss_mlp": 0.01259742, + "epoch": 0.40643318803547274, + "flos": 24031327605120.0, + "grad_norm": 2.379594130925159, + "language_loss": 0.64235389, + "learning_rate": 2.688775442076598e-06, + "loss": 0.71966481, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13690186, + "step": 6760, + "time_per_iteration": 2.546807050704956 + }, + { + "auxiliary_loss_clip": 0.0645775, + "auxiliary_loss_mlp": 0.01275543, + "balance_loss_clip": 0.06282319, + "balance_loss_mlp": 0.01260856, + "epoch": 0.4064933112881407, + "flos": 25599361386240.0, + "grad_norm": 1.4617486076979092, + "language_loss": 0.75530171, + "learning_rate": 2.688409791678193e-06, + "loss": 0.83263463, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14666748, + "step": 6761, + "time_per_iteration": 2.635345935821533 + }, + { + "auxiliary_loss_clip": 0.0645279, + "auxiliary_loss_mlp": 0.01275826, + "balance_loss_clip": 0.06285599, + "balance_loss_mlp": 0.01262183, + "epoch": 0.40655343454080867, + "flos": 22060841863680.0, + "grad_norm": 1.3772427401241372, + "language_loss": 0.70268184, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.77996796, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1362915, + "step": 6762, + "time_per_iteration": 2.5381741523742676 + }, + { + "auxiliary_loss_clip": 0.06454535, + "auxiliary_loss_mlp": 0.01269241, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01255532, + "epoch": 0.40661355779347663, + "flos": 26476115783040.0, + "grad_norm": 2.097586218934523, + "language_loss": 0.74072015, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.81795788, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13708496, + "step": 6763, + "time_per_iteration": 2.6068081855773926 + }, + { + "auxiliary_loss_clip": 0.06460483, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01257946, + "epoch": 0.4066736810461446, + "flos": 13266156372480.0, + "grad_norm": 1.6908157420926835, + "language_loss": 0.69497877, + "learning_rate": 2.687312683911033e-06, + "loss": 0.77231026, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14703369, + "step": 6764, + "time_per_iteration": 2.511901378631592 + }, + { + "auxiliary_loss_clip": 0.06461611, + "auxiliary_loss_mlp": 0.01272386, + "balance_loss_clip": 0.06284289, + "balance_loss_mlp": 0.01255995, + "epoch": 0.40673380429881256, + "flos": 28811178639360.0, + "grad_norm": 2.09874166778498, + "language_loss": 0.91354716, + "learning_rate": 2.686946929177557e-06, + "loss": 0.99088717, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16381836, + "step": 6765, + "time_per_iteration": 2.614131450653076 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_clip": 0.06289016, + "balance_loss_mlp": 0.01256959, + "epoch": 0.4067939275514805, + "flos": 12500301254400.0, + "grad_norm": 2.6861779086384945, + "language_loss": 0.7896508, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.86703956, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14599609, + "step": 6766, + "time_per_iteration": 2.5117299556732178 + }, + { + "auxiliary_loss_clip": 0.06462067, + "auxiliary_loss_mlp": 0.01273332, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.01258306, + "epoch": 0.4068540508041485, + "flos": 18776461374720.0, + "grad_norm": 40.22612567694579, + "language_loss": 0.77094513, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.84829921, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15020752, + "step": 6767, + "time_per_iteration": 2.5433967113494873 + }, + { + "auxiliary_loss_clip": 0.06456982, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06286283, + "balance_loss_mlp": 0.01260784, + "epoch": 0.40691417405681646, + "flos": 28520506425600.0, + "grad_norm": 1.6477494711234055, + "language_loss": 0.77846849, + "learning_rate": 2.685849508738034e-06, + "loss": 0.85578549, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1394043, + "step": 6768, + "time_per_iteration": 4.049299478530884 + }, + { + "auxiliary_loss_clip": 0.06460279, + "auxiliary_loss_mlp": 0.0127197, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01258213, + "epoch": 0.4069742973094844, + "flos": 20820390819840.0, + "grad_norm": 1.9557468193178857, + "language_loss": 0.87631512, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.9536376, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13757324, + "step": 6769, + "time_per_iteration": 2.540104389190674 + }, + { + "auxiliary_loss_clip": 0.06461371, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01259504, + "epoch": 0.4070344205621524, + "flos": 21476646397440.0, + "grad_norm": 2.001246026688969, + "language_loss": 0.80859989, + "learning_rate": 2.685117765051156e-06, + "loss": 0.88594282, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13421631, + "step": 6770, + "time_per_iteration": 3.9851884841918945 + }, + { + "auxiliary_loss_clip": 0.06465216, + "auxiliary_loss_mlp": 0.01270985, + "balance_loss_clip": 0.06288273, + "balance_loss_mlp": 0.01256203, + "epoch": 0.4070945438148204, + "flos": 26836709829120.0, + "grad_norm": 1.8007492597774561, + "language_loss": 0.80221689, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.87957895, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14770508, + "step": 6771, + "time_per_iteration": 2.5747835636138916 + }, + { + "auxiliary_loss_clip": 0.06460344, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06287014, + "balance_loss_mlp": 0.01256926, + "epoch": 0.4071546670674884, + "flos": 26360478748800.0, + "grad_norm": 1.364923552922522, + "language_loss": 0.7623316, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.83964121, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13696289, + "step": 6772, + "time_per_iteration": 2.628304958343506 + }, + { + "auxiliary_loss_clip": 0.06461407, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06287165, + "balance_loss_mlp": 0.01255471, + "epoch": 0.40721479032015634, + "flos": 17901300205440.0, + "grad_norm": 1.7629352970283074, + "language_loss": 0.81345379, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.89077097, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1484375, + "step": 6773, + "time_per_iteration": 2.5225751399993896 + }, + { + "auxiliary_loss_clip": 0.06368425, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06289985, + "balance_loss_mlp": 0.01259653, + "epoch": 0.4072749135728243, + "flos": 49871522424960.0, + "grad_norm": 0.8094154348681942, + "language_loss": 0.64365125, + "learning_rate": 2.683653966031597e-06, + "loss": 0.71997166, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03961182, + "step": 6774, + "time_per_iteration": 4.446218967437744 + }, + { + "auxiliary_loss_clip": 0.06460027, + "auxiliary_loss_mlp": 0.01268161, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01254481, + "epoch": 0.40733503682549227, + "flos": 27571063011840.0, + "grad_norm": 1.7398483222375367, + "language_loss": 0.7269184, + "learning_rate": 2.683287951431446e-06, + "loss": 0.80420029, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13659668, + "step": 6775, + "time_per_iteration": 2.599534511566162 + }, + { + "auxiliary_loss_clip": 0.0645956, + "auxiliary_loss_mlp": 0.01271281, + "balance_loss_clip": 0.06285449, + "balance_loss_mlp": 0.01257328, + "epoch": 0.40739516007816023, + "flos": 22133447026560.0, + "grad_norm": 1.36694346344043, + "language_loss": 0.78053248, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.8578409, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13946533, + "step": 6776, + "time_per_iteration": 2.6111807823181152 + }, + { + "auxiliary_loss_clip": 0.06466034, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06288318, + "balance_loss_mlp": 0.01254358, + "epoch": 0.4074552833308282, + "flos": 23849080974720.0, + "grad_norm": 2.6992343713036933, + "language_loss": 0.79444098, + "learning_rate": 2.682555844513981e-06, + "loss": 0.87178552, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14056396, + "step": 6777, + "time_per_iteration": 2.6968321800231934 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01254556, + "balance_loss_clip": 0.06276868, + "balance_loss_mlp": 0.01251499, + "epoch": 0.40751540658349616, + "flos": 58019847120000.0, + "grad_norm": 0.6740608536307336, + "language_loss": 0.53006828, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.60617012, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.0305481, + "step": 6778, + "time_per_iteration": 4.5793616771698 + }, + { + "auxiliary_loss_clip": 0.0645799, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01257996, + "epoch": 0.40757552983616413, + "flos": 21220956063360.0, + "grad_norm": 2.166644010842874, + "language_loss": 0.8325671, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.90987039, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14349365, + "step": 6779, + "time_per_iteration": 2.5122289657592773 + }, + { + "auxiliary_loss_clip": 0.06459656, + "auxiliary_loss_mlp": 0.01270176, + "balance_loss_clip": 0.06286415, + "balance_loss_mlp": 0.01255752, + "epoch": 0.4076356530888321, + "flos": 26840776752000.0, + "grad_norm": 1.555798351548063, + "language_loss": 0.76392281, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.84122109, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14440918, + "step": 6780, + "time_per_iteration": 2.5635926723480225 + }, + { + "auxiliary_loss_clip": 0.06453321, + "auxiliary_loss_mlp": 0.01268481, + "balance_loss_clip": 0.06285319, + "balance_loss_mlp": 0.01255964, + "epoch": 0.40769577634150006, + "flos": 12207868104960.0, + "grad_norm": 2.3318684771465388, + "language_loss": 0.66762495, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.74484301, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.12512207, + "step": 6781, + "time_per_iteration": 2.4998953342437744 + }, + { + "auxiliary_loss_clip": 0.06457075, + "auxiliary_loss_mlp": 0.01270756, + "balance_loss_clip": 0.06285501, + "balance_loss_mlp": 0.01257005, + "epoch": 0.407755899594168, + "flos": 33663467128320.0, + "grad_norm": 1.4801990709986605, + "language_loss": 0.71833825, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.79561651, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13757324, + "step": 6782, + "time_per_iteration": 2.6407761573791504 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01265619, + "balance_loss_clip": 0.06282325, + "balance_loss_mlp": 0.01252804, + "epoch": 0.407816022846836, + "flos": 20163590190720.0, + "grad_norm": 1.6531823939859909, + "language_loss": 0.82546687, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.90268028, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.12823486, + "step": 6783, + "time_per_iteration": 2.521007776260376 + }, + { + "auxiliary_loss_clip": 0.06456424, + "auxiliary_loss_mlp": 0.01272041, + "balance_loss_clip": 0.06284439, + "balance_loss_mlp": 0.01258504, + "epoch": 0.40787614609950396, + "flos": 21185219496960.0, + "grad_norm": 3.105146861858365, + "language_loss": 0.80980694, + "learning_rate": 2.679992655730283e-06, + "loss": 0.88709158, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13549805, + "step": 6784, + "time_per_iteration": 2.555502414703369 + }, + { + "auxiliary_loss_clip": 0.06462008, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.01254888, + "epoch": 0.407936269352172, + "flos": 20526699859200.0, + "grad_norm": 1.8248584482375538, + "language_loss": 0.65994555, + "learning_rate": 2.679626382651386e-06, + "loss": 0.73727089, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15661621, + "step": 6785, + "time_per_iteration": 2.5122246742248535 + }, + { + "auxiliary_loss_clip": 0.06453374, + "auxiliary_loss_mlp": 0.01270477, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01256505, + "epoch": 0.40799639260483994, + "flos": 20124709096320.0, + "grad_norm": 2.5052548980669487, + "language_loss": 0.80350053, + "learning_rate": 2.679260083800989e-06, + "loss": 0.88073903, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13970947, + "step": 6786, + "time_per_iteration": 2.554553985595703 + }, + { + "auxiliary_loss_clip": 0.0645851, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286281, + "balance_loss_mlp": 0.01258874, + "epoch": 0.4080565158575079, + "flos": 21003853334400.0, + "grad_norm": 1.5530341827396597, + "language_loss": 0.81621969, + "learning_rate": 2.678893759192982e-06, + "loss": 0.89353013, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13665771, + "step": 6787, + "time_per_iteration": 2.536215305328369 + }, + { + "auxiliary_loss_clip": 0.06458452, + "auxiliary_loss_mlp": 0.01268932, + "balance_loss_clip": 0.0628721, + "balance_loss_mlp": 0.01255623, + "epoch": 0.40811663911017587, + "flos": 19323746317440.0, + "grad_norm": 1.9049170263972377, + "language_loss": 0.6798445, + "learning_rate": 2.678527408841255e-06, + "loss": 0.75711828, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13323975, + "step": 6788, + "time_per_iteration": 2.533457040786743 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272482, + "balance_loss_clip": 0.06284444, + "balance_loss_mlp": 0.01258952, + "epoch": 0.40817676236284384, + "flos": 40634973555840.0, + "grad_norm": 1.8916550457168047, + "language_loss": 0.66478348, + "learning_rate": 2.678161032759701e-06, + "loss": 0.74207389, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.13537598, + "step": 6789, + "time_per_iteration": 2.726292371749878 + }, + { + "auxiliary_loss_clip": 0.06456382, + "auxiliary_loss_mlp": 0.01270282, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01256383, + "epoch": 0.4082368856155118, + "flos": 20528376940800.0, + "grad_norm": 1.5670896359254076, + "language_loss": 0.61192298, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.68918967, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13885498, + "step": 6790, + "time_per_iteration": 2.5437731742858887 + }, + { + "auxiliary_loss_clip": 0.06455828, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.062863, + "balance_loss_mlp": 0.01257928, + "epoch": 0.40829700886817977, + "flos": 11430944248320.0, + "grad_norm": 3.0698605132878076, + "language_loss": 0.69964224, + "learning_rate": 2.677428203462683e-06, + "loss": 0.77691442, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13452148, + "step": 6791, + "time_per_iteration": 2.4941210746765137 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01262815, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01259486, + "epoch": 0.40835713212084773, + "flos": 67350455326080.0, + "grad_norm": 0.7295736549212738, + "language_loss": 0.59295797, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.66914248, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03335571, + "step": 6792, + "time_per_iteration": 3.153479814529419 + }, + { + "auxiliary_loss_clip": 0.06459208, + "auxiliary_loss_mlp": 0.01270498, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01256193, + "epoch": 0.4084172553735157, + "flos": 21768408714240.0, + "grad_norm": 1.6689878199369865, + "language_loss": 0.80186534, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.87916243, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14306641, + "step": 6793, + "time_per_iteration": 2.562311887741089 + }, + { + "auxiliary_loss_clip": 0.06458702, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.06283591, + "balance_loss_mlp": 0.01258237, + "epoch": 0.40847737862618366, + "flos": 27424594874880.0, + "grad_norm": 3.9059129474249, + "language_loss": 0.85597503, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.93328679, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14227295, + "step": 6794, + "time_per_iteration": 2.558554172515869 + }, + { + "auxiliary_loss_clip": 0.06457786, + "auxiliary_loss_mlp": 0.01274296, + "balance_loss_clip": 0.0628652, + "balance_loss_mlp": 0.01259991, + "epoch": 0.4085375018788516, + "flos": 18593040787200.0, + "grad_norm": 1.7852935587618148, + "language_loss": 0.80216181, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.87948263, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14306641, + "step": 6795, + "time_per_iteration": 2.540349006652832 + }, + { + "auxiliary_loss_clip": 0.06465046, + "auxiliary_loss_mlp": 0.01270762, + "balance_loss_clip": 0.0628596, + "balance_loss_mlp": 0.01255718, + "epoch": 0.4085976251315196, + "flos": 15416834319360.0, + "grad_norm": 2.647671549267762, + "language_loss": 0.70204669, + "learning_rate": 2.675595680920792e-06, + "loss": 0.77940476, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15057373, + "step": 6796, + "time_per_iteration": 2.483670711517334 + }, + { + "auxiliary_loss_clip": 0.06458762, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06285367, + "balance_loss_mlp": 0.01256558, + "epoch": 0.40865774838418756, + "flos": 21258705127680.0, + "grad_norm": 1.5727118215642113, + "language_loss": 0.78255171, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.85983676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13189697, + "step": 6797, + "time_per_iteration": 2.580595016479492 + }, + { + "auxiliary_loss_clip": 0.06459324, + "auxiliary_loss_mlp": 0.01274053, + "balance_loss_clip": 0.06286809, + "balance_loss_mlp": 0.01260183, + "epoch": 0.4087178716368556, + "flos": 13777411259520.0, + "grad_norm": 1.8045279385790254, + "language_loss": 0.86005986, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.93739361, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13885498, + "step": 6798, + "time_per_iteration": 2.525223970413208 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271081, + "balance_loss_clip": 0.06287363, + "balance_loss_mlp": 0.01258308, + "epoch": 0.40877799488952354, + "flos": 23628288666240.0, + "grad_norm": 1.532136532380416, + "language_loss": 0.84202659, + "learning_rate": 2.674495859860601e-06, + "loss": 0.91931903, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.12774658, + "step": 6799, + "time_per_iteration": 2.5898637771606445 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06284514, + "balance_loss_mlp": 0.01256695, + "epoch": 0.4088381181421915, + "flos": 20924372136960.0, + "grad_norm": 3.2861641598601516, + "language_loss": 0.83725351, + "learning_rate": 2.6741292016681e-06, + "loss": 0.91453052, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14129639, + "step": 6800, + "time_per_iteration": 2.5050573348999023 + }, + { + "auxiliary_loss_clip": 0.06460495, + "auxiliary_loss_mlp": 0.0127488, + "balance_loss_clip": 0.06284706, + "balance_loss_mlp": 0.01260324, + "epoch": 0.4088982413948595, + "flos": 13302605698560.0, + "grad_norm": 2.1402246624759225, + "language_loss": 0.74944514, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.82679886, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14532471, + "step": 6801, + "time_per_iteration": 2.546226978302002 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06286253, + "balance_loss_mlp": 0.01256358, + "epoch": 0.40895836464752744, + "flos": 15273007585920.0, + "grad_norm": 2.8712837575861316, + "language_loss": 0.80348778, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8807894, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6802, + "time_per_iteration": 2.4804327487945557 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01256813, + "epoch": 0.4090184879001954, + "flos": 14506607416320.0, + "grad_norm": 2.1610413406346147, + "language_loss": 0.7616486, + "learning_rate": 2.673029073767934e-06, + "loss": 0.83899677, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14660645, + "step": 6803, + "time_per_iteration": 2.5792553424835205 + }, + { + "auxiliary_loss_clip": 0.06459032, + "auxiliary_loss_mlp": 0.01268618, + "balance_loss_clip": 0.06286538, + "balance_loss_mlp": 0.01255017, + "epoch": 0.40907861115286337, + "flos": 13886759237760.0, + "grad_norm": 1.7652651103072021, + "language_loss": 0.79160619, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.86888266, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.1361084, + "step": 6804, + "time_per_iteration": 2.489569902420044 + }, + { + "auxiliary_loss_clip": 0.06464031, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.0125919, + "epoch": 0.40913873440553133, + "flos": 28045071959040.0, + "grad_norm": 1.8644340771163777, + "language_loss": 0.75315928, + "learning_rate": 2.672295527537998e-06, + "loss": 0.83053064, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13909912, + "step": 6805, + "time_per_iteration": 2.6142778396606445 + }, + { + "auxiliary_loss_clip": 0.06465782, + "auxiliary_loss_mlp": 0.01272786, + "balance_loss_clip": 0.06288569, + "balance_loss_mlp": 0.01257957, + "epoch": 0.4091988576581993, + "flos": 21624917397120.0, + "grad_norm": 1.7712960163929097, + "language_loss": 0.7965951, + "learning_rate": 2.671928716175804e-06, + "loss": 0.87398076, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14825439, + "step": 6806, + "time_per_iteration": 2.567579984664917 + }, + { + "auxiliary_loss_clip": 0.06464592, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01254609, + "epoch": 0.40925898091086726, + "flos": 25230381932160.0, + "grad_norm": 1.8487150493759184, + "language_loss": 0.725999, + "learning_rate": 2.671561879334007e-06, + "loss": 0.80333263, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14147949, + "step": 6807, + "time_per_iteration": 4.0469160079956055 + }, + { + "auxiliary_loss_clip": 0.06359696, + "auxiliary_loss_mlp": 0.012552, + "balance_loss_clip": 0.06279803, + "balance_loss_mlp": 0.01251397, + "epoch": 0.40931910416353523, + "flos": 68949697553280.0, + "grad_norm": 0.8076862955861985, + "language_loss": 0.5884732, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.66462219, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.03796387, + "step": 6808, + "time_per_iteration": 3.236466407775879 + }, + { + "auxiliary_loss_clip": 0.0646228, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.0125511, + "epoch": 0.4093792274162032, + "flos": 20195092126080.0, + "grad_norm": 2.068974912031903, + "language_loss": 0.54879391, + "learning_rate": 2.670828129267242e-06, + "loss": 0.62610114, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13342285, + "step": 6809, + "time_per_iteration": 4.028552055358887 + }, + { + "auxiliary_loss_clip": 0.06460767, + "auxiliary_loss_mlp": 0.01271891, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.0125805, + "epoch": 0.40943935066887116, + "flos": 25235832447360.0, + "grad_norm": 1.6877735836202645, + "language_loss": 0.83297133, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.91029787, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13830566, + "step": 6810, + "time_per_iteration": 2.5688657760620117 + }, + { + "auxiliary_loss_clip": 0.06467541, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 0.06291755, + "balance_loss_mlp": 0.01260376, + "epoch": 0.4094994739215392, + "flos": 23261531345280.0, + "grad_norm": 2.1410482965152475, + "language_loss": 0.78002244, + "learning_rate": 2.670094277448999e-06, + "loss": 0.85744703, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14526367, + "step": 6811, + "time_per_iteration": 2.5859668254852295 + }, + { + "auxiliary_loss_clip": 0.06461761, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06286068, + "balance_loss_mlp": 0.01255705, + "epoch": 0.40955959717420715, + "flos": 17387571623040.0, + "grad_norm": 1.532323288412775, + "language_loss": 0.70159924, + "learning_rate": 2.669727313417857e-06, + "loss": 0.77892125, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.1472168, + "step": 6812, + "time_per_iteration": 2.5128583908081055 + }, + { + "auxiliary_loss_clip": 0.06459609, + "auxiliary_loss_mlp": 0.01271673, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01257689, + "epoch": 0.4096197204268751, + "flos": 25089406237440.0, + "grad_norm": 1.5016829758663763, + "language_loss": 0.6657182, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13989258, + "step": 6813, + "time_per_iteration": 4.086791515350342 + }, + { + "auxiliary_loss_clip": 0.06457571, + "auxiliary_loss_mlp": 0.01273443, + "balance_loss_clip": 0.06284814, + "balance_loss_mlp": 0.01259186, + "epoch": 0.4096798436795431, + "flos": 30593841454080.0, + "grad_norm": 3.468085127477164, + "language_loss": 0.74528515, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.82259536, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14282227, + "step": 6814, + "time_per_iteration": 2.6079764366149902 + }, + { + "auxiliary_loss_clip": 0.06469103, + "auxiliary_loss_mlp": 0.0126922, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01254927, + "epoch": 0.40973996693221104, + "flos": 24140424021120.0, + "grad_norm": 2.1723549744151573, + "language_loss": 0.66418713, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.74157035, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14276123, + "step": 6815, + "time_per_iteration": 2.574538469314575 + }, + { + "auxiliary_loss_clip": 0.06459038, + "auxiliary_loss_mlp": 0.01277533, + "balance_loss_clip": 0.06290913, + "balance_loss_mlp": 0.01264116, + "epoch": 0.409800090184879, + "flos": 23995968382080.0, + "grad_norm": 1.5545179592453178, + "language_loss": 0.76523387, + "learning_rate": 2.668259203471188e-06, + "loss": 0.84259957, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.13433838, + "step": 6816, + "time_per_iteration": 2.5691564083099365 + }, + { + "auxiliary_loss_clip": 0.06462897, + "auxiliary_loss_mlp": 0.01272633, + "balance_loss_clip": 0.06288977, + "balance_loss_mlp": 0.01258834, + "epoch": 0.40986021343754697, + "flos": 16149216931200.0, + "grad_norm": 2.0573498340626957, + "language_loss": 0.82244468, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8998, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13812256, + "step": 6817, + "time_per_iteration": 3.992452621459961 + }, + { + "auxiliary_loss_clip": 0.06471414, + "auxiliary_loss_mlp": 0.0127126, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.0125556, + "epoch": 0.40992033669021494, + "flos": 24797811628800.0, + "grad_norm": 1.5933135055943601, + "language_loss": 0.80022383, + "learning_rate": 2.667524996399444e-06, + "loss": 0.87765062, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15698242, + "step": 6818, + "time_per_iteration": 2.6226916313171387 + }, + { + "auxiliary_loss_clip": 0.06458658, + "auxiliary_loss_mlp": 0.01265615, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01252609, + "epoch": 0.4099804599428829, + "flos": 29649429285120.0, + "grad_norm": 1.5014418509343528, + "language_loss": 0.66358954, + "learning_rate": 2.66715785488769e-06, + "loss": 0.74083227, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13006592, + "step": 6819, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.06472912, + "auxiliary_loss_mlp": 0.01275099, + "balance_loss_clip": 0.06290931, + "balance_loss_mlp": 0.01259566, + "epoch": 0.41004058319555087, + "flos": 24833464341120.0, + "grad_norm": 1.4779477588129932, + "language_loss": 0.85265613, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.9301362, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15527344, + "step": 6820, + "time_per_iteration": 2.5997445583343506 + }, + { + "auxiliary_loss_clip": 0.06459977, + "auxiliary_loss_mlp": 0.01274929, + "balance_loss_clip": 0.06289133, + "balance_loss_mlp": 0.01261571, + "epoch": 0.41010070644821883, + "flos": 25744278222720.0, + "grad_norm": 1.6716831778372079, + "language_loss": 0.71520668, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.79255575, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13360596, + "step": 6821, + "time_per_iteration": 2.5686511993408203 + }, + { + "auxiliary_loss_clip": 0.06462038, + "auxiliary_loss_mlp": 0.01275085, + "balance_loss_clip": 0.06288444, + "balance_loss_mlp": 0.01262037, + "epoch": 0.4101608297008868, + "flos": 22352604180480.0, + "grad_norm": 1.920651769082741, + "language_loss": 0.74875939, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.82613057, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13049316, + "step": 6822, + "time_per_iteration": 2.5453121662139893 + }, + { + "auxiliary_loss_clip": 0.0646743, + "auxiliary_loss_mlp": 0.01270606, + "balance_loss_clip": 0.06293608, + "balance_loss_mlp": 0.01256408, + "epoch": 0.41022095295355476, + "flos": 21951619666560.0, + "grad_norm": 2.1329933375936045, + "language_loss": 0.75859648, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.83597684, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14208984, + "step": 6823, + "time_per_iteration": 2.514934539794922 + }, + { + "auxiliary_loss_clip": 0.06469562, + "auxiliary_loss_mlp": 0.01272535, + "balance_loss_clip": 0.06289219, + "balance_loss_mlp": 0.01257276, + "epoch": 0.4102810762062228, + "flos": 27457312694400.0, + "grad_norm": 5.1897859223278004, + "language_loss": 0.74005461, + "learning_rate": 2.665321768127001e-06, + "loss": 0.81747556, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15258789, + "step": 6824, + "time_per_iteration": 2.645362615585327 + }, + { + "auxiliary_loss_clip": 0.06472579, + "auxiliary_loss_mlp": 0.01268406, + "balance_loss_clip": 0.06292652, + "balance_loss_mlp": 0.01253589, + "epoch": 0.41034119945889075, + "flos": 24506258947200.0, + "grad_norm": 2.0548664701913215, + "language_loss": 0.72348672, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.80089658, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14788818, + "step": 6825, + "time_per_iteration": 2.5779926776885986 + }, + { + "auxiliary_loss_clip": 0.0646458, + "auxiliary_loss_mlp": 0.01269358, + "balance_loss_clip": 0.06292018, + "balance_loss_mlp": 0.01255822, + "epoch": 0.4104013227115587, + "flos": 24359497320960.0, + "grad_norm": 2.1141131447671, + "language_loss": 0.85571408, + "learning_rate": 2.664587156721768e-06, + "loss": 0.93305349, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13537598, + "step": 6826, + "time_per_iteration": 2.556445598602295 + }, + { + "auxiliary_loss_clip": 0.06462094, + "auxiliary_loss_mlp": 0.01278764, + "balance_loss_clip": 0.0629297, + "balance_loss_mlp": 0.0126468, + "epoch": 0.4104614459642267, + "flos": 23735582219520.0, + "grad_norm": 2.6430290167775037, + "language_loss": 0.6714378, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.74884635, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14080811, + "step": 6827, + "time_per_iteration": 2.55556058883667 + }, + { + "auxiliary_loss_clip": 0.06463977, + "auxiliary_loss_mlp": 0.01267684, + "balance_loss_clip": 0.06292337, + "balance_loss_mlp": 0.01254017, + "epoch": 0.41052156921689464, + "flos": 22134620983680.0, + "grad_norm": 1.346138162541555, + "language_loss": 0.72310138, + "learning_rate": 2.663852444511689e-06, + "loss": 0.80041802, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13665771, + "step": 6828, + "time_per_iteration": 2.6050894260406494 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01275424, + "balance_loss_clip": 0.06296174, + "balance_loss_mlp": 0.01259855, + "epoch": 0.4105816924695626, + "flos": 20090607684480.0, + "grad_norm": 2.1527229818824196, + "language_loss": 0.84003794, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.91756219, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15588379, + "step": 6829, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.06466494, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.0629379, + "balance_loss_mlp": 0.01259789, + "epoch": 0.4106418157222306, + "flos": 18082540586880.0, + "grad_norm": 1.474811924806309, + "language_loss": 0.90568459, + "learning_rate": 2.663117631608206e-06, + "loss": 0.98308516, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13781738, + "step": 6830, + "time_per_iteration": 2.5749125480651855 + }, + { + "auxiliary_loss_clip": 0.06471005, + "auxiliary_loss_mlp": 0.01271813, + "balance_loss_clip": 0.06296638, + "balance_loss_mlp": 0.01257729, + "epoch": 0.41070193897489854, + "flos": 21653442512640.0, + "grad_norm": 1.8339460976388509, + "language_loss": 0.6606307, + "learning_rate": 2.662750187431268e-06, + "loss": 0.73805887, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14080811, + "step": 6831, + "time_per_iteration": 2.5448153018951416 + }, + { + "auxiliary_loss_clip": 0.06473927, + "auxiliary_loss_mlp": 0.01269964, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256613, + "epoch": 0.4107620622275665, + "flos": 26654924396160.0, + "grad_norm": 2.1106075691496766, + "language_loss": 0.69853723, + "learning_rate": 2.662382718122776e-06, + "loss": 0.77597612, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13360596, + "step": 6832, + "time_per_iteration": 2.61200213432312 + }, + { + "auxiliary_loss_clip": 0.06467804, + "auxiliary_loss_mlp": 0.01274675, + "balance_loss_clip": 0.06296351, + "balance_loss_mlp": 0.01261586, + "epoch": 0.41082218548023447, + "flos": 18740305537920.0, + "grad_norm": 3.2749058883058177, + "language_loss": 0.73955101, + "learning_rate": 2.662015223696666e-06, + "loss": 0.81697583, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13092041, + "step": 6833, + "time_per_iteration": 2.5293643474578857 + }, + { + "auxiliary_loss_clip": 0.06477401, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06301869, + "balance_loss_mlp": 0.01256334, + "epoch": 0.41088230873290243, + "flos": 22900476101760.0, + "grad_norm": 1.6362019789175348, + "language_loss": 0.72870773, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.80619049, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14532471, + "step": 6834, + "time_per_iteration": 2.5534543991088867 + }, + { + "auxiliary_loss_clip": 0.06479818, + "auxiliary_loss_mlp": 0.01271417, + "balance_loss_clip": 0.0630189, + "balance_loss_mlp": 0.01257601, + "epoch": 0.4109424319855704, + "flos": 24283370286720.0, + "grad_norm": 2.482567827780577, + "language_loss": 0.71274042, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7902528, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.13824463, + "step": 6835, + "time_per_iteration": 2.6012609004974365 + }, + { + "auxiliary_loss_clip": 0.06481166, + "auxiliary_loss_mlp": 0.012697, + "balance_loss_clip": 0.06306168, + "balance_loss_mlp": 0.01255318, + "epoch": 0.41100255523823837, + "flos": 12974100566400.0, + "grad_norm": 1.7690004377507398, + "language_loss": 0.87590879, + "learning_rate": 2.660912589851978e-06, + "loss": 0.95341742, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14373779, + "step": 6836, + "time_per_iteration": 2.5210461616516113 + }, + { + "auxiliary_loss_clip": 0.06475058, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 0.06304475, + "balance_loss_mlp": 0.01259937, + "epoch": 0.4110626784909064, + "flos": 23151806023680.0, + "grad_norm": 1.7062413123689164, + "language_loss": 0.69134921, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.76883554, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13641357, + "step": 6837, + "time_per_iteration": 2.58320689201355 + }, + { + "auxiliary_loss_clip": 0.06479225, + "auxiliary_loss_mlp": 0.01273179, + "balance_loss_clip": 0.06301909, + "balance_loss_mlp": 0.01258248, + "epoch": 0.41112280174357435, + "flos": 22754007964800.0, + "grad_norm": 1.9797600155486905, + "language_loss": 0.7565136, + "learning_rate": 2.660177375289599e-06, + "loss": 0.83403766, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1494751, + "step": 6838, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.06478335, + "auxiliary_loss_mlp": 0.01273659, + "balance_loss_clip": 0.06305958, + "balance_loss_mlp": 0.01259318, + "epoch": 0.4111829249962423, + "flos": 21108211994880.0, + "grad_norm": 2.0771476339041635, + "language_loss": 0.82403398, + "learning_rate": 2.659809730450451e-06, + "loss": 0.90155393, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14343262, + "step": 6839, + "time_per_iteration": 2.596498489379883 + }, + { + "auxiliary_loss_clip": 0.06477809, + "auxiliary_loss_mlp": 0.01273131, + "balance_loss_clip": 0.06305793, + "balance_loss_mlp": 0.01259404, + "epoch": 0.4112430482489103, + "flos": 21512005620480.0, + "grad_norm": 1.908617135949294, + "language_loss": 0.8080616, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.885571, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13726807, + "step": 6840, + "time_per_iteration": 2.575131893157959 + }, + { + "auxiliary_loss_clip": 0.06480156, + "auxiliary_loss_mlp": 0.01275329, + "balance_loss_clip": 0.06307412, + "balance_loss_mlp": 0.01262639, + "epoch": 0.41130317150157825, + "flos": 19575579363840.0, + "grad_norm": 1.874526459917051, + "language_loss": 0.67950094, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.75705582, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12701416, + "step": 6841, + "time_per_iteration": 2.5642948150634766 + }, + { + "auxiliary_loss_clip": 0.06386833, + "auxiliary_loss_mlp": 0.01258898, + "balance_loss_clip": 0.06308911, + "balance_loss_mlp": 0.01256092, + "epoch": 0.4113632947542462, + "flos": 62404541498880.0, + "grad_norm": 0.7544179812034518, + "language_loss": 0.59557825, + "learning_rate": 2.65870664586847e-06, + "loss": 0.67203557, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 6842, + "time_per_iteration": 3.2257192134857178 + }, + { + "auxiliary_loss_clip": 0.06472278, + "auxiliary_loss_mlp": 0.01271531, + "balance_loss_clip": 0.06304677, + "balance_loss_mlp": 0.01257977, + "epoch": 0.4114234180069142, + "flos": 13923879396480.0, + "grad_norm": 2.0142050293437803, + "language_loss": 0.70280814, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.78024626, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13562012, + "step": 6843, + "time_per_iteration": 2.565969944000244 + }, + { + "auxiliary_loss_clip": 0.06380486, + "auxiliary_loss_mlp": 0.01256868, + "balance_loss_clip": 0.06302112, + "balance_loss_mlp": 0.01253599, + "epoch": 0.41148354125958214, + "flos": 64948866727680.0, + "grad_norm": 0.7130365683812196, + "language_loss": 0.53645009, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.61282361, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.03274536, + "step": 6844, + "time_per_iteration": 3.16054105758667 + }, + { + "auxiliary_loss_clip": 0.06475421, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06304798, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4115436645122501, + "flos": 18733848773760.0, + "grad_norm": 1.6055019254999645, + "language_loss": 0.66105658, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.73847538, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13317871, + "step": 6845, + "time_per_iteration": 2.5785298347473145 + }, + { + "auxiliary_loss_clip": 0.06478415, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06307876, + "balance_loss_mlp": 0.01254176, + "epoch": 0.41160378776491807, + "flos": 16258439128320.0, + "grad_norm": 2.0979946916750594, + "language_loss": 0.70201457, + "learning_rate": 2.657235516795808e-06, + "loss": 0.77947497, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13446045, + "step": 6846, + "time_per_iteration": 2.510215997695923 + }, + { + "auxiliary_loss_clip": 0.06481081, + "auxiliary_loss_mlp": 0.01271315, + "balance_loss_clip": 0.06309364, + "balance_loss_mlp": 0.01257391, + "epoch": 0.41166391101758604, + "flos": 27978378508800.0, + "grad_norm": 1.4002739744354715, + "language_loss": 0.65459704, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.73212105, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6847, + "time_per_iteration": 4.048614025115967 + }, + { + "auxiliary_loss_clip": 0.06476664, + "auxiliary_loss_mlp": 0.01270454, + "balance_loss_clip": 0.06304531, + "balance_loss_mlp": 0.01256459, + "epoch": 0.411724034270254, + "flos": 34139865916800.0, + "grad_norm": 1.3666484547506623, + "language_loss": 0.7086308, + "learning_rate": 2.656499802669069e-06, + "loss": 0.78610194, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13983154, + "step": 6848, + "time_per_iteration": 4.219269037246704 + }, + { + "auxiliary_loss_clip": 0.06375948, + "auxiliary_loss_mlp": 0.01253417, + "balance_loss_clip": 0.06298448, + "balance_loss_mlp": 0.01250777, + "epoch": 0.41178415752292197, + "flos": 67945090625280.0, + "grad_norm": 0.8791919044020794, + "language_loss": 0.56300032, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.63929397, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02642822, + "step": 6849, + "time_per_iteration": 3.226757287979126 + }, + { + "auxiliary_loss_clip": 0.06472921, + "auxiliary_loss_mlp": 0.0127066, + "balance_loss_clip": 0.06303038, + "balance_loss_mlp": 0.0125707, + "epoch": 0.41184428077558993, + "flos": 34322573744640.0, + "grad_norm": 1.830210581648694, + "language_loss": 0.76533353, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.84276927, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13598633, + "step": 6850, + "time_per_iteration": 2.653665542602539 + }, + { + "auxiliary_loss_clip": 0.06475841, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06303935, + "balance_loss_mlp": 0.0125484, + "epoch": 0.41190440402825795, + "flos": 35452796342400.0, + "grad_norm": 1.6037978840830116, + "language_loss": 0.68379039, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.76123631, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13909912, + "step": 6851, + "time_per_iteration": 2.72273588180542 + }, + { + "auxiliary_loss_clip": 0.06482952, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06306773, + "balance_loss_mlp": 0.01256437, + "epoch": 0.4119645272809259, + "flos": 20856127386240.0, + "grad_norm": 2.4937650031840275, + "language_loss": 0.80344605, + "learning_rate": 2.655028075792743e-06, + "loss": 0.88100129, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.16162109, + "step": 6852, + "time_per_iteration": 2.563422679901123 + }, + { + "auxiliary_loss_clip": 0.06490047, + "auxiliary_loss_mlp": 0.01270823, + "balance_loss_clip": 0.06310906, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4120246505335939, + "flos": 27569218222080.0, + "grad_norm": 2.025784739879877, + "language_loss": 0.77943873, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.8570475, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14538574, + "step": 6853, + "time_per_iteration": 4.108957290649414 + }, + { + "auxiliary_loss_clip": 0.06493531, + "auxiliary_loss_mlp": 0.0127083, + "balance_loss_clip": 0.06310283, + "balance_loss_mlp": 0.01254618, + "epoch": 0.41208477378626185, + "flos": 37824476232960.0, + "grad_norm": 1.7138113243533049, + "language_loss": 0.66213286, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.73977649, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16223145, + "step": 6854, + "time_per_iteration": 2.706514596939087 + }, + { + "auxiliary_loss_clip": 0.06481706, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06308492, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4121448970389298, + "flos": 23447509482240.0, + "grad_norm": 1.8819465084993465, + "language_loss": 0.83935457, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.9168666, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 6855, + "time_per_iteration": 2.6131205558776855 + }, + { + "auxiliary_loss_clip": 0.06478727, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06308559, + "balance_loss_mlp": 0.01258524, + "epoch": 0.4122050202915978, + "flos": 21331813415040.0, + "grad_norm": 1.6556690578140216, + "language_loss": 0.79642534, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.87393928, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14129639, + "step": 6856, + "time_per_iteration": 2.6186776161193848 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06312534, + "balance_loss_mlp": 0.01257383, + "epoch": 0.41226514354426574, + "flos": 17311193026560.0, + "grad_norm": 2.5768867092656516, + "language_loss": 0.80543911, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.88301665, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13739014, + "step": 6857, + "time_per_iteration": 4.0222320556640625 + }, + { + "auxiliary_loss_clip": 0.06484015, + "auxiliary_loss_mlp": 0.01273092, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01259168, + "epoch": 0.4123252667969337, + "flos": 17644519768320.0, + "grad_norm": 1.8891533513627916, + "language_loss": 0.71074593, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.78831697, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13934326, + "step": 6858, + "time_per_iteration": 2.598215341567993 + }, + { + "auxiliary_loss_clip": 0.06484012, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06310833, + "balance_loss_mlp": 0.01256109, + "epoch": 0.4123853900496017, + "flos": 46435070304000.0, + "grad_norm": 1.791293678645808, + "language_loss": 0.59712768, + "learning_rate": 2.652451598005391e-06, + "loss": 0.67467248, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14361572, + "step": 6859, + "time_per_iteration": 2.818535804748535 + }, + { + "auxiliary_loss_clip": 0.0648525, + "auxiliary_loss_mlp": 0.01269281, + "balance_loss_clip": 0.06306802, + "balance_loss_mlp": 0.01255423, + "epoch": 0.41244551330226964, + "flos": 17680801386240.0, + "grad_norm": 3.190643468711074, + "language_loss": 0.73818636, + "learning_rate": 2.652083430674264e-06, + "loss": 0.81573164, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.13861084, + "step": 6860, + "time_per_iteration": 2.559460163116455 + }, + { + "auxiliary_loss_clip": 0.06473921, + "auxiliary_loss_mlp": 0.01270813, + "balance_loss_clip": 0.06301314, + "balance_loss_mlp": 0.01257706, + "epoch": 0.4125056365549376, + "flos": 18699034602240.0, + "grad_norm": 1.5713730110506565, + "language_loss": 0.74087375, + "learning_rate": 2.651715238616068e-06, + "loss": 0.81832111, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13110352, + "step": 6861, + "time_per_iteration": 2.563107967376709 + }, + { + "auxiliary_loss_clip": 0.06476536, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06306636, + "balance_loss_mlp": 0.01253425, + "epoch": 0.41256575980760557, + "flos": 17901174424320.0, + "grad_norm": 2.040837827964215, + "language_loss": 0.8021872, + "learning_rate": 2.651347021844765e-06, + "loss": 0.87962043, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13354492, + "step": 6862, + "time_per_iteration": 2.4968619346618652 + }, + { + "auxiliary_loss_clip": 0.06481781, + "auxiliary_loss_mlp": 0.01269578, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01255881, + "epoch": 0.41262588306027354, + "flos": 21987817430400.0, + "grad_norm": 2.204342418200638, + "language_loss": 0.767263, + "learning_rate": 2.650978780374318e-06, + "loss": 0.84477663, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13708496, + "step": 6863, + "time_per_iteration": 2.5787971019744873 + }, + { + "auxiliary_loss_clip": 0.06377177, + "auxiliary_loss_mlp": 0.01254592, + "balance_loss_clip": 0.06300335, + "balance_loss_mlp": 0.01252135, + "epoch": 0.41268600631294156, + "flos": 53366339243520.0, + "grad_norm": 0.6821216328900507, + "language_loss": 0.52583742, + "learning_rate": 2.650610514218691e-06, + "loss": 0.60215503, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.02455139, + "step": 6864, + "time_per_iteration": 3.1086013317108154 + }, + { + "auxiliary_loss_clip": 0.06480177, + "auxiliary_loss_mlp": 0.01271204, + "balance_loss_clip": 0.06300756, + "balance_loss_mlp": 0.01256714, + "epoch": 0.4127461295656095, + "flos": 24391586234880.0, + "grad_norm": 1.7134572277425464, + "language_loss": 0.72468507, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.80219889, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14489746, + "step": 6865, + "time_per_iteration": 2.6081020832061768 + }, + { + "auxiliary_loss_clip": 0.06375298, + "auxiliary_loss_mlp": 0.01255641, + "balance_loss_clip": 0.06298722, + "balance_loss_mlp": 0.01252579, + "epoch": 0.4128062528182775, + "flos": 71725129142400.0, + "grad_norm": 0.9099190790692077, + "language_loss": 0.66497219, + "learning_rate": 2.649873907907753e-06, + "loss": 0.74128163, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.03059387, + "step": 6866, + "time_per_iteration": 3.0357213020324707 + }, + { + "auxiliary_loss_clip": 0.06476509, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06301893, + "balance_loss_mlp": 0.01255799, + "epoch": 0.41286637607094545, + "flos": 17853362870400.0, + "grad_norm": 2.1198776843792357, + "language_loss": 0.81617618, + "learning_rate": 2.649505567780375e-06, + "loss": 0.89363438, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13500977, + "step": 6867, + "time_per_iteration": 2.6095240116119385 + }, + { + "auxiliary_loss_clip": 0.06482062, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06303717, + "balance_loss_mlp": 0.01256657, + "epoch": 0.4129264993236134, + "flos": 25555407120000.0, + "grad_norm": 2.8405529060711006, + "language_loss": 0.78333044, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.86085904, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14147949, + "step": 6868, + "time_per_iteration": 2.558155059814453 + }, + { + "auxiliary_loss_clip": 0.06374986, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06298015, + "balance_loss_mlp": 0.01251991, + "epoch": 0.4129866225762814, + "flos": 65430730759680.0, + "grad_norm": 0.8212939455862347, + "language_loss": 0.57654673, + "learning_rate": 2.64876881365164e-06, + "loss": 0.65284705, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.03051758, + "step": 6869, + "time_per_iteration": 2.9284112453460693 + }, + { + "auxiliary_loss_clip": 0.06481783, + "auxiliary_loss_mlp": 0.01277222, + "balance_loss_clip": 0.06310707, + "balance_loss_mlp": 0.01263472, + "epoch": 0.41304674582894935, + "flos": 28884622343040.0, + "grad_norm": 2.4401499988028594, + "language_loss": 0.75528967, + "learning_rate": 2.64840039967822e-06, + "loss": 0.83287978, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13763428, + "step": 6870, + "time_per_iteration": 2.6844911575317383 + }, + { + "auxiliary_loss_clip": 0.0647882, + "auxiliary_loss_mlp": 0.01278278, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0126414, + "epoch": 0.4131068690816173, + "flos": 22898379749760.0, + "grad_norm": 1.5575458850844177, + "language_loss": 0.83697838, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.91454935, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14135742, + "step": 6871, + "time_per_iteration": 2.636808156967163 + }, + { + "auxiliary_loss_clip": 0.06479517, + "auxiliary_loss_mlp": 0.0126964, + "balance_loss_clip": 0.06303998, + "balance_loss_mlp": 0.01256033, + "epoch": 0.4131669923342853, + "flos": 26071944814080.0, + "grad_norm": 2.2227773400911732, + "language_loss": 0.69246161, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.76995325, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.1361084, + "step": 6872, + "time_per_iteration": 2.6492373943328857 + }, + { + "auxiliary_loss_clip": 0.06480041, + "auxiliary_loss_mlp": 0.01273197, + "balance_loss_clip": 0.06303592, + "balance_loss_mlp": 0.01259494, + "epoch": 0.41322711558695324, + "flos": 19250554176000.0, + "grad_norm": 1.8563624048188305, + "language_loss": 0.76261687, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.84014916, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13696289, + "step": 6873, + "time_per_iteration": 2.5294342041015625 + }, + { + "auxiliary_loss_clip": 0.06480598, + "auxiliary_loss_mlp": 0.01273623, + "balance_loss_clip": 0.06302338, + "balance_loss_mlp": 0.0125958, + "epoch": 0.4132872388396212, + "flos": 22681067385600.0, + "grad_norm": 1.8281818605346505, + "language_loss": 0.83432305, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.91186529, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14031982, + "step": 6874, + "time_per_iteration": 2.6135475635528564 + }, + { + "auxiliary_loss_clip": 0.06483124, + "auxiliary_loss_mlp": 0.01273525, + "balance_loss_clip": 0.06306563, + "balance_loss_mlp": 0.01258498, + "epoch": 0.4133473620922892, + "flos": 20155246709760.0, + "grad_norm": 1.7886089381127788, + "language_loss": 0.72210878, + "learning_rate": 2.646557961279436e-06, + "loss": 0.79967523, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15020752, + "step": 6875, + "time_per_iteration": 2.535613536834717 + }, + { + "auxiliary_loss_clip": 0.06467389, + "auxiliary_loss_mlp": 0.01270264, + "balance_loss_clip": 0.06301813, + "balance_loss_mlp": 0.01257151, + "epoch": 0.41340748534495714, + "flos": 24249520437120.0, + "grad_norm": 1.4522680677637643, + "language_loss": 0.82662565, + "learning_rate": 2.646189399991154e-06, + "loss": 0.90400219, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13098145, + "step": 6876, + "time_per_iteration": 2.631683111190796 + }, + { + "auxiliary_loss_clip": 0.06476636, + "auxiliary_loss_mlp": 0.0126976, + "balance_loss_clip": 0.06298597, + "balance_loss_mlp": 0.01255198, + "epoch": 0.41346760859762516, + "flos": 14397385219200.0, + "grad_norm": 2.4272621941749044, + "language_loss": 0.65427208, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.73173606, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14556885, + "step": 6877, + "time_per_iteration": 2.5211727619171143 + }, + { + "auxiliary_loss_clip": 0.06477489, + "auxiliary_loss_mlp": 0.01272334, + "balance_loss_clip": 0.06304673, + "balance_loss_mlp": 0.0125853, + "epoch": 0.4135277318502931, + "flos": 22498569192960.0, + "grad_norm": 1.7887587996629348, + "language_loss": 0.77271414, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.85021234, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13800049, + "step": 6878, + "time_per_iteration": 2.591952085494995 + }, + { + "auxiliary_loss_clip": 0.06478719, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06303747, + "balance_loss_mlp": 0.01258525, + "epoch": 0.4135878551029611, + "flos": 22425251270400.0, + "grad_norm": 1.9381355665838014, + "language_loss": 0.8049022, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.88240814, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13354492, + "step": 6879, + "time_per_iteration": 2.565875291824341 + }, + { + "auxiliary_loss_clip": 0.06476135, + "auxiliary_loss_mlp": 0.0127254, + "balance_loss_clip": 0.06301241, + "balance_loss_mlp": 0.01258688, + "epoch": 0.41364797835562905, + "flos": 27060646665600.0, + "grad_norm": 1.8294611042748399, + "language_loss": 0.8543402, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.93182689, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13861084, + "step": 6880, + "time_per_iteration": 2.6438286304473877 + }, + { + "auxiliary_loss_clip": 0.06478438, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256258, + "epoch": 0.413708101608297, + "flos": 22974464856960.0, + "grad_norm": 2.0767525842165413, + "language_loss": 0.70694637, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.78443456, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14129639, + "step": 6881, + "time_per_iteration": 2.57663893699646 + }, + { + "auxiliary_loss_clip": 0.06468567, + "auxiliary_loss_mlp": 0.01269061, + "balance_loss_clip": 0.06300917, + "balance_loss_mlp": 0.01255978, + "epoch": 0.413768224860965, + "flos": 13339013097600.0, + "grad_norm": 1.7206029499163673, + "language_loss": 0.81694102, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.89431733, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6882, + "time_per_iteration": 2.572300672531128 + }, + { + "auxiliary_loss_clip": 0.06484764, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06306723, + "balance_loss_mlp": 0.0125776, + "epoch": 0.41382834811363295, + "flos": 20820306965760.0, + "grad_norm": 2.0204096459019176, + "language_loss": 0.69182575, + "learning_rate": 2.643608785656077e-06, + "loss": 0.76941192, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16088867, + "step": 6883, + "time_per_iteration": 2.5611510276794434 + }, + { + "auxiliary_loss_clip": 0.06472149, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06297622, + "balance_loss_mlp": 0.01255061, + "epoch": 0.4138884713663009, + "flos": 20673293777280.0, + "grad_norm": 2.0786241324697, + "language_loss": 0.75945485, + "learning_rate": 2.643240028730663e-06, + "loss": 0.83685786, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13092041, + "step": 6884, + "time_per_iteration": 2.5788567066192627 + }, + { + "auxiliary_loss_clip": 0.06477202, + "auxiliary_loss_mlp": 0.01273717, + "balance_loss_clip": 0.06298974, + "balance_loss_mlp": 0.01260008, + "epoch": 0.4139485946189689, + "flos": 29063808299520.0, + "grad_norm": 3.0401310083666444, + "language_loss": 0.76198518, + "learning_rate": 2.642871247413523e-06, + "loss": 0.83949435, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13720703, + "step": 6885, + "time_per_iteration": 2.5964529514312744 + }, + { + "auxiliary_loss_clip": 0.06475228, + "auxiliary_loss_mlp": 0.01270635, + "balance_loss_clip": 0.06299268, + "balance_loss_mlp": 0.01256187, + "epoch": 0.41400871787163684, + "flos": 24432605608320.0, + "grad_norm": 1.9051304938208142, + "language_loss": 0.70031226, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14447021, + "step": 6886, + "time_per_iteration": 4.101384878158569 + }, + { + "auxiliary_loss_clip": 0.06475122, + "auxiliary_loss_mlp": 0.01275658, + "balance_loss_clip": 0.06297341, + "balance_loss_mlp": 0.01260423, + "epoch": 0.4140688411243048, + "flos": 19470172527360.0, + "grad_norm": 1.459976196778311, + "language_loss": 0.75538456, + "learning_rate": 2.642133611660002e-06, + "loss": 0.83289236, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15234375, + "step": 6887, + "time_per_iteration": 2.5979294776916504 + }, + { + "auxiliary_loss_clip": 0.06468056, + "auxiliary_loss_mlp": 0.01273257, + "balance_loss_clip": 0.06294202, + "balance_loss_mlp": 0.0125916, + "epoch": 0.4141289643769728, + "flos": 19319008561920.0, + "grad_norm": 2.153365375528394, + "language_loss": 0.70707798, + "learning_rate": 2.641764757251592e-06, + "loss": 0.78449106, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14099121, + "step": 6888, + "time_per_iteration": 4.008386850357056 + }, + { + "auxiliary_loss_clip": 0.06466109, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.0629206, + "balance_loss_mlp": 0.0125863, + "epoch": 0.41418908762964074, + "flos": 16732448075520.0, + "grad_norm": 2.015209624353795, + "language_loss": 0.76631236, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.84370446, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14477539, + "step": 6889, + "time_per_iteration": 2.5270447731018066 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.012722, + "balance_loss_clip": 0.06294381, + "balance_loss_mlp": 0.01258628, + "epoch": 0.41424921088230876, + "flos": 25303112876160.0, + "grad_norm": 1.5878983493356928, + "language_loss": 0.80245477, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.87983751, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13568115, + "step": 6890, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.06465066, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01258592, + "epoch": 0.4143093341349767, + "flos": 20966984737920.0, + "grad_norm": 1.4631338633868025, + "language_loss": 0.74175858, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.81914544, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15014648, + "step": 6891, + "time_per_iteration": 2.5313403606414795 + }, + { + "auxiliary_loss_clip": 0.06475316, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01253283, + "epoch": 0.4143694573876447, + "flos": 22024182902400.0, + "grad_norm": 2.801103384820577, + "language_loss": 0.84378529, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.92123371, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16247559, + "step": 6892, + "time_per_iteration": 3.9777607917785645 + }, + { + "auxiliary_loss_clip": 0.06465086, + "auxiliary_loss_mlp": 0.01270368, + "balance_loss_clip": 0.06295982, + "balance_loss_mlp": 0.01257339, + "epoch": 0.41442958064031266, + "flos": 35705761418880.0, + "grad_norm": 1.735816743811137, + "language_loss": 0.70161885, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7789734, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13037109, + "step": 6893, + "time_per_iteration": 2.6909854412078857 + }, + { + "auxiliary_loss_clip": 0.06467048, + "auxiliary_loss_mlp": 0.01267192, + "balance_loss_clip": 0.0629535, + "balance_loss_mlp": 0.01253799, + "epoch": 0.4144897038929806, + "flos": 28301391198720.0, + "grad_norm": 1.3940088969507989, + "language_loss": 0.73223269, + "learning_rate": 2.639551120239279e-06, + "loss": 0.80957508, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13378906, + "step": 6894, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06476665, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.0125867, + "epoch": 0.4145498271456486, + "flos": 11651568848640.0, + "grad_norm": 2.440609351676066, + "language_loss": 0.62663507, + "learning_rate": 2.63918209577416e-06, + "loss": 0.7041353, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14697266, + "step": 6895, + "time_per_iteration": 2.471320390701294 + }, + { + "auxiliary_loss_clip": 0.0646576, + "auxiliary_loss_mlp": 0.01272394, + "balance_loss_clip": 0.06296334, + "balance_loss_mlp": 0.01258589, + "epoch": 0.41460995039831655, + "flos": 27243061004160.0, + "grad_norm": 3.24758428503537, + "language_loss": 0.70684588, + "learning_rate": 2.638813047071192e-06, + "loss": 0.78422737, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13806152, + "step": 6896, + "time_per_iteration": 2.5871524810791016 + }, + { + "auxiliary_loss_clip": 0.06475289, + "auxiliary_loss_mlp": 0.01275214, + "balance_loss_clip": 0.06299431, + "balance_loss_mlp": 0.01260164, + "epoch": 0.4146700736509845, + "flos": 25929627454080.0, + "grad_norm": 1.8920871134817128, + "language_loss": 0.73144394, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.80894893, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15057373, + "step": 6897, + "time_per_iteration": 4.0778656005859375 + }, + { + "auxiliary_loss_clip": 0.0646714, + "auxiliary_loss_mlp": 0.01271778, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01257371, + "epoch": 0.4147301969036525, + "flos": 26840441335680.0, + "grad_norm": 6.247593775216772, + "language_loss": 0.84715986, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.92454904, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14404297, + "step": 6898, + "time_per_iteration": 2.5603139400482178 + }, + { + "auxiliary_loss_clip": 0.06469397, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06293289, + "balance_loss_mlp": 0.01253678, + "epoch": 0.41479032015632045, + "flos": 20303727344640.0, + "grad_norm": 2.0378276609946098, + "language_loss": 0.74898899, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.82635784, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13812256, + "step": 6899, + "time_per_iteration": 2.53822660446167 + }, + { + "auxiliary_loss_clip": 0.06477535, + "auxiliary_loss_mlp": 0.01273796, + "balance_loss_clip": 0.06297705, + "balance_loss_mlp": 0.01258239, + "epoch": 0.4148504434089884, + "flos": 25272030211200.0, + "grad_norm": 2.0370175779228465, + "language_loss": 0.75786376, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.83537704, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15563965, + "step": 6900, + "time_per_iteration": 2.5547776222229004 + }, + { + "auxiliary_loss_clip": 0.06470095, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06298018, + "balance_loss_mlp": 0.01260057, + "epoch": 0.4149105666616564, + "flos": 12827087377920.0, + "grad_norm": 3.426788101109298, + "language_loss": 0.80153453, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.87899375, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15783691, + "step": 6901, + "time_per_iteration": 2.5724570751190186 + }, + { + "auxiliary_loss_clip": 0.06464257, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06292327, + "balance_loss_mlp": 0.01258791, + "epoch": 0.41497068991432434, + "flos": 16769526307200.0, + "grad_norm": 2.2871359145608507, + "language_loss": 0.70271528, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.78009164, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14593506, + "step": 6902, + "time_per_iteration": 2.518018960952759 + }, + { + "auxiliary_loss_clip": 0.06463319, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01255706, + "epoch": 0.4150308131669923, + "flos": 18006161990400.0, + "grad_norm": 2.0523680752477906, + "language_loss": 0.8405019, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.91784132, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14923096, + "step": 6903, + "time_per_iteration": 2.719252586364746 + }, + { + "auxiliary_loss_clip": 0.06478511, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06298795, + "balance_loss_mlp": 0.01254282, + "epoch": 0.41509093641966033, + "flos": 30052635932160.0, + "grad_norm": 2.3513516306772826, + "language_loss": 0.67960835, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.75710285, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16674805, + "step": 6904, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.06473922, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06295053, + "balance_loss_mlp": 0.01252678, + "epoch": 0.4151510596723283, + "flos": 24286892158080.0, + "grad_norm": 1.8668907258080212, + "language_loss": 0.77697861, + "learning_rate": 2.635490520350643e-06, + "loss": 0.85438967, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14508057, + "step": 6905, + "time_per_iteration": 2.6073246002197266 + }, + { + "auxiliary_loss_clip": 0.06477012, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06300149, + "balance_loss_mlp": 0.01255391, + "epoch": 0.41521118292499626, + "flos": 23482784851200.0, + "grad_norm": 2.106489831039321, + "language_loss": 0.68546331, + "learning_rate": 2.635121230039025e-06, + "loss": 0.76293135, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1439209, + "step": 6906, + "time_per_iteration": 2.5378260612487793 + }, + { + "auxiliary_loss_clip": 0.06470662, + "auxiliary_loss_mlp": 0.01269025, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01254839, + "epoch": 0.4152713061776642, + "flos": 22131728017920.0, + "grad_norm": 2.406599601104124, + "language_loss": 0.68275452, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.76015139, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14196777, + "step": 6907, + "time_per_iteration": 2.548020124435425 + }, + { + "auxiliary_loss_clip": 0.06477083, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.06301615, + "balance_loss_mlp": 0.01256342, + "epoch": 0.4153314294303322, + "flos": 21257740805760.0, + "grad_norm": 2.5393224991434398, + "language_loss": 0.77004838, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.84752274, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14013672, + "step": 6908, + "time_per_iteration": 2.52205753326416 + }, + { + "auxiliary_loss_clip": 0.0635362, + "auxiliary_loss_mlp": 0.01259834, + "balance_loss_clip": 0.06277395, + "balance_loss_mlp": 0.01256612, + "epoch": 0.41539155268300015, + "flos": 57939443527680.0, + "grad_norm": 0.769240592375345, + "language_loss": 0.64804208, + "learning_rate": 2.634013214657026e-06, + "loss": 0.72417659, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.03225708, + "step": 6909, + "time_per_iteration": 3.109095573425293 + }, + { + "auxiliary_loss_clip": 0.06469519, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4154516759356681, + "flos": 21909384408960.0, + "grad_norm": 1.4248669333769037, + "language_loss": 0.87550539, + "learning_rate": 2.633643828093996e-06, + "loss": 0.95291519, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13989258, + "step": 6910, + "time_per_iteration": 2.5253639221191406 + }, + { + "auxiliary_loss_clip": 0.06354217, + "auxiliary_loss_mlp": 0.01257534, + "balance_loss_clip": 0.0627715, + "balance_loss_mlp": 0.01254598, + "epoch": 0.4155117991883361, + "flos": 67852234702080.0, + "grad_norm": 0.8147918233574727, + "language_loss": 0.62098897, + "learning_rate": 2.633274417503128e-06, + "loss": 0.69710648, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02932739, + "step": 6911, + "time_per_iteration": 3.1515297889709473 + }, + { + "auxiliary_loss_clip": 0.06486405, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06302486, + "balance_loss_mlp": 0.01254393, + "epoch": 0.41557192244100405, + "flos": 14287869532800.0, + "grad_norm": 2.853367345352451, + "language_loss": 0.88092077, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.95848417, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15551758, + "step": 6912, + "time_per_iteration": 2.5334529876708984 + }, + { + "auxiliary_loss_clip": 0.06480967, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06303312, + "balance_loss_mlp": 0.01253451, + "epoch": 0.415632045693672, + "flos": 24468803372160.0, + "grad_norm": 2.9756004279328945, + "language_loss": 0.63331664, + "learning_rate": 2.632535524293914e-06, + "loss": 0.71080673, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14581299, + "step": 6913, + "time_per_iteration": 2.547567129135132 + }, + { + "auxiliary_loss_clip": 0.06471419, + "auxiliary_loss_mlp": 0.01270035, + "balance_loss_clip": 0.06297998, + "balance_loss_mlp": 0.01256249, + "epoch": 0.41569216894634, + "flos": 20120600246400.0, + "grad_norm": 1.832366261637427, + "language_loss": 0.75605875, + "learning_rate": 2.632166041703586e-06, + "loss": 0.83347332, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13787842, + "step": 6914, + "time_per_iteration": 2.5624208450317383 + }, + { + "auxiliary_loss_clip": 0.06479953, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.06302451, + "balance_loss_mlp": 0.01257897, + "epoch": 0.41575229219900794, + "flos": 23804497802880.0, + "grad_norm": 2.012818087979969, + "language_loss": 0.87586981, + "learning_rate": 2.631796535141458e-06, + "loss": 0.95340854, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16015625, + "step": 6915, + "time_per_iteration": 2.545825481414795 + }, + { + "auxiliary_loss_clip": 0.06478707, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06302266, + "balance_loss_mlp": 0.01259273, + "epoch": 0.4158124154516759, + "flos": 23114224667520.0, + "grad_norm": 2.419843437778294, + "language_loss": 0.71605122, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.79356909, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13818359, + "step": 6916, + "time_per_iteration": 2.59429669380188 + }, + { + "auxiliary_loss_clip": 0.06477056, + "auxiliary_loss_mlp": 0.01267217, + "balance_loss_clip": 0.06298968, + "balance_loss_mlp": 0.01252208, + "epoch": 0.41587253870434393, + "flos": 24249771999360.0, + "grad_norm": 1.4428572529082921, + "language_loss": 0.71931446, + "learning_rate": 2.631057450157852e-06, + "loss": 0.7967571, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.15002441, + "step": 6917, + "time_per_iteration": 2.56001877784729 + }, + { + "auxiliary_loss_clip": 0.06469631, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06294615, + "balance_loss_mlp": 0.01253089, + "epoch": 0.4159326619570119, + "flos": 23888926391040.0, + "grad_norm": 4.142003179261072, + "language_loss": 0.80924189, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.88661504, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14599609, + "step": 6918, + "time_per_iteration": 2.6182031631469727 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06299014, + "balance_loss_mlp": 0.01255, + "epoch": 0.41599278520967986, + "flos": 40636315221120.0, + "grad_norm": 1.446116397311604, + "language_loss": 0.70620072, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.78370392, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.1583252, + "step": 6919, + "time_per_iteration": 2.7974801063537598 + }, + { + "auxiliary_loss_clip": 0.06470604, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.06293205, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4160529084623478, + "flos": 18228757161600.0, + "grad_norm": 1.8139422387612383, + "language_loss": 0.81669927, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.89411485, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15258789, + "step": 6920, + "time_per_iteration": 2.652277708053589 + }, + { + "auxiliary_loss_clip": 0.06476951, + "auxiliary_loss_mlp": 0.01273828, + "balance_loss_clip": 0.06298292, + "balance_loss_mlp": 0.01258724, + "epoch": 0.4161130317150158, + "flos": 13666973178240.0, + "grad_norm": 2.775667367204969, + "language_loss": 0.65528631, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.73279405, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15100098, + "step": 6921, + "time_per_iteration": 2.543761968612671 + }, + { + "auxiliary_loss_clip": 0.0647813, + "auxiliary_loss_mlp": 0.01273522, + "balance_loss_clip": 0.06301805, + "balance_loss_mlp": 0.01258168, + "epoch": 0.41617315496768376, + "flos": 16183779540480.0, + "grad_norm": 2.038581093377189, + "language_loss": 0.80900288, + "learning_rate": 2.629209319173274e-06, + "loss": 0.88651937, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15368652, + "step": 6922, + "time_per_iteration": 2.5606656074523926 + }, + { + "auxiliary_loss_clip": 0.06480581, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.06301428, + "balance_loss_mlp": 0.01255163, + "epoch": 0.4162332782203517, + "flos": 26220467376000.0, + "grad_norm": 1.63600266107907, + "language_loss": 0.6809119, + "learning_rate": 2.628839621341247e-06, + "loss": 0.7584219, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15258789, + "step": 6923, + "time_per_iteration": 2.5789952278137207 + }, + { + "auxiliary_loss_clip": 0.06474873, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06299335, + "balance_loss_mlp": 0.01254152, + "epoch": 0.4162934014730197, + "flos": 28191540096000.0, + "grad_norm": 1.91165548300248, + "language_loss": 0.76249051, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.83993888, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15795898, + "step": 6924, + "time_per_iteration": 2.6209194660186768 + }, + { + "auxiliary_loss_clip": 0.06473987, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.06295989, + "balance_loss_mlp": 0.01257759, + "epoch": 0.41635352472568765, + "flos": 19871492457600.0, + "grad_norm": 1.5667233765254498, + "language_loss": 0.73101473, + "learning_rate": 2.62810015415423e-06, + "loss": 0.80847669, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14465332, + "step": 6925, + "time_per_iteration": 2.5133748054504395 + }, + { + "auxiliary_loss_clip": 0.0646892, + "auxiliary_loss_mlp": 0.01268263, + "balance_loss_clip": 0.06293461, + "balance_loss_mlp": 0.0125391, + "epoch": 0.4164136479783556, + "flos": 14939974333440.0, + "grad_norm": 2.1337011873068445, + "language_loss": 0.84242827, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.91980004, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14361572, + "step": 6926, + "time_per_iteration": 3.923924446105957 + }, + { + "auxiliary_loss_clip": 0.06465639, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06292935, + "balance_loss_mlp": 0.01254574, + "epoch": 0.4164737712310236, + "flos": 21763251688320.0, + "grad_norm": 1.56658623429888, + "language_loss": 0.86570489, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.94304395, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13696289, + "step": 6927, + "time_per_iteration": 3.9643561840057373 + }, + { + "auxiliary_loss_clip": 0.06468353, + "auxiliary_loss_mlp": 0.01275736, + "balance_loss_clip": 0.06293458, + "balance_loss_mlp": 0.01260287, + "epoch": 0.41653389448369155, + "flos": 20746318210560.0, + "grad_norm": 2.3770101780600976, + "language_loss": 0.72583216, + "learning_rate": 2.626990774776604e-06, + "loss": 0.80327296, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15447998, + "step": 6928, + "time_per_iteration": 2.5111186504364014 + }, + { + "auxiliary_loss_clip": 0.06468435, + "auxiliary_loss_mlp": 0.01272442, + "balance_loss_clip": 0.062929, + "balance_loss_mlp": 0.0125735, + "epoch": 0.4165940177363595, + "flos": 24979848624000.0, + "grad_norm": 1.9381497388164433, + "language_loss": 0.78399348, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.86140227, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15087891, + "step": 6929, + "time_per_iteration": 2.6066014766693115 + }, + { + "auxiliary_loss_clip": 0.0646543, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06291193, + "balance_loss_mlp": 0.01253842, + "epoch": 0.41665414098902753, + "flos": 20527957670400.0, + "grad_norm": 1.8432748306405895, + "language_loss": 0.71154583, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.78888059, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14190674, + "step": 6930, + "time_per_iteration": 2.5052478313446045 + }, + { + "auxiliary_loss_clip": 0.06468388, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06291626, + "balance_loss_mlp": 0.01255067, + "epoch": 0.4167142642416955, + "flos": 19689078119040.0, + "grad_norm": 1.7731266468983917, + "language_loss": 0.81487417, + "learning_rate": 2.625881181419007e-06, + "loss": 0.89225209, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14355469, + "step": 6931, + "time_per_iteration": 2.555651903152466 + }, + { + "auxiliary_loss_clip": 0.0646255, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.06289293, + "balance_loss_mlp": 0.01255233, + "epoch": 0.41677438749436346, + "flos": 23769641704320.0, + "grad_norm": 2.211036345176988, + "language_loss": 0.79310054, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.87043214, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15362549, + "step": 6932, + "time_per_iteration": 4.05314040184021 + }, + { + "auxiliary_loss_clip": 0.06464541, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06289106, + "balance_loss_mlp": 0.01254752, + "epoch": 0.41683451074703143, + "flos": 30418051587840.0, + "grad_norm": 2.244908394273299, + "language_loss": 0.82220912, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.89954913, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14727783, + "step": 6933, + "time_per_iteration": 2.715542793273926 + }, + { + "auxiliary_loss_clip": 0.06467043, + "auxiliary_loss_mlp": 0.01272262, + "balance_loss_clip": 0.06287256, + "balance_loss_mlp": 0.01257963, + "epoch": 0.4168946339996994, + "flos": 21513137650560.0, + "grad_norm": 1.8583396237684835, + "language_loss": 0.76938605, + "learning_rate": 2.624771374460121e-06, + "loss": 0.84677911, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14300537, + "step": 6934, + "time_per_iteration": 2.630192279815674 + }, + { + "auxiliary_loss_clip": 0.06469443, + "auxiliary_loss_mlp": 0.0126919, + "balance_loss_clip": 0.06293288, + "balance_loss_mlp": 0.01254586, + "epoch": 0.41695475725236736, + "flos": 17644310133120.0, + "grad_norm": 2.110423315639561, + "language_loss": 0.67164314, + "learning_rate": 2.624401391405668e-06, + "loss": 0.74902946, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14599609, + "step": 6935, + "time_per_iteration": 2.484464168548584 + }, + { + "auxiliary_loss_clip": 0.0646461, + "auxiliary_loss_mlp": 0.01269491, + "balance_loss_clip": 0.06289718, + "balance_loss_mlp": 0.01254458, + "epoch": 0.4170148805050353, + "flos": 15674285589120.0, + "grad_norm": 2.4566205528754033, + "language_loss": 0.7383365, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.81567752, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15039062, + "step": 6936, + "time_per_iteration": 3.9171254634857178 + }, + { + "auxiliary_loss_clip": 0.06457968, + "auxiliary_loss_mlp": 0.01275405, + "balance_loss_clip": 0.06285361, + "balance_loss_mlp": 0.01262184, + "epoch": 0.4170750037577033, + "flos": 15164623929600.0, + "grad_norm": 4.126334603160969, + "language_loss": 0.74596691, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.8233006, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.13226318, + "step": 6937, + "time_per_iteration": 2.5286996364593506 + }, + { + "auxiliary_loss_clip": 0.06462386, + "auxiliary_loss_mlp": 0.01273752, + "balance_loss_clip": 0.06289354, + "balance_loss_mlp": 0.01259727, + "epoch": 0.41713512701037125, + "flos": 28776029051520.0, + "grad_norm": 1.4497703642581674, + "language_loss": 0.84985441, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.92721575, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14031982, + "step": 6938, + "time_per_iteration": 2.594024419784546 + }, + { + "auxiliary_loss_clip": 0.06468149, + "auxiliary_loss_mlp": 0.01274736, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01259114, + "epoch": 0.4171952502630392, + "flos": 28264564529280.0, + "grad_norm": 1.8332960409763566, + "language_loss": 0.74288213, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.82031095, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15618896, + "step": 6939, + "time_per_iteration": 2.628620147705078 + }, + { + "auxiliary_loss_clip": 0.06462568, + "auxiliary_loss_mlp": 0.01269134, + "balance_loss_clip": 0.06289193, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4172553735157072, + "flos": 24578612547840.0, + "grad_norm": 1.6044361894616455, + "language_loss": 0.75275123, + "learning_rate": 2.622551121253579e-06, + "loss": 0.83006829, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14331055, + "step": 6940, + "time_per_iteration": 2.55566143989563 + }, + { + "auxiliary_loss_clip": 0.06464436, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.0628769, + "balance_loss_mlp": 0.01255338, + "epoch": 0.41731549676837515, + "flos": 27051967768320.0, + "grad_norm": 1.7023568307679129, + "language_loss": 0.71513987, + "learning_rate": 2.622180996345424e-06, + "loss": 0.79247934, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1416626, + "step": 6941, + "time_per_iteration": 2.628779649734497 + }, + { + "auxiliary_loss_clip": 0.06464395, + "auxiliary_loss_mlp": 0.0127035, + "balance_loss_clip": 0.06285797, + "balance_loss_mlp": 0.01255342, + "epoch": 0.4173756200210431, + "flos": 28400173562880.0, + "grad_norm": 3.007655990717308, + "language_loss": 0.73701853, + "learning_rate": 2.621810847844104e-06, + "loss": 0.81436592, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15008545, + "step": 6942, + "time_per_iteration": 2.579085350036621 + }, + { + "auxiliary_loss_clip": 0.06469673, + "auxiliary_loss_mlp": 0.01269256, + "balance_loss_clip": 0.06289446, + "balance_loss_mlp": 0.01254587, + "epoch": 0.41743574327371114, + "flos": 22526968527360.0, + "grad_norm": 2.366625341311562, + "language_loss": 0.73327738, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.81066668, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14672852, + "step": 6943, + "time_per_iteration": 2.5890767574310303 + }, + { + "auxiliary_loss_clip": 0.06466928, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06290001, + "balance_loss_mlp": 0.01252998, + "epoch": 0.4174958665263791, + "flos": 30120587193600.0, + "grad_norm": 2.3204117950268817, + "language_loss": 0.63901597, + "learning_rate": 2.621070480118111e-06, + "loss": 0.71635759, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14245605, + "step": 6944, + "time_per_iteration": 2.586949586868286 + }, + { + "auxiliary_loss_clip": 0.06466375, + "auxiliary_loss_mlp": 0.01271741, + "balance_loss_clip": 0.0628995, + "balance_loss_mlp": 0.0125684, + "epoch": 0.41755598977904707, + "flos": 25270227348480.0, + "grad_norm": 11.202050930016789, + "language_loss": 0.70295048, + "learning_rate": 2.620700260921513e-06, + "loss": 0.78033161, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14898682, + "step": 6945, + "time_per_iteration": 2.6323587894439697 + }, + { + "auxiliary_loss_clip": 0.06460019, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.06285217, + "balance_loss_mlp": 0.01255219, + "epoch": 0.41761611303171503, + "flos": 19834707715200.0, + "grad_norm": 1.6201275470111005, + "language_loss": 0.8079865, + "learning_rate": 2.620330018187899e-06, + "loss": 0.88529164, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.152771, + "step": 6946, + "time_per_iteration": 2.5303776264190674 + }, + { + "auxiliary_loss_clip": 0.064612, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06288694, + "balance_loss_mlp": 0.0125569, + "epoch": 0.417676236284383, + "flos": 15528655992960.0, + "grad_norm": 2.2948583781036027, + "language_loss": 0.77726543, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.85457456, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14038086, + "step": 6947, + "time_per_iteration": 2.5844216346740723 + }, + { + "auxiliary_loss_clip": 0.06465282, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06289726, + "balance_loss_mlp": 0.01252844, + "epoch": 0.41773635953705096, + "flos": 32532531770880.0, + "grad_norm": 1.6041388362904736, + "language_loss": 0.71914941, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.79648077, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15014648, + "step": 6948, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.06460577, + "auxiliary_loss_mlp": 0.01271252, + "balance_loss_clip": 0.06288102, + "balance_loss_mlp": 0.01256303, + "epoch": 0.4177964827897189, + "flos": 23447719117440.0, + "grad_norm": 1.868509756028272, + "language_loss": 0.76914591, + "learning_rate": 2.619219148905362e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14941406, + "step": 6949, + "time_per_iteration": 2.5791566371917725 + }, + { + "auxiliary_loss_clip": 0.06466889, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06288934, + "balance_loss_mlp": 0.01255476, + "epoch": 0.4178566060423869, + "flos": 22755768900480.0, + "grad_norm": 1.6605109484051197, + "language_loss": 0.81921285, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.89658785, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15148926, + "step": 6950, + "time_per_iteration": 2.550705909729004 + }, + { + "auxiliary_loss_clip": 0.06457172, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.062898, + "balance_loss_mlp": 0.01253319, + "epoch": 0.41791672929505486, + "flos": 26040233243520.0, + "grad_norm": 1.3162845057727355, + "language_loss": 0.76396811, + "learning_rate": 2.618478451956007e-06, + "loss": 0.84120584, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13275146, + "step": 6951, + "time_per_iteration": 2.6047768592834473 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271966, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01256988, + "epoch": 0.4179768525477228, + "flos": 19574028063360.0, + "grad_norm": 1.8780871701618023, + "language_loss": 0.72956991, + "learning_rate": 2.61810806829516e-06, + "loss": 0.80701125, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.14978027, + "step": 6952, + "time_per_iteration": 2.498915910720825 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01270698, + "balance_loss_clip": 0.06290505, + "balance_loss_mlp": 0.01256286, + "epoch": 0.4180369758003908, + "flos": 17789352750720.0, + "grad_norm": 3.5208466342014444, + "language_loss": 0.72192442, + "learning_rate": 2.617737661195593e-06, + "loss": 0.79930753, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14428711, + "step": 6953, + "time_per_iteration": 2.5105345249176025 + }, + { + "auxiliary_loss_clip": 0.06460451, + "auxiliary_loss_mlp": 0.01269376, + "balance_loss_clip": 0.0629045, + "balance_loss_mlp": 0.01255143, + "epoch": 0.41809709905305875, + "flos": 20967152446080.0, + "grad_norm": 1.9107321624636409, + "language_loss": 0.76574248, + "learning_rate": 2.617367230671353e-06, + "loss": 0.8430407, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14233398, + "step": 6954, + "time_per_iteration": 2.5424091815948486 + }, + { + "auxiliary_loss_clip": 0.06461184, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.06286837, + "balance_loss_mlp": 0.01255866, + "epoch": 0.4181572223057267, + "flos": 22024099048320.0, + "grad_norm": 2.2757291119189693, + "language_loss": 0.84719867, + "learning_rate": 2.616996776736485e-06, + "loss": 0.92452419, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15490723, + "step": 6955, + "time_per_iteration": 2.5423128604888916 + }, + { + "auxiliary_loss_clip": 0.06460696, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06289047, + "balance_loss_mlp": 0.01255001, + "epoch": 0.4182173455583947, + "flos": 26251969311360.0, + "grad_norm": 1.5480485879739414, + "language_loss": 0.83159053, + "learning_rate": 2.616626299405037e-06, + "loss": 0.90889192, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14453125, + "step": 6956, + "time_per_iteration": 2.5377910137176514 + }, + { + "auxiliary_loss_clip": 0.06470253, + "auxiliary_loss_mlp": 0.01272951, + "balance_loss_clip": 0.06292067, + "balance_loss_mlp": 0.01258163, + "epoch": 0.4182774688110627, + "flos": 14796566870400.0, + "grad_norm": 2.2161530875987205, + "language_loss": 0.72170293, + "learning_rate": 2.616255798691059e-06, + "loss": 0.79913497, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14801025, + "step": 6957, + "time_per_iteration": 2.5512890815734863 + }, + { + "auxiliary_loss_clip": 0.06465964, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01258745, + "epoch": 0.41833759206373067, + "flos": 20418190421760.0, + "grad_norm": 1.9534240722910163, + "language_loss": 0.75827634, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.83566499, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14147949, + "step": 6958, + "time_per_iteration": 2.5025634765625 + }, + { + "auxiliary_loss_clip": 0.06461923, + "auxiliary_loss_mlp": 0.01277567, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01262505, + "epoch": 0.41839771531639863, + "flos": 23662557786240.0, + "grad_norm": 1.62032760192947, + "language_loss": 0.77450699, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.85190189, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15075684, + "step": 6959, + "time_per_iteration": 2.5644967555999756 + }, + { + "auxiliary_loss_clip": 0.06462178, + "auxiliary_loss_mlp": 0.01275343, + "balance_loss_clip": 0.06288128, + "balance_loss_mlp": 0.01259423, + "epoch": 0.4184578385690666, + "flos": 19760006200320.0, + "grad_norm": 1.8483570445524284, + "language_loss": 0.77022827, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.84760344, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15924072, + "step": 6960, + "time_per_iteration": 2.5269885063171387 + }, + { + "auxiliary_loss_clip": 0.06453702, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01255552, + "epoch": 0.41851796182173456, + "flos": 20199578319360.0, + "grad_norm": 2.3993036704472717, + "language_loss": 0.75495946, + "learning_rate": 2.614773562290835e-06, + "loss": 0.83218956, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13769531, + "step": 6961, + "time_per_iteration": 2.571563243865967 + }, + { + "auxiliary_loss_clip": 0.06367883, + "auxiliary_loss_mlp": 0.0126221, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01259577, + "epoch": 0.41857808507440253, + "flos": 59038331898240.0, + "grad_norm": 0.8546546360875583, + "language_loss": 0.54730451, + "learning_rate": 2.61440294487496e-06, + "loss": 0.62360549, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02635193, + "step": 6962, + "time_per_iteration": 3.0928165912628174 + }, + { + "auxiliary_loss_clip": 0.06468143, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01256423, + "epoch": 0.4186382083270705, + "flos": 18484740984960.0, + "grad_norm": 2.146654503648622, + "language_loss": 0.8523612, + "learning_rate": 2.614032304160864e-06, + "loss": 0.92974788, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14093018, + "step": 6963, + "time_per_iteration": 2.4891340732574463 + }, + { + "auxiliary_loss_clip": 0.06465001, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 0.06290912, + "balance_loss_mlp": 0.01256453, + "epoch": 0.41869833157973846, + "flos": 21584988126720.0, + "grad_norm": 1.5636714712462336, + "language_loss": 0.70520425, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.78256667, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14788818, + "step": 6964, + "time_per_iteration": 2.6037514209747314 + }, + { + "auxiliary_loss_clip": 0.06460649, + "auxiliary_loss_mlp": 0.01270666, + "balance_loss_clip": 0.06289357, + "balance_loss_mlp": 0.01257034, + "epoch": 0.4187584548324064, + "flos": 35526156192000.0, + "grad_norm": 2.108688626905877, + "language_loss": 0.71782613, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.79513931, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1362915, + "step": 6965, + "time_per_iteration": 4.077980279922485 + }, + { + "auxiliary_loss_clip": 0.06453691, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06286767, + "balance_loss_mlp": 0.01257173, + "epoch": 0.4188185780850744, + "flos": 18660950121600.0, + "grad_norm": 1.7018758391145836, + "language_loss": 0.72080678, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.79804349, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.12792969, + "step": 6966, + "time_per_iteration": 2.5740551948547363 + }, + { + "auxiliary_loss_clip": 0.06466748, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06288405, + "balance_loss_mlp": 0.0125625, + "epoch": 0.41887870133774235, + "flos": 40342959676800.0, + "grad_norm": 4.506306240026155, + "language_loss": 0.71212667, + "learning_rate": 2.612549508603375e-06, + "loss": 0.78950995, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15338135, + "step": 6967, + "time_per_iteration": 4.179578065872192 + }, + { + "auxiliary_loss_clip": 0.0636977, + "auxiliary_loss_mlp": 0.01256477, + "balance_loss_clip": 0.06291805, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4189388245904103, + "flos": 61388083946880.0, + "grad_norm": 0.6570416522373307, + "language_loss": 0.45988834, + "learning_rate": 2.612178751609011e-06, + "loss": 0.53615081, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02500916, + "step": 6968, + "time_per_iteration": 3.1288843154907227 + }, + { + "auxiliary_loss_clip": 0.06467855, + "auxiliary_loss_mlp": 0.01273397, + "balance_loss_clip": 0.06290668, + "balance_loss_mlp": 0.01257685, + "epoch": 0.4189989478430783, + "flos": 28222371198720.0, + "grad_norm": 1.7081344299750898, + "language_loss": 0.75350499, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.8309176, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15710449, + "step": 6969, + "time_per_iteration": 2.5936050415039062 + }, + { + "auxiliary_loss_clip": 0.06460407, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06287546, + "balance_loss_mlp": 0.01258365, + "epoch": 0.4190590710957463, + "flos": 24571820367360.0, + "grad_norm": 1.8003201263588986, + "language_loss": 0.80904478, + "learning_rate": 2.611437167992705e-06, + "loss": 0.88637358, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14099121, + "step": 6970, + "time_per_iteration": 2.5366463661193848 + }, + { + "auxiliary_loss_clip": 0.06461529, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06291033, + "balance_loss_mlp": 0.01257594, + "epoch": 0.41911919434841427, + "flos": 21732504439680.0, + "grad_norm": 2.0427263912189098, + "language_loss": 0.83781362, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.91514409, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13922119, + "step": 6971, + "time_per_iteration": 4.038029909133911 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01277453, + "balance_loss_clip": 0.06292501, + "balance_loss_mlp": 0.01262766, + "epoch": 0.41917931760108224, + "flos": 17607064193280.0, + "grad_norm": 1.8913036217137231, + "language_loss": 0.74956995, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.82693458, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14685059, + "step": 6972, + "time_per_iteration": 2.5450055599212646 + }, + { + "auxiliary_loss_clip": 0.06463002, + "auxiliary_loss_mlp": 0.01269114, + "balance_loss_clip": 0.06289829, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4192394408537502, + "flos": 37825943679360.0, + "grad_norm": 1.6425528401757075, + "language_loss": 0.73133683, + "learning_rate": 2.610324618710212e-06, + "loss": 0.808658, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13684082, + "step": 6973, + "time_per_iteration": 2.6852450370788574 + }, + { + "auxiliary_loss_clip": 0.06474721, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06293075, + "balance_loss_mlp": 0.01257272, + "epoch": 0.41929956410641817, + "flos": 23113637688960.0, + "grad_norm": 1.8862458299453466, + "language_loss": 0.74830127, + "learning_rate": 2.609953722643489e-06, + "loss": 0.82576567, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14453125, + "step": 6974, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06460831, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0628831, + "balance_loss_mlp": 0.01252744, + "epoch": 0.41935968735908613, + "flos": 22530448471680.0, + "grad_norm": 1.902296645802657, + "language_loss": 0.73513019, + "learning_rate": 2.609582803447259e-06, + "loss": 0.81240016, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13421631, + "step": 6975, + "time_per_iteration": 2.4907052516937256 + }, + { + "auxiliary_loss_clip": 0.06461257, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06293045, + "balance_loss_mlp": 0.01256172, + "epoch": 0.4194198106117541, + "flos": 26877771129600.0, + "grad_norm": 1.432926445179704, + "language_loss": 0.80820251, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.8855176, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14086914, + "step": 6976, + "time_per_iteration": 4.015337705612183 + }, + { + "auxiliary_loss_clip": 0.06465544, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06291896, + "balance_loss_mlp": 0.01252174, + "epoch": 0.41947993386442206, + "flos": 19908696470400.0, + "grad_norm": 6.530638917868016, + "language_loss": 0.67613435, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.75344729, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13592529, + "step": 6977, + "time_per_iteration": 2.5907933712005615 + }, + { + "auxiliary_loss_clip": 0.06466645, + "auxiliary_loss_mlp": 0.012707, + "balance_loss_clip": 0.06291468, + "balance_loss_mlp": 0.01257104, + "epoch": 0.41954005711709, + "flos": 17389584120960.0, + "grad_norm": 2.431968733580352, + "language_loss": 0.8152501, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.89262354, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.13604736, + "step": 6978, + "time_per_iteration": 2.5534939765930176 + }, + { + "auxiliary_loss_clip": 0.06466036, + "auxiliary_loss_mlp": 0.01269917, + "balance_loss_clip": 0.06288658, + "balance_loss_mlp": 0.012561, + "epoch": 0.419600180369758, + "flos": 25009254207360.0, + "grad_norm": 1.7617066668945498, + "language_loss": 0.83044857, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.90780807, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.13824463, + "step": 6979, + "time_per_iteration": 2.5991194248199463 + }, + { + "auxiliary_loss_clip": 0.06464113, + "auxiliary_loss_mlp": 0.01266396, + "balance_loss_clip": 0.0629217, + "balance_loss_mlp": 0.01253313, + "epoch": 0.41966030362242596, + "flos": 17389458339840.0, + "grad_norm": 2.43413237172065, + "language_loss": 0.83727056, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.9145757, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13079834, + "step": 6980, + "time_per_iteration": 2.4868295192718506 + }, + { + "auxiliary_loss_clip": 0.06469644, + "auxiliary_loss_mlp": 0.01274217, + "balance_loss_clip": 0.06293017, + "balance_loss_mlp": 0.01260061, + "epoch": 0.4197204268750939, + "flos": 22161427090560.0, + "grad_norm": 2.953064628504675, + "language_loss": 0.79802233, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.87546098, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14172363, + "step": 6981, + "time_per_iteration": 2.572671890258789 + }, + { + "auxiliary_loss_clip": 0.06461273, + "auxiliary_loss_mlp": 0.01268979, + "balance_loss_clip": 0.06293882, + "balance_loss_mlp": 0.01256152, + "epoch": 0.4197805501277619, + "flos": 22089534687360.0, + "grad_norm": 1.8874441419731374, + "language_loss": 0.84437835, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.92168081, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.12823486, + "step": 6982, + "time_per_iteration": 2.515719413757324 + }, + { + "auxiliary_loss_clip": 0.06468281, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.0125844, + "epoch": 0.4198406733804299, + "flos": 26439372967680.0, + "grad_norm": 2.198770889515785, + "language_loss": 0.57229298, + "learning_rate": 2.606614618903214e-06, + "loss": 0.64970195, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1418457, + "step": 6983, + "time_per_iteration": 2.589905023574829 + }, + { + "auxiliary_loss_clip": 0.06459898, + "auxiliary_loss_mlp": 0.01268511, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01255922, + "epoch": 0.4199007966330979, + "flos": 12535870112640.0, + "grad_norm": 1.9546340544122036, + "language_loss": 0.82430601, + "learning_rate": 2.606243492174471e-06, + "loss": 0.90159011, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1260376, + "step": 6984, + "time_per_iteration": 2.4837801456451416 + }, + { + "auxiliary_loss_clip": 0.06465998, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.06293395, + "balance_loss_mlp": 0.01257698, + "epoch": 0.41996091988576584, + "flos": 21769498817280.0, + "grad_norm": 1.6572496297875159, + "language_loss": 0.79565531, + "learning_rate": 2.605872342456914e-06, + "loss": 0.87302184, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.12963867, + "step": 6985, + "time_per_iteration": 2.558382511138916 + }, + { + "auxiliary_loss_clip": 0.06471538, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06292171, + "balance_loss_mlp": 0.01254425, + "epoch": 0.4200210431384338, + "flos": 26549182143360.0, + "grad_norm": 1.7232010674189546, + "language_loss": 0.78413719, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.86154521, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14831543, + "step": 6986, + "time_per_iteration": 2.557201385498047 + }, + { + "auxiliary_loss_clip": 0.06457713, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06290729, + "balance_loss_mlp": 0.0125171, + "epoch": 0.42008116639110177, + "flos": 26802859979520.0, + "grad_norm": 1.5119871943534449, + "language_loss": 0.72772801, + "learning_rate": 2.605129974111655e-06, + "loss": 0.80494547, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.12310791, + "step": 6987, + "time_per_iteration": 2.590758800506592 + }, + { + "auxiliary_loss_clip": 0.06464639, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06291942, + "balance_loss_mlp": 0.01256994, + "epoch": 0.42014128964376973, + "flos": 32095433347200.0, + "grad_norm": 1.493413355723003, + "language_loss": 0.75077468, + "learning_rate": 2.604758755512104e-06, + "loss": 0.82812625, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13519287, + "step": 6988, + "time_per_iteration": 2.6159229278564453 + }, + { + "auxiliary_loss_clip": 0.064705, + "auxiliary_loss_mlp": 0.01272645, + "balance_loss_clip": 0.06293759, + "balance_loss_mlp": 0.01258256, + "epoch": 0.4202014128964377, + "flos": 26474061358080.0, + "grad_norm": 1.4960604967721163, + "language_loss": 0.7416907, + "learning_rate": 2.60438751398004e-06, + "loss": 0.81912208, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14385986, + "step": 6989, + "time_per_iteration": 2.6082265377044678 + }, + { + "auxiliary_loss_clip": 0.06467222, + "auxiliary_loss_mlp": 0.01268972, + "balance_loss_clip": 0.06291176, + "balance_loss_mlp": 0.0125413, + "epoch": 0.42026153614910566, + "flos": 13405287277440.0, + "grad_norm": 2.240751664581705, + "language_loss": 0.70939904, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.78676105, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14831543, + "step": 6990, + "time_per_iteration": 2.5301413536071777 + }, + { + "auxiliary_loss_clip": 0.06372039, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.06294142, + "balance_loss_mlp": 0.01259734, + "epoch": 0.42032165940177363, + "flos": 60268720452480.0, + "grad_norm": 0.7958876139316734, + "language_loss": 0.6024788, + "learning_rate": 2.603644962174685e-06, + "loss": 0.67882204, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02546692, + "step": 6991, + "time_per_iteration": 3.036398410797119 + }, + { + "auxiliary_loss_clip": 0.06468751, + "auxiliary_loss_mlp": 0.0127226, + "balance_loss_clip": 0.06294238, + "balance_loss_mlp": 0.01257251, + "epoch": 0.4203817826544416, + "flos": 24542121294720.0, + "grad_norm": 1.5524019758451273, + "language_loss": 0.83787376, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.91528386, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15014648, + "step": 6992, + "time_per_iteration": 2.5513317584991455 + }, + { + "auxiliary_loss_clip": 0.06374694, + "auxiliary_loss_mlp": 0.01259872, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.01257284, + "epoch": 0.42044190590710956, + "flos": 58837679297280.0, + "grad_norm": 0.7870388441722128, + "language_loss": 0.65295899, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.72930467, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.02589417, + "step": 6993, + "time_per_iteration": 3.139356851577759 + }, + { + "auxiliary_loss_clip": 0.06475414, + "auxiliary_loss_mlp": 0.01273103, + "balance_loss_clip": 0.06293732, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4205020291597775, + "flos": 16441733934720.0, + "grad_norm": 2.0884817814411307, + "language_loss": 0.83771634, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.91520149, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15576172, + "step": 6994, + "time_per_iteration": 2.5307908058166504 + }, + { + "auxiliary_loss_clip": 0.06461746, + "auxiliary_loss_mlp": 0.01269563, + "balance_loss_clip": 0.06292755, + "balance_loss_mlp": 0.01255544, + "epoch": 0.4205621524124455, + "flos": 18411548843520.0, + "grad_norm": 1.728991128313806, + "language_loss": 0.79243588, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.86974895, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14013672, + "step": 6995, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.06461824, + "auxiliary_loss_mlp": 0.0126885, + "balance_loss_clip": 0.06293637, + "balance_loss_mlp": 0.01255433, + "epoch": 0.4206222756651135, + "flos": 25527133566720.0, + "grad_norm": 1.491511685078805, + "language_loss": 0.80235636, + "learning_rate": 2.60178818232786e-06, + "loss": 0.87966311, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13409424, + "step": 6996, + "time_per_iteration": 2.6613996028900146 + }, + { + "auxiliary_loss_clip": 0.06466329, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06293097, + "balance_loss_mlp": 0.01254466, + "epoch": 0.4206823989177815, + "flos": 15309708474240.0, + "grad_norm": 2.3637588948298998, + "language_loss": 0.76051879, + "learning_rate": 2.601416757842559e-06, + "loss": 0.83786368, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13690186, + "step": 6997, + "time_per_iteration": 2.484876871109009 + }, + { + "auxiliary_loss_clip": 0.06463061, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06288689, + "balance_loss_mlp": 0.01253789, + "epoch": 0.42074252217044944, + "flos": 15558564700800.0, + "grad_norm": 2.0514206793414345, + "language_loss": 0.76478076, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.84209514, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14599609, + "step": 6998, + "time_per_iteration": 2.5640127658843994 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01275488, + "balance_loss_clip": 0.06289443, + "balance_loss_mlp": 0.01260587, + "epoch": 0.4208026454231174, + "flos": 26153941633920.0, + "grad_norm": 1.581279992496262, + "language_loss": 0.76102519, + "learning_rate": 2.60067384046869e-06, + "loss": 0.83844483, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14892578, + "step": 6999, + "time_per_iteration": 2.6406025886535645 + }, + { + "auxiliary_loss_clip": 0.06461642, + "auxiliary_loss_mlp": 0.01267644, + "balance_loss_clip": 0.06291209, + "balance_loss_mlp": 0.01254382, + "epoch": 0.42086276867578537, + "flos": 23556857460480.0, + "grad_norm": 1.988296138175356, + "language_loss": 0.64461291, + "learning_rate": 2.600302347608295e-06, + "loss": 0.72190583, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13244629, + "step": 7000, + "time_per_iteration": 2.6081695556640625 + }, + { + "auxiliary_loss_clip": 0.06469343, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06294516, + "balance_loss_mlp": 0.01256076, + "epoch": 0.42092289192845334, + "flos": 18119199548160.0, + "grad_norm": 1.6363851387704167, + "language_loss": 0.77022576, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.84762329, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14318848, + "step": 7001, + "time_per_iteration": 2.5761475563049316 + }, + { + "auxiliary_loss_clip": 0.06461353, + "auxiliary_loss_mlp": 0.01268364, + "balance_loss_clip": 0.06290751, + "balance_loss_mlp": 0.01254882, + "epoch": 0.4209830151811213, + "flos": 20012006954880.0, + "grad_norm": 1.5030484792833017, + "language_loss": 0.86740428, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.94470143, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13482666, + "step": 7002, + "time_per_iteration": 2.585397958755493 + }, + { + "auxiliary_loss_clip": 0.06461627, + "auxiliary_loss_mlp": 0.01271644, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01258251, + "epoch": 0.42104313843378927, + "flos": 21985050245760.0, + "grad_norm": 2.152971198745627, + "language_loss": 0.68539977, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.76273245, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.1338501, + "step": 7003, + "time_per_iteration": 2.5039963722229004 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01271214, + "balance_loss_clip": 0.06293743, + "balance_loss_mlp": 0.01255747, + "epoch": 0.42110326168645723, + "flos": 25450461480960.0, + "grad_norm": 1.8015075946869743, + "language_loss": 0.77306843, + "learning_rate": 2.598816148672344e-06, + "loss": 0.85047305, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15472412, + "step": 7004, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06462541, + "auxiliary_loss_mlp": 0.01273285, + "balance_loss_clip": 0.06294234, + "balance_loss_mlp": 0.0125873, + "epoch": 0.4211633849391252, + "flos": 17828485407360.0, + "grad_norm": 1.7810886301824922, + "language_loss": 0.68804276, + "learning_rate": 2.59844454213521e-06, + "loss": 0.76540101, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14562988, + "step": 7005, + "time_per_iteration": 3.888760566711426 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01269773, + "balance_loss_clip": 0.0629124, + "balance_loss_mlp": 0.01255593, + "epoch": 0.42122350819179316, + "flos": 16286796535680.0, + "grad_norm": 1.8605985429595449, + "language_loss": 0.72998816, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.80733699, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14178467, + "step": 7006, + "time_per_iteration": 3.991835832595825 + }, + { + "auxiliary_loss_clip": 0.06464688, + "auxiliary_loss_mlp": 0.01266849, + "balance_loss_clip": 0.06289375, + "balance_loss_mlp": 0.01252424, + "epoch": 0.4212836314444611, + "flos": 19651916033280.0, + "grad_norm": 1.623062925912009, + "language_loss": 0.7118417, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.78915709, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14416504, + "step": 7007, + "time_per_iteration": 2.5425753593444824 + }, + { + "auxiliary_loss_clip": 0.06463595, + "auxiliary_loss_mlp": 0.01271642, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01257581, + "epoch": 0.4213437546971291, + "flos": 18374889882240.0, + "grad_norm": 2.097779928402724, + "language_loss": 0.82573175, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.90308416, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 7008, + "time_per_iteration": 2.492260456085205 + }, + { + "auxiliary_loss_clip": 0.0646316, + "auxiliary_loss_mlp": 0.01269434, + "balance_loss_clip": 0.06289843, + "balance_loss_mlp": 0.01255129, + "epoch": 0.42140387794979706, + "flos": 27711116311680.0, + "grad_norm": 1.9580680041192111, + "language_loss": 0.72638381, + "learning_rate": 2.596957889196831e-06, + "loss": 0.80370975, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14318848, + "step": 7009, + "time_per_iteration": 2.6216533184051514 + }, + { + "auxiliary_loss_clip": 0.06466616, + "auxiliary_loss_mlp": 0.0126722, + "balance_loss_clip": 0.06289244, + "balance_loss_mlp": 0.01253338, + "epoch": 0.4214640012024651, + "flos": 28154545718400.0, + "grad_norm": 2.5692415195563543, + "language_loss": 0.66926241, + "learning_rate": 2.596586169335243e-06, + "loss": 0.74660075, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.13873291, + "step": 7010, + "time_per_iteration": 2.606501579284668 + }, + { + "auxiliary_loss_clip": 0.06462754, + "auxiliary_loss_mlp": 0.01271396, + "balance_loss_clip": 0.06290238, + "balance_loss_mlp": 0.01256662, + "epoch": 0.42152412445513304, + "flos": 23002989972480.0, + "grad_norm": 1.6839098151972378, + "language_loss": 0.7266804, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.80402195, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14727783, + "step": 7011, + "time_per_iteration": 4.0488903522491455 + }, + { + "auxiliary_loss_clip": 0.06363396, + "auxiliary_loss_mlp": 0.01255682, + "balance_loss_clip": 0.06285673, + "balance_loss_mlp": 0.01253149, + "epoch": 0.421584247707801, + "flos": 63767855756160.0, + "grad_norm": 0.7737758086067837, + "language_loss": 0.54255652, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.61874723, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.02532959, + "step": 7012, + "time_per_iteration": 3.0473456382751465 + }, + { + "auxiliary_loss_clip": 0.06465481, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06289969, + "balance_loss_mlp": 0.01256656, + "epoch": 0.421644370960469, + "flos": 24321203205120.0, + "grad_norm": 1.3531523641491952, + "language_loss": 0.78821653, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.86559272, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.15472412, + "step": 7013, + "time_per_iteration": 2.5436811447143555 + }, + { + "auxiliary_loss_clip": 0.06463543, + "auxiliary_loss_mlp": 0.0127162, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01256516, + "epoch": 0.42170449421313694, + "flos": 23447425628160.0, + "grad_norm": 1.8634561108800796, + "language_loss": 0.81284738, + "learning_rate": 2.595099063803787e-06, + "loss": 0.89019895, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15100098, + "step": 7014, + "time_per_iteration": 2.6464757919311523 + }, + { + "auxiliary_loss_clip": 0.06460524, + "auxiliary_loss_mlp": 0.01273083, + "balance_loss_clip": 0.06287747, + "balance_loss_mlp": 0.01259225, + "epoch": 0.4217646174658049, + "flos": 23702151640320.0, + "grad_norm": 1.4680948866945018, + "language_loss": 0.77888769, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.85622376, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1385498, + "step": 7015, + "time_per_iteration": 4.043898582458496 + }, + { + "auxiliary_loss_clip": 0.06464352, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06287283, + "balance_loss_mlp": 0.01253394, + "epoch": 0.42182474071847287, + "flos": 24978297323520.0, + "grad_norm": 1.853408702102599, + "language_loss": 0.82096922, + "learning_rate": 2.594355375584368e-06, + "loss": 0.89829755, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15075684, + "step": 7016, + "time_per_iteration": 2.5523900985717773 + }, + { + "auxiliary_loss_clip": 0.06465739, + "auxiliary_loss_mlp": 0.01271643, + "balance_loss_clip": 0.06291386, + "balance_loss_mlp": 0.01256527, + "epoch": 0.42188486397114083, + "flos": 22863230161920.0, + "grad_norm": 2.845700477826224, + "language_loss": 0.6853466, + "learning_rate": 2.593983497660586e-06, + "loss": 0.76272047, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15112305, + "step": 7017, + "time_per_iteration": 2.57027530670166 + }, + { + "auxiliary_loss_clip": 0.0636536, + "auxiliary_loss_mlp": 0.01255401, + "balance_loss_clip": 0.06287346, + "balance_loss_mlp": 0.01252595, + "epoch": 0.4219449872238088, + "flos": 66997072730880.0, + "grad_norm": 0.6666550742113542, + "language_loss": 0.59442866, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.67063624, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 7018, + "time_per_iteration": 3.1860194206237793 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01271161, + "balance_loss_clip": 0.0628873, + "balance_loss_mlp": 0.0125617, + "epoch": 0.42200511047647676, + "flos": 13120400995200.0, + "grad_norm": 1.8819765217055724, + "language_loss": 0.75926054, + "learning_rate": 2.593239674255382e-06, + "loss": 0.83665562, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14990234, + "step": 7019, + "time_per_iteration": 2.542468309402466 + }, + { + "auxiliary_loss_clip": 0.06462015, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.06287961, + "balance_loss_mlp": 0.01257864, + "epoch": 0.42206523372914473, + "flos": 13996400705280.0, + "grad_norm": 1.899626408213008, + "language_loss": 0.69618917, + "learning_rate": 2.592867728802166e-06, + "loss": 0.77354079, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15283203, + "step": 7020, + "time_per_iteration": 2.4884140491485596 + }, + { + "auxiliary_loss_clip": 0.06459437, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.06290746, + "balance_loss_mlp": 0.01258347, + "epoch": 0.4221253569818127, + "flos": 21948391284480.0, + "grad_norm": 1.6760812445081854, + "language_loss": 0.81457055, + "learning_rate": 2.592495760867347e-06, + "loss": 0.89188963, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14135742, + "step": 7021, + "time_per_iteration": 2.60335111618042 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06286098, + "balance_loss_mlp": 0.01253869, + "epoch": 0.42218548023448066, + "flos": 32200001642880.0, + "grad_norm": 1.5750279801473723, + "language_loss": 0.70101392, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.77830255, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14355469, + "step": 7022, + "time_per_iteration": 2.605795383453369 + }, + { + "auxiliary_loss_clip": 0.06450655, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06284072, + "balance_loss_mlp": 0.01258788, + "epoch": 0.4222456034871487, + "flos": 30127043957760.0, + "grad_norm": 1.5974321201389856, + "language_loss": 0.67428911, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.75152111, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13751221, + "step": 7023, + "time_per_iteration": 2.6615898609161377 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06287459, + "balance_loss_mlp": 0.01255508, + "epoch": 0.42230572673981664, + "flos": 22134537129600.0, + "grad_norm": 1.6408413231786074, + "language_loss": 0.69710904, + "learning_rate": 2.591379722314322e-06, + "loss": 0.77437586, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15356445, + "step": 7024, + "time_per_iteration": 2.531874895095825 + }, + { + "auxiliary_loss_clip": 0.06457987, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06283922, + "balance_loss_mlp": 0.01255598, + "epoch": 0.4223658499924846, + "flos": 22061722331520.0, + "grad_norm": 2.1972757713163102, + "language_loss": 0.76880538, + "learning_rate": 2.591007664594147e-06, + "loss": 0.84608328, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14196777, + "step": 7025, + "time_per_iteration": 2.568814754486084 + }, + { + "auxiliary_loss_clip": 0.06457998, + "auxiliary_loss_mlp": 0.01277209, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01263017, + "epoch": 0.4224259732451526, + "flos": 20416681048320.0, + "grad_norm": 1.910881237925828, + "language_loss": 0.80124468, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.87859672, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14208984, + "step": 7026, + "time_per_iteration": 2.4988901615142822 + }, + { + "auxiliary_loss_clip": 0.06353324, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01250106, + "epoch": 0.42248609649782054, + "flos": 62866307750400.0, + "grad_norm": 0.7325438580667073, + "language_loss": 0.62037623, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.69643718, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0266571, + "step": 7027, + "time_per_iteration": 3.230607748031616 + }, + { + "auxiliary_loss_clip": 0.06460012, + "auxiliary_loss_mlp": 0.01272089, + "balance_loss_clip": 0.06290331, + "balance_loss_mlp": 0.01257456, + "epoch": 0.4225462197504885, + "flos": 26257126337280.0, + "grad_norm": 2.572422824646089, + "language_loss": 0.71053827, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.78785932, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14642334, + "step": 7028, + "time_per_iteration": 2.5667781829833984 + }, + { + "auxiliary_loss_clip": 0.06463138, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06289553, + "balance_loss_mlp": 0.01255437, + "epoch": 0.42260634300315647, + "flos": 20528209232640.0, + "grad_norm": 1.948126664005559, + "language_loss": 0.82621461, + "learning_rate": 2.589519209743846e-06, + "loss": 0.90353954, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13928223, + "step": 7029, + "time_per_iteration": 2.5936038494110107 + }, + { + "auxiliary_loss_clip": 0.06468205, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06289516, + "balance_loss_mlp": 0.01258441, + "epoch": 0.42266646625582444, + "flos": 24323676900480.0, + "grad_norm": 1.8377333901506168, + "language_loss": 0.75193119, + "learning_rate": 2.589147040109424e-06, + "loss": 0.82935727, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15966797, + "step": 7030, + "time_per_iteration": 2.6162269115448 + }, + { + "auxiliary_loss_clip": 0.06462294, + "auxiliary_loss_mlp": 0.01267502, + "balance_loss_clip": 0.06287964, + "balance_loss_mlp": 0.01251421, + "epoch": 0.4227265895084924, + "flos": 24210555488640.0, + "grad_norm": 1.9734407814648771, + "language_loss": 0.86909479, + "learning_rate": 2.588774848134486e-06, + "loss": 0.94639277, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1607666, + "step": 7031, + "time_per_iteration": 2.5292763710021973 + }, + { + "auxiliary_loss_clip": 0.06460671, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06286174, + "balance_loss_mlp": 0.01255171, + "epoch": 0.42278671276116037, + "flos": 16915407465600.0, + "grad_norm": 1.893963671956315, + "language_loss": 0.73803562, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.81533462, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.140625, + "step": 7032, + "time_per_iteration": 2.5382707118988037 + }, + { + "auxiliary_loss_clip": 0.06463667, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.06286915, + "balance_loss_mlp": 0.01254874, + "epoch": 0.42284683601382833, + "flos": 25418162931840.0, + "grad_norm": 1.9439146678532522, + "language_loss": 0.70438349, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.78171825, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1496582, + "step": 7033, + "time_per_iteration": 2.5798444747924805 + }, + { + "auxiliary_loss_clip": 0.06464536, + "auxiliary_loss_mlp": 0.01270969, + "balance_loss_clip": 0.06288149, + "balance_loss_mlp": 0.01256282, + "epoch": 0.4229069592664963, + "flos": 23047153873920.0, + "grad_norm": 1.8861418032064503, + "language_loss": 0.90879869, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.98615378, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14685059, + "step": 7034, + "time_per_iteration": 2.5370678901672363 + }, + { + "auxiliary_loss_clip": 0.06455763, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06283915, + "balance_loss_mlp": 0.01256676, + "epoch": 0.42296708251916426, + "flos": 26074586217600.0, + "grad_norm": 1.9962240812191803, + "language_loss": 0.77578306, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.85304844, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14080811, + "step": 7035, + "time_per_iteration": 2.542121648788452 + }, + { + "auxiliary_loss_clip": 0.06464495, + "auxiliary_loss_mlp": 0.01274418, + "balance_loss_clip": 0.06287753, + "balance_loss_mlp": 0.01259863, + "epoch": 0.4230272057718323, + "flos": 19463548055040.0, + "grad_norm": 2.323654021784471, + "language_loss": 0.83016878, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.90755796, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14538574, + "step": 7036, + "time_per_iteration": 2.5446789264678955 + }, + { + "auxiliary_loss_clip": 0.06461224, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06292447, + "balance_loss_mlp": 0.01256859, + "epoch": 0.42308732902450025, + "flos": 22389975901440.0, + "grad_norm": 1.9007003646753964, + "language_loss": 0.70561719, + "learning_rate": 2.58654122792447e-06, + "loss": 0.78293824, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14031982, + "step": 7037, + "time_per_iteration": 2.5331337451934814 + }, + { + "auxiliary_loss_clip": 0.06462964, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06289166, + "balance_loss_mlp": 0.01253923, + "epoch": 0.4231474522771682, + "flos": 21001631201280.0, + "grad_norm": 1.6547666669933128, + "language_loss": 0.77886164, + "learning_rate": 2.586168879961155e-06, + "loss": 0.85618538, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1550293, + "step": 7038, + "time_per_iteration": 2.547067165374756 + }, + { + "auxiliary_loss_clip": 0.06470759, + "auxiliary_loss_mlp": 0.01270751, + "balance_loss_clip": 0.06292742, + "balance_loss_mlp": 0.01255432, + "epoch": 0.4232075755298362, + "flos": 14981161415040.0, + "grad_norm": 2.6561544689274714, + "language_loss": 0.67851424, + "learning_rate": 2.585796509770259e-06, + "loss": 0.75592935, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15301514, + "step": 7039, + "time_per_iteration": 2.5148706436157227 + }, + { + "auxiliary_loss_clip": 0.06471442, + "auxiliary_loss_mlp": 0.01274269, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.01258962, + "epoch": 0.42326769878250414, + "flos": 24539144474880.0, + "grad_norm": 1.5526791387199284, + "language_loss": 0.75859225, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.83604932, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15307617, + "step": 7040, + "time_per_iteration": 2.6170670986175537 + }, + { + "auxiliary_loss_clip": 0.0646336, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06288165, + "balance_loss_mlp": 0.01253199, + "epoch": 0.4233278220351721, + "flos": 26877603421440.0, + "grad_norm": 2.185572961013026, + "language_loss": 0.65619481, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.73350751, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14715576, + "step": 7041, + "time_per_iteration": 2.5701920986175537 + }, + { + "auxiliary_loss_clip": 0.06470653, + "auxiliary_loss_mlp": 0.01271372, + "balance_loss_clip": 0.06294046, + "balance_loss_mlp": 0.01256626, + "epoch": 0.4233879452878401, + "flos": 42824951867520.0, + "grad_norm": 2.182989579985364, + "language_loss": 0.73763824, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.81505847, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14752197, + "step": 7042, + "time_per_iteration": 2.7377729415893555 + }, + { + "auxiliary_loss_clip": 0.06466709, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06294659, + "balance_loss_mlp": 0.01256119, + "epoch": 0.42344806854050804, + "flos": 25236125936640.0, + "grad_norm": 1.357775127981886, + "language_loss": 0.82479644, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.90216863, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14379883, + "step": 7043, + "time_per_iteration": 2.6002635955810547 + }, + { + "auxiliary_loss_clip": 0.06466006, + "auxiliary_loss_mlp": 0.01268509, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01252749, + "epoch": 0.423508191793176, + "flos": 22784587505280.0, + "grad_norm": 2.981661405110402, + "language_loss": 0.65042412, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.1574707, + "step": 7044, + "time_per_iteration": 4.032661437988281 + }, + { + "auxiliary_loss_clip": 0.06473978, + "auxiliary_loss_mlp": 0.01277434, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01261793, + "epoch": 0.42356831504584397, + "flos": 34645376799360.0, + "grad_norm": 1.8091896069955142, + "language_loss": 0.74864423, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.82615834, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15649414, + "step": 7045, + "time_per_iteration": 2.6634554862976074 + }, + { + "auxiliary_loss_clip": 0.06458761, + "auxiliary_loss_mlp": 0.01272071, + "balance_loss_clip": 0.06289783, + "balance_loss_mlp": 0.01258177, + "epoch": 0.42362843829851193, + "flos": 17601487896960.0, + "grad_norm": 2.434331790625752, + "language_loss": 0.8101598, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.88746816, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13885498, + "step": 7046, + "time_per_iteration": 3.8471035957336426 + }, + { + "auxiliary_loss_clip": 0.06470428, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.01255635, + "epoch": 0.4236885615511799, + "flos": 22572390240000.0, + "grad_norm": 1.5654922866483163, + "language_loss": 0.77272886, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.8501339, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14416504, + "step": 7047, + "time_per_iteration": 2.5323123931884766 + }, + { + "auxiliary_loss_clip": 0.06461948, + "auxiliary_loss_mlp": 0.01271728, + "balance_loss_clip": 0.06291857, + "balance_loss_mlp": 0.01258245, + "epoch": 0.42374868480384786, + "flos": 26476493126400.0, + "grad_norm": 1.7230664508561655, + "language_loss": 0.68109751, + "learning_rate": 2.582444180141098e-06, + "loss": 0.75843424, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13482666, + "step": 7048, + "time_per_iteration": 2.5632970333099365 + }, + { + "auxiliary_loss_clip": 0.06464637, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06289657, + "balance_loss_mlp": 0.01253263, + "epoch": 0.4238088080565159, + "flos": 20375493966720.0, + "grad_norm": 1.6594147848364105, + "language_loss": 0.78005636, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.85738766, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.15234375, + "step": 7049, + "time_per_iteration": 2.5366568565368652 + }, + { + "auxiliary_loss_clip": 0.06468852, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06292627, + "balance_loss_mlp": 0.01256067, + "epoch": 0.42386893130918385, + "flos": 21177379140480.0, + "grad_norm": 1.886460992095426, + "language_loss": 0.83185136, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.90924776, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.1472168, + "step": 7050, + "time_per_iteration": 2.5130441188812256 + }, + { + "auxiliary_loss_clip": 0.06460265, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06286017, + "balance_loss_mlp": 0.01255738, + "epoch": 0.4239290545618518, + "flos": 17681346437760.0, + "grad_norm": 2.0965482043088968, + "language_loss": 0.73218369, + "learning_rate": 2.581326338868687e-06, + "loss": 0.80949646, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15283203, + "step": 7051, + "time_per_iteration": 3.92645263671875 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06291503, + "balance_loss_mlp": 0.01254595, + "epoch": 0.4239891778145198, + "flos": 24321077424000.0, + "grad_norm": 1.57175281695923, + "language_loss": 0.86744994, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.94478583, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.1427002, + "step": 7052, + "time_per_iteration": 2.584425210952759 + }, + { + "auxiliary_loss_clip": 0.06467065, + "auxiliary_loss_mlp": 0.01277353, + "balance_loss_clip": 0.06289236, + "balance_loss_mlp": 0.01262559, + "epoch": 0.42404930106718774, + "flos": 20564700485760.0, + "grad_norm": 1.3965954512003949, + "language_loss": 0.72571224, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.80315644, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14794922, + "step": 7053, + "time_per_iteration": 2.5454976558685303 + }, + { + "auxiliary_loss_clip": 0.06462884, + "auxiliary_loss_mlp": 0.01267759, + "balance_loss_clip": 0.06288673, + "balance_loss_mlp": 0.01253251, + "epoch": 0.4241094243198557, + "flos": 22314351991680.0, + "grad_norm": 1.5249079777591508, + "language_loss": 0.82902604, + "learning_rate": 2.580208299200704e-06, + "loss": 0.90633249, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14508057, + "step": 7054, + "time_per_iteration": 4.019419193267822 + }, + { + "auxiliary_loss_clip": 0.06381379, + "auxiliary_loss_mlp": 0.01253973, + "balance_loss_clip": 0.06300146, + "balance_loss_mlp": 0.01250773, + "epoch": 0.4241695475725237, + "flos": 70632445973760.0, + "grad_norm": 0.7904217901105888, + "language_loss": 0.60280955, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.6791631, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03204346, + "step": 7055, + "time_per_iteration": 3.152217388153076 + }, + { + "auxiliary_loss_clip": 0.06467455, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06290264, + "balance_loss_mlp": 0.01252717, + "epoch": 0.42422967082519164, + "flos": 14032640396160.0, + "grad_norm": 2.414100924234879, + "language_loss": 0.77460873, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.85195827, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.14782715, + "step": 7056, + "time_per_iteration": 2.469475746154785 + }, + { + "auxiliary_loss_clip": 0.06476917, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06295634, + "balance_loss_mlp": 0.01259013, + "epoch": 0.4242897940778596, + "flos": 22351975274880.0, + "grad_norm": 2.3823515442172187, + "language_loss": 0.84773225, + "learning_rate": 2.579090061518714e-06, + "loss": 0.92525554, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.1640625, + "step": 7057, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.06472223, + "auxiliary_loss_mlp": 0.01277699, + "balance_loss_clip": 0.06293373, + "balance_loss_mlp": 0.01262202, + "epoch": 0.42434991733052757, + "flos": 22601502334080.0, + "grad_norm": 3.5122040291641583, + "language_loss": 0.83485544, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.91235471, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15490723, + "step": 7058, + "time_per_iteration": 2.4998161792755127 + }, + { + "auxiliary_loss_clip": 0.06459209, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06288499, + "balance_loss_mlp": 0.01256205, + "epoch": 0.42441004058319554, + "flos": 20017667105280.0, + "grad_norm": 2.0122152391379498, + "language_loss": 0.80975556, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.88705409, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14440918, + "step": 7059, + "time_per_iteration": 2.581310987472534 + }, + { + "auxiliary_loss_clip": 0.06467164, + "auxiliary_loss_mlp": 0.0127411, + "balance_loss_clip": 0.06288522, + "balance_loss_mlp": 0.01258053, + "epoch": 0.4244701638358635, + "flos": 11149663691520.0, + "grad_norm": 2.3594129001130963, + "language_loss": 0.70608068, + "learning_rate": 2.57797162620435e-06, + "loss": 0.7834934, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.16064453, + "step": 7060, + "time_per_iteration": 2.485072612762451 + }, + { + "auxiliary_loss_clip": 0.06469266, + "auxiliary_loss_mlp": 0.01274664, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01260317, + "epoch": 0.42453028708853147, + "flos": 23994542862720.0, + "grad_norm": 1.485543893241047, + "language_loss": 0.76297516, + "learning_rate": 2.577598770580562e-06, + "loss": 0.84041446, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14324951, + "step": 7061, + "time_per_iteration": 2.594430685043335 + }, + { + "auxiliary_loss_clip": 0.06469865, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06291063, + "balance_loss_mlp": 0.01256643, + "epoch": 0.42459041034119943, + "flos": 18412345457280.0, + "grad_norm": 1.9822246970542112, + "language_loss": 0.72630441, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.80371881, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14935303, + "step": 7062, + "time_per_iteration": 2.64372181892395 + }, + { + "auxiliary_loss_clip": 0.06460352, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06284757, + "balance_loss_mlp": 0.01262215, + "epoch": 0.42465053359386745, + "flos": 20964049845120.0, + "grad_norm": 2.6818567528078923, + "language_loss": 0.66330427, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.74067968, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.1496582, + "step": 7063, + "time_per_iteration": 2.5413248538970947 + }, + { + "auxiliary_loss_clip": 0.06452604, + "auxiliary_loss_mlp": 0.01267624, + "balance_loss_clip": 0.062814, + "balance_loss_mlp": 0.01254195, + "epoch": 0.4247106568465354, + "flos": 33114001979520.0, + "grad_norm": 1.5147527354116395, + "language_loss": 0.78917265, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.86637491, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13446045, + "step": 7064, + "time_per_iteration": 2.610231876373291 + }, + { + "auxiliary_loss_clip": 0.06469544, + "auxiliary_loss_mlp": 0.01271013, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4247707800992034, + "flos": 20052984401280.0, + "grad_norm": 1.8682780470126852, + "language_loss": 0.75125778, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.82866335, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14733887, + "step": 7065, + "time_per_iteration": 2.583846092224121 + }, + { + "auxiliary_loss_clip": 0.06463289, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06289071, + "balance_loss_mlp": 0.01256971, + "epoch": 0.42483090335187135, + "flos": 22392114180480.0, + "grad_norm": 1.5143179334948575, + "language_loss": 0.72187293, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.79922605, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1505127, + "step": 7066, + "time_per_iteration": 2.5569074153900146 + }, + { + "auxiliary_loss_clip": 0.06467879, + "auxiliary_loss_mlp": 0.01269525, + "balance_loss_clip": 0.06290474, + "balance_loss_mlp": 0.01254231, + "epoch": 0.4248910266045393, + "flos": 21362518736640.0, + "grad_norm": 2.6158792173392484, + "language_loss": 0.79757857, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.87495261, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15289307, + "step": 7067, + "time_per_iteration": 2.5845797061920166 + }, + { + "auxiliary_loss_clip": 0.06384341, + "auxiliary_loss_mlp": 0.01254549, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01250746, + "epoch": 0.4249511498572073, + "flos": 64026942180480.0, + "grad_norm": 1.3506219442036578, + "language_loss": 0.63354319, + "learning_rate": 2.574988168733022e-06, + "loss": 0.70993209, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03796387, + "step": 7068, + "time_per_iteration": 3.082864284515381 + }, + { + "auxiliary_loss_clip": 0.06464778, + "auxiliary_loss_mlp": 0.0127101, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01255155, + "epoch": 0.42501127310987524, + "flos": 19612699522560.0, + "grad_norm": 2.0360912712095875, + "language_loss": 0.72778141, + "learning_rate": 2.574615138284361e-06, + "loss": 0.8051393, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15844727, + "step": 7069, + "time_per_iteration": 2.560899257659912 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.01271316, + "balance_loss_clip": 0.06289013, + "balance_loss_mlp": 0.01255378, + "epoch": 0.4250713963625432, + "flos": 19468160029440.0, + "grad_norm": 2.1627827730841074, + "language_loss": 0.79640651, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.87378043, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15930176, + "step": 7070, + "time_per_iteration": 2.507615327835083 + }, + { + "auxiliary_loss_clip": 0.06461551, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06285524, + "balance_loss_mlp": 0.01255117, + "epoch": 0.4251315196152112, + "flos": 25344719228160.0, + "grad_norm": 1.9437385428250697, + "language_loss": 0.70912981, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7864511, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15454102, + "step": 7071, + "time_per_iteration": 2.5730371475219727 + }, + { + "auxiliary_loss_clip": 0.06465049, + "auxiliary_loss_mlp": 0.01271451, + "balance_loss_clip": 0.06289509, + "balance_loss_mlp": 0.01256896, + "epoch": 0.42519164286787914, + "flos": 26366348534400.0, + "grad_norm": 2.618295142810269, + "language_loss": 0.71212989, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.78949487, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14544678, + "step": 7072, + "time_per_iteration": 2.5560264587402344 + }, + { + "auxiliary_loss_clip": 0.06469329, + "auxiliary_loss_mlp": 0.01270547, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01256182, + "epoch": 0.4252517661205471, + "flos": 26038220745600.0, + "grad_norm": 1.647981639391401, + "language_loss": 0.81448823, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.89188695, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14385986, + "step": 7073, + "time_per_iteration": 2.5955123901367188 + }, + { + "auxiliary_loss_clip": 0.06462769, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06288294, + "balance_loss_mlp": 0.01259204, + "epoch": 0.42531188937321507, + "flos": 12718536013440.0, + "grad_norm": 2.653395632366352, + "language_loss": 0.91860557, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.99596488, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1395874, + "step": 7074, + "time_per_iteration": 2.4894237518310547 + }, + { + "auxiliary_loss_clip": 0.06467288, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06287881, + "balance_loss_mlp": 0.0125827, + "epoch": 0.42537201262588303, + "flos": 22098339365760.0, + "grad_norm": 1.877755960639547, + "language_loss": 0.64814276, + "learning_rate": 2.572376498508805e-06, + "loss": 0.72554648, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14807129, + "step": 7075, + "time_per_iteration": 2.598754644393921 + }, + { + "auxiliary_loss_clip": 0.06455241, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06284718, + "balance_loss_mlp": 0.01255246, + "epoch": 0.42543213587855105, + "flos": 23009824080000.0, + "grad_norm": 2.0883967049140666, + "language_loss": 0.74251705, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.81976461, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1427002, + "step": 7076, + "time_per_iteration": 2.537986993789673 + }, + { + "auxiliary_loss_clip": 0.0646292, + "auxiliary_loss_mlp": 0.01270865, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256334, + "epoch": 0.425492259131219, + "flos": 25089448164480.0, + "grad_norm": 3.3689754116422335, + "language_loss": 0.79212517, + "learning_rate": 2.571630111462766e-06, + "loss": 0.86946297, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14520264, + "step": 7077, + "time_per_iteration": 2.6490280628204346 + }, + { + "auxiliary_loss_clip": 0.06455311, + "auxiliary_loss_mlp": 0.01267846, + "balance_loss_clip": 0.06287791, + "balance_loss_mlp": 0.01254721, + "epoch": 0.425552382383887, + "flos": 22822881621120.0, + "grad_norm": 1.7167135286528112, + "language_loss": 0.7317155, + "learning_rate": 2.571256885418265e-06, + "loss": 0.80894709, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13116455, + "step": 7078, + "time_per_iteration": 2.5729281902313232 + }, + { + "auxiliary_loss_clip": 0.06459501, + "auxiliary_loss_mlp": 0.01269381, + "balance_loss_clip": 0.06290293, + "balance_loss_mlp": 0.01256173, + "epoch": 0.42561250563655495, + "flos": 13558757230080.0, + "grad_norm": 1.6803598980459025, + "language_loss": 0.80183727, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.87912607, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13201904, + "step": 7079, + "time_per_iteration": 2.4937188625335693 + }, + { + "auxiliary_loss_clip": 0.06460771, + "auxiliary_loss_mlp": 0.0127097, + "balance_loss_clip": 0.06287594, + "balance_loss_mlp": 0.01257481, + "epoch": 0.4256726288892229, + "flos": 46989692478720.0, + "grad_norm": 1.4689183555154843, + "language_loss": 0.71987867, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.79719609, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13500977, + "step": 7080, + "time_per_iteration": 2.774247884750366 + }, + { + "auxiliary_loss_clip": 0.06462272, + "auxiliary_loss_mlp": 0.01269683, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01256505, + "epoch": 0.4257327521418909, + "flos": 23593181005440.0, + "grad_norm": 1.9610396393278133, + "language_loss": 0.80520535, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.88252497, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13165283, + "step": 7081, + "time_per_iteration": 2.53387451171875 + }, + { + "auxiliary_loss_clip": 0.06452817, + "auxiliary_loss_mlp": 0.01271536, + "balance_loss_clip": 0.06286353, + "balance_loss_mlp": 0.01257844, + "epoch": 0.42579287539455885, + "flos": 18996079726080.0, + "grad_norm": 1.496926936820616, + "language_loss": 0.81558168, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.89282513, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13702393, + "step": 7082, + "time_per_iteration": 2.50972580909729 + }, + { + "auxiliary_loss_clip": 0.06462308, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06289167, + "balance_loss_mlp": 0.0125745, + "epoch": 0.4258529986472268, + "flos": 25198921923840.0, + "grad_norm": 1.6583429285627758, + "language_loss": 0.70258069, + "learning_rate": 2.569390430547065e-06, + "loss": 0.77992082, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14251709, + "step": 7083, + "time_per_iteration": 2.543390989303589 + }, + { + "auxiliary_loss_clip": 0.06373302, + "auxiliary_loss_mlp": 0.01258345, + "balance_loss_clip": 0.06290752, + "balance_loss_mlp": 0.01254316, + "epoch": 0.4259131218998948, + "flos": 69990277881600.0, + "grad_norm": 0.8555028711944374, + "language_loss": 0.67011017, + "learning_rate": 2.569017074742173e-06, + "loss": 0.74642664, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.0402832, + "step": 7084, + "time_per_iteration": 4.592621803283691 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01259348, + "epoch": 0.42597324515256274, + "flos": 18010899745920.0, + "grad_norm": 6.078178213614668, + "language_loss": 0.78467649, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.86201096, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14135742, + "step": 7085, + "time_per_iteration": 4.053593635559082 + }, + { + "auxiliary_loss_clip": 0.0647409, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.0629435, + "balance_loss_mlp": 0.01262158, + "epoch": 0.4260333684052307, + "flos": 15164204659200.0, + "grad_norm": 2.149155774842141, + "language_loss": 0.7699095, + "learning_rate": 2.568270298414995e-06, + "loss": 0.84742153, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.1496582, + "step": 7086, + "time_per_iteration": 2.480053424835205 + }, + { + "auxiliary_loss_clip": 0.06458418, + "auxiliary_loss_mlp": 0.01275137, + "balance_loss_clip": 0.06286179, + "balance_loss_mlp": 0.01260129, + "epoch": 0.42609349165789867, + "flos": 14944628234880.0, + "grad_norm": 1.8417550415955477, + "language_loss": 0.80286872, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.88020432, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15026855, + "step": 7087, + "time_per_iteration": 2.5487940311431885 + }, + { + "auxiliary_loss_clip": 0.06464538, + "auxiliary_loss_mlp": 0.01271303, + "balance_loss_clip": 0.06291935, + "balance_loss_mlp": 0.01257183, + "epoch": 0.42615361491056664, + "flos": 23738642893440.0, + "grad_norm": 2.1069826106325213, + "language_loss": 0.66537511, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.7427336, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14111328, + "step": 7088, + "time_per_iteration": 2.5807759761810303 + }, + { + "auxiliary_loss_clip": 0.06470972, + "auxiliary_loss_mlp": 0.01274052, + "balance_loss_clip": 0.06293773, + "balance_loss_mlp": 0.01260402, + "epoch": 0.42621373816323466, + "flos": 24943399297920.0, + "grad_norm": 2.133950232933384, + "language_loss": 0.69013214, + "learning_rate": 2.56714997234313e-06, + "loss": 0.76758242, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.13665771, + "step": 7089, + "time_per_iteration": 2.5817432403564453 + }, + { + "auxiliary_loss_clip": 0.06463064, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.0125598, + "epoch": 0.4262738614159026, + "flos": 13558044470400.0, + "grad_norm": 4.212045379455766, + "language_loss": 0.74597216, + "learning_rate": 2.566776487287525e-06, + "loss": 0.82330406, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14141846, + "step": 7090, + "time_per_iteration": 3.9426205158233643 + }, + { + "auxiliary_loss_clip": 0.06464858, + "auxiliary_loss_mlp": 0.01272944, + "balance_loss_clip": 0.06287836, + "balance_loss_mlp": 0.01259211, + "epoch": 0.4263339846685706, + "flos": 29755926224640.0, + "grad_norm": 2.684790824023287, + "language_loss": 0.75386477, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.8312428, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13745117, + "step": 7091, + "time_per_iteration": 2.563892126083374 + }, + { + "auxiliary_loss_clip": 0.0645293, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01257278, + "epoch": 0.42639410792123855, + "flos": 16839406212480.0, + "grad_norm": 1.8445868770478253, + "language_loss": 0.82496071, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.90218395, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.12121582, + "step": 7092, + "time_per_iteration": 2.55583119392395 + }, + { + "auxiliary_loss_clip": 0.06467807, + "auxiliary_loss_mlp": 0.01271484, + "balance_loss_clip": 0.06288138, + "balance_loss_mlp": 0.01257567, + "epoch": 0.4264542311739065, + "flos": 28769991557760.0, + "grad_norm": 1.5226511822280566, + "language_loss": 0.73850381, + "learning_rate": 2.565655903224038e-06, + "loss": 0.81589675, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.13922119, + "step": 7093, + "time_per_iteration": 4.021864414215088 + }, + { + "auxiliary_loss_clip": 0.06460725, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06287876, + "balance_loss_mlp": 0.01254512, + "epoch": 0.4265143544265745, + "flos": 24719881731840.0, + "grad_norm": 2.2430846112789617, + "language_loss": 0.70883787, + "learning_rate": 2.565282332284532e-06, + "loss": 0.78613305, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14300537, + "step": 7094, + "time_per_iteration": 2.5826168060302734 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01268246, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.0125381, + "epoch": 0.42657447767924245, + "flos": 21871467636480.0, + "grad_norm": 1.4959257312535472, + "language_loss": 0.81979394, + "learning_rate": 2.564908739909464e-06, + "loss": 0.89709127, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14428711, + "step": 7095, + "time_per_iteration": 2.5714282989501953 + }, + { + "auxiliary_loss_clip": 0.06464021, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06287175, + "balance_loss_mlp": 0.01255831, + "epoch": 0.4266346009319104, + "flos": 21476604470400.0, + "grad_norm": 2.7630559086257533, + "language_loss": 0.80476701, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.88211161, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1461792, + "step": 7096, + "time_per_iteration": 2.52101731300354 + }, + { + "auxiliary_loss_clip": 0.06471846, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06290311, + "balance_loss_mlp": 0.01253946, + "epoch": 0.4266947241845784, + "flos": 25526295025920.0, + "grad_norm": 2.003429077322888, + "language_loss": 0.65857691, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.73597825, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14331055, + "step": 7097, + "time_per_iteration": 2.6010050773620605 + }, + { + "auxiliary_loss_clip": 0.0645384, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01259601, + "epoch": 0.42675484743724634, + "flos": 26548343602560.0, + "grad_norm": 1.7498935394273216, + "language_loss": 0.75170088, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.82896858, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13317871, + "step": 7098, + "time_per_iteration": 2.5674946308135986 + }, + { + "auxiliary_loss_clip": 0.06458846, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01260033, + "epoch": 0.4268149706899143, + "flos": 23119465547520.0, + "grad_norm": 1.6850998762786562, + "language_loss": 0.75184697, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.82917988, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 7099, + "time_per_iteration": 2.5784735679626465 + }, + { + "auxiliary_loss_clip": 0.06459826, + "auxiliary_loss_mlp": 0.01273278, + "balance_loss_clip": 0.06283994, + "balance_loss_mlp": 0.01259116, + "epoch": 0.4268750939425823, + "flos": 22712401612800.0, + "grad_norm": 2.0765509228592802, + "language_loss": 0.83059096, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.90792197, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14172363, + "step": 7100, + "time_per_iteration": 2.520923614501953 + }, + { + "auxiliary_loss_clip": 0.06459752, + "auxiliary_loss_mlp": 0.01269142, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01255839, + "epoch": 0.42693521719525024, + "flos": 25382007095040.0, + "grad_norm": 1.4351436052366604, + "language_loss": 0.82259512, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8998841, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.13311768, + "step": 7101, + "time_per_iteration": 2.595768451690674 + }, + { + "auxiliary_loss_clip": 0.06466523, + "auxiliary_loss_mlp": 0.01273606, + "balance_loss_clip": 0.06287891, + "balance_loss_mlp": 0.01259099, + "epoch": 0.42699534044791826, + "flos": 18156613196160.0, + "grad_norm": 2.266580923573967, + "language_loss": 0.72800845, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.80540979, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14501953, + "step": 7102, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.06457532, + "auxiliary_loss_mlp": 0.0127168, + "balance_loss_clip": 0.06287985, + "balance_loss_mlp": 0.01257935, + "epoch": 0.4270554637005862, + "flos": 13703422504320.0, + "grad_norm": 2.1781975733094936, + "language_loss": 0.83514953, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.91244167, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13751221, + "step": 7103, + "time_per_iteration": 2.506204128265381 + }, + { + "auxiliary_loss_clip": 0.06465043, + "auxiliary_loss_mlp": 0.01274672, + "balance_loss_clip": 0.0628773, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4271155869532542, + "flos": 17499351369600.0, + "grad_norm": 2.042502996026563, + "language_loss": 0.73773789, + "learning_rate": 2.561545446271294e-06, + "loss": 0.815135, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14916992, + "step": 7104, + "time_per_iteration": 2.5006070137023926 + }, + { + "auxiliary_loss_clip": 0.06459317, + "auxiliary_loss_mlp": 0.01274322, + "balance_loss_clip": 0.0628491, + "balance_loss_mlp": 0.01260494, + "epoch": 0.42717571020592215, + "flos": 32460471659520.0, + "grad_norm": 3.22189729136274, + "language_loss": 0.75052768, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.82786405, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13830566, + "step": 7105, + "time_per_iteration": 2.607759475708008 + }, + { + "auxiliary_loss_clip": 0.06461999, + "auxiliary_loss_mlp": 0.01274519, + "balance_loss_clip": 0.06286199, + "balance_loss_mlp": 0.01261168, + "epoch": 0.4272358334585901, + "flos": 16258606836480.0, + "grad_norm": 17.703344591331568, + "language_loss": 0.77349067, + "learning_rate": 2.560797813088819e-06, + "loss": 0.85085583, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.13354492, + "step": 7106, + "time_per_iteration": 2.4834203720092773 + }, + { + "auxiliary_loss_clip": 0.06461152, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.06287872, + "balance_loss_mlp": 0.01262499, + "epoch": 0.4272959567112581, + "flos": 24205817733120.0, + "grad_norm": 1.9445558892844073, + "language_loss": 0.8013317, + "learning_rate": 2.560423964592229e-06, + "loss": 0.87871039, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14233398, + "step": 7107, + "time_per_iteration": 2.5639657974243164 + }, + { + "auxiliary_loss_clip": 0.06454289, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.06283173, + "balance_loss_mlp": 0.01253424, + "epoch": 0.42735607996392605, + "flos": 27970747787520.0, + "grad_norm": 1.710799907332892, + "language_loss": 0.68469441, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.76191515, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14349365, + "step": 7108, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.06460684, + "auxiliary_loss_mlp": 0.01273244, + "balance_loss_clip": 0.06285615, + "balance_loss_mlp": 0.01258712, + "epoch": 0.427416203216594, + "flos": 20300582816640.0, + "grad_norm": 2.1700047707431342, + "language_loss": 0.72192961, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.79926884, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14520264, + "step": 7109, + "time_per_iteration": 2.5418453216552734 + }, + { + "auxiliary_loss_clip": 0.06462875, + "auxiliary_loss_mlp": 0.01279728, + "balance_loss_clip": 0.06288399, + "balance_loss_mlp": 0.01264159, + "epoch": 0.427476326469262, + "flos": 26951382541440.0, + "grad_norm": 2.7192306397859034, + "language_loss": 0.64651388, + "learning_rate": 2.559302291651174e-06, + "loss": 0.7239399, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15551758, + "step": 7110, + "time_per_iteration": 2.6708264350891113 + }, + { + "auxiliary_loss_clip": 0.06457267, + "auxiliary_loss_mlp": 0.01278945, + "balance_loss_clip": 0.06284395, + "balance_loss_mlp": 0.01264056, + "epoch": 0.42753644972192995, + "flos": 25709967175680.0, + "grad_norm": 2.127603657525877, + "language_loss": 0.76798368, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.84534585, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14880371, + "step": 7111, + "time_per_iteration": 2.678954601287842 + }, + { + "auxiliary_loss_clip": 0.0646024, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01255352, + "epoch": 0.4275965729745979, + "flos": 18772855649280.0, + "grad_norm": 1.9451066993795918, + "language_loss": 0.73479104, + "learning_rate": 2.558554403622845e-06, + "loss": 0.81209064, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.1439209, + "step": 7112, + "time_per_iteration": 2.4913687705993652 + }, + { + "auxiliary_loss_clip": 0.06453889, + "auxiliary_loss_mlp": 0.01274214, + "balance_loss_clip": 0.06283249, + "balance_loss_mlp": 0.01260248, + "epoch": 0.4276566962272659, + "flos": 23770438318080.0, + "grad_norm": 1.6965987454612683, + "language_loss": 0.71646041, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.79374146, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13964844, + "step": 7113, + "time_per_iteration": 2.567722797393799 + }, + { + "auxiliary_loss_clip": 0.06462316, + "auxiliary_loss_mlp": 0.01277106, + "balance_loss_clip": 0.06286302, + "balance_loss_mlp": 0.01262157, + "epoch": 0.42771681947993384, + "flos": 22499156171520.0, + "grad_norm": 1.507728091462329, + "language_loss": 0.61987239, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.69726658, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14953613, + "step": 7114, + "time_per_iteration": 2.5800352096557617 + }, + { + "auxiliary_loss_clip": 0.06466354, + "auxiliary_loss_mlp": 0.01281834, + "balance_loss_clip": 0.06284335, + "balance_loss_mlp": 0.01264895, + "epoch": 0.42777694273260186, + "flos": 25051489464960.0, + "grad_norm": 1.9424022728130763, + "language_loss": 0.64557558, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.72305751, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16943359, + "step": 7115, + "time_per_iteration": 2.625234603881836 + }, + { + "auxiliary_loss_clip": 0.06458592, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01257668, + "epoch": 0.4278370659852698, + "flos": 18667532666880.0, + "grad_norm": 1.4802584121928888, + "language_loss": 0.73841792, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.81572187, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14141846, + "step": 7116, + "time_per_iteration": 2.517512798309326 + }, + { + "auxiliary_loss_clip": 0.06453552, + "auxiliary_loss_mlp": 0.0127651, + "balance_loss_clip": 0.06284202, + "balance_loss_mlp": 0.01262461, + "epoch": 0.4278971892379378, + "flos": 27315666167040.0, + "grad_norm": 1.6819154869474044, + "language_loss": 0.69691694, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.77421755, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14044189, + "step": 7117, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06455907, + "auxiliary_loss_mlp": 0.01274379, + "balance_loss_clip": 0.06285148, + "balance_loss_mlp": 0.0126008, + "epoch": 0.42795731249060576, + "flos": 12892397235840.0, + "grad_norm": 2.190420439429125, + "language_loss": 0.69763142, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.77493429, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14306641, + "step": 7118, + "time_per_iteration": 2.480435609817505 + }, + { + "auxiliary_loss_clip": 0.06457028, + "auxiliary_loss_mlp": 0.01277321, + "balance_loss_clip": 0.06285428, + "balance_loss_mlp": 0.01262109, + "epoch": 0.4280174357432737, + "flos": 33409873146240.0, + "grad_norm": 2.392758427844577, + "language_loss": 0.74691743, + "learning_rate": 2.55593612908444e-06, + "loss": 0.82426095, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.15197754, + "step": 7119, + "time_per_iteration": 2.633418083190918 + }, + { + "auxiliary_loss_clip": 0.06453852, + "auxiliary_loss_mlp": 0.01276265, + "balance_loss_clip": 0.06282485, + "balance_loss_mlp": 0.0126134, + "epoch": 0.4280775589959417, + "flos": 18264871071360.0, + "grad_norm": 2.26485992413173, + "language_loss": 0.75017536, + "learning_rate": 2.555562005426573e-06, + "loss": 0.8274765, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14916992, + "step": 7120, + "time_per_iteration": 2.4857230186462402 + }, + { + "auxiliary_loss_clip": 0.06459665, + "auxiliary_loss_mlp": 0.01279872, + "balance_loss_clip": 0.062869, + "balance_loss_mlp": 0.01265883, + "epoch": 0.42813768224860965, + "flos": 21477820354560.0, + "grad_norm": 1.904077899556691, + "language_loss": 0.77223492, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.8496303, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13989258, + "step": 7121, + "time_per_iteration": 2.547011375427246 + }, + { + "auxiliary_loss_clip": 0.06450777, + "auxiliary_loss_mlp": 0.01281298, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01267255, + "epoch": 0.4281978055012776, + "flos": 15674704859520.0, + "grad_norm": 1.7733631777850345, + "language_loss": 0.85767531, + "learning_rate": 2.554813694924126e-06, + "loss": 0.93499613, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14056396, + "step": 7122, + "time_per_iteration": 2.488633155822754 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01275392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01261022, + "epoch": 0.4282579287539456, + "flos": 17717711909760.0, + "grad_norm": 2.3186837977879886, + "language_loss": 0.8157897, + "learning_rate": 2.554439508107921e-06, + "loss": 0.89309216, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14355469, + "step": 7123, + "time_per_iteration": 3.969069719314575 + }, + { + "auxiliary_loss_clip": 0.06453736, + "auxiliary_loss_mlp": 0.01276304, + "balance_loss_clip": 0.06284729, + "balance_loss_mlp": 0.01262034, + "epoch": 0.42831805200661355, + "flos": 19287171210240.0, + "grad_norm": 1.594767030772038, + "language_loss": 0.80927598, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.88657635, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14257812, + "step": 7124, + "time_per_iteration": 3.901512861251831 + }, + { + "auxiliary_loss_clip": 0.06454194, + "auxiliary_loss_mlp": 0.01273804, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4283781752592815, + "flos": 19798845367680.0, + "grad_norm": 1.7493536594312618, + "language_loss": 0.81056678, + "learning_rate": 2.553691071416498e-06, + "loss": 0.88784677, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.15484619, + "step": 7125, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06453275, + "auxiliary_loss_mlp": 0.0127252, + "balance_loss_clip": 0.06283629, + "balance_loss_mlp": 0.01259467, + "epoch": 0.4284382985119495, + "flos": 16513584410880.0, + "grad_norm": 2.012470201752393, + "language_loss": 0.75256401, + "learning_rate": 2.553316821569659e-06, + "loss": 0.829822, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13037109, + "step": 7126, + "time_per_iteration": 2.550835371017456 + }, + { + "auxiliary_loss_clip": 0.06454661, + "auxiliary_loss_mlp": 0.01269423, + "balance_loss_clip": 0.06280357, + "balance_loss_mlp": 0.01255518, + "epoch": 0.42849842176461744, + "flos": 23337406817280.0, + "grad_norm": 1.7018740006461155, + "language_loss": 0.81619167, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.8934325, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13916016, + "step": 7127, + "time_per_iteration": 2.512833833694458 + }, + { + "auxiliary_loss_clip": 0.06455937, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06282341, + "balance_loss_mlp": 0.01254659, + "epoch": 0.4285585450172854, + "flos": 17280110361600.0, + "grad_norm": 1.7733778395824964, + "language_loss": 0.76877725, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.84603173, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14837646, + "step": 7128, + "time_per_iteration": 2.54837703704834 + }, + { + "auxiliary_loss_clip": 0.06458156, + "auxiliary_loss_mlp": 0.01271641, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01255726, + "epoch": 0.42861866826995343, + "flos": 24286430960640.0, + "grad_norm": 1.8449893243882522, + "language_loss": 0.74647015, + "learning_rate": 2.552193946194937e-06, + "loss": 0.82376814, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15917969, + "step": 7129, + "time_per_iteration": 2.5513017177581787 + }, + { + "auxiliary_loss_clip": 0.06454159, + "auxiliary_loss_mlp": 0.0127295, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.01258949, + "epoch": 0.4286787915226214, + "flos": 24360042372480.0, + "grad_norm": 1.8999084688655365, + "language_loss": 0.7830866, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.86035764, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14007568, + "step": 7130, + "time_per_iteration": 3.9916892051696777 + }, + { + "auxiliary_loss_clip": 0.06456774, + "auxiliary_loss_mlp": 0.01278579, + "balance_loss_clip": 0.06282126, + "balance_loss_mlp": 0.01263618, + "epoch": 0.42873891477528936, + "flos": 15455338070400.0, + "grad_norm": 2.1626861971351263, + "language_loss": 0.73881406, + "learning_rate": 2.551445257891886e-06, + "loss": 0.81616759, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.1496582, + "step": 7131, + "time_per_iteration": 2.504786252975464 + }, + { + "auxiliary_loss_clip": 0.06455156, + "auxiliary_loss_mlp": 0.01273453, + "balance_loss_clip": 0.06282241, + "balance_loss_mlp": 0.01258183, + "epoch": 0.4287990380279573, + "flos": 17645358309120.0, + "grad_norm": 2.0546861067047533, + "language_loss": 0.77884281, + "learning_rate": 2.551070882366973e-06, + "loss": 0.85612893, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15270996, + "step": 7132, + "time_per_iteration": 2.5048811435699463 + }, + { + "auxiliary_loss_clip": 0.06456134, + "auxiliary_loss_mlp": 0.01270516, + "balance_loss_clip": 0.06281912, + "balance_loss_mlp": 0.01254542, + "epoch": 0.4288591612806253, + "flos": 27169701154560.0, + "grad_norm": 1.7726331897563596, + "language_loss": 0.78733218, + "learning_rate": 2.550696485945397e-06, + "loss": 0.86459869, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.1595459, + "step": 7133, + "time_per_iteration": 4.068531036376953 + }, + { + "auxiliary_loss_clip": 0.06450784, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06277733, + "balance_loss_mlp": 0.01254785, + "epoch": 0.42891928453329325, + "flos": 17168540250240.0, + "grad_norm": 1.7118267088696246, + "language_loss": 0.7483775, + "learning_rate": 2.550322068641355e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14068604, + "step": 7134, + "time_per_iteration": 2.504011631011963 + }, + { + "auxiliary_loss_clip": 0.06450233, + "auxiliary_loss_mlp": 0.01272762, + "balance_loss_clip": 0.06279828, + "balance_loss_mlp": 0.0125882, + "epoch": 0.4289794077859612, + "flos": 18192936741120.0, + "grad_norm": 1.9195667435408965, + "language_loss": 0.84458339, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.92181337, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13946533, + "step": 7135, + "time_per_iteration": 2.4924819469451904 + }, + { + "auxiliary_loss_clip": 0.06447092, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279005, + "balance_loss_mlp": 0.01253949, + "epoch": 0.4290395310386292, + "flos": 28264438748160.0, + "grad_norm": 2.116473983113214, + "language_loss": 0.754601, + "learning_rate": 2.549573171442666e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14099121, + "step": 7136, + "time_per_iteration": 2.579450845718384 + }, + { + "auxiliary_loss_clip": 0.06453092, + "auxiliary_loss_mlp": 0.01272367, + "balance_loss_clip": 0.06277236, + "balance_loss_mlp": 0.01257895, + "epoch": 0.42909965429129715, + "flos": 16221528604800.0, + "grad_norm": 1.8728665886520197, + "language_loss": 0.79211873, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.86937326, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14465332, + "step": 7137, + "time_per_iteration": 2.485880136489868 + }, + { + "auxiliary_loss_clip": 0.06452384, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06279657, + "balance_loss_mlp": 0.01257359, + "epoch": 0.4291597775439651, + "flos": 23119633255680.0, + "grad_norm": 1.8713356259191796, + "language_loss": 0.76152903, + "learning_rate": 2.548824190884499e-06, + "loss": 0.83877248, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14605713, + "step": 7138, + "time_per_iteration": 2.5630223751068115 + }, + { + "auxiliary_loss_clip": 0.06367285, + "auxiliary_loss_mlp": 0.01254388, + "balance_loss_clip": 0.06288805, + "balance_loss_mlp": 0.01250711, + "epoch": 0.4292199007966331, + "flos": 67565461703040.0, + "grad_norm": 0.7609122933706777, + "language_loss": 0.5608238, + "learning_rate": 2.548449669381113e-06, + "loss": 0.63704056, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03668213, + "step": 7139, + "time_per_iteration": 3.0345327854156494 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06282055, + "balance_loss_mlp": 0.01256861, + "epoch": 0.42928002404930105, + "flos": 23006008719360.0, + "grad_norm": 1.7405631209015646, + "language_loss": 0.81563902, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.89282477, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13049316, + "step": 7140, + "time_per_iteration": 2.5697882175445557 + }, + { + "auxiliary_loss_clip": 0.06455392, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.01252543, + "epoch": 0.429340147301969, + "flos": 11549432321280.0, + "grad_norm": 1.8011940744465647, + "language_loss": 0.82215559, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.89938176, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14678955, + "step": 7141, + "time_per_iteration": 2.4844813346862793 + }, + { + "auxiliary_loss_clip": 0.0646215, + "auxiliary_loss_mlp": 0.0128237, + "balance_loss_clip": 0.06283965, + "balance_loss_mlp": 0.01266336, + "epoch": 0.42940027055463703, + "flos": 25272030211200.0, + "grad_norm": 2.0081644747821947, + "language_loss": 0.86468136, + "learning_rate": 2.547325980144166e-06, + "loss": 0.94212657, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.16027832, + "step": 7142, + "time_per_iteration": 2.570967674255371 + }, + { + "auxiliary_loss_clip": 0.0645667, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06288485, + "balance_loss_mlp": 0.01255596, + "epoch": 0.429460393807305, + "flos": 23811709253760.0, + "grad_norm": 2.010483035293097, + "language_loss": 0.78394985, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.86120784, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13549805, + "step": 7143, + "time_per_iteration": 2.5245959758758545 + }, + { + "auxiliary_loss_clip": 0.06458203, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257689, + "epoch": 0.42952051705997296, + "flos": 13923502053120.0, + "grad_norm": 1.8646185905931467, + "language_loss": 0.77133417, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.84863412, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14117432, + "step": 7144, + "time_per_iteration": 2.5442261695861816 + }, + { + "auxiliary_loss_clip": 0.0645657, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06283006, + "balance_loss_mlp": 0.0125973, + "epoch": 0.4295806403126409, + "flos": 26767584610560.0, + "grad_norm": 1.5670382727140026, + "language_loss": 0.74293256, + "learning_rate": 2.54620210411532e-06, + "loss": 0.8202396, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14404297, + "step": 7145, + "time_per_iteration": 2.5812947750091553 + }, + { + "auxiliary_loss_clip": 0.06458145, + "auxiliary_loss_mlp": 0.01276391, + "balance_loss_clip": 0.06281675, + "balance_loss_mlp": 0.01261585, + "epoch": 0.4296407635653089, + "flos": 20957760789120.0, + "grad_norm": 2.084760622121642, + "language_loss": 0.79444236, + "learning_rate": 2.545827437329352e-06, + "loss": 0.87178773, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14807129, + "step": 7146, + "time_per_iteration": 2.5411908626556396 + }, + { + "auxiliary_loss_clip": 0.0645076, + "auxiliary_loss_mlp": 0.01276231, + "balance_loss_clip": 0.06280234, + "balance_loss_mlp": 0.01262373, + "epoch": 0.42970088681797686, + "flos": 15857915811840.0, + "grad_norm": 1.9977945232207481, + "language_loss": 0.83012491, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.90739477, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13867188, + "step": 7147, + "time_per_iteration": 2.4752652645111084 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01274227, + "balance_loss_clip": 0.06283284, + "balance_loss_mlp": 0.01258622, + "epoch": 0.4297610100706448, + "flos": 22389179287680.0, + "grad_norm": 1.9494252458685553, + "language_loss": 0.87818855, + "learning_rate": 2.545078041678131e-06, + "loss": 0.95549762, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15612793, + "step": 7148, + "time_per_iteration": 2.5504684448242188 + }, + { + "auxiliary_loss_clip": 0.06459592, + "auxiliary_loss_mlp": 0.0127006, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01255689, + "epoch": 0.4298211333233128, + "flos": 27932705233920.0, + "grad_norm": 1.7901480630114543, + "language_loss": 0.78474885, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.86204541, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14373779, + "step": 7149, + "time_per_iteration": 2.5467026233673096 + }, + { + "auxiliary_loss_clip": 0.06454438, + "auxiliary_loss_mlp": 0.01275691, + "balance_loss_clip": 0.06285315, + "balance_loss_mlp": 0.01261153, + "epoch": 0.42988125657598075, + "flos": 24432479827200.0, + "grad_norm": 1.6909372302648806, + "language_loss": 0.79794931, + "learning_rate": 2.544328563349256e-06, + "loss": 0.87525058, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14550781, + "step": 7150, + "time_per_iteration": 2.5642549991607666 + }, + { + "auxiliary_loss_clip": 0.06463797, + "auxiliary_loss_mlp": 0.01273266, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01256636, + "epoch": 0.4299413798286487, + "flos": 15855400189440.0, + "grad_norm": 1.6104667865383644, + "language_loss": 0.75438166, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16638184, + "step": 7151, + "time_per_iteration": 2.47206711769104 + }, + { + "auxiliary_loss_clip": 0.06463672, + "auxiliary_loss_mlp": 0.01271158, + "balance_loss_clip": 0.06284998, + "balance_loss_mlp": 0.01256179, + "epoch": 0.4300015030813167, + "flos": 22316029073280.0, + "grad_norm": 1.9504143763164294, + "language_loss": 0.70926738, + "learning_rate": 2.543579002456406e-06, + "loss": 0.78661567, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.14984131, + "step": 7152, + "time_per_iteration": 2.541208267211914 + }, + { + "auxiliary_loss_clip": 0.06452823, + "auxiliary_loss_mlp": 0.01271847, + "balance_loss_clip": 0.06279409, + "balance_loss_mlp": 0.01257482, + "epoch": 0.43006162633398465, + "flos": 34906391867520.0, + "grad_norm": 1.81395768481921, + "language_loss": 0.7223562, + "learning_rate": 2.54320419108402e-06, + "loss": 0.79960287, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14361572, + "step": 7153, + "time_per_iteration": 2.6242926120758057 + }, + { + "auxiliary_loss_clip": 0.064519, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06279962, + "balance_loss_mlp": 0.01257018, + "epoch": 0.4301217495866526, + "flos": 15967138008960.0, + "grad_norm": 2.006134184464422, + "language_loss": 0.78977376, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8670066, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14367676, + "step": 7154, + "time_per_iteration": 2.5568442344665527 + }, + { + "auxiliary_loss_clip": 0.06457433, + "auxiliary_loss_mlp": 0.01273105, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01258943, + "epoch": 0.43018187283932063, + "flos": 18776293666560.0, + "grad_norm": 1.5037130128548426, + "language_loss": 0.78947407, + "learning_rate": 2.542454506558389e-06, + "loss": 0.86677945, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14172363, + "step": 7155, + "time_per_iteration": 2.5090463161468506 + }, + { + "auxiliary_loss_clip": 0.06448177, + "auxiliary_loss_mlp": 0.01271989, + "balance_loss_clip": 0.06280203, + "balance_loss_mlp": 0.01258613, + "epoch": 0.4302419960919886, + "flos": 20157007645440.0, + "grad_norm": 4.525310176173048, + "language_loss": 0.89197671, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.96917844, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13397217, + "step": 7156, + "time_per_iteration": 2.5620951652526855 + }, + { + "auxiliary_loss_clip": 0.0645663, + "auxiliary_loss_mlp": 0.01274773, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01259836, + "epoch": 0.43030211934465656, + "flos": 26440001873280.0, + "grad_norm": 2.4796677358200423, + "language_loss": 0.82988536, + "learning_rate": 2.541704739753042e-06, + "loss": 0.90719938, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14929199, + "step": 7157, + "time_per_iteration": 2.5528175830841064 + }, + { + "auxiliary_loss_clip": 0.06457967, + "auxiliary_loss_mlp": 0.01275139, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01258974, + "epoch": 0.43036224259732453, + "flos": 24396114355200.0, + "grad_norm": 1.7333061296854189, + "language_loss": 0.71840358, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.79573464, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16162109, + "step": 7158, + "time_per_iteration": 2.540012836456299 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.01275077, + "balance_loss_clip": 0.06278417, + "balance_loss_mlp": 0.01260355, + "epoch": 0.4304223658499925, + "flos": 17207421344640.0, + "grad_norm": 2.0047997442662684, + "language_loss": 0.82936633, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.9066118, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14709473, + "step": 7159, + "time_per_iteration": 2.550978183746338 + }, + { + "auxiliary_loss_clip": 0.0645431, + "auxiliary_loss_mlp": 0.01270347, + "balance_loss_clip": 0.06281546, + "balance_loss_mlp": 0.01256048, + "epoch": 0.43048248910266046, + "flos": 14908304689920.0, + "grad_norm": 2.57539664943107, + "language_loss": 0.82999021, + "learning_rate": 2.54057993551933e-06, + "loss": 0.90723681, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.1428833, + "step": 7160, + "time_per_iteration": 2.525343894958496 + }, + { + "auxiliary_loss_clip": 0.0645951, + "auxiliary_loss_mlp": 0.01269507, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01252675, + "epoch": 0.4305426123553284, + "flos": 21586245937920.0, + "grad_norm": 3.3699216716451046, + "language_loss": 0.77364504, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.85093522, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16845703, + "step": 7161, + "time_per_iteration": 2.5307719707489014 + }, + { + "auxiliary_loss_clip": 0.06452791, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06280292, + "balance_loss_mlp": 0.01256449, + "epoch": 0.4306027356079964, + "flos": 22607833317120.0, + "grad_norm": 2.044056208596942, + "language_loss": 0.73045391, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.80768597, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13964844, + "step": 7162, + "time_per_iteration": 2.53442645072937 + }, + { + "auxiliary_loss_clip": 0.06358678, + "auxiliary_loss_mlp": 0.01256162, + "balance_loss_clip": 0.06279682, + "balance_loss_mlp": 0.01252738, + "epoch": 0.43066285886066435, + "flos": 70689873548160.0, + "grad_norm": 0.805422068373614, + "language_loss": 0.58694339, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.66309178, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.03433228, + "step": 7163, + "time_per_iteration": 4.420603036880493 + }, + { + "auxiliary_loss_clip": 0.06450315, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06279671, + "balance_loss_mlp": 0.01257298, + "epoch": 0.4307229821133323, + "flos": 26727236069760.0, + "grad_norm": 1.7043821860128514, + "language_loss": 0.79015797, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.86737275, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13842773, + "step": 7164, + "time_per_iteration": 4.077051162719727 + }, + { + "auxiliary_loss_clip": 0.0645581, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06279337, + "balance_loss_mlp": 0.01254222, + "epoch": 0.4307831053660003, + "flos": 26184311539200.0, + "grad_norm": 1.6263476545367235, + "language_loss": 0.68622434, + "learning_rate": 2.538704852009177e-06, + "loss": 0.76347512, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.1505127, + "step": 7165, + "time_per_iteration": 2.5447044372558594 + }, + { + "auxiliary_loss_clip": 0.06454252, + "auxiliary_loss_mlp": 0.01269461, + "balance_loss_clip": 0.06280573, + "balance_loss_mlp": 0.01254733, + "epoch": 0.43084322861866825, + "flos": 18915298790400.0, + "grad_norm": 2.036386887615401, + "language_loss": 0.75601453, + "learning_rate": 2.538329773967034e-06, + "loss": 0.83325171, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14758301, + "step": 7166, + "time_per_iteration": 2.5380423069000244 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06278174, + "balance_loss_mlp": 0.0125401, + "epoch": 0.4309033518713362, + "flos": 26440211508480.0, + "grad_norm": 1.6055464610704053, + "language_loss": 0.72472453, + "learning_rate": 2.537954675511372e-06, + "loss": 0.80187303, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13415527, + "step": 7167, + "time_per_iteration": 2.581911563873291 + }, + { + "auxiliary_loss_clip": 0.06445278, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01253398, + "epoch": 0.43096347512400424, + "flos": 21219362835840.0, + "grad_norm": 1.5535022771303773, + "language_loss": 0.78678393, + "learning_rate": 2.537579556656414e-06, + "loss": 0.86391199, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14135742, + "step": 7168, + "time_per_iteration": 2.5395426750183105 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.0127075, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01257095, + "epoch": 0.4310235983766722, + "flos": 16544918638080.0, + "grad_norm": 2.3704233546720936, + "language_loss": 0.82314277, + "learning_rate": 2.537204417416387e-06, + "loss": 0.90034759, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13647461, + "step": 7169, + "time_per_iteration": 3.8934504985809326 + }, + { + "auxiliary_loss_clip": 0.06353073, + "auxiliary_loss_mlp": 0.01255187, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01251897, + "epoch": 0.43108372162934017, + "flos": 64794893650560.0, + "grad_norm": 0.6586067859139012, + "language_loss": 0.60826671, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6843493, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03295898, + "step": 7170, + "time_per_iteration": 3.303295612335205 + }, + { + "auxiliary_loss_clip": 0.06446448, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01253841, + "epoch": 0.43114384488200813, + "flos": 13449241543680.0, + "grad_norm": 1.7965809828184895, + "language_loss": 0.76463991, + "learning_rate": 2.536454077838021e-06, + "loss": 0.84178072, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13787842, + "step": 7171, + "time_per_iteration": 2.4991650581359863 + }, + { + "auxiliary_loss_clip": 0.06446211, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01253592, + "epoch": 0.4312039681346761, + "flos": 26293911079680.0, + "grad_norm": 1.4736819236139371, + "language_loss": 0.77570975, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.8528471, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13934326, + "step": 7172, + "time_per_iteration": 2.540095567703247 + }, + { + "auxiliary_loss_clip": 0.06448045, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256449, + "epoch": 0.43126409138734406, + "flos": 20383040833920.0, + "grad_norm": 1.8735364024745536, + "language_loss": 0.76837397, + "learning_rate": 2.535703656890086e-06, + "loss": 0.84556675, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14776611, + "step": 7173, + "time_per_iteration": 3.998828887939453 + }, + { + "auxiliary_loss_clip": 0.06449778, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06280752, + "balance_loss_mlp": 0.0125529, + "epoch": 0.431324214640012, + "flos": 22128918906240.0, + "grad_norm": 1.4124937065278635, + "language_loss": 0.76940411, + "learning_rate": 2.5353284159381e-06, + "loss": 0.84659261, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13800049, + "step": 7174, + "time_per_iteration": 2.510742425918579 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 0.06275856, + "balance_loss_mlp": 0.01256477, + "epoch": 0.43138433789268, + "flos": 15236306697600.0, + "grad_norm": 1.9136821796322663, + "language_loss": 0.82178259, + "learning_rate": 2.534953154686407e-06, + "loss": 0.89898002, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.15185547, + "step": 7175, + "time_per_iteration": 2.5317423343658447 + }, + { + "auxiliary_loss_clip": 0.06456869, + "auxiliary_loss_mlp": 0.01274036, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01256935, + "epoch": 0.43144446114534796, + "flos": 18156151998720.0, + "grad_norm": 2.207412358761708, + "language_loss": 0.74869847, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.82600749, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.17095947, + "step": 7176, + "time_per_iteration": 2.4871389865875244 + }, + { + "auxiliary_loss_clip": 0.0645103, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01255565, + "epoch": 0.4315045843980159, + "flos": 22936506157440.0, + "grad_norm": 1.949576719813971, + "language_loss": 0.73992217, + "learning_rate": 2.534202571340819e-06, + "loss": 0.81713092, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14294434, + "step": 7177, + "time_per_iteration": 2.5317373275756836 + }, + { + "auxiliary_loss_clip": 0.06461225, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06277613, + "balance_loss_mlp": 0.01253667, + "epoch": 0.4315647076506839, + "flos": 22133321245440.0, + "grad_norm": 1.7707547745548928, + "language_loss": 0.81576592, + "learning_rate": 2.533827249275387e-06, + "loss": 0.89307833, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16357422, + "step": 7178, + "time_per_iteration": 2.5210797786712646 + }, + { + "auxiliary_loss_clip": 0.06445872, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01257962, + "epoch": 0.43162483090335185, + "flos": 26878567743360.0, + "grad_norm": 1.4959775860860902, + "language_loss": 0.84818423, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.92535609, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13360596, + "step": 7179, + "time_per_iteration": 2.6229355335235596 + }, + { + "auxiliary_loss_clip": 0.06446353, + "auxiliary_loss_mlp": 0.01270616, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01256096, + "epoch": 0.4316849541560198, + "flos": 13917464559360.0, + "grad_norm": 1.6356598233983888, + "language_loss": 0.75595218, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.83312184, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7180, + "time_per_iteration": 2.4882874488830566 + }, + { + "auxiliary_loss_clip": 0.06450133, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01251023, + "epoch": 0.4317450774086878, + "flos": 16440685758720.0, + "grad_norm": 1.8060434620212955, + "language_loss": 0.81820869, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.89537263, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15252686, + "step": 7181, + "time_per_iteration": 2.534747838973999 + }, + { + "auxiliary_loss_clip": 0.0644898, + "auxiliary_loss_mlp": 0.0127112, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256189, + "epoch": 0.4318052006613558, + "flos": 20560675489920.0, + "grad_norm": 1.632078496987146, + "language_loss": 0.88980561, + "learning_rate": 2.532325758728165e-06, + "loss": 0.96700662, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14923096, + "step": 7182, + "time_per_iteration": 2.493427038192749 + }, + { + "auxiliary_loss_clip": 0.06446697, + "auxiliary_loss_mlp": 0.01267064, + "balance_loss_clip": 0.06278539, + "balance_loss_mlp": 0.01254052, + "epoch": 0.43186532391402377, + "flos": 22826613127680.0, + "grad_norm": 1.9212724157627075, + "language_loss": 0.75858486, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.83572245, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13012695, + "step": 7183, + "time_per_iteration": 2.552116870880127 + }, + { + "auxiliary_loss_clip": 0.06451686, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06278371, + "balance_loss_mlp": 0.01253923, + "epoch": 0.43192544716669173, + "flos": 25563624819840.0, + "grad_norm": 1.5103875784905794, + "language_loss": 0.77652711, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.85371935, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13604736, + "step": 7184, + "time_per_iteration": 2.5299277305603027 + }, + { + "auxiliary_loss_clip": 0.06444119, + "auxiliary_loss_mlp": 0.01269203, + "balance_loss_clip": 0.06279948, + "balance_loss_mlp": 0.01255494, + "epoch": 0.4319855704193597, + "flos": 30962317783680.0, + "grad_norm": 1.4924548432613554, + "language_loss": 0.73502755, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.81216079, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.13720703, + "step": 7185, + "time_per_iteration": 2.5939247608184814 + }, + { + "auxiliary_loss_clip": 0.06455707, + "auxiliary_loss_mlp": 0.01271443, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.0125684, + "epoch": 0.43204569367202766, + "flos": 24244824608640.0, + "grad_norm": 2.4112385113933015, + "language_loss": 0.75683951, + "learning_rate": 2.530823945207421e-06, + "loss": 0.83411103, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14611816, + "step": 7186, + "time_per_iteration": 2.543679714202881 + }, + { + "auxiliary_loss_clip": 0.06451818, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06278853, + "balance_loss_mlp": 0.01259068, + "epoch": 0.43210581692469563, + "flos": 18413058216960.0, + "grad_norm": 2.2976206703160065, + "language_loss": 0.76516449, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.84241354, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14038086, + "step": 7187, + "time_per_iteration": 2.530064105987549 + }, + { + "auxiliary_loss_clip": 0.06368419, + "auxiliary_loss_mlp": 0.01252589, + "balance_loss_clip": 0.06291005, + "balance_loss_mlp": 0.01249776, + "epoch": 0.4321659401773636, + "flos": 49851718133760.0, + "grad_norm": 0.8382360401327144, + "language_loss": 0.68072379, + "learning_rate": 2.530072917616714e-06, + "loss": 0.75693387, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02812195, + "step": 7188, + "time_per_iteration": 3.1670610904693604 + }, + { + "auxiliary_loss_clip": 0.06446176, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43222606343003156, + "flos": 17134229203200.0, + "grad_norm": 1.9056972558163987, + "language_loss": 0.7844317, + "learning_rate": 2.529697373663614e-06, + "loss": 0.86159372, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13110352, + "step": 7189, + "time_per_iteration": 2.491743564605713 + }, + { + "auxiliary_loss_clip": 0.06457567, + "auxiliary_loss_mlp": 0.01270927, + "balance_loss_clip": 0.06278813, + "balance_loss_mlp": 0.01255906, + "epoch": 0.4322861866826995, + "flos": 22756984784640.0, + "grad_norm": 1.8601510823080152, + "language_loss": 0.72126836, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.79855329, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15020752, + "step": 7190, + "time_per_iteration": 2.5745973587036133 + }, + { + "auxiliary_loss_clip": 0.06452946, + "auxiliary_loss_mlp": 0.01274284, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.0125992, + "epoch": 0.4323463099353675, + "flos": 27899400435840.0, + "grad_norm": 1.5852812804273753, + "language_loss": 0.79949737, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.87676966, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14355469, + "step": 7191, + "time_per_iteration": 2.5719873905181885 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01255694, + "epoch": 0.43240643318803546, + "flos": 21620892401280.0, + "grad_norm": 3.0880415359088467, + "language_loss": 0.75279927, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.82998139, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14056396, + "step": 7192, + "time_per_iteration": 2.536587715148926 + }, + { + "auxiliary_loss_clip": 0.0644784, + "auxiliary_loss_mlp": 0.01276118, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01260883, + "epoch": 0.4324665564407034, + "flos": 17562774510720.0, + "grad_norm": 2.069328799544239, + "language_loss": 0.79199994, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.86923951, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15216064, + "step": 7193, + "time_per_iteration": 2.483978033065796 + }, + { + "auxiliary_loss_clip": 0.06449077, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.01263212, + "epoch": 0.4325266796933714, + "flos": 18407775409920.0, + "grad_norm": 2.329186427032778, + "language_loss": 0.76053572, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.83780271, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14404297, + "step": 7194, + "time_per_iteration": 2.5057263374328613 + }, + { + "auxiliary_loss_clip": 0.06451394, + "auxiliary_loss_mlp": 0.01275378, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01260847, + "epoch": 0.4325868029460394, + "flos": 22571342064000.0, + "grad_norm": 1.9582306658700896, + "language_loss": 0.60073519, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.67800295, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14532471, + "step": 7195, + "time_per_iteration": 2.5116991996765137 + }, + { + "auxiliary_loss_clip": 0.06458029, + "auxiliary_loss_mlp": 0.01275051, + "balance_loss_clip": 0.06281463, + "balance_loss_mlp": 0.01259989, + "epoch": 0.43264692619870737, + "flos": 14609834046720.0, + "grad_norm": 1.968403141706004, + "language_loss": 0.65685856, + "learning_rate": 2.527068004376515e-06, + "loss": 0.73418939, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1506958, + "step": 7196, + "time_per_iteration": 2.5037827491760254 + }, + { + "auxiliary_loss_clip": 0.06456476, + "auxiliary_loss_mlp": 0.01272338, + "balance_loss_clip": 0.06280259, + "balance_loss_mlp": 0.01257151, + "epoch": 0.43270704945137534, + "flos": 21507184010880.0, + "grad_norm": 2.17558250449299, + "language_loss": 0.72638965, + "learning_rate": 2.526692300132797e-06, + "loss": 0.8036778, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15197754, + "step": 7197, + "time_per_iteration": 2.4931299686431885 + }, + { + "auxiliary_loss_clip": 0.0645181, + "auxiliary_loss_mlp": 0.01280731, + "balance_loss_clip": 0.06284913, + "balance_loss_mlp": 0.01265627, + "epoch": 0.4327671727040433, + "flos": 25162975722240.0, + "grad_norm": 1.6800922175899422, + "language_loss": 0.72821289, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.8055383, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15100098, + "step": 7198, + "time_per_iteration": 2.574894428253174 + }, + { + "auxiliary_loss_clip": 0.06448364, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01254969, + "epoch": 0.43282729595671127, + "flos": 25454192987520.0, + "grad_norm": 1.3407856907116962, + "language_loss": 0.8128798, + "learning_rate": 2.525940831742934e-06, + "loss": 0.89005339, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14013672, + "step": 7199, + "time_per_iteration": 2.5314407348632812 + }, + { + "auxiliary_loss_clip": 0.06450363, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.01255918, + "epoch": 0.43288741920937923, + "flos": 24131661269760.0, + "grad_norm": 2.374744791798318, + "language_loss": 0.68757379, + "learning_rate": 2.525565067625286e-06, + "loss": 0.76477665, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14013672, + "step": 7200, + "time_per_iteration": 2.5569095611572266 + }, + { + "auxiliary_loss_clip": 0.06449814, + "auxiliary_loss_mlp": 0.01269719, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01254925, + "epoch": 0.4329475424620472, + "flos": 19210415270400.0, + "grad_norm": 1.7756006077325563, + "language_loss": 0.87039292, + "learning_rate": 2.525189283578157e-06, + "loss": 0.94758821, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14807129, + "step": 7201, + "time_per_iteration": 2.4946835041046143 + }, + { + "auxiliary_loss_clip": 0.06464264, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06283499, + "balance_loss_mlp": 0.01255016, + "epoch": 0.43300766571471516, + "flos": 22645037329920.0, + "grad_norm": 5.903168179153311, + "language_loss": 0.64564252, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.72300375, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.16845703, + "step": 7202, + "time_per_iteration": 2.5667803287506104 + }, + { + "auxiliary_loss_clip": 0.06448028, + "auxiliary_loss_mlp": 0.01268297, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4330677889673831, + "flos": 22126570992000.0, + "grad_norm": 2.072135817395126, + "language_loss": 0.8230809, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.90024418, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13470459, + "step": 7203, + "time_per_iteration": 5.375681161880493 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01268927, + "balance_loss_clip": 0.06282033, + "balance_loss_mlp": 0.01254169, + "epoch": 0.4331279122200511, + "flos": 23228184620160.0, + "grad_norm": 2.3968905297379024, + "language_loss": 0.81134045, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.88861251, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14764404, + "step": 7204, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06450962, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06281083, + "balance_loss_mlp": 0.0125691, + "epoch": 0.43318803547271906, + "flos": 18265625758080.0, + "grad_norm": 2.088854485199162, + "language_loss": 0.7413221, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.81853694, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.13604736, + "step": 7205, + "time_per_iteration": 2.519554376602173 + }, + { + "auxiliary_loss_clip": 0.0644919, + "auxiliary_loss_mlp": 0.01273515, + "balance_loss_clip": 0.06284859, + "balance_loss_mlp": 0.01259908, + "epoch": 0.433248158725387, + "flos": 27425936540160.0, + "grad_norm": 1.5872196628882773, + "language_loss": 0.75603741, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.83326447, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13598633, + "step": 7206, + "time_per_iteration": 2.5732641220092773 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01254728, + "epoch": 0.433308281978055, + "flos": 23224075770240.0, + "grad_norm": 1.828436296505125, + "language_loss": 0.78923273, + "learning_rate": 2.522934161574342e-06, + "loss": 0.86642796, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1496582, + "step": 7207, + "time_per_iteration": 2.6846628189086914 + }, + { + "auxiliary_loss_clip": 0.06456017, + "auxiliary_loss_mlp": 0.01270448, + "balance_loss_clip": 0.06279423, + "balance_loss_mlp": 0.0125513, + "epoch": 0.433368405230723, + "flos": 15857999665920.0, + "grad_norm": 2.196810095173743, + "language_loss": 0.81095958, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.8882243, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15307617, + "step": 7208, + "time_per_iteration": 2.4724419116973877 + }, + { + "auxiliary_loss_clip": 0.0645436, + "auxiliary_loss_mlp": 0.01269383, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.0125481, + "epoch": 0.433428528483391, + "flos": 19032109781760.0, + "grad_norm": 2.1243132825557107, + "language_loss": 0.71321076, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.79044819, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14581299, + "step": 7209, + "time_per_iteration": 3.9143481254577637 + }, + { + "auxiliary_loss_clip": 0.06450495, + "auxiliary_loss_mlp": 0.01271038, + "balance_loss_clip": 0.06281973, + "balance_loss_mlp": 0.01255517, + "epoch": 0.43348865173605894, + "flos": 24725290320000.0, + "grad_norm": 1.4388803928851785, + "language_loss": 0.8148647, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.89208007, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15515137, + "step": 7210, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06451392, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06281275, + "balance_loss_mlp": 0.01261045, + "epoch": 0.4335487749887269, + "flos": 22097165408640.0, + "grad_norm": 1.8576931130518815, + "language_loss": 0.82474005, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.90199542, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13110352, + "step": 7211, + "time_per_iteration": 2.491514205932617 + }, + { + "auxiliary_loss_clip": 0.06452142, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06280628, + "balance_loss_mlp": 0.01258362, + "epoch": 0.43360889824139487, + "flos": 22389556631040.0, + "grad_norm": 12.106558391415842, + "language_loss": 0.7536357, + "learning_rate": 2.521054347790029e-06, + "loss": 0.83087522, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13452148, + "step": 7212, + "time_per_iteration": 2.551093816757202 + }, + { + "auxiliary_loss_clip": 0.06452519, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06284005, + "balance_loss_mlp": 0.01259517, + "epoch": 0.43366902149406283, + "flos": 17533746270720.0, + "grad_norm": 1.8081714291238689, + "language_loss": 0.77247733, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.84972358, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1260376, + "step": 7213, + "time_per_iteration": 3.8823790550231934 + }, + { + "auxiliary_loss_clip": 0.06452443, + "auxiliary_loss_mlp": 0.01274704, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01261245, + "epoch": 0.4337291447467308, + "flos": 19028126712960.0, + "grad_norm": 1.4293111519880635, + "language_loss": 0.65090191, + "learning_rate": 2.520302283867471e-06, + "loss": 0.72817338, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13446045, + "step": 7214, + "time_per_iteration": 2.512341260910034 + }, + { + "auxiliary_loss_clip": 0.0644484, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01255319, + "epoch": 0.43378926799939876, + "flos": 27241216214400.0, + "grad_norm": 1.6847650033402397, + "language_loss": 0.7180531, + "learning_rate": 2.519926222304191e-06, + "loss": 0.79518223, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.12750244, + "step": 7215, + "time_per_iteration": 2.5413544178009033 + }, + { + "auxiliary_loss_clip": 0.06451561, + "auxiliary_loss_mlp": 0.01271937, + "balance_loss_clip": 0.06284516, + "balance_loss_mlp": 0.01258365, + "epoch": 0.43384939125206673, + "flos": 15966592957440.0, + "grad_norm": 1.7641597528508168, + "language_loss": 0.75291193, + "learning_rate": 2.519550141025255e-06, + "loss": 0.83014691, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13574219, + "step": 7216, + "time_per_iteration": 2.539677143096924 + }, + { + "auxiliary_loss_clip": 0.06459753, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06280532, + "balance_loss_mlp": 0.01254256, + "epoch": 0.4339095145047347, + "flos": 21798736692480.0, + "grad_norm": 2.367070732862923, + "language_loss": 0.7623983, + "learning_rate": 2.519174040044927e-06, + "loss": 0.8396852, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14685059, + "step": 7217, + "time_per_iteration": 2.491522789001465 + }, + { + "auxiliary_loss_clip": 0.06451164, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.0628095, + "balance_loss_mlp": 0.01254389, + "epoch": 0.43396963775740266, + "flos": 14215054734720.0, + "grad_norm": 2.758270274773255, + "language_loss": 0.74231893, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.81950986, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13531494, + "step": 7218, + "time_per_iteration": 2.5123910903930664 + }, + { + "auxiliary_loss_clip": 0.06450492, + "auxiliary_loss_mlp": 0.01270563, + "balance_loss_clip": 0.06277994, + "balance_loss_mlp": 0.01256443, + "epoch": 0.4340297610100706, + "flos": 19725150101760.0, + "grad_norm": 1.5975368135070402, + "language_loss": 0.69353253, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.77074307, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14117432, + "step": 7219, + "time_per_iteration": 2.502150297164917 + }, + { + "auxiliary_loss_clip": 0.06450121, + "auxiliary_loss_mlp": 0.01273865, + "balance_loss_clip": 0.06280973, + "balance_loss_mlp": 0.01259482, + "epoch": 0.4340898842627386, + "flos": 18959588472960.0, + "grad_norm": 2.696483499139917, + "language_loss": 0.77797616, + "learning_rate": 2.518045619038202e-06, + "loss": 0.85521603, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1439209, + "step": 7220, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.06449743, + "auxiliary_loss_mlp": 0.01270897, + "balance_loss_clip": 0.06280366, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4341500075154066, + "flos": 22024895662080.0, + "grad_norm": 2.140213938529436, + "language_loss": 0.69858402, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.77579045, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13562012, + "step": 7221, + "time_per_iteration": 2.556913137435913 + }, + { + "auxiliary_loss_clip": 0.06448823, + "auxiliary_loss_mlp": 0.01267968, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01254188, + "epoch": 0.4342101307680746, + "flos": 23588527104000.0, + "grad_norm": 1.6725579163220456, + "language_loss": 0.65062654, + "learning_rate": 2.51729324012157e-06, + "loss": 0.72779441, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13793945, + "step": 7222, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0644563, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06277044, + "balance_loss_mlp": 0.01254912, + "epoch": 0.43427025402074254, + "flos": 17973821514240.0, + "grad_norm": 2.158287657708821, + "language_loss": 0.73335516, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.81050307, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14257812, + "step": 7223, + "time_per_iteration": 2.5124166011810303 + }, + { + "auxiliary_loss_clip": 0.06448437, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06275682, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4343303772734105, + "flos": 26293575663360.0, + "grad_norm": 1.9810355285503365, + "language_loss": 0.94283241, + "learning_rate": 2.516540782741694e-06, + "loss": 1.02002597, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13458252, + "step": 7224, + "time_per_iteration": 2.5581512451171875 + }, + { + "auxiliary_loss_clip": 0.06445128, + "auxiliary_loss_mlp": 0.01270275, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01257383, + "epoch": 0.43439050052607847, + "flos": 26841279876480.0, + "grad_norm": 2.0217716161026624, + "language_loss": 0.61832798, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.69548196, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.12890625, + "step": 7225, + "time_per_iteration": 2.5797905921936035 + }, + { + "auxiliary_loss_clip": 0.06447432, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01255895, + "epoch": 0.43445062377874644, + "flos": 21404083161600.0, + "grad_norm": 2.452465231522654, + "language_loss": 0.77966076, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.8568306, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13653564, + "step": 7226, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01273195, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01260553, + "epoch": 0.4345107470314144, + "flos": 19908151418880.0, + "grad_norm": 1.6845072318289191, + "language_loss": 0.84942114, + "learning_rate": 2.515411949802964e-06, + "loss": 0.92659688, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12652588, + "step": 7227, + "time_per_iteration": 2.525317430496216 + }, + { + "auxiliary_loss_clip": 0.06449986, + "auxiliary_loss_mlp": 0.01270041, + "balance_loss_clip": 0.06281552, + "balance_loss_mlp": 0.0125601, + "epoch": 0.43457087028408237, + "flos": 26439876092160.0, + "grad_norm": 2.0880007397823714, + "language_loss": 0.77098775, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.84818804, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14025879, + "step": 7228, + "time_per_iteration": 2.5491206645965576 + }, + { + "auxiliary_loss_clip": 0.06447831, + "auxiliary_loss_mlp": 0.01269154, + "balance_loss_clip": 0.06281967, + "balance_loss_mlp": 0.0125486, + "epoch": 0.43463099353675033, + "flos": 31876947025920.0, + "grad_norm": 1.527689344505128, + "language_loss": 0.80533445, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.88250422, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14294434, + "step": 7229, + "time_per_iteration": 2.6139633655548096 + }, + { + "auxiliary_loss_clip": 0.06448658, + "auxiliary_loss_mlp": 0.01272316, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01258208, + "epoch": 0.4346911167894183, + "flos": 24578109423360.0, + "grad_norm": 1.897670481755329, + "language_loss": 0.8187139, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.89592373, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14117432, + "step": 7230, + "time_per_iteration": 2.535597085952759 + }, + { + "auxiliary_loss_clip": 0.06454149, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06280425, + "balance_loss_mlp": 0.01258849, + "epoch": 0.43475124004208626, + "flos": 17096102795520.0, + "grad_norm": 2.6326033188165012, + "language_loss": 0.77091682, + "learning_rate": 2.513906565661973e-06, + "loss": 0.84818828, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14135742, + "step": 7231, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.064488, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01262162, + "epoch": 0.4348113632947542, + "flos": 26111874084480.0, + "grad_norm": 2.1662461953899044, + "language_loss": 0.69288278, + "learning_rate": 2.513530170872575e-06, + "loss": 0.77011836, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1260376, + "step": 7232, + "time_per_iteration": 2.547469139099121 + }, + { + "auxiliary_loss_clip": 0.0645097, + "auxiliary_loss_mlp": 0.01271517, + "balance_loss_clip": 0.06279375, + "balance_loss_mlp": 0.01256431, + "epoch": 0.4348714865474222, + "flos": 34208446083840.0, + "grad_norm": 2.030594980717477, + "language_loss": 0.72046328, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.79768813, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15075684, + "step": 7233, + "time_per_iteration": 2.633953332901001 + }, + { + "auxiliary_loss_clip": 0.06453332, + "auxiliary_loss_mlp": 0.01271348, + "balance_loss_clip": 0.06279553, + "balance_loss_mlp": 0.01257466, + "epoch": 0.43493160980009016, + "flos": 31545045803520.0, + "grad_norm": 1.5667863682634524, + "language_loss": 0.75517476, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.83242154, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.13885498, + "step": 7234, + "time_per_iteration": 2.592467784881592 + }, + { + "auxiliary_loss_clip": 0.06464201, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286918, + "balance_loss_mlp": 0.01258003, + "epoch": 0.4349917330527582, + "flos": 24068238128640.0, + "grad_norm": 2.6345915143615284, + "language_loss": 0.5890404, + "learning_rate": 2.512400869722782e-06, + "loss": 0.6664077, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14520264, + "step": 7235, + "time_per_iteration": 2.5652947425842285 + }, + { + "auxiliary_loss_clip": 0.06449015, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01257754, + "epoch": 0.43505185630542614, + "flos": 30527315712000.0, + "grad_norm": 1.3439257210534017, + "language_loss": 0.77555895, + "learning_rate": 2.512024397126566e-06, + "loss": 0.85276687, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14019775, + "step": 7236, + "time_per_iteration": 2.600897789001465 + }, + { + "auxiliary_loss_clip": 0.06450135, + "auxiliary_loss_mlp": 0.01275561, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01260833, + "epoch": 0.4351119795580941, + "flos": 15739427738880.0, + "grad_norm": 1.5753739577535406, + "language_loss": 0.81058431, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.88784134, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.14733887, + "step": 7237, + "time_per_iteration": 2.515153169631958 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627768, + "balance_loss_mlp": 0.0125607, + "epoch": 0.4351721028107621, + "flos": 18737328718080.0, + "grad_norm": 1.5657016421471992, + "language_loss": 0.63616467, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.71332717, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14129639, + "step": 7238, + "time_per_iteration": 2.4845099449157715 + }, + { + "auxiliary_loss_clip": 0.06448185, + "auxiliary_loss_mlp": 0.01273501, + "balance_loss_clip": 0.06281941, + "balance_loss_mlp": 0.01260162, + "epoch": 0.43523222606343004, + "flos": 25233652241280.0, + "grad_norm": 1.9152472058436172, + "language_loss": 0.85898602, + "learning_rate": 2.510894862898928e-06, + "loss": 0.93620288, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 7239, + "time_per_iteration": 2.579202175140381 + }, + { + "auxiliary_loss_clip": 0.06452584, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06283215, + "balance_loss_mlp": 0.01253987, + "epoch": 0.435292349316098, + "flos": 22715504213760.0, + "grad_norm": 1.439066736410537, + "language_loss": 0.72456282, + "learning_rate": 2.510518312724309e-06, + "loss": 0.80176651, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13793945, + "step": 7240, + "time_per_iteration": 2.5192179679870605 + }, + { + "auxiliary_loss_clip": 0.06454788, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06282151, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43535247256876597, + "flos": 25783033536000.0, + "grad_norm": 2.0220617163145485, + "language_loss": 0.81900156, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.89625818, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1394043, + "step": 7241, + "time_per_iteration": 2.5792059898376465 + }, + { + "auxiliary_loss_clip": 0.06460294, + "auxiliary_loss_mlp": 0.01275581, + "balance_loss_clip": 0.0628238, + "balance_loss_mlp": 0.01260346, + "epoch": 0.43541259582143393, + "flos": 17533578562560.0, + "grad_norm": 2.581589278543144, + "language_loss": 0.79383838, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8711971, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15246582, + "step": 7242, + "time_per_iteration": 3.918156623840332 + }, + { + "auxiliary_loss_clip": 0.06452459, + "auxiliary_loss_mlp": 0.01271144, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01257405, + "epoch": 0.4354727190741019, + "flos": 15200612058240.0, + "grad_norm": 2.430343835688426, + "language_loss": 0.69088292, + "learning_rate": 2.509388546104138e-06, + "loss": 0.76811898, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.13745117, + "step": 7243, + "time_per_iteration": 3.900606632232666 + }, + { + "auxiliary_loss_clip": 0.06444837, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258655, + "epoch": 0.43553284232676986, + "flos": 16654015054080.0, + "grad_norm": 1.5901355562967736, + "language_loss": 0.81475091, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.89191759, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1317749, + "step": 7244, + "time_per_iteration": 2.581033229827881 + }, + { + "auxiliary_loss_clip": 0.06446069, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.01255596, + "epoch": 0.43559296557943783, + "flos": 23407035160320.0, + "grad_norm": 1.5978807757182665, + "language_loss": 0.73241115, + "learning_rate": 2.508635271753234e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.12554932, + "step": 7245, + "time_per_iteration": 2.5589826107025146 + }, + { + "auxiliary_loss_clip": 0.06452223, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06282671, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4356530888321058, + "flos": 22425628613760.0, + "grad_norm": 1.6720109050482812, + "language_loss": 0.77539527, + "learning_rate": 2.508258605639389e-06, + "loss": 0.85261637, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7246, + "time_per_iteration": 2.593538999557495 + }, + { + "auxiliary_loss_clip": 0.06448724, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06280839, + "balance_loss_mlp": 0.01254033, + "epoch": 0.43571321208477376, + "flos": 21622527555840.0, + "grad_norm": 3.3071750834647426, + "language_loss": 0.86156344, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.93872631, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13531494, + "step": 7247, + "time_per_iteration": 2.5369882583618164 + }, + { + "auxiliary_loss_clip": 0.06446265, + "auxiliary_loss_mlp": 0.01269788, + "balance_loss_clip": 0.06277846, + "balance_loss_mlp": 0.01257194, + "epoch": 0.4357733353374418, + "flos": 23994081665280.0, + "grad_norm": 1.7467086672612386, + "language_loss": 0.73132598, + "learning_rate": 2.507505215606333e-06, + "loss": 0.80848658, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.12597656, + "step": 7248, + "time_per_iteration": 3.9830687046051025 + }, + { + "auxiliary_loss_clip": 0.06447548, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06279291, + "balance_loss_mlp": 0.01254022, + "epoch": 0.43583345859010975, + "flos": 25271736721920.0, + "grad_norm": 1.509350817375945, + "language_loss": 0.87227005, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.94941938, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13378906, + "step": 7249, + "time_per_iteration": 2.565516948699951 + }, + { + "auxiliary_loss_clip": 0.06451611, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01254223, + "epoch": 0.4358935818427777, + "flos": 23703115962240.0, + "grad_norm": 1.8925784396827436, + "language_loss": 0.8199448, + "learning_rate": 2.506751748594683e-06, + "loss": 0.89714003, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13690186, + "step": 7250, + "time_per_iteration": 2.5410354137420654 + }, + { + "auxiliary_loss_clip": 0.06454265, + "auxiliary_loss_mlp": 0.01273165, + "balance_loss_clip": 0.06283678, + "balance_loss_mlp": 0.01258901, + "epoch": 0.4359537050954457, + "flos": 29540416723200.0, + "grad_norm": 2.0613712873147723, + "language_loss": 0.85409963, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.93137395, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14251709, + "step": 7251, + "time_per_iteration": 2.5893919467926025 + }, + { + "auxiliary_loss_clip": 0.06448197, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06280132, + "balance_loss_mlp": 0.01257431, + "epoch": 0.43601382834811364, + "flos": 22717935982080.0, + "grad_norm": 1.9454057009257966, + "language_loss": 0.69792974, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.77511865, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13262939, + "step": 7252, + "time_per_iteration": 2.518423080444336 + }, + { + "auxiliary_loss_clip": 0.06442783, + "auxiliary_loss_mlp": 0.01269502, + "balance_loss_clip": 0.06278728, + "balance_loss_mlp": 0.01256336, + "epoch": 0.4360739516007816, + "flos": 19104714944640.0, + "grad_norm": 1.67696041016681, + "language_loss": 0.83826983, + "learning_rate": 2.505621403992348e-06, + "loss": 0.91539264, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13146973, + "step": 7253, + "time_per_iteration": 3.929287910461426 + }, + { + "auxiliary_loss_clip": 0.06446494, + "auxiliary_loss_mlp": 0.01271781, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01257095, + "epoch": 0.43613407485344957, + "flos": 23411185937280.0, + "grad_norm": 1.865330471105, + "language_loss": 0.7061553, + "learning_rate": 2.505244584092757e-06, + "loss": 0.78333807, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14678955, + "step": 7254, + "time_per_iteration": 2.5348615646362305 + }, + { + "auxiliary_loss_clip": 0.06446688, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.0628084, + "balance_loss_mlp": 0.01257249, + "epoch": 0.43619419810611754, + "flos": 22644366497280.0, + "grad_norm": 1.8869772682878516, + "language_loss": 0.81010306, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.88727921, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13671875, + "step": 7255, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.06450298, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06279971, + "balance_loss_mlp": 0.01254772, + "epoch": 0.4362543213587855, + "flos": 20054200285440.0, + "grad_norm": 1.8086691858124306, + "language_loss": 0.78106731, + "learning_rate": 2.504490886831089e-06, + "loss": 0.85824955, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13165283, + "step": 7256, + "time_per_iteration": 2.5364508628845215 + }, + { + "auxiliary_loss_clip": 0.06446915, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01256122, + "epoch": 0.43631444461145347, + "flos": 21367759616640.0, + "grad_norm": 1.5279282177598472, + "language_loss": 0.75952047, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.83668512, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13452148, + "step": 7257, + "time_per_iteration": 2.5156846046447754 + }, + { + "auxiliary_loss_clip": 0.06452259, + "auxiliary_loss_mlp": 0.01269452, + "balance_loss_clip": 0.06281701, + "balance_loss_mlp": 0.01255123, + "epoch": 0.43637456786412143, + "flos": 22424999708160.0, + "grad_norm": 1.7230532534800784, + "language_loss": 0.73248196, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.80969918, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14324951, + "step": 7258, + "time_per_iteration": 2.6132447719573975 + }, + { + "auxiliary_loss_clip": 0.06453618, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06282197, + "balance_loss_mlp": 0.01253725, + "epoch": 0.4364346911167894, + "flos": 28556452627200.0, + "grad_norm": 1.8100021880336497, + "language_loss": 0.77633202, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.85353959, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13415527, + "step": 7259, + "time_per_iteration": 2.589134931564331 + }, + { + "auxiliary_loss_clip": 0.06393245, + "auxiliary_loss_mlp": 0.01278627, + "balance_loss_clip": 0.0631365, + "balance_loss_mlp": 0.01275647, + "epoch": 0.43649481436945736, + "flos": 62678149407360.0, + "grad_norm": 0.7458705100033151, + "language_loss": 0.56939262, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.64611137, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.02978516, + "step": 7260, + "time_per_iteration": 3.11572265625 + }, + { + "auxiliary_loss_clip": 0.06454421, + "auxiliary_loss_mlp": 0.01272288, + "balance_loss_clip": 0.06285764, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4365549376221254, + "flos": 30600088583040.0, + "grad_norm": 1.806363539403124, + "language_loss": 0.71915948, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.79642659, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14794922, + "step": 7261, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.06453972, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06284794, + "balance_loss_mlp": 0.0125836, + "epoch": 0.43661506087479335, + "flos": 17171684778240.0, + "grad_norm": 2.033659544742114, + "language_loss": 0.69274759, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.77000701, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13604736, + "step": 7262, + "time_per_iteration": 2.556318521499634 + }, + { + "auxiliary_loss_clip": 0.0644339, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01253345, + "epoch": 0.4366751841274613, + "flos": 22052875726080.0, + "grad_norm": 1.6437752521732585, + "language_loss": 0.80115777, + "learning_rate": 2.501852344559726e-06, + "loss": 0.87825286, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.12780762, + "step": 7263, + "time_per_iteration": 2.509807825088501 + }, + { + "auxiliary_loss_clip": 0.06448945, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06281485, + "balance_loss_mlp": 0.01254076, + "epoch": 0.4367353073801293, + "flos": 16002748794240.0, + "grad_norm": 1.6772415302555446, + "language_loss": 0.76036841, + "learning_rate": 2.50147533371401e-06, + "loss": 0.83753204, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13354492, + "step": 7264, + "time_per_iteration": 2.523973226547241 + }, + { + "auxiliary_loss_clip": 0.06444526, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01253997, + "epoch": 0.43679543063279724, + "flos": 38226760485120.0, + "grad_norm": 2.1479145935669615, + "language_loss": 0.61845875, + "learning_rate": 2.501098303852298e-06, + "loss": 0.69558173, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13787842, + "step": 7265, + "time_per_iteration": 2.6696202754974365 + }, + { + "auxiliary_loss_clip": 0.06447139, + "auxiliary_loss_mlp": 0.01269097, + "balance_loss_clip": 0.06282498, + "balance_loss_mlp": 0.01256211, + "epoch": 0.4368555538854652, + "flos": 15198306071040.0, + "grad_norm": 1.934873925186605, + "language_loss": 0.73721504, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.81437743, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12896729, + "step": 7266, + "time_per_iteration": 2.5559945106506348 + }, + { + "auxiliary_loss_clip": 0.0644975, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06282988, + "balance_loss_mlp": 0.01260432, + "epoch": 0.4369156771381332, + "flos": 23074630813440.0, + "grad_norm": 2.1253877681457904, + "language_loss": 0.82184762, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.899077, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12762451, + "step": 7267, + "time_per_iteration": 2.534639358520508 + }, + { + "auxiliary_loss_clip": 0.06444408, + "auxiliary_loss_mlp": 0.01269536, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256459, + "epoch": 0.43697580039080114, + "flos": 23447886825600.0, + "grad_norm": 2.09966668439896, + "language_loss": 0.75195235, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.82909179, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13085938, + "step": 7268, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06451406, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125451, + "epoch": 0.4370359236434691, + "flos": 18520519478400.0, + "grad_norm": 3.050341004743464, + "language_loss": 0.79660171, + "learning_rate": 2.499589994531454e-06, + "loss": 0.87380207, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14099121, + "step": 7269, + "time_per_iteration": 2.516211986541748 + }, + { + "auxiliary_loss_clip": 0.06446489, + "auxiliary_loss_mlp": 0.01273185, + "balance_loss_clip": 0.06281964, + "balance_loss_mlp": 0.01260174, + "epoch": 0.43709604689613707, + "flos": 23229316650240.0, + "grad_norm": 1.8886828014681587, + "language_loss": 0.75057715, + "learning_rate": 2.499212869804237e-06, + "loss": 0.82777393, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13024902, + "step": 7270, + "time_per_iteration": 2.5755550861358643 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01268284, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01255064, + "epoch": 0.43715617014880503, + "flos": 23810199880320.0, + "grad_norm": 1.808972971243201, + "language_loss": 0.79453981, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.87169278, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13220215, + "step": 7271, + "time_per_iteration": 2.564471960067749 + }, + { + "auxiliary_loss_clip": 0.06369642, + "auxiliary_loss_mlp": 0.01258814, + "balance_loss_clip": 0.0629034, + "balance_loss_mlp": 0.01255858, + "epoch": 0.437216293401473, + "flos": 61961824851840.0, + "grad_norm": 0.6886560925106296, + "language_loss": 0.54733157, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.62361616, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.02954102, + "step": 7272, + "time_per_iteration": 3.208707332611084 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01270794, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01256757, + "epoch": 0.43727641665414096, + "flos": 21988907533440.0, + "grad_norm": 1.571184799437717, + "language_loss": 0.70994467, + "learning_rate": 2.498081382098581e-06, + "loss": 0.78716844, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14031982, + "step": 7273, + "time_per_iteration": 2.540081262588501 + }, + { + "auxiliary_loss_clip": 0.06448624, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06279367, + "balance_loss_mlp": 0.01255271, + "epoch": 0.437336539906809, + "flos": 39540277889280.0, + "grad_norm": 1.8107596290780341, + "language_loss": 0.7551834, + "learning_rate": 2.497704181736367e-06, + "loss": 0.83236134, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13903809, + "step": 7274, + "time_per_iteration": 2.6836495399475098 + }, + { + "auxiliary_loss_clip": 0.06441884, + "auxiliary_loss_mlp": 0.01265059, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01252703, + "epoch": 0.43739666315947695, + "flos": 17462902043520.0, + "grad_norm": 1.9085211858375455, + "language_loss": 0.80314881, + "learning_rate": 2.49732696250116e-06, + "loss": 0.88021827, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.12353516, + "step": 7275, + "time_per_iteration": 2.5408823490142822 + }, + { + "auxiliary_loss_clip": 0.06450746, + "auxiliary_loss_mlp": 0.01272848, + "balance_loss_clip": 0.06284586, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4374567864121449, + "flos": 16363678256640.0, + "grad_norm": 1.98644372860744, + "language_loss": 0.81298435, + "learning_rate": 2.496949724407266e-06, + "loss": 0.89022022, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13092041, + "step": 7276, + "time_per_iteration": 2.4871010780334473 + }, + { + "auxiliary_loss_clip": 0.06454313, + "auxiliary_loss_mlp": 0.01266955, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01253013, + "epoch": 0.4375169096648129, + "flos": 30594721921920.0, + "grad_norm": 1.9320579241517422, + "language_loss": 0.73048055, + "learning_rate": 2.496572467468988e-06, + "loss": 0.8076933, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1394043, + "step": 7277, + "time_per_iteration": 2.6151673793792725 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01272648, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01258939, + "epoch": 0.43757703291748085, + "flos": 30563555402880.0, + "grad_norm": 1.9557335242574223, + "language_loss": 0.72527206, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.80245006, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13696289, + "step": 7278, + "time_per_iteration": 2.583293914794922 + }, + { + "auxiliary_loss_clip": 0.06440841, + "auxiliary_loss_mlp": 0.01270709, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.01258371, + "epoch": 0.4376371561701488, + "flos": 21403747745280.0, + "grad_norm": 1.4778175335443475, + "language_loss": 0.65870327, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.73581874, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12329102, + "step": 7279, + "time_per_iteration": 2.5419130325317383 + }, + { + "auxiliary_loss_clip": 0.06451775, + "auxiliary_loss_mlp": 0.01270137, + "balance_loss_clip": 0.06279162, + "balance_loss_mlp": 0.01256559, + "epoch": 0.4376972794228168, + "flos": 23411144010240.0, + "grad_norm": 1.7454635588007905, + "language_loss": 0.8264519, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.90367103, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13568115, + "step": 7280, + "time_per_iteration": 2.5270493030548096 + }, + { + "auxiliary_loss_clip": 0.06438784, + "auxiliary_loss_mlp": 0.01272842, + "balance_loss_clip": 0.06277376, + "balance_loss_mlp": 0.01259848, + "epoch": 0.43775740267548474, + "flos": 22899511779840.0, + "grad_norm": 1.6085189920631162, + "language_loss": 0.7756325, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.85274875, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13000488, + "step": 7281, + "time_per_iteration": 2.614102602005005 + }, + { + "auxiliary_loss_clip": 0.0644282, + "auxiliary_loss_mlp": 0.01275956, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01263028, + "epoch": 0.4378175259281527, + "flos": 23301041345280.0, + "grad_norm": 1.8125010794319167, + "language_loss": 0.7622053, + "learning_rate": 2.494685900612569e-06, + "loss": 0.83939308, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12915039, + "step": 7282, + "time_per_iteration": 3.9149930477142334 + }, + { + "auxiliary_loss_clip": 0.06446523, + "auxiliary_loss_mlp": 0.01267087, + "balance_loss_clip": 0.06279582, + "balance_loss_mlp": 0.01254438, + "epoch": 0.43787764918082067, + "flos": 23883433948800.0, + "grad_norm": 2.0076194716834874, + "language_loss": 0.85396934, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.93110549, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12652588, + "step": 7283, + "time_per_iteration": 3.9656553268432617 + }, + { + "auxiliary_loss_clip": 0.0644891, + "auxiliary_loss_mlp": 0.01268213, + "balance_loss_clip": 0.06279234, + "balance_loss_mlp": 0.01254999, + "epoch": 0.43793777243348864, + "flos": 23995004060160.0, + "grad_norm": 1.8602515290448327, + "language_loss": 0.8091675, + "learning_rate": 2.49393114246007e-06, + "loss": 0.88633871, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13214111, + "step": 7284, + "time_per_iteration": 2.566521167755127 + }, + { + "auxiliary_loss_clip": 0.06443676, + "auxiliary_loss_mlp": 0.0127107, + "balance_loss_clip": 0.06278057, + "balance_loss_mlp": 0.01258774, + "epoch": 0.4379978956861566, + "flos": 18629909383680.0, + "grad_norm": 1.7731724137458924, + "language_loss": 0.80635571, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8835032, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.12310791, + "step": 7285, + "time_per_iteration": 2.5004618167877197 + }, + { + "auxiliary_loss_clip": 0.0643899, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06274976, + "balance_loss_mlp": 0.01256642, + "epoch": 0.43805801893882457, + "flos": 21987901284480.0, + "grad_norm": 1.9005617879541583, + "language_loss": 0.75070119, + "learning_rate": 2.493176309387897e-06, + "loss": 0.82778776, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13031006, + "step": 7286, + "time_per_iteration": 2.5617265701293945 + }, + { + "auxiliary_loss_clip": 0.0644343, + "auxiliary_loss_mlp": 0.01269982, + "balance_loss_clip": 0.06274993, + "balance_loss_mlp": 0.01257239, + "epoch": 0.43811814219149253, + "flos": 26400114529920.0, + "grad_norm": 2.124374396883661, + "language_loss": 0.73769003, + "learning_rate": 2.492798864792712e-06, + "loss": 0.81482422, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.12738037, + "step": 7287, + "time_per_iteration": 2.5709421634674072 + }, + { + "auxiliary_loss_clip": 0.06442735, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06276426, + "balance_loss_mlp": 0.01259115, + "epoch": 0.43817826544416055, + "flos": 17499015953280.0, + "grad_norm": 1.6607447345750057, + "language_loss": 0.82538438, + "learning_rate": 2.492421401510545e-06, + "loss": 0.90254092, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13812256, + "step": 7288, + "time_per_iteration": 3.92202091217041 + }, + { + "auxiliary_loss_clip": 0.06447385, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06276591, + "balance_loss_mlp": 0.01254888, + "epoch": 0.4382383886968285, + "flos": 21587629530240.0, + "grad_norm": 1.4460149141548964, + "language_loss": 0.84252048, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.9196828, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1395874, + "step": 7289, + "time_per_iteration": 2.557433843612671 + }, + { + "auxiliary_loss_clip": 0.06446871, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06274465, + "balance_loss_mlp": 0.01254912, + "epoch": 0.4382985119494965, + "flos": 27930441173760.0, + "grad_norm": 2.36337419111835, + "language_loss": 0.78573066, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.86287904, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13067627, + "step": 7290, + "time_per_iteration": 2.5970215797424316 + }, + { + "auxiliary_loss_clip": 0.06439934, + "auxiliary_loss_mlp": 0.01272143, + "balance_loss_clip": 0.06275328, + "balance_loss_mlp": 0.0125903, + "epoch": 0.43835863520216445, + "flos": 24943860495360.0, + "grad_norm": 1.8528017599911322, + "language_loss": 0.7800144, + "learning_rate": 2.491288899685288e-06, + "loss": 0.85713518, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13110352, + "step": 7291, + "time_per_iteration": 2.5944950580596924 + }, + { + "auxiliary_loss_clip": 0.06443708, + "auxiliary_loss_mlp": 0.01274453, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0126106, + "epoch": 0.4384187584548324, + "flos": 33518634145920.0, + "grad_norm": 1.8972630881774872, + "language_loss": 0.64874315, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.72592473, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13391113, + "step": 7292, + "time_per_iteration": 2.628173351287842 + }, + { + "auxiliary_loss_clip": 0.06447129, + "auxiliary_loss_mlp": 0.01269671, + "balance_loss_clip": 0.06278794, + "balance_loss_mlp": 0.01256653, + "epoch": 0.4384788817075004, + "flos": 23957800047360.0, + "grad_norm": 1.5925770854238166, + "language_loss": 0.74671286, + "learning_rate": 2.49053380529597e-06, + "loss": 0.82388091, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13031006, + "step": 7293, + "time_per_iteration": 3.9379074573516846 + }, + { + "auxiliary_loss_clip": 0.06446324, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06279649, + "balance_loss_mlp": 0.0125668, + "epoch": 0.43853900496016834, + "flos": 19104463382400.0, + "grad_norm": 4.9627482836353165, + "language_loss": 0.7920171, + "learning_rate": 2.490156230192516e-06, + "loss": 0.86918819, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14099121, + "step": 7294, + "time_per_iteration": 2.4718902111053467 + }, + { + "auxiliary_loss_clip": 0.06450905, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06283231, + "balance_loss_mlp": 0.01256252, + "epoch": 0.4385991282128363, + "flos": 13230503660160.0, + "grad_norm": 1.631074893492929, + "language_loss": 0.73162925, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.80883634, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13574219, + "step": 7295, + "time_per_iteration": 2.531641721725464 + }, + { + "auxiliary_loss_clip": 0.06452312, + "auxiliary_loss_mlp": 0.01270937, + "balance_loss_clip": 0.06283045, + "balance_loss_mlp": 0.01256298, + "epoch": 0.4386592514655043, + "flos": 14325199326720.0, + "grad_norm": 2.435451861079371, + "language_loss": 0.75030828, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.8275407, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14648438, + "step": 7296, + "time_per_iteration": 2.4799978733062744 + }, + { + "auxiliary_loss_clip": 0.06443385, + "auxiliary_loss_mlp": 0.01270746, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01257598, + "epoch": 0.43871937471817224, + "flos": 22791128123520.0, + "grad_norm": 1.513671798105688, + "language_loss": 0.69379568, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.77093697, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13146973, + "step": 7297, + "time_per_iteration": 2.5378599166870117 + }, + { + "auxiliary_loss_clip": 0.06447895, + "auxiliary_loss_mlp": 0.01268794, + "balance_loss_clip": 0.06281355, + "balance_loss_mlp": 0.01255878, + "epoch": 0.4387794979708402, + "flos": 28079466860160.0, + "grad_norm": 1.3753147611046208, + "language_loss": 0.70496702, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.78213394, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.12902832, + "step": 7298, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06442846, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258023, + "epoch": 0.43883962122350817, + "flos": 26256665139840.0, + "grad_norm": 1.5271246100670304, + "language_loss": 0.72762883, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12976074, + "step": 7299, + "time_per_iteration": 2.567258834838867 + }, + { + "auxiliary_loss_clip": 0.06449576, + "auxiliary_loss_mlp": 0.012749, + "balance_loss_clip": 0.06281091, + "balance_loss_mlp": 0.01260142, + "epoch": 0.43889974447617613, + "flos": 25890662505600.0, + "grad_norm": 1.7549107290593968, + "language_loss": 0.76878119, + "learning_rate": 2.487890389750719e-06, + "loss": 0.84602594, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14758301, + "step": 7300, + "time_per_iteration": 2.541740655899048 + }, + { + "auxiliary_loss_clip": 0.06448291, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254346, + "epoch": 0.43895986772884416, + "flos": 25053711598080.0, + "grad_norm": 2.544712476821277, + "language_loss": 0.71268392, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.78984845, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13824463, + "step": 7301, + "time_per_iteration": 2.547846794128418 + }, + { + "auxiliary_loss_clip": 0.06445279, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06277898, + "balance_loss_mlp": 0.01254434, + "epoch": 0.4390199909815121, + "flos": 26001729492480.0, + "grad_norm": 4.607507625532986, + "language_loss": 0.71274817, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.78989553, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.15026855, + "step": 7302, + "time_per_iteration": 2.531633138656616 + }, + { + "auxiliary_loss_clip": 0.06444067, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06280646, + "balance_loss_mlp": 0.01254618, + "epoch": 0.4390801142341801, + "flos": 29029790741760.0, + "grad_norm": 1.545722029471357, + "language_loss": 0.82388735, + "learning_rate": 2.486757219574983e-06, + "loss": 0.90100312, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12908936, + "step": 7303, + "time_per_iteration": 2.6841824054718018 + }, + { + "auxiliary_loss_clip": 0.06456171, + "auxiliary_loss_mlp": 0.01271253, + "balance_loss_clip": 0.06284264, + "balance_loss_mlp": 0.01256649, + "epoch": 0.43914023748684805, + "flos": 33447077159040.0, + "grad_norm": 2.3091286506484034, + "language_loss": 0.69152826, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.76880252, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.1461792, + "step": 7304, + "time_per_iteration": 2.6893982887268066 + }, + { + "auxiliary_loss_clip": 0.06439492, + "auxiliary_loss_mlp": 0.01269095, + "balance_loss_clip": 0.06278437, + "balance_loss_mlp": 0.01256507, + "epoch": 0.439200360739516, + "flos": 34540347306240.0, + "grad_norm": 1.5007015420493954, + "language_loss": 0.78744507, + "learning_rate": 2.486001680477873e-06, + "loss": 0.86453092, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12573242, + "step": 7305, + "time_per_iteration": 2.6403284072875977 + }, + { + "auxiliary_loss_clip": 0.06446742, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255019, + "epoch": 0.439260483992184, + "flos": 21914247945600.0, + "grad_norm": 1.7423010107893722, + "language_loss": 0.68937683, + "learning_rate": 2.485623883278308e-06, + "loss": 0.76653659, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14221191, + "step": 7306, + "time_per_iteration": 2.5665781497955322 + }, + { + "auxiliary_loss_clip": 0.06446797, + "auxiliary_loss_mlp": 0.01272443, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01258877, + "epoch": 0.43932060724485195, + "flos": 21002805158400.0, + "grad_norm": 1.5749593715316206, + "language_loss": 0.63249755, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.70968997, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13562012, + "step": 7307, + "time_per_iteration": 2.5204410552978516 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.0126805, + "balance_loss_clip": 0.06279462, + "balance_loss_mlp": 0.01254305, + "epoch": 0.4393807304975199, + "flos": 17752526081280.0, + "grad_norm": 1.900088770074622, + "language_loss": 0.72216207, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.79933721, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13745117, + "step": 7308, + "time_per_iteration": 2.4988410472869873 + }, + { + "auxiliary_loss_clip": 0.06445662, + "auxiliary_loss_mlp": 0.01268116, + "balance_loss_clip": 0.06277111, + "balance_loss_mlp": 0.01254669, + "epoch": 0.4394408537501879, + "flos": 22535102373120.0, + "grad_norm": 2.200318468716899, + "language_loss": 0.76911771, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.84625548, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13458252, + "step": 7309, + "time_per_iteration": 2.521385431289673 + }, + { + "auxiliary_loss_clip": 0.06438792, + "auxiliary_loss_mlp": 0.01270246, + "balance_loss_clip": 0.06277418, + "balance_loss_mlp": 0.01257908, + "epoch": 0.43950097700285584, + "flos": 23447383701120.0, + "grad_norm": 3.092354645663241, + "language_loss": 0.71101463, + "learning_rate": 2.484112510474251e-06, + "loss": 0.78810501, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12335205, + "step": 7310, + "time_per_iteration": 2.609769344329834 + }, + { + "auxiliary_loss_clip": 0.06452246, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.0125624, + "epoch": 0.4395611002555238, + "flos": 23186620195200.0, + "grad_norm": 3.6443795998554744, + "language_loss": 0.76179528, + "learning_rate": 2.483734621343429e-06, + "loss": 0.83900821, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.12817383, + "step": 7311, + "time_per_iteration": 2.5347063541412354 + }, + { + "auxiliary_loss_clip": 0.06451476, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01258043, + "epoch": 0.43962122350819177, + "flos": 22133908224000.0, + "grad_norm": 1.9101034753519561, + "language_loss": 0.81546378, + "learning_rate": 2.483356713869341e-06, + "loss": 0.89268786, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.12890625, + "step": 7312, + "time_per_iteration": 2.5744950771331787 + }, + { + "auxiliary_loss_clip": 0.06441756, + "auxiliary_loss_mlp": 0.01268695, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255713, + "epoch": 0.43968134676085974, + "flos": 17426285009280.0, + "grad_norm": 1.9172183853591918, + "language_loss": 0.86001694, + "learning_rate": 2.482978788066318e-06, + "loss": 0.93712139, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.12982178, + "step": 7313, + "time_per_iteration": 2.536870241165161 + }, + { + "auxiliary_loss_clip": 0.06445049, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01258184, + "epoch": 0.43974147001352776, + "flos": 18958582224000.0, + "grad_norm": 6.24702313006486, + "language_loss": 0.679317, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.75647992, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13061523, + "step": 7314, + "time_per_iteration": 2.5457370281219482 + }, + { + "auxiliary_loss_clip": 0.06448518, + "auxiliary_loss_mlp": 0.01271322, + "balance_loss_clip": 0.06279253, + "balance_loss_mlp": 0.01258209, + "epoch": 0.4398015932661957, + "flos": 18959588472960.0, + "grad_norm": 1.6336273312910292, + "language_loss": 0.76986659, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.84706497, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13098145, + "step": 7315, + "time_per_iteration": 2.5225329399108887 + }, + { + "auxiliary_loss_clip": 0.06442133, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06276898, + "balance_loss_mlp": 0.01255447, + "epoch": 0.4398617165188637, + "flos": 24205608097920.0, + "grad_norm": 2.1993234427936637, + "language_loss": 0.74934149, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.8264451, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.12780762, + "step": 7316, + "time_per_iteration": 2.5561742782592773 + }, + { + "auxiliary_loss_clip": 0.06444536, + "auxiliary_loss_mlp": 0.01271979, + "balance_loss_clip": 0.06280385, + "balance_loss_mlp": 0.01259289, + "epoch": 0.43992183977153165, + "flos": 22243214275200.0, + "grad_norm": 2.7598614180807814, + "language_loss": 0.65349543, + "learning_rate": 2.481466901851506e-06, + "loss": 0.73066062, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.12695312, + "step": 7317, + "time_per_iteration": 2.5142266750335693 + }, + { + "auxiliary_loss_clip": 0.06450248, + "auxiliary_loss_mlp": 0.01270442, + "balance_loss_clip": 0.06283192, + "balance_loss_mlp": 0.01256929, + "epoch": 0.4399819630241996, + "flos": 18703395014400.0, + "grad_norm": 1.826408349581849, + "language_loss": 0.80062312, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.87783003, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13519287, + "step": 7318, + "time_per_iteration": 2.519906520843506 + }, + { + "auxiliary_loss_clip": 0.06445621, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01255725, + "epoch": 0.4400420862768676, + "flos": 23886326914560.0, + "grad_norm": 1.6582419144412086, + "language_loss": 0.79880667, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.87595713, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13708496, + "step": 7319, + "time_per_iteration": 2.593442440032959 + }, + { + "auxiliary_loss_clip": 0.06445733, + "auxiliary_loss_mlp": 0.01274619, + "balance_loss_clip": 0.06279506, + "balance_loss_mlp": 0.01260547, + "epoch": 0.44010220952953555, + "flos": 28045071959040.0, + "grad_norm": 2.6685359162637172, + "language_loss": 0.80292428, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.88012779, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14074707, + "step": 7320, + "time_per_iteration": 2.576824188232422 + }, + { + "auxiliary_loss_clip": 0.06443729, + "auxiliary_loss_mlp": 0.01271309, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01258333, + "epoch": 0.4401623327822035, + "flos": 23775763052160.0, + "grad_norm": 3.573791590582856, + "language_loss": 0.69620574, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.77335614, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.12969971, + "step": 7321, + "time_per_iteration": 4.008130311965942 + }, + { + "auxiliary_loss_clip": 0.06352215, + "auxiliary_loss_mlp": 0.01268902, + "balance_loss_clip": 0.06277325, + "balance_loss_mlp": 0.01265612, + "epoch": 0.4402224560348715, + "flos": 70797320081280.0, + "grad_norm": 0.8902034574652531, + "language_loss": 0.56966496, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.64587617, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03295898, + "step": 7322, + "time_per_iteration": 4.591723680496216 + }, + { + "auxiliary_loss_clip": 0.06443685, + "auxiliary_loss_mlp": 0.01271286, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01258787, + "epoch": 0.44028257928753944, + "flos": 22898170114560.0, + "grad_norm": 1.423216656342095, + "language_loss": 0.76491451, + "learning_rate": 2.479198525097822e-06, + "loss": 0.8420642, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12493896, + "step": 7323, + "time_per_iteration": 2.5367372035980225 + }, + { + "auxiliary_loss_clip": 0.06449594, + "auxiliary_loss_mlp": 0.01277882, + "balance_loss_clip": 0.06282798, + "balance_loss_mlp": 0.01265216, + "epoch": 0.4403427025402074, + "flos": 17901719475840.0, + "grad_norm": 1.6412485345287482, + "language_loss": 0.80679965, + "learning_rate": 2.478820398622511e-06, + "loss": 0.88407433, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12670898, + "step": 7324, + "time_per_iteration": 2.496735095977783 + }, + { + "auxiliary_loss_clip": 0.0634661, + "auxiliary_loss_mlp": 0.01259308, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01255979, + "epoch": 0.4404028257928754, + "flos": 69583717071360.0, + "grad_norm": 0.6517122364434149, + "language_loss": 0.54482663, + "learning_rate": 2.478442253990283e-06, + "loss": 0.62088585, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03335571, + "step": 7325, + "time_per_iteration": 3.1927096843719482 + }, + { + "auxiliary_loss_clip": 0.06445315, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281503, + "balance_loss_mlp": 0.01253981, + "epoch": 0.44046294904554334, + "flos": 20930074214400.0, + "grad_norm": 1.5304533021700073, + "language_loss": 0.69945073, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.77656674, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.12298584, + "step": 7326, + "time_per_iteration": 2.5716168880462646 + }, + { + "auxiliary_loss_clip": 0.06441578, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06279023, + "balance_loss_mlp": 0.01254402, + "epoch": 0.44052307229821136, + "flos": 23630301164160.0, + "grad_norm": 1.488040619087652, + "language_loss": 0.76529855, + "learning_rate": 2.477685910312432e-06, + "loss": 0.84238315, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.12481689, + "step": 7327, + "time_per_iteration": 3.997654676437378 + }, + { + "auxiliary_loss_clip": 0.06439877, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277373, + "balance_loss_mlp": 0.01256744, + "epoch": 0.4405831955508793, + "flos": 17602536072960.0, + "grad_norm": 2.6410067735498512, + "language_loss": 0.83833683, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.91543245, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1295166, + "step": 7328, + "time_per_iteration": 2.520899534225464 + }, + { + "auxiliary_loss_clip": 0.06445633, + "auxiliary_loss_mlp": 0.01268864, + "balance_loss_clip": 0.06283547, + "balance_loss_mlp": 0.01255703, + "epoch": 0.4406433188035473, + "flos": 21468596405760.0, + "grad_norm": 3.134642090151518, + "language_loss": 0.77723283, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.85437775, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13165283, + "step": 7329, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01272671, + "balance_loss_clip": 0.06280035, + "balance_loss_mlp": 0.01259176, + "epoch": 0.44070344205621526, + "flos": 22680019209600.0, + "grad_norm": 1.6769566948090702, + "language_loss": 0.74290001, + "learning_rate": 2.476551258977278e-06, + "loss": 0.82010818, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1348877, + "step": 7330, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.06448483, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.06283589, + "balance_loss_mlp": 0.01258127, + "epoch": 0.4407635653088832, + "flos": 23448012606720.0, + "grad_norm": 1.699983061814717, + "language_loss": 0.74538559, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.82257915, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12762451, + "step": 7331, + "time_per_iteration": 2.5442659854888916 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06279509, + "balance_loss_mlp": 0.01256667, + "epoch": 0.4408236885615512, + "flos": 24027596098560.0, + "grad_norm": 1.6889636086213913, + "language_loss": 0.76643395, + "learning_rate": 2.475794734375581e-06, + "loss": 0.84356534, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13659668, + "step": 7332, + "time_per_iteration": 2.5714762210845947 + }, + { + "auxiliary_loss_clip": 0.06442308, + "auxiliary_loss_mlp": 0.01271754, + "balance_loss_clip": 0.06277508, + "balance_loss_mlp": 0.01258272, + "epoch": 0.44088381181421915, + "flos": 12681667416960.0, + "grad_norm": 1.845933322464005, + "language_loss": 0.73768836, + "learning_rate": 2.475416445004285e-06, + "loss": 0.81482899, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1348877, + "step": 7333, + "time_per_iteration": 3.9176201820373535 + }, + { + "auxiliary_loss_clip": 0.06439593, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06280486, + "balance_loss_mlp": 0.01253486, + "epoch": 0.4409439350668871, + "flos": 24576474268800.0, + "grad_norm": 1.6297964144317614, + "language_loss": 0.79249531, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.8695479, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12200928, + "step": 7334, + "time_per_iteration": 2.530762195587158 + }, + { + "auxiliary_loss_clip": 0.06456793, + "auxiliary_loss_mlp": 0.01269696, + "balance_loss_clip": 0.06281539, + "balance_loss_mlp": 0.01254747, + "epoch": 0.4410040583195551, + "flos": 22674191351040.0, + "grad_norm": 7.845487214918662, + "language_loss": 0.7603153, + "learning_rate": 2.47465981219252e-06, + "loss": 0.83758014, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1496582, + "step": 7335, + "time_per_iteration": 2.5146994590759277 + }, + { + "auxiliary_loss_clip": 0.06445056, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01254942, + "epoch": 0.44106418157222305, + "flos": 10857062833920.0, + "grad_norm": 1.9701535584859973, + "language_loss": 0.72720182, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.80434465, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14263916, + "step": 7336, + "time_per_iteration": 2.470501661300659 + }, + { + "auxiliary_loss_clip": 0.06448875, + "auxiliary_loss_mlp": 0.01272884, + "balance_loss_clip": 0.06281201, + "balance_loss_mlp": 0.01259079, + "epoch": 0.441124304824891, + "flos": 21733301053440.0, + "grad_norm": 2.690720747597236, + "language_loss": 0.62764168, + "learning_rate": 2.473903107384165e-06, + "loss": 0.70485932, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13812256, + "step": 7337, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06339368, + "auxiliary_loss_mlp": 0.01255392, + "balance_loss_clip": 0.06265444, + "balance_loss_mlp": 0.01252635, + "epoch": 0.441184428077559, + "flos": 63241702041600.0, + "grad_norm": 0.7296971987367982, + "language_loss": 0.52622962, + "learning_rate": 2.473524728017134e-06, + "loss": 0.60217726, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.02761841, + "step": 7338, + "time_per_iteration": 3.1634135246276855 + }, + { + "auxiliary_loss_clip": 0.06451306, + "auxiliary_loss_mlp": 0.0127376, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01259133, + "epoch": 0.44124455133022694, + "flos": 21184213248000.0, + "grad_norm": 2.888450189779477, + "language_loss": 0.71053195, + "learning_rate": 2.473146330693997e-06, + "loss": 0.78778255, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14611816, + "step": 7339, + "time_per_iteration": 2.526179552078247 + }, + { + "auxiliary_loss_clip": 0.06437125, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.06279349, + "balance_loss_mlp": 0.01252833, + "epoch": 0.4413046745828949, + "flos": 17463740584320.0, + "grad_norm": 1.6365123651784117, + "language_loss": 0.70282859, + "learning_rate": 2.472767915429105e-06, + "loss": 0.77985364, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12554932, + "step": 7340, + "time_per_iteration": 2.4790234565734863 + }, + { + "auxiliary_loss_clip": 0.06342094, + "auxiliary_loss_mlp": 0.01254424, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.01251767, + "epoch": 0.4413647978355629, + "flos": 61602251783040.0, + "grad_norm": 0.8821319445569078, + "language_loss": 0.64009017, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.71605539, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7341, + "time_per_iteration": 2.9593453407287598 + }, + { + "auxiliary_loss_clip": 0.06446001, + "auxiliary_loss_mlp": 0.0127129, + "balance_loss_clip": 0.06280506, + "balance_loss_mlp": 0.01257992, + "epoch": 0.4414249210882309, + "flos": 27534404050560.0, + "grad_norm": 1.9827417031820809, + "language_loss": 0.73812068, + "learning_rate": 2.47201103113145e-06, + "loss": 0.81529361, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13299561, + "step": 7342, + "time_per_iteration": 2.5592381954193115 + }, + { + "auxiliary_loss_clip": 0.06443819, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258497, + "epoch": 0.44148504434089886, + "flos": 23520785477760.0, + "grad_norm": 1.7847903417039304, + "language_loss": 0.80326116, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.88042319, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13885498, + "step": 7343, + "time_per_iteration": 2.567669630050659 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01268371, + "balance_loss_clip": 0.06281629, + "balance_loss_mlp": 0.01254382, + "epoch": 0.4415451675935668, + "flos": 21587126405760.0, + "grad_norm": 1.6274174275387656, + "language_loss": 0.7678231, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.84496725, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14001465, + "step": 7344, + "time_per_iteration": 2.50498628616333 + }, + { + "auxiliary_loss_clip": 0.06331868, + "auxiliary_loss_mlp": 0.01254509, + "balance_loss_clip": 0.06258254, + "balance_loss_mlp": 0.01251979, + "epoch": 0.4416052908462348, + "flos": 59023825142400.0, + "grad_norm": 0.9594048262741005, + "language_loss": 0.63725042, + "learning_rate": 2.470875570480556e-06, + "loss": 0.71311414, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02529907, + "step": 7345, + "time_per_iteration": 2.9305789470672607 + }, + { + "auxiliary_loss_clip": 0.06448534, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06281187, + "balance_loss_mlp": 0.01255386, + "epoch": 0.44166541409890275, + "flos": 26364545671680.0, + "grad_norm": 1.5861169822925434, + "language_loss": 0.86231661, + "learning_rate": 2.470497047866489e-06, + "loss": 0.9394989, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14306641, + "step": 7346, + "time_per_iteration": 2.566326141357422 + }, + { + "auxiliary_loss_clip": 0.06448992, + "auxiliary_loss_mlp": 0.01268131, + "balance_loss_clip": 0.06282933, + "balance_loss_mlp": 0.01253909, + "epoch": 0.4417255373515707, + "flos": 20198739778560.0, + "grad_norm": 1.9006247897038917, + "language_loss": 0.80872411, + "learning_rate": 2.470118507411128e-06, + "loss": 0.88589537, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14221191, + "step": 7347, + "time_per_iteration": 2.4968490600585938 + }, + { + "auxiliary_loss_clip": 0.06445403, + "auxiliary_loss_mlp": 0.01269031, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01254166, + "epoch": 0.4417856606042387, + "flos": 17892537454080.0, + "grad_norm": 1.9280841383218132, + "language_loss": 0.83507645, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.91222078, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14868164, + "step": 7348, + "time_per_iteration": 2.5483500957489014 + }, + { + "auxiliary_loss_clip": 0.06451687, + "auxiliary_loss_mlp": 0.01270301, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.0125571, + "epoch": 0.44184578385690665, + "flos": 27971376693120.0, + "grad_norm": 2.209333058456871, + "language_loss": 0.70229864, + "learning_rate": 2.469361373033938e-06, + "loss": 0.77951854, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14593506, + "step": 7349, + "time_per_iteration": 2.5552031993865967 + }, + { + "auxiliary_loss_clip": 0.06448848, + "auxiliary_loss_mlp": 0.01269717, + "balance_loss_clip": 0.06281149, + "balance_loss_mlp": 0.01254858, + "epoch": 0.4419059071095746, + "flos": 23374652757120.0, + "grad_norm": 1.8931524120790788, + "language_loss": 0.74732667, + "learning_rate": 2.468982779140819e-06, + "loss": 0.82451236, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14855957, + "step": 7350, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06449752, + "auxiliary_loss_mlp": 0.01269052, + "balance_loss_clip": 0.06283528, + "balance_loss_mlp": 0.01254591, + "epoch": 0.4419660303622426, + "flos": 15017443032960.0, + "grad_norm": 2.6211867622298626, + "language_loss": 0.81412131, + "learning_rate": 2.468604167463827e-06, + "loss": 0.89130938, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14453125, + "step": 7351, + "time_per_iteration": 2.5310895442962646 + }, + { + "auxiliary_loss_clip": 0.06439559, + "auxiliary_loss_mlp": 0.01271292, + "balance_loss_clip": 0.06278528, + "balance_loss_mlp": 0.01258537, + "epoch": 0.44202615361491054, + "flos": 25378359442560.0, + "grad_norm": 1.998249332467298, + "language_loss": 0.73669267, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.81380117, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12774658, + "step": 7352, + "time_per_iteration": 2.6823537349700928 + }, + { + "auxiliary_loss_clip": 0.06450884, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06284234, + "balance_loss_mlp": 0.01253584, + "epoch": 0.4420862768675785, + "flos": 24688044380160.0, + "grad_norm": 1.9707834429969424, + "language_loss": 0.87580955, + "learning_rate": 2.467846890815649e-06, + "loss": 0.95299494, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14086914, + "step": 7353, + "time_per_iteration": 2.531208038330078 + }, + { + "auxiliary_loss_clip": 0.06445745, + "auxiliary_loss_mlp": 0.01274404, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01260659, + "epoch": 0.44214640012024653, + "flos": 19533134471040.0, + "grad_norm": 2.5061219192509676, + "language_loss": 0.76425511, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.84145659, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13751221, + "step": 7354, + "time_per_iteration": 2.5208046436309814 + }, + { + "auxiliary_loss_clip": 0.06442366, + "auxiliary_loss_mlp": 0.01269638, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01256894, + "epoch": 0.4422065233729145, + "flos": 47568143940480.0, + "grad_norm": 2.32689870132585, + "language_loss": 0.65273595, + "learning_rate": 2.467089543204268e-06, + "loss": 0.72985595, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12768555, + "step": 7355, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.06452843, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01257225, + "epoch": 0.44226664662558246, + "flos": 19287045429120.0, + "grad_norm": 1.8090120162092156, + "language_loss": 0.78513968, + "learning_rate": 2.466710842823274e-06, + "loss": 0.86239338, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15307617, + "step": 7356, + "time_per_iteration": 2.5535836219787598 + }, + { + "auxiliary_loss_clip": 0.0645135, + "auxiliary_loss_mlp": 0.01270574, + "balance_loss_clip": 0.0628085, + "balance_loss_mlp": 0.01255184, + "epoch": 0.4423267698782504, + "flos": 17827604939520.0, + "grad_norm": 1.5923292427452285, + "language_loss": 0.77331412, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.85053337, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1539917, + "step": 7357, + "time_per_iteration": 2.472616195678711 + }, + { + "auxiliary_loss_clip": 0.06444242, + "auxiliary_loss_mlp": 0.0127409, + "balance_loss_clip": 0.06277513, + "balance_loss_mlp": 0.01259112, + "epoch": 0.4423868931309184, + "flos": 29211953518080.0, + "grad_norm": 1.4316006976636513, + "language_loss": 0.73656726, + "learning_rate": 2.465953388982481e-06, + "loss": 0.81375057, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14971924, + "step": 7358, + "time_per_iteration": 2.596794366836548 + }, + { + "auxiliary_loss_clip": 0.06449263, + "auxiliary_loss_mlp": 0.01268513, + "balance_loss_clip": 0.06281863, + "balance_loss_mlp": 0.01255131, + "epoch": 0.44244701638358636, + "flos": 29720399293440.0, + "grad_norm": 1.5482043588344903, + "language_loss": 0.75746959, + "learning_rate": 2.465574635551405e-06, + "loss": 0.83464736, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13378906, + "step": 7359, + "time_per_iteration": 2.565152168273926 + }, + { + "auxiliary_loss_clip": 0.06449427, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06282771, + "balance_loss_mlp": 0.01258907, + "epoch": 0.4425071396362543, + "flos": 22936715792640.0, + "grad_norm": 1.7006216058888692, + "language_loss": 0.70234901, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.77957749, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14526367, + "step": 7360, + "time_per_iteration": 3.9516735076904297 + }, + { + "auxiliary_loss_clip": 0.06450445, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06282296, + "balance_loss_mlp": 0.01262028, + "epoch": 0.4425672628889223, + "flos": 19798509951360.0, + "grad_norm": 2.334645337647824, + "language_loss": 0.69802427, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.77529514, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14599609, + "step": 7361, + "time_per_iteration": 3.9590420722961426 + }, + { + "auxiliary_loss_clip": 0.06448395, + "auxiliary_loss_mlp": 0.01271063, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01256287, + "epoch": 0.44262738614159025, + "flos": 13667266667520.0, + "grad_norm": 1.9889994262633817, + "language_loss": 0.82882756, + "learning_rate": 2.464438269387809e-06, + "loss": 0.90602213, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14770508, + "step": 7362, + "time_per_iteration": 2.4627645015716553 + }, + { + "auxiliary_loss_clip": 0.06458044, + "auxiliary_loss_mlp": 0.01274491, + "balance_loss_clip": 0.06284538, + "balance_loss_mlp": 0.01258111, + "epoch": 0.4426875093942582, + "flos": 14215474005120.0, + "grad_norm": 1.7592716332344263, + "language_loss": 0.75051332, + "learning_rate": 2.464059445424366e-06, + "loss": 0.82783866, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.16381836, + "step": 7363, + "time_per_iteration": 2.526925802230835 + }, + { + "auxiliary_loss_clip": 0.0633463, + "auxiliary_loss_mlp": 0.01256608, + "balance_loss_clip": 0.06260501, + "balance_loss_mlp": 0.01253844, + "epoch": 0.4427476326469262, + "flos": 70140100181760.0, + "grad_norm": 0.6687771463902197, + "language_loss": 0.55581295, + "learning_rate": 2.463680603863743e-06, + "loss": 0.63172531, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02767944, + "step": 7364, + "time_per_iteration": 3.2234084606170654 + }, + { + "auxiliary_loss_clip": 0.06445954, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06280937, + "balance_loss_mlp": 0.01255479, + "epoch": 0.44280775589959415, + "flos": 25451761219200.0, + "grad_norm": 6.076987981061014, + "language_loss": 0.75066888, + "learning_rate": 2.463301744720305e-06, + "loss": 0.82782239, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13928223, + "step": 7365, + "time_per_iteration": 2.606168746948242 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06282686, + "balance_loss_mlp": 0.01253724, + "epoch": 0.4428678791522621, + "flos": 22863900994560.0, + "grad_norm": 1.5120042705282817, + "language_loss": 0.74655497, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.82372636, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1473999, + "step": 7366, + "time_per_iteration": 2.5269834995269775 + }, + { + "auxiliary_loss_clip": 0.06449491, + "auxiliary_loss_mlp": 0.0127034, + "balance_loss_clip": 0.06283636, + "balance_loss_mlp": 0.01255438, + "epoch": 0.44292800240493013, + "flos": 25819608643200.0, + "grad_norm": 2.3253747528787447, + "language_loss": 0.7339704, + "learning_rate": 2.46254397374245e-06, + "loss": 0.81116873, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14904785, + "step": 7367, + "time_per_iteration": 4.017570495605469 + }, + { + "auxiliary_loss_clip": 0.06453082, + "auxiliary_loss_mlp": 0.01276023, + "balance_loss_clip": 0.06286091, + "balance_loss_mlp": 0.01260979, + "epoch": 0.4429881256575981, + "flos": 32425238217600.0, + "grad_norm": 1.584590811661976, + "language_loss": 0.73953557, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.81682664, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15057373, + "step": 7368, + "time_per_iteration": 2.6219804286956787 + }, + { + "auxiliary_loss_clip": 0.06446074, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06281151, + "balance_loss_mlp": 0.01256007, + "epoch": 0.44304824891026606, + "flos": 22170231768960.0, + "grad_norm": 1.6442785623938219, + "language_loss": 0.79845673, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.8756206, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14306641, + "step": 7369, + "time_per_iteration": 2.5048859119415283 + }, + { + "auxiliary_loss_clip": 0.06445719, + "auxiliary_loss_mlp": 0.01268056, + "balance_loss_clip": 0.0628242, + "balance_loss_mlp": 0.01253524, + "epoch": 0.443108372162934, + "flos": 25345725477120.0, + "grad_norm": 1.8080912741875748, + "language_loss": 0.72226167, + "learning_rate": 2.461407185763737e-06, + "loss": 0.79939938, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.14538574, + "step": 7370, + "time_per_iteration": 2.59167218208313 + }, + { + "auxiliary_loss_clip": 0.06444093, + "auxiliary_loss_mlp": 0.01274154, + "balance_loss_clip": 0.06279977, + "balance_loss_mlp": 0.01259741, + "epoch": 0.443168495415602, + "flos": 23337616452480.0, + "grad_norm": 2.642683672552081, + "language_loss": 0.70957971, + "learning_rate": 2.461028221425126e-06, + "loss": 0.78676224, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14428711, + "step": 7371, + "time_per_iteration": 2.5119266510009766 + }, + { + "auxiliary_loss_clip": 0.0644391, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.01255288, + "epoch": 0.44322861866826996, + "flos": 21877924400640.0, + "grad_norm": 2.5641722247612977, + "language_loss": 0.69211292, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.76923823, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13330078, + "step": 7372, + "time_per_iteration": 2.575803518295288 + }, + { + "auxiliary_loss_clip": 0.06450622, + "auxiliary_loss_mlp": 0.01273627, + "balance_loss_clip": 0.06281562, + "balance_loss_mlp": 0.01257855, + "epoch": 0.4432887419209379, + "flos": 20090649611520.0, + "grad_norm": 1.7339006835744544, + "language_loss": 0.83742619, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.91466868, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15765381, + "step": 7373, + "time_per_iteration": 4.006488084793091 + }, + { + "auxiliary_loss_clip": 0.06340961, + "auxiliary_loss_mlp": 0.01252329, + "balance_loss_clip": 0.06267951, + "balance_loss_mlp": 0.01249765, + "epoch": 0.4433488651736059, + "flos": 70056593988480.0, + "grad_norm": 0.7566866942124226, + "language_loss": 0.55204445, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.62797731, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02565002, + "step": 7374, + "time_per_iteration": 3.1780457496643066 + }, + { + "auxiliary_loss_clip": 0.06443411, + "auxiliary_loss_mlp": 0.01275671, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01260198, + "epoch": 0.44340898842627385, + "flos": 16286838462720.0, + "grad_norm": 2.3260457628480617, + "language_loss": 0.82868445, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.90587527, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.15478516, + "step": 7375, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.0644948, + "auxiliary_loss_mlp": 0.01269753, + "balance_loss_clip": 0.06282064, + "balance_loss_mlp": 0.01255388, + "epoch": 0.4434691116789418, + "flos": 16616601406080.0, + "grad_norm": 2.217281539940859, + "language_loss": 0.83904636, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.91623867, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1439209, + "step": 7376, + "time_per_iteration": 2.4960668087005615 + }, + { + "auxiliary_loss_clip": 0.06447101, + "auxiliary_loss_mlp": 0.01271986, + "balance_loss_clip": 0.06282647, + "balance_loss_mlp": 0.01257573, + "epoch": 0.4435292349316098, + "flos": 19069397648640.0, + "grad_norm": 1.7110647715019258, + "language_loss": 0.77357483, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.85076571, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.14422607, + "step": 7377, + "time_per_iteration": 2.5489466190338135 + }, + { + "auxiliary_loss_clip": 0.064443, + "auxiliary_loss_mlp": 0.01269165, + "balance_loss_clip": 0.06284986, + "balance_loss_mlp": 0.01255396, + "epoch": 0.44358935818427775, + "flos": 21257656951680.0, + "grad_norm": 1.7746716431943175, + "language_loss": 0.75928617, + "learning_rate": 2.458374982357057e-06, + "loss": 0.83642089, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13763428, + "step": 7378, + "time_per_iteration": 2.498782157897949 + }, + { + "auxiliary_loss_clip": 0.06446375, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06281648, + "balance_loss_mlp": 0.01255106, + "epoch": 0.4436494814369457, + "flos": 12500259327360.0, + "grad_norm": 1.8740687903376234, + "language_loss": 0.69627756, + "learning_rate": 2.457995878562982e-06, + "loss": 0.77344066, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.14825439, + "step": 7379, + "time_per_iteration": 2.5212602615356445 + }, + { + "auxiliary_loss_clip": 0.0645185, + "auxiliary_loss_mlp": 0.01266938, + "balance_loss_clip": 0.0628576, + "balance_loss_mlp": 0.01252556, + "epoch": 0.44370960468961373, + "flos": 23666666636160.0, + "grad_norm": 2.508566876625721, + "language_loss": 0.73565447, + "learning_rate": 2.457616757401656e-06, + "loss": 0.81284231, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1439209, + "step": 7380, + "time_per_iteration": 2.500859260559082 + }, + { + "auxiliary_loss_clip": 0.06449685, + "auxiliary_loss_mlp": 0.01268804, + "balance_loss_clip": 0.06285541, + "balance_loss_mlp": 0.01255452, + "epoch": 0.4437697279422817, + "flos": 32425196290560.0, + "grad_norm": 1.7107220322970214, + "language_loss": 0.65104783, + "learning_rate": 2.457237618887458e-06, + "loss": 0.72823262, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13336182, + "step": 7381, + "time_per_iteration": 2.618229627609253 + }, + { + "auxiliary_loss_clip": 0.06454551, + "auxiliary_loss_mlp": 0.01272971, + "balance_loss_clip": 0.06288015, + "balance_loss_mlp": 0.01258773, + "epoch": 0.44382985119494966, + "flos": 18118570642560.0, + "grad_norm": 2.331874867497661, + "language_loss": 0.80543017, + "learning_rate": 2.456858463034763e-06, + "loss": 0.88270545, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14190674, + "step": 7382, + "time_per_iteration": 2.4738404750823975 + }, + { + "auxiliary_loss_clip": 0.06452931, + "auxiliary_loss_mlp": 0.01272481, + "balance_loss_clip": 0.06287742, + "balance_loss_mlp": 0.01258486, + "epoch": 0.44388997444761763, + "flos": 30782083651200.0, + "grad_norm": 1.5922456749371714, + "language_loss": 0.65226638, + "learning_rate": 2.456479289857949e-06, + "loss": 0.72952044, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13983154, + "step": 7383, + "time_per_iteration": 2.614912986755371 + }, + { + "auxiliary_loss_clip": 0.0645685, + "auxiliary_loss_mlp": 0.01272667, + "balance_loss_clip": 0.0628838, + "balance_loss_mlp": 0.01258088, + "epoch": 0.4439500977002856, + "flos": 20345333696640.0, + "grad_norm": 2.064556949518224, + "language_loss": 0.76699257, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.84428775, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14587402, + "step": 7384, + "time_per_iteration": 2.4842731952667236 + }, + { + "auxiliary_loss_clip": 0.06456664, + "auxiliary_loss_mlp": 0.012692, + "balance_loss_clip": 0.06288753, + "balance_loss_mlp": 0.01254442, + "epoch": 0.44401022095295356, + "flos": 20376667923840.0, + "grad_norm": 2.2924078267975605, + "language_loss": 0.80810666, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.88536537, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14758301, + "step": 7385, + "time_per_iteration": 2.5268380641937256 + }, + { + "auxiliary_loss_clip": 0.0645503, + "auxiliary_loss_mlp": 0.01272748, + "balance_loss_clip": 0.06290472, + "balance_loss_mlp": 0.01257013, + "epoch": 0.4440703442056215, + "flos": 20236950040320.0, + "grad_norm": 1.6897241264536553, + "language_loss": 0.82179439, + "learning_rate": 2.455341666526582e-06, + "loss": 0.89907217, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.15734863, + "step": 7386, + "time_per_iteration": 2.497891426086426 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01273049, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4441304674582895, + "flos": 39504163979520.0, + "grad_norm": 2.9557468241194624, + "language_loss": 0.70275033, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.78011411, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15356445, + "step": 7387, + "time_per_iteration": 2.6782705783843994 + }, + { + "auxiliary_loss_clip": 0.06455649, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06289866, + "balance_loss_mlp": 0.01258206, + "epoch": 0.44419059071095746, + "flos": 14834902913280.0, + "grad_norm": 1.9684531060003607, + "language_loss": 0.72165161, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.79893732, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14715576, + "step": 7388, + "time_per_iteration": 2.5119476318359375 + }, + { + "auxiliary_loss_clip": 0.06464041, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06293739, + "balance_loss_mlp": 0.01255113, + "epoch": 0.4442507139636254, + "flos": 22644408424320.0, + "grad_norm": 1.566920019209845, + "language_loss": 0.69646138, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.77380753, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15454102, + "step": 7389, + "time_per_iteration": 2.671290874481201 + }, + { + "auxiliary_loss_clip": 0.06455444, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289597, + "balance_loss_mlp": 0.01255487, + "epoch": 0.4443108372162934, + "flos": 38299994553600.0, + "grad_norm": 1.918848783354648, + "language_loss": 0.74912727, + "learning_rate": 2.453824593752788e-06, + "loss": 0.82637799, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14135742, + "step": 7390, + "time_per_iteration": 2.6656923294067383 + }, + { + "auxiliary_loss_clip": 0.06453501, + "auxiliary_loss_mlp": 0.01269903, + "balance_loss_clip": 0.06290193, + "balance_loss_mlp": 0.0125657, + "epoch": 0.44437096046896135, + "flos": 17754790141440.0, + "grad_norm": 1.7902511429273704, + "language_loss": 0.82203722, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.89927119, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13323975, + "step": 7391, + "time_per_iteration": 2.5425097942352295 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06289234, + "balance_loss_mlp": 0.01254547, + "epoch": 0.4444310837216293, + "flos": 13736936937600.0, + "grad_norm": 1.5949305897923123, + "language_loss": 0.73880637, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.81601214, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14044189, + "step": 7392, + "time_per_iteration": 2.509695053100586 + }, + { + "auxiliary_loss_clip": 0.06450866, + "auxiliary_loss_mlp": 0.01269173, + "balance_loss_clip": 0.06287552, + "balance_loss_mlp": 0.01256424, + "epoch": 0.44449120697429734, + "flos": 25017346126080.0, + "grad_norm": 1.7319744549950544, + "language_loss": 0.79953551, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.87673593, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12744141, + "step": 7393, + "time_per_iteration": 2.6058006286621094 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06291801, + "balance_loss_mlp": 0.01255276, + "epoch": 0.4445513302269653, + "flos": 32680006156800.0, + "grad_norm": 1.76893741086752, + "language_loss": 0.8113097, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.88862437, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15551758, + "step": 7394, + "time_per_iteration": 2.6408586502075195 + }, + { + "auxiliary_loss_clip": 0.06448914, + "auxiliary_loss_mlp": 0.01267892, + "balance_loss_clip": 0.06286056, + "balance_loss_mlp": 0.01254796, + "epoch": 0.44461145347963327, + "flos": 11660583162240.0, + "grad_norm": 2.0227503675909646, + "language_loss": 0.79471397, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.87188208, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13092041, + "step": 7395, + "time_per_iteration": 2.482771158218384 + }, + { + "auxiliary_loss_clip": 0.06457528, + "auxiliary_loss_mlp": 0.01269923, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01255838, + "epoch": 0.44467157673230123, + "flos": 20893079836800.0, + "grad_norm": 1.8465254869377097, + "language_loss": 0.68925393, + "learning_rate": 2.451548468607584e-06, + "loss": 0.76652849, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14074707, + "step": 7396, + "time_per_iteration": 2.526031017303467 + }, + { + "auxiliary_loss_clip": 0.06458125, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06290217, + "balance_loss_mlp": 0.0125299, + "epoch": 0.4447316999849692, + "flos": 18551140945920.0, + "grad_norm": 2.1703937468753964, + "language_loss": 0.80956584, + "learning_rate": 2.451169054403126e-06, + "loss": 0.88681042, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13342285, + "step": 7397, + "time_per_iteration": 2.482004404067993 + }, + { + "auxiliary_loss_clip": 0.06453413, + "auxiliary_loss_mlp": 0.01269867, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01256814, + "epoch": 0.44479182323763716, + "flos": 23775846906240.0, + "grad_norm": 2.7975733901761672, + "language_loss": 0.67842102, + "learning_rate": 2.450789623090293e-06, + "loss": 0.75565386, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13067627, + "step": 7398, + "time_per_iteration": 2.579227924346924 + }, + { + "auxiliary_loss_clip": 0.06451767, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06290427, + "balance_loss_mlp": 0.01256097, + "epoch": 0.44485194649030513, + "flos": 16549237123200.0, + "grad_norm": 1.6886298033370946, + "language_loss": 0.70454216, + "learning_rate": 2.450410174683472e-06, + "loss": 0.78174973, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12896729, + "step": 7399, + "time_per_iteration": 2.491422653198242 + }, + { + "auxiliary_loss_clip": 0.06448349, + "auxiliary_loss_mlp": 0.01267519, + "balance_loss_clip": 0.06287403, + "balance_loss_mlp": 0.01254543, + "epoch": 0.4449120697429731, + "flos": 22607455973760.0, + "grad_norm": 1.7365156462421643, + "language_loss": 0.72588718, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.80304587, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12963867, + "step": 7400, + "time_per_iteration": 3.9914138317108154 + }, + { + "auxiliary_loss_clip": 0.06451382, + "auxiliary_loss_mlp": 0.01270619, + "balance_loss_clip": 0.06288703, + "balance_loss_mlp": 0.0125738, + "epoch": 0.44497219299564106, + "flos": 20009994456960.0, + "grad_norm": 1.5547932465186114, + "language_loss": 0.85223019, + "learning_rate": 2.449651226645422e-06, + "loss": 0.92945021, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13250732, + "step": 7401, + "time_per_iteration": 3.972844123840332 + }, + { + "auxiliary_loss_clip": 0.0644277, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01254099, + "epoch": 0.445032316248309, + "flos": 25601499665280.0, + "grad_norm": 1.7738805367720483, + "language_loss": 0.8345179, + "learning_rate": 2.449271727042973e-06, + "loss": 0.91160637, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.11968994, + "step": 7402, + "time_per_iteration": 2.546557664871216 + }, + { + "auxiliary_loss_clip": 0.06449325, + "auxiliary_loss_mlp": 0.0126916, + "balance_loss_clip": 0.06285563, + "balance_loss_mlp": 0.01255898, + "epoch": 0.445092439500977, + "flos": 21256608775680.0, + "grad_norm": 1.6765614973905527, + "language_loss": 0.77230763, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.84949255, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13275146, + "step": 7403, + "time_per_iteration": 2.540351152420044 + }, + { + "auxiliary_loss_clip": 0.06362203, + "auxiliary_loss_mlp": 0.01255762, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01252394, + "epoch": 0.44515256275364495, + "flos": 57781990506240.0, + "grad_norm": 0.751382178532419, + "language_loss": 0.60078514, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.67696476, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.03375244, + "step": 7404, + "time_per_iteration": 3.1188013553619385 + }, + { + "auxiliary_loss_clip": 0.06455964, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06287853, + "balance_loss_mlp": 0.01258462, + "epoch": 0.4452126860063129, + "flos": 15601386936960.0, + "grad_norm": 1.4877710129276585, + "language_loss": 0.82279229, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.90007967, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14312744, + "step": 7405, + "time_per_iteration": 2.5388095378875732 + }, + { + "auxiliary_loss_clip": 0.06447265, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06283686, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4452728092589809, + "flos": 21623995002240.0, + "grad_norm": 1.5786988713847923, + "language_loss": 0.75529754, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.83244896, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.12744141, + "step": 7406, + "time_per_iteration": 2.5249385833740234 + }, + { + "auxiliary_loss_clip": 0.06440533, + "auxiliary_loss_mlp": 0.01271164, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259094, + "epoch": 0.4453329325116489, + "flos": 29505267135360.0, + "grad_norm": 1.6524917293298949, + "language_loss": 0.65847838, + "learning_rate": 2.447373973772129e-06, + "loss": 0.73559535, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12060547, + "step": 7407, + "time_per_iteration": 3.998326063156128 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06284529, + "balance_loss_mlp": 0.01256777, + "epoch": 0.44539305576431687, + "flos": 21367549981440.0, + "grad_norm": 1.547450204556426, + "language_loss": 0.68216872, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.75936574, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13018799, + "step": 7408, + "time_per_iteration": 2.5295586585998535 + }, + { + "auxiliary_loss_clip": 0.06449315, + "auxiliary_loss_mlp": 0.01269644, + "balance_loss_clip": 0.06285807, + "balance_loss_mlp": 0.01256508, + "epoch": 0.44545317901698483, + "flos": 41437278000000.0, + "grad_norm": 2.0427525389439443, + "language_loss": 0.720608, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.79779756, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13134766, + "step": 7409, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06448312, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06284307, + "balance_loss_mlp": 0.01257045, + "epoch": 0.4455133022696528, + "flos": 22061638477440.0, + "grad_norm": 1.7184461657241017, + "language_loss": 0.65940762, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.73659933, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13818359, + "step": 7410, + "time_per_iteration": 2.5486950874328613 + }, + { + "auxiliary_loss_clip": 0.06453686, + "auxiliary_loss_mlp": 0.01268565, + "balance_loss_clip": 0.06284985, + "balance_loss_mlp": 0.0125522, + "epoch": 0.44557342552232077, + "flos": 23483665319040.0, + "grad_norm": 3.696220183147237, + "language_loss": 0.74690163, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.82412422, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13360596, + "step": 7411, + "time_per_iteration": 2.5290050506591797 + }, + { + "auxiliary_loss_clip": 0.0644176, + "auxiliary_loss_mlp": 0.01268016, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256166, + "epoch": 0.44563354877498873, + "flos": 19140577292160.0, + "grad_norm": 2.065063291172047, + "language_loss": 0.7906481, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.86774588, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11859131, + "step": 7412, + "time_per_iteration": 2.5156190395355225 + }, + { + "auxiliary_loss_clip": 0.0645022, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.06282784, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4456936720276567, + "flos": 13625744169600.0, + "grad_norm": 2.15802472542835, + "language_loss": 0.80199099, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.87918305, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13195801, + "step": 7413, + "time_per_iteration": 3.9694504737854004 + }, + { + "auxiliary_loss_clip": 0.06443125, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0628258, + "balance_loss_mlp": 0.01254037, + "epoch": 0.44575379528032466, + "flos": 14717840359680.0, + "grad_norm": 1.9357576200238034, + "language_loss": 0.76531088, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.8424021, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.11956787, + "step": 7414, + "time_per_iteration": 2.515110731124878 + }, + { + "auxiliary_loss_clip": 0.06447163, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01257177, + "epoch": 0.4458139185329926, + "flos": 24177586106880.0, + "grad_norm": 1.4166090983539044, + "language_loss": 0.84000552, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.91717345, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12457275, + "step": 7415, + "time_per_iteration": 2.528939723968506 + }, + { + "auxiliary_loss_clip": 0.06442896, + "auxiliary_loss_mlp": 0.01267494, + "balance_loss_clip": 0.06282021, + "balance_loss_mlp": 0.01254733, + "epoch": 0.4458740417856606, + "flos": 21768660276480.0, + "grad_norm": 1.9578275078246672, + "language_loss": 0.84485269, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.92195654, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12756348, + "step": 7416, + "time_per_iteration": 2.57027268409729 + }, + { + "auxiliary_loss_clip": 0.06451635, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06285699, + "balance_loss_mlp": 0.01255298, + "epoch": 0.44593416503832856, + "flos": 21075074904960.0, + "grad_norm": 1.7085615846271827, + "language_loss": 0.81362593, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.89082199, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.12670898, + "step": 7417, + "time_per_iteration": 2.547837734222412 + }, + { + "auxiliary_loss_clip": 0.06448114, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06283562, + "balance_loss_mlp": 0.01255601, + "epoch": 0.4459942882909965, + "flos": 22606910922240.0, + "grad_norm": 1.8801354401717048, + "language_loss": 0.81286234, + "learning_rate": 2.443197426237077e-06, + "loss": 0.89001989, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.12042236, + "step": 7418, + "time_per_iteration": 2.5529236793518066 + }, + { + "auxiliary_loss_clip": 0.06449951, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.06284475, + "balance_loss_mlp": 0.01255652, + "epoch": 0.4460544115436645, + "flos": 26512732817280.0, + "grad_norm": 1.8068813549808598, + "language_loss": 0.77866399, + "learning_rate": 2.442817638972991e-06, + "loss": 0.85584641, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.12646484, + "step": 7419, + "time_per_iteration": 2.637568235397339 + }, + { + "auxiliary_loss_clip": 0.06446308, + "auxiliary_loss_mlp": 0.01271146, + "balance_loss_clip": 0.06283416, + "balance_loss_mlp": 0.01258349, + "epoch": 0.4461145347963325, + "flos": 17609957159040.0, + "grad_norm": 3.5469346323262068, + "language_loss": 0.73053217, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.80770659, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12805176, + "step": 7420, + "time_per_iteration": 2.4839932918548584 + }, + { + "auxiliary_loss_clip": 0.06441851, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06283888, + "balance_loss_mlp": 0.01255176, + "epoch": 0.44617465804900047, + "flos": 27274982209920.0, + "grad_norm": 1.4177043979342248, + "language_loss": 0.75314558, + "learning_rate": 2.442058014084156e-06, + "loss": 0.83024418, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12823486, + "step": 7421, + "time_per_iteration": 2.6001040935516357 + }, + { + "auxiliary_loss_clip": 0.06439819, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06281345, + "balance_loss_mlp": 0.01254073, + "epoch": 0.44623478130166844, + "flos": 17792371497600.0, + "grad_norm": 1.9155365450665858, + "language_loss": 0.75864565, + "learning_rate": 2.44167817648821e-06, + "loss": 0.83570993, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12536621, + "step": 7422, + "time_per_iteration": 2.481241226196289 + }, + { + "auxiliary_loss_clip": 0.06447253, + "auxiliary_loss_mlp": 0.01267362, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01254804, + "epoch": 0.4462949045543364, + "flos": 23009698298880.0, + "grad_norm": 1.7347835392128452, + "language_loss": 0.65679651, + "learning_rate": 2.441298322143784e-06, + "loss": 0.73394263, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.12573242, + "step": 7423, + "time_per_iteration": 2.539268732070923 + }, + { + "auxiliary_loss_clip": 0.06440745, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06283564, + "balance_loss_mlp": 0.01256591, + "epoch": 0.44635502780700437, + "flos": 17825592441600.0, + "grad_norm": 1.4381231336851048, + "language_loss": 0.79473054, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.87182289, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.11901855, + "step": 7424, + "time_per_iteration": 2.488111972808838 + }, + { + "auxiliary_loss_clip": 0.06437074, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01256148, + "epoch": 0.44641515105967233, + "flos": 26695314864000.0, + "grad_norm": 1.3471148592694158, + "language_loss": 0.8055563, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.88260639, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.11791992, + "step": 7425, + "time_per_iteration": 2.598731756210327 + }, + { + "auxiliary_loss_clip": 0.06439465, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06279327, + "balance_loss_mlp": 0.01259536, + "epoch": 0.4464752743123403, + "flos": 18918778734720.0, + "grad_norm": 1.4143607287110962, + "language_loss": 0.77488291, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.85199511, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12207031, + "step": 7426, + "time_per_iteration": 2.494330406188965 + }, + { + "auxiliary_loss_clip": 0.06445856, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06281333, + "balance_loss_mlp": 0.01253773, + "epoch": 0.44653539756500826, + "flos": 29578081933440.0, + "grad_norm": 1.9924998088803147, + "language_loss": 0.64776599, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.72489762, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13513184, + "step": 7427, + "time_per_iteration": 2.611482858657837 + }, + { + "auxiliary_loss_clip": 0.06441574, + "auxiliary_loss_mlp": 0.01275968, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.0126372, + "epoch": 0.44659552081767623, + "flos": 21475137024000.0, + "grad_norm": 1.5780428941103348, + "language_loss": 0.75530696, + "learning_rate": 2.439398799698608e-06, + "loss": 0.8324824, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12268066, + "step": 7428, + "time_per_iteration": 2.505094051361084 + }, + { + "auxiliary_loss_clip": 0.06441561, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.0125843, + "epoch": 0.4466556440703442, + "flos": 17937791458560.0, + "grad_norm": 1.912744298925221, + "language_loss": 0.78478271, + "learning_rate": 2.439018845165806e-06, + "loss": 0.86190987, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12731934, + "step": 7429, + "time_per_iteration": 2.5107972621917725 + }, + { + "auxiliary_loss_clip": 0.06447433, + "auxiliary_loss_mlp": 0.0127403, + "balance_loss_clip": 0.06283738, + "balance_loss_mlp": 0.01260667, + "epoch": 0.44671576732301216, + "flos": 21114081780480.0, + "grad_norm": 1.7694096542013318, + "language_loss": 0.91354167, + "learning_rate": 2.438638873985366e-06, + "loss": 0.99075633, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13366699, + "step": 7430, + "time_per_iteration": 2.537428140640259 + }, + { + "auxiliary_loss_clip": 0.06451312, + "auxiliary_loss_mlp": 0.01271269, + "balance_loss_clip": 0.06282946, + "balance_loss_mlp": 0.01257792, + "epoch": 0.4467758905756801, + "flos": 23514873765120.0, + "grad_norm": 1.610238873942938, + "language_loss": 0.80143106, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.87865686, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1348877, + "step": 7431, + "time_per_iteration": 2.5611300468444824 + }, + { + "auxiliary_loss_clip": 0.06447126, + "auxiliary_loss_mlp": 0.01271916, + "balance_loss_clip": 0.06282945, + "balance_loss_mlp": 0.01258374, + "epoch": 0.4468360138283481, + "flos": 18739970121600.0, + "grad_norm": 1.9551980798487134, + "language_loss": 0.80273902, + "learning_rate": 2.437878881739204e-06, + "loss": 0.87992942, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13543701, + "step": 7432, + "time_per_iteration": 2.500554084777832 + }, + { + "auxiliary_loss_clip": 0.06450094, + "auxiliary_loss_mlp": 0.01273992, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.0126073, + "epoch": 0.4468961370810161, + "flos": 23483874954240.0, + "grad_norm": 1.835454334349629, + "language_loss": 0.76644909, + "learning_rate": 2.437498860702301e-06, + "loss": 0.84368992, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13269043, + "step": 7433, + "time_per_iteration": 2.5840916633605957 + }, + { + "auxiliary_loss_clip": 0.06435596, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06279343, + "balance_loss_mlp": 0.01260047, + "epoch": 0.4469562603336841, + "flos": 30081873807360.0, + "grad_norm": 1.6012992804544768, + "language_loss": 0.77581275, + "learning_rate": 2.437118823075398e-06, + "loss": 0.85288417, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1151123, + "step": 7434, + "time_per_iteration": 2.579667329788208 + }, + { + "auxiliary_loss_clip": 0.06443198, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278063, + "balance_loss_mlp": 0.01257439, + "epoch": 0.44701638358635204, + "flos": 22463126115840.0, + "grad_norm": 1.683412458990524, + "language_loss": 0.63887638, + "learning_rate": 2.436738768872905e-06, + "loss": 0.71601021, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.12750244, + "step": 7435, + "time_per_iteration": 2.5773611068725586 + }, + { + "auxiliary_loss_clip": 0.06444404, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01258714, + "epoch": 0.44707650683902, + "flos": 24064171205760.0, + "grad_norm": 1.5617494879233198, + "language_loss": 0.83911443, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.91628319, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13763428, + "step": 7436, + "time_per_iteration": 2.5204451084136963 + }, + { + "auxiliary_loss_clip": 0.0644998, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.0628316, + "balance_loss_mlp": 0.01254226, + "epoch": 0.44713663009168797, + "flos": 23773373210880.0, + "grad_norm": 1.7812959316100008, + "language_loss": 0.79632622, + "learning_rate": 2.435978610798798e-06, + "loss": 0.87351644, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.14819336, + "step": 7437, + "time_per_iteration": 2.564180374145508 + }, + { + "auxiliary_loss_clip": 0.0644551, + "auxiliary_loss_mlp": 0.01269936, + "balance_loss_clip": 0.06279416, + "balance_loss_mlp": 0.01256829, + "epoch": 0.44719675334435594, + "flos": 24506258947200.0, + "grad_norm": 1.814975751419929, + "language_loss": 0.72632974, + "learning_rate": 2.435598506956009e-06, + "loss": 0.8034842, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13116455, + "step": 7438, + "time_per_iteration": 2.601855993270874 + }, + { + "auxiliary_loss_clip": 0.06445266, + "auxiliary_loss_mlp": 0.01270946, + "balance_loss_clip": 0.06279082, + "balance_loss_mlp": 0.01257046, + "epoch": 0.4472568765970239, + "flos": 29788308627840.0, + "grad_norm": 3.3026679320519716, + "language_loss": 0.67660618, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.75376832, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13903809, + "step": 7439, + "time_per_iteration": 2.6503498554229736 + }, + { + "auxiliary_loss_clip": 0.06447087, + "auxiliary_loss_mlp": 0.01272251, + "balance_loss_clip": 0.06280239, + "balance_loss_mlp": 0.01257648, + "epoch": 0.44731699984969187, + "flos": 24649792191360.0, + "grad_norm": 1.6003212894552636, + "language_loss": 0.73896551, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.81615895, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14605713, + "step": 7440, + "time_per_iteration": 4.026291608810425 + }, + { + "auxiliary_loss_clip": 0.06441355, + "auxiliary_loss_mlp": 0.01270172, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.0125722, + "epoch": 0.44737712310235983, + "flos": 29462570680320.0, + "grad_norm": 1.5530123963175664, + "language_loss": 0.74356592, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.82068115, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12963867, + "step": 7441, + "time_per_iteration": 2.5968191623687744 + }, + { + "auxiliary_loss_clip": 0.06443278, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01254983, + "epoch": 0.4474372463550278, + "flos": 24903260392320.0, + "grad_norm": 2.4580446492601014, + "language_loss": 0.75523049, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.83234674, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13378906, + "step": 7442, + "time_per_iteration": 2.6050899028778076 + }, + { + "auxiliary_loss_clip": 0.0645077, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06281515, + "balance_loss_mlp": 0.01262644, + "epoch": 0.44749736960769576, + "flos": 33189835524480.0, + "grad_norm": 1.8304580376547321, + "language_loss": 0.74504036, + "learning_rate": 2.433697740261273e-06, + "loss": 0.82231408, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13952637, + "step": 7443, + "time_per_iteration": 2.590211868286133 + }, + { + "auxiliary_loss_clip": 0.06441949, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06278961, + "balance_loss_mlp": 0.01256605, + "epoch": 0.4475574928603637, + "flos": 21078596776320.0, + "grad_norm": 1.7164366382085705, + "language_loss": 0.78287792, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.86000234, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13891602, + "step": 7444, + "time_per_iteration": 2.554215669631958 + }, + { + "auxiliary_loss_clip": 0.06437638, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06276217, + "balance_loss_mlp": 0.01263664, + "epoch": 0.4476176161130317, + "flos": 21867442640640.0, + "grad_norm": 2.3488437532538735, + "language_loss": 0.85014707, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.9272933, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13317871, + "step": 7445, + "time_per_iteration": 2.463123321533203 + }, + { + "auxiliary_loss_clip": 0.06446601, + "auxiliary_loss_mlp": 0.0127394, + "balance_loss_clip": 0.06279677, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4476777393656997, + "flos": 22535270081280.0, + "grad_norm": 2.2137135091267135, + "language_loss": 0.64567178, + "learning_rate": 2.432557082778765e-06, + "loss": 0.72287714, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15631104, + "step": 7446, + "time_per_iteration": 3.9910571575164795 + }, + { + "auxiliary_loss_clip": 0.06349403, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4477378626183677, + "flos": 49034236101120.0, + "grad_norm": 0.7348354325841562, + "language_loss": 0.49922079, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.57527786, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.0231781, + "step": 7447, + "time_per_iteration": 3.0209667682647705 + }, + { + "auxiliary_loss_clip": 0.06344398, + "auxiliary_loss_mlp": 0.01262514, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01260019, + "epoch": 0.44779798587103564, + "flos": 56562041784960.0, + "grad_norm": 0.8026230684928909, + "language_loss": 0.59334445, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.66941357, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.02493286, + "step": 7448, + "time_per_iteration": 3.2380871772766113 + }, + { + "auxiliary_loss_clip": 0.06443155, + "auxiliary_loss_mlp": 0.01270524, + "balance_loss_clip": 0.06277426, + "balance_loss_mlp": 0.01256994, + "epoch": 0.4478581091237036, + "flos": 46508933278080.0, + "grad_norm": 1.7384627548967189, + "language_loss": 0.59131092, + "learning_rate": 2.431416277672789e-06, + "loss": 0.66844773, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13537598, + "step": 7449, + "time_per_iteration": 2.7783467769622803 + }, + { + "auxiliary_loss_clip": 0.06440828, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06277853, + "balance_loss_mlp": 0.01258868, + "epoch": 0.4479182323763716, + "flos": 20820768163200.0, + "grad_norm": 1.956040680672474, + "language_loss": 0.81008971, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.88721895, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13220215, + "step": 7450, + "time_per_iteration": 2.488323450088501 + }, + { + "auxiliary_loss_clip": 0.06442301, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06277788, + "balance_loss_mlp": 0.01259172, + "epoch": 0.44797835562903954, + "flos": 14251126717440.0, + "grad_norm": 2.5451576111358136, + "language_loss": 0.79348361, + "learning_rate": 2.430655659114697e-06, + "loss": 0.87063718, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13885498, + "step": 7451, + "time_per_iteration": 2.4923946857452393 + }, + { + "auxiliary_loss_clip": 0.06344576, + "auxiliary_loss_mlp": 0.0125349, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.0125126, + "epoch": 0.4480384788817075, + "flos": 63553436357760.0, + "grad_norm": 0.7850742570611701, + "language_loss": 0.62791413, + "learning_rate": 2.430275325332681e-06, + "loss": 0.70389479, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02233887, + "step": 7452, + "time_per_iteration": 3.2259254455566406 + }, + { + "auxiliary_loss_clip": 0.06441975, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06277539, + "balance_loss_mlp": 0.01258874, + "epoch": 0.44809860213437547, + "flos": 21659018808960.0, + "grad_norm": 1.8053672901244522, + "language_loss": 0.62585479, + "learning_rate": 2.429894975234582e-06, + "loss": 0.70299876, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13537598, + "step": 7453, + "time_per_iteration": 3.928234577178955 + }, + { + "auxiliary_loss_clip": 0.06345223, + "auxiliary_loss_mlp": 0.01256622, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254279, + "epoch": 0.44815872538704343, + "flos": 69210586840320.0, + "grad_norm": 0.747363028090033, + "language_loss": 0.5699693, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.64598775, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02339172, + "step": 7454, + "time_per_iteration": 3.0569918155670166 + }, + { + "auxiliary_loss_clip": 0.06447325, + "auxiliary_loss_mlp": 0.01268938, + "balance_loss_clip": 0.06281178, + "balance_loss_mlp": 0.01255705, + "epoch": 0.4482188486397114, + "flos": 12602186219520.0, + "grad_norm": 1.9501180256269237, + "language_loss": 0.75448847, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.83165109, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13220215, + "step": 7455, + "time_per_iteration": 2.4410433769226074 + }, + { + "auxiliary_loss_clip": 0.06442874, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06278916, + "balance_loss_mlp": 0.01254932, + "epoch": 0.44827897189237936, + "flos": 34066715702400.0, + "grad_norm": 1.6532992970231903, + "language_loss": 0.76341856, + "learning_rate": 2.428753827188016e-06, + "loss": 0.84053606, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1394043, + "step": 7456, + "time_per_iteration": 2.6695046424865723 + }, + { + "auxiliary_loss_clip": 0.06443818, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06283055, + "balance_loss_mlp": 0.01261087, + "epoch": 0.44833909514504733, + "flos": 25153080940800.0, + "grad_norm": 1.8332154029673087, + "language_loss": 0.7703625, + "learning_rate": 2.428373411969818e-06, + "loss": 0.84754294, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13122559, + "step": 7457, + "time_per_iteration": 2.4982032775878906 + }, + { + "auxiliary_loss_clip": 0.06449621, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06282188, + "balance_loss_mlp": 0.01253269, + "epoch": 0.4483992183977153, + "flos": 16185498549120.0, + "grad_norm": 2.4281328609676254, + "language_loss": 0.68744391, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.7646122, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1394043, + "step": 7458, + "time_per_iteration": 2.4979610443115234 + }, + { + "auxiliary_loss_clip": 0.06448827, + "auxiliary_loss_mlp": 0.01274875, + "balance_loss_clip": 0.06280437, + "balance_loss_mlp": 0.01259592, + "epoch": 0.44845934165038326, + "flos": 17751352124160.0, + "grad_norm": 1.539492966179865, + "language_loss": 0.71756333, + "learning_rate": 2.427612532815961e-06, + "loss": 0.79480034, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.15283203, + "step": 7459, + "time_per_iteration": 2.482675075531006 + }, + { + "auxiliary_loss_clip": 0.06445904, + "auxiliary_loss_mlp": 0.01268873, + "balance_loss_clip": 0.06281781, + "balance_loss_mlp": 0.01255343, + "epoch": 0.4485194649030513, + "flos": 21842481323520.0, + "grad_norm": 1.7620296739852843, + "language_loss": 0.69945031, + "learning_rate": 2.427232068909154e-06, + "loss": 0.7765981, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13525391, + "step": 7460, + "time_per_iteration": 2.548891067504883 + }, + { + "auxiliary_loss_clip": 0.06446661, + "auxiliary_loss_mlp": 0.01267799, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01253744, + "epoch": 0.44857958815571924, + "flos": 20090775392640.0, + "grad_norm": 2.1567039258492637, + "language_loss": 0.77558124, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.85272586, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14068604, + "step": 7461, + "time_per_iteration": 2.488675832748413 + }, + { + "auxiliary_loss_clip": 0.0644468, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.0627977, + "balance_loss_mlp": 0.01252514, + "epoch": 0.4486397114083872, + "flos": 27060982081920.0, + "grad_norm": 1.6449935173844783, + "language_loss": 0.68081152, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.75792718, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14367676, + "step": 7462, + "time_per_iteration": 2.5873477458953857 + }, + { + "auxiliary_loss_clip": 0.06346884, + "auxiliary_loss_mlp": 0.01259781, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01257521, + "epoch": 0.4486998346610552, + "flos": 67339386587520.0, + "grad_norm": 0.7371865357722727, + "language_loss": 0.54459572, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.62066233, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.0226593, + "step": 7463, + "time_per_iteration": 3.135831594467163 + }, + { + "auxiliary_loss_clip": 0.06446455, + "auxiliary_loss_mlp": 0.01271071, + "balance_loss_clip": 0.06283797, + "balance_loss_mlp": 0.01257344, + "epoch": 0.44875995791372314, + "flos": 27644297080320.0, + "grad_norm": 1.768714620285087, + "language_loss": 0.76698768, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.844163, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13726807, + "step": 7464, + "time_per_iteration": 2.5624353885650635 + }, + { + "auxiliary_loss_clip": 0.06442145, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06281784, + "balance_loss_mlp": 0.01257063, + "epoch": 0.4488200811663911, + "flos": 13010969162880.0, + "grad_norm": 1.8955897931068166, + "language_loss": 0.74468267, + "learning_rate": 2.425329506653441e-06, + "loss": 0.82180536, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.13043213, + "step": 7465, + "time_per_iteration": 2.4702823162078857 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272918, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01257391, + "epoch": 0.44888020441905907, + "flos": 27497283891840.0, + "grad_norm": 2.0464026275546314, + "language_loss": 0.80248308, + "learning_rate": 2.424948945758966e-06, + "loss": 0.87977397, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1552124, + "step": 7466, + "time_per_iteration": 2.542721748352051 + }, + { + "auxiliary_loss_clip": 0.06448439, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01255735, + "epoch": 0.44894032767172704, + "flos": 18265541904000.0, + "grad_norm": 2.2890338528416416, + "language_loss": 0.80875736, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.88593197, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13293457, + "step": 7467, + "time_per_iteration": 2.4503378868103027 + }, + { + "auxiliary_loss_clip": 0.06442044, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.06284908, + "balance_loss_mlp": 0.01256465, + "epoch": 0.449000450924395, + "flos": 21586245937920.0, + "grad_norm": 2.2421166338055762, + "language_loss": 0.75738609, + "learning_rate": 2.424187775642129e-06, + "loss": 0.83449709, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12597656, + "step": 7468, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.06448267, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06286301, + "balance_loss_mlp": 0.01257993, + "epoch": 0.44906057417706297, + "flos": 17973737660160.0, + "grad_norm": 2.1198815882874626, + "language_loss": 0.71292973, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.79011655, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.12414551, + "step": 7469, + "time_per_iteration": 2.4725160598754883 + }, + { + "auxiliary_loss_clip": 0.06450349, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.0125427, + "epoch": 0.44912069742973093, + "flos": 20053487525760.0, + "grad_norm": 1.6969020049584582, + "language_loss": 0.7254343, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.80261958, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13903809, + "step": 7470, + "time_per_iteration": 2.5212604999542236 + }, + { + "auxiliary_loss_clip": 0.06447989, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.0628368, + "balance_loss_mlp": 0.01255951, + "epoch": 0.4491808206823989, + "flos": 21040009171200.0, + "grad_norm": 2.607168963621531, + "language_loss": 0.77266711, + "learning_rate": 2.423045899863634e-06, + "loss": 0.84984034, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13397217, + "step": 7471, + "time_per_iteration": 2.4833462238311768 + }, + { + "auxiliary_loss_clip": 0.0644739, + "auxiliary_loss_mlp": 0.01274961, + "balance_loss_clip": 0.06286953, + "balance_loss_mlp": 0.01261579, + "epoch": 0.44924094393506686, + "flos": 22973919805440.0, + "grad_norm": 1.613716342828386, + "language_loss": 0.69996417, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.77718765, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1338501, + "step": 7472, + "time_per_iteration": 2.5575385093688965 + }, + { + "auxiliary_loss_clip": 0.06348881, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.0627597, + "balance_loss_mlp": 0.01260363, + "epoch": 0.4493010671877349, + "flos": 59252332026240.0, + "grad_norm": 0.7278471165666979, + "language_loss": 0.61657208, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.69269097, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.02650452, + "step": 7473, + "time_per_iteration": 3.1560816764831543 + }, + { + "auxiliary_loss_clip": 0.06448925, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.0628556, + "balance_loss_mlp": 0.01256417, + "epoch": 0.44936119044040285, + "flos": 18010815891840.0, + "grad_norm": 2.7240719920550873, + "language_loss": 0.77420998, + "learning_rate": 2.421903879707657e-06, + "loss": 0.85140175, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13830566, + "step": 7474, + "time_per_iteration": 2.4717578887939453 + }, + { + "auxiliary_loss_clip": 0.06442197, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.0126264, + "epoch": 0.4494213136930708, + "flos": 21258243930240.0, + "grad_norm": 2.650117553560035, + "language_loss": 0.72072601, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.79790819, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7475, + "time_per_iteration": 2.513819456100464 + }, + { + "auxiliary_loss_clip": 0.06442311, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256954, + "epoch": 0.4494814369457388, + "flos": 27426271956480.0, + "grad_norm": 1.759412456892788, + "language_loss": 0.77338856, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.8505106, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1295166, + "step": 7476, + "time_per_iteration": 2.5318853855133057 + }, + { + "auxiliary_loss_clip": 0.06449737, + "auxiliary_loss_mlp": 0.01271172, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01256754, + "epoch": 0.44954156019840674, + "flos": 22860211415040.0, + "grad_norm": 1.712065897066968, + "language_loss": 0.71606135, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.79327047, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.144104, + "step": 7477, + "time_per_iteration": 2.532437324523926 + }, + { + "auxiliary_loss_clip": 0.06452323, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.01257457, + "epoch": 0.4496016834510747, + "flos": 17207253636480.0, + "grad_norm": 8.505711381360525, + "language_loss": 0.68249893, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.75973988, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14331055, + "step": 7478, + "time_per_iteration": 2.4901106357574463 + }, + { + "auxiliary_loss_clip": 0.06438291, + "auxiliary_loss_mlp": 0.01274211, + "balance_loss_clip": 0.06278055, + "balance_loss_mlp": 0.01261676, + "epoch": 0.4496618067037427, + "flos": 18922636022400.0, + "grad_norm": 1.7939017561082606, + "language_loss": 0.89897281, + "learning_rate": 2.420000193000779e-06, + "loss": 0.97609776, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12518311, + "step": 7479, + "time_per_iteration": 3.9324028491973877 + }, + { + "auxiliary_loss_clip": 0.06445809, + "auxiliary_loss_mlp": 0.01275156, + "balance_loss_clip": 0.06282537, + "balance_loss_mlp": 0.01261304, + "epoch": 0.44972192995641064, + "flos": 21037828965120.0, + "grad_norm": 1.5817445570827902, + "language_loss": 0.75620329, + "learning_rate": 2.419619407822302e-06, + "loss": 0.833413, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13861084, + "step": 7480, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.06450936, + "auxiliary_loss_mlp": 0.01270868, + "balance_loss_clip": 0.06283928, + "balance_loss_mlp": 0.01257033, + "epoch": 0.4497820532090786, + "flos": 20783354515200.0, + "grad_norm": 2.4818923045987233, + "language_loss": 0.79794782, + "learning_rate": 2.419238606731815e-06, + "loss": 0.87516582, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.1385498, + "step": 7481, + "time_per_iteration": 2.511104106903076 + }, + { + "auxiliary_loss_clip": 0.06439544, + "auxiliary_loss_mlp": 0.01274879, + "balance_loss_clip": 0.06280965, + "balance_loss_mlp": 0.01261003, + "epoch": 0.44984217646174657, + "flos": 33811067295360.0, + "grad_norm": 1.5325857273153378, + "language_loss": 0.68501163, + "learning_rate": 2.418857789743758e-06, + "loss": 0.76215583, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13873291, + "step": 7482, + "time_per_iteration": 2.6323177814483643 + }, + { + "auxiliary_loss_clip": 0.06449723, + "auxiliary_loss_mlp": 0.01275016, + "balance_loss_clip": 0.06284413, + "balance_loss_mlp": 0.01261236, + "epoch": 0.44990229971441453, + "flos": 15522953915520.0, + "grad_norm": 2.4692742165129347, + "language_loss": 0.85184467, + "learning_rate": 2.418476956872571e-06, + "loss": 0.92909217, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13775635, + "step": 7483, + "time_per_iteration": 2.5510005950927734 + }, + { + "auxiliary_loss_clip": 0.0644832, + "auxiliary_loss_mlp": 0.01272458, + "balance_loss_clip": 0.06278956, + "balance_loss_mlp": 0.01259017, + "epoch": 0.4499624229670825, + "flos": 29869676542080.0, + "grad_norm": 2.2555510336477362, + "language_loss": 0.81026614, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.88747394, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13439941, + "step": 7484, + "time_per_iteration": 2.5549514293670654 + }, + { + "auxiliary_loss_clip": 0.06454043, + "auxiliary_loss_mlp": 0.01271307, + "balance_loss_clip": 0.06282799, + "balance_loss_mlp": 0.01257133, + "epoch": 0.45002254621975046, + "flos": 18519345521280.0, + "grad_norm": 3.0066277785462296, + "language_loss": 0.75523663, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.83249015, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14172363, + "step": 7485, + "time_per_iteration": 2.5260515213012695 + }, + { + "auxiliary_loss_clip": 0.06353837, + "auxiliary_loss_mlp": 0.01254878, + "balance_loss_clip": 0.06280266, + "balance_loss_mlp": 0.01252054, + "epoch": 0.4500826694724185, + "flos": 70438753261440.0, + "grad_norm": 0.7710237062022668, + "language_loss": 0.58055162, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.65663874, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02819824, + "step": 7486, + "time_per_iteration": 4.631975173950195 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.0125523, + "epoch": 0.45014279272508645, + "flos": 15784388254080.0, + "grad_norm": 2.313810641491004, + "language_loss": 0.83291382, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.91005504, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13336182, + "step": 7487, + "time_per_iteration": 2.4474549293518066 + }, + { + "auxiliary_loss_clip": 0.06440553, + "auxiliary_loss_mlp": 0.01274868, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01260879, + "epoch": 0.4502029159777544, + "flos": 21806157778560.0, + "grad_norm": 1.8256144522955593, + "language_loss": 0.77817398, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.8553282, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13983154, + "step": 7488, + "time_per_iteration": 2.5497655868530273 + }, + { + "auxiliary_loss_clip": 0.0645895, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06284817, + "balance_loss_mlp": 0.01257773, + "epoch": 0.4502630392304224, + "flos": 28775651708160.0, + "grad_norm": 2.1057521417086194, + "language_loss": 0.72464138, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.80196273, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15405273, + "step": 7489, + "time_per_iteration": 2.536022186279297 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.01273963, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.012597, + "epoch": 0.45032316248309034, + "flos": 15848398373760.0, + "grad_norm": 2.178444480440472, + "language_loss": 0.70506239, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.78229928, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14263916, + "step": 7490, + "time_per_iteration": 2.5048370361328125 + }, + { + "auxiliary_loss_clip": 0.06351414, + "auxiliary_loss_mlp": 0.01254304, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01251552, + "epoch": 0.4503832857357583, + "flos": 57873337056000.0, + "grad_norm": 0.766905441156629, + "language_loss": 0.56608462, + "learning_rate": 2.415429723843495e-06, + "loss": 0.64214182, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02757263, + "step": 7491, + "time_per_iteration": 3.1021111011505127 + }, + { + "auxiliary_loss_clip": 0.06440033, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01257217, + "epoch": 0.4504434089884263, + "flos": 23884817541120.0, + "grad_norm": 1.940533812141729, + "language_loss": 0.79471588, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.87182283, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13446045, + "step": 7492, + "time_per_iteration": 3.906813144683838 + }, + { + "auxiliary_loss_clip": 0.06454505, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06282404, + "balance_loss_mlp": 0.01257925, + "epoch": 0.45050353224109424, + "flos": 17790820197120.0, + "grad_norm": 2.4926790281130566, + "language_loss": 0.92799652, + "learning_rate": 2.4146677577659573e-06, + "loss": 1.00526834, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14758301, + "step": 7493, + "time_per_iteration": 2.516523838043213 + }, + { + "auxiliary_loss_clip": 0.06351101, + "auxiliary_loss_mlp": 0.01253906, + "balance_loss_clip": 0.06277501, + "balance_loss_mlp": 0.01251232, + "epoch": 0.4505636554937622, + "flos": 65081960138880.0, + "grad_norm": 0.7917943169613642, + "language_loss": 0.62850708, + "learning_rate": 2.4142867511336e-06, + "loss": 0.70455718, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02676392, + "step": 7494, + "time_per_iteration": 3.200533866882324 + }, + { + "auxiliary_loss_clip": 0.06439039, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275568, + "balance_loss_mlp": 0.01255305, + "epoch": 0.45062377874643017, + "flos": 22206597240960.0, + "grad_norm": 1.3576432808579277, + "language_loss": 0.8187722, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.89584428, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12860107, + "step": 7495, + "time_per_iteration": 2.6740329265594482 + }, + { + "auxiliary_loss_clip": 0.06444755, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01253344, + "epoch": 0.45068390199909814, + "flos": 37679433615360.0, + "grad_norm": 3.4533684270887988, + "language_loss": 0.85559022, + "learning_rate": 2.41352469075395e-06, + "loss": 0.93270886, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13775635, + "step": 7496, + "time_per_iteration": 2.6514453887939453 + }, + { + "auxiliary_loss_clip": 0.06445448, + "auxiliary_loss_mlp": 0.01271465, + "balance_loss_clip": 0.06277982, + "balance_loss_mlp": 0.01258042, + "epoch": 0.4507440252517661, + "flos": 22307853300480.0, + "grad_norm": 2.147795774994512, + "language_loss": 0.76396865, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.84113777, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13427734, + "step": 7497, + "time_per_iteration": 2.5248610973358154 + }, + { + "auxiliary_loss_clip": 0.0644587, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01254189, + "epoch": 0.45080414850443407, + "flos": 13193425428480.0, + "grad_norm": 1.9297018893586142, + "language_loss": 0.75253481, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.82967794, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14245605, + "step": 7498, + "time_per_iteration": 2.482625722885132 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.06277958, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4508642717571021, + "flos": 21951451958400.0, + "grad_norm": 1.9463705761270829, + "language_loss": 0.70564914, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.78282535, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14093018, + "step": 7499, + "time_per_iteration": 2.5338642597198486 + }, + { + "auxiliary_loss_clip": 0.06449613, + "auxiliary_loss_mlp": 0.01268145, + "balance_loss_clip": 0.06278396, + "balance_loss_mlp": 0.0125412, + "epoch": 0.45092439500977005, + "flos": 23374149632640.0, + "grad_norm": 2.119825325087625, + "language_loss": 0.77484369, + "learning_rate": 2.412000381939477e-06, + "loss": 0.85202128, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14025879, + "step": 7500, + "time_per_iteration": 2.5290849208831787 + }, + { + "auxiliary_loss_clip": 0.06441833, + "auxiliary_loss_mlp": 0.01275038, + "balance_loss_clip": 0.06276967, + "balance_loss_mlp": 0.01262211, + "epoch": 0.450984518262438, + "flos": 20778532905600.0, + "grad_norm": 2.0513851791377014, + "language_loss": 0.62714708, + "learning_rate": 2.411619265641992e-06, + "loss": 0.70431578, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.12823486, + "step": 7501, + "time_per_iteration": 2.513014316558838 + }, + { + "auxiliary_loss_clip": 0.06447023, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.0627754, + "balance_loss_mlp": 0.01255251, + "epoch": 0.451044641515106, + "flos": 17712303321600.0, + "grad_norm": 1.7676077358786102, + "language_loss": 0.8475225, + "learning_rate": 2.411238133735863e-06, + "loss": 0.92468631, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7502, + "time_per_iteration": 2.502213954925537 + }, + { + "auxiliary_loss_clip": 0.06440664, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01256967, + "epoch": 0.45110476476777395, + "flos": 20600940176640.0, + "grad_norm": 1.2963550821027272, + "language_loss": 0.79440266, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8715173, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13824463, + "step": 7503, + "time_per_iteration": 2.539870023727417 + }, + { + "auxiliary_loss_clip": 0.0643944, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06278714, + "balance_loss_mlp": 0.01257213, + "epoch": 0.4511648880204419, + "flos": 16039533536640.0, + "grad_norm": 2.8864102182872746, + "language_loss": 0.80966014, + "learning_rate": 2.410475823155484e-06, + "loss": 0.88676035, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13354492, + "step": 7504, + "time_per_iteration": 2.4834609031677246 + }, + { + "auxiliary_loss_clip": 0.06439783, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06277721, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4512250112731099, + "flos": 23984103029760.0, + "grad_norm": 1.8935476867238503, + "language_loss": 0.63783783, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.71491182, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1350708, + "step": 7505, + "time_per_iteration": 2.5183863639831543 + }, + { + "auxiliary_loss_clip": 0.06338686, + "auxiliary_loss_mlp": 0.0125649, + "balance_loss_clip": 0.06265638, + "balance_loss_mlp": 0.01253881, + "epoch": 0.45128513452577784, + "flos": 71484239053440.0, + "grad_norm": 0.8179087732062593, + "language_loss": 0.58726048, + "learning_rate": 2.409713450313968e-06, + "loss": 0.66321218, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02610779, + "step": 7506, + "time_per_iteration": 3.2057392597198486 + }, + { + "auxiliary_loss_clip": 0.06438521, + "auxiliary_loss_mlp": 0.01269482, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01255987, + "epoch": 0.4513452577784458, + "flos": 22097375043840.0, + "grad_norm": 1.6199933066680872, + "language_loss": 0.79207951, + "learning_rate": 2.40933224058142e-06, + "loss": 0.86915958, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1348877, + "step": 7507, + "time_per_iteration": 2.485177993774414 + }, + { + "auxiliary_loss_clip": 0.0644455, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06277668, + "balance_loss_mlp": 0.01256543, + "epoch": 0.4514053810311138, + "flos": 24282699454080.0, + "grad_norm": 1.6041025363642085, + "language_loss": 0.74460357, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.82175899, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14440918, + "step": 7508, + "time_per_iteration": 2.5957343578338623 + }, + { + "auxiliary_loss_clip": 0.06439587, + "auxiliary_loss_mlp": 0.01271402, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01258552, + "epoch": 0.45146550428378174, + "flos": 17891237715840.0, + "grad_norm": 2.0541508842975946, + "language_loss": 0.79828942, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.87539923, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12841797, + "step": 7509, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01270525, + "balance_loss_clip": 0.06278946, + "balance_loss_mlp": 0.01257746, + "epoch": 0.4515256275364497, + "flos": 24250317050880.0, + "grad_norm": 1.7065874480024321, + "language_loss": 0.73257631, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.80969501, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12774658, + "step": 7510, + "time_per_iteration": 2.5448224544525146 + }, + { + "auxiliary_loss_clip": 0.06438527, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01255707, + "epoch": 0.45158575078911767, + "flos": 20637263721600.0, + "grad_norm": 1.688618785836195, + "language_loss": 0.77059448, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.8476727, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13598633, + "step": 7511, + "time_per_iteration": 2.48913311958313 + }, + { + "auxiliary_loss_clip": 0.06443627, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06277004, + "balance_loss_mlp": 0.0125543, + "epoch": 0.45164587404178563, + "flos": 23333884945920.0, + "grad_norm": 1.5549799825793658, + "language_loss": 0.79259372, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.86973357, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14929199, + "step": 7512, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06447546, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01252219, + "epoch": 0.45170599729445365, + "flos": 23812841283840.0, + "grad_norm": 2.088368619040166, + "language_loss": 0.87660837, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.95375133, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14538574, + "step": 7513, + "time_per_iteration": 2.50119686126709 + }, + { + "auxiliary_loss_clip": 0.06437154, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01259963, + "epoch": 0.4517661205471216, + "flos": 23519569593600.0, + "grad_norm": 1.9321046654640033, + "language_loss": 0.67692971, + "learning_rate": 2.406663338649419e-06, + "loss": 0.75402474, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1237793, + "step": 7514, + "time_per_iteration": 2.548349618911743 + }, + { + "auxiliary_loss_clip": 0.0644633, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01258017, + "epoch": 0.4518262437997896, + "flos": 23520743550720.0, + "grad_norm": 2.108913826152056, + "language_loss": 0.69738746, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7745769, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14587402, + "step": 7515, + "time_per_iteration": 2.5203166007995605 + }, + { + "auxiliary_loss_clip": 0.06448089, + "auxiliary_loss_mlp": 0.01273292, + "balance_loss_clip": 0.06278358, + "balance_loss_mlp": 0.01258379, + "epoch": 0.45188636705245755, + "flos": 14572210763520.0, + "grad_norm": 2.327142049261069, + "language_loss": 0.81245089, + "learning_rate": 2.405900656236963e-06, + "loss": 0.88966471, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14916992, + "step": 7516, + "time_per_iteration": 2.5070860385894775 + }, + { + "auxiliary_loss_clip": 0.06440821, + "auxiliary_loss_mlp": 0.01272469, + "balance_loss_clip": 0.0627999, + "balance_loss_mlp": 0.01259899, + "epoch": 0.4519464903051255, + "flos": 19907690221440.0, + "grad_norm": 1.8586788547852597, + "language_loss": 0.65825433, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.73538721, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12573242, + "step": 7517, + "time_per_iteration": 2.4824438095092773 + }, + { + "auxiliary_loss_clip": 0.06439231, + "auxiliary_loss_mlp": 0.01270445, + "balance_loss_clip": 0.06279515, + "balance_loss_mlp": 0.01257923, + "epoch": 0.4520066135577935, + "flos": 18850492056960.0, + "grad_norm": 1.7463164288041955, + "language_loss": 0.63218093, + "learning_rate": 2.405137912257333e-06, + "loss": 0.70927775, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12524414, + "step": 7518, + "time_per_iteration": 2.5339365005493164 + }, + { + "auxiliary_loss_clip": 0.0644324, + "auxiliary_loss_mlp": 0.01270416, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.0125713, + "epoch": 0.45206673681046144, + "flos": 48225279985920.0, + "grad_norm": 1.4167266474258036, + "language_loss": 0.59749353, + "learning_rate": 2.404756517215982e-06, + "loss": 0.67463017, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13287354, + "step": 7519, + "time_per_iteration": 4.238602876663208 + }, + { + "auxiliary_loss_clip": 0.06444496, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06278859, + "balance_loss_mlp": 0.0125789, + "epoch": 0.4521268600631294, + "flos": 23848997120640.0, + "grad_norm": 1.307309529899749, + "language_loss": 0.72893107, + "learning_rate": 2.404375106826223e-06, + "loss": 0.80609363, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13848877, + "step": 7520, + "time_per_iteration": 2.5295658111572266 + }, + { + "auxiliary_loss_clip": 0.06438812, + "auxiliary_loss_mlp": 0.01272031, + "balance_loss_clip": 0.062758, + "balance_loss_mlp": 0.01257875, + "epoch": 0.4521869833157974, + "flos": 18849611589120.0, + "grad_norm": 1.9694306251575102, + "language_loss": 0.75821477, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.83532321, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14147949, + "step": 7521, + "time_per_iteration": 2.51493763923645 + }, + { + "auxiliary_loss_clip": 0.06448258, + "auxiliary_loss_mlp": 0.01268765, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01255485, + "epoch": 0.45224710656846534, + "flos": 19793520633600.0, + "grad_norm": 2.0145516283749334, + "language_loss": 0.68112928, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.75829947, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1328125, + "step": 7522, + "time_per_iteration": 2.4986941814422607 + }, + { + "auxiliary_loss_clip": 0.06441501, + "auxiliary_loss_mlp": 0.0127253, + "balance_loss_clip": 0.06278691, + "balance_loss_mlp": 0.01258797, + "epoch": 0.4523072298211333, + "flos": 28263558280320.0, + "grad_norm": 1.4118666030005445, + "language_loss": 0.61165464, + "learning_rate": 2.403230783711134e-06, + "loss": 0.68879497, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13739014, + "step": 7523, + "time_per_iteration": 2.5918800830841064 + }, + { + "auxiliary_loss_clip": 0.06446532, + "auxiliary_loss_mlp": 0.01271231, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01256187, + "epoch": 0.45236735307380127, + "flos": 11185651820160.0, + "grad_norm": 1.7682897571754845, + "language_loss": 0.78361082, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.86078846, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.15057373, + "step": 7524, + "time_per_iteration": 2.4915785789489746 + }, + { + "auxiliary_loss_clip": 0.06441181, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06277032, + "balance_loss_mlp": 0.01259527, + "epoch": 0.45242747632646924, + "flos": 22607959098240.0, + "grad_norm": 1.5918865124670334, + "language_loss": 0.63704681, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.71418512, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13122559, + "step": 7525, + "time_per_iteration": 4.0678441524505615 + }, + { + "auxiliary_loss_clip": 0.06439088, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06279112, + "balance_loss_mlp": 0.01260153, + "epoch": 0.45248759957913726, + "flos": 18261558835200.0, + "grad_norm": 33.97196740045056, + "language_loss": 0.78961569, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8667345, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12664795, + "step": 7526, + "time_per_iteration": 2.4813144207000732 + }, + { + "auxiliary_loss_clip": 0.06437138, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.01257493, + "epoch": 0.4525477228318052, + "flos": 22455746956800.0, + "grad_norm": 1.6415997795559136, + "language_loss": 0.81301343, + "learning_rate": 2.40170480555747e-06, + "loss": 0.89009607, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13641357, + "step": 7527, + "time_per_iteration": 2.5056183338165283 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280501, + "balance_loss_mlp": 0.01258106, + "epoch": 0.4526078460844732, + "flos": 29652909229440.0, + "grad_norm": 1.731340365534577, + "language_loss": 0.65853465, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.73566198, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12866211, + "step": 7528, + "time_per_iteration": 2.6073391437530518 + }, + { + "auxiliary_loss_clip": 0.06439637, + "auxiliary_loss_mlp": 0.0127116, + "balance_loss_clip": 0.06280227, + "balance_loss_mlp": 0.01257296, + "epoch": 0.45266796933714115, + "flos": 23046483041280.0, + "grad_norm": 1.6874802957215247, + "language_loss": 0.75494301, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.83205104, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13867188, + "step": 7529, + "time_per_iteration": 2.5490171909332275 + }, + { + "auxiliary_loss_clip": 0.06443143, + "auxiliary_loss_mlp": 0.01270284, + "balance_loss_clip": 0.06278682, + "balance_loss_mlp": 0.0125614, + "epoch": 0.4527280925898091, + "flos": 14433582983040.0, + "grad_norm": 5.318026120447717, + "language_loss": 0.73199093, + "learning_rate": 2.400560161948384e-06, + "loss": 0.80912519, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7530, + "time_per_iteration": 2.4709434509277344 + }, + { + "auxiliary_loss_clip": 0.06441925, + "auxiliary_loss_mlp": 0.01267178, + "balance_loss_clip": 0.06279813, + "balance_loss_mlp": 0.01253857, + "epoch": 0.4527882158424771, + "flos": 22931432985600.0, + "grad_norm": 1.7055117614079858, + "language_loss": 0.76767921, + "learning_rate": 2.400178583680834e-06, + "loss": 0.84477019, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13336182, + "step": 7531, + "time_per_iteration": 3.9209694862365723 + }, + { + "auxiliary_loss_clip": 0.06439964, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01253018, + "epoch": 0.45284833909514505, + "flos": 25562157373440.0, + "grad_norm": 1.5452453614533965, + "language_loss": 0.67367595, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.75073636, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1305542, + "step": 7532, + "time_per_iteration": 2.5799813270568848 + }, + { + "auxiliary_loss_clip": 0.06441537, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280663, + "balance_loss_mlp": 0.01257206, + "epoch": 0.452908462347813, + "flos": 18155816582400.0, + "grad_norm": 2.362226158293886, + "language_loss": 0.78750062, + "learning_rate": 2.399415381635768e-06, + "loss": 0.86461282, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12481689, + "step": 7533, + "time_per_iteration": 2.4713315963745117 + }, + { + "auxiliary_loss_clip": 0.06451754, + "auxiliary_loss_mlp": 0.01272809, + "balance_loss_clip": 0.06279968, + "balance_loss_mlp": 0.01257849, + "epoch": 0.452968585600481, + "flos": 19068810670080.0, + "grad_norm": 1.7736608700696739, + "language_loss": 0.83544481, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.9126904, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1494751, + "step": 7534, + "time_per_iteration": 2.632647752761841 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01272735, + "balance_loss_clip": 0.06281491, + "balance_loss_mlp": 0.01258597, + "epoch": 0.45302870885314894, + "flos": 22057823116800.0, + "grad_norm": 1.5477368000033016, + "language_loss": 0.77199811, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.84919739, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14129639, + "step": 7535, + "time_per_iteration": 2.504075765609741 + }, + { + "auxiliary_loss_clip": 0.06444988, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01254453, + "epoch": 0.4530888321058169, + "flos": 20382495782400.0, + "grad_norm": 1.553658728431748, + "language_loss": 0.80988163, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.88700247, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12640381, + "step": 7536, + "time_per_iteration": 2.5701963901519775 + }, + { + "auxiliary_loss_clip": 0.06448273, + "auxiliary_loss_mlp": 0.01269034, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01255617, + "epoch": 0.4531489553584849, + "flos": 14835783381120.0, + "grad_norm": 1.8444336957712972, + "language_loss": 0.76206815, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.83924115, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13427734, + "step": 7537, + "time_per_iteration": 2.4535741806030273 + }, + { + "auxiliary_loss_clip": 0.06453362, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06287456, + "balance_loss_mlp": 0.0125526, + "epoch": 0.45320907861115284, + "flos": 21951493885440.0, + "grad_norm": 1.8251133101176713, + "language_loss": 0.75698435, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.83420891, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13830566, + "step": 7538, + "time_per_iteration": 2.5437614917755127 + }, + { + "auxiliary_loss_clip": 0.06342177, + "auxiliary_loss_mlp": 0.01255931, + "balance_loss_clip": 0.06267795, + "balance_loss_mlp": 0.01253302, + "epoch": 0.45326920186382086, + "flos": 66273620578560.0, + "grad_norm": 1.09487044177016, + "language_loss": 0.62420493, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.70018601, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02630615, + "step": 7539, + "time_per_iteration": 3.1658005714416504 + }, + { + "auxiliary_loss_clip": 0.06450586, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.06287818, + "balance_loss_mlp": 0.01256404, + "epoch": 0.4533293251164888, + "flos": 14689524879360.0, + "grad_norm": 1.7102983978579578, + "language_loss": 0.65674543, + "learning_rate": 2.396743698142872e-06, + "loss": 0.73395288, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13757324, + "step": 7540, + "time_per_iteration": 2.5642666816711426 + }, + { + "auxiliary_loss_clip": 0.06454974, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.06285828, + "balance_loss_mlp": 0.01254179, + "epoch": 0.4533894483691568, + "flos": 22607749463040.0, + "grad_norm": 2.019177110810713, + "language_loss": 0.84982491, + "learning_rate": 2.396361968778424e-06, + "loss": 0.92706484, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.1484375, + "step": 7541, + "time_per_iteration": 2.515012741088867 + }, + { + "auxiliary_loss_clip": 0.06444205, + "auxiliary_loss_mlp": 0.01270638, + "balance_loss_clip": 0.06281162, + "balance_loss_mlp": 0.01257853, + "epoch": 0.45344957162182475, + "flos": 34760301073920.0, + "grad_norm": 1.6772641382422697, + "language_loss": 0.77260393, + "learning_rate": 2.395980224383889e-06, + "loss": 0.84975231, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12780762, + "step": 7542, + "time_per_iteration": 2.6276772022247314 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06281827, + "balance_loss_mlp": 0.01252398, + "epoch": 0.4535096948744927, + "flos": 23556983241600.0, + "grad_norm": 1.679511772595701, + "language_loss": 0.80522043, + "learning_rate": 2.395598464973746e-06, + "loss": 0.88235873, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14233398, + "step": 7543, + "time_per_iteration": 2.5102038383483887 + }, + { + "auxiliary_loss_clip": 0.06448692, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01256339, + "epoch": 0.4535698181271607, + "flos": 25564756849920.0, + "grad_norm": 1.5595363191014409, + "language_loss": 0.76234162, + "learning_rate": 2.395216690562469e-06, + "loss": 0.83952641, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13446045, + "step": 7544, + "time_per_iteration": 2.613546371459961 + }, + { + "auxiliary_loss_clip": 0.06450664, + "auxiliary_loss_mlp": 0.0127145, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01257747, + "epoch": 0.45362994137982865, + "flos": 24871171478400.0, + "grad_norm": 1.656067150864753, + "language_loss": 0.75691646, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.83413762, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.137146, + "step": 7545, + "time_per_iteration": 2.5587077140808105 + }, + { + "auxiliary_loss_clip": 0.06444206, + "auxiliary_loss_mlp": 0.01276554, + "balance_loss_clip": 0.06279359, + "balance_loss_mlp": 0.01263161, + "epoch": 0.4536900646324966, + "flos": 30814088711040.0, + "grad_norm": 1.7013764448707542, + "language_loss": 0.72677243, + "learning_rate": 2.394453096794423e-06, + "loss": 0.80397999, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13391113, + "step": 7546, + "time_per_iteration": 2.582507371902466 + }, + { + "auxiliary_loss_clip": 0.06454303, + "auxiliary_loss_mlp": 0.01276587, + "balance_loss_clip": 0.06282242, + "balance_loss_mlp": 0.01261531, + "epoch": 0.4537501878851646, + "flos": 23411060156160.0, + "grad_norm": 1.4140833040204603, + "language_loss": 0.76407051, + "learning_rate": 2.394071277466609e-06, + "loss": 0.8413794, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1505127, + "step": 7547, + "time_per_iteration": 2.5376148223876953 + }, + { + "auxiliary_loss_clip": 0.06452849, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06284454, + "balance_loss_mlp": 0.01258086, + "epoch": 0.45381031113783254, + "flos": 18154978041600.0, + "grad_norm": 1.9572251150113926, + "language_loss": 0.70011902, + "learning_rate": 2.393689443195573e-06, + "loss": 0.777372, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14367676, + "step": 7548, + "time_per_iteration": 2.519615650177002 + }, + { + "auxiliary_loss_clip": 0.0644725, + "auxiliary_loss_mlp": 0.01271972, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01258638, + "epoch": 0.4538704343905005, + "flos": 25343503344000.0, + "grad_norm": 2.0312160927741933, + "language_loss": 0.72993481, + "learning_rate": 2.393307593995794e-06, + "loss": 0.80712706, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13342285, + "step": 7549, + "time_per_iteration": 2.57501482963562 + }, + { + "auxiliary_loss_clip": 0.06446082, + "auxiliary_loss_mlp": 0.01269972, + "balance_loss_clip": 0.06283575, + "balance_loss_mlp": 0.01257312, + "epoch": 0.4539305576431685, + "flos": 28739118528000.0, + "grad_norm": 1.441987244253853, + "language_loss": 0.65387678, + "learning_rate": 2.392925729881751e-06, + "loss": 0.73103732, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.12658691, + "step": 7550, + "time_per_iteration": 2.5835819244384766 + }, + { + "auxiliary_loss_clip": 0.06445216, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01258162, + "epoch": 0.45399068089583644, + "flos": 22499030390400.0, + "grad_norm": 1.5764003430967004, + "language_loss": 0.6906575, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.76782334, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.13189697, + "step": 7551, + "time_per_iteration": 2.562033176422119 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06276844, + "balance_loss_mlp": 0.01259504, + "epoch": 0.45405080414850446, + "flos": 12897889678080.0, + "grad_norm": 1.6874134559177159, + "language_loss": 0.79426885, + "learning_rate": 2.392161956968798e-06, + "loss": 0.87142253, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13409424, + "step": 7552, + "time_per_iteration": 2.4449541568756104 + }, + { + "auxiliary_loss_clip": 0.063404, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06265783, + "balance_loss_mlp": 0.01260128, + "epoch": 0.4541109274011724, + "flos": 59783558912640.0, + "grad_norm": 0.8094629177090237, + "language_loss": 0.57832247, + "learning_rate": 2.39178004819885e-06, + "loss": 0.65435266, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.02496338, + "step": 7553, + "time_per_iteration": 3.089684247970581 + }, + { + "auxiliary_loss_clip": 0.06443945, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01258946, + "epoch": 0.4541710506538404, + "flos": 28519248614400.0, + "grad_norm": 1.8062911390055711, + "language_loss": 0.76727033, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.84443438, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13494873, + "step": 7554, + "time_per_iteration": 2.541727066040039 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.0126986, + "balance_loss_clip": 0.06284112, + "balance_loss_mlp": 0.0125559, + "epoch": 0.45423117390650836, + "flos": 17681304510720.0, + "grad_norm": 3.221825223389834, + "language_loss": 0.76701951, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.84421712, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1427002, + "step": 7555, + "time_per_iteration": 2.5190746784210205 + }, + { + "auxiliary_loss_clip": 0.06447887, + "auxiliary_loss_mlp": 0.01270234, + "balance_loss_clip": 0.06284074, + "balance_loss_mlp": 0.01256292, + "epoch": 0.4542912971591763, + "flos": 28079760349440.0, + "grad_norm": 1.2938327471401587, + "language_loss": 0.7293222, + "learning_rate": 2.390634232808903e-06, + "loss": 0.80650342, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13946533, + "step": 7556, + "time_per_iteration": 2.559330940246582 + }, + { + "auxiliary_loss_clip": 0.06452744, + "auxiliary_loss_mlp": 0.0127062, + "balance_loss_clip": 0.06282438, + "balance_loss_mlp": 0.01256351, + "epoch": 0.4543514204118443, + "flos": 22677922857600.0, + "grad_norm": 1.9930550713200077, + "language_loss": 0.63614035, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.71337396, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7557, + "time_per_iteration": 2.555694580078125 + }, + { + "auxiliary_loss_clip": 0.06341553, + "auxiliary_loss_mlp": 0.01256007, + "balance_loss_clip": 0.06267436, + "balance_loss_mlp": 0.01253351, + "epoch": 0.45441154366451225, + "flos": 58236027454080.0, + "grad_norm": 0.6640379644801875, + "language_loss": 0.57562745, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.65160298, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02658081, + "step": 7558, + "time_per_iteration": 5.871712684631348 + }, + { + "auxiliary_loss_clip": 0.06449831, + "auxiliary_loss_mlp": 0.01270129, + "balance_loss_clip": 0.06282432, + "balance_loss_mlp": 0.01255216, + "epoch": 0.4544716669171802, + "flos": 16769987504640.0, + "grad_norm": 2.2880587940678927, + "language_loss": 0.56438738, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.64158702, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14904785, + "step": 7559, + "time_per_iteration": 2.4660634994506836 + }, + { + "auxiliary_loss_clip": 0.06446083, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06282272, + "balance_loss_mlp": 0.01257728, + "epoch": 0.4545317901698482, + "flos": 15930814464000.0, + "grad_norm": 1.794091833084443, + "language_loss": 0.72316611, + "learning_rate": 2.389106271642792e-06, + "loss": 0.80034077, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13671875, + "step": 7560, + "time_per_iteration": 2.497083902359009 + }, + { + "auxiliary_loss_clip": 0.06455533, + "auxiliary_loss_mlp": 0.01271449, + "balance_loss_clip": 0.0628465, + "balance_loss_mlp": 0.01257096, + "epoch": 0.45459191342251615, + "flos": 17645567944320.0, + "grad_norm": 2.9678955818231167, + "language_loss": 0.69120479, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.76847458, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14355469, + "step": 7561, + "time_per_iteration": 2.463433027267456 + }, + { + "auxiliary_loss_clip": 0.06447616, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06286462, + "balance_loss_mlp": 0.01256161, + "epoch": 0.4546520366751841, + "flos": 16181557407360.0, + "grad_norm": 2.3534128933362277, + "language_loss": 0.85417646, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.93134332, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12908936, + "step": 7562, + "time_per_iteration": 2.5475013256073 + }, + { + "auxiliary_loss_clip": 0.06445649, + "auxiliary_loss_mlp": 0.01271177, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01257504, + "epoch": 0.4547121599278521, + "flos": 19756861672320.0, + "grad_norm": 1.7772924752060992, + "language_loss": 0.89642298, + "learning_rate": 2.38796014579055e-06, + "loss": 0.97359127, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13684082, + "step": 7563, + "time_per_iteration": 2.489121675491333 + }, + { + "auxiliary_loss_clip": 0.06453149, + "auxiliary_loss_mlp": 0.01274815, + "balance_loss_clip": 0.06286659, + "balance_loss_mlp": 0.01260397, + "epoch": 0.45477228318052004, + "flos": 19943510641920.0, + "grad_norm": 1.9263110789996643, + "language_loss": 0.71668887, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.79396844, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14428711, + "step": 7564, + "time_per_iteration": 2.4964044094085693 + }, + { + "auxiliary_loss_clip": 0.06450239, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01253912, + "epoch": 0.454832406433188, + "flos": 21294735183360.0, + "grad_norm": 2.0561067408009994, + "language_loss": 0.68633133, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7635116, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13879395, + "step": 7565, + "time_per_iteration": 4.080512762069702 + }, + { + "auxiliary_loss_clip": 0.06446166, + "auxiliary_loss_mlp": 0.01274343, + "balance_loss_clip": 0.06282604, + "balance_loss_mlp": 0.01260247, + "epoch": 0.45489252968585603, + "flos": 24505630041600.0, + "grad_norm": 2.0436514367854413, + "language_loss": 0.802881, + "learning_rate": 2.386813887534922e-06, + "loss": 0.88008606, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14105225, + "step": 7566, + "time_per_iteration": 2.521056890487671 + }, + { + "auxiliary_loss_clip": 0.06452477, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06286022, + "balance_loss_mlp": 0.01257558, + "epoch": 0.454952652938524, + "flos": 17098199147520.0, + "grad_norm": 2.208842453595512, + "language_loss": 0.74317467, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.82043159, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15661621, + "step": 7567, + "time_per_iteration": 2.515658140182495 + }, + { + "auxiliary_loss_clip": 0.06459296, + "auxiliary_loss_mlp": 0.01271605, + "balance_loss_clip": 0.06291091, + "balance_loss_mlp": 0.0125801, + "epoch": 0.45501277619119196, + "flos": 27636792140160.0, + "grad_norm": 1.5215577708435108, + "language_loss": 0.80959934, + "learning_rate": 2.386049642000249e-06, + "loss": 0.88690829, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13604736, + "step": 7568, + "time_per_iteration": 2.558258533477783 + }, + { + "auxiliary_loss_clip": 0.06466229, + "auxiliary_loss_mlp": 0.01276365, + "balance_loss_clip": 0.06294216, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4550728994438599, + "flos": 19980840435840.0, + "grad_norm": 1.8148678559144198, + "language_loss": 0.80280846, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.88023436, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.16186523, + "step": 7569, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06458277, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.06287743, + "balance_loss_mlp": 0.01254176, + "epoch": 0.4551330226965279, + "flos": 26073915384960.0, + "grad_norm": 1.3474740501928035, + "language_loss": 0.75202894, + "learning_rate": 2.385285337909412e-06, + "loss": 0.82929879, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14538574, + "step": 7570, + "time_per_iteration": 2.543170690536499 + }, + { + "auxiliary_loss_clip": 0.06452256, + "auxiliary_loss_mlp": 0.01273702, + "balance_loss_clip": 0.06289603, + "balance_loss_mlp": 0.01259826, + "epoch": 0.45519314594919585, + "flos": 32789396062080.0, + "grad_norm": 1.7878922954829848, + "language_loss": 0.74832451, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.82558417, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13879395, + "step": 7571, + "time_per_iteration": 4.052931308746338 + }, + { + "auxiliary_loss_clip": 0.06451707, + "auxiliary_loss_mlp": 0.01275937, + "balance_loss_clip": 0.06292738, + "balance_loss_mlp": 0.01261954, + "epoch": 0.4552532692018638, + "flos": 19178829480960.0, + "grad_norm": 1.5879241198756615, + "language_loss": 0.81163442, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.88891089, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13983154, + "step": 7572, + "time_per_iteration": 2.511032819747925 + }, + { + "auxiliary_loss_clip": 0.06461887, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06292465, + "balance_loss_mlp": 0.01254306, + "epoch": 0.4553133924545318, + "flos": 26033650698240.0, + "grad_norm": 2.340526601051543, + "language_loss": 0.72866237, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.80597222, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14788818, + "step": 7573, + "time_per_iteration": 2.5469906330108643 + }, + { + "auxiliary_loss_clip": 0.06470129, + "auxiliary_loss_mlp": 0.0127089, + "balance_loss_clip": 0.06300491, + "balance_loss_mlp": 0.01255094, + "epoch": 0.45537351570719975, + "flos": 30668920312320.0, + "grad_norm": 1.9189620807456311, + "language_loss": 0.74504352, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.82245368, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.15783691, + "step": 7574, + "time_per_iteration": 2.6484622955322266 + }, + { + "auxiliary_loss_clip": 0.06463373, + "auxiliary_loss_mlp": 0.01271034, + "balance_loss_clip": 0.06294367, + "balance_loss_mlp": 0.0125661, + "epoch": 0.4554336389598677, + "flos": 24360377788800.0, + "grad_norm": 1.669597443611077, + "language_loss": 0.71544576, + "learning_rate": 2.383374322259915e-06, + "loss": 0.79278982, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14428711, + "step": 7575, + "time_per_iteration": 2.544975519180298 + }, + { + "auxiliary_loss_clip": 0.06456485, + "auxiliary_loss_mlp": 0.01268004, + "balance_loss_clip": 0.06290726, + "balance_loss_mlp": 0.01253794, + "epoch": 0.4554937622125357, + "flos": 20564113507200.0, + "grad_norm": 1.7578928676474412, + "language_loss": 0.7370066, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.81425148, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14202881, + "step": 7576, + "time_per_iteration": 2.534135580062866 + }, + { + "auxiliary_loss_clip": 0.06453636, + "auxiliary_loss_mlp": 0.0127588, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01261551, + "epoch": 0.45555388546520365, + "flos": 22827451668480.0, + "grad_norm": 2.007695048360481, + "language_loss": 0.66580224, + "learning_rate": 2.382609814135511e-06, + "loss": 0.74309736, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14312744, + "step": 7577, + "time_per_iteration": 2.5095431804656982 + }, + { + "auxiliary_loss_clip": 0.06452672, + "auxiliary_loss_mlp": 0.01272369, + "balance_loss_clip": 0.0628684, + "balance_loss_mlp": 0.01256538, + "epoch": 0.4556140087178716, + "flos": 21732462512640.0, + "grad_norm": 1.904316861437945, + "language_loss": 0.74386835, + "learning_rate": 2.382227538303157e-06, + "loss": 0.82111871, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.15820312, + "step": 7578, + "time_per_iteration": 2.5497546195983887 + }, + { + "auxiliary_loss_clip": 0.06453466, + "auxiliary_loss_mlp": 0.01270181, + "balance_loss_clip": 0.06290053, + "balance_loss_mlp": 0.01256645, + "epoch": 0.45567413197053963, + "flos": 26001645638400.0, + "grad_norm": 1.7724513927111563, + "language_loss": 0.70436674, + "learning_rate": 2.381845247976697e-06, + "loss": 0.78160322, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13531494, + "step": 7579, + "time_per_iteration": 2.5318000316619873 + }, + { + "auxiliary_loss_clip": 0.06449443, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06286655, + "balance_loss_mlp": 0.01257664, + "epoch": 0.4557342552232076, + "flos": 21543046358400.0, + "grad_norm": 1.8462396851301097, + "language_loss": 0.78760922, + "learning_rate": 2.381462943170627e-06, + "loss": 0.86480927, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12902832, + "step": 7580, + "time_per_iteration": 2.5358526706695557 + }, + { + "auxiliary_loss_clip": 0.06450854, + "auxiliary_loss_mlp": 0.0127087, + "balance_loss_clip": 0.06288584, + "balance_loss_mlp": 0.01257822, + "epoch": 0.45579437847587556, + "flos": 40010932673280.0, + "grad_norm": 1.6599136037597217, + "language_loss": 0.68708634, + "learning_rate": 2.381080623899444e-06, + "loss": 0.76430357, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13049316, + "step": 7581, + "time_per_iteration": 2.667543888092041 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06289542, + "balance_loss_mlp": 0.01258678, + "epoch": 0.4558545017285435, + "flos": 31146409203840.0, + "grad_norm": 1.6471906775179725, + "language_loss": 0.7358638, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.81307691, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1383667, + "step": 7582, + "time_per_iteration": 2.6570708751678467 + }, + { + "auxiliary_loss_clip": 0.06455518, + "auxiliary_loss_mlp": 0.01272969, + "balance_loss_clip": 0.06286626, + "balance_loss_mlp": 0.01257818, + "epoch": 0.4559146249812115, + "flos": 21732210950400.0, + "grad_norm": 1.8620959272942483, + "language_loss": 0.73187852, + "learning_rate": 2.380315942019729e-06, + "loss": 0.80916339, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.15148926, + "step": 7583, + "time_per_iteration": 2.510700225830078 + }, + { + "auxiliary_loss_clip": 0.06455322, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06287013, + "balance_loss_mlp": 0.01256202, + "epoch": 0.45597474823387946, + "flos": 23812841283840.0, + "grad_norm": 1.81949303768272, + "language_loss": 0.72839421, + "learning_rate": 2.379933579440195e-06, + "loss": 0.80566895, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.1595459, + "step": 7584, + "time_per_iteration": 2.5747973918914795 + }, + { + "auxiliary_loss_clip": 0.06447833, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.0628446, + "balance_loss_mlp": 0.01255357, + "epoch": 0.4560348714865474, + "flos": 31913857549440.0, + "grad_norm": 1.7864940938501939, + "language_loss": 0.67957801, + "learning_rate": 2.379551202453541e-06, + "loss": 0.75673771, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12792969, + "step": 7585, + "time_per_iteration": 2.6153225898742676 + }, + { + "auxiliary_loss_clip": 0.0645072, + "auxiliary_loss_mlp": 0.01268647, + "balance_loss_clip": 0.06284043, + "balance_loss_mlp": 0.01254449, + "epoch": 0.4560949947392154, + "flos": 22054427026560.0, + "grad_norm": 1.7083540410775564, + "language_loss": 0.76353097, + "learning_rate": 2.379168811074267e-06, + "loss": 0.84072465, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14190674, + "step": 7586, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.06448488, + "auxiliary_loss_mlp": 0.01267379, + "balance_loss_clip": 0.0628647, + "balance_loss_mlp": 0.01254182, + "epoch": 0.45615511799188335, + "flos": 24578738328960.0, + "grad_norm": 1.819670635232321, + "language_loss": 0.78360641, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.86076516, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13189697, + "step": 7587, + "time_per_iteration": 2.5558509826660156 + }, + { + "auxiliary_loss_clip": 0.06459979, + "auxiliary_loss_mlp": 0.01275995, + "balance_loss_clip": 0.06286488, + "balance_loss_mlp": 0.01260152, + "epoch": 0.4562152412445513, + "flos": 18336260350080.0, + "grad_norm": 1.7968748305561377, + "language_loss": 0.69667047, + "learning_rate": 2.378403985195863e-06, + "loss": 0.77403021, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1583252, + "step": 7588, + "time_per_iteration": 2.5365071296691895 + }, + { + "auxiliary_loss_clip": 0.06447656, + "auxiliary_loss_mlp": 0.01274434, + "balance_loss_clip": 0.06286096, + "balance_loss_mlp": 0.01261422, + "epoch": 0.4562753644972193, + "flos": 13521595144320.0, + "grad_norm": 1.6774091429175193, + "language_loss": 0.79575098, + "learning_rate": 2.378021550725735e-06, + "loss": 0.87297189, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13006592, + "step": 7589, + "time_per_iteration": 2.484713315963745 + }, + { + "auxiliary_loss_clip": 0.06452583, + "auxiliary_loss_mlp": 0.01271771, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.0125774, + "epoch": 0.45633548774988725, + "flos": 29646871735680.0, + "grad_norm": 2.003946782113331, + "language_loss": 0.62696528, + "learning_rate": 2.377639101920992e-06, + "loss": 0.70420885, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14044189, + "step": 7590, + "time_per_iteration": 2.609936475753784 + }, + { + "auxiliary_loss_clip": 0.06445528, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01257496, + "epoch": 0.4563956110025552, + "flos": 22239398914560.0, + "grad_norm": 1.8300596662255737, + "language_loss": 0.73085624, + "learning_rate": 2.377256638796135e-06, + "loss": 0.80802, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13330078, + "step": 7591, + "time_per_iteration": 2.47824764251709 + }, + { + "auxiliary_loss_clip": 0.06452768, + "auxiliary_loss_mlp": 0.01273962, + "balance_loss_clip": 0.0628728, + "balance_loss_mlp": 0.01260205, + "epoch": 0.45645573425522323, + "flos": 17097696023040.0, + "grad_norm": 1.9979722051509847, + "language_loss": 0.77518493, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.85245228, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13751221, + "step": 7592, + "time_per_iteration": 2.5239169597625732 + }, + { + "auxiliary_loss_clip": 0.06449406, + "auxiliary_loss_mlp": 0.01273175, + "balance_loss_clip": 0.06284081, + "balance_loss_mlp": 0.01259954, + "epoch": 0.4565158575078912, + "flos": 20337367559040.0, + "grad_norm": 2.421698823443505, + "language_loss": 0.6941641, + "learning_rate": 2.376491669644098e-06, + "loss": 0.77138984, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13232422, + "step": 7593, + "time_per_iteration": 2.5688788890838623 + }, + { + "auxiliary_loss_clip": 0.06437326, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06278698, + "balance_loss_mlp": 0.01256034, + "epoch": 0.45657598076055916, + "flos": 23989008493440.0, + "grad_norm": 2.02887277896486, + "language_loss": 0.8417384, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.91879439, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12237549, + "step": 7594, + "time_per_iteration": 2.5792298316955566 + }, + { + "auxiliary_loss_clip": 0.06341574, + "auxiliary_loss_mlp": 0.01258819, + "balance_loss_clip": 0.06267718, + "balance_loss_mlp": 0.0125595, + "epoch": 0.45663610401322713, + "flos": 69382812908160.0, + "grad_norm": 0.7684087429591354, + "language_loss": 0.52710819, + "learning_rate": 2.375726643385654e-06, + "loss": 0.60311204, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.02864075, + "step": 7595, + "time_per_iteration": 3.150902509689331 + }, + { + "auxiliary_loss_clip": 0.06451569, + "auxiliary_loss_mlp": 0.01268714, + "balance_loss_clip": 0.06282795, + "balance_loss_mlp": 0.0125491, + "epoch": 0.4566962272658951, + "flos": 15152884358400.0, + "grad_norm": 2.304862186673624, + "language_loss": 0.8729161, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.95011896, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13824463, + "step": 7596, + "time_per_iteration": 2.490346908569336 + }, + { + "auxiliary_loss_clip": 0.0644666, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01257324, + "epoch": 0.45675635051856306, + "flos": 18703395014400.0, + "grad_norm": 1.5857620712679525, + "language_loss": 0.77719533, + "learning_rate": 2.374961560136843e-06, + "loss": 0.85436308, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12792969, + "step": 7597, + "time_per_iteration": 2.5043859481811523 + }, + { + "auxiliary_loss_clip": 0.0644691, + "auxiliary_loss_mlp": 0.01271101, + "balance_loss_clip": 0.06280024, + "balance_loss_mlp": 0.01256587, + "epoch": 0.456816473771231, + "flos": 19104211820160.0, + "grad_norm": 1.619707981694153, + "language_loss": 0.78513646, + "learning_rate": 2.374578997177314e-06, + "loss": 0.86231661, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14501953, + "step": 7598, + "time_per_iteration": 3.9724912643432617 + }, + { + "auxiliary_loss_clip": 0.06447135, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06284773, + "balance_loss_mlp": 0.01255508, + "epoch": 0.456876597023899, + "flos": 28957730630400.0, + "grad_norm": 2.2287540067942957, + "language_loss": 0.72171777, + "learning_rate": 2.374196420013712e-06, + "loss": 0.79887861, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13458252, + "step": 7599, + "time_per_iteration": 2.594240188598633 + }, + { + "auxiliary_loss_clip": 0.06445186, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06281814, + "balance_loss_mlp": 0.01256021, + "epoch": 0.45693672027656695, + "flos": 23295297340800.0, + "grad_norm": 1.7934880288039583, + "language_loss": 0.70205128, + "learning_rate": 2.373813828660544e-06, + "loss": 0.77919793, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13439941, + "step": 7600, + "time_per_iteration": 2.5063295364379883 + }, + { + "auxiliary_loss_clip": 0.06449603, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01256571, + "epoch": 0.4569968435292349, + "flos": 20564448923520.0, + "grad_norm": 2.031833923402261, + "language_loss": 0.78985888, + "learning_rate": 2.373431223132319e-06, + "loss": 0.86705881, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13824463, + "step": 7601, + "time_per_iteration": 2.559072494506836 + }, + { + "auxiliary_loss_clip": 0.06449661, + "auxiliary_loss_mlp": 0.0127022, + "balance_loss_clip": 0.06283583, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4570569667819029, + "flos": 41292403090560.0, + "grad_norm": 1.9704151582810323, + "language_loss": 0.71676505, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.79396379, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13134766, + "step": 7602, + "time_per_iteration": 2.6897006034851074 + }, + { + "auxiliary_loss_clip": 0.06446967, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280911, + "balance_loss_mlp": 0.01255843, + "epoch": 0.45711709003457085, + "flos": 26038807724160.0, + "grad_norm": 1.8547506252317059, + "language_loss": 0.73479527, + "learning_rate": 2.372665969608729e-06, + "loss": 0.81197369, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15026855, + "step": 7603, + "time_per_iteration": 2.5908169746398926 + }, + { + "auxiliary_loss_clip": 0.06447335, + "auxiliary_loss_mlp": 0.01269467, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01254077, + "epoch": 0.4571772132872388, + "flos": 22163649223680.0, + "grad_norm": 1.7365999934209901, + "language_loss": 0.83048642, + "learning_rate": 2.372283321642383e-06, + "loss": 0.90765446, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.15374756, + "step": 7604, + "time_per_iteration": 2.462653636932373 + }, + { + "auxiliary_loss_clip": 0.0645724, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06285316, + "balance_loss_mlp": 0.01256456, + "epoch": 0.45723733653990684, + "flos": 23885739936000.0, + "grad_norm": 1.8384947858044167, + "language_loss": 0.86237913, + "learning_rate": 2.371900659559016e-06, + "loss": 0.93966818, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15209961, + "step": 7605, + "time_per_iteration": 3.9711341857910156 + }, + { + "auxiliary_loss_clip": 0.0645397, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01253686, + "epoch": 0.4572974597925748, + "flos": 16877197203840.0, + "grad_norm": 1.5621441730902494, + "language_loss": 0.73368603, + "learning_rate": 2.371517983373138e-06, + "loss": 0.81090587, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14343262, + "step": 7606, + "time_per_iteration": 2.53171968460083 + }, + { + "auxiliary_loss_clip": 0.06450876, + "auxiliary_loss_mlp": 0.01272472, + "balance_loss_clip": 0.06281146, + "balance_loss_mlp": 0.01257118, + "epoch": 0.45735758304524277, + "flos": 13776530791680.0, + "grad_norm": 2.9980100906386324, + "language_loss": 0.80445778, + "learning_rate": 2.371135293099262e-06, + "loss": 0.88169128, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15356445, + "step": 7607, + "time_per_iteration": 2.4730136394500732 + }, + { + "auxiliary_loss_clip": 0.06449468, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252216, + "epoch": 0.45741770629791073, + "flos": 21106283351040.0, + "grad_norm": 1.9890456967063905, + "language_loss": 0.80849135, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.88565969, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15130615, + "step": 7608, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06445852, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01254576, + "epoch": 0.4574778295505787, + "flos": 23119675182720.0, + "grad_norm": 1.6776975313937859, + "language_loss": 0.68550682, + "learning_rate": 2.370369870345559e-06, + "loss": 0.76264954, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1385498, + "step": 7609, + "time_per_iteration": 2.5249829292297363 + }, + { + "auxiliary_loss_clip": 0.06446596, + "auxiliary_loss_mlp": 0.01267793, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01253917, + "epoch": 0.45753795280324666, + "flos": 24359832737280.0, + "grad_norm": 4.839518120228961, + "language_loss": 0.81053591, + "learning_rate": 2.369987137894757e-06, + "loss": 0.88767982, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13879395, + "step": 7610, + "time_per_iteration": 3.9629292488098145 + }, + { + "auxiliary_loss_clip": 0.06456244, + "auxiliary_loss_mlp": 0.01272187, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01258359, + "epoch": 0.4575980760559146, + "flos": 16659297861120.0, + "grad_norm": 2.22162560638367, + "language_loss": 0.82538879, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.90267307, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13848877, + "step": 7611, + "time_per_iteration": 2.483184337615967 + }, + { + "auxiliary_loss_clip": 0.06450104, + "auxiliary_loss_mlp": 0.01268987, + "balance_loss_clip": 0.06284404, + "balance_loss_mlp": 0.01254753, + "epoch": 0.4576581993085826, + "flos": 35919006860160.0, + "grad_norm": 1.7486456420241998, + "language_loss": 0.73840886, + "learning_rate": 2.369221630917819e-06, + "loss": 0.81559974, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14239502, + "step": 7612, + "time_per_iteration": 2.629122734069824 + }, + { + "auxiliary_loss_clip": 0.06446031, + "auxiliary_loss_mlp": 0.0126785, + "balance_loss_clip": 0.06281702, + "balance_loss_mlp": 0.01253711, + "epoch": 0.45771832256125056, + "flos": 20085995710080.0, + "grad_norm": 1.498537690587119, + "language_loss": 0.85104787, + "learning_rate": 2.368838856420711e-06, + "loss": 0.92818671, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7613, + "time_per_iteration": 2.4995853900909424 + }, + { + "auxiliary_loss_clip": 0.06450839, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.062853, + "balance_loss_mlp": 0.01257458, + "epoch": 0.4577784458139185, + "flos": 10749056520960.0, + "grad_norm": 2.317250545042104, + "language_loss": 0.75818133, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.8354038, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13946533, + "step": 7614, + "time_per_iteration": 2.5512688159942627 + }, + { + "auxiliary_loss_clip": 0.06447698, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06284869, + "balance_loss_mlp": 0.01254513, + "epoch": 0.4578385690665865, + "flos": 21913577112960.0, + "grad_norm": 1.7278714332693421, + "language_loss": 0.7495364, + "learning_rate": 2.368073265481791e-06, + "loss": 0.82670438, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14587402, + "step": 7615, + "time_per_iteration": 2.4959964752197266 + }, + { + "auxiliary_loss_clip": 0.06341572, + "auxiliary_loss_mlp": 0.01260056, + "balance_loss_clip": 0.06266811, + "balance_loss_mlp": 0.01256924, + "epoch": 0.45789869231925445, + "flos": 64774559036160.0, + "grad_norm": 0.7564263714074747, + "language_loss": 0.57682395, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.65284026, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.03129578, + "step": 7616, + "time_per_iteration": 3.1225674152374268 + }, + { + "auxiliary_loss_clip": 0.06451499, + "auxiliary_loss_mlp": 0.01269699, + "balance_loss_clip": 0.06287209, + "balance_loss_mlp": 0.01255299, + "epoch": 0.4579588155719224, + "flos": 16149594274560.0, + "grad_norm": 2.222129623674548, + "language_loss": 0.71319497, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.790407, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.144104, + "step": 7617, + "time_per_iteration": 2.535795211791992 + }, + { + "auxiliary_loss_clip": 0.06453606, + "auxiliary_loss_mlp": 0.01272033, + "balance_loss_clip": 0.06288601, + "balance_loss_mlp": 0.0125749, + "epoch": 0.45801893882459044, + "flos": 21401609466240.0, + "grad_norm": 1.7708953304075432, + "language_loss": 0.7611897, + "learning_rate": 2.36692477442939e-06, + "loss": 0.83844614, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14538574, + "step": 7618, + "time_per_iteration": 2.486976146697998 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01269962, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01256778, + "epoch": 0.4580790620772584, + "flos": 19542609982080.0, + "grad_norm": 1.989312042597275, + "language_loss": 0.76642346, + "learning_rate": 2.366541916231585e-06, + "loss": 0.84365678, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13195801, + "step": 7619, + "time_per_iteration": 2.5505213737487793 + }, + { + "auxiliary_loss_clip": 0.06448688, + "auxiliary_loss_mlp": 0.01269236, + "balance_loss_clip": 0.06287201, + "balance_loss_mlp": 0.01256242, + "epoch": 0.45813918532992637, + "flos": 16586608844160.0, + "grad_norm": 1.7634638926548802, + "language_loss": 0.72444797, + "learning_rate": 2.366159044134473e-06, + "loss": 0.80162722, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13018799, + "step": 7620, + "time_per_iteration": 2.5020828247070312 + }, + { + "auxiliary_loss_clip": 0.06448015, + "auxiliary_loss_mlp": 0.0127207, + "balance_loss_clip": 0.06286486, + "balance_loss_mlp": 0.01259243, + "epoch": 0.45819930858259433, + "flos": 42240085568640.0, + "grad_norm": 2.4478513756868168, + "language_loss": 0.77894747, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8561483, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12835693, + "step": 7621, + "time_per_iteration": 2.7115588188171387 + }, + { + "auxiliary_loss_clip": 0.06339111, + "auxiliary_loss_mlp": 0.01257981, + "balance_loss_clip": 0.06264743, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4582594318352623, + "flos": 63733335073920.0, + "grad_norm": 0.7682856550602313, + "language_loss": 0.64809114, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.72406203, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7622, + "time_per_iteration": 3.13112473487854 + }, + { + "auxiliary_loss_clip": 0.06452725, + "auxiliary_loss_mlp": 0.01272617, + "balance_loss_clip": 0.06286744, + "balance_loss_mlp": 0.01258449, + "epoch": 0.45831955508793026, + "flos": 26877226078080.0, + "grad_norm": 1.7433537302254658, + "language_loss": 0.79958743, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.87684089, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1416626, + "step": 7623, + "time_per_iteration": 2.6407015323638916 + }, + { + "auxiliary_loss_clip": 0.0645254, + "auxiliary_loss_mlp": 0.0127269, + "balance_loss_clip": 0.06285348, + "balance_loss_mlp": 0.01258528, + "epoch": 0.45837967834059823, + "flos": 18739886267520.0, + "grad_norm": 2.305548200028626, + "language_loss": 0.71172595, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.78897822, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14160156, + "step": 7624, + "time_per_iteration": 2.4580042362213135 + }, + { + "auxiliary_loss_clip": 0.06451602, + "auxiliary_loss_mlp": 0.01273069, + "balance_loss_clip": 0.06285381, + "balance_loss_mlp": 0.012593, + "epoch": 0.4584398015932662, + "flos": 21184380956160.0, + "grad_norm": 1.776025787081333, + "language_loss": 0.73132861, + "learning_rate": 2.364244475667491e-06, + "loss": 0.80857527, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13763428, + "step": 7625, + "time_per_iteration": 2.5352139472961426 + }, + { + "auxiliary_loss_clip": 0.06452388, + "auxiliary_loss_mlp": 0.01273572, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.01259857, + "epoch": 0.45849992484593416, + "flos": 19795826620800.0, + "grad_norm": 3.130746647878431, + "language_loss": 0.78340298, + "learning_rate": 2.363861520479451e-06, + "loss": 0.86066258, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.137146, + "step": 7626, + "time_per_iteration": 2.4839165210723877 + }, + { + "auxiliary_loss_clip": 0.06454711, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06286182, + "balance_loss_mlp": 0.01257284, + "epoch": 0.4585600480986021, + "flos": 18229134504960.0, + "grad_norm": 1.6201293476115848, + "language_loss": 0.85071468, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.92797422, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1394043, + "step": 7627, + "time_per_iteration": 2.5822484493255615 + }, + { + "auxiliary_loss_clip": 0.06454201, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06285322, + "balance_loss_mlp": 0.01255634, + "epoch": 0.4586201713512701, + "flos": 29029748814720.0, + "grad_norm": 1.6524494424678404, + "language_loss": 0.69812655, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.77537024, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14544678, + "step": 7628, + "time_per_iteration": 2.5642716884613037 + }, + { + "auxiliary_loss_clip": 0.06450283, + "auxiliary_loss_mlp": 0.01272737, + "balance_loss_clip": 0.06287684, + "balance_loss_mlp": 0.01258492, + "epoch": 0.45868029460393805, + "flos": 23411395572480.0, + "grad_norm": 1.512396631295222, + "language_loss": 0.78590345, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.86313355, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.14245605, + "step": 7629, + "time_per_iteration": 2.5380680561065674 + }, + { + "auxiliary_loss_clip": 0.0645413, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06283213, + "balance_loss_mlp": 0.01258372, + "epoch": 0.458740417856606, + "flos": 18227625131520.0, + "grad_norm": 2.58579854057945, + "language_loss": 0.7964831, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.87376225, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1541748, + "step": 7630, + "time_per_iteration": 2.4736902713775635 + }, + { + "auxiliary_loss_clip": 0.0645593, + "auxiliary_loss_mlp": 0.01273082, + "balance_loss_clip": 0.06288286, + "balance_loss_mlp": 0.01258378, + "epoch": 0.458800541109274, + "flos": 34577341683840.0, + "grad_norm": 2.0263904819558243, + "language_loss": 0.72204614, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.79933631, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14715576, + "step": 7631, + "time_per_iteration": 2.8143060207366943 + }, + { + "auxiliary_loss_clip": 0.06451838, + "auxiliary_loss_mlp": 0.01269985, + "balance_loss_clip": 0.06285281, + "balance_loss_mlp": 0.0125565, + "epoch": 0.458860664361942, + "flos": 17717837690880.0, + "grad_norm": 2.417001672331849, + "language_loss": 0.71850061, + "learning_rate": 2.361563500108531e-06, + "loss": 0.79571879, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14324951, + "step": 7632, + "time_per_iteration": 2.616152048110962 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285533, + "balance_loss_mlp": 0.01258055, + "epoch": 0.45892078761460997, + "flos": 18447746607360.0, + "grad_norm": 2.3994338935229784, + "language_loss": 0.69457287, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.7718581, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7633, + "time_per_iteration": 2.544916868209839 + }, + { + "auxiliary_loss_clip": 0.06450637, + "auxiliary_loss_mlp": 0.01269265, + "balance_loss_clip": 0.06284192, + "balance_loss_mlp": 0.01255055, + "epoch": 0.45898091086727794, + "flos": 22679306449920.0, + "grad_norm": 1.6111707393144439, + "language_loss": 0.81188464, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.88908368, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14208984, + "step": 7634, + "time_per_iteration": 2.508498430252075 + }, + { + "auxiliary_loss_clip": 0.06458217, + "auxiliary_loss_mlp": 0.0127198, + "balance_loss_clip": 0.06285305, + "balance_loss_mlp": 0.01256995, + "epoch": 0.4590410341199459, + "flos": 21659396152320.0, + "grad_norm": 1.6788945577423258, + "language_loss": 0.8141619, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.89146382, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15002441, + "step": 7635, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06450347, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06285377, + "balance_loss_mlp": 0.01258095, + "epoch": 0.45910115737261387, + "flos": 36543676648320.0, + "grad_norm": 1.5202825589824251, + "language_loss": 0.65088654, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.72811085, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13995361, + "step": 7636, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.06449063, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06286588, + "balance_loss_mlp": 0.0125376, + "epoch": 0.45916128062528183, + "flos": 24425771500800.0, + "grad_norm": 1.3857173948582018, + "language_loss": 0.80552399, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.88268924, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13702393, + "step": 7637, + "time_per_iteration": 4.1112189292907715 + }, + { + "auxiliary_loss_clip": 0.06456389, + "auxiliary_loss_mlp": 0.0127208, + "balance_loss_clip": 0.06286228, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4592214038779498, + "flos": 23228687744640.0, + "grad_norm": 2.823234077565048, + "language_loss": 0.75517625, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.83246088, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14990234, + "step": 7638, + "time_per_iteration": 3.910426616668701 + }, + { + "auxiliary_loss_clip": 0.06446041, + "auxiliary_loss_mlp": 0.01269213, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01254824, + "epoch": 0.45928152713061776, + "flos": 19178200575360.0, + "grad_norm": 1.717868731304971, + "language_loss": 0.74023581, + "learning_rate": 2.358881852733989e-06, + "loss": 0.81738836, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14373779, + "step": 7639, + "time_per_iteration": 2.566300630569458 + }, + { + "auxiliary_loss_clip": 0.06454983, + "auxiliary_loss_mlp": 0.01270543, + "balance_loss_clip": 0.06286465, + "balance_loss_mlp": 0.01255165, + "epoch": 0.4593416503832857, + "flos": 22420513514880.0, + "grad_norm": 1.8698154023651474, + "language_loss": 0.683029, + "learning_rate": 2.358498705700346e-06, + "loss": 0.76028425, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15380859, + "step": 7640, + "time_per_iteration": 2.5371484756469727 + }, + { + "auxiliary_loss_clip": 0.06455723, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06285085, + "balance_loss_mlp": 0.01256454, + "epoch": 0.4594017736359537, + "flos": 18886228623360.0, + "grad_norm": 1.657871276405927, + "language_loss": 0.76190329, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.83916861, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14367676, + "step": 7641, + "time_per_iteration": 2.633190631866455 + }, + { + "auxiliary_loss_clip": 0.06450865, + "auxiliary_loss_mlp": 0.01271757, + "balance_loss_clip": 0.06281709, + "balance_loss_mlp": 0.01256749, + "epoch": 0.45946189688862166, + "flos": 20524268090880.0, + "grad_norm": 2.1109400166256753, + "language_loss": 0.75088501, + "learning_rate": 2.357732370864668e-06, + "loss": 0.82811123, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15008545, + "step": 7642, + "time_per_iteration": 2.497342824935913 + }, + { + "auxiliary_loss_clip": 0.06325873, + "auxiliary_loss_mlp": 0.01255986, + "balance_loss_clip": 0.06252096, + "balance_loss_mlp": 0.01253583, + "epoch": 0.4595220201412896, + "flos": 61422436920960.0, + "grad_norm": 0.8082143270085457, + "language_loss": 0.58238232, + "learning_rate": 2.357349183091694e-06, + "loss": 0.65820098, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.02400208, + "step": 7643, + "time_per_iteration": 2.9001851081848145 + }, + { + "auxiliary_loss_clip": 0.06454818, + "auxiliary_loss_mlp": 0.01269178, + "balance_loss_clip": 0.06279951, + "balance_loss_mlp": 0.01254467, + "epoch": 0.4595821433939576, + "flos": 23337616452480.0, + "grad_norm": 1.460564072578963, + "language_loss": 0.93123877, + "learning_rate": 2.3569659817680016e-06, + "loss": 1.00847864, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14709473, + "step": 7644, + "time_per_iteration": 3.956286668777466 + }, + { + "auxiliary_loss_clip": 0.06453376, + "auxiliary_loss_mlp": 0.01272616, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.01258591, + "epoch": 0.4596422666466256, + "flos": 14287492189440.0, + "grad_norm": 2.5856018073831954, + "language_loss": 0.82780254, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.90506244, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 7645, + "time_per_iteration": 2.5230045318603516 + }, + { + "auxiliary_loss_clip": 0.0632263, + "auxiliary_loss_mlp": 0.0125685, + "balance_loss_clip": 0.06249407, + "balance_loss_mlp": 0.01254095, + "epoch": 0.4597023898992936, + "flos": 65747188103040.0, + "grad_norm": 0.7461836102968291, + "language_loss": 0.59904981, + "learning_rate": 2.356199538526593e-06, + "loss": 0.67484462, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.02758789, + "step": 7646, + "time_per_iteration": 3.0677428245544434 + }, + { + "auxiliary_loss_clip": 0.06451902, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.01257931, + "epoch": 0.45976251315196154, + "flos": 26914430090880.0, + "grad_norm": 1.5401961064627432, + "language_loss": 0.72954202, + "learning_rate": 2.355816296637939e-06, + "loss": 0.80678499, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14465332, + "step": 7647, + "time_per_iteration": 2.5715911388397217 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01270403, + "balance_loss_clip": 0.06283608, + "balance_loss_mlp": 0.0125586, + "epoch": 0.4598226364046295, + "flos": 26625854229120.0, + "grad_norm": 1.5262276937698116, + "language_loss": 0.66966379, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.74692625, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14526367, + "step": 7648, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06453076, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.06283541, + "balance_loss_mlp": 0.01256562, + "epoch": 0.45988275965729747, + "flos": 24394395346560.0, + "grad_norm": 1.3937992948207578, + "language_loss": 0.78837889, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.86561614, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14093018, + "step": 7649, + "time_per_iteration": 3.961230754852295 + }, + { + "auxiliary_loss_clip": 0.06449774, + "auxiliary_loss_mlp": 0.01273295, + "balance_loss_clip": 0.06282938, + "balance_loss_mlp": 0.01258221, + "epoch": 0.45994288290996543, + "flos": 24542834054400.0, + "grad_norm": 2.427132979105608, + "language_loss": 0.694453, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.77168369, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15087891, + "step": 7650, + "time_per_iteration": 2.5870516300201416 + }, + { + "auxiliary_loss_clip": 0.06454967, + "auxiliary_loss_mlp": 0.01271386, + "balance_loss_clip": 0.06281558, + "balance_loss_mlp": 0.01255876, + "epoch": 0.4600030061626334, + "flos": 14835573745920.0, + "grad_norm": 2.508823744651641, + "language_loss": 0.84580773, + "learning_rate": 2.354283194302761e-06, + "loss": 0.92307127, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15515137, + "step": 7651, + "time_per_iteration": 2.4682910442352295 + }, + { + "auxiliary_loss_clip": 0.06447899, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.06282218, + "balance_loss_mlp": 0.01255567, + "epoch": 0.46006312941530136, + "flos": 18119702672640.0, + "grad_norm": 2.0398588051370536, + "language_loss": 0.75204146, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.82921767, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14160156, + "step": 7652, + "time_per_iteration": 2.533160448074341 + }, + { + "auxiliary_loss_clip": 0.06453463, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06283025, + "balance_loss_mlp": 0.01253803, + "epoch": 0.46012325266796933, + "flos": 21982157280000.0, + "grad_norm": 1.8219910575186118, + "language_loss": 0.76111704, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.83833146, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14154053, + "step": 7653, + "time_per_iteration": 2.607556104660034 + }, + { + "auxiliary_loss_clip": 0.06466014, + "auxiliary_loss_mlp": 0.01279742, + "balance_loss_clip": 0.06286691, + "balance_loss_mlp": 0.01262618, + "epoch": 0.4601833759206373, + "flos": 15273468783360.0, + "grad_norm": 1.9930521100890286, + "language_loss": 0.66339052, + "learning_rate": 2.353133226438741e-06, + "loss": 0.74084806, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.17132568, + "step": 7654, + "time_per_iteration": 2.5845115184783936 + }, + { + "auxiliary_loss_clip": 0.06450775, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01260524, + "epoch": 0.46024349917330526, + "flos": 27096299377920.0, + "grad_norm": 1.834954182024095, + "language_loss": 0.79552221, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.87276679, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1315918, + "step": 7655, + "time_per_iteration": 2.5619075298309326 + }, + { + "auxiliary_loss_clip": 0.06446843, + "auxiliary_loss_mlp": 0.01271784, + "balance_loss_clip": 0.06282479, + "balance_loss_mlp": 0.0125795, + "epoch": 0.4603036224259732, + "flos": 24469935402240.0, + "grad_norm": 1.525008853184554, + "language_loss": 0.68020397, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.7573902, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13824463, + "step": 7656, + "time_per_iteration": 2.534085988998413 + }, + { + "auxiliary_loss_clip": 0.06450829, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254249, + "epoch": 0.4603637456786412, + "flos": 28116545091840.0, + "grad_norm": 1.6883930229899933, + "language_loss": 0.81940675, + "learning_rate": 2.351983138057098e-06, + "loss": 0.89660037, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14300537, + "step": 7657, + "time_per_iteration": 2.6093909740448 + }, + { + "auxiliary_loss_clip": 0.06452166, + "auxiliary_loss_mlp": 0.01272452, + "balance_loss_clip": 0.06283732, + "balance_loss_mlp": 0.01257598, + "epoch": 0.4604238689313092, + "flos": 24355178835840.0, + "grad_norm": 1.9081069655960825, + "language_loss": 0.70684779, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.78409398, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1484375, + "step": 7658, + "time_per_iteration": 2.5257532596588135 + }, + { + "auxiliary_loss_clip": 0.06333129, + "auxiliary_loss_mlp": 0.01254207, + "balance_loss_clip": 0.06259783, + "balance_loss_mlp": 0.01251698, + "epoch": 0.4604839921839772, + "flos": 53622742337280.0, + "grad_norm": 1.3056028191134426, + "language_loss": 0.6180622, + "learning_rate": 2.351216345708928e-06, + "loss": 0.69393557, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02508545, + "step": 7659, + "time_per_iteration": 3.2051191329956055 + }, + { + "auxiliary_loss_clip": 0.06450778, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06284198, + "balance_loss_mlp": 0.01254692, + "epoch": 0.46054411543664514, + "flos": 31256428014720.0, + "grad_norm": 1.6821089703035916, + "language_loss": 0.68614, + "learning_rate": 2.350832929550336e-06, + "loss": 0.76335192, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1572876, + "step": 7660, + "time_per_iteration": 2.5768120288848877 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06285393, + "balance_loss_mlp": 0.01254843, + "epoch": 0.4606042386893131, + "flos": 24098943450240.0, + "grad_norm": 1.8024702284570222, + "language_loss": 0.76982367, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.84707713, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14782715, + "step": 7661, + "time_per_iteration": 2.5556533336639404 + }, + { + "auxiliary_loss_clip": 0.06448123, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06284644, + "balance_loss_mlp": 0.01257511, + "epoch": 0.46066436194198107, + "flos": 26585715323520.0, + "grad_norm": 1.64374674726695, + "language_loss": 0.75330603, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.8304925, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13000488, + "step": 7662, + "time_per_iteration": 2.5430636405944824 + }, + { + "auxiliary_loss_clip": 0.064645, + "auxiliary_loss_mlp": 0.01271435, + "balance_loss_clip": 0.06287506, + "balance_loss_mlp": 0.01255807, + "epoch": 0.46072448519464904, + "flos": 17779751458560.0, + "grad_norm": 2.8997354943734144, + "language_loss": 0.79542935, + "learning_rate": 2.349682601310998e-06, + "loss": 0.87278873, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15625, + "step": 7663, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.06451327, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.0628781, + "balance_loss_mlp": 0.01256344, + "epoch": 0.460784608447317, + "flos": 15091557569280.0, + "grad_norm": 1.9500633364095115, + "language_loss": 0.73664737, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.81386459, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.14050293, + "step": 7664, + "time_per_iteration": 2.5058319568634033 + }, + { + "auxiliary_loss_clip": 0.06454196, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06286658, + "balance_loss_mlp": 0.01255403, + "epoch": 0.46084473169998497, + "flos": 18594214744320.0, + "grad_norm": 1.4541358898310397, + "language_loss": 0.72731769, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.80455625, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14257812, + "step": 7665, + "time_per_iteration": 2.5651309490203857 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.01269476, + "balance_loss_clip": 0.06283794, + "balance_loss_mlp": 0.01255016, + "epoch": 0.46090485495265293, + "flos": 19499955454080.0, + "grad_norm": 1.6858212343920378, + "language_loss": 0.78057897, + "learning_rate": 2.348532153731669e-06, + "loss": 0.85779405, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14459229, + "step": 7666, + "time_per_iteration": 2.4884724617004395 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06288874, + "balance_loss_mlp": 0.01262982, + "epoch": 0.4609649782053209, + "flos": 33373339966080.0, + "grad_norm": 1.3323556356345916, + "language_loss": 0.7438637, + "learning_rate": 2.348148644753088e-06, + "loss": 0.82119334, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15270996, + "step": 7667, + "time_per_iteration": 2.6961426734924316 + }, + { + "auxiliary_loss_clip": 0.06450665, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06283414, + "balance_loss_mlp": 0.01253574, + "epoch": 0.46102510145798886, + "flos": 23775972687360.0, + "grad_norm": 1.463924526715157, + "language_loss": 0.76157856, + "learning_rate": 2.347765122572676e-06, + "loss": 0.83875835, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1373291, + "step": 7668, + "time_per_iteration": 2.517401933670044 + }, + { + "auxiliary_loss_clip": 0.06446877, + "auxiliary_loss_mlp": 0.0126819, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01254982, + "epoch": 0.4610852247106568, + "flos": 23301544469760.0, + "grad_norm": 1.5533292001822034, + "language_loss": 0.78315312, + "learning_rate": 2.347381587204975e-06, + "loss": 0.86030376, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13208008, + "step": 7669, + "time_per_iteration": 2.58445405960083 + }, + { + "auxiliary_loss_clip": 0.06450041, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06282575, + "balance_loss_mlp": 0.01251286, + "epoch": 0.4611453479633248, + "flos": 25454528403840.0, + "grad_norm": 1.739851036429443, + "language_loss": 0.83272684, + "learning_rate": 2.34699803866453e-06, + "loss": 0.90987396, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13391113, + "step": 7670, + "time_per_iteration": 2.5387001037597656 + }, + { + "auxiliary_loss_clip": 0.06451756, + "auxiliary_loss_mlp": 0.01270534, + "balance_loss_clip": 0.06288445, + "balance_loss_mlp": 0.01257129, + "epoch": 0.4612054712159928, + "flos": 21145541788800.0, + "grad_norm": 1.8274954721629995, + "language_loss": 0.63656652, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.7137894, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1340332, + "step": 7671, + "time_per_iteration": 2.5336413383483887 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251787, + "balance_loss_clip": 0.0626289, + "balance_loss_mlp": 0.01249119, + "epoch": 0.4612655944686608, + "flos": 69979754194560.0, + "grad_norm": 0.792480479203595, + "language_loss": 0.55791217, + "learning_rate": 2.346230902123583e-06, + "loss": 0.63378698, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.02670288, + "step": 7672, + "time_per_iteration": 3.2302184104919434 + }, + { + "auxiliary_loss_clip": 0.06453065, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06283592, + "balance_loss_mlp": 0.01253213, + "epoch": 0.46132571772132874, + "flos": 16842844229760.0, + "grad_norm": 2.026723370874256, + "language_loss": 0.71486014, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.79206014, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13720703, + "step": 7673, + "time_per_iteration": 2.5307891368865967 + }, + { + "auxiliary_loss_clip": 0.06444372, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06280223, + "balance_loss_mlp": 0.01254014, + "epoch": 0.4613858409739967, + "flos": 35817666946560.0, + "grad_norm": 1.6118988477871892, + "language_loss": 0.70779812, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7849164, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13446045, + "step": 7674, + "time_per_iteration": 2.67787766456604 + }, + { + "auxiliary_loss_clip": 0.06445141, + "auxiliary_loss_mlp": 0.01269162, + "balance_loss_clip": 0.06278897, + "balance_loss_mlp": 0.01255554, + "epoch": 0.4614459642266647, + "flos": 35276251789440.0, + "grad_norm": 1.4817902433092767, + "language_loss": 0.65456873, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.73171175, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1362915, + "step": 7675, + "time_per_iteration": 2.683043956756592 + }, + { + "auxiliary_loss_clip": 0.06330552, + "auxiliary_loss_mlp": 0.01253837, + "balance_loss_clip": 0.06257802, + "balance_loss_mlp": 0.01251083, + "epoch": 0.46150608747933264, + "flos": 66723311842560.0, + "grad_norm": 0.7159632658119685, + "language_loss": 0.58438665, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.66023052, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02758789, + "step": 7676, + "time_per_iteration": 3.2052080631256104 + }, + { + "auxiliary_loss_clip": 0.06331712, + "auxiliary_loss_mlp": 0.01253621, + "balance_loss_clip": 0.06258753, + "balance_loss_mlp": 0.01250806, + "epoch": 0.4615662107320006, + "flos": 55846780133760.0, + "grad_norm": 0.7666580083801284, + "language_loss": 0.62806678, + "learning_rate": 2.344312831266341e-06, + "loss": 0.70392013, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02810669, + "step": 7677, + "time_per_iteration": 5.753543853759766 + }, + { + "auxiliary_loss_clip": 0.06441256, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06278154, + "balance_loss_mlp": 0.012564, + "epoch": 0.46162633398466857, + "flos": 15488055889920.0, + "grad_norm": 2.0928007642005224, + "language_loss": 0.7694543, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.84655911, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12817383, + "step": 7678, + "time_per_iteration": 2.5979206562042236 + }, + { + "auxiliary_loss_clip": 0.06447493, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06279032, + "balance_loss_mlp": 0.01253672, + "epoch": 0.46168645723733653, + "flos": 20017667105280.0, + "grad_norm": 1.9130482273301792, + "language_loss": 0.66792345, + "learning_rate": 2.343545511426974e-06, + "loss": 0.74506873, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13360596, + "step": 7679, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06445532, + "auxiliary_loss_mlp": 0.0127232, + "balance_loss_clip": 0.06279338, + "balance_loss_mlp": 0.01259409, + "epoch": 0.4617465804900045, + "flos": 20304020833920.0, + "grad_norm": 2.6299917180378203, + "language_loss": 0.702595, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.77977353, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.12921143, + "step": 7680, + "time_per_iteration": 2.475419282913208 + }, + { + "auxiliary_loss_clip": 0.06449848, + "auxiliary_loss_mlp": 0.01274843, + "balance_loss_clip": 0.06279959, + "balance_loss_mlp": 0.01260454, + "epoch": 0.46180670374267246, + "flos": 22352897669760.0, + "grad_norm": 1.6539051623213383, + "language_loss": 0.63903129, + "learning_rate": 2.342778139478487e-06, + "loss": 0.7162782, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14398193, + "step": 7681, + "time_per_iteration": 2.518878698348999 + }, + { + "auxiliary_loss_clip": 0.06438938, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01255566, + "epoch": 0.46186682699534043, + "flos": 19900856113920.0, + "grad_norm": 1.5795449228659066, + "language_loss": 0.67458999, + "learning_rate": 2.342394433999697e-06, + "loss": 0.75165695, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12194824, + "step": 7682, + "time_per_iteration": 2.4734294414520264 + }, + { + "auxiliary_loss_clip": 0.06442823, + "auxiliary_loss_mlp": 0.01267731, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4619269502480084, + "flos": 31511573297280.0, + "grad_norm": 2.0778412213868025, + "language_loss": 0.74573362, + "learning_rate": 2.342010715537275e-06, + "loss": 0.82283914, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1317749, + "step": 7683, + "time_per_iteration": 2.5680744647979736 + }, + { + "auxiliary_loss_clip": 0.0644316, + "auxiliary_loss_mlp": 0.01269615, + "balance_loss_clip": 0.06278165, + "balance_loss_mlp": 0.01255995, + "epoch": 0.46198707350067636, + "flos": 25016465658240.0, + "grad_norm": 2.034673139361796, + "language_loss": 0.77701104, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.85413885, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13604736, + "step": 7684, + "time_per_iteration": 3.9865663051605225 + }, + { + "auxiliary_loss_clip": 0.06455924, + "auxiliary_loss_mlp": 0.01269534, + "balance_loss_clip": 0.06282193, + "balance_loss_mlp": 0.01255074, + "epoch": 0.4620471967533444, + "flos": 18297588890880.0, + "grad_norm": 1.7679070884814239, + "language_loss": 0.79849184, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.87574637, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14471436, + "step": 7685, + "time_per_iteration": 2.4874165058135986 + }, + { + "auxiliary_loss_clip": 0.06442665, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06282581, + "balance_loss_mlp": 0.01254151, + "epoch": 0.46210732000601235, + "flos": 33993607415040.0, + "grad_norm": 2.697729181890728, + "language_loss": 0.66966581, + "learning_rate": 2.340859482393731e-06, + "loss": 0.74678075, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14678955, + "step": 7686, + "time_per_iteration": 2.673029661178589 + }, + { + "auxiliary_loss_clip": 0.06450719, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06281859, + "balance_loss_mlp": 0.01255929, + "epoch": 0.4621674432586803, + "flos": 25016381804160.0, + "grad_norm": 1.8957956969587364, + "language_loss": 0.7416718, + "learning_rate": 2.340475712142296e-06, + "loss": 0.81888342, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 7687, + "time_per_iteration": 2.520526885986328 + }, + { + "auxiliary_loss_clip": 0.06441881, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254943, + "epoch": 0.4622275665113483, + "flos": 22019906344320.0, + "grad_norm": 2.1641165257521098, + "language_loss": 0.75034606, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.82745045, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13623047, + "step": 7688, + "time_per_iteration": 2.6087183952331543 + }, + { + "auxiliary_loss_clip": 0.06442745, + "auxiliary_loss_mlp": 0.01266791, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.0125375, + "epoch": 0.46228768976401624, + "flos": 24065303235840.0, + "grad_norm": 1.76695871159964, + "language_loss": 0.78822517, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.86532056, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13043213, + "step": 7689, + "time_per_iteration": 4.008488416671753 + }, + { + "auxiliary_loss_clip": 0.0644816, + "auxiliary_loss_mlp": 0.01269125, + "balance_loss_clip": 0.06278446, + "balance_loss_mlp": 0.01254116, + "epoch": 0.4623478130166842, + "flos": 26658655902720.0, + "grad_norm": 2.4003711776889936, + "language_loss": 0.56824899, + "learning_rate": 2.339324323980964e-06, + "loss": 0.6454218, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15020752, + "step": 7690, + "time_per_iteration": 2.586726665496826 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01270548, + "balance_loss_clip": 0.06281572, + "balance_loss_mlp": 0.01256421, + "epoch": 0.46240793626935217, + "flos": 20564700485760.0, + "grad_norm": 2.1153050114919387, + "language_loss": 0.83470464, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.91190875, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14135742, + "step": 7691, + "time_per_iteration": 2.5688517093658447 + }, + { + "auxiliary_loss_clip": 0.06446303, + "auxiliary_loss_mlp": 0.01268112, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01254528, + "epoch": 0.46246805952202014, + "flos": 22462706845440.0, + "grad_norm": 1.4394066258336355, + "language_loss": 0.75601387, + "learning_rate": 2.338556667513091e-06, + "loss": 0.83315802, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13604736, + "step": 7692, + "time_per_iteration": 2.537447929382324 + }, + { + "auxiliary_loss_clip": 0.06447245, + "auxiliary_loss_mlp": 0.01269367, + "balance_loss_clip": 0.06279314, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4625281827746881, + "flos": 35049673549440.0, + "grad_norm": 1.4816622996820314, + "language_loss": 0.74488908, + "learning_rate": 2.338172820014723e-06, + "loss": 0.82205522, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14038086, + "step": 7693, + "time_per_iteration": 2.655733823776245 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01269271, + "balance_loss_clip": 0.06283827, + "balance_loss_mlp": 0.01255496, + "epoch": 0.46258830602735607, + "flos": 21074907196800.0, + "grad_norm": 1.4111581138712515, + "language_loss": 0.85637844, + "learning_rate": 2.337788959692808e-06, + "loss": 0.93355894, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13781738, + "step": 7694, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06447286, + "auxiliary_loss_mlp": 0.01268569, + "balance_loss_clip": 0.06280261, + "balance_loss_mlp": 0.01254979, + "epoch": 0.46264842928002403, + "flos": 26184437320320.0, + "grad_norm": 2.8233556574725744, + "language_loss": 0.79577935, + "learning_rate": 2.337405086561902e-06, + "loss": 0.87293792, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13586426, + "step": 7695, + "time_per_iteration": 2.569974660873413 + }, + { + "auxiliary_loss_clip": 0.06442414, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.0628098, + "balance_loss_mlp": 0.01258432, + "epoch": 0.462708552532692, + "flos": 16769903650560.0, + "grad_norm": 1.6398131561505984, + "language_loss": 0.72464627, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.80177617, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12133789, + "step": 7696, + "time_per_iteration": 2.49324369430542 + }, + { + "auxiliary_loss_clip": 0.06448425, + "auxiliary_loss_mlp": 0.01269091, + "balance_loss_clip": 0.06281986, + "balance_loss_mlp": 0.01256139, + "epoch": 0.46276867578535996, + "flos": 15565985786880.0, + "grad_norm": 1.5682310460433448, + "language_loss": 0.69151074, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.76868594, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12945557, + "step": 7697, + "time_per_iteration": 2.5437402725219727 + }, + { + "auxiliary_loss_clip": 0.06445374, + "auxiliary_loss_mlp": 0.01272368, + "balance_loss_clip": 0.06278891, + "balance_loss_mlp": 0.01258903, + "epoch": 0.462828799038028, + "flos": 22421352055680.0, + "grad_norm": 2.477481810490018, + "language_loss": 0.84870285, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.92588031, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13470459, + "step": 7698, + "time_per_iteration": 2.5088558197021484 + }, + { + "auxiliary_loss_clip": 0.06449191, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06284188, + "balance_loss_mlp": 0.01255883, + "epoch": 0.46288892229069595, + "flos": 21075997299840.0, + "grad_norm": 1.5978854439043657, + "language_loss": 0.71711451, + "learning_rate": 2.335869466239502e-06, + "loss": 0.79430336, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13824463, + "step": 7699, + "time_per_iteration": 2.572908639907837 + }, + { + "auxiliary_loss_clip": 0.06453253, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06283245, + "balance_loss_mlp": 0.01253952, + "epoch": 0.4629490455433639, + "flos": 23192448053760.0, + "grad_norm": 3.9296940778908724, + "language_loss": 0.71994227, + "learning_rate": 2.335485529281996e-06, + "loss": 0.79715955, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7700, + "time_per_iteration": 2.5155210494995117 + }, + { + "auxiliary_loss_clip": 0.06446292, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.0628306, + "balance_loss_mlp": 0.01258608, + "epoch": 0.4630091687960319, + "flos": 18840178005120.0, + "grad_norm": 2.0219592023308297, + "language_loss": 0.72735655, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.80453324, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12780762, + "step": 7701, + "time_per_iteration": 2.5208041667938232 + }, + { + "auxiliary_loss_clip": 0.06455772, + "auxiliary_loss_mlp": 0.01272275, + "balance_loss_clip": 0.06285252, + "balance_loss_mlp": 0.01258768, + "epoch": 0.46306929204869984, + "flos": 38915733882240.0, + "grad_norm": 1.8677153728043454, + "language_loss": 0.64857763, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.72585809, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13519287, + "step": 7702, + "time_per_iteration": 2.6274476051330566 + }, + { + "auxiliary_loss_clip": 0.06443912, + "auxiliary_loss_mlp": 0.01267806, + "balance_loss_clip": 0.06281176, + "balance_loss_mlp": 0.01255181, + "epoch": 0.4631294153013678, + "flos": 19649945462400.0, + "grad_norm": 1.8702283374659314, + "language_loss": 0.73327863, + "learning_rate": 2.33433364213785e-06, + "loss": 0.81039578, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12640381, + "step": 7703, + "time_per_iteration": 2.505009651184082 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272434, + "balance_loss_clip": 0.0628607, + "balance_loss_mlp": 0.0125776, + "epoch": 0.4631895385540358, + "flos": 24615187655040.0, + "grad_norm": 1.7291559958554978, + "language_loss": 0.68770319, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.76499313, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14666748, + "step": 7704, + "time_per_iteration": 2.5337138175964355 + }, + { + "auxiliary_loss_clip": 0.06456052, + "auxiliary_loss_mlp": 0.01269056, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255693, + "epoch": 0.46324966180670374, + "flos": 26326838534400.0, + "grad_norm": 2.021774763699282, + "language_loss": 0.81483209, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.89208323, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13378906, + "step": 7705, + "time_per_iteration": 2.612663745880127 + }, + { + "auxiliary_loss_clip": 0.06459744, + "auxiliary_loss_mlp": 0.01269987, + "balance_loss_clip": 0.06288762, + "balance_loss_mlp": 0.01256313, + "epoch": 0.4633097850593717, + "flos": 19245816420480.0, + "grad_norm": 1.7146225700720175, + "language_loss": 0.77885628, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.85615361, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13684082, + "step": 7706, + "time_per_iteration": 2.508925437927246 + }, + { + "auxiliary_loss_clip": 0.06446654, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01256254, + "epoch": 0.46336990831203967, + "flos": 22789660677120.0, + "grad_norm": 1.8229249281456994, + "language_loss": 0.70008546, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.77725136, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13671875, + "step": 7707, + "time_per_iteration": 2.5517148971557617 + }, + { + "auxiliary_loss_clip": 0.06460145, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06290638, + "balance_loss_mlp": 0.01255716, + "epoch": 0.46343003156470763, + "flos": 38218668566400.0, + "grad_norm": 2.701141573629833, + "language_loss": 0.61044616, + "learning_rate": 2.332413576865791e-06, + "loss": 0.68774569, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14093018, + "step": 7708, + "time_per_iteration": 2.6566975116729736 + }, + { + "auxiliary_loss_clip": 0.06457859, + "auxiliary_loss_mlp": 0.01269726, + "balance_loss_clip": 0.06291145, + "balance_loss_mlp": 0.01255946, + "epoch": 0.4634901548173756, + "flos": 31946156098560.0, + "grad_norm": 2.0418964495503125, + "language_loss": 0.77915132, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.85642713, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13781738, + "step": 7709, + "time_per_iteration": 2.6596858501434326 + }, + { + "auxiliary_loss_clip": 0.06459823, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06291819, + "balance_loss_mlp": 0.01256756, + "epoch": 0.46355027807004356, + "flos": 20088469405440.0, + "grad_norm": 1.5745013311626586, + "language_loss": 0.77581245, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.85312593, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.14764404, + "step": 7710, + "time_per_iteration": 2.5101842880249023 + }, + { + "auxiliary_loss_clip": 0.06457606, + "auxiliary_loss_mlp": 0.01274408, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01260151, + "epoch": 0.4636104013227116, + "flos": 24068280055680.0, + "grad_norm": 2.3601088939338086, + "language_loss": 0.73606086, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.81338096, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14257812, + "step": 7711, + "time_per_iteration": 2.590855598449707 + }, + { + "auxiliary_loss_clip": 0.06459524, + "auxiliary_loss_mlp": 0.01272046, + "balance_loss_clip": 0.06293879, + "balance_loss_mlp": 0.01257354, + "epoch": 0.46367052457537955, + "flos": 23921392648320.0, + "grad_norm": 1.4235356855228358, + "language_loss": 0.71632046, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7936362, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14685059, + "step": 7712, + "time_per_iteration": 2.524653434753418 + }, + { + "auxiliary_loss_clip": 0.06464949, + "auxiliary_loss_mlp": 0.01272658, + "balance_loss_clip": 0.06290694, + "balance_loss_mlp": 0.01257709, + "epoch": 0.4637306478280475, + "flos": 26403846036480.0, + "grad_norm": 2.2505033505731493, + "language_loss": 0.73737693, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.81475306, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14941406, + "step": 7713, + "time_per_iteration": 2.5624618530273438 + }, + { + "auxiliary_loss_clip": 0.06466722, + "auxiliary_loss_mlp": 0.01276857, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01261372, + "epoch": 0.4637907710807155, + "flos": 21987104670720.0, + "grad_norm": 1.4954624193011212, + "language_loss": 0.58918363, + "learning_rate": 2.3301090827294e-06, + "loss": 0.66661942, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15466309, + "step": 7714, + "time_per_iteration": 2.510551929473877 + }, + { + "auxiliary_loss_clip": 0.06456332, + "auxiliary_loss_mlp": 0.01271959, + "balance_loss_clip": 0.06290398, + "balance_loss_mlp": 0.01257427, + "epoch": 0.46385089433338345, + "flos": 12427234894080.0, + "grad_norm": 2.7033660685293186, + "language_loss": 0.70470357, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.78198647, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14538574, + "step": 7715, + "time_per_iteration": 2.533158779144287 + }, + { + "auxiliary_loss_clip": 0.06470867, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06294338, + "balance_loss_mlp": 0.01255731, + "epoch": 0.4639110175860514, + "flos": 23922692386560.0, + "grad_norm": 1.7790063066577455, + "language_loss": 0.68472731, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.762137, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14355469, + "step": 7716, + "time_per_iteration": 4.020689249038696 + }, + { + "auxiliary_loss_clip": 0.06459275, + "auxiliary_loss_mlp": 0.01270908, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01255858, + "epoch": 0.4639711408387194, + "flos": 25307263653120.0, + "grad_norm": 1.603260424737227, + "language_loss": 0.81029081, + "learning_rate": 2.328956666474691e-06, + "loss": 0.88759267, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1505127, + "step": 7717, + "time_per_iteration": 3.932593584060669 + }, + { + "auxiliary_loss_clip": 0.06454346, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06284629, + "balance_loss_mlp": 0.01258127, + "epoch": 0.46403126409138734, + "flos": 21217643827200.0, + "grad_norm": 1.6983648240686933, + "language_loss": 0.73560178, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.81287599, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14929199, + "step": 7718, + "time_per_iteration": 2.567814350128174 + }, + { + "auxiliary_loss_clip": 0.06461985, + "auxiliary_loss_mlp": 0.0127191, + "balance_loss_clip": 0.06294554, + "balance_loss_mlp": 0.01257384, + "epoch": 0.4640913873440553, + "flos": 35854325907840.0, + "grad_norm": 1.9528130818693374, + "language_loss": 0.70908272, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.78642172, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14526367, + "step": 7719, + "time_per_iteration": 2.6412456035614014 + }, + { + "auxiliary_loss_clip": 0.06458225, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06287955, + "balance_loss_mlp": 0.01258793, + "epoch": 0.46415151059672327, + "flos": 19171282613760.0, + "grad_norm": 2.2400961683609473, + "language_loss": 0.86823237, + "learning_rate": 2.327804137953357e-06, + "loss": 0.94553995, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13745117, + "step": 7720, + "time_per_iteration": 2.5479180812835693 + }, + { + "auxiliary_loss_clip": 0.06346954, + "auxiliary_loss_mlp": 0.01257869, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.01255387, + "epoch": 0.46421163384939124, + "flos": 58932841207680.0, + "grad_norm": 0.7060507258277461, + "language_loss": 0.54935473, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.62540293, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02481079, + "step": 7721, + "time_per_iteration": 3.185922861099243 + }, + { + "auxiliary_loss_clip": 0.06453753, + "auxiliary_loss_mlp": 0.01271222, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01257227, + "epoch": 0.4642717571020592, + "flos": 20163590190720.0, + "grad_norm": 1.901448408880664, + "language_loss": 0.80108112, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.87833083, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13995361, + "step": 7722, + "time_per_iteration": 2.524707317352295 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06282455, + "balance_loss_mlp": 0.0125627, + "epoch": 0.46433188035472717, + "flos": 25053208473600.0, + "grad_norm": 1.90118065677523, + "language_loss": 0.78278601, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.86003315, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1427002, + "step": 7723, + "time_per_iteration": 3.9820849895477295 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01267351, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01253046, + "epoch": 0.4643920036073952, + "flos": 28083366074880.0, + "grad_norm": 1.6378874340525207, + "language_loss": 0.68861282, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7657671, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14306641, + "step": 7724, + "time_per_iteration": 2.550832748413086 + }, + { + "auxiliary_loss_clip": 0.06449208, + "auxiliary_loss_mlp": 0.01272875, + "balance_loss_clip": 0.06283656, + "balance_loss_mlp": 0.01259297, + "epoch": 0.46445212686006315, + "flos": 18375267225600.0, + "grad_norm": 2.354559005563411, + "language_loss": 0.67722934, + "learning_rate": 2.325883008671415e-06, + "loss": 0.7544502, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13592529, + "step": 7725, + "time_per_iteration": 2.534698009490967 + }, + { + "auxiliary_loss_clip": 0.0644237, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01258108, + "epoch": 0.4645122501127311, + "flos": 31729514567040.0, + "grad_norm": 1.5959059771038482, + "language_loss": 0.65303701, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.73016763, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12585449, + "step": 7726, + "time_per_iteration": 2.6071393489837646 + }, + { + "auxiliary_loss_clip": 0.06452325, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06286149, + "balance_loss_mlp": 0.01255312, + "epoch": 0.4645723733653991, + "flos": 23775553416960.0, + "grad_norm": 2.198219591713496, + "language_loss": 0.75535023, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.83256185, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13525391, + "step": 7727, + "time_per_iteration": 2.5323383808135986 + }, + { + "auxiliary_loss_clip": 0.06449004, + "auxiliary_loss_mlp": 0.01272292, + "balance_loss_clip": 0.06281407, + "balance_loss_mlp": 0.01258166, + "epoch": 0.46463249661806705, + "flos": 33153805468800.0, + "grad_norm": 1.912145195790545, + "language_loss": 0.78694946, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.86416245, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14147949, + "step": 7728, + "time_per_iteration": 3.998812437057495 + }, + { + "auxiliary_loss_clip": 0.06450211, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06282613, + "balance_loss_mlp": 0.0125658, + "epoch": 0.464692619870735, + "flos": 18301865448960.0, + "grad_norm": 2.3670866338465295, + "language_loss": 0.76134968, + "learning_rate": 2.324345882723155e-06, + "loss": 0.83855414, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13659668, + "step": 7729, + "time_per_iteration": 2.459913730621338 + }, + { + "auxiliary_loss_clip": 0.06449223, + "auxiliary_loss_mlp": 0.01270726, + "balance_loss_clip": 0.06283462, + "balance_loss_mlp": 0.01257339, + "epoch": 0.464752743123403, + "flos": 22644659986560.0, + "grad_norm": 1.7402612149106196, + "language_loss": 0.80316758, + "learning_rate": 2.323961570451588e-06, + "loss": 0.88036704, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13378906, + "step": 7730, + "time_per_iteration": 2.5472798347473145 + }, + { + "auxiliary_loss_clip": 0.06447513, + "auxiliary_loss_mlp": 0.01272657, + "balance_loss_clip": 0.06282953, + "balance_loss_mlp": 0.01258924, + "epoch": 0.46481286637607094, + "flos": 20418316202880.0, + "grad_norm": 1.544685409716396, + "language_loss": 0.77440143, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.85160315, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13726807, + "step": 7731, + "time_per_iteration": 2.539971351623535 + }, + { + "auxiliary_loss_clip": 0.06444095, + "auxiliary_loss_mlp": 0.01267001, + "balance_loss_clip": 0.06280014, + "balance_loss_mlp": 0.01253984, + "epoch": 0.4648729896287389, + "flos": 34283692650240.0, + "grad_norm": 1.8393249998070078, + "language_loss": 0.66022158, + "learning_rate": 2.323192909069061e-06, + "loss": 0.73733258, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13006592, + "step": 7732, + "time_per_iteration": 2.6860389709472656 + }, + { + "auxiliary_loss_clip": 0.0645274, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01254474, + "epoch": 0.4649331128814069, + "flos": 21327704565120.0, + "grad_norm": 2.1920635353287157, + "language_loss": 0.73225021, + "learning_rate": 2.32280855998725e-06, + "loss": 0.8094635, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14123535, + "step": 7733, + "time_per_iteration": 2.4875564575195312 + }, + { + "auxiliary_loss_clip": 0.06338679, + "auxiliary_loss_mlp": 0.01252754, + "balance_loss_clip": 0.0626616, + "balance_loss_mlp": 0.0124981, + "epoch": 0.46499323613407484, + "flos": 58325082744960.0, + "grad_norm": 1.3051386869973822, + "language_loss": 0.52022988, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.5961442, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02941895, + "step": 7734, + "time_per_iteration": 3.0869898796081543 + }, + { + "auxiliary_loss_clip": 0.0644846, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06283916, + "balance_loss_mlp": 0.01257856, + "epoch": 0.4650533593867428, + "flos": 10894308773760.0, + "grad_norm": 2.170877243914886, + "language_loss": 0.75776118, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.83495891, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13464355, + "step": 7735, + "time_per_iteration": 2.478837490081787 + }, + { + "auxiliary_loss_clip": 0.06441534, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.0627993, + "balance_loss_mlp": 0.01255009, + "epoch": 0.46511348263941077, + "flos": 19980756581760.0, + "grad_norm": 2.0032469234086507, + "language_loss": 0.6994068, + "learning_rate": 2.321655439354519e-06, + "loss": 0.77650702, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13482666, + "step": 7736, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.06442849, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.0628303, + "balance_loss_mlp": 0.01256237, + "epoch": 0.46517360589207873, + "flos": 19683795312000.0, + "grad_norm": 1.6634794649969447, + "language_loss": 0.72674608, + "learning_rate": 2.321271041396427e-06, + "loss": 0.80385697, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12005615, + "step": 7737, + "time_per_iteration": 2.5038952827453613 + }, + { + "auxiliary_loss_clip": 0.06449911, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01254603, + "epoch": 0.46523372914474675, + "flos": 16878203452800.0, + "grad_norm": 1.9711860161800356, + "language_loss": 0.84095049, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.91813183, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1361084, + "step": 7738, + "time_per_iteration": 2.5216240882873535 + }, + { + "auxiliary_loss_clip": 0.06338458, + "auxiliary_loss_mlp": 0.01253722, + "balance_loss_clip": 0.06265976, + "balance_loss_mlp": 0.01250617, + "epoch": 0.4652938523974147, + "flos": 53458188917760.0, + "grad_norm": 0.7399188166866549, + "language_loss": 0.57646966, + "learning_rate": 2.320502208946932e-06, + "loss": 0.65239149, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.03102112, + "step": 7739, + "time_per_iteration": 3.215662717819214 + }, + { + "auxiliary_loss_clip": 0.06450304, + "auxiliary_loss_mlp": 0.01271295, + "balance_loss_clip": 0.06285876, + "balance_loss_mlp": 0.01257299, + "epoch": 0.4653539756500827, + "flos": 15236642113920.0, + "grad_norm": 1.7449085109148506, + "language_loss": 0.85184145, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.92905748, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14013672, + "step": 7740, + "time_per_iteration": 2.4736168384552 + }, + { + "auxiliary_loss_clip": 0.0644415, + "auxiliary_loss_mlp": 0.01270653, + "balance_loss_clip": 0.06281894, + "balance_loss_mlp": 0.01256706, + "epoch": 0.46541409890275065, + "flos": 23738978309760.0, + "grad_norm": 1.5125636475233326, + "language_loss": 0.76338875, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.84053683, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1394043, + "step": 7741, + "time_per_iteration": 2.56061053276062 + }, + { + "auxiliary_loss_clip": 0.06456167, + "auxiliary_loss_mlp": 0.01268672, + "balance_loss_clip": 0.06284943, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4654742221554186, + "flos": 20853150566400.0, + "grad_norm": 1.6688490987186926, + "language_loss": 0.81291914, + "learning_rate": 2.319348869158064e-06, + "loss": 0.89016759, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13812256, + "step": 7742, + "time_per_iteration": 2.5372226238250732 + }, + { + "auxiliary_loss_clip": 0.06456183, + "auxiliary_loss_mlp": 0.01268485, + "balance_loss_clip": 0.06287557, + "balance_loss_mlp": 0.01254264, + "epoch": 0.4655343454080866, + "flos": 20711210549760.0, + "grad_norm": 1.6329017257985423, + "language_loss": 0.72620338, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.80345011, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14227295, + "step": 7743, + "time_per_iteration": 2.561323404312134 + }, + { + "auxiliary_loss_clip": 0.0644543, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06280947, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46559446866075455, + "flos": 18995912017920.0, + "grad_norm": 1.7294678893011792, + "language_loss": 0.71235406, + "learning_rate": 2.318579915392483e-06, + "loss": 0.78950727, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13842773, + "step": 7744, + "time_per_iteration": 2.491428852081299 + }, + { + "auxiliary_loss_clip": 0.06446386, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01252513, + "epoch": 0.4656545919134225, + "flos": 34505030010240.0, + "grad_norm": 1.6678897715471863, + "language_loss": 0.84893715, + "learning_rate": 2.31819542038153e-06, + "loss": 0.92605066, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12451172, + "step": 7745, + "time_per_iteration": 2.759547233581543 + }, + { + "auxiliary_loss_clip": 0.064444, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01255824, + "epoch": 0.4657147151660905, + "flos": 24316465449600.0, + "grad_norm": 1.3285756054685907, + "language_loss": 0.73465878, + "learning_rate": 2.317810913304574e-06, + "loss": 0.81178808, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.12701416, + "step": 7746, + "time_per_iteration": 2.5268633365631104 + }, + { + "auxiliary_loss_clip": 0.064431, + "auxiliary_loss_mlp": 0.01272209, + "balance_loss_clip": 0.06282558, + "balance_loss_mlp": 0.0125931, + "epoch": 0.46577483841875844, + "flos": 58807743390720.0, + "grad_norm": 1.6027404056917662, + "language_loss": 0.69721079, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.77436388, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12896729, + "step": 7747, + "time_per_iteration": 2.8772974014282227 + }, + { + "auxiliary_loss_clip": 0.06441785, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06279266, + "balance_loss_mlp": 0.01255631, + "epoch": 0.4658349616714264, + "flos": 31330081353600.0, + "grad_norm": 1.8250767057505617, + "language_loss": 0.68153578, + "learning_rate": 2.317041863010978e-06, + "loss": 0.75864553, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13543701, + "step": 7748, + "time_per_iteration": 2.576828956604004 + }, + { + "auxiliary_loss_clip": 0.06449303, + "auxiliary_loss_mlp": 0.01269068, + "balance_loss_clip": 0.06280029, + "balance_loss_mlp": 0.01254768, + "epoch": 0.46589508492409437, + "flos": 14864601985920.0, + "grad_norm": 2.1691376792383554, + "language_loss": 0.64591479, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.72309858, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14306641, + "step": 7749, + "time_per_iteration": 2.5408928394317627 + }, + { + "auxiliary_loss_clip": 0.06452534, + "auxiliary_loss_mlp": 0.01273929, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.01258795, + "epoch": 0.46595520817676234, + "flos": 12900908424960.0, + "grad_norm": 2.0171049134441237, + "language_loss": 0.74442625, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.82169086, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.15142822, + "step": 7750, + "time_per_iteration": 2.4698846340179443 + }, + { + "auxiliary_loss_clip": 0.06444734, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06276895, + "balance_loss_mlp": 0.01255811, + "epoch": 0.46601533142943036, + "flos": 32862504349440.0, + "grad_norm": 1.8980956421649817, + "language_loss": 0.7426213, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.81977308, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14624023, + "step": 7751, + "time_per_iteration": 2.6534221172332764 + }, + { + "auxiliary_loss_clip": 0.06449904, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06280084, + "balance_loss_mlp": 0.01253017, + "epoch": 0.4660754546820983, + "flos": 19972496954880.0, + "grad_norm": 1.7579709538150943, + "language_loss": 0.73910719, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.81627846, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14202881, + "step": 7752, + "time_per_iteration": 2.474492311477661 + }, + { + "auxiliary_loss_clip": 0.06447943, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06279718, + "balance_loss_mlp": 0.01254578, + "epoch": 0.4661355779347663, + "flos": 26695482572160.0, + "grad_norm": 2.190938043745359, + "language_loss": 0.69726032, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7744258, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14038086, + "step": 7753, + "time_per_iteration": 2.604612350463867 + }, + { + "auxiliary_loss_clip": 0.06438763, + "auxiliary_loss_mlp": 0.01269724, + "balance_loss_clip": 0.0627787, + "balance_loss_mlp": 0.01256777, + "epoch": 0.46619570118743425, + "flos": 20965726926720.0, + "grad_norm": 1.7706266197381177, + "language_loss": 0.73293746, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.81002235, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7754, + "time_per_iteration": 2.491225242614746 + }, + { + "auxiliary_loss_clip": 0.06444383, + "auxiliary_loss_mlp": 0.01271714, + "balance_loss_clip": 0.06278208, + "balance_loss_mlp": 0.01256855, + "epoch": 0.4662558244401022, + "flos": 24433024878720.0, + "grad_norm": 1.5728879839910523, + "language_loss": 0.79001075, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.8671717, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14868164, + "step": 7755, + "time_per_iteration": 2.562178134918213 + }, + { + "auxiliary_loss_clip": 0.06436031, + "auxiliary_loss_mlp": 0.01269294, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01256181, + "epoch": 0.4663159476927702, + "flos": 20601820644480.0, + "grad_norm": 1.5633103047544015, + "language_loss": 0.72593671, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.80299002, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13116455, + "step": 7756, + "time_per_iteration": 4.01608943939209 + }, + { + "auxiliary_loss_clip": 0.06436817, + "auxiliary_loss_mlp": 0.01269611, + "balance_loss_clip": 0.06276436, + "balance_loss_mlp": 0.01256897, + "epoch": 0.46637607094543815, + "flos": 25668235042560.0, + "grad_norm": 1.701604485790762, + "language_loss": 0.7836898, + "learning_rate": 2.313580543272274e-06, + "loss": 0.86075413, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12719727, + "step": 7757, + "time_per_iteration": 2.555097818374634 + }, + { + "auxiliary_loss_clip": 0.06441291, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06277295, + "balance_loss_mlp": 0.01261123, + "epoch": 0.4664361941981061, + "flos": 24279722634240.0, + "grad_norm": 1.9711907960618857, + "language_loss": 0.66213286, + "learning_rate": 2.313195892540705e-06, + "loss": 0.73928982, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13275146, + "step": 7758, + "time_per_iteration": 2.569962739944458 + }, + { + "auxiliary_loss_clip": 0.06442615, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.0627957, + "balance_loss_mlp": 0.01260629, + "epoch": 0.4664963174507741, + "flos": 18411800405760.0, + "grad_norm": 1.9738824417509344, + "language_loss": 0.74950838, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.826666, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12518311, + "step": 7759, + "time_per_iteration": 2.47729229927063 + }, + { + "auxiliary_loss_clip": 0.06440781, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06281125, + "balance_loss_mlp": 0.01259827, + "epoch": 0.46655644070344204, + "flos": 22461616742400.0, + "grad_norm": 3.1770723580201103, + "language_loss": 0.77710176, + "learning_rate": 2.312426555462893e-06, + "loss": 0.85423636, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12860107, + "step": 7760, + "time_per_iteration": 2.555143117904663 + }, + { + "auxiliary_loss_clip": 0.06438316, + "auxiliary_loss_mlp": 0.01270754, + "balance_loss_clip": 0.06279285, + "balance_loss_mlp": 0.01256675, + "epoch": 0.46661656395611, + "flos": 13813525169280.0, + "grad_norm": 1.6658245877843647, + "language_loss": 0.7447418, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.82183254, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.14099121, + "step": 7761, + "time_per_iteration": 2.493032217025757 + }, + { + "auxiliary_loss_clip": 0.06446707, + "auxiliary_loss_mlp": 0.01275728, + "balance_loss_clip": 0.06281132, + "balance_loss_mlp": 0.0126094, + "epoch": 0.466676687208778, + "flos": 21658473757440.0, + "grad_norm": 1.6817719059657052, + "language_loss": 0.78770381, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.86492819, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14788818, + "step": 7762, + "time_per_iteration": 2.5613081455230713 + }, + { + "auxiliary_loss_clip": 0.06338885, + "auxiliary_loss_mlp": 0.01268455, + "balance_loss_clip": 0.06268312, + "balance_loss_mlp": 0.01265552, + "epoch": 0.46673681046144594, + "flos": 68554163554560.0, + "grad_norm": 0.7818830178478652, + "language_loss": 0.59643799, + "learning_rate": 2.311272461028297e-06, + "loss": 0.67251134, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.0289917, + "step": 7763, + "time_per_iteration": 4.584456443786621 + }, + { + "auxiliary_loss_clip": 0.06446124, + "auxiliary_loss_mlp": 0.01269966, + "balance_loss_clip": 0.06278878, + "balance_loss_mlp": 0.01255559, + "epoch": 0.46679693371411396, + "flos": 15819789404160.0, + "grad_norm": 1.948864663001373, + "language_loss": 0.79278809, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.86994898, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14398193, + "step": 7764, + "time_per_iteration": 2.465179920196533 + }, + { + "auxiliary_loss_clip": 0.06441632, + "auxiliary_loss_mlp": 0.01267635, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01255035, + "epoch": 0.4668570569667819, + "flos": 18520393697280.0, + "grad_norm": 2.0437394229584123, + "language_loss": 0.72096646, + "learning_rate": 2.310503005696839e-06, + "loss": 0.79805923, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.12597656, + "step": 7765, + "time_per_iteration": 2.5701630115509033 + }, + { + "auxiliary_loss_clip": 0.06443523, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06278671, + "balance_loss_mlp": 0.01258141, + "epoch": 0.4669171802194499, + "flos": 19212385841280.0, + "grad_norm": 2.21059711365052, + "language_loss": 0.77947736, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.85663396, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.14001465, + "step": 7766, + "time_per_iteration": 2.481160879135132 + }, + { + "auxiliary_loss_clip": 0.06441876, + "auxiliary_loss_mlp": 0.01272138, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01258489, + "epoch": 0.46697730347211786, + "flos": 12281018319360.0, + "grad_norm": 2.232432946710323, + "language_loss": 0.65461195, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.73175204, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13653564, + "step": 7767, + "time_per_iteration": 2.5368387699127197 + }, + { + "auxiliary_loss_clip": 0.06442834, + "auxiliary_loss_mlp": 0.01272968, + "balance_loss_clip": 0.06280966, + "balance_loss_mlp": 0.01259307, + "epoch": 0.4670374267247858, + "flos": 23593516421760.0, + "grad_norm": 2.313152144280668, + "language_loss": 0.75071919, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.82787716, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13677979, + "step": 7768, + "time_per_iteration": 3.9271702766418457 + }, + { + "auxiliary_loss_clip": 0.06441817, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06279824, + "balance_loss_mlp": 0.01253697, + "epoch": 0.4670975499774538, + "flos": 15995495416320.0, + "grad_norm": 1.5695198160982793, + "language_loss": 0.71176434, + "learning_rate": 2.308963953858982e-06, + "loss": 0.7888546, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1350708, + "step": 7769, + "time_per_iteration": 2.5253636837005615 + }, + { + "auxiliary_loss_clip": 0.06441696, + "auxiliary_loss_mlp": 0.01271746, + "balance_loss_clip": 0.06279374, + "balance_loss_mlp": 0.01258305, + "epoch": 0.46715767323012175, + "flos": 15383026396800.0, + "grad_norm": 1.8223238330296296, + "language_loss": 0.81503379, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.89216816, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13446045, + "step": 7770, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.06346406, + "auxiliary_loss_mlp": 0.01251242, + "balance_loss_clip": 0.06275694, + "balance_loss_mlp": 0.01249068, + "epoch": 0.4672177964827897, + "flos": 60270774877440.0, + "grad_norm": 0.8490857527823061, + "language_loss": 0.55591935, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.63189584, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.02177429, + "step": 7771, + "time_per_iteration": 3.1719799041748047 + }, + { + "auxiliary_loss_clip": 0.064445, + "auxiliary_loss_mlp": 0.01269252, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01256234, + "epoch": 0.4672779197354577, + "flos": 27643500466560.0, + "grad_norm": 2.2149063838305363, + "language_loss": 0.65989488, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.73703241, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13024902, + "step": 7772, + "time_per_iteration": 2.616668939590454 + }, + { + "auxiliary_loss_clip": 0.06441614, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01257009, + "epoch": 0.46733804298812565, + "flos": 31402267246080.0, + "grad_norm": 2.671628135597842, + "language_loss": 0.64495057, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.72206295, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1260376, + "step": 7773, + "time_per_iteration": 2.5923900604248047 + }, + { + "auxiliary_loss_clip": 0.06442621, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01256457, + "epoch": 0.4673981662407936, + "flos": 19506747634560.0, + "grad_norm": 1.7164237292195044, + "language_loss": 0.80045915, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.87758458, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13464355, + "step": 7774, + "time_per_iteration": 2.577458620071411 + }, + { + "auxiliary_loss_clip": 0.06444994, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01254583, + "epoch": 0.4674582894934616, + "flos": 20528083451520.0, + "grad_norm": 1.5985457295090966, + "language_loss": 0.78042519, + "learning_rate": 2.306655024915726e-06, + "loss": 0.85755515, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13439941, + "step": 7775, + "time_per_iteration": 2.5538787841796875 + }, + { + "auxiliary_loss_clip": 0.06442325, + "auxiliary_loss_mlp": 0.0127297, + "balance_loss_clip": 0.06282222, + "balance_loss_mlp": 0.01259988, + "epoch": 0.46751841274612954, + "flos": 22097500824960.0, + "grad_norm": 1.8860444903676625, + "language_loss": 0.69909471, + "learning_rate": 2.306270162640694e-06, + "loss": 0.77624762, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12963867, + "step": 7776, + "time_per_iteration": 2.561692237854004 + }, + { + "auxiliary_loss_clip": 0.0644502, + "auxiliary_loss_mlp": 0.01270071, + "balance_loss_clip": 0.06284119, + "balance_loss_mlp": 0.01257244, + "epoch": 0.46757853599879756, + "flos": 26987454524160.0, + "grad_norm": 1.3861659298765134, + "language_loss": 0.74096608, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.81811702, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1282959, + "step": 7777, + "time_per_iteration": 2.536015510559082 + }, + { + "auxiliary_loss_clip": 0.06447745, + "auxiliary_loss_mlp": 0.01270612, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256921, + "epoch": 0.4676386592514655, + "flos": 24140927145600.0, + "grad_norm": 1.9470179218555579, + "language_loss": 0.69820189, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.77538544, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13690186, + "step": 7778, + "time_per_iteration": 2.548154354095459 + }, + { + "auxiliary_loss_clip": 0.06447626, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01253513, + "epoch": 0.4676987825041335, + "flos": 25490768094720.0, + "grad_norm": 1.4247023457023664, + "language_loss": 0.73440385, + "learning_rate": 2.305115506191206e-06, + "loss": 0.81155688, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14160156, + "step": 7779, + "time_per_iteration": 2.5291388034820557 + }, + { + "auxiliary_loss_clip": 0.06443821, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06285408, + "balance_loss_mlp": 0.01253379, + "epoch": 0.46775890575680146, + "flos": 21951871228800.0, + "grad_norm": 1.9613896423037807, + "language_loss": 0.72685552, + "learning_rate": 2.304730597548562e-06, + "loss": 0.80395079, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12353516, + "step": 7780, + "time_per_iteration": 2.5508480072021484 + }, + { + "auxiliary_loss_clip": 0.06447856, + "auxiliary_loss_mlp": 0.01269851, + "balance_loss_clip": 0.06280719, + "balance_loss_mlp": 0.01256273, + "epoch": 0.4678190290094694, + "flos": 25235413176960.0, + "grad_norm": 1.8471847442174032, + "language_loss": 0.74638426, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.82356131, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.13586426, + "step": 7781, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.06446712, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.0125528, + "epoch": 0.4678791522621374, + "flos": 32276254458240.0, + "grad_norm": 1.845752858447898, + "language_loss": 0.63050562, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.70766628, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.140625, + "step": 7782, + "time_per_iteration": 2.650505304336548 + }, + { + "auxiliary_loss_clip": 0.06445308, + "auxiliary_loss_mlp": 0.01268795, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.01255306, + "epoch": 0.46793927551480535, + "flos": 27052764382080.0, + "grad_norm": 2.229893941722145, + "language_loss": 0.63585413, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.71299517, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13494873, + "step": 7783, + "time_per_iteration": 2.5537588596343994 + }, + { + "auxiliary_loss_clip": 0.0645118, + "auxiliary_loss_mlp": 0.01271407, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257245, + "epoch": 0.4679993987674733, + "flos": 17463195532800.0, + "grad_norm": 2.4083561383098004, + "language_loss": 0.68662858, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7638545, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1418457, + "step": 7784, + "time_per_iteration": 2.560459613800049 + }, + { + "auxiliary_loss_clip": 0.06438549, + "auxiliary_loss_mlp": 0.01266567, + "balance_loss_clip": 0.06278238, + "balance_loss_mlp": 0.01254003, + "epoch": 0.4680595220201413, + "flos": 17170804310400.0, + "grad_norm": 1.9765250646873525, + "language_loss": 0.84616911, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.92322016, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12567139, + "step": 7785, + "time_per_iteration": 2.5567643642425537 + }, + { + "auxiliary_loss_clip": 0.06444662, + "auxiliary_loss_mlp": 0.01268089, + "balance_loss_clip": 0.06281722, + "balance_loss_mlp": 0.01254225, + "epoch": 0.46811964527280925, + "flos": 11332329592320.0, + "grad_norm": 1.9719414675879272, + "language_loss": 0.77991092, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.85703844, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13867188, + "step": 7786, + "time_per_iteration": 2.507206439971924 + }, + { + "auxiliary_loss_clip": 0.06440122, + "auxiliary_loss_mlp": 0.01265794, + "balance_loss_clip": 0.06281641, + "balance_loss_mlp": 0.01253897, + "epoch": 0.4681797685254772, + "flos": 24285508565760.0, + "grad_norm": 2.2497529795631817, + "language_loss": 0.74387538, + "learning_rate": 2.302035914315856e-06, + "loss": 0.82093459, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.11901855, + "step": 7787, + "time_per_iteration": 2.498021125793457 + }, + { + "auxiliary_loss_clip": 0.06439888, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06278901, + "balance_loss_mlp": 0.01258785, + "epoch": 0.4682398917781452, + "flos": 31658544558720.0, + "grad_norm": 1.7533783368280031, + "language_loss": 0.66132212, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.73844731, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1383667, + "step": 7788, + "time_per_iteration": 2.650092363357544 + }, + { + "auxiliary_loss_clip": 0.06441839, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46830001503081314, + "flos": 28118264100480.0, + "grad_norm": 1.5278727961877703, + "language_loss": 0.64315766, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.72025621, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.11987305, + "step": 7789, + "time_per_iteration": 2.5806198120117188 + }, + { + "auxiliary_loss_clip": 0.06338993, + "auxiliary_loss_mlp": 0.01252338, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01250063, + "epoch": 0.4683601382834811, + "flos": 57900059308800.0, + "grad_norm": 0.6904155708009142, + "language_loss": 0.61868596, + "learning_rate": 2.300880877982825e-06, + "loss": 0.69459921, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.02276611, + "step": 7790, + "time_per_iteration": 3.2271504402160645 + }, + { + "auxiliary_loss_clip": 0.06442016, + "auxiliary_loss_mlp": 0.01269711, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01257111, + "epoch": 0.46842026153614913, + "flos": 21878427525120.0, + "grad_norm": 1.6377280327187325, + "language_loss": 0.79426539, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.87138271, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12597656, + "step": 7791, + "time_per_iteration": 2.490171194076538 + }, + { + "auxiliary_loss_clip": 0.06441824, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01256899, + "epoch": 0.4684803847888171, + "flos": 24907914293760.0, + "grad_norm": 1.496703208223837, + "language_loss": 0.74930024, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.82641351, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.12573242, + "step": 7792, + "time_per_iteration": 2.5588057041168213 + }, + { + "auxiliary_loss_clip": 0.0643919, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06282478, + "balance_loss_mlp": 0.01255972, + "epoch": 0.46854050804148506, + "flos": 26259138835200.0, + "grad_norm": 1.9488467409065784, + "language_loss": 0.68353844, + "learning_rate": 2.299725738964898e-06, + "loss": 0.76060808, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.11804199, + "step": 7793, + "time_per_iteration": 2.543156147003174 + }, + { + "auxiliary_loss_clip": 0.06441274, + "auxiliary_loss_mlp": 0.01273582, + "balance_loss_clip": 0.0628298, + "balance_loss_mlp": 0.01261387, + "epoch": 0.468600631294153, + "flos": 21586204010880.0, + "grad_norm": 1.8535654365133143, + "language_loss": 0.74367434, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.82082289, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.12194824, + "step": 7794, + "time_per_iteration": 2.6082603931427 + }, + { + "auxiliary_loss_clip": 0.06445156, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255343, + "epoch": 0.468660754546821, + "flos": 25892842711680.0, + "grad_norm": 2.128212140250663, + "language_loss": 0.64027059, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.71741104, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13537598, + "step": 7795, + "time_per_iteration": 2.554871082305908 + }, + { + "auxiliary_loss_clip": 0.06440422, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06283793, + "balance_loss_mlp": 0.01253067, + "epoch": 0.46872087779948896, + "flos": 35482746977280.0, + "grad_norm": 1.4934025143707166, + "language_loss": 0.6791029, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7561695, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13171387, + "step": 7796, + "time_per_iteration": 4.070605754852295 + }, + { + "auxiliary_loss_clip": 0.06441301, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06280352, + "balance_loss_mlp": 0.0125435, + "epoch": 0.4687810010521569, + "flos": 26403720255360.0, + "grad_norm": 1.619506492510176, + "language_loss": 0.70710748, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.78419161, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12762451, + "step": 7797, + "time_per_iteration": 2.574291706085205 + }, + { + "auxiliary_loss_clip": 0.06443868, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.0628204, + "balance_loss_mlp": 0.01258472, + "epoch": 0.4688411243048249, + "flos": 19978618302720.0, + "grad_norm": 1.9026226114754317, + "language_loss": 0.67159688, + "learning_rate": 2.297800280150454e-06, + "loss": 0.74876028, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.14007568, + "step": 7798, + "time_per_iteration": 2.4703564643859863 + }, + { + "auxiliary_loss_clip": 0.06331287, + "auxiliary_loss_mlp": 0.01256102, + "balance_loss_clip": 0.06261373, + "balance_loss_mlp": 0.01253898, + "epoch": 0.46890124755749285, + "flos": 63996739983360.0, + "grad_norm": 0.926390069403038, + "language_loss": 0.64518279, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.7210567, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.02207947, + "step": 7799, + "time_per_iteration": 3.3128738403320312 + }, + { + "auxiliary_loss_clip": 0.06441961, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283548, + "balance_loss_mlp": 0.01258429, + "epoch": 0.4689613708101608, + "flos": 23775763052160.0, + "grad_norm": 1.2629628474735628, + "language_loss": 0.72331405, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.80045128, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13330078, + "step": 7800, + "time_per_iteration": 2.5339090824127197 + }, + { + "auxiliary_loss_clip": 0.06436972, + "auxiliary_loss_mlp": 0.01269738, + "balance_loss_clip": 0.06279731, + "balance_loss_mlp": 0.01257406, + "epoch": 0.4690214940628288, + "flos": 24795337933440.0, + "grad_norm": 2.7480307453946726, + "language_loss": 0.72682166, + "learning_rate": 2.296644869233568e-06, + "loss": 0.80388874, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12335205, + "step": 7801, + "time_per_iteration": 2.552154541015625 + }, + { + "auxiliary_loss_clip": 0.06449857, + "auxiliary_loss_mlp": 0.01274232, + "balance_loss_clip": 0.06283514, + "balance_loss_mlp": 0.01260094, + "epoch": 0.46908161731549675, + "flos": 18083169492480.0, + "grad_norm": 1.9453242658612842, + "language_loss": 0.62466741, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.70190829, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14135742, + "step": 7802, + "time_per_iteration": 3.9707396030426025 + }, + { + "auxiliary_loss_clip": 0.06437971, + "auxiliary_loss_mlp": 0.01270017, + "balance_loss_clip": 0.06277081, + "balance_loss_mlp": 0.01257459, + "epoch": 0.4691417405681647, + "flos": 25710554154240.0, + "grad_norm": 1.8844359624083942, + "language_loss": 0.73532665, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.81240654, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12554932, + "step": 7803, + "time_per_iteration": 2.554459810256958 + }, + { + "auxiliary_loss_clip": 0.06438211, + "auxiliary_loss_mlp": 0.01272362, + "balance_loss_clip": 0.06278156, + "balance_loss_mlp": 0.01259338, + "epoch": 0.46920186382083273, + "flos": 17462776262400.0, + "grad_norm": 1.58578754852504, + "language_loss": 0.77327907, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.85038471, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13012695, + "step": 7804, + "time_per_iteration": 2.543470621109009 + }, + { + "auxiliary_loss_clip": 0.06432682, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01254298, + "epoch": 0.4692619870735007, + "flos": 20345669112960.0, + "grad_norm": 1.787683586047485, + "language_loss": 0.77375299, + "learning_rate": 2.295104163929305e-06, + "loss": 0.8507452, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12231445, + "step": 7805, + "time_per_iteration": 2.501739740371704 + }, + { + "auxiliary_loss_clip": 0.0644381, + "auxiliary_loss_mlp": 0.01270681, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01257163, + "epoch": 0.46932211032616866, + "flos": 29504177032320.0, + "grad_norm": 1.522976757050157, + "language_loss": 0.83108258, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.90822744, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13519287, + "step": 7806, + "time_per_iteration": 2.6634225845336914 + }, + { + "auxiliary_loss_clip": 0.06437123, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01253496, + "epoch": 0.4693822335788366, + "flos": 36220202760960.0, + "grad_norm": 1.6923542734381007, + "language_loss": 0.77444482, + "learning_rate": 2.294333744076472e-06, + "loss": 0.8514812, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13006592, + "step": 7807, + "time_per_iteration": 4.0442986488342285 + }, + { + "auxiliary_loss_clip": 0.06438392, + "auxiliary_loss_mlp": 0.01270643, + "balance_loss_clip": 0.06276641, + "balance_loss_mlp": 0.01257024, + "epoch": 0.4694423568315046, + "flos": 20345124061440.0, + "grad_norm": 1.7839407979100135, + "language_loss": 0.51769608, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.59478641, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13635254, + "step": 7808, + "time_per_iteration": 2.4910712242126465 + }, + { + "auxiliary_loss_clip": 0.06328695, + "auxiliary_loss_mlp": 0.01252926, + "balance_loss_clip": 0.06259091, + "balance_loss_mlp": 0.01250451, + "epoch": 0.46950248008417256, + "flos": 64343540033280.0, + "grad_norm": 0.7688077124363479, + "language_loss": 0.57691324, + "learning_rate": 2.293563279578978e-06, + "loss": 0.65272945, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.0247345, + "step": 7809, + "time_per_iteration": 3.055589199066162 + }, + { + "auxiliary_loss_clip": 0.06439595, + "auxiliary_loss_mlp": 0.01268316, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01254845, + "epoch": 0.4695626033368405, + "flos": 19204755120000.0, + "grad_norm": 2.3576337237105425, + "language_loss": 0.71649069, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7935698, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13470459, + "step": 7810, + "time_per_iteration": 2.5001537799835205 + }, + { + "auxiliary_loss_clip": 0.06435918, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01259113, + "epoch": 0.4696227265895085, + "flos": 23009027466240.0, + "grad_norm": 3.6880824309964617, + "language_loss": 0.81146425, + "learning_rate": 2.29279277055369e-06, + "loss": 0.88855195, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.13726807, + "step": 7811, + "time_per_iteration": 2.5971217155456543 + }, + { + "auxiliary_loss_clip": 0.06437828, + "auxiliary_loss_mlp": 0.01267753, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.0125405, + "epoch": 0.46968284984217645, + "flos": 21877169713920.0, + "grad_norm": 1.5426371434141024, + "language_loss": 0.80606401, + "learning_rate": 2.292407499379644e-06, + "loss": 0.88311982, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13708496, + "step": 7812, + "time_per_iteration": 2.5140600204467773 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01271707, + "balance_loss_clip": 0.06277305, + "balance_loss_mlp": 0.01258445, + "epoch": 0.4697429730948444, + "flos": 19981217779200.0, + "grad_norm": 1.702985157553907, + "language_loss": 0.74653876, + "learning_rate": 2.292022217117477e-06, + "loss": 0.82360852, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13256836, + "step": 7813, + "time_per_iteration": 2.530773401260376 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01270357, + "balance_loss_clip": 0.06279637, + "balance_loss_mlp": 0.01256755, + "epoch": 0.4698030963475124, + "flos": 15161185912320.0, + "grad_norm": 2.103167897479233, + "language_loss": 0.84843278, + "learning_rate": 2.291636923781798e-06, + "loss": 0.92552245, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13604736, + "step": 7814, + "time_per_iteration": 2.550631046295166 + }, + { + "auxiliary_loss_clip": 0.06432581, + "auxiliary_loss_mlp": 0.01265742, + "balance_loss_clip": 0.06276342, + "balance_loss_mlp": 0.01252856, + "epoch": 0.46986321960018035, + "flos": 15155316126720.0, + "grad_norm": 2.71974016097947, + "language_loss": 0.82219559, + "learning_rate": 2.291251619387217e-06, + "loss": 0.89917886, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12896729, + "step": 7815, + "time_per_iteration": 2.508582592010498 + }, + { + "auxiliary_loss_clip": 0.06434117, + "auxiliary_loss_mlp": 0.01273411, + "balance_loss_clip": 0.06275953, + "balance_loss_mlp": 0.01259952, + "epoch": 0.4699233428528483, + "flos": 23115021281280.0, + "grad_norm": 2.356408218131492, + "language_loss": 0.77761489, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.85469019, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13452148, + "step": 7816, + "time_per_iteration": 2.505244493484497 + }, + { + "auxiliary_loss_clip": 0.06334539, + "auxiliary_loss_mlp": 0.01254323, + "balance_loss_clip": 0.06264929, + "balance_loss_mlp": 0.01251993, + "epoch": 0.46998346610551633, + "flos": 68126917985280.0, + "grad_norm": 0.8142436419344395, + "language_loss": 0.58616334, + "learning_rate": 2.290480977479796e-06, + "loss": 0.66205192, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02325439, + "step": 7817, + "time_per_iteration": 3.1171398162841797 + }, + { + "auxiliary_loss_clip": 0.0643587, + "auxiliary_loss_mlp": 0.01268626, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01255119, + "epoch": 0.4700435893581843, + "flos": 24135560484480.0, + "grad_norm": 1.6087842481989176, + "language_loss": 0.7922467, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8692916, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13513184, + "step": 7818, + "time_per_iteration": 2.5133657455444336 + }, + { + "auxiliary_loss_clip": 0.06435841, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06278426, + "balance_loss_mlp": 0.01257279, + "epoch": 0.47010371261085226, + "flos": 20155624053120.0, + "grad_norm": 1.9598217577618973, + "language_loss": 0.83629054, + "learning_rate": 2.289710291512104e-06, + "loss": 0.91334999, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12841797, + "step": 7819, + "time_per_iteration": 2.512434482574463 + }, + { + "auxiliary_loss_clip": 0.06440641, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01253519, + "epoch": 0.47016383586352023, + "flos": 15127587624960.0, + "grad_norm": 1.951811924314391, + "language_loss": 0.76718354, + "learning_rate": 2.289324932042186e-06, + "loss": 0.84427238, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1472168, + "step": 7820, + "time_per_iteration": 2.4596121311187744 + }, + { + "auxiliary_loss_clip": 0.06434815, + "auxiliary_loss_mlp": 0.01270743, + "balance_loss_clip": 0.06279559, + "balance_loss_mlp": 0.01257636, + "epoch": 0.4702239591161882, + "flos": 13558044470400.0, + "grad_norm": 1.9648943700675503, + "language_loss": 0.74081844, + "learning_rate": 2.288939561601039e-06, + "loss": 0.81787401, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13116455, + "step": 7821, + "time_per_iteration": 2.4793312549591064 + }, + { + "auxiliary_loss_clip": 0.06431578, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06276228, + "balance_loss_mlp": 0.01256658, + "epoch": 0.47028408236885616, + "flos": 24282825235200.0, + "grad_norm": 1.6413236035832721, + "language_loss": 0.89491117, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.97191548, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12207031, + "step": 7822, + "time_per_iteration": 2.5880398750305176 + }, + { + "auxiliary_loss_clip": 0.06433522, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06277143, + "balance_loss_mlp": 0.01254062, + "epoch": 0.4703442056215241, + "flos": 22863565578240.0, + "grad_norm": 1.438932852866735, + "language_loss": 0.79699898, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.87399733, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12255859, + "step": 7823, + "time_per_iteration": 2.5661919116973877 + }, + { + "auxiliary_loss_clip": 0.06324597, + "auxiliary_loss_mlp": 0.01253174, + "balance_loss_clip": 0.06255165, + "balance_loss_mlp": 0.01250784, + "epoch": 0.4704043288741921, + "flos": 69262381463040.0, + "grad_norm": 0.6854102840454825, + "language_loss": 0.56514406, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.64092177, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02386475, + "step": 7824, + "time_per_iteration": 3.223728656768799 + }, + { + "auxiliary_loss_clip": 0.06442541, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06281068, + "balance_loss_mlp": 0.01255837, + "epoch": 0.47046445212686006, + "flos": 18046971728640.0, + "grad_norm": 1.8116047863427858, + "language_loss": 0.81242847, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.88955039, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13818359, + "step": 7825, + "time_per_iteration": 2.4815890789031982 + }, + { + "auxiliary_loss_clip": 0.06441189, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01257443, + "epoch": 0.470524575379528, + "flos": 23958261244800.0, + "grad_norm": 2.19673184020816, + "language_loss": 0.67126369, + "learning_rate": 2.287012545338324e-06, + "loss": 0.74838167, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1317749, + "step": 7826, + "time_per_iteration": 2.5820834636688232 + }, + { + "auxiliary_loss_clip": 0.06443623, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01254824, + "epoch": 0.470584698632196, + "flos": 18119367256320.0, + "grad_norm": 1.7021383964965269, + "language_loss": 0.8395251, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.91664219, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13250732, + "step": 7827, + "time_per_iteration": 2.4966769218444824 + }, + { + "auxiliary_loss_clip": 0.06333943, + "auxiliary_loss_mlp": 0.01253247, + "balance_loss_clip": 0.06264865, + "balance_loss_mlp": 0.01250913, + "epoch": 0.47064482188486395, + "flos": 57268555413120.0, + "grad_norm": 0.786622619089935, + "language_loss": 0.55656797, + "learning_rate": 2.286241662546122e-06, + "loss": 0.63243991, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02328491, + "step": 7828, + "time_per_iteration": 3.1594009399414062 + }, + { + "auxiliary_loss_clip": 0.06439656, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01254743, + "epoch": 0.4707049451375319, + "flos": 17900922862080.0, + "grad_norm": 1.8377127056601934, + "language_loss": 0.80904895, + "learning_rate": 2.285856204861245e-06, + "loss": 0.88612556, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13256836, + "step": 7829, + "time_per_iteration": 2.485140800476074 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.0126024, + "epoch": 0.47076506839019994, + "flos": 25240402494720.0, + "grad_norm": 1.2696703606336757, + "language_loss": 0.76018727, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.83728784, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.12359619, + "step": 7830, + "time_per_iteration": 2.6114325523376465 + }, + { + "auxiliary_loss_clip": 0.06438384, + "auxiliary_loss_mlp": 0.01269492, + "balance_loss_clip": 0.06283822, + "balance_loss_mlp": 0.01257016, + "epoch": 0.4708251916428679, + "flos": 13484684620800.0, + "grad_norm": 2.037519777934202, + "language_loss": 0.78570348, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.86278224, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12463379, + "step": 7831, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01255365, + "epoch": 0.47088531489553587, + "flos": 30154646678400.0, + "grad_norm": 1.667499960909574, + "language_loss": 0.7574442, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.83460832, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.140625, + "step": 7832, + "time_per_iteration": 2.6298487186431885 + }, + { + "auxiliary_loss_clip": 0.06434175, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.0627791, + "balance_loss_mlp": 0.01256844, + "epoch": 0.47094543814820383, + "flos": 21804648405120.0, + "grad_norm": 1.2855995862723888, + "language_loss": 0.74791807, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.82493854, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1104126, + "step": 7833, + "time_per_iteration": 2.5464203357696533 + }, + { + "auxiliary_loss_clip": 0.06437977, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06281009, + "balance_loss_mlp": 0.01254118, + "epoch": 0.4710055614008718, + "flos": 23009698298880.0, + "grad_norm": 1.569702279619268, + "language_loss": 0.76145566, + "learning_rate": 2.283928754133762e-06, + "loss": 0.83849978, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12329102, + "step": 7834, + "time_per_iteration": 2.6125214099884033 + }, + { + "auxiliary_loss_clip": 0.06433094, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 0.06278115, + "balance_loss_mlp": 0.01254078, + "epoch": 0.47106568465353976, + "flos": 42751256601600.0, + "grad_norm": 1.4292072421609816, + "language_loss": 0.66957295, + "learning_rate": 2.283543231629972e-06, + "loss": 0.74656606, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12158203, + "step": 7835, + "time_per_iteration": 5.518744707107544 + }, + { + "auxiliary_loss_clip": 0.06330478, + "auxiliary_loss_mlp": 0.01256395, + "balance_loss_clip": 0.06261497, + "balance_loss_mlp": 0.01253791, + "epoch": 0.4711258079062077, + "flos": 68571116807040.0, + "grad_norm": 0.853960187866431, + "language_loss": 0.62259066, + "learning_rate": 2.283157698374194e-06, + "loss": 0.69845939, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.02604675, + "step": 7836, + "time_per_iteration": 3.1000564098358154 + }, + { + "auxiliary_loss_clip": 0.06439401, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254006, + "epoch": 0.4711859311588757, + "flos": 25453522154880.0, + "grad_norm": 1.6974399997165228, + "language_loss": 0.69606686, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.7731331, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13238525, + "step": 7837, + "time_per_iteration": 2.5282108783721924 + }, + { + "auxiliary_loss_clip": 0.06437849, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01255061, + "epoch": 0.47124605441154366, + "flos": 21988488263040.0, + "grad_norm": 1.9658270715858404, + "language_loss": 0.66562694, + "learning_rate": 2.282386599665153e-06, + "loss": 0.74268925, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13311768, + "step": 7838, + "time_per_iteration": 2.5846638679504395 + }, + { + "auxiliary_loss_clip": 0.06440166, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255082, + "epoch": 0.4713061776642116, + "flos": 25420049648640.0, + "grad_norm": 5.850528361960432, + "language_loss": 0.77699667, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.85408199, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1328125, + "step": 7839, + "time_per_iteration": 2.5414958000183105 + }, + { + "auxiliary_loss_clip": 0.06429788, + "auxiliary_loss_mlp": 0.01268311, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.0125592, + "epoch": 0.4713663009168796, + "flos": 26549559486720.0, + "grad_norm": 2.242315176037199, + "language_loss": 0.73086643, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.80784744, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12390137, + "step": 7840, + "time_per_iteration": 2.5519280433654785 + }, + { + "auxiliary_loss_clip": 0.06431505, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01251562, + "epoch": 0.47142642416954755, + "flos": 23630426945280.0, + "grad_norm": 1.566587637557085, + "language_loss": 0.75317335, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.83012575, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1217041, + "step": 7841, + "time_per_iteration": 2.552835702896118 + }, + { + "auxiliary_loss_clip": 0.06436779, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06277694, + "balance_loss_mlp": 0.01252947, + "epoch": 0.4714865474222155, + "flos": 22316783760000.0, + "grad_norm": 1.5550986710562988, + "language_loss": 0.70513815, + "learning_rate": 2.280844273866501e-06, + "loss": 0.78216577, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13049316, + "step": 7842, + "time_per_iteration": 3.933955192565918 + }, + { + "auxiliary_loss_clip": 0.06436103, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255891, + "epoch": 0.4715466706748835, + "flos": 17828317699200.0, + "grad_norm": 1.9804632158033957, + "language_loss": 0.79634649, + "learning_rate": 2.280458665756177e-06, + "loss": 0.87339324, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12677002, + "step": 7843, + "time_per_iteration": 2.4907753467559814 + }, + { + "auxiliary_loss_clip": 0.06434722, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.0125301, + "epoch": 0.4716067939275515, + "flos": 23666289292800.0, + "grad_norm": 1.6302002599700955, + "language_loss": 0.74402809, + "learning_rate": 2.280073047010832e-06, + "loss": 0.82102847, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12298584, + "step": 7844, + "time_per_iteration": 2.5746476650238037 + }, + { + "auxiliary_loss_clip": 0.06436022, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.0627865, + "balance_loss_mlp": 0.01257138, + "epoch": 0.47166691718021947, + "flos": 17935778960640.0, + "grad_norm": 2.158450508091108, + "language_loss": 0.78678179, + "learning_rate": 2.279687417645088e-06, + "loss": 0.86384571, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13238525, + "step": 7845, + "time_per_iteration": 2.4827558994293213 + }, + { + "auxiliary_loss_clip": 0.06430048, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06273912, + "balance_loss_mlp": 0.01254991, + "epoch": 0.47172704043288743, + "flos": 26621787306240.0, + "grad_norm": 1.2653259456946966, + "language_loss": 0.73458219, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.81154698, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.11450195, + "step": 7846, + "time_per_iteration": 2.586641550064087 + }, + { + "auxiliary_loss_clip": 0.06430165, + "auxiliary_loss_mlp": 0.01268985, + "balance_loss_clip": 0.06277196, + "balance_loss_mlp": 0.01256754, + "epoch": 0.4717871636855554, + "flos": 27929225289600.0, + "grad_norm": 1.2918573904220954, + "language_loss": 0.74434412, + "learning_rate": 2.2789161271109e-06, + "loss": 0.82133555, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12243652, + "step": 7847, + "time_per_iteration": 3.984661817550659 + }, + { + "auxiliary_loss_clip": 0.06434786, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06276622, + "balance_loss_mlp": 0.01258123, + "epoch": 0.47184728693822336, + "flos": 14507571738240.0, + "grad_norm": 1.68455833448323, + "language_loss": 0.81004, + "learning_rate": 2.278530465971703e-06, + "loss": 0.88708746, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1184082, + "step": 7848, + "time_per_iteration": 2.482759714126587 + }, + { + "auxiliary_loss_clip": 0.06438575, + "auxiliary_loss_mlp": 0.01265775, + "balance_loss_clip": 0.06279046, + "balance_loss_mlp": 0.01252394, + "epoch": 0.47190741019089133, + "flos": 17862041767680.0, + "grad_norm": 1.8089027190058555, + "language_loss": 0.70106918, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.77811265, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1338501, + "step": 7849, + "time_per_iteration": 2.5101277828216553 + }, + { + "auxiliary_loss_clip": 0.06444675, + "auxiliary_loss_mlp": 0.01269385, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.0125539, + "epoch": 0.4719675334435593, + "flos": 17901384059520.0, + "grad_norm": 1.915736246727948, + "language_loss": 0.69964916, + "learning_rate": 2.277759112022224e-06, + "loss": 0.77678978, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.14001465, + "step": 7850, + "time_per_iteration": 2.46455979347229 + }, + { + "auxiliary_loss_clip": 0.06441706, + "auxiliary_loss_mlp": 0.01269243, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01255951, + "epoch": 0.47202765669622726, + "flos": 20710665498240.0, + "grad_norm": 1.953909301983903, + "language_loss": 0.75806379, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.83517331, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13305664, + "step": 7851, + "time_per_iteration": 2.5298452377319336 + }, + { + "auxiliary_loss_clip": 0.06440549, + "auxiliary_loss_mlp": 0.01271731, + "balance_loss_clip": 0.06277989, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4720877799488952, + "flos": 16365439192320.0, + "grad_norm": 1.905541371588542, + "language_loss": 0.76767981, + "learning_rate": 2.276987715942132e-06, + "loss": 0.84480262, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.14196777, + "step": 7852, + "time_per_iteration": 2.473349094390869 + }, + { + "auxiliary_loss_clip": 0.06431545, + "auxiliary_loss_mlp": 0.01270384, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01257742, + "epoch": 0.4721479032015632, + "flos": 20674509661440.0, + "grad_norm": 2.394869083314355, + "language_loss": 0.69452804, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.77154732, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12658691, + "step": 7853, + "time_per_iteration": 2.537550210952759 + }, + { + "auxiliary_loss_clip": 0.06333929, + "auxiliary_loss_mlp": 0.01250651, + "balance_loss_clip": 0.06264801, + "balance_loss_mlp": 0.01248457, + "epoch": 0.47220802645423116, + "flos": 67773367681920.0, + "grad_norm": 0.6896509796832918, + "language_loss": 0.50247812, + "learning_rate": 2.276216277848432e-06, + "loss": 0.57832396, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.02197266, + "step": 7854, + "time_per_iteration": 3.2550642490386963 + }, + { + "auxiliary_loss_clip": 0.06436136, + "auxiliary_loss_mlp": 0.0126914, + "balance_loss_clip": 0.06276229, + "balance_loss_mlp": 0.0125583, + "epoch": 0.4722681497068991, + "flos": 20927474737920.0, + "grad_norm": 1.8228483302344913, + "language_loss": 0.63672256, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.71377528, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13317871, + "step": 7855, + "time_per_iteration": 2.5252599716186523 + }, + { + "auxiliary_loss_clip": 0.06439453, + "auxiliary_loss_mlp": 0.01268333, + "balance_loss_clip": 0.06280654, + "balance_loss_mlp": 0.01255715, + "epoch": 0.4723282729595671, + "flos": 28300594584960.0, + "grad_norm": 1.8174966086465816, + "language_loss": 0.76136196, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.83843982, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1262207, + "step": 7856, + "time_per_iteration": 2.560236692428589 + }, + { + "auxiliary_loss_clip": 0.06436295, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258284, + "epoch": 0.4723883962122351, + "flos": 27132287506560.0, + "grad_norm": 1.7138943667728106, + "language_loss": 0.750875, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.8279379, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 7857, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06436294, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.0628143, + "balance_loss_mlp": 0.01258946, + "epoch": 0.47244851946490307, + "flos": 31544794241280.0, + "grad_norm": 1.4694813046790665, + "language_loss": 0.64839488, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.72546607, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11889648, + "step": 7858, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06431169, + "auxiliary_loss_mlp": 0.01271908, + "balance_loss_clip": 0.06278542, + "balance_loss_mlp": 0.01259719, + "epoch": 0.47250864271757104, + "flos": 20892828274560.0, + "grad_norm": 1.741748713475879, + "language_loss": 0.71104157, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.78807235, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12194824, + "step": 7859, + "time_per_iteration": 2.541404962539673 + }, + { + "auxiliary_loss_clip": 0.06440333, + "auxiliary_loss_mlp": 0.01270209, + "balance_loss_clip": 0.06277637, + "balance_loss_mlp": 0.01257776, + "epoch": 0.472568765970239, + "flos": 20528376940800.0, + "grad_norm": 1.7364161900477437, + "language_loss": 0.62341475, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.70052016, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.12426758, + "step": 7860, + "time_per_iteration": 2.5165910720825195 + }, + { + "auxiliary_loss_clip": 0.06438711, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06280093, + "balance_loss_mlp": 0.01254914, + "epoch": 0.47262888922290697, + "flos": 35813306534400.0, + "grad_norm": 2.092826385669962, + "language_loss": 0.72540921, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.80247205, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12658691, + "step": 7861, + "time_per_iteration": 2.6575915813446045 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.0628088, + "balance_loss_mlp": 0.01254734, + "epoch": 0.47268901247557493, + "flos": 20674006536960.0, + "grad_norm": 2.2960282018232965, + "language_loss": 0.85134012, + "learning_rate": 2.273130107677896e-06, + "loss": 0.92842054, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13555908, + "step": 7862, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.06443156, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.012566, + "epoch": 0.4727491357282429, + "flos": 19579394724480.0, + "grad_norm": 1.7759944267926648, + "language_loss": 0.84885079, + "learning_rate": 2.272744289645927e-06, + "loss": 0.92597324, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12506104, + "step": 7863, + "time_per_iteration": 2.545445442199707 + }, + { + "auxiliary_loss_clip": 0.06435807, + "auxiliary_loss_mlp": 0.01268812, + "balance_loss_clip": 0.06279373, + "balance_loss_mlp": 0.01256873, + "epoch": 0.47280925898091086, + "flos": 18222090762240.0, + "grad_norm": 1.953539417417106, + "language_loss": 0.6582734, + "learning_rate": 2.272358461271467e-06, + "loss": 0.73531955, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11950684, + "step": 7864, + "time_per_iteration": 2.4730403423309326 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06280264, + "balance_loss_mlp": 0.01257619, + "epoch": 0.4728693822335788, + "flos": 17827604939520.0, + "grad_norm": 1.945688521953863, + "language_loss": 0.65635985, + "learning_rate": 2.271972622569147e-06, + "loss": 0.73344177, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.11962891, + "step": 7865, + "time_per_iteration": 2.498135805130005 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.01270111, + "balance_loss_clip": 0.06277367, + "balance_loss_mlp": 0.01257671, + "epoch": 0.4729295054862468, + "flos": 20601359447040.0, + "grad_norm": 2.5713138482446234, + "language_loss": 0.73970878, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.81671345, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12445068, + "step": 7866, + "time_per_iteration": 2.495232582092285 + }, + { + "auxiliary_loss_clip": 0.06437797, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254347, + "epoch": 0.47298962873891476, + "flos": 23374862392320.0, + "grad_norm": 2.8570557032751522, + "language_loss": 0.83387589, + "learning_rate": 2.271200914239451e-06, + "loss": 0.91091311, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.11578369, + "step": 7867, + "time_per_iteration": 2.565706968307495 + }, + { + "auxiliary_loss_clip": 0.06430209, + "auxiliary_loss_mlp": 0.01265413, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.01253391, + "epoch": 0.4730497519915827, + "flos": 22058410095360.0, + "grad_norm": 1.6535025871822049, + "language_loss": 0.79521739, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.87217355, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12036133, + "step": 7868, + "time_per_iteration": 2.549220561981201 + }, + { + "auxiliary_loss_clip": 0.06442262, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01255169, + "epoch": 0.4731098752442507, + "flos": 21076165008000.0, + "grad_norm": 1.8227151972017304, + "language_loss": 0.75178695, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.82889056, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12945557, + "step": 7869, + "time_per_iteration": 2.5188441276550293 + }, + { + "auxiliary_loss_clip": 0.06441551, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06282122, + "balance_loss_mlp": 0.01258014, + "epoch": 0.4731699984969187, + "flos": 22535395862400.0, + "grad_norm": 1.4513841331120019, + "language_loss": 0.73749697, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.81462824, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13555908, + "step": 7870, + "time_per_iteration": 2.520761251449585 + }, + { + "auxiliary_loss_clip": 0.0644481, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06280311, + "balance_loss_mlp": 0.01259231, + "epoch": 0.4732301217495867, + "flos": 24904769765760.0, + "grad_norm": 1.9907019842809281, + "language_loss": 0.81971508, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.89689231, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13684082, + "step": 7871, + "time_per_iteration": 2.7390120029449463 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06278443, + "balance_loss_mlp": 0.01254261, + "epoch": 0.47329024500225464, + "flos": 22791128123520.0, + "grad_norm": 1.7255093919697873, + "language_loss": 0.76232624, + "learning_rate": 2.269271463701879e-06, + "loss": 0.8393662, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13189697, + "step": 7872, + "time_per_iteration": 2.6356093883514404 + }, + { + "auxiliary_loss_clip": 0.06438267, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06279084, + "balance_loss_mlp": 0.01256847, + "epoch": 0.4733503682549226, + "flos": 38705884531200.0, + "grad_norm": 1.877318740282883, + "language_loss": 0.67809367, + "learning_rate": 2.268885542903428e-06, + "loss": 0.75517869, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7873, + "time_per_iteration": 2.7092511653900146 + }, + { + "auxiliary_loss_clip": 0.06434255, + "auxiliary_loss_mlp": 0.01269292, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.0125699, + "epoch": 0.47341049150759057, + "flos": 22973584389120.0, + "grad_norm": 1.442307420398724, + "language_loss": 0.72792107, + "learning_rate": 2.26849961190881e-06, + "loss": 0.80495656, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12298584, + "step": 7874, + "time_per_iteration": 3.9462826251983643 + }, + { + "auxiliary_loss_clip": 0.06440391, + "auxiliary_loss_mlp": 0.01271103, + "balance_loss_clip": 0.06281446, + "balance_loss_mlp": 0.01258431, + "epoch": 0.47347061476025853, + "flos": 14543769502080.0, + "grad_norm": 2.253933500743018, + "language_loss": 0.65938866, + "learning_rate": 2.26811367073266e-06, + "loss": 0.7365036, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12658691, + "step": 7875, + "time_per_iteration": 4.013593435287476 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01267762, + "balance_loss_clip": 0.06284615, + "balance_loss_mlp": 0.01254571, + "epoch": 0.4735307380129265, + "flos": 30271080326400.0, + "grad_norm": 2.373261357507393, + "language_loss": 0.80868709, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.88579601, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13183594, + "step": 7876, + "time_per_iteration": 2.577624797821045 + }, + { + "auxiliary_loss_clip": 0.06439028, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.0125583, + "epoch": 0.47359086126559446, + "flos": 19397148094080.0, + "grad_norm": 1.7113236821341018, + "language_loss": 0.792979, + "learning_rate": 2.267341757894304e-06, + "loss": 0.87005162, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12402344, + "step": 7877, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.06431633, + "auxiliary_loss_mlp": 0.01269276, + "balance_loss_clip": 0.0627646, + "balance_loss_mlp": 0.01256938, + "epoch": 0.47365098451826243, + "flos": 21944995194240.0, + "grad_norm": 1.9478135029908927, + "language_loss": 0.70673579, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.78374487, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12335205, + "step": 7878, + "time_per_iteration": 2.5023298263549805 + }, + { + "auxiliary_loss_clip": 0.06432398, + "auxiliary_loss_mlp": 0.01268548, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01256382, + "epoch": 0.4737111077709304, + "flos": 25851571776000.0, + "grad_norm": 1.6314467446120229, + "language_loss": 0.75137293, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.82838243, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1217041, + "step": 7879, + "time_per_iteration": 2.623811960220337 + }, + { + "auxiliary_loss_clip": 0.06320075, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06251323, + "balance_loss_mlp": 0.01262992, + "epoch": 0.47377123102359836, + "flos": 67779461831040.0, + "grad_norm": 0.7167002771941348, + "language_loss": 0.6131798, + "learning_rate": 2.266183812641164e-06, + "loss": 0.68903732, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02690125, + "step": 7880, + "time_per_iteration": 3.159388303756714 + }, + { + "auxiliary_loss_clip": 0.06434937, + "auxiliary_loss_mlp": 0.01268898, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256035, + "epoch": 0.4738313542762663, + "flos": 24322796432640.0, + "grad_norm": 1.5964233369580554, + "language_loss": 0.68369412, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.76073253, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12866211, + "step": 7881, + "time_per_iteration": 4.010294198989868 + }, + { + "auxiliary_loss_clip": 0.06434233, + "auxiliary_loss_mlp": 0.01267509, + "balance_loss_clip": 0.06279774, + "balance_loss_mlp": 0.01255964, + "epoch": 0.4738914775289343, + "flos": 20711797528320.0, + "grad_norm": 1.8204307046333812, + "language_loss": 0.77692872, + "learning_rate": 2.265411798646092e-06, + "loss": 0.85394609, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11541748, + "step": 7882, + "time_per_iteration": 2.5205814838409424 + }, + { + "auxiliary_loss_clip": 0.06437336, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.01257208, + "epoch": 0.4739516007816023, + "flos": 25453228665600.0, + "grad_norm": 1.3763225621826927, + "language_loss": 0.76357329, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.84064174, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12304688, + "step": 7883, + "time_per_iteration": 2.5500354766845703 + }, + { + "auxiliary_loss_clip": 0.0643235, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06278035, + "balance_loss_mlp": 0.01255101, + "epoch": 0.4740117240342703, + "flos": 19980463092480.0, + "grad_norm": 1.6935272320670107, + "language_loss": 0.72225314, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.79924023, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1126709, + "step": 7884, + "time_per_iteration": 2.5347273349761963 + }, + { + "auxiliary_loss_clip": 0.06443354, + "auxiliary_loss_mlp": 0.01266451, + "balance_loss_clip": 0.06279097, + "balance_loss_mlp": 0.01252944, + "epoch": 0.47407184728693824, + "flos": 15665229348480.0, + "grad_norm": 2.6351569696409314, + "language_loss": 0.82340348, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.90050149, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13513184, + "step": 7885, + "time_per_iteration": 2.482201099395752 + }, + { + "auxiliary_loss_clip": 0.06433931, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.01259262, + "epoch": 0.4741319705396062, + "flos": 18594843649920.0, + "grad_norm": 1.913533031103811, + "language_loss": 0.7349298, + "learning_rate": 2.263867649999751e-06, + "loss": 0.81198001, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.11816406, + "step": 7886, + "time_per_iteration": 3.95589017868042 + }, + { + "auxiliary_loss_clip": 0.06445764, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.0628106, + "balance_loss_mlp": 0.01256655, + "epoch": 0.47419209379227417, + "flos": 13266114445440.0, + "grad_norm": 1.8957247676006206, + "language_loss": 0.74131465, + "learning_rate": 2.263481587786849e-06, + "loss": 0.81846249, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12384033, + "step": 7887, + "time_per_iteration": 2.558175563812256 + }, + { + "auxiliary_loss_clip": 0.06431396, + "auxiliary_loss_mlp": 0.01269479, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01257499, + "epoch": 0.47425221704494214, + "flos": 20049630238080.0, + "grad_norm": 2.0468025330010016, + "language_loss": 0.7742272, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.85123587, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11987305, + "step": 7888, + "time_per_iteration": 2.5532913208007812 + }, + { + "auxiliary_loss_clip": 0.06440586, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06282103, + "balance_loss_mlp": 0.01255978, + "epoch": 0.4743123402976101, + "flos": 27279300695040.0, + "grad_norm": 1.7248476258859713, + "language_loss": 0.72833514, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1227417, + "step": 7889, + "time_per_iteration": 2.635697603225708 + }, + { + "auxiliary_loss_clip": 0.06323753, + "auxiliary_loss_mlp": 0.01252671, + "balance_loss_clip": 0.0625556, + "balance_loss_mlp": 0.01250217, + "epoch": 0.47437246355027807, + "flos": 55410771813120.0, + "grad_norm": 0.6980000025852627, + "language_loss": 0.55692458, + "learning_rate": 2.262323341259214e-06, + "loss": 0.63268882, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.02452087, + "step": 7890, + "time_per_iteration": 3.196005344390869 + }, + { + "auxiliary_loss_clip": 0.06440383, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.06280889, + "balance_loss_mlp": 0.01255929, + "epoch": 0.47443258680294603, + "flos": 23885278738560.0, + "grad_norm": 1.7863596191541609, + "language_loss": 0.65755105, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.73464775, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13366699, + "step": 7891, + "time_per_iteration": 2.5535497665405273 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01270649, + "balance_loss_clip": 0.06284909, + "balance_loss_mlp": 0.01256892, + "epoch": 0.474492710055614, + "flos": 21983666653440.0, + "grad_norm": 2.0785188787991133, + "language_loss": 0.70081401, + "learning_rate": 2.26155112714642e-06, + "loss": 0.77800196, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 7892, + "time_per_iteration": 2.512953519821167 + }, + { + "auxiliary_loss_clip": 0.06322581, + "auxiliary_loss_mlp": 0.01253797, + "balance_loss_clip": 0.06254438, + "balance_loss_mlp": 0.01251454, + "epoch": 0.47455283330828196, + "flos": 62577186837120.0, + "grad_norm": 0.7954751994073583, + "language_loss": 0.58515328, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.66091704, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.02337646, + "step": 7893, + "time_per_iteration": 3.2652807235717773 + }, + { + "auxiliary_loss_clip": 0.06435462, + "auxiliary_loss_mlp": 0.01271377, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01259498, + "epoch": 0.47461295656094993, + "flos": 12098478199680.0, + "grad_norm": 1.6548256161788057, + "language_loss": 0.77515912, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.85222745, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.11871338, + "step": 7894, + "time_per_iteration": 2.4962351322174072 + }, + { + "auxiliary_loss_clip": 0.06436545, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06277076, + "balance_loss_mlp": 0.01254883, + "epoch": 0.4746730798136179, + "flos": 20890522287360.0, + "grad_norm": 1.8932038979458137, + "language_loss": 0.75310624, + "learning_rate": 2.260392731628497e-06, + "loss": 0.83014762, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.1270752, + "step": 7895, + "time_per_iteration": 2.536651611328125 + }, + { + "auxiliary_loss_clip": 0.06438908, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06280944, + "balance_loss_mlp": 0.012559, + "epoch": 0.4747332030662859, + "flos": 19981008144000.0, + "grad_norm": 1.9186877339725528, + "language_loss": 0.824898, + "learning_rate": 2.260006580021429e-06, + "loss": 0.90196961, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12341309, + "step": 7896, + "time_per_iteration": 2.5451180934906006 + }, + { + "auxiliary_loss_clip": 0.06438936, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01254964, + "epoch": 0.4747933263189539, + "flos": 16039701244800.0, + "grad_norm": 4.910262672985542, + "language_loss": 0.76465023, + "learning_rate": 2.259620418554886e-06, + "loss": 0.84171617, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12689209, + "step": 7897, + "time_per_iteration": 2.529157876968384 + }, + { + "auxiliary_loss_clip": 0.06443989, + "auxiliary_loss_mlp": 0.012709, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257376, + "epoch": 0.47485344957162184, + "flos": 13960370649600.0, + "grad_norm": 1.9701771451271233, + "language_loss": 0.64411497, + "learning_rate": 2.25923424724351e-06, + "loss": 0.72126389, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13519287, + "step": 7898, + "time_per_iteration": 2.4861059188842773 + }, + { + "auxiliary_loss_clip": 0.06443477, + "auxiliary_loss_mlp": 0.01269988, + "balance_loss_clip": 0.0628337, + "balance_loss_mlp": 0.01256774, + "epoch": 0.4749135728242898, + "flos": 20455352507520.0, + "grad_norm": 2.55946780946792, + "language_loss": 0.70317411, + "learning_rate": 2.258848066101946e-06, + "loss": 0.78030878, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 7899, + "time_per_iteration": 2.5035181045532227 + }, + { + "auxiliary_loss_clip": 0.06438522, + "auxiliary_loss_mlp": 0.0127023, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4749736960769578, + "flos": 28957604849280.0, + "grad_norm": 1.797290129910965, + "language_loss": 0.68821597, + "learning_rate": 2.258461875144837e-06, + "loss": 0.76530349, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12536621, + "step": 7900, + "time_per_iteration": 2.638021469116211 + }, + { + "auxiliary_loss_clip": 0.06435557, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06277159, + "balance_loss_mlp": 0.01254216, + "epoch": 0.47503381932962574, + "flos": 31946407660800.0, + "grad_norm": 2.027602507157595, + "language_loss": 0.70583236, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.78287518, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.14501953, + "step": 7901, + "time_per_iteration": 2.6210362911224365 + }, + { + "auxiliary_loss_clip": 0.06438562, + "auxiliary_loss_mlp": 0.01269369, + "balance_loss_clip": 0.06280936, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4750939425822937, + "flos": 22133782442880.0, + "grad_norm": 1.48556411263083, + "language_loss": 0.73796129, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.81504059, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12542725, + "step": 7902, + "time_per_iteration": 2.5175282955169678 + }, + { + "auxiliary_loss_clip": 0.06431635, + "auxiliary_loss_mlp": 0.01269606, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.0125747, + "epoch": 0.47515406583496167, + "flos": 20856378948480.0, + "grad_norm": 3.332476837285125, + "language_loss": 0.69285202, + "learning_rate": 2.257303243526688e-06, + "loss": 0.76986444, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.12139893, + "step": 7903, + "time_per_iteration": 2.5292611122131348 + }, + { + "auxiliary_loss_clip": 0.06430157, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06276098, + "balance_loss_mlp": 0.01255015, + "epoch": 0.47521418908762963, + "flos": 17529679347840.0, + "grad_norm": 1.464561850634071, + "language_loss": 0.72526675, + "learning_rate": 2.256917013453848e-06, + "loss": 0.80223215, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1137085, + "step": 7904, + "time_per_iteration": 2.491152286529541 + }, + { + "auxiliary_loss_clip": 0.06430416, + "auxiliary_loss_mlp": 0.01265335, + "balance_loss_clip": 0.06276643, + "balance_loss_mlp": 0.01253706, + "epoch": 0.4752743123402976, + "flos": 20565874442880.0, + "grad_norm": 1.4968424405470007, + "language_loss": 0.86079156, + "learning_rate": 2.25653077363869e-06, + "loss": 0.93774903, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.11633301, + "step": 7905, + "time_per_iteration": 2.5502467155456543 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06274827, + "balance_loss_mlp": 0.01256146, + "epoch": 0.47533443559296557, + "flos": 26368025616000.0, + "grad_norm": 2.2485080153720425, + "language_loss": 0.82345891, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.90039825, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11749268, + "step": 7906, + "time_per_iteration": 2.5368199348449707 + }, + { + "auxiliary_loss_clip": 0.06321883, + "auxiliary_loss_mlp": 0.01254668, + "balance_loss_clip": 0.06254389, + "balance_loss_mlp": 0.01251897, + "epoch": 0.47539455884563353, + "flos": 65970118690560.0, + "grad_norm": 0.659791256047387, + "language_loss": 0.5900293, + "learning_rate": 2.255758264840002e-06, + "loss": 0.66579485, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.02775574, + "step": 7907, + "time_per_iteration": 3.279963254928589 + }, + { + "auxiliary_loss_clip": 0.06431986, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06276301, + "balance_loss_mlp": 0.01256721, + "epoch": 0.4754546820983015, + "flos": 17243828743680.0, + "grad_norm": 1.7704403118247245, + "language_loss": 0.81422615, + "learning_rate": 2.255371995885765e-06, + "loss": 0.89124084, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12756348, + "step": 7908, + "time_per_iteration": 2.5366125106811523 + }, + { + "auxiliary_loss_clip": 0.0643681, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01257258, + "epoch": 0.47551480535096946, + "flos": 19831563187200.0, + "grad_norm": 1.6522879253580633, + "language_loss": 0.74338585, + "learning_rate": 2.254985717247797e-06, + "loss": 0.82045496, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12841797, + "step": 7909, + "time_per_iteration": 2.5318603515625 + }, + { + "auxiliary_loss_clip": 0.06431618, + "auxiliary_loss_mlp": 0.01267166, + "balance_loss_clip": 0.0627422, + "balance_loss_mlp": 0.01255192, + "epoch": 0.4755749286036375, + "flos": 22170525258240.0, + "grad_norm": 1.5977935042114109, + "language_loss": 0.75628603, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.83327389, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11987305, + "step": 7910, + "time_per_iteration": 2.5529162883758545 + }, + { + "auxiliary_loss_clip": 0.0643287, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06276555, + "balance_loss_mlp": 0.01253488, + "epoch": 0.47563505185630545, + "flos": 21653945637120.0, + "grad_norm": 1.8732404582916444, + "language_loss": 0.7930491, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.8700273, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11474609, + "step": 7911, + "time_per_iteration": 2.5172598361968994 + }, + { + "auxiliary_loss_clip": 0.0643772, + "auxiliary_loss_mlp": 0.01268087, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.01253854, + "epoch": 0.4756951751089734, + "flos": 20634622318080.0, + "grad_norm": 1.775078995772379, + "language_loss": 0.76487613, + "learning_rate": 2.253826823377983e-06, + "loss": 0.8419342, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.14239502, + "step": 7912, + "time_per_iteration": 2.5627753734588623 + }, + { + "auxiliary_loss_clip": 0.06432701, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01260797, + "epoch": 0.4757552983616414, + "flos": 25855932188160.0, + "grad_norm": 1.3867905424321492, + "language_loss": 0.74749589, + "learning_rate": 2.253440506151569e-06, + "loss": 0.82455844, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12762451, + "step": 7913, + "time_per_iteration": 2.539555549621582 + }, + { + "auxiliary_loss_clip": 0.06434918, + "auxiliary_loss_mlp": 0.01269661, + "balance_loss_clip": 0.06277134, + "balance_loss_mlp": 0.01257418, + "epoch": 0.47581542161430934, + "flos": 18228841015680.0, + "grad_norm": 1.9858873239790236, + "language_loss": 0.72184181, + "learning_rate": 2.253054179314666e-06, + "loss": 0.79888761, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12249756, + "step": 7914, + "time_per_iteration": 3.9911863803863525 + }, + { + "auxiliary_loss_clip": 0.06440303, + "auxiliary_loss_mlp": 0.01270006, + "balance_loss_clip": 0.06281254, + "balance_loss_mlp": 0.0125737, + "epoch": 0.4758755448669773, + "flos": 21586162083840.0, + "grad_norm": 1.8571830642758371, + "language_loss": 0.65017748, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.72728062, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12628174, + "step": 7915, + "time_per_iteration": 3.94254207611084 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01257027, + "epoch": 0.47593566811964527, + "flos": 15236474405760.0, + "grad_norm": 1.6782618347522322, + "language_loss": 0.77118516, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.84816194, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 7916, + "time_per_iteration": 2.5071310997009277 + }, + { + "auxiliary_loss_clip": 0.0642941, + "auxiliary_loss_mlp": 0.01270125, + "balance_loss_clip": 0.06275692, + "balance_loss_mlp": 0.01258842, + "epoch": 0.47599579137231324, + "flos": 21549628903680.0, + "grad_norm": 2.1020342658546878, + "language_loss": 0.64506871, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.72206402, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.112854, + "step": 7917, + "time_per_iteration": 2.660997152328491 + }, + { + "auxiliary_loss_clip": 0.06322742, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06253887, + "balance_loss_mlp": 0.01264125, + "epoch": 0.4760559146249812, + "flos": 64573388582400.0, + "grad_norm": 0.81764582989578, + "language_loss": 0.65507567, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.73097479, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03042603, + "step": 7918, + "time_per_iteration": 3.185194492340088 + }, + { + "auxiliary_loss_clip": 0.06435688, + "auxiliary_loss_mlp": 0.01270072, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01257781, + "epoch": 0.47611603787764917, + "flos": 22239943966080.0, + "grad_norm": 1.5442115166230013, + "language_loss": 0.69113988, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.76819742, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12286377, + "step": 7919, + "time_per_iteration": 2.5625159740448 + }, + { + "auxiliary_loss_clip": 0.06440815, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06280257, + "balance_loss_mlp": 0.01254966, + "epoch": 0.47617616113031713, + "flos": 22785971097600.0, + "grad_norm": 1.4153562055419862, + "language_loss": 0.75135148, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.82842833, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.11914062, + "step": 7920, + "time_per_iteration": 2.606783866882324 + }, + { + "auxiliary_loss_clip": 0.06442747, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280643, + "balance_loss_mlp": 0.01255391, + "epoch": 0.4762362843829851, + "flos": 24140633656320.0, + "grad_norm": 1.5595930907743143, + "language_loss": 0.77291155, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.85002303, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13000488, + "step": 7921, + "time_per_iteration": 4.0331573486328125 + }, + { + "auxiliary_loss_clip": 0.06441253, + "auxiliary_loss_mlp": 0.01270198, + "balance_loss_clip": 0.06281719, + "balance_loss_mlp": 0.01256859, + "epoch": 0.47629640763565306, + "flos": 22458052944000.0, + "grad_norm": 1.5318798569312555, + "language_loss": 0.78402638, + "learning_rate": 2.249963220399845e-06, + "loss": 0.86114085, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13342285, + "step": 7922, + "time_per_iteration": 2.615656614303589 + }, + { + "auxiliary_loss_clip": 0.06443102, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06280392, + "balance_loss_mlp": 0.01253426, + "epoch": 0.4763565308883211, + "flos": 11186071090560.0, + "grad_norm": 1.9566034639967664, + "language_loss": 0.72915596, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.80625618, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1350708, + "step": 7923, + "time_per_iteration": 2.495023727416992 + }, + { + "auxiliary_loss_clip": 0.06440397, + "auxiliary_loss_mlp": 0.01267365, + "balance_loss_clip": 0.06280472, + "balance_loss_mlp": 0.01255772, + "epoch": 0.47641665414098905, + "flos": 22388634236160.0, + "grad_norm": 2.175648520453788, + "language_loss": 0.82023257, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.8973102, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.11590576, + "step": 7924, + "time_per_iteration": 2.5592448711395264 + }, + { + "auxiliary_loss_clip": 0.06449094, + "auxiliary_loss_mlp": 0.01271258, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01257191, + "epoch": 0.476476777393657, + "flos": 25053166546560.0, + "grad_norm": 1.6497722763363074, + "language_loss": 0.80566549, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.88286906, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14074707, + "step": 7925, + "time_per_iteration": 2.5462217330932617 + }, + { + "auxiliary_loss_clip": 0.06433398, + "auxiliary_loss_mlp": 0.01273204, + "balance_loss_clip": 0.06273591, + "balance_loss_mlp": 0.01259984, + "epoch": 0.476536900646325, + "flos": 27276994707840.0, + "grad_norm": 1.5163925310357687, + "language_loss": 0.72183931, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.79890537, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13214111, + "step": 7926, + "time_per_iteration": 4.022697448730469 + }, + { + "auxiliary_loss_clip": 0.06443252, + "auxiliary_loss_mlp": 0.01270757, + "balance_loss_clip": 0.062805, + "balance_loss_mlp": 0.01257304, + "epoch": 0.47659702389899294, + "flos": 25308437610240.0, + "grad_norm": 2.540030120332383, + "language_loss": 0.69248974, + "learning_rate": 2.248031062546432e-06, + "loss": 0.76962984, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13452148, + "step": 7927, + "time_per_iteration": 2.651005744934082 + }, + { + "auxiliary_loss_clip": 0.06432809, + "auxiliary_loss_mlp": 0.01274998, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01262928, + "epoch": 0.4766571471516609, + "flos": 25999716994560.0, + "grad_norm": 1.8555909912878064, + "language_loss": 0.68153882, + "learning_rate": 2.247644602701045e-06, + "loss": 0.75861686, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12072754, + "step": 7928, + "time_per_iteration": 2.6001169681549072 + }, + { + "auxiliary_loss_clip": 0.06439018, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277569, + "balance_loss_mlp": 0.01254497, + "epoch": 0.4767172704043289, + "flos": 16037395257600.0, + "grad_norm": 2.030081429010121, + "language_loss": 0.79402888, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.87108904, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12506104, + "step": 7929, + "time_per_iteration": 2.4979782104492188 + }, + { + "auxiliary_loss_clip": 0.06434054, + "auxiliary_loss_mlp": 0.0127525, + "balance_loss_clip": 0.06276359, + "balance_loss_mlp": 0.01263496, + "epoch": 0.47677739365699684, + "flos": 39244113233280.0, + "grad_norm": 1.8073767988538123, + "language_loss": 0.67109072, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.74818379, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11749268, + "step": 7930, + "time_per_iteration": 2.64865779876709 + }, + { + "auxiliary_loss_clip": 0.06440657, + "auxiliary_loss_mlp": 0.01272697, + "balance_loss_clip": 0.06280986, + "balance_loss_mlp": 0.01260484, + "epoch": 0.4768375169096648, + "flos": 24724745268480.0, + "grad_norm": 1.7506463735046407, + "language_loss": 0.79864836, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.87578189, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12207031, + "step": 7931, + "time_per_iteration": 2.5824391841888428 + }, + { + "auxiliary_loss_clip": 0.06435428, + "auxiliary_loss_mlp": 0.01273232, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.01260203, + "epoch": 0.47689764016233277, + "flos": 22535270081280.0, + "grad_norm": 2.3707401208689753, + "language_loss": 0.76826382, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13043213, + "step": 7932, + "time_per_iteration": 2.510439157485962 + }, + { + "auxiliary_loss_clip": 0.06434679, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06279778, + "balance_loss_mlp": 0.01266101, + "epoch": 0.47695776341500074, + "flos": 15125742835200.0, + "grad_norm": 3.7494408598150946, + "language_loss": 0.79909194, + "learning_rate": 2.245712162906593e-06, + "loss": 0.87623, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.13012695, + "step": 7933, + "time_per_iteration": 2.5868406295776367 + }, + { + "auxiliary_loss_clip": 0.06440616, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06276172, + "balance_loss_mlp": 0.01256889, + "epoch": 0.4770178866676687, + "flos": 14683319677440.0, + "grad_norm": 1.845903856635024, + "language_loss": 0.74363738, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.8207491, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13677979, + "step": 7934, + "time_per_iteration": 2.467625141143799 + }, + { + "auxiliary_loss_clip": 0.06439498, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278646, + "balance_loss_mlp": 0.01256213, + "epoch": 0.47707800992033667, + "flos": 22572264458880.0, + "grad_norm": 2.1751877197221847, + "language_loss": 0.80426806, + "learning_rate": 2.244939121664211e-06, + "loss": 0.88135481, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7935, + "time_per_iteration": 2.57150936126709 + }, + { + "auxiliary_loss_clip": 0.06443004, + "auxiliary_loss_mlp": 0.01271494, + "balance_loss_clip": 0.06275547, + "balance_loss_mlp": 0.01257249, + "epoch": 0.4771381331730047, + "flos": 30925868457600.0, + "grad_norm": 1.696374515888555, + "language_loss": 0.71442336, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7915684, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14245605, + "step": 7936, + "time_per_iteration": 2.577134609222412 + }, + { + "auxiliary_loss_clip": 0.06440726, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06278887, + "balance_loss_mlp": 0.01254593, + "epoch": 0.47719825642567265, + "flos": 25745955304320.0, + "grad_norm": 1.9394747057802306, + "language_loss": 0.68651855, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.76359951, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.12774658, + "step": 7937, + "time_per_iteration": 2.5523571968078613 + }, + { + "auxiliary_loss_clip": 0.06332788, + "auxiliary_loss_mlp": 0.01255518, + "balance_loss_clip": 0.06264147, + "balance_loss_mlp": 0.01252959, + "epoch": 0.4772583796783406, + "flos": 66376344084480.0, + "grad_norm": 0.7063710164794027, + "language_loss": 0.56256598, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.63844901, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02558899, + "step": 7938, + "time_per_iteration": 3.3101401329040527 + }, + { + "auxiliary_loss_clip": 0.06435397, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01252927, + "epoch": 0.4773185029310086, + "flos": 22057068430080.0, + "grad_norm": 1.5498541545702798, + "language_loss": 0.89232612, + "learning_rate": 2.243392927839317e-06, + "loss": 0.96934634, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.13702393, + "step": 7939, + "time_per_iteration": 2.559797525405884 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06277393, + "balance_loss_mlp": 0.01256239, + "epoch": 0.47737862618367655, + "flos": 16733496251520.0, + "grad_norm": 2.4258721196632456, + "language_loss": 0.77298427, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.85001838, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12249756, + "step": 7940, + "time_per_iteration": 2.5268869400024414 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01269812, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01257373, + "epoch": 0.4774387494363445, + "flos": 19615508634240.0, + "grad_norm": 1.6559533080399789, + "language_loss": 0.85386801, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.930875, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12432861, + "step": 7941, + "time_per_iteration": 2.547070264816284 + }, + { + "auxiliary_loss_clip": 0.06437483, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06277451, + "balance_loss_mlp": 0.01257965, + "epoch": 0.4774988726890125, + "flos": 16659507496320.0, + "grad_norm": 1.9070361015512296, + "language_loss": 0.76308775, + "learning_rate": 2.24223318550976e-06, + "loss": 0.84016657, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12420654, + "step": 7942, + "time_per_iteration": 2.4842329025268555 + }, + { + "auxiliary_loss_clip": 0.06440963, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06282113, + "balance_loss_mlp": 0.01253601, + "epoch": 0.47755899594168044, + "flos": 20491843760640.0, + "grad_norm": 1.6294214929971118, + "language_loss": 0.64313745, + "learning_rate": 2.241846586342682e-06, + "loss": 0.72020721, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12402344, + "step": 7943, + "time_per_iteration": 2.5384066104888916 + }, + { + "auxiliary_loss_clip": 0.06444484, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06280033, + "balance_loss_mlp": 0.01253493, + "epoch": 0.4776191191943484, + "flos": 21659228444160.0, + "grad_norm": 1.6943023581153507, + "language_loss": 0.73866045, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.8157779, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13781738, + "step": 7944, + "time_per_iteration": 2.5201148986816406 + }, + { + "auxiliary_loss_clip": 0.06447009, + "auxiliary_loss_mlp": 0.01271608, + "balance_loss_clip": 0.06287117, + "balance_loss_mlp": 0.01258459, + "epoch": 0.4776792424470164, + "flos": 18776125958400.0, + "grad_norm": 2.2429214657199257, + "language_loss": 0.68437827, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.76156443, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13153076, + "step": 7945, + "time_per_iteration": 2.5126469135284424 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_clip": 0.06280819, + "balance_loss_mlp": 0.01257577, + "epoch": 0.47773936569968434, + "flos": 29723543821440.0, + "grad_norm": 1.8191434389659598, + "language_loss": 0.75203103, + "learning_rate": 2.240686733875009e-06, + "loss": 0.8291347, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12628174, + "step": 7946, + "time_per_iteration": 2.5952818393707275 + }, + { + "auxiliary_loss_clip": 0.06450987, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06288904, + "balance_loss_mlp": 0.0125368, + "epoch": 0.4777994889523523, + "flos": 24798650169600.0, + "grad_norm": 2.1264871549136566, + "language_loss": 0.79598629, + "learning_rate": 2.240300098112506e-06, + "loss": 0.87316352, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.13043213, + "step": 7947, + "time_per_iteration": 2.561429023742676 + }, + { + "auxiliary_loss_clip": 0.06437, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06282562, + "balance_loss_mlp": 0.01255302, + "epoch": 0.47785961220502027, + "flos": 17863928484480.0, + "grad_norm": 1.6733844414372485, + "language_loss": 0.73571151, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.81276667, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13220215, + "step": 7948, + "time_per_iteration": 2.5309975147247314 + }, + { + "auxiliary_loss_clip": 0.06442553, + "auxiliary_loss_mlp": 0.01267736, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01253848, + "epoch": 0.4779197354576883, + "flos": 20272770460800.0, + "grad_norm": 2.2305312131568256, + "language_loss": 0.78282905, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.85993195, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13891602, + "step": 7949, + "time_per_iteration": 2.5135691165924072 + }, + { + "auxiliary_loss_clip": 0.06441014, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06285359, + "balance_loss_mlp": 0.01251849, + "epoch": 0.47797985871035625, + "flos": 17062420654080.0, + "grad_norm": 2.4211239692864686, + "language_loss": 0.75134766, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.82839787, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12164307, + "step": 7950, + "time_per_iteration": 2.5256588459014893 + }, + { + "auxiliary_loss_clip": 0.06439517, + "auxiliary_loss_mlp": 0.01271424, + "balance_loss_clip": 0.0628176, + "balance_loss_mlp": 0.01258668, + "epoch": 0.4780399819630242, + "flos": 31366530679680.0, + "grad_norm": 1.6557560470716002, + "language_loss": 0.744519, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.82162845, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12756348, + "step": 7951, + "time_per_iteration": 2.6257662773132324 + }, + { + "auxiliary_loss_clip": 0.0644564, + "auxiliary_loss_mlp": 0.0126871, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01255925, + "epoch": 0.4781001052156922, + "flos": 24906488774400.0, + "grad_norm": 2.0941094174335, + "language_loss": 0.80880862, + "learning_rate": 2.238366782910174e-06, + "loss": 0.88595212, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12786865, + "step": 7952, + "time_per_iteration": 2.6039650440216064 + }, + { + "auxiliary_loss_clip": 0.06449462, + "auxiliary_loss_mlp": 0.01273751, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01259684, + "epoch": 0.47816022846836015, + "flos": 18703688503680.0, + "grad_norm": 1.7383850677064194, + "language_loss": 0.78965735, + "learning_rate": 2.23798009269438e-06, + "loss": 0.86688948, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14068604, + "step": 7953, + "time_per_iteration": 3.9394986629486084 + }, + { + "auxiliary_loss_clip": 0.0644647, + "auxiliary_loss_mlp": 0.0126971, + "balance_loss_clip": 0.0628321, + "balance_loss_mlp": 0.01256793, + "epoch": 0.4782203517210281, + "flos": 11981289864960.0, + "grad_norm": 2.1105030234958733, + "language_loss": 0.84721971, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.92438149, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.12921143, + "step": 7954, + "time_per_iteration": 3.9196231365203857 + }, + { + "auxiliary_loss_clip": 0.06440185, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06283759, + "balance_loss_mlp": 0.01255282, + "epoch": 0.4782804749736961, + "flos": 20819761914240.0, + "grad_norm": 1.4881886911999394, + "language_loss": 0.70481235, + "learning_rate": 2.237206685204768e-06, + "loss": 0.78189409, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1270752, + "step": 7955, + "time_per_iteration": 2.5434484481811523 + }, + { + "auxiliary_loss_clip": 0.064454, + "auxiliary_loss_mlp": 0.01270242, + "balance_loss_clip": 0.06284527, + "balance_loss_mlp": 0.01257326, + "epoch": 0.47834059822636404, + "flos": 23846816914560.0, + "grad_norm": 1.553979149808007, + "language_loss": 0.823044, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.90020043, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12902832, + "step": 7956, + "time_per_iteration": 2.545602560043335 + }, + { + "auxiliary_loss_clip": 0.06441168, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06284995, + "balance_loss_mlp": 0.01255627, + "epoch": 0.478400721479032, + "flos": 22639670668800.0, + "grad_norm": 1.9591153371347299, + "language_loss": 0.85127819, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.92838925, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.14300537, + "step": 7957, + "time_per_iteration": 2.548643112182617 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01269143, + "balance_loss_clip": 0.06285611, + "balance_loss_mlp": 0.01257001, + "epoch": 0.4784608447317, + "flos": 19361118038400.0, + "grad_norm": 7.050300940807432, + "language_loss": 0.79869133, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.87579882, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12139893, + "step": 7958, + "time_per_iteration": 2.5078237056732178 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06283723, + "balance_loss_mlp": 0.0125534, + "epoch": 0.47852096798436794, + "flos": 24027386463360.0, + "grad_norm": 1.6951891176109464, + "language_loss": 0.82802176, + "learning_rate": 2.235659762404047e-06, + "loss": 0.90512896, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1361084, + "step": 7959, + "time_per_iteration": 2.565302610397339 + }, + { + "auxiliary_loss_clip": 0.06438372, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06285324, + "balance_loss_mlp": 0.01255615, + "epoch": 0.4785810912370359, + "flos": 25673559776640.0, + "grad_norm": 2.330976037710063, + "language_loss": 0.73464501, + "learning_rate": 2.235273009326599e-06, + "loss": 0.81169969, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1149292, + "step": 7960, + "time_per_iteration": 4.027269124984741 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01270036, + "balance_loss_clip": 0.0628148, + "balance_loss_mlp": 0.01258014, + "epoch": 0.47864121448970387, + "flos": 21438226500480.0, + "grad_norm": 3.172971837567245, + "language_loss": 0.77372915, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.85079503, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12036133, + "step": 7961, + "time_per_iteration": 2.5147969722747803 + }, + { + "auxiliary_loss_clip": 0.06435739, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01255269, + "epoch": 0.47870133774237184, + "flos": 16149468493440.0, + "grad_norm": 1.5337652867811775, + "language_loss": 0.78017688, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.85721302, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12597656, + "step": 7962, + "time_per_iteration": 2.513148307800293 + }, + { + "auxiliary_loss_clip": 0.06441396, + "auxiliary_loss_mlp": 0.01270097, + "balance_loss_clip": 0.06281849, + "balance_loss_mlp": 0.01257646, + "epoch": 0.47876146099503986, + "flos": 26914094674560.0, + "grad_norm": 1.8277818369463197, + "language_loss": 0.65211046, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.7292254, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12457275, + "step": 7963, + "time_per_iteration": 2.601811647415161 + }, + { + "auxiliary_loss_clip": 0.06439337, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.06280507, + "balance_loss_mlp": 0.01253621, + "epoch": 0.4788215842477078, + "flos": 45342470989440.0, + "grad_norm": 2.309935013710649, + "language_loss": 0.77810884, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.85516727, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12890625, + "step": 7964, + "time_per_iteration": 2.747879981994629 + }, + { + "auxiliary_loss_clip": 0.06446981, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06283239, + "balance_loss_mlp": 0.01257218, + "epoch": 0.4788817075003758, + "flos": 22243801253760.0, + "grad_norm": 1.6568781202078557, + "language_loss": 0.76541996, + "learning_rate": 2.233339110409044e-06, + "loss": 0.84260774, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14587402, + "step": 7965, + "time_per_iteration": 2.562894344329834 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06281182, + "balance_loss_mlp": 0.01256434, + "epoch": 0.47894183075304375, + "flos": 16476631960320.0, + "grad_norm": 1.6972134667517975, + "language_loss": 0.74819887, + "learning_rate": 2.232952304022137e-06, + "loss": 0.82530153, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12237549, + "step": 7966, + "time_per_iteration": 4.023793697357178 + }, + { + "auxiliary_loss_clip": 0.06437664, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06279117, + "balance_loss_mlp": 0.01253033, + "epoch": 0.4790019540057117, + "flos": 24290036686080.0, + "grad_norm": 1.5237416858661557, + "language_loss": 0.73335361, + "learning_rate": 2.232565488801655e-06, + "loss": 0.81038582, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12518311, + "step": 7967, + "time_per_iteration": 2.586228847503662 + }, + { + "auxiliary_loss_clip": 0.06429637, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.01254825, + "epoch": 0.4790620772583797, + "flos": 25673601703680.0, + "grad_norm": 2.2388113154567058, + "language_loss": 0.79254079, + "learning_rate": 2.232178664762267e-06, + "loss": 0.86951417, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12896729, + "step": 7968, + "time_per_iteration": 2.569835901260376 + }, + { + "auxiliary_loss_clip": 0.06330545, + "auxiliary_loss_mlp": 0.01255481, + "balance_loss_clip": 0.06260878, + "balance_loss_mlp": 0.01252947, + "epoch": 0.47912220051104765, + "flos": 69451168711680.0, + "grad_norm": 0.7701358383106056, + "language_loss": 0.62163401, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.69749427, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02534485, + "step": 7969, + "time_per_iteration": 3.2898826599121094 + }, + { + "auxiliary_loss_clip": 0.06435778, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06281342, + "balance_loss_mlp": 0.012529, + "epoch": 0.4791823237637156, + "flos": 24175531681920.0, + "grad_norm": 1.7909857243287752, + "language_loss": 0.77847564, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.85549259, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13006592, + "step": 7970, + "time_per_iteration": 2.5170607566833496 + }, + { + "auxiliary_loss_clip": 0.06435491, + "auxiliary_loss_mlp": 0.01267513, + "balance_loss_clip": 0.06276551, + "balance_loss_mlp": 0.0125384, + "epoch": 0.4792424470163836, + "flos": 24757966212480.0, + "grad_norm": 1.6160167990193877, + "language_loss": 0.71182537, + "learning_rate": 2.231018139877349e-06, + "loss": 0.78885543, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13665771, + "step": 7971, + "time_per_iteration": 2.572124719619751 + }, + { + "auxiliary_loss_clip": 0.06436221, + "auxiliary_loss_mlp": 0.01271919, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01258836, + "epoch": 0.47930257026905154, + "flos": 23264550092160.0, + "grad_norm": 1.2950674857674533, + "language_loss": 0.80144143, + "learning_rate": 2.230631280709021e-06, + "loss": 0.87852287, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1307373, + "step": 7972, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06281324, + "balance_loss_mlp": 0.01256392, + "epoch": 0.4793626935217195, + "flos": 14069299357440.0, + "grad_norm": 2.062531710859889, + "language_loss": 0.70572007, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7828514, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13934326, + "step": 7973, + "time_per_iteration": 2.5338237285614014 + }, + { + "auxiliary_loss_clip": 0.064371, + "auxiliary_loss_mlp": 0.01270261, + "balance_loss_clip": 0.06283109, + "balance_loss_mlp": 0.0125806, + "epoch": 0.4794228167743875, + "flos": 21805319237760.0, + "grad_norm": 1.7273933233655367, + "language_loss": 0.79198468, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.86905837, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12200928, + "step": 7974, + "time_per_iteration": 2.5069854259490967 + }, + { + "auxiliary_loss_clip": 0.06339005, + "auxiliary_loss_mlp": 0.01258702, + "balance_loss_clip": 0.06269643, + "balance_loss_mlp": 0.01255866, + "epoch": 0.47948294002705544, + "flos": 66989022739200.0, + "grad_norm": 0.7443790840370731, + "language_loss": 0.53920376, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.61518085, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02832031, + "step": 7975, + "time_per_iteration": 3.2263216972351074 + }, + { + "auxiliary_loss_clip": 0.06450166, + "auxiliary_loss_mlp": 0.01269981, + "balance_loss_clip": 0.06283702, + "balance_loss_mlp": 0.0125465, + "epoch": 0.47954306327972346, + "flos": 12427444529280.0, + "grad_norm": 1.9824704830592612, + "language_loss": 0.90397954, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.98118103, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.15313721, + "step": 7976, + "time_per_iteration": 2.5806965827941895 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06284519, + "balance_loss_mlp": 0.01257629, + "epoch": 0.4796031865323914, + "flos": 18366630255360.0, + "grad_norm": 3.7288296944586166, + "language_loss": 0.73905623, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.81627262, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.15209961, + "step": 7977, + "time_per_iteration": 2.5562849044799805 + }, + { + "auxiliary_loss_clip": 0.06437217, + "auxiliary_loss_mlp": 0.01268705, + "balance_loss_clip": 0.06283021, + "balance_loss_mlp": 0.01255741, + "epoch": 0.4796633097850594, + "flos": 21841517001600.0, + "grad_norm": 1.607227573724713, + "language_loss": 0.78873986, + "learning_rate": 2.228309942555734e-06, + "loss": 0.86579907, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12976074, + "step": 7978, + "time_per_iteration": 2.558842420578003 + }, + { + "auxiliary_loss_clip": 0.06440634, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01255214, + "epoch": 0.47972343303772735, + "flos": 23443526413440.0, + "grad_norm": 1.9276236664860738, + "language_loss": 0.89800453, + "learning_rate": 2.22792302247656e-06, + "loss": 0.97510386, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.14099121, + "step": 7979, + "time_per_iteration": 2.5952987670898438 + }, + { + "auxiliary_loss_clip": 0.06446249, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06283665, + "balance_loss_mlp": 0.01256378, + "epoch": 0.4797835562903953, + "flos": 24906698409600.0, + "grad_norm": 1.4562164603157606, + "language_loss": 0.7704469, + "learning_rate": 2.227536093754523e-06, + "loss": 0.8476193, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14605713, + "step": 7980, + "time_per_iteration": 2.5736522674560547 + }, + { + "auxiliary_loss_clip": 0.06447264, + "auxiliary_loss_mlp": 0.01273404, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01258938, + "epoch": 0.4798436795430633, + "flos": 35051644120320.0, + "grad_norm": 1.875578547391537, + "language_loss": 0.71508431, + "learning_rate": 2.227149156404295e-06, + "loss": 0.79229099, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14459229, + "step": 7981, + "time_per_iteration": 2.6367290019989014 + }, + { + "auxiliary_loss_clip": 0.06439552, + "auxiliary_loss_mlp": 0.01273941, + "balance_loss_clip": 0.06281938, + "balance_loss_mlp": 0.01258998, + "epoch": 0.47990380279573125, + "flos": 20595699296640.0, + "grad_norm": 1.7763359166784585, + "language_loss": 0.70155972, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.77869463, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.14935303, + "step": 7982, + "time_per_iteration": 2.5258874893188477 + }, + { + "auxiliary_loss_clip": 0.06432236, + "auxiliary_loss_mlp": 0.0126906, + "balance_loss_clip": 0.06278554, + "balance_loss_mlp": 0.01257079, + "epoch": 0.4799639260483992, + "flos": 26366600096640.0, + "grad_norm": 1.7437778110304778, + "language_loss": 0.71608925, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.79310226, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11975098, + "step": 7983, + "time_per_iteration": 2.568826913833618 + }, + { + "auxiliary_loss_clip": 0.06340544, + "auxiliary_loss_mlp": 0.01252804, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01249972, + "epoch": 0.4800240493010672, + "flos": 70999371002880.0, + "grad_norm": 0.765879442061108, + "language_loss": 0.59357727, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.66951072, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.02828979, + "step": 7984, + "time_per_iteration": 3.1084651947021484 + }, + { + "auxiliary_loss_clip": 0.0643955, + "auxiliary_loss_mlp": 0.01275134, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01262152, + "epoch": 0.48008417255373514, + "flos": 17091406967040.0, + "grad_norm": 1.5773823669430012, + "language_loss": 0.67127079, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.74841756, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12988281, + "step": 7985, + "time_per_iteration": 2.4906041622161865 + }, + { + "auxiliary_loss_clip": 0.06439713, + "auxiliary_loss_mlp": 0.01270507, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256458, + "epoch": 0.4801442958064031, + "flos": 15418762963200.0, + "grad_norm": 1.6902399231491212, + "language_loss": 0.70749509, + "learning_rate": 2.225214340743835e-06, + "loss": 0.78459728, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14056396, + "step": 7986, + "time_per_iteration": 2.52093243598938 + }, + { + "auxiliary_loss_clip": 0.06445119, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06282695, + "balance_loss_mlp": 0.0125972, + "epoch": 0.4802044190590711, + "flos": 11478546167040.0, + "grad_norm": 1.9459651571320913, + "language_loss": 0.79178715, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.86897534, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13983154, + "step": 7987, + "time_per_iteration": 2.498640537261963 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01274239, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01260029, + "epoch": 0.48026454231173904, + "flos": 20955874072320.0, + "grad_norm": 2.568897435463935, + "language_loss": 0.75366008, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.83082712, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.14215088, + "step": 7988, + "time_per_iteration": 2.516512632369995 + }, + { + "auxiliary_loss_clip": 0.0644449, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06284034, + "balance_loss_mlp": 0.01254651, + "epoch": 0.48032466556440706, + "flos": 20454220477440.0, + "grad_norm": 2.121657383550553, + "language_loss": 0.79781222, + "learning_rate": 2.224053348748365e-06, + "loss": 0.87493527, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13140869, + "step": 7989, + "time_per_iteration": 2.5021252632141113 + }, + { + "auxiliary_loss_clip": 0.06450642, + "auxiliary_loss_mlp": 0.01272628, + "balance_loss_clip": 0.0628516, + "balance_loss_mlp": 0.01259277, + "epoch": 0.480384788817075, + "flos": 37129507269120.0, + "grad_norm": 1.6027553338262992, + "language_loss": 0.73628318, + "learning_rate": 2.223666334404724e-06, + "loss": 0.81351584, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13360596, + "step": 7990, + "time_per_iteration": 2.678316593170166 + }, + { + "auxiliary_loss_clip": 0.06340674, + "auxiliary_loss_mlp": 0.01254539, + "balance_loss_clip": 0.06272323, + "balance_loss_mlp": 0.01252124, + "epoch": 0.480444912069743, + "flos": 69572103281280.0, + "grad_norm": 0.7463246314152452, + "language_loss": 0.59028065, + "learning_rate": 2.223279311579633e-06, + "loss": 0.66623276, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02412415, + "step": 7991, + "time_per_iteration": 3.2123708724975586 + }, + { + "auxiliary_loss_clip": 0.06440669, + "auxiliary_loss_mlp": 0.0127166, + "balance_loss_clip": 0.06280738, + "balance_loss_mlp": 0.01258493, + "epoch": 0.48050503532241096, + "flos": 29829453782400.0, + "grad_norm": 1.8077991766436714, + "language_loss": 0.67425305, + "learning_rate": 2.222892280287768e-06, + "loss": 0.75137639, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1317749, + "step": 7992, + "time_per_iteration": 4.022457599639893 + }, + { + "auxiliary_loss_clip": 0.06441684, + "auxiliary_loss_mlp": 0.01270903, + "balance_loss_clip": 0.06280079, + "balance_loss_mlp": 0.01257289, + "epoch": 0.4805651585750789, + "flos": 23954865154560.0, + "grad_norm": 1.520335815005364, + "language_loss": 0.76567221, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.84279805, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13616943, + "step": 7993, + "time_per_iteration": 2.5975513458251953 + }, + { + "auxiliary_loss_clip": 0.0643717, + "auxiliary_loss_mlp": 0.012705, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01257101, + "epoch": 0.4806252818277469, + "flos": 25672385819520.0, + "grad_norm": 1.5304271246014225, + "language_loss": 0.78575444, + "learning_rate": 2.222118192362422e-06, + "loss": 0.86283118, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1340332, + "step": 7994, + "time_per_iteration": 3.9770989418029785 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01268981, + "balance_loss_clip": 0.06282856, + "balance_loss_mlp": 0.01255284, + "epoch": 0.48068540508041485, + "flos": 13157059956480.0, + "grad_norm": 1.7612496141579397, + "language_loss": 0.80023497, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.87733817, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13702393, + "step": 7995, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.06436922, + "auxiliary_loss_mlp": 0.01271915, + "balance_loss_clip": 0.06281693, + "balance_loss_mlp": 0.01259499, + "epoch": 0.4807455283330828, + "flos": 21182787728640.0, + "grad_norm": 1.7014068364920145, + "language_loss": 0.82857656, + "learning_rate": 2.2213440707461e-06, + "loss": 0.90566498, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12408447, + "step": 7996, + "time_per_iteration": 2.5223636627197266 + }, + { + "auxiliary_loss_clip": 0.06437848, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.0126104, + "epoch": 0.4808056515857508, + "flos": 12280850611200.0, + "grad_norm": 2.0553444119055095, + "language_loss": 0.81048906, + "learning_rate": 2.220956997340516e-06, + "loss": 0.88760751, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12957764, + "step": 7997, + "time_per_iteration": 2.5387723445892334 + }, + { + "auxiliary_loss_clip": 0.06439243, + "auxiliary_loss_mlp": 0.01272881, + "balance_loss_clip": 0.06278609, + "balance_loss_mlp": 0.01258886, + "epoch": 0.48086577483841875, + "flos": 24832835435520.0, + "grad_norm": 1.673774189345091, + "language_loss": 0.72584945, + "learning_rate": 2.220569915556221e-06, + "loss": 0.80297071, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.13989258, + "step": 7998, + "time_per_iteration": 2.5332131385803223 + }, + { + "auxiliary_loss_clip": 0.06438513, + "auxiliary_loss_mlp": 0.0127211, + "balance_loss_clip": 0.06282588, + "balance_loss_mlp": 0.01258931, + "epoch": 0.4809258980910867, + "flos": 24472786440960.0, + "grad_norm": 1.7584112558628078, + "language_loss": 0.71207035, + "learning_rate": 2.220182825407892e-06, + "loss": 0.78917658, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1317749, + "step": 7999, + "time_per_iteration": 2.5675172805786133 + }, + { + "auxiliary_loss_clip": 0.06447413, + "auxiliary_loss_mlp": 0.01268559, + "balance_loss_clip": 0.06285158, + "balance_loss_mlp": 0.01254581, + "epoch": 0.4809860213437547, + "flos": 21222465436800.0, + "grad_norm": 1.5803850534596136, + "language_loss": 0.71622467, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.79338437, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13983154, + "step": 8000, + "time_per_iteration": 4.0574305057525635 + }, + { + "auxiliary_loss_clip": 0.06440975, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 0.06282955, + "balance_loss_mlp": 0.01253558, + "epoch": 0.48104614459642264, + "flos": 37640929864320.0, + "grad_norm": 1.3783876991224597, + "language_loss": 0.75060636, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.82768357, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.13189697, + "step": 8001, + "time_per_iteration": 2.6750619411468506 + }, + { + "auxiliary_loss_clip": 0.06444116, + "auxiliary_loss_mlp": 0.01269598, + "balance_loss_clip": 0.06285578, + "balance_loss_mlp": 0.0125667, + "epoch": 0.48110626784909066, + "flos": 18412093895040.0, + "grad_norm": 3.3850625220280066, + "language_loss": 0.81721932, + "learning_rate": 2.219021504925493e-06, + "loss": 0.89435649, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12921143, + "step": 8002, + "time_per_iteration": 2.537611961364746 + }, + { + "auxiliary_loss_clip": 0.06444092, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06282309, + "balance_loss_mlp": 0.0125232, + "epoch": 0.48116639110175863, + "flos": 28447481992320.0, + "grad_norm": 1.6717054522334394, + "language_loss": 0.71586967, + "learning_rate": 2.218634381467819e-06, + "loss": 0.79297119, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13739014, + "step": 8003, + "time_per_iteration": 2.586836576461792 + }, + { + "auxiliary_loss_clip": 0.06435338, + "auxiliary_loss_mlp": 0.01268946, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.01256375, + "epoch": 0.4812265143544266, + "flos": 21731582044800.0, + "grad_norm": 1.5740971137450945, + "language_loss": 0.82286322, + "learning_rate": 2.218247249719507e-06, + "loss": 0.89990604, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12561035, + "step": 8004, + "time_per_iteration": 2.5606155395507812 + }, + { + "auxiliary_loss_clip": 0.06454347, + "auxiliary_loss_mlp": 0.01272857, + "balance_loss_clip": 0.06285338, + "balance_loss_mlp": 0.01258004, + "epoch": 0.48128663760709456, + "flos": 13229707046400.0, + "grad_norm": 2.0390359670143465, + "language_loss": 0.77871376, + "learning_rate": 2.217860109695239e-06, + "loss": 0.85598582, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14837646, + "step": 8005, + "time_per_iteration": 2.47816801071167 + }, + { + "auxiliary_loss_clip": 0.06444031, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01252902, + "epoch": 0.4813467608597625, + "flos": 24250317050880.0, + "grad_norm": 8.997763816911675, + "language_loss": 0.71145892, + "learning_rate": 2.217472961409692e-06, + "loss": 0.78855699, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12866211, + "step": 8006, + "time_per_iteration": 3.998465061187744 + }, + { + "auxiliary_loss_clip": 0.06443979, + "auxiliary_loss_mlp": 0.0126724, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01253502, + "epoch": 0.4814068841124305, + "flos": 27486131299200.0, + "grad_norm": 1.774717747938, + "language_loss": 0.7057631, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.78287524, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13726807, + "step": 8007, + "time_per_iteration": 2.6010959148406982 + }, + { + "auxiliary_loss_clip": 0.06445048, + "auxiliary_loss_mlp": 0.01270091, + "balance_loss_clip": 0.06283326, + "balance_loss_mlp": 0.01256382, + "epoch": 0.48146700736509845, + "flos": 19578933527040.0, + "grad_norm": 1.7543289086675633, + "language_loss": 0.72215438, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.79930574, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.137146, + "step": 8008, + "time_per_iteration": 2.5119597911834717 + }, + { + "auxiliary_loss_clip": 0.064485, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06287649, + "balance_loss_mlp": 0.01256699, + "epoch": 0.4815271306177664, + "flos": 20633448360960.0, + "grad_norm": 2.3493781090087427, + "language_loss": 0.61680824, + "learning_rate": 2.216311467132199e-06, + "loss": 0.6939944, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13421631, + "step": 8009, + "time_per_iteration": 2.531614303588867 + }, + { + "auxiliary_loss_clip": 0.06337314, + "auxiliary_loss_mlp": 0.01256915, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01254566, + "epoch": 0.4815872538704344, + "flos": 67710168904320.0, + "grad_norm": 0.8824544242806498, + "language_loss": 0.61164761, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.68758988, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.0234375, + "step": 8010, + "time_per_iteration": 3.1565909385681152 + }, + { + "auxiliary_loss_clip": 0.06445675, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01256364, + "epoch": 0.48164737712310235, + "flos": 22827451668480.0, + "grad_norm": 1.6746394307020662, + "language_loss": 0.73637664, + "learning_rate": 2.215537096576639e-06, + "loss": 0.81353462, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1373291, + "step": 8011, + "time_per_iteration": 2.6046555042266846 + }, + { + "auxiliary_loss_clip": 0.0643819, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01257887, + "epoch": 0.4817075003757703, + "flos": 23740865026560.0, + "grad_norm": 1.8215201759984196, + "language_loss": 0.79494172, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.87202752, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.125, + "step": 8012, + "time_per_iteration": 2.5538861751556396 + }, + { + "auxiliary_loss_clip": 0.06444636, + "auxiliary_loss_mlp": 0.0127321, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01259501, + "epoch": 0.4817676236284383, + "flos": 28190282284800.0, + "grad_norm": 1.6047815948624113, + "language_loss": 0.73606604, + "learning_rate": 2.214762693328326e-06, + "loss": 0.81324452, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1373291, + "step": 8013, + "time_per_iteration": 2.6944220066070557 + }, + { + "auxiliary_loss_clip": 0.06441531, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06285915, + "balance_loss_mlp": 0.01253094, + "epoch": 0.48182774688110624, + "flos": 17097360606720.0, + "grad_norm": 1.8755216355849496, + "language_loss": 0.91141838, + "learning_rate": 2.214375479481094e-06, + "loss": 0.98848319, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.11859131, + "step": 8014, + "time_per_iteration": 2.501678466796875 + }, + { + "auxiliary_loss_clip": 0.06448989, + "auxiliary_loss_mlp": 0.0126993, + "balance_loss_clip": 0.06285382, + "balance_loss_mlp": 0.01256149, + "epoch": 0.4818878701337742, + "flos": 12572780636160.0, + "grad_norm": 2.068904383285823, + "language_loss": 0.75191212, + "learning_rate": 2.213988257504722e-06, + "loss": 0.82910132, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13775635, + "step": 8015, + "time_per_iteration": 2.574915885925293 + }, + { + "auxiliary_loss_clip": 0.06450102, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06285062, + "balance_loss_mlp": 0.01254942, + "epoch": 0.48194799338644223, + "flos": 24615481144320.0, + "grad_norm": 2.7940595212226693, + "language_loss": 0.80323374, + "learning_rate": 2.213601027413894e-06, + "loss": 0.88042033, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.13604736, + "step": 8016, + "time_per_iteration": 2.545562744140625 + }, + { + "auxiliary_loss_clip": 0.06441234, + "auxiliary_loss_mlp": 0.01268233, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01255996, + "epoch": 0.4820081166391102, + "flos": 21111482304000.0, + "grad_norm": 1.7856263642868424, + "language_loss": 0.77840865, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.85550332, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12237549, + "step": 8017, + "time_per_iteration": 2.548884153366089 + }, + { + "auxiliary_loss_clip": 0.06442289, + "auxiliary_loss_mlp": 0.01274842, + "balance_loss_clip": 0.06287417, + "balance_loss_mlp": 0.01261729, + "epoch": 0.48206823989177816, + "flos": 25271569013760.0, + "grad_norm": 1.8858588216369734, + "language_loss": 0.80356038, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8807317, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13098145, + "step": 8018, + "time_per_iteration": 2.5485877990722656 + }, + { + "auxiliary_loss_clip": 0.06443836, + "auxiliary_loss_mlp": 0.01268171, + "balance_loss_clip": 0.06283845, + "balance_loss_mlp": 0.01255177, + "epoch": 0.4821283631444461, + "flos": 24652056251520.0, + "grad_norm": 1.8013341989070415, + "language_loss": 0.76402384, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.84114391, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12988281, + "step": 8019, + "time_per_iteration": 2.583380937576294 + }, + { + "auxiliary_loss_clip": 0.06444359, + "auxiliary_loss_mlp": 0.01271658, + "balance_loss_clip": 0.06285813, + "balance_loss_mlp": 0.01258826, + "epoch": 0.4821884863971141, + "flos": 23959015931520.0, + "grad_norm": 1.6800720935629156, + "language_loss": 0.79355383, + "learning_rate": 2.212052026199701e-06, + "loss": 0.87071395, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12841797, + "step": 8020, + "time_per_iteration": 2.531282663345337 + }, + { + "auxiliary_loss_clip": 0.06436829, + "auxiliary_loss_mlp": 0.01270595, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01257655, + "epoch": 0.48224860964978206, + "flos": 17165605357440.0, + "grad_norm": 1.8962985695511603, + "language_loss": 0.70203435, + "learning_rate": 2.211664755756855e-06, + "loss": 0.77910858, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12945557, + "step": 8021, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.06448636, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06284462, + "balance_loss_mlp": 0.01253568, + "epoch": 0.48230873290245, + "flos": 23082513096960.0, + "grad_norm": 1.8444275684859448, + "language_loss": 0.63131356, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.70847559, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.14001465, + "step": 8022, + "time_per_iteration": 2.5153286457061768 + }, + { + "auxiliary_loss_clip": 0.06439438, + "auxiliary_loss_mlp": 0.0127221, + "balance_loss_clip": 0.06284659, + "balance_loss_mlp": 0.01259544, + "epoch": 0.482368856155118, + "flos": 19359440956800.0, + "grad_norm": 2.0552590280374625, + "language_loss": 0.67256629, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.74968272, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8023, + "time_per_iteration": 2.5504207611083984 + }, + { + "auxiliary_loss_clip": 0.06441902, + "auxiliary_loss_mlp": 0.01274331, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01261426, + "epoch": 0.48242897940778595, + "flos": 20084318628480.0, + "grad_norm": 1.5610336564699971, + "language_loss": 0.76933229, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.84649462, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12915039, + "step": 8024, + "time_per_iteration": 2.576347589492798 + }, + { + "auxiliary_loss_clip": 0.06441621, + "auxiliary_loss_mlp": 0.01268624, + "balance_loss_clip": 0.06283119, + "balance_loss_mlp": 0.01255553, + "epoch": 0.4824891026604539, + "flos": 23410682812800.0, + "grad_norm": 1.519749434932375, + "language_loss": 0.75555682, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.83265924, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13067627, + "step": 8025, + "time_per_iteration": 2.559722900390625 + }, + { + "auxiliary_loss_clip": 0.06445173, + "auxiliary_loss_mlp": 0.01270078, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01256536, + "epoch": 0.4825492259131219, + "flos": 20373691104000.0, + "grad_norm": 3.210842824131336, + "language_loss": 0.71099132, + "learning_rate": 2.209728283441112e-06, + "loss": 0.78814387, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13543701, + "step": 8026, + "time_per_iteration": 2.512563943862915 + }, + { + "auxiliary_loss_clip": 0.06450065, + "auxiliary_loss_mlp": 0.0127128, + "balance_loss_clip": 0.06287996, + "balance_loss_mlp": 0.01257094, + "epoch": 0.48260934916578985, + "flos": 14324193077760.0, + "grad_norm": 2.0787728376845385, + "language_loss": 0.74646676, + "learning_rate": 2.209340965060465e-06, + "loss": 0.82368022, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.14190674, + "step": 8027, + "time_per_iteration": 2.523252248764038 + }, + { + "auxiliary_loss_clip": 0.06445143, + "auxiliary_loss_mlp": 0.01269951, + "balance_loss_clip": 0.06285772, + "balance_loss_mlp": 0.01257166, + "epoch": 0.4826694724184578, + "flos": 22126654846080.0, + "grad_norm": 1.6924958309049165, + "language_loss": 0.67414463, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.75129557, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12792969, + "step": 8028, + "time_per_iteration": 2.5118508338928223 + }, + { + "auxiliary_loss_clip": 0.06443746, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06285068, + "balance_loss_mlp": 0.01254926, + "epoch": 0.48272959567112583, + "flos": 16186882141440.0, + "grad_norm": 1.4109383431826554, + "language_loss": 0.73031461, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.80743277, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13134766, + "step": 8029, + "time_per_iteration": 2.513986587524414 + }, + { + "auxiliary_loss_clip": 0.06447576, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0628765, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4827897189237938, + "flos": 23186326705920.0, + "grad_norm": 2.2851559020013994, + "language_loss": 0.84759653, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.92474234, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.13256836, + "step": 8030, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.0644383, + "auxiliary_loss_mlp": 0.0126632, + "balance_loss_clip": 0.06286349, + "balance_loss_mlp": 0.01253374, + "epoch": 0.48284984217646176, + "flos": 21659018808960.0, + "grad_norm": 2.6563677126547858, + "language_loss": 0.73703504, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.81413656, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12945557, + "step": 8031, + "time_per_iteration": 2.523465633392334 + }, + { + "auxiliary_loss_clip": 0.06451262, + "auxiliary_loss_mlp": 0.01268996, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01254548, + "epoch": 0.48290996542912973, + "flos": 31475501314560.0, + "grad_norm": 1.5957405541522132, + "language_loss": 0.71345282, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.79065537, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14434814, + "step": 8032, + "time_per_iteration": 4.084775924682617 + }, + { + "auxiliary_loss_clip": 0.06441716, + "auxiliary_loss_mlp": 0.01271696, + "balance_loss_clip": 0.06285156, + "balance_loss_mlp": 0.01259066, + "epoch": 0.4829700886817977, + "flos": 24468803372160.0, + "grad_norm": 1.3669631944631024, + "language_loss": 0.74361598, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.82075012, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12609863, + "step": 8033, + "time_per_iteration": 2.558655023574829 + }, + { + "auxiliary_loss_clip": 0.06455428, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06290704, + "balance_loss_mlp": 0.01255436, + "epoch": 0.48303021193446566, + "flos": 25709170561920.0, + "grad_norm": 1.5251236339326817, + "language_loss": 0.83579373, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.91304129, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13909912, + "step": 8034, + "time_per_iteration": 4.034566402435303 + }, + { + "auxiliary_loss_clip": 0.06441804, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01255162, + "epoch": 0.4830903351871336, + "flos": 20091613933440.0, + "grad_norm": 1.4995747649605073, + "language_loss": 0.80011666, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.87720799, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12176514, + "step": 8035, + "time_per_iteration": 2.560216188430786 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06284694, + "balance_loss_mlp": 0.01257996, + "epoch": 0.4831504584398016, + "flos": 39460670910720.0, + "grad_norm": 2.4180718513556196, + "language_loss": 0.69735384, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.77451038, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.14385986, + "step": 8036, + "time_per_iteration": 2.676248550415039 + }, + { + "auxiliary_loss_clip": 0.06441773, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01254638, + "epoch": 0.48321058169246955, + "flos": 20012006954880.0, + "grad_norm": 1.964916404489229, + "language_loss": 0.7269727, + "learning_rate": 2.205467347074847e-06, + "loss": 0.80406225, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12536621, + "step": 8037, + "time_per_iteration": 2.5361721515655518 + }, + { + "auxiliary_loss_clip": 0.06449978, + "auxiliary_loss_mlp": 0.01267952, + "balance_loss_clip": 0.06284893, + "balance_loss_mlp": 0.01254594, + "epoch": 0.4832707049451375, + "flos": 20747869511040.0, + "grad_norm": 2.294242093364334, + "language_loss": 0.69135344, + "learning_rate": 2.205079942181525e-06, + "loss": 0.76853275, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13366699, + "step": 8038, + "time_per_iteration": 2.5300488471984863 + }, + { + "auxiliary_loss_clip": 0.06441218, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01253161, + "epoch": 0.4833308281978055, + "flos": 33153889322880.0, + "grad_norm": 1.5080177559172256, + "language_loss": 0.79238868, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8694644, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13201904, + "step": 8039, + "time_per_iteration": 4.106697082519531 + }, + { + "auxiliary_loss_clip": 0.06443603, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06284612, + "balance_loss_mlp": 0.01257221, + "epoch": 0.48339095145047345, + "flos": 19105301923200.0, + "grad_norm": 2.5245127885531926, + "language_loss": 0.78196943, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.85910785, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13024902, + "step": 8040, + "time_per_iteration": 2.51356840133667 + }, + { + "auxiliary_loss_clip": 0.06449578, + "auxiliary_loss_mlp": 0.01268689, + "balance_loss_clip": 0.06287356, + "balance_loss_mlp": 0.01254342, + "epoch": 0.4834510747031414, + "flos": 34468035632640.0, + "grad_norm": 1.5686841461958603, + "language_loss": 0.75648201, + "learning_rate": 2.203917680900409e-06, + "loss": 0.83366466, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14337158, + "step": 8041, + "time_per_iteration": 2.6821110248565674 + }, + { + "auxiliary_loss_clip": 0.06444554, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06290209, + "balance_loss_mlp": 0.01261244, + "epoch": 0.48351119795580944, + "flos": 27388187475840.0, + "grad_norm": 1.655786729526556, + "language_loss": 0.66309774, + "learning_rate": 2.203530244988624e-06, + "loss": 0.74028337, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12756348, + "step": 8042, + "time_per_iteration": 2.587979316711426 + }, + { + "auxiliary_loss_clip": 0.0635567, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06287327, + "balance_loss_mlp": 0.012603, + "epoch": 0.4835713212084774, + "flos": 67162967815680.0, + "grad_norm": 0.683297043643475, + "language_loss": 0.58432257, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.66050708, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02485657, + "step": 8043, + "time_per_iteration": 3.240037441253662 + }, + { + "auxiliary_loss_clip": 0.06448962, + "auxiliary_loss_mlp": 0.01270561, + "balance_loss_clip": 0.06288527, + "balance_loss_mlp": 0.01256548, + "epoch": 0.48363144446114537, + "flos": 17973234535680.0, + "grad_norm": 8.666689726695457, + "language_loss": 0.71932065, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.79651588, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.14013672, + "step": 8044, + "time_per_iteration": 2.557222604751587 + }, + { + "auxiliary_loss_clip": 0.06443186, + "auxiliary_loss_mlp": 0.01271215, + "balance_loss_clip": 0.06287612, + "balance_loss_mlp": 0.01257667, + "epoch": 0.48369156771381333, + "flos": 20599556584320.0, + "grad_norm": 1.2792089170093015, + "language_loss": 0.76084363, + "learning_rate": 2.202367891004714e-06, + "loss": 0.83798766, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13549805, + "step": 8045, + "time_per_iteration": 3.9927117824554443 + }, + { + "auxiliary_loss_clip": 0.06452677, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01255274, + "epoch": 0.4837516909664813, + "flos": 22681780145280.0, + "grad_norm": 1.8159113209886955, + "language_loss": 0.69591677, + "learning_rate": 2.201980424309533e-06, + "loss": 0.77312469, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12854004, + "step": 8046, + "time_per_iteration": 2.563061237335205 + }, + { + "auxiliary_loss_clip": 0.06444287, + "auxiliary_loss_mlp": 0.01272531, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.01259674, + "epoch": 0.48381181421914926, + "flos": 25525414558080.0, + "grad_norm": 1.7918831202662233, + "language_loss": 0.83005214, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.90722024, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.12866211, + "step": 8047, + "time_per_iteration": 2.5624239444732666 + }, + { + "auxiliary_loss_clip": 0.06441472, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06286557, + "balance_loss_mlp": 0.01255522, + "epoch": 0.4838719374718172, + "flos": 24214454703360.0, + "grad_norm": 3.8503425220093273, + "language_loss": 0.8051095, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.88220614, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8048, + "time_per_iteration": 2.5535151958465576 + }, + { + "auxiliary_loss_clip": 0.06450336, + "auxiliary_loss_mlp": 0.01269587, + "balance_loss_clip": 0.06287669, + "balance_loss_mlp": 0.01255091, + "epoch": 0.4839320607244852, + "flos": 26731889971200.0, + "grad_norm": 1.601579819484506, + "language_loss": 0.8118276, + "learning_rate": 2.200817978328054e-06, + "loss": 0.88902682, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14477539, + "step": 8049, + "time_per_iteration": 2.576237440109253 + }, + { + "auxiliary_loss_clip": 0.0644124, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.0628837, + "balance_loss_mlp": 0.01254392, + "epoch": 0.48399218397715316, + "flos": 20455142872320.0, + "grad_norm": 1.6782620987313854, + "language_loss": 0.7275942, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.8046689, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.1184082, + "step": 8050, + "time_per_iteration": 2.5001842975616455 + }, + { + "auxiliary_loss_clip": 0.06348944, + "auxiliary_loss_mlp": 0.01254327, + "balance_loss_clip": 0.06280461, + "balance_loss_mlp": 0.01252052, + "epoch": 0.4840523072298211, + "flos": 67199626776960.0, + "grad_norm": 0.6876828937687306, + "language_loss": 0.56319511, + "learning_rate": 2.200042976240723e-06, + "loss": 0.63922787, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02278137, + "step": 8051, + "time_per_iteration": 3.1732234954833984 + }, + { + "auxiliary_loss_clip": 0.06445932, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06285888, + "balance_loss_mlp": 0.01254806, + "epoch": 0.4841124304824891, + "flos": 22416782008320.0, + "grad_norm": 1.9466323687223244, + "language_loss": 0.75329518, + "learning_rate": 2.199655463811236e-06, + "loss": 0.83042824, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12554932, + "step": 8052, + "time_per_iteration": 2.525742769241333 + }, + { + "auxiliary_loss_clip": 0.06445011, + "auxiliary_loss_mlp": 0.01268398, + "balance_loss_clip": 0.0628748, + "balance_loss_mlp": 0.01255797, + "epoch": 0.48417255373515705, + "flos": 13848926319360.0, + "grad_norm": 9.22847684329053, + "language_loss": 0.65932119, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.73645532, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1260376, + "step": 8053, + "time_per_iteration": 2.508634328842163 + }, + { + "auxiliary_loss_clip": 0.06439514, + "auxiliary_loss_mlp": 0.01270848, + "balance_loss_clip": 0.06286003, + "balance_loss_mlp": 0.01258242, + "epoch": 0.484232676987825, + "flos": 31657747944960.0, + "grad_norm": 1.9001102819500506, + "language_loss": 0.69764733, + "learning_rate": 2.198880416254091e-06, + "loss": 0.77475095, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12609863, + "step": 8054, + "time_per_iteration": 2.6046009063720703 + }, + { + "auxiliary_loss_clip": 0.06439343, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.062842, + "balance_loss_mlp": 0.01253578, + "epoch": 0.48429280024049304, + "flos": 24101878343040.0, + "grad_norm": 1.6288967613161636, + "language_loss": 0.69845426, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.77551031, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12683105, + "step": 8055, + "time_per_iteration": 2.5645036697387695 + }, + { + "auxiliary_loss_clip": 0.06441051, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01257621, + "epoch": 0.484352923493161, + "flos": 17535842622720.0, + "grad_norm": 2.1100630556312256, + "language_loss": 0.63363564, + "learning_rate": 2.198105338530685e-06, + "loss": 0.71074814, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12597656, + "step": 8056, + "time_per_iteration": 2.4887776374816895 + }, + { + "auxiliary_loss_clip": 0.06441829, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06283918, + "balance_loss_mlp": 0.0125639, + "epoch": 0.48441304674582897, + "flos": 29174204453760.0, + "grad_norm": 1.7583270452203597, + "language_loss": 0.67791545, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.75502926, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1315918, + "step": 8057, + "time_per_iteration": 2.6147687435150146 + }, + { + "auxiliary_loss_clip": 0.06438136, + "auxiliary_loss_mlp": 0.01270959, + "balance_loss_clip": 0.06284122, + "balance_loss_mlp": 0.0125933, + "epoch": 0.48447316999849693, + "flos": 15891933369600.0, + "grad_norm": 1.7129310149903716, + "language_loss": 0.81615114, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.89324206, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11627197, + "step": 8058, + "time_per_iteration": 2.499464273452759 + }, + { + "auxiliary_loss_clip": 0.06444308, + "auxiliary_loss_mlp": 0.01272607, + "balance_loss_clip": 0.06283933, + "balance_loss_mlp": 0.01259619, + "epoch": 0.4845332932511649, + "flos": 24386974260480.0, + "grad_norm": 1.694669299967896, + "language_loss": 0.79782939, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.87499857, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12988281, + "step": 8059, + "time_per_iteration": 2.5456764698028564 + }, + { + "auxiliary_loss_clip": 0.06445169, + "auxiliary_loss_mlp": 0.0126972, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01256065, + "epoch": 0.48459341650383286, + "flos": 37124434097280.0, + "grad_norm": 2.171534570518566, + "language_loss": 0.67115712, + "learning_rate": 2.196555093055352e-06, + "loss": 0.74830604, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13659668, + "step": 8060, + "time_per_iteration": 2.639552593231201 + }, + { + "auxiliary_loss_clip": 0.06448266, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.01255404, + "epoch": 0.48465353975650083, + "flos": 22973500535040.0, + "grad_norm": 1.9145476252385885, + "language_loss": 0.67691833, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.75407994, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12506104, + "step": 8061, + "time_per_iteration": 2.636291265487671 + }, + { + "auxiliary_loss_clip": 0.06440581, + "auxiliary_loss_mlp": 0.012731, + "balance_loss_clip": 0.06285343, + "balance_loss_mlp": 0.01259581, + "epoch": 0.4847136630091688, + "flos": 17712680664960.0, + "grad_norm": 1.8103717294603696, + "language_loss": 0.83217871, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.90931553, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13531494, + "step": 8062, + "time_per_iteration": 2.5335779190063477 + }, + { + "auxiliary_loss_clip": 0.06441268, + "auxiliary_loss_mlp": 0.01271147, + "balance_loss_clip": 0.06286018, + "balance_loss_mlp": 0.01259077, + "epoch": 0.48477378626183676, + "flos": 22024853735040.0, + "grad_norm": 1.4198166357723545, + "language_loss": 0.74425852, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.82138264, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1206665, + "step": 8063, + "time_per_iteration": 2.575752019882202 + }, + { + "auxiliary_loss_clip": 0.06438752, + "auxiliary_loss_mlp": 0.01268531, + "balance_loss_clip": 0.06282612, + "balance_loss_mlp": 0.01256276, + "epoch": 0.4848339095145047, + "flos": 27970118881920.0, + "grad_norm": 1.5830553745787852, + "language_loss": 0.79034185, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.86741465, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12255859, + "step": 8064, + "time_per_iteration": 2.601557731628418 + }, + { + "auxiliary_loss_clip": 0.06441826, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06290108, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4848940327671727, + "flos": 21695090791680.0, + "grad_norm": 1.71958305783472, + "language_loss": 0.795892, + "learning_rate": 2.194617118620173e-06, + "loss": 0.87297779, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1194458, + "step": 8065, + "time_per_iteration": 2.5325217247009277 + }, + { + "auxiliary_loss_clip": 0.06434904, + "auxiliary_loss_mlp": 0.0126868, + "balance_loss_clip": 0.06285697, + "balance_loss_mlp": 0.01256813, + "epoch": 0.48495415601984065, + "flos": 20637892627200.0, + "grad_norm": 1.7068711802888106, + "language_loss": 0.76162863, + "learning_rate": 2.194229501534644e-06, + "loss": 0.83866447, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11865234, + "step": 8066, + "time_per_iteration": 2.506598949432373 + }, + { + "auxiliary_loss_clip": 0.06438506, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06285724, + "balance_loss_mlp": 0.01257375, + "epoch": 0.4850142792725086, + "flos": 25634972171520.0, + "grad_norm": 1.302389197624331, + "language_loss": 0.72176784, + "learning_rate": 2.193841877083912e-06, + "loss": 0.79884112, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11444092, + "step": 8067, + "time_per_iteration": 2.5921640396118164 + }, + { + "auxiliary_loss_clip": 0.06438944, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06282091, + "balance_loss_mlp": 0.01255986, + "epoch": 0.4850744025251766, + "flos": 13777075843200.0, + "grad_norm": 2.2825284137915975, + "language_loss": 0.79257572, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.86964703, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12219238, + "step": 8068, + "time_per_iteration": 2.5287444591522217 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280828, + "balance_loss_mlp": 0.012565, + "epoch": 0.4851345257778446, + "flos": 20266691040000.0, + "grad_norm": 1.4034205816126453, + "language_loss": 0.84740359, + "learning_rate": 2.193066606145638e-06, + "loss": 0.92444146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.11901855, + "step": 8069, + "time_per_iteration": 2.548593044281006 + }, + { + "auxiliary_loss_clip": 0.06435016, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01254763, + "epoch": 0.48519464903051257, + "flos": 27097095991680.0, + "grad_norm": 1.771109080244907, + "language_loss": 0.78544027, + "learning_rate": 2.192678959687493e-06, + "loss": 0.86245352, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.11553955, + "step": 8070, + "time_per_iteration": 2.581026315689087 + }, + { + "auxiliary_loss_clip": 0.06432221, + "auxiliary_loss_mlp": 0.01268982, + "balance_loss_clip": 0.06279641, + "balance_loss_mlp": 0.01256239, + "epoch": 0.48525477228318054, + "flos": 17132677902720.0, + "grad_norm": 3.597843949572919, + "language_loss": 0.77929389, + "learning_rate": 2.192291305922943e-06, + "loss": 0.85630596, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12756348, + "step": 8071, + "time_per_iteration": 3.963555335998535 + }, + { + "auxiliary_loss_clip": 0.06438918, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282261, + "balance_loss_mlp": 0.01256777, + "epoch": 0.4853148955358485, + "flos": 28187263537920.0, + "grad_norm": 2.115731418126265, + "language_loss": 0.72008896, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7971788, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13299561, + "step": 8072, + "time_per_iteration": 2.6861536502838135 + }, + { + "auxiliary_loss_clip": 0.06439583, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06282715, + "balance_loss_mlp": 0.01253761, + "epoch": 0.48537501878851647, + "flos": 17499015953280.0, + "grad_norm": 1.8999559951356444, + "language_loss": 0.88288134, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.95994508, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8073, + "time_per_iteration": 2.4814834594726562 + }, + { + "auxiliary_loss_clip": 0.06432822, + "auxiliary_loss_mlp": 0.01269151, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01257731, + "epoch": 0.48543514204118443, + "flos": 28592398828800.0, + "grad_norm": 2.458004055687259, + "language_loss": 0.61317194, + "learning_rate": 2.19112830093786e-06, + "loss": 0.69019163, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11413574, + "step": 8074, + "time_per_iteration": 3.984229326248169 + }, + { + "auxiliary_loss_clip": 0.06435922, + "auxiliary_loss_mlp": 0.01265981, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01254024, + "epoch": 0.4854952652938524, + "flos": 20966355832320.0, + "grad_norm": 1.641968552330247, + "language_loss": 0.73514569, + "learning_rate": 2.19074061809469e-06, + "loss": 0.81216466, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.11962891, + "step": 8075, + "time_per_iteration": 2.5479941368103027 + }, + { + "auxiliary_loss_clip": 0.06429431, + "auxiliary_loss_mlp": 0.01268393, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01256704, + "epoch": 0.48555538854652036, + "flos": 66543344000640.0, + "grad_norm": 1.7202852105657789, + "language_loss": 0.81976241, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.89674067, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11676025, + "step": 8076, + "time_per_iteration": 2.9675233364105225 + }, + { + "auxiliary_loss_clip": 0.06435271, + "auxiliary_loss_mlp": 0.01273017, + "balance_loss_clip": 0.06280246, + "balance_loss_mlp": 0.01259242, + "epoch": 0.4856155117991883, + "flos": 15930520974720.0, + "grad_norm": 1.9409864090603182, + "language_loss": 0.86392474, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.94100761, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13781738, + "step": 8077, + "time_per_iteration": 2.5062685012817383 + }, + { + "auxiliary_loss_clip": 0.06325787, + "auxiliary_loss_mlp": 0.01252172, + "balance_loss_clip": 0.062584, + "balance_loss_mlp": 0.0125022, + "epoch": 0.4856756350518563, + "flos": 71066986848000.0, + "grad_norm": 0.9289783803731909, + "language_loss": 0.58378243, + "learning_rate": 2.189577526226564e-06, + "loss": 0.65956199, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01950073, + "step": 8078, + "time_per_iteration": 4.502991199493408 + }, + { + "auxiliary_loss_clip": 0.06440585, + "auxiliary_loss_mlp": 0.01268963, + "balance_loss_clip": 0.06280588, + "balance_loss_mlp": 0.01255886, + "epoch": 0.48573575830452426, + "flos": 29833478778240.0, + "grad_norm": 2.317528327629363, + "language_loss": 0.72874224, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.80583775, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1307373, + "step": 8079, + "time_per_iteration": 2.5839955806732178 + }, + { + "auxiliary_loss_clip": 0.06440279, + "auxiliary_loss_mlp": 0.01268912, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256925, + "epoch": 0.4857958815571922, + "flos": 17645274455040.0, + "grad_norm": 2.8950752184508843, + "language_loss": 0.80285943, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.87995136, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.11999512, + "step": 8080, + "time_per_iteration": 2.542607307434082 + }, + { + "auxiliary_loss_clip": 0.06436758, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06281016, + "balance_loss_mlp": 0.01252754, + "epoch": 0.4858560048098602, + "flos": 21111817720320.0, + "grad_norm": 1.934060586134842, + "language_loss": 0.84237295, + "learning_rate": 2.188414369659251e-06, + "loss": 0.9193939, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12579346, + "step": 8081, + "time_per_iteration": 2.523787021636963 + }, + { + "auxiliary_loss_clip": 0.06433021, + "auxiliary_loss_mlp": 0.01268596, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4859161280625282, + "flos": 22097375043840.0, + "grad_norm": 1.530246142437005, + "language_loss": 0.83824933, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.91526556, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13464355, + "step": 8082, + "time_per_iteration": 2.562739372253418 + }, + { + "auxiliary_loss_clip": 0.0643435, + "auxiliary_loss_mlp": 0.01268115, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01255849, + "epoch": 0.4859762513151962, + "flos": 17499183661440.0, + "grad_norm": 1.9064651850671037, + "language_loss": 0.87366831, + "learning_rate": 2.187638896199746e-06, + "loss": 0.95069289, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12268066, + "step": 8083, + "time_per_iteration": 2.5062954425811768 + }, + { + "auxiliary_loss_clip": 0.064337, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281679, + "balance_loss_mlp": 0.01255356, + "epoch": 0.48603637456786414, + "flos": 18010061205120.0, + "grad_norm": 1.6184381568123027, + "language_loss": 0.81531483, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.89233649, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.13110352, + "step": 8084, + "time_per_iteration": 3.9548635482788086 + }, + { + "auxiliary_loss_clip": 0.06438272, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01256645, + "epoch": 0.4860964978205321, + "flos": 22498611120000.0, + "grad_norm": 1.8856401579659385, + "language_loss": 0.68814772, + "learning_rate": 2.186863394279098e-06, + "loss": 0.76522183, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12475586, + "step": 8085, + "time_per_iteration": 2.525697708129883 + }, + { + "auxiliary_loss_clip": 0.06434157, + "auxiliary_loss_mlp": 0.01270175, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257158, + "epoch": 0.48615662107320007, + "flos": 23380061345280.0, + "grad_norm": 1.4159205206948002, + "language_loss": 0.77895916, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.85600245, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13024902, + "step": 8086, + "time_per_iteration": 2.5914857387542725 + }, + { + "auxiliary_loss_clip": 0.06433852, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06279776, + "balance_loss_mlp": 0.01253292, + "epoch": 0.48621674432586803, + "flos": 34426722769920.0, + "grad_norm": 1.8125320165569008, + "language_loss": 0.69750226, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.7744993, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12573242, + "step": 8087, + "time_per_iteration": 2.611724615097046 + }, + { + "auxiliary_loss_clip": 0.06440983, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06277409, + "balance_loss_mlp": 0.0125254, + "epoch": 0.486276867578536, + "flos": 33115595207040.0, + "grad_norm": 1.9401027694089865, + "language_loss": 0.73050213, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.80757201, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13482666, + "step": 8088, + "time_per_iteration": 2.6235716342926025 + }, + { + "auxiliary_loss_clip": 0.06434947, + "auxiliary_loss_mlp": 0.01270457, + "balance_loss_clip": 0.06279397, + "balance_loss_mlp": 0.01257982, + "epoch": 0.48633699083120396, + "flos": 21477149521920.0, + "grad_norm": 1.5117477196191362, + "language_loss": 0.75765258, + "learning_rate": 2.185312305524892e-06, + "loss": 0.83470654, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12481689, + "step": 8089, + "time_per_iteration": 2.522033214569092 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01266623, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01254702, + "epoch": 0.48639711408387193, + "flos": 20090565757440.0, + "grad_norm": 2.0719257974800307, + "language_loss": 0.84617764, + "learning_rate": 2.184924515731926e-06, + "loss": 0.92317104, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1192627, + "step": 8090, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01256362, + "epoch": 0.4864572373365399, + "flos": 20785450867200.0, + "grad_norm": 1.460241002220635, + "language_loss": 0.76103806, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.8380006, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11578369, + "step": 8091, + "time_per_iteration": 2.534083127975464 + }, + { + "auxiliary_loss_clip": 0.06434517, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.0627959, + "balance_loss_mlp": 0.01252651, + "epoch": 0.48651736058920786, + "flos": 26031554346240.0, + "grad_norm": 1.4698762569471817, + "language_loss": 0.8086524, + "learning_rate": 2.184148915123631e-06, + "loss": 0.88564396, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.11987305, + "step": 8092, + "time_per_iteration": 2.5732295513153076 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01268235, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01254711, + "epoch": 0.4865774838418758, + "flos": 20491885687680.0, + "grad_norm": 1.359461965274961, + "language_loss": 0.71901554, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.79604697, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13537598, + "step": 8093, + "time_per_iteration": 2.5315988063812256 + }, + { + "auxiliary_loss_clip": 0.06430057, + "auxiliary_loss_mlp": 0.01268667, + "balance_loss_clip": 0.06278083, + "balance_loss_mlp": 0.01256424, + "epoch": 0.4866376070945438, + "flos": 23554048348800.0, + "grad_norm": 1.746145283456106, + "language_loss": 0.68340707, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.76039433, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12237549, + "step": 8094, + "time_per_iteration": 2.5621020793914795 + }, + { + "auxiliary_loss_clip": 0.06439431, + "auxiliary_loss_mlp": 0.01276508, + "balance_loss_clip": 0.06280254, + "balance_loss_mlp": 0.01263502, + "epoch": 0.4866977303472118, + "flos": 16696166457600.0, + "grad_norm": 2.187009986392795, + "language_loss": 0.66443598, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.74159545, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.13006592, + "step": 8095, + "time_per_iteration": 2.4823923110961914 + }, + { + "auxiliary_loss_clip": 0.06436304, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01254444, + "epoch": 0.4867578535998798, + "flos": 17902012965120.0, + "grad_norm": 1.919238290363099, + "language_loss": 0.79046065, + "learning_rate": 2.182597630229345e-06, + "loss": 0.86749196, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12384033, + "step": 8096, + "time_per_iteration": 2.507293701171875 + }, + { + "auxiliary_loss_clip": 0.06432957, + "auxiliary_loss_mlp": 0.01269945, + "balance_loss_clip": 0.06279905, + "balance_loss_mlp": 0.01257154, + "epoch": 0.48681797685254774, + "flos": 22644366497280.0, + "grad_norm": 2.003337305767246, + "language_loss": 0.68162191, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.75865096, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12799072, + "step": 8097, + "time_per_iteration": 2.5473361015319824 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01272073, + "balance_loss_clip": 0.06279548, + "balance_loss_mlp": 0.01259944, + "epoch": 0.4868781001052157, + "flos": 20892283223040.0, + "grad_norm": 1.4401604045572658, + "language_loss": 0.71418583, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.79123378, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12127686, + "step": 8098, + "time_per_iteration": 2.5543363094329834 + }, + { + "auxiliary_loss_clip": 0.06441437, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06279659, + "balance_loss_mlp": 0.01255725, + "epoch": 0.48693822335788367, + "flos": 41984688723840.0, + "grad_norm": 1.4376447542768653, + "language_loss": 0.66435724, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.74146235, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13348389, + "step": 8099, + "time_per_iteration": 2.711822032928467 + }, + { + "auxiliary_loss_clip": 0.0643863, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06281494, + "balance_loss_mlp": 0.01259485, + "epoch": 0.48699834661055164, + "flos": 24250149342720.0, + "grad_norm": 1.5852242434455028, + "language_loss": 0.66993374, + "learning_rate": 2.181046234549138e-06, + "loss": 0.74703825, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12329102, + "step": 8100, + "time_per_iteration": 2.5218353271484375 + }, + { + "auxiliary_loss_clip": 0.0643635, + "auxiliary_loss_mlp": 0.0127283, + "balance_loss_clip": 0.06283123, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4870584698632196, + "flos": 25931388389760.0, + "grad_norm": 1.294146562327305, + "language_loss": 0.76505142, + "learning_rate": 2.180658368429088e-06, + "loss": 0.84214324, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12664795, + "step": 8101, + "time_per_iteration": 2.645095109939575 + }, + { + "auxiliary_loss_clip": 0.06345028, + "auxiliary_loss_mlp": 0.01254744, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.01252564, + "epoch": 0.48711859311588757, + "flos": 70232006511360.0, + "grad_norm": 0.6692636412141889, + "language_loss": 0.5212009, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.59719861, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02183533, + "step": 8102, + "time_per_iteration": 3.2782585620880127 + }, + { + "auxiliary_loss_clip": 0.06439511, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01253523, + "epoch": 0.48717871636855553, + "flos": 12346831301760.0, + "grad_norm": 2.023585148758525, + "language_loss": 0.7395249, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.81658924, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13415527, + "step": 8103, + "time_per_iteration": 2.5020487308502197 + }, + { + "auxiliary_loss_clip": 0.06437068, + "auxiliary_loss_mlp": 0.01271054, + "balance_loss_clip": 0.06280553, + "balance_loss_mlp": 0.01257059, + "epoch": 0.4872388396212235, + "flos": 23483874954240.0, + "grad_norm": 1.425095223977108, + "language_loss": 0.6284436, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.70552492, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13989258, + "step": 8104, + "time_per_iteration": 2.5457305908203125 + }, + { + "auxiliary_loss_clip": 0.06436496, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281868, + "balance_loss_mlp": 0.01255759, + "epoch": 0.48729896287389146, + "flos": 31435068919680.0, + "grad_norm": 2.8385892248494575, + "language_loss": 0.69637764, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.77343059, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13043213, + "step": 8105, + "time_per_iteration": 2.6453042030334473 + }, + { + "auxiliary_loss_clip": 0.0643308, + "auxiliary_loss_mlp": 0.01270898, + "balance_loss_clip": 0.06279837, + "balance_loss_mlp": 0.01258464, + "epoch": 0.4873590861265594, + "flos": 19063192446720.0, + "grad_norm": 1.510355754545757, + "language_loss": 0.73659271, + "learning_rate": 2.178718935364259e-06, + "loss": 0.81363249, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12451172, + "step": 8106, + "time_per_iteration": 2.4909706115722656 + }, + { + "auxiliary_loss_clip": 0.0644394, + "auxiliary_loss_mlp": 0.01272973, + "balance_loss_clip": 0.06283985, + "balance_loss_mlp": 0.01258888, + "epoch": 0.4874192093792274, + "flos": 24354424149120.0, + "grad_norm": 1.669305756095907, + "language_loss": 0.77040148, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.84757066, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14080811, + "step": 8107, + "time_per_iteration": 2.5784239768981934 + }, + { + "auxiliary_loss_clip": 0.06432547, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01254971, + "epoch": 0.4874793326318954, + "flos": 23119339766400.0, + "grad_norm": 3.7362093355788857, + "language_loss": 0.75508547, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.83207899, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.1182251, + "step": 8108, + "time_per_iteration": 2.51676607131958 + }, + { + "auxiliary_loss_clip": 0.06434841, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06281565, + "balance_loss_mlp": 0.01257522, + "epoch": 0.4875394558845634, + "flos": 19032193635840.0, + "grad_norm": 1.6826296910838767, + "language_loss": 0.73853874, + "learning_rate": 2.177555194083212e-06, + "loss": 0.81557322, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11090088, + "step": 8109, + "time_per_iteration": 2.594315767288208 + }, + { + "auxiliary_loss_clip": 0.06429494, + "auxiliary_loss_mlp": 0.01265982, + "balance_loss_clip": 0.0628022, + "balance_loss_mlp": 0.01253853, + "epoch": 0.48759957913723134, + "flos": 21439945509120.0, + "grad_norm": 1.7035668673577407, + "language_loss": 0.78900838, + "learning_rate": 2.177167266837428e-06, + "loss": 0.86596316, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12121582, + "step": 8110, + "time_per_iteration": 2.517711639404297 + }, + { + "auxiliary_loss_clip": 0.06435961, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_clip": 0.06281072, + "balance_loss_mlp": 0.01259265, + "epoch": 0.4876597023898993, + "flos": 17754412798080.0, + "grad_norm": 2.2958034596154238, + "language_loss": 0.72586286, + "learning_rate": 2.176779332873444e-06, + "loss": 0.80293739, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12231445, + "step": 8111, + "time_per_iteration": 3.939528465270996 + }, + { + "auxiliary_loss_clip": 0.06434079, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.06283166, + "balance_loss_mlp": 0.01257947, + "epoch": 0.4877198256425673, + "flos": 17025384349440.0, + "grad_norm": 1.699620610729742, + "language_loss": 0.76073879, + "learning_rate": 2.17639139220597e-06, + "loss": 0.83778763, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.128479, + "step": 8112, + "time_per_iteration": 2.614734172821045 + }, + { + "auxiliary_loss_clip": 0.06443445, + "auxiliary_loss_mlp": 0.01270845, + "balance_loss_clip": 0.06281452, + "balance_loss_mlp": 0.01257445, + "epoch": 0.48777994889523524, + "flos": 22390898296320.0, + "grad_norm": 1.829058055025175, + "language_loss": 0.756136, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.83327889, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13397217, + "step": 8113, + "time_per_iteration": 3.978013277053833 + }, + { + "auxiliary_loss_clip": 0.0633374, + "auxiliary_loss_mlp": 0.01252792, + "balance_loss_clip": 0.06267424, + "balance_loss_mlp": 0.0125078, + "epoch": 0.4878400721479032, + "flos": 61261237664640.0, + "grad_norm": 0.785084950627043, + "language_loss": 0.48805469, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.56391996, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02011108, + "step": 8114, + "time_per_iteration": 3.0476014614105225 + }, + { + "auxiliary_loss_clip": 0.06435857, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06280373, + "balance_loss_mlp": 0.01255507, + "epoch": 0.48790019540057117, + "flos": 24543756449280.0, + "grad_norm": 1.6081028897323706, + "language_loss": 0.77215505, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.84920216, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13348389, + "step": 8115, + "time_per_iteration": 2.615709066390991 + }, + { + "auxiliary_loss_clip": 0.06438144, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06279679, + "balance_loss_mlp": 0.01256858, + "epoch": 0.48796031865323913, + "flos": 21840175336320.0, + "grad_norm": 1.938320357328723, + "language_loss": 0.72471654, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.80180264, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13586426, + "step": 8116, + "time_per_iteration": 2.502880573272705 + }, + { + "auxiliary_loss_clip": 0.06428684, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06277922, + "balance_loss_mlp": 0.0125349, + "epoch": 0.4880204419059071, + "flos": 18594969431040.0, + "grad_norm": 1.5984683769851484, + "language_loss": 0.63217908, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.70912814, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12750244, + "step": 8117, + "time_per_iteration": 2.5082454681396484 + }, + { + "auxiliary_loss_clip": 0.06432296, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06278604, + "balance_loss_mlp": 0.0125558, + "epoch": 0.48808056515857506, + "flos": 19178242502400.0, + "grad_norm": 1.8182073979213524, + "language_loss": 0.79733717, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.87434226, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1262207, + "step": 8118, + "time_per_iteration": 3.925899028778076 + }, + { + "auxiliary_loss_clip": 0.06436172, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06280739, + "balance_loss_mlp": 0.01254669, + "epoch": 0.48814068841124303, + "flos": 20126679667200.0, + "grad_norm": 1.6934286727955359, + "language_loss": 0.63701898, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.71405882, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.13134766, + "step": 8119, + "time_per_iteration": 2.575894832611084 + }, + { + "auxiliary_loss_clip": 0.06432833, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01254698, + "epoch": 0.488200811663911, + "flos": 22972116942720.0, + "grad_norm": 1.6464989706708673, + "language_loss": 0.72632396, + "learning_rate": 2.173287627305878e-06, + "loss": 0.80332661, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12744141, + "step": 8120, + "time_per_iteration": 2.5209426879882812 + }, + { + "auxiliary_loss_clip": 0.06438597, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06279586, + "balance_loss_mlp": 0.01255297, + "epoch": 0.48826093491657896, + "flos": 33918947827200.0, + "grad_norm": 1.7374615150704595, + "language_loss": 0.63695973, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.71403223, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13336182, + "step": 8121, + "time_per_iteration": 2.619035005569458 + }, + { + "auxiliary_loss_clip": 0.0644285, + "auxiliary_loss_mlp": 0.01267435, + "balance_loss_clip": 0.06282102, + "balance_loss_mlp": 0.01253643, + "epoch": 0.488321058169247, + "flos": 23076056332800.0, + "grad_norm": 1.857577186148328, + "language_loss": 0.82684505, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.90394789, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.13800049, + "step": 8122, + "time_per_iteration": 2.5246660709381104 + }, + { + "auxiliary_loss_clip": 0.06440943, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06282523, + "balance_loss_mlp": 0.01255397, + "epoch": 0.48838118142191494, + "flos": 19323746317440.0, + "grad_norm": 1.8250600769951077, + "language_loss": 0.85500193, + "learning_rate": 2.172123606640866e-06, + "loss": 0.93209612, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13085938, + "step": 8123, + "time_per_iteration": 2.5317881107330322 + }, + { + "auxiliary_loss_clip": 0.06441107, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.0125934, + "epoch": 0.4884413046745829, + "flos": 25417701734400.0, + "grad_norm": 1.3930130047769251, + "language_loss": 0.85569358, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.93283355, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13549805, + "step": 8124, + "time_per_iteration": 4.062820196151733 + }, + { + "auxiliary_loss_clip": 0.0644336, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06285739, + "balance_loss_mlp": 0.01253769, + "epoch": 0.4885014279272509, + "flos": 20997103080960.0, + "grad_norm": 2.2053414232015363, + "language_loss": 0.80210352, + "learning_rate": 2.171347560204948e-06, + "loss": 0.87920684, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13201904, + "step": 8125, + "time_per_iteration": 2.5117287635803223 + }, + { + "auxiliary_loss_clip": 0.06437683, + "auxiliary_loss_mlp": 0.01269334, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01255976, + "epoch": 0.48856155117991884, + "flos": 13776656572800.0, + "grad_norm": 2.5222320452086016, + "language_loss": 0.72852308, + "learning_rate": 2.170959527233356e-06, + "loss": 0.80559325, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13348389, + "step": 8126, + "time_per_iteration": 2.5177037715911865 + }, + { + "auxiliary_loss_clip": 0.06445107, + "auxiliary_loss_mlp": 0.01269465, + "balance_loss_clip": 0.06285033, + "balance_loss_mlp": 0.01256113, + "epoch": 0.4886216744325868, + "flos": 32095936471680.0, + "grad_norm": 1.5739512034612657, + "language_loss": 0.68640763, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.76355338, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13372803, + "step": 8127, + "time_per_iteration": 2.606557846069336 + }, + { + "auxiliary_loss_clip": 0.06442467, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06283246, + "balance_loss_mlp": 0.01254972, + "epoch": 0.48868179768525477, + "flos": 19616221393920.0, + "grad_norm": 1.6528567440124056, + "language_loss": 0.7688967, + "learning_rate": 2.170183441856481e-06, + "loss": 0.84600174, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13085938, + "step": 8128, + "time_per_iteration": 2.564112901687622 + }, + { + "auxiliary_loss_clip": 0.06448022, + "auxiliary_loss_mlp": 0.01274106, + "balance_loss_clip": 0.06289175, + "balance_loss_mlp": 0.01260653, + "epoch": 0.48874192093792274, + "flos": 21293100028800.0, + "grad_norm": 1.6046032409788031, + "language_loss": 0.76479989, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.84202117, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13452148, + "step": 8129, + "time_per_iteration": 2.5374317169189453 + }, + { + "auxiliary_loss_clip": 0.06444047, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06286857, + "balance_loss_mlp": 0.01252944, + "epoch": 0.4888020441905907, + "flos": 14178647335680.0, + "grad_norm": 2.0974560904884867, + "language_loss": 0.65812773, + "learning_rate": 2.169407330666114e-06, + "loss": 0.735232, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13452148, + "step": 8130, + "time_per_iteration": 2.5409111976623535 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01269301, + "balance_loss_clip": 0.06286357, + "balance_loss_mlp": 0.01256528, + "epoch": 0.48886216744325867, + "flos": 24104813235840.0, + "grad_norm": 1.7915788803825166, + "language_loss": 0.72896582, + "learning_rate": 2.169019265427658e-06, + "loss": 0.80606037, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12768555, + "step": 8131, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.06451105, + "auxiliary_loss_mlp": 0.01270383, + "balance_loss_clip": 0.06289683, + "balance_loss_mlp": 0.01256811, + "epoch": 0.48892229069592663, + "flos": 38439838218240.0, + "grad_norm": 1.2588039875779695, + "language_loss": 0.69597721, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.77319217, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13586426, + "step": 8132, + "time_per_iteration": 2.70053768157959 + }, + { + "auxiliary_loss_clip": 0.06438366, + "auxiliary_loss_mlp": 0.01270585, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01257436, + "epoch": 0.4889824139485946, + "flos": 23850338785920.0, + "grad_norm": 2.3033814193981454, + "language_loss": 0.70031691, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.77740639, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13146973, + "step": 8133, + "time_per_iteration": 2.5559158325195312 + }, + { + "auxiliary_loss_clip": 0.06443258, + "auxiliary_loss_mlp": 0.01270512, + "balance_loss_clip": 0.0629006, + "balance_loss_mlp": 0.01257548, + "epoch": 0.48904253720126256, + "flos": 24432731389440.0, + "grad_norm": 1.67073327790382, + "language_loss": 0.71227533, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.78941303, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12957764, + "step": 8134, + "time_per_iteration": 2.5545125007629395 + }, + { + "auxiliary_loss_clip": 0.06444804, + "auxiliary_loss_mlp": 0.01271014, + "balance_loss_clip": 0.06283658, + "balance_loss_mlp": 0.01257055, + "epoch": 0.4891026604539306, + "flos": 24177586106880.0, + "grad_norm": 1.7998075455300961, + "language_loss": 0.80179673, + "learning_rate": 2.167466940528718e-06, + "loss": 0.87895489, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13977051, + "step": 8135, + "time_per_iteration": 2.54832124710083 + }, + { + "auxiliary_loss_clip": 0.06439205, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.06284894, + "balance_loss_mlp": 0.01255004, + "epoch": 0.48916278370659855, + "flos": 21477443011200.0, + "grad_norm": 1.5753098834035062, + "language_loss": 0.74565232, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.82271659, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12213135, + "step": 8136, + "time_per_iteration": 2.5225162506103516 + }, + { + "auxiliary_loss_clip": 0.06440099, + "auxiliary_loss_mlp": 0.01265964, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01253519, + "epoch": 0.4892229069592665, + "flos": 22316322562560.0, + "grad_norm": 1.5544220345156794, + "language_loss": 0.73698246, + "learning_rate": 2.166690739918204e-06, + "loss": 0.81404305, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12451172, + "step": 8137, + "time_per_iteration": 2.5138792991638184 + }, + { + "auxiliary_loss_clip": 0.06443799, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06287944, + "balance_loss_mlp": 0.01257673, + "epoch": 0.4892830302119345, + "flos": 12791812008960.0, + "grad_norm": 2.1813813764641448, + "language_loss": 0.75360358, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.83074719, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12890625, + "step": 8138, + "time_per_iteration": 2.52406644821167 + }, + { + "auxiliary_loss_clip": 0.06443107, + "auxiliary_loss_mlp": 0.01267703, + "balance_loss_clip": 0.06287149, + "balance_loss_mlp": 0.01255192, + "epoch": 0.48934315346460244, + "flos": 20820223111680.0, + "grad_norm": 1.5609881437350468, + "language_loss": 0.74361938, + "learning_rate": 2.165914514023972e-06, + "loss": 0.82072747, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12518311, + "step": 8139, + "time_per_iteration": 2.5139529705047607 + }, + { + "auxiliary_loss_clip": 0.0643822, + "auxiliary_loss_mlp": 0.01266126, + "balance_loss_clip": 0.06281914, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4894032767172704, + "flos": 19761641354880.0, + "grad_norm": 2.1585110635090388, + "language_loss": 0.62118167, + "learning_rate": 2.165526391632255e-06, + "loss": 0.69822514, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8140, + "time_per_iteration": 2.5321638584136963 + }, + { + "auxiliary_loss_clip": 0.06444136, + "auxiliary_loss_mlp": 0.01271459, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257506, + "epoch": 0.4894633999699384, + "flos": 17824292703360.0, + "grad_norm": 1.8580247423308633, + "language_loss": 0.82388717, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.90104312, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13946533, + "step": 8141, + "time_per_iteration": 2.4724786281585693 + }, + { + "auxiliary_loss_clip": 0.06448226, + "auxiliary_loss_mlp": 0.01272495, + "balance_loss_clip": 0.06290399, + "balance_loss_mlp": 0.01258279, + "epoch": 0.48952352322260634, + "flos": 25530781219200.0, + "grad_norm": 1.6913372633538968, + "language_loss": 0.72726512, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.80447233, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.14208984, + "step": 8142, + "time_per_iteration": 2.5858702659606934 + }, + { + "auxiliary_loss_clip": 0.06437673, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.062835, + "balance_loss_mlp": 0.01257624, + "epoch": 0.4895836464752743, + "flos": 29062508561280.0, + "grad_norm": 1.575435552323968, + "language_loss": 0.6727252, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.74980688, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12860107, + "step": 8143, + "time_per_iteration": 2.576084613800049 + }, + { + "auxiliary_loss_clip": 0.06441937, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06288718, + "balance_loss_mlp": 0.01254678, + "epoch": 0.48964376972794227, + "flos": 33555335034240.0, + "grad_norm": 1.550815752793646, + "language_loss": 0.75150239, + "learning_rate": 2.163973839444793e-06, + "loss": 0.82859099, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12243652, + "step": 8144, + "time_per_iteration": 2.641314744949341 + }, + { + "auxiliary_loss_clip": 0.06442292, + "auxiliary_loss_mlp": 0.01272411, + "balance_loss_clip": 0.06287357, + "balance_loss_mlp": 0.01259089, + "epoch": 0.48970389298061023, + "flos": 22060506447360.0, + "grad_norm": 1.55007225141579, + "language_loss": 0.75850821, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.83565521, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13311768, + "step": 8145, + "time_per_iteration": 2.5283498764038086 + }, + { + "auxiliary_loss_clip": 0.0644419, + "auxiliary_loss_mlp": 0.0126844, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4897640162332782, + "flos": 20090523830400.0, + "grad_norm": 1.8073715924768365, + "language_loss": 0.8057586, + "learning_rate": 2.163197525984761e-06, + "loss": 0.88288498, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13861084, + "step": 8146, + "time_per_iteration": 2.5433614253997803 + }, + { + "auxiliary_loss_clip": 0.06439323, + "auxiliary_loss_mlp": 0.01272664, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01260737, + "epoch": 0.48982413948594616, + "flos": 23813134773120.0, + "grad_norm": 1.5096911604618644, + "language_loss": 0.74847698, + "learning_rate": 2.162809359964687e-06, + "loss": 0.82559681, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11920166, + "step": 8147, + "time_per_iteration": 2.5623743534088135 + }, + { + "auxiliary_loss_clip": 0.06440282, + "auxiliary_loss_mlp": 0.01269967, + "balance_loss_clip": 0.06287088, + "balance_loss_mlp": 0.01256615, + "epoch": 0.4898842627386142, + "flos": 17645442163200.0, + "grad_norm": 1.9926710345073115, + "language_loss": 0.82984591, + "learning_rate": 2.162421187770864e-06, + "loss": 0.90694839, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.13360596, + "step": 8148, + "time_per_iteration": 2.5547962188720703 + }, + { + "auxiliary_loss_clip": 0.0644103, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.0629115, + "balance_loss_mlp": 0.01255363, + "epoch": 0.48994438599128215, + "flos": 16623519367680.0, + "grad_norm": 2.084842951303776, + "language_loss": 0.74672109, + "learning_rate": 2.162033009418015e-06, + "loss": 0.82380313, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11810303, + "step": 8149, + "time_per_iteration": 2.533867120742798 + }, + { + "auxiliary_loss_clip": 0.06448293, + "auxiliary_loss_mlp": 0.01270293, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.01256507, + "epoch": 0.4900045092439501, + "flos": 26622080795520.0, + "grad_norm": 1.692853589800977, + "language_loss": 0.76331913, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.840505, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13787842, + "step": 8150, + "time_per_iteration": 3.964707374572754 + }, + { + "auxiliary_loss_clip": 0.06450059, + "auxiliary_loss_mlp": 0.01271131, + "balance_loss_clip": 0.06294075, + "balance_loss_mlp": 0.01257833, + "epoch": 0.4900646324966181, + "flos": 19908361054080.0, + "grad_norm": 2.244817701974514, + "language_loss": 0.72999722, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.80720913, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13299561, + "step": 8151, + "time_per_iteration": 2.5549871921539307 + }, + { + "auxiliary_loss_clip": 0.06359711, + "auxiliary_loss_mlp": 0.01259283, + "balance_loss_clip": 0.06292651, + "balance_loss_mlp": 0.01257264, + "epoch": 0.49012475574928605, + "flos": 59207245729920.0, + "grad_norm": 0.8143029783085558, + "language_loss": 0.54076481, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.6169548, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02018738, + "step": 8152, + "time_per_iteration": 3.1047332286834717 + }, + { + "auxiliary_loss_clip": 0.06453663, + "auxiliary_loss_mlp": 0.01270304, + "balance_loss_clip": 0.06293964, + "balance_loss_mlp": 0.01257018, + "epoch": 0.490184879001954, + "flos": 45270285096960.0, + "grad_norm": 1.7665437022978014, + "language_loss": 0.6121304, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.68937004, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.13293457, + "step": 8153, + "time_per_iteration": 4.15813422203064 + }, + { + "auxiliary_loss_clip": 0.06445354, + "auxiliary_loss_mlp": 0.01267264, + "balance_loss_clip": 0.06291656, + "balance_loss_mlp": 0.01254074, + "epoch": 0.490245002254622, + "flos": 28009754663040.0, + "grad_norm": 1.583608688205754, + "language_loss": 0.76979434, + "learning_rate": 2.160092025783549e-06, + "loss": 0.84692061, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.13195801, + "step": 8154, + "time_per_iteration": 2.5994982719421387 + }, + { + "auxiliary_loss_clip": 0.06359019, + "auxiliary_loss_mlp": 0.01255517, + "balance_loss_clip": 0.06291451, + "balance_loss_mlp": 0.01253472, + "epoch": 0.49030512550728994, + "flos": 58971764229120.0, + "grad_norm": 1.0610708177187165, + "language_loss": 0.669397, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.74554235, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.02046204, + "step": 8155, + "time_per_iteration": 3.2433578968048096 + }, + { + "auxiliary_loss_clip": 0.06448951, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06294696, + "balance_loss_mlp": 0.0125743, + "epoch": 0.4903652487599579, + "flos": 19797922972800.0, + "grad_norm": 1.7256067083752205, + "language_loss": 0.77014565, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.84733009, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12072754, + "step": 8156, + "time_per_iteration": 2.5398688316345215 + }, + { + "auxiliary_loss_clip": 0.06449247, + "auxiliary_loss_mlp": 0.01273385, + "balance_loss_clip": 0.06294699, + "balance_loss_mlp": 0.01259384, + "epoch": 0.49042537201262587, + "flos": 21768492568320.0, + "grad_norm": 1.9286441434498818, + "language_loss": 0.84019762, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.91742396, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.14007568, + "step": 8157, + "time_per_iteration": 2.5673582553863525 + }, + { + "auxiliary_loss_clip": 0.06449863, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06293592, + "balance_loss_mlp": 0.01253701, + "epoch": 0.49048549526529384, + "flos": 18959043421440.0, + "grad_norm": 1.7147218979138201, + "language_loss": 0.79903084, + "learning_rate": 2.158539129514956e-06, + "loss": 0.87619579, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12927246, + "step": 8158, + "time_per_iteration": 3.982774496078491 + }, + { + "auxiliary_loss_clip": 0.0645184, + "auxiliary_loss_mlp": 0.01273348, + "balance_loss_clip": 0.06292954, + "balance_loss_mlp": 0.01259615, + "epoch": 0.4905456185179618, + "flos": 26913633477120.0, + "grad_norm": 1.6654114756309404, + "language_loss": 0.69551659, + "learning_rate": 2.158150890381454e-06, + "loss": 0.77276844, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1373291, + "step": 8159, + "time_per_iteration": 2.6114954948425293 + }, + { + "auxiliary_loss_clip": 0.06446424, + "auxiliary_loss_mlp": 0.01266602, + "balance_loss_clip": 0.06292199, + "balance_loss_mlp": 0.01253591, + "epoch": 0.49060574177062977, + "flos": 20418567765120.0, + "grad_norm": 1.7624184717579066, + "language_loss": 0.73495585, + "learning_rate": 2.157762645250854e-06, + "loss": 0.81208611, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13006592, + "step": 8160, + "time_per_iteration": 2.5310287475585938 + }, + { + "auxiliary_loss_clip": 0.06446327, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06286773, + "balance_loss_mlp": 0.01254718, + "epoch": 0.4906658650232978, + "flos": 17499477150720.0, + "grad_norm": 1.9303786573731354, + "language_loss": 0.71921647, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.79636657, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13952637, + "step": 8161, + "time_per_iteration": 2.548387050628662 + }, + { + "auxiliary_loss_clip": 0.06438495, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06285487, + "balance_loss_mlp": 0.01257102, + "epoch": 0.49072598827596575, + "flos": 26621619598080.0, + "grad_norm": 1.7423183419157489, + "language_loss": 0.68838918, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.76547247, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12738037, + "step": 8162, + "time_per_iteration": 2.5565345287323 + }, + { + "auxiliary_loss_clip": 0.06445014, + "auxiliary_loss_mlp": 0.01271543, + "balance_loss_clip": 0.06284854, + "balance_loss_mlp": 0.01258048, + "epoch": 0.4907861115286337, + "flos": 20418861254400.0, + "grad_norm": 1.5998221011516633, + "language_loss": 0.6369257, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.7140913, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1350708, + "step": 8163, + "time_per_iteration": 2.545926094055176 + }, + { + "auxiliary_loss_clip": 0.0643242, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01260471, + "epoch": 0.4908462347813017, + "flos": 14069508992640.0, + "grad_norm": 1.9421890992027433, + "language_loss": 0.77104688, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.84810019, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12438965, + "step": 8164, + "time_per_iteration": 3.93280029296875 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01271936, + "balance_loss_clip": 0.06285694, + "balance_loss_mlp": 0.01258382, + "epoch": 0.49090635803396965, + "flos": 18741227932800.0, + "grad_norm": 1.56961735096587, + "language_loss": 0.77229172, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.84944236, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.13562012, + "step": 8165, + "time_per_iteration": 2.493861436843872 + }, + { + "auxiliary_loss_clip": 0.06434909, + "auxiliary_loss_mlp": 0.01271922, + "balance_loss_clip": 0.06283913, + "balance_loss_mlp": 0.01258922, + "epoch": 0.4909664812866376, + "flos": 20564784339840.0, + "grad_norm": 2.2518376482371862, + "language_loss": 0.77749753, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.85456586, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.13006592, + "step": 8166, + "time_per_iteration": 2.578685760498047 + }, + { + "auxiliary_loss_clip": 0.06343444, + "auxiliary_loss_mlp": 0.01254597, + "balance_loss_clip": 0.06276363, + "balance_loss_mlp": 0.01252508, + "epoch": 0.4910266045393056, + "flos": 54704006622720.0, + "grad_norm": 0.7970989298383858, + "language_loss": 0.54202092, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.61800134, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02090454, + "step": 8167, + "time_per_iteration": 3.1805777549743652 + }, + { + "auxiliary_loss_clip": 0.06435132, + "auxiliary_loss_mlp": 0.01271015, + "balance_loss_clip": 0.06282446, + "balance_loss_mlp": 0.01257902, + "epoch": 0.49108672779197354, + "flos": 16250892261120.0, + "grad_norm": 1.7548504171286585, + "language_loss": 0.86375958, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.94082105, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13110352, + "step": 8168, + "time_per_iteration": 2.5346431732177734 + }, + { + "auxiliary_loss_clip": 0.06439523, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06287682, + "balance_loss_mlp": 0.01257667, + "epoch": 0.4911468510446415, + "flos": 19831018135680.0, + "grad_norm": 1.6618595444085258, + "language_loss": 0.73708379, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.81418014, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12438965, + "step": 8169, + "time_per_iteration": 2.519845962524414 + }, + { + "auxiliary_loss_clip": 0.06435073, + "auxiliary_loss_mlp": 0.01267032, + "balance_loss_clip": 0.06282359, + "balance_loss_mlp": 0.01254795, + "epoch": 0.4912069742973095, + "flos": 21218650076160.0, + "grad_norm": 1.7105636772686297, + "language_loss": 0.78364748, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.86066854, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12237549, + "step": 8170, + "time_per_iteration": 2.5751500129699707 + }, + { + "auxiliary_loss_clip": 0.06441889, + "auxiliary_loss_mlp": 0.01268553, + "balance_loss_clip": 0.06285594, + "balance_loss_mlp": 0.01255547, + "epoch": 0.49126709754997744, + "flos": 19543280814720.0, + "grad_norm": 2.6389457816540527, + "language_loss": 0.76311809, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.84022248, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8171, + "time_per_iteration": 2.5004677772521973 + }, + { + "auxiliary_loss_clip": 0.06443939, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06285004, + "balance_loss_mlp": 0.01255947, + "epoch": 0.4913272208026454, + "flos": 12244568993280.0, + "grad_norm": 2.2552468133898684, + "language_loss": 0.81709123, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.89421463, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12457275, + "step": 8172, + "time_per_iteration": 2.5347814559936523 + }, + { + "auxiliary_loss_clip": 0.06338271, + "auxiliary_loss_mlp": 0.01256316, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.0125441, + "epoch": 0.49138734405531337, + "flos": 65484663661440.0, + "grad_norm": 0.6802144154671269, + "language_loss": 0.5333854, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.60933125, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01902771, + "step": 8173, + "time_per_iteration": 3.1376869678497314 + }, + { + "auxiliary_loss_clip": 0.06444144, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.0628697, + "balance_loss_mlp": 0.01253663, + "epoch": 0.4914474673079814, + "flos": 18444434371200.0, + "grad_norm": 1.9185770389222636, + "language_loss": 0.6246022, + "learning_rate": 2.152326591972107e-06, + "loss": 0.70171648, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1361084, + "step": 8174, + "time_per_iteration": 2.5815811157226562 + }, + { + "auxiliary_loss_clip": 0.06439996, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.0628511, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49150759056064935, + "flos": 21690772306560.0, + "grad_norm": 2.0568306898238045, + "language_loss": 0.69594127, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.77307451, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1282959, + "step": 8175, + "time_per_iteration": 2.5219566822052 + }, + { + "auxiliary_loss_clip": 0.06442218, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06288453, + "balance_loss_mlp": 0.01255021, + "epoch": 0.4915677138133173, + "flos": 22388969652480.0, + "grad_norm": 1.5433299767806794, + "language_loss": 0.74403, + "learning_rate": 2.151549919570068e-06, + "loss": 0.82113051, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12805176, + "step": 8176, + "time_per_iteration": 2.5598292350769043 + }, + { + "auxiliary_loss_clip": 0.0643885, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.0628263, + "balance_loss_mlp": 0.01259977, + "epoch": 0.4916278370659853, + "flos": 18408320461440.0, + "grad_norm": 1.8239688366126487, + "language_loss": 0.70529395, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.78241211, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12988281, + "step": 8177, + "time_per_iteration": 2.5329604148864746 + }, + { + "auxiliary_loss_clip": 0.06340313, + "auxiliary_loss_mlp": 0.01256045, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01253889, + "epoch": 0.49168796031865325, + "flos": 66630147701760.0, + "grad_norm": 0.6656640602529083, + "language_loss": 0.46068031, + "learning_rate": 2.150773224180877e-06, + "loss": 0.53664386, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02159119, + "step": 8178, + "time_per_iteration": 3.170982837677002 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01259597, + "epoch": 0.4917480835713212, + "flos": 20965601145600.0, + "grad_norm": 2.2617000627187407, + "language_loss": 0.6597743, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.73695886, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13299561, + "step": 8179, + "time_per_iteration": 2.5594394207000732 + }, + { + "auxiliary_loss_clip": 0.06447062, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.06285466, + "balance_loss_mlp": 0.01254088, + "epoch": 0.4918082068239892, + "flos": 15777386438400.0, + "grad_norm": 2.2633588866978442, + "language_loss": 0.70069337, + "learning_rate": 2.149996505922343e-06, + "loss": 0.77783871, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1338501, + "step": 8180, + "time_per_iteration": 2.489649772644043 + }, + { + "auxiliary_loss_clip": 0.0643749, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.06285596, + "balance_loss_mlp": 0.01254406, + "epoch": 0.49186833007665715, + "flos": 24611162659200.0, + "grad_norm": 1.7052643417851399, + "language_loss": 0.84654552, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.92359537, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.13098145, + "step": 8181, + "time_per_iteration": 2.570831298828125 + }, + { + "auxiliary_loss_clip": 0.06432545, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01259843, + "epoch": 0.4919284533293251, + "flos": 22097039627520.0, + "grad_norm": 1.9771399001803804, + "language_loss": 0.73092818, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.80796945, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8182, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.06435409, + "auxiliary_loss_mlp": 0.01272985, + "balance_loss_clip": 0.06280539, + "balance_loss_mlp": 0.01260826, + "epoch": 0.4919885765819931, + "flos": 23374820465280.0, + "grad_norm": 1.9470010509475855, + "language_loss": 0.73167384, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.80875778, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.1217041, + "step": 8183, + "time_per_iteration": 2.5529325008392334 + }, + { + "auxiliary_loss_clip": 0.06441429, + "auxiliary_loss_mlp": 0.01268017, + "balance_loss_clip": 0.06279727, + "balance_loss_mlp": 0.01254523, + "epoch": 0.49204869983466104, + "flos": 21366795294720.0, + "grad_norm": 2.013163662705091, + "language_loss": 0.77443838, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.85153282, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1348877, + "step": 8184, + "time_per_iteration": 2.508230209350586 + }, + { + "auxiliary_loss_clip": 0.06435518, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06281742, + "balance_loss_mlp": 0.01254523, + "epoch": 0.492108823087329, + "flos": 21149147514240.0, + "grad_norm": 2.3088868689892674, + "language_loss": 0.71377504, + "learning_rate": 2.148054610995789e-06, + "loss": 0.79079902, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12347412, + "step": 8185, + "time_per_iteration": 2.545316219329834 + }, + { + "auxiliary_loss_clip": 0.06437825, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06280625, + "balance_loss_mlp": 0.01253074, + "epoch": 0.49216894633999697, + "flos": 25123214160000.0, + "grad_norm": 1.8318004423040046, + "language_loss": 0.75395268, + "learning_rate": 2.147666215108831e-06, + "loss": 0.8309986, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.13684082, + "step": 8186, + "time_per_iteration": 2.5238165855407715 + }, + { + "auxiliary_loss_clip": 0.06435218, + "auxiliary_loss_mlp": 0.01274022, + "balance_loss_clip": 0.06281888, + "balance_loss_mlp": 0.01261124, + "epoch": 0.49222906959266494, + "flos": 22644534205440.0, + "grad_norm": 2.2257308208746975, + "language_loss": 0.68571508, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.76280749, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12908936, + "step": 8187, + "time_per_iteration": 2.561488151550293 + }, + { + "auxiliary_loss_clip": 0.06434098, + "auxiliary_loss_mlp": 0.01272206, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01259653, + "epoch": 0.49228919284533296, + "flos": 20416471413120.0, + "grad_norm": 1.3887162782350388, + "language_loss": 0.67211652, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.7491796, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12567139, + "step": 8188, + "time_per_iteration": 2.5019164085388184 + }, + { + "auxiliary_loss_clip": 0.06437577, + "auxiliary_loss_mlp": 0.01267268, + "balance_loss_clip": 0.06282844, + "balance_loss_mlp": 0.012549, + "epoch": 0.4923493160980009, + "flos": 27129142978560.0, + "grad_norm": 1.6466242872646388, + "language_loss": 0.74921268, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.8262611, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12365723, + "step": 8189, + "time_per_iteration": 2.6160171031951904 + }, + { + "auxiliary_loss_clip": 0.06432211, + "auxiliary_loss_mlp": 0.01271904, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01259363, + "epoch": 0.4924094393506689, + "flos": 35745522981120.0, + "grad_norm": 1.6094215463667148, + "language_loss": 0.64780444, + "learning_rate": 2.146112575713104e-06, + "loss": 0.72484565, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12542725, + "step": 8190, + "time_per_iteration": 4.0641090869903564 + }, + { + "auxiliary_loss_clip": 0.06438321, + "auxiliary_loss_mlp": 0.01273117, + "balance_loss_clip": 0.06285122, + "balance_loss_mlp": 0.01260486, + "epoch": 0.49246956260333685, + "flos": 20418735473280.0, + "grad_norm": 1.8613448606205585, + "language_loss": 0.71446037, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.79157472, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12628174, + "step": 8191, + "time_per_iteration": 2.5388033390045166 + }, + { + "auxiliary_loss_clip": 0.06437817, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282701, + "balance_loss_mlp": 0.01253193, + "epoch": 0.4925296858560048, + "flos": 38985152590080.0, + "grad_norm": 1.8396866027790106, + "language_loss": 0.72404003, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.80107331, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12335205, + "step": 8192, + "time_per_iteration": 2.696115255355835 + }, + { + "auxiliary_loss_clip": 0.06334923, + "auxiliary_loss_mlp": 0.01254622, + "balance_loss_clip": 0.06267789, + "balance_loss_mlp": 0.01252217, + "epoch": 0.4925898091086728, + "flos": 64300367652480.0, + "grad_norm": 0.7283072322766662, + "language_loss": 0.51975358, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.59564906, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02401733, + "step": 8193, + "time_per_iteration": 4.540759086608887 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01259417, + "epoch": 0.49264993236134075, + "flos": 23042541899520.0, + "grad_norm": 1.3982393371006636, + "language_loss": 0.77103728, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.84810621, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12554932, + "step": 8194, + "time_per_iteration": 2.585632085800171 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257376, + "epoch": 0.4927100556140087, + "flos": 24725248392960.0, + "grad_norm": 2.1551580003064186, + "language_loss": 0.70539922, + "learning_rate": 2.144170401915341e-06, + "loss": 0.78244197, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12329102, + "step": 8195, + "time_per_iteration": 2.5881664752960205 + }, + { + "auxiliary_loss_clip": 0.06438025, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01257687, + "epoch": 0.4927701788666767, + "flos": 23510932623360.0, + "grad_norm": 2.3036054872688765, + "language_loss": 0.81165189, + "learning_rate": 2.143781950696001e-06, + "loss": 0.88872838, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11932373, + "step": 8196, + "time_per_iteration": 2.5550785064697266 + }, + { + "auxiliary_loss_clip": 0.06437081, + "auxiliary_loss_mlp": 0.01270899, + "balance_loss_clip": 0.06279114, + "balance_loss_mlp": 0.01258311, + "epoch": 0.49283030211934464, + "flos": 22935374127360.0, + "grad_norm": 1.9095456135696567, + "language_loss": 0.70909548, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.78617525, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12597656, + "step": 8197, + "time_per_iteration": 4.003530263900757 + }, + { + "auxiliary_loss_clip": 0.06434973, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.0628255, + "balance_loss_mlp": 0.01259699, + "epoch": 0.4928904253720126, + "flos": 16878622723200.0, + "grad_norm": 1.745870627956974, + "language_loss": 0.84271383, + "learning_rate": 2.143005031915374e-06, + "loss": 0.91977608, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.11553955, + "step": 8198, + "time_per_iteration": 2.498107671737671 + }, + { + "auxiliary_loss_clip": 0.06443786, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06287393, + "balance_loss_mlp": 0.01254521, + "epoch": 0.4929505486246806, + "flos": 14871855363840.0, + "grad_norm": 1.7338591596570678, + "language_loss": 0.76126587, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.83838832, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13946533, + "step": 8199, + "time_per_iteration": 2.5254313945770264 + }, + { + "auxiliary_loss_clip": 0.06436033, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06279432, + "balance_loss_mlp": 0.01252808, + "epoch": 0.49301067187734854, + "flos": 23849206755840.0, + "grad_norm": 1.3683337876027823, + "language_loss": 0.60070461, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.67772967, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13647461, + "step": 8200, + "time_per_iteration": 2.54241943359375 + }, + { + "auxiliary_loss_clip": 0.06429607, + "auxiliary_loss_mlp": 0.01273188, + "balance_loss_clip": 0.06281705, + "balance_loss_mlp": 0.01261541, + "epoch": 0.49307079513001656, + "flos": 22497730652160.0, + "grad_norm": 1.4845406915411774, + "language_loss": 0.79454738, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.87157536, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11657715, + "step": 8201, + "time_per_iteration": 2.590289831161499 + }, + { + "auxiliary_loss_clip": 0.0644393, + "auxiliary_loss_mlp": 0.01272695, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01259171, + "epoch": 0.4931309183826845, + "flos": 15930059777280.0, + "grad_norm": 1.9752291134223394, + "language_loss": 0.66993362, + "learning_rate": 2.141451129398785e-06, + "loss": 0.74709988, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13525391, + "step": 8202, + "time_per_iteration": 2.5706307888031006 + }, + { + "auxiliary_loss_clip": 0.06429332, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06277282, + "balance_loss_mlp": 0.01256055, + "epoch": 0.4931910416353525, + "flos": 27316588561920.0, + "grad_norm": 1.8969992308716948, + "language_loss": 0.75337243, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.83034456, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11834717, + "step": 8203, + "time_per_iteration": 4.0727972984313965 + }, + { + "auxiliary_loss_clip": 0.06434371, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01253626, + "epoch": 0.49325116488802045, + "flos": 20811166871040.0, + "grad_norm": 2.0494104605673935, + "language_loss": 0.80605292, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.8830539, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12103271, + "step": 8204, + "time_per_iteration": 2.6136350631713867 + }, + { + "auxiliary_loss_clip": 0.0643463, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06280951, + "balance_loss_mlp": 0.01254664, + "epoch": 0.4933112881406884, + "flos": 19872247144320.0, + "grad_norm": 1.7256783924705517, + "language_loss": 0.65881336, + "learning_rate": 2.140285646139455e-06, + "loss": 0.73583329, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12695312, + "step": 8205, + "time_per_iteration": 2.5172812938690186 + }, + { + "auxiliary_loss_clip": 0.06445079, + "auxiliary_loss_mlp": 0.01273568, + "balance_loss_clip": 0.06283986, + "balance_loss_mlp": 0.0125971, + "epoch": 0.4933714113933564, + "flos": 21833215447680.0, + "grad_norm": 1.6546444342030124, + "language_loss": 0.66620767, + "learning_rate": 2.139897141060744e-06, + "loss": 0.74339426, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13861084, + "step": 8206, + "time_per_iteration": 2.556596040725708 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01253539, + "epoch": 0.49343153464602435, + "flos": 27897304083840.0, + "grad_norm": 1.8364733010130068, + "language_loss": 0.77070463, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.84770155, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.11584473, + "step": 8207, + "time_per_iteration": 2.591074228286743 + }, + { + "auxiliary_loss_clip": 0.06430385, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01256174, + "epoch": 0.4934916578986923, + "flos": 24688002453120.0, + "grad_norm": 2.876199477758729, + "language_loss": 0.60526079, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.68224895, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12261963, + "step": 8208, + "time_per_iteration": 2.5641872882843018 + }, + { + "auxiliary_loss_clip": 0.06432977, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279885, + "balance_loss_mlp": 0.01256548, + "epoch": 0.4935517811513603, + "flos": 23412024478080.0, + "grad_norm": 2.3268226049750025, + "language_loss": 0.79136336, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.86838233, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12365723, + "step": 8209, + "time_per_iteration": 2.5345427989959717 + }, + { + "auxiliary_loss_clip": 0.06431048, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01253917, + "epoch": 0.49361190440402825, + "flos": 21950948833920.0, + "grad_norm": 3.2965997735856423, + "language_loss": 0.79514015, + "learning_rate": 2.138343067844089e-06, + "loss": 0.87211347, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12359619, + "step": 8210, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06438643, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06280634, + "balance_loss_mlp": 0.01256629, + "epoch": 0.4936720276566962, + "flos": 25122124056960.0, + "grad_norm": 2.539502696257949, + "language_loss": 0.81421793, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8912915, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12084961, + "step": 8211, + "time_per_iteration": 2.5667943954467773 + }, + { + "auxiliary_loss_clip": 0.06429391, + "auxiliary_loss_mlp": 0.0126729, + "balance_loss_clip": 0.06274866, + "balance_loss_mlp": 0.01254803, + "epoch": 0.4937321509093642, + "flos": 26366055045120.0, + "grad_norm": 2.1078758653058913, + "language_loss": 0.91783321, + "learning_rate": 2.137565999700933e-06, + "loss": 0.99480009, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12506104, + "step": 8212, + "time_per_iteration": 2.5892627239227295 + }, + { + "auxiliary_loss_clip": 0.06437102, + "auxiliary_loss_mlp": 0.01269581, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01257511, + "epoch": 0.49379227416203214, + "flos": 22967211479040.0, + "grad_norm": 1.9203573298750467, + "language_loss": 0.65474772, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.7318145, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1206665, + "step": 8213, + "time_per_iteration": 2.5766966342926025 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06281263, + "balance_loss_mlp": 0.01254957, + "epoch": 0.49385239741470016, + "flos": 32497340256000.0, + "grad_norm": 5.5178519689557435, + "language_loss": 0.76015925, + "learning_rate": 2.136788910691711e-06, + "loss": 0.83718324, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1217041, + "step": 8214, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.06435767, + "auxiliary_loss_mlp": 0.01267382, + "balance_loss_clip": 0.06282468, + "balance_loss_mlp": 0.0125508, + "epoch": 0.4939125206673681, + "flos": 22499575441920.0, + "grad_norm": 1.6727543381074526, + "language_loss": 0.84167933, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.91871083, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12298584, + "step": 8215, + "time_per_iteration": 2.6213715076446533 + }, + { + "auxiliary_loss_clip": 0.06426814, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06280927, + "balance_loss_mlp": 0.0125696, + "epoch": 0.4939726439200361, + "flos": 31184493684480.0, + "grad_norm": 1.9918722360209278, + "language_loss": 0.83712834, + "learning_rate": 2.136011800934292e-06, + "loss": 0.91407919, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11315918, + "step": 8216, + "time_per_iteration": 2.619922637939453 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06283373, + "balance_loss_mlp": 0.01260614, + "epoch": 0.49403276717270406, + "flos": 22680773896320.0, + "grad_norm": 1.6954468061355052, + "language_loss": 0.75099367, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.82805896, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11791992, + "step": 8217, + "time_per_iteration": 2.5473809242248535 + }, + { + "auxiliary_loss_clip": 0.06434639, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06285703, + "balance_loss_mlp": 0.01258422, + "epoch": 0.494092890425372, + "flos": 20747408313600.0, + "grad_norm": 1.6176152886760666, + "language_loss": 0.78781378, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.86487138, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12689209, + "step": 8218, + "time_per_iteration": 2.542994976043701 + }, + { + "auxiliary_loss_clip": 0.06433167, + "auxiliary_loss_mlp": 0.01265257, + "balance_loss_clip": 0.06283546, + "balance_loss_mlp": 0.01253628, + "epoch": 0.49415301367804, + "flos": 18374889882240.0, + "grad_norm": 2.39829798701753, + "language_loss": 0.77065396, + "learning_rate": 2.134846097653142e-06, + "loss": 0.84763819, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11639404, + "step": 8219, + "time_per_iteration": 2.5450475215911865 + }, + { + "auxiliary_loss_clip": 0.06439486, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.06285974, + "balance_loss_mlp": 0.01258321, + "epoch": 0.49421313693070795, + "flos": 17536471528320.0, + "grad_norm": 2.258549541306087, + "language_loss": 0.62705898, + "learning_rate": 2.134457519646357e-06, + "loss": 0.70415157, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11462402, + "step": 8220, + "time_per_iteration": 2.5296928882598877 + }, + { + "auxiliary_loss_clip": 0.06433114, + "auxiliary_loss_mlp": 0.01270633, + "balance_loss_clip": 0.06280304, + "balance_loss_mlp": 0.01259076, + "epoch": 0.4942732601833759, + "flos": 20818210613760.0, + "grad_norm": 1.8931623619102378, + "language_loss": 0.72802091, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.80505836, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11572266, + "step": 8221, + "time_per_iteration": 2.521430253982544 + }, + { + "auxiliary_loss_clip": 0.06441319, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06292681, + "balance_loss_mlp": 0.01258761, + "epoch": 0.4943333834360439, + "flos": 15054269702400.0, + "grad_norm": 1.6896047494674526, + "language_loss": 0.79253769, + "learning_rate": 2.133680348351595e-06, + "loss": 0.86965781, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11920166, + "step": 8222, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06434612, + "auxiliary_loss_mlp": 0.01272431, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49439350668871185, + "flos": 16075899008640.0, + "grad_norm": 6.490136916654426, + "language_loss": 0.72483402, + "learning_rate": 2.133291755093088e-06, + "loss": 0.80190444, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.1192627, + "step": 8223, + "time_per_iteration": 2.457361936569214 + }, + { + "auxiliary_loss_clip": 0.06444422, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06287469, + "balance_loss_mlp": 0.01257367, + "epoch": 0.4944536299413798, + "flos": 20885281407360.0, + "grad_norm": 1.6318042764148617, + "language_loss": 0.75256205, + "learning_rate": 2.132903156780144e-06, + "loss": 0.82971096, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.13122559, + "step": 8224, + "time_per_iteration": 2.5326499938964844 + }, + { + "auxiliary_loss_clip": 0.06441943, + "auxiliary_loss_mlp": 0.01267954, + "balance_loss_clip": 0.06287307, + "balance_loss_mlp": 0.01255646, + "epoch": 0.4945137531940478, + "flos": 26615162833920.0, + "grad_norm": 2.58625148433793, + "language_loss": 0.64002287, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.71712184, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12322998, + "step": 8225, + "time_per_iteration": 2.555088996887207 + }, + { + "auxiliary_loss_clip": 0.06438252, + "auxiliary_loss_mlp": 0.01269636, + "balance_loss_clip": 0.06283222, + "balance_loss_mlp": 0.01258007, + "epoch": 0.49457387644671574, + "flos": 23995004060160.0, + "grad_norm": 2.0569415863505554, + "language_loss": 0.77084112, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.84792, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11627197, + "step": 8226, + "time_per_iteration": 2.557900905609131 + }, + { + "auxiliary_loss_clip": 0.06436731, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06281079, + "balance_loss_mlp": 0.01256958, + "epoch": 0.49463399969938376, + "flos": 26983387601280.0, + "grad_norm": 1.6446627405679832, + "language_loss": 0.71402973, + "learning_rate": 2.131737331662051e-06, + "loss": 0.79110235, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13568115, + "step": 8227, + "time_per_iteration": 2.533468246459961 + }, + { + "auxiliary_loss_clip": 0.06441461, + "auxiliary_loss_mlp": 0.01270684, + "balance_loss_clip": 0.06282251, + "balance_loss_mlp": 0.01258477, + "epoch": 0.49469412295205173, + "flos": 29689610117760.0, + "grad_norm": 1.6469495440568809, + "language_loss": 0.7179364, + "learning_rate": 2.131348713278718e-06, + "loss": 0.79505783, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12213135, + "step": 8228, + "time_per_iteration": 2.621777296066284 + }, + { + "auxiliary_loss_clip": 0.06432875, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_clip": 0.06283268, + "balance_loss_mlp": 0.01259768, + "epoch": 0.4947542462047197, + "flos": 24138285742080.0, + "grad_norm": 1.3686875437171686, + "language_loss": 0.84044397, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.91748512, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1149292, + "step": 8229, + "time_per_iteration": 2.620849609375 + }, + { + "auxiliary_loss_clip": 0.06443636, + "auxiliary_loss_mlp": 0.01271474, + "balance_loss_clip": 0.0628624, + "balance_loss_mlp": 0.01258134, + "epoch": 0.49481436945738766, + "flos": 20050804195200.0, + "grad_norm": 2.3211713476829656, + "language_loss": 0.75208747, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.82923853, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13342285, + "step": 8230, + "time_per_iteration": 3.9126293659210205 + }, + { + "auxiliary_loss_clip": 0.06439002, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01256432, + "epoch": 0.4948744927100556, + "flos": 15675040275840.0, + "grad_norm": 1.9615207178823395, + "language_loss": 0.80548179, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.88256031, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1239624, + "step": 8231, + "time_per_iteration": 2.525049924850464 + }, + { + "auxiliary_loss_clip": 0.06329959, + "auxiliary_loss_mlp": 0.01257972, + "balance_loss_clip": 0.06262948, + "balance_loss_mlp": 0.0125556, + "epoch": 0.4949346159627236, + "flos": 68893611644160.0, + "grad_norm": 0.7512177245674743, + "language_loss": 0.60052431, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.67640364, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02409363, + "step": 8232, + "time_per_iteration": 4.674450159072876 + }, + { + "auxiliary_loss_clip": 0.06440374, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.0125631, + "epoch": 0.49499473921539155, + "flos": 24797182723200.0, + "grad_norm": 1.782814520641974, + "language_loss": 0.68933427, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.76643485, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13366699, + "step": 8233, + "time_per_iteration": 2.574759006500244 + }, + { + "auxiliary_loss_clip": 0.06426412, + "auxiliary_loss_mlp": 0.01270358, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01258508, + "epoch": 0.4950548624680595, + "flos": 32716161993600.0, + "grad_norm": 2.8586701341507355, + "language_loss": 0.6684472, + "learning_rate": 2.129016898898633e-06, + "loss": 0.74541491, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1184082, + "step": 8234, + "time_per_iteration": 2.653381824493408 + }, + { + "auxiliary_loss_clip": 0.06329186, + "auxiliary_loss_mlp": 0.0125637, + "balance_loss_clip": 0.06261852, + "balance_loss_mlp": 0.01254119, + "epoch": 0.4951149857207275, + "flos": 50100616287360.0, + "grad_norm": 0.7779673724008701, + "language_loss": 0.58149666, + "learning_rate": 2.128628245959482e-06, + "loss": 0.65735215, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02255249, + "step": 8235, + "time_per_iteration": 3.0858991146087646 + }, + { + "auxiliary_loss_clip": 0.06437027, + "auxiliary_loss_mlp": 0.01272544, + "balance_loss_clip": 0.06281243, + "balance_loss_mlp": 0.01259401, + "epoch": 0.49517510897339545, + "flos": 22243340056320.0, + "grad_norm": 1.7279160321905627, + "language_loss": 0.77504063, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.85213637, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13134766, + "step": 8236, + "time_per_iteration": 2.5753977298736572 + }, + { + "auxiliary_loss_clip": 0.06428996, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.06278376, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4952352322260634, + "flos": 25381126627200.0, + "grad_norm": 1.6842676088909172, + "language_loss": 0.72880518, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.80577087, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11529541, + "step": 8237, + "time_per_iteration": 4.036882400512695 + }, + { + "auxiliary_loss_clip": 0.06434725, + "auxiliary_loss_mlp": 0.01270554, + "balance_loss_clip": 0.06283747, + "balance_loss_mlp": 0.0125787, + "epoch": 0.4952953554787314, + "flos": 24615732706560.0, + "grad_norm": 2.2000126991913285, + "language_loss": 0.75703216, + "learning_rate": 2.127462257935406e-06, + "loss": 0.83408493, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12664795, + "step": 8238, + "time_per_iteration": 2.549431085586548 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257081, + "epoch": 0.49535547873139935, + "flos": 17317020885120.0, + "grad_norm": 2.278500195677925, + "language_loss": 0.74391794, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.82096863, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12615967, + "step": 8239, + "time_per_iteration": 2.571343183517456 + }, + { + "auxiliary_loss_clip": 0.06438212, + "auxiliary_loss_mlp": 0.01271609, + "balance_loss_clip": 0.06280148, + "balance_loss_mlp": 0.01257917, + "epoch": 0.4954156019840673, + "flos": 20746527845760.0, + "grad_norm": 2.0000035114581927, + "language_loss": 0.79093564, + "learning_rate": 2.126684908394552e-06, + "loss": 0.86803377, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13684082, + "step": 8240, + "time_per_iteration": 2.531712532043457 + }, + { + "auxiliary_loss_clip": 0.06430051, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06279683, + "balance_loss_mlp": 0.0125594, + "epoch": 0.49547572523673533, + "flos": 12825200661120.0, + "grad_norm": 2.1298693498085592, + "language_loss": 0.86484092, + "learning_rate": 2.126296226410898e-06, + "loss": 0.94181418, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11334229, + "step": 8241, + "time_per_iteration": 2.5414860248565674 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01270719, + "balance_loss_clip": 0.06279866, + "balance_loss_mlp": 0.01260003, + "epoch": 0.4955358484894033, + "flos": 15602602821120.0, + "grad_norm": 1.7100085929309539, + "language_loss": 0.77987742, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.85685694, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10723877, + "step": 8242, + "time_per_iteration": 2.500761032104492 + }, + { + "auxiliary_loss_clip": 0.06436419, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06284536, + "balance_loss_mlp": 0.0125308, + "epoch": 0.49559597174207126, + "flos": 26470832976000.0, + "grad_norm": 1.8102794432235507, + "language_loss": 0.67317849, + "learning_rate": 2.125518848090833e-06, + "loss": 0.75019407, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.1206665, + "step": 8243, + "time_per_iteration": 4.062270641326904 + }, + { + "auxiliary_loss_clip": 0.06430024, + "auxiliary_loss_mlp": 0.01269105, + "balance_loss_clip": 0.06279217, + "balance_loss_mlp": 0.0125722, + "epoch": 0.4956560949947392, + "flos": 23154824770560.0, + "grad_norm": 2.721585758888369, + "language_loss": 0.68786383, + "learning_rate": 2.125130151783901e-06, + "loss": 0.76485521, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11889648, + "step": 8244, + "time_per_iteration": 2.55732798576355 + }, + { + "auxiliary_loss_clip": 0.06434646, + "auxiliary_loss_mlp": 0.01266504, + "balance_loss_clip": 0.06280981, + "balance_loss_mlp": 0.01254541, + "epoch": 0.4957162182474072, + "flos": 20779119884160.0, + "grad_norm": 2.485823072522516, + "language_loss": 0.75575739, + "learning_rate": 2.12474145073202e-06, + "loss": 0.83276892, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.11962891, + "step": 8245, + "time_per_iteration": 2.5086231231689453 + }, + { + "auxiliary_loss_clip": 0.06428742, + "auxiliary_loss_mlp": 0.01268325, + "balance_loss_clip": 0.06280199, + "balance_loss_mlp": 0.01256762, + "epoch": 0.49577634150007516, + "flos": 18740179756800.0, + "grad_norm": 1.8890947976192427, + "language_loss": 0.81602311, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.89299381, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11572266, + "step": 8246, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.06440324, + "auxiliary_loss_mlp": 0.01268715, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01256347, + "epoch": 0.4958364647527431, + "flos": 25560815708160.0, + "grad_norm": 1.7539344008969155, + "language_loss": 0.84379256, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.92088294, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12359619, + "step": 8247, + "time_per_iteration": 2.5563809871673584 + }, + { + "auxiliary_loss_clip": 0.06436694, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06282616, + "balance_loss_mlp": 0.01257798, + "epoch": 0.4958965880054111, + "flos": 24432144410880.0, + "grad_norm": 2.2837128243369658, + "language_loss": 0.84184051, + "learning_rate": 2.123575319254087e-06, + "loss": 0.91890538, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12011719, + "step": 8248, + "time_per_iteration": 2.566392660140991 + }, + { + "auxiliary_loss_clip": 0.0643697, + "auxiliary_loss_mlp": 0.01268541, + "balance_loss_clip": 0.06282248, + "balance_loss_mlp": 0.01256024, + "epoch": 0.49595671125807905, + "flos": 25090622121600.0, + "grad_norm": 1.727142692455913, + "language_loss": 0.73609596, + "learning_rate": 2.123186599369812e-06, + "loss": 0.813151, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12518311, + "step": 8249, + "time_per_iteration": 2.548520088195801 + }, + { + "auxiliary_loss_clip": 0.06441288, + "auxiliary_loss_mlp": 0.01269234, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256365, + "epoch": 0.496016834510747, + "flos": 16441524299520.0, + "grad_norm": 2.7229998624345115, + "language_loss": 0.76506901, + "learning_rate": 2.122797874814289e-06, + "loss": 0.84217423, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12860107, + "step": 8250, + "time_per_iteration": 2.524714231491089 + }, + { + "auxiliary_loss_clip": 0.06438759, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06282068, + "balance_loss_mlp": 0.01256551, + "epoch": 0.496076957763415, + "flos": 23444197246080.0, + "grad_norm": 1.6959600873244032, + "language_loss": 0.7021333, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.77921373, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12738037, + "step": 8251, + "time_per_iteration": 2.531841516494751 + }, + { + "auxiliary_loss_clip": 0.06437311, + "auxiliary_loss_mlp": 0.01271839, + "balance_loss_clip": 0.06285296, + "balance_loss_mlp": 0.01259871, + "epoch": 0.49613708101608295, + "flos": 16915113976320.0, + "grad_norm": 1.8201441219473296, + "language_loss": 0.7993809, + "learning_rate": 2.122020411748461e-06, + "loss": 0.87647241, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11975098, + "step": 8252, + "time_per_iteration": 2.5806944370269775 + }, + { + "auxiliary_loss_clip": 0.06434863, + "auxiliary_loss_mlp": 0.01270348, + "balance_loss_clip": 0.06282027, + "balance_loss_mlp": 0.01255905, + "epoch": 0.4961972042687509, + "flos": 16623729002880.0, + "grad_norm": 1.8109031344325417, + "language_loss": 0.81898755, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.89603961, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.14447021, + "step": 8253, + "time_per_iteration": 2.4936153888702393 + }, + { + "auxiliary_loss_clip": 0.0643016, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06279143, + "balance_loss_mlp": 0.01253139, + "epoch": 0.49625732752141893, + "flos": 28965529059840.0, + "grad_norm": 1.4049535238306547, + "language_loss": 0.67659622, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.7535435, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11437988, + "step": 8254, + "time_per_iteration": 2.681328058242798 + }, + { + "auxiliary_loss_clip": 0.06436362, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.01254729, + "epoch": 0.4963174507740869, + "flos": 23119046277120.0, + "grad_norm": 6.04751780380752, + "language_loss": 0.74611968, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.82315457, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12384033, + "step": 8255, + "time_per_iteration": 2.5586442947387695 + }, + { + "auxiliary_loss_clip": 0.06430424, + "auxiliary_loss_mlp": 0.01268774, + "balance_loss_clip": 0.06278734, + "balance_loss_mlp": 0.01256972, + "epoch": 0.49637757402675486, + "flos": 13922998928640.0, + "grad_norm": 1.9051204382469373, + "language_loss": 0.81712639, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.89411843, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11798096, + "step": 8256, + "time_per_iteration": 2.525191307067871 + }, + { + "auxiliary_loss_clip": 0.06430264, + "auxiliary_loss_mlp": 0.01267515, + "balance_loss_clip": 0.06279526, + "balance_loss_mlp": 0.01256035, + "epoch": 0.49643769727942283, + "flos": 22315442094720.0, + "grad_norm": 1.4246388626256767, + "language_loss": 0.81285727, + "learning_rate": 2.120076673368901e-06, + "loss": 0.889835, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8257, + "time_per_iteration": 2.5366289615631104 + }, + { + "auxiliary_loss_clip": 0.06441522, + "auxiliary_loss_mlp": 0.01265551, + "balance_loss_clip": 0.06281207, + "balance_loss_mlp": 0.01253153, + "epoch": 0.4964978205320908, + "flos": 19506328364160.0, + "grad_norm": 1.7556989119603337, + "language_loss": 0.66651785, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.74358857, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1239624, + "step": 8258, + "time_per_iteration": 2.567802667617798 + }, + { + "auxiliary_loss_clip": 0.06427691, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06277505, + "balance_loss_mlp": 0.0125607, + "epoch": 0.49655794378475876, + "flos": 23442562091520.0, + "grad_norm": 1.5238866764667018, + "language_loss": 0.7778039, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.85474873, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.10723877, + "step": 8259, + "time_per_iteration": 2.5521552562713623 + }, + { + "auxiliary_loss_clip": 0.06430545, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06279439, + "balance_loss_mlp": 0.01254954, + "epoch": 0.4966180670374267, + "flos": 26837967640320.0, + "grad_norm": 1.4589343239403403, + "language_loss": 0.78972054, + "learning_rate": 2.1189103755834e-06, + "loss": 0.86669362, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11816406, + "step": 8260, + "time_per_iteration": 2.6012649536132812 + }, + { + "auxiliary_loss_clip": 0.06434717, + "auxiliary_loss_mlp": 0.01267655, + "balance_loss_clip": 0.06279895, + "balance_loss_mlp": 0.01255055, + "epoch": 0.4966781902900947, + "flos": 22014413902080.0, + "grad_norm": 2.8586716221878206, + "language_loss": 0.76515198, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12591553, + "step": 8261, + "time_per_iteration": 2.4737415313720703 + }, + { + "auxiliary_loss_clip": 0.06427643, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01255772, + "epoch": 0.49673831354276266, + "flos": 26220509303040.0, + "grad_norm": 1.7291004140234418, + "language_loss": 0.89456958, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.97151601, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11230469, + "step": 8262, + "time_per_iteration": 2.613236665725708 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01268648, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4967984367954306, + "flos": 23188464984960.0, + "grad_norm": 1.4347791599980126, + "language_loss": 0.73918176, + "learning_rate": 2.11774403721606e-06, + "loss": 0.81618452, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11834717, + "step": 8263, + "time_per_iteration": 2.595635414123535 + }, + { + "auxiliary_loss_clip": 0.06439725, + "auxiliary_loss_mlp": 0.01274389, + "balance_loss_clip": 0.06283052, + "balance_loss_mlp": 0.01260239, + "epoch": 0.4968585600480986, + "flos": 19287506626560.0, + "grad_norm": 2.258936930728745, + "language_loss": 0.69678748, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.77392858, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.14147949, + "step": 8264, + "time_per_iteration": 2.5913755893707275 + }, + { + "auxiliary_loss_clip": 0.06438377, + "auxiliary_loss_mlp": 0.01267325, + "balance_loss_clip": 0.06281792, + "balance_loss_mlp": 0.01255136, + "epoch": 0.49691868330076655, + "flos": 22535312008320.0, + "grad_norm": 1.388736059607974, + "language_loss": 0.65131235, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.72836947, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12188721, + "step": 8265, + "time_per_iteration": 2.528193473815918 + }, + { + "auxiliary_loss_clip": 0.06333993, + "auxiliary_loss_mlp": 0.01255399, + "balance_loss_clip": 0.06266748, + "balance_loss_mlp": 0.01253268, + "epoch": 0.4969788065534345, + "flos": 66598897328640.0, + "grad_norm": 0.8036364801041208, + "language_loss": 0.53402334, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.60991728, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02133179, + "step": 8266, + "time_per_iteration": 3.1838197708129883 + }, + { + "auxiliary_loss_clip": 0.06428756, + "auxiliary_loss_mlp": 0.01272627, + "balance_loss_clip": 0.06282037, + "balance_loss_mlp": 0.01260592, + "epoch": 0.49703892980610254, + "flos": 24066099849600.0, + "grad_norm": 1.4975664699088878, + "language_loss": 0.79899192, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.87600571, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12036133, + "step": 8267, + "time_per_iteration": 2.556995391845703 + }, + { + "auxiliary_loss_clip": 0.06434017, + "auxiliary_loss_mlp": 0.01269443, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.01256295, + "epoch": 0.4970990530587705, + "flos": 29132807736960.0, + "grad_norm": 3.0454644456900155, + "language_loss": 0.75843596, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.83547056, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.13146973, + "step": 8268, + "time_per_iteration": 2.6049721240997314 + }, + { + "auxiliary_loss_clip": 0.06435575, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01257185, + "epoch": 0.49715917631143847, + "flos": 46036811047680.0, + "grad_norm": 1.4862794016102487, + "language_loss": 0.68007714, + "learning_rate": 2.115411240328073e-06, + "loss": 0.75713372, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12902832, + "step": 8269, + "time_per_iteration": 4.128691911697388 + }, + { + "auxiliary_loss_clip": 0.06433591, + "auxiliary_loss_mlp": 0.01270109, + "balance_loss_clip": 0.06283623, + "balance_loss_mlp": 0.01258444, + "epoch": 0.49721929956410643, + "flos": 20197104624000.0, + "grad_norm": 1.5327488108804688, + "language_loss": 0.85668087, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.93371785, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11669922, + "step": 8270, + "time_per_iteration": 2.518367290496826 + }, + { + "auxiliary_loss_clip": 0.06438391, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.01258443, + "epoch": 0.4972794228167744, + "flos": 21660108912000.0, + "grad_norm": 1.8194061326909323, + "language_loss": 0.71364737, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7907263, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1105957, + "step": 8271, + "time_per_iteration": 2.5573620796203613 + }, + { + "auxiliary_loss_clip": 0.06437098, + "auxiliary_loss_mlp": 0.01269156, + "balance_loss_clip": 0.06284092, + "balance_loss_mlp": 0.0125646, + "epoch": 0.49733954606944236, + "flos": 24286598668800.0, + "grad_norm": 1.3024187792808712, + "language_loss": 0.78511107, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.86217368, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12677002, + "step": 8272, + "time_per_iteration": 4.061326742172241 + }, + { + "auxiliary_loss_clip": 0.06438889, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06285517, + "balance_loss_mlp": 0.01257548, + "epoch": 0.4973996693221103, + "flos": 37861722172800.0, + "grad_norm": 2.25975995369767, + "language_loss": 0.66725254, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.7443465, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12957764, + "step": 8273, + "time_per_iteration": 2.645908832550049 + }, + { + "auxiliary_loss_clip": 0.06436634, + "auxiliary_loss_mlp": 0.01276274, + "balance_loss_clip": 0.06285357, + "balance_loss_mlp": 0.01264109, + "epoch": 0.4974597925747783, + "flos": 21367885397760.0, + "grad_norm": 1.5281958400790516, + "language_loss": 0.78156513, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.8586942, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12182617, + "step": 8274, + "time_per_iteration": 2.535804271697998 + }, + { + "auxiliary_loss_clip": 0.06437881, + "auxiliary_loss_mlp": 0.0127292, + "balance_loss_clip": 0.06281041, + "balance_loss_mlp": 0.01259992, + "epoch": 0.49751991582744626, + "flos": 30746137449600.0, + "grad_norm": 1.6098675264323796, + "language_loss": 0.76012516, + "learning_rate": 2.113078285889493e-06, + "loss": 0.83723313, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12939453, + "step": 8275, + "time_per_iteration": 2.5787549018859863 + }, + { + "auxiliary_loss_clip": 0.06438003, + "auxiliary_loss_mlp": 0.01271635, + "balance_loss_clip": 0.06282246, + "balance_loss_mlp": 0.01257789, + "epoch": 0.4975800390801142, + "flos": 14105748683520.0, + "grad_norm": 1.8196816586022186, + "language_loss": 0.84079218, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.91788852, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1385498, + "step": 8276, + "time_per_iteration": 2.5156893730163574 + }, + { + "auxiliary_loss_clip": 0.06426419, + "auxiliary_loss_mlp": 0.01277009, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01265398, + "epoch": 0.4976401623327822, + "flos": 24214203141120.0, + "grad_norm": 1.3141436658277077, + "language_loss": 0.70087981, + "learning_rate": 2.112300599949172e-06, + "loss": 0.77791417, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.1161499, + "step": 8277, + "time_per_iteration": 3.9860711097717285 + }, + { + "auxiliary_loss_clip": 0.06429198, + "auxiliary_loss_mlp": 0.01270973, + "balance_loss_clip": 0.06280812, + "balance_loss_mlp": 0.01258754, + "epoch": 0.49770028558545015, + "flos": 21142229552640.0, + "grad_norm": 1.8219149953370526, + "language_loss": 0.82141137, + "learning_rate": 2.111911750583964e-06, + "loss": 0.89841306, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12231445, + "step": 8278, + "time_per_iteration": 2.5353100299835205 + }, + { + "auxiliary_loss_clip": 0.06435424, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06279474, + "balance_loss_mlp": 0.01256246, + "epoch": 0.4977604088381181, + "flos": 16769568234240.0, + "grad_norm": 1.8298360040603827, + "language_loss": 0.68205428, + "learning_rate": 2.111522896975052e-06, + "loss": 0.75909793, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12695312, + "step": 8279, + "time_per_iteration": 2.538273334503174 + }, + { + "auxiliary_loss_clip": 0.06430422, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01258129, + "epoch": 0.49782053209078614, + "flos": 15708596636160.0, + "grad_norm": 1.929140490148881, + "language_loss": 0.70948005, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.78650236, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13665771, + "step": 8280, + "time_per_iteration": 2.5344486236572266 + }, + { + "auxiliary_loss_clip": 0.06432884, + "auxiliary_loss_mlp": 0.01270682, + "balance_loss_clip": 0.06279922, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4978806553434541, + "flos": 24760565688960.0, + "grad_norm": 1.4498126802552027, + "language_loss": 0.6468308, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.72386646, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13366699, + "step": 8281, + "time_per_iteration": 2.5905003547668457 + }, + { + "auxiliary_loss_clip": 0.06432123, + "auxiliary_loss_mlp": 0.01269379, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01256141, + "epoch": 0.49794077859612207, + "flos": 13120820265600.0, + "grad_norm": 2.543831826961268, + "language_loss": 0.73404002, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.81105494, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13226318, + "step": 8282, + "time_per_iteration": 2.481513023376465 + }, + { + "auxiliary_loss_clip": 0.06433594, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.062822, + "balance_loss_mlp": 0.01260748, + "epoch": 0.49800090184879003, + "flos": 27532223844480.0, + "grad_norm": 1.4555237952962066, + "language_loss": 0.7312296, + "learning_rate": 2.109967440397263e-06, + "loss": 0.80828691, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.1138916, + "step": 8283, + "time_per_iteration": 4.015530824661255 + }, + { + "auxiliary_loss_clip": 0.06430134, + "auxiliary_loss_mlp": 0.01267653, + "balance_loss_clip": 0.06279625, + "balance_loss_mlp": 0.01254791, + "epoch": 0.498061025101458, + "flos": 19798677659520.0, + "grad_norm": 1.429490370630744, + "language_loss": 0.78535879, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8623367, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12860107, + "step": 8284, + "time_per_iteration": 2.4994332790374756 + }, + { + "auxiliary_loss_clip": 0.06437389, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06278685, + "balance_loss_mlp": 0.01255864, + "epoch": 0.49812114835412596, + "flos": 29900926915200.0, + "grad_norm": 1.711585124439885, + "language_loss": 0.7343573, + "learning_rate": 2.109189687029526e-06, + "loss": 0.81143022, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.14044189, + "step": 8285, + "time_per_iteration": 2.566572904586792 + }, + { + "auxiliary_loss_clip": 0.06430154, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.0627718, + "balance_loss_mlp": 0.01258404, + "epoch": 0.49818127160679393, + "flos": 23153441178240.0, + "grad_norm": 1.4871294259616603, + "language_loss": 0.74281567, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.81982332, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 8286, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.06434155, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06279751, + "balance_loss_mlp": 0.0125358, + "epoch": 0.4982413948594619, + "flos": 21659228444160.0, + "grad_norm": 1.6982664351725185, + "language_loss": 0.85701174, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.93401492, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12579346, + "step": 8287, + "time_per_iteration": 2.518136501312256 + }, + { + "auxiliary_loss_clip": 0.06432185, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06276216, + "balance_loss_mlp": 0.01256801, + "epoch": 0.49830151811212986, + "flos": 32494866560640.0, + "grad_norm": 1.6945408763753198, + "language_loss": 0.72708082, + "learning_rate": 2.108023025961159e-06, + "loss": 0.80410802, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.13745117, + "step": 8288, + "time_per_iteration": 2.590862512588501 + }, + { + "auxiliary_loss_clip": 0.06436619, + "auxiliary_loss_mlp": 0.01272174, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01258972, + "epoch": 0.4983616413647978, + "flos": 18146886122880.0, + "grad_norm": 4.0455531591406855, + "language_loss": 0.81054366, + "learning_rate": 2.10763413072622e-06, + "loss": 0.8876316, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.13201904, + "step": 8289, + "time_per_iteration": 2.504817008972168 + }, + { + "auxiliary_loss_clip": 0.06432903, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06279443, + "balance_loss_mlp": 0.01257074, + "epoch": 0.4984217646174658, + "flos": 19724898539520.0, + "grad_norm": 2.471620750065275, + "language_loss": 0.73847377, + "learning_rate": 2.107245231409784e-06, + "loss": 0.81550646, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13305664, + "step": 8290, + "time_per_iteration": 2.492176055908203 + }, + { + "auxiliary_loss_clip": 0.0643364, + "auxiliary_loss_mlp": 0.01275224, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01261157, + "epoch": 0.49848188787013376, + "flos": 24943525079040.0, + "grad_norm": 1.4456375643187662, + "language_loss": 0.84330356, + "learning_rate": 2.106856328026598e-06, + "loss": 0.92039216, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.140625, + "step": 8291, + "time_per_iteration": 2.5577101707458496 + }, + { + "auxiliary_loss_clip": 0.06438746, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06277075, + "balance_loss_mlp": 0.01257379, + "epoch": 0.4985420111228017, + "flos": 22388969652480.0, + "grad_norm": 1.8626179833436056, + "language_loss": 0.67868197, + "learning_rate": 2.106467420591409e-06, + "loss": 0.75577605, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13275146, + "step": 8292, + "time_per_iteration": 2.5227880477905273 + }, + { + "auxiliary_loss_clip": 0.06428275, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06275518, + "balance_loss_mlp": 0.01256977, + "epoch": 0.4986021343754697, + "flos": 16221989802240.0, + "grad_norm": 1.635019918785358, + "language_loss": 0.67247725, + "learning_rate": 2.106078509118965e-06, + "loss": 0.749448, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11798096, + "step": 8293, + "time_per_iteration": 2.5051913261413574 + }, + { + "auxiliary_loss_clip": 0.0643108, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275735, + "balance_loss_mlp": 0.01258891, + "epoch": 0.4986622576281377, + "flos": 23410221615360.0, + "grad_norm": 1.789605024821123, + "language_loss": 0.82488304, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.90189755, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.11480713, + "step": 8294, + "time_per_iteration": 2.5429139137268066 + }, + { + "auxiliary_loss_clip": 0.06432615, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277893, + "balance_loss_mlp": 0.01260315, + "epoch": 0.49872238088080567, + "flos": 19980714654720.0, + "grad_norm": 2.5766475970916285, + "language_loss": 0.73639232, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.81344408, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12249756, + "step": 8295, + "time_per_iteration": 2.535090923309326 + }, + { + "auxiliary_loss_clip": 0.06427556, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06276329, + "balance_loss_mlp": 0.01259911, + "epoch": 0.49878250413347364, + "flos": 22899595633920.0, + "grad_norm": 1.8257233918976585, + "language_loss": 0.68199098, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.75899148, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12591553, + "step": 8296, + "time_per_iteration": 2.5079848766326904 + }, + { + "auxiliary_loss_clip": 0.06433527, + "auxiliary_loss_mlp": 0.01272036, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.0125878, + "epoch": 0.4988426273861416, + "flos": 32606688234240.0, + "grad_norm": 1.801119189108274, + "language_loss": 0.64925557, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.72631121, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13256836, + "step": 8297, + "time_per_iteration": 2.6275887489318848 + }, + { + "auxiliary_loss_clip": 0.06427586, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06278079, + "balance_loss_mlp": 0.01258845, + "epoch": 0.49890275063880957, + "flos": 20929990360320.0, + "grad_norm": 1.5890674789628483, + "language_loss": 0.69987392, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.77685434, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11621094, + "step": 8298, + "time_per_iteration": 2.527082681655884 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01253668, + "epoch": 0.49896287389147753, + "flos": 18630370581120.0, + "grad_norm": 3.032196085375079, + "language_loss": 0.85047698, + "learning_rate": 2.103744956327814e-06, + "loss": 0.92741591, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11517334, + "step": 8299, + "time_per_iteration": 2.531541585922241 + }, + { + "auxiliary_loss_clip": 0.06429411, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06274673, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4990229971441455, + "flos": 24833422414080.0, + "grad_norm": 2.041795476236588, + "language_loss": 0.69284618, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.76981199, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13061523, + "step": 8300, + "time_per_iteration": 2.562002658843994 + }, + { + "auxiliary_loss_clip": 0.0633271, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06265618, + "balance_loss_mlp": 0.01267531, + "epoch": 0.49908312039681346, + "flos": 71405638323840.0, + "grad_norm": 0.7392878070409407, + "language_loss": 0.51101816, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.58704311, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02253723, + "step": 8301, + "time_per_iteration": 3.3210127353668213 + }, + { + "auxiliary_loss_clip": 0.06423864, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06275457, + "balance_loss_mlp": 0.01258173, + "epoch": 0.4991432436494814, + "flos": 19834791569280.0, + "grad_norm": 2.2486532521822302, + "language_loss": 0.84452468, + "learning_rate": 2.102578126623879e-06, + "loss": 0.921471, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12591553, + "step": 8302, + "time_per_iteration": 2.547562837600708 + }, + { + "auxiliary_loss_clip": 0.06428537, + "auxiliary_loss_mlp": 0.01271397, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.01259607, + "epoch": 0.4992033669021494, + "flos": 15127252208640.0, + "grad_norm": 1.6659174741740037, + "language_loss": 0.69610626, + "learning_rate": 2.102189175590024e-06, + "loss": 0.77310562, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11785889, + "step": 8303, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06429437, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.0627458, + "balance_loss_mlp": 0.01253851, + "epoch": 0.49926349015481736, + "flos": 31215282860160.0, + "grad_norm": 1.7036998151712766, + "language_loss": 0.72999942, + "learning_rate": 2.101800220681144e-06, + "loss": 0.80695617, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.1239624, + "step": 8304, + "time_per_iteration": 2.611502170562744 + }, + { + "auxiliary_loss_clip": 0.0642409, + "auxiliary_loss_mlp": 0.0126995, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4993236134074853, + "flos": 24907201534080.0, + "grad_norm": 2.0593873642803486, + "language_loss": 0.81677687, + "learning_rate": 2.10141126191199e-06, + "loss": 0.89371729, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1227417, + "step": 8305, + "time_per_iteration": 2.57425594329834 + }, + { + "auxiliary_loss_clip": 0.0632831, + "auxiliary_loss_mlp": 0.01255041, + "balance_loss_clip": 0.06261367, + "balance_loss_mlp": 0.01252826, + "epoch": 0.4993837366601533, + "flos": 70438962896640.0, + "grad_norm": 0.7837813432026206, + "language_loss": 0.56909657, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.64493006, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02220154, + "step": 8306, + "time_per_iteration": 3.2806143760681152 + }, + { + "auxiliary_loss_clip": 0.06430675, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06278585, + "balance_loss_mlp": 0.01255422, + "epoch": 0.4994438599128213, + "flos": 15966718738560.0, + "grad_norm": 1.7475082532303507, + "language_loss": 0.83157074, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.90857446, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.1427002, + "step": 8307, + "time_per_iteration": 2.4851419925689697 + }, + { + "auxiliary_loss_clip": 0.06426803, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 0.06277731, + "balance_loss_mlp": 0.01258458, + "epoch": 0.4995039831654893, + "flos": 27935765907840.0, + "grad_norm": 1.9977557260500436, + "language_loss": 0.61003512, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.68701947, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.13189697, + "step": 8308, + "time_per_iteration": 2.5943245887756348 + }, + { + "auxiliary_loss_clip": 0.06426641, + "auxiliary_loss_mlp": 0.01271422, + "balance_loss_clip": 0.06278297, + "balance_loss_mlp": 0.01259948, + "epoch": 0.49956410641815724, + "flos": 24211310175360.0, + "grad_norm": 1.573691211270805, + "language_loss": 0.74911636, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.82609695, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11468506, + "step": 8309, + "time_per_iteration": 3.9743635654449463 + }, + { + "auxiliary_loss_clip": 0.06430435, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.0125578, + "epoch": 0.4996242296708252, + "flos": 16185666257280.0, + "grad_norm": 2.033466484631739, + "language_loss": 0.80080384, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.87779051, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12438965, + "step": 8310, + "time_per_iteration": 2.475815534591675 + }, + { + "auxiliary_loss_clip": 0.06429116, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01254541, + "epoch": 0.49968435292349317, + "flos": 16879209701760.0, + "grad_norm": 1.5486293297173337, + "language_loss": 0.71370041, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.79066527, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12823486, + "step": 8311, + "time_per_iteration": 4.01245641708374 + }, + { + "auxiliary_loss_clip": 0.06428856, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06277557, + "balance_loss_mlp": 0.01254636, + "epoch": 0.49974447617616113, + "flos": 14944837870080.0, + "grad_norm": 1.8003339909908787, + "language_loss": 0.77129757, + "learning_rate": 2.098688443679187e-06, + "loss": 0.8482464, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11401367, + "step": 8312, + "time_per_iteration": 2.4761128425598145 + }, + { + "auxiliary_loss_clip": 0.0643132, + "auxiliary_loss_mlp": 0.01266437, + "balance_loss_clip": 0.06279029, + "balance_loss_mlp": 0.01254206, + "epoch": 0.4998045994288291, + "flos": 26658823610880.0, + "grad_norm": 1.6524127143489034, + "language_loss": 0.84981465, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.9267922, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12231445, + "step": 8313, + "time_per_iteration": 2.6057398319244385 + }, + { + "auxiliary_loss_clip": 0.06431891, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.06279939, + "balance_loss_mlp": 0.01256224, + "epoch": 0.49986472268149706, + "flos": 20959102454400.0, + "grad_norm": 1.6979548607445847, + "language_loss": 0.81193811, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8889358, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11657715, + "step": 8314, + "time_per_iteration": 2.5246880054473877 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01269627, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01256341, + "epoch": 0.49992484593416503, + "flos": 22790499217920.0, + "grad_norm": 1.7217224756504992, + "language_loss": 0.79857439, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.8755725, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.13305664, + "step": 8315, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.0125595, + "epoch": 0.499984969186833, + "flos": 46796838307200.0, + "grad_norm": 1.6656557215916168, + "language_loss": 0.74803257, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.82498288, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11749268, + "step": 8316, + "time_per_iteration": 4.178734540939331 + }, + { + "auxiliary_loss_clip": 0.06424455, + "auxiliary_loss_mlp": 0.01269425, + "balance_loss_clip": 0.0627817, + "balance_loss_mlp": 0.01258083, + "epoch": 0.500045092439501, + "flos": 25564086017280.0, + "grad_norm": 1.744541126829246, + "language_loss": 0.81478661, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.89172542, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11346436, + "step": 8317, + "time_per_iteration": 2.537320613861084 + }, + { + "auxiliary_loss_clip": 0.06427011, + "auxiliary_loss_mlp": 0.01270425, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5001052156921689, + "flos": 20711126695680.0, + "grad_norm": 1.5732702518161361, + "language_loss": 0.83390272, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.91087711, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12835693, + "step": 8318, + "time_per_iteration": 2.534135103225708 + }, + { + "auxiliary_loss_clip": 0.06428336, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06277522, + "balance_loss_mlp": 0.01257109, + "epoch": 0.500165338944837, + "flos": 21257405389440.0, + "grad_norm": 1.6807233025456896, + "language_loss": 0.82012349, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.89709824, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12030029, + "step": 8319, + "time_per_iteration": 2.515835762023926 + }, + { + "auxiliary_loss_clip": 0.06428086, + "auxiliary_loss_mlp": 0.01265652, + "balance_loss_clip": 0.0627624, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5002254621975049, + "flos": 27861693298560.0, + "grad_norm": 1.6360150103182107, + "language_loss": 0.72118968, + "learning_rate": 2.095576427171635e-06, + "loss": 0.79812706, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.10955811, + "step": 8320, + "time_per_iteration": 2.5796635150909424 + }, + { + "auxiliary_loss_clip": 0.06441814, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5002855854501729, + "flos": 15556049078400.0, + "grad_norm": 2.4313263695255696, + "language_loss": 0.76678413, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.84387517, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13439941, + "step": 8321, + "time_per_iteration": 2.4691002368927 + }, + { + "auxiliary_loss_clip": 0.06428922, + "auxiliary_loss_mlp": 0.01268744, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5003457087028408, + "flos": 16112977240320.0, + "grad_norm": 1.7492839336280708, + "language_loss": 0.82910907, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.90608579, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13183594, + "step": 8322, + "time_per_iteration": 2.515460252761841 + }, + { + "auxiliary_loss_clip": 0.06431515, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06279334, + "balance_loss_mlp": 0.01256973, + "epoch": 0.5004058319555088, + "flos": 22717055514240.0, + "grad_norm": 3.787468052495824, + "language_loss": 0.74021679, + "learning_rate": 2.094409360775228e-06, + "loss": 0.81722933, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12774658, + "step": 8323, + "time_per_iteration": 3.9577157497406006 + }, + { + "auxiliary_loss_clip": 0.06425107, + "auxiliary_loss_mlp": 0.01267421, + "balance_loss_clip": 0.06273489, + "balance_loss_mlp": 0.01254761, + "epoch": 0.5004659552081767, + "flos": 30125870000640.0, + "grad_norm": 1.569659839153646, + "language_loss": 0.69694078, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.77386606, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.12670898, + "step": 8324, + "time_per_iteration": 2.5927038192749023 + }, + { + "auxiliary_loss_clip": 0.06426285, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.012554, + "epoch": 0.5005260784608447, + "flos": 18630664070400.0, + "grad_norm": 1.9637621432589805, + "language_loss": 0.72455752, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.80149603, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12164307, + "step": 8325, + "time_per_iteration": 2.5748932361602783 + }, + { + "auxiliary_loss_clip": 0.06431422, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06278826, + "balance_loss_mlp": 0.01253069, + "epoch": 0.5005862017135126, + "flos": 24866349868800.0, + "grad_norm": 1.7160687334315328, + "language_loss": 0.73386943, + "learning_rate": 2.093242262158709e-06, + "loss": 0.8108452, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13085938, + "step": 8326, + "time_per_iteration": 2.5720608234405518 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01267135, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01255763, + "epoch": 0.5006463249661807, + "flos": 18740389392000.0, + "grad_norm": 1.5629486934520718, + "language_loss": 0.78059208, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.85753143, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11364746, + "step": 8327, + "time_per_iteration": 2.5033681392669678 + }, + { + "auxiliary_loss_clip": 0.06429915, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06277432, + "balance_loss_mlp": 0.01254533, + "epoch": 0.5007064482188487, + "flos": 13047124999680.0, + "grad_norm": 2.5584329331081253, + "language_loss": 0.88066995, + "learning_rate": 2.092464178710997e-06, + "loss": 0.95763773, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12329102, + "step": 8328, + "time_per_iteration": 2.469723701477051 + }, + { + "auxiliary_loss_clip": 0.06430298, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01254302, + "epoch": 0.5007665714715166, + "flos": 21295154453760.0, + "grad_norm": 2.120857663767784, + "language_loss": 0.74578768, + "learning_rate": 2.092075131720388e-06, + "loss": 0.82276416, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1305542, + "step": 8329, + "time_per_iteration": 2.527421236038208 + }, + { + "auxiliary_loss_clip": 0.06427623, + "auxiliary_loss_mlp": 0.01269321, + "balance_loss_clip": 0.06278372, + "balance_loss_mlp": 0.01257626, + "epoch": 0.5008266947241846, + "flos": 29762676478080.0, + "grad_norm": 1.5806360237517383, + "language_loss": 0.80007339, + "learning_rate": 2.091686081238281e-06, + "loss": 0.87704277, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11688232, + "step": 8330, + "time_per_iteration": 2.589132785797119 + }, + { + "auxiliary_loss_clip": 0.063256, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06259131, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5008868179768525, + "flos": 63574498460160.0, + "grad_norm": 0.7051231310601146, + "language_loss": 0.56005836, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.63587606, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01960754, + "step": 8331, + "time_per_iteration": 2.9798707962036133 + }, + { + "auxiliary_loss_clip": 0.06425481, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256125, + "epoch": 0.5009469412295205, + "flos": 27382108055040.0, + "grad_norm": 1.8793466545943338, + "language_loss": 0.65444684, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.73137867, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11584473, + "step": 8332, + "time_per_iteration": 2.548846483230591 + }, + { + "auxiliary_loss_clip": 0.06424412, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01253578, + "epoch": 0.5010070644821885, + "flos": 27385839561600.0, + "grad_norm": 1.4154143625456153, + "language_loss": 0.75122535, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.82812029, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1149292, + "step": 8333, + "time_per_iteration": 2.600377082824707 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01268641, + "balance_loss_clip": 0.06276083, + "balance_loss_mlp": 0.01256481, + "epoch": 0.5010671877348565, + "flos": 20668178678400.0, + "grad_norm": 1.9411742898612023, + "language_loss": 0.80806357, + "learning_rate": 2.090129844689929e-06, + "loss": 0.88504034, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12158203, + "step": 8334, + "time_per_iteration": 2.490330457687378 + }, + { + "auxiliary_loss_clip": 0.0633373, + "auxiliary_loss_mlp": 0.01254486, + "balance_loss_clip": 0.06267349, + "balance_loss_mlp": 0.01252466, + "epoch": 0.5011273109875244, + "flos": 59148266855040.0, + "grad_norm": 0.880609822046852, + "language_loss": 0.62818438, + "learning_rate": 2.089740776971626e-06, + "loss": 0.70406651, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02020264, + "step": 8335, + "time_per_iteration": 3.1081318855285645 + }, + { + "auxiliary_loss_clip": 0.06426011, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06278515, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5011874342401924, + "flos": 25343126000640.0, + "grad_norm": 1.3778270209342711, + "language_loss": 0.80092967, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.8778491, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.105896, + "step": 8336, + "time_per_iteration": 2.5390379428863525 + }, + { + "auxiliary_loss_clip": 0.06428748, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06278357, + "balance_loss_mlp": 0.01254923, + "epoch": 0.5012475574928603, + "flos": 20236153426560.0, + "grad_norm": 1.7537768303990948, + "language_loss": 0.81054461, + "learning_rate": 2.088962631340836e-06, + "loss": 0.88749969, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11834717, + "step": 8337, + "time_per_iteration": 2.5480427742004395 + }, + { + "auxiliary_loss_clip": 0.06436703, + "auxiliary_loss_mlp": 0.01267216, + "balance_loss_clip": 0.06279006, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5013076807455283, + "flos": 22716594316800.0, + "grad_norm": 1.7916878418610642, + "language_loss": 0.79506505, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.87210429, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12255859, + "step": 8338, + "time_per_iteration": 2.5164718627929688 + }, + { + "auxiliary_loss_clip": 0.0643065, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01253176, + "epoch": 0.5013678039981962, + "flos": 24252329548800.0, + "grad_norm": 1.5889596080337545, + "language_loss": 0.85034919, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.9273085, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12127686, + "step": 8339, + "time_per_iteration": 2.5785508155822754 + }, + { + "auxiliary_loss_clip": 0.06426719, + "auxiliary_loss_mlp": 0.01269107, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.0125814, + "epoch": 0.5014279272508643, + "flos": 26183808414720.0, + "grad_norm": 1.5165096284579775, + "language_loss": 0.71162677, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.78858501, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10968018, + "step": 8340, + "time_per_iteration": 2.5929582118988037 + }, + { + "auxiliary_loss_clip": 0.06433477, + "auxiliary_loss_mlp": 0.01270076, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.01256867, + "epoch": 0.5014880505035323, + "flos": 21436255929600.0, + "grad_norm": 2.442832877053188, + "language_loss": 0.7829324, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.85996789, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.13208008, + "step": 8341, + "time_per_iteration": 2.5200908184051514 + }, + { + "auxiliary_loss_clip": 0.06435034, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.062792, + "balance_loss_mlp": 0.01255407, + "epoch": 0.5015481737562002, + "flos": 15774870816000.0, + "grad_norm": 2.1824930872588917, + "language_loss": 0.89806843, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.97509372, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12091064, + "step": 8342, + "time_per_iteration": 2.502265691757202 + }, + { + "auxiliary_loss_clip": 0.06427857, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06276843, + "balance_loss_mlp": 0.0125275, + "epoch": 0.5016082970088682, + "flos": 26837590296960.0, + "grad_norm": 1.7003073455140034, + "language_loss": 0.76872855, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.84565264, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11804199, + "step": 8343, + "time_per_iteration": 2.5502099990844727 + }, + { + "auxiliary_loss_clip": 0.06426306, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0627844, + "balance_loss_mlp": 0.01256724, + "epoch": 0.5016684202615361, + "flos": 21477023740800.0, + "grad_norm": 3.7325470711422466, + "language_loss": 0.67772466, + "learning_rate": 2.086239016143293e-06, + "loss": 0.75466394, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10900879, + "step": 8344, + "time_per_iteration": 2.5443081855773926 + }, + { + "auxiliary_loss_clip": 0.06429319, + "auxiliary_loss_mlp": 0.01271563, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01259803, + "epoch": 0.5017285435142042, + "flos": 26253478684800.0, + "grad_norm": 2.15637603402593, + "language_loss": 0.75492197, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.83193076, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11767578, + "step": 8345, + "time_per_iteration": 2.5757455825805664 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01267207, + "balance_loss_clip": 0.06275543, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5017886667668721, + "flos": 20783899566720.0, + "grad_norm": 2.131359070350305, + "language_loss": 0.78573453, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.86266983, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12805176, + "step": 8346, + "time_per_iteration": 2.5463459491729736 + }, + { + "auxiliary_loss_clip": 0.06428749, + "auxiliary_loss_mlp": 0.012678, + "balance_loss_clip": 0.0627691, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5018487900195401, + "flos": 20162500087680.0, + "grad_norm": 1.4665059060371557, + "language_loss": 0.69395542, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.77092093, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11608887, + "step": 8347, + "time_per_iteration": 2.5277669429779053 + }, + { + "auxiliary_loss_clip": 0.06433204, + "auxiliary_loss_mlp": 0.0126827, + "balance_loss_clip": 0.06278361, + "balance_loss_mlp": 0.01256236, + "epoch": 0.501908913272208, + "flos": 18156613196160.0, + "grad_norm": 2.582566868470837, + "language_loss": 0.7215631, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.79857785, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12030029, + "step": 8348, + "time_per_iteration": 3.996784210205078 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06277803, + "balance_loss_mlp": 0.01258166, + "epoch": 0.501969036524876, + "flos": 23118962423040.0, + "grad_norm": 1.4308074213434065, + "language_loss": 0.74796462, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.82490146, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11346436, + "step": 8349, + "time_per_iteration": 2.5489115715026855 + }, + { + "auxiliary_loss_clip": 0.06429881, + "auxiliary_loss_mlp": 0.01269935, + "balance_loss_clip": 0.06276442, + "balance_loss_mlp": 0.01257442, + "epoch": 0.5020291597775439, + "flos": 11367814596480.0, + "grad_norm": 1.898459652208493, + "language_loss": 0.63674343, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.71374166, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12493896, + "step": 8350, + "time_per_iteration": 2.487217426300049 + }, + { + "auxiliary_loss_clip": 0.06323833, + "auxiliary_loss_mlp": 0.01259522, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01257642, + "epoch": 0.5020892830302119, + "flos": 64030422124800.0, + "grad_norm": 0.7586308907420236, + "language_loss": 0.59914774, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6749813, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01876831, + "step": 8351, + "time_per_iteration": 4.69463324546814 + }, + { + "auxiliary_loss_clip": 0.06434566, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06280354, + "balance_loss_mlp": 0.01258029, + "epoch": 0.5021494062828799, + "flos": 23739691069440.0, + "grad_norm": 1.6219034526425078, + "language_loss": 0.75496215, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.83200288, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11474609, + "step": 8352, + "time_per_iteration": 2.5164549350738525 + }, + { + "auxiliary_loss_clip": 0.06428628, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06277371, + "balance_loss_mlp": 0.01254845, + "epoch": 0.5022095295355479, + "flos": 21582640212480.0, + "grad_norm": 1.8174761726271038, + "language_loss": 0.71818656, + "learning_rate": 2.082736990429464e-06, + "loss": 0.795147, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12573242, + "step": 8353, + "time_per_iteration": 2.51479172706604 + }, + { + "auxiliary_loss_clip": 0.06434356, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06281401, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5022696527882159, + "flos": 21403580037120.0, + "grad_norm": 2.9144841273148154, + "language_loss": 0.74235505, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.81938022, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12060547, + "step": 8354, + "time_per_iteration": 2.5085036754608154 + }, + { + "auxiliary_loss_clip": 0.06431521, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06281638, + "balance_loss_mlp": 0.01256216, + "epoch": 0.5023297760408838, + "flos": 27167814437760.0, + "grad_norm": 1.5801517406711547, + "language_loss": 0.7257005, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.80269539, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11755371, + "step": 8355, + "time_per_iteration": 2.559136152267456 + }, + { + "auxiliary_loss_clip": 0.06435544, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06278937, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5023898992935518, + "flos": 26221054354560.0, + "grad_norm": 1.801551244152151, + "language_loss": 0.8142066, + "learning_rate": 2.081569591520548e-06, + "loss": 0.89124179, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1282959, + "step": 8356, + "time_per_iteration": 3.978407144546509 + }, + { + "auxiliary_loss_clip": 0.06435513, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06275411, + "balance_loss_mlp": 0.01255272, + "epoch": 0.5024500225462197, + "flos": 13444839204480.0, + "grad_norm": 2.072167033386685, + "language_loss": 0.7662456, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.84328556, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 8357, + "time_per_iteration": 2.488581657409668 + }, + { + "auxiliary_loss_clip": 0.06431419, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01258758, + "epoch": 0.5025101457988878, + "flos": 21585952448640.0, + "grad_norm": 1.5828459742560037, + "language_loss": 0.76457655, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.84161162, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13342285, + "step": 8358, + "time_per_iteration": 2.62697434425354 + }, + { + "auxiliary_loss_clip": 0.06429468, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276305, + "balance_loss_mlp": 0.01253877, + "epoch": 0.5025702690515557, + "flos": 24652140105600.0, + "grad_norm": 2.247340947262335, + "language_loss": 0.72276986, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.79972816, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12481689, + "step": 8359, + "time_per_iteration": 2.577232599258423 + }, + { + "auxiliary_loss_clip": 0.0642844, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06277584, + "balance_loss_mlp": 0.01255263, + "epoch": 0.5026303923042237, + "flos": 22096578430080.0, + "grad_norm": 1.7221298639434877, + "language_loss": 0.77017021, + "learning_rate": 2.080013016407077e-06, + "loss": 0.84713173, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12451172, + "step": 8360, + "time_per_iteration": 2.5449211597442627 + }, + { + "auxiliary_loss_clip": 0.0642498, + "auxiliary_loss_mlp": 0.01267029, + "balance_loss_clip": 0.06274442, + "balance_loss_mlp": 0.0125571, + "epoch": 0.5026905155568916, + "flos": 23704164138240.0, + "grad_norm": 3.319216273479951, + "language_loss": 0.76811969, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.84503973, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11322021, + "step": 8361, + "time_per_iteration": 2.5360496044158936 + }, + { + "auxiliary_loss_clip": 0.06433755, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06276754, + "balance_loss_mlp": 0.01258641, + "epoch": 0.5027506388095596, + "flos": 25819566716160.0, + "grad_norm": 1.6478894806212292, + "language_loss": 0.85182559, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.92888033, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13067627, + "step": 8362, + "time_per_iteration": 4.023087739944458 + }, + { + "auxiliary_loss_clip": 0.06433062, + "auxiliary_loss_mlp": 0.01266272, + "balance_loss_clip": 0.06277543, + "balance_loss_mlp": 0.01253851, + "epoch": 0.5028107620622275, + "flos": 27533942853120.0, + "grad_norm": 1.6676304720736304, + "language_loss": 0.79210544, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.86909878, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12420654, + "step": 8363, + "time_per_iteration": 2.610635757446289 + }, + { + "auxiliary_loss_clip": 0.0642155, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01255903, + "epoch": 0.5028708853148955, + "flos": 24541031191680.0, + "grad_norm": 2.470464307064636, + "language_loss": 0.76251006, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12493896, + "step": 8364, + "time_per_iteration": 2.510077953338623 + }, + { + "auxiliary_loss_clip": 0.06429755, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627771, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5029310085675635, + "flos": 20819887695360.0, + "grad_norm": 1.5150578704653515, + "language_loss": 0.69785869, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.77482712, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11529541, + "step": 8365, + "time_per_iteration": 2.523810386657715 + }, + { + "auxiliary_loss_clip": 0.064358, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.01254365, + "epoch": 0.5029911318202315, + "flos": 22348411476480.0, + "grad_norm": 1.5746180090110224, + "language_loss": 0.73351806, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.81055391, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.13439941, + "step": 8366, + "time_per_iteration": 2.538522481918335 + }, + { + "auxiliary_loss_clip": 0.06433431, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06282506, + "balance_loss_mlp": 0.01254324, + "epoch": 0.5030512550728995, + "flos": 24359581175040.0, + "grad_norm": 1.43168858878555, + "language_loss": 0.78766662, + "learning_rate": 2.077288893713735e-06, + "loss": 0.86466694, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12268066, + "step": 8367, + "time_per_iteration": 2.58542799949646 + }, + { + "auxiliary_loss_clip": 0.064292, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01255835, + "epoch": 0.5031113783255674, + "flos": 18265835393280.0, + "grad_norm": 1.7642536194953051, + "language_loss": 0.70319581, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.78016406, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11804199, + "step": 8368, + "time_per_iteration": 2.4808216094970703 + }, + { + "auxiliary_loss_clip": 0.06318872, + "auxiliary_loss_mlp": 0.01256661, + "balance_loss_clip": 0.06252527, + "balance_loss_mlp": 0.01254704, + "epoch": 0.5031715015782354, + "flos": 57270022859520.0, + "grad_norm": 0.9058846668072361, + "language_loss": 0.63429594, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.7100513, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01954651, + "step": 8369, + "time_per_iteration": 3.0813984870910645 + }, + { + "auxiliary_loss_clip": 0.06425582, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06275157, + "balance_loss_mlp": 0.01256873, + "epoch": 0.5032316248309033, + "flos": 27534823320960.0, + "grad_norm": 1.9780482072247232, + "language_loss": 0.60450232, + "learning_rate": 2.076121368302263e-06, + "loss": 0.68144017, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11328125, + "step": 8370, + "time_per_iteration": 2.6361827850341797 + }, + { + "auxiliary_loss_clip": 0.06429368, + "auxiliary_loss_mlp": 0.01269199, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01255901, + "epoch": 0.5032917480835714, + "flos": 34504401104640.0, + "grad_norm": 1.6209694165930644, + "language_loss": 0.68475735, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.76174301, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13293457, + "step": 8371, + "time_per_iteration": 2.6757090091705322 + }, + { + "auxiliary_loss_clip": 0.06428707, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01256158, + "epoch": 0.5033518713362393, + "flos": 33665228064000.0, + "grad_norm": 1.992355635042309, + "language_loss": 0.67781597, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.75479841, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13397217, + "step": 8372, + "time_per_iteration": 2.625875234603882 + }, + { + "auxiliary_loss_clip": 0.06429783, + "auxiliary_loss_mlp": 0.0126941, + "balance_loss_clip": 0.06275001, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5034119945889073, + "flos": 28193301031680.0, + "grad_norm": 1.502668832263038, + "language_loss": 0.67200899, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.74900091, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13604736, + "step": 8373, + "time_per_iteration": 2.605649709701538 + }, + { + "auxiliary_loss_clip": 0.06426984, + "auxiliary_loss_mlp": 0.01270724, + "balance_loss_clip": 0.06274835, + "balance_loss_mlp": 0.01258362, + "epoch": 0.5034721178415752, + "flos": 21364698942720.0, + "grad_norm": 1.6635937081301206, + "language_loss": 0.75186062, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.82883763, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12365723, + "step": 8374, + "time_per_iteration": 2.503739595413208 + }, + { + "auxiliary_loss_clip": 0.06431206, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01255945, + "epoch": 0.5035322410942432, + "flos": 22681486656000.0, + "grad_norm": 1.5469346618590563, + "language_loss": 0.68547672, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.76247704, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12878418, + "step": 8375, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.06436669, + "auxiliary_loss_mlp": 0.0127122, + "balance_loss_clip": 0.06277038, + "balance_loss_mlp": 0.01257285, + "epoch": 0.5035923643469111, + "flos": 19834875423360.0, + "grad_norm": 1.6007016499880733, + "language_loss": 0.78976023, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.86683917, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1394043, + "step": 8376, + "time_per_iteration": 2.480931520462036 + }, + { + "auxiliary_loss_clip": 0.06429401, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06272124, + "balance_loss_mlp": 0.01254722, + "epoch": 0.5036524875995791, + "flos": 30521823269760.0, + "grad_norm": 2.1513689232389686, + "language_loss": 0.59716964, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6741339, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.12304688, + "step": 8377, + "time_per_iteration": 2.5793137550354004 + }, + { + "auxiliary_loss_clip": 0.06430321, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5037126108522471, + "flos": 14725848424320.0, + "grad_norm": 1.9178870854351904, + "language_loss": 0.76377517, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.84075749, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.13305664, + "step": 8378, + "time_per_iteration": 2.4622483253479004 + }, + { + "auxiliary_loss_clip": 0.06432158, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06278415, + "balance_loss_mlp": 0.01254815, + "epoch": 0.5037727341049151, + "flos": 25304119125120.0, + "grad_norm": 1.5376418940503571, + "language_loss": 0.746418, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.82341218, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12457275, + "step": 8379, + "time_per_iteration": 2.55764102935791 + }, + { + "auxiliary_loss_clip": 0.06427328, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01260138, + "epoch": 0.5038328573575831, + "flos": 28548193000320.0, + "grad_norm": 1.8355606211356674, + "language_loss": 0.66636741, + "learning_rate": 2.072229431544548e-06, + "loss": 0.74337339, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13146973, + "step": 8380, + "time_per_iteration": 2.566993474960327 + }, + { + "auxiliary_loss_clip": 0.06426656, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01254259, + "epoch": 0.503892980610251, + "flos": 31657957580160.0, + "grad_norm": 1.8901892775526132, + "language_loss": 0.63646573, + "learning_rate": 2.071840222561051e-06, + "loss": 0.71339715, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12213135, + "step": 8381, + "time_per_iteration": 2.5915544033050537 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01268764, + "balance_loss_clip": 0.06275158, + "balance_loss_mlp": 0.01257087, + "epoch": 0.503953103862919, + "flos": 27096718648320.0, + "grad_norm": 1.5372847630358786, + "language_loss": 0.67925096, + "learning_rate": 2.071451010853365e-06, + "loss": 0.756212, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11676025, + "step": 8382, + "time_per_iteration": 2.553654432296753 + }, + { + "auxiliary_loss_clip": 0.06443429, + "auxiliary_loss_mlp": 0.01271028, + "balance_loss_clip": 0.06281322, + "balance_loss_mlp": 0.0125745, + "epoch": 0.5040132271155869, + "flos": 15638423241600.0, + "grad_norm": 1.8104420976136362, + "language_loss": 0.62072217, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.69786668, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13598633, + "step": 8383, + "time_per_iteration": 2.525148630142212 + }, + { + "auxiliary_loss_clip": 0.06426074, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01255609, + "epoch": 0.504073350368255, + "flos": 13595290410240.0, + "grad_norm": 1.7264517386370961, + "language_loss": 0.6736567, + "learning_rate": 2.070672579324465e-06, + "loss": 0.75059223, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11853027, + "step": 8384, + "time_per_iteration": 2.4712305068969727 + }, + { + "auxiliary_loss_clip": 0.064311, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06277114, + "balance_loss_mlp": 0.01255059, + "epoch": 0.5041334736209229, + "flos": 29065611162240.0, + "grad_norm": 1.6378210813415193, + "language_loss": 0.71431983, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.79130751, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12609863, + "step": 8385, + "time_per_iteration": 2.573953151702881 + }, + { + "auxiliary_loss_clip": 0.06426452, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.0627909, + "balance_loss_mlp": 0.01252916, + "epoch": 0.5041935968735909, + "flos": 24615313436160.0, + "grad_norm": 1.6953325653845304, + "language_loss": 0.83098906, + "learning_rate": 2.069894137075919e-06, + "loss": 0.90790039, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 8386, + "time_per_iteration": 2.5524075031280518 + }, + { + "auxiliary_loss_clip": 0.06431791, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256146, + "epoch": 0.5042537201262588, + "flos": 26294204568960.0, + "grad_norm": 1.4563010196783333, + "language_loss": 0.669891, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.74689829, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12780762, + "step": 8387, + "time_per_iteration": 3.9810335636138916 + }, + { + "auxiliary_loss_clip": 0.064284, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254608, + "epoch": 0.5043138433789268, + "flos": 22023805559040.0, + "grad_norm": 3.745410743833339, + "language_loss": 0.80531698, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.882267, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11999512, + "step": 8388, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.06426677, + "auxiliary_loss_mlp": 0.01268377, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256075, + "epoch": 0.5043739666315947, + "flos": 28774645459200.0, + "grad_norm": 1.9801629056940246, + "language_loss": 0.70134413, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.77829468, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12298584, + "step": 8389, + "time_per_iteration": 2.5604100227355957 + }, + { + "auxiliary_loss_clip": 0.06432408, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01256328, + "epoch": 0.5044340898842627, + "flos": 27606548016000.0, + "grad_norm": 1.4709504779743863, + "language_loss": 0.69360697, + "learning_rate": 2.068337220892191e-06, + "loss": 0.77062166, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12750244, + "step": 8390, + "time_per_iteration": 4.074434041976929 + }, + { + "auxiliary_loss_clip": 0.06327184, + "auxiliary_loss_mlp": 0.01253766, + "balance_loss_clip": 0.06261003, + "balance_loss_mlp": 0.01251581, + "epoch": 0.5044942131369307, + "flos": 67474744058880.0, + "grad_norm": 0.7911094819234682, + "language_loss": 0.52874231, + "learning_rate": 2.067947985330974e-06, + "loss": 0.60455179, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.0218811, + "step": 8391, + "time_per_iteration": 2.939533233642578 + }, + { + "auxiliary_loss_clip": 0.06334387, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06267701, + "balance_loss_mlp": 0.01251732, + "epoch": 0.5045543363895987, + "flos": 58646460280320.0, + "grad_norm": 0.8187125498801333, + "language_loss": 0.60630977, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.68219203, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02114868, + "step": 8392, + "time_per_iteration": 2.9839742183685303 + }, + { + "auxiliary_loss_clip": 0.06425072, + "auxiliary_loss_mlp": 0.01265494, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01252631, + "epoch": 0.5046144596422667, + "flos": 22532880240000.0, + "grad_norm": 1.6790063296091327, + "language_loss": 0.85000169, + "learning_rate": 2.067169506493517e-06, + "loss": 0.9269073, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12866211, + "step": 8393, + "time_per_iteration": 2.5764622688293457 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01258869, + "epoch": 0.5046745828949346, + "flos": 27461673106560.0, + "grad_norm": 1.8013259480756436, + "language_loss": 0.5139519, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.590967, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11834717, + "step": 8394, + "time_per_iteration": 2.5577075481414795 + }, + { + "auxiliary_loss_clip": 0.06430504, + "auxiliary_loss_mlp": 0.012693, + "balance_loss_clip": 0.06275499, + "balance_loss_mlp": 0.01256664, + "epoch": 0.5047347061476026, + "flos": 17280236142720.0, + "grad_norm": 1.62433976950566, + "language_loss": 0.75468862, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.83168674, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12628174, + "step": 8395, + "time_per_iteration": 4.00100040435791 + }, + { + "auxiliary_loss_clip": 0.06430663, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06276973, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5047948294002705, + "flos": 16654308543360.0, + "grad_norm": 3.1739634410128446, + "language_loss": 0.68759549, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.76455134, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.1206665, + "step": 8396, + "time_per_iteration": 2.5608737468719482 + }, + { + "auxiliary_loss_clip": 0.0643612, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282924, + "balance_loss_mlp": 0.01253235, + "epoch": 0.5048549526529386, + "flos": 26872236760320.0, + "grad_norm": 1.7251064316936986, + "language_loss": 0.7921707, + "learning_rate": 2.065612518371792e-06, + "loss": 0.869187, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12268066, + "step": 8397, + "time_per_iteration": 2.5829713344573975 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01271123, + "balance_loss_clip": 0.06278492, + "balance_loss_mlp": 0.01258571, + "epoch": 0.5049150759056065, + "flos": 21840175336320.0, + "grad_norm": 1.4916236371554883, + "language_loss": 0.66563869, + "learning_rate": 2.065223265084376e-06, + "loss": 0.7426517, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12554932, + "step": 8398, + "time_per_iteration": 2.5790011882781982 + }, + { + "auxiliary_loss_clip": 0.06432331, + "auxiliary_loss_mlp": 0.01272223, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01259688, + "epoch": 0.5049751991582745, + "flos": 21691652774400.0, + "grad_norm": 1.5799272085735376, + "language_loss": 0.72252852, + "learning_rate": 2.064834009323688e-06, + "loss": 0.79957408, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12524414, + "step": 8399, + "time_per_iteration": 2.5528035163879395 + }, + { + "auxiliary_loss_clip": 0.06433836, + "auxiliary_loss_mlp": 0.01270059, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5050353224109424, + "flos": 21365495556480.0, + "grad_norm": 1.7587629772693838, + "language_loss": 0.81515628, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.89219522, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12792969, + "step": 8400, + "time_per_iteration": 2.550828456878662 + }, + { + "auxiliary_loss_clip": 0.06428652, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01254852, + "epoch": 0.5050954456636104, + "flos": 22826655054720.0, + "grad_norm": 2.5272013560823403, + "language_loss": 0.79016161, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.86711431, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11779785, + "step": 8401, + "time_per_iteration": 2.525132894515991 + }, + { + "auxiliary_loss_clip": 0.06433861, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01252778, + "epoch": 0.5051555689162783, + "flos": 30456513411840.0, + "grad_norm": 1.509144939938127, + "language_loss": 0.70489848, + "learning_rate": 2.063666227349593e-06, + "loss": 0.7818898, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.125, + "step": 8402, + "time_per_iteration": 4.0306360721588135 + }, + { + "auxiliary_loss_clip": 0.06429238, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06274545, + "balance_loss_mlp": 0.01254915, + "epoch": 0.5052156921689464, + "flos": 21294315912960.0, + "grad_norm": 1.5960111955062717, + "language_loss": 0.6935674, + "learning_rate": 2.063276961843422e-06, + "loss": 0.77053005, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12121582, + "step": 8403, + "time_per_iteration": 2.558231830596924 + }, + { + "auxiliary_loss_clip": 0.06433211, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01255799, + "epoch": 0.5052758154216143, + "flos": 25088106499200.0, + "grad_norm": 1.463323664554185, + "language_loss": 0.86018717, + "learning_rate": 2.062887693937781e-06, + "loss": 0.93719262, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11547852, + "step": 8404, + "time_per_iteration": 2.618649959564209 + }, + { + "auxiliary_loss_clip": 0.06428184, + "auxiliary_loss_mlp": 0.01270079, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01258092, + "epoch": 0.5053359386742823, + "flos": 20891612390400.0, + "grad_norm": 1.5475179634828664, + "language_loss": 0.75802314, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.83500576, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11987305, + "step": 8405, + "time_per_iteration": 2.5067524909973145 + }, + { + "auxiliary_loss_clip": 0.0643079, + "auxiliary_loss_mlp": 0.01267126, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01253882, + "epoch": 0.5053960619269503, + "flos": 37752499975680.0, + "grad_norm": 1.6248618607930092, + "language_loss": 0.73678941, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.81376863, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13250732, + "step": 8406, + "time_per_iteration": 2.8841259479522705 + }, + { + "auxiliary_loss_clip": 0.06424634, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01254662, + "epoch": 0.5054561851796182, + "flos": 23520617769600.0, + "grad_norm": 1.7553784713680058, + "language_loss": 0.77329504, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.85021389, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12579346, + "step": 8407, + "time_per_iteration": 2.5749242305755615 + }, + { + "auxiliary_loss_clip": 0.06430455, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.01255434, + "epoch": 0.5055163084322862, + "flos": 30418261223040.0, + "grad_norm": 1.7587183909270583, + "language_loss": 0.63584411, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.71282065, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.11767578, + "step": 8408, + "time_per_iteration": 2.5872433185577393 + }, + { + "auxiliary_loss_clip": 0.06432275, + "auxiliary_loss_mlp": 0.01267048, + "balance_loss_clip": 0.06279387, + "balance_loss_mlp": 0.01253387, + "epoch": 0.5055764316849541, + "flos": 20264720469120.0, + "grad_norm": 2.4280351300793086, + "language_loss": 0.63813823, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.71513146, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.13659668, + "step": 8409, + "time_per_iteration": 2.5165858268737793 + }, + { + "auxiliary_loss_clip": 0.064235, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273322, + "balance_loss_mlp": 0.01254895, + "epoch": 0.5056365549376222, + "flos": 26078611213440.0, + "grad_norm": 1.3852804971458688, + "language_loss": 0.71039546, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.78729057, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11132812, + "step": 8410, + "time_per_iteration": 2.594809055328369 + }, + { + "auxiliary_loss_clip": 0.0643055, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.0627602, + "balance_loss_mlp": 0.01254437, + "epoch": 0.5056966781902901, + "flos": 19284739441920.0, + "grad_norm": 1.6144456520966346, + "language_loss": 0.79591584, + "learning_rate": 2.060162752653113e-06, + "loss": 0.87289482, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12921143, + "step": 8411, + "time_per_iteration": 2.53426194190979 + }, + { + "auxiliary_loss_clip": 0.06433219, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06276312, + "balance_loss_mlp": 0.01254979, + "epoch": 0.5057568014429581, + "flos": 21329507427840.0, + "grad_norm": 1.7389096144894618, + "language_loss": 0.81907368, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.89609325, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13757324, + "step": 8412, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06429601, + "auxiliary_loss_mlp": 0.01270568, + "balance_loss_clip": 0.0627761, + "balance_loss_mlp": 0.01258134, + "epoch": 0.505816924695626, + "flos": 17499351369600.0, + "grad_norm": 1.7713461187517285, + "language_loss": 0.80336094, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.88036257, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12438965, + "step": 8413, + "time_per_iteration": 2.524210214614868 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.01274079, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01260328, + "epoch": 0.505877047948294, + "flos": 21148434754560.0, + "grad_norm": 1.7829708596435327, + "language_loss": 0.80812234, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.885144, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1373291, + "step": 8414, + "time_per_iteration": 2.5200514793395996 + }, + { + "auxiliary_loss_clip": 0.06426316, + "auxiliary_loss_mlp": 0.01270081, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01257468, + "epoch": 0.5059371712009619, + "flos": 36357824292480.0, + "grad_norm": 2.3266509400680935, + "language_loss": 0.62741381, + "learning_rate": 2.058605592832528e-06, + "loss": 0.70437777, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12615967, + "step": 8415, + "time_per_iteration": 2.676204204559326 + }, + { + "auxiliary_loss_clip": 0.06428116, + "auxiliary_loss_mlp": 0.01272149, + "balance_loss_clip": 0.06274984, + "balance_loss_mlp": 0.01259882, + "epoch": 0.50599729445363, + "flos": 22679809574400.0, + "grad_norm": 1.4983327127759412, + "language_loss": 0.82398355, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.90098619, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12261963, + "step": 8416, + "time_per_iteration": 2.540487289428711 + }, + { + "auxiliary_loss_clip": 0.06427394, + "auxiliary_loss_mlp": 0.01269018, + "balance_loss_clip": 0.06278178, + "balance_loss_mlp": 0.01257705, + "epoch": 0.5060574177062979, + "flos": 22754553016320.0, + "grad_norm": 1.8321417063208305, + "language_loss": 0.79700905, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.87397313, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11328125, + "step": 8417, + "time_per_iteration": 2.5462777614593506 + }, + { + "auxiliary_loss_clip": 0.06425334, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06277245, + "balance_loss_mlp": 0.01256875, + "epoch": 0.5061175409589659, + "flos": 21659689641600.0, + "grad_norm": 1.7824010317095476, + "language_loss": 0.63313794, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.71007824, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11816406, + "step": 8418, + "time_per_iteration": 2.5203146934509277 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01270126, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01257877, + "epoch": 0.5061776642116339, + "flos": 21622653336960.0, + "grad_norm": 1.6210660838966935, + "language_loss": 0.77937323, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.85639775, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12249756, + "step": 8419, + "time_per_iteration": 2.549057722091675 + }, + { + "auxiliary_loss_clip": 0.06433055, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01259955, + "epoch": 0.5062377874643018, + "flos": 24433276440960.0, + "grad_norm": 1.7091767496398438, + "language_loss": 0.77142859, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.8484863, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12762451, + "step": 8420, + "time_per_iteration": 2.533263921737671 + }, + { + "auxiliary_loss_clip": 0.06430572, + "auxiliary_loss_mlp": 0.0127647, + "balance_loss_clip": 0.06276705, + "balance_loss_mlp": 0.01264311, + "epoch": 0.5062979107169698, + "flos": 22530322690560.0, + "grad_norm": 1.6514243222666503, + "language_loss": 0.77777469, + "learning_rate": 2.056269786726999e-06, + "loss": 0.85484511, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.121521, + "step": 8421, + "time_per_iteration": 2.535022497177124 + }, + { + "auxiliary_loss_clip": 0.06429385, + "auxiliary_loss_mlp": 0.01273249, + "balance_loss_clip": 0.06276778, + "balance_loss_mlp": 0.01261895, + "epoch": 0.5063580339696377, + "flos": 24578947964160.0, + "grad_norm": 1.4350674480860695, + "language_loss": 0.67189109, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.74891746, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11352539, + "step": 8422, + "time_per_iteration": 2.555051803588867 + }, + { + "auxiliary_loss_clip": 0.064266, + "auxiliary_loss_mlp": 0.01271001, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01259241, + "epoch": 0.5064181572223058, + "flos": 22601837750400.0, + "grad_norm": 1.5827559778751017, + "language_loss": 0.81783563, + "learning_rate": 2.05549116746431e-06, + "loss": 0.89481163, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11755371, + "step": 8423, + "time_per_iteration": 2.606844663619995 + }, + { + "auxiliary_loss_clip": 0.06427386, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.01256411, + "epoch": 0.5064782804749737, + "flos": 26002148762880.0, + "grad_norm": 2.1055931359181086, + "language_loss": 0.74535251, + "learning_rate": 2.055101854669237e-06, + "loss": 0.82231486, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12451172, + "step": 8424, + "time_per_iteration": 2.5353689193725586 + }, + { + "auxiliary_loss_clip": 0.06427233, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.0125268, + "epoch": 0.5065384037276417, + "flos": 28561358090880.0, + "grad_norm": 1.333495130602937, + "language_loss": 0.71332014, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.79024142, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12231445, + "step": 8425, + "time_per_iteration": 2.624431610107422 + }, + { + "auxiliary_loss_clip": 0.06429943, + "auxiliary_loss_mlp": 0.01268875, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01257187, + "epoch": 0.5065985269803096, + "flos": 22972620067200.0, + "grad_norm": 1.8777832339890803, + "language_loss": 0.78901541, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.86600357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11700439, + "step": 8426, + "time_per_iteration": 3.936661958694458 + }, + { + "auxiliary_loss_clip": 0.06432042, + "auxiliary_loss_mlp": 0.0127276, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01260768, + "epoch": 0.5066586502329776, + "flos": 21613680950400.0, + "grad_norm": 2.2511428758914325, + "language_loss": 0.7803759, + "learning_rate": 2.053933903806265e-06, + "loss": 0.85742396, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12005615, + "step": 8427, + "time_per_iteration": 2.5481557846069336 + }, + { + "auxiliary_loss_clip": 0.06424822, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01255268, + "epoch": 0.5067187734856455, + "flos": 20346214164480.0, + "grad_norm": 1.5242931798978783, + "language_loss": 0.719284, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.79620224, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11737061, + "step": 8428, + "time_per_iteration": 2.5370116233825684 + }, + { + "auxiliary_loss_clip": 0.06427782, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5067788967383136, + "flos": 28848801922560.0, + "grad_norm": 1.7598513800416933, + "language_loss": 0.83218622, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.90915114, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.10717773, + "step": 8429, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06435312, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.06276707, + "balance_loss_mlp": 0.01254013, + "epoch": 0.5068390199909815, + "flos": 32457997964160.0, + "grad_norm": 4.868596583088969, + "language_loss": 0.7373606, + "learning_rate": 2.052765934536682e-06, + "loss": 0.8143819, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12805176, + "step": 8430, + "time_per_iteration": 4.062525749206543 + }, + { + "auxiliary_loss_clip": 0.06428299, + "auxiliary_loss_mlp": 0.01270046, + "balance_loss_clip": 0.06275186, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5068991432436495, + "flos": 23152896126720.0, + "grad_norm": 1.801463516744859, + "language_loss": 0.76942408, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.84640753, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1171875, + "step": 8431, + "time_per_iteration": 2.535198211669922 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.0125488, + "epoch": 0.5069592664963174, + "flos": 19941917414400.0, + "grad_norm": 1.5385752235820749, + "language_loss": 0.72917402, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.80610371, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11901855, + "step": 8432, + "time_per_iteration": 2.5343048572540283 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06253257, + "balance_loss_mlp": 0.01250496, + "epoch": 0.5070193897489854, + "flos": 65812539888000.0, + "grad_norm": 0.7543358557352665, + "language_loss": 0.63621199, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.71192724, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.01824951, + "step": 8433, + "time_per_iteration": 3.1825270652770996 + }, + { + "auxiliary_loss_clip": 0.06432432, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06279546, + "balance_loss_mlp": 0.01254414, + "epoch": 0.5070795130016534, + "flos": 17281158537600.0, + "grad_norm": 2.2002665512489505, + "language_loss": 0.77719331, + "learning_rate": 2.051208614233681e-06, + "loss": 0.85418689, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12512207, + "step": 8434, + "time_per_iteration": 2.51298451423645 + }, + { + "auxiliary_loss_clip": 0.06435563, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01253047, + "epoch": 0.5071396362543213, + "flos": 21076416570240.0, + "grad_norm": 1.9257186196996396, + "language_loss": 0.7107513, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.78775942, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12207031, + "step": 8435, + "time_per_iteration": 3.9952967166900635 + }, + { + "auxiliary_loss_clip": 0.06431434, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06278223, + "balance_loss_mlp": 0.01256646, + "epoch": 0.5071997595069894, + "flos": 23150841701760.0, + "grad_norm": 1.974114732671287, + "language_loss": 0.72623628, + "learning_rate": 2.050429942372112e-06, + "loss": 0.80324566, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.128479, + "step": 8436, + "time_per_iteration": 2.5126936435699463 + }, + { + "auxiliary_loss_clip": 0.06431168, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06278354, + "balance_loss_mlp": 0.01253449, + "epoch": 0.5072598827596573, + "flos": 22753756402560.0, + "grad_norm": 2.390958224451536, + "language_loss": 0.84374195, + "learning_rate": 2.050040603565483e-06, + "loss": 0.92071497, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12701416, + "step": 8437, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.06423598, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06273607, + "balance_loss_mlp": 0.01254128, + "epoch": 0.5073200060123253, + "flos": 22573102999680.0, + "grad_norm": 1.4207198809320167, + "language_loss": 0.80947453, + "learning_rate": 2.049651262861309e-06, + "loss": 0.88636929, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11749268, + "step": 8438, + "time_per_iteration": 2.5992414951324463 + }, + { + "auxiliary_loss_clip": 0.06431951, + "auxiliary_loss_mlp": 0.01267455, + "balance_loss_clip": 0.06277303, + "balance_loss_mlp": 0.0125458, + "epoch": 0.5073801292649932, + "flos": 25812481046400.0, + "grad_norm": 1.639362892711676, + "language_loss": 0.7992267, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.87622082, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12872314, + "step": 8439, + "time_per_iteration": 2.5635995864868164 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06272503, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5074402525176612, + "flos": 25380916992000.0, + "grad_norm": 1.6123120964481592, + "language_loss": 0.71044374, + "learning_rate": 2.048872575819383e-06, + "loss": 0.78732479, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11560059, + "step": 8440, + "time_per_iteration": 2.54082989692688 + }, + { + "auxiliary_loss_clip": 0.0642738, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01254278, + "epoch": 0.5075003757703291, + "flos": 26071064346240.0, + "grad_norm": 1.625029424987906, + "language_loss": 0.71058178, + "learning_rate": 2.048483229511158e-06, + "loss": 0.78751576, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11743164, + "step": 8441, + "time_per_iteration": 2.5597851276397705 + }, + { + "auxiliary_loss_clip": 0.06432067, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01255608, + "epoch": 0.5075604990229972, + "flos": 21841936272000.0, + "grad_norm": 1.6251927502787415, + "language_loss": 0.64299369, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.71999681, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12634277, + "step": 8442, + "time_per_iteration": 3.9658992290496826 + }, + { + "auxiliary_loss_clip": 0.06421914, + "auxiliary_loss_mlp": 0.01270692, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01259475, + "epoch": 0.5076206222756651, + "flos": 31986923909760.0, + "grad_norm": 1.4468343781265969, + "language_loss": 0.71796834, + "learning_rate": 2.047704531394006e-06, + "loss": 0.7948944, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 8443, + "time_per_iteration": 2.6133296489715576 + }, + { + "auxiliary_loss_clip": 0.06430129, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5076807455283331, + "flos": 36913033445760.0, + "grad_norm": 1.2663152678698668, + "language_loss": 0.62379253, + "learning_rate": 2.047315179614607e-06, + "loss": 0.70077264, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12390137, + "step": 8444, + "time_per_iteration": 2.670844554901123 + }, + { + "auxiliary_loss_clip": 0.06426448, + "auxiliary_loss_mlp": 0.01266149, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01255158, + "epoch": 0.507740868781001, + "flos": 29870263520640.0, + "grad_norm": 1.5635527032998127, + "language_loss": 0.64163882, + "learning_rate": 2.046925826041012e-06, + "loss": 0.71856481, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.10992432, + "step": 8445, + "time_per_iteration": 2.564972162246704 + }, + { + "auxiliary_loss_clip": 0.06326441, + "auxiliary_loss_mlp": 0.01258393, + "balance_loss_clip": 0.06260093, + "balance_loss_mlp": 0.0125657, + "epoch": 0.507800992033669, + "flos": 61935872014080.0, + "grad_norm": 0.8045039829713045, + "language_loss": 0.61588788, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.69173622, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01817322, + "step": 8446, + "time_per_iteration": 3.1747779846191406 + }, + { + "auxiliary_loss_clip": 0.06424413, + "auxiliary_loss_mlp": 0.01266643, + "balance_loss_clip": 0.06272733, + "balance_loss_mlp": 0.01254394, + "epoch": 0.507861115286337, + "flos": 20706137377920.0, + "grad_norm": 4.618603604158377, + "language_loss": 0.80737472, + "learning_rate": 2.04614711357029e-06, + "loss": 0.88428527, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12243652, + "step": 8447, + "time_per_iteration": 2.510443687438965 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01255775, + "epoch": 0.507921238539005, + "flos": 30854982303360.0, + "grad_norm": 1.2702922663182385, + "language_loss": 0.70493698, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.78183186, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11456299, + "step": 8448, + "time_per_iteration": 2.6021034717559814 + }, + { + "auxiliary_loss_clip": 0.06427675, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.0627776, + "balance_loss_mlp": 0.0125745, + "epoch": 0.507981361791673, + "flos": 35709031728000.0, + "grad_norm": 1.3111664343686333, + "language_loss": 0.72171003, + "learning_rate": 2.045368394099955e-06, + "loss": 0.79867339, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11199951, + "step": 8449, + "time_per_iteration": 2.6752874851226807 + }, + { + "auxiliary_loss_clip": 0.06426987, + "auxiliary_loss_mlp": 0.01268113, + "balance_loss_clip": 0.06274859, + "balance_loss_mlp": 0.0125686, + "epoch": 0.5080414850443409, + "flos": 27168694905600.0, + "grad_norm": 1.3940572087719376, + "language_loss": 0.73039591, + "learning_rate": 2.044979031776844e-06, + "loss": 0.80734688, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11254883, + "step": 8450, + "time_per_iteration": 2.6428375244140625 + }, + { + "auxiliary_loss_clip": 0.06430449, + "auxiliary_loss_mlp": 0.0127298, + "balance_loss_clip": 0.06278583, + "balance_loss_mlp": 0.01261148, + "epoch": 0.5081016082970089, + "flos": 27091855111680.0, + "grad_norm": 1.6054602673211236, + "language_loss": 0.7744205, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.85145479, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1184082, + "step": 8451, + "time_per_iteration": 2.6066558361053467 + }, + { + "auxiliary_loss_clip": 0.06429529, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276423, + "balance_loss_mlp": 0.01254531, + "epoch": 0.5081617315496768, + "flos": 22863104380800.0, + "grad_norm": 1.825930217148951, + "language_loss": 0.85374677, + "learning_rate": 2.044200302028559e-06, + "loss": 0.930709, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12158203, + "step": 8452, + "time_per_iteration": 2.5062003135681152 + }, + { + "auxiliary_loss_clip": 0.06431726, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06276073, + "balance_loss_mlp": 0.01254716, + "epoch": 0.5082218548023448, + "flos": 16286167630080.0, + "grad_norm": 2.3752555926719343, + "language_loss": 0.77806371, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.85505283, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12463379, + "step": 8453, + "time_per_iteration": 2.4981954097747803 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.0125774, + "epoch": 0.5082819780550127, + "flos": 24467419779840.0, + "grad_norm": 1.5957908763151711, + "language_loss": 0.76932752, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.84632009, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1072998, + "step": 8454, + "time_per_iteration": 2.6134133338928223 + }, + { + "auxiliary_loss_clip": 0.06431732, + "auxiliary_loss_mlp": 0.01271277, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01259118, + "epoch": 0.5083421013076808, + "flos": 23409844272000.0, + "grad_norm": 1.4822981638740835, + "language_loss": 0.89621413, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.97324431, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.1217041, + "step": 8455, + "time_per_iteration": 2.6085920333862305 + }, + { + "auxiliary_loss_clip": 0.06434034, + "auxiliary_loss_mlp": 0.01274373, + "balance_loss_clip": 0.06275303, + "balance_loss_mlp": 0.01260831, + "epoch": 0.5084022245603487, + "flos": 23878528485120.0, + "grad_norm": 1.6442671341978696, + "language_loss": 0.62785953, + "learning_rate": 2.042642822537149e-06, + "loss": 0.7049436, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13555908, + "step": 8456, + "time_per_iteration": 2.5377745628356934 + }, + { + "auxiliary_loss_clip": 0.06329988, + "auxiliary_loss_mlp": 0.01255905, + "balance_loss_clip": 0.06263152, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5084623478130167, + "flos": 62891352921600.0, + "grad_norm": 0.8103581861082657, + "language_loss": 0.62548244, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.70134139, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.02194214, + "step": 8457, + "time_per_iteration": 3.0378763675689697 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01271319, + "balance_loss_clip": 0.06280852, + "balance_loss_mlp": 0.01258337, + "epoch": 0.5085224710656846, + "flos": 22352688034560.0, + "grad_norm": 1.5276658426580998, + "language_loss": 0.67559206, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.75267512, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12976074, + "step": 8458, + "time_per_iteration": 2.5329530239105225 + }, + { + "auxiliary_loss_clip": 0.06432781, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0627652, + "balance_loss_mlp": 0.01260015, + "epoch": 0.5085825943183526, + "flos": 26073202625280.0, + "grad_norm": 1.618055128351248, + "language_loss": 0.77449083, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.85154486, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1260376, + "step": 8459, + "time_per_iteration": 2.5590224266052246 + }, + { + "auxiliary_loss_clip": 0.06437792, + "auxiliary_loss_mlp": 0.01271084, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01258132, + "epoch": 0.5086427175710206, + "flos": 17426494644480.0, + "grad_norm": 2.2202109072156664, + "language_loss": 0.81101096, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.88809973, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12945557, + "step": 8460, + "time_per_iteration": 2.4797065258026123 + }, + { + "auxiliary_loss_clip": 0.06432672, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01259968, + "epoch": 0.5087028408236886, + "flos": 20638102262400.0, + "grad_norm": 1.6011145053716882, + "language_loss": 0.69150776, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.76856101, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12677002, + "step": 8461, + "time_per_iteration": 2.5423507690429688 + }, + { + "auxiliary_loss_clip": 0.06423958, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06275716, + "balance_loss_mlp": 0.01258052, + "epoch": 0.5087629640763566, + "flos": 25600996540800.0, + "grad_norm": 1.5704547594862186, + "language_loss": 0.76788783, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.84482986, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12200928, + "step": 8462, + "time_per_iteration": 2.5558974742889404 + }, + { + "auxiliary_loss_clip": 0.06431352, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06279621, + "balance_loss_mlp": 0.01255251, + "epoch": 0.5088230873290245, + "flos": 13266743351040.0, + "grad_norm": 1.98943246577739, + "language_loss": 0.81940925, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.89639473, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11938477, + "step": 8463, + "time_per_iteration": 2.5092854499816895 + }, + { + "auxiliary_loss_clip": 0.06429717, + "auxiliary_loss_mlp": 0.01268295, + "balance_loss_clip": 0.06277439, + "balance_loss_mlp": 0.01255974, + "epoch": 0.5088832105816925, + "flos": 20048959405440.0, + "grad_norm": 4.395577464341562, + "language_loss": 0.76639092, + "learning_rate": 2.039527786882341e-06, + "loss": 0.84337103, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12310791, + "step": 8464, + "time_per_iteration": 2.5100886821746826 + }, + { + "auxiliary_loss_clip": 0.06332754, + "auxiliary_loss_mlp": 0.01251908, + "balance_loss_clip": 0.06266724, + "balance_loss_mlp": 0.01250196, + "epoch": 0.5089433338343604, + "flos": 67445072184960.0, + "grad_norm": 0.674227101372006, + "language_loss": 0.59172922, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.66757584, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.01716614, + "step": 8465, + "time_per_iteration": 3.288703441619873 + }, + { + "auxiliary_loss_clip": 0.06429654, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06277246, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5090034570870284, + "flos": 22716845879040.0, + "grad_norm": 1.7766724873518385, + "language_loss": 0.80341208, + "learning_rate": 2.038749012684354e-06, + "loss": 0.88037896, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12005615, + "step": 8466, + "time_per_iteration": 3.9034652709960938 + }, + { + "auxiliary_loss_clip": 0.06428038, + "auxiliary_loss_mlp": 0.01262494, + "balance_loss_clip": 0.06276771, + "balance_loss_mlp": 0.01250603, + "epoch": 0.5090635803396963, + "flos": 20451537146880.0, + "grad_norm": 1.506058765425311, + "language_loss": 0.78925973, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.86616498, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11895752, + "step": 8467, + "time_per_iteration": 2.483701229095459 + }, + { + "auxiliary_loss_clip": 0.06425558, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01257565, + "epoch": 0.5091237035923644, + "flos": 23775637271040.0, + "grad_norm": 1.593164773968791, + "language_loss": 0.74572229, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.82266819, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11468506, + "step": 8468, + "time_per_iteration": 2.550657033920288 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01264118, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01252108, + "epoch": 0.5091838268450323, + "flos": 18332990040960.0, + "grad_norm": 1.7522760366327397, + "language_loss": 0.78574747, + "learning_rate": 2.03758084040404e-06, + "loss": 0.86264038, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12011719, + "step": 8469, + "time_per_iteration": 2.4776134490966797 + }, + { + "auxiliary_loss_clip": 0.06431125, + "auxiliary_loss_mlp": 0.012685, + "balance_loss_clip": 0.0627888, + "balance_loss_mlp": 0.01256526, + "epoch": 0.5092439500977003, + "flos": 29064982256640.0, + "grad_norm": 1.429622552318455, + "language_loss": 0.6959703, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7729665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11968994, + "step": 8470, + "time_per_iteration": 4.06356954574585 + }, + { + "auxiliary_loss_clip": 0.06432179, + "auxiliary_loss_mlp": 0.01268896, + "balance_loss_clip": 0.06276524, + "balance_loss_mlp": 0.01256278, + "epoch": 0.5093040733503682, + "flos": 13559134573440.0, + "grad_norm": 1.739958995441318, + "language_loss": 0.73736298, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.81437373, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12615967, + "step": 8471, + "time_per_iteration": 2.5252416133880615 + }, + { + "auxiliary_loss_clip": 0.06330768, + "auxiliary_loss_mlp": 0.01255323, + "balance_loss_clip": 0.06264758, + "balance_loss_mlp": 0.01253313, + "epoch": 0.5093641966030362, + "flos": 68927838837120.0, + "grad_norm": 0.738097810584446, + "language_loss": 0.58042324, + "learning_rate": 2.036412655298103e-06, + "loss": 0.65628415, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.02009583, + "step": 8472, + "time_per_iteration": 3.1610372066497803 + }, + { + "auxiliary_loss_clip": 0.06430018, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5094243198557042, + "flos": 21587545676160.0, + "grad_norm": 1.8344067804800992, + "language_loss": 0.69000626, + "learning_rate": 2.03602325748156e-06, + "loss": 0.76696956, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11358643, + "step": 8473, + "time_per_iteration": 2.5834267139434814 + }, + { + "auxiliary_loss_clip": 0.06430315, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.06279565, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5094844431083722, + "flos": 28848382652160.0, + "grad_norm": 2.5664905714857422, + "language_loss": 0.85103536, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.92801011, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12011719, + "step": 8474, + "time_per_iteration": 2.5577685832977295 + }, + { + "auxiliary_loss_clip": 0.06432322, + "auxiliary_loss_mlp": 0.0126557, + "balance_loss_clip": 0.06278027, + "balance_loss_mlp": 0.01253488, + "epoch": 0.5095445663610402, + "flos": 14981454904320.0, + "grad_norm": 1.910358455820602, + "language_loss": 0.64868319, + "learning_rate": 2.035244457765222e-06, + "loss": 0.72566211, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12091064, + "step": 8475, + "time_per_iteration": 3.9494359493255615 + }, + { + "auxiliary_loss_clip": 0.06435733, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5096046896137081, + "flos": 20783354515200.0, + "grad_norm": 2.1677913618760623, + "language_loss": 0.8248105, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.90185243, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12689209, + "step": 8476, + "time_per_iteration": 2.533986806869507 + }, + { + "auxiliary_loss_clip": 0.06432153, + "auxiliary_loss_mlp": 0.01267228, + "balance_loss_clip": 0.06275326, + "balance_loss_mlp": 0.01254628, + "epoch": 0.5096648128663761, + "flos": 23191735294080.0, + "grad_norm": 2.112211155301917, + "language_loss": 0.81339389, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.89038771, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12609863, + "step": 8477, + "time_per_iteration": 2.614363193511963 + }, + { + "auxiliary_loss_clip": 0.06429507, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.0627466, + "balance_loss_mlp": 0.01254741, + "epoch": 0.509724936119044, + "flos": 22315945219200.0, + "grad_norm": 1.7511302636686703, + "language_loss": 0.61918831, + "learning_rate": 2.034076248204082e-06, + "loss": 0.69616115, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13024902, + "step": 8478, + "time_per_iteration": 2.5054080486297607 + }, + { + "auxiliary_loss_clip": 0.06424017, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01256136, + "epoch": 0.509785059371712, + "flos": 26294372277120.0, + "grad_norm": 1.8013233320362476, + "language_loss": 0.66670853, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.74362785, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 8479, + "time_per_iteration": 2.5773558616638184 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01266645, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01254795, + "epoch": 0.50984518262438, + "flos": 22970942985600.0, + "grad_norm": 1.5048945656562989, + "language_loss": 0.69523573, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.77217555, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1184082, + "step": 8480, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06433358, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06277278, + "balance_loss_mlp": 0.01252908, + "epoch": 0.509905305877048, + "flos": 26220551230080.0, + "grad_norm": 1.695627830792001, + "language_loss": 0.79513025, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.87211168, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11865234, + "step": 8481, + "time_per_iteration": 3.9862852096557617 + }, + { + "auxiliary_loss_clip": 0.06423856, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06274414, + "balance_loss_mlp": 0.01255186, + "epoch": 0.5099654291297159, + "flos": 20346381872640.0, + "grad_norm": 1.4463685523965593, + "language_loss": 0.83447778, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.91138661, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1184082, + "step": 8482, + "time_per_iteration": 2.539057970046997 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257682, + "epoch": 0.5100255523823839, + "flos": 29061711947520.0, + "grad_norm": 1.7174746607832896, + "language_loss": 0.85923511, + "learning_rate": 2.032129206622238e-06, + "loss": 0.93625677, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12243652, + "step": 8483, + "time_per_iteration": 2.5567803382873535 + }, + { + "auxiliary_loss_clip": 0.06428108, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256352, + "epoch": 0.5100856756350518, + "flos": 22462539137280.0, + "grad_norm": 3.7192784343186367, + "language_loss": 0.83011222, + "learning_rate": 2.031739794591775e-06, + "loss": 0.90707278, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.11590576, + "step": 8484, + "time_per_iteration": 2.50913143157959 + }, + { + "auxiliary_loss_clip": 0.0642792, + "auxiliary_loss_mlp": 0.0126741, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5101457988877198, + "flos": 19176942764160.0, + "grad_norm": 1.8545423824290383, + "language_loss": 0.81929463, + "learning_rate": 2.031350381357736e-06, + "loss": 0.89624798, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12463379, + "step": 8485, + "time_per_iteration": 2.479165554046631 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266312, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01254522, + "epoch": 0.5102059221403878, + "flos": 14871645728640.0, + "grad_norm": 1.8580884452241668, + "language_loss": 0.73778898, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.81466365, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11791992, + "step": 8486, + "time_per_iteration": 2.502035140991211 + }, + { + "auxiliary_loss_clip": 0.06432486, + "auxiliary_loss_mlp": 0.01268204, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5102660453930558, + "flos": 22966876062720.0, + "grad_norm": 1.455931130318143, + "language_loss": 0.6993084, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.77631527, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13092041, + "step": 8487, + "time_per_iteration": 2.5022764205932617 + }, + { + "auxiliary_loss_clip": 0.06425266, + "auxiliary_loss_mlp": 0.01265042, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01252072, + "epoch": 0.5103261686457238, + "flos": 23156082581760.0, + "grad_norm": 2.025146562514191, + "language_loss": 0.72757244, + "learning_rate": 2.030182134581827e-06, + "loss": 0.80447549, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12963867, + "step": 8488, + "time_per_iteration": 2.5181195735931396 + }, + { + "auxiliary_loss_clip": 0.06435129, + "auxiliary_loss_mlp": 0.01271711, + "balance_loss_clip": 0.06278089, + "balance_loss_mlp": 0.01259861, + "epoch": 0.5103862918983917, + "flos": 14324444640000.0, + "grad_norm": 1.9274143081394266, + "language_loss": 0.69714773, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.77421612, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.11846924, + "step": 8489, + "time_per_iteration": 2.491626739501953 + }, + { + "auxiliary_loss_clip": 0.06427855, + "auxiliary_loss_mlp": 0.01262645, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01251231, + "epoch": 0.5104464151510597, + "flos": 25855638698880.0, + "grad_norm": 1.7641928011440773, + "language_loss": 0.73334658, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.81025159, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.11407471, + "step": 8490, + "time_per_iteration": 2.6192476749420166 + }, + { + "auxiliary_loss_clip": 0.06422485, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01256628, + "epoch": 0.5105065384037276, + "flos": 21659354225280.0, + "grad_norm": 1.995020059533993, + "language_loss": 0.8080864, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.8849923, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8491, + "time_per_iteration": 2.5444910526275635 + }, + { + "auxiliary_loss_clip": 0.0642098, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06274496, + "balance_loss_mlp": 0.01257089, + "epoch": 0.5105666616563956, + "flos": 22498066068480.0, + "grad_norm": 2.247071959069697, + "language_loss": 0.79263282, + "learning_rate": 2.028624456259728e-06, + "loss": 0.86953026, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11676025, + "step": 8492, + "time_per_iteration": 2.656888008117676 + }, + { + "auxiliary_loss_clip": 0.06433547, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01257838, + "epoch": 0.5106267849090635, + "flos": 22462371429120.0, + "grad_norm": 1.9309641209432507, + "language_loss": 0.77830237, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.85534871, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13256836, + "step": 8493, + "time_per_iteration": 2.550326347351074 + }, + { + "auxiliary_loss_clip": 0.06427996, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06275648, + "balance_loss_mlp": 0.01252879, + "epoch": 0.5106869081617316, + "flos": 23553335589120.0, + "grad_norm": 1.7342765336142327, + "language_loss": 0.84044284, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.91737616, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12457275, + "step": 8494, + "time_per_iteration": 2.582463026046753 + }, + { + "auxiliary_loss_clip": 0.06430838, + "auxiliary_loss_mlp": 0.0126671, + "balance_loss_clip": 0.0627555, + "balance_loss_mlp": 0.0125492, + "epoch": 0.5107470314143995, + "flos": 26799547743360.0, + "grad_norm": 2.0062643152671877, + "language_loss": 0.79773927, + "learning_rate": 2.027456186069326e-06, + "loss": 0.87471473, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11798096, + "step": 8495, + "time_per_iteration": 2.5472564697265625 + }, + { + "auxiliary_loss_clip": 0.06425454, + "auxiliary_loss_mlp": 0.01268533, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01256308, + "epoch": 0.5108071546670675, + "flos": 25746877699200.0, + "grad_norm": 1.417654874659872, + "language_loss": 0.78675163, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.86369145, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12231445, + "step": 8496, + "time_per_iteration": 2.5841569900512695 + }, + { + "auxiliary_loss_clip": 0.06422253, + "auxiliary_loss_mlp": 0.01267746, + "balance_loss_clip": 0.06273818, + "balance_loss_mlp": 0.01255998, + "epoch": 0.5108672779197354, + "flos": 18703478868480.0, + "grad_norm": 1.866540646775448, + "language_loss": 0.7912823, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8681823, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11755371, + "step": 8497, + "time_per_iteration": 2.5111966133117676 + }, + { + "auxiliary_loss_clip": 0.06429158, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06277271, + "balance_loss_mlp": 0.01252277, + "epoch": 0.5109274011724034, + "flos": 26695482572160.0, + "grad_norm": 1.6666059931479484, + "language_loss": 0.81941032, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.89634144, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11682129, + "step": 8498, + "time_per_iteration": 2.608631134033203 + }, + { + "auxiliary_loss_clip": 0.06424002, + "auxiliary_loss_mlp": 0.01271992, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01260267, + "epoch": 0.5109875244250714, + "flos": 22790666926080.0, + "grad_norm": 1.6923312462183162, + "language_loss": 0.71301198, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.78997189, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11724854, + "step": 8499, + "time_per_iteration": 2.5150094032287598 + }, + { + "auxiliary_loss_clip": 0.06427284, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255003, + "epoch": 0.5110476476777394, + "flos": 35596958492160.0, + "grad_norm": 1.3954443671639698, + "language_loss": 0.72611153, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.80305135, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11694336, + "step": 8500, + "time_per_iteration": 2.633239269256592 + }, + { + "auxiliary_loss_clip": 0.06435662, + "auxiliary_loss_mlp": 0.01270607, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01256803, + "epoch": 0.5111077709304074, + "flos": 19286751939840.0, + "grad_norm": 2.7349973685574973, + "language_loss": 0.63562721, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.71268988, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13800049, + "step": 8501, + "time_per_iteration": 2.5091230869293213 + }, + { + "auxiliary_loss_clip": 0.06434844, + "auxiliary_loss_mlp": 0.01273353, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01260848, + "epoch": 0.5111678941830753, + "flos": 20674551588480.0, + "grad_norm": 1.8816899756355796, + "language_loss": 0.88057411, + "learning_rate": 2.024730186540907e-06, + "loss": 0.95765609, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12506104, + "step": 8502, + "time_per_iteration": 2.517728090286255 + }, + { + "auxiliary_loss_clip": 0.06425811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01253663, + "epoch": 0.5112280174357433, + "flos": 26295336599040.0, + "grad_norm": 1.4524091598864723, + "language_loss": 0.82627225, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.90318477, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11779785, + "step": 8503, + "time_per_iteration": 2.711451768875122 + }, + { + "auxiliary_loss_clip": 0.06333953, + "auxiliary_loss_mlp": 0.01255603, + "balance_loss_clip": 0.06268184, + "balance_loss_mlp": 0.0125384, + "epoch": 0.5112881406884112, + "flos": 59490706492800.0, + "grad_norm": 0.8512772291593351, + "language_loss": 0.63800937, + "learning_rate": 2.023951320871339e-06, + "loss": 0.71390492, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.65966797, + "router_z_loss_mlp": 0.01766968, + "step": 8504, + "time_per_iteration": 3.1690919399261475 + }, + { + "auxiliary_loss_clip": 0.06425914, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06275845, + "balance_loss_mlp": 0.01253576, + "epoch": 0.5113482639410792, + "flos": 26476073856000.0, + "grad_norm": 1.7986544100736102, + "language_loss": 0.84377933, + "learning_rate": 2.023561886666816e-06, + "loss": 0.92069674, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12261963, + "step": 8505, + "time_per_iteration": 2.5755858421325684 + }, + { + "auxiliary_loss_clip": 0.0643035, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5114083871937471, + "flos": 29903190975360.0, + "grad_norm": 1.7295208629505698, + "language_loss": 0.75707996, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.83404166, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11590576, + "step": 8506, + "time_per_iteration": 3.947927713394165 + }, + { + "auxiliary_loss_clip": 0.0642788, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.06276722, + "balance_loss_mlp": 0.01255303, + "epoch": 0.5114685104464152, + "flos": 24321161278080.0, + "grad_norm": 1.7165713389532073, + "language_loss": 0.58250427, + "learning_rate": 2.022783015592131e-06, + "loss": 0.65946829, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.13214111, + "step": 8507, + "time_per_iteration": 2.5460915565490723 + }, + { + "auxiliary_loss_clip": 0.06432099, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01257023, + "epoch": 0.5115286336990831, + "flos": 17024965079040.0, + "grad_norm": 1.7959155859668763, + "language_loss": 0.8588531, + "learning_rate": 2.022393578751503e-06, + "loss": 0.93586934, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12475586, + "step": 8508, + "time_per_iteration": 2.501931667327881 + }, + { + "auxiliary_loss_clip": 0.06430113, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06279224, + "balance_loss_mlp": 0.012544, + "epoch": 0.5115887569517511, + "flos": 23666121584640.0, + "grad_norm": 1.985741338533524, + "language_loss": 0.72740698, + "learning_rate": 2.022004141061709e-06, + "loss": 0.80437851, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12640381, + "step": 8509, + "time_per_iteration": 3.9570322036743164 + }, + { + "auxiliary_loss_clip": 0.06425552, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06277531, + "balance_loss_mlp": 0.01254476, + "epoch": 0.511648880204419, + "flos": 16112725678080.0, + "grad_norm": 1.6522242028614569, + "language_loss": 0.76532018, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.84222525, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10479736, + "step": 8510, + "time_per_iteration": 2.5000293254852295 + }, + { + "auxiliary_loss_clip": 0.06424148, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.01256402, + "epoch": 0.511709003457087, + "flos": 32643221414400.0, + "grad_norm": 1.8483097722803792, + "language_loss": 0.71295965, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.78987575, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 8511, + "time_per_iteration": 2.5970981121063232 + }, + { + "auxiliary_loss_clip": 0.06426742, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125404, + "epoch": 0.511769126709755, + "flos": 21768492568320.0, + "grad_norm": 1.8966780464465567, + "language_loss": 0.67139721, + "learning_rate": 2.020835823045001e-06, + "loss": 0.74832094, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8512, + "time_per_iteration": 2.5369138717651367 + }, + { + "auxiliary_loss_clip": 0.06426971, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253588, + "epoch": 0.511829249962423, + "flos": 23922231189120.0, + "grad_norm": 1.7695600544803753, + "language_loss": 0.67171764, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.7486496, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12628174, + "step": 8513, + "time_per_iteration": 2.517648220062256 + }, + { + "auxiliary_loss_clip": 0.06423096, + "auxiliary_loss_mlp": 0.01268209, + "balance_loss_clip": 0.06275445, + "balance_loss_mlp": 0.01255948, + "epoch": 0.511889373215091, + "flos": 23732856961920.0, + "grad_norm": 1.8747309224946216, + "language_loss": 0.68931103, + "learning_rate": 2.0200569403921e-06, + "loss": 0.76622409, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1227417, + "step": 8514, + "time_per_iteration": 3.969726085662842 + }, + { + "auxiliary_loss_clip": 0.06422693, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01254357, + "epoch": 0.5119494964677589, + "flos": 28119144568320.0, + "grad_norm": 1.955376754159203, + "language_loss": 0.66104603, + "learning_rate": 2.019667497917424e-06, + "loss": 0.7379272, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11065674, + "step": 8515, + "time_per_iteration": 2.586984872817993 + }, + { + "auxiliary_loss_clip": 0.06415779, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254754, + "epoch": 0.5120096197204269, + "flos": 24980225967360.0, + "grad_norm": 1.8485741123105555, + "language_loss": 0.76016974, + "learning_rate": 2.019278054696955e-06, + "loss": 0.83698404, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10894775, + "step": 8516, + "time_per_iteration": 2.5933895111083984 + }, + { + "auxiliary_loss_clip": 0.06425153, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.0627657, + "balance_loss_mlp": 0.01254136, + "epoch": 0.5120697429730948, + "flos": 17973863441280.0, + "grad_norm": 1.9611042257937292, + "language_loss": 0.78053069, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.85744041, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 8517, + "time_per_iteration": 2.4962363243103027 + }, + { + "auxiliary_loss_clip": 0.06430522, + "auxiliary_loss_mlp": 0.01271394, + "balance_loss_clip": 0.06276728, + "balance_loss_mlp": 0.01259211, + "epoch": 0.5121298662257628, + "flos": 23298651504000.0, + "grad_norm": 1.7759167489555023, + "language_loss": 0.74719632, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.82421547, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12164307, + "step": 8518, + "time_per_iteration": 2.5037240982055664 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256529, + "epoch": 0.5121899894784308, + "flos": 17316769322880.0, + "grad_norm": 1.687169580100827, + "language_loss": 0.78467947, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.86161083, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11706543, + "step": 8519, + "time_per_iteration": 2.524724006652832 + }, + { + "auxiliary_loss_clip": 0.06422982, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253016, + "epoch": 0.5122501127310988, + "flos": 24935978211840.0, + "grad_norm": 1.6239003664198155, + "language_loss": 0.79446238, + "learning_rate": 2.017720274652497e-06, + "loss": 0.87134135, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11907959, + "step": 8520, + "time_per_iteration": 2.522068500518799 + }, + { + "auxiliary_loss_clip": 0.06431363, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06276108, + "balance_loss_mlp": 0.01256151, + "epoch": 0.5123102359837667, + "flos": 18448878637440.0, + "grad_norm": 1.8569595834923718, + "language_loss": 0.81725198, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.89426088, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13366699, + "step": 8521, + "time_per_iteration": 3.956547498703003 + }, + { + "auxiliary_loss_clip": 0.06422685, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.0627308, + "balance_loss_mlp": 0.01253383, + "epoch": 0.5123703592364347, + "flos": 26691625284480.0, + "grad_norm": 3.145804815574879, + "language_loss": 0.68764591, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.7645213, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11462402, + "step": 8522, + "time_per_iteration": 2.53696608543396 + }, + { + "auxiliary_loss_clip": 0.06430639, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06276414, + "balance_loss_mlp": 0.01256039, + "epoch": 0.5124304824891026, + "flos": 28811555982720.0, + "grad_norm": 1.853417160064295, + "language_loss": 0.622962, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.69996071, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13201904, + "step": 8523, + "time_per_iteration": 2.589885950088501 + }, + { + "auxiliary_loss_clip": 0.06424818, + "auxiliary_loss_mlp": 0.01265688, + "balance_loss_clip": 0.06274516, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5124906057417706, + "flos": 21768199079040.0, + "grad_norm": 1.9669486922935226, + "language_loss": 0.77939785, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.85630286, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11004639, + "step": 8524, + "time_per_iteration": 2.506647825241089 + }, + { + "auxiliary_loss_clip": 0.06424855, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06275764, + "balance_loss_mlp": 0.01255344, + "epoch": 0.5125507289944387, + "flos": 18886605966720.0, + "grad_norm": 1.985021925330002, + "language_loss": 0.74904448, + "learning_rate": 2.015773034588706e-06, + "loss": 0.82595634, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10986328, + "step": 8525, + "time_per_iteration": 2.509902000427246 + }, + { + "auxiliary_loss_clip": 0.06429298, + "auxiliary_loss_mlp": 0.01270559, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01258412, + "epoch": 0.5126108522471066, + "flos": 35636761981440.0, + "grad_norm": 1.5788283001431092, + "language_loss": 0.74868685, + "learning_rate": 2.015383584722531e-06, + "loss": 0.82568544, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12127686, + "step": 8526, + "time_per_iteration": 2.640554428100586 + }, + { + "auxiliary_loss_clip": 0.06428048, + "auxiliary_loss_mlp": 0.01267884, + "balance_loss_clip": 0.06275488, + "balance_loss_mlp": 0.01256613, + "epoch": 0.5126709754997746, + "flos": 20196685353600.0, + "grad_norm": 1.5376970768591331, + "language_loss": 0.658445, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.73540437, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.11279297, + "step": 8527, + "time_per_iteration": 2.5079874992370605 + }, + { + "auxiliary_loss_clip": 0.06421998, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01258644, + "epoch": 0.5127310987524425, + "flos": 18594550160640.0, + "grad_norm": 1.4224570841542155, + "language_loss": 0.74258637, + "learning_rate": 2.014604683254908e-06, + "loss": 0.81949556, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10290527, + "step": 8528, + "time_per_iteration": 2.5583620071411133 + }, + { + "auxiliary_loss_clip": 0.06424492, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06275051, + "balance_loss_mlp": 0.01254816, + "epoch": 0.5127912220051105, + "flos": 22461113617920.0, + "grad_norm": 1.747082224822374, + "language_loss": 0.83357608, + "learning_rate": 2.014215231682995e-06, + "loss": 0.91048539, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11621094, + "step": 8529, + "time_per_iteration": 2.5290021896362305 + }, + { + "auxiliary_loss_clip": 0.06427129, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06279376, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5128513452577784, + "flos": 19098845159040.0, + "grad_norm": 1.7753814294124612, + "language_loss": 0.7435441, + "learning_rate": 2.01382577957204e-06, + "loss": 0.82047611, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.10852051, + "step": 8530, + "time_per_iteration": 2.5009660720825195 + }, + { + "auxiliary_loss_clip": 0.06336609, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01263291, + "epoch": 0.5129114685104464, + "flos": 67914553011840.0, + "grad_norm": 0.7560442553547831, + "language_loss": 0.60794806, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.68396354, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.01651001, + "step": 8531, + "time_per_iteration": 3.2641408443450928 + }, + { + "auxiliary_loss_clip": 0.06436025, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.062833, + "balance_loss_mlp": 0.0125722, + "epoch": 0.5129715917631144, + "flos": 20455436361600.0, + "grad_norm": 1.5619116128751078, + "language_loss": 0.76922929, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.84627628, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11444092, + "step": 8532, + "time_per_iteration": 2.54885196685791 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.0627965, + "balance_loss_mlp": 0.0125747, + "epoch": 0.5130317150157824, + "flos": 35124836261760.0, + "grad_norm": 2.143443364581078, + "language_loss": 0.67464834, + "learning_rate": 2.012657420152597e-06, + "loss": 0.75163269, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11706543, + "step": 8533, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.06435291, + "auxiliary_loss_mlp": 0.01270583, + "balance_loss_clip": 0.06282294, + "balance_loss_mlp": 0.01257995, + "epoch": 0.5130918382684503, + "flos": 19797671410560.0, + "grad_norm": 2.0992969405941526, + "language_loss": 0.82022768, + "learning_rate": 2.01226796603315e-06, + "loss": 0.89728636, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12585449, + "step": 8534, + "time_per_iteration": 2.527186632156372 + }, + { + "auxiliary_loss_clip": 0.06432565, + "auxiliary_loss_mlp": 0.01272989, + "balance_loss_clip": 0.06280594, + "balance_loss_mlp": 0.0126077, + "epoch": 0.5131519615211183, + "flos": 26330318478720.0, + "grad_norm": 1.396585887996991, + "language_loss": 0.64072168, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.71777725, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12225342, + "step": 8535, + "time_per_iteration": 2.5608325004577637 + }, + { + "auxiliary_loss_clip": 0.06432404, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06282519, + "balance_loss_mlp": 0.01254036, + "epoch": 0.5132120847737862, + "flos": 19177949013120.0, + "grad_norm": 1.677219086168078, + "language_loss": 0.70047057, + "learning_rate": 2.011489056413418e-06, + "loss": 0.77745175, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11682129, + "step": 8536, + "time_per_iteration": 2.562103509902954 + }, + { + "auxiliary_loss_clip": 0.06443835, + "auxiliary_loss_mlp": 0.01273704, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01260359, + "epoch": 0.5132722080264542, + "flos": 20236698478080.0, + "grad_norm": 2.053357085489985, + "language_loss": 0.71648562, + "learning_rate": 2.011099600942669e-06, + "loss": 0.793661, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13348389, + "step": 8537, + "time_per_iteration": 2.5208451747894287 + }, + { + "auxiliary_loss_clip": 0.06435503, + "auxiliary_loss_mlp": 0.01264426, + "balance_loss_clip": 0.06282058, + "balance_loss_mlp": 0.01252559, + "epoch": 0.5133323312791223, + "flos": 16474619462400.0, + "grad_norm": 2.3096480270315487, + "language_loss": 0.80560482, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.88260412, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.11859131, + "step": 8538, + "time_per_iteration": 2.5136818885803223 + }, + { + "auxiliary_loss_clip": 0.06432489, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5133924545317902, + "flos": 26075340904320.0, + "grad_norm": 1.6767929293826078, + "language_loss": 0.78499532, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.86201918, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1171875, + "step": 8539, + "time_per_iteration": 2.5898549556732178 + }, + { + "auxiliary_loss_clip": 0.06434882, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06283914, + "balance_loss_mlp": 0.01255703, + "epoch": 0.5134525777844582, + "flos": 29138467887360.0, + "grad_norm": 1.6389084641418472, + "language_loss": 0.76422769, + "learning_rate": 2.009931232064105e-06, + "loss": 0.84125227, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8540, + "time_per_iteration": 2.695279359817505 + }, + { + "auxiliary_loss_clip": 0.06437706, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5135127010371261, + "flos": 17460134858880.0, + "grad_norm": 1.735384048528371, + "language_loss": 0.74720204, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.82429993, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.1328125, + "step": 8541, + "time_per_iteration": 2.5028650760650635 + }, + { + "auxiliary_loss_clip": 0.06433722, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01255905, + "epoch": 0.5135728242897941, + "flos": 21951493885440.0, + "grad_norm": 1.7658048645767805, + "language_loss": 0.71345925, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.79048049, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12493896, + "step": 8542, + "time_per_iteration": 2.55663800239563 + }, + { + "auxiliary_loss_clip": 0.06432796, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06282645, + "balance_loss_mlp": 0.01255939, + "epoch": 0.513632947542462, + "flos": 22681528583040.0, + "grad_norm": 1.8429175926110044, + "language_loss": 0.79735661, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.87436259, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11846924, + "step": 8543, + "time_per_iteration": 2.530942440032959 + }, + { + "auxiliary_loss_clip": 0.06431838, + "auxiliary_loss_mlp": 0.01268863, + "balance_loss_clip": 0.06281078, + "balance_loss_mlp": 0.0125693, + "epoch": 0.51369307079513, + "flos": 29464289688960.0, + "grad_norm": 1.9724623685644402, + "language_loss": 0.68434304, + "learning_rate": 2.008373401689299e-06, + "loss": 0.76135004, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11932373, + "step": 8544, + "time_per_iteration": 2.581965684890747 + }, + { + "auxiliary_loss_clip": 0.06435554, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01257314, + "epoch": 0.513753194047798, + "flos": 18995325039360.0, + "grad_norm": 1.9173308249452852, + "language_loss": 0.73101795, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.80806756, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12103271, + "step": 8545, + "time_per_iteration": 3.9112906455993652 + }, + { + "auxiliary_loss_clip": 0.06434133, + "auxiliary_loss_mlp": 0.01273161, + "balance_loss_clip": 0.06280358, + "balance_loss_mlp": 0.01260745, + "epoch": 0.513813317300466, + "flos": 17827646866560.0, + "grad_norm": 2.3149125381427322, + "language_loss": 0.82387555, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.90094852, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12408447, + "step": 8546, + "time_per_iteration": 2.4859204292297363 + }, + { + "auxiliary_loss_clip": 0.06431763, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01255101, + "epoch": 0.5138734405531339, + "flos": 24068070420480.0, + "grad_norm": 1.656069587269211, + "language_loss": 0.73464745, + "learning_rate": 2.007205025522544e-06, + "loss": 0.81163985, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12384033, + "step": 8547, + "time_per_iteration": 2.5682289600372314 + }, + { + "auxiliary_loss_clip": 0.0643255, + "auxiliary_loss_mlp": 0.01266832, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5139335638058019, + "flos": 26103279041280.0, + "grad_norm": 1.7029090715356687, + "language_loss": 0.7379564, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.81495023, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12121582, + "step": 8548, + "time_per_iteration": 2.534795045852661 + }, + { + "auxiliary_loss_clip": 0.06433449, + "auxiliary_loss_mlp": 0.01270968, + "balance_loss_clip": 0.06279913, + "balance_loss_mlp": 0.01259124, + "epoch": 0.5139936870584698, + "flos": 18923181073920.0, + "grad_norm": 1.5199417717256292, + "language_loss": 0.82597619, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.90302038, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11853027, + "step": 8549, + "time_per_iteration": 3.9844579696655273 + }, + { + "auxiliary_loss_clip": 0.06431821, + "auxiliary_loss_mlp": 0.01268578, + "balance_loss_clip": 0.06283253, + "balance_loss_mlp": 0.01256913, + "epoch": 0.5140538103111378, + "flos": 16149594274560.0, + "grad_norm": 1.7893333067818897, + "language_loss": 0.72460294, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.80160695, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11669922, + "step": 8550, + "time_per_iteration": 2.6143221855163574 + }, + { + "auxiliary_loss_clip": 0.06436016, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5141139335638057, + "flos": 22426886424960.0, + "grad_norm": 1.3843612466681816, + "language_loss": 0.7537846, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.83080363, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12536621, + "step": 8551, + "time_per_iteration": 2.563551664352417 + }, + { + "auxiliary_loss_clip": 0.06427439, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.01255233, + "epoch": 0.5141740568164738, + "flos": 27097054064640.0, + "grad_norm": 1.547590229430392, + "language_loss": 0.69192576, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.76886189, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10949707, + "step": 8552, + "time_per_iteration": 2.598309278488159 + }, + { + "auxiliary_loss_clip": 0.06434312, + "auxiliary_loss_mlp": 0.01271227, + "balance_loss_clip": 0.06280888, + "balance_loss_mlp": 0.01258972, + "epoch": 0.5142341800691418, + "flos": 24980267894400.0, + "grad_norm": 1.7162445999633908, + "language_loss": 0.75295067, + "learning_rate": 2.004868266210965e-06, + "loss": 0.830006, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12255859, + "step": 8553, + "time_per_iteration": 2.56817364692688 + }, + { + "auxiliary_loss_clip": 0.06427588, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.06277347, + "balance_loss_mlp": 0.01253642, + "epoch": 0.5142943033218097, + "flos": 20710833206400.0, + "grad_norm": 1.5512777085285745, + "language_loss": 0.68091589, + "learning_rate": 2.004478805593435e-06, + "loss": 0.75785089, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.1227417, + "step": 8554, + "time_per_iteration": 4.041098117828369 + }, + { + "auxiliary_loss_clip": 0.06434806, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.0627867, + "balance_loss_mlp": 0.0125514, + "epoch": 0.5143544265744777, + "flos": 22931391058560.0, + "grad_norm": 1.9544744043919176, + "language_loss": 0.73420155, + "learning_rate": 2.004089344806068e-06, + "loss": 0.81124151, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.14050293, + "step": 8555, + "time_per_iteration": 2.560406446456909 + }, + { + "auxiliary_loss_clip": 0.0643023, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06277946, + "balance_loss_mlp": 0.0125305, + "epoch": 0.5144145498271456, + "flos": 15926328270720.0, + "grad_norm": 3.1721710851325478, + "language_loss": 0.74827576, + "learning_rate": 2.003699883863633e-06, + "loss": 0.82522213, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11346436, + "step": 8556, + "time_per_iteration": 2.510631561279297 + }, + { + "auxiliary_loss_clip": 0.06426013, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01255374, + "epoch": 0.5144746730798136, + "flos": 19687107548160.0, + "grad_norm": 1.7802365486116365, + "language_loss": 0.86600292, + "learning_rate": 2.003310422780898e-06, + "loss": 0.9429279, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 8557, + "time_per_iteration": 2.4897682666778564 + }, + { + "auxiliary_loss_clip": 0.06427194, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06280152, + "balance_loss_mlp": 0.0125372, + "epoch": 0.5145347963324816, + "flos": 23921476502400.0, + "grad_norm": 1.7088292247190593, + "language_loss": 0.89943027, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.97635341, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11407471, + "step": 8558, + "time_per_iteration": 2.552520513534546 + }, + { + "auxiliary_loss_clip": 0.06426296, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06281744, + "balance_loss_mlp": 0.01259337, + "epoch": 0.5145949195851496, + "flos": 18265919247360.0, + "grad_norm": 1.814909546317071, + "language_loss": 0.65665084, + "learning_rate": 2.002531500253602e-06, + "loss": 0.73361778, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1105957, + "step": 8559, + "time_per_iteration": 2.5509958267211914 + }, + { + "auxiliary_loss_clip": 0.06428455, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.0628074, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5146550428378175, + "flos": 26220593157120.0, + "grad_norm": 1.5790337478872891, + "language_loss": 0.63388872, + "learning_rate": 2.002142038838577e-06, + "loss": 0.71083951, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11358643, + "step": 8560, + "time_per_iteration": 2.5824177265167236 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279366, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5147151660904855, + "flos": 22680731969280.0, + "grad_norm": 1.6548160663474087, + "language_loss": 0.70604181, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.78299701, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11975098, + "step": 8561, + "time_per_iteration": 4.051865816116333 + }, + { + "auxiliary_loss_clip": 0.06432293, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01254937, + "epoch": 0.5147752893431534, + "flos": 24979261645440.0, + "grad_norm": 1.5164557892601689, + "language_loss": 0.67091215, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.7478981, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.1137085, + "step": 8562, + "time_per_iteration": 2.587117910385132 + }, + { + "auxiliary_loss_clip": 0.06434688, + "auxiliary_loss_mlp": 0.0126818, + "balance_loss_clip": 0.06283362, + "balance_loss_mlp": 0.01256945, + "epoch": 0.5148354125958214, + "flos": 22750821509760.0, + "grad_norm": 1.6017474228640745, + "language_loss": 0.77982432, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.85685301, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11248779, + "step": 8563, + "time_per_iteration": 2.5995922088623047 + }, + { + "auxiliary_loss_clip": 0.06441233, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06284129, + "balance_loss_mlp": 0.01257235, + "epoch": 0.5148955358484893, + "flos": 23074253470080.0, + "grad_norm": 2.0871441030394426, + "language_loss": 0.83276081, + "learning_rate": 2.0005841925139e-06, + "loss": 0.90987211, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12658691, + "step": 8564, + "time_per_iteration": 2.5510189533233643 + }, + { + "auxiliary_loss_clip": 0.06436282, + "auxiliary_loss_mlp": 0.01266369, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01253918, + "epoch": 0.5149556591011574, + "flos": 20346465726720.0, + "grad_norm": 3.2981963875061915, + "language_loss": 0.73735076, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.81437725, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12451172, + "step": 8565, + "time_per_iteration": 2.565485715866089 + }, + { + "auxiliary_loss_clip": 0.06439919, + "auxiliary_loss_mlp": 0.01271905, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01259478, + "epoch": 0.5150157823538254, + "flos": 22644869621760.0, + "grad_norm": 2.0080537974138424, + "language_loss": 0.6841439, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.76126206, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12438965, + "step": 8566, + "time_per_iteration": 2.540060520172119 + }, + { + "auxiliary_loss_clip": 0.06439756, + "auxiliary_loss_mlp": 0.01270124, + "balance_loss_clip": 0.06282447, + "balance_loss_mlp": 0.0125828, + "epoch": 0.5150759056064933, + "flos": 26074795852800.0, + "grad_norm": 1.7193676063763261, + "language_loss": 0.78763425, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.86473316, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11834717, + "step": 8567, + "time_per_iteration": 2.610316276550293 + }, + { + "auxiliary_loss_clip": 0.06433998, + "auxiliary_loss_mlp": 0.0126364, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5151360288591613, + "flos": 25958865329280.0, + "grad_norm": 1.8031823951648205, + "language_loss": 0.79058564, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.86756206, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12091064, + "step": 8568, + "time_per_iteration": 2.5746078491210938 + }, + { + "auxiliary_loss_clip": 0.06425972, + "auxiliary_loss_mlp": 0.01263804, + "balance_loss_clip": 0.06277977, + "balance_loss_mlp": 0.01252705, + "epoch": 0.5151961521118292, + "flos": 18511840581120.0, + "grad_norm": 2.107330893228774, + "language_loss": 0.90881652, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.98571432, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11096191, + "step": 8569, + "time_per_iteration": 2.5259969234466553 + }, + { + "auxiliary_loss_clip": 0.06436515, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06282495, + "balance_loss_mlp": 0.01261225, + "epoch": 0.5152562753644973, + "flos": 22239734330880.0, + "grad_norm": 1.7160477900396784, + "language_loss": 0.77020866, + "learning_rate": 1.998247422657674e-06, + "loss": 0.84730774, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12188721, + "step": 8570, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.06435493, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01256817, + "epoch": 0.5153163986171652, + "flos": 38445833784960.0, + "grad_norm": 1.5069722692963965, + "language_loss": 0.73508942, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.81214285, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1305542, + "step": 8571, + "time_per_iteration": 2.6566643714904785 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01250073, + "epoch": 0.5153765218698332, + "flos": 66404533783680.0, + "grad_norm": 0.7650204220049751, + "language_loss": 0.52955389, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.60542989, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.64990234, + "router_z_loss_mlp": 0.01826477, + "step": 8572, + "time_per_iteration": 3.231537103652954 + }, + { + "auxiliary_loss_clip": 0.06429811, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06284182, + "balance_loss_mlp": 0.01257622, + "epoch": 0.5154366451225011, + "flos": 24031537240320.0, + "grad_norm": 1.6307698114257092, + "language_loss": 0.76929724, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.84628952, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11791992, + "step": 8573, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.06429262, + "auxiliary_loss_mlp": 0.01267008, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01255099, + "epoch": 0.5154967683751691, + "flos": 23474189808000.0, + "grad_norm": 2.3679054324331967, + "language_loss": 0.77109015, + "learning_rate": 1.996689577219102e-06, + "loss": 0.84805286, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11914062, + "step": 8574, + "time_per_iteration": 2.53300404548645 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01263951, + "balance_loss_clip": 0.06281316, + "balance_loss_mlp": 0.01252691, + "epoch": 0.515556891627837, + "flos": 23812463940480.0, + "grad_norm": 1.7644957150045186, + "language_loss": 0.85785985, + "learning_rate": 1.996300116136367e-06, + "loss": 0.93478966, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11248779, + "step": 8575, + "time_per_iteration": 2.577409029006958 + }, + { + "auxiliary_loss_clip": 0.06435408, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06283233, + "balance_loss_mlp": 0.01253859, + "epoch": 0.515617014880505, + "flos": 19834665788160.0, + "grad_norm": 1.5082721708333224, + "language_loss": 0.76947051, + "learning_rate": 1.995910655193932e-06, + "loss": 0.84648347, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1204834, + "step": 8576, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.06444222, + "auxiliary_loss_mlp": 0.01270832, + "balance_loss_clip": 0.06283684, + "balance_loss_mlp": 0.01258083, + "epoch": 0.515677138133173, + "flos": 14251042863360.0, + "grad_norm": 2.2995750246066406, + "language_loss": 0.75517124, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.83232176, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12762451, + "step": 8577, + "time_per_iteration": 2.518495559692383 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01257037, + "epoch": 0.515737261385841, + "flos": 28296653443200.0, + "grad_norm": 4.0524023742876345, + "language_loss": 0.81602645, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.89309716, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13049316, + "step": 8578, + "time_per_iteration": 2.5854508876800537 + }, + { + "auxiliary_loss_clip": 0.06431551, + "auxiliary_loss_mlp": 0.01266524, + "balance_loss_clip": 0.06281303, + "balance_loss_mlp": 0.01254746, + "epoch": 0.515797384638509, + "flos": 27899400435840.0, + "grad_norm": 1.724028071509101, + "language_loss": 0.7613306, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.83831137, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11785889, + "step": 8579, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.06434369, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01253053, + "epoch": 0.5158575078911769, + "flos": 23046860384640.0, + "grad_norm": 1.6181814769530192, + "language_loss": 0.79290402, + "learning_rate": 1.994352813122559e-06, + "loss": 0.86990273, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12457275, + "step": 8580, + "time_per_iteration": 2.5879290103912354 + }, + { + "auxiliary_loss_clip": 0.0643789, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06283616, + "balance_loss_mlp": 0.01254763, + "epoch": 0.5159176311438449, + "flos": 12646350120960.0, + "grad_norm": 1.9944005001089613, + "language_loss": 0.73488963, + "learning_rate": 1.99396335310315e-06, + "loss": 0.81195444, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1383667, + "step": 8581, + "time_per_iteration": 2.500063180923462 + }, + { + "auxiliary_loss_clip": 0.06434488, + "auxiliary_loss_mlp": 0.01266672, + "balance_loss_clip": 0.06284754, + "balance_loss_mlp": 0.01254781, + "epoch": 0.5159777543965128, + "flos": 15563302456320.0, + "grad_norm": 1.882801773214852, + "language_loss": 0.74207276, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.81908435, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11901855, + "step": 8582, + "time_per_iteration": 2.518564462661743 + }, + { + "auxiliary_loss_clip": 0.06429887, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06280613, + "balance_loss_mlp": 0.01254648, + "epoch": 0.5160378776491809, + "flos": 23228352328320.0, + "grad_norm": 1.8807127189493567, + "language_loss": 0.66238904, + "learning_rate": 1.99318443376583e-06, + "loss": 0.73934591, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11157227, + "step": 8583, + "time_per_iteration": 2.542539119720459 + }, + { + "auxiliary_loss_clip": 0.06437095, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01257404, + "epoch": 0.5160980009018488, + "flos": 21951074615040.0, + "grad_norm": 1.3417837681818925, + "language_loss": 0.760252, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.83732229, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12524414, + "step": 8584, + "time_per_iteration": 2.587082624435425 + }, + { + "auxiliary_loss_clip": 0.06437847, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01253579, + "epoch": 0.5161581241545168, + "flos": 22790708853120.0, + "grad_norm": 1.8159571462416286, + "language_loss": 0.78972226, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.866768, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13153076, + "step": 8585, + "time_per_iteration": 3.918409824371338 + }, + { + "auxiliary_loss_clip": 0.06432407, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06287332, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5162182474071847, + "flos": 19680273440640.0, + "grad_norm": 1.974004410778628, + "language_loss": 0.81013006, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.88714075, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11578369, + "step": 8586, + "time_per_iteration": 2.4944536685943604 + }, + { + "auxiliary_loss_clip": 0.06434685, + "auxiliary_loss_mlp": 0.01270978, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01258473, + "epoch": 0.5162783706598527, + "flos": 20052145860480.0, + "grad_norm": 2.892216813448522, + "language_loss": 0.71914274, + "learning_rate": 1.991626598310701e-06, + "loss": 0.79619938, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.125, + "step": 8587, + "time_per_iteration": 2.500964403152466 + }, + { + "auxiliary_loss_clip": 0.06328937, + "auxiliary_loss_mlp": 0.01260473, + "balance_loss_clip": 0.06264381, + "balance_loss_mlp": 0.01258639, + "epoch": 0.5163384939125206, + "flos": 69980089610880.0, + "grad_norm": 0.7154986672608752, + "language_loss": 0.57844335, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.65433741, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01829529, + "step": 8588, + "time_per_iteration": 4.569206476211548 + }, + { + "auxiliary_loss_clip": 0.06434999, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06281946, + "balance_loss_mlp": 0.01254618, + "epoch": 0.5163986171651886, + "flos": 17422176159360.0, + "grad_norm": 8.344302755834537, + "language_loss": 0.75224382, + "learning_rate": 1.990847682429185e-06, + "loss": 0.82927144, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13134766, + "step": 8589, + "time_per_iteration": 2.551936388015747 + }, + { + "auxiliary_loss_clip": 0.06436837, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0628375, + "balance_loss_mlp": 0.01254607, + "epoch": 0.5164587404178566, + "flos": 21328752741120.0, + "grad_norm": 1.4649655682055334, + "language_loss": 0.67921245, + "learning_rate": 1.990458225001627e-06, + "loss": 0.75623721, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11035156, + "step": 8590, + "time_per_iteration": 2.5104808807373047 + }, + { + "auxiliary_loss_clip": 0.06330067, + "auxiliary_loss_mlp": 0.01255277, + "balance_loss_clip": 0.06265621, + "balance_loss_mlp": 0.01253319, + "epoch": 0.5165188636705246, + "flos": 68076506954880.0, + "grad_norm": 0.7672531816981234, + "language_loss": 0.55843657, + "learning_rate": 1.990068767935895e-06, + "loss": 0.63428998, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.64550781, + "router_z_loss_mlp": 0.01956177, + "step": 8591, + "time_per_iteration": 3.0606987476348877 + }, + { + "auxiliary_loss_clip": 0.06426874, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01253261, + "epoch": 0.5165789869231926, + "flos": 19390859038080.0, + "grad_norm": 1.5432128891960295, + "language_loss": 0.81508362, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.89199233, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10736084, + "step": 8592, + "time_per_iteration": 2.5063397884368896 + }, + { + "auxiliary_loss_clip": 0.0642782, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06281757, + "balance_loss_mlp": 0.01251626, + "epoch": 0.5166391101758605, + "flos": 20966607394560.0, + "grad_norm": 1.7131386706837877, + "language_loss": 0.83462119, + "learning_rate": 1.989289854948979e-06, + "loss": 0.91152561, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11010742, + "step": 8593, + "time_per_iteration": 3.951284170150757 + }, + { + "auxiliary_loss_clip": 0.06431139, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06281991, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5166992334285285, + "flos": 29470411036800.0, + "grad_norm": 1.8647556534792968, + "language_loss": 0.69381714, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.77078462, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11761475, + "step": 8594, + "time_per_iteration": 2.600724220275879 + }, + { + "auxiliary_loss_clip": 0.06431773, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5167593566811964, + "flos": 20310813014400.0, + "grad_norm": 1.4700297891307748, + "language_loss": 0.77611995, + "learning_rate": 1.988510943586582e-06, + "loss": 0.85309899, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1171875, + "step": 8595, + "time_per_iteration": 2.5478954315185547 + }, + { + "auxiliary_loss_clip": 0.06431342, + "auxiliary_loss_mlp": 0.01266673, + "balance_loss_clip": 0.06281155, + "balance_loss_mlp": 0.01255563, + "epoch": 0.5168194799338645, + "flos": 14616668154240.0, + "grad_norm": 1.457832438333805, + "language_loss": 0.65828246, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.73526263, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11114502, + "step": 8596, + "time_per_iteration": 2.5720162391662598 + }, + { + "auxiliary_loss_clip": 0.06432624, + "auxiliary_loss_mlp": 0.01271477, + "balance_loss_clip": 0.06281975, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5168796031865324, + "flos": 25013866181760.0, + "grad_norm": 1.4915456509806782, + "language_loss": 0.75734007, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.8343811, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12988281, + "step": 8597, + "time_per_iteration": 2.5495989322662354 + }, + { + "auxiliary_loss_clip": 0.06427812, + "auxiliary_loss_mlp": 0.01266343, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5169397264392004, + "flos": 26946728640000.0, + "grad_norm": 1.7231987845025152, + "language_loss": 0.8152492, + "learning_rate": 1.987342579847403e-06, + "loss": 0.89219069, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11773682, + "step": 8598, + "time_per_iteration": 2.6746177673339844 + }, + { + "auxiliary_loss_clip": 0.06427282, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06279184, + "balance_loss_mlp": 0.0125523, + "epoch": 0.5169998496918683, + "flos": 25414347571200.0, + "grad_norm": 1.537627068096994, + "language_loss": 0.7597698, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.83671683, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12194824, + "step": 8599, + "time_per_iteration": 2.548478841781616 + }, + { + "auxiliary_loss_clip": 0.06428513, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01253291, + "epoch": 0.5170599729445363, + "flos": 24687667036800.0, + "grad_norm": 4.521028695007152, + "language_loss": 0.72775459, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.80468118, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10852051, + "step": 8600, + "time_per_iteration": 3.977342367172241 + }, + { + "auxiliary_loss_clip": 0.06427286, + "auxiliary_loss_mlp": 0.01268182, + "balance_loss_clip": 0.06278619, + "balance_loss_mlp": 0.01256732, + "epoch": 0.5171200961972042, + "flos": 21000499171200.0, + "grad_norm": 1.369345328324843, + "language_loss": 0.74472946, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.82168412, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11444092, + "step": 8601, + "time_per_iteration": 2.5409762859344482 + }, + { + "auxiliary_loss_clip": 0.06429532, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01258467, + "epoch": 0.5171802194498722, + "flos": 22751953539840.0, + "grad_norm": 1.8713669852223682, + "language_loss": 0.83940291, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.9164089, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12597656, + "step": 8602, + "time_per_iteration": 2.5086002349853516 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01254026, + "epoch": 0.5172403427025402, + "flos": 28183070833920.0, + "grad_norm": 1.835239532551919, + "language_loss": 0.74816436, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.82513469, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1192627, + "step": 8603, + "time_per_iteration": 2.628830909729004 + }, + { + "auxiliary_loss_clip": 0.06434101, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06281082, + "balance_loss_mlp": 0.01255566, + "epoch": 0.5173004659552082, + "flos": 20343782396160.0, + "grad_norm": 2.436721116583926, + "language_loss": 0.73165393, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.80867082, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12017822, + "step": 8604, + "time_per_iteration": 2.521681785583496 + }, + { + "auxiliary_loss_clip": 0.06440152, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06279815, + "balance_loss_mlp": 0.01254469, + "epoch": 0.5173605892078762, + "flos": 19069481502720.0, + "grad_norm": 1.6971244246662016, + "language_loss": 0.85418487, + "learning_rate": 1.984616415277469e-06, + "loss": 0.93127012, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13922119, + "step": 8605, + "time_per_iteration": 2.5182762145996094 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258893, + "epoch": 0.5174207124605441, + "flos": 28001620817280.0, + "grad_norm": 1.308601391892793, + "language_loss": 0.64964187, + "learning_rate": 1.984226965411294e-06, + "loss": 0.72665358, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1138916, + "step": 8606, + "time_per_iteration": 2.5762083530426025 + }, + { + "auxiliary_loss_clip": 0.06431288, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.0125362, + "epoch": 0.5174808357132121, + "flos": 19502135660160.0, + "grad_norm": 1.5729301555613031, + "language_loss": 0.78141046, + "learning_rate": 1.983837516143234e-06, + "loss": 0.85837877, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11914062, + "step": 8607, + "time_per_iteration": 2.5321435928344727 + }, + { + "auxiliary_loss_clip": 0.06431965, + "auxiliary_loss_mlp": 0.01271738, + "balance_loss_clip": 0.06280412, + "balance_loss_mlp": 0.01259049, + "epoch": 0.51754095896588, + "flos": 22790834634240.0, + "grad_norm": 1.7409540075434562, + "language_loss": 0.72313815, + "learning_rate": 1.983448067488057e-06, + "loss": 0.80017519, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 8608, + "time_per_iteration": 2.52758526802063 + }, + { + "auxiliary_loss_clip": 0.06435958, + "auxiliary_loss_mlp": 0.01273384, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01261046, + "epoch": 0.5176010822185481, + "flos": 22674987964800.0, + "grad_norm": 1.7194792439439102, + "language_loss": 0.86816031, + "learning_rate": 1.983058619460531e-06, + "loss": 0.94525373, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12341309, + "step": 8609, + "time_per_iteration": 2.538146495819092 + }, + { + "auxiliary_loss_clip": 0.06431948, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06280786, + "balance_loss_mlp": 0.01252201, + "epoch": 0.517661205471216, + "flos": 23957967755520.0, + "grad_norm": 2.0604849644666943, + "language_loss": 0.73853832, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.81549335, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11352539, + "step": 8610, + "time_per_iteration": 2.5313732624053955 + }, + { + "auxiliary_loss_clip": 0.064363, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06279181, + "balance_loss_mlp": 0.01258051, + "epoch": 0.517721328723884, + "flos": 15601470791040.0, + "grad_norm": 2.184245135297296, + "language_loss": 0.67738098, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.75445139, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12689209, + "step": 8611, + "time_per_iteration": 2.510500431060791 + }, + { + "auxiliary_loss_clip": 0.06427399, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01253153, + "epoch": 0.5177814519765519, + "flos": 20966607394560.0, + "grad_norm": 1.678614110348905, + "language_loss": 0.77387339, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.85080469, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12573242, + "step": 8612, + "time_per_iteration": 2.5206472873687744 + }, + { + "auxiliary_loss_clip": 0.064338, + "auxiliary_loss_mlp": 0.01269204, + "balance_loss_clip": 0.0628019, + "balance_loss_mlp": 0.01257641, + "epoch": 0.5178415752292199, + "flos": 17973653806080.0, + "grad_norm": 1.9437798274552756, + "language_loss": 0.82318223, + "learning_rate": 1.981500833922294e-06, + "loss": 0.90021223, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11560059, + "step": 8613, + "time_per_iteration": 2.4999184608459473 + }, + { + "auxiliary_loss_clip": 0.06431679, + "auxiliary_loss_mlp": 0.01268922, + "balance_loss_clip": 0.062784, + "balance_loss_mlp": 0.01255511, + "epoch": 0.5179016984818878, + "flos": 17827227596160.0, + "grad_norm": 2.2958122780571473, + "language_loss": 0.66944718, + "learning_rate": 1.981111389254541e-06, + "loss": 0.74645323, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.1340332, + "step": 8614, + "time_per_iteration": 2.480762004852295 + }, + { + "auxiliary_loss_clip": 0.06432712, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06278278, + "balance_loss_mlp": 0.01257465, + "epoch": 0.5179618217345558, + "flos": 17826011712000.0, + "grad_norm": 1.8941766649542733, + "language_loss": 0.87114352, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.94817036, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12493896, + "step": 8615, + "time_per_iteration": 2.500279188156128 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01270372, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01258731, + "epoch": 0.5180219449872238, + "flos": 22527639360000.0, + "grad_norm": 1.466896191984659, + "language_loss": 0.80947113, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8864857, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11639404, + "step": 8616, + "time_per_iteration": 2.523977279663086 + }, + { + "auxiliary_loss_clip": 0.06436383, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.0627937, + "balance_loss_mlp": 0.01257554, + "epoch": 0.5180820682398918, + "flos": 23922356970240.0, + "grad_norm": 2.681335053285678, + "language_loss": 0.75563776, + "learning_rate": 1.9799430596079e-06, + "loss": 0.83270454, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12744141, + "step": 8617, + "time_per_iteration": 2.5584635734558105 + }, + { + "auxiliary_loss_clip": 0.0643236, + "auxiliary_loss_mlp": 0.01270738, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01258215, + "epoch": 0.5181421914925598, + "flos": 16985119662720.0, + "grad_norm": 2.384459515549961, + "language_loss": 0.70321333, + "learning_rate": 1.979553617893785e-06, + "loss": 0.78024429, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12518311, + "step": 8618, + "time_per_iteration": 2.4864299297332764 + }, + { + "auxiliary_loss_clip": 0.06326556, + "auxiliary_loss_mlp": 0.01258187, + "balance_loss_clip": 0.0626248, + "balance_loss_mlp": 0.01256348, + "epoch": 0.5182023147452277, + "flos": 66080472917760.0, + "grad_norm": 0.9021946533901657, + "language_loss": 0.6731512, + "learning_rate": 1.979164176954999e-06, + "loss": 0.74899864, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01834106, + "step": 8619, + "time_per_iteration": 3.1113593578338623 + }, + { + "auxiliary_loss_clip": 0.06429242, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06279487, + "balance_loss_mlp": 0.01256235, + "epoch": 0.5182624379978957, + "flos": 18193775281920.0, + "grad_norm": 1.7875432352275369, + "language_loss": 0.79252517, + "learning_rate": 1.97877473680631e-06, + "loss": 0.86949891, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11883545, + "step": 8620, + "time_per_iteration": 2.490337371826172 + }, + { + "auxiliary_loss_clip": 0.06426805, + "auxiliary_loss_mlp": 0.01265045, + "balance_loss_clip": 0.06278054, + "balance_loss_mlp": 0.01253815, + "epoch": 0.5183225612505636, + "flos": 14031759928320.0, + "grad_norm": 2.0424555394318347, + "language_loss": 0.82670712, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.90362567, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11236572, + "step": 8621, + "time_per_iteration": 2.5358636379241943 + }, + { + "auxiliary_loss_clip": 0.06430708, + "auxiliary_loss_mlp": 0.01270453, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01257787, + "epoch": 0.5183826845032317, + "flos": 23666582782080.0, + "grad_norm": 3.572556492630201, + "language_loss": 0.65903664, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.73604816, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12664795, + "step": 8622, + "time_per_iteration": 2.5054616928100586 + }, + { + "auxiliary_loss_clip": 0.06440182, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06282417, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5184428077558996, + "flos": 15894155502720.0, + "grad_norm": 2.003886693767472, + "language_loss": 0.60810971, + "learning_rate": 1.977606421248497e-06, + "loss": 0.68520582, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12011719, + "step": 8623, + "time_per_iteration": 2.517026662826538 + }, + { + "auxiliary_loss_clip": 0.06431899, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278786, + "balance_loss_mlp": 0.01256766, + "epoch": 0.5185029310085676, + "flos": 21036864643200.0, + "grad_norm": 1.709310334319468, + "language_loss": 0.76342779, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.84043157, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11712646, + "step": 8624, + "time_per_iteration": 2.5128896236419678 + }, + { + "auxiliary_loss_clip": 0.0643063, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06277324, + "balance_loss_mlp": 0.01251684, + "epoch": 0.5185630542612355, + "flos": 26550062611200.0, + "grad_norm": 2.453361725716909, + "language_loss": 0.71663254, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.79358423, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12854004, + "step": 8625, + "time_per_iteration": 3.9488492012023926 + }, + { + "auxiliary_loss_clip": 0.06427859, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276631, + "balance_loss_mlp": 0.01255378, + "epoch": 0.5186231775139035, + "flos": 20674803150720.0, + "grad_norm": 1.8867804759418334, + "language_loss": 0.68206352, + "learning_rate": 1.976438113333184e-06, + "loss": 0.75901365, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11785889, + "step": 8626, + "time_per_iteration": 2.5555548667907715 + }, + { + "auxiliary_loss_clip": 0.06429964, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01257459, + "epoch": 0.5186833007665714, + "flos": 20891612390400.0, + "grad_norm": 1.918580922134282, + "language_loss": 0.70565557, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.78265989, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.13006592, + "step": 8627, + "time_per_iteration": 2.481426954269409 + }, + { + "auxiliary_loss_clip": 0.0643362, + "auxiliary_loss_mlp": 0.01266564, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01254399, + "epoch": 0.5187434240192395, + "flos": 20893247544960.0, + "grad_norm": 1.7293286755655957, + "language_loss": 0.73529112, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12164307, + "step": 8628, + "time_per_iteration": 3.9418892860412598 + }, + { + "auxiliary_loss_clip": 0.0642761, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.06276411, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5188035472719074, + "flos": 19865203401600.0, + "grad_norm": 1.86469754984735, + "language_loss": 0.77606678, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.85302424, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.1229248, + "step": 8629, + "time_per_iteration": 2.536813974380493 + }, + { + "auxiliary_loss_clip": 0.06431592, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.01255923, + "epoch": 0.5188636705245754, + "flos": 21144032415360.0, + "grad_norm": 2.295438438275443, + "language_loss": 0.74746907, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.82446957, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12536621, + "step": 8630, + "time_per_iteration": 2.5338122844696045 + }, + { + "auxiliary_loss_clip": 0.06432383, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06276915, + "balance_loss_mlp": 0.01253636, + "epoch": 0.5189237937772434, + "flos": 22426467154560.0, + "grad_norm": 1.6718033524216807, + "language_loss": 0.80433989, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.88134158, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.14141846, + "step": 8631, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.06431842, + "auxiliary_loss_mlp": 0.01268253, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01255187, + "epoch": 0.5189839170299113, + "flos": 25453647936000.0, + "grad_norm": 1.4304618482279687, + "language_loss": 0.74388516, + "learning_rate": 1.974101522024942e-06, + "loss": 0.82088614, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1305542, + "step": 8632, + "time_per_iteration": 2.5850229263305664 + }, + { + "auxiliary_loss_clip": 0.06424779, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06277869, + "balance_loss_mlp": 0.01255865, + "epoch": 0.5190440402825793, + "flos": 18593585838720.0, + "grad_norm": 1.7732237266140687, + "language_loss": 0.79105878, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.86799526, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.13018799, + "step": 8633, + "time_per_iteration": 3.944106340408325 + }, + { + "auxiliary_loss_clip": 0.06433854, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06279819, + "balance_loss_mlp": 0.01253492, + "epoch": 0.5191041635352472, + "flos": 21915170340480.0, + "grad_norm": 1.7747709828095277, + "language_loss": 0.80929339, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.88628888, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12200928, + "step": 8634, + "time_per_iteration": 2.4922289848327637 + }, + { + "auxiliary_loss_clip": 0.0643179, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06280308, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5191642867879153, + "flos": 27535536080640.0, + "grad_norm": 1.4623629686344204, + "language_loss": 0.69064617, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.76765239, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11846924, + "step": 8635, + "time_per_iteration": 2.5806636810302734 + }, + { + "auxiliary_loss_clip": 0.06433641, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06278556, + "balance_loss_mlp": 0.01257356, + "epoch": 0.5192244100405832, + "flos": 15711489601920.0, + "grad_norm": 1.5680222184402974, + "language_loss": 0.77829492, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.85532898, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.12414551, + "step": 8636, + "time_per_iteration": 2.5346691608428955 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01256122, + "epoch": 0.5192845332932512, + "flos": 12061903092480.0, + "grad_norm": 2.0443106284945016, + "language_loss": 0.72005326, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7971167, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.12768555, + "step": 8637, + "time_per_iteration": 2.5669779777526855 + }, + { + "auxiliary_loss_clip": 0.06428012, + "auxiliary_loss_mlp": 0.01270032, + "balance_loss_clip": 0.06279644, + "balance_loss_mlp": 0.01257724, + "epoch": 0.5193446565459191, + "flos": 18959211129600.0, + "grad_norm": 2.0277263511036625, + "language_loss": 0.76600313, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.8429836, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12298584, + "step": 8638, + "time_per_iteration": 2.4836151599884033 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.012673, + "balance_loss_clip": 0.06276545, + "balance_loss_mlp": 0.0125548, + "epoch": 0.5194047797985871, + "flos": 20381028336000.0, + "grad_norm": 1.8081920937255338, + "language_loss": 0.74863744, + "learning_rate": 1.971375543740272e-06, + "loss": 0.82558322, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11828613, + "step": 8639, + "time_per_iteration": 2.508589029312134 + }, + { + "auxiliary_loss_clip": 0.06432048, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06280512, + "balance_loss_mlp": 0.01258045, + "epoch": 0.519464903051255, + "flos": 24359916591360.0, + "grad_norm": 1.679129082437046, + "language_loss": 0.77792585, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.85495287, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12628174, + "step": 8640, + "time_per_iteration": 4.030183553695679 + }, + { + "auxiliary_loss_clip": 0.06430673, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01256482, + "epoch": 0.519525026303923, + "flos": 14066657953920.0, + "grad_norm": 1.8086687453592558, + "language_loss": 0.66518152, + "learning_rate": 1.97059670234927e-06, + "loss": 0.74217027, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11700439, + "step": 8641, + "time_per_iteration": 2.471047878265381 + }, + { + "auxiliary_loss_clip": 0.06427969, + "auxiliary_loss_mlp": 0.01270672, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01259228, + "epoch": 0.519585149556591, + "flos": 28842722501760.0, + "grad_norm": 1.7536948571823123, + "language_loss": 0.76330602, + "learning_rate": 1.97020728331885e-06, + "loss": 0.84029233, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11456299, + "step": 8642, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.06428998, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254374, + "epoch": 0.519645272809259, + "flos": 25379826888960.0, + "grad_norm": 21.827473826572724, + "language_loss": 0.83256245, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.90951395, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11767578, + "step": 8643, + "time_per_iteration": 2.547438621520996 + }, + { + "auxiliary_loss_clip": 0.06436369, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06280598, + "balance_loss_mlp": 0.01255508, + "epoch": 0.519705396061927, + "flos": 25379659180800.0, + "grad_norm": 1.5731350893002956, + "language_loss": 0.70531744, + "learning_rate": 1.969428448662004e-06, + "loss": 0.78236687, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13067627, + "step": 8644, + "time_per_iteration": 2.5876879692077637 + }, + { + "auxiliary_loss_clip": 0.06430183, + "auxiliary_loss_mlp": 0.01266621, + "balance_loss_clip": 0.0627798, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5197655193145949, + "flos": 28483889391360.0, + "grad_norm": 1.5934186274855324, + "language_loss": 0.80385697, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.88082504, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11804199, + "step": 8645, + "time_per_iteration": 2.574620246887207 + }, + { + "auxiliary_loss_clip": 0.06430401, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01258898, + "epoch": 0.5198256425672629, + "flos": 20014983774720.0, + "grad_norm": 1.690489867798711, + "language_loss": 0.78455305, + "learning_rate": 1.968649618642264e-06, + "loss": 0.86156821, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12207031, + "step": 8646, + "time_per_iteration": 2.6401519775390625 + }, + { + "auxiliary_loss_clip": 0.06429573, + "auxiliary_loss_mlp": 0.01268342, + "balance_loss_clip": 0.06279829, + "balance_loss_mlp": 0.01256243, + "epoch": 0.5198857658199308, + "flos": 19835043131520.0, + "grad_norm": 2.3656488760516132, + "language_loss": 0.66367847, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.74065757, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12091064, + "step": 8647, + "time_per_iteration": 2.599353551864624 + }, + { + "auxiliary_loss_clip": 0.06438218, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06282619, + "balance_loss_mlp": 0.0125462, + "epoch": 0.5199458890725989, + "flos": 24468761445120.0, + "grad_norm": 1.778197055342432, + "language_loss": 0.71491444, + "learning_rate": 1.967870793377763e-06, + "loss": 0.79197794, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13513184, + "step": 8648, + "time_per_iteration": 2.572368860244751 + }, + { + "auxiliary_loss_clip": 0.06438164, + "auxiliary_loss_mlp": 0.01268937, + "balance_loss_clip": 0.06285776, + "balance_loss_mlp": 0.01255884, + "epoch": 0.5200060123252668, + "flos": 23411605207680.0, + "grad_norm": 2.1583755088943875, + "language_loss": 0.64699459, + "learning_rate": 1.967481382565642e-06, + "loss": 0.72406554, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13031006, + "step": 8649, + "time_per_iteration": 2.5117433071136475 + }, + { + "auxiliary_loss_clip": 0.06439677, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281672, + "balance_loss_mlp": 0.01260778, + "epoch": 0.5200661355779348, + "flos": 17207002074240.0, + "grad_norm": 5.161359302041442, + "language_loss": 0.70409989, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.78123897, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.13446045, + "step": 8650, + "time_per_iteration": 2.5144400596618652 + }, + { + "auxiliary_loss_clip": 0.06431218, + "auxiliary_loss_mlp": 0.01268732, + "balance_loss_clip": 0.06279574, + "balance_loss_mlp": 0.01256936, + "epoch": 0.5201262588306027, + "flos": 18520980675840.0, + "grad_norm": 1.6145243882323275, + "language_loss": 0.78030795, + "learning_rate": 1.966702564655496e-06, + "loss": 0.85730743, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11791992, + "step": 8651, + "time_per_iteration": 2.467643976211548 + }, + { + "auxiliary_loss_clip": 0.06437017, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06283189, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5201863820832707, + "flos": 18624458868480.0, + "grad_norm": 1.6266187944599841, + "language_loss": 0.79176587, + "learning_rate": 1.966313157587003e-06, + "loss": 0.86880493, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13171387, + "step": 8652, + "time_per_iteration": 2.5569629669189453 + }, + { + "auxiliary_loss_clip": 0.06434878, + "auxiliary_loss_mlp": 0.01268954, + "balance_loss_clip": 0.0628317, + "balance_loss_mlp": 0.01255919, + "epoch": 0.5202465053359386, + "flos": 22863817140480.0, + "grad_norm": 1.9022927985659936, + "language_loss": 0.70460284, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.78164113, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13049316, + "step": 8653, + "time_per_iteration": 2.5013556480407715 + }, + { + "auxiliary_loss_clip": 0.06435711, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06279919, + "balance_loss_mlp": 0.01257124, + "epoch": 0.5203066285886067, + "flos": 21988068992640.0, + "grad_norm": 1.7386916801416297, + "language_loss": 0.78877962, + "learning_rate": 1.965534347297008e-06, + "loss": 0.86584258, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13452148, + "step": 8654, + "time_per_iteration": 2.5205516815185547 + }, + { + "auxiliary_loss_clip": 0.06439671, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06283241, + "balance_loss_mlp": 0.01258763, + "epoch": 0.5203667518412746, + "flos": 20240094568320.0, + "grad_norm": 1.7537160659546802, + "language_loss": 0.84438735, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.92150223, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13043213, + "step": 8655, + "time_per_iteration": 2.523545026779175 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264722, + "balance_loss_clip": 0.06279121, + "balance_loss_mlp": 0.01253027, + "epoch": 0.5204268750939426, + "flos": 15710860696320.0, + "grad_norm": 2.477748600032862, + "language_loss": 0.66631675, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.74324131, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11688232, + "step": 8656, + "time_per_iteration": 2.504314661026001 + }, + { + "auxiliary_loss_clip": 0.06430535, + "auxiliary_loss_mlp": 0.01266767, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01254203, + "epoch": 0.5204869983466105, + "flos": 27456096810240.0, + "grad_norm": 1.7743424381892883, + "language_loss": 0.73250526, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.80947828, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12567139, + "step": 8657, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06431027, + "auxiliary_loss_mlp": 0.01268378, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01255611, + "epoch": 0.5205471215992785, + "flos": 20601820644480.0, + "grad_norm": 1.9136699042437477, + "language_loss": 0.71553123, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.79252529, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12756348, + "step": 8658, + "time_per_iteration": 2.523796796798706 + }, + { + "auxiliary_loss_clip": 0.06426262, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06275812, + "balance_loss_mlp": 0.01255669, + "epoch": 0.5206072448519465, + "flos": 22134537129600.0, + "grad_norm": 1.8507369766537312, + "language_loss": 0.83638287, + "learning_rate": 1.963587344701897e-06, + "loss": 0.91332769, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12554932, + "step": 8659, + "time_per_iteration": 2.5169432163238525 + }, + { + "auxiliary_loss_clip": 0.06437267, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277223, + "balance_loss_mlp": 0.01255587, + "epoch": 0.5206673681046144, + "flos": 18335924933760.0, + "grad_norm": 2.050641453841446, + "language_loss": 0.75738013, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.83444965, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14093018, + "step": 8660, + "time_per_iteration": 2.557415723800659 + }, + { + "auxiliary_loss_clip": 0.06428091, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06277187, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5207274913572825, + "flos": 20236488842880.0, + "grad_norm": 1.6215362458867588, + "language_loss": 0.77692747, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.85389173, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12231445, + "step": 8661, + "time_per_iteration": 2.509428024291992 + }, + { + "auxiliary_loss_clip": 0.06431398, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01254112, + "epoch": 0.5207876146099504, + "flos": 22133530880640.0, + "grad_norm": 1.7321078317719976, + "language_loss": 0.70359308, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.78056741, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1192627, + "step": 8662, + "time_per_iteration": 2.5810325145721436 + }, + { + "auxiliary_loss_clip": 0.0642472, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01257169, + "epoch": 0.5208477378626184, + "flos": 23885781863040.0, + "grad_norm": 1.845579934529664, + "language_loss": 0.70074278, + "learning_rate": 1.962029767391098e-06, + "loss": 0.77769035, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12872314, + "step": 8663, + "time_per_iteration": 2.528122901916504 + }, + { + "auxiliary_loss_clip": 0.06433125, + "auxiliary_loss_mlp": 0.01272195, + "balance_loss_clip": 0.06282328, + "balance_loss_mlp": 0.01259619, + "epoch": 0.5209078611152863, + "flos": 20968158695040.0, + "grad_norm": 1.5162641399491859, + "language_loss": 0.77111858, + "learning_rate": 1.961640376626072e-06, + "loss": 0.84817183, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12591553, + "step": 8664, + "time_per_iteration": 3.9675118923187256 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01274545, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01261641, + "epoch": 0.5209679843679543, + "flos": 20674006536960.0, + "grad_norm": 1.9585914111684504, + "language_loss": 0.76477247, + "learning_rate": 1.961250987315646e-06, + "loss": 0.84178591, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12915039, + "step": 8665, + "time_per_iteration": 2.541412830352783 + }, + { + "auxiliary_loss_clip": 0.06427725, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06278466, + "balance_loss_mlp": 0.01260593, + "epoch": 0.5210281076206222, + "flos": 20233050825600.0, + "grad_norm": 1.6923585849410518, + "language_loss": 0.72734976, + "learning_rate": 1.960861599474586e-06, + "loss": 0.80435228, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11920166, + "step": 8666, + "time_per_iteration": 2.4996509552001953 + }, + { + "auxiliary_loss_clip": 0.06442789, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.0628055, + "balance_loss_mlp": 0.01256199, + "epoch": 0.5210882308732903, + "flos": 16075395884160.0, + "grad_norm": 2.8085912573953093, + "language_loss": 0.69292629, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.77006412, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.14794922, + "step": 8667, + "time_per_iteration": 3.966068744659424 + }, + { + "auxiliary_loss_clip": 0.06427799, + "auxiliary_loss_mlp": 0.0127319, + "balance_loss_clip": 0.06280097, + "balance_loss_mlp": 0.01261793, + "epoch": 0.5211483541259582, + "flos": 24831954967680.0, + "grad_norm": 1.4529640974986662, + "language_loss": 0.8142345, + "learning_rate": 1.960082828259629e-06, + "loss": 0.89124429, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11401367, + "step": 8668, + "time_per_iteration": 2.531757116317749 + }, + { + "auxiliary_loss_clip": 0.06428734, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06277529, + "balance_loss_mlp": 0.01253485, + "epoch": 0.5212084773786262, + "flos": 20375997091200.0, + "grad_norm": 2.3545461183864793, + "language_loss": 0.6399523, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.71689939, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12493896, + "step": 8669, + "time_per_iteration": 2.582458019256592 + }, + { + "auxiliary_loss_clip": 0.06433244, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06281579, + "balance_loss_mlp": 0.01257846, + "epoch": 0.5212686006312941, + "flos": 23151596388480.0, + "grad_norm": 1.5489696479352357, + "language_loss": 0.66586244, + "learning_rate": 1.959304063099325e-06, + "loss": 0.74289578, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12261963, + "step": 8670, + "time_per_iteration": 2.5730559825897217 + }, + { + "auxiliary_loss_clip": 0.0642543, + "auxiliary_loss_mlp": 0.01273699, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01262195, + "epoch": 0.5213287238839621, + "flos": 27780073822080.0, + "grad_norm": 2.549693242202028, + "language_loss": 0.76187384, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.83886516, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11505127, + "step": 8671, + "time_per_iteration": 2.5233168601989746 + }, + { + "auxiliary_loss_clip": 0.064327, + "auxiliary_loss_mlp": 0.01274872, + "balance_loss_clip": 0.06278658, + "balance_loss_mlp": 0.01262534, + "epoch": 0.5213888471366301, + "flos": 19943762204160.0, + "grad_norm": 1.8121341163261586, + "language_loss": 0.78893673, + "learning_rate": 1.958525304111796e-06, + "loss": 0.86601251, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12341309, + "step": 8672, + "time_per_iteration": 3.9492485523223877 + }, + { + "auxiliary_loss_clip": 0.06431769, + "auxiliary_loss_mlp": 0.01269371, + "balance_loss_clip": 0.06282303, + "balance_loss_mlp": 0.01257957, + "epoch": 0.521448970389298, + "flos": 16988389971840.0, + "grad_norm": 2.0794497937850327, + "language_loss": 0.72609621, + "learning_rate": 1.958135926969736e-06, + "loss": 0.80310762, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11425781, + "step": 8673, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.06430827, + "auxiliary_loss_mlp": 0.01267899, + "balance_loss_clip": 0.06280996, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5215090936419661, + "flos": 18995744309760.0, + "grad_norm": 1.6692646430310563, + "language_loss": 0.75224721, + "learning_rate": 1.957746551415166e-06, + "loss": 0.82923448, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11755371, + "step": 8674, + "time_per_iteration": 2.528323173522949 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01271657, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258812, + "epoch": 0.521569216894634, + "flos": 16148923441920.0, + "grad_norm": 2.0098628900715694, + "language_loss": 0.86161578, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.93865955, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.128479, + "step": 8675, + "time_per_iteration": 2.486656665802002 + }, + { + "auxiliary_loss_clip": 0.06328152, + "auxiliary_loss_mlp": 0.0125317, + "balance_loss_clip": 0.06263625, + "balance_loss_mlp": 0.01251218, + "epoch": 0.521629340147302, + "flos": 57596054296320.0, + "grad_norm": 0.8389911483177593, + "language_loss": 0.62711406, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.70292729, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01950073, + "step": 8676, + "time_per_iteration": 3.09920597076416 + }, + { + "auxiliary_loss_clip": 0.06427533, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06277495, + "balance_loss_mlp": 0.01252839, + "epoch": 0.5216894633999699, + "flos": 26804117790720.0, + "grad_norm": 1.458201451867465, + "language_loss": 0.69111204, + "learning_rate": 1.956578434424046e-06, + "loss": 0.7680313, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11572266, + "step": 8677, + "time_per_iteration": 2.5477073192596436 + }, + { + "auxiliary_loss_clip": 0.06427766, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01255127, + "epoch": 0.5217495866526379, + "flos": 26365803482880.0, + "grad_norm": 1.7210863244717929, + "language_loss": 0.65549737, + "learning_rate": 1.956189065367086e-06, + "loss": 0.73244393, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11749268, + "step": 8678, + "time_per_iteration": 2.566591739654541 + }, + { + "auxiliary_loss_clip": 0.06434263, + "auxiliary_loss_mlp": 0.01268698, + "balance_loss_clip": 0.06280728, + "balance_loss_mlp": 0.01255531, + "epoch": 0.5218097099053058, + "flos": 23590329966720.0, + "grad_norm": 2.9370978110790507, + "language_loss": 0.68504936, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.762079, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1317749, + "step": 8679, + "time_per_iteration": 2.510748863220215 + }, + { + "auxiliary_loss_clip": 0.06433919, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.06281881, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5218698331579739, + "flos": 18083253346560.0, + "grad_norm": 1.6397075137651071, + "language_loss": 0.67471087, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.7517339, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12036133, + "step": 8680, + "time_per_iteration": 3.9219276905059814 + }, + { + "auxiliary_loss_clip": 0.06433384, + "auxiliary_loss_mlp": 0.01271487, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01259595, + "epoch": 0.5219299564106418, + "flos": 19287129283200.0, + "grad_norm": 1.8649470617465917, + "language_loss": 0.83311534, + "learning_rate": 1.955020968223156e-06, + "loss": 0.91016412, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11889648, + "step": 8681, + "time_per_iteration": 2.516465663909912 + }, + { + "auxiliary_loss_clip": 0.06426493, + "auxiliary_loss_mlp": 0.0126523, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01253792, + "epoch": 0.5219900796633098, + "flos": 26658613975680.0, + "grad_norm": 1.6454147062415487, + "language_loss": 0.77514279, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.85205996, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11437988, + "step": 8682, + "time_per_iteration": 2.554325819015503 + }, + { + "auxiliary_loss_clip": 0.06427193, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01254949, + "epoch": 0.5220502029159777, + "flos": 34321148225280.0, + "grad_norm": 1.635540508166305, + "language_loss": 0.693317, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.77025378, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11529541, + "step": 8683, + "time_per_iteration": 2.6571457386016846 + }, + { + "auxiliary_loss_clip": 0.06430393, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06278116, + "balance_loss_mlp": 0.01257629, + "epoch": 0.5221103261686457, + "flos": 22161804433920.0, + "grad_norm": 1.5499745188789709, + "language_loss": 0.76029563, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.83729851, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12255859, + "step": 8684, + "time_per_iteration": 2.5611672401428223 + }, + { + "auxiliary_loss_clip": 0.06422482, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06276357, + "balance_loss_mlp": 0.01256123, + "epoch": 0.5221704494213137, + "flos": 19214440266240.0, + "grad_norm": 1.9689133598672337, + "language_loss": 0.75993264, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.83683455, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11590576, + "step": 8685, + "time_per_iteration": 2.592336416244507 + }, + { + "auxiliary_loss_clip": 0.06433201, + "auxiliary_loss_mlp": 0.01267661, + "balance_loss_clip": 0.06280906, + "balance_loss_mlp": 0.01255549, + "epoch": 0.5222305726739817, + "flos": 19360069862400.0, + "grad_norm": 1.8592295664699974, + "language_loss": 0.81054503, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.88755369, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12103271, + "step": 8686, + "time_per_iteration": 2.529801845550537 + }, + { + "auxiliary_loss_clip": 0.06419135, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06276063, + "balance_loss_mlp": 0.01255021, + "epoch": 0.5222906959266497, + "flos": 27821554392960.0, + "grad_norm": 1.7724306724007597, + "language_loss": 0.7060039, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.78286076, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11535645, + "step": 8687, + "time_per_iteration": 2.580845594406128 + }, + { + "auxiliary_loss_clip": 0.06421649, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06277607, + "balance_loss_mlp": 0.01256297, + "epoch": 0.5223508191793176, + "flos": 12717781326720.0, + "grad_norm": 2.573153086937961, + "language_loss": 0.82975262, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.90663946, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10736084, + "step": 8688, + "time_per_iteration": 2.479219436645508 + }, + { + "auxiliary_loss_clip": 0.06427407, + "auxiliary_loss_mlp": 0.01268772, + "balance_loss_clip": 0.06280096, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5224109424319856, + "flos": 15637584700800.0, + "grad_norm": 2.221621058495187, + "language_loss": 0.74186772, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.81882954, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12426758, + "step": 8689, + "time_per_iteration": 2.519578456878662 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01264867, + "balance_loss_clip": 0.06277696, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5224710656846535, + "flos": 15747687365760.0, + "grad_norm": 1.8795858532487468, + "language_loss": 0.8292582, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.90614116, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11022949, + "step": 8690, + "time_per_iteration": 2.4795632362365723 + }, + { + "auxiliary_loss_clip": 0.06425175, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.06276759, + "balance_loss_mlp": 0.0125542, + "epoch": 0.5225311889373215, + "flos": 26038136891520.0, + "grad_norm": 1.8859654188369186, + "language_loss": 0.79290485, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.86983275, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12200928, + "step": 8691, + "time_per_iteration": 2.554316520690918 + }, + { + "auxiliary_loss_clip": 0.06425714, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01255044, + "epoch": 0.5225913121899894, + "flos": 18375183371520.0, + "grad_norm": 2.097465391576973, + "language_loss": 0.76909935, + "learning_rate": 1.950738079725646e-06, + "loss": 0.84603524, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12835693, + "step": 8692, + "time_per_iteration": 2.508985757827759 + }, + { + "auxiliary_loss_clip": 0.06422729, + "auxiliary_loss_mlp": 0.01266471, + "balance_loss_clip": 0.06279368, + "balance_loss_mlp": 0.01254872, + "epoch": 0.5226514354426575, + "flos": 29280407904000.0, + "grad_norm": 1.831817200061648, + "language_loss": 0.73045087, + "learning_rate": 1.950348737138691e-06, + "loss": 0.80734289, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11608887, + "step": 8693, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06430539, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.06276198, + "balance_loss_mlp": 0.01252802, + "epoch": 0.5227115586953254, + "flos": 22859330947200.0, + "grad_norm": 2.034375584307348, + "language_loss": 0.8244431, + "learning_rate": 1.949959396434517e-06, + "loss": 0.90140283, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12640381, + "step": 8694, + "time_per_iteration": 2.511063814163208 + }, + { + "auxiliary_loss_clip": 0.06334698, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06270603, + "balance_loss_mlp": 0.01262187, + "epoch": 0.5227716819479934, + "flos": 57491695635840.0, + "grad_norm": 0.936740482735722, + "language_loss": 0.55577236, + "learning_rate": 1.949570057627888e-06, + "loss": 0.63175929, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01809692, + "step": 8695, + "time_per_iteration": 3.201383113861084 + }, + { + "auxiliary_loss_clip": 0.06426679, + "auxiliary_loss_mlp": 0.01263614, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01252074, + "epoch": 0.5228318052006613, + "flos": 13813357461120.0, + "grad_norm": 1.622631737546212, + "language_loss": 0.73801219, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.81491518, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11547852, + "step": 8696, + "time_per_iteration": 2.542386770248413 + }, + { + "auxiliary_loss_clip": 0.06429457, + "auxiliary_loss_mlp": 0.01266915, + "balance_loss_clip": 0.06279002, + "balance_loss_mlp": 0.01254589, + "epoch": 0.5228919284533293, + "flos": 15601596572160.0, + "grad_norm": 1.5536675741091566, + "language_loss": 0.71410191, + "learning_rate": 1.948791385766319e-06, + "loss": 0.79106563, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12341309, + "step": 8697, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.06423891, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.0627815, + "balance_loss_mlp": 0.0125453, + "epoch": 0.5229520517059973, + "flos": 22497982214400.0, + "grad_norm": 1.650008991843684, + "language_loss": 0.80845451, + "learning_rate": 1.948402052740906e-06, + "loss": 0.88535196, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11328125, + "step": 8698, + "time_per_iteration": 2.5636022090911865 + }, + { + "auxiliary_loss_clip": 0.06426111, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06278659, + "balance_loss_mlp": 0.01254908, + "epoch": 0.5230121749586653, + "flos": 22097416970880.0, + "grad_norm": 3.7708298280456023, + "language_loss": 0.74449289, + "learning_rate": 1.948012721672093e-06, + "loss": 0.82142115, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1182251, + "step": 8699, + "time_per_iteration": 2.531606912612915 + }, + { + "auxiliary_loss_clip": 0.06432469, + "auxiliary_loss_mlp": 0.0126789, + "balance_loss_clip": 0.06277843, + "balance_loss_mlp": 0.01255325, + "epoch": 0.5230722982113333, + "flos": 22133656661760.0, + "grad_norm": 1.5875927962566738, + "language_loss": 0.73680252, + "learning_rate": 1.947623392574642e-06, + "loss": 0.81380606, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12561035, + "step": 8700, + "time_per_iteration": 2.542734146118164 + }, + { + "auxiliary_loss_clip": 0.06429377, + "auxiliary_loss_mlp": 0.01275322, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01263127, + "epoch": 0.5231324214640012, + "flos": 25016214096000.0, + "grad_norm": 1.8967545071734793, + "language_loss": 0.67123276, + "learning_rate": 1.947234065463318e-06, + "loss": 0.74827981, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12207031, + "step": 8701, + "time_per_iteration": 2.543332815170288 + }, + { + "auxiliary_loss_clip": 0.06421816, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06274643, + "balance_loss_mlp": 0.01254696, + "epoch": 0.5231925447166692, + "flos": 25747842021120.0, + "grad_norm": 1.6886589098280236, + "language_loss": 0.66874444, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.74562299, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11340332, + "step": 8702, + "time_per_iteration": 2.5511581897735596 + }, + { + "auxiliary_loss_clip": 0.06426294, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06277906, + "balance_loss_mlp": 0.01255906, + "epoch": 0.5232526679693371, + "flos": 21440322852480.0, + "grad_norm": 3.970152828937024, + "language_loss": 0.76360488, + "learning_rate": 1.946455417258101e-06, + "loss": 0.84055138, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12457275, + "step": 8703, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.06434231, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01255471, + "epoch": 0.5233127912220051, + "flos": 35307082892160.0, + "grad_norm": 2.0695890072195344, + "language_loss": 0.77554905, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.85257214, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1260376, + "step": 8704, + "time_per_iteration": 4.093170642852783 + }, + { + "auxiliary_loss_clip": 0.06425636, + "auxiliary_loss_mlp": 0.01277604, + "balance_loss_clip": 0.06278675, + "balance_loss_mlp": 0.012665, + "epoch": 0.523372914474673, + "flos": 17056257379200.0, + "grad_norm": 1.7488135640398956, + "language_loss": 0.78527272, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.86230516, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11108398, + "step": 8705, + "time_per_iteration": 2.487792730331421 + }, + { + "auxiliary_loss_clip": 0.06433457, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06280416, + "balance_loss_mlp": 0.0125221, + "epoch": 0.5234330377273411, + "flos": 18412303530240.0, + "grad_norm": 1.822089906899261, + "language_loss": 0.69768077, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.77466154, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12408447, + "step": 8706, + "time_per_iteration": 2.52415132522583 + }, + { + "auxiliary_loss_clip": 0.06339821, + "auxiliary_loss_mlp": 0.01262622, + "balance_loss_clip": 0.06275055, + "balance_loss_mlp": 0.01260974, + "epoch": 0.523493160980009, + "flos": 65872426429440.0, + "grad_norm": 0.668265925718786, + "language_loss": 0.52398658, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.60001105, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01651001, + "step": 8707, + "time_per_iteration": 4.596412658691406 + }, + { + "auxiliary_loss_clip": 0.06431062, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06282815, + "balance_loss_mlp": 0.01255829, + "epoch": 0.523553284232677, + "flos": 21878595233280.0, + "grad_norm": 1.763620445487087, + "language_loss": 0.75447237, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.83145583, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11450195, + "step": 8708, + "time_per_iteration": 2.515388011932373 + }, + { + "auxiliary_loss_clip": 0.06425884, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.06279897, + "balance_loss_mlp": 0.01258252, + "epoch": 0.5236134074853449, + "flos": 20854156815360.0, + "grad_norm": 1.5562083670602136, + "language_loss": 0.78041285, + "learning_rate": 1.944119521844849e-06, + "loss": 0.85736358, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.109375, + "step": 8709, + "time_per_iteration": 2.569312810897827 + }, + { + "auxiliary_loss_clip": 0.06434496, + "auxiliary_loss_mlp": 0.01269997, + "balance_loss_clip": 0.062785, + "balance_loss_mlp": 0.01256872, + "epoch": 0.5236735307380129, + "flos": 25527510910080.0, + "grad_norm": 1.8691534112354709, + "language_loss": 0.83896649, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.91601145, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13128662, + "step": 8710, + "time_per_iteration": 2.5364856719970703 + }, + { + "auxiliary_loss_clip": 0.06424439, + "auxiliary_loss_mlp": 0.01271523, + "balance_loss_clip": 0.06278566, + "balance_loss_mlp": 0.01260347, + "epoch": 0.523733653990681, + "flos": 23589281790720.0, + "grad_norm": 1.796806294076298, + "language_loss": 0.69453466, + "learning_rate": 1.943340906834908e-06, + "loss": 0.77149427, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11181641, + "step": 8711, + "time_per_iteration": 2.5488204956054688 + }, + { + "auxiliary_loss_clip": 0.06423855, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06275582, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5237937772433489, + "flos": 21112698188160.0, + "grad_norm": 1.676774757059823, + "language_loss": 0.82997072, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.90688783, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11322021, + "step": 8712, + "time_per_iteration": 4.064100980758667 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06279981, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5238539004960169, + "flos": 19179081043200.0, + "grad_norm": 1.8094880941691576, + "language_loss": 0.6993227, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.77635783, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.121521, + "step": 8713, + "time_per_iteration": 2.544586420059204 + }, + { + "auxiliary_loss_clip": 0.06435391, + "auxiliary_loss_mlp": 0.01268239, + "balance_loss_clip": 0.06280154, + "balance_loss_mlp": 0.01254834, + "epoch": 0.5239140237486848, + "flos": 17892914797440.0, + "grad_norm": 2.8365689324721597, + "language_loss": 0.76947498, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.84651124, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13409424, + "step": 8714, + "time_per_iteration": 2.5225958824157715 + }, + { + "auxiliary_loss_clip": 0.06430446, + "auxiliary_loss_mlp": 0.01267137, + "balance_loss_clip": 0.06279821, + "balance_loss_mlp": 0.01255085, + "epoch": 0.5239741470013528, + "flos": 17936072449920.0, + "grad_norm": 1.8206248729771282, + "language_loss": 0.76218581, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.83916163, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12060547, + "step": 8715, + "time_per_iteration": 2.479482650756836 + }, + { + "auxiliary_loss_clip": 0.06428694, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01253537, + "epoch": 0.5240342702540207, + "flos": 31001408513280.0, + "grad_norm": 1.518077309755953, + "language_loss": 0.71405065, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.79099017, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1171875, + "step": 8716, + "time_per_iteration": 2.6313345432281494 + }, + { + "auxiliary_loss_clip": 0.06429261, + "auxiliary_loss_mlp": 0.01264727, + "balance_loss_clip": 0.06279399, + "balance_loss_mlp": 0.012541, + "epoch": 0.5240943935066887, + "flos": 25011308632320.0, + "grad_norm": 2.053994478361076, + "language_loss": 0.87371016, + "learning_rate": 1.941005113841926e-06, + "loss": 0.95065004, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10626221, + "step": 8717, + "time_per_iteration": 2.5242137908935547 + }, + { + "auxiliary_loss_clip": 0.06427871, + "auxiliary_loss_mlp": 0.01272314, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01260184, + "epoch": 0.5241545167593566, + "flos": 23665786168320.0, + "grad_norm": 1.9379813616750423, + "language_loss": 0.62001824, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.69702005, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12139893, + "step": 8718, + "time_per_iteration": 2.5543830394744873 + }, + { + "auxiliary_loss_clip": 0.06436223, + "auxiliary_loss_mlp": 0.01271154, + "balance_loss_clip": 0.06282552, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5242146400120247, + "flos": 23406490108800.0, + "grad_norm": 1.965252740565909, + "language_loss": 0.72457337, + "learning_rate": 1.940226533916872e-06, + "loss": 0.80164713, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12145996, + "step": 8719, + "time_per_iteration": 3.9948794841766357 + }, + { + "auxiliary_loss_clip": 0.06428128, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.0628122, + "balance_loss_mlp": 0.01256983, + "epoch": 0.5242747632646926, + "flos": 17754873995520.0, + "grad_norm": 2.179080036180393, + "language_loss": 0.73360658, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.81056702, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10919189, + "step": 8720, + "time_per_iteration": 2.561491012573242 + }, + { + "auxiliary_loss_clip": 0.06431387, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06281313, + "balance_loss_mlp": 0.01256227, + "epoch": 0.5243348865173606, + "flos": 32605849693440.0, + "grad_norm": 1.7043415367979953, + "language_loss": 0.70633399, + "learning_rate": 1.939447963058281e-06, + "loss": 0.78333569, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12561035, + "step": 8721, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.06427501, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06277889, + "balance_loss_mlp": 0.01258008, + "epoch": 0.5243950097700285, + "flos": 25491229292160.0, + "grad_norm": 1.669973954204285, + "language_loss": 0.86888224, + "learning_rate": 1.939058681065813e-06, + "loss": 0.94585228, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.1151123, + "step": 8722, + "time_per_iteration": 2.532735586166382 + }, + { + "auxiliary_loss_clip": 0.06423786, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276488, + "balance_loss_mlp": 0.01259041, + "epoch": 0.5244551330226965, + "flos": 15273846126720.0, + "grad_norm": 1.6547564845342364, + "language_loss": 0.80303264, + "learning_rate": 1.938669401384247e-06, + "loss": 0.87997842, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 8723, + "time_per_iteration": 2.519230842590332 + }, + { + "auxiliary_loss_clip": 0.06433833, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256286, + "epoch": 0.5245152562753645, + "flos": 22243717399680.0, + "grad_norm": 1.8110090728616772, + "language_loss": 0.75572187, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.83275086, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12780762, + "step": 8724, + "time_per_iteration": 2.503331422805786 + }, + { + "auxiliary_loss_clip": 0.06439602, + "auxiliary_loss_mlp": 0.01267267, + "balance_loss_clip": 0.06280126, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5245753795280325, + "flos": 29434548689280.0, + "grad_norm": 1.6762764466906133, + "language_loss": 0.70858645, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.78565514, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.12835693, + "step": 8725, + "time_per_iteration": 2.6268577575683594 + }, + { + "auxiliary_loss_clip": 0.06331155, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252628, + "epoch": 0.5246355027807005, + "flos": 58853569645440.0, + "grad_norm": 0.7398874669792804, + "language_loss": 0.55689812, + "learning_rate": 1.937501576352568e-06, + "loss": 0.63275951, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.64111328, + "router_z_loss_mlp": 0.02354431, + "step": 8726, + "time_per_iteration": 3.1253981590270996 + }, + { + "auxiliary_loss_clip": 0.06326637, + "auxiliary_loss_mlp": 0.01254365, + "balance_loss_clip": 0.06262497, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5246956260333684, + "flos": 64546792110720.0, + "grad_norm": 0.7865731844335093, + "language_loss": 0.58442128, + "learning_rate": 1.937112306062219e-06, + "loss": 0.66023123, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.02062988, + "step": 8727, + "time_per_iteration": 3.176279306411743 + }, + { + "auxiliary_loss_clip": 0.06432917, + "auxiliary_loss_mlp": 0.01270503, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.01258118, + "epoch": 0.5247557492860364, + "flos": 24540276504960.0, + "grad_norm": 1.4599497814344178, + "language_loss": 0.70513123, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.78216541, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12390137, + "step": 8728, + "time_per_iteration": 2.635087728500366 + }, + { + "auxiliary_loss_clip": 0.06426623, + "auxiliary_loss_mlp": 0.01271129, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258815, + "epoch": 0.5248158725387043, + "flos": 18811946378880.0, + "grad_norm": 1.5300920869777792, + "language_loss": 0.69649124, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.77346873, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12310791, + "step": 8729, + "time_per_iteration": 2.5286824703216553 + }, + { + "auxiliary_loss_clip": 0.06429707, + "auxiliary_loss_mlp": 0.01272402, + "balance_loss_clip": 0.06276232, + "balance_loss_mlp": 0.01260112, + "epoch": 0.5248759957913723, + "flos": 20961534222720.0, + "grad_norm": 1.931767440888087, + "language_loss": 0.83841878, + "learning_rate": 1.935944509558464e-06, + "loss": 0.91543984, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12280273, + "step": 8730, + "time_per_iteration": 2.50693678855896 + }, + { + "auxiliary_loss_clip": 0.06424531, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01253301, + "epoch": 0.5249361190440403, + "flos": 18666903761280.0, + "grad_norm": 2.7205788659727634, + "language_loss": 0.79795074, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.87484777, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8731, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.06421249, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06275119, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5249962422967083, + "flos": 24870249083520.0, + "grad_norm": 2.282421292997204, + "language_loss": 0.83455729, + "learning_rate": 1.935165990676312e-06, + "loss": 0.91145802, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12182617, + "step": 8732, + "time_per_iteration": 2.5442264080047607 + }, + { + "auxiliary_loss_clip": 0.06426094, + "auxiliary_loss_mlp": 0.01271634, + "balance_loss_clip": 0.06276669, + "balance_loss_mlp": 0.01259654, + "epoch": 0.5250563655493762, + "flos": 15267179727360.0, + "grad_norm": 1.5246135300121169, + "language_loss": 0.77770185, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.85467911, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11975098, + "step": 8733, + "time_per_iteration": 2.5826051235198975 + }, + { + "auxiliary_loss_clip": 0.0643189, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.0627751, + "balance_loss_mlp": 0.01253655, + "epoch": 0.5251164888020442, + "flos": 18631209121920.0, + "grad_norm": 3.9739558224943683, + "language_loss": 0.81671995, + "learning_rate": 1.934387481628208e-06, + "loss": 0.89369977, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12445068, + "step": 8734, + "time_per_iteration": 2.496502637863159 + }, + { + "auxiliary_loss_clip": 0.0642469, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01253041, + "epoch": 0.5251766120547121, + "flos": 29717632108800.0, + "grad_norm": 1.407036688227265, + "language_loss": 0.77114183, + "learning_rate": 1.933998230828826e-06, + "loss": 0.84803545, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11627197, + "step": 8735, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06423082, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01253632, + "epoch": 0.5252367353073801, + "flos": 23446964430720.0, + "grad_norm": 1.5621679512535565, + "language_loss": 0.80604559, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.88292682, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11419678, + "step": 8736, + "time_per_iteration": 2.5257420539855957 + }, + { + "auxiliary_loss_clip": 0.06425665, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06277201, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5252968585600482, + "flos": 30818658758400.0, + "grad_norm": 2.1177707386756697, + "language_loss": 0.70240873, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.77936983, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12097168, + "step": 8737, + "time_per_iteration": 2.5996742248535156 + }, + { + "auxiliary_loss_clip": 0.06423551, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01256564, + "epoch": 0.5253569818127161, + "flos": 20634035339520.0, + "grad_norm": 1.5486622918302246, + "language_loss": 0.7715745, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.84849167, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11608887, + "step": 8738, + "time_per_iteration": 2.5352158546447754 + }, + { + "auxiliary_loss_clip": 0.06323943, + "auxiliary_loss_mlp": 0.01255398, + "balance_loss_clip": 0.06260057, + "balance_loss_mlp": 0.01253626, + "epoch": 0.5254171050653841, + "flos": 63448155302400.0, + "grad_norm": 0.7261228489339219, + "language_loss": 0.54416603, + "learning_rate": 1.932441252806837e-06, + "loss": 0.61995941, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01774597, + "step": 8739, + "time_per_iteration": 3.1277644634246826 + }, + { + "auxiliary_loss_clip": 0.06426128, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276017, + "balance_loss_mlp": 0.01255457, + "epoch": 0.525477228318052, + "flos": 34678136545920.0, + "grad_norm": 1.6647555558701046, + "language_loss": 0.84639645, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.92333221, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11993408, + "step": 8740, + "time_per_iteration": 2.658111572265625 + }, + { + "auxiliary_loss_clip": 0.06423901, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06275214, + "balance_loss_mlp": 0.01251843, + "epoch": 0.52553735157072, + "flos": 17936575574400.0, + "grad_norm": 2.0969213447662156, + "language_loss": 0.69862366, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.77550066, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11938477, + "step": 8741, + "time_per_iteration": 2.4757626056671143 + }, + { + "auxiliary_loss_clip": 0.0642582, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06271701, + "balance_loss_mlp": 0.01254378, + "epoch": 0.5255974748233879, + "flos": 9945326557440.0, + "grad_norm": 2.083494644749303, + "language_loss": 0.66346633, + "learning_rate": 1.931273546137947e-06, + "loss": 0.74039018, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12188721, + "step": 8742, + "time_per_iteration": 2.4912760257720947 + }, + { + "auxiliary_loss_clip": 0.06430671, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254592, + "epoch": 0.5256575980760559, + "flos": 16873256062080.0, + "grad_norm": 2.278792899782439, + "language_loss": 0.62974113, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.7067256, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13195801, + "step": 8743, + "time_per_iteration": 3.8745810985565186 + }, + { + "auxiliary_loss_clip": 0.06328367, + "auxiliary_loss_mlp": 0.01251768, + "balance_loss_clip": 0.06264926, + "balance_loss_mlp": 0.01249956, + "epoch": 0.5257177213287239, + "flos": 62408105297280.0, + "grad_norm": 0.7594186151089873, + "language_loss": 0.54170012, + "learning_rate": 1.930495088031323e-06, + "loss": 0.6175015, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.01808167, + "step": 8744, + "time_per_iteration": 3.2680962085723877 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.0627819, + "balance_loss_mlp": 0.01252635, + "epoch": 0.5257778445813919, + "flos": 20783144880000.0, + "grad_norm": 1.988296485781083, + "language_loss": 0.76358819, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.84060007, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.13653564, + "step": 8745, + "time_per_iteration": 2.5416345596313477 + }, + { + "auxiliary_loss_clip": 0.06422935, + "auxiliary_loss_mlp": 0.01269048, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125733, + "epoch": 0.5258379678340598, + "flos": 17024168465280.0, + "grad_norm": 2.2863222877599703, + "language_loss": 0.81917781, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.8960976, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.1171875, + "step": 8746, + "time_per_iteration": 3.8924081325531006 + }, + { + "auxiliary_loss_clip": 0.06420557, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_clip": 0.06274772, + "balance_loss_mlp": 0.01257011, + "epoch": 0.5258980910867278, + "flos": 21075032977920.0, + "grad_norm": 1.8269554832422097, + "language_loss": 0.76250327, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.83939064, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11157227, + "step": 8747, + "time_per_iteration": 2.5338385105133057 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01266781, + "balance_loss_clip": 0.06273648, + "balance_loss_mlp": 0.01254443, + "epoch": 0.5259582143393957, + "flos": 18010312767360.0, + "grad_norm": 1.781184467493656, + "language_loss": 0.82852685, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.90538716, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12353516, + "step": 8748, + "time_per_iteration": 2.4989612102508545 + }, + { + "auxiliary_loss_clip": 0.06428373, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06276021, + "balance_loss_mlp": 0.01255803, + "epoch": 0.5260183375920637, + "flos": 22790457290880.0, + "grad_norm": 2.0798716741461862, + "language_loss": 0.81033522, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.88730466, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12756348, + "step": 8749, + "time_per_iteration": 2.541492462158203 + }, + { + "auxiliary_loss_clip": 0.06426647, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.06276764, + "balance_loss_mlp": 0.01257857, + "epoch": 0.5260784608447318, + "flos": 27059682343680.0, + "grad_norm": 1.8461671999009361, + "language_loss": 0.72827047, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.80523431, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11877441, + "step": 8750, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06428036, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06278102, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5261385840973997, + "flos": 20668262532480.0, + "grad_norm": 1.3256906405876772, + "language_loss": 0.76755565, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.8444941, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11883545, + "step": 8751, + "time_per_iteration": 3.989189624786377 + }, + { + "auxiliary_loss_clip": 0.06427495, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.0627936, + "balance_loss_mlp": 0.01255286, + "epoch": 0.5261987073500677, + "flos": 23629336842240.0, + "grad_norm": 1.3401050149591014, + "language_loss": 0.76360512, + "learning_rate": 1.927381362210902e-06, + "loss": 0.84054899, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11608887, + "step": 8752, + "time_per_iteration": 2.6008472442626953 + }, + { + "auxiliary_loss_clip": 0.06432231, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06278201, + "balance_loss_mlp": 0.01253487, + "epoch": 0.5262588306027356, + "flos": 27643626247680.0, + "grad_norm": 1.396446170400335, + "language_loss": 0.68317235, + "learning_rate": 1.926992158720058e-06, + "loss": 0.76016164, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13208008, + "step": 8753, + "time_per_iteration": 2.5851571559906006 + }, + { + "auxiliary_loss_clip": 0.06430234, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281005, + "balance_loss_mlp": 0.01257142, + "epoch": 0.5263189538554036, + "flos": 21765725383680.0, + "grad_norm": 1.5666571832863774, + "language_loss": 0.8392294, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.91622722, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12384033, + "step": 8754, + "time_per_iteration": 2.552424907684326 + }, + { + "auxiliary_loss_clip": 0.06431299, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06278868, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5263790771080715, + "flos": 14280490373760.0, + "grad_norm": 9.005791031911038, + "language_loss": 0.87464845, + "learning_rate": 1.926213760058522e-06, + "loss": 0.95163268, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12139893, + "step": 8755, + "time_per_iteration": 2.4848403930664062 + }, + { + "auxiliary_loss_clip": 0.06329039, + "auxiliary_loss_mlp": 0.01251879, + "balance_loss_clip": 0.06265183, + "balance_loss_mlp": 0.01250204, + "epoch": 0.5264392003607395, + "flos": 65827298206080.0, + "grad_norm": 0.7019882104343015, + "language_loss": 0.5870319, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.66284108, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01678467, + "step": 8756, + "time_per_iteration": 3.275596857070923 + }, + { + "auxiliary_loss_clip": 0.06435139, + "auxiliary_loss_mlp": 0.0126978, + "balance_loss_clip": 0.06280214, + "balance_loss_mlp": 0.01257001, + "epoch": 0.5264993236134075, + "flos": 21038709432960.0, + "grad_norm": 1.5391071607522773, + "language_loss": 0.70246553, + "learning_rate": 1.925435372588913e-06, + "loss": 0.77951479, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12762451, + "step": 8757, + "time_per_iteration": 2.5078463554382324 + }, + { + "auxiliary_loss_clip": 0.06425242, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06274789, + "balance_loss_mlp": 0.01259015, + "epoch": 0.5265594468660755, + "flos": 16623854784000.0, + "grad_norm": 1.5949031044885071, + "language_loss": 0.88366896, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.96063495, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12341309, + "step": 8758, + "time_per_iteration": 2.503643751144409 + }, + { + "auxiliary_loss_clip": 0.06431897, + "auxiliary_loss_mlp": 0.01273559, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01260165, + "epoch": 0.5266195701187434, + "flos": 24141010999680.0, + "grad_norm": 1.3529199811462889, + "language_loss": 0.76677716, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.84383172, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13391113, + "step": 8759, + "time_per_iteration": 4.0746564865112305 + }, + { + "auxiliary_loss_clip": 0.06426352, + "auxiliary_loss_mlp": 0.01272091, + "balance_loss_clip": 0.06278519, + "balance_loss_mlp": 0.01258603, + "epoch": 0.5266796933714114, + "flos": 15848314519680.0, + "grad_norm": 1.866695897182309, + "language_loss": 0.72062105, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.79760551, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1348877, + "step": 8760, + "time_per_iteration": 2.4678292274475098 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253152, + "epoch": 0.5267398166240793, + "flos": 20956377196800.0, + "grad_norm": 2.1261739839163263, + "language_loss": 0.76520377, + "learning_rate": 1.923878631697736e-06, + "loss": 0.84220791, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13140869, + "step": 8761, + "time_per_iteration": 2.5250892639160156 + }, + { + "auxiliary_loss_clip": 0.06431311, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06277812, + "balance_loss_mlp": 0.01256696, + "epoch": 0.5267999398767473, + "flos": 21002763231360.0, + "grad_norm": 1.6289028393625449, + "language_loss": 0.7137605, + "learning_rate": 1.923489453654373e-06, + "loss": 0.79075569, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1151123, + "step": 8762, + "time_per_iteration": 2.50102162361145 + }, + { + "auxiliary_loss_clip": 0.06330161, + "auxiliary_loss_mlp": 0.01253956, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5268600631294152, + "flos": 66867935189760.0, + "grad_norm": 0.9166133094312116, + "language_loss": 0.65129638, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.72713745, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01655579, + "step": 8763, + "time_per_iteration": 3.076136827468872 + }, + { + "auxiliary_loss_clip": 0.06428451, + "auxiliary_loss_mlp": 0.01268489, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01255918, + "epoch": 0.5269201863820833, + "flos": 17171307434880.0, + "grad_norm": 1.6120731347351738, + "language_loss": 0.71481144, + "learning_rate": 1.922711106286265e-06, + "loss": 0.79178083, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12579346, + "step": 8764, + "time_per_iteration": 2.5250110626220703 + }, + { + "auxiliary_loss_clip": 0.06431142, + "auxiliary_loss_mlp": 0.01269659, + "balance_loss_clip": 0.06278007, + "balance_loss_mlp": 0.01256141, + "epoch": 0.5269803096347513, + "flos": 20528963919360.0, + "grad_norm": 1.6456726211241999, + "language_loss": 0.74125087, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.81825888, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13531494, + "step": 8765, + "time_per_iteration": 2.552011251449585 + }, + { + "auxiliary_loss_clip": 0.06432463, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01253076, + "epoch": 0.5270404328874192, + "flos": 27237652416000.0, + "grad_norm": 1.4730640837864142, + "language_loss": 0.8564899, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.9334718, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12640381, + "step": 8766, + "time_per_iteration": 2.5471248626708984 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06278689, + "balance_loss_mlp": 0.01257812, + "epoch": 0.5271005561400872, + "flos": 23116866071040.0, + "grad_norm": 1.6309488802468612, + "language_loss": 0.79294145, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8699789, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.13690186, + "step": 8767, + "time_per_iteration": 2.5700509548187256 + }, + { + "auxiliary_loss_clip": 0.06431086, + "auxiliary_loss_mlp": 0.01269174, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01256532, + "epoch": 0.5271606793927551, + "flos": 22571342064000.0, + "grad_norm": 1.7993411408437945, + "language_loss": 0.73931158, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.81631416, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12646484, + "step": 8768, + "time_per_iteration": 2.5251431465148926 + }, + { + "auxiliary_loss_clip": 0.06428067, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01257174, + "epoch": 0.5272208026454231, + "flos": 18769166069760.0, + "grad_norm": 1.6856667564577028, + "language_loss": 0.74105024, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.81802148, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11883545, + "step": 8769, + "time_per_iteration": 2.518446683883667 + }, + { + "auxiliary_loss_clip": 0.06431002, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06279421, + "balance_loss_mlp": 0.01255172, + "epoch": 0.5272809258980911, + "flos": 20418358129920.0, + "grad_norm": 1.672714058447801, + "language_loss": 0.74041271, + "learning_rate": 1.920376134993436e-06, + "loss": 0.81739843, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1239624, + "step": 8770, + "time_per_iteration": 2.5188913345336914 + }, + { + "auxiliary_loss_clip": 0.06428713, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01259085, + "epoch": 0.5273410491507591, + "flos": 28264271040000.0, + "grad_norm": 1.8244918854449486, + "language_loss": 0.68641269, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.76341033, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11987305, + "step": 8771, + "time_per_iteration": 2.5867247581481934 + }, + { + "auxiliary_loss_clip": 0.06424269, + "auxiliary_loss_mlp": 0.01271661, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01259704, + "epoch": 0.527401172403427, + "flos": 22461658669440.0, + "grad_norm": 11.676913645943259, + "language_loss": 0.7669906, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.84394991, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11938477, + "step": 8772, + "time_per_iteration": 2.5199668407440186 + }, + { + "auxiliary_loss_clip": 0.06429616, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.0627689, + "balance_loss_mlp": 0.01255599, + "epoch": 0.527461295656095, + "flos": 21037158132480.0, + "grad_norm": 2.161876297932061, + "language_loss": 0.66294622, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.73992014, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12176514, + "step": 8773, + "time_per_iteration": 2.5476229190826416 + }, + { + "auxiliary_loss_clip": 0.06430208, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01256643, + "epoch": 0.5275214189087629, + "flos": 26329060667520.0, + "grad_norm": 1.7199176113539936, + "language_loss": 0.86321867, + "learning_rate": 1.91881954765502e-06, + "loss": 0.94019973, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11254883, + "step": 8774, + "time_per_iteration": 2.545171022415161 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271648, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01259525, + "epoch": 0.5275815421614309, + "flos": 20053110182400.0, + "grad_norm": 1.6744248524719214, + "language_loss": 0.80195713, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.87894905, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12121582, + "step": 8775, + "time_per_iteration": 2.544409990310669 + }, + { + "auxiliary_loss_clip": 0.06422298, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01257968, + "epoch": 0.5276416654140988, + "flos": 21438310354560.0, + "grad_norm": 1.5933640173688606, + "language_loss": 0.83310181, + "learning_rate": 1.918041272397012e-06, + "loss": 0.91002852, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1239624, + "step": 8776, + "time_per_iteration": 2.5175352096557617 + }, + { + "auxiliary_loss_clip": 0.06428739, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.06277907, + "balance_loss_mlp": 0.0125867, + "epoch": 0.5277017886667669, + "flos": 17170762383360.0, + "grad_norm": 1.5849666431846519, + "language_loss": 0.67932826, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.7563237, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12127686, + "step": 8777, + "time_per_iteration": 2.5778138637542725 + }, + { + "auxiliary_loss_clip": 0.06429909, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06281164, + "balance_loss_mlp": 0.01253935, + "epoch": 0.5277619119194349, + "flos": 20454262404480.0, + "grad_norm": 1.855602906151282, + "language_loss": 0.82547855, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.90243274, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8778, + "time_per_iteration": 2.571700096130371 + }, + { + "auxiliary_loss_clip": 0.06433128, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06280521, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5278220351721028, + "flos": 24067944639360.0, + "grad_norm": 1.9512823836083997, + "language_loss": 0.79944891, + "learning_rate": 1.916873882856013e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.1217041, + "step": 8779, + "time_per_iteration": 2.562757968902588 + }, + { + "auxiliary_loss_clip": 0.06427805, + "auxiliary_loss_mlp": 0.01263718, + "balance_loss_clip": 0.06278832, + "balance_loss_mlp": 0.01252429, + "epoch": 0.5278821584247708, + "flos": 24649540629120.0, + "grad_norm": 2.3350915047762957, + "language_loss": 0.77251387, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.84942913, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11291504, + "step": 8780, + "time_per_iteration": 2.517606258392334 + }, + { + "auxiliary_loss_clip": 0.0643455, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254507, + "epoch": 0.5279422816774387, + "flos": 35417017848960.0, + "grad_norm": 1.6574386864631518, + "language_loss": 0.69489729, + "learning_rate": 1.916095638898174e-06, + "loss": 0.77191794, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13018799, + "step": 8781, + "time_per_iteration": 2.693525791168213 + }, + { + "auxiliary_loss_clip": 0.06421035, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5280024049301068, + "flos": 22973794024320.0, + "grad_norm": 1.4417281394316688, + "language_loss": 0.7270093, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.80392265, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11254883, + "step": 8782, + "time_per_iteration": 2.5421454906463623 + }, + { + "auxiliary_loss_clip": 0.06428084, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06279479, + "balance_loss_mlp": 0.01255314, + "epoch": 0.5280625281827747, + "flos": 21514143899520.0, + "grad_norm": 1.839654531053583, + "language_loss": 0.68914783, + "learning_rate": 1.915317407666982e-06, + "loss": 0.76610112, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1192627, + "step": 8783, + "time_per_iteration": 4.037707328796387 + }, + { + "auxiliary_loss_clip": 0.06440329, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282043, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5281226514354427, + "flos": 31215534422400.0, + "grad_norm": 1.947626233704344, + "language_loss": 0.69763857, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.77474254, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13793945, + "step": 8784, + "time_per_iteration": 2.6415882110595703 + }, + { + "auxiliary_loss_clip": 0.06436743, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06277036, + "balance_loss_mlp": 0.01256393, + "epoch": 0.5281827746881106, + "flos": 25084039576320.0, + "grad_norm": 1.9575438568521135, + "language_loss": 0.75138849, + "learning_rate": 1.91453918928048e-06, + "loss": 0.82845432, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13458252, + "step": 8785, + "time_per_iteration": 2.5360119342803955 + }, + { + "auxiliary_loss_clip": 0.06430692, + "auxiliary_loss_mlp": 0.01270335, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01257806, + "epoch": 0.5282428979407786, + "flos": 20637515283840.0, + "grad_norm": 2.81532856062796, + "language_loss": 0.83379281, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.91080302, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12518311, + "step": 8786, + "time_per_iteration": 3.923038959503174 + }, + { + "auxiliary_loss_clip": 0.06426571, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.0628151, + "balance_loss_mlp": 0.01255248, + "epoch": 0.5283030211934465, + "flos": 22426005957120.0, + "grad_norm": 2.0503071903036134, + "language_loss": 0.82639015, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.90331495, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10650635, + "step": 8787, + "time_per_iteration": 2.549422025680542 + }, + { + "auxiliary_loss_clip": 0.06423321, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01259932, + "epoch": 0.5283631444461145, + "flos": 23620951434240.0, + "grad_norm": 1.6336970157139816, + "language_loss": 0.83324271, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11260986, + "step": 8788, + "time_per_iteration": 2.4937057495117188 + }, + { + "auxiliary_loss_clip": 0.06426245, + "auxiliary_loss_mlp": 0.01271299, + "balance_loss_clip": 0.06279786, + "balance_loss_mlp": 0.0125886, + "epoch": 0.5284232676987825, + "flos": 32680341573120.0, + "grad_norm": 1.675322731323109, + "language_loss": 0.75004017, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.82701558, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12451172, + "step": 8789, + "time_per_iteration": 2.6138312816619873 + }, + { + "auxiliary_loss_clip": 0.06430633, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06280988, + "balance_loss_mlp": 0.01254139, + "epoch": 0.5284833909514505, + "flos": 26768213516160.0, + "grad_norm": 1.5707088647426293, + "language_loss": 0.70574284, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.78270793, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8790, + "time_per_iteration": 2.5883655548095703 + }, + { + "auxiliary_loss_clip": 0.06427436, + "auxiliary_loss_mlp": 0.01266819, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5285435142041185, + "flos": 22097207335680.0, + "grad_norm": 1.512627214826232, + "language_loss": 0.79474425, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.87168682, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11309814, + "step": 8791, + "time_per_iteration": 4.033270835876465 + }, + { + "auxiliary_loss_clip": 0.06429024, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06280901, + "balance_loss_mlp": 0.01255205, + "epoch": 0.5286036374567864, + "flos": 20381615314560.0, + "grad_norm": 2.07521505612664, + "language_loss": 0.65493345, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.73189247, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11676025, + "step": 8792, + "time_per_iteration": 2.521308183670044 + }, + { + "auxiliary_loss_clip": 0.06423797, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01253415, + "epoch": 0.5286637607094544, + "flos": 24358952269440.0, + "grad_norm": 2.076646851589869, + "language_loss": 0.79861224, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.87549216, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10778809, + "step": 8793, + "time_per_iteration": 2.5511038303375244 + }, + { + "auxiliary_loss_clip": 0.06422493, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256168, + "epoch": 0.5287238839621223, + "flos": 17276295000960.0, + "grad_norm": 2.078436862745294, + "language_loss": 0.85337698, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.93028271, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11901855, + "step": 8794, + "time_per_iteration": 2.4898123741149902 + }, + { + "auxiliary_loss_clip": 0.06434184, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01255284, + "epoch": 0.5287840072147904, + "flos": 17572711219200.0, + "grad_norm": 2.1545808018265427, + "language_loss": 0.67890751, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.75593209, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12982178, + "step": 8795, + "time_per_iteration": 2.5213987827301025 + }, + { + "auxiliary_loss_clip": 0.0642955, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.01257714, + "epoch": 0.5288441304674583, + "flos": 18558100834560.0, + "grad_norm": 1.7521680482784363, + "language_loss": 0.80681872, + "learning_rate": 1.910259223028374e-06, + "loss": 0.88381112, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11968994, + "step": 8796, + "time_per_iteration": 2.4875407218933105 + }, + { + "auxiliary_loss_clip": 0.06428242, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06279264, + "balance_loss_mlp": 0.01255656, + "epoch": 0.5289042537201263, + "flos": 20820935871360.0, + "grad_norm": 1.952583587455058, + "language_loss": 0.69353104, + "learning_rate": 1.909870155310071e-06, + "loss": 0.770491, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12097168, + "step": 8797, + "time_per_iteration": 2.5311903953552246 + }, + { + "auxiliary_loss_clip": 0.06424771, + "auxiliary_loss_mlp": 0.01268361, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01256857, + "epoch": 0.5289643769727942, + "flos": 15739553520000.0, + "grad_norm": 1.4672049002002021, + "language_loss": 0.82371795, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.90064925, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11499023, + "step": 8798, + "time_per_iteration": 3.947748899459839 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01268372, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255181, + "epoch": 0.5290245002254622, + "flos": 19543490449920.0, + "grad_norm": 2.0391495748491133, + "language_loss": 0.71206701, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.78905261, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.13201904, + "step": 8799, + "time_per_iteration": 2.5031862258911133 + }, + { + "auxiliary_loss_clip": 0.06420026, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06277078, + "balance_loss_mlp": 0.01256124, + "epoch": 0.5290846234781301, + "flos": 15820586017920.0, + "grad_norm": 1.9322407735459124, + "language_loss": 0.69337815, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.77025622, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11657715, + "step": 8800, + "time_per_iteration": 2.5130701065063477 + }, + { + "auxiliary_loss_clip": 0.06335981, + "auxiliary_loss_mlp": 0.01252268, + "balance_loss_clip": 0.06272759, + "balance_loss_mlp": 0.01250352, + "epoch": 0.5291447467307981, + "flos": 70076272498560.0, + "grad_norm": 0.8722049049478691, + "language_loss": 0.5706265, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.64650893, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01913452, + "step": 8801, + "time_per_iteration": 3.0075480937957764 + }, + { + "auxiliary_loss_clip": 0.06425781, + "auxiliary_loss_mlp": 0.01269363, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257978, + "epoch": 0.529204869983466, + "flos": 28371396885120.0, + "grad_norm": 1.559087936128458, + "language_loss": 0.64462554, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.72157693, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1138916, + "step": 8802, + "time_per_iteration": 2.568263053894043 + }, + { + "auxiliary_loss_clip": 0.06423493, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06277072, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5292649932361341, + "flos": 33766064853120.0, + "grad_norm": 1.9436732858799899, + "language_loss": 0.69115645, + "learning_rate": 1.907535821289003e-06, + "loss": 0.76808089, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.10980225, + "step": 8803, + "time_per_iteration": 2.637096881866455 + }, + { + "auxiliary_loss_clip": 0.06421783, + "auxiliary_loss_mlp": 0.01270558, + "balance_loss_clip": 0.0627604, + "balance_loss_mlp": 0.01258596, + "epoch": 0.5293251164888021, + "flos": 20453717352960.0, + "grad_norm": 1.815171914881367, + "language_loss": 0.75997305, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.83689642, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11962891, + "step": 8804, + "time_per_iteration": 2.5163068771362305 + }, + { + "auxiliary_loss_clip": 0.0632845, + "auxiliary_loss_mlp": 0.01252381, + "balance_loss_clip": 0.06265265, + "balance_loss_mlp": 0.01250461, + "epoch": 0.52938523974147, + "flos": 66567856590720.0, + "grad_norm": 0.7410273965373205, + "language_loss": 0.52945232, + "learning_rate": 1.906757737841291e-06, + "loss": 0.60526061, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01916504, + "step": 8805, + "time_per_iteration": 3.24060320854187 + }, + { + "auxiliary_loss_clip": 0.06328098, + "auxiliary_loss_mlp": 0.01252617, + "balance_loss_clip": 0.06265187, + "balance_loss_mlp": 0.01250968, + "epoch": 0.529445362994138, + "flos": 67172065983360.0, + "grad_norm": 1.018872897712542, + "language_loss": 0.63735455, + "learning_rate": 1.906368701413693e-06, + "loss": 0.71316171, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01652527, + "step": 8806, + "time_per_iteration": 3.1444826126098633 + }, + { + "auxiliary_loss_clip": 0.06429877, + "auxiliary_loss_mlp": 0.01268417, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01256073, + "epoch": 0.5295054862468059, + "flos": 17755167484800.0, + "grad_norm": 1.837636262170248, + "language_loss": 0.7251606, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.80214357, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12335205, + "step": 8807, + "time_per_iteration": 2.513139247894287 + }, + { + "auxiliary_loss_clip": 0.06424799, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06278958, + "balance_loss_mlp": 0.01257241, + "epoch": 0.529565609499474, + "flos": 11401622519040.0, + "grad_norm": 2.5266289150801295, + "language_loss": 0.69956362, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.77648908, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1050415, + "step": 8808, + "time_per_iteration": 2.472822666168213 + }, + { + "auxiliary_loss_clip": 0.06422195, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06274572, + "balance_loss_mlp": 0.01258861, + "epoch": 0.5296257327521419, + "flos": 17201174215680.0, + "grad_norm": 2.036831994826339, + "language_loss": 0.87141514, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.94833171, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10614014, + "step": 8809, + "time_per_iteration": 2.5245158672332764 + }, + { + "auxiliary_loss_clip": 0.06436493, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01257138, + "epoch": 0.5296858560048099, + "flos": 39972806265600.0, + "grad_norm": 1.6505081453472243, + "language_loss": 0.64378583, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.72085232, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8810, + "time_per_iteration": 2.6857082843780518 + }, + { + "auxiliary_loss_clip": 0.06422746, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06277126, + "balance_loss_mlp": 0.01259012, + "epoch": 0.5297459792574778, + "flos": 20968032913920.0, + "grad_norm": 1.5863211204070509, + "language_loss": 0.68117309, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.75810677, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11608887, + "step": 8811, + "time_per_iteration": 2.5947864055633545 + }, + { + "auxiliary_loss_clip": 0.06326769, + "auxiliary_loss_mlp": 0.01252115, + "balance_loss_clip": 0.06264065, + "balance_loss_mlp": 0.0125052, + "epoch": 0.5298061025101458, + "flos": 66542532658560.0, + "grad_norm": 0.6560344299955198, + "language_loss": 0.53324163, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.60903049, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01597595, + "step": 8812, + "time_per_iteration": 3.2503774166107178 + }, + { + "auxiliary_loss_clip": 0.06327102, + "auxiliary_loss_mlp": 0.01252134, + "balance_loss_clip": 0.06264044, + "balance_loss_mlp": 0.01250548, + "epoch": 0.5298662257628137, + "flos": 67683488578560.0, + "grad_norm": 0.7118690065629296, + "language_loss": 0.56452167, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.64031398, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01586151, + "step": 8813, + "time_per_iteration": 3.211704730987549 + }, + { + "auxiliary_loss_clip": 0.06420116, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01252223, + "epoch": 0.5299263490154817, + "flos": 19652544938880.0, + "grad_norm": 1.6476785970765333, + "language_loss": 0.82062042, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.89745033, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10656738, + "step": 8814, + "time_per_iteration": 2.5407004356384277 + }, + { + "auxiliary_loss_clip": 0.06433088, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.062782, + "balance_loss_mlp": 0.01255646, + "epoch": 0.5299864722681497, + "flos": 22061638477440.0, + "grad_norm": 1.5146312250557674, + "language_loss": 0.85424864, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.93124914, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11322021, + "step": 8815, + "time_per_iteration": 2.511718273162842 + }, + { + "auxiliary_loss_clip": 0.06421779, + "auxiliary_loss_mlp": 0.01265999, + "balance_loss_clip": 0.0627707, + "balance_loss_mlp": 0.01254573, + "epoch": 0.5300465955208177, + "flos": 21770379285120.0, + "grad_norm": 2.2057457770846947, + "language_loss": 0.67210793, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.74898565, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11431885, + "step": 8816, + "time_per_iteration": 2.564680576324463 + }, + { + "auxiliary_loss_clip": 0.06425485, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06278205, + "balance_loss_mlp": 0.01258106, + "epoch": 0.5301067187734857, + "flos": 43006401884160.0, + "grad_norm": 1.5302739112082, + "language_loss": 0.72652006, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.80347115, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1151123, + "step": 8817, + "time_per_iteration": 2.719486951828003 + }, + { + "auxiliary_loss_clip": 0.06425378, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01256957, + "epoch": 0.5301668420261536, + "flos": 20559878876160.0, + "grad_norm": 1.5998738611170542, + "language_loss": 0.65166581, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.72860169, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11242676, + "step": 8818, + "time_per_iteration": 2.573202610015869 + }, + { + "auxiliary_loss_clip": 0.06425599, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06275538, + "balance_loss_mlp": 0.0125378, + "epoch": 0.5302269652788216, + "flos": 17491259450880.0, + "grad_norm": 1.7883158874481297, + "language_loss": 0.75112927, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.82804549, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12249756, + "step": 8819, + "time_per_iteration": 2.4882779121398926 + }, + { + "auxiliary_loss_clip": 0.06426901, + "auxiliary_loss_mlp": 0.01268351, + "balance_loss_clip": 0.06273513, + "balance_loss_mlp": 0.01255995, + "epoch": 0.5302870885314895, + "flos": 14579380287360.0, + "grad_norm": 2.7239673645734905, + "language_loss": 0.82232261, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.89927506, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12353516, + "step": 8820, + "time_per_iteration": 2.5082767009735107 + }, + { + "auxiliary_loss_clip": 0.06421572, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01257437, + "epoch": 0.5303472117841576, + "flos": 23444323027200.0, + "grad_norm": 1.7959737859178544, + "language_loss": 0.72743207, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.80432689, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.10479736, + "step": 8821, + "time_per_iteration": 2.5132317543029785 + }, + { + "auxiliary_loss_clip": 0.06418677, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01255643, + "epoch": 0.5304073350368255, + "flos": 22715294578560.0, + "grad_norm": 1.486709371307985, + "language_loss": 0.74618089, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.82303441, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11035156, + "step": 8822, + "time_per_iteration": 2.528388261795044 + }, + { + "auxiliary_loss_clip": 0.06422541, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06275284, + "balance_loss_mlp": 0.01255094, + "epoch": 0.5304674582894935, + "flos": 27936059397120.0, + "grad_norm": 1.8362514047395362, + "language_loss": 0.67618608, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.75307631, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11401367, + "step": 8823, + "time_per_iteration": 3.9042444229125977 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269944, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5305275815421614, + "flos": 21256860337920.0, + "grad_norm": 1.7650443733670647, + "language_loss": 0.69634396, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.77329719, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11968994, + "step": 8824, + "time_per_iteration": 2.5146212577819824 + }, + { + "auxiliary_loss_clip": 0.06418572, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06275523, + "balance_loss_mlp": 0.01256292, + "epoch": 0.5305877047948294, + "flos": 17608867056000.0, + "grad_norm": 1.7570108593506664, + "language_loss": 0.76559019, + "learning_rate": 1.898977700702689e-06, + "loss": 0.84244382, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1050415, + "step": 8825, + "time_per_iteration": 2.4815242290496826 + }, + { + "auxiliary_loss_clip": 0.06420843, + "auxiliary_loss_mlp": 0.01268607, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01257335, + "epoch": 0.5306478280474973, + "flos": 15200947474560.0, + "grad_norm": 2.5706419514423526, + "language_loss": 0.85959315, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.93648767, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11279297, + "step": 8826, + "time_per_iteration": 3.921194076538086 + }, + { + "auxiliary_loss_clip": 0.06417906, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.01253759, + "epoch": 0.5307079513001653, + "flos": 15346660924800.0, + "grad_norm": 1.4506860249913964, + "language_loss": 0.64565361, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.72248203, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11181641, + "step": 8827, + "time_per_iteration": 2.4920613765716553 + }, + { + "auxiliary_loss_clip": 0.06420277, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01256961, + "epoch": 0.5307680745528333, + "flos": 43554567294720.0, + "grad_norm": 1.8307336922940562, + "language_loss": 0.59537661, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6722641, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11499023, + "step": 8828, + "time_per_iteration": 2.7917306423187256 + }, + { + "auxiliary_loss_clip": 0.06423927, + "auxiliary_loss_mlp": 0.012663, + "balance_loss_clip": 0.06272669, + "balance_loss_mlp": 0.01254725, + "epoch": 0.5308281978055013, + "flos": 20055332315520.0, + "grad_norm": 1.5709125682754386, + "language_loss": 0.81926584, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.89616817, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11578369, + "step": 8829, + "time_per_iteration": 2.606851100921631 + }, + { + "auxiliary_loss_clip": 0.06417149, + "auxiliary_loss_mlp": 0.01263824, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01253316, + "epoch": 0.5308883210581693, + "flos": 20710162373760.0, + "grad_norm": 1.3864012566435717, + "language_loss": 0.78353059, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.86034036, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1050415, + "step": 8830, + "time_per_iteration": 3.954951286315918 + }, + { + "auxiliary_loss_clip": 0.06420083, + "auxiliary_loss_mlp": 0.01268446, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.01256924, + "epoch": 0.5309484443108372, + "flos": 14360684330880.0, + "grad_norm": 2.11171769837039, + "language_loss": 0.81423479, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.89112008, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 8831, + "time_per_iteration": 2.469822883605957 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01266871, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5310085675635052, + "flos": 20016577002240.0, + "grad_norm": 1.695592927900533, + "language_loss": 0.73638004, + "learning_rate": 1.896255043672186e-06, + "loss": 0.81320393, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11071777, + "step": 8832, + "time_per_iteration": 2.527545213699341 + }, + { + "auxiliary_loss_clip": 0.06424195, + "auxiliary_loss_mlp": 0.01266175, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5310686908161731, + "flos": 22133824369920.0, + "grad_norm": 1.9494235860340738, + "language_loss": 0.75823116, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.83513486, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12341309, + "step": 8833, + "time_per_iteration": 2.497962236404419 + }, + { + "auxiliary_loss_clip": 0.06426589, + "auxiliary_loss_mlp": 0.01264835, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01252861, + "epoch": 0.5311288140688412, + "flos": 24724871049600.0, + "grad_norm": 1.6156023907192425, + "language_loss": 0.7400462, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.81696039, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11975098, + "step": 8834, + "time_per_iteration": 2.5790417194366455 + }, + { + "auxiliary_loss_clip": 0.06429796, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01254322, + "epoch": 0.5311889373215091, + "flos": 24104603600640.0, + "grad_norm": 1.6077843194652517, + "language_loss": 0.77900589, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.85597509, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12817383, + "step": 8835, + "time_per_iteration": 2.5299718379974365 + }, + { + "auxiliary_loss_clip": 0.06422241, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5312490605741771, + "flos": 22023386288640.0, + "grad_norm": 1.8854276384026003, + "language_loss": 0.72502893, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.80190396, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12115479, + "step": 8836, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06424102, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01255815, + "epoch": 0.531309183826845, + "flos": 19396561115520.0, + "grad_norm": 1.819661501339542, + "language_loss": 0.81157684, + "learning_rate": 1.894310406375987e-06, + "loss": 0.88850057, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12463379, + "step": 8837, + "time_per_iteration": 2.484968662261963 + }, + { + "auxiliary_loss_clip": 0.06418987, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06274254, + "balance_loss_mlp": 0.01255778, + "epoch": 0.531369307079513, + "flos": 20195679104640.0, + "grad_norm": 1.8987589865078431, + "language_loss": 0.86269474, + "learning_rate": 1.893921490881035e-06, + "loss": 0.93956232, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11981201, + "step": 8838, + "time_per_iteration": 3.9265315532684326 + }, + { + "auxiliary_loss_clip": 0.06418579, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253584, + "epoch": 0.5314294303321809, + "flos": 18886144769280.0, + "grad_norm": 1.6029216559450563, + "language_loss": 0.73087633, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.8077088, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11077881, + "step": 8839, + "time_per_iteration": 2.595414876937866 + }, + { + "auxiliary_loss_clip": 0.06421834, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01253551, + "epoch": 0.531489553584849, + "flos": 23046818457600.0, + "grad_norm": 1.6603149015146987, + "language_loss": 0.76847923, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.84535015, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11712646, + "step": 8840, + "time_per_iteration": 2.543708086013794 + }, + { + "auxiliary_loss_clip": 0.06426372, + "auxiliary_loss_mlp": 0.01267236, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01255291, + "epoch": 0.5315496768375169, + "flos": 19796329745280.0, + "grad_norm": 3.0684588696132553, + "language_loss": 0.7743901, + "learning_rate": 1.892754768590216e-06, + "loss": 0.85132617, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11932373, + "step": 8841, + "time_per_iteration": 2.5301966667175293 + }, + { + "auxiliary_loss_clip": 0.0631949, + "auxiliary_loss_mlp": 0.01253613, + "balance_loss_clip": 0.06256352, + "balance_loss_mlp": 0.01251976, + "epoch": 0.5316098000901849, + "flos": 71044876569600.0, + "grad_norm": 0.6765052539549429, + "language_loss": 0.56618965, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.64192069, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0164032, + "step": 8842, + "time_per_iteration": 3.2740724086761475 + }, + { + "auxiliary_loss_clip": 0.06425814, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06272734, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5316699233428529, + "flos": 16441146956160.0, + "grad_norm": 1.7388474755658287, + "language_loss": 0.73801279, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.81493276, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13317871, + "step": 8843, + "time_per_iteration": 2.5188851356506348 + }, + { + "auxiliary_loss_clip": 0.06319, + "auxiliary_loss_mlp": 0.01253092, + "balance_loss_clip": 0.06256077, + "balance_loss_mlp": 0.01251205, + "epoch": 0.5317300465955208, + "flos": 67443478957440.0, + "grad_norm": 0.8484317442594647, + "language_loss": 0.60991502, + "learning_rate": 1.891588082900145e-06, + "loss": 0.68563592, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01882935, + "step": 8844, + "time_per_iteration": 3.1943981647491455 + }, + { + "auxiliary_loss_clip": 0.06316474, + "auxiliary_loss_mlp": 0.01252227, + "balance_loss_clip": 0.06253788, + "balance_loss_mlp": 0.01250519, + "epoch": 0.5317901698481888, + "flos": 59524095144960.0, + "grad_norm": 0.8355266908782794, + "language_loss": 0.62249273, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.69817972, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.62744141, + "router_z_loss_mlp": 0.01712036, + "step": 8845, + "time_per_iteration": 3.149904727935791 + }, + { + "auxiliary_loss_clip": 0.06421602, + "auxiliary_loss_mlp": 0.01271191, + "balance_loss_clip": 0.06273656, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5318502931008567, + "flos": 19134204382080.0, + "grad_norm": 1.8837935046538667, + "language_loss": 0.7569865, + "learning_rate": 1.890810312970474e-06, + "loss": 0.8339144, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12774658, + "step": 8846, + "time_per_iteration": 2.5158872604370117 + }, + { + "auxiliary_loss_clip": 0.0642429, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06273554, + "balance_loss_mlp": 0.01256838, + "epoch": 0.5319104163535248, + "flos": 24687960526080.0, + "grad_norm": 1.6867562646607668, + "language_loss": 0.75546432, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.83238477, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10913086, + "step": 8847, + "time_per_iteration": 2.5634870529174805 + }, + { + "auxiliary_loss_clip": 0.06415805, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269352, + "balance_loss_mlp": 0.01254823, + "epoch": 0.5319705396061927, + "flos": 19390691329920.0, + "grad_norm": 1.5354205561883685, + "language_loss": 0.87653261, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.95335042, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1114502, + "step": 8848, + "time_per_iteration": 2.4771876335144043 + }, + { + "auxiliary_loss_clip": 0.06423473, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01261564, + "epoch": 0.5320306628588607, + "flos": 18265122633600.0, + "grad_norm": 1.744694135662772, + "language_loss": 0.74510658, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.82208717, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13012695, + "step": 8849, + "time_per_iteration": 2.5036580562591553 + }, + { + "auxiliary_loss_clip": 0.06429593, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06274542, + "balance_loss_mlp": 0.01253761, + "epoch": 0.5320907861115286, + "flos": 23739062163840.0, + "grad_norm": 1.9586489533772713, + "language_loss": 0.79968703, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.87663901, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11853027, + "step": 8850, + "time_per_iteration": 2.5143027305603027 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06276459, + "balance_loss_mlp": 0.01254086, + "epoch": 0.5321509093641966, + "flos": 34503730272000.0, + "grad_norm": 1.273724424531188, + "language_loss": 0.55058682, + "learning_rate": 1.888865960862821e-06, + "loss": 0.62749517, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.1071167, + "step": 8851, + "time_per_iteration": 2.6221299171447754 + }, + { + "auxiliary_loss_clip": 0.06426491, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255844, + "epoch": 0.5322110326168645, + "flos": 20017080126720.0, + "grad_norm": 1.7230657412679744, + "language_loss": 0.69354177, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.77048028, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11517334, + "step": 8852, + "time_per_iteration": 2.483614206314087 + }, + { + "auxiliary_loss_clip": 0.06316812, + "auxiliary_loss_mlp": 0.01252104, + "balance_loss_clip": 0.06254005, + "balance_loss_mlp": 0.01250446, + "epoch": 0.5322711558695326, + "flos": 64650563792640.0, + "grad_norm": 0.7839220079179184, + "language_loss": 0.62548178, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.70117098, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01661682, + "step": 8853, + "time_per_iteration": 3.085580587387085 + }, + { + "auxiliary_loss_clip": 0.06429263, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06274428, + "balance_loss_mlp": 0.01256364, + "epoch": 0.5323312791222005, + "flos": 14944628234880.0, + "grad_norm": 2.314845805246822, + "language_loss": 0.79806542, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.87503386, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.11212158, + "step": 8854, + "time_per_iteration": 2.5530436038970947 + }, + { + "auxiliary_loss_clip": 0.06415577, + "auxiliary_loss_mlp": 0.01266542, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.0125663, + "epoch": 0.5323914023748685, + "flos": 23447593336320.0, + "grad_norm": 2.5938972527955038, + "language_loss": 0.74205482, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.81887597, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.09912109, + "step": 8855, + "time_per_iteration": 2.527981996536255 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.01263629, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5324515256275365, + "flos": 26293324101120.0, + "grad_norm": 4.18366969320272, + "language_loss": 0.64945328, + "learning_rate": 1.886921714110507e-06, + "loss": 0.72628403, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.10266113, + "step": 8856, + "time_per_iteration": 2.5942611694335938 + }, + { + "auxiliary_loss_clip": 0.06428003, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274043, + "balance_loss_mlp": 0.01255177, + "epoch": 0.5325116488802044, + "flos": 26878316181120.0, + "grad_norm": 1.8445625051613121, + "language_loss": 0.77944165, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.85639572, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12231445, + "step": 8857, + "time_per_iteration": 2.551980972290039 + }, + { + "auxiliary_loss_clip": 0.06420985, + "auxiliary_loss_mlp": 0.01266182, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5325717721328724, + "flos": 25891794535680.0, + "grad_norm": 1.6903303041385833, + "language_loss": 0.71116436, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.78803611, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11749268, + "step": 8858, + "time_per_iteration": 2.564082384109497 + }, + { + "auxiliary_loss_clip": 0.0642374, + "auxiliary_loss_mlp": 0.01268133, + "balance_loss_clip": 0.06274494, + "balance_loss_mlp": 0.01255968, + "epoch": 0.5326318953855403, + "flos": 21805864289280.0, + "grad_norm": 3.8992078644613217, + "language_loss": 0.69476694, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.77168566, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12158203, + "step": 8859, + "time_per_iteration": 2.5558056831359863 + }, + { + "auxiliary_loss_clip": 0.06418291, + "auxiliary_loss_mlp": 0.01266588, + "balance_loss_clip": 0.06275187, + "balance_loss_mlp": 0.0125624, + "epoch": 0.5326920186382084, + "flos": 20929193746560.0, + "grad_norm": 1.4322040270296341, + "language_loss": 0.69681478, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.77366364, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10339355, + "step": 8860, + "time_per_iteration": 2.5150671005249023 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01266208, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01255259, + "epoch": 0.5327521418908763, + "flos": 21439735873920.0, + "grad_norm": 1.9652920134152139, + "language_loss": 0.77936381, + "learning_rate": 1.884977574556683e-06, + "loss": 0.85622478, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10955811, + "step": 8861, + "time_per_iteration": 2.527064561843872 + }, + { + "auxiliary_loss_clip": 0.06428909, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.012579, + "epoch": 0.5328122651435443, + "flos": 21766354289280.0, + "grad_norm": 1.487259241409864, + "language_loss": 0.8585394, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.93552685, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11938477, + "step": 8862, + "time_per_iteration": 4.031865358352661 + }, + { + "auxiliary_loss_clip": 0.06431703, + "auxiliary_loss_mlp": 0.01269915, + "balance_loss_clip": 0.06279312, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5328723883962122, + "flos": 18302410500480.0, + "grad_norm": 1.6037650471474167, + "language_loss": 0.61557126, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.69258749, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12866211, + "step": 8863, + "time_per_iteration": 2.499657154083252 + }, + { + "auxiliary_loss_clip": 0.06422713, + "auxiliary_loss_mlp": 0.01268054, + "balance_loss_clip": 0.06278422, + "balance_loss_mlp": 0.01257736, + "epoch": 0.5329325116488802, + "flos": 25382049022080.0, + "grad_norm": 1.8448114340212167, + "language_loss": 0.73693913, + "learning_rate": 1.883811143046377e-06, + "loss": 0.81384677, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10314941, + "step": 8864, + "time_per_iteration": 2.549104928970337 + }, + { + "auxiliary_loss_clip": 0.06424475, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06276639, + "balance_loss_mlp": 0.0125636, + "epoch": 0.5329926349015481, + "flos": 25598984042880.0, + "grad_norm": 1.865165386122464, + "language_loss": 0.64464402, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.72156298, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11065674, + "step": 8865, + "time_per_iteration": 4.099254608154297 + }, + { + "auxiliary_loss_clip": 0.0642702, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06277309, + "balance_loss_mlp": 0.01257874, + "epoch": 0.5330527581542162, + "flos": 22895612565120.0, + "grad_norm": 1.6799514905357744, + "language_loss": 0.78778207, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.86474454, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11346436, + "step": 8866, + "time_per_iteration": 2.505974531173706 + }, + { + "auxiliary_loss_clip": 0.06424611, + "auxiliary_loss_mlp": 0.01266962, + "balance_loss_clip": 0.06276287, + "balance_loss_mlp": 0.01255333, + "epoch": 0.5331128814068841, + "flos": 16031022347520.0, + "grad_norm": 1.850684934112151, + "language_loss": 0.74175781, + "learning_rate": 1.882644751189108e-06, + "loss": 0.81867361, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11633301, + "step": 8867, + "time_per_iteration": 2.5437192916870117 + }, + { + "auxiliary_loss_clip": 0.0642608, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06276974, + "balance_loss_mlp": 0.01254204, + "epoch": 0.5331730046595521, + "flos": 39353461211520.0, + "grad_norm": 1.4678278533937592, + "language_loss": 0.72377831, + "learning_rate": 1.88225596278394e-06, + "loss": 0.80070472, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12353516, + "step": 8868, + "time_per_iteration": 2.6680116653442383 + }, + { + "auxiliary_loss_clip": 0.06425264, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01258345, + "epoch": 0.5332331279122201, + "flos": 24031201824000.0, + "grad_norm": 1.7262272651388555, + "language_loss": 0.78884375, + "learning_rate": 1.881867178843637e-06, + "loss": 0.86578989, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11016846, + "step": 8869, + "time_per_iteration": 3.9937024116516113 + }, + { + "auxiliary_loss_clip": 0.06438692, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06282986, + "balance_loss_mlp": 0.01255434, + "epoch": 0.533293251164888, + "flos": 17135109671040.0, + "grad_norm": 2.017265080243192, + "language_loss": 0.7622692, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.83933091, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1204834, + "step": 8870, + "time_per_iteration": 2.520585536956787 + }, + { + "auxiliary_loss_clip": 0.06435512, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06280903, + "balance_loss_mlp": 0.01260366, + "epoch": 0.533353374417556, + "flos": 22132734266880.0, + "grad_norm": 2.1166188019250316, + "language_loss": 0.76185441, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.83894014, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12713623, + "step": 8871, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.06427529, + "auxiliary_loss_mlp": 0.01272588, + "balance_loss_clip": 0.06279083, + "balance_loss_mlp": 0.01261383, + "epoch": 0.533413497670224, + "flos": 15016185221760.0, + "grad_norm": 1.8709318225271354, + "language_loss": 0.72608036, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11206055, + "step": 8872, + "time_per_iteration": 2.486344337463379 + }, + { + "auxiliary_loss_clip": 0.06426945, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258925, + "epoch": 0.533473620922892, + "flos": 19616095612800.0, + "grad_norm": 1.6405410033387824, + "language_loss": 0.65059078, + "learning_rate": 1.880312088025936e-06, + "loss": 0.72756892, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11956787, + "step": 8873, + "time_per_iteration": 2.4989571571350098 + }, + { + "auxiliary_loss_clip": 0.06430013, + "auxiliary_loss_mlp": 0.01270669, + "balance_loss_clip": 0.06281542, + "balance_loss_mlp": 0.01260113, + "epoch": 0.5335337441755599, + "flos": 14287827605760.0, + "grad_norm": 2.154155286859053, + "language_loss": 0.80397201, + "learning_rate": 1.879923326631099e-06, + "loss": 0.88097882, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.10559082, + "step": 8874, + "time_per_iteration": 2.5248029232025146 + }, + { + "auxiliary_loss_clip": 0.06429289, + "auxiliary_loss_mlp": 0.01270488, + "balance_loss_clip": 0.06281012, + "balance_loss_mlp": 0.01259306, + "epoch": 0.5335938674282279, + "flos": 20821313214720.0, + "grad_norm": 1.9252791788754828, + "language_loss": 0.70199001, + "learning_rate": 1.879534569789582e-06, + "loss": 0.77898782, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11181641, + "step": 8875, + "time_per_iteration": 2.514606475830078 + }, + { + "auxiliary_loss_clip": 0.06327371, + "auxiliary_loss_mlp": 0.01252854, + "balance_loss_clip": 0.06264151, + "balance_loss_mlp": 0.01251167, + "epoch": 0.5336539906808958, + "flos": 71419558101120.0, + "grad_norm": 0.7076326652144627, + "language_loss": 0.59621203, + "learning_rate": 1.879145817516126e-06, + "loss": 0.6720143, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01690674, + "step": 8876, + "time_per_iteration": 3.2623958587646484 + }, + { + "auxiliary_loss_clip": 0.06431912, + "auxiliary_loss_mlp": 0.0127027, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01259833, + "epoch": 0.5337141139335638, + "flos": 20158517018880.0, + "grad_norm": 1.761940945107411, + "language_loss": 0.75235462, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.8293764, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10437012, + "step": 8877, + "time_per_iteration": 4.019563674926758 + }, + { + "auxiliary_loss_clip": 0.06329054, + "auxiliary_loss_mlp": 0.01254827, + "balance_loss_clip": 0.06265914, + "balance_loss_mlp": 0.01253019, + "epoch": 0.5337742371862317, + "flos": 67747624479360.0, + "grad_norm": 0.7353643225564799, + "language_loss": 0.57172877, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.64756757, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01803589, + "step": 8878, + "time_per_iteration": 3.0581912994384766 + }, + { + "auxiliary_loss_clip": 0.06440037, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01260573, + "epoch": 0.5338343604388998, + "flos": 25015794825600.0, + "grad_norm": 1.5270572668187339, + "language_loss": 0.7260288, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.80315328, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11834717, + "step": 8879, + "time_per_iteration": 2.594075918197632 + }, + { + "auxiliary_loss_clip": 0.06432897, + "auxiliary_loss_mlp": 0.01271434, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01259644, + "epoch": 0.5338944836915677, + "flos": 17606728776960.0, + "grad_norm": 2.8683921774089445, + "language_loss": 0.84095323, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.91799653, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11785889, + "step": 8880, + "time_per_iteration": 2.4828426837921143 + }, + { + "auxiliary_loss_clip": 0.06424058, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06279065, + "balance_loss_mlp": 0.01262277, + "epoch": 0.5339546069442357, + "flos": 21730282306560.0, + "grad_norm": 1.3465483600758703, + "language_loss": 0.79582727, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.87279797, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1072998, + "step": 8881, + "time_per_iteration": 2.5683958530426025 + }, + { + "auxiliary_loss_clip": 0.06324948, + "auxiliary_loss_mlp": 0.01252734, + "balance_loss_clip": 0.06261811, + "balance_loss_mlp": 0.01251199, + "epoch": 0.5340147301969036, + "flos": 69741226748160.0, + "grad_norm": 0.7871410050477539, + "language_loss": 0.5924378, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.66821468, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01533508, + "step": 8882, + "time_per_iteration": 3.0768346786499023 + }, + { + "auxiliary_loss_clip": 0.06325522, + "auxiliary_loss_mlp": 0.01253695, + "balance_loss_clip": 0.06262392, + "balance_loss_mlp": 0.01252035, + "epoch": 0.5340748534495716, + "flos": 63896504901120.0, + "grad_norm": 0.885852476410532, + "language_loss": 0.63786471, + "learning_rate": 1.876424680745913e-06, + "loss": 0.7136569, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01663208, + "step": 8883, + "time_per_iteration": 2.967287063598633 + }, + { + "auxiliary_loss_clip": 0.06432307, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5341349767022396, + "flos": 28701872588160.0, + "grad_norm": 2.199844959316804, + "language_loss": 0.82043612, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.89743072, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12200928, + "step": 8884, + "time_per_iteration": 2.5675361156463623 + }, + { + "auxiliary_loss_clip": 0.06425676, + "auxiliary_loss_mlp": 0.01268668, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01257873, + "epoch": 0.5341950999549075, + "flos": 16295265797760.0, + "grad_norm": 1.5488539614491517, + "language_loss": 0.72820723, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.80515063, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10784912, + "step": 8885, + "time_per_iteration": 2.5164196491241455 + }, + { + "auxiliary_loss_clip": 0.06432982, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06277923, + "balance_loss_mlp": 0.01254525, + "epoch": 0.5342552232075756, + "flos": 14360852039040.0, + "grad_norm": 1.8494222651114738, + "language_loss": 0.78934276, + "learning_rate": 1.87525854926798e-06, + "loss": 0.86633611, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11834717, + "step": 8886, + "time_per_iteration": 2.524366855621338 + }, + { + "auxiliary_loss_clip": 0.06429981, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06279354, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5343153464602435, + "flos": 30305517154560.0, + "grad_norm": 1.3913460534471052, + "language_loss": 0.75135863, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.82834035, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12579346, + "step": 8887, + "time_per_iteration": 2.6564323902130127 + }, + { + "auxiliary_loss_clip": 0.06427558, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06278411, + "balance_loss_mlp": 0.0125401, + "epoch": 0.5343754697129115, + "flos": 15601722353280.0, + "grad_norm": 2.357980716065106, + "language_loss": 0.69295096, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.76988232, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11560059, + "step": 8888, + "time_per_iteration": 2.4917025566101074 + }, + { + "auxiliary_loss_clip": 0.06442724, + "auxiliary_loss_mlp": 0.01272933, + "balance_loss_clip": 0.06283408, + "balance_loss_mlp": 0.01260935, + "epoch": 0.5344355929655794, + "flos": 16915239757440.0, + "grad_norm": 1.9387999695924976, + "language_loss": 0.78584576, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.8630023, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12005615, + "step": 8889, + "time_per_iteration": 2.5028741359710693 + }, + { + "auxiliary_loss_clip": 0.06424284, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06276136, + "balance_loss_mlp": 0.01256431, + "epoch": 0.5344957162182474, + "flos": 16803460010880.0, + "grad_norm": 1.9089962398127316, + "language_loss": 0.69733131, + "learning_rate": 1.873703773589102e-06, + "loss": 0.7742579, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 8890, + "time_per_iteration": 2.4705469608306885 + }, + { + "auxiliary_loss_clip": 0.06430273, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01255635, + "epoch": 0.5345558394709153, + "flos": 12709144356480.0, + "grad_norm": 3.2953855429591536, + "language_loss": 0.77688992, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.85387087, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12182617, + "step": 8891, + "time_per_iteration": 2.500333547592163 + }, + { + "auxiliary_loss_clip": 0.06428199, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06281698, + "balance_loss_mlp": 0.01257486, + "epoch": 0.5346159627235834, + "flos": 22461532888320.0, + "grad_norm": 1.516620120390114, + "language_loss": 0.74519014, + "learning_rate": 1.872926414425699e-06, + "loss": 0.82215786, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 8892, + "time_per_iteration": 2.4968128204345703 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01253566, + "epoch": 0.5346760859762513, + "flos": 22421771326080.0, + "grad_norm": 1.6631056082688196, + "language_loss": 0.87902844, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.95594442, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.10742188, + "step": 8893, + "time_per_iteration": 2.5580215454101562 + }, + { + "auxiliary_loss_clip": 0.06429157, + "auxiliary_loss_mlp": 0.01263801, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01253155, + "epoch": 0.5347362092289193, + "flos": 22822043080320.0, + "grad_norm": 1.612055893952936, + "language_loss": 0.72799695, + "learning_rate": 1.872149074536869e-06, + "loss": 0.80492651, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10650635, + "step": 8894, + "time_per_iteration": 2.54834246635437 + }, + { + "auxiliary_loss_clip": 0.06422012, + "auxiliary_loss_mlp": 0.01266432, + "balance_loss_clip": 0.06275687, + "balance_loss_mlp": 0.01254571, + "epoch": 0.5347963324815872, + "flos": 23225794778880.0, + "grad_norm": 1.4320398201671862, + "language_loss": 0.75047934, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.82736373, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11865234, + "step": 8895, + "time_per_iteration": 2.5309391021728516 + }, + { + "auxiliary_loss_clip": 0.06432986, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5348564557342552, + "flos": 22607917171200.0, + "grad_norm": 1.7183644079473714, + "language_loss": 0.77449572, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.8514936, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11181641, + "step": 8896, + "time_per_iteration": 2.5175390243530273 + }, + { + "auxiliary_loss_clip": 0.06424737, + "auxiliary_loss_mlp": 0.01267928, + "balance_loss_clip": 0.06278285, + "balance_loss_mlp": 0.01256639, + "epoch": 0.5349165789869232, + "flos": 18007880999040.0, + "grad_norm": 1.7578614055599853, + "language_loss": 0.79043764, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.86736429, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11297607, + "step": 8897, + "time_per_iteration": 2.5068724155426025 + }, + { + "auxiliary_loss_clip": 0.06429999, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253365, + "epoch": 0.5349767022395912, + "flos": 17164557181440.0, + "grad_norm": 1.7104987912832146, + "language_loss": 0.76011693, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.83706623, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11566162, + "step": 8898, + "time_per_iteration": 2.5468573570251465 + }, + { + "auxiliary_loss_clip": 0.06323466, + "auxiliary_loss_mlp": 0.01262304, + "balance_loss_clip": 0.06260733, + "balance_loss_mlp": 0.01260944, + "epoch": 0.5350368254922592, + "flos": 71014590518400.0, + "grad_norm": 0.8026406428525971, + "language_loss": 0.57916105, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.65501881, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01361847, + "step": 8899, + "time_per_iteration": 3.354367256164551 + }, + { + "auxiliary_loss_clip": 0.06428243, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01255857, + "epoch": 0.5350969487449271, + "flos": 27425265707520.0, + "grad_norm": 1.5056303351191316, + "language_loss": 0.70071346, + "learning_rate": 1.869817171696868e-06, + "loss": 0.77766323, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10882568, + "step": 8900, + "time_per_iteration": 2.596675395965576 + }, + { + "auxiliary_loss_clip": 0.0643241, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06280074, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5351570719975951, + "flos": 19321901527680.0, + "grad_norm": 1.5148336766284718, + "language_loss": 0.71324182, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.79025364, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11346436, + "step": 8901, + "time_per_iteration": 2.526811122894287 + }, + { + "auxiliary_loss_clip": 0.06432061, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06280375, + "balance_loss_mlp": 0.01257377, + "epoch": 0.535217195250263, + "flos": 19834707715200.0, + "grad_norm": 1.961594084549487, + "language_loss": 0.77373689, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.85075164, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1204834, + "step": 8902, + "time_per_iteration": 3.931328773498535 + }, + { + "auxiliary_loss_clip": 0.06422594, + "auxiliary_loss_mlp": 0.01261364, + "balance_loss_clip": 0.0627951, + "balance_loss_mlp": 0.01250188, + "epoch": 0.535277318502931, + "flos": 22134495202560.0, + "grad_norm": 1.5214881410098744, + "language_loss": 0.7052539, + "learning_rate": 1.868651286721281e-06, + "loss": 0.78209347, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1116333, + "step": 8903, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06433277, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.01255426, + "epoch": 0.5353374417555989, + "flos": 25052873057280.0, + "grad_norm": 1.5307499252390009, + "language_loss": 0.72374737, + "learning_rate": 1.86826266833795e-06, + "loss": 0.80075729, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12304688, + "step": 8904, + "time_per_iteration": 3.979325294494629 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06280231, + "balance_loss_mlp": 0.01257961, + "epoch": 0.535397565008267, + "flos": 19394422836480.0, + "grad_norm": 1.7887132092295748, + "language_loss": 0.73359382, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.81059849, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.121521, + "step": 8905, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.06426303, + "auxiliary_loss_mlp": 0.01263381, + "balance_loss_clip": 0.06282683, + "balance_loss_mlp": 0.01252402, + "epoch": 0.5354576882609349, + "flos": 21477736500480.0, + "grad_norm": 1.458955847450215, + "language_loss": 0.83904094, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.91593778, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10980225, + "step": 8906, + "time_per_iteration": 2.5199477672576904 + }, + { + "auxiliary_loss_clip": 0.06430362, + "auxiliary_loss_mlp": 0.01270808, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01258416, + "epoch": 0.5355178115136029, + "flos": 20783857639680.0, + "grad_norm": 1.893504710630849, + "language_loss": 0.74486792, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.82187963, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.1237793, + "step": 8907, + "time_per_iteration": 2.5200021266937256 + }, + { + "auxiliary_loss_clip": 0.06428273, + "auxiliary_loss_mlp": 0.01264992, + "balance_loss_clip": 0.06280483, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5355779347662708, + "flos": 23520827404800.0, + "grad_norm": 1.6955230805298804, + "language_loss": 0.76706243, + "learning_rate": 1.866708244906912e-06, + "loss": 0.84399509, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10772705, + "step": 8908, + "time_per_iteration": 4.040110349655151 + }, + { + "auxiliary_loss_clip": 0.06432807, + "auxiliary_loss_mlp": 0.01271179, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01258835, + "epoch": 0.5356380580189388, + "flos": 20309471349120.0, + "grad_norm": 2.626231250487559, + "language_loss": 0.74318033, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.82022017, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12347412, + "step": 8909, + "time_per_iteration": 2.503324031829834 + }, + { + "auxiliary_loss_clip": 0.06428281, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06279926, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5356981812716068, + "flos": 21368136960000.0, + "grad_norm": 2.2429477917403435, + "language_loss": 0.84013373, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.91709697, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10803223, + "step": 8910, + "time_per_iteration": 2.532768726348877 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01267044, + "balance_loss_clip": 0.06278617, + "balance_loss_mlp": 0.01255152, + "epoch": 0.5357583045242748, + "flos": 23117746538880.0, + "grad_norm": 1.5068539432144845, + "language_loss": 0.82170522, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.89866459, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11895752, + "step": 8911, + "time_per_iteration": 2.530242681503296 + }, + { + "auxiliary_loss_clip": 0.06427851, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01256794, + "epoch": 0.5358184277769428, + "flos": 21148057411200.0, + "grad_norm": 1.7566097539058134, + "language_loss": 0.6953544, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.7723152, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11425781, + "step": 8912, + "time_per_iteration": 2.52546763420105 + }, + { + "auxiliary_loss_clip": 0.06429117, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06281352, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5358785510296107, + "flos": 16286754608640.0, + "grad_norm": 1.7988140692342254, + "language_loss": 0.71504682, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.79199886, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10858154, + "step": 8913, + "time_per_iteration": 2.4723551273345947 + }, + { + "auxiliary_loss_clip": 0.06437049, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06283163, + "balance_loss_mlp": 0.01257883, + "epoch": 0.5359386742822787, + "flos": 16981555864320.0, + "grad_norm": 1.6333944745256754, + "language_loss": 0.72038394, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7974509, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11761475, + "step": 8914, + "time_per_iteration": 2.5807461738586426 + }, + { + "auxiliary_loss_clip": 0.06438086, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01259327, + "epoch": 0.5359987975349466, + "flos": 20819091081600.0, + "grad_norm": 1.7157890571158112, + "language_loss": 0.706487, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.7835896, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12841797, + "step": 8915, + "time_per_iteration": 2.542787790298462 + }, + { + "auxiliary_loss_clip": 0.06428587, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06281634, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5360589207876146, + "flos": 22206429532800.0, + "grad_norm": 1.674776865577312, + "language_loss": 0.75600839, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.83298731, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11383057, + "step": 8916, + "time_per_iteration": 2.5621731281280518 + }, + { + "auxiliary_loss_clip": 0.06429151, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01256995, + "epoch": 0.5361190440402825, + "flos": 31402393027200.0, + "grad_norm": 2.5448267428400655, + "language_loss": 0.72810572, + "learning_rate": 1.863211089308289e-06, + "loss": 0.80508238, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.1151123, + "step": 8917, + "time_per_iteration": 4.027824401855469 + }, + { + "auxiliary_loss_clip": 0.06433325, + "auxiliary_loss_mlp": 0.01268717, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01257195, + "epoch": 0.5361791672929506, + "flos": 16075270103040.0, + "grad_norm": 1.844905450054995, + "language_loss": 0.71658254, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.793603, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11529541, + "step": 8918, + "time_per_iteration": 2.5032598972320557 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06282899, + "balance_loss_mlp": 0.01258933, + "epoch": 0.5362392905456185, + "flos": 20747240605440.0, + "grad_norm": 1.4549229797282903, + "language_loss": 0.75235254, + "learning_rate": 1.862434000299067e-06, + "loss": 0.82937205, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11383057, + "step": 8919, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06430984, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06280042, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5362994137982865, + "flos": 17344539751680.0, + "grad_norm": 10.323313850773834, + "language_loss": 0.71843415, + "learning_rate": 1.862045463611864e-06, + "loss": 0.79540908, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11254883, + "step": 8920, + "time_per_iteration": 2.481144666671753 + }, + { + "auxiliary_loss_clip": 0.06425787, + "auxiliary_loss_mlp": 0.0126502, + "balance_loss_clip": 0.06276651, + "balance_loss_mlp": 0.01253659, + "epoch": 0.5363595370509544, + "flos": 42823819837440.0, + "grad_norm": 1.3389140049198536, + "language_loss": 0.68970168, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.76660967, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11352539, + "step": 8921, + "time_per_iteration": 2.7377495765686035 + }, + { + "auxiliary_loss_clip": 0.06429093, + "auxiliary_loss_mlp": 0.01267258, + "balance_loss_clip": 0.06280531, + "balance_loss_mlp": 0.01255575, + "epoch": 0.5364196603036224, + "flos": 19177990940160.0, + "grad_norm": 2.2769865828018516, + "language_loss": 0.81912661, + "learning_rate": 1.86126840594594e-06, + "loss": 0.89609009, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11676025, + "step": 8922, + "time_per_iteration": 2.491041660308838 + }, + { + "auxiliary_loss_clip": 0.06431051, + "auxiliary_loss_mlp": 0.01267721, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5364797835562904, + "flos": 17936827136640.0, + "grad_norm": 1.913279005224502, + "language_loss": 0.76818264, + "learning_rate": 1.860879884996686e-06, + "loss": 0.84517032, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11175537, + "step": 8923, + "time_per_iteration": 2.502797842025757 + }, + { + "auxiliary_loss_clip": 0.06430578, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06277579, + "balance_loss_mlp": 0.01257052, + "epoch": 0.5365399068089584, + "flos": 30236098446720.0, + "grad_norm": 1.4167756526815838, + "language_loss": 0.70506531, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.78205955, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11791992, + "step": 8924, + "time_per_iteration": 2.5783135890960693 + }, + { + "auxiliary_loss_clip": 0.06433783, + "auxiliary_loss_mlp": 0.01269029, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5366000300616264, + "flos": 24897264825600.0, + "grad_norm": 2.5342740284522516, + "language_loss": 0.87064564, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.9476738, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12231445, + "step": 8925, + "time_per_iteration": 2.555947780609131 + }, + { + "auxiliary_loss_clip": 0.0643315, + "auxiliary_loss_mlp": 0.012686, + "balance_loss_clip": 0.06278683, + "balance_loss_mlp": 0.01256911, + "epoch": 0.5366601533142943, + "flos": 29834610808320.0, + "grad_norm": 1.6615305931190325, + "language_loss": 0.78511882, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.86213624, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11694336, + "step": 8926, + "time_per_iteration": 2.575540781021118 + }, + { + "auxiliary_loss_clip": 0.06420288, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06276788, + "balance_loss_mlp": 0.0125437, + "epoch": 0.5367202765669623, + "flos": 27206821313280.0, + "grad_norm": 1.3335091711279083, + "language_loss": 0.66572356, + "learning_rate": 1.85932585410148e-06, + "loss": 0.74258018, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11004639, + "step": 8927, + "time_per_iteration": 2.574263572692871 + }, + { + "auxiliary_loss_clip": 0.06429082, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125309, + "epoch": 0.5367803998196302, + "flos": 20236153426560.0, + "grad_norm": 1.7727091217622297, + "language_loss": 0.73473167, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.81166756, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11413574, + "step": 8928, + "time_per_iteration": 2.4792275428771973 + }, + { + "auxiliary_loss_clip": 0.06429128, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06278329, + "balance_loss_mlp": 0.01254609, + "epoch": 0.5368405230722982, + "flos": 32161791381120.0, + "grad_norm": 1.7479222402462038, + "language_loss": 0.62972343, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.70666999, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10919189, + "step": 8929, + "time_per_iteration": 2.622292995452881 + }, + { + "auxiliary_loss_clip": 0.06432647, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.0628202, + "balance_loss_mlp": 0.01254433, + "epoch": 0.5369006463249661, + "flos": 26254778423040.0, + "grad_norm": 1.591710131173975, + "language_loss": 0.66400939, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.74098849, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10845947, + "step": 8930, + "time_per_iteration": 2.543949604034424 + }, + { + "auxiliary_loss_clip": 0.06424774, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06279226, + "balance_loss_mlp": 0.01253299, + "epoch": 0.5369607695776342, + "flos": 26218119461760.0, + "grad_norm": 1.4676781117198738, + "language_loss": 0.67308921, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.74998057, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1105957, + "step": 8931, + "time_per_iteration": 2.5630295276641846 + }, + { + "auxiliary_loss_clip": 0.06432625, + "auxiliary_loss_mlp": 0.01268662, + "balance_loss_clip": 0.0628577, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5370208928303021, + "flos": 25015920606720.0, + "grad_norm": 1.565512656212007, + "language_loss": 0.76494187, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.84195477, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12072754, + "step": 8932, + "time_per_iteration": 2.5423011779785156 + }, + { + "auxiliary_loss_clip": 0.0642775, + "auxiliary_loss_mlp": 0.01267942, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.01255723, + "epoch": 0.5370810160829701, + "flos": 31799646034560.0, + "grad_norm": 1.681669184165067, + "language_loss": 0.66588402, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.74284095, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12219238, + "step": 8933, + "time_per_iteration": 2.6461243629455566 + }, + { + "auxiliary_loss_clip": 0.0642833, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.01256515, + "epoch": 0.537141139335638, + "flos": 23849500245120.0, + "grad_norm": 1.5934461108199862, + "language_loss": 0.83294082, + "learning_rate": 1.856606505975565e-06, + "loss": 0.90990818, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 8934, + "time_per_iteration": 2.5241549015045166 + }, + { + "auxiliary_loss_clip": 0.06428687, + "auxiliary_loss_mlp": 0.01267543, + "balance_loss_clip": 0.06283442, + "balance_loss_mlp": 0.01256033, + "epoch": 0.537201262588306, + "flos": 18513685370880.0, + "grad_norm": 1.6222709830765285, + "language_loss": 0.7995823, + "learning_rate": 1.856218049303999e-06, + "loss": 0.87654459, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11517334, + "step": 8935, + "time_per_iteration": 2.5692355632781982 + }, + { + "auxiliary_loss_clip": 0.06432107, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01253556, + "epoch": 0.537261385840974, + "flos": 25669492853760.0, + "grad_norm": 4.395420873174801, + "language_loss": 0.83744997, + "learning_rate": 1.855829598084659e-06, + "loss": 0.91442859, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12200928, + "step": 8936, + "time_per_iteration": 2.53723406791687 + }, + { + "auxiliary_loss_clip": 0.06430986, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06284051, + "balance_loss_mlp": 0.01255458, + "epoch": 0.537321509093642, + "flos": 40744656950400.0, + "grad_norm": 1.238966659536207, + "language_loss": 0.73065245, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.8076278, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11096191, + "step": 8937, + "time_per_iteration": 2.7185041904449463 + }, + { + "auxiliary_loss_clip": 0.06432244, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.01257591, + "epoch": 0.53738163234631, + "flos": 17244248014080.0, + "grad_norm": 2.3423795733880506, + "language_loss": 0.82399505, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.90100974, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11645508, + "step": 8938, + "time_per_iteration": 2.497788906097412 + }, + { + "auxiliary_loss_clip": 0.06440363, + "auxiliary_loss_mlp": 0.01269336, + "balance_loss_clip": 0.06284846, + "balance_loss_mlp": 0.01257505, + "epoch": 0.5374417555989779, + "flos": 12826710034560.0, + "grad_norm": 2.237788663184982, + "language_loss": 0.80566859, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.88276565, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.1184082, + "step": 8939, + "time_per_iteration": 2.506603479385376 + }, + { + "auxiliary_loss_clip": 0.06330699, + "auxiliary_loss_mlp": 0.01256495, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5375018788516459, + "flos": 67275502248960.0, + "grad_norm": 0.6889137998662954, + "language_loss": 0.5233649, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.59923685, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01609802, + "step": 8940, + "time_per_iteration": 3.1455881595611572 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06280527, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5375620021043138, + "flos": 18120080016000.0, + "grad_norm": 1.7572331791906293, + "language_loss": 0.71456778, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.7914663, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1083374, + "step": 8941, + "time_per_iteration": 3.9169673919677734 + }, + { + "auxiliary_loss_clip": 0.06423429, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01256554, + "epoch": 0.5376221253569818, + "flos": 23156166435840.0, + "grad_norm": 1.5985240277338788, + "language_loss": 0.79660439, + "learning_rate": 1.853499006090237e-06, + "loss": 0.87350607, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10174561, + "step": 8942, + "time_per_iteration": 2.5441763401031494 + }, + { + "auxiliary_loss_clip": 0.06433077, + "auxiliary_loss_mlp": 0.01269882, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01258229, + "epoch": 0.5376822486096497, + "flos": 29980240404480.0, + "grad_norm": 1.695957968467341, + "language_loss": 0.7061829, + "learning_rate": 1.853110593448911e-06, + "loss": 0.78321248, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11645508, + "step": 8943, + "time_per_iteration": 2.5876903533935547 + }, + { + "auxiliary_loss_clip": 0.06327454, + "auxiliary_loss_mlp": 0.01255314, + "balance_loss_clip": 0.06264913, + "balance_loss_mlp": 0.0125356, + "epoch": 0.5377423718623178, + "flos": 54188139761280.0, + "grad_norm": 0.7834151101556619, + "language_loss": 0.59688759, + "learning_rate": 1.852722186377645e-06, + "loss": 0.67271525, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01757812, + "step": 8944, + "time_per_iteration": 4.5469114780426025 + }, + { + "auxiliary_loss_clip": 0.06439775, + "auxiliary_loss_mlp": 0.01267766, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5378024951149857, + "flos": 23263585770240.0, + "grad_norm": 2.6705245070619754, + "language_loss": 0.776173, + "learning_rate": 1.852333784891169e-06, + "loss": 0.85324842, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 8945, + "time_per_iteration": 2.61606502532959 + }, + { + "auxiliary_loss_clip": 0.06428292, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06278516, + "balance_loss_mlp": 0.01252883, + "epoch": 0.5378626183676537, + "flos": 24030866407680.0, + "grad_norm": 1.7469475045380867, + "language_loss": 0.68958521, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.76650584, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10888672, + "step": 8946, + "time_per_iteration": 2.6660590171813965 + }, + { + "auxiliary_loss_clip": 0.06427687, + "auxiliary_loss_mlp": 0.0126763, + "balance_loss_clip": 0.06282603, + "balance_loss_mlp": 0.01256704, + "epoch": 0.5379227416203216, + "flos": 27169072248960.0, + "grad_norm": 1.5118478086705984, + "language_loss": 0.77489585, + "learning_rate": 1.851556998731498e-06, + "loss": 0.85184896, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10925293, + "step": 8947, + "time_per_iteration": 2.618797779083252 + }, + { + "auxiliary_loss_clip": 0.06429853, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06282403, + "balance_loss_mlp": 0.01257688, + "epoch": 0.5379828648729896, + "flos": 24688631358720.0, + "grad_norm": 1.962883252611848, + "language_loss": 0.60299599, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.6799823, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11090088, + "step": 8948, + "time_per_iteration": 3.99113392829895 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06282011, + "balance_loss_mlp": 0.01254629, + "epoch": 0.5380429881256577, + "flos": 22528981025280.0, + "grad_norm": 1.6036817147437437, + "language_loss": 0.7965849, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.87354112, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.10803223, + "step": 8949, + "time_per_iteration": 2.5306220054626465 + }, + { + "auxiliary_loss_clip": 0.06424635, + "auxiliary_loss_mlp": 0.01267697, + "balance_loss_clip": 0.06281022, + "balance_loss_mlp": 0.01256796, + "epoch": 0.5381031113783256, + "flos": 26986825618560.0, + "grad_norm": 1.5758786571118277, + "language_loss": 0.78447008, + "learning_rate": 1.850391861746111e-06, + "loss": 0.86139345, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10900879, + "step": 8950, + "time_per_iteration": 2.5665290355682373 + }, + { + "auxiliary_loss_clip": 0.0642289, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01258793, + "epoch": 0.5381632346309936, + "flos": 24761026886400.0, + "grad_norm": 1.6449806756094487, + "language_loss": 0.72907847, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.80599785, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10253906, + "step": 8951, + "time_per_iteration": 2.5389561653137207 + }, + { + "auxiliary_loss_clip": 0.0643057, + "auxiliary_loss_mlp": 0.01265397, + "balance_loss_clip": 0.06280816, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5382233578836615, + "flos": 15565524589440.0, + "grad_norm": 1.8886102084278436, + "language_loss": 0.75767493, + "learning_rate": 1.849615132097085e-06, + "loss": 0.83463454, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10760498, + "step": 8952, + "time_per_iteration": 2.5009233951568604 + }, + { + "auxiliary_loss_clip": 0.06423527, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01254384, + "epoch": 0.5382834811363295, + "flos": 25091838005760.0, + "grad_norm": 1.352822721598185, + "language_loss": 0.79742837, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.87432194, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11456299, + "step": 8953, + "time_per_iteration": 2.5382277965545654 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01253193, + "epoch": 0.5383436043889974, + "flos": 13302983041920.0, + "grad_norm": 1.682075048645487, + "language_loss": 0.80507964, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.88193631, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10473633, + "step": 8954, + "time_per_iteration": 2.5006446838378906 + }, + { + "auxiliary_loss_clip": 0.06425533, + "auxiliary_loss_mlp": 0.01268977, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01258123, + "epoch": 0.5384037276416654, + "flos": 23046063770880.0, + "grad_norm": 2.297323300751636, + "language_loss": 0.77060652, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.84755164, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10858154, + "step": 8955, + "time_per_iteration": 2.5469982624053955 + }, + { + "auxiliary_loss_clip": 0.06422862, + "auxiliary_loss_mlp": 0.01268692, + "balance_loss_clip": 0.06278117, + "balance_loss_mlp": 0.01257624, + "epoch": 0.5384638508943334, + "flos": 20637389502720.0, + "grad_norm": 1.4766809485278785, + "language_loss": 0.78634906, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.86326456, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11071777, + "step": 8956, + "time_per_iteration": 3.9486958980560303 + }, + { + "auxiliary_loss_clip": 0.06328554, + "auxiliary_loss_mlp": 0.01254386, + "balance_loss_clip": 0.0626571, + "balance_loss_mlp": 0.01252584, + "epoch": 0.5385239741470014, + "flos": 66755820026880.0, + "grad_norm": 0.8475755828975666, + "language_loss": 0.63483834, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.71066773, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01797485, + "step": 8957, + "time_per_iteration": 3.0589206218719482 + }, + { + "auxiliary_loss_clip": 0.06326501, + "auxiliary_loss_mlp": 0.01256038, + "balance_loss_clip": 0.06263363, + "balance_loss_mlp": 0.01254215, + "epoch": 0.5385840973996693, + "flos": 64737466076160.0, + "grad_norm": 0.6942778211869604, + "language_loss": 0.51190817, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.58773351, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01818848, + "step": 8958, + "time_per_iteration": 3.1954948902130127 + }, + { + "auxiliary_loss_clip": 0.06433147, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06283388, + "balance_loss_mlp": 0.01255189, + "epoch": 0.5386442206523373, + "flos": 26149161951360.0, + "grad_norm": 1.5085241385719446, + "language_loss": 0.77482343, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.85182357, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11669922, + "step": 8959, + "time_per_iteration": 2.595390558242798 + }, + { + "auxiliary_loss_clip": 0.06429408, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06280766, + "balance_loss_mlp": 0.01255269, + "epoch": 0.5387043439050052, + "flos": 18256401809280.0, + "grad_norm": 2.0832623304514373, + "language_loss": 0.84442693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.92138815, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11437988, + "step": 8960, + "time_per_iteration": 2.459411382675171 + }, + { + "auxiliary_loss_clip": 0.0642896, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06281836, + "balance_loss_mlp": 0.01254495, + "epoch": 0.5387644671576732, + "flos": 29795939349120.0, + "grad_norm": 1.5299241540989073, + "language_loss": 0.78738272, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.86432457, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1072998, + "step": 8961, + "time_per_iteration": 2.6379730701446533 + }, + { + "auxiliary_loss_clip": 0.06425574, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06280299, + "balance_loss_mlp": 0.01254106, + "epoch": 0.5388245904103413, + "flos": 22379661849600.0, + "grad_norm": 1.7063822520278231, + "language_loss": 0.85018182, + "learning_rate": 1.845731828364681e-06, + "loss": 0.92708838, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10980225, + "step": 8962, + "time_per_iteration": 2.495314359664917 + }, + { + "auxiliary_loss_clip": 0.06324032, + "auxiliary_loss_mlp": 0.01253937, + "balance_loss_clip": 0.06261306, + "balance_loss_mlp": 0.01252085, + "epoch": 0.5388847136630092, + "flos": 69827332417920.0, + "grad_norm": 0.7252434381461927, + "language_loss": 0.54196495, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.61774462, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.62792969, + "router_z_loss_mlp": 0.01847839, + "step": 8963, + "time_per_iteration": 3.0685930252075195 + }, + { + "auxiliary_loss_clip": 0.06319527, + "auxiliary_loss_mlp": 0.01253383, + "balance_loss_clip": 0.0625699, + "balance_loss_mlp": 0.01251595, + "epoch": 0.5389448369156772, + "flos": 69844270942080.0, + "grad_norm": 0.7817796987422422, + "language_loss": 0.62972116, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.7054503, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01786804, + "step": 8964, + "time_per_iteration": 3.2163538932800293 + }, + { + "auxiliary_loss_clip": 0.0643357, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06280617, + "balance_loss_mlp": 0.01255462, + "epoch": 0.5390049601683451, + "flos": 31730478888960.0, + "grad_norm": 1.575337207693627, + "language_loss": 0.70121396, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.77821916, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11499023, + "step": 8965, + "time_per_iteration": 2.6127662658691406 + }, + { + "auxiliary_loss_clip": 0.06431293, + "auxiliary_loss_mlp": 0.01269597, + "balance_loss_clip": 0.06281815, + "balance_loss_mlp": 0.01258546, + "epoch": 0.5390650834210131, + "flos": 18119283402240.0, + "grad_norm": 2.027850604452939, + "language_loss": 0.82445288, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.90146178, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11047363, + "step": 8966, + "time_per_iteration": 2.472459554672241 + }, + { + "auxiliary_loss_clip": 0.06426321, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.06281838, + "balance_loss_mlp": 0.01256326, + "epoch": 0.539125206673681, + "flos": 17421798816000.0, + "grad_norm": 2.5704499610569282, + "language_loss": 0.72936428, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.80630052, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10980225, + "step": 8967, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.06424848, + "auxiliary_loss_mlp": 0.01264578, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01254493, + "epoch": 0.539185329926349, + "flos": 22205255575680.0, + "grad_norm": 1.5589784366040595, + "language_loss": 0.81895125, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.89584547, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10083008, + "step": 8968, + "time_per_iteration": 2.5401480197906494 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.0628034, + "balance_loss_mlp": 0.01254118, + "epoch": 0.539245453179017, + "flos": 21440867904000.0, + "grad_norm": 1.4575649765742498, + "language_loss": 0.74243855, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.81938505, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11633301, + "step": 8969, + "time_per_iteration": 2.553879976272583 + }, + { + "auxiliary_loss_clip": 0.06430885, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06278199, + "balance_loss_mlp": 0.01254214, + "epoch": 0.539305576431685, + "flos": 20740322643840.0, + "grad_norm": 2.1595830648072347, + "language_loss": 0.827712, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.90467674, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.1137085, + "step": 8970, + "time_per_iteration": 2.478726863861084 + }, + { + "auxiliary_loss_clip": 0.06422678, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06278254, + "balance_loss_mlp": 0.01253185, + "epoch": 0.5393656996843529, + "flos": 30928467934080.0, + "grad_norm": 1.400352356553148, + "language_loss": 0.75607336, + "learning_rate": 1.842237354749146e-06, + "loss": 0.83293688, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1048584, + "step": 8971, + "time_per_iteration": 2.5901689529418945 + }, + { + "auxiliary_loss_clip": 0.06318198, + "auxiliary_loss_mlp": 0.01253533, + "balance_loss_clip": 0.06255443, + "balance_loss_mlp": 0.0125168, + "epoch": 0.5394258229370209, + "flos": 50332953260160.0, + "grad_norm": 0.8588377208931133, + "language_loss": 0.60451257, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.68022978, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.62841797, + "router_z_loss_mlp": 0.01847839, + "step": 8972, + "time_per_iteration": 3.1413605213165283 + }, + { + "auxiliary_loss_clip": 0.06426257, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5394859461896888, + "flos": 25419169180800.0, + "grad_norm": 1.5980875117754325, + "language_loss": 0.787233, + "learning_rate": 1.841460870485045e-06, + "loss": 0.8641873, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.1126709, + "step": 8973, + "time_per_iteration": 2.5336296558380127 + }, + { + "auxiliary_loss_clip": 0.06433228, + "auxiliary_loss_mlp": 0.01267524, + "balance_loss_clip": 0.06279569, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5395460694423568, + "flos": 25484646746880.0, + "grad_norm": 1.7949926655699973, + "language_loss": 0.7381959, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.81520343, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12078857, + "step": 8974, + "time_per_iteration": 2.5483648777008057 + }, + { + "auxiliary_loss_clip": 0.06318444, + "auxiliary_loss_mlp": 0.01253276, + "balance_loss_clip": 0.06255525, + "balance_loss_mlp": 0.01251373, + "epoch": 0.5396061926950249, + "flos": 53267305317120.0, + "grad_norm": 0.7276638901828621, + "language_loss": 0.50946128, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.58517849, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01899719, + "step": 8975, + "time_per_iteration": 3.125056028366089 + }, + { + "auxiliary_loss_clip": 0.06423691, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06277017, + "balance_loss_mlp": 0.01253215, + "epoch": 0.5396663159476928, + "flos": 26732476949760.0, + "grad_norm": 1.546051077066994, + "language_loss": 0.72722358, + "learning_rate": 1.840296189214344e-06, + "loss": 0.80410993, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11724854, + "step": 8976, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06424834, + "auxiliary_loss_mlp": 0.01268763, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01257999, + "epoch": 0.5397264392003608, + "flos": 23259267285120.0, + "grad_norm": 1.9541916066514684, + "language_loss": 0.70649612, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.78343207, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10766602, + "step": 8977, + "time_per_iteration": 2.5443131923675537 + }, + { + "auxiliary_loss_clip": 0.06428454, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256691, + "epoch": 0.5397865624530287, + "flos": 18299727169920.0, + "grad_norm": 1.8457096410810847, + "language_loss": 0.72901827, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.80597985, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11016846, + "step": 8978, + "time_per_iteration": 2.511715888977051 + }, + { + "auxiliary_loss_clip": 0.06434547, + "auxiliary_loss_mlp": 0.01269171, + "balance_loss_clip": 0.0627895, + "balance_loss_mlp": 0.01256821, + "epoch": 0.5398466857056967, + "flos": 15301742336640.0, + "grad_norm": 1.7083695222951265, + "language_loss": 0.74513042, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.82216758, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12347412, + "step": 8979, + "time_per_iteration": 2.4654295444488525 + }, + { + "auxiliary_loss_clip": 0.06435215, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.062815, + "balance_loss_mlp": 0.0125551, + "epoch": 0.5399068089583646, + "flos": 17827521085440.0, + "grad_norm": 2.1729763122828567, + "language_loss": 0.77298462, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.85001791, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12609863, + "step": 8980, + "time_per_iteration": 2.5131070613861084 + }, + { + "auxiliary_loss_clip": 0.06428653, + "auxiliary_loss_mlp": 0.01266817, + "balance_loss_clip": 0.06278711, + "balance_loss_mlp": 0.01256202, + "epoch": 0.5399669322110326, + "flos": 27389109870720.0, + "grad_norm": 1.7146505379249901, + "language_loss": 0.82213032, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.89908504, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10626221, + "step": 8981, + "time_per_iteration": 4.00026273727417 + }, + { + "auxiliary_loss_clip": 0.06430832, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06279931, + "balance_loss_mlp": 0.01255292, + "epoch": 0.5400270554637006, + "flos": 20455394434560.0, + "grad_norm": 1.8197401655909293, + "language_loss": 0.67626458, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.75323975, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11395264, + "step": 8982, + "time_per_iteration": 2.7018609046936035 + }, + { + "auxiliary_loss_clip": 0.06430931, + "auxiliary_loss_mlp": 0.01272335, + "balance_loss_clip": 0.06282471, + "balance_loss_mlp": 0.0126123, + "epoch": 0.5400871787163686, + "flos": 21696055113600.0, + "grad_norm": 1.5105940902505235, + "language_loss": 0.82925522, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.90628791, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11108398, + "step": 8983, + "time_per_iteration": 4.0147035121917725 + }, + { + "auxiliary_loss_clip": 0.06427681, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5401473019690365, + "flos": 19210163708160.0, + "grad_norm": 2.5381589556683752, + "language_loss": 0.70748949, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.78442466, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11608887, + "step": 8984, + "time_per_iteration": 2.485203742980957 + }, + { + "auxiliary_loss_clip": 0.06436664, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.06283301, + "balance_loss_mlp": 0.01258702, + "epoch": 0.5402074252217045, + "flos": 20632987163520.0, + "grad_norm": 1.6283776116809212, + "language_loss": 0.80336136, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.88043296, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11798096, + "step": 8985, + "time_per_iteration": 2.5176138877868652 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06279361, + "balance_loss_mlp": 0.01255497, + "epoch": 0.5402675484743724, + "flos": 24980519456640.0, + "grad_norm": 1.4261046169392377, + "language_loss": 0.79538441, + "learning_rate": 1.83641431418363e-06, + "loss": 0.87226146, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11053467, + "step": 8986, + "time_per_iteration": 2.528057098388672 + }, + { + "auxiliary_loss_clip": 0.06426872, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06277602, + "balance_loss_mlp": 0.01258636, + "epoch": 0.5403276717270404, + "flos": 19464302741760.0, + "grad_norm": 1.7453745991771563, + "language_loss": 0.77310205, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.85006386, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.10681152, + "step": 8987, + "time_per_iteration": 3.9355413913726807 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01265394, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254147, + "epoch": 0.5403877949797083, + "flos": 18448040096640.0, + "grad_norm": 1.594164869128485, + "language_loss": 0.70988709, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.78680897, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11248779, + "step": 8988, + "time_per_iteration": 2.529665470123291 + }, + { + "auxiliary_loss_clip": 0.06432524, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06283048, + "balance_loss_mlp": 0.0125528, + "epoch": 0.5404479182323764, + "flos": 28300343022720.0, + "grad_norm": 2.353153070088846, + "language_loss": 0.68308997, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.76008058, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11260986, + "step": 8989, + "time_per_iteration": 2.541705846786499 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06277242, + "balance_loss_mlp": 0.01255071, + "epoch": 0.5405080414850444, + "flos": 23373981924480.0, + "grad_norm": 1.5774927452360248, + "language_loss": 0.77866185, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.85559022, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12091064, + "step": 8990, + "time_per_iteration": 2.570016384124756 + }, + { + "auxiliary_loss_clip": 0.06423812, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01254517, + "epoch": 0.5405681647377123, + "flos": 21112907823360.0, + "grad_norm": 1.4794826200904196, + "language_loss": 0.69081038, + "learning_rate": 1.834473608367745e-06, + "loss": 0.76769722, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10357666, + "step": 8991, + "time_per_iteration": 2.491284132003784 + }, + { + "auxiliary_loss_clip": 0.06430428, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06280528, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5406282879903803, + "flos": 20455478288640.0, + "grad_norm": 1.6151673604367662, + "language_loss": 0.76260269, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.83958906, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.11621094, + "step": 8992, + "time_per_iteration": 2.506131649017334 + }, + { + "auxiliary_loss_clip": 0.06429817, + "auxiliary_loss_mlp": 0.01266516, + "balance_loss_clip": 0.06278399, + "balance_loss_mlp": 0.01255871, + "epoch": 0.5406884112430482, + "flos": 14214635464320.0, + "grad_norm": 2.867003800231527, + "language_loss": 0.7616564, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.83861977, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.10644531, + "step": 8993, + "time_per_iteration": 2.5104384422302246 + }, + { + "auxiliary_loss_clip": 0.06425033, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278533, + "balance_loss_mlp": 0.01259024, + "epoch": 0.5407485344957162, + "flos": 23881882648320.0, + "grad_norm": 1.5714876378286171, + "language_loss": 0.70600474, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.78295696, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11151123, + "step": 8994, + "time_per_iteration": 2.557224988937378 + }, + { + "auxiliary_loss_clip": 0.06430587, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06279735, + "balance_loss_mlp": 0.01255397, + "epoch": 0.5408086577483842, + "flos": 23155118259840.0, + "grad_norm": 1.7868138082728735, + "language_loss": 0.7559076, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.83288407, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11657715, + "step": 8995, + "time_per_iteration": 4.038757085800171 + }, + { + "auxiliary_loss_clip": 0.06426084, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06281247, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5408687810010522, + "flos": 18777090280320.0, + "grad_norm": 1.7506118703188027, + "language_loss": 0.73407996, + "learning_rate": 1.832533059471282e-06, + "loss": 0.81100416, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.105896, + "step": 8996, + "time_per_iteration": 2.4787185192108154 + }, + { + "auxiliary_loss_clip": 0.06423852, + "auxiliary_loss_mlp": 0.01266299, + "balance_loss_clip": 0.06280176, + "balance_loss_mlp": 0.01254801, + "epoch": 0.5409289042537201, + "flos": 13886717310720.0, + "grad_norm": 1.8157411884483814, + "language_loss": 0.73422438, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.81112587, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11499023, + "step": 8997, + "time_per_iteration": 2.5067830085754395 + }, + { + "auxiliary_loss_clip": 0.0643085, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06281897, + "balance_loss_mlp": 0.01256802, + "epoch": 0.5409890275063881, + "flos": 14470619287680.0, + "grad_norm": 2.2163933004413625, + "language_loss": 0.72107315, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.79805827, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.10870361, + "step": 8998, + "time_per_iteration": 2.499892234802246 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.0627818, + "balance_loss_mlp": 0.01255281, + "epoch": 0.541049150759056, + "flos": 48987906721920.0, + "grad_norm": 1.4223172525448995, + "language_loss": 0.7060768, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.78298652, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11004639, + "step": 8999, + "time_per_iteration": 2.75883412361145 + }, + { + "auxiliary_loss_clip": 0.06424989, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06280144, + "balance_loss_mlp": 0.01255818, + "epoch": 0.541109274011724, + "flos": 18153007470720.0, + "grad_norm": 3.0241903502045884, + "language_loss": 0.8099103, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.88683468, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11639404, + "step": 9000, + "time_per_iteration": 2.4591987133026123 + }, + { + "auxiliary_loss_clip": 0.06425589, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0628029, + "balance_loss_mlp": 0.01256438, + "epoch": 0.541169397264392, + "flos": 20528921992320.0, + "grad_norm": 1.444857324942775, + "language_loss": 0.73542678, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.81235898, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11193848, + "step": 9001, + "time_per_iteration": 2.5392372608184814 + }, + { + "auxiliary_loss_clip": 0.06428811, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.0627747, + "balance_loss_mlp": 0.01256489, + "epoch": 0.54122952051706, + "flos": 20049630238080.0, + "grad_norm": 2.1661909625933675, + "language_loss": 0.85214329, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.92911184, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11560059, + "step": 9002, + "time_per_iteration": 2.4666826725006104 + }, + { + "auxiliary_loss_clip": 0.06425083, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06280569, + "balance_loss_mlp": 0.01253792, + "epoch": 0.541289643769728, + "flos": 19068223691520.0, + "grad_norm": 1.8644067392145132, + "language_loss": 0.78467226, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.86156201, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10101318, + "step": 9003, + "time_per_iteration": 2.536766767501831 + }, + { + "auxiliary_loss_clip": 0.06424496, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06279116, + "balance_loss_mlp": 0.01253005, + "epoch": 0.5413497670223959, + "flos": 22388801944320.0, + "grad_norm": 1.7504010601062234, + "language_loss": 0.69487125, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.77175444, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1081543, + "step": 9004, + "time_per_iteration": 2.522757053375244 + }, + { + "auxiliary_loss_clip": 0.06323519, + "auxiliary_loss_mlp": 0.01256562, + "balance_loss_clip": 0.0626113, + "balance_loss_mlp": 0.01254622, + "epoch": 0.5414098902750639, + "flos": 70052149722240.0, + "grad_norm": 0.9317133774182984, + "language_loss": 0.58728683, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.66308761, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01937866, + "step": 9005, + "time_per_iteration": 3.227922201156616 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06276372, + "balance_loss_mlp": 0.01254477, + "epoch": 0.5414700135277318, + "flos": 21805445018880.0, + "grad_norm": 2.0206216562473416, + "language_loss": 0.78202778, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.85894328, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.10510254, + "step": 9006, + "time_per_iteration": 2.557199001312256 + }, + { + "auxiliary_loss_clip": 0.06423091, + "auxiliary_loss_mlp": 0.01269943, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01259965, + "epoch": 0.5415301367803999, + "flos": 16913269186560.0, + "grad_norm": 3.052189299631263, + "language_loss": 0.8345896, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.91152, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.09979248, + "step": 9007, + "time_per_iteration": 2.5309536457061768 + }, + { + "auxiliary_loss_clip": 0.06427018, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06280112, + "balance_loss_mlp": 0.01254089, + "epoch": 0.5415902600330678, + "flos": 25711518476160.0, + "grad_norm": 1.8242309219870276, + "language_loss": 0.67383778, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.750763, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11401367, + "step": 9008, + "time_per_iteration": 2.5476038455963135 + }, + { + "auxiliary_loss_clip": 0.0643273, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06281075, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5416503832857358, + "flos": 19214146776960.0, + "grad_norm": 1.9758514689639541, + "language_loss": 0.7415235, + "learning_rate": 1.827488379924234e-06, + "loss": 0.81856364, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11846924, + "step": 9009, + "time_per_iteration": 2.519923448562622 + }, + { + "auxiliary_loss_clip": 0.06433536, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.0628282, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5417105065384037, + "flos": 12718619867520.0, + "grad_norm": 2.008927815850951, + "language_loss": 0.88025904, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.95727038, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11907959, + "step": 9010, + "time_per_iteration": 2.4986653327941895 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266313, + "balance_loss_clip": 0.06279215, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5417706297910717, + "flos": 30343727416320.0, + "grad_norm": 1.9869037800658418, + "language_loss": 0.64700162, + "learning_rate": 1.826712372694122e-06, + "loss": 0.72391802, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10693359, + "step": 9011, + "time_per_iteration": 2.639526605606079 + }, + { + "auxiliary_loss_clip": 0.06426919, + "auxiliary_loss_mlp": 0.0126718, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5418307530437396, + "flos": 29028323295360.0, + "grad_norm": 2.488283502034593, + "language_loss": 0.79704046, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.87398142, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1081543, + "step": 9012, + "time_per_iteration": 2.546048641204834 + }, + { + "auxiliary_loss_clip": 0.06429458, + "auxiliary_loss_mlp": 0.01265294, + "balance_loss_clip": 0.06280975, + "balance_loss_mlp": 0.01254464, + "epoch": 0.5418908762964076, + "flos": 16879125847680.0, + "grad_norm": 2.3471098958204712, + "language_loss": 0.74353266, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.82048023, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10839844, + "step": 9013, + "time_per_iteration": 2.544989585876465 + }, + { + "auxiliary_loss_clip": 0.06429175, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01254806, + "epoch": 0.5419509995490756, + "flos": 18955144206720.0, + "grad_norm": 2.592240526053277, + "language_loss": 0.72416294, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.80111116, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.10852051, + "step": 9014, + "time_per_iteration": 2.4757673740386963 + }, + { + "auxiliary_loss_clip": 0.06427553, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06280749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.5420111228017436, + "flos": 18083630689920.0, + "grad_norm": 1.4576837239395228, + "language_loss": 0.80686474, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.88381469, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11010742, + "step": 9015, + "time_per_iteration": 2.50618839263916 + }, + { + "auxiliary_loss_clip": 0.06436689, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06286176, + "balance_loss_mlp": 0.01259061, + "epoch": 0.5420712460544116, + "flos": 19067678640000.0, + "grad_norm": 2.2120132338352105, + "language_loss": 0.81892127, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.8959893, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11053467, + "step": 9016, + "time_per_iteration": 2.475426197052002 + }, + { + "auxiliary_loss_clip": 0.06424853, + "auxiliary_loss_mlp": 0.01269653, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01259258, + "epoch": 0.5421313693070795, + "flos": 18193020595200.0, + "grad_norm": 1.7396358642065415, + "language_loss": 0.81981838, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.89676344, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10388184, + "step": 9017, + "time_per_iteration": 2.4966297149658203 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5421914925597475, + "flos": 13010969162880.0, + "grad_norm": 1.7307795983641447, + "language_loss": 0.77940953, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.85629702, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11193848, + "step": 9018, + "time_per_iteration": 2.4861438274383545 + }, + { + "auxiliary_loss_clip": 0.0643111, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.06279995, + "balance_loss_mlp": 0.01253557, + "epoch": 0.5422516158124154, + "flos": 46769654856960.0, + "grad_norm": 1.436078593305458, + "language_loss": 0.66629684, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.7432512, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.10772705, + "step": 9019, + "time_per_iteration": 2.793942928314209 + }, + { + "auxiliary_loss_clip": 0.06420586, + "auxiliary_loss_mlp": 0.01266098, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01256627, + "epoch": 0.5423117390650835, + "flos": 31766634725760.0, + "grad_norm": 1.5531318778473993, + "language_loss": 0.69972849, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.77659535, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.0947876, + "step": 9020, + "time_per_iteration": 3.977450132369995 + }, + { + "auxiliary_loss_clip": 0.0642193, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627913, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5423718623177514, + "flos": 27209881987200.0, + "grad_norm": 1.41400284004279, + "language_loss": 0.80270976, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.87961137, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10650635, + "step": 9021, + "time_per_iteration": 2.5875015258789062 + }, + { + "auxiliary_loss_clip": 0.06426784, + "auxiliary_loss_mlp": 0.0126779, + "balance_loss_clip": 0.0628023, + "balance_loss_mlp": 0.01257162, + "epoch": 0.5424319855704194, + "flos": 23552580902400.0, + "grad_norm": 2.7424242746142298, + "language_loss": 0.78868818, + "learning_rate": 1.822444805916788e-06, + "loss": 0.86563396, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10626221, + "step": 9022, + "time_per_iteration": 2.6569435596466064 + }, + { + "auxiliary_loss_clip": 0.06421105, + "auxiliary_loss_mlp": 0.01267956, + "balance_loss_clip": 0.06275026, + "balance_loss_mlp": 0.01257132, + "epoch": 0.5424921088230873, + "flos": 26623003190400.0, + "grad_norm": 2.014349133750916, + "language_loss": 0.82876647, + "learning_rate": 1.822056885403915e-06, + "loss": 0.90565705, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10827637, + "step": 9023, + "time_per_iteration": 4.035135746002197 + }, + { + "auxiliary_loss_clip": 0.06427208, + "auxiliary_loss_mlp": 0.01266773, + "balance_loss_clip": 0.06280831, + "balance_loss_mlp": 0.01256718, + "epoch": 0.5425522320757553, + "flos": 23593600275840.0, + "grad_norm": 1.5793438869499181, + "language_loss": 0.71421236, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.79115218, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10058594, + "step": 9024, + "time_per_iteration": 2.540205717086792 + }, + { + "auxiliary_loss_clip": 0.06424701, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5426123553284232, + "flos": 30600256291200.0, + "grad_norm": 1.6177082091395079, + "language_loss": 0.65074164, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.72763383, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.10601807, + "step": 9025, + "time_per_iteration": 2.6120383739471436 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06278306, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5426724785810912, + "flos": 12500049692160.0, + "grad_norm": 9.095866287209772, + "language_loss": 0.73753297, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.81451309, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10681152, + "step": 9026, + "time_per_iteration": 2.47986102104187 + }, + { + "auxiliary_loss_clip": 0.06430142, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06282182, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5427326018337592, + "flos": 26071273981440.0, + "grad_norm": 2.23504413576904, + "language_loss": 0.78765059, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.8646462, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12432861, + "step": 9027, + "time_per_iteration": 3.9859650135040283 + }, + { + "auxiliary_loss_clip": 0.06320234, + "auxiliary_loss_mlp": 0.01252608, + "balance_loss_clip": 0.06257887, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5427927250864272, + "flos": 66004974789120.0, + "grad_norm": 0.7416092139326844, + "language_loss": 0.56562424, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.64135265, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01921082, + "step": 9028, + "time_per_iteration": 3.155468702316284 + }, + { + "auxiliary_loss_clip": 0.06432774, + "auxiliary_loss_mlp": 0.01272049, + "balance_loss_clip": 0.06283672, + "balance_loss_mlp": 0.01260158, + "epoch": 0.5428528483390952, + "flos": 19981678976640.0, + "grad_norm": 2.1493249613849015, + "language_loss": 0.78262091, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.85966909, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11883545, + "step": 9029, + "time_per_iteration": 2.59745192527771 + }, + { + "auxiliary_loss_clip": 0.06422626, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01260108, + "epoch": 0.5429129715917631, + "flos": 21838288619520.0, + "grad_norm": 1.5330300742008836, + "language_loss": 0.83522928, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.9121654, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10870361, + "step": 9030, + "time_per_iteration": 2.579742670059204 + }, + { + "auxiliary_loss_clip": 0.06426223, + "auxiliary_loss_mlp": 0.01263686, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01252903, + "epoch": 0.5429730948444311, + "flos": 27790178238720.0, + "grad_norm": 1.5430505390577234, + "language_loss": 0.75487745, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.8317765, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10784912, + "step": 9031, + "time_per_iteration": 2.5645737648010254 + }, + { + "auxiliary_loss_clip": 0.06421311, + "auxiliary_loss_mlp": 0.01265953, + "balance_loss_clip": 0.0628026, + "balance_loss_mlp": 0.01256226, + "epoch": 0.543033218097099, + "flos": 26767668464640.0, + "grad_norm": 1.6242541501700514, + "language_loss": 0.85659242, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.933465, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.097229, + "step": 9032, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.06434417, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06282632, + "balance_loss_mlp": 0.01260815, + "epoch": 0.5430933413497671, + "flos": 22681989780480.0, + "grad_norm": 1.5840496509982642, + "language_loss": 0.74130201, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.81836969, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11535645, + "step": 9033, + "time_per_iteration": 2.546196937561035 + }, + { + "auxiliary_loss_clip": 0.06426211, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06281157, + "balance_loss_mlp": 0.01256569, + "epoch": 0.543153464602435, + "flos": 24614307187200.0, + "grad_norm": 1.5750334880362715, + "language_loss": 0.76250172, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.83944499, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11535645, + "step": 9034, + "time_per_iteration": 2.5637965202331543 + }, + { + "auxiliary_loss_clip": 0.0642693, + "auxiliary_loss_mlp": 0.0126457, + "balance_loss_clip": 0.06282238, + "balance_loss_mlp": 0.01254002, + "epoch": 0.543213587855103, + "flos": 19031690511360.0, + "grad_norm": 1.6968779523598936, + "language_loss": 0.84307218, + "learning_rate": 1.817402369770655e-06, + "loss": 0.91998708, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10583496, + "step": 9035, + "time_per_iteration": 4.028722524642944 + }, + { + "auxiliary_loss_clip": 0.063224, + "auxiliary_loss_mlp": 0.01251692, + "balance_loss_clip": 0.06260421, + "balance_loss_mlp": 0.01250003, + "epoch": 0.5432737111077709, + "flos": 65705539824000.0, + "grad_norm": 0.6842717349937131, + "language_loss": 0.55272961, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.62847054, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.61816406, + "router_z_loss_mlp": 0.01693726, + "step": 9036, + "time_per_iteration": 3.117825746536255 + }, + { + "auxiliary_loss_clip": 0.06427496, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06278114, + "balance_loss_mlp": 0.0125423, + "epoch": 0.5433338343604389, + "flos": 22098339365760.0, + "grad_norm": 1.6522952339212897, + "language_loss": 0.75599706, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.83293271, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1184082, + "step": 9037, + "time_per_iteration": 2.520371913909912 + }, + { + "auxiliary_loss_clip": 0.06428872, + "auxiliary_loss_mlp": 0.01263373, + "balance_loss_clip": 0.06282881, + "balance_loss_mlp": 0.01252561, + "epoch": 0.5433939576131068, + "flos": 34680316752000.0, + "grad_norm": 1.5920545337485463, + "language_loss": 0.66775727, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.74467969, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1081543, + "step": 9038, + "time_per_iteration": 2.6492366790771484 + }, + { + "auxiliary_loss_clip": 0.06424891, + "auxiliary_loss_mlp": 0.01265017, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254395, + "epoch": 0.5434540808657748, + "flos": 20309639057280.0, + "grad_norm": 2.8075357913922687, + "language_loss": 0.78373635, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.8606354, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10626221, + "step": 9039, + "time_per_iteration": 2.528156280517578 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06281251, + "balance_loss_mlp": 0.01258677, + "epoch": 0.5435142041184428, + "flos": 23119549401600.0, + "grad_norm": 1.7481925172590123, + "language_loss": 0.76885521, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11029053, + "step": 9040, + "time_per_iteration": 2.5517256259918213 + }, + { + "auxiliary_loss_clip": 0.06319717, + "auxiliary_loss_mlp": 0.01257021, + "balance_loss_clip": 0.06257772, + "balance_loss_mlp": 0.01255075, + "epoch": 0.5435743273711108, + "flos": 64032350768640.0, + "grad_norm": 0.6699998863594594, + "language_loss": 0.52323502, + "learning_rate": 1.815075484268074e-06, + "loss": 0.59900236, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.61865234, + "router_z_loss_mlp": 0.0194397, + "step": 9041, + "time_per_iteration": 3.166306972503662 + }, + { + "auxiliary_loss_clip": 0.06428386, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01254687, + "epoch": 0.5436344506237788, + "flos": 25125897490560.0, + "grad_norm": 1.7575616905304456, + "language_loss": 0.762761, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.83969998, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10821533, + "step": 9042, + "time_per_iteration": 2.5450282096862793 + }, + { + "auxiliary_loss_clip": 0.0642225, + "auxiliary_loss_mlp": 0.01265245, + "balance_loss_clip": 0.06278253, + "balance_loss_mlp": 0.01254176, + "epoch": 0.5436945738764467, + "flos": 19579017381120.0, + "grad_norm": 2.3576554691894054, + "language_loss": 0.6770978, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.75397277, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11065674, + "step": 9043, + "time_per_iteration": 2.5310070514678955 + }, + { + "auxiliary_loss_clip": 0.06421092, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01256065, + "epoch": 0.5437546971291147, + "flos": 21148937879040.0, + "grad_norm": 1.5176966924106092, + "language_loss": 0.84091616, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.91779459, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10699463, + "step": 9044, + "time_per_iteration": 2.4937691688537598 + }, + { + "auxiliary_loss_clip": 0.06427783, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01256056, + "epoch": 0.5438148203817826, + "flos": 25125645928320.0, + "grad_norm": 1.559720453478778, + "language_loss": 0.62531364, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.70227116, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11914062, + "step": 9045, + "time_per_iteration": 2.558842182159424 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01267999, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01257312, + "epoch": 0.5438749436344507, + "flos": 23009614444800.0, + "grad_norm": 1.4475609839642107, + "language_loss": 0.70189548, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.77882719, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10687256, + "step": 9046, + "time_per_iteration": 2.546400785446167 + }, + { + "auxiliary_loss_clip": 0.06422587, + "auxiliary_loss_mlp": 0.01263416, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01253087, + "epoch": 0.5439350668871186, + "flos": 15492458229120.0, + "grad_norm": 1.7829079763234368, + "language_loss": 0.77310658, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.84996659, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10333252, + "step": 9047, + "time_per_iteration": 2.5223042964935303 + }, + { + "auxiliary_loss_clip": 0.06424624, + "auxiliary_loss_mlp": 0.01269137, + "balance_loss_clip": 0.06278106, + "balance_loss_mlp": 0.01257598, + "epoch": 0.5439951901397866, + "flos": 17244164160000.0, + "grad_norm": 2.1796692597227363, + "language_loss": 0.73181236, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.80874991, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11547852, + "step": 9048, + "time_per_iteration": 2.4901275634765625 + }, + { + "auxiliary_loss_clip": 0.06419719, + "auxiliary_loss_mlp": 0.01268414, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256773, + "epoch": 0.5440553133924545, + "flos": 18666945688320.0, + "grad_norm": 2.2913555210162535, + "language_loss": 0.93342638, + "learning_rate": 1.8119733635055076e-06, + "loss": 1.01030767, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11639404, + "step": 9049, + "time_per_iteration": 2.5185091495513916 + }, + { + "auxiliary_loss_clip": 0.0641875, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01257155, + "epoch": 0.5441154366451225, + "flos": 27129813811200.0, + "grad_norm": 1.6778604645700708, + "language_loss": 0.74161297, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.81847489, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10284424, + "step": 9050, + "time_per_iteration": 2.551227331161499 + }, + { + "auxiliary_loss_clip": 0.06424956, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01257684, + "epoch": 0.5441755598977904, + "flos": 26000890951680.0, + "grad_norm": 1.7704942450323604, + "language_loss": 0.67003465, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.74696958, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10852051, + "step": 9051, + "time_per_iteration": 2.586360454559326 + }, + { + "auxiliary_loss_clip": 0.06422283, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06277864, + "balance_loss_mlp": 0.01253629, + "epoch": 0.5442356831504584, + "flos": 32388327694080.0, + "grad_norm": 1.6805683860476124, + "language_loss": 0.68003166, + "learning_rate": 1.810810185460011e-06, + "loss": 0.75689662, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10583496, + "step": 9052, + "time_per_iteration": 2.595308303833008 + }, + { + "auxiliary_loss_clip": 0.0642236, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06275343, + "balance_loss_mlp": 0.01255413, + "epoch": 0.5442958064031264, + "flos": 24170123093760.0, + "grad_norm": 1.9713868762163456, + "language_loss": 0.93283188, + "learning_rate": 1.810422473773436e-06, + "loss": 1.0097276, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9053, + "time_per_iteration": 2.5700409412384033 + }, + { + "auxiliary_loss_clip": 0.06427357, + "auxiliary_loss_mlp": 0.0127068, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01258509, + "epoch": 0.5443559296557944, + "flos": 18769669194240.0, + "grad_norm": 1.9808667763978582, + "language_loss": 0.83683395, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.91381431, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1217041, + "step": 9054, + "time_per_iteration": 2.4873886108398438 + }, + { + "auxiliary_loss_clip": 0.06424912, + "auxiliary_loss_mlp": 0.01271948, + "balance_loss_clip": 0.06277627, + "balance_loss_mlp": 0.01260021, + "epoch": 0.5444160529084624, + "flos": 22638245149440.0, + "grad_norm": 1.9496494567304603, + "language_loss": 0.68541598, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.76238453, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11920166, + "step": 9055, + "time_per_iteration": 2.5629093647003174 + }, + { + "auxiliary_loss_clip": 0.06326497, + "auxiliary_loss_mlp": 0.01261063, + "balance_loss_clip": 0.06264114, + "balance_loss_mlp": 0.01259381, + "epoch": 0.5444761761611303, + "flos": 69693106976640.0, + "grad_norm": 0.7193405715621726, + "language_loss": 0.57599837, + "learning_rate": 1.80925938190531e-06, + "loss": 0.65187401, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01686096, + "step": 9056, + "time_per_iteration": 3.1249008178710938 + }, + { + "auxiliary_loss_clip": 0.06428131, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01255676, + "epoch": 0.5445362994137983, + "flos": 14282922142080.0, + "grad_norm": 1.7879789013056906, + "language_loss": 0.69611216, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.77306819, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11798096, + "step": 9057, + "time_per_iteration": 2.498568296432495 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5445964226664662, + "flos": 28993802613120.0, + "grad_norm": 1.9346963255645138, + "language_loss": 0.75279379, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.8297199, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11578369, + "step": 9058, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06324711, + "auxiliary_loss_mlp": 0.01255513, + "balance_loss_clip": 0.06262248, + "balance_loss_mlp": 0.01253708, + "epoch": 0.5446565459191343, + "flos": 68642323649280.0, + "grad_norm": 0.781118187376451, + "language_loss": 0.62576413, + "learning_rate": 1.808096355133312e-06, + "loss": 0.7015664, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01800537, + "step": 9059, + "time_per_iteration": 4.5610737800598145 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06278148, + "balance_loss_mlp": 0.01257993, + "epoch": 0.5447166691718022, + "flos": 16221989802240.0, + "grad_norm": 1.8006783567998876, + "language_loss": 0.79601544, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.87291771, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10961914, + "step": 9060, + "time_per_iteration": 2.511836290359497 + }, + { + "auxiliary_loss_clip": 0.06426552, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.0125454, + "epoch": 0.5447767924244702, + "flos": 25856225677440.0, + "grad_norm": 1.542760917466334, + "language_loss": 0.80138546, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.87831336, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11700439, + "step": 9061, + "time_per_iteration": 2.5398924350738525 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06280909, + "balance_loss_mlp": 0.01255221, + "epoch": 0.5448369156771381, + "flos": 19682998698240.0, + "grad_norm": 1.6196021204279303, + "language_loss": 0.87203825, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.94895482, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10772705, + "step": 9062, + "time_per_iteration": 4.0366997718811035 + }, + { + "auxiliary_loss_clip": 0.06433238, + "auxiliary_loss_mlp": 0.01268748, + "balance_loss_clip": 0.0628314, + "balance_loss_mlp": 0.01256392, + "epoch": 0.5448970389298061, + "flos": 19287925896960.0, + "grad_norm": 1.7163800985020743, + "language_loss": 0.82674021, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.90376008, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12359619, + "step": 9063, + "time_per_iteration": 2.5397801399230957 + }, + { + "auxiliary_loss_clip": 0.06429115, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 0.0628127, + "balance_loss_mlp": 0.01257264, + "epoch": 0.544957162182474, + "flos": 20997270789120.0, + "grad_norm": 1.590898869425655, + "language_loss": 0.63855612, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.71554273, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1227417, + "step": 9064, + "time_per_iteration": 2.511350631713867 + }, + { + "auxiliary_loss_clip": 0.06432661, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06282693, + "balance_loss_mlp": 0.01251863, + "epoch": 0.545017285435142, + "flos": 25381671678720.0, + "grad_norm": 1.596100575558465, + "language_loss": 0.80746907, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.88443542, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12115479, + "step": 9065, + "time_per_iteration": 2.589707136154175 + }, + { + "auxiliary_loss_clip": 0.06425799, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06280494, + "balance_loss_mlp": 0.01255916, + "epoch": 0.54507740868781, + "flos": 19140661146240.0, + "grad_norm": 1.9404249818077939, + "language_loss": 0.78152055, + "learning_rate": 1.805382881379827e-06, + "loss": 0.85844183, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10412598, + "step": 9066, + "time_per_iteration": 2.5037317276000977 + }, + { + "auxiliary_loss_clip": 0.06434928, + "auxiliary_loss_mlp": 0.01268701, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256714, + "epoch": 0.545137531940478, + "flos": 26256958629120.0, + "grad_norm": 1.5302055737642422, + "language_loss": 0.76331961, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.84035593, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11993408, + "step": 9067, + "time_per_iteration": 4.019241571426392 + }, + { + "auxiliary_loss_clip": 0.06438933, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06285474, + "balance_loss_mlp": 0.01255685, + "epoch": 0.545197655193146, + "flos": 37563880435200.0, + "grad_norm": 1.8087199149855477, + "language_loss": 0.62992573, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.70699894, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12701416, + "step": 9068, + "time_per_iteration": 2.6678848266601562 + }, + { + "auxiliary_loss_clip": 0.06424262, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06280495, + "balance_loss_mlp": 0.01256163, + "epoch": 0.5452577784458139, + "flos": 26038430380800.0, + "grad_norm": 1.5391820181686233, + "language_loss": 0.72328687, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.80020058, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10949707, + "step": 9069, + "time_per_iteration": 2.555837631225586 + }, + { + "auxiliary_loss_clip": 0.06424727, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01256723, + "epoch": 0.5453179016984819, + "flos": 17644729403520.0, + "grad_norm": 1.699483734463513, + "language_loss": 0.74651837, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.82343948, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10662842, + "step": 9070, + "time_per_iteration": 2.493806838989258 + }, + { + "auxiliary_loss_clip": 0.06424981, + "auxiliary_loss_mlp": 0.01264741, + "balance_loss_clip": 0.06277809, + "balance_loss_mlp": 0.01253839, + "epoch": 0.5453780249511498, + "flos": 23222524469760.0, + "grad_norm": 1.8987434929949667, + "language_loss": 0.61238426, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.68928152, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10900879, + "step": 9071, + "time_per_iteration": 2.522620677947998 + }, + { + "auxiliary_loss_clip": 0.06331067, + "auxiliary_loss_mlp": 0.01252658, + "balance_loss_clip": 0.06269144, + "balance_loss_mlp": 0.01250867, + "epoch": 0.5454381482038179, + "flos": 68719163443200.0, + "grad_norm": 0.6892933067721945, + "language_loss": 0.57065922, + "learning_rate": 1.80305733435899e-06, + "loss": 0.64649647, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.01786804, + "step": 9072, + "time_per_iteration": 3.235288381576538 + }, + { + "auxiliary_loss_clip": 0.06422395, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06280763, + "balance_loss_mlp": 0.01257424, + "epoch": 0.5454982714564858, + "flos": 13265569393920.0, + "grad_norm": 1.8411374110080903, + "language_loss": 0.69644904, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.77335626, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10906982, + "step": 9073, + "time_per_iteration": 2.476053237915039 + }, + { + "auxiliary_loss_clip": 0.06421326, + "auxiliary_loss_mlp": 0.01272164, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01261477, + "epoch": 0.5455583947091538, + "flos": 21842439396480.0, + "grad_norm": 1.836952800264558, + "language_loss": 0.71413183, + "learning_rate": 1.802282211606627e-06, + "loss": 0.79106677, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10687256, + "step": 9074, + "time_per_iteration": 3.981220006942749 + }, + { + "auxiliary_loss_clip": 0.06424403, + "auxiliary_loss_mlp": 0.01266647, + "balance_loss_clip": 0.06278551, + "balance_loss_mlp": 0.01255364, + "epoch": 0.5456185179618217, + "flos": 17822489840640.0, + "grad_norm": 1.975994190229167, + "language_loss": 0.68697762, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.76388818, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.112854, + "step": 9075, + "time_per_iteration": 2.506155490875244 + }, + { + "auxiliary_loss_clip": 0.06425694, + "auxiliary_loss_mlp": 0.01265713, + "balance_loss_clip": 0.06281726, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5456786412144897, + "flos": 21075787664640.0, + "grad_norm": 1.6135772994791406, + "language_loss": 0.80784404, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.88475811, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10778809, + "step": 9076, + "time_per_iteration": 2.538940906524658 + }, + { + "auxiliary_loss_clip": 0.06430642, + "auxiliary_loss_mlp": 0.01272688, + "balance_loss_clip": 0.06283286, + "balance_loss_mlp": 0.01261005, + "epoch": 0.5457387644671576, + "flos": 23301712177920.0, + "grad_norm": 1.7804219771063188, + "language_loss": 0.80408549, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.88111883, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11682129, + "step": 9077, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.06424201, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06278477, + "balance_loss_mlp": 0.0125698, + "epoch": 0.5457988877198257, + "flos": 21623575731840.0, + "grad_norm": 1.8316897806182997, + "language_loss": 0.67871404, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.75563186, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1060791, + "step": 9078, + "time_per_iteration": 2.5634307861328125 + }, + { + "auxiliary_loss_clip": 0.06428619, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.0628078, + "balance_loss_mlp": 0.01256722, + "epoch": 0.5458590109724936, + "flos": 23768174257920.0, + "grad_norm": 2.0367985655242116, + "language_loss": 0.81582344, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8927964, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9079, + "time_per_iteration": 2.563260078430176 + }, + { + "auxiliary_loss_clip": 0.06434448, + "auxiliary_loss_mlp": 0.01267346, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01255341, + "epoch": 0.5459191342251616, + "flos": 24430928526720.0, + "grad_norm": 1.7111364231373303, + "language_loss": 0.76216662, + "learning_rate": 1.799957023759277e-06, + "loss": 0.83918452, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12011719, + "step": 9080, + "time_per_iteration": 2.538072347640991 + }, + { + "auxiliary_loss_clip": 0.06429628, + "auxiliary_loss_mlp": 0.0126983, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.0125816, + "epoch": 0.5459792574778296, + "flos": 23629756112640.0, + "grad_norm": 1.9762884364861095, + "language_loss": 0.83489871, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.91189325, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11669922, + "step": 9081, + "time_per_iteration": 2.583111047744751 + }, + { + "auxiliary_loss_clip": 0.06430145, + "auxiliary_loss_mlp": 0.01267495, + "balance_loss_clip": 0.0628006, + "balance_loss_mlp": 0.01256552, + "epoch": 0.5460393807304975, + "flos": 19141583541120.0, + "grad_norm": 2.327386206353707, + "language_loss": 0.70079756, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.77777398, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.10943604, + "step": 9082, + "time_per_iteration": 2.5038371086120605 + }, + { + "auxiliary_loss_clip": 0.06421287, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06277952, + "balance_loss_mlp": 0.01253959, + "epoch": 0.5460995039831655, + "flos": 35927308414080.0, + "grad_norm": 1.8952773157154152, + "language_loss": 0.66865891, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.74552357, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11224365, + "step": 9083, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.06418573, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256019, + "epoch": 0.5461596272358334, + "flos": 26766242945280.0, + "grad_norm": 1.5423197483893423, + "language_loss": 0.7895304, + "learning_rate": 1.798407050044766e-06, + "loss": 0.86638033, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10406494, + "step": 9084, + "time_per_iteration": 2.5392911434173584 + }, + { + "auxiliary_loss_clip": 0.06427852, + "auxiliary_loss_mlp": 0.01262899, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01252004, + "epoch": 0.5462197504885015, + "flos": 20892870201600.0, + "grad_norm": 1.8818428979315067, + "language_loss": 0.75159836, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.82850587, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.10900879, + "step": 9085, + "time_per_iteration": 2.5238590240478516 + }, + { + "auxiliary_loss_clip": 0.06428534, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01252995, + "epoch": 0.5462798737411694, + "flos": 25810887818880.0, + "grad_norm": 1.69825848629267, + "language_loss": 0.74606055, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.82299185, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1161499, + "step": 9086, + "time_per_iteration": 2.5416669845581055 + }, + { + "auxiliary_loss_clip": 0.06424639, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06277122, + "balance_loss_mlp": 0.01255834, + "epoch": 0.5463399969938374, + "flos": 25782027287040.0, + "grad_norm": 1.4075791244754594, + "language_loss": 0.76979077, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.84671181, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9087, + "time_per_iteration": 2.5764284133911133 + }, + { + "auxiliary_loss_clip": 0.0642488, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_clip": 0.06278133, + "balance_loss_mlp": 0.01258088, + "epoch": 0.5464001202465053, + "flos": 18849234245760.0, + "grad_norm": 1.6014949266825944, + "language_loss": 0.77368462, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.85064179, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12744141, + "step": 9088, + "time_per_iteration": 2.4971888065338135 + }, + { + "auxiliary_loss_clip": 0.06317829, + "auxiliary_loss_mlp": 0.01258554, + "balance_loss_clip": 0.062563, + "balance_loss_mlp": 0.0125685, + "epoch": 0.5464602434991733, + "flos": 69070281978240.0, + "grad_norm": 0.7120973935253039, + "language_loss": 0.57630938, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.6520732, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01708984, + "step": 9089, + "time_per_iteration": 3.251268148422241 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01270687, + "balance_loss_clip": 0.06279282, + "balance_loss_mlp": 0.01258945, + "epoch": 0.5465203667518412, + "flos": 27566870307840.0, + "grad_norm": 1.7671189132091156, + "language_loss": 0.77121699, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.84822339, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11743164, + "step": 9090, + "time_per_iteration": 2.5513298511505127 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06277205, + "balance_loss_mlp": 0.01257268, + "epoch": 0.5465804900045093, + "flos": 21215757110400.0, + "grad_norm": 1.8390444270451474, + "language_loss": 0.73801088, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.81499445, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12670898, + "step": 9091, + "time_per_iteration": 2.5593018531799316 + }, + { + "auxiliary_loss_clip": 0.06426038, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06278463, + "balance_loss_mlp": 0.01255948, + "epoch": 0.5466406132571772, + "flos": 22495005394560.0, + "grad_norm": 3.020884161734631, + "language_loss": 0.77827132, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.85521269, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12164307, + "step": 9092, + "time_per_iteration": 2.5000102519989014 + }, + { + "auxiliary_loss_clip": 0.06431385, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06280962, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5467007365098452, + "flos": 17681598000000.0, + "grad_norm": 2.033807673433485, + "language_loss": 0.75258666, + "learning_rate": 1.794920057818476e-06, + "loss": 0.82956254, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11627197, + "step": 9093, + "time_per_iteration": 2.5118560791015625 + }, + { + "auxiliary_loss_clip": 0.06426246, + "auxiliary_loss_mlp": 0.01271687, + "balance_loss_clip": 0.06277527, + "balance_loss_mlp": 0.01258634, + "epoch": 0.5467608597625132, + "flos": 15703146120960.0, + "grad_norm": 3.7072671758327993, + "language_loss": 0.69514894, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.77212822, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.13067627, + "step": 9094, + "time_per_iteration": 2.471296787261963 + }, + { + "auxiliary_loss_clip": 0.06427498, + "auxiliary_loss_mlp": 0.01268457, + "balance_loss_clip": 0.06281194, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5468209830151811, + "flos": 24319106853120.0, + "grad_norm": 3.067574771902978, + "language_loss": 0.68405867, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.76101816, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10467529, + "step": 9095, + "time_per_iteration": 2.559969186782837 + }, + { + "auxiliary_loss_clip": 0.06427877, + "auxiliary_loss_mlp": 0.01266121, + "balance_loss_clip": 0.06280283, + "balance_loss_mlp": 0.01255058, + "epoch": 0.5468811062678491, + "flos": 29173575548160.0, + "grad_norm": 1.4017188918581747, + "language_loss": 0.67021394, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.747154, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11065674, + "step": 9096, + "time_per_iteration": 2.5755646228790283 + }, + { + "auxiliary_loss_clip": 0.06321621, + "auxiliary_loss_mlp": 0.01252605, + "balance_loss_clip": 0.06259765, + "balance_loss_mlp": 0.01250808, + "epoch": 0.546941229520517, + "flos": 67885078302720.0, + "grad_norm": 0.7312259601273227, + "language_loss": 0.57564938, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.65139174, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01792908, + "step": 9097, + "time_per_iteration": 3.239208698272705 + }, + { + "auxiliary_loss_clip": 0.06323195, + "auxiliary_loss_mlp": 0.01252523, + "balance_loss_clip": 0.06261444, + "balance_loss_mlp": 0.01250845, + "epoch": 0.5470013527731851, + "flos": 58286578993920.0, + "grad_norm": 0.8922489191245683, + "language_loss": 0.64733016, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.72308731, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01681519, + "step": 9098, + "time_per_iteration": 4.485429763793945 + }, + { + "auxiliary_loss_clip": 0.06427541, + "auxiliary_loss_mlp": 0.01271404, + "balance_loss_clip": 0.06279691, + "balance_loss_mlp": 0.0125937, + "epoch": 0.547061476025853, + "flos": 22972494286080.0, + "grad_norm": 1.4988253633991158, + "language_loss": 0.73256373, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.80955321, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12042236, + "step": 9099, + "time_per_iteration": 2.5771172046661377 + }, + { + "auxiliary_loss_clip": 0.06428638, + "auxiliary_loss_mlp": 0.01265011, + "balance_loss_clip": 0.06282665, + "balance_loss_mlp": 0.01254712, + "epoch": 0.547121599278521, + "flos": 29975502648960.0, + "grad_norm": 1.9003011025398133, + "language_loss": 0.73232269, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.80925912, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10296631, + "step": 9100, + "time_per_iteration": 2.613353967666626 + }, + { + "auxiliary_loss_clip": 0.06426845, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06282172, + "balance_loss_mlp": 0.01256376, + "epoch": 0.5471817225311889, + "flos": 36543760502400.0, + "grad_norm": 3.16405552040578, + "language_loss": 0.68177283, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.75872165, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11663818, + "step": 9101, + "time_per_iteration": 2.645268440246582 + }, + { + "auxiliary_loss_clip": 0.06429439, + "auxiliary_loss_mlp": 0.01267587, + "balance_loss_clip": 0.06282283, + "balance_loss_mlp": 0.01256482, + "epoch": 0.5472418457838569, + "flos": 25782278849280.0, + "grad_norm": 1.6236525701759785, + "language_loss": 0.78028667, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.85725689, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 9102, + "time_per_iteration": 4.018383264541626 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01265935, + "balance_loss_clip": 0.06284064, + "balance_loss_mlp": 0.01255659, + "epoch": 0.5473019690365248, + "flos": 27894453045120.0, + "grad_norm": 1.4050316255430886, + "language_loss": 0.72370696, + "learning_rate": 1.791046361258413e-06, + "loss": 0.80061954, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1027832, + "step": 9103, + "time_per_iteration": 2.613557815551758 + }, + { + "auxiliary_loss_clip": 0.06427938, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06282217, + "balance_loss_mlp": 0.01257237, + "epoch": 0.5473620922891929, + "flos": 57644551411200.0, + "grad_norm": 1.2696818989696173, + "language_loss": 0.65471172, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.73167711, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11352539, + "step": 9104, + "time_per_iteration": 2.8648996353149414 + }, + { + "auxiliary_loss_clip": 0.0643408, + "auxiliary_loss_mlp": 0.01271697, + "balance_loss_clip": 0.06284557, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5474222155418608, + "flos": 19360069862400.0, + "grad_norm": 1.73787664165883, + "language_loss": 0.8214826, + "learning_rate": 1.790271716558888e-06, + "loss": 0.89854038, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12536621, + "step": 9105, + "time_per_iteration": 2.5110819339752197 + }, + { + "auxiliary_loss_clip": 0.06424334, + "auxiliary_loss_mlp": 0.01267412, + "balance_loss_clip": 0.06280238, + "balance_loss_mlp": 0.01256474, + "epoch": 0.5474823387945288, + "flos": 25127700353280.0, + "grad_norm": 1.5738849579324676, + "language_loss": 0.80505264, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.88197005, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10943604, + "step": 9106, + "time_per_iteration": 2.545797824859619 + }, + { + "auxiliary_loss_clip": 0.0642664, + "auxiliary_loss_mlp": 0.01267343, + "balance_loss_clip": 0.06280842, + "balance_loss_mlp": 0.01256334, + "epoch": 0.5475424620471967, + "flos": 18009977351040.0, + "grad_norm": 1.8936776188065845, + "language_loss": 0.69983113, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.77677101, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11010742, + "step": 9107, + "time_per_iteration": 3.930511474609375 + }, + { + "auxiliary_loss_clip": 0.06431143, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06281775, + "balance_loss_mlp": 0.01252438, + "epoch": 0.5476025852998647, + "flos": 22315819438080.0, + "grad_norm": 1.6441057037047366, + "language_loss": 0.63668221, + "learning_rate": 1.789109809193197e-06, + "loss": 0.71363103, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11309814, + "step": 9108, + "time_per_iteration": 2.548469305038452 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06281575, + "balance_loss_mlp": 0.01254632, + "epoch": 0.5476627085525327, + "flos": 20126679667200.0, + "grad_norm": 1.6544017163405356, + "language_loss": 0.75096864, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.82789409, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10174561, + "step": 9109, + "time_per_iteration": 2.505537748336792 + }, + { + "auxiliary_loss_clip": 0.06426554, + "auxiliary_loss_mlp": 0.01271245, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259235, + "epoch": 0.5477228318052006, + "flos": 17718382742400.0, + "grad_norm": 1.7609925306613563, + "language_loss": 0.78101015, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.85798812, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.12005615, + "step": 9110, + "time_per_iteration": 2.5898001194000244 + }, + { + "auxiliary_loss_clip": 0.0642444, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01253948, + "epoch": 0.5477829550578687, + "flos": 25856057969280.0, + "grad_norm": 1.4117567478996924, + "language_loss": 0.71281165, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.78970265, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10699463, + "step": 9111, + "time_per_iteration": 2.5514800548553467 + }, + { + "auxiliary_loss_clip": 0.06428348, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06282744, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5478430783105366, + "flos": 23046399187200.0, + "grad_norm": 1.7318252125729088, + "language_loss": 0.71129775, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.7882387, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1171875, + "step": 9112, + "time_per_iteration": 2.5733911991119385 + }, + { + "auxiliary_loss_clip": 0.06428306, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.0125412, + "epoch": 0.5479032015632046, + "flos": 16076821403520.0, + "grad_norm": 1.865243038866792, + "language_loss": 0.88150853, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.95844346, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1105957, + "step": 9113, + "time_per_iteration": 4.03569483757019 + }, + { + "auxiliary_loss_clip": 0.06427854, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.0628054, + "balance_loss_mlp": 0.01254171, + "epoch": 0.5479633248158725, + "flos": 24285382784640.0, + "grad_norm": 1.9056802782338742, + "language_loss": 0.73404038, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.81097698, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9114, + "time_per_iteration": 2.552778959274292 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5480234480685405, + "flos": 26365216504320.0, + "grad_norm": 1.4540698273743113, + "language_loss": 0.72457099, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.80148405, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10662842, + "step": 9115, + "time_per_iteration": 2.5838403701782227 + }, + { + "auxiliary_loss_clip": 0.06436512, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06284098, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5480835713212084, + "flos": 22061722331520.0, + "grad_norm": 1.7541916767056687, + "language_loss": 0.72373956, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.80078137, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.1161499, + "step": 9116, + "time_per_iteration": 2.5292439460754395 + }, + { + "auxiliary_loss_clip": 0.06426133, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5481436945738765, + "flos": 25308018339840.0, + "grad_norm": 1.941043285146296, + "language_loss": 0.76906073, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.84599322, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9117, + "time_per_iteration": 2.5854122638702393 + }, + { + "auxiliary_loss_clip": 0.06421119, + "auxiliary_loss_mlp": 0.01264207, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01253532, + "epoch": 0.5482038178265444, + "flos": 33588807540480.0, + "grad_norm": 1.613198613591587, + "language_loss": 0.62954283, + "learning_rate": 1.785237306671674e-06, + "loss": 0.7063961, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10675049, + "step": 9118, + "time_per_iteration": 2.61136531829834 + }, + { + "auxiliary_loss_clip": 0.06429429, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.0125436, + "epoch": 0.5482639410792124, + "flos": 19032235562880.0, + "grad_norm": 1.6774564392555322, + "language_loss": 0.79138243, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.86833954, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11920166, + "step": 9119, + "time_per_iteration": 2.5309953689575195 + }, + { + "auxiliary_loss_clip": 0.06425598, + "auxiliary_loss_mlp": 0.01271106, + "balance_loss_clip": 0.06281713, + "balance_loss_mlp": 0.0126033, + "epoch": 0.5483240643318803, + "flos": 25417282464000.0, + "grad_norm": 1.5630724809093546, + "language_loss": 0.82719064, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.9041577, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10772705, + "step": 9120, + "time_per_iteration": 2.551790952682495 + }, + { + "auxiliary_loss_clip": 0.06432922, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01253292, + "epoch": 0.5483841875845483, + "flos": 21472705255680.0, + "grad_norm": 1.7308751336861314, + "language_loss": 0.80248237, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.87946028, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11566162, + "step": 9121, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.06429829, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06280297, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5484443108372163, + "flos": 24753060748800.0, + "grad_norm": 1.8214688446413962, + "language_loss": 0.6171329, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.69410121, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11352539, + "step": 9122, + "time_per_iteration": 2.536548614501953 + }, + { + "auxiliary_loss_clip": 0.06426375, + "auxiliary_loss_mlp": 0.01268013, + "balance_loss_clip": 0.06283108, + "balance_loss_mlp": 0.0125729, + "epoch": 0.5485044340898843, + "flos": 25382594073600.0, + "grad_norm": 1.6758320366866328, + "language_loss": 0.71812153, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7950654, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1071167, + "step": 9123, + "time_per_iteration": 2.563128709793091 + }, + { + "auxiliary_loss_clip": 0.06422795, + "auxiliary_loss_mlp": 0.01264644, + "balance_loss_clip": 0.06277866, + "balance_loss_mlp": 0.01254839, + "epoch": 0.5485645573425523, + "flos": 12646140485760.0, + "grad_norm": 2.0499300220900367, + "language_loss": 0.83466411, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.91153848, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.09802246, + "step": 9124, + "time_per_iteration": 2.4774932861328125 + }, + { + "auxiliary_loss_clip": 0.06423289, + "auxiliary_loss_mlp": 0.01272789, + "balance_loss_clip": 0.06280372, + "balance_loss_mlp": 0.01262054, + "epoch": 0.5486246805952202, + "flos": 28336918129920.0, + "grad_norm": 1.5704023496451165, + "language_loss": 0.80787551, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.88483626, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10736084, + "step": 9125, + "time_per_iteration": 2.6640827655792236 + }, + { + "auxiliary_loss_clip": 0.06429766, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.0125558, + "epoch": 0.5486848038478882, + "flos": 16805598289920.0, + "grad_norm": 1.778522251586277, + "language_loss": 0.74475932, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.82172436, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1114502, + "step": 9126, + "time_per_iteration": 2.4920494556427 + }, + { + "auxiliary_loss_clip": 0.0643461, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06284419, + "balance_loss_mlp": 0.01260383, + "epoch": 0.5487449271005561, + "flos": 17241606610560.0, + "grad_norm": 2.5065680491325217, + "language_loss": 0.66843152, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.74549675, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11535645, + "step": 9127, + "time_per_iteration": 2.498995304107666 + }, + { + "auxiliary_loss_clip": 0.0642729, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06281507, + "balance_loss_mlp": 0.01257072, + "epoch": 0.5488050503532241, + "flos": 17345462146560.0, + "grad_norm": 1.8347258108428224, + "language_loss": 0.83430481, + "learning_rate": 1.781365618532181e-06, + "loss": 0.91127241, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1239624, + "step": 9128, + "time_per_iteration": 2.4851553440093994 + }, + { + "auxiliary_loss_clip": 0.06423862, + "auxiliary_loss_mlp": 0.01267411, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01256032, + "epoch": 0.548865173605892, + "flos": 17245044627840.0, + "grad_norm": 1.9721748285442382, + "language_loss": 0.73992771, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.81684041, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1138916, + "step": 9129, + "time_per_iteration": 2.5088050365448 + }, + { + "auxiliary_loss_clip": 0.06436306, + "auxiliary_loss_mlp": 0.0126816, + "balance_loss_clip": 0.0628598, + "balance_loss_mlp": 0.01256108, + "epoch": 0.5489252968585601, + "flos": 17462398919040.0, + "grad_norm": 2.1982698674747745, + "language_loss": 0.63327444, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.7103191, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12054443, + "step": 9130, + "time_per_iteration": 2.4861414432525635 + }, + { + "auxiliary_loss_clip": 0.06431893, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06281481, + "balance_loss_mlp": 0.01255046, + "epoch": 0.548985420111228, + "flos": 26330653895040.0, + "grad_norm": 1.729948569228587, + "language_loss": 0.63358611, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.71057326, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 9131, + "time_per_iteration": 2.589580535888672 + }, + { + "auxiliary_loss_clip": 0.0643028, + "auxiliary_loss_mlp": 0.01268323, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01255955, + "epoch": 0.549045543363896, + "flos": 18699034602240.0, + "grad_norm": 1.7539544854272515, + "language_loss": 0.75148702, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.82847303, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12353516, + "step": 9132, + "time_per_iteration": 2.461970329284668 + }, + { + "auxiliary_loss_clip": 0.06429279, + "auxiliary_loss_mlp": 0.01266105, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5491056666165639, + "flos": 24724284071040.0, + "grad_norm": 2.6052413777049144, + "language_loss": 0.8162328, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.89318669, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.10870361, + "step": 9133, + "time_per_iteration": 2.5799684524536133 + }, + { + "auxiliary_loss_clip": 0.06426433, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06280407, + "balance_loss_mlp": 0.01259691, + "epoch": 0.5491657898692319, + "flos": 21582849847680.0, + "grad_norm": 1.8788464104374898, + "language_loss": 0.70385146, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.78082585, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11328125, + "step": 9134, + "time_per_iteration": 2.5116565227508545 + }, + { + "auxiliary_loss_clip": 0.06431407, + "auxiliary_loss_mlp": 0.01267106, + "balance_loss_clip": 0.06281983, + "balance_loss_mlp": 0.01256062, + "epoch": 0.5492259131219, + "flos": 50487653825280.0, + "grad_norm": 2.3217483044436955, + "language_loss": 0.61379695, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.69078213, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11035156, + "step": 9135, + "time_per_iteration": 2.8019859790802 + }, + { + "auxiliary_loss_clip": 0.06430922, + "auxiliary_loss_mlp": 0.01266434, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01254591, + "epoch": 0.5492860363745679, + "flos": 25126316760960.0, + "grad_norm": 1.8569102400294533, + "language_loss": 0.72833902, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.80531251, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11846924, + "step": 9136, + "time_per_iteration": 2.5313796997070312 + }, + { + "auxiliary_loss_clip": 0.06434008, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255551, + "epoch": 0.5493461596272359, + "flos": 22639670668800.0, + "grad_norm": 2.4335907064216302, + "language_loss": 0.6873585, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.76437736, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12329102, + "step": 9137, + "time_per_iteration": 2.606400489807129 + }, + { + "auxiliary_loss_clip": 0.06325421, + "auxiliary_loss_mlp": 0.01260391, + "balance_loss_clip": 0.06263588, + "balance_loss_mlp": 0.01258753, + "epoch": 0.5494062828799038, + "flos": 66169486281600.0, + "grad_norm": 0.7309885412732349, + "language_loss": 0.65176189, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.72762001, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.0164032, + "step": 9138, + "time_per_iteration": 4.603189945220947 + }, + { + "auxiliary_loss_clip": 0.06431855, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01253803, + "epoch": 0.5494664061325718, + "flos": 21112362771840.0, + "grad_norm": 1.7352131741027665, + "language_loss": 0.75659418, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.83356863, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11785889, + "step": 9139, + "time_per_iteration": 2.5063250064849854 + }, + { + "auxiliary_loss_clip": 0.06427477, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06281833, + "balance_loss_mlp": 0.01257599, + "epoch": 0.5495265293852397, + "flos": 14397846416640.0, + "grad_norm": 2.090947018102217, + "language_loss": 0.71453607, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.79149961, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11273193, + "step": 9140, + "time_per_iteration": 2.516493558883667 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06279987, + "balance_loss_mlp": 0.01254623, + "epoch": 0.5495866526379077, + "flos": 25554945922560.0, + "grad_norm": 1.591757169874098, + "language_loss": 0.76439172, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.84131408, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.1105957, + "step": 9141, + "time_per_iteration": 4.032621383666992 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06278077, + "balance_loss_mlp": 0.01257648, + "epoch": 0.5496467758905756, + "flos": 21322421758080.0, + "grad_norm": 1.9135284052459163, + "language_loss": 0.75301933, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.82990575, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10577393, + "step": 9142, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.06433351, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01259895, + "epoch": 0.5497068991432437, + "flos": 22239021571200.0, + "grad_norm": 1.7111366793556597, + "language_loss": 0.77014959, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.84720296, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12091064, + "step": 9143, + "time_per_iteration": 2.516505002975464 + }, + { + "auxiliary_loss_clip": 0.06424481, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06278251, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5497670223959116, + "flos": 18485076401280.0, + "grad_norm": 3.356687572137957, + "language_loss": 0.79973668, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.87666219, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11663818, + "step": 9144, + "time_per_iteration": 2.4832475185394287 + }, + { + "auxiliary_loss_clip": 0.0642961, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06281358, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5498271456485796, + "flos": 29212750131840.0, + "grad_norm": 1.7313830940317911, + "language_loss": 0.7154156, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.79239666, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11346436, + "step": 9145, + "time_per_iteration": 2.6261048316955566 + }, + { + "auxiliary_loss_clip": 0.06426725, + "auxiliary_loss_mlp": 0.01264568, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5498872689012475, + "flos": 34833032017920.0, + "grad_norm": 1.5682468167397778, + "language_loss": 0.70529747, + "learning_rate": 1.774398678985076e-06, + "loss": 0.78221035, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10369873, + "step": 9146, + "time_per_iteration": 4.087557315826416 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01264014, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01253923, + "epoch": 0.5499473921539155, + "flos": 25929124329600.0, + "grad_norm": 2.0128119517228305, + "language_loss": 0.64188051, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.71871173, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10095215, + "step": 9147, + "time_per_iteration": 2.5406603813171387 + }, + { + "auxiliary_loss_clip": 0.06424303, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06279408, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5500075154065835, + "flos": 22280334433920.0, + "grad_norm": 1.893989099652022, + "language_loss": 0.81534255, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.89224386, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1071167, + "step": 9148, + "time_per_iteration": 2.5051376819610596 + }, + { + "auxiliary_loss_clip": 0.06424436, + "auxiliary_loss_mlp": 0.01270935, + "balance_loss_clip": 0.06277981, + "balance_loss_mlp": 0.0125992, + "epoch": 0.5500676386592515, + "flos": 28044946177920.0, + "grad_norm": 1.7460739337347344, + "language_loss": 0.7916007, + "learning_rate": 1.773237789559453e-06, + "loss": 0.86855441, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9149, + "time_per_iteration": 2.5586931705474854 + }, + { + "auxiliary_loss_clip": 0.0642364, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01253852, + "epoch": 0.5501277619119195, + "flos": 23921602283520.0, + "grad_norm": 2.0079288501902965, + "language_loss": 0.7263124, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.80319625, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10888672, + "step": 9150, + "time_per_iteration": 2.5097196102142334 + }, + { + "auxiliary_loss_clip": 0.06428004, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06278474, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5501878851645874, + "flos": 20930199995520.0, + "grad_norm": 1.7516173490285718, + "language_loss": 0.74991822, + "learning_rate": 1.772463906245477e-06, + "loss": 0.82685369, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12103271, + "step": 9151, + "time_per_iteration": 2.4953532218933105 + }, + { + "auxiliary_loss_clip": 0.06421181, + "auxiliary_loss_mlp": 0.01264237, + "balance_loss_clip": 0.06275992, + "balance_loss_mlp": 0.01253317, + "epoch": 0.5502480084172554, + "flos": 20671155498240.0, + "grad_norm": 1.7180580365194615, + "language_loss": 0.76128006, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.83813429, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10925293, + "step": 9152, + "time_per_iteration": 2.5041630268096924 + }, + { + "auxiliary_loss_clip": 0.06418908, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06276076, + "balance_loss_mlp": 0.01254336, + "epoch": 0.5503081316699233, + "flos": 26439792238080.0, + "grad_norm": 3.86516963702514, + "language_loss": 0.82636946, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.90320837, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10650635, + "step": 9153, + "time_per_iteration": 4.000823259353638 + }, + { + "auxiliary_loss_clip": 0.06419568, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01254682, + "epoch": 0.5503682549225913, + "flos": 30637208741760.0, + "grad_norm": 1.7185020713354737, + "language_loss": 0.7442615, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.82112032, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11633301, + "step": 9154, + "time_per_iteration": 2.619478225708008 + }, + { + "auxiliary_loss_clip": 0.06431979, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01256273, + "epoch": 0.5504283781752592, + "flos": 22572096750720.0, + "grad_norm": 1.5448619232700234, + "language_loss": 0.73359931, + "learning_rate": 1.770916243273199e-06, + "loss": 0.81059402, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11212158, + "step": 9155, + "time_per_iteration": 2.5512940883636475 + }, + { + "auxiliary_loss_clip": 0.0632084, + "auxiliary_loss_mlp": 0.01252943, + "balance_loss_clip": 0.06258567, + "balance_loss_mlp": 0.01251311, + "epoch": 0.5504885014279273, + "flos": 67918634663040.0, + "grad_norm": 0.7176527357407121, + "language_loss": 0.5550307, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.63076854, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01634216, + "step": 9156, + "time_per_iteration": 3.3401191234588623 + }, + { + "auxiliary_loss_clip": 0.06423487, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06277417, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5505486246805952, + "flos": 22455705029760.0, + "grad_norm": 1.7228062733410818, + "language_loss": 0.82601535, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.90289015, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09881592, + "step": 9157, + "time_per_iteration": 2.5331945419311523 + }, + { + "auxiliary_loss_clip": 0.06433383, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06279938, + "balance_loss_mlp": 0.01255885, + "epoch": 0.5506087479332632, + "flos": 26914220455680.0, + "grad_norm": 2.384583042502796, + "language_loss": 0.7632947, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.84030461, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11743164, + "step": 9158, + "time_per_iteration": 2.5622854232788086 + }, + { + "auxiliary_loss_clip": 0.06422579, + "auxiliary_loss_mlp": 0.01265094, + "balance_loss_clip": 0.06281133, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5506688711859311, + "flos": 22936967354880.0, + "grad_norm": 1.858566635879154, + "language_loss": 0.70421213, + "learning_rate": 1.769368719290979e-06, + "loss": 0.78108883, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.09967041, + "step": 9159, + "time_per_iteration": 2.5299885272979736 + }, + { + "auxiliary_loss_clip": 0.06426555, + "auxiliary_loss_mlp": 0.01265176, + "balance_loss_clip": 0.06279982, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5507289944385991, + "flos": 29614111989120.0, + "grad_norm": 1.5102709537150474, + "language_loss": 0.68438101, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7612983, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11065674, + "step": 9160, + "time_per_iteration": 2.5797348022460938 + }, + { + "auxiliary_loss_clip": 0.06423666, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06278166, + "balance_loss_mlp": 0.01252774, + "epoch": 0.5507891176912671, + "flos": 15338736714240.0, + "grad_norm": 1.8978617290593418, + "language_loss": 0.7231009, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.79998016, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11474609, + "step": 9161, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.06425308, + "auxiliary_loss_mlp": 0.01270177, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01259472, + "epoch": 0.5508492409439351, + "flos": 26585547615360.0, + "grad_norm": 4.143741197260591, + "language_loss": 0.69514179, + "learning_rate": 1.768208168081359e-06, + "loss": 0.77209663, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10717773, + "step": 9162, + "time_per_iteration": 2.601036548614502 + }, + { + "auxiliary_loss_clip": 0.06422161, + "auxiliary_loss_mlp": 0.01271792, + "balance_loss_clip": 0.06278013, + "balance_loss_mlp": 0.01261164, + "epoch": 0.5509093641966031, + "flos": 25449832575360.0, + "grad_norm": 1.6789972101454846, + "language_loss": 0.85959709, + "learning_rate": 1.767821335237733e-06, + "loss": 0.93653667, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10638428, + "step": 9163, + "time_per_iteration": 2.539546489715576 + }, + { + "auxiliary_loss_clip": 0.06425934, + "auxiliary_loss_mlp": 0.0126949, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01258856, + "epoch": 0.550969487449271, + "flos": 18704652825600.0, + "grad_norm": 1.572244133846192, + "language_loss": 0.81101871, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.88797295, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10638428, + "step": 9164, + "time_per_iteration": 2.5266709327697754 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271715, + "balance_loss_clip": 0.06278498, + "balance_loss_mlp": 0.01260026, + "epoch": 0.551029610701939, + "flos": 22714959162240.0, + "grad_norm": 1.8760540237074659, + "language_loss": 0.73664248, + "learning_rate": 1.767047695977863e-06, + "loss": 0.81363511, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11688232, + "step": 9165, + "time_per_iteration": 2.511892318725586 + }, + { + "auxiliary_loss_clip": 0.06419477, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01258479, + "epoch": 0.5510897339546069, + "flos": 12425138542080.0, + "grad_norm": 2.0479120482719084, + "language_loss": 0.79496598, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.87185252, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10687256, + "step": 9166, + "time_per_iteration": 2.5217325687408447 + }, + { + "auxiliary_loss_clip": 0.06426241, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06279847, + "balance_loss_mlp": 0.01257232, + "epoch": 0.5511498572072749, + "flos": 18776545228800.0, + "grad_norm": 2.094065158330193, + "language_loss": 0.77047074, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.84742099, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11560059, + "step": 9167, + "time_per_iteration": 2.5210516452789307 + }, + { + "auxiliary_loss_clip": 0.06422734, + "auxiliary_loss_mlp": 0.01276612, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01264995, + "epoch": 0.5512099804599428, + "flos": 19579436651520.0, + "grad_norm": 1.8110306936777156, + "language_loss": 0.80698925, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.88398266, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11627197, + "step": 9168, + "time_per_iteration": 2.5044801235198975 + }, + { + "auxiliary_loss_clip": 0.06426235, + "auxiliary_loss_mlp": 0.01266078, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5512701037126109, + "flos": 26252053165440.0, + "grad_norm": 1.768039916500128, + "language_loss": 0.6941396, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.77106273, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10876465, + "step": 9169, + "time_per_iteration": 2.5712435245513916 + }, + { + "auxiliary_loss_clip": 0.06426435, + "auxiliary_loss_mlp": 0.01277267, + "balance_loss_clip": 0.06284146, + "balance_loss_mlp": 0.01267092, + "epoch": 0.5513302269652788, + "flos": 21951997009920.0, + "grad_norm": 1.7919633768432253, + "language_loss": 0.85238504, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.92942202, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10174561, + "step": 9170, + "time_per_iteration": 2.6517226696014404 + }, + { + "auxiliary_loss_clip": 0.06339835, + "auxiliary_loss_mlp": 0.01252247, + "balance_loss_clip": 0.06277715, + "balance_loss_mlp": 0.01250597, + "epoch": 0.5513903502179468, + "flos": 68254728589440.0, + "grad_norm": 0.7663699077680228, + "language_loss": 0.59884483, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.67476565, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01652527, + "step": 9171, + "time_per_iteration": 3.190981864929199 + }, + { + "auxiliary_loss_clip": 0.06426144, + "auxiliary_loss_mlp": 0.01271114, + "balance_loss_clip": 0.06280371, + "balance_loss_mlp": 0.01260159, + "epoch": 0.5514504734706147, + "flos": 18740221683840.0, + "grad_norm": 1.5861452481841698, + "language_loss": 0.7047599, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.78173256, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10961914, + "step": 9172, + "time_per_iteration": 2.5032176971435547 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01263218, + "epoch": 0.5515105967232827, + "flos": 22277147978880.0, + "grad_norm": 1.7175476935278873, + "language_loss": 0.76203263, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.8390317, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10498047, + "step": 9173, + "time_per_iteration": 2.577878713607788 + }, + { + "auxiliary_loss_clip": 0.06421756, + "auxiliary_loss_mlp": 0.01264421, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01253359, + "epoch": 0.5515707199759508, + "flos": 22563040510080.0, + "grad_norm": 1.5999460100016771, + "language_loss": 0.75182664, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.82868844, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11077881, + "step": 9174, + "time_per_iteration": 2.520578384399414 + }, + { + "auxiliary_loss_clip": 0.06429856, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06282729, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5516308432286187, + "flos": 28298246670720.0, + "grad_norm": 1.7068220971376928, + "language_loss": 0.72958624, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.80653572, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11206055, + "step": 9175, + "time_per_iteration": 2.5991220474243164 + }, + { + "auxiliary_loss_clip": 0.06423448, + "auxiliary_loss_mlp": 0.01272105, + "balance_loss_clip": 0.06278881, + "balance_loss_mlp": 0.01261192, + "epoch": 0.5516909664812867, + "flos": 18769417632000.0, + "grad_norm": 1.996679187528513, + "language_loss": 0.69295454, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.7699101, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10919189, + "step": 9176, + "time_per_iteration": 2.4903998374938965 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.01270885, + "balance_loss_clip": 0.06278497, + "balance_loss_mlp": 0.01260467, + "epoch": 0.5517510897339546, + "flos": 27746852878080.0, + "grad_norm": 1.714802927656724, + "language_loss": 0.71279752, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.78971648, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10418701, + "step": 9177, + "time_per_iteration": 3.9531290531158447 + }, + { + "auxiliary_loss_clip": 0.06428478, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06282966, + "balance_loss_mlp": 0.0125924, + "epoch": 0.5518112129866226, + "flos": 18410165251200.0, + "grad_norm": 1.801915682479776, + "language_loss": 0.80691963, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.8839004, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10369873, + "step": 9178, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06432515, + "auxiliary_loss_mlp": 0.01265625, + "balance_loss_clip": 0.06282209, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5518713362392905, + "flos": 25089699726720.0, + "grad_norm": 1.5622133019409348, + "language_loss": 0.7545979, + "learning_rate": 1.761633217089826e-06, + "loss": 0.83157933, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11395264, + "step": 9179, + "time_per_iteration": 2.598055124282837 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01259005, + "epoch": 0.5519314594919585, + "flos": 36547911279360.0, + "grad_norm": 1.6999645614086591, + "language_loss": 0.70073718, + "learning_rate": 1.761246535912924e-06, + "loss": 0.77768701, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1060791, + "step": 9180, + "time_per_iteration": 2.6791419982910156 + }, + { + "auxiliary_loss_clip": 0.06424871, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06279478, + "balance_loss_mlp": 0.01257121, + "epoch": 0.5519915827446265, + "flos": 20454807456000.0, + "grad_norm": 1.7661274413355668, + "language_loss": 0.67505682, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.75199056, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11376953, + "step": 9181, + "time_per_iteration": 4.004978656768799 + }, + { + "auxiliary_loss_clip": 0.06431428, + "auxiliary_loss_mlp": 0.01267631, + "balance_loss_clip": 0.06280805, + "balance_loss_mlp": 0.01256682, + "epoch": 0.5520517059972945, + "flos": 23774672949120.0, + "grad_norm": 1.9095811471330626, + "language_loss": 0.79281217, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.86980277, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10949707, + "step": 9182, + "time_per_iteration": 2.537867546081543 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.0126956, + "balance_loss_clip": 0.06281601, + "balance_loss_mlp": 0.01258259, + "epoch": 0.5521118292499624, + "flos": 22202362609920.0, + "grad_norm": 1.7640468757897252, + "language_loss": 0.83230162, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.9092862, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11303711, + "step": 9183, + "time_per_iteration": 2.5279808044433594 + }, + { + "auxiliary_loss_clip": 0.0642349, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.0627853, + "balance_loss_mlp": 0.01259632, + "epoch": 0.5521719525026304, + "flos": 23589491425920.0, + "grad_norm": 1.2800662076099543, + "language_loss": 0.67446053, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.75139618, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10437012, + "step": 9184, + "time_per_iteration": 2.684945821762085 + }, + { + "auxiliary_loss_clip": 0.06425154, + "auxiliary_loss_mlp": 0.01269673, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01258652, + "epoch": 0.5522320757552983, + "flos": 26144298414720.0, + "grad_norm": 1.5606033277911597, + "language_loss": 0.76214409, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.83909237, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11016846, + "step": 9185, + "time_per_iteration": 2.654999017715454 + }, + { + "auxiliary_loss_clip": 0.06428938, + "auxiliary_loss_mlp": 0.01270824, + "balance_loss_clip": 0.06280778, + "balance_loss_mlp": 0.01259661, + "epoch": 0.5522921990079663, + "flos": 24682258448640.0, + "grad_norm": 1.714573937603497, + "language_loss": 0.73903292, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.8160305, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1116333, + "step": 9186, + "time_per_iteration": 4.173564672470093 + }, + { + "auxiliary_loss_clip": 0.06430478, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01260032, + "epoch": 0.5523523222606344, + "flos": 22754888432640.0, + "grad_norm": 1.9890242222634391, + "language_loss": 0.66822404, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.74523699, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10784912, + "step": 9187, + "time_per_iteration": 2.5402488708496094 + }, + { + "auxiliary_loss_clip": 0.06424463, + "auxiliary_loss_mlp": 0.01272464, + "balance_loss_clip": 0.06278258, + "balance_loss_mlp": 0.01261663, + "epoch": 0.5524124455133023, + "flos": 19761976771200.0, + "grad_norm": 1.6249988598177185, + "language_loss": 0.77965587, + "learning_rate": 1.758153413657318e-06, + "loss": 0.85662508, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10803223, + "step": 9188, + "time_per_iteration": 2.4915547370910645 + }, + { + "auxiliary_loss_clip": 0.06426179, + "auxiliary_loss_mlp": 0.01274155, + "balance_loss_clip": 0.06280048, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5524725687659703, + "flos": 23301544469760.0, + "grad_norm": 1.615723789328545, + "language_loss": 0.81586993, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.89287329, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11572266, + "step": 9189, + "time_per_iteration": 2.540083885192871 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06276601, + "balance_loss_mlp": 0.0125776, + "epoch": 0.5525326920186382, + "flos": 24868907418240.0, + "grad_norm": 1.331008644060519, + "language_loss": 0.76847303, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.84535837, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1159668, + "step": 9190, + "time_per_iteration": 2.597717046737671 + }, + { + "auxiliary_loss_clip": 0.0643147, + "auxiliary_loss_mlp": 0.01272383, + "balance_loss_clip": 0.06278718, + "balance_loss_mlp": 0.01260438, + "epoch": 0.5525928152713062, + "flos": 13740710371200.0, + "grad_norm": 2.3910114977567787, + "language_loss": 0.79437977, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.87141836, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 9191, + "time_per_iteration": 2.547445774078369 + }, + { + "auxiliary_loss_clip": 0.06422585, + "auxiliary_loss_mlp": 0.01264097, + "balance_loss_clip": 0.06276913, + "balance_loss_mlp": 0.01253624, + "epoch": 0.5526529385239741, + "flos": 13075398552960.0, + "grad_norm": 2.207227027061606, + "language_loss": 0.6899271, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.76679391, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10473633, + "step": 9192, + "time_per_iteration": 2.4774858951568604 + }, + { + "auxiliary_loss_clip": 0.06421191, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257786, + "epoch": 0.5527130617766421, + "flos": 23154992478720.0, + "grad_norm": 1.5351732563488263, + "language_loss": 0.77348876, + "learning_rate": 1.756220509823588e-06, + "loss": 0.85038239, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10400391, + "step": 9193, + "time_per_iteration": 3.9115588665008545 + }, + { + "auxiliary_loss_clip": 0.06421337, + "auxiliary_loss_mlp": 0.01271193, + "balance_loss_clip": 0.06275223, + "balance_loss_mlp": 0.01260357, + "epoch": 0.55277318502931, + "flos": 21291506801280.0, + "grad_norm": 1.5126002389204065, + "language_loss": 0.79036456, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.8672899, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1083374, + "step": 9194, + "time_per_iteration": 2.5319602489471436 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06274066, + "balance_loss_mlp": 0.01258189, + "epoch": 0.5528333082819781, + "flos": 38333383205760.0, + "grad_norm": 1.8079647356103097, + "language_loss": 0.70506799, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.78203559, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11340332, + "step": 9195, + "time_per_iteration": 2.6384387016296387 + }, + { + "auxiliary_loss_clip": 0.06436112, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281462, + "balance_loss_mlp": 0.01253778, + "epoch": 0.552893431534646, + "flos": 13558799157120.0, + "grad_norm": 2.003941554047622, + "language_loss": 0.74570775, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.82273173, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12506104, + "step": 9196, + "time_per_iteration": 2.5033600330352783 + }, + { + "auxiliary_loss_clip": 0.06429259, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0628302, + "balance_loss_mlp": 0.01252656, + "epoch": 0.552953554787314, + "flos": 21944995194240.0, + "grad_norm": 1.6318385903460113, + "language_loss": 0.77179539, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.8487258, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11126709, + "step": 9197, + "time_per_iteration": 2.500624895095825 + }, + { + "auxiliary_loss_clip": 0.06421226, + "auxiliary_loss_mlp": 0.01269574, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01259316, + "epoch": 0.5530136780399819, + "flos": 43668820736640.0, + "grad_norm": 1.4562548285485233, + "language_loss": 0.76468647, + "learning_rate": 1.754287837093407e-06, + "loss": 0.84159452, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.1026001, + "step": 9198, + "time_per_iteration": 2.7432668209075928 + }, + { + "auxiliary_loss_clip": 0.06427757, + "auxiliary_loss_mlp": 0.0126746, + "balance_loss_clip": 0.06281044, + "balance_loss_mlp": 0.01256994, + "epoch": 0.5530738012926499, + "flos": 25052411859840.0, + "grad_norm": 1.5004430901507595, + "language_loss": 0.79301012, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.86996233, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10461426, + "step": 9199, + "time_per_iteration": 2.547755241394043 + }, + { + "auxiliary_loss_clip": 0.06422742, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06276976, + "balance_loss_mlp": 0.01255962, + "epoch": 0.553133924545318, + "flos": 16477680136320.0, + "grad_norm": 1.9305306774012563, + "language_loss": 0.63492346, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.71181637, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.10577393, + "step": 9200, + "time_per_iteration": 2.5127363204956055 + }, + { + "auxiliary_loss_clip": 0.06431345, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06280623, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5531940477979859, + "flos": 24612797813760.0, + "grad_norm": 1.757338852617271, + "language_loss": 0.66817963, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.74514735, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11560059, + "step": 9201, + "time_per_iteration": 2.5651068687438965 + }, + { + "auxiliary_loss_clip": 0.06425701, + "auxiliary_loss_mlp": 0.01270434, + "balance_loss_clip": 0.06278911, + "balance_loss_mlp": 0.0125871, + "epoch": 0.5532541710506539, + "flos": 22165410159360.0, + "grad_norm": 2.045638683899954, + "language_loss": 0.61266994, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.68963134, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11737061, + "step": 9202, + "time_per_iteration": 2.5841257572174072 + }, + { + "auxiliary_loss_clip": 0.06419975, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01253493, + "epoch": 0.5533142943033218, + "flos": 21403621964160.0, + "grad_norm": 1.6777411475808515, + "language_loss": 0.64766765, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.72451103, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9203, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01269086, + "balance_loss_clip": 0.06279255, + "balance_loss_mlp": 0.01258065, + "epoch": 0.5533744175559898, + "flos": 23557360584960.0, + "grad_norm": 1.630044734052438, + "language_loss": 0.63918829, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.71613109, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11022949, + "step": 9204, + "time_per_iteration": 2.5487308502197266 + }, + { + "auxiliary_loss_clip": 0.0642142, + "auxiliary_loss_mlp": 0.01264869, + "balance_loss_clip": 0.06278381, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5534345408086577, + "flos": 24068447763840.0, + "grad_norm": 1.4496742073495597, + "language_loss": 0.77449042, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.85135335, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10418701, + "step": 9205, + "time_per_iteration": 2.5445451736450195 + }, + { + "auxiliary_loss_clip": 0.06419459, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06277758, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5534946640613257, + "flos": 33781242441600.0, + "grad_norm": 1.38023808830968, + "language_loss": 0.72729224, + "learning_rate": 1.751196045993537e-06, + "loss": 0.80413151, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1036377, + "step": 9206, + "time_per_iteration": 2.7339117527008057 + }, + { + "auxiliary_loss_clip": 0.06421407, + "auxiliary_loss_mlp": 0.01265704, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01255005, + "epoch": 0.5535547873139937, + "flos": 15164707783680.0, + "grad_norm": 1.9977188658051825, + "language_loss": 0.7547437, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.83161485, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10699463, + "step": 9207, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06436527, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06285885, + "balance_loss_mlp": 0.01254493, + "epoch": 0.5536149105666617, + "flos": 16986209765760.0, + "grad_norm": 2.498092208232672, + "language_loss": 0.61888683, + "learning_rate": 1.750423192272189e-06, + "loss": 0.69591099, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.1138916, + "step": 9208, + "time_per_iteration": 2.493628740310669 + }, + { + "auxiliary_loss_clip": 0.06428279, + "auxiliary_loss_mlp": 0.01268207, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01256543, + "epoch": 0.5536750338193296, + "flos": 18155732728320.0, + "grad_norm": 2.094677241914043, + "language_loss": 0.64708155, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.72404641, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11663818, + "step": 9209, + "time_per_iteration": 2.4616804122924805 + }, + { + "auxiliary_loss_clip": 0.06424735, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06279891, + "balance_loss_mlp": 0.01260863, + "epoch": 0.5537351570719976, + "flos": 22754469162240.0, + "grad_norm": 1.8280568303571236, + "language_loss": 0.82967091, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.90663934, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11242676, + "step": 9210, + "time_per_iteration": 2.564713954925537 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.06275869, + "balance_loss_mlp": 0.01255381, + "epoch": 0.5537952803246655, + "flos": 26362658954880.0, + "grad_norm": 1.71176011345987, + "language_loss": 0.72960317, + "learning_rate": 1.74926398270663e-06, + "loss": 0.80644828, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10430908, + "step": 9211, + "time_per_iteration": 2.5312066078186035 + }, + { + "auxiliary_loss_clip": 0.06431179, + "auxiliary_loss_mlp": 0.01267507, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01256045, + "epoch": 0.5538554035773335, + "flos": 18042695170560.0, + "grad_norm": 2.3508559175952803, + "language_loss": 0.67497891, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.75196576, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11462402, + "step": 9212, + "time_per_iteration": 2.5141408443450928 + }, + { + "auxiliary_loss_clip": 0.06429373, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.0125554, + "epoch": 0.5539155268300014, + "flos": 31694323052160.0, + "grad_norm": 1.4365879651928444, + "language_loss": 0.5225575, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.59953463, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12792969, + "step": 9213, + "time_per_iteration": 2.5764448642730713 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06282363, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5539756500826695, + "flos": 15198934976640.0, + "grad_norm": 1.6892906357761146, + "language_loss": 0.85764515, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.93460202, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11303711, + "step": 9214, + "time_per_iteration": 2.5433578491210938 + }, + { + "auxiliary_loss_clip": 0.06422558, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01252333, + "epoch": 0.5540357733353375, + "flos": 26359262864640.0, + "grad_norm": 1.8961662277212366, + "language_loss": 0.70100081, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.77785456, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10491943, + "step": 9215, + "time_per_iteration": 2.548687696456909 + }, + { + "auxiliary_loss_clip": 0.06428155, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06279612, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5540958965880054, + "flos": 21329926698240.0, + "grad_norm": 1.6927060371572338, + "language_loss": 0.73713386, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.81407875, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1151123, + "step": 9216, + "time_per_iteration": 2.541210174560547 + }, + { + "auxiliary_loss_clip": 0.06421469, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01254471, + "epoch": 0.5541560198406734, + "flos": 25674020974080.0, + "grad_norm": 1.768513313341331, + "language_loss": 0.71651757, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.79338706, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11029053, + "step": 9217, + "time_per_iteration": 4.048692226409912 + }, + { + "auxiliary_loss_clip": 0.0642062, + "auxiliary_loss_mlp": 0.01262573, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01251993, + "epoch": 0.5542161430933413, + "flos": 21945246756480.0, + "grad_norm": 1.641855173543887, + "language_loss": 0.78896093, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.86579281, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10577393, + "step": 9218, + "time_per_iteration": 2.5090229511260986 + }, + { + "auxiliary_loss_clip": 0.06429659, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255023, + "epoch": 0.5542762663460093, + "flos": 19577256445440.0, + "grad_norm": 1.9145093316494244, + "language_loss": 0.72342837, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.80039406, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11889648, + "step": 9219, + "time_per_iteration": 2.6097207069396973 + }, + { + "auxiliary_loss_clip": 0.06423312, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06275792, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5543363895986773, + "flos": 19504944771840.0, + "grad_norm": 1.6265573389583097, + "language_loss": 0.7175796, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.79449117, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11035156, + "step": 9220, + "time_per_iteration": 3.953366756439209 + }, + { + "auxiliary_loss_clip": 0.0641966, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06276264, + "balance_loss_mlp": 0.01256154, + "epoch": 0.5543965128513453, + "flos": 22641808947840.0, + "grad_norm": 1.5837082117197903, + "language_loss": 0.79554594, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.8724097, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9221, + "time_per_iteration": 2.6012284755706787 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268367, + "balance_loss_clip": 0.06276818, + "balance_loss_mlp": 0.0125715, + "epoch": 0.5544566361040132, + "flos": 25996320904320.0, + "grad_norm": 1.7031606951897913, + "language_loss": 0.8378005, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.91468251, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11224365, + "step": 9222, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.06426205, + "auxiliary_loss_mlp": 0.01268401, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5545167593566812, + "flos": 28265235361920.0, + "grad_norm": 1.624171595552914, + "language_loss": 0.75644016, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.83338618, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1217041, + "step": 9223, + "time_per_iteration": 2.6189255714416504 + }, + { + "auxiliary_loss_clip": 0.06421085, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06275317, + "balance_loss_mlp": 0.01255168, + "epoch": 0.5545768826093491, + "flos": 28484266734720.0, + "grad_norm": 1.537609394832996, + "language_loss": 0.81879461, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.89567149, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11425781, + "step": 9224, + "time_per_iteration": 2.5794196128845215 + }, + { + "auxiliary_loss_clip": 0.06424309, + "auxiliary_loss_mlp": 0.01271127, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5546370058620171, + "flos": 18483860517120.0, + "grad_norm": 1.6794429489770297, + "language_loss": 0.57241935, + "learning_rate": 1.743855475904141e-06, + "loss": 0.64937371, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11688232, + "step": 9225, + "time_per_iteration": 3.9698383808135986 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.01267893, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01257009, + "epoch": 0.554697129114685, + "flos": 22937260844160.0, + "grad_norm": 1.5804786041677554, + "language_loss": 0.6778791, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.75478059, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10870361, + "step": 9226, + "time_per_iteration": 2.5307633876800537 + }, + { + "auxiliary_loss_clip": 0.06423603, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256002, + "epoch": 0.5547572523673531, + "flos": 21803348666880.0, + "grad_norm": 1.2977635143377364, + "language_loss": 0.74954712, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.82645351, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11047363, + "step": 9227, + "time_per_iteration": 2.5083706378936768 + }, + { + "auxiliary_loss_clip": 0.06423934, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06275739, + "balance_loss_mlp": 0.01254768, + "epoch": 0.5548173756200211, + "flos": 22348830746880.0, + "grad_norm": 1.524887798675916, + "language_loss": 0.73794919, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.81485081, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11444092, + "step": 9228, + "time_per_iteration": 2.555020809173584 + }, + { + "auxiliary_loss_clip": 0.06423147, + "auxiliary_loss_mlp": 0.01263866, + "balance_loss_clip": 0.06276013, + "balance_loss_mlp": 0.01253465, + "epoch": 0.554877498872689, + "flos": 17864599317120.0, + "grad_norm": 1.7043498128680434, + "language_loss": 0.76352561, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.84039581, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10400391, + "step": 9229, + "time_per_iteration": 2.4959444999694824 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01266918, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125464, + "epoch": 0.554937622125357, + "flos": 17244080305920.0, + "grad_norm": 1.4897541866361217, + "language_loss": 0.69068646, + "learning_rate": 1.741924325613172e-06, + "loss": 0.76758856, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12280273, + "step": 9230, + "time_per_iteration": 2.5090713500976562 + }, + { + "auxiliary_loss_clip": 0.06427252, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06276985, + "balance_loss_mlp": 0.01254587, + "epoch": 0.5549977453780249, + "flos": 25374082884480.0, + "grad_norm": 2.3665837136773047, + "language_loss": 0.68808627, + "learning_rate": 1.741538124855163e-06, + "loss": 0.76503003, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12554932, + "step": 9231, + "time_per_iteration": 2.5350747108459473 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01269438, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01256885, + "epoch": 0.555057868630693, + "flos": 25085548949760.0, + "grad_norm": 1.6698826084601515, + "language_loss": 0.78408533, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.86107397, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12542725, + "step": 9232, + "time_per_iteration": 4.055214881896973 + }, + { + "auxiliary_loss_clip": 0.06416719, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01255972, + "epoch": 0.5551179918833609, + "flos": 26111412887040.0, + "grad_norm": 1.627879634610194, + "language_loss": 0.83063745, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.90747154, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10723877, + "step": 9233, + "time_per_iteration": 2.6376969814300537 + }, + { + "auxiliary_loss_clip": 0.06430396, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06277359, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5551781151360289, + "flos": 19389810862080.0, + "grad_norm": 2.483522309942904, + "language_loss": 0.7549684, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.83193588, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11431885, + "step": 9234, + "time_per_iteration": 2.4859883785247803 + }, + { + "auxiliary_loss_clip": 0.06418739, + "auxiliary_loss_mlp": 0.01265554, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255129, + "epoch": 0.5552382383886968, + "flos": 21732420585600.0, + "grad_norm": 1.8065340969909298, + "language_loss": 0.64963275, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.72647566, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.10418701, + "step": 9235, + "time_per_iteration": 2.523128032684326 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06272598, + "balance_loss_mlp": 0.0125519, + "epoch": 0.5552983616413648, + "flos": 14361480944640.0, + "grad_norm": 1.6397834212981734, + "language_loss": 0.68087149, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.75775141, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11932373, + "step": 9236, + "time_per_iteration": 2.506023406982422 + }, + { + "auxiliary_loss_clip": 0.06416081, + "auxiliary_loss_mlp": 0.01266517, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01256068, + "epoch": 0.5553584848940327, + "flos": 25484730600960.0, + "grad_norm": 1.5459271274239896, + "language_loss": 0.86436939, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.94119537, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10449219, + "step": 9237, + "time_per_iteration": 2.580103874206543 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06273238, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5554186081467007, + "flos": 22170399477120.0, + "grad_norm": 1.8042242059193758, + "language_loss": 0.73774469, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.81458282, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11517334, + "step": 9238, + "time_per_iteration": 2.5031590461730957 + }, + { + "auxiliary_loss_clip": 0.0642554, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06275032, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5554787313993687, + "flos": 49757744908800.0, + "grad_norm": 1.5320503148177431, + "language_loss": 0.78384852, + "learning_rate": 1.73844887285358e-06, + "loss": 0.86077076, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10766602, + "step": 9239, + "time_per_iteration": 2.7739756107330322 + }, + { + "auxiliary_loss_clip": 0.06423195, + "auxiliary_loss_mlp": 0.01266863, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5555388546520367, + "flos": 22133908224000.0, + "grad_norm": 1.4777059666754715, + "language_loss": 0.80562818, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.88252878, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11401367, + "step": 9240, + "time_per_iteration": 2.5036380290985107 + }, + { + "auxiliary_loss_clip": 0.06419357, + "auxiliary_loss_mlp": 0.01266651, + "balance_loss_clip": 0.06273453, + "balance_loss_mlp": 0.01255142, + "epoch": 0.5555989779047047, + "flos": 24689218337280.0, + "grad_norm": 1.7126628457644222, + "language_loss": 0.65465248, + "learning_rate": 1.737676658740786e-06, + "loss": 0.73151255, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1151123, + "step": 9241, + "time_per_iteration": 2.5851833820343018 + }, + { + "auxiliary_loss_clip": 0.06422672, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06276439, + "balance_loss_mlp": 0.01252566, + "epoch": 0.5556591011573726, + "flos": 16111929064320.0, + "grad_norm": 1.8766289396676605, + "language_loss": 0.73123193, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.80809897, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11474609, + "step": 9242, + "time_per_iteration": 2.467933416366577 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01253022, + "epoch": 0.5557192244100406, + "flos": 12938825197440.0, + "grad_norm": 6.974019127266796, + "language_loss": 0.64053857, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.71743226, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12365723, + "step": 9243, + "time_per_iteration": 2.528529167175293 + }, + { + "auxiliary_loss_clip": 0.0642553, + "auxiliary_loss_mlp": 0.01269814, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.01258614, + "epoch": 0.5557793476627085, + "flos": 23118291590400.0, + "grad_norm": 3.1703508621435095, + "language_loss": 0.75212169, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.82907516, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11199951, + "step": 9244, + "time_per_iteration": 2.5159640312194824 + }, + { + "auxiliary_loss_clip": 0.06417421, + "auxiliary_loss_mlp": 0.01263368, + "balance_loss_clip": 0.06277108, + "balance_loss_mlp": 0.01252938, + "epoch": 0.5558394709153766, + "flos": 21433446817920.0, + "grad_norm": 2.161992759062338, + "language_loss": 0.74536991, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.82217783, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10437012, + "step": 9245, + "time_per_iteration": 2.5320873260498047 + }, + { + "auxiliary_loss_clip": 0.06425805, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255533, + "epoch": 0.5558995941680445, + "flos": 25084626554880.0, + "grad_norm": 2.1186554191459575, + "language_loss": 0.79345202, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.87039083, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12530518, + "step": 9246, + "time_per_iteration": 2.5617494583129883 + }, + { + "auxiliary_loss_clip": 0.06425521, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276709, + "balance_loss_mlp": 0.01258993, + "epoch": 0.5559597174207125, + "flos": 20017331688960.0, + "grad_norm": 1.8080775090170724, + "language_loss": 0.7423467, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.81930989, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11810303, + "step": 9247, + "time_per_iteration": 2.5472562313079834 + }, + { + "auxiliary_loss_clip": 0.06421669, + "auxiliary_loss_mlp": 0.01265666, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01254007, + "epoch": 0.5560198406733804, + "flos": 16841125221120.0, + "grad_norm": 2.9360607038713127, + "language_loss": 0.75686443, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.83373785, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11645508, + "step": 9248, + "time_per_iteration": 2.4991230964660645 + }, + { + "auxiliary_loss_clip": 0.06332292, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01250564, + "epoch": 0.5560799639260484, + "flos": 70719012840960.0, + "grad_norm": 0.8521249277155936, + "language_loss": 0.5948171, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.67066324, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01763916, + "step": 9249, + "time_per_iteration": 3.2450287342071533 + }, + { + "auxiliary_loss_clip": 0.06424973, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06276406, + "balance_loss_mlp": 0.01253943, + "epoch": 0.5561400871787163, + "flos": 23155244040960.0, + "grad_norm": 2.0335955894649036, + "language_loss": 0.79889202, + "learning_rate": 1.734202189316832e-06, + "loss": 0.87578869, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10748291, + "step": 9250, + "time_per_iteration": 2.5372138023376465 + }, + { + "auxiliary_loss_clip": 0.06427802, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06277002, + "balance_loss_mlp": 0.01257471, + "epoch": 0.5562002104313843, + "flos": 17572166167680.0, + "grad_norm": 3.4851408255327856, + "language_loss": 0.69400316, + "learning_rate": 1.733816187358836e-06, + "loss": 0.77097189, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11584473, + "step": 9251, + "time_per_iteration": 2.554487943649292 + }, + { + "auxiliary_loss_clip": 0.06422772, + "auxiliary_loss_mlp": 0.01265424, + "balance_loss_clip": 0.06275512, + "balance_loss_mlp": 0.01253676, + "epoch": 0.5562603336840523, + "flos": 25052328005760.0, + "grad_norm": 1.4438817767967254, + "language_loss": 0.75297302, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.82985497, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 9252, + "time_per_iteration": 2.554103374481201 + }, + { + "auxiliary_loss_clip": 0.06427599, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5563204569367203, + "flos": 29066617411200.0, + "grad_norm": 1.5076691298158018, + "language_loss": 0.72903025, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.80595708, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10980225, + "step": 9253, + "time_per_iteration": 2.5654473304748535 + }, + { + "auxiliary_loss_clip": 0.06422551, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01259161, + "epoch": 0.5563805801893883, + "flos": 22096913846400.0, + "grad_norm": 1.9717474280435598, + "language_loss": 0.83141911, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.90834075, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10455322, + "step": 9254, + "time_per_iteration": 2.5113630294799805 + }, + { + "auxiliary_loss_clip": 0.06332405, + "auxiliary_loss_mlp": 0.01255231, + "balance_loss_clip": 0.06269685, + "balance_loss_mlp": 0.01253453, + "epoch": 0.5564407034420562, + "flos": 58652623555200.0, + "grad_norm": 0.8548643960281289, + "language_loss": 0.64887053, + "learning_rate": 1.732272280610387e-06, + "loss": 0.72474694, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01777649, + "step": 9255, + "time_per_iteration": 2.980931043624878 + }, + { + "auxiliary_loss_clip": 0.06420524, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275329, + "balance_loss_mlp": 0.01254175, + "epoch": 0.5565008266947242, + "flos": 23119004350080.0, + "grad_norm": 1.731717948076331, + "language_loss": 0.69607276, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.77292871, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10900879, + "step": 9256, + "time_per_iteration": 3.9532642364501953 + }, + { + "auxiliary_loss_clip": 0.06418847, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01254551, + "epoch": 0.5565609499473921, + "flos": 21584568856320.0, + "grad_norm": 1.4749881970234011, + "language_loss": 0.76680368, + "learning_rate": 1.73150038809119e-06, + "loss": 0.84364206, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10443115, + "step": 9257, + "time_per_iteration": 2.4937705993652344 + }, + { + "auxiliary_loss_clip": 0.06425476, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.0627654, + "balance_loss_mlp": 0.01262735, + "epoch": 0.5566210732000602, + "flos": 18375602641920.0, + "grad_norm": 2.7130999997532563, + "language_loss": 0.61334699, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.69033802, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10894775, + "step": 9258, + "time_per_iteration": 2.5560710430145264 + }, + { + "auxiliary_loss_clip": 0.06420255, + "auxiliary_loss_mlp": 0.01266708, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01255431, + "epoch": 0.5566811964527281, + "flos": 25710554154240.0, + "grad_norm": 1.5983859944569927, + "language_loss": 0.79631943, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.87318903, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11279297, + "step": 9259, + "time_per_iteration": 2.582550525665283 + }, + { + "auxiliary_loss_clip": 0.06421982, + "auxiliary_loss_mlp": 0.01267837, + "balance_loss_clip": 0.06275143, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5567413197053961, + "flos": 26951424468480.0, + "grad_norm": 1.7768491917262519, + "language_loss": 0.81632483, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.89322305, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10821533, + "step": 9260, + "time_per_iteration": 3.994185209274292 + }, + { + "auxiliary_loss_clip": 0.0642475, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257598, + "epoch": 0.556801442958064, + "flos": 20856965927040.0, + "grad_norm": 1.6577209620324271, + "language_loss": 0.69569898, + "learning_rate": 1.729956725348256e-06, + "loss": 0.77264518, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1227417, + "step": 9261, + "time_per_iteration": 2.558511734008789 + }, + { + "auxiliary_loss_clip": 0.06317247, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06255186, + "balance_loss_mlp": 0.01252651, + "epoch": 0.556861566210732, + "flos": 70517395918080.0, + "grad_norm": 0.7170849600938061, + "language_loss": 0.61090672, + "learning_rate": 1.729570835226108e-06, + "loss": 0.68662429, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01856995, + "step": 9262, + "time_per_iteration": 3.134216070175171 + }, + { + "auxiliary_loss_clip": 0.06422806, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01259214, + "epoch": 0.5569216894633999, + "flos": 25344216103680.0, + "grad_norm": 1.5027402480240113, + "language_loss": 0.64822662, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.72516024, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11340332, + "step": 9263, + "time_per_iteration": 2.5533127784729004 + }, + { + "auxiliary_loss_clip": 0.06420417, + "auxiliary_loss_mlp": 0.01271706, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01260679, + "epoch": 0.556981812716068, + "flos": 22645456600320.0, + "grad_norm": 1.647856593864945, + "language_loss": 0.73077464, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.80769587, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11035156, + "step": 9264, + "time_per_iteration": 2.5055153369903564 + }, + { + "auxiliary_loss_clip": 0.06421056, + "auxiliary_loss_mlp": 0.01267322, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01255765, + "epoch": 0.5570419359687359, + "flos": 11040567275520.0, + "grad_norm": 1.7723772076526776, + "language_loss": 0.7667138, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.84359753, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11553955, + "step": 9265, + "time_per_iteration": 3.964038372039795 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01273186, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01262368, + "epoch": 0.5571020592214039, + "flos": 22830218853120.0, + "grad_norm": 1.7025735740351078, + "language_loss": 0.71389985, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.79079872, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1081543, + "step": 9266, + "time_per_iteration": 2.5572071075439453 + }, + { + "auxiliary_loss_clip": 0.06418756, + "auxiliary_loss_mlp": 0.01270352, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01259701, + "epoch": 0.5571621824740719, + "flos": 22934074389120.0, + "grad_norm": 1.5846567867344512, + "language_loss": 0.68614411, + "learning_rate": 1.727641538728533e-06, + "loss": 0.76303518, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10656738, + "step": 9267, + "time_per_iteration": 2.4949660301208496 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277707, + "balance_loss_mlp": 0.01255677, + "epoch": 0.5572223057267398, + "flos": 22973416680960.0, + "grad_norm": 2.0664301257613684, + "language_loss": 0.75132561, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.82818741, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11315918, + "step": 9268, + "time_per_iteration": 2.5834717750549316 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01252184, + "epoch": 0.5572824289794078, + "flos": 20966439686400.0, + "grad_norm": 2.076388090189787, + "language_loss": 0.75247812, + "learning_rate": 1.726869892322104e-06, + "loss": 0.8293134, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10803223, + "step": 9269, + "time_per_iteration": 2.6340525150299072 + }, + { + "auxiliary_loss_clip": 0.06420279, + "auxiliary_loss_mlp": 0.01268076, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5573425522320757, + "flos": 25048806134400.0, + "grad_norm": 1.9328220368280318, + "language_loss": 0.82704222, + "learning_rate": 1.726484084647256e-06, + "loss": 0.90392578, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10986328, + "step": 9270, + "time_per_iteration": 2.6455605030059814 + }, + { + "auxiliary_loss_clip": 0.06426194, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01255657, + "epoch": 0.5574026754847438, + "flos": 23666415073920.0, + "grad_norm": 1.8553396052443616, + "language_loss": 0.79884106, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.87577355, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1138916, + "step": 9271, + "time_per_iteration": 4.060855388641357 + }, + { + "auxiliary_loss_clip": 0.0642622, + "auxiliary_loss_mlp": 0.01265728, + "balance_loss_clip": 0.0627868, + "balance_loss_mlp": 0.01254153, + "epoch": 0.5574627987374117, + "flos": 24787791066240.0, + "grad_norm": 1.7644146130703546, + "language_loss": 0.90646034, + "learning_rate": 1.725712500427442e-06, + "loss": 0.9833799, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11572266, + "step": 9272, + "time_per_iteration": 2.534665107727051 + }, + { + "auxiliary_loss_clip": 0.0641982, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.0125446, + "epoch": 0.5575229219900797, + "flos": 21841349293440.0, + "grad_norm": 1.8989818213493146, + "language_loss": 0.84368634, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.92053914, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10992432, + "step": 9273, + "time_per_iteration": 2.5200788974761963 + }, + { + "auxiliary_loss_clip": 0.06423581, + "auxiliary_loss_mlp": 0.01268606, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5575830452427476, + "flos": 27821973663360.0, + "grad_norm": 1.9193499092419828, + "language_loss": 0.75017828, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.82710016, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12548828, + "step": 9274, + "time_per_iteration": 2.548865795135498 + }, + { + "auxiliary_loss_clip": 0.06435296, + "auxiliary_loss_mlp": 0.01273341, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01260806, + "epoch": 0.5576431684954156, + "flos": 17817081252480.0, + "grad_norm": 2.8160029917848397, + "language_loss": 0.78999293, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.86707926, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12530518, + "step": 9275, + "time_per_iteration": 2.503168821334839 + }, + { + "auxiliary_loss_clip": 0.06426495, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06279385, + "balance_loss_mlp": 0.01253372, + "epoch": 0.5577032917480835, + "flos": 15492290520960.0, + "grad_norm": 1.5722489245589244, + "language_loss": 0.75639874, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.83331323, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11584473, + "step": 9276, + "time_per_iteration": 2.466275215148926 + }, + { + "auxiliary_loss_clip": 0.06423229, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06277048, + "balance_loss_mlp": 0.01256379, + "epoch": 0.5577634150007516, + "flos": 21586162083840.0, + "grad_norm": 1.8200099839217898, + "language_loss": 0.75387412, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.83078039, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11022949, + "step": 9277, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.06420221, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01255535, + "epoch": 0.5578235382534195, + "flos": 21145709496960.0, + "grad_norm": 1.5944068660293211, + "language_loss": 0.7198559, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.79672027, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10681152, + "step": 9278, + "time_per_iteration": 2.4954776763916016 + }, + { + "auxiliary_loss_clip": 0.06425839, + "auxiliary_loss_mlp": 0.01267939, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01255166, + "epoch": 0.5578836615060875, + "flos": 26512397400960.0, + "grad_norm": 1.4623548994871365, + "language_loss": 0.75693482, + "learning_rate": 1.723012284057868e-06, + "loss": 0.83387262, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12786865, + "step": 9279, + "time_per_iteration": 2.5537941455841064 + }, + { + "auxiliary_loss_clip": 0.06422286, + "auxiliary_loss_mlp": 0.01267149, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255354, + "epoch": 0.5579437847587555, + "flos": 20159439413760.0, + "grad_norm": 1.637545301877737, + "language_loss": 0.67443848, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.75133282, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11791992, + "step": 9280, + "time_per_iteration": 2.489867925643921 + }, + { + "auxiliary_loss_clip": 0.06426547, + "auxiliary_loss_mlp": 0.01266943, + "balance_loss_clip": 0.06276332, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5580039080114234, + "flos": 26109148826880.0, + "grad_norm": 1.5394249927656036, + "language_loss": 0.7336756, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.81061053, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11090088, + "step": 9281, + "time_per_iteration": 2.693004846572876 + }, + { + "auxiliary_loss_clip": 0.06420805, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5580640312640914, + "flos": 13776740426880.0, + "grad_norm": 2.347269898773066, + "language_loss": 0.75313729, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.83000439, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10662842, + "step": 9282, + "time_per_iteration": 2.472775936126709 + }, + { + "auxiliary_loss_clip": 0.06421494, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01258871, + "epoch": 0.5581241545167593, + "flos": 17681765708160.0, + "grad_norm": 1.6208158464679243, + "language_loss": 0.66451746, + "learning_rate": 1.721469534028297e-06, + "loss": 0.74143445, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11334229, + "step": 9283, + "time_per_iteration": 2.495039224624634 + }, + { + "auxiliary_loss_clip": 0.06423882, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01257489, + "epoch": 0.5581842777694274, + "flos": 19574573114880.0, + "grad_norm": 1.8440828180500004, + "language_loss": 0.83265072, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.90957028, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10583496, + "step": 9284, + "time_per_iteration": 2.479743719100952 + }, + { + "auxiliary_loss_clip": 0.06423684, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01255412, + "epoch": 0.5582444010220953, + "flos": 20601485228160.0, + "grad_norm": 2.4189186360573407, + "language_loss": 0.86142218, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.93832451, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11132812, + "step": 9285, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.06422924, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01255818, + "epoch": 0.5583045242747633, + "flos": 19141541614080.0, + "grad_norm": 2.3862114712175013, + "language_loss": 0.74476177, + "learning_rate": 1.720312582354912e-06, + "loss": 0.82165694, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10772705, + "step": 9286, + "time_per_iteration": 2.502807378768921 + }, + { + "auxiliary_loss_clip": 0.06421416, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276793, + "balance_loss_mlp": 0.01256448, + "epoch": 0.5583646475274312, + "flos": 27462050449920.0, + "grad_norm": 1.681368685974995, + "language_loss": 0.74959427, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.82648808, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11529541, + "step": 9287, + "time_per_iteration": 2.5700645446777344 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06279745, + "balance_loss_mlp": 0.01254601, + "epoch": 0.5584247707800992, + "flos": 23659580966400.0, + "grad_norm": 1.4753035778898818, + "language_loss": 0.75157738, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.82854563, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12036133, + "step": 9288, + "time_per_iteration": 2.529250383377075 + }, + { + "auxiliary_loss_clip": 0.06424332, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01254847, + "epoch": 0.5584848940327671, + "flos": 13703967555840.0, + "grad_norm": 2.2558701039351696, + "language_loss": 0.78180242, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.85871768, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12329102, + "step": 9289, + "time_per_iteration": 2.5093841552734375 + }, + { + "auxiliary_loss_clip": 0.06428449, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01255921, + "epoch": 0.5585450172854352, + "flos": 27023526506880.0, + "grad_norm": 1.7277790144481269, + "language_loss": 0.61688149, + "learning_rate": 1.718770128672817e-06, + "loss": 0.69384426, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11914062, + "step": 9290, + "time_per_iteration": 2.5534214973449707 + }, + { + "auxiliary_loss_clip": 0.0642647, + "auxiliary_loss_mlp": 0.01268365, + "balance_loss_clip": 0.06277582, + "balance_loss_mlp": 0.01256581, + "epoch": 0.5586051405381031, + "flos": 23192406126720.0, + "grad_norm": 2.1760973422208965, + "language_loss": 0.67914414, + "learning_rate": 1.7183845418764e-06, + "loss": 0.75609255, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11767578, + "step": 9291, + "time_per_iteration": 2.5376763343811035 + }, + { + "auxiliary_loss_clip": 0.0642361, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5586652637907711, + "flos": 20781551652480.0, + "grad_norm": 1.760966459417108, + "language_loss": 0.84366935, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.92057884, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11578369, + "step": 9292, + "time_per_iteration": 2.5204405784606934 + }, + { + "auxiliary_loss_clip": 0.06422292, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01257848, + "epoch": 0.5587253870434391, + "flos": 28227360516480.0, + "grad_norm": 1.8754942991534513, + "language_loss": 0.7459076, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.82281709, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10821533, + "step": 9293, + "time_per_iteration": 2.6592154502868652 + }, + { + "auxiliary_loss_clip": 0.06418014, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01254809, + "epoch": 0.558785510296107, + "flos": 26623128971520.0, + "grad_norm": 1.7285534178917525, + "language_loss": 0.72416651, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.80100018, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10546875, + "step": 9294, + "time_per_iteration": 2.538320779800415 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276325, + "balance_loss_mlp": 0.01257208, + "epoch": 0.558845633548775, + "flos": 20162919358080.0, + "grad_norm": 2.7937117268116656, + "language_loss": 0.69210899, + "learning_rate": 1.716842301625806e-06, + "loss": 0.76900959, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.109375, + "step": 9295, + "time_per_iteration": 2.5218520164489746 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.0126519, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01253776, + "epoch": 0.5589057568014429, + "flos": 24357317114880.0, + "grad_norm": 1.5440712557728564, + "language_loss": 0.80893242, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.88577229, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11419678, + "step": 9296, + "time_per_iteration": 3.9467618465423584 + }, + { + "auxiliary_loss_clip": 0.06419219, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01255019, + "epoch": 0.558965880054111, + "flos": 21111440376960.0, + "grad_norm": 1.9869508208087105, + "language_loss": 0.65690488, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.73375666, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9297, + "time_per_iteration": 2.528181791305542 + }, + { + "auxiliary_loss_clip": 0.06424123, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5590260033067789, + "flos": 18440954426880.0, + "grad_norm": 1.490575561372924, + "language_loss": 0.75263643, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.82955098, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12054443, + "step": 9298, + "time_per_iteration": 2.5208308696746826 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01252986, + "balance_loss_clip": 0.06249566, + "balance_loss_mlp": 0.01251184, + "epoch": 0.5590861265594469, + "flos": 70597673729280.0, + "grad_norm": 0.6945904868111653, + "language_loss": 0.52248931, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.59813559, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.62158203, + "router_z_loss_mlp": 0.01797485, + "step": 9299, + "time_per_iteration": 4.702880144119263 + }, + { + "auxiliary_loss_clip": 0.06418106, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06276019, + "balance_loss_mlp": 0.01256905, + "epoch": 0.5591462498121148, + "flos": 30672274475520.0, + "grad_norm": 1.7758709427362191, + "language_loss": 0.68987107, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.76672685, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10571289, + "step": 9300, + "time_per_iteration": 2.6169886589050293 + }, + { + "auxiliary_loss_clip": 0.06428309, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.06278549, + "balance_loss_mlp": 0.01254727, + "epoch": 0.5592063730647828, + "flos": 18156319706880.0, + "grad_norm": 3.029569475440017, + "language_loss": 0.81908011, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.89602816, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11761475, + "step": 9301, + "time_per_iteration": 2.4880383014678955 + }, + { + "auxiliary_loss_clip": 0.06421784, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06274376, + "balance_loss_mlp": 0.01256101, + "epoch": 0.5592664963174507, + "flos": 24067148025600.0, + "grad_norm": 2.0495431587104216, + "language_loss": 0.67981839, + "learning_rate": 1.714143795138756e-06, + "loss": 0.75671041, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11315918, + "step": 9302, + "time_per_iteration": 2.5440263748168945 + }, + { + "auxiliary_loss_clip": 0.06427488, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0627801, + "balance_loss_mlp": 0.01254121, + "epoch": 0.5593266195701188, + "flos": 19833911101440.0, + "grad_norm": 1.543967288464222, + "language_loss": 0.70932961, + "learning_rate": 1.713758337453878e-06, + "loss": 0.78626627, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12042236, + "step": 9303, + "time_per_iteration": 2.52182674407959 + }, + { + "auxiliary_loss_clip": 0.06417537, + "auxiliary_loss_mlp": 0.01265621, + "balance_loss_clip": 0.06276484, + "balance_loss_mlp": 0.01255453, + "epoch": 0.5593867428227867, + "flos": 25307682923520.0, + "grad_norm": 1.5891501411536748, + "language_loss": 0.73189592, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.8087275, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10168457, + "step": 9304, + "time_per_iteration": 3.999878406524658 + }, + { + "auxiliary_loss_clip": 0.06421353, + "auxiliary_loss_mlp": 0.01266821, + "balance_loss_clip": 0.06276563, + "balance_loss_mlp": 0.01255693, + "epoch": 0.5594468660754547, + "flos": 12938028583680.0, + "grad_norm": 2.1417504305353563, + "language_loss": 0.78262866, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.85951042, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11132812, + "step": 9305, + "time_per_iteration": 2.5058751106262207 + }, + { + "auxiliary_loss_clip": 0.06419225, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06278518, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5595069893281227, + "flos": 19068768743040.0, + "grad_norm": 1.6214418695958237, + "language_loss": 0.69748855, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7743212, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10705566, + "step": 9306, + "time_per_iteration": 2.5216495990753174 + }, + { + "auxiliary_loss_clip": 0.06329086, + "auxiliary_loss_mlp": 0.01251264, + "balance_loss_clip": 0.06266434, + "balance_loss_mlp": 0.01249626, + "epoch": 0.5595671125807906, + "flos": 70291530437760.0, + "grad_norm": 0.8883282828550626, + "language_loss": 0.60321748, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.679021, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.0164032, + "step": 9307, + "time_per_iteration": 3.2440812587738037 + }, + { + "auxiliary_loss_clip": 0.06421244, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06278248, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5596272358334586, + "flos": 20671407060480.0, + "grad_norm": 1.5654652346016935, + "language_loss": 0.7418704, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.81875765, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10467529, + "step": 9308, + "time_per_iteration": 2.527722120285034 + }, + { + "auxiliary_loss_clip": 0.06423165, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06275736, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5596873590861265, + "flos": 25047170979840.0, + "grad_norm": 1.7977154981427412, + "language_loss": 0.70390081, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.78078693, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12072754, + "step": 9309, + "time_per_iteration": 2.5592753887176514 + }, + { + "auxiliary_loss_clip": 0.06425751, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06278521, + "balance_loss_mlp": 0.01255889, + "epoch": 0.5597474823387946, + "flos": 25965573655680.0, + "grad_norm": 1.826608872454741, + "language_loss": 0.7546587, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.83160329, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12835693, + "step": 9310, + "time_per_iteration": 2.5775809288024902 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.01266019, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125343, + "epoch": 0.5598076055914625, + "flos": 26184688882560.0, + "grad_norm": 2.287225356977705, + "language_loss": 0.70149207, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.77844125, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12579346, + "step": 9311, + "time_per_iteration": 3.9833383560180664 + }, + { + "auxiliary_loss_clip": 0.06422099, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06277782, + "balance_loss_mlp": 0.01254061, + "epoch": 0.5598677288441305, + "flos": 11660541235200.0, + "grad_norm": 2.2749325214124605, + "language_loss": 0.72917002, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.80604798, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11645508, + "step": 9312, + "time_per_iteration": 2.5323050022125244 + }, + { + "auxiliary_loss_clip": 0.06420854, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06276432, + "balance_loss_mlp": 0.01255772, + "epoch": 0.5599278520967984, + "flos": 22973290899840.0, + "grad_norm": 1.8427769518341257, + "language_loss": 0.89498973, + "learning_rate": 1.709904360003822e-06, + "loss": 0.97187102, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1151123, + "step": 9313, + "time_per_iteration": 2.5141191482543945 + }, + { + "auxiliary_loss_clip": 0.06423395, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5599879753494664, + "flos": 21222004239360.0, + "grad_norm": 1.3323867384007686, + "language_loss": 0.7802453, + "learning_rate": 1.709519022520204e-06, + "loss": 0.85715961, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11242676, + "step": 9314, + "time_per_iteration": 2.587451934814453 + }, + { + "auxiliary_loss_clip": 0.06420899, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06276683, + "balance_loss_mlp": 0.01254109, + "epoch": 0.5600480986021343, + "flos": 31911006510720.0, + "grad_norm": 1.5829567025911722, + "language_loss": 0.70587456, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.78273547, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11083984, + "step": 9315, + "time_per_iteration": 2.585667371749878 + }, + { + "auxiliary_loss_clip": 0.06425041, + "auxiliary_loss_mlp": 0.01268206, + "balance_loss_clip": 0.06275864, + "balance_loss_mlp": 0.01256571, + "epoch": 0.5601082218548024, + "flos": 28483679756160.0, + "grad_norm": 1.7585144874491871, + "language_loss": 0.67066777, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.7476002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11645508, + "step": 9316, + "time_per_iteration": 2.5536792278289795 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.06276462, + "balance_loss_mlp": 0.01253324, + "epoch": 0.5601683451074703, + "flos": 24103974695040.0, + "grad_norm": 1.9270955506174936, + "language_loss": 0.87415564, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.95101541, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11236572, + "step": 9317, + "time_per_iteration": 2.6297550201416016 + }, + { + "auxiliary_loss_clip": 0.06425779, + "auxiliary_loss_mlp": 0.01267741, + "balance_loss_clip": 0.06277692, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5602284683601383, + "flos": 26362868590080.0, + "grad_norm": 1.81541721599753, + "language_loss": 0.77282947, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.84976465, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1260376, + "step": 9318, + "time_per_iteration": 2.558359146118164 + }, + { + "auxiliary_loss_clip": 0.06418364, + "auxiliary_loss_mlp": 0.01266654, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5602885916128063, + "flos": 24502904784000.0, + "grad_norm": 1.570238706906967, + "language_loss": 0.76465648, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.84150666, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10357666, + "step": 9319, + "time_per_iteration": 2.526543617248535 + }, + { + "auxiliary_loss_clip": 0.06418289, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06273629, + "balance_loss_mlp": 0.01253427, + "epoch": 0.5603487148654742, + "flos": 27352450909440.0, + "grad_norm": 1.3333617188310043, + "language_loss": 0.85846102, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.93529117, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11309814, + "step": 9320, + "time_per_iteration": 2.5673651695251465 + }, + { + "auxiliary_loss_clip": 0.06334086, + "auxiliary_loss_mlp": 0.01252081, + "balance_loss_clip": 0.06272272, + "balance_loss_mlp": 0.01250187, + "epoch": 0.5604088381181422, + "flos": 54105555962880.0, + "grad_norm": 0.7541324814402665, + "language_loss": 0.52607638, + "learning_rate": 1.706821969374996e-06, + "loss": 0.60193801, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01890564, + "step": 9321, + "time_per_iteration": 2.977881908416748 + }, + { + "auxiliary_loss_clip": 0.06418586, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06276635, + "balance_loss_mlp": 0.01254208, + "epoch": 0.5604689613708101, + "flos": 22242878858880.0, + "grad_norm": 1.3667787345793438, + "language_loss": 0.7480129, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.82485151, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1105957, + "step": 9322, + "time_per_iteration": 2.532274007797241 + }, + { + "auxiliary_loss_clip": 0.06422681, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06276275, + "balance_loss_mlp": 0.01258842, + "epoch": 0.5605290846234782, + "flos": 35306370132480.0, + "grad_norm": 1.7253794934771503, + "language_loss": 0.73680359, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.81374425, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12542725, + "step": 9323, + "time_per_iteration": 2.6399970054626465 + }, + { + "auxiliary_loss_clip": 0.06425279, + "auxiliary_loss_mlp": 0.01266665, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254505, + "epoch": 0.5605892078761461, + "flos": 20268997027200.0, + "grad_norm": 1.5398366577575928, + "language_loss": 0.62584162, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.70276111, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12164307, + "step": 9324, + "time_per_iteration": 2.5179386138916016 + }, + { + "auxiliary_loss_clip": 0.06420085, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01255055, + "epoch": 0.5606493311288141, + "flos": 17313582867840.0, + "grad_norm": 2.467078298144656, + "language_loss": 0.88032669, + "learning_rate": 1.705281040409226e-06, + "loss": 0.95720887, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.13092041, + "step": 9325, + "time_per_iteration": 2.5009984970092773 + }, + { + "auxiliary_loss_clip": 0.06425651, + "auxiliary_loss_mlp": 0.01271739, + "balance_loss_clip": 0.0627806, + "balance_loss_mlp": 0.01259454, + "epoch": 0.560709454381482, + "flos": 21659438079360.0, + "grad_norm": 1.5802994463075606, + "language_loss": 0.74048662, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.81746054, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1229248, + "step": 9326, + "time_per_iteration": 2.53534197807312 + }, + { + "auxiliary_loss_clip": 0.06427591, + "auxiliary_loss_mlp": 0.0127498, + "balance_loss_clip": 0.06276761, + "balance_loss_mlp": 0.01262648, + "epoch": 0.56076957763415, + "flos": 20309639057280.0, + "grad_norm": 1.7151684776487535, + "language_loss": 0.79090071, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.86792642, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12329102, + "step": 9327, + "time_per_iteration": 2.505734920501709 + }, + { + "auxiliary_loss_clip": 0.06422938, + "auxiliary_loss_mlp": 0.01268373, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5608297008868179, + "flos": 25052873057280.0, + "grad_norm": 1.3540928387883675, + "language_loss": 0.7848016, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.86171472, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.12176514, + "step": 9328, + "time_per_iteration": 2.5479724407196045 + }, + { + "auxiliary_loss_clip": 0.06421052, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06277333, + "balance_loss_mlp": 0.01255023, + "epoch": 0.560889824139486, + "flos": 19873253393280.0, + "grad_norm": 1.4144017329991472, + "language_loss": 0.7383225, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.8151924, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10913086, + "step": 9329, + "time_per_iteration": 2.665193796157837 + }, + { + "auxiliary_loss_clip": 0.06430677, + "auxiliary_loss_mlp": 0.01265446, + "balance_loss_clip": 0.06278004, + "balance_loss_mlp": 0.01253269, + "epoch": 0.5609499473921539, + "flos": 22935961105920.0, + "grad_norm": 1.4811079467360542, + "language_loss": 0.83903289, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.91599417, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12176514, + "step": 9330, + "time_per_iteration": 2.574812650680542 + }, + { + "auxiliary_loss_clip": 0.06343255, + "auxiliary_loss_mlp": 0.01254504, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01252853, + "epoch": 0.5610100706448219, + "flos": 53054479146240.0, + "grad_norm": 0.7010589280292991, + "language_loss": 0.57785869, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.65383625, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01654053, + "step": 9331, + "time_per_iteration": 3.16204833984375 + }, + { + "auxiliary_loss_clip": 0.06429492, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06280065, + "balance_loss_mlp": 0.01254723, + "epoch": 0.5610701938974898, + "flos": 21841349293440.0, + "grad_norm": 1.62115536838187, + "language_loss": 0.81915009, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.89610904, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11682129, + "step": 9332, + "time_per_iteration": 2.503162145614624 + }, + { + "auxiliary_loss_clip": 0.06436246, + "auxiliary_loss_mlp": 0.01268376, + "balance_loss_clip": 0.06285603, + "balance_loss_mlp": 0.01255936, + "epoch": 0.5611303171501578, + "flos": 17462943970560.0, + "grad_norm": 2.4447262023658314, + "language_loss": 0.8238855, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.90093172, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12451172, + "step": 9333, + "time_per_iteration": 2.5434911251068115 + }, + { + "auxiliary_loss_clip": 0.06429712, + "auxiliary_loss_mlp": 0.01266007, + "balance_loss_clip": 0.062811, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5611904404028258, + "flos": 22644366497280.0, + "grad_norm": 1.7517485290647843, + "language_loss": 0.73036361, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.80732077, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11157227, + "step": 9334, + "time_per_iteration": 2.5099892616271973 + }, + { + "auxiliary_loss_clip": 0.06427494, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281948, + "balance_loss_mlp": 0.01259771, + "epoch": 0.5612505636554938, + "flos": 14321048549760.0, + "grad_norm": 1.6258746678295788, + "language_loss": 0.71251893, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.7895056, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11395264, + "step": 9335, + "time_per_iteration": 3.8910462856292725 + }, + { + "auxiliary_loss_clip": 0.06430685, + "auxiliary_loss_mlp": 0.01266094, + "balance_loss_clip": 0.06283418, + "balance_loss_mlp": 0.01254149, + "epoch": 0.5613106869081618, + "flos": 16513835973120.0, + "grad_norm": 1.6562270786725333, + "language_loss": 0.7703501, + "learning_rate": 1.701044410566205e-06, + "loss": 0.84731793, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11932373, + "step": 9336, + "time_per_iteration": 2.5473687648773193 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06282386, + "balance_loss_mlp": 0.0125489, + "epoch": 0.5613708101608297, + "flos": 24065009746560.0, + "grad_norm": 2.1630350478443625, + "language_loss": 0.64571506, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.72262907, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10766602, + "step": 9337, + "time_per_iteration": 2.5193097591400146 + }, + { + "auxiliary_loss_clip": 0.06341661, + "auxiliary_loss_mlp": 0.01252845, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01251057, + "epoch": 0.5614309334134977, + "flos": 64922284984320.0, + "grad_norm": 0.883081868959654, + "language_loss": 0.62614578, + "learning_rate": 1.700274261035102e-06, + "loss": 0.7020908, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.62353516, + "router_z_loss_mlp": 0.01785278, + "step": 9338, + "time_per_iteration": 3.115088939666748 + }, + { + "auxiliary_loss_clip": 0.06430536, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5614910566661656, + "flos": 32926975666560.0, + "grad_norm": 1.7643724476932883, + "language_loss": 0.66069186, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.73765635, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11474609, + "step": 9339, + "time_per_iteration": 4.156280040740967 + }, + { + "auxiliary_loss_clip": 0.06427112, + "auxiliary_loss_mlp": 0.01266835, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01254055, + "epoch": 0.5615511799188336, + "flos": 18594927504000.0, + "grad_norm": 1.6693116386089952, + "language_loss": 0.69893128, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.77587074, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.12774658, + "step": 9340, + "time_per_iteration": 2.4951670169830322 + }, + { + "auxiliary_loss_clip": 0.06425936, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06286716, + "balance_loss_mlp": 0.01257168, + "epoch": 0.5616113031715015, + "flos": 22826571200640.0, + "grad_norm": 1.554264314492227, + "language_loss": 0.77897537, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.85592192, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 9341, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06432091, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06284195, + "balance_loss_mlp": 0.01256776, + "epoch": 0.5616714264241696, + "flos": 22352184910080.0, + "grad_norm": 1.797407374183417, + "language_loss": 0.80132401, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.87833536, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12261963, + "step": 9342, + "time_per_iteration": 2.5441479682922363 + }, + { + "auxiliary_loss_clip": 0.06439396, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06290646, + "balance_loss_mlp": 0.01257325, + "epoch": 0.5617315496768375, + "flos": 18813875022720.0, + "grad_norm": 2.3951377685236346, + "language_loss": 0.75757158, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.83465594, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1171875, + "step": 9343, + "time_per_iteration": 2.552783489227295 + }, + { + "auxiliary_loss_clip": 0.06435137, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06290908, + "balance_loss_mlp": 0.0125656, + "epoch": 0.5617916729295055, + "flos": 18375225298560.0, + "grad_norm": 1.7365132961619254, + "language_loss": 0.69429743, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.77133292, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11853027, + "step": 9344, + "time_per_iteration": 3.940319061279297 + }, + { + "auxiliary_loss_clip": 0.06436205, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06290596, + "balance_loss_mlp": 0.01254048, + "epoch": 0.5618517961821734, + "flos": 28186844267520.0, + "grad_norm": 2.084209166838754, + "language_loss": 0.66667032, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.74368846, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11560059, + "step": 9345, + "time_per_iteration": 2.5695786476135254 + }, + { + "auxiliary_loss_clip": 0.06434141, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.01257683, + "epoch": 0.5619119194348414, + "flos": 15492290520960.0, + "grad_norm": 1.7418235878832828, + "language_loss": 0.88078266, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.9578141, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11328125, + "step": 9346, + "time_per_iteration": 2.470212697982788 + }, + { + "auxiliary_loss_clip": 0.06433322, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06289656, + "balance_loss_mlp": 0.01261257, + "epoch": 0.5619720426875094, + "flos": 29135700702720.0, + "grad_norm": 2.0124429779516335, + "language_loss": 0.5980221, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.67508924, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.12139893, + "step": 9347, + "time_per_iteration": 2.5825982093811035 + }, + { + "auxiliary_loss_clip": 0.06435403, + "auxiliary_loss_mlp": 0.01270938, + "balance_loss_clip": 0.06288013, + "balance_loss_mlp": 0.01258349, + "epoch": 0.5620321659401774, + "flos": 18009474226560.0, + "grad_norm": 2.2126455504112066, + "language_loss": 0.69822383, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.77528727, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12609863, + "step": 9348, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.0644159, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06289469, + "balance_loss_mlp": 0.01254037, + "epoch": 0.5620922891928454, + "flos": 20600730541440.0, + "grad_norm": 3.445873194626742, + "language_loss": 0.79441649, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.87149316, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12036133, + "step": 9349, + "time_per_iteration": 2.5519816875457764 + }, + { + "auxiliary_loss_clip": 0.06431362, + "auxiliary_loss_mlp": 0.01269513, + "balance_loss_clip": 0.06285249, + "balance_loss_mlp": 0.01257014, + "epoch": 0.5621524124455133, + "flos": 26294288423040.0, + "grad_norm": 2.015932955485816, + "language_loss": 0.67743355, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.75444239, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.12493896, + "step": 9350, + "time_per_iteration": 4.01330304145813 + }, + { + "auxiliary_loss_clip": 0.06434298, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5622125356981813, + "flos": 12755236901760.0, + "grad_norm": 2.011118504157059, + "language_loss": 0.78970456, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.86672854, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11834717, + "step": 9351, + "time_per_iteration": 2.502434015274048 + }, + { + "auxiliary_loss_clip": 0.06430681, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01252894, + "epoch": 0.5622726589508492, + "flos": 23812086597120.0, + "grad_norm": 1.4860121982116354, + "language_loss": 0.59339732, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.67035985, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 9352, + "time_per_iteration": 2.5574684143066406 + }, + { + "auxiliary_loss_clip": 0.06420172, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.062802, + "balance_loss_mlp": 0.01255041, + "epoch": 0.5623327822035172, + "flos": 24725248392960.0, + "grad_norm": 2.450009031651053, + "language_loss": 0.72177416, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.7986325, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 9353, + "time_per_iteration": 2.5429112911224365 + }, + { + "auxiliary_loss_clip": 0.06427602, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06280185, + "balance_loss_mlp": 0.01255207, + "epoch": 0.5623929054561851, + "flos": 14023081031040.0, + "grad_norm": 3.091375667054191, + "language_loss": 0.7687071, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.84564734, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 9354, + "time_per_iteration": 2.511843204498291 + }, + { + "auxiliary_loss_clip": 0.0643307, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.0628096, + "balance_loss_mlp": 0.01256672, + "epoch": 0.5624530287088532, + "flos": 20710707425280.0, + "grad_norm": 1.9243574999426976, + "language_loss": 0.72663665, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.80364901, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1149292, + "step": 9355, + "time_per_iteration": 2.5472323894500732 + }, + { + "auxiliary_loss_clip": 0.06422609, + "auxiliary_loss_mlp": 0.01264166, + "balance_loss_clip": 0.06276853, + "balance_loss_mlp": 0.01252638, + "epoch": 0.5625131519615211, + "flos": 21477401084160.0, + "grad_norm": 1.4661709593952188, + "language_loss": 0.73949313, + "learning_rate": 1.693344975084274e-06, + "loss": 0.81636083, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11535645, + "step": 9356, + "time_per_iteration": 2.5417375564575195 + }, + { + "auxiliary_loss_clip": 0.06421204, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.0627971, + "balance_loss_mlp": 0.01254043, + "epoch": 0.5625732752141891, + "flos": 18704023920000.0, + "grad_norm": 1.8811670281572186, + "language_loss": 0.83384252, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.9107098, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11480713, + "step": 9357, + "time_per_iteration": 2.4678521156311035 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06279635, + "balance_loss_mlp": 0.01255705, + "epoch": 0.562633398466857, + "flos": 16222492926720.0, + "grad_norm": 2.0645024289256293, + "language_loss": 0.7263062, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.80322981, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1060791, + "step": 9358, + "time_per_iteration": 2.5186126232147217 + }, + { + "auxiliary_loss_clip": 0.06416523, + "auxiliary_loss_mlp": 0.01266054, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.0125408, + "epoch": 0.562693521719525, + "flos": 22498485338880.0, + "grad_norm": 1.808809546066597, + "language_loss": 0.78313565, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.85996139, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11981201, + "step": 9359, + "time_per_iteration": 2.4950146675109863 + }, + { + "auxiliary_loss_clip": 0.06422278, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01254123, + "epoch": 0.562753644972193, + "flos": 25337088506880.0, + "grad_norm": 1.6393117198147682, + "language_loss": 0.70198202, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.77886516, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11920166, + "step": 9360, + "time_per_iteration": 2.5677337646484375 + }, + { + "auxiliary_loss_clip": 0.06333196, + "auxiliary_loss_mlp": 0.01259618, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01258209, + "epoch": 0.562813768224861, + "flos": 67410566231040.0, + "grad_norm": 0.7608015706194778, + "language_loss": 0.55599511, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.63192326, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.0140686, + "step": 9361, + "time_per_iteration": 3.047746419906616 + }, + { + "auxiliary_loss_clip": 0.06421309, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06278641, + "balance_loss_mlp": 0.01260271, + "epoch": 0.562873891477529, + "flos": 23337868014720.0, + "grad_norm": 1.4415772957289732, + "language_loss": 0.82031697, + "learning_rate": 1.691036046141018e-06, + "loss": 0.89723963, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10687256, + "step": 9362, + "time_per_iteration": 2.5085341930389404 + }, + { + "auxiliary_loss_clip": 0.06425183, + "auxiliary_loss_mlp": 0.01265052, + "balance_loss_clip": 0.06282046, + "balance_loss_mlp": 0.01254067, + "epoch": 0.5629340147301969, + "flos": 38482073475840.0, + "grad_norm": 1.5514506959778531, + "language_loss": 0.74991751, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.8268199, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10992432, + "step": 9363, + "time_per_iteration": 2.6483652591705322 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01257573, + "epoch": 0.5629941379828649, + "flos": 29249744509440.0, + "grad_norm": 1.527132274705304, + "language_loss": 0.82966727, + "learning_rate": 1.690266496731839e-06, + "loss": 0.90664279, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11962891, + "step": 9364, + "time_per_iteration": 2.585028648376465 + }, + { + "auxiliary_loss_clip": 0.06420554, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.01253207, + "epoch": 0.5630542612355328, + "flos": 19425882844800.0, + "grad_norm": 1.9441356766600106, + "language_loss": 0.65449685, + "learning_rate": 1.689881739637642e-06, + "loss": 0.7313447, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11022949, + "step": 9365, + "time_per_iteration": 2.5320210456848145 + }, + { + "auxiliary_loss_clip": 0.06432588, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06279749, + "balance_loss_mlp": 0.0125841, + "epoch": 0.5631143844882008, + "flos": 22271697463680.0, + "grad_norm": 2.4081978900655114, + "language_loss": 0.81779563, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.89482784, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12213135, + "step": 9366, + "time_per_iteration": 2.5602293014526367 + }, + { + "auxiliary_loss_clip": 0.06419416, + "auxiliary_loss_mlp": 0.01263434, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01253033, + "epoch": 0.5631745077408687, + "flos": 22971781526400.0, + "grad_norm": 1.4555155937951827, + "language_loss": 0.73903221, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.81586075, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10400391, + "step": 9367, + "time_per_iteration": 2.5222184658050537 + }, + { + "auxiliary_loss_clip": 0.0633425, + "auxiliary_loss_mlp": 0.01256933, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01255295, + "epoch": 0.5632346309935368, + "flos": 65101917409920.0, + "grad_norm": 0.6175920076853201, + "language_loss": 0.5334087, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.60932058, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.61669922, + "router_z_loss_mlp": 0.0164032, + "step": 9368, + "time_per_iteration": 3.3093104362487793 + }, + { + "auxiliary_loss_clip": 0.06421301, + "auxiliary_loss_mlp": 0.0127307, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01261757, + "epoch": 0.5632947542462047, + "flos": 23009572517760.0, + "grad_norm": 1.6075197920052449, + "language_loss": 0.69183493, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.76877862, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11315918, + "step": 9369, + "time_per_iteration": 2.5406625270843506 + }, + { + "auxiliary_loss_clip": 0.06420332, + "auxiliary_loss_mlp": 0.01269293, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01258105, + "epoch": 0.5633548774988727, + "flos": 30490530969600.0, + "grad_norm": 1.6779781841725052, + "language_loss": 0.76048809, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.83738434, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11175537, + "step": 9370, + "time_per_iteration": 2.591212272644043 + }, + { + "auxiliary_loss_clip": 0.06424968, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5634150007515406, + "flos": 18520938748800.0, + "grad_norm": 1.8374331787518619, + "language_loss": 0.76029092, + "learning_rate": 1.687573444537108e-06, + "loss": 0.83720207, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.12237549, + "step": 9371, + "time_per_iteration": 2.5327818393707275 + }, + { + "auxiliary_loss_clip": 0.06421979, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06277596, + "balance_loss_mlp": 0.01256739, + "epoch": 0.5634751240042086, + "flos": 19250679957120.0, + "grad_norm": 1.7360135917661768, + "language_loss": 0.762514, + "learning_rate": 1.687188770067285e-06, + "loss": 0.83941567, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11456299, + "step": 9372, + "time_per_iteration": 2.519404411315918 + }, + { + "auxiliary_loss_clip": 0.06422761, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255016, + "epoch": 0.5635352472568766, + "flos": 12025453766400.0, + "grad_norm": 1.884768041604824, + "language_loss": 0.71853095, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.79542208, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11334229, + "step": 9373, + "time_per_iteration": 2.5053837299346924 + }, + { + "auxiliary_loss_clip": 0.06422034, + "auxiliary_loss_mlp": 0.01268801, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01256367, + "epoch": 0.5635953705095446, + "flos": 21878092108800.0, + "grad_norm": 1.841933865019323, + "language_loss": 0.83263683, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.90954518, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12438965, + "step": 9374, + "time_per_iteration": 3.904900074005127 + }, + { + "auxiliary_loss_clip": 0.06420377, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06277412, + "balance_loss_mlp": 0.01256131, + "epoch": 0.5636554937622126, + "flos": 27133587244800.0, + "grad_norm": 2.5670866003984583, + "language_loss": 0.66696084, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.74383336, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10754395, + "step": 9375, + "time_per_iteration": 2.581921339035034 + }, + { + "auxiliary_loss_clip": 0.06426428, + "auxiliary_loss_mlp": 0.01265809, + "balance_loss_clip": 0.06279501, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5637156170148805, + "flos": 12930314008320.0, + "grad_norm": 12.279905367602915, + "language_loss": 0.81403673, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.89095908, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11474609, + "step": 9376, + "time_per_iteration": 2.5271008014678955 + }, + { + "auxiliary_loss_clip": 0.06430367, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06280433, + "balance_loss_mlp": 0.01253974, + "epoch": 0.5637757402675485, + "flos": 45561460435200.0, + "grad_norm": 1.3765625381603785, + "language_loss": 0.69569075, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.77264911, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1149292, + "step": 9377, + "time_per_iteration": 2.7878713607788086 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06278635, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5638358635202164, + "flos": 20892241296000.0, + "grad_norm": 1.4815499035204616, + "language_loss": 0.75006419, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.82690734, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10668945, + "step": 9378, + "time_per_iteration": 2.5742552280426025 + }, + { + "auxiliary_loss_clip": 0.06432593, + "auxiliary_loss_mlp": 0.01271419, + "balance_loss_clip": 0.06279133, + "balance_loss_mlp": 0.01258837, + "epoch": 0.5638959867728844, + "flos": 18812449503360.0, + "grad_norm": 2.3058329321149555, + "language_loss": 0.81874716, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.8957873, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12585449, + "step": 9379, + "time_per_iteration": 3.9022350311279297 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.01256933, + "epoch": 0.5639561100255523, + "flos": 27497703162240.0, + "grad_norm": 1.9515300720121755, + "language_loss": 0.71783185, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.79480064, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11859131, + "step": 9380, + "time_per_iteration": 2.6338086128234863 + }, + { + "auxiliary_loss_clip": 0.0642691, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06279925, + "balance_loss_mlp": 0.01254857, + "epoch": 0.5640162332782204, + "flos": 18082289024640.0, + "grad_norm": 2.0751114915079687, + "language_loss": 0.75207865, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.82901412, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11779785, + "step": 9381, + "time_per_iteration": 2.4637959003448486 + }, + { + "auxiliary_loss_clip": 0.06430316, + "auxiliary_loss_mlp": 0.01273879, + "balance_loss_clip": 0.06282466, + "balance_loss_mlp": 0.01262822, + "epoch": 0.5640763565308883, + "flos": 20890857703680.0, + "grad_norm": 2.2840815632275846, + "language_loss": 0.72823429, + "learning_rate": 1.683342680176499e-06, + "loss": 0.80527627, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 9382, + "time_per_iteration": 2.6038217544555664 + }, + { + "auxiliary_loss_clip": 0.0632898, + "auxiliary_loss_mlp": 0.01252773, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01251134, + "epoch": 0.5641364797835563, + "flos": 64467143205120.0, + "grad_norm": 0.7593633930380659, + "language_loss": 0.54457784, + "learning_rate": 1.682958136989022e-06, + "loss": 0.62039542, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01641846, + "step": 9383, + "time_per_iteration": 4.702574253082275 + }, + { + "auxiliary_loss_clip": 0.06430694, + "auxiliary_loss_mlp": 0.01271925, + "balance_loss_clip": 0.06278884, + "balance_loss_mlp": 0.01260129, + "epoch": 0.5641966030362242, + "flos": 18666861834240.0, + "grad_norm": 1.6723183303987958, + "language_loss": 0.71441197, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.79143822, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11798096, + "step": 9384, + "time_per_iteration": 2.4753105640411377 + }, + { + "auxiliary_loss_clip": 0.06421386, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.0627472, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5642567262888922, + "flos": 22498946536320.0, + "grad_norm": 1.9187169203117838, + "language_loss": 0.76415217, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.84103185, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1194458, + "step": 9385, + "time_per_iteration": 2.5245208740234375 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01255028, + "epoch": 0.5643168495415603, + "flos": 13008663175680.0, + "grad_norm": 1.914249541829808, + "language_loss": 0.82386243, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.90069962, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10748291, + "step": 9386, + "time_per_iteration": 2.4669172763824463 + }, + { + "auxiliary_loss_clip": 0.06427868, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277144, + "balance_loss_mlp": 0.01255014, + "epoch": 0.5643769727942282, + "flos": 18594256671360.0, + "grad_norm": 1.9656567849197715, + "language_loss": 0.70471108, + "learning_rate": 1.681420084607516e-06, + "loss": 0.78165275, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.112854, + "step": 9387, + "time_per_iteration": 2.5076122283935547 + }, + { + "auxiliary_loss_clip": 0.0642679, + "auxiliary_loss_mlp": 0.01267525, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01255348, + "epoch": 0.5644370960468962, + "flos": 33815343853440.0, + "grad_norm": 1.4623673546412521, + "language_loss": 0.75064629, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.82758939, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.12176514, + "step": 9388, + "time_per_iteration": 2.651616096496582 + }, + { + "auxiliary_loss_clip": 0.06417996, + "auxiliary_loss_mlp": 0.01267245, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256892, + "epoch": 0.5644972192995641, + "flos": 21221249552640.0, + "grad_norm": 1.4874039445981817, + "language_loss": 0.82212514, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.89897752, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10357666, + "step": 9389, + "time_per_iteration": 2.5609359741210938 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01270818, + "balance_loss_clip": 0.06278206, + "balance_loss_mlp": 0.01258468, + "epoch": 0.5645573425522321, + "flos": 18593585838720.0, + "grad_norm": 2.1560569688057036, + "language_loss": 0.64486635, + "learning_rate": 1.680266672116467e-06, + "loss": 0.72183776, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12329102, + "step": 9390, + "time_per_iteration": 3.8905534744262695 + }, + { + "auxiliary_loss_clip": 0.06417844, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5646174658049, + "flos": 18119660745600.0, + "grad_norm": 1.743379462466535, + "language_loss": 0.92393249, + "learning_rate": 1.6798822255153192e-06, + "loss": 1.00077093, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10662842, + "step": 9391, + "time_per_iteration": 2.4846012592315674 + }, + { + "auxiliary_loss_clip": 0.06426747, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06274952, + "balance_loss_mlp": 0.0125751, + "epoch": 0.564677589057568, + "flos": 28337547035520.0, + "grad_norm": 2.079245602273352, + "language_loss": 0.60616773, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.68313313, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12286377, + "step": 9392, + "time_per_iteration": 2.5709118843078613 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01266956, + "balance_loss_clip": 0.06274032, + "balance_loss_mlp": 0.01255619, + "epoch": 0.564737712310236, + "flos": 22170273696000.0, + "grad_norm": 2.32400153493691, + "language_loss": 0.81762815, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8944844, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11334229, + "step": 9393, + "time_per_iteration": 2.49820613861084 + }, + { + "auxiliary_loss_clip": 0.06420048, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01257434, + "epoch": 0.564797835562904, + "flos": 20965223802240.0, + "grad_norm": 1.8189771095125196, + "language_loss": 0.87738705, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.95427704, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11523438, + "step": 9394, + "time_per_iteration": 2.5385193824768066 + }, + { + "auxiliary_loss_clip": 0.06421189, + "auxiliary_loss_mlp": 0.01271733, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01261135, + "epoch": 0.5648579588155719, + "flos": 17425991520000.0, + "grad_norm": 1.7000053900358165, + "language_loss": 0.84579873, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.92272794, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1060791, + "step": 9395, + "time_per_iteration": 2.470017433166504 + }, + { + "auxiliary_loss_clip": 0.06326792, + "auxiliary_loss_mlp": 0.01253109, + "balance_loss_clip": 0.06265698, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5649180820682399, + "flos": 69951187152000.0, + "grad_norm": 0.7657809500788333, + "language_loss": 0.57918489, + "learning_rate": 1.677960174884597e-06, + "loss": 0.65498388, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.61132812, + "router_z_loss_mlp": 0.01573944, + "step": 9396, + "time_per_iteration": 3.1468727588653564 + }, + { + "auxiliary_loss_clip": 0.06423569, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5649782053209078, + "flos": 24980058259200.0, + "grad_norm": 1.9294071175656426, + "language_loss": 0.70135093, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.77826023, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11157227, + "step": 9397, + "time_per_iteration": 2.5551769733428955 + }, + { + "auxiliary_loss_clip": 0.06421924, + "auxiliary_loss_mlp": 0.01267113, + "balance_loss_clip": 0.06274733, + "balance_loss_mlp": 0.01256277, + "epoch": 0.5650383285735758, + "flos": 21733175272320.0, + "grad_norm": 3.1535749018048094, + "language_loss": 0.67165595, + "learning_rate": 1.67719144001275e-06, + "loss": 0.74854636, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10839844, + "step": 9398, + "time_per_iteration": 2.5690701007843018 + }, + { + "auxiliary_loss_clip": 0.06324084, + "auxiliary_loss_mlp": 0.01251867, + "balance_loss_clip": 0.06263297, + "balance_loss_mlp": 0.01250375, + "epoch": 0.5650984518262439, + "flos": 65923481093760.0, + "grad_norm": 0.7518933539640298, + "language_loss": 0.58143103, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.65719062, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01491547, + "step": 9399, + "time_per_iteration": 3.073493719100952 + }, + { + "auxiliary_loss_clip": 0.06425194, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06275368, + "balance_loss_mlp": 0.01257158, + "epoch": 0.5651585750789118, + "flos": 21038919068160.0, + "grad_norm": 2.9284187471842213, + "language_loss": 0.73483676, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.8117801, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.11987305, + "step": 9400, + "time_per_iteration": 2.5129287242889404 + }, + { + "auxiliary_loss_clip": 0.06431332, + "auxiliary_loss_mlp": 0.01270587, + "balance_loss_clip": 0.06281202, + "balance_loss_mlp": 0.01258267, + "epoch": 0.5652186983315798, + "flos": 18557891199360.0, + "grad_norm": 1.781312568353633, + "language_loss": 0.61062682, + "learning_rate": 1.676038429548412e-06, + "loss": 0.68764603, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12322998, + "step": 9401, + "time_per_iteration": 2.484562397003174 + }, + { + "auxiliary_loss_clip": 0.06419288, + "auxiliary_loss_mlp": 0.01272594, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01261859, + "epoch": 0.5652788215842477, + "flos": 18484573276800.0, + "grad_norm": 1.8682667341725439, + "language_loss": 0.81175613, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.88867497, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10736084, + "step": 9402, + "time_per_iteration": 2.5402467250823975 + }, + { + "auxiliary_loss_clip": 0.0641814, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.0125898, + "epoch": 0.5653389448369157, + "flos": 30051797391360.0, + "grad_norm": 1.3435358668606565, + "language_loss": 0.77710259, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.85398287, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10900879, + "step": 9403, + "time_per_iteration": 2.5728204250335693 + }, + { + "auxiliary_loss_clip": 0.06421928, + "auxiliary_loss_mlp": 0.01268633, + "balance_loss_clip": 0.06276687, + "balance_loss_mlp": 0.01257458, + "epoch": 0.5653990680895836, + "flos": 16733202762240.0, + "grad_norm": 1.6255859835861872, + "language_loss": 0.69364876, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.7705543, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11187744, + "step": 9404, + "time_per_iteration": 2.5076894760131836 + }, + { + "auxiliary_loss_clip": 0.06414986, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01258156, + "epoch": 0.5654591913422516, + "flos": 14543517939840.0, + "grad_norm": 1.937007916536723, + "language_loss": 0.6753332, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.75217164, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 9405, + "time_per_iteration": 2.4678986072540283 + }, + { + "auxiliary_loss_clip": 0.06417301, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.0627932, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5655193145949196, + "flos": 26216484307200.0, + "grad_norm": 1.7078210782531607, + "language_loss": 0.74488431, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.82174826, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10180664, + "step": 9406, + "time_per_iteration": 2.5344419479370117 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06274547, + "balance_loss_mlp": 0.01258101, + "epoch": 0.5655794378475876, + "flos": 25053669671040.0, + "grad_norm": 1.6572482823915473, + "language_loss": 0.80165344, + "learning_rate": 1.673732740698882e-06, + "loss": 0.87858582, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11901855, + "step": 9407, + "time_per_iteration": 2.5318515300750732 + }, + { + "auxiliary_loss_clip": 0.06414818, + "auxiliary_loss_mlp": 0.01281674, + "balance_loss_clip": 0.06276679, + "balance_loss_mlp": 0.01270641, + "epoch": 0.5656395611002555, + "flos": 31041379710720.0, + "grad_norm": 1.3106223538314048, + "language_loss": 0.71445584, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.79142082, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1104126, + "step": 9408, + "time_per_iteration": 2.6315321922302246 + }, + { + "auxiliary_loss_clip": 0.06416275, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06275165, + "balance_loss_mlp": 0.01262151, + "epoch": 0.5656996843529235, + "flos": 20235650302080.0, + "grad_norm": 1.8647463769564316, + "language_loss": 0.81496549, + "learning_rate": 1.672964276570308e-06, + "loss": 0.89185899, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.109375, + "step": 9409, + "time_per_iteration": 2.4874367713928223 + }, + { + "auxiliary_loss_clip": 0.06420213, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01261953, + "epoch": 0.5657598076055914, + "flos": 21002595523200.0, + "grad_norm": 1.5982364261864173, + "language_loss": 0.78488803, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.86182165, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11187744, + "step": 9410, + "time_per_iteration": 2.568018913269043 + }, + { + "auxiliary_loss_clip": 0.06420635, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06277133, + "balance_loss_mlp": 0.01258607, + "epoch": 0.5658199308582594, + "flos": 11550690132480.0, + "grad_norm": 1.9303419986806551, + "language_loss": 0.83679706, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.91369963, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11016846, + "step": 9411, + "time_per_iteration": 2.4616551399230957 + }, + { + "auxiliary_loss_clip": 0.06428169, + "auxiliary_loss_mlp": 0.01269272, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01256725, + "epoch": 0.5658800541109275, + "flos": 14177137962240.0, + "grad_norm": 2.370687982223235, + "language_loss": 0.67829227, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.75526661, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12548828, + "step": 9412, + "time_per_iteration": 2.5216641426086426 + }, + { + "auxiliary_loss_clip": 0.06415425, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06277046, + "balance_loss_mlp": 0.01258488, + "epoch": 0.5659401773635954, + "flos": 27311934660480.0, + "grad_norm": 1.581889394574198, + "language_loss": 0.58742762, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.6642642, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09741211, + "step": 9413, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.06415551, + "auxiliary_loss_mlp": 0.01265095, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01254294, + "epoch": 0.5660003006162634, + "flos": 16733957448960.0, + "grad_norm": 2.47913455673049, + "language_loss": 0.69196904, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.76877546, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10791016, + "step": 9414, + "time_per_iteration": 3.924028158187866 + }, + { + "auxiliary_loss_clip": 0.0641676, + "auxiliary_loss_mlp": 0.01269168, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01258475, + "epoch": 0.5660604238689313, + "flos": 21659983130880.0, + "grad_norm": 1.6269222060357784, + "language_loss": 0.78177273, + "learning_rate": 1.670659182280247e-06, + "loss": 0.85863203, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10693359, + "step": 9415, + "time_per_iteration": 2.5426433086395264 + }, + { + "auxiliary_loss_clip": 0.06321331, + "auxiliary_loss_mlp": 0.01255911, + "balance_loss_clip": 0.06260875, + "balance_loss_mlp": 0.01254426, + "epoch": 0.5661205471215993, + "flos": 68843619884160.0, + "grad_norm": 0.6697066651048145, + "language_loss": 0.48973382, + "learning_rate": 1.670275043523822e-06, + "loss": 0.56550622, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0148468, + "step": 9416, + "time_per_iteration": 3.2625491619110107 + }, + { + "auxiliary_loss_clip": 0.06421995, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06277312, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5661806703742672, + "flos": 28629393206400.0, + "grad_norm": 1.9136616805420137, + "language_loss": 0.63439846, + "learning_rate": 1.6698909172706e-06, + "loss": 0.7112996, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11706543, + "step": 9417, + "time_per_iteration": 2.5860400199890137 + }, + { + "auxiliary_loss_clip": 0.06423697, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5662407936269352, + "flos": 21404418577920.0, + "grad_norm": 2.3766145169256485, + "language_loss": 0.6936692, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.77059871, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.1184082, + "step": 9418, + "time_per_iteration": 3.955557346343994 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06276925, + "balance_loss_mlp": 0.01255261, + "epoch": 0.5663009168796032, + "flos": 25666054836480.0, + "grad_norm": 1.7349550199621107, + "language_loss": 0.65210938, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.72899818, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.12219238, + "step": 9419, + "time_per_iteration": 2.5426688194274902 + }, + { + "auxiliary_loss_clip": 0.06328249, + "auxiliary_loss_mlp": 0.01252694, + "balance_loss_clip": 0.06267616, + "balance_loss_mlp": 0.01251344, + "epoch": 0.5663610401322712, + "flos": 67953014835840.0, + "grad_norm": 0.7058455662611458, + "language_loss": 0.59640646, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.67221588, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01351929, + "step": 9420, + "time_per_iteration": 3.2174880504608154 + }, + { + "auxiliary_loss_clip": 0.064177, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255235, + "epoch": 0.5664211633849391, + "flos": 24616487393280.0, + "grad_norm": 1.6106095517088517, + "language_loss": 0.74370563, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.82053804, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10308838, + "step": 9421, + "time_per_iteration": 2.5415146350860596 + }, + { + "auxiliary_loss_clip": 0.06425875, + "auxiliary_loss_mlp": 0.0127111, + "balance_loss_clip": 0.0627939, + "balance_loss_mlp": 0.0125941, + "epoch": 0.5664812866376071, + "flos": 11652407389440.0, + "grad_norm": 1.8136120935488778, + "language_loss": 0.73536521, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.81233501, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11700439, + "step": 9422, + "time_per_iteration": 2.4822769165039062 + }, + { + "auxiliary_loss_clip": 0.06420115, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06278713, + "balance_loss_mlp": 0.01253355, + "epoch": 0.566541409890275, + "flos": 24650798440320.0, + "grad_norm": 1.7038149529307767, + "language_loss": 0.8178972, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.89473832, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10656738, + "step": 9423, + "time_per_iteration": 4.039041519165039 + }, + { + "auxiliary_loss_clip": 0.06420702, + "auxiliary_loss_mlp": 0.01272474, + "balance_loss_clip": 0.06276573, + "balance_loss_mlp": 0.01260392, + "epoch": 0.566601533142943, + "flos": 22276686781440.0, + "grad_norm": 2.1916345423108092, + "language_loss": 0.81182116, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.88875294, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.12091064, + "step": 9424, + "time_per_iteration": 2.6186363697052 + }, + { + "auxiliary_loss_clip": 0.06424181, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01254788, + "epoch": 0.5666616563956111, + "flos": 29979485717760.0, + "grad_norm": 1.8421028893936136, + "language_loss": 0.79108143, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.86799419, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1229248, + "step": 9425, + "time_per_iteration": 2.6103405952453613 + }, + { + "auxiliary_loss_clip": 0.06423585, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06280398, + "balance_loss_mlp": 0.01254958, + "epoch": 0.566721779648279, + "flos": 17786585566080.0, + "grad_norm": 1.8792171756054583, + "language_loss": 0.59002221, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.66692609, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11853027, + "step": 9426, + "time_per_iteration": 2.5017449855804443 + }, + { + "auxiliary_loss_clip": 0.06425668, + "auxiliary_loss_mlp": 0.01271587, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01259881, + "epoch": 0.566781902900947, + "flos": 21039967244160.0, + "grad_norm": 1.8634987355301997, + "language_loss": 0.82228333, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.89925593, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1171875, + "step": 9427, + "time_per_iteration": 2.565479040145874 + }, + { + "auxiliary_loss_clip": 0.06418218, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257853, + "epoch": 0.5668420261536149, + "flos": 23155244040960.0, + "grad_norm": 1.8170517561621367, + "language_loss": 0.86107284, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.93794018, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10662842, + "step": 9428, + "time_per_iteration": 2.5440726280212402 + }, + { + "auxiliary_loss_clip": 0.06425078, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01254549, + "epoch": 0.5669021494062829, + "flos": 22608210660480.0, + "grad_norm": 1.979218692390264, + "language_loss": 0.74058932, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.81750262, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11700439, + "step": 9429, + "time_per_iteration": 2.5536460876464844 + }, + { + "auxiliary_loss_clip": 0.06425272, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06277645, + "balance_loss_mlp": 0.01252943, + "epoch": 0.5669622726589508, + "flos": 17386481520000.0, + "grad_norm": 1.7940156011993331, + "language_loss": 0.75663137, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.8335309, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11724854, + "step": 9430, + "time_per_iteration": 3.9432384967803955 + }, + { + "auxiliary_loss_clip": 0.06418042, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01254498, + "epoch": 0.5670223959116188, + "flos": 18767992112640.0, + "grad_norm": 1.7725274526585868, + "language_loss": 0.73046589, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.80729836, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10705566, + "step": 9431, + "time_per_iteration": 2.4891881942749023 + }, + { + "auxiliary_loss_clip": 0.06413169, + "auxiliary_loss_mlp": 0.01269495, + "balance_loss_clip": 0.06278919, + "balance_loss_mlp": 0.0125907, + "epoch": 0.5670825191642868, + "flos": 13558463740800.0, + "grad_norm": 1.5232840780961514, + "language_loss": 0.7352109, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.81203753, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10424805, + "step": 9432, + "time_per_iteration": 2.539503812789917 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.0627542, + "balance_loss_mlp": 0.01254914, + "epoch": 0.5671426424169548, + "flos": 22060506447360.0, + "grad_norm": 1.4799006758092328, + "language_loss": 0.78516906, + "learning_rate": 1.663746609539197e-06, + "loss": 0.86202025, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11169434, + "step": 9433, + "time_per_iteration": 2.5004031658172607 + }, + { + "auxiliary_loss_clip": 0.06427075, + "auxiliary_loss_mlp": 0.01270712, + "balance_loss_clip": 0.06279536, + "balance_loss_mlp": 0.01257569, + "epoch": 0.5672027656696227, + "flos": 21330262114560.0, + "grad_norm": 1.7709414309866778, + "language_loss": 0.63719839, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.71417624, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.13134766, + "step": 9434, + "time_per_iteration": 2.5424575805664062 + }, + { + "auxiliary_loss_clip": 0.06413743, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274401, + "balance_loss_mlp": 0.01257188, + "epoch": 0.5672628889222907, + "flos": 23520869331840.0, + "grad_norm": 1.9335938837076005, + "language_loss": 0.66754067, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.74435163, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10162354, + "step": 9435, + "time_per_iteration": 2.5177414417266846 + }, + { + "auxiliary_loss_clip": 0.06416117, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.01257333, + "epoch": 0.5673230121749586, + "flos": 27128639854080.0, + "grad_norm": 1.3319121805553942, + "language_loss": 0.71799958, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.79484463, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11053467, + "step": 9436, + "time_per_iteration": 2.6037702560424805 + }, + { + "auxiliary_loss_clip": 0.06424177, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276658, + "balance_loss_mlp": 0.01254548, + "epoch": 0.5673831354276266, + "flos": 31150476126720.0, + "grad_norm": 1.399584944388347, + "language_loss": 0.7441892, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.82109791, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12145996, + "step": 9437, + "time_per_iteration": 2.5982627868652344 + }, + { + "auxiliary_loss_clip": 0.0642609, + "auxiliary_loss_mlp": 0.01270521, + "balance_loss_clip": 0.06280209, + "balance_loss_mlp": 0.01258719, + "epoch": 0.5674432586802945, + "flos": 27680662552320.0, + "grad_norm": 1.8153515221603815, + "language_loss": 0.61647224, + "learning_rate": 1.661827179985277e-06, + "loss": 0.69343835, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11798096, + "step": 9438, + "time_per_iteration": 2.6188385486602783 + }, + { + "auxiliary_loss_clip": 0.0642384, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01252935, + "epoch": 0.5675033819329626, + "flos": 26622458138880.0, + "grad_norm": 1.4984637138093548, + "language_loss": 0.75628054, + "learning_rate": 1.661443332486909e-06, + "loss": 0.83315879, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11053467, + "step": 9439, + "time_per_iteration": 2.5383174419403076 + }, + { + "auxiliary_loss_clip": 0.06420992, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06280455, + "balance_loss_mlp": 0.0125798, + "epoch": 0.5675635051856306, + "flos": 19104295674240.0, + "grad_norm": 1.7526345830300347, + "language_loss": 0.8402319, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.91714221, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1206665, + "step": 9440, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06425986, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06275898, + "balance_loss_mlp": 0.01255393, + "epoch": 0.5676236284382985, + "flos": 17572040386560.0, + "grad_norm": 2.304829714160468, + "language_loss": 0.75825876, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.83519483, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12231445, + "step": 9441, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.0641818, + "auxiliary_loss_mlp": 0.0126441, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5676837516909665, + "flos": 15958375257600.0, + "grad_norm": 1.9240949658540871, + "language_loss": 0.83086008, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.907686, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9442, + "time_per_iteration": 2.53488826751709 + }, + { + "auxiliary_loss_clip": 0.06416862, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279622, + "balance_loss_mlp": 0.01258543, + "epoch": 0.5677438749436344, + "flos": 18301739667840.0, + "grad_norm": 1.8387898612646743, + "language_loss": 0.74695265, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.82381237, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10571289, + "step": 9443, + "time_per_iteration": 2.4844577312469482 + }, + { + "auxiliary_loss_clip": 0.06418682, + "auxiliary_loss_mlp": 0.01270397, + "balance_loss_clip": 0.06275757, + "balance_loss_mlp": 0.01258947, + "epoch": 0.5678039981963025, + "flos": 17937120625920.0, + "grad_norm": 2.224999400227568, + "language_loss": 0.77901411, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.85590482, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11450195, + "step": 9444, + "time_per_iteration": 2.5525596141815186 + }, + { + "auxiliary_loss_clip": 0.06428226, + "auxiliary_loss_mlp": 0.01266607, + "balance_loss_clip": 0.06281613, + "balance_loss_mlp": 0.01255228, + "epoch": 0.5678641214489704, + "flos": 19322153089920.0, + "grad_norm": 1.7258632756557413, + "language_loss": 0.81218302, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.88913137, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11376953, + "step": 9445, + "time_per_iteration": 2.501241683959961 + }, + { + "auxiliary_loss_clip": 0.06419222, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06275924, + "balance_loss_mlp": 0.01255548, + "epoch": 0.5679242447016384, + "flos": 27759389063040.0, + "grad_norm": 1.2498061463372896, + "language_loss": 0.71243447, + "learning_rate": 1.658756760280259e-06, + "loss": 0.78928661, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10443115, + "step": 9446, + "time_per_iteration": 2.6276121139526367 + }, + { + "auxiliary_loss_clip": 0.06425235, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06276199, + "balance_loss_mlp": 0.01257277, + "epoch": 0.5679843679543063, + "flos": 23775888833280.0, + "grad_norm": 1.7407480451238082, + "language_loss": 0.73674792, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.81369138, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11828613, + "step": 9447, + "time_per_iteration": 2.5189285278320312 + }, + { + "auxiliary_loss_clip": 0.06428251, + "auxiliary_loss_mlp": 0.01269652, + "balance_loss_clip": 0.06280248, + "balance_loss_mlp": 0.01257272, + "epoch": 0.5680444912069743, + "flos": 25598732480640.0, + "grad_norm": 1.8734928972182148, + "language_loss": 0.75381124, + "learning_rate": 1.657989284462725e-06, + "loss": 0.83079028, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1239624, + "step": 9448, + "time_per_iteration": 2.5984859466552734 + }, + { + "auxiliary_loss_clip": 0.06428179, + "auxiliary_loss_mlp": 0.01269794, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01258415, + "epoch": 0.5681046144596422, + "flos": 23702528983680.0, + "grad_norm": 2.0524228921166556, + "language_loss": 0.76618403, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.84316373, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1137085, + "step": 9449, + "time_per_iteration": 2.515456438064575 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5681647377123102, + "flos": 28008161435520.0, + "grad_norm": 1.4260887566171934, + "language_loss": 0.74914038, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.82607877, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11425781, + "step": 9450, + "time_per_iteration": 2.5997612476348877 + }, + { + "auxiliary_loss_clip": 0.06425043, + "auxiliary_loss_mlp": 0.01263493, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01252526, + "epoch": 0.5682248609649782, + "flos": 22754427235200.0, + "grad_norm": 1.6712621343134006, + "language_loss": 0.66650134, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.74338675, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10974121, + "step": 9451, + "time_per_iteration": 2.5041069984436035 + }, + { + "auxiliary_loss_clip": 0.06437647, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06282589, + "balance_loss_mlp": 0.01255126, + "epoch": 0.5682849842176462, + "flos": 21295070599680.0, + "grad_norm": 1.8399857372619135, + "language_loss": 0.72354877, + "learning_rate": 1.656454488573026e-06, + "loss": 0.80061138, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1348877, + "step": 9452, + "time_per_iteration": 2.529772996902466 + }, + { + "auxiliary_loss_clip": 0.06419612, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5683451074703142, + "flos": 21147973557120.0, + "grad_norm": 1.3918203076927713, + "language_loss": 0.70862073, + "learning_rate": 1.656070822132428e-06, + "loss": 0.78546906, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11419678, + "step": 9453, + "time_per_iteration": 3.975252151489258 + }, + { + "auxiliary_loss_clip": 0.06420393, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255001, + "epoch": 0.5684052307229821, + "flos": 22350759390720.0, + "grad_norm": 1.7444047953592532, + "language_loss": 0.70346195, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.78032023, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10443115, + "step": 9454, + "time_per_iteration": 2.530397415161133 + }, + { + "auxiliary_loss_clip": 0.06417777, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06276377, + "balance_loss_mlp": 0.01255572, + "epoch": 0.5684653539756501, + "flos": 21805067675520.0, + "grad_norm": 2.3221034941278256, + "language_loss": 0.6090889, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.68592238, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10003662, + "step": 9455, + "time_per_iteration": 2.5284998416900635 + }, + { + "auxiliary_loss_clip": 0.06432047, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06281373, + "balance_loss_mlp": 0.01259144, + "epoch": 0.568525477228318, + "flos": 23005757157120.0, + "grad_norm": 1.7024948062012655, + "language_loss": 0.73315781, + "learning_rate": 1.6549199011198e-06, + "loss": 0.81018651, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11669922, + "step": 9456, + "time_per_iteration": 2.5266809463500977 + }, + { + "auxiliary_loss_clip": 0.06419168, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_clip": 0.06275652, + "balance_loss_mlp": 0.01254771, + "epoch": 0.568585600480986, + "flos": 21398045667840.0, + "grad_norm": 1.7476092517075434, + "language_loss": 0.77197653, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.84882128, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10534668, + "step": 9457, + "time_per_iteration": 2.6098482608795166 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01255969, + "epoch": 0.568645723733654, + "flos": 30015054576000.0, + "grad_norm": 1.8479320449106564, + "language_loss": 0.6697377, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.74666172, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11749268, + "step": 9458, + "time_per_iteration": 4.003401756286621 + }, + { + "auxiliary_loss_clip": 0.06424286, + "auxiliary_loss_mlp": 0.01264614, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01253295, + "epoch": 0.568705846986322, + "flos": 20418945108480.0, + "grad_norm": 2.1992346625709427, + "language_loss": 0.68311954, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.76000857, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11315918, + "step": 9459, + "time_per_iteration": 2.5213470458984375 + }, + { + "auxiliary_loss_clip": 0.06427266, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01253756, + "epoch": 0.5687659702389899, + "flos": 17462440846080.0, + "grad_norm": 2.588089844490271, + "language_loss": 0.77003014, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.84695148, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11102295, + "step": 9460, + "time_per_iteration": 2.5016860961914062 + }, + { + "auxiliary_loss_clip": 0.06424034, + "auxiliary_loss_mlp": 0.01270464, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258865, + "epoch": 0.5688260934916579, + "flos": 25412335073280.0, + "grad_norm": 1.5686079353810067, + "language_loss": 0.72504562, + "learning_rate": 1.65300196133547e-06, + "loss": 0.80199063, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9461, + "time_per_iteration": 2.652650833129883 + }, + { + "auxiliary_loss_clip": 0.06420281, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06276302, + "balance_loss_mlp": 0.01254707, + "epoch": 0.5688862167443258, + "flos": 21613052044800.0, + "grad_norm": 1.8456676032626356, + "language_loss": 0.73588586, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.81274414, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10839844, + "step": 9462, + "time_per_iteration": 3.9915239810943604 + }, + { + "auxiliary_loss_clip": 0.06414893, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06275715, + "balance_loss_mlp": 0.01254715, + "epoch": 0.5689463399969938, + "flos": 22425544759680.0, + "grad_norm": 2.0067901163228212, + "language_loss": 0.72924364, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.80604076, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10107422, + "step": 9463, + "time_per_iteration": 2.5026743412017822 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01266249, + "balance_loss_clip": 0.06272251, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5690064632496618, + "flos": 18302787843840.0, + "grad_norm": 1.7796234570298675, + "language_loss": 0.7436375, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.82046998, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11126709, + "step": 9464, + "time_per_iteration": 2.5418522357940674 + }, + { + "auxiliary_loss_clip": 0.06420638, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01253169, + "epoch": 0.5690665865023298, + "flos": 21585575105280.0, + "grad_norm": 1.531985348456469, + "language_loss": 0.84518385, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.92204237, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12060547, + "step": 9465, + "time_per_iteration": 2.501640558242798 + }, + { + "auxiliary_loss_clip": 0.06416291, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01251954, + "epoch": 0.5691267097549978, + "flos": 24427616290560.0, + "grad_norm": 1.5399864144711508, + "language_loss": 0.72636294, + "learning_rate": 1.651084350506125e-06, + "loss": 0.80315161, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10638428, + "step": 9466, + "time_per_iteration": 2.5872812271118164 + }, + { + "auxiliary_loss_clip": 0.06322309, + "auxiliary_loss_mlp": 0.01252779, + "balance_loss_clip": 0.06261392, + "balance_loss_mlp": 0.01251253, + "epoch": 0.5691868330076657, + "flos": 61679915389440.0, + "grad_norm": 0.706168287542021, + "language_loss": 0.55225098, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.62800181, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01525879, + "step": 9467, + "time_per_iteration": 3.1809115409851074 + }, + { + "auxiliary_loss_clip": 0.06421535, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06275938, + "balance_loss_mlp": 0.01253471, + "epoch": 0.5692469562603337, + "flos": 21331687633920.0, + "grad_norm": 1.821723086609738, + "language_loss": 0.64103729, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.717906, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11865234, + "step": 9468, + "time_per_iteration": 2.5419483184814453 + }, + { + "auxiliary_loss_clip": 0.06420718, + "auxiliary_loss_mlp": 0.01268612, + "balance_loss_clip": 0.06276828, + "balance_loss_mlp": 0.01257508, + "epoch": 0.5693070795130016, + "flos": 23374652757120.0, + "grad_norm": 2.0216455322076885, + "language_loss": 0.79510915, + "learning_rate": 1.64993394266317e-06, + "loss": 0.87200236, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11102295, + "step": 9469, + "time_per_iteration": 3.974965810775757 + }, + { + "auxiliary_loss_clip": 0.06424933, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275818, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5693672027656697, + "flos": 18703143452160.0, + "grad_norm": 1.8253898689046395, + "language_loss": 0.69934285, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.77626961, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11608887, + "step": 9470, + "time_per_iteration": 2.490144729614258 + }, + { + "auxiliary_loss_clip": 0.06418116, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254391, + "epoch": 0.5694273260183376, + "flos": 20455478288640.0, + "grad_norm": 2.1472118271494574, + "language_loss": 0.75247335, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.82931614, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11785889, + "step": 9471, + "time_per_iteration": 2.5518500804901123 + }, + { + "auxiliary_loss_clip": 0.06417546, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01255616, + "epoch": 0.5694874492710056, + "flos": 17608992837120.0, + "grad_norm": 1.6827496814774499, + "language_loss": 0.57877314, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.65561181, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10705566, + "step": 9472, + "time_per_iteration": 2.535846710205078 + }, + { + "auxiliary_loss_clip": 0.06416848, + "auxiliary_loss_mlp": 0.01268789, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.01257411, + "epoch": 0.5695475725236735, + "flos": 13375923621120.0, + "grad_norm": 1.7815747768820038, + "language_loss": 0.73987466, + "learning_rate": 1.648400251450638e-06, + "loss": 0.81673104, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11376953, + "step": 9473, + "time_per_iteration": 2.4858133792877197 + }, + { + "auxiliary_loss_clip": 0.06327727, + "auxiliary_loss_mlp": 0.01252353, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01250914, + "epoch": 0.5696076957763415, + "flos": 68195078881920.0, + "grad_norm": 0.6484051468543478, + "language_loss": 0.57388628, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.64968711, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01437378, + "step": 9474, + "time_per_iteration": 3.1554436683654785 + }, + { + "auxiliary_loss_clip": 0.06415011, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06274811, + "balance_loss_mlp": 0.01254111, + "epoch": 0.5696678190290094, + "flos": 33846636153600.0, + "grad_norm": 1.6105466561987234, + "language_loss": 0.54358017, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.62037987, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 9475, + "time_per_iteration": 2.6193020343780518 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.01252234, + "epoch": 0.5697279422816774, + "flos": 26363329787520.0, + "grad_norm": 2.008545727860435, + "language_loss": 0.79765999, + "learning_rate": 1.647250122983675e-06, + "loss": 0.87448931, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11383057, + "step": 9476, + "time_per_iteration": 2.543100595474243 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01260209, + "epoch": 0.5697880655343454, + "flos": 22937260844160.0, + "grad_norm": 1.735529425276041, + "language_loss": 0.66121185, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.73820853, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11407471, + "step": 9477, + "time_per_iteration": 2.5366005897521973 + }, + { + "auxiliary_loss_clip": 0.06423311, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06277082, + "balance_loss_mlp": 0.0125553, + "epoch": 0.5698481887870134, + "flos": 26768674713600.0, + "grad_norm": 1.6190739346076362, + "language_loss": 0.71115196, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.78804839, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1081543, + "step": 9478, + "time_per_iteration": 2.5513012409210205 + }, + { + "auxiliary_loss_clip": 0.06415288, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06277218, + "balance_loss_mlp": 0.01255718, + "epoch": 0.5699083120396814, + "flos": 15747729292800.0, + "grad_norm": 1.4794360727515914, + "language_loss": 0.69306439, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.76988363, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10913086, + "step": 9479, + "time_per_iteration": 2.5828471183776855 + }, + { + "auxiliary_loss_clip": 0.06413876, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.0125734, + "epoch": 0.5699684352923493, + "flos": 19543448522880.0, + "grad_norm": 1.5013072139655574, + "language_loss": 0.71621788, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.79303229, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10223389, + "step": 9480, + "time_per_iteration": 2.5247299671173096 + }, + { + "auxiliary_loss_clip": 0.06418922, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01255319, + "epoch": 0.5700285585450173, + "flos": 16258942252800.0, + "grad_norm": 4.885605743124815, + "language_loss": 0.72444856, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.80130869, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11779785, + "step": 9481, + "time_per_iteration": 2.508589506149292 + }, + { + "auxiliary_loss_clip": 0.06421519, + "auxiliary_loss_mlp": 0.01270221, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01258461, + "epoch": 0.5700886817976852, + "flos": 19871115114240.0, + "grad_norm": 1.897422682992244, + "language_loss": 0.78625083, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.86316824, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11761475, + "step": 9482, + "time_per_iteration": 2.5139269828796387 + }, + { + "auxiliary_loss_clip": 0.06417527, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01253242, + "epoch": 0.5701488050503533, + "flos": 23848452069120.0, + "grad_norm": 2.496783055499815, + "language_loss": 0.78338385, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.86019731, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 9483, + "time_per_iteration": 2.547522783279419 + }, + { + "auxiliary_loss_clip": 0.06420138, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06276282, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5702089283030212, + "flos": 23666457000960.0, + "grad_norm": 1.5289248173251733, + "language_loss": 0.81642497, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.89326739, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10424805, + "step": 9484, + "time_per_iteration": 2.546597719192505 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06277504, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5702690515556892, + "flos": 27898519968000.0, + "grad_norm": 1.8682928794178455, + "language_loss": 0.61101806, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.68790221, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11560059, + "step": 9485, + "time_per_iteration": 2.5931575298309326 + }, + { + "auxiliary_loss_clip": 0.06421611, + "auxiliary_loss_mlp": 0.01267401, + "balance_loss_clip": 0.06277725, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5703291748083571, + "flos": 24030698699520.0, + "grad_norm": 1.7282499785723824, + "language_loss": 0.65970731, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.73659742, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11645508, + "step": 9486, + "time_per_iteration": 2.546604871749878 + }, + { + "auxiliary_loss_clip": 0.06330933, + "auxiliary_loss_mlp": 0.01257137, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01255769, + "epoch": 0.5703892980610251, + "flos": 57044478067200.0, + "grad_norm": 0.6556389442355417, + "language_loss": 0.47978726, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.55566794, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01370239, + "step": 9487, + "time_per_iteration": 3.216449499130249 + }, + { + "auxiliary_loss_clip": 0.06419921, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.06275571, + "balance_loss_mlp": 0.01255212, + "epoch": 0.570449421313693, + "flos": 24357610604160.0, + "grad_norm": 1.4009858057112485, + "language_loss": 0.8597424, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.93660462, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11102295, + "step": 9488, + "time_per_iteration": 2.5608506202697754 + }, + { + "auxiliary_loss_clip": 0.06428364, + "auxiliary_loss_mlp": 0.01270308, + "balance_loss_clip": 0.06281118, + "balance_loss_mlp": 0.01259055, + "epoch": 0.570509544566361, + "flos": 24835770328320.0, + "grad_norm": 1.8825828159705935, + "language_loss": 0.79195142, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.86893809, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11260986, + "step": 9489, + "time_per_iteration": 2.553471088409424 + }, + { + "auxiliary_loss_clip": 0.06419341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06276694, + "balance_loss_mlp": 0.01253646, + "epoch": 0.570569667819029, + "flos": 21403663891200.0, + "grad_norm": 1.6360729178743676, + "language_loss": 0.7047472, + "learning_rate": 1.641884454927604e-06, + "loss": 0.78158057, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10345459, + "step": 9490, + "time_per_iteration": 2.5905275344848633 + }, + { + "auxiliary_loss_clip": 0.06421432, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06279342, + "balance_loss_mlp": 0.01257803, + "epoch": 0.570629791071697, + "flos": 23222608323840.0, + "grad_norm": 1.4492809017584538, + "language_loss": 0.76252091, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.83942628, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11291504, + "step": 9491, + "time_per_iteration": 2.523472309112549 + }, + { + "auxiliary_loss_clip": 0.06328943, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06268945, + "balance_loss_mlp": 0.01261694, + "epoch": 0.570689914324365, + "flos": 65303632915200.0, + "grad_norm": 0.7890932915341226, + "language_loss": 0.57371008, + "learning_rate": 1.641118147266011e-06, + "loss": 0.64963466, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01817322, + "step": 9492, + "time_per_iteration": 4.556811571121216 + }, + { + "auxiliary_loss_clip": 0.06420883, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.0627829, + "balance_loss_mlp": 0.01255809, + "epoch": 0.5707500375770329, + "flos": 21148225119360.0, + "grad_norm": 2.4823752626433357, + "language_loss": 0.71714401, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.79402137, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.1104126, + "step": 9493, + "time_per_iteration": 2.5404999256134033 + }, + { + "auxiliary_loss_clip": 0.06425234, + "auxiliary_loss_mlp": 0.01270244, + "balance_loss_clip": 0.06277438, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5708101608297009, + "flos": 20818881446400.0, + "grad_norm": 1.6649189140980358, + "language_loss": 0.77940559, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.85636032, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11431885, + "step": 9494, + "time_per_iteration": 2.5486340522766113 + }, + { + "auxiliary_loss_clip": 0.06427161, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276955, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5708702840823688, + "flos": 25819482862080.0, + "grad_norm": 2.058789415113096, + "language_loss": 0.80377084, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.88071406, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12127686, + "step": 9495, + "time_per_iteration": 2.5960187911987305 + }, + { + "auxiliary_loss_clip": 0.06429706, + "auxiliary_loss_mlp": 0.01275013, + "balance_loss_clip": 0.06277497, + "balance_loss_mlp": 0.01261567, + "epoch": 0.5709304073350369, + "flos": 23657400760320.0, + "grad_norm": 1.9375866549540641, + "language_loss": 0.66475153, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.74179876, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13446045, + "step": 9496, + "time_per_iteration": 2.536844253540039 + }, + { + "auxiliary_loss_clip": 0.06424591, + "auxiliary_loss_mlp": 0.0126837, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256485, + "epoch": 0.5709905305877048, + "flos": 16113144948480.0, + "grad_norm": 2.1097086993227068, + "language_loss": 0.70119512, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.77812475, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11877441, + "step": 9497, + "time_per_iteration": 2.5001566410064697 + }, + { + "auxiliary_loss_clip": 0.06421457, + "auxiliary_loss_mlp": 0.01273203, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01261455, + "epoch": 0.5710506538403728, + "flos": 24757211525760.0, + "grad_norm": 5.203790092819982, + "language_loss": 0.81695306, + "learning_rate": 1.638819551358182e-06, + "loss": 0.89389962, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11755371, + "step": 9498, + "time_per_iteration": 3.979785203933716 + }, + { + "auxiliary_loss_clip": 0.06421061, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.0125707, + "epoch": 0.5711107770930407, + "flos": 21988907533440.0, + "grad_norm": 1.778867640796668, + "language_loss": 0.66763413, + "learning_rate": 1.638436499891469e-06, + "loss": 0.74453306, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 9499, + "time_per_iteration": 2.560131788253784 + }, + { + "auxiliary_loss_clip": 0.06422064, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255432, + "epoch": 0.5711709003457087, + "flos": 19580233265280.0, + "grad_norm": 1.5461706893268885, + "language_loss": 0.71884078, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.79573303, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11743164, + "step": 9500, + "time_per_iteration": 2.51857852935791 + }, + { + "auxiliary_loss_clip": 0.06426705, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06277592, + "balance_loss_mlp": 0.01257893, + "epoch": 0.5712310235983766, + "flos": 24249436583040.0, + "grad_norm": 1.9132916799477426, + "language_loss": 0.76773643, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.8447088, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.12640381, + "step": 9501, + "time_per_iteration": 2.585303544998169 + }, + { + "auxiliary_loss_clip": 0.06424866, + "auxiliary_loss_mlp": 0.01265647, + "balance_loss_clip": 0.06278552, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5712911468510447, + "flos": 21002469742080.0, + "grad_norm": 1.6366629976038132, + "language_loss": 0.75004148, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.82694662, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11419678, + "step": 9502, + "time_per_iteration": 3.9893364906311035 + }, + { + "auxiliary_loss_clip": 0.06420161, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0627653, + "balance_loss_mlp": 0.01256561, + "epoch": 0.5713512701037126, + "flos": 18923055292800.0, + "grad_norm": 1.7156142062685982, + "language_loss": 0.82350051, + "learning_rate": 1.636904431275105e-06, + "loss": 0.90037596, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10827637, + "step": 9503, + "time_per_iteration": 2.5289459228515625 + }, + { + "auxiliary_loss_clip": 0.06420251, + "auxiliary_loss_mlp": 0.01271521, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5714113933563806, + "flos": 17417983455360.0, + "grad_norm": 2.1350982520901827, + "language_loss": 0.86264861, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.93956631, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1114502, + "step": 9504, + "time_per_iteration": 2.5180015563964844 + }, + { + "auxiliary_loss_clip": 0.06417073, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5714715166090486, + "flos": 20199536392320.0, + "grad_norm": 2.0316869593340265, + "language_loss": 0.75480437, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.83164144, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10980225, + "step": 9505, + "time_per_iteration": 2.497009754180908 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256419, + "epoch": 0.5715316398617165, + "flos": 18557597710080.0, + "grad_norm": 1.6474042198541896, + "language_loss": 0.82215714, + "learning_rate": 1.635755524332509e-06, + "loss": 0.89902395, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1151123, + "step": 9506, + "time_per_iteration": 2.5657498836517334 + }, + { + "auxiliary_loss_clip": 0.06418438, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01254568, + "epoch": 0.5715917631143845, + "flos": 18484028225280.0, + "grad_norm": 1.482727560680873, + "language_loss": 0.77285796, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.84969354, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10546875, + "step": 9507, + "time_per_iteration": 2.485496997833252 + }, + { + "auxiliary_loss_clip": 0.06422855, + "auxiliary_loss_mlp": 0.01269089, + "balance_loss_clip": 0.06276034, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5716518863670524, + "flos": 24026128652160.0, + "grad_norm": 1.4323391248104125, + "language_loss": 0.68799454, + "learning_rate": 1.63498965540751e-06, + "loss": 0.76491398, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12005615, + "step": 9508, + "time_per_iteration": 2.5643258094787598 + }, + { + "auxiliary_loss_clip": 0.06422228, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.012529, + "epoch": 0.5717120096197205, + "flos": 17824879681920.0, + "grad_norm": 2.05386002816889, + "language_loss": 0.80054557, + "learning_rate": 1.634606741699593e-06, + "loss": 0.87741685, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11987305, + "step": 9509, + "time_per_iteration": 3.8947436809539795 + }, + { + "auxiliary_loss_clip": 0.06415324, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5717721328723884, + "flos": 21871551490560.0, + "grad_norm": 1.798702817725972, + "language_loss": 0.72265553, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.79946876, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9510, + "time_per_iteration": 2.496246099472046 + }, + { + "auxiliary_loss_clip": 0.06419715, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.0627699, + "balance_loss_mlp": 0.01255946, + "epoch": 0.5718322561250564, + "flos": 28444924442880.0, + "grad_norm": 1.3126461366590796, + "language_loss": 0.69652188, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.77338743, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10906982, + "step": 9511, + "time_per_iteration": 2.5713541507720947 + }, + { + "auxiliary_loss_clip": 0.06420782, + "auxiliary_loss_mlp": 0.01268426, + "balance_loss_clip": 0.06277648, + "balance_loss_mlp": 0.01257136, + "epoch": 0.5718923793777243, + "flos": 13556702805120.0, + "grad_norm": 2.0681515910732715, + "language_loss": 0.61827439, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.69516647, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.112854, + "step": 9512, + "time_per_iteration": 2.49580454826355 + }, + { + "auxiliary_loss_clip": 0.06421502, + "auxiliary_loss_mlp": 0.01268423, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01257498, + "epoch": 0.5719525026303923, + "flos": 17827856501760.0, + "grad_norm": 2.3676523534955685, + "language_loss": 0.76396298, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.84086221, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10919189, + "step": 9513, + "time_per_iteration": 2.500870704650879 + }, + { + "auxiliary_loss_clip": 0.06326592, + "auxiliary_loss_mlp": 0.01253708, + "balance_loss_clip": 0.06266873, + "balance_loss_mlp": 0.01252076, + "epoch": 0.5720126258830602, + "flos": 61314724097280.0, + "grad_norm": 0.891161207726192, + "language_loss": 0.66879886, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.74460191, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01634216, + "step": 9514, + "time_per_iteration": 3.1455137729644775 + }, + { + "auxiliary_loss_clip": 0.06430741, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01258941, + "epoch": 0.5720727491357283, + "flos": 23994878279040.0, + "grad_norm": 2.149685980416527, + "language_loss": 0.81938076, + "learning_rate": 1.63230955093099e-06, + "loss": 0.89639759, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12005615, + "step": 9515, + "time_per_iteration": 2.5996580123901367 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01267881, + "balance_loss_clip": 0.06274894, + "balance_loss_mlp": 0.01257259, + "epoch": 0.5721328723883962, + "flos": 23412359894400.0, + "grad_norm": 1.6126279146943563, + "language_loss": 0.86095083, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.93775928, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10620117, + "step": 9516, + "time_per_iteration": 2.5553810596466064 + }, + { + "auxiliary_loss_clip": 0.06417726, + "auxiliary_loss_mlp": 0.01271814, + "balance_loss_clip": 0.06275768, + "balance_loss_mlp": 0.01260572, + "epoch": 0.5721929956410642, + "flos": 18810520859520.0, + "grad_norm": 2.197571780359881, + "language_loss": 0.87770617, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.95460165, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11242676, + "step": 9517, + "time_per_iteration": 2.5858652591705322 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5722531188937322, + "flos": 27203676785280.0, + "grad_norm": 1.5341934137919409, + "language_loss": 0.85065883, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.92748272, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11273193, + "step": 9518, + "time_per_iteration": 2.5850136280059814 + }, + { + "auxiliary_loss_clip": 0.06417416, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01256044, + "epoch": 0.5723132421464001, + "flos": 15201157109760.0, + "grad_norm": 1.5672659775495308, + "language_loss": 0.78797317, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.86481655, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 9519, + "time_per_iteration": 2.5459818840026855 + }, + { + "auxiliary_loss_clip": 0.06418845, + "auxiliary_loss_mlp": 0.01271535, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01260675, + "epoch": 0.5723733653990681, + "flos": 27606757651200.0, + "grad_norm": 1.4075514987328583, + "language_loss": 0.83134615, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.90824991, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10864258, + "step": 9520, + "time_per_iteration": 2.66892671585083 + }, + { + "auxiliary_loss_clip": 0.06426139, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06279796, + "balance_loss_mlp": 0.01253022, + "epoch": 0.572433488651736, + "flos": 18228673307520.0, + "grad_norm": 1.9996427544433133, + "language_loss": 0.73064411, + "learning_rate": 1.630012862105243e-06, + "loss": 0.80754966, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11383057, + "step": 9521, + "time_per_iteration": 2.5980701446533203 + }, + { + "auxiliary_loss_clip": 0.06419297, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06276461, + "balance_loss_mlp": 0.01259073, + "epoch": 0.5724936119044041, + "flos": 31257224628480.0, + "grad_norm": 1.5867052207792396, + "language_loss": 0.77991247, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.85680634, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11022949, + "step": 9522, + "time_per_iteration": 2.5890755653381348 + }, + { + "auxiliary_loss_clip": 0.06416851, + "auxiliary_loss_mlp": 0.01267889, + "balance_loss_clip": 0.06278282, + "balance_loss_mlp": 0.01257649, + "epoch": 0.572553735157072, + "flos": 19207186888320.0, + "grad_norm": 1.441878230551161, + "language_loss": 0.72110128, + "learning_rate": 1.629247411248102e-06, + "loss": 0.79794878, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10235596, + "step": 9523, + "time_per_iteration": 2.511115789413452 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01257025, + "epoch": 0.57261385840974, + "flos": 21221249552640.0, + "grad_norm": 1.7953059857975224, + "language_loss": 0.70372975, + "learning_rate": 1.628864706900738e-06, + "loss": 0.78058219, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10339355, + "step": 9524, + "time_per_iteration": 2.507387161254883 + }, + { + "auxiliary_loss_clip": 0.0641823, + "auxiliary_loss_mlp": 0.0127028, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01259188, + "epoch": 0.5726739816624079, + "flos": 33992936582400.0, + "grad_norm": 1.3727338087163001, + "language_loss": 0.6519655, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.7288506, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11096191, + "step": 9525, + "time_per_iteration": 2.6264822483062744 + }, + { + "auxiliary_loss_clip": 0.0641274, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06272839, + "balance_loss_mlp": 0.01255842, + "epoch": 0.5727341049150759, + "flos": 24282196329600.0, + "grad_norm": 1.6388418597669483, + "language_loss": 0.72797775, + "learning_rate": 1.628099340440984e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10351562, + "step": 9526, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.06418388, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01257897, + "epoch": 0.5727942281677438, + "flos": 28407762357120.0, + "grad_norm": 1.5546981496666945, + "language_loss": 0.80170763, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.87857693, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10650635, + "step": 9527, + "time_per_iteration": 2.6143245697021484 + }, + { + "auxiliary_loss_clip": 0.06413873, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06275712, + "balance_loss_mlp": 0.01258983, + "epoch": 0.5728543514204119, + "flos": 19542861544320.0, + "grad_norm": 2.5128112924339585, + "language_loss": 0.72641492, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.8032524, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10894775, + "step": 9528, + "time_per_iteration": 2.4896552562713623 + }, + { + "auxiliary_loss_clip": 0.06418886, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.0125577, + "epoch": 0.5729144746730798, + "flos": 21513137650560.0, + "grad_norm": 1.7938485336826149, + "language_loss": 0.85978115, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.93664181, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11407471, + "step": 9529, + "time_per_iteration": 2.539447784423828 + }, + { + "auxiliary_loss_clip": 0.063314, + "auxiliary_loss_mlp": 0.01256121, + "balance_loss_clip": 0.06271826, + "balance_loss_mlp": 0.0125448, + "epoch": 0.5729745979257478, + "flos": 58699638495360.0, + "grad_norm": 0.750499003321047, + "language_loss": 0.55969286, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.63556802, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01643372, + "step": 9530, + "time_per_iteration": 3.007678747177124 + }, + { + "auxiliary_loss_clip": 0.06425051, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01254276, + "epoch": 0.5730347211784158, + "flos": 18558100834560.0, + "grad_norm": 1.9102815745402744, + "language_loss": 0.66843903, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.74534607, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1137085, + "step": 9531, + "time_per_iteration": 3.9059529304504395 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06274498, + "balance_loss_mlp": 0.01257966, + "epoch": 0.5730948444310837, + "flos": 38040069588480.0, + "grad_norm": 1.9862057863273674, + "language_loss": 0.75881588, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.83567762, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11169434, + "step": 9532, + "time_per_iteration": 2.640389919281006 + }, + { + "auxiliary_loss_clip": 0.06421025, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06278558, + "balance_loss_mlp": 0.01255794, + "epoch": 0.5731549676837517, + "flos": 25233861876480.0, + "grad_norm": 1.2592580925122039, + "language_loss": 0.79252976, + "learning_rate": 1.625421002822686e-06, + "loss": 0.86941075, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11291504, + "step": 9533, + "time_per_iteration": 2.559293508529663 + }, + { + "auxiliary_loss_clip": 0.06417587, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278279, + "balance_loss_mlp": 0.01256067, + "epoch": 0.5732150909364196, + "flos": 23375030100480.0, + "grad_norm": 3.634749275276224, + "language_loss": 0.8597486, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.93658984, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10467529, + "step": 9534, + "time_per_iteration": 2.539487838745117 + }, + { + "auxiliary_loss_clip": 0.06421855, + "auxiliary_loss_mlp": 0.01269069, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01257625, + "epoch": 0.5732752141890877, + "flos": 23086621946880.0, + "grad_norm": 1.944302626791885, + "language_loss": 0.75668436, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.83359355, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9535, + "time_per_iteration": 2.5488839149475098 + }, + { + "auxiliary_loss_clip": 0.06425361, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256288, + "epoch": 0.5733353374417556, + "flos": 24359078050560.0, + "grad_norm": 1.5155376410848522, + "language_loss": 0.71395552, + "learning_rate": 1.624273356614346e-06, + "loss": 0.79089081, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 9536, + "time_per_iteration": 2.553239345550537 + }, + { + "auxiliary_loss_clip": 0.06416988, + "auxiliary_loss_mlp": 0.01269432, + "balance_loss_clip": 0.06275923, + "balance_loss_mlp": 0.01258244, + "epoch": 0.5733954606944236, + "flos": 27206234334720.0, + "grad_norm": 1.742372783929404, + "language_loss": 0.70031548, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.77717972, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11187744, + "step": 9537, + "time_per_iteration": 2.5490598678588867 + }, + { + "auxiliary_loss_clip": 0.06419763, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01257317, + "epoch": 0.5734555839470915, + "flos": 28772339472000.0, + "grad_norm": 2.334146865026381, + "language_loss": 0.63052773, + "learning_rate": 1.623508330355902e-06, + "loss": 0.70740581, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10723877, + "step": 9538, + "time_per_iteration": 4.013959169387817 + }, + { + "auxiliary_loss_clip": 0.0641904, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06277157, + "balance_loss_mlp": 0.0126136, + "epoch": 0.5735157071997595, + "flos": 22973542462080.0, + "grad_norm": 1.806157803076428, + "language_loss": 0.82720077, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.90412778, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.12310791, + "step": 9539, + "time_per_iteration": 2.554189682006836 + }, + { + "auxiliary_loss_clip": 0.06422378, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06276339, + "balance_loss_mlp": 0.01253115, + "epoch": 0.5735758304524274, + "flos": 18995450820480.0, + "grad_norm": 2.0055639259958107, + "language_loss": 0.73150325, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.80837095, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11279297, + "step": 9540, + "time_per_iteration": 2.500077486038208 + }, + { + "auxiliary_loss_clip": 0.0641907, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01255039, + "epoch": 0.5736359537050955, + "flos": 28404701683200.0, + "grad_norm": 2.024476848130698, + "language_loss": 0.80249465, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.87934107, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10528564, + "step": 9541, + "time_per_iteration": 4.051165342330933 + }, + { + "auxiliary_loss_clip": 0.06425047, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01253998, + "epoch": 0.5736960769577634, + "flos": 15631714915200.0, + "grad_norm": 2.008860171144918, + "language_loss": 0.64482939, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.72173679, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11700439, + "step": 9542, + "time_per_iteration": 2.5055642127990723 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01254691, + "epoch": 0.5737562002104314, + "flos": 18009767715840.0, + "grad_norm": 2.2598183554381146, + "language_loss": 0.83200055, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.90883142, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10638428, + "step": 9543, + "time_per_iteration": 2.4916088581085205 + }, + { + "auxiliary_loss_clip": 0.06426359, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5738163234630994, + "flos": 20703454047360.0, + "grad_norm": 1.617850922862876, + "language_loss": 0.74024302, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.81716919, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.1184082, + "step": 9544, + "time_per_iteration": 2.536583662033081 + }, + { + "auxiliary_loss_clip": 0.06424204, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278355, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5738764467157673, + "flos": 23156082581760.0, + "grad_norm": 3.1974440280178595, + "language_loss": 0.76412272, + "learning_rate": 1.620831188925733e-06, + "loss": 0.84104949, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11676025, + "step": 9545, + "time_per_iteration": 2.5427141189575195 + }, + { + "auxiliary_loss_clip": 0.06423136, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5739365699684353, + "flos": 29499942401280.0, + "grad_norm": 2.3578945444753447, + "language_loss": 0.56573224, + "learning_rate": 1.620448797546459e-06, + "loss": 0.64263856, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11334229, + "step": 9546, + "time_per_iteration": 2.608128309249878 + }, + { + "auxiliary_loss_clip": 0.06422536, + "auxiliary_loss_mlp": 0.01268737, + "balance_loss_clip": 0.0627693, + "balance_loss_mlp": 0.01257746, + "epoch": 0.5739966932211032, + "flos": 14032388833920.0, + "grad_norm": 2.2022917684402996, + "language_loss": 0.76728261, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.84419537, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10980225, + "step": 9547, + "time_per_iteration": 2.5017452239990234 + }, + { + "auxiliary_loss_clip": 0.06421655, + "auxiliary_loss_mlp": 0.01268546, + "balance_loss_clip": 0.06277436, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5740568164737713, + "flos": 19067972129280.0, + "grad_norm": 1.9505887412268983, + "language_loss": 0.7442795, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.82118154, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11444092, + "step": 9548, + "time_per_iteration": 2.549558639526367 + }, + { + "auxiliary_loss_clip": 0.06418206, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01255064, + "epoch": 0.5741169397264392, + "flos": 22134453275520.0, + "grad_norm": 2.3791642109865228, + "language_loss": 0.69704068, + "learning_rate": 1.619301709822355e-06, + "loss": 0.77388746, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11419678, + "step": 9549, + "time_per_iteration": 3.933781147003174 + }, + { + "auxiliary_loss_clip": 0.06420065, + "auxiliary_loss_mlp": 0.01265483, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.01254611, + "epoch": 0.5741770629791072, + "flos": 24943860495360.0, + "grad_norm": 1.461228472430463, + "language_loss": 0.79521686, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.87207234, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10870361, + "step": 9550, + "time_per_iteration": 2.577768087387085 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01254832, + "epoch": 0.5742371862317751, + "flos": 18806495863680.0, + "grad_norm": 2.119345289493334, + "language_loss": 0.68877375, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.76562458, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10845947, + "step": 9551, + "time_per_iteration": 2.480468273162842 + }, + { + "auxiliary_loss_clip": 0.06424205, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06276421, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5742973094844431, + "flos": 24467293998720.0, + "grad_norm": 1.5487820488887025, + "language_loss": 0.72033125, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.79724622, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11505127, + "step": 9552, + "time_per_iteration": 2.5759360790252686 + }, + { + "auxiliary_loss_clip": 0.06417461, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.0125469, + "epoch": 0.574357432737111, + "flos": 21659186517120.0, + "grad_norm": 3.0495771997900163, + "language_loss": 0.79982221, + "learning_rate": 1.617772461696843e-06, + "loss": 0.87665033, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10662842, + "step": 9553, + "time_per_iteration": 2.49290132522583 + }, + { + "auxiliary_loss_clip": 0.06423397, + "auxiliary_loss_mlp": 0.01264041, + "balance_loss_clip": 0.06275378, + "balance_loss_mlp": 0.0125333, + "epoch": 0.5744175559897791, + "flos": 16550285299200.0, + "grad_norm": 2.1324379432349425, + "language_loss": 0.83817756, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.91505194, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1071167, + "step": 9554, + "time_per_iteration": 2.5118370056152344 + }, + { + "auxiliary_loss_clip": 0.06422277, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.0125575, + "epoch": 0.574477679242447, + "flos": 24214580484480.0, + "grad_norm": 1.3861221814355518, + "language_loss": 0.71406233, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.79095531, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11254883, + "step": 9555, + "time_per_iteration": 2.5466480255126953 + }, + { + "auxiliary_loss_clip": 0.06421511, + "auxiliary_loss_mlp": 0.01268077, + "balance_loss_clip": 0.06277835, + "balance_loss_mlp": 0.01256478, + "epoch": 0.574537802495115, + "flos": 14908304689920.0, + "grad_norm": 2.185347344801511, + "language_loss": 0.73004574, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.80694163, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1159668, + "step": 9556, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.0641879, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01253406, + "epoch": 0.5745979257477829, + "flos": 24941680289280.0, + "grad_norm": 1.5306662340422301, + "language_loss": 0.74479866, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.82163835, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11761475, + "step": 9557, + "time_per_iteration": 2.576296329498291 + }, + { + "auxiliary_loss_clip": 0.06420197, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06277291, + "balance_loss_mlp": 0.01252572, + "epoch": 0.5746580490004509, + "flos": 17241061559040.0, + "grad_norm": 1.5775139248237169, + "language_loss": 0.68007201, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.75691128, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11157227, + "step": 9558, + "time_per_iteration": 2.531812906265259 + }, + { + "auxiliary_loss_clip": 0.06424935, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01253779, + "epoch": 0.5747181722531189, + "flos": 13192838449920.0, + "grad_norm": 2.425506842460266, + "language_loss": 0.71628273, + "learning_rate": 1.615479024621659e-06, + "loss": 0.79320455, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13482666, + "step": 9559, + "time_per_iteration": 2.473419189453125 + }, + { + "auxiliary_loss_clip": 0.06419484, + "auxiliary_loss_mlp": 0.01266983, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256921, + "epoch": 0.5747782955057869, + "flos": 22969098195840.0, + "grad_norm": 1.5670628486073652, + "language_loss": 0.79416776, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.87103242, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10064697, + "step": 9560, + "time_per_iteration": 2.532862663269043 + }, + { + "auxiliary_loss_clip": 0.06421925, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06276737, + "balance_loss_mlp": 0.01255581, + "epoch": 0.5748384187584549, + "flos": 23409802344960.0, + "grad_norm": 1.793006683486937, + "language_loss": 0.64777875, + "learning_rate": 1.614714662090588e-06, + "loss": 0.72466803, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11431885, + "step": 9561, + "time_per_iteration": 2.5111758708953857 + }, + { + "auxiliary_loss_clip": 0.06426983, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01256369, + "epoch": 0.5748985420111228, + "flos": 17791323321600.0, + "grad_norm": 1.4966227163397983, + "language_loss": 0.7114228, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.78837311, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11682129, + "step": 9562, + "time_per_iteration": 2.5162081718444824 + }, + { + "auxiliary_loss_clip": 0.06425486, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06280454, + "balance_loss_mlp": 0.01256081, + "epoch": 0.5749586652637908, + "flos": 19872582560640.0, + "grad_norm": 1.4328664867345224, + "language_loss": 0.84269559, + "learning_rate": 1.613950357999751e-06, + "loss": 0.91962022, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10888672, + "step": 9563, + "time_per_iteration": 2.5183188915252686 + }, + { + "auxiliary_loss_clip": 0.06421089, + "auxiliary_loss_mlp": 0.01268857, + "balance_loss_clip": 0.06273992, + "balance_loss_mlp": 0.01256733, + "epoch": 0.5750187885164587, + "flos": 21293477372160.0, + "grad_norm": 2.089685167133714, + "language_loss": 0.57297182, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.64987123, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.12127686, + "step": 9564, + "time_per_iteration": 2.5219571590423584 + }, + { + "auxiliary_loss_clip": 0.06414357, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06276927, + "balance_loss_mlp": 0.0125389, + "epoch": 0.5750789117691267, + "flos": 18810227370240.0, + "grad_norm": 1.5824685354584669, + "language_loss": 0.76484299, + "learning_rate": 1.613186112465078e-06, + "loss": 0.84163225, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10681152, + "step": 9565, + "time_per_iteration": 2.4752280712127686 + }, + { + "auxiliary_loss_clip": 0.06321105, + "auxiliary_loss_mlp": 0.01250694, + "balance_loss_clip": 0.06260607, + "balance_loss_mlp": 0.01249219, + "epoch": 0.5751390350217946, + "flos": 70685624188800.0, + "grad_norm": 0.721103953507815, + "language_loss": 0.6068033, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.68252128, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01473999, + "step": 9566, + "time_per_iteration": 3.222144603729248 + }, + { + "auxiliary_loss_clip": 0.06420306, + "auxiliary_loss_mlp": 0.01268432, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257673, + "epoch": 0.5751991582744627, + "flos": 14251545987840.0, + "grad_norm": 2.0959328312792467, + "language_loss": 0.75654471, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.83343208, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10760498, + "step": 9567, + "time_per_iteration": 2.4892570972442627 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5752592815271306, + "flos": 18333283530240.0, + "grad_norm": 1.4488652909067903, + "language_loss": 0.75253701, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.82938665, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11322021, + "step": 9568, + "time_per_iteration": 2.473475217819214 + }, + { + "auxiliary_loss_clip": 0.06419896, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.01254349, + "epoch": 0.5753194047797986, + "flos": 20928984111360.0, + "grad_norm": 1.5107907301615, + "language_loss": 0.71293747, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.78978956, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10968018, + "step": 9569, + "time_per_iteration": 2.6541481018066406 + }, + { + "auxiliary_loss_clip": 0.06420765, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06275727, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5753795280324665, + "flos": 19287925896960.0, + "grad_norm": 2.027519323892087, + "language_loss": 0.56120193, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.63808417, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11694336, + "step": 9570, + "time_per_iteration": 2.5568745136260986 + }, + { + "auxiliary_loss_clip": 0.0641574, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06274444, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5754396512851345, + "flos": 21659312298240.0, + "grad_norm": 3.8103947749492355, + "language_loss": 0.64502007, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.72182131, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10675049, + "step": 9571, + "time_per_iteration": 3.9861292839050293 + }, + { + "auxiliary_loss_clip": 0.06417111, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5754997745378025, + "flos": 51032674707840.0, + "grad_norm": 1.44401056534108, + "language_loss": 0.67167187, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.74852264, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10821533, + "step": 9572, + "time_per_iteration": 2.775322198867798 + }, + { + "auxiliary_loss_clip": 0.06417632, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01252378, + "epoch": 0.5755598977904705, + "flos": 22863523651200.0, + "grad_norm": 1.9643261986613603, + "language_loss": 0.72534865, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.80216646, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11767578, + "step": 9573, + "time_per_iteration": 2.504248857498169 + }, + { + "auxiliary_loss_clip": 0.06413124, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.0125495, + "epoch": 0.5756200210431385, + "flos": 38482073475840.0, + "grad_norm": 1.6390607800794645, + "language_loss": 0.76527274, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.84205294, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09954834, + "step": 9574, + "time_per_iteration": 2.675445079803467 + }, + { + "auxiliary_loss_clip": 0.06426176, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01252865, + "epoch": 0.5756801442958064, + "flos": 23915984060160.0, + "grad_norm": 3.486560074307127, + "language_loss": 0.67186499, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.74877405, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11877441, + "step": 9575, + "time_per_iteration": 2.5086028575897217 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01253899, + "epoch": 0.5757402675484744, + "flos": 21111566158080.0, + "grad_norm": 1.4184952738773886, + "language_loss": 0.80574554, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.88252765, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1005249, + "step": 9576, + "time_per_iteration": 2.502372980117798 + }, + { + "auxiliary_loss_clip": 0.06413178, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01256439, + "epoch": 0.5758003908011423, + "flos": 20565497099520.0, + "grad_norm": 1.5791511975506907, + "language_loss": 0.69807208, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.77487338, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10510254, + "step": 9577, + "time_per_iteration": 4.000526428222656 + }, + { + "auxiliary_loss_clip": 0.06420817, + "auxiliary_loss_mlp": 0.0126492, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.012544, + "epoch": 0.5758605140538103, + "flos": 16478770239360.0, + "grad_norm": 1.7483336770936004, + "language_loss": 0.66710907, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.74396646, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10510254, + "step": 9578, + "time_per_iteration": 2.495589256286621 + }, + { + "auxiliary_loss_clip": 0.06417773, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276586, + "balance_loss_mlp": 0.01254274, + "epoch": 0.5759206373064782, + "flos": 21293854715520.0, + "grad_norm": 1.4632151435184575, + "language_loss": 0.72808439, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.80490887, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10388184, + "step": 9579, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.06426738, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06278113, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5759807605591463, + "flos": 26075089342080.0, + "grad_norm": 2.9637416190029597, + "language_loss": 0.64800644, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.72493923, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.12072754, + "step": 9580, + "time_per_iteration": 2.532273292541504 + }, + { + "auxiliary_loss_clip": 0.06420532, + "auxiliary_loss_mlp": 0.01266688, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01255554, + "epoch": 0.5760408838118142, + "flos": 18877885142400.0, + "grad_norm": 1.6521602857434026, + "language_loss": 0.85497582, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.93184799, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11132812, + "step": 9581, + "time_per_iteration": 3.9159321784973145 + }, + { + "auxiliary_loss_clip": 0.06425697, + "auxiliary_loss_mlp": 0.01268939, + "balance_loss_clip": 0.06276281, + "balance_loss_mlp": 0.01257483, + "epoch": 0.5761010070644822, + "flos": 15383655302400.0, + "grad_norm": 2.053627577895993, + "language_loss": 0.67847329, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.75541961, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11450195, + "step": 9582, + "time_per_iteration": 2.468289613723755 + }, + { + "auxiliary_loss_clip": 0.06323063, + "auxiliary_loss_mlp": 0.0125238, + "balance_loss_clip": 0.06262786, + "balance_loss_mlp": 0.01250932, + "epoch": 0.5761611303171501, + "flos": 71495475500160.0, + "grad_norm": 0.6295597289579254, + "language_loss": 0.5722791, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.64803356, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.60253906, + "router_z_loss_mlp": 0.0144577, + "step": 9583, + "time_per_iteration": 3.280832052230835 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01253437, + "epoch": 0.5762212535698181, + "flos": 16250556844800.0, + "grad_norm": 1.895482028357212, + "language_loss": 0.82933408, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.90613544, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10693359, + "step": 9584, + "time_per_iteration": 2.473771333694458 + }, + { + "auxiliary_loss_clip": 0.06325932, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06265227, + "balance_loss_mlp": 0.01250696, + "epoch": 0.5762813768224861, + "flos": 70207254829440.0, + "grad_norm": 0.6148723792494001, + "language_loss": 0.49547607, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.57125711, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.0147171, + "step": 9585, + "time_per_iteration": 3.220283031463623 + }, + { + "auxiliary_loss_clip": 0.06417918, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.01256446, + "epoch": 0.5763415000751541, + "flos": 20523639185280.0, + "grad_norm": 1.396891707955096, + "language_loss": 0.84832788, + "learning_rate": 1.605165098835465e-06, + "loss": 0.92518032, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10876465, + "step": 9586, + "time_per_iteration": 2.5044658184051514 + }, + { + "auxiliary_loss_clip": 0.0641425, + "auxiliary_loss_mlp": 0.01268611, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01257584, + "epoch": 0.5764016233278221, + "flos": 15821047215360.0, + "grad_norm": 1.5476594832750246, + "language_loss": 0.80150878, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.87833744, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11035156, + "step": 9587, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06421454, + "auxiliary_loss_mlp": 0.01267229, + "balance_loss_clip": 0.06277972, + "balance_loss_mlp": 0.01256184, + "epoch": 0.57646174658049, + "flos": 20777778218880.0, + "grad_norm": 1.3785070074858572, + "language_loss": 0.6626485, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.73953533, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11047363, + "step": 9588, + "time_per_iteration": 3.990769863128662 + }, + { + "auxiliary_loss_clip": 0.06420319, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01256491, + "epoch": 0.576521869833158, + "flos": 23556647825280.0, + "grad_norm": 1.8252792275452514, + "language_loss": 0.79050291, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.86739457, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1237793, + "step": 9589, + "time_per_iteration": 2.5151610374450684 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06275681, + "balance_loss_mlp": 0.01254652, + "epoch": 0.5765819930858259, + "flos": 20272812387840.0, + "grad_norm": 1.9044444718181142, + "language_loss": 0.79799986, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.87479138, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10229492, + "step": 9590, + "time_per_iteration": 2.502588987350464 + }, + { + "auxiliary_loss_clip": 0.06424554, + "auxiliary_loss_mlp": 0.01266306, + "balance_loss_clip": 0.06279668, + "balance_loss_mlp": 0.01256096, + "epoch": 0.5766421163384939, + "flos": 23155453676160.0, + "grad_norm": 1.9323149052957644, + "language_loss": 0.63195986, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.7088685, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10217285, + "step": 9591, + "time_per_iteration": 2.5217199325561523 + }, + { + "auxiliary_loss_clip": 0.0641837, + "auxiliary_loss_mlp": 0.0126852, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5767022395911618, + "flos": 25856057969280.0, + "grad_norm": 1.7751118346977903, + "language_loss": 0.78161305, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.85848188, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10473633, + "step": 9592, + "time_per_iteration": 2.586398124694824 + }, + { + "auxiliary_loss_clip": 0.06325077, + "auxiliary_loss_mlp": 0.0125376, + "balance_loss_clip": 0.06264462, + "balance_loss_mlp": 0.01252203, + "epoch": 0.5767623628438299, + "flos": 68315579452800.0, + "grad_norm": 0.723864489522512, + "language_loss": 0.59626555, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.67205393, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.60888672, + "router_z_loss_mlp": 0.01555634, + "step": 9593, + "time_per_iteration": 3.245339870452881 + }, + { + "auxiliary_loss_clip": 0.06419121, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272256, + "balance_loss_mlp": 0.01254432, + "epoch": 0.5768224860964978, + "flos": 30195959541120.0, + "grad_norm": 1.4712512924104606, + "language_loss": 0.70970887, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.78656393, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11956787, + "step": 9594, + "time_per_iteration": 2.575716018676758 + }, + { + "auxiliary_loss_clip": 0.06417293, + "auxiliary_loss_mlp": 0.01269346, + "balance_loss_clip": 0.0627408, + "balance_loss_mlp": 0.01259237, + "epoch": 0.5768826093491658, + "flos": 17900880935040.0, + "grad_norm": 1.6705807126416699, + "language_loss": 0.71305418, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.78992057, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10101318, + "step": 9595, + "time_per_iteration": 2.492614269256592 + }, + { + "auxiliary_loss_clip": 0.06416321, + "auxiliary_loss_mlp": 0.01269009, + "balance_loss_clip": 0.06273369, + "balance_loss_mlp": 0.01257481, + "epoch": 0.5769427326018337, + "flos": 17462943970560.0, + "grad_norm": 1.9433978950195214, + "language_loss": 0.69787997, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.77473325, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11529541, + "step": 9596, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.06425576, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06275462, + "balance_loss_mlp": 0.01254558, + "epoch": 0.5770028558545017, + "flos": 39431181473280.0, + "grad_norm": 1.7020557646527, + "language_loss": 0.67913234, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.75606167, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12792969, + "step": 9597, + "time_per_iteration": 2.6754841804504395 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273974, + "balance_loss_mlp": 0.01255322, + "epoch": 0.5770629791071697, + "flos": 21541620839040.0, + "grad_norm": 1.8412029810529236, + "language_loss": 0.82291842, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.89974791, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.105896, + "step": 9598, + "time_per_iteration": 2.510817527770996 + }, + { + "auxiliary_loss_clip": 0.06420396, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01256511, + "epoch": 0.5771231023598377, + "flos": 20893121763840.0, + "grad_norm": 1.43847663479929, + "language_loss": 0.73386133, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.81074691, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11639404, + "step": 9599, + "time_per_iteration": 2.492751121520996 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.01254772, + "epoch": 0.5771832256125057, + "flos": 18083043711360.0, + "grad_norm": 1.7867114623476337, + "language_loss": 0.78284144, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.85961294, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10369873, + "step": 9600, + "time_per_iteration": 2.4890565872192383 + }, + { + "auxiliary_loss_clip": 0.06422748, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.01256893, + "epoch": 0.5772433488651736, + "flos": 26366222753280.0, + "grad_norm": 1.8856132517408855, + "language_loss": 0.72472572, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.80163646, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11450195, + "step": 9601, + "time_per_iteration": 2.536994218826294 + }, + { + "auxiliary_loss_clip": 0.06415705, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01253506, + "epoch": 0.5773034721178416, + "flos": 19686814058880.0, + "grad_norm": 1.49916876372247, + "language_loss": 0.68989396, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7666986, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11254883, + "step": 9602, + "time_per_iteration": 2.4855434894561768 + }, + { + "auxiliary_loss_clip": 0.06409699, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06272521, + "balance_loss_mlp": 0.01257287, + "epoch": 0.5773635953705095, + "flos": 25089951288960.0, + "grad_norm": 1.4178586949074146, + "language_loss": 0.73199558, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.80876672, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10125732, + "step": 9603, + "time_per_iteration": 2.5496528148651123 + }, + { + "auxiliary_loss_clip": 0.06418322, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06276152, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5774237186231775, + "flos": 21039380265600.0, + "grad_norm": 1.5159674911644692, + "language_loss": 0.76686621, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.84372133, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11035156, + "step": 9604, + "time_per_iteration": 2.522033452987671 + }, + { + "auxiliary_loss_clip": 0.06420808, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06277063, + "balance_loss_mlp": 0.01259373, + "epoch": 0.5774838418758454, + "flos": 15237145238400.0, + "grad_norm": 2.0065352138527808, + "language_loss": 0.83384192, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.91076463, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12097168, + "step": 9605, + "time_per_iteration": 2.4643824100494385 + }, + { + "auxiliary_loss_clip": 0.0643101, + "auxiliary_loss_mlp": 0.01267132, + "balance_loss_clip": 0.06278086, + "balance_loss_mlp": 0.01254913, + "epoch": 0.5775439651285135, + "flos": 23588694812160.0, + "grad_norm": 1.6400067603153077, + "language_loss": 0.78330255, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.86028397, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 9606, + "time_per_iteration": 2.5217928886413574 + }, + { + "auxiliary_loss_clip": 0.06417712, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01252687, + "epoch": 0.5776040883811814, + "flos": 18046300896000.0, + "grad_norm": 1.7192315062710783, + "language_loss": 0.73891246, + "learning_rate": 1.597150687927619e-06, + "loss": 0.81573272, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11633301, + "step": 9607, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.01268528, + "balance_loss_clip": 0.06277244, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5776642116338494, + "flos": 18630580216320.0, + "grad_norm": 1.602339688767026, + "language_loss": 0.69749868, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.121521, + "step": 9608, + "time_per_iteration": 2.5238630771636963 + }, + { + "auxiliary_loss_clip": 0.06419271, + "auxiliary_loss_mlp": 0.01267568, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255814, + "epoch": 0.5777243348865173, + "flos": 28410068344320.0, + "grad_norm": 1.9615645043462706, + "language_loss": 0.76945466, + "learning_rate": 1.596387759940665e-06, + "loss": 0.84632301, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11749268, + "step": 9609, + "time_per_iteration": 2.549933671951294 + }, + { + "auxiliary_loss_clip": 0.0642001, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01255084, + "epoch": 0.5777844581391853, + "flos": 24031579167360.0, + "grad_norm": 1.544459178362984, + "language_loss": 0.77057648, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.84744948, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12200928, + "step": 9610, + "time_per_iteration": 2.5409657955169678 + }, + { + "auxiliary_loss_clip": 0.06419136, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01261104, + "epoch": 0.5778445813918534, + "flos": 17781805883520.0, + "grad_norm": 2.0334076468596463, + "language_loss": 0.69377804, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.77070266, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.12231445, + "step": 9611, + "time_per_iteration": 3.8771145343780518 + }, + { + "auxiliary_loss_clip": 0.06415454, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01254795, + "epoch": 0.5779047046445213, + "flos": 22239147352320.0, + "grad_norm": 1.7756554406320284, + "language_loss": 0.84048247, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.91729373, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10876465, + "step": 9612, + "time_per_iteration": 2.4897758960723877 + }, + { + "auxiliary_loss_clip": 0.06417899, + "auxiliary_loss_mlp": 0.01267936, + "balance_loss_clip": 0.06275887, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5779648278971893, + "flos": 21440825976960.0, + "grad_norm": 1.4853190478070708, + "language_loss": 0.80038643, + "learning_rate": 1.594862087742667e-06, + "loss": 0.87724483, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10894775, + "step": 9613, + "time_per_iteration": 2.512202501296997 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01265916, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01254996, + "epoch": 0.5780249511498572, + "flos": 19032151708800.0, + "grad_norm": 1.6718641196950235, + "language_loss": 0.7774657, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.85430139, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10925293, + "step": 9614, + "time_per_iteration": 2.4882118701934814 + }, + { + "auxiliary_loss_clip": 0.06421545, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01254543, + "epoch": 0.5780850744025252, + "flos": 12128596542720.0, + "grad_norm": 2.0494146854902175, + "language_loss": 0.82224047, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.89911503, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1137085, + "step": 9615, + "time_per_iteration": 2.472621440887451 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06274787, + "balance_loss_mlp": 0.01255552, + "epoch": 0.5781451976551931, + "flos": 25051154048640.0, + "grad_norm": 1.4669220513135932, + "language_loss": 0.67472255, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.75161308, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12097168, + "step": 9616, + "time_per_iteration": 2.534846782684326 + }, + { + "auxiliary_loss_clip": 0.06417294, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01257528, + "epoch": 0.5782053209078611, + "flos": 19251770060160.0, + "grad_norm": 1.8155832257801603, + "language_loss": 0.77963018, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.85649514, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11669922, + "step": 9617, + "time_per_iteration": 4.014554977416992 + }, + { + "auxiliary_loss_clip": 0.064207, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06277206, + "balance_loss_mlp": 0.012578, + "epoch": 0.578265444160529, + "flos": 26000849024640.0, + "grad_norm": 1.3678407791087424, + "language_loss": 0.75333905, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.83024538, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.12139893, + "step": 9618, + "time_per_iteration": 2.5390572547912598 + }, + { + "auxiliary_loss_clip": 0.06416163, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06273779, + "balance_loss_mlp": 0.01254355, + "epoch": 0.5783255674131971, + "flos": 21805025748480.0, + "grad_norm": 1.6109172194310035, + "language_loss": 0.81657064, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.89339048, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11468506, + "step": 9619, + "time_per_iteration": 2.505831718444824 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06275585, + "balance_loss_mlp": 0.01253972, + "epoch": 0.578385690665865, + "flos": 24796553817600.0, + "grad_norm": 1.540190718879446, + "language_loss": 0.72668874, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.80354631, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11248779, + "step": 9620, + "time_per_iteration": 3.9673268795013428 + }, + { + "auxiliary_loss_clip": 0.06423381, + "auxiliary_loss_mlp": 0.01270714, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01258942, + "epoch": 0.578445813918533, + "flos": 21218859711360.0, + "grad_norm": 1.6605075192862409, + "language_loss": 0.77349472, + "learning_rate": 1.591811481689916e-06, + "loss": 0.85043567, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 9621, + "time_per_iteration": 2.5077648162841797 + }, + { + "auxiliary_loss_clip": 0.06420489, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01252477, + "epoch": 0.5785059371712009, + "flos": 25053921233280.0, + "grad_norm": 1.4404835359445094, + "language_loss": 0.7094593, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.78630757, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11859131, + "step": 9622, + "time_per_iteration": 2.5468451976776123 + }, + { + "auxiliary_loss_clip": 0.06311069, + "auxiliary_loss_mlp": 0.01252444, + "balance_loss_clip": 0.06251176, + "balance_loss_mlp": 0.01250508, + "epoch": 0.5785660604238689, + "flos": 70865187488640.0, + "grad_norm": 0.7596176351080388, + "language_loss": 0.55852556, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.6341607, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01933289, + "step": 9623, + "time_per_iteration": 3.153353452682495 + }, + { + "auxiliary_loss_clip": 0.06425077, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06277281, + "balance_loss_mlp": 0.01255233, + "epoch": 0.578626183676537, + "flos": 31658083361280.0, + "grad_norm": 2.2034040135587936, + "language_loss": 0.71319884, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.79012132, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 9624, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06420659, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06275962, + "balance_loss_mlp": 0.01258222, + "epoch": 0.5786863069292049, + "flos": 21870545241600.0, + "grad_norm": 1.7015470008848133, + "language_loss": 0.82409322, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.90100557, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12353516, + "step": 9625, + "time_per_iteration": 2.5166807174682617 + }, + { + "auxiliary_loss_clip": 0.06417123, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01253647, + "epoch": 0.5787464301818729, + "flos": 23371214739840.0, + "grad_norm": 1.4015207824111633, + "language_loss": 0.70712119, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.78395265, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12384033, + "step": 9626, + "time_per_iteration": 2.5232555866241455 + }, + { + "auxiliary_loss_clip": 0.06417292, + "auxiliary_loss_mlp": 0.01266097, + "balance_loss_clip": 0.06275232, + "balance_loss_mlp": 0.01255278, + "epoch": 0.5788065534345408, + "flos": 30011155361280.0, + "grad_norm": 1.650883867076693, + "language_loss": 0.71934295, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.79617685, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10827637, + "step": 9627, + "time_per_iteration": 2.5862505435943604 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01268778, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257643, + "epoch": 0.5788666766872088, + "flos": 24533526251520.0, + "grad_norm": 1.6845581870111699, + "language_loss": 0.84154361, + "learning_rate": 1.589143013764458e-06, + "loss": 0.91842461, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9628, + "time_per_iteration": 4.011742830276489 + }, + { + "auxiliary_loss_clip": 0.06420035, + "auxiliary_loss_mlp": 0.01267996, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255443, + "epoch": 0.5789267999398767, + "flos": 23739649142400.0, + "grad_norm": 1.4211285900013286, + "language_loss": 0.72366357, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12548828, + "step": 9629, + "time_per_iteration": 2.535161018371582 + }, + { + "auxiliary_loss_clip": 0.06419079, + "auxiliary_loss_mlp": 0.01266785, + "balance_loss_clip": 0.06275524, + "balance_loss_mlp": 0.01254894, + "epoch": 0.5789869231925447, + "flos": 21140217054720.0, + "grad_norm": 1.8234862135922645, + "language_loss": 0.74396068, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.82081938, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11883545, + "step": 9630, + "time_per_iteration": 2.4906413555145264 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06270717, + "balance_loss_mlp": 0.0125344, + "epoch": 0.5790470464452127, + "flos": 21215086277760.0, + "grad_norm": 1.5521366007555986, + "language_loss": 0.78864127, + "learning_rate": 1.587999618060523e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11395264, + "step": 9631, + "time_per_iteration": 2.500326633453369 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01264538, + "balance_loss_clip": 0.06272215, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5791071696978807, + "flos": 23411144010240.0, + "grad_norm": 1.6622191818478913, + "language_loss": 0.7546376, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.83146071, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1138916, + "step": 9632, + "time_per_iteration": 2.5060648918151855 + }, + { + "auxiliary_loss_clip": 0.06419455, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06274837, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5791672929505486, + "flos": 24213322673280.0, + "grad_norm": 1.7292582736877316, + "language_loss": 0.79532528, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.8721962, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12011719, + "step": 9633, + "time_per_iteration": 2.516359567642212 + }, + { + "auxiliary_loss_clip": 0.0643272, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_clip": 0.06278707, + "balance_loss_mlp": 0.01256635, + "epoch": 0.5792274162032166, + "flos": 24355094981760.0, + "grad_norm": 1.6340208840931036, + "language_loss": 0.7790345, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.85605538, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1272583, + "step": 9634, + "time_per_iteration": 2.541090488433838 + }, + { + "auxiliary_loss_clip": 0.06422533, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01255406, + "epoch": 0.5792875394558845, + "flos": 20455729850880.0, + "grad_norm": 1.975369322400224, + "language_loss": 0.64063549, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.71754158, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12652588, + "step": 9635, + "time_per_iteration": 2.4916157722473145 + }, + { + "auxiliary_loss_clip": 0.06417014, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01253327, + "epoch": 0.5793476627085525, + "flos": 24067064171520.0, + "grad_norm": 1.4766518541506428, + "language_loss": 0.77494228, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.85176682, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12115479, + "step": 9636, + "time_per_iteration": 2.516622304916382 + }, + { + "auxiliary_loss_clip": 0.06411137, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01258226, + "epoch": 0.5794077859612206, + "flos": 22060799936640.0, + "grad_norm": 1.6556351940576073, + "language_loss": 0.68772542, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.76452249, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10333252, + "step": 9637, + "time_per_iteration": 2.509833812713623 + }, + { + "auxiliary_loss_clip": 0.06421766, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.01256784, + "epoch": 0.5794679092138885, + "flos": 11439245802240.0, + "grad_norm": 2.540580609640148, + "language_loss": 0.72712755, + "learning_rate": 1.585332242234043e-06, + "loss": 0.80403578, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12261963, + "step": 9638, + "time_per_iteration": 2.4528071880340576 + }, + { + "auxiliary_loss_clip": 0.06416277, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.0125521, + "epoch": 0.5795280324665565, + "flos": 18886228623360.0, + "grad_norm": 1.607875789180523, + "language_loss": 0.72792935, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.80475545, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11120605, + "step": 9639, + "time_per_iteration": 2.510347604751587 + }, + { + "auxiliary_loss_clip": 0.06418437, + "auxiliary_loss_mlp": 0.01269692, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.01258332, + "epoch": 0.5795881557192244, + "flos": 13010969162880.0, + "grad_norm": 1.751039086833101, + "language_loss": 0.69813907, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.7750203, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11364746, + "step": 9640, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.06430758, + "auxiliary_loss_mlp": 0.01271889, + "balance_loss_clip": 0.0627775, + "balance_loss_mlp": 0.01259509, + "epoch": 0.5796482789718924, + "flos": 19937598929280.0, + "grad_norm": 2.3188274360648298, + "language_loss": 0.78378308, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8608095, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12371826, + "step": 9641, + "time_per_iteration": 2.487333059310913 + }, + { + "auxiliary_loss_clip": 0.06416615, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.01256685, + "epoch": 0.5797084022245603, + "flos": 21656880529920.0, + "grad_norm": 2.422042135441505, + "language_loss": 0.74201375, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.81886506, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1182251, + "step": 9642, + "time_per_iteration": 2.4917688369750977 + }, + { + "auxiliary_loss_clip": 0.06419542, + "auxiliary_loss_mlp": 0.01264152, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01252582, + "epoch": 0.5797685254772283, + "flos": 26038807724160.0, + "grad_norm": 1.4983613319397562, + "language_loss": 0.73538697, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.81222391, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11566162, + "step": 9643, + "time_per_iteration": 2.5357465744018555 + }, + { + "auxiliary_loss_clip": 0.06417159, + "auxiliary_loss_mlp": 0.01264721, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01253175, + "epoch": 0.5798286487298963, + "flos": 22710808385280.0, + "grad_norm": 1.6774180539317567, + "language_loss": 0.67605746, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.75287628, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11547852, + "step": 9644, + "time_per_iteration": 2.485366106033325 + }, + { + "auxiliary_loss_clip": 0.06425455, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06276956, + "balance_loss_mlp": 0.01256078, + "epoch": 0.5798887719825643, + "flos": 23155705238400.0, + "grad_norm": 2.0120452642465865, + "language_loss": 0.85497642, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.93191713, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12542725, + "step": 9645, + "time_per_iteration": 2.505467414855957 + }, + { + "auxiliary_loss_clip": 0.06418729, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06275458, + "balance_loss_mlp": 0.01255774, + "epoch": 0.5799488952352322, + "flos": 24432982951680.0, + "grad_norm": 1.7616171208033915, + "language_loss": 0.75737381, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.83422971, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11102295, + "step": 9646, + "time_per_iteration": 2.527848958969116 + }, + { + "auxiliary_loss_clip": 0.06425247, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256461, + "epoch": 0.5800090184879002, + "flos": 38404478995200.0, + "grad_norm": 1.7871006843554935, + "language_loss": 0.59099573, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6679371, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12426758, + "step": 9647, + "time_per_iteration": 2.643890142440796 + }, + { + "auxiliary_loss_clip": 0.06425125, + "auxiliary_loss_mlp": 0.01271805, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01259276, + "epoch": 0.5800691417405681, + "flos": 19789747200000.0, + "grad_norm": 1.4917917867847632, + "language_loss": 0.84483784, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.92180717, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12524414, + "step": 9648, + "time_per_iteration": 2.48917818069458 + }, + { + "auxiliary_loss_clip": 0.06311809, + "auxiliary_loss_mlp": 0.01252996, + "balance_loss_clip": 0.06251512, + "balance_loss_mlp": 0.01251245, + "epoch": 0.5801292649932361, + "flos": 70333514133120.0, + "grad_norm": 0.8366168453621474, + "language_loss": 0.63013005, + "learning_rate": 1.581142210256242e-06, + "loss": 0.70577806, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01756287, + "step": 9649, + "time_per_iteration": 3.167630434036255 + }, + { + "auxiliary_loss_clip": 0.064106, + "auxiliary_loss_mlp": 0.01264864, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253903, + "epoch": 0.5801893882459042, + "flos": 18740892516480.0, + "grad_norm": 1.6385207780550837, + "language_loss": 0.82320833, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.89996296, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10968018, + "step": 9650, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.06424958, + "auxiliary_loss_mlp": 0.01267787, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.01256194, + "epoch": 0.5802495114985721, + "flos": 15601973915520.0, + "grad_norm": 2.051158244012986, + "language_loss": 0.77640611, + "learning_rate": 1.580380592177698e-06, + "loss": 0.85333359, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11584473, + "step": 9651, + "time_per_iteration": 3.9003303050994873 + }, + { + "auxiliary_loss_clip": 0.06421195, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01258627, + "epoch": 0.5803096347512401, + "flos": 18260552586240.0, + "grad_norm": 1.678926948492491, + "language_loss": 0.74017727, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.81709743, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12207031, + "step": 9652, + "time_per_iteration": 2.5226869583129883 + }, + { + "auxiliary_loss_clip": 0.0642662, + "auxiliary_loss_mlp": 0.01267654, + "balance_loss_clip": 0.06278314, + "balance_loss_mlp": 0.012559, + "epoch": 0.580369758003908, + "flos": 22899763342080.0, + "grad_norm": 1.9284827518212118, + "language_loss": 0.77118474, + "learning_rate": 1.579619037747193e-06, + "loss": 0.84812748, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11749268, + "step": 9653, + "time_per_iteration": 2.5736207962036133 + }, + { + "auxiliary_loss_clip": 0.06425463, + "auxiliary_loss_mlp": 0.01265074, + "balance_loss_clip": 0.06277624, + "balance_loss_mlp": 0.01252789, + "epoch": 0.580429881256576, + "flos": 18703646576640.0, + "grad_norm": 1.9366371532767657, + "language_loss": 0.75627828, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.83318365, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1229248, + "step": 9654, + "time_per_iteration": 2.667048931121826 + }, + { + "auxiliary_loss_clip": 0.06413651, + "auxiliary_loss_mlp": 0.01265944, + "balance_loss_clip": 0.062739, + "balance_loss_mlp": 0.01254959, + "epoch": 0.5804900045092439, + "flos": 24689050629120.0, + "grad_norm": 1.638178903008904, + "language_loss": 0.70858634, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.78538227, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10992432, + "step": 9655, + "time_per_iteration": 2.5496294498443604 + }, + { + "auxiliary_loss_clip": 0.06424456, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273113, + "balance_loss_mlp": 0.0125378, + "epoch": 0.580550127761912, + "flos": 23119549401600.0, + "grad_norm": 2.0310142592924314, + "language_loss": 0.70043373, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.77733833, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12231445, + "step": 9656, + "time_per_iteration": 4.0007078647613525 + }, + { + "auxiliary_loss_clip": 0.06411725, + "auxiliary_loss_mlp": 0.01265789, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01255203, + "epoch": 0.5806102510145799, + "flos": 18481093332480.0, + "grad_norm": 1.6851014534608593, + "language_loss": 0.71761322, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.79438841, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 9657, + "time_per_iteration": 2.52081298828125 + }, + { + "auxiliary_loss_clip": 0.06426618, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06275696, + "balance_loss_mlp": 0.01252843, + "epoch": 0.5806703742672479, + "flos": 23922566605440.0, + "grad_norm": 1.7911249599131025, + "language_loss": 0.70450497, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.78142452, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12506104, + "step": 9658, + "time_per_iteration": 2.509723424911499 + }, + { + "auxiliary_loss_clip": 0.06307676, + "auxiliary_loss_mlp": 0.01252681, + "balance_loss_clip": 0.06247197, + "balance_loss_mlp": 0.01250939, + "epoch": 0.5807304975199158, + "flos": 66332096328960.0, + "grad_norm": 0.6445385314606554, + "language_loss": 0.53559077, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.61119437, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01747131, + "step": 9659, + "time_per_iteration": 3.164217233657837 + }, + { + "auxiliary_loss_clip": 0.0642177, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01254191, + "epoch": 0.5807906207725838, + "flos": 31730478888960.0, + "grad_norm": 1.678223545722946, + "language_loss": 0.62300181, + "learning_rate": 1.576954100136366e-06, + "loss": 0.69988132, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11987305, + "step": 9660, + "time_per_iteration": 4.055291175842285 + }, + { + "auxiliary_loss_clip": 0.06418584, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06270796, + "balance_loss_mlp": 0.01256443, + "epoch": 0.5808507440252517, + "flos": 23807223060480.0, + "grad_norm": 1.5142376676823694, + "language_loss": 0.65793735, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.73480284, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11541748, + "step": 9661, + "time_per_iteration": 2.50545334815979 + }, + { + "auxiliary_loss_clip": 0.06409734, + "auxiliary_loss_mlp": 0.01265632, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01255565, + "epoch": 0.5809108672779197, + "flos": 13703464431360.0, + "grad_norm": 1.88238902360882, + "language_loss": 0.74297959, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.81973332, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10070801, + "step": 9662, + "time_per_iteration": 2.4924473762512207 + }, + { + "auxiliary_loss_clip": 0.06306686, + "auxiliary_loss_mlp": 0.01251122, + "balance_loss_clip": 0.06246165, + "balance_loss_mlp": 0.0124951, + "epoch": 0.5809709905305876, + "flos": 69157687386240.0, + "grad_norm": 0.8243605057954629, + "language_loss": 0.58189029, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.65746832, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0161438, + "step": 9663, + "time_per_iteration": 3.215336799621582 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265807, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01254392, + "epoch": 0.5810311137832557, + "flos": 19833491831040.0, + "grad_norm": 2.48301510503896, + "language_loss": 0.82404405, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.90084743, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11413574, + "step": 9664, + "time_per_iteration": 2.663583278656006 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01263414, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01252423, + "epoch": 0.5810912370359237, + "flos": 29245635659520.0, + "grad_norm": 1.676690255308112, + "language_loss": 0.81861937, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.89544368, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10992432, + "step": 9665, + "time_per_iteration": 2.5936458110809326 + }, + { + "auxiliary_loss_clip": 0.06425443, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.0627546, + "balance_loss_mlp": 0.01257098, + "epoch": 0.5811513602885916, + "flos": 22792469788800.0, + "grad_norm": 1.7928396623098657, + "language_loss": 0.80963171, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.88657987, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12268066, + "step": 9666, + "time_per_iteration": 2.556262969970703 + }, + { + "auxiliary_loss_clip": 0.06412445, + "auxiliary_loss_mlp": 0.01266794, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5812114835412596, + "flos": 18740347464960.0, + "grad_norm": 1.6774912146747003, + "language_loss": 0.79895651, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.87574893, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.1072998, + "step": 9667, + "time_per_iteration": 3.980412483215332 + }, + { + "auxiliary_loss_clip": 0.06427534, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01252791, + "epoch": 0.5812716067939275, + "flos": 26438324791680.0, + "grad_norm": 1.482922365624984, + "language_loss": 0.79118401, + "learning_rate": 1.573909419957653e-06, + "loss": 0.86811268, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12536621, + "step": 9668, + "time_per_iteration": 2.565986156463623 + }, + { + "auxiliary_loss_clip": 0.06418585, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.0125872, + "epoch": 0.5813317300465956, + "flos": 43407847595520.0, + "grad_norm": 1.832859625901051, + "language_loss": 0.64703673, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.72392619, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11657715, + "step": 9669, + "time_per_iteration": 2.804957151412964 + }, + { + "auxiliary_loss_clip": 0.06415828, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.0627243, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5813918532992635, + "flos": 24791564499840.0, + "grad_norm": 1.4489654033865982, + "language_loss": 0.73791713, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.81473929, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11425781, + "step": 9670, + "time_per_iteration": 2.54849910736084 + }, + { + "auxiliary_loss_clip": 0.0641885, + "auxiliary_loss_mlp": 0.01269355, + "balance_loss_clip": 0.06272031, + "balance_loss_mlp": 0.0125822, + "epoch": 0.5814519765519315, + "flos": 22864068702720.0, + "grad_norm": 1.8471376195746119, + "language_loss": 0.79354227, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.87042427, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11138916, + "step": 9671, + "time_per_iteration": 2.553971529006958 + }, + { + "auxiliary_loss_clip": 0.06426669, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06274676, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5815120998045994, + "flos": 24067651150080.0, + "grad_norm": 2.0867956489424495, + "language_loss": 0.61609662, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6930325, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12219238, + "step": 9672, + "time_per_iteration": 2.5135464668273926 + }, + { + "auxiliary_loss_clip": 0.06413487, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06271096, + "balance_loss_mlp": 0.01256735, + "epoch": 0.5815722230572674, + "flos": 24286305179520.0, + "grad_norm": 2.966012751852424, + "language_loss": 0.81724179, + "learning_rate": 1.572007019492342e-06, + "loss": 0.89405441, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1104126, + "step": 9673, + "time_per_iteration": 2.531637668609619 + }, + { + "auxiliary_loss_clip": 0.06422119, + "auxiliary_loss_mlp": 0.01271004, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01258976, + "epoch": 0.5816323463099353, + "flos": 22206932657280.0, + "grad_norm": 1.7930668974507213, + "language_loss": 0.88784432, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.9647755, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12030029, + "step": 9674, + "time_per_iteration": 2.490135908126831 + }, + { + "auxiliary_loss_clip": 0.06420779, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5816924695626033, + "flos": 24141388343040.0, + "grad_norm": 1.4439307600636533, + "language_loss": 0.78848791, + "learning_rate": 1.571246172811984e-06, + "loss": 0.86534023, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10656738, + "step": 9675, + "time_per_iteration": 2.570401191711426 + }, + { + "auxiliary_loss_clip": 0.06415851, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01252901, + "epoch": 0.5817525928152713, + "flos": 21330555603840.0, + "grad_norm": 2.1244098418378234, + "language_loss": 0.70489943, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.78169978, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11279297, + "step": 9676, + "time_per_iteration": 2.5234405994415283 + }, + { + "auxiliary_loss_clip": 0.06419084, + "auxiliary_loss_mlp": 0.01273498, + "balance_loss_clip": 0.06272397, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5818127160679393, + "flos": 26940355729920.0, + "grad_norm": 2.3696751764318478, + "language_loss": 0.63762164, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.71454746, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9677, + "time_per_iteration": 2.5408287048339844 + }, + { + "auxiliary_loss_clip": 0.06307964, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06247746, + "balance_loss_mlp": 0.01262844, + "epoch": 0.5818728393206073, + "flos": 63940779855360.0, + "grad_norm": 0.7897947317556949, + "language_loss": 0.54107881, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.61680651, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01963806, + "step": 9678, + "time_per_iteration": 3.1962106227874756 + }, + { + "auxiliary_loss_clip": 0.0631143, + "auxiliary_loss_mlp": 0.0126129, + "balance_loss_clip": 0.06251128, + "balance_loss_mlp": 0.01259724, + "epoch": 0.5819329625732752, + "flos": 64972654087680.0, + "grad_norm": 0.717265543619072, + "language_loss": 0.56126428, + "learning_rate": 1.569724674667319e-06, + "loss": 0.6369915, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01565552, + "step": 9679, + "time_per_iteration": 3.0475993156433105 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01260386, + "epoch": 0.5819930858259432, + "flos": 21221668823040.0, + "grad_norm": 1.5334769221386826, + "language_loss": 0.65937847, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.73629761, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11102295, + "step": 9680, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.06418791, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01255909, + "epoch": 0.5820532090786111, + "flos": 19463715763200.0, + "grad_norm": 1.789175734331282, + "language_loss": 0.84067512, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.91752815, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10601807, + "step": 9681, + "time_per_iteration": 2.4850056171417236 + }, + { + "auxiliary_loss_clip": 0.06416699, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272473, + "balance_loss_mlp": 0.01255908, + "epoch": 0.5821133323312792, + "flos": 17718424669440.0, + "grad_norm": 2.261651210831951, + "language_loss": 0.76110494, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.83794284, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11181641, + "step": 9682, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01270705, + "balance_loss_clip": 0.06278756, + "balance_loss_mlp": 0.01258951, + "epoch": 0.5821734555839471, + "flos": 24578738328960.0, + "grad_norm": 2.1342093378293785, + "language_loss": 0.75805819, + "learning_rate": 1.568203437579977e-06, + "loss": 0.83506703, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11749268, + "step": 9683, + "time_per_iteration": 2.5426952838897705 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06278548, + "balance_loss_mlp": 0.0126283, + "epoch": 0.5822335788366151, + "flos": 22388760017280.0, + "grad_norm": 1.6377653311732083, + "language_loss": 0.74168241, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.81873143, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12646484, + "step": 9684, + "time_per_iteration": 2.521773338317871 + }, + { + "auxiliary_loss_clip": 0.06424329, + "auxiliary_loss_mlp": 0.01273987, + "balance_loss_clip": 0.06276318, + "balance_loss_mlp": 0.01262114, + "epoch": 0.582293702089283, + "flos": 26729458202880.0, + "grad_norm": 2.7880175036552446, + "language_loss": 0.78406078, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.86104393, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11871338, + "step": 9685, + "time_per_iteration": 2.53759503364563 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01274993, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0126337, + "epoch": 0.582353825341951, + "flos": 17354560314240.0, + "grad_norm": 1.6209571199936617, + "language_loss": 0.75622851, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.83321142, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11627197, + "step": 9686, + "time_per_iteration": 2.5203354358673096 + }, + { + "auxiliary_loss_clip": 0.06317171, + "auxiliary_loss_mlp": 0.01254478, + "balance_loss_clip": 0.06256813, + "balance_loss_mlp": 0.012529, + "epoch": 0.5824139485946189, + "flos": 55491133478400.0, + "grad_norm": 0.7976004724910164, + "language_loss": 0.57134593, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.64706242, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01578522, + "step": 9687, + "time_per_iteration": 2.9669835567474365 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01255221, + "epoch": 0.582474071847287, + "flos": 20309261713920.0, + "grad_norm": 1.877177452165203, + "language_loss": 0.70002449, + "learning_rate": 1.566302259738727e-06, + "loss": 0.77692491, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.12109375, + "step": 9688, + "time_per_iteration": 2.506741762161255 + }, + { + "auxiliary_loss_clip": 0.06417575, + "auxiliary_loss_mlp": 0.01265264, + "balance_loss_clip": 0.0627282, + "balance_loss_mlp": 0.01254673, + "epoch": 0.5825341950999549, + "flos": 23884733687040.0, + "grad_norm": 2.896352551150335, + "language_loss": 0.65452719, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.73135561, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10595703, + "step": 9689, + "time_per_iteration": 2.506406784057617 + }, + { + "auxiliary_loss_clip": 0.06415856, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06272023, + "balance_loss_mlp": 0.0126126, + "epoch": 0.5825943183526229, + "flos": 23119842890880.0, + "grad_norm": 1.995545981005341, + "language_loss": 0.73637474, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.81326556, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11956787, + "step": 9690, + "time_per_iteration": 3.9373486042022705 + }, + { + "auxiliary_loss_clip": 0.0642629, + "auxiliary_loss_mlp": 0.01267094, + "balance_loss_clip": 0.06275761, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5826544416052909, + "flos": 22864152556800.0, + "grad_norm": 1.6091940048024238, + "language_loss": 0.76358879, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.84052265, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12207031, + "step": 9691, + "time_per_iteration": 2.5036911964416504 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01270283, + "balance_loss_clip": 0.06274154, + "balance_loss_mlp": 0.0125906, + "epoch": 0.5827145648579588, + "flos": 31509560799360.0, + "grad_norm": 1.692225094183595, + "language_loss": 0.80700606, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.88393039, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11218262, + "step": 9692, + "time_per_iteration": 2.588819980621338 + }, + { + "auxiliary_loss_clip": 0.06307849, + "auxiliary_loss_mlp": 0.01251158, + "balance_loss_clip": 0.06247954, + "balance_loss_mlp": 0.01249412, + "epoch": 0.5827746881106268, + "flos": 69832028246400.0, + "grad_norm": 0.7844854120913538, + "language_loss": 0.5681411, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.64373118, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01751709, + "step": 9693, + "time_per_iteration": 3.1347033977508545 + }, + { + "auxiliary_loss_clip": 0.0641888, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06273088, + "balance_loss_mlp": 0.0125815, + "epoch": 0.5828348113632947, + "flos": 23119088204160.0, + "grad_norm": 1.522522739802819, + "language_loss": 0.78923696, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.86611056, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10327148, + "step": 9694, + "time_per_iteration": 2.5068466663360596 + }, + { + "auxiliary_loss_clip": 0.06411383, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06271289, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5828949346159628, + "flos": 21879769190400.0, + "grad_norm": 1.3653324202123376, + "language_loss": 0.76330042, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.84004748, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10021973, + "step": 9695, + "time_per_iteration": 2.556309700012207 + }, + { + "auxiliary_loss_clip": 0.06315481, + "auxiliary_loss_mlp": 0.01251352, + "balance_loss_clip": 0.06255624, + "balance_loss_mlp": 0.01249797, + "epoch": 0.5829550578686307, + "flos": 65985170497920.0, + "grad_norm": 0.7496740614083074, + "language_loss": 0.54866987, + "learning_rate": 1.563261231127095e-06, + "loss": 0.62433827, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01553345, + "step": 9696, + "time_per_iteration": 4.669760704040527 + }, + { + "auxiliary_loss_clip": 0.06418857, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01252893, + "epoch": 0.5830151811212987, + "flos": 16295391578880.0, + "grad_norm": 1.8785254946392194, + "language_loss": 0.76464188, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.84147352, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11413574, + "step": 9697, + "time_per_iteration": 2.5041255950927734 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272788, + "balance_loss_mlp": 0.01256668, + "epoch": 0.5830753043739666, + "flos": 24175447827840.0, + "grad_norm": 1.6024364882265518, + "language_loss": 0.77965522, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.85656625, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12188721, + "step": 9698, + "time_per_iteration": 2.5902624130249023 + }, + { + "auxiliary_loss_clip": 0.06415899, + "auxiliary_loss_mlp": 0.01273709, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01260894, + "epoch": 0.5831354276266346, + "flos": 27067438846080.0, + "grad_norm": 1.5547381527883266, + "language_loss": 0.84016132, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.91705739, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12817383, + "step": 9699, + "time_per_iteration": 2.6469032764434814 + }, + { + "auxiliary_loss_clip": 0.0642215, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06274705, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5831955508793025, + "flos": 23630301164160.0, + "grad_norm": 1.933998465104238, + "language_loss": 0.65971506, + "learning_rate": 1.561741113828305e-06, + "loss": 0.73659378, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1161499, + "step": 9700, + "time_per_iteration": 3.9589943885803223 + }, + { + "auxiliary_loss_clip": 0.06417754, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.0627218, + "balance_loss_mlp": 0.01256086, + "epoch": 0.5832556741319705, + "flos": 24980267894400.0, + "grad_norm": 1.7460823027462598, + "language_loss": 0.71739107, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.79424536, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1159668, + "step": 9701, + "time_per_iteration": 2.591634511947632 + }, + { + "auxiliary_loss_clip": 0.06415233, + "auxiliary_loss_mlp": 0.01264901, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01253278, + "epoch": 0.5833157973846385, + "flos": 23228226547200.0, + "grad_norm": 1.7061750612547373, + "language_loss": 0.85686189, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.93366319, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11627197, + "step": 9702, + "time_per_iteration": 2.552055835723877 + }, + { + "auxiliary_loss_clip": 0.0641585, + "auxiliary_loss_mlp": 0.01263882, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01253511, + "epoch": 0.5833759206373065, + "flos": 21983876288640.0, + "grad_norm": 1.4269240656932136, + "language_loss": 0.78200948, + "learning_rate": 1.560601200301392e-06, + "loss": 0.85880685, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10369873, + "step": 9703, + "time_per_iteration": 2.500241279602051 + }, + { + "auxiliary_loss_clip": 0.06420664, + "auxiliary_loss_mlp": 0.01264639, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01252831, + "epoch": 0.5834360438899745, + "flos": 21768869911680.0, + "grad_norm": 1.5504614474031426, + "language_loss": 0.71309936, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.78995246, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11816406, + "step": 9704, + "time_per_iteration": 2.5374741554260254 + }, + { + "auxiliary_loss_clip": 0.06421441, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01256919, + "epoch": 0.5834961671426424, + "flos": 15997214424960.0, + "grad_norm": 1.6199693671180324, + "language_loss": 0.81965989, + "learning_rate": 1.559841341236335e-06, + "loss": 0.89654684, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10339355, + "step": 9705, + "time_per_iteration": 2.5450189113616943 + }, + { + "auxiliary_loss_clip": 0.06418713, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.01253379, + "epoch": 0.5835562903953104, + "flos": 22824600629760.0, + "grad_norm": 1.6206416307327924, + "language_loss": 0.80445373, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.88128448, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10986328, + "step": 9706, + "time_per_iteration": 2.5352673530578613 + }, + { + "auxiliary_loss_clip": 0.06415439, + "auxiliary_loss_mlp": 0.01273281, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261003, + "epoch": 0.5836164136479783, + "flos": 48478664332800.0, + "grad_norm": 1.6746295019388222, + "language_loss": 0.74755418, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.82444143, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1227417, + "step": 9707, + "time_per_iteration": 4.184760808944702 + }, + { + "auxiliary_loss_clip": 0.06414578, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01253463, + "epoch": 0.5836765369006464, + "flos": 26913172279680.0, + "grad_norm": 1.726633366654796, + "language_loss": 0.81783116, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.89461732, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10571289, + "step": 9708, + "time_per_iteration": 2.5494630336761475 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01267312, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01256404, + "epoch": 0.5837366601533143, + "flos": 20090230341120.0, + "grad_norm": 1.3928808196753693, + "language_loss": 0.78363276, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.86046088, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10906982, + "step": 9709, + "time_per_iteration": 2.54146409034729 + }, + { + "auxiliary_loss_clip": 0.06313366, + "auxiliary_loss_mlp": 0.01252195, + "balance_loss_clip": 0.06253533, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5837967834059823, + "flos": 65383910726400.0, + "grad_norm": 0.7481338178050596, + "language_loss": 0.5665468, + "learning_rate": 1.557941985915844e-06, + "loss": 0.64220238, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.0151062, + "step": 9710, + "time_per_iteration": 3.130523443222046 + }, + { + "auxiliary_loss_clip": 0.06414168, + "auxiliary_loss_mlp": 0.01266687, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01256495, + "epoch": 0.5838569066586502, + "flos": 25345809331200.0, + "grad_norm": 1.5024705126599753, + "language_loss": 0.65656877, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.73337734, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10198975, + "step": 9711, + "time_per_iteration": 2.558560609817505 + }, + { + "auxiliary_loss_clip": 0.06425221, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5839170299113182, + "flos": 22234535377920.0, + "grad_norm": 1.9299970772651502, + "language_loss": 0.79264128, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.86955917, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12640381, + "step": 9712, + "time_per_iteration": 2.571164131164551 + }, + { + "auxiliary_loss_clip": 0.06417041, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5839771531639861, + "flos": 22206513386880.0, + "grad_norm": 1.5054581881557743, + "language_loss": 0.73669749, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.81351602, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10784912, + "step": 9713, + "time_per_iteration": 2.5475780963897705 + }, + { + "auxiliary_loss_clip": 0.06424147, + "auxiliary_loss_mlp": 0.01265979, + "balance_loss_clip": 0.06274505, + "balance_loss_mlp": 0.01252932, + "epoch": 0.5840372764166541, + "flos": 22425964030080.0, + "grad_norm": 1.9255335004661567, + "language_loss": 0.70002109, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.77692235, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13049316, + "step": 9714, + "time_per_iteration": 2.523638963699341 + }, + { + "auxiliary_loss_clip": 0.06419174, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.012543, + "epoch": 0.5840973996693221, + "flos": 19834330371840.0, + "grad_norm": 1.8598920078622099, + "language_loss": 0.80627859, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.88313133, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11798096, + "step": 9715, + "time_per_iteration": 2.5382297039031982 + }, + { + "auxiliary_loss_clip": 0.06417744, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.0627513, + "balance_loss_mlp": 0.01254194, + "epoch": 0.5841575229219901, + "flos": 21149482930560.0, + "grad_norm": 1.9876848107590372, + "language_loss": 0.73826301, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.81509537, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11303711, + "step": 9716, + "time_per_iteration": 2.5080726146698 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01254202, + "epoch": 0.5842176461746581, + "flos": 24646521882240.0, + "grad_norm": 2.3723983049620876, + "language_loss": 0.75045407, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.82723433, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10467529, + "step": 9717, + "time_per_iteration": 2.5569300651550293 + }, + { + "auxiliary_loss_clip": 0.06420394, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06276444, + "balance_loss_mlp": 0.01254759, + "epoch": 0.584277769427326, + "flos": 19136468442240.0, + "grad_norm": 2.2457444336667343, + "language_loss": 0.80242944, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.87929225, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11132812, + "step": 9718, + "time_per_iteration": 2.5623273849487305 + }, + { + "auxiliary_loss_clip": 0.06421262, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.0627823, + "balance_loss_mlp": 0.01253117, + "epoch": 0.584337892679994, + "flos": 22681822072320.0, + "grad_norm": 1.5991831303569484, + "language_loss": 0.67348599, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.75034833, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11853027, + "step": 9719, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0641831, + "auxiliary_loss_mlp": 0.01263454, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01252022, + "epoch": 0.5843980159326619, + "flos": 31291954945920.0, + "grad_norm": 1.728104183061379, + "language_loss": 0.75697351, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.83379114, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11425781, + "step": 9720, + "time_per_iteration": 2.6132402420043945 + }, + { + "auxiliary_loss_clip": 0.06421956, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.06276225, + "balance_loss_mlp": 0.01255799, + "epoch": 0.58445813918533, + "flos": 22754846505600.0, + "grad_norm": 1.447216358863969, + "language_loss": 0.83020425, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.90709275, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11090088, + "step": 9721, + "time_per_iteration": 2.5127675533294678 + }, + { + "auxiliary_loss_clip": 0.06310159, + "auxiliary_loss_mlp": 0.01253726, + "balance_loss_clip": 0.06250554, + "balance_loss_mlp": 0.01252051, + "epoch": 0.5845182624379979, + "flos": 60704602992000.0, + "grad_norm": 0.9150346622366115, + "language_loss": 0.71186364, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.78750253, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01678467, + "step": 9722, + "time_per_iteration": 3.1494555473327637 + }, + { + "auxiliary_loss_clip": 0.06417061, + "auxiliary_loss_mlp": 0.01268389, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.01257255, + "epoch": 0.5845783856906659, + "flos": 16367996741760.0, + "grad_norm": 1.9087918582550145, + "language_loss": 0.8944329, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.97128743, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11132812, + "step": 9723, + "time_per_iteration": 2.4576761722564697 + }, + { + "auxiliary_loss_clip": 0.06417491, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01256922, + "epoch": 0.5846385089433338, + "flos": 20089475654400.0, + "grad_norm": 1.3439404505357262, + "language_loss": 0.68925285, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.76610565, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10870361, + "step": 9724, + "time_per_iteration": 2.5088019371032715 + }, + { + "auxiliary_loss_clip": 0.06417604, + "auxiliary_loss_mlp": 0.01265081, + "balance_loss_clip": 0.06271344, + "balance_loss_mlp": 0.01252922, + "epoch": 0.5846986321960018, + "flos": 17316769322880.0, + "grad_norm": 2.3711774156816188, + "language_loss": 0.86716926, + "learning_rate": 1.552246441587197e-06, + "loss": 0.94399607, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.121521, + "step": 9725, + "time_per_iteration": 2.4511706829071045 + }, + { + "auxiliary_loss_clip": 0.06423703, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06276515, + "balance_loss_mlp": 0.01258995, + "epoch": 0.5847587554486697, + "flos": 17202977078400.0, + "grad_norm": 1.45457124956925, + "language_loss": 0.8335436, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.91048884, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1182251, + "step": 9726, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.06418396, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01252957, + "epoch": 0.5848188787013378, + "flos": 24534993697920.0, + "grad_norm": 1.7434091697787477, + "language_loss": 0.67301726, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10675049, + "step": 9727, + "time_per_iteration": 2.5283849239349365 + }, + { + "auxiliary_loss_clip": 0.06419774, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06275018, + "balance_loss_mlp": 0.0126054, + "epoch": 0.5848790019540057, + "flos": 20634161120640.0, + "grad_norm": 1.6131340234861964, + "language_loss": 0.82272881, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.89965248, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12060547, + "step": 9728, + "time_per_iteration": 2.5226187705993652 + }, + { + "auxiliary_loss_clip": 0.06412318, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01260198, + "epoch": 0.5849391252066737, + "flos": 22425377051520.0, + "grad_norm": 1.6963428440366448, + "language_loss": 0.78290164, + "learning_rate": 1.550728272957027e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10241699, + "step": 9729, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06418414, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272924, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5849992484593417, + "flos": 25417995223680.0, + "grad_norm": 1.7817091958189777, + "language_loss": 0.71144295, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.78828371, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11419678, + "step": 9730, + "time_per_iteration": 2.5403687953948975 + }, + { + "auxiliary_loss_clip": 0.06422406, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01256188, + "epoch": 0.5850593717120096, + "flos": 21070840273920.0, + "grad_norm": 1.6620919701985222, + "language_loss": 0.78394347, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.86084819, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11883545, + "step": 9731, + "time_per_iteration": 2.5166611671447754 + }, + { + "auxiliary_loss_clip": 0.06415913, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.0627268, + "balance_loss_mlp": 0.01256088, + "epoch": 0.5851194949646776, + "flos": 25308605318400.0, + "grad_norm": 2.100344301849282, + "language_loss": 0.70174819, + "learning_rate": 1.549589825316528e-06, + "loss": 0.77858174, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11352539, + "step": 9732, + "time_per_iteration": 2.538188934326172 + }, + { + "auxiliary_loss_clip": 0.06423078, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01256707, + "epoch": 0.5851796182173455, + "flos": 23594103400320.0, + "grad_norm": 2.4062469566098685, + "language_loss": 0.53286588, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.60979199, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12823486, + "step": 9733, + "time_per_iteration": 2.511302947998047 + }, + { + "auxiliary_loss_clip": 0.06417008, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01256657, + "epoch": 0.5852397414700136, + "flos": 24828936220800.0, + "grad_norm": 2.0225140710518184, + "language_loss": 0.87949061, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.95634717, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12005615, + "step": 9734, + "time_per_iteration": 2.538619041442871 + }, + { + "auxiliary_loss_clip": 0.06415038, + "auxiliary_loss_mlp": 0.01266318, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01255667, + "epoch": 0.5852998647226815, + "flos": 19943887985280.0, + "grad_norm": 1.4699537388912873, + "language_loss": 0.72430563, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.80111921, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10656738, + "step": 9735, + "time_per_iteration": 3.9566004276275635 + }, + { + "auxiliary_loss_clip": 0.06418768, + "auxiliary_loss_mlp": 0.01267652, + "balance_loss_clip": 0.06273651, + "balance_loss_mlp": 0.0125563, + "epoch": 0.5853599879753495, + "flos": 16724817354240.0, + "grad_norm": 2.1987965595401135, + "language_loss": 0.7462939, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.82315814, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12017822, + "step": 9736, + "time_per_iteration": 2.4270691871643066 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06274146, + "balance_loss_mlp": 0.0125241, + "epoch": 0.5854201112280174, + "flos": 44466848622720.0, + "grad_norm": 1.4975519288318198, + "language_loss": 0.7076987, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.78450084, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10791016, + "step": 9737, + "time_per_iteration": 2.744206190109253 + }, + { + "auxiliary_loss_clip": 0.06416388, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06274648, + "balance_loss_mlp": 0.01259556, + "epoch": 0.5854802344806854, + "flos": 20345375623680.0, + "grad_norm": 1.6871127807078519, + "language_loss": 0.82840961, + "learning_rate": 1.547313391573169e-06, + "loss": 0.90528059, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11151123, + "step": 9738, + "time_per_iteration": 2.4849019050598145 + }, + { + "auxiliary_loss_clip": 0.06422549, + "auxiliary_loss_mlp": 0.01269287, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01257431, + "epoch": 0.5855403577333533, + "flos": 20927013540480.0, + "grad_norm": 1.6194676695443784, + "language_loss": 0.69157064, + "learning_rate": 1.546934045946082e-06, + "loss": 0.768489, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11846924, + "step": 9739, + "time_per_iteration": 3.941681146621704 + }, + { + "auxiliary_loss_clip": 0.0641816, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272583, + "balance_loss_mlp": 0.01255796, + "epoch": 0.5856004809860214, + "flos": 20454849383040.0, + "grad_norm": 2.1509507460713038, + "language_loss": 0.59265625, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.66951436, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11859131, + "step": 9740, + "time_per_iteration": 2.5459988117218018 + }, + { + "auxiliary_loss_clip": 0.06417701, + "auxiliary_loss_mlp": 0.01265897, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.0125487, + "epoch": 0.5856606042386893, + "flos": 19645962393600.0, + "grad_norm": 1.6784070122461718, + "language_loss": 0.75433791, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.83117396, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11029053, + "step": 9741, + "time_per_iteration": 2.488905668258667 + }, + { + "auxiliary_loss_clip": 0.06418155, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06272431, + "balance_loss_mlp": 0.01251857, + "epoch": 0.5857207274913573, + "flos": 21692072044800.0, + "grad_norm": 1.4885669249171192, + "language_loss": 0.76157856, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.83839613, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11743164, + "step": 9742, + "time_per_iteration": 2.5480451583862305 + }, + { + "auxiliary_loss_clip": 0.06415333, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.01254737, + "epoch": 0.5857808507440253, + "flos": 23188968109440.0, + "grad_norm": 1.7165353954706328, + "language_loss": 0.75240624, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.82922137, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11444092, + "step": 9743, + "time_per_iteration": 2.503702163696289 + }, + { + "auxiliary_loss_clip": 0.0641541, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01254944, + "epoch": 0.5858409739966932, + "flos": 27242683660800.0, + "grad_norm": 1.53753206771929, + "language_loss": 0.81320727, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.8900184, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10760498, + "step": 9744, + "time_per_iteration": 2.5923476219177246 + }, + { + "auxiliary_loss_clip": 0.06429034, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256847, + "epoch": 0.5859010972493612, + "flos": 27862993036800.0, + "grad_norm": 1.7800190043611435, + "language_loss": 0.71494257, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.79192197, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12072754, + "step": 9745, + "time_per_iteration": 2.5417301654815674 + }, + { + "auxiliary_loss_clip": 0.06318981, + "auxiliary_loss_mlp": 0.01251832, + "balance_loss_clip": 0.06258826, + "balance_loss_mlp": 0.01250336, + "epoch": 0.5859612205020291, + "flos": 70029452465280.0, + "grad_norm": 0.7182748841957548, + "language_loss": 0.53236032, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.60806841, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01495361, + "step": 9746, + "time_per_iteration": 4.6102893352508545 + }, + { + "auxiliary_loss_clip": 0.06421819, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01253907, + "epoch": 0.5860213437546972, + "flos": 24062032926720.0, + "grad_norm": 1.805241505686608, + "language_loss": 0.7322374, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.80910903, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11437988, + "step": 9747, + "time_per_iteration": 2.5299086570739746 + }, + { + "auxiliary_loss_clip": 0.06420729, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275554, + "balance_loss_mlp": 0.01253806, + "epoch": 0.5860814670073651, + "flos": 18952670511360.0, + "grad_norm": 1.7528078306488855, + "language_loss": 0.81229597, + "learning_rate": 1.543520710142051e-06, + "loss": 0.88915294, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.1116333, + "step": 9748, + "time_per_iteration": 2.5070362091064453 + }, + { + "auxiliary_loss_clip": 0.06422453, + "auxiliary_loss_mlp": 0.01268094, + "balance_loss_clip": 0.06275974, + "balance_loss_mlp": 0.01256674, + "epoch": 0.5861415902600331, + "flos": 22567904046720.0, + "grad_norm": 2.1315206911445217, + "language_loss": 0.72122687, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.7981323, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11419678, + "step": 9749, + "time_per_iteration": 2.5568935871124268 + }, + { + "auxiliary_loss_clip": 0.06413895, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01254765, + "epoch": 0.586201713512701, + "flos": 14397217511040.0, + "grad_norm": 2.3126679183899608, + "language_loss": 0.75373948, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.8305366, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11053467, + "step": 9750, + "time_per_iteration": 2.456709623336792 + }, + { + "auxiliary_loss_clip": 0.06418054, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06274709, + "balance_loss_mlp": 0.01256091, + "epoch": 0.586261836765369, + "flos": 19504357793280.0, + "grad_norm": 1.5048801591853769, + "language_loss": 0.70914859, + "learning_rate": 1.542383242598344e-06, + "loss": 0.78600496, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11505127, + "step": 9751, + "time_per_iteration": 2.516965389251709 + }, + { + "auxiliary_loss_clip": 0.06427741, + "auxiliary_loss_mlp": 0.01267026, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01254748, + "epoch": 0.5863219600180369, + "flos": 20707688678400.0, + "grad_norm": 2.2695397417566134, + "language_loss": 0.74817115, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.82511884, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12280273, + "step": 9752, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.06419428, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06275827, + "balance_loss_mlp": 0.01255026, + "epoch": 0.586382083270705, + "flos": 19798258389120.0, + "grad_norm": 1.7375633359019997, + "language_loss": 0.77788973, + "learning_rate": 1.541625017642943e-06, + "loss": 0.85475028, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1159668, + "step": 9753, + "time_per_iteration": 2.5376296043395996 + }, + { + "auxiliary_loss_clip": 0.06415142, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06275599, + "balance_loss_mlp": 0.01256478, + "epoch": 0.5864422065233729, + "flos": 16504821659520.0, + "grad_norm": 1.5941521516898884, + "language_loss": 0.71418774, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.79100442, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 9754, + "time_per_iteration": 2.482060670852661 + }, + { + "auxiliary_loss_clip": 0.06418964, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01254706, + "epoch": 0.5865023297760409, + "flos": 20419657868160.0, + "grad_norm": 1.5122611907827943, + "language_loss": 0.72473872, + "learning_rate": 1.540866862214043e-06, + "loss": 0.80158961, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11407471, + "step": 9755, + "time_per_iteration": 2.5370032787323 + }, + { + "auxiliary_loss_clip": 0.06317496, + "auxiliary_loss_mlp": 0.01251101, + "balance_loss_clip": 0.06257688, + "balance_loss_mlp": 0.01249532, + "epoch": 0.5865624530287089, + "flos": 63369386864640.0, + "grad_norm": 0.7287908319651881, + "language_loss": 0.56949997, + "learning_rate": 1.540487810607967e-06, + "loss": 0.64518595, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01570129, + "step": 9756, + "time_per_iteration": 3.10322904586792 + }, + { + "auxiliary_loss_clip": 0.06418074, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5866225762813768, + "flos": 27023610360960.0, + "grad_norm": 1.7386050489235434, + "language_loss": 0.76836097, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.84522557, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10333252, + "step": 9757, + "time_per_iteration": 2.5645911693573 + }, + { + "auxiliary_loss_clip": 0.06316153, + "auxiliary_loss_mlp": 0.01253974, + "balance_loss_clip": 0.06255822, + "balance_loss_mlp": 0.01252219, + "epoch": 0.5866826995340448, + "flos": 73007941224960.0, + "grad_norm": 0.8367731636564993, + "language_loss": 0.60245061, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.67815191, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01760864, + "step": 9758, + "time_per_iteration": 3.129420042037964 + }, + { + "auxiliary_loss_clip": 0.06425761, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.0125824, + "epoch": 0.5867428227867127, + "flos": 21291716436480.0, + "grad_norm": 2.341889353580635, + "language_loss": 0.7231499, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.80010581, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9759, + "time_per_iteration": 2.5044219493865967 + }, + { + "auxiliary_loss_clip": 0.06416983, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274659, + "balance_loss_mlp": 0.01254924, + "epoch": 0.5868029460393808, + "flos": 33476356961280.0, + "grad_norm": 1.459885556596891, + "language_loss": 0.73556709, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.8123973, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11114502, + "step": 9760, + "time_per_iteration": 2.662318229675293 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06273922, + "balance_loss_mlp": 0.01252944, + "epoch": 0.5868630692920487, + "flos": 17894382243840.0, + "grad_norm": 1.6271911446451897, + "language_loss": 0.7251972, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.80200839, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11352539, + "step": 9761, + "time_per_iteration": 2.635671377182007 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265487, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253274, + "epoch": 0.5869231925447167, + "flos": 21041770106880.0, + "grad_norm": 1.8098960680000724, + "language_loss": 0.74938971, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.8262558, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12200928, + "step": 9762, + "time_per_iteration": 2.511338472366333 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01255766, + "epoch": 0.5869833157973846, + "flos": 74753288974080.0, + "grad_norm": 1.2323244190692502, + "language_loss": 0.72678411, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.80359328, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10723877, + "step": 9763, + "time_per_iteration": 2.966012716293335 + }, + { + "auxiliary_loss_clip": 0.06416167, + "auxiliary_loss_mlp": 0.01264221, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01253296, + "epoch": 0.5870434390500526, + "flos": 17644687476480.0, + "grad_norm": 1.6070407244149296, + "language_loss": 0.79883134, + "learning_rate": 1.53745602625755e-06, + "loss": 0.87563521, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10931396, + "step": 9764, + "time_per_iteration": 2.5360097885131836 + }, + { + "auxiliary_loss_clip": 0.06420099, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01258342, + "epoch": 0.5871035623027205, + "flos": 21512424890880.0, + "grad_norm": 2.0596306569779967, + "language_loss": 0.79149717, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.86839771, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1161499, + "step": 9765, + "time_per_iteration": 2.523232936859131 + }, + { + "auxiliary_loss_clip": 0.06413256, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06272542, + "balance_loss_mlp": 0.01254427, + "epoch": 0.5871636855553886, + "flos": 13556744732160.0, + "grad_norm": 1.6377752901078153, + "language_loss": 0.83660257, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.91338348, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10406494, + "step": 9766, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06423902, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01257639, + "epoch": 0.5872238088080565, + "flos": 26220006178560.0, + "grad_norm": 1.5173362705755495, + "language_loss": 0.69876915, + "learning_rate": 1.536319396136257e-06, + "loss": 0.77569771, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11322021, + "step": 9767, + "time_per_iteration": 2.53935170173645 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5872839320607245, + "flos": 30673196870400.0, + "grad_norm": 6.458419959703109, + "language_loss": 0.64030594, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.71715188, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11151123, + "step": 9768, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.06324692, + "auxiliary_loss_mlp": 0.01254391, + "balance_loss_clip": 0.06264571, + "balance_loss_mlp": 0.01252818, + "epoch": 0.5873440553133924, + "flos": 60324623925120.0, + "grad_norm": 0.7185710562845293, + "language_loss": 0.53754711, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.61333793, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01573944, + "step": 9769, + "time_per_iteration": 4.53153133392334 + }, + { + "auxiliary_loss_clip": 0.06416724, + "auxiliary_loss_mlp": 0.01267359, + "balance_loss_clip": 0.0627375, + "balance_loss_mlp": 0.01256409, + "epoch": 0.5874041785660604, + "flos": 21545016929280.0, + "grad_norm": 1.3491952646211745, + "language_loss": 0.70993185, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.78677267, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10949707, + "step": 9770, + "time_per_iteration": 2.5152831077575684 + }, + { + "auxiliary_loss_clip": 0.06416201, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274108, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5874643018187284, + "flos": 24395778938880.0, + "grad_norm": 1.9550841164663295, + "language_loss": 0.67880088, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.75564533, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9771, + "time_per_iteration": 2.518069267272949 + }, + { + "auxiliary_loss_clip": 0.06421787, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.0125531, + "epoch": 0.5875244250713964, + "flos": 28155300405120.0, + "grad_norm": 1.4791048602495522, + "language_loss": 0.66491324, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.74181026, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.1260376, + "step": 9772, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.0642426, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06276119, + "balance_loss_mlp": 0.01258866, + "epoch": 0.5875845483240644, + "flos": 25819566716160.0, + "grad_norm": 1.5545187987766196, + "language_loss": 0.7466417, + "learning_rate": 1.534046611017519e-06, + "loss": 0.82359904, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.12615967, + "step": 9773, + "time_per_iteration": 2.533243179321289 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5876446715767323, + "flos": 26913843112320.0, + "grad_norm": 1.8911636717759477, + "language_loss": 0.54071677, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.61759812, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11138916, + "step": 9774, + "time_per_iteration": 2.5565576553344727 + }, + { + "auxiliary_loss_clip": 0.06419463, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06276506, + "balance_loss_mlp": 0.01253192, + "epoch": 0.5877047948294003, + "flos": 36693750510720.0, + "grad_norm": 2.5652883668591886, + "language_loss": 0.65881801, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.73565692, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11242676, + "step": 9775, + "time_per_iteration": 4.102318525314331 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01267575, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01256459, + "epoch": 0.5877649180820682, + "flos": 26732057679360.0, + "grad_norm": 1.541611587459476, + "language_loss": 0.73877925, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.81564349, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11114502, + "step": 9776, + "time_per_iteration": 2.534105062484741 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01267161, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.0125586, + "epoch": 0.5878250413347362, + "flos": 21038457870720.0, + "grad_norm": 1.5037279013590201, + "language_loss": 0.7431531, + "learning_rate": 1.532531774126821e-06, + "loss": 0.81998503, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11315918, + "step": 9777, + "time_per_iteration": 2.501791000366211 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01267719, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257407, + "epoch": 0.5878851645874041, + "flos": 25491397000320.0, + "grad_norm": 1.389592011343503, + "language_loss": 0.74136406, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.81816691, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10302734, + "step": 9778, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06416066, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261293, + "epoch": 0.5879452878400722, + "flos": 23775930760320.0, + "grad_norm": 1.6684393614308786, + "language_loss": 0.70061487, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.77750337, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11486816, + "step": 9779, + "time_per_iteration": 3.9999070167541504 + }, + { + "auxiliary_loss_clip": 0.06419669, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06273909, + "balance_loss_mlp": 0.0125331, + "epoch": 0.5880054110927401, + "flos": 17830749467520.0, + "grad_norm": 1.9325071243234666, + "language_loss": 0.67414713, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.75099313, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11633301, + "step": 9780, + "time_per_iteration": 2.525421142578125 + }, + { + "auxiliary_loss_clip": 0.06422442, + "auxiliary_loss_mlp": 0.01271374, + "balance_loss_clip": 0.0627559, + "balance_loss_mlp": 0.0125981, + "epoch": 0.5880655343454081, + "flos": 19469417840640.0, + "grad_norm": 1.9086155780635632, + "language_loss": 0.73100537, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.80794352, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11572266, + "step": 9781, + "time_per_iteration": 2.4647257328033447 + }, + { + "auxiliary_loss_clip": 0.06415875, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.06273176, + "balance_loss_mlp": 0.01258731, + "epoch": 0.588125657598076, + "flos": 21403999307520.0, + "grad_norm": 1.283507981192047, + "language_loss": 0.7022016, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.77905786, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11016846, + "step": 9782, + "time_per_iteration": 2.531780481338501 + }, + { + "auxiliary_loss_clip": 0.06420694, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274669, + "balance_loss_mlp": 0.01256314, + "epoch": 0.588185780850744, + "flos": 16040246296320.0, + "grad_norm": 2.020771184042221, + "language_loss": 0.71036118, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.78724945, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11804199, + "step": 9783, + "time_per_iteration": 2.452061176300049 + }, + { + "auxiliary_loss_clip": 0.06426281, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06277394, + "balance_loss_mlp": 0.01255538, + "epoch": 0.588245904103412, + "flos": 23734282481280.0, + "grad_norm": 1.861465214251895, + "language_loss": 0.69312334, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.77006149, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12005615, + "step": 9784, + "time_per_iteration": 2.552767515182495 + }, + { + "auxiliary_loss_clip": 0.06421058, + "auxiliary_loss_mlp": 0.01265879, + "balance_loss_clip": 0.06273105, + "balance_loss_mlp": 0.01254596, + "epoch": 0.58830602735608, + "flos": 33810983441280.0, + "grad_norm": 1.7066395827536198, + "language_loss": 0.69576097, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.77263039, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.112854, + "step": 9785, + "time_per_iteration": 3.9847395420074463 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01255352, + "epoch": 0.588366150608748, + "flos": 17096144722560.0, + "grad_norm": 1.8665479354272698, + "language_loss": 0.78022271, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.85703707, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10369873, + "step": 9786, + "time_per_iteration": 2.4842867851257324 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.01266691, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01255128, + "epoch": 0.5884262738614159, + "flos": 22133698588800.0, + "grad_norm": 1.4734886628165487, + "language_loss": 0.78796208, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.86486876, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11566162, + "step": 9787, + "time_per_iteration": 2.497192144393921 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06275064, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5884863971140839, + "flos": 21038038600320.0, + "grad_norm": 1.5088398107909506, + "language_loss": 0.66488671, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.74172926, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10943604, + "step": 9788, + "time_per_iteration": 2.5208425521850586 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01255212, + "epoch": 0.5885465203667518, + "flos": 23811835034880.0, + "grad_norm": 2.124690797246634, + "language_loss": 0.8100794, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.88691187, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11651611, + "step": 9789, + "time_per_iteration": 2.497751235961914 + }, + { + "auxiliary_loss_clip": 0.06413969, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257432, + "epoch": 0.5886066436194198, + "flos": 18886647893760.0, + "grad_norm": 1.5219157367370164, + "language_loss": 0.69998693, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.77680737, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10638428, + "step": 9790, + "time_per_iteration": 2.5238122940063477 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06275025, + "balance_loss_mlp": 0.01254484, + "epoch": 0.5886667668720877, + "flos": 24797015015040.0, + "grad_norm": 1.9547129753533632, + "language_loss": 0.83327186, + "learning_rate": 1.527232084570895e-06, + "loss": 0.91010225, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11651611, + "step": 9791, + "time_per_iteration": 2.518833637237549 + }, + { + "auxiliary_loss_clip": 0.06420578, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01259297, + "epoch": 0.5887268901247558, + "flos": 21620473130880.0, + "grad_norm": 1.5293641441028467, + "language_loss": 0.76486295, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.84176975, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1081543, + "step": 9792, + "time_per_iteration": 2.5101959705352783 + }, + { + "auxiliary_loss_clip": 0.06421857, + "auxiliary_loss_mlp": 0.01269547, + "balance_loss_clip": 0.06273879, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5887870133774237, + "flos": 20487357567360.0, + "grad_norm": 2.1847202997614477, + "language_loss": 0.69169068, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.76860476, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11114502, + "step": 9793, + "time_per_iteration": 2.4927995204925537 + }, + { + "auxiliary_loss_clip": 0.06418081, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06276278, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5888471366300917, + "flos": 19211966570880.0, + "grad_norm": 1.7416997591947727, + "language_loss": 0.60439771, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.68127453, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11132812, + "step": 9794, + "time_per_iteration": 2.543231248855591 + }, + { + "auxiliary_loss_clip": 0.06420963, + "auxiliary_loss_mlp": 0.01267396, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01256113, + "epoch": 0.5889072598827596, + "flos": 19978786010880.0, + "grad_norm": 1.5723031838894885, + "language_loss": 0.65483499, + "learning_rate": 1.525718531219257e-06, + "loss": 0.73171854, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11291504, + "step": 9795, + "time_per_iteration": 2.502537965774536 + }, + { + "auxiliary_loss_clip": 0.06414207, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06274657, + "balance_loss_mlp": 0.01255197, + "epoch": 0.5889673831354276, + "flos": 20747617948800.0, + "grad_norm": 1.4841948976653832, + "language_loss": 0.74256188, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.81936008, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10418701, + "step": 9796, + "time_per_iteration": 2.496511220932007 + }, + { + "auxiliary_loss_clip": 0.06417978, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01256238, + "epoch": 0.5890275063880956, + "flos": 25307892558720.0, + "grad_norm": 2.3243895650299566, + "language_loss": 0.83142781, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.90828037, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11047363, + "step": 9797, + "time_per_iteration": 2.5991365909576416 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01250806, + "epoch": 0.5890876296407636, + "flos": 11770182702720.0, + "grad_norm": 1.5626242229143896, + "language_loss": 0.79473782, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.87152421, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1083374, + "step": 9798, + "time_per_iteration": 2.5399045944213867 + }, + { + "auxiliary_loss_clip": 0.06414175, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06274281, + "balance_loss_mlp": 0.01254584, + "epoch": 0.5891477528934316, + "flos": 13594535723520.0, + "grad_norm": 2.254418827792415, + "language_loss": 0.75000322, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.82679403, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10321045, + "step": 9799, + "time_per_iteration": 2.4642131328582764 + }, + { + "auxiliary_loss_clip": 0.06420485, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01254798, + "epoch": 0.5892078761460995, + "flos": 15054563191680.0, + "grad_norm": 1.9320779180150096, + "language_loss": 0.76666486, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.84353948, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12182617, + "step": 9800, + "time_per_iteration": 2.5170304775238037 + }, + { + "auxiliary_loss_clip": 0.06421179, + "auxiliary_loss_mlp": 0.01264846, + "balance_loss_clip": 0.06275316, + "balance_loss_mlp": 0.0125361, + "epoch": 0.5892679993987675, + "flos": 15783591640320.0, + "grad_norm": 1.6350760782373632, + "language_loss": 0.79415876, + "learning_rate": 1.523448741022722e-06, + "loss": 0.87101901, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11242676, + "step": 9801, + "time_per_iteration": 2.4804494380950928 + }, + { + "auxiliary_loss_clip": 0.06421967, + "auxiliary_loss_mlp": 0.01265274, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01253467, + "epoch": 0.5893281226514354, + "flos": 25272281773440.0, + "grad_norm": 1.6257193775599612, + "language_loss": 0.6664654, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.74333781, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11804199, + "step": 9802, + "time_per_iteration": 2.536524534225464 + }, + { + "auxiliary_loss_clip": 0.06417859, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01256475, + "epoch": 0.5893882459041034, + "flos": 19463380346880.0, + "grad_norm": 2.7221530495776953, + "language_loss": 0.78339422, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.86024731, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10986328, + "step": 9803, + "time_per_iteration": 2.4658396244049072 + }, + { + "auxiliary_loss_clip": 0.06422158, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5894483691567713, + "flos": 20640785592960.0, + "grad_norm": 1.3509589673333673, + "language_loss": 0.73070806, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.80759096, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10986328, + "step": 9804, + "time_per_iteration": 2.5561769008636475 + }, + { + "auxiliary_loss_clip": 0.06421436, + "auxiliary_loss_mlp": 0.01267021, + "balance_loss_clip": 0.06279321, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5895084924094394, + "flos": 17782812132480.0, + "grad_norm": 4.893575785915148, + "language_loss": 0.74802667, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.82491124, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11254883, + "step": 9805, + "time_per_iteration": 2.4777255058288574 + }, + { + "auxiliary_loss_clip": 0.06430615, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254542, + "epoch": 0.5895686156621073, + "flos": 20127350499840.0, + "grad_norm": 1.9675390106462767, + "language_loss": 0.78339982, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8603704, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11901855, + "step": 9806, + "time_per_iteration": 2.556187868118286 + }, + { + "auxiliary_loss_clip": 0.06426841, + "auxiliary_loss_mlp": 0.01268335, + "balance_loss_clip": 0.06283563, + "balance_loss_mlp": 0.01256813, + "epoch": 0.5896287389147753, + "flos": 20856337021440.0, + "grad_norm": 1.8953677951134942, + "language_loss": 0.77413982, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.85109162, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11523438, + "step": 9807, + "time_per_iteration": 2.519200325012207 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01268029, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.01256341, + "epoch": 0.5896888621674432, + "flos": 14543098669440.0, + "grad_norm": 1.5805632295861456, + "language_loss": 0.75183058, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.82876456, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 9808, + "time_per_iteration": 3.908586025238037 + }, + { + "auxiliary_loss_clip": 0.06422409, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06277257, + "balance_loss_mlp": 0.01253912, + "epoch": 0.5897489854201112, + "flos": 20893079836800.0, + "grad_norm": 1.9290339931200338, + "language_loss": 0.71909666, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.79598099, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12103271, + "step": 9809, + "time_per_iteration": 2.5768144130706787 + }, + { + "auxiliary_loss_clip": 0.06423716, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.0627635, + "balance_loss_mlp": 0.01254272, + "epoch": 0.5898091086727792, + "flos": 20017331688960.0, + "grad_norm": 2.0062119760557473, + "language_loss": 0.82969332, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.90659165, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1184082, + "step": 9810, + "time_per_iteration": 2.5024096965789795 + }, + { + "auxiliary_loss_clip": 0.06418087, + "auxiliary_loss_mlp": 0.01268409, + "balance_loss_clip": 0.06278655, + "balance_loss_mlp": 0.01257394, + "epoch": 0.5898692319254472, + "flos": 16258816471680.0, + "grad_norm": 2.656719323590735, + "language_loss": 0.81247234, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8893373, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11016846, + "step": 9811, + "time_per_iteration": 2.5079774856567383 + }, + { + "auxiliary_loss_clip": 0.06424809, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01254442, + "epoch": 0.5899293551781152, + "flos": 20454723601920.0, + "grad_norm": 1.7175276958807264, + "language_loss": 0.7698791, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.84679055, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11883545, + "step": 9812, + "time_per_iteration": 2.4813108444213867 + }, + { + "auxiliary_loss_clip": 0.06419283, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.0627578, + "balance_loss_mlp": 0.01253992, + "epoch": 0.5899894784307831, + "flos": 13886885018880.0, + "grad_norm": 1.6786934004730485, + "language_loss": 0.71137106, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.78820813, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10437012, + "step": 9813, + "time_per_iteration": 2.5212063789367676 + }, + { + "auxiliary_loss_clip": 0.0641876, + "auxiliary_loss_mlp": 0.01270874, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01259394, + "epoch": 0.5900496016834511, + "flos": 20089936851840.0, + "grad_norm": 1.420675326684763, + "language_loss": 0.7244218, + "learning_rate": 1.518533098148494e-06, + "loss": 0.80131817, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11468506, + "step": 9814, + "time_per_iteration": 2.4773387908935547 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01256768, + "epoch": 0.590109724936119, + "flos": 20264133490560.0, + "grad_norm": 1.7152732807584992, + "language_loss": 0.7885775, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.86546993, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 9815, + "time_per_iteration": 3.939445972442627 + }, + { + "auxiliary_loss_clip": 0.06427211, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01255142, + "epoch": 0.590169848188787, + "flos": 24240548050560.0, + "grad_norm": 1.7218203048390952, + "language_loss": 0.76316988, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.84011579, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12243652, + "step": 9816, + "time_per_iteration": 2.5245048999786377 + }, + { + "auxiliary_loss_clip": 0.06419881, + "auxiliary_loss_mlp": 0.01267479, + "balance_loss_clip": 0.06277047, + "balance_loss_mlp": 0.01255725, + "epoch": 0.590229971441455, + "flos": 17790400926720.0, + "grad_norm": 1.8371364848215923, + "language_loss": 0.81572855, + "learning_rate": 1.517399156051309e-06, + "loss": 0.89260209, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11743164, + "step": 9817, + "time_per_iteration": 2.4621410369873047 + }, + { + "auxiliary_loss_clip": 0.06418833, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01257544, + "epoch": 0.590290094694123, + "flos": 22243465837440.0, + "grad_norm": 1.5541077044812335, + "language_loss": 0.76864719, + "learning_rate": 1.517021211933682e-06, + "loss": 0.84551811, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10717773, + "step": 9818, + "time_per_iteration": 2.5125410556793213 + }, + { + "auxiliary_loss_clip": 0.06416667, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255501, + "epoch": 0.5903502179467909, + "flos": 19104589163520.0, + "grad_norm": 1.8321116335564553, + "language_loss": 0.67227435, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.74909973, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10369873, + "step": 9819, + "time_per_iteration": 4.011074066162109 + }, + { + "auxiliary_loss_clip": 0.06420997, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01255819, + "epoch": 0.5904103411994589, + "flos": 24241051175040.0, + "grad_norm": 1.4923193447304384, + "language_loss": 0.7829935, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.85986888, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10723877, + "step": 9820, + "time_per_iteration": 2.5523388385772705 + }, + { + "auxiliary_loss_clip": 0.06318125, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06258737, + "balance_loss_mlp": 0.01253092, + "epoch": 0.5904704644521268, + "flos": 64894388774400.0, + "grad_norm": 0.9340841048050909, + "language_loss": 0.65183949, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.72756588, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01417542, + "step": 9821, + "time_per_iteration": 3.1619784832000732 + }, + { + "auxiliary_loss_clip": 0.06416959, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01251935, + "epoch": 0.5905305877047948, + "flos": 19616137539840.0, + "grad_norm": 2.101599923194391, + "language_loss": 0.6190716, + "learning_rate": 1.515509618752521e-06, + "loss": 0.69586486, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10412598, + "step": 9822, + "time_per_iteration": 2.519482374191284 + }, + { + "auxiliary_loss_clip": 0.06419894, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06275024, + "balance_loss_mlp": 0.01257365, + "epoch": 0.5905907109574628, + "flos": 18995660455680.0, + "grad_norm": 1.8507285157055846, + "language_loss": 0.82910419, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.90599167, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1149292, + "step": 9823, + "time_per_iteration": 2.5134451389312744 + }, + { + "auxiliary_loss_clip": 0.06417045, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01256546, + "epoch": 0.5906508342101308, + "flos": 22206974584320.0, + "grad_norm": 1.8772651852061113, + "language_loss": 0.73388183, + "learning_rate": 1.514753932336165e-06, + "loss": 0.81072783, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11004639, + "step": 9824, + "time_per_iteration": 3.8841147422790527 + }, + { + "auxiliary_loss_clip": 0.064331, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255013, + "epoch": 0.5907109574627988, + "flos": 20892995982720.0, + "grad_norm": 1.9523854086350827, + "language_loss": 0.82938302, + "learning_rate": 1.514376116721693e-06, + "loss": 0.90639031, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12609863, + "step": 9825, + "time_per_iteration": 2.527808427810669 + }, + { + "auxiliary_loss_clip": 0.06417271, + "auxiliary_loss_mlp": 0.01264281, + "balance_loss_clip": 0.06277614, + "balance_loss_mlp": 0.0125422, + "epoch": 0.5907710807154667, + "flos": 21513011869440.0, + "grad_norm": 1.8272335212588457, + "language_loss": 0.76679188, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.84360743, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10058594, + "step": 9826, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.06416261, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06274769, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5908312039681347, + "flos": 22024979516160.0, + "grad_norm": 1.5050840799955296, + "language_loss": 0.7292102, + "learning_rate": 1.513620540751793e-06, + "loss": 0.80599833, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10131836, + "step": 9827, + "time_per_iteration": 2.5261569023132324 + }, + { + "auxiliary_loss_clip": 0.06419525, + "auxiliary_loss_mlp": 0.01266997, + "balance_loss_clip": 0.0627335, + "balance_loss_mlp": 0.0125588, + "epoch": 0.5908913272208026, + "flos": 18485579525760.0, + "grad_norm": 1.8170415974974599, + "language_loss": 0.80223072, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.87909591, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11120605, + "step": 9828, + "time_per_iteration": 2.4725866317749023 + }, + { + "auxiliary_loss_clip": 0.06421993, + "auxiliary_loss_mlp": 0.01272492, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5909514504734706, + "flos": 12317006448000.0, + "grad_norm": 1.8455350152663679, + "language_loss": 0.88620806, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.96315295, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12133789, + "step": 9829, + "time_per_iteration": 2.4783804416656494 + }, + { + "auxiliary_loss_clip": 0.06324679, + "auxiliary_loss_mlp": 0.01254341, + "balance_loss_clip": 0.06265787, + "balance_loss_mlp": 0.01252693, + "epoch": 0.5910115737261386, + "flos": 70233557811840.0, + "grad_norm": 0.7549892406299625, + "language_loss": 0.57903004, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.6548202, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01651001, + "step": 9830, + "time_per_iteration": 3.0390307903289795 + }, + { + "auxiliary_loss_clip": 0.0643173, + "auxiliary_loss_mlp": 0.01269908, + "balance_loss_clip": 0.06281478, + "balance_loss_mlp": 0.01257308, + "epoch": 0.5910716969788066, + "flos": 22024266756480.0, + "grad_norm": 2.1560619163105965, + "language_loss": 0.75963652, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.83665287, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12591553, + "step": 9831, + "time_per_iteration": 2.5367510318756104 + }, + { + "auxiliary_loss_clip": 0.06409759, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01252124, + "epoch": 0.5911318202314745, + "flos": 21258034295040.0, + "grad_norm": 1.5753423885742641, + "language_loss": 0.77885556, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.85558021, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10583496, + "step": 9832, + "time_per_iteration": 2.504584789276123 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01265662, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01254797, + "epoch": 0.5911919434841425, + "flos": 17827353377280.0, + "grad_norm": 1.6998910709640538, + "language_loss": 0.83265263, + "learning_rate": 1.511354255945847e-06, + "loss": 0.90947747, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9833, + "time_per_iteration": 2.508920192718506 + }, + { + "auxiliary_loss_clip": 0.06420296, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5912520667368104, + "flos": 20380818700800.0, + "grad_norm": 1.4145847544307324, + "language_loss": 0.74488783, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.82178807, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10723877, + "step": 9834, + "time_per_iteration": 2.515340566635132 + }, + { + "auxiliary_loss_clip": 0.06420908, + "auxiliary_loss_mlp": 0.0126652, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01255308, + "epoch": 0.5913121899894784, + "flos": 17936240158080.0, + "grad_norm": 2.2554155860211296, + "language_loss": 0.78118962, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.85806394, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11212158, + "step": 9835, + "time_per_iteration": 2.516449213027954 + }, + { + "auxiliary_loss_clip": 0.06422424, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5913723132421465, + "flos": 22133405099520.0, + "grad_norm": 1.7910918924229287, + "language_loss": 0.74562353, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.82253206, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11187744, + "step": 9836, + "time_per_iteration": 2.4944818019866943 + }, + { + "auxiliary_loss_clip": 0.06421088, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06274953, + "balance_loss_mlp": 0.01252396, + "epoch": 0.5914324364948144, + "flos": 15702056017920.0, + "grad_norm": 1.9466597288818261, + "language_loss": 0.82267582, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.89952636, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11572266, + "step": 9837, + "time_per_iteration": 2.5073657035827637 + }, + { + "auxiliary_loss_clip": 0.06423111, + "auxiliary_loss_mlp": 0.01265723, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01253665, + "epoch": 0.5914925597474824, + "flos": 22753924110720.0, + "grad_norm": 1.6146002375859378, + "language_loss": 0.7983368, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.87522513, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1206665, + "step": 9838, + "time_per_iteration": 2.5024936199188232 + }, + { + "auxiliary_loss_clip": 0.06421801, + "auxiliary_loss_mlp": 0.01267887, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01256503, + "epoch": 0.5915526830001503, + "flos": 18298092015360.0, + "grad_norm": 1.7930328536333848, + "language_loss": 0.70194936, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.77884626, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11383057, + "step": 9839, + "time_per_iteration": 2.5000133514404297 + }, + { + "auxiliary_loss_clip": 0.06421608, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5916128062528183, + "flos": 17024713516800.0, + "grad_norm": 2.2460586823912254, + "language_loss": 0.65840614, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.73527294, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10614014, + "step": 9840, + "time_per_iteration": 2.472325325012207 + }, + { + "auxiliary_loss_clip": 0.06421183, + "auxiliary_loss_mlp": 0.01269035, + "balance_loss_clip": 0.06273993, + "balance_loss_mlp": 0.01257019, + "epoch": 0.5916729295054862, + "flos": 24761194594560.0, + "grad_norm": 7.488465580129743, + "language_loss": 0.82013118, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.89703333, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12023926, + "step": 9841, + "time_per_iteration": 2.539569139480591 + }, + { + "auxiliary_loss_clip": 0.06417108, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01255782, + "epoch": 0.5917330527581542, + "flos": 15963196867200.0, + "grad_norm": 1.7355438933283587, + "language_loss": 0.69817364, + "learning_rate": 1.507956080444291e-06, + "loss": 0.77500588, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10333252, + "step": 9842, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06423896, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01256332, + "epoch": 0.5917931760108222, + "flos": 23806719936000.0, + "grad_norm": 2.0642371985300105, + "language_loss": 0.83243513, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.90935493, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11755371, + "step": 9843, + "time_per_iteration": 2.5579354763031006 + }, + { + "auxiliary_loss_clip": 0.06423706, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_clip": 0.06277691, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5918532992634902, + "flos": 23254864945920.0, + "grad_norm": 2.21208381325965, + "language_loss": 0.81869078, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.89556968, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11773682, + "step": 9844, + "time_per_iteration": 2.4732062816619873 + }, + { + "auxiliary_loss_clip": 0.06423113, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01253496, + "epoch": 0.5919134225161581, + "flos": 19505867166720.0, + "grad_norm": 2.0396261684123966, + "language_loss": 0.74979722, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.8266741, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11077881, + "step": 9845, + "time_per_iteration": 2.5498902797698975 + }, + { + "auxiliary_loss_clip": 0.0642004, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01255215, + "epoch": 0.5919735457688261, + "flos": 38810201264640.0, + "grad_norm": 1.7793580681254029, + "language_loss": 0.64624578, + "learning_rate": 1.506446264718213e-06, + "loss": 0.72311807, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11962891, + "step": 9846, + "time_per_iteration": 2.6562187671661377 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01268591, + "balance_loss_clip": 0.06275991, + "balance_loss_mlp": 0.01258851, + "epoch": 0.592033669021494, + "flos": 22170567185280.0, + "grad_norm": 1.5989871653678733, + "language_loss": 0.76435882, + "learning_rate": 1.506068857539931e-06, + "loss": 0.84116036, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09735107, + "step": 9847, + "time_per_iteration": 2.5877273082733154 + }, + { + "auxiliary_loss_clip": 0.06420001, + "auxiliary_loss_mlp": 0.01267428, + "balance_loss_clip": 0.06274936, + "balance_loss_mlp": 0.01255477, + "epoch": 0.592093792274162, + "flos": 22717600565760.0, + "grad_norm": 1.9085044692476394, + "language_loss": 0.62601185, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.70288616, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11956787, + "step": 9848, + "time_per_iteration": 3.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.06422321, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01252959, + "epoch": 0.59215391552683, + "flos": 22535605497600.0, + "grad_norm": 2.0066393042716855, + "language_loss": 0.76503384, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.84189683, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11022949, + "step": 9849, + "time_per_iteration": 2.5015931129455566 + }, + { + "auxiliary_loss_clip": 0.06421839, + "auxiliary_loss_mlp": 0.01268681, + "balance_loss_clip": 0.06277264, + "balance_loss_mlp": 0.01256671, + "epoch": 0.592214038779498, + "flos": 24505965457920.0, + "grad_norm": 1.745648722955103, + "language_loss": 0.75836027, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.8352654, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12005615, + "step": 9850, + "time_per_iteration": 2.600179672241211 + }, + { + "auxiliary_loss_clip": 0.06417172, + "auxiliary_loss_mlp": 0.01268411, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01257367, + "epoch": 0.592274162032166, + "flos": 21837156589440.0, + "grad_norm": 1.6508975523953922, + "language_loss": 0.75545883, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.83231473, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1105957, + "step": 9851, + "time_per_iteration": 2.4818735122680664 + }, + { + "auxiliary_loss_clip": 0.06419359, + "auxiliary_loss_mlp": 0.01266702, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01254918, + "epoch": 0.5923342852848339, + "flos": 24615061873920.0, + "grad_norm": 1.7463946887344501, + "language_loss": 0.70506394, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.78192449, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11779785, + "step": 9852, + "time_per_iteration": 2.587822675704956 + }, + { + "auxiliary_loss_clip": 0.06423963, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01261043, + "epoch": 0.5923944085375019, + "flos": 19944307255680.0, + "grad_norm": 1.582534152024796, + "language_loss": 0.80272847, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.87970185, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12347412, + "step": 9853, + "time_per_iteration": 2.4834022521972656 + }, + { + "auxiliary_loss_clip": 0.06412584, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01253985, + "epoch": 0.5924545317901698, + "flos": 28666177948800.0, + "grad_norm": 1.4145056961897013, + "language_loss": 0.67743915, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.75421, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1050415, + "step": 9854, + "time_per_iteration": 3.9716901779174805 + }, + { + "auxiliary_loss_clip": 0.06417395, + "auxiliary_loss_mlp": 0.01268291, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01257514, + "epoch": 0.5925146550428378, + "flos": 19870989333120.0, + "grad_norm": 1.7006302713228023, + "language_loss": 0.89085132, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.96770817, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10772705, + "step": 9855, + "time_per_iteration": 2.54018235206604 + }, + { + "auxiliary_loss_clip": 0.06414687, + "auxiliary_loss_mlp": 0.01266215, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.0125585, + "epoch": 0.5925747782955058, + "flos": 15128510019840.0, + "grad_norm": 1.7501100927117066, + "language_loss": 0.86997199, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.94678098, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10369873, + "step": 9856, + "time_per_iteration": 2.5016441345214844 + }, + { + "auxiliary_loss_clip": 0.06422357, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06275797, + "balance_loss_mlp": 0.01254177, + "epoch": 0.5926349015481738, + "flos": 18411297281280.0, + "grad_norm": 1.7487529922228526, + "language_loss": 0.77790916, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.85478473, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9857, + "time_per_iteration": 2.5232088565826416 + }, + { + "auxiliary_loss_clip": 0.06421745, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06278913, + "balance_loss_mlp": 0.01254689, + "epoch": 0.5926950248008417, + "flos": 23117620757760.0, + "grad_norm": 2.3581492349261524, + "language_loss": 0.65045798, + "learning_rate": 1.501918617901419e-06, + "loss": 0.72733665, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11431885, + "step": 9858, + "time_per_iteration": 4.080450773239136 + }, + { + "auxiliary_loss_clip": 0.06418257, + "auxiliary_loss_mlp": 0.01268065, + "balance_loss_clip": 0.06277932, + "balance_loss_mlp": 0.01256662, + "epoch": 0.5927551480535097, + "flos": 28040753473920.0, + "grad_norm": 1.620046821031832, + "language_loss": 0.77013564, + "learning_rate": 1.501541436426501e-06, + "loss": 0.84699887, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 9859, + "time_per_iteration": 2.5496175289154053 + }, + { + "auxiliary_loss_clip": 0.06422819, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.01260217, + "epoch": 0.5928152713061776, + "flos": 21805109602560.0, + "grad_norm": 2.0806402016169914, + "language_loss": 0.75381404, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.8307631, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11865234, + "step": 9860, + "time_per_iteration": 2.4913806915283203 + }, + { + "auxiliary_loss_clip": 0.06419, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06277152, + "balance_loss_mlp": 0.01257557, + "epoch": 0.5928753945588456, + "flos": 24323802681600.0, + "grad_norm": 1.5719426663731493, + "language_loss": 0.7657429, + "learning_rate": 1.500787130195763e-06, + "loss": 0.84261084, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10235596, + "step": 9861, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.06416907, + "auxiliary_loss_mlp": 0.01266144, + "balance_loss_clip": 0.0627644, + "balance_loss_mlp": 0.01255355, + "epoch": 0.5929355178115137, + "flos": 26471126465280.0, + "grad_norm": 1.7884263747312634, + "language_loss": 0.70557332, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.78240383, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10797119, + "step": 9862, + "time_per_iteration": 2.5269577503204346 + }, + { + "auxiliary_loss_clip": 0.06422247, + "auxiliary_loss_mlp": 0.01262904, + "balance_loss_clip": 0.06279124, + "balance_loss_mlp": 0.01252455, + "epoch": 0.5929956410641816, + "flos": 24971798632320.0, + "grad_norm": 1.7042567790148921, + "language_loss": 0.7816, + "learning_rate": 1.500032899685832e-06, + "loss": 0.85845149, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10449219, + "step": 9863, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06423997, + "auxiliary_loss_mlp": 0.01269473, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01258917, + "epoch": 0.5930557643168496, + "flos": 26214639517440.0, + "grad_norm": 1.987432864542063, + "language_loss": 0.71297693, + "learning_rate": 1.499655812861921e-06, + "loss": 0.78991163, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9864, + "time_per_iteration": 4.022796869277954 + }, + { + "auxiliary_loss_clip": 0.0642028, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06276219, + "balance_loss_mlp": 0.01256578, + "epoch": 0.5931158875695175, + "flos": 27862322204160.0, + "grad_norm": 2.045271412380321, + "language_loss": 0.67615211, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11816406, + "step": 9865, + "time_per_iteration": 2.542477607727051 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06278679, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5931760108221855, + "flos": 15419014525440.0, + "grad_norm": 2.0467341556470906, + "language_loss": 0.78422129, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.86111438, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11535645, + "step": 9866, + "time_per_iteration": 2.5601937770843506 + }, + { + "auxiliary_loss_clip": 0.06417245, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06280121, + "balance_loss_mlp": 0.01256114, + "epoch": 0.5932361340748534, + "flos": 30196043395200.0, + "grad_norm": 1.6991427361252174, + "language_loss": 0.72385359, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.80069637, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10919189, + "step": 9867, + "time_per_iteration": 2.582200527191162 + }, + { + "auxiliary_loss_clip": 0.06421208, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06280105, + "balance_loss_mlp": 0.0125589, + "epoch": 0.5932962573275214, + "flos": 20163841752960.0, + "grad_norm": 1.4126147288957658, + "language_loss": 0.6694321, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.74632645, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.12335205, + "step": 9868, + "time_per_iteration": 2.515268087387085 + }, + { + "auxiliary_loss_clip": 0.06420252, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06275701, + "balance_loss_mlp": 0.01255046, + "epoch": 0.5933563805801894, + "flos": 25452725541120.0, + "grad_norm": 1.59033500525529, + "language_loss": 0.75624323, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.83311105, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11474609, + "step": 9869, + "time_per_iteration": 2.5264642238616943 + }, + { + "auxiliary_loss_clip": 0.06425707, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01254779, + "epoch": 0.5934165038328574, + "flos": 60007971674880.0, + "grad_norm": 1.9233451977688907, + "language_loss": 0.74787021, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.82478619, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11114502, + "step": 9870, + "time_per_iteration": 2.8604302406311035 + }, + { + "auxiliary_loss_clip": 0.06422332, + "auxiliary_loss_mlp": 0.01265883, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01254719, + "epoch": 0.5934766270855253, + "flos": 24426568114560.0, + "grad_norm": 2.4352017906666226, + "language_loss": 0.72491121, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.80179334, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11169434, + "step": 9871, + "time_per_iteration": 2.504990577697754 + }, + { + "auxiliary_loss_clip": 0.06424776, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.0627915, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5935367503381933, + "flos": 23519821155840.0, + "grad_norm": 2.2688315988077736, + "language_loss": 0.74858117, + "learning_rate": 1.496639802503271e-06, + "loss": 0.82548994, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11254883, + "step": 9872, + "time_per_iteration": 2.5957329273223877 + }, + { + "auxiliary_loss_clip": 0.06431574, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06283869, + "balance_loss_mlp": 0.01255517, + "epoch": 0.5935968735908612, + "flos": 18953550979200.0, + "grad_norm": 11.679124704717912, + "language_loss": 0.79073173, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.86772209, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9873, + "time_per_iteration": 2.4669687747955322 + }, + { + "auxiliary_loss_clip": 0.064208, + "auxiliary_loss_mlp": 0.01267302, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5936569968435292, + "flos": 25490432678400.0, + "grad_norm": 1.6349451241448802, + "language_loss": 0.85223055, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.9291116, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11853027, + "step": 9874, + "time_per_iteration": 2.5542490482330322 + }, + { + "auxiliary_loss_clip": 0.06322969, + "auxiliary_loss_mlp": 0.01256968, + "balance_loss_clip": 0.0626381, + "balance_loss_mlp": 0.01255485, + "epoch": 0.5937171200961973, + "flos": 66397364259840.0, + "grad_norm": 0.7006393782995821, + "language_loss": 0.59778833, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.67358768, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01482391, + "step": 9875, + "time_per_iteration": 3.2118613719940186 + }, + { + "auxiliary_loss_clip": 0.06429566, + "auxiliary_loss_mlp": 0.01269748, + "balance_loss_clip": 0.06278439, + "balance_loss_mlp": 0.01257302, + "epoch": 0.5937772433488652, + "flos": 14908849741440.0, + "grad_norm": 2.56951836872527, + "language_loss": 0.78072035, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.85771352, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12451172, + "step": 9876, + "time_per_iteration": 2.488849401473999 + }, + { + "auxiliary_loss_clip": 0.06411201, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01253764, + "epoch": 0.5938373666015332, + "flos": 22567484776320.0, + "grad_norm": 1.5512644369371444, + "language_loss": 0.7603606, + "learning_rate": 1.494755415907243e-06, + "loss": 0.83711803, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10772705, + "step": 9877, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06419433, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5938974898542011, + "flos": 18446572650240.0, + "grad_norm": 2.5934425226299243, + "language_loss": 0.81566256, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.8925426, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11779785, + "step": 9878, + "time_per_iteration": 2.498063802719116 + }, + { + "auxiliary_loss_clip": 0.0642112, + "auxiliary_loss_mlp": 0.0126802, + "balance_loss_clip": 0.06274901, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5939576131068691, + "flos": 45597029293440.0, + "grad_norm": 1.6161422600744055, + "language_loss": 0.71359301, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.79048443, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11029053, + "step": 9879, + "time_per_iteration": 2.7588438987731934 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06272938, + "balance_loss_mlp": 0.01254166, + "epoch": 0.594017736359537, + "flos": 23594648451840.0, + "grad_norm": 1.558347600048505, + "language_loss": 0.57834136, + "learning_rate": 1.493625013742401e-06, + "loss": 0.65514064, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11431885, + "step": 9880, + "time_per_iteration": 2.5477280616760254 + }, + { + "auxiliary_loss_clip": 0.0641728, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258751, + "epoch": 0.594077859612205, + "flos": 29464373543040.0, + "grad_norm": 1.9254284711947285, + "language_loss": 0.78115642, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.85803521, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11846924, + "step": 9881, + "time_per_iteration": 2.596902847290039 + }, + { + "auxiliary_loss_clip": 0.06421138, + "auxiliary_loss_mlp": 0.0126373, + "balance_loss_clip": 0.06276222, + "balance_loss_mlp": 0.01252882, + "epoch": 0.594137982864873, + "flos": 16805682144000.0, + "grad_norm": 2.173471904433077, + "language_loss": 0.83138072, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.90822935, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10839844, + "step": 9882, + "time_per_iteration": 2.483264446258545 + }, + { + "auxiliary_loss_clip": 0.06420217, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06276472, + "balance_loss_mlp": 0.01260318, + "epoch": 0.594198106117541, + "flos": 12755194974720.0, + "grad_norm": 2.093124407330454, + "language_loss": 0.79720157, + "learning_rate": 1.492494784393667e-06, + "loss": 0.87411857, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11175537, + "step": 9883, + "time_per_iteration": 2.5007734298706055 + }, + { + "auxiliary_loss_clip": 0.06424005, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01258097, + "epoch": 0.5942582293702089, + "flos": 21002930939520.0, + "grad_norm": 1.7867915832733556, + "language_loss": 0.7479161, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.82485354, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11645508, + "step": 9884, + "time_per_iteration": 2.5044338703155518 + }, + { + "auxiliary_loss_clip": 0.06419083, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01253549, + "epoch": 0.5943183526228769, + "flos": 28298665941120.0, + "grad_norm": 2.661403390475952, + "language_loss": 0.6670655, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.7439115, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11975098, + "step": 9885, + "time_per_iteration": 2.592233180999756 + }, + { + "auxiliary_loss_clip": 0.06417437, + "auxiliary_loss_mlp": 0.01268066, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256884, + "epoch": 0.5943784758755448, + "flos": 26621829233280.0, + "grad_norm": 2.23147400779812, + "language_loss": 0.76914746, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.84600246, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11181641, + "step": 9886, + "time_per_iteration": 2.5211451053619385 + }, + { + "auxiliary_loss_clip": 0.06318811, + "auxiliary_loss_mlp": 0.01252302, + "balance_loss_clip": 0.06259875, + "balance_loss_mlp": 0.01250785, + "epoch": 0.5944385991282128, + "flos": 64209859643520.0, + "grad_norm": 0.8085761446732002, + "language_loss": 0.64425516, + "learning_rate": 1.490988081420423e-06, + "loss": 0.71996629, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01515961, + "step": 9887, + "time_per_iteration": 4.4216148853302 + }, + { + "auxiliary_loss_clip": 0.06419201, + "auxiliary_loss_mlp": 0.01265936, + "balance_loss_clip": 0.06275857, + "balance_loss_mlp": 0.01254307, + "epoch": 0.5944987223808808, + "flos": 19577885351040.0, + "grad_norm": 1.7443994329425772, + "language_loss": 0.691764, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.76861531, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11633301, + "step": 9888, + "time_per_iteration": 2.558119058609009 + }, + { + "auxiliary_loss_clip": 0.06419526, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06276903, + "balance_loss_mlp": 0.01258773, + "epoch": 0.5945588456335488, + "flos": 26184856590720.0, + "grad_norm": 1.5028057851776446, + "language_loss": 0.7952224, + "learning_rate": 1.490234845687366e-06, + "loss": 0.87211674, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9889, + "time_per_iteration": 2.556455612182617 + }, + { + "auxiliary_loss_clip": 0.06416804, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06273508, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5946189688862168, + "flos": 20452333760640.0, + "grad_norm": 1.5171149074997012, + "language_loss": 0.70987219, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.7867161, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1050415, + "step": 9890, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.06420811, + "auxiliary_loss_mlp": 0.01269509, + "balance_loss_clip": 0.06275058, + "balance_loss_mlp": 0.01258041, + "epoch": 0.5946790921388847, + "flos": 13441568895360.0, + "grad_norm": 1.9815921383050485, + "language_loss": 0.697523, + "learning_rate": 1.489481687275691e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11468506, + "step": 9891, + "time_per_iteration": 2.474308729171753 + }, + { + "auxiliary_loss_clip": 0.06419806, + "auxiliary_loss_mlp": 0.01266103, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01255839, + "epoch": 0.5947392153915527, + "flos": 20418483911040.0, + "grad_norm": 1.7485359350265648, + "language_loss": 0.53498697, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.61184609, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10266113, + "step": 9892, + "time_per_iteration": 2.534221649169922 + }, + { + "auxiliary_loss_clip": 0.06313733, + "auxiliary_loss_mlp": 0.01253007, + "balance_loss_clip": 0.06254771, + "balance_loss_mlp": 0.01251455, + "epoch": 0.5947993386442206, + "flos": 65639181790080.0, + "grad_norm": 0.6531062006914405, + "language_loss": 0.54571462, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.621382, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01551056, + "step": 9893, + "time_per_iteration": 3.1853702068328857 + }, + { + "auxiliary_loss_clip": 0.064126, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01254841, + "epoch": 0.5948594618968887, + "flos": 23189429306880.0, + "grad_norm": 1.6806512476713673, + "language_loss": 0.75017619, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.82695538, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10473633, + "step": 9894, + "time_per_iteration": 4.046506643295288 + }, + { + "auxiliary_loss_clip": 0.06415449, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06273435, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5949195851495566, + "flos": 13631991298560.0, + "grad_norm": 1.844376504699444, + "language_loss": 0.77997828, + "learning_rate": 1.487975602873434e-06, + "loss": 0.8567856, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11444092, + "step": 9895, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.06421571, + "auxiliary_loss_mlp": 0.01264682, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252862, + "epoch": 0.5949797084022246, + "flos": 19756358547840.0, + "grad_norm": 2.034072439962686, + "language_loss": 0.79318964, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.8700521, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11816406, + "step": 9896, + "time_per_iteration": 2.496610164642334 + }, + { + "auxiliary_loss_clip": 0.06420637, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5950398316548925, + "flos": 25780685621760.0, + "grad_norm": 1.4418973411464253, + "language_loss": 0.8331461, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.91000593, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11645508, + "step": 9897, + "time_per_iteration": 2.6055963039398193 + }, + { + "auxiliary_loss_clip": 0.06422365, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06278124, + "balance_loss_mlp": 0.01254012, + "epoch": 0.5950999549075605, + "flos": 23045644500480.0, + "grad_norm": 2.157917564883112, + "language_loss": 0.71089602, + "learning_rate": 1.486846243389939e-06, + "loss": 0.78776848, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10882568, + "step": 9898, + "time_per_iteration": 3.95219087600708 + }, + { + "auxiliary_loss_clip": 0.06426959, + "auxiliary_loss_mlp": 0.01267336, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254897, + "epoch": 0.5951600781602284, + "flos": 32453553697920.0, + "grad_norm": 2.106705884146929, + "language_loss": 0.63699448, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.71393746, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12451172, + "step": 9899, + "time_per_iteration": 2.597721576690674 + }, + { + "auxiliary_loss_clip": 0.06419618, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01259999, + "epoch": 0.5952202014128964, + "flos": 23806887644160.0, + "grad_norm": 1.5164228353921223, + "language_loss": 0.72182071, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.79872268, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10571289, + "step": 9900, + "time_per_iteration": 2.5579535961151123 + }, + { + "auxiliary_loss_clip": 0.06414567, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01255484, + "epoch": 0.5952803246655644, + "flos": 22498778828160.0, + "grad_norm": 1.774545476213964, + "language_loss": 0.84691358, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.9237293, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11523438, + "step": 9901, + "time_per_iteration": 2.532650947570801 + }, + { + "auxiliary_loss_clip": 0.06311554, + "auxiliary_loss_mlp": 0.01252152, + "balance_loss_clip": 0.06252782, + "balance_loss_mlp": 0.01250599, + "epoch": 0.5953404479182324, + "flos": 51250810884480.0, + "grad_norm": 0.7741789718205083, + "language_loss": 0.58204901, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.65768605, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01550293, + "step": 9902, + "time_per_iteration": 2.995508909225464 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01268252, + "balance_loss_clip": 0.06274737, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5954005711709004, + "flos": 23119423620480.0, + "grad_norm": 1.8631652775155525, + "language_loss": 0.77643347, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.85333747, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11608887, + "step": 9903, + "time_per_iteration": 2.526265859603882 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01253691, + "epoch": 0.5954606944235683, + "flos": 35963464250880.0, + "grad_norm": 1.7611381352056217, + "language_loss": 0.78137469, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.85821557, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1138916, + "step": 9904, + "time_per_iteration": 4.04362940788269 + }, + { + "auxiliary_loss_clip": 0.0642558, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06275237, + "balance_loss_mlp": 0.01252619, + "epoch": 0.5955208176762363, + "flos": 30451188677760.0, + "grad_norm": 1.2800711014437993, + "language_loss": 0.72963494, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.80653274, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11566162, + "step": 9905, + "time_per_iteration": 2.630237340927124 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01267213, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255942, + "epoch": 0.5955809409289042, + "flos": 17645987214720.0, + "grad_norm": 2.1926975812717524, + "language_loss": 0.70104027, + "learning_rate": 1.483835475336295e-06, + "loss": 0.77788991, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11273193, + "step": 9906, + "time_per_iteration": 2.5136594772338867 + }, + { + "auxiliary_loss_clip": 0.06423035, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.06276789, + "balance_loss_mlp": 0.01254316, + "epoch": 0.5956410641815723, + "flos": 24286766376960.0, + "grad_norm": 1.7055783949352592, + "language_loss": 0.74976909, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.82666361, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12103271, + "step": 9907, + "time_per_iteration": 2.5186941623687744 + }, + { + "auxiliary_loss_clip": 0.06419441, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.5957011874342402, + "flos": 35742713869440.0, + "grad_norm": 1.9121613205115942, + "language_loss": 0.67437243, + "learning_rate": 1.483082978767595e-06, + "loss": 0.75124806, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11505127, + "step": 9908, + "time_per_iteration": 2.641977310180664 + }, + { + "auxiliary_loss_clip": 0.06417987, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.0627388, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5957613106869082, + "flos": 21250277792640.0, + "grad_norm": 1.9262426125407, + "language_loss": 0.7637223, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.84056735, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1105957, + "step": 9909, + "time_per_iteration": 2.4708259105682373 + }, + { + "auxiliary_loss_clip": 0.06309633, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06250934, + "balance_loss_mlp": 0.01251702, + "epoch": 0.5958214339395761, + "flos": 65959972346880.0, + "grad_norm": 0.8925366465224025, + "language_loss": 0.73392916, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.80955869, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01618958, + "step": 9910, + "time_per_iteration": 3.2132058143615723 + }, + { + "auxiliary_loss_clip": 0.06420797, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01253838, + "epoch": 0.5958815571922441, + "flos": 23224872384000.0, + "grad_norm": 1.906132958424511, + "language_loss": 0.69966662, + "learning_rate": 1.481954380961799e-06, + "loss": 0.77653486, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12194824, + "step": 9911, + "time_per_iteration": 2.5891547203063965 + }, + { + "auxiliary_loss_clip": 0.06430559, + "auxiliary_loss_mlp": 0.01269185, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01256471, + "epoch": 0.595941680444912, + "flos": 16543157702400.0, + "grad_norm": 1.8117496085568294, + "language_loss": 0.65995622, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.73695368, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12713623, + "step": 9912, + "time_per_iteration": 2.5106897354125977 + }, + { + "auxiliary_loss_clip": 0.06418723, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01257681, + "epoch": 0.59600180369758, + "flos": 27826334075520.0, + "grad_norm": 1.8937269812557305, + "language_loss": 0.73603946, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.81292516, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12145996, + "step": 9913, + "time_per_iteration": 2.5845842361450195 + }, + { + "auxiliary_loss_clip": 0.06422256, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254316, + "epoch": 0.596061926950248, + "flos": 29498349173760.0, + "grad_norm": 2.1687664822630692, + "language_loss": 0.79983938, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.87672126, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1161499, + "step": 9914, + "time_per_iteration": 2.677943229675293 + }, + { + "auxiliary_loss_clip": 0.06418366, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01256233, + "epoch": 0.596122050202916, + "flos": 16842424959360.0, + "grad_norm": 1.662988077903936, + "language_loss": 0.67750293, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.75436401, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1149292, + "step": 9915, + "time_per_iteration": 2.527804374694824 + }, + { + "auxiliary_loss_clip": 0.06422138, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274668, + "balance_loss_mlp": 0.01254888, + "epoch": 0.596182173455584, + "flos": 21003056720640.0, + "grad_norm": 1.4119869222981658, + "language_loss": 0.7862711, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.86315531, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11395264, + "step": 9916, + "time_per_iteration": 2.5146098136901855 + }, + { + "auxiliary_loss_clip": 0.06422624, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06273377, + "balance_loss_mlp": 0.01254808, + "epoch": 0.5962422967082519, + "flos": 16070364639360.0, + "grad_norm": 1.8279133386942186, + "language_loss": 0.83302379, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.90991473, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11657715, + "step": 9917, + "time_per_iteration": 2.5148332118988037 + }, + { + "auxiliary_loss_clip": 0.06418853, + "auxiliary_loss_mlp": 0.0126709, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5963024199609199, + "flos": 12171879976320.0, + "grad_norm": 1.6879177929284592, + "language_loss": 0.77521312, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.85207248, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10852051, + "step": 9918, + "time_per_iteration": 2.4897613525390625 + }, + { + "auxiliary_loss_clip": 0.06419399, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274245, + "balance_loss_mlp": 0.01256661, + "epoch": 0.5963625432135878, + "flos": 28081772847360.0, + "grad_norm": 1.5296515450402863, + "language_loss": 0.7930398, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10821533, + "step": 9919, + "time_per_iteration": 2.6023364067077637 + }, + { + "auxiliary_loss_clip": 0.06424099, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06277885, + "balance_loss_mlp": 0.01253434, + "epoch": 0.5964226664662559, + "flos": 19865664599040.0, + "grad_norm": 2.0582572283345537, + "language_loss": 0.77598941, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.85288125, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11645508, + "step": 9920, + "time_per_iteration": 2.499610424041748 + }, + { + "auxiliary_loss_clip": 0.06428593, + "auxiliary_loss_mlp": 0.01269926, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01258124, + "epoch": 0.5964827897189238, + "flos": 12937567386240.0, + "grad_norm": 2.9535163377991647, + "language_loss": 0.8317768, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.90876198, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11798096, + "step": 9921, + "time_per_iteration": 2.5134449005126953 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01268083, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01256389, + "epoch": 0.5965429129715918, + "flos": 18156738977280.0, + "grad_norm": 1.8928045831706461, + "language_loss": 0.80601788, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.88286257, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11694336, + "step": 9922, + "time_per_iteration": 2.4813597202301025 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01269772, + "balance_loss_clip": 0.06271716, + "balance_loss_mlp": 0.01258828, + "epoch": 0.5966030362242597, + "flos": 21769834233600.0, + "grad_norm": 3.055273537118157, + "language_loss": 0.7726593, + "learning_rate": 1.477441761580111e-06, + "loss": 0.84952813, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10949707, + "step": 9923, + "time_per_iteration": 2.5638489723205566 + }, + { + "auxiliary_loss_clip": 0.06424043, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06273048, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5966631594769277, + "flos": 18813204190080.0, + "grad_norm": 1.8922524994378742, + "language_loss": 0.76095831, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.83788568, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.13720703, + "step": 9924, + "time_per_iteration": 2.4999732971191406 + }, + { + "auxiliary_loss_clip": 0.06413831, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01256633, + "epoch": 0.5967232827295956, + "flos": 14069383211520.0, + "grad_norm": 1.7112851014893713, + "language_loss": 0.66830564, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.74512935, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11920166, + "step": 9925, + "time_per_iteration": 2.5139551162719727 + }, + { + "auxiliary_loss_clip": 0.06421202, + "auxiliary_loss_mlp": 0.0126999, + "balance_loss_clip": 0.06279947, + "balance_loss_mlp": 0.01258409, + "epoch": 0.5967834059822636, + "flos": 17243954524800.0, + "grad_norm": 1.861204364539265, + "language_loss": 0.72200316, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.79891503, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11584473, + "step": 9926, + "time_per_iteration": 3.9693188667297363 + }, + { + "auxiliary_loss_clip": 0.06422362, + "auxiliary_loss_mlp": 0.01270656, + "balance_loss_clip": 0.06274919, + "balance_loss_mlp": 0.01258556, + "epoch": 0.5968435292349316, + "flos": 42529751533440.0, + "grad_norm": 1.9299553445847866, + "language_loss": 0.70147216, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.77840233, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12103271, + "step": 9927, + "time_per_iteration": 2.7299752235412598 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06277983, + "balance_loss_mlp": 0.0125467, + "epoch": 0.5969036524875996, + "flos": 37639546272000.0, + "grad_norm": 1.5668113041571725, + "language_loss": 0.63611758, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.71307898, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12213135, + "step": 9928, + "time_per_iteration": 2.7166144847869873 + }, + { + "auxiliary_loss_clip": 0.06418041, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06274209, + "balance_loss_mlp": 0.01254454, + "epoch": 0.5969637757402676, + "flos": 23154992478720.0, + "grad_norm": 2.1979213221977596, + "language_loss": 0.69668317, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.77351892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1105957, + "step": 9929, + "time_per_iteration": 2.51379656791687 + }, + { + "auxiliary_loss_clip": 0.0641327, + "auxiliary_loss_mlp": 0.01270831, + "balance_loss_clip": 0.06274718, + "balance_loss_mlp": 0.01259697, + "epoch": 0.5970238989929355, + "flos": 24027176828160.0, + "grad_norm": 1.690473988948275, + "language_loss": 0.7685796, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.8454206, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11138916, + "step": 9930, + "time_per_iteration": 2.590068817138672 + }, + { + "auxiliary_loss_clip": 0.06427103, + "auxiliary_loss_mlp": 0.01271306, + "balance_loss_clip": 0.06277532, + "balance_loss_mlp": 0.01259206, + "epoch": 0.5970840222456035, + "flos": 19432884660480.0, + "grad_norm": 1.4319660868037594, + "language_loss": 0.69073558, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.76771963, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12097168, + "step": 9931, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.06314774, + "auxiliary_loss_mlp": 0.01252398, + "balance_loss_clip": 0.06255934, + "balance_loss_mlp": 0.01250752, + "epoch": 0.5971441454982714, + "flos": 62993615230080.0, + "grad_norm": 0.8560146868595252, + "language_loss": 0.64260876, + "learning_rate": 1.474059168257065e-06, + "loss": 0.71828043, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01649475, + "step": 9932, + "time_per_iteration": 3.0806198120117188 + }, + { + "auxiliary_loss_clip": 0.06415366, + "auxiliary_loss_mlp": 0.01270842, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01259976, + "epoch": 0.5972042687509395, + "flos": 20272393117440.0, + "grad_norm": 1.7768464871728415, + "language_loss": 0.74403048, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.82089257, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10864258, + "step": 9933, + "time_per_iteration": 3.9164891242980957 + }, + { + "auxiliary_loss_clip": 0.06316046, + "auxiliary_loss_mlp": 0.01258623, + "balance_loss_clip": 0.06257492, + "balance_loss_mlp": 0.01256835, + "epoch": 0.5972643920036074, + "flos": 71675625778560.0, + "grad_norm": 0.666650666050939, + "language_loss": 0.51957405, + "learning_rate": 1.473307699867203e-06, + "loss": 0.59532076, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01785278, + "step": 9934, + "time_per_iteration": 3.263599157333374 + }, + { + "auxiliary_loss_clip": 0.06320157, + "auxiliary_loss_mlp": 0.01253316, + "balance_loss_clip": 0.06261201, + "balance_loss_mlp": 0.01251523, + "epoch": 0.5973245152562754, + "flos": 56910225427200.0, + "grad_norm": 0.8129555240105609, + "language_loss": 0.54121673, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.61695147, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.0178833, + "step": 9935, + "time_per_iteration": 3.13610577583313 + }, + { + "auxiliary_loss_clip": 0.0641949, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06273362, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5973846385089433, + "flos": 24170206947840.0, + "grad_norm": 1.6283043946182527, + "language_loss": 0.65934885, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.7362048, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11883545, + "step": 9936, + "time_per_iteration": 2.5317225456237793 + }, + { + "auxiliary_loss_clip": 0.06426519, + "auxiliary_loss_mlp": 0.01266905, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01256027, + "epoch": 0.5974447617616113, + "flos": 17675476652160.0, + "grad_norm": 1.977673103112211, + "language_loss": 0.67786443, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10882568, + "step": 9937, + "time_per_iteration": 2.51056170463562 + }, + { + "auxiliary_loss_clip": 0.0642201, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06274251, + "balance_loss_mlp": 0.01260073, + "epoch": 0.5975048850142792, + "flos": 22899008655360.0, + "grad_norm": 2.0510739773646853, + "language_loss": 0.77639204, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.85333794, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.12512207, + "step": 9938, + "time_per_iteration": 3.988826274871826 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266797, + "balance_loss_clip": 0.06278642, + "balance_loss_mlp": 0.01255145, + "epoch": 0.5975650082669473, + "flos": 24360042372480.0, + "grad_norm": 1.4729050693859964, + "language_loss": 0.76065636, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.83757758, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11645508, + "step": 9939, + "time_per_iteration": 2.556417226791382 + }, + { + "auxiliary_loss_clip": 0.06427339, + "auxiliary_loss_mlp": 0.01268522, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5976251315196152, + "flos": 20929696871040.0, + "grad_norm": 2.2639919876209498, + "language_loss": 0.68839771, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7653563, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13323975, + "step": 9940, + "time_per_iteration": 2.5342793464660645 + }, + { + "auxiliary_loss_clip": 0.06417148, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01259602, + "epoch": 0.5976852547722832, + "flos": 35853193877760.0, + "grad_norm": 1.2345186889810322, + "language_loss": 0.69966424, + "learning_rate": 1.470678190375664e-06, + "loss": 0.77654147, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 9941, + "time_per_iteration": 2.6775453090667725 + }, + { + "auxiliary_loss_clip": 0.06416304, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06272396, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5977453780249512, + "flos": 12860266394880.0, + "grad_norm": 1.7893879951427467, + "language_loss": 0.77519101, + "learning_rate": 1.470302626336386e-06, + "loss": 0.85200953, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11224365, + "step": 9942, + "time_per_iteration": 2.5630502700805664 + }, + { + "auxiliary_loss_clip": 0.06422595, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.0125478, + "epoch": 0.5978055012776191, + "flos": 20965391510400.0, + "grad_norm": 1.999196380936964, + "language_loss": 0.76118851, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.83808935, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12713623, + "step": 9943, + "time_per_iteration": 3.9001221656799316 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01266022, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01255728, + "epoch": 0.5978656245302871, + "flos": 34066506067200.0, + "grad_norm": 1.9908445339246823, + "language_loss": 0.62211335, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.69895315, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10296631, + "step": 9944, + "time_per_iteration": 2.6546871662139893 + }, + { + "auxiliary_loss_clip": 0.06420632, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06276795, + "balance_loss_mlp": 0.01255333, + "epoch": 0.597925747782955, + "flos": 37381508023680.0, + "grad_norm": 1.6358533401507223, + "language_loss": 0.72854936, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11450195, + "step": 9945, + "time_per_iteration": 2.631753444671631 + }, + { + "auxiliary_loss_clip": 0.06419382, + "auxiliary_loss_mlp": 0.01270411, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01258997, + "epoch": 0.5979858710356231, + "flos": 25381923240960.0, + "grad_norm": 1.7624660559370904, + "language_loss": 0.67425656, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.75115454, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9946, + "time_per_iteration": 2.5964295864105225 + }, + { + "auxiliary_loss_clip": 0.06427635, + "auxiliary_loss_mlp": 0.01269885, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01257678, + "epoch": 0.598045994288291, + "flos": 13703422504320.0, + "grad_norm": 1.825350503307894, + "language_loss": 0.88689518, + "learning_rate": 1.468425107717461e-06, + "loss": 0.96387035, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12194824, + "step": 9947, + "time_per_iteration": 2.47194766998291 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01263948, + "balance_loss_clip": 0.06274778, + "balance_loss_mlp": 0.01253409, + "epoch": 0.598106117540959, + "flos": 21987859357440.0, + "grad_norm": 1.5868690486029033, + "language_loss": 0.71892309, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.79568821, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10540771, + "step": 9948, + "time_per_iteration": 2.519465446472168 + }, + { + "auxiliary_loss_clip": 0.06424625, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06277405, + "balance_loss_mlp": 0.01255015, + "epoch": 0.5981662407936269, + "flos": 20565790588800.0, + "grad_norm": 1.9625714193598658, + "language_loss": 0.89521587, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.97213024, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9949, + "time_per_iteration": 2.512617588043213 + }, + { + "auxiliary_loss_clip": 0.0641937, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06276002, + "balance_loss_mlp": 0.0125524, + "epoch": 0.5982263640462949, + "flos": 14069005868160.0, + "grad_norm": 2.2044341220338484, + "language_loss": 0.70866632, + "learning_rate": 1.467298838320673e-06, + "loss": 0.78552365, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11126709, + "step": 9950, + "time_per_iteration": 2.4983901977539062 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276861, + "balance_loss_mlp": 0.01254103, + "epoch": 0.5982864872989628, + "flos": 17712135613440.0, + "grad_norm": 1.7147951868971159, + "language_loss": 0.7865026, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.86338896, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 9951, + "time_per_iteration": 2.5179500579833984 + }, + { + "auxiliary_loss_clip": 0.06422336, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256215, + "epoch": 0.5983466105516309, + "flos": 16770574483200.0, + "grad_norm": 2.724642744329358, + "language_loss": 0.73936313, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.81627548, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12689209, + "step": 9952, + "time_per_iteration": 2.5671274662017822 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01266742, + "balance_loss_clip": 0.06275067, + "balance_loss_mlp": 0.01254243, + "epoch": 0.5984067338042988, + "flos": 20048078937600.0, + "grad_norm": 1.9086154248374307, + "language_loss": 0.79033399, + "learning_rate": 1.466172750724613e-06, + "loss": 0.86721003, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.12512207, + "step": 9953, + "time_per_iteration": 2.5575039386749268 + }, + { + "auxiliary_loss_clip": 0.06419245, + "auxiliary_loss_mlp": 0.01268437, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5984668570569668, + "flos": 26326586972160.0, + "grad_norm": 1.3586799739820394, + "language_loss": 0.69871485, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.77559167, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1116333, + "step": 9954, + "time_per_iteration": 2.5664639472961426 + }, + { + "auxiliary_loss_clip": 0.06421678, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01253953, + "epoch": 0.5985269803096348, + "flos": 20599808146560.0, + "grad_norm": 3.504460387705041, + "language_loss": 0.73099947, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.80786395, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10803223, + "step": 9955, + "time_per_iteration": 2.5450916290283203 + }, + { + "auxiliary_loss_clip": 0.06417805, + "auxiliary_loss_mlp": 0.01264034, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252632, + "epoch": 0.5985871035623027, + "flos": 26871859416960.0, + "grad_norm": 1.7558609344018261, + "language_loss": 0.68993962, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.76675797, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9956, + "time_per_iteration": 2.596081256866455 + }, + { + "auxiliary_loss_clip": 0.06423829, + "auxiliary_loss_mlp": 0.01264045, + "balance_loss_clip": 0.06278121, + "balance_loss_mlp": 0.01253346, + "epoch": 0.5986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.031153762409854, + "language_loss": 0.74002242, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.81690115, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10699463, + "step": 9957, + "time_per_iteration": 2.5518100261688232 + }, + { + "auxiliary_loss_clip": 0.06412163, + "auxiliary_loss_mlp": 0.01266872, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01256071, + "epoch": 0.5987073500676386, + "flos": 21800371847040.0, + "grad_norm": 1.7255020808995434, + "language_loss": 0.84429491, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.92108524, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10803223, + "step": 9958, + "time_per_iteration": 2.5053975582122803 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267847, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256594, + "epoch": 0.5987674733203067, + "flos": 24320909715840.0, + "grad_norm": 1.676255529467866, + "language_loss": 0.66404957, + "learning_rate": 1.463921122471864e-06, + "loss": 0.74093723, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11248779, + "step": 9959, + "time_per_iteration": 2.577558994293213 + }, + { + "auxiliary_loss_clip": 0.06423216, + "auxiliary_loss_mlp": 0.01263705, + "balance_loss_clip": 0.06278974, + "balance_loss_mlp": 0.01253418, + "epoch": 0.5988275965729746, + "flos": 21325859775360.0, + "grad_norm": 1.5343309289681366, + "language_loss": 0.83860743, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.91547662, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10296631, + "step": 9960, + "time_per_iteration": 2.5171096324920654 + }, + { + "auxiliary_loss_clip": 0.06416292, + "auxiliary_loss_mlp": 0.01266192, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5988877198256426, + "flos": 25124891241600.0, + "grad_norm": 1.3977520489587403, + "language_loss": 0.79645187, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.87327671, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11260986, + "step": 9961, + "time_per_iteration": 2.5664830207824707 + }, + { + "auxiliary_loss_clip": 0.06418522, + "auxiliary_loss_mlp": 0.01263845, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01253337, + "epoch": 0.5989478430783105, + "flos": 26435767242240.0, + "grad_norm": 1.8145848373023497, + "language_loss": 0.67511421, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.75193793, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10516357, + "step": 9962, + "time_per_iteration": 2.5658552646636963 + }, + { + "auxiliary_loss_clip": 0.06419411, + "auxiliary_loss_mlp": 0.01269677, + "balance_loss_clip": 0.06275185, + "balance_loss_mlp": 0.01258698, + "epoch": 0.5990079663309785, + "flos": 25786010355840.0, + "grad_norm": 1.2715525883777674, + "language_loss": 0.74696618, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.82385707, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10980225, + "step": 9963, + "time_per_iteration": 2.5959842205047607 + }, + { + "auxiliary_loss_clip": 0.06414087, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273367, + "balance_loss_mlp": 0.01255494, + "epoch": 0.5990680895836464, + "flos": 36840889480320.0, + "grad_norm": 1.7000475586235915, + "language_loss": 0.68318057, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.75998235, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10601807, + "step": 9964, + "time_per_iteration": 2.652066230773926 + }, + { + "auxiliary_loss_clip": 0.06415234, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01256219, + "epoch": 0.5991282128363145, + "flos": 24140340167040.0, + "grad_norm": 1.9446201927807645, + "language_loss": 0.77307773, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.84989786, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10559082, + "step": 9965, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.0641766, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254076, + "epoch": 0.5991883360889824, + "flos": 10308310444800.0, + "grad_norm": 2.43508720605834, + "language_loss": 0.77253437, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.8493613, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10955811, + "step": 9966, + "time_per_iteration": 3.8983960151672363 + }, + { + "auxiliary_loss_clip": 0.06418956, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06277221, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5992484593416504, + "flos": 23957967755520.0, + "grad_norm": 1.382537362814459, + "language_loss": 0.73829538, + "learning_rate": 1.460920090376422e-06, + "loss": 0.81513047, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10095215, + "step": 9967, + "time_per_iteration": 2.55789852142334 + }, + { + "auxiliary_loss_clip": 0.06430869, + "auxiliary_loss_mlp": 0.01269853, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5993085825943184, + "flos": 11948320483200.0, + "grad_norm": 2.02451624384261, + "language_loss": 0.69043863, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.76744592, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11669922, + "step": 9968, + "time_per_iteration": 2.4782519340515137 + }, + { + "auxiliary_loss_clip": 0.06417669, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06270653, + "balance_loss_mlp": 0.01253926, + "epoch": 0.5993687058469863, + "flos": 19032990249600.0, + "grad_norm": 1.5128271497944086, + "language_loss": 0.79284239, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.86967438, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11608887, + "step": 9969, + "time_per_iteration": 2.5151612758636475 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5994288290996543, + "flos": 14288204949120.0, + "grad_norm": 1.5374697799261579, + "language_loss": 0.81015587, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.88697076, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11383057, + "step": 9970, + "time_per_iteration": 2.5037295818328857 + }, + { + "auxiliary_loss_clip": 0.06425726, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06278643, + "balance_loss_mlp": 0.01253136, + "epoch": 0.5994889523523222, + "flos": 19212385841280.0, + "grad_norm": 1.7784771847806544, + "language_loss": 0.6253432, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.70225984, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.12805176, + "step": 9971, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01252121, + "epoch": 0.5995490756049903, + "flos": 28044401126400.0, + "grad_norm": 1.5809560666799003, + "language_loss": 0.79321986, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.86999381, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 9972, + "time_per_iteration": 2.5908236503601074 + }, + { + "auxiliary_loss_clip": 0.06426332, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06275315, + "balance_loss_mlp": 0.01256595, + "epoch": 0.5996091988576582, + "flos": 29059531741440.0, + "grad_norm": 2.0347749890566957, + "language_loss": 0.76122165, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.83816767, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11663818, + "step": 9973, + "time_per_iteration": 4.03744912147522 + }, + { + "auxiliary_loss_clip": 0.06415765, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5996693221103262, + "flos": 20820306965760.0, + "grad_norm": 8.14230844682113, + "language_loss": 0.65456331, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.73141098, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10961914, + "step": 9974, + "time_per_iteration": 2.545727491378784 + }, + { + "auxiliary_loss_clip": 0.06421987, + "auxiliary_loss_mlp": 0.01267073, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0125607, + "epoch": 0.5997294453629941, + "flos": 23775679198080.0, + "grad_norm": 1.6348808694128185, + "language_loss": 0.74560261, + "learning_rate": 1.457920366566428e-06, + "loss": 0.8224932, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11010742, + "step": 9975, + "time_per_iteration": 2.515960931777954 + }, + { + "auxiliary_loss_clip": 0.06416074, + "auxiliary_loss_mlp": 0.01267839, + "balance_loss_clip": 0.06272042, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5997895686156621, + "flos": 20966397759360.0, + "grad_norm": 1.627086760059136, + "language_loss": 0.77381539, + "learning_rate": 1.457545493441611e-06, + "loss": 0.85065448, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10864258, + "step": 9976, + "time_per_iteration": 2.5143842697143555 + }, + { + "auxiliary_loss_clip": 0.06419265, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06276691, + "balance_loss_mlp": 0.01255162, + "epoch": 0.59984969186833, + "flos": 28372864331520.0, + "grad_norm": 2.2336999868815837, + "language_loss": 0.75166976, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.82852209, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 9977, + "time_per_iteration": 2.5434179306030273 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01258358, + "epoch": 0.5999098151209981, + "flos": 22572641802240.0, + "grad_norm": 1.5140714638849335, + "language_loss": 0.69135988, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.76823664, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11499023, + "step": 9978, + "time_per_iteration": 3.9952354431152344 + }, + { + "auxiliary_loss_clip": 0.06421594, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06274537, + "balance_loss_mlp": 0.01254977, + "epoch": 0.599969938373666, + "flos": 18774365022720.0, + "grad_norm": 1.8838130799328623, + "language_loss": 0.81737733, + "learning_rate": 1.456420997543594e-06, + "loss": 0.89425546, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11236572, + "step": 9979, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06274675, + "balance_loss_mlp": 0.01257239, + "epoch": 0.600030061626334, + "flos": 11331910321920.0, + "grad_norm": 1.7106471218945785, + "language_loss": 0.70199746, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.77879798, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10314941, + "step": 9980, + "time_per_iteration": 2.4757728576660156 + }, + { + "auxiliary_loss_clip": 0.06423149, + "auxiliary_loss_mlp": 0.01269991, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01258463, + "epoch": 0.600090184879002, + "flos": 16583799732480.0, + "grad_norm": 2.417469697653489, + "language_loss": 0.690139, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.76707041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11523438, + "step": 9981, + "time_per_iteration": 2.4791438579559326 + }, + { + "auxiliary_loss_clip": 0.0641709, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.0627474, + "balance_loss_mlp": 0.01255342, + "epoch": 0.6001503081316699, + "flos": 23624641013760.0, + "grad_norm": 3.5503488009813275, + "language_loss": 0.78682542, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.86365318, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10345459, + "step": 9982, + "time_per_iteration": 2.517265796661377 + }, + { + "auxiliary_loss_clip": 0.06418465, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06276916, + "balance_loss_mlp": 0.01258852, + "epoch": 0.6002104313843379, + "flos": 20673922682880.0, + "grad_norm": 1.4834511581102687, + "language_loss": 0.72993171, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.80681169, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10681152, + "step": 9983, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06419442, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.0627455, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6002705546370058, + "flos": 22461742523520.0, + "grad_norm": 1.817313812044092, + "language_loss": 0.77973288, + "learning_rate": 1.454547250154447e-06, + "loss": 0.85658008, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10998535, + "step": 9984, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.06414619, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06271429, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6003306778896739, + "flos": 25199005777920.0, + "grad_norm": 1.5215747487142872, + "language_loss": 0.83512825, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.91195202, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11199951, + "step": 9985, + "time_per_iteration": 2.575650691986084 + }, + { + "auxiliary_loss_clip": 0.06417745, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06274939, + "balance_loss_mlp": 0.01260666, + "epoch": 0.6003908011423418, + "flos": 26694979447680.0, + "grad_norm": 1.7185413261664646, + "language_loss": 0.71617854, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.79306406, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10144043, + "step": 9986, + "time_per_iteration": 2.603126287460327 + }, + { + "auxiliary_loss_clip": 0.06418968, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01259451, + "epoch": 0.6004509243950098, + "flos": 22571677480320.0, + "grad_norm": 1.4916160282529034, + "language_loss": 0.72118956, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.79808438, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11077881, + "step": 9987, + "time_per_iteration": 2.5536653995513916 + }, + { + "auxiliary_loss_clip": 0.06410448, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01257741, + "epoch": 0.6005110476476777, + "flos": 19725443591040.0, + "grad_norm": 1.6002442710001008, + "language_loss": 0.85169375, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.92847788, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10223389, + "step": 9988, + "time_per_iteration": 2.676584482192993 + }, + { + "auxiliary_loss_clip": 0.06413879, + "auxiliary_loss_mlp": 0.01268869, + "balance_loss_clip": 0.06271169, + "balance_loss_mlp": 0.0125783, + "epoch": 0.6005711709003457, + "flos": 17718340815360.0, + "grad_norm": 1.8176771569563623, + "language_loss": 0.66009402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.73692149, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1104126, + "step": 9989, + "time_per_iteration": 2.486422300338745 + }, + { + "auxiliary_loss_clip": 0.06419196, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01256288, + "epoch": 0.6006312941530136, + "flos": 18520267916160.0, + "grad_norm": 1.406905965203465, + "language_loss": 0.80891693, + "learning_rate": 1.452299436003257e-06, + "loss": 0.88577515, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10327148, + "step": 9990, + "time_per_iteration": 2.535477876663208 + }, + { + "auxiliary_loss_clip": 0.06421524, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06275146, + "balance_loss_mlp": 0.01261829, + "epoch": 0.6006914174056817, + "flos": 21396117024000.0, + "grad_norm": 2.6934120952656557, + "language_loss": 0.82880741, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.9057526, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11157227, + "step": 9991, + "time_per_iteration": 2.518101215362549 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06272052, + "balance_loss_mlp": 0.01256012, + "epoch": 0.6007515406583496, + "flos": 12755488464000.0, + "grad_norm": 1.8815822669797526, + "language_loss": 0.83029675, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.90708888, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11096191, + "step": 9992, + "time_per_iteration": 2.521474599838257 + }, + { + "auxiliary_loss_clip": 0.06415074, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06272477, + "balance_loss_mlp": 0.01256098, + "epoch": 0.6008116639110176, + "flos": 19212679330560.0, + "grad_norm": 1.7865103371256597, + "language_loss": 0.66380614, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.74063051, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11260986, + "step": 9993, + "time_per_iteration": 2.4865942001342773 + }, + { + "auxiliary_loss_clip": 0.0641458, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.0125633, + "epoch": 0.6008717871636855, + "flos": 17060953207680.0, + "grad_norm": 2.3852752129116115, + "language_loss": 0.81380951, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.89062685, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1083374, + "step": 9994, + "time_per_iteration": 2.500990390777588 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009319104163535, + "flos": 20304188542080.0, + "grad_norm": 1.763050873993328, + "language_loss": 0.72585195, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.8026247, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10229492, + "step": 9995, + "time_per_iteration": 2.482269287109375 + }, + { + "auxiliary_loss_clip": 0.06416491, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06272282, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009920336690215, + "flos": 21843487572480.0, + "grad_norm": 1.6604568353476683, + "language_loss": 0.81016338, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.88700801, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10736084, + "step": 9996, + "time_per_iteration": 2.5466809272766113 + }, + { + "auxiliary_loss_clip": 0.06416655, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.06274925, + "balance_loss_mlp": 0.01259772, + "epoch": 0.6010521569216895, + "flos": 22601795823360.0, + "grad_norm": 1.669746646683285, + "language_loss": 0.79055232, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.86742181, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10522461, + "step": 9997, + "time_per_iteration": 2.489703416824341 + }, + { + "auxiliary_loss_clip": 0.06421417, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.0125496, + "epoch": 0.6011122801743575, + "flos": 19177697450880.0, + "grad_norm": 1.7167006806270684, + "language_loss": 0.72813851, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.80501544, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11315918, + "step": 9998, + "time_per_iteration": 2.5477771759033203 + }, + { + "auxiliary_loss_clip": 0.06413899, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6011724034270254, + "flos": 25017094563840.0, + "grad_norm": 1.4177411729498055, + "language_loss": 0.72547859, + "learning_rate": 1.448929117633027e-06, + "loss": 0.80226737, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9999, + "time_per_iteration": 2.658071517944336 + }, + { + "auxiliary_loss_clip": 0.06419925, + "auxiliary_loss_mlp": 0.0126529, + "balance_loss_clip": 0.06273222, + "balance_loss_mlp": 0.01253948, + "epoch": 0.6012325266796934, + "flos": 21803935645440.0, + "grad_norm": 1.3735035595460474, + "language_loss": 0.78419137, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.86104351, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11346436, + "step": 10000, + "time_per_iteration": 2.6216328144073486 + }, + { + "auxiliary_loss_clip": 0.06423375, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01253974, + "epoch": 0.6012926499323613, + "flos": 19579059308160.0, + "grad_norm": 2.6942443051056797, + "language_loss": 0.77449071, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.85138798, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1239624, + "step": 10001, + "time_per_iteration": 2.4916481971740723 + }, + { + "auxiliary_loss_clip": 0.06419365, + "auxiliary_loss_mlp": 0.01264494, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01253479, + "epoch": 0.6013527731850293, + "flos": 34869439416960.0, + "grad_norm": 2.005983259780714, + "language_loss": 0.59280682, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.66964543, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11004639, + "step": 10002, + "time_per_iteration": 2.6645169258117676 + }, + { + "auxiliary_loss_clip": 0.06426313, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.06280068, + "balance_loss_mlp": 0.01255636, + "epoch": 0.6014128964376972, + "flos": 23298190306560.0, + "grad_norm": 1.4832163301855164, + "language_loss": 0.78208435, + "learning_rate": 1.447431741055314e-06, + "loss": 0.85901594, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11199951, + "step": 10003, + "time_per_iteration": 2.5180611610412598 + }, + { + "auxiliary_loss_clip": 0.0641861, + "auxiliary_loss_mlp": 0.01265947, + "balance_loss_clip": 0.06273924, + "balance_loss_mlp": 0.01254503, + "epoch": 0.6014730196903653, + "flos": 24826839868800.0, + "grad_norm": 2.3891485516500857, + "language_loss": 0.77473211, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.8515777, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11437988, + "step": 10004, + "time_per_iteration": 2.6330173015594482 + }, + { + "auxiliary_loss_clip": 0.06419056, + "auxiliary_loss_mlp": 0.01264798, + "balance_loss_clip": 0.06274185, + "balance_loss_mlp": 0.01253622, + "epoch": 0.6015331429430332, + "flos": 23119046277120.0, + "grad_norm": 1.439097178617253, + "language_loss": 0.72748709, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.80432558, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11187744, + "step": 10005, + "time_per_iteration": 3.9784722328186035 + }, + { + "auxiliary_loss_clip": 0.06408843, + "auxiliary_loss_mlp": 0.01267392, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01257415, + "epoch": 0.6015932661957012, + "flos": 19206222566400.0, + "grad_norm": 2.0810783182593453, + "language_loss": 0.75111496, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.82787728, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10006, + "time_per_iteration": 2.479973793029785 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06270643, + "balance_loss_mlp": 0.01255659, + "epoch": 0.6016533894483691, + "flos": 18119451110400.0, + "grad_norm": 1.7404924752402045, + "language_loss": 0.74258769, + "learning_rate": 1.445934699732685e-06, + "loss": 0.8193953, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1114502, + "step": 10007, + "time_per_iteration": 2.514868974685669 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01265594, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01254161, + "epoch": 0.6017135127010371, + "flos": 16222492926720.0, + "grad_norm": 1.6904603378944318, + "language_loss": 0.70442504, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.78122854, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11425781, + "step": 10008, + "time_per_iteration": 2.491718053817749 + }, + { + "auxiliary_loss_clip": 0.0641681, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01254291, + "epoch": 0.6017736359537051, + "flos": 23451576405120.0, + "grad_norm": 1.626126690886893, + "language_loss": 0.7634151, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.84022784, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10186768, + "step": 10009, + "time_per_iteration": 2.599497079849243 + }, + { + "auxiliary_loss_clip": 0.06414296, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.01256455, + "epoch": 0.601833759206373, + "flos": 23520869331840.0, + "grad_norm": 2.016447610820272, + "language_loss": 0.73958981, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.8164103, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11279297, + "step": 10010, + "time_per_iteration": 2.542102098464966 + }, + { + "auxiliary_loss_clip": 0.06320257, + "auxiliary_loss_mlp": 0.01264863, + "balance_loss_clip": 0.06261265, + "balance_loss_mlp": 0.01263333, + "epoch": 0.6018938824590411, + "flos": 64013846215680.0, + "grad_norm": 0.9512553520354263, + "language_loss": 0.55134046, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.6271916, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.59130859, + "router_z_loss_mlp": 0.01529694, + "step": 10011, + "time_per_iteration": 3.219438076019287 + }, + { + "auxiliary_loss_clip": 0.064165, + "auxiliary_loss_mlp": 0.01266395, + "balance_loss_clip": 0.06272937, + "balance_loss_mlp": 0.01256233, + "epoch": 0.601954005711709, + "flos": 34648311692160.0, + "grad_norm": 1.3620910382501825, + "language_loss": 0.6241864, + "learning_rate": 1.44406387091556e-06, + "loss": 0.70101535, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1015625, + "step": 10012, + "time_per_iteration": 4.187492609024048 + }, + { + "auxiliary_loss_clip": 0.06412341, + "auxiliary_loss_mlp": 0.01261432, + "balance_loss_clip": 0.06271702, + "balance_loss_mlp": 0.0125155, + "epoch": 0.602014128964377, + "flos": 19433094295680.0, + "grad_norm": 1.6346863878236784, + "language_loss": 0.75188845, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.82862616, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09881592, + "step": 10013, + "time_per_iteration": 2.4897818565368652 + }, + { + "auxiliary_loss_clip": 0.06409096, + "auxiliary_loss_mlp": 0.01262449, + "balance_loss_clip": 0.06273073, + "balance_loss_mlp": 0.01252823, + "epoch": 0.6020742522170449, + "flos": 28336876202880.0, + "grad_norm": 1.4752372512859242, + "language_loss": 0.81565046, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.89236587, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09625244, + "step": 10014, + "time_per_iteration": 2.5903513431549072 + }, + { + "auxiliary_loss_clip": 0.06408108, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01252617, + "epoch": 0.6021343754697129, + "flos": 22753588694400.0, + "grad_norm": 1.6084117246958012, + "language_loss": 0.72432387, + "learning_rate": 1.442941626485624e-06, + "loss": 0.80103159, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10046387, + "step": 10015, + "time_per_iteration": 2.5320956707000732 + }, + { + "auxiliary_loss_clip": 0.06313504, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06254423, + "balance_loss_mlp": 0.01269587, + "epoch": 0.6021944987223808, + "flos": 65769885360000.0, + "grad_norm": 0.8212846281484271, + "language_loss": 0.54902303, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.62486923, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01528168, + "step": 10016, + "time_per_iteration": 3.0691990852355957 + }, + { + "auxiliary_loss_clip": 0.06413935, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06274504, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6022546219750489, + "flos": 16110377763840.0, + "grad_norm": 1.6476177539901398, + "language_loss": 0.82975459, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.90655655, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10498047, + "step": 10017, + "time_per_iteration": 4.000306606292725 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06276649, + "balance_loss_mlp": 0.01257465, + "epoch": 0.6023147452277168, + "flos": 25518328888320.0, + "grad_norm": 1.7212842530240955, + "language_loss": 0.83736604, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.91417325, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10198975, + "step": 10018, + "time_per_iteration": 2.5354957580566406 + }, + { + "auxiliary_loss_clip": 0.06423128, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6023748684803848, + "flos": 22642353999360.0, + "grad_norm": 1.5941982193166335, + "language_loss": 0.78464353, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.86153316, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11096191, + "step": 10019, + "time_per_iteration": 2.534315586090088 + }, + { + "auxiliary_loss_clip": 0.06414038, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01253433, + "epoch": 0.6024349917330527, + "flos": 26217113212800.0, + "grad_norm": 1.7295998133508477, + "language_loss": 0.7397396, + "learning_rate": 1.441071641765681e-06, + "loss": 0.81652176, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10020, + "time_per_iteration": 2.5745153427124023 + }, + { + "auxiliary_loss_clip": 0.06419009, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06276802, + "balance_loss_mlp": 0.01256875, + "epoch": 0.6024951149857207, + "flos": 21258160076160.0, + "grad_norm": 1.6276524527254101, + "language_loss": 0.64517641, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.72203767, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10247803, + "step": 10021, + "time_per_iteration": 2.5457210540771484 + }, + { + "auxiliary_loss_clip": 0.06415432, + "auxiliary_loss_mlp": 0.01267969, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01256776, + "epoch": 0.6025552382383887, + "flos": 26950795562880.0, + "grad_norm": 1.4058190289621155, + "language_loss": 0.80931878, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11187744, + "step": 10022, + "time_per_iteration": 4.0118248462677 + }, + { + "auxiliary_loss_clip": 0.06419462, + "auxiliary_loss_mlp": 0.0126571, + "balance_loss_clip": 0.06273965, + "balance_loss_mlp": 0.01255089, + "epoch": 0.6026153614910567, + "flos": 31692142846080.0, + "grad_norm": 1.4147504892998892, + "language_loss": 0.66787559, + "learning_rate": 1.439949905155693e-06, + "loss": 0.74472731, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10620117, + "step": 10023, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0642107, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06277968, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6026754847437247, + "flos": 29320085612160.0, + "grad_norm": 1.6857710992723132, + "language_loss": 0.73865843, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.81555492, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11029053, + "step": 10024, + "time_per_iteration": 2.5943942070007324 + }, + { + "auxiliary_loss_clip": 0.06414223, + "auxiliary_loss_mlp": 0.01264046, + "balance_loss_clip": 0.06273946, + "balance_loss_mlp": 0.01253454, + "epoch": 0.6027356079963926, + "flos": 23593558348800.0, + "grad_norm": 1.5719504936966129, + "language_loss": 0.72838885, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.80517155, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.105896, + "step": 10025, + "time_per_iteration": 2.5456719398498535 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.0126511, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01253469, + "epoch": 0.6027957312490606, + "flos": 20820055403520.0, + "grad_norm": 2.0657942826528526, + "language_loss": 0.67852134, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.75539172, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11651611, + "step": 10026, + "time_per_iteration": 2.598649024963379 + }, + { + "auxiliary_loss_clip": 0.06409953, + "auxiliary_loss_mlp": 0.0126467, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01254794, + "epoch": 0.6028558545017285, + "flos": 19941540071040.0, + "grad_norm": 1.6702920817519378, + "language_loss": 0.80409044, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.88083661, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 10027, + "time_per_iteration": 2.4931211471557617 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265388, + "balance_loss_clip": 0.06276809, + "balance_loss_mlp": 0.01254516, + "epoch": 0.6029159777543965, + "flos": 22827535522560.0, + "grad_norm": 2.164274421178336, + "language_loss": 0.71328938, + "learning_rate": 1.438080769071171e-06, + "loss": 0.79015452, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10876465, + "step": 10028, + "time_per_iteration": 2.5468251705169678 + }, + { + "auxiliary_loss_clip": 0.06418602, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256911, + "epoch": 0.6029761010070644, + "flos": 23594103400320.0, + "grad_norm": 1.6575222347679248, + "language_loss": 0.84050506, + "learning_rate": 1.437707005721669e-06, + "loss": 0.91737038, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11016846, + "step": 10029, + "time_per_iteration": 2.529097557067871 + }, + { + "auxiliary_loss_clip": 0.06414534, + "auxiliary_loss_mlp": 0.01271064, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01261146, + "epoch": 0.6030362242597325, + "flos": 13667518229760.0, + "grad_norm": 1.639514659773033, + "language_loss": 0.800816, + "learning_rate": 1.437333263694373e-06, + "loss": 0.8776719, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09918213, + "step": 10030, + "time_per_iteration": 2.527984619140625 + }, + { + "auxiliary_loss_clip": 0.06420292, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277453, + "balance_loss_mlp": 0.01256595, + "epoch": 0.6030963475124004, + "flos": 24429293372160.0, + "grad_norm": 1.55352827539933, + "language_loss": 0.71218026, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.7890541, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.1050415, + "step": 10031, + "time_per_iteration": 2.5585272312164307 + }, + { + "auxiliary_loss_clip": 0.06422323, + "auxiliary_loss_mlp": 0.01265322, + "balance_loss_clip": 0.06275461, + "balance_loss_mlp": 0.01253592, + "epoch": 0.6031564707650684, + "flos": 29651944907520.0, + "grad_norm": 1.5252565411095604, + "language_loss": 0.73936534, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.81624174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11730957, + "step": 10032, + "time_per_iteration": 2.6043312549591064 + }, + { + "auxiliary_loss_clip": 0.06425112, + "auxiliary_loss_mlp": 0.01269372, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01258124, + "epoch": 0.6032165940177363, + "flos": 16624525616640.0, + "grad_norm": 1.652390402199518, + "language_loss": 0.68466848, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.76161331, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11242676, + "step": 10033, + "time_per_iteration": 2.4788658618927 + }, + { + "auxiliary_loss_clip": 0.06415801, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01256676, + "epoch": 0.6032767172704043, + "flos": 17493020386560.0, + "grad_norm": 2.062963272365632, + "language_loss": 0.76036859, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.83720237, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10900879, + "step": 10034, + "time_per_iteration": 2.5080766677856445 + }, + { + "auxiliary_loss_clip": 0.06421614, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01257668, + "epoch": 0.6033368405230723, + "flos": 26840105919360.0, + "grad_norm": 1.6546972875454138, + "language_loss": 0.74774975, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.82465017, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10754395, + "step": 10035, + "time_per_iteration": 2.563206434249878 + }, + { + "auxiliary_loss_clip": 0.06417766, + "auxiliary_loss_mlp": 0.0126329, + "balance_loss_clip": 0.06278257, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6033969637757403, + "flos": 16915575173760.0, + "grad_norm": 1.5348173305795916, + "language_loss": 0.86666334, + "learning_rate": 1.435091260090536e-06, + "loss": 0.94347388, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10036, + "time_per_iteration": 2.5237104892730713 + }, + { + "auxiliary_loss_clip": 0.06422649, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01253641, + "epoch": 0.6034570870284083, + "flos": 22936757719680.0, + "grad_norm": 1.8203362960867906, + "language_loss": 0.70372736, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.78060424, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11401367, + "step": 10037, + "time_per_iteration": 2.5395092964172363 + }, + { + "auxiliary_loss_clip": 0.06418501, + "auxiliary_loss_mlp": 0.0126923, + "balance_loss_clip": 0.06279185, + "balance_loss_mlp": 0.01258603, + "epoch": 0.6035172102810762, + "flos": 23372807967360.0, + "grad_norm": 1.59892513624744, + "language_loss": 0.85074937, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.92762661, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10626221, + "step": 10038, + "time_per_iteration": 2.5844480991363525 + }, + { + "auxiliary_loss_clip": 0.06419212, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01257786, + "epoch": 0.6035773335337442, + "flos": 20893457180160.0, + "grad_norm": 2.8819957775512757, + "language_loss": 0.77070892, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.8475827, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1038208, + "step": 10039, + "time_per_iteration": 2.5122628211975098 + }, + { + "auxiliary_loss_clip": 0.06415309, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01253981, + "epoch": 0.6036374567864121, + "flos": 24943231589760.0, + "grad_norm": 1.5604135097118987, + "language_loss": 0.71224856, + "learning_rate": 1.433597019260301e-06, + "loss": 0.78904456, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10296631, + "step": 10040, + "time_per_iteration": 2.571869373321533 + }, + { + "auxiliary_loss_clip": 0.06419596, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06274244, + "balance_loss_mlp": 0.01256627, + "epoch": 0.6036975800390801, + "flos": 23154866697600.0, + "grad_norm": 1.8943612239225145, + "language_loss": 0.7865687, + "learning_rate": 1.433223512712475e-06, + "loss": 0.86344838, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11749268, + "step": 10041, + "time_per_iteration": 2.4987337589263916 + }, + { + "auxiliary_loss_clip": 0.0641794, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.0627731, + "balance_loss_mlp": 0.01254026, + "epoch": 0.603757703291748, + "flos": 18666610272000.0, + "grad_norm": 4.973303913397253, + "language_loss": 0.75757015, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.83439338, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10357666, + "step": 10042, + "time_per_iteration": 2.5307700634002686 + }, + { + "auxiliary_loss_clip": 0.06414665, + "auxiliary_loss_mlp": 0.0126551, + "balance_loss_clip": 0.06273496, + "balance_loss_mlp": 0.01254477, + "epoch": 0.6038178265444161, + "flos": 19688700775680.0, + "grad_norm": 1.7644311631125091, + "language_loss": 0.84805411, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.92485589, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1104126, + "step": 10043, + "time_per_iteration": 2.483207941055298 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272442, + "balance_loss_mlp": 0.01256034, + "epoch": 0.603877949797084, + "flos": 22644869621760.0, + "grad_norm": 1.873589684997381, + "language_loss": 0.69873232, + "learning_rate": 1.432103122078974e-06, + "loss": 0.77559316, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1161499, + "step": 10044, + "time_per_iteration": 3.940486192703247 + }, + { + "auxiliary_loss_clip": 0.0642198, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01254168, + "epoch": 0.603938073049752, + "flos": 25455031528320.0, + "grad_norm": 2.2351691288080966, + "language_loss": 0.77851117, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.85538936, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11669922, + "step": 10045, + "time_per_iteration": 2.5411202907562256 + }, + { + "auxiliary_loss_clip": 0.06414884, + "auxiliary_loss_mlp": 0.01268718, + "balance_loss_clip": 0.06274995, + "balance_loss_mlp": 0.01257697, + "epoch": 0.6039981963024199, + "flos": 22345686218880.0, + "grad_norm": 1.7669017569149148, + "language_loss": 0.77354729, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.85038328, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11010742, + "step": 10046, + "time_per_iteration": 2.6118433475494385 + }, + { + "auxiliary_loss_clip": 0.064179, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06273997, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6040583195550879, + "flos": 20709239978880.0, + "grad_norm": 1.4772024450084065, + "language_loss": 0.87242824, + "learning_rate": 1.430982925257827e-06, + "loss": 0.94927108, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.09960938, + "step": 10047, + "time_per_iteration": 2.5964560508728027 + }, + { + "auxiliary_loss_clip": 0.06416798, + "auxiliary_loss_mlp": 0.01263003, + "balance_loss_clip": 0.06279427, + "balance_loss_mlp": 0.01252459, + "epoch": 0.604118442807756, + "flos": 27170623549440.0, + "grad_norm": 1.57099000963109, + "language_loss": 0.76137155, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.83816957, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10540771, + "step": 10048, + "time_per_iteration": 2.619131326675415 + }, + { + "auxiliary_loss_clip": 0.06423929, + "auxiliary_loss_mlp": 0.01267255, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125512, + "epoch": 0.6041785660604239, + "flos": 30889125642240.0, + "grad_norm": 2.0836935767176508, + "language_loss": 0.66702586, + "learning_rate": 1.430236235239386e-06, + "loss": 0.74393767, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12121582, + "step": 10049, + "time_per_iteration": 2.650125741958618 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01268699, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.0125769, + "epoch": 0.6042386893130919, + "flos": 19944391109760.0, + "grad_norm": 1.425076043351067, + "language_loss": 0.6651637, + "learning_rate": 1.429862922631336e-06, + "loss": 0.74199045, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10050, + "time_per_iteration": 2.523010015487671 + }, + { + "auxiliary_loss_clip": 0.06421351, + "auxiliary_loss_mlp": 0.01263894, + "balance_loss_clip": 0.06279106, + "balance_loss_mlp": 0.01252956, + "epoch": 0.6042988125657598, + "flos": 32424106187520.0, + "grad_norm": 1.5652221823172618, + "language_loss": 0.70055592, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.7774083, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10949707, + "step": 10051, + "time_per_iteration": 2.6328225135803223 + }, + { + "auxiliary_loss_clip": 0.06413503, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.062729, + "balance_loss_mlp": 0.01253167, + "epoch": 0.6043589358184278, + "flos": 17426578498560.0, + "grad_norm": 1.814191650563656, + "language_loss": 0.64989793, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.72668123, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11651611, + "step": 10052, + "time_per_iteration": 4.032447814941406 + }, + { + "auxiliary_loss_clip": 0.06422505, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06280071, + "balance_loss_mlp": 0.01259275, + "epoch": 0.6044190590710957, + "flos": 27680243281920.0, + "grad_norm": 1.5013537444726899, + "language_loss": 0.69046491, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.76739454, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11187744, + "step": 10053, + "time_per_iteration": 2.5837066173553467 + }, + { + "auxiliary_loss_clip": 0.06317958, + "auxiliary_loss_mlp": 0.01251886, + "balance_loss_clip": 0.06259381, + "balance_loss_mlp": 0.01250314, + "epoch": 0.6044791823237637, + "flos": 65334422090880.0, + "grad_norm": 0.7098963484594624, + "language_loss": 0.60469133, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.68038976, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01572418, + "step": 10054, + "time_per_iteration": 3.282451868057251 + }, + { + "auxiliary_loss_clip": 0.0641373, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06275851, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6045393055764317, + "flos": 24498208955520.0, + "grad_norm": 1.4963816601479185, + "language_loss": 0.85832298, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.93512046, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10998535, + "step": 10055, + "time_per_iteration": 2.5359747409820557 + }, + { + "auxiliary_loss_clip": 0.06417194, + "auxiliary_loss_mlp": 0.01268307, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6045994288290997, + "flos": 19058999742720.0, + "grad_norm": 2.4042532312332243, + "language_loss": 0.74155682, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.81841183, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11895752, + "step": 10056, + "time_per_iteration": 2.5254933834075928 + }, + { + "auxiliary_loss_clip": 0.06408785, + "auxiliary_loss_mlp": 0.01263059, + "balance_loss_clip": 0.06271578, + "balance_loss_mlp": 0.01252926, + "epoch": 0.6046595520817676, + "flos": 26583660898560.0, + "grad_norm": 1.6233300173420022, + "language_loss": 0.80582207, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.88254052, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10137939, + "step": 10057, + "time_per_iteration": 3.975159168243408 + }, + { + "auxiliary_loss_clip": 0.06411809, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06273948, + "balance_loss_mlp": 0.01256557, + "epoch": 0.6047196753344356, + "flos": 13586150315520.0, + "grad_norm": 2.1360006581590727, + "language_loss": 0.751284, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.82807666, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10900879, + "step": 10058, + "time_per_iteration": 2.519793748855591 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01253967, + "epoch": 0.6047797985871035, + "flos": 25527552837120.0, + "grad_norm": 1.8108696315105546, + "language_loss": 0.70813042, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.78491068, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11181641, + "step": 10059, + "time_per_iteration": 2.5327351093292236 + }, + { + "auxiliary_loss_clip": 0.06417379, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.01253538, + "epoch": 0.6048399218397715, + "flos": 20526112880640.0, + "grad_norm": 1.5165980047863354, + "language_loss": 0.76569366, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.84251177, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10888672, + "step": 10060, + "time_per_iteration": 2.5674891471862793 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06271071, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6049000450924396, + "flos": 20414416988160.0, + "grad_norm": 1.961791815817934, + "language_loss": 0.73817396, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.81496191, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10418701, + "step": 10061, + "time_per_iteration": 2.4917149543762207 + }, + { + "auxiliary_loss_clip": 0.06419303, + "auxiliary_loss_mlp": 0.0126307, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01252497, + "epoch": 0.6049601683451075, + "flos": 20747743729920.0, + "grad_norm": 1.6943031579927808, + "language_loss": 0.67628121, + "learning_rate": 1.425384861715639e-06, + "loss": 0.75310493, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10565186, + "step": 10062, + "time_per_iteration": 3.9096996784210205 + }, + { + "auxiliary_loss_clip": 0.06412483, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01254246, + "epoch": 0.6050202915977755, + "flos": 20089140238080.0, + "grad_norm": 1.9017616396263957, + "language_loss": 0.71490061, + "learning_rate": 1.425011831266978e-06, + "loss": 0.79168195, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 10063, + "time_per_iteration": 2.532278299331665 + }, + { + "auxiliary_loss_clip": 0.06410936, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01253858, + "epoch": 0.6050804148504434, + "flos": 15966257541120.0, + "grad_norm": 1.545014679780644, + "language_loss": 0.84818602, + "learning_rate": 1.424638822621926e-06, + "loss": 0.92493832, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10430908, + "step": 10064, + "time_per_iteration": 2.4977669715881348 + }, + { + "auxiliary_loss_clip": 0.06412817, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 0.06272112, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6051405381031114, + "flos": 17462315064960.0, + "grad_norm": 2.0946043423181293, + "language_loss": 0.801759, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.87853146, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10870361, + "step": 10065, + "time_per_iteration": 2.563521146774292 + }, + { + "auxiliary_loss_clip": 0.06424835, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06278696, + "balance_loss_mlp": 0.0125371, + "epoch": 0.6052006613557793, + "flos": 11404808974080.0, + "grad_norm": 1.8141288170700578, + "language_loss": 0.7897802, + "learning_rate": 1.423892870799226e-06, + "loss": 0.86667973, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11413574, + "step": 10066, + "time_per_iteration": 2.4816365242004395 + }, + { + "auxiliary_loss_clip": 0.0641356, + "auxiliary_loss_mlp": 0.0126889, + "balance_loss_clip": 0.06272712, + "balance_loss_mlp": 0.01257857, + "epoch": 0.6052607846084473, + "flos": 24757421160960.0, + "grad_norm": 1.6017965029602446, + "language_loss": 0.73526549, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.81208998, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1104126, + "step": 10067, + "time_per_iteration": 2.585381269454956 + }, + { + "auxiliary_loss_clip": 0.06416602, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06275155, + "balance_loss_mlp": 0.01255646, + "epoch": 0.6053209078611153, + "flos": 20747492167680.0, + "grad_norm": 1.2388364270447627, + "language_loss": 0.68978894, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.76662529, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.1138916, + "step": 10068, + "time_per_iteration": 2.533571243286133 + }, + { + "auxiliary_loss_clip": 0.06416383, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6053810311137833, + "flos": 18959169202560.0, + "grad_norm": 2.164785155160147, + "language_loss": 0.87104344, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.94786203, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.1060791, + "step": 10069, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274873, + "balance_loss_mlp": 0.01255259, + "epoch": 0.6054411543664512, + "flos": 23957883901440.0, + "grad_norm": 1.623757415978513, + "language_loss": 0.83496463, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.91176546, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10662842, + "step": 10070, + "time_per_iteration": 2.528780221939087 + }, + { + "auxiliary_loss_clip": 0.06416136, + "auxiliary_loss_mlp": 0.0126614, + "balance_loss_clip": 0.06271877, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6055012776191192, + "flos": 20600101635840.0, + "grad_norm": 1.4904746237370996, + "language_loss": 0.86489964, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.94172239, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10858154, + "step": 10071, + "time_per_iteration": 2.538874387741089 + }, + { + "auxiliary_loss_clip": 0.06422232, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01259129, + "epoch": 0.6055614008717871, + "flos": 30305768716800.0, + "grad_norm": 1.8258498039752344, + "language_loss": 0.77371645, + "learning_rate": 1.421655540088603e-06, + "loss": 0.85065246, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12231445, + "step": 10072, + "time_per_iteration": 2.5658671855926514 + }, + { + "auxiliary_loss_clip": 0.06419331, + "auxiliary_loss_mlp": 0.01267468, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01255523, + "epoch": 0.6056215241244551, + "flos": 27132245579520.0, + "grad_norm": 1.5250709401817175, + "language_loss": 0.74363017, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.82049823, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11962891, + "step": 10073, + "time_per_iteration": 2.5838263034820557 + }, + { + "auxiliary_loss_clip": 0.06330025, + "auxiliary_loss_mlp": 0.01255009, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6056816473771232, + "flos": 56023073124480.0, + "grad_norm": 0.7392641743542041, + "language_loss": 0.55267042, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.62852079, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.58740234, + "router_z_loss_mlp": 0.01560211, + "step": 10074, + "time_per_iteration": 3.192260503768921 + }, + { + "auxiliary_loss_clip": 0.06416894, + "auxiliary_loss_mlp": 0.01266981, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01256353, + "epoch": 0.6057417706297911, + "flos": 23556144700800.0, + "grad_norm": 1.6660379644056391, + "language_loss": 0.81972474, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.89656347, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10626221, + "step": 10075, + "time_per_iteration": 2.514631509780884 + }, + { + "auxiliary_loss_clip": 0.06414524, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06272351, + "balance_loss_mlp": 0.01255526, + "epoch": 0.6058018938824591, + "flos": 27751464852480.0, + "grad_norm": 1.6456827746682687, + "language_loss": 0.78334481, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.86015224, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10699463, + "step": 10076, + "time_per_iteration": 2.5620245933532715 + }, + { + "auxiliary_loss_clip": 0.06419735, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01252994, + "epoch": 0.605862017135127, + "flos": 22789912239360.0, + "grad_norm": 1.939163307933087, + "language_loss": 0.72597015, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.80280852, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11102295, + "step": 10077, + "time_per_iteration": 2.5249850749969482 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.0126711, + "balance_loss_clip": 0.06278025, + "balance_loss_mlp": 0.01256155, + "epoch": 0.605922140387795, + "flos": 21221375333760.0, + "grad_norm": 1.5785416430125656, + "language_loss": 0.55953008, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.63641137, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10961914, + "step": 10078, + "time_per_iteration": 2.5278408527374268 + }, + { + "auxiliary_loss_clip": 0.06424035, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06278145, + "balance_loss_mlp": 0.01259911, + "epoch": 0.6059822636404629, + "flos": 27275191845120.0, + "grad_norm": 1.4527216797355516, + "language_loss": 0.70788896, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.78484154, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11322021, + "step": 10079, + "time_per_iteration": 2.5871152877807617 + }, + { + "auxiliary_loss_clip": 0.06417212, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06276966, + "balance_loss_mlp": 0.01257991, + "epoch": 0.606042386893131, + "flos": 20637599137920.0, + "grad_norm": 1.8315516840845918, + "language_loss": 0.63098562, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.70784402, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10638428, + "step": 10080, + "time_per_iteration": 2.491398334503174 + }, + { + "auxiliary_loss_clip": 0.06417031, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01255289, + "epoch": 0.6061025101457989, + "flos": 23008859758080.0, + "grad_norm": 1.6961363468706865, + "language_loss": 0.71255064, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.78938705, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11322021, + "step": 10081, + "time_per_iteration": 2.512700080871582 + }, + { + "auxiliary_loss_clip": 0.06420416, + "auxiliary_loss_mlp": 0.01269117, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01258406, + "epoch": 0.6061626333984669, + "flos": 29906796700800.0, + "grad_norm": 1.5910736573937334, + "language_loss": 0.69392467, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.77082002, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10717773, + "step": 10082, + "time_per_iteration": 2.5597543716430664 + }, + { + "auxiliary_loss_clip": 0.06418272, + "auxiliary_loss_mlp": 0.01266999, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01256514, + "epoch": 0.6062227566511348, + "flos": 25016130241920.0, + "grad_norm": 1.2876460924932913, + "language_loss": 0.66258222, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.7394349, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1048584, + "step": 10083, + "time_per_iteration": 4.032879114151001 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01266697, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01256147, + "epoch": 0.6062828799038028, + "flos": 19470046746240.0, + "grad_norm": 1.984600644426631, + "language_loss": 0.74219275, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.81904829, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10546875, + "step": 10084, + "time_per_iteration": 2.549463987350464 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6063430031564707, + "flos": 13594661504640.0, + "grad_norm": 2.649456512280636, + "language_loss": 0.72717726, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.80401981, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10516357, + "step": 10085, + "time_per_iteration": 2.569584846496582 + }, + { + "auxiliary_loss_clip": 0.06415457, + "auxiliary_loss_mlp": 0.01267297, + "balance_loss_clip": 0.06275511, + "balance_loss_mlp": 0.01256771, + "epoch": 0.6064031264091387, + "flos": 23261740980480.0, + "grad_norm": 2.0482376544916057, + "language_loss": 0.76309711, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.83992463, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10522461, + "step": 10086, + "time_per_iteration": 2.5559799671173096 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01255231, + "epoch": 0.6064632496618068, + "flos": 22465515957120.0, + "grad_norm": 1.2564833731282572, + "language_loss": 0.72978222, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.80654591, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10070801, + "step": 10087, + "time_per_iteration": 2.5399293899536133 + }, + { + "auxiliary_loss_clip": 0.06412689, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01259555, + "epoch": 0.6065233729144747, + "flos": 25125604001280.0, + "grad_norm": 1.521602814132933, + "language_loss": 0.83829105, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.91511416, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10058594, + "step": 10088, + "time_per_iteration": 2.5622670650482178 + }, + { + "auxiliary_loss_clip": 0.06411251, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.06272328, + "balance_loss_mlp": 0.01257709, + "epoch": 0.6065834961671427, + "flos": 23484126516480.0, + "grad_norm": 1.9713789944159437, + "language_loss": 0.71166384, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.78845739, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10400391, + "step": 10089, + "time_per_iteration": 2.516352891921997 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 0.06277877, + "balance_loss_mlp": 0.01255835, + "epoch": 0.6066436194198106, + "flos": 17025090860160.0, + "grad_norm": 1.830033701594393, + "language_loss": 0.82651365, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.90336132, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10137939, + "step": 10090, + "time_per_iteration": 2.5144259929656982 + }, + { + "auxiliary_loss_clip": 0.06427157, + "auxiliary_loss_mlp": 0.01267358, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01255848, + "epoch": 0.6067037426724786, + "flos": 18520603332480.0, + "grad_norm": 2.204687443594168, + "language_loss": 0.76034927, + "learning_rate": 1.4145758826341e-06, + "loss": 0.83729446, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11505127, + "step": 10091, + "time_per_iteration": 2.4818389415740967 + }, + { + "auxiliary_loss_clip": 0.06416716, + "auxiliary_loss_mlp": 0.01268883, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01258041, + "epoch": 0.6067638659251465, + "flos": 22352520326400.0, + "grad_norm": 1.3588116701946646, + "language_loss": 0.7976529, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.87450886, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10858154, + "step": 10092, + "time_per_iteration": 4.102951765060425 + }, + { + "auxiliary_loss_clip": 0.06413257, + "auxiliary_loss_mlp": 0.01264393, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01253623, + "epoch": 0.6068239891778145, + "flos": 12454669906560.0, + "grad_norm": 1.7580568445861304, + "language_loss": 0.76897407, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.84575057, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10778809, + "step": 10093, + "time_per_iteration": 2.5497262477874756 + }, + { + "auxiliary_loss_clip": 0.06417312, + "auxiliary_loss_mlp": 0.01264272, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01254396, + "epoch": 0.6068841124304825, + "flos": 23192657688960.0, + "grad_norm": 1.756366452209319, + "language_loss": 0.87924957, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.95606542, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09887695, + "step": 10094, + "time_per_iteration": 2.5853447914123535 + }, + { + "auxiliary_loss_clip": 0.06414801, + "auxiliary_loss_mlp": 0.01269704, + "balance_loss_clip": 0.06274891, + "balance_loss_mlp": 0.01258891, + "epoch": 0.6069442356831505, + "flos": 18593795473920.0, + "grad_norm": 1.6037560799373654, + "language_loss": 0.72400463, + "learning_rate": 1.413086446353919e-06, + "loss": 0.80084968, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1081543, + "step": 10095, + "time_per_iteration": 2.522684335708618 + }, + { + "auxiliary_loss_clip": 0.06416344, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06275313, + "balance_loss_mlp": 0.01255202, + "epoch": 0.6070043589358184, + "flos": 20966775102720.0, + "grad_norm": 1.6943237110311855, + "language_loss": 0.76768452, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.8445034, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10333252, + "step": 10096, + "time_per_iteration": 3.974635362625122 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06276296, + "balance_loss_mlp": 0.01257018, + "epoch": 0.6070644821884864, + "flos": 11697242123520.0, + "grad_norm": 1.6709554759687573, + "language_loss": 0.80418944, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.8810569, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10668945, + "step": 10097, + "time_per_iteration": 2.5277743339538574 + }, + { + "auxiliary_loss_clip": 0.06411067, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.0627345, + "balance_loss_mlp": 0.01256888, + "epoch": 0.6071246054411543, + "flos": 19315402836480.0, + "grad_norm": 1.4624120271510725, + "language_loss": 0.6741221, + "learning_rate": 1.411969602780478e-06, + "loss": 0.75090361, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10198975, + "step": 10098, + "time_per_iteration": 2.476284980773926 + }, + { + "auxiliary_loss_clip": 0.06410795, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06272739, + "balance_loss_mlp": 0.01257695, + "epoch": 0.6071847286938223, + "flos": 17754832068480.0, + "grad_norm": 1.6528826990411218, + "language_loss": 0.80661249, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.8833968, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.0994873, + "step": 10099, + "time_per_iteration": 2.5101730823516846 + }, + { + "auxiliary_loss_clip": 0.06419415, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01256382, + "epoch": 0.6072448519464904, + "flos": 22644031080960.0, + "grad_norm": 1.7660509562429656, + "language_loss": 0.71092284, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.78779513, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11437988, + "step": 10100, + "time_per_iteration": 2.5284388065338135 + }, + { + "auxiliary_loss_clip": 0.06417382, + "auxiliary_loss_mlp": 0.012671, + "balance_loss_clip": 0.06275873, + "balance_loss_mlp": 0.01255072, + "epoch": 0.6073049751991583, + "flos": 19543490449920.0, + "grad_norm": 2.5847426043420807, + "language_loss": 0.71003377, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.78687859, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.12030029, + "step": 10101, + "time_per_iteration": 2.5114076137542725 + }, + { + "auxiliary_loss_clip": 0.06414101, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01254534, + "epoch": 0.6073650984518263, + "flos": 28301936250240.0, + "grad_norm": 1.5889760307817664, + "language_loss": 0.69726598, + "learning_rate": 1.410480790256154e-06, + "loss": 0.77405149, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09924316, + "step": 10102, + "time_per_iteration": 4.067505836486816 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06273274, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6074252217044942, + "flos": 25671211862400.0, + "grad_norm": 1.7072302673605428, + "language_loss": 0.73599881, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.81281507, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10394287, + "step": 10103, + "time_per_iteration": 2.5059690475463867 + }, + { + "auxiliary_loss_clip": 0.06429945, + "auxiliary_loss_mlp": 0.01270767, + "balance_loss_clip": 0.06280673, + "balance_loss_mlp": 0.01259215, + "epoch": 0.6074853449571622, + "flos": 22863775213440.0, + "grad_norm": 2.6623380378388943, + "language_loss": 0.76573825, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.84274542, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11560059, + "step": 10104, + "time_per_iteration": 2.5691661834716797 + }, + { + "auxiliary_loss_clip": 0.06324141, + "auxiliary_loss_mlp": 0.01255914, + "balance_loss_clip": 0.0626532, + "balance_loss_mlp": 0.01253873, + "epoch": 0.6075454682098301, + "flos": 67131088536960.0, + "grad_norm": 0.6977033795055727, + "language_loss": 0.55382067, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.62962115, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.02041626, + "step": 10105, + "time_per_iteration": 3.1780333518981934 + }, + { + "auxiliary_loss_clip": 0.06325028, + "auxiliary_loss_mlp": 0.0125398, + "balance_loss_clip": 0.06266589, + "balance_loss_mlp": 0.01252049, + "epoch": 0.6076055914624982, + "flos": 70730389797120.0, + "grad_norm": 1.0472762602622778, + "language_loss": 0.5682922, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.64408225, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01928711, + "step": 10106, + "time_per_iteration": 3.1282505989074707 + }, + { + "auxiliary_loss_clip": 0.06414115, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06276634, + "balance_loss_mlp": 0.01256042, + "epoch": 0.6076657147151661, + "flos": 28371816155520.0, + "grad_norm": 1.4629042426300594, + "language_loss": 0.69019145, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.76699257, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09954834, + "step": 10107, + "time_per_iteration": 2.6175951957702637 + }, + { + "auxiliary_loss_clip": 0.0642143, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_clip": 0.06277055, + "balance_loss_mlp": 0.01255297, + "epoch": 0.6077258379678341, + "flos": 15055234024320.0, + "grad_norm": 1.7550359653422893, + "language_loss": 0.80674279, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.88361514, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.1050415, + "step": 10108, + "time_per_iteration": 2.482895851135254 + }, + { + "auxiliary_loss_clip": 0.06424679, + "auxiliary_loss_mlp": 0.01267352, + "balance_loss_clip": 0.06279299, + "balance_loss_mlp": 0.01256223, + "epoch": 0.607785961220502, + "flos": 36174948756480.0, + "grad_norm": 1.6080944832957944, + "language_loss": 0.71795905, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11138916, + "step": 10109, + "time_per_iteration": 2.6855504512786865 + }, + { + "auxiliary_loss_clip": 0.06412528, + "auxiliary_loss_mlp": 0.01268721, + "balance_loss_clip": 0.06276727, + "balance_loss_mlp": 0.01259119, + "epoch": 0.60784608447317, + "flos": 22530113055360.0, + "grad_norm": 1.591486225286121, + "language_loss": 0.80463254, + "learning_rate": 1.407504239132653e-06, + "loss": 0.88144499, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09613037, + "step": 10110, + "time_per_iteration": 2.4970977306365967 + }, + { + "auxiliary_loss_clip": 0.06416238, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258052, + "epoch": 0.6079062077258379, + "flos": 23847823163520.0, + "grad_norm": 17.062743331014456, + "language_loss": 0.7053231, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.78217256, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10656738, + "step": 10111, + "time_per_iteration": 2.5446176528930664 + }, + { + "auxiliary_loss_clip": 0.0641928, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.06276086, + "balance_loss_mlp": 0.01255631, + "epoch": 0.6079663309785059, + "flos": 23373646508160.0, + "grad_norm": 1.767884967540518, + "language_loss": 0.64890563, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.72577429, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11950684, + "step": 10112, + "time_per_iteration": 2.5041110515594482 + }, + { + "auxiliary_loss_clip": 0.06319214, + "auxiliary_loss_mlp": 0.01254153, + "balance_loss_clip": 0.0626073, + "balance_loss_mlp": 0.01252635, + "epoch": 0.6080264542311739, + "flos": 71403709680000.0, + "grad_norm": 0.6188727131541597, + "language_loss": 0.49428421, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.57001793, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01517487, + "step": 10113, + "time_per_iteration": 3.2030844688415527 + }, + { + "auxiliary_loss_clip": 0.06322706, + "auxiliary_loss_mlp": 0.01253815, + "balance_loss_clip": 0.06264073, + "balance_loss_mlp": 0.01252375, + "epoch": 0.6080865774838419, + "flos": 66549786036480.0, + "grad_norm": 0.826261074954681, + "language_loss": 0.57000625, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.64577138, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01438141, + "step": 10114, + "time_per_iteration": 3.0561811923980713 + }, + { + "auxiliary_loss_clip": 0.06416565, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.0627362, + "balance_loss_mlp": 0.01255528, + "epoch": 0.6081467007365099, + "flos": 19213895214720.0, + "grad_norm": 2.9429969583310744, + "language_loss": 0.70665103, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.7834866, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11456299, + "step": 10115, + "time_per_iteration": 2.536123037338257 + }, + { + "auxiliary_loss_clip": 0.06416753, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06275412, + "balance_loss_mlp": 0.01254128, + "epoch": 0.6082068239891778, + "flos": 24174148089600.0, + "grad_norm": 2.2262194131188617, + "language_loss": 0.72516567, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.80198407, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10961914, + "step": 10116, + "time_per_iteration": 2.5744457244873047 + }, + { + "auxiliary_loss_clip": 0.06418931, + "auxiliary_loss_mlp": 0.0126628, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01254562, + "epoch": 0.6082669472418458, + "flos": 37422150053760.0, + "grad_norm": 1.8492666967546532, + "language_loss": 0.54224104, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.61909318, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1171875, + "step": 10117, + "time_per_iteration": 2.7010717391967773 + }, + { + "auxiliary_loss_clip": 0.06415669, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06273679, + "balance_loss_mlp": 0.01252431, + "epoch": 0.6083270704945137, + "flos": 15090886736640.0, + "grad_norm": 1.6926126638400165, + "language_loss": 0.70553619, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.78231865, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1015625, + "step": 10118, + "time_per_iteration": 2.5118987560272217 + }, + { + "auxiliary_loss_clip": 0.0641689, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255857, + "epoch": 0.6083871937471818, + "flos": 20674845077760.0, + "grad_norm": 1.454621938136119, + "language_loss": 0.75087917, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.82770652, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09979248, + "step": 10119, + "time_per_iteration": 2.5343713760375977 + }, + { + "auxiliary_loss_clip": 0.06418591, + "auxiliary_loss_mlp": 0.01266372, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6084473169998497, + "flos": 21513305358720.0, + "grad_norm": 1.7245965425427678, + "language_loss": 0.67339104, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.75024068, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10784912, + "step": 10120, + "time_per_iteration": 2.4992902278900146 + }, + { + "auxiliary_loss_clip": 0.06424947, + "auxiliary_loss_mlp": 0.01267829, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.0125673, + "epoch": 0.6085074402525177, + "flos": 26877309932160.0, + "grad_norm": 1.7168671771406325, + "language_loss": 0.74690855, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.82383633, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11096191, + "step": 10121, + "time_per_iteration": 2.552943468093872 + }, + { + "auxiliary_loss_clip": 0.06415446, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06275289, + "balance_loss_mlp": 0.01254844, + "epoch": 0.6085675635051856, + "flos": 10894518408960.0, + "grad_norm": 1.695682661500106, + "language_loss": 0.80907005, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.88587236, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09936523, + "step": 10122, + "time_per_iteration": 3.890413522720337 + }, + { + "auxiliary_loss_clip": 0.06419112, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01254483, + "epoch": 0.6086276867578536, + "flos": 34871074571520.0, + "grad_norm": 1.4621063194109842, + "language_loss": 0.55791676, + "learning_rate": 1.402670413578284e-06, + "loss": 0.63475281, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10015869, + "step": 10123, + "time_per_iteration": 2.6325483322143555 + }, + { + "auxiliary_loss_clip": 0.06419839, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06281708, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6086878100105215, + "flos": 20053906796160.0, + "grad_norm": 1.6808318536129285, + "language_loss": 0.74430656, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.82115179, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11114502, + "step": 10124, + "time_per_iteration": 2.5358493328094482 + }, + { + "auxiliary_loss_clip": 0.06421429, + "auxiliary_loss_mlp": 0.01269718, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01258393, + "epoch": 0.6087479332631895, + "flos": 18338314775040.0, + "grad_norm": 11.543954575524463, + "language_loss": 0.65884316, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.73575461, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11322021, + "step": 10125, + "time_per_iteration": 2.4864342212677 + }, + { + "auxiliary_loss_clip": 0.06421918, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06282578, + "balance_loss_mlp": 0.01253841, + "epoch": 0.6088080565158575, + "flos": 24499424839680.0, + "grad_norm": 2.2712886028305, + "language_loss": 0.76395416, + "learning_rate": 1.40155545786479e-06, + "loss": 0.84081715, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10126, + "time_per_iteration": 2.5664777755737305 + }, + { + "auxiliary_loss_clip": 0.06427297, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06280977, + "balance_loss_mlp": 0.0125524, + "epoch": 0.6088681797685255, + "flos": 10273496273280.0, + "grad_norm": 5.11214091408941, + "language_loss": 0.71820217, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.79513788, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1104126, + "step": 10127, + "time_per_iteration": 2.478034257888794 + }, + { + "auxiliary_loss_clip": 0.06430127, + "auxiliary_loss_mlp": 0.01266951, + "balance_loss_clip": 0.06284942, + "balance_loss_mlp": 0.01255465, + "epoch": 0.6089283030211935, + "flos": 21978928897920.0, + "grad_norm": 2.2629720759221996, + "language_loss": 0.72788715, + "learning_rate": 1.400812267497691e-06, + "loss": 0.80485797, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11486816, + "step": 10128, + "time_per_iteration": 2.553764820098877 + }, + { + "auxiliary_loss_clip": 0.06422316, + "auxiliary_loss_mlp": 0.0126747, + "balance_loss_clip": 0.06282373, + "balance_loss_mlp": 0.01257355, + "epoch": 0.6089884262738614, + "flos": 17790945978240.0, + "grad_norm": 1.9776728101481476, + "language_loss": 0.7314598, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.8083576, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10119629, + "step": 10129, + "time_per_iteration": 2.4939491748809814 + }, + { + "auxiliary_loss_clip": 0.06421769, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06280705, + "balance_loss_mlp": 0.01260764, + "epoch": 0.6090485495265294, + "flos": 36920496458880.0, + "grad_norm": 1.3316519758914749, + "language_loss": 0.65839994, + "learning_rate": 1.400069168015626e-06, + "loss": 0.73533046, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 10130, + "time_per_iteration": 2.7194180488586426 + }, + { + "auxiliary_loss_clip": 0.0641261, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06274526, + "balance_loss_mlp": 0.01254926, + "epoch": 0.6091086727791973, + "flos": 19904755328640.0, + "grad_norm": 1.5918133317154841, + "language_loss": 0.77794468, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.85471684, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09680176, + "step": 10131, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.0641945, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06279676, + "balance_loss_mlp": 0.01255071, + "epoch": 0.6091687960318654, + "flos": 22170147914880.0, + "grad_norm": 1.8790929127191944, + "language_loss": 0.77705514, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.85390049, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10003662, + "step": 10132, + "time_per_iteration": 3.9999635219573975 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01267619, + "balance_loss_clip": 0.06278821, + "balance_loss_mlp": 0.01257618, + "epoch": 0.6092289192845333, + "flos": 21470818538880.0, + "grad_norm": 2.2139477747978136, + "language_loss": 0.75865889, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.83548331, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10003662, + "step": 10133, + "time_per_iteration": 2.545747756958008 + }, + { + "auxiliary_loss_clip": 0.06417366, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01255973, + "epoch": 0.6092890425372013, + "flos": 28702585347840.0, + "grad_norm": 1.8044338362434222, + "language_loss": 0.64228314, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.71912241, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10583496, + "step": 10134, + "time_per_iteration": 2.563861131668091 + }, + { + "auxiliary_loss_clip": 0.06424356, + "auxiliary_loss_mlp": 0.01263619, + "balance_loss_clip": 0.06285493, + "balance_loss_mlp": 0.01253331, + "epoch": 0.6093491657898692, + "flos": 20819384570880.0, + "grad_norm": 1.7758601490441968, + "language_loss": 0.78973985, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.86661959, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10290527, + "step": 10135, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06420235, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.06279118, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6094092890425372, + "flos": 25453983352320.0, + "grad_norm": 1.626137919034545, + "language_loss": 0.72278392, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.79964805, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.09753418, + "step": 10136, + "time_per_iteration": 4.003901958465576 + }, + { + "auxiliary_loss_clip": 0.06420286, + "auxiliary_loss_mlp": 0.01265077, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.0125464, + "epoch": 0.6094694122952051, + "flos": 35629089333120.0, + "grad_norm": 1.6356074117681172, + "language_loss": 0.74919081, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.82604444, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10443115, + "step": 10137, + "time_per_iteration": 2.634158134460449 + }, + { + "auxiliary_loss_clip": 0.06417631, + "auxiliary_loss_mlp": 0.01266963, + "balance_loss_clip": 0.06275456, + "balance_loss_mlp": 0.01256246, + "epoch": 0.6095295355478731, + "flos": 24462975513600.0, + "grad_norm": 2.0845106182551163, + "language_loss": 0.80188054, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.87872648, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10717773, + "step": 10138, + "time_per_iteration": 2.5884156227111816 + }, + { + "auxiliary_loss_clip": 0.06410988, + "auxiliary_loss_mlp": 0.01265559, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255354, + "epoch": 0.6095896588005411, + "flos": 15638716730880.0, + "grad_norm": 1.5018300865324132, + "language_loss": 0.81360239, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.89036787, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10205078, + "step": 10139, + "time_per_iteration": 2.4757158756256104 + }, + { + "auxiliary_loss_clip": 0.06419017, + "auxiliary_loss_mlp": 0.01267763, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01255895, + "epoch": 0.6096497820532091, + "flos": 15554455850880.0, + "grad_norm": 1.944047007891517, + "language_loss": 0.83626902, + "learning_rate": 1.396355037825315e-06, + "loss": 0.91313678, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11865234, + "step": 10140, + "time_per_iteration": 2.5361695289611816 + }, + { + "auxiliary_loss_clip": 0.06419208, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06277294, + "balance_loss_mlp": 0.01258718, + "epoch": 0.6097099053058771, + "flos": 24210932832000.0, + "grad_norm": 1.8133263657959964, + "language_loss": 0.75536144, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.83225, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10925293, + "step": 10141, + "time_per_iteration": 3.9623372554779053 + }, + { + "auxiliary_loss_clip": 0.06413428, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 0.06275016, + "balance_loss_mlp": 0.01253358, + "epoch": 0.609770028558545, + "flos": 19575830926080.0, + "grad_norm": 2.621888589140599, + "language_loss": 0.76574522, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.842511, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09790039, + "step": 10142, + "time_per_iteration": 2.5719213485717773 + }, + { + "auxiliary_loss_clip": 0.06415378, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01255979, + "epoch": 0.609830151811213, + "flos": 23955619841280.0, + "grad_norm": 1.612746865863279, + "language_loss": 0.76346582, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.84028077, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10150146, + "step": 10143, + "time_per_iteration": 2.529778242111206 + }, + { + "auxiliary_loss_clip": 0.06417874, + "auxiliary_loss_mlp": 0.01264047, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01253467, + "epoch": 0.6098902750638809, + "flos": 16185205059840.0, + "grad_norm": 2.5594432881750104, + "language_loss": 0.7530098, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.82982898, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.105896, + "step": 10144, + "time_per_iteration": 2.526620864868164 + }, + { + "auxiliary_loss_clip": 0.06420074, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06276617, + "balance_loss_mlp": 0.01253634, + "epoch": 0.609950398316549, + "flos": 44536141549440.0, + "grad_norm": 2.1298130564389224, + "language_loss": 0.73869997, + "learning_rate": 1.394498830235383e-06, + "loss": 0.81554472, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10772705, + "step": 10145, + "time_per_iteration": 2.7241427898406982 + }, + { + "auxiliary_loss_clip": 0.06415195, + "auxiliary_loss_mlp": 0.01263159, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252156, + "epoch": 0.6100105215692169, + "flos": 23228436182400.0, + "grad_norm": 1.5962491809481525, + "language_loss": 0.69665307, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.77343661, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11004639, + "step": 10146, + "time_per_iteration": 2.557990312576294 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06276412, + "balance_loss_mlp": 0.01254865, + "epoch": 0.6100706448218849, + "flos": 15017904230400.0, + "grad_norm": 1.5284940617625797, + "language_loss": 0.76506376, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.84183586, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09936523, + "step": 10147, + "time_per_iteration": 2.5613648891448975 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01262446, + "balance_loss_clip": 0.0627313, + "balance_loss_mlp": 0.01252153, + "epoch": 0.6101307680745528, + "flos": 19645039998720.0, + "grad_norm": 1.6729040728987632, + "language_loss": 0.78694391, + "learning_rate": 1.393385381096786e-06, + "loss": 0.86368936, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10296631, + "step": 10148, + "time_per_iteration": 2.5073816776275635 + }, + { + "auxiliary_loss_clip": 0.06424719, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06278485, + "balance_loss_mlp": 0.01253672, + "epoch": 0.6101908913272208, + "flos": 29943455662080.0, + "grad_norm": 11.644498336945409, + "language_loss": 0.53887326, + "learning_rate": 1.39301427737093e-06, + "loss": 0.61577505, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11779785, + "step": 10149, + "time_per_iteration": 2.579378843307495 + }, + { + "auxiliary_loss_clip": 0.0641048, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.0125511, + "epoch": 0.6102510145798887, + "flos": 21805067675520.0, + "grad_norm": 1.6674264382808133, + "language_loss": 0.80347526, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.8802287, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09753418, + "step": 10150, + "time_per_iteration": 2.542039394378662 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6103111378325567, + "flos": 20712719923200.0, + "grad_norm": 1.6063484518637994, + "language_loss": 0.69615412, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.77300549, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11529541, + "step": 10151, + "time_per_iteration": 2.5254616737365723 + }, + { + "auxiliary_loss_clip": 0.06415872, + "auxiliary_loss_mlp": 0.01264029, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6103712610852247, + "flos": 29388330362880.0, + "grad_norm": 1.5395706469140102, + "language_loss": 0.71042097, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.78722, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.0980835, + "step": 10152, + "time_per_iteration": 2.565767288208008 + }, + { + "auxiliary_loss_clip": 0.06416918, + "auxiliary_loss_mlp": 0.01268582, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257883, + "epoch": 0.6104313843378927, + "flos": 20819216862720.0, + "grad_norm": 1.604020409534104, + "language_loss": 0.78784543, + "learning_rate": 1.391530092777811e-06, + "loss": 0.86470044, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10699463, + "step": 10153, + "time_per_iteration": 2.5230531692504883 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06273308, + "balance_loss_mlp": 0.01258873, + "epoch": 0.6104915075905607, + "flos": 26585715323520.0, + "grad_norm": 1.630222855772095, + "language_loss": 0.79992545, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8767544, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09906006, + "step": 10154, + "time_per_iteration": 2.5763237476348877 + }, + { + "auxiliary_loss_clip": 0.06417637, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01258937, + "epoch": 0.6105516308432286, + "flos": 23922734313600.0, + "grad_norm": 1.4598935838539129, + "language_loss": 0.70770371, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.78457403, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10455322, + "step": 10155, + "time_per_iteration": 2.5680413246154785 + }, + { + "auxiliary_loss_clip": 0.06418546, + "auxiliary_loss_mlp": 0.01266443, + "balance_loss_clip": 0.06278499, + "balance_loss_mlp": 0.0125569, + "epoch": 0.6106117540958966, + "flos": 31585520125440.0, + "grad_norm": 1.5387182092943745, + "language_loss": 0.71842468, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.79527456, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10742188, + "step": 10156, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.06412362, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06277083, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6106718773485645, + "flos": 19613999260800.0, + "grad_norm": 1.3880208824071523, + "language_loss": 0.67516112, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.75195158, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11254883, + "step": 10157, + "time_per_iteration": 2.533141613006592 + }, + { + "auxiliary_loss_clip": 0.06414488, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01253888, + "epoch": 0.6107320006012326, + "flos": 17128778688000.0, + "grad_norm": 1.7065905103759618, + "language_loss": 0.72894049, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.80572832, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10400391, + "step": 10158, + "time_per_iteration": 2.4852585792541504 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01257917, + "epoch": 0.6107921238539005, + "flos": 30155107875840.0, + "grad_norm": 1.7026117107079757, + "language_loss": 0.69434297, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7712034, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1015625, + "step": 10159, + "time_per_iteration": 2.6481263637542725 + }, + { + "auxiliary_loss_clip": 0.06416903, + "auxiliary_loss_mlp": 0.01266619, + "balance_loss_clip": 0.06276091, + "balance_loss_mlp": 0.01255747, + "epoch": 0.6108522471065685, + "flos": 18445859890560.0, + "grad_norm": 1.7469967655501557, + "language_loss": 0.79027724, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.86711246, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10876465, + "step": 10160, + "time_per_iteration": 2.5056142807006836 + }, + { + "auxiliary_loss_clip": 0.06325343, + "auxiliary_loss_mlp": 0.01260291, + "balance_loss_clip": 0.06266694, + "balance_loss_mlp": 0.01258597, + "epoch": 0.6109123703592364, + "flos": 64157295605760.0, + "grad_norm": 0.797024648042973, + "language_loss": 0.61520749, + "learning_rate": 1.388562832007295e-06, + "loss": 0.69106382, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01698303, + "step": 10161, + "time_per_iteration": 3.325639486312866 + }, + { + "auxiliary_loss_clip": 0.06418448, + "auxiliary_loss_mlp": 0.01268382, + "balance_loss_clip": 0.06276111, + "balance_loss_mlp": 0.01257099, + "epoch": 0.6109724936119044, + "flos": 20674132318080.0, + "grad_norm": 2.3454759388543316, + "language_loss": 0.76444739, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.84131569, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.112854, + "step": 10162, + "time_per_iteration": 4.040041446685791 + }, + { + "auxiliary_loss_clip": 0.06414326, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253669, + "epoch": 0.6110326168645723, + "flos": 31358899958400.0, + "grad_norm": 1.528039199186958, + "language_loss": 0.71962601, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.79641795, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11218262, + "step": 10163, + "time_per_iteration": 2.5920441150665283 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01257903, + "epoch": 0.6110927401172404, + "flos": 25009338061440.0, + "grad_norm": 1.7630876229655692, + "language_loss": 0.60071069, + "learning_rate": 1.387450491396625e-06, + "loss": 0.67747843, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09210205, + "step": 10164, + "time_per_iteration": 2.559441328048706 + }, + { + "auxiliary_loss_clip": 0.06414106, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.0125975, + "epoch": 0.6111528633699083, + "flos": 26254946131200.0, + "grad_norm": 1.466434652755145, + "language_loss": 0.75936824, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.83620799, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10119629, + "step": 10165, + "time_per_iteration": 2.521923542022705 + }, + { + "auxiliary_loss_clip": 0.0641854, + "auxiliary_loss_mlp": 0.01268441, + "balance_loss_clip": 0.06282263, + "balance_loss_mlp": 0.0125807, + "epoch": 0.6112129866225763, + "flos": 22389011579520.0, + "grad_norm": 1.518231620716018, + "language_loss": 0.79607749, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.87294728, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10375977, + "step": 10166, + "time_per_iteration": 2.5410702228546143 + }, + { + "auxiliary_loss_clip": 0.06416941, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06276624, + "balance_loss_mlp": 0.01254949, + "epoch": 0.6112731098752443, + "flos": 25234826198400.0, + "grad_norm": 7.9003095632563385, + "language_loss": 0.68483454, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.76166224, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10876465, + "step": 10167, + "time_per_iteration": 2.5295464992523193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01256586, + "epoch": 0.6113332331279122, + "flos": 22899763342080.0, + "grad_norm": 1.6873056368761516, + "language_loss": 0.7915386, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.86832243, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09381104, + "step": 10168, + "time_per_iteration": 2.53782320022583 + }, + { + "auxiliary_loss_clip": 0.06426514, + "auxiliary_loss_mlp": 0.0126727, + "balance_loss_clip": 0.06277908, + "balance_loss_mlp": 0.01254991, + "epoch": 0.6113933563805802, + "flos": 18625548971520.0, + "grad_norm": 2.2514835469058405, + "language_loss": 0.86128104, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.93821883, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12286377, + "step": 10169, + "time_per_iteration": 2.4681122303009033 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01254871, + "epoch": 0.6114534796332481, + "flos": 41876137359360.0, + "grad_norm": 1.5861355547500362, + "language_loss": 0.79530609, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.87210482, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09661865, + "step": 10170, + "time_per_iteration": 2.707791566848755 + }, + { + "auxiliary_loss_clip": 0.06423808, + "auxiliary_loss_mlp": 0.01264285, + "balance_loss_clip": 0.06277203, + "balance_loss_mlp": 0.01252359, + "epoch": 0.6115136028859162, + "flos": 21914960705280.0, + "grad_norm": 2.240444553593937, + "language_loss": 0.6873374, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.76421833, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1192627, + "step": 10171, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.06420024, + "auxiliary_loss_mlp": 0.01266591, + "balance_loss_clip": 0.06277289, + "balance_loss_mlp": 0.01254634, + "epoch": 0.6115737261385841, + "flos": 28812604158720.0, + "grad_norm": 6.231678075331036, + "language_loss": 0.79464412, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.87151027, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11962891, + "step": 10172, + "time_per_iteration": 4.057689666748047 + }, + { + "auxiliary_loss_clip": 0.06425016, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06279068, + "balance_loss_mlp": 0.01257222, + "epoch": 0.6116338493912521, + "flos": 21257824659840.0, + "grad_norm": 1.6337666078989976, + "language_loss": 0.67181307, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.74874651, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11108398, + "step": 10173, + "time_per_iteration": 2.5301437377929688 + }, + { + "auxiliary_loss_clip": 0.06418002, + "auxiliary_loss_mlp": 0.01270854, + "balance_loss_clip": 0.06275578, + "balance_loss_mlp": 0.01259261, + "epoch": 0.61169397264392, + "flos": 17535968403840.0, + "grad_norm": 1.769252328158937, + "language_loss": 0.56344169, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.64033026, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1159668, + "step": 10174, + "time_per_iteration": 2.530437707901001 + }, + { + "auxiliary_loss_clip": 0.064185, + "auxiliary_loss_mlp": 0.01267148, + "balance_loss_clip": 0.06277028, + "balance_loss_mlp": 0.01255931, + "epoch": 0.611754095896588, + "flos": 23958387025920.0, + "grad_norm": 1.6825013036462741, + "language_loss": 0.66233337, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.73918986, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11212158, + "step": 10175, + "time_per_iteration": 4.048693656921387 + }, + { + "auxiliary_loss_clip": 0.06415173, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01253019, + "epoch": 0.6118142191492559, + "flos": 26002064908800.0, + "grad_norm": 1.985962827753808, + "language_loss": 0.82859969, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.90538198, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10040283, + "step": 10176, + "time_per_iteration": 2.5558836460113525 + }, + { + "auxiliary_loss_clip": 0.06419128, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06277899, + "balance_loss_mlp": 0.01259491, + "epoch": 0.611874342401924, + "flos": 24609275942400.0, + "grad_norm": 1.5904100346197647, + "language_loss": 0.77812099, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.85502738, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.12011719, + "step": 10177, + "time_per_iteration": 2.5346739292144775 + }, + { + "auxiliary_loss_clip": 0.06416818, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01257924, + "epoch": 0.6119344656545919, + "flos": 15892436494080.0, + "grad_norm": 2.6097925851891755, + "language_loss": 0.75949138, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.8363508, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10178, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06418636, + "auxiliary_loss_mlp": 0.01267998, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256435, + "epoch": 0.6119945889072599, + "flos": 21659312298240.0, + "grad_norm": 1.5720284026291744, + "language_loss": 0.67318261, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.75004888, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11553955, + "step": 10179, + "time_per_iteration": 2.5297069549560547 + }, + { + "auxiliary_loss_clip": 0.06419764, + "auxiliary_loss_mlp": 0.01264087, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01253179, + "epoch": 0.6120547121599279, + "flos": 13777746675840.0, + "grad_norm": 1.9709040238374929, + "language_loss": 0.83888078, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.91571933, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10906982, + "step": 10180, + "time_per_iteration": 3.9827919006347656 + }, + { + "auxiliary_loss_clip": 0.06417181, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06276719, + "balance_loss_mlp": 0.01256683, + "epoch": 0.6121148354125958, + "flos": 20084528263680.0, + "grad_norm": 1.549982980411044, + "language_loss": 0.77731764, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.8541739, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11761475, + "step": 10181, + "time_per_iteration": 2.4853463172912598 + }, + { + "auxiliary_loss_clip": 0.06420098, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06277204, + "balance_loss_mlp": 0.01258565, + "epoch": 0.6121749586652638, + "flos": 13474915620480.0, + "grad_norm": 2.0089243925599973, + "language_loss": 0.8071022, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.88400126, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11248779, + "step": 10182, + "time_per_iteration": 2.4935574531555176 + }, + { + "auxiliary_loss_clip": 0.06411545, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.0627587, + "balance_loss_mlp": 0.0125805, + "epoch": 0.6122350819179317, + "flos": 20126721594240.0, + "grad_norm": 1.501667213386016, + "language_loss": 0.83102655, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.90781319, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09069824, + "step": 10183, + "time_per_iteration": 2.5836997032165527 + }, + { + "auxiliary_loss_clip": 0.06327992, + "auxiliary_loss_mlp": 0.01253825, + "balance_loss_clip": 0.06268366, + "balance_loss_mlp": 0.01252147, + "epoch": 0.6122952051705998, + "flos": 65448004700160.0, + "grad_norm": 0.7149962337899693, + "language_loss": 0.62764937, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.70346749, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01681519, + "step": 10184, + "time_per_iteration": 3.3003170490264893 + }, + { + "auxiliary_loss_clip": 0.06420484, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01253857, + "epoch": 0.6123553284232677, + "flos": 20382537709440.0, + "grad_norm": 1.6441224641064962, + "language_loss": 0.82408071, + "learning_rate": 1.379669981812101e-06, + "loss": 0.90092349, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09936523, + "step": 10185, + "time_per_iteration": 2.5150225162506104 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6124154516759357, + "flos": 23994417081600.0, + "grad_norm": 1.7366290964606979, + "language_loss": 0.75121021, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.82812846, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11151123, + "step": 10186, + "time_per_iteration": 2.627387046813965 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.01262607, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252599, + "epoch": 0.6124755749286036, + "flos": 21474927388800.0, + "grad_norm": 1.4642741872217127, + "language_loss": 0.78637451, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.8631596, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10003662, + "step": 10187, + "time_per_iteration": 2.49202561378479 + }, + { + "auxiliary_loss_clip": 0.06414475, + "auxiliary_loss_mlp": 0.01265646, + "balance_loss_clip": 0.06274372, + "balance_loss_mlp": 0.0125472, + "epoch": 0.6125356981812716, + "flos": 23886117279360.0, + "grad_norm": 1.4743912854017487, + "language_loss": 0.83344066, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.91024196, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10925293, + "step": 10188, + "time_per_iteration": 2.555687427520752 + }, + { + "auxiliary_loss_clip": 0.06417944, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01255168, + "epoch": 0.6125958214339395, + "flos": 14430312673920.0, + "grad_norm": 1.6601752905069214, + "language_loss": 0.75527823, + "learning_rate": 1.378189152155896e-06, + "loss": 0.83211589, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10656738, + "step": 10189, + "time_per_iteration": 2.4994595050811768 + }, + { + "auxiliary_loss_clip": 0.06417951, + "auxiliary_loss_mlp": 0.01265327, + "balance_loss_clip": 0.06275356, + "balance_loss_mlp": 0.012543, + "epoch": 0.6126559446866076, + "flos": 23265933684480.0, + "grad_norm": 1.4192081343801892, + "language_loss": 0.74300897, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11016846, + "step": 10190, + "time_per_iteration": 2.6080024242401123 + }, + { + "auxiliary_loss_clip": 0.06418385, + "auxiliary_loss_mlp": 0.01266786, + "balance_loss_clip": 0.0627688, + "balance_loss_mlp": 0.01255044, + "epoch": 0.6127160679392755, + "flos": 26871188584320.0, + "grad_norm": 1.672928736412144, + "language_loss": 0.68484575, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.76169741, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11749268, + "step": 10191, + "time_per_iteration": 2.54805064201355 + }, + { + "auxiliary_loss_clip": 0.06419395, + "auxiliary_loss_mlp": 0.012717, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01259833, + "epoch": 0.6127761911919435, + "flos": 26403720255360.0, + "grad_norm": 1.7824154048725067, + "language_loss": 0.73771405, + "learning_rate": 1.377078777445467e-06, + "loss": 0.81462502, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11859131, + "step": 10192, + "time_per_iteration": 2.556392192840576 + }, + { + "auxiliary_loss_clip": 0.06413901, + "auxiliary_loss_mlp": 0.01263543, + "balance_loss_clip": 0.06275194, + "balance_loss_mlp": 0.01253225, + "epoch": 0.6128363144446115, + "flos": 22640802698880.0, + "grad_norm": 1.814520897334069, + "language_loss": 0.84227109, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.91904557, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10314941, + "step": 10193, + "time_per_iteration": 2.5000216960906982 + }, + { + "auxiliary_loss_clip": 0.06417094, + "auxiliary_loss_mlp": 0.01267497, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256625, + "epoch": 0.6128964376972794, + "flos": 26766033310080.0, + "grad_norm": 2.0280898056271255, + "language_loss": 0.707515, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.78436089, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10870361, + "step": 10194, + "time_per_iteration": 2.5357043743133545 + }, + { + "auxiliary_loss_clip": 0.06330009, + "auxiliary_loss_mlp": 0.01254574, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01252429, + "epoch": 0.6129565609499474, + "flos": 65585500450560.0, + "grad_norm": 0.7963949843311754, + "language_loss": 0.58648682, + "learning_rate": 1.375968615326149e-06, + "loss": 0.66233265, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02146912, + "step": 10195, + "time_per_iteration": 2.935722589492798 + }, + { + "auxiliary_loss_clip": 0.06416507, + "auxiliary_loss_mlp": 0.01269514, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01257873, + "epoch": 0.6130166842026153, + "flos": 16367577471360.0, + "grad_norm": 1.8676293874241905, + "language_loss": 0.69944096, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.77630115, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11639404, + "step": 10196, + "time_per_iteration": 2.522855520248413 + }, + { + "auxiliary_loss_clip": 0.06413607, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01255795, + "epoch": 0.6130768074552834, + "flos": 23658029665920.0, + "grad_norm": 1.6623431982713033, + "language_loss": 0.7114116, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.78820676, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10113525, + "step": 10197, + "time_per_iteration": 2.576241970062256 + }, + { + "auxiliary_loss_clip": 0.06418445, + "auxiliary_loss_mlp": 0.01271491, + "balance_loss_clip": 0.06275209, + "balance_loss_mlp": 0.01260828, + "epoch": 0.6131369307079513, + "flos": 20053613306880.0, + "grad_norm": 1.7635400810353365, + "language_loss": 0.78912157, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.86602092, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10675049, + "step": 10198, + "time_per_iteration": 2.5441195964813232 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01255138, + "epoch": 0.6131970539606193, + "flos": 22678384055040.0, + "grad_norm": 1.422407986186852, + "language_loss": 0.74737686, + "learning_rate": 1.374488730519181e-06, + "loss": 0.82423472, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11236572, + "step": 10199, + "time_per_iteration": 2.567636251449585 + }, + { + "auxiliary_loss_clip": 0.06417924, + "auxiliary_loss_mlp": 0.01269269, + "balance_loss_clip": 0.06272729, + "balance_loss_mlp": 0.01257735, + "epoch": 0.6132571772132872, + "flos": 26878316181120.0, + "grad_norm": 1.5670545162327942, + "language_loss": 0.62008464, + "learning_rate": 1.374118818580993e-06, + "loss": 0.69695652, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11541748, + "step": 10200, + "time_per_iteration": 2.561591863632202 + }, + { + "auxiliary_loss_clip": 0.06416481, + "auxiliary_loss_mlp": 0.01270085, + "balance_loss_clip": 0.06275273, + "balance_loss_mlp": 0.0125944, + "epoch": 0.6133173004659552, + "flos": 22899176363520.0, + "grad_norm": 1.7093296118249273, + "language_loss": 0.69054127, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.76740688, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10644531, + "step": 10201, + "time_per_iteration": 3.9431076049804688 + }, + { + "auxiliary_loss_clip": 0.06409751, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01257462, + "epoch": 0.6133774237186231, + "flos": 20491298709120.0, + "grad_norm": 2.3821613548396368, + "language_loss": 0.83898175, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.91576207, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10809326, + "step": 10202, + "time_per_iteration": 2.496201276779175 + }, + { + "auxiliary_loss_clip": 0.06332828, + "auxiliary_loss_mlp": 0.01255453, + "balance_loss_clip": 0.06274157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6134375469712912, + "flos": 69433643208960.0, + "grad_norm": 0.8530026378603166, + "language_loss": 0.66995066, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.74583346, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01924133, + "step": 10203, + "time_per_iteration": 3.1688590049743652 + }, + { + "auxiliary_loss_clip": 0.06417629, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.0125538, + "epoch": 0.6134976702239591, + "flos": 41291145279360.0, + "grad_norm": 1.6901163598507989, + "language_loss": 0.61053431, + "learning_rate": 1.37263940830327e-06, + "loss": 0.68737298, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10845947, + "step": 10204, + "time_per_iteration": 2.7038605213165283 + }, + { + "auxiliary_loss_clip": 0.06412404, + "auxiliary_loss_mlp": 0.01263093, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01252901, + "epoch": 0.6135577934766271, + "flos": 22353233086080.0, + "grad_norm": 1.6787218918093536, + "language_loss": 0.72929007, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.80604506, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10198975, + "step": 10205, + "time_per_iteration": 2.5766189098358154 + }, + { + "auxiliary_loss_clip": 0.06411709, + "auxiliary_loss_mlp": 0.01265007, + "balance_loss_clip": 0.06273441, + "balance_loss_mlp": 0.01253843, + "epoch": 0.6136179167292951, + "flos": 23734198627200.0, + "grad_norm": 1.5218154078879744, + "language_loss": 0.76180834, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.83857548, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1116333, + "step": 10206, + "time_per_iteration": 2.5717761516571045 + }, + { + "auxiliary_loss_clip": 0.0641268, + "auxiliary_loss_mlp": 0.01265782, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254595, + "epoch": 0.613678039981963, + "flos": 26030757732480.0, + "grad_norm": 2.128320629636919, + "language_loss": 0.7591306, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.83591521, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11181641, + "step": 10207, + "time_per_iteration": 2.5353450775146484 + }, + { + "auxiliary_loss_clip": 0.06418657, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06278594, + "balance_loss_mlp": 0.01253362, + "epoch": 0.613738163234631, + "flos": 9863078175360.0, + "grad_norm": 1.9702213064203427, + "language_loss": 0.82853335, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.90536106, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10748291, + "step": 10208, + "time_per_iteration": 2.4810874462127686 + }, + { + "auxiliary_loss_clip": 0.06422867, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255814, + "epoch": 0.613798286487299, + "flos": 33190380576000.0, + "grad_norm": 1.7610608340758167, + "language_loss": 0.72894984, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.8058551, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1184082, + "step": 10209, + "time_per_iteration": 2.6061112880706787 + }, + { + "auxiliary_loss_clip": 0.06413165, + "auxiliary_loss_mlp": 0.01267749, + "balance_loss_clip": 0.06273563, + "balance_loss_mlp": 0.01257157, + "epoch": 0.613858409739967, + "flos": 25634678682240.0, + "grad_norm": 1.6794559835324834, + "language_loss": 0.74641943, + "learning_rate": 1.37042100685438e-06, + "loss": 0.8232286, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10595703, + "step": 10210, + "time_per_iteration": 2.5699121952056885 + }, + { + "auxiliary_loss_clip": 0.06324588, + "auxiliary_loss_mlp": 0.01253647, + "balance_loss_clip": 0.06266326, + "balance_loss_mlp": 0.01251882, + "epoch": 0.6139185329926349, + "flos": 67213336919040.0, + "grad_norm": 0.8410650121869828, + "language_loss": 0.65019715, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.72597951, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01765442, + "step": 10211, + "time_per_iteration": 3.2996082305908203 + }, + { + "auxiliary_loss_clip": 0.06413533, + "auxiliary_loss_mlp": 0.01270005, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01258889, + "epoch": 0.6139786562453029, + "flos": 21550090101120.0, + "grad_norm": 1.5192132224806107, + "language_loss": 0.75830382, + "learning_rate": 1.369681730544801e-06, + "loss": 0.83513916, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11120605, + "step": 10212, + "time_per_iteration": 3.9495487213134766 + }, + { + "auxiliary_loss_clip": 0.06416361, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01262614, + "epoch": 0.6140387794979708, + "flos": 26075802101760.0, + "grad_norm": 1.4991601562707406, + "language_loss": 0.74122798, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.8181265, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10882568, + "step": 10213, + "time_per_iteration": 2.550542116165161 + }, + { + "auxiliary_loss_clip": 0.06420778, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06275892, + "balance_loss_mlp": 0.01253742, + "epoch": 0.6140989027506388, + "flos": 23701145391360.0, + "grad_norm": 1.8705312076501914, + "language_loss": 0.73641956, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.81327969, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11499023, + "step": 10214, + "time_per_iteration": 2.524115562438965 + }, + { + "auxiliary_loss_clip": 0.06416141, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01253289, + "epoch": 0.6141590260033067, + "flos": 22237428343680.0, + "grad_norm": 1.5033107567748507, + "language_loss": 0.74553859, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.82234401, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11108398, + "step": 10215, + "time_per_iteration": 3.9794795513153076 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01258719, + "epoch": 0.6142191492559748, + "flos": 23877312600960.0, + "grad_norm": 1.5966298517178832, + "language_loss": 0.78681469, + "learning_rate": 1.368203464858542e-06, + "loss": 0.86360973, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10742188, + "step": 10216, + "time_per_iteration": 2.5095551013946533 + }, + { + "auxiliary_loss_clip": 0.06413998, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273836, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6142792725086427, + "flos": 15046764762240.0, + "grad_norm": 2.0499714549796475, + "language_loss": 0.8017531, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.87857044, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10797119, + "step": 10217, + "time_per_iteration": 2.530963897705078 + }, + { + "auxiliary_loss_clip": 0.06415407, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01255789, + "epoch": 0.6143393957613107, + "flos": 23337616452480.0, + "grad_norm": 2.309819184905194, + "language_loss": 0.78097677, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.85779876, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10218, + "time_per_iteration": 2.5020768642425537 + }, + { + "auxiliary_loss_clip": 0.06413251, + "auxiliary_loss_mlp": 0.01268832, + "balance_loss_clip": 0.06275171, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6143995190139786, + "flos": 20122696598400.0, + "grad_norm": 1.7507364905585892, + "language_loss": 0.82176745, + "learning_rate": 1.367095017101569e-06, + "loss": 0.89858824, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10803223, + "step": 10219, + "time_per_iteration": 4.098464250564575 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01271094, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01259602, + "epoch": 0.6144596422666466, + "flos": 42313403491200.0, + "grad_norm": 1.6881627886326696, + "language_loss": 0.66870147, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.74555075, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1149292, + "step": 10220, + "time_per_iteration": 2.724275827407837 + }, + { + "auxiliary_loss_clip": 0.0641406, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06274959, + "balance_loss_mlp": 0.012584, + "epoch": 0.6145197655193146, + "flos": 21578992560000.0, + "grad_norm": 2.2248894315314454, + "language_loss": 0.72078216, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.79761338, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10656738, + "step": 10221, + "time_per_iteration": 2.5253100395202637 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01267039, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01256609, + "epoch": 0.6145798887719826, + "flos": 21477610719360.0, + "grad_norm": 1.6538985449457846, + "language_loss": 0.7942664, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.87105858, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10430908, + "step": 10222, + "time_per_iteration": 2.5524139404296875 + }, + { + "auxiliary_loss_clip": 0.06418169, + "auxiliary_loss_mlp": 0.01267247, + "balance_loss_clip": 0.06275628, + "balance_loss_mlp": 0.01256447, + "epoch": 0.6146400120246506, + "flos": 20783270661120.0, + "grad_norm": 1.750623742282724, + "language_loss": 0.76586866, + "learning_rate": 1.365617422821788e-06, + "loss": 0.84272277, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 10223, + "time_per_iteration": 2.507918119430542 + }, + { + "auxiliary_loss_clip": 0.06413615, + "auxiliary_loss_mlp": 0.01266598, + "balance_loss_clip": 0.06278135, + "balance_loss_mlp": 0.01255392, + "epoch": 0.6147001352773185, + "flos": 13886423821440.0, + "grad_norm": 2.0249480129984287, + "language_loss": 0.78430009, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.86110222, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11212158, + "step": 10224, + "time_per_iteration": 2.5212504863739014 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.0126517, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01255359, + "epoch": 0.6147602585299865, + "flos": 56653920915840.0, + "grad_norm": 1.2562846499273215, + "language_loss": 0.66504145, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.74179292, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09814453, + "step": 10225, + "time_per_iteration": 2.814272880554199 + }, + { + "auxiliary_loss_clip": 0.06418905, + "auxiliary_loss_mlp": 0.01269548, + "balance_loss_clip": 0.06276867, + "balance_loss_mlp": 0.01258884, + "epoch": 0.6148203817826544, + "flos": 32825468044800.0, + "grad_norm": 1.9241791753141533, + "language_loss": 0.6340794, + "learning_rate": 1.364509479649357e-06, + "loss": 0.71096396, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10675049, + "step": 10226, + "time_per_iteration": 2.629307270050049 + }, + { + "auxiliary_loss_clip": 0.06414378, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.0627353, + "balance_loss_mlp": 0.01255303, + "epoch": 0.6148805050353224, + "flos": 18337811650560.0, + "grad_norm": 1.8500325381447646, + "language_loss": 0.76063347, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.83743972, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10949707, + "step": 10227, + "time_per_iteration": 2.5072264671325684 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06274723, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6149406282879903, + "flos": 14069173576320.0, + "grad_norm": 4.1558900532043, + "language_loss": 0.62490618, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.70178151, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.13568115, + "step": 10228, + "time_per_iteration": 2.625681161880493 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01265474, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01254763, + "epoch": 0.6150007515406584, + "flos": 25196909425920.0, + "grad_norm": 1.4129638919460634, + "language_loss": 0.74878526, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.82556051, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1071167, + "step": 10229, + "time_per_iteration": 2.5437581539154053 + }, + { + "auxiliary_loss_clip": 0.06413749, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.0627471, + "balance_loss_mlp": 0.01256876, + "epoch": 0.6150608747933263, + "flos": 21951829301760.0, + "grad_norm": 1.6020000118574074, + "language_loss": 0.78397381, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.86078924, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10906982, + "step": 10230, + "time_per_iteration": 2.530174732208252 + }, + { + "auxiliary_loss_clip": 0.06413004, + "auxiliary_loss_mlp": 0.01266985, + "balance_loss_clip": 0.06270448, + "balance_loss_mlp": 0.01256149, + "epoch": 0.6151209980459943, + "flos": 30125283022080.0, + "grad_norm": 1.40012821108437, + "language_loss": 0.72963595, + "learning_rate": 1.36266338983927e-06, + "loss": 0.80643588, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10827637, + "step": 10231, + "time_per_iteration": 2.5843095779418945 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06271622, + "balance_loss_mlp": 0.01256434, + "epoch": 0.6151811212986622, + "flos": 30016228533120.0, + "grad_norm": 1.7264160083970947, + "language_loss": 0.70266879, + "learning_rate": 1.362294244324858e-06, + "loss": 0.77945286, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10626221, + "step": 10232, + "time_per_iteration": 2.5726914405822754 + }, + { + "auxiliary_loss_clip": 0.06409374, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6152412445513302, + "flos": 18877675507200.0, + "grad_norm": 2.1019570874525484, + "language_loss": 0.92268974, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.99946421, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09960938, + "step": 10233, + "time_per_iteration": 2.475142002105713 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6153013678039982, + "flos": 25710847643520.0, + "grad_norm": 1.7026564571899578, + "language_loss": 0.7220425, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.79882705, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10412598, + "step": 10234, + "time_per_iteration": 2.538825750350952 + }, + { + "auxiliary_loss_clip": 0.06412051, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06270387, + "balance_loss_mlp": 0.01255187, + "epoch": 0.6153614910566662, + "flos": 28517529605760.0, + "grad_norm": 1.8042716232808833, + "language_loss": 0.67118728, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.74796581, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10620117, + "step": 10235, + "time_per_iteration": 2.5539941787719727 + }, + { + "auxiliary_loss_clip": 0.06416909, + "auxiliary_loss_mlp": 0.01269314, + "balance_loss_clip": 0.06272343, + "balance_loss_mlp": 0.01258489, + "epoch": 0.6154216143093342, + "flos": 23556480117120.0, + "grad_norm": 1.5012129447427485, + "language_loss": 0.81535256, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.89221478, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10821533, + "step": 10236, + "time_per_iteration": 2.538961887359619 + }, + { + "auxiliary_loss_clip": 0.06413287, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01258281, + "epoch": 0.6154817375620021, + "flos": 22754804578560.0, + "grad_norm": 1.3960361226739142, + "language_loss": 0.8069132, + "learning_rate": 1.360448879760721e-06, + "loss": 0.88374025, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11138916, + "step": 10237, + "time_per_iteration": 2.5317978858947754 + }, + { + "auxiliary_loss_clip": 0.06410801, + "auxiliary_loss_mlp": 0.01271969, + "balance_loss_clip": 0.06272944, + "balance_loss_mlp": 0.01261198, + "epoch": 0.6155418608146701, + "flos": 27170455841280.0, + "grad_norm": 1.5039507372145677, + "language_loss": 0.76442957, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.84125727, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10772705, + "step": 10238, + "time_per_iteration": 2.5912821292877197 + }, + { + "auxiliary_loss_clip": 0.06320563, + "auxiliary_loss_mlp": 0.01256509, + "balance_loss_clip": 0.06262375, + "balance_loss_mlp": 0.01254774, + "epoch": 0.615601984067338, + "flos": 68828610003840.0, + "grad_norm": 1.135422984419524, + "language_loss": 0.57526618, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.65103698, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.01739502, + "step": 10239, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06415902, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06273024, + "balance_loss_mlp": 0.0125323, + "epoch": 0.615662107320006, + "flos": 15521528396160.0, + "grad_norm": 1.8815161483190883, + "language_loss": 0.77940285, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.8561992, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10498047, + "step": 10240, + "time_per_iteration": 2.4900901317596436 + }, + { + "auxiliary_loss_clip": 0.06418262, + "auxiliary_loss_mlp": 0.01272722, + "balance_loss_clip": 0.06275868, + "balance_loss_mlp": 0.01262017, + "epoch": 0.615722230572674, + "flos": 21069121265280.0, + "grad_norm": 2.263045257123095, + "language_loss": 0.72996962, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.80687952, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1071167, + "step": 10241, + "time_per_iteration": 3.901360511779785 + }, + { + "auxiliary_loss_clip": 0.06409363, + "auxiliary_loss_mlp": 0.01269863, + "balance_loss_clip": 0.0627209, + "balance_loss_mlp": 0.01259873, + "epoch": 0.615782353825342, + "flos": 23263250353920.0, + "grad_norm": 1.504543290987149, + "language_loss": 0.72248924, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.79928148, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09997559, + "step": 10242, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.06411266, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06271993, + "balance_loss_mlp": 0.01255066, + "epoch": 0.6158424770780099, + "flos": 21109972930560.0, + "grad_norm": 2.215067200442713, + "language_loss": 0.7281, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.80486894, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10565186, + "step": 10243, + "time_per_iteration": 2.540512800216675 + }, + { + "auxiliary_loss_clip": 0.06321675, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06263578, + "balance_loss_mlp": 0.01254183, + "epoch": 0.6159026003306779, + "flos": 70355358120960.0, + "grad_norm": 0.7449608811837395, + "language_loss": 0.56762981, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.64340484, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.58105469, + "router_z_loss_mlp": 0.01647949, + "step": 10244, + "time_per_iteration": 3.2194366455078125 + }, + { + "auxiliary_loss_clip": 0.06409553, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06271067, + "balance_loss_mlp": 0.01256855, + "epoch": 0.6159627235833458, + "flos": 33882624282240.0, + "grad_norm": 1.5482958097169006, + "language_loss": 0.63865972, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.71543062, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10675049, + "step": 10245, + "time_per_iteration": 2.640113353729248 + }, + { + "auxiliary_loss_clip": 0.06409854, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01257442, + "epoch": 0.6160228468360138, + "flos": 26582193452160.0, + "grad_norm": 1.6235599905950853, + "language_loss": 0.79032344, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.8670975, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10119629, + "step": 10246, + "time_per_iteration": 2.5686607360839844 + }, + { + "auxiliary_loss_clip": 0.0641896, + "auxiliary_loss_mlp": 0.0127079, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6160829700886818, + "flos": 17197568490240.0, + "grad_norm": 2.4844316843996825, + "language_loss": 0.88253343, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.95943093, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1184082, + "step": 10247, + "time_per_iteration": 2.450960397720337 + }, + { + "auxiliary_loss_clip": 0.06417046, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.06275311, + "balance_loss_mlp": 0.01258028, + "epoch": 0.6161430933413498, + "flos": 23630385018240.0, + "grad_norm": 1.598841912113341, + "language_loss": 0.80267406, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.87952548, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10076904, + "step": 10248, + "time_per_iteration": 2.5717732906341553 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6162032165940178, + "flos": 23009027466240.0, + "grad_norm": 1.6786182085700423, + "language_loss": 0.87678397, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.95355916, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10284424, + "step": 10249, + "time_per_iteration": 2.5637669563293457 + }, + { + "auxiliary_loss_clip": 0.06414458, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06273694, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6162633398466857, + "flos": 39431474962560.0, + "grad_norm": 2.372002019412244, + "language_loss": 0.70129162, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.7780953, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10668945, + "step": 10250, + "time_per_iteration": 2.700856924057007 + }, + { + "auxiliary_loss_clip": 0.06403701, + "auxiliary_loss_mlp": 0.01263182, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.0125386, + "epoch": 0.6163234630993537, + "flos": 19250679957120.0, + "grad_norm": 1.6751579708994577, + "language_loss": 0.74076283, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.81743157, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09320068, + "step": 10251, + "time_per_iteration": 3.9032137393951416 + }, + { + "auxiliary_loss_clip": 0.06412694, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06272181, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6163835863520216, + "flos": 15967389571200.0, + "grad_norm": 1.9695671027525665, + "language_loss": 0.69094777, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.76772505, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.09753418, + "step": 10252, + "time_per_iteration": 2.546041250228882 + }, + { + "auxiliary_loss_clip": 0.06321114, + "auxiliary_loss_mlp": 0.01253403, + "balance_loss_clip": 0.06262837, + "balance_loss_mlp": 0.01252003, + "epoch": 0.6164437096046896, + "flos": 68124905487360.0, + "grad_norm": 0.8614248496363994, + "language_loss": 0.57690394, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.6526491, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01400757, + "step": 10253, + "time_per_iteration": 3.1977267265319824 + }, + { + "auxiliary_loss_clip": 0.06417613, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06274711, + "balance_loss_mlp": 0.01257783, + "epoch": 0.6165038328573575, + "flos": 21367633835520.0, + "grad_norm": 1.503369483441608, + "language_loss": 0.79960692, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.876468, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1071167, + "step": 10254, + "time_per_iteration": 3.95928692817688 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.01264054, + "balance_loss_clip": 0.06276255, + "balance_loss_mlp": 0.01253128, + "epoch": 0.6165639561100256, + "flos": 21107708870400.0, + "grad_norm": 1.746255949432921, + "language_loss": 0.81143081, + "learning_rate": 1.353810600008846e-06, + "loss": 0.88826168, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10925293, + "step": 10255, + "time_per_iteration": 2.5300750732421875 + }, + { + "auxiliary_loss_clip": 0.06416211, + "auxiliary_loss_mlp": 0.01266666, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01255371, + "epoch": 0.6166240793626935, + "flos": 25345683550080.0, + "grad_norm": 1.880965378472566, + "language_loss": 0.65514123, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.73196995, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11291504, + "step": 10256, + "time_per_iteration": 2.539006233215332 + }, + { + "auxiliary_loss_clip": 0.06415517, + "auxiliary_loss_mlp": 0.01267871, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.0125806, + "epoch": 0.6166842026153615, + "flos": 19688742702720.0, + "grad_norm": 1.5659047978931129, + "language_loss": 0.72409272, + "learning_rate": 1.353073501949825e-06, + "loss": 0.80092663, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09802246, + "step": 10257, + "time_per_iteration": 2.5153865814208984 + }, + { + "auxiliary_loss_clip": 0.06416216, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01253788, + "epoch": 0.6167443258680294, + "flos": 19324501004160.0, + "grad_norm": 1.6557108650811327, + "language_loss": 0.71972775, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.79653382, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.1060791, + "step": 10258, + "time_per_iteration": 2.480304718017578 + }, + { + "auxiliary_loss_clip": 0.06417316, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275502, + "balance_loss_mlp": 0.01253222, + "epoch": 0.6168044491206974, + "flos": 25272323700480.0, + "grad_norm": 1.9257678582667488, + "language_loss": 0.63553512, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.71234685, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10638428, + "step": 10259, + "time_per_iteration": 4.02075719833374 + }, + { + "auxiliary_loss_clip": 0.06410451, + "auxiliary_loss_mlp": 0.0126865, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6168645723733654, + "flos": 13224130750080.0, + "grad_norm": 1.6228127894065456, + "language_loss": 0.71578032, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.79257131, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10748291, + "step": 10260, + "time_per_iteration": 2.4910624027252197 + }, + { + "auxiliary_loss_clip": 0.06424432, + "auxiliary_loss_mlp": 0.01268478, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01256492, + "epoch": 0.6169246956260334, + "flos": 26659410589440.0, + "grad_norm": 1.7088590339487795, + "language_loss": 0.68640685, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.76333594, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11981201, + "step": 10261, + "time_per_iteration": 2.5747649669647217 + }, + { + "auxiliary_loss_clip": 0.06414127, + "auxiliary_loss_mlp": 0.01264284, + "balance_loss_clip": 0.06274065, + "balance_loss_mlp": 0.01254151, + "epoch": 0.6169848188787014, + "flos": 23155034405760.0, + "grad_norm": 1.7119551141937153, + "language_loss": 0.71845949, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.79524362, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10131836, + "step": 10262, + "time_per_iteration": 2.560232162475586 + }, + { + "auxiliary_loss_clip": 0.06416971, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6170449421313693, + "flos": 23338748482560.0, + "grad_norm": 1.8792858261778465, + "language_loss": 0.70386994, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.7806955, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11157227, + "step": 10263, + "time_per_iteration": 2.5188369750976562 + }, + { + "auxiliary_loss_clip": 0.06418619, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6171050653840373, + "flos": 15857077271040.0, + "grad_norm": 2.3172465393141404, + "language_loss": 0.76572752, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.84258133, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10150146, + "step": 10264, + "time_per_iteration": 2.525599956512451 + }, + { + "auxiliary_loss_clip": 0.06414546, + "auxiliary_loss_mlp": 0.01266705, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01255726, + "epoch": 0.6171651886367052, + "flos": 20051349246720.0, + "grad_norm": 2.349171582745048, + "language_loss": 0.85150325, + "learning_rate": 1.350126092092247e-06, + "loss": 0.92831576, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10992432, + "step": 10265, + "time_per_iteration": 2.5084152221679688 + }, + { + "auxiliary_loss_clip": 0.06410134, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6172253118893732, + "flos": 26439959946240.0, + "grad_norm": 2.0102817715219112, + "language_loss": 0.64766055, + "learning_rate": 1.349757776608153e-06, + "loss": 0.72441077, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10791016, + "step": 10266, + "time_per_iteration": 2.5796725749969482 + }, + { + "auxiliary_loss_clip": 0.06410654, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06270823, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6172854351420412, + "flos": 22638622492800.0, + "grad_norm": 1.5096082169739153, + "language_loss": 0.76070148, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.83748215, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10150146, + "step": 10267, + "time_per_iteration": 2.5105693340301514 + }, + { + "auxiliary_loss_clip": 0.06419747, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01254066, + "epoch": 0.6173455583947092, + "flos": 21218943565440.0, + "grad_norm": 1.6454778934730863, + "language_loss": 0.7525773, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.82942522, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10974121, + "step": 10268, + "time_per_iteration": 2.587233543395996 + }, + { + "auxiliary_loss_clip": 0.06419453, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01256396, + "epoch": 0.6174056816473771, + "flos": 19506370291200.0, + "grad_norm": 1.5800856340056704, + "language_loss": 0.75772798, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.83459222, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10583496, + "step": 10269, + "time_per_iteration": 2.4955811500549316 + }, + { + "auxiliary_loss_clip": 0.06411718, + "auxiliary_loss_mlp": 0.01267212, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6174658049000451, + "flos": 16002790721280.0, + "grad_norm": 2.3324483712409685, + "language_loss": 0.76473081, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.84152013, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10662842, + "step": 10270, + "time_per_iteration": 2.5138041973114014 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01270386, + "balance_loss_clip": 0.06274129, + "balance_loss_mlp": 0.0125986, + "epoch": 0.617525928152713, + "flos": 21909635971200.0, + "grad_norm": 1.7440039477364133, + "language_loss": 0.82272917, + "learning_rate": 1.347916569325736e-06, + "loss": 0.89957708, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10522461, + "step": 10271, + "time_per_iteration": 2.488560676574707 + }, + { + "auxiliary_loss_clip": 0.06416266, + "auxiliary_loss_mlp": 0.01264784, + "balance_loss_clip": 0.06273527, + "balance_loss_mlp": 0.01254801, + "epoch": 0.617586051405381, + "flos": 21112362771840.0, + "grad_norm": 1.4517106193495921, + "language_loss": 0.77416623, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.85097671, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.09985352, + "step": 10272, + "time_per_iteration": 2.520111560821533 + }, + { + "auxiliary_loss_clip": 0.06312063, + "auxiliary_loss_mlp": 0.01254406, + "balance_loss_clip": 0.06253687, + "balance_loss_mlp": 0.01252749, + "epoch": 0.617646174658049, + "flos": 58629129684480.0, + "grad_norm": 0.7932568322885909, + "language_loss": 0.59031951, + "learning_rate": 1.347180259404513e-06, + "loss": 0.66598421, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01660156, + "step": 10273, + "time_per_iteration": 2.9967992305755615 + }, + { + "auxiliary_loss_clip": 0.0640862, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.01254274, + "epoch": 0.617706297910717, + "flos": 13883363147520.0, + "grad_norm": 2.2785278271278897, + "language_loss": 0.73286194, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.80959731, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10632324, + "step": 10274, + "time_per_iteration": 2.4770405292510986 + }, + { + "auxiliary_loss_clip": 0.06412372, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.01255713, + "epoch": 0.617766421163385, + "flos": 19214482193280.0, + "grad_norm": 1.605129158536194, + "language_loss": 0.77453375, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.85132062, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.1060791, + "step": 10275, + "time_per_iteration": 2.4878437519073486 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01271601, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01261134, + "epoch": 0.6178265444160529, + "flos": 22572725656320.0, + "grad_norm": 1.5524938527976675, + "language_loss": 0.79471135, + "learning_rate": 1.346075980219998e-06, + "loss": 0.87152702, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10461426, + "step": 10276, + "time_per_iteration": 2.644413709640503 + }, + { + "auxiliary_loss_clip": 0.06416178, + "auxiliary_loss_mlp": 0.0126935, + "balance_loss_clip": 0.06274026, + "balance_loss_mlp": 0.01258192, + "epoch": 0.6178866676687209, + "flos": 11989130221440.0, + "grad_norm": 2.611664280498841, + "language_loss": 0.81007028, + "learning_rate": 1.345707936733612e-06, + "loss": 0.88692558, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1114502, + "step": 10277, + "time_per_iteration": 2.497955799102783 + }, + { + "auxiliary_loss_clip": 0.06418674, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256381, + "epoch": 0.6179467909213888, + "flos": 20997061153920.0, + "grad_norm": 1.6653557744536012, + "language_loss": 0.81855345, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.89541304, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10925293, + "step": 10278, + "time_per_iteration": 2.529439687728882 + }, + { + "auxiliary_loss_clip": 0.06410799, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.0125394, + "epoch": 0.6180069141740568, + "flos": 25345180425600.0, + "grad_norm": 1.5510866303043802, + "language_loss": 0.74313521, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.81988013, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09753418, + "step": 10279, + "time_per_iteration": 2.5355474948883057 + }, + { + "auxiliary_loss_clip": 0.06408358, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06268996, + "balance_loss_mlp": 0.0125316, + "epoch": 0.6180670374267248, + "flos": 19651748325120.0, + "grad_norm": 1.3695497899575455, + "language_loss": 0.70764935, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.78436339, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09887695, + "step": 10280, + "time_per_iteration": 3.9792449474334717 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01267828, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01256873, + "epoch": 0.6181271606793928, + "flos": 19471136849280.0, + "grad_norm": 1.3977623720923391, + "language_loss": 0.73107064, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.8079195, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10943604, + "step": 10281, + "time_per_iteration": 2.515800952911377 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256927, + "epoch": 0.6181872839320607, + "flos": 25601541592320.0, + "grad_norm": 1.5934743777966283, + "language_loss": 0.76599932, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.84277344, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09924316, + "step": 10282, + "time_per_iteration": 2.5432822704315186 + }, + { + "auxiliary_loss_clip": 0.06415926, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01254884, + "epoch": 0.6182474071847287, + "flos": 25558048523520.0, + "grad_norm": 1.5342450755249748, + "language_loss": 0.69123679, + "learning_rate": 1.343500197330931e-06, + "loss": 0.76806307, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1182251, + "step": 10283, + "time_per_iteration": 2.588545322418213 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.0126698, + "balance_loss_clip": 0.06273957, + "balance_loss_mlp": 0.01255607, + "epoch": 0.6183075304373966, + "flos": 22129673592960.0, + "grad_norm": 1.473012438045687, + "language_loss": 0.75165606, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.82855296, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11364746, + "step": 10284, + "time_per_iteration": 2.4986348152160645 + }, + { + "auxiliary_loss_clip": 0.06405671, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01259034, + "epoch": 0.6183676536900646, + "flos": 22462161793920.0, + "grad_norm": 1.4548798471123576, + "language_loss": 0.75635868, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.83310193, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09619141, + "step": 10285, + "time_per_iteration": 2.585350513458252 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.0126635, + "balance_loss_clip": 0.06269899, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6184277769427327, + "flos": 23370250417920.0, + "grad_norm": 1.3734994412846095, + "language_loss": 0.72883123, + "learning_rate": 1.342396663517503e-06, + "loss": 0.80559498, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10286, + "time_per_iteration": 2.569110870361328 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01268421, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.01257311, + "epoch": 0.6184879001954006, + "flos": 22717684419840.0, + "grad_norm": 1.5486281180664692, + "language_loss": 0.76501298, + "learning_rate": 1.342028868767199e-06, + "loss": 0.84181046, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11108398, + "step": 10287, + "time_per_iteration": 2.5511634349823 + }, + { + "auxiliary_loss_clip": 0.06411948, + "auxiliary_loss_mlp": 0.01264572, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01253587, + "epoch": 0.6185480234480686, + "flos": 23848703631360.0, + "grad_norm": 1.5880408145773481, + "language_loss": 0.73586667, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.81263179, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10986328, + "step": 10288, + "time_per_iteration": 2.507291555404663 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01263119, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.0125264, + "epoch": 0.6186081467007365, + "flos": 45487932877440.0, + "grad_norm": 1.4570853227015406, + "language_loss": 0.73074299, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.80746555, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10473633, + "step": 10289, + "time_per_iteration": 2.7538769245147705 + }, + { + "auxiliary_loss_clip": 0.0641107, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06269012, + "balance_loss_mlp": 0.01257468, + "epoch": 0.6186682699534045, + "flos": 23557737928320.0, + "grad_norm": 1.4253961785396534, + "language_loss": 0.79380536, + "learning_rate": 1.340925634274056e-06, + "loss": 0.87060177, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11090088, + "step": 10290, + "time_per_iteration": 2.532860040664673 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06273635, + "balance_loss_mlp": 0.01258374, + "epoch": 0.6187283932060724, + "flos": 25781062965120.0, + "grad_norm": 1.5195693495374782, + "language_loss": 0.81756544, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.89443594, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11022949, + "step": 10291, + "time_per_iteration": 3.985360860824585 + }, + { + "auxiliary_loss_clip": 0.06414646, + "auxiliary_loss_mlp": 0.0126579, + "balance_loss_clip": 0.06274836, + "balance_loss_mlp": 0.01255967, + "epoch": 0.6187885164587404, + "flos": 25272281773440.0, + "grad_norm": 5.259543114674327, + "language_loss": 0.78044999, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.85725427, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.09820557, + "step": 10292, + "time_per_iteration": 2.5699048042297363 + }, + { + "auxiliary_loss_clip": 0.06421922, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06274973, + "balance_loss_mlp": 0.01257285, + "epoch": 0.6188486397114084, + "flos": 26258090659200.0, + "grad_norm": 2.757581205213687, + "language_loss": 0.73825526, + "learning_rate": 1.339822624710401e-06, + "loss": 0.81516558, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11816406, + "step": 10293, + "time_per_iteration": 4.005521774291992 + }, + { + "auxiliary_loss_clip": 0.06414802, + "auxiliary_loss_mlp": 0.01268302, + "balance_loss_clip": 0.06274456, + "balance_loss_mlp": 0.0125721, + "epoch": 0.6189087629640764, + "flos": 20929738798080.0, + "grad_norm": 1.751787926809697, + "language_loss": 0.83461618, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.91144723, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11096191, + "step": 10294, + "time_per_iteration": 2.5416274070739746 + }, + { + "auxiliary_loss_clip": 0.06413339, + "auxiliary_loss_mlp": 0.01271366, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01260434, + "epoch": 0.6189688862167443, + "flos": 14835070621440.0, + "grad_norm": 2.3983238935990525, + "language_loss": 0.70671308, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.7835601, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10931396, + "step": 10295, + "time_per_iteration": 2.474698781967163 + }, + { + "auxiliary_loss_clip": 0.06411821, + "auxiliary_loss_mlp": 0.01272777, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01261494, + "epoch": 0.6190290094694123, + "flos": 24292803870720.0, + "grad_norm": 1.4317659849997142, + "language_loss": 0.69952327, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.77636921, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11291504, + "step": 10296, + "time_per_iteration": 2.618892192840576 + }, + { + "auxiliary_loss_clip": 0.06412887, + "auxiliary_loss_mlp": 0.01267051, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.0125547, + "epoch": 0.6190891327220802, + "flos": 22536192476160.0, + "grad_norm": 1.9563521083429962, + "language_loss": 0.71887541, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.7956748, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11584473, + "step": 10297, + "time_per_iteration": 2.5115151405334473 + }, + { + "auxiliary_loss_clip": 0.0631431, + "auxiliary_loss_mlp": 0.01254184, + "balance_loss_clip": 0.0625589, + "balance_loss_mlp": 0.01252958, + "epoch": 0.6191492559747482, + "flos": 67748756509440.0, + "grad_norm": 0.8712851262632907, + "language_loss": 0.64291644, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.71860135, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01225281, + "step": 10298, + "time_per_iteration": 3.0254995822906494 + }, + { + "auxiliary_loss_clip": 0.06415632, + "auxiliary_loss_mlp": 0.01266663, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01255517, + "epoch": 0.6192093792274163, + "flos": 22353316940160.0, + "grad_norm": 1.6622389387462033, + "language_loss": 0.73995864, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.81678164, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11157227, + "step": 10299, + "time_per_iteration": 3.9369277954101562 + }, + { + "auxiliary_loss_clip": 0.06421331, + "auxiliary_loss_mlp": 0.01268355, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.01257054, + "epoch": 0.6192695024800842, + "flos": 13559176500480.0, + "grad_norm": 1.5604516058647369, + "language_loss": 0.68912721, + "learning_rate": 1.337249812568732e-06, + "loss": 0.76602411, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11297607, + "step": 10300, + "time_per_iteration": 2.462852716445923 + }, + { + "auxiliary_loss_clip": 0.06414428, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255241, + "epoch": 0.6193296257327522, + "flos": 17420163661440.0, + "grad_norm": 1.6482033452585196, + "language_loss": 0.67021179, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.74702382, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11529541, + "step": 10301, + "time_per_iteration": 2.496779680252075 + }, + { + "auxiliary_loss_clip": 0.06414926, + "auxiliary_loss_mlp": 0.01266961, + "balance_loss_clip": 0.06272815, + "balance_loss_mlp": 0.01256411, + "epoch": 0.6193897489854201, + "flos": 31108869774720.0, + "grad_norm": 1.608536765976836, + "language_loss": 0.72948015, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.80629897, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10546875, + "step": 10302, + "time_per_iteration": 2.5844531059265137 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.06273288, + "balance_loss_mlp": 0.01258038, + "epoch": 0.6194498722380881, + "flos": 19139822605440.0, + "grad_norm": 1.7442373384203957, + "language_loss": 0.81269908, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.88953209, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11303711, + "step": 10303, + "time_per_iteration": 2.527067184448242 + }, + { + "auxiliary_loss_clip": 0.06420361, + "auxiliary_loss_mlp": 0.01268221, + "balance_loss_clip": 0.06272827, + "balance_loss_mlp": 0.01255274, + "epoch": 0.619509995490756, + "flos": 21841517001600.0, + "grad_norm": 1.6019319576417599, + "language_loss": 0.76846468, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12957764, + "step": 10304, + "time_per_iteration": 2.4880640506744385 + }, + { + "auxiliary_loss_clip": 0.06424797, + "auxiliary_loss_mlp": 0.01268109, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.0125617, + "epoch": 0.619570118743424, + "flos": 23813512116480.0, + "grad_norm": 1.7485917713195505, + "language_loss": 0.77554089, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.85246998, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1194458, + "step": 10305, + "time_per_iteration": 2.5362794399261475 + }, + { + "auxiliary_loss_clip": 0.06418667, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.0125508, + "epoch": 0.619630241996092, + "flos": 21107289600000.0, + "grad_norm": 1.5608682149054525, + "language_loss": 0.79292911, + "learning_rate": 1.335045524968045e-06, + "loss": 0.86978668, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12005615, + "step": 10306, + "time_per_iteration": 2.5073060989379883 + }, + { + "auxiliary_loss_clip": 0.0640957, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06271125, + "balance_loss_mlp": 0.01258067, + "epoch": 0.61969036524876, + "flos": 27315666167040.0, + "grad_norm": 1.5979283875043302, + "language_loss": 0.80772972, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.88450187, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09576416, + "step": 10307, + "time_per_iteration": 2.576525926589966 + }, + { + "auxiliary_loss_clip": 0.06313084, + "auxiliary_loss_mlp": 0.01252494, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.01251256, + "epoch": 0.6197504885014279, + "flos": 51667308403200.0, + "grad_norm": 0.783320902533958, + "language_loss": 0.59562945, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.67128521, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01237488, + "step": 10308, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06410602, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254191, + "epoch": 0.6198106117540959, + "flos": 30565316338560.0, + "grad_norm": 1.6157907948964547, + "language_loss": 0.68128729, + "learning_rate": 1.333943721384037e-06, + "loss": 0.75804067, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10540771, + "step": 10309, + "time_per_iteration": 2.5872271060943604 + }, + { + "auxiliary_loss_clip": 0.06412695, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06273058, + "balance_loss_mlp": 0.01257108, + "epoch": 0.6198707350067638, + "flos": 18914586030720.0, + "grad_norm": 1.6991122803597551, + "language_loss": 0.725124, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.80193126, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10925293, + "step": 10310, + "time_per_iteration": 2.5339155197143555 + }, + { + "auxiliary_loss_clip": 0.0642102, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06275747, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6199308582594318, + "flos": 21440238998400.0, + "grad_norm": 1.796323815916351, + "language_loss": 0.78780711, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.86468887, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12176514, + "step": 10311, + "time_per_iteration": 2.5148420333862305 + }, + { + "auxiliary_loss_clip": 0.06414344, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6199909815120999, + "flos": 18413561341440.0, + "grad_norm": 2.1642456621818935, + "language_loss": 0.72494328, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.80176294, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11175537, + "step": 10312, + "time_per_iteration": 2.5287880897521973 + }, + { + "auxiliary_loss_clip": 0.0642011, + "auxiliary_loss_mlp": 0.01266003, + "balance_loss_clip": 0.06274375, + "balance_loss_mlp": 0.01254744, + "epoch": 0.6200511047647678, + "flos": 21472663328640.0, + "grad_norm": 5.562964449835012, + "language_loss": 0.72224271, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.1126709, + "step": 10313, + "time_per_iteration": 2.5028812885284424 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01266585, + "balance_loss_clip": 0.06275584, + "balance_loss_mlp": 0.01254521, + "epoch": 0.6201112280174358, + "flos": 18220539461760.0, + "grad_norm": 1.7747609453089435, + "language_loss": 0.78361583, + "learning_rate": 1.332107887401416e-06, + "loss": 0.86050892, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12054443, + "step": 10314, + "time_per_iteration": 2.5241122245788574 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06273148, + "balance_loss_mlp": 0.01253723, + "epoch": 0.6201713512701037, + "flos": 20017373616000.0, + "grad_norm": 1.7540334225503873, + "language_loss": 0.78008437, + "learning_rate": 1.331740796528812e-06, + "loss": 0.8568911, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10925293, + "step": 10315, + "time_per_iteration": 2.515916585922241 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06271202, + "balance_loss_mlp": 0.01257719, + "epoch": 0.6202314745227717, + "flos": 22493537948160.0, + "grad_norm": 2.219101181270965, + "language_loss": 0.76005399, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.83691716, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10766602, + "step": 10316, + "time_per_iteration": 2.5367636680603027 + }, + { + "auxiliary_loss_clip": 0.06417404, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06271914, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6202915977754396, + "flos": 26835116601600.0, + "grad_norm": 1.8483221587209677, + "language_loss": 0.77761883, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.8544724, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11004639, + "step": 10317, + "time_per_iteration": 2.5396320819854736 + }, + { + "auxiliary_loss_clip": 0.06315257, + "auxiliary_loss_mlp": 0.01256399, + "balance_loss_clip": 0.0625724, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6203517210281076, + "flos": 62763248828160.0, + "grad_norm": 0.6893904060556487, + "language_loss": 0.58856946, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.66428602, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01377869, + "step": 10318, + "time_per_iteration": 3.1691195964813232 + }, + { + "auxiliary_loss_clip": 0.06414767, + "auxiliary_loss_mlp": 0.01270191, + "balance_loss_clip": 0.06272453, + "balance_loss_mlp": 0.01258425, + "epoch": 0.6204118442807756, + "flos": 23411018229120.0, + "grad_norm": 1.7666446205430133, + "language_loss": 0.78163171, + "learning_rate": 1.330272686582143e-06, + "loss": 0.85848129, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11755371, + "step": 10319, + "time_per_iteration": 2.5313587188720703 + }, + { + "auxiliary_loss_clip": 0.06410229, + "auxiliary_loss_mlp": 0.01267722, + "balance_loss_clip": 0.06271461, + "balance_loss_mlp": 0.01257589, + "epoch": 0.6204719675334436, + "flos": 20199871808640.0, + "grad_norm": 1.5707406021720693, + "language_loss": 0.66525, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.74202955, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 10320, + "time_per_iteration": 3.8696272373199463 + }, + { + "auxiliary_loss_clip": 0.06407389, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06270511, + "balance_loss_mlp": 0.01255025, + "epoch": 0.6205320907861115, + "flos": 13193048085120.0, + "grad_norm": 1.6249727148286428, + "language_loss": 0.76339847, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.84012175, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09912109, + "step": 10321, + "time_per_iteration": 2.4867870807647705 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270664, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6205922140387795, + "flos": 20674761223680.0, + "grad_norm": 1.5610091783179405, + "language_loss": 0.74460745, + "learning_rate": 1.329171870732758e-06, + "loss": 0.82137096, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10693359, + "step": 10322, + "time_per_iteration": 2.506465196609497 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272407, + "balance_loss_mlp": 0.01255275, + "epoch": 0.6206523372914474, + "flos": 23884524051840.0, + "grad_norm": 1.6823894915828839, + "language_loss": 0.72711974, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.80387706, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09667969, + "step": 10323, + "time_per_iteration": 2.5490479469299316 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06274472, + "balance_loss_mlp": 0.01257322, + "epoch": 0.6207124605441154, + "flos": 13411576333440.0, + "grad_norm": 31.978129858103646, + "language_loss": 0.59017056, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.66709483, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10876465, + "step": 10324, + "time_per_iteration": 2.5541300773620605 + }, + { + "auxiliary_loss_clip": 0.0641806, + "auxiliary_loss_mlp": 0.01267454, + "balance_loss_clip": 0.06274732, + "balance_loss_mlp": 0.01255664, + "epoch": 0.6207725837967835, + "flos": 18922300606080.0, + "grad_norm": 1.723600813321157, + "language_loss": 0.76792443, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.84477955, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11791992, + "step": 10325, + "time_per_iteration": 2.5330686569213867 + }, + { + "auxiliary_loss_clip": 0.06421543, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06275088, + "balance_loss_mlp": 0.01256207, + "epoch": 0.6208327070494514, + "flos": 23985738184320.0, + "grad_norm": 1.8229064209367492, + "language_loss": 0.72747815, + "learning_rate": 1.327704472462003e-06, + "loss": 0.80436671, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11120605, + "step": 10326, + "time_per_iteration": 2.5343799591064453 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06274612, + "balance_loss_mlp": 0.0125687, + "epoch": 0.6208928303021194, + "flos": 22827032398080.0, + "grad_norm": 1.9354170249209526, + "language_loss": 0.73989004, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.81677705, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11950684, + "step": 10327, + "time_per_iteration": 2.555742025375366 + }, + { + "auxiliary_loss_clip": 0.06417272, + "auxiliary_loss_mlp": 0.0126664, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255261, + "epoch": 0.6209529535547873, + "flos": 17569944034560.0, + "grad_norm": 2.1609251311460493, + "language_loss": 0.80099189, + "learning_rate": 1.326970926232066e-06, + "loss": 0.8778311, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11364746, + "step": 10328, + "time_per_iteration": 2.4839911460876465 + }, + { + "auxiliary_loss_clip": 0.06413457, + "auxiliary_loss_mlp": 0.0126611, + "balance_loss_clip": 0.06270879, + "balance_loss_mlp": 0.01254791, + "epoch": 0.6210130768074553, + "flos": 22017432648960.0, + "grad_norm": 1.8104585499122046, + "language_loss": 0.78316593, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.85996157, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11322021, + "step": 10329, + "time_per_iteration": 2.551748514175415 + }, + { + "auxiliary_loss_clip": 0.06317136, + "auxiliary_loss_mlp": 0.01252093, + "balance_loss_clip": 0.0625931, + "balance_loss_mlp": 0.0125077, + "epoch": 0.6210732000601232, + "flos": 63695166739200.0, + "grad_norm": 0.8181079803134828, + "language_loss": 0.62296569, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.69865799, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.013237, + "step": 10330, + "time_per_iteration": 4.52486252784729 + }, + { + "auxiliary_loss_clip": 0.06422883, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06276384, + "balance_loss_mlp": 0.01256275, + "epoch": 0.6211333233127913, + "flos": 24250233196800.0, + "grad_norm": 2.0105352809521517, + "language_loss": 0.77933174, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.85624135, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11791992, + "step": 10331, + "time_per_iteration": 2.558311939239502 + }, + { + "auxiliary_loss_clip": 0.06423557, + "auxiliary_loss_mlp": 0.01267101, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6211934465654592, + "flos": 16949047680000.0, + "grad_norm": 2.3537089497540147, + "language_loss": 0.67977309, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.75667971, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11950684, + "step": 10332, + "time_per_iteration": 2.4883179664611816 + }, + { + "auxiliary_loss_clip": 0.06419694, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.01255677, + "epoch": 0.6212535698181272, + "flos": 15272672169600.0, + "grad_norm": 1.3382118578807503, + "language_loss": 0.76498306, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.84184092, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10418701, + "step": 10333, + "time_per_iteration": 3.9705252647399902 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01267678, + "balance_loss_clip": 0.06275988, + "balance_loss_mlp": 0.012563, + "epoch": 0.6213136930707951, + "flos": 13449073835520.0, + "grad_norm": 2.1789310130446227, + "language_loss": 0.70102298, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.77784514, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11383057, + "step": 10334, + "time_per_iteration": 2.5797176361083984 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06275611, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6213738163234631, + "flos": 18116641998720.0, + "grad_norm": 1.637338123067712, + "language_loss": 0.70408571, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.78087658, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10137939, + "step": 10335, + "time_per_iteration": 2.482482671737671 + }, + { + "auxiliary_loss_clip": 0.06413939, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.0627524, + "balance_loss_mlp": 0.01257185, + "epoch": 0.621433939576131, + "flos": 25344299957760.0, + "grad_norm": 1.5093006351890013, + "language_loss": 0.80123997, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.87805557, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10430908, + "step": 10336, + "time_per_iteration": 2.5523369312286377 + }, + { + "auxiliary_loss_clip": 0.06410298, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06271983, + "balance_loss_mlp": 0.0125848, + "epoch": 0.621494062828799, + "flos": 22572306385920.0, + "grad_norm": 1.6169920799644502, + "language_loss": 0.73330015, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.81009233, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 10337, + "time_per_iteration": 2.4964675903320312 + }, + { + "auxiliary_loss_clip": 0.0642301, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06278226, + "balance_loss_mlp": 0.012548, + "epoch": 0.621554186081467, + "flos": 27425433415680.0, + "grad_norm": 1.8853547327091988, + "language_loss": 0.63167447, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.70857, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11755371, + "step": 10338, + "time_per_iteration": 4.016883611679077 + }, + { + "auxiliary_loss_clip": 0.06417143, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01257787, + "epoch": 0.621614309334135, + "flos": 22353484648320.0, + "grad_norm": 1.7306917238363975, + "language_loss": 0.71876323, + "learning_rate": 1.322938249724991e-06, + "loss": 0.79561794, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10546875, + "step": 10339, + "time_per_iteration": 2.5129294395446777 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266092, + "balance_loss_clip": 0.06274111, + "balance_loss_mlp": 0.0125519, + "epoch": 0.621674432586803, + "flos": 19287255064320.0, + "grad_norm": 1.654477546235719, + "language_loss": 0.69824433, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.77501559, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10906982, + "step": 10340, + "time_per_iteration": 2.491989850997925 + }, + { + "auxiliary_loss_clip": 0.0641477, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06276464, + "balance_loss_mlp": 0.01255074, + "epoch": 0.6217345558394709, + "flos": 21614812980480.0, + "grad_norm": 1.760593238290477, + "language_loss": 0.68765497, + "learning_rate": 1.322205369037788e-06, + "loss": 0.76445758, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10430908, + "step": 10341, + "time_per_iteration": 2.6119179725646973 + }, + { + "auxiliary_loss_clip": 0.06421542, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06278797, + "balance_loss_mlp": 0.01256089, + "epoch": 0.6217946790921389, + "flos": 18009893496960.0, + "grad_norm": 2.3031674054515867, + "language_loss": 0.81059158, + "learning_rate": 1.321838967240299e-06, + "loss": 0.88748062, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11273193, + "step": 10342, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.0631469, + "auxiliary_loss_mlp": 0.0125491, + "balance_loss_clip": 0.0625717, + "balance_loss_mlp": 0.01253292, + "epoch": 0.6218548023448068, + "flos": 61993578349440.0, + "grad_norm": 0.8110464269458239, + "language_loss": 0.5724324, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.64812839, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01620483, + "step": 10343, + "time_per_iteration": 3.0396130084991455 + }, + { + "auxiliary_loss_clip": 0.06411558, + "auxiliary_loss_mlp": 0.01264969, + "balance_loss_clip": 0.06274949, + "balance_loss_mlp": 0.01254812, + "epoch": 0.6219149255974749, + "flos": 25746248793600.0, + "grad_norm": 1.838833235576279, + "language_loss": 0.73063612, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.80740142, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1015625, + "step": 10344, + "time_per_iteration": 2.5173933506011963 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06273273, + "balance_loss_mlp": 0.01253803, + "epoch": 0.6219750488501428, + "flos": 25418162931840.0, + "grad_norm": 2.137498021001217, + "language_loss": 0.60161531, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.67839766, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10699463, + "step": 10345, + "time_per_iteration": 2.5472302436828613 + }, + { + "auxiliary_loss_clip": 0.06417334, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253357, + "epoch": 0.6220351721028108, + "flos": 20053529452800.0, + "grad_norm": 2.827284227984571, + "language_loss": 0.78566015, + "learning_rate": 1.320373617348614e-06, + "loss": 0.86247778, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11083984, + "step": 10346, + "time_per_iteration": 2.487410068511963 + }, + { + "auxiliary_loss_clip": 0.06418615, + "auxiliary_loss_mlp": 0.01266577, + "balance_loss_clip": 0.06276032, + "balance_loss_mlp": 0.01255419, + "epoch": 0.6220952953554787, + "flos": 27495439102080.0, + "grad_norm": 1.506091245470688, + "language_loss": 0.71672869, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.79358065, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11151123, + "step": 10347, + "time_per_iteration": 2.589825391769409 + }, + { + "auxiliary_loss_clip": 0.06409717, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6221554186081467, + "flos": 19213517871360.0, + "grad_norm": 1.5983272943469429, + "language_loss": 0.7253015, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.80204135, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10144043, + "step": 10348, + "time_per_iteration": 2.497612953186035 + }, + { + "auxiliary_loss_clip": 0.06308477, + "auxiliary_loss_mlp": 0.01254968, + "balance_loss_clip": 0.06251626, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6222155418608146, + "flos": 62969744016000.0, + "grad_norm": 0.7906840461302661, + "language_loss": 0.54113448, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.61676896, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01483154, + "step": 10349, + "time_per_iteration": 3.123992681503296 + }, + { + "auxiliary_loss_clip": 0.06409817, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269394, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6222756651134826, + "flos": 22607623681920.0, + "grad_norm": 1.7328717856317462, + "language_loss": 0.69908136, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.77584934, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10894775, + "step": 10350, + "time_per_iteration": 2.5098471641540527 + }, + { + "auxiliary_loss_clip": 0.0641721, + "auxiliary_loss_mlp": 0.01269342, + "balance_loss_clip": 0.06275678, + "balance_loss_mlp": 0.01257946, + "epoch": 0.6223357883661506, + "flos": 21148602462720.0, + "grad_norm": 1.8273350624055802, + "language_loss": 0.57737762, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.65424317, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11395264, + "step": 10351, + "time_per_iteration": 2.508089780807495 + }, + { + "auxiliary_loss_clip": 0.06308511, + "auxiliary_loss_mlp": 0.01256508, + "balance_loss_clip": 0.06251398, + "balance_loss_mlp": 0.01254946, + "epoch": 0.6223959116188186, + "flos": 63785926310400.0, + "grad_norm": 0.780725998939495, + "language_loss": 0.61087048, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.6865207, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01560974, + "step": 10352, + "time_per_iteration": 3.1217076778411865 + }, + { + "auxiliary_loss_clip": 0.06412127, + "auxiliary_loss_mlp": 0.0126301, + "balance_loss_clip": 0.06274231, + "balance_loss_mlp": 0.01252866, + "epoch": 0.6224560348714866, + "flos": 22572432167040.0, + "grad_norm": 2.017492088511814, + "language_loss": 0.82234097, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.89909232, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10144043, + "step": 10353, + "time_per_iteration": 2.527926445007324 + }, + { + "auxiliary_loss_clip": 0.06406288, + "auxiliary_loss_mlp": 0.01271685, + "balance_loss_clip": 0.06271318, + "balance_loss_mlp": 0.01261474, + "epoch": 0.6225161581241545, + "flos": 24104645527680.0, + "grad_norm": 1.3564318500578532, + "language_loss": 0.75680768, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.83358729, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 10354, + "time_per_iteration": 2.577965021133423 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.0126369, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253659, + "epoch": 0.6225762813768225, + "flos": 20448853816320.0, + "grad_norm": 1.3905640818253433, + "language_loss": 0.7869665, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.8637228, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1003418, + "step": 10355, + "time_per_iteration": 2.520951986312866 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01267836, + "balance_loss_clip": 0.06272769, + "balance_loss_mlp": 0.01257757, + "epoch": 0.6226364046294904, + "flos": 27205395793920.0, + "grad_norm": 1.8039879302815294, + "language_loss": 0.78103602, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.85785455, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10083008, + "step": 10356, + "time_per_iteration": 2.595402956008911 + }, + { + "auxiliary_loss_clip": 0.06422309, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01257307, + "epoch": 0.6226965278821585, + "flos": 20451495219840.0, + "grad_norm": 2.2679706310330037, + "language_loss": 0.67886806, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.75578707, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1229248, + "step": 10357, + "time_per_iteration": 2.5113070011138916 + }, + { + "auxiliary_loss_clip": 0.06419406, + "auxiliary_loss_mlp": 0.01267785, + "balance_loss_clip": 0.06272604, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6227566511348264, + "flos": 22169099738880.0, + "grad_norm": 2.9791987901041788, + "language_loss": 0.76851863, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.84539044, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11999512, + "step": 10358, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266377, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01255922, + "epoch": 0.6228167743874944, + "flos": 18046720166400.0, + "grad_norm": 1.8844002351613314, + "language_loss": 0.82833385, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.9051615, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10455322, + "step": 10359, + "time_per_iteration": 3.914476156234741 + }, + { + "auxiliary_loss_clip": 0.06410404, + "auxiliary_loss_mlp": 0.01263862, + "balance_loss_clip": 0.06273699, + "balance_loss_mlp": 0.01253353, + "epoch": 0.6228768976401623, + "flos": 17747620617600.0, + "grad_norm": 2.053797228905972, + "language_loss": 0.73535556, + "learning_rate": 1.315248145768822e-06, + "loss": 0.81209821, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10510254, + "step": 10360, + "time_per_iteration": 2.476815700531006 + }, + { + "auxiliary_loss_clip": 0.06415363, + "auxiliary_loss_mlp": 0.01268466, + "balance_loss_clip": 0.06274994, + "balance_loss_mlp": 0.01257999, + "epoch": 0.6229370208928303, + "flos": 17900755153920.0, + "grad_norm": 2.156230361739645, + "language_loss": 0.77647728, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.85331559, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10461426, + "step": 10361, + "time_per_iteration": 2.4798471927642822 + }, + { + "auxiliary_loss_clip": 0.06413896, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.0627467, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6229971441454982, + "flos": 17353512138240.0, + "grad_norm": 1.5462012893965447, + "language_loss": 0.68078434, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.7575798, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 10362, + "time_per_iteration": 2.5225536823272705 + }, + { + "auxiliary_loss_clip": 0.06412376, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06272472, + "balance_loss_mlp": 0.0125466, + "epoch": 0.6230572673981662, + "flos": 29248989822720.0, + "grad_norm": 1.9753113738567412, + "language_loss": 0.67607152, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.75285697, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11505127, + "step": 10363, + "time_per_iteration": 2.5485036373138428 + }, + { + "auxiliary_loss_clip": 0.06417742, + "auxiliary_loss_mlp": 0.01267367, + "balance_loss_clip": 0.06273825, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6231173906508342, + "flos": 16331505488640.0, + "grad_norm": 1.8348569408777065, + "language_loss": 0.86522818, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.94207931, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11071777, + "step": 10364, + "time_per_iteration": 2.510781764984131 + }, + { + "auxiliary_loss_clip": 0.06305057, + "auxiliary_loss_mlp": 0.01252144, + "balance_loss_clip": 0.06248282, + "balance_loss_mlp": 0.01250801, + "epoch": 0.6231775139035022, + "flos": 68719513587840.0, + "grad_norm": 0.8659025027753965, + "language_loss": 0.60801929, + "learning_rate": 1.313418851605015e-06, + "loss": 0.68359125, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01345062, + "step": 10365, + "time_per_iteration": 3.1263084411621094 + }, + { + "auxiliary_loss_clip": 0.06424095, + "auxiliary_loss_mlp": 0.0127084, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01257948, + "epoch": 0.6232376371561702, + "flos": 19825903036800.0, + "grad_norm": 1.776687810821879, + "language_loss": 0.75874949, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.83569884, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.12884521, + "step": 10366, + "time_per_iteration": 2.522902488708496 + }, + { + "auxiliary_loss_clip": 0.06416557, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.01257372, + "epoch": 0.6232977604088381, + "flos": 23264969362560.0, + "grad_norm": 1.9573356945915528, + "language_loss": 0.77186829, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.84871918, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11157227, + "step": 10367, + "time_per_iteration": 2.538060426712036 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01268566, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.0125841, + "epoch": 0.6233578836615061, + "flos": 21112907823360.0, + "grad_norm": 1.357507759578204, + "language_loss": 0.78851044, + "learning_rate": 1.312321587418457e-06, + "loss": 0.86528963, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10162354, + "step": 10368, + "time_per_iteration": 2.525911569595337 + }, + { + "auxiliary_loss_clip": 0.06415667, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.0125693, + "epoch": 0.623418006914174, + "flos": 23776266176640.0, + "grad_norm": 1.7380644464591393, + "language_loss": 0.69022548, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.76706004, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10864258, + "step": 10369, + "time_per_iteration": 3.9844348430633545 + }, + { + "auxiliary_loss_clip": 0.06414494, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272612, + "balance_loss_mlp": 0.01253861, + "epoch": 0.6234781301668421, + "flos": 17895556200960.0, + "grad_norm": 1.8898374142824015, + "language_loss": 0.88083899, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.95763862, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1161499, + "step": 10370, + "time_per_iteration": 2.4602532386779785 + }, + { + "auxiliary_loss_clip": 0.06409945, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06271629, + "balance_loss_mlp": 0.01256217, + "epoch": 0.62353825341951, + "flos": 26182424822400.0, + "grad_norm": 1.435666838781933, + "language_loss": 0.66256654, + "learning_rate": 1.311224557923402e-06, + "loss": 0.73933315, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 10371, + "time_per_iteration": 2.585590124130249 + }, + { + "auxiliary_loss_clip": 0.06403823, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.06271943, + "balance_loss_mlp": 0.01254474, + "epoch": 0.623598376672178, + "flos": 31148044358400.0, + "grad_norm": 3.7034450225790962, + "language_loss": 0.77720612, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.85388303, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09405518, + "step": 10372, + "time_per_iteration": 4.1913182735443115 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01254011, + "epoch": 0.6236584999248459, + "flos": 23736588468480.0, + "grad_norm": 1.6658386756111663, + "language_loss": 0.78006816, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.85686696, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11553955, + "step": 10373, + "time_per_iteration": 2.5229697227478027 + }, + { + "auxiliary_loss_clip": 0.06407828, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06271695, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6237186231775139, + "flos": 21769289182080.0, + "grad_norm": 1.5443019053614775, + "language_loss": 0.69842112, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.77516615, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10174561, + "step": 10374, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.06416135, + "auxiliary_loss_mlp": 0.01266815, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256325, + "epoch": 0.6237787464301818, + "flos": 14944795943040.0, + "grad_norm": 1.644641658888945, + "language_loss": 0.77371937, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.85054886, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10491943, + "step": 10375, + "time_per_iteration": 2.4894163608551025 + }, + { + "auxiliary_loss_clip": 0.06411552, + "auxiliary_loss_mlp": 0.01264147, + "balance_loss_clip": 0.06274613, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6238388696828499, + "flos": 35599054844160.0, + "grad_norm": 1.2901779302370762, + "language_loss": 0.70425236, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.78100938, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09753418, + "step": 10376, + "time_per_iteration": 2.6778111457824707 + }, + { + "auxiliary_loss_clip": 0.06417015, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.0125405, + "epoch": 0.6238989929355178, + "flos": 23630343091200.0, + "grad_norm": 1.5935175737828453, + "language_loss": 0.76607609, + "learning_rate": 1.309031204505301e-06, + "loss": 0.84289968, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11315918, + "step": 10377, + "time_per_iteration": 4.115941524505615 + }, + { + "auxiliary_loss_clip": 0.06413018, + "auxiliary_loss_mlp": 0.01268384, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01258442, + "epoch": 0.6239591161881858, + "flos": 22093433902080.0, + "grad_norm": 1.8691726356193223, + "language_loss": 0.67910546, + "learning_rate": 1.308665737227052e-06, + "loss": 0.75591946, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09942627, + "step": 10378, + "time_per_iteration": 2.5460588932037354 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01265408, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01254572, + "epoch": 0.6240192394408538, + "flos": 24542959835520.0, + "grad_norm": 1.7661801800879762, + "language_loss": 0.7668879, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.84367645, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1083374, + "step": 10379, + "time_per_iteration": 2.594383955001831 + }, + { + "auxiliary_loss_clip": 0.06411, + "auxiliary_loss_mlp": 0.01266487, + "balance_loss_clip": 0.0627025, + "balance_loss_mlp": 0.01255723, + "epoch": 0.6240793626935217, + "flos": 27940000538880.0, + "grad_norm": 1.331820718073444, + "language_loss": 0.79390121, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.87067604, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10766602, + "step": 10380, + "time_per_iteration": 2.593872308731079 + }, + { + "auxiliary_loss_clip": 0.06410354, + "auxiliary_loss_mlp": 0.01264738, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6241394859461897, + "flos": 22899008655360.0, + "grad_norm": 1.5236398593874663, + "language_loss": 0.8010897, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.87784058, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1026001, + "step": 10381, + "time_per_iteration": 2.640678882598877 + }, + { + "auxiliary_loss_clip": 0.06414736, + "auxiliary_loss_mlp": 0.01267898, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01257079, + "epoch": 0.6241996091988576, + "flos": 12755781953280.0, + "grad_norm": 1.9060003648467456, + "language_loss": 0.74558902, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.82241541, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10821533, + "step": 10382, + "time_per_iteration": 2.479747772216797 + }, + { + "auxiliary_loss_clip": 0.06410253, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06273764, + "balance_loss_mlp": 0.01258243, + "epoch": 0.6242597324515257, + "flos": 25858867080960.0, + "grad_norm": 1.410036242187738, + "language_loss": 0.78590852, + "learning_rate": 1.306838794344911e-06, + "loss": 0.8626911, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09771729, + "step": 10383, + "time_per_iteration": 2.598404884338379 + }, + { + "auxiliary_loss_clip": 0.06411845, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 0.06273676, + "balance_loss_mlp": 0.01254236, + "epoch": 0.6243198557041936, + "flos": 19943804131200.0, + "grad_norm": 1.7487914543970622, + "language_loss": 0.75636935, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.83312905, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09899902, + "step": 10384, + "time_per_iteration": 2.493638038635254 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06271704, + "balance_loss_mlp": 0.01254353, + "epoch": 0.6243799789568616, + "flos": 18412177749120.0, + "grad_norm": 2.229109392374204, + "language_loss": 0.66725862, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.74403983, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11444092, + "step": 10385, + "time_per_iteration": 2.5185563564300537 + }, + { + "auxiliary_loss_clip": 0.06304897, + "auxiliary_loss_mlp": 0.01254771, + "balance_loss_clip": 0.06248314, + "balance_loss_mlp": 0.01253304, + "epoch": 0.6244401022095295, + "flos": 66048887128320.0, + "grad_norm": 0.7408334865403556, + "language_loss": 0.61911088, + "learning_rate": 1.305742943921692e-06, + "loss": 0.69470763, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01464844, + "step": 10386, + "time_per_iteration": 3.1636085510253906 + }, + { + "auxiliary_loss_clip": 0.06412023, + "auxiliary_loss_mlp": 0.01269919, + "balance_loss_clip": 0.06271843, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6245002254621975, + "flos": 24578109423360.0, + "grad_norm": 2.35418101440168, + "language_loss": 0.71798837, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.79480779, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10387, + "time_per_iteration": 2.5554144382476807 + }, + { + "auxiliary_loss_clip": 0.06417753, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01253399, + "epoch": 0.6245603487148654, + "flos": 29176510440960.0, + "grad_norm": 2.0504228233869886, + "language_loss": 0.65577459, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.73261279, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12664795, + "step": 10388, + "time_per_iteration": 2.5694010257720947 + }, + { + "auxiliary_loss_clip": 0.0641178, + "auxiliary_loss_mlp": 0.0126472, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01255505, + "epoch": 0.6246204719675335, + "flos": 14794805934720.0, + "grad_norm": 1.572723869665335, + "language_loss": 0.79661775, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.87338269, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09216309, + "step": 10389, + "time_per_iteration": 2.497745990753174 + }, + { + "auxiliary_loss_clip": 0.06407995, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06270336, + "balance_loss_mlp": 0.01253928, + "epoch": 0.6246805952202014, + "flos": 12498204902400.0, + "grad_norm": 2.3002980745210384, + "language_loss": 0.60729766, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.68401337, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09643555, + "step": 10390, + "time_per_iteration": 2.47084379196167 + }, + { + "auxiliary_loss_clip": 0.06418662, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06275147, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6247407184728694, + "flos": 12791602373760.0, + "grad_norm": 1.9019889358611486, + "language_loss": 0.77116674, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.84801072, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11181641, + "step": 10391, + "time_per_iteration": 2.5408506393432617 + }, + { + "auxiliary_loss_clip": 0.06416374, + "auxiliary_loss_mlp": 0.0126612, + "balance_loss_clip": 0.06274267, + "balance_loss_mlp": 0.01255165, + "epoch": 0.6248008417255374, + "flos": 40639417822080.0, + "grad_norm": 1.6390307551388046, + "language_loss": 0.64875287, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.72557783, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10943604, + "step": 10392, + "time_per_iteration": 2.7098827362060547 + }, + { + "auxiliary_loss_clip": 0.06416553, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6248609649782053, + "flos": 19908235272960.0, + "grad_norm": 1.9113748677122278, + "language_loss": 0.76920122, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.84603459, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11169434, + "step": 10393, + "time_per_iteration": 2.548680543899536 + }, + { + "auxiliary_loss_clip": 0.0641488, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06272462, + "balance_loss_mlp": 0.01255971, + "epoch": 0.6249210882308733, + "flos": 19688868483840.0, + "grad_norm": 1.752087282406205, + "language_loss": 0.82699966, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.90383279, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12451172, + "step": 10394, + "time_per_iteration": 2.5310568809509277 + }, + { + "auxiliary_loss_clip": 0.064147, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01254721, + "epoch": 0.6249812114835412, + "flos": 13995855653760.0, + "grad_norm": 1.7190801919243177, + "language_loss": 0.75490797, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.83170998, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10784912, + "step": 10395, + "time_per_iteration": 2.5296716690063477 + }, + { + "auxiliary_loss_clip": 0.06417533, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06273706, + "balance_loss_mlp": 0.01256536, + "epoch": 0.6250413347362093, + "flos": 14533916647680.0, + "grad_norm": 2.451423836023636, + "language_loss": 0.73157996, + "learning_rate": 1.302091822487119e-06, + "loss": 0.80843133, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11065674, + "step": 10396, + "time_per_iteration": 2.5183842182159424 + }, + { + "auxiliary_loss_clip": 0.06411869, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.06272602, + "balance_loss_mlp": 0.01255936, + "epoch": 0.6251014579888772, + "flos": 22969098195840.0, + "grad_norm": 1.6502966804998584, + "language_loss": 0.76563799, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.84241736, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10131836, + "step": 10397, + "time_per_iteration": 2.5712759494781494 + }, + { + "auxiliary_loss_clip": 0.06415206, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06274014, + "balance_loss_mlp": 0.0125718, + "epoch": 0.6251615812415452, + "flos": 28118809152000.0, + "grad_norm": 1.853529789472771, + "language_loss": 0.75433117, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.83116138, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10638428, + "step": 10398, + "time_per_iteration": 4.095698595046997 + }, + { + "auxiliary_loss_clip": 0.0641809, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.0627377, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6252217044942131, + "flos": 26731764190080.0, + "grad_norm": 1.615458357588448, + "language_loss": 0.74413693, + "learning_rate": 1.300997001489483e-06, + "loss": 0.82099664, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11419678, + "step": 10399, + "time_per_iteration": 2.5753824710845947 + }, + { + "auxiliary_loss_clip": 0.06412279, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01256287, + "epoch": 0.6252818277468811, + "flos": 20012216590080.0, + "grad_norm": 1.6187380573242784, + "language_loss": 0.74690026, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.82368767, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10180664, + "step": 10400, + "time_per_iteration": 2.5361061096191406 + }, + { + "auxiliary_loss_clip": 0.06307141, + "auxiliary_loss_mlp": 0.01253939, + "balance_loss_clip": 0.06249951, + "balance_loss_mlp": 0.01252542, + "epoch": 0.625341950999549, + "flos": 59298550352640.0, + "grad_norm": 0.8247682302462489, + "language_loss": 0.56403446, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.63964522, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.57373047, + "router_z_loss_mlp": 0.01399231, + "step": 10401, + "time_per_iteration": 3.2024521827697754 + }, + { + "auxiliary_loss_clip": 0.06411454, + "auxiliary_loss_mlp": 0.01264191, + "balance_loss_clip": 0.06270526, + "balance_loss_mlp": 0.01253135, + "epoch": 0.625402074252217, + "flos": 20163296701440.0, + "grad_norm": 1.9270860159318792, + "language_loss": 0.82986021, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.90661669, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1105957, + "step": 10402, + "time_per_iteration": 2.5365302562713623 + }, + { + "auxiliary_loss_clip": 0.06408338, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06270024, + "balance_loss_mlp": 0.0125751, + "epoch": 0.625462197504885, + "flos": 29140228823040.0, + "grad_norm": 1.8928346901761637, + "language_loss": 0.68982589, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.76659, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10565186, + "step": 10403, + "time_per_iteration": 2.582432985305786 + }, + { + "auxiliary_loss_clip": 0.06414935, + "auxiliary_loss_mlp": 0.01268099, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01255791, + "epoch": 0.625522320757553, + "flos": 26111664449280.0, + "grad_norm": 1.458072120324879, + "language_loss": 0.7191205, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.79595077, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.12310791, + "step": 10404, + "time_per_iteration": 2.561168909072876 + }, + { + "auxiliary_loss_clip": 0.06414899, + "auxiliary_loss_mlp": 0.01265432, + "balance_loss_clip": 0.06274525, + "balance_loss_mlp": 0.01254376, + "epoch": 0.625582444010221, + "flos": 20637179867520.0, + "grad_norm": 1.708836006791191, + "language_loss": 0.69769311, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.77449644, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11053467, + "step": 10405, + "time_per_iteration": 2.5165655612945557 + }, + { + "auxiliary_loss_clip": 0.06413669, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06275192, + "balance_loss_mlp": 0.01257332, + "epoch": 0.6256425672628889, + "flos": 20527706108160.0, + "grad_norm": 1.5616382463324912, + "language_loss": 0.79137939, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.86819649, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10693359, + "step": 10406, + "time_per_iteration": 2.526115894317627 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01268249, + "balance_loss_clip": 0.06273726, + "balance_loss_mlp": 0.01257598, + "epoch": 0.6257026905155569, + "flos": 29536182092160.0, + "grad_norm": 1.7875701803121953, + "language_loss": 0.69265002, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.76947975, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10650635, + "step": 10407, + "time_per_iteration": 2.58450984954834 + }, + { + "auxiliary_loss_clip": 0.06403035, + "auxiliary_loss_mlp": 0.01268168, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6257628137682248, + "flos": 24031788802560.0, + "grad_norm": 1.594681235705685, + "language_loss": 0.85355765, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.93026972, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10272217, + "step": 10408, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6258229370208929, + "flos": 20857385197440.0, + "grad_norm": 1.6518363285256767, + "language_loss": 0.7993108, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.87608778, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09997559, + "step": 10409, + "time_per_iteration": 4.006382465362549 + }, + { + "auxiliary_loss_clip": 0.06408045, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.0125475, + "epoch": 0.6258830602735608, + "flos": 22237218708480.0, + "grad_norm": 2.026280584027718, + "language_loss": 0.6951521, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.77188593, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10601807, + "step": 10410, + "time_per_iteration": 2.4960851669311523 + }, + { + "auxiliary_loss_clip": 0.06406428, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01256712, + "epoch": 0.6259431835262288, + "flos": 25082949473280.0, + "grad_norm": 1.7089284959721278, + "language_loss": 0.68380713, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.76053059, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09210205, + "step": 10411, + "time_per_iteration": 2.555173397064209 + }, + { + "auxiliary_loss_clip": 0.06413864, + "auxiliary_loss_mlp": 0.01267605, + "balance_loss_clip": 0.06273196, + "balance_loss_mlp": 0.01256489, + "epoch": 0.6260033067788967, + "flos": 28259072087040.0, + "grad_norm": 1.650436219337463, + "language_loss": 0.70024323, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.77705795, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11114502, + "step": 10412, + "time_per_iteration": 4.113879919052124 + }, + { + "auxiliary_loss_clip": 0.06406923, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01256152, + "epoch": 0.6260634300315647, + "flos": 23374107705600.0, + "grad_norm": 1.4649345950741752, + "language_loss": 0.69805682, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.77478617, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09869385, + "step": 10413, + "time_per_iteration": 2.519340753555298 + }, + { + "auxiliary_loss_clip": 0.06415603, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06269616, + "balance_loss_mlp": 0.01254101, + "epoch": 0.6261235532842326, + "flos": 18040221475200.0, + "grad_norm": 2.973303633857383, + "language_loss": 0.81012505, + "learning_rate": 1.295526482316796e-06, + "loss": 0.88694084, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11877441, + "step": 10414, + "time_per_iteration": 2.5359139442443848 + }, + { + "auxiliary_loss_clip": 0.06411665, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.012545, + "epoch": 0.6261836765369007, + "flos": 22016677962240.0, + "grad_norm": 1.921958755127535, + "language_loss": 0.74850363, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.82527107, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10577393, + "step": 10415, + "time_per_iteration": 2.529327630996704 + }, + { + "auxiliary_loss_clip": 0.06409019, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06273155, + "balance_loss_mlp": 0.01256993, + "epoch": 0.6262437997895686, + "flos": 24942896173440.0, + "grad_norm": 1.4283741323498855, + "language_loss": 0.74384236, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.82060367, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10119629, + "step": 10416, + "time_per_iteration": 2.626948595046997 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06271897, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6263039230422366, + "flos": 31615680395520.0, + "grad_norm": 1.6046151983772523, + "language_loss": 0.84637046, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.92307079, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09814453, + "step": 10417, + "time_per_iteration": 4.062727689743042 + }, + { + "auxiliary_loss_clip": 0.06414269, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_clip": 0.0627402, + "balance_loss_mlp": 0.01253713, + "epoch": 0.6263640462949046, + "flos": 17645232528000.0, + "grad_norm": 2.126036841621572, + "language_loss": 0.57267582, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.6494593, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10357666, + "step": 10418, + "time_per_iteration": 2.5384292602539062 + }, + { + "auxiliary_loss_clip": 0.06423989, + "auxiliary_loss_mlp": 0.01267395, + "balance_loss_clip": 0.06278068, + "balance_loss_mlp": 0.01255629, + "epoch": 0.6264241695475725, + "flos": 19981175852160.0, + "grad_norm": 2.5601033776039688, + "language_loss": 0.85281551, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.92972934, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 10419, + "time_per_iteration": 2.6254498958587646 + }, + { + "auxiliary_loss_clip": 0.0641915, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06276678, + "balance_loss_mlp": 0.01258644, + "epoch": 0.6264842928002405, + "flos": 27351654295680.0, + "grad_norm": 1.7349665783281947, + "language_loss": 0.64790374, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.72479212, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1104126, + "step": 10420, + "time_per_iteration": 2.6838459968566895 + }, + { + "auxiliary_loss_clip": 0.06413981, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.0627203, + "balance_loss_mlp": 0.01257056, + "epoch": 0.6265444160529084, + "flos": 23002989972480.0, + "grad_norm": 1.7751280230906503, + "language_loss": 0.85910356, + "learning_rate": 1.292975627485741e-06, + "loss": 0.93592852, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11462402, + "step": 10421, + "time_per_iteration": 2.502638101577759 + }, + { + "auxiliary_loss_clip": 0.06412976, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06274194, + "balance_loss_mlp": 0.01255454, + "epoch": 0.6266045393055765, + "flos": 19944516890880.0, + "grad_norm": 1.9594550321950581, + "language_loss": 0.79719132, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.87397969, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10406494, + "step": 10422, + "time_per_iteration": 2.506927013397217 + }, + { + "auxiliary_loss_clip": 0.06411508, + "auxiliary_loss_mlp": 0.01266347, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255845, + "epoch": 0.6266646625582444, + "flos": 24395946647040.0, + "grad_norm": 1.5344190640547188, + "language_loss": 0.74784446, + "learning_rate": 1.292247052906389e-06, + "loss": 0.82462305, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10510254, + "step": 10423, + "time_per_iteration": 2.5245227813720703 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01256186, + "epoch": 0.6267247858109124, + "flos": 14689021754880.0, + "grad_norm": 2.220018745384266, + "language_loss": 0.77700025, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.85382849, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10443115, + "step": 10424, + "time_per_iteration": 2.477313756942749 + }, + { + "auxiliary_loss_clip": 0.06416199, + "auxiliary_loss_mlp": 0.0126622, + "balance_loss_clip": 0.06277827, + "balance_loss_mlp": 0.01255139, + "epoch": 0.6267849090635803, + "flos": 24935852430720.0, + "grad_norm": 1.661217463389483, + "language_loss": 0.69195008, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.76877427, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11077881, + "step": 10425, + "time_per_iteration": 2.543240547180176 + }, + { + "auxiliary_loss_clip": 0.06407383, + "auxiliary_loss_mlp": 0.01264995, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.01255232, + "epoch": 0.6268450323162483, + "flos": 25344886936320.0, + "grad_norm": 1.5301783551006911, + "language_loss": 0.74874127, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.82546508, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09765625, + "step": 10426, + "time_per_iteration": 2.541133403778076 + }, + { + "auxiliary_loss_clip": 0.06415579, + "auxiliary_loss_mlp": 0.01266633, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01256268, + "epoch": 0.6269051555689162, + "flos": 26184521174400.0, + "grad_norm": 1.3173967967859561, + "language_loss": 0.80809879, + "learning_rate": 1.290790225914929e-06, + "loss": 0.88492095, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10369873, + "step": 10427, + "time_per_iteration": 2.582977294921875 + }, + { + "auxiliary_loss_clip": 0.06420124, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6269652788215843, + "flos": 18262271594880.0, + "grad_norm": 2.288264071636072, + "language_loss": 0.68539417, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.76226991, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10821533, + "step": 10428, + "time_per_iteration": 2.470303773880005 + }, + { + "auxiliary_loss_clip": 0.06415083, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06275322, + "balance_loss_mlp": 0.01255156, + "epoch": 0.6270254020742522, + "flos": 11770224629760.0, + "grad_norm": 1.7672728863863079, + "language_loss": 0.71438128, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.79118955, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 10429, + "time_per_iteration": 2.4885928630828857 + }, + { + "auxiliary_loss_clip": 0.0641719, + "auxiliary_loss_mlp": 0.01266586, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01254665, + "epoch": 0.6270855253269202, + "flos": 23482114018560.0, + "grad_norm": 1.4192780160361307, + "language_loss": 0.80064285, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.87748063, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11920166, + "step": 10430, + "time_per_iteration": 2.695157766342163 + }, + { + "auxiliary_loss_clip": 0.06316154, + "auxiliary_loss_mlp": 0.01261761, + "balance_loss_clip": 0.0625899, + "balance_loss_mlp": 0.01260201, + "epoch": 0.6271456485795882, + "flos": 70084322490240.0, + "grad_norm": 0.7576452894497838, + "language_loss": 0.59208155, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.66786075, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01556396, + "step": 10431, + "time_per_iteration": 3.2964041233062744 + }, + { + "auxiliary_loss_clip": 0.06312843, + "auxiliary_loss_mlp": 0.01258809, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01257378, + "epoch": 0.6272057718322561, + "flos": 65178673349760.0, + "grad_norm": 0.9858891279415538, + "language_loss": 0.63665617, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.71237266, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01428223, + "step": 10432, + "time_per_iteration": 3.2280328273773193 + }, + { + "auxiliary_loss_clip": 0.06412185, + "auxiliary_loss_mlp": 0.01266828, + "balance_loss_clip": 0.06274938, + "balance_loss_mlp": 0.01256952, + "epoch": 0.6272658950849241, + "flos": 24396240136320.0, + "grad_norm": 1.6010176873941773, + "language_loss": 0.65241134, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.72920156, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09875488, + "step": 10433, + "time_per_iteration": 2.6001501083374023 + }, + { + "auxiliary_loss_clip": 0.06421921, + "auxiliary_loss_mlp": 0.01264381, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01252985, + "epoch": 0.627326018337592, + "flos": 17971515527040.0, + "grad_norm": 2.0859900141473897, + "language_loss": 0.62490857, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.70177162, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1138916, + "step": 10434, + "time_per_iteration": 2.4881582260131836 + }, + { + "auxiliary_loss_clip": 0.06417267, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01255363, + "epoch": 0.6273861415902601, + "flos": 20236321134720.0, + "grad_norm": 1.4988303322096788, + "language_loss": 0.84577382, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.92260414, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10394287, + "step": 10435, + "time_per_iteration": 2.508821487426758 + }, + { + "auxiliary_loss_clip": 0.06310409, + "auxiliary_loss_mlp": 0.01254017, + "balance_loss_clip": 0.06253147, + "balance_loss_mlp": 0.0125247, + "epoch": 0.627446264842928, + "flos": 64971605911680.0, + "grad_norm": 0.7140995203776986, + "language_loss": 0.6143651, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.69000936, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.01544952, + "step": 10436, + "time_per_iteration": 3.1841728687286377 + }, + { + "auxiliary_loss_clip": 0.06415884, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.06275365, + "balance_loss_mlp": 0.01259635, + "epoch": 0.627506388095596, + "flos": 23590623456000.0, + "grad_norm": 1.4165717499809394, + "language_loss": 0.77800572, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.8548739, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11303711, + "step": 10437, + "time_per_iteration": 2.5377817153930664 + }, + { + "auxiliary_loss_clip": 0.06309696, + "auxiliary_loss_mlp": 0.01252859, + "balance_loss_clip": 0.06252521, + "balance_loss_mlp": 0.01251612, + "epoch": 0.6275665113482639, + "flos": 67603043059200.0, + "grad_norm": 0.7073778525823976, + "language_loss": 0.54094195, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.61656755, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.57226562, + "router_z_loss_mlp": 0.01247406, + "step": 10438, + "time_per_iteration": 4.560008764266968 + }, + { + "auxiliary_loss_clip": 0.06412268, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06273399, + "balance_loss_mlp": 0.01255569, + "epoch": 0.6276266346009319, + "flos": 27644422861440.0, + "grad_norm": 1.692810124153385, + "language_loss": 0.84027016, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.91705996, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11138916, + "step": 10439, + "time_per_iteration": 2.5736849308013916 + }, + { + "auxiliary_loss_clip": 0.06415922, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6276867578535998, + "flos": 22752540518400.0, + "grad_norm": 2.0302945438571047, + "language_loss": 0.80827779, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.88509905, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 10440, + "time_per_iteration": 2.5353291034698486 + }, + { + "auxiliary_loss_clip": 0.06405526, + "auxiliary_loss_mlp": 0.01265635, + "balance_loss_clip": 0.0627224, + "balance_loss_mlp": 0.01256241, + "epoch": 0.6277468811062679, + "flos": 24651050002560.0, + "grad_norm": 1.4466963642107937, + "language_loss": 0.74692273, + "learning_rate": 1.285694725799337e-06, + "loss": 0.82363433, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09393311, + "step": 10441, + "time_per_iteration": 2.5965688228607178 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01267405, + "balance_loss_clip": 0.06272199, + "balance_loss_mlp": 0.01256932, + "epoch": 0.6278070043589358, + "flos": 19684466144640.0, + "grad_norm": 1.738690700547975, + "language_loss": 0.72243971, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.79921579, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 10442, + "time_per_iteration": 2.5236124992370605 + }, + { + "auxiliary_loss_clip": 0.06413672, + "auxiliary_loss_mlp": 0.01264225, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254443, + "epoch": 0.6278671276116038, + "flos": 22127451459840.0, + "grad_norm": 1.5746919411428797, + "language_loss": 0.71842909, + "learning_rate": 1.284967229712762e-06, + "loss": 0.7952081, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09783936, + "step": 10443, + "time_per_iteration": 2.523799419403076 + }, + { + "auxiliary_loss_clip": 0.06411857, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272158, + "balance_loss_mlp": 0.01255412, + "epoch": 0.6279272508642717, + "flos": 23045099448960.0, + "grad_norm": 2.0032164077839787, + "language_loss": 0.73292875, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.80970454, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10302734, + "step": 10444, + "time_per_iteration": 2.557166337966919 + }, + { + "auxiliary_loss_clip": 0.06410734, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.0627318, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6279873741169397, + "flos": 19829466835200.0, + "grad_norm": 2.156521717901959, + "language_loss": 0.72276205, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.79953271, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 10445, + "time_per_iteration": 2.526127815246582 + }, + { + "auxiliary_loss_clip": 0.06412753, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01254601, + "epoch": 0.6280474973696077, + "flos": 23922273116160.0, + "grad_norm": 1.5888677783518865, + "language_loss": 0.69281161, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.76959556, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1104126, + "step": 10446, + "time_per_iteration": 2.530104637145996 + }, + { + "auxiliary_loss_clip": 0.06423883, + "auxiliary_loss_mlp": 0.01267771, + "balance_loss_clip": 0.06276697, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6281076206222757, + "flos": 17973821514240.0, + "grad_norm": 1.8539120492479848, + "language_loss": 0.73894954, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.81586611, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1114502, + "step": 10447, + "time_per_iteration": 2.4985270500183105 + }, + { + "auxiliary_loss_clip": 0.06304939, + "auxiliary_loss_mlp": 0.01257491, + "balance_loss_clip": 0.06248139, + "balance_loss_mlp": 0.01256266, + "epoch": 0.6281677438749437, + "flos": 66797216743680.0, + "grad_norm": 0.6871055611916008, + "language_loss": 0.51990867, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.59553301, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01223755, + "step": 10448, + "time_per_iteration": 4.437039136886597 + }, + { + "auxiliary_loss_clip": 0.0641938, + "auxiliary_loss_mlp": 0.01268052, + "balance_loss_clip": 0.0627671, + "balance_loss_mlp": 0.01256346, + "epoch": 0.6282278671276116, + "flos": 11661002432640.0, + "grad_norm": 1.9501627229016425, + "language_loss": 0.91483194, + "learning_rate": 1.282785392633079e-06, + "loss": 0.99170625, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1171875, + "step": 10449, + "time_per_iteration": 2.5085034370422363 + }, + { + "auxiliary_loss_clip": 0.06415906, + "auxiliary_loss_mlp": 0.01270346, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01260452, + "epoch": 0.6282879903802796, + "flos": 42751550090880.0, + "grad_norm": 1.4186227693043074, + "language_loss": 0.60281998, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.67968249, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09893799, + "step": 10450, + "time_per_iteration": 2.6810834407806396 + }, + { + "auxiliary_loss_clip": 0.06408551, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06272364, + "balance_loss_mlp": 0.01256269, + "epoch": 0.6283481136329475, + "flos": 20015067628800.0, + "grad_norm": 1.5189772221694435, + "language_loss": 0.77163285, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.8483901, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10906982, + "step": 10451, + "time_per_iteration": 2.5098116397857666 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06274851, + "balance_loss_mlp": 0.01254652, + "epoch": 0.6284082368856155, + "flos": 21910264876800.0, + "grad_norm": 1.4797334153303925, + "language_loss": 0.77516776, + "learning_rate": 1.281694841064566e-06, + "loss": 0.85199118, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11346436, + "step": 10452, + "time_per_iteration": 4.029058933258057 + }, + { + "auxiliary_loss_clip": 0.06413398, + "auxiliary_loss_mlp": 0.01268188, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01257173, + "epoch": 0.6284683601382834, + "flos": 25491313146240.0, + "grad_norm": 1.654591158178899, + "language_loss": 0.72948235, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.8062982, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11029053, + "step": 10453, + "time_per_iteration": 2.542074680328369 + }, + { + "auxiliary_loss_clip": 0.06415626, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.0627359, + "balance_loss_mlp": 0.01253268, + "epoch": 0.6285284833909515, + "flos": 16543241556480.0, + "grad_norm": 1.6231177337896328, + "language_loss": 0.80777168, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.88457304, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11236572, + "step": 10454, + "time_per_iteration": 2.5263936519622803 + }, + { + "auxiliary_loss_clip": 0.06409679, + "auxiliary_loss_mlp": 0.0127067, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01260728, + "epoch": 0.6285886066436194, + "flos": 22827367814400.0, + "grad_norm": 1.7338027562142968, + "language_loss": 0.82249027, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.89929378, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09936523, + "step": 10455, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.06415103, + "auxiliary_loss_mlp": 0.01264745, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254225, + "epoch": 0.6286487298962874, + "flos": 24722355427200.0, + "grad_norm": 1.4932136487879293, + "language_loss": 0.82079554, + "learning_rate": 1.280241153705706e-06, + "loss": 0.89759403, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10516357, + "step": 10456, + "time_per_iteration": 2.561309814453125 + }, + { + "auxiliary_loss_clip": 0.06420746, + "auxiliary_loss_mlp": 0.01268645, + "balance_loss_clip": 0.06275859, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6287088531489553, + "flos": 20747114824320.0, + "grad_norm": 1.4461153744951818, + "language_loss": 0.72119695, + "learning_rate": 1.27987780006486e-06, + "loss": 0.79809082, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11022949, + "step": 10457, + "time_per_iteration": 3.957395076751709 + }, + { + "auxiliary_loss_clip": 0.06422028, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01253316, + "epoch": 0.6287689764016233, + "flos": 23076433676160.0, + "grad_norm": 1.6277999457875445, + "language_loss": 0.79939413, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.8762598, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11224365, + "step": 10458, + "time_per_iteration": 2.5144598484039307 + }, + { + "auxiliary_loss_clip": 0.06420826, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06276783, + "balance_loss_mlp": 0.01261081, + "epoch": 0.6288290996542913, + "flos": 32241859557120.0, + "grad_norm": 1.5510176438747023, + "language_loss": 0.61428088, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.69121122, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11120605, + "step": 10459, + "time_per_iteration": 2.673271894454956 + }, + { + "auxiliary_loss_clip": 0.06418507, + "auxiliary_loss_mlp": 0.01266867, + "balance_loss_clip": 0.06276773, + "balance_loss_mlp": 0.01256066, + "epoch": 0.6288892229069593, + "flos": 24647695839360.0, + "grad_norm": 1.5279768291149622, + "language_loss": 0.79008341, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.86693716, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10809326, + "step": 10460, + "time_per_iteration": 2.5390427112579346 + }, + { + "auxiliary_loss_clip": 0.06411569, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6289493461596273, + "flos": 17864138119680.0, + "grad_norm": 1.9201849344746347, + "language_loss": 0.73887581, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.81565541, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10089111, + "step": 10461, + "time_per_iteration": 2.524601459503174 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01263734, + "balance_loss_clip": 0.06275996, + "balance_loss_mlp": 0.01253637, + "epoch": 0.6290094694122952, + "flos": 22351807566720.0, + "grad_norm": 1.8529909730554852, + "language_loss": 0.70305121, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.77980262, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10101318, + "step": 10462, + "time_per_iteration": 2.5161097049713135 + }, + { + "auxiliary_loss_clip": 0.06407323, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01254224, + "epoch": 0.6290695926649632, + "flos": 28409942563200.0, + "grad_norm": 1.9398923730208482, + "language_loss": 0.72176754, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.79847741, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09442139, + "step": 10463, + "time_per_iteration": 2.579223394393921 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271723, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01261751, + "epoch": 0.6291297159176311, + "flos": 21511628277120.0, + "grad_norm": 1.539324014350412, + "language_loss": 0.7288208, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.80563188, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09967041, + "step": 10464, + "time_per_iteration": 2.494276762008667 + }, + { + "auxiliary_loss_clip": 0.06412283, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01256969, + "epoch": 0.6291898391702991, + "flos": 12208203521280.0, + "grad_norm": 1.7590102978799784, + "language_loss": 0.69385099, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.77063811, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09466553, + "step": 10465, + "time_per_iteration": 2.5754034519195557 + }, + { + "auxiliary_loss_clip": 0.06303164, + "auxiliary_loss_mlp": 0.01258656, + "balance_loss_clip": 0.06246626, + "balance_loss_mlp": 0.01257341, + "epoch": 0.629249962422967, + "flos": 69319347840000.0, + "grad_norm": 0.6721611616517246, + "language_loss": 0.59656096, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.67217922, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01316833, + "step": 10466, + "time_per_iteration": 3.231010913848877 + }, + { + "auxiliary_loss_clip": 0.0640944, + "auxiliary_loss_mlp": 0.01262544, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.01253305, + "epoch": 0.6293100856756351, + "flos": 40087353196800.0, + "grad_norm": 2.1464377164547916, + "language_loss": 0.64920712, + "learning_rate": 1.276245767820154e-06, + "loss": 0.72592694, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09234619, + "step": 10467, + "time_per_iteration": 2.7820122241973877 + }, + { + "auxiliary_loss_clip": 0.06300975, + "auxiliary_loss_mlp": 0.01258806, + "balance_loss_clip": 0.06244308, + "balance_loss_mlp": 0.01257555, + "epoch": 0.629370208928303, + "flos": 67518907960320.0, + "grad_norm": 0.7784779642706487, + "language_loss": 0.56803113, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.64362895, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01250458, + "step": 10468, + "time_per_iteration": 2.934441089630127 + }, + { + "auxiliary_loss_clip": 0.06299016, + "auxiliary_loss_mlp": 0.0125297, + "balance_loss_clip": 0.06242396, + "balance_loss_mlp": 0.01251782, + "epoch": 0.629430332180971, + "flos": 60680228653440.0, + "grad_norm": 0.7475097067157215, + "language_loss": 0.57685459, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.65237445, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01186371, + "step": 10469, + "time_per_iteration": 3.097425699234009 + }, + { + "auxiliary_loss_clip": 0.06301235, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.0624446, + "balance_loss_mlp": 0.01250373, + "epoch": 0.6294904554336389, + "flos": 66891707821440.0, + "grad_norm": 0.675756451414952, + "language_loss": 0.5208174, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.59634632, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01277924, + "step": 10470, + "time_per_iteration": 3.224271774291992 + }, + { + "auxiliary_loss_clip": 0.06409313, + "auxiliary_loss_mlp": 0.01268407, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01258322, + "epoch": 0.6295505786863069, + "flos": 42532728353280.0, + "grad_norm": 1.628220195821946, + "language_loss": 0.75025994, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.8270371, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10076904, + "step": 10471, + "time_per_iteration": 2.7104806900024414 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01253689, + "epoch": 0.629610701938975, + "flos": 17389877610240.0, + "grad_norm": 1.7371618192940372, + "language_loss": 0.63321209, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.71001846, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10003662, + "step": 10472, + "time_per_iteration": 2.51810884475708 + }, + { + "auxiliary_loss_clip": 0.06414427, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06273856, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6296708251916429, + "flos": 24249730072320.0, + "grad_norm": 1.5892163482922788, + "language_loss": 0.69503713, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.77184302, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 10473, + "time_per_iteration": 2.5234594345092773 + }, + { + "auxiliary_loss_clip": 0.06411944, + "auxiliary_loss_mlp": 0.0126239, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6297309484443109, + "flos": 19284110536320.0, + "grad_norm": 1.4968676246915393, + "language_loss": 0.74922514, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.8259685, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09899902, + "step": 10474, + "time_per_iteration": 2.581749200820923 + }, + { + "auxiliary_loss_clip": 0.06412183, + "auxiliary_loss_mlp": 0.01264808, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6297910716969788, + "flos": 30670261977600.0, + "grad_norm": 1.6340326591826166, + "language_loss": 0.66562986, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.74239981, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10253906, + "step": 10475, + "time_per_iteration": 2.6167984008789062 + }, + { + "auxiliary_loss_clip": 0.06403632, + "auxiliary_loss_mlp": 0.01261865, + "balance_loss_clip": 0.06270278, + "balance_loss_mlp": 0.01252107, + "epoch": 0.6298511949496468, + "flos": 14427293927040.0, + "grad_norm": 1.8082220709351975, + "language_loss": 0.90615106, + "learning_rate": 1.272979284940101e-06, + "loss": 0.98280615, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09759521, + "step": 10476, + "time_per_iteration": 2.5575828552246094 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01271614, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01261285, + "epoch": 0.6299113182023147, + "flos": 23520995112960.0, + "grad_norm": 1.6129960695216716, + "language_loss": 0.75463134, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.83147454, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10321045, + "step": 10477, + "time_per_iteration": 4.080779314041138 + }, + { + "auxiliary_loss_clip": 0.06409407, + "auxiliary_loss_mlp": 0.01263638, + "balance_loss_clip": 0.06271356, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6299714414549827, + "flos": 22681109312640.0, + "grad_norm": 1.9893759064975287, + "language_loss": 0.70635891, + "learning_rate": 1.272253702758138e-06, + "loss": 0.7830894, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10717773, + "step": 10478, + "time_per_iteration": 2.526340961456299 + }, + { + "auxiliary_loss_clip": 0.06415921, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01256419, + "epoch": 0.6300315647076506, + "flos": 14506984759680.0, + "grad_norm": 2.55864896023097, + "language_loss": 0.6816293, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.75846004, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10742188, + "step": 10479, + "time_per_iteration": 2.5156965255737305 + }, + { + "auxiliary_loss_clip": 0.06411125, + "auxiliary_loss_mlp": 0.01264946, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01254188, + "epoch": 0.6300916879603187, + "flos": 21878134035840.0, + "grad_norm": 1.462422599280115, + "language_loss": 0.73846787, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.81522858, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10760498, + "step": 10480, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06412197, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01253141, + "epoch": 0.6301518112129866, + "flos": 21840301117440.0, + "grad_norm": 1.7175758648379602, + "language_loss": 0.78970373, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.86646283, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10571289, + "step": 10481, + "time_per_iteration": 2.60512638092041 + }, + { + "auxiliary_loss_clip": 0.06303924, + "auxiliary_loss_mlp": 0.01252426, + "balance_loss_clip": 0.06247687, + "balance_loss_mlp": 0.01251297, + "epoch": 0.6302119344656546, + "flos": 44348429675520.0, + "grad_norm": 0.8754005674495109, + "language_loss": 0.61759591, + "learning_rate": 1.2708028696588e-06, + "loss": 0.69315946, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01131439, + "step": 10482, + "time_per_iteration": 2.8790156841278076 + }, + { + "auxiliary_loss_clip": 0.06422234, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.0627502, + "balance_loss_mlp": 0.01259125, + "epoch": 0.6302720577183225, + "flos": 11222604270720.0, + "grad_norm": 1.8532441203732761, + "language_loss": 0.82836294, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.90529174, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 10483, + "time_per_iteration": 2.5396814346313477 + }, + { + "auxiliary_loss_clip": 0.06401882, + "auxiliary_loss_mlp": 0.01265558, + "balance_loss_clip": 0.06271434, + "balance_loss_mlp": 0.01255873, + "epoch": 0.6303321809709905, + "flos": 27972424869120.0, + "grad_norm": 1.7223788623313236, + "language_loss": 0.72617853, + "learning_rate": 1.270077618961487e-06, + "loss": 0.80285299, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09680176, + "step": 10484, + "time_per_iteration": 2.580455780029297 + }, + { + "auxiliary_loss_clip": 0.06412905, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06272406, + "balance_loss_mlp": 0.01254804, + "epoch": 0.6303923042236586, + "flos": 28228366765440.0, + "grad_norm": 1.5965857276488986, + "language_loss": 0.74397701, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.82075489, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10064697, + "step": 10485, + "time_per_iteration": 2.5941050052642822 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01266454, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255552, + "epoch": 0.6304524274763265, + "flos": 27637546826880.0, + "grad_norm": 2.046844751133349, + "language_loss": 0.81281161, + "learning_rate": 1.269352478979093e-06, + "loss": 0.88966042, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10906982, + "step": 10486, + "time_per_iteration": 2.558913469314575 + }, + { + "auxiliary_loss_clip": 0.06410582, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01254617, + "epoch": 0.6305125507289945, + "flos": 17317062812160.0, + "grad_norm": 2.0599224612771923, + "language_loss": 0.6412251, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.71797758, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1005249, + "step": 10487, + "time_per_iteration": 2.5042107105255127 + }, + { + "auxiliary_loss_clip": 0.06409851, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.01261816, + "epoch": 0.6305726739816624, + "flos": 25814745106560.0, + "grad_norm": 1.4604670858512163, + "language_loss": 0.67510849, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.75192171, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09655762, + "step": 10488, + "time_per_iteration": 4.039014101028442 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01266449, + "balance_loss_clip": 0.06271197, + "balance_loss_mlp": 0.01255827, + "epoch": 0.6306327972343304, + "flos": 21803684083200.0, + "grad_norm": 1.7399651792203026, + "language_loss": 0.67476416, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.75152779, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10620117, + "step": 10489, + "time_per_iteration": 2.522010564804077 + }, + { + "auxiliary_loss_clip": 0.06421866, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.0125256, + "epoch": 0.6306929204869983, + "flos": 20783689931520.0, + "grad_norm": 1.8067939569631877, + "language_loss": 0.69957733, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.77643645, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11486816, + "step": 10490, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01267822, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01257123, + "epoch": 0.6307530437396663, + "flos": 23660084090880.0, + "grad_norm": 1.7944305121470099, + "language_loss": 0.78453183, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.86131787, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10699463, + "step": 10491, + "time_per_iteration": 3.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264458, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253753, + "epoch": 0.6308131669923343, + "flos": 24726170787840.0, + "grad_norm": 2.4094216465826914, + "language_loss": 0.55782068, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.63457322, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10705566, + "step": 10492, + "time_per_iteration": 2.5553138256073 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06272843, + "balance_loss_mlp": 0.01254772, + "epoch": 0.6308732902450023, + "flos": 22572054823680.0, + "grad_norm": 2.1354270064325935, + "language_loss": 0.64787519, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.72465986, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10620117, + "step": 10493, + "time_per_iteration": 2.532414197921753 + }, + { + "auxiliary_loss_clip": 0.06409045, + "auxiliary_loss_mlp": 0.01266138, + "balance_loss_clip": 0.06271107, + "balance_loss_mlp": 0.01255797, + "epoch": 0.6309334134976702, + "flos": 24651050002560.0, + "grad_norm": 1.3969800101414371, + "language_loss": 0.82710558, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.90385741, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10345459, + "step": 10494, + "time_per_iteration": 2.5479516983032227 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01270884, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01260691, + "epoch": 0.6309935367503382, + "flos": 41437655343360.0, + "grad_norm": 1.6454448829725794, + "language_loss": 0.79526448, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.87208128, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10192871, + "step": 10495, + "time_per_iteration": 2.705066204071045 + }, + { + "auxiliary_loss_clip": 0.06412271, + "auxiliary_loss_mlp": 0.01267403, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01257019, + "epoch": 0.6310536600030061, + "flos": 15123772264320.0, + "grad_norm": 1.7689443425086426, + "language_loss": 0.70583153, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.78262818, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1038208, + "step": 10496, + "time_per_iteration": 2.4985408782958984 + }, + { + "auxiliary_loss_clip": 0.06412859, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06273797, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6311137832556741, + "flos": 15237019457280.0, + "grad_norm": 3.784046746171531, + "language_loss": 0.80308318, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.879884, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11016846, + "step": 10497, + "time_per_iteration": 3.934098243713379 + }, + { + "auxiliary_loss_clip": 0.06407946, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06270217, + "balance_loss_mlp": 0.01254495, + "epoch": 0.6311739065083422, + "flos": 22025314932480.0, + "grad_norm": 1.763173694901495, + "language_loss": 0.7404235, + "learning_rate": 1.265003970256247e-06, + "loss": 0.81713974, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09185791, + "step": 10498, + "time_per_iteration": 2.499866485595703 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01257174, + "epoch": 0.6312340297610101, + "flos": 22717349003520.0, + "grad_norm": 2.1933614541595543, + "language_loss": 0.70156991, + "learning_rate": 1.264641775364217e-06, + "loss": 0.77835166, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10308838, + "step": 10499, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06406461, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6312941530136781, + "flos": 24287017939200.0, + "grad_norm": 1.829578685045339, + "language_loss": 0.69904381, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.77578032, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 10500, + "time_per_iteration": 2.5188052654266357 + }, + { + "auxiliary_loss_clip": 0.06409658, + "auxiliary_loss_mlp": 0.01264556, + "balance_loss_clip": 0.06272549, + "balance_loss_mlp": 0.01254412, + "epoch": 0.631354276266346, + "flos": 21732420585600.0, + "grad_norm": 1.7241647945677354, + "language_loss": 0.74330127, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.82004339, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10144043, + "step": 10501, + "time_per_iteration": 2.5523152351379395 + }, + { + "auxiliary_loss_clip": 0.06406975, + "auxiliary_loss_mlp": 0.01265441, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254331, + "epoch": 0.631414399519014, + "flos": 24032040364800.0, + "grad_norm": 1.6086243864849348, + "language_loss": 0.75708318, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.83380735, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11102295, + "step": 10502, + "time_per_iteration": 2.531738519668579 + }, + { + "auxiliary_loss_clip": 0.06415547, + "auxiliary_loss_mlp": 0.01269255, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01258026, + "epoch": 0.6314745227716819, + "flos": 24322586797440.0, + "grad_norm": 1.857189484196882, + "language_loss": 0.85481834, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.93166631, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11230469, + "step": 10503, + "time_per_iteration": 2.552402973175049 + }, + { + "auxiliary_loss_clip": 0.06410381, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06270783, + "balance_loss_mlp": 0.01254061, + "epoch": 0.6315346460243499, + "flos": 23372891821440.0, + "grad_norm": 1.6307573056927078, + "language_loss": 0.86482477, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.94157171, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10253906, + "step": 10504, + "time_per_iteration": 2.5060269832611084 + }, + { + "auxiliary_loss_clip": 0.064176, + "auxiliary_loss_mlp": 0.01268121, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01257142, + "epoch": 0.6315947692770179, + "flos": 20265517082880.0, + "grad_norm": 1.678620058857516, + "language_loss": 0.76972538, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.84658259, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10974121, + "step": 10505, + "time_per_iteration": 2.5305702686309814 + }, + { + "auxiliary_loss_clip": 0.06409689, + "auxiliary_loss_mlp": 0.01264983, + "balance_loss_clip": 0.06271394, + "balance_loss_mlp": 0.01254647, + "epoch": 0.6316548925296859, + "flos": 25273036460160.0, + "grad_norm": 1.9130295201566025, + "language_loss": 0.82312322, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.89986992, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10339355, + "step": 10506, + "time_per_iteration": 2.5286946296691895 + }, + { + "auxiliary_loss_clip": 0.06409711, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.01253164, + "epoch": 0.6317150157823538, + "flos": 22937344698240.0, + "grad_norm": 1.904699510430935, + "language_loss": 0.74647379, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.82321376, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11120605, + "step": 10507, + "time_per_iteration": 2.5269975662231445 + }, + { + "auxiliary_loss_clip": 0.06414819, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01254876, + "epoch": 0.6317751390350218, + "flos": 22533383364480.0, + "grad_norm": 1.9107193302266279, + "language_loss": 0.68296039, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.75977188, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11456299, + "step": 10508, + "time_per_iteration": 2.522627830505371 + }, + { + "auxiliary_loss_clip": 0.06408058, + "auxiliary_loss_mlp": 0.01267063, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.0125662, + "epoch": 0.6318352622876897, + "flos": 23301460615680.0, + "grad_norm": 1.6343142360187424, + "language_loss": 0.70864749, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.78539872, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10449219, + "step": 10509, + "time_per_iteration": 2.542271614074707 + }, + { + "auxiliary_loss_clip": 0.06404234, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06269038, + "balance_loss_mlp": 0.01257901, + "epoch": 0.6318953855403577, + "flos": 20710330081920.0, + "grad_norm": 1.5692460316561092, + "language_loss": 0.79883605, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.87555748, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10003662, + "step": 10510, + "time_per_iteration": 2.5088951587677 + }, + { + "auxiliary_loss_clip": 0.06416003, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.0627503, + "balance_loss_mlp": 0.01255627, + "epoch": 0.6319555087930258, + "flos": 22826613127680.0, + "grad_norm": 1.472787804562701, + "language_loss": 0.71112996, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.78795499, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10870361, + "step": 10511, + "time_per_iteration": 2.5239315032958984 + }, + { + "auxiliary_loss_clip": 0.06404155, + "auxiliary_loss_mlp": 0.01264501, + "balance_loss_clip": 0.06270795, + "balance_loss_mlp": 0.01254863, + "epoch": 0.6320156320456937, + "flos": 19976480023680.0, + "grad_norm": 1.5136926076294552, + "language_loss": 0.80152798, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.87821454, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09631348, + "step": 10512, + "time_per_iteration": 2.4730801582336426 + }, + { + "auxiliary_loss_clip": 0.06412748, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06273789, + "balance_loss_mlp": 0.01258599, + "epoch": 0.6320757552983617, + "flos": 27020256197760.0, + "grad_norm": 1.640445181436539, + "language_loss": 0.71047747, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.7873019, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11090088, + "step": 10513, + "time_per_iteration": 2.554516077041626 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06272392, + "balance_loss_mlp": 0.01255527, + "epoch": 0.6321358785510296, + "flos": 23702696691840.0, + "grad_norm": 1.6086341634408383, + "language_loss": 0.67001855, + "learning_rate": 1.259212205855459e-06, + "loss": 0.74681789, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10394287, + "step": 10514, + "time_per_iteration": 2.519026517868042 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01256491, + "epoch": 0.6321960018036976, + "flos": 26002484179200.0, + "grad_norm": 1.6426182718028832, + "language_loss": 0.74301624, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.81975299, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09851074, + "step": 10515, + "time_per_iteration": 2.6021077632904053 + }, + { + "auxiliary_loss_clip": 0.06406167, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01256054, + "epoch": 0.6322561250563655, + "flos": 22827745157760.0, + "grad_norm": 1.6516346518134952, + "language_loss": 0.90002799, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.9767465, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09631348, + "step": 10516, + "time_per_iteration": 3.9120290279388428 + }, + { + "auxiliary_loss_clip": 0.0641951, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06273714, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6323162483090335, + "flos": 18994234936320.0, + "grad_norm": 1.6653274793264599, + "language_loss": 0.81976604, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.89662409, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12084961, + "step": 10517, + "time_per_iteration": 2.478886127471924 + }, + { + "auxiliary_loss_clip": 0.06409353, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06273272, + "balance_loss_mlp": 0.01254312, + "epoch": 0.6323763715617015, + "flos": 19871324749440.0, + "grad_norm": 1.77487902385547, + "language_loss": 0.77740157, + "learning_rate": 1.257765386189541e-06, + "loss": 0.8541342, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0960083, + "step": 10518, + "time_per_iteration": 2.529668092727661 + }, + { + "auxiliary_loss_clip": 0.06409025, + "auxiliary_loss_mlp": 0.01262964, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01253475, + "epoch": 0.6324364948143695, + "flos": 22789115625600.0, + "grad_norm": 1.399689960822604, + "language_loss": 0.85268837, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.92940825, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0949707, + "step": 10519, + "time_per_iteration": 2.5316224098205566 + }, + { + "auxiliary_loss_clip": 0.06407413, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06274519, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6324966180670374, + "flos": 22242333807360.0, + "grad_norm": 1.7591221317630206, + "language_loss": 0.7227571, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.79947662, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09454346, + "step": 10520, + "time_per_iteration": 2.593050479888916 + }, + { + "auxiliary_loss_clip": 0.06409709, + "auxiliary_loss_mlp": 0.01264525, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6325567413197054, + "flos": 21695593916160.0, + "grad_norm": 1.8135575738100813, + "language_loss": 0.71838474, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.79512703, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10119629, + "step": 10521, + "time_per_iteration": 2.5069823265075684 + }, + { + "auxiliary_loss_clip": 0.06414442, + "auxiliary_loss_mlp": 0.01265675, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01255018, + "epoch": 0.6326168645723733, + "flos": 19943133298560.0, + "grad_norm": 1.6828366730110347, + "language_loss": 0.7199434, + "learning_rate": 1.256319016853377e-06, + "loss": 0.79674459, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10662842, + "step": 10522, + "time_per_iteration": 2.6152310371398926 + }, + { + "auxiliary_loss_clip": 0.06406049, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256897, + "epoch": 0.6326769878250413, + "flos": 20236614624000.0, + "grad_norm": 1.7290468863072455, + "language_loss": 0.8156153, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.89234209, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09735107, + "step": 10523, + "time_per_iteration": 2.5101752281188965 + }, + { + "auxiliary_loss_clip": 0.06411799, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 0.06276074, + "balance_loss_mlp": 0.01256669, + "epoch": 0.6327371110777094, + "flos": 20781803214720.0, + "grad_norm": 1.7543720010709223, + "language_loss": 0.73841488, + "learning_rate": 1.255596001333195e-06, + "loss": 0.81519485, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09527588, + "step": 10524, + "time_per_iteration": 2.5357463359832764 + }, + { + "auxiliary_loss_clip": 0.06421272, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01258977, + "epoch": 0.6327972343303773, + "flos": 30344440176000.0, + "grad_norm": 2.100184187405554, + "language_loss": 0.84972739, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.92663497, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10510254, + "step": 10525, + "time_per_iteration": 2.579566478729248 + }, + { + "auxiliary_loss_clip": 0.06407693, + "auxiliary_loss_mlp": 0.01265026, + "balance_loss_clip": 0.06272401, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6328573575830453, + "flos": 17097947585280.0, + "grad_norm": 1.5662936390284432, + "language_loss": 0.67044812, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.74717528, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09393311, + "step": 10526, + "time_per_iteration": 2.6565749645233154 + }, + { + "auxiliary_loss_clip": 0.06418256, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06276698, + "balance_loss_mlp": 0.01254002, + "epoch": 0.6329174808357132, + "flos": 25054340503680.0, + "grad_norm": 1.744260985628437, + "language_loss": 0.73593014, + "learning_rate": 1.254511689796244e-06, + "loss": 0.81276095, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10821533, + "step": 10527, + "time_per_iteration": 4.000992298126221 + }, + { + "auxiliary_loss_clip": 0.06408013, + "auxiliary_loss_mlp": 0.01264369, + "balance_loss_clip": 0.062744, + "balance_loss_mlp": 0.01255124, + "epoch": 0.6329776040883812, + "flos": 16842466886400.0, + "grad_norm": 2.0238254127026347, + "language_loss": 0.72017205, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.79689586, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 10528, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06410971, + "auxiliary_loss_mlp": 0.01266595, + "balance_loss_clip": 0.0627386, + "balance_loss_mlp": 0.01256647, + "epoch": 0.6330377273410491, + "flos": 13521804779520.0, + "grad_norm": 2.0709634573058966, + "language_loss": 0.67286944, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.74964511, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.0994873, + "step": 10529, + "time_per_iteration": 2.506375551223755 + }, + { + "auxiliary_loss_clip": 0.06417675, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06276476, + "balance_loss_mlp": 0.0125486, + "epoch": 0.6330978505937171, + "flos": 21544471877760.0, + "grad_norm": 1.8153408645192133, + "language_loss": 0.75284207, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.82967925, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11181641, + "step": 10530, + "time_per_iteration": 4.016285419464111 + }, + { + "auxiliary_loss_clip": 0.06421702, + "auxiliary_loss_mlp": 0.01265839, + "balance_loss_clip": 0.06280397, + "balance_loss_mlp": 0.01255557, + "epoch": 0.6331579738463851, + "flos": 25016465658240.0, + "grad_norm": 1.412209042537855, + "language_loss": 0.74000126, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.81687671, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10284424, + "step": 10531, + "time_per_iteration": 2.5478739738464355 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01257093, + "epoch": 0.6332180970990531, + "flos": 14981329123200.0, + "grad_norm": 4.395160978524889, + "language_loss": 0.80356932, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.88033861, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0980835, + "step": 10532, + "time_per_iteration": 2.4629757404327393 + }, + { + "auxiliary_loss_clip": 0.06411614, + "auxiliary_loss_mlp": 0.01265113, + "balance_loss_clip": 0.06276565, + "balance_loss_mlp": 0.01256017, + "epoch": 0.633278220351721, + "flos": 22712904737280.0, + "grad_norm": 1.6509114242634397, + "language_loss": 0.75345361, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.83022094, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09094238, + "step": 10533, + "time_per_iteration": 2.5486817359924316 + }, + { + "auxiliary_loss_clip": 0.06421439, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.0627851, + "balance_loss_mlp": 0.01255586, + "epoch": 0.633338343604389, + "flos": 12607594807680.0, + "grad_norm": 2.155852114283844, + "language_loss": 0.7738024, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.850685, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11224365, + "step": 10534, + "time_per_iteration": 2.447556257247925 + }, + { + "auxiliary_loss_clip": 0.06414493, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06277296, + "balance_loss_mlp": 0.01255314, + "epoch": 0.6333984668570569, + "flos": 25967586153600.0, + "grad_norm": 8.614230799549778, + "language_loss": 0.85787749, + "learning_rate": 1.251621437204777e-06, + "loss": 0.93467951, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10400391, + "step": 10535, + "time_per_iteration": 2.564028739929199 + }, + { + "auxiliary_loss_clip": 0.06413931, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06276763, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6334585901097249, + "flos": 23665953876480.0, + "grad_norm": 1.7881941276129079, + "language_loss": 0.76803362, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.84483141, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10235596, + "step": 10536, + "time_per_iteration": 4.017718315124512 + }, + { + "auxiliary_loss_clip": 0.06411674, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01254848, + "epoch": 0.633518713362393, + "flos": 28766930883840.0, + "grad_norm": 1.5924161290871786, + "language_loss": 0.6050871, + "learning_rate": 1.250899157568855e-06, + "loss": 0.68185055, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09820557, + "step": 10537, + "time_per_iteration": 2.575690746307373 + }, + { + "auxiliary_loss_clip": 0.0632174, + "auxiliary_loss_mlp": 0.01257375, + "balance_loss_clip": 0.06265318, + "balance_loss_mlp": 0.01256043, + "epoch": 0.6335788366150609, + "flos": 70438669407360.0, + "grad_norm": 0.7645314683588974, + "language_loss": 0.5222913, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.59808248, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.56298828, + "router_z_loss_mlp": 0.01334381, + "step": 10538, + "time_per_iteration": 3.254763126373291 + }, + { + "auxiliary_loss_clip": 0.06417011, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06275439, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6336389598677289, + "flos": 23738768674560.0, + "grad_norm": 1.8043673999860153, + "language_loss": 0.83927584, + "learning_rate": 1.250176991556848e-06, + "loss": 0.91612864, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10888672, + "step": 10539, + "time_per_iteration": 2.533168315887451 + }, + { + "auxiliary_loss_clip": 0.06413823, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06273531, + "balance_loss_mlp": 0.01254526, + "epoch": 0.6336990831203968, + "flos": 29284097483520.0, + "grad_norm": 1.5633861305622094, + "language_loss": 0.87373441, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.95052767, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10980225, + "step": 10540, + "time_per_iteration": 2.5700464248657227 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01263695, + "balance_loss_clip": 0.06275897, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6337592063730648, + "flos": 29104659964800.0, + "grad_norm": 1.757260374288504, + "language_loss": 0.7308234, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.80756426, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.090271, + "step": 10541, + "time_per_iteration": 2.5605950355529785 + }, + { + "auxiliary_loss_clip": 0.06419826, + "auxiliary_loss_mlp": 0.01267808, + "balance_loss_clip": 0.06276362, + "balance_loss_mlp": 0.01255934, + "epoch": 0.6338193296257327, + "flos": 34713705404160.0, + "grad_norm": 3.0522247844622217, + "language_loss": 0.85394645, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.93082273, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11865234, + "step": 10542, + "time_per_iteration": 2.711641788482666 + }, + { + "auxiliary_loss_clip": 0.06413235, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01253477, + "epoch": 0.6338794528784008, + "flos": 16692644586240.0, + "grad_norm": 1.6414110705076674, + "language_loss": 0.77927899, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.85605824, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11224365, + "step": 10543, + "time_per_iteration": 2.4868364334106445 + }, + { + "auxiliary_loss_clip": 0.06406207, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01254049, + "epoch": 0.6339395761310687, + "flos": 22353233086080.0, + "grad_norm": 1.4561914884468037, + "language_loss": 0.73388422, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.81058121, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09442139, + "step": 10544, + "time_per_iteration": 2.5364322662353516 + }, + { + "auxiliary_loss_clip": 0.06420652, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06277749, + "balance_loss_mlp": 0.0125501, + "epoch": 0.6339996993837367, + "flos": 18557765418240.0, + "grad_norm": 2.1124884217915953, + "language_loss": 0.68196738, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.7588315, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10736084, + "step": 10545, + "time_per_iteration": 2.498805284500122 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06273287, + "balance_loss_mlp": 0.01254217, + "epoch": 0.6340598226364046, + "flos": 12974519836800.0, + "grad_norm": 1.9119054748089928, + "language_loss": 0.71463943, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.79135519, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09710693, + "step": 10546, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.012635, + "balance_loss_clip": 0.06273207, + "balance_loss_mlp": 0.01254214, + "epoch": 0.6341199458890726, + "flos": 26695272936960.0, + "grad_norm": 1.3275160208019028, + "language_loss": 0.78403944, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.86071861, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09283447, + "step": 10547, + "time_per_iteration": 2.565394639968872 + }, + { + "auxiliary_loss_clip": 0.06415725, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.0627535, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6341800691417405, + "flos": 18740263610880.0, + "grad_norm": 1.5896144863347355, + "language_loss": 0.63801014, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.71483326, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10131836, + "step": 10548, + "time_per_iteration": 2.531881332397461 + }, + { + "auxiliary_loss_clip": 0.06408647, + "auxiliary_loss_mlp": 0.01263438, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01253746, + "epoch": 0.6342401923944085, + "flos": 26256539358720.0, + "grad_norm": 1.5473137822842997, + "language_loss": 0.61999178, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.69671261, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 10549, + "time_per_iteration": 2.590090274810791 + }, + { + "auxiliary_loss_clip": 0.06413013, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06276116, + "balance_loss_mlp": 0.01256047, + "epoch": 0.6343003156470765, + "flos": 24687834744960.0, + "grad_norm": 1.5414529536537591, + "language_loss": 0.74040842, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.81719744, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09844971, + "step": 10550, + "time_per_iteration": 2.575768232345581 + }, + { + "auxiliary_loss_clip": 0.06314379, + "auxiliary_loss_mlp": 0.01254446, + "balance_loss_clip": 0.06258175, + "balance_loss_mlp": 0.01252981, + "epoch": 0.6343604388997445, + "flos": 69824481379200.0, + "grad_norm": 0.6831342981577847, + "language_loss": 0.57712334, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.65281159, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01463318, + "step": 10551, + "time_per_iteration": 3.169085741043091 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06273001, + "balance_loss_mlp": 0.01255257, + "epoch": 0.6344205621524125, + "flos": 21989117168640.0, + "grad_norm": 1.9821146557890166, + "language_loss": 0.67052966, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.74725866, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09454346, + "step": 10552, + "time_per_iteration": 2.51409649848938 + }, + { + "auxiliary_loss_clip": 0.06415403, + "auxiliary_loss_mlp": 0.01263367, + "balance_loss_clip": 0.0627457, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6344806854050804, + "flos": 20455100945280.0, + "grad_norm": 1.6854116098373486, + "language_loss": 0.82256383, + "learning_rate": 1.24512502014147e-06, + "loss": 0.89935154, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 10553, + "time_per_iteration": 2.5263893604278564 + }, + { + "auxiliary_loss_clip": 0.06412624, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.01256021, + "epoch": 0.6345408086577484, + "flos": 40519294594560.0, + "grad_norm": 1.7209630881675668, + "language_loss": 0.55282557, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.629614, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10205078, + "step": 10554, + "time_per_iteration": 2.6742208003997803 + }, + { + "auxiliary_loss_clip": 0.06412828, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06274945, + "balance_loss_mlp": 0.01255701, + "epoch": 0.6346009319104163, + "flos": 21367759616640.0, + "grad_norm": 1.6547697162667994, + "language_loss": 0.7092278, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.78601682, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10369873, + "step": 10555, + "time_per_iteration": 2.5059010982513428 + }, + { + "auxiliary_loss_clip": 0.06308, + "auxiliary_loss_mlp": 0.01256771, + "balance_loss_clip": 0.06252061, + "balance_loss_mlp": 0.01255482, + "epoch": 0.6346610551630844, + "flos": 71383333138560.0, + "grad_norm": 0.7594485734837986, + "language_loss": 0.5526008, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.62824851, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01290131, + "step": 10556, + "time_per_iteration": 4.480233669281006 + }, + { + "auxiliary_loss_clip": 0.0641848, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06276923, + "balance_loss_mlp": 0.01257227, + "epoch": 0.6347211784157523, + "flos": 25418666056320.0, + "grad_norm": 1.720664259353744, + "language_loss": 0.68248415, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.75934947, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10827637, + "step": 10557, + "time_per_iteration": 2.5347533226013184 + }, + { + "auxiliary_loss_clip": 0.06415346, + "auxiliary_loss_mlp": 0.01266286, + "balance_loss_clip": 0.06277986, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6347813016684203, + "flos": 15748274344320.0, + "grad_norm": 1.7185775847351308, + "language_loss": 0.7034533, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.78026962, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10211182, + "step": 10558, + "time_per_iteration": 2.5184271335601807 + }, + { + "auxiliary_loss_clip": 0.06415297, + "auxiliary_loss_mlp": 0.01267927, + "balance_loss_clip": 0.06277342, + "balance_loss_mlp": 0.01257812, + "epoch": 0.6348414249210882, + "flos": 21470231560320.0, + "grad_norm": 1.5690247234550625, + "language_loss": 0.78373873, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.86057091, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10113525, + "step": 10559, + "time_per_iteration": 2.5017571449279785 + }, + { + "auxiliary_loss_clip": 0.06413186, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.01257404, + "epoch": 0.6349015481737562, + "flos": 21659521933440.0, + "grad_norm": 1.6584174732731671, + "language_loss": 0.68334514, + "learning_rate": 1.242601136020078e-06, + "loss": 0.76015741, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10638428, + "step": 10560, + "time_per_iteration": 2.536973237991333 + }, + { + "auxiliary_loss_clip": 0.06413247, + "auxiliary_loss_mlp": 0.01267835, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01257679, + "epoch": 0.6349616714264241, + "flos": 22200643601280.0, + "grad_norm": 1.5868389258687317, + "language_loss": 0.77125943, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.84807026, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10150146, + "step": 10561, + "time_per_iteration": 2.5515172481536865 + }, + { + "auxiliary_loss_clip": 0.06412898, + "auxiliary_loss_mlp": 0.01265705, + "balance_loss_clip": 0.06273612, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6350217946790921, + "flos": 25417324391040.0, + "grad_norm": 1.8175837603303404, + "language_loss": 0.72219515, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.79898125, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11279297, + "step": 10562, + "time_per_iteration": 2.563812255859375 + }, + { + "auxiliary_loss_clip": 0.06418765, + "auxiliary_loss_mlp": 0.0126928, + "balance_loss_clip": 0.0627933, + "balance_loss_mlp": 0.01258808, + "epoch": 0.63508191793176, + "flos": 19725024320640.0, + "grad_norm": 1.9663518722420297, + "language_loss": 0.81324869, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.89012909, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10473633, + "step": 10563, + "time_per_iteration": 2.618112087249756 + }, + { + "auxiliary_loss_clip": 0.06424198, + "auxiliary_loss_mlp": 0.01272987, + "balance_loss_clip": 0.06281862, + "balance_loss_mlp": 0.01262092, + "epoch": 0.6351420411844281, + "flos": 18192810960000.0, + "grad_norm": 2.213984919304992, + "language_loss": 0.81394589, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.89091778, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10913086, + "step": 10564, + "time_per_iteration": 2.54693341255188 + }, + { + "auxiliary_loss_clip": 0.06417058, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06278841, + "balance_loss_mlp": 0.01256272, + "epoch": 0.6352021644370961, + "flos": 33734437136640.0, + "grad_norm": 2.2491852390349614, + "language_loss": 0.73082668, + "learning_rate": 1.240799222993407e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09796143, + "step": 10565, + "time_per_iteration": 2.6810452938079834 + }, + { + "auxiliary_loss_clip": 0.06416303, + "auxiliary_loss_mlp": 0.01267579, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01256093, + "epoch": 0.635262287689764, + "flos": 20380818700800.0, + "grad_norm": 2.01281164224499, + "language_loss": 0.68792611, + "learning_rate": 1.240438926700324e-06, + "loss": 0.7647649, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.1149292, + "step": 10566, + "time_per_iteration": 2.5485215187072754 + }, + { + "auxiliary_loss_clip": 0.06410012, + "auxiliary_loss_mlp": 0.01265054, + "balance_loss_clip": 0.06277308, + "balance_loss_mlp": 0.01255022, + "epoch": 0.635322410942432, + "flos": 27532559260800.0, + "grad_norm": 1.717445195940493, + "language_loss": 0.69661963, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.77337033, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.1003418, + "step": 10567, + "time_per_iteration": 4.017431974411011 + }, + { + "auxiliary_loss_clip": 0.064077, + "auxiliary_loss_mlp": 0.01265057, + "balance_loss_clip": 0.06274484, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6353825341950999, + "flos": 21550048174080.0, + "grad_norm": 1.9561940375454367, + "language_loss": 0.84912741, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.92585498, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09906006, + "step": 10568, + "time_per_iteration": 2.528050422668457 + }, + { + "auxiliary_loss_clip": 0.06414051, + "auxiliary_loss_mlp": 0.012645, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.01254427, + "epoch": 0.635442657447768, + "flos": 31767934464000.0, + "grad_norm": 1.8080598645215213, + "language_loss": 0.84412146, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.92090696, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10070801, + "step": 10569, + "time_per_iteration": 2.6543846130371094 + }, + { + "auxiliary_loss_clip": 0.06409843, + "auxiliary_loss_mlp": 0.01268445, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.0125811, + "epoch": 0.6355027807004359, + "flos": 19835001204480.0, + "grad_norm": 1.4845804125044393, + "language_loss": 0.69596767, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.77275056, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10327148, + "step": 10570, + "time_per_iteration": 3.903024435043335 + }, + { + "auxiliary_loss_clip": 0.06413252, + "auxiliary_loss_mlp": 0.01264199, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01253989, + "epoch": 0.6355629039531039, + "flos": 30380176742400.0, + "grad_norm": 1.6479967140904772, + "language_loss": 0.66236866, + "learning_rate": 1.2386378775476e-06, + "loss": 0.73914319, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10211182, + "step": 10571, + "time_per_iteration": 2.571477174758911 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266469, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256097, + "epoch": 0.6356230272057718, + "flos": 17938001093760.0, + "grad_norm": 1.5990791790465455, + "language_loss": 0.71629465, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.79312837, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10375977, + "step": 10572, + "time_per_iteration": 2.466371774673462 + }, + { + "auxiliary_loss_clip": 0.06409136, + "auxiliary_loss_mlp": 0.0126563, + "balance_loss_clip": 0.06273179, + "balance_loss_mlp": 0.01255623, + "epoch": 0.6356831504584398, + "flos": 25383139125120.0, + "grad_norm": 1.3707006156469355, + "language_loss": 0.81310254, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.88985026, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10015869, + "step": 10573, + "time_per_iteration": 2.5966269969940186 + }, + { + "auxiliary_loss_clip": 0.06417162, + "auxiliary_loss_mlp": 0.01267057, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255899, + "epoch": 0.6357432737111077, + "flos": 46511029630080.0, + "grad_norm": 1.745983210040395, + "language_loss": 0.68758935, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.76443154, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11151123, + "step": 10574, + "time_per_iteration": 2.7297935485839844 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01254513, + "epoch": 0.6358033969637757, + "flos": 17280026507520.0, + "grad_norm": 2.032779061466396, + "language_loss": 0.8712132, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.9479835, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10540771, + "step": 10575, + "time_per_iteration": 2.505861520767212 + }, + { + "auxiliary_loss_clip": 0.06410281, + "auxiliary_loss_mlp": 0.01265614, + "balance_loss_clip": 0.06273504, + "balance_loss_mlp": 0.01255946, + "epoch": 0.6358635202164437, + "flos": 27132832558080.0, + "grad_norm": 1.4971132099643523, + "language_loss": 0.72510445, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.80186343, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09667969, + "step": 10576, + "time_per_iteration": 3.991710901260376 + }, + { + "auxiliary_loss_clip": 0.06415755, + "auxiliary_loss_mlp": 0.01267596, + "balance_loss_clip": 0.06276268, + "balance_loss_mlp": 0.01257368, + "epoch": 0.6359236434691117, + "flos": 27532307698560.0, + "grad_norm": 1.4171583307321047, + "language_loss": 0.6902113, + "learning_rate": 1.236477571455085e-06, + "loss": 0.76704478, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10223389, + "step": 10577, + "time_per_iteration": 2.553823947906494 + }, + { + "auxiliary_loss_clip": 0.06410993, + "auxiliary_loss_mlp": 0.01267287, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01257613, + "epoch": 0.6359837667217797, + "flos": 39357653915520.0, + "grad_norm": 1.7634862953282429, + "language_loss": 0.72702098, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.8038038, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09674072, + "step": 10578, + "time_per_iteration": 2.7065927982330322 + }, + { + "auxiliary_loss_clip": 0.06310344, + "auxiliary_loss_mlp": 0.01250981, + "balance_loss_clip": 0.06254056, + "balance_loss_mlp": 0.0124968, + "epoch": 0.6360438899744476, + "flos": 56430472475520.0, + "grad_norm": 0.7091193353039391, + "language_loss": 0.54502332, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.62063658, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01301575, + "step": 10579, + "time_per_iteration": 3.198455333709717 + }, + { + "auxiliary_loss_clip": 0.06409089, + "auxiliary_loss_mlp": 0.01264424, + "balance_loss_clip": 0.06272582, + "balance_loss_mlp": 0.01254369, + "epoch": 0.6361040132271156, + "flos": 24980100186240.0, + "grad_norm": 1.5151266119166613, + "language_loss": 0.77508366, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.8518188, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1005249, + "step": 10580, + "time_per_iteration": 2.5561928749084473 + }, + { + "auxiliary_loss_clip": 0.06411447, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01259043, + "epoch": 0.6361641364797835, + "flos": 23266059465600.0, + "grad_norm": 1.9638125336396983, + "language_loss": 0.66766918, + "learning_rate": 1.235037946268301e-06, + "loss": 0.74447519, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10113525, + "step": 10581, + "time_per_iteration": 2.5164785385131836 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.06273356, + "balance_loss_mlp": 0.01254683, + "epoch": 0.6362242597324516, + "flos": 26001645638400.0, + "grad_norm": 1.4228320252439628, + "language_loss": 0.6843577, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.76110947, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09783936, + "step": 10582, + "time_per_iteration": 2.6015806198120117 + }, + { + "auxiliary_loss_clip": 0.06416672, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06276425, + "balance_loss_mlp": 0.01254203, + "epoch": 0.6362843829851195, + "flos": 25710428373120.0, + "grad_norm": 2.448331234664856, + "language_loss": 0.84422374, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.92103791, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10552979, + "step": 10583, + "time_per_iteration": 2.5657055377960205 + }, + { + "auxiliary_loss_clip": 0.06411825, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06278308, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6363445062377875, + "flos": 20529341262720.0, + "grad_norm": 1.5773260338409785, + "language_loss": 0.75534987, + "learning_rate": 1.233958531908538e-06, + "loss": 0.83213151, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 10584, + "time_per_iteration": 2.527031421661377 + }, + { + "auxiliary_loss_clip": 0.06414576, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.01256139, + "epoch": 0.6364046294904554, + "flos": 19469879038080.0, + "grad_norm": 1.7122506045265105, + "language_loss": 0.73591262, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.81273478, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11505127, + "step": 10585, + "time_per_iteration": 2.4975733757019043 + }, + { + "auxiliary_loss_clip": 0.06413724, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01254981, + "epoch": 0.6364647527431234, + "flos": 21002176252800.0, + "grad_norm": 1.805788279769041, + "language_loss": 0.83174026, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.9085263, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09906006, + "step": 10586, + "time_per_iteration": 2.531947612762451 + }, + { + "auxiliary_loss_clip": 0.06412107, + "auxiliary_loss_mlp": 0.0126422, + "balance_loss_clip": 0.06275982, + "balance_loss_mlp": 0.0125435, + "epoch": 0.6365248759957913, + "flos": 25777079896320.0, + "grad_norm": 1.5441547949198797, + "language_loss": 0.72916567, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.80592889, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09863281, + "step": 10587, + "time_per_iteration": 2.589169979095459 + }, + { + "auxiliary_loss_clip": 0.06412084, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.062725, + "balance_loss_mlp": 0.01254014, + "epoch": 0.6365849992484593, + "flos": 22462161793920.0, + "grad_norm": 2.0110608871651823, + "language_loss": 0.77360207, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.85036743, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10443115, + "step": 10588, + "time_per_iteration": 2.5107719898223877 + }, + { + "auxiliary_loss_clip": 0.06404337, + "auxiliary_loss_mlp": 0.01266834, + "balance_loss_clip": 0.06271751, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6366451225011273, + "flos": 19031648584320.0, + "grad_norm": 1.403923680448765, + "language_loss": 0.79945314, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.87616491, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10266113, + "step": 10589, + "time_per_iteration": 2.5198166370391846 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01266892, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01256616, + "epoch": 0.6367052457537953, + "flos": 25235413176960.0, + "grad_norm": 1.9669131634706534, + "language_loss": 0.67181933, + "learning_rate": 1.231800487863257e-06, + "loss": 0.74856544, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1027832, + "step": 10590, + "time_per_iteration": 2.5376667976379395 + }, + { + "auxiliary_loss_clip": 0.0642258, + "auxiliary_loss_mlp": 0.01266478, + "balance_loss_clip": 0.06278451, + "balance_loss_mlp": 0.01254945, + "epoch": 0.6367653690064633, + "flos": 19214482193280.0, + "grad_norm": 1.635127472973657, + "language_loss": 0.7910291, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.86791968, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11523438, + "step": 10591, + "time_per_iteration": 2.542515993118286 + }, + { + "auxiliary_loss_clip": 0.06405823, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06271368, + "balance_loss_mlp": 0.0125564, + "epoch": 0.6368254922591312, + "flos": 23553000172800.0, + "grad_norm": 1.3721943309197018, + "language_loss": 0.89071333, + "learning_rate": 1.231081372744317e-06, + "loss": 0.96742344, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09552002, + "step": 10592, + "time_per_iteration": 2.51094126701355 + }, + { + "auxiliary_loss_clip": 0.06405515, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06272536, + "balance_loss_mlp": 0.01256906, + "epoch": 0.6368856155117992, + "flos": 26474270993280.0, + "grad_norm": 1.3189503052137, + "language_loss": 0.68928409, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.76600361, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09533691, + "step": 10593, + "time_per_iteration": 2.5533511638641357 + }, + { + "auxiliary_loss_clip": 0.06408849, + "auxiliary_loss_mlp": 0.01266265, + "balance_loss_clip": 0.06273521, + "balance_loss_mlp": 0.01256329, + "epoch": 0.6369457387644671, + "flos": 33700754995200.0, + "grad_norm": 1.6851555086975611, + "language_loss": 0.6369772, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.71372831, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09942627, + "step": 10594, + "time_per_iteration": 2.6149699687957764 + }, + { + "auxiliary_loss_clip": 0.06308158, + "auxiliary_loss_mlp": 0.01250909, + "balance_loss_clip": 0.06252004, + "balance_loss_mlp": 0.01249539, + "epoch": 0.6370058620171352, + "flos": 70929365316480.0, + "grad_norm": 0.7572264790485472, + "language_loss": 0.54663223, + "learning_rate": 1.230002918781022e-06, + "loss": 0.6222229, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01372528, + "step": 10595, + "time_per_iteration": 4.630947589874268 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275225, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6370659852698031, + "flos": 21148267046400.0, + "grad_norm": 1.6750235845380184, + "language_loss": 0.66897941, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10858154, + "step": 10596, + "time_per_iteration": 2.550053834915161 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.01266417, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01256022, + "epoch": 0.6371261085224711, + "flos": 20199452538240.0, + "grad_norm": 4.2038058583126405, + "language_loss": 0.79555941, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.87233055, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10400391, + "step": 10597, + "time_per_iteration": 2.5332624912261963 + }, + { + "auxiliary_loss_clip": 0.06414443, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06275137, + "balance_loss_mlp": 0.01255446, + "epoch": 0.637186231775139, + "flos": 19689790878720.0, + "grad_norm": 1.6206633129115742, + "language_loss": 0.7509104, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.82771772, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10852051, + "step": 10598, + "time_per_iteration": 2.5732879638671875 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01263084, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253381, + "epoch": 0.637246355027807, + "flos": 13074937355520.0, + "grad_norm": 1.7290939316313776, + "language_loss": 0.68839526, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.76514107, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.0970459, + "step": 10599, + "time_per_iteration": 2.476140260696411 + }, + { + "auxiliary_loss_clip": 0.06417891, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.01254664, + "epoch": 0.6373064782804749, + "flos": 18228421745280.0, + "grad_norm": 1.9832548083292807, + "language_loss": 0.80652881, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.88337129, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11700439, + "step": 10600, + "time_per_iteration": 2.496344804763794 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06272307, + "balance_loss_mlp": 0.01257626, + "epoch": 0.637366601533143, + "flos": 24505336552320.0, + "grad_norm": 1.383513371134078, + "language_loss": 0.79706007, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.8738054, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09863281, + "step": 10601, + "time_per_iteration": 2.533555269241333 + }, + { + "auxiliary_loss_clip": 0.06416898, + "auxiliary_loss_mlp": 0.01263888, + "balance_loss_clip": 0.06276521, + "balance_loss_mlp": 0.01253332, + "epoch": 0.6374267247858109, + "flos": 26366180826240.0, + "grad_norm": 2.20794570441013, + "language_loss": 0.67092741, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.74773526, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10559082, + "step": 10602, + "time_per_iteration": 2.5890238285064697 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01266008, + "balance_loss_clip": 0.06275181, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6374868480384789, + "flos": 20377254902400.0, + "grad_norm": 1.5742012675871089, + "language_loss": 0.79736137, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.87416112, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10223389, + "step": 10603, + "time_per_iteration": 2.4978857040405273 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01255058, + "epoch": 0.6375469712911469, + "flos": 21002595523200.0, + "grad_norm": 2.075723287568445, + "language_loss": 0.76759392, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.84436482, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10626221, + "step": 10604, + "time_per_iteration": 2.5228052139282227 + }, + { + "auxiliary_loss_clip": 0.0641887, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.062751, + "balance_loss_mlp": 0.01253551, + "epoch": 0.6376070945438148, + "flos": 19721292814080.0, + "grad_norm": 2.969254888536146, + "language_loss": 0.77310598, + "learning_rate": 1.226409972197281e-06, + "loss": 0.84994626, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11608887, + "step": 10605, + "time_per_iteration": 2.4766769409179688 + }, + { + "auxiliary_loss_clip": 0.06417184, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.01254087, + "epoch": 0.6376672177964828, + "flos": 21513137650560.0, + "grad_norm": 1.8415567136743551, + "language_loss": 0.66146404, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.73829126, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11437988, + "step": 10606, + "time_per_iteration": 3.962454080581665 + }, + { + "auxiliary_loss_clip": 0.06409881, + "auxiliary_loss_mlp": 0.01267672, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01257891, + "epoch": 0.6377273410491507, + "flos": 18849905078400.0, + "grad_norm": 1.5392078588294233, + "language_loss": 0.75399411, + "learning_rate": 1.225691734459971e-06, + "loss": 0.8307696, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09777832, + "step": 10607, + "time_per_iteration": 2.481400489807129 + }, + { + "auxiliary_loss_clip": 0.06417431, + "auxiliary_loss_mlp": 0.01270028, + "balance_loss_clip": 0.06278283, + "balance_loss_mlp": 0.01259514, + "epoch": 0.6377874643018188, + "flos": 53073962749440.0, + "grad_norm": 1.6290224643321956, + "language_loss": 0.655065, + "learning_rate": 1.225332659627278e-06, + "loss": 0.73193955, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.1050415, + "step": 10608, + "time_per_iteration": 2.80210018157959 + }, + { + "auxiliary_loss_clip": 0.06314453, + "auxiliary_loss_mlp": 0.01252573, + "balance_loss_clip": 0.0625798, + "balance_loss_mlp": 0.01251221, + "epoch": 0.6378475875544867, + "flos": 65153349417600.0, + "grad_norm": 0.7210390428690479, + "language_loss": 0.5201869, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.59585714, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01354218, + "step": 10609, + "time_per_iteration": 4.542863368988037 + }, + { + "auxiliary_loss_clip": 0.06406713, + "auxiliary_loss_mlp": 0.01266217, + "balance_loss_clip": 0.06272352, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6379077108071547, + "flos": 23009404809600.0, + "grad_norm": 1.4796346735577246, + "language_loss": 0.74981046, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.82653975, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09100342, + "step": 10610, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.06314634, + "auxiliary_loss_mlp": 0.01251771, + "balance_loss_clip": 0.06257996, + "balance_loss_mlp": 0.01250523, + "epoch": 0.6379678340598226, + "flos": 67624425849600.0, + "grad_norm": 0.8350558513372389, + "language_loss": 0.62598002, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.70164406, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01247406, + "step": 10611, + "time_per_iteration": 3.208292245864868 + }, + { + "auxiliary_loss_clip": 0.06416688, + "auxiliary_loss_mlp": 0.01263819, + "balance_loss_clip": 0.06276392, + "balance_loss_mlp": 0.01253513, + "epoch": 0.6380279573124906, + "flos": 29687891109120.0, + "grad_norm": 2.188557109067727, + "language_loss": 0.72870415, + "learning_rate": 1.223896654187282e-06, + "loss": 0.80550921, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10308838, + "step": 10612, + "time_per_iteration": 2.5807394981384277 + }, + { + "auxiliary_loss_clip": 0.06312064, + "auxiliary_loss_mlp": 0.01253142, + "balance_loss_clip": 0.06255382, + "balance_loss_mlp": 0.01251885, + "epoch": 0.6380880805651585, + "flos": 66502435680000.0, + "grad_norm": 0.7266099968525627, + "language_loss": 0.57775903, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.65341103, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.56982422, + "router_z_loss_mlp": 0.01256561, + "step": 10613, + "time_per_iteration": 3.0924766063690186 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01264432, + "balance_loss_clip": 0.06280632, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6381482038178266, + "flos": 23921811918720.0, + "grad_norm": 1.7742162127346608, + "language_loss": 0.75586814, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.832735, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10949707, + "step": 10614, + "time_per_iteration": 2.5669398307800293 + }, + { + "auxiliary_loss_clip": 0.06413062, + "auxiliary_loss_mlp": 0.01263583, + "balance_loss_clip": 0.0627507, + "balance_loss_mlp": 0.01253277, + "epoch": 0.6382083270704945, + "flos": 24249855853440.0, + "grad_norm": 1.866062102155962, + "language_loss": 0.79879516, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.87556159, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10314941, + "step": 10615, + "time_per_iteration": 3.9333317279815674 + }, + { + "auxiliary_loss_clip": 0.06313558, + "auxiliary_loss_mlp": 0.01251207, + "balance_loss_clip": 0.0625703, + "balance_loss_mlp": 0.01249947, + "epoch": 0.6382684503231625, + "flos": 70798452111360.0, + "grad_norm": 0.6364915071256667, + "language_loss": 0.55039352, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.62604117, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01260376, + "step": 10616, + "time_per_iteration": 3.2114999294281006 + }, + { + "auxiliary_loss_clip": 0.06411368, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272632, + "balance_loss_mlp": 0.01252654, + "epoch": 0.6383285735758305, + "flos": 16550411080320.0, + "grad_norm": 1.6623229086008653, + "language_loss": 0.84516096, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.92190546, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10430908, + "step": 10617, + "time_per_iteration": 2.50490665435791 + }, + { + "auxiliary_loss_clip": 0.06414464, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.01255037, + "epoch": 0.6383886968284984, + "flos": 14432702515200.0, + "grad_norm": 1.7049012321551236, + "language_loss": 0.86996436, + "learning_rate": 1.221743529196936e-06, + "loss": 0.94676924, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10998535, + "step": 10618, + "time_per_iteration": 2.4782254695892334 + }, + { + "auxiliary_loss_clip": 0.06414133, + "auxiliary_loss_mlp": 0.01263472, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01253536, + "epoch": 0.6384488200811664, + "flos": 17935191982080.0, + "grad_norm": 1.660467856665914, + "language_loss": 0.73454595, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.81132197, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.09936523, + "step": 10619, + "time_per_iteration": 2.5073039531707764 + }, + { + "auxiliary_loss_clip": 0.06421836, + "auxiliary_loss_mlp": 0.01269484, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.01258475, + "epoch": 0.6385089433338343, + "flos": 18521567654400.0, + "grad_norm": 1.8426309945064288, + "language_loss": 0.7661649, + "learning_rate": 1.221026056814193e-06, + "loss": 0.84307802, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11010742, + "step": 10620, + "time_per_iteration": 2.5297937393188477 + }, + { + "auxiliary_loss_clip": 0.06419566, + "auxiliary_loss_mlp": 0.01267834, + "balance_loss_clip": 0.0628044, + "balance_loss_mlp": 0.01256963, + "epoch": 0.6385690665865024, + "flos": 24760481834880.0, + "grad_norm": 2.368652650522925, + "language_loss": 0.70688897, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.78376299, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 10621, + "time_per_iteration": 2.5605804920196533 + }, + { + "auxiliary_loss_clip": 0.0640726, + "auxiliary_loss_mlp": 0.01264019, + "balance_loss_clip": 0.06274956, + "balance_loss_mlp": 0.01254887, + "epoch": 0.6386291898391703, + "flos": 20126763521280.0, + "grad_norm": 1.5541804815340177, + "language_loss": 0.77669823, + "learning_rate": 1.220308702586529e-06, + "loss": 0.85341108, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09130859, + "step": 10622, + "time_per_iteration": 2.495631217956543 + }, + { + "auxiliary_loss_clip": 0.06408195, + "auxiliary_loss_mlp": 0.0126391, + "balance_loss_clip": 0.06273771, + "balance_loss_mlp": 0.01253903, + "epoch": 0.6386893130918383, + "flos": 16871914396800.0, + "grad_norm": 1.737894673487703, + "language_loss": 0.74773431, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.82445532, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10015869, + "step": 10623, + "time_per_iteration": 2.5214576721191406 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01264001, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254512, + "epoch": 0.6387494363445062, + "flos": 22972913556480.0, + "grad_norm": 1.3339080512049293, + "language_loss": 0.77151477, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.84824026, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09490967, + "step": 10624, + "time_per_iteration": 2.5108532905578613 + }, + { + "auxiliary_loss_clip": 0.064144, + "auxiliary_loss_mlp": 0.01262692, + "balance_loss_clip": 0.06276258, + "balance_loss_mlp": 0.01252637, + "epoch": 0.6388095595971742, + "flos": 22864487973120.0, + "grad_norm": 1.5899649446688702, + "language_loss": 0.80630493, + "learning_rate": 1.21923289302382e-06, + "loss": 0.88307583, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 10625, + "time_per_iteration": 2.5426197052001953 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.0126597, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01254842, + "epoch": 0.6388696828498421, + "flos": 17317314374400.0, + "grad_norm": 1.7136519687434957, + "language_loss": 0.72979832, + "learning_rate": 1.218874349031654e-06, + "loss": 0.80662179, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11126709, + "step": 10626, + "time_per_iteration": 2.494306802749634 + }, + { + "auxiliary_loss_clip": 0.06408393, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06270021, + "balance_loss_mlp": 0.01255015, + "epoch": 0.6389298061025102, + "flos": 17134313057280.0, + "grad_norm": 1.513972649351316, + "language_loss": 0.73141295, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.80815566, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10870361, + "step": 10627, + "time_per_iteration": 2.5244781970977783 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01255391, + "epoch": 0.6389899293551781, + "flos": 27718663178880.0, + "grad_norm": 1.6703880840860492, + "language_loss": 0.66923428, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.74610573, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11773682, + "step": 10628, + "time_per_iteration": 2.575000762939453 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.0627692, + "balance_loss_mlp": 0.01254197, + "epoch": 0.6390500526078461, + "flos": 21222171947520.0, + "grad_norm": 1.956585229435901, + "language_loss": 0.68194425, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.7586931, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10150146, + "step": 10629, + "time_per_iteration": 2.5807948112487793 + }, + { + "auxiliary_loss_clip": 0.06422858, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01258708, + "epoch": 0.6391101758605141, + "flos": 21587671457280.0, + "grad_norm": 1.5207801965767835, + "language_loss": 0.75444686, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.83139372, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.13116455, + "step": 10630, + "time_per_iteration": 2.5017268657684326 + }, + { + "auxiliary_loss_clip": 0.06408527, + "auxiliary_loss_mlp": 0.01264942, + "balance_loss_clip": 0.06272866, + "balance_loss_mlp": 0.01255, + "epoch": 0.639170299113182, + "flos": 19906432410240.0, + "grad_norm": 1.6356950234102068, + "language_loss": 0.70487773, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.78161246, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09942627, + "step": 10631, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06314358, + "auxiliary_loss_mlp": 0.01251531, + "balance_loss_clip": 0.06258033, + "balance_loss_mlp": 0.01250199, + "epoch": 0.63923042236585, + "flos": 69896625344640.0, + "grad_norm": 0.7602289508759135, + "language_loss": 0.62733555, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.70299447, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01333618, + "step": 10632, + "time_per_iteration": 3.190108060836792 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01266129, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01255948, + "epoch": 0.639290545618518, + "flos": 22681486656000.0, + "grad_norm": 2.160270989856127, + "language_loss": 0.66821963, + "learning_rate": 1.216365371217893e-06, + "loss": 0.74498516, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10174561, + "step": 10633, + "time_per_iteration": 2.552823543548584 + }, + { + "auxiliary_loss_clip": 0.06411168, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01256472, + "epoch": 0.639350668871186, + "flos": 19835420474880.0, + "grad_norm": 2.0078331211958638, + "language_loss": 0.82085246, + "learning_rate": 1.216007064569225e-06, + "loss": 0.89763421, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10540771, + "step": 10634, + "time_per_iteration": 3.9264204502105713 + }, + { + "auxiliary_loss_clip": 0.06411835, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06274228, + "balance_loss_mlp": 0.01258585, + "epoch": 0.6394107921238539, + "flos": 20558746846080.0, + "grad_norm": 1.4689992647467067, + "language_loss": 0.75053954, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.82735342, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10968018, + "step": 10635, + "time_per_iteration": 2.4891774654388428 + }, + { + "auxiliary_loss_clip": 0.06409803, + "auxiliary_loss_mlp": 0.01264504, + "balance_loss_clip": 0.06272061, + "balance_loss_mlp": 0.01254878, + "epoch": 0.6394709153765219, + "flos": 25781985360000.0, + "grad_norm": 1.6046642220248264, + "language_loss": 0.71619642, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09637451, + "step": 10636, + "time_per_iteration": 2.5812439918518066 + }, + { + "auxiliary_loss_clip": 0.06415339, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01255352, + "epoch": 0.6395310386291898, + "flos": 17535926476800.0, + "grad_norm": 2.1920700627694867, + "language_loss": 0.73530567, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.81212032, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10772705, + "step": 10637, + "time_per_iteration": 2.485643148422241 + }, + { + "auxiliary_loss_clip": 0.0641741, + "auxiliary_loss_mlp": 0.01266874, + "balance_loss_clip": 0.06276354, + "balance_loss_mlp": 0.01255871, + "epoch": 0.6395911618818578, + "flos": 18594172817280.0, + "grad_norm": 1.7577292466251317, + "language_loss": 0.78289723, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.85974002, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 10638, + "time_per_iteration": 2.482006549835205 + }, + { + "auxiliary_loss_clip": 0.06409052, + "auxiliary_loss_mlp": 0.01264378, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01253655, + "epoch": 0.6396512851345257, + "flos": 28374164069760.0, + "grad_norm": 1.4288466998721474, + "language_loss": 0.815153, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.89188731, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10717773, + "step": 10639, + "time_per_iteration": 2.553853750228882 + }, + { + "auxiliary_loss_clip": 0.06314266, + "auxiliary_loss_mlp": 0.01251751, + "balance_loss_clip": 0.06258021, + "balance_loss_mlp": 0.01250554, + "epoch": 0.6397114083871938, + "flos": 70744728844800.0, + "grad_norm": 0.7996184433796636, + "language_loss": 0.59009802, + "learning_rate": 1.21385784946359e-06, + "loss": 0.66575813, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01195526, + "step": 10640, + "time_per_iteration": 3.0804762840270996 + }, + { + "auxiliary_loss_clip": 0.0640569, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6397715316398617, + "flos": 18147095758080.0, + "grad_norm": 1.6659836554468106, + "language_loss": 0.78961474, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.8663274, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09619141, + "step": 10641, + "time_per_iteration": 2.470735788345337 + }, + { + "auxiliary_loss_clip": 0.06423657, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278598, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6398316548925297, + "flos": 25746668064000.0, + "grad_norm": 2.1982581134788672, + "language_loss": 0.63584703, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.712749, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11120605, + "step": 10642, + "time_per_iteration": 2.572493314743042 + }, + { + "auxiliary_loss_clip": 0.06314563, + "auxiliary_loss_mlp": 0.0125166, + "balance_loss_clip": 0.06258431, + "balance_loss_mlp": 0.01250544, + "epoch": 0.6398917781451977, + "flos": 71231246778240.0, + "grad_norm": 0.888550554325656, + "language_loss": 0.55987263, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.63553476, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01118469, + "step": 10643, + "time_per_iteration": 3.0916545391082764 + }, + { + "auxiliary_loss_clip": 0.06416592, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274326, + "balance_loss_mlp": 0.01255252, + "epoch": 0.6399519013978656, + "flos": 20528083451520.0, + "grad_norm": 1.8692423093064807, + "language_loss": 0.772012, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.84884077, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11022949, + "step": 10644, + "time_per_iteration": 2.523844003677368 + }, + { + "auxiliary_loss_clip": 0.06409791, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6400120246505336, + "flos": 24467503633920.0, + "grad_norm": 1.3560803021320431, + "language_loss": 0.82639438, + "learning_rate": 1.212067656542203e-06, + "loss": 0.90314567, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10675049, + "step": 10645, + "time_per_iteration": 2.546128749847412 + }, + { + "auxiliary_loss_clip": 0.06421367, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06277816, + "balance_loss_mlp": 0.01251997, + "epoch": 0.6400721479032015, + "flos": 28373619018240.0, + "grad_norm": 1.814178451427478, + "language_loss": 0.73952079, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.81637239, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11798096, + "step": 10646, + "time_per_iteration": 3.966240167617798 + }, + { + "auxiliary_loss_clip": 0.06412562, + "auxiliary_loss_mlp": 0.01268277, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01257167, + "epoch": 0.6401322711558696, + "flos": 17821441664640.0, + "grad_norm": 1.9335985649403467, + "language_loss": 0.80623794, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.88304639, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.11114502, + "step": 10647, + "time_per_iteration": 2.497234582901001 + }, + { + "auxiliary_loss_clip": 0.06410154, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01255094, + "epoch": 0.6401923944085375, + "flos": 26037969183360.0, + "grad_norm": 1.5109233302980645, + "language_loss": 0.75784671, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.83459949, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10028076, + "step": 10648, + "time_per_iteration": 2.5445501804351807 + }, + { + "auxiliary_loss_clip": 0.06407083, + "auxiliary_loss_mlp": 0.01263508, + "balance_loss_clip": 0.06269361, + "balance_loss_mlp": 0.01253525, + "epoch": 0.6402525176612055, + "flos": 23593181005440.0, + "grad_norm": 1.948589206417596, + "language_loss": 0.79203671, + "learning_rate": 1.210636039936138e-06, + "loss": 0.86874264, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10649, + "time_per_iteration": 3.9821319580078125 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.06272741, + "balance_loss_mlp": 0.01254222, + "epoch": 0.6403126409138734, + "flos": 18047349072000.0, + "grad_norm": 2.12746104130849, + "language_loss": 0.75310314, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.82986802, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10650, + "time_per_iteration": 2.488818883895874 + }, + { + "auxiliary_loss_clip": 0.06408805, + "auxiliary_loss_mlp": 0.01268267, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01256513, + "epoch": 0.6403727641665414, + "flos": 21985679151360.0, + "grad_norm": 1.3966136649863612, + "language_loss": 0.70929539, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.78606611, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11743164, + "step": 10651, + "time_per_iteration": 2.5219950675964355 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06274731, + "balance_loss_mlp": 0.0125824, + "epoch": 0.6404328874192093, + "flos": 24901751018880.0, + "grad_norm": 2.1293665277256624, + "language_loss": 0.64404488, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.72087055, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11273193, + "step": 10652, + "time_per_iteration": 2.5231480598449707 + }, + { + "auxiliary_loss_clip": 0.06411535, + "auxiliary_loss_mlp": 0.01262653, + "balance_loss_clip": 0.06274502, + "balance_loss_mlp": 0.01252509, + "epoch": 0.6404930106718774, + "flos": 17601991021440.0, + "grad_norm": 1.8908665793351147, + "language_loss": 0.79652649, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.87326837, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10150146, + "step": 10653, + "time_per_iteration": 2.5704574584960938 + }, + { + "auxiliary_loss_clip": 0.06425246, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06277368, + "balance_loss_mlp": 0.01259973, + "epoch": 0.6405531339245453, + "flos": 20164219096320.0, + "grad_norm": 2.6567000735134463, + "language_loss": 0.70885104, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.78582001, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11682129, + "step": 10654, + "time_per_iteration": 2.534069061279297 + }, + { + "auxiliary_loss_clip": 0.0641733, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06274031, + "balance_loss_mlp": 0.0125574, + "epoch": 0.6406132571772133, + "flos": 21948349357440.0, + "grad_norm": 1.5377239110005414, + "language_loss": 0.72583055, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.80267668, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11553955, + "step": 10655, + "time_per_iteration": 3.9230480194091797 + }, + { + "auxiliary_loss_clip": 0.06412716, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.06272289, + "balance_loss_mlp": 0.01257074, + "epoch": 0.6406733804298813, + "flos": 28775693635200.0, + "grad_norm": 1.9128350177290707, + "language_loss": 0.82931209, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.90612656, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11657715, + "step": 10656, + "time_per_iteration": 2.601238489151001 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01257817, + "epoch": 0.6407335036825492, + "flos": 17462943970560.0, + "grad_norm": 3.923220638478792, + "language_loss": 0.72232449, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.79911268, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10516357, + "step": 10657, + "time_per_iteration": 2.478569984436035 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.01261766, + "epoch": 0.6407936269352172, + "flos": 22131476455680.0, + "grad_norm": 1.5017144440006371, + "language_loss": 0.77455044, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.85138589, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10272217, + "step": 10658, + "time_per_iteration": 2.6262331008911133 + }, + { + "auxiliary_loss_clip": 0.06414957, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6408537501878852, + "flos": 23117033779200.0, + "grad_norm": 1.5568653096914684, + "language_loss": 0.76262242, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.83944625, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10943604, + "step": 10659, + "time_per_iteration": 2.5234532356262207 + }, + { + "auxiliary_loss_clip": 0.06413037, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01253719, + "epoch": 0.6409138734405532, + "flos": 16478099406720.0, + "grad_norm": 1.5970917751630926, + "language_loss": 0.77884215, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.85561204, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10229492, + "step": 10660, + "time_per_iteration": 2.522568941116333 + }, + { + "auxiliary_loss_clip": 0.0642052, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01253796, + "epoch": 0.6409739966932211, + "flos": 22783539329280.0, + "grad_norm": 1.8503290839739344, + "language_loss": 0.6901319, + "learning_rate": 1.206344067135727e-06, + "loss": 0.7669934, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11834717, + "step": 10661, + "time_per_iteration": 2.5030124187469482 + }, + { + "auxiliary_loss_clip": 0.06407891, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06273415, + "balance_loss_mlp": 0.01259017, + "epoch": 0.6410341199458891, + "flos": 25158489528960.0, + "grad_norm": 1.7100659203746285, + "language_loss": 0.7628997, + "learning_rate": 1.205986598033362e-06, + "loss": 0.83967084, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10205078, + "step": 10662, + "time_per_iteration": 2.5515527725219727 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265377, + "balance_loss_clip": 0.06272576, + "balance_loss_mlp": 0.01255507, + "epoch": 0.641094243198557, + "flos": 27052428965760.0, + "grad_norm": 1.7631594614441006, + "language_loss": 0.69671446, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.77348244, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09863281, + "step": 10663, + "time_per_iteration": 2.5377395153045654 + }, + { + "auxiliary_loss_clip": 0.06414999, + "auxiliary_loss_mlp": 0.01271226, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01258876, + "epoch": 0.641154366451225, + "flos": 25381629751680.0, + "grad_norm": 1.9040182096837255, + "language_loss": 0.68253797, + "learning_rate": 1.205271750169389e-06, + "loss": 0.75940025, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.12353516, + "step": 10664, + "time_per_iteration": 2.5686044692993164 + }, + { + "auxiliary_loss_clip": 0.06408753, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271468, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6412144897038929, + "flos": 25159998902400.0, + "grad_norm": 1.8980640494634613, + "language_loss": 0.66647685, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.74322122, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 10665, + "time_per_iteration": 2.5681324005126953 + }, + { + "auxiliary_loss_clip": 0.06406175, + "auxiliary_loss_mlp": 0.01263975, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01254027, + "epoch": 0.641274612956561, + "flos": 23447509482240.0, + "grad_norm": 1.7797122960809293, + "language_loss": 0.64406478, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.72076625, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 10666, + "time_per_iteration": 2.560159921646118 + }, + { + "auxiliary_loss_clip": 0.06411792, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06272641, + "balance_loss_mlp": 0.01256556, + "epoch": 0.6413347362092289, + "flos": 19433597420160.0, + "grad_norm": 1.633933286881918, + "language_loss": 0.70997214, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.78676403, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10839844, + "step": 10667, + "time_per_iteration": 2.478955030441284 + }, + { + "auxiliary_loss_clip": 0.06424954, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.0627383, + "balance_loss_mlp": 0.01258004, + "epoch": 0.6413948594618969, + "flos": 17201425777920.0, + "grad_norm": 2.6317109326582204, + "language_loss": 0.78275955, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.85971272, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12359619, + "step": 10668, + "time_per_iteration": 2.5198874473571777 + }, + { + "auxiliary_loss_clip": 0.06411108, + "auxiliary_loss_mlp": 0.01270624, + "balance_loss_clip": 0.06274307, + "balance_loss_mlp": 0.01259913, + "epoch": 0.6414549827145648, + "flos": 22275764386560.0, + "grad_norm": 1.497004648642511, + "language_loss": 0.67674375, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.75356108, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10705566, + "step": 10669, + "time_per_iteration": 2.589388132095337 + }, + { + "auxiliary_loss_clip": 0.06420371, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.0627445, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6415151059672328, + "flos": 19645291560960.0, + "grad_norm": 1.6345904804173623, + "language_loss": 0.7890048, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.86586452, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11224365, + "step": 10670, + "time_per_iteration": 2.539581537246704 + }, + { + "auxiliary_loss_clip": 0.06415358, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06272778, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6415752292199008, + "flos": 14871016823040.0, + "grad_norm": 2.295733548922842, + "language_loss": 0.88453639, + "learning_rate": 1.20277073264638e-06, + "loss": 0.96132886, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11138916, + "step": 10671, + "time_per_iteration": 2.477959632873535 + }, + { + "auxiliary_loss_clip": 0.06407315, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01253591, + "epoch": 0.6416353524725688, + "flos": 13740710371200.0, + "grad_norm": 1.4227697494992897, + "language_loss": 0.6938256, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.77053344, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09893799, + "step": 10672, + "time_per_iteration": 2.5083000659942627 + }, + { + "auxiliary_loss_clip": 0.06417342, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.0125343, + "epoch": 0.6416954757252368, + "flos": 24541785878400.0, + "grad_norm": 1.8997700971465656, + "language_loss": 0.74453592, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.82137227, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.128479, + "step": 10673, + "time_per_iteration": 3.9653780460357666 + }, + { + "auxiliary_loss_clip": 0.06409254, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6417555989779047, + "flos": 27717531148800.0, + "grad_norm": 1.5327640795153767, + "language_loss": 0.69868958, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.77546132, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10961914, + "step": 10674, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01264104, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253376, + "epoch": 0.6418157222305727, + "flos": 20562604133760.0, + "grad_norm": 1.803070032007693, + "language_loss": 0.67809439, + "learning_rate": 1.201342244560338e-06, + "loss": 0.75490659, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10736084, + "step": 10675, + "time_per_iteration": 2.508819580078125 + }, + { + "auxiliary_loss_clip": 0.06411684, + "auxiliary_loss_mlp": 0.01266305, + "balance_loss_clip": 0.06274499, + "balance_loss_mlp": 0.01255648, + "epoch": 0.6418758454832406, + "flos": 22608126806400.0, + "grad_norm": 1.6761966103099513, + "language_loss": 0.66968966, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.7464695, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10662842, + "step": 10676, + "time_per_iteration": 2.504427909851074 + }, + { + "auxiliary_loss_clip": 0.06413673, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06272808, + "balance_loss_mlp": 0.01255078, + "epoch": 0.6419359687359086, + "flos": 27381479149440.0, + "grad_norm": 1.8338510977392408, + "language_loss": 0.75681728, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.83362073, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11590576, + "step": 10677, + "time_per_iteration": 2.5891265869140625 + }, + { + "auxiliary_loss_clip": 0.06311014, + "auxiliary_loss_mlp": 0.01250224, + "balance_loss_clip": 0.06254409, + "balance_loss_mlp": 0.01249042, + "epoch": 0.6419960919885765, + "flos": 67270722566400.0, + "grad_norm": 0.7408362116441561, + "language_loss": 0.60777372, + "learning_rate": 1.200271196442818e-06, + "loss": 0.68338609, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01180267, + "step": 10678, + "time_per_iteration": 3.185296058654785 + }, + { + "auxiliary_loss_clip": 0.06408557, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06272914, + "balance_loss_mlp": 0.01255816, + "epoch": 0.6420562152412446, + "flos": 19908067564800.0, + "grad_norm": 2.4133916332472083, + "language_loss": 0.67507815, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.75182372, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10186768, + "step": 10679, + "time_per_iteration": 2.5243141651153564 + }, + { + "auxiliary_loss_clip": 0.06412959, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.0125395, + "epoch": 0.6421163384939125, + "flos": 24797056942080.0, + "grad_norm": 1.7795780158399093, + "language_loss": 0.73073864, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.8075152, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10680, + "time_per_iteration": 2.5331122875213623 + }, + { + "auxiliary_loss_clip": 0.06414793, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01256014, + "epoch": 0.6421764617465805, + "flos": 25599822583680.0, + "grad_norm": 2.391895628783687, + "language_loss": 0.68047994, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.75729114, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10308838, + "step": 10681, + "time_per_iteration": 2.53722882270813 + }, + { + "auxiliary_loss_clip": 0.06410016, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272537, + "balance_loss_mlp": 0.01253318, + "epoch": 0.6422365849992484, + "flos": 14139556606080.0, + "grad_norm": 1.5905545864535235, + "language_loss": 0.74707049, + "learning_rate": 1.198843556910427e-06, + "loss": 0.82380807, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 10682, + "time_per_iteration": 2.472856283187866 + }, + { + "auxiliary_loss_clip": 0.06400837, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06270464, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6422967082519164, + "flos": 22390688661120.0, + "grad_norm": 1.4486797107477571, + "language_loss": 0.79339921, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.87009233, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09985352, + "step": 10683, + "time_per_iteration": 2.5533552169799805 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06272833, + "balance_loss_mlp": 0.01254607, + "epoch": 0.6423568315045844, + "flos": 14653243261440.0, + "grad_norm": 1.9282526307042827, + "language_loss": 0.67605591, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.75284898, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11334229, + "step": 10684, + "time_per_iteration": 2.482949733734131 + }, + { + "auxiliary_loss_clip": 0.06413907, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.01255909, + "epoch": 0.6424169547572524, + "flos": 26841237949440.0, + "grad_norm": 1.917462680158283, + "language_loss": 0.71542668, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.79223859, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1137085, + "step": 10685, + "time_per_iteration": 3.9797728061676025 + }, + { + "auxiliary_loss_clip": 0.06408305, + "auxiliary_loss_mlp": 0.01271537, + "balance_loss_clip": 0.06272995, + "balance_loss_mlp": 0.01260451, + "epoch": 0.6424770780099204, + "flos": 22713449788800.0, + "grad_norm": 1.7465950797369785, + "language_loss": 0.75233316, + "learning_rate": 1.197416403456935e-06, + "loss": 0.8291316, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11083984, + "step": 10686, + "time_per_iteration": 2.5496456623077393 + }, + { + "auxiliary_loss_clip": 0.06415822, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01258501, + "epoch": 0.6425372012625883, + "flos": 28476049034880.0, + "grad_norm": 2.381729998669287, + "language_loss": 0.68881834, + "learning_rate": 1.197059691144867e-06, + "loss": 0.76567948, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11767578, + "step": 10687, + "time_per_iteration": 2.570040464401245 + }, + { + "auxiliary_loss_clip": 0.06416762, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06275085, + "balance_loss_mlp": 0.01254089, + "epoch": 0.6425973245152563, + "flos": 29359469831040.0, + "grad_norm": 1.9635514388954842, + "language_loss": 0.66698802, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.74380684, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11029053, + "step": 10688, + "time_per_iteration": 4.0477213859558105 + }, + { + "auxiliary_loss_clip": 0.06411983, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01255716, + "epoch": 0.6426574477679242, + "flos": 16435109462400.0, + "grad_norm": 1.9153737313813421, + "language_loss": 0.73537695, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.81216139, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10742188, + "step": 10689, + "time_per_iteration": 2.5043931007385254 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01252758, + "epoch": 0.6427175710205922, + "flos": 21842481323520.0, + "grad_norm": 2.0498755252573932, + "language_loss": 0.72094941, + "learning_rate": 1.195989736948226e-06, + "loss": 0.79763424, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10101318, + "step": 10690, + "time_per_iteration": 2.5244081020355225 + }, + { + "auxiliary_loss_clip": 0.06408664, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01256203, + "epoch": 0.6427776942732601, + "flos": 17792623059840.0, + "grad_norm": 2.705995899316003, + "language_loss": 0.78068197, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.85743421, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1036377, + "step": 10691, + "time_per_iteration": 2.530010461807251 + }, + { + "auxiliary_loss_clip": 0.0641586, + "auxiliary_loss_mlp": 0.0126902, + "balance_loss_clip": 0.06274788, + "balance_loss_mlp": 0.0125822, + "epoch": 0.6428378175259282, + "flos": 15091306007040.0, + "grad_norm": 1.6963645960197293, + "language_loss": 0.74278462, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.81963336, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10797119, + "step": 10692, + "time_per_iteration": 2.4988198280334473 + }, + { + "auxiliary_loss_clip": 0.06415784, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6428979407785961, + "flos": 23848535923200.0, + "grad_norm": 1.7731596560048748, + "language_loss": 0.61612236, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.69293106, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10693, + "time_per_iteration": 2.5508644580841064 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06272541, + "balance_loss_mlp": 0.01252258, + "epoch": 0.6429580640312641, + "flos": 32935151439360.0, + "grad_norm": 1.6308651969538634, + "language_loss": 0.59823889, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.67503107, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11669922, + "step": 10694, + "time_per_iteration": 3.998856544494629 + }, + { + "auxiliary_loss_clip": 0.0641511, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06274424, + "balance_loss_mlp": 0.01255645, + "epoch": 0.643018187283932, + "flos": 21074571780480.0, + "grad_norm": 1.333714526566846, + "language_loss": 0.79901004, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.87582707, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10961914, + "step": 10695, + "time_per_iteration": 2.5433716773986816 + }, + { + "auxiliary_loss_clip": 0.0641124, + "auxiliary_loss_mlp": 0.01265686, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01254904, + "epoch": 0.6430783105366, + "flos": 26731973825280.0, + "grad_norm": 1.5735391795945948, + "language_loss": 0.73628104, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.81305027, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10778809, + "step": 10696, + "time_per_iteration": 2.5438404083251953 + }, + { + "auxiliary_loss_clip": 0.06407514, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01255779, + "epoch": 0.643138433789268, + "flos": 23703744867840.0, + "grad_norm": 1.7384218375133755, + "language_loss": 0.75689638, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.83363152, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10211182, + "step": 10697, + "time_per_iteration": 2.538093090057373 + }, + { + "auxiliary_loss_clip": 0.06406935, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01255066, + "epoch": 0.643198557041936, + "flos": 34210416654720.0, + "grad_norm": 1.3977759922631694, + "language_loss": 0.65892148, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.73563969, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09820557, + "step": 10698, + "time_per_iteration": 2.598088026046753 + }, + { + "auxiliary_loss_clip": 0.06311838, + "auxiliary_loss_mlp": 0.01254343, + "balance_loss_clip": 0.06254914, + "balance_loss_mlp": 0.01253054, + "epoch": 0.643258680294604, + "flos": 67646955398400.0, + "grad_norm": 0.7781801094870626, + "language_loss": 0.63529652, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.71095836, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01290131, + "step": 10699, + "time_per_iteration": 3.115173101425171 + }, + { + "auxiliary_loss_clip": 0.06406387, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06272414, + "balance_loss_mlp": 0.01256397, + "epoch": 0.6433188035472719, + "flos": 25192003962240.0, + "grad_norm": 1.4785466380460042, + "language_loss": 0.69763827, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.77436155, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09545898, + "step": 10700, + "time_per_iteration": 2.5910451412200928 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6433789267999399, + "flos": 24980645237760.0, + "grad_norm": 1.528088543997644, + "language_loss": 0.73932713, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.81612635, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10827637, + "step": 10701, + "time_per_iteration": 2.544930934906006 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01266156, + "balance_loss_clip": 0.06274359, + "balance_loss_mlp": 0.012551, + "epoch": 0.6434390500526078, + "flos": 17571704970240.0, + "grad_norm": 2.0241741030403064, + "language_loss": 0.81973577, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.8965745, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1105957, + "step": 10702, + "time_per_iteration": 2.5270791053771973 + }, + { + "auxiliary_loss_clip": 0.06410103, + "auxiliary_loss_mlp": 0.01270083, + "balance_loss_clip": 0.06273524, + "balance_loss_mlp": 0.01259927, + "epoch": 0.6434991733052758, + "flos": 20848790154240.0, + "grad_norm": 1.961461723280124, + "language_loss": 0.74951881, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.82632065, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1015625, + "step": 10703, + "time_per_iteration": 2.490809917449951 + }, + { + "auxiliary_loss_clip": 0.06306668, + "auxiliary_loss_mlp": 0.01249951, + "balance_loss_clip": 0.06250144, + "balance_loss_mlp": 0.01248577, + "epoch": 0.6435592965579437, + "flos": 66114909745920.0, + "grad_norm": 0.6384717488493646, + "language_loss": 0.54610157, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.62166774, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01377106, + "step": 10704, + "time_per_iteration": 3.160659074783325 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06273799, + "balance_loss_mlp": 0.0125994, + "epoch": 0.6436194198106118, + "flos": 23775595344000.0, + "grad_norm": 1.7759265636720112, + "language_loss": 0.77319264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.85001761, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09814453, + "step": 10705, + "time_per_iteration": 2.543015718460083 + }, + { + "auxiliary_loss_clip": 0.06412525, + "auxiliary_loss_mlp": 0.01267692, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6436795430632797, + "flos": 20236572696960.0, + "grad_norm": 1.551816271189714, + "language_loss": 0.79286802, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.86967015, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10748291, + "step": 10706, + "time_per_iteration": 2.571018934249878 + }, + { + "auxiliary_loss_clip": 0.06408278, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06270924, + "balance_loss_mlp": 0.01253807, + "epoch": 0.6437396663159477, + "flos": 20307878121600.0, + "grad_norm": 1.8116162091626624, + "language_loss": 0.80532277, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.8820464, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10284424, + "step": 10707, + "time_per_iteration": 2.49252986907959 + }, + { + "auxiliary_loss_clip": 0.06408471, + "auxiliary_loss_mlp": 0.01266248, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01256151, + "epoch": 0.6437997895686156, + "flos": 23885404519680.0, + "grad_norm": 1.5335483275855415, + "language_loss": 0.85439938, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.93114662, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10101318, + "step": 10708, + "time_per_iteration": 2.554351806640625 + }, + { + "auxiliary_loss_clip": 0.0642588, + "auxiliary_loss_mlp": 0.01267773, + "balance_loss_clip": 0.06278181, + "balance_loss_mlp": 0.0125649, + "epoch": 0.6438599128212836, + "flos": 18995241185280.0, + "grad_norm": 2.1632531373454507, + "language_loss": 0.66272986, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.73966646, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11279297, + "step": 10709, + "time_per_iteration": 2.4882705211639404 + }, + { + "auxiliary_loss_clip": 0.06406571, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06270951, + "balance_loss_mlp": 0.01254793, + "epoch": 0.6439200360739517, + "flos": 24103010373120.0, + "grad_norm": 1.6506823259196688, + "language_loss": 0.80511576, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.88182747, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09802246, + "step": 10710, + "time_per_iteration": 2.56453537940979 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6439801593266196, + "flos": 31909748699520.0, + "grad_norm": 1.6423775297739596, + "language_loss": 0.66664886, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.74339652, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1015625, + "step": 10711, + "time_per_iteration": 2.5858142375946045 + }, + { + "auxiliary_loss_clip": 0.06416127, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06276122, + "balance_loss_mlp": 0.01257155, + "epoch": 0.6440402825792876, + "flos": 27133251828480.0, + "grad_norm": 1.4850866798945335, + "language_loss": 0.78739464, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.86423248, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10498047, + "step": 10712, + "time_per_iteration": 2.5875256061553955 + }, + { + "auxiliary_loss_clip": 0.06415762, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.01258264, + "epoch": 0.6441004058319555, + "flos": 20673964609920.0, + "grad_norm": 4.153275753738836, + "language_loss": 0.82697159, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.90381777, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.105896, + "step": 10713, + "time_per_iteration": 3.9446072578430176 + }, + { + "auxiliary_loss_clip": 0.06405178, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125691, + "epoch": 0.6441605290846235, + "flos": 26032309032960.0, + "grad_norm": 1.3361931407869754, + "language_loss": 0.78574234, + "learning_rate": 1.187440012188684e-06, + "loss": 0.86247128, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10809326, + "step": 10714, + "time_per_iteration": 2.530367612838745 + }, + { + "auxiliary_loss_clip": 0.06407861, + "auxiliary_loss_mlp": 0.01264356, + "balance_loss_clip": 0.0627133, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6442206523372914, + "flos": 24906362993280.0, + "grad_norm": 1.4535353305453917, + "language_loss": 0.81736881, + "learning_rate": 1.187084157517583e-06, + "loss": 0.89409101, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09619141, + "step": 10715, + "time_per_iteration": 2.563981294631958 + }, + { + "auxiliary_loss_clip": 0.06417291, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06276529, + "balance_loss_mlp": 0.01255812, + "epoch": 0.6442807755899594, + "flos": 25163478846720.0, + "grad_norm": 2.5611767206234335, + "language_loss": 0.81585336, + "learning_rate": 1.186728333672332e-06, + "loss": 0.89268947, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10498047, + "step": 10716, + "time_per_iteration": 2.54089617729187 + }, + { + "auxiliary_loss_clip": 0.06414896, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254931, + "epoch": 0.6443408988426274, + "flos": 27351863930880.0, + "grad_norm": 1.9349198900461007, + "language_loss": 0.783328, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8601352, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10894775, + "step": 10717, + "time_per_iteration": 2.726794719696045 + }, + { + "auxiliary_loss_clip": 0.06407352, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.06274462, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6444010220952954, + "flos": 27935807834880.0, + "grad_norm": 1.5112707746860563, + "language_loss": 0.68381333, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.76055682, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10058594, + "step": 10718, + "time_per_iteration": 2.610858201980591 + }, + { + "auxiliary_loss_clip": 0.0630646, + "auxiliary_loss_mlp": 0.01253706, + "balance_loss_clip": 0.06250188, + "balance_loss_mlp": 0.01252236, + "epoch": 0.6444611453479633, + "flos": 71232169173120.0, + "grad_norm": 0.7437918033374209, + "language_loss": 0.49586019, + "learning_rate": 1.185661047226603e-06, + "loss": 0.5714618, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01467896, + "step": 10719, + "time_per_iteration": 3.303040027618408 + }, + { + "auxiliary_loss_clip": 0.06416054, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01253598, + "epoch": 0.6445212686006313, + "flos": 22710766458240.0, + "grad_norm": 1.8616807218185105, + "language_loss": 0.77902591, + "learning_rate": 1.18530534681967e-06, + "loss": 0.8558346, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10720, + "time_per_iteration": 2.4988739490509033 + }, + { + "auxiliary_loss_clip": 0.06409489, + "auxiliary_loss_mlp": 0.01265868, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01255556, + "epoch": 0.6445813918532992, + "flos": 21185219496960.0, + "grad_norm": 1.7169707268636247, + "language_loss": 0.77512503, + "learning_rate": 1.18494967730604e-06, + "loss": 0.85187852, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10314941, + "step": 10721, + "time_per_iteration": 2.5300545692443848 + }, + { + "auxiliary_loss_clip": 0.06412297, + "auxiliary_loss_mlp": 0.01265332, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6446415151059672, + "flos": 25198921923840.0, + "grad_norm": 2.0971313720175253, + "language_loss": 0.72901034, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.80578673, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.1060791, + "step": 10722, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.06411985, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01257149, + "epoch": 0.6447016383586353, + "flos": 25309401932160.0, + "grad_norm": 1.4844277887266815, + "language_loss": 0.78381926, + "learning_rate": 1.184238431012635e-06, + "loss": 0.86060935, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09881592, + "step": 10723, + "time_per_iteration": 2.550785541534424 + }, + { + "auxiliary_loss_clip": 0.06412604, + "auxiliary_loss_mlp": 0.01264685, + "balance_loss_clip": 0.06270273, + "balance_loss_mlp": 0.01253825, + "epoch": 0.6447617616113032, + "flos": 27709523084160.0, + "grad_norm": 1.5774078355025598, + "language_loss": 0.58958089, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.66635382, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10858154, + "step": 10724, + "time_per_iteration": 2.54042387008667 + }, + { + "auxiliary_loss_clip": 0.06404805, + "auxiliary_loss_mlp": 0.012629, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125294, + "epoch": 0.6448218848639712, + "flos": 23045728354560.0, + "grad_norm": 1.8379385823931873, + "language_loss": 0.83613712, + "learning_rate": 1.183527308454271e-06, + "loss": 0.91281414, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 10725, + "time_per_iteration": 3.910567045211792 + }, + { + "auxiliary_loss_clip": 0.06409329, + "auxiliary_loss_mlp": 0.0126531, + "balance_loss_clip": 0.06272514, + "balance_loss_mlp": 0.01255123, + "epoch": 0.6448820081166391, + "flos": 24502569367680.0, + "grad_norm": 1.6966621719955104, + "language_loss": 0.82546258, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.90220898, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10186768, + "step": 10726, + "time_per_iteration": 2.5510244369506836 + }, + { + "auxiliary_loss_clip": 0.06413421, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06271534, + "balance_loss_mlp": 0.01254757, + "epoch": 0.6449421313693071, + "flos": 22425880176000.0, + "grad_norm": 1.8351379370292278, + "language_loss": 0.82230431, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.8990922, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10601807, + "step": 10727, + "time_per_iteration": 4.002009153366089 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01267298, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255908, + "epoch": 0.645002254621975, + "flos": 20231206035840.0, + "grad_norm": 1.8310574877771004, + "language_loss": 0.79621851, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.87308395, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1138916, + "step": 10728, + "time_per_iteration": 2.500166416168213 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253767, + "epoch": 0.645062377874643, + "flos": 27862909182720.0, + "grad_norm": 1.7840301112259453, + "language_loss": 0.7434454, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.82021105, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11578369, + "step": 10729, + "time_per_iteration": 2.5444576740264893 + }, + { + "auxiliary_loss_clip": 0.06416906, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06276138, + "balance_loss_mlp": 0.01258971, + "epoch": 0.645122501127311, + "flos": 25308563391360.0, + "grad_norm": 1.804382369686425, + "language_loss": 0.66694868, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.74381399, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10662842, + "step": 10730, + "time_per_iteration": 2.557570695877075 + }, + { + "auxiliary_loss_clip": 0.06414691, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.0627515, + "balance_loss_mlp": 0.01257823, + "epoch": 0.645182624379979, + "flos": 18813371898240.0, + "grad_norm": 1.7610800842195338, + "language_loss": 0.64359826, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.72043514, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11181641, + "step": 10731, + "time_per_iteration": 2.496885299682617 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01253507, + "epoch": 0.6452427476326469, + "flos": 18337979358720.0, + "grad_norm": 1.6539865973631505, + "language_loss": 0.68541694, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.76214296, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10620117, + "step": 10732, + "time_per_iteration": 2.5379278659820557 + }, + { + "auxiliary_loss_clip": 0.06405264, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01257392, + "epoch": 0.6453028708853149, + "flos": 22791505466880.0, + "grad_norm": 1.6003799317808598, + "language_loss": 0.75854611, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.83527917, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10638428, + "step": 10733, + "time_per_iteration": 2.5387895107269287 + }, + { + "auxiliary_loss_clip": 0.06419903, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6453629941379828, + "flos": 23951888334720.0, + "grad_norm": 1.8221527595961244, + "language_loss": 0.6735214, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.75041103, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1159668, + "step": 10734, + "time_per_iteration": 3.968029260635376 + }, + { + "auxiliary_loss_clip": 0.06404681, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01252694, + "epoch": 0.6454231173906508, + "flos": 17682226905600.0, + "grad_norm": 2.0600495273099377, + "language_loss": 0.7393254, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.81600797, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.10882568, + "step": 10735, + "time_per_iteration": 2.5028645992279053 + }, + { + "auxiliary_loss_clip": 0.06413495, + "auxiliary_loss_mlp": 0.01265876, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01255791, + "epoch": 0.6454832406433189, + "flos": 23299154628480.0, + "grad_norm": 1.713856204545893, + "language_loss": 0.75178444, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.82857811, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10083008, + "step": 10736, + "time_per_iteration": 2.52396821975708 + }, + { + "auxiliary_loss_clip": 0.06414569, + "auxiliary_loss_mlp": 0.01265141, + "balance_loss_clip": 0.06272043, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6455433638959868, + "flos": 20163422482560.0, + "grad_norm": 1.900325282027751, + "language_loss": 0.70704216, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.78383923, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1104126, + "step": 10737, + "time_per_iteration": 2.533444404602051 + }, + { + "auxiliary_loss_clip": 0.06321093, + "auxiliary_loss_mlp": 0.01260403, + "balance_loss_clip": 0.06264752, + "balance_loss_mlp": 0.01258907, + "epoch": 0.6456034871486548, + "flos": 66553391761920.0, + "grad_norm": 0.7654525046837665, + "language_loss": 0.58448923, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.66030419, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01495361, + "step": 10738, + "time_per_iteration": 3.180669069290161 + }, + { + "auxiliary_loss_clip": 0.06409475, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06273897, + "balance_loss_mlp": 0.0125478, + "epoch": 0.6456636104013227, + "flos": 24212819548800.0, + "grad_norm": 2.1666946936849434, + "language_loss": 0.74776822, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.82451332, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1026001, + "step": 10739, + "time_per_iteration": 2.556649923324585 + }, + { + "auxiliary_loss_clip": 0.06415305, + "auxiliary_loss_mlp": 0.01264707, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01254098, + "epoch": 0.6457237336539907, + "flos": 23631013923840.0, + "grad_norm": 1.691973671023819, + "language_loss": 0.71430027, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.79110038, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1060791, + "step": 10740, + "time_per_iteration": 2.5294902324676514 + }, + { + "auxiliary_loss_clip": 0.06311092, + "auxiliary_loss_mlp": 0.0125644, + "balance_loss_clip": 0.06254861, + "balance_loss_mlp": 0.01255001, + "epoch": 0.6457838569066586, + "flos": 65867437111680.0, + "grad_norm": 1.1432056527915397, + "language_loss": 0.55345345, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.62912881, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01437378, + "step": 10741, + "time_per_iteration": 3.1684045791625977 + }, + { + "auxiliary_loss_clip": 0.06412791, + "auxiliary_loss_mlp": 0.01265658, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255149, + "epoch": 0.6458439801593266, + "flos": 22388424600960.0, + "grad_norm": 1.6129388785112204, + "language_loss": 0.80396634, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.88075083, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1050415, + "step": 10742, + "time_per_iteration": 2.5326621532440186 + }, + { + "auxiliary_loss_clip": 0.06404757, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06273461, + "balance_loss_mlp": 0.01254643, + "epoch": 0.6459041034119946, + "flos": 24795966839040.0, + "grad_norm": 1.5649270887964326, + "language_loss": 0.81750703, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.89420575, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10467529, + "step": 10743, + "time_per_iteration": 2.525972366333008 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06271668, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6459642266646626, + "flos": 18330013221120.0, + "grad_norm": 1.6048937891157424, + "language_loss": 0.71681064, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.79352456, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 10744, + "time_per_iteration": 2.571387767791748 + }, + { + "auxiliary_loss_clip": 0.06408056, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01257753, + "epoch": 0.6460243499173305, + "flos": 43591561672320.0, + "grad_norm": 1.9454844326150766, + "language_loss": 0.67213976, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.74889499, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0970459, + "step": 10745, + "time_per_iteration": 2.6937074661254883 + }, + { + "auxiliary_loss_clip": 0.06406983, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.0627151, + "balance_loss_mlp": 0.01257711, + "epoch": 0.6460844731699985, + "flos": 19249925270400.0, + "grad_norm": 2.096395113743082, + "language_loss": 0.74313092, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.81988549, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10778809, + "step": 10746, + "time_per_iteration": 2.5105156898498535 + }, + { + "auxiliary_loss_clip": 0.06413017, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06273216, + "balance_loss_mlp": 0.01254649, + "epoch": 0.6461445964226664, + "flos": 27460624930560.0, + "grad_norm": 1.4939234449131917, + "language_loss": 0.67274344, + "learning_rate": 1.175713157660413e-06, + "loss": 0.74952662, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10644531, + "step": 10747, + "time_per_iteration": 2.5424420833587646 + }, + { + "auxiliary_loss_clip": 0.0641461, + "auxiliary_loss_mlp": 0.01265405, + "balance_loss_clip": 0.0627532, + "balance_loss_mlp": 0.01255272, + "epoch": 0.6462047196753344, + "flos": 20300457035520.0, + "grad_norm": 1.6454594650819265, + "language_loss": 0.67613244, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.75293255, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10137939, + "step": 10748, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01254764, + "epoch": 0.6462648429280025, + "flos": 22024937589120.0, + "grad_norm": 1.9564061615945416, + "language_loss": 0.76055253, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.83735275, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11517334, + "step": 10749, + "time_per_iteration": 2.5083682537078857 + }, + { + "auxiliary_loss_clip": 0.06411772, + "auxiliary_loss_mlp": 0.01264574, + "balance_loss_clip": 0.0627101, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6463249661806704, + "flos": 27788375376000.0, + "grad_norm": 1.4570564957131642, + "language_loss": 0.77334827, + "learning_rate": 1.17464876058473e-06, + "loss": 0.85011172, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10369873, + "step": 10750, + "time_per_iteration": 2.5812573432922363 + }, + { + "auxiliary_loss_clip": 0.06417309, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258268, + "epoch": 0.6463850894333384, + "flos": 22056481451520.0, + "grad_norm": 2.0670822566581437, + "language_loss": 0.6898241, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.76668882, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10900879, + "step": 10751, + "time_per_iteration": 2.4936625957489014 + }, + { + "auxiliary_loss_clip": 0.06414577, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.0125448, + "epoch": 0.6464452126860063, + "flos": 21112698188160.0, + "grad_norm": 1.7780067956451429, + "language_loss": 0.71182156, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.78861868, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10662842, + "step": 10752, + "time_per_iteration": 3.927877426147461 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01253661, + "epoch": 0.6465053359386743, + "flos": 16032531720960.0, + "grad_norm": 1.540910380020274, + "language_loss": 0.77855444, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.85537261, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.12823486, + "step": 10753, + "time_per_iteration": 2.4648597240448 + }, + { + "auxiliary_loss_clip": 0.06412196, + "auxiliary_loss_mlp": 0.01268464, + "balance_loss_clip": 0.0627618, + "balance_loss_mlp": 0.01256871, + "epoch": 0.6465654591913422, + "flos": 23404477610880.0, + "grad_norm": 1.596791967646976, + "language_loss": 0.85541224, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.93221891, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11584473, + "step": 10754, + "time_per_iteration": 2.5978291034698486 + }, + { + "auxiliary_loss_clip": 0.06414384, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01256374, + "epoch": 0.6466255824440102, + "flos": 15382649053440.0, + "grad_norm": 2.138696261718271, + "language_loss": 0.6015234, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.67834014, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10919189, + "step": 10755, + "time_per_iteration": 2.5456504821777344 + }, + { + "auxiliary_loss_clip": 0.06412394, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6466857056966782, + "flos": 16258355274240.0, + "grad_norm": 2.6815820423410845, + "language_loss": 0.68557096, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.76238149, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11633301, + "step": 10756, + "time_per_iteration": 2.4882616996765137 + }, + { + "auxiliary_loss_clip": 0.06423604, + "auxiliary_loss_mlp": 0.01266345, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01255199, + "epoch": 0.6467458289493462, + "flos": 21184548664320.0, + "grad_norm": 2.427580887606393, + "language_loss": 0.74556214, + "learning_rate": 1.172166263444844e-06, + "loss": 0.82246166, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1114502, + "step": 10757, + "time_per_iteration": 2.5800364017486572 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.01268605, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01257357, + "epoch": 0.6468059522020141, + "flos": 17974198857600.0, + "grad_norm": 1.6114695233803533, + "language_loss": 0.74794757, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.82467604, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.11248779, + "step": 10758, + "time_per_iteration": 2.537113666534424 + }, + { + "auxiliary_loss_clip": 0.06411805, + "auxiliary_loss_mlp": 0.0127172, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01259715, + "epoch": 0.6468660754546821, + "flos": 17895178857600.0, + "grad_norm": 1.7921091077439633, + "language_loss": 0.6853838, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.76221907, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11999512, + "step": 10759, + "time_per_iteration": 2.5501279830932617 + }, + { + "auxiliary_loss_clip": 0.06419058, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.0627493, + "balance_loss_mlp": 0.01257666, + "epoch": 0.64692619870735, + "flos": 22607497900800.0, + "grad_norm": 1.5782597023408493, + "language_loss": 0.75492609, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.831806, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11273193, + "step": 10760, + "time_per_iteration": 2.5426504611968994 + }, + { + "auxiliary_loss_clip": 0.06408913, + "auxiliary_loss_mlp": 0.0126904, + "balance_loss_clip": 0.06273125, + "balance_loss_mlp": 0.01258621, + "epoch": 0.646986321960018, + "flos": 49611863750400.0, + "grad_norm": 1.5088139829750542, + "language_loss": 0.65700191, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 10761, + "time_per_iteration": 2.8235716819763184 + }, + { + "auxiliary_loss_clip": 0.0641157, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06273905, + "balance_loss_mlp": 0.0125886, + "epoch": 0.6470464452126861, + "flos": 21914960705280.0, + "grad_norm": 4.087602702214583, + "language_loss": 0.70041698, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.77723515, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11376953, + "step": 10762, + "time_per_iteration": 2.4962708950042725 + }, + { + "auxiliary_loss_clip": 0.06415009, + "auxiliary_loss_mlp": 0.01270412, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01259039, + "epoch": 0.647106568465354, + "flos": 18110688359040.0, + "grad_norm": 2.044366921559264, + "language_loss": 0.82845706, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.90531123, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11364746, + "step": 10763, + "time_per_iteration": 2.5127148628234863 + }, + { + "auxiliary_loss_clip": 0.06315573, + "auxiliary_loss_mlp": 0.01250562, + "balance_loss_clip": 0.06259283, + "balance_loss_mlp": 0.01249394, + "epoch": 0.647166691718022, + "flos": 69499623899520.0, + "grad_norm": 0.6915624783517184, + "language_loss": 0.5774473, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.65310872, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01165009, + "step": 10764, + "time_per_iteration": 4.764317035675049 + }, + { + "auxiliary_loss_clip": 0.06411065, + "auxiliary_loss_mlp": 0.01264999, + "balance_loss_clip": 0.06273772, + "balance_loss_mlp": 0.01254532, + "epoch": 0.6472268149706899, + "flos": 34103793934080.0, + "grad_norm": 1.637421021891431, + "language_loss": 0.60742128, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.68418187, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10467529, + "step": 10765, + "time_per_iteration": 2.6306469440460205 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258924, + "epoch": 0.6472869382233579, + "flos": 28118809152000.0, + "grad_norm": 2.0826927975642273, + "language_loss": 0.63338971, + "learning_rate": 1.168976742243437e-06, + "loss": 0.71016824, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1027832, + "step": 10766, + "time_per_iteration": 2.608025074005127 + }, + { + "auxiliary_loss_clip": 0.06411771, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06273695, + "balance_loss_mlp": 0.01257616, + "epoch": 0.6473470614760258, + "flos": 22498736901120.0, + "grad_norm": 1.6916160768027213, + "language_loss": 0.75775635, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.83455759, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10736084, + "step": 10767, + "time_per_iteration": 3.9129326343536377 + }, + { + "auxiliary_loss_clip": 0.06410106, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01255613, + "epoch": 0.6474071847286939, + "flos": 14544314553600.0, + "grad_norm": 1.8076972632130168, + "language_loss": 0.77841228, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.85518134, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11187744, + "step": 10768, + "time_per_iteration": 2.5130937099456787 + }, + { + "auxiliary_loss_clip": 0.06411847, + "auxiliary_loss_mlp": 0.01266069, + "balance_loss_clip": 0.06274557, + "balance_loss_mlp": 0.01255894, + "epoch": 0.6474673079813618, + "flos": 24105190579200.0, + "grad_norm": 1.6392494709530092, + "language_loss": 0.71794009, + "learning_rate": 1.167914135250663e-06, + "loss": 0.79471928, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10162354, + "step": 10769, + "time_per_iteration": 2.5274879932403564 + }, + { + "auxiliary_loss_clip": 0.06409761, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06276036, + "balance_loss_mlp": 0.01256985, + "epoch": 0.6475274312340298, + "flos": 14981538758400.0, + "grad_norm": 1.8331179769777781, + "language_loss": 0.73102438, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.80779225, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10040283, + "step": 10770, + "time_per_iteration": 2.4902164936065674 + }, + { + "auxiliary_loss_clip": 0.0641522, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254357, + "epoch": 0.6475875544866977, + "flos": 25052202224640.0, + "grad_norm": 1.6464816515513445, + "language_loss": 0.73554993, + "learning_rate": 1.167205888330325e-06, + "loss": 0.81237221, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12646484, + "step": 10771, + "time_per_iteration": 2.5617709159851074 + }, + { + "auxiliary_loss_clip": 0.06412145, + "auxiliary_loss_mlp": 0.0126638, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01255324, + "epoch": 0.6476476777393657, + "flos": 16477763990400.0, + "grad_norm": 2.394956758167514, + "language_loss": 0.74415565, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.82094085, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1105957, + "step": 10772, + "time_per_iteration": 2.54032826423645 + }, + { + "auxiliary_loss_clip": 0.06408937, + "auxiliary_loss_mlp": 0.01268327, + "balance_loss_clip": 0.06275553, + "balance_loss_mlp": 0.01258987, + "epoch": 0.6477078009920336, + "flos": 25819399008000.0, + "grad_norm": 1.4893197324025274, + "language_loss": 0.82968116, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.90645373, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09338379, + "step": 10773, + "time_per_iteration": 3.9616613388061523 + }, + { + "auxiliary_loss_clip": 0.06405786, + "auxiliary_loss_mlp": 0.01267593, + "balance_loss_clip": 0.06272345, + "balance_loss_mlp": 0.01257776, + "epoch": 0.6477679242447016, + "flos": 17681933416320.0, + "grad_norm": 1.4328505723610274, + "language_loss": 0.78670597, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.8634398, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0982666, + "step": 10774, + "time_per_iteration": 2.471349000930786 + }, + { + "auxiliary_loss_clip": 0.06414998, + "auxiliary_loss_mlp": 0.01270742, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01259406, + "epoch": 0.6478280474973696, + "flos": 21038583651840.0, + "grad_norm": 2.0152385899029763, + "language_loss": 0.69592845, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.7727859, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11346436, + "step": 10775, + "time_per_iteration": 2.518340826034546 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256792, + "epoch": 0.6478881707500376, + "flos": 21623449950720.0, + "grad_norm": 1.6656343992417288, + "language_loss": 0.65808022, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.73492104, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10705566, + "step": 10776, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06413212, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06273791, + "balance_loss_mlp": 0.01257343, + "epoch": 0.6479482940027056, + "flos": 18448543221120.0, + "grad_norm": 2.2928682482209015, + "language_loss": 0.79598206, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.87279832, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11096191, + "step": 10777, + "time_per_iteration": 2.554004669189453 + }, + { + "auxiliary_loss_clip": 0.06412454, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06275424, + "balance_loss_mlp": 0.01256311, + "epoch": 0.6480084172553735, + "flos": 22170651039360.0, + "grad_norm": 1.8955877147463427, + "language_loss": 0.74017107, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.81696445, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10571289, + "step": 10778, + "time_per_iteration": 2.5087220668792725 + }, + { + "auxiliary_loss_clip": 0.06407086, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06272884, + "balance_loss_mlp": 0.01253694, + "epoch": 0.6480685405080415, + "flos": 24323089921920.0, + "grad_norm": 1.3775726820823926, + "language_loss": 0.78463447, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.86134601, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10388184, + "step": 10779, + "time_per_iteration": 2.5677905082702637 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01254794, + "balance_loss_clip": 0.06262461, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6481286637607094, + "flos": 59910348539520.0, + "grad_norm": 0.7063734620210058, + "language_loss": 0.59437895, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.67011881, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01268005, + "step": 10780, + "time_per_iteration": 3.11826229095459 + }, + { + "auxiliary_loss_clip": 0.06409959, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6481887870133775, + "flos": 25491313146240.0, + "grad_norm": 1.83776143864241, + "language_loss": 0.79705411, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.87380326, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 10781, + "time_per_iteration": 2.5406956672668457 + }, + { + "auxiliary_loss_clip": 0.06418487, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06278095, + "balance_loss_mlp": 0.01258636, + "epoch": 0.6482489102660454, + "flos": 19935041379840.0, + "grad_norm": 2.151495176949557, + "language_loss": 0.78676552, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.86365616, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11938477, + "step": 10782, + "time_per_iteration": 2.5015201568603516 + }, + { + "auxiliary_loss_clip": 0.06412151, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.06272621, + "balance_loss_mlp": 0.01257525, + "epoch": 0.6483090335187134, + "flos": 26986741764480.0, + "grad_norm": 3.0083350466584378, + "language_loss": 0.64055502, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.71735811, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10632324, + "step": 10783, + "time_per_iteration": 2.536803960800171 + }, + { + "auxiliary_loss_clip": 0.06416991, + "auxiliary_loss_mlp": 0.01269846, + "balance_loss_clip": 0.06275127, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6483691567713813, + "flos": 25084207284480.0, + "grad_norm": 1.8907849838824615, + "language_loss": 0.89016545, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.96703386, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11431885, + "step": 10784, + "time_per_iteration": 2.5452053546905518 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01254636, + "epoch": 0.6484292800240493, + "flos": 16111300158720.0, + "grad_norm": 2.486751490302504, + "language_loss": 0.73449266, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.81126773, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.11798096, + "step": 10785, + "time_per_iteration": 2.4847772121429443 + }, + { + "auxiliary_loss_clip": 0.06405519, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6484894032767172, + "flos": 28848005308800.0, + "grad_norm": 1.4322253483725718, + "language_loss": 0.69456708, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.77128685, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0994873, + "step": 10786, + "time_per_iteration": 2.585789918899536 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01259122, + "epoch": 0.6485495265293852, + "flos": 30234924489600.0, + "grad_norm": 2.0420211875900285, + "language_loss": 0.71877193, + "learning_rate": 1.161544469455041e-06, + "loss": 0.79556048, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10760498, + "step": 10787, + "time_per_iteration": 2.566206216812134 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01266479, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6486096497820532, + "flos": 20088050135040.0, + "grad_norm": 1.7621323533283269, + "language_loss": 0.84403133, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.92081404, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10882568, + "step": 10788, + "time_per_iteration": 2.482072353363037 + }, + { + "auxiliary_loss_clip": 0.06410778, + "auxiliary_loss_mlp": 0.01268935, + "balance_loss_clip": 0.06273876, + "balance_loss_mlp": 0.01258111, + "epoch": 0.6486697730347212, + "flos": 17134816181760.0, + "grad_norm": 2.2095301330311643, + "language_loss": 0.77364171, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.85043883, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10821533, + "step": 10789, + "time_per_iteration": 2.5368380546569824 + }, + { + "auxiliary_loss_clip": 0.06408279, + "auxiliary_loss_mlp": 0.01268929, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01258606, + "epoch": 0.6487298962873892, + "flos": 38921477886720.0, + "grad_norm": 1.570352466870208, + "language_loss": 0.76618487, + "learning_rate": 1.160483857897479e-06, + "loss": 0.8429569, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10327148, + "step": 10790, + "time_per_iteration": 2.6590943336486816 + }, + { + "auxiliary_loss_clip": 0.06408708, + "auxiliary_loss_mlp": 0.01266087, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256169, + "epoch": 0.6487900195400571, + "flos": 11952680895360.0, + "grad_norm": 2.134716405653686, + "language_loss": 0.59979677, + "learning_rate": 1.160130384362823e-06, + "loss": 0.67654467, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09924316, + "step": 10791, + "time_per_iteration": 3.963503360748291 + }, + { + "auxiliary_loss_clip": 0.06410848, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258646, + "epoch": 0.6488501427927251, + "flos": 22350717463680.0, + "grad_norm": 1.5491724826349689, + "language_loss": 0.8594861, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.93628347, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10253906, + "step": 10792, + "time_per_iteration": 2.555723190307617 + }, + { + "auxiliary_loss_clip": 0.06414551, + "auxiliary_loss_mlp": 0.01268197, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256419, + "epoch": 0.648910266045393, + "flos": 22242753077760.0, + "grad_norm": 1.7314529044761888, + "language_loss": 0.78069973, + "learning_rate": 1.159423532850735e-06, + "loss": 0.85752726, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11773682, + "step": 10793, + "time_per_iteration": 2.5019938945770264 + }, + { + "auxiliary_loss_clip": 0.06413871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01257367, + "epoch": 0.6489703892980611, + "flos": 25308269902080.0, + "grad_norm": 1.950729669882986, + "language_loss": 0.74567354, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.82249475, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10882568, + "step": 10794, + "time_per_iteration": 2.5795669555664062 + }, + { + "auxiliary_loss_clip": 0.06410497, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01254655, + "epoch": 0.649030512550729, + "flos": 24578864110080.0, + "grad_norm": 1.8148879038848986, + "language_loss": 0.699453, + "learning_rate": 1.158716808837621e-06, + "loss": 0.77621716, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11273193, + "step": 10795, + "time_per_iteration": 2.538400173187256 + }, + { + "auxiliary_loss_clip": 0.06416844, + "auxiliary_loss_mlp": 0.01273855, + "balance_loss_clip": 0.06276066, + "balance_loss_mlp": 0.01261964, + "epoch": 0.649090635803397, + "flos": 26251004989440.0, + "grad_norm": 1.9678382508243188, + "language_loss": 0.54238826, + "learning_rate": 1.158363494676679e-06, + "loss": 0.61929524, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11895752, + "step": 10796, + "time_per_iteration": 2.6402297019958496 + }, + { + "auxiliary_loss_clip": 0.06412029, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01254767, + "epoch": 0.6491507590560649, + "flos": 24944489400960.0, + "grad_norm": 1.676360773921332, + "language_loss": 0.77936971, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.85614228, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10467529, + "step": 10797, + "time_per_iteration": 2.5467689037323 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01255935, + "epoch": 0.6492108823087329, + "flos": 19505783312640.0, + "grad_norm": 3.2369805565604053, + "language_loss": 0.7037648, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.78047633, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09729004, + "step": 10798, + "time_per_iteration": 2.5187807083129883 + }, + { + "auxiliary_loss_clip": 0.06409095, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06272874, + "balance_loss_mlp": 0.01256493, + "epoch": 0.6492710055614008, + "flos": 19725443591040.0, + "grad_norm": 1.928025975497767, + "language_loss": 0.77484357, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.85159886, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09942627, + "step": 10799, + "time_per_iteration": 2.4996323585510254 + }, + { + "auxiliary_loss_clip": 0.06416353, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274813, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6493311288140688, + "flos": 24324012316800.0, + "grad_norm": 1.6859277521525557, + "language_loss": 0.72046328, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.79731631, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11480713, + "step": 10800, + "time_per_iteration": 2.5757715702056885 + }, + { + "auxiliary_loss_clip": 0.06306565, + "auxiliary_loss_mlp": 0.01256479, + "balance_loss_clip": 0.06250083, + "balance_loss_mlp": 0.01255134, + "epoch": 0.6493912520667368, + "flos": 70953655800960.0, + "grad_norm": 0.743676703722325, + "language_loss": 0.60158885, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.67721927, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01346588, + "step": 10801, + "time_per_iteration": 3.246039867401123 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01272232, + "balance_loss_clip": 0.06277107, + "balance_loss_mlp": 0.01260782, + "epoch": 0.6494513753194048, + "flos": 25344803082240.0, + "grad_norm": 1.7594241437691729, + "language_loss": 0.78884411, + "learning_rate": 1.156244280393614e-06, + "loss": 0.86572272, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11456299, + "step": 10802, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06407687, + "auxiliary_loss_mlp": 0.01265006, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01254385, + "epoch": 0.6495114985720728, + "flos": 24689050629120.0, + "grad_norm": 1.4701116877862836, + "language_loss": 0.7461825, + "learning_rate": 1.155891189918541e-06, + "loss": 0.82290947, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10620117, + "step": 10803, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.06410737, + "auxiliary_loss_mlp": 0.01268913, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6495716218247407, + "flos": 23656520292480.0, + "grad_norm": 2.024891036997784, + "language_loss": 0.6987229, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.77551937, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10803223, + "step": 10804, + "time_per_iteration": 3.998316526412964 + }, + { + "auxiliary_loss_clip": 0.06410199, + "auxiliary_loss_mlp": 0.01264742, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.01254019, + "epoch": 0.6496317450774087, + "flos": 22352729961600.0, + "grad_norm": 1.61833096357978, + "language_loss": 0.72940427, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.80615366, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1072998, + "step": 10805, + "time_per_iteration": 2.550152540206909 + }, + { + "auxiliary_loss_clip": 0.06408597, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06270424, + "balance_loss_mlp": 0.01258384, + "epoch": 0.6496918683300766, + "flos": 30526519098240.0, + "grad_norm": 1.9854028073217467, + "language_loss": 0.66420656, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.74097693, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 10806, + "time_per_iteration": 4.017642021179199 + }, + { + "auxiliary_loss_clip": 0.06412096, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06272469, + "balance_loss_mlp": 0.01259587, + "epoch": 0.6497519915827447, + "flos": 12463977709440.0, + "grad_norm": 2.120421469188937, + "language_loss": 0.79874885, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.87557387, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10821533, + "step": 10807, + "time_per_iteration": 2.47318959236145 + }, + { + "auxiliary_loss_clip": 0.06308749, + "auxiliary_loss_mlp": 0.01254009, + "balance_loss_clip": 0.06252696, + "balance_loss_mlp": 0.01252862, + "epoch": 0.6498121148354126, + "flos": 69115787544960.0, + "grad_norm": 0.7752767775633225, + "language_loss": 0.5892998, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.66492736, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01145935, + "step": 10808, + "time_per_iteration": 3.316317319869995 + }, + { + "auxiliary_loss_clip": 0.06407646, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01257043, + "epoch": 0.6498722380880806, + "flos": 36904983454080.0, + "grad_norm": 1.693655644054658, + "language_loss": 0.63518184, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.71192998, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10131836, + "step": 10809, + "time_per_iteration": 2.6661953926086426 + }, + { + "auxiliary_loss_clip": 0.06407648, + "auxiliary_loss_mlp": 0.01268298, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6499323613407485, + "flos": 29024549861760.0, + "grad_norm": 1.455455865849343, + "language_loss": 0.81994486, + "learning_rate": 1.153420453586008e-06, + "loss": 0.89670432, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09796143, + "step": 10810, + "time_per_iteration": 2.582893133163452 + }, + { + "auxiliary_loss_clip": 0.06403928, + "auxiliary_loss_mlp": 0.01273294, + "balance_loss_clip": 0.06272624, + "balance_loss_mlp": 0.01263382, + "epoch": 0.6499924845934165, + "flos": 20125212220800.0, + "grad_norm": 1.5531414073118446, + "language_loss": 0.71929145, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.79606366, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 10811, + "time_per_iteration": 2.5130205154418945 + }, + { + "auxiliary_loss_clip": 0.06403043, + "auxiliary_loss_mlp": 0.01269239, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01259273, + "epoch": 0.6500526078460844, + "flos": 24427490509440.0, + "grad_norm": 1.5864651817553501, + "language_loss": 0.78127778, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.85800058, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09967041, + "step": 10812, + "time_per_iteration": 2.5567028522491455 + }, + { + "auxiliary_loss_clip": 0.06411995, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06273413, + "balance_loss_mlp": 0.01258887, + "epoch": 0.6501127310987524, + "flos": 23337700306560.0, + "grad_norm": 1.8208092909693303, + "language_loss": 0.85530257, + "learning_rate": 1.152362047854413e-06, + "loss": 0.93212128, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10992432, + "step": 10813, + "time_per_iteration": 3.9791102409362793 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01257955, + "epoch": 0.6501728543514204, + "flos": 18703814284800.0, + "grad_norm": 1.7861415482224605, + "language_loss": 0.80307227, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.87985992, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10424805, + "step": 10814, + "time_per_iteration": 2.4790940284729004 + }, + { + "auxiliary_loss_clip": 0.06415637, + "auxiliary_loss_mlp": 0.01266919, + "balance_loss_clip": 0.06275604, + "balance_loss_mlp": 0.01256119, + "epoch": 0.6502329776040884, + "flos": 44209858550400.0, + "grad_norm": 1.5485248232594282, + "language_loss": 0.65536499, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.73219061, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10803223, + "step": 10815, + "time_per_iteration": 2.7446234226226807 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06274943, + "balance_loss_mlp": 0.01257667, + "epoch": 0.6502931008567564, + "flos": 14580009192960.0, + "grad_norm": 1.8474906541134053, + "language_loss": 0.75516546, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.83205009, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.12841797, + "step": 10816, + "time_per_iteration": 2.4595513343811035 + }, + { + "auxiliary_loss_clip": 0.06411922, + "auxiliary_loss_mlp": 0.01272269, + "balance_loss_clip": 0.06278138, + "balance_loss_mlp": 0.01261845, + "epoch": 0.6503532241094243, + "flos": 21400980560640.0, + "grad_norm": 1.6906297848786114, + "language_loss": 0.73428237, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.81112432, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 10817, + "time_per_iteration": 2.5484201908111572 + }, + { + "auxiliary_loss_clip": 0.06410678, + "auxiliary_loss_mlp": 0.01266458, + "balance_loss_clip": 0.06273761, + "balance_loss_mlp": 0.01255783, + "epoch": 0.6504133473620923, + "flos": 74756349648000.0, + "grad_norm": 1.454828626029086, + "language_loss": 0.71655715, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.79332852, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10675049, + "step": 10818, + "time_per_iteration": 2.908658504486084 + }, + { + "auxiliary_loss_clip": 0.06415702, + "auxiliary_loss_mlp": 0.01267764, + "balance_loss_clip": 0.06276265, + "balance_loss_mlp": 0.01257261, + "epoch": 0.6504734706147602, + "flos": 19718399848320.0, + "grad_norm": 2.191602402717942, + "language_loss": 0.64758539, + "learning_rate": 1.150246104600249e-06, + "loss": 0.72442001, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10510254, + "step": 10819, + "time_per_iteration": 2.5333735942840576 + }, + { + "auxiliary_loss_clip": 0.06412923, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01257849, + "epoch": 0.6505335938674283, + "flos": 25563960236160.0, + "grad_norm": 1.7905989506117173, + "language_loss": 0.83637512, + "learning_rate": 1.14989356009286e-06, + "loss": 0.91318899, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10614014, + "step": 10820, + "time_per_iteration": 2.5265371799468994 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06276121, + "balance_loss_mlp": 0.01256278, + "epoch": 0.6505937171200962, + "flos": 17827143742080.0, + "grad_norm": 2.110303525663697, + "language_loss": 0.78078735, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.85763657, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11755371, + "step": 10821, + "time_per_iteration": 2.5157594680786133 + }, + { + "auxiliary_loss_clip": 0.06407174, + "auxiliary_loss_mlp": 0.01267611, + "balance_loss_clip": 0.06274926, + "balance_loss_mlp": 0.01258193, + "epoch": 0.6506538403727642, + "flos": 20674467734400.0, + "grad_norm": 1.345963122833849, + "language_loss": 0.79950106, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.8762489, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09417725, + "step": 10822, + "time_per_iteration": 2.556008815765381 + }, + { + "auxiliary_loss_clip": 0.06409828, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254489, + "epoch": 0.6507139636254321, + "flos": 11724970625280.0, + "grad_norm": 1.7704738467059193, + "language_loss": 0.87903178, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.95578313, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1081543, + "step": 10823, + "time_per_iteration": 2.5153284072875977 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01252885, + "epoch": 0.6507740868781001, + "flos": 26769177838080.0, + "grad_norm": 1.5876907781405154, + "language_loss": 0.66698307, + "learning_rate": 1.148483704558183e-06, + "loss": 0.74372518, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 10824, + "time_per_iteration": 2.5415477752685547 + }, + { + "auxiliary_loss_clip": 0.06414588, + "auxiliary_loss_mlp": 0.01270098, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01259471, + "epoch": 0.650834210130768, + "flos": 16477260865920.0, + "grad_norm": 2.5628817527572365, + "language_loss": 0.88034272, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.95718956, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10632324, + "step": 10825, + "time_per_iteration": 2.5432024002075195 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01269359, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.012577, + "epoch": 0.650894333383436, + "flos": 17134354984320.0, + "grad_norm": 2.078178971450375, + "language_loss": 0.73451078, + "learning_rate": 1.147778970474885e-06, + "loss": 0.81136155, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11651611, + "step": 10826, + "time_per_iteration": 2.483405113220215 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255057, + "epoch": 0.650954456636104, + "flos": 18740221683840.0, + "grad_norm": 2.050300118391263, + "language_loss": 0.69847488, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.7752744, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10107422, + "step": 10827, + "time_per_iteration": 2.529306650161743 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.0627773, + "balance_loss_mlp": 0.01256479, + "epoch": 0.651014579888772, + "flos": 24533987448960.0, + "grad_norm": 2.390068067700356, + "language_loss": 0.77023715, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.84707546, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10828, + "time_per_iteration": 2.5035903453826904 + }, + { + "auxiliary_loss_clip": 0.06409818, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06275382, + "balance_loss_mlp": 0.01252961, + "epoch": 0.65107470314144, + "flos": 24067944639360.0, + "grad_norm": 1.7088923896554455, + "language_loss": 0.89246607, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.96919769, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10388184, + "step": 10829, + "time_per_iteration": 2.51090931892395 + }, + { + "auxiliary_loss_clip": 0.06314664, + "auxiliary_loss_mlp": 0.0125328, + "balance_loss_clip": 0.06258522, + "balance_loss_mlp": 0.01251908, + "epoch": 0.6511348263941079, + "flos": 72502304561280.0, + "grad_norm": 0.6366010219235949, + "language_loss": 0.55376649, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.62944591, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01374817, + "step": 10830, + "time_per_iteration": 3.2892563343048096 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01266709, + "balance_loss_clip": 0.06282428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6511949496467759, + "flos": 23374401194880.0, + "grad_norm": 2.1202653739592026, + "language_loss": 0.75132632, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.82824159, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11334229, + "step": 10831, + "time_per_iteration": 4.007694482803345 + }, + { + "auxiliary_loss_clip": 0.06315142, + "auxiliary_loss_mlp": 0.01253248, + "balance_loss_clip": 0.06259014, + "balance_loss_mlp": 0.01251801, + "epoch": 0.6512550728994438, + "flos": 67353390218880.0, + "grad_norm": 0.6347055670227107, + "language_loss": 0.51072258, + "learning_rate": 1.145665544243828e-06, + "loss": 0.58640647, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01445007, + "step": 10832, + "time_per_iteration": 3.2983696460723877 + }, + { + "auxiliary_loss_clip": 0.06417792, + "auxiliary_loss_mlp": 0.01264906, + "balance_loss_clip": 0.06276103, + "balance_loss_mlp": 0.01254195, + "epoch": 0.6513151961521119, + "flos": 21147973557120.0, + "grad_norm": 2.2140276605758693, + "language_loss": 0.8367548, + "learning_rate": 1.145313419848316e-06, + "loss": 0.91358173, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10699463, + "step": 10833, + "time_per_iteration": 2.511261463165283 + }, + { + "auxiliary_loss_clip": 0.06416205, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01255471, + "epoch": 0.6513753194047798, + "flos": 15164246586240.0, + "grad_norm": 10.86743731426701, + "language_loss": 0.84111547, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.9179405, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1083374, + "step": 10834, + "time_per_iteration": 2.4789986610412598 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06278732, + "balance_loss_mlp": 0.01257979, + "epoch": 0.6514354426574478, + "flos": 30234421365120.0, + "grad_norm": 1.7456774308536143, + "language_loss": 0.77525127, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.85209417, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09918213, + "step": 10835, + "time_per_iteration": 2.588974714279175 + }, + { + "auxiliary_loss_clip": 0.06414215, + "auxiliary_loss_mlp": 0.01268341, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6514955659101157, + "flos": 24212232570240.0, + "grad_norm": 5.683759297238724, + "language_loss": 0.77732491, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.85415047, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10717773, + "step": 10836, + "time_per_iteration": 2.5676357746124268 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01256552, + "epoch": 0.6515556891627837, + "flos": 12381351984000.0, + "grad_norm": 1.8169643503490496, + "language_loss": 0.82167637, + "learning_rate": 1.143905246497783e-06, + "loss": 0.8984952, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.1060791, + "step": 10837, + "time_per_iteration": 2.483123779296875 + }, + { + "auxiliary_loss_clip": 0.06414027, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01258753, + "epoch": 0.6516158124154516, + "flos": 49612366874880.0, + "grad_norm": 1.9745505880128194, + "language_loss": 0.59549761, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.67233551, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.11004639, + "step": 10838, + "time_per_iteration": 2.762786865234375 + }, + { + "auxiliary_loss_clip": 0.06317103, + "auxiliary_loss_mlp": 0.01253866, + "balance_loss_clip": 0.06261341, + "balance_loss_mlp": 0.01252529, + "epoch": 0.6516759356681197, + "flos": 59720848531200.0, + "grad_norm": 0.7135395932752281, + "language_loss": 0.60686612, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.68257582, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01338196, + "step": 10839, + "time_per_iteration": 3.223712921142578 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.01261941, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01252809, + "epoch": 0.6517360589207876, + "flos": 37459815264000.0, + "grad_norm": 1.5945463275519725, + "language_loss": 0.67963755, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.75638568, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09136963, + "step": 10840, + "time_per_iteration": 2.6288609504699707 + }, + { + "auxiliary_loss_clip": 0.06418526, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254264, + "epoch": 0.6517961821734556, + "flos": 25382049022080.0, + "grad_norm": 2.724184034803811, + "language_loss": 0.73645818, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10461426, + "step": 10841, + "time_per_iteration": 2.6020925045013428 + }, + { + "auxiliary_loss_clip": 0.06416935, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0627799, + "balance_loss_mlp": 0.01252632, + "epoch": 0.6518563054261236, + "flos": 28774519678080.0, + "grad_norm": 1.3493483862035613, + "language_loss": 0.6300385, + "learning_rate": 1.142145760331648e-06, + "loss": 0.7068457, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11151123, + "step": 10842, + "time_per_iteration": 2.550992012023926 + }, + { + "auxiliary_loss_clip": 0.06321006, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06265368, + "balance_loss_mlp": 0.01249527, + "epoch": 0.6519164286787915, + "flos": 68942905372800.0, + "grad_norm": 0.8268303815829595, + "language_loss": 0.56121087, + "learning_rate": 1.141793960634807e-06, + "loss": 0.6369288, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01258087, + "step": 10843, + "time_per_iteration": 4.4302709102630615 + }, + { + "auxiliary_loss_clip": 0.06418709, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01256844, + "epoch": 0.6519765519314595, + "flos": 20447009026560.0, + "grad_norm": 1.9018808017225726, + "language_loss": 0.83082736, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.90770137, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11846924, + "step": 10844, + "time_per_iteration": 2.600843906402588 + }, + { + "auxiliary_loss_clip": 0.06412451, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06274604, + "balance_loss_mlp": 0.01254598, + "epoch": 0.6520366751841274, + "flos": 28410571468800.0, + "grad_norm": 1.712600797448846, + "language_loss": 0.60434437, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.68112737, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11260986, + "step": 10845, + "time_per_iteration": 2.5539886951446533 + }, + { + "auxiliary_loss_clip": 0.0641913, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.0628117, + "balance_loss_mlp": 0.01256964, + "epoch": 0.6520967984367955, + "flos": 22279999017600.0, + "grad_norm": 1.7154837264423382, + "language_loss": 0.79721403, + "learning_rate": 1.140738756857194e-06, + "loss": 0.87408507, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11010742, + "step": 10846, + "time_per_iteration": 3.9483704566955566 + }, + { + "auxiliary_loss_clip": 0.06323321, + "auxiliary_loss_mlp": 0.01252083, + "balance_loss_clip": 0.06267467, + "balance_loss_mlp": 0.01250644, + "epoch": 0.6521569216894634, + "flos": 68940123459840.0, + "grad_norm": 0.9959560363450068, + "language_loss": 0.60117191, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.67692602, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01437378, + "step": 10847, + "time_per_iteration": 3.259263277053833 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255196, + "epoch": 0.6522170449421314, + "flos": 29137880908800.0, + "grad_norm": 1.6024469489184654, + "language_loss": 0.81200469, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.88886106, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11322021, + "step": 10848, + "time_per_iteration": 2.5693862438201904 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.0127236, + "balance_loss_clip": 0.06276944, + "balance_loss_mlp": 0.01262072, + "epoch": 0.6522771681947993, + "flos": 26659284808320.0, + "grad_norm": 2.0899993216020527, + "language_loss": 0.74621618, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.82307267, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10284424, + "step": 10849, + "time_per_iteration": 2.636046886444092 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01258129, + "epoch": 0.6523372914474673, + "flos": 25746961553280.0, + "grad_norm": 1.4470039882385268, + "language_loss": 0.68371421, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.76052451, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1072998, + "step": 10850, + "time_per_iteration": 2.52057147026062 + }, + { + "auxiliary_loss_clip": 0.0640863, + "auxiliary_loss_mlp": 0.01263783, + "balance_loss_clip": 0.06275396, + "balance_loss_mlp": 0.01253752, + "epoch": 0.6523974147001352, + "flos": 24834344808960.0, + "grad_norm": 1.562549828159254, + "language_loss": 0.67212379, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.7488479, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10046387, + "step": 10851, + "time_per_iteration": 2.5808029174804688 + }, + { + "auxiliary_loss_clip": 0.06416307, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01254386, + "epoch": 0.6524575379528033, + "flos": 26323945568640.0, + "grad_norm": 2.0070314818502695, + "language_loss": 0.7443608, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.8211745, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10687256, + "step": 10852, + "time_per_iteration": 4.013243675231934 + }, + { + "auxiliary_loss_clip": 0.0641986, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06278665, + "balance_loss_mlp": 0.01257191, + "epoch": 0.6525176612054712, + "flos": 19499200767360.0, + "grad_norm": 1.9187417240841533, + "language_loss": 0.67066777, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.74755299, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11468506, + "step": 10853, + "time_per_iteration": 2.506601572036743 + }, + { + "auxiliary_loss_clip": 0.06318477, + "auxiliary_loss_mlp": 0.01256063, + "balance_loss_clip": 0.06262536, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6525777844581392, + "flos": 71727057786240.0, + "grad_norm": 0.715298954462881, + "language_loss": 0.63038433, + "learning_rate": 1.137926314758634e-06, + "loss": 0.70612979, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01324463, + "step": 10854, + "time_per_iteration": 3.2700932025909424 + }, + { + "auxiliary_loss_clip": 0.06413402, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06275877, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6526379077108072, + "flos": 26660668400640.0, + "grad_norm": 1.6617688619573214, + "language_loss": 0.77541685, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.85223043, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11407471, + "step": 10855, + "time_per_iteration": 2.5642480850219727 + }, + { + "auxiliary_loss_clip": 0.06405862, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01255174, + "epoch": 0.6526980309634751, + "flos": 22826990471040.0, + "grad_norm": 1.7631241717885235, + "language_loss": 0.79621822, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.87293208, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10345459, + "step": 10856, + "time_per_iteration": 2.537353992462158 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256408, + "epoch": 0.6527581542161431, + "flos": 28372403134080.0, + "grad_norm": 1.6923564955573929, + "language_loss": 0.73936152, + "learning_rate": 1.136872187988815e-06, + "loss": 0.81612456, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11077881, + "step": 10857, + "time_per_iteration": 2.5426032543182373 + }, + { + "auxiliary_loss_clip": 0.06409546, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06273436, + "balance_loss_mlp": 0.01256195, + "epoch": 0.652818277468811, + "flos": 18375099517440.0, + "grad_norm": 2.1707425213383136, + "language_loss": 0.63389534, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.71065563, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 10858, + "time_per_iteration": 2.495542049407959 + }, + { + "auxiliary_loss_clip": 0.06408103, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01254784, + "epoch": 0.6528784007214791, + "flos": 18041227724160.0, + "grad_norm": 1.644037371034234, + "language_loss": 0.78852642, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.86525851, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10333252, + "step": 10859, + "time_per_iteration": 2.5497894287109375 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01263542, + "balance_loss_clip": 0.06273727, + "balance_loss_mlp": 0.01252611, + "epoch": 0.652938523974147, + "flos": 22388466528000.0, + "grad_norm": 1.5493254250566866, + "language_loss": 0.67967153, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.75645357, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10931396, + "step": 10860, + "time_per_iteration": 2.5913808345794678 + }, + { + "auxiliary_loss_clip": 0.06418759, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06276677, + "balance_loss_mlp": 0.01257426, + "epoch": 0.652998647226815, + "flos": 16769694015360.0, + "grad_norm": 1.8207811146767594, + "language_loss": 0.67290318, + "learning_rate": 1.135467143909712e-06, + "loss": 0.74977076, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10583496, + "step": 10861, + "time_per_iteration": 2.50136137008667 + }, + { + "auxiliary_loss_clip": 0.06415796, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06276291, + "balance_loss_mlp": 0.01254886, + "epoch": 0.6530587704794829, + "flos": 35781259547520.0, + "grad_norm": 2.0180062200449744, + "language_loss": 0.65632504, + "learning_rate": 1.135115964814572e-06, + "loss": 0.733145, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11309814, + "step": 10862, + "time_per_iteration": 2.7082483768463135 + }, + { + "auxiliary_loss_clip": 0.06413227, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06276508, + "balance_loss_mlp": 0.01256912, + "epoch": 0.6531188937321509, + "flos": 19321901527680.0, + "grad_norm": 1.7523951884589628, + "language_loss": 0.77599865, + "learning_rate": 1.13476481851592e-06, + "loss": 0.85280204, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10198975, + "step": 10863, + "time_per_iteration": 2.525467872619629 + }, + { + "auxiliary_loss_clip": 0.06412541, + "auxiliary_loss_mlp": 0.01266016, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6531790169848188, + "flos": 22900476101760.0, + "grad_norm": 1.5537645301307006, + "language_loss": 0.74952781, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.82631332, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10430908, + "step": 10864, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06410347, + "auxiliary_loss_mlp": 0.01267199, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6532391402374869, + "flos": 29570157722880.0, + "grad_norm": 1.9052418824081008, + "language_loss": 0.86169875, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.93847424, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1026001, + "step": 10865, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.0627698, + "balance_loss_mlp": 0.01258926, + "epoch": 0.6532992634901548, + "flos": 23110996285440.0, + "grad_norm": 1.6108799527314137, + "language_loss": 0.81515527, + "learning_rate": 1.133711576532051e-06, + "loss": 0.8920275, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10357666, + "step": 10866, + "time_per_iteration": 2.5684125423431396 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06275405, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6533593867428228, + "flos": 26074460436480.0, + "grad_norm": 1.6718467663998162, + "language_loss": 0.82545173, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.90221351, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10614014, + "step": 10867, + "time_per_iteration": 2.5475850105285645 + }, + { + "auxiliary_loss_clip": 0.06413805, + "auxiliary_loss_mlp": 0.01264816, + "balance_loss_clip": 0.06276451, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6534195099954908, + "flos": 21218398513920.0, + "grad_norm": 1.6506076303544417, + "language_loss": 0.81211448, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.88890064, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1050415, + "step": 10868, + "time_per_iteration": 2.5498743057250977 + }, + { + "auxiliary_loss_clip": 0.06418251, + "auxiliary_loss_mlp": 0.01266421, + "balance_loss_clip": 0.06277823, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6534796332481587, + "flos": 19652754574080.0, + "grad_norm": 1.774479415812712, + "language_loss": 0.7959047, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.87275141, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 10869, + "time_per_iteration": 2.5166242122650146 + }, + { + "auxiliary_loss_clip": 0.06413683, + "auxiliary_loss_mlp": 0.01266573, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256172, + "epoch": 0.6535397565008267, + "flos": 24028979690880.0, + "grad_norm": 2.0325113837901703, + "language_loss": 0.72014058, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.79694319, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10406494, + "step": 10870, + "time_per_iteration": 2.5486953258514404 + }, + { + "auxiliary_loss_clip": 0.06413276, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01254584, + "epoch": 0.6535998797534947, + "flos": 24608772817920.0, + "grad_norm": 1.9753517025590153, + "language_loss": 0.74408901, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.82087243, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10479736, + "step": 10871, + "time_per_iteration": 4.039932489395142 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01263739, + "balance_loss_clip": 0.06277861, + "balance_loss_mlp": 0.01253791, + "epoch": 0.6536600030061627, + "flos": 23370292344960.0, + "grad_norm": 1.4980578991412412, + "language_loss": 0.56041443, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.6371575, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0994873, + "step": 10872, + "time_per_iteration": 2.502490282058716 + }, + { + "auxiliary_loss_clip": 0.06416132, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6537201262588306, + "flos": 23885278738560.0, + "grad_norm": 1.5337992373700162, + "language_loss": 0.75344592, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.8302865, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11022949, + "step": 10873, + "time_per_iteration": 2.5598514080047607 + }, + { + "auxiliary_loss_clip": 0.06410979, + "auxiliary_loss_mlp": 0.012657, + "balance_loss_clip": 0.06274614, + "balance_loss_mlp": 0.01255585, + "epoch": 0.6537802495114986, + "flos": 24361971016320.0, + "grad_norm": 1.420531378230647, + "language_loss": 0.76059687, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.8373636, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10113525, + "step": 10874, + "time_per_iteration": 2.549380302429199 + }, + { + "auxiliary_loss_clip": 0.06415659, + "auxiliary_loss_mlp": 0.01268814, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6538403727641665, + "flos": 28003633315200.0, + "grad_norm": 1.5256219818178185, + "language_loss": 0.81805712, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.89490187, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10784912, + "step": 10875, + "time_per_iteration": 2.583240270614624 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06273378, + "balance_loss_mlp": 0.01256372, + "epoch": 0.6539004960168345, + "flos": 27571021084800.0, + "grad_norm": 1.6524409835803482, + "language_loss": 0.69961172, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.77639741, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10552979, + "step": 10876, + "time_per_iteration": 2.53607439994812 + }, + { + "auxiliary_loss_clip": 0.0641342, + "auxiliary_loss_mlp": 0.01265066, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01254177, + "epoch": 0.6539606192695024, + "flos": 14533958574720.0, + "grad_norm": 1.8504141345372043, + "language_loss": 0.79613322, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.87291813, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 10877, + "time_per_iteration": 2.482450246810913 + }, + { + "auxiliary_loss_clip": 0.0641083, + "auxiliary_loss_mlp": 0.01271317, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01260976, + "epoch": 0.6540207425221705, + "flos": 21622779118080.0, + "grad_norm": 2.1988791511764507, + "language_loss": 0.80130821, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.87812972, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10333252, + "step": 10878, + "time_per_iteration": 2.4935176372528076 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01256185, + "epoch": 0.6540808657748384, + "flos": 17673589935360.0, + "grad_norm": 2.582136269580718, + "language_loss": 0.8441155, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.92088807, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11309814, + "step": 10879, + "time_per_iteration": 2.478309392929077 + }, + { + "auxiliary_loss_clip": 0.06413597, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06273437, + "balance_loss_mlp": 0.01255937, + "epoch": 0.6541409890275064, + "flos": 14543559866880.0, + "grad_norm": 2.245673949677598, + "language_loss": 0.72627622, + "learning_rate": 1.128800362199601e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11004639, + "step": 10880, + "time_per_iteration": 2.448975086212158 + }, + { + "auxiliary_loss_clip": 0.06410271, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06275423, + "balance_loss_mlp": 0.01258899, + "epoch": 0.6542011122801744, + "flos": 17171013945600.0, + "grad_norm": 1.8546451564603688, + "language_loss": 0.84333724, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.92013222, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10333252, + "step": 10881, + "time_per_iteration": 2.5005478858947754 + }, + { + "auxiliary_loss_clip": 0.06415182, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01252995, + "epoch": 0.6542612355328423, + "flos": 18192433616640.0, + "grad_norm": 1.7673801500025483, + "language_loss": 0.78099298, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.85779178, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11700439, + "step": 10882, + "time_per_iteration": 2.4750256538391113 + }, + { + "auxiliary_loss_clip": 0.06413694, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01257837, + "epoch": 0.6543213587855103, + "flos": 19798635732480.0, + "grad_norm": 1.55805041018917, + "language_loss": 0.81790304, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.89472985, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 10883, + "time_per_iteration": 3.958979368209839 + }, + { + "auxiliary_loss_clip": 0.06415352, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06277536, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6543814820381783, + "flos": 21111356522880.0, + "grad_norm": 2.318256186808643, + "language_loss": 0.85692853, + "learning_rate": 1.127398345803988e-06, + "loss": 0.93375945, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11322021, + "step": 10884, + "time_per_iteration": 2.4991559982299805 + }, + { + "auxiliary_loss_clip": 0.06414054, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01252623, + "epoch": 0.6544416052908463, + "flos": 20200333006080.0, + "grad_norm": 2.0262705152465985, + "language_loss": 0.8030138, + "learning_rate": 1.127047924394715e-06, + "loss": 0.87978739, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10687256, + "step": 10885, + "time_per_iteration": 3.945915699005127 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01259468, + "epoch": 0.6545017285435142, + "flos": 23375072027520.0, + "grad_norm": 1.9399514462864902, + "language_loss": 0.72038162, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.79720581, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10339355, + "step": 10886, + "time_per_iteration": 2.592869520187378 + }, + { + "auxiliary_loss_clip": 0.06412855, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01252591, + "epoch": 0.6545618517961822, + "flos": 19140619219200.0, + "grad_norm": 1.841753490100957, + "language_loss": 0.78875196, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.86550403, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09765625, + "step": 10887, + "time_per_iteration": 2.4951751232147217 + }, + { + "auxiliary_loss_clip": 0.06415602, + "auxiliary_loss_mlp": 0.0126552, + "balance_loss_clip": 0.06278757, + "balance_loss_mlp": 0.01255346, + "epoch": 0.6546219750488501, + "flos": 14943789694080.0, + "grad_norm": 1.7286309451287045, + "language_loss": 0.791143, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.86795419, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10168457, + "step": 10888, + "time_per_iteration": 2.5363447666168213 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266895, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6546820983015181, + "flos": 36329466885120.0, + "grad_norm": 1.4489059834180797, + "language_loss": 0.66680413, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.7436139, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09869385, + "step": 10889, + "time_per_iteration": 2.631702184677124 + }, + { + "auxiliary_loss_clip": 0.06413323, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06274237, + "balance_loss_mlp": 0.01255359, + "epoch": 0.654742221554186, + "flos": 20417519589120.0, + "grad_norm": 1.4090787224296468, + "language_loss": 0.80175591, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.87856597, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.12322998, + "step": 10890, + "time_per_iteration": 2.553987503051758 + }, + { + "auxiliary_loss_clip": 0.06413622, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06272978, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6548023448068541, + "flos": 24870626426880.0, + "grad_norm": 1.9658735826984712, + "language_loss": 0.66080928, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.73761332, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10827637, + "step": 10891, + "time_per_iteration": 3.981126546859741 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01268584, + "balance_loss_clip": 0.06276606, + "balance_loss_mlp": 0.01258314, + "epoch": 0.654862468059522, + "flos": 21432901766400.0, + "grad_norm": 1.7619514062333756, + "language_loss": 0.80124283, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.87804967, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1027832, + "step": 10892, + "time_per_iteration": 2.487014055252075 + }, + { + "auxiliary_loss_clip": 0.06417862, + "auxiliary_loss_mlp": 0.01267184, + "balance_loss_clip": 0.06275848, + "balance_loss_mlp": 0.01256502, + "epoch": 0.65492259131219, + "flos": 26585002563840.0, + "grad_norm": 1.8517707324094554, + "language_loss": 0.78348118, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.86033165, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10675049, + "step": 10893, + "time_per_iteration": 2.5751121044158936 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01265779, + "balance_loss_clip": 0.06276494, + "balance_loss_mlp": 0.01254126, + "epoch": 0.6549827145648579, + "flos": 21506806667520.0, + "grad_norm": 1.5510106151766068, + "language_loss": 0.70386314, + "learning_rate": 1.123895622914766e-06, + "loss": 0.78068686, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11651611, + "step": 10894, + "time_per_iteration": 2.492877721786499 + }, + { + "auxiliary_loss_clip": 0.06416629, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06276509, + "balance_loss_mlp": 0.01252959, + "epoch": 0.6550428378175259, + "flos": 22599657544320.0, + "grad_norm": 2.852975580128828, + "language_loss": 0.62881947, + "learning_rate": 1.123545533127549e-06, + "loss": 0.70563233, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11712646, + "step": 10895, + "time_per_iteration": 2.508265733718872 + }, + { + "auxiliary_loss_clip": 0.06409365, + "auxiliary_loss_mlp": 0.01264591, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.0125487, + "epoch": 0.655102961070194, + "flos": 12828848313600.0, + "grad_norm": 1.7300998551667346, + "language_loss": 0.79205835, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.8687979, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.097229, + "step": 10896, + "time_per_iteration": 2.4711906909942627 + }, + { + "auxiliary_loss_clip": 0.06409965, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06276735, + "balance_loss_mlp": 0.01257417, + "epoch": 0.6551630843228619, + "flos": 24798105118080.0, + "grad_norm": 1.3882264371892772, + "language_loss": 0.70543504, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.78220963, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10076904, + "step": 10897, + "time_per_iteration": 2.6822469234466553 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06276685, + "balance_loss_mlp": 0.01257628, + "epoch": 0.6552232075755299, + "flos": 16729597036800.0, + "grad_norm": 1.5280933060289523, + "language_loss": 0.75582546, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.83268768, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10717773, + "step": 10898, + "time_per_iteration": 2.475172519683838 + }, + { + "auxiliary_loss_clip": 0.06413586, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.0125986, + "epoch": 0.6552833308281978, + "flos": 22022757383040.0, + "grad_norm": 2.1698837802172193, + "language_loss": 0.7396723, + "learning_rate": 1.122145506463827e-06, + "loss": 0.81650698, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10028076, + "step": 10899, + "time_per_iteration": 2.5430071353912354 + }, + { + "auxiliary_loss_clip": 0.06414597, + "auxiliary_loss_mlp": 0.0126991, + "balance_loss_clip": 0.06275821, + "balance_loss_mlp": 0.01259229, + "epoch": 0.6553434540808658, + "flos": 24870332937600.0, + "grad_norm": 2.0271227306533346, + "language_loss": 0.56131774, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.63816285, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10693359, + "step": 10900, + "time_per_iteration": 2.5413925647735596 + }, + { + "auxiliary_loss_clip": 0.06419879, + "auxiliary_loss_mlp": 0.01265514, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01254511, + "epoch": 0.6554035773335337, + "flos": 23227639568640.0, + "grad_norm": 1.632650390975927, + "language_loss": 0.77087748, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.84773135, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 10901, + "time_per_iteration": 2.5584566593170166 + }, + { + "auxiliary_loss_clip": 0.06417914, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06281441, + "balance_loss_mlp": 0.01259484, + "epoch": 0.6554637005862017, + "flos": 22790163801600.0, + "grad_norm": 1.6269884512414954, + "language_loss": 0.73415089, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.81102872, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10375977, + "step": 10902, + "time_per_iteration": 2.5149738788604736 + }, + { + "auxiliary_loss_clip": 0.06411173, + "auxiliary_loss_mlp": 0.01265501, + "balance_loss_clip": 0.0627598, + "balance_loss_mlp": 0.01255118, + "epoch": 0.6555238238388696, + "flos": 21513682702080.0, + "grad_norm": 2.0084891996216254, + "language_loss": 0.68054104, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.75730777, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10388184, + "step": 10903, + "time_per_iteration": 2.5427961349487305 + }, + { + "auxiliary_loss_clip": 0.06420846, + "auxiliary_loss_mlp": 0.01267584, + "balance_loss_clip": 0.06277949, + "balance_loss_mlp": 0.01255926, + "epoch": 0.6555839470915377, + "flos": 30527483420160.0, + "grad_norm": 1.6549904072812014, + "language_loss": 0.67021459, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.74709886, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11645508, + "step": 10904, + "time_per_iteration": 2.5631024837493896 + }, + { + "auxiliary_loss_clip": 0.06421356, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.0125327, + "epoch": 0.6556440703442056, + "flos": 24649582556160.0, + "grad_norm": 1.7705609323248692, + "language_loss": 0.90557879, + "learning_rate": 1.120046465383464e-06, + "loss": 0.98243713, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11212158, + "step": 10905, + "time_per_iteration": 2.551908493041992 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01265256, + "balance_loss_clip": 0.06275466, + "balance_loss_mlp": 0.01255194, + "epoch": 0.6557041935968736, + "flos": 23739229872000.0, + "grad_norm": 1.7103913409482634, + "language_loss": 0.75575101, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.83248651, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10058594, + "step": 10906, + "time_per_iteration": 2.5098323822021484 + }, + { + "auxiliary_loss_clip": 0.06419322, + "auxiliary_loss_mlp": 0.0126702, + "balance_loss_clip": 0.06278144, + "balance_loss_mlp": 0.01256094, + "epoch": 0.6557643168495415, + "flos": 11106464112000.0, + "grad_norm": 2.5310893479547385, + "language_loss": 0.75316978, + "learning_rate": 1.119347051825267e-06, + "loss": 0.83003318, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10925293, + "step": 10907, + "time_per_iteration": 2.5110371112823486 + }, + { + "auxiliary_loss_clip": 0.06413908, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01253585, + "epoch": 0.6558244401022095, + "flos": 30198978288000.0, + "grad_norm": 1.3099733417202022, + "language_loss": 0.7233519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.80013621, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.109375, + "step": 10908, + "time_per_iteration": 2.6000733375549316 + }, + { + "auxiliary_loss_clip": 0.06419864, + "auxiliary_loss_mlp": 0.01265366, + "balance_loss_clip": 0.06280993, + "balance_loss_mlp": 0.01254912, + "epoch": 0.6558845633548775, + "flos": 17936827136640.0, + "grad_norm": 2.2254285972113155, + "language_loss": 0.82226503, + "learning_rate": 1.118647771844861e-06, + "loss": 0.89911729, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10455322, + "step": 10909, + "time_per_iteration": 2.524258613586426 + }, + { + "auxiliary_loss_clip": 0.06420204, + "auxiliary_loss_mlp": 0.01267528, + "balance_loss_clip": 0.0627941, + "balance_loss_mlp": 0.01256567, + "epoch": 0.6559446866075455, + "flos": 21909929460480.0, + "grad_norm": 2.0664641654441334, + "language_loss": 0.64063025, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.71750748, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 10910, + "time_per_iteration": 4.0342183113098145 + }, + { + "auxiliary_loss_clip": 0.06428535, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01256501, + "epoch": 0.6560048098602135, + "flos": 14131674322560.0, + "grad_norm": 2.6155993780376408, + "language_loss": 0.76254046, + "learning_rate": 1.117948625548313e-06, + "loss": 0.8395068, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.1159668, + "step": 10911, + "time_per_iteration": 2.447054386138916 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01268286, + "balance_loss_clip": 0.0627694, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6560649331128814, + "flos": 18813623460480.0, + "grad_norm": 1.5982338886507241, + "language_loss": 0.756971, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.83377028, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10308838, + "step": 10912, + "time_per_iteration": 2.5681815147399902 + }, + { + "auxiliary_loss_clip": 0.06430128, + "auxiliary_loss_mlp": 0.01272614, + "balance_loss_clip": 0.0628223, + "balance_loss_mlp": 0.01260431, + "epoch": 0.6561250563655494, + "flos": 17058940709760.0, + "grad_norm": 1.6202794136024683, + "language_loss": 0.77903795, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.85606527, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12176514, + "step": 10913, + "time_per_iteration": 2.4939568042755127 + }, + { + "auxiliary_loss_clip": 0.0641174, + "auxiliary_loss_mlp": 0.01263849, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6561851796182173, + "flos": 22644198789120.0, + "grad_norm": 1.7766660084969559, + "language_loss": 0.71619821, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.79295409, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09423828, + "step": 10914, + "time_per_iteration": 2.569068431854248 + }, + { + "auxiliary_loss_clip": 0.06418359, + "auxiliary_loss_mlp": 0.01264819, + "balance_loss_clip": 0.06280423, + "balance_loss_mlp": 0.01254149, + "epoch": 0.6562453028708853, + "flos": 19244307047040.0, + "grad_norm": 1.8135755345317126, + "language_loss": 0.74166334, + "learning_rate": 1.116550734430958e-06, + "loss": 0.81849515, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10675049, + "step": 10915, + "time_per_iteration": 2.487908363342285 + }, + { + "auxiliary_loss_clip": 0.06413562, + "auxiliary_loss_mlp": 0.01266089, + "balance_loss_clip": 0.06277299, + "balance_loss_mlp": 0.01254823, + "epoch": 0.6563054261235532, + "flos": 23807390768640.0, + "grad_norm": 1.4909835290624114, + "language_loss": 0.79751885, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.87431538, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11254883, + "step": 10916, + "time_per_iteration": 2.5246381759643555 + }, + { + "auxiliary_loss_clip": 0.06414592, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06275203, + "balance_loss_mlp": 0.01255727, + "epoch": 0.6563655493762213, + "flos": 19245271368960.0, + "grad_norm": 1.7342152629791572, + "language_loss": 0.76458621, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.84139442, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10498047, + "step": 10917, + "time_per_iteration": 2.468027353286743 + }, + { + "auxiliary_loss_clip": 0.06412656, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277646, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6564256726288892, + "flos": 25563457111680.0, + "grad_norm": 1.7726258593528208, + "language_loss": 0.70893037, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.78572786, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 10918, + "time_per_iteration": 2.5601627826690674 + }, + { + "auxiliary_loss_clip": 0.06410314, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06277462, + "balance_loss_mlp": 0.01253806, + "epoch": 0.6564857958815572, + "flos": 22207226146560.0, + "grad_norm": 1.5162098354406723, + "language_loss": 0.76179051, + "learning_rate": 1.115153379321332e-06, + "loss": 0.83852965, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09802246, + "step": 10919, + "time_per_iteration": 2.515432357788086 + }, + { + "auxiliary_loss_clip": 0.06311788, + "auxiliary_loss_mlp": 0.01255206, + "balance_loss_clip": 0.06255645, + "balance_loss_mlp": 0.01254054, + "epoch": 0.6565459191342251, + "flos": 58139188462080.0, + "grad_norm": 0.7048888157954881, + "language_loss": 0.52975726, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.60542721, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01150513, + "step": 10920, + "time_per_iteration": 3.225492238998413 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254252, + "epoch": 0.6566060423868931, + "flos": 30817400947200.0, + "grad_norm": 2.612121109527078, + "language_loss": 0.66109598, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.73783767, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10516357, + "step": 10921, + "time_per_iteration": 2.5863046646118164 + }, + { + "auxiliary_loss_clip": 0.06413272, + "auxiliary_loss_mlp": 0.01265745, + "balance_loss_clip": 0.06276343, + "balance_loss_mlp": 0.01254086, + "epoch": 0.6566661656395612, + "flos": 23374107705600.0, + "grad_norm": 1.6764293200295557, + "language_loss": 0.81199658, + "learning_rate": 1.114105715254205e-06, + "loss": 0.88878673, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11657715, + "step": 10922, + "time_per_iteration": 3.958033800125122 + }, + { + "auxiliary_loss_clip": 0.06414749, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01258131, + "epoch": 0.6567262888922291, + "flos": 25742098016640.0, + "grad_norm": 1.8770672525164127, + "language_loss": 0.71403915, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.79087496, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1071167, + "step": 10923, + "time_per_iteration": 2.6299500465393066 + }, + { + "auxiliary_loss_clip": 0.06414993, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06276433, + "balance_loss_mlp": 0.0125629, + "epoch": 0.6567864121448971, + "flos": 17128569052800.0, + "grad_norm": 1.8445128185559154, + "language_loss": 0.80703431, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.88385069, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10357666, + "step": 10924, + "time_per_iteration": 2.474226713180542 + }, + { + "auxiliary_loss_clip": 0.06413686, + "auxiliary_loss_mlp": 0.01262003, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125187, + "epoch": 0.656846535397565, + "flos": 22425922103040.0, + "grad_norm": 2.0896707953815543, + "language_loss": 0.72634912, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.80310595, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10137939, + "step": 10925, + "time_per_iteration": 4.006798982620239 + }, + { + "auxiliary_loss_clip": 0.0641509, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 0.06276507, + "balance_loss_mlp": 0.01253768, + "epoch": 0.656906658650233, + "flos": 17708991085440.0, + "grad_norm": 2.4212353880000586, + "language_loss": 0.72549468, + "learning_rate": 1.112709300197942e-06, + "loss": 0.80228466, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10926, + "time_per_iteration": 2.470264434814453 + }, + { + "auxiliary_loss_clip": 0.06419797, + "auxiliary_loss_mlp": 0.01265954, + "balance_loss_clip": 0.06277547, + "balance_loss_mlp": 0.01254498, + "epoch": 0.6569667819029009, + "flos": 21180942938880.0, + "grad_norm": 1.9117955392450259, + "language_loss": 0.72684854, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.80370605, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11468506, + "step": 10927, + "time_per_iteration": 2.5509166717529297 + }, + { + "auxiliary_loss_clip": 0.06310604, + "auxiliary_loss_mlp": 0.01252717, + "balance_loss_clip": 0.06254312, + "balance_loss_mlp": 0.01251483, + "epoch": 0.6570269051555689, + "flos": 68783299344000.0, + "grad_norm": 0.7240640825769642, + "language_loss": 0.64406443, + "learning_rate": 1.112011294493775e-06, + "loss": 0.71969765, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.0123291, + "step": 10928, + "time_per_iteration": 3.1493797302246094 + }, + { + "auxiliary_loss_clip": 0.06413682, + "auxiliary_loss_mlp": 0.01270572, + "balance_loss_clip": 0.06277151, + "balance_loss_mlp": 0.01259354, + "epoch": 0.6570870284082369, + "flos": 26325874212480.0, + "grad_norm": 2.727605777521059, + "language_loss": 0.78076899, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.85761154, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11212158, + "step": 10929, + "time_per_iteration": 2.602822780609131 + }, + { + "auxiliary_loss_clip": 0.06411244, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06275063, + "balance_loss_mlp": 0.01253181, + "epoch": 0.6571471516609049, + "flos": 26181544354560.0, + "grad_norm": 1.645365805026195, + "language_loss": 0.65459454, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.73134756, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10876465, + "step": 10930, + "time_per_iteration": 3.964470863342285 + }, + { + "auxiliary_loss_clip": 0.06414342, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6572072749135728, + "flos": 20382537709440.0, + "grad_norm": 1.4804583724978688, + "language_loss": 0.71204734, + "learning_rate": 1.110964538515258e-06, + "loss": 0.78887701, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10888672, + "step": 10931, + "time_per_iteration": 2.4909491539001465 + }, + { + "auxiliary_loss_clip": 0.06417586, + "auxiliary_loss_mlp": 0.0127043, + "balance_loss_clip": 0.06275665, + "balance_loss_mlp": 0.01259784, + "epoch": 0.6572673981662408, + "flos": 17134438838400.0, + "grad_norm": 1.8915521473051504, + "language_loss": 0.68812561, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.76500577, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10644531, + "step": 10932, + "time_per_iteration": 2.5176515579223633 + }, + { + "auxiliary_loss_clip": 0.06412166, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257952, + "epoch": 0.6573275214189087, + "flos": 41283640339200.0, + "grad_norm": 1.6891496229276404, + "language_loss": 0.80723727, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.88404071, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10229492, + "step": 10933, + "time_per_iteration": 2.6675453186035156 + }, + { + "auxiliary_loss_clip": 0.06419124, + "auxiliary_loss_mlp": 0.01264988, + "balance_loss_clip": 0.06278023, + "balance_loss_mlp": 0.01254432, + "epoch": 0.6573876446715767, + "flos": 22896241470720.0, + "grad_norm": 1.753523075649994, + "language_loss": 0.73957497, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.81641608, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10552979, + "step": 10934, + "time_per_iteration": 2.555539131164551 + }, + { + "auxiliary_loss_clip": 0.0641007, + "auxiliary_loss_mlp": 0.01270037, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01259445, + "epoch": 0.6574477679242448, + "flos": 44028240825600.0, + "grad_norm": 1.5029164504422408, + "language_loss": 0.76213276, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.83893389, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10583496, + "step": 10935, + "time_per_iteration": 2.6976189613342285 + }, + { + "auxiliary_loss_clip": 0.06416147, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06277473, + "balance_loss_mlp": 0.01258967, + "epoch": 0.6575078911769127, + "flos": 24578402912640.0, + "grad_norm": 1.4839652411177968, + "language_loss": 0.78411627, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.86098289, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11553955, + "step": 10936, + "time_per_iteration": 2.518728494644165 + }, + { + "auxiliary_loss_clip": 0.06411346, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06275463, + "balance_loss_mlp": 0.01261301, + "epoch": 0.6575680144295807, + "flos": 20930493484800.0, + "grad_norm": 1.7706689890869223, + "language_loss": 0.68970346, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.76652682, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09692383, + "step": 10937, + "time_per_iteration": 2.5257480144500732 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01266555, + "balance_loss_clip": 0.06273675, + "balance_loss_mlp": 0.01255696, + "epoch": 0.6576281376822486, + "flos": 10930213048320.0, + "grad_norm": 2.6009314091519804, + "language_loss": 0.68779373, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.76456088, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10858154, + "step": 10938, + "time_per_iteration": 2.487494468688965 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06273697, + "balance_loss_mlp": 0.01254659, + "epoch": 0.6576882609349166, + "flos": 19287632407680.0, + "grad_norm": 1.7840896081065163, + "language_loss": 0.71399069, + "learning_rate": 1.108174673550927e-06, + "loss": 0.79076016, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10876465, + "step": 10939, + "time_per_iteration": 2.4861202239990234 + }, + { + "auxiliary_loss_clip": 0.0641602, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06275935, + "balance_loss_mlp": 0.01256199, + "epoch": 0.6577483841875845, + "flos": 20225168542080.0, + "grad_norm": 5.914491475263239, + "language_loss": 0.77965903, + "learning_rate": 1.107826092473037e-06, + "loss": 0.85649633, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11505127, + "step": 10940, + "time_per_iteration": 2.491938829421997 + }, + { + "auxiliary_loss_clip": 0.06417249, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01260253, + "epoch": 0.6578085074402525, + "flos": 34759672168320.0, + "grad_norm": 1.9394980575704135, + "language_loss": 0.69278842, + "learning_rate": 1.107477545226471e-06, + "loss": 0.76967466, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11132812, + "step": 10941, + "time_per_iteration": 2.6296122074127197 + }, + { + "auxiliary_loss_clip": 0.06406929, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01253934, + "epoch": 0.6578686306929205, + "flos": 23476705430400.0, + "grad_norm": 1.8720735918703966, + "language_loss": 0.68617851, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.76288623, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09906006, + "step": 10942, + "time_per_iteration": 2.5199849605560303 + }, + { + "auxiliary_loss_clip": 0.06417514, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.0627285, + "balance_loss_mlp": 0.0125391, + "epoch": 0.6579287539455885, + "flos": 18082876003200.0, + "grad_norm": 1.8863772080566783, + "language_loss": 0.71839166, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.7952258, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12005615, + "step": 10943, + "time_per_iteration": 2.4810752868652344 + }, + { + "auxiliary_loss_clip": 0.06409079, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01257327, + "epoch": 0.6579888771982564, + "flos": 28669532112000.0, + "grad_norm": 1.7035342930552537, + "language_loss": 0.59567684, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.67244786, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10687256, + "step": 10944, + "time_per_iteration": 2.593003273010254 + }, + { + "auxiliary_loss_clip": 0.06423099, + "auxiliary_loss_mlp": 0.01269429, + "balance_loss_clip": 0.06277057, + "balance_loss_mlp": 0.01257555, + "epoch": 0.6580490004509244, + "flos": 25053627744000.0, + "grad_norm": 1.4789836122868327, + "language_loss": 0.72602201, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.80294728, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11871338, + "step": 10945, + "time_per_iteration": 2.53983998298645 + }, + { + "auxiliary_loss_clip": 0.06410586, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01256321, + "epoch": 0.6581091237035923, + "flos": 43519040363520.0, + "grad_norm": 1.838349836001675, + "language_loss": 0.70316982, + "learning_rate": 1.105735316926046e-06, + "loss": 0.77994007, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10946, + "time_per_iteration": 2.798476219177246 + }, + { + "auxiliary_loss_clip": 0.06410632, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06272982, + "balance_loss_mlp": 0.01255514, + "epoch": 0.6581692469562603, + "flos": 22421352055680.0, + "grad_norm": 1.8876327732241813, + "language_loss": 0.82383513, + "learning_rate": 1.105386972944934e-06, + "loss": 0.90060103, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10443115, + "step": 10947, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06414369, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.0627495, + "balance_loss_mlp": 0.01253447, + "epoch": 0.6582293702089284, + "flos": 24866098306560.0, + "grad_norm": 1.5151980350674914, + "language_loss": 0.77415752, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.85094017, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10449219, + "step": 10948, + "time_per_iteration": 2.543790578842163 + }, + { + "auxiliary_loss_clip": 0.06411085, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6582894934615963, + "flos": 23046399187200.0, + "grad_norm": 1.478986900014917, + "language_loss": 0.79121858, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.86798447, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10223389, + "step": 10949, + "time_per_iteration": 2.535895824432373 + }, + { + "auxiliary_loss_clip": 0.06312477, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06256588, + "balance_loss_mlp": 0.01263514, + "epoch": 0.6583496167142643, + "flos": 72573274569600.0, + "grad_norm": 0.7232821189613112, + "language_loss": 0.61788374, + "learning_rate": 1.104342144597323e-06, + "loss": 0.69365644, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01284027, + "step": 10950, + "time_per_iteration": 4.580410957336426 + }, + { + "auxiliary_loss_clip": 0.06408125, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258778, + "epoch": 0.6584097399669322, + "flos": 13083867815040.0, + "grad_norm": 2.2244546266186354, + "language_loss": 0.6719563, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.74872345, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09820557, + "step": 10951, + "time_per_iteration": 2.510561466217041 + }, + { + "auxiliary_loss_clip": 0.06409305, + "auxiliary_loss_mlp": 0.01270102, + "balance_loss_clip": 0.06273426, + "balance_loss_mlp": 0.01259921, + "epoch": 0.6584698632196002, + "flos": 28700530922880.0, + "grad_norm": 1.3260041408046892, + "language_loss": 0.76428199, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.84107602, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10180664, + "step": 10952, + "time_per_iteration": 2.5918259620666504 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06275044, + "balance_loss_mlp": 0.01257954, + "epoch": 0.6585299864722681, + "flos": 14324486567040.0, + "grad_norm": 1.6835884668716123, + "language_loss": 0.73700249, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.81377816, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10083008, + "step": 10953, + "time_per_iteration": 2.5165388584136963 + }, + { + "auxiliary_loss_clip": 0.06410642, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01255427, + "epoch": 0.6585901097249361, + "flos": 26805291747840.0, + "grad_norm": 1.6924688741082035, + "language_loss": 0.79007798, + "learning_rate": 1.102949515683546e-06, + "loss": 0.86684537, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10681152, + "step": 10954, + "time_per_iteration": 2.564539909362793 + }, + { + "auxiliary_loss_clip": 0.06413999, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.06276879, + "balance_loss_mlp": 0.01257411, + "epoch": 0.658650232977604, + "flos": 18738921945600.0, + "grad_norm": 3.4725197474545215, + "language_loss": 0.69489324, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.77170783, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10058594, + "step": 10955, + "time_per_iteration": 2.495082139968872 + }, + { + "auxiliary_loss_clip": 0.06405246, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06274106, + "balance_loss_mlp": 0.01256398, + "epoch": 0.6587103562302721, + "flos": 24760272199680.0, + "grad_norm": 2.1168101225513056, + "language_loss": 0.81125724, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.88797009, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09637451, + "step": 10956, + "time_per_iteration": 2.636908531188965 + }, + { + "auxiliary_loss_clip": 0.06413392, + "auxiliary_loss_mlp": 0.0127424, + "balance_loss_clip": 0.06275264, + "balance_loss_mlp": 0.01262808, + "epoch": 0.65877047948294, + "flos": 22352688034560.0, + "grad_norm": 2.1582606979270462, + "language_loss": 0.81753582, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.89441204, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11431885, + "step": 10957, + "time_per_iteration": 2.6302380561828613 + }, + { + "auxiliary_loss_clip": 0.06405203, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01255513, + "epoch": 0.658830602735608, + "flos": 45189965358720.0, + "grad_norm": 1.6069945820528309, + "language_loss": 0.76651394, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.8432132, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09197998, + "step": 10958, + "time_per_iteration": 2.7235934734344482 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01266973, + "balance_loss_clip": 0.0627449, + "balance_loss_mlp": 0.01256811, + "epoch": 0.6588907259882759, + "flos": 19907774075520.0, + "grad_norm": 1.6704982273704214, + "language_loss": 0.75102574, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.82778907, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10168457, + "step": 10959, + "time_per_iteration": 2.4919495582580566 + }, + { + "auxiliary_loss_clip": 0.06411363, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01253609, + "epoch": 0.6589508492409439, + "flos": 24140591729280.0, + "grad_norm": 1.5345825682480954, + "language_loss": 0.65334243, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.73008978, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09759521, + "step": 10960, + "time_per_iteration": 2.539113998413086 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256305, + "epoch": 0.659010972493612, + "flos": 18228715234560.0, + "grad_norm": 1.960089741542263, + "language_loss": 0.81517863, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.89202076, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.1038208, + "step": 10961, + "time_per_iteration": 3.8582499027252197 + }, + { + "auxiliary_loss_clip": 0.0641351, + "auxiliary_loss_mlp": 0.01267598, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01257292, + "epoch": 0.6590710957462799, + "flos": 27607428483840.0, + "grad_norm": 1.7237322524813996, + "language_loss": 0.736247, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.81305802, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10302734, + "step": 10962, + "time_per_iteration": 2.542795419692993 + }, + { + "auxiliary_loss_clip": 0.06414889, + "auxiliary_loss_mlp": 0.01268579, + "balance_loss_clip": 0.06274842, + "balance_loss_mlp": 0.01257522, + "epoch": 0.6591312189989479, + "flos": 20309177859840.0, + "grad_norm": 1.8258870034084347, + "language_loss": 0.80250466, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.87933934, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11071777, + "step": 10963, + "time_per_iteration": 2.484524965286255 + }, + { + "auxiliary_loss_clip": 0.06407138, + "auxiliary_loss_mlp": 0.01266706, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.0125696, + "epoch": 0.6591913422516158, + "flos": 12317886915840.0, + "grad_norm": 1.5886018528393113, + "language_loss": 0.78204167, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.85878009, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09753418, + "step": 10964, + "time_per_iteration": 4.032490015029907 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6592514655042838, + "flos": 25891626827520.0, + "grad_norm": 1.653857660787362, + "language_loss": 0.7398777, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.81667888, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.09539795, + "step": 10965, + "time_per_iteration": 2.558753490447998 + }, + { + "auxiliary_loss_clip": 0.06415711, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.0627279, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6593115887569517, + "flos": 14068754305920.0, + "grad_norm": 2.292623636057082, + "language_loss": 0.74313521, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.81995344, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1083374, + "step": 10966, + "time_per_iteration": 2.4695546627044678 + }, + { + "auxiliary_loss_clip": 0.06410235, + "auxiliary_loss_mlp": 0.01265948, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.01255273, + "epoch": 0.6593717120096197, + "flos": 24724912976640.0, + "grad_norm": 1.5343869413599147, + "language_loss": 0.77172506, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.8484869, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10662842, + "step": 10967, + "time_per_iteration": 2.554844856262207 + }, + { + "auxiliary_loss_clip": 0.0630592, + "auxiliary_loss_mlp": 0.01258736, + "balance_loss_clip": 0.06250164, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6594318352622877, + "flos": 55577951907840.0, + "grad_norm": 0.6831964979389027, + "language_loss": 0.48237032, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.5580169, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01243591, + "step": 10968, + "time_per_iteration": 3.1279184818267822 + }, + { + "auxiliary_loss_clip": 0.06412826, + "auxiliary_loss_mlp": 0.01261785, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01251569, + "epoch": 0.6594919585149557, + "flos": 17462650481280.0, + "grad_norm": 1.6973549586156937, + "language_loss": 0.79805654, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.87480259, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10211182, + "step": 10969, + "time_per_iteration": 3.929111957550049 + }, + { + "auxiliary_loss_clip": 0.0641497, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.06276352, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6595520817676236, + "flos": 18229092577920.0, + "grad_norm": 1.9822858612354273, + "language_loss": 0.65968251, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.73648757, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10296631, + "step": 10970, + "time_per_iteration": 2.534639835357666 + }, + { + "auxiliary_loss_clip": 0.06411758, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01253368, + "epoch": 0.6596122050202916, + "flos": 22206219897600.0, + "grad_norm": 1.4827049257585125, + "language_loss": 0.76440203, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.84115398, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10064697, + "step": 10971, + "time_per_iteration": 2.518568515777588 + }, + { + "auxiliary_loss_clip": 0.06414073, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 0.06274455, + "balance_loss_mlp": 0.01253101, + "epoch": 0.6596723282729595, + "flos": 14179108533120.0, + "grad_norm": 2.58028286016492, + "language_loss": 0.70073628, + "learning_rate": 1.096689432978629e-06, + "loss": 0.77751178, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10388184, + "step": 10972, + "time_per_iteration": 2.5301804542541504 + }, + { + "auxiliary_loss_clip": 0.06411418, + "auxiliary_loss_mlp": 0.01263284, + "balance_loss_clip": 0.0627436, + "balance_loss_mlp": 0.01252931, + "epoch": 0.6597324515256275, + "flos": 30560746291200.0, + "grad_norm": 1.6494264278825825, + "language_loss": 0.55793309, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.63468015, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10351562, + "step": 10973, + "time_per_iteration": 2.5836968421936035 + }, + { + "auxiliary_loss_clip": 0.06425004, + "auxiliary_loss_mlp": 0.01265958, + "balance_loss_clip": 0.06279783, + "balance_loss_mlp": 0.0125579, + "epoch": 0.6597925747782956, + "flos": 17645693725440.0, + "grad_norm": 2.424477152178303, + "language_loss": 0.78669357, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.86360323, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10174561, + "step": 10974, + "time_per_iteration": 2.5438265800476074 + }, + { + "auxiliary_loss_clip": 0.06416789, + "auxiliary_loss_mlp": 0.01266385, + "balance_loss_clip": 0.06276938, + "balance_loss_mlp": 0.01255567, + "epoch": 0.6598526980309635, + "flos": 22825523024640.0, + "grad_norm": 2.75247163208804, + "language_loss": 0.69161505, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.7684468, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10821533, + "step": 10975, + "time_per_iteration": 2.517643690109253 + }, + { + "auxiliary_loss_clip": 0.06413519, + "auxiliary_loss_mlp": 0.01263226, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01252766, + "epoch": 0.6599128212836315, + "flos": 21074194437120.0, + "grad_norm": 1.6033931639433516, + "language_loss": 0.70794642, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.78471386, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10455322, + "step": 10976, + "time_per_iteration": 2.5318117141723633 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01257065, + "epoch": 0.6599729445362994, + "flos": 22170022133760.0, + "grad_norm": 1.5758270650588126, + "language_loss": 0.67691094, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.75369084, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10668945, + "step": 10977, + "time_per_iteration": 2.485891342163086 + }, + { + "auxiliary_loss_clip": 0.06420588, + "auxiliary_loss_mlp": 0.01267585, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6600330677889674, + "flos": 18155900436480.0, + "grad_norm": 2.2117923844530694, + "language_loss": 0.81200063, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.8888824, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11157227, + "step": 10978, + "time_per_iteration": 2.5422048568725586 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255461, + "epoch": 0.6600931910416353, + "flos": 18155942363520.0, + "grad_norm": 2.6619753374489767, + "language_loss": 0.67523986, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.75204611, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11096191, + "step": 10979, + "time_per_iteration": 2.5064504146575928 + }, + { + "auxiliary_loss_clip": 0.06413005, + "auxiliary_loss_mlp": 0.01265818, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6601533142943034, + "flos": 17426494644480.0, + "grad_norm": 2.8604366894108324, + "language_loss": 0.73473299, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.81152123, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10540771, + "step": 10980, + "time_per_iteration": 2.5004913806915283 + }, + { + "auxiliary_loss_clip": 0.06408733, + "auxiliary_loss_mlp": 0.01271257, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01261441, + "epoch": 0.6602134375469713, + "flos": 28226983173120.0, + "grad_norm": 1.584002725324806, + "language_loss": 0.72518432, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.80198425, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09814453, + "step": 10981, + "time_per_iteration": 2.552730083465576 + }, + { + "auxiliary_loss_clip": 0.0641138, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6602735607996393, + "flos": 29424737761920.0, + "grad_norm": 1.8532747935564327, + "language_loss": 0.69432831, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.77110291, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.09960938, + "step": 10982, + "time_per_iteration": 2.591977834701538 + }, + { + "auxiliary_loss_clip": 0.06413966, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06277901, + "balance_loss_mlp": 0.01254148, + "epoch": 0.6603336840523072, + "flos": 18593963182080.0, + "grad_norm": 1.4024673840301536, + "language_loss": 0.69806457, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.77485329, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10760498, + "step": 10983, + "time_per_iteration": 2.483527660369873 + }, + { + "auxiliary_loss_clip": 0.06413279, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6603938073049752, + "flos": 33263153447040.0, + "grad_norm": 1.5623815208568963, + "language_loss": 0.70765328, + "learning_rate": 1.092522205413239e-06, + "loss": 0.78446013, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10984, + "time_per_iteration": 2.6334474086761475 + }, + { + "auxiliary_loss_clip": 0.06408207, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06274273, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6604539305576431, + "flos": 17390045318400.0, + "grad_norm": 1.8218342593599246, + "language_loss": 0.84316599, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.9199127, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10394287, + "step": 10985, + "time_per_iteration": 2.4621846675872803 + }, + { + "auxiliary_loss_clip": 0.06415112, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275454, + "balance_loss_mlp": 0.01256779, + "epoch": 0.6605140538103111, + "flos": 21257447316480.0, + "grad_norm": 1.9945336241456124, + "language_loss": 0.74090636, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.81773293, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10778809, + "step": 10986, + "time_per_iteration": 2.5241971015930176 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01269603, + "balance_loss_clip": 0.06278964, + "balance_loss_mlp": 0.01259673, + "epoch": 0.6605741770629792, + "flos": 13886885018880.0, + "grad_norm": 1.8900199688101529, + "language_loss": 0.79989499, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.8767364, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09936523, + "step": 10987, + "time_per_iteration": 2.467759132385254 + }, + { + "auxiliary_loss_clip": 0.06315437, + "auxiliary_loss_mlp": 0.01250965, + "balance_loss_clip": 0.06259646, + "balance_loss_mlp": 0.0124932, + "epoch": 0.6606343003156471, + "flos": 69338885840640.0, + "grad_norm": 0.958585987636571, + "language_loss": 0.5413903, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.61705434, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.55566406, + "router_z_loss_mlp": 0.01647949, + "step": 10988, + "time_per_iteration": 3.2449100017547607 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6606944235683151, + "flos": 27279887673600.0, + "grad_norm": 1.4331259688792952, + "language_loss": 0.77265781, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.8494395, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10989, + "time_per_iteration": 2.565397262573242 + }, + { + "auxiliary_loss_clip": 0.06413271, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06277858, + "balance_loss_mlp": 0.01257796, + "epoch": 0.660754546820983, + "flos": 13778082092160.0, + "grad_norm": 1.981088082283497, + "language_loss": 0.77234143, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.84915674, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10461426, + "step": 10990, + "time_per_iteration": 3.8957126140594482 + }, + { + "auxiliary_loss_clip": 0.06422216, + "auxiliary_loss_mlp": 0.01267426, + "balance_loss_clip": 0.06283079, + "balance_loss_mlp": 0.0125693, + "epoch": 0.660814670073651, + "flos": 15710567207040.0, + "grad_norm": 2.3076268356000864, + "language_loss": 0.60737276, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.68426919, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10491943, + "step": 10991, + "time_per_iteration": 2.528184175491333 + }, + { + "auxiliary_loss_clip": 0.0641991, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280324, + "balance_loss_mlp": 0.012599, + "epoch": 0.6608747933263189, + "flos": 20856295094400.0, + "grad_norm": 2.771721604026619, + "language_loss": 0.67745811, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.75436699, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11083984, + "step": 10992, + "time_per_iteration": 2.5081818103790283 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06279188, + "balance_loss_mlp": 0.01254588, + "epoch": 0.660934916578987, + "flos": 20638521532800.0, + "grad_norm": 1.8747370045388403, + "language_loss": 0.87962919, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.95648551, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11053467, + "step": 10993, + "time_per_iteration": 2.5521185398101807 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6609950398316549, + "flos": 25119692288640.0, + "grad_norm": 1.7537930651875573, + "language_loss": 0.67272747, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.74965656, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11737061, + "step": 10994, + "time_per_iteration": 2.6144933700561523 + }, + { + "auxiliary_loss_clip": 0.06417718, + "auxiliary_loss_mlp": 0.01264904, + "balance_loss_clip": 0.06279863, + "balance_loss_mlp": 0.01253812, + "epoch": 0.6610551630843229, + "flos": 18667155323520.0, + "grad_norm": 1.5859648112701323, + "language_loss": 0.77035165, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.84717792, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11096191, + "step": 10995, + "time_per_iteration": 2.5111653804779053 + }, + { + "auxiliary_loss_clip": 0.06421737, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06282931, + "balance_loss_mlp": 0.01265868, + "epoch": 0.6611152863369908, + "flos": 23264885508480.0, + "grad_norm": 1.7748442712796604, + "language_loss": 0.74969876, + "learning_rate": 1.088359933123053e-06, + "loss": 0.82667613, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10996, + "time_per_iteration": 2.5098516941070557 + }, + { + "auxiliary_loss_clip": 0.06418104, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06280057, + "balance_loss_mlp": 0.0125562, + "epoch": 0.6611754095896588, + "flos": 22165577867520.0, + "grad_norm": 1.6113039426712623, + "language_loss": 0.69186199, + "learning_rate": 1.088013301487126e-06, + "loss": 0.76870203, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10296631, + "step": 10997, + "time_per_iteration": 2.525808095932007 + }, + { + "auxiliary_loss_clip": 0.06421575, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06279309, + "balance_loss_mlp": 0.01254467, + "epoch": 0.6612355328423267, + "flos": 13996442632320.0, + "grad_norm": 1.959031062109239, + "language_loss": 0.68880165, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.76566797, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10601807, + "step": 10998, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.06313896, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.06257924, + "balance_loss_mlp": 0.01251496, + "epoch": 0.6612956560949947, + "flos": 61472051337600.0, + "grad_norm": 0.641819710963161, + "language_loss": 0.50997436, + "learning_rate": 1.087320141976297e-06, + "loss": 0.58564192, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01367188, + "step": 10999, + "time_per_iteration": 3.1182916164398193 + }, + { + "auxiliary_loss_clip": 0.06424031, + "auxiliary_loss_mlp": 0.01268354, + "balance_loss_clip": 0.06280085, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6613557793476627, + "flos": 21623114534400.0, + "grad_norm": 2.559990275838241, + "language_loss": 0.70366681, + "learning_rate": 1.086973614127679e-06, + "loss": 0.78059065, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10797119, + "step": 11000, + "time_per_iteration": 3.9581432342529297 + }, + { + "auxiliary_loss_clip": 0.06411293, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6614159026003307, + "flos": 34028379659520.0, + "grad_norm": 1.6165930596704574, + "language_loss": 0.65563923, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.73239553, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.0980835, + "step": 11001, + "time_per_iteration": 2.6200945377349854 + }, + { + "auxiliary_loss_clip": 0.06414855, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.01254207, + "epoch": 0.6614760258529987, + "flos": 24104100476160.0, + "grad_norm": 1.733561890110771, + "language_loss": 0.73266578, + "learning_rate": 1.086280662309739e-06, + "loss": 0.80945766, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10137939, + "step": 11002, + "time_per_iteration": 2.5620791912078857 + }, + { + "auxiliary_loss_clip": 0.06415205, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06279428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6615361491056666, + "flos": 14909227084800.0, + "grad_norm": 2.451590701969631, + "language_loss": 0.79098624, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.86779916, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10705566, + "step": 11003, + "time_per_iteration": 2.481431007385254 + }, + { + "auxiliary_loss_clip": 0.06419842, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06279956, + "balance_loss_mlp": 0.01256449, + "epoch": 0.6615962723583346, + "flos": 15310337379840.0, + "grad_norm": 2.101443479539304, + "language_loss": 0.69193184, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.76880944, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11468506, + "step": 11004, + "time_per_iteration": 4.006279945373535 + }, + { + "auxiliary_loss_clip": 0.06422809, + "auxiliary_loss_mlp": 0.0127206, + "balance_loss_clip": 0.06279877, + "balance_loss_mlp": 0.01260741, + "epoch": 0.6616563956110025, + "flos": 18738293040000.0, + "grad_norm": 2.056452219231189, + "language_loss": 0.70325673, + "learning_rate": 1.085241494478132e-06, + "loss": 0.78020537, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11328125, + "step": 11005, + "time_per_iteration": 2.4944448471069336 + }, + { + "auxiliary_loss_clip": 0.06413882, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277984, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6617165188636706, + "flos": 24501353483520.0, + "grad_norm": 1.5254702956902315, + "language_loss": 0.78776741, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.86457157, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10510254, + "step": 11006, + "time_per_iteration": 2.5451557636260986 + }, + { + "auxiliary_loss_clip": 0.06416766, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06280621, + "balance_loss_mlp": 0.01256649, + "epoch": 0.6617766421163385, + "flos": 22385741270400.0, + "grad_norm": 1.834529140929997, + "language_loss": 0.76486355, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.84170276, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1050415, + "step": 11007, + "time_per_iteration": 2.5298049449920654 + }, + { + "auxiliary_loss_clip": 0.0641939, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01255185, + "epoch": 0.6618367653690065, + "flos": 20856756291840.0, + "grad_norm": 1.4555215695175368, + "language_loss": 0.78606236, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.86291116, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10302734, + "step": 11008, + "time_per_iteration": 4.0146424770355225 + }, + { + "auxiliary_loss_clip": 0.06420049, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06275912, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6618968886216744, + "flos": 17718089253120.0, + "grad_norm": 1.6552311812920846, + "language_loss": 0.82077724, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.89762884, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11114502, + "step": 11009, + "time_per_iteration": 2.532111883163452 + }, + { + "auxiliary_loss_clip": 0.06314184, + "auxiliary_loss_mlp": 0.01255522, + "balance_loss_clip": 0.06257774, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6619570118743424, + "flos": 67054500305280.0, + "grad_norm": 0.9881156540659067, + "language_loss": 0.67673898, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.75243598, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01517487, + "step": 11010, + "time_per_iteration": 3.0648674964904785 + }, + { + "auxiliary_loss_clip": 0.06415196, + "auxiliary_loss_mlp": 0.01266404, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01254745, + "epoch": 0.6620171351270103, + "flos": 18666819907200.0, + "grad_norm": 1.5625294645604648, + "language_loss": 0.71682811, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.79364407, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11657715, + "step": 11011, + "time_per_iteration": 2.527869939804077 + }, + { + "auxiliary_loss_clip": 0.06418953, + "auxiliary_loss_mlp": 0.0126958, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01259548, + "epoch": 0.6620772583796783, + "flos": 24177376471680.0, + "grad_norm": 1.61722758281003, + "language_loss": 0.72627336, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.80315864, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10040283, + "step": 11012, + "time_per_iteration": 2.53691029548645 + }, + { + "auxiliary_loss_clip": 0.0640786, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01260903, + "epoch": 0.6621373816323463, + "flos": 23630385018240.0, + "grad_norm": 1.5542286383883441, + "language_loss": 0.79656094, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8733412, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09265137, + "step": 11013, + "time_per_iteration": 2.5782439708709717 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.062774, + "balance_loss_mlp": 0.01254973, + "epoch": 0.6621975048850143, + "flos": 18448123950720.0, + "grad_norm": 1.9713400088604554, + "language_loss": 0.70423663, + "learning_rate": 1.082125865538971e-06, + "loss": 0.78102177, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10266113, + "step": 11014, + "time_per_iteration": 2.474597454071045 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.01265368, + "balance_loss_clip": 0.06278192, + "balance_loss_mlp": 0.01256475, + "epoch": 0.6622576281376823, + "flos": 14069047795200.0, + "grad_norm": 1.5898800545059366, + "language_loss": 0.77497208, + "learning_rate": 1.081779858400137e-06, + "loss": 0.85174346, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08886719, + "step": 11015, + "time_per_iteration": 2.5123109817504883 + }, + { + "auxiliary_loss_clip": 0.06413803, + "auxiliary_loss_mlp": 0.01267289, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01256191, + "epoch": 0.6623177513903502, + "flos": 17024587735680.0, + "grad_norm": 1.7138462778054382, + "language_loss": 0.82368481, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.90049571, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11102295, + "step": 11016, + "time_per_iteration": 2.477137565612793 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01253175, + "epoch": 0.6623778746430182, + "flos": 17276127292800.0, + "grad_norm": 2.159067097867079, + "language_loss": 0.70195687, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.77878135, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10583496, + "step": 11017, + "time_per_iteration": 2.5194361209869385 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.0627765, + "balance_loss_mlp": 0.01257101, + "epoch": 0.6624379978956861, + "flos": 48802725198720.0, + "grad_norm": 1.7089146920832974, + "language_loss": 0.77715868, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.85397768, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1036377, + "step": 11018, + "time_per_iteration": 2.7684452533721924 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.0126262, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.01252714, + "epoch": 0.6624981211483542, + "flos": 18958330661760.0, + "grad_norm": 1.809730512167174, + "language_loss": 0.83465689, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.91142356, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09899902, + "step": 11019, + "time_per_iteration": 2.5207102298736572 + }, + { + "auxiliary_loss_clip": 0.06410275, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.0627672, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6625582444010221, + "flos": 23262998791680.0, + "grad_norm": 1.565039350749023, + "language_loss": 0.72290635, + "learning_rate": 1.080050345253328e-06, + "loss": 0.79964089, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09747314, + "step": 11020, + "time_per_iteration": 2.52868914604187 + }, + { + "auxiliary_loss_clip": 0.06419435, + "auxiliary_loss_mlp": 0.01268652, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01257601, + "epoch": 0.6626183676536901, + "flos": 21400770925440.0, + "grad_norm": 3.661943544447812, + "language_loss": 0.72194296, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.79882383, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11047363, + "step": 11021, + "time_per_iteration": 2.5214977264404297 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.06279403, + "balance_loss_mlp": 0.0125891, + "epoch": 0.662678490906358, + "flos": 14575984197120.0, + "grad_norm": 4.221661740882693, + "language_loss": 0.83307576, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.90993994, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10467529, + "step": 11022, + "time_per_iteration": 2.495877981185913 + }, + { + "auxiliary_loss_clip": 0.0642494, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06277288, + "balance_loss_mlp": 0.0125513, + "epoch": 0.662738614159026, + "flos": 15996962862720.0, + "grad_norm": 2.5511625457855116, + "language_loss": 0.73115802, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.80807984, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12115479, + "step": 11023, + "time_per_iteration": 2.475238800048828 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.01270086, + "balance_loss_clip": 0.06276564, + "balance_loss_mlp": 0.01259327, + "epoch": 0.6627987374116939, + "flos": 19542358419840.0, + "grad_norm": 1.582084315278466, + "language_loss": 0.75136846, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.82820219, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10748291, + "step": 11024, + "time_per_iteration": 2.5104072093963623 + }, + { + "auxiliary_loss_clip": 0.06414796, + "auxiliary_loss_mlp": 0.01267042, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01256635, + "epoch": 0.662858860664362, + "flos": 15707800022400.0, + "grad_norm": 3.5687971531497236, + "language_loss": 0.70028591, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.77710426, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10406494, + "step": 11025, + "time_per_iteration": 2.528007745742798 + }, + { + "auxiliary_loss_clip": 0.06416678, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01255513, + "epoch": 0.6629189839170299, + "flos": 20160026392320.0, + "grad_norm": 1.3776452398710215, + "language_loss": 0.78906387, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8658914, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10565186, + "step": 11026, + "time_per_iteration": 2.5116465091705322 + }, + { + "auxiliary_loss_clip": 0.06413042, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01256759, + "epoch": 0.6629791071696979, + "flos": 20920430995200.0, + "grad_norm": 1.672126176860425, + "language_loss": 0.76636124, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.84316075, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1015625, + "step": 11027, + "time_per_iteration": 2.496917486190796 + }, + { + "auxiliary_loss_clip": 0.06414916, + "auxiliary_loss_mlp": 0.01265895, + "balance_loss_clip": 0.0627641, + "balance_loss_mlp": 0.01254708, + "epoch": 0.6630392304223659, + "flos": 20852647441920.0, + "grad_norm": 2.0836235208298115, + "language_loss": 0.70842957, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.78523767, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11193848, + "step": 11028, + "time_per_iteration": 2.5055668354034424 + }, + { + "auxiliary_loss_clip": 0.06413043, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6630993536750338, + "flos": 21002092398720.0, + "grad_norm": 1.9464575885295123, + "language_loss": 0.79627401, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.87305164, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09979248, + "step": 11029, + "time_per_iteration": 4.029799461364746 + }, + { + "auxiliary_loss_clip": 0.06414881, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255755, + "epoch": 0.6631594769277018, + "flos": 18264787217280.0, + "grad_norm": 2.0842184585841994, + "language_loss": 0.76459014, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.84141254, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1159668, + "step": 11030, + "time_per_iteration": 2.499678611755371 + }, + { + "auxiliary_loss_clip": 0.06420542, + "auxiliary_loss_mlp": 0.01269601, + "balance_loss_clip": 0.06277149, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6632196001803697, + "flos": 17826053639040.0, + "grad_norm": 2.267864257363868, + "language_loss": 0.75185478, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.82875621, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11169434, + "step": 11031, + "time_per_iteration": 2.470355272293091 + }, + { + "auxiliary_loss_clip": 0.06414694, + "auxiliary_loss_mlp": 0.01264566, + "balance_loss_clip": 0.06273525, + "balance_loss_mlp": 0.0125342, + "epoch": 0.6632797234330378, + "flos": 12673910914560.0, + "grad_norm": 2.431299325405645, + "language_loss": 0.74500775, + "learning_rate": 1.075903075048228e-06, + "loss": 0.82180035, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11151123, + "step": 11032, + "time_per_iteration": 2.485921859741211 + }, + { + "auxiliary_loss_clip": 0.06407184, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01256296, + "epoch": 0.6633398466857057, + "flos": 23591168507520.0, + "grad_norm": 1.735276154326279, + "language_loss": 0.80570471, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.88244164, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10217285, + "step": 11033, + "time_per_iteration": 2.5526669025421143 + }, + { + "auxiliary_loss_clip": 0.0641445, + "auxiliary_loss_mlp": 0.01269108, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257497, + "epoch": 0.6633999699383737, + "flos": 20638018408320.0, + "grad_norm": 1.5867971062319928, + "language_loss": 0.80710161, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.88393718, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11608887, + "step": 11034, + "time_per_iteration": 2.5465288162231445 + }, + { + "auxiliary_loss_clip": 0.06408665, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01256569, + "epoch": 0.6634600931910416, + "flos": 21803264812800.0, + "grad_norm": 1.6372739814417405, + "language_loss": 0.76400816, + "learning_rate": 1.074867045054166e-06, + "loss": 0.84075904, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09851074, + "step": 11035, + "time_per_iteration": 2.5024783611297607 + }, + { + "auxiliary_loss_clip": 0.06416409, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.01254648, + "epoch": 0.6635202164437096, + "flos": 18738628456320.0, + "grad_norm": 1.632864185122063, + "language_loss": 0.8277241, + "learning_rate": 1.074521771867622e-06, + "loss": 0.90453947, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10491943, + "step": 11036, + "time_per_iteration": 2.5380334854125977 + }, + { + "auxiliary_loss_clip": 0.06308148, + "auxiliary_loss_mlp": 0.01254977, + "balance_loss_clip": 0.06252232, + "balance_loss_mlp": 0.0125369, + "epoch": 0.6635803396963775, + "flos": 60242501324160.0, + "grad_norm": 0.7586749678323187, + "language_loss": 0.5225606, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.59819186, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01287842, + "step": 11037, + "time_per_iteration": 3.1442580223083496 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06273833, + "balance_loss_mlp": 0.01255443, + "epoch": 0.6636404629490456, + "flos": 29174414088960.0, + "grad_norm": 1.6208815133420311, + "language_loss": 0.79116094, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.86795002, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11407471, + "step": 11038, + "time_per_iteration": 2.5753371715545654 + }, + { + "auxiliary_loss_clip": 0.06411879, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253542, + "epoch": 0.6637005862017135, + "flos": 38916530496000.0, + "grad_norm": 2.008253443704211, + "language_loss": 0.6435625, + "learning_rate": 1.073486162925716e-06, + "loss": 0.72032923, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11248779, + "step": 11039, + "time_per_iteration": 2.6589627265930176 + }, + { + "auxiliary_loss_clip": 0.06414853, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01254613, + "epoch": 0.6637607094543815, + "flos": 22789870312320.0, + "grad_norm": 2.5741405662525856, + "language_loss": 0.64139444, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.71819365, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10455322, + "step": 11040, + "time_per_iteration": 3.924652338027954 + }, + { + "auxiliary_loss_clip": 0.06410997, + "auxiliary_loss_mlp": 0.01267386, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01257909, + "epoch": 0.6638208327070495, + "flos": 18119996161920.0, + "grad_norm": 1.923413934429174, + "language_loss": 0.72439963, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.80118346, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 11041, + "time_per_iteration": 2.5356383323669434 + }, + { + "auxiliary_loss_clip": 0.06416036, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06278844, + "balance_loss_mlp": 0.01255415, + "epoch": 0.6638809559597174, + "flos": 29432703899520.0, + "grad_norm": 2.049859271676146, + "language_loss": 0.61855423, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.69537336, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10473633, + "step": 11042, + "time_per_iteration": 2.664304256439209 + }, + { + "auxiliary_loss_clip": 0.06417962, + "auxiliary_loss_mlp": 0.012679, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256444, + "epoch": 0.6639410792123854, + "flos": 28079928057600.0, + "grad_norm": 1.8233607330526647, + "language_loss": 0.69058919, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.76744783, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11462402, + "step": 11043, + "time_per_iteration": 4.0889365673065186 + }, + { + "auxiliary_loss_clip": 0.06404908, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06272525, + "balance_loss_mlp": 0.01259818, + "epoch": 0.6640012024650533, + "flos": 25563373257600.0, + "grad_norm": 1.464057970327077, + "language_loss": 0.83693618, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.91367632, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09283447, + "step": 11044, + "time_per_iteration": 2.5765178203582764 + }, + { + "auxiliary_loss_clip": 0.0640911, + "auxiliary_loss_mlp": 0.01263885, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01253234, + "epoch": 0.6640613257177214, + "flos": 14872316561280.0, + "grad_norm": 2.273920138408825, + "language_loss": 0.69855309, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.77528304, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10650635, + "step": 11045, + "time_per_iteration": 2.475839376449585 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01268834, + "balance_loss_clip": 0.06275514, + "balance_loss_mlp": 0.01258349, + "epoch": 0.6641214489703893, + "flos": 23227681495680.0, + "grad_norm": 1.3157905928087725, + "language_loss": 0.64253563, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.71937156, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10479736, + "step": 11046, + "time_per_iteration": 2.550718307495117 + }, + { + "auxiliary_loss_clip": 0.06412549, + "auxiliary_loss_mlp": 0.01265992, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255859, + "epoch": 0.6641815722230573, + "flos": 37751661434880.0, + "grad_norm": 1.3902156312209348, + "language_loss": 0.71747851, + "learning_rate": 1.070726085914088e-06, + "loss": 0.79426396, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10137939, + "step": 11047, + "time_per_iteration": 2.6542744636535645 + }, + { + "auxiliary_loss_clip": 0.06412829, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01255226, + "epoch": 0.6642416954757252, + "flos": 17936910990720.0, + "grad_norm": 1.7027644321315345, + "language_loss": 0.77464539, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.8514322, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10632324, + "step": 11048, + "time_per_iteration": 3.896479606628418 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01254354, + "balance_loss_clip": 0.06255244, + "balance_loss_mlp": 0.01253094, + "epoch": 0.6643018187283932, + "flos": 52010712362880.0, + "grad_norm": 0.7347657101869507, + "language_loss": 0.55013496, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.62579298, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01259613, + "step": 11049, + "time_per_iteration": 3.139099359512329 + }, + { + "auxiliary_loss_clip": 0.06414302, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06277852, + "balance_loss_mlp": 0.01258189, + "epoch": 0.6643619419810611, + "flos": 30234463292160.0, + "grad_norm": 1.5235184894534042, + "language_loss": 0.64387465, + "learning_rate": 1.069691638104648e-06, + "loss": 0.72070134, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10174561, + "step": 11050, + "time_per_iteration": 2.5815443992614746 + }, + { + "auxiliary_loss_clip": 0.06413838, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01254948, + "epoch": 0.6644220652337292, + "flos": 22972745848320.0, + "grad_norm": 1.9836199726179196, + "language_loss": 0.7914626, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.86825073, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 11051, + "time_per_iteration": 2.554255247116089 + }, + { + "auxiliary_loss_clip": 0.06413689, + "auxiliary_loss_mlp": 0.01267197, + "balance_loss_clip": 0.06275009, + "balance_loss_mlp": 0.01256778, + "epoch": 0.6644821884863971, + "flos": 21148602462720.0, + "grad_norm": 1.572752749022216, + "language_loss": 0.85833442, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.93514335, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10418701, + "step": 11052, + "time_per_iteration": 2.526331663131714 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01255108, + "epoch": 0.6645423117390651, + "flos": 20198907486720.0, + "grad_norm": 2.2521915942040134, + "language_loss": 0.75079048, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.82767153, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10864258, + "step": 11053, + "time_per_iteration": 2.495643377304077 + }, + { + "auxiliary_loss_clip": 0.06411796, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06275838, + "balance_loss_mlp": 0.01257659, + "epoch": 0.6646024349917331, + "flos": 24358700707200.0, + "grad_norm": 1.4285282050820745, + "language_loss": 0.79548883, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.87228477, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 11054, + "time_per_iteration": 2.533238649368286 + }, + { + "auxiliary_loss_clip": 0.06410603, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.01258848, + "epoch": 0.664662558244401, + "flos": 18812617211520.0, + "grad_norm": 1.7645551715374934, + "language_loss": 0.73951137, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.81630468, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09875488, + "step": 11055, + "time_per_iteration": 2.5263750553131104 + }, + { + "auxiliary_loss_clip": 0.0641698, + "auxiliary_loss_mlp": 0.01266606, + "balance_loss_clip": 0.06276543, + "balance_loss_mlp": 0.01255186, + "epoch": 0.664722681497069, + "flos": 18958749932160.0, + "grad_norm": 1.6799288466366076, + "language_loss": 0.72991651, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.80675244, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11431885, + "step": 11056, + "time_per_iteration": 2.4944491386413574 + }, + { + "auxiliary_loss_clip": 0.064121, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06275958, + "balance_loss_mlp": 0.01257508, + "epoch": 0.6647828047497369, + "flos": 19577046810240.0, + "grad_norm": 1.7319313014316244, + "language_loss": 0.69902766, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.77582735, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1036377, + "step": 11057, + "time_per_iteration": 2.5427403450012207 + }, + { + "auxiliary_loss_clip": 0.06416071, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01255298, + "epoch": 0.664842928002405, + "flos": 23156250289920.0, + "grad_norm": 1.6627595883052484, + "language_loss": 0.80624598, + "learning_rate": 1.066934663776291e-06, + "loss": 0.88306141, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10186768, + "step": 11058, + "time_per_iteration": 2.543358325958252 + }, + { + "auxiliary_loss_clip": 0.06310651, + "auxiliary_loss_mlp": 0.01251744, + "balance_loss_clip": 0.06254779, + "balance_loss_mlp": 0.01250295, + "epoch": 0.6649030512550729, + "flos": 65263326301440.0, + "grad_norm": 0.7825270857978761, + "language_loss": 0.6256783, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.70130229, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01447296, + "step": 11059, + "time_per_iteration": 3.081268548965454 + }, + { + "auxiliary_loss_clip": 0.0641288, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01255549, + "epoch": 0.6649631745077409, + "flos": 20201213473920.0, + "grad_norm": 1.6475331375538982, + "language_loss": 0.79008389, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.86687315, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1048584, + "step": 11060, + "time_per_iteration": 2.5021138191223145 + }, + { + "auxiliary_loss_clip": 0.06418125, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06280607, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6650232977604088, + "flos": 17244331868160.0, + "grad_norm": 2.2525334751718358, + "language_loss": 0.79225111, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.86911017, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10510254, + "step": 11061, + "time_per_iteration": 2.4997215270996094 + }, + { + "auxiliary_loss_clip": 0.06414805, + "auxiliary_loss_mlp": 0.01266652, + "balance_loss_clip": 0.06278637, + "balance_loss_mlp": 0.01256102, + "epoch": 0.6650834210130768, + "flos": 10010175217920.0, + "grad_norm": 1.965420807772364, + "language_loss": 0.57191408, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.64872867, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10552979, + "step": 11062, + "time_per_iteration": 2.457599401473999 + }, + { + "auxiliary_loss_clip": 0.06419773, + "auxiliary_loss_mlp": 0.01266686, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01254533, + "epoch": 0.6651435442657447, + "flos": 10456707225600.0, + "grad_norm": 2.498798138431811, + "language_loss": 0.76121116, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.83807576, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.121521, + "step": 11063, + "time_per_iteration": 2.5354268550872803 + }, + { + "auxiliary_loss_clip": 0.06417998, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.06278798, + "balance_loss_mlp": 0.0125724, + "epoch": 0.6652036675184128, + "flos": 22350465901440.0, + "grad_norm": 2.2315353157370836, + "language_loss": 0.708628, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.78547704, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09655762, + "step": 11064, + "time_per_iteration": 2.601271390914917 + }, + { + "auxiliary_loss_clip": 0.06307759, + "auxiliary_loss_mlp": 0.01252714, + "balance_loss_clip": 0.06251188, + "balance_loss_mlp": 0.01251267, + "epoch": 0.6652637907710807, + "flos": 52925467386240.0, + "grad_norm": 0.8269137521288277, + "language_loss": 0.62977844, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.70538318, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01445007, + "step": 11065, + "time_per_iteration": 3.088651180267334 + }, + { + "auxiliary_loss_clip": 0.06417314, + "auxiliary_loss_mlp": 0.01265582, + "balance_loss_clip": 0.06277956, + "balance_loss_mlp": 0.01255091, + "epoch": 0.6653239140237487, + "flos": 23110031963520.0, + "grad_norm": 1.7770048566161585, + "language_loss": 0.62216848, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.69899738, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10491943, + "step": 11066, + "time_per_iteration": 2.514662981033325 + }, + { + "auxiliary_loss_clip": 0.06416589, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06276087, + "balance_loss_mlp": 0.01256123, + "epoch": 0.6653840372764167, + "flos": 25966747612800.0, + "grad_norm": 1.500590710166923, + "language_loss": 0.70431817, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.78115141, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1060791, + "step": 11067, + "time_per_iteration": 2.629611015319824 + }, + { + "auxiliary_loss_clip": 0.06312129, + "auxiliary_loss_mlp": 0.01250999, + "balance_loss_clip": 0.06255849, + "balance_loss_mlp": 0.01249609, + "epoch": 0.6654441605290846, + "flos": 66059593251840.0, + "grad_norm": 0.8851345245048583, + "language_loss": 0.71944451, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.79507577, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01391602, + "step": 11068, + "time_per_iteration": 3.1097211837768555 + }, + { + "auxiliary_loss_clip": 0.06313328, + "auxiliary_loss_mlp": 0.01253328, + "balance_loss_clip": 0.0625675, + "balance_loss_mlp": 0.01251991, + "epoch": 0.6655042837817526, + "flos": 65218560693120.0, + "grad_norm": 0.7108385158391787, + "language_loss": 0.577793, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.65345955, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01338959, + "step": 11069, + "time_per_iteration": 4.7683820724487305 + }, + { + "auxiliary_loss_clip": 0.0631298, + "auxiliary_loss_mlp": 0.01252294, + "balance_loss_clip": 0.0625658, + "balance_loss_mlp": 0.01250911, + "epoch": 0.6655644070344205, + "flos": 69028759480320.0, + "grad_norm": 0.7328423376388431, + "language_loss": 0.63529485, + "learning_rate": 1.062803450204029e-06, + "loss": 0.71094757, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01386261, + "step": 11070, + "time_per_iteration": 3.218775749206543 + }, + { + "auxiliary_loss_clip": 0.06412843, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06274422, + "balance_loss_mlp": 0.01253668, + "epoch": 0.6656245302870886, + "flos": 36323680953600.0, + "grad_norm": 1.5647890242278204, + "language_loss": 0.58715665, + "learning_rate": 1.062459413096116e-06, + "loss": 0.66392684, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1050415, + "step": 11071, + "time_per_iteration": 2.6759583950042725 + }, + { + "auxiliary_loss_clip": 0.06415486, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.01256544, + "epoch": 0.6656846535397565, + "flos": 21800623409280.0, + "grad_norm": 1.6094882760656495, + "language_loss": 0.7278558, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.80467808, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10192871, + "step": 11072, + "time_per_iteration": 2.506439685821533 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.01266315, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01255729, + "epoch": 0.6657447767924245, + "flos": 37496683860480.0, + "grad_norm": 1.9931671493726393, + "language_loss": 0.70538545, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.78214943, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10583496, + "step": 11073, + "time_per_iteration": 2.687361240386963 + }, + { + "auxiliary_loss_clip": 0.06420862, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06278072, + "balance_loss_mlp": 0.01251353, + "epoch": 0.6658049000450924, + "flos": 16843473135360.0, + "grad_norm": 1.8042269767870909, + "language_loss": 0.5659616, + "learning_rate": 1.061427515134354e-06, + "loss": 0.64279079, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10699463, + "step": 11074, + "time_per_iteration": 2.476226568222046 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06278802, + "balance_loss_mlp": 0.01258417, + "epoch": 0.6658650232977604, + "flos": 33519430759680.0, + "grad_norm": 1.4700349170865334, + "language_loss": 0.72126347, + "learning_rate": 1.061083620311235e-06, + "loss": 0.79810607, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10321045, + "step": 11075, + "time_per_iteration": 2.655700922012329 + }, + { + "auxiliary_loss_clip": 0.06410009, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.06274687, + "balance_loss_mlp": 0.01254983, + "epoch": 0.6659251465504283, + "flos": 37715379816960.0, + "grad_norm": 1.432398272569416, + "language_loss": 0.66657937, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.7433266, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 11076, + "time_per_iteration": 2.66424822807312 + }, + { + "auxiliary_loss_clip": 0.06411892, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6659852698030964, + "flos": 24899277323520.0, + "grad_norm": 1.6226979142446254, + "language_loss": 0.75448096, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.83125257, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11077, + "time_per_iteration": 2.5727341175079346 + }, + { + "auxiliary_loss_clip": 0.06412426, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06273601, + "balance_loss_mlp": 0.01252631, + "epoch": 0.6660453930557643, + "flos": 24359706956160.0, + "grad_norm": 1.8442117034793826, + "language_loss": 0.66886055, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.74561661, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10546875, + "step": 11078, + "time_per_iteration": 2.543839931488037 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06279247, + "balance_loss_mlp": 0.01257533, + "epoch": 0.6661055163084323, + "flos": 10602420675840.0, + "grad_norm": 1.9694934778902873, + "language_loss": 0.69631219, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.77320874, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10961914, + "step": 11079, + "time_per_iteration": 2.541069269180298 + }, + { + "auxiliary_loss_clip": 0.06411281, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06274305, + "balance_loss_mlp": 0.01255067, + "epoch": 0.6661656395611003, + "flos": 24063751935360.0, + "grad_norm": 2.893983796141558, + "language_loss": 0.80461812, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.88138527, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10375977, + "step": 11080, + "time_per_iteration": 4.085668087005615 + }, + { + "auxiliary_loss_clip": 0.06407166, + "auxiliary_loss_mlp": 0.01263859, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01254114, + "epoch": 0.6662257628137682, + "flos": 23042332264320.0, + "grad_norm": 1.7166684069014877, + "language_loss": 0.78285092, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.85956115, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09729004, + "step": 11081, + "time_per_iteration": 2.5193705558776855 + }, + { + "auxiliary_loss_clip": 0.06415745, + "auxiliary_loss_mlp": 0.01265653, + "balance_loss_clip": 0.06274147, + "balance_loss_mlp": 0.01254364, + "epoch": 0.6662858860664362, + "flos": 24761446156800.0, + "grad_norm": 1.6242146726224216, + "language_loss": 0.80530953, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.88212347, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11291504, + "step": 11082, + "time_per_iteration": 2.569957971572876 + }, + { + "auxiliary_loss_clip": 0.0641424, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06276894, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6663460093191041, + "flos": 20014899920640.0, + "grad_norm": 1.3932549437891448, + "language_loss": 0.83467507, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.91147482, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09979248, + "step": 11083, + "time_per_iteration": 3.9742698669433594 + }, + { + "auxiliary_loss_clip": 0.06423122, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06280323, + "balance_loss_mlp": 0.01254995, + "epoch": 0.6664061325717722, + "flos": 17827101815040.0, + "grad_norm": 2.1194460311014023, + "language_loss": 0.85585803, + "learning_rate": 1.057990170638731e-06, + "loss": 0.93274969, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11053467, + "step": 11084, + "time_per_iteration": 2.4959633350372314 + }, + { + "auxiliary_loss_clip": 0.0642017, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06277794, + "balance_loss_mlp": 0.0125434, + "epoch": 0.6664662558244401, + "flos": 18082666368000.0, + "grad_norm": 2.6259945452160185, + "language_loss": 0.73187411, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.80872643, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1072998, + "step": 11085, + "time_per_iteration": 2.475743055343628 + }, + { + "auxiliary_loss_clip": 0.06412315, + "auxiliary_loss_mlp": 0.01264882, + "balance_loss_clip": 0.06275545, + "balance_loss_mlp": 0.01253718, + "epoch": 0.6665263790771081, + "flos": 21579663392640.0, + "grad_norm": 1.7551532896089992, + "language_loss": 0.80931759, + "learning_rate": 1.057303129975894e-06, + "loss": 0.88608956, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11157227, + "step": 11086, + "time_per_iteration": 2.537797689437866 + }, + { + "auxiliary_loss_clip": 0.06411488, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01257079, + "epoch": 0.666586502329776, + "flos": 24213448454400.0, + "grad_norm": 1.98835460832662, + "language_loss": 0.7529, + "learning_rate": 1.056959663258702e-06, + "loss": 0.82969105, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11087, + "time_per_iteration": 2.5238702297210693 + }, + { + "auxiliary_loss_clip": 0.06414294, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01253621, + "epoch": 0.666646625582444, + "flos": 22207100365440.0, + "grad_norm": 1.5295252788179032, + "language_loss": 0.65136206, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.72814775, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10656738, + "step": 11088, + "time_per_iteration": 3.9619038105010986 + }, + { + "auxiliary_loss_clip": 0.06416193, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277834, + "balance_loss_mlp": 0.01255753, + "epoch": 0.6667067488351119, + "flos": 18265835393280.0, + "grad_norm": 1.9855105228277763, + "language_loss": 0.64599085, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.72281867, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1083374, + "step": 11089, + "time_per_iteration": 2.5900728702545166 + }, + { + "auxiliary_loss_clip": 0.06409112, + "auxiliary_loss_mlp": 0.01265636, + "balance_loss_clip": 0.06274208, + "balance_loss_mlp": 0.01255313, + "epoch": 0.66676687208778, + "flos": 17241983953920.0, + "grad_norm": 2.1106067212474704, + "language_loss": 0.81439161, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.89113915, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 11090, + "time_per_iteration": 2.4597456455230713 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01266415, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01255877, + "epoch": 0.6668269953404479, + "flos": 19757742140160.0, + "grad_norm": 1.8443713907824004, + "language_loss": 0.7767818, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.85360217, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10540771, + "step": 11091, + "time_per_iteration": 2.5587215423583984 + }, + { + "auxiliary_loss_clip": 0.06415166, + "auxiliary_loss_mlp": 0.01267323, + "balance_loss_clip": 0.06277118, + "balance_loss_mlp": 0.01256487, + "epoch": 0.6668871185931159, + "flos": 20564700485760.0, + "grad_norm": 3.5971234891656265, + "language_loss": 0.79227078, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.86909568, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10827637, + "step": 11092, + "time_per_iteration": 2.4899661540985107 + }, + { + "auxiliary_loss_clip": 0.06313632, + "auxiliary_loss_mlp": 0.01258221, + "balance_loss_clip": 0.06257559, + "balance_loss_mlp": 0.0125709, + "epoch": 0.6669472418457839, + "flos": 58104458144640.0, + "grad_norm": 0.7522047627769642, + "language_loss": 0.57524383, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.65096241, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01133728, + "step": 11093, + "time_per_iteration": 3.147273540496826 + }, + { + "auxiliary_loss_clip": 0.06411624, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06275775, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6670073650984518, + "flos": 26071860960000.0, + "grad_norm": 1.491694696645918, + "language_loss": 0.76499665, + "learning_rate": 1.054556398252703e-06, + "loss": 0.84176457, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10125732, + "step": 11094, + "time_per_iteration": 2.654946804046631 + }, + { + "auxiliary_loss_clip": 0.06412062, + "auxiliary_loss_mlp": 0.01267472, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256349, + "epoch": 0.6670674883511198, + "flos": 32425196290560.0, + "grad_norm": 1.786455566216807, + "language_loss": 0.73555851, + "learning_rate": 1.05421321798155e-06, + "loss": 0.81235385, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11132812, + "step": 11095, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.06414741, + "auxiliary_loss_mlp": 0.01270593, + "balance_loss_clip": 0.06277339, + "balance_loss_mlp": 0.01260145, + "epoch": 0.6671276116037878, + "flos": 18043114440960.0, + "grad_norm": 1.9034949183118532, + "language_loss": 0.73389214, + "learning_rate": 1.053870073574727e-06, + "loss": 0.81074548, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 11096, + "time_per_iteration": 2.5232880115509033 + }, + { + "auxiliary_loss_clip": 0.06407115, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01257419, + "epoch": 0.6671877348564558, + "flos": 23773498992000.0, + "grad_norm": 1.8900040408751917, + "language_loss": 0.64173019, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.71847701, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10144043, + "step": 11097, + "time_per_iteration": 2.53245210647583 + }, + { + "auxiliary_loss_clip": 0.06414811, + "auxiliary_loss_mlp": 0.01270626, + "balance_loss_clip": 0.06272861, + "balance_loss_mlp": 0.01259939, + "epoch": 0.6672478581091237, + "flos": 20923869012480.0, + "grad_norm": 1.7889953519105342, + "language_loss": 0.76164997, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.83850437, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10681152, + "step": 11098, + "time_per_iteration": 2.5418834686279297 + }, + { + "auxiliary_loss_clip": 0.0641548, + "auxiliary_loss_mlp": 0.01271314, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01260675, + "epoch": 0.6673079813617917, + "flos": 27863328453120.0, + "grad_norm": 1.4249693183378689, + "language_loss": 0.74138522, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.81825316, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 11099, + "time_per_iteration": 2.6019399166107178 + }, + { + "auxiliary_loss_clip": 0.06409659, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6673681046144596, + "flos": 21623366096640.0, + "grad_norm": 1.7662195801139693, + "language_loss": 0.78545117, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.86219656, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11100, + "time_per_iteration": 2.681669235229492 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01264451, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01254247, + "epoch": 0.6674282278671276, + "flos": 20896727489280.0, + "grad_norm": 1.8459209339693166, + "language_loss": 0.60927689, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.68604755, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10211182, + "step": 11101, + "time_per_iteration": 2.53374981880188 + }, + { + "auxiliary_loss_clip": 0.06421657, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06276177, + "balance_loss_mlp": 0.01254276, + "epoch": 0.6674883511197955, + "flos": 23631139704960.0, + "grad_norm": 1.6188105594216948, + "language_loss": 0.7136634, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.79054427, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.121521, + "step": 11102, + "time_per_iteration": 2.572932481765747 + }, + { + "auxiliary_loss_clip": 0.06414107, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06274531, + "balance_loss_mlp": 0.01256041, + "epoch": 0.6675484743724636, + "flos": 19615760196480.0, + "grad_norm": 1.3319232732101594, + "language_loss": 0.84587741, + "learning_rate": 1.051469068021034e-06, + "loss": 0.92268157, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.1027832, + "step": 11103, + "time_per_iteration": 2.5075833797454834 + }, + { + "auxiliary_loss_clip": 0.06411143, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01254482, + "epoch": 0.6676085976251315, + "flos": 14324696202240.0, + "grad_norm": 1.9260757560792952, + "language_loss": 0.78627831, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.86302686, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09222412, + "step": 11104, + "time_per_iteration": 2.5494680404663086 + }, + { + "auxiliary_loss_clip": 0.06418018, + "auxiliary_loss_mlp": 0.01267231, + "balance_loss_clip": 0.0627483, + "balance_loss_mlp": 0.01256531, + "epoch": 0.6676687208777995, + "flos": 38113219802880.0, + "grad_norm": 1.3963666193820934, + "language_loss": 0.58238858, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.65924108, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10693359, + "step": 11105, + "time_per_iteration": 2.6544291973114014 + }, + { + "auxiliary_loss_clip": 0.06419846, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01255921, + "epoch": 0.6677288441304675, + "flos": 23987331411840.0, + "grad_norm": 1.4856417680447878, + "language_loss": 0.72987849, + "learning_rate": 1.0504406049066e-06, + "loss": 0.80675358, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11749268, + "step": 11106, + "time_per_iteration": 2.591508150100708 + }, + { + "auxiliary_loss_clip": 0.06410738, + "auxiliary_loss_mlp": 0.01269876, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01259392, + "epoch": 0.6677889673831354, + "flos": 24177586106880.0, + "grad_norm": 1.6277621549569181, + "language_loss": 0.76611882, + "learning_rate": 1.0500978558659e-06, + "loss": 0.84292495, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 11107, + "time_per_iteration": 2.5117390155792236 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01262364, + "epoch": 0.6678490906358034, + "flos": 22316196781440.0, + "grad_norm": 2.1688615595462033, + "language_loss": 0.90383065, + "learning_rate": 1.049755142845583e-06, + "loss": 0.98063028, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09765625, + "step": 11108, + "time_per_iteration": 3.940439224243164 + }, + { + "auxiliary_loss_clip": 0.06408696, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01254499, + "epoch": 0.6679092138884714, + "flos": 36906870170880.0, + "grad_norm": 1.379580541372803, + "language_loss": 0.82916903, + "learning_rate": 1.049412465858646e-06, + "loss": 0.90589124, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09020996, + "step": 11109, + "time_per_iteration": 2.6550536155700684 + }, + { + "auxiliary_loss_clip": 0.06415845, + "auxiliary_loss_mlp": 0.01269099, + "balance_loss_clip": 0.06276993, + "balance_loss_mlp": 0.01257869, + "epoch": 0.6679693371411394, + "flos": 18156151998720.0, + "grad_norm": 1.7439527968582467, + "language_loss": 0.69522661, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.77207607, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11236572, + "step": 11110, + "time_per_iteration": 2.505737543106079 + }, + { + "auxiliary_loss_clip": 0.06418422, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06277301, + "balance_loss_mlp": 0.01255886, + "epoch": 0.6680294603938073, + "flos": 27205437720960.0, + "grad_norm": 1.4770947447978742, + "language_loss": 0.73935318, + "learning_rate": 1.04872722003689e-06, + "loss": 0.81621397, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11767578, + "step": 11111, + "time_per_iteration": 2.6036081314086914 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06276079, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6680895836464753, + "flos": 21731665898880.0, + "grad_norm": 1.7721381481924603, + "language_loss": 0.65662813, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.73343134, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10375977, + "step": 11112, + "time_per_iteration": 2.5148162841796875 + }, + { + "auxiliary_loss_clip": 0.06408017, + "auxiliary_loss_mlp": 0.01264862, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6681497068991432, + "flos": 19652628792960.0, + "grad_norm": 2.188254018589407, + "language_loss": 0.63796169, + "learning_rate": 1.048042118504569e-06, + "loss": 0.71469045, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10357666, + "step": 11113, + "time_per_iteration": 2.5091605186462402 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.0126667, + "balance_loss_clip": 0.06274618, + "balance_loss_mlp": 0.01257008, + "epoch": 0.6682098301518112, + "flos": 17424649854720.0, + "grad_norm": 1.7204263321571711, + "language_loss": 0.65997386, + "learning_rate": 1.047699621879422e-06, + "loss": 0.73672217, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 11114, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.06406785, + "auxiliary_loss_mlp": 0.01265665, + "balance_loss_clip": 0.06270755, + "balance_loss_mlp": 0.01255378, + "epoch": 0.6682699534044791, + "flos": 22605191913600.0, + "grad_norm": 1.4259756578870375, + "language_loss": 0.78704619, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.86377072, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 11115, + "time_per_iteration": 2.544543504714966 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01266412, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01256703, + "epoch": 0.6683300766571472, + "flos": 24870668353920.0, + "grad_norm": 1.896886529208747, + "language_loss": 0.79640424, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.87317395, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.09716797, + "step": 11116, + "time_per_iteration": 2.5271427631378174 + }, + { + "auxiliary_loss_clip": 0.06415811, + "auxiliary_loss_mlp": 0.01274733, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01263891, + "epoch": 0.6683901999098151, + "flos": 27134132296320.0, + "grad_norm": 1.70831438842013, + "language_loss": 0.79465652, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.871562, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10852051, + "step": 11117, + "time_per_iteration": 2.5867950916290283 + }, + { + "auxiliary_loss_clip": 0.06413716, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.01258147, + "epoch": 0.6684503231624831, + "flos": 20745018472320.0, + "grad_norm": 1.68089949787921, + "language_loss": 0.65774792, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.73458278, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1161499, + "step": 11118, + "time_per_iteration": 2.5065219402313232 + }, + { + "auxiliary_loss_clip": 0.06409101, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06272688, + "balance_loss_mlp": 0.01256426, + "epoch": 0.668510446415151, + "flos": 21768618349440.0, + "grad_norm": 1.4670277033373609, + "language_loss": 0.69327927, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.77004153, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10705566, + "step": 11119, + "time_per_iteration": 3.9497127532958984 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01253155, + "epoch": 0.668570569667819, + "flos": 30199229850240.0, + "grad_norm": 1.557441143928688, + "language_loss": 0.67133182, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.74807668, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10015869, + "step": 11120, + "time_per_iteration": 2.583557605743408 + }, + { + "auxiliary_loss_clip": 0.06409501, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.0125551, + "epoch": 0.668630692920487, + "flos": 24177544179840.0, + "grad_norm": 1.6997365737566905, + "language_loss": 0.72227985, + "learning_rate": 1.045303157347638e-06, + "loss": 0.79904002, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10998535, + "step": 11121, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.06415744, + "auxiliary_loss_mlp": 0.01268909, + "balance_loss_clip": 0.06275598, + "balance_loss_mlp": 0.01258442, + "epoch": 0.668690816173155, + "flos": 17462902043520.0, + "grad_norm": 2.410576654010779, + "language_loss": 0.70488191, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.78172839, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10467529, + "step": 11122, + "time_per_iteration": 2.480928897857666 + }, + { + "auxiliary_loss_clip": 0.06412323, + "auxiliary_loss_mlp": 0.01265084, + "balance_loss_clip": 0.06273821, + "balance_loss_mlp": 0.01254683, + "epoch": 0.668750939425823, + "flos": 25011350559360.0, + "grad_norm": 1.579363869036545, + "language_loss": 0.71597642, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.79275048, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10406494, + "step": 11123, + "time_per_iteration": 3.993523597717285 + }, + { + "auxiliary_loss_clip": 0.06416023, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06275098, + "balance_loss_mlp": 0.01256713, + "epoch": 0.6688110626784909, + "flos": 24103513497600.0, + "grad_norm": 1.6918402194537734, + "language_loss": 0.79247653, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.86931467, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11090088, + "step": 11124, + "time_per_iteration": 2.5730183124542236 + }, + { + "auxiliary_loss_clip": 0.06414519, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01255496, + "epoch": 0.6688711859311589, + "flos": 21765515748480.0, + "grad_norm": 1.8258374996153537, + "language_loss": 0.74714315, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.8239423, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09899902, + "step": 11125, + "time_per_iteration": 2.586688995361328 + }, + { + "auxiliary_loss_clip": 0.06414272, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274511, + "balance_loss_mlp": 0.01254802, + "epoch": 0.6689313091838268, + "flos": 22936254595200.0, + "grad_norm": 1.821756692805589, + "language_loss": 0.66474277, + "learning_rate": 1.043592482774116e-06, + "loss": 0.74153662, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10314941, + "step": 11126, + "time_per_iteration": 2.5671706199645996 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256774, + "epoch": 0.6689914324364948, + "flos": 20892367077120.0, + "grad_norm": 1.6855233783346146, + "language_loss": 0.71609974, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.79290259, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10577393, + "step": 11127, + "time_per_iteration": 3.9430463314056396 + }, + { + "auxiliary_loss_clip": 0.06417182, + "auxiliary_loss_mlp": 0.01267327, + "balance_loss_clip": 0.06273168, + "balance_loss_mlp": 0.01255841, + "epoch": 0.6690515556891627, + "flos": 22754972286720.0, + "grad_norm": 1.8544786849615413, + "language_loss": 0.80330718, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.88015223, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11486816, + "step": 11128, + "time_per_iteration": 2.545502185821533 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255582, + "epoch": 0.6691116789418308, + "flos": 23338203431040.0, + "grad_norm": 1.7840790291668756, + "language_loss": 0.81335264, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.89014482, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10742188, + "step": 11129, + "time_per_iteration": 2.5280702114105225 + }, + { + "auxiliary_loss_clip": 0.06404583, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06271389, + "balance_loss_mlp": 0.01254972, + "epoch": 0.6691718021944987, + "flos": 32454308384640.0, + "grad_norm": 1.6197681941265856, + "language_loss": 0.70428884, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.7809816, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.097229, + "step": 11130, + "time_per_iteration": 2.578578233718872 + }, + { + "auxiliary_loss_clip": 0.06406342, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627137, + "balance_loss_mlp": 0.0125301, + "epoch": 0.6692319254471667, + "flos": 23738223623040.0, + "grad_norm": 1.529399392054523, + "language_loss": 0.70701146, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.78370404, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 11131, + "time_per_iteration": 2.537551164627075 + }, + { + "auxiliary_loss_clip": 0.06414618, + "auxiliary_loss_mlp": 0.01266754, + "balance_loss_clip": 0.06274183, + "balance_loss_mlp": 0.01255906, + "epoch": 0.6692920486998346, + "flos": 14432996004480.0, + "grad_norm": 2.3888765741874645, + "language_loss": 0.65664881, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.73346257, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10858154, + "step": 11132, + "time_per_iteration": 2.45595383644104 + }, + { + "auxiliary_loss_clip": 0.06414949, + "auxiliary_loss_mlp": 0.0126617, + "balance_loss_clip": 0.06275167, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6693521719525026, + "flos": 21513976191360.0, + "grad_norm": 1.5662057284927036, + "language_loss": 0.74730015, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.82411134, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11340332, + "step": 11133, + "time_per_iteration": 2.5248849391937256 + }, + { + "auxiliary_loss_clip": 0.06419569, + "auxiliary_loss_mlp": 0.01271511, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01259412, + "epoch": 0.6694122952051706, + "flos": 25413341322240.0, + "grad_norm": 3.5912228691538757, + "language_loss": 0.66650522, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.74341607, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.12097168, + "step": 11134, + "time_per_iteration": 2.556043863296509 + }, + { + "auxiliary_loss_clip": 0.06424067, + "auxiliary_loss_mlp": 0.01264606, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253294, + "epoch": 0.6694724184578386, + "flos": 25668067334400.0, + "grad_norm": 1.7597980858171118, + "language_loss": 0.77272904, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.84961575, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 11135, + "time_per_iteration": 2.572221279144287 + }, + { + "auxiliary_loss_clip": 0.06406624, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.01256079, + "epoch": 0.6695325417105066, + "flos": 17714567381760.0, + "grad_norm": 1.4860361528198607, + "language_loss": 0.74150556, + "learning_rate": 1.040173855277898e-06, + "loss": 0.81823969, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1071167, + "step": 11136, + "time_per_iteration": 2.482616662979126 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253814, + "epoch": 0.6695926649631745, + "flos": 24466581239040.0, + "grad_norm": 1.5006390680612098, + "language_loss": 0.622679, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.69954294, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1114502, + "step": 11137, + "time_per_iteration": 2.60404109954834 + }, + { + "auxiliary_loss_clip": 0.06413136, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258535, + "epoch": 0.6696527882158425, + "flos": 24287059866240.0, + "grad_norm": 1.73693802973788, + "language_loss": 0.66198957, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.73881459, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1083374, + "step": 11138, + "time_per_iteration": 2.5446555614471436 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.01255009, + "epoch": 0.6697129114685104, + "flos": 23009404809600.0, + "grad_norm": 1.563215252926209, + "language_loss": 0.73026919, + "learning_rate": 1.039148976175053e-06, + "loss": 0.80699301, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09741211, + "step": 11139, + "time_per_iteration": 2.5669844150543213 + }, + { + "auxiliary_loss_clip": 0.06403776, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01256326, + "epoch": 0.6697730347211784, + "flos": 22644743840640.0, + "grad_norm": 1.6502373859256334, + "language_loss": 0.70972526, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.78642654, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10015869, + "step": 11140, + "time_per_iteration": 2.524345874786377 + }, + { + "auxiliary_loss_clip": 0.06414337, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01254103, + "epoch": 0.6698331579738463, + "flos": 28884915832320.0, + "grad_norm": 1.9955464769525513, + "language_loss": 0.75788713, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.83467978, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1083374, + "step": 11141, + "time_per_iteration": 2.610853433609009 + }, + { + "auxiliary_loss_clip": 0.06411906, + "auxiliary_loss_mlp": 0.01271137, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01260337, + "epoch": 0.6698932812265144, + "flos": 24213993505920.0, + "grad_norm": 1.7317387192226181, + "language_loss": 0.82309425, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.8999247, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10803223, + "step": 11142, + "time_per_iteration": 2.5797901153564453 + }, + { + "auxiliary_loss_clip": 0.0640756, + "auxiliary_loss_mlp": 0.01269267, + "balance_loss_clip": 0.06271559, + "balance_loss_mlp": 0.01258556, + "epoch": 0.6699534044791823, + "flos": 22096704211200.0, + "grad_norm": 1.4627194343759278, + "language_loss": 0.70282012, + "learning_rate": 1.037782980862959e-06, + "loss": 0.77958834, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1071167, + "step": 11143, + "time_per_iteration": 2.543877601623535 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01262215, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01252577, + "epoch": 0.6700135277318503, + "flos": 25199466975360.0, + "grad_norm": 1.4915968751654103, + "language_loss": 0.70360661, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.78028065, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09637451, + "step": 11144, + "time_per_iteration": 2.5488550662994385 + }, + { + "auxiliary_loss_clip": 0.06411098, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06275296, + "balance_loss_mlp": 0.01253735, + "epoch": 0.6700736509845182, + "flos": 23446838649600.0, + "grad_norm": 1.6240872047460435, + "language_loss": 0.74927717, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.82603747, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11193848, + "step": 11145, + "time_per_iteration": 2.542711019515991 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6701337742371862, + "flos": 24396952896000.0, + "grad_norm": 1.5772021074008409, + "language_loss": 0.71292794, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.7897411, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11146, + "time_per_iteration": 2.5397775173187256 + }, + { + "auxiliary_loss_clip": 0.06402436, + "auxiliary_loss_mlp": 0.01264562, + "balance_loss_clip": 0.06271266, + "balance_loss_mlp": 0.0125459, + "epoch": 0.6701938974898543, + "flos": 14798956711680.0, + "grad_norm": 2.075971191875419, + "language_loss": 0.78937066, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.86604059, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09979248, + "step": 11147, + "time_per_iteration": 2.521651029586792 + }, + { + "auxiliary_loss_clip": 0.06408454, + "auxiliary_loss_mlp": 0.01268691, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.0125801, + "epoch": 0.6702540207425222, + "flos": 20159690976000.0, + "grad_norm": 1.9550194289938683, + "language_loss": 0.70223355, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.77900505, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10687256, + "step": 11148, + "time_per_iteration": 4.084912300109863 + }, + { + "auxiliary_loss_clip": 0.06407622, + "auxiliary_loss_mlp": 0.01263909, + "balance_loss_clip": 0.06271225, + "balance_loss_mlp": 0.01253991, + "epoch": 0.6703141439951902, + "flos": 21220369084800.0, + "grad_norm": 1.6593895437552093, + "language_loss": 0.70494747, + "learning_rate": 1.035735082774636e-06, + "loss": 0.78166282, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09918213, + "step": 11149, + "time_per_iteration": 2.532682418823242 + }, + { + "auxiliary_loss_clip": 0.06408584, + "auxiliary_loss_mlp": 0.0126327, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01253245, + "epoch": 0.6703742672478581, + "flos": 23119255912320.0, + "grad_norm": 2.1651783548168124, + "language_loss": 0.73744798, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.81416655, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10028076, + "step": 11150, + "time_per_iteration": 2.591546058654785 + }, + { + "auxiliary_loss_clip": 0.06414528, + "auxiliary_loss_mlp": 0.01264123, + "balance_loss_clip": 0.06276007, + "balance_loss_mlp": 0.01253829, + "epoch": 0.6704343905005261, + "flos": 22535563570560.0, + "grad_norm": 1.9523081475406603, + "language_loss": 0.78322434, + "learning_rate": 1.035052742460671e-06, + "loss": 0.86001086, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10296631, + "step": 11151, + "time_per_iteration": 2.536759853363037 + }, + { + "auxiliary_loss_clip": 0.06307358, + "auxiliary_loss_mlp": 0.01251405, + "balance_loss_clip": 0.06251603, + "balance_loss_mlp": 0.01250013, + "epoch": 0.670494513753194, + "flos": 64815270192000.0, + "grad_norm": 0.7758908798936945, + "language_loss": 0.55567682, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.63126445, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0139389, + "step": 11152, + "time_per_iteration": 3.201535224914551 + }, + { + "auxiliary_loss_clip": 0.06410956, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06271775, + "balance_loss_mlp": 0.01254815, + "epoch": 0.670554637005862, + "flos": 23517892512000.0, + "grad_norm": 1.915770962366586, + "language_loss": 0.81010997, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.88686949, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10186768, + "step": 11153, + "time_per_iteration": 2.537212371826172 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.01253822, + "epoch": 0.67061476025853, + "flos": 19469417840640.0, + "grad_norm": 1.508737872634347, + "language_loss": 0.76268411, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.83944541, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10534668, + "step": 11154, + "time_per_iteration": 2.558519124984741 + }, + { + "auxiliary_loss_clip": 0.06415759, + "auxiliary_loss_mlp": 0.01269836, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01259304, + "epoch": 0.670674883511198, + "flos": 20525903245440.0, + "grad_norm": 3.082678767747609, + "language_loss": 0.76461852, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.84147453, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10534668, + "step": 11155, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.06407665, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0627, + "balance_loss_mlp": 0.01256187, + "epoch": 0.6707350067638659, + "flos": 25491061584000.0, + "grad_norm": 2.1059181531121873, + "language_loss": 0.82157421, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.89830995, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09716797, + "step": 11156, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.06406271, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06271681, + "balance_loss_mlp": 0.01255706, + "epoch": 0.6707951300165339, + "flos": 22280040944640.0, + "grad_norm": 1.7628533784510112, + "language_loss": 0.74903405, + "learning_rate": 1.033006600114165e-06, + "loss": 0.82574838, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09460449, + "step": 11157, + "time_per_iteration": 2.5089879035949707 + }, + { + "auxiliary_loss_clip": 0.06412502, + "auxiliary_loss_mlp": 0.01267451, + "balance_loss_clip": 0.06273752, + "balance_loss_mlp": 0.01256919, + "epoch": 0.6708552532692018, + "flos": 23990853283200.0, + "grad_norm": 1.6697268751930758, + "language_loss": 0.74289936, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.81969893, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10528564, + "step": 11158, + "time_per_iteration": 2.5533461570739746 + }, + { + "auxiliary_loss_clip": 0.06413293, + "auxiliary_loss_mlp": 0.01263254, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6709153765218698, + "flos": 24944657109120.0, + "grad_norm": 1.5416620862644819, + "language_loss": 0.81707746, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.89384294, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.0980835, + "step": 11159, + "time_per_iteration": 4.040963649749756 + }, + { + "auxiliary_loss_clip": 0.06412386, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06273866, + "balance_loss_mlp": 0.01258986, + "epoch": 0.6709754997745379, + "flos": 17536010330880.0, + "grad_norm": 1.5609798446772174, + "language_loss": 0.7718569, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.84866846, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.09783936, + "step": 11160, + "time_per_iteration": 2.4715282917022705 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01263422, + "balance_loss_clip": 0.06271639, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6710356230272058, + "flos": 22097416970880.0, + "grad_norm": 1.6605543467204091, + "language_loss": 0.73893428, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.81563139, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09020996, + "step": 11161, + "time_per_iteration": 2.5761518478393555 + }, + { + "auxiliary_loss_clip": 0.0641313, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06271637, + "balance_loss_mlp": 0.01257874, + "epoch": 0.6710957462798738, + "flos": 24213238819200.0, + "grad_norm": 1.698475212339427, + "language_loss": 0.68223077, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.75904596, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10522461, + "step": 11162, + "time_per_iteration": 4.0347349643707275 + }, + { + "auxiliary_loss_clip": 0.06406809, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01255367, + "epoch": 0.6711558695325417, + "flos": 19099138648320.0, + "grad_norm": 1.6208038414483141, + "language_loss": 0.70270795, + "learning_rate": 1.030961777833032e-06, + "loss": 0.77943039, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10064697, + "step": 11163, + "time_per_iteration": 2.4880189895629883 + }, + { + "auxiliary_loss_clip": 0.06402589, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.0125383, + "epoch": 0.6712159927852097, + "flos": 25565134193280.0, + "grad_norm": 1.5352927814280746, + "language_loss": 0.75905788, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.8357113, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.08929443, + "step": 11164, + "time_per_iteration": 2.5312371253967285 + }, + { + "auxiliary_loss_clip": 0.06409736, + "auxiliary_loss_mlp": 0.01265492, + "balance_loss_clip": 0.06273673, + "balance_loss_mlp": 0.01254907, + "epoch": 0.6712761160378776, + "flos": 22234032253440.0, + "grad_norm": 2.0741329798372408, + "language_loss": 0.65590626, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10595703, + "step": 11165, + "time_per_iteration": 2.5017032623291016 + }, + { + "auxiliary_loss_clip": 0.06407681, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01254444, + "epoch": 0.6713362392905456, + "flos": 22462077939840.0, + "grad_norm": 1.8809222742523355, + "language_loss": 0.71774828, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.79446959, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 11166, + "time_per_iteration": 2.50738787651062 + }, + { + "auxiliary_loss_clip": 0.06404926, + "auxiliary_loss_mlp": 0.01262643, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.0125282, + "epoch": 0.6713963625432136, + "flos": 25637362012800.0, + "grad_norm": 1.8955119453047675, + "language_loss": 0.77147096, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.84814668, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09814453, + "step": 11167, + "time_per_iteration": 3.929837942123413 + }, + { + "auxiliary_loss_clip": 0.06410499, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06272188, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6714564857958816, + "flos": 35015110940160.0, + "grad_norm": 1.8086126039126507, + "language_loss": 0.68893099, + "learning_rate": 1.029258769662629e-06, + "loss": 0.76570106, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 11168, + "time_per_iteration": 2.6505095958709717 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.012578, + "epoch": 0.6715166090485495, + "flos": 26286028796160.0, + "grad_norm": 1.7287934282524213, + "language_loss": 0.73465478, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.81148595, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11358643, + "step": 11169, + "time_per_iteration": 2.5538253784179688 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01265262, + "balance_loss_clip": 0.0627141, + "balance_loss_mlp": 0.01254706, + "epoch": 0.6715767323012175, + "flos": 15929556652800.0, + "grad_norm": 1.9811109571628822, + "language_loss": 0.76329374, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.84005201, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10565186, + "step": 11170, + "time_per_iteration": 2.5357441902160645 + }, + { + "auxiliary_loss_clip": 0.06412025, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272931, + "balance_loss_mlp": 0.01254964, + "epoch": 0.6716368555538854, + "flos": 17496835747200.0, + "grad_norm": 1.8551997359651162, + "language_loss": 0.74972916, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.82650542, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10626221, + "step": 11171, + "time_per_iteration": 2.4740569591522217 + }, + { + "auxiliary_loss_clip": 0.06413123, + "auxiliary_loss_mlp": 0.01262691, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.01252344, + "epoch": 0.6716969788065534, + "flos": 16766759122560.0, + "grad_norm": 1.4543204322223777, + "language_loss": 0.86493564, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.94169378, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10345459, + "step": 11172, + "time_per_iteration": 2.5120010375976562 + }, + { + "auxiliary_loss_clip": 0.06408751, + "auxiliary_loss_mlp": 0.01266926, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256763, + "epoch": 0.6717571020592215, + "flos": 22716216973440.0, + "grad_norm": 2.0454540055069863, + "language_loss": 0.63633478, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.71309155, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10168457, + "step": 11173, + "time_per_iteration": 2.49975848197937 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01271665, + "balance_loss_clip": 0.06275772, + "balance_loss_mlp": 0.01260549, + "epoch": 0.6718172253118894, + "flos": 18740053975680.0, + "grad_norm": 4.441337622220845, + "language_loss": 0.71819955, + "learning_rate": 1.02721637475002e-06, + "loss": 0.79513621, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11108398, + "step": 11174, + "time_per_iteration": 2.483900547027588 + }, + { + "auxiliary_loss_clip": 0.06401111, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06269203, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6718773485645574, + "flos": 15637920117120.0, + "grad_norm": 1.9560679016643376, + "language_loss": 0.69026506, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.76692557, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09472656, + "step": 11175, + "time_per_iteration": 2.463592767715454 + }, + { + "auxiliary_loss_clip": 0.06406569, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255341, + "epoch": 0.6719374718172253, + "flos": 19360908403200.0, + "grad_norm": 1.7117830890697936, + "language_loss": 0.74226189, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.8189795, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09844971, + "step": 11176, + "time_per_iteration": 2.5074222087860107 + }, + { + "auxiliary_loss_clip": 0.06410944, + "auxiliary_loss_mlp": 0.0126684, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.0125654, + "epoch": 0.6719975950698933, + "flos": 21987817430400.0, + "grad_norm": 2.8444182697169014, + "language_loss": 0.73030323, + "learning_rate": 1.026195675108182e-06, + "loss": 0.80708104, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10296631, + "step": 11177, + "time_per_iteration": 2.4807181358337402 + }, + { + "auxiliary_loss_clip": 0.06411102, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6720577183225612, + "flos": 25235035833600.0, + "grad_norm": 2.1466059593233755, + "language_loss": 0.76338404, + "learning_rate": 1.025855515730551e-06, + "loss": 0.84018433, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10772705, + "step": 11178, + "time_per_iteration": 2.5277843475341797 + }, + { + "auxiliary_loss_clip": 0.06410985, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255007, + "epoch": 0.6721178415752292, + "flos": 16951479448320.0, + "grad_norm": 1.7634405951154783, + "language_loss": 0.70127761, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.77803409, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09643555, + "step": 11179, + "time_per_iteration": 2.4638893604278564 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06269027, + "balance_loss_mlp": 0.01256077, + "epoch": 0.6721779648278972, + "flos": 21547448697600.0, + "grad_norm": 1.4326115817211162, + "language_loss": 0.74262661, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.81931782, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09393311, + "step": 11180, + "time_per_iteration": 2.5094285011291504 + }, + { + "auxiliary_loss_clip": 0.0640661, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01252843, + "epoch": 0.6722380880805652, + "flos": 22612696853760.0, + "grad_norm": 1.3575184211837767, + "language_loss": 0.75178289, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.82848167, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10424805, + "step": 11181, + "time_per_iteration": 2.5373446941375732 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.0126461, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01254615, + "epoch": 0.6722982113332331, + "flos": 15930856391040.0, + "grad_norm": 2.2936660091873597, + "language_loss": 0.75133812, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.82810551, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09991455, + "step": 11182, + "time_per_iteration": 2.5146076679229736 + }, + { + "auxiliary_loss_clip": 0.06407333, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01256535, + "epoch": 0.6723583345859011, + "flos": 20602659185280.0, + "grad_norm": 1.7825231183024703, + "language_loss": 0.69884634, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.77558148, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09643555, + "step": 11183, + "time_per_iteration": 2.510972499847412 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06270228, + "balance_loss_mlp": 0.01258234, + "epoch": 0.672418457838569, + "flos": 21732294804480.0, + "grad_norm": 1.4388499153565433, + "language_loss": 0.78377849, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.8605392, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09985352, + "step": 11184, + "time_per_iteration": 2.5102083683013916 + }, + { + "auxiliary_loss_clip": 0.06418785, + "auxiliary_loss_mlp": 0.01263963, + "balance_loss_clip": 0.06274929, + "balance_loss_mlp": 0.01253305, + "epoch": 0.672478581091237, + "flos": 21476772178560.0, + "grad_norm": 2.087218631508525, + "language_loss": 0.66671652, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.74354398, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10668945, + "step": 11185, + "time_per_iteration": 2.4922776222229004 + }, + { + "auxiliary_loss_clip": 0.06405509, + "auxiliary_loss_mlp": 0.01264604, + "balance_loss_clip": 0.06269497, + "balance_loss_mlp": 0.01253905, + "epoch": 0.6725387043439051, + "flos": 30854646887040.0, + "grad_norm": 3.8783146360767518, + "language_loss": 0.80847633, + "learning_rate": 1.023135571620345e-06, + "loss": 0.88517749, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 11186, + "time_per_iteration": 2.650069236755371 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258753, + "epoch": 0.672598827596573, + "flos": 24061949072640.0, + "grad_norm": 1.3182024269377546, + "language_loss": 0.807257, + "learning_rate": 1.022795745163813e-06, + "loss": 0.88399297, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09320068, + "step": 11187, + "time_per_iteration": 2.5736026763916016 + }, + { + "auxiliary_loss_clip": 0.06414247, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06271032, + "balance_loss_mlp": 0.01255996, + "epoch": 0.672658950849241, + "flos": 21878343671040.0, + "grad_norm": 1.7328673404989177, + "language_loss": 0.71004307, + "learning_rate": 1.022455955762965e-06, + "loss": 0.78685355, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1081543, + "step": 11188, + "time_per_iteration": 3.9358599185943604 + }, + { + "auxiliary_loss_clip": 0.06400838, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06269124, + "balance_loss_mlp": 0.01255364, + "epoch": 0.6727190741019089, + "flos": 23228855452800.0, + "grad_norm": 1.7513555431786316, + "language_loss": 0.75587308, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.83253086, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09576416, + "step": 11189, + "time_per_iteration": 2.558595895767212 + }, + { + "auxiliary_loss_clip": 0.06412518, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06271306, + "balance_loss_mlp": 0.01252762, + "epoch": 0.6727791973545769, + "flos": 15784052837760.0, + "grad_norm": 2.0872354058578186, + "language_loss": 0.75281942, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.8295877, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11547852, + "step": 11190, + "time_per_iteration": 2.465223550796509 + }, + { + "auxiliary_loss_clip": 0.06406397, + "auxiliary_loss_mlp": 0.01267439, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256788, + "epoch": 0.6728393206072448, + "flos": 21255937943040.0, + "grad_norm": 1.3785573959073936, + "language_loss": 0.76754856, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.84428692, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10650635, + "step": 11191, + "time_per_iteration": 2.519883155822754 + }, + { + "auxiliary_loss_clip": 0.06406602, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01256509, + "epoch": 0.6728994438599128, + "flos": 32131295694720.0, + "grad_norm": 1.5727699537163, + "language_loss": 0.86438018, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.94110769, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09637451, + "step": 11192, + "time_per_iteration": 2.589451789855957 + }, + { + "auxiliary_loss_clip": 0.06414255, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6729595671125808, + "flos": 23119046277120.0, + "grad_norm": 2.0400596637632997, + "language_loss": 0.76247764, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.83930409, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11437988, + "step": 11193, + "time_per_iteration": 2.569079637527466 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.0125802, + "epoch": 0.6730196903652488, + "flos": 14616710081280.0, + "grad_norm": 1.7886354434370773, + "language_loss": 0.78477633, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.86155224, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11194, + "time_per_iteration": 2.501262664794922 + }, + { + "auxiliary_loss_clip": 0.06410375, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01253889, + "epoch": 0.6730798136179167, + "flos": 21112320844800.0, + "grad_norm": 1.7894428961307616, + "language_loss": 0.90123671, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.97798121, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10192871, + "step": 11195, + "time_per_iteration": 2.529911994934082 + }, + { + "auxiliary_loss_clip": 0.06404506, + "auxiliary_loss_mlp": 0.01267592, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257119, + "epoch": 0.6731399368705847, + "flos": 28993886467200.0, + "grad_norm": 1.9634861378348352, + "language_loss": 0.72801971, + "learning_rate": 1.019738976106662e-06, + "loss": 0.80474073, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10467529, + "step": 11196, + "time_per_iteration": 2.5403385162353516 + }, + { + "auxiliary_loss_clip": 0.06306562, + "auxiliary_loss_mlp": 0.01254217, + "balance_loss_clip": 0.06250267, + "balance_loss_mlp": 0.01253061, + "epoch": 0.6732000601232526, + "flos": 64763643277440.0, + "grad_norm": 0.755157348431284, + "language_loss": 0.56539071, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.64099848, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01152802, + "step": 11197, + "time_per_iteration": 3.103764295578003 + }, + { + "auxiliary_loss_clip": 0.06400825, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06270334, + "balance_loss_mlp": 0.01259316, + "epoch": 0.6732601833759206, + "flos": 17207337490560.0, + "grad_norm": 1.957045035118017, + "language_loss": 0.76133382, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.83803332, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09814453, + "step": 11198, + "time_per_iteration": 2.4750118255615234 + }, + { + "auxiliary_loss_clip": 0.06411158, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01253492, + "epoch": 0.6733203066285887, + "flos": 18664430065920.0, + "grad_norm": 2.5858701419359185, + "language_loss": 0.81900644, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.89576292, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11010742, + "step": 11199, + "time_per_iteration": 3.915224075317383 + }, + { + "auxiliary_loss_clip": 0.06408331, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06268819, + "balance_loss_mlp": 0.01257566, + "epoch": 0.6733804298812566, + "flos": 35818128144000.0, + "grad_norm": 1.7377353958720951, + "language_loss": 0.71924305, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.79600847, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10650635, + "step": 11200, + "time_per_iteration": 2.6547374725341797 + }, + { + "auxiliary_loss_clip": 0.06413474, + "auxiliary_loss_mlp": 0.01270012, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01259581, + "epoch": 0.6734405531339246, + "flos": 61651545511680.0, + "grad_norm": 1.525289564934158, + "language_loss": 0.64700097, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.72383583, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10437012, + "step": 11201, + "time_per_iteration": 2.884462356567383 + }, + { + "auxiliary_loss_clip": 0.06414636, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01255466, + "epoch": 0.6735006763865925, + "flos": 20528670430080.0, + "grad_norm": 1.5117322786205176, + "language_loss": 0.63124895, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.7080664, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11639404, + "step": 11202, + "time_per_iteration": 3.9962854385375977 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.06272809, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6735607996392605, + "flos": 13924172885760.0, + "grad_norm": 1.7265240314624624, + "language_loss": 0.75169051, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.82848436, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10473633, + "step": 11203, + "time_per_iteration": 2.4805357456207275 + }, + { + "auxiliary_loss_clip": 0.06417318, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.0627423, + "balance_loss_mlp": 0.01254979, + "epoch": 0.6736209228919284, + "flos": 18813246117120.0, + "grad_norm": 2.5086879815410996, + "language_loss": 0.6739623, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.75080359, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11834717, + "step": 11204, + "time_per_iteration": 2.5092830657958984 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06275398, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6736810461445965, + "flos": 20378890056960.0, + "grad_norm": 1.4739361265515354, + "language_loss": 0.74145937, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.81831586, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11151123, + "step": 11205, + "time_per_iteration": 2.5575578212738037 + }, + { + "auxiliary_loss_clip": 0.06405318, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06271175, + "balance_loss_mlp": 0.0126027, + "epoch": 0.6737411693972644, + "flos": 30015264211200.0, + "grad_norm": 1.4826905039931084, + "language_loss": 0.71781552, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.79457194, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10058594, + "step": 11206, + "time_per_iteration": 4.010627031326294 + }, + { + "auxiliary_loss_clip": 0.0641677, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.0125648, + "epoch": 0.6738012926499324, + "flos": 25454402622720.0, + "grad_norm": 2.885338634405065, + "language_loss": 0.67620468, + "learning_rate": 1.016007014855092e-06, + "loss": 0.75304735, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11016846, + "step": 11207, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06272342, + "balance_loss_mlp": 0.01258672, + "epoch": 0.6738614159026003, + "flos": 20783102952960.0, + "grad_norm": 2.0413352600750145, + "language_loss": 0.74134195, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.81807256, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10186768, + "step": 11208, + "time_per_iteration": 2.4913690090179443 + }, + { + "auxiliary_loss_clip": 0.06408808, + "auxiliary_loss_mlp": 0.01269437, + "balance_loss_clip": 0.06270136, + "balance_loss_mlp": 0.01257432, + "epoch": 0.6739215391552683, + "flos": 19571931711360.0, + "grad_norm": 1.741711609442522, + "language_loss": 0.75868964, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.83547217, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11999512, + "step": 11209, + "time_per_iteration": 2.494077444076538 + }, + { + "auxiliary_loss_clip": 0.06402588, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.062707, + "balance_loss_mlp": 0.01255042, + "epoch": 0.6739816624079362, + "flos": 24394898471040.0, + "grad_norm": 1.8799682247559513, + "language_loss": 0.66601419, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.74269128, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10064697, + "step": 11210, + "time_per_iteration": 2.531925916671753 + }, + { + "auxiliary_loss_clip": 0.06400777, + "auxiliary_loss_mlp": 0.0126575, + "balance_loss_clip": 0.06269025, + "balance_loss_mlp": 0.01256297, + "epoch": 0.6740417856606042, + "flos": 22534683102720.0, + "grad_norm": 3.725779709718602, + "language_loss": 0.8045913, + "learning_rate": 1.014651056529377e-06, + "loss": 0.88125658, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09454346, + "step": 11211, + "time_per_iteration": 2.546027898788452 + }, + { + "auxiliary_loss_clip": 0.06403598, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06271007, + "balance_loss_mlp": 0.01256208, + "epoch": 0.6741019089132723, + "flos": 25782530411520.0, + "grad_norm": 1.3057254169112946, + "language_loss": 0.76753151, + "learning_rate": 1.014312160327143e-06, + "loss": 0.84422737, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09777832, + "step": 11212, + "time_per_iteration": 2.542628049850464 + }, + { + "auxiliary_loss_clip": 0.06409732, + "auxiliary_loss_mlp": 0.01268637, + "balance_loss_clip": 0.06270209, + "balance_loss_mlp": 0.01257539, + "epoch": 0.6741620321659402, + "flos": 21112027355520.0, + "grad_norm": 1.7288185495326422, + "language_loss": 0.78622723, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.86301088, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11108398, + "step": 11213, + "time_per_iteration": 2.553414821624756 + }, + { + "auxiliary_loss_clip": 0.06413242, + "auxiliary_loss_mlp": 0.01267804, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01256789, + "epoch": 0.6742221554186082, + "flos": 20746653626880.0, + "grad_norm": 1.7499991393106977, + "language_loss": 0.6779902, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.75480068, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11214, + "time_per_iteration": 2.4924774169921875 + }, + { + "auxiliary_loss_clip": 0.06411138, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06271094, + "balance_loss_mlp": 0.01256907, + "epoch": 0.6742822786712761, + "flos": 37782366756480.0, + "grad_norm": 1.5348832786859372, + "language_loss": 0.73044717, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.8072269, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.0993042, + "step": 11215, + "time_per_iteration": 2.6919710636138916 + }, + { + "auxiliary_loss_clip": 0.06411563, + "auxiliary_loss_mlp": 0.01266913, + "balance_loss_clip": 0.06272543, + "balance_loss_mlp": 0.0125653, + "epoch": 0.6743424019239441, + "flos": 37272118118400.0, + "grad_norm": 1.6783781241391482, + "language_loss": 0.66716719, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.74395192, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1038208, + "step": 11216, + "time_per_iteration": 2.6457085609436035 + }, + { + "auxiliary_loss_clip": 0.06304459, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06248666, + "balance_loss_mlp": 0.012498, + "epoch": 0.674402525176612, + "flos": 66020152377600.0, + "grad_norm": 0.6583920548662452, + "language_loss": 0.56272531, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.63828307, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01516724, + "step": 11217, + "time_per_iteration": 3.2267727851867676 + }, + { + "auxiliary_loss_clip": 0.064037, + "auxiliary_loss_mlp": 0.01266203, + "balance_loss_clip": 0.06268451, + "balance_loss_mlp": 0.01255939, + "epoch": 0.67446264842928, + "flos": 26467143396480.0, + "grad_norm": 1.8797709757007424, + "language_loss": 0.74946856, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.82616764, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1026001, + "step": 11218, + "time_per_iteration": 2.5534565448760986 + }, + { + "auxiliary_loss_clip": 0.06412031, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01257189, + "epoch": 0.674522771681948, + "flos": 23739146017920.0, + "grad_norm": 1.571619211134611, + "language_loss": 0.6640991, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.74090284, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 11219, + "time_per_iteration": 2.5408942699432373 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_clip": 0.062702, + "balance_loss_mlp": 0.01257935, + "epoch": 0.674582894934616, + "flos": 24761320375680.0, + "grad_norm": 1.6133708722293332, + "language_loss": 0.75378865, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.83057231, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11090088, + "step": 11220, + "time_per_iteration": 2.556192398071289 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265502, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01254988, + "epoch": 0.6746430181872839, + "flos": 24833506268160.0, + "grad_norm": 1.5601512803843804, + "language_loss": 0.70583248, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.78254962, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10522461, + "step": 11221, + "time_per_iteration": 2.538742780685425 + }, + { + "auxiliary_loss_clip": 0.06408031, + "auxiliary_loss_mlp": 0.01264539, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01255032, + "epoch": 0.6747031414399519, + "flos": 16879167774720.0, + "grad_norm": 2.089456373953198, + "language_loss": 0.58824384, + "learning_rate": 1.010925256180498e-06, + "loss": 0.66496956, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09509277, + "step": 11222, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.06411393, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.0627331, + "balance_loss_mlp": 0.01255, + "epoch": 0.6747632646926198, + "flos": 22791715102080.0, + "grad_norm": 1.7403006489773343, + "language_loss": 0.76732111, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.84409571, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11065674, + "step": 11223, + "time_per_iteration": 2.499220132827759 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06270097, + "balance_loss_mlp": 0.01257854, + "epoch": 0.6748233879452878, + "flos": 20052020079360.0, + "grad_norm": 1.8418495567149014, + "language_loss": 0.75473273, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.83148926, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10064697, + "step": 11224, + "time_per_iteration": 2.5515925884246826 + }, + { + "auxiliary_loss_clip": 0.06404493, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06271124, + "balance_loss_mlp": 0.01254289, + "epoch": 0.6748835111979558, + "flos": 23009488663680.0, + "grad_norm": 1.6780430249692133, + "language_loss": 0.63333517, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.7100122, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0892334, + "step": 11225, + "time_per_iteration": 2.5058155059814453 + }, + { + "auxiliary_loss_clip": 0.0639993, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.012541, + "epoch": 0.6749436344506238, + "flos": 12201201705600.0, + "grad_norm": 1.7347966506914976, + "language_loss": 0.64211845, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.71875006, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09130859, + "step": 11226, + "time_per_iteration": 2.5148916244506836 + }, + { + "auxiliary_loss_clip": 0.06412213, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.06273121, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6750037577032918, + "flos": 11878356723840.0, + "grad_norm": 2.584638628864584, + "language_loss": 0.72339863, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.80018932, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10595703, + "step": 11227, + "time_per_iteration": 2.4601356983184814 + }, + { + "auxiliary_loss_clip": 0.06406709, + "auxiliary_loss_mlp": 0.01263943, + "balance_loss_clip": 0.06272034, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6750638809559597, + "flos": 17025342422400.0, + "grad_norm": 2.4759856374415077, + "language_loss": 0.7107985, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.78750503, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09851074, + "step": 11228, + "time_per_iteration": 3.974013566970825 + }, + { + "auxiliary_loss_clip": 0.0630679, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 0.06250891, + "balance_loss_mlp": 0.01260476, + "epoch": 0.6751240042086277, + "flos": 70972774531200.0, + "grad_norm": 0.7443387383646383, + "language_loss": 0.52992356, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.60561574, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01947021, + "step": 11229, + "time_per_iteration": 3.1949167251586914 + }, + { + "auxiliary_loss_clip": 0.06405008, + "auxiliary_loss_mlp": 0.01265887, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01256249, + "epoch": 0.6751841274612956, + "flos": 22681863999360.0, + "grad_norm": 2.9468842422151673, + "language_loss": 0.80432749, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.88103646, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09625244, + "step": 11230, + "time_per_iteration": 2.5213663578033447 + }, + { + "auxiliary_loss_clip": 0.06402741, + "auxiliary_loss_mlp": 0.01262658, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.0125333, + "epoch": 0.6752442507139637, + "flos": 21295112526720.0, + "grad_norm": 1.434197979050497, + "language_loss": 0.65974534, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.73639941, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09326172, + "step": 11231, + "time_per_iteration": 2.512449026107788 + }, + { + "auxiliary_loss_clip": 0.06417508, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01260396, + "epoch": 0.6753043739666316, + "flos": 28264480675200.0, + "grad_norm": 1.8511033060394846, + "language_loss": 0.66944438, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.7463361, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11260986, + "step": 11232, + "time_per_iteration": 2.5738155841827393 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01266971, + "balance_loss_clip": 0.0627114, + "balance_loss_mlp": 0.01257226, + "epoch": 0.6753644972192996, + "flos": 21366627586560.0, + "grad_norm": 1.674017645319507, + "language_loss": 0.72178799, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.79850119, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09735107, + "step": 11233, + "time_per_iteration": 2.5327250957489014 + }, + { + "auxiliary_loss_clip": 0.0640566, + "auxiliary_loss_mlp": 0.01263187, + "balance_loss_clip": 0.06269811, + "balance_loss_mlp": 0.01253579, + "epoch": 0.6754246204719675, + "flos": 26549224070400.0, + "grad_norm": 1.499022886883579, + "language_loss": 0.7716381, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.84832656, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09606934, + "step": 11234, + "time_per_iteration": 2.607923746109009 + }, + { + "auxiliary_loss_clip": 0.0640721, + "auxiliary_loss_mlp": 0.01266453, + "balance_loss_clip": 0.06272233, + "balance_loss_mlp": 0.01255224, + "epoch": 0.6754847437246355, + "flos": 25563750600960.0, + "grad_norm": 1.4543561341667586, + "language_loss": 0.75457549, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.83131212, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11230469, + "step": 11235, + "time_per_iteration": 2.614145278930664 + }, + { + "auxiliary_loss_clip": 0.06304054, + "auxiliary_loss_mlp": 0.01255487, + "balance_loss_clip": 0.06248432, + "balance_loss_mlp": 0.01253944, + "epoch": 0.6755448669773034, + "flos": 59530216492800.0, + "grad_norm": 0.7576799363115112, + "language_loss": 0.51220065, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.58779609, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.55712891, + "router_z_loss_mlp": 0.01538849, + "step": 11236, + "time_per_iteration": 3.079153060913086 + }, + { + "auxiliary_loss_clip": 0.06406215, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253931, + "epoch": 0.6756049902299714, + "flos": 23301209053440.0, + "grad_norm": 1.9064890293106858, + "language_loss": 0.75501907, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.83173215, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11181641, + "step": 11237, + "time_per_iteration": 2.591219186782837 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06271937, + "balance_loss_mlp": 0.01254441, + "epoch": 0.6756651134826394, + "flos": 31583256065280.0, + "grad_norm": 1.6435273747755843, + "language_loss": 0.77603805, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.85276806, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10876465, + "step": 11238, + "time_per_iteration": 4.004278659820557 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01255834, + "epoch": 0.6757252367353074, + "flos": 27279761892480.0, + "grad_norm": 1.8597789781280543, + "language_loss": 0.66815203, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.74492747, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10510254, + "step": 11239, + "time_per_iteration": 2.5872182846069336 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.01262458, + "balance_loss_clip": 0.06269912, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6757853599879754, + "flos": 16835548924800.0, + "grad_norm": 2.5961823999819074, + "language_loss": 0.8317802, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.90842378, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09533691, + "step": 11240, + "time_per_iteration": 2.4803500175476074 + }, + { + "auxiliary_loss_clip": 0.0641778, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06275319, + "balance_loss_mlp": 0.0125677, + "epoch": 0.6758454832406433, + "flos": 23226465611520.0, + "grad_norm": 1.9848396876019143, + "language_loss": 0.7422142, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.8190825, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12268066, + "step": 11241, + "time_per_iteration": 2.526111602783203 + }, + { + "auxiliary_loss_clip": 0.06407639, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01253833, + "epoch": 0.6759056064933113, + "flos": 16295098089600.0, + "grad_norm": 2.0527933437331343, + "language_loss": 0.80294073, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.87965673, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11242, + "time_per_iteration": 3.933396339416504 + }, + { + "auxiliary_loss_clip": 0.06405968, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06268989, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6759657297459792, + "flos": 25929543600000.0, + "grad_norm": 1.6744190932532899, + "language_loss": 0.72630656, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.80299854, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09490967, + "step": 11243, + "time_per_iteration": 2.514404535293579 + }, + { + "auxiliary_loss_clip": 0.06407295, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06272102, + "balance_loss_mlp": 0.01255306, + "epoch": 0.6760258529986473, + "flos": 23007140749440.0, + "grad_norm": 1.5647847453275578, + "language_loss": 0.72900802, + "learning_rate": 1.003487287162221e-06, + "loss": 0.80573308, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09906006, + "step": 11244, + "time_per_iteration": 2.5581138134002686 + }, + { + "auxiliary_loss_clip": 0.06405992, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06269385, + "balance_loss_mlp": 0.01255887, + "epoch": 0.6760859762513152, + "flos": 20965601145600.0, + "grad_norm": 4.977975302469332, + "language_loss": 0.85911322, + "learning_rate": 1.003149631190393e-06, + "loss": 0.93583632, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10437012, + "step": 11245, + "time_per_iteration": 2.485227584838867 + }, + { + "auxiliary_loss_clip": 0.06410875, + "auxiliary_loss_mlp": 0.01265401, + "balance_loss_clip": 0.06269195, + "balance_loss_mlp": 0.01254743, + "epoch": 0.6761460995039832, + "flos": 23629672258560.0, + "grad_norm": 1.7215460318487352, + "language_loss": 0.74000847, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.81677115, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10656738, + "step": 11246, + "time_per_iteration": 3.958766460418701 + }, + { + "auxiliary_loss_clip": 0.06405219, + "auxiliary_loss_mlp": 0.01262106, + "balance_loss_clip": 0.0626854, + "balance_loss_mlp": 0.01251896, + "epoch": 0.6762062227566511, + "flos": 20776101137280.0, + "grad_norm": 1.7168055925724897, + "language_loss": 0.87943971, + "learning_rate": 1.002474432661539e-06, + "loss": 0.95611298, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10205078, + "step": 11247, + "time_per_iteration": 2.586812973022461 + }, + { + "auxiliary_loss_clip": 0.06307312, + "auxiliary_loss_mlp": 0.01250807, + "balance_loss_clip": 0.06251501, + "balance_loss_mlp": 0.01249509, + "epoch": 0.6762663460093191, + "flos": 52836915219840.0, + "grad_norm": 0.8036403587512043, + "language_loss": 0.53957772, + "learning_rate": 1.002136890130115e-06, + "loss": 0.61515892, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01298523, + "step": 11248, + "time_per_iteration": 3.125509262084961 + }, + { + "auxiliary_loss_clip": 0.06402693, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06271251, + "balance_loss_mlp": 0.0125671, + "epoch": 0.676326469261987, + "flos": 23703115962240.0, + "grad_norm": 1.8151620805455404, + "language_loss": 0.73989308, + "learning_rate": 1.001799385437761e-06, + "loss": 0.81658345, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09625244, + "step": 11249, + "time_per_iteration": 2.6366310119628906 + }, + { + "auxiliary_loss_clip": 0.06411433, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270382, + "balance_loss_mlp": 0.01253372, + "epoch": 0.676386592514655, + "flos": 14068880087040.0, + "grad_norm": 2.152895610647936, + "language_loss": 0.74230921, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.81907284, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11566162, + "step": 11250, + "time_per_iteration": 2.458453416824341 + }, + { + "auxiliary_loss_clip": 0.06409556, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01256082, + "epoch": 0.676446715767323, + "flos": 20418441984000.0, + "grad_norm": 1.8697083640776453, + "language_loss": 0.74947959, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.82623816, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10217285, + "step": 11251, + "time_per_iteration": 2.568087100982666 + }, + { + "auxiliary_loss_clip": 0.06411379, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06275384, + "balance_loss_mlp": 0.012553, + "epoch": 0.676506839019991, + "flos": 21294651329280.0, + "grad_norm": 1.5310605534253319, + "language_loss": 0.69863832, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.77541435, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.109375, + "step": 11252, + "time_per_iteration": 2.541651725769043 + }, + { + "auxiliary_loss_clip": 0.06405863, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01254406, + "epoch": 0.676566962272659, + "flos": 29939849936640.0, + "grad_norm": 2.258609602750375, + "language_loss": 0.67108035, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.74778473, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10174561, + "step": 11253, + "time_per_iteration": 2.6143195629119873 + }, + { + "auxiliary_loss_clip": 0.06413913, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273795, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6766270855253269, + "flos": 17936994844800.0, + "grad_norm": 1.5309002898419535, + "language_loss": 0.77274752, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.84955955, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11560059, + "step": 11254, + "time_per_iteration": 2.4911346435546875 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01255439, + "epoch": 0.6766872087779949, + "flos": 23110283525760.0, + "grad_norm": 2.0449563599790874, + "language_loss": 0.71835911, + "learning_rate": 9.997751526206835e-07, + "loss": 0.79514015, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11340332, + "step": 11255, + "time_per_iteration": 2.5604913234710693 + }, + { + "auxiliary_loss_clip": 0.0641115, + "auxiliary_loss_mlp": 0.0126876, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01257376, + "epoch": 0.6767473320306628, + "flos": 26220257740800.0, + "grad_norm": 1.9457423412026578, + "language_loss": 0.75806832, + "learning_rate": 9.994379131600828e-07, + "loss": 0.83486742, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11383057, + "step": 11256, + "time_per_iteration": 2.5321764945983887 + }, + { + "auxiliary_loss_clip": 0.06411014, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06275011, + "balance_loss_mlp": 0.01255192, + "epoch": 0.6768074552833309, + "flos": 18374554465920.0, + "grad_norm": 2.012218384442974, + "language_loss": 0.65943599, + "learning_rate": 9.991007116408965e-07, + "loss": 0.73620474, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10662842, + "step": 11257, + "time_per_iteration": 2.502154588699341 + }, + { + "auxiliary_loss_clip": 0.06409346, + "auxiliary_loss_mlp": 0.01265352, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01255159, + "epoch": 0.6768675785359988, + "flos": 23046692676480.0, + "grad_norm": 1.399276257571999, + "language_loss": 0.75707698, + "learning_rate": 9.987635480759109e-07, + "loss": 0.83382392, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10186768, + "step": 11258, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01264608, + "balance_loss_clip": 0.06270992, + "balance_loss_mlp": 0.01254696, + "epoch": 0.6769277017886668, + "flos": 33044876760960.0, + "grad_norm": 1.5373580485699971, + "language_loss": 0.66955268, + "learning_rate": 9.984264224779127e-07, + "loss": 0.74622083, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 11259, + "time_per_iteration": 2.59914231300354 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6769878250413347, + "flos": 20854408377600.0, + "grad_norm": 2.0822099065238397, + "language_loss": 0.85664153, + "learning_rate": 9.980893348596839e-07, + "loss": 0.93340379, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10778809, + "step": 11260, + "time_per_iteration": 2.470489501953125 + }, + { + "auxiliary_loss_clip": 0.06415793, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06273471, + "balance_loss_mlp": 0.01253453, + "epoch": 0.6770479482940027, + "flos": 15601345009920.0, + "grad_norm": 2.2691636202149206, + "language_loss": 0.77703118, + "learning_rate": 9.977522852340081e-07, + "loss": 0.85384524, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12164307, + "step": 11261, + "time_per_iteration": 2.5071561336517334 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256013, + "epoch": 0.6771080715466706, + "flos": 18626345585280.0, + "grad_norm": 1.5719770677718063, + "language_loss": 0.87847519, + "learning_rate": 9.97415273613666e-07, + "loss": 0.95525038, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1126709, + "step": 11262, + "time_per_iteration": 2.4645345211029053 + }, + { + "auxiliary_loss_clip": 0.06413369, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06273858, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6771681947993387, + "flos": 12500427035520.0, + "grad_norm": 1.7525589115394145, + "language_loss": 0.74310911, + "learning_rate": 9.97078300011439e-07, + "loss": 0.81989402, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10754395, + "step": 11263, + "time_per_iteration": 2.6041438579559326 + }, + { + "auxiliary_loss_clip": 0.06415032, + "auxiliary_loss_mlp": 0.01264304, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01252406, + "epoch": 0.6772283180520066, + "flos": 22243549691520.0, + "grad_norm": 2.1938876589125544, + "language_loss": 0.68432045, + "learning_rate": 9.967413644401016e-07, + "loss": 0.76111376, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11901855, + "step": 11264, + "time_per_iteration": 2.5002152919769287 + }, + { + "auxiliary_loss_clip": 0.0641073, + "auxiliary_loss_mlp": 0.01264807, + "balance_loss_clip": 0.062745, + "balance_loss_mlp": 0.01254006, + "epoch": 0.6772884413046746, + "flos": 16148588025600.0, + "grad_norm": 1.8587455254700258, + "language_loss": 0.73335183, + "learning_rate": 9.964044669124324e-07, + "loss": 0.81010723, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10797119, + "step": 11265, + "time_per_iteration": 2.469163179397583 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01269883, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01258969, + "epoch": 0.6773485645573426, + "flos": 19141835103360.0, + "grad_norm": 1.6254501454395083, + "language_loss": 0.61922127, + "learning_rate": 9.96067607441207e-07, + "loss": 0.69599104, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10913086, + "step": 11266, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01258829, + "epoch": 0.6774086878100105, + "flos": 14142114155520.0, + "grad_norm": 1.8179552610473837, + "language_loss": 0.70953995, + "learning_rate": 9.957307860391976e-07, + "loss": 0.78632545, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.1038208, + "step": 11267, + "time_per_iteration": 2.517019033432007 + }, + { + "auxiliary_loss_clip": 0.06410597, + "auxiliary_loss_mlp": 0.01264315, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01254009, + "epoch": 0.6774688110626785, + "flos": 22203075369600.0, + "grad_norm": 4.7399438404850525, + "language_loss": 0.71134216, + "learning_rate": 9.953940027191785e-07, + "loss": 0.7880913, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10314941, + "step": 11268, + "time_per_iteration": 3.937225103378296 + }, + { + "auxiliary_loss_clip": 0.06412301, + "auxiliary_loss_mlp": 0.01268549, + "balance_loss_clip": 0.06274435, + "balance_loss_mlp": 0.0125726, + "epoch": 0.6775289343153464, + "flos": 23046734603520.0, + "grad_norm": 1.4295252958840357, + "language_loss": 0.76893616, + "learning_rate": 9.950572574939194e-07, + "loss": 0.84574473, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11291504, + "step": 11269, + "time_per_iteration": 2.5114824771881104 + }, + { + "auxiliary_loss_clip": 0.06414156, + "auxiliary_loss_mlp": 0.01271853, + "balance_loss_clip": 0.06274021, + "balance_loss_mlp": 0.01259879, + "epoch": 0.6775890575680145, + "flos": 18298930556160.0, + "grad_norm": 1.7033288836702745, + "language_loss": 0.74101746, + "learning_rate": 9.94720550376189e-07, + "loss": 0.81787759, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11968994, + "step": 11270, + "time_per_iteration": 2.4997193813323975 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06274433, + "balance_loss_mlp": 0.01254504, + "epoch": 0.6776491808206824, + "flos": 25343251781760.0, + "grad_norm": 1.5419173604084193, + "language_loss": 0.72974074, + "learning_rate": 9.94383881378756e-07, + "loss": 0.80650789, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10668945, + "step": 11271, + "time_per_iteration": 2.5310120582580566 + }, + { + "auxiliary_loss_clip": 0.06411068, + "auxiliary_loss_mlp": 0.01265404, + "balance_loss_clip": 0.06274058, + "balance_loss_mlp": 0.01254902, + "epoch": 0.6777093040733504, + "flos": 26034908509440.0, + "grad_norm": 1.6287619781350626, + "language_loss": 0.6787045, + "learning_rate": 9.94047250514387e-07, + "loss": 0.75546926, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10498047, + "step": 11272, + "time_per_iteration": 2.556326389312744 + }, + { + "auxiliary_loss_clip": 0.06416756, + "auxiliary_loss_mlp": 0.01268859, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01256723, + "epoch": 0.6777694273260183, + "flos": 18009306518400.0, + "grad_norm": 2.0957855047238865, + "language_loss": 0.73988581, + "learning_rate": 9.937106577958481e-07, + "loss": 0.81674194, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.121521, + "step": 11273, + "time_per_iteration": 2.4888038635253906 + }, + { + "auxiliary_loss_clip": 0.0640964, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06273794, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6778295505786863, + "flos": 23447886825600.0, + "grad_norm": 1.597740332843532, + "language_loss": 0.70512903, + "learning_rate": 9.933741032359015e-07, + "loss": 0.78189635, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11022949, + "step": 11274, + "time_per_iteration": 2.5328569412231445 + }, + { + "auxiliary_loss_clip": 0.06408958, + "auxiliary_loss_mlp": 0.01270481, + "balance_loss_clip": 0.06268886, + "balance_loss_mlp": 0.01259413, + "epoch": 0.6778896738313542, + "flos": 19104337601280.0, + "grad_norm": 1.549823334564571, + "language_loss": 0.65894532, + "learning_rate": 9.930375868473093e-07, + "loss": 0.73573971, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1105957, + "step": 11275, + "time_per_iteration": 2.511591672897339 + }, + { + "auxiliary_loss_clip": 0.06410493, + "auxiliary_loss_mlp": 0.01266749, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.01256801, + "epoch": 0.6779497970840223, + "flos": 26111077470720.0, + "grad_norm": 1.6541358125051857, + "language_loss": 0.72680271, + "learning_rate": 9.927011086428335e-07, + "loss": 0.80357516, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.0994873, + "step": 11276, + "time_per_iteration": 2.5891473293304443 + }, + { + "auxiliary_loss_clip": 0.06409149, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273319, + "balance_loss_mlp": 0.01255245, + "epoch": 0.6780099203366902, + "flos": 19725359736960.0, + "grad_norm": 1.5650058182326292, + "language_loss": 0.76883596, + "learning_rate": 9.923646686352317e-07, + "loss": 0.84558642, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10650635, + "step": 11277, + "time_per_iteration": 3.915508985519409 + }, + { + "auxiliary_loss_clip": 0.06416161, + "auxiliary_loss_mlp": 0.01266536, + "balance_loss_clip": 0.06275125, + "balance_loss_mlp": 0.01254633, + "epoch": 0.6780700435893582, + "flos": 18218946234240.0, + "grad_norm": 2.711703251949157, + "language_loss": 0.83725727, + "learning_rate": 9.920282668372627e-07, + "loss": 0.91408426, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11907959, + "step": 11278, + "time_per_iteration": 2.4728851318359375 + }, + { + "auxiliary_loss_clip": 0.06408397, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01259862, + "epoch": 0.6781301668420262, + "flos": 25383600322560.0, + "grad_norm": 1.4808013348463376, + "language_loss": 0.70247126, + "learning_rate": 9.916919032616844e-07, + "loss": 0.77925724, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10339355, + "step": 11279, + "time_per_iteration": 2.5876686573028564 + }, + { + "auxiliary_loss_clip": 0.06411046, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01254027, + "epoch": 0.6781902900946941, + "flos": 24026589849600.0, + "grad_norm": 1.7835400791989957, + "language_loss": 0.74185818, + "learning_rate": 9.913555779212485e-07, + "loss": 0.81862831, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1194458, + "step": 11280, + "time_per_iteration": 2.558945655822754 + }, + { + "auxiliary_loss_clip": 0.06412832, + "auxiliary_loss_mlp": 0.01263795, + "balance_loss_clip": 0.06270506, + "balance_loss_mlp": 0.01251844, + "epoch": 0.6782504133473621, + "flos": 19652964209280.0, + "grad_norm": 1.818075538813212, + "language_loss": 0.70597506, + "learning_rate": 9.910192908287104e-07, + "loss": 0.78274131, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11950684, + "step": 11281, + "time_per_iteration": 2.5192151069641113 + }, + { + "auxiliary_loss_clip": 0.06408101, + "auxiliary_loss_mlp": 0.01268091, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01257821, + "epoch": 0.67831053660003, + "flos": 24939080812800.0, + "grad_norm": 1.5294707212527767, + "language_loss": 0.63880533, + "learning_rate": 9.906830419968217e-07, + "loss": 0.71556723, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1026001, + "step": 11282, + "time_per_iteration": 4.0389556884765625 + }, + { + "auxiliary_loss_clip": 0.06416775, + "auxiliary_loss_mlp": 0.01269152, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.01257434, + "epoch": 0.6783706598526981, + "flos": 31215785984640.0, + "grad_norm": 1.5661846366283017, + "language_loss": 0.74472761, + "learning_rate": 9.90346831438334e-07, + "loss": 0.82158691, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11712646, + "step": 11283, + "time_per_iteration": 2.5889575481414795 + }, + { + "auxiliary_loss_clip": 0.06409109, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06271229, + "balance_loss_mlp": 0.01255179, + "epoch": 0.678430783105366, + "flos": 35449526033280.0, + "grad_norm": 1.6303319808688523, + "language_loss": 0.57121617, + "learning_rate": 9.900106591659948e-07, + "loss": 0.64796078, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10180664, + "step": 11284, + "time_per_iteration": 2.622241258621216 + }, + { + "auxiliary_loss_clip": 0.0640896, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01253719, + "epoch": 0.678490906358034, + "flos": 14434044180480.0, + "grad_norm": 1.7585312003136033, + "language_loss": 0.75540352, + "learning_rate": 9.896745251925535e-07, + "loss": 0.83213127, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10095215, + "step": 11285, + "time_per_iteration": 3.914513111114502 + }, + { + "auxiliary_loss_clip": 0.06408092, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06274541, + "balance_loss_mlp": 0.01254355, + "epoch": 0.6785510296107019, + "flos": 24317262063360.0, + "grad_norm": 1.6087593577428982, + "language_loss": 0.66518104, + "learning_rate": 9.893384295307557e-07, + "loss": 0.74190903, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10351562, + "step": 11286, + "time_per_iteration": 2.5443532466888428 + }, + { + "auxiliary_loss_clip": 0.06411726, + "auxiliary_loss_mlp": 0.01266212, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6786111528633699, + "flos": 26984142288000.0, + "grad_norm": 2.2563712255718453, + "language_loss": 0.52888298, + "learning_rate": 9.890023721933447e-07, + "loss": 0.60566235, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11663818, + "step": 11287, + "time_per_iteration": 2.5215566158294678 + }, + { + "auxiliary_loss_clip": 0.06408818, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01255265, + "epoch": 0.6786712761160378, + "flos": 24324641222400.0, + "grad_norm": 1.4827043233914352, + "language_loss": 0.7744714, + "learning_rate": 9.886663531930655e-07, + "loss": 0.85121405, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10180664, + "step": 11288, + "time_per_iteration": 2.5451719760894775 + }, + { + "auxiliary_loss_clip": 0.06414543, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.0125993, + "epoch": 0.6787313993687059, + "flos": 22937176990080.0, + "grad_norm": 1.9021636809125866, + "language_loss": 0.73458755, + "learning_rate": 9.883303725426593e-07, + "loss": 0.81144106, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10882568, + "step": 11289, + "time_per_iteration": 2.524062395095825 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.01257795, + "epoch": 0.6787915226213738, + "flos": 26875423215360.0, + "grad_norm": 1.3961935649800772, + "language_loss": 0.80240023, + "learning_rate": 9.879944302548682e-07, + "loss": 0.87918484, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10852051, + "step": 11290, + "time_per_iteration": 2.563781499862671 + }, + { + "auxiliary_loss_clip": 0.06406706, + "auxiliary_loss_mlp": 0.01270194, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01260395, + "epoch": 0.6788516458740418, + "flos": 20014648358400.0, + "grad_norm": 1.3943952846011585, + "language_loss": 0.75320244, + "learning_rate": 9.87658526342428e-07, + "loss": 0.82997143, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09802246, + "step": 11291, + "time_per_iteration": 2.4833710193634033 + }, + { + "auxiliary_loss_clip": 0.06409583, + "auxiliary_loss_mlp": 0.01265199, + "balance_loss_clip": 0.06270351, + "balance_loss_mlp": 0.01254709, + "epoch": 0.6789117691267098, + "flos": 28734045356160.0, + "grad_norm": 1.6032413484745063, + "language_loss": 0.75235522, + "learning_rate": 9.873226608180785e-07, + "loss": 0.82910305, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10491943, + "step": 11292, + "time_per_iteration": 2.5987610816955566 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.01271571, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01261235, + "epoch": 0.6789718923793777, + "flos": 23410053907200.0, + "grad_norm": 1.8128590339737811, + "language_loss": 0.84362906, + "learning_rate": 9.869868336945556e-07, + "loss": 0.92041528, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10345459, + "step": 11293, + "time_per_iteration": 2.6490092277526855 + }, + { + "auxiliary_loss_clip": 0.06418362, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06273804, + "balance_loss_mlp": 0.01255661, + "epoch": 0.6790320156320457, + "flos": 20455100945280.0, + "grad_norm": 2.3830710729233937, + "language_loss": 0.79575551, + "learning_rate": 9.866510449845929e-07, + "loss": 0.87260431, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10852051, + "step": 11294, + "time_per_iteration": 2.540187120437622 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6790921388847136, + "flos": 24173519184000.0, + "grad_norm": 1.663290513792591, + "language_loss": 0.79323423, + "learning_rate": 9.86315294700924e-07, + "loss": 0.87000465, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09985352, + "step": 11295, + "time_per_iteration": 2.539522171020508 + }, + { + "auxiliary_loss_clip": 0.06403016, + "auxiliary_loss_mlp": 0.01270622, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01261312, + "epoch": 0.6791522621373817, + "flos": 21914541434880.0, + "grad_norm": 1.9398184157871654, + "language_loss": 0.71742594, + "learning_rate": 9.859795828562823e-07, + "loss": 0.79416239, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09313965, + "step": 11296, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.06406362, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6792123853900496, + "flos": 24833380487040.0, + "grad_norm": 1.7008493408846614, + "language_loss": 0.70970011, + "learning_rate": 9.856439094633949e-07, + "loss": 0.78642553, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 11297, + "time_per_iteration": 2.5342774391174316 + }, + { + "auxiliary_loss_clip": 0.06413988, + "auxiliary_loss_mlp": 0.01268754, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01257691, + "epoch": 0.6792725086427176, + "flos": 17571998459520.0, + "grad_norm": 2.072165205112126, + "language_loss": 0.66610634, + "learning_rate": 9.853082745349918e-07, + "loss": 0.74293375, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11071777, + "step": 11298, + "time_per_iteration": 2.5330231189727783 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01265536, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6793326318953855, + "flos": 26948908846080.0, + "grad_norm": 1.6501656577542423, + "language_loss": 0.71810848, + "learning_rate": 9.84972678083801e-07, + "loss": 0.79485255, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09771729, + "step": 11299, + "time_per_iteration": 2.547666072845459 + }, + { + "auxiliary_loss_clip": 0.06407908, + "auxiliary_loss_mlp": 0.01269253, + "balance_loss_clip": 0.06269622, + "balance_loss_mlp": 0.01258196, + "epoch": 0.6793927551480535, + "flos": 24325479763200.0, + "grad_norm": 1.2577197776351332, + "language_loss": 0.77542967, + "learning_rate": 9.846371201225488e-07, + "loss": 0.85220122, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1105957, + "step": 11300, + "time_per_iteration": 2.568537473678589 + }, + { + "auxiliary_loss_clip": 0.06409447, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01256847, + "epoch": 0.6794528784007214, + "flos": 11441300227200.0, + "grad_norm": 1.9915071500414414, + "language_loss": 0.63348699, + "learning_rate": 9.843016006639577e-07, + "loss": 0.71025515, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10534668, + "step": 11301, + "time_per_iteration": 2.4696924686431885 + }, + { + "auxiliary_loss_clip": 0.06409229, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01256772, + "epoch": 0.6795130016533895, + "flos": 25236922550400.0, + "grad_norm": 1.7173390721705748, + "language_loss": 0.82948458, + "learning_rate": 9.839661197207525e-07, + "loss": 0.90624553, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10107422, + "step": 11302, + "time_per_iteration": 2.598444938659668 + }, + { + "auxiliary_loss_clip": 0.0641208, + "auxiliary_loss_mlp": 0.01264081, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01254121, + "epoch": 0.6795731249060574, + "flos": 18302326646400.0, + "grad_norm": 1.7779256028698032, + "language_loss": 0.69851995, + "learning_rate": 9.83630677305654e-07, + "loss": 0.77528167, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.09954834, + "step": 11303, + "time_per_iteration": 2.4852330684661865 + }, + { + "auxiliary_loss_clip": 0.06413473, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.0125336, + "epoch": 0.6796332481587254, + "flos": 20306159112960.0, + "grad_norm": 1.8204218049780263, + "language_loss": 0.70597726, + "learning_rate": 9.832952734313813e-07, + "loss": 0.7827546, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10900879, + "step": 11304, + "time_per_iteration": 2.5139074325561523 + }, + { + "auxiliary_loss_clip": 0.0641301, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257794, + "epoch": 0.6796933714113934, + "flos": 23593642202880.0, + "grad_norm": 2.4376362863510046, + "language_loss": 0.72319949, + "learning_rate": 9.829599081106536e-07, + "loss": 0.80001682, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.109375, + "step": 11305, + "time_per_iteration": 2.522174119949341 + }, + { + "auxiliary_loss_clip": 0.06407507, + "auxiliary_loss_mlp": 0.01264269, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01252986, + "epoch": 0.6797534946640613, + "flos": 27126291939840.0, + "grad_norm": 2.8826024363137535, + "language_loss": 0.66289663, + "learning_rate": 9.826245813561882e-07, + "loss": 0.73961437, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11297607, + "step": 11306, + "time_per_iteration": 2.5523674488067627 + }, + { + "auxiliary_loss_clip": 0.06408583, + "auxiliary_loss_mlp": 0.01265584, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01255547, + "epoch": 0.6798136179167293, + "flos": 22133992078080.0, + "grad_norm": 1.614397517334369, + "language_loss": 0.80464542, + "learning_rate": 9.822892931807021e-07, + "loss": 0.88138705, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 11307, + "time_per_iteration": 3.9510881900787354 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6798737411693972, + "flos": 17493565438080.0, + "grad_norm": 1.503954365849396, + "language_loss": 0.89141631, + "learning_rate": 9.819540435969066e-07, + "loss": 0.96809489, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10638428, + "step": 11308, + "time_per_iteration": 2.454899549484253 + }, + { + "auxiliary_loss_clip": 0.06406927, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06268145, + "balance_loss_mlp": 0.01253792, + "epoch": 0.6799338644220653, + "flos": 22898715166080.0, + "grad_norm": 1.9892982746856287, + "language_loss": 0.71669519, + "learning_rate": 9.816188326175154e-07, + "loss": 0.79341042, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1081543, + "step": 11309, + "time_per_iteration": 2.537949562072754 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01269522, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01259312, + "epoch": 0.6799939876747332, + "flos": 23186284778880.0, + "grad_norm": 2.168983976078807, + "language_loss": 0.84444106, + "learning_rate": 9.812836602552411e-07, + "loss": 0.92120677, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10217285, + "step": 11310, + "time_per_iteration": 2.5093727111816406 + }, + { + "auxiliary_loss_clip": 0.06401814, + "auxiliary_loss_mlp": 0.01262918, + "balance_loss_clip": 0.06269856, + "balance_loss_mlp": 0.0125331, + "epoch": 0.6800541109274012, + "flos": 19505951020800.0, + "grad_norm": 1.936116503903549, + "language_loss": 0.83367699, + "learning_rate": 9.80948526522792e-07, + "loss": 0.91032434, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09613037, + "step": 11311, + "time_per_iteration": 2.5046095848083496 + }, + { + "auxiliary_loss_clip": 0.064105, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06267536, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6801142341800691, + "flos": 22284946408320.0, + "grad_norm": 1.5408548920294685, + "language_loss": 0.7658841, + "learning_rate": 9.806134314328767e-07, + "loss": 0.84268516, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.12133789, + "step": 11312, + "time_per_iteration": 2.5174195766448975 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01252687, + "balance_loss_clip": 0.06255079, + "balance_loss_mlp": 0.01251411, + "epoch": 0.6801743574327371, + "flos": 68734439614080.0, + "grad_norm": 0.6438614608961274, + "language_loss": 0.57270527, + "learning_rate": 9.802783749982038e-07, + "loss": 0.64834106, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01276398, + "step": 11313, + "time_per_iteration": 3.2520179748535156 + }, + { + "auxiliary_loss_clip": 0.06408104, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06268254, + "balance_loss_mlp": 0.0125483, + "epoch": 0.680234480685405, + "flos": 29468146976640.0, + "grad_norm": 1.6190653949052565, + "language_loss": 0.69341791, + "learning_rate": 9.799433572314754e-07, + "loss": 0.77015042, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10327148, + "step": 11314, + "time_per_iteration": 2.5535359382629395 + }, + { + "auxiliary_loss_clip": 0.06404889, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06268796, + "balance_loss_mlp": 0.01257731, + "epoch": 0.6802946039380731, + "flos": 15921045463680.0, + "grad_norm": 1.9728888269672866, + "language_loss": 0.81508797, + "learning_rate": 9.796083781453972e-07, + "loss": 0.89181113, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.0970459, + "step": 11315, + "time_per_iteration": 2.5169835090637207 + }, + { + "auxiliary_loss_clip": 0.06405143, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06267972, + "balance_loss_mlp": 0.01253723, + "epoch": 0.680354727190741, + "flos": 22025314932480.0, + "grad_norm": 1.6675934827220065, + "language_loss": 0.70277983, + "learning_rate": 9.792734377526718e-07, + "loss": 0.77947348, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.1048584, + "step": 11316, + "time_per_iteration": 2.4984679222106934 + }, + { + "auxiliary_loss_clip": 0.06405444, + "auxiliary_loss_mlp": 0.01268676, + "balance_loss_clip": 0.06269848, + "balance_loss_mlp": 0.01258478, + "epoch": 0.680414850443409, + "flos": 18447285409920.0, + "grad_norm": 2.1628292849287267, + "language_loss": 0.67277592, + "learning_rate": 9.789385360660003e-07, + "loss": 0.74951708, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11317, + "time_per_iteration": 3.912996292114258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01266936, + "balance_loss_clip": 0.06273043, + "balance_loss_mlp": 0.01256666, + "epoch": 0.680474973696077, + "flos": 26365677701760.0, + "grad_norm": 1.4339432029892007, + "language_loss": 0.74834979, + "learning_rate": 9.78603673098082e-07, + "loss": 0.82514405, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10266113, + "step": 11318, + "time_per_iteration": 2.613416910171509 + }, + { + "auxiliary_loss_clip": 0.06405453, + "auxiliary_loss_mlp": 0.01261508, + "balance_loss_clip": 0.06270547, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6805350969487449, + "flos": 18339069461760.0, + "grad_norm": 1.741381394136802, + "language_loss": 0.6821155, + "learning_rate": 9.782688488616143e-07, + "loss": 0.75878513, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09008789, + "step": 11319, + "time_per_iteration": 2.4735772609710693 + }, + { + "auxiliary_loss_clip": 0.06402999, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06269106, + "balance_loss_mlp": 0.01257354, + "epoch": 0.6805952202014129, + "flos": 19943552568960.0, + "grad_norm": 1.589394100312008, + "language_loss": 0.77030569, + "learning_rate": 9.779340633692945e-07, + "loss": 0.84700847, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09924316, + "step": 11320, + "time_per_iteration": 2.5447402000427246 + }, + { + "auxiliary_loss_clip": 0.06406876, + "auxiliary_loss_mlp": 0.01264766, + "balance_loss_clip": 0.06270229, + "balance_loss_mlp": 0.01254341, + "epoch": 0.6806553434540809, + "flos": 25230633494400.0, + "grad_norm": 1.8063346564210203, + "language_loss": 0.75357598, + "learning_rate": 9.77599316633817e-07, + "loss": 0.8302924, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 11321, + "time_per_iteration": 3.959946393966675 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264729, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6807154667067489, + "flos": 17791407175680.0, + "grad_norm": 2.0443838016403495, + "language_loss": 0.73213184, + "learning_rate": 9.772646086678758e-07, + "loss": 0.80887616, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10461426, + "step": 11322, + "time_per_iteration": 2.508143663406372 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270386, + "balance_loss_mlp": 0.01253517, + "epoch": 0.6807755899594168, + "flos": 22206387605760.0, + "grad_norm": 1.7755779600619086, + "language_loss": 0.78547817, + "learning_rate": 9.769299394841638e-07, + "loss": 0.86222905, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11352539, + "step": 11323, + "time_per_iteration": 2.5345656871795654 + }, + { + "auxiliary_loss_clip": 0.06315179, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_clip": 0.06259721, + "balance_loss_mlp": 0.0125015, + "epoch": 0.6808357132120848, + "flos": 68648878995840.0, + "grad_norm": 0.7384546914137473, + "language_loss": 0.57113785, + "learning_rate": 9.765953090953714e-07, + "loss": 0.64680356, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0124054, + "step": 11324, + "time_per_iteration": 2.9890177249908447 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01254301, + "epoch": 0.6808958364647527, + "flos": 23850380712960.0, + "grad_norm": 1.8768737712077719, + "language_loss": 0.68368208, + "learning_rate": 9.76260717514186e-07, + "loss": 0.76043886, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11193848, + "step": 11325, + "time_per_iteration": 4.024105072021484 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.0126769, + "balance_loss_clip": 0.06269176, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6809559597174207, + "flos": 17717376493440.0, + "grad_norm": 2.1078464153023924, + "language_loss": 0.70419264, + "learning_rate": 9.759261647532974e-07, + "loss": 0.78097641, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10986328, + "step": 11326, + "time_per_iteration": 2.484449625015259 + }, + { + "auxiliary_loss_clip": 0.06407395, + "auxiliary_loss_mlp": 0.01261696, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01251551, + "epoch": 0.6810160829700886, + "flos": 22498443411840.0, + "grad_norm": 1.638017241748174, + "language_loss": 0.72914612, + "learning_rate": 9.75591650825392e-07, + "loss": 0.80583698, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10150146, + "step": 11327, + "time_per_iteration": 2.502293586730957 + }, + { + "auxiliary_loss_clip": 0.06405802, + "auxiliary_loss_mlp": 0.01266544, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01255839, + "epoch": 0.6810762062227567, + "flos": 16837854912000.0, + "grad_norm": 1.827919270381089, + "language_loss": 0.77294552, + "learning_rate": 9.752571757431526e-07, + "loss": 0.84966898, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10705566, + "step": 11328, + "time_per_iteration": 2.469923734664917 + }, + { + "auxiliary_loss_clip": 0.06412201, + "auxiliary_loss_mlp": 0.01264628, + "balance_loss_clip": 0.0627179, + "balance_loss_mlp": 0.01253941, + "epoch": 0.6811363294754246, + "flos": 12719751897600.0, + "grad_norm": 1.8250307958699987, + "language_loss": 0.64754045, + "learning_rate": 9.74922739519265e-07, + "loss": 0.72430873, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10681152, + "step": 11329, + "time_per_iteration": 2.5292539596557617 + }, + { + "auxiliary_loss_clip": 0.06409349, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01254182, + "epoch": 0.6811964527280926, + "flos": 17717669982720.0, + "grad_norm": 1.8641198647355242, + "language_loss": 0.79316872, + "learning_rate": 9.745883421664096e-07, + "loss": 0.86991036, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10638428, + "step": 11330, + "time_per_iteration": 2.4813790321350098 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01264709, + "balance_loss_clip": 0.06272174, + "balance_loss_mlp": 0.0125376, + "epoch": 0.6812565759807605, + "flos": 24870416791680.0, + "grad_norm": 2.109092836267495, + "language_loss": 0.64502859, + "learning_rate": 9.742539836972665e-07, + "loss": 0.72177964, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10943604, + "step": 11331, + "time_per_iteration": 2.6124520301818848 + }, + { + "auxiliary_loss_clip": 0.06407228, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06270872, + "balance_loss_mlp": 0.01254666, + "epoch": 0.6813166992334285, + "flos": 17171852486400.0, + "grad_norm": 1.5406157015161637, + "language_loss": 0.72821605, + "learning_rate": 9.739196641245148e-07, + "loss": 0.80493855, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.1036377, + "step": 11332, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.06412952, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6813768224860965, + "flos": 18849527735040.0, + "grad_norm": 2.149720533461842, + "language_loss": 0.74508882, + "learning_rate": 9.735853834608326e-07, + "loss": 0.82189173, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10638428, + "step": 11333, + "time_per_iteration": 2.5427186489105225 + }, + { + "auxiliary_loss_clip": 0.06414136, + "auxiliary_loss_mlp": 0.01267127, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01256786, + "epoch": 0.6814369457387645, + "flos": 24539228328960.0, + "grad_norm": 1.3823548887580743, + "language_loss": 0.72367668, + "learning_rate": 9.732511417188963e-07, + "loss": 0.80048931, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10345459, + "step": 11334, + "time_per_iteration": 2.537958860397339 + }, + { + "auxiliary_loss_clip": 0.06405447, + "auxiliary_loss_mlp": 0.01266429, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.0125607, + "epoch": 0.6814970689914325, + "flos": 18228799088640.0, + "grad_norm": 1.6460074116702026, + "language_loss": 0.86505604, + "learning_rate": 9.729169389113791e-07, + "loss": 0.94177485, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10357666, + "step": 11335, + "time_per_iteration": 2.5018861293792725 + }, + { + "auxiliary_loss_clip": 0.06401964, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06271435, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6815571922441004, + "flos": 25235874374400.0, + "grad_norm": 1.6438782420335836, + "language_loss": 0.81760287, + "learning_rate": 9.725827750509542e-07, + "loss": 0.89428031, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10125732, + "step": 11336, + "time_per_iteration": 2.5359947681427 + }, + { + "auxiliary_loss_clip": 0.06403621, + "auxiliary_loss_mlp": 0.01268492, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.0125818, + "epoch": 0.6816173154967684, + "flos": 19460864724480.0, + "grad_norm": 1.9165693219649298, + "language_loss": 0.82064402, + "learning_rate": 9.72248650150294e-07, + "loss": 0.89736515, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10321045, + "step": 11337, + "time_per_iteration": 2.511289119720459 + }, + { + "auxiliary_loss_clip": 0.06404516, + "auxiliary_loss_mlp": 0.01264446, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01254462, + "epoch": 0.6816774387494363, + "flos": 17937288334080.0, + "grad_norm": 1.560533910826156, + "language_loss": 0.73002589, + "learning_rate": 9.719145642220673e-07, + "loss": 0.80671549, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09979248, + "step": 11338, + "time_per_iteration": 2.511681318283081 + }, + { + "auxiliary_loss_clip": 0.06413732, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06275684, + "balance_loss_mlp": 0.01254337, + "epoch": 0.6817375620021043, + "flos": 22238937717120.0, + "grad_norm": 1.4240412111564371, + "language_loss": 0.77416432, + "learning_rate": 9.715805172789435e-07, + "loss": 0.8509506, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10559082, + "step": 11339, + "time_per_iteration": 2.5428354740142822 + }, + { + "auxiliary_loss_clip": 0.06410687, + "auxiliary_loss_mlp": 0.01264953, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.012542, + "epoch": 0.6817976852547722, + "flos": 25381462043520.0, + "grad_norm": 1.7944902461652392, + "language_loss": 0.71041632, + "learning_rate": 9.712465093335901e-07, + "loss": 0.78717273, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10748291, + "step": 11340, + "time_per_iteration": 2.550901412963867 + }, + { + "auxiliary_loss_clip": 0.06413396, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01256725, + "epoch": 0.6818578085074403, + "flos": 22271068558080.0, + "grad_norm": 2.180704981107058, + "language_loss": 0.84409666, + "learning_rate": 9.709125403986722e-07, + "loss": 0.92090219, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10437012, + "step": 11341, + "time_per_iteration": 2.5165159702301025 + }, + { + "auxiliary_loss_clip": 0.06414375, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06275092, + "balance_loss_mlp": 0.01255831, + "epoch": 0.6819179317601082, + "flos": 19324249441920.0, + "grad_norm": 1.5598647366733476, + "language_loss": 0.68810844, + "learning_rate": 9.705786104868531e-07, + "loss": 0.76491725, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10681152, + "step": 11342, + "time_per_iteration": 2.593763589859009 + }, + { + "auxiliary_loss_clip": 0.06407441, + "auxiliary_loss_mlp": 0.01261474, + "balance_loss_clip": 0.0627171, + "balance_loss_mlp": 0.01251342, + "epoch": 0.6819780550127762, + "flos": 21110224492800.0, + "grad_norm": 1.6656061272859015, + "language_loss": 0.74818993, + "learning_rate": 9.702447196107963e-07, + "loss": 0.82487905, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11343, + "time_per_iteration": 2.524341344833374 + }, + { + "auxiliary_loss_clip": 0.06415273, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277119, + "balance_loss_mlp": 0.01256055, + "epoch": 0.6820381782654441, + "flos": 29724214654080.0, + "grad_norm": 1.6102730777044594, + "language_loss": 0.80077457, + "learning_rate": 9.699108677831639e-07, + "loss": 0.87759268, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1048584, + "step": 11344, + "time_per_iteration": 2.559631586074829 + }, + { + "auxiliary_loss_clip": 0.06412022, + "auxiliary_loss_mlp": 0.01263183, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01252747, + "epoch": 0.6820983015181121, + "flos": 29249870290560.0, + "grad_norm": 1.8689488071291331, + "language_loss": 0.66530693, + "learning_rate": 9.695770550166136e-07, + "loss": 0.74205899, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10424805, + "step": 11345, + "time_per_iteration": 2.588878870010376 + }, + { + "auxiliary_loss_clip": 0.06416089, + "auxiliary_loss_mlp": 0.01264993, + "balance_loss_clip": 0.06275414, + "balance_loss_mlp": 0.01254538, + "epoch": 0.6821584247707801, + "flos": 18876375768960.0, + "grad_norm": 2.261790357681116, + "language_loss": 0.65540516, + "learning_rate": 9.692432813238054e-07, + "loss": 0.732216, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10461426, + "step": 11346, + "time_per_iteration": 2.4776885509490967 + }, + { + "auxiliary_loss_clip": 0.06415972, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6822185480234481, + "flos": 21330974874240.0, + "grad_norm": 1.434084459819624, + "language_loss": 0.7886349, + "learning_rate": 9.689095467173952e-07, + "loss": 0.86543655, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10632324, + "step": 11347, + "time_per_iteration": 3.919304132461548 + }, + { + "auxiliary_loss_clip": 0.06316185, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_clip": 0.06260848, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6822786712761161, + "flos": 63505540949760.0, + "grad_norm": 0.7177694724545725, + "language_loss": 0.52512419, + "learning_rate": 9.685758512100378e-07, + "loss": 0.60084116, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01203918, + "step": 11348, + "time_per_iteration": 3.14101505279541 + }, + { + "auxiliary_loss_clip": 0.06413009, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01255209, + "epoch": 0.682338794528784, + "flos": 21075242613120.0, + "grad_norm": 1.7094709865372797, + "language_loss": 0.79881036, + "learning_rate": 9.682421948143873e-07, + "loss": 0.87558699, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09448242, + "step": 11349, + "time_per_iteration": 2.497866630554199 + }, + { + "auxiliary_loss_clip": 0.06425133, + "auxiliary_loss_mlp": 0.01267838, + "balance_loss_clip": 0.06278804, + "balance_loss_mlp": 0.01255595, + "epoch": 0.682398917781452, + "flos": 36292053237120.0, + "grad_norm": 1.5698213232216975, + "language_loss": 0.7393533, + "learning_rate": 9.67908577543096e-07, + "loss": 0.81628305, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12243652, + "step": 11350, + "time_per_iteration": 2.62261700630188 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01258094, + "epoch": 0.6824590410341199, + "flos": 24865427473920.0, + "grad_norm": 1.5591585279724258, + "language_loss": 0.79965377, + "learning_rate": 9.675749994088161e-07, + "loss": 0.87644625, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09832764, + "step": 11351, + "time_per_iteration": 2.528369665145874 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.0125292, + "epoch": 0.6825191642867879, + "flos": 22458430287360.0, + "grad_norm": 1.5623570195172147, + "language_loss": 0.73523104, + "learning_rate": 9.672414604241954e-07, + "loss": 0.81194711, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09661865, + "step": 11352, + "time_per_iteration": 2.522172451019287 + }, + { + "auxiliary_loss_clip": 0.06413847, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6825792875394558, + "flos": 29432116920960.0, + "grad_norm": 1.626079801889606, + "language_loss": 0.804649, + "learning_rate": 9.669079606018814e-07, + "loss": 0.88144076, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11639404, + "step": 11353, + "time_per_iteration": 2.5686585903167725 + }, + { + "auxiliary_loss_clip": 0.06413363, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06276349, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6826394107921239, + "flos": 18777006426240.0, + "grad_norm": 1.604562568600035, + "language_loss": 0.78506744, + "learning_rate": 9.665744999545218e-07, + "loss": 0.86185712, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10864258, + "step": 11354, + "time_per_iteration": 2.5204999446868896 + }, + { + "auxiliary_loss_clip": 0.06408085, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06272091, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6826995340447918, + "flos": 16623142024320.0, + "grad_norm": 2.019321118646576, + "language_loss": 0.62111843, + "learning_rate": 9.662410784947599e-07, + "loss": 0.69783312, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09814453, + "step": 11355, + "time_per_iteration": 2.4766104221343994 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01263892, + "balance_loss_clip": 0.0627443, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6827596572974598, + "flos": 20854282596480.0, + "grad_norm": 1.7897850919384148, + "language_loss": 0.82221437, + "learning_rate": 9.659076962352398e-07, + "loss": 0.89897794, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09771729, + "step": 11356, + "time_per_iteration": 3.9204885959625244 + }, + { + "auxiliary_loss_clip": 0.06415853, + "auxiliary_loss_mlp": 0.01263188, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252561, + "epoch": 0.6828197805501277, + "flos": 22754804578560.0, + "grad_norm": 1.6532324250211312, + "language_loss": 0.78508228, + "learning_rate": 9.655743531886052e-07, + "loss": 0.86187267, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10626221, + "step": 11357, + "time_per_iteration": 2.5153608322143555 + }, + { + "auxiliary_loss_clip": 0.06314074, + "auxiliary_loss_mlp": 0.01254778, + "balance_loss_clip": 0.06258625, + "balance_loss_mlp": 0.01253596, + "epoch": 0.6828799038027957, + "flos": 71668833598080.0, + "grad_norm": 0.7966113468619515, + "language_loss": 0.59682757, + "learning_rate": 9.65241049367493e-07, + "loss": 0.67251611, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01180267, + "step": 11358, + "time_per_iteration": 3.1846532821655273 + }, + { + "auxiliary_loss_clip": 0.06419402, + "auxiliary_loss_mlp": 0.01269456, + "balance_loss_clip": 0.06276588, + "balance_loss_mlp": 0.01257648, + "epoch": 0.6829400270554637, + "flos": 19835378547840.0, + "grad_norm": 1.7044245093067194, + "language_loss": 0.78866333, + "learning_rate": 9.64907784784544e-07, + "loss": 0.86555189, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11816406, + "step": 11359, + "time_per_iteration": 2.5490803718566895 + }, + { + "auxiliary_loss_clip": 0.064127, + "auxiliary_loss_mlp": 0.01264331, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6830001503081317, + "flos": 21987020816640.0, + "grad_norm": 2.0193369174380664, + "language_loss": 0.82223153, + "learning_rate": 9.645745594523958e-07, + "loss": 0.89900184, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.105896, + "step": 11360, + "time_per_iteration": 3.9807236194610596 + }, + { + "auxiliary_loss_clip": 0.0641343, + "auxiliary_loss_mlp": 0.01265293, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01254677, + "epoch": 0.6830602735607997, + "flos": 24323718827520.0, + "grad_norm": 1.651921957497636, + "language_loss": 0.75011313, + "learning_rate": 9.642413733836844e-07, + "loss": 0.82690036, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1060791, + "step": 11361, + "time_per_iteration": 2.535749673843384 + }, + { + "auxiliary_loss_clip": 0.06309322, + "auxiliary_loss_mlp": 0.01254085, + "balance_loss_clip": 0.06253715, + "balance_loss_mlp": 0.01252928, + "epoch": 0.6831203968134676, + "flos": 57706827793920.0, + "grad_norm": 0.8409522652001101, + "language_loss": 0.595146, + "learning_rate": 9.639082265910437e-07, + "loss": 0.67078006, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01154327, + "step": 11362, + "time_per_iteration": 3.249852180480957 + }, + { + "auxiliary_loss_clip": 0.06412338, + "auxiliary_loss_mlp": 0.0126686, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01255792, + "epoch": 0.6831805200661356, + "flos": 14393024807040.0, + "grad_norm": 2.0585212828502004, + "language_loss": 0.76010299, + "learning_rate": 9.635751190871074e-07, + "loss": 0.83689499, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11077881, + "step": 11363, + "time_per_iteration": 2.5203006267547607 + }, + { + "auxiliary_loss_clip": 0.06410082, + "auxiliary_loss_mlp": 0.01264688, + "balance_loss_clip": 0.06273843, + "balance_loss_mlp": 0.01253828, + "epoch": 0.6832406433188035, + "flos": 22826906616960.0, + "grad_norm": 2.358731005347766, + "language_loss": 0.89481944, + "learning_rate": 9.632420508845063e-07, + "loss": 0.97156709, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10870361, + "step": 11364, + "time_per_iteration": 2.5663001537323 + }, + { + "auxiliary_loss_clip": 0.06405666, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 0.06269991, + "balance_loss_mlp": 0.01255721, + "epoch": 0.6833007665714715, + "flos": 17566673725440.0, + "grad_norm": 1.8217270673941708, + "language_loss": 0.88218802, + "learning_rate": 9.629090219958697e-07, + "loss": 0.95890021, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09838867, + "step": 11365, + "time_per_iteration": 3.9711902141571045 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01261222, + "epoch": 0.6833608898241395, + "flos": 22450883420160.0, + "grad_norm": 1.95679459658848, + "language_loss": 0.81100428, + "learning_rate": 9.625760324338272e-07, + "loss": 0.88795125, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11230469, + "step": 11366, + "time_per_iteration": 2.496051788330078 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.01263817, + "balance_loss_clip": 0.06271282, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6834210130768075, + "flos": 24541450462080.0, + "grad_norm": 1.3668234382616995, + "language_loss": 0.76664793, + "learning_rate": 9.622430822110062e-07, + "loss": 0.84339321, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.1038208, + "step": 11367, + "time_per_iteration": 2.597698450088501 + }, + { + "auxiliary_loss_clip": 0.06411598, + "auxiliary_loss_mlp": 0.01263902, + "balance_loss_clip": 0.06272662, + "balance_loss_mlp": 0.0125312, + "epoch": 0.6834811363294754, + "flos": 20053235963520.0, + "grad_norm": 1.5010742143698117, + "language_loss": 0.69233596, + "learning_rate": 9.619101713400312e-07, + "loss": 0.76909101, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10791016, + "step": 11368, + "time_per_iteration": 2.520679473876953 + }, + { + "auxiliary_loss_clip": 0.06409574, + "auxiliary_loss_mlp": 0.01266367, + "balance_loss_clip": 0.06272889, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6835412595821434, + "flos": 24797727774720.0, + "grad_norm": 1.604090291521746, + "language_loss": 0.73295021, + "learning_rate": 9.615772998335261e-07, + "loss": 0.80970967, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1083374, + "step": 11369, + "time_per_iteration": 2.5773866176605225 + }, + { + "auxiliary_loss_clip": 0.06409427, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06271335, + "balance_loss_mlp": 0.01254067, + "epoch": 0.6836013828348113, + "flos": 19506454145280.0, + "grad_norm": 1.9399454003386187, + "language_loss": 0.79163188, + "learning_rate": 9.612444677041138e-07, + "loss": 0.86836743, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 11370, + "time_per_iteration": 2.4922618865966797 + }, + { + "auxiliary_loss_clip": 0.06306867, + "auxiliary_loss_mlp": 0.01250813, + "balance_loss_clip": 0.06251401, + "balance_loss_mlp": 0.0124961, + "epoch": 0.6836615060874793, + "flos": 58383753402240.0, + "grad_norm": 0.8179842252969125, + "language_loss": 0.59746689, + "learning_rate": 9.609116749644162e-07, + "loss": 0.67304367, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0120163, + "step": 11371, + "time_per_iteration": 3.0478594303131104 + }, + { + "auxiliary_loss_clip": 0.06402698, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01254175, + "epoch": 0.6837216293401474, + "flos": 12171796122240.0, + "grad_norm": 1.5508500684767301, + "language_loss": 0.63639355, + "learning_rate": 9.605789216270511e-07, + "loss": 0.71305984, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09759521, + "step": 11372, + "time_per_iteration": 2.4811301231384277 + }, + { + "auxiliary_loss_clip": 0.06408484, + "auxiliary_loss_mlp": 0.01265592, + "balance_loss_clip": 0.06272547, + "balance_loss_mlp": 0.01255137, + "epoch": 0.6837817525928153, + "flos": 22134159786240.0, + "grad_norm": 1.4333850518313196, + "language_loss": 0.71846133, + "learning_rate": 9.602462077046375e-07, + "loss": 0.79520208, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10461426, + "step": 11373, + "time_per_iteration": 2.5287580490112305 + }, + { + "auxiliary_loss_clip": 0.06305692, + "auxiliary_loss_mlp": 0.01251081, + "balance_loss_clip": 0.06250165, + "balance_loss_mlp": 0.01249923, + "epoch": 0.6838418758454833, + "flos": 65027048186880.0, + "grad_norm": 1.1033743133145881, + "language_loss": 0.56752723, + "learning_rate": 9.599135332097935e-07, + "loss": 0.6430949, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01155853, + "step": 11374, + "time_per_iteration": 3.302116632461548 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257895, + "epoch": 0.6839019990981512, + "flos": 21036864643200.0, + "grad_norm": 1.4837774857580213, + "language_loss": 0.7423023, + "learning_rate": 9.595808981551312e-07, + "loss": 0.81910115, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11001587, + "step": 11375, + "time_per_iteration": 2.5274906158447266 + }, + { + "auxiliary_loss_clip": 0.06406655, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01257684, + "epoch": 0.6839621223508192, + "flos": 24942351121920.0, + "grad_norm": 1.6223536594822023, + "language_loss": 0.7043916, + "learning_rate": 9.592483025532651e-07, + "loss": 0.78113139, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09637451, + "step": 11376, + "time_per_iteration": 2.5494120121002197 + }, + { + "auxiliary_loss_clip": 0.06412984, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01253161, + "epoch": 0.6840222456034871, + "flos": 26365929264000.0, + "grad_norm": 1.7833627654713686, + "language_loss": 0.74259639, + "learning_rate": 9.58915746416808e-07, + "loss": 0.81936419, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10632324, + "step": 11377, + "time_per_iteration": 2.5434489250183105 + }, + { + "auxiliary_loss_clip": 0.06309253, + "auxiliary_loss_mlp": 0.01251187, + "balance_loss_clip": 0.06253564, + "balance_loss_mlp": 0.01249992, + "epoch": 0.6840823688561551, + "flos": 66009167493120.0, + "grad_norm": 0.7064811243320783, + "language_loss": 0.56814432, + "learning_rate": 9.585832297583707e-07, + "loss": 0.64374876, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01193237, + "step": 11378, + "time_per_iteration": 3.2616686820983887 + }, + { + "auxiliary_loss_clip": 0.06409612, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01254764, + "epoch": 0.684142492108823, + "flos": 21403999307520.0, + "grad_norm": 1.6132418851945567, + "language_loss": 0.78663373, + "learning_rate": 9.58250752590561e-07, + "loss": 0.86338598, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10858154, + "step": 11379, + "time_per_iteration": 2.53483247756958 + }, + { + "auxiliary_loss_clip": 0.06401949, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01254976, + "epoch": 0.6842026153614911, + "flos": 18806453936640.0, + "grad_norm": 2.5056443246249, + "language_loss": 0.68875623, + "learning_rate": 9.57918314925988e-07, + "loss": 0.76541233, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08685303, + "step": 11380, + "time_per_iteration": 2.5189809799194336 + }, + { + "auxiliary_loss_clip": 0.06407002, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01254678, + "epoch": 0.684262738614159, + "flos": 19652544938880.0, + "grad_norm": 1.774794382077768, + "language_loss": 0.78619421, + "learning_rate": 9.575859167772568e-07, + "loss": 0.8629148, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1038208, + "step": 11381, + "time_per_iteration": 2.5038013458251953 + }, + { + "auxiliary_loss_clip": 0.0631157, + "auxiliary_loss_mlp": 0.01250817, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01249629, + "epoch": 0.684322861866827, + "flos": 62371041793920.0, + "grad_norm": 0.8443750872588546, + "language_loss": 0.67272472, + "learning_rate": 9.572535581569713e-07, + "loss": 0.74834859, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01186371, + "step": 11382, + "time_per_iteration": 3.022620677947998 + }, + { + "auxiliary_loss_clip": 0.06309118, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06253339, + "balance_loss_mlp": 0.01251537, + "epoch": 0.6843829851194949, + "flos": 65825704978560.0, + "grad_norm": 0.8346748203160914, + "language_loss": 0.58115959, + "learning_rate": 9.569212390777356e-07, + "loss": 0.65677845, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01231384, + "step": 11383, + "time_per_iteration": 3.205733060836792 + }, + { + "auxiliary_loss_clip": 0.06403822, + "auxiliary_loss_mlp": 0.01263656, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01253697, + "epoch": 0.6844431083721629, + "flos": 27862573766400.0, + "grad_norm": 1.743965936300629, + "language_loss": 0.79892695, + "learning_rate": 9.565889595521517e-07, + "loss": 0.87560171, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09960938, + "step": 11384, + "time_per_iteration": 2.576397657394409 + }, + { + "auxiliary_loss_clip": 0.0641057, + "auxiliary_loss_mlp": 0.01264349, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01253459, + "epoch": 0.684503231624831, + "flos": 18260091388800.0, + "grad_norm": 1.8125132078887, + "language_loss": 0.77559322, + "learning_rate": 9.562567195928187e-07, + "loss": 0.85234237, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10894775, + "step": 11385, + "time_per_iteration": 2.5222182273864746 + }, + { + "auxiliary_loss_clip": 0.06418984, + "auxiliary_loss_mlp": 0.01266461, + "balance_loss_clip": 0.0627387, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6845633548774989, + "flos": 17645484090240.0, + "grad_norm": 2.2044599558463105, + "language_loss": 0.84624577, + "learning_rate": 9.55924519212335e-07, + "loss": 0.92310023, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12072754, + "step": 11386, + "time_per_iteration": 3.9474587440490723 + }, + { + "auxiliary_loss_clip": 0.06409421, + "auxiliary_loss_mlp": 0.01262563, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6846234781301669, + "flos": 20812843952640.0, + "grad_norm": 1.925558647056537, + "language_loss": 0.83398205, + "learning_rate": 9.555923584232984e-07, + "loss": 0.91070187, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09680176, + "step": 11387, + "time_per_iteration": 2.5117714405059814 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01254033, + "epoch": 0.6846836013828348, + "flos": 36110016241920.0, + "grad_norm": 1.588804983998274, + "language_loss": 0.72422922, + "learning_rate": 9.552602372383047e-07, + "loss": 0.80092275, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09692383, + "step": 11388, + "time_per_iteration": 2.669675588607788 + }, + { + "auxiliary_loss_clip": 0.0640699, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06272318, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6847437246355028, + "flos": 43152408823680.0, + "grad_norm": 2.116517308354933, + "language_loss": 0.63188899, + "learning_rate": 9.549281556699469e-07, + "loss": 0.70863551, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09460449, + "step": 11389, + "time_per_iteration": 2.775179862976074 + }, + { + "auxiliary_loss_clip": 0.06304318, + "auxiliary_loss_mlp": 0.01252682, + "balance_loss_clip": 0.06248381, + "balance_loss_mlp": 0.01251546, + "epoch": 0.6848038478881707, + "flos": 71682768103680.0, + "grad_norm": 0.7038129025924749, + "language_loss": 0.55774271, + "learning_rate": 9.54596113730818e-07, + "loss": 0.63331264, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01138306, + "step": 11390, + "time_per_iteration": 3.2121734619140625 + }, + { + "auxiliary_loss_clip": 0.06409647, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01255997, + "epoch": 0.6848639711408387, + "flos": 19943929912320.0, + "grad_norm": 1.8977282247890388, + "language_loss": 0.87613106, + "learning_rate": 9.542641114335109e-07, + "loss": 0.95289165, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10424805, + "step": 11391, + "time_per_iteration": 2.500140428543091 + }, + { + "auxiliary_loss_clip": 0.06412797, + "auxiliary_loss_mlp": 0.01263893, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253343, + "epoch": 0.6849240943935067, + "flos": 26874333112320.0, + "grad_norm": 1.48935328965904, + "language_loss": 0.79339015, + "learning_rate": 9.539321487906117e-07, + "loss": 0.870157, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10552979, + "step": 11392, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06403191, + "auxiliary_loss_mlp": 0.01264788, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6849842176461747, + "flos": 13740458808960.0, + "grad_norm": 2.0081405471627884, + "language_loss": 0.71175981, + "learning_rate": 9.536002258147104e-07, + "loss": 0.78843963, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10040283, + "step": 11393, + "time_per_iteration": 2.5271036624908447 + }, + { + "auxiliary_loss_clip": 0.06415832, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01255, + "epoch": 0.6850443408988426, + "flos": 24980058259200.0, + "grad_norm": 1.5317798757580128, + "language_loss": 0.64661515, + "learning_rate": 9.532683425183936e-07, + "loss": 0.72342944, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10595703, + "step": 11394, + "time_per_iteration": 2.53812313079834 + }, + { + "auxiliary_loss_clip": 0.06411145, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6851044641515106, + "flos": 27751380998400.0, + "grad_norm": 1.5645262580549901, + "language_loss": 0.80918968, + "learning_rate": 9.529364989142468e-07, + "loss": 0.88594604, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10101318, + "step": 11395, + "time_per_iteration": 2.550346851348877 + }, + { + "auxiliary_loss_clip": 0.06410371, + "auxiliary_loss_mlp": 0.01268735, + "balance_loss_clip": 0.06274814, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6851645874041785, + "flos": 24357652531200.0, + "grad_norm": 1.7469268170163024, + "language_loss": 0.72832096, + "learning_rate": 9.526046950148527e-07, + "loss": 0.80511206, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10595703, + "step": 11396, + "time_per_iteration": 3.9635422229766846 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01265588, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01255056, + "epoch": 0.6852247106568465, + "flos": 15081914350080.0, + "grad_norm": 2.3772034852800643, + "language_loss": 0.79818743, + "learning_rate": 9.522729308327931e-07, + "loss": 0.87494791, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10528564, + "step": 11397, + "time_per_iteration": 2.481863260269165 + }, + { + "auxiliary_loss_clip": 0.06411494, + "auxiliary_loss_mlp": 0.01267109, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01256828, + "epoch": 0.6852848339095146, + "flos": 18775874396160.0, + "grad_norm": 1.839103323810105, + "language_loss": 0.71941662, + "learning_rate": 9.519412063806493e-07, + "loss": 0.7962026, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10284424, + "step": 11398, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06403108, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06270117, + "balance_loss_mlp": 0.0125632, + "epoch": 0.6853449571621825, + "flos": 27861651371520.0, + "grad_norm": 1.5188649145265738, + "language_loss": 0.71170795, + "learning_rate": 9.516095216709996e-07, + "loss": 0.78839701, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0947876, + "step": 11399, + "time_per_iteration": 3.972925901412964 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06273123, + "balance_loss_mlp": 0.01259119, + "epoch": 0.6854050804148505, + "flos": 18156403560960.0, + "grad_norm": 1.6092651373600877, + "language_loss": 0.70567757, + "learning_rate": 9.512778767164217e-07, + "loss": 0.78248316, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10217285, + "step": 11400, + "time_per_iteration": 2.474824905395508 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01255163, + "epoch": 0.6854652036675184, + "flos": 16331798977920.0, + "grad_norm": 1.9177955333528751, + "language_loss": 0.77889669, + "learning_rate": 9.509462715294927e-07, + "loss": 0.85583317, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12463379, + "step": 11401, + "time_per_iteration": 2.5186407566070557 + }, + { + "auxiliary_loss_clip": 0.06405222, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.01256537, + "epoch": 0.6855253269201864, + "flos": 14946347243520.0, + "grad_norm": 2.060399475016654, + "language_loss": 0.75462782, + "learning_rate": 9.50614706122786e-07, + "loss": 0.83134115, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0958252, + "step": 11402, + "time_per_iteration": 2.461958885192871 + }, + { + "auxiliary_loss_clip": 0.06414859, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06273296, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6855854501728543, + "flos": 23044135127040.0, + "grad_norm": 1.4779944862214063, + "language_loss": 0.73165995, + "learning_rate": 9.502831805088742e-07, + "loss": 0.80847782, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11444092, + "step": 11403, + "time_per_iteration": 2.5588088035583496 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264829, + "balance_loss_clip": 0.06272316, + "balance_loss_mlp": 0.0125522, + "epoch": 0.6856455734255223, + "flos": 13257393621120.0, + "grad_norm": 3.459862281853561, + "language_loss": 0.81727648, + "learning_rate": 9.499516947003294e-07, + "loss": 0.89400232, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09613037, + "step": 11404, + "time_per_iteration": 3.899538993835449 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06274688, + "balance_loss_mlp": 0.01259381, + "epoch": 0.6857056966781903, + "flos": 23340551345280.0, + "grad_norm": 1.3350169784860642, + "language_loss": 0.7794162, + "learning_rate": 9.496202487097222e-07, + "loss": 0.8561843, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 11405, + "time_per_iteration": 2.618781089782715 + }, + { + "auxiliary_loss_clip": 0.06313835, + "auxiliary_loss_mlp": 0.01251022, + "balance_loss_clip": 0.06257869, + "balance_loss_mlp": 0.01250013, + "epoch": 0.6857658199308583, + "flos": 61870646010240.0, + "grad_norm": 0.7926132752302004, + "language_loss": 0.60793728, + "learning_rate": 9.492888425496199e-07, + "loss": 0.68358588, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01009369, + "step": 11406, + "time_per_iteration": 3.192826986312866 + }, + { + "auxiliary_loss_clip": 0.06409362, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253826, + "epoch": 0.6858259431835262, + "flos": 16660178328960.0, + "grad_norm": 1.6678552032285212, + "language_loss": 0.77383244, + "learning_rate": 9.489574762325907e-07, + "loss": 0.85056722, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10296631, + "step": 11407, + "time_per_iteration": 2.5133752822875977 + }, + { + "auxiliary_loss_clip": 0.06408191, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.0626992, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6858860664361942, + "flos": 21879643409280.0, + "grad_norm": 2.893760051958565, + "language_loss": 0.71341193, + "learning_rate": 9.486261497711991e-07, + "loss": 0.79014993, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11053467, + "step": 11408, + "time_per_iteration": 2.5356616973876953 + }, + { + "auxiliary_loss_clip": 0.06413727, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.0125514, + "epoch": 0.6859461896888621, + "flos": 15272965658880.0, + "grad_norm": 1.731957908279727, + "language_loss": 0.70413965, + "learning_rate": 9.482948631780087e-07, + "loss": 0.78093535, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 11409, + "time_per_iteration": 2.52020525932312 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01263971, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01254733, + "epoch": 0.6860063129415301, + "flos": 18625507044480.0, + "grad_norm": 1.590904402895803, + "language_loss": 0.78129441, + "learning_rate": 9.479636164655825e-07, + "loss": 0.85794687, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09240723, + "step": 11410, + "time_per_iteration": 2.546893358230591 + }, + { + "auxiliary_loss_clip": 0.06412078, + "auxiliary_loss_mlp": 0.01266884, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6860664361941982, + "flos": 23958177390720.0, + "grad_norm": 1.8721880718662787, + "language_loss": 0.7200377, + "learning_rate": 9.476324096464821e-07, + "loss": 0.79682732, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1081543, + "step": 11411, + "time_per_iteration": 2.532982349395752 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01268743, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01258551, + "epoch": 0.6861265594468661, + "flos": 20413243031040.0, + "grad_norm": 1.9740044070304406, + "language_loss": 0.70534211, + "learning_rate": 9.473012427332654e-07, + "loss": 0.78214926, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10192871, + "step": 11412, + "time_per_iteration": 2.5798745155334473 + }, + { + "auxiliary_loss_clip": 0.06410308, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01256324, + "epoch": 0.6861866826995341, + "flos": 11431908570240.0, + "grad_norm": 3.0856036818138692, + "language_loss": 0.71973193, + "learning_rate": 9.469701157384919e-07, + "loss": 0.79650223, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10394287, + "step": 11413, + "time_per_iteration": 2.4693074226379395 + }, + { + "auxiliary_loss_clip": 0.06411856, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06274316, + "balance_loss_mlp": 0.01257518, + "epoch": 0.686246805952202, + "flos": 16003084210560.0, + "grad_norm": 1.8173139685722925, + "language_loss": 0.73670095, + "learning_rate": 9.466390286747164e-07, + "loss": 0.81349689, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10217285, + "step": 11414, + "time_per_iteration": 2.510739803314209 + }, + { + "auxiliary_loss_clip": 0.06415157, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06276812, + "balance_loss_mlp": 0.01256425, + "epoch": 0.68630692920487, + "flos": 19832527509120.0, + "grad_norm": 2.474590574257684, + "language_loss": 0.87128049, + "learning_rate": 9.46307981554495e-07, + "loss": 0.94810498, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10852051, + "step": 11415, + "time_per_iteration": 2.4847946166992188 + }, + { + "auxiliary_loss_clip": 0.06415314, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01254705, + "epoch": 0.6863670524575379, + "flos": 26293366028160.0, + "grad_norm": 9.907368268016192, + "language_loss": 0.67353249, + "learning_rate": 9.459769743903801e-07, + "loss": 0.75034899, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11633301, + "step": 11416, + "time_per_iteration": 2.5904948711395264 + }, + { + "auxiliary_loss_clip": 0.06403923, + "auxiliary_loss_mlp": 0.0126434, + "balance_loss_clip": 0.06269173, + "balance_loss_mlp": 0.01254284, + "epoch": 0.686427175710206, + "flos": 19179374532480.0, + "grad_norm": 1.4750819254499818, + "language_loss": 0.76489693, + "learning_rate": 9.456460071949237e-07, + "loss": 0.84157956, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10058594, + "step": 11417, + "time_per_iteration": 2.487197160720825 + }, + { + "auxiliary_loss_clip": 0.06410322, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258863, + "epoch": 0.6864872989628739, + "flos": 18922636022400.0, + "grad_norm": 1.8452434101813986, + "language_loss": 0.77370739, + "learning_rate": 9.45315079980678e-07, + "loss": 0.85049683, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09759521, + "step": 11418, + "time_per_iteration": 2.510810375213623 + }, + { + "auxiliary_loss_clip": 0.06410821, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272699, + "balance_loss_mlp": 0.01255382, + "epoch": 0.6865474222155419, + "flos": 25963016106240.0, + "grad_norm": 1.6317928435070383, + "language_loss": 0.76463497, + "learning_rate": 9.449841927601887e-07, + "loss": 0.84139907, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10217285, + "step": 11419, + "time_per_iteration": 2.5700454711914062 + }, + { + "auxiliary_loss_clip": 0.06407338, + "auxiliary_loss_mlp": 0.01267938, + "balance_loss_clip": 0.06270772, + "balance_loss_mlp": 0.01258359, + "epoch": 0.6866075454682098, + "flos": 18483902444160.0, + "grad_norm": 1.6443171286333353, + "language_loss": 0.71588171, + "learning_rate": 9.446533455460044e-07, + "loss": 0.79263443, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.0958252, + "step": 11420, + "time_per_iteration": 2.5144495964050293 + }, + { + "auxiliary_loss_clip": 0.06407318, + "auxiliary_loss_mlp": 0.0126343, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01253506, + "epoch": 0.6866676687208778, + "flos": 34248459208320.0, + "grad_norm": 1.3410332761873145, + "language_loss": 0.75059515, + "learning_rate": 9.443225383506712e-07, + "loss": 0.82730258, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09924316, + "step": 11421, + "time_per_iteration": 2.61454176902771 + }, + { + "auxiliary_loss_clip": 0.0640727, + "auxiliary_loss_mlp": 0.01265626, + "balance_loss_clip": 0.06272772, + "balance_loss_mlp": 0.01255982, + "epoch": 0.6867277919735457, + "flos": 21727515121920.0, + "grad_norm": 1.6725729939473468, + "language_loss": 0.77230668, + "learning_rate": 9.439917711867338e-07, + "loss": 0.84903562, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09637451, + "step": 11422, + "time_per_iteration": 2.5174617767333984 + }, + { + "auxiliary_loss_clip": 0.0641562, + "auxiliary_loss_mlp": 0.01272736, + "balance_loss_clip": 0.06279219, + "balance_loss_mlp": 0.01261536, + "epoch": 0.6867879152262137, + "flos": 24104939016960.0, + "grad_norm": 1.647039828063758, + "language_loss": 0.77276117, + "learning_rate": 9.436610440667334e-07, + "loss": 0.84964472, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11206055, + "step": 11423, + "time_per_iteration": 2.5189144611358643 + }, + { + "auxiliary_loss_clip": 0.06414216, + "auxiliary_loss_mlp": 0.01267082, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.01256461, + "epoch": 0.6868480384788818, + "flos": 21622150212480.0, + "grad_norm": 1.4426214659548335, + "language_loss": 0.73124474, + "learning_rate": 9.433303570032129e-07, + "loss": 0.80805779, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10614014, + "step": 11424, + "time_per_iteration": 2.5789601802825928 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6869081617315497, + "flos": 26293282174080.0, + "grad_norm": 1.8417753723265369, + "language_loss": 0.65276968, + "learning_rate": 9.429997100087112e-07, + "loss": 0.72953665, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10437012, + "step": 11425, + "time_per_iteration": 2.547678232192993 + }, + { + "auxiliary_loss_clip": 0.06408506, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.06275355, + "balance_loss_mlp": 0.01257381, + "epoch": 0.6869682849842177, + "flos": 21111356522880.0, + "grad_norm": 1.3347714221988014, + "language_loss": 0.71902603, + "learning_rate": 9.426691030957657e-07, + "loss": 0.79578817, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10327148, + "step": 11426, + "time_per_iteration": 4.051712512969971 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.01266408, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.0125606, + "epoch": 0.6870284082368856, + "flos": 17098408782720.0, + "grad_norm": 2.192498277588843, + "language_loss": 0.85740101, + "learning_rate": 9.423385362769136e-07, + "loss": 0.93418634, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10351562, + "step": 11427, + "time_per_iteration": 2.533590316772461 + }, + { + "auxiliary_loss_clip": 0.06408241, + "auxiliary_loss_mlp": 0.01263719, + "balance_loss_clip": 0.06273334, + "balance_loss_mlp": 0.01253312, + "epoch": 0.6870885314895536, + "flos": 27315456531840.0, + "grad_norm": 1.4340637684485376, + "language_loss": 0.76548541, + "learning_rate": 9.420080095646909e-07, + "loss": 0.84220493, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10412598, + "step": 11428, + "time_per_iteration": 2.579432249069214 + }, + { + "auxiliary_loss_clip": 0.06414707, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01259002, + "epoch": 0.6871486547422215, + "flos": 20820977798400.0, + "grad_norm": 2.1898072552839087, + "language_loss": 0.73509127, + "learning_rate": 9.4167752297163e-07, + "loss": 0.81194276, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11437988, + "step": 11429, + "time_per_iteration": 2.508434772491455 + }, + { + "auxiliary_loss_clip": 0.0641626, + "auxiliary_loss_mlp": 0.01266327, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256474, + "epoch": 0.6872087779948896, + "flos": 30161983910400.0, + "grad_norm": 1.931452469341354, + "language_loss": 0.83630431, + "learning_rate": 9.413470765102643e-07, + "loss": 0.91313016, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09851074, + "step": 11430, + "time_per_iteration": 2.630755662918091 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.0126587, + "balance_loss_clip": 0.06274621, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6872689012475575, + "flos": 20710917060480.0, + "grad_norm": 2.0596974928309253, + "language_loss": 0.70543802, + "learning_rate": 9.410166701931225e-07, + "loss": 0.78221703, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10827637, + "step": 11431, + "time_per_iteration": 2.491147756576538 + }, + { + "auxiliary_loss_clip": 0.06409967, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254293, + "epoch": 0.6873290245002255, + "flos": 25528014034560.0, + "grad_norm": 1.7781814059522836, + "language_loss": 0.80397063, + "learning_rate": 9.406863040327355e-07, + "loss": 0.88071799, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1048584, + "step": 11432, + "time_per_iteration": 2.5659162998199463 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01259362, + "epoch": 0.6873891477528934, + "flos": 25198418799360.0, + "grad_norm": 2.2741442538336125, + "language_loss": 0.68286675, + "learning_rate": 9.403559780416295e-07, + "loss": 0.75959998, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09490967, + "step": 11433, + "time_per_iteration": 2.6121439933776855 + }, + { + "auxiliary_loss_clip": 0.064156, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06278776, + "balance_loss_mlp": 0.01258665, + "epoch": 0.6874492710055614, + "flos": 35161034025600.0, + "grad_norm": 2.030098002823672, + "language_loss": 0.72783715, + "learning_rate": 9.400256922323309e-07, + "loss": 0.8046838, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10400391, + "step": 11434, + "time_per_iteration": 2.6294844150543213 + }, + { + "auxiliary_loss_clip": 0.06410138, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06275442, + "balance_loss_mlp": 0.0125919, + "epoch": 0.6875093942582293, + "flos": 17828066136960.0, + "grad_norm": 1.5552043430175444, + "language_loss": 0.80520236, + "learning_rate": 9.396954466173657e-07, + "loss": 0.88199627, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 11435, + "time_per_iteration": 2.501239061355591 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01269183, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01258227, + "epoch": 0.6875695175108973, + "flos": 20710875133440.0, + "grad_norm": 9.52111477806384, + "language_loss": 0.8158865, + "learning_rate": 9.393652412092538e-07, + "loss": 0.89269829, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10943604, + "step": 11436, + "time_per_iteration": 3.8841755390167236 + }, + { + "auxiliary_loss_clip": 0.064064, + "auxiliary_loss_mlp": 0.01268806, + "balance_loss_clip": 0.0627645, + "balance_loss_mlp": 0.01259806, + "epoch": 0.6876296407635654, + "flos": 25381000846080.0, + "grad_norm": 1.6419248940044093, + "language_loss": 0.81966716, + "learning_rate": 9.390350760205183e-07, + "loss": 0.89641917, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08996582, + "step": 11437, + "time_per_iteration": 2.5980188846588135 + }, + { + "auxiliary_loss_clip": 0.06421375, + "auxiliary_loss_mlp": 0.01270532, + "balance_loss_clip": 0.06274987, + "balance_loss_mlp": 0.01257729, + "epoch": 0.6876897640162333, + "flos": 23229107015040.0, + "grad_norm": 2.1640181952928486, + "language_loss": 0.77725911, + "learning_rate": 9.387049510636793e-07, + "loss": 0.85417819, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12792969, + "step": 11438, + "time_per_iteration": 2.5095889568328857 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06273987, + "balance_loss_mlp": 0.01260838, + "epoch": 0.6877498872689013, + "flos": 27131448965760.0, + "grad_norm": 1.6644547524403899, + "language_loss": 0.72329235, + "learning_rate": 9.383748663512554e-07, + "loss": 0.80005264, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09448242, + "step": 11439, + "time_per_iteration": 3.9927306175231934 + }, + { + "auxiliary_loss_clip": 0.06406644, + "auxiliary_loss_mlp": 0.01268484, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01258554, + "epoch": 0.6878100105215692, + "flos": 11586217063680.0, + "grad_norm": 1.9676653989850965, + "language_loss": 0.75157619, + "learning_rate": 9.380448218957623e-07, + "loss": 0.82832754, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09936523, + "step": 11440, + "time_per_iteration": 2.4851269721984863 + }, + { + "auxiliary_loss_clip": 0.06404521, + "auxiliary_loss_mlp": 0.01267859, + "balance_loss_clip": 0.06272353, + "balance_loss_mlp": 0.012584, + "epoch": 0.6878701337742372, + "flos": 20309429422080.0, + "grad_norm": 1.4828372396976293, + "language_loss": 0.71795368, + "learning_rate": 9.377148177097167e-07, + "loss": 0.79467738, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09448242, + "step": 11441, + "time_per_iteration": 2.514653444290161 + }, + { + "auxiliary_loss_clip": 0.06418902, + "auxiliary_loss_mlp": 0.01272176, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01260893, + "epoch": 0.6879302570269051, + "flos": 13844398199040.0, + "grad_norm": 1.6175108384355714, + "language_loss": 0.66777945, + "learning_rate": 9.373848538056317e-07, + "loss": 0.74469018, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11291504, + "step": 11442, + "time_per_iteration": 2.5146420001983643 + }, + { + "auxiliary_loss_clip": 0.06411453, + "auxiliary_loss_mlp": 0.01266841, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01256547, + "epoch": 0.6879903802795732, + "flos": 21331058728320.0, + "grad_norm": 2.38232064736284, + "language_loss": 0.69958794, + "learning_rate": 9.370549301960189e-07, + "loss": 0.77637082, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10290527, + "step": 11443, + "time_per_iteration": 2.493436574935913 + }, + { + "auxiliary_loss_clip": 0.06419516, + "auxiliary_loss_mlp": 0.01266925, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01256524, + "epoch": 0.6880505035322411, + "flos": 25158489528960.0, + "grad_norm": 1.390720225309701, + "language_loss": 0.763533, + "learning_rate": 9.367250468933893e-07, + "loss": 0.84039736, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10394287, + "step": 11444, + "time_per_iteration": 3.9500269889831543 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01267311, + "balance_loss_clip": 0.06272952, + "balance_loss_mlp": 0.01257059, + "epoch": 0.6881106267849091, + "flos": 23221182804480.0, + "grad_norm": 1.8756092745031845, + "language_loss": 0.76660252, + "learning_rate": 9.363952039102536e-07, + "loss": 0.84334326, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10253906, + "step": 11445, + "time_per_iteration": 2.488555908203125 + }, + { + "auxiliary_loss_clip": 0.06317502, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06261797, + "balance_loss_mlp": 0.01251243, + "epoch": 0.688170750037577, + "flos": 48497741136000.0, + "grad_norm": 0.8087198242159813, + "language_loss": 0.58278191, + "learning_rate": 9.360654012591183e-07, + "loss": 0.65848243, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01306915, + "step": 11446, + "time_per_iteration": 3.1777503490448 + }, + { + "auxiliary_loss_clip": 0.06413881, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01254562, + "epoch": 0.688230873290245, + "flos": 22790205728640.0, + "grad_norm": 1.616943103064761, + "language_loss": 0.76008183, + "learning_rate": 9.357356389524886e-07, + "loss": 0.83687443, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10821533, + "step": 11447, + "time_per_iteration": 2.5756897926330566 + }, + { + "auxiliary_loss_clip": 0.06411539, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01256884, + "epoch": 0.6882909965429129, + "flos": 22462245648000.0, + "grad_norm": 1.9129765382773336, + "language_loss": 0.74044937, + "learning_rate": 9.354059170028705e-07, + "loss": 0.81723368, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10015869, + "step": 11448, + "time_per_iteration": 2.5083351135253906 + }, + { + "auxiliary_loss_clip": 0.06417549, + "auxiliary_loss_mlp": 0.01266481, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01255376, + "epoch": 0.688351119795581, + "flos": 26221431697920.0, + "grad_norm": 1.5605900643108004, + "language_loss": 0.74581099, + "learning_rate": 9.350762354227673e-07, + "loss": 0.82265133, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11102295, + "step": 11449, + "time_per_iteration": 2.585969924926758 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266876, + "balance_loss_clip": 0.06273638, + "balance_loss_mlp": 0.01256809, + "epoch": 0.6884112430482489, + "flos": 22571887115520.0, + "grad_norm": 1.6262008407242425, + "language_loss": 0.70027089, + "learning_rate": 9.34746594224679e-07, + "loss": 0.77702844, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1005249, + "step": 11450, + "time_per_iteration": 2.5182437896728516 + }, + { + "auxiliary_loss_clip": 0.06418543, + "auxiliary_loss_mlp": 0.0126869, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.01257187, + "epoch": 0.6884713663009169, + "flos": 17345671781760.0, + "grad_norm": 1.9477242871289788, + "language_loss": 0.76100504, + "learning_rate": 9.344169934211068e-07, + "loss": 0.83787739, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.1151123, + "step": 11451, + "time_per_iteration": 2.5395891666412354 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06276768, + "balance_loss_mlp": 0.01253926, + "epoch": 0.6885314895535849, + "flos": 26478379843200.0, + "grad_norm": 1.2780895399548546, + "language_loss": 0.69393182, + "learning_rate": 9.340874330245505e-07, + "loss": 0.77073896, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09899902, + "step": 11452, + "time_per_iteration": 2.584246873855591 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6885916128062528, + "flos": 20527748035200.0, + "grad_norm": 1.553726438653973, + "language_loss": 0.71749568, + "learning_rate": 9.337579130475042e-07, + "loss": 0.79426515, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11932373, + "step": 11453, + "time_per_iteration": 2.5244805812835693 + }, + { + "auxiliary_loss_clip": 0.06314202, + "auxiliary_loss_mlp": 0.01249184, + "balance_loss_clip": 0.06258714, + "balance_loss_mlp": 0.01248031, + "epoch": 0.6886517360589208, + "flos": 70734792136320.0, + "grad_norm": 0.77256871445285, + "language_loss": 0.50623441, + "learning_rate": 9.334284335024644e-07, + "loss": 0.58186829, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01150513, + "step": 11454, + "time_per_iteration": 2.982760190963745 + }, + { + "auxiliary_loss_clip": 0.06402037, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.01254998, + "epoch": 0.6887118593115887, + "flos": 17899119999360.0, + "grad_norm": 1.70106225646023, + "language_loss": 0.75493348, + "learning_rate": 9.330989944019263e-07, + "loss": 0.8316009, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09716797, + "step": 11455, + "time_per_iteration": 2.5417535305023193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06273204, + "balance_loss_mlp": 0.01255286, + "epoch": 0.6887719825642568, + "flos": 17458080433920.0, + "grad_norm": 2.3349527650336945, + "language_loss": 0.72984523, + "learning_rate": 9.327695957583803e-07, + "loss": 0.80663818, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11578369, + "step": 11456, + "time_per_iteration": 2.452291250228882 + }, + { + "auxiliary_loss_clip": 0.0640955, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255621, + "epoch": 0.6888321058169247, + "flos": 23075930551680.0, + "grad_norm": 1.6190505365782226, + "language_loss": 0.81124002, + "learning_rate": 9.32440237584319e-07, + "loss": 0.88799506, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10339355, + "step": 11457, + "time_per_iteration": 2.540853977203369 + }, + { + "auxiliary_loss_clip": 0.06415743, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01257152, + "epoch": 0.6888922290695927, + "flos": 23375742860160.0, + "grad_norm": 1.590427454304544, + "language_loss": 0.7679534, + "learning_rate": 9.321109198922301e-07, + "loss": 0.84478879, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10638428, + "step": 11458, + "time_per_iteration": 2.510422706604004 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01264265, + "balance_loss_clip": 0.0627234, + "balance_loss_mlp": 0.012539, + "epoch": 0.6889523523222606, + "flos": 17636092433280.0, + "grad_norm": 2.414805126891923, + "language_loss": 0.68316978, + "learning_rate": 9.31781642694603e-07, + "loss": 0.75990915, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1036377, + "step": 11459, + "time_per_iteration": 2.5042388439178467 + }, + { + "auxiliary_loss_clip": 0.06414565, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06275657, + "balance_loss_mlp": 0.01257976, + "epoch": 0.6890124755749286, + "flos": 25235119687680.0, + "grad_norm": 1.5145065442588617, + "language_loss": 0.68853188, + "learning_rate": 9.314524060039221e-07, + "loss": 0.76535368, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09637451, + "step": 11460, + "time_per_iteration": 2.548172950744629 + }, + { + "auxiliary_loss_clip": 0.06421833, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06274051, + "balance_loss_mlp": 0.01257727, + "epoch": 0.6890725988275965, + "flos": 20236488842880.0, + "grad_norm": 1.6636597256364867, + "language_loss": 0.77513885, + "learning_rate": 9.311232098326731e-07, + "loss": 0.85205209, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11761475, + "step": 11461, + "time_per_iteration": 2.524261474609375 + }, + { + "auxiliary_loss_clip": 0.06409161, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01256777, + "epoch": 0.6891327220802645, + "flos": 14540079922560.0, + "grad_norm": 2.0638516380212932, + "language_loss": 0.69867802, + "learning_rate": 9.307940541933401e-07, + "loss": 0.77544034, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10302734, + "step": 11462, + "time_per_iteration": 2.470341444015503 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01253864, + "epoch": 0.6891928453329325, + "flos": 21144996737280.0, + "grad_norm": 1.4840489217528152, + "language_loss": 0.87375474, + "learning_rate": 9.304649390984034e-07, + "loss": 0.95049822, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10064697, + "step": 11463, + "time_per_iteration": 2.550734043121338 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 0.06273332, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6892529685856005, + "flos": 17864347754880.0, + "grad_norm": 1.4959389236419984, + "language_loss": 0.68525398, + "learning_rate": 9.301358645603428e-07, + "loss": 0.76196021, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09771729, + "step": 11464, + "time_per_iteration": 3.9007256031036377 + }, + { + "auxiliary_loss_clip": 0.06409206, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01255571, + "epoch": 0.6893130918382685, + "flos": 29942575194240.0, + "grad_norm": 1.7446769813388354, + "language_loss": 0.65578705, + "learning_rate": 9.298068305916373e-07, + "loss": 0.73254144, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10662842, + "step": 11465, + "time_per_iteration": 2.554800271987915 + }, + { + "auxiliary_loss_clip": 0.06418021, + "auxiliary_loss_mlp": 0.01264957, + "balance_loss_clip": 0.06274985, + "balance_loss_mlp": 0.01253388, + "epoch": 0.6893732150909364, + "flos": 24395275814400.0, + "grad_norm": 1.468256683851191, + "language_loss": 0.72699749, + "learning_rate": 9.294778372047649e-07, + "loss": 0.80382729, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11578369, + "step": 11466, + "time_per_iteration": 2.5593020915985107 + }, + { + "auxiliary_loss_clip": 0.06412645, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6894333383436044, + "flos": 16988557680000.0, + "grad_norm": 1.6869523120590046, + "language_loss": 0.72136575, + "learning_rate": 9.291488844121995e-07, + "loss": 0.79815149, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10302734, + "step": 11467, + "time_per_iteration": 2.4603004455566406 + }, + { + "auxiliary_loss_clip": 0.06414096, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.0627349, + "balance_loss_mlp": 0.0125462, + "epoch": 0.6894934615962723, + "flos": 18990880773120.0, + "grad_norm": 1.8974823893079618, + "language_loss": 0.80639178, + "learning_rate": 9.288199722264156e-07, + "loss": 0.88319826, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11950684, + "step": 11468, + "time_per_iteration": 2.500204086303711 + }, + { + "auxiliary_loss_clip": 0.06415653, + "auxiliary_loss_mlp": 0.01266091, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01255941, + "epoch": 0.6895535848489404, + "flos": 34540137671040.0, + "grad_norm": 1.4230744907421156, + "language_loss": 0.66238683, + "learning_rate": 9.284911006598875e-07, + "loss": 0.73920429, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 11469, + "time_per_iteration": 2.6155412197113037 + }, + { + "auxiliary_loss_clip": 0.06315388, + "auxiliary_loss_mlp": 0.01251862, + "balance_loss_clip": 0.06259958, + "balance_loss_mlp": 0.01250618, + "epoch": 0.6896137081016083, + "flos": 50093237128320.0, + "grad_norm": 0.7794555860117556, + "language_loss": 0.54945397, + "learning_rate": 9.281622697250824e-07, + "loss": 0.62512648, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01243591, + "step": 11470, + "time_per_iteration": 3.0223581790924072 + }, + { + "auxiliary_loss_clip": 0.0640993, + "auxiliary_loss_mlp": 0.01264419, + "balance_loss_clip": 0.0627588, + "balance_loss_mlp": 0.01255133, + "epoch": 0.6896738313542763, + "flos": 19944391109760.0, + "grad_norm": 1.6677407290115414, + "language_loss": 0.78484243, + "learning_rate": 9.278334794344715e-07, + "loss": 0.86158597, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09283447, + "step": 11471, + "time_per_iteration": 2.486112594604492 + }, + { + "auxiliary_loss_clip": 0.0641201, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01255369, + "epoch": 0.6897339546069442, + "flos": 21731875534080.0, + "grad_norm": 1.810273606719927, + "language_loss": 0.78542721, + "learning_rate": 9.275047298005232e-07, + "loss": 0.86220813, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10723877, + "step": 11472, + "time_per_iteration": 2.5265328884124756 + }, + { + "auxiliary_loss_clip": 0.06408779, + "auxiliary_loss_mlp": 0.01266157, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256168, + "epoch": 0.6897940778596122, + "flos": 19832275946880.0, + "grad_norm": 1.5025655331144128, + "language_loss": 0.76723063, + "learning_rate": 9.271760208357024e-07, + "loss": 0.84398007, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09985352, + "step": 11473, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06274555, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6898542011122801, + "flos": 17315595365760.0, + "grad_norm": 1.762455288405268, + "language_loss": 0.75548446, + "learning_rate": 9.268473525524751e-07, + "loss": 0.83228695, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10839844, + "step": 11474, + "time_per_iteration": 2.527608871459961 + }, + { + "auxiliary_loss_clip": 0.06414007, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.06276175, + "balance_loss_mlp": 0.01259097, + "epoch": 0.6899143243649482, + "flos": 24760984959360.0, + "grad_norm": 1.5301145681679174, + "language_loss": 0.74686491, + "learning_rate": 9.26518724963303e-07, + "loss": 0.82370627, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11047363, + "step": 11475, + "time_per_iteration": 2.61885404586792 + }, + { + "auxiliary_loss_clip": 0.06408798, + "auxiliary_loss_mlp": 0.01264551, + "balance_loss_clip": 0.0627286, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6899744476176161, + "flos": 17239636039680.0, + "grad_norm": 1.9758347439707513, + "language_loss": 0.89060938, + "learning_rate": 9.261901380806491e-07, + "loss": 0.96734291, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1038208, + "step": 11476, + "time_per_iteration": 3.9992854595184326 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.01267337, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.01256864, + "epoch": 0.6900345708702841, + "flos": 25417701734400.0, + "grad_norm": 1.3283080082562368, + "language_loss": 0.70312291, + "learning_rate": 9.258615919169724e-07, + "loss": 0.77989161, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11477, + "time_per_iteration": 2.5792300701141357 + }, + { + "auxiliary_loss_clip": 0.06419337, + "auxiliary_loss_mlp": 0.01267418, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6900946941229521, + "flos": 23439836833920.0, + "grad_norm": 2.3323261899860386, + "language_loss": 0.68125427, + "learning_rate": 9.255330864847313e-07, + "loss": 0.75812185, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11737061, + "step": 11478, + "time_per_iteration": 4.033671855926514 + }, + { + "auxiliary_loss_clip": 0.06415287, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06275256, + "balance_loss_mlp": 0.01255469, + "epoch": 0.69015481737562, + "flos": 17825592441600.0, + "grad_norm": 2.187140386680911, + "language_loss": 0.76715493, + "learning_rate": 9.252046217963843e-07, + "loss": 0.84396803, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10565186, + "step": 11479, + "time_per_iteration": 2.507310390472412 + }, + { + "auxiliary_loss_clip": 0.06417705, + "auxiliary_loss_mlp": 0.0126466, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01253084, + "epoch": 0.690214940628288, + "flos": 17462147356800.0, + "grad_norm": 1.7422547235207548, + "language_loss": 0.78936756, + "learning_rate": 9.248761978643856e-07, + "loss": 0.86619121, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11584473, + "step": 11480, + "time_per_iteration": 2.4853224754333496 + }, + { + "auxiliary_loss_clip": 0.06408322, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271941, + "balance_loss_mlp": 0.01256685, + "epoch": 0.6902750638809559, + "flos": 29573847302400.0, + "grad_norm": 1.6397986809458904, + "language_loss": 0.75654733, + "learning_rate": 9.245478147011885e-07, + "loss": 0.83330619, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10870361, + "step": 11481, + "time_per_iteration": 2.557511806488037 + }, + { + "auxiliary_loss_clip": 0.06409919, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_clip": 0.06274407, + "balance_loss_mlp": 0.01257151, + "epoch": 0.690335187133624, + "flos": 25564253725440.0, + "grad_norm": 1.7034098487881468, + "language_loss": 0.69767886, + "learning_rate": 9.24219472319246e-07, + "loss": 0.77445447, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 11482, + "time_per_iteration": 2.52620267868042 + }, + { + "auxiliary_loss_clip": 0.06410135, + "auxiliary_loss_mlp": 0.01265009, + "balance_loss_clip": 0.06271818, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6903953103862919, + "flos": 22494418416000.0, + "grad_norm": 1.3936382068363662, + "language_loss": 0.82645047, + "learning_rate": 9.238911707310096e-07, + "loss": 0.90320188, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10980225, + "step": 11483, + "time_per_iteration": 3.9243674278259277 + }, + { + "auxiliary_loss_clip": 0.06413989, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06273346, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6904554336389599, + "flos": 26107094401920.0, + "grad_norm": 1.7789545949672325, + "language_loss": 0.65774268, + "learning_rate": 9.235629099489273e-07, + "loss": 0.73452371, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09918213, + "step": 11484, + "time_per_iteration": 2.570255994796753 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01267989, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01257838, + "epoch": 0.6905155568916278, + "flos": 31179127023360.0, + "grad_norm": 1.529832254030816, + "language_loss": 0.73510063, + "learning_rate": 9.232346899854479e-07, + "loss": 0.81185901, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11485, + "time_per_iteration": 2.6148314476013184 + }, + { + "auxiliary_loss_clip": 0.06415319, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01255863, + "epoch": 0.6905756801442958, + "flos": 17645484090240.0, + "grad_norm": 1.7447168149804075, + "language_loss": 0.85063231, + "learning_rate": 9.22906510853017e-07, + "loss": 0.92745095, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10687256, + "step": 11486, + "time_per_iteration": 2.5396366119384766 + }, + { + "auxiliary_loss_clip": 0.06414411, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01254071, + "epoch": 0.6906358033969637, + "flos": 22349836995840.0, + "grad_norm": 1.4442882109961312, + "language_loss": 0.73110938, + "learning_rate": 9.225783725640786e-07, + "loss": 0.8078993, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 11487, + "time_per_iteration": 2.5067358016967773 + }, + { + "auxiliary_loss_clip": 0.06322645, + "auxiliary_loss_mlp": 0.01254949, + "balance_loss_clip": 0.06266931, + "balance_loss_mlp": 0.01253606, + "epoch": 0.6906959266496318, + "flos": 69769485573120.0, + "grad_norm": 0.8802440439282012, + "language_loss": 0.66566062, + "learning_rate": 9.222502751310759e-07, + "loss": 0.74143648, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01345062, + "step": 11488, + "time_per_iteration": 3.1760408878326416 + }, + { + "auxiliary_loss_clip": 0.06420241, + "auxiliary_loss_mlp": 0.01268855, + "balance_loss_clip": 0.06275697, + "balance_loss_mlp": 0.01256773, + "epoch": 0.6907560499022997, + "flos": 21440700195840.0, + "grad_norm": 1.9049138044907, + "language_loss": 0.75416613, + "learning_rate": 9.219222185664519e-07, + "loss": 0.83105707, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12072754, + "step": 11489, + "time_per_iteration": 2.515700578689575 + }, + { + "auxiliary_loss_clip": 0.06413751, + "auxiliary_loss_mlp": 0.01269098, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.01257862, + "epoch": 0.6908161731549677, + "flos": 14397427146240.0, + "grad_norm": 2.0018253870073806, + "language_loss": 0.62274224, + "learning_rate": 9.215942028826445e-07, + "loss": 0.69957072, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11236572, + "step": 11490, + "time_per_iteration": 2.532935857772827 + }, + { + "auxiliary_loss_clip": 0.06417898, + "auxiliary_loss_mlp": 0.01266366, + "balance_loss_clip": 0.06278036, + "balance_loss_mlp": 0.01255911, + "epoch": 0.6908762964076357, + "flos": 20017122053760.0, + "grad_norm": 1.8130615922920168, + "language_loss": 0.73057532, + "learning_rate": 9.212662280920937e-07, + "loss": 0.80741799, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10455322, + "step": 11491, + "time_per_iteration": 2.521466016769409 + }, + { + "auxiliary_loss_clip": 0.0640818, + "auxiliary_loss_mlp": 0.0126409, + "balance_loss_clip": 0.06273587, + "balance_loss_mlp": 0.01253117, + "epoch": 0.6909364196603036, + "flos": 28776951446400.0, + "grad_norm": 1.7336299759284137, + "language_loss": 0.7042138, + "learning_rate": 9.20938294207235e-07, + "loss": 0.78093648, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10968018, + "step": 11492, + "time_per_iteration": 2.585730791091919 + }, + { + "auxiliary_loss_clip": 0.06420228, + "auxiliary_loss_mlp": 0.01266161, + "balance_loss_clip": 0.0627589, + "balance_loss_mlp": 0.01255545, + "epoch": 0.6909965429129716, + "flos": 22534641175680.0, + "grad_norm": 1.7712531915598577, + "language_loss": 0.7470516, + "learning_rate": 9.206104012405049e-07, + "loss": 0.82391548, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1060791, + "step": 11493, + "time_per_iteration": 2.5050244331359863 + }, + { + "auxiliary_loss_clip": 0.06412148, + "auxiliary_loss_mlp": 0.01265374, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6910566661656395, + "flos": 18411884259840.0, + "grad_norm": 1.6258065693735415, + "language_loss": 0.74673963, + "learning_rate": 9.20282549204336e-07, + "loss": 0.82351482, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.1116333, + "step": 11494, + "time_per_iteration": 2.5276567935943604 + }, + { + "auxiliary_loss_clip": 0.06411964, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06274857, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6911167894183076, + "flos": 30781874016000.0, + "grad_norm": 1.529019816420153, + "language_loss": 0.68227768, + "learning_rate": 9.19954738111161e-07, + "loss": 0.75907087, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10101318, + "step": 11495, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06411652, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06274678, + "balance_loss_mlp": 0.01256863, + "epoch": 0.6911769126709755, + "flos": 13740878079360.0, + "grad_norm": 1.6566133128888745, + "language_loss": 0.74368346, + "learning_rate": 9.196269679734119e-07, + "loss": 0.82048082, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11224365, + "step": 11496, + "time_per_iteration": 2.5154151916503906 + }, + { + "auxiliary_loss_clip": 0.06410149, + "auxiliary_loss_mlp": 0.01262738, + "balance_loss_clip": 0.06274073, + "balance_loss_mlp": 0.01252987, + "epoch": 0.6912370359236435, + "flos": 17572669292160.0, + "grad_norm": 1.7205825998793636, + "language_loss": 0.80305141, + "learning_rate": 9.19299238803515e-07, + "loss": 0.87978023, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09753418, + "step": 11497, + "time_per_iteration": 2.4925076961517334 + }, + { + "auxiliary_loss_clip": 0.06416431, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06275152, + "balance_loss_mlp": 0.01256061, + "epoch": 0.6912971591763114, + "flos": 22097291189760.0, + "grad_norm": 1.653826561150034, + "language_loss": 0.8077867, + "learning_rate": 9.189715506138993e-07, + "loss": 0.88462818, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11651611, + "step": 11498, + "time_per_iteration": 2.5465574264526367 + }, + { + "auxiliary_loss_clip": 0.06408113, + "auxiliary_loss_mlp": 0.01262525, + "balance_loss_clip": 0.06274167, + "balance_loss_mlp": 0.01251701, + "epoch": 0.6913572824289794, + "flos": 29979276082560.0, + "grad_norm": 2.039776107623003, + "language_loss": 0.85973012, + "learning_rate": 9.186439034169915e-07, + "loss": 0.93643653, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10827637, + "step": 11499, + "time_per_iteration": 2.5665283203125 + }, + { + "auxiliary_loss_clip": 0.06408866, + "auxiliary_loss_mlp": 0.01265419, + "balance_loss_clip": 0.06275891, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6914174056816473, + "flos": 20455184799360.0, + "grad_norm": 1.6118393659485355, + "language_loss": 0.7559222, + "learning_rate": 9.183162972252145e-07, + "loss": 0.83266509, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10021973, + "step": 11500, + "time_per_iteration": 2.503854751586914 + }, + { + "auxiliary_loss_clip": 0.06412221, + "auxiliary_loss_mlp": 0.01266959, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6914775289343154, + "flos": 21287984929920.0, + "grad_norm": 1.8512682937239455, + "language_loss": 0.77863973, + "learning_rate": 9.179887320509921e-07, + "loss": 0.85543144, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.112854, + "step": 11501, + "time_per_iteration": 2.4953453540802 + }, + { + "auxiliary_loss_clip": 0.06417021, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01256748, + "epoch": 0.6915376521869833, + "flos": 23884859468160.0, + "grad_norm": 1.8723825147208624, + "language_loss": 0.73532307, + "learning_rate": 9.176612079067458e-07, + "loss": 0.81216794, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 11502, + "time_per_iteration": 2.5416178703308105 + }, + { + "auxiliary_loss_clip": 0.06414314, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01253993, + "epoch": 0.6915977754396513, + "flos": 11515079347200.0, + "grad_norm": 1.8781803370630783, + "language_loss": 0.73954153, + "learning_rate": 9.173337248048953e-07, + "loss": 0.81633848, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11395264, + "step": 11503, + "time_per_iteration": 2.499391794204712 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01262533, + "balance_loss_clip": 0.06271478, + "balance_loss_mlp": 0.01252233, + "epoch": 0.6916578986923193, + "flos": 22607833317120.0, + "grad_norm": 1.5988526178616205, + "language_loss": 0.77127218, + "learning_rate": 9.170062827578575e-07, + "loss": 0.84797841, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10302734, + "step": 11504, + "time_per_iteration": 3.9501583576202393 + }, + { + "auxiliary_loss_clip": 0.06413034, + "auxiliary_loss_mlp": 0.01266076, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6917180219449872, + "flos": 23484126516480.0, + "grad_norm": 1.8617681816675509, + "language_loss": 0.73855585, + "learning_rate": 9.166788817780499e-07, + "loss": 0.81534696, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11505, + "time_per_iteration": 2.5829193592071533 + }, + { + "auxiliary_loss_clip": 0.06409241, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06273368, + "balance_loss_mlp": 0.0125723, + "epoch": 0.6917781451976552, + "flos": 23739313726080.0, + "grad_norm": 1.75743437760736, + "language_loss": 0.876764, + "learning_rate": 9.163515218778886e-07, + "loss": 0.95353591, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1072998, + "step": 11506, + "time_per_iteration": 2.5154294967651367 + }, + { + "auxiliary_loss_clip": 0.06412455, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.06276374, + "balance_loss_mlp": 0.01254783, + "epoch": 0.6918382684503231, + "flos": 31474704700800.0, + "grad_norm": 2.0688391280679648, + "language_loss": 0.7024008, + "learning_rate": 9.160242030697856e-07, + "loss": 0.7791791, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.105896, + "step": 11507, + "time_per_iteration": 2.5845768451690674 + }, + { + "auxiliary_loss_clip": 0.06413335, + "auxiliary_loss_mlp": 0.01264122, + "balance_loss_clip": 0.06273569, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6918983917029912, + "flos": 21656503186560.0, + "grad_norm": 1.743467082940077, + "language_loss": 0.77142328, + "learning_rate": 9.156969253661538e-07, + "loss": 0.84819788, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10491943, + "step": 11508, + "time_per_iteration": 2.4946086406707764 + }, + { + "auxiliary_loss_clip": 0.06406476, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273084, + "balance_loss_mlp": 0.01257501, + "epoch": 0.6919585149556591, + "flos": 25556036025600.0, + "grad_norm": 1.485663055998357, + "language_loss": 0.75072491, + "learning_rate": 9.153696887794027e-07, + "loss": 0.82746202, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09735107, + "step": 11509, + "time_per_iteration": 2.591611623764038 + }, + { + "auxiliary_loss_clip": 0.06409086, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01253344, + "epoch": 0.6920186382083271, + "flos": 23666582782080.0, + "grad_norm": 1.6709622746913153, + "language_loss": 0.64358246, + "learning_rate": 9.150424933219425e-07, + "loss": 0.7203086, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10192871, + "step": 11510, + "time_per_iteration": 2.522277593612671 + }, + { + "auxiliary_loss_clip": 0.06419423, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01257938, + "epoch": 0.692078761460995, + "flos": 19067888275200.0, + "grad_norm": 1.58502931536568, + "language_loss": 0.75757432, + "learning_rate": 9.147153390061788e-07, + "loss": 0.83446282, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1149292, + "step": 11511, + "time_per_iteration": 2.5163841247558594 + }, + { + "auxiliary_loss_clip": 0.06410709, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.06275946, + "balance_loss_mlp": 0.01254482, + "epoch": 0.692138884713663, + "flos": 29031006625920.0, + "grad_norm": 1.5915143740912923, + "language_loss": 0.62864697, + "learning_rate": 9.143882258445184e-07, + "loss": 0.70539832, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 11512, + "time_per_iteration": 2.5597567558288574 + }, + { + "auxiliary_loss_clip": 0.06413583, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06275637, + "balance_loss_mlp": 0.01257323, + "epoch": 0.6921990079663309, + "flos": 14763262072320.0, + "grad_norm": 2.1370127100150373, + "language_loss": 0.83359182, + "learning_rate": 9.140611538493666e-07, + "loss": 0.91040647, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10559082, + "step": 11513, + "time_per_iteration": 2.5295650959014893 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253614, + "epoch": 0.692259131218999, + "flos": 23848619777280.0, + "grad_norm": 1.3335195335102994, + "language_loss": 0.78370172, + "learning_rate": 9.137341230331233e-07, + "loss": 0.86040014, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09466553, + "step": 11514, + "time_per_iteration": 2.5325093269348145 + }, + { + "auxiliary_loss_clip": 0.06413436, + "auxiliary_loss_mlp": 0.01264156, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.0125323, + "epoch": 0.6923192544716669, + "flos": 19141038489600.0, + "grad_norm": 1.7641312985276416, + "language_loss": 0.7541517, + "learning_rate": 9.134071334081907e-07, + "loss": 0.83092761, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10919189, + "step": 11515, + "time_per_iteration": 2.4964303970336914 + }, + { + "auxiliary_loss_clip": 0.06405345, + "auxiliary_loss_mlp": 0.01265608, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6923793777243349, + "flos": 28082192117760.0, + "grad_norm": 1.899911587445346, + "language_loss": 0.53861475, + "learning_rate": 9.130801849869694e-07, + "loss": 0.61532426, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10003662, + "step": 11516, + "time_per_iteration": 3.975773811340332 + }, + { + "auxiliary_loss_clip": 0.06402789, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06273137, + "balance_loss_mlp": 0.01258812, + "epoch": 0.6924395009770029, + "flos": 16586818479360.0, + "grad_norm": 1.754197992941401, + "language_loss": 0.73113155, + "learning_rate": 9.127532777818557e-07, + "loss": 0.80785251, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.1048584, + "step": 11517, + "time_per_iteration": 2.5128793716430664 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01270737, + "balance_loss_clip": 0.06275631, + "balance_loss_mlp": 0.01260223, + "epoch": 0.6924996242296708, + "flos": 16661058796800.0, + "grad_norm": 1.5645702983922471, + "language_loss": 0.76377338, + "learning_rate": 9.124264118052465e-07, + "loss": 0.84061527, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10510254, + "step": 11518, + "time_per_iteration": 4.030726432800293 + }, + { + "auxiliary_loss_clip": 0.06418861, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.06276505, + "balance_loss_mlp": 0.01260065, + "epoch": 0.6925597474823388, + "flos": 34763277893760.0, + "grad_norm": 1.2922865476436283, + "language_loss": 0.64748263, + "learning_rate": 9.120995870695376e-07, + "loss": 0.72438884, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11712646, + "step": 11519, + "time_per_iteration": 2.6468279361724854 + }, + { + "auxiliary_loss_clip": 0.06410517, + "auxiliary_loss_mlp": 0.01266916, + "balance_loss_clip": 0.06272532, + "balance_loss_mlp": 0.01255746, + "epoch": 0.6926198707350067, + "flos": 21878175962880.0, + "grad_norm": 1.754829284599123, + "language_loss": 0.62671852, + "learning_rate": 9.117728035871212e-07, + "loss": 0.70349276, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1116333, + "step": 11520, + "time_per_iteration": 2.6443254947662354 + }, + { + "auxiliary_loss_clip": 0.06421007, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01259104, + "epoch": 0.6926799939876748, + "flos": 13011346506240.0, + "grad_norm": 1.8045037459633815, + "language_loss": 0.78247267, + "learning_rate": 9.114460613703887e-07, + "loss": 0.85938519, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11151123, + "step": 11521, + "time_per_iteration": 2.540693521499634 + }, + { + "auxiliary_loss_clip": 0.0641452, + "auxiliary_loss_mlp": 0.0126495, + "balance_loss_clip": 0.06273233, + "balance_loss_mlp": 0.0125356, + "epoch": 0.6927401172403427, + "flos": 16766423706240.0, + "grad_norm": 1.8333636519131566, + "language_loss": 0.82234508, + "learning_rate": 9.111193604317304e-07, + "loss": 0.89913976, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11383057, + "step": 11522, + "time_per_iteration": 3.9248740673065186 + }, + { + "auxiliary_loss_clip": 0.06410085, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01254013, + "epoch": 0.6928002404930107, + "flos": 25713237484800.0, + "grad_norm": 1.543280654363121, + "language_loss": 0.77247906, + "learning_rate": 9.107927007835361e-07, + "loss": 0.84922481, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10479736, + "step": 11523, + "time_per_iteration": 2.6300647258758545 + }, + { + "auxiliary_loss_clip": 0.0640799, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06273483, + "balance_loss_mlp": 0.01255227, + "epoch": 0.6928603637456786, + "flos": 18594214744320.0, + "grad_norm": 1.7989990955818747, + "language_loss": 0.68682468, + "learning_rate": 9.104660824381915e-07, + "loss": 0.76355332, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09637451, + "step": 11524, + "time_per_iteration": 2.4765005111694336 + }, + { + "auxiliary_loss_clip": 0.06415472, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.0125385, + "epoch": 0.6929204869983466, + "flos": 22207519635840.0, + "grad_norm": 1.775837201090113, + "language_loss": 0.64731717, + "learning_rate": 9.101395054080815e-07, + "loss": 0.72412294, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1126709, + "step": 11525, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06416623, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279063, + "balance_loss_mlp": 0.01258568, + "epoch": 0.6929806102510145, + "flos": 17900545518720.0, + "grad_norm": 2.0930840901881007, + "language_loss": 0.70522892, + "learning_rate": 9.098129697055907e-07, + "loss": 0.78208423, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10351562, + "step": 11526, + "time_per_iteration": 2.4600794315338135 + }, + { + "auxiliary_loss_clip": 0.06409934, + "auxiliary_loss_mlp": 0.01263712, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253186, + "epoch": 0.6930407335036826, + "flos": 19761222084480.0, + "grad_norm": 1.7010928543667516, + "language_loss": 0.76265514, + "learning_rate": 9.094864753431022e-07, + "loss": 0.83939159, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10516357, + "step": 11527, + "time_per_iteration": 2.5164694786071777 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01253149, + "epoch": 0.6931008567563505, + "flos": 21550802860800.0, + "grad_norm": 1.5438747158568011, + "language_loss": 0.79877269, + "learning_rate": 9.091600223329952e-07, + "loss": 0.87552267, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 11528, + "time_per_iteration": 2.501044988632202 + }, + { + "auxiliary_loss_clip": 0.06405636, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01256917, + "epoch": 0.6931609800090185, + "flos": 26257210191360.0, + "grad_norm": 1.3083455635421857, + "language_loss": 0.75950116, + "learning_rate": 9.088336106876491e-07, + "loss": 0.83622813, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10144043, + "step": 11529, + "time_per_iteration": 2.5608596801757812 + }, + { + "auxiliary_loss_clip": 0.06410852, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276192, + "balance_loss_mlp": 0.01254961, + "epoch": 0.6932211032616865, + "flos": 32351626805760.0, + "grad_norm": 2.07531682890069, + "language_loss": 0.73131585, + "learning_rate": 9.085072404194436e-07, + "loss": 0.80807638, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10241699, + "step": 11530, + "time_per_iteration": 2.5931029319763184 + }, + { + "auxiliary_loss_clip": 0.06423162, + "auxiliary_loss_mlp": 0.01267459, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.0125598, + "epoch": 0.6932812265143544, + "flos": 22054720515840.0, + "grad_norm": 1.8331163383956572, + "language_loss": 0.78110623, + "learning_rate": 9.081809115407513e-07, + "loss": 0.85801244, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11474609, + "step": 11531, + "time_per_iteration": 2.537781000137329 + }, + { + "auxiliary_loss_clip": 0.06406952, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01256092, + "epoch": 0.6933413497670224, + "flos": 26264924766720.0, + "grad_norm": 1.4723585148230005, + "language_loss": 0.69516993, + "learning_rate": 9.078546240639484e-07, + "loss": 0.77190006, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09973145, + "step": 11532, + "time_per_iteration": 2.6068294048309326 + }, + { + "auxiliary_loss_clip": 0.06414198, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6934014730196904, + "flos": 19579059308160.0, + "grad_norm": 1.68179431170249, + "language_loss": 0.66939062, + "learning_rate": 9.075283780014082e-07, + "loss": 0.74618644, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11358643, + "step": 11533, + "time_per_iteration": 2.5188937187194824 + }, + { + "auxiliary_loss_clip": 0.06414025, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06274263, + "balance_loss_mlp": 0.01254892, + "epoch": 0.6934615962723584, + "flos": 22124432712960.0, + "grad_norm": 2.2635878062852384, + "language_loss": 0.59154713, + "learning_rate": 9.072021733655007e-07, + "loss": 0.66835076, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11456299, + "step": 11534, + "time_per_iteration": 2.513169288635254 + }, + { + "auxiliary_loss_clip": 0.06412862, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06276149, + "balance_loss_mlp": 0.01255639, + "epoch": 0.6935217195250263, + "flos": 21367172638080.0, + "grad_norm": 2.468732709113743, + "language_loss": 0.71063632, + "learning_rate": 9.068760101685971e-07, + "loss": 0.78742403, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10266113, + "step": 11535, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.0632171, + "auxiliary_loss_mlp": 0.012535, + "balance_loss_clip": 0.06265885, + "balance_loss_mlp": 0.01252321, + "epoch": 0.6935818427776943, + "flos": 64085864400000.0, + "grad_norm": 0.6899850160451471, + "language_loss": 0.58968407, + "learning_rate": 9.065498884230638e-07, + "loss": 0.66543621, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11536, + "time_per_iteration": 3.2811362743377686 + }, + { + "auxiliary_loss_clip": 0.06415699, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01255628, + "epoch": 0.6936419660303622, + "flos": 20308716662400.0, + "grad_norm": 1.4806055752543272, + "language_loss": 0.72754341, + "learning_rate": 9.062238081412692e-07, + "loss": 0.80436242, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 11537, + "time_per_iteration": 2.521667242050171 + }, + { + "auxiliary_loss_clip": 0.06322287, + "auxiliary_loss_mlp": 0.01253211, + "balance_loss_clip": 0.06266545, + "balance_loss_mlp": 0.01252035, + "epoch": 0.6937020892830302, + "flos": 67201974691200.0, + "grad_norm": 0.7781896456354132, + "language_loss": 0.5562225, + "learning_rate": 9.058977693355767e-07, + "loss": 0.63197744, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01173401, + "step": 11538, + "time_per_iteration": 3.133890390396118 + }, + { + "auxiliary_loss_clip": 0.06402846, + "auxiliary_loss_mlp": 0.01263458, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253844, + "epoch": 0.6937622125356981, + "flos": 23884943322240.0, + "grad_norm": 1.4430233846230829, + "language_loss": 0.7770322, + "learning_rate": 9.055717720183505e-07, + "loss": 0.85369527, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09613037, + "step": 11539, + "time_per_iteration": 2.5152971744537354 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01262731, + "balance_loss_clip": 0.06274487, + "balance_loss_mlp": 0.01252664, + "epoch": 0.6938223357883662, + "flos": 28738154206080.0, + "grad_norm": 1.7708768043043424, + "language_loss": 0.64184511, + "learning_rate": 9.05245816201953e-07, + "loss": 0.71855795, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10070801, + "step": 11540, + "time_per_iteration": 2.5849952697753906 + }, + { + "auxiliary_loss_clip": 0.06409811, + "auxiliary_loss_mlp": 0.01263592, + "balance_loss_clip": 0.06274833, + "balance_loss_mlp": 0.01254288, + "epoch": 0.6938824590410341, + "flos": 28662111025920.0, + "grad_norm": 1.4340903998261632, + "language_loss": 0.87096, + "learning_rate": 9.049199018987437e-07, + "loss": 0.94769406, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09301758, + "step": 11541, + "time_per_iteration": 2.5415987968444824 + }, + { + "auxiliary_loss_clip": 0.06411604, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06272925, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6939425822937021, + "flos": 18987987807360.0, + "grad_norm": 1.6079825627082245, + "language_loss": 0.84464371, + "learning_rate": 9.04594029121081e-07, + "loss": 0.92141145, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10418701, + "step": 11542, + "time_per_iteration": 2.499424457550049 + }, + { + "auxiliary_loss_clip": 0.06415489, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275496, + "balance_loss_mlp": 0.01254136, + "epoch": 0.6940027055463701, + "flos": 23082513096960.0, + "grad_norm": 1.8518042954467828, + "language_loss": 0.75316143, + "learning_rate": 9.04268197881323e-07, + "loss": 0.82996696, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10931396, + "step": 11543, + "time_per_iteration": 3.9085495471954346 + }, + { + "auxiliary_loss_clip": 0.06410378, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06273862, + "balance_loss_mlp": 0.01255373, + "epoch": 0.694062828799038, + "flos": 18192391689600.0, + "grad_norm": 1.648222513312388, + "language_loss": 0.76331246, + "learning_rate": 9.039424081918241e-07, + "loss": 0.84007609, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10614014, + "step": 11544, + "time_per_iteration": 2.5347986221313477 + }, + { + "auxiliary_loss_clip": 0.06413911, + "auxiliary_loss_mlp": 0.0126496, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.012541, + "epoch": 0.694122952051706, + "flos": 17827269523200.0, + "grad_norm": 1.8058959765981615, + "language_loss": 0.71283519, + "learning_rate": 9.036166600649388e-07, + "loss": 0.78962398, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10864258, + "step": 11545, + "time_per_iteration": 2.4718210697174072 + }, + { + "auxiliary_loss_clip": 0.06407937, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01253039, + "epoch": 0.694183075304374, + "flos": 21221710750080.0, + "grad_norm": 1.516472070644587, + "language_loss": 0.79896855, + "learning_rate": 9.0329095351302e-07, + "loss": 0.87567645, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09814453, + "step": 11546, + "time_per_iteration": 2.5148062705993652 + }, + { + "auxiliary_loss_clip": 0.06411743, + "auxiliary_loss_mlp": 0.01267153, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01256281, + "epoch": 0.694243198557042, + "flos": 24067273806720.0, + "grad_norm": 1.4558199270771826, + "language_loss": 0.7883184, + "learning_rate": 9.029652885484194e-07, + "loss": 0.8651073, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10870361, + "step": 11547, + "time_per_iteration": 2.5461182594299316 + }, + { + "auxiliary_loss_clip": 0.06409074, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275046, + "balance_loss_mlp": 0.01256845, + "epoch": 0.6943033218097099, + "flos": 21148183192320.0, + "grad_norm": 2.180775706849967, + "language_loss": 0.80900609, + "learning_rate": 9.026396651834834e-07, + "loss": 0.88576972, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 11548, + "time_per_iteration": 2.499633312225342 + }, + { + "auxiliary_loss_clip": 0.06316315, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.06260554, + "balance_loss_mlp": 0.01250445, + "epoch": 0.6943634450623779, + "flos": 57830892163200.0, + "grad_norm": 0.8127275261655555, + "language_loss": 0.53539848, + "learning_rate": 9.023140834305613e-07, + "loss": 0.61107814, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01203918, + "step": 11549, + "time_per_iteration": 3.1340725421905518 + }, + { + "auxiliary_loss_clip": 0.06409207, + "auxiliary_loss_mlp": 0.01267856, + "balance_loss_clip": 0.0627339, + "balance_loss_mlp": 0.01256924, + "epoch": 0.6944235683150458, + "flos": 30598411501440.0, + "grad_norm": 1.3218169673539149, + "language_loss": 0.73849893, + "learning_rate": 9.01988543302e-07, + "loss": 0.81526959, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.109375, + "step": 11550, + "time_per_iteration": 2.5708651542663574 + }, + { + "auxiliary_loss_clip": 0.06414837, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01255836, + "epoch": 0.6944836915677138, + "flos": 19725611299200.0, + "grad_norm": 2.422306593837277, + "language_loss": 0.7436735, + "learning_rate": 9.016630448101425e-07, + "loss": 0.82049412, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11364746, + "step": 11551, + "time_per_iteration": 2.527280807495117 + }, + { + "auxiliary_loss_clip": 0.06412678, + "auxiliary_loss_mlp": 0.01266399, + "balance_loss_clip": 0.06274699, + "balance_loss_mlp": 0.01255592, + "epoch": 0.6945438148203817, + "flos": 24870542572800.0, + "grad_norm": 1.4976139060418592, + "language_loss": 0.84468353, + "learning_rate": 9.01337587967333e-07, + "loss": 0.92147428, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10797119, + "step": 11552, + "time_per_iteration": 2.5304994583129883 + }, + { + "auxiliary_loss_clip": 0.06412995, + "auxiliary_loss_mlp": 0.01266444, + "balance_loss_clip": 0.06275281, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6946039380730498, + "flos": 33334752360960.0, + "grad_norm": 1.8566044703469122, + "language_loss": 0.67553848, + "learning_rate": 9.010121727859117e-07, + "loss": 0.75233287, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10656738, + "step": 11553, + "time_per_iteration": 2.6192421913146973 + }, + { + "auxiliary_loss_clip": 0.064182, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6946640613257177, + "flos": 20857385197440.0, + "grad_norm": 1.702671495962781, + "language_loss": 0.79674661, + "learning_rate": 9.006867992782195e-07, + "loss": 0.87358326, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11138916, + "step": 11554, + "time_per_iteration": 2.486833095550537 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01256064, + "epoch": 0.6947241845783857, + "flos": 19360992257280.0, + "grad_norm": 2.4583328560659825, + "language_loss": 0.72664356, + "learning_rate": 9.003614674565934e-07, + "loss": 0.80342329, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10498047, + "step": 11555, + "time_per_iteration": 4.000531196594238 + }, + { + "auxiliary_loss_clip": 0.0640734, + "auxiliary_loss_mlp": 0.01264698, + "balance_loss_clip": 0.0627168, + "balance_loss_mlp": 0.01254404, + "epoch": 0.6947843078310536, + "flos": 27126669283200.0, + "grad_norm": 1.6806828217534537, + "language_loss": 0.78220618, + "learning_rate": 9.000361773333705e-07, + "loss": 0.85892653, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 11556, + "time_per_iteration": 2.5366411209106445 + }, + { + "auxiliary_loss_clip": 0.06412055, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01254198, + "epoch": 0.6948444310837216, + "flos": 28592692318080.0, + "grad_norm": 2.2663636290746205, + "language_loss": 0.60655725, + "learning_rate": 8.997109289208869e-07, + "loss": 0.68332362, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10394287, + "step": 11557, + "time_per_iteration": 2.5730667114257812 + }, + { + "auxiliary_loss_clip": 0.06406298, + "auxiliary_loss_mlp": 0.0126677, + "balance_loss_clip": 0.06273069, + "balance_loss_mlp": 0.01256923, + "epoch": 0.6949045543363896, + "flos": 15674704859520.0, + "grad_norm": 1.6481144158645147, + "language_loss": 0.85564643, + "learning_rate": 8.993857222314752e-07, + "loss": 0.9323771, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09851074, + "step": 11558, + "time_per_iteration": 3.9160499572753906 + }, + { + "auxiliary_loss_clip": 0.06415498, + "auxiliary_loss_mlp": 0.01268636, + "balance_loss_clip": 0.06274904, + "balance_loss_mlp": 0.01257764, + "epoch": 0.6949646775890576, + "flos": 23266311027840.0, + "grad_norm": 1.591782165805242, + "language_loss": 0.70581871, + "learning_rate": 8.990605572774664e-07, + "loss": 0.78266007, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10876465, + "step": 11559, + "time_per_iteration": 2.527818441390991 + }, + { + "auxiliary_loss_clip": 0.06411439, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06274717, + "balance_loss_mlp": 0.01256946, + "epoch": 0.6950248008417256, + "flos": 22389095433600.0, + "grad_norm": 1.4072009263276422, + "language_loss": 0.78738344, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8641715, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 11560, + "time_per_iteration": 2.5627846717834473 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01255614, + "epoch": 0.6950849240943935, + "flos": 23484126516480.0, + "grad_norm": 1.4947787442240967, + "language_loss": 0.76889873, + "learning_rate": 8.9841035262498e-07, + "loss": 0.84563088, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09307861, + "step": 11561, + "time_per_iteration": 2.4997048377990723 + }, + { + "auxiliary_loss_clip": 0.06411804, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.012589, + "epoch": 0.6951450473470615, + "flos": 17426285009280.0, + "grad_norm": 1.734417047783141, + "language_loss": 0.78360051, + "learning_rate": 8.980853129511577e-07, + "loss": 0.86041546, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10784912, + "step": 11562, + "time_per_iteration": 3.868687868118286 + }, + { + "auxiliary_loss_clip": 0.06413691, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274996, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6952051705997294, + "flos": 20492053395840.0, + "grad_norm": 2.791172268200526, + "language_loss": 0.69210434, + "learning_rate": 8.977603150620515e-07, + "loss": 0.76889294, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10656738, + "step": 11563, + "time_per_iteration": 2.521984338760376 + }, + { + "auxiliary_loss_clip": 0.0640626, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01255006, + "epoch": 0.6952652938523974, + "flos": 13994472061440.0, + "grad_norm": 2.2938813143699943, + "language_loss": 0.73795921, + "learning_rate": 8.974353589699846e-07, + "loss": 0.81467056, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09869385, + "step": 11564, + "time_per_iteration": 2.454090118408203 + }, + { + "auxiliary_loss_clip": 0.06431751, + "auxiliary_loss_mlp": 0.01272001, + "balance_loss_clip": 0.06280031, + "balance_loss_mlp": 0.01259174, + "epoch": 0.6953254171050653, + "flos": 30961479242880.0, + "grad_norm": 1.9156541387809913, + "language_loss": 0.71630907, + "learning_rate": 8.971104446872785e-07, + "loss": 0.79334664, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12823486, + "step": 11565, + "time_per_iteration": 2.6339352130889893 + }, + { + "auxiliary_loss_clip": 0.06312925, + "auxiliary_loss_mlp": 0.01254517, + "balance_loss_clip": 0.0625705, + "balance_loss_mlp": 0.01253326, + "epoch": 0.6953855403577334, + "flos": 61688231671680.0, + "grad_norm": 0.9056621867794188, + "language_loss": 0.58358586, + "learning_rate": 8.96785572226255e-07, + "loss": 0.65926027, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01189423, + "step": 11566, + "time_per_iteration": 2.9703423976898193 + }, + { + "auxiliary_loss_clip": 0.0641438, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273914, + "balance_loss_mlp": 0.01254237, + "epoch": 0.6954456636104013, + "flos": 23045644500480.0, + "grad_norm": 1.741502187715767, + "language_loss": 0.74213183, + "learning_rate": 8.964607415992338e-07, + "loss": 0.81893462, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11663818, + "step": 11567, + "time_per_iteration": 2.5282747745513916 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01264668, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.0125382, + "epoch": 0.6955057868630693, + "flos": 23925920768640.0, + "grad_norm": 1.2088897193849768, + "language_loss": 0.76795661, + "learning_rate": 8.961359528185313e-07, + "loss": 0.84470242, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10858154, + "step": 11568, + "time_per_iteration": 2.555664300918579 + }, + { + "auxiliary_loss_clip": 0.06409561, + "auxiliary_loss_mlp": 0.01267134, + "balance_loss_clip": 0.06274664, + "balance_loss_mlp": 0.01257567, + "epoch": 0.6955659101157372, + "flos": 22600076814720.0, + "grad_norm": 2.0811162561190444, + "language_loss": 0.72560644, + "learning_rate": 8.958112058964649e-07, + "loss": 0.80237341, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09570312, + "step": 11569, + "time_per_iteration": 2.550203323364258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6956260333684052, + "flos": 24579576869760.0, + "grad_norm": 1.4598042665233286, + "language_loss": 0.77169657, + "learning_rate": 8.954865008453471e-07, + "loss": 0.84849441, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10736084, + "step": 11570, + "time_per_iteration": 2.5227878093719482 + }, + { + "auxiliary_loss_clip": 0.06413926, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6956861566210732, + "flos": 25852745733120.0, + "grad_norm": 1.7591175950059927, + "language_loss": 0.7487582, + "learning_rate": 8.95161837677493e-07, + "loss": 0.82555479, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10284424, + "step": 11571, + "time_per_iteration": 2.597681999206543 + }, + { + "auxiliary_loss_clip": 0.06403409, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.062727, + "balance_loss_mlp": 0.01253241, + "epoch": 0.6957462798737412, + "flos": 15306270456960.0, + "grad_norm": 1.6743829197171876, + "language_loss": 0.74611163, + "learning_rate": 8.948372164052118e-07, + "loss": 0.8227759, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09771729, + "step": 11572, + "time_per_iteration": 2.479717254638672 + }, + { + "auxiliary_loss_clip": 0.06411865, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6958064031264092, + "flos": 36255645838080.0, + "grad_norm": 1.9177386659246018, + "language_loss": 0.70336205, + "learning_rate": 8.94512637040814e-07, + "loss": 0.7801463, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10522461, + "step": 11573, + "time_per_iteration": 2.646585702896118 + }, + { + "auxiliary_loss_clip": 0.064174, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254935, + "epoch": 0.6958665263790771, + "flos": 19214817609600.0, + "grad_norm": 1.6543405774844155, + "language_loss": 0.75180942, + "learning_rate": 8.941880995966095e-07, + "loss": 0.82864642, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11364746, + "step": 11574, + "time_per_iteration": 2.5017471313476562 + }, + { + "auxiliary_loss_clip": 0.06413898, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01254996, + "epoch": 0.6959266496317451, + "flos": 21801797366400.0, + "grad_norm": 1.6788443251259586, + "language_loss": 0.74745572, + "learning_rate": 8.938636040849014e-07, + "loss": 0.8242479, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10327148, + "step": 11575, + "time_per_iteration": 2.5528361797332764 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01258248, + "epoch": 0.695986772884413, + "flos": 20564490850560.0, + "grad_norm": 1.717283083984882, + "language_loss": 0.79060346, + "learning_rate": 8.935391505179966e-07, + "loss": 0.86738789, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10845947, + "step": 11576, + "time_per_iteration": 2.4801833629608154 + }, + { + "auxiliary_loss_clip": 0.06413432, + "auxiliary_loss_mlp": 0.01262741, + "balance_loss_clip": 0.06272326, + "balance_loss_mlp": 0.01252191, + "epoch": 0.696046896137081, + "flos": 14940980582400.0, + "grad_norm": 2.5670489052023404, + "language_loss": 0.57032454, + "learning_rate": 8.932147389081985e-07, + "loss": 0.64708626, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10559082, + "step": 11577, + "time_per_iteration": 2.502033233642578 + }, + { + "auxiliary_loss_clip": 0.06404924, + "auxiliary_loss_mlp": 0.01266503, + "balance_loss_clip": 0.06274053, + "balance_loss_mlp": 0.01257521, + "epoch": 0.696107019389749, + "flos": 30748569217920.0, + "grad_norm": 1.378295678041548, + "language_loss": 0.76719046, + "learning_rate": 8.928903692678081e-07, + "loss": 0.84390473, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08984375, + "step": 11578, + "time_per_iteration": 2.605837821960449 + }, + { + "auxiliary_loss_clip": 0.06414018, + "auxiliary_loss_mlp": 0.01262965, + "balance_loss_clip": 0.0627658, + "balance_loss_mlp": 0.01253249, + "epoch": 0.696167142642417, + "flos": 20782935244800.0, + "grad_norm": 3.119426120413718, + "language_loss": 0.79773849, + "learning_rate": 8.925660416091254e-07, + "loss": 0.87450832, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09716797, + "step": 11579, + "time_per_iteration": 2.5537924766540527 + }, + { + "auxiliary_loss_clip": 0.06405934, + "auxiliary_loss_mlp": 0.01263768, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01253558, + "epoch": 0.6962272658950849, + "flos": 22571761334400.0, + "grad_norm": 1.5861987374843416, + "language_loss": 0.72813702, + "learning_rate": 8.922417559444502e-07, + "loss": 0.80483407, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10205078, + "step": 11580, + "time_per_iteration": 2.5217056274414062 + }, + { + "auxiliary_loss_clip": 0.0641515, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06275546, + "balance_loss_mlp": 0.01255896, + "epoch": 0.6962873891477529, + "flos": 22206681095040.0, + "grad_norm": 2.1085212775747975, + "language_loss": 0.66371673, + "learning_rate": 8.919175122860787e-07, + "loss": 0.74054492, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11767578, + "step": 11581, + "time_per_iteration": 2.5470681190490723 + }, + { + "auxiliary_loss_clip": 0.06415606, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06278277, + "balance_loss_mlp": 0.01253726, + "epoch": 0.6963475124004208, + "flos": 12493718709120.0, + "grad_norm": 3.192459541289618, + "language_loss": 0.76738924, + "learning_rate": 8.915933106463056e-07, + "loss": 0.84417772, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09509277, + "step": 11582, + "time_per_iteration": 2.5975067615509033 + }, + { + "auxiliary_loss_clip": 0.06411912, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.01256355, + "epoch": 0.6964076356530888, + "flos": 17170762383360.0, + "grad_norm": 2.14882454800848, + "language_loss": 0.70161986, + "learning_rate": 8.91269151037425e-07, + "loss": 0.77840543, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10290527, + "step": 11583, + "time_per_iteration": 3.9500138759613037 + }, + { + "auxiliary_loss_clip": 0.06410628, + "auxiliary_loss_mlp": 0.01268947, + "balance_loss_clip": 0.06274879, + "balance_loss_mlp": 0.01258272, + "epoch": 0.6964677589057569, + "flos": 19943342933760.0, + "grad_norm": 1.7749969250449007, + "language_loss": 0.82683307, + "learning_rate": 8.909450334717301e-07, + "loss": 0.90362883, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10681152, + "step": 11584, + "time_per_iteration": 2.5435311794281006 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.06271736, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6965278821584248, + "flos": 22790708853120.0, + "grad_norm": 2.098465309846489, + "language_loss": 0.79802585, + "learning_rate": 8.906209579615107e-07, + "loss": 0.87481719, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1126709, + "step": 11585, + "time_per_iteration": 2.490299701690674 + }, + { + "auxiliary_loss_clip": 0.06406368, + "auxiliary_loss_mlp": 0.01265153, + "balance_loss_clip": 0.06273674, + "balance_loss_mlp": 0.01255735, + "epoch": 0.6965880054110928, + "flos": 20053739088000.0, + "grad_norm": 1.7604905238703683, + "language_loss": 0.77940738, + "learning_rate": 8.90296924519055e-07, + "loss": 0.85612255, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09411621, + "step": 11586, + "time_per_iteration": 2.5373406410217285 + }, + { + "auxiliary_loss_clip": 0.06404427, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06273477, + "balance_loss_mlp": 0.0125706, + "epoch": 0.6966481286637607, + "flos": 21914709143040.0, + "grad_norm": 1.8539557700987637, + "language_loss": 0.78935838, + "learning_rate": 8.899729331566519e-07, + "loss": 0.86607027, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09698486, + "step": 11587, + "time_per_iteration": 2.4801838397979736 + }, + { + "auxiliary_loss_clip": 0.06406583, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254915, + "epoch": 0.6967082519164287, + "flos": 15638674803840.0, + "grad_norm": 1.9230111566874013, + "language_loss": 0.73017895, + "learning_rate": 8.896489838865857e-07, + "loss": 0.80689335, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09936523, + "step": 11588, + "time_per_iteration": 2.488046646118164 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01262306, + "balance_loss_clip": 0.06274327, + "balance_loss_mlp": 0.01252507, + "epoch": 0.6967683751690966, + "flos": 24031453386240.0, + "grad_norm": 2.0364063263002885, + "language_loss": 0.74887639, + "learning_rate": 8.893250767211413e-07, + "loss": 0.82561255, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09802246, + "step": 11589, + "time_per_iteration": 2.548539400100708 + }, + { + "auxiliary_loss_clip": 0.06411868, + "auxiliary_loss_mlp": 0.01265329, + "balance_loss_clip": 0.06274883, + "balance_loss_mlp": 0.01254773, + "epoch": 0.6968284984217646, + "flos": 31031862272640.0, + "grad_norm": 4.3993143538672275, + "language_loss": 0.63862813, + "learning_rate": 8.890012116726012e-07, + "loss": 0.71539998, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10552979, + "step": 11590, + "time_per_iteration": 2.6050679683685303 + }, + { + "auxiliary_loss_clip": 0.06316171, + "auxiliary_loss_mlp": 0.01251394, + "balance_loss_clip": 0.06259812, + "balance_loss_mlp": 0.0125019, + "epoch": 0.6968886216744326, + "flos": 67642888475520.0, + "grad_norm": 0.7383814790063842, + "language_loss": 0.6120699, + "learning_rate": 8.88677388753248e-07, + "loss": 0.68774557, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01203156, + "step": 11591, + "time_per_iteration": 3.205728530883789 + }, + { + "auxiliary_loss_clip": 0.06413443, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256539, + "epoch": 0.6969487449271006, + "flos": 24870668353920.0, + "grad_norm": 1.4802717401382182, + "language_loss": 0.69663697, + "learning_rate": 8.883536079753582e-07, + "loss": 0.77344704, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 11592, + "time_per_iteration": 2.530959367752075 + }, + { + "auxiliary_loss_clip": 0.06411387, + "auxiliary_loss_mlp": 0.01269289, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01259132, + "epoch": 0.6970088681797685, + "flos": 28775525927040.0, + "grad_norm": 1.753602003372511, + "language_loss": 0.62838447, + "learning_rate": 8.880298693512109e-07, + "loss": 0.70519125, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10150146, + "step": 11593, + "time_per_iteration": 2.5508384704589844 + }, + { + "auxiliary_loss_clip": 0.06406593, + "auxiliary_loss_mlp": 0.01263771, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.012547, + "epoch": 0.6970689914324365, + "flos": 27316001583360.0, + "grad_norm": 1.3874621408455479, + "language_loss": 0.54750943, + "learning_rate": 8.877061728930832e-07, + "loss": 0.6242131, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09069824, + "step": 11594, + "time_per_iteration": 2.559556484222412 + }, + { + "auxiliary_loss_clip": 0.06411646, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274341, + "balance_loss_mlp": 0.01254106, + "epoch": 0.6971291146851044, + "flos": 19142422081920.0, + "grad_norm": 1.79939196206485, + "language_loss": 0.77473152, + "learning_rate": 8.87382518613248e-07, + "loss": 0.85149086, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11595, + "time_per_iteration": 3.9267494678497314 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126537, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01254611, + "epoch": 0.6971892379377724, + "flos": 14615661905280.0, + "grad_norm": 2.356908454706418, + "language_loss": 0.72375011, + "learning_rate": 8.870589065239793e-07, + "loss": 0.80052996, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10766602, + "step": 11596, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.0641246, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275618, + "balance_loss_mlp": 0.0125368, + "epoch": 0.6972493611904405, + "flos": 22313639232000.0, + "grad_norm": 1.9958593203679207, + "language_loss": 0.76570636, + "learning_rate": 8.867353366375492e-07, + "loss": 0.84247619, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10839844, + "step": 11597, + "time_per_iteration": 3.9746484756469727 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06272379, + "balance_loss_mlp": 0.01257232, + "epoch": 0.6973094844431084, + "flos": 17426075374080.0, + "grad_norm": 1.890364129189079, + "language_loss": 0.74871194, + "learning_rate": 8.864118089662267e-07, + "loss": 0.82547033, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 11598, + "time_per_iteration": 2.4967358112335205 + }, + { + "auxiliary_loss_clip": 0.06416015, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06276817, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6973696076957764, + "flos": 27242767514880.0, + "grad_norm": 1.672066699636808, + "language_loss": 0.89636326, + "learning_rate": 8.860883235222791e-07, + "loss": 0.97319448, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10998535, + "step": 11599, + "time_per_iteration": 2.5665690898895264 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01269073, + "balance_loss_clip": 0.06277397, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6974297309484443, + "flos": 22024644099840.0, + "grad_norm": 1.8416467781869745, + "language_loss": 0.70383334, + "learning_rate": 8.85764880317974e-07, + "loss": 0.78073853, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11090088, + "step": 11600, + "time_per_iteration": 2.491593360900879 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01254958, + "epoch": 0.6974898542011123, + "flos": 28374038288640.0, + "grad_norm": 1.5173038128226022, + "language_loss": 0.76574016, + "learning_rate": 8.854414793655771e-07, + "loss": 0.84249556, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10498047, + "step": 11601, + "time_per_iteration": 4.1049439907073975 + }, + { + "auxiliary_loss_clip": 0.06404468, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06272994, + "balance_loss_mlp": 0.01255615, + "epoch": 0.6975499774537802, + "flos": 15237522581760.0, + "grad_norm": 1.8655763623744426, + "language_loss": 0.72371268, + "learning_rate": 8.851181206773508e-07, + "loss": 0.80041194, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09851074, + "step": 11602, + "time_per_iteration": 2.5268797874450684 + }, + { + "auxiliary_loss_clip": 0.06410255, + "auxiliary_loss_mlp": 0.01265285, + "balance_loss_clip": 0.06275497, + "balance_loss_mlp": 0.01255343, + "epoch": 0.6976101007064482, + "flos": 22162894536960.0, + "grad_norm": 2.1937279130738365, + "language_loss": 0.77231717, + "learning_rate": 8.847948042655567e-07, + "loss": 0.84907258, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09942627, + "step": 11603, + "time_per_iteration": 2.4806923866271973 + }, + { + "auxiliary_loss_clip": 0.06408552, + "auxiliary_loss_mlp": 0.01263968, + "balance_loss_clip": 0.06273254, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6976702239591162, + "flos": 22280124798720.0, + "grad_norm": 1.4370854048834028, + "language_loss": 0.62313223, + "learning_rate": 8.844715301424557e-07, + "loss": 0.69985747, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09759521, + "step": 11604, + "time_per_iteration": 2.556675910949707 + }, + { + "auxiliary_loss_clip": 0.06411324, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.06273848, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6977303472117842, + "flos": 25855722552960.0, + "grad_norm": 2.158609093070266, + "language_loss": 0.8206296, + "learning_rate": 8.841482983203057e-07, + "loss": 0.89739883, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11120605, + "step": 11605, + "time_per_iteration": 2.5453009605407715 + }, + { + "auxiliary_loss_clip": 0.06408873, + "auxiliary_loss_mlp": 0.01266358, + "balance_loss_clip": 0.0627379, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6977904704644521, + "flos": 20965894634880.0, + "grad_norm": 1.4817287317876005, + "language_loss": 0.7024073, + "learning_rate": 8.838251088113638e-07, + "loss": 0.77915967, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 11606, + "time_per_iteration": 2.524181604385376 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01265998, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01255221, + "epoch": 0.6978505937171201, + "flos": 22061680404480.0, + "grad_norm": 2.145616317364061, + "language_loss": 0.82643318, + "learning_rate": 8.835019616278856e-07, + "loss": 0.90320545, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10772705, + "step": 11607, + "time_per_iteration": 2.4895663261413574 + }, + { + "auxiliary_loss_clip": 0.06416652, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.01254201, + "epoch": 0.697910716969788, + "flos": 20049252894720.0, + "grad_norm": 2.008483115639311, + "language_loss": 0.79149514, + "learning_rate": 8.831788567821265e-07, + "loss": 0.86831373, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11004639, + "step": 11608, + "time_per_iteration": 2.517848014831543 + }, + { + "auxiliary_loss_clip": 0.06411079, + "auxiliary_loss_mlp": 0.01264975, + "balance_loss_clip": 0.06272355, + "balance_loss_mlp": 0.0125461, + "epoch": 0.697970840222456, + "flos": 15893736232320.0, + "grad_norm": 1.856773515642951, + "language_loss": 0.9026711, + "learning_rate": 8.828557942863357e-07, + "loss": 0.97943169, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10357666, + "step": 11609, + "time_per_iteration": 2.464045763015747 + }, + { + "auxiliary_loss_clip": 0.06410901, + "auxiliary_loss_mlp": 0.01262705, + "balance_loss_clip": 0.06270923, + "balance_loss_mlp": 0.01252965, + "epoch": 0.698030963475124, + "flos": 21222088093440.0, + "grad_norm": 1.4134029282176452, + "language_loss": 0.64230514, + "learning_rate": 8.82532774152765e-07, + "loss": 0.71904123, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09747314, + "step": 11610, + "time_per_iteration": 2.5426440238952637 + }, + { + "auxiliary_loss_clip": 0.06407233, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.0125446, + "epoch": 0.698091086727792, + "flos": 33767113029120.0, + "grad_norm": 1.5536592755713354, + "language_loss": 0.84326196, + "learning_rate": 8.822097963936643e-07, + "loss": 0.91997612, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.097229, + "step": 11611, + "time_per_iteration": 2.6129181385040283 + }, + { + "auxiliary_loss_clip": 0.06411347, + "auxiliary_loss_mlp": 0.01264511, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01253752, + "epoch": 0.69815120998046, + "flos": 15893275034880.0, + "grad_norm": 1.864564945323593, + "language_loss": 0.70917654, + "learning_rate": 8.818868610212793e-07, + "loss": 0.78593516, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10754395, + "step": 11612, + "time_per_iteration": 2.4869654178619385 + }, + { + "auxiliary_loss_clip": 0.06406604, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254096, + "epoch": 0.6982113332331279, + "flos": 18952041605760.0, + "grad_norm": 1.4951443393996662, + "language_loss": 0.81150031, + "learning_rate": 8.815639680478573e-07, + "loss": 0.88821077, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10345459, + "step": 11613, + "time_per_iteration": 2.4747042655944824 + }, + { + "auxiliary_loss_clip": 0.06409472, + "auxiliary_loss_mlp": 0.01267068, + "balance_loss_clip": 0.06274355, + "balance_loss_mlp": 0.01257335, + "epoch": 0.6982714564857959, + "flos": 24396533625600.0, + "grad_norm": 1.8067810947897194, + "language_loss": 0.75539565, + "learning_rate": 8.812411174856411e-07, + "loss": 0.83216107, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 11614, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06272974, + "balance_loss_mlp": 0.0125817, + "epoch": 0.6983315797384638, + "flos": 20089852997760.0, + "grad_norm": 1.9161960736489865, + "language_loss": 0.77505577, + "learning_rate": 8.809183093468746e-07, + "loss": 0.85182202, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10162354, + "step": 11615, + "time_per_iteration": 2.4810245037078857 + }, + { + "auxiliary_loss_clip": 0.06403261, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 0.06272578, + "balance_loss_mlp": 0.01253048, + "epoch": 0.6983917029911318, + "flos": 13516815461760.0, + "grad_norm": 1.8844428750511293, + "language_loss": 0.73254174, + "learning_rate": 8.80595543643797e-07, + "loss": 0.80920184, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09692383, + "step": 11616, + "time_per_iteration": 2.4856157302856445 + }, + { + "auxiliary_loss_clip": 0.06408458, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254091, + "epoch": 0.6984518262437998, + "flos": 22025021443200.0, + "grad_norm": 1.4724184586515745, + "language_loss": 0.84294975, + "learning_rate": 8.802728203886487e-07, + "loss": 0.91967505, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09979248, + "step": 11617, + "time_per_iteration": 2.503758668899536 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.0627649, + "balance_loss_mlp": 0.01257035, + "epoch": 0.6985119494964678, + "flos": 18776587155840.0, + "grad_norm": 2.0634899151280623, + "language_loss": 0.59477413, + "learning_rate": 8.799501395936682e-07, + "loss": 0.67161554, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10668945, + "step": 11618, + "time_per_iteration": 2.502458333969116 + }, + { + "auxiliary_loss_clip": 0.06411035, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 0.06276886, + "balance_loss_mlp": 0.0125307, + "epoch": 0.6985720727491357, + "flos": 22389430849920.0, + "grad_norm": 2.158587147069475, + "language_loss": 0.83073372, + "learning_rate": 8.796275012710903e-07, + "loss": 0.9074744, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0994873, + "step": 11619, + "time_per_iteration": 2.4989545345306396 + }, + { + "auxiliary_loss_clip": 0.06409271, + "auxiliary_loss_mlp": 0.01266979, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01258152, + "epoch": 0.6986321960018037, + "flos": 39577398048000.0, + "grad_norm": 1.554266189454373, + "language_loss": 0.67337298, + "learning_rate": 8.793049054331494e-07, + "loss": 0.75013542, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08825684, + "step": 11620, + "time_per_iteration": 2.765410900115967 + }, + { + "auxiliary_loss_clip": 0.06411748, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273868, + "balance_loss_mlp": 0.01256621, + "epoch": 0.6986923192544716, + "flos": 17973528024960.0, + "grad_norm": 2.4474211013812432, + "language_loss": 0.73446906, + "learning_rate": 8.789823520920794e-07, + "loss": 0.81125557, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.1027832, + "step": 11621, + "time_per_iteration": 2.4840140342712402 + }, + { + "auxiliary_loss_clip": 0.06412227, + "auxiliary_loss_mlp": 0.01264203, + "balance_loss_clip": 0.06272949, + "balance_loss_mlp": 0.01253737, + "epoch": 0.6987524425071396, + "flos": 25601583519360.0, + "grad_norm": 1.724040192260788, + "language_loss": 0.68410677, + "learning_rate": 8.7865984126011e-07, + "loss": 0.76087105, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10461426, + "step": 11622, + "time_per_iteration": 3.950021743774414 + }, + { + "auxiliary_loss_clip": 0.06409498, + "auxiliary_loss_mlp": 0.01267194, + "balance_loss_clip": 0.0627782, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6988125657598077, + "flos": 17535842622720.0, + "grad_norm": 1.8022622371846757, + "language_loss": 0.62591398, + "learning_rate": 8.783373729494721e-07, + "loss": 0.70268083, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09814453, + "step": 11623, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.06415178, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01254817, + "epoch": 0.6988726890124756, + "flos": 39175029941760.0, + "grad_norm": 1.7670185249526673, + "language_loss": 0.60458779, + "learning_rate": 8.780149471723932e-07, + "loss": 0.68139207, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10430908, + "step": 11624, + "time_per_iteration": 2.6375675201416016 + }, + { + "auxiliary_loss_clip": 0.06411561, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06272775, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6989328122651436, + "flos": 20199662173440.0, + "grad_norm": 1.5069469972343055, + "language_loss": 0.78510606, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8619014, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11625, + "time_per_iteration": 2.534061908721924 + }, + { + "auxiliary_loss_clip": 0.06406638, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257256, + "epoch": 0.6989929355178115, + "flos": 21841265439360.0, + "grad_norm": 1.6759866105601053, + "language_loss": 0.66316259, + "learning_rate": 8.773702232678188e-07, + "loss": 0.73989534, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09381104, + "step": 11626, + "time_per_iteration": 2.4902937412261963 + }, + { + "auxiliary_loss_clip": 0.06411765, + "auxiliary_loss_mlp": 0.0126589, + "balance_loss_clip": 0.06275335, + "balance_loss_mlp": 0.01255733, + "epoch": 0.6990530587704795, + "flos": 26330066916480.0, + "grad_norm": 2.0325683536698205, + "language_loss": 0.70813847, + "learning_rate": 8.770479251647697e-07, + "loss": 0.78491497, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10144043, + "step": 11627, + "time_per_iteration": 2.5748379230499268 + }, + { + "auxiliary_loss_clip": 0.0640467, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6991131820231474, + "flos": 19835168912640.0, + "grad_norm": 1.7164277105253158, + "language_loss": 0.62609565, + "learning_rate": 8.767256696441768e-07, + "loss": 0.70278424, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 11628, + "time_per_iteration": 2.4829564094543457 + }, + { + "auxiliary_loss_clip": 0.06410889, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06272821, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6991733052758154, + "flos": 33993271998720.0, + "grad_norm": 1.816957818772296, + "language_loss": 0.68972111, + "learning_rate": 8.764034567182581e-07, + "loss": 0.76648998, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10412598, + "step": 11629, + "time_per_iteration": 2.6509320735931396 + }, + { + "auxiliary_loss_clip": 0.06409748, + "auxiliary_loss_mlp": 0.01265873, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.0125515, + "epoch": 0.6992334285284834, + "flos": 15638632876800.0, + "grad_norm": 1.5060784407018701, + "language_loss": 0.72445923, + "learning_rate": 8.760812863992337e-07, + "loss": 0.80121547, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1072998, + "step": 11630, + "time_per_iteration": 2.4783284664154053 + }, + { + "auxiliary_loss_clip": 0.0641311, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06278898, + "balance_loss_mlp": 0.01255943, + "epoch": 0.6992935517811514, + "flos": 21732797928960.0, + "grad_norm": 1.7108311606213942, + "language_loss": 0.74144894, + "learning_rate": 8.757591586993196e-07, + "loss": 0.81823862, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09912109, + "step": 11631, + "time_per_iteration": 2.5788233280181885 + }, + { + "auxiliary_loss_clip": 0.06419384, + "auxiliary_loss_mlp": 0.01269329, + "balance_loss_clip": 0.0628057, + "balance_loss_mlp": 0.01258022, + "epoch": 0.6993536750338193, + "flos": 20120558319360.0, + "grad_norm": 2.3602125436995105, + "language_loss": 0.89111435, + "learning_rate": 8.7543707363073e-07, + "loss": 0.96800154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11303711, + "step": 11632, + "time_per_iteration": 2.473422050476074 + }, + { + "auxiliary_loss_clip": 0.06414177, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06276321, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6994137982864873, + "flos": 22015839421440.0, + "grad_norm": 1.6028389301274413, + "language_loss": 0.79952157, + "learning_rate": 8.751150312056792e-07, + "loss": 0.87631214, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10009766, + "step": 11633, + "time_per_iteration": 2.513282060623169 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06276365, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6994739215391552, + "flos": 25525875755520.0, + "grad_norm": 1.8057869627886596, + "language_loss": 0.67083466, + "learning_rate": 8.747930314363794e-07, + "loss": 0.7476657, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11712646, + "step": 11634, + "time_per_iteration": 3.9409241676330566 + }, + { + "auxiliary_loss_clip": 0.06321115, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_clip": 0.06264269, + "balance_loss_mlp": 0.01250645, + "epoch": 0.6995340447918232, + "flos": 59147931438720.0, + "grad_norm": 0.6717939190194797, + "language_loss": 0.53298014, + "learning_rate": 8.744710743350412e-07, + "loss": 0.6087091, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.0113678, + "step": 11635, + "time_per_iteration": 3.2486236095428467 + }, + { + "auxiliary_loss_clip": 0.06412114, + "auxiliary_loss_mlp": 0.01264348, + "balance_loss_clip": 0.06275758, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6995941680444913, + "flos": 17973653806080.0, + "grad_norm": 1.479923932232007, + "language_loss": 0.8206256, + "learning_rate": 8.741491599138726e-07, + "loss": 0.89739013, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.1071167, + "step": 11636, + "time_per_iteration": 2.516813039779663 + }, + { + "auxiliary_loss_clip": 0.06416257, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6996542912971592, + "flos": 21986391911040.0, + "grad_norm": 3.1669516008633813, + "language_loss": 0.83141685, + "learning_rate": 8.738272881850801e-07, + "loss": 0.90824091, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10430908, + "step": 11637, + "time_per_iteration": 3.917647123336792 + }, + { + "auxiliary_loss_clip": 0.06409974, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.0125584, + "epoch": 0.6997144145498272, + "flos": 11689904891520.0, + "grad_norm": 1.7413253088603204, + "language_loss": 0.68017536, + "learning_rate": 8.735054591608704e-07, + "loss": 0.75693905, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10559082, + "step": 11638, + "time_per_iteration": 2.455333709716797 + }, + { + "auxiliary_loss_clip": 0.06417674, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01255244, + "epoch": 0.6997745378024951, + "flos": 29614992456960.0, + "grad_norm": 1.8583897053492529, + "language_loss": 0.77953184, + "learning_rate": 8.731836728534459e-07, + "loss": 0.85638303, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.12200928, + "step": 11639, + "time_per_iteration": 2.5732390880584717 + }, + { + "auxiliary_loss_clip": 0.06415096, + "auxiliary_loss_mlp": 0.01267452, + "balance_loss_clip": 0.06277713, + "balance_loss_mlp": 0.01256842, + "epoch": 0.6998346610551631, + "flos": 20892912128640.0, + "grad_norm": 1.9224229885402988, + "language_loss": 0.83357054, + "learning_rate": 8.728619292750093e-07, + "loss": 0.91039604, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11640, + "time_per_iteration": 2.518707275390625 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01265078, + "balance_loss_clip": 0.06273933, + "balance_loss_mlp": 0.01255422, + "epoch": 0.699894784307831, + "flos": 27170539695360.0, + "grad_norm": 1.6039437808829469, + "language_loss": 0.75522578, + "learning_rate": 8.725402284377619e-07, + "loss": 0.83196306, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09655762, + "step": 11641, + "time_per_iteration": 4.078887701034546 + }, + { + "auxiliary_loss_clip": 0.06412257, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275941, + "balance_loss_mlp": 0.01256361, + "epoch": 0.699954907560499, + "flos": 20930032287360.0, + "grad_norm": 1.8680055959443465, + "language_loss": 0.77721083, + "learning_rate": 8.722185703539022e-07, + "loss": 0.85399896, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10192871, + "step": 11642, + "time_per_iteration": 2.500046730041504 + }, + { + "auxiliary_loss_clip": 0.0641754, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01253592, + "epoch": 0.700015030813167, + "flos": 28665339408000.0, + "grad_norm": 2.533169755671386, + "language_loss": 0.74393576, + "learning_rate": 8.718969550356266e-07, + "loss": 0.82076585, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11883545, + "step": 11643, + "time_per_iteration": 2.5775840282440186 + }, + { + "auxiliary_loss_clip": 0.06414674, + "auxiliary_loss_mlp": 0.01264637, + "balance_loss_clip": 0.06276005, + "balance_loss_mlp": 0.01254362, + "epoch": 0.700075154065835, + "flos": 29212959767040.0, + "grad_norm": 1.5245425147272047, + "language_loss": 0.60040998, + "learning_rate": 8.715753824951315e-07, + "loss": 0.67720306, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.1027832, + "step": 11644, + "time_per_iteration": 2.552072286605835 + }, + { + "auxiliary_loss_clip": 0.06407935, + "auxiliary_loss_mlp": 0.01271385, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01260579, + "epoch": 0.7001352773185029, + "flos": 23119130131200.0, + "grad_norm": 1.5458952120749485, + "language_loss": 0.82132351, + "learning_rate": 8.712538527446119e-07, + "loss": 0.89811671, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10809326, + "step": 11645, + "time_per_iteration": 2.558337450027466 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06274226, + "balance_loss_mlp": 0.01256743, + "epoch": 0.7001954005711709, + "flos": 21328962376320.0, + "grad_norm": 2.5779246493483177, + "language_loss": 0.68295795, + "learning_rate": 8.709323657962584e-07, + "loss": 0.75970346, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09985352, + "step": 11646, + "time_per_iteration": 2.5126430988311768 + }, + { + "auxiliary_loss_clip": 0.06410798, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.0125371, + "epoch": 0.7002555238238388, + "flos": 24542834054400.0, + "grad_norm": 1.467898418777351, + "language_loss": 0.71547973, + "learning_rate": 8.706109216622635e-07, + "loss": 0.7922281, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10339355, + "step": 11647, + "time_per_iteration": 2.5304250717163086 + }, + { + "auxiliary_loss_clip": 0.06414019, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01258041, + "epoch": 0.7003156470765068, + "flos": 39065891598720.0, + "grad_norm": 1.749288264158044, + "language_loss": 0.72289455, + "learning_rate": 8.702895203548155e-07, + "loss": 0.79972911, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1138916, + "step": 11648, + "time_per_iteration": 2.678863525390625 + }, + { + "auxiliary_loss_clip": 0.06409213, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.01257377, + "epoch": 0.7003757703291749, + "flos": 28811723690880.0, + "grad_norm": 1.4492190580209505, + "language_loss": 0.77860492, + "learning_rate": 8.699681618861014e-07, + "loss": 0.85537332, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10247803, + "step": 11649, + "time_per_iteration": 2.558931589126587 + }, + { + "auxiliary_loss_clip": 0.06409431, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257421, + "epoch": 0.7004358935818428, + "flos": 15958123695360.0, + "grad_norm": 1.4433792721312992, + "language_loss": 0.78238451, + "learning_rate": 8.69646846268308e-07, + "loss": 0.85915029, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 11650, + "time_per_iteration": 2.461639642715454 + }, + { + "auxiliary_loss_clip": 0.06409653, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06273135, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7004960168345108, + "flos": 20418148494720.0, + "grad_norm": 2.0802744101319406, + "language_loss": 0.78669983, + "learning_rate": 8.693255735136194e-07, + "loss": 0.86345768, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09881592, + "step": 11651, + "time_per_iteration": 2.500000238418579 + }, + { + "auxiliary_loss_clip": 0.06420258, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06280224, + "balance_loss_mlp": 0.01258649, + "epoch": 0.7005561400871787, + "flos": 17353260576000.0, + "grad_norm": 1.5099151755448044, + "language_loss": 0.70310026, + "learning_rate": 8.690043436342198e-07, + "loss": 0.7799933, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10388184, + "step": 11652, + "time_per_iteration": 2.4739015102386475 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01263486, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7006162633398467, + "flos": 25309276151040.0, + "grad_norm": 1.323517960695476, + "language_loss": 0.74456298, + "learning_rate": 8.686831566422874e-07, + "loss": 0.82133621, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.0982666, + "step": 11653, + "time_per_iteration": 2.532655954360962 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01263141, + "balance_loss_clip": 0.06278478, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7006763865925146, + "flos": 20675473983360.0, + "grad_norm": 2.0288883835732228, + "language_loss": 0.70729959, + "learning_rate": 8.68362012550003e-07, + "loss": 0.78410637, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10430908, + "step": 11654, + "time_per_iteration": 2.519660711288452 + }, + { + "auxiliary_loss_clip": 0.06415437, + "auxiliary_loss_mlp": 0.0126811, + "balance_loss_clip": 0.06277716, + "balance_loss_mlp": 0.01256696, + "epoch": 0.7007365098451827, + "flos": 20052439349760.0, + "grad_norm": 2.2628281377067134, + "language_loss": 0.72993428, + "learning_rate": 8.680409113695453e-07, + "loss": 0.80676985, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11413574, + "step": 11655, + "time_per_iteration": 2.48612117767334 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.01271007, + "balance_loss_clip": 0.06280498, + "balance_loss_mlp": 0.01259062, + "epoch": 0.7007966330978506, + "flos": 20783689931520.0, + "grad_norm": 1.9221196897273614, + "language_loss": 0.70366073, + "learning_rate": 8.677198531130889e-07, + "loss": 0.78061986, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11950684, + "step": 11656, + "time_per_iteration": 2.4856395721435547 + }, + { + "auxiliary_loss_clip": 0.06408404, + "auxiliary_loss_mlp": 0.01266899, + "balance_loss_clip": 0.06273983, + "balance_loss_mlp": 0.01257123, + "epoch": 0.7008567563505186, + "flos": 29645110800000.0, + "grad_norm": 1.5392970097639627, + "language_loss": 0.78185248, + "learning_rate": 8.673988377928092e-07, + "loss": 0.8586055, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09783936, + "step": 11657, + "time_per_iteration": 2.5812113285064697 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268196, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01257229, + "epoch": 0.7009168796031865, + "flos": 17097654096000.0, + "grad_norm": 2.227553712273129, + "language_loss": 0.78159571, + "learning_rate": 8.670778654208797e-07, + "loss": 0.85847604, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10968018, + "step": 11658, + "time_per_iteration": 2.4778008460998535 + }, + { + "auxiliary_loss_clip": 0.0640991, + "auxiliary_loss_mlp": 0.0126385, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01254099, + "epoch": 0.7009770028558545, + "flos": 20455226726400.0, + "grad_norm": 1.6635136984807588, + "language_loss": 0.83274609, + "learning_rate": 8.667569360094713e-07, + "loss": 0.90948367, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09747314, + "step": 11659, + "time_per_iteration": 2.4965016841888428 + }, + { + "auxiliary_loss_clip": 0.06406507, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01256296, + "epoch": 0.7010371261085224, + "flos": 19251225008640.0, + "grad_norm": 2.205019124031737, + "language_loss": 0.69561887, + "learning_rate": 8.664360495707526e-07, + "loss": 0.77234095, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09411621, + "step": 11660, + "time_per_iteration": 2.4827144145965576 + }, + { + "auxiliary_loss_clip": 0.06414962, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275482, + "balance_loss_mlp": 0.01256134, + "epoch": 0.7010972493611904, + "flos": 22134159786240.0, + "grad_norm": 2.0869897578232295, + "language_loss": 0.81401628, + "learning_rate": 8.661152061168924e-07, + "loss": 0.89083141, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10412598, + "step": 11661, + "time_per_iteration": 3.9388158321380615 + }, + { + "auxiliary_loss_clip": 0.06407215, + "auxiliary_loss_mlp": 0.01264683, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7011573726138585, + "flos": 31398619593600.0, + "grad_norm": 1.8643289831680394, + "language_loss": 0.79429448, + "learning_rate": 8.657944056600579e-07, + "loss": 0.87101352, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10137939, + "step": 11662, + "time_per_iteration": 2.6265618801116943 + }, + { + "auxiliary_loss_clip": 0.06416287, + "auxiliary_loss_mlp": 0.01267119, + "balance_loss_clip": 0.06277344, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7012174958665264, + "flos": 18156487415040.0, + "grad_norm": 1.6800388441509395, + "language_loss": 0.83806753, + "learning_rate": 8.654736482124134e-07, + "loss": 0.91490161, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10821533, + "step": 11663, + "time_per_iteration": 2.488739252090454 + }, + { + "auxiliary_loss_clip": 0.06318727, + "auxiliary_loss_mlp": 0.01250759, + "balance_loss_clip": 0.06262303, + "balance_loss_mlp": 0.012494, + "epoch": 0.7012776191191944, + "flos": 60669495331200.0, + "grad_norm": 0.8224381055881935, + "language_loss": 0.5391866, + "learning_rate": 8.651529337861209e-07, + "loss": 0.6148814, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01361084, + "step": 11664, + "time_per_iteration": 3.160693645477295 + }, + { + "auxiliary_loss_clip": 0.06413987, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06275371, + "balance_loss_mlp": 0.01256731, + "epoch": 0.7013377423718623, + "flos": 27205940845440.0, + "grad_norm": 1.7370315255440756, + "language_loss": 0.79090619, + "learning_rate": 8.64832262393344e-07, + "loss": 0.86771721, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1038208, + "step": 11665, + "time_per_iteration": 2.5398123264312744 + }, + { + "auxiliary_loss_clip": 0.06412809, + "auxiliary_loss_mlp": 0.01262516, + "balance_loss_clip": 0.06277609, + "balance_loss_mlp": 0.01252563, + "epoch": 0.7013978656245303, + "flos": 16548901706880.0, + "grad_norm": 2.00554211734292, + "language_loss": 0.76867342, + "learning_rate": 8.645116340462404e-07, + "loss": 0.84542668, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09954834, + "step": 11666, + "time_per_iteration": 2.4652414321899414 + }, + { + "auxiliary_loss_clip": 0.0641577, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06279963, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7014579888771982, + "flos": 23149625817600.0, + "grad_norm": 1.7866180274258885, + "language_loss": 0.81048751, + "learning_rate": 8.641910487569695e-07, + "loss": 0.88731629, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10388184, + "step": 11667, + "time_per_iteration": 2.5062241554260254 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275474, + "balance_loss_mlp": 0.01255917, + "epoch": 0.7015181121298663, + "flos": 25089028894080.0, + "grad_norm": 2.0567499658134087, + "language_loss": 0.65901959, + "learning_rate": 8.638705065376879e-07, + "loss": 0.73578274, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10443115, + "step": 11668, + "time_per_iteration": 2.6001944541931152 + }, + { + "auxiliary_loss_clip": 0.06415643, + "auxiliary_loss_mlp": 0.01266119, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01255248, + "epoch": 0.7015782353825342, + "flos": 23334052654080.0, + "grad_norm": 1.636860913695636, + "language_loss": 0.76856339, + "learning_rate": 8.635500074005519e-07, + "loss": 0.84538102, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10870361, + "step": 11669, + "time_per_iteration": 2.580120801925659 + }, + { + "auxiliary_loss_clip": 0.06316374, + "auxiliary_loss_mlp": 0.01249475, + "balance_loss_clip": 0.06259722, + "balance_loss_mlp": 0.01248101, + "epoch": 0.7016383586352022, + "flos": 70417733086080.0, + "grad_norm": 0.683633883002792, + "language_loss": 0.54477966, + "learning_rate": 8.632295513577122e-07, + "loss": 0.62043816, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01376343, + "step": 11670, + "time_per_iteration": 3.239391565322876 + }, + { + "auxiliary_loss_clip": 0.06410887, + "auxiliary_loss_mlp": 0.01266693, + "balance_loss_clip": 0.06276417, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7016984818878701, + "flos": 19798426097280.0, + "grad_norm": 1.5820465602747873, + "language_loss": 0.81851846, + "learning_rate": 8.629091384213218e-07, + "loss": 0.89529431, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10247803, + "step": 11671, + "time_per_iteration": 2.5156307220458984 + }, + { + "auxiliary_loss_clip": 0.06415814, + "auxiliary_loss_mlp": 0.01265108, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254611, + "epoch": 0.7017586051405381, + "flos": 12901998528000.0, + "grad_norm": 1.7162410726978943, + "language_loss": 0.74825186, + "learning_rate": 8.625887686035313e-07, + "loss": 0.82506108, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10498047, + "step": 11672, + "time_per_iteration": 2.4657065868377686 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01267901, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01256922, + "epoch": 0.701818728393206, + "flos": 18338734045440.0, + "grad_norm": 1.6561114230567193, + "language_loss": 0.87079096, + "learning_rate": 8.622684419164883e-07, + "loss": 0.94763196, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10980225, + "step": 11673, + "time_per_iteration": 2.51084303855896 + }, + { + "auxiliary_loss_clip": 0.06411691, + "auxiliary_loss_mlp": 0.01268986, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01258502, + "epoch": 0.701878851645874, + "flos": 17389961464320.0, + "grad_norm": 1.7599431551764082, + "language_loss": 0.73397923, + "learning_rate": 8.619481583723399e-07, + "loss": 0.81078601, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10491943, + "step": 11674, + "time_per_iteration": 3.8845224380493164 + }, + { + "auxiliary_loss_clip": 0.06408197, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01255173, + "epoch": 0.701938974898542, + "flos": 23922398897280.0, + "grad_norm": 1.5893184098427633, + "language_loss": 0.72403145, + "learning_rate": 8.616279179832329e-07, + "loss": 0.80076146, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09631348, + "step": 11675, + "time_per_iteration": 2.535900115966797 + }, + { + "auxiliary_loss_clip": 0.06414977, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06276758, + "balance_loss_mlp": 0.01257047, + "epoch": 0.70199909815121, + "flos": 21801503877120.0, + "grad_norm": 2.0246464203601278, + "language_loss": 0.51067138, + "learning_rate": 8.613077207613078e-07, + "loss": 0.58750093, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.109375, + "step": 11676, + "time_per_iteration": 2.555906057357788 + }, + { + "auxiliary_loss_clip": 0.06319048, + "auxiliary_loss_mlp": 0.01249904, + "balance_loss_clip": 0.06262474, + "balance_loss_mlp": 0.01248563, + "epoch": 0.702059221403878, + "flos": 71736575224320.0, + "grad_norm": 0.7224738346499476, + "language_loss": 0.59202904, + "learning_rate": 8.609875667187079e-07, + "loss": 0.66771859, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01343536, + "step": 11677, + "time_per_iteration": 4.580153942108154 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06275491, + "balance_loss_mlp": 0.0125582, + "epoch": 0.7021193446565459, + "flos": 28118599516800.0, + "grad_norm": 1.944945343813431, + "language_loss": 0.6293093, + "learning_rate": 8.606674558675737e-07, + "loss": 0.70610511, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 11678, + "time_per_iteration": 2.652944803237915 + }, + { + "auxiliary_loss_clip": 0.06410077, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7021794679092139, + "flos": 22930720225920.0, + "grad_norm": 1.5864608475530155, + "language_loss": 0.7993412, + "learning_rate": 8.603473882200444e-07, + "loss": 0.87608963, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09991455, + "step": 11679, + "time_per_iteration": 2.517608404159546 + }, + { + "auxiliary_loss_clip": 0.06410368, + "auxiliary_loss_mlp": 0.01263633, + "balance_loss_clip": 0.06277052, + "balance_loss_mlp": 0.01254615, + "epoch": 0.7022395911618818, + "flos": 18083756471040.0, + "grad_norm": 2.1970830940848614, + "language_loss": 0.70462888, + "learning_rate": 8.600273637882567e-07, + "loss": 0.78136891, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09014893, + "step": 11680, + "time_per_iteration": 2.4937846660614014 + }, + { + "auxiliary_loss_clip": 0.06416643, + "auxiliary_loss_mlp": 0.01267202, + "balance_loss_clip": 0.06276958, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7022997144145499, + "flos": 16039827025920.0, + "grad_norm": 1.5993399056299638, + "language_loss": 0.74800062, + "learning_rate": 8.597073825843446e-07, + "loss": 0.82483912, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10791016, + "step": 11681, + "time_per_iteration": 3.912652015686035 + }, + { + "auxiliary_loss_clip": 0.06407465, + "auxiliary_loss_mlp": 0.01264961, + "balance_loss_clip": 0.06273095, + "balance_loss_mlp": 0.01254536, + "epoch": 0.7023598376672178, + "flos": 26475864220800.0, + "grad_norm": 1.529501150189484, + "language_loss": 0.77074146, + "learning_rate": 8.593874446204434e-07, + "loss": 0.84746575, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10424805, + "step": 11682, + "time_per_iteration": 2.5244510173797607 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06274539, + "balance_loss_mlp": 0.01255625, + "epoch": 0.7024199609198858, + "flos": 17061624040320.0, + "grad_norm": 2.0146711656624947, + "language_loss": 0.73610115, + "learning_rate": 8.590675499086841e-07, + "loss": 0.81292146, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11627197, + "step": 11683, + "time_per_iteration": 2.4807722568511963 + }, + { + "auxiliary_loss_clip": 0.06412771, + "auxiliary_loss_mlp": 0.01265673, + "balance_loss_clip": 0.06278127, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7024800841725537, + "flos": 25856225677440.0, + "grad_norm": 1.8616488886702496, + "language_loss": 0.7201761, + "learning_rate": 8.587476984611976e-07, + "loss": 0.79696059, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.11212158, + "step": 11684, + "time_per_iteration": 2.5248489379882812 + }, + { + "auxiliary_loss_clip": 0.06409675, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257741, + "epoch": 0.7025402074252217, + "flos": 23519653447680.0, + "grad_norm": 2.2560693638667386, + "language_loss": 0.72109079, + "learning_rate": 8.584278902901128e-07, + "loss": 0.79787153, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10668945, + "step": 11685, + "time_per_iteration": 2.5545883178710938 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01264141, + "balance_loss_clip": 0.06274469, + "balance_loss_mlp": 0.01254021, + "epoch": 0.7026003306778896, + "flos": 20156169104640.0, + "grad_norm": 1.6059462262520903, + "language_loss": 0.8497479, + "learning_rate": 8.581081254075582e-07, + "loss": 0.92650867, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10119629, + "step": 11686, + "time_per_iteration": 2.4869866371154785 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01250458, + "balance_loss_clip": 0.06255314, + "balance_loss_mlp": 0.01249239, + "epoch": 0.7026604539305576, + "flos": 64791036362880.0, + "grad_norm": 0.9748591985428325, + "language_loss": 0.6989513, + "learning_rate": 8.577884038256566e-07, + "loss": 0.77457231, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01217651, + "step": 11687, + "time_per_iteration": 3.2795140743255615 + }, + { + "auxiliary_loss_clip": 0.06411874, + "auxiliary_loss_mlp": 0.01269631, + "balance_loss_clip": 0.06276284, + "balance_loss_mlp": 0.01259421, + "epoch": 0.7027205771832256, + "flos": 21877882473600.0, + "grad_norm": 2.1687744057978575, + "language_loss": 0.7759158, + "learning_rate": 8.574687255565329e-07, + "loss": 0.85273087, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11688, + "time_per_iteration": 2.506697416305542 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01263217, + "balance_loss_clip": 0.06273778, + "balance_loss_mlp": 0.0125287, + "epoch": 0.7027807004358936, + "flos": 23374526976000.0, + "grad_norm": 2.0500924601059687, + "language_loss": 0.69007778, + "learning_rate": 8.571490906123107e-07, + "loss": 0.76680183, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10339355, + "step": 11689, + "time_per_iteration": 2.526963472366333 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01267707, + "balance_loss_clip": 0.0627338, + "balance_loss_mlp": 0.01255834, + "epoch": 0.7028408236885616, + "flos": 15309624620160.0, + "grad_norm": 2.4528764604041977, + "language_loss": 0.79761183, + "learning_rate": 8.568294990051086e-07, + "loss": 0.87441605, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11871338, + "step": 11690, + "time_per_iteration": 2.5314319133758545 + }, + { + "auxiliary_loss_clip": 0.06412818, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06277384, + "balance_loss_mlp": 0.01258677, + "epoch": 0.7029009469412295, + "flos": 22024769880960.0, + "grad_norm": 1.8333973382314617, + "language_loss": 0.75588238, + "learning_rate": 8.56509950747047e-07, + "loss": 0.83270478, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11691, + "time_per_iteration": 2.5446360111236572 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01264486, + "balance_loss_clip": 0.06278588, + "balance_loss_mlp": 0.0125449, + "epoch": 0.7029610701938975, + "flos": 21842020126080.0, + "grad_norm": 1.7290780486458988, + "language_loss": 0.81951666, + "learning_rate": 8.561904458502429e-07, + "loss": 0.89629078, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09997559, + "step": 11692, + "time_per_iteration": 2.475939989089966 + }, + { + "auxiliary_loss_clip": 0.06407632, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01253577, + "epoch": 0.7030211934465654, + "flos": 19141709322240.0, + "grad_norm": 1.4786815492141234, + "language_loss": 0.76637983, + "learning_rate": 8.558709843268111e-07, + "loss": 0.84310281, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11090088, + "step": 11693, + "time_per_iteration": 2.523207664489746 + }, + { + "auxiliary_loss_clip": 0.06409247, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274758, + "balance_loss_mlp": 0.01256959, + "epoch": 0.7030813166992335, + "flos": 38555307544320.0, + "grad_norm": 3.0680910714990945, + "language_loss": 0.685, + "learning_rate": 8.55551566188866e-07, + "loss": 0.76176739, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10522461, + "step": 11694, + "time_per_iteration": 2.6671559810638428 + }, + { + "auxiliary_loss_clip": 0.06413712, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06276645, + "balance_loss_mlp": 0.01255105, + "epoch": 0.7031414399519014, + "flos": 14726225767680.0, + "grad_norm": 2.01117706312431, + "language_loss": 0.75637174, + "learning_rate": 8.552321914485203e-07, + "loss": 0.83316225, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10235596, + "step": 11695, + "time_per_iteration": 2.508373975753784 + }, + { + "auxiliary_loss_clip": 0.0642024, + "auxiliary_loss_mlp": 0.01270249, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01258644, + "epoch": 0.7032015632045694, + "flos": 14032388833920.0, + "grad_norm": 1.954001814184471, + "language_loss": 0.74258196, + "learning_rate": 8.549128601178852e-07, + "loss": 0.81948686, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11602783, + "step": 11696, + "time_per_iteration": 2.4646289348602295 + }, + { + "auxiliary_loss_clip": 0.06413354, + "auxiliary_loss_mlp": 0.01266085, + "balance_loss_clip": 0.06275193, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7032616864572373, + "flos": 27644716350720.0, + "grad_norm": 7.188542829701478, + "language_loss": 0.75876927, + "learning_rate": 8.545935722090693e-07, + "loss": 0.83556366, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11108398, + "step": 11697, + "time_per_iteration": 2.564423084259033 + }, + { + "auxiliary_loss_clip": 0.06411704, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06273724, + "balance_loss_mlp": 0.01257508, + "epoch": 0.7033218097099053, + "flos": 17973024900480.0, + "grad_norm": 1.6931225387398507, + "language_loss": 0.80683148, + "learning_rate": 8.542743277341793e-07, + "loss": 0.88363487, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11126709, + "step": 11698, + "time_per_iteration": 2.4535627365112305 + }, + { + "auxiliary_loss_clip": 0.0641105, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.01255239, + "epoch": 0.7033819329625732, + "flos": 19508047372800.0, + "grad_norm": 1.3566537423348073, + "language_loss": 0.84644032, + "learning_rate": 8.539551267053222e-07, + "loss": 0.92321312, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10974121, + "step": 11699, + "time_per_iteration": 2.5543456077575684 + }, + { + "auxiliary_loss_clip": 0.06408502, + "auxiliary_loss_mlp": 0.01265387, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7034420562152413, + "flos": 23994417081600.0, + "grad_norm": 1.970773248623371, + "language_loss": 0.7962184, + "learning_rate": 8.53635969134601e-07, + "loss": 0.87295729, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10760498, + "step": 11700, + "time_per_iteration": 2.4985594749450684 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01264767, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01253507, + "epoch": 0.7035021794679092, + "flos": 35052147244800.0, + "grad_norm": 1.812061465534113, + "language_loss": 0.74477667, + "learning_rate": 8.533168550341186e-07, + "loss": 0.82155174, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11254883, + "step": 11701, + "time_per_iteration": 4.042437314987183 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01264422, + "balance_loss_clip": 0.06275072, + "balance_loss_mlp": 0.01253246, + "epoch": 0.7035623027205772, + "flos": 11001811962240.0, + "grad_norm": 2.072031067866928, + "language_loss": 0.83952713, + "learning_rate": 8.529977844159769e-07, + "loss": 0.91632634, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11175537, + "step": 11702, + "time_per_iteration": 2.5586178302764893 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01264208, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7036224259732452, + "flos": 23630594653440.0, + "grad_norm": 1.6523267572786273, + "language_loss": 0.61088848, + "learning_rate": 8.526787572922738e-07, + "loss": 0.68764472, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1038208, + "step": 11703, + "time_per_iteration": 2.521512985229492 + }, + { + "auxiliary_loss_clip": 0.06413552, + "auxiliary_loss_mlp": 0.01266937, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01255869, + "epoch": 0.7036825492259131, + "flos": 31694239198080.0, + "grad_norm": 1.8799008475861942, + "language_loss": 0.61646456, + "learning_rate": 8.523597736751067e-07, + "loss": 0.69326943, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11065674, + "step": 11704, + "time_per_iteration": 2.637000560760498 + }, + { + "auxiliary_loss_clip": 0.06406493, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.01258109, + "epoch": 0.7037426724785811, + "flos": 30201116567040.0, + "grad_norm": 1.5166852635712837, + "language_loss": 0.70736712, + "learning_rate": 8.520408335765719e-07, + "loss": 0.78410971, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09667969, + "step": 11705, + "time_per_iteration": 2.5815892219543457 + }, + { + "auxiliary_loss_clip": 0.06409339, + "auxiliary_loss_mlp": 0.01265192, + "balance_loss_clip": 0.06274589, + "balance_loss_mlp": 0.01254833, + "epoch": 0.703802795731249, + "flos": 24317597479680.0, + "grad_norm": 1.8692688199911445, + "language_loss": 0.61916155, + "learning_rate": 8.517219370087645e-07, + "loss": 0.69590688, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10351562, + "step": 11706, + "time_per_iteration": 2.537567615509033 + }, + { + "auxiliary_loss_clip": 0.06410844, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01254061, + "epoch": 0.7038629189839171, + "flos": 22535605497600.0, + "grad_norm": 2.4391424281987506, + "language_loss": 0.68479651, + "learning_rate": 8.514030839837756e-07, + "loss": 0.76156104, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11560059, + "step": 11707, + "time_per_iteration": 2.4984869956970215 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06272735, + "balance_loss_mlp": 0.01257101, + "epoch": 0.703923042236585, + "flos": 26257755242880.0, + "grad_norm": 1.9008341016793249, + "language_loss": 0.76335013, + "learning_rate": 8.510842745136974e-07, + "loss": 0.84009075, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10058594, + "step": 11708, + "time_per_iteration": 2.552219867706299 + }, + { + "auxiliary_loss_clip": 0.06407606, + "auxiliary_loss_mlp": 0.01261422, + "balance_loss_clip": 0.06274488, + "balance_loss_mlp": 0.01251313, + "epoch": 0.703983165489253, + "flos": 19396225699200.0, + "grad_norm": 1.582678176456311, + "language_loss": 0.7205376, + "learning_rate": 8.50765508610619e-07, + "loss": 0.79722786, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10107422, + "step": 11709, + "time_per_iteration": 2.479956865310669 + }, + { + "auxiliary_loss_clip": 0.06409952, + "auxiliary_loss_mlp": 0.01266177, + "balance_loss_clip": 0.06274274, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7040432887419209, + "flos": 16688032611840.0, + "grad_norm": 1.9337929130323093, + "language_loss": 0.79638529, + "learning_rate": 8.504467862866267e-07, + "loss": 0.87314653, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10076904, + "step": 11710, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.06415999, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06278241, + "balance_loss_mlp": 0.01255674, + "epoch": 0.7041034119945889, + "flos": 21147638140800.0, + "grad_norm": 1.663598845140954, + "language_loss": 0.77776545, + "learning_rate": 8.501281075538076e-07, + "loss": 0.85458803, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.105896, + "step": 11711, + "time_per_iteration": 2.500640392303467 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06276608, + "balance_loss_mlp": 0.01255237, + "epoch": 0.7041635352472568, + "flos": 16916036371200.0, + "grad_norm": 1.9928632293831094, + "language_loss": 0.7447651, + "learning_rate": 8.498094724242457e-07, + "loss": 0.82153022, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10583496, + "step": 11712, + "time_per_iteration": 2.501585006713867 + }, + { + "auxiliary_loss_clip": 0.06320854, + "auxiliary_loss_mlp": 0.01257118, + "balance_loss_clip": 0.06264362, + "balance_loss_mlp": 0.01255823, + "epoch": 0.7042236584999249, + "flos": 71703186572160.0, + "grad_norm": 0.8590002483868424, + "language_loss": 0.64672804, + "learning_rate": 8.494908809100247e-07, + "loss": 0.72250772, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01295471, + "step": 11713, + "time_per_iteration": 4.5734851360321045 + }, + { + "auxiliary_loss_clip": 0.06410141, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 0.06274079, + "balance_loss_mlp": 0.01252991, + "epoch": 0.7042837817525928, + "flos": 28665800605440.0, + "grad_norm": 1.9680516689018257, + "language_loss": 0.72915512, + "learning_rate": 8.49172333023225e-07, + "loss": 0.80588698, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1005249, + "step": 11714, + "time_per_iteration": 2.5535781383514404 + }, + { + "auxiliary_loss_clip": 0.06411086, + "auxiliary_loss_mlp": 0.01268594, + "balance_loss_clip": 0.06275805, + "balance_loss_mlp": 0.01256757, + "epoch": 0.7043439050052608, + "flos": 19759335367680.0, + "grad_norm": 2.3616586102145805, + "language_loss": 0.80244958, + "learning_rate": 8.488538287759248e-07, + "loss": 0.87924635, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11828613, + "step": 11715, + "time_per_iteration": 2.4991419315338135 + }, + { + "auxiliary_loss_clip": 0.06414278, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627607, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7044040282579288, + "flos": 11541969308160.0, + "grad_norm": 1.9765202948162532, + "language_loss": 0.71383488, + "learning_rate": 8.485353681802037e-07, + "loss": 0.79064858, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10931396, + "step": 11716, + "time_per_iteration": 3.9245705604553223 + }, + { + "auxiliary_loss_clip": 0.06418915, + "auxiliary_loss_mlp": 0.01264541, + "balance_loss_clip": 0.06277251, + "balance_loss_mlp": 0.01253783, + "epoch": 0.7044641515105967, + "flos": 33664473377280.0, + "grad_norm": 1.7730534730356675, + "language_loss": 0.66482782, + "learning_rate": 8.482169512481358e-07, + "loss": 0.74166238, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10760498, + "step": 11717, + "time_per_iteration": 2.6029398441314697 + }, + { + "auxiliary_loss_clip": 0.06415347, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01256011, + "epoch": 0.7045242747632647, + "flos": 26731051430400.0, + "grad_norm": 1.5043477958415044, + "language_loss": 0.74609149, + "learning_rate": 8.478985779917967e-07, + "loss": 0.82290918, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 11718, + "time_per_iteration": 2.574075937271118 + }, + { + "auxiliary_loss_clip": 0.06412348, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06277113, + "balance_loss_mlp": 0.01254224, + "epoch": 0.7045843980159326, + "flos": 26804998258560.0, + "grad_norm": 1.5984477962629227, + "language_loss": 0.80229437, + "learning_rate": 8.475802484232606e-07, + "loss": 0.8790642, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10412598, + "step": 11719, + "time_per_iteration": 2.557602643966675 + }, + { + "auxiliary_loss_clip": 0.0641358, + "auxiliary_loss_mlp": 0.01263485, + "balance_loss_clip": 0.06277666, + "balance_loss_mlp": 0.01252524, + "epoch": 0.7046445212686007, + "flos": 41584710458880.0, + "grad_norm": 1.6868566975802164, + "language_loss": 0.65635586, + "learning_rate": 8.472619625545951e-07, + "loss": 0.73312646, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10961914, + "step": 11720, + "time_per_iteration": 4.092779159545898 + }, + { + "auxiliary_loss_clip": 0.06422915, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7047046445212686, + "flos": 15565650370560.0, + "grad_norm": 2.147768548041585, + "language_loss": 0.8022362, + "learning_rate": 8.46943720397872e-07, + "loss": 0.87912714, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10986328, + "step": 11721, + "time_per_iteration": 2.4634041786193848 + }, + { + "auxiliary_loss_clip": 0.06318594, + "auxiliary_loss_mlp": 0.01253531, + "balance_loss_clip": 0.06262027, + "balance_loss_mlp": 0.01252384, + "epoch": 0.7047647677739366, + "flos": 70433036455680.0, + "grad_norm": 0.7472916144331851, + "language_loss": 0.64821076, + "learning_rate": 8.466255219651582e-07, + "loss": 0.72393203, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01146698, + "step": 11722, + "time_per_iteration": 3.2447893619537354 + }, + { + "auxiliary_loss_clip": 0.06410772, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06275559, + "balance_loss_mlp": 0.01253536, + "epoch": 0.7048248910266045, + "flos": 23666876271360.0, + "grad_norm": 2.268842508315268, + "language_loss": 0.66067719, + "learning_rate": 8.463073672685211e-07, + "loss": 0.73742604, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10571289, + "step": 11723, + "time_per_iteration": 2.556645154953003 + }, + { + "auxiliary_loss_clip": 0.06413794, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7048850142792725, + "flos": 21403496183040.0, + "grad_norm": 1.9667058211108481, + "language_loss": 0.80938751, + "learning_rate": 8.459892563200235e-07, + "loss": 0.88616407, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11151123, + "step": 11724, + "time_per_iteration": 2.521294116973877 + }, + { + "auxiliary_loss_clip": 0.06412652, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01252619, + "epoch": 0.7049451375319404, + "flos": 21653736001920.0, + "grad_norm": 1.878825511688235, + "language_loss": 0.73036087, + "learning_rate": 8.456711891317296e-07, + "loss": 0.80711973, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10620117, + "step": 11725, + "time_per_iteration": 2.491532325744629 + }, + { + "auxiliary_loss_clip": 0.06419054, + "auxiliary_loss_mlp": 0.01266944, + "balance_loss_clip": 0.06278444, + "balance_loss_mlp": 0.01256275, + "epoch": 0.7050052607846085, + "flos": 14872148853120.0, + "grad_norm": 1.93227359409925, + "language_loss": 0.78747177, + "learning_rate": 8.453531657156998e-07, + "loss": 0.86433172, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10668945, + "step": 11726, + "time_per_iteration": 2.625894069671631 + }, + { + "auxiliary_loss_clip": 0.06411958, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.06275987, + "balance_loss_mlp": 0.01258273, + "epoch": 0.7050653840372764, + "flos": 19247283866880.0, + "grad_norm": 2.1540780661141374, + "language_loss": 0.70452571, + "learning_rate": 8.450351860839931e-07, + "loss": 0.78133154, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10351562, + "step": 11727, + "time_per_iteration": 2.540519952774048 + }, + { + "auxiliary_loss_clip": 0.06403094, + "auxiliary_loss_mlp": 0.01263675, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01254752, + "epoch": 0.7051255072899444, + "flos": 27787536835200.0, + "grad_norm": 1.531115099301347, + "language_loss": 0.69006073, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7667284, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08917236, + "step": 11728, + "time_per_iteration": 2.5793302059173584 + }, + { + "auxiliary_loss_clip": 0.06412704, + "auxiliary_loss_mlp": 0.0126399, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.01253773, + "epoch": 0.7051856305426124, + "flos": 27899526216960.0, + "grad_norm": 1.8133071590962522, + "language_loss": 0.73397171, + "learning_rate": 8.443993582217803e-07, + "loss": 0.81073868, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10211182, + "step": 11729, + "time_per_iteration": 2.632077693939209 + }, + { + "auxiliary_loss_clip": 0.06421916, + "auxiliary_loss_mlp": 0.01265278, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01253775, + "epoch": 0.7052457537952803, + "flos": 25050147799680.0, + "grad_norm": 1.613038649768226, + "language_loss": 0.78167063, + "learning_rate": 8.440815100153862e-07, + "loss": 0.8585425, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1149292, + "step": 11730, + "time_per_iteration": 2.5648131370544434 + }, + { + "auxiliary_loss_clip": 0.06414882, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06275609, + "balance_loss_mlp": 0.0125698, + "epoch": 0.7053058770479483, + "flos": 21878175962880.0, + "grad_norm": 2.325298368428052, + "language_loss": 0.62874782, + "learning_rate": 8.437637056415359e-07, + "loss": 0.70558047, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11401367, + "step": 11731, + "time_per_iteration": 2.546156167984009 + }, + { + "auxiliary_loss_clip": 0.06416281, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.06275978, + "balance_loss_mlp": 0.01258679, + "epoch": 0.7053660003006162, + "flos": 16404236432640.0, + "grad_norm": 1.9339047251972874, + "language_loss": 0.74811733, + "learning_rate": 8.434459451122815e-07, + "loss": 0.82498294, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1159668, + "step": 11732, + "time_per_iteration": 2.4927430152893066 + }, + { + "auxiliary_loss_clip": 0.06408133, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06274602, + "balance_loss_mlp": 0.01256534, + "epoch": 0.7054261235532843, + "flos": 22718271398400.0, + "grad_norm": 1.4288707050417415, + "language_loss": 0.71580064, + "learning_rate": 8.431282284396735e-07, + "loss": 0.79254997, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1026001, + "step": 11733, + "time_per_iteration": 2.543832540512085 + }, + { + "auxiliary_loss_clip": 0.06411871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01258154, + "epoch": 0.7054862468059522, + "flos": 13594829212800.0, + "grad_norm": 1.9266065814345037, + "language_loss": 0.73917806, + "learning_rate": 8.428105556357583e-07, + "loss": 0.81597924, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10095215, + "step": 11734, + "time_per_iteration": 2.496680736541748 + }, + { + "auxiliary_loss_clip": 0.06421253, + "auxiliary_loss_mlp": 0.01273046, + "balance_loss_clip": 0.06277873, + "balance_loss_mlp": 0.0126184, + "epoch": 0.7055463700586202, + "flos": 15884931553920.0, + "grad_norm": 4.995085142451974, + "language_loss": 0.70442164, + "learning_rate": 8.424929267125829e-07, + "loss": 0.78136462, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11206055, + "step": 11735, + "time_per_iteration": 2.560451030731201 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.01257955, + "epoch": 0.7056064933112881, + "flos": 23082890440320.0, + "grad_norm": 1.6821797399985068, + "language_loss": 0.72724199, + "learning_rate": 8.421753416821933e-07, + "loss": 0.80408299, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.12182617, + "step": 11736, + "time_per_iteration": 2.5113935470581055 + }, + { + "auxiliary_loss_clip": 0.06410478, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06277382, + "balance_loss_mlp": 0.01254356, + "epoch": 0.7056666165639561, + "flos": 24063374592000.0, + "grad_norm": 1.617495345914111, + "language_loss": 0.69220245, + "learning_rate": 8.41857800556629e-07, + "loss": 0.7689532, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10235596, + "step": 11737, + "time_per_iteration": 2.5327107906341553 + }, + { + "auxiliary_loss_clip": 0.06416027, + "auxiliary_loss_mlp": 0.01265741, + "balance_loss_clip": 0.06277978, + "balance_loss_mlp": 0.01254279, + "epoch": 0.705726739816624, + "flos": 17498932099200.0, + "grad_norm": 1.8698204681752435, + "language_loss": 0.67921227, + "learning_rate": 8.415403033479332e-07, + "loss": 0.75602996, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11474609, + "step": 11738, + "time_per_iteration": 2.458019733428955 + }, + { + "auxiliary_loss_clip": 0.06411514, + "auxiliary_loss_mlp": 0.01264856, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7057868630692921, + "flos": 51361515256320.0, + "grad_norm": 7.975241590020644, + "language_loss": 0.74895537, + "learning_rate": 8.41222850068145e-07, + "loss": 0.82571906, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10784912, + "step": 11739, + "time_per_iteration": 2.7849392890930176 + }, + { + "auxiliary_loss_clip": 0.0641078, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01252663, + "epoch": 0.70584698632196, + "flos": 26109945440640.0, + "grad_norm": 1.5818256072351289, + "language_loss": 0.71794957, + "learning_rate": 8.409054407293032e-07, + "loss": 0.79468888, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10479736, + "step": 11740, + "time_per_iteration": 4.018102645874023 + }, + { + "auxiliary_loss_clip": 0.06408996, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.06274964, + "balance_loss_mlp": 0.01260939, + "epoch": 0.705907109574628, + "flos": 21549503122560.0, + "grad_norm": 1.4620628375932287, + "language_loss": 0.82029426, + "learning_rate": 8.405880753434434e-07, + "loss": 0.89708912, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09558105, + "step": 11741, + "time_per_iteration": 2.5226922035217285 + }, + { + "auxiliary_loss_clip": 0.06412125, + "auxiliary_loss_mlp": 0.0126669, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01255389, + "epoch": 0.705967232827296, + "flos": 22717432857600.0, + "grad_norm": 1.792685843416777, + "language_loss": 0.7848987, + "learning_rate": 8.402707539225993e-07, + "loss": 0.86168694, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11297607, + "step": 11742, + "time_per_iteration": 2.4881513118743896 + }, + { + "auxiliary_loss_clip": 0.06420448, + "auxiliary_loss_mlp": 0.01267345, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7060273560799639, + "flos": 28698266862720.0, + "grad_norm": 1.447375520003719, + "language_loss": 0.64323652, + "learning_rate": 8.39953476478805e-07, + "loss": 0.72011447, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10919189, + "step": 11743, + "time_per_iteration": 2.5737526416778564 + }, + { + "auxiliary_loss_clip": 0.06413458, + "auxiliary_loss_mlp": 0.01269024, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257693, + "epoch": 0.7060874793326319, + "flos": 15711699237120.0, + "grad_norm": 1.7211358867446458, + "language_loss": 0.65871137, + "learning_rate": 8.396362430240902e-07, + "loss": 0.73553622, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11322021, + "step": 11744, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.06408134, + "auxiliary_loss_mlp": 0.01271135, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01260728, + "epoch": 0.7061476025852998, + "flos": 21513137650560.0, + "grad_norm": 2.025199572577618, + "language_loss": 0.63794267, + "learning_rate": 8.393190535704857e-07, + "loss": 0.71473539, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10394287, + "step": 11745, + "time_per_iteration": 2.52616810798645 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273259, + "balance_loss_mlp": 0.01253311, + "epoch": 0.7062077258379679, + "flos": 28189024473600.0, + "grad_norm": 1.8444242196367828, + "language_loss": 0.71914798, + "learning_rate": 8.390019081300188e-07, + "loss": 0.79588681, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10369873, + "step": 11746, + "time_per_iteration": 2.5588066577911377 + }, + { + "auxiliary_loss_clip": 0.06411352, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257653, + "epoch": 0.7062678490906358, + "flos": 27860854757760.0, + "grad_norm": 1.5188195218955072, + "language_loss": 0.79773951, + "learning_rate": 8.386848067147175e-07, + "loss": 0.87453461, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10510254, + "step": 11747, + "time_per_iteration": 2.5661420822143555 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01264357, + "balance_loss_clip": 0.06277459, + "balance_loss_mlp": 0.01254307, + "epoch": 0.7063279723433038, + "flos": 23191483731840.0, + "grad_norm": 1.5251666611578065, + "language_loss": 0.65140951, + "learning_rate": 8.383677493366031e-07, + "loss": 0.72817194, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 11748, + "time_per_iteration": 2.5165350437164307 + }, + { + "auxiliary_loss_clip": 0.06412359, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.0125548, + "epoch": 0.7063880955959717, + "flos": 20194043950080.0, + "grad_norm": 1.8580174500745112, + "language_loss": 0.79421908, + "learning_rate": 8.380507360077003e-07, + "loss": 0.87100631, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 11749, + "time_per_iteration": 2.5304911136627197 + }, + { + "auxiliary_loss_clip": 0.06318866, + "auxiliary_loss_mlp": 0.01253368, + "balance_loss_clip": 0.06263049, + "balance_loss_mlp": 0.01252189, + "epoch": 0.7064482188486397, + "flos": 63685020395520.0, + "grad_norm": 0.7869711578789559, + "language_loss": 0.54065382, + "learning_rate": 8.377337667400304e-07, + "loss": 0.61637622, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11750, + "time_per_iteration": 3.118065118789673 + }, + { + "auxiliary_loss_clip": 0.06410946, + "auxiliary_loss_mlp": 0.01265459, + "balance_loss_clip": 0.06275111, + "balance_loss_mlp": 0.01254623, + "epoch": 0.7065083421013076, + "flos": 25198125310080.0, + "grad_norm": 1.6339849961789776, + "language_loss": 0.78829509, + "learning_rate": 8.37416841545612e-07, + "loss": 0.86505914, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 11751, + "time_per_iteration": 2.5452511310577393 + }, + { + "auxiliary_loss_clip": 0.0640781, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01256842, + "epoch": 0.7065684653539757, + "flos": 22900392247680.0, + "grad_norm": 1.6672445306420212, + "language_loss": 0.68168157, + "learning_rate": 8.370999604364634e-07, + "loss": 0.75842696, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09893799, + "step": 11752, + "time_per_iteration": 3.9393372535705566 + }, + { + "auxiliary_loss_clip": 0.06408882, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06275536, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7066285886066436, + "flos": 23557025168640.0, + "grad_norm": 1.8022680768003871, + "language_loss": 0.76729679, + "learning_rate": 8.367831234246025e-07, + "loss": 0.84405589, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10083008, + "step": 11753, + "time_per_iteration": 2.5189971923828125 + }, + { + "auxiliary_loss_clip": 0.06404173, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01255097, + "epoch": 0.7066887118593116, + "flos": 21075661883520.0, + "grad_norm": 1.4940357111697604, + "language_loss": 0.7128973, + "learning_rate": 8.364663305220405e-07, + "loss": 0.78959066, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10076904, + "step": 11754, + "time_per_iteration": 2.5660195350646973 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257491, + "epoch": 0.7067488351119796, + "flos": 21182284604160.0, + "grad_norm": 1.5428805294467156, + "language_loss": 0.89486808, + "learning_rate": 8.361495817407919e-07, + "loss": 0.97163951, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09960938, + "step": 11755, + "time_per_iteration": 2.507603883743286 + }, + { + "auxiliary_loss_clip": 0.06407668, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01257293, + "epoch": 0.7068089583646475, + "flos": 20455520215680.0, + "grad_norm": 1.4982614193498491, + "language_loss": 0.79735661, + "learning_rate": 8.358328770928678e-07, + "loss": 0.87410736, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10119629, + "step": 11756, + "time_per_iteration": 3.994943618774414 + }, + { + "auxiliary_loss_clip": 0.06321511, + "auxiliary_loss_mlp": 0.0125505, + "balance_loss_clip": 0.06265193, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7068690816173155, + "flos": 59125542399360.0, + "grad_norm": 0.8066454127458581, + "language_loss": 0.6018793, + "learning_rate": 8.355162165902785e-07, + "loss": 0.67764497, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01311493, + "step": 11757, + "time_per_iteration": 2.9342048168182373 + }, + { + "auxiliary_loss_clip": 0.06406799, + "auxiliary_loss_mlp": 0.01267209, + "balance_loss_clip": 0.06273741, + "balance_loss_mlp": 0.01256135, + "epoch": 0.7069292048699835, + "flos": 16256845900800.0, + "grad_norm": 2.1598051545702264, + "language_loss": 0.80614579, + "learning_rate": 8.351996002450307e-07, + "loss": 0.88288587, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11071777, + "step": 11758, + "time_per_iteration": 2.4969773292541504 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273986, + "balance_loss_mlp": 0.01256143, + "epoch": 0.7069893281226515, + "flos": 41182468133760.0, + "grad_norm": 1.7333024967156656, + "language_loss": 0.77613515, + "learning_rate": 8.348830280691304e-07, + "loss": 0.85287464, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10754395, + "step": 11759, + "time_per_iteration": 2.6857149600982666 + }, + { + "auxiliary_loss_clip": 0.06407617, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06274342, + "balance_loss_mlp": 0.01254746, + "epoch": 0.7070494513753194, + "flos": 24214203141120.0, + "grad_norm": 1.49498062494056, + "language_loss": 0.68238914, + "learning_rate": 8.34566500074583e-07, + "loss": 0.75912917, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11639404, + "step": 11760, + "time_per_iteration": 4.106550455093384 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06276926, + "balance_loss_mlp": 0.01254354, + "epoch": 0.7071095746279874, + "flos": 20190564005760.0, + "grad_norm": 1.927414071449925, + "language_loss": 0.79955995, + "learning_rate": 8.342500162733899e-07, + "loss": 0.8763513, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10479736, + "step": 11761, + "time_per_iteration": 2.4826464653015137 + }, + { + "auxiliary_loss_clip": 0.0640934, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273883, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7071696978806553, + "flos": 18188282839680.0, + "grad_norm": 2.2121961398440684, + "language_loss": 0.75218999, + "learning_rate": 8.33933576677553e-07, + "loss": 0.82895583, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11352539, + "step": 11762, + "time_per_iteration": 2.4954895973205566 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01264533, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01254579, + "epoch": 0.7072298211333233, + "flos": 24138201888000.0, + "grad_norm": 1.8799497376122591, + "language_loss": 0.77263492, + "learning_rate": 8.336171812990724e-07, + "loss": 0.84933138, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09954834, + "step": 11763, + "time_per_iteration": 2.53564453125 + }, + { + "auxiliary_loss_clip": 0.06407874, + "auxiliary_loss_mlp": 0.01264442, + "balance_loss_clip": 0.062722, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7072899443859912, + "flos": 27205731210240.0, + "grad_norm": 2.480752014730448, + "language_loss": 0.78787279, + "learning_rate": 8.333008301499453e-07, + "loss": 0.86459595, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10949707, + "step": 11764, + "time_per_iteration": 2.652902841567993 + }, + { + "auxiliary_loss_clip": 0.06416324, + "auxiliary_loss_mlp": 0.01267754, + "balance_loss_clip": 0.06276786, + "balance_loss_mlp": 0.01256852, + "epoch": 0.7073500676386593, + "flos": 16441188883200.0, + "grad_norm": 1.6649904523449048, + "language_loss": 0.79710478, + "learning_rate": 8.32984523242167e-07, + "loss": 0.87394559, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10900879, + "step": 11765, + "time_per_iteration": 2.478731632232666 + }, + { + "auxiliary_loss_clip": 0.0640541, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.0125638, + "epoch": 0.7074101908913272, + "flos": 27681291457920.0, + "grad_norm": 1.64401676901429, + "language_loss": 0.69017607, + "learning_rate": 8.326682605877324e-07, + "loss": 0.76688629, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09222412, + "step": 11766, + "time_per_iteration": 2.5636019706726074 + }, + { + "auxiliary_loss_clip": 0.06409839, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01256399, + "epoch": 0.7074703141439952, + "flos": 22244849429760.0, + "grad_norm": 1.7806465184891558, + "language_loss": 0.64121795, + "learning_rate": 8.323520421986352e-07, + "loss": 0.71799058, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11016846, + "step": 11767, + "time_per_iteration": 2.509098529815674 + }, + { + "auxiliary_loss_clip": 0.06408227, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.01253768, + "epoch": 0.7075304373966632, + "flos": 29650980585600.0, + "grad_norm": 1.5320251232109037, + "language_loss": 0.53099549, + "learning_rate": 8.320358680868646e-07, + "loss": 0.60772038, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10491943, + "step": 11768, + "time_per_iteration": 2.5991628170013428 + }, + { + "auxiliary_loss_clip": 0.06404776, + "auxiliary_loss_mlp": 0.01263382, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253565, + "epoch": 0.7075905606493311, + "flos": 19761264011520.0, + "grad_norm": 1.5482480325031622, + "language_loss": 0.75826794, + "learning_rate": 8.317197382644119e-07, + "loss": 0.83494949, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0980835, + "step": 11769, + "time_per_iteration": 2.553248167037964 + }, + { + "auxiliary_loss_clip": 0.063171, + "auxiliary_loss_mlp": 0.01250363, + "balance_loss_clip": 0.06260812, + "balance_loss_mlp": 0.01249205, + "epoch": 0.7076506839019991, + "flos": 65734106866560.0, + "grad_norm": 0.8156037445248981, + "language_loss": 0.6198988, + "learning_rate": 8.314036527432637e-07, + "loss": 0.69557339, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01155853, + "step": 11770, + "time_per_iteration": 3.0812795162200928 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06274459, + "balance_loss_mlp": 0.01254516, + "epoch": 0.707710807154667, + "flos": 23771444567040.0, + "grad_norm": 1.6411438931926623, + "language_loss": 0.76769519, + "learning_rate": 8.310876115354055e-07, + "loss": 0.84447432, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10552979, + "step": 11771, + "time_per_iteration": 2.5363407135009766 + }, + { + "auxiliary_loss_clip": 0.06403352, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06272224, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7077709304073351, + "flos": 21257698878720.0, + "grad_norm": 1.3979456660804543, + "language_loss": 0.71690625, + "learning_rate": 8.307716146528221e-07, + "loss": 0.79359543, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10083008, + "step": 11772, + "time_per_iteration": 2.517993688583374 + }, + { + "auxiliary_loss_clip": 0.06417513, + "auxiliary_loss_mlp": 0.01264872, + "balance_loss_clip": 0.06277703, + "balance_loss_mlp": 0.01253535, + "epoch": 0.707831053660003, + "flos": 20747030970240.0, + "grad_norm": 1.7220446646082324, + "language_loss": 0.69968081, + "learning_rate": 8.30455662107496e-07, + "loss": 0.77650464, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11334229, + "step": 11773, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.06409782, + "auxiliary_loss_mlp": 0.01269179, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01259016, + "epoch": 0.707891176912671, + "flos": 21987440087040.0, + "grad_norm": 1.361330798775882, + "language_loss": 0.70201778, + "learning_rate": 8.301397539114095e-07, + "loss": 0.77880728, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10150146, + "step": 11774, + "time_per_iteration": 2.519763231277466 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01256316, + "epoch": 0.7079513001653389, + "flos": 21075284540160.0, + "grad_norm": 1.498970377219278, + "language_loss": 0.7492069, + "learning_rate": 8.298238900765407e-07, + "loss": 0.82591969, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09802246, + "step": 11775, + "time_per_iteration": 2.5430877208709717 + }, + { + "auxiliary_loss_clip": 0.06415135, + "auxiliary_loss_mlp": 0.01264314, + "balance_loss_clip": 0.06278447, + "balance_loss_mlp": 0.01254014, + "epoch": 0.7080114234180069, + "flos": 18046468604160.0, + "grad_norm": 1.621138107650678, + "language_loss": 0.87510455, + "learning_rate": 8.295080706148665e-07, + "loss": 0.95189905, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10314941, + "step": 11776, + "time_per_iteration": 2.517082691192627 + }, + { + "auxiliary_loss_clip": 0.06408748, + "auxiliary_loss_mlp": 0.01265871, + "balance_loss_clip": 0.0627363, + "balance_loss_mlp": 0.01256096, + "epoch": 0.7080715466706748, + "flos": 15127671479040.0, + "grad_norm": 1.4637417425019663, + "language_loss": 0.75087041, + "learning_rate": 8.291922955383641e-07, + "loss": 0.82761657, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09777832, + "step": 11777, + "time_per_iteration": 2.5164589881896973 + }, + { + "auxiliary_loss_clip": 0.06418398, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01253928, + "epoch": 0.7081316699233429, + "flos": 14427042364800.0, + "grad_norm": 1.984175776722718, + "language_loss": 0.82697594, + "learning_rate": 8.288765648590066e-07, + "loss": 0.903808, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10888672, + "step": 11778, + "time_per_iteration": 2.5013656616210938 + }, + { + "auxiliary_loss_clip": 0.06404569, + "auxiliary_loss_mlp": 0.01264308, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01255213, + "epoch": 0.7081917931760108, + "flos": 23229190869120.0, + "grad_norm": 1.4143364906484888, + "language_loss": 0.84851789, + "learning_rate": 8.285608785887673e-07, + "loss": 0.9252066, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09100342, + "step": 11779, + "time_per_iteration": 2.5495359897613525 + }, + { + "auxiliary_loss_clip": 0.06410395, + "auxiliary_loss_mlp": 0.01264448, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01254321, + "epoch": 0.7082519164286788, + "flos": 39317221520640.0, + "grad_norm": 1.7515830912849983, + "language_loss": 0.7191208, + "learning_rate": 8.28245236739618e-07, + "loss": 0.79586923, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10125732, + "step": 11780, + "time_per_iteration": 4.163387775421143 + }, + { + "auxiliary_loss_clip": 0.06407901, + "auxiliary_loss_mlp": 0.01267276, + "balance_loss_clip": 0.06274346, + "balance_loss_mlp": 0.01257382, + "epoch": 0.7083120396813467, + "flos": 21657299800320.0, + "grad_norm": 1.349993887717698, + "language_loss": 0.73180461, + "learning_rate": 8.279296393235256e-07, + "loss": 0.80855638, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09887695, + "step": 11781, + "time_per_iteration": 2.523428440093994 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06273166, + "balance_loss_mlp": 0.01254625, + "epoch": 0.7083721629340147, + "flos": 17572878927360.0, + "grad_norm": 2.699338792660173, + "language_loss": 0.77578008, + "learning_rate": 8.276140863524585e-07, + "loss": 0.85250056, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10089111, + "step": 11782, + "time_per_iteration": 2.458449363708496 + }, + { + "auxiliary_loss_clip": 0.06406146, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06272672, + "balance_loss_mlp": 0.01254991, + "epoch": 0.7084322861866827, + "flos": 29358086238720.0, + "grad_norm": 1.4360937815095354, + "language_loss": 0.70182502, + "learning_rate": 8.272985778383828e-07, + "loss": 0.77852821, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09173584, + "step": 11783, + "time_per_iteration": 2.5887033939361572 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.0125768, + "epoch": 0.7084924094393507, + "flos": 20200626495360.0, + "grad_norm": 1.5971747704172947, + "language_loss": 0.79307884, + "learning_rate": 8.269831137932632e-07, + "loss": 0.86990702, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 11784, + "time_per_iteration": 2.490954637527466 + }, + { + "auxiliary_loss_clip": 0.0640732, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06272314, + "balance_loss_mlp": 0.01256737, + "epoch": 0.7085525326920187, + "flos": 23483958808320.0, + "grad_norm": 1.617674750849371, + "language_loss": 0.77606887, + "learning_rate": 8.266676942290609e-07, + "loss": 0.85281205, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1026001, + "step": 11785, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.06413119, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06278774, + "balance_loss_mlp": 0.01255934, + "epoch": 0.7086126559446866, + "flos": 25966076780160.0, + "grad_norm": 1.4386102379185288, + "language_loss": 0.78040558, + "learning_rate": 8.26352319157738e-07, + "loss": 0.85719407, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09796143, + "step": 11786, + "time_per_iteration": 2.522735834121704 + }, + { + "auxiliary_loss_clip": 0.06412391, + "auxiliary_loss_mlp": 0.01268502, + "balance_loss_clip": 0.06275783, + "balance_loss_mlp": 0.0125834, + "epoch": 0.7086727791973546, + "flos": 26732141533440.0, + "grad_norm": 1.8351634972642936, + "language_loss": 0.79121733, + "learning_rate": 8.260369885912526e-07, + "loss": 0.86802632, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10162354, + "step": 11787, + "time_per_iteration": 2.5581464767456055 + }, + { + "auxiliary_loss_clip": 0.06412619, + "auxiliary_loss_mlp": 0.01271025, + "balance_loss_clip": 0.06277216, + "balance_loss_mlp": 0.01260475, + "epoch": 0.7087329024500225, + "flos": 21688801735680.0, + "grad_norm": 1.8228289571149952, + "language_loss": 0.76948512, + "learning_rate": 8.257217025415615e-07, + "loss": 0.84632152, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10540771, + "step": 11788, + "time_per_iteration": 2.490006446838379 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01270333, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01259014, + "epoch": 0.7087930257026905, + "flos": 17936827136640.0, + "grad_norm": 2.296634586886211, + "language_loss": 0.67989695, + "learning_rate": 8.254064610206212e-07, + "loss": 0.75680566, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11322021, + "step": 11789, + "time_per_iteration": 2.5101919174194336 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01256111, + "epoch": 0.7088531489553584, + "flos": 18916682382720.0, + "grad_norm": 1.5602629922400044, + "language_loss": 0.77709448, + "learning_rate": 8.250912640403858e-07, + "loss": 0.85387033, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10241699, + "step": 11790, + "time_per_iteration": 2.484931468963623 + }, + { + "auxiliary_loss_clip": 0.06419586, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01255253, + "epoch": 0.7089132722080265, + "flos": 27388229402880.0, + "grad_norm": 1.5308750679240268, + "language_loss": 0.71250129, + "learning_rate": 8.247761116128085e-07, + "loss": 0.78936774, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11798096, + "step": 11791, + "time_per_iteration": 2.583948850631714 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01267551, + "balance_loss_clip": 0.06275617, + "balance_loss_mlp": 0.0125675, + "epoch": 0.7089733954606944, + "flos": 22169309374080.0, + "grad_norm": 1.511652721397476, + "language_loss": 0.82245874, + "learning_rate": 8.244610037498376e-07, + "loss": 0.89924157, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1081543, + "step": 11792, + "time_per_iteration": 3.987499475479126 + }, + { + "auxiliary_loss_clip": 0.06412215, + "auxiliary_loss_mlp": 0.01267904, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01256817, + "epoch": 0.7090335187133624, + "flos": 24432731389440.0, + "grad_norm": 1.9294753325302831, + "language_loss": 0.65135908, + "learning_rate": 8.241459404634232e-07, + "loss": 0.72816032, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11083984, + "step": 11793, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01268973, + "balance_loss_clip": 0.06271678, + "balance_loss_mlp": 0.01258834, + "epoch": 0.7090936419660303, + "flos": 21841684709760.0, + "grad_norm": 1.9925409901798494, + "language_loss": 0.70387089, + "learning_rate": 8.238309217655133e-07, + "loss": 0.78063184, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10144043, + "step": 11794, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06410742, + "auxiliary_loss_mlp": 0.01263848, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01253828, + "epoch": 0.7091537652186983, + "flos": 20088259770240.0, + "grad_norm": 1.8813846026416328, + "language_loss": 0.76058149, + "learning_rate": 8.23515947668052e-07, + "loss": 0.83732742, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10015869, + "step": 11795, + "time_per_iteration": 3.9482054710388184 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06275567, + "balance_loss_mlp": 0.01258812, + "epoch": 0.7092138884713663, + "flos": 13156556832000.0, + "grad_norm": 2.0194589674634242, + "language_loss": 0.75623167, + "learning_rate": 8.232010181829838e-07, + "loss": 0.83305377, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11077881, + "step": 11796, + "time_per_iteration": 2.49794340133667 + }, + { + "auxiliary_loss_clip": 0.06421532, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7092740117240343, + "flos": 21651262306560.0, + "grad_norm": 1.5362456233213855, + "language_loss": 0.74430573, + "learning_rate": 8.228861333222523e-07, + "loss": 0.8211748, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11352539, + "step": 11797, + "time_per_iteration": 2.5082199573516846 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266935, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01256326, + "epoch": 0.7093341349767023, + "flos": 21038835214080.0, + "grad_norm": 1.402262543828535, + "language_loss": 0.79553568, + "learning_rate": 8.225712930977953e-07, + "loss": 0.87231541, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10614014, + "step": 11798, + "time_per_iteration": 2.5451393127441406 + }, + { + "auxiliary_loss_clip": 0.06409004, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01255911, + "epoch": 0.7093942582293702, + "flos": 22024140975360.0, + "grad_norm": 2.0553615011101236, + "language_loss": 0.67001218, + "learning_rate": 8.222564975215529e-07, + "loss": 0.74676633, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10491943, + "step": 11799, + "time_per_iteration": 3.9047088623046875 + }, + { + "auxiliary_loss_clip": 0.06411745, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06276356, + "balance_loss_mlp": 0.01254548, + "epoch": 0.7094543814820382, + "flos": 27243019077120.0, + "grad_norm": 1.5384407371377906, + "language_loss": 0.82004559, + "learning_rate": 8.219417466054622e-07, + "loss": 0.89681768, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10925293, + "step": 11800, + "time_per_iteration": 2.54984188079834 + }, + { + "auxiliary_loss_clip": 0.06408048, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06274039, + "balance_loss_mlp": 0.01255218, + "epoch": 0.7095145047347061, + "flos": 12093237319680.0, + "grad_norm": 1.8049515172262331, + "language_loss": 0.86792338, + "learning_rate": 8.21627040361459e-07, + "loss": 0.94465083, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0947876, + "step": 11801, + "time_per_iteration": 2.472968339920044 + }, + { + "auxiliary_loss_clip": 0.06414308, + "auxiliary_loss_mlp": 0.01268303, + "balance_loss_clip": 0.06278587, + "balance_loss_mlp": 0.01257896, + "epoch": 0.7095746279873741, + "flos": 19388678832000.0, + "grad_norm": 1.9685683260033982, + "language_loss": 0.7659384, + "learning_rate": 8.213123788014758e-07, + "loss": 0.8427645, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10412598, + "step": 11802, + "time_per_iteration": 2.469217300415039 + }, + { + "auxiliary_loss_clip": 0.06413268, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06277166, + "balance_loss_mlp": 0.01259948, + "epoch": 0.709634751240042, + "flos": 21366921075840.0, + "grad_norm": 1.7164711115559128, + "language_loss": 0.81734449, + "learning_rate": 8.209977619374462e-07, + "loss": 0.89418513, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11803, + "time_per_iteration": 2.5675346851348877 + }, + { + "auxiliary_loss_clip": 0.06413771, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.0627571, + "balance_loss_mlp": 0.01256702, + "epoch": 0.7096948744927101, + "flos": 13922034606720.0, + "grad_norm": 2.2508010678544363, + "language_loss": 0.6771282, + "learning_rate": 8.206831897812995e-07, + "loss": 0.75394678, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1137085, + "step": 11804, + "time_per_iteration": 2.4850802421569824 + }, + { + "auxiliary_loss_clip": 0.06406445, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06276047, + "balance_loss_mlp": 0.01259694, + "epoch": 0.709754997745378, + "flos": 30305936424960.0, + "grad_norm": 1.836033307049916, + "language_loss": 0.78141153, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8581689, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0960083, + "step": 11805, + "time_per_iteration": 2.5807907581329346 + }, + { + "auxiliary_loss_clip": 0.06411435, + "auxiliary_loss_mlp": 0.01266806, + "balance_loss_clip": 0.06275858, + "balance_loss_mlp": 0.01256202, + "epoch": 0.709815120998046, + "flos": 18521064529920.0, + "grad_norm": 3.360423816262503, + "language_loss": 0.78911841, + "learning_rate": 8.200541796403667e-07, + "loss": 0.86590087, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1060791, + "step": 11806, + "time_per_iteration": 2.4750113487243652 + }, + { + "auxiliary_loss_clip": 0.06409614, + "auxiliary_loss_mlp": 0.01266594, + "balance_loss_clip": 0.06275766, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7098752442507139, + "flos": 22279034695680.0, + "grad_norm": 3.0880614568331883, + "language_loss": 0.56418979, + "learning_rate": 8.197397416794332e-07, + "loss": 0.64095187, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10095215, + "step": 11807, + "time_per_iteration": 2.5265543460845947 + }, + { + "auxiliary_loss_clip": 0.06416228, + "auxiliary_loss_mlp": 0.01269148, + "balance_loss_clip": 0.06274513, + "balance_loss_mlp": 0.01257686, + "epoch": 0.7099353675033819, + "flos": 19280504810880.0, + "grad_norm": 2.07369456244542, + "language_loss": 0.68290567, + "learning_rate": 8.194253484740882e-07, + "loss": 0.75975943, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11462402, + "step": 11808, + "time_per_iteration": 2.472132444381714 + }, + { + "auxiliary_loss_clip": 0.06414328, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01254512, + "epoch": 0.70999549075605, + "flos": 21915044559360.0, + "grad_norm": 1.9968242899147548, + "language_loss": 0.71669781, + "learning_rate": 8.191110000362513e-07, + "loss": 0.79348707, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10089111, + "step": 11809, + "time_per_iteration": 2.524571180343628 + }, + { + "auxiliary_loss_clip": 0.06322539, + "auxiliary_loss_mlp": 0.01256903, + "balance_loss_clip": 0.06266782, + "balance_loss_mlp": 0.01255681, + "epoch": 0.7100556140087179, + "flos": 70474280192640.0, + "grad_norm": 0.7372364518861584, + "language_loss": 0.59065175, + "learning_rate": 8.187966963778435e-07, + "loss": 0.66644615, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01220703, + "step": 11810, + "time_per_iteration": 3.2093372344970703 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01263776, + "balance_loss_clip": 0.06277919, + "balance_loss_mlp": 0.01253721, + "epoch": 0.7101157372613859, + "flos": 23046273406080.0, + "grad_norm": 1.545725512324635, + "language_loss": 0.74353242, + "learning_rate": 8.18482437510784e-07, + "loss": 0.82030082, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10064697, + "step": 11811, + "time_per_iteration": 2.5427846908569336 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06272991, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7101758605140538, + "flos": 23192028783360.0, + "grad_norm": 1.7044281012631433, + "language_loss": 0.83467686, + "learning_rate": 8.181682234469882e-07, + "loss": 0.91136628, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09442139, + "step": 11812, + "time_per_iteration": 2.5327343940734863 + }, + { + "auxiliary_loss_clip": 0.0641521, + "auxiliary_loss_mlp": 0.0126703, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256659, + "epoch": 0.7102359837667218, + "flos": 23702906327040.0, + "grad_norm": 1.4051092754707344, + "language_loss": 0.69960868, + "learning_rate": 8.178540541983716e-07, + "loss": 0.77643108, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10375977, + "step": 11813, + "time_per_iteration": 2.6402204036712646 + }, + { + "auxiliary_loss_clip": 0.06402316, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7102961070193897, + "flos": 19397231948160.0, + "grad_norm": 1.7011399194035903, + "language_loss": 0.82479846, + "learning_rate": 8.175399297768495e-07, + "loss": 0.90146458, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09832764, + "step": 11814, + "time_per_iteration": 2.4825360774993896 + }, + { + "auxiliary_loss_clip": 0.06407954, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255308, + "epoch": 0.7103562302720577, + "flos": 21514018118400.0, + "grad_norm": 1.9900571557306543, + "language_loss": 0.76711023, + "learning_rate": 8.172258501943301e-07, + "loss": 0.84385264, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10974121, + "step": 11815, + "time_per_iteration": 2.5411629676818848 + }, + { + "auxiliary_loss_clip": 0.06407356, + "auxiliary_loss_mlp": 0.01265787, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01256012, + "epoch": 0.7104163535247257, + "flos": 14539786433280.0, + "grad_norm": 2.148014854725882, + "language_loss": 0.78734261, + "learning_rate": 8.16911815462725e-07, + "loss": 0.86407399, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09777832, + "step": 11816, + "time_per_iteration": 2.4732110500335693 + }, + { + "auxiliary_loss_clip": 0.06409387, + "auxiliary_loss_mlp": 0.0126716, + "balance_loss_clip": 0.06273407, + "balance_loss_mlp": 0.01257415, + "epoch": 0.7104764767773937, + "flos": 11405018609280.0, + "grad_norm": 1.710233044928932, + "language_loss": 0.87136269, + "learning_rate": 8.165978255939426e-07, + "loss": 0.9481281, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09747314, + "step": 11817, + "time_per_iteration": 2.4930732250213623 + }, + { + "auxiliary_loss_clip": 0.06405669, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01254358, + "epoch": 0.7105366000300616, + "flos": 11694894209280.0, + "grad_norm": 2.3467290312942906, + "language_loss": 0.84727818, + "learning_rate": 8.162838805998897e-07, + "loss": 0.92397279, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09436035, + "step": 11818, + "time_per_iteration": 2.4601902961730957 + }, + { + "auxiliary_loss_clip": 0.06407452, + "auxiliary_loss_mlp": 0.01265048, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254808, + "epoch": 0.7105967232827296, + "flos": 19360027935360.0, + "grad_norm": 1.943101872130184, + "language_loss": 0.76065433, + "learning_rate": 8.159699804924709e-07, + "loss": 0.83737928, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 11819, + "time_per_iteration": 2.5082414150238037 + }, + { + "auxiliary_loss_clip": 0.06408325, + "auxiliary_loss_mlp": 0.01273169, + "balance_loss_clip": 0.06273748, + "balance_loss_mlp": 0.01262422, + "epoch": 0.7106568465353975, + "flos": 22937135063040.0, + "grad_norm": 1.5613953087486683, + "language_loss": 0.71238112, + "learning_rate": 8.156561252835883e-07, + "loss": 0.78919601, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10748291, + "step": 11820, + "time_per_iteration": 3.9562554359436035 + }, + { + "auxiliary_loss_clip": 0.06406607, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272983, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7107169697880655, + "flos": 19105805047680.0, + "grad_norm": 1.709009415960719, + "language_loss": 0.75201517, + "learning_rate": 8.153423149851449e-07, + "loss": 0.82874513, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 11821, + "time_per_iteration": 2.4773855209350586 + }, + { + "auxiliary_loss_clip": 0.0631486, + "auxiliary_loss_mlp": 0.0125056, + "balance_loss_clip": 0.06259306, + "balance_loss_mlp": 0.01249267, + "epoch": 0.7107770930407336, + "flos": 63655950228480.0, + "grad_norm": 0.8065746142119063, + "language_loss": 0.55105186, + "learning_rate": 8.150285496090388e-07, + "loss": 0.626706, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01293182, + "step": 11822, + "time_per_iteration": 3.1728925704956055 + }, + { + "auxiliary_loss_clip": 0.06399868, + "auxiliary_loss_mlp": 0.01265617, + "balance_loss_clip": 0.0627214, + "balance_loss_mlp": 0.01256313, + "epoch": 0.7108372162934015, + "flos": 22061009571840.0, + "grad_norm": 1.7664810996184872, + "language_loss": 0.61042011, + "learning_rate": 8.147148291671688e-07, + "loss": 0.68707502, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09301758, + "step": 11823, + "time_per_iteration": 2.685396194458008 + }, + { + "auxiliary_loss_clip": 0.06409906, + "auxiliary_loss_mlp": 0.01263571, + "balance_loss_clip": 0.0627628, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7108973395460695, + "flos": 19141122343680.0, + "grad_norm": 1.95026020169961, + "language_loss": 0.71794426, + "learning_rate": 8.144011536714322e-07, + "loss": 0.79467905, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09375, + "step": 11824, + "time_per_iteration": 2.5620133876800537 + }, + { + "auxiliary_loss_clip": 0.06401232, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06271533, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7109574627987374, + "flos": 17900168175360.0, + "grad_norm": 2.011245948242179, + "language_loss": 0.72948581, + "learning_rate": 8.140875231337223e-07, + "loss": 0.80615819, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09161377, + "step": 11825, + "time_per_iteration": 2.481990098953247 + }, + { + "auxiliary_loss_clip": 0.06409375, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06273198, + "balance_loss_mlp": 0.01254669, + "epoch": 0.7110175860514054, + "flos": 28986129964800.0, + "grad_norm": 1.8577779500908889, + "language_loss": 0.80001605, + "learning_rate": 8.137739375659321e-07, + "loss": 0.87676173, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11826, + "time_per_iteration": 2.5934202671051025 + }, + { + "auxiliary_loss_clip": 0.06401698, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.0125846, + "epoch": 0.7110777093040733, + "flos": 26179867272960.0, + "grad_norm": 1.3769409852595975, + "language_loss": 0.83070964, + "learning_rate": 8.134603969799527e-07, + "loss": 0.90740645, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09521484, + "step": 11827, + "time_per_iteration": 2.5412826538085938 + }, + { + "auxiliary_loss_clip": 0.0640677, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256507, + "epoch": 0.7111378325567413, + "flos": 26877184151040.0, + "grad_norm": 1.489155185626094, + "language_loss": 0.62609684, + "learning_rate": 8.131469013876748e-07, + "loss": 0.70283562, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10601807, + "step": 11828, + "time_per_iteration": 2.549358367919922 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01265747, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01255543, + "epoch": 0.7111979558094093, + "flos": 27279216840960.0, + "grad_norm": 1.3931875657884774, + "language_loss": 0.72552299, + "learning_rate": 8.128334508009846e-07, + "loss": 0.80225229, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10205078, + "step": 11829, + "time_per_iteration": 2.538902997970581 + }, + { + "auxiliary_loss_clip": 0.06404835, + "auxiliary_loss_mlp": 0.01268934, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01259343, + "epoch": 0.7112580790620773, + "flos": 25054088941440.0, + "grad_norm": 1.7068284012281256, + "language_loss": 0.80460179, + "learning_rate": 8.125200452317697e-07, + "loss": 0.88133949, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09594727, + "step": 11830, + "time_per_iteration": 2.527684450149536 + }, + { + "auxiliary_loss_clip": 0.064045, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01255338, + "epoch": 0.7113182023147452, + "flos": 21652016993280.0, + "grad_norm": 1.5791795722004685, + "language_loss": 0.84228051, + "learning_rate": 8.122066846919138e-07, + "loss": 0.91897511, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 11831, + "time_per_iteration": 3.8946433067321777 + }, + { + "auxiliary_loss_clip": 0.06405313, + "auxiliary_loss_mlp": 0.01264799, + "balance_loss_clip": 0.06270519, + "balance_loss_mlp": 0.01255453, + "epoch": 0.7113783255674132, + "flos": 21002637450240.0, + "grad_norm": 1.9181792200519638, + "language_loss": 0.77265865, + "learning_rate": 8.118933691932985e-07, + "loss": 0.84935975, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09344482, + "step": 11832, + "time_per_iteration": 2.517416477203369 + }, + { + "auxiliary_loss_clip": 0.06316236, + "auxiliary_loss_mlp": 0.01252897, + "balance_loss_clip": 0.06260582, + "balance_loss_mlp": 0.01251798, + "epoch": 0.7114384488200811, + "flos": 66788705554560.0, + "grad_norm": 0.7355523312106115, + "language_loss": 0.56510413, + "learning_rate": 8.115800987478059e-07, + "loss": 0.64079541, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01100922, + "step": 11833, + "time_per_iteration": 3.083800792694092 + }, + { + "auxiliary_loss_clip": 0.06404281, + "auxiliary_loss_mlp": 0.01264607, + "balance_loss_clip": 0.06270045, + "balance_loss_mlp": 0.01255255, + "epoch": 0.7114985720727491, + "flos": 25017136490880.0, + "grad_norm": 1.685224360571569, + "language_loss": 0.71167994, + "learning_rate": 8.11266873367315e-07, + "loss": 0.78836882, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09350586, + "step": 11834, + "time_per_iteration": 2.5492658615112305 + }, + { + "auxiliary_loss_clip": 0.06408249, + "auxiliary_loss_mlp": 0.01268558, + "balance_loss_clip": 0.06272918, + "balance_loss_mlp": 0.01257972, + "epoch": 0.7115586953254172, + "flos": 21476478689280.0, + "grad_norm": 1.811757150622914, + "language_loss": 0.79512018, + "learning_rate": 8.10953693063704e-07, + "loss": 0.87188828, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10583496, + "step": 11835, + "time_per_iteration": 3.936241865158081 + }, + { + "auxiliary_loss_clip": 0.06403308, + "auxiliary_loss_mlp": 0.01266062, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.0125646, + "epoch": 0.7116188185780851, + "flos": 28630357528320.0, + "grad_norm": 1.5711246954693516, + "language_loss": 0.76045537, + "learning_rate": 8.10640557848848e-07, + "loss": 0.83714908, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 11836, + "time_per_iteration": 2.5701663494110107 + }, + { + "auxiliary_loss_clip": 0.06406698, + "auxiliary_loss_mlp": 0.01265952, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01256653, + "epoch": 0.7116789418307531, + "flos": 25299339442560.0, + "grad_norm": 1.6743206701340672, + "language_loss": 0.69986928, + "learning_rate": 8.103274677346208e-07, + "loss": 0.77659577, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09301758, + "step": 11837, + "time_per_iteration": 2.575038194656372 + }, + { + "auxiliary_loss_clip": 0.0641223, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06274512, + "balance_loss_mlp": 0.01255494, + "epoch": 0.711739065083421, + "flos": 25564463360640.0, + "grad_norm": 1.8455270082673318, + "language_loss": 0.61858809, + "learning_rate": 8.100144227328958e-07, + "loss": 0.69537258, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 11838, + "time_per_iteration": 2.5805752277374268 + }, + { + "auxiliary_loss_clip": 0.06409779, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01257699, + "epoch": 0.711799188336089, + "flos": 26148239556480.0, + "grad_norm": 2.1939319933932424, + "language_loss": 0.68031204, + "learning_rate": 8.097014228555426e-07, + "loss": 0.75708568, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09875488, + "step": 11839, + "time_per_iteration": 3.951659679412842 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06272204, + "balance_loss_mlp": 0.01256349, + "epoch": 0.7118593115887569, + "flos": 21146757672960.0, + "grad_norm": 2.0203738416997226, + "language_loss": 0.8447386, + "learning_rate": 8.093884681144305e-07, + "loss": 0.92145276, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09375, + "step": 11840, + "time_per_iteration": 2.5161664485931396 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06274749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.711919434841425, + "flos": 14980951779840.0, + "grad_norm": 1.9072315995358804, + "language_loss": 0.77299631, + "learning_rate": 8.090755585214277e-07, + "loss": 0.84977901, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 11841, + "time_per_iteration": 2.5373709201812744 + }, + { + "auxiliary_loss_clip": 0.06406824, + "auxiliary_loss_mlp": 0.01265843, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01256348, + "epoch": 0.7119795580940929, + "flos": 16514674513920.0, + "grad_norm": 2.1386907373947186, + "language_loss": 0.75567174, + "learning_rate": 8.087626940883994e-07, + "loss": 0.83239841, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0949707, + "step": 11842, + "time_per_iteration": 2.5253396034240723 + }, + { + "auxiliary_loss_clip": 0.06309856, + "auxiliary_loss_mlp": 0.01250631, + "balance_loss_clip": 0.06254404, + "balance_loss_mlp": 0.01249538, + "epoch": 0.7120396813467609, + "flos": 66591434315520.0, + "grad_norm": 0.7631692514869006, + "language_loss": 0.61363775, + "learning_rate": 8.084498748272082e-07, + "loss": 0.6892426, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 11843, + "time_per_iteration": 3.097399950027466 + }, + { + "auxiliary_loss_clip": 0.06403574, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06270365, + "balance_loss_mlp": 0.01256432, + "epoch": 0.7120998045994288, + "flos": 26440001873280.0, + "grad_norm": 3.96385360450405, + "language_loss": 0.80268991, + "learning_rate": 8.081371007497171e-07, + "loss": 0.87938976, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09997559, + "step": 11844, + "time_per_iteration": 2.552259683609009 + }, + { + "auxiliary_loss_clip": 0.06406216, + "auxiliary_loss_mlp": 0.01262016, + "balance_loss_clip": 0.06270443, + "balance_loss_mlp": 0.01252759, + "epoch": 0.7121599278520968, + "flos": 16432300350720.0, + "grad_norm": 2.2064261749206784, + "language_loss": 0.79144967, + "learning_rate": 8.078243718677873e-07, + "loss": 0.868132, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09259033, + "step": 11845, + "time_per_iteration": 2.5421273708343506 + }, + { + "auxiliary_loss_clip": 0.06402468, + "auxiliary_loss_mlp": 0.01265331, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01255532, + "epoch": 0.7122200511047647, + "flos": 28957520995200.0, + "grad_norm": 2.3428288803792485, + "language_loss": 0.77299261, + "learning_rate": 8.075116881932762e-07, + "loss": 0.84967065, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09796143, + "step": 11846, + "time_per_iteration": 2.527745485305786 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.06274035, + "balance_loss_mlp": 0.01256334, + "epoch": 0.7122801743574327, + "flos": 16477428574080.0, + "grad_norm": 1.8749902395969622, + "language_loss": 0.58446372, + "learning_rate": 8.071990497380421e-07, + "loss": 0.66121721, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10314941, + "step": 11847, + "time_per_iteration": 2.4880757331848145 + }, + { + "auxiliary_loss_clip": 0.06397726, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01254081, + "epoch": 0.7123402976101008, + "flos": 20637263721600.0, + "grad_norm": 1.2877189780235179, + "language_loss": 0.71294212, + "learning_rate": 8.068864565139395e-07, + "loss": 0.78955925, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09918213, + "step": 11848, + "time_per_iteration": 2.5513198375701904 + }, + { + "auxiliary_loss_clip": 0.0630827, + "auxiliary_loss_mlp": 0.01254097, + "balance_loss_clip": 0.06252526, + "balance_loss_mlp": 0.01252904, + "epoch": 0.7124004208627687, + "flos": 62343606781440.0, + "grad_norm": 0.847952001487362, + "language_loss": 0.6271292, + "learning_rate": 8.065739085328211e-07, + "loss": 0.70275289, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01190948, + "step": 11849, + "time_per_iteration": 3.1112751960754395 + }, + { + "auxiliary_loss_clip": 0.06405951, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.01254699, + "epoch": 0.7124605441154367, + "flos": 39685278579840.0, + "grad_norm": 1.4089636975562345, + "language_loss": 0.64458466, + "learning_rate": 8.0626140580654e-07, + "loss": 0.72128963, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09844971, + "step": 11850, + "time_per_iteration": 2.632457733154297 + }, + { + "auxiliary_loss_clip": 0.06404182, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125254, + "epoch": 0.7125206673681046, + "flos": 28189066400640.0, + "grad_norm": 1.5452031150775634, + "language_loss": 0.70381355, + "learning_rate": 8.05948948346946e-07, + "loss": 0.78048086, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 11851, + "time_per_iteration": 2.563063144683838 + }, + { + "auxiliary_loss_clip": 0.06402514, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06271089, + "balance_loss_mlp": 0.0125275, + "epoch": 0.7125807906207726, + "flos": 26184101904000.0, + "grad_norm": 1.4548821396986709, + "language_loss": 0.83386576, + "learning_rate": 8.056365361658882e-07, + "loss": 0.9105072, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08892822, + "step": 11852, + "time_per_iteration": 2.5185182094573975 + }, + { + "auxiliary_loss_clip": 0.06408215, + "auxiliary_loss_mlp": 0.01266945, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.01256759, + "epoch": 0.7126409138734405, + "flos": 17161706142720.0, + "grad_norm": 2.03558575161385, + "language_loss": 0.72365862, + "learning_rate": 8.053241692752126e-07, + "loss": 0.80041021, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11853, + "time_per_iteration": 2.4712510108947754 + }, + { + "auxiliary_loss_clip": 0.06400356, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273182, + "balance_loss_mlp": 0.01257005, + "epoch": 0.7127010371261085, + "flos": 18775790542080.0, + "grad_norm": 1.725464250509213, + "language_loss": 0.92318237, + "learning_rate": 8.050118476867635e-07, + "loss": 0.999843, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08703613, + "step": 11854, + "time_per_iteration": 2.4725341796875 + }, + { + "auxiliary_loss_clip": 0.06403268, + "auxiliary_loss_mlp": 0.01268625, + "balance_loss_clip": 0.06272953, + "balance_loss_mlp": 0.01260018, + "epoch": 0.7127611603787765, + "flos": 20382747344640.0, + "grad_norm": 1.8133122260210155, + "language_loss": 0.79957211, + "learning_rate": 8.046995714123856e-07, + "loss": 0.8762911, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08612061, + "step": 11855, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7128212836314445, + "flos": 20455268653440.0, + "grad_norm": 1.8163189094799566, + "language_loss": 0.73227429, + "learning_rate": 8.043873404639192e-07, + "loss": 0.80895841, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.10119629, + "step": 11856, + "time_per_iteration": 2.489022731781006 + }, + { + "auxiliary_loss_clip": 0.06408788, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01258634, + "epoch": 0.7128814068841124, + "flos": 23447593336320.0, + "grad_norm": 1.4996097551327818, + "language_loss": 0.69965553, + "learning_rate": 8.040751548532046e-07, + "loss": 0.77642906, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0993042, + "step": 11857, + "time_per_iteration": 2.5889153480529785 + }, + { + "auxiliary_loss_clip": 0.06401453, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01253488, + "epoch": 0.7129415301367804, + "flos": 18228757161600.0, + "grad_norm": 1.9673696792632074, + "language_loss": 0.85894734, + "learning_rate": 8.03763014592081e-07, + "loss": 0.93559623, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09942627, + "step": 11858, + "time_per_iteration": 2.4554738998413086 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255697, + "epoch": 0.7130016533894483, + "flos": 15529410679680.0, + "grad_norm": 1.7544523597871677, + "language_loss": 0.80554175, + "learning_rate": 8.034509196923829e-07, + "loss": 0.88231397, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10168457, + "step": 11859, + "time_per_iteration": 3.9745945930480957 + }, + { + "auxiliary_loss_clip": 0.06400725, + "auxiliary_loss_mlp": 0.01264096, + "balance_loss_clip": 0.06269667, + "balance_loss_mlp": 0.0125472, + "epoch": 0.7130617766421163, + "flos": 57127804081920.0, + "grad_norm": 1.1922495989293056, + "language_loss": 0.69005597, + "learning_rate": 8.031388701659456e-07, + "loss": 0.76670408, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09381104, + "step": 11860, + "time_per_iteration": 2.891012668609619 + }, + { + "auxiliary_loss_clip": 0.06406054, + "auxiliary_loss_mlp": 0.01266268, + "balance_loss_clip": 0.06271956, + "balance_loss_mlp": 0.01255575, + "epoch": 0.7131218998947844, + "flos": 19793730268800.0, + "grad_norm": 2.1261081147363097, + "language_loss": 0.64239693, + "learning_rate": 8.028268660246023e-07, + "loss": 0.71912014, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10693359, + "step": 11861, + "time_per_iteration": 2.5796282291412354 + }, + { + "auxiliary_loss_clip": 0.06410623, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_clip": 0.06273146, + "balance_loss_mlp": 0.01254857, + "epoch": 0.7131820231474523, + "flos": 26659242881280.0, + "grad_norm": 3.187443939826819, + "language_loss": 0.67274332, + "learning_rate": 8.025149072801849e-07, + "loss": 0.74950445, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10620117, + "step": 11862, + "time_per_iteration": 2.576899528503418 + }, + { + "auxiliary_loss_clip": 0.064044, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06273039, + "balance_loss_mlp": 0.01255926, + "epoch": 0.7132421464001203, + "flos": 29213337110400.0, + "grad_norm": 2.2144093674445426, + "language_loss": 0.67745155, + "learning_rate": 8.022029939445214e-07, + "loss": 0.75414771, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09283447, + "step": 11863, + "time_per_iteration": 2.563467264175415 + }, + { + "auxiliary_loss_clip": 0.06412646, + "auxiliary_loss_mlp": 0.0126882, + "balance_loss_clip": 0.06272405, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7133022696527882, + "flos": 23079913620480.0, + "grad_norm": 1.7053563824160904, + "language_loss": 0.6612097, + "learning_rate": 8.018911260294414e-07, + "loss": 0.73802435, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10717773, + "step": 11864, + "time_per_iteration": 2.5226974487304688 + }, + { + "auxiliary_loss_clip": 0.06409131, + "auxiliary_loss_mlp": 0.01265229, + "balance_loss_clip": 0.06273311, + "balance_loss_mlp": 0.01255019, + "epoch": 0.7133623929054562, + "flos": 17462860116480.0, + "grad_norm": 3.439605466883789, + "language_loss": 0.86094218, + "learning_rate": 8.015793035467697e-07, + "loss": 0.93768573, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10217285, + "step": 11865, + "time_per_iteration": 2.441121816635132 + }, + { + "auxiliary_loss_clip": 0.06408411, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7134225161581241, + "flos": 19542609982080.0, + "grad_norm": 2.0189990892571807, + "language_loss": 0.75141108, + "learning_rate": 8.012675265083304e-07, + "loss": 0.82812905, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10443115, + "step": 11866, + "time_per_iteration": 2.4785237312316895 + }, + { + "auxiliary_loss_clip": 0.06411657, + "auxiliary_loss_mlp": 0.01267167, + "balance_loss_clip": 0.06275963, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7134826394107922, + "flos": 26257294045440.0, + "grad_norm": 3.679418691378197, + "language_loss": 0.70483118, + "learning_rate": 8.009557949259464e-07, + "loss": 0.78161943, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11867, + "time_per_iteration": 2.518202066421509 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06272841, + "balance_loss_mlp": 0.01256477, + "epoch": 0.7135427626634601, + "flos": 15820795653120.0, + "grad_norm": 4.975034900378342, + "language_loss": 0.71782935, + "learning_rate": 8.006441088114397e-07, + "loss": 0.79452157, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09552002, + "step": 11868, + "time_per_iteration": 2.4938719272613525 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.01257635, + "epoch": 0.7136028859161281, + "flos": 18229302213120.0, + "grad_norm": 1.9405833387691556, + "language_loss": 0.66333723, + "learning_rate": 8.003324681766286e-07, + "loss": 0.7401427, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11236572, + "step": 11869, + "time_per_iteration": 2.4637274742126465 + }, + { + "auxiliary_loss_clip": 0.06408057, + "auxiliary_loss_mlp": 0.01264796, + "balance_loss_clip": 0.06273142, + "balance_loss_mlp": 0.01255003, + "epoch": 0.713663009168796, + "flos": 24321454767360.0, + "grad_norm": 1.4404508285538464, + "language_loss": 0.77963442, + "learning_rate": 8.000208730333298e-07, + "loss": 0.856363, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09790039, + "step": 11870, + "time_per_iteration": 2.545146942138672 + }, + { + "auxiliary_loss_clip": 0.06407803, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06275113, + "balance_loss_mlp": 0.01255248, + "epoch": 0.713723132421464, + "flos": 26545157147520.0, + "grad_norm": 2.250105845614367, + "language_loss": 0.81401408, + "learning_rate": 7.997093233933597e-07, + "loss": 0.89075279, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10821533, + "step": 11871, + "time_per_iteration": 4.061939477920532 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06272148, + "balance_loss_mlp": 0.01261541, + "epoch": 0.7137832556741319, + "flos": 19871911728000.0, + "grad_norm": 1.5669444552919631, + "language_loss": 0.78963834, + "learning_rate": 7.993978192685331e-07, + "loss": 0.86645091, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10321045, + "step": 11872, + "time_per_iteration": 2.502652645111084 + }, + { + "auxiliary_loss_clip": 0.06413025, + "auxiliary_loss_mlp": 0.01263574, + "balance_loss_clip": 0.06273353, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7138433789267999, + "flos": 21695300426880.0, + "grad_norm": 2.078419347550335, + "language_loss": 0.83881956, + "learning_rate": 7.990863606706606e-07, + "loss": 0.91558552, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10473633, + "step": 11873, + "time_per_iteration": 2.49755859375 + }, + { + "auxiliary_loss_clip": 0.06404479, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.0125491, + "epoch": 0.713903502179468, + "flos": 17608447785600.0, + "grad_norm": 2.139862978747737, + "language_loss": 0.85866129, + "learning_rate": 7.987749476115539e-07, + "loss": 0.93534762, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09240723, + "step": 11874, + "time_per_iteration": 2.446295976638794 + }, + { + "auxiliary_loss_clip": 0.0641006, + "auxiliary_loss_mlp": 0.01266331, + "balance_loss_clip": 0.06275686, + "balance_loss_mlp": 0.01256043, + "epoch": 0.7139636254321359, + "flos": 18046091260800.0, + "grad_norm": 1.75973654551926, + "language_loss": 0.83120143, + "learning_rate": 7.984635801030228e-07, + "loss": 0.90796536, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10284424, + "step": 11875, + "time_per_iteration": 3.8960680961608887 + }, + { + "auxiliary_loss_clip": 0.06414599, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06272531, + "balance_loss_mlp": 0.01256136, + "epoch": 0.7140237486848039, + "flos": 23337826087680.0, + "grad_norm": 1.757783447264505, + "language_loss": 0.69900811, + "learning_rate": 7.981522581568721e-07, + "loss": 0.77583325, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11779785, + "step": 11876, + "time_per_iteration": 2.491225481033325 + }, + { + "auxiliary_loss_clip": 0.06411763, + "auxiliary_loss_mlp": 0.01262915, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01252663, + "epoch": 0.7140838719374718, + "flos": 16842760375680.0, + "grad_norm": 1.8106538192439035, + "language_loss": 0.78886259, + "learning_rate": 7.978409817849079e-07, + "loss": 0.86560941, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10253906, + "step": 11877, + "time_per_iteration": 2.493778705596924 + }, + { + "auxiliary_loss_clip": 0.0640865, + "auxiliary_loss_mlp": 0.01267195, + "balance_loss_clip": 0.06276323, + "balance_loss_mlp": 0.01257611, + "epoch": 0.7141439951901398, + "flos": 21148350900480.0, + "grad_norm": 1.8508532405281077, + "language_loss": 0.70390731, + "learning_rate": 7.97529750998934e-07, + "loss": 0.78066581, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0958252, + "step": 11878, + "time_per_iteration": 3.8979172706604004 + }, + { + "auxiliary_loss_clip": 0.06407811, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06277137, + "balance_loss_mlp": 0.01254153, + "epoch": 0.7142041184428077, + "flos": 24724661414400.0, + "grad_norm": 1.94673596086021, + "language_loss": 0.67341477, + "learning_rate": 7.972185658107535e-07, + "loss": 0.75013375, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09936523, + "step": 11879, + "time_per_iteration": 2.5100598335266113 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01262301, + "balance_loss_clip": 0.06275017, + "balance_loss_mlp": 0.01252037, + "epoch": 0.7142642416954758, + "flos": 21914667216000.0, + "grad_norm": 1.6535111085971643, + "language_loss": 0.69445574, + "learning_rate": 7.969074262321646e-07, + "loss": 0.77118039, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10266113, + "step": 11880, + "time_per_iteration": 2.507603406906128 + }, + { + "auxiliary_loss_clip": 0.0641037, + "auxiliary_loss_mlp": 0.01264833, + "balance_loss_clip": 0.06273447, + "balance_loss_mlp": 0.01254772, + "epoch": 0.7143243649481437, + "flos": 20810579892480.0, + "grad_norm": 2.0343383375931894, + "language_loss": 0.80753726, + "learning_rate": 7.965963322749674e-07, + "loss": 0.88428932, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10058594, + "step": 11881, + "time_per_iteration": 2.4606220722198486 + }, + { + "auxiliary_loss_clip": 0.06409037, + "auxiliary_loss_mlp": 0.01264183, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01254539, + "epoch": 0.7143844882008117, + "flos": 27242348244480.0, + "grad_norm": 1.58430278316452, + "language_loss": 0.64282894, + "learning_rate": 7.962852839509579e-07, + "loss": 0.71956116, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09643555, + "step": 11882, + "time_per_iteration": 2.56210994720459 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253473, + "epoch": 0.7144446114534796, + "flos": 17935150055040.0, + "grad_norm": 1.872999181445386, + "language_loss": 0.69193482, + "learning_rate": 7.959742812719304e-07, + "loss": 0.76868939, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10101318, + "step": 11883, + "time_per_iteration": 2.4767167568206787 + }, + { + "auxiliary_loss_clip": 0.06408374, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06277797, + "balance_loss_mlp": 0.01253761, + "epoch": 0.7145047347061476, + "flos": 20747282532480.0, + "grad_norm": 2.264759730138534, + "language_loss": 0.7842024, + "learning_rate": 7.956633242496788e-07, + "loss": 0.86092412, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10040283, + "step": 11884, + "time_per_iteration": 2.5488386154174805 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255517, + "epoch": 0.7145648579588155, + "flos": 21184967934720.0, + "grad_norm": 5.179157665604164, + "language_loss": 0.74281037, + "learning_rate": 7.953524128959954e-07, + "loss": 0.81962323, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1104126, + "step": 11885, + "time_per_iteration": 2.4918782711029053 + }, + { + "auxiliary_loss_clip": 0.06317447, + "auxiliary_loss_mlp": 0.01252483, + "balance_loss_clip": 0.06261733, + "balance_loss_mlp": 0.0125137, + "epoch": 0.7146249812114835, + "flos": 64805207702400.0, + "grad_norm": 0.9938747796430238, + "language_loss": 0.66419291, + "learning_rate": 7.95041547222669e-07, + "loss": 0.73989218, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01115417, + "step": 11886, + "time_per_iteration": 3.0856966972351074 + }, + { + "auxiliary_loss_clip": 0.06409487, + "auxiliary_loss_mlp": 0.01262772, + "balance_loss_clip": 0.06275956, + "balance_loss_mlp": 0.01253361, + "epoch": 0.7146851044641516, + "flos": 18119744599680.0, + "grad_norm": 1.9726076644282031, + "language_loss": 0.75334477, + "learning_rate": 7.947307272414874e-07, + "loss": 0.8300674, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09411621, + "step": 11887, + "time_per_iteration": 2.457226037979126 + }, + { + "auxiliary_loss_clip": 0.06411713, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06275448, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7147452277168195, + "flos": 19249715635200.0, + "grad_norm": 1.4837579130348453, + "language_loss": 0.71681702, + "learning_rate": 7.944199529642372e-07, + "loss": 0.79358119, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10003662, + "step": 11888, + "time_per_iteration": 2.5040013790130615 + }, + { + "auxiliary_loss_clip": 0.06412415, + "auxiliary_loss_mlp": 0.0126625, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01256266, + "epoch": 0.7148053509694875, + "flos": 23770773734400.0, + "grad_norm": 1.770417967060374, + "language_loss": 0.84754878, + "learning_rate": 7.941092244027041e-07, + "loss": 0.92433536, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09991455, + "step": 11889, + "time_per_iteration": 2.498847246170044 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263505, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125401, + "epoch": 0.7148654742221554, + "flos": 22490770763520.0, + "grad_norm": 1.697229185177074, + "language_loss": 0.75894499, + "learning_rate": 7.937985415686695e-07, + "loss": 0.8356626, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0949707, + "step": 11890, + "time_per_iteration": 2.5205180644989014 + }, + { + "auxiliary_loss_clip": 0.06404347, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01255073, + "epoch": 0.7149255974748234, + "flos": 24685822247040.0, + "grad_norm": 1.9172824039571863, + "language_loss": 0.74212694, + "learning_rate": 7.934879044739147e-07, + "loss": 0.81881773, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09667969, + "step": 11891, + "time_per_iteration": 2.515684127807617 + }, + { + "auxiliary_loss_clip": 0.06409282, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01256963, + "epoch": 0.7149857207274913, + "flos": 18411464989440.0, + "grad_norm": 1.8378637994341889, + "language_loss": 0.68246537, + "learning_rate": 7.931773131302211e-07, + "loss": 0.75922883, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10101318, + "step": 11892, + "time_per_iteration": 2.4761176109313965 + }, + { + "auxiliary_loss_clip": 0.06410619, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06271711, + "balance_loss_mlp": 0.01254813, + "epoch": 0.7150458439801594, + "flos": 24975907482240.0, + "grad_norm": 1.712623401245163, + "language_loss": 0.74044412, + "learning_rate": 7.928667675493632e-07, + "loss": 0.81721264, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11893, + "time_per_iteration": 2.5127475261688232 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01253873, + "epoch": 0.7151059672328273, + "flos": 16696376092800.0, + "grad_norm": 2.7158372012320315, + "language_loss": 0.66545182, + "learning_rate": 7.925562677431185e-07, + "loss": 0.74223733, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11743164, + "step": 11894, + "time_per_iteration": 2.5338070392608643 + }, + { + "auxiliary_loss_clip": 0.06413232, + "auxiliary_loss_mlp": 0.01263618, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.0125364, + "epoch": 0.7151660904854953, + "flos": 27279216840960.0, + "grad_norm": 7.327232790836601, + "language_loss": 0.77995753, + "learning_rate": 7.922458137232613e-07, + "loss": 0.85672593, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 11895, + "time_per_iteration": 2.545539379119873 + }, + { + "auxiliary_loss_clip": 0.06408492, + "auxiliary_loss_mlp": 0.01262254, + "balance_loss_clip": 0.06271514, + "balance_loss_mlp": 0.01251776, + "epoch": 0.7152262137381632, + "flos": 18338063212800.0, + "grad_norm": 2.1720944859755327, + "language_loss": 0.69649661, + "learning_rate": 7.919354055015643e-07, + "loss": 0.77320409, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1048584, + "step": 11896, + "time_per_iteration": 2.5020852088928223 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01270904, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01259203, + "epoch": 0.7152863369908312, + "flos": 21805822362240.0, + "grad_norm": 1.8979241109476415, + "language_loss": 0.8686198, + "learning_rate": 7.91625043089798e-07, + "loss": 0.94543064, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11694336, + "step": 11897, + "time_per_iteration": 2.4981558322906494 + }, + { + "auxiliary_loss_clip": 0.06406087, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06274753, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7153464602434991, + "flos": 22164068494080.0, + "grad_norm": 1.7720635566598981, + "language_loss": 0.78347677, + "learning_rate": 7.913147264997304e-07, + "loss": 0.86017919, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10070801, + "step": 11898, + "time_per_iteration": 2.568208694458008 + }, + { + "auxiliary_loss_clip": 0.06413846, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.0627441, + "balance_loss_mlp": 0.01252868, + "epoch": 0.7154065834961671, + "flos": 24722732770560.0, + "grad_norm": 1.7720575063877593, + "language_loss": 0.73240674, + "learning_rate": 7.910044557431302e-07, + "loss": 0.8091805, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10656738, + "step": 11899, + "time_per_iteration": 3.9873409271240234 + }, + { + "auxiliary_loss_clip": 0.06406702, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06271633, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7154667067488351, + "flos": 22608084879360.0, + "grad_norm": 2.7184837218905216, + "language_loss": 0.75906515, + "learning_rate": 7.906942308317614e-07, + "loss": 0.83579266, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10565186, + "step": 11900, + "time_per_iteration": 2.48612380027771 + }, + { + "auxiliary_loss_clip": 0.06410916, + "auxiliary_loss_mlp": 0.01263744, + "balance_loss_clip": 0.06274971, + "balance_loss_mlp": 0.01254064, + "epoch": 0.7155268300015031, + "flos": 18777216061440.0, + "grad_norm": 1.8830405388899822, + "language_loss": 0.80537415, + "learning_rate": 7.903840517773886e-07, + "loss": 0.88212073, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09680176, + "step": 11901, + "time_per_iteration": 2.538071632385254 + }, + { + "auxiliary_loss_clip": 0.06413621, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01254626, + "epoch": 0.7155869532541711, + "flos": 18302242792320.0, + "grad_norm": 1.8091761354011133, + "language_loss": 0.82077742, + "learning_rate": 7.900739185917744e-07, + "loss": 0.89756829, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10839844, + "step": 11902, + "time_per_iteration": 2.4796504974365234 + }, + { + "auxiliary_loss_clip": 0.06407838, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254306, + "epoch": 0.715647076506839, + "flos": 11985063298560.0, + "grad_norm": 1.8489548968848413, + "language_loss": 0.68603027, + "learning_rate": 7.897638312866785e-07, + "loss": 0.76274538, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09356689, + "step": 11903, + "time_per_iteration": 2.502664566040039 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06273056, + "balance_loss_mlp": 0.0125591, + "epoch": 0.715707199759507, + "flos": 18957408266880.0, + "grad_norm": 1.5823213300778882, + "language_loss": 0.75905824, + "learning_rate": 7.894537898738589e-07, + "loss": 0.83577633, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 11904, + "time_per_iteration": 2.4838523864746094 + }, + { + "auxiliary_loss_clip": 0.06408757, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01255838, + "epoch": 0.7157673230121749, + "flos": 15309792328320.0, + "grad_norm": 1.6671251370747393, + "language_loss": 0.7200684, + "learning_rate": 7.891437943650727e-07, + "loss": 0.79682887, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.11456299, + "step": 11905, + "time_per_iteration": 2.5194296836853027 + }, + { + "auxiliary_loss_clip": 0.06407201, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 0.06273023, + "balance_loss_mlp": 0.01254377, + "epoch": 0.715827446264843, + "flos": 23228561963520.0, + "grad_norm": 1.7268826203228764, + "language_loss": 0.7871933, + "learning_rate": 7.88833844772076e-07, + "loss": 0.86390674, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09765625, + "step": 11906, + "time_per_iteration": 2.505692720413208 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01249409, + "balance_loss_clip": 0.06255978, + "balance_loss_mlp": 0.01248228, + "epoch": 0.7158875695175109, + "flos": 60993011145600.0, + "grad_norm": 0.7186868091888179, + "language_loss": 0.55247056, + "learning_rate": 7.885239411066205e-07, + "loss": 0.62807906, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01179504, + "step": 11907, + "time_per_iteration": 3.077824354171753 + }, + { + "auxiliary_loss_clip": 0.06404838, + "auxiliary_loss_mlp": 0.01262889, + "balance_loss_clip": 0.06269851, + "balance_loss_mlp": 0.01252893, + "epoch": 0.7159476927701789, + "flos": 17134480765440.0, + "grad_norm": 1.7650418564568968, + "language_loss": 0.69603425, + "learning_rate": 7.882140833804593e-07, + "loss": 0.77271152, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09985352, + "step": 11908, + "time_per_iteration": 2.4865145683288574 + }, + { + "auxiliary_loss_clip": 0.06412758, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06276048, + "balance_loss_mlp": 0.01253625, + "epoch": 0.7160078160228468, + "flos": 22496934038400.0, + "grad_norm": 1.9817565541714355, + "language_loss": 0.71485305, + "learning_rate": 7.879042716053415e-07, + "loss": 0.79162526, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1083374, + "step": 11909, + "time_per_iteration": 2.5261456966400146 + }, + { + "auxiliary_loss_clip": 0.06411682, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06275836, + "balance_loss_mlp": 0.01253316, + "epoch": 0.7160679392755148, + "flos": 30598704990720.0, + "grad_norm": 1.38087645688004, + "language_loss": 0.75330472, + "learning_rate": 7.875945057930144e-07, + "loss": 0.83006227, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11910, + "time_per_iteration": 4.044188022613525 + }, + { + "auxiliary_loss_clip": 0.06406509, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7161280625281827, + "flos": 21329884771200.0, + "grad_norm": 1.597685322541952, + "language_loss": 0.76519787, + "learning_rate": 7.872847859552251e-07, + "loss": 0.84192502, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10107422, + "step": 11911, + "time_per_iteration": 2.665767192840576 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01265159, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01254376, + "epoch": 0.7161881857808508, + "flos": 61873218288000.0, + "grad_norm": 1.667698649027388, + "language_loss": 0.58612812, + "learning_rate": 7.869751121037192e-07, + "loss": 0.66287452, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10791016, + "step": 11912, + "time_per_iteration": 2.9163358211517334 + }, + { + "auxiliary_loss_clip": 0.06408441, + "auxiliary_loss_mlp": 0.01264274, + "balance_loss_clip": 0.06275295, + "balance_loss_mlp": 0.0125398, + "epoch": 0.7162483090335187, + "flos": 20818126759680.0, + "grad_norm": 1.9057750004055583, + "language_loss": 0.78541219, + "learning_rate": 7.866654842502376e-07, + "loss": 0.86213928, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10296631, + "step": 11913, + "time_per_iteration": 2.496882438659668 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06273191, + "balance_loss_mlp": 0.01257864, + "epoch": 0.7163084322861867, + "flos": 24104393965440.0, + "grad_norm": 1.590904649851159, + "language_loss": 0.7420674, + "learning_rate": 7.863559024065234e-07, + "loss": 0.81879842, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0958252, + "step": 11914, + "time_per_iteration": 3.96821665763855 + }, + { + "auxiliary_loss_clip": 0.06403452, + "auxiliary_loss_mlp": 0.01261289, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01251574, + "epoch": 0.7163685555388547, + "flos": 20086540761600.0, + "grad_norm": 1.6632734389842445, + "language_loss": 0.74058056, + "learning_rate": 7.860463665843143e-07, + "loss": 0.81722796, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.097229, + "step": 11915, + "time_per_iteration": 2.4962167739868164 + }, + { + "auxiliary_loss_clip": 0.06405881, + "auxiliary_loss_mlp": 0.01264509, + "balance_loss_clip": 0.06270003, + "balance_loss_mlp": 0.01254323, + "epoch": 0.7164286787915226, + "flos": 17462692408320.0, + "grad_norm": 1.6596246771079706, + "language_loss": 0.81293082, + "learning_rate": 7.85736876795349e-07, + "loss": 0.88963467, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10186768, + "step": 11916, + "time_per_iteration": 2.5293524265289307 + }, + { + "auxiliary_loss_clip": 0.06407885, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272584, + "balance_loss_mlp": 0.01257555, + "epoch": 0.7164888020441906, + "flos": 19724982393600.0, + "grad_norm": 1.9910779108762084, + "language_loss": 0.68661398, + "learning_rate": 7.854274330513626e-07, + "loss": 0.76336563, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09729004, + "step": 11917, + "time_per_iteration": 2.5082740783691406 + }, + { + "auxiliary_loss_clip": 0.0640521, + "auxiliary_loss_mlp": 0.01268808, + "balance_loss_clip": 0.06270327, + "balance_loss_mlp": 0.01258127, + "epoch": 0.7165489252968585, + "flos": 21476939886720.0, + "grad_norm": 1.5888688683522953, + "language_loss": 0.76160645, + "learning_rate": 7.851180353640896e-07, + "loss": 0.8383466, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10687256, + "step": 11918, + "time_per_iteration": 3.8991646766662598 + }, + { + "auxiliary_loss_clip": 0.06316125, + "auxiliary_loss_mlp": 0.01260952, + "balance_loss_clip": 0.06260598, + "balance_loss_mlp": 0.01259661, + "epoch": 0.7166090485495266, + "flos": 69949426216320.0, + "grad_norm": 0.6355552708819127, + "language_loss": 0.53723788, + "learning_rate": 7.848086837452639e-07, + "loss": 0.61300862, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01291656, + "step": 11919, + "time_per_iteration": 3.2083816528320312 + }, + { + "auxiliary_loss_clip": 0.06411423, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01255948, + "epoch": 0.7166691718021945, + "flos": 27351151171200.0, + "grad_norm": 2.064464674479712, + "language_loss": 0.69286996, + "learning_rate": 7.844993782066132e-07, + "loss": 0.76964575, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10211182, + "step": 11920, + "time_per_iteration": 2.6113531589508057 + }, + { + "auxiliary_loss_clip": 0.064086, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.01255106, + "epoch": 0.7167292950548625, + "flos": 30416667995520.0, + "grad_norm": 1.8345459175809258, + "language_loss": 0.75019145, + "learning_rate": 7.841901187598678e-07, + "loss": 0.82692945, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10101318, + "step": 11921, + "time_per_iteration": 2.5700902938842773 + }, + { + "auxiliary_loss_clip": 0.06416579, + "auxiliary_loss_mlp": 0.01267308, + "balance_loss_clip": 0.06275436, + "balance_loss_mlp": 0.01254177, + "epoch": 0.7167894183075304, + "flos": 14575942270080.0, + "grad_norm": 1.9367359294583022, + "language_loss": 0.75734651, + "learning_rate": 7.83880905416755e-07, + "loss": 0.83418536, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.13128662, + "step": 11922, + "time_per_iteration": 2.465078830718994 + }, + { + "auxiliary_loss_clip": 0.06313948, + "auxiliary_loss_mlp": 0.0125594, + "balance_loss_clip": 0.06258468, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7168495415601984, + "flos": 64128365948160.0, + "grad_norm": 0.7346387486828846, + "language_loss": 0.55178893, + "learning_rate": 7.83571738189001e-07, + "loss": 0.62748784, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01166534, + "step": 11923, + "time_per_iteration": 2.953462839126587 + }, + { + "auxiliary_loss_clip": 0.06408657, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06272471, + "balance_loss_mlp": 0.01257062, + "epoch": 0.7169096648128663, + "flos": 24688421723520.0, + "grad_norm": 1.4959305525203388, + "language_loss": 0.77240855, + "learning_rate": 7.832626170883279e-07, + "loss": 0.84916997, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10430908, + "step": 11924, + "time_per_iteration": 2.540371894836426 + }, + { + "auxiliary_loss_clip": 0.06404062, + "auxiliary_loss_mlp": 0.01264587, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7169697880655344, + "flos": 20673754974720.0, + "grad_norm": 1.6022064591556118, + "language_loss": 0.68295527, + "learning_rate": 7.829535421264588e-07, + "loss": 0.75964177, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0980835, + "step": 11925, + "time_per_iteration": 2.517883539199829 + }, + { + "auxiliary_loss_clip": 0.06401929, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7170299113182023, + "flos": 21039044849280.0, + "grad_norm": 1.4805989114047955, + "language_loss": 0.77453327, + "learning_rate": 7.826445133151133e-07, + "loss": 0.85119712, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09771729, + "step": 11926, + "time_per_iteration": 2.525294065475464 + }, + { + "auxiliary_loss_clip": 0.06412005, + "auxiliary_loss_mlp": 0.01265458, + "balance_loss_clip": 0.06270812, + "balance_loss_mlp": 0.01254652, + "epoch": 0.7170900345708703, + "flos": 22899931050240.0, + "grad_norm": 2.0777865418109798, + "language_loss": 0.77830517, + "learning_rate": 7.823355306660093e-07, + "loss": 0.85507977, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10809326, + "step": 11927, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06405352, + "auxiliary_loss_mlp": 0.012651, + "balance_loss_clip": 0.06273961, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7171501578235383, + "flos": 15523331258880.0, + "grad_norm": 1.5750787532555974, + "language_loss": 0.69694316, + "learning_rate": 7.820265941908642e-07, + "loss": 0.77364767, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09960938, + "step": 11928, + "time_per_iteration": 2.5053482055664062 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01263642, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7172102810762062, + "flos": 26111496741120.0, + "grad_norm": 1.7658790260288333, + "language_loss": 0.65507495, + "learning_rate": 7.817177039013931e-07, + "loss": 0.73175335, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10076904, + "step": 11929, + "time_per_iteration": 2.5298080444335938 + }, + { + "auxiliary_loss_clip": 0.06411615, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06275426, + "balance_loss_mlp": 0.01254455, + "epoch": 0.7172704043288742, + "flos": 21513011869440.0, + "grad_norm": 1.88648366975717, + "language_loss": 0.70105934, + "learning_rate": 7.81408859809308e-07, + "loss": 0.7778219, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10186768, + "step": 11930, + "time_per_iteration": 2.492851972579956 + }, + { + "auxiliary_loss_clip": 0.06407914, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 0.06271791, + "balance_loss_mlp": 0.01255675, + "epoch": 0.7173305275815421, + "flos": 18776964499200.0, + "grad_norm": 1.6767880793565944, + "language_loss": 0.80551809, + "learning_rate": 7.811000619263219e-07, + "loss": 0.88225758, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10351562, + "step": 11931, + "time_per_iteration": 2.5129940509796143 + }, + { + "auxiliary_loss_clip": 0.06405962, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.06272676, + "balance_loss_mlp": 0.01253398, + "epoch": 0.7173906508342102, + "flos": 16185372768000.0, + "grad_norm": 2.3164344242090245, + "language_loss": 0.78938711, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8660785, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 11932, + "time_per_iteration": 2.458064317703247 + }, + { + "auxiliary_loss_clip": 0.06406456, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01255163, + "epoch": 0.7174507740868781, + "flos": 26620948765440.0, + "grad_norm": 2.941669914403725, + "language_loss": 0.75155187, + "learning_rate": 7.804826048344803e-07, + "loss": 0.82826775, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09960938, + "step": 11933, + "time_per_iteration": 2.5739805698394775 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06277472, + "balance_loss_mlp": 0.01254858, + "epoch": 0.7175108973395461, + "flos": 18437264847360.0, + "grad_norm": 7.531680164120171, + "language_loss": 0.69827807, + "learning_rate": 7.801739456490388e-07, + "loss": 0.77513361, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11859131, + "step": 11934, + "time_per_iteration": 2.4455020427703857 + }, + { + "auxiliary_loss_clip": 0.06406108, + "auxiliary_loss_mlp": 0.01263916, + "balance_loss_clip": 0.06272999, + "balance_loss_mlp": 0.0125395, + "epoch": 0.717571020592214, + "flos": 23921769991680.0, + "grad_norm": 2.2343261949316013, + "language_loss": 0.86673319, + "learning_rate": 7.798653327195237e-07, + "loss": 0.9434334, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09967041, + "step": 11935, + "time_per_iteration": 2.528456211090088 + }, + { + "auxiliary_loss_clip": 0.06406541, + "auxiliary_loss_mlp": 0.0126352, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01253202, + "epoch": 0.717631143844882, + "flos": 38266647828480.0, + "grad_norm": 1.602642316585254, + "language_loss": 0.73995256, + "learning_rate": 7.795567660576388e-07, + "loss": 0.81665319, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10314941, + "step": 11936, + "time_per_iteration": 2.67246413230896 + }, + { + "auxiliary_loss_clip": 0.06313888, + "auxiliary_loss_mlp": 0.01249886, + "balance_loss_clip": 0.06258012, + "balance_loss_mlp": 0.01248772, + "epoch": 0.7176912670975499, + "flos": 65536961408640.0, + "grad_norm": 0.7536478557805156, + "language_loss": 0.55813849, + "learning_rate": 7.79248245675082e-07, + "loss": 0.63377625, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0111618, + "step": 11937, + "time_per_iteration": 3.14385724067688 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01254042, + "epoch": 0.717751390350218, + "flos": 31288433074560.0, + "grad_norm": 3.0696111718968555, + "language_loss": 0.54891688, + "learning_rate": 7.789397715835542e-07, + "loss": 0.62567306, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10900879, + "step": 11938, + "time_per_iteration": 2.612314462661743 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.01261396, + "balance_loss_clip": 0.06274119, + "balance_loss_mlp": 0.01251811, + "epoch": 0.7178115136028859, + "flos": 19864155225600.0, + "grad_norm": 1.5149026364788483, + "language_loss": 0.77031577, + "learning_rate": 7.786313437947527e-07, + "loss": 0.84697324, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09576416, + "step": 11939, + "time_per_iteration": 3.9376840591430664 + }, + { + "auxiliary_loss_clip": 0.06311642, + "auxiliary_loss_mlp": 0.01253055, + "balance_loss_clip": 0.06255894, + "balance_loss_mlp": 0.01251996, + "epoch": 0.7178716368555539, + "flos": 64369576725120.0, + "grad_norm": 0.7379302398056043, + "language_loss": 0.6123156, + "learning_rate": 7.783229623203738e-07, + "loss": 0.68796259, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01060486, + "step": 11940, + "time_per_iteration": 3.106687545776367 + }, + { + "auxiliary_loss_clip": 0.0640372, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.06272845, + "balance_loss_mlp": 0.01253209, + "epoch": 0.7179317601082219, + "flos": 26770184087040.0, + "grad_norm": 1.6027609306181398, + "language_loss": 0.59101206, + "learning_rate": 7.780146271721097e-07, + "loss": 0.66767597, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09466553, + "step": 11941, + "time_per_iteration": 2.6309211254119873 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06273725, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7179918833608898, + "flos": 23520575842560.0, + "grad_norm": 1.7346427869736905, + "language_loss": 0.79611468, + "learning_rate": 7.777063383616543e-07, + "loss": 0.87280202, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09429932, + "step": 11942, + "time_per_iteration": 2.5131733417510986 + }, + { + "auxiliary_loss_clip": 0.06404739, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01258345, + "epoch": 0.7180520066135578, + "flos": 17171349361920.0, + "grad_norm": 2.144705941723289, + "language_loss": 0.66274554, + "learning_rate": 7.773980959006968e-07, + "loss": 0.73948282, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10638428, + "step": 11943, + "time_per_iteration": 2.5236313343048096 + }, + { + "auxiliary_loss_clip": 0.06407227, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06273103, + "balance_loss_mlp": 0.01257798, + "epoch": 0.7181121298662257, + "flos": 17572417729920.0, + "grad_norm": 1.703985250404805, + "language_loss": 0.78651738, + "learning_rate": 7.770898998009254e-07, + "loss": 0.86327153, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 11944, + "time_per_iteration": 2.489701271057129 + }, + { + "auxiliary_loss_clip": 0.06407581, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7181722531188938, + "flos": 11952471260160.0, + "grad_norm": 2.3927781343480024, + "language_loss": 0.62825882, + "learning_rate": 7.767817500740277e-07, + "loss": 0.70501947, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.12243652, + "step": 11945, + "time_per_iteration": 2.523031711578369 + }, + { + "auxiliary_loss_clip": 0.0631476, + "auxiliary_loss_mlp": 0.0125155, + "balance_loss_clip": 0.06259042, + "balance_loss_mlp": 0.01250277, + "epoch": 0.7182323763715617, + "flos": 65522664288000.0, + "grad_norm": 0.6825637115139678, + "language_loss": 0.5092659, + "learning_rate": 7.76473646731689e-07, + "loss": 0.58492899, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01273346, + "step": 11946, + "time_per_iteration": 3.0530238151550293 + }, + { + "auxiliary_loss_clip": 0.06408353, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01254553, + "epoch": 0.7182924996242297, + "flos": 20637137940480.0, + "grad_norm": 1.6252151206202925, + "language_loss": 0.7525813, + "learning_rate": 7.761655897855925e-07, + "loss": 0.8293165, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 11947, + "time_per_iteration": 2.535158157348633 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01266117, + "balance_loss_clip": 0.06270691, + "balance_loss_mlp": 0.01256556, + "epoch": 0.7183526228768976, + "flos": 16221947875200.0, + "grad_norm": 1.376797817491515, + "language_loss": 0.7316047, + "learning_rate": 7.758575792474187e-07, + "loss": 0.80828649, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09564209, + "step": 11948, + "time_per_iteration": 2.465437173843384 + }, + { + "auxiliary_loss_clip": 0.06408493, + "auxiliary_loss_mlp": 0.01270033, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.0125959, + "epoch": 0.7184127461295656, + "flos": 22238518446720.0, + "grad_norm": 1.618352037269111, + "language_loss": 0.71604127, + "learning_rate": 7.755496151288483e-07, + "loss": 0.79282653, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10443115, + "step": 11949, + "time_per_iteration": 2.5727827548980713 + }, + { + "auxiliary_loss_clip": 0.06405893, + "auxiliary_loss_mlp": 0.01265064, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01255659, + "epoch": 0.7184728693822335, + "flos": 27351863930880.0, + "grad_norm": 2.584174612007466, + "language_loss": 0.76537007, + "learning_rate": 7.752416974415598e-07, + "loss": 0.84207964, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09411621, + "step": 11950, + "time_per_iteration": 4.074851751327515 + }, + { + "auxiliary_loss_clip": 0.0641187, + "auxiliary_loss_mlp": 0.01266048, + "balance_loss_clip": 0.06275279, + "balance_loss_mlp": 0.01254968, + "epoch": 0.7185329926349016, + "flos": 16514129462400.0, + "grad_norm": 2.1607831663839163, + "language_loss": 0.67883182, + "learning_rate": 7.749338261972282e-07, + "loss": 0.75561094, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11071777, + "step": 11951, + "time_per_iteration": 2.4646525382995605 + }, + { + "auxiliary_loss_clip": 0.06409188, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254549, + "epoch": 0.7185931158875695, + "flos": 23957800047360.0, + "grad_norm": 1.7824491955160577, + "language_loss": 0.78629339, + "learning_rate": 7.746260014075286e-07, + "loss": 0.86304945, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11865234, + "step": 11952, + "time_per_iteration": 2.516615390777588 + }, + { + "auxiliary_loss_clip": 0.06412063, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06272954, + "balance_loss_mlp": 0.0125725, + "epoch": 0.7186532391402375, + "flos": 26549265997440.0, + "grad_norm": 1.8155741690117748, + "language_loss": 0.74781901, + "learning_rate": 7.743182230841352e-07, + "loss": 0.82462001, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10803223, + "step": 11953, + "time_per_iteration": 2.527876853942871 + }, + { + "auxiliary_loss_clip": 0.06407471, + "auxiliary_loss_mlp": 0.01266403, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01256223, + "epoch": 0.7187133623929055, + "flos": 22389682412160.0, + "grad_norm": 1.6183356638137696, + "language_loss": 0.73045003, + "learning_rate": 7.740104912387164e-07, + "loss": 0.80718875, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10180664, + "step": 11954, + "time_per_iteration": 3.9654276371002197 + }, + { + "auxiliary_loss_clip": 0.06407467, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01258372, + "epoch": 0.7187734856455734, + "flos": 15785184867840.0, + "grad_norm": 1.5034974225164766, + "language_loss": 0.74558902, + "learning_rate": 7.737028058829425e-07, + "loss": 0.82235181, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10455322, + "step": 11955, + "time_per_iteration": 2.478512763977051 + }, + { + "auxiliary_loss_clip": 0.0640816, + "auxiliary_loss_mlp": 0.01262735, + "balance_loss_clip": 0.06272267, + "balance_loss_mlp": 0.01253032, + "epoch": 0.7188336088982414, + "flos": 31767766755840.0, + "grad_norm": 1.8388372007030418, + "language_loss": 0.73576057, + "learning_rate": 7.733951670284817e-07, + "loss": 0.81246948, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 11956, + "time_per_iteration": 2.5664751529693604 + }, + { + "auxiliary_loss_clip": 0.06408941, + "auxiliary_loss_mlp": 0.01266307, + "balance_loss_clip": 0.06270766, + "balance_loss_mlp": 0.01255793, + "epoch": 0.7188937321509093, + "flos": 21470734684800.0, + "grad_norm": 1.7841137783080476, + "language_loss": 0.70991242, + "learning_rate": 7.730875746869987e-07, + "loss": 0.7866649, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1050415, + "step": 11957, + "time_per_iteration": 2.5579633712768555 + }, + { + "auxiliary_loss_clip": 0.0641226, + "auxiliary_loss_mlp": 0.01268285, + "balance_loss_clip": 0.06273985, + "balance_loss_mlp": 0.01256966, + "epoch": 0.7189538554035774, + "flos": 27278839497600.0, + "grad_norm": 1.7957042197859685, + "language_loss": 0.74078369, + "learning_rate": 7.727800288701582e-07, + "loss": 0.81758916, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11322021, + "step": 11958, + "time_per_iteration": 3.9170804023742676 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.0125484, + "epoch": 0.7190139786562453, + "flos": 21587168332800.0, + "grad_norm": 1.5040650051227977, + "language_loss": 0.84225762, + "learning_rate": 7.724725295896215e-07, + "loss": 0.91893852, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09667969, + "step": 11959, + "time_per_iteration": 2.506953239440918 + }, + { + "auxiliary_loss_clip": 0.06412622, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06274716, + "balance_loss_mlp": 0.01253665, + "epoch": 0.7190741019089133, + "flos": 26727990756480.0, + "grad_norm": 1.629776742462507, + "language_loss": 0.82108045, + "learning_rate": 7.7216507685705e-07, + "loss": 0.89784372, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10046387, + "step": 11960, + "time_per_iteration": 2.5172626972198486 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01256051, + "epoch": 0.7191342251615812, + "flos": 26112041792640.0, + "grad_norm": 2.013110188990865, + "language_loss": 0.7794981, + "learning_rate": 7.718576706841013e-07, + "loss": 0.85624301, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10241699, + "step": 11961, + "time_per_iteration": 2.585214853286743 + }, + { + "auxiliary_loss_clip": 0.06404266, + "auxiliary_loss_mlp": 0.01266808, + "balance_loss_clip": 0.06274937, + "balance_loss_mlp": 0.01257164, + "epoch": 0.7191943484142492, + "flos": 22973794024320.0, + "grad_norm": 1.3445368370245, + "language_loss": 0.75350589, + "learning_rate": 7.715503110824326e-07, + "loss": 0.83021665, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09643555, + "step": 11962, + "time_per_iteration": 2.5126750469207764 + }, + { + "auxiliary_loss_clip": 0.06408066, + "auxiliary_loss_mlp": 0.01264043, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7192544716669171, + "flos": 22571970969600.0, + "grad_norm": 1.8990374225745255, + "language_loss": 0.7543835, + "learning_rate": 7.712429980637001e-07, + "loss": 0.83110464, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10614014, + "step": 11963, + "time_per_iteration": 2.531531572341919 + }, + { + "auxiliary_loss_clip": 0.0641598, + "auxiliary_loss_mlp": 0.01268254, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01256888, + "epoch": 0.7193145949195852, + "flos": 18986981558400.0, + "grad_norm": 2.117256305222674, + "language_loss": 0.81201178, + "learning_rate": 7.709357316395564e-07, + "loss": 0.88885415, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11364746, + "step": 11964, + "time_per_iteration": 2.455134630203247 + }, + { + "auxiliary_loss_clip": 0.06404482, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01258854, + "epoch": 0.7193747181722531, + "flos": 18010061205120.0, + "grad_norm": 1.7059884029893508, + "language_loss": 0.75202858, + "learning_rate": 7.70628511821652e-07, + "loss": 0.8287617, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09979248, + "step": 11965, + "time_per_iteration": 2.49127459526062 + }, + { + "auxiliary_loss_clip": 0.06410991, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7194348414249211, + "flos": 24396323990400.0, + "grad_norm": 1.448883188350496, + "language_loss": 0.77801377, + "learning_rate": 7.703213386216377e-07, + "loss": 0.85478151, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1060791, + "step": 11966, + "time_per_iteration": 2.5172245502471924 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01265324, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01254953, + "epoch": 0.7194949646775891, + "flos": 22169938279680.0, + "grad_norm": 1.704579112714729, + "language_loss": 0.73619503, + "learning_rate": 7.700142120511619e-07, + "loss": 0.81289935, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10375977, + "step": 11967, + "time_per_iteration": 2.5002834796905518 + }, + { + "auxiliary_loss_clip": 0.06399344, + "auxiliary_loss_mlp": 0.01265984, + "balance_loss_clip": 0.06271313, + "balance_loss_mlp": 0.01256679, + "epoch": 0.719555087930257, + "flos": 20272560825600.0, + "grad_norm": 1.5295572568049065, + "language_loss": 0.82314783, + "learning_rate": 7.6970713212187e-07, + "loss": 0.89980114, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09307861, + "step": 11968, + "time_per_iteration": 2.5851659774780273 + }, + { + "auxiliary_loss_clip": 0.06403178, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.0627176, + "balance_loss_mlp": 0.01255262, + "epoch": 0.719615211182925, + "flos": 24723026259840.0, + "grad_norm": 1.755748062324177, + "language_loss": 0.76839387, + "learning_rate": 7.69400098845407e-07, + "loss": 0.84507906, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10070801, + "step": 11969, + "time_per_iteration": 2.52701997756958 + }, + { + "auxiliary_loss_clip": 0.06404562, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06269367, + "balance_loss_mlp": 0.01255973, + "epoch": 0.719675334435593, + "flos": 20015570753280.0, + "grad_norm": 1.3860945342705195, + "language_loss": 0.71083385, + "learning_rate": 7.69093112233417e-07, + "loss": 0.78754288, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1036377, + "step": 11970, + "time_per_iteration": 2.4650230407714844 + }, + { + "auxiliary_loss_clip": 0.0631284, + "auxiliary_loss_mlp": 0.01254485, + "balance_loss_clip": 0.06257641, + "balance_loss_mlp": 0.0125341, + "epoch": 0.719735457688261, + "flos": 44215965169920.0, + "grad_norm": 0.888192753215213, + "language_loss": 0.60509741, + "learning_rate": 7.68786172297538e-07, + "loss": 0.68077064, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 11971, + "time_per_iteration": 3.049323558807373 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254541, + "epoch": 0.7197955809409289, + "flos": 16808952453120.0, + "grad_norm": 1.9914531833581635, + "language_loss": 0.79825729, + "learning_rate": 7.684792790494105e-07, + "loss": 0.87503314, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10107422, + "step": 11972, + "time_per_iteration": 2.4930012226104736 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01266584, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7198557041935969, + "flos": 24542330929920.0, + "grad_norm": 1.4491238198032386, + "language_loss": 0.76038206, + "learning_rate": 7.681724325006733e-07, + "loss": 0.83711761, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10095215, + "step": 11973, + "time_per_iteration": 2.548208475112915 + }, + { + "auxiliary_loss_clip": 0.06313819, + "auxiliary_loss_mlp": 0.01251276, + "balance_loss_clip": 0.06258664, + "balance_loss_mlp": 0.01250185, + "epoch": 0.7199158274462648, + "flos": 70729006204800.0, + "grad_norm": 0.8373324972209466, + "language_loss": 0.57018536, + "learning_rate": 7.6786563266296e-07, + "loss": 0.64583629, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01093292, + "step": 11974, + "time_per_iteration": 2.9727988243103027 + }, + { + "auxiliary_loss_clip": 0.06406881, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01256082, + "epoch": 0.7199759506989328, + "flos": 29355151345920.0, + "grad_norm": 2.3495582662204164, + "language_loss": 0.61703098, + "learning_rate": 7.675588795479062e-07, + "loss": 0.69376105, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10058594, + "step": 11975, + "time_per_iteration": 2.5667810440063477 + }, + { + "auxiliary_loss_clip": 0.06407548, + "auxiliary_loss_mlp": 0.01266502, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01256465, + "epoch": 0.7200360739516007, + "flos": 24646689590400.0, + "grad_norm": 1.7506172714592478, + "language_loss": 0.6773572, + "learning_rate": 7.672521731671425e-07, + "loss": 0.7540977, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10040283, + "step": 11976, + "time_per_iteration": 2.5304412841796875 + }, + { + "auxiliary_loss_clip": 0.06406543, + "auxiliary_loss_mlp": 0.01261585, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01252024, + "epoch": 0.7200961972042688, + "flos": 20819007227520.0, + "grad_norm": 1.8109272198274133, + "language_loss": 0.6749649, + "learning_rate": 7.669455135323004e-07, + "loss": 0.75164616, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09564209, + "step": 11977, + "time_per_iteration": 2.547656536102295 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.012691, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01258336, + "epoch": 0.7201563204569367, + "flos": 31253493121920.0, + "grad_norm": 1.5436676151403905, + "language_loss": 0.754664, + "learning_rate": 7.666389006550074e-07, + "loss": 0.83143568, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10766602, + "step": 11978, + "time_per_iteration": 4.067101240158081 + }, + { + "auxiliary_loss_clip": 0.06403241, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 0.06271459, + "balance_loss_mlp": 0.01254327, + "epoch": 0.7202164437096047, + "flos": 26658655902720.0, + "grad_norm": 1.78319056574555, + "language_loss": 0.78890365, + "learning_rate": 7.663323345468908e-07, + "loss": 0.86557764, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09832764, + "step": 11979, + "time_per_iteration": 2.5176994800567627 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255999, + "epoch": 0.7202765669622727, + "flos": 25966999175040.0, + "grad_norm": 1.5387882255892862, + "language_loss": 0.64881861, + "learning_rate": 7.660258152195767e-07, + "loss": 0.72552878, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10235596, + "step": 11980, + "time_per_iteration": 2.5968124866485596 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06272249, + "balance_loss_mlp": 0.01254618, + "epoch": 0.7203366902149406, + "flos": 28519961374080.0, + "grad_norm": 1.8098282466640043, + "language_loss": 0.67242014, + "learning_rate": 7.657193426846871e-07, + "loss": 0.74916333, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10717773, + "step": 11981, + "time_per_iteration": 2.5330793857574463 + }, + { + "auxiliary_loss_clip": 0.0640622, + "auxiliary_loss_mlp": 0.01265599, + "balance_loss_clip": 0.06270846, + "balance_loss_mlp": 0.01255555, + "epoch": 0.7203968134676086, + "flos": 21112446625920.0, + "grad_norm": 1.6958532399278234, + "language_loss": 0.74167675, + "learning_rate": 7.65412916953843e-07, + "loss": 0.81839496, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10040283, + "step": 11982, + "time_per_iteration": 2.510929584503174 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.01256802, + "epoch": 0.7204569367202766, + "flos": 18337937431680.0, + "grad_norm": 1.8860370503158916, + "language_loss": 0.65837574, + "learning_rate": 7.65106538038665e-07, + "loss": 0.73509502, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09387207, + "step": 11983, + "time_per_iteration": 2.4505462646484375 + }, + { + "auxiliary_loss_clip": 0.06406046, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01254445, + "epoch": 0.7205170599729446, + "flos": 23261279783040.0, + "grad_norm": 1.4437514392705604, + "language_loss": 0.66617727, + "learning_rate": 7.648002059507715e-07, + "loss": 0.74288666, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10449219, + "step": 11984, + "time_per_iteration": 2.547555446624756 + }, + { + "auxiliary_loss_clip": 0.06413494, + "auxiliary_loss_mlp": 0.01268675, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01257994, + "epoch": 0.7205771832256125, + "flos": 20127140864640.0, + "grad_norm": 1.765838717363193, + "language_loss": 0.74360126, + "learning_rate": 7.644939207017771e-07, + "loss": 0.82042295, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10687256, + "step": 11985, + "time_per_iteration": 2.4865455627441406 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06272492, + "balance_loss_mlp": 0.01255652, + "epoch": 0.7206373064782805, + "flos": 27709648865280.0, + "grad_norm": 1.7467712742919994, + "language_loss": 0.62577748, + "learning_rate": 7.641876823032977e-07, + "loss": 0.70249057, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0960083, + "step": 11986, + "time_per_iteration": 2.5774106979370117 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06274345, + "balance_loss_mlp": 0.01258951, + "epoch": 0.7206974297309484, + "flos": 17974031149440.0, + "grad_norm": 1.663451860117408, + "language_loss": 0.72484905, + "learning_rate": 7.638814907669455e-07, + "loss": 0.80165857, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11291504, + "step": 11987, + "time_per_iteration": 2.4724771976470947 + }, + { + "auxiliary_loss_clip": 0.06410339, + "auxiliary_loss_mlp": 0.01263822, + "balance_loss_clip": 0.06273559, + "balance_loss_mlp": 0.01253689, + "epoch": 0.7207575529836164, + "flos": 16988893096320.0, + "grad_norm": 2.5242604109279574, + "language_loss": 0.78976148, + "learning_rate": 7.635753461043301e-07, + "loss": 0.86650312, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10125732, + "step": 11988, + "time_per_iteration": 2.495361566543579 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01263556, + "balance_loss_clip": 0.06269506, + "balance_loss_mlp": 0.01253489, + "epoch": 0.7208176762362843, + "flos": 18732465181440.0, + "grad_norm": 1.7087764254113869, + "language_loss": 0.79046804, + "learning_rate": 7.632692483270618e-07, + "loss": 0.86714828, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10064697, + "step": 11989, + "time_per_iteration": 2.5043447017669678 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01267699, + "balance_loss_clip": 0.06270982, + "balance_loss_mlp": 0.01257364, + "epoch": 0.7208777994889524, + "flos": 18740515173120.0, + "grad_norm": 1.790178990562424, + "language_loss": 0.8290503, + "learning_rate": 7.629631974467481e-07, + "loss": 0.90573412, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10345459, + "step": 11990, + "time_per_iteration": 3.926800012588501 + }, + { + "auxiliary_loss_clip": 0.064039, + "auxiliary_loss_mlp": 0.01274305, + "balance_loss_clip": 0.06273188, + "balance_loss_mlp": 0.0126484, + "epoch": 0.7209379227416203, + "flos": 14798705149440.0, + "grad_norm": 2.036094389130557, + "language_loss": 0.7637105, + "learning_rate": 7.626571934749931e-07, + "loss": 0.84049255, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09472656, + "step": 11991, + "time_per_iteration": 2.504420042037964 + }, + { + "auxiliary_loss_clip": 0.06401916, + "auxiliary_loss_mlp": 0.01266823, + "balance_loss_clip": 0.06271645, + "balance_loss_mlp": 0.01256976, + "epoch": 0.7209980459942883, + "flos": 29643559499520.0, + "grad_norm": 1.4029888682461984, + "language_loss": 0.72727466, + "learning_rate": 7.623512364234022e-07, + "loss": 0.80396211, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09844971, + "step": 11992, + "time_per_iteration": 2.5568339824676514 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06273486, + "balance_loss_mlp": 0.01252695, + "epoch": 0.7210581692469563, + "flos": 23483916881280.0, + "grad_norm": 1.4497931031993367, + "language_loss": 0.66405648, + "learning_rate": 7.620453263035755e-07, + "loss": 0.74078965, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 11993, + "time_per_iteration": 2.6186561584472656 + }, + { + "auxiliary_loss_clip": 0.06405848, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06271709, + "balance_loss_mlp": 0.01259695, + "epoch": 0.7211182924996242, + "flos": 26106297788160.0, + "grad_norm": 1.8933872495895026, + "language_loss": 0.6622234, + "learning_rate": 7.61739463127115e-07, + "loss": 0.73897809, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.0993042, + "step": 11994, + "time_per_iteration": 3.895599126815796 + }, + { + "auxiliary_loss_clip": 0.06404895, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01255888, + "epoch": 0.7211784157522922, + "flos": 17717795763840.0, + "grad_norm": 1.9331486787733179, + "language_loss": 0.67162377, + "learning_rate": 7.614336469056172e-07, + "loss": 0.7483362, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11995, + "time_per_iteration": 2.4796035289764404 + }, + { + "auxiliary_loss_clip": 0.06403686, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01254721, + "epoch": 0.7212385390049602, + "flos": 24430173840000.0, + "grad_norm": 1.6348621026253527, + "language_loss": 0.7952925, + "learning_rate": 7.6112787765068e-07, + "loss": 0.87198234, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.10577393, + "step": 11996, + "time_per_iteration": 2.513824939727783 + }, + { + "auxiliary_loss_clip": 0.06409439, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01255056, + "epoch": 0.7212986622576282, + "flos": 28154755353600.0, + "grad_norm": 3.3591238798386285, + "language_loss": 0.81663775, + "learning_rate": 7.60822155373899e-07, + "loss": 0.89338481, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 11997, + "time_per_iteration": 3.9435391426086426 + }, + { + "auxiliary_loss_clip": 0.06409244, + "auxiliary_loss_mlp": 0.01266354, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255363, + "epoch": 0.7213587855102961, + "flos": 21842313615360.0, + "grad_norm": 1.9166262285811178, + "language_loss": 0.67322028, + "learning_rate": 7.605164800868646e-07, + "loss": 0.74997622, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10992432, + "step": 11998, + "time_per_iteration": 2.496742010116577 + }, + { + "auxiliary_loss_clip": 0.06405417, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06271802, + "balance_loss_mlp": 0.01256777, + "epoch": 0.7214189087629641, + "flos": 14616877789440.0, + "grad_norm": 1.7752534320688365, + "language_loss": 0.72513527, + "learning_rate": 7.602108518011696e-07, + "loss": 0.80184972, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 11999, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.0640653, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06272668, + "balance_loss_mlp": 0.01256158, + "epoch": 0.721479032015632, + "flos": 19396938458880.0, + "grad_norm": 2.0883117148535937, + "language_loss": 0.83569586, + "learning_rate": 7.599052705284039e-07, + "loss": 0.91242623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10351562, + "step": 12000, + "time_per_iteration": 2.4941916465759277 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01262001, + "balance_loss_clip": 0.06275104, + "balance_loss_mlp": 0.01251826, + "epoch": 0.7215391552683, + "flos": 18518423126400.0, + "grad_norm": 1.7464338798301249, + "language_loss": 0.77261817, + "learning_rate": 7.59599736280154e-07, + "loss": 0.8493349, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10174561, + "step": 12001, + "time_per_iteration": 2.4661076068878174 + }, + { + "auxiliary_loss_clip": 0.0640439, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01258323, + "epoch": 0.721599278520968, + "flos": 23265514414080.0, + "grad_norm": 2.52401774728115, + "language_loss": 0.81887865, + "learning_rate": 7.592942490680066e-07, + "loss": 0.89560032, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09454346, + "step": 12002, + "time_per_iteration": 2.5698509216308594 + }, + { + "auxiliary_loss_clip": 0.06409481, + "auxiliary_loss_mlp": 0.01264806, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01254363, + "epoch": 0.721659401773636, + "flos": 39207831615360.0, + "grad_norm": 2.1337554314771117, + "language_loss": 0.62387294, + "learning_rate": 7.589888089035462e-07, + "loss": 0.70061582, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 12003, + "time_per_iteration": 2.646667003631592 + }, + { + "auxiliary_loss_clip": 0.06408672, + "auxiliary_loss_mlp": 0.01269946, + "balance_loss_clip": 0.06271918, + "balance_loss_mlp": 0.01258639, + "epoch": 0.7217195250263039, + "flos": 14945299067520.0, + "grad_norm": 3.165928110898167, + "language_loss": 0.69158828, + "learning_rate": 7.586834157983544e-07, + "loss": 0.76837444, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11297607, + "step": 12004, + "time_per_iteration": 2.4904415607452393 + }, + { + "auxiliary_loss_clip": 0.06301466, + "auxiliary_loss_mlp": 0.0124999, + "balance_loss_clip": 0.06246269, + "balance_loss_mlp": 0.01249087, + "epoch": 0.7217796482789719, + "flos": 70889477973120.0, + "grad_norm": 0.8473059140767815, + "language_loss": 0.54124975, + "learning_rate": 7.583780697640112e-07, + "loss": 0.61676431, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00901794, + "step": 12005, + "time_per_iteration": 3.085909366607666 + }, + { + "auxiliary_loss_clip": 0.06406818, + "auxiliary_loss_mlp": 0.0126308, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7218397715316398, + "flos": 37460653804800.0, + "grad_norm": 1.5183383178903638, + "language_loss": 0.63201904, + "learning_rate": 7.580727708120962e-07, + "loss": 0.708718, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09588623, + "step": 12006, + "time_per_iteration": 2.7121994495391846 + }, + { + "auxiliary_loss_clip": 0.06407326, + "auxiliary_loss_mlp": 0.01263158, + "balance_loss_clip": 0.0627062, + "balance_loss_mlp": 0.0125352, + "epoch": 0.7218998947843078, + "flos": 22717223222400.0, + "grad_norm": 1.5926677831370504, + "language_loss": 0.92170072, + "learning_rate": 7.577675189541865e-07, + "loss": 0.99840552, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09643555, + "step": 12007, + "time_per_iteration": 2.534914016723633 + }, + { + "auxiliary_loss_clip": 0.06408784, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06272783, + "balance_loss_mlp": 0.01255191, + "epoch": 0.7219600180369758, + "flos": 12172131538560.0, + "grad_norm": 1.6024431968555108, + "language_loss": 0.63807905, + "learning_rate": 7.574623142018568e-07, + "loss": 0.71482843, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10961914, + "step": 12008, + "time_per_iteration": 2.5015389919281006 + }, + { + "auxiliary_loss_clip": 0.0641045, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256144, + "epoch": 0.7220201412896438, + "flos": 22602340874880.0, + "grad_norm": 1.927754748237573, + "language_loss": 0.79281247, + "learning_rate": 7.57157156566681e-07, + "loss": 0.86958218, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1038208, + "step": 12009, + "time_per_iteration": 2.5008604526519775 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269218, + "balance_loss_mlp": 0.01255533, + "epoch": 0.7220802645423118, + "flos": 26724972009600.0, + "grad_norm": 2.605024867459915, + "language_loss": 0.6418041, + "learning_rate": 7.568520460602297e-07, + "loss": 0.71854436, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11450195, + "step": 12010, + "time_per_iteration": 2.527949571609497 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01266927, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01256854, + "epoch": 0.7221403877949797, + "flos": 24426568114560.0, + "grad_norm": 1.594533265957021, + "language_loss": 0.77320325, + "learning_rate": 7.565469826940742e-07, + "loss": 0.84991425, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10070801, + "step": 12011, + "time_per_iteration": 2.5198636054992676 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263324, + "balance_loss_clip": 0.06273598, + "balance_loss_mlp": 0.0125368, + "epoch": 0.7222005110476477, + "flos": 23521246675200.0, + "grad_norm": 1.6737582547209497, + "language_loss": 0.79734701, + "learning_rate": 7.56241966479781e-07, + "loss": 0.87406272, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09637451, + "step": 12012, + "time_per_iteration": 2.5218822956085205 + }, + { + "auxiliary_loss_clip": 0.06409319, + "auxiliary_loss_mlp": 0.01264498, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254955, + "epoch": 0.7222606343003156, + "flos": 23119255912320.0, + "grad_norm": 2.6909809043391744, + "language_loss": 0.76237571, + "learning_rate": 7.559369974289171e-07, + "loss": 0.83911389, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09533691, + "step": 12013, + "time_per_iteration": 2.501549005508423 + }, + { + "auxiliary_loss_clip": 0.06401782, + "auxiliary_loss_mlp": 0.01266309, + "balance_loss_clip": 0.06270641, + "balance_loss_mlp": 0.01256456, + "epoch": 0.7223207575529836, + "flos": 24357778312320.0, + "grad_norm": 1.4242237370924462, + "language_loss": 0.76199239, + "learning_rate": 7.556320755530484e-07, + "loss": 0.83867329, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09851074, + "step": 12014, + "time_per_iteration": 2.6219167709350586 + }, + { + "auxiliary_loss_clip": 0.0640952, + "auxiliary_loss_mlp": 0.01262375, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01252445, + "epoch": 0.7223808808056515, + "flos": 28337798597760.0, + "grad_norm": 1.6715764427822655, + "language_loss": 0.86861187, + "learning_rate": 7.553272008637346e-07, + "loss": 0.9453308, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09924316, + "step": 12015, + "time_per_iteration": 2.5629379749298096 + }, + { + "auxiliary_loss_clip": 0.0640379, + "auxiliary_loss_mlp": 0.01267259, + "balance_loss_clip": 0.06271358, + "balance_loss_mlp": 0.01257365, + "epoch": 0.7224410040583196, + "flos": 21075829591680.0, + "grad_norm": 2.031854447065517, + "language_loss": 0.78420502, + "learning_rate": 7.55022373372538e-07, + "loss": 0.86091554, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09899902, + "step": 12016, + "time_per_iteration": 2.549696207046509 + }, + { + "auxiliary_loss_clip": 0.06403818, + "auxiliary_loss_mlp": 0.01265816, + "balance_loss_clip": 0.06270836, + "balance_loss_mlp": 0.01255839, + "epoch": 0.7225011273109875, + "flos": 26802398782080.0, + "grad_norm": 1.3727875388559247, + "language_loss": 0.77603066, + "learning_rate": 7.547175930910186e-07, + "loss": 0.85272694, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09979248, + "step": 12017, + "time_per_iteration": 2.5937881469726562 + }, + { + "auxiliary_loss_clip": 0.06402834, + "auxiliary_loss_mlp": 0.01265872, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7225612505636555, + "flos": 23589826842240.0, + "grad_norm": 1.6197156862149726, + "language_loss": 0.74198735, + "learning_rate": 7.54412860030732e-07, + "loss": 0.81867433, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09783936, + "step": 12018, + "time_per_iteration": 3.996819257736206 + }, + { + "auxiliary_loss_clip": 0.06402058, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01260812, + "epoch": 0.7226213738163234, + "flos": 20783983420800.0, + "grad_norm": 1.7233802894536456, + "language_loss": 0.77552009, + "learning_rate": 7.541081742032347e-07, + "loss": 0.85224223, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09344482, + "step": 12019, + "time_per_iteration": 2.52474308013916 + }, + { + "auxiliary_loss_clip": 0.0640556, + "auxiliary_loss_mlp": 0.01263394, + "balance_loss_clip": 0.06272571, + "balance_loss_mlp": 0.01253363, + "epoch": 0.7226814970689914, + "flos": 32644227663360.0, + "grad_norm": 1.6248881332172511, + "language_loss": 0.73835564, + "learning_rate": 7.53803535620081e-07, + "loss": 0.81504518, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10028076, + "step": 12020, + "time_per_iteration": 2.577397346496582 + }, + { + "auxiliary_loss_clip": 0.06409635, + "auxiliary_loss_mlp": 0.01262192, + "balance_loss_clip": 0.06272969, + "balance_loss_mlp": 0.01252054, + "epoch": 0.7227416203216595, + "flos": 22460736274560.0, + "grad_norm": 1.6075634360932833, + "language_loss": 0.77574962, + "learning_rate": 7.534989442928219e-07, + "loss": 0.85246789, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10137939, + "step": 12021, + "time_per_iteration": 2.530141592025757 + }, + { + "auxiliary_loss_clip": 0.06403421, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.01255267, + "epoch": 0.7228017435743274, + "flos": 21658641465600.0, + "grad_norm": 1.5420069016517286, + "language_loss": 0.68414694, + "learning_rate": 7.531944002330073e-07, + "loss": 0.76083142, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09765625, + "step": 12022, + "time_per_iteration": 2.504757881164551 + }, + { + "auxiliary_loss_clip": 0.06407183, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.0125613, + "epoch": 0.7228618668269954, + "flos": 29541171409920.0, + "grad_norm": 1.8382982507035688, + "language_loss": 0.69865435, + "learning_rate": 7.528899034521858e-07, + "loss": 0.77538919, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12023, + "time_per_iteration": 2.572157859802246 + }, + { + "auxiliary_loss_clip": 0.06405231, + "auxiliary_loss_mlp": 0.01262251, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01252356, + "epoch": 0.7229219900796633, + "flos": 27461169982080.0, + "grad_norm": 1.6264829845814306, + "language_loss": 0.71353316, + "learning_rate": 7.525854539619052e-07, + "loss": 0.79020798, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09887695, + "step": 12024, + "time_per_iteration": 2.548758029937744 + }, + { + "auxiliary_loss_clip": 0.06407243, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7229821133323313, + "flos": 16294888454400.0, + "grad_norm": 2.8784491415688427, + "language_loss": 0.75972795, + "learning_rate": 7.522810517737089e-07, + "loss": 0.83645153, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10168457, + "step": 12025, + "time_per_iteration": 2.4729340076446533 + }, + { + "auxiliary_loss_clip": 0.06403269, + "auxiliary_loss_mlp": 0.01264783, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01255049, + "epoch": 0.7230422365849992, + "flos": 20418567765120.0, + "grad_norm": 1.900331951753324, + "language_loss": 0.76300782, + "learning_rate": 7.519766968991395e-07, + "loss": 0.83968836, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09741211, + "step": 12026, + "time_per_iteration": 2.4887609481811523 + }, + { + "auxiliary_loss_clip": 0.06407255, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.01253114, + "epoch": 0.7231023598376672, + "flos": 25600619197440.0, + "grad_norm": 1.727853118389861, + "language_loss": 0.67822838, + "learning_rate": 7.516723893497388e-07, + "loss": 0.75493264, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 12027, + "time_per_iteration": 2.5328831672668457 + }, + { + "auxiliary_loss_clip": 0.06409849, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06273012, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7231624830903352, + "flos": 25155638490240.0, + "grad_norm": 20.233836516227683, + "language_loss": 0.79796958, + "learning_rate": 7.513681291370469e-07, + "loss": 0.87474453, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11260986, + "step": 12028, + "time_per_iteration": 2.5175299644470215 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01262813, + "balance_loss_clip": 0.06271036, + "balance_loss_mlp": 0.01252722, + "epoch": 0.7232226063430032, + "flos": 21732169023360.0, + "grad_norm": 1.6712799697819898, + "language_loss": 0.8266964, + "learning_rate": 7.510639162726e-07, + "loss": 0.90339005, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10089111, + "step": 12029, + "time_per_iteration": 3.9506967067718506 + }, + { + "auxiliary_loss_clip": 0.06311534, + "auxiliary_loss_mlp": 0.01251495, + "balance_loss_clip": 0.06256342, + "balance_loss_mlp": 0.01250514, + "epoch": 0.7232827295956711, + "flos": 68458693426560.0, + "grad_norm": 0.7790969864555375, + "language_loss": 0.6171549, + "learning_rate": 7.507597507679347e-07, + "loss": 0.6927852, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00980377, + "step": 12030, + "time_per_iteration": 3.187685489654541 + }, + { + "auxiliary_loss_clip": 0.06405394, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01255753, + "epoch": 0.7233428528483391, + "flos": 20198697851520.0, + "grad_norm": 1.6342080054038326, + "language_loss": 0.78514922, + "learning_rate": 7.504556326345859e-07, + "loss": 0.86186063, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09997559, + "step": 12031, + "time_per_iteration": 2.47151255607605 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254465, + "epoch": 0.723402976101007, + "flos": 23955955257600.0, + "grad_norm": 1.8287937473952962, + "language_loss": 0.81728959, + "learning_rate": 7.501515618840834e-07, + "loss": 0.894054, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10656738, + "step": 12032, + "time_per_iteration": 2.5481441020965576 + }, + { + "auxiliary_loss_clip": 0.06416769, + "auxiliary_loss_mlp": 0.01265155, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.01254485, + "epoch": 0.723463099353675, + "flos": 20819636133120.0, + "grad_norm": 1.8204115009796795, + "language_loss": 0.75397038, + "learning_rate": 7.498475385279592e-07, + "loss": 0.83078963, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10662842, + "step": 12033, + "time_per_iteration": 3.957021951675415 + }, + { + "auxiliary_loss_clip": 0.0640196, + "auxiliary_loss_mlp": 0.01261304, + "balance_loss_clip": 0.06271483, + "balance_loss_mlp": 0.01251874, + "epoch": 0.723523222606343, + "flos": 19103876403840.0, + "grad_norm": 1.563188843970664, + "language_loss": 0.75271815, + "learning_rate": 7.495435625777423e-07, + "loss": 0.82935083, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09423828, + "step": 12034, + "time_per_iteration": 2.479860782623291 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01252146, + "epoch": 0.723583345859011, + "flos": 26514493752960.0, + "grad_norm": 1.7350921748415202, + "language_loss": 0.80701005, + "learning_rate": 7.492396340449578e-07, + "loss": 0.88370025, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09680176, + "step": 12035, + "time_per_iteration": 2.559680700302124 + }, + { + "auxiliary_loss_clip": 0.06410785, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06273998, + "balance_loss_mlp": 0.01253361, + "epoch": 0.723643469111679, + "flos": 16039323901440.0, + "grad_norm": 3.114522084917199, + "language_loss": 0.61466223, + "learning_rate": 7.489357529411326e-07, + "loss": 0.69140834, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 12036, + "time_per_iteration": 2.4680371284484863 + }, + { + "auxiliary_loss_clip": 0.06403697, + "auxiliary_loss_mlp": 0.01264009, + "balance_loss_clip": 0.06272744, + "balance_loss_mlp": 0.01254914, + "epoch": 0.7237035923643469, + "flos": 21952164718080.0, + "grad_norm": 1.4930749372643133, + "language_loss": 0.67717707, + "learning_rate": 7.486319192777883e-07, + "loss": 0.75385416, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09094238, + "step": 12037, + "time_per_iteration": 3.957728862762451 + }, + { + "auxiliary_loss_clip": 0.06406017, + "auxiliary_loss_mlp": 0.01265379, + "balance_loss_clip": 0.06273565, + "balance_loss_mlp": 0.01255091, + "epoch": 0.7237637156170149, + "flos": 23589281790720.0, + "grad_norm": 1.7134802369768287, + "language_loss": 0.73071694, + "learning_rate": 7.483281330664479e-07, + "loss": 0.80743086, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10296631, + "step": 12038, + "time_per_iteration": 2.5239899158477783 + }, + { + "auxiliary_loss_clip": 0.06408326, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.0125625, + "epoch": 0.7238238388696828, + "flos": 20600940176640.0, + "grad_norm": 1.583420390669157, + "language_loss": 0.72335035, + "learning_rate": 7.480243943186293e-07, + "loss": 0.80011058, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.11437988, + "step": 12039, + "time_per_iteration": 2.5016210079193115 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01262586, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.0125346, + "epoch": 0.7238839621223508, + "flos": 24213909651840.0, + "grad_norm": 1.553952761498081, + "language_loss": 0.7617048, + "learning_rate": 7.477207030458513e-07, + "loss": 0.83841777, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09130859, + "step": 12040, + "time_per_iteration": 2.4979355335235596 + }, + { + "auxiliary_loss_clip": 0.0640977, + "auxiliary_loss_mlp": 0.01263735, + "balance_loss_clip": 0.06273755, + "balance_loss_mlp": 0.01252898, + "epoch": 0.7239440853750188, + "flos": 14214928953600.0, + "grad_norm": 1.6058378864892022, + "language_loss": 0.77005613, + "learning_rate": 7.474170592596301e-07, + "loss": 0.84679121, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10845947, + "step": 12041, + "time_per_iteration": 2.519228458404541 + }, + { + "auxiliary_loss_clip": 0.06408431, + "auxiliary_loss_mlp": 0.01263027, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01253365, + "epoch": 0.7240042086276868, + "flos": 21620976255360.0, + "grad_norm": 1.9889626365674344, + "language_loss": 0.63348103, + "learning_rate": 7.471134629714797e-07, + "loss": 0.7101956, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09667969, + "step": 12042, + "time_per_iteration": 2.475182294845581 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01268078, + "balance_loss_clip": 0.06275245, + "balance_loss_mlp": 0.012567, + "epoch": 0.7240643318803547, + "flos": 23338203431040.0, + "grad_norm": 1.8474585554645233, + "language_loss": 0.83173352, + "learning_rate": 7.468099141929116e-07, + "loss": 0.90852207, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11376953, + "step": 12043, + "time_per_iteration": 2.5139901638031006 + }, + { + "auxiliary_loss_clip": 0.06409861, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06273165, + "balance_loss_mlp": 0.01256354, + "epoch": 0.7241244551330227, + "flos": 24031746875520.0, + "grad_norm": 2.293056245042729, + "language_loss": 0.64671153, + "learning_rate": 7.465064129354379e-07, + "loss": 0.72348469, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11102295, + "step": 12044, + "time_per_iteration": 2.499971866607666 + }, + { + "auxiliary_loss_clip": 0.06411785, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01254781, + "epoch": 0.7241845783856906, + "flos": 18735651636480.0, + "grad_norm": 1.9189721390747507, + "language_loss": 0.81796312, + "learning_rate": 7.462029592105658e-07, + "loss": 0.89473093, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10211182, + "step": 12045, + "time_per_iteration": 2.4791791439056396 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06274088, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7242447016383586, + "flos": 19504483574400.0, + "grad_norm": 2.888520203836974, + "language_loss": 0.72249848, + "learning_rate": 7.458995530298034e-07, + "loss": 0.79920763, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.1050415, + "step": 12046, + "time_per_iteration": 2.4642648696899414 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06273885, + "balance_loss_mlp": 0.01254396, + "epoch": 0.7243048248910267, + "flos": 22169980206720.0, + "grad_norm": 1.724287594820583, + "language_loss": 0.71379775, + "learning_rate": 7.455961944046553e-07, + "loss": 0.79054451, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10571289, + "step": 12047, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.06410667, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01253673, + "epoch": 0.7243649481436946, + "flos": 27680159427840.0, + "grad_norm": 1.6409687158316038, + "language_loss": 0.70148283, + "learning_rate": 7.45292883346627e-07, + "loss": 0.77823687, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11065674, + "step": 12048, + "time_per_iteration": 2.537400007247925 + }, + { + "auxiliary_loss_clip": 0.06309511, + "auxiliary_loss_mlp": 0.01254196, + "balance_loss_clip": 0.06254156, + "balance_loss_mlp": 0.01253124, + "epoch": 0.7244250713963626, + "flos": 63263686538880.0, + "grad_norm": 0.8079275009265211, + "language_loss": 0.53702354, + "learning_rate": 7.449896198672168e-07, + "loss": 0.61266059, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01072693, + "step": 12049, + "time_per_iteration": 3.117490768432617 + }, + { + "auxiliary_loss_clip": 0.06415777, + "auxiliary_loss_mlp": 0.01264713, + "balance_loss_clip": 0.06273454, + "balance_loss_mlp": 0.01252971, + "epoch": 0.7244851946490305, + "flos": 17972815265280.0, + "grad_norm": 2.160877059772018, + "language_loss": 0.60396636, + "learning_rate": 7.446864039779258e-07, + "loss": 0.68077123, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11737061, + "step": 12050, + "time_per_iteration": 2.4579668045043945 + }, + { + "auxiliary_loss_clip": 0.06310994, + "auxiliary_loss_mlp": 0.01250921, + "balance_loss_clip": 0.06255537, + "balance_loss_mlp": 0.01249847, + "epoch": 0.7245453179016985, + "flos": 70964179488000.0, + "grad_norm": 0.6964887094333322, + "language_loss": 0.53128082, + "learning_rate": 7.443832356902528e-07, + "loss": 0.60689998, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01075745, + "step": 12051, + "time_per_iteration": 3.1524975299835205 + }, + { + "auxiliary_loss_clip": 0.06405707, + "auxiliary_loss_mlp": 0.01263012, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253594, + "epoch": 0.7246054411543664, + "flos": 24574839114240.0, + "grad_norm": 1.4328858557340107, + "language_loss": 0.71919692, + "learning_rate": 7.440801150156927e-07, + "loss": 0.79588413, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09417725, + "step": 12052, + "time_per_iteration": 2.599375009536743 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275947, + "balance_loss_mlp": 0.01255608, + "epoch": 0.7246655644070344, + "flos": 32345715093120.0, + "grad_norm": 1.7264545008228058, + "language_loss": 0.74337375, + "learning_rate": 7.437770419657415e-07, + "loss": 0.8201319, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10430908, + "step": 12053, + "time_per_iteration": 2.572556495666504 + }, + { + "auxiliary_loss_clip": 0.06411305, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06278073, + "balance_loss_mlp": 0.01254952, + "epoch": 0.7247256876597024, + "flos": 21879056430720.0, + "grad_norm": 2.130811806275834, + "language_loss": 0.78439468, + "learning_rate": 7.434740165518898e-07, + "loss": 0.86116385, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10650635, + "step": 12054, + "time_per_iteration": 2.594451427459717 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06276123, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7247858109123704, + "flos": 16218048660480.0, + "grad_norm": 2.4211075094396692, + "language_loss": 0.68897808, + "learning_rate": 7.431710387856301e-07, + "loss": 0.76571441, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10406494, + "step": 12055, + "time_per_iteration": 2.490989923477173 + }, + { + "auxiliary_loss_clip": 0.06406957, + "auxiliary_loss_mlp": 0.01264855, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7248459341650383, + "flos": 20857091708160.0, + "grad_norm": 1.6323335153205245, + "language_loss": 0.74211532, + "learning_rate": 7.428681086784496e-07, + "loss": 0.81883347, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09387207, + "step": 12056, + "time_per_iteration": 2.5162346363067627 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01261212, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.0125152, + "epoch": 0.7249060574177063, + "flos": 25928956621440.0, + "grad_norm": 1.8158169987002448, + "language_loss": 0.70777828, + "learning_rate": 7.425652262418368e-07, + "loss": 0.78444564, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09680176, + "step": 12057, + "time_per_iteration": 4.079265594482422 + }, + { + "auxiliary_loss_clip": 0.0641495, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258704, + "epoch": 0.7249661806703742, + "flos": 17350912661760.0, + "grad_norm": 1.9388728601507708, + "language_loss": 0.62604892, + "learning_rate": 7.42262391487277e-07, + "loss": 0.70289254, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1071167, + "step": 12058, + "time_per_iteration": 2.567502737045288 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01264257, + "balance_loss_clip": 0.06279195, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7250263039230422, + "flos": 19580400973440.0, + "grad_norm": 1.9516605705856642, + "language_loss": 0.75217509, + "learning_rate": 7.419596044262535e-07, + "loss": 0.82894444, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10290527, + "step": 12059, + "time_per_iteration": 2.4943277835845947 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01262509, + "balance_loss_clip": 0.06274289, + "balance_loss_mlp": 0.01253282, + "epoch": 0.7250864271757103, + "flos": 21982366915200.0, + "grad_norm": 1.7883051719653056, + "language_loss": 0.79778695, + "learning_rate": 7.416568650702472e-07, + "loss": 0.87446392, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09222412, + "step": 12060, + "time_per_iteration": 2.519117593765259 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266886, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01256449, + "epoch": 0.7251465504283782, + "flos": 25020113310720.0, + "grad_norm": 1.8093299142299697, + "language_loss": 0.76421869, + "learning_rate": 7.413541734307393e-07, + "loss": 0.84101641, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10443115, + "step": 12061, + "time_per_iteration": 2.5503969192504883 + }, + { + "auxiliary_loss_clip": 0.06405508, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06275885, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7252066736810462, + "flos": 16695621406080.0, + "grad_norm": 1.6247315463998022, + "language_loss": 0.81481957, + "learning_rate": 7.410515295192068e-07, + "loss": 0.89151287, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10083008, + "step": 12062, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.06418011, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.06279325, + "balance_loss_mlp": 0.01255066, + "epoch": 0.7252667969337141, + "flos": 25710176810880.0, + "grad_norm": 2.2019312286273705, + "language_loss": 0.69337016, + "learning_rate": 7.407489333471262e-07, + "loss": 0.77020884, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10778809, + "step": 12063, + "time_per_iteration": 2.5213000774383545 + }, + { + "auxiliary_loss_clip": 0.06404665, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06275742, + "balance_loss_mlp": 0.01255186, + "epoch": 0.7253269201863821, + "flos": 18265835393280.0, + "grad_norm": 1.3337230483147808, + "language_loss": 0.70080262, + "learning_rate": 7.40446384925973e-07, + "loss": 0.77749866, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09759521, + "step": 12064, + "time_per_iteration": 2.4883687496185303 + }, + { + "auxiliary_loss_clip": 0.06412718, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01253846, + "epoch": 0.72538704343905, + "flos": 20417938859520.0, + "grad_norm": 1.6031100014197759, + "language_loss": 0.90715456, + "learning_rate": 7.401438842672192e-07, + "loss": 0.98392093, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10076904, + "step": 12065, + "time_per_iteration": 2.6608688831329346 + }, + { + "auxiliary_loss_clip": 0.06315897, + "auxiliary_loss_mlp": 0.01252262, + "balance_loss_clip": 0.0626056, + "balance_loss_mlp": 0.01251238, + "epoch": 0.725447166691718, + "flos": 70173321125760.0, + "grad_norm": 0.6440962314349006, + "language_loss": 0.56150329, + "learning_rate": 7.398414313823349e-07, + "loss": 0.63718486, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12066, + "time_per_iteration": 3.253070592880249 + }, + { + "auxiliary_loss_clip": 0.064081, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01257029, + "epoch": 0.725507289944386, + "flos": 27059598489600.0, + "grad_norm": 1.6969511416209166, + "language_loss": 0.76925343, + "learning_rate": 7.395390262827897e-07, + "loss": 0.84600002, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09533691, + "step": 12067, + "time_per_iteration": 2.553955554962158 + }, + { + "auxiliary_loss_clip": 0.0632008, + "auxiliary_loss_mlp": 0.01251739, + "balance_loss_clip": 0.06264634, + "balance_loss_mlp": 0.01250711, + "epoch": 0.725567413197054, + "flos": 62941973587200.0, + "grad_norm": 0.7126407397816765, + "language_loss": 0.56957459, + "learning_rate": 7.392366689800515e-07, + "loss": 0.64529276, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01028442, + "step": 12068, + "time_per_iteration": 3.020040512084961 + }, + { + "auxiliary_loss_clip": 0.06320577, + "auxiliary_loss_mlp": 0.01251119, + "balance_loss_clip": 0.0626526, + "balance_loss_mlp": 0.01250047, + "epoch": 0.7256275364497219, + "flos": 60315735392640.0, + "grad_norm": 0.6491964300681237, + "language_loss": 0.55317146, + "learning_rate": 7.389343594855848e-07, + "loss": 0.62888843, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01074219, + "step": 12069, + "time_per_iteration": 4.627661228179932 + }, + { + "auxiliary_loss_clip": 0.0640723, + "auxiliary_loss_mlp": 0.01261481, + "balance_loss_clip": 0.06277817, + "balance_loss_mlp": 0.01252726, + "epoch": 0.7256876597023899, + "flos": 24505378479360.0, + "grad_norm": 2.803632714871867, + "language_loss": 0.80079329, + "learning_rate": 7.38632097810854e-07, + "loss": 0.87748045, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08770752, + "step": 12070, + "time_per_iteration": 2.5643179416656494 + }, + { + "auxiliary_loss_clip": 0.06405459, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06277329, + "balance_loss_mlp": 0.01252867, + "epoch": 0.7257477829550578, + "flos": 24359623102080.0, + "grad_norm": 1.9027271039299547, + "language_loss": 0.72591138, + "learning_rate": 7.383298839673197e-07, + "loss": 0.80259442, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09979248, + "step": 12071, + "time_per_iteration": 2.527245283126831 + }, + { + "auxiliary_loss_clip": 0.06408995, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01258379, + "epoch": 0.7258079062077258, + "flos": 17208008323200.0, + "grad_norm": 1.784714322475179, + "language_loss": 0.70686817, + "learning_rate": 7.380277179664436e-07, + "loss": 0.78364313, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10113525, + "step": 12072, + "time_per_iteration": 3.9422738552093506 + }, + { + "auxiliary_loss_clip": 0.06411255, + "auxiliary_loss_mlp": 0.01265945, + "balance_loss_clip": 0.06273982, + "balance_loss_mlp": 0.01255264, + "epoch": 0.7258680294603939, + "flos": 21586832916480.0, + "grad_norm": 1.7307594033578553, + "language_loss": 0.79001957, + "learning_rate": 7.377255998196821e-07, + "loss": 0.86679161, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10675049, + "step": 12073, + "time_per_iteration": 2.5204336643218994 + }, + { + "auxiliary_loss_clip": 0.06408107, + "auxiliary_loss_mlp": 0.01262862, + "balance_loss_clip": 0.06276815, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7259281527130618, + "flos": 34863150360960.0, + "grad_norm": 1.4580787781655038, + "language_loss": 0.7035231, + "learning_rate": 7.374235295384923e-07, + "loss": 0.78023279, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09942627, + "step": 12074, + "time_per_iteration": 2.6230850219726562 + }, + { + "auxiliary_loss_clip": 0.06411288, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06275726, + "balance_loss_mlp": 0.01255342, + "epoch": 0.7259882759657298, + "flos": 25410657991680.0, + "grad_norm": 2.2056247097324193, + "language_loss": 0.74623215, + "learning_rate": 7.371215071343302e-07, + "loss": 0.82299727, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09875488, + "step": 12075, + "time_per_iteration": 2.556225538253784 + }, + { + "auxiliary_loss_clip": 0.06410095, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7260483992183977, + "flos": 62966781924480.0, + "grad_norm": 1.5598815820341405, + "language_loss": 0.64038914, + "learning_rate": 7.368195326186458e-07, + "loss": 0.71713918, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10803223, + "step": 12076, + "time_per_iteration": 4.355054616928101 + }, + { + "auxiliary_loss_clip": 0.064101, + "auxiliary_loss_mlp": 0.01263502, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7261085224710657, + "flos": 26474522555520.0, + "grad_norm": 1.8575056289170144, + "language_loss": 0.7908951, + "learning_rate": 7.365176060028912e-07, + "loss": 0.86763114, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09796143, + "step": 12077, + "time_per_iteration": 2.5509204864501953 + }, + { + "auxiliary_loss_clip": 0.06314351, + "auxiliary_loss_mlp": 0.01251566, + "balance_loss_clip": 0.06259085, + "balance_loss_mlp": 0.01250447, + "epoch": 0.7261686457237336, + "flos": 66790634198400.0, + "grad_norm": 0.8642282673020346, + "language_loss": 0.64994717, + "learning_rate": 7.362157272985163e-07, + "loss": 0.72560632, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01121521, + "step": 12078, + "time_per_iteration": 3.138261556625366 + }, + { + "auxiliary_loss_clip": 0.06315269, + "auxiliary_loss_mlp": 0.0125259, + "balance_loss_clip": 0.06259946, + "balance_loss_mlp": 0.01251419, + "epoch": 0.7262287689764017, + "flos": 70020731640960.0, + "grad_norm": 0.7225013247461266, + "language_loss": 0.59434861, + "learning_rate": 7.359138965169671e-07, + "loss": 0.67002714, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0116806, + "step": 12079, + "time_per_iteration": 3.2418954372406006 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01266491, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256495, + "epoch": 0.7262888922290696, + "flos": 23812212378240.0, + "grad_norm": 1.9020587797469353, + "language_loss": 0.64648104, + "learning_rate": 7.356121136696895e-07, + "loss": 0.72320265, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09997559, + "step": 12080, + "time_per_iteration": 2.559204339981079 + }, + { + "auxiliary_loss_clip": 0.06412919, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7263490154817376, + "flos": 19506412218240.0, + "grad_norm": 2.774312810040863, + "language_loss": 0.70093364, + "learning_rate": 7.35310378768128e-07, + "loss": 0.77771568, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10644531, + "step": 12081, + "time_per_iteration": 2.4881443977355957 + }, + { + "auxiliary_loss_clip": 0.06414886, + "auxiliary_loss_mlp": 0.01264794, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7264091387344055, + "flos": 16291240801920.0, + "grad_norm": 1.7064307786891335, + "language_loss": 0.81121981, + "learning_rate": 7.350086918237237e-07, + "loss": 0.88801658, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09655762, + "step": 12082, + "time_per_iteration": 2.51804256439209 + }, + { + "auxiliary_loss_clip": 0.06418996, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06277427, + "balance_loss_mlp": 0.01252474, + "epoch": 0.7264692619870735, + "flos": 24358784561280.0, + "grad_norm": 2.224005114416304, + "language_loss": 0.77144599, + "learning_rate": 7.347070528479158e-07, + "loss": 0.84827775, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11706543, + "step": 12083, + "time_per_iteration": 2.5199551582336426 + }, + { + "auxiliary_loss_clip": 0.06416926, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01255441, + "epoch": 0.7265293852397414, + "flos": 25126568323200.0, + "grad_norm": 1.6593932119603014, + "language_loss": 0.72771877, + "learning_rate": 7.344054618521433e-07, + "loss": 0.80454749, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 12084, + "time_per_iteration": 2.5542185306549072 + }, + { + "auxiliary_loss_clip": 0.06412492, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06276167, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7265895084924094, + "flos": 22644869621760.0, + "grad_norm": 1.8149106211320094, + "language_loss": 0.78171599, + "learning_rate": 7.34103918847843e-07, + "loss": 0.85848927, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10656738, + "step": 12085, + "time_per_iteration": 2.5213918685913086 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274905, + "balance_loss_mlp": 0.0125473, + "epoch": 0.7266496317450775, + "flos": 23375030100480.0, + "grad_norm": 1.688683771457735, + "language_loss": 0.7278198, + "learning_rate": 7.338024238464493e-07, + "loss": 0.80456126, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09405518, + "step": 12086, + "time_per_iteration": 2.5169167518615723 + }, + { + "auxiliary_loss_clip": 0.06407881, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255123, + "epoch": 0.7267097549977454, + "flos": 28082150190720.0, + "grad_norm": 1.7618222753787933, + "language_loss": 0.69773293, + "learning_rate": 7.335009768593938e-07, + "loss": 0.77446526, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10247803, + "step": 12087, + "time_per_iteration": 2.552579641342163 + }, + { + "auxiliary_loss_clip": 0.06413816, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06276657, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7267698782504134, + "flos": 22201272506880.0, + "grad_norm": 1.8690535814436378, + "language_loss": 0.79212523, + "learning_rate": 7.331995778981088e-07, + "loss": 0.86891758, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11230469, + "step": 12088, + "time_per_iteration": 2.5224051475524902 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275503, + "balance_loss_mlp": 0.01257561, + "epoch": 0.7268300015030813, + "flos": 18520729113600.0, + "grad_norm": 2.081138271531092, + "language_loss": 0.74134862, + "learning_rate": 7.328982269740221e-07, + "loss": 0.81814551, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10168457, + "step": 12089, + "time_per_iteration": 2.4536690711975098 + }, + { + "auxiliary_loss_clip": 0.06410675, + "auxiliary_loss_mlp": 0.01266044, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125606, + "epoch": 0.7268901247557493, + "flos": 23992530364800.0, + "grad_norm": 1.672566959006191, + "language_loss": 0.71264297, + "learning_rate": 7.325969240985616e-07, + "loss": 0.78941011, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09979248, + "step": 12090, + "time_per_iteration": 2.518209457397461 + }, + { + "auxiliary_loss_clip": 0.06411642, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06275435, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7269502480084172, + "flos": 32096313815040.0, + "grad_norm": 1.7636278155243394, + "language_loss": 0.774212, + "learning_rate": 7.322956692831528e-07, + "loss": 0.85097921, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10864258, + "step": 12091, + "time_per_iteration": 2.5809051990509033 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01262324, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01251947, + "epoch": 0.7270103712610853, + "flos": 19068852597120.0, + "grad_norm": 1.7821213244340646, + "language_loss": 0.71747637, + "learning_rate": 7.319944625392205e-07, + "loss": 0.79417133, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10375977, + "step": 12092, + "time_per_iteration": 2.5037333965301514 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01252718, + "epoch": 0.7270704945137532, + "flos": 34541605117440.0, + "grad_norm": 1.8451884643439012, + "language_loss": 0.61625177, + "learning_rate": 7.31693303878184e-07, + "loss": 0.69297278, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10198975, + "step": 12093, + "time_per_iteration": 2.6145272254943848 + }, + { + "auxiliary_loss_clip": 0.06407997, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06275733, + "balance_loss_mlp": 0.01261461, + "epoch": 0.7271306177664212, + "flos": 21514101972480.0, + "grad_norm": 1.4518547441748084, + "language_loss": 0.7566582, + "learning_rate": 7.313921933114644e-07, + "loss": 0.83345854, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10583496, + "step": 12094, + "time_per_iteration": 2.5348317623138428 + }, + { + "auxiliary_loss_clip": 0.06402551, + "auxiliary_loss_mlp": 0.01268346, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01258976, + "epoch": 0.7271907410190891, + "flos": 22278866987520.0, + "grad_norm": 1.9666023712862966, + "language_loss": 0.84875292, + "learning_rate": 7.310911308504808e-07, + "loss": 0.92546189, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09375, + "step": 12095, + "time_per_iteration": 2.4921047687530518 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06273626, + "balance_loss_mlp": 0.01256319, + "epoch": 0.7272508642717571, + "flos": 22899721415040.0, + "grad_norm": 1.6073112969743308, + "language_loss": 0.77431858, + "learning_rate": 7.307901165066479e-07, + "loss": 0.85107493, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10437012, + "step": 12096, + "time_per_iteration": 2.5228958129882812 + }, + { + "auxiliary_loss_clip": 0.06409237, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01254852, + "epoch": 0.727310987524425, + "flos": 11660667016320.0, + "grad_norm": 1.766744410162751, + "language_loss": 0.72485346, + "learning_rate": 7.30489150291381e-07, + "loss": 0.80159533, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10095215, + "step": 12097, + "time_per_iteration": 3.9472336769104004 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 0.06275506, + "balance_loss_mlp": 0.01253111, + "epoch": 0.727371110777093, + "flos": 24542247075840.0, + "grad_norm": 1.6914945832849257, + "language_loss": 0.76620024, + "learning_rate": 7.301882322160935e-07, + "loss": 0.84293687, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10400391, + "step": 12098, + "time_per_iteration": 2.5401840209960938 + }, + { + "auxiliary_loss_clip": 0.06412796, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06274997, + "balance_loss_mlp": 0.01256982, + "epoch": 0.7274312340297611, + "flos": 74755175690880.0, + "grad_norm": 1.647144818498915, + "language_loss": 0.67571467, + "learning_rate": 7.298873622921952e-07, + "loss": 0.75252008, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10766602, + "step": 12099, + "time_per_iteration": 2.933919668197632 + }, + { + "auxiliary_loss_clip": 0.06414318, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.0125731, + "epoch": 0.727491357282429, + "flos": 22348872673920.0, + "grad_norm": 1.593136067800256, + "language_loss": 0.72549355, + "learning_rate": 7.29586540531095e-07, + "loss": 0.80232537, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11560059, + "step": 12100, + "time_per_iteration": 2.485959053039551 + }, + { + "auxiliary_loss_clip": 0.06406155, + "auxiliary_loss_mlp": 0.01265862, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01256778, + "epoch": 0.727551480535097, + "flos": 23304730924800.0, + "grad_norm": 1.4119889543918884, + "language_loss": 0.75127757, + "learning_rate": 7.292857669442005e-07, + "loss": 0.82799774, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09088135, + "step": 12101, + "time_per_iteration": 2.610421895980835 + }, + { + "auxiliary_loss_clip": 0.06405263, + "auxiliary_loss_mlp": 0.01263956, + "balance_loss_clip": 0.06274393, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7276116037877649, + "flos": 21476981813760.0, + "grad_norm": 1.6630445155880014, + "language_loss": 0.82583451, + "learning_rate": 7.289850415429177e-07, + "loss": 0.90252674, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09295654, + "step": 12102, + "time_per_iteration": 2.5227344036102295 + }, + { + "auxiliary_loss_clip": 0.06406877, + "auxiliary_loss_mlp": 0.01266073, + "balance_loss_clip": 0.06273448, + "balance_loss_mlp": 0.012565, + "epoch": 0.7276717270404329, + "flos": 21469393019520.0, + "grad_norm": 2.031204621507473, + "language_loss": 0.81889427, + "learning_rate": 7.286843643386495e-07, + "loss": 0.89562374, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09570312, + "step": 12103, + "time_per_iteration": 2.4974191188812256 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01253818, + "epoch": 0.7277318502931008, + "flos": 16842928083840.0, + "grad_norm": 1.574176499871837, + "language_loss": 0.66993153, + "learning_rate": 7.283837353427968e-07, + "loss": 0.74667573, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10351562, + "step": 12104, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.06406664, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01257815, + "epoch": 0.7277919735457689, + "flos": 33408824970240.0, + "grad_norm": 1.70221768283368, + "language_loss": 0.65823901, + "learning_rate": 7.280831545667611e-07, + "loss": 0.73498631, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1026001, + "step": 12105, + "time_per_iteration": 2.6353166103363037 + }, + { + "auxiliary_loss_clip": 0.06408508, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06276379, + "balance_loss_mlp": 0.01257599, + "epoch": 0.7278520967984368, + "flos": 19212218133120.0, + "grad_norm": 2.1199426403905197, + "language_loss": 0.75508106, + "learning_rate": 7.27782622021939e-07, + "loss": 0.83183956, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09741211, + "step": 12106, + "time_per_iteration": 2.46575665473938 + }, + { + "auxiliary_loss_clip": 0.06411369, + "auxiliary_loss_mlp": 0.01266618, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01255228, + "epoch": 0.7279122200511048, + "flos": 34103206955520.0, + "grad_norm": 1.806710660650235, + "language_loss": 0.70616901, + "learning_rate": 7.274821377197273e-07, + "loss": 0.78294891, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11395264, + "step": 12107, + "time_per_iteration": 2.6280477046966553 + }, + { + "auxiliary_loss_clip": 0.06407417, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06274886, + "balance_loss_mlp": 0.01253885, + "epoch": 0.7279723433037727, + "flos": 54610913865600.0, + "grad_norm": 1.4427675680101948, + "language_loss": 0.75342691, + "learning_rate": 7.271817016715205e-07, + "loss": 0.83013523, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09515381, + "step": 12108, + "time_per_iteration": 4.324532985687256 + }, + { + "auxiliary_loss_clip": 0.0640891, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01255809, + "epoch": 0.7280324665564407, + "flos": 36146297859840.0, + "grad_norm": 1.5700716356881925, + "language_loss": 0.67018294, + "learning_rate": 7.268813138887124e-07, + "loss": 0.74693048, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 12109, + "time_per_iteration": 2.615412473678589 + }, + { + "auxiliary_loss_clip": 0.06406409, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01256218, + "epoch": 0.7280925898091086, + "flos": 11623169514240.0, + "grad_norm": 7.186110502128194, + "language_loss": 0.63434047, + "learning_rate": 7.265809743826912e-07, + "loss": 0.71108198, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.11517334, + "step": 12110, + "time_per_iteration": 2.4591712951660156 + }, + { + "auxiliary_loss_clip": 0.06409231, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01256184, + "epoch": 0.7281527130617766, + "flos": 34285663221120.0, + "grad_norm": 1.770442169865723, + "language_loss": 0.5852263, + "learning_rate": 7.26280683164847e-07, + "loss": 0.66198647, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10595703, + "step": 12111, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.06411764, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01254744, + "epoch": 0.7282128363144446, + "flos": 13923208563840.0, + "grad_norm": 2.24560382762785, + "language_loss": 0.74143445, + "learning_rate": 7.259804402465677e-07, + "loss": 0.81820381, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10430908, + "step": 12112, + "time_per_iteration": 3.927354335784912 + }, + { + "auxiliary_loss_clip": 0.0640655, + "auxiliary_loss_mlp": 0.01266322, + "balance_loss_clip": 0.06273867, + "balance_loss_mlp": 0.01256767, + "epoch": 0.7282729595671126, + "flos": 20783983420800.0, + "grad_norm": 2.386616636448106, + "language_loss": 0.66917908, + "learning_rate": 7.25680245639237e-07, + "loss": 0.74590778, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09552002, + "step": 12113, + "time_per_iteration": 2.501143455505371 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01264241, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254311, + "epoch": 0.7283330828197806, + "flos": 16330876583040.0, + "grad_norm": 1.6899344961685594, + "language_loss": 0.73054916, + "learning_rate": 7.253800993542399e-07, + "loss": 0.80725861, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0993042, + "step": 12114, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.06404929, + "auxiliary_loss_mlp": 0.01265418, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01255429, + "epoch": 0.7283932060724485, + "flos": 27497535454080.0, + "grad_norm": 1.7662061899425427, + "language_loss": 0.68715543, + "learning_rate": 7.250800014029564e-07, + "loss": 0.76385891, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09985352, + "step": 12115, + "time_per_iteration": 2.557182788848877 + }, + { + "auxiliary_loss_clip": 0.06409318, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7284533293251165, + "flos": 18373548216960.0, + "grad_norm": 1.8492705823258373, + "language_loss": 0.60310125, + "learning_rate": 7.247799517967674e-07, + "loss": 0.67984653, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10406494, + "step": 12116, + "time_per_iteration": 3.906881093978882 + }, + { + "auxiliary_loss_clip": 0.06408231, + "auxiliary_loss_mlp": 0.01266827, + "balance_loss_clip": 0.06275375, + "balance_loss_mlp": 0.01256766, + "epoch": 0.7285134525777844, + "flos": 21731917461120.0, + "grad_norm": 1.7320251042844839, + "language_loss": 0.72842097, + "learning_rate": 7.2447995054705e-07, + "loss": 0.80517155, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10058594, + "step": 12117, + "time_per_iteration": 2.522825002670288 + }, + { + "auxiliary_loss_clip": 0.06408626, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 0.06274951, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7285735758304525, + "flos": 20747743729920.0, + "grad_norm": 1.8305634695552309, + "language_loss": 0.69773346, + "learning_rate": 7.241799976651807e-07, + "loss": 0.77447206, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10565186, + "step": 12118, + "time_per_iteration": 2.48207426071167 + }, + { + "auxiliary_loss_clip": 0.06402861, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06275323, + "balance_loss_mlp": 0.01257714, + "epoch": 0.7286336990831204, + "flos": 17316643541760.0, + "grad_norm": 1.7593601335155638, + "language_loss": 0.84603906, + "learning_rate": 7.238800931625346e-07, + "loss": 0.92274088, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0960083, + "step": 12119, + "time_per_iteration": 2.6029109954833984 + }, + { + "auxiliary_loss_clip": 0.0640807, + "auxiliary_loss_mlp": 0.01265759, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01255454, + "epoch": 0.7286938223357884, + "flos": 19792724019840.0, + "grad_norm": 1.9939013522780928, + "language_loss": 0.82186806, + "learning_rate": 7.235802370504831e-07, + "loss": 0.89860642, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10308838, + "step": 12120, + "time_per_iteration": 2.4777402877807617 + }, + { + "auxiliary_loss_clip": 0.06409417, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06275336, + "balance_loss_mlp": 0.0125496, + "epoch": 0.7287539455884563, + "flos": 15346241654400.0, + "grad_norm": 1.8086433157736466, + "language_loss": 0.7907117, + "learning_rate": 7.232804293403963e-07, + "loss": 0.86745799, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 12121, + "time_per_iteration": 2.493319511413574 + }, + { + "auxiliary_loss_clip": 0.06409892, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.06270927, + "balance_loss_mlp": 0.01255693, + "epoch": 0.7288140688411243, + "flos": 25199592756480.0, + "grad_norm": 1.5783623622806526, + "language_loss": 0.69521451, + "learning_rate": 7.229806700436441e-07, + "loss": 0.77197587, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10559082, + "step": 12122, + "time_per_iteration": 2.524064064025879 + }, + { + "auxiliary_loss_clip": 0.06402311, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06270998, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7288741920937922, + "flos": 23990350158720.0, + "grad_norm": 1.7454149846167522, + "language_loss": 0.87436593, + "learning_rate": 7.226809591715923e-07, + "loss": 0.95102781, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09283447, + "step": 12123, + "time_per_iteration": 2.542051315307617 + }, + { + "auxiliary_loss_clip": 0.06402463, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.01255094, + "epoch": 0.7289343153464602, + "flos": 22751114999040.0, + "grad_norm": 1.6465558507133775, + "language_loss": 0.8315962, + "learning_rate": 7.223812967356065e-07, + "loss": 0.90827358, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10186768, + "step": 12124, + "time_per_iteration": 2.493330955505371 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01256173, + "epoch": 0.7289944385991282, + "flos": 24906991898880.0, + "grad_norm": 1.5973594077423074, + "language_loss": 0.66998374, + "learning_rate": 7.220816827470499e-07, + "loss": 0.74670422, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10113525, + "step": 12125, + "time_per_iteration": 2.5571157932281494 + }, + { + "auxiliary_loss_clip": 0.06410982, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255521, + "epoch": 0.7290545618517962, + "flos": 22973835951360.0, + "grad_norm": 1.7735347741305036, + "language_loss": 0.75574493, + "learning_rate": 7.217821172172855e-07, + "loss": 0.83252764, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11773682, + "step": 12126, + "time_per_iteration": 2.4986443519592285 + }, + { + "auxiliary_loss_clip": 0.0631386, + "auxiliary_loss_mlp": 0.01254001, + "balance_loss_clip": 0.06258902, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7291146851044642, + "flos": 61921602092160.0, + "grad_norm": 0.8043212871024376, + "language_loss": 0.58652955, + "learning_rate": 7.2148260015767e-07, + "loss": 0.66220808, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01060486, + "step": 12127, + "time_per_iteration": 3.065887689590454 + }, + { + "auxiliary_loss_clip": 0.06406868, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01259032, + "epoch": 0.7291748083571321, + "flos": 23337616452480.0, + "grad_norm": 2.002154348717822, + "language_loss": 0.68532437, + "learning_rate": 7.21183131579562e-07, + "loss": 0.76207435, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12128, + "time_per_iteration": 2.5636982917785645 + }, + { + "auxiliary_loss_clip": 0.06407112, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06272791, + "balance_loss_mlp": 0.01255493, + "epoch": 0.7292349316098001, + "flos": 28337588962560.0, + "grad_norm": 1.9770234243530824, + "language_loss": 0.65893352, + "learning_rate": 7.20883711494319e-07, + "loss": 0.73566437, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10479736, + "step": 12129, + "time_per_iteration": 2.5952858924865723 + }, + { + "auxiliary_loss_clip": 0.06401228, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06271209, + "balance_loss_mlp": 0.01255878, + "epoch": 0.729295054862468, + "flos": 24138788866560.0, + "grad_norm": 2.8834397381641206, + "language_loss": 0.74323857, + "learning_rate": 7.205843399132927e-07, + "loss": 0.81991053, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10076904, + "step": 12130, + "time_per_iteration": 2.5151498317718506 + }, + { + "auxiliary_loss_clip": 0.06408465, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273751, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7293551781151361, + "flos": 22822168861440.0, + "grad_norm": 1.7601185133573507, + "language_loss": 0.69902027, + "learning_rate": 7.202850168478374e-07, + "loss": 0.77576661, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 12131, + "time_per_iteration": 2.5700907707214355 + }, + { + "auxiliary_loss_clip": 0.06405198, + "auxiliary_loss_mlp": 0.0126315, + "balance_loss_clip": 0.06273468, + "balance_loss_mlp": 0.01253238, + "epoch": 0.729415301367804, + "flos": 22133111610240.0, + "grad_norm": 1.4321727616978588, + "language_loss": 0.77646959, + "learning_rate": 7.199857423093025e-07, + "loss": 0.85315311, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09912109, + "step": 12132, + "time_per_iteration": 2.5047810077667236 + }, + { + "auxiliary_loss_clip": 0.06406032, + "auxiliary_loss_mlp": 0.01268163, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01258382, + "epoch": 0.729475424620472, + "flos": 12354587804160.0, + "grad_norm": 2.26553261567321, + "language_loss": 0.79865611, + "learning_rate": 7.196865163090358e-07, + "loss": 0.87539804, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09790039, + "step": 12133, + "time_per_iteration": 2.5156800746917725 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01262377, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01252555, + "epoch": 0.7295355478731399, + "flos": 22201020944640.0, + "grad_norm": 2.1172065702021228, + "language_loss": 0.72792143, + "learning_rate": 7.193873388583846e-07, + "loss": 0.80460143, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09832764, + "step": 12134, + "time_per_iteration": 2.493656873703003 + }, + { + "auxiliary_loss_clip": 0.06407951, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7295956711258079, + "flos": 23228771598720.0, + "grad_norm": 1.8016892870366705, + "language_loss": 0.7149846, + "learning_rate": 7.190882099686939e-07, + "loss": 0.79172647, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10424805, + "step": 12135, + "time_per_iteration": 2.5029256343841553 + }, + { + "auxiliary_loss_clip": 0.06412001, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 0.06275906, + "balance_loss_mlp": 0.01256362, + "epoch": 0.7296557943784758, + "flos": 31877282442240.0, + "grad_norm": 2.0055855777259683, + "language_loss": 0.62525374, + "learning_rate": 7.187891296513075e-07, + "loss": 0.70203543, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0980835, + "step": 12136, + "time_per_iteration": 2.6325221061706543 + }, + { + "auxiliary_loss_clip": 0.06405275, + "auxiliary_loss_mlp": 0.01264655, + "balance_loss_clip": 0.06272214, + "balance_loss_mlp": 0.01255184, + "epoch": 0.7297159176311439, + "flos": 26659033246080.0, + "grad_norm": 1.794436841721563, + "language_loss": 0.7470715, + "learning_rate": 7.184900979175654e-07, + "loss": 0.82377088, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09472656, + "step": 12137, + "time_per_iteration": 3.958789825439453 + }, + { + "auxiliary_loss_clip": 0.06406206, + "auxiliary_loss_mlp": 0.0126361, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253466, + "epoch": 0.7297760408838118, + "flos": 24755744079360.0, + "grad_norm": 1.5243930727188364, + "language_loss": 0.74341732, + "learning_rate": 7.181911147788069e-07, + "loss": 0.82011551, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10162354, + "step": 12138, + "time_per_iteration": 2.5344252586364746 + }, + { + "auxiliary_loss_clip": 0.06401816, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01256434, + "epoch": 0.7298361641364798, + "flos": 18079018715520.0, + "grad_norm": 2.292743835188078, + "language_loss": 0.72074485, + "learning_rate": 7.178921802463702e-07, + "loss": 0.79742092, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09350586, + "step": 12139, + "time_per_iteration": 2.4686436653137207 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01255015, + "epoch": 0.7298962873891478, + "flos": 29902897486080.0, + "grad_norm": 1.4427366017316514, + "language_loss": 0.73659438, + "learning_rate": 7.175932943315898e-07, + "loss": 0.81325477, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09246826, + "step": 12140, + "time_per_iteration": 2.5841948986053467 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7299564106418157, + "flos": 32273613054720.0, + "grad_norm": 1.4465948977154814, + "language_loss": 0.55615419, + "learning_rate": 7.172944570458003e-07, + "loss": 0.63290644, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 12141, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.06406234, + "auxiliary_loss_mlp": 0.01263105, + "balance_loss_clip": 0.06276207, + "balance_loss_mlp": 0.01254277, + "epoch": 0.7300165338944837, + "flos": 22937009281920.0, + "grad_norm": 1.432470794912082, + "language_loss": 0.73197258, + "learning_rate": 7.169956684003342e-07, + "loss": 0.80866599, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0881958, + "step": 12142, + "time_per_iteration": 2.5505692958831787 + }, + { + "auxiliary_loss_clip": 0.0640648, + "auxiliary_loss_mlp": 0.01261695, + "balance_loss_clip": 0.06273788, + "balance_loss_mlp": 0.01252629, + "epoch": 0.7300766571471516, + "flos": 19834959277440.0, + "grad_norm": 1.6768515180809767, + "language_loss": 0.74087632, + "learning_rate": 7.16696928406521e-07, + "loss": 0.81755805, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09063721, + "step": 12143, + "time_per_iteration": 2.490084648132324 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01263891, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7301367803998197, + "flos": 24353879097600.0, + "grad_norm": 2.204410002817552, + "language_loss": 0.66878092, + "learning_rate": 7.163982370756882e-07, + "loss": 0.74551642, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 12144, + "time_per_iteration": 2.54231858253479 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.0126374, + "balance_loss_clip": 0.06274417, + "balance_loss_mlp": 0.01253232, + "epoch": 0.7301969036524876, + "flos": 15309918109440.0, + "grad_norm": 1.5759955689849319, + "language_loss": 0.79171866, + "learning_rate": 7.160995944191627e-07, + "loss": 0.86844301, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10510254, + "step": 12145, + "time_per_iteration": 2.479991912841797 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01266826, + "balance_loss_clip": 0.06275664, + "balance_loss_mlp": 0.01256819, + "epoch": 0.7302570269051556, + "flos": 23512945121280.0, + "grad_norm": 1.601000858309641, + "language_loss": 0.92001355, + "learning_rate": 7.158010004482702e-07, + "loss": 0.99674433, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10003662, + "step": 12146, + "time_per_iteration": 2.536653757095337 + }, + { + "auxiliary_loss_clip": 0.06406654, + "auxiliary_loss_mlp": 0.01262625, + "balance_loss_clip": 0.06276748, + "balance_loss_mlp": 0.01252885, + "epoch": 0.7303171501578235, + "flos": 20529508970880.0, + "grad_norm": 1.778676340204468, + "language_loss": 0.62199593, + "learning_rate": 7.155024551743316e-07, + "loss": 0.69868875, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.097229, + "step": 12147, + "time_per_iteration": 3.9292736053466797 + }, + { + "auxiliary_loss_clip": 0.06418571, + "auxiliary_loss_mlp": 0.01266018, + "balance_loss_clip": 0.06282554, + "balance_loss_mlp": 0.0125579, + "epoch": 0.7303772734104915, + "flos": 18338482483200.0, + "grad_norm": 1.749812940389672, + "language_loss": 0.75328469, + "learning_rate": 7.152039586086693e-07, + "loss": 0.83013058, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10229492, + "step": 12148, + "time_per_iteration": 2.466489791870117 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01255211, + "balance_loss_clip": 0.06256207, + "balance_loss_mlp": 0.01254079, + "epoch": 0.7304373966631594, + "flos": 60673604181120.0, + "grad_norm": 3.1920126472148245, + "language_loss": 0.56622815, + "learning_rate": 7.149055107626017e-07, + "loss": 0.64189649, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01133728, + "step": 12149, + "time_per_iteration": 3.1208536624908447 + }, + { + "auxiliary_loss_clip": 0.06409251, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273203, + "balance_loss_mlp": 0.01256108, + "epoch": 0.7304975199158275, + "flos": 19834120736640.0, + "grad_norm": 2.2110460738796847, + "language_loss": 0.74197543, + "learning_rate": 7.146071116474451e-07, + "loss": 0.8187288, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09979248, + "step": 12150, + "time_per_iteration": 2.563061475753784 + }, + { + "auxiliary_loss_clip": 0.06411943, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01257804, + "epoch": 0.7305576431684954, + "flos": 13228910432640.0, + "grad_norm": 2.0644493545304012, + "language_loss": 0.845092, + "learning_rate": 7.143087612745158e-07, + "loss": 0.92189169, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10229492, + "step": 12151, + "time_per_iteration": 3.9333503246307373 + }, + { + "auxiliary_loss_clip": 0.0641029, + "auxiliary_loss_mlp": 0.01268677, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01258395, + "epoch": 0.7306177664211634, + "flos": 24067231879680.0, + "grad_norm": 1.709088154989502, + "language_loss": 0.77853483, + "learning_rate": 7.14010459655127e-07, + "loss": 0.85532451, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10272217, + "step": 12152, + "time_per_iteration": 2.549255132675171 + }, + { + "auxiliary_loss_clip": 0.06408677, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06275931, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7306778896738314, + "flos": 27096425159040.0, + "grad_norm": 1.4467429234304112, + "language_loss": 0.79911304, + "learning_rate": 7.137122068005919e-07, + "loss": 0.87585741, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09875488, + "step": 12153, + "time_per_iteration": 2.584221839904785 + }, + { + "auxiliary_loss_clip": 0.06409719, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06271548, + "balance_loss_mlp": 0.01258473, + "epoch": 0.7307380129264993, + "flos": 16696250311680.0, + "grad_norm": 1.5292836861635837, + "language_loss": 0.67226088, + "learning_rate": 7.134140027222173e-07, + "loss": 0.74904257, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09967041, + "step": 12154, + "time_per_iteration": 2.482377052307129 + }, + { + "auxiliary_loss_clip": 0.06408456, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01255584, + "epoch": 0.7307981361791673, + "flos": 21732169023360.0, + "grad_norm": 1.735892015555871, + "language_loss": 0.66179639, + "learning_rate": 7.131158474313128e-07, + "loss": 0.73853588, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09912109, + "step": 12155, + "time_per_iteration": 3.920834541320801 + }, + { + "auxiliary_loss_clip": 0.06405047, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7308582594318352, + "flos": 18046468604160.0, + "grad_norm": 1.7732442430270934, + "language_loss": 0.82409012, + "learning_rate": 7.128177409391851e-07, + "loss": 0.90078008, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09527588, + "step": 12156, + "time_per_iteration": 2.498297691345215 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06272586, + "balance_loss_mlp": 0.01259304, + "epoch": 0.7309183826845033, + "flos": 13850100276480.0, + "grad_norm": 2.231479695583903, + "language_loss": 0.75512803, + "learning_rate": 7.125196832571367e-07, + "loss": 0.83185542, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09118652, + "step": 12157, + "time_per_iteration": 2.469118595123291 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.0126719, + "balance_loss_clip": 0.06274454, + "balance_loss_mlp": 0.0125816, + "epoch": 0.7309785059371712, + "flos": 17024881224960.0, + "grad_norm": 1.9988755435472185, + "language_loss": 0.73910487, + "learning_rate": 7.122216743964713e-07, + "loss": 0.81581926, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.090271, + "step": 12158, + "time_per_iteration": 2.498945713043213 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01263808, + "balance_loss_clip": 0.06278427, + "balance_loss_mlp": 0.0125417, + "epoch": 0.7310386291898392, + "flos": 26509127091840.0, + "grad_norm": 1.5605455050098358, + "language_loss": 0.85817492, + "learning_rate": 7.119237143684896e-07, + "loss": 0.93495244, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09637451, + "step": 12159, + "time_per_iteration": 2.5414113998413086 + }, + { + "auxiliary_loss_clip": 0.06415824, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.0627675, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7310987524425071, + "flos": 16951521375360.0, + "grad_norm": 1.9612355888194155, + "language_loss": 0.74199778, + "learning_rate": 7.116258031844895e-07, + "loss": 0.81882906, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.1114502, + "step": 12160, + "time_per_iteration": 2.598435163497925 + }, + { + "auxiliary_loss_clip": 0.06413984, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01256304, + "epoch": 0.7311588756951751, + "flos": 13850477619840.0, + "grad_norm": 2.3687706371159023, + "language_loss": 0.72816062, + "learning_rate": 7.113279408557675e-07, + "loss": 0.80496389, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10040283, + "step": 12161, + "time_per_iteration": 2.487931728363037 + }, + { + "auxiliary_loss_clip": 0.06419692, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06277676, + "balance_loss_mlp": 0.01254413, + "epoch": 0.731218998947843, + "flos": 28775567854080.0, + "grad_norm": 1.7390428804054665, + "language_loss": 0.69832623, + "learning_rate": 7.110301273936192e-07, + "loss": 0.77517438, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10705566, + "step": 12162, + "time_per_iteration": 2.578719139099121 + }, + { + "auxiliary_loss_clip": 0.06409628, + "auxiliary_loss_mlp": 0.01266805, + "balance_loss_clip": 0.0627304, + "balance_loss_mlp": 0.01256785, + "epoch": 0.7312791222005111, + "flos": 27096047815680.0, + "grad_norm": 1.6401378277284773, + "language_loss": 0.67019415, + "learning_rate": 7.107323628093382e-07, + "loss": 0.74695843, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 12163, + "time_per_iteration": 2.5393404960632324 + }, + { + "auxiliary_loss_clip": 0.06406513, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.0627192, + "balance_loss_mlp": 0.012566, + "epoch": 0.731339245453179, + "flos": 20930493484800.0, + "grad_norm": 1.6144773935767842, + "language_loss": 0.68972957, + "learning_rate": 7.104346471142153e-07, + "loss": 0.76646197, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10119629, + "step": 12164, + "time_per_iteration": 2.5153493881225586 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01263865, + "balance_loss_clip": 0.06274466, + "balance_loss_mlp": 0.01254262, + "epoch": 0.731399368705847, + "flos": 23082345388800.0, + "grad_norm": 1.4748874559419136, + "language_loss": 0.73714507, + "learning_rate": 7.101369803195391e-07, + "loss": 0.81382716, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0960083, + "step": 12165, + "time_per_iteration": 2.5240328311920166 + }, + { + "auxiliary_loss_clip": 0.06409434, + "auxiliary_loss_mlp": 0.01264974, + "balance_loss_clip": 0.06273365, + "balance_loss_mlp": 0.01254782, + "epoch": 0.731459491958515, + "flos": 23588778666240.0, + "grad_norm": 1.7494932066214843, + "language_loss": 0.76978707, + "learning_rate": 7.098393624365988e-07, + "loss": 0.84653127, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10192871, + "step": 12166, + "time_per_iteration": 2.535602569580078 + }, + { + "auxiliary_loss_clip": 0.06405294, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273952, + "balance_loss_mlp": 0.01254574, + "epoch": 0.7315196152111829, + "flos": 22385280072960.0, + "grad_norm": 1.6529519301050002, + "language_loss": 0.79870826, + "learning_rate": 7.095417934766781e-07, + "loss": 0.87540716, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10015869, + "step": 12167, + "time_per_iteration": 2.5016744136810303 + }, + { + "auxiliary_loss_clip": 0.06406464, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274685, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7315797384638509, + "flos": 26184227685120.0, + "grad_norm": 1.5786791569795495, + "language_loss": 0.77113497, + "learning_rate": 7.092442734510622e-07, + "loss": 0.84785974, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09863281, + "step": 12168, + "time_per_iteration": 2.550841808319092 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01264978, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254011, + "epoch": 0.7316398617165188, + "flos": 21512634526080.0, + "grad_norm": 1.4637772541157787, + "language_loss": 0.82124925, + "learning_rate": 7.089468023710326e-07, + "loss": 0.89801592, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10955811, + "step": 12169, + "time_per_iteration": 2.4971840381622314 + }, + { + "auxiliary_loss_clip": 0.06413089, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01259031, + "epoch": 0.7316999849691869, + "flos": 30490489042560.0, + "grad_norm": 1.5962469016193046, + "language_loss": 0.70136017, + "learning_rate": 7.08649380247871e-07, + "loss": 0.77818549, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10418701, + "step": 12170, + "time_per_iteration": 2.580601692199707 + }, + { + "auxiliary_loss_clip": 0.06408713, + "auxiliary_loss_mlp": 0.01268064, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256655, + "epoch": 0.7317601082218548, + "flos": 21550257809280.0, + "grad_norm": 1.8557087884597323, + "language_loss": 0.69686925, + "learning_rate": 7.083520070928533e-07, + "loss": 0.773637, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11413574, + "step": 12171, + "time_per_iteration": 2.483708143234253 + }, + { + "auxiliary_loss_clip": 0.06406379, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01258406, + "epoch": 0.7318202314745228, + "flos": 33259338086400.0, + "grad_norm": 1.4958611702028526, + "language_loss": 0.65253127, + "learning_rate": 7.080546829172564e-07, + "loss": 0.72928506, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10595703, + "step": 12172, + "time_per_iteration": 2.6077332496643066 + }, + { + "auxiliary_loss_clip": 0.06410083, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06274219, + "balance_loss_mlp": 0.0125547, + "epoch": 0.7318803547271907, + "flos": 20163254774400.0, + "grad_norm": 2.043922732836794, + "language_loss": 0.61819667, + "learning_rate": 7.077574077323564e-07, + "loss": 0.69495922, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12173, + "time_per_iteration": 2.4937400817871094 + }, + { + "auxiliary_loss_clip": 0.06411927, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.0627674, + "balance_loss_mlp": 0.01256395, + "epoch": 0.7319404779798587, + "flos": 20564826266880.0, + "grad_norm": 1.776213405218001, + "language_loss": 0.74138248, + "learning_rate": 7.074601815494243e-07, + "loss": 0.81816107, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09533691, + "step": 12174, + "time_per_iteration": 2.5296590328216553 + }, + { + "auxiliary_loss_clip": 0.06402949, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06272517, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7320006012325266, + "flos": 28703130399360.0, + "grad_norm": 1.6525649397268998, + "language_loss": 0.81230605, + "learning_rate": 7.071630043797317e-07, + "loss": 0.88897324, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09069824, + "step": 12175, + "time_per_iteration": 2.5799436569213867 + }, + { + "auxiliary_loss_clip": 0.06408073, + "auxiliary_loss_mlp": 0.01263853, + "balance_loss_clip": 0.06274186, + "balance_loss_mlp": 0.01253846, + "epoch": 0.7320607244851947, + "flos": 16368290231040.0, + "grad_norm": 1.8780371649414138, + "language_loss": 0.76478672, + "learning_rate": 7.068658762345488e-07, + "loss": 0.841506, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12176, + "time_per_iteration": 2.48456072807312 + }, + { + "auxiliary_loss_clip": 0.06404638, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257526, + "epoch": 0.7321208477378626, + "flos": 20960653754880.0, + "grad_norm": 1.8116961288906432, + "language_loss": 0.76882672, + "learning_rate": 7.065687971251399e-07, + "loss": 0.84554708, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09881592, + "step": 12177, + "time_per_iteration": 3.9612483978271484 + }, + { + "auxiliary_loss_clip": 0.06404608, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06272198, + "balance_loss_mlp": 0.01257183, + "epoch": 0.7321809709905306, + "flos": 13850226057600.0, + "grad_norm": 2.0192997733839855, + "language_loss": 0.74703526, + "learning_rate": 7.06271767062772e-07, + "loss": 0.82374752, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09442139, + "step": 12178, + "time_per_iteration": 2.451946973800659 + }, + { + "auxiliary_loss_clip": 0.06407191, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.062708, + "balance_loss_mlp": 0.01253617, + "epoch": 0.7322410942431986, + "flos": 26987286816000.0, + "grad_norm": 1.9092278699703453, + "language_loss": 0.82810688, + "learning_rate": 7.059747860587084e-07, + "loss": 0.90481937, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10449219, + "step": 12179, + "time_per_iteration": 2.5572235584259033 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.0125573, + "epoch": 0.7323012174958665, + "flos": 17645526017280.0, + "grad_norm": 1.5024024158805138, + "language_loss": 0.7521069, + "learning_rate": 7.056778541242115e-07, + "loss": 0.82877266, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09960938, + "step": 12180, + "time_per_iteration": 2.455678701400757 + }, + { + "auxiliary_loss_clip": 0.06411432, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06272306, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7323613407485345, + "flos": 32350914046080.0, + "grad_norm": 1.8054283665304076, + "language_loss": 0.79850274, + "learning_rate": 7.053809712705396e-07, + "loss": 0.87528759, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10552979, + "step": 12181, + "time_per_iteration": 2.595571756362915 + }, + { + "auxiliary_loss_clip": 0.06413537, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06274928, + "balance_loss_mlp": 0.01261625, + "epoch": 0.7324214640012024, + "flos": 18367594577280.0, + "grad_norm": 1.7248361460474335, + "language_loss": 0.72176909, + "learning_rate": 7.050841375089506e-07, + "loss": 0.79862905, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10852051, + "step": 12182, + "time_per_iteration": 2.4603164196014404 + }, + { + "auxiliary_loss_clip": 0.06412099, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06276859, + "balance_loss_mlp": 0.01257268, + "epoch": 0.7324815872538705, + "flos": 30820503548160.0, + "grad_norm": 1.5618517746342058, + "language_loss": 0.71680033, + "learning_rate": 7.047873528507015e-07, + "loss": 0.79359412, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10021973, + "step": 12183, + "time_per_iteration": 2.6027462482452393 + }, + { + "auxiliary_loss_clip": 0.0641363, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01258441, + "epoch": 0.7325417105065384, + "flos": 21511167079680.0, + "grad_norm": 1.8564082179513295, + "language_loss": 0.72663099, + "learning_rate": 7.04490617307045e-07, + "loss": 0.80346817, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11639404, + "step": 12184, + "time_per_iteration": 2.481126070022583 + }, + { + "auxiliary_loss_clip": 0.06312383, + "auxiliary_loss_mlp": 0.01252618, + "balance_loss_clip": 0.06257074, + "balance_loss_mlp": 0.01251615, + "epoch": 0.7326018337592064, + "flos": 67277514746880.0, + "grad_norm": 0.738407632839968, + "language_loss": 0.65071452, + "learning_rate": 7.041939308892344e-07, + "loss": 0.72636449, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01002502, + "step": 12185, + "time_per_iteration": 3.106149196624756 + }, + { + "auxiliary_loss_clip": 0.06409767, + "auxiliary_loss_mlp": 0.01263715, + "balance_loss_clip": 0.06272019, + "balance_loss_mlp": 0.01253278, + "epoch": 0.7326619570118743, + "flos": 22863733286400.0, + "grad_norm": 1.8830306075887209, + "language_loss": 0.8029325, + "learning_rate": 7.038972936085197e-07, + "loss": 0.87966728, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10443115, + "step": 12186, + "time_per_iteration": 3.9164252281188965 + }, + { + "auxiliary_loss_clip": 0.06409957, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.0125656, + "epoch": 0.7327220802645423, + "flos": 23333591456640.0, + "grad_norm": 3.1049708773187685, + "language_loss": 0.73623288, + "learning_rate": 7.036007054761508e-07, + "loss": 0.81300521, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10717773, + "step": 12187, + "time_per_iteration": 2.534468412399292 + }, + { + "auxiliary_loss_clip": 0.06412861, + "auxiliary_loss_mlp": 0.01267726, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01257462, + "epoch": 0.7327822035172102, + "flos": 23186578268160.0, + "grad_norm": 1.736323244132865, + "language_loss": 0.89323306, + "learning_rate": 7.033041665033716e-07, + "loss": 0.97003901, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10272217, + "step": 12188, + "time_per_iteration": 2.6024370193481445 + }, + { + "auxiliary_loss_clip": 0.06405529, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06268479, + "balance_loss_mlp": 0.01257449, + "epoch": 0.7328423267698783, + "flos": 21072517355520.0, + "grad_norm": 1.8789204802001953, + "language_loss": 0.75451827, + "learning_rate": 7.030076767014284e-07, + "loss": 0.83125293, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10479736, + "step": 12189, + "time_per_iteration": 2.4941177368164062 + }, + { + "auxiliary_loss_clip": 0.06409896, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257865, + "epoch": 0.7329024500225462, + "flos": 21696055113600.0, + "grad_norm": 1.5072102792760083, + "language_loss": 0.82332706, + "learning_rate": 7.027112360815648e-07, + "loss": 0.90010929, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10473633, + "step": 12190, + "time_per_iteration": 2.526470184326172 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01269765, + "balance_loss_clip": 0.06270757, + "balance_loss_mlp": 0.01258995, + "epoch": 0.7329625732752142, + "flos": 24169829604480.0, + "grad_norm": 1.85565696251354, + "language_loss": 0.72012609, + "learning_rate": 7.024148446550204e-07, + "loss": 0.79688656, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10766602, + "step": 12191, + "time_per_iteration": 3.952462673187256 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01267854, + "balance_loss_clip": 0.06271866, + "balance_loss_mlp": 0.01257793, + "epoch": 0.7330226965278822, + "flos": 30085227970560.0, + "grad_norm": 1.8630604521541774, + "language_loss": 0.69281983, + "learning_rate": 7.021185024330361e-07, + "loss": 0.76955318, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10058594, + "step": 12192, + "time_per_iteration": 2.569606065750122 + }, + { + "auxiliary_loss_clip": 0.06404717, + "auxiliary_loss_mlp": 0.01264705, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01254859, + "epoch": 0.7330828197805501, + "flos": 23375113954560.0, + "grad_norm": 2.149879925519752, + "language_loss": 0.73025858, + "learning_rate": 7.01822209426848e-07, + "loss": 0.80695283, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09844971, + "step": 12193, + "time_per_iteration": 2.5172417163848877 + }, + { + "auxiliary_loss_clip": 0.06408362, + "auxiliary_loss_mlp": 0.01270537, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01260207, + "epoch": 0.7331429430332181, + "flos": 21039170630400.0, + "grad_norm": 1.6561607292660703, + "language_loss": 0.77499682, + "learning_rate": 7.015259656476911e-07, + "loss": 0.85178578, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12194, + "time_per_iteration": 2.479529857635498 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01263406, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.0125285, + "epoch": 0.733203066285886, + "flos": 14653201334400.0, + "grad_norm": 1.6173563987107382, + "language_loss": 0.70813656, + "learning_rate": 7.012297711067998e-07, + "loss": 0.78482801, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10552979, + "step": 12195, + "time_per_iteration": 3.877392292022705 + }, + { + "auxiliary_loss_clip": 0.06408596, + "auxiliary_loss_mlp": 0.01263504, + "balance_loss_clip": 0.06272919, + "balance_loss_mlp": 0.01253991, + "epoch": 0.7332631895385541, + "flos": 17171013945600.0, + "grad_norm": 1.8915458632347482, + "language_loss": 0.72392344, + "learning_rate": 7.009336258154057e-07, + "loss": 0.80064452, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09515381, + "step": 12196, + "time_per_iteration": 2.475527286529541 + }, + { + "auxiliary_loss_clip": 0.0640474, + "auxiliary_loss_mlp": 0.01267096, + "balance_loss_clip": 0.06272123, + "balance_loss_mlp": 0.01256808, + "epoch": 0.733323312791222, + "flos": 28665758678400.0, + "grad_norm": 1.6827859274042947, + "language_loss": 0.7184931, + "learning_rate": 7.006375297847394e-07, + "loss": 0.79521143, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10290527, + "step": 12197, + "time_per_iteration": 2.535411834716797 + }, + { + "auxiliary_loss_clip": 0.06414885, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06273107, + "balance_loss_mlp": 0.0125918, + "epoch": 0.73338343604389, + "flos": 16624106346240.0, + "grad_norm": 1.8099581096795507, + "language_loss": 0.7810899, + "learning_rate": 7.003414830260282e-07, + "loss": 0.85794812, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11767578, + "step": 12198, + "time_per_iteration": 2.5611343383789062 + }, + { + "auxiliary_loss_clip": 0.06406511, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.0125661, + "epoch": 0.7334435592965579, + "flos": 21148434754560.0, + "grad_norm": 1.7977488720869146, + "language_loss": 0.74877429, + "learning_rate": 7.000454855504974e-07, + "loss": 0.82550371, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0982666, + "step": 12199, + "time_per_iteration": 2.549605369567871 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255984, + "epoch": 0.7335036825492259, + "flos": 17130455769600.0, + "grad_norm": 2.1057189118558655, + "language_loss": 0.76952875, + "learning_rate": 6.997495373693729e-07, + "loss": 0.84632576, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11175537, + "step": 12200, + "time_per_iteration": 2.4664149284362793 + }, + { + "auxiliary_loss_clip": 0.06406954, + "auxiliary_loss_mlp": 0.01269537, + "balance_loss_clip": 0.06272939, + "balance_loss_mlp": 0.01258874, + "epoch": 0.7335638058018938, + "flos": 23738475185280.0, + "grad_norm": 1.6692295634407006, + "language_loss": 0.61729515, + "learning_rate": 6.994536384938754e-07, + "loss": 0.69406003, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10662842, + "step": 12201, + "time_per_iteration": 2.5405964851379395 + }, + { + "auxiliary_loss_clip": 0.0640207, + "auxiliary_loss_mlp": 0.01264063, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7336239290545619, + "flos": 34941876871680.0, + "grad_norm": 1.7828880391385733, + "language_loss": 0.52268887, + "learning_rate": 6.991577889352264e-07, + "loss": 0.59935021, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09521484, + "step": 12202, + "time_per_iteration": 2.610280990600586 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270868, + "balance_loss_mlp": 0.01255082, + "epoch": 0.7336840523072298, + "flos": 21108966681600.0, + "grad_norm": 3.0029682825255706, + "language_loss": 0.686993, + "learning_rate": 6.98861988704645e-07, + "loss": 0.76368117, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09844971, + "step": 12203, + "time_per_iteration": 2.507932424545288 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.01270628, + "balance_loss_clip": 0.06272701, + "balance_loss_mlp": 0.01259959, + "epoch": 0.7337441755598978, + "flos": 24031243751040.0, + "grad_norm": 2.856553755482537, + "language_loss": 0.66825521, + "learning_rate": 6.985662378133474e-07, + "loss": 0.74510193, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10668945, + "step": 12204, + "time_per_iteration": 2.514671802520752 + }, + { + "auxiliary_loss_clip": 0.06406862, + "auxiliary_loss_mlp": 0.01263286, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7338042988125658, + "flos": 22717977909120.0, + "grad_norm": 1.8458208661726296, + "language_loss": 0.77401447, + "learning_rate": 6.982705362725479e-07, + "loss": 0.85071599, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09509277, + "step": 12205, + "time_per_iteration": 2.5407674312591553 + }, + { + "auxiliary_loss_clip": 0.06401809, + "auxiliary_loss_mlp": 0.01264175, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01255288, + "epoch": 0.7338644220652337, + "flos": 21367382273280.0, + "grad_norm": 2.465584123041792, + "language_loss": 0.80136371, + "learning_rate": 6.979748840934601e-07, + "loss": 0.87802351, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08892822, + "step": 12206, + "time_per_iteration": 2.505405902862549 + }, + { + "auxiliary_loss_clip": 0.06407475, + "auxiliary_loss_mlp": 0.01266198, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7339245453179017, + "flos": 30928216371840.0, + "grad_norm": 1.8649817824814656, + "language_loss": 0.71671152, + "learning_rate": 6.976792812872958e-07, + "loss": 0.79344821, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09747314, + "step": 12207, + "time_per_iteration": 2.5743727684020996 + }, + { + "auxiliary_loss_clip": 0.06311717, + "auxiliary_loss_mlp": 0.01252748, + "balance_loss_clip": 0.06256534, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7339846685705697, + "flos": 67916789873280.0, + "grad_norm": 0.7657187342696471, + "language_loss": 0.54859233, + "learning_rate": 6.97383727865263e-07, + "loss": 0.62423694, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.00988007, + "step": 12208, + "time_per_iteration": 3.215527057647705 + }, + { + "auxiliary_loss_clip": 0.06409256, + "auxiliary_loss_mlp": 0.01263774, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7340447918232377, + "flos": 22243298129280.0, + "grad_norm": 1.295062015849254, + "language_loss": 0.80369568, + "learning_rate": 6.970882238385703e-07, + "loss": 0.88042593, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.0980835, + "step": 12209, + "time_per_iteration": 2.604940414428711 + }, + { + "auxiliary_loss_clip": 0.06402272, + "auxiliary_loss_mlp": 0.01265832, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01256164, + "epoch": 0.7341049150759056, + "flos": 23770857588480.0, + "grad_norm": 1.3756281752304946, + "language_loss": 0.7923339, + "learning_rate": 6.96792769218423e-07, + "loss": 0.86901498, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09661865, + "step": 12210, + "time_per_iteration": 2.586808919906616 + }, + { + "auxiliary_loss_clip": 0.06405463, + "auxiliary_loss_mlp": 0.01263055, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253142, + "epoch": 0.7341650383285736, + "flos": 17241983953920.0, + "grad_norm": 1.587399394910607, + "language_loss": 0.76868075, + "learning_rate": 6.964973640160236e-07, + "loss": 0.84536588, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09918213, + "step": 12211, + "time_per_iteration": 2.5032119750976562 + }, + { + "auxiliary_loss_clip": 0.06406663, + "auxiliary_loss_mlp": 0.01269483, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01259464, + "epoch": 0.7342251615812415, + "flos": 23410640885760.0, + "grad_norm": 1.8683107617310235, + "language_loss": 0.7257871, + "learning_rate": 6.962020082425748e-07, + "loss": 0.80254853, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12212, + "time_per_iteration": 2.529822826385498 + }, + { + "auxiliary_loss_clip": 0.06408443, + "auxiliary_loss_mlp": 0.01264026, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7342852848339095, + "flos": 22753756402560.0, + "grad_norm": 1.4731208484223037, + "language_loss": 0.69065344, + "learning_rate": 6.959067019092766e-07, + "loss": 0.76737809, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 12213, + "time_per_iteration": 2.5050880908966064 + }, + { + "auxiliary_loss_clip": 0.06311147, + "auxiliary_loss_mlp": 0.01250993, + "balance_loss_clip": 0.06256209, + "balance_loss_mlp": 0.01250006, + "epoch": 0.7343454080865774, + "flos": 53960219856000.0, + "grad_norm": 0.6961582505379801, + "language_loss": 0.54205143, + "learning_rate": 6.956114450273276e-07, + "loss": 0.61767286, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00987244, + "step": 12214, + "time_per_iteration": 3.01758074760437 + }, + { + "auxiliary_loss_clip": 0.06412373, + "auxiliary_loss_mlp": 0.0126565, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01255058, + "epoch": 0.7344055313392455, + "flos": 12171754195200.0, + "grad_norm": 1.9351269551691648, + "language_loss": 0.70493495, + "learning_rate": 6.953162376079233e-07, + "loss": 0.78171515, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.105896, + "step": 12215, + "time_per_iteration": 2.450974941253662 + }, + { + "auxiliary_loss_clip": 0.06400481, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01256347, + "epoch": 0.7344656545919134, + "flos": 18555710993280.0, + "grad_norm": 1.5126294577685706, + "language_loss": 0.7330094, + "learning_rate": 6.950210796622573e-07, + "loss": 0.80967498, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09741211, + "step": 12216, + "time_per_iteration": 3.8361501693725586 + }, + { + "auxiliary_loss_clip": 0.06417778, + "auxiliary_loss_mlp": 0.01265589, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7345257778445814, + "flos": 23668762988160.0, + "grad_norm": 1.664988120098628, + "language_loss": 0.78114659, + "learning_rate": 6.947259712015236e-07, + "loss": 0.85798025, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11505127, + "step": 12217, + "time_per_iteration": 2.5286312103271484 + }, + { + "auxiliary_loss_clip": 0.06405286, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06273887, + "balance_loss_mlp": 0.01256056, + "epoch": 0.7345859010972494, + "flos": 13813818658560.0, + "grad_norm": 2.564959401036019, + "language_loss": 0.78167617, + "learning_rate": 6.94430912236911e-07, + "loss": 0.85838252, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09289551, + "step": 12218, + "time_per_iteration": 2.4696590900421143 + }, + { + "auxiliary_loss_clip": 0.06401719, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7346460243499173, + "flos": 22279202403840.0, + "grad_norm": 1.5944736181083394, + "language_loss": 0.72325158, + "learning_rate": 6.941359027796092e-07, + "loss": 0.79992205, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09777832, + "step": 12219, + "time_per_iteration": 2.5853631496429443 + }, + { + "auxiliary_loss_clip": 0.06402183, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01255531, + "epoch": 0.7347061476025853, + "flos": 23261447491200.0, + "grad_norm": 1.646626241048598, + "language_loss": 0.74960732, + "learning_rate": 6.938409428408061e-07, + "loss": 0.82627851, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09405518, + "step": 12220, + "time_per_iteration": 2.5074381828308105 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06272232, + "balance_loss_mlp": 0.01255384, + "epoch": 0.7347662708552533, + "flos": 15272881804800.0, + "grad_norm": 1.5752596580091636, + "language_loss": 0.65676045, + "learning_rate": 6.93546032431684e-07, + "loss": 0.73353267, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10742188, + "step": 12221, + "time_per_iteration": 2.4807536602020264 + }, + { + "auxiliary_loss_clip": 0.06407331, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01256809, + "epoch": 0.7348263941079213, + "flos": 24866349868800.0, + "grad_norm": 1.700720501906822, + "language_loss": 0.6957171, + "learning_rate": 6.932511715634273e-07, + "loss": 0.77245772, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 12222, + "time_per_iteration": 2.550657272338867 + }, + { + "auxiliary_loss_clip": 0.06405503, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06273381, + "balance_loss_mlp": 0.01257054, + "epoch": 0.7348865173605892, + "flos": 24358868415360.0, + "grad_norm": 1.4474540063064079, + "language_loss": 0.66394234, + "learning_rate": 6.92956360247217e-07, + "loss": 0.74065632, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08843994, + "step": 12223, + "time_per_iteration": 2.5699193477630615 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01254404, + "epoch": 0.7349466406132572, + "flos": 20009700967680.0, + "grad_norm": 2.3059227794211834, + "language_loss": 0.72692394, + "learning_rate": 6.926615984942332e-07, + "loss": 0.80362213, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09716797, + "step": 12224, + "time_per_iteration": 2.470388412475586 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01265671, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7350067638659251, + "flos": 29832766018560.0, + "grad_norm": 1.7299293804881801, + "language_loss": 0.72725701, + "learning_rate": 6.92366886315652e-07, + "loss": 0.80401695, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10900879, + "step": 12225, + "time_per_iteration": 2.596513509750366 + }, + { + "auxiliary_loss_clip": 0.06415132, + "auxiliary_loss_mlp": 0.0126347, + "balance_loss_clip": 0.06274726, + "balance_loss_mlp": 0.01252825, + "epoch": 0.7350668871185931, + "flos": 21871677271680.0, + "grad_norm": 1.7624309121462833, + "language_loss": 0.76816809, + "learning_rate": 6.920722237226501e-07, + "loss": 0.84495413, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10644531, + "step": 12226, + "time_per_iteration": 3.9786300659179688 + }, + { + "auxiliary_loss_clip": 0.06405763, + "auxiliary_loss_mlp": 0.01263929, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01254041, + "epoch": 0.735127010371261, + "flos": 22572893364480.0, + "grad_norm": 1.4073989113743075, + "language_loss": 0.67142195, + "learning_rate": 6.917776107264008e-07, + "loss": 0.74811888, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09893799, + "step": 12227, + "time_per_iteration": 2.5849621295928955 + }, + { + "auxiliary_loss_clip": 0.06410711, + "auxiliary_loss_mlp": 0.012626, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7351871336239291, + "flos": 25891333338240.0, + "grad_norm": 1.4691171153634894, + "language_loss": 0.63763392, + "learning_rate": 6.914830473380749e-07, + "loss": 0.71436703, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09680176, + "step": 12228, + "time_per_iteration": 2.535334587097168 + }, + { + "auxiliary_loss_clip": 0.06409031, + "auxiliary_loss_mlp": 0.01263285, + "balance_loss_clip": 0.06274029, + "balance_loss_mlp": 0.0125404, + "epoch": 0.735247256876597, + "flos": 17938126874880.0, + "grad_norm": 1.6163859960159983, + "language_loss": 0.6387676, + "learning_rate": 6.911885335688427e-07, + "loss": 0.7154907, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09246826, + "step": 12229, + "time_per_iteration": 2.5226519107818604 + }, + { + "auxiliary_loss_clip": 0.06409419, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06271814, + "balance_loss_mlp": 0.01253352, + "epoch": 0.735307380129265, + "flos": 28882484064000.0, + "grad_norm": 1.5503109559277863, + "language_loss": 0.734267, + "learning_rate": 6.908940694298726e-07, + "loss": 0.81100154, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10693359, + "step": 12230, + "time_per_iteration": 3.9754912853240967 + }, + { + "auxiliary_loss_clip": 0.06410781, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06275117, + "balance_loss_mlp": 0.01257177, + "epoch": 0.7353675033819329, + "flos": 13630691560320.0, + "grad_norm": 2.023268936424561, + "language_loss": 0.72356808, + "learning_rate": 6.90599654932332e-07, + "loss": 0.8003521, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10455322, + "step": 12231, + "time_per_iteration": 2.4864163398742676 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06275348, + "balance_loss_mlp": 0.01262003, + "epoch": 0.7354276266346009, + "flos": 19469040497280.0, + "grad_norm": 2.0034739477169965, + "language_loss": 0.64325827, + "learning_rate": 6.903052900873823e-07, + "loss": 0.72010976, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10906982, + "step": 12232, + "time_per_iteration": 2.5125675201416016 + }, + { + "auxiliary_loss_clip": 0.06407313, + "auxiliary_loss_mlp": 0.01267406, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01256922, + "epoch": 0.735487749887269, + "flos": 15776170554240.0, + "grad_norm": 1.8738456436799267, + "language_loss": 0.75562924, + "learning_rate": 6.900109749061874e-07, + "loss": 0.83237642, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10491943, + "step": 12233, + "time_per_iteration": 2.496495246887207 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7355478731399369, + "flos": 18266673934080.0, + "grad_norm": 1.8052457003626037, + "language_loss": 0.73313487, + "learning_rate": 6.897167093999079e-07, + "loss": 0.80989963, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10534668, + "step": 12234, + "time_per_iteration": 3.9552576541900635 + }, + { + "auxiliary_loss_clip": 0.064089, + "auxiliary_loss_mlp": 0.01265135, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01255104, + "epoch": 0.7356079963926049, + "flos": 26549307924480.0, + "grad_norm": 1.8318735304656244, + "language_loss": 0.59923625, + "learning_rate": 6.894224935797017e-07, + "loss": 0.67597657, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10028076, + "step": 12235, + "time_per_iteration": 2.536958932876587 + }, + { + "auxiliary_loss_clip": 0.06406462, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01255611, + "epoch": 0.7356681196452728, + "flos": 10782990224640.0, + "grad_norm": 2.1420111841430445, + "language_loss": 0.86364961, + "learning_rate": 6.891283274567259e-07, + "loss": 0.94037515, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10479736, + "step": 12236, + "time_per_iteration": 2.4920454025268555 + }, + { + "auxiliary_loss_clip": 0.0641176, + "auxiliary_loss_mlp": 0.01264567, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7357282428979408, + "flos": 19724730831360.0, + "grad_norm": 1.819458830371115, + "language_loss": 0.69971436, + "learning_rate": 6.888342110421364e-07, + "loss": 0.77647763, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1038208, + "step": 12237, + "time_per_iteration": 2.5083632469177246 + }, + { + "auxiliary_loss_clip": 0.0640821, + "auxiliary_loss_mlp": 0.01262709, + "balance_loss_clip": 0.06271386, + "balance_loss_mlp": 0.01252647, + "epoch": 0.7357883661506087, + "flos": 19470130600320.0, + "grad_norm": 1.6051120472726816, + "language_loss": 0.72315025, + "learning_rate": 6.885401443470839e-07, + "loss": 0.79985946, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10070801, + "step": 12238, + "time_per_iteration": 2.5418028831481934 + }, + { + "auxiliary_loss_clip": 0.06415435, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_clip": 0.06272001, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7358484894032767, + "flos": 27129897665280.0, + "grad_norm": 1.6224977172165573, + "language_loss": 0.73030883, + "learning_rate": 6.882461273827205e-07, + "loss": 0.8071416, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 12239, + "time_per_iteration": 2.57132887840271 + }, + { + "auxiliary_loss_clip": 0.06405096, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06275095, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7359086126559446, + "flos": 24509780818560.0, + "grad_norm": 1.236291832045993, + "language_loss": 0.79114598, + "learning_rate": 6.879521601601954e-07, + "loss": 0.8678351, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09033203, + "step": 12240, + "time_per_iteration": 2.574645757675171 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01256942, + "epoch": 0.7359687359086127, + "flos": 23337993795840.0, + "grad_norm": 1.821182153740144, + "language_loss": 0.83331031, + "learning_rate": 6.876582426906565e-07, + "loss": 0.91007674, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09454346, + "step": 12241, + "time_per_iteration": 2.5325047969818115 + }, + { + "auxiliary_loss_clip": 0.06407616, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252909, + "epoch": 0.7360288591612806, + "flos": 20199578319360.0, + "grad_norm": 1.8489352198230395, + "language_loss": 0.78972995, + "learning_rate": 6.873643749852484e-07, + "loss": 0.86643136, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09606934, + "step": 12242, + "time_per_iteration": 2.4817190170288086 + }, + { + "auxiliary_loss_clip": 0.06405145, + "auxiliary_loss_mlp": 0.01268429, + "balance_loss_clip": 0.06273502, + "balance_loss_mlp": 0.01258981, + "epoch": 0.7360889824139486, + "flos": 24979722842880.0, + "grad_norm": 1.7750845941868088, + "language_loss": 0.79797709, + "learning_rate": 6.870705570551145e-07, + "loss": 0.87471282, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09436035, + "step": 12243, + "time_per_iteration": 2.5396323204040527 + }, + { + "auxiliary_loss_clip": 0.06411023, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01253423, + "epoch": 0.7361491056666165, + "flos": 15017610741120.0, + "grad_norm": 2.051473837828663, + "language_loss": 0.74682987, + "learning_rate": 6.867767889113969e-07, + "loss": 0.82358325, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10888672, + "step": 12244, + "time_per_iteration": 2.468791961669922 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271558, + "balance_loss_mlp": 0.01256033, + "epoch": 0.7362092289192845, + "flos": 22937135063040.0, + "grad_norm": 1.5646917897943269, + "language_loss": 0.69797492, + "learning_rate": 6.864830705652347e-07, + "loss": 0.77473283, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10253906, + "step": 12245, + "time_per_iteration": 2.6041831970214844 + }, + { + "auxiliary_loss_clip": 0.06401391, + "auxiliary_loss_mlp": 0.01266236, + "balance_loss_clip": 0.06273212, + "balance_loss_mlp": 0.01255787, + "epoch": 0.7362693521719526, + "flos": 20708694927360.0, + "grad_norm": 1.4104590909640493, + "language_loss": 0.73381358, + "learning_rate": 6.861894020277658e-07, + "loss": 0.81048983, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.10449219, + "step": 12246, + "time_per_iteration": 2.5084409713745117 + }, + { + "auxiliary_loss_clip": 0.06402211, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01259833, + "epoch": 0.7363294754246205, + "flos": 13115747093760.0, + "grad_norm": 1.8401513132222869, + "language_loss": 0.73210883, + "learning_rate": 6.858957833101266e-07, + "loss": 0.80881691, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08758545, + "step": 12247, + "time_per_iteration": 2.5997636318206787 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01262591, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7363895986772885, + "flos": 14032598469120.0, + "grad_norm": 1.520275800225871, + "language_loss": 0.74474341, + "learning_rate": 6.856022144234526e-07, + "loss": 0.8214305, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09899902, + "step": 12248, + "time_per_iteration": 2.4908292293548584 + }, + { + "auxiliary_loss_clip": 0.06410165, + "auxiliary_loss_mlp": 0.01271268, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01261022, + "epoch": 0.7364497219299564, + "flos": 19726240204800.0, + "grad_norm": 1.8587136102784652, + "language_loss": 0.73065788, + "learning_rate": 6.853086953788727e-07, + "loss": 0.80747223, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1026001, + "step": 12249, + "time_per_iteration": 2.5477547645568848 + }, + { + "auxiliary_loss_clip": 0.06408364, + "auxiliary_loss_mlp": 0.01269722, + "balance_loss_clip": 0.06275103, + "balance_loss_mlp": 0.0125922, + "epoch": 0.7365098451826244, + "flos": 21367843470720.0, + "grad_norm": 1.7459434910305351, + "language_loss": 0.7680105, + "learning_rate": 6.850152261875189e-07, + "loss": 0.84479141, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.1050415, + "step": 12250, + "time_per_iteration": 2.50736665725708 + }, + { + "auxiliary_loss_clip": 0.06411077, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 0.0627429, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7365699684352923, + "flos": 23375030100480.0, + "grad_norm": 1.6059448981622937, + "language_loss": 0.71334994, + "learning_rate": 6.8472180686052e-07, + "loss": 0.79010946, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10839844, + "step": 12251, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0640523, + "auxiliary_loss_mlp": 0.01263198, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.01253584, + "epoch": 0.7366300916879603, + "flos": 59537610380160.0, + "grad_norm": 1.4529727777201047, + "language_loss": 0.66069037, + "learning_rate": 6.844284374090015e-07, + "loss": 0.73737466, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09606934, + "step": 12252, + "time_per_iteration": 2.884873628616333 + }, + { + "auxiliary_loss_clip": 0.06412438, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06274702, + "balance_loss_mlp": 0.01258488, + "epoch": 0.7366902149406283, + "flos": 20929445308800.0, + "grad_norm": 1.6593281267940243, + "language_loss": 0.79292876, + "learning_rate": 6.841351178440884e-07, + "loss": 0.86974359, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10559082, + "step": 12253, + "time_per_iteration": 2.56786847114563 + }, + { + "auxiliary_loss_clip": 0.06405851, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06274677, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7367503381932963, + "flos": 17353973335680.0, + "grad_norm": 1.9323805517919423, + "language_loss": 0.76607239, + "learning_rate": 6.83841848176905e-07, + "loss": 0.84275639, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08813477, + "step": 12254, + "time_per_iteration": 2.465092182159424 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.0125361, + "epoch": 0.7368104614459642, + "flos": 17827017960960.0, + "grad_norm": 3.2694109886339366, + "language_loss": 0.69397593, + "learning_rate": 6.835486284185692e-07, + "loss": 0.77070212, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10534668, + "step": 12255, + "time_per_iteration": 2.5002591609954834 + }, + { + "auxiliary_loss_clip": 0.06412044, + "auxiliary_loss_mlp": 0.01265607, + "balance_loss_clip": 0.06276523, + "balance_loss_mlp": 0.01255117, + "epoch": 0.7368705846986322, + "flos": 24612672032640.0, + "grad_norm": 1.5801315841847023, + "language_loss": 0.75219184, + "learning_rate": 6.832554585802012e-07, + "loss": 0.82896841, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 12256, + "time_per_iteration": 4.017148494720459 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.0126377, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7369307079513001, + "flos": 34978829322240.0, + "grad_norm": 1.5326155216287436, + "language_loss": 0.74032342, + "learning_rate": 6.829623386729182e-07, + "loss": 0.81704414, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09729004, + "step": 12257, + "time_per_iteration": 2.647477388381958 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01263484, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7369908312039681, + "flos": 21220872209280.0, + "grad_norm": 1.4761434387135868, + "language_loss": 0.78534251, + "learning_rate": 6.826692687078362e-07, + "loss": 0.86204708, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09521484, + "step": 12258, + "time_per_iteration": 2.572261333465576 + }, + { + "auxiliary_loss_clip": 0.06412143, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06274798, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7370509544566362, + "flos": 23630510799360.0, + "grad_norm": 1.4160381635671, + "language_loss": 0.66616917, + "learning_rate": 6.823762486960674e-07, + "loss": 0.74294007, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09759521, + "step": 12259, + "time_per_iteration": 2.507096290588379 + }, + { + "auxiliary_loss_clip": 0.06408918, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.06274989, + "balance_loss_mlp": 0.01254406, + "epoch": 0.7371110777093041, + "flos": 24834764079360.0, + "grad_norm": 1.6356397611324185, + "language_loss": 0.73572636, + "learning_rate": 6.820832786487225e-07, + "loss": 0.81246388, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 12260, + "time_per_iteration": 2.55729341506958 + }, + { + "auxiliary_loss_clip": 0.06410116, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274181, + "balance_loss_mlp": 0.01257217, + "epoch": 0.7371712009619721, + "flos": 23156292216960.0, + "grad_norm": 1.5911507549060615, + "language_loss": 0.7366817, + "learning_rate": 6.817903585769125e-07, + "loss": 0.81346196, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12261, + "time_per_iteration": 2.4976613521575928 + }, + { + "auxiliary_loss_clip": 0.06411919, + "auxiliary_loss_mlp": 0.01266277, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01254845, + "epoch": 0.73723132421464, + "flos": 23119675182720.0, + "grad_norm": 1.9595701183137586, + "language_loss": 0.67333376, + "learning_rate": 6.814974884917438e-07, + "loss": 0.75011569, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11425781, + "step": 12262, + "time_per_iteration": 2.5359151363372803 + }, + { + "auxiliary_loss_clip": 0.06410287, + "auxiliary_loss_mlp": 0.01266365, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01255881, + "epoch": 0.737291447467308, + "flos": 19278031115520.0, + "grad_norm": 1.8055684860594015, + "language_loss": 0.8872509, + "learning_rate": 6.81204668404322e-07, + "loss": 0.96401745, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10479736, + "step": 12263, + "time_per_iteration": 2.4645025730133057 + }, + { + "auxiliary_loss_clip": 0.06401009, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01253717, + "epoch": 0.7373515707199759, + "flos": 25125142803840.0, + "grad_norm": 1.5128594481302715, + "language_loss": 0.67552602, + "learning_rate": 6.809118983257522e-07, + "loss": 0.75216436, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09100342, + "step": 12264, + "time_per_iteration": 2.569833517074585 + }, + { + "auxiliary_loss_clip": 0.06405195, + "auxiliary_loss_mlp": 0.012641, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.0125442, + "epoch": 0.737411693972644, + "flos": 32415427290240.0, + "grad_norm": 1.6707890497545697, + "language_loss": 0.80282211, + "learning_rate": 6.806191782671356e-07, + "loss": 0.87951505, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09674072, + "step": 12265, + "time_per_iteration": 3.997997283935547 + }, + { + "auxiliary_loss_clip": 0.06415318, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06273959, + "balance_loss_mlp": 0.01253758, + "epoch": 0.7374718172253119, + "flos": 24322586797440.0, + "grad_norm": 1.6052844739789887, + "language_loss": 0.75045347, + "learning_rate": 6.803265082395711e-07, + "loss": 0.82725346, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10919189, + "step": 12266, + "time_per_iteration": 2.5624334812164307 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01267186, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.0125697, + "epoch": 0.7375319404779799, + "flos": 27162447776640.0, + "grad_norm": 1.557791078804126, + "language_loss": 0.73471284, + "learning_rate": 6.800338882541576e-07, + "loss": 0.81146955, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10217285, + "step": 12267, + "time_per_iteration": 2.561325788497925 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126606, + "balance_loss_clip": 0.06273606, + "balance_loss_mlp": 0.01256654, + "epoch": 0.7375920637306478, + "flos": 18885977061120.0, + "grad_norm": 1.9471728084971924, + "language_loss": 0.83236742, + "learning_rate": 6.797413183219923e-07, + "loss": 0.90911472, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09411621, + "step": 12268, + "time_per_iteration": 2.515185832977295 + }, + { + "auxiliary_loss_clip": 0.06403858, + "auxiliary_loss_mlp": 0.01268762, + "balance_loss_clip": 0.06272093, + "balance_loss_mlp": 0.01258641, + "epoch": 0.7376521869833158, + "flos": 15675291838080.0, + "grad_norm": 1.7639029349548874, + "language_loss": 0.73450869, + "learning_rate": 6.794487984541677e-07, + "loss": 0.81123489, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10113525, + "step": 12269, + "time_per_iteration": 3.9070801734924316 + }, + { + "auxiliary_loss_clip": 0.06414587, + "auxiliary_loss_mlp": 0.01264636, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01253186, + "epoch": 0.7377123102359837, + "flos": 36980146166400.0, + "grad_norm": 1.919355815322485, + "language_loss": 0.70780635, + "learning_rate": 6.791563286617776e-07, + "loss": 0.78459859, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11450195, + "step": 12270, + "time_per_iteration": 2.6150050163269043 + }, + { + "auxiliary_loss_clip": 0.06405621, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.06273162, + "balance_loss_mlp": 0.01257514, + "epoch": 0.7377724334886517, + "flos": 24502779002880.0, + "grad_norm": 1.650003260672948, + "language_loss": 0.69519281, + "learning_rate": 6.788639089559119e-07, + "loss": 0.77191985, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09564209, + "step": 12271, + "time_per_iteration": 2.545802593231201 + }, + { + "auxiliary_loss_clip": 0.06407182, + "auxiliary_loss_mlp": 0.01265449, + "balance_loss_clip": 0.06271105, + "balance_loss_mlp": 0.01254565, + "epoch": 0.7378325567413198, + "flos": 24397036750080.0, + "grad_norm": 2.0373077116973577, + "language_loss": 0.67736673, + "learning_rate": 6.785715393476586e-07, + "loss": 0.75409299, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10882568, + "step": 12272, + "time_per_iteration": 2.5161080360412598 + }, + { + "auxiliary_loss_clip": 0.064047, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06272876, + "balance_loss_mlp": 0.01255812, + "epoch": 0.7378926799939877, + "flos": 17421421472640.0, + "grad_norm": 1.6693820905355277, + "language_loss": 0.78472829, + "learning_rate": 6.782792198481049e-07, + "loss": 0.86143827, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10479736, + "step": 12273, + "time_per_iteration": 2.527449369430542 + }, + { + "auxiliary_loss_clip": 0.0640404, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01255111, + "epoch": 0.7379528032466557, + "flos": 18479374323840.0, + "grad_norm": 1.7204820046502844, + "language_loss": 0.83983135, + "learning_rate": 6.779869504683355e-07, + "loss": 0.91652346, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1005249, + "step": 12274, + "time_per_iteration": 3.8728952407836914 + }, + { + "auxiliary_loss_clip": 0.06420162, + "auxiliary_loss_mlp": 0.0126937, + "balance_loss_clip": 0.06277606, + "balance_loss_mlp": 0.01258414, + "epoch": 0.7380129264993236, + "flos": 17827814574720.0, + "grad_norm": 1.7616073867402775, + "language_loss": 0.7422626, + "learning_rate": 6.776947312194341e-07, + "loss": 0.81915796, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10961914, + "step": 12275, + "time_per_iteration": 2.528137445449829 + }, + { + "auxiliary_loss_clip": 0.06413853, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01259352, + "epoch": 0.7380730497519916, + "flos": 23003115753600.0, + "grad_norm": 1.6499843647208283, + "language_loss": 0.73819113, + "learning_rate": 6.774025621124813e-07, + "loss": 0.81503022, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10699463, + "step": 12276, + "time_per_iteration": 2.49808931350708 + }, + { + "auxiliary_loss_clip": 0.06408275, + "auxiliary_loss_mlp": 0.01262969, + "balance_loss_clip": 0.062733, + "balance_loss_mlp": 0.0125329, + "epoch": 0.7381331730046595, + "flos": 20272435044480.0, + "grad_norm": 1.938538877021236, + "language_loss": 0.77922094, + "learning_rate": 6.771104431585551e-07, + "loss": 0.85593343, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09680176, + "step": 12277, + "time_per_iteration": 2.5433340072631836 + }, + { + "auxiliary_loss_clip": 0.06408259, + "auxiliary_loss_mlp": 0.01270849, + "balance_loss_clip": 0.06276105, + "balance_loss_mlp": 0.01260495, + "epoch": 0.7381932962573275, + "flos": 19760467397760.0, + "grad_norm": 1.5941630218798921, + "language_loss": 0.79001057, + "learning_rate": 6.768183743687338e-07, + "loss": 0.86680162, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10351562, + "step": 12278, + "time_per_iteration": 2.5074949264526367 + }, + { + "auxiliary_loss_clip": 0.06409795, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06271898, + "balance_loss_mlp": 0.01254248, + "epoch": 0.7382534195099955, + "flos": 17310060996480.0, + "grad_norm": 3.5373334504988474, + "language_loss": 0.71857256, + "learning_rate": 6.765263557540921e-07, + "loss": 0.79531866, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10577393, + "step": 12279, + "time_per_iteration": 2.516350269317627 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01266626, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7383135427626635, + "flos": 18703269233280.0, + "grad_norm": 2.101190205716009, + "language_loss": 0.85982198, + "learning_rate": 6.762343873257034e-07, + "loss": 0.93659103, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10388184, + "step": 12280, + "time_per_iteration": 2.4823272228240967 + }, + { + "auxiliary_loss_clip": 0.06411093, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7383736660153314, + "flos": 20886706926720.0, + "grad_norm": 1.8639643742325518, + "language_loss": 0.72394395, + "learning_rate": 6.759424690946408e-07, + "loss": 0.80069995, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11016846, + "step": 12281, + "time_per_iteration": 2.5224528312683105 + }, + { + "auxiliary_loss_clip": 0.06412193, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01255821, + "epoch": 0.7384337892679994, + "flos": 20668723729920.0, + "grad_norm": 1.7354362664323408, + "language_loss": 0.61005342, + "learning_rate": 6.756506010719711e-07, + "loss": 0.68683791, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10449219, + "step": 12282, + "time_per_iteration": 2.5047874450683594 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01256945, + "epoch": 0.7384939125206673, + "flos": 29177432835840.0, + "grad_norm": 1.7016014462601576, + "language_loss": 0.6800909, + "learning_rate": 6.753587832687632e-07, + "loss": 0.75690794, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10632324, + "step": 12283, + "time_per_iteration": 2.5679969787597656 + }, + { + "auxiliary_loss_clip": 0.06408164, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06274041, + "balance_loss_mlp": 0.01256636, + "epoch": 0.7385540357733353, + "flos": 36320494498560.0, + "grad_norm": 1.58111004650423, + "language_loss": 0.76160252, + "learning_rate": 6.750670156960832e-07, + "loss": 0.83835149, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10095215, + "step": 12284, + "time_per_iteration": 2.6471667289733887 + }, + { + "auxiliary_loss_clip": 0.06415117, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.01255028, + "epoch": 0.7386141590260034, + "flos": 20308758589440.0, + "grad_norm": 2.367235737464537, + "language_loss": 0.69446218, + "learning_rate": 6.747752983649954e-07, + "loss": 0.77127063, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10705566, + "step": 12285, + "time_per_iteration": 2.473684549331665 + }, + { + "auxiliary_loss_clip": 0.06417808, + "auxiliary_loss_mlp": 0.01266655, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.0125499, + "epoch": 0.7386742822786713, + "flos": 25490851948800.0, + "grad_norm": 1.8974918118522153, + "language_loss": 0.80231923, + "learning_rate": 6.744836312865602e-07, + "loss": 0.87916386, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11669922, + "step": 12286, + "time_per_iteration": 2.552478313446045 + }, + { + "auxiliary_loss_clip": 0.06409865, + "auxiliary_loss_mlp": 0.01264773, + "balance_loss_clip": 0.06276139, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7387344055313393, + "flos": 13777075843200.0, + "grad_norm": 2.0836319453796452, + "language_loss": 0.65815514, + "learning_rate": 6.741920144718396e-07, + "loss": 0.73490155, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09881592, + "step": 12287, + "time_per_iteration": 2.47298264503479 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01265177, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255557, + "epoch": 0.7387945287840072, + "flos": 27862615693440.0, + "grad_norm": 1.674403553414071, + "language_loss": 0.76529717, + "learning_rate": 6.739004479318903e-07, + "loss": 0.84198946, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09619141, + "step": 12288, + "time_per_iteration": 2.5699422359466553 + }, + { + "auxiliary_loss_clip": 0.06413888, + "auxiliary_loss_mlp": 0.0126915, + "balance_loss_clip": 0.06274378, + "balance_loss_mlp": 0.01257689, + "epoch": 0.7388546520366752, + "flos": 44242492515840.0, + "grad_norm": 1.8421640794180243, + "language_loss": 0.58466721, + "learning_rate": 6.736089316777684e-07, + "loss": 0.66149765, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11462402, + "step": 12289, + "time_per_iteration": 2.691962242126465 + }, + { + "auxiliary_loss_clip": 0.06318665, + "auxiliary_loss_mlp": 0.01255253, + "balance_loss_clip": 0.06263465, + "balance_loss_mlp": 0.01254091, + "epoch": 0.7389147752893431, + "flos": 70700145672960.0, + "grad_norm": 0.6181631309216685, + "language_loss": 0.49242556, + "learning_rate": 6.733174657205287e-07, + "loss": 0.56816471, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.01159668, + "step": 12290, + "time_per_iteration": 3.2382025718688965 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01269006, + "balance_loss_clip": 0.0627414, + "balance_loss_mlp": 0.01256811, + "epoch": 0.7389748985420111, + "flos": 26002190689920.0, + "grad_norm": 1.6462515447687802, + "language_loss": 0.67644894, + "learning_rate": 6.730260500712237e-07, + "loss": 0.75324321, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.12182617, + "step": 12291, + "time_per_iteration": 2.5330934524536133 + }, + { + "auxiliary_loss_clip": 0.06323051, + "auxiliary_loss_mlp": 0.01253715, + "balance_loss_clip": 0.06267922, + "balance_loss_mlp": 0.01252465, + "epoch": 0.7390350217946791, + "flos": 54419428558080.0, + "grad_norm": 0.9538265155410941, + "language_loss": 0.60977232, + "learning_rate": 6.727346847409052e-07, + "loss": 0.68553996, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01249695, + "step": 12292, + "time_per_iteration": 2.809068202972412 + }, + { + "auxiliary_loss_clip": 0.06409512, + "auxiliary_loss_mlp": 0.01265193, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255388, + "epoch": 0.7390951450473471, + "flos": 32205116741760.0, + "grad_norm": 2.042192821638958, + "language_loss": 0.67519832, + "learning_rate": 6.724433697406191e-07, + "loss": 0.75194532, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09814453, + "step": 12293, + "time_per_iteration": 2.633490800857544 + }, + { + "auxiliary_loss_clip": 0.06407283, + "auxiliary_loss_mlp": 0.01264321, + "balance_loss_clip": 0.06273873, + "balance_loss_mlp": 0.01253682, + "epoch": 0.739155268300015, + "flos": 16688745371520.0, + "grad_norm": 1.7465858872032636, + "language_loss": 0.84024155, + "learning_rate": 6.721521050814134e-07, + "loss": 0.91695762, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10644531, + "step": 12294, + "time_per_iteration": 2.4902942180633545 + }, + { + "auxiliary_loss_clip": 0.064035, + "auxiliary_loss_mlp": 0.01264966, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254976, + "epoch": 0.739215391552683, + "flos": 31657831799040.0, + "grad_norm": 1.4686013728036598, + "language_loss": 0.72988927, + "learning_rate": 6.718608907743337e-07, + "loss": 0.80657387, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09985352, + "step": 12295, + "time_per_iteration": 4.01623272895813 + }, + { + "auxiliary_loss_clip": 0.06404971, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01254906, + "epoch": 0.7392755148053509, + "flos": 29726688349440.0, + "grad_norm": 1.6462168088608014, + "language_loss": 0.78829199, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8649857, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09490967, + "step": 12296, + "time_per_iteration": 2.6365103721618652 + }, + { + "auxiliary_loss_clip": 0.06404981, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.0627135, + "balance_loss_mlp": 0.01256008, + "epoch": 0.7393356380580189, + "flos": 37059585436800.0, + "grad_norm": 1.8865876945980686, + "language_loss": 0.67489415, + "learning_rate": 6.712786132607182e-07, + "loss": 0.75161421, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.11010742, + "step": 12297, + "time_per_iteration": 2.6924734115600586 + }, + { + "auxiliary_loss_clip": 0.06407569, + "auxiliary_loss_mlp": 0.01264759, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01254447, + "epoch": 0.739395761310687, + "flos": 19725820934400.0, + "grad_norm": 1.5263040230444953, + "language_loss": 0.68836749, + "learning_rate": 6.709875500762645e-07, + "loss": 0.7650907, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10308838, + "step": 12298, + "time_per_iteration": 2.501797914505005 + }, + { + "auxiliary_loss_clip": 0.06407927, + "auxiliary_loss_mlp": 0.01267097, + "balance_loss_clip": 0.06273854, + "balance_loss_mlp": 0.01256559, + "epoch": 0.7394558845633549, + "flos": 11806254685440.0, + "grad_norm": 2.783354408484115, + "language_loss": 0.74698675, + "learning_rate": 6.706965372880946e-07, + "loss": 0.82373697, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10534668, + "step": 12299, + "time_per_iteration": 2.479194164276123 + }, + { + "auxiliary_loss_clip": 0.06317861, + "auxiliary_loss_mlp": 0.01251014, + "balance_loss_clip": 0.06262733, + "balance_loss_mlp": 0.01249821, + "epoch": 0.7395160078160229, + "flos": 66214782213120.0, + "grad_norm": 0.7124865082748734, + "language_loss": 0.60634726, + "learning_rate": 6.704055749072455e-07, + "loss": 0.68203598, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01190948, + "step": 12300, + "time_per_iteration": 3.154963493347168 + }, + { + "auxiliary_loss_clip": 0.06409278, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7395761310686908, + "flos": 21255770234880.0, + "grad_norm": 1.6643476346606387, + "language_loss": 0.80243456, + "learning_rate": 6.7011466294475e-07, + "loss": 0.87917793, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1060791, + "step": 12301, + "time_per_iteration": 2.529728889465332 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01264915, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254508, + "epoch": 0.7396362543213588, + "flos": 25961967930240.0, + "grad_norm": 1.3607409082618038, + "language_loss": 0.72955477, + "learning_rate": 6.698238014116406e-07, + "loss": 0.80628592, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10406494, + "step": 12302, + "time_per_iteration": 2.546940326690674 + }, + { + "auxiliary_loss_clip": 0.06409822, + "auxiliary_loss_mlp": 0.01265837, + "balance_loss_clip": 0.06272913, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7396963775740267, + "flos": 27384791385600.0, + "grad_norm": 1.8966052271775322, + "language_loss": 0.74529129, + "learning_rate": 6.695329903189451e-07, + "loss": 0.82204789, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1027832, + "step": 12303, + "time_per_iteration": 2.5615267753601074 + }, + { + "auxiliary_loss_clip": 0.06403703, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06271822, + "balance_loss_mlp": 0.01255546, + "epoch": 0.7397565008266948, + "flos": 25527175493760.0, + "grad_norm": 1.6634023085525402, + "language_loss": 0.54497898, + "learning_rate": 6.692422296776927e-07, + "loss": 0.62166452, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09307861, + "step": 12304, + "time_per_iteration": 2.5219099521636963 + }, + { + "auxiliary_loss_clip": 0.06408396, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06273419, + "balance_loss_mlp": 0.01253808, + "epoch": 0.7398166240793627, + "flos": 23733737429760.0, + "grad_norm": 6.743550792885306, + "language_loss": 0.84620976, + "learning_rate": 6.689515194989084e-07, + "loss": 0.92293161, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09979248, + "step": 12305, + "time_per_iteration": 3.947659969329834 + }, + { + "auxiliary_loss_clip": 0.06311572, + "auxiliary_loss_mlp": 0.01252487, + "balance_loss_clip": 0.06256508, + "balance_loss_mlp": 0.01251203, + "epoch": 0.7398767473320307, + "flos": 67289002755840.0, + "grad_norm": 0.8626934880407965, + "language_loss": 0.57769525, + "learning_rate": 6.68660859793615e-07, + "loss": 0.65333581, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.0128479, + "step": 12306, + "time_per_iteration": 3.1756792068481445 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01263791, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.0125327, + "epoch": 0.7399368705846986, + "flos": 22025356859520.0, + "grad_norm": 1.7963583951725388, + "language_loss": 0.81658536, + "learning_rate": 6.683702505728355e-07, + "loss": 0.89333415, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10516357, + "step": 12307, + "time_per_iteration": 2.506915330886841 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06274117, + "balance_loss_mlp": 0.01255696, + "epoch": 0.7399969938373666, + "flos": 14179150460160.0, + "grad_norm": 1.6050625884123768, + "language_loss": 0.70237017, + "learning_rate": 6.680796918475893e-07, + "loss": 0.77905583, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09051514, + "step": 12308, + "time_per_iteration": 3.91337513923645 + }, + { + "auxiliary_loss_clip": 0.06401709, + "auxiliary_loss_mlp": 0.01262204, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01252459, + "epoch": 0.7400571170900345, + "flos": 25308521464320.0, + "grad_norm": 1.6982405979686375, + "language_loss": 0.81117153, + "learning_rate": 6.67789183628896e-07, + "loss": 0.88781071, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09747314, + "step": 12309, + "time_per_iteration": 2.5796985626220703 + }, + { + "auxiliary_loss_clip": 0.06409381, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01258534, + "epoch": 0.7401172403427025, + "flos": 22718019836160.0, + "grad_norm": 5.238582270491251, + "language_loss": 0.73371196, + "learning_rate": 6.674987259277692e-07, + "loss": 0.81049991, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10882568, + "step": 12310, + "time_per_iteration": 2.5165646076202393 + }, + { + "auxiliary_loss_clip": 0.06409644, + "auxiliary_loss_mlp": 0.01269084, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01257669, + "epoch": 0.7401773635953706, + "flos": 18071639556480.0, + "grad_norm": 2.7222235322625417, + "language_loss": 0.89223385, + "learning_rate": 6.672083187552239e-07, + "loss": 0.96902108, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11425781, + "step": 12311, + "time_per_iteration": 2.467475652694702 + }, + { + "auxiliary_loss_clip": 0.0640601, + "auxiliary_loss_mlp": 0.01266757, + "balance_loss_clip": 0.06272036, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7402374868480385, + "flos": 22718942231040.0, + "grad_norm": 1.4999851664761075, + "language_loss": 0.8031621, + "learning_rate": 6.669179621222738e-07, + "loss": 0.87988985, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09832764, + "step": 12312, + "time_per_iteration": 2.5331287384033203 + }, + { + "auxiliary_loss_clip": 0.06405149, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272588, + "balance_loss_mlp": 0.01255072, + "epoch": 0.7402976101007065, + "flos": 22863272088960.0, + "grad_norm": 1.7972684240515402, + "language_loss": 0.78719336, + "learning_rate": 6.666276560399273e-07, + "loss": 0.86389416, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09857178, + "step": 12313, + "time_per_iteration": 2.5370211601257324 + }, + { + "auxiliary_loss_clip": 0.06407566, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.0626882, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7403577333533744, + "flos": 12350143537920.0, + "grad_norm": 1.8417739265455044, + "language_loss": 0.79031622, + "learning_rate": 6.663374005191937e-07, + "loss": 0.86704326, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10827637, + "step": 12314, + "time_per_iteration": 3.856675148010254 + }, + { + "auxiliary_loss_clip": 0.06317294, + "auxiliary_loss_mlp": 0.01250351, + "balance_loss_clip": 0.06261952, + "balance_loss_mlp": 0.01249078, + "epoch": 0.7404178566060424, + "flos": 60346189152000.0, + "grad_norm": 0.8038008604712399, + "language_loss": 0.55230701, + "learning_rate": 6.660471955710809e-07, + "loss": 0.62798345, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01273346, + "step": 12315, + "time_per_iteration": 3.094839334487915 + }, + { + "auxiliary_loss_clip": 0.06400545, + "auxiliary_loss_mlp": 0.01269055, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01259298, + "epoch": 0.7404779798587103, + "flos": 32022786257280.0, + "grad_norm": 1.42588959053577, + "language_loss": 0.79849303, + "learning_rate": 6.65757041206591e-07, + "loss": 0.87518907, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09759521, + "step": 12316, + "time_per_iteration": 2.6217541694641113 + }, + { + "auxiliary_loss_clip": 0.06405086, + "auxiliary_loss_mlp": 0.01263693, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7405381031113784, + "flos": 12893571192960.0, + "grad_norm": 1.9031027598783419, + "language_loss": 0.74949759, + "learning_rate": 6.654669374367275e-07, + "loss": 0.82618535, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09997559, + "step": 12317, + "time_per_iteration": 2.4909305572509766 + }, + { + "auxiliary_loss_clip": 0.06398293, + "auxiliary_loss_mlp": 0.01265661, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01256625, + "epoch": 0.7405982263640463, + "flos": 20235189104640.0, + "grad_norm": 1.7604511064610666, + "language_loss": 0.81780982, + "learning_rate": 6.651768842724917e-07, + "loss": 0.89444935, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09039307, + "step": 12318, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256317, + "epoch": 0.7406583496167143, + "flos": 17573088562560.0, + "grad_norm": 1.866306408499981, + "language_loss": 0.76751161, + "learning_rate": 6.648868817248827e-07, + "loss": 0.84425652, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09887695, + "step": 12319, + "time_per_iteration": 2.4622530937194824 + }, + { + "auxiliary_loss_clip": 0.0640564, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06272121, + "balance_loss_mlp": 0.01253645, + "epoch": 0.7407184728693822, + "flos": 18301530032640.0, + "grad_norm": 2.0432497673800563, + "language_loss": 0.63919193, + "learning_rate": 6.64596929804897e-07, + "loss": 0.71588171, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09698486, + "step": 12320, + "time_per_iteration": 2.491823196411133 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01263353, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01252761, + "epoch": 0.7407785961220502, + "flos": 16696124530560.0, + "grad_norm": 2.5007986584617767, + "language_loss": 0.82488716, + "learning_rate": 6.643070285235288e-07, + "loss": 0.90164608, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10583496, + "step": 12321, + "time_per_iteration": 2.472942352294922 + }, + { + "auxiliary_loss_clip": 0.06413056, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01263488, + "epoch": 0.7408387193747181, + "flos": 22094440151040.0, + "grad_norm": 1.687827757394498, + "language_loss": 0.72481614, + "learning_rate": 6.640171778917727e-07, + "loss": 0.80170149, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11993408, + "step": 12322, + "time_per_iteration": 2.5148372650146484 + }, + { + "auxiliary_loss_clip": 0.06410389, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275401, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7408988426273861, + "flos": 24242476694400.0, + "grad_norm": 1.7223397407589476, + "language_loss": 0.64227688, + "learning_rate": 6.637273779206183e-07, + "loss": 0.71903044, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09991455, + "step": 12323, + "time_per_iteration": 2.545907735824585 + }, + { + "auxiliary_loss_clip": 0.06410556, + "auxiliary_loss_mlp": 0.01267934, + "balance_loss_clip": 0.06273916, + "balance_loss_mlp": 0.01257348, + "epoch": 0.7409589658800542, + "flos": 29030671209600.0, + "grad_norm": 1.3447635409056256, + "language_loss": 0.76155257, + "learning_rate": 6.634376286210559e-07, + "loss": 0.83833748, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.105896, + "step": 12324, + "time_per_iteration": 2.6743714809417725 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01264226, + "balance_loss_clip": 0.06272118, + "balance_loss_mlp": 0.01254272, + "epoch": 0.7410190891327221, + "flos": 19356925334400.0, + "grad_norm": 13.963490844682125, + "language_loss": 0.74922419, + "learning_rate": 6.63147930004073e-07, + "loss": 0.82592261, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 12325, + "time_per_iteration": 2.471677780151367 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01267104, + "balance_loss_clip": 0.06275749, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7410792123853901, + "flos": 22754301454080.0, + "grad_norm": 1.6510689232341687, + "language_loss": 0.68920004, + "learning_rate": 6.628582820806545e-07, + "loss": 0.76603806, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10906982, + "step": 12326, + "time_per_iteration": 2.544271469116211 + }, + { + "auxiliary_loss_clip": 0.06406512, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01259943, + "epoch": 0.741139335638058, + "flos": 25379156056320.0, + "grad_norm": 2.684979070680883, + "language_loss": 0.89408934, + "learning_rate": 6.625686848617835e-07, + "loss": 0.97085506, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10113525, + "step": 12327, + "time_per_iteration": 2.514342784881592 + }, + { + "auxiliary_loss_clip": 0.06405853, + "auxiliary_loss_mlp": 0.01270995, + "balance_loss_clip": 0.0627297, + "balance_loss_mlp": 0.01260326, + "epoch": 0.741199458890726, + "flos": 18591154070400.0, + "grad_norm": 1.616289045038266, + "language_loss": 0.86022431, + "learning_rate": 6.62279138358442e-07, + "loss": 0.93699282, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10668945, + "step": 12328, + "time_per_iteration": 2.546849012374878 + }, + { + "auxiliary_loss_clip": 0.06404015, + "auxiliary_loss_mlp": 0.01266041, + "balance_loss_clip": 0.06273206, + "balance_loss_mlp": 0.01256373, + "epoch": 0.7412595821433939, + "flos": 22133572807680.0, + "grad_norm": 3.0862478099951476, + "language_loss": 0.66898477, + "learning_rate": 6.619896425816103e-07, + "loss": 0.74568534, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09655762, + "step": 12329, + "time_per_iteration": 2.4837799072265625 + }, + { + "auxiliary_loss_clip": 0.06415252, + "auxiliary_loss_mlp": 0.01271747, + "balance_loss_clip": 0.06274865, + "balance_loss_mlp": 0.01261262, + "epoch": 0.741319705396062, + "flos": 29177516689920.0, + "grad_norm": 1.6153996639831127, + "language_loss": 0.67172372, + "learning_rate": 6.617001975422647e-07, + "loss": 0.74859369, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10479736, + "step": 12330, + "time_per_iteration": 2.59244441986084 + }, + { + "auxiliary_loss_clip": 0.06414045, + "auxiliary_loss_mlp": 0.01265631, + "balance_loss_clip": 0.06274007, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7413798286487299, + "flos": 20673713047680.0, + "grad_norm": 1.8418070280678467, + "language_loss": 0.85594726, + "learning_rate": 6.614108032513823e-07, + "loss": 0.93274403, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11169434, + "step": 12331, + "time_per_iteration": 2.6050429344177246 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01264119, + "balance_loss_clip": 0.06275013, + "balance_loss_mlp": 0.01253837, + "epoch": 0.7414399519013979, + "flos": 16404446067840.0, + "grad_norm": 1.9259075760322277, + "language_loss": 0.69746608, + "learning_rate": 6.611214597199364e-07, + "loss": 0.77421594, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 12332, + "time_per_iteration": 2.519845485687256 + }, + { + "auxiliary_loss_clip": 0.06408165, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273398, + "balance_loss_mlp": 0.01255761, + "epoch": 0.7415000751540658, + "flos": 25637403939840.0, + "grad_norm": 1.899841467346803, + "language_loss": 0.63552696, + "learning_rate": 6.608321669588984e-07, + "loss": 0.71227038, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10418701, + "step": 12333, + "time_per_iteration": 2.5220582485198975 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126491, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255391, + "epoch": 0.7415601984067338, + "flos": 24506803998720.0, + "grad_norm": 1.7352435942597948, + "language_loss": 0.7115826, + "learning_rate": 6.605429249792387e-07, + "loss": 0.78826714, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09521484, + "step": 12334, + "time_per_iteration": 3.9428293704986572 + }, + { + "auxiliary_loss_clip": 0.0640265, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06269788, + "balance_loss_mlp": 0.01253628, + "epoch": 0.7416203216594017, + "flos": 20893541034240.0, + "grad_norm": 1.579239832257194, + "language_loss": 0.82769573, + "learning_rate": 6.602537337919257e-07, + "loss": 0.90436113, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10253906, + "step": 12335, + "time_per_iteration": 2.5163700580596924 + }, + { + "auxiliary_loss_clip": 0.06406333, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01257556, + "epoch": 0.7416804449120697, + "flos": 15628276897920.0, + "grad_norm": 2.378220107859676, + "language_loss": 0.75595701, + "learning_rate": 6.599645934079259e-07, + "loss": 0.832699, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10308838, + "step": 12336, + "time_per_iteration": 2.471386432647705 + }, + { + "auxiliary_loss_clip": 0.06412801, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01255582, + "epoch": 0.7417405681647377, + "flos": 17124795619200.0, + "grad_norm": 1.7670482081057908, + "language_loss": 0.73856127, + "learning_rate": 6.596755038382029e-07, + "loss": 0.8153441, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09899902, + "step": 12337, + "time_per_iteration": 2.466338872909546 + }, + { + "auxiliary_loss_clip": 0.06405115, + "auxiliary_loss_mlp": 0.01266953, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01257428, + "epoch": 0.7418006914174057, + "flos": 18886354404480.0, + "grad_norm": 1.7252215797420232, + "language_loss": 0.76747906, + "learning_rate": 6.593864650937186e-07, + "loss": 0.84419966, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09527588, + "step": 12338, + "time_per_iteration": 2.4993648529052734 + }, + { + "auxiliary_loss_clip": 0.06403196, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01256993, + "epoch": 0.7418608146700737, + "flos": 21587294113920.0, + "grad_norm": 1.629364816328998, + "language_loss": 0.72958922, + "learning_rate": 6.590974771854345e-07, + "loss": 0.80628407, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12339, + "time_per_iteration": 2.4901506900787354 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06271182, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7419209379227416, + "flos": 22346063562240.0, + "grad_norm": 3.4897351250421322, + "language_loss": 0.79916894, + "learning_rate": 6.588085401243077e-07, + "loss": 0.87583876, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 12340, + "time_per_iteration": 2.5338644981384277 + }, + { + "auxiliary_loss_clip": 0.0640725, + "auxiliary_loss_mlp": 0.0126408, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01254168, + "epoch": 0.7419810611754096, + "flos": 16767639590400.0, + "grad_norm": 1.374564761122075, + "language_loss": 0.76099288, + "learning_rate": 6.585196539212958e-07, + "loss": 0.83770621, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09912109, + "step": 12341, + "time_per_iteration": 2.495758056640625 + }, + { + "auxiliary_loss_clip": 0.06401518, + "auxiliary_loss_mlp": 0.01269793, + "balance_loss_clip": 0.06276906, + "balance_loss_mlp": 0.01260292, + "epoch": 0.7420411844280775, + "flos": 26220048105600.0, + "grad_norm": 1.417674408189636, + "language_loss": 0.80324268, + "learning_rate": 6.582308185873535e-07, + "loss": 0.87995577, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.09503174, + "step": 12342, + "time_per_iteration": 2.5588223934173584 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01256328, + "epoch": 0.7421013076807456, + "flos": 68542354857600.0, + "grad_norm": 1.7864358028362888, + "language_loss": 0.7745598, + "learning_rate": 6.57942034133433e-07, + "loss": 0.85127044, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09741211, + "step": 12343, + "time_per_iteration": 2.893523693084717 + }, + { + "auxiliary_loss_clip": 0.0640204, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06267961, + "balance_loss_mlp": 0.01257482, + "epoch": 0.7421614309334135, + "flos": 24432144410880.0, + "grad_norm": 1.492444453579108, + "language_loss": 0.68024582, + "learning_rate": 6.576533005704843e-07, + "loss": 0.75694287, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10186768, + "step": 12344, + "time_per_iteration": 4.0460686683654785 + }, + { + "auxiliary_loss_clip": 0.0640749, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7422215541860815, + "flos": 12315706709760.0, + "grad_norm": 2.0673948051612983, + "language_loss": 0.81438386, + "learning_rate": 6.573646179094572e-07, + "loss": 0.89110589, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10522461, + "step": 12345, + "time_per_iteration": 2.5168869495391846 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01263643, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01253975, + "epoch": 0.7422816774387494, + "flos": 19651580616960.0, + "grad_norm": 1.781451237104089, + "language_loss": 0.70713991, + "learning_rate": 6.570759861612988e-07, + "loss": 0.7838285, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09661865, + "step": 12346, + "time_per_iteration": 2.481515407562256 + }, + { + "auxiliary_loss_clip": 0.06407449, + "auxiliary_loss_mlp": 0.01266551, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.0125683, + "epoch": 0.7423418006914174, + "flos": 32024337557760.0, + "grad_norm": 1.4530238546108785, + "language_loss": 0.73483253, + "learning_rate": 6.56787405336953e-07, + "loss": 0.81157255, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.097229, + "step": 12347, + "time_per_iteration": 2.6118276119232178 + }, + { + "auxiliary_loss_clip": 0.06410117, + "auxiliary_loss_mlp": 0.01263875, + "balance_loss_clip": 0.06271449, + "balance_loss_mlp": 0.01253355, + "epoch": 0.7424019239440853, + "flos": 18923013365760.0, + "grad_norm": 2.221279445831195, + "language_loss": 0.81336832, + "learning_rate": 6.564988754473642e-07, + "loss": 0.89010823, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10522461, + "step": 12348, + "time_per_iteration": 3.9795804023742676 + }, + { + "auxiliary_loss_clip": 0.06404714, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_clip": 0.06274206, + "balance_loss_mlp": 0.01254827, + "epoch": 0.7424620471967533, + "flos": 35884360396800.0, + "grad_norm": 1.7176907745599117, + "language_loss": 0.72897398, + "learning_rate": 6.562103965034724e-07, + "loss": 0.8056671, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09765625, + "step": 12349, + "time_per_iteration": 2.5986247062683105 + }, + { + "auxiliary_loss_clip": 0.0641204, + "auxiliary_loss_mlp": 0.01266614, + "balance_loss_clip": 0.06272119, + "balance_loss_mlp": 0.01255629, + "epoch": 0.7425221704494213, + "flos": 27023987704320.0, + "grad_norm": 1.8752409058268018, + "language_loss": 0.79401171, + "learning_rate": 6.559219685162165e-07, + "loss": 0.87079823, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10986328, + "step": 12350, + "time_per_iteration": 2.5616562366485596 + }, + { + "auxiliary_loss_clip": 0.06404371, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253147, + "epoch": 0.7425822937020893, + "flos": 34175602483200.0, + "grad_norm": 3.363091942962461, + "language_loss": 0.75271994, + "learning_rate": 6.556335914965343e-07, + "loss": 0.82939601, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10101318, + "step": 12351, + "time_per_iteration": 2.5991873741149902 + }, + { + "auxiliary_loss_clip": 0.06407189, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01255033, + "epoch": 0.7426424169547573, + "flos": 21289200814080.0, + "grad_norm": 1.9305253620740155, + "language_loss": 0.81533462, + "learning_rate": 6.553452654553611e-07, + "loss": 0.89205474, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 12352, + "time_per_iteration": 2.531691551208496 + }, + { + "auxiliary_loss_clip": 0.06410765, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.06275038, + "balance_loss_mlp": 0.01253386, + "epoch": 0.7427025402074252, + "flos": 22453818312960.0, + "grad_norm": 1.6215241658944841, + "language_loss": 0.71717203, + "learning_rate": 6.550569904036307e-07, + "loss": 0.79391491, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 12353, + "time_per_iteration": 4.0272791385650635 + }, + { + "auxiliary_loss_clip": 0.06404988, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.0627149, + "balance_loss_mlp": 0.01255731, + "epoch": 0.7427626634600932, + "flos": 22530532325760.0, + "grad_norm": 2.41683810368099, + "language_loss": 0.72524661, + "learning_rate": 6.547687663522739e-07, + "loss": 0.80194831, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09454346, + "step": 12354, + "time_per_iteration": 2.5672101974487305 + }, + { + "auxiliary_loss_clip": 0.06316006, + "auxiliary_loss_mlp": 0.01252952, + "balance_loss_clip": 0.0626021, + "balance_loss_mlp": 0.01251813, + "epoch": 0.7428227867127611, + "flos": 67227271424640.0, + "grad_norm": 0.6879551946330541, + "language_loss": 0.59384382, + "learning_rate": 6.544805933122199e-07, + "loss": 0.66953337, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01139832, + "step": 12355, + "time_per_iteration": 3.244594097137451 + }, + { + "auxiliary_loss_clip": 0.06405793, + "auxiliary_loss_mlp": 0.01264507, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7428829099654292, + "flos": 14726603111040.0, + "grad_norm": 1.6011597337483758, + "language_loss": 0.67696226, + "learning_rate": 6.541924712943971e-07, + "loss": 0.75366527, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10150146, + "step": 12356, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7429430332180971, + "flos": 48656466696960.0, + "grad_norm": 1.5868291550448252, + "language_loss": 0.72533596, + "learning_rate": 6.539044003097301e-07, + "loss": 0.80203569, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10424805, + "step": 12357, + "time_per_iteration": 2.8397207260131836 + }, + { + "auxiliary_loss_clip": 0.06402919, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01254495, + "epoch": 0.7430031564707651, + "flos": 16769735942400.0, + "grad_norm": 1.978658121021226, + "language_loss": 0.65120018, + "learning_rate": 6.53616380369143e-07, + "loss": 0.72786361, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08917236, + "step": 12358, + "time_per_iteration": 2.4834437370300293 + }, + { + "auxiliary_loss_clip": 0.06409361, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06271667, + "balance_loss_mlp": 0.01256807, + "epoch": 0.743063279723433, + "flos": 23876054789760.0, + "grad_norm": 1.7508744864963774, + "language_loss": 0.81005955, + "learning_rate": 6.533284114835591e-07, + "loss": 0.88682991, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10864258, + "step": 12359, + "time_per_iteration": 2.5511791706085205 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01269499, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01259491, + "epoch": 0.743123402976101, + "flos": 14396840167680.0, + "grad_norm": 2.4409850901837924, + "language_loss": 0.688115, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7648586, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10009766, + "step": 12360, + "time_per_iteration": 2.454799175262451 + }, + { + "auxiliary_loss_clip": 0.06402747, + "auxiliary_loss_mlp": 0.01266625, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01256695, + "epoch": 0.7431835262287689, + "flos": 27461756960640.0, + "grad_norm": 1.612303136385371, + "language_loss": 0.73023605, + "learning_rate": 6.527526269210715e-07, + "loss": 0.80692977, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09936523, + "step": 12361, + "time_per_iteration": 2.563950538635254 + }, + { + "auxiliary_loss_clip": 0.06409371, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01253706, + "epoch": 0.743243649481437, + "flos": 20965810780800.0, + "grad_norm": 2.1605200841945345, + "language_loss": 0.56417334, + "learning_rate": 6.524648112660027e-07, + "loss": 0.64090431, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10028076, + "step": 12362, + "time_per_iteration": 2.5222644805908203 + }, + { + "auxiliary_loss_clip": 0.06406482, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.012541, + "epoch": 0.7433037727341049, + "flos": 22789660677120.0, + "grad_norm": 2.4729179704806796, + "language_loss": 0.77661127, + "learning_rate": 6.521770467096039e-07, + "loss": 0.85331571, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09869385, + "step": 12363, + "time_per_iteration": 2.5122897624969482 + }, + { + "auxiliary_loss_clip": 0.06408481, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255054, + "epoch": 0.7433638959867729, + "flos": 22202656099200.0, + "grad_norm": 1.616246538203827, + "language_loss": 0.78287363, + "learning_rate": 6.518893332627862e-07, + "loss": 0.85960114, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09222412, + "step": 12364, + "time_per_iteration": 2.492027521133423 + }, + { + "auxiliary_loss_clip": 0.06406204, + "auxiliary_loss_mlp": 0.01264726, + "balance_loss_clip": 0.06272129, + "balance_loss_mlp": 0.01254867, + "epoch": 0.7434240192394409, + "flos": 23303808529920.0, + "grad_norm": 1.801205271942991, + "language_loss": 0.78693449, + "learning_rate": 6.516016709364604e-07, + "loss": 0.86364377, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09851074, + "step": 12365, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.06409302, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06271569, + "balance_loss_mlp": 0.01254884, + "epoch": 0.7434841424921088, + "flos": 54020387416320.0, + "grad_norm": 1.5444951998265788, + "language_loss": 0.77106571, + "learning_rate": 6.513140597415346e-07, + "loss": 0.8478092, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10168457, + "step": 12366, + "time_per_iteration": 2.7708029747009277 + }, + { + "auxiliary_loss_clip": 0.06405418, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01254588, + "epoch": 0.7435442657447768, + "flos": 21440364779520.0, + "grad_norm": 1.560298463472275, + "language_loss": 0.71305168, + "learning_rate": 6.510264996889141e-07, + "loss": 0.78973687, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08514404, + "step": 12367, + "time_per_iteration": 2.5184154510498047 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01265202, + "balance_loss_clip": 0.06271939, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7436043889974447, + "flos": 24506426655360.0, + "grad_norm": 1.476887140959893, + "language_loss": 0.75017029, + "learning_rate": 6.507389907895038e-07, + "loss": 0.82692266, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09960938, + "step": 12368, + "time_per_iteration": 2.5212924480438232 + }, + { + "auxiliary_loss_clip": 0.0640331, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06271964, + "balance_loss_mlp": 0.01257248, + "epoch": 0.7436645122501128, + "flos": 40707997989120.0, + "grad_norm": 1.6519128138397359, + "language_loss": 0.69042623, + "learning_rate": 6.50451533054207e-07, + "loss": 0.76712668, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09490967, + "step": 12369, + "time_per_iteration": 2.7047884464263916 + }, + { + "auxiliary_loss_clip": 0.06408005, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06272747, + "balance_loss_mlp": 0.01256258, + "epoch": 0.7437246355027807, + "flos": 18913537854720.0, + "grad_norm": 1.595861424874944, + "language_loss": 0.75370234, + "learning_rate": 6.501641264939233e-07, + "loss": 0.83044672, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12370, + "time_per_iteration": 2.473238468170166 + }, + { + "auxiliary_loss_clip": 0.06403841, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06273004, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7437847587554487, + "flos": 21550299736320.0, + "grad_norm": 1.5233822709060378, + "language_loss": 0.78544998, + "learning_rate": 6.498767711195503e-07, + "loss": 0.86215037, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09710693, + "step": 12371, + "time_per_iteration": 2.5248806476593018 + }, + { + "auxiliary_loss_clip": 0.06407221, + "auxiliary_loss_mlp": 0.0126359, + "balance_loss_clip": 0.06274284, + "balance_loss_mlp": 0.01253415, + "epoch": 0.7438448820081166, + "flos": 27789926676480.0, + "grad_norm": 1.5517667722387558, + "language_loss": 0.69689578, + "learning_rate": 6.495894669419857e-07, + "loss": 0.77360392, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10168457, + "step": 12372, + "time_per_iteration": 2.552630662918091 + }, + { + "auxiliary_loss_clip": 0.06404461, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06271353, + "balance_loss_mlp": 0.01253519, + "epoch": 0.7439050052607846, + "flos": 17973653806080.0, + "grad_norm": 1.7715467949119694, + "language_loss": 0.75746936, + "learning_rate": 6.493022139721245e-07, + "loss": 0.83414626, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09716797, + "step": 12373, + "time_per_iteration": 2.546383857727051 + }, + { + "auxiliary_loss_clip": 0.06406415, + "auxiliary_loss_mlp": 0.01264372, + "balance_loss_clip": 0.06269443, + "balance_loss_mlp": 0.01253643, + "epoch": 0.7439651285134525, + "flos": 22964066951040.0, + "grad_norm": 1.646659393981313, + "language_loss": 0.77668065, + "learning_rate": 6.49015012220858e-07, + "loss": 0.85338849, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10723877, + "step": 12374, + "time_per_iteration": 3.92050838470459 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06273149, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7440252517661206, + "flos": 18812701065600.0, + "grad_norm": 2.0942511176343936, + "language_loss": 0.76647848, + "learning_rate": 6.487278616990774e-07, + "loss": 0.8431896, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10009766, + "step": 12375, + "time_per_iteration": 2.4693682193756104 + }, + { + "auxiliary_loss_clip": 0.06401422, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.0125509, + "epoch": 0.7440853750187885, + "flos": 20272476971520.0, + "grad_norm": 1.9421008713204126, + "language_loss": 0.77613479, + "learning_rate": 6.484407624176733e-07, + "loss": 0.85279274, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09289551, + "step": 12376, + "time_per_iteration": 2.5313687324523926 + }, + { + "auxiliary_loss_clip": 0.06411325, + "auxiliary_loss_mlp": 0.0126521, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7441454982714565, + "flos": 25344216103680.0, + "grad_norm": 1.6879518297233593, + "language_loss": 0.79368329, + "learning_rate": 6.481537143875296e-07, + "loss": 0.87044865, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11010742, + "step": 12377, + "time_per_iteration": 2.5384654998779297 + }, + { + "auxiliary_loss_clip": 0.0640887, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01254025, + "epoch": 0.7442056215241245, + "flos": 64493460915840.0, + "grad_norm": 1.858045271266799, + "language_loss": 0.67843312, + "learning_rate": 6.478667176195322e-07, + "loss": 0.75516731, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10528564, + "step": 12378, + "time_per_iteration": 2.898494005203247 + }, + { + "auxiliary_loss_clip": 0.06408532, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06271744, + "balance_loss_mlp": 0.01256784, + "epoch": 0.7442657447767924, + "flos": 31293464319360.0, + "grad_norm": 1.6105987456814335, + "language_loss": 0.71894264, + "learning_rate": 6.475797721245648e-07, + "loss": 0.79571033, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11462402, + "step": 12379, + "time_per_iteration": 2.5628533363342285 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255292, + "epoch": 0.7443258680294604, + "flos": 20813221296000.0, + "grad_norm": 1.9550409468219483, + "language_loss": 0.65543461, + "learning_rate": 6.472928779135085e-07, + "loss": 0.73216021, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10186768, + "step": 12380, + "time_per_iteration": 2.5494651794433594 + }, + { + "auxiliary_loss_clip": 0.06408666, + "auxiliary_loss_mlp": 0.01266245, + "balance_loss_clip": 0.0627347, + "balance_loss_mlp": 0.01256267, + "epoch": 0.7443859912821283, + "flos": 22206303751680.0, + "grad_norm": 1.8887848682533184, + "language_loss": 0.79213363, + "learning_rate": 6.470060349972411e-07, + "loss": 0.86888278, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09973145, + "step": 12381, + "time_per_iteration": 2.4954755306243896 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06274785, + "balance_loss_mlp": 0.0125446, + "epoch": 0.7444461145347964, + "flos": 22024350610560.0, + "grad_norm": 1.8902076761628224, + "language_loss": 0.73109865, + "learning_rate": 6.467192433866411e-07, + "loss": 0.80787647, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1114502, + "step": 12382, + "time_per_iteration": 2.534949779510498 + }, + { + "auxiliary_loss_clip": 0.06317867, + "auxiliary_loss_mlp": 0.01256388, + "balance_loss_clip": 0.06262469, + "balance_loss_mlp": 0.01255137, + "epoch": 0.7445062377874643, + "flos": 70582313704320.0, + "grad_norm": 0.6399574084951353, + "language_loss": 0.54684198, + "learning_rate": 6.464325030925831e-07, + "loss": 0.62258446, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01251221, + "step": 12383, + "time_per_iteration": 3.2762465476989746 + }, + { + "auxiliary_loss_clip": 0.06408082, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06273709, + "balance_loss_mlp": 0.01253168, + "epoch": 0.7445663610401323, + "flos": 22171070309760.0, + "grad_norm": 1.8693949570564194, + "language_loss": 0.76230967, + "learning_rate": 6.461458141259395e-07, + "loss": 0.83902138, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09924316, + "step": 12384, + "time_per_iteration": 3.9471797943115234 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01268527, + "balance_loss_clip": 0.06271986, + "balance_loss_mlp": 0.01258162, + "epoch": 0.7446264842928002, + "flos": 24177082982400.0, + "grad_norm": 2.0160606528555665, + "language_loss": 0.79418957, + "learning_rate": 6.458591764975823e-07, + "loss": 0.87092656, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1036377, + "step": 12385, + "time_per_iteration": 2.548703193664551 + }, + { + "auxiliary_loss_clip": 0.06411269, + "auxiliary_loss_mlp": 0.01267945, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7446866075454682, + "flos": 24141514124160.0, + "grad_norm": 1.683035804247251, + "language_loss": 0.81670487, + "learning_rate": 6.455725902183813e-07, + "loss": 0.89349711, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11859131, + "step": 12386, + "time_per_iteration": 2.5256152153015137 + }, + { + "auxiliary_loss_clip": 0.06404106, + "auxiliary_loss_mlp": 0.01267713, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257598, + "epoch": 0.7447467307981361, + "flos": 23554467619200.0, + "grad_norm": 1.6483993248680413, + "language_loss": 0.71268487, + "learning_rate": 6.452860552992037e-07, + "loss": 0.78940308, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10119629, + "step": 12387, + "time_per_iteration": 3.9517242908477783 + }, + { + "auxiliary_loss_clip": 0.0640709, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06274819, + "balance_loss_mlp": 0.01255464, + "epoch": 0.7448068540508042, + "flos": 19573021814400.0, + "grad_norm": 1.9204384374405874, + "language_loss": 0.70408261, + "learning_rate": 6.449995717509138e-07, + "loss": 0.78080571, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09771729, + "step": 12388, + "time_per_iteration": 2.5048129558563232 + }, + { + "auxiliary_loss_clip": 0.06406976, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01254727, + "epoch": 0.7448669773034721, + "flos": 21846925589760.0, + "grad_norm": 1.5688285062230494, + "language_loss": 0.85222888, + "learning_rate": 6.447131395843761e-07, + "loss": 0.92894751, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12389, + "time_per_iteration": 2.5551319122314453 + }, + { + "auxiliary_loss_clip": 0.06411929, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06275173, + "balance_loss_mlp": 0.01254388, + "epoch": 0.7449271005561401, + "flos": 25162388743680.0, + "grad_norm": 1.6015967900986, + "language_loss": 0.79076087, + "learning_rate": 6.444267588104526e-07, + "loss": 0.86752725, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12390, + "time_per_iteration": 2.5427069664001465 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.06271118, + "balance_loss_mlp": 0.01255414, + "epoch": 0.7449872238088081, + "flos": 22279915163520.0, + "grad_norm": 1.7310702404068883, + "language_loss": 0.84598923, + "learning_rate": 6.441404294400014e-07, + "loss": 0.92271626, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10577393, + "step": 12391, + "time_per_iteration": 2.563535451889038 + }, + { + "auxiliary_loss_clip": 0.0640666, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06273681, + "balance_loss_mlp": 0.01257481, + "epoch": 0.745047347061476, + "flos": 20601065957760.0, + "grad_norm": 1.6668133059608343, + "language_loss": 0.74029422, + "learning_rate": 6.438541514838811e-07, + "loss": 0.81703228, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09655762, + "step": 12392, + "time_per_iteration": 2.54951548576355 + }, + { + "auxiliary_loss_clip": 0.06402859, + "auxiliary_loss_mlp": 0.01260815, + "balance_loss_clip": 0.06272476, + "balance_loss_mlp": 0.01251344, + "epoch": 0.745107470314144, + "flos": 22134117859200.0, + "grad_norm": 1.5576525473269558, + "language_loss": 0.76858068, + "learning_rate": 6.435679249529487e-07, + "loss": 0.84521741, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09466553, + "step": 12393, + "time_per_iteration": 3.9006175994873047 + }, + { + "auxiliary_loss_clip": 0.06406154, + "auxiliary_loss_mlp": 0.01264743, + "balance_loss_clip": 0.06273723, + "balance_loss_mlp": 0.01253681, + "epoch": 0.745167593566812, + "flos": 22243004640000.0, + "grad_norm": 1.8129190571327771, + "language_loss": 0.72895974, + "learning_rate": 6.432817498580552e-07, + "loss": 0.80566871, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.11065674, + "step": 12394, + "time_per_iteration": 2.5072154998779297 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.0126662, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256386, + "epoch": 0.74522771681948, + "flos": 20672245601280.0, + "grad_norm": 1.907024512464057, + "language_loss": 0.81604195, + "learning_rate": 6.429956262100535e-07, + "loss": 0.89280254, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10229492, + "step": 12395, + "time_per_iteration": 2.558364152908325 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06272958, + "balance_loss_mlp": 0.0125276, + "epoch": 0.7452878400721479, + "flos": 21113578656000.0, + "grad_norm": 2.0296389774228696, + "language_loss": 0.71353412, + "learning_rate": 6.427095540197937e-07, + "loss": 0.7902751, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10479736, + "step": 12396, + "time_per_iteration": 2.5333800315856934 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.0627405, + "balance_loss_mlp": 0.01259817, + "epoch": 0.7453479633248159, + "flos": 26695356791040.0, + "grad_norm": 1.7653498862939656, + "language_loss": 0.68180245, + "learning_rate": 6.424235332981245e-07, + "loss": 0.75860852, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10455322, + "step": 12397, + "time_per_iteration": 2.578571081161499 + }, + { + "auxiliary_loss_clip": 0.06405051, + "auxiliary_loss_mlp": 0.0126851, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01258926, + "epoch": 0.7454080865774838, + "flos": 17021191645440.0, + "grad_norm": 1.6817792283863804, + "language_loss": 0.77217615, + "learning_rate": 6.421375640558908e-07, + "loss": 0.84891176, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12398, + "time_per_iteration": 2.512648344039917 + }, + { + "auxiliary_loss_clip": 0.06403591, + "auxiliary_loss_mlp": 0.01261876, + "balance_loss_clip": 0.06272794, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7454682098301518, + "flos": 21330178260480.0, + "grad_norm": 1.5838932633911913, + "language_loss": 0.78415573, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8608104, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09552002, + "step": 12399, + "time_per_iteration": 2.505819320678711 + }, + { + "auxiliary_loss_clip": 0.06400932, + "auxiliary_loss_mlp": 0.01264955, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01255728, + "epoch": 0.7455283330828197, + "flos": 17864138119680.0, + "grad_norm": 1.9696837581168143, + "language_loss": 0.7409634, + "learning_rate": 6.415657800531038e-07, + "loss": 0.81762224, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09222412, + "step": 12400, + "time_per_iteration": 2.5325090885162354 + }, + { + "auxiliary_loss_clip": 0.06404567, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01254829, + "epoch": 0.7455884563354878, + "flos": 30782209432320.0, + "grad_norm": 1.9542118355306637, + "language_loss": 0.82345331, + "learning_rate": 6.412799653142327e-07, + "loss": 0.90014458, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09735107, + "step": 12401, + "time_per_iteration": 2.577702283859253 + }, + { + "auxiliary_loss_clip": 0.06408406, + "auxiliary_loss_mlp": 0.01262184, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01252689, + "epoch": 0.7456485795881557, + "flos": 23192280345600.0, + "grad_norm": 1.6740517505744856, + "language_loss": 0.65013397, + "learning_rate": 6.409942020981611e-07, + "loss": 0.72683978, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.0949707, + "step": 12402, + "time_per_iteration": 2.6253459453582764 + }, + { + "auxiliary_loss_clip": 0.06401449, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01255472, + "epoch": 0.7457087028408237, + "flos": 38736254436480.0, + "grad_norm": 1.537912259359591, + "language_loss": 0.73276114, + "learning_rate": 6.407084904157265e-07, + "loss": 0.8094238, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09350586, + "step": 12403, + "time_per_iteration": 2.700143575668335 + }, + { + "auxiliary_loss_clip": 0.06316997, + "auxiliary_loss_mlp": 0.01251636, + "balance_loss_clip": 0.06261828, + "balance_loss_mlp": 0.0125041, + "epoch": 0.7457688260934917, + "flos": 56059480523520.0, + "grad_norm": 1.1139053392521483, + "language_loss": 0.58594716, + "learning_rate": 6.404228302777621e-07, + "loss": 0.66163349, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.55371094, + "router_z_loss_mlp": 0.01225281, + "step": 12404, + "time_per_iteration": 2.995051145553589 + }, + { + "auxiliary_loss_clip": 0.06405495, + "auxiliary_loss_mlp": 0.01263977, + "balance_loss_clip": 0.06272737, + "balance_loss_mlp": 0.01254256, + "epoch": 0.7458289493461596, + "flos": 20121606495360.0, + "grad_norm": 1.4914507939432748, + "language_loss": 0.77947497, + "learning_rate": 6.401372216950995e-07, + "loss": 0.85616976, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09710693, + "step": 12405, + "time_per_iteration": 2.5471739768981934 + }, + { + "auxiliary_loss_clip": 0.0640135, + "auxiliary_loss_mlp": 0.01269033, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01259067, + "epoch": 0.7458890725988276, + "flos": 20199200976000.0, + "grad_norm": 1.4963815731193124, + "language_loss": 0.69489747, + "learning_rate": 6.398516646785698e-07, + "loss": 0.77160132, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09960938, + "step": 12406, + "time_per_iteration": 2.5200746059417725 + }, + { + "auxiliary_loss_clip": 0.0641366, + "auxiliary_loss_mlp": 0.012669, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256344, + "epoch": 0.7459491958514956, + "flos": 17024336173440.0, + "grad_norm": 1.8403958635643813, + "language_loss": 0.65422976, + "learning_rate": 6.39566159239002e-07, + "loss": 0.73103529, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10559082, + "step": 12407, + "time_per_iteration": 2.508833408355713 + }, + { + "auxiliary_loss_clip": 0.06406917, + "auxiliary_loss_mlp": 0.01262212, + "balance_loss_clip": 0.06270534, + "balance_loss_mlp": 0.01251775, + "epoch": 0.7460093191041636, + "flos": 25085087752320.0, + "grad_norm": 1.7359295101063332, + "language_loss": 0.721986, + "learning_rate": 6.392807053872212e-07, + "loss": 0.79867733, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10443115, + "step": 12408, + "time_per_iteration": 2.5363566875457764 + }, + { + "auxiliary_loss_clip": 0.06410854, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06272398, + "balance_loss_mlp": 0.01258875, + "epoch": 0.7460694423568315, + "flos": 21915044559360.0, + "grad_norm": 1.699572837322079, + "language_loss": 0.72972172, + "learning_rate": 6.38995303134053e-07, + "loss": 0.80653358, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11444092, + "step": 12409, + "time_per_iteration": 2.546006441116333 + }, + { + "auxiliary_loss_clip": 0.06399277, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06271146, + "balance_loss_mlp": 0.0125671, + "epoch": 0.7461295656094995, + "flos": 21222213874560.0, + "grad_norm": 1.598232986197546, + "language_loss": 0.6626668, + "learning_rate": 6.38709952490319e-07, + "loss": 0.73931849, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09173584, + "step": 12410, + "time_per_iteration": 2.539109468460083 + }, + { + "auxiliary_loss_clip": 0.06399163, + "auxiliary_loss_mlp": 0.01263377, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253912, + "epoch": 0.7461896888621674, + "flos": 22353526575360.0, + "grad_norm": 1.945676042330692, + "language_loss": 0.84313834, + "learning_rate": 6.384246534668396e-07, + "loss": 0.9197638, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09466553, + "step": 12411, + "time_per_iteration": 2.5426361560821533 + }, + { + "auxiliary_loss_clip": 0.06406285, + "auxiliary_loss_mlp": 0.01265139, + "balance_loss_clip": 0.06272309, + "balance_loss_mlp": 0.01255412, + "epoch": 0.7462498121148354, + "flos": 25489845699840.0, + "grad_norm": 1.4027823600738436, + "language_loss": 0.78116751, + "learning_rate": 6.381394060744339e-07, + "loss": 0.85788167, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09729004, + "step": 12412, + "time_per_iteration": 2.533936023712158 + }, + { + "auxiliary_loss_clip": 0.06404398, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254599, + "epoch": 0.7463099353675033, + "flos": 33956319548160.0, + "grad_norm": 1.7620547753312321, + "language_loss": 0.62684309, + "learning_rate": 6.378542103239188e-07, + "loss": 0.70352924, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 12413, + "time_per_iteration": 2.6400840282440186 + }, + { + "auxiliary_loss_clip": 0.06308331, + "auxiliary_loss_mlp": 0.01251289, + "balance_loss_clip": 0.06253117, + "balance_loss_mlp": 0.01250132, + "epoch": 0.7463700586201714, + "flos": 62786365355520.0, + "grad_norm": 0.710053456092447, + "language_loss": 0.54915559, + "learning_rate": 6.375690662261082e-07, + "loss": 0.62475181, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0115509, + "step": 12414, + "time_per_iteration": 4.637887954711914 + }, + { + "auxiliary_loss_clip": 0.06405766, + "auxiliary_loss_mlp": 0.01265973, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01255924, + "epoch": 0.7464301818728393, + "flos": 33440201124480.0, + "grad_norm": 1.8480790856179932, + "language_loss": 0.54996049, + "learning_rate": 6.372839737918154e-07, + "loss": 0.62667787, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 12415, + "time_per_iteration": 2.615811347961426 + }, + { + "auxiliary_loss_clip": 0.06405137, + "auxiliary_loss_mlp": 0.01263099, + "balance_loss_clip": 0.06273064, + "balance_loss_mlp": 0.01252985, + "epoch": 0.7464903051255073, + "flos": 26877100296960.0, + "grad_norm": 1.5361542558007044, + "language_loss": 0.75346631, + "learning_rate": 6.369989330318506e-07, + "loss": 0.8301487, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10107422, + "step": 12416, + "time_per_iteration": 2.5900840759277344 + }, + { + "auxiliary_loss_clip": 0.06405427, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01254868, + "epoch": 0.7465504283781753, + "flos": 44096359795200.0, + "grad_norm": 1.4549877982075725, + "language_loss": 0.69495994, + "learning_rate": 6.367139439570233e-07, + "loss": 0.77166545, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 12417, + "time_per_iteration": 2.7127816677093506 + }, + { + "auxiliary_loss_clip": 0.06411283, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.0627514, + "balance_loss_mlp": 0.01252456, + "epoch": 0.7466105516308432, + "flos": 19681111981440.0, + "grad_norm": 1.698297081844245, + "language_loss": 0.74025893, + "learning_rate": 6.364290065781392e-07, + "loss": 0.81699783, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10162354, + "step": 12418, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.06406084, + "auxiliary_loss_mlp": 0.01266736, + "balance_loss_clip": 0.06273702, + "balance_loss_mlp": 0.01256526, + "epoch": 0.7466706748835112, + "flos": 20526783713280.0, + "grad_norm": 1.5246031666283997, + "language_loss": 0.68934214, + "learning_rate": 6.361441209060039e-07, + "loss": 0.76607031, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10211182, + "step": 12419, + "time_per_iteration": 2.555774211883545 + }, + { + "auxiliary_loss_clip": 0.06398122, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271016, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7467307981361792, + "flos": 21696851727360.0, + "grad_norm": 1.9457389695389966, + "language_loss": 0.7466985, + "learning_rate": 6.358592869514216e-07, + "loss": 0.82333469, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09197998, + "step": 12420, + "time_per_iteration": 2.570023536682129 + }, + { + "auxiliary_loss_clip": 0.06408262, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01253152, + "epoch": 0.7467909213888472, + "flos": 19579855921920.0, + "grad_norm": 2.0032714530696087, + "language_loss": 0.67321241, + "learning_rate": 6.355745047251904e-07, + "loss": 0.7499361, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10955811, + "step": 12421, + "time_per_iteration": 2.474916696548462 + }, + { + "auxiliary_loss_clip": 0.06408735, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06271867, + "balance_loss_mlp": 0.0125574, + "epoch": 0.7468510446415151, + "flos": 23701858151040.0, + "grad_norm": 1.5609377146869152, + "language_loss": 0.72308791, + "learning_rate": 6.352897742381107e-07, + "loss": 0.79983485, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10223389, + "step": 12422, + "time_per_iteration": 2.5997939109802246 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01265232, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.01255272, + "epoch": 0.7469111678941831, + "flos": 29323649410560.0, + "grad_norm": 1.8474742568559126, + "language_loss": 0.75012529, + "learning_rate": 6.350050955009796e-07, + "loss": 0.82679492, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09960938, + "step": 12423, + "time_per_iteration": 4.05024266242981 + }, + { + "auxiliary_loss_clip": 0.06402838, + "auxiliary_loss_mlp": 0.01263552, + "balance_loss_clip": 0.06272693, + "balance_loss_mlp": 0.01254534, + "epoch": 0.746971291146851, + "flos": 21805067675520.0, + "grad_norm": 1.325189199688027, + "language_loss": 0.67964166, + "learning_rate": 6.347204685245929e-07, + "loss": 0.75630558, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09020996, + "step": 12424, + "time_per_iteration": 2.531129837036133 + }, + { + "auxiliary_loss_clip": 0.06410465, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06274019, + "balance_loss_mlp": 0.01257491, + "epoch": 0.747031414399519, + "flos": 36253591413120.0, + "grad_norm": 1.7828664572749888, + "language_loss": 0.74532795, + "learning_rate": 6.344358933197418e-07, + "loss": 0.82210636, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12425, + "time_per_iteration": 2.7197470664978027 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01265684, + "balance_loss_clip": 0.06268051, + "balance_loss_mlp": 0.01254431, + "epoch": 0.7470915376521869, + "flos": 19981133925120.0, + "grad_norm": 2.1292666289385016, + "language_loss": 0.69784462, + "learning_rate": 6.341513698972194e-07, + "loss": 0.77452642, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.1126709, + "step": 12426, + "time_per_iteration": 3.9324328899383545 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01267662, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01258269, + "epoch": 0.747151660904855, + "flos": 20090523830400.0, + "grad_norm": 1.610031666552814, + "language_loss": 0.65698165, + "learning_rate": 6.338668982678139e-07, + "loss": 0.73369735, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09399414, + "step": 12427, + "time_per_iteration": 2.544971466064453 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01263755, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01253754, + "epoch": 0.7472117841575229, + "flos": 16296062411520.0, + "grad_norm": 1.5416820216719087, + "language_loss": 0.74925625, + "learning_rate": 6.335824784423118e-07, + "loss": 0.82598257, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09997559, + "step": 12428, + "time_per_iteration": 2.4757473468780518 + }, + { + "auxiliary_loss_clip": 0.06413485, + "auxiliary_loss_mlp": 0.01264592, + "balance_loss_clip": 0.06274045, + "balance_loss_mlp": 0.01253756, + "epoch": 0.7472719074101909, + "flos": 21395068848000.0, + "grad_norm": 2.468151584449191, + "language_loss": 0.58381009, + "learning_rate": 6.33298110431499e-07, + "loss": 0.66059089, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1083374, + "step": 12429, + "time_per_iteration": 2.5076515674591064 + }, + { + "auxiliary_loss_clip": 0.06411515, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7473320306628589, + "flos": 29651064439680.0, + "grad_norm": 1.7643839025540142, + "language_loss": 0.60671711, + "learning_rate": 6.330137942461595e-07, + "loss": 0.6834774, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10113525, + "step": 12430, + "time_per_iteration": 2.580826997756958 + }, + { + "auxiliary_loss_clip": 0.06397452, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06268569, + "balance_loss_mlp": 0.01255339, + "epoch": 0.7473921539155268, + "flos": 24143316986880.0, + "grad_norm": 1.3480044268517646, + "language_loss": 0.7548542, + "learning_rate": 6.327295298970734e-07, + "loss": 0.83147293, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09088135, + "step": 12431, + "time_per_iteration": 2.5767364501953125 + }, + { + "auxiliary_loss_clip": 0.06404008, + "auxiliary_loss_mlp": 0.01264023, + "balance_loss_clip": 0.06270575, + "balance_loss_mlp": 0.01253831, + "epoch": 0.7474522771681948, + "flos": 17492768824320.0, + "grad_norm": 2.003596145191226, + "language_loss": 0.75284076, + "learning_rate": 6.32445317395021e-07, + "loss": 0.82952106, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10198975, + "step": 12432, + "time_per_iteration": 3.9378252029418945 + }, + { + "auxiliary_loss_clip": 0.06408846, + "auxiliary_loss_mlp": 0.01264276, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01253833, + "epoch": 0.7475124004208628, + "flos": 16732909272960.0, + "grad_norm": 2.3826566050681652, + "language_loss": 0.70483506, + "learning_rate": 6.321611567507787e-07, + "loss": 0.78156626, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10437012, + "step": 12433, + "time_per_iteration": 2.4768426418304443 + }, + { + "auxiliary_loss_clip": 0.06408405, + "auxiliary_loss_mlp": 0.01266362, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01255782, + "epoch": 0.7475725236735308, + "flos": 19726533694080.0, + "grad_norm": 1.7388304285111835, + "language_loss": 0.67580962, + "learning_rate": 6.318770479751232e-07, + "loss": 0.75255728, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.105896, + "step": 12434, + "time_per_iteration": 2.547088384628296 + }, + { + "auxiliary_loss_clip": 0.06395668, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01256042, + "epoch": 0.7476326469261987, + "flos": 26293114465920.0, + "grad_norm": 1.4738346539678335, + "language_loss": 0.7966851, + "learning_rate": 6.315929910788263e-07, + "loss": 0.87329113, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08898926, + "step": 12435, + "time_per_iteration": 2.5363943576812744 + }, + { + "auxiliary_loss_clip": 0.06409591, + "auxiliary_loss_mlp": 0.01267417, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01257236, + "epoch": 0.7476927701788667, + "flos": 31839868794240.0, + "grad_norm": 2.1319276645513736, + "language_loss": 0.68030941, + "learning_rate": 6.313089860726604e-07, + "loss": 0.75707954, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10180664, + "step": 12436, + "time_per_iteration": 2.655866861343384 + }, + { + "auxiliary_loss_clip": 0.06408997, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06271006, + "balance_loss_mlp": 0.01252732, + "epoch": 0.7477528934315346, + "flos": 31803545249280.0, + "grad_norm": 1.4428842251570377, + "language_loss": 0.7086063, + "learning_rate": 6.31025032967396e-07, + "loss": 0.78532964, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10601807, + "step": 12437, + "time_per_iteration": 2.5668420791625977 + }, + { + "auxiliary_loss_clip": 0.06400211, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06271319, + "balance_loss_mlp": 0.01256929, + "epoch": 0.7478130166842026, + "flos": 20377548391680.0, + "grad_norm": 1.5941584942666511, + "language_loss": 0.6725921, + "learning_rate": 6.307411317737986e-07, + "loss": 0.74925524, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09179688, + "step": 12438, + "time_per_iteration": 2.5391809940338135 + }, + { + "auxiliary_loss_clip": 0.06402425, + "auxiliary_loss_mlp": 0.01269468, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01259878, + "epoch": 0.7478731399368705, + "flos": 18154558771200.0, + "grad_norm": 1.5910882903057735, + "language_loss": 0.81170976, + "learning_rate": 6.304572825026344e-07, + "loss": 0.88842869, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09588623, + "step": 12439, + "time_per_iteration": 2.530305862426758 + }, + { + "auxiliary_loss_clip": 0.06401659, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.0125502, + "epoch": 0.7479332631895386, + "flos": 15273259148160.0, + "grad_norm": 2.0986943273037335, + "language_loss": 0.71237975, + "learning_rate": 6.301734851646674e-07, + "loss": 0.78904307, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09655762, + "step": 12440, + "time_per_iteration": 2.5543224811553955 + }, + { + "auxiliary_loss_clip": 0.06400722, + "auxiliary_loss_mlp": 0.01265179, + "balance_loss_clip": 0.06271139, + "balance_loss_mlp": 0.01255606, + "epoch": 0.7479933864422065, + "flos": 21148937879040.0, + "grad_norm": 1.8969303435383589, + "language_loss": 0.74162072, + "learning_rate": 6.298897397706597e-07, + "loss": 0.81827968, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09570312, + "step": 12441, + "time_per_iteration": 2.4814085960388184 + }, + { + "auxiliary_loss_clip": 0.06407572, + "auxiliary_loss_mlp": 0.01269518, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01258664, + "epoch": 0.7480535096948745, + "flos": 14397217511040.0, + "grad_norm": 2.1766125237206384, + "language_loss": 0.82771671, + "learning_rate": 6.296060463313698e-07, + "loss": 0.90448761, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10858154, + "step": 12442, + "time_per_iteration": 2.474766969680786 + }, + { + "auxiliary_loss_clip": 0.06407404, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01253551, + "epoch": 0.7481136329475425, + "flos": 27352073566080.0, + "grad_norm": 2.1201863783826087, + "language_loss": 0.63084489, + "learning_rate": 6.293224048575565e-07, + "loss": 0.7075603, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.105896, + "step": 12443, + "time_per_iteration": 2.537418842315674 + }, + { + "auxiliary_loss_clip": 0.06402731, + "auxiliary_loss_mlp": 0.01263567, + "balance_loss_clip": 0.06270343, + "balance_loss_mlp": 0.01254, + "epoch": 0.7481737562002104, + "flos": 19536656342400.0, + "grad_norm": 1.7130617298160193, + "language_loss": 0.71587157, + "learning_rate": 6.29038815359975e-07, + "loss": 0.79253459, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12444, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.06404774, + "auxiliary_loss_mlp": 0.01263681, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01253483, + "epoch": 0.7482338794528784, + "flos": 21766102727040.0, + "grad_norm": 1.3467287331144688, + "language_loss": 0.68781805, + "learning_rate": 6.287552778493786e-07, + "loss": 0.76450258, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10198975, + "step": 12445, + "time_per_iteration": 2.498960018157959 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06269241, + "balance_loss_mlp": 0.01254319, + "epoch": 0.7482940027055464, + "flos": 18703269233280.0, + "grad_norm": 1.5654377266954753, + "language_loss": 0.74401557, + "learning_rate": 6.28471792336519e-07, + "loss": 0.82066035, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09747314, + "step": 12446, + "time_per_iteration": 2.489685535430908 + }, + { + "auxiliary_loss_clip": 0.06408426, + "auxiliary_loss_mlp": 0.01264963, + "balance_loss_clip": 0.06271491, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7483541259582144, + "flos": 16003587335040.0, + "grad_norm": 1.896183227268288, + "language_loss": 0.7341156, + "learning_rate": 6.281883588321475e-07, + "loss": 0.81084955, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10491943, + "step": 12447, + "time_per_iteration": 2.464768648147583 + }, + { + "auxiliary_loss_clip": 0.06403442, + "auxiliary_loss_mlp": 0.01263884, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7484142492108823, + "flos": 25563289403520.0, + "grad_norm": 2.623161293575912, + "language_loss": 0.72332132, + "learning_rate": 6.279049773470109e-07, + "loss": 0.79999459, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09777832, + "step": 12448, + "time_per_iteration": 2.601579427719116 + }, + { + "auxiliary_loss_clip": 0.06408041, + "auxiliary_loss_mlp": 0.01266135, + "balance_loss_clip": 0.06272004, + "balance_loss_mlp": 0.01256145, + "epoch": 0.7484743724635503, + "flos": 22893432359040.0, + "grad_norm": 1.636804246707767, + "language_loss": 0.73365426, + "learning_rate": 6.276216478918543e-07, + "loss": 0.81039608, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09991455, + "step": 12449, + "time_per_iteration": 2.54630184173584 + }, + { + "auxiliary_loss_clip": 0.06411887, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7485344957162182, + "flos": 25307137872000.0, + "grad_norm": 1.841554129413667, + "language_loss": 0.61420983, + "learning_rate": 6.273383704774225e-07, + "loss": 0.69100565, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11151123, + "step": 12450, + "time_per_iteration": 2.5542476177215576 + }, + { + "auxiliary_loss_clip": 0.06399691, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.0627162, + "balance_loss_mlp": 0.01254156, + "epoch": 0.7485946189688862, + "flos": 27060395103360.0, + "grad_norm": 1.84091608525743, + "language_loss": 0.70658576, + "learning_rate": 6.270551451144577e-07, + "loss": 0.78321427, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08990479, + "step": 12451, + "time_per_iteration": 2.552686929702759 + }, + { + "auxiliary_loss_clip": 0.06414381, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273015, + "balance_loss_mlp": 0.01255143, + "epoch": 0.7486547422215541, + "flos": 26914052747520.0, + "grad_norm": 1.8323009368960723, + "language_loss": 0.80237973, + "learning_rate": 6.267719718136988e-07, + "loss": 0.87918079, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.105896, + "step": 12452, + "time_per_iteration": 2.525906562805176 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.06274606, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7487148654742222, + "flos": 22352855742720.0, + "grad_norm": 2.4829537234299184, + "language_loss": 0.72200477, + "learning_rate": 6.264888505858843e-07, + "loss": 0.79879862, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10333252, + "step": 12453, + "time_per_iteration": 3.899683952331543 + }, + { + "auxiliary_loss_clip": 0.06408122, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01255544, + "epoch": 0.7487749887268901, + "flos": 23045392938240.0, + "grad_norm": 1.5935388766621728, + "language_loss": 0.74146187, + "learning_rate": 6.262057814417517e-07, + "loss": 0.81819469, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09619141, + "step": 12454, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06311407, + "auxiliary_loss_mlp": 0.012513, + "balance_loss_clip": 0.06256338, + "balance_loss_mlp": 0.01250216, + "epoch": 0.7488351119795581, + "flos": 71545565842560.0, + "grad_norm": 0.7199296433862132, + "language_loss": 0.59468263, + "learning_rate": 6.259227643920322e-07, + "loss": 0.67030972, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01085663, + "step": 12455, + "time_per_iteration": 3.2877697944641113 + }, + { + "auxiliary_loss_clip": 0.06402359, + "auxiliary_loss_mlp": 0.01260932, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.01251759, + "epoch": 0.748895235232226, + "flos": 17201048434560.0, + "grad_norm": 1.6203322015377568, + "language_loss": 0.79953825, + "learning_rate": 6.256397994474592e-07, + "loss": 0.87617117, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09179688, + "step": 12456, + "time_per_iteration": 2.4608328342437744 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01250455, + "balance_loss_clip": 0.06255627, + "balance_loss_mlp": 0.01249323, + "epoch": 0.748955358484894, + "flos": 58998276846720.0, + "grad_norm": 0.8208514355444383, + "language_loss": 0.61328387, + "learning_rate": 6.25356886618763e-07, + "loss": 0.68889475, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01134491, + "step": 12457, + "time_per_iteration": 3.048952102661133 + }, + { + "auxiliary_loss_clip": 0.06408623, + "auxiliary_loss_mlp": 0.01266166, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01255867, + "epoch": 0.749015481737562, + "flos": 11364544287360.0, + "grad_norm": 1.9496047447072924, + "language_loss": 0.67320937, + "learning_rate": 6.250740259166711e-07, + "loss": 0.7499572, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10308838, + "step": 12458, + "time_per_iteration": 2.4301834106445312 + }, + { + "auxiliary_loss_clip": 0.06403044, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06271131, + "balance_loss_mlp": 0.01256279, + "epoch": 0.74907560499023, + "flos": 21112991677440.0, + "grad_norm": 1.7212914648304267, + "language_loss": 0.80174047, + "learning_rate": 6.247912173519106e-07, + "loss": 0.87843275, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09912109, + "step": 12459, + "time_per_iteration": 2.518477439880371 + }, + { + "auxiliary_loss_clip": 0.06404047, + "auxiliary_loss_mlp": 0.01264599, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01254926, + "epoch": 0.749135728242898, + "flos": 22273709961600.0, + "grad_norm": 1.512865855807545, + "language_loss": 0.80564761, + "learning_rate": 6.245084609352043e-07, + "loss": 0.88233417, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09674072, + "step": 12460, + "time_per_iteration": 2.5079431533813477 + }, + { + "auxiliary_loss_clip": 0.06403753, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.0627199, + "balance_loss_mlp": 0.0125477, + "epoch": 0.7491958514955659, + "flos": 24063793862400.0, + "grad_norm": 1.6076689252740726, + "language_loss": 0.86212254, + "learning_rate": 6.242257566772755e-07, + "loss": 0.93881446, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10662842, + "step": 12461, + "time_per_iteration": 2.542217969894409 + }, + { + "auxiliary_loss_clip": 0.06400948, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7492559747482339, + "flos": 24497915466240.0, + "grad_norm": 1.880430722981425, + "language_loss": 0.69432622, + "learning_rate": 6.239431045888435e-07, + "loss": 0.77096915, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09088135, + "step": 12462, + "time_per_iteration": 2.5493383407592773 + }, + { + "auxiliary_loss_clip": 0.06405858, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06273175, + "balance_loss_mlp": 0.01255301, + "epoch": 0.7493160980009018, + "flos": 27752680736640.0, + "grad_norm": 1.8211376167609288, + "language_loss": 0.70671761, + "learning_rate": 6.236605046806267e-07, + "loss": 0.78343821, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10900879, + "step": 12463, + "time_per_iteration": 3.986877918243408 + }, + { + "auxiliary_loss_clip": 0.06407613, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01255664, + "epoch": 0.7493762212535698, + "flos": 30233918240640.0, + "grad_norm": 1.7635457747868553, + "language_loss": 0.77660054, + "learning_rate": 6.233779569633419e-07, + "loss": 0.85333592, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10247803, + "step": 12464, + "time_per_iteration": 2.613281726837158 + }, + { + "auxiliary_loss_clip": 0.06402797, + "auxiliary_loss_mlp": 0.01263814, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7494363445062378, + "flos": 21950906906880.0, + "grad_norm": 1.6126979618339465, + "language_loss": 0.78109074, + "learning_rate": 6.230954614477034e-07, + "loss": 0.85775691, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09521484, + "step": 12465, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.06420696, + "auxiliary_loss_mlp": 0.01267627, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256332, + "epoch": 0.7494964677589058, + "flos": 12494473395840.0, + "grad_norm": 2.5697202625678877, + "language_loss": 0.74354923, + "learning_rate": 6.22813018144422e-07, + "loss": 0.82043248, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11303711, + "step": 12466, + "time_per_iteration": 3.9045188426971436 + }, + { + "auxiliary_loss_clip": 0.06406893, + "auxiliary_loss_mlp": 0.01262068, + "balance_loss_clip": 0.06270187, + "balance_loss_mlp": 0.01252293, + "epoch": 0.7495565910115737, + "flos": 21659521933440.0, + "grad_norm": 1.9829684209764449, + "language_loss": 0.66688263, + "learning_rate": 6.22530627064209e-07, + "loss": 0.74357224, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09777832, + "step": 12467, + "time_per_iteration": 2.54917049407959 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01263538, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253501, + "epoch": 0.7496167142642417, + "flos": 15274013834880.0, + "grad_norm": 2.0991094746025416, + "language_loss": 0.76436639, + "learning_rate": 6.222482882177735e-07, + "loss": 0.84109402, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1003418, + "step": 12468, + "time_per_iteration": 2.4655251502990723 + }, + { + "auxiliary_loss_clip": 0.0640367, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06271279, + "balance_loss_mlp": 0.01256129, + "epoch": 0.7496768375169096, + "flos": 22061554623360.0, + "grad_norm": 1.9736124429451793, + "language_loss": 0.69775021, + "learning_rate": 6.219660016158201e-07, + "loss": 0.77445245, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10430908, + "step": 12469, + "time_per_iteration": 2.533859968185425 + }, + { + "auxiliary_loss_clip": 0.06409403, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06274202, + "balance_loss_mlp": 0.01254726, + "epoch": 0.7497369607695776, + "flos": 19062144270720.0, + "grad_norm": 2.2473454659812107, + "language_loss": 0.6920374, + "learning_rate": 6.216837672690543e-07, + "loss": 0.76877773, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09899902, + "step": 12470, + "time_per_iteration": 2.4770658016204834 + }, + { + "auxiliary_loss_clip": 0.06413378, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01256329, + "epoch": 0.7497970840222457, + "flos": 21624036929280.0, + "grad_norm": 1.7361312699239924, + "language_loss": 0.75303179, + "learning_rate": 6.214015851881793e-07, + "loss": 0.82984829, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11950684, + "step": 12471, + "time_per_iteration": 2.5342705249786377 + }, + { + "auxiliary_loss_clip": 0.06412168, + "auxiliary_loss_mlp": 0.01265091, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01255, + "epoch": 0.7498572072749136, + "flos": 13740710371200.0, + "grad_norm": 2.1773399303982663, + "language_loss": 0.77400845, + "learning_rate": 6.211194553838929e-07, + "loss": 0.85078096, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10089111, + "step": 12472, + "time_per_iteration": 3.870166540145874 + }, + { + "auxiliary_loss_clip": 0.06403755, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01255039, + "epoch": 0.7499173305275816, + "flos": 22973207045760.0, + "grad_norm": 1.4354078089227125, + "language_loss": 0.84353936, + "learning_rate": 6.208373778668951e-07, + "loss": 0.92022181, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09460449, + "step": 12473, + "time_per_iteration": 2.537057399749756 + }, + { + "auxiliary_loss_clip": 0.06410777, + "auxiliary_loss_mlp": 0.01268473, + "balance_loss_clip": 0.06273849, + "balance_loss_mlp": 0.01257261, + "epoch": 0.7499774537802495, + "flos": 22745916046080.0, + "grad_norm": 1.8524575994010102, + "language_loss": 0.73466665, + "learning_rate": 6.205553526478829e-07, + "loss": 0.81145918, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11212158, + "step": 12474, + "time_per_iteration": 2.4842028617858887 + }, + { + "auxiliary_loss_clip": 0.06415059, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06274258, + "balance_loss_mlp": 0.01255311, + "epoch": 0.7500375770329175, + "flos": 18302494354560.0, + "grad_norm": 1.6095037145271875, + "language_loss": 0.74770164, + "learning_rate": 6.202733797375492e-07, + "loss": 0.82452309, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11779785, + "step": 12475, + "time_per_iteration": 2.4979960918426514 + }, + { + "auxiliary_loss_clip": 0.06415677, + "auxiliary_loss_mlp": 0.01269527, + "balance_loss_clip": 0.06274221, + "balance_loss_mlp": 0.01257898, + "epoch": 0.7500977002855854, + "flos": 19175684952960.0, + "grad_norm": 2.1095772826483907, + "language_loss": 0.80763221, + "learning_rate": 6.199914591465878e-07, + "loss": 0.88448429, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11633301, + "step": 12476, + "time_per_iteration": 2.491819381713867 + }, + { + "auxiliary_loss_clip": 0.06407472, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01254999, + "epoch": 0.7501578235382534, + "flos": 22170441404160.0, + "grad_norm": 7.116833282628377, + "language_loss": 0.77544057, + "learning_rate": 6.19709590885688e-07, + "loss": 0.852162, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09674072, + "step": 12477, + "time_per_iteration": 2.5502593517303467 + }, + { + "auxiliary_loss_clip": 0.06310226, + "auxiliary_loss_mlp": 0.01250565, + "balance_loss_clip": 0.06254882, + "balance_loss_mlp": 0.01249338, + "epoch": 0.7502179467909214, + "flos": 64481035783680.0, + "grad_norm": 0.7848730842725032, + "language_loss": 0.54270738, + "learning_rate": 6.194277749655394e-07, + "loss": 0.61831528, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01226044, + "step": 12478, + "time_per_iteration": 3.0923471450805664 + }, + { + "auxiliary_loss_clip": 0.06402513, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06272154, + "balance_loss_mlp": 0.01255747, + "epoch": 0.7502780700435894, + "flos": 20483332571520.0, + "grad_norm": 1.5542360710976224, + "language_loss": 0.80265927, + "learning_rate": 6.191460113968272e-07, + "loss": 0.87934738, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.10559082, + "step": 12479, + "time_per_iteration": 2.503929615020752 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01265738, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7503381932962573, + "flos": 20450908241280.0, + "grad_norm": 4.66275961009968, + "language_loss": 0.62624717, + "learning_rate": 6.188643001902369e-07, + "loss": 0.70302922, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 12480, + "time_per_iteration": 2.488246202468872 + }, + { + "auxiliary_loss_clip": 0.06401666, + "auxiliary_loss_mlp": 0.0126556, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7503983165489253, + "flos": 22388382673920.0, + "grad_norm": 1.5669372883229389, + "language_loss": 0.784675, + "learning_rate": 6.185826413564512e-07, + "loss": 0.86134732, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09411621, + "step": 12481, + "time_per_iteration": 2.514516830444336 + }, + { + "auxiliary_loss_clip": 0.06406647, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06270355, + "balance_loss_mlp": 0.01260159, + "epoch": 0.7504584398015932, + "flos": 24906321066240.0, + "grad_norm": 1.6690563670496772, + "language_loss": 0.71560133, + "learning_rate": 6.183010349061501e-07, + "loss": 0.79238129, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.11193848, + "step": 12482, + "time_per_iteration": 2.570258140563965 + }, + { + "auxiliary_loss_clip": 0.06406072, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06272655, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7505185630542612, + "flos": 25892381514240.0, + "grad_norm": 1.622739148659245, + "language_loss": 0.70420146, + "learning_rate": 6.180194808500118e-07, + "loss": 0.78092062, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10784912, + "step": 12483, + "time_per_iteration": 2.545875072479248 + }, + { + "auxiliary_loss_clip": 0.06406315, + "auxiliary_loss_mlp": 0.01266459, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01257227, + "epoch": 0.7505786863069293, + "flos": 23149709671680.0, + "grad_norm": 1.6112204819340308, + "language_loss": 0.74173069, + "learning_rate": 6.177379791987131e-07, + "loss": 0.81845844, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09240723, + "step": 12484, + "time_per_iteration": 2.50899600982666 + }, + { + "auxiliary_loss_clip": 0.06404275, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06272139, + "balance_loss_mlp": 0.01256761, + "epoch": 0.7506388095595972, + "flos": 16989144658560.0, + "grad_norm": 1.988075921906434, + "language_loss": 0.84860504, + "learning_rate": 6.174565299629295e-07, + "loss": 0.92532003, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10473633, + "step": 12485, + "time_per_iteration": 2.5089685916900635 + }, + { + "auxiliary_loss_clip": 0.06403236, + "auxiliary_loss_mlp": 0.01262842, + "balance_loss_clip": 0.06270488, + "balance_loss_mlp": 0.01253121, + "epoch": 0.7506989328122652, + "flos": 22351346369280.0, + "grad_norm": 1.4931669119648077, + "language_loss": 0.78489572, + "learning_rate": 6.171751331533323e-07, + "loss": 0.86155653, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.097229, + "step": 12486, + "time_per_iteration": 2.5051820278167725 + }, + { + "auxiliary_loss_clip": 0.06408528, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253245, + "epoch": 0.7507590560649331, + "flos": 25783243171200.0, + "grad_norm": 1.7753955887486508, + "language_loss": 0.73021758, + "learning_rate": 6.168937887805932e-07, + "loss": 0.80693603, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10064697, + "step": 12487, + "time_per_iteration": 2.547999382019043 + }, + { + "auxiliary_loss_clip": 0.06404672, + "auxiliary_loss_mlp": 0.01263386, + "balance_loss_clip": 0.0626927, + "balance_loss_mlp": 0.01253528, + "epoch": 0.7508191793176011, + "flos": 24286221325440.0, + "grad_norm": 1.9310699455089921, + "language_loss": 0.67608893, + "learning_rate": 6.166124968553801e-07, + "loss": 0.75276947, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09857178, + "step": 12488, + "time_per_iteration": 2.5895445346832275 + }, + { + "auxiliary_loss_clip": 0.0640392, + "auxiliary_loss_mlp": 0.0126508, + "balance_loss_clip": 0.06270676, + "balance_loss_mlp": 0.01254822, + "epoch": 0.750879302570269, + "flos": 19905384234240.0, + "grad_norm": 1.5890652635946048, + "language_loss": 0.77430677, + "learning_rate": 6.163312573883592e-07, + "loss": 0.85099679, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10253906, + "step": 12489, + "time_per_iteration": 2.5337159633636475 + }, + { + "auxiliary_loss_clip": 0.0640057, + "auxiliary_loss_mlp": 0.01265302, + "balance_loss_clip": 0.06270728, + "balance_loss_mlp": 0.01255431, + "epoch": 0.750939425822937, + "flos": 29213420964480.0, + "grad_norm": 1.5668986388800445, + "language_loss": 0.75072443, + "learning_rate": 6.160500703901956e-07, + "loss": 0.8273831, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09875488, + "step": 12490, + "time_per_iteration": 2.5781826972961426 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01266052, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.0125592, + "epoch": 0.750999549075605, + "flos": 21148686316800.0, + "grad_norm": 1.487741862942094, + "language_loss": 0.7861315, + "learning_rate": 6.157689358715527e-07, + "loss": 0.86284935, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10144043, + "step": 12491, + "time_per_iteration": 2.5030393600463867 + }, + { + "auxiliary_loss_clip": 0.06398296, + "auxiliary_loss_mlp": 0.01269676, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.01260473, + "epoch": 0.751059672328273, + "flos": 23554090275840.0, + "grad_norm": 1.6435305052483133, + "language_loss": 0.76645952, + "learning_rate": 6.154878538430899e-07, + "loss": 0.84313929, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09210205, + "step": 12492, + "time_per_iteration": 2.5466179847717285 + }, + { + "auxiliary_loss_clip": 0.06403392, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01257446, + "epoch": 0.7511197955809409, + "flos": 18995786236800.0, + "grad_norm": 1.8268388211945472, + "language_loss": 0.71465898, + "learning_rate": 6.152068243154671e-07, + "loss": 0.79136372, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09637451, + "step": 12493, + "time_per_iteration": 3.923126697540283 + }, + { + "auxiliary_loss_clip": 0.06408728, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01258603, + "epoch": 0.7511799188336089, + "flos": 22052246820480.0, + "grad_norm": 1.6129417562793205, + "language_loss": 0.80984807, + "learning_rate": 6.149258472993395e-07, + "loss": 0.88662201, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10070801, + "step": 12494, + "time_per_iteration": 2.499166488647461 + }, + { + "auxiliary_loss_clip": 0.06403729, + "auxiliary_loss_mlp": 0.01266628, + "balance_loss_clip": 0.06270036, + "balance_loss_mlp": 0.01256418, + "epoch": 0.7512400420862768, + "flos": 16471894204800.0, + "grad_norm": 1.701536760083375, + "language_loss": 0.79124582, + "learning_rate": 6.146449228053634e-07, + "loss": 0.86794937, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12495, + "time_per_iteration": 2.482259511947632 + }, + { + "auxiliary_loss_clip": 0.06400186, + "auxiliary_loss_mlp": 0.01262526, + "balance_loss_clip": 0.06269289, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7513001653389448, + "flos": 20454472039680.0, + "grad_norm": 1.7104928099780732, + "language_loss": 0.71375751, + "learning_rate": 6.143640508441898e-07, + "loss": 0.79038465, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09417725, + "step": 12496, + "time_per_iteration": 2.513437032699585 + }, + { + "auxiliary_loss_clip": 0.06405301, + "auxiliary_loss_mlp": 0.01263444, + "balance_loss_clip": 0.06272015, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7513602885916129, + "flos": 23483497610880.0, + "grad_norm": 1.6654554654788911, + "language_loss": 0.78218853, + "learning_rate": 6.140832314264705e-07, + "loss": 0.85887605, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09533691, + "step": 12497, + "time_per_iteration": 2.513091564178467 + }, + { + "auxiliary_loss_clip": 0.06402559, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06268804, + "balance_loss_mlp": 0.01256867, + "epoch": 0.7514204118442808, + "flos": 26804495134080.0, + "grad_norm": 1.4375816508354362, + "language_loss": 0.77240133, + "learning_rate": 6.13802464562855e-07, + "loss": 0.8490963, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10070801, + "step": 12498, + "time_per_iteration": 2.5410008430480957 + }, + { + "auxiliary_loss_clip": 0.06400871, + "auxiliary_loss_mlp": 0.01263117, + "balance_loss_clip": 0.06272262, + "balance_loss_mlp": 0.01254462, + "epoch": 0.7514805350969488, + "flos": 19871869800960.0, + "grad_norm": 1.7337697309070021, + "language_loss": 0.74015534, + "learning_rate": 6.135217502639878e-07, + "loss": 0.81679523, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08654785, + "step": 12499, + "time_per_iteration": 2.557349443435669 + }, + { + "auxiliary_loss_clip": 0.06399096, + "auxiliary_loss_mlp": 0.01264017, + "balance_loss_clip": 0.06268655, + "balance_loss_mlp": 0.01254737, + "epoch": 0.7515406583496167, + "flos": 24578444839680.0, + "grad_norm": 2.167576832097364, + "language_loss": 0.79499745, + "learning_rate": 6.132410885405148e-07, + "loss": 0.87162852, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 12500, + "time_per_iteration": 2.5547473430633545 + }, + { + "auxiliary_loss_clip": 0.06415384, + "auxiliary_loss_mlp": 0.01265407, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7516007816022847, + "flos": 20126386177920.0, + "grad_norm": 1.9841359152283422, + "language_loss": 0.73215604, + "learning_rate": 6.129604794030794e-07, + "loss": 0.80896389, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11639404, + "step": 12501, + "time_per_iteration": 2.4737539291381836 + }, + { + "auxiliary_loss_clip": 0.06401603, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06269078, + "balance_loss_mlp": 0.01255764, + "epoch": 0.7516609048549526, + "flos": 22791379685760.0, + "grad_norm": 1.708165440784374, + "language_loss": 0.7856493, + "learning_rate": 6.126799228623207e-07, + "loss": 0.86231852, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12502, + "time_per_iteration": 4.065747499465942 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 0.06270734, + "balance_loss_mlp": 0.01251895, + "epoch": 0.7517210281076206, + "flos": 10638576512640.0, + "grad_norm": 2.198342230636315, + "language_loss": 0.70527124, + "learning_rate": 6.123994189288786e-07, + "loss": 0.78194559, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10406494, + "step": 12503, + "time_per_iteration": 2.4975264072418213 + }, + { + "auxiliary_loss_clip": 0.06308451, + "auxiliary_loss_mlp": 0.01250423, + "balance_loss_clip": 0.06253403, + "balance_loss_mlp": 0.01249304, + "epoch": 0.7517811513602886, + "flos": 66071542458240.0, + "grad_norm": 0.9653674550577583, + "language_loss": 0.63868368, + "learning_rate": 6.121189676133903e-07, + "loss": 0.71427244, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01122284, + "step": 12504, + "time_per_iteration": 3.0423572063446045 + }, + { + "auxiliary_loss_clip": 0.06398649, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06269499, + "balance_loss_mlp": 0.01258533, + "epoch": 0.7518412746129566, + "flos": 37277317071360.0, + "grad_norm": 1.461644685561848, + "language_loss": 0.68779212, + "learning_rate": 6.118385689264896e-07, + "loss": 0.7644546, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09069824, + "step": 12505, + "time_per_iteration": 4.1895623207092285 + }, + { + "auxiliary_loss_clip": 0.06309824, + "auxiliary_loss_mlp": 0.01250829, + "balance_loss_clip": 0.06254642, + "balance_loss_mlp": 0.01249779, + "epoch": 0.7519013978656245, + "flos": 60539001396480.0, + "grad_norm": 0.633292190388587, + "language_loss": 0.55014133, + "learning_rate": 6.11558222878809e-07, + "loss": 0.6257478, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.55126953, + "router_z_loss_mlp": 0.01050568, + "step": 12506, + "time_per_iteration": 3.249525785446167 + }, + { + "auxiliary_loss_clip": 0.06407043, + "auxiliary_loss_mlp": 0.01265184, + "balance_loss_clip": 0.0627189, + "balance_loss_mlp": 0.01254831, + "epoch": 0.7519615211182925, + "flos": 18812826846720.0, + "grad_norm": 1.7032377600653197, + "language_loss": 0.78890646, + "learning_rate": 6.112779294809796e-07, + "loss": 0.86562872, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10339355, + "step": 12507, + "time_per_iteration": 2.4874064922332764 + }, + { + "auxiliary_loss_clip": 0.06398805, + "auxiliary_loss_mlp": 0.01267855, + "balance_loss_clip": 0.06269046, + "balance_loss_mlp": 0.0125808, + "epoch": 0.7520216443709604, + "flos": 14580596171520.0, + "grad_norm": 1.7335317284626974, + "language_loss": 0.71662533, + "learning_rate": 6.10997688743631e-07, + "loss": 0.79329199, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09777832, + "step": 12508, + "time_per_iteration": 2.5105843544006348 + }, + { + "auxiliary_loss_clip": 0.06401521, + "auxiliary_loss_mlp": 0.0126325, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254262, + "epoch": 0.7520817676236284, + "flos": 17062420654080.0, + "grad_norm": 1.5570539032807615, + "language_loss": 0.72277093, + "learning_rate": 6.107175006773885e-07, + "loss": 0.79941863, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08984375, + "step": 12509, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06271298, + "balance_loss_mlp": 0.01257496, + "epoch": 0.7521418908762965, + "flos": 25673517849600.0, + "grad_norm": 1.5708944313915068, + "language_loss": 0.61849803, + "learning_rate": 6.104373652928785e-07, + "loss": 0.69528419, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10839844, + "step": 12510, + "time_per_iteration": 2.5873842239379883 + }, + { + "auxiliary_loss_clip": 0.0640108, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06272186, + "balance_loss_mlp": 0.01255613, + "epoch": 0.7522020141289644, + "flos": 20893079836800.0, + "grad_norm": 2.376424166314484, + "language_loss": 0.81816781, + "learning_rate": 6.10157282600722e-07, + "loss": 0.89483154, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09674072, + "step": 12511, + "time_per_iteration": 3.9771971702575684 + }, + { + "auxiliary_loss_clip": 0.06408679, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06270606, + "balance_loss_mlp": 0.01258571, + "epoch": 0.7522621373816324, + "flos": 12645134236800.0, + "grad_norm": 1.635821418460478, + "language_loss": 0.76383078, + "learning_rate": 6.098772526115412e-07, + "loss": 0.84061033, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1071167, + "step": 12512, + "time_per_iteration": 2.497439384460449 + }, + { + "auxiliary_loss_clip": 0.06396883, + "auxiliary_loss_mlp": 0.01265576, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01256557, + "epoch": 0.7523222606343003, + "flos": 25632624257280.0, + "grad_norm": 1.702992973321348, + "language_loss": 0.82472456, + "learning_rate": 6.095972753359537e-07, + "loss": 0.90134907, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09002686, + "step": 12513, + "time_per_iteration": 2.581941604614258 + }, + { + "auxiliary_loss_clip": 0.06405152, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06268971, + "balance_loss_mlp": 0.01252747, + "epoch": 0.7523823838869683, + "flos": 20455142872320.0, + "grad_norm": 1.6682256759648477, + "language_loss": 0.7510156, + "learning_rate": 6.093173507845771e-07, + "loss": 0.82769549, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10089111, + "step": 12514, + "time_per_iteration": 2.4942328929901123 + }, + { + "auxiliary_loss_clip": 0.06397319, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06269006, + "balance_loss_mlp": 0.01256955, + "epoch": 0.7524425071396362, + "flos": 14725890351360.0, + "grad_norm": 1.7883586477571864, + "language_loss": 0.689107, + "learning_rate": 6.090374789680271e-07, + "loss": 0.76573658, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08679199, + "step": 12515, + "time_per_iteration": 2.494940996170044 + }, + { + "auxiliary_loss_clip": 0.06405492, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01257225, + "epoch": 0.7525026303923043, + "flos": 30600004728960.0, + "grad_norm": 2.8396136921883905, + "language_loss": 0.70415783, + "learning_rate": 6.087576598969137e-07, + "loss": 0.78087991, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09490967, + "step": 12516, + "time_per_iteration": 2.584015130996704 + }, + { + "auxiliary_loss_clip": 0.06399474, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01258325, + "epoch": 0.7525627536449722, + "flos": 24798901731840.0, + "grad_norm": 1.5910108360276343, + "language_loss": 0.89611065, + "learning_rate": 6.084778935818495e-07, + "loss": 0.97278303, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09436035, + "step": 12517, + "time_per_iteration": 2.5272841453552246 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01254359, + "epoch": 0.7526228768976402, + "flos": 20786499043200.0, + "grad_norm": 1.4709684896857864, + "language_loss": 0.74636328, + "learning_rate": 6.081981800334437e-07, + "loss": 0.82311571, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10150146, + "step": 12518, + "time_per_iteration": 2.507249116897583 + }, + { + "auxiliary_loss_clip": 0.06313983, + "auxiliary_loss_mlp": 0.01251233, + "balance_loss_clip": 0.06258783, + "balance_loss_mlp": 0.01250141, + "epoch": 0.7526830001503081, + "flos": 66578017662720.0, + "grad_norm": 0.6920212642256274, + "language_loss": 0.55552846, + "learning_rate": 6.079185192623017e-07, + "loss": 0.63118064, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094055, + "step": 12519, + "time_per_iteration": 3.1638381481170654 + }, + { + "auxiliary_loss_clip": 0.06402172, + "auxiliary_loss_mlp": 0.0126505, + "balance_loss_clip": 0.06268954, + "balance_loss_mlp": 0.01255423, + "epoch": 0.7527431234029761, + "flos": 23484755422080.0, + "grad_norm": 1.392327642078427, + "language_loss": 0.77952313, + "learning_rate": 6.07638911279029e-07, + "loss": 0.85619533, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09625244, + "step": 12520, + "time_per_iteration": 2.5008206367492676 + }, + { + "auxiliary_loss_clip": 0.06405456, + "auxiliary_loss_mlp": 0.01265903, + "balance_loss_clip": 0.06273633, + "balance_loss_mlp": 0.01256158, + "epoch": 0.752803246655644, + "flos": 22055265567360.0, + "grad_norm": 8.971083878889642, + "language_loss": 0.74495649, + "learning_rate": 6.07359356094229e-07, + "loss": 0.82167011, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09747314, + "step": 12521, + "time_per_iteration": 2.5451552867889404 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06272561, + "balance_loss_mlp": 0.01257059, + "epoch": 0.752863369908312, + "flos": 30161606567040.0, + "grad_norm": 1.8189760564155686, + "language_loss": 0.67176616, + "learning_rate": 6.070798537185016e-07, + "loss": 0.74856877, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10925293, + "step": 12522, + "time_per_iteration": 2.556718349456787 + }, + { + "auxiliary_loss_clip": 0.06409014, + "auxiliary_loss_mlp": 0.01271964, + "balance_loss_clip": 0.06271487, + "balance_loss_mlp": 0.01261825, + "epoch": 0.7529234931609801, + "flos": 24573874792320.0, + "grad_norm": 1.5612093736475694, + "language_loss": 0.78733182, + "learning_rate": 6.068004041624453e-07, + "loss": 0.86414158, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10137939, + "step": 12523, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.0639995, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06269381, + "balance_loss_mlp": 0.0125683, + "epoch": 0.752983616413648, + "flos": 23119088204160.0, + "grad_norm": 1.791528721862032, + "language_loss": 0.80482811, + "learning_rate": 6.065210074366571e-07, + "loss": 0.88149387, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09796143, + "step": 12524, + "time_per_iteration": 2.500800132751465 + }, + { + "auxiliary_loss_clip": 0.06402539, + "auxiliary_loss_mlp": 0.01269955, + "balance_loss_clip": 0.06271717, + "balance_loss_mlp": 0.01260996, + "epoch": 0.753043739666316, + "flos": 24323928462720.0, + "grad_norm": 1.510186119620748, + "language_loss": 0.74149638, + "learning_rate": 6.062416635517326e-07, + "loss": 0.81822133, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08953857, + "step": 12525, + "time_per_iteration": 2.5363988876342773 + }, + { + "auxiliary_loss_clip": 0.0639966, + "auxiliary_loss_mlp": 0.01264528, + "balance_loss_clip": 0.06270238, + "balance_loss_mlp": 0.01254777, + "epoch": 0.7531038629189839, + "flos": 24250149342720.0, + "grad_norm": 1.8502310757699438, + "language_loss": 0.725272, + "learning_rate": 6.059623725182641e-07, + "loss": 0.80191386, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09753418, + "step": 12526, + "time_per_iteration": 2.5115420818328857 + }, + { + "auxiliary_loss_clip": 0.06402011, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06270412, + "balance_loss_mlp": 0.01254167, + "epoch": 0.7531639861716519, + "flos": 30196378811520.0, + "grad_norm": 1.617761308290089, + "language_loss": 0.72719419, + "learning_rate": 6.056831343468414e-07, + "loss": 0.80385113, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09509277, + "step": 12527, + "time_per_iteration": 2.620079517364502 + }, + { + "auxiliary_loss_clip": 0.06399914, + "auxiliary_loss_mlp": 0.01265035, + "balance_loss_clip": 0.06268723, + "balance_loss_mlp": 0.01255558, + "epoch": 0.7532241094243198, + "flos": 18229050650880.0, + "grad_norm": 1.8406342788129475, + "language_loss": 0.81231797, + "learning_rate": 6.054039490480539e-07, + "loss": 0.88896745, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.0947876, + "step": 12528, + "time_per_iteration": 2.4696736335754395 + }, + { + "auxiliary_loss_clip": 0.06403716, + "auxiliary_loss_mlp": 0.01265532, + "balance_loss_clip": 0.06269462, + "balance_loss_mlp": 0.0125525, + "epoch": 0.7532842326769879, + "flos": 20886413437440.0, + "grad_norm": 2.282089070313471, + "language_loss": 0.85098541, + "learning_rate": 6.051248166324892e-07, + "loss": 0.92767787, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1027832, + "step": 12529, + "time_per_iteration": 2.5071592330932617 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01254818, + "epoch": 0.7533443559296558, + "flos": 18084762720000.0, + "grad_norm": 1.902579288696582, + "language_loss": 0.74726146, + "learning_rate": 6.048457371107303e-07, + "loss": 0.82401049, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.09814453, + "step": 12530, + "time_per_iteration": 2.502178192138672 + }, + { + "auxiliary_loss_clip": 0.06308636, + "auxiliary_loss_mlp": 0.01252721, + "balance_loss_clip": 0.06253405, + "balance_loss_mlp": 0.01251678, + "epoch": 0.7534044791823238, + "flos": 50271668398080.0, + "grad_norm": 0.8173638776820421, + "language_loss": 0.63636577, + "learning_rate": 6.045667104933612e-07, + "loss": 0.71197939, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.55273438, + "router_z_loss_mlp": 0.01044464, + "step": 12531, + "time_per_iteration": 2.9869658946990967 + }, + { + "auxiliary_loss_clip": 0.06406563, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.01255437, + "epoch": 0.7534646024349917, + "flos": 20856588583680.0, + "grad_norm": 2.370705934223187, + "language_loss": 0.70650482, + "learning_rate": 6.042877367909633e-07, + "loss": 0.78322434, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 12532, + "time_per_iteration": 3.92488169670105 + }, + { + "auxiliary_loss_clip": 0.06397863, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01257814, + "epoch": 0.7535247256876597, + "flos": 23077775341440.0, + "grad_norm": 1.5088215588647627, + "language_loss": 0.77771306, + "learning_rate": 6.040088160141132e-07, + "loss": 0.85436428, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09442139, + "step": 12533, + "time_per_iteration": 2.489647626876831 + }, + { + "auxiliary_loss_clip": 0.06306736, + "auxiliary_loss_mlp": 0.01251137, + "balance_loss_clip": 0.06251442, + "balance_loss_mlp": 0.01250062, + "epoch": 0.7535848489403276, + "flos": 58643888002560.0, + "grad_norm": 0.7841580581676975, + "language_loss": 0.57404244, + "learning_rate": 6.037299481733886e-07, + "loss": 0.64962119, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 12534, + "time_per_iteration": 3.1910510063171387 + }, + { + "auxiliary_loss_clip": 0.06403376, + "auxiliary_loss_mlp": 0.01267552, + "balance_loss_clip": 0.06270553, + "balance_loss_mlp": 0.01257568, + "epoch": 0.7536449721929956, + "flos": 26585044490880.0, + "grad_norm": 1.3288810458432065, + "language_loss": 0.71601486, + "learning_rate": 6.03451133279365e-07, + "loss": 0.79272413, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09991455, + "step": 12535, + "time_per_iteration": 2.5521280765533447 + }, + { + "auxiliary_loss_clip": 0.06405595, + "auxiliary_loss_mlp": 0.0126787, + "balance_loss_clip": 0.06269699, + "balance_loss_mlp": 0.01258024, + "epoch": 0.7537050954456637, + "flos": 25742559214080.0, + "grad_norm": 1.4204428074088968, + "language_loss": 0.80683547, + "learning_rate": 6.031723713426135e-07, + "loss": 0.88357008, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09838867, + "step": 12536, + "time_per_iteration": 2.612800359725952 + }, + { + "auxiliary_loss_clip": 0.06397747, + "auxiliary_loss_mlp": 0.01263423, + "balance_loss_clip": 0.06268154, + "balance_loss_mlp": 0.01254006, + "epoch": 0.7537652186983316, + "flos": 30231863815680.0, + "grad_norm": 2.5926766320548333, + "language_loss": 0.7478568, + "learning_rate": 6.028936623737067e-07, + "loss": 0.82446849, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09423828, + "step": 12537, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06273423, + "balance_loss_mlp": 0.01258771, + "epoch": 0.7538253419509996, + "flos": 12646224339840.0, + "grad_norm": 1.6302297616085528, + "language_loss": 0.74427301, + "learning_rate": 6.026150063832111e-07, + "loss": 0.82103658, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09698486, + "step": 12538, + "time_per_iteration": 2.532360076904297 + }, + { + "auxiliary_loss_clip": 0.06404191, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01256676, + "epoch": 0.7538854652036675, + "flos": 23192783470080.0, + "grad_norm": 1.9550849129782661, + "language_loss": 0.67649639, + "learning_rate": 6.023364033816956e-07, + "loss": 0.75320947, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10455322, + "step": 12539, + "time_per_iteration": 2.5289549827575684 + }, + { + "auxiliary_loss_clip": 0.06399977, + "auxiliary_loss_mlp": 0.01264844, + "balance_loss_clip": 0.06269806, + "balance_loss_mlp": 0.01255296, + "epoch": 0.7539455884563355, + "flos": 23193076959360.0, + "grad_norm": 1.5765955359694397, + "language_loss": 0.74866569, + "learning_rate": 6.020578533797229e-07, + "loss": 0.82531393, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09552002, + "step": 12540, + "time_per_iteration": 2.519505023956299 + }, + { + "auxiliary_loss_clip": 0.06404985, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06269932, + "balance_loss_mlp": 0.01254816, + "epoch": 0.7540057117090034, + "flos": 13184998093440.0, + "grad_norm": 1.8443764292717588, + "language_loss": 0.73148596, + "learning_rate": 6.017793563878566e-07, + "loss": 0.80818391, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 12541, + "time_per_iteration": 2.4335999488830566 + }, + { + "auxiliary_loss_clip": 0.06404177, + "auxiliary_loss_mlp": 0.0126394, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7540658349616715, + "flos": 45488561783040.0, + "grad_norm": 1.5152984414319595, + "language_loss": 0.72388256, + "learning_rate": 6.015009124166576e-07, + "loss": 0.80056369, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09680176, + "step": 12542, + "time_per_iteration": 4.1390299797058105 + }, + { + "auxiliary_loss_clip": 0.06397901, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06268644, + "balance_loss_mlp": 0.01254344, + "epoch": 0.7541259582143394, + "flos": 19935754139520.0, + "grad_norm": 2.884156487358873, + "language_loss": 0.84689027, + "learning_rate": 6.012225214766844e-07, + "loss": 0.92351043, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09771729, + "step": 12543, + "time_per_iteration": 2.503478765487671 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.01253886, + "epoch": 0.7541860814670074, + "flos": 27205521575040.0, + "grad_norm": 2.0819371266250095, + "language_loss": 0.73893505, + "learning_rate": 6.009441835784927e-07, + "loss": 0.81558251, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09643555, + "step": 12544, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06402065, + "auxiliary_loss_mlp": 0.01263786, + "balance_loss_clip": 0.06270371, + "balance_loss_mlp": 0.01254505, + "epoch": 0.7542462047196753, + "flos": 21330471749760.0, + "grad_norm": 1.7394409636932977, + "language_loss": 0.68186235, + "learning_rate": 6.006658987326383e-07, + "loss": 0.7585209, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09277344, + "step": 12545, + "time_per_iteration": 3.9819624423980713 + }, + { + "auxiliary_loss_clip": 0.06407365, + "auxiliary_loss_mlp": 0.01263612, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01254326, + "epoch": 0.7543063279723433, + "flos": 11944630903680.0, + "grad_norm": 1.6656335194491443, + "language_loss": 0.69190776, + "learning_rate": 6.003876669496728e-07, + "loss": 0.76861751, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09283447, + "step": 12546, + "time_per_iteration": 2.5855300426483154 + }, + { + "auxiliary_loss_clip": 0.06408285, + "auxiliary_loss_mlp": 0.01269444, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01258423, + "epoch": 0.7543664512250112, + "flos": 22826529273600.0, + "grad_norm": 2.2583251382821268, + "language_loss": 0.73943269, + "learning_rate": 6.00109488240147e-07, + "loss": 0.81620997, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11022949, + "step": 12547, + "time_per_iteration": 2.5086138248443604 + }, + { + "auxiliary_loss_clip": 0.0640479, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01253943, + "epoch": 0.7544265744776792, + "flos": 20930283849600.0, + "grad_norm": 1.77678899313766, + "language_loss": 0.68066597, + "learning_rate": 5.998313626146099e-07, + "loss": 0.75735652, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10333252, + "step": 12548, + "time_per_iteration": 2.534188747406006 + }, + { + "auxiliary_loss_clip": 0.0640662, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271043, + "balance_loss_mlp": 0.01257811, + "epoch": 0.7544866977303473, + "flos": 15200947474560.0, + "grad_norm": 1.8925592973514778, + "language_loss": 0.87693512, + "learning_rate": 5.995532900836088e-07, + "loss": 0.95367694, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09747314, + "step": 12549, + "time_per_iteration": 2.508145332336426 + }, + { + "auxiliary_loss_clip": 0.06395473, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06269422, + "balance_loss_mlp": 0.01254213, + "epoch": 0.7545468209830152, + "flos": 27090094176000.0, + "grad_norm": 1.707615461244764, + "language_loss": 0.77432424, + "learning_rate": 5.992752706576865e-07, + "loss": 0.85091901, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.09790039, + "step": 12550, + "time_per_iteration": 3.9424808025360107 + }, + { + "auxiliary_loss_clip": 0.06406951, + "auxiliary_loss_mlp": 0.01264837, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01254967, + "epoch": 0.7546069442356832, + "flos": 26879238576000.0, + "grad_norm": 1.4048272187532633, + "language_loss": 0.6982311, + "learning_rate": 5.98997304347386e-07, + "loss": 0.77494895, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09869385, + "step": 12551, + "time_per_iteration": 2.577078342437744 + }, + { + "auxiliary_loss_clip": 0.06402165, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.0125766, + "epoch": 0.7546670674883511, + "flos": 15748735541760.0, + "grad_norm": 1.8643367564290814, + "language_loss": 0.86457175, + "learning_rate": 5.987193911632487e-07, + "loss": 0.94126844, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09832764, + "step": 12552, + "time_per_iteration": 2.5127792358398438 + }, + { + "auxiliary_loss_clip": 0.06407504, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.0627365, + "balance_loss_mlp": 0.01256393, + "epoch": 0.7547271907410191, + "flos": 23484545786880.0, + "grad_norm": 1.6196877851330536, + "language_loss": 0.78280461, + "learning_rate": 5.98441531115812e-07, + "loss": 0.85953569, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09210205, + "step": 12553, + "time_per_iteration": 2.5273962020874023 + }, + { + "auxiliary_loss_clip": 0.06404902, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01254027, + "epoch": 0.754787313993687, + "flos": 31730898159360.0, + "grad_norm": 2.42415612197757, + "language_loss": 0.63542819, + "learning_rate": 5.981637242156135e-07, + "loss": 0.71211898, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.1015625, + "step": 12554, + "time_per_iteration": 2.5882747173309326 + }, + { + "auxiliary_loss_clip": 0.06402658, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271334, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7548474372463551, + "flos": 27570392179200.0, + "grad_norm": 1.504037054855903, + "language_loss": 0.73400116, + "learning_rate": 5.978859704731864e-07, + "loss": 0.81066149, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09393311, + "step": 12555, + "time_per_iteration": 2.539822578430176 + }, + { + "auxiliary_loss_clip": 0.0640943, + "auxiliary_loss_mlp": 0.01263982, + "balance_loss_clip": 0.06275169, + "balance_loss_mlp": 0.01253599, + "epoch": 0.754907560499023, + "flos": 19324752566400.0, + "grad_norm": 1.737792546565587, + "language_loss": 0.78918052, + "learning_rate": 5.976082698990645e-07, + "loss": 0.86591458, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 12556, + "time_per_iteration": 2.520672082901001 + }, + { + "auxiliary_loss_clip": 0.06309493, + "auxiliary_loss_mlp": 0.01252888, + "balance_loss_clip": 0.06254127, + "balance_loss_mlp": 0.01251748, + "epoch": 0.754967683751691, + "flos": 69765795993600.0, + "grad_norm": 0.6939528334291757, + "language_loss": 0.50454944, + "learning_rate": 5.973306225037769e-07, + "loss": 0.58017325, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01139832, + "step": 12557, + "time_per_iteration": 3.1293344497680664 + }, + { + "auxiliary_loss_clip": 0.06408815, + "auxiliary_loss_mlp": 0.01264037, + "balance_loss_clip": 0.06273429, + "balance_loss_mlp": 0.01253857, + "epoch": 0.7550278070043589, + "flos": 24428161342080.0, + "grad_norm": 1.622493392306736, + "language_loss": 0.71709013, + "learning_rate": 5.970530282978525e-07, + "loss": 0.79381871, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10186768, + "step": 12558, + "time_per_iteration": 2.5321953296661377 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.0626944, + "balance_loss_mlp": 0.01257726, + "epoch": 0.7550879302570269, + "flos": 32642802144000.0, + "grad_norm": 1.8637892647127214, + "language_loss": 0.80580068, + "learning_rate": 5.967754872918187e-07, + "loss": 0.88249207, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09063721, + "step": 12559, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.06405831, + "auxiliary_loss_mlp": 0.01265308, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01255276, + "epoch": 0.7551480535096948, + "flos": 21801461950080.0, + "grad_norm": 1.6337605293226678, + "language_loss": 0.78857327, + "learning_rate": 5.96497999496199e-07, + "loss": 0.86528468, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12560, + "time_per_iteration": 2.5266849994659424 + }, + { + "auxiliary_loss_clip": 0.06401823, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7552081767623628, + "flos": 18521022602880.0, + "grad_norm": 1.579385743882106, + "language_loss": 0.70900261, + "learning_rate": 5.96220564921515e-07, + "loss": 0.78568202, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09362793, + "step": 12561, + "time_per_iteration": 2.4935779571533203 + }, + { + "auxiliary_loss_clip": 0.06401284, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01256594, + "epoch": 0.7552683000150308, + "flos": 27641949166080.0, + "grad_norm": 1.5637953071800728, + "language_loss": 0.7579698, + "learning_rate": 5.959431835782889e-07, + "loss": 0.83464587, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09735107, + "step": 12562, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06403111, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7553284232676988, + "flos": 20309135932800.0, + "grad_norm": 1.8403167486550738, + "language_loss": 0.75524759, + "learning_rate": 5.956658554770371e-07, + "loss": 0.83192855, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09838867, + "step": 12563, + "time_per_iteration": 2.513921022415161 + }, + { + "auxiliary_loss_clip": 0.06417328, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01253454, + "epoch": 0.7553885465203668, + "flos": 33263866206720.0, + "grad_norm": 2.816655574793258, + "language_loss": 0.67061448, + "learning_rate": 5.953885806282768e-07, + "loss": 0.7474376, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11529541, + "step": 12564, + "time_per_iteration": 2.5836448669433594 + }, + { + "auxiliary_loss_clip": 0.06408054, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06272587, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7554486697730347, + "flos": 21622653336960.0, + "grad_norm": 1.6673790511457676, + "language_loss": 0.68740308, + "learning_rate": 5.951113590425228e-07, + "loss": 0.76413709, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10412598, + "step": 12565, + "time_per_iteration": 2.547016143798828 + }, + { + "auxiliary_loss_clip": 0.06408931, + "auxiliary_loss_mlp": 0.01266223, + "balance_loss_clip": 0.06269513, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7555087930257027, + "flos": 27639810887040.0, + "grad_norm": 1.5709631477548602, + "language_loss": 0.74854088, + "learning_rate": 5.94834190730287e-07, + "loss": 0.82529235, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10357666, + "step": 12566, + "time_per_iteration": 2.5360589027404785 + }, + { + "auxiliary_loss_clip": 0.06412722, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01257399, + "epoch": 0.7555689162783706, + "flos": 23628162885120.0, + "grad_norm": 2.012452039611991, + "language_loss": 0.74581742, + "learning_rate": 5.945570757020789e-07, + "loss": 0.82262623, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10766602, + "step": 12567, + "time_per_iteration": 2.5815160274505615 + }, + { + "auxiliary_loss_clip": 0.06405583, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01254155, + "epoch": 0.7556290395310387, + "flos": 24869955594240.0, + "grad_norm": 2.2187055340404216, + "language_loss": 0.62846589, + "learning_rate": 5.942800139684073e-07, + "loss": 0.70515835, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09515381, + "step": 12568, + "time_per_iteration": 2.5301473140716553 + }, + { + "auxiliary_loss_clip": 0.06402, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01255471, + "epoch": 0.7556891627837066, + "flos": 43553770680960.0, + "grad_norm": 1.9192871198198145, + "language_loss": 0.66908652, + "learning_rate": 5.940030055397789e-07, + "loss": 0.7457543, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12569, + "time_per_iteration": 2.707559585571289 + }, + { + "auxiliary_loss_clip": 0.06408378, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7557492860363746, + "flos": 26658110851200.0, + "grad_norm": 2.041017717148161, + "language_loss": 0.67703956, + "learning_rate": 5.93726050426697e-07, + "loss": 0.75377285, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 12570, + "time_per_iteration": 2.5359280109405518 + }, + { + "auxiliary_loss_clip": 0.06407271, + "auxiliary_loss_mlp": 0.0126553, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01255868, + "epoch": 0.7558094092890425, + "flos": 55194857769600.0, + "grad_norm": 1.6855740351628876, + "language_loss": 0.71908271, + "learning_rate": 5.934491486396647e-07, + "loss": 0.7958107, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09667969, + "step": 12571, + "time_per_iteration": 2.8340237140655518 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01265226, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7558695325417105, + "flos": 23995171768320.0, + "grad_norm": 1.5360803868989372, + "language_loss": 0.74071586, + "learning_rate": 5.931723001891811e-07, + "loss": 0.81745565, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09985352, + "step": 12572, + "time_per_iteration": 4.078891754150391 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01254981, + "epoch": 0.7559296557943784, + "flos": 14616542373120.0, + "grad_norm": 2.087893523265595, + "language_loss": 0.77022463, + "learning_rate": 5.928955050857456e-07, + "loss": 0.84695649, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.097229, + "step": 12573, + "time_per_iteration": 2.4667983055114746 + }, + { + "auxiliary_loss_clip": 0.06406313, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01254032, + "epoch": 0.7559897790470465, + "flos": 18556214117760.0, + "grad_norm": 1.6481386316669568, + "language_loss": 0.69339514, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7701081, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10955811, + "step": 12574, + "time_per_iteration": 2.521108627319336 + }, + { + "auxiliary_loss_clip": 0.06401183, + "auxiliary_loss_mlp": 0.0126439, + "balance_loss_clip": 0.0626963, + "balance_loss_mlp": 0.01254532, + "epoch": 0.7560499022997144, + "flos": 17973695733120.0, + "grad_norm": 2.167691196758321, + "language_loss": 0.71799374, + "learning_rate": 5.923420749619974e-07, + "loss": 0.79464948, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09869385, + "step": 12575, + "time_per_iteration": 2.4676809310913086 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01261787, + "balance_loss_clip": 0.0626974, + "balance_loss_mlp": 0.0125222, + "epoch": 0.7561100255523824, + "flos": 15742530339840.0, + "grad_norm": 1.985003709379718, + "language_loss": 0.7146281, + "learning_rate": 5.92065439962673e-07, + "loss": 0.79127514, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09564209, + "step": 12576, + "time_per_iteration": 2.525620937347412 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01265228, + "balance_loss_clip": 0.06271128, + "balance_loss_mlp": 0.0125497, + "epoch": 0.7561701488050504, + "flos": 15893568524160.0, + "grad_norm": 1.7792307856828309, + "language_loss": 0.67103839, + "learning_rate": 5.917888583523669e-07, + "loss": 0.74771613, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10247803, + "step": 12577, + "time_per_iteration": 2.468843936920166 + }, + { + "auxiliary_loss_clip": 0.06400042, + "auxiliary_loss_mlp": 0.01263628, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253978, + "epoch": 0.7562302720577183, + "flos": 20345333696640.0, + "grad_norm": 1.5059365090765435, + "language_loss": 0.78157711, + "learning_rate": 5.915123301415685e-07, + "loss": 0.85821384, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09649658, + "step": 12578, + "time_per_iteration": 2.530263900756836 + }, + { + "auxiliary_loss_clip": 0.0640607, + "auxiliary_loss_mlp": 0.01262105, + "balance_loss_clip": 0.06271346, + "balance_loss_mlp": 0.01251871, + "epoch": 0.7562903953103863, + "flos": 20818252540800.0, + "grad_norm": 1.5853993549027412, + "language_loss": 0.76139581, + "learning_rate": 5.912358553407641e-07, + "loss": 0.83807755, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10229492, + "step": 12579, + "time_per_iteration": 2.507765054702759 + }, + { + "auxiliary_loss_clip": 0.06411377, + "auxiliary_loss_mlp": 0.01264596, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01253599, + "epoch": 0.7563505185630542, + "flos": 37606073765760.0, + "grad_norm": 1.7167109835920158, + "language_loss": 0.62744486, + "learning_rate": 5.90959433960437e-07, + "loss": 0.70420462, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 12580, + "time_per_iteration": 2.6855556964874268 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06272098, + "balance_loss_mlp": 0.01256355, + "epoch": 0.7564106418157223, + "flos": 20237369310720.0, + "grad_norm": 3.698052227516868, + "language_loss": 0.75504309, + "learning_rate": 5.906830660110691e-07, + "loss": 0.83175057, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10241699, + "step": 12581, + "time_per_iteration": 3.9208571910858154 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01264654, + "balance_loss_clip": 0.06274357, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7564707650683902, + "flos": 24761949281280.0, + "grad_norm": 1.712129660168012, + "language_loss": 0.63223112, + "learning_rate": 5.904067515031412e-07, + "loss": 0.70898986, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09967041, + "step": 12582, + "time_per_iteration": 2.5469281673431396 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01252832, + "balance_loss_clip": 0.06255485, + "balance_loss_mlp": 0.01251842, + "epoch": 0.7565308883210582, + "flos": 48544965711360.0, + "grad_norm": 0.9271563619933442, + "language_loss": 0.60731697, + "learning_rate": 5.901304904471307e-07, + "loss": 0.68295169, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00989532, + "step": 12583, + "time_per_iteration": 2.8734805583953857 + }, + { + "auxiliary_loss_clip": 0.06408859, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255792, + "epoch": 0.7565910115737261, + "flos": 12500007765120.0, + "grad_norm": 1.9446553716026287, + "language_loss": 0.7914691, + "learning_rate": 5.898542828535125e-07, + "loss": 0.8682155, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09985352, + "step": 12584, + "time_per_iteration": 2.5946009159088135 + }, + { + "auxiliary_loss_clip": 0.06402295, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06272683, + "balance_loss_mlp": 0.01254559, + "epoch": 0.7566511348263941, + "flos": 21178427316480.0, + "grad_norm": 5.075260482718231, + "language_loss": 0.7806747, + "learning_rate": 5.895781287327612e-07, + "loss": 0.85734189, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09857178, + "step": 12585, + "time_per_iteration": 4.006917953491211 + }, + { + "auxiliary_loss_clip": 0.06406915, + "auxiliary_loss_mlp": 0.01263646, + "balance_loss_clip": 0.06271342, + "balance_loss_mlp": 0.01253609, + "epoch": 0.756711258079062, + "flos": 21760023306240.0, + "grad_norm": 1.5685604080996611, + "language_loss": 0.83183873, + "learning_rate": 5.893020280953493e-07, + "loss": 0.9085443, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1003418, + "step": 12586, + "time_per_iteration": 2.4981296062469482 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255487, + "epoch": 0.75677138133173, + "flos": 22389514704000.0, + "grad_norm": 2.1588778105399116, + "language_loss": 0.83529806, + "learning_rate": 5.890259809517459e-07, + "loss": 0.91204941, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10223389, + "step": 12587, + "time_per_iteration": 2.5264017581939697 + }, + { + "auxiliary_loss_clip": 0.06405166, + "auxiliary_loss_mlp": 0.01262614, + "balance_loss_clip": 0.06272217, + "balance_loss_mlp": 0.01252356, + "epoch": 0.756831504584398, + "flos": 22715252651520.0, + "grad_norm": 1.5206694910339098, + "language_loss": 0.71336639, + "learning_rate": 5.88749987312418e-07, + "loss": 0.79004425, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1026001, + "step": 12588, + "time_per_iteration": 2.522880792617798 + }, + { + "auxiliary_loss_clip": 0.06410505, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01253777, + "epoch": 0.756891627837066, + "flos": 24105358287360.0, + "grad_norm": 1.8052754527396453, + "language_loss": 0.69118118, + "learning_rate": 5.884740471878327e-07, + "loss": 0.76792806, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 12589, + "time_per_iteration": 2.543221950531006 + }, + { + "auxiliary_loss_clip": 0.06404439, + "auxiliary_loss_mlp": 0.01269435, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01259499, + "epoch": 0.756951751089734, + "flos": 19754010633600.0, + "grad_norm": 1.742132882513342, + "language_loss": 0.92203468, + "learning_rate": 5.881981605884522e-07, + "loss": 0.99877346, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09942627, + "step": 12590, + "time_per_iteration": 3.913285732269287 + }, + { + "auxiliary_loss_clip": 0.06402917, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01253092, + "epoch": 0.7570118743424019, + "flos": 35087883811200.0, + "grad_norm": 1.7860803954634257, + "language_loss": 0.65924931, + "learning_rate": 5.879223275247391e-07, + "loss": 0.7359159, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10644531, + "step": 12591, + "time_per_iteration": 2.6003847122192383 + }, + { + "auxiliary_loss_clip": 0.06403872, + "auxiliary_loss_mlp": 0.01263019, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01253667, + "epoch": 0.7570719975950699, + "flos": 25601835081600.0, + "grad_norm": 1.452450221530786, + "language_loss": 0.73701084, + "learning_rate": 5.876465480071528e-07, + "loss": 0.81367981, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09356689, + "step": 12592, + "time_per_iteration": 2.5929007530212402 + }, + { + "auxiliary_loss_clip": 0.06405754, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06270353, + "balance_loss_mlp": 0.01257165, + "epoch": 0.7571321208477378, + "flos": 10820781216000.0, + "grad_norm": 2.164551759300356, + "language_loss": 0.71882141, + "learning_rate": 5.873708220461522e-07, + "loss": 0.79554784, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 12593, + "time_per_iteration": 2.4659135341644287 + }, + { + "auxiliary_loss_clip": 0.0640605, + "auxiliary_loss_mlp": 0.01263408, + "balance_loss_clip": 0.06271473, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7571922441004059, + "flos": 18266045028480.0, + "grad_norm": 1.7009854752836593, + "language_loss": 0.66789973, + "learning_rate": 5.870951496521903e-07, + "loss": 0.74459434, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 12594, + "time_per_iteration": 2.6039915084838867 + }, + { + "auxiliary_loss_clip": 0.06412069, + "auxiliary_loss_mlp": 0.01266946, + "balance_loss_clip": 0.06273807, + "balance_loss_mlp": 0.01256599, + "epoch": 0.7572523673530738, + "flos": 22896660741120.0, + "grad_norm": 1.6054592725551893, + "language_loss": 0.80899853, + "learning_rate": 5.86819530835722e-07, + "loss": 0.88578868, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10339355, + "step": 12595, + "time_per_iteration": 2.571235179901123 + }, + { + "auxiliary_loss_clip": 0.06404546, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06273299, + "balance_loss_mlp": 0.01259166, + "epoch": 0.7573124906057418, + "flos": 21002679377280.0, + "grad_norm": 1.9975391540186431, + "language_loss": 0.71918476, + "learning_rate": 5.865439656071993e-07, + "loss": 0.7959137, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09185791, + "step": 12596, + "time_per_iteration": 2.551135301589966 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.0125737, + "epoch": 0.7573726138584097, + "flos": 20892534785280.0, + "grad_norm": 1.4422973158795673, + "language_loss": 0.80943167, + "learning_rate": 5.862684539770706e-07, + "loss": 0.8861059, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08764648, + "step": 12597, + "time_per_iteration": 2.4924709796905518 + }, + { + "auxiliary_loss_clip": 0.06410646, + "auxiliary_loss_mlp": 0.01265912, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.01255076, + "epoch": 0.7574327371110777, + "flos": 24536628852480.0, + "grad_norm": 1.549330306362407, + "language_loss": 0.83572793, + "learning_rate": 5.859929959557835e-07, + "loss": 0.91249353, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 12598, + "time_per_iteration": 2.5620381832122803 + }, + { + "auxiliary_loss_clip": 0.0640049, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01256656, + "epoch": 0.7574928603637456, + "flos": 23370711615360.0, + "grad_norm": 1.5128329006829742, + "language_loss": 0.62814438, + "learning_rate": 5.857175915537845e-07, + "loss": 0.70480788, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 12599, + "time_per_iteration": 2.517794132232666 + }, + { + "auxiliary_loss_clip": 0.06412463, + "auxiliary_loss_mlp": 0.01264733, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.01253641, + "epoch": 0.7575529836164137, + "flos": 13521301655040.0, + "grad_norm": 2.5096070763269047, + "language_loss": 0.63904691, + "learning_rate": 5.854422407815161e-07, + "loss": 0.71581882, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11096191, + "step": 12600, + "time_per_iteration": 2.4784600734710693 + }, + { + "auxiliary_loss_clip": 0.06401792, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01255709, + "epoch": 0.7576131068690816, + "flos": 19652754574080.0, + "grad_norm": 1.7462695207740195, + "language_loss": 0.66372097, + "learning_rate": 5.851669436494191e-07, + "loss": 0.74039608, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10003662, + "step": 12601, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06400197, + "auxiliary_loss_mlp": 0.01265733, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01256429, + "epoch": 0.7576732301217496, + "flos": 20054535701760.0, + "grad_norm": 2.2130741302051904, + "language_loss": 0.68382788, + "learning_rate": 5.848917001679335e-07, + "loss": 0.7604872, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09301758, + "step": 12602, + "time_per_iteration": 2.49818754196167 + }, + { + "auxiliary_loss_clip": 0.0640595, + "auxiliary_loss_mlp": 0.0126578, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01255373, + "epoch": 0.7577333533744176, + "flos": 15382439418240.0, + "grad_norm": 1.7531421277811328, + "language_loss": 0.67018741, + "learning_rate": 5.846165103474967e-07, + "loss": 0.74690473, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10412598, + "step": 12603, + "time_per_iteration": 2.4679315090179443 + }, + { + "auxiliary_loss_clip": 0.06399174, + "auxiliary_loss_mlp": 0.0126693, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01257441, + "epoch": 0.7577934766270855, + "flos": 17900671299840.0, + "grad_norm": 2.0091560992358417, + "language_loss": 0.62072337, + "learning_rate": 5.843413741985439e-07, + "loss": 0.69738448, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09484863, + "step": 12604, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.01268866, + "balance_loss_clip": 0.0627261, + "balance_loss_mlp": 0.01258256, + "epoch": 0.7578535998797535, + "flos": 21619760371200.0, + "grad_norm": 1.8724094104834093, + "language_loss": 0.80161738, + "learning_rate": 5.840662917315076e-07, + "loss": 0.87835866, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10614014, + "step": 12605, + "time_per_iteration": 2.4841203689575195 + }, + { + "auxiliary_loss_clip": 0.06405874, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06269677, + "balance_loss_mlp": 0.01256863, + "epoch": 0.7579137231324214, + "flos": 18484237860480.0, + "grad_norm": 2.5250222349386866, + "language_loss": 0.80021864, + "learning_rate": 5.837912629568198e-07, + "loss": 0.87695181, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10577393, + "step": 12606, + "time_per_iteration": 2.4846410751342773 + }, + { + "auxiliary_loss_clip": 0.06398265, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01254048, + "epoch": 0.7579738463850895, + "flos": 23261195928960.0, + "grad_norm": 1.3978882073919028, + "language_loss": 0.73257685, + "learning_rate": 5.835162878849087e-07, + "loss": 0.8091861, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08612061, + "step": 12607, + "time_per_iteration": 2.5159242153167725 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273781, + "balance_loss_mlp": 0.01260798, + "epoch": 0.7580339696377574, + "flos": 14032137271680.0, + "grad_norm": 1.9743130927740786, + "language_loss": 0.74911094, + "learning_rate": 5.83241366526202e-07, + "loss": 0.82593894, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09759521, + "step": 12608, + "time_per_iteration": 2.497614622116089 + }, + { + "auxiliary_loss_clip": 0.06404, + "auxiliary_loss_mlp": 0.01265498, + "balance_loss_clip": 0.06272872, + "balance_loss_mlp": 0.01255335, + "epoch": 0.7580940928904254, + "flos": 25089825507840.0, + "grad_norm": 1.4850994343846526, + "language_loss": 0.71440935, + "learning_rate": 5.829664988911245e-07, + "loss": 0.79110432, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10162354, + "step": 12609, + "time_per_iteration": 2.5046613216400146 + }, + { + "auxiliary_loss_clip": 0.06403238, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7581542161430933, + "flos": 23842288794240.0, + "grad_norm": 1.5362768058581475, + "language_loss": 0.81678033, + "learning_rate": 5.826916849901007e-07, + "loss": 0.89347494, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.11071777, + "step": 12610, + "time_per_iteration": 2.517946243286133 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01262988, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01252921, + "epoch": 0.7582143393957613, + "flos": 22243591618560.0, + "grad_norm": 1.594141702958548, + "language_loss": 0.70561087, + "learning_rate": 5.824169248335488e-07, + "loss": 0.78232837, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10070801, + "step": 12611, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.06402324, + "auxiliary_loss_mlp": 0.01265013, + "balance_loss_clip": 0.0626975, + "balance_loss_mlp": 0.0125516, + "epoch": 0.7582744626484292, + "flos": 21112865896320.0, + "grad_norm": 1.5348173916293948, + "language_loss": 0.70921582, + "learning_rate": 5.821422184318893e-07, + "loss": 0.78588921, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09857178, + "step": 12612, + "time_per_iteration": 3.989048719406128 + }, + { + "auxiliary_loss_clip": 0.06410398, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273097, + "balance_loss_mlp": 0.01254641, + "epoch": 0.7583345859010973, + "flos": 24611120732160.0, + "grad_norm": 1.3541649077655429, + "language_loss": 0.60250545, + "learning_rate": 5.818675657955397e-07, + "loss": 0.6792562, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10028076, + "step": 12613, + "time_per_iteration": 2.5280654430389404 + }, + { + "auxiliary_loss_clip": 0.06406002, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06272647, + "balance_loss_mlp": 0.01255367, + "epoch": 0.7583947091537652, + "flos": 33555167326080.0, + "grad_norm": 1.434876816663814, + "language_loss": 0.60180938, + "learning_rate": 5.815929669349135e-07, + "loss": 0.67852372, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10064697, + "step": 12614, + "time_per_iteration": 2.6500730514526367 + }, + { + "auxiliary_loss_clip": 0.06408043, + "auxiliary_loss_mlp": 0.01264127, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01253976, + "epoch": 0.7584548324064332, + "flos": 20127266645760.0, + "grad_norm": 1.6646286333989884, + "language_loss": 0.73613036, + "learning_rate": 5.813184218604246e-07, + "loss": 0.81285203, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1015625, + "step": 12615, + "time_per_iteration": 2.5028393268585205 + }, + { + "auxiliary_loss_clip": 0.06306437, + "auxiliary_loss_mlp": 0.01253251, + "balance_loss_clip": 0.06250888, + "balance_loss_mlp": 0.01251755, + "epoch": 0.7585149556591012, + "flos": 70424064069120.0, + "grad_norm": 0.8421080448004001, + "language_loss": 0.67521149, + "learning_rate": 5.810439305824828e-07, + "loss": 0.75080836, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01496124, + "step": 12616, + "time_per_iteration": 3.1849849224090576 + }, + { + "auxiliary_loss_clip": 0.06408077, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06270512, + "balance_loss_mlp": 0.01252779, + "epoch": 0.7585750789117691, + "flos": 16149342712320.0, + "grad_norm": 1.7878130457508898, + "language_loss": 0.84241217, + "learning_rate": 5.807694931114979e-07, + "loss": 0.9191215, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10076904, + "step": 12617, + "time_per_iteration": 2.4973013401031494 + }, + { + "auxiliary_loss_clip": 0.06407297, + "auxiliary_loss_mlp": 0.01262597, + "balance_loss_clip": 0.06272709, + "balance_loss_mlp": 0.01253257, + "epoch": 0.7586352021644371, + "flos": 17498848245120.0, + "grad_norm": 2.3587408181523544, + "language_loss": 0.74931777, + "learning_rate": 5.804951094578757e-07, + "loss": 0.82601666, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09338379, + "step": 12618, + "time_per_iteration": 2.494654417037964 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01262457, + "balance_loss_clip": 0.06271516, + "balance_loss_mlp": 0.01251967, + "epoch": 0.758695325417105, + "flos": 17280990829440.0, + "grad_norm": 2.0665265442485485, + "language_loss": 0.77541107, + "learning_rate": 5.802207796320209e-07, + "loss": 0.852139, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1048584, + "step": 12619, + "time_per_iteration": 2.5350186824798584 + }, + { + "auxiliary_loss_clip": 0.06403962, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.01255751, + "epoch": 0.7587554486697731, + "flos": 29503128856320.0, + "grad_norm": 1.7154948098726508, + "language_loss": 0.82232845, + "learning_rate": 5.79946503644337e-07, + "loss": 0.89902753, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10180664, + "step": 12620, + "time_per_iteration": 2.5445215702056885 + }, + { + "auxiliary_loss_clip": 0.06409103, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06271064, + "balance_loss_mlp": 0.0125651, + "epoch": 0.758815571922441, + "flos": 16105262664960.0, + "grad_norm": 2.254667976985654, + "language_loss": 0.82809436, + "learning_rate": 5.796722815052242e-07, + "loss": 0.90486276, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11236572, + "step": 12621, + "time_per_iteration": 3.918266534805298 + }, + { + "auxiliary_loss_clip": 0.0640413, + "auxiliary_loss_mlp": 0.01267456, + "balance_loss_clip": 0.06271367, + "balance_loss_mlp": 0.01257717, + "epoch": 0.758875695175109, + "flos": 16149258858240.0, + "grad_norm": 1.986087185770293, + "language_loss": 0.73904622, + "learning_rate": 5.7939811322508e-07, + "loss": 0.81576204, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12622, + "time_per_iteration": 2.4622373580932617 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06255639, + "balance_loss_mlp": 0.01252096, + "epoch": 0.7589358184277769, + "flos": 68482019589120.0, + "grad_norm": 0.8176590581901009, + "language_loss": 0.60799408, + "learning_rate": 5.791239988143024e-07, + "loss": 0.68363619, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01221466, + "step": 12623, + "time_per_iteration": 3.143218755722046 + }, + { + "auxiliary_loss_clip": 0.06401753, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.01254349, + "epoch": 0.7589959416804449, + "flos": 20053445598720.0, + "grad_norm": 1.8387445657701582, + "language_loss": 0.67715496, + "learning_rate": 5.788499382832847e-07, + "loss": 0.75380242, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08636475, + "step": 12624, + "time_per_iteration": 3.9293882846832275 + }, + { + "auxiliary_loss_clip": 0.06401351, + "auxiliary_loss_mlp": 0.01266658, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7590560649331128, + "flos": 18777970748160.0, + "grad_norm": 1.6859497284261105, + "language_loss": 0.76178044, + "learning_rate": 5.785759316424196e-07, + "loss": 0.83846056, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09625244, + "step": 12625, + "time_per_iteration": 2.4780449867248535 + }, + { + "auxiliary_loss_clip": 0.06401481, + "auxiliary_loss_mlp": 0.01264022, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.0125383, + "epoch": 0.7591161881857809, + "flos": 29833017580800.0, + "grad_norm": 1.7327397977395311, + "language_loss": 0.63387203, + "learning_rate": 5.783019789020977e-07, + "loss": 0.71052712, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10198975, + "step": 12626, + "time_per_iteration": 2.5631775856018066 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255715, + "epoch": 0.7591763114384488, + "flos": 20308884370560.0, + "grad_norm": 1.7841706388815284, + "language_loss": 0.74468005, + "learning_rate": 5.780280800727084e-07, + "loss": 0.82140952, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09844971, + "step": 12627, + "time_per_iteration": 2.469609260559082 + }, + { + "auxiliary_loss_clip": 0.06408302, + "auxiliary_loss_mlp": 0.012668, + "balance_loss_clip": 0.06272177, + "balance_loss_mlp": 0.01257412, + "epoch": 0.7592364346911168, + "flos": 20819887695360.0, + "grad_norm": 2.5677146388224728, + "language_loss": 0.69222355, + "learning_rate": 5.777542351646356e-07, + "loss": 0.76897466, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09387207, + "step": 12628, + "time_per_iteration": 2.520756483078003 + }, + { + "auxiliary_loss_clip": 0.06418896, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257951, + "epoch": 0.7592965579437848, + "flos": 21257866586880.0, + "grad_norm": 2.617063400341695, + "language_loss": 0.62842494, + "learning_rate": 5.774804441882648e-07, + "loss": 0.70529878, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 12629, + "time_per_iteration": 3.9617972373962402 + }, + { + "auxiliary_loss_clip": 0.06400847, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06271888, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7593566811964527, + "flos": 26220802792320.0, + "grad_norm": 1.4187303097446593, + "language_loss": 0.7784214, + "learning_rate": 5.772067071539786e-07, + "loss": 0.85507464, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09844971, + "step": 12630, + "time_per_iteration": 2.5400242805480957 + }, + { + "auxiliary_loss_clip": 0.0631338, + "auxiliary_loss_mlp": 0.01256151, + "balance_loss_clip": 0.06257843, + "balance_loss_mlp": 0.01255109, + "epoch": 0.7594168044491207, + "flos": 71258122010880.0, + "grad_norm": 0.8178625518129599, + "language_loss": 0.61609149, + "learning_rate": 5.769330240721562e-07, + "loss": 0.69178677, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01042175, + "step": 12631, + "time_per_iteration": 3.2121753692626953 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01256188, + "epoch": 0.7594769277017887, + "flos": 26620319859840.0, + "grad_norm": 1.723696706430517, + "language_loss": 0.74189103, + "learning_rate": 5.766593949531767e-07, + "loss": 0.81869459, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11547852, + "step": 12632, + "time_per_iteration": 2.633206605911255 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 0.06272458, + "balance_loss_mlp": 0.01252743, + "epoch": 0.7595370509544567, + "flos": 17600523575040.0, + "grad_norm": 1.7631507541187388, + "language_loss": 0.75345957, + "learning_rate": 5.763858198074154e-07, + "loss": 0.83016121, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.1050415, + "step": 12633, + "time_per_iteration": 2.4908735752105713 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7595971742071246, + "flos": 18008551831680.0, + "grad_norm": 1.9259614725215357, + "language_loss": 0.73589694, + "learning_rate": 5.76112298645246e-07, + "loss": 0.81258494, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09240723, + "step": 12634, + "time_per_iteration": 2.463972330093384 + }, + { + "auxiliary_loss_clip": 0.06401845, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01256715, + "epoch": 0.7596572974597926, + "flos": 28847921454720.0, + "grad_norm": 1.6183361542433332, + "language_loss": 0.65202701, + "learning_rate": 5.758388314770408e-07, + "loss": 0.72870934, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09661865, + "step": 12635, + "time_per_iteration": 2.5608267784118652 + }, + { + "auxiliary_loss_clip": 0.06408376, + "auxiliary_loss_mlp": 0.01262438, + "balance_loss_clip": 0.06272096, + "balance_loss_mlp": 0.01252252, + "epoch": 0.7597174207124605, + "flos": 14288037240960.0, + "grad_norm": 1.6247637528825494, + "language_loss": 0.69144988, + "learning_rate": 5.7556541831317e-07, + "loss": 0.76815796, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10186768, + "step": 12636, + "time_per_iteration": 2.4801905155181885 + }, + { + "auxiliary_loss_clip": 0.0640962, + "auxiliary_loss_mlp": 0.01262748, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01252103, + "epoch": 0.7597775439651285, + "flos": 21695300426880.0, + "grad_norm": 1.9394255431745338, + "language_loss": 0.81419599, + "learning_rate": 5.752920591640018e-07, + "loss": 0.89091963, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10650635, + "step": 12637, + "time_per_iteration": 2.535862922668457 + }, + { + "auxiliary_loss_clip": 0.06405479, + "auxiliary_loss_mlp": 0.01261246, + "balance_loss_clip": 0.06269705, + "balance_loss_mlp": 0.01251781, + "epoch": 0.7598376672177964, + "flos": 36110100096000.0, + "grad_norm": 1.8287091414841325, + "language_loss": 0.66797674, + "learning_rate": 5.750187540399017e-07, + "loss": 0.74464405, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09460449, + "step": 12638, + "time_per_iteration": 2.620074987411499 + }, + { + "auxiliary_loss_clip": 0.06408533, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01254135, + "epoch": 0.7598977904704645, + "flos": 18338147066880.0, + "grad_norm": 2.2175642348047746, + "language_loss": 0.65482736, + "learning_rate": 5.747455029512323e-07, + "loss": 0.73156428, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.11022949, + "step": 12639, + "time_per_iteration": 2.495577096939087 + }, + { + "auxiliary_loss_clip": 0.06406114, + "auxiliary_loss_mlp": 0.01266924, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.0125706, + "epoch": 0.7599579137231324, + "flos": 20198697851520.0, + "grad_norm": 2.4320385733819814, + "language_loss": 0.69979274, + "learning_rate": 5.744723059083572e-07, + "loss": 0.77652305, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09863281, + "step": 12640, + "time_per_iteration": 2.5001392364501953 + }, + { + "auxiliary_loss_clip": 0.06408872, + "auxiliary_loss_mlp": 0.01266047, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254788, + "epoch": 0.7600180369758004, + "flos": 24031746875520.0, + "grad_norm": 1.6154408738671377, + "language_loss": 0.66895354, + "learning_rate": 5.741991629216343e-07, + "loss": 0.74570274, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11260986, + "step": 12641, + "time_per_iteration": 2.5159339904785156 + }, + { + "auxiliary_loss_clip": 0.064065, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06269056, + "balance_loss_mlp": 0.01254865, + "epoch": 0.7600781602284684, + "flos": 18995534674560.0, + "grad_norm": 2.038376474313416, + "language_loss": 0.6667732, + "learning_rate": 5.73926074001422e-07, + "loss": 0.74349207, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10522461, + "step": 12642, + "time_per_iteration": 2.4950852394104004 + }, + { + "auxiliary_loss_clip": 0.06405585, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.0627634, + "balance_loss_mlp": 0.01256571, + "epoch": 0.7601382834811363, + "flos": 26074670071680.0, + "grad_norm": 1.8779608812077913, + "language_loss": 0.75724566, + "learning_rate": 5.736530391580765e-07, + "loss": 0.83396, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0927124, + "step": 12643, + "time_per_iteration": 2.660304069519043 + }, + { + "auxiliary_loss_clip": 0.06411186, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06275575, + "balance_loss_mlp": 0.01254219, + "epoch": 0.7601984067338043, + "flos": 18850324348800.0, + "grad_norm": 1.8216194715113248, + "language_loss": 0.78901958, + "learning_rate": 5.733800584019508e-07, + "loss": 0.86578548, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11187744, + "step": 12644, + "time_per_iteration": 2.513680934906006 + }, + { + "auxiliary_loss_clip": 0.06404514, + "auxiliary_loss_mlp": 0.01261707, + "balance_loss_clip": 0.06268981, + "balance_loss_mlp": 0.01251801, + "epoch": 0.7602585299864723, + "flos": 24653607552000.0, + "grad_norm": 1.4015203810474768, + "language_loss": 0.807042, + "learning_rate": 5.731071317433957e-07, + "loss": 0.88370419, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09912109, + "step": 12645, + "time_per_iteration": 2.7170186042785645 + }, + { + "auxiliary_loss_clip": 0.06406523, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06271391, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7603186532391403, + "flos": 23848913266560.0, + "grad_norm": 1.4313892113151905, + "language_loss": 0.7345466, + "learning_rate": 5.728342591927611e-07, + "loss": 0.81128347, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1038208, + "step": 12646, + "time_per_iteration": 2.7041969299316406 + }, + { + "auxiliary_loss_clip": 0.06405969, + "auxiliary_loss_mlp": 0.01267521, + "balance_loss_clip": 0.06275387, + "balance_loss_mlp": 0.0125842, + "epoch": 0.7603787764918082, + "flos": 22206387605760.0, + "grad_norm": 1.8247890758149474, + "language_loss": 0.67541718, + "learning_rate": 5.725614407603949e-07, + "loss": 0.75215209, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09100342, + "step": 12647, + "time_per_iteration": 2.631646156311035 + }, + { + "auxiliary_loss_clip": 0.06309351, + "auxiliary_loss_mlp": 0.01254415, + "balance_loss_clip": 0.06253824, + "balance_loss_mlp": 0.01253126, + "epoch": 0.7604388997444762, + "flos": 54104549713920.0, + "grad_norm": 0.6718107108151633, + "language_loss": 0.48995575, + "learning_rate": 5.722886764566415e-07, + "loss": 0.56559336, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01289368, + "step": 12648, + "time_per_iteration": 3.0884687900543213 + }, + { + "auxiliary_loss_clip": 0.06397881, + "auxiliary_loss_mlp": 0.01264414, + "balance_loss_clip": 0.06268241, + "balance_loss_mlp": 0.01255801, + "epoch": 0.7604990229971441, + "flos": 19687904161920.0, + "grad_norm": 1.3891263247246097, + "language_loss": 0.76770478, + "learning_rate": 5.720159662918451e-07, + "loss": 0.84432769, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08612061, + "step": 12649, + "time_per_iteration": 2.4948225021362305 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01263835, + "balance_loss_clip": 0.06269015, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7605591462498121, + "flos": 25234993906560.0, + "grad_norm": 1.5285209228148775, + "language_loss": 0.6904434, + "learning_rate": 5.717433102763462e-07, + "loss": 0.76708949, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09332275, + "step": 12650, + "time_per_iteration": 2.5328054428100586 + }, + { + "auxiliary_loss_clip": 0.06313049, + "auxiliary_loss_mlp": 0.01254535, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01253279, + "epoch": 0.76061926950248, + "flos": 66803505799680.0, + "grad_norm": 0.7352332079053004, + "language_loss": 0.62801003, + "learning_rate": 5.714707084204838e-07, + "loss": 0.70368588, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01255798, + "step": 12651, + "time_per_iteration": 4.553870916366577 + }, + { + "auxiliary_loss_clip": 0.06400903, + "auxiliary_loss_mlp": 0.01266142, + "balance_loss_clip": 0.06269742, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7606793927551481, + "flos": 25345473914880.0, + "grad_norm": 1.3627527735409288, + "language_loss": 0.71875393, + "learning_rate": 5.711981607345951e-07, + "loss": 0.79542446, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09515381, + "step": 12652, + "time_per_iteration": 2.5254390239715576 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.0126807, + "balance_loss_clip": 0.06270062, + "balance_loss_mlp": 0.01258229, + "epoch": 0.760739516007816, + "flos": 18229553775360.0, + "grad_norm": 1.992377129366734, + "language_loss": 0.80116236, + "learning_rate": 5.709256672290152e-07, + "loss": 0.87787497, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09838867, + "step": 12653, + "time_per_iteration": 2.475878953933716 + }, + { + "auxiliary_loss_clip": 0.06406933, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01255248, + "epoch": 0.760799639260484, + "flos": 22564717591680.0, + "grad_norm": 1.5079651219958228, + "language_loss": 0.80019051, + "learning_rate": 5.706532279140785e-07, + "loss": 0.87691557, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10327148, + "step": 12654, + "time_per_iteration": 2.4968621730804443 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01268021, + "balance_loss_clip": 0.0627185, + "balance_loss_mlp": 0.01256953, + "epoch": 0.760859762513152, + "flos": 22315819438080.0, + "grad_norm": 2.0930481497067968, + "language_loss": 0.79525441, + "learning_rate": 5.703808428001136e-07, + "loss": 0.87201554, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12655, + "time_per_iteration": 2.5296621322631836 + }, + { + "auxiliary_loss_clip": 0.06400845, + "auxiliary_loss_mlp": 0.01263727, + "balance_loss_clip": 0.06271712, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7609198857658199, + "flos": 24870919916160.0, + "grad_norm": 1.5227214319467992, + "language_loss": 0.68902338, + "learning_rate": 5.701085118974505e-07, + "loss": 0.76566911, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08068848, + "step": 12656, + "time_per_iteration": 2.541064739227295 + }, + { + "auxiliary_loss_clip": 0.06410336, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06272005, + "balance_loss_mlp": 0.01256913, + "epoch": 0.760980009018488, + "flos": 16842424959360.0, + "grad_norm": 2.207190684629195, + "language_loss": 0.73558354, + "learning_rate": 5.698362352164164e-07, + "loss": 0.81236219, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10632324, + "step": 12657, + "time_per_iteration": 2.492959499359131 + }, + { + "auxiliary_loss_clip": 0.06312352, + "auxiliary_loss_mlp": 0.01255494, + "balance_loss_clip": 0.06256969, + "balance_loss_mlp": 0.01254303, + "epoch": 0.7610401322711559, + "flos": 61248198355200.0, + "grad_norm": 0.8387316949065597, + "language_loss": 0.65017879, + "learning_rate": 5.695640127673347e-07, + "loss": 0.7258572, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01189423, + "step": 12658, + "time_per_iteration": 3.0756664276123047 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01255691, + "epoch": 0.7611002555238239, + "flos": 19645920466560.0, + "grad_norm": 1.5440041293540654, + "language_loss": 0.7962606, + "learning_rate": 5.692918445605293e-07, + "loss": 0.87289846, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.1036377, + "step": 12659, + "time_per_iteration": 2.5428194999694824 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06270297, + "balance_loss_mlp": 0.01255138, + "epoch": 0.7611603787764918, + "flos": 26879825554560.0, + "grad_norm": 1.4756646122445365, + "language_loss": 0.69142807, + "learning_rate": 5.690197306063209e-07, + "loss": 0.76810235, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12660, + "time_per_iteration": 4.065267086029053 + }, + { + "auxiliary_loss_clip": 0.06405179, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01254759, + "epoch": 0.7612205020291598, + "flos": 27351570441600.0, + "grad_norm": 1.631280435549901, + "language_loss": 0.70831662, + "learning_rate": 5.687476709150281e-07, + "loss": 0.78501016, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09423828, + "step": 12661, + "time_per_iteration": 2.541351079940796 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01255447, + "epoch": 0.7612806252818277, + "flos": 29322265818240.0, + "grad_norm": 1.4447529833958312, + "language_loss": 0.84105158, + "learning_rate": 5.68475665496966e-07, + "loss": 0.91775477, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09637451, + "step": 12662, + "time_per_iteration": 2.654850721359253 + }, + { + "auxiliary_loss_clip": 0.06407061, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_clip": 0.06273231, + "balance_loss_mlp": 0.0125974, + "epoch": 0.7613407485344957, + "flos": 19032067854720.0, + "grad_norm": 1.6864772603594633, + "language_loss": 0.69368142, + "learning_rate": 5.682037143624505e-07, + "loss": 0.77044225, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09283447, + "step": 12663, + "time_per_iteration": 3.926262617111206 + }, + { + "auxiliary_loss_clip": 0.06401078, + "auxiliary_loss_mlp": 0.01261863, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01253119, + "epoch": 0.7614008717871636, + "flos": 23262369886080.0, + "grad_norm": 1.4557154718503251, + "language_loss": 0.70039129, + "learning_rate": 5.67931817521794e-07, + "loss": 0.77702069, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08746338, + "step": 12664, + "time_per_iteration": 2.5054047107696533 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.06272146, + "balance_loss_mlp": 0.01257724, + "epoch": 0.7614609950398317, + "flos": 21586329792000.0, + "grad_norm": 1.5992794514882698, + "language_loss": 0.79600513, + "learning_rate": 5.676599749853066e-07, + "loss": 0.87278712, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10949707, + "step": 12665, + "time_per_iteration": 2.599689483642578 + }, + { + "auxiliary_loss_clip": 0.06403616, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06274, + "balance_loss_mlp": 0.01258097, + "epoch": 0.7615211182924996, + "flos": 29285523002880.0, + "grad_norm": 1.8706140840131316, + "language_loss": 0.88243985, + "learning_rate": 5.673881867632959e-07, + "loss": 0.95915639, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09936523, + "step": 12666, + "time_per_iteration": 2.5415070056915283 + }, + { + "auxiliary_loss_clip": 0.06408084, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.01256472, + "epoch": 0.7615812415451676, + "flos": 13266156372480.0, + "grad_norm": 2.0248103449736963, + "language_loss": 0.83170617, + "learning_rate": 5.671164528660693e-07, + "loss": 0.90845418, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10253906, + "step": 12667, + "time_per_iteration": 2.4605929851531982 + }, + { + "auxiliary_loss_clip": 0.06401822, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06271848, + "balance_loss_mlp": 0.01255266, + "epoch": 0.7616413647978356, + "flos": 18590105894400.0, + "grad_norm": 1.5289232692663373, + "language_loss": 0.78628266, + "learning_rate": 5.668447733039296e-07, + "loss": 0.86294812, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09460449, + "step": 12668, + "time_per_iteration": 3.9720492362976074 + }, + { + "auxiliary_loss_clip": 0.06403045, + "auxiliary_loss_mlp": 0.01263851, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01254469, + "epoch": 0.7617014880505035, + "flos": 18522280414080.0, + "grad_norm": 1.6924413590277445, + "language_loss": 0.64424682, + "learning_rate": 5.6657314808718e-07, + "loss": 0.72091579, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09381104, + "step": 12669, + "time_per_iteration": 2.4817726612091064 + }, + { + "auxiliary_loss_clip": 0.0640804, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06272504, + "balance_loss_mlp": 0.01255403, + "epoch": 0.7617616113031715, + "flos": 24980184040320.0, + "grad_norm": 1.625894991767346, + "language_loss": 0.66114289, + "learning_rate": 5.663015772261202e-07, + "loss": 0.7378875, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11016846, + "step": 12670, + "time_per_iteration": 2.531942844390869 + }, + { + "auxiliary_loss_clip": 0.06408806, + "auxiliary_loss_mlp": 0.01267085, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.01256821, + "epoch": 0.7618217345558395, + "flos": 23301796032000.0, + "grad_norm": 1.6261426293442, + "language_loss": 0.72730261, + "learning_rate": 5.660300607310493e-07, + "loss": 0.80406153, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10266113, + "step": 12671, + "time_per_iteration": 2.555997133255005 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01263811, + "balance_loss_clip": 0.06269476, + "balance_loss_mlp": 0.01254686, + "epoch": 0.7618818578085075, + "flos": 25489803772800.0, + "grad_norm": 1.5891051355844041, + "language_loss": 0.73397064, + "learning_rate": 5.657585986122613e-07, + "loss": 0.81062061, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09124756, + "step": 12672, + "time_per_iteration": 2.5291435718536377 + }, + { + "auxiliary_loss_clip": 0.06309396, + "auxiliary_loss_mlp": 0.01251395, + "balance_loss_clip": 0.06254143, + "balance_loss_mlp": 0.01250371, + "epoch": 0.7619419810611754, + "flos": 61168633303680.0, + "grad_norm": 0.7432915400862121, + "language_loss": 0.56722248, + "learning_rate": 5.654871908800506e-07, + "loss": 0.64283037, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12673, + "time_per_iteration": 3.134204864501953 + }, + { + "auxiliary_loss_clip": 0.06401986, + "auxiliary_loss_mlp": 0.01266349, + "balance_loss_clip": 0.06268115, + "balance_loss_mlp": 0.01256371, + "epoch": 0.7620021043138434, + "flos": 23265430560000.0, + "grad_norm": 1.7103416042413309, + "language_loss": 0.74883175, + "learning_rate": 5.652158375447102e-07, + "loss": 0.82551509, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09985352, + "step": 12674, + "time_per_iteration": 2.507917642593384 + }, + { + "auxiliary_loss_clip": 0.06398366, + "auxiliary_loss_mlp": 0.01265734, + "balance_loss_clip": 0.06268415, + "balance_loss_mlp": 0.01257002, + "epoch": 0.7620622275665113, + "flos": 25089490091520.0, + "grad_norm": 2.2685266755673847, + "language_loss": 0.72315985, + "learning_rate": 5.649445386165286e-07, + "loss": 0.79980081, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08728027, + "step": 12675, + "time_per_iteration": 2.5618882179260254 + }, + { + "auxiliary_loss_clip": 0.0640251, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272566, + "balance_loss_mlp": 0.01254911, + "epoch": 0.7621223508191793, + "flos": 20160864933120.0, + "grad_norm": 1.9392842077457455, + "language_loss": 0.7294848, + "learning_rate": 5.646732941057936e-07, + "loss": 0.80615485, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09588623, + "step": 12676, + "time_per_iteration": 2.4889016151428223 + }, + { + "auxiliary_loss_clip": 0.06412819, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01255125, + "epoch": 0.7621824740718472, + "flos": 18005323449600.0, + "grad_norm": 3.350191420610347, + "language_loss": 0.54523033, + "learning_rate": 5.644021040227927e-07, + "loss": 0.62201345, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10357666, + "step": 12677, + "time_per_iteration": 2.479889392852783 + }, + { + "auxiliary_loss_clip": 0.06403828, + "auxiliary_loss_mlp": 0.01261111, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125102, + "epoch": 0.7622425973245153, + "flos": 21732085169280.0, + "grad_norm": 1.924626512292605, + "language_loss": 0.79229861, + "learning_rate": 5.641309683778064e-07, + "loss": 0.86894798, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10101318, + "step": 12678, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.0640271, + "auxiliary_loss_mlp": 0.0126229, + "balance_loss_clip": 0.06268604, + "balance_loss_mlp": 0.01252694, + "epoch": 0.7623027205771832, + "flos": 19724563123200.0, + "grad_norm": 2.0630846770322133, + "language_loss": 0.77460301, + "learning_rate": 5.638598871811175e-07, + "loss": 0.85125297, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09588623, + "step": 12679, + "time_per_iteration": 2.5036091804504395 + }, + { + "auxiliary_loss_clip": 0.06405875, + "auxiliary_loss_mlp": 0.01264484, + "balance_loss_clip": 0.06272455, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7623628438298512, + "flos": 23995800673920.0, + "grad_norm": 1.5339500294685882, + "language_loss": 0.79924572, + "learning_rate": 5.635888604430059e-07, + "loss": 0.87594938, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0921936, + "step": 12680, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06404954, + "auxiliary_loss_mlp": 0.01265568, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7624229670825191, + "flos": 22352184910080.0, + "grad_norm": 1.9657419278541466, + "language_loss": 0.62747079, + "learning_rate": 5.633178881737493e-07, + "loss": 0.70417601, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09918213, + "step": 12681, + "time_per_iteration": 2.5365428924560547 + }, + { + "auxiliary_loss_clip": 0.06399923, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06270124, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7624830903351871, + "flos": 22718522960640.0, + "grad_norm": 2.3247043396178335, + "language_loss": 0.76673269, + "learning_rate": 5.63046970383622e-07, + "loss": 0.84339643, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09649658, + "step": 12682, + "time_per_iteration": 2.5021934509277344 + }, + { + "auxiliary_loss_clip": 0.06400375, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7625432135878552, + "flos": 25600870759680.0, + "grad_norm": 1.6797876321314247, + "language_loss": 0.68138206, + "learning_rate": 5.627761070828974e-07, + "loss": 0.75803858, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08728027, + "step": 12683, + "time_per_iteration": 2.5445661544799805 + }, + { + "auxiliary_loss_clip": 0.06401844, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06269109, + "balance_loss_mlp": 0.01256078, + "epoch": 0.7626033368405231, + "flos": 23994417081600.0, + "grad_norm": 1.9075173015451221, + "language_loss": 0.83300132, + "learning_rate": 5.625052982818472e-07, + "loss": 0.90967631, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09564209, + "step": 12684, + "time_per_iteration": 2.545069932937622 + }, + { + "auxiliary_loss_clip": 0.06406077, + "auxiliary_loss_mlp": 0.0126848, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01258264, + "epoch": 0.7626634600931911, + "flos": 12603150541440.0, + "grad_norm": 1.7483092151310056, + "language_loss": 0.82848525, + "learning_rate": 5.622345439907396e-07, + "loss": 0.90523082, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10211182, + "step": 12685, + "time_per_iteration": 2.5331482887268066 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.0627293, + "balance_loss_mlp": 0.0125692, + "epoch": 0.762723583345859, + "flos": 26329731500160.0, + "grad_norm": 1.6739148989024917, + "language_loss": 0.77748114, + "learning_rate": 5.619638442198422e-07, + "loss": 0.85420561, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0970459, + "step": 12686, + "time_per_iteration": 2.529662609100342 + }, + { + "auxiliary_loss_clip": 0.06407499, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254204, + "epoch": 0.762783706598527, + "flos": 21913325550720.0, + "grad_norm": 1.6937601944819862, + "language_loss": 0.72154206, + "learning_rate": 5.616931989794198e-07, + "loss": 0.79826409, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1050415, + "step": 12687, + "time_per_iteration": 2.486391544342041 + }, + { + "auxiliary_loss_clip": 0.06404573, + "auxiliary_loss_mlp": 0.01266259, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01256263, + "epoch": 0.7628438298511949, + "flos": 15344983843200.0, + "grad_norm": 3.1096174425988656, + "language_loss": 0.65146047, + "learning_rate": 5.614226082797369e-07, + "loss": 0.72816885, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09991455, + "step": 12688, + "time_per_iteration": 2.486335515975952 + }, + { + "auxiliary_loss_clip": 0.06397952, + "auxiliary_loss_mlp": 0.01267437, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7629039531038629, + "flos": 13011388433280.0, + "grad_norm": 1.9926161434676632, + "language_loss": 0.70924902, + "learning_rate": 5.611520721310515e-07, + "loss": 0.78590292, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 12689, + "time_per_iteration": 2.5037851333618164 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.01264555, + "balance_loss_clip": 0.06273138, + "balance_loss_mlp": 0.01254493, + "epoch": 0.7629640763565309, + "flos": 26177938629120.0, + "grad_norm": 1.870564488725158, + "language_loss": 0.70028657, + "learning_rate": 5.608815905436238e-07, + "loss": 0.77705473, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10058594, + "step": 12690, + "time_per_iteration": 2.533437728881836 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01262782, + "balance_loss_clip": 0.06271788, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7630241996091989, + "flos": 36802553437440.0, + "grad_norm": 1.3861533863354163, + "language_loss": 0.69748205, + "learning_rate": 5.606111635277109e-07, + "loss": 0.77414727, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0960083, + "step": 12691, + "time_per_iteration": 4.015859127044678 + }, + { + "auxiliary_loss_clip": 0.06401307, + "auxiliary_loss_mlp": 0.01260884, + "balance_loss_clip": 0.06269828, + "balance_loss_mlp": 0.01252003, + "epoch": 0.7630843228618668, + "flos": 21841600855680.0, + "grad_norm": 1.5523680121734649, + "language_loss": 0.82087487, + "learning_rate": 5.603407910935662e-07, + "loss": 0.89749676, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08880615, + "step": 12692, + "time_per_iteration": 2.5389950275421143 + }, + { + "auxiliary_loss_clip": 0.06409267, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06275039, + "balance_loss_mlp": 0.01255993, + "epoch": 0.7631444461145348, + "flos": 12645385799040.0, + "grad_norm": 2.3344184890866564, + "language_loss": 0.77300888, + "learning_rate": 5.600704732514438e-07, + "loss": 0.84975493, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09344482, + "step": 12693, + "time_per_iteration": 2.445725917816162 + }, + { + "auxiliary_loss_clip": 0.064025, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06269249, + "balance_loss_mlp": 0.01257643, + "epoch": 0.7632045693672027, + "flos": 16842215324160.0, + "grad_norm": 1.879033723685166, + "language_loss": 0.7319355, + "learning_rate": 5.598002100115933e-07, + "loss": 0.80864131, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10437012, + "step": 12694, + "time_per_iteration": 2.480100154876709 + }, + { + "auxiliary_loss_clip": 0.06401706, + "auxiliary_loss_mlp": 0.01263272, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7632646926198707, + "flos": 22023763632000.0, + "grad_norm": 1.7362595054615078, + "language_loss": 0.70577729, + "learning_rate": 5.595300013842625e-07, + "loss": 0.78242707, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09619141, + "step": 12695, + "time_per_iteration": 2.484557867050171 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01265272, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7633248158725388, + "flos": 23120974920960.0, + "grad_norm": 1.5006607242564833, + "language_loss": 0.72539437, + "learning_rate": 5.592598473796985e-07, + "loss": 0.80209941, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10150146, + "step": 12696, + "time_per_iteration": 2.535898208618164 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06268622, + "balance_loss_mlp": 0.01255568, + "epoch": 0.7633849391252067, + "flos": 10894518408960.0, + "grad_norm": 2.5144564572490116, + "language_loss": 0.71505952, + "learning_rate": 5.589897480081453e-07, + "loss": 0.79172248, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09692383, + "step": 12697, + "time_per_iteration": 2.4591684341430664 + }, + { + "auxiliary_loss_clip": 0.06400824, + "auxiliary_loss_mlp": 0.01260764, + "balance_loss_clip": 0.06270981, + "balance_loss_mlp": 0.01251179, + "epoch": 0.7634450623778747, + "flos": 21000163754880.0, + "grad_norm": 1.880904163415611, + "language_loss": 0.67272222, + "learning_rate": 5.587197032798461e-07, + "loss": 0.74933803, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0958252, + "step": 12698, + "time_per_iteration": 2.5230917930603027 + }, + { + "auxiliary_loss_clip": 0.06403317, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01255529, + "epoch": 0.7635051856305426, + "flos": 18888366902400.0, + "grad_norm": 1.5780107163253119, + "language_loss": 0.72484887, + "learning_rate": 5.5844971320504e-07, + "loss": 0.8015368, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0994873, + "step": 12699, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0640247, + "auxiliary_loss_mlp": 0.0126796, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01258906, + "epoch": 0.7635653088832106, + "flos": 34795492588800.0, + "grad_norm": 1.9895424194721678, + "language_loss": 0.73307264, + "learning_rate": 5.581797777939648e-07, + "loss": 0.8097769, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09069824, + "step": 12700, + "time_per_iteration": 4.06644868850708 + }, + { + "auxiliary_loss_clip": 0.0640322, + "auxiliary_loss_mlp": 0.01269407, + "balance_loss_clip": 0.06270028, + "balance_loss_mlp": 0.01259608, + "epoch": 0.7636254321358785, + "flos": 23183978791680.0, + "grad_norm": 1.8289500414025046, + "language_loss": 0.69277215, + "learning_rate": 5.579098970568574e-07, + "loss": 0.76949847, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09796143, + "step": 12701, + "time_per_iteration": 2.4977099895477295 + }, + { + "auxiliary_loss_clip": 0.06401876, + "auxiliary_loss_mlp": 0.01262857, + "balance_loss_clip": 0.06269674, + "balance_loss_mlp": 0.01253243, + "epoch": 0.7636855553885465, + "flos": 21331729560960.0, + "grad_norm": 1.5301057508918974, + "language_loss": 0.64290726, + "learning_rate": 5.576400710039508e-07, + "loss": 0.7195546, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09606934, + "step": 12702, + "time_per_iteration": 2.4910881519317627 + }, + { + "auxiliary_loss_clip": 0.06402961, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01256234, + "epoch": 0.7637456786412145, + "flos": 28665674824320.0, + "grad_norm": 1.963609141873143, + "language_loss": 0.66137874, + "learning_rate": 5.57370299645477e-07, + "loss": 0.738065, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09429932, + "step": 12703, + "time_per_iteration": 3.9583401679992676 + }, + { + "auxiliary_loss_clip": 0.06406517, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273364, + "balance_loss_mlp": 0.01256721, + "epoch": 0.7638058018938825, + "flos": 21913577112960.0, + "grad_norm": 2.0195903258707757, + "language_loss": 0.83478069, + "learning_rate": 5.571005829916668e-07, + "loss": 0.91150421, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 12704, + "time_per_iteration": 2.5038557052612305 + }, + { + "auxiliary_loss_clip": 0.0640365, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06271724, + "balance_loss_mlp": 0.01258686, + "epoch": 0.7638659251465504, + "flos": 29651777199360.0, + "grad_norm": 1.4030805409759646, + "language_loss": 0.68150222, + "learning_rate": 5.568309210527469e-07, + "loss": 0.75822091, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09527588, + "step": 12705, + "time_per_iteration": 2.5900156497955322 + }, + { + "auxiliary_loss_clip": 0.06400676, + "auxiliary_loss_mlp": 0.01264845, + "balance_loss_clip": 0.06270821, + "balance_loss_mlp": 0.01255672, + "epoch": 0.7639260483992184, + "flos": 26148449191680.0, + "grad_norm": 1.5410038713701188, + "language_loss": 0.74538386, + "learning_rate": 5.565613138389427e-07, + "loss": 0.82203901, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09173584, + "step": 12706, + "time_per_iteration": 2.559558391571045 + }, + { + "auxiliary_loss_clip": 0.06403012, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.0627191, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7639861716518863, + "flos": 20162835504000.0, + "grad_norm": 1.755600712442579, + "language_loss": 0.78974855, + "learning_rate": 5.562917613604781e-07, + "loss": 0.86643398, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09191895, + "step": 12707, + "time_per_iteration": 3.932704210281372 + }, + { + "auxiliary_loss_clip": 0.06401724, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.01255283, + "epoch": 0.7640462949045543, + "flos": 18588219177600.0, + "grad_norm": 6.1940407959342885, + "language_loss": 0.80090815, + "learning_rate": 5.560222636275751e-07, + "loss": 0.87757736, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0993042, + "step": 12708, + "time_per_iteration": 2.4813318252563477 + }, + { + "auxiliary_loss_clip": 0.06315993, + "auxiliary_loss_mlp": 0.0125198, + "balance_loss_clip": 0.06260599, + "balance_loss_mlp": 0.01250996, + "epoch": 0.7641064181572224, + "flos": 68342972538240.0, + "grad_norm": 0.7968333839429529, + "language_loss": 0.5539844, + "learning_rate": 5.557528206504521e-07, + "loss": 0.62966412, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00983429, + "step": 12709, + "time_per_iteration": 3.1384057998657227 + }, + { + "auxiliary_loss_clip": 0.0640793, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.0125925, + "epoch": 0.7641665414098903, + "flos": 17974995471360.0, + "grad_norm": 1.6571298349962345, + "language_loss": 0.63628614, + "learning_rate": 5.554834324393271e-07, + "loss": 0.71306419, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10614014, + "step": 12710, + "time_per_iteration": 2.503221273422241 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270481, + "balance_loss_mlp": 0.01255705, + "epoch": 0.7642266646625583, + "flos": 21258537419520.0, + "grad_norm": 2.423165664894835, + "language_loss": 0.64622939, + "learning_rate": 5.552140990044154e-07, + "loss": 0.72294724, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10327148, + "step": 12711, + "time_per_iteration": 2.48382568359375 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06270531, + "balance_loss_mlp": 0.01257216, + "epoch": 0.7642867879152262, + "flos": 22754469162240.0, + "grad_norm": 1.499831368340144, + "language_loss": 0.73271233, + "learning_rate": 5.549448203559293e-07, + "loss": 0.80940747, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09454346, + "step": 12712, + "time_per_iteration": 2.518559455871582 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7643469111678942, + "flos": 23339000044800.0, + "grad_norm": 4.100229806424162, + "language_loss": 0.80473924, + "learning_rate": 5.546755965040804e-07, + "loss": 0.88138747, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08837891, + "step": 12713, + "time_per_iteration": 2.495666742324829 + }, + { + "auxiliary_loss_clip": 0.0640631, + "auxiliary_loss_mlp": 0.01266494, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256237, + "epoch": 0.7644070344205621, + "flos": 19861891165440.0, + "grad_norm": 2.1468665185465396, + "language_loss": 0.84159482, + "learning_rate": 5.544064274590776e-07, + "loss": 0.91832292, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10266113, + "step": 12714, + "time_per_iteration": 2.4871368408203125 + }, + { + "auxiliary_loss_clip": 0.06406413, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01257603, + "epoch": 0.7644671576732301, + "flos": 22097123481600.0, + "grad_norm": 1.4736408355385546, + "language_loss": 0.73087925, + "learning_rate": 5.541373132311287e-07, + "loss": 0.80761683, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09741211, + "step": 12715, + "time_per_iteration": 2.4971745014190674 + }, + { + "auxiliary_loss_clip": 0.06399769, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06267397, + "balance_loss_mlp": 0.01256252, + "epoch": 0.7645272809258981, + "flos": 25488084764160.0, + "grad_norm": 1.606219528134415, + "language_loss": 0.63579881, + "learning_rate": 5.538682538304376e-07, + "loss": 0.71244764, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08868408, + "step": 12716, + "time_per_iteration": 2.5588536262512207 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01264337, + "balance_loss_clip": 0.06273409, + "balance_loss_mlp": 0.01254353, + "epoch": 0.7645874041785661, + "flos": 21548035676160.0, + "grad_norm": 1.605402904200963, + "language_loss": 0.80340159, + "learning_rate": 5.535992492672068e-07, + "loss": 0.88015091, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09991455, + "step": 12717, + "time_per_iteration": 2.4905505180358887 + }, + { + "auxiliary_loss_clip": 0.06401056, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01255342, + "epoch": 0.764647527431234, + "flos": 20637096013440.0, + "grad_norm": 2.3928982518870474, + "language_loss": 0.669339, + "learning_rate": 5.53330299551638e-07, + "loss": 0.74599743, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09448242, + "step": 12718, + "time_per_iteration": 2.492809772491455 + }, + { + "auxiliary_loss_clip": 0.06399414, + "auxiliary_loss_mlp": 0.01266678, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01257368, + "epoch": 0.764707650683902, + "flos": 21440490560640.0, + "grad_norm": 1.7155178939343805, + "language_loss": 0.77496254, + "learning_rate": 5.530614046939286e-07, + "loss": 0.85162342, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09301758, + "step": 12719, + "time_per_iteration": 2.5259573459625244 + }, + { + "auxiliary_loss_clip": 0.06404945, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7647677739365699, + "flos": 22717852128000.0, + "grad_norm": 1.9590152643999037, + "language_loss": 0.69958895, + "learning_rate": 5.527925647042754e-07, + "loss": 0.77628434, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09564209, + "step": 12720, + "time_per_iteration": 2.539653778076172 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01262819, + "balance_loss_clip": 0.06272593, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7648278971892379, + "flos": 21330429822720.0, + "grad_norm": 1.6704748814369004, + "language_loss": 0.73973656, + "learning_rate": 5.52523779592875e-07, + "loss": 0.81640649, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10107422, + "step": 12721, + "time_per_iteration": 2.501253128051758 + }, + { + "auxiliary_loss_clip": 0.06403898, + "auxiliary_loss_mlp": 0.01264362, + "balance_loss_clip": 0.06270562, + "balance_loss_mlp": 0.01254771, + "epoch": 0.764888020441906, + "flos": 20673545339520.0, + "grad_norm": 1.706168153440744, + "language_loss": 0.73528266, + "learning_rate": 5.522550493699163e-07, + "loss": 0.81196523, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09594727, + "step": 12722, + "time_per_iteration": 2.509871244430542 + }, + { + "auxiliary_loss_clip": 0.06399025, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06269681, + "balance_loss_mlp": 0.01256015, + "epoch": 0.7649481436945739, + "flos": 25089532018560.0, + "grad_norm": 1.7286135730297545, + "language_loss": 0.74329245, + "learning_rate": 5.519863740455912e-07, + "loss": 0.81993717, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09423828, + "step": 12723, + "time_per_iteration": 2.510096549987793 + }, + { + "auxiliary_loss_clip": 0.06404193, + "auxiliary_loss_mlp": 0.01262404, + "balance_loss_clip": 0.06269242, + "balance_loss_mlp": 0.01252688, + "epoch": 0.7650082669472419, + "flos": 24907998147840.0, + "grad_norm": 2.2850113448580958, + "language_loss": 0.73361677, + "learning_rate": 5.517177536300881e-07, + "loss": 0.81028277, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09710693, + "step": 12724, + "time_per_iteration": 2.5588150024414062 + }, + { + "auxiliary_loss_clip": 0.06401032, + "auxiliary_loss_mlp": 0.01264201, + "balance_loss_clip": 0.06271203, + "balance_loss_mlp": 0.01254885, + "epoch": 0.7650683901999098, + "flos": 14652614355840.0, + "grad_norm": 1.6932286249415067, + "language_loss": 0.84691983, + "learning_rate": 5.514491881335935e-07, + "loss": 0.92357218, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09320068, + "step": 12725, + "time_per_iteration": 2.4555823802948 + }, + { + "auxiliary_loss_clip": 0.06405662, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01260433, + "epoch": 0.7651285134525778, + "flos": 26358466250880.0, + "grad_norm": 1.7988072143781486, + "language_loss": 0.77533686, + "learning_rate": 5.511806775662901e-07, + "loss": 0.85210061, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10284424, + "step": 12726, + "time_per_iteration": 2.56742000579834 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01263268, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7651886367052457, + "flos": 26653373095680.0, + "grad_norm": 1.6652210765488402, + "language_loss": 0.70600379, + "learning_rate": 5.509122219383615e-07, + "loss": 0.78267229, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09240723, + "step": 12727, + "time_per_iteration": 2.5245282649993896 + }, + { + "auxiliary_loss_clip": 0.06395786, + "auxiliary_loss_mlp": 0.01263203, + "balance_loss_clip": 0.06267853, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7652487599579137, + "flos": 25709967175680.0, + "grad_norm": 1.6422371786213563, + "language_loss": 0.80038959, + "learning_rate": 5.506438212599864e-07, + "loss": 0.87697947, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09179688, + "step": 12728, + "time_per_iteration": 2.553881883621216 + }, + { + "auxiliary_loss_clip": 0.064078, + "auxiliary_loss_mlp": 0.01267492, + "balance_loss_clip": 0.0627337, + "balance_loss_mlp": 0.01257395, + "epoch": 0.7653088832105817, + "flos": 28593237369600.0, + "grad_norm": 1.6909382906919501, + "language_loss": 0.55773109, + "learning_rate": 5.503754755413424e-07, + "loss": 0.63448405, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10089111, + "step": 12729, + "time_per_iteration": 2.561567783355713 + }, + { + "auxiliary_loss_clip": 0.06402748, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7653690064632497, + "flos": 23373311091840.0, + "grad_norm": 1.5255211318254533, + "language_loss": 0.77756214, + "learning_rate": 5.501071847926055e-07, + "loss": 0.85425532, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10131836, + "step": 12730, + "time_per_iteration": 3.951883316040039 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01263677, + "balance_loss_clip": 0.06275389, + "balance_loss_mlp": 0.01253496, + "epoch": 0.7654291297159176, + "flos": 15778560395520.0, + "grad_norm": 1.5538691638081712, + "language_loss": 0.68886495, + "learning_rate": 5.498389490239495e-07, + "loss": 0.7656011, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10180664, + "step": 12731, + "time_per_iteration": 2.496400833129883 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06273277, + "balance_loss_mlp": 0.01255997, + "epoch": 0.7654892529685856, + "flos": 18038460539520.0, + "grad_norm": 1.970235991711743, + "language_loss": 0.70561087, + "learning_rate": 5.495707682455471e-07, + "loss": 0.78233999, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10174561, + "step": 12732, + "time_per_iteration": 2.4463298320770264 + }, + { + "auxiliary_loss_clip": 0.06407348, + "auxiliary_loss_mlp": 0.01267052, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01257009, + "epoch": 0.7655493762212535, + "flos": 27243522201600.0, + "grad_norm": 1.6975746826212326, + "language_loss": 0.7867943, + "learning_rate": 5.493026424675653e-07, + "loss": 0.86353827, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10040283, + "step": 12733, + "time_per_iteration": 2.5465524196624756 + }, + { + "auxiliary_loss_clip": 0.06404738, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06275003, + "balance_loss_mlp": 0.01254843, + "epoch": 0.7656094994739215, + "flos": 20779706862720.0, + "grad_norm": 1.7438651719482663, + "language_loss": 0.78086102, + "learning_rate": 5.490345717001726e-07, + "loss": 0.85755318, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09637451, + "step": 12734, + "time_per_iteration": 2.491992235183716 + }, + { + "auxiliary_loss_clip": 0.06409705, + "auxiliary_loss_mlp": 0.01265243, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7656696227265896, + "flos": 23045896062720.0, + "grad_norm": 1.5457458237043498, + "language_loss": 0.73303032, + "learning_rate": 5.48766555953535e-07, + "loss": 0.80977982, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1060791, + "step": 12735, + "time_per_iteration": 2.549952507019043 + }, + { + "auxiliary_loss_clip": 0.06403875, + "auxiliary_loss_mlp": 0.01265362, + "balance_loss_clip": 0.0627028, + "balance_loss_mlp": 0.01255956, + "epoch": 0.7657297459792575, + "flos": 27532810823040.0, + "grad_norm": 1.38702410103644, + "language_loss": 0.72968668, + "learning_rate": 5.484985952378145e-07, + "loss": 0.80637902, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09399414, + "step": 12736, + "time_per_iteration": 2.5478687286376953 + }, + { + "auxiliary_loss_clip": 0.06409203, + "auxiliary_loss_mlp": 0.0126645, + "balance_loss_clip": 0.06272754, + "balance_loss_mlp": 0.01255399, + "epoch": 0.7657898692319255, + "flos": 17134103422080.0, + "grad_norm": 1.7853161990922843, + "language_loss": 0.77847868, + "learning_rate": 5.482306895631728e-07, + "loss": 0.85523522, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12737, + "time_per_iteration": 2.517828941345215 + }, + { + "auxiliary_loss_clip": 0.06403487, + "auxiliary_loss_mlp": 0.01264987, + "balance_loss_clip": 0.06271316, + "balance_loss_mlp": 0.01254795, + "epoch": 0.7658499924845934, + "flos": 21471363590400.0, + "grad_norm": 1.7993008956393386, + "language_loss": 0.7689963, + "learning_rate": 5.479628389397699e-07, + "loss": 0.84568107, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10186768, + "step": 12738, + "time_per_iteration": 2.4858741760253906 + }, + { + "auxiliary_loss_clip": 0.06409841, + "auxiliary_loss_mlp": 0.01265376, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01254748, + "epoch": 0.7659101157372614, + "flos": 29504302813440.0, + "grad_norm": 1.7653019874765563, + "language_loss": 0.6329987, + "learning_rate": 5.476950433777603e-07, + "loss": 0.70975083, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 12739, + "time_per_iteration": 3.9952597618103027 + }, + { + "auxiliary_loss_clip": 0.06407788, + "auxiliary_loss_mlp": 0.0126759, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01256718, + "epoch": 0.7659702389899293, + "flos": 18557765418240.0, + "grad_norm": 1.7669010799995182, + "language_loss": 0.7909317, + "learning_rate": 5.474273028873004e-07, + "loss": 0.8676855, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10870361, + "step": 12740, + "time_per_iteration": 2.5115749835968018 + }, + { + "auxiliary_loss_clip": 0.06403244, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06271347, + "balance_loss_mlp": 0.01253853, + "epoch": 0.7660303622425974, + "flos": 23555767357440.0, + "grad_norm": 1.6620793532611546, + "language_loss": 0.65799433, + "learning_rate": 5.471596174785429e-07, + "loss": 0.73466468, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09924316, + "step": 12741, + "time_per_iteration": 2.55269718170166 + }, + { + "auxiliary_loss_clip": 0.06404097, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272512, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7660904854952653, + "flos": 18922761803520.0, + "grad_norm": 1.4348808707369967, + "language_loss": 0.76128972, + "learning_rate": 5.468919871616386e-07, + "loss": 0.83799386, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09832764, + "step": 12742, + "time_per_iteration": 3.9655463695526123 + }, + { + "auxiliary_loss_clip": 0.06397024, + "auxiliary_loss_mlp": 0.01262102, + "balance_loss_clip": 0.06269021, + "balance_loss_mlp": 0.01253274, + "epoch": 0.7661506087479333, + "flos": 23153986229760.0, + "grad_norm": 1.3105418877806154, + "language_loss": 0.76677555, + "learning_rate": 5.46624411946736e-07, + "loss": 0.84336686, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08831787, + "step": 12743, + "time_per_iteration": 2.4942922592163086 + }, + { + "auxiliary_loss_clip": 0.064053, + "auxiliary_loss_mlp": 0.01263354, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253918, + "epoch": 0.7662107320006012, + "flos": 17571411480960.0, + "grad_norm": 1.8622912064646877, + "language_loss": 0.75256228, + "learning_rate": 5.463568918439805e-07, + "loss": 0.82924885, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09442139, + "step": 12744, + "time_per_iteration": 2.500877618789673 + }, + { + "auxiliary_loss_clip": 0.06405517, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06271944, + "balance_loss_mlp": 0.01255078, + "epoch": 0.7662708552532692, + "flos": 22308524133120.0, + "grad_norm": 3.023764218410669, + "language_loss": 0.70912051, + "learning_rate": 5.460894268635181e-07, + "loss": 0.78582633, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09979248, + "step": 12745, + "time_per_iteration": 2.4632673263549805 + }, + { + "auxiliary_loss_clip": 0.06404217, + "auxiliary_loss_mlp": 0.01263005, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01252938, + "epoch": 0.7663309785059371, + "flos": 15747477730560.0, + "grad_norm": 2.4148009048873975, + "language_loss": 0.77143252, + "learning_rate": 5.458220170154896e-07, + "loss": 0.84810472, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10058594, + "step": 12746, + "time_per_iteration": 2.470808506011963 + }, + { + "auxiliary_loss_clip": 0.06317573, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06262261, + "balance_loss_mlp": 0.0125142, + "epoch": 0.7663911017586051, + "flos": 62184503877120.0, + "grad_norm": 0.6541980070594193, + "language_loss": 0.56711543, + "learning_rate": 5.455546623100362e-07, + "loss": 0.6428166, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01132202, + "step": 12747, + "time_per_iteration": 4.652554273605347 + }, + { + "auxiliary_loss_clip": 0.06402487, + "auxiliary_loss_mlp": 0.01263124, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.01254393, + "epoch": 0.7664512250112732, + "flos": 26513361722880.0, + "grad_norm": 1.4294052686303238, + "language_loss": 0.72911537, + "learning_rate": 5.452873627572956e-07, + "loss": 0.80577153, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08728027, + "step": 12748, + "time_per_iteration": 2.532306432723999 + }, + { + "auxiliary_loss_clip": 0.06404538, + "auxiliary_loss_mlp": 0.01268933, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01259348, + "epoch": 0.7665113482639411, + "flos": 16254497986560.0, + "grad_norm": 1.791719003468204, + "language_loss": 0.70015478, + "learning_rate": 5.450201183674052e-07, + "loss": 0.77688944, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0958252, + "step": 12749, + "time_per_iteration": 2.492206573486328 + }, + { + "auxiliary_loss_clip": 0.06405895, + "auxiliary_loss_mlp": 0.01264322, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01254136, + "epoch": 0.7665714715166091, + "flos": 27205102304640.0, + "grad_norm": 1.5075173450833508, + "language_loss": 0.73696417, + "learning_rate": 5.447529291504967e-07, + "loss": 0.81366634, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10180664, + "step": 12750, + "time_per_iteration": 2.6194586753845215 + }, + { + "auxiliary_loss_clip": 0.06403321, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.01255637, + "epoch": 0.766631594769277, + "flos": 21073900947840.0, + "grad_norm": 2.338667432338341, + "language_loss": 0.75889468, + "learning_rate": 5.444857951167026e-07, + "loss": 0.83557701, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09265137, + "step": 12751, + "time_per_iteration": 2.535900354385376 + }, + { + "auxiliary_loss_clip": 0.06405959, + "auxiliary_loss_mlp": 0.01265211, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255442, + "epoch": 0.766691718021945, + "flos": 24104897089920.0, + "grad_norm": 1.8024081309521767, + "language_loss": 0.61214471, + "learning_rate": 5.442187162761537e-07, + "loss": 0.68885642, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09759521, + "step": 12752, + "time_per_iteration": 2.520057439804077 + }, + { + "auxiliary_loss_clip": 0.06407845, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255452, + "epoch": 0.7667518412746129, + "flos": 23447383701120.0, + "grad_norm": 2.502768793247081, + "language_loss": 0.68991947, + "learning_rate": 5.439516926389767e-07, + "loss": 0.76665711, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10467529, + "step": 12753, + "time_per_iteration": 2.5649516582489014 + }, + { + "auxiliary_loss_clip": 0.06405421, + "auxiliary_loss_mlp": 0.01267269, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01257339, + "epoch": 0.766811964527281, + "flos": 18154391063040.0, + "grad_norm": 2.2031278091751103, + "language_loss": 0.62667269, + "learning_rate": 5.436847242152971e-07, + "loss": 0.7033996, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09936523, + "step": 12754, + "time_per_iteration": 2.4367518424987793 + }, + { + "auxiliary_loss_clip": 0.06402913, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06272813, + "balance_loss_mlp": 0.01253426, + "epoch": 0.7668720877799489, + "flos": 19542023003520.0, + "grad_norm": 2.343791341299276, + "language_loss": 0.80305493, + "learning_rate": 5.434178110152401e-07, + "loss": 0.87971884, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10040283, + "step": 12755, + "time_per_iteration": 2.4789938926696777 + }, + { + "auxiliary_loss_clip": 0.06403362, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7669322110326169, + "flos": 22680899677440.0, + "grad_norm": 1.9246427907733588, + "language_loss": 0.70196575, + "learning_rate": 5.431509530489242e-07, + "loss": 0.77866018, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09820557, + "step": 12756, + "time_per_iteration": 2.4842453002929688 + }, + { + "auxiliary_loss_clip": 0.06408253, + "auxiliary_loss_mlp": 0.01265925, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01256621, + "epoch": 0.7669923342852848, + "flos": 26476702761600.0, + "grad_norm": 1.4236493885684283, + "language_loss": 0.70190722, + "learning_rate": 5.428841503264706e-07, + "loss": 0.77864897, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09307861, + "step": 12757, + "time_per_iteration": 2.5436339378356934 + }, + { + "auxiliary_loss_clip": 0.06405462, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7670524575379528, + "flos": 22862643183360.0, + "grad_norm": 1.8472558815325884, + "language_loss": 0.76448315, + "learning_rate": 5.426174028579955e-07, + "loss": 0.84118211, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10345459, + "step": 12758, + "time_per_iteration": 2.4789509773254395 + }, + { + "auxiliary_loss_clip": 0.06399853, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01255576, + "epoch": 0.7671125807906207, + "flos": 22458136798080.0, + "grad_norm": 1.6508827422801604, + "language_loss": 0.76464295, + "learning_rate": 5.423507106536156e-07, + "loss": 0.84129202, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0947876, + "step": 12759, + "time_per_iteration": 2.5259945392608643 + }, + { + "auxiliary_loss_clip": 0.0640488, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06270535, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7671727040432887, + "flos": 35380275033600.0, + "grad_norm": 1.982345292184502, + "language_loss": 0.68377602, + "learning_rate": 5.420840737234425e-07, + "loss": 0.7604605, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09368896, + "step": 12760, + "time_per_iteration": 2.5982978343963623 + }, + { + "auxiliary_loss_clip": 0.06406338, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.06272851, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7672328272959568, + "flos": 22502007210240.0, + "grad_norm": 1.3719850689198565, + "language_loss": 0.79309064, + "learning_rate": 5.418174920775871e-07, + "loss": 0.86981302, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12761, + "time_per_iteration": 2.5480268001556396 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01258289, + "epoch": 0.7672929505486247, + "flos": 22821372247680.0, + "grad_norm": 2.021114982719017, + "language_loss": 0.66376638, + "learning_rate": 5.415509657261589e-07, + "loss": 0.74048305, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09674072, + "step": 12762, + "time_per_iteration": 2.487494707107544 + }, + { + "auxiliary_loss_clip": 0.06406671, + "auxiliary_loss_mlp": 0.01262822, + "balance_loss_clip": 0.06272823, + "balance_loss_mlp": 0.01253148, + "epoch": 0.7673530738012927, + "flos": 20344956353280.0, + "grad_norm": 1.669517530242866, + "language_loss": 0.74410594, + "learning_rate": 5.412844946792639e-07, + "loss": 0.82080084, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09667969, + "step": 12763, + "time_per_iteration": 2.50715970993042 + }, + { + "auxiliary_loss_clip": 0.06406026, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06275215, + "balance_loss_mlp": 0.01254836, + "epoch": 0.7674131970539606, + "flos": 34942212288000.0, + "grad_norm": 1.4115021004744182, + "language_loss": 0.70948029, + "learning_rate": 5.410180789470067e-07, + "loss": 0.78618985, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10089111, + "step": 12764, + "time_per_iteration": 2.625321388244629 + }, + { + "auxiliary_loss_clip": 0.06405284, + "auxiliary_loss_mlp": 0.0126607, + "balance_loss_clip": 0.06274922, + "balance_loss_mlp": 0.01256241, + "epoch": 0.7674733203066286, + "flos": 28336247297280.0, + "grad_norm": 1.6715058951392505, + "language_loss": 0.69761688, + "learning_rate": 5.40751718539491e-07, + "loss": 0.77433044, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0982666, + "step": 12765, + "time_per_iteration": 2.6227502822875977 + }, + { + "auxiliary_loss_clip": 0.06399858, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 0.06270436, + "balance_loss_mlp": 0.01252769, + "epoch": 0.7675334435592965, + "flos": 16295307724800.0, + "grad_norm": 1.8004519699404298, + "language_loss": 0.6087966, + "learning_rate": 5.404854134668162e-07, + "loss": 0.6854142, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 12766, + "time_per_iteration": 2.4817140102386475 + }, + { + "auxiliary_loss_clip": 0.06319875, + "auxiliary_loss_mlp": 0.01254158, + "balance_loss_clip": 0.06264514, + "balance_loss_mlp": 0.01252872, + "epoch": 0.7675935668119646, + "flos": 64847778376320.0, + "grad_norm": 0.7247432278410384, + "language_loss": 0.6077764, + "learning_rate": 5.402191637390803e-07, + "loss": 0.68351674, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01286316, + "step": 12767, + "time_per_iteration": 3.2508630752563477 + }, + { + "auxiliary_loss_clip": 0.06402268, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.0125668, + "epoch": 0.7676536900646325, + "flos": 22682157488640.0, + "grad_norm": 1.91918463694606, + "language_loss": 0.69715631, + "learning_rate": 5.399529693663801e-07, + "loss": 0.77383935, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09356689, + "step": 12768, + "time_per_iteration": 2.502361297607422 + }, + { + "auxiliary_loss_clip": 0.06411647, + "auxiliary_loss_mlp": 0.01267577, + "balance_loss_clip": 0.06273838, + "balance_loss_mlp": 0.01256729, + "epoch": 0.7677138133173005, + "flos": 26946393223680.0, + "grad_norm": 1.5949336757988604, + "language_loss": 0.70845366, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7852459, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10864258, + "step": 12769, + "time_per_iteration": 2.554861068725586 + }, + { + "auxiliary_loss_clip": 0.0641087, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06275321, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7677739365699684, + "flos": 23805336343680.0, + "grad_norm": 1.7985045785763099, + "language_loss": 0.80694544, + "learning_rate": 5.394207467264611e-07, + "loss": 0.88372779, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10174561, + "step": 12770, + "time_per_iteration": 3.9488418102264404 + }, + { + "auxiliary_loss_clip": 0.06402189, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06272912, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7678340598226364, + "flos": 34463423658240.0, + "grad_norm": 1.5007452698192065, + "language_loss": 0.78956687, + "learning_rate": 5.391547184794245e-07, + "loss": 0.86622107, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08868408, + "step": 12771, + "time_per_iteration": 2.5934486389160156 + }, + { + "auxiliary_loss_clip": 0.06403628, + "auxiliary_loss_mlp": 0.01263065, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01253487, + "epoch": 0.7678941830753043, + "flos": 23848493996160.0, + "grad_norm": 1.2517341680866723, + "language_loss": 0.68444574, + "learning_rate": 5.388887456277876e-07, + "loss": 0.76111269, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12772, + "time_per_iteration": 2.5651042461395264 + }, + { + "auxiliary_loss_clip": 0.06401607, + "auxiliary_loss_mlp": 0.01265845, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01256893, + "epoch": 0.7679543063279723, + "flos": 25417995223680.0, + "grad_norm": 1.427251107853352, + "language_loss": 0.73993248, + "learning_rate": 5.386228281816349e-07, + "loss": 0.816607, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08953857, + "step": 12773, + "time_per_iteration": 2.5750787258148193 + }, + { + "auxiliary_loss_clip": 0.0639642, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7680144295806404, + "flos": 27969448049280.0, + "grad_norm": 1.5249418922144822, + "language_loss": 0.81278884, + "learning_rate": 5.383569661510512e-07, + "loss": 0.88940001, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0814209, + "step": 12774, + "time_per_iteration": 2.549635648727417 + }, + { + "auxiliary_loss_clip": 0.06401657, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06272675, + "balance_loss_mlp": 0.01254757, + "epoch": 0.7680745528333083, + "flos": 20419112816640.0, + "grad_norm": 2.7097792481139122, + "language_loss": 0.69999617, + "learning_rate": 5.380911595461177e-07, + "loss": 0.77665365, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09338379, + "step": 12775, + "time_per_iteration": 2.502872943878174 + }, + { + "auxiliary_loss_clip": 0.06317612, + "auxiliary_loss_mlp": 0.01254016, + "balance_loss_clip": 0.0626227, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7681346760859763, + "flos": 68423124568320.0, + "grad_norm": 0.6822831430052362, + "language_loss": 0.5694207, + "learning_rate": 5.378254083769147e-07, + "loss": 0.64513695, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01306915, + "step": 12776, + "time_per_iteration": 3.1927366256713867 + }, + { + "auxiliary_loss_clip": 0.0640178, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 0.06271428, + "balance_loss_mlp": 0.01255545, + "epoch": 0.7681947993386442, + "flos": 21257824659840.0, + "grad_norm": 1.8462760284119832, + "language_loss": 0.74373579, + "learning_rate": 5.375597126535188e-07, + "loss": 0.8204, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12777, + "time_per_iteration": 2.5175979137420654 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.0126398, + "balance_loss_clip": 0.06275662, + "balance_loss_mlp": 0.01254837, + "epoch": 0.7682549225913122, + "flos": 21404125088640.0, + "grad_norm": 1.9483232393983472, + "language_loss": 0.70101172, + "learning_rate": 5.372940723860043e-07, + "loss": 0.77773219, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09143066, + "step": 12778, + "time_per_iteration": 2.6068058013916016 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7683150458439801, + "flos": 23045518719360.0, + "grad_norm": 1.8309114800353317, + "language_loss": 0.70335215, + "learning_rate": 5.37028487584446e-07, + "loss": 0.7800526, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09619141, + "step": 12779, + "time_per_iteration": 4.003666639328003 + }, + { + "auxiliary_loss_clip": 0.0640587, + "auxiliary_loss_mlp": 0.01265519, + "balance_loss_clip": 0.062737, + "balance_loss_mlp": 0.01255898, + "epoch": 0.7683751690966482, + "flos": 67346361204480.0, + "grad_norm": 1.5118738364126798, + "language_loss": 0.58973181, + "learning_rate": 5.367629582589133e-07, + "loss": 0.66644573, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09619141, + "step": 12780, + "time_per_iteration": 2.915029525756836 + }, + { + "auxiliary_loss_clip": 0.06409752, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.06273384, + "balance_loss_mlp": 0.01258587, + "epoch": 0.7684352923493161, + "flos": 21805361164800.0, + "grad_norm": 2.2303773736896373, + "language_loss": 0.68361402, + "learning_rate": 5.364974844194759e-07, + "loss": 0.7603963, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12781, + "time_per_iteration": 4.043205976486206 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.01254428, + "epoch": 0.7684954156019841, + "flos": 25854548595840.0, + "grad_norm": 1.651939170673441, + "language_loss": 0.79629219, + "learning_rate": 5.362320660762016e-07, + "loss": 0.87297314, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0949707, + "step": 12782, + "time_per_iteration": 2.5380043983459473 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01263775, + "balance_loss_clip": 0.06272779, + "balance_loss_mlp": 0.01253719, + "epoch": 0.768555538854652, + "flos": 25454444549760.0, + "grad_norm": 1.9972993449433587, + "language_loss": 0.66687256, + "learning_rate": 5.35966703239153e-07, + "loss": 0.74357939, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10058594, + "step": 12783, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01262671, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01253069, + "epoch": 0.76861566210732, + "flos": 19652503011840.0, + "grad_norm": 1.5789937278772177, + "language_loss": 0.69208997, + "learning_rate": 5.357013959183938e-07, + "loss": 0.7687813, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 12784, + "time_per_iteration": 2.5100221633911133 + }, + { + "auxiliary_loss_clip": 0.06402996, + "auxiliary_loss_mlp": 0.01264042, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7686757853599879, + "flos": 22425586686720.0, + "grad_norm": 2.2747197635366074, + "language_loss": 0.80762935, + "learning_rate": 5.354361441239843e-07, + "loss": 0.88429976, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08551025, + "step": 12785, + "time_per_iteration": 2.4869916439056396 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.01255506, + "epoch": 0.768735908612656, + "flos": 47784659690880.0, + "grad_norm": 2.213863326437895, + "language_loss": 0.7748611, + "learning_rate": 5.351709478659836e-07, + "loss": 0.85155928, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10137939, + "step": 12786, + "time_per_iteration": 2.7327218055725098 + }, + { + "auxiliary_loss_clip": 0.06400453, + "auxiliary_loss_mlp": 0.01264363, + "balance_loss_clip": 0.06269495, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7687960318653239, + "flos": 30270996472320.0, + "grad_norm": 1.9359041928849132, + "language_loss": 0.58734947, + "learning_rate": 5.349058071544468e-07, + "loss": 0.66399765, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09515381, + "step": 12787, + "time_per_iteration": 4.117979288101196 + }, + { + "auxiliary_loss_clip": 0.06401558, + "auxiliary_loss_mlp": 0.01264466, + "balance_loss_clip": 0.06272475, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7688561551179919, + "flos": 19579562432640.0, + "grad_norm": 1.5619171139299415, + "language_loss": 0.76386726, + "learning_rate": 5.346407219994292e-07, + "loss": 0.84052753, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0949707, + "step": 12788, + "time_per_iteration": 2.5265915393829346 + }, + { + "auxiliary_loss_clip": 0.06405907, + "auxiliary_loss_mlp": 0.0126463, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7689162783706599, + "flos": 22790373436800.0, + "grad_norm": 1.5307962602577754, + "language_loss": 0.666574, + "learning_rate": 5.343756924109821e-07, + "loss": 0.74327934, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09844971, + "step": 12789, + "time_per_iteration": 2.5482897758483887 + }, + { + "auxiliary_loss_clip": 0.06407897, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01258842, + "epoch": 0.7689764016233278, + "flos": 34212764568960.0, + "grad_norm": 1.7716505240879148, + "language_loss": 0.68803114, + "learning_rate": 5.341107183991553e-07, + "loss": 0.76480138, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10290527, + "step": 12790, + "time_per_iteration": 2.6209323406219482 + }, + { + "auxiliary_loss_clip": 0.06403899, + "auxiliary_loss_mlp": 0.01263088, + "balance_loss_clip": 0.0627263, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7690365248759958, + "flos": 17280152288640.0, + "grad_norm": 1.3993850053379062, + "language_loss": 0.68957317, + "learning_rate": 5.338457999739969e-07, + "loss": 0.76624304, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09265137, + "step": 12791, + "time_per_iteration": 2.5464963912963867 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01255418, + "epoch": 0.7690966481286637, + "flos": 18229008723840.0, + "grad_norm": 1.5956237198168277, + "language_loss": 0.79798484, + "learning_rate": 5.335809371455526e-07, + "loss": 0.87464273, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09448242, + "step": 12792, + "time_per_iteration": 2.489346981048584 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01258999, + "epoch": 0.7691567713813318, + "flos": 21543004431360.0, + "grad_norm": 1.8308011822945844, + "language_loss": 0.73121727, + "learning_rate": 5.333161299238673e-07, + "loss": 0.80801225, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09814453, + "step": 12793, + "time_per_iteration": 2.558523416519165 + }, + { + "auxiliary_loss_clip": 0.06407025, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7692168946339997, + "flos": 39388568872320.0, + "grad_norm": 1.7835594774438226, + "language_loss": 0.63780582, + "learning_rate": 5.330513783189803e-07, + "loss": 0.7145232, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1003418, + "step": 12794, + "time_per_iteration": 2.6618335247039795 + }, + { + "auxiliary_loss_clip": 0.06408365, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06273225, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7692770178866677, + "flos": 25017010709760.0, + "grad_norm": 1.4664054108250584, + "language_loss": 0.76531231, + "learning_rate": 5.327866823409319e-07, + "loss": 0.84205556, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09619141, + "step": 12795, + "time_per_iteration": 2.5922963619232178 + }, + { + "auxiliary_loss_clip": 0.0640534, + "auxiliary_loss_mlp": 0.01263991, + "balance_loss_clip": 0.0627051, + "balance_loss_mlp": 0.01253453, + "epoch": 0.7693371411393356, + "flos": 24722984332800.0, + "grad_norm": 1.4884281283084904, + "language_loss": 0.72098613, + "learning_rate": 5.325220419997601e-07, + "loss": 0.79767948, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10540771, + "step": 12796, + "time_per_iteration": 2.5227742195129395 + }, + { + "auxiliary_loss_clip": 0.06403993, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01255994, + "epoch": 0.7693972643920036, + "flos": 15930311339520.0, + "grad_norm": 1.7278751632986438, + "language_loss": 0.64795017, + "learning_rate": 5.32257457305499e-07, + "loss": 0.72464675, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09667969, + "step": 12797, + "time_per_iteration": 2.503452777862549 + }, + { + "auxiliary_loss_clip": 0.06409369, + "auxiliary_loss_mlp": 0.0127561, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01264798, + "epoch": 0.7694573876446715, + "flos": 25412125438080.0, + "grad_norm": 1.8485649321852773, + "language_loss": 0.91645068, + "learning_rate": 5.319929282681823e-07, + "loss": 0.9933005, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10809326, + "step": 12798, + "time_per_iteration": 2.5266406536102295 + }, + { + "auxiliary_loss_clip": 0.06401522, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06268082, + "balance_loss_mlp": 0.01256489, + "epoch": 0.7695175108973396, + "flos": 16659800985600.0, + "grad_norm": 1.7639360291305515, + "language_loss": 0.82879943, + "learning_rate": 5.317284548978418e-07, + "loss": 0.90547353, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09387207, + "step": 12799, + "time_per_iteration": 2.4981637001037598 + }, + { + "auxiliary_loss_clip": 0.06404725, + "auxiliary_loss_mlp": 0.01268019, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01257862, + "epoch": 0.7695776341500075, + "flos": 13631697809280.0, + "grad_norm": 2.5788494866617513, + "language_loss": 0.78243637, + "learning_rate": 5.314640372045045e-07, + "loss": 0.85916382, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12800, + "time_per_iteration": 2.472907304763794 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270645, + "balance_loss_mlp": 0.01256182, + "epoch": 0.7696377574026755, + "flos": 24283034870400.0, + "grad_norm": 1.8264730167588297, + "language_loss": 0.84045184, + "learning_rate": 5.31199675198198e-07, + "loss": 0.9172219, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10620117, + "step": 12801, + "time_per_iteration": 2.53623366355896 + }, + { + "auxiliary_loss_clip": 0.06406119, + "auxiliary_loss_mlp": 0.01267538, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01257495, + "epoch": 0.7696978806553435, + "flos": 20929445308800.0, + "grad_norm": 1.8709548721646438, + "language_loss": 0.73054564, + "learning_rate": 5.30935368888947e-07, + "loss": 0.80728221, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1003418, + "step": 12802, + "time_per_iteration": 2.4759271144866943 + }, + { + "auxiliary_loss_clip": 0.06399865, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271532, + "balance_loss_mlp": 0.01255757, + "epoch": 0.7697580039080114, + "flos": 22936212668160.0, + "grad_norm": 1.8081953162086668, + "language_loss": 0.76470077, + "learning_rate": 5.306711182867747e-07, + "loss": 0.84135199, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.0949707, + "step": 12803, + "time_per_iteration": 2.5474445819854736 + }, + { + "auxiliary_loss_clip": 0.06313179, + "auxiliary_loss_mlp": 0.01253049, + "balance_loss_clip": 0.06258132, + "balance_loss_mlp": 0.01251863, + "epoch": 0.7698181271606794, + "flos": 68737751850240.0, + "grad_norm": 0.742546771949619, + "language_loss": 0.55879092, + "learning_rate": 5.304069234017001e-07, + "loss": 0.63445318, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01184082, + "step": 12804, + "time_per_iteration": 3.1489827632904053 + }, + { + "auxiliary_loss_clip": 0.06316254, + "auxiliary_loss_mlp": 0.0125264, + "balance_loss_clip": 0.0626114, + "balance_loss_mlp": 0.01251505, + "epoch": 0.7698782504133473, + "flos": 67430523502080.0, + "grad_norm": 0.7295540312789194, + "language_loss": 0.53939354, + "learning_rate": 5.301427842437429e-07, + "loss": 0.61508244, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0113678, + "step": 12805, + "time_per_iteration": 3.2659192085266113 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01270733, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01261047, + "epoch": 0.7699383736660154, + "flos": 22494879613440.0, + "grad_norm": 3.06352805467247, + "language_loss": 0.73035467, + "learning_rate": 5.298787008229187e-07, + "loss": 0.80711341, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09686279, + "step": 12806, + "time_per_iteration": 2.4905054569244385 + }, + { + "auxiliary_loss_clip": 0.06401073, + "auxiliary_loss_mlp": 0.01266133, + "balance_loss_clip": 0.06269582, + "balance_loss_mlp": 0.01256704, + "epoch": 0.7699984969186833, + "flos": 21545520053760.0, + "grad_norm": 1.6739965963260217, + "language_loss": 0.75159943, + "learning_rate": 5.296146731492408e-07, + "loss": 0.82827145, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09423828, + "step": 12807, + "time_per_iteration": 2.5074682235717773 + }, + { + "auxiliary_loss_clip": 0.06406098, + "auxiliary_loss_mlp": 0.01264768, + "balance_loss_clip": 0.0626993, + "balance_loss_mlp": 0.01254098, + "epoch": 0.7700586201713513, + "flos": 21724412520960.0, + "grad_norm": 2.037865665188592, + "language_loss": 0.8067742, + "learning_rate": 5.293507012327218e-07, + "loss": 0.88348287, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10681152, + "step": 12808, + "time_per_iteration": 3.8791632652282715 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01266704, + "balance_loss_clip": 0.06271963, + "balance_loss_mlp": 0.01256595, + "epoch": 0.7701187434240192, + "flos": 27863580015360.0, + "grad_norm": 1.7006184108687237, + "language_loss": 0.7921378, + "learning_rate": 5.290867850833718e-07, + "loss": 0.8688817, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10113525, + "step": 12809, + "time_per_iteration": 2.5961480140686035 + }, + { + "auxiliary_loss_clip": 0.06399591, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.06270431, + "balance_loss_mlp": 0.01254594, + "epoch": 0.7701788666766872, + "flos": 28628848154880.0, + "grad_norm": 1.4421816702879584, + "language_loss": 0.70197344, + "learning_rate": 5.288229247111993e-07, + "loss": 0.77861011, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.0947876, + "step": 12810, + "time_per_iteration": 2.6107945442199707 + }, + { + "auxiliary_loss_clip": 0.06406891, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06271058, + "balance_loss_mlp": 0.01254769, + "epoch": 0.7702389899293551, + "flos": 14251671768960.0, + "grad_norm": 2.2769003713635967, + "language_loss": 0.78979844, + "learning_rate": 5.285591201262079e-07, + "loss": 0.8665303, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11523438, + "step": 12811, + "time_per_iteration": 2.555101156234741 + }, + { + "auxiliary_loss_clip": 0.06317817, + "auxiliary_loss_mlp": 0.01251839, + "balance_loss_clip": 0.06262816, + "balance_loss_mlp": 0.01250771, + "epoch": 0.7702991131820232, + "flos": 70593816441600.0, + "grad_norm": 0.7969175673938892, + "language_loss": 0.56677693, + "learning_rate": 5.28295371338402e-07, + "loss": 0.64247346, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01069641, + "step": 12812, + "time_per_iteration": 3.1775879859924316 + }, + { + "auxiliary_loss_clip": 0.06404653, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.0627086, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7703592364346911, + "flos": 25486449609600.0, + "grad_norm": 1.6911953299431426, + "language_loss": 0.72016954, + "learning_rate": 5.280316783577836e-07, + "loss": 0.79686838, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10327148, + "step": 12813, + "time_per_iteration": 2.525716781616211 + }, + { + "auxiliary_loss_clip": 0.06403896, + "auxiliary_loss_mlp": 0.01265029, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01254962, + "epoch": 0.7704193596873591, + "flos": 19286877720960.0, + "grad_norm": 1.5106493285856717, + "language_loss": 0.66542912, + "learning_rate": 5.27768041194351e-07, + "loss": 0.74211836, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12814, + "time_per_iteration": 2.511730432510376 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01267694, + "balance_loss_clip": 0.06271755, + "balance_loss_mlp": 0.01258288, + "epoch": 0.7704794829400271, + "flos": 23665031481600.0, + "grad_norm": 1.765991608700586, + "language_loss": 0.65916228, + "learning_rate": 5.275044598581018e-07, + "loss": 0.73587441, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09399414, + "step": 12815, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.06402738, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06270017, + "balance_loss_mlp": 0.01253324, + "epoch": 0.770539606192695, + "flos": 18995283112320.0, + "grad_norm": 3.1094364137223325, + "language_loss": 0.65588892, + "learning_rate": 5.272409343590322e-07, + "loss": 0.73254538, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0958252, + "step": 12816, + "time_per_iteration": 2.5682597160339355 + }, + { + "auxiliary_loss_clip": 0.06410483, + "auxiliary_loss_mlp": 0.01271453, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01261321, + "epoch": 0.770599729445363, + "flos": 11833605843840.0, + "grad_norm": 2.2637093644731685, + "language_loss": 0.72246104, + "learning_rate": 5.26977464707133e-07, + "loss": 0.79928041, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10131836, + "step": 12817, + "time_per_iteration": 2.485805034637451 + }, + { + "auxiliary_loss_clip": 0.06404669, + "auxiliary_loss_mlp": 0.01264386, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.01254677, + "epoch": 0.770659852698031, + "flos": 17828527334400.0, + "grad_norm": 3.0609511184199523, + "language_loss": 0.61409748, + "learning_rate": 5.267140509123957e-07, + "loss": 0.69078803, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0970459, + "step": 12818, + "time_per_iteration": 2.487680673599243 + }, + { + "auxiliary_loss_clip": 0.06399722, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.062704, + "balance_loss_mlp": 0.01253603, + "epoch": 0.770719975950699, + "flos": 21878469452160.0, + "grad_norm": 1.7396688274909713, + "language_loss": 0.67373377, + "learning_rate": 5.264506929848093e-07, + "loss": 0.75035375, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08676147, + "step": 12819, + "time_per_iteration": 3.9379172325134277 + }, + { + "auxiliary_loss_clip": 0.06406172, + "auxiliary_loss_mlp": 0.01263778, + "balance_loss_clip": 0.06271698, + "balance_loss_mlp": 0.01253848, + "epoch": 0.7707800992033669, + "flos": 21331519925760.0, + "grad_norm": 1.7217491542401215, + "language_loss": 0.57604039, + "learning_rate": 5.261873909343608e-07, + "loss": 0.65273988, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09924316, + "step": 12820, + "time_per_iteration": 2.495925188064575 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01262498, + "balance_loss_clip": 0.06269978, + "balance_loss_mlp": 0.01252735, + "epoch": 0.7708402224560349, + "flos": 28186215361920.0, + "grad_norm": 1.643911762743471, + "language_loss": 0.81179225, + "learning_rate": 5.259241447710343e-07, + "loss": 0.88846403, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09771729, + "step": 12821, + "time_per_iteration": 3.986278772354126 + }, + { + "auxiliary_loss_clip": 0.06404622, + "auxiliary_loss_mlp": 0.012636, + "balance_loss_clip": 0.06271188, + "balance_loss_mlp": 0.01253521, + "epoch": 0.7709003457087028, + "flos": 15382397491200.0, + "grad_norm": 1.8555601189743978, + "language_loss": 0.68379205, + "learning_rate": 5.256609545048114e-07, + "loss": 0.76047421, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10076904, + "step": 12822, + "time_per_iteration": 2.4856462478637695 + }, + { + "auxiliary_loss_clip": 0.06400201, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01256786, + "epoch": 0.7709604689613708, + "flos": 30628697552640.0, + "grad_norm": 2.043450133419636, + "language_loss": 0.72353333, + "learning_rate": 5.253978201456733e-07, + "loss": 0.80019963, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09637451, + "step": 12823, + "time_per_iteration": 2.5663697719573975 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126507, + "balance_loss_clip": 0.06270947, + "balance_loss_mlp": 0.01254437, + "epoch": 0.7710205922140387, + "flos": 20307207288960.0, + "grad_norm": 1.6756825279286318, + "language_loss": 0.76604235, + "learning_rate": 5.251347417035969e-07, + "loss": 0.84277976, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10632324, + "step": 12824, + "time_per_iteration": 2.5135273933410645 + }, + { + "auxiliary_loss_clip": 0.0640358, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01255332, + "epoch": 0.7710807154667068, + "flos": 19649987389440.0, + "grad_norm": 2.8682033137355605, + "language_loss": 0.72291267, + "learning_rate": 5.248717191885592e-07, + "loss": 0.79959786, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0960083, + "step": 12825, + "time_per_iteration": 2.539870262145996 + }, + { + "auxiliary_loss_clip": 0.06397466, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01257602, + "epoch": 0.7711408387193747, + "flos": 20011713465600.0, + "grad_norm": 1.348856880561093, + "language_loss": 0.73990041, + "learning_rate": 5.246087526105343e-07, + "loss": 0.8165428, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0916748, + "step": 12826, + "time_per_iteration": 3.9455349445343018 + }, + { + "auxiliary_loss_clip": 0.06404951, + "auxiliary_loss_mlp": 0.012643, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253554, + "epoch": 0.7712009619720427, + "flos": 24977794199040.0, + "grad_norm": 1.495331253862981, + "language_loss": 0.81176156, + "learning_rate": 5.243458419794933e-07, + "loss": 0.88845408, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10748291, + "step": 12827, + "time_per_iteration": 2.5489249229431152 + }, + { + "auxiliary_loss_clip": 0.0631479, + "auxiliary_loss_mlp": 0.01256103, + "balance_loss_clip": 0.06259546, + "balance_loss_mlp": 0.01255053, + "epoch": 0.7712610852247107, + "flos": 63269682105600.0, + "grad_norm": 0.8475476558719117, + "language_loss": 0.55242074, + "learning_rate": 5.240829873054051e-07, + "loss": 0.6281296, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01051331, + "step": 12828, + "time_per_iteration": 3.2874319553375244 + }, + { + "auxiliary_loss_clip": 0.06395887, + "auxiliary_loss_mlp": 0.01264145, + "balance_loss_clip": 0.06267989, + "balance_loss_mlp": 0.01255317, + "epoch": 0.7713212084773786, + "flos": 18703856211840.0, + "grad_norm": 1.6628752588878346, + "language_loss": 0.69472146, + "learning_rate": 5.23820188598238e-07, + "loss": 0.77132177, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08825684, + "step": 12829, + "time_per_iteration": 2.5006113052368164 + }, + { + "auxiliary_loss_clip": 0.06407359, + "auxiliary_loss_mlp": 0.01263662, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7713813317300466, + "flos": 14178563481600.0, + "grad_norm": 2.5004318889819146, + "language_loss": 0.79485464, + "learning_rate": 5.235574458679579e-07, + "loss": 0.87156487, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10235596, + "step": 12830, + "time_per_iteration": 2.455521821975708 + }, + { + "auxiliary_loss_clip": 0.06408571, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 0.06271582, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7714414549827145, + "flos": 25711266913920.0, + "grad_norm": 1.5558349458942582, + "language_loss": 0.78193223, + "learning_rate": 5.232947591245269e-07, + "loss": 0.85867554, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10906982, + "step": 12831, + "time_per_iteration": 2.55888295173645 + }, + { + "auxiliary_loss_clip": 0.06400928, + "auxiliary_loss_mlp": 0.01266262, + "balance_loss_clip": 0.06268953, + "balance_loss_mlp": 0.01256547, + "epoch": 0.7715015782353826, + "flos": 30563219986560.0, + "grad_norm": 1.4404933685883998, + "language_loss": 0.61150742, + "learning_rate": 5.230321283779071e-07, + "loss": 0.68817931, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0970459, + "step": 12832, + "time_per_iteration": 2.5705411434173584 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01268082, + "balance_loss_clip": 0.06271287, + "balance_loss_mlp": 0.01258271, + "epoch": 0.7715617014880505, + "flos": 20235440666880.0, + "grad_norm": 1.4904530814793735, + "language_loss": 0.79785657, + "learning_rate": 5.227695536380572e-07, + "loss": 0.87462032, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09814453, + "step": 12833, + "time_per_iteration": 2.5475685596466064 + }, + { + "auxiliary_loss_clip": 0.06315958, + "auxiliary_loss_mlp": 0.01251107, + "balance_loss_clip": 0.06260836, + "balance_loss_mlp": 0.01250079, + "epoch": 0.7716218247407185, + "flos": 63681037326720.0, + "grad_norm": 0.8315874052432679, + "language_loss": 0.55088067, + "learning_rate": 5.22507034914933e-07, + "loss": 0.62655127, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01027679, + "step": 12834, + "time_per_iteration": 3.1191012859344482 + }, + { + "auxiliary_loss_clip": 0.0640831, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06273386, + "balance_loss_mlp": 0.01254294, + "epoch": 0.7716819479933864, + "flos": 19797881045760.0, + "grad_norm": 2.410723884633937, + "language_loss": 0.73350394, + "learning_rate": 5.222445722184903e-07, + "loss": 0.81023002, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09997559, + "step": 12835, + "time_per_iteration": 2.5506582260131836 + }, + { + "auxiliary_loss_clip": 0.06406028, + "auxiliary_loss_mlp": 0.01267171, + "balance_loss_clip": 0.06272173, + "balance_loss_mlp": 0.01257884, + "epoch": 0.7717420712460544, + "flos": 18448082023680.0, + "grad_norm": 2.0308771684786113, + "language_loss": 0.70508468, + "learning_rate": 5.219821655586814e-07, + "loss": 0.78181666, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09289551, + "step": 12836, + "time_per_iteration": 2.5232300758361816 + }, + { + "auxiliary_loss_clip": 0.06398998, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06270441, + "balance_loss_mlp": 0.01259222, + "epoch": 0.7718021944987223, + "flos": 35198238038400.0, + "grad_norm": 1.831037228573652, + "language_loss": 0.60367215, + "learning_rate": 5.217198149454575e-07, + "loss": 0.68034947, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09509277, + "step": 12837, + "time_per_iteration": 2.6591076850891113 + }, + { + "auxiliary_loss_clip": 0.06317183, + "auxiliary_loss_mlp": 0.01257562, + "balance_loss_clip": 0.0626177, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7718623177513904, + "flos": 67944503646720.0, + "grad_norm": 0.8462887217652507, + "language_loss": 0.55739456, + "learning_rate": 5.214575203887666e-07, + "loss": 0.63314199, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01081848, + "step": 12838, + "time_per_iteration": 3.0941390991210938 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01264762, + "balance_loss_clip": 0.06271369, + "balance_loss_mlp": 0.01255345, + "epoch": 0.7719224410040583, + "flos": 18586206679680.0, + "grad_norm": 2.2960724340178156, + "language_loss": 0.69924515, + "learning_rate": 5.211952818985538e-07, + "loss": 0.77591836, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09411621, + "step": 12839, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.06401128, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.0627085, + "balance_loss_mlp": 0.01253893, + "epoch": 0.7719825642567263, + "flos": 23082471169920.0, + "grad_norm": 1.724099382102015, + "language_loss": 0.79996341, + "learning_rate": 5.209330994847647e-07, + "loss": 0.87660646, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09277344, + "step": 12840, + "time_per_iteration": 2.494185447692871 + }, + { + "auxiliary_loss_clip": 0.0640455, + "auxiliary_loss_mlp": 0.01263769, + "balance_loss_clip": 0.06271051, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7720426875093943, + "flos": 20345249842560.0, + "grad_norm": 1.700648368789641, + "language_loss": 0.80246019, + "learning_rate": 5.206709731573402e-07, + "loss": 0.87914336, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09564209, + "step": 12841, + "time_per_iteration": 2.4959654808044434 + }, + { + "auxiliary_loss_clip": 0.06402302, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01254261, + "epoch": 0.7721028107620622, + "flos": 23887878215040.0, + "grad_norm": 1.6460484096163284, + "language_loss": 0.76556861, + "learning_rate": 5.204089029262208e-07, + "loss": 0.84222806, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 12842, + "time_per_iteration": 2.5414130687713623 + }, + { + "auxiliary_loss_clip": 0.06408067, + "auxiliary_loss_mlp": 0.0126426, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7721629340147302, + "flos": 26658865537920.0, + "grad_norm": 1.6198153669730124, + "language_loss": 0.68824613, + "learning_rate": 5.201468888013445e-07, + "loss": 0.76496947, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09735107, + "step": 12843, + "time_per_iteration": 2.555246353149414 + }, + { + "auxiliary_loss_clip": 0.06407151, + "auxiliary_loss_mlp": 0.01263842, + "balance_loss_clip": 0.06270268, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7722230572673981, + "flos": 21185261424000.0, + "grad_norm": 1.9549573678277232, + "language_loss": 0.73833585, + "learning_rate": 5.198849307926465e-07, + "loss": 0.81504577, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09472656, + "step": 12844, + "time_per_iteration": 2.475722312927246 + }, + { + "auxiliary_loss_clip": 0.06400653, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01253327, + "epoch": 0.7722831805200662, + "flos": 27972089452800.0, + "grad_norm": 1.4105737815374062, + "language_loss": 0.71880949, + "learning_rate": 5.196230289100596e-07, + "loss": 0.79544067, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09143066, + "step": 12845, + "time_per_iteration": 2.537477493286133 + }, + { + "auxiliary_loss_clip": 0.06397612, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06268354, + "balance_loss_mlp": 0.01257095, + "epoch": 0.7723433037727341, + "flos": 33884049801600.0, + "grad_norm": 1.693366944822723, + "language_loss": 0.64408147, + "learning_rate": 5.193611831635159e-07, + "loss": 0.72071993, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 12846, + "time_per_iteration": 2.5818498134613037 + }, + { + "auxiliary_loss_clip": 0.06312131, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06256564, + "balance_loss_mlp": 0.01252078, + "epoch": 0.7724034270254021, + "flos": 62868194467200.0, + "grad_norm": 0.7376748551210195, + "language_loss": 0.61336023, + "learning_rate": 5.19099393562945e-07, + "loss": 0.68901265, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01026917, + "step": 12847, + "time_per_iteration": 3.0541763305664062 + }, + { + "auxiliary_loss_clip": 0.06401889, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06268549, + "balance_loss_mlp": 0.01254983, + "epoch": 0.77246355027807, + "flos": 23302299156480.0, + "grad_norm": 1.5812634929817273, + "language_loss": 0.79369843, + "learning_rate": 5.188376601182732e-07, + "loss": 0.8703624, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09527588, + "step": 12848, + "time_per_iteration": 3.9165518283843994 + }, + { + "auxiliary_loss_clip": 0.06404726, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06268495, + "balance_loss_mlp": 0.01257086, + "epoch": 0.772523673530738, + "flos": 20127602062080.0, + "grad_norm": 1.566706530012109, + "language_loss": 0.73342961, + "learning_rate": 5.185759828394261e-07, + "loss": 0.81014597, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.0982666, + "step": 12849, + "time_per_iteration": 2.476515293121338 + }, + { + "auxiliary_loss_clip": 0.06402398, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06268849, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7725837967834059, + "flos": 17825592441600.0, + "grad_norm": 2.2364064713439156, + "language_loss": 0.78424966, + "learning_rate": 5.183143617363261e-07, + "loss": 0.86091167, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 12850, + "time_per_iteration": 2.4794983863830566 + }, + { + "auxiliary_loss_clip": 0.0640396, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06267555, + "balance_loss_mlp": 0.01256616, + "epoch": 0.772643920036074, + "flos": 27206318188800.0, + "grad_norm": 1.5059914394205691, + "language_loss": 0.80266678, + "learning_rate": 5.180527968188935e-07, + "loss": 0.87937486, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 12851, + "time_per_iteration": 2.5322558879852295 + }, + { + "auxiliary_loss_clip": 0.06400898, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270088, + "balance_loss_mlp": 0.01253193, + "epoch": 0.7727040432887419, + "flos": 21585868594560.0, + "grad_norm": 1.7096231270301345, + "language_loss": 0.73980415, + "learning_rate": 5.177912880970474e-07, + "loss": 0.81644481, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09973145, + "step": 12852, + "time_per_iteration": 2.5234642028808594 + }, + { + "auxiliary_loss_clip": 0.06399091, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01255685, + "epoch": 0.7727641665414099, + "flos": 22243172348160.0, + "grad_norm": 1.8458923236919589, + "language_loss": 0.82645077, + "learning_rate": 5.17529835580704e-07, + "loss": 0.90309083, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09222412, + "step": 12853, + "time_per_iteration": 2.4855525493621826 + }, + { + "auxiliary_loss_clip": 0.06312872, + "auxiliary_loss_mlp": 0.01252237, + "balance_loss_clip": 0.06257433, + "balance_loss_mlp": 0.01251258, + "epoch": 0.7728242897940779, + "flos": 54852613038720.0, + "grad_norm": 0.7809207037354382, + "language_loss": 0.54245615, + "learning_rate": 5.172684392797786e-07, + "loss": 0.6181072, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00978088, + "step": 12854, + "time_per_iteration": 3.1956636905670166 + }, + { + "auxiliary_loss_clip": 0.06408576, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06272317, + "balance_loss_mlp": 0.01255667, + "epoch": 0.7728844130467458, + "flos": 34470970525440.0, + "grad_norm": 1.470895080979425, + "language_loss": 0.7210083, + "learning_rate": 5.170070992041826e-07, + "loss": 0.7977525, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10168457, + "step": 12855, + "time_per_iteration": 2.6422533988952637 + }, + { + "auxiliary_loss_clip": 0.0640472, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271958, + "balance_loss_mlp": 0.01256059, + "epoch": 0.7729445362994138, + "flos": 18922300606080.0, + "grad_norm": 1.643707808983738, + "language_loss": 0.68152243, + "learning_rate": 5.167458153638254e-07, + "loss": 0.75822645, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09619141, + "step": 12856, + "time_per_iteration": 2.581195592880249 + }, + { + "auxiliary_loss_clip": 0.06403085, + "auxiliary_loss_mlp": 0.01263682, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01254241, + "epoch": 0.7730046595520818, + "flos": 22206555313920.0, + "grad_norm": 2.739925215135401, + "language_loss": 0.7896111, + "learning_rate": 5.164845877686162e-07, + "loss": 0.86627877, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09442139, + "step": 12857, + "time_per_iteration": 2.536677360534668 + }, + { + "auxiliary_loss_clip": 0.06400988, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06271593, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7730647828047498, + "flos": 13557289783680.0, + "grad_norm": 1.6864648119346977, + "language_loss": 0.7856096, + "learning_rate": 5.162234164284591e-07, + "loss": 0.86228359, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09954834, + "step": 12858, + "time_per_iteration": 3.9322428703308105 + }, + { + "auxiliary_loss_clip": 0.06406689, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256392, + "epoch": 0.7731249060574177, + "flos": 21981654155520.0, + "grad_norm": 1.7779455572777159, + "language_loss": 0.77746201, + "learning_rate": 5.159623013532591e-07, + "loss": 0.8541925, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09967041, + "step": 12859, + "time_per_iteration": 2.513849973678589 + }, + { + "auxiliary_loss_clip": 0.06403208, + "auxiliary_loss_mlp": 0.01261712, + "balance_loss_clip": 0.06273893, + "balance_loss_mlp": 0.01253284, + "epoch": 0.7731850293100857, + "flos": 22608462222720.0, + "grad_norm": 1.6555727720253302, + "language_loss": 0.67912078, + "learning_rate": 5.157012425529186e-07, + "loss": 0.75576997, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08428955, + "step": 12860, + "time_per_iteration": 4.005707740783691 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01255449, + "epoch": 0.7732451525627536, + "flos": 14103274988160.0, + "grad_norm": 2.651215964660107, + "language_loss": 0.75251514, + "learning_rate": 5.154402400373343e-07, + "loss": 0.82924837, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10198975, + "step": 12861, + "time_per_iteration": 2.444032907485962 + }, + { + "auxiliary_loss_clip": 0.06406768, + "auxiliary_loss_mlp": 0.01262473, + "balance_loss_clip": 0.06270678, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7733052758154216, + "flos": 21476352908160.0, + "grad_norm": 3.091257297697316, + "language_loss": 0.75125277, + "learning_rate": 5.15179293816405e-07, + "loss": 0.82794511, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10137939, + "step": 12862, + "time_per_iteration": 2.5575408935546875 + }, + { + "auxiliary_loss_clip": 0.06400394, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06270863, + "balance_loss_mlp": 0.01255552, + "epoch": 0.7733653990680895, + "flos": 21400142019840.0, + "grad_norm": 1.5224536718195483, + "language_loss": 0.83015412, + "learning_rate": 5.149184039000256e-07, + "loss": 0.90680391, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09039307, + "step": 12863, + "time_per_iteration": 2.500004529953003 + }, + { + "auxiliary_loss_clip": 0.06403436, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.06272671, + "balance_loss_mlp": 0.01257172, + "epoch": 0.7734255223207576, + "flos": 17681849562240.0, + "grad_norm": 1.666044209334627, + "language_loss": 0.73906845, + "learning_rate": 5.146575702980898e-07, + "loss": 0.81577015, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09558105, + "step": 12864, + "time_per_iteration": 2.502202272415161 + }, + { + "auxiliary_loss_clip": 0.06405224, + "auxiliary_loss_mlp": 0.01262028, + "balance_loss_clip": 0.06273071, + "balance_loss_mlp": 0.01253117, + "epoch": 0.7734856455734255, + "flos": 25238264215680.0, + "grad_norm": 1.8553120895059094, + "language_loss": 0.82274187, + "learning_rate": 5.143967930204871e-07, + "loss": 0.89941442, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08911133, + "step": 12865, + "time_per_iteration": 2.5821845531463623 + }, + { + "auxiliary_loss_clip": 0.0640586, + "auxiliary_loss_mlp": 0.0126401, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01253627, + "epoch": 0.7735457688260935, + "flos": 23438579022720.0, + "grad_norm": 2.0985789262446763, + "language_loss": 0.71729589, + "learning_rate": 5.141360720771077e-07, + "loss": 0.79399455, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10375977, + "step": 12866, + "time_per_iteration": 3.9061973094940186 + }, + { + "auxiliary_loss_clip": 0.06406082, + "auxiliary_loss_mlp": 0.01266662, + "balance_loss_clip": 0.06272133, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7736058920787615, + "flos": 18734393825280.0, + "grad_norm": 2.2008061294183046, + "language_loss": 0.64883512, + "learning_rate": 5.138754074778371e-07, + "loss": 0.72556257, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.1036377, + "step": 12867, + "time_per_iteration": 2.438513994216919 + }, + { + "auxiliary_loss_clip": 0.06398055, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7736660153314294, + "flos": 22899931050240.0, + "grad_norm": 1.3982915625107966, + "language_loss": 0.71222079, + "learning_rate": 5.136147992325595e-07, + "loss": 0.7888447, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09442139, + "step": 12868, + "time_per_iteration": 2.521263599395752 + }, + { + "auxiliary_loss_clip": 0.06407171, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01253252, + "epoch": 0.7737261385840974, + "flos": 13804762417920.0, + "grad_norm": 1.9680842128147285, + "language_loss": 0.78157473, + "learning_rate": 5.133542473511578e-07, + "loss": 0.85827935, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10046387, + "step": 12869, + "time_per_iteration": 2.4751439094543457 + }, + { + "auxiliary_loss_clip": 0.06399751, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06270332, + "balance_loss_mlp": 0.0125536, + "epoch": 0.7737862618367654, + "flos": 28738279987200.0, + "grad_norm": 1.45372997777974, + "language_loss": 0.73862869, + "learning_rate": 5.130937518435124e-07, + "loss": 0.81527412, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09429932, + "step": 12870, + "time_per_iteration": 2.568042278289795 + }, + { + "auxiliary_loss_clip": 0.06404359, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.01257102, + "epoch": 0.7738463850894334, + "flos": 17024126538240.0, + "grad_norm": 1.914928650569768, + "language_loss": 0.75650132, + "learning_rate": 5.12833312719501e-07, + "loss": 0.83321428, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09851074, + "step": 12871, + "time_per_iteration": 2.4711315631866455 + }, + { + "auxiliary_loss_clip": 0.06402566, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06271693, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7739065083421013, + "flos": 20710246227840.0, + "grad_norm": 1.4478463877402143, + "language_loss": 0.69638461, + "learning_rate": 5.12572929988999e-07, + "loss": 0.77304864, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09417725, + "step": 12872, + "time_per_iteration": 2.520254135131836 + }, + { + "auxiliary_loss_clip": 0.06404334, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.0627078, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7739666315947693, + "flos": 20702322017280.0, + "grad_norm": 2.162643360462714, + "language_loss": 0.8514446, + "learning_rate": 5.123126036618804e-07, + "loss": 0.92813456, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10479736, + "step": 12873, + "time_per_iteration": 2.5746922492980957 + }, + { + "auxiliary_loss_clip": 0.06405018, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.0125612, + "epoch": 0.7740267548474372, + "flos": 29578501203840.0, + "grad_norm": 2.074777829849384, + "language_loss": 0.66097724, + "learning_rate": 5.120523337480174e-07, + "loss": 0.73768181, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09313965, + "step": 12874, + "time_per_iteration": 2.5801379680633545 + }, + { + "auxiliary_loss_clip": 0.06399316, + "auxiliary_loss_mlp": 0.01262488, + "balance_loss_clip": 0.06268813, + "balance_loss_mlp": 0.01253166, + "epoch": 0.7740868781001052, + "flos": 23665786168320.0, + "grad_norm": 1.7962266070608972, + "language_loss": 0.62437928, + "learning_rate": 5.117921202572785e-07, + "loss": 0.70099723, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09313965, + "step": 12875, + "time_per_iteration": 2.5030999183654785 + }, + { + "auxiliary_loss_clip": 0.06404817, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.0125264, + "epoch": 0.7741470013527731, + "flos": 24724200216960.0, + "grad_norm": 1.663352661776614, + "language_loss": 0.65509927, + "learning_rate": 5.115319631995318e-07, + "loss": 0.73177719, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10345459, + "step": 12876, + "time_per_iteration": 2.5258145332336426 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01266731, + "balance_loss_clip": 0.06269747, + "balance_loss_mlp": 0.01258005, + "epoch": 0.7742071246054412, + "flos": 21878092108800.0, + "grad_norm": 1.7333890551620577, + "language_loss": 0.71176594, + "learning_rate": 5.112718625846433e-07, + "loss": 0.78843695, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08734131, + "step": 12877, + "time_per_iteration": 2.4929704666137695 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264403, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254371, + "epoch": 0.7742672478581091, + "flos": 22680815823360.0, + "grad_norm": 1.9764136329910882, + "language_loss": 0.82948673, + "learning_rate": 5.110118184224736e-07, + "loss": 0.90620828, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1003418, + "step": 12878, + "time_per_iteration": 2.502988338470459 + }, + { + "auxiliary_loss_clip": 0.06402762, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125531, + "epoch": 0.7743273711107771, + "flos": 18846425134080.0, + "grad_norm": 1.6763538175981627, + "language_loss": 0.73367083, + "learning_rate": 5.10751830722885e-07, + "loss": 0.81035012, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09857178, + "step": 12879, + "time_per_iteration": 2.4705021381378174 + }, + { + "auxiliary_loss_clip": 0.06397247, + "auxiliary_loss_mlp": 0.01265601, + "balance_loss_clip": 0.06268625, + "balance_loss_mlp": 0.01256219, + "epoch": 0.7743874943634451, + "flos": 28736644832640.0, + "grad_norm": 1.5623883440546136, + "language_loss": 0.79838526, + "learning_rate": 5.104918994957364e-07, + "loss": 0.87501371, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09381104, + "step": 12880, + "time_per_iteration": 2.556452989578247 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06267609, + "balance_loss_mlp": 0.01255899, + "epoch": 0.774447617616113, + "flos": 21916344297600.0, + "grad_norm": 1.366667718096845, + "language_loss": 0.70864272, + "learning_rate": 5.102320247508847e-07, + "loss": 0.78529441, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10375977, + "step": 12881, + "time_per_iteration": 2.521993637084961 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01270141, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01258512, + "epoch": 0.774507740868781, + "flos": 19506789561600.0, + "grad_norm": 2.127818654803154, + "language_loss": 0.84771377, + "learning_rate": 5.099722064981832e-07, + "loss": 0.92450231, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11627197, + "step": 12882, + "time_per_iteration": 2.5355141162872314 + }, + { + "auxiliary_loss_clip": 0.06311849, + "auxiliary_loss_mlp": 0.01254336, + "balance_loss_clip": 0.06256157, + "balance_loss_mlp": 0.01253313, + "epoch": 0.774567864121449, + "flos": 59447240622720.0, + "grad_norm": 0.7584667410578986, + "language_loss": 0.60187125, + "learning_rate": 5.097124447474858e-07, + "loss": 0.67753309, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01023102, + "step": 12883, + "time_per_iteration": 3.124359607696533 + }, + { + "auxiliary_loss_clip": 0.06403667, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01255073, + "epoch": 0.774627987374117, + "flos": 13230461733120.0, + "grad_norm": 1.8439274810077488, + "language_loss": 0.72904599, + "learning_rate": 5.094527395086416e-07, + "loss": 0.80574125, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10778809, + "step": 12884, + "time_per_iteration": 2.4965550899505615 + }, + { + "auxiliary_loss_clip": 0.06399918, + "auxiliary_loss_mlp": 0.01266004, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01257301, + "epoch": 0.7746881106267849, + "flos": 21399848530560.0, + "grad_norm": 1.5524278185982343, + "language_loss": 0.81275487, + "learning_rate": 5.091930907914986e-07, + "loss": 0.88941407, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08703613, + "step": 12885, + "time_per_iteration": 2.557429075241089 + }, + { + "auxiliary_loss_clip": 0.06401367, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7747482338794529, + "flos": 25636355763840.0, + "grad_norm": 1.6694918727870636, + "language_loss": 0.63739854, + "learning_rate": 5.089334986059029e-07, + "loss": 0.71404386, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09088135, + "step": 12886, + "time_per_iteration": 2.5352628231048584 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01262726, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01254221, + "epoch": 0.7748083571321208, + "flos": 11551780235520.0, + "grad_norm": 2.0761314412195335, + "language_loss": 0.69713193, + "learning_rate": 5.086739629616987e-07, + "loss": 0.77381551, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.08508301, + "step": 12887, + "time_per_iteration": 3.896411657333374 + }, + { + "auxiliary_loss_clip": 0.06400104, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.0626978, + "balance_loss_mlp": 0.01256036, + "epoch": 0.7748684803847888, + "flos": 19068433326720.0, + "grad_norm": 1.724718840710913, + "language_loss": 0.70770532, + "learning_rate": 5.084144838687275e-07, + "loss": 0.78436053, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09381104, + "step": 12888, + "time_per_iteration": 2.5054144859313965 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01266857, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7749286036374567, + "flos": 22279705528320.0, + "grad_norm": 1.6247326651931444, + "language_loss": 0.8212378, + "learning_rate": 5.081550613368279e-07, + "loss": 0.89797544, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1048584, + "step": 12889, + "time_per_iteration": 2.503159999847412 + }, + { + "auxiliary_loss_clip": 0.0640256, + "auxiliary_loss_mlp": 0.01267254, + "balance_loss_clip": 0.0627083, + "balance_loss_mlp": 0.01258122, + "epoch": 0.7749887268901248, + "flos": 20198488216320.0, + "grad_norm": 1.8373652721061162, + "language_loss": 0.79928273, + "learning_rate": 5.07895695375838e-07, + "loss": 0.87598085, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09130859, + "step": 12890, + "time_per_iteration": 2.4615426063537598 + }, + { + "auxiliary_loss_clip": 0.06406836, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01260206, + "epoch": 0.7750488501427927, + "flos": 20343446979840.0, + "grad_norm": 1.6840660181274105, + "language_loss": 0.66623914, + "learning_rate": 5.076363859955932e-07, + "loss": 0.74301237, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1027832, + "step": 12891, + "time_per_iteration": 2.4890570640563965 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01255241, + "epoch": 0.7751089733954607, + "flos": 28371229176960.0, + "grad_norm": 1.3810973475198156, + "language_loss": 0.79341507, + "learning_rate": 5.073771332059257e-07, + "loss": 0.87011403, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09881592, + "step": 12892, + "time_per_iteration": 2.5426137447357178 + }, + { + "auxiliary_loss_clip": 0.06410879, + "auxiliary_loss_mlp": 0.01265811, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01255273, + "epoch": 0.7751690966481286, + "flos": 16949047680000.0, + "grad_norm": 1.9398212373821864, + "language_loss": 0.67894936, + "learning_rate": 5.071179370166669e-07, + "loss": 0.75571626, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10522461, + "step": 12893, + "time_per_iteration": 2.469115734100342 + }, + { + "auxiliary_loss_clip": 0.06313038, + "auxiliary_loss_mlp": 0.0125019, + "balance_loss_clip": 0.06257471, + "balance_loss_mlp": 0.01248948, + "epoch": 0.7752292199007966, + "flos": 65690179799040.0, + "grad_norm": 0.7899277487406899, + "language_loss": 0.58551872, + "learning_rate": 5.068587974376468e-07, + "loss": 0.66115099, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01241302, + "step": 12894, + "time_per_iteration": 3.1802139282226562 + }, + { + "auxiliary_loss_clip": 0.06405281, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7752893431534646, + "flos": 20600898249600.0, + "grad_norm": 2.1408661734068697, + "language_loss": 0.78008652, + "learning_rate": 5.065997144786895e-07, + "loss": 0.85679233, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10662842, + "step": 12895, + "time_per_iteration": 2.517387866973877 + }, + { + "auxiliary_loss_clip": 0.06404513, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06271935, + "balance_loss_mlp": 0.01255124, + "epoch": 0.7753494664061326, + "flos": 20491592198400.0, + "grad_norm": 1.7101210231802921, + "language_loss": 0.67742205, + "learning_rate": 5.063406881496209e-07, + "loss": 0.75411844, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10003662, + "step": 12896, + "time_per_iteration": 2.508040428161621 + }, + { + "auxiliary_loss_clip": 0.06401385, + "auxiliary_loss_mlp": 0.01264283, + "balance_loss_clip": 0.06268774, + "balance_loss_mlp": 0.01254717, + "epoch": 0.7754095896588006, + "flos": 20272015774080.0, + "grad_norm": 1.718290101877412, + "language_loss": 0.68828535, + "learning_rate": 5.060817184602629e-07, + "loss": 0.76494199, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09570312, + "step": 12897, + "time_per_iteration": 3.958052158355713 + }, + { + "auxiliary_loss_clip": 0.06406542, + "auxiliary_loss_mlp": 0.01265206, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01255074, + "epoch": 0.7754697129114685, + "flos": 23337784160640.0, + "grad_norm": 1.8777545444749013, + "language_loss": 0.75346845, + "learning_rate": 5.058228054204364e-07, + "loss": 0.83018595, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10131836, + "step": 12898, + "time_per_iteration": 2.548725128173828 + }, + { + "auxiliary_loss_clip": 0.06405295, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253231, + "epoch": 0.7755298361641365, + "flos": 17353344430080.0, + "grad_norm": 2.11113178190308, + "language_loss": 0.70727742, + "learning_rate": 5.055639490399588e-07, + "loss": 0.78396714, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 12899, + "time_per_iteration": 2.4659245014190674 + }, + { + "auxiliary_loss_clip": 0.06405385, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7755899594168044, + "flos": 19651916033280.0, + "grad_norm": 2.07260093915493, + "language_loss": 0.74897844, + "learning_rate": 5.053051493286453e-07, + "loss": 0.82567799, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10302734, + "step": 12900, + "time_per_iteration": 4.011428117752075 + }, + { + "auxiliary_loss_clip": 0.06400472, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.01255525, + "epoch": 0.7756500826694724, + "flos": 27421324565760.0, + "grad_norm": 1.5623703239819655, + "language_loss": 0.77776372, + "learning_rate": 5.050464062963113e-07, + "loss": 0.85441595, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09234619, + "step": 12901, + "time_per_iteration": 2.551858425140381 + }, + { + "auxiliary_loss_clip": 0.0639973, + "auxiliary_loss_mlp": 0.0126504, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01255289, + "epoch": 0.7757102059221404, + "flos": 28738028424960.0, + "grad_norm": 1.3485417524175327, + "language_loss": 0.77421844, + "learning_rate": 5.047877199527666e-07, + "loss": 0.8508662, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09747314, + "step": 12902, + "time_per_iteration": 2.5616962909698486 + }, + { + "auxiliary_loss_clip": 0.06401799, + "auxiliary_loss_mlp": 0.01266411, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01256898, + "epoch": 0.7757703291748084, + "flos": 22492489772160.0, + "grad_norm": 1.8023361426905782, + "language_loss": 0.73515046, + "learning_rate": 5.045290903078215e-07, + "loss": 0.81183261, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09509277, + "step": 12903, + "time_per_iteration": 2.5368919372558594 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01263703, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01253851, + "epoch": 0.7758304524274763, + "flos": 21435920513280.0, + "grad_norm": 2.3012880989025946, + "language_loss": 0.75830078, + "learning_rate": 5.042705173712835e-07, + "loss": 0.83494151, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09863281, + "step": 12904, + "time_per_iteration": 2.476417064666748 + }, + { + "auxiliary_loss_clip": 0.06397906, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06269727, + "balance_loss_mlp": 0.01256093, + "epoch": 0.7758905756801443, + "flos": 23665953876480.0, + "grad_norm": 1.8947972098454593, + "language_loss": 0.68449861, + "learning_rate": 5.040120011529576e-07, + "loss": 0.76112515, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08654785, + "step": 12905, + "time_per_iteration": 3.922461748123169 + }, + { + "auxiliary_loss_clip": 0.06398395, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06270361, + "balance_loss_mlp": 0.0125736, + "epoch": 0.7759506989328122, + "flos": 28372906258560.0, + "grad_norm": 1.53682543204514, + "language_loss": 0.67685688, + "learning_rate": 5.037535416626459e-07, + "loss": 0.75350916, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0947876, + "step": 12906, + "time_per_iteration": 2.5313022136688232 + }, + { + "auxiliary_loss_clip": 0.06400718, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06268603, + "balance_loss_mlp": 0.01257124, + "epoch": 0.7760108221854802, + "flos": 14908053127680.0, + "grad_norm": 2.1235046530395167, + "language_loss": 0.81742978, + "learning_rate": 5.034951389101498e-07, + "loss": 0.8941071, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09887695, + "step": 12907, + "time_per_iteration": 2.4844870567321777 + }, + { + "auxiliary_loss_clip": 0.06399026, + "auxiliary_loss_mlp": 0.01267683, + "balance_loss_clip": 0.06271745, + "balance_loss_mlp": 0.01258584, + "epoch": 0.7760709454381483, + "flos": 14797615046400.0, + "grad_norm": 2.0283728968783006, + "language_loss": 0.67200708, + "learning_rate": 5.032367929052685e-07, + "loss": 0.74867415, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09103394, + "step": 12908, + "time_per_iteration": 2.489652633666992 + }, + { + "auxiliary_loss_clip": 0.06403653, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01258017, + "epoch": 0.7761310686908162, + "flos": 17384846365440.0, + "grad_norm": 1.5208070969667713, + "language_loss": 0.70563579, + "learning_rate": 5.029785036577976e-07, + "loss": 0.78235209, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09954834, + "step": 12909, + "time_per_iteration": 2.484180450439453 + }, + { + "auxiliary_loss_clip": 0.06401674, + "auxiliary_loss_mlp": 0.01271334, + "balance_loss_clip": 0.06272651, + "balance_loss_mlp": 0.01262208, + "epoch": 0.7761911919434842, + "flos": 25563582892800.0, + "grad_norm": 1.6528787080895593, + "language_loss": 0.68030262, + "learning_rate": 5.027202711775324e-07, + "loss": 0.75703275, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09130859, + "step": 12910, + "time_per_iteration": 2.5219783782958984 + }, + { + "auxiliary_loss_clip": 0.06401049, + "auxiliary_loss_mlp": 0.01265939, + "balance_loss_clip": 0.06268351, + "balance_loss_mlp": 0.01256193, + "epoch": 0.7762513151961521, + "flos": 23185530092160.0, + "grad_norm": 1.572866205055694, + "language_loss": 0.7175374, + "learning_rate": 5.024620954742646e-07, + "loss": 0.79420727, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12911, + "time_per_iteration": 2.533684730529785 + }, + { + "auxiliary_loss_clip": 0.06403443, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06270085, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7763114384488201, + "flos": 21696097040640.0, + "grad_norm": 3.1287600736894867, + "language_loss": 0.63521278, + "learning_rate": 5.022039765577836e-07, + "loss": 0.71189916, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10546875, + "step": 12912, + "time_per_iteration": 2.4713103771209717 + }, + { + "auxiliary_loss_clip": 0.06310222, + "auxiliary_loss_mlp": 0.01256155, + "balance_loss_clip": 0.06254428, + "balance_loss_mlp": 0.012551, + "epoch": 0.776371561701488, + "flos": 69048381335040.0, + "grad_norm": 0.7692138307274686, + "language_loss": 0.53290647, + "learning_rate": 5.019459144378779e-07, + "loss": 0.60857022, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01056671, + "step": 12913, + "time_per_iteration": 3.1764438152313232 + }, + { + "auxiliary_loss_clip": 0.06402822, + "auxiliary_loss_mlp": 0.01263376, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01254495, + "epoch": 0.776431684954156, + "flos": 22900643809920.0, + "grad_norm": 1.5625942669092794, + "language_loss": 0.6230467, + "learning_rate": 5.016879091243338e-07, + "loss": 0.6997087, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.08880615, + "step": 12914, + "time_per_iteration": 2.534447193145752 + }, + { + "auxiliary_loss_clip": 0.06399079, + "auxiliary_loss_mlp": 0.012627, + "balance_loss_clip": 0.06268825, + "balance_loss_mlp": 0.01253259, + "epoch": 0.776491808206824, + "flos": 20266942602240.0, + "grad_norm": 1.633160981645456, + "language_loss": 0.82489586, + "learning_rate": 5.014299606269339e-07, + "loss": 0.9015137, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09436035, + "step": 12915, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01265286, + "balance_loss_clip": 0.06268285, + "balance_loss_mlp": 0.01255266, + "epoch": 0.776551931459492, + "flos": 26766033310080.0, + "grad_norm": 1.7528109604711235, + "language_loss": 0.74837983, + "learning_rate": 5.011720689554603e-07, + "loss": 0.82507014, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10021973, + "step": 12916, + "time_per_iteration": 2.5818369388580322 + }, + { + "auxiliary_loss_clip": 0.06402493, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06269188, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7766120547121599, + "flos": 52676583960960.0, + "grad_norm": 1.4770261011777261, + "language_loss": 0.65460002, + "learning_rate": 5.009142341196919e-07, + "loss": 0.73126698, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09924316, + "step": 12917, + "time_per_iteration": 2.776418924331665 + }, + { + "auxiliary_loss_clip": 0.06402885, + "auxiliary_loss_mlp": 0.01264757, + "balance_loss_clip": 0.06269239, + "balance_loss_mlp": 0.0125522, + "epoch": 0.7766721779648279, + "flos": 25163353065600.0, + "grad_norm": 1.489121757644636, + "language_loss": 0.6467213, + "learning_rate": 5.006564561294065e-07, + "loss": 0.72339773, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09533691, + "step": 12918, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06400011, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01256792, + "epoch": 0.7767323012174958, + "flos": 23766161760000.0, + "grad_norm": 2.1752593632817425, + "language_loss": 0.73467445, + "learning_rate": 5.003987349943777e-07, + "loss": 0.81133133, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08886719, + "step": 12919, + "time_per_iteration": 2.498762369155884 + }, + { + "auxiliary_loss_clip": 0.06403969, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06270312, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7767924244701638, + "flos": 22092469580160.0, + "grad_norm": 1.6453382869225388, + "language_loss": 0.79804212, + "learning_rate": 5.001410707243792e-07, + "loss": 0.87474561, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10113525, + "step": 12920, + "time_per_iteration": 2.5327045917510986 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06271371, + "balance_loss_mlp": 0.012561, + "epoch": 0.7768525477228319, + "flos": 21988194773760.0, + "grad_norm": 1.540123297700945, + "language_loss": 0.71420145, + "learning_rate": 4.998834633291829e-07, + "loss": 0.79092473, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09490967, + "step": 12921, + "time_per_iteration": 2.493539333343506 + }, + { + "auxiliary_loss_clip": 0.06407829, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06272625, + "balance_loss_mlp": 0.01258643, + "epoch": 0.7769126709754998, + "flos": 21800329920000.0, + "grad_norm": 1.5870112514861305, + "language_loss": 0.764503, + "learning_rate": 4.996259128185547e-07, + "loss": 0.8412689, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10113525, + "step": 12922, + "time_per_iteration": 2.664897918701172 + }, + { + "auxiliary_loss_clip": 0.06402089, + "auxiliary_loss_mlp": 0.01264843, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7769727942281678, + "flos": 20054242212480.0, + "grad_norm": 2.0384511748654286, + "language_loss": 0.80950773, + "learning_rate": 4.993684192022625e-07, + "loss": 0.88617706, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09991455, + "step": 12923, + "time_per_iteration": 2.4884073734283447 + }, + { + "auxiliary_loss_clip": 0.06402602, + "auxiliary_loss_mlp": 0.01263266, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7770329174808357, + "flos": 21692784804480.0, + "grad_norm": 1.8529148039982746, + "language_loss": 0.92405283, + "learning_rate": 4.991109824900699e-07, + "loss": 1.00071156, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09356689, + "step": 12924, + "time_per_iteration": 2.52184796333313 + }, + { + "auxiliary_loss_clip": 0.06402275, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01254804, + "epoch": 0.7770930407335037, + "flos": 25856477239680.0, + "grad_norm": 1.997586908265186, + "language_loss": 0.66484189, + "learning_rate": 4.988536026917401e-07, + "loss": 0.74150878, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09606934, + "step": 12925, + "time_per_iteration": 2.528657913208008 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06273882, + "balance_loss_mlp": 0.01261019, + "epoch": 0.7771531639861716, + "flos": 24353921024640.0, + "grad_norm": 1.7055491864849242, + "language_loss": 0.72285664, + "learning_rate": 4.985962798170314e-07, + "loss": 0.7996558, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 12926, + "time_per_iteration": 2.529508352279663 + }, + { + "auxiliary_loss_clip": 0.06404512, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06270072, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7772132872388396, + "flos": 25637068523520.0, + "grad_norm": 1.8006607912850339, + "language_loss": 0.65851128, + "learning_rate": 4.983390138757027e-07, + "loss": 0.73519599, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10046387, + "step": 12927, + "time_per_iteration": 3.9577128887176514 + }, + { + "auxiliary_loss_clip": 0.06403954, + "auxiliary_loss_mlp": 0.01268877, + "balance_loss_clip": 0.06270983, + "balance_loss_mlp": 0.01258607, + "epoch": 0.7772734104915076, + "flos": 26074544290560.0, + "grad_norm": 2.5615945281545147, + "language_loss": 0.72538382, + "learning_rate": 4.980818048775093e-07, + "loss": 0.8021121, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1026001, + "step": 12928, + "time_per_iteration": 2.524092197418213 + }, + { + "auxiliary_loss_clip": 0.06398363, + "auxiliary_loss_mlp": 0.0126847, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.0125935, + "epoch": 0.7773335337441756, + "flos": 22930887934080.0, + "grad_norm": 1.7899805445519197, + "language_loss": 0.74762726, + "learning_rate": 4.978246528322036e-07, + "loss": 0.82429558, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09118652, + "step": 12929, + "time_per_iteration": 2.50419282913208 + }, + { + "auxiliary_loss_clip": 0.06401908, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06269601, + "balance_loss_mlp": 0.01258871, + "epoch": 0.7773936569968435, + "flos": 20782977171840.0, + "grad_norm": 1.7754986557966836, + "language_loss": 0.77492833, + "learning_rate": 4.975675577495377e-07, + "loss": 0.85163409, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09796143, + "step": 12930, + "time_per_iteration": 2.5014841556549072 + }, + { + "auxiliary_loss_clip": 0.06403639, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.06271214, + "balance_loss_mlp": 0.01255291, + "epoch": 0.7774537802495115, + "flos": 20377883808000.0, + "grad_norm": 1.923217497642762, + "language_loss": 0.80022055, + "learning_rate": 4.973105196392613e-07, + "loss": 0.87690878, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09893799, + "step": 12931, + "time_per_iteration": 2.479499340057373 + }, + { + "auxiliary_loss_clip": 0.06306946, + "auxiliary_loss_mlp": 0.0125312, + "balance_loss_clip": 0.06251584, + "balance_loss_mlp": 0.01252035, + "epoch": 0.7775139035021794, + "flos": 53930981980800.0, + "grad_norm": 0.7888811218125162, + "language_loss": 0.59670961, + "learning_rate": 4.970535385111199e-07, + "loss": 0.67231035, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01087189, + "step": 12932, + "time_per_iteration": 3.131812810897827 + }, + { + "auxiliary_loss_clip": 0.06405772, + "auxiliary_loss_mlp": 0.01263803, + "balance_loss_clip": 0.06271382, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7775740267548474, + "flos": 28850437077120.0, + "grad_norm": 1.493641616196245, + "language_loss": 0.76082242, + "learning_rate": 4.967966143748595e-07, + "loss": 0.83751822, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09436035, + "step": 12933, + "time_per_iteration": 2.657081127166748 + }, + { + "auxiliary_loss_clip": 0.06403433, + "auxiliary_loss_mlp": 0.01262442, + "balance_loss_clip": 0.06271302, + "balance_loss_mlp": 0.01252077, + "epoch": 0.7776341500075155, + "flos": 21879056430720.0, + "grad_norm": 1.8678224067901799, + "language_loss": 0.73828089, + "learning_rate": 4.965397472402215e-07, + "loss": 0.81493968, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10369873, + "step": 12934, + "time_per_iteration": 2.514028549194336 + }, + { + "auxiliary_loss_clip": 0.06404053, + "auxiliary_loss_mlp": 0.01265488, + "balance_loss_clip": 0.06270254, + "balance_loss_mlp": 0.01255468, + "epoch": 0.7776942732601834, + "flos": 20236027645440.0, + "grad_norm": 1.899249869710296, + "language_loss": 0.70498896, + "learning_rate": 4.962829371169475e-07, + "loss": 0.78168434, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12935, + "time_per_iteration": 2.5094125270843506 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01256333, + "epoch": 0.7777543965128514, + "flos": 22237554124800.0, + "grad_norm": 1.4942918595564652, + "language_loss": 0.83564198, + "learning_rate": 4.960261840147746e-07, + "loss": 0.91237354, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09661865, + "step": 12936, + "time_per_iteration": 2.4796142578125 + }, + { + "auxiliary_loss_clip": 0.0640949, + "auxiliary_loss_mlp": 0.0126322, + "balance_loss_clip": 0.06271779, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7778145197655193, + "flos": 14507236321920.0, + "grad_norm": 1.7034390365737724, + "language_loss": 0.67389679, + "learning_rate": 4.957694879434397e-07, + "loss": 0.75062388, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09020996, + "step": 12937, + "time_per_iteration": 3.914120674133301 + }, + { + "auxiliary_loss_clip": 0.06402509, + "auxiliary_loss_mlp": 0.01264387, + "balance_loss_clip": 0.06269647, + "balance_loss_mlp": 0.01254928, + "epoch": 0.7778746430181873, + "flos": 21146338402560.0, + "grad_norm": 1.4641946456132704, + "language_loss": 0.87061489, + "learning_rate": 4.955128489126777e-07, + "loss": 0.94728386, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09460449, + "step": 12938, + "time_per_iteration": 2.494309663772583 + }, + { + "auxiliary_loss_clip": 0.06401877, + "auxiliary_loss_mlp": 0.01265878, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01255972, + "epoch": 0.7779347662708552, + "flos": 20272560825600.0, + "grad_norm": 1.9237142576123536, + "language_loss": 0.8554709, + "learning_rate": 4.95256266932218e-07, + "loss": 0.93214846, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09906006, + "step": 12939, + "time_per_iteration": 2.4730064868927 + }, + { + "auxiliary_loss_clip": 0.06398107, + "auxiliary_loss_mlp": 0.01265311, + "balance_loss_clip": 0.0626917, + "balance_loss_mlp": 0.01256084, + "epoch": 0.7779948895235232, + "flos": 19215153025920.0, + "grad_norm": 1.7540702962563577, + "language_loss": 0.69412231, + "learning_rate": 4.949997420117915e-07, + "loss": 0.77075648, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09228516, + "step": 12940, + "time_per_iteration": 3.918668270111084 + }, + { + "auxiliary_loss_clip": 0.064026, + "auxiliary_loss_mlp": 0.01265044, + "balance_loss_clip": 0.06269296, + "balance_loss_mlp": 0.01255627, + "epoch": 0.7780550127761912, + "flos": 23921476502400.0, + "grad_norm": 4.631352047296881, + "language_loss": 0.77788246, + "learning_rate": 4.947432741611255e-07, + "loss": 0.85455894, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09423828, + "step": 12941, + "time_per_iteration": 2.5110888481140137 + }, + { + "auxiliary_loss_clip": 0.06410088, + "auxiliary_loss_mlp": 0.01268786, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01257813, + "epoch": 0.7781151360288592, + "flos": 32424148114560.0, + "grad_norm": 2.2460397891674697, + "language_loss": 0.73285127, + "learning_rate": 4.944868633899462e-07, + "loss": 0.80964005, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10974121, + "step": 12942, + "time_per_iteration": 2.5817012786865234 + }, + { + "auxiliary_loss_clip": 0.06399062, + "auxiliary_loss_mlp": 0.01266209, + "balance_loss_clip": 0.06270151, + "balance_loss_mlp": 0.01257239, + "epoch": 0.7781752592815271, + "flos": 22352981523840.0, + "grad_norm": 1.9559350984473978, + "language_loss": 0.68287194, + "learning_rate": 4.942305097079751e-07, + "loss": 0.75952458, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08972168, + "step": 12943, + "time_per_iteration": 2.4933464527130127 + }, + { + "auxiliary_loss_clip": 0.06304064, + "auxiliary_loss_mlp": 0.01250725, + "balance_loss_clip": 0.06248597, + "balance_loss_mlp": 0.01249737, + "epoch": 0.7782353825341951, + "flos": 70479101802240.0, + "grad_norm": 0.7622073777913676, + "language_loss": 0.58524758, + "learning_rate": 4.939742131249347e-07, + "loss": 0.66079545, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00987244, + "step": 12944, + "time_per_iteration": 3.2943570613861084 + }, + { + "auxiliary_loss_clip": 0.0640593, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01255495, + "epoch": 0.778295505786863, + "flos": 19068601034880.0, + "grad_norm": 1.9954002249316443, + "language_loss": 0.68333346, + "learning_rate": 4.937179736505428e-07, + "loss": 0.76005256, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10491943, + "step": 12945, + "time_per_iteration": 3.963608741760254 + }, + { + "auxiliary_loss_clip": 0.06401619, + "auxiliary_loss_mlp": 0.01263231, + "balance_loss_clip": 0.06268932, + "balance_loss_mlp": 0.01253837, + "epoch": 0.778355629039531, + "flos": 21006662446080.0, + "grad_norm": 2.4482608319638404, + "language_loss": 0.69179362, + "learning_rate": 4.93461791294516e-07, + "loss": 0.76844209, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09387207, + "step": 12946, + "time_per_iteration": 2.528555393218994 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06268816, + "balance_loss_mlp": 0.01254328, + "epoch": 0.7784157522921991, + "flos": 21404586286080.0, + "grad_norm": 1.63285369155658, + "language_loss": 0.65319461, + "learning_rate": 4.932056660665689e-07, + "loss": 0.72985911, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09783936, + "step": 12947, + "time_per_iteration": 2.533308744430542 + }, + { + "auxiliary_loss_clip": 0.06402348, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.06270808, + "balance_loss_mlp": 0.01253499, + "epoch": 0.778475875544867, + "flos": 20820181184640.0, + "grad_norm": 1.87438794738079, + "language_loss": 0.65581381, + "learning_rate": 4.929495979764147e-07, + "loss": 0.73246646, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 12948, + "time_per_iteration": 2.5082039833068848 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01261972, + "balance_loss_clip": 0.06271663, + "balance_loss_mlp": 0.01252078, + "epoch": 0.778535998797535, + "flos": 14360516622720.0, + "grad_norm": 1.7911059027184133, + "language_loss": 0.75669527, + "learning_rate": 4.926935870337625e-07, + "loss": 0.83333564, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09893799, + "step": 12949, + "time_per_iteration": 2.499680519104004 + }, + { + "auxiliary_loss_clip": 0.06407519, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271057, + "balance_loss_mlp": 0.01255871, + "epoch": 0.7785961220502029, + "flos": 19215781931520.0, + "grad_norm": 1.2917746110021882, + "language_loss": 0.69081604, + "learning_rate": 4.924376332483202e-07, + "loss": 0.7675575, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 12950, + "time_per_iteration": 2.4793641567230225 + }, + { + "auxiliary_loss_clip": 0.06404532, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06268837, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7786562453028709, + "flos": 25745787596160.0, + "grad_norm": 1.5705407772733666, + "language_loss": 0.72314119, + "learning_rate": 4.921817366297938e-07, + "loss": 0.79984468, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09509277, + "step": 12951, + "time_per_iteration": 2.533123731613159 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01262496, + "balance_loss_clip": 0.06272363, + "balance_loss_mlp": 0.01252238, + "epoch": 0.7787163685555388, + "flos": 25746584209920.0, + "grad_norm": 1.6880059510178558, + "language_loss": 0.65866429, + "learning_rate": 4.919258971878877e-07, + "loss": 0.73532021, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1026001, + "step": 12952, + "time_per_iteration": 2.5218706130981445 + }, + { + "auxiliary_loss_clip": 0.06394114, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06268984, + "balance_loss_mlp": 0.01256032, + "epoch": 0.7787764918082068, + "flos": 22754385308160.0, + "grad_norm": 2.055033459437186, + "language_loss": 0.81612301, + "learning_rate": 4.916701149323022e-07, + "loss": 0.89271152, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08709717, + "step": 12953, + "time_per_iteration": 2.5306200981140137 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7788366150608748, + "flos": 15195538886400.0, + "grad_norm": 1.8925370756412514, + "language_loss": 0.76971662, + "learning_rate": 4.91414389872737e-07, + "loss": 0.8464663, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09960938, + "step": 12954, + "time_per_iteration": 2.4636683464050293 + }, + { + "auxiliary_loss_clip": 0.0640775, + "auxiliary_loss_mlp": 0.01263138, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01253369, + "epoch": 0.7788967383135428, + "flos": 21215799037440.0, + "grad_norm": 1.4850490788267763, + "language_loss": 0.7292642, + "learning_rate": 4.911587220188905e-07, + "loss": 0.80597305, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09765625, + "step": 12955, + "time_per_iteration": 2.4956090450286865 + }, + { + "auxiliary_loss_clip": 0.06403288, + "auxiliary_loss_mlp": 0.01263998, + "balance_loss_clip": 0.06270338, + "balance_loss_mlp": 0.01253973, + "epoch": 0.7789568615662107, + "flos": 21688340538240.0, + "grad_norm": 1.3614080537003919, + "language_loss": 0.68852103, + "learning_rate": 4.909031113804551e-07, + "loss": 0.76519388, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10021973, + "step": 12956, + "time_per_iteration": 2.5246806144714355 + }, + { + "auxiliary_loss_clip": 0.06403255, + "auxiliary_loss_mlp": 0.01262407, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01252864, + "epoch": 0.7790169848188787, + "flos": 26367732126720.0, + "grad_norm": 1.5408189512052117, + "language_loss": 0.7640478, + "learning_rate": 4.906475579671252e-07, + "loss": 0.84070438, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09539795, + "step": 12957, + "time_per_iteration": 2.560433864593506 + }, + { + "auxiliary_loss_clip": 0.06402086, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06269553, + "balance_loss_mlp": 0.01255407, + "epoch": 0.7790771080715466, + "flos": 25522563519360.0, + "grad_norm": 1.6277364892308188, + "language_loss": 0.77872479, + "learning_rate": 4.903920617885917e-07, + "loss": 0.85539794, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0982666, + "step": 12958, + "time_per_iteration": 2.5132603645324707 + }, + { + "auxiliary_loss_clip": 0.06403212, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01257995, + "epoch": 0.7791372313242146, + "flos": 16039701244800.0, + "grad_norm": 2.1750549436439295, + "language_loss": 0.71726602, + "learning_rate": 4.901366228545418e-07, + "loss": 0.79397893, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10076904, + "step": 12959, + "time_per_iteration": 2.4766464233398438 + }, + { + "auxiliary_loss_clip": 0.06403412, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06269655, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7791973545768827, + "flos": 23849039047680.0, + "grad_norm": 1.6457903967738072, + "language_loss": 0.77779013, + "learning_rate": 4.898812411746632e-07, + "loss": 0.8544842, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09783936, + "step": 12960, + "time_per_iteration": 2.5057005882263184 + }, + { + "auxiliary_loss_clip": 0.06403294, + "auxiliary_loss_mlp": 0.01269347, + "balance_loss_clip": 0.06269927, + "balance_loss_mlp": 0.0125934, + "epoch": 0.7792574778295506, + "flos": 24174902776320.0, + "grad_norm": 1.862849792327091, + "language_loss": 0.75439703, + "learning_rate": 4.896259167586385e-07, + "loss": 0.83112347, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10003662, + "step": 12961, + "time_per_iteration": 2.523517608642578 + }, + { + "auxiliary_loss_clip": 0.06400951, + "auxiliary_loss_mlp": 0.01266276, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7793176010822186, + "flos": 21470399268480.0, + "grad_norm": 1.5483353660342332, + "language_loss": 0.73957908, + "learning_rate": 4.893706496161511e-07, + "loss": 0.81625128, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08679199, + "step": 12962, + "time_per_iteration": 2.498566150665283 + }, + { + "auxiliary_loss_clip": 0.06398464, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06269018, + "balance_loss_mlp": 0.01255012, + "epoch": 0.7793777243348865, + "flos": 20672790652800.0, + "grad_norm": 1.8192572691514057, + "language_loss": 0.70224059, + "learning_rate": 4.891154397568795e-07, + "loss": 0.77886856, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09326172, + "step": 12963, + "time_per_iteration": 2.507917881011963 + }, + { + "auxiliary_loss_clip": 0.06401575, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01254805, + "epoch": 0.7794378475875545, + "flos": 27133126047360.0, + "grad_norm": 1.5815995663676223, + "language_loss": 0.63879544, + "learning_rate": 4.888602871905019e-07, + "loss": 0.71545374, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09460449, + "step": 12964, + "time_per_iteration": 2.52024245262146 + }, + { + "auxiliary_loss_clip": 0.06404367, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254622, + "epoch": 0.7794979708402224, + "flos": 28081605139200.0, + "grad_norm": 1.6072168370659738, + "language_loss": 0.76559496, + "learning_rate": 4.88605191926694e-07, + "loss": 0.84228694, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12965, + "time_per_iteration": 2.5686237812042236 + }, + { + "auxiliary_loss_clip": 0.06394182, + "auxiliary_loss_mlp": 0.01263131, + "balance_loss_clip": 0.06269042, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7795580940928905, + "flos": 26876722953600.0, + "grad_norm": 1.5862680415926609, + "language_loss": 0.72998363, + "learning_rate": 4.883501539751289e-07, + "loss": 0.80655676, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08703613, + "step": 12966, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.06398065, + "auxiliary_loss_mlp": 0.01262043, + "balance_loss_clip": 0.06270934, + "balance_loss_mlp": 0.01253323, + "epoch": 0.7796182173455584, + "flos": 23841072910080.0, + "grad_norm": 1.47410798363511, + "language_loss": 0.74184883, + "learning_rate": 4.880951733454768e-07, + "loss": 0.81844991, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08721924, + "step": 12967, + "time_per_iteration": 3.9195239543914795 + }, + { + "auxiliary_loss_clip": 0.06406528, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253462, + "epoch": 0.7796783405982264, + "flos": 19798384170240.0, + "grad_norm": 2.482748311118984, + "language_loss": 0.72366989, + "learning_rate": 4.878402500474073e-07, + "loss": 0.80036128, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09155273, + "step": 12968, + "time_per_iteration": 2.5332348346710205 + }, + { + "auxiliary_loss_clip": 0.06398027, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.0626802, + "balance_loss_mlp": 0.01259249, + "epoch": 0.7797384638508943, + "flos": 15455589632640.0, + "grad_norm": 1.8161833543427846, + "language_loss": 0.61633801, + "learning_rate": 4.875853840905874e-07, + "loss": 0.69300812, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09735107, + "step": 12969, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.06398109, + "auxiliary_loss_mlp": 0.0126421, + "balance_loss_clip": 0.06271819, + "balance_loss_mlp": 0.01255651, + "epoch": 0.7797985871035623, + "flos": 20928984111360.0, + "grad_norm": 1.617507688823146, + "language_loss": 0.70254469, + "learning_rate": 4.873305754846811e-07, + "loss": 0.77916789, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08563232, + "step": 12970, + "time_per_iteration": 2.510071039199829 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01266712, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01256901, + "epoch": 0.7798587103562302, + "flos": 36945667411200.0, + "grad_norm": 1.5338115729729769, + "language_loss": 0.72291183, + "learning_rate": 4.870758242393507e-07, + "loss": 0.79961598, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09814453, + "step": 12971, + "time_per_iteration": 2.654513359069824 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01266468, + "balance_loss_clip": 0.06272395, + "balance_loss_mlp": 0.01256174, + "epoch": 0.7799188336088982, + "flos": 22425880176000.0, + "grad_norm": 1.7218916493252936, + "language_loss": 0.74606651, + "learning_rate": 4.868211303642578e-07, + "loss": 0.82283497, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10290527, + "step": 12972, + "time_per_iteration": 2.517273187637329 + }, + { + "auxiliary_loss_clip": 0.06402341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01254146, + "epoch": 0.7799789568615663, + "flos": 18886522112640.0, + "grad_norm": 2.215385328919691, + "language_loss": 0.71494085, + "learning_rate": 4.865664938690584e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12973, + "time_per_iteration": 2.472104549407959 + }, + { + "auxiliary_loss_clip": 0.06400935, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.0627044, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7800390801142342, + "flos": 20267781143040.0, + "grad_norm": 1.7807969698368138, + "language_loss": 0.78121793, + "learning_rate": 4.863119147634089e-07, + "loss": 0.85788202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09039307, + "step": 12974, + "time_per_iteration": 2.4978132247924805 + }, + { + "auxiliary_loss_clip": 0.06402993, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01255313, + "epoch": 0.7800992033669022, + "flos": 16695831041280.0, + "grad_norm": 1.52512308426482, + "language_loss": 0.6983875, + "learning_rate": 4.86057393056964e-07, + "loss": 0.77506667, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09619141, + "step": 12975, + "time_per_iteration": 2.4792943000793457 + }, + { + "auxiliary_loss_clip": 0.06404307, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273738, + "balance_loss_mlp": 0.01256703, + "epoch": 0.7801593266195701, + "flos": 18590650945920.0, + "grad_norm": 2.5885152450409654, + "language_loss": 0.82135439, + "learning_rate": 4.858029287593739e-07, + "loss": 0.89805579, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 12976, + "time_per_iteration": 3.9093782901763916 + }, + { + "auxiliary_loss_clip": 0.06403226, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06269425, + "balance_loss_mlp": 0.01256299, + "epoch": 0.7802194498722381, + "flos": 25492193614080.0, + "grad_norm": 1.298093609119966, + "language_loss": 0.66121942, + "learning_rate": 4.85548521880289e-07, + "loss": 0.73791331, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09857178, + "step": 12977, + "time_per_iteration": 2.5382373332977295 + }, + { + "auxiliary_loss_clip": 0.06398032, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06268156, + "balance_loss_mlp": 0.01256293, + "epoch": 0.780279573124906, + "flos": 31184451757440.0, + "grad_norm": 1.3843135589513191, + "language_loss": 0.74921417, + "learning_rate": 4.852941724293554e-07, + "loss": 0.82584947, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09204102, + "step": 12978, + "time_per_iteration": 2.5999321937561035 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263935, + "balance_loss_clip": 0.06272239, + "balance_loss_mlp": 0.01253529, + "epoch": 0.780339696377574, + "flos": 26951466395520.0, + "grad_norm": 1.7189824497298882, + "language_loss": 0.6233561, + "learning_rate": 4.85039880416219e-07, + "loss": 0.70007408, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10406494, + "step": 12979, + "time_per_iteration": 4.002735137939453 + }, + { + "auxiliary_loss_clip": 0.0640031, + "auxiliary_loss_mlp": 0.01264611, + "balance_loss_clip": 0.06269379, + "balance_loss_mlp": 0.01255163, + "epoch": 0.780399819630242, + "flos": 27963662117760.0, + "grad_norm": 1.7958108111348887, + "language_loss": 0.77048111, + "learning_rate": 4.847856458505217e-07, + "loss": 0.8471303, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09454346, + "step": 12980, + "time_per_iteration": 2.574740171432495 + }, + { + "auxiliary_loss_clip": 0.06404287, + "auxiliary_loss_mlp": 0.0126621, + "balance_loss_clip": 0.06269396, + "balance_loss_mlp": 0.01256941, + "epoch": 0.78045994288291, + "flos": 22492489772160.0, + "grad_norm": 7.38729106022631, + "language_loss": 0.77965951, + "learning_rate": 4.845314687419046e-07, + "loss": 0.85636449, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09259033, + "step": 12981, + "time_per_iteration": 2.6090612411499023 + }, + { + "auxiliary_loss_clip": 0.06406559, + "auxiliary_loss_mlp": 0.01273892, + "balance_loss_clip": 0.0627367, + "balance_loss_mlp": 0.01264642, + "epoch": 0.7805200661355779, + "flos": 20857259416320.0, + "grad_norm": 1.7019427662247137, + "language_loss": 0.72918165, + "learning_rate": 4.842773491000067e-07, + "loss": 0.80598617, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09246826, + "step": 12982, + "time_per_iteration": 2.538454294204712 + }, + { + "auxiliary_loss_clip": 0.06401584, + "auxiliary_loss_mlp": 0.01261641, + "balance_loss_clip": 0.06268401, + "balance_loss_mlp": 0.01251932, + "epoch": 0.7805801893882459, + "flos": 25673014725120.0, + "grad_norm": 1.3557046111100475, + "language_loss": 0.73713994, + "learning_rate": 4.840232869344636e-07, + "loss": 0.8137722, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0970459, + "step": 12983, + "time_per_iteration": 2.55915904045105 + }, + { + "auxiliary_loss_clip": 0.06403306, + "auxiliary_loss_mlp": 0.01265365, + "balance_loss_clip": 0.06270759, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7806403126409138, + "flos": 11332581154560.0, + "grad_norm": 1.8511733827062056, + "language_loss": 0.7564944, + "learning_rate": 4.837692822549086e-07, + "loss": 0.83318114, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09521484, + "step": 12984, + "time_per_iteration": 3.9226207733154297 + }, + { + "auxiliary_loss_clip": 0.06401315, + "auxiliary_loss_mlp": 0.01261166, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01252345, + "epoch": 0.7807004358935818, + "flos": 19579478578560.0, + "grad_norm": 1.6909183647734616, + "language_loss": 0.81444597, + "learning_rate": 4.835153350709746e-07, + "loss": 0.89107084, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08831787, + "step": 12985, + "time_per_iteration": 2.495833396911621 + }, + { + "auxiliary_loss_clip": 0.06404648, + "auxiliary_loss_mlp": 0.01270247, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.01260007, + "epoch": 0.7807605591462499, + "flos": 19141918957440.0, + "grad_norm": 1.5866346872788593, + "language_loss": 0.7735818, + "learning_rate": 4.832614453922915e-07, + "loss": 0.85033077, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10235596, + "step": 12986, + "time_per_iteration": 2.4942498207092285 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01262193, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01252829, + "epoch": 0.7808206823989178, + "flos": 32382038638080.0, + "grad_norm": 1.540132157025115, + "language_loss": 0.74469846, + "learning_rate": 4.830076132284859e-07, + "loss": 0.82136583, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12987, + "time_per_iteration": 2.6014459133148193 + }, + { + "auxiliary_loss_clip": 0.06307278, + "auxiliary_loss_mlp": 0.01248897, + "balance_loss_clip": 0.06251733, + "balance_loss_mlp": 0.01247845, + "epoch": 0.7808808056515858, + "flos": 55070512381440.0, + "grad_norm": 0.7358853994181496, + "language_loss": 0.55100733, + "learning_rate": 4.82753838589184e-07, + "loss": 0.62656909, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01052094, + "step": 12988, + "time_per_iteration": 3.1363513469696045 + }, + { + "auxiliary_loss_clip": 0.06395964, + "auxiliary_loss_mlp": 0.01273063, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01264235, + "epoch": 0.7809409289042537, + "flos": 12864375244800.0, + "grad_norm": 2.503136362743708, + "language_loss": 0.80932319, + "learning_rate": 4.82500121484009e-07, + "loss": 0.88601345, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08831787, + "step": 12989, + "time_per_iteration": 2.4550793170928955 + }, + { + "auxiliary_loss_clip": 0.06397895, + "auxiliary_loss_mlp": 0.0126169, + "balance_loss_clip": 0.06268378, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7810010521569217, + "flos": 21693329856000.0, + "grad_norm": 1.5548108351785217, + "language_loss": 0.70569479, + "learning_rate": 4.822464619225806e-07, + "loss": 0.78229064, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09002686, + "step": 12990, + "time_per_iteration": 2.534583330154419 + }, + { + "auxiliary_loss_clip": 0.064027, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01255666, + "epoch": 0.7810611754095896, + "flos": 16761560169600.0, + "grad_norm": 2.151540581159162, + "language_loss": 0.78160757, + "learning_rate": 4.819928599145184e-07, + "loss": 0.85829455, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10327148, + "step": 12991, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01257071, + "epoch": 0.7811212986622577, + "flos": 43517489063040.0, + "grad_norm": 1.4386933089332317, + "language_loss": 0.66202235, + "learning_rate": 4.817393154694398e-07, + "loss": 0.73872924, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10528564, + "step": 12992, + "time_per_iteration": 2.712284564971924 + }, + { + "auxiliary_loss_clip": 0.06407847, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7811814219149256, + "flos": 21763377469440.0, + "grad_norm": 1.666565007875902, + "language_loss": 0.61892599, + "learning_rate": 4.814858285969578e-07, + "loss": 0.69564325, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09503174, + "step": 12993, + "time_per_iteration": 2.4966509342193604 + }, + { + "auxiliary_loss_clip": 0.06400012, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 0.06270296, + "balance_loss_mlp": 0.01252532, + "epoch": 0.7812415451675936, + "flos": 24068447763840.0, + "grad_norm": 1.3952221037257373, + "language_loss": 0.68836015, + "learning_rate": 4.812323993066862e-07, + "loss": 0.76498109, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09545898, + "step": 12994, + "time_per_iteration": 2.536137819290161 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06273837, + "balance_loss_mlp": 0.01254703, + "epoch": 0.7813016684202615, + "flos": 18995744309760.0, + "grad_norm": 1.7501216946691078, + "language_loss": 0.69363022, + "learning_rate": 4.809790276082335e-07, + "loss": 0.77031708, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09039307, + "step": 12995, + "time_per_iteration": 2.470670700073242 + }, + { + "auxiliary_loss_clip": 0.06396692, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06268929, + "balance_loss_mlp": 0.0125644, + "epoch": 0.7813617916729295, + "flos": 25267124747520.0, + "grad_norm": 1.5705022516303782, + "language_loss": 0.75361514, + "learning_rate": 4.807257135112088e-07, + "loss": 0.83023554, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08905029, + "step": 12996, + "time_per_iteration": 2.548156261444092 + }, + { + "auxiliary_loss_clip": 0.06408437, + "auxiliary_loss_mlp": 0.01266772, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01256055, + "epoch": 0.7814219149255974, + "flos": 17971557454080.0, + "grad_norm": 2.5240024848484284, + "language_loss": 0.68320543, + "learning_rate": 4.804724570252167e-07, + "loss": 0.75995755, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10723877, + "step": 12997, + "time_per_iteration": 2.4495344161987305 + }, + { + "auxiliary_loss_clip": 0.06410494, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01256018, + "epoch": 0.7814820381782654, + "flos": 25783368952320.0, + "grad_norm": 1.6126365862237693, + "language_loss": 0.82193416, + "learning_rate": 4.802192581598614e-07, + "loss": 0.89870703, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10778809, + "step": 12998, + "time_per_iteration": 2.535696506500244 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266001, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01256166, + "epoch": 0.7815421614309335, + "flos": 20525442048000.0, + "grad_norm": 1.8946982526297624, + "language_loss": 0.7477777, + "learning_rate": 4.799661169247453e-07, + "loss": 0.82450831, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12999, + "time_per_iteration": 2.4902775287628174 + }, + { + "auxiliary_loss_clip": 0.06407912, + "auxiliary_loss_mlp": 0.01262829, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01252517, + "epoch": 0.7816022846836014, + "flos": 21293980496640.0, + "grad_norm": 1.4384947504961985, + "language_loss": 0.84615433, + "learning_rate": 4.797130333294652e-07, + "loss": 0.92286175, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10314941, + "step": 13000, + "time_per_iteration": 2.512596607208252 + }, + { + "auxiliary_loss_clip": 0.0640571, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06273641, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7816624079362694, + "flos": 19214440266240.0, + "grad_norm": 1.8073266601471953, + "language_loss": 0.66751462, + "learning_rate": 4.794600073836192e-07, + "loss": 0.74421835, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10540771, + "step": 13001, + "time_per_iteration": 2.4772894382476807 + }, + { + "auxiliary_loss_clip": 0.06405921, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06271157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.7817225311889373, + "flos": 26111957938560.0, + "grad_norm": 1.5273491192329303, + "language_loss": 0.66959155, + "learning_rate": 4.792070390968027e-07, + "loss": 0.74628222, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09625244, + "step": 13002, + "time_per_iteration": 2.5820791721343994 + }, + { + "auxiliary_loss_clip": 0.06409384, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01256176, + "epoch": 0.7817826544416053, + "flos": 21257195754240.0, + "grad_norm": 2.018800094451087, + "language_loss": 0.73878789, + "learning_rate": 4.78954128478607e-07, + "loss": 0.81554866, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10534668, + "step": 13003, + "time_per_iteration": 2.481661319732666 + }, + { + "auxiliary_loss_clip": 0.06404527, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7818427776942732, + "flos": 19937347367040.0, + "grad_norm": 1.9756660000355053, + "language_loss": 0.62827951, + "learning_rate": 4.787012755386233e-07, + "loss": 0.70498204, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09515381, + "step": 13004, + "time_per_iteration": 2.497821569442749 + }, + { + "auxiliary_loss_clip": 0.0639583, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 0.06268562, + "balance_loss_mlp": 0.01253669, + "epoch": 0.7819029009469413, + "flos": 11368443502080.0, + "grad_norm": 1.7802974888908354, + "language_loss": 0.83142269, + "learning_rate": 4.784484802864403e-07, + "loss": 0.90800571, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08807373, + "step": 13005, + "time_per_iteration": 2.455112934112549 + }, + { + "auxiliary_loss_clip": 0.06402773, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06270364, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7819630241996092, + "flos": 24286053617280.0, + "grad_norm": 1.9304449854635368, + "language_loss": 0.73000956, + "learning_rate": 4.781957427316432e-07, + "loss": 0.80668867, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09973145, + "step": 13006, + "time_per_iteration": 3.923842191696167 + }, + { + "auxiliary_loss_clip": 0.06406109, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06271446, + "balance_loss_mlp": 0.01252891, + "epoch": 0.7820231474522772, + "flos": 22715168797440.0, + "grad_norm": 1.5911839097464888, + "language_loss": 0.72339863, + "learning_rate": 4.779430628838157e-07, + "loss": 0.80009413, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10540771, + "step": 13007, + "time_per_iteration": 2.5166056156158447 + }, + { + "auxiliary_loss_clip": 0.06406694, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.0125782, + "epoch": 0.7820832707049451, + "flos": 20053571379840.0, + "grad_norm": 2.020015501308364, + "language_loss": 0.69036144, + "learning_rate": 4.776904407525397e-07, + "loss": 0.76710731, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10070801, + "step": 13008, + "time_per_iteration": 2.495736837387085 + }, + { + "auxiliary_loss_clip": 0.064032, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06269944, + "balance_loss_mlp": 0.01253457, + "epoch": 0.7821433939576131, + "flos": 27170246206080.0, + "grad_norm": 1.7298477969217696, + "language_loss": 0.69919395, + "learning_rate": 4.774378763473954e-07, + "loss": 0.77586997, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10949707, + "step": 13009, + "time_per_iteration": 2.5899367332458496 + }, + { + "auxiliary_loss_clip": 0.06399304, + "auxiliary_loss_mlp": 0.01262145, + "balance_loss_clip": 0.06269169, + "balance_loss_mlp": 0.01252781, + "epoch": 0.782203517210281, + "flos": 22608755712000.0, + "grad_norm": 1.790636522261297, + "language_loss": 0.81948966, + "learning_rate": 4.771853696779586e-07, + "loss": 0.89610416, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09362793, + "step": 13010, + "time_per_iteration": 2.5066049098968506 + }, + { + "auxiliary_loss_clip": 0.06400339, + "auxiliary_loss_mlp": 0.01262085, + "balance_loss_clip": 0.06270656, + "balance_loss_mlp": 0.01252692, + "epoch": 0.782263640462949, + "flos": 29067539806080.0, + "grad_norm": 1.385682436411659, + "language_loss": 0.62627685, + "learning_rate": 4.76932920753806e-07, + "loss": 0.70290112, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09399414, + "step": 13011, + "time_per_iteration": 2.6026289463043213 + }, + { + "auxiliary_loss_clip": 0.06399235, + "auxiliary_loss_mlp": 0.0126419, + "balance_loss_clip": 0.0626906, + "balance_loss_mlp": 0.01255306, + "epoch": 0.782323763715617, + "flos": 25306215477120.0, + "grad_norm": 1.6427811316724177, + "language_loss": 0.70159376, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.77822804, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08883667, + "step": 13012, + "time_per_iteration": 2.53303861618042 + }, + { + "auxiliary_loss_clip": 0.0630969, + "auxiliary_loss_mlp": 0.01250424, + "balance_loss_clip": 0.06253915, + "balance_loss_mlp": 0.01249417, + "epoch": 0.782383886968285, + "flos": 65216548195200.0, + "grad_norm": 0.6922289036219499, + "language_loss": 0.55011511, + "learning_rate": 4.764281961796395e-07, + "loss": 0.62571621, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100708, + "step": 13013, + "time_per_iteration": 3.228905439376831 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01264895, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01254708, + "epoch": 0.782444010220953, + "flos": 18411297281280.0, + "grad_norm": 1.7267010887219136, + "language_loss": 0.6554383, + "learning_rate": 4.76175920548765e-07, + "loss": 0.73218066, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10186768, + "step": 13014, + "time_per_iteration": 2.4842281341552734 + }, + { + "auxiliary_loss_clip": 0.06309456, + "auxiliary_loss_mlp": 0.01249284, + "balance_loss_clip": 0.06253707, + "balance_loss_mlp": 0.01248232, + "epoch": 0.7825041334736209, + "flos": 63977145327360.0, + "grad_norm": 0.6946375412557042, + "language_loss": 0.58183634, + "learning_rate": 4.759237027014524e-07, + "loss": 0.65742373, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01052094, + "step": 13015, + "time_per_iteration": 4.588924169540405 + }, + { + "auxiliary_loss_clip": 0.06401119, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06269481, + "balance_loss_mlp": 0.01258141, + "epoch": 0.7825642567262889, + "flos": 20345585258880.0, + "grad_norm": 1.703957116588016, + "language_loss": 0.75081736, + "learning_rate": 4.756715426472666e-07, + "loss": 0.8275032, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09326172, + "step": 13016, + "time_per_iteration": 2.5329108238220215 + }, + { + "auxiliary_loss_clip": 0.06404392, + "auxiliary_loss_mlp": 0.01262942, + "balance_loss_clip": 0.0627065, + "balance_loss_mlp": 0.01252303, + "epoch": 0.7826243799789568, + "flos": 20268577756800.0, + "grad_norm": 1.8073604316882006, + "language_loss": 0.75204456, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.82871789, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10644531, + "step": 13017, + "time_per_iteration": 2.475156307220459 + }, + { + "auxiliary_loss_clip": 0.06402843, + "auxiliary_loss_mlp": 0.01267244, + "balance_loss_clip": 0.06268843, + "balance_loss_mlp": 0.01256974, + "epoch": 0.7826845032316249, + "flos": 21137743359360.0, + "grad_norm": 2.040801926545799, + "language_loss": 0.76392686, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.84062773, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10272217, + "step": 13018, + "time_per_iteration": 2.487426280975342 + }, + { + "auxiliary_loss_clip": 0.06399854, + "auxiliary_loss_mlp": 0.01266755, + "balance_loss_clip": 0.06267899, + "balance_loss_mlp": 0.01256652, + "epoch": 0.7827446264842928, + "flos": 22498862682240.0, + "grad_norm": 1.372243474464688, + "language_loss": 0.77303207, + "learning_rate": 4.749154093390708e-07, + "loss": 0.84969819, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10101318, + "step": 13019, + "time_per_iteration": 3.9929661750793457 + }, + { + "auxiliary_loss_clip": 0.06402994, + "auxiliary_loss_mlp": 0.01262289, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7828047497369608, + "flos": 28848298798080.0, + "grad_norm": 1.5302046245116039, + "language_loss": 0.6745941, + "learning_rate": 4.746634805529852e-07, + "loss": 0.75124693, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09570312, + "step": 13020, + "time_per_iteration": 2.564709424972534 + }, + { + "auxiliary_loss_clip": 0.06400368, + "auxiliary_loss_mlp": 0.012665, + "balance_loss_clip": 0.0626877, + "balance_loss_mlp": 0.01256397, + "epoch": 0.7828648729896287, + "flos": 23264298529920.0, + "grad_norm": 2.6855687872649825, + "language_loss": 0.62745917, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.70412791, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10101318, + "step": 13021, + "time_per_iteration": 2.4964163303375244 + }, + { + "auxiliary_loss_clip": 0.06403099, + "auxiliary_loss_mlp": 0.01264616, + "balance_loss_clip": 0.06270363, + "balance_loss_mlp": 0.01255592, + "epoch": 0.7829249962422967, + "flos": 25272826824960.0, + "grad_norm": 1.5874593754725228, + "language_loss": 0.69790453, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.77458167, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.090271, + "step": 13022, + "time_per_iteration": 2.5415072441101074 + }, + { + "auxiliary_loss_clip": 0.06309162, + "auxiliary_loss_mlp": 0.01253506, + "balance_loss_clip": 0.06253611, + "balance_loss_mlp": 0.01252549, + "epoch": 0.7829851194949646, + "flos": 70742087441280.0, + "grad_norm": 0.6386935126948231, + "language_loss": 0.56138313, + "learning_rate": 4.739080412784131e-07, + "loss": 0.6370098, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.009552, + "step": 13023, + "time_per_iteration": 4.637472867965698 + }, + { + "auxiliary_loss_clip": 0.06393711, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06267409, + "balance_loss_mlp": 0.01256451, + "epoch": 0.7830452427476327, + "flos": 25666977231360.0, + "grad_norm": 1.576482021290812, + "language_loss": 0.67401826, + "learning_rate": 4.736563439132792e-07, + "loss": 0.75061107, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09118652, + "step": 13024, + "time_per_iteration": 2.538425922393799 + }, + { + "auxiliary_loss_clip": 0.06403638, + "auxiliary_loss_mlp": 0.01263953, + "balance_loss_clip": 0.06269067, + "balance_loss_mlp": 0.0125357, + "epoch": 0.7831053660003006, + "flos": 22791002342400.0, + "grad_norm": 1.5665497407988729, + "language_loss": 0.77940929, + "learning_rate": 4.734047044272498e-07, + "loss": 0.85608524, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10369873, + "step": 13025, + "time_per_iteration": 2.5431177616119385 + }, + { + "auxiliary_loss_clip": 0.0640173, + "auxiliary_loss_mlp": 0.01265493, + "balance_loss_clip": 0.06270472, + "balance_loss_mlp": 0.01256302, + "epoch": 0.7831654892529686, + "flos": 25819399008000.0, + "grad_norm": 1.644612426825064, + "language_loss": 0.7874493, + "learning_rate": 4.731531228298673e-07, + "loss": 0.86412156, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09197998, + "step": 13026, + "time_per_iteration": 2.556727647781372 + }, + { + "auxiliary_loss_clip": 0.06404313, + "auxiliary_loss_mlp": 0.01262471, + "balance_loss_clip": 0.06272115, + "balance_loss_mlp": 0.01253006, + "epoch": 0.7832256125056366, + "flos": 20776897751040.0, + "grad_norm": 2.5804756283092334, + "language_loss": 0.75804269, + "learning_rate": 4.729015991306715e-07, + "loss": 0.83471048, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09466553, + "step": 13027, + "time_per_iteration": 2.4878506660461426 + }, + { + "auxiliary_loss_clip": 0.0639909, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06269808, + "balance_loss_mlp": 0.01255978, + "epoch": 0.7832857357583045, + "flos": 21512886088320.0, + "grad_norm": 1.7061440421315746, + "language_loss": 0.70765603, + "learning_rate": 4.726501333391997e-07, + "loss": 0.78430474, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09802246, + "step": 13028, + "time_per_iteration": 2.498478651046753 + }, + { + "auxiliary_loss_clip": 0.06406339, + "auxiliary_loss_mlp": 0.01268084, + "balance_loss_clip": 0.06271327, + "balance_loss_mlp": 0.01257874, + "epoch": 0.7833458590109725, + "flos": 18083714544000.0, + "grad_norm": 1.9644194417750374, + "language_loss": 0.68658125, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.76332551, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 13029, + "time_per_iteration": 2.580122470855713 + }, + { + "auxiliary_loss_clip": 0.06403092, + "auxiliary_loss_mlp": 0.01267866, + "balance_loss_clip": 0.0626725, + "balance_loss_mlp": 0.01258001, + "epoch": 0.7834059822636404, + "flos": 28295521413120.0, + "grad_norm": 1.7391755665392523, + "language_loss": 0.81014347, + "learning_rate": 4.721473755175698e-07, + "loss": 0.88685304, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09869385, + "step": 13030, + "time_per_iteration": 2.5314316749572754 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.01261968, + "balance_loss_clip": 0.06269055, + "balance_loss_mlp": 0.01251949, + "epoch": 0.7834661055163085, + "flos": 31694281125120.0, + "grad_norm": 1.5048813517509494, + "language_loss": 0.70804811, + "learning_rate": 4.71896083506476e-07, + "loss": 0.78471744, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10021973, + "step": 13031, + "time_per_iteration": 2.5823378562927246 + }, + { + "auxiliary_loss_clip": 0.06405063, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06270566, + "balance_loss_mlp": 0.01257079, + "epoch": 0.7835262287689764, + "flos": 12938238218880.0, + "grad_norm": 2.7115393333323468, + "language_loss": 0.78693461, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.86365044, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09442139, + "step": 13032, + "time_per_iteration": 2.4609038829803467 + }, + { + "auxiliary_loss_clip": 0.06404404, + "auxiliary_loss_mlp": 0.01268456, + "balance_loss_clip": 0.06269069, + "balance_loss_mlp": 0.01258317, + "epoch": 0.7835863520216444, + "flos": 16148671879680.0, + "grad_norm": 1.9002530639505248, + "language_loss": 0.63003838, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.70676696, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10137939, + "step": 13033, + "time_per_iteration": 2.500108242034912 + }, + { + "auxiliary_loss_clip": 0.06404372, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.01253492, + "epoch": 0.7836464752743123, + "flos": 11514660076800.0, + "grad_norm": 1.5173952682400234, + "language_loss": 0.72150695, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.79818583, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10021973, + "step": 13034, + "time_per_iteration": 2.4920992851257324 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06272385, + "balance_loss_mlp": 0.01258685, + "epoch": 0.7837065985269803, + "flos": 18229637629440.0, + "grad_norm": 1.7491156010672833, + "language_loss": 0.7212472, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.79799139, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1005249, + "step": 13035, + "time_per_iteration": 2.482640027999878 + }, + { + "auxiliary_loss_clip": 0.06404319, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06270225, + "balance_loss_mlp": 0.01260541, + "epoch": 0.7837667217796482, + "flos": 24761404229760.0, + "grad_norm": 2.0189753157396373, + "language_loss": 0.66216964, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.73892099, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 13036, + "time_per_iteration": 2.5221505165100098 + }, + { + "auxiliary_loss_clip": 0.06407806, + "auxiliary_loss_mlp": 0.01272324, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01260981, + "epoch": 0.7838268450323163, + "flos": 22389766266240.0, + "grad_norm": 2.337708376501524, + "language_loss": 0.73523962, + "learning_rate": 4.703895486362031e-07, + "loss": 0.81204098, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11340332, + "step": 13037, + "time_per_iteration": 2.5027549266815186 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01265411, + "balance_loss_clip": 0.06268933, + "balance_loss_mlp": 0.01255099, + "epoch": 0.7838869682849842, + "flos": 19506370291200.0, + "grad_norm": 2.111880919052157, + "language_loss": 0.60144168, + "learning_rate": 4.701386624460717e-07, + "loss": 0.67811918, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10321045, + "step": 13038, + "time_per_iteration": 2.4813334941864014 + }, + { + "auxiliary_loss_clip": 0.06401114, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 0.06270541, + "balance_loss_mlp": 0.01255484, + "epoch": 0.7839470915376522, + "flos": 32901553152000.0, + "grad_norm": 1.5605584713979823, + "language_loss": 0.68332416, + "learning_rate": 4.698878342684349e-07, + "loss": 0.75998366, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09350586, + "step": 13039, + "time_per_iteration": 2.616943359375 + }, + { + "auxiliary_loss_clip": 0.06395827, + "auxiliary_loss_mlp": 0.01261469, + "balance_loss_clip": 0.06267862, + "balance_loss_mlp": 0.01253244, + "epoch": 0.7840072147903202, + "flos": 29683153353600.0, + "grad_norm": 1.67583580210183, + "language_loss": 0.69978261, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.77635556, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08227539, + "step": 13040, + "time_per_iteration": 2.575289726257324 + }, + { + "auxiliary_loss_clip": 0.06404934, + "auxiliary_loss_mlp": 0.01266779, + "balance_loss_clip": 0.06269483, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7840673380429881, + "flos": 18192601324800.0, + "grad_norm": 1.9496315301470044, + "language_loss": 0.67735672, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.75407386, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10272217, + "step": 13041, + "time_per_iteration": 2.5014941692352295 + }, + { + "auxiliary_loss_clip": 0.06304124, + "auxiliary_loss_mlp": 0.01252304, + "balance_loss_clip": 0.06248714, + "balance_loss_mlp": 0.01251298, + "epoch": 0.7841274612956561, + "flos": 66365694616320.0, + "grad_norm": 0.8059954256946308, + "language_loss": 0.57385874, + "learning_rate": 4.691356979055998e-07, + "loss": 0.649423, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01005554, + "step": 13042, + "time_per_iteration": 3.0931692123413086 + }, + { + "auxiliary_loss_clip": 0.06405251, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.0125564, + "epoch": 0.784187584548324, + "flos": 26655259812480.0, + "grad_norm": 2.4178981590312105, + "language_loss": 0.84631729, + "learning_rate": 4.688851018730369e-07, + "loss": 0.92301869, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09246826, + "step": 13043, + "time_per_iteration": 2.5591118335723877 + }, + { + "auxiliary_loss_clip": 0.0639644, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06267819, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7842477078009921, + "flos": 25747422750720.0, + "grad_norm": 1.364522654088724, + "language_loss": 0.88473415, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.96134579, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08917236, + "step": 13044, + "time_per_iteration": 2.5349628925323486 + }, + { + "auxiliary_loss_clip": 0.06410815, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06271672, + "balance_loss_mlp": 0.01259132, + "epoch": 0.78430783105366, + "flos": 21987398160000.0, + "grad_norm": 1.6046981571270753, + "language_loss": 0.79284698, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.86964685, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10040283, + "step": 13045, + "time_per_iteration": 3.9486923217773438 + }, + { + "auxiliary_loss_clip": 0.06400262, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626996, + "balance_loss_mlp": 0.01252862, + "epoch": 0.784367954306328, + "flos": 23849122901760.0, + "grad_norm": 1.3651332690132787, + "language_loss": 0.72812819, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.80475229, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09277344, + "step": 13046, + "time_per_iteration": 2.5449562072753906 + }, + { + "auxiliary_loss_clip": 0.06399076, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06269773, + "balance_loss_mlp": 0.01253036, + "epoch": 0.7844280775589959, + "flos": 24833548195200.0, + "grad_norm": 1.4113250051922885, + "language_loss": 0.63375705, + "learning_rate": 4.678832984380809e-07, + "loss": 0.71037436, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09619141, + "step": 13047, + "time_per_iteration": 2.555187940597534 + }, + { + "auxiliary_loss_clip": 0.06397624, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01253892, + "epoch": 0.7844882008116639, + "flos": 22462245648000.0, + "grad_norm": 1.5637844175125322, + "language_loss": 0.73288012, + "learning_rate": 4.676329928006515e-07, + "loss": 0.8094908, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09552002, + "step": 13048, + "time_per_iteration": 2.500697374343872 + }, + { + "auxiliary_loss_clip": 0.06406703, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06269943, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7845483240643318, + "flos": 26111203251840.0, + "grad_norm": 1.7122203145326895, + "language_loss": 0.74653435, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.8232491, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10198975, + "step": 13049, + "time_per_iteration": 2.525059700012207 + }, + { + "auxiliary_loss_clip": 0.06406355, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06269609, + "balance_loss_mlp": 0.0125279, + "epoch": 0.7846084473169999, + "flos": 19360363351680.0, + "grad_norm": 1.8695615724941215, + "language_loss": 0.72989309, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.80659556, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11114502, + "step": 13050, + "time_per_iteration": 2.502976655960083 + }, + { + "auxiliary_loss_clip": 0.0640547, + "auxiliary_loss_mlp": 0.01264968, + "balance_loss_clip": 0.06273313, + "balance_loss_mlp": 0.01255658, + "epoch": 0.7846685705696678, + "flos": 23331620885760.0, + "grad_norm": 1.8649850140502078, + "language_loss": 0.73895067, + "learning_rate": 4.668824245713825e-07, + "loss": 0.81565511, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09320068, + "step": 13051, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.06407961, + "auxiliary_loss_mlp": 0.01270446, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01259622, + "epoch": 0.7847286938223358, + "flos": 35818379706240.0, + "grad_norm": 2.0718578838618527, + "language_loss": 0.73053241, + "learning_rate": 4.666323514209227e-07, + "loss": 0.80731648, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10827637, + "step": 13052, + "time_per_iteration": 2.6086881160736084 + }, + { + "auxiliary_loss_clip": 0.06395121, + "auxiliary_loss_mlp": 0.01262593, + "balance_loss_clip": 0.06268048, + "balance_loss_mlp": 0.01253241, + "epoch": 0.7847888170750038, + "flos": 18483986298240.0, + "grad_norm": 1.9107364869927201, + "language_loss": 0.69673455, + "learning_rate": 4.663823364159183e-07, + "loss": 0.77331167, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09344482, + "step": 13053, + "time_per_iteration": 2.471815586090088 + }, + { + "auxiliary_loss_clip": 0.06401109, + "auxiliary_loss_mlp": 0.01260742, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01251807, + "epoch": 0.7848489403276717, + "flos": 25126190979840.0, + "grad_norm": 1.8867575378742971, + "language_loss": 0.70537353, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.78199208, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08929443, + "step": 13054, + "time_per_iteration": 2.5749151706695557 + }, + { + "auxiliary_loss_clip": 0.06405072, + "auxiliary_loss_mlp": 0.01264324, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253971, + "epoch": 0.7849090635803397, + "flos": 26509169018880.0, + "grad_norm": 1.610774832305801, + "language_loss": 0.76244235, + "learning_rate": 4.658824808801938e-07, + "loss": 0.8391363, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10357666, + "step": 13055, + "time_per_iteration": 3.9623241424560547 + }, + { + "auxiliary_loss_clip": 0.06407758, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01253922, + "epoch": 0.7849691868330076, + "flos": 20965978488960.0, + "grad_norm": 1.9205969834144307, + "language_loss": 0.75488204, + "learning_rate": 4.656326403684283e-07, + "loss": 0.83159614, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09729004, + "step": 13056, + "time_per_iteration": 2.4767720699310303 + }, + { + "auxiliary_loss_clip": 0.06400058, + "auxiliary_loss_mlp": 0.01266253, + "balance_loss_clip": 0.06269453, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7850293100856757, + "flos": 26074628144640.0, + "grad_norm": 1.52924099348992, + "language_loss": 0.70278704, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.77945018, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09423828, + "step": 13057, + "time_per_iteration": 2.5652661323547363 + }, + { + "auxiliary_loss_clip": 0.06407446, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7850894333383436, + "flos": 22498443411840.0, + "grad_norm": 2.33768341300027, + "language_loss": 0.76614606, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.84288156, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09326172, + "step": 13058, + "time_per_iteration": 2.479261875152588 + }, + { + "auxiliary_loss_clip": 0.06401752, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.06268829, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7851495565910116, + "flos": 20564952048000.0, + "grad_norm": 1.4951701283618941, + "language_loss": 0.71132874, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.78798681, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0982666, + "step": 13059, + "time_per_iteration": 3.9393692016601562 + }, + { + "auxiliary_loss_clip": 0.06412531, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.06272064, + "balance_loss_mlp": 0.01254842, + "epoch": 0.7852096798436795, + "flos": 15930353266560.0, + "grad_norm": 1.897902046144861, + "language_loss": 0.77542412, + "learning_rate": 4.646338602497144e-07, + "loss": 0.85220468, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10687256, + "step": 13060, + "time_per_iteration": 2.4718637466430664 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7852698030963475, + "flos": 19068265618560.0, + "grad_norm": 1.8441572725485498, + "language_loss": 0.76857173, + "learning_rate": 4.643843107494654e-07, + "loss": 0.84523541, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.1003418, + "step": 13061, + "time_per_iteration": 2.4667510986328125 + }, + { + "auxiliary_loss_clip": 0.06403807, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01257738, + "epoch": 0.7853299263490154, + "flos": 24651259637760.0, + "grad_norm": 1.784620382168378, + "language_loss": 0.74518055, + "learning_rate": 4.641348194799164e-07, + "loss": 0.82188833, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09234619, + "step": 13062, + "time_per_iteration": 2.5519487857818604 + }, + { + "auxiliary_loss_clip": 0.06401968, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7853900496016835, + "flos": 22024518318720.0, + "grad_norm": 1.444565661483555, + "language_loss": 0.6925329, + "learning_rate": 4.638853864505297e-07, + "loss": 0.76918697, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08935547, + "step": 13063, + "time_per_iteration": 3.896639585494995 + }, + { + "auxiliary_loss_clip": 0.064018, + "auxiliary_loss_mlp": 0.01262061, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01252858, + "epoch": 0.7854501728543514, + "flos": 30235343760000.0, + "grad_norm": 1.975335557654558, + "language_loss": 0.72825849, + "learning_rate": 4.636360116707625e-07, + "loss": 0.80489707, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09210205, + "step": 13064, + "time_per_iteration": 2.567704200744629 + }, + { + "auxiliary_loss_clip": 0.06403325, + "auxiliary_loss_mlp": 0.01265412, + "balance_loss_clip": 0.0626822, + "balance_loss_mlp": 0.01255583, + "epoch": 0.7855102961070194, + "flos": 18849695443200.0, + "grad_norm": 1.5878092382689184, + "language_loss": 0.67936897, + "learning_rate": 4.633866951500718e-07, + "loss": 0.75605631, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09838867, + "step": 13065, + "time_per_iteration": 2.470630168914795 + }, + { + "auxiliary_loss_clip": 0.06404464, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06273209, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7855704193596874, + "flos": 22316574124800.0, + "grad_norm": 3.292833578537852, + "language_loss": 0.75992739, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.83663952, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09576416, + "step": 13066, + "time_per_iteration": 2.5433592796325684 + }, + { + "auxiliary_loss_clip": 0.06310245, + "auxiliary_loss_mlp": 0.01255234, + "balance_loss_clip": 0.06254524, + "balance_loss_mlp": 0.01254291, + "epoch": 0.7856305426123553, + "flos": 60024224638080.0, + "grad_norm": 0.6974485320329921, + "language_loss": 0.53405064, + "learning_rate": 4.628882369237346e-07, + "loss": 0.60970545, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.55664062, + "router_z_loss_mlp": 0.00940704, + "step": 13067, + "time_per_iteration": 3.3080852031707764 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06269915, + "balance_loss_mlp": 0.012542, + "epoch": 0.7856906658650233, + "flos": 21874528310400.0, + "grad_norm": 1.4327852205336962, + "language_loss": 0.68056738, + "learning_rate": 4.62639095236989e-07, + "loss": 0.75725186, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.0993042, + "step": 13068, + "time_per_iteration": 2.5869228839874268 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7857507891176913, + "flos": 23629672258560.0, + "grad_norm": 1.764601675005712, + "language_loss": 0.68482268, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.76145768, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09112549, + "step": 13069, + "time_per_iteration": 2.5437350273132324 + }, + { + "auxiliary_loss_clip": 0.06404187, + "auxiliary_loss_mlp": 0.01263836, + "balance_loss_clip": 0.06271039, + "balance_loss_mlp": 0.01253984, + "epoch": 0.7858109123703593, + "flos": 25527091639680.0, + "grad_norm": 1.7842031457039946, + "language_loss": 0.76992953, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.84660977, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09857178, + "step": 13070, + "time_per_iteration": 2.5414490699768066 + }, + { + "auxiliary_loss_clip": 0.06396306, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06267333, + "balance_loss_mlp": 0.01255298, + "epoch": 0.7858710356230272, + "flos": 17463195532800.0, + "grad_norm": 1.5496724726178355, + "language_loss": 0.6583572, + "learning_rate": 4.618920199958083e-07, + "loss": 0.73496032, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08703613, + "step": 13071, + "time_per_iteration": 2.469886541366577 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01264805, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7859311588756952, + "flos": 24686367298560.0, + "grad_norm": 1.6110892083187893, + "language_loss": 0.73717749, + "learning_rate": 4.616431115532442e-07, + "loss": 0.81389678, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09857178, + "step": 13072, + "time_per_iteration": 2.519676923751831 + }, + { + "auxiliary_loss_clip": 0.06403338, + "auxiliary_loss_mlp": 0.01268392, + "balance_loss_clip": 0.06269255, + "balance_loss_mlp": 0.01257288, + "epoch": 0.7859912821283631, + "flos": 21805654654080.0, + "grad_norm": 1.8631403345440603, + "language_loss": 0.71523631, + "learning_rate": 4.613942614453268e-07, + "loss": 0.79195362, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.11108398, + "step": 13073, + "time_per_iteration": 2.5105767250061035 + }, + { + "auxiliary_loss_clip": 0.06404594, + "auxiliary_loss_mlp": 0.01265595, + "balance_loss_clip": 0.06270787, + "balance_loss_mlp": 0.01255295, + "epoch": 0.7860514053810311, + "flos": 20853108639360.0, + "grad_norm": 1.5490527180797131, + "language_loss": 0.76964885, + "learning_rate": 4.611454696814938e-07, + "loss": 0.84635073, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10302734, + "step": 13074, + "time_per_iteration": 2.4855496883392334 + }, + { + "auxiliary_loss_clip": 0.06398475, + "auxiliary_loss_mlp": 0.01266136, + "balance_loss_clip": 0.06269623, + "balance_loss_mlp": 0.01256504, + "epoch": 0.786111528633699, + "flos": 24322461016320.0, + "grad_norm": 1.8530422938464213, + "language_loss": 0.75361305, + "learning_rate": 4.608967362711782e-07, + "loss": 0.8302592, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09637451, + "step": 13075, + "time_per_iteration": 2.5396533012390137 + }, + { + "auxiliary_loss_clip": 0.06403027, + "auxiliary_loss_mlp": 0.01261838, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01252677, + "epoch": 0.7861716518863671, + "flos": 24360126226560.0, + "grad_norm": 1.639337001432503, + "language_loss": 0.68816268, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.7648114, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09161377, + "step": 13076, + "time_per_iteration": 2.507643461227417 + }, + { + "auxiliary_loss_clip": 0.06400099, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06270486, + "balance_loss_mlp": 0.01258461, + "epoch": 0.786231775139035, + "flos": 14026728683520.0, + "grad_norm": 2.3148125900767065, + "language_loss": 0.79768962, + "learning_rate": 4.603994445488282e-07, + "loss": 0.87436622, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09088135, + "step": 13077, + "time_per_iteration": 2.470398426055908 + }, + { + "auxiliary_loss_clip": 0.06401075, + "auxiliary_loss_mlp": 0.0126456, + "balance_loss_clip": 0.06269512, + "balance_loss_mlp": 0.01255, + "epoch": 0.786291898391703, + "flos": 33731795733120.0, + "grad_norm": 1.615733156524089, + "language_loss": 0.70986831, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.78652471, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09552002, + "step": 13078, + "time_per_iteration": 2.6685726642608643 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01265393, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01255875, + "epoch": 0.786352021644371, + "flos": 25818476613120.0, + "grad_norm": 1.4651879237887804, + "language_loss": 0.81708902, + "learning_rate": 4.599023863537039e-07, + "loss": 0.89375478, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09509277, + "step": 13079, + "time_per_iteration": 2.5660455226898193 + }, + { + "auxiliary_loss_clip": 0.0639349, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.0626843, + "balance_loss_mlp": 0.01260209, + "epoch": 0.7864121448970389, + "flos": 28918010995200.0, + "grad_norm": 1.4929435922037373, + "language_loss": 0.68745899, + "learning_rate": 4.596539448524146e-07, + "loss": 0.76408732, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09124756, + "step": 13080, + "time_per_iteration": 2.5500268936157227 + }, + { + "auxiliary_loss_clip": 0.06401475, + "auxiliary_loss_mlp": 0.0126541, + "balance_loss_clip": 0.06269769, + "balance_loss_mlp": 0.012552, + "epoch": 0.7864722681497069, + "flos": 19214943390720.0, + "grad_norm": 1.6425983942021263, + "language_loss": 0.70132333, + "learning_rate": 4.594055617612016e-07, + "loss": 0.77799213, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10211182, + "step": 13081, + "time_per_iteration": 2.508885622024536 + }, + { + "auxiliary_loss_clip": 0.06405645, + "auxiliary_loss_mlp": 0.01264379, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7865323914023749, + "flos": 21878008254720.0, + "grad_norm": 2.0927961593492737, + "language_loss": 0.68778342, + "learning_rate": 4.591572370894838e-07, + "loss": 0.76448363, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09362793, + "step": 13082, + "time_per_iteration": 2.5268876552581787 + }, + { + "auxiliary_loss_clip": 0.0639787, + "auxiliary_loss_mlp": 0.01264108, + "balance_loss_clip": 0.0626892, + "balance_loss_mlp": 0.01254584, + "epoch": 0.7865925146550429, + "flos": 25527385128960.0, + "grad_norm": 1.5194289662582627, + "language_loss": 0.66099608, + "learning_rate": 4.589089708466789e-07, + "loss": 0.73761588, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09527588, + "step": 13083, + "time_per_iteration": 2.5328421592712402 + }, + { + "auxiliary_loss_clip": 0.06405569, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.0627001, + "balance_loss_mlp": 0.01255424, + "epoch": 0.7866526379077108, + "flos": 19103121717120.0, + "grad_norm": 2.2309831052205387, + "language_loss": 0.74742764, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.82414663, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10906982, + "step": 13084, + "time_per_iteration": 3.8599534034729004 + }, + { + "auxiliary_loss_clip": 0.06398539, + "auxiliary_loss_mlp": 0.01265, + "balance_loss_clip": 0.0626938, + "balance_loss_mlp": 0.01255678, + "epoch": 0.7867127611603788, + "flos": 16178245171200.0, + "grad_norm": 1.7096991986275847, + "language_loss": 0.7048676, + "learning_rate": 4.584126136854591e-07, + "loss": 0.7815029, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09313965, + "step": 13085, + "time_per_iteration": 2.4548091888427734 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.0626765, + "balance_loss_mlp": 0.01256238, + "epoch": 0.7867728844130467, + "flos": 20779329519360.0, + "grad_norm": 1.9009229295966659, + "language_loss": 0.72873515, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.80544972, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10266113, + "step": 13086, + "time_per_iteration": 2.4679646492004395 + }, + { + "auxiliary_loss_clip": 0.06401749, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.01256132, + "epoch": 0.7868330076657147, + "flos": 21766186581120.0, + "grad_norm": 1.6915622771395795, + "language_loss": 0.75259304, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.82926041, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.08862305, + "step": 13087, + "time_per_iteration": 2.4868595600128174 + }, + { + "auxiliary_loss_clip": 0.06401436, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.01257431, + "epoch": 0.7868931309183826, + "flos": 25707451553280.0, + "grad_norm": 1.5159741083416707, + "language_loss": 0.71450847, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.79118818, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09088135, + "step": 13088, + "time_per_iteration": 2.5030412673950195 + }, + { + "auxiliary_loss_clip": 0.06311038, + "auxiliary_loss_mlp": 0.01250466, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.012495, + "epoch": 0.7869532541710507, + "flos": 64666579921920.0, + "grad_norm": 0.663330829427475, + "language_loss": 0.55047309, + "learning_rate": 4.574206009240431e-07, + "loss": 0.62608814, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00964355, + "step": 13089, + "time_per_iteration": 3.1940503120422363 + }, + { + "auxiliary_loss_clip": 0.06311715, + "auxiliary_loss_mlp": 0.01259019, + "balance_loss_clip": 0.0625612, + "balance_loss_mlp": 0.01257986, + "epoch": 0.7870133774237186, + "flos": 67475651725440.0, + "grad_norm": 0.7045101458235505, + "language_loss": 0.49567109, + "learning_rate": 4.571727439470976e-07, + "loss": 0.57137847, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01033783, + "step": 13090, + "time_per_iteration": 3.2323949337005615 + }, + { + "auxiliary_loss_clip": 0.06399588, + "auxiliary_loss_mlp": 0.01264155, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.01255006, + "epoch": 0.7870735006763866, + "flos": 26075592466560.0, + "grad_norm": 1.3918495812457483, + "language_loss": 0.84173477, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.91837221, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0914917, + "step": 13091, + "time_per_iteration": 2.5303354263305664 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01253337, + "balance_loss_clip": 0.0625616, + "balance_loss_mlp": 0.01252234, + "epoch": 0.7871336239290546, + "flos": 70310439532800.0, + "grad_norm": 0.6984253533928471, + "language_loss": 0.63944566, + "learning_rate": 4.566772055150947e-07, + "loss": 0.71509528, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01104736, + "step": 13092, + "time_per_iteration": 3.186598300933838 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01264101, + "balance_loss_clip": 0.06272719, + "balance_loss_mlp": 0.01254749, + "epoch": 0.7871937471817225, + "flos": 15784010910720.0, + "grad_norm": 2.677362510314703, + "language_loss": 0.79394525, + "learning_rate": 4.564295240788285e-07, + "loss": 0.87063861, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09350586, + "step": 13093, + "time_per_iteration": 2.4746809005737305 + }, + { + "auxiliary_loss_clip": 0.06399192, + "auxiliary_loss_mlp": 0.01262897, + "balance_loss_clip": 0.06268847, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7872538704343905, + "flos": 20491466417280.0, + "grad_norm": 1.6510022815590566, + "language_loss": 0.75735247, + "learning_rate": 4.561819011749106e-07, + "loss": 0.83397341, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0914917, + "step": 13094, + "time_per_iteration": 4.020095109939575 + }, + { + "auxiliary_loss_clip": 0.06407712, + "auxiliary_loss_mlp": 0.01266386, + "balance_loss_clip": 0.06273055, + "balance_loss_mlp": 0.01256719, + "epoch": 0.7873139936870585, + "flos": 25089699726720.0, + "grad_norm": 1.5509563724400146, + "language_loss": 0.79440391, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.87114489, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09674072, + "step": 13095, + "time_per_iteration": 2.609463930130005 + }, + { + "auxiliary_loss_clip": 0.06408177, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06271407, + "balance_loss_mlp": 0.01255425, + "epoch": 0.7873741169397265, + "flos": 30891054286080.0, + "grad_norm": 1.609249488827552, + "language_loss": 0.68118989, + "learning_rate": 4.556868310016715e-07, + "loss": 0.75792503, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09918213, + "step": 13096, + "time_per_iteration": 2.5687479972839355 + }, + { + "auxiliary_loss_clip": 0.0639504, + "auxiliary_loss_mlp": 0.01263751, + "balance_loss_clip": 0.06268235, + "balance_loss_mlp": 0.01255102, + "epoch": 0.7874342401923944, + "flos": 46802666165760.0, + "grad_norm": 1.4338734934522757, + "language_loss": 0.70958376, + "learning_rate": 4.55439383751125e-07, + "loss": 0.78617167, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08648682, + "step": 13097, + "time_per_iteration": 2.739225387573242 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01270015, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.0125987, + "epoch": 0.7874943634450624, + "flos": 23590958872320.0, + "grad_norm": 4.324515792208533, + "language_loss": 0.8066771, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.8834753, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10144043, + "step": 13098, + "time_per_iteration": 4.011147737503052 + }, + { + "auxiliary_loss_clip": 0.06403133, + "auxiliary_loss_mlp": 0.01264821, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.0125591, + "epoch": 0.7875544866977303, + "flos": 20196978842880.0, + "grad_norm": 1.6374038368604131, + "language_loss": 0.74357909, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.82025862, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08898926, + "step": 13099, + "time_per_iteration": 2.5371813774108887 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01264223, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7876146099503983, + "flos": 22609342690560.0, + "grad_norm": 1.4701340709539035, + "language_loss": 0.78340292, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.86008036, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 13100, + "time_per_iteration": 2.518275737762451 + }, + { + "auxiliary_loss_clip": 0.06411106, + "auxiliary_loss_mlp": 0.01262468, + "balance_loss_clip": 0.06271806, + "balance_loss_mlp": 0.01251334, + "epoch": 0.7876747332030662, + "flos": 10710217353600.0, + "grad_norm": 2.2988714589951122, + "language_loss": 0.66578412, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.74251986, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11138916, + "step": 13101, + "time_per_iteration": 2.478010416030884 + }, + { + "auxiliary_loss_clip": 0.06403912, + "auxiliary_loss_mlp": 0.01262729, + "balance_loss_clip": 0.06271445, + "balance_loss_mlp": 0.01253026, + "epoch": 0.7877348564557343, + "flos": 38408462064000.0, + "grad_norm": 1.3711840285849346, + "language_loss": 0.78050315, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.85716951, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09716797, + "step": 13102, + "time_per_iteration": 2.6512677669525146 + }, + { + "auxiliary_loss_clip": 0.06402024, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270896, + "balance_loss_mlp": 0.01253968, + "epoch": 0.7877949797084022, + "flos": 18334876757760.0, + "grad_norm": 3.387524543051336, + "language_loss": 0.82612967, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.90278161, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 13103, + "time_per_iteration": 3.8968992233276367 + }, + { + "auxiliary_loss_clip": 0.0640745, + "auxiliary_loss_mlp": 0.01262901, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7878551029610702, + "flos": 25812942243840.0, + "grad_norm": 2.089208992674617, + "language_loss": 0.80857301, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8852765, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09405518, + "step": 13104, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01264676, + "balance_loss_clip": 0.06272654, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7879152262137382, + "flos": 22348663038720.0, + "grad_norm": 1.6665656648061993, + "language_loss": 0.74192965, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.81862175, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09619141, + "step": 13105, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.06407781, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01255798, + "epoch": 0.7879753494664061, + "flos": 24791396791680.0, + "grad_norm": 1.540938509232933, + "language_loss": 0.75896162, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.83569837, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10101318, + "step": 13106, + "time_per_iteration": 2.5313045978546143 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06271406, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7880354727190741, + "flos": 16914610851840.0, + "grad_norm": 2.261490692087697, + "language_loss": 0.7317878, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.80850446, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09832764, + "step": 13107, + "time_per_iteration": 2.4657392501831055 + }, + { + "auxiliary_loss_clip": 0.0640149, + "auxiliary_loss_mlp": 0.01266906, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257083, + "epoch": 0.7880955959717421, + "flos": 22236002824320.0, + "grad_norm": 1.7249934129069375, + "language_loss": 0.73170471, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.80838865, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09820557, + "step": 13108, + "time_per_iteration": 2.522061347961426 + }, + { + "auxiliary_loss_clip": 0.06308442, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06252776, + "balance_loss_mlp": 0.01250208, + "epoch": 0.7881557192244101, + "flos": 69201907943040.0, + "grad_norm": 0.865010287169312, + "language_loss": 0.60254252, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.6781401, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01112366, + "step": 13109, + "time_per_iteration": 3.0764577388763428 + }, + { + "auxiliary_loss_clip": 0.06398489, + "auxiliary_loss_mlp": 0.0126099, + "balance_loss_clip": 0.06271066, + "balance_loss_mlp": 0.01252025, + "epoch": 0.788215842477078, + "flos": 24942225340800.0, + "grad_norm": 1.5302071478358445, + "language_loss": 0.72546446, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.80205929, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08959961, + "step": 13110, + "time_per_iteration": 2.5210487842559814 + }, + { + "auxiliary_loss_clip": 0.06396982, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01255453, + "epoch": 0.788275965729746, + "flos": 26114054290560.0, + "grad_norm": 1.2956006250382688, + "language_loss": 0.75373393, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.83034575, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08734131, + "step": 13111, + "time_per_iteration": 2.5650205612182617 + }, + { + "auxiliary_loss_clip": 0.06403745, + "auxiliary_loss_mlp": 0.01269317, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01259524, + "epoch": 0.7883360889824139, + "flos": 21221123771520.0, + "grad_norm": 1.7931682275164638, + "language_loss": 0.6193608, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.69609141, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09790039, + "step": 13112, + "time_per_iteration": 2.5178818702697754 + }, + { + "auxiliary_loss_clip": 0.06402722, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258025, + "epoch": 0.7883962122350819, + "flos": 21148979806080.0, + "grad_norm": 1.7329728491097858, + "language_loss": 0.67358041, + "learning_rate": 4.514881996216644e-07, + "loss": 0.75028789, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10003662, + "step": 13113, + "time_per_iteration": 2.4997618198394775 + }, + { + "auxiliary_loss_clip": 0.06400861, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7884563354877498, + "flos": 15308031392640.0, + "grad_norm": 2.191522970823139, + "language_loss": 0.58949661, + "learning_rate": 4.5124174933361e-07, + "loss": 0.66616333, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0916748, + "step": 13114, + "time_per_iteration": 2.499992609024048 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01254063, + "epoch": 0.7885164587404179, + "flos": 24395024252160.0, + "grad_norm": 2.5351098559279452, + "language_loss": 0.67195284, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.74864221, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09649658, + "step": 13115, + "time_per_iteration": 2.6665830612182617 + }, + { + "auxiliary_loss_clip": 0.06404252, + "auxiliary_loss_mlp": 0.0126713, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.0125732, + "epoch": 0.7885765819930858, + "flos": 14390047987200.0, + "grad_norm": 1.969107246296687, + "language_loss": 0.8892082, + "learning_rate": 4.50749024954048e-07, + "loss": 0.965922, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0980835, + "step": 13116, + "time_per_iteration": 2.488569498062134 + }, + { + "auxiliary_loss_clip": 0.06413092, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272166, + "balance_loss_mlp": 0.01257551, + "epoch": 0.7886367052457538, + "flos": 18265835393280.0, + "grad_norm": 2.2399693742143296, + "language_loss": 0.73226219, + "learning_rate": 4.505027508812245e-07, + "loss": 0.80907845, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 13117, + "time_per_iteration": 2.4811642169952393 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01262163, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.0125355, + "epoch": 0.7886968284984217, + "flos": 15310588942080.0, + "grad_norm": 1.3858230532181541, + "language_loss": 0.80464065, + "learning_rate": 4.502565355654926e-07, + "loss": 0.88125694, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08612061, + "step": 13118, + "time_per_iteration": 2.486297369003296 + }, + { + "auxiliary_loss_clip": 0.06400422, + "auxiliary_loss_mlp": 0.01266146, + "balance_loss_clip": 0.06270169, + "balance_loss_mlp": 0.01256538, + "epoch": 0.7887569517510897, + "flos": 21221878458240.0, + "grad_norm": 1.766770664669928, + "language_loss": 0.7323485, + "learning_rate": 4.500103790161878e-07, + "loss": 0.80901414, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09613037, + "step": 13119, + "time_per_iteration": 2.4904284477233887 + }, + { + "auxiliary_loss_clip": 0.06406539, + "auxiliary_loss_mlp": 0.01262086, + "balance_loss_clip": 0.06272633, + "balance_loss_mlp": 0.01253146, + "epoch": 0.7888170750037578, + "flos": 22717894055040.0, + "grad_norm": 1.2838410999725969, + "language_loss": 0.7203325, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.79701877, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.0894165, + "step": 13120, + "time_per_iteration": 2.531905174255371 + }, + { + "auxiliary_loss_clip": 0.06402384, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01259919, + "epoch": 0.7888771982564257, + "flos": 36437976322560.0, + "grad_norm": 1.5849995361084, + "language_loss": 0.79042959, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.86715084, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0980835, + "step": 13121, + "time_per_iteration": 2.6270458698272705 + }, + { + "auxiliary_loss_clip": 0.06399482, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06271152, + "balance_loss_mlp": 0.01255524, + "epoch": 0.7889373215090937, + "flos": 27317678664960.0, + "grad_norm": 1.3500924966016437, + "language_loss": 0.80276608, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.87941229, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09613037, + "step": 13122, + "time_per_iteration": 2.5672237873077393 + }, + { + "auxiliary_loss_clip": 0.06403008, + "auxiliary_loss_mlp": 0.01263927, + "balance_loss_clip": 0.06269404, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7889974447617616, + "flos": 19835210839680.0, + "grad_norm": 1.809945605348313, + "language_loss": 0.78323883, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.85990816, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08666992, + "step": 13123, + "time_per_iteration": 2.5139808654785156 + }, + { + "auxiliary_loss_clip": 0.06405288, + "auxiliary_loss_mlp": 0.01262619, + "balance_loss_clip": 0.06270181, + "balance_loss_mlp": 0.01253154, + "epoch": 0.7890575680144296, + "flos": 17276336928000.0, + "grad_norm": 3.407845901525998, + "language_loss": 0.67230475, + "learning_rate": 4.487804780926985e-07, + "loss": 0.7489838, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09466553, + "step": 13124, + "time_per_iteration": 3.877263069152832 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01255598, + "epoch": 0.7891176912670975, + "flos": 27607596192000.0, + "grad_norm": 2.1455737597716995, + "language_loss": 0.73154545, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.80830753, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09619141, + "step": 13125, + "time_per_iteration": 2.5944886207580566 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.01261205, + "balance_loss_clip": 0.06267411, + "balance_loss_mlp": 0.01251728, + "epoch": 0.7891778145197655, + "flos": 22718397179520.0, + "grad_norm": 1.8448957307034948, + "language_loss": 0.73224074, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.80887532, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09472656, + "step": 13126, + "time_per_iteration": 2.6197116374969482 + }, + { + "auxiliary_loss_clip": 0.06406458, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.0125604, + "epoch": 0.7892379377724335, + "flos": 17316433906560.0, + "grad_norm": 1.6718073300601826, + "language_loss": 0.77387738, + "learning_rate": 4.480432433327845e-07, + "loss": 0.85059547, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09301758, + "step": 13127, + "time_per_iteration": 2.475583553314209 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.01266293, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01256649, + "epoch": 0.7892980610251015, + "flos": 25782781973760.0, + "grad_norm": 1.6570002472061196, + "language_loss": 0.85693359, + "learning_rate": 4.47797616101103e-07, + "loss": 0.93357939, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09643555, + "step": 13128, + "time_per_iteration": 2.506098508834839 + }, + { + "auxiliary_loss_clip": 0.06401196, + "auxiliary_loss_mlp": 0.01265664, + "balance_loss_clip": 0.06271003, + "balance_loss_mlp": 0.01256634, + "epoch": 0.7893581842777694, + "flos": 21586371719040.0, + "grad_norm": 1.9505455740147257, + "language_loss": 0.69738185, + "learning_rate": 4.475520477290904e-07, + "loss": 0.77405041, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09033203, + "step": 13129, + "time_per_iteration": 2.492781400680542 + }, + { + "auxiliary_loss_clip": 0.06314191, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06258637, + "balance_loss_mlp": 0.01254005, + "epoch": 0.7894183075304374, + "flos": 69037773793920.0, + "grad_norm": 0.7003894761434999, + "language_loss": 0.61533356, + "learning_rate": 4.473065382260597e-07, + "loss": 0.69102591, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01041412, + "step": 13130, + "time_per_iteration": 3.109016180038452 + }, + { + "auxiliary_loss_clip": 0.06405208, + "auxiliary_loss_mlp": 0.01262252, + "balance_loss_clip": 0.06272055, + "balance_loss_mlp": 0.01252686, + "epoch": 0.7894784307831053, + "flos": 24250107415680.0, + "grad_norm": 1.475922878769178, + "language_loss": 0.74187315, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.81854773, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09564209, + "step": 13131, + "time_per_iteration": 2.526529312133789 + }, + { + "auxiliary_loss_clip": 0.06417171, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06273621, + "balance_loss_mlp": 0.01255297, + "epoch": 0.7895385540357733, + "flos": 20272770460800.0, + "grad_norm": 15.433314794516651, + "language_loss": 0.69895113, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.77578956, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11376953, + "step": 13132, + "time_per_iteration": 2.5669658184051514 + }, + { + "auxiliary_loss_clip": 0.06403211, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.01256573, + "epoch": 0.7895986772884414, + "flos": 21002972866560.0, + "grad_norm": 2.4066374074433186, + "language_loss": 0.61959308, + "learning_rate": 4.465703630239468e-07, + "loss": 0.69629192, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10113525, + "step": 13133, + "time_per_iteration": 2.4860470294952393 + }, + { + "auxiliary_loss_clip": 0.06406127, + "auxiliary_loss_mlp": 0.01270355, + "balance_loss_clip": 0.06272439, + "balance_loss_mlp": 0.01259644, + "epoch": 0.7896588005411093, + "flos": 18663423816960.0, + "grad_norm": 2.0571343653676326, + "language_loss": 0.8017205, + "learning_rate": 4.463250890899195e-07, + "loss": 0.87848526, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10717773, + "step": 13134, + "time_per_iteration": 3.9168148040771484 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254059, + "epoch": 0.7897189237937773, + "flos": 18411842332800.0, + "grad_norm": 2.033133539223884, + "language_loss": 0.80772352, + "learning_rate": 4.460798740713998e-07, + "loss": 0.88439691, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 13135, + "time_per_iteration": 2.4654078483581543 + }, + { + "auxiliary_loss_clip": 0.06399068, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06268865, + "balance_loss_mlp": 0.01253089, + "epoch": 0.7897790470464452, + "flos": 23738223623040.0, + "grad_norm": 1.6530850460824498, + "language_loss": 0.72782981, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.80445212, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10076904, + "step": 13136, + "time_per_iteration": 2.5253071784973145 + }, + { + "auxiliary_loss_clip": 0.06410457, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01252222, + "epoch": 0.7898391702991132, + "flos": 15923477232000.0, + "grad_norm": 2.3537390068214656, + "language_loss": 0.70506489, + "learning_rate": 4.455896208180778e-07, + "loss": 0.78180242, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11077881, + "step": 13137, + "time_per_iteration": 2.468620777130127 + }, + { + "auxiliary_loss_clip": 0.06401488, + "auxiliary_loss_mlp": 0.01264377, + "balance_loss_clip": 0.06271732, + "balance_loss_mlp": 0.01254506, + "epoch": 0.7898992935517811, + "flos": 19835252766720.0, + "grad_norm": 1.578942697411419, + "language_loss": 0.74176329, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.81842196, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09869385, + "step": 13138, + "time_per_iteration": 3.9565515518188477 + }, + { + "auxiliary_loss_clip": 0.0640148, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06271301, + "balance_loss_mlp": 0.01256271, + "epoch": 0.7899594168044491, + "flos": 16221738240000.0, + "grad_norm": 1.9480374334640547, + "language_loss": 0.686391, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.76306117, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 13139, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.06313749, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06258325, + "balance_loss_mlp": 0.01250762, + "epoch": 0.790019540057117, + "flos": 68353496225280.0, + "grad_norm": 0.8282799229852567, + "language_loss": 0.60166419, + "learning_rate": 4.448546830368003e-07, + "loss": 0.67732072, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01143646, + "step": 13140, + "time_per_iteration": 3.181234359741211 + }, + { + "auxiliary_loss_clip": 0.06408462, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06275274, + "balance_loss_mlp": 0.01257619, + "epoch": 0.7900796633097851, + "flos": 30340037836800.0, + "grad_norm": 1.5194345427413907, + "language_loss": 0.76587826, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.84263158, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09259033, + "step": 13141, + "time_per_iteration": 2.5935022830963135 + }, + { + "auxiliary_loss_clip": 0.06406665, + "auxiliary_loss_mlp": 0.01265708, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255421, + "epoch": 0.790139786562453, + "flos": 22133237391360.0, + "grad_norm": 1.706504607669126, + "language_loss": 0.68517488, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.76189852, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10290527, + "step": 13142, + "time_per_iteration": 3.9123146533966064 + }, + { + "auxiliary_loss_clip": 0.06313135, + "auxiliary_loss_mlp": 0.01253569, + "balance_loss_clip": 0.06257692, + "balance_loss_mlp": 0.01252476, + "epoch": 0.790199909815121, + "flos": 58225210277760.0, + "grad_norm": 0.7895590429355487, + "language_loss": 0.59896362, + "learning_rate": 4.441202759969049e-07, + "loss": 0.6746307, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 13143, + "time_per_iteration": 2.9545323848724365 + }, + { + "auxiliary_loss_clip": 0.06407971, + "auxiliary_loss_mlp": 0.01265938, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255495, + "epoch": 0.7902600330677889, + "flos": 34542066314880.0, + "grad_norm": 1.4595073006493966, + "language_loss": 0.74559182, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.82233089, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10443115, + "step": 13144, + "time_per_iteration": 2.6375374794006348 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01260537, + "epoch": 0.7903201563204569, + "flos": 22352981523840.0, + "grad_norm": 1.6890449908385896, + "language_loss": 0.83446616, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.91123205, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10424805, + "step": 13145, + "time_per_iteration": 2.499363660812378 + }, + { + "auxiliary_loss_clip": 0.06395718, + "auxiliary_loss_mlp": 0.01261823, + "balance_loss_clip": 0.06268772, + "balance_loss_mlp": 0.01252971, + "epoch": 0.790380279573125, + "flos": 22059919468800.0, + "grad_norm": 1.6613829846262294, + "language_loss": 0.7342999, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.81087536, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08862305, + "step": 13146, + "time_per_iteration": 2.515782356262207 + }, + { + "auxiliary_loss_clip": 0.06406832, + "auxiliary_loss_mlp": 0.01262426, + "balance_loss_clip": 0.0627181, + "balance_loss_mlp": 0.01252704, + "epoch": 0.7904404028257929, + "flos": 20308758589440.0, + "grad_norm": 1.836231171589266, + "language_loss": 0.76197815, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.83867073, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 13147, + "time_per_iteration": 2.4807651042938232 + }, + { + "auxiliary_loss_clip": 0.06400219, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01256921, + "epoch": 0.7905005260784609, + "flos": 20014732212480.0, + "grad_norm": 1.7419913226116706, + "language_loss": 0.72276485, + "learning_rate": 4.428974443697087e-07, + "loss": 0.79943514, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09881592, + "step": 13148, + "time_per_iteration": 2.506728410720825 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01264165, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01253782, + "epoch": 0.7905606493311288, + "flos": 26913088425600.0, + "grad_norm": 1.5866446208537701, + "language_loss": 0.71421397, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.79092121, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1038208, + "step": 13149, + "time_per_iteration": 2.5299153327941895 + }, + { + "auxiliary_loss_clip": 0.0640769, + "auxiliary_loss_mlp": 0.01263913, + "balance_loss_clip": 0.0627196, + "balance_loss_mlp": 0.01253417, + "epoch": 0.7906207725837968, + "flos": 23703032108160.0, + "grad_norm": 2.1166900358706138, + "language_loss": 0.65887839, + "learning_rate": 4.424087249723225e-07, + "loss": 0.73559439, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10498047, + "step": 13150, + "time_per_iteration": 2.5118424892425537 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01254171, + "epoch": 0.7906808958364647, + "flos": 20854911502080.0, + "grad_norm": 1.5600793718059285, + "language_loss": 0.70213783, + "learning_rate": 4.421644538650231e-07, + "loss": 0.77878249, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09790039, + "step": 13151, + "time_per_iteration": 2.479990243911743 + }, + { + "auxiliary_loss_clip": 0.06407944, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06272637, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7907410190891327, + "flos": 40744866585600.0, + "grad_norm": 1.3436721274508034, + "language_loss": 0.70374179, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.78046679, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09875488, + "step": 13152, + "time_per_iteration": 2.66023850440979 + }, + { + "auxiliary_loss_clip": 0.06400564, + "auxiliary_loss_mlp": 0.01268098, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01258919, + "epoch": 0.7908011423418007, + "flos": 13266198299520.0, + "grad_norm": 1.733827476588534, + "language_loss": 0.72901142, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.8056981, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09179688, + "step": 13153, + "time_per_iteration": 2.4535181522369385 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06268425, + "balance_loss_mlp": 0.01256502, + "epoch": 0.7908612655944687, + "flos": 19760718960000.0, + "grad_norm": 1.4410962438109587, + "language_loss": 0.78749764, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.86417866, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10009766, + "step": 13154, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.06410754, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06270463, + "balance_loss_mlp": 0.01252286, + "epoch": 0.7909213888471366, + "flos": 21294064350720.0, + "grad_norm": 1.8857519871038082, + "language_loss": 0.70335776, + "learning_rate": 4.411879602612185e-07, + "loss": 0.78009582, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10778809, + "step": 13155, + "time_per_iteration": 2.474088668823242 + }, + { + "auxiliary_loss_clip": 0.06405213, + "auxiliary_loss_mlp": 0.01266856, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01257069, + "epoch": 0.7909815120998046, + "flos": 22535521643520.0, + "grad_norm": 2.510036385951424, + "language_loss": 0.77293575, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.8496564, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09790039, + "step": 13156, + "time_per_iteration": 2.513814926147461 + }, + { + "auxiliary_loss_clip": 0.06403618, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271893, + "balance_loss_mlp": 0.01254185, + "epoch": 0.7910416353524725, + "flos": 26735537623680.0, + "grad_norm": 1.591424288088247, + "language_loss": 0.65432274, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.73099172, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09088135, + "step": 13157, + "time_per_iteration": 2.534609079360962 + }, + { + "auxiliary_loss_clip": 0.0640482, + "auxiliary_loss_mlp": 0.01266464, + "balance_loss_clip": 0.06269716, + "balance_loss_mlp": 0.01256272, + "epoch": 0.7911017586051405, + "flos": 24651804689280.0, + "grad_norm": 2.191693050285661, + "language_loss": 0.7477805, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.82449341, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10192871, + "step": 13158, + "time_per_iteration": 2.5379066467285156 + }, + { + "auxiliary_loss_clip": 0.06396219, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 0.06268845, + "balance_loss_mlp": 0.01255116, + "epoch": 0.7911618818578086, + "flos": 17571076064640.0, + "grad_norm": 1.9112834208400953, + "language_loss": 0.67451692, + "learning_rate": 4.40212412422309e-07, + "loss": 0.75111789, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08764648, + "step": 13159, + "time_per_iteration": 2.464768171310425 + }, + { + "auxiliary_loss_clip": 0.06400043, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06269793, + "balance_loss_mlp": 0.0125645, + "epoch": 0.7912220051104765, + "flos": 16726326727680.0, + "grad_norm": 1.6817860395466344, + "language_loss": 0.67496979, + "learning_rate": 4.399686733077206e-07, + "loss": 0.75163317, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09838867, + "step": 13160, + "time_per_iteration": 2.5563478469848633 + }, + { + "auxiliary_loss_clip": 0.0639656, + "auxiliary_loss_mlp": 0.01260248, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01252225, + "epoch": 0.7912821283631445, + "flos": 13703799847680.0, + "grad_norm": 1.7956028234892243, + "language_loss": 0.73223495, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.80880302, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08007812, + "step": 13161, + "time_per_iteration": 2.449843406677246 + }, + { + "auxiliary_loss_clip": 0.0639775, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 0.06270458, + "balance_loss_mlp": 0.01255142, + "epoch": 0.7913422516158124, + "flos": 23775804979200.0, + "grad_norm": 1.579946795431406, + "language_loss": 0.73348385, + "learning_rate": 4.39481372557418e-07, + "loss": 0.81010681, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09399414, + "step": 13162, + "time_per_iteration": 2.538973093032837 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.01265697, + "balance_loss_clip": 0.06272799, + "balance_loss_mlp": 0.01255326, + "epoch": 0.7914023748684804, + "flos": 19944433036800.0, + "grad_norm": 3.1550813809291127, + "language_loss": 0.72027671, + "learning_rate": 4.392378109401811e-07, + "loss": 0.79701531, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10369873, + "step": 13163, + "time_per_iteration": 2.481580972671509 + }, + { + "auxiliary_loss_clip": 0.06402975, + "auxiliary_loss_mlp": 0.01263483, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01253315, + "epoch": 0.7914624981211483, + "flos": 20601065957760.0, + "grad_norm": 1.7688129227744467, + "language_loss": 0.69559741, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.77226198, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10168457, + "step": 13164, + "time_per_iteration": 3.9441864490509033 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626981, + "balance_loss_mlp": 0.01253058, + "epoch": 0.7915226213738163, + "flos": 21806031997440.0, + "grad_norm": 1.639968913344359, + "language_loss": 0.66723585, + "learning_rate": 4.387508652677177e-07, + "loss": 0.74385864, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09088135, + "step": 13165, + "time_per_iteration": 2.480177164077759 + }, + { + "auxiliary_loss_clip": 0.06395824, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268749, + "balance_loss_mlp": 0.01254887, + "epoch": 0.7915827446264843, + "flos": 16293714497280.0, + "grad_norm": 1.7980788419504534, + "language_loss": 0.72814763, + "learning_rate": 4.385074812309557e-07, + "loss": 0.80474222, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08758545, + "step": 13166, + "time_per_iteration": 2.5405478477478027 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06271509, + "balance_loss_mlp": 0.01256602, + "epoch": 0.7916428678791523, + "flos": 25709673686400.0, + "grad_norm": 1.5950499739045652, + "language_loss": 0.77752012, + "learning_rate": 4.382641564061462e-07, + "loss": 0.85419798, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.10412598, + "step": 13167, + "time_per_iteration": 2.513096332550049 + }, + { + "auxiliary_loss_clip": 0.06400877, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.0627252, + "balance_loss_mlp": 0.01256553, + "epoch": 0.7917029911318202, + "flos": 23885320665600.0, + "grad_norm": 1.5971175695751862, + "language_loss": 0.84140885, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.9180733, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09020996, + "step": 13168, + "time_per_iteration": 2.5276131629943848 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254939, + "epoch": 0.7917631143844882, + "flos": 21651975066240.0, + "grad_norm": 1.4948037375095564, + "language_loss": 0.72659689, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.8032676, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09820557, + "step": 13169, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.06405612, + "auxiliary_loss_mlp": 0.01262617, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01252794, + "epoch": 0.7918232376371561, + "flos": 38883519187200.0, + "grad_norm": 1.931209408255316, + "language_loss": 0.674968, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.75165027, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09820557, + "step": 13170, + "time_per_iteration": 2.632267951965332 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01255108, + "epoch": 0.7918833608898241, + "flos": 20781551652480.0, + "grad_norm": 1.5871676794676228, + "language_loss": 0.70988441, + "learning_rate": 4.372914494109412e-07, + "loss": 0.7865442, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08544922, + "step": 13171, + "time_per_iteration": 2.510680675506592 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.0125855, + "epoch": 0.7919434841424922, + "flos": 33918276994560.0, + "grad_norm": 2.589962482835532, + "language_loss": 0.67366862, + "learning_rate": 4.370484207842553e-07, + "loss": 0.75036865, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09417725, + "step": 13172, + "time_per_iteration": 2.6106696128845215 + }, + { + "auxiliary_loss_clip": 0.06403903, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7920036073951601, + "flos": 21070253295360.0, + "grad_norm": 1.738065699124664, + "language_loss": 0.80093193, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.87761056, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09753418, + "step": 13173, + "time_per_iteration": 3.950551986694336 + }, + { + "auxiliary_loss_clip": 0.06400689, + "auxiliary_loss_mlp": 0.01261307, + "balance_loss_clip": 0.06269704, + "balance_loss_mlp": 0.01252307, + "epoch": 0.7920637306478281, + "flos": 23662138515840.0, + "grad_norm": 1.8426798849917176, + "language_loss": 0.77325201, + "learning_rate": 4.365625413419365e-07, + "loss": 0.84987199, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09002686, + "step": 13174, + "time_per_iteration": 2.591482639312744 + }, + { + "auxiliary_loss_clip": 0.06398596, + "auxiliary_loss_mlp": 0.01261992, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01253219, + "epoch": 0.792123853900496, + "flos": 27202251265920.0, + "grad_norm": 1.5031237737360255, + "language_loss": 0.71669394, + "learning_rate": 4.363196905447297e-07, + "loss": 0.79329979, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08770752, + "step": 13175, + "time_per_iteration": 2.587193489074707 + }, + { + "auxiliary_loss_clip": 0.06401914, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01254601, + "epoch": 0.792183977153164, + "flos": 19104631090560.0, + "grad_norm": 1.9608803410251472, + "language_loss": 0.59982938, + "learning_rate": 4.360768990424364e-07, + "loss": 0.67648464, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09014893, + "step": 13176, + "time_per_iteration": 2.4545774459838867 + }, + { + "auxiliary_loss_clip": 0.06398389, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06270067, + "balance_loss_mlp": 0.01258635, + "epoch": 0.7922441004058319, + "flos": 17134564619520.0, + "grad_norm": 1.8342420107617015, + "language_loss": 0.73352873, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.81019521, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09619141, + "step": 13177, + "time_per_iteration": 3.9278790950775146 + }, + { + "auxiliary_loss_clip": 0.06401221, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01254475, + "epoch": 0.7923042236585, + "flos": 17827395304320.0, + "grad_norm": 1.8523697538025845, + "language_loss": 0.64460981, + "learning_rate": 4.355914939594174e-07, + "loss": 0.72125828, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09155273, + "step": 13178, + "time_per_iteration": 2.464949131011963 + }, + { + "auxiliary_loss_clip": 0.06402718, + "auxiliary_loss_mlp": 0.01261465, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01252804, + "epoch": 0.7923643469111679, + "flos": 29943036391680.0, + "grad_norm": 1.8056668444425423, + "language_loss": 0.69007665, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.76671851, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08660889, + "step": 13179, + "time_per_iteration": 2.560208559036255 + }, + { + "auxiliary_loss_clip": 0.06402154, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06272629, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7924244701638359, + "flos": 22681360874880.0, + "grad_norm": 2.1905203910288105, + "language_loss": 0.74228048, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.81895697, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09332275, + "step": 13180, + "time_per_iteration": 2.5125856399536133 + }, + { + "auxiliary_loss_clip": 0.06402977, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01254729, + "epoch": 0.7924845934165038, + "flos": 17974031149440.0, + "grad_norm": 2.3420456225908524, + "language_loss": 0.81796247, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.89464545, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10595703, + "step": 13181, + "time_per_iteration": 2.4527087211608887 + }, + { + "auxiliary_loss_clip": 0.06399131, + "auxiliary_loss_mlp": 0.01263297, + "balance_loss_clip": 0.06270739, + "balance_loss_mlp": 0.01253791, + "epoch": 0.7925447166691718, + "flos": 23483665319040.0, + "grad_norm": 1.8219768185370055, + "language_loss": 0.7760042, + "learning_rate": 4.346213957372895e-07, + "loss": 0.85262847, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09509277, + "step": 13182, + "time_per_iteration": 4.028662919998169 + }, + { + "auxiliary_loss_clip": 0.06410173, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.01254866, + "epoch": 0.7926048399218397, + "flos": 20453591571840.0, + "grad_norm": 1.6188805399457735, + "language_loss": 0.74277139, + "learning_rate": 4.34379019557056e-07, + "loss": 0.8195321, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 13183, + "time_per_iteration": 2.4738929271698 + }, + { + "auxiliary_loss_clip": 0.06403777, + "auxiliary_loss_mlp": 0.01263216, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7926649631745077, + "flos": 37169184977280.0, + "grad_norm": 1.7084157774544453, + "language_loss": 0.68652374, + "learning_rate": 4.341367027453264e-07, + "loss": 0.76319367, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08978271, + "step": 13184, + "time_per_iteration": 2.6054959297180176 + }, + { + "auxiliary_loss_clip": 0.06404284, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06271448, + "balance_loss_mlp": 0.01254082, + "epoch": 0.7927250864271758, + "flos": 17024168465280.0, + "grad_norm": 1.8074716343378143, + "language_loss": 0.71104252, + "learning_rate": 4.338944453112907e-07, + "loss": 0.78772175, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09558105, + "step": 13185, + "time_per_iteration": 2.457500696182251 + }, + { + "auxiliary_loss_clip": 0.06404824, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.06271466, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7927852096798437, + "flos": 17755041703680.0, + "grad_norm": 2.0425556514381777, + "language_loss": 0.65721595, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.73390174, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 13186, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.06399564, + "auxiliary_loss_mlp": 0.01262578, + "balance_loss_clip": 0.06271927, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7928453329325117, + "flos": 23844636708480.0, + "grad_norm": 1.452369328079203, + "language_loss": 0.77105349, + "learning_rate": 4.334101086130408e-07, + "loss": 0.84767497, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08612061, + "step": 13187, + "time_per_iteration": 2.512676239013672 + }, + { + "auxiliary_loss_clip": 0.06400672, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06270963, + "balance_loss_mlp": 0.01255741, + "epoch": 0.7929054561851796, + "flos": 17460302567040.0, + "grad_norm": 1.9206985573704325, + "language_loss": 0.72777045, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.80442715, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09259033, + "step": 13188, + "time_per_iteration": 2.4961729049682617 + }, + { + "auxiliary_loss_clip": 0.06405029, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271419, + "balance_loss_mlp": 0.01254633, + "epoch": 0.7929655794378476, + "flos": 21987775503360.0, + "grad_norm": 2.0256790948802066, + "language_loss": 0.63584489, + "learning_rate": 4.329260095357725e-07, + "loss": 0.71255124, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10980225, + "step": 13189, + "time_per_iteration": 2.481018304824829 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01258539, + "epoch": 0.7930257026905155, + "flos": 17279523383040.0, + "grad_norm": 2.1940059966398557, + "language_loss": 0.72796714, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.80470747, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09240723, + "step": 13190, + "time_per_iteration": 2.489017963409424 + }, + { + "auxiliary_loss_clip": 0.06397982, + "auxiliary_loss_mlp": 0.01262706, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01254487, + "epoch": 0.7930858259431836, + "flos": 27306693780480.0, + "grad_norm": 2.0481734999626213, + "language_loss": 0.73499632, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.81160319, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08221436, + "step": 13191, + "time_per_iteration": 2.523073196411133 + }, + { + "auxiliary_loss_clip": 0.06402196, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01254915, + "epoch": 0.7931459491958515, + "flos": 19869647667840.0, + "grad_norm": 1.6892778710359044, + "language_loss": 0.69173294, + "learning_rate": 4.322003066198219e-07, + "loss": 0.76839757, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13192, + "time_per_iteration": 2.4932494163513184 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7932060724485195, + "flos": 23153525032320.0, + "grad_norm": 1.5309974551938075, + "language_loss": 0.75287253, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.82951844, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.0914917, + "step": 13193, + "time_per_iteration": 2.4988462924957275 + }, + { + "auxiliary_loss_clip": 0.0639962, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.06269534, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7932661957011874, + "flos": 29942617121280.0, + "grad_norm": 1.4608356167152348, + "language_loss": 0.72191167, + "learning_rate": 4.317168019161741e-07, + "loss": 0.7985428, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.1026001, + "step": 13194, + "time_per_iteration": 2.545863151550293 + }, + { + "auxiliary_loss_clip": 0.06407529, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06271923, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7933263189538554, + "flos": 22564717591680.0, + "grad_norm": 1.9164119447525156, + "language_loss": 0.70693266, + "learning_rate": 4.314751387639517e-07, + "loss": 0.78364033, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 13195, + "time_per_iteration": 2.478484869003296 + }, + { + "auxiliary_loss_clip": 0.06403863, + "auxiliary_loss_mlp": 0.0126619, + "balance_loss_clip": 0.06272461, + "balance_loss_mlp": 0.0125679, + "epoch": 0.7933864422065233, + "flos": 25485317579520.0, + "grad_norm": 1.4419483453830304, + "language_loss": 0.77285999, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.8495605, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09411621, + "step": 13196, + "time_per_iteration": 2.5209035873413086 + }, + { + "auxiliary_loss_clip": 0.06408395, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01261196, + "epoch": 0.7934465654591913, + "flos": 33591490871040.0, + "grad_norm": 1.6476530892648569, + "language_loss": 0.6925202, + "learning_rate": 4.309919909045268e-07, + "loss": 0.76931512, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09899902, + "step": 13197, + "time_per_iteration": 2.6008334159851074 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06270218, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7935066887118594, + "flos": 31440854851200.0, + "grad_norm": 1.7257166200150085, + "language_loss": 0.65332729, + "learning_rate": 4.30750506215646e-07, + "loss": 0.72997743, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09521484, + "step": 13198, + "time_per_iteration": 2.5760626792907715 + }, + { + "auxiliary_loss_clip": 0.06407583, + "auxiliary_loss_mlp": 0.01266914, + "balance_loss_clip": 0.06272698, + "balance_loss_mlp": 0.0125696, + "epoch": 0.7935668119645273, + "flos": 14687638162560.0, + "grad_norm": 1.9381240473938566, + "language_loss": 0.72217059, + "learning_rate": 4.30509081032864e-07, + "loss": 0.79891551, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 13199, + "time_per_iteration": 2.4537320137023926 + }, + { + "auxiliary_loss_clip": 0.06404065, + "auxiliary_loss_mlp": 0.01264064, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01254647, + "epoch": 0.7936269352171953, + "flos": 18010061205120.0, + "grad_norm": 1.8593669017855428, + "language_loss": 0.80699968, + "learning_rate": 4.302677153653349e-07, + "loss": 0.88368094, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09411621, + "step": 13200, + "time_per_iteration": 2.4965553283691406 + }, + { + "auxiliary_loss_clip": 0.06395376, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06269375, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7936870584698632, + "flos": 18886228623360.0, + "grad_norm": 1.593396762237453, + "language_loss": 0.77522814, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.85182142, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09423828, + "step": 13201, + "time_per_iteration": 2.497309446334839 + }, + { + "auxiliary_loss_clip": 0.06399371, + "auxiliary_loss_mlp": 0.01265865, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256149, + "epoch": 0.7937471817225312, + "flos": 23373604581120.0, + "grad_norm": 1.5839447213043625, + "language_loss": 0.67329711, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.74994946, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09716797, + "step": 13202, + "time_per_iteration": 2.5105254650115967 + }, + { + "auxiliary_loss_clip": 0.06399509, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06267376, + "balance_loss_mlp": 0.01257501, + "epoch": 0.7938073049751991, + "flos": 22681025458560.0, + "grad_norm": 1.8682622779044114, + "language_loss": 0.75083208, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.82750034, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09820557, + "step": 13203, + "time_per_iteration": 3.8750996589660645 + }, + { + "auxiliary_loss_clip": 0.06400256, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06268462, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7938674282278672, + "flos": 22857150741120.0, + "grad_norm": 1.6792002510464108, + "language_loss": 0.66683894, + "learning_rate": 4.293028480307643e-07, + "loss": 0.74347234, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09173584, + "step": 13204, + "time_per_iteration": 2.4866726398468018 + }, + { + "auxiliary_loss_clip": 0.0640104, + "auxiliary_loss_mlp": 0.01260862, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01252249, + "epoch": 0.7939275514805351, + "flos": 27019208021760.0, + "grad_norm": 1.3684183312797948, + "language_loss": 0.79726428, + "learning_rate": 4.290617800767438e-07, + "loss": 0.87388325, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08605957, + "step": 13205, + "time_per_iteration": 2.555922746658325 + }, + { + "auxiliary_loss_clip": 0.06398693, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01254596, + "epoch": 0.7939876747332031, + "flos": 21149315222400.0, + "grad_norm": 1.956372656118469, + "language_loss": 0.77988601, + "learning_rate": 4.28820771692858e-07, + "loss": 0.85651195, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09295654, + "step": 13206, + "time_per_iteration": 2.5223846435546875 + }, + { + "auxiliary_loss_clip": 0.06407081, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254638, + "epoch": 0.794047797985871, + "flos": 23294836143360.0, + "grad_norm": 2.5564565777737265, + "language_loss": 0.78640836, + "learning_rate": 4.285798228882456e-07, + "loss": 0.86312377, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0982666, + "step": 13207, + "time_per_iteration": 2.5289721488952637 + }, + { + "auxiliary_loss_clip": 0.06401804, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01256679, + "epoch": 0.794107921238539, + "flos": 24614978019840.0, + "grad_norm": 1.988476360796287, + "language_loss": 0.84176642, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.91844845, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.097229, + "step": 13208, + "time_per_iteration": 2.5182619094848633 + }, + { + "auxiliary_loss_clip": 0.06307561, + "auxiliary_loss_mlp": 0.01252747, + "balance_loss_clip": 0.06251705, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7941680444912069, + "flos": 64114641077760.0, + "grad_norm": 0.7251481470508581, + "language_loss": 0.58347547, + "learning_rate": 4.280981040533875e-07, + "loss": 0.65907854, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.56005859, + "router_z_loss_mlp": 0.00986481, + "step": 13209, + "time_per_iteration": 3.215669631958008 + }, + { + "auxiliary_loss_clip": 0.06411248, + "auxiliary_loss_mlp": 0.01263694, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01253753, + "epoch": 0.794228167743875, + "flos": 24395653157760.0, + "grad_norm": 2.3239436118534544, + "language_loss": 0.63244212, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.70919156, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09936523, + "step": 13210, + "time_per_iteration": 2.509675979614258 + }, + { + "auxiliary_loss_clip": 0.0639855, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06268808, + "balance_loss_mlp": 0.01255135, + "epoch": 0.794288290996543, + "flos": 28520129082240.0, + "grad_norm": 1.5283303816318292, + "language_loss": 0.69651222, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.77314341, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09423828, + "step": 13211, + "time_per_iteration": 2.5609560012817383 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06271889, + "balance_loss_mlp": 0.01253593, + "epoch": 0.7943484142492109, + "flos": 25929333964800.0, + "grad_norm": 1.5675650116890587, + "language_loss": 0.72487032, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.80157924, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10827637, + "step": 13212, + "time_per_iteration": 2.5255634784698486 + }, + { + "auxiliary_loss_clip": 0.06398303, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01254716, + "epoch": 0.7944085375018789, + "flos": 23922147335040.0, + "grad_norm": 1.6395336684596964, + "language_loss": 0.80590618, + "learning_rate": 4.271353817368246e-07, + "loss": 0.88252604, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08972168, + "step": 13213, + "time_per_iteration": 3.9452641010284424 + }, + { + "auxiliary_loss_clip": 0.06409128, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.0125316, + "epoch": 0.7944686607545468, + "flos": 20236153426560.0, + "grad_norm": 2.1556158344518463, + "language_loss": 0.67980099, + "learning_rate": 4.268948502428327e-07, + "loss": 0.75652432, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 13214, + "time_per_iteration": 2.5221662521362305 + }, + { + "auxiliary_loss_clip": 0.06399108, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01256215, + "epoch": 0.7945287840072148, + "flos": 21987440087040.0, + "grad_norm": 1.6557569175319402, + "language_loss": 0.72647429, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.80312216, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09454346, + "step": 13215, + "time_per_iteration": 2.482057809829712 + }, + { + "auxiliary_loss_clip": 0.06396606, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.0125512, + "epoch": 0.7945889072598827, + "flos": 26405229628800.0, + "grad_norm": 1.661805737915831, + "language_loss": 0.79503906, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.87165052, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09429932, + "step": 13216, + "time_per_iteration": 2.5464351177215576 + }, + { + "auxiliary_loss_clip": 0.06404807, + "auxiliary_loss_mlp": 0.01263362, + "balance_loss_clip": 0.06270844, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7946490305125508, + "flos": 25817051093760.0, + "grad_norm": 1.6049687625888907, + "language_loss": 0.73967838, + "learning_rate": 4.261736137111598e-07, + "loss": 0.81636012, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09588623, + "step": 13217, + "time_per_iteration": 3.931478977203369 + }, + { + "auxiliary_loss_clip": 0.06401365, + "auxiliary_loss_mlp": 0.01263665, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7947091537652187, + "flos": 15966425249280.0, + "grad_norm": 1.8482353685704531, + "language_loss": 0.74055278, + "learning_rate": 4.259333208810907e-07, + "loss": 0.81720304, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09591675, + "step": 13218, + "time_per_iteration": 2.4553987979888916 + }, + { + "auxiliary_loss_clip": 0.06410147, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7947692770178867, + "flos": 18593753546880.0, + "grad_norm": 1.8816401972337626, + "language_loss": 0.83479667, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.91153485, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09442139, + "step": 13219, + "time_per_iteration": 2.44667911529541 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01258064, + "epoch": 0.7948294002705546, + "flos": 20447344442880.0, + "grad_norm": 1.667648831846699, + "language_loss": 0.7587316, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.83550465, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10113525, + "step": 13220, + "time_per_iteration": 2.515125036239624 + }, + { + "auxiliary_loss_clip": 0.06413321, + "auxiliary_loss_mlp": 0.01262935, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01253041, + "epoch": 0.7948895235232226, + "flos": 38190436940160.0, + "grad_norm": 1.659539697860105, + "language_loss": 0.72439814, + "learning_rate": 4.252128005599176e-07, + "loss": 0.80116069, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09893799, + "step": 13221, + "time_per_iteration": 4.03423810005188 + }, + { + "auxiliary_loss_clip": 0.06401148, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.0627249, + "balance_loss_mlp": 0.01255052, + "epoch": 0.7949496467758905, + "flos": 15565231100160.0, + "grad_norm": 2.544368910491826, + "language_loss": 0.75068891, + "learning_rate": 4.249727465395634e-07, + "loss": 0.8273443, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 13222, + "time_per_iteration": 2.491516590118408 + }, + { + "auxiliary_loss_clip": 0.06308898, + "auxiliary_loss_mlp": 0.01254396, + "balance_loss_clip": 0.06253184, + "balance_loss_mlp": 0.01253385, + "epoch": 0.7950097700285585, + "flos": 70915864809600.0, + "grad_norm": 0.7838771916152429, + "language_loss": 0.66774839, + "learning_rate": 4.247327522443993e-07, + "loss": 0.74338138, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01010132, + "step": 13223, + "time_per_iteration": 3.031728744506836 + }, + { + "auxiliary_loss_clip": 0.06404258, + "auxiliary_loss_mlp": 0.01264069, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253829, + "epoch": 0.7950698932812266, + "flos": 23958470880000.0, + "grad_norm": 1.6379349696855243, + "language_loss": 0.71398437, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.79066753, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10241699, + "step": 13224, + "time_per_iteration": 2.5175724029541016 + }, + { + "auxiliary_loss_clip": 0.06312153, + "auxiliary_loss_mlp": 0.01251169, + "balance_loss_clip": 0.06256486, + "balance_loss_mlp": 0.01250191, + "epoch": 0.7951300165338945, + "flos": 60300096606720.0, + "grad_norm": 0.6591691135419323, + "language_loss": 0.55062973, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.62626302, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00976562, + "step": 13225, + "time_per_iteration": 3.178450345993042 + }, + { + "auxiliary_loss_clip": 0.06401074, + "auxiliary_loss_mlp": 0.01261342, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01252884, + "epoch": 0.7951901397865625, + "flos": 22825397243520.0, + "grad_norm": 2.154430910035814, + "language_loss": 0.65301824, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.72964251, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08459473, + "step": 13226, + "time_per_iteration": 2.5249226093292236 + }, + { + "auxiliary_loss_clip": 0.06407489, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01256062, + "epoch": 0.7952502630392304, + "flos": 35703748920960.0, + "grad_norm": 2.011551916679729, + "language_loss": 0.70672739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.78345954, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09680176, + "step": 13227, + "time_per_iteration": 2.6486446857452393 + }, + { + "auxiliary_loss_clip": 0.06398386, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06269887, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7953103862918984, + "flos": 25636942742400.0, + "grad_norm": 1.7944937078069616, + "language_loss": 0.69723666, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.77384907, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08691406, + "step": 13228, + "time_per_iteration": 2.6445536613464355 + }, + { + "auxiliary_loss_clip": 0.06402546, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01255564, + "epoch": 0.7953705095445663, + "flos": 40561487925120.0, + "grad_norm": 1.474530595441345, + "language_loss": 0.70921922, + "learning_rate": 4.232940412119095e-07, + "loss": 0.78589594, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09558105, + "step": 13229, + "time_per_iteration": 2.6637799739837646 + }, + { + "auxiliary_loss_clip": 0.0641102, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06274529, + "balance_loss_mlp": 0.01256063, + "epoch": 0.7954306327972344, + "flos": 27644129372160.0, + "grad_norm": 1.7873536766913725, + "language_loss": 0.71492708, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.79169858, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10076904, + "step": 13230, + "time_per_iteration": 2.574101209640503 + }, + { + "auxiliary_loss_clip": 0.06309671, + "auxiliary_loss_mlp": 0.01251481, + "balance_loss_clip": 0.06254265, + "balance_loss_mlp": 0.01250479, + "epoch": 0.7954907560499023, + "flos": 59525505936000.0, + "grad_norm": 0.8781067484442618, + "language_loss": 0.63612801, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.71173954, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100174, + "step": 13231, + "time_per_iteration": 3.143348217010498 + }, + { + "auxiliary_loss_clip": 0.06401561, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01253615, + "epoch": 0.7955508793025703, + "flos": 20126721594240.0, + "grad_norm": 1.6206459895498453, + "language_loss": 0.69870329, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.77535492, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09991455, + "step": 13232, + "time_per_iteration": 2.534808874130249 + }, + { + "auxiliary_loss_clip": 0.06401277, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01253272, + "epoch": 0.7956110025552382, + "flos": 26512607036160.0, + "grad_norm": 1.7341819887914223, + "language_loss": 0.78396481, + "learning_rate": 4.223360961792952e-07, + "loss": 0.860605, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09466553, + "step": 13233, + "time_per_iteration": 2.5741093158721924 + }, + { + "auxiliary_loss_clip": 0.06403272, + "auxiliary_loss_mlp": 0.01265137, + "balance_loss_clip": 0.06270528, + "balance_loss_mlp": 0.01255803, + "epoch": 0.7956711258079062, + "flos": 22572138677760.0, + "grad_norm": 1.88878875282178, + "language_loss": 0.78960502, + "learning_rate": 4.220967594613769e-07, + "loss": 0.86628914, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09332275, + "step": 13234, + "time_per_iteration": 2.5267715454101562 + }, + { + "auxiliary_loss_clip": 0.064052, + "auxiliary_loss_mlp": 0.01262721, + "balance_loss_clip": 0.06274294, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7957312490605741, + "flos": 17383882043520.0, + "grad_norm": 2.969852188387872, + "language_loss": 0.70354939, + "learning_rate": 4.218574825777077e-07, + "loss": 0.78022861, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08618164, + "step": 13235, + "time_per_iteration": 2.472926616668701 + }, + { + "auxiliary_loss_clip": 0.0640211, + "auxiliary_loss_mlp": 0.0126658, + "balance_loss_clip": 0.06269485, + "balance_loss_mlp": 0.012564, + "epoch": 0.7957913723132422, + "flos": 22497898360320.0, + "grad_norm": 3.326054048453629, + "language_loss": 0.68091619, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.75760305, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10174561, + "step": 13236, + "time_per_iteration": 2.5275604724884033 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06272059, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7958514955659101, + "flos": 22644701913600.0, + "grad_norm": 1.5838694899419836, + "language_loss": 0.75233686, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.82900631, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08874512, + "step": 13237, + "time_per_iteration": 2.5152275562286377 + }, + { + "auxiliary_loss_clip": 0.06404451, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01255788, + "epoch": 0.7959116188185781, + "flos": 20710497790080.0, + "grad_norm": 1.909101485463629, + "language_loss": 0.71454495, + "learning_rate": 4.211400110229175e-07, + "loss": 0.79124558, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0982666, + "step": 13238, + "time_per_iteration": 2.5149312019348145 + }, + { + "auxiliary_loss_clip": 0.0640163, + "auxiliary_loss_mlp": 0.01263785, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7959717420712461, + "flos": 19030474627200.0, + "grad_norm": 2.2119566924128584, + "language_loss": 0.74293685, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.81959099, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0894165, + "step": 13239, + "time_per_iteration": 2.4692234992980957 + }, + { + "auxiliary_loss_clip": 0.06405409, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.01254314, + "epoch": 0.796031865323914, + "flos": 26363371714560.0, + "grad_norm": 4.594953960637003, + "language_loss": 0.69371974, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.77041459, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09765625, + "step": 13240, + "time_per_iteration": 2.5826754570007324 + }, + { + "auxiliary_loss_clip": 0.06308684, + "auxiliary_loss_mlp": 0.01255726, + "balance_loss_clip": 0.06253344, + "balance_loss_mlp": 0.01254768, + "epoch": 0.796091988576582, + "flos": 62087119833600.0, + "grad_norm": 0.8806225517212096, + "language_loss": 0.5847106, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.66035473, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00956726, + "step": 13241, + "time_per_iteration": 2.9126768112182617 + }, + { + "auxiliary_loss_clip": 0.06403052, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 0.06270704, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7961521118292499, + "flos": 39029442272640.0, + "grad_norm": 2.127726994888291, + "language_loss": 0.64769882, + "learning_rate": 4.201842205128772e-07, + "loss": 0.72438073, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09265137, + "step": 13242, + "time_per_iteration": 2.635535717010498 + }, + { + "auxiliary_loss_clip": 0.06402293, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06268795, + "balance_loss_mlp": 0.01257795, + "epoch": 0.796212235081918, + "flos": 21769373036160.0, + "grad_norm": 2.0186777582920024, + "language_loss": 0.76239574, + "learning_rate": 4.199454226296526e-07, + "loss": 0.83909744, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10083008, + "step": 13243, + "time_per_iteration": 3.8618268966674805 + }, + { + "auxiliary_loss_clip": 0.06402823, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06270328, + "balance_loss_mlp": 0.01254605, + "epoch": 0.7962723583345859, + "flos": 21185261424000.0, + "grad_norm": 1.6364985939961718, + "language_loss": 0.79507935, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8717519, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09832764, + "step": 13244, + "time_per_iteration": 2.51326322555542 + }, + { + "auxiliary_loss_clip": 0.06406613, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06270078, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7963324815872539, + "flos": 17134313057280.0, + "grad_norm": 1.908775351263593, + "language_loss": 0.68666172, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.76337141, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10046387, + "step": 13245, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265244, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01255749, + "epoch": 0.7963926048399218, + "flos": 21403873526400.0, + "grad_norm": 1.7297162444203578, + "language_loss": 0.79002523, + "learning_rate": 4.192293885111549e-07, + "loss": 0.86671984, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0949707, + "step": 13246, + "time_per_iteration": 2.4906105995178223 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01254073, + "epoch": 0.7964527280925898, + "flos": 25189907610240.0, + "grad_norm": 1.8120227230539676, + "language_loss": 0.66180718, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.73849887, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10125732, + "step": 13247, + "time_per_iteration": 2.534837484359741 + }, + { + "auxiliary_loss_clip": 0.0639786, + "auxiliary_loss_mlp": 0.01263181, + "balance_loss_clip": 0.06269214, + "balance_loss_mlp": 0.0125458, + "epoch": 0.7965128513452577, + "flos": 27023149163520.0, + "grad_norm": 1.7943633437832778, + "language_loss": 0.71878839, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.79539883, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08599854, + "step": 13248, + "time_per_iteration": 2.5318338871002197 + }, + { + "auxiliary_loss_clip": 0.06404188, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06268889, + "balance_loss_mlp": 0.01255378, + "epoch": 0.7965729745979258, + "flos": 24425436084480.0, + "grad_norm": 2.290940910554294, + "language_loss": 0.76236963, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.83906335, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0980835, + "step": 13249, + "time_per_iteration": 2.5285370349884033 + }, + { + "auxiliary_loss_clip": 0.06399461, + "auxiliary_loss_mlp": 0.01262002, + "balance_loss_clip": 0.06270114, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7966330978505937, + "flos": 18845838155520.0, + "grad_norm": 1.9207763897520123, + "language_loss": 0.61375982, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.69037437, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09307861, + "step": 13250, + "time_per_iteration": 2.4775562286376953 + }, + { + "auxiliary_loss_clip": 0.06399567, + "auxiliary_loss_mlp": 0.01263631, + "balance_loss_clip": 0.06269053, + "balance_loss_mlp": 0.01253982, + "epoch": 0.7966932211032617, + "flos": 13157437299840.0, + "grad_norm": 2.289000304094375, + "language_loss": 0.72802746, + "learning_rate": 4.180371972938206e-07, + "loss": 0.80465943, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09655762, + "step": 13251, + "time_per_iteration": 2.5408740043640137 + }, + { + "auxiliary_loss_clip": 0.06409312, + "auxiliary_loss_mlp": 0.01265133, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01254654, + "epoch": 0.7967533443559297, + "flos": 23956290673920.0, + "grad_norm": 1.9875673178726758, + "language_loss": 0.73053861, + "learning_rate": 4.177989389787624e-07, + "loss": 0.80728304, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1048584, + "step": 13252, + "time_per_iteration": 3.9433846473693848 + }, + { + "auxiliary_loss_clip": 0.06396703, + "auxiliary_loss_mlp": 0.01266191, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01256886, + "epoch": 0.7968134676085976, + "flos": 30375984038400.0, + "grad_norm": 1.8369149171198353, + "language_loss": 0.66266763, + "learning_rate": 4.175607406609278e-07, + "loss": 0.73929667, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09307861, + "step": 13253, + "time_per_iteration": 2.5753839015960693 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01264505, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7968735908612656, + "flos": 23081590702080.0, + "grad_norm": 1.5642785207566534, + "language_loss": 0.67620826, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.75287944, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10223389, + "step": 13254, + "time_per_iteration": 2.587885856628418 + }, + { + "auxiliary_loss_clip": 0.0640402, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06271625, + "balance_loss_mlp": 0.01258467, + "epoch": 0.7969337141139335, + "flos": 23588275541760.0, + "grad_norm": 2.088422762405943, + "language_loss": 0.69607329, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.77279007, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09191895, + "step": 13255, + "time_per_iteration": 2.5366928577423096 + }, + { + "auxiliary_loss_clip": 0.06399679, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.0626971, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7969938373666016, + "flos": 19762018698240.0, + "grad_norm": 1.6762095197917861, + "language_loss": 0.79241788, + "learning_rate": 4.168465057810733e-07, + "loss": 0.86905241, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09143066, + "step": 13256, + "time_per_iteration": 3.9199607372283936 + }, + { + "auxiliary_loss_clip": 0.06405733, + "auxiliary_loss_mlp": 0.01263678, + "balance_loss_clip": 0.06272037, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7970539606192695, + "flos": 24140969072640.0, + "grad_norm": 1.817522476863435, + "language_loss": 0.66469562, + "learning_rate": 4.166085475424315e-07, + "loss": 0.74138975, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09484863, + "step": 13257, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.06411573, + "auxiliary_loss_mlp": 0.01262722, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01252977, + "epoch": 0.7971140838719375, + "flos": 17974576200960.0, + "grad_norm": 2.293552355321388, + "language_loss": 0.721138, + "learning_rate": 4.163706493461523e-07, + "loss": 0.79788101, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.09753418, + "step": 13258, + "time_per_iteration": 2.466635227203369 + }, + { + "auxiliary_loss_clip": 0.06404628, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06270341, + "balance_loss_mlp": 0.01257439, + "epoch": 0.7971742071246054, + "flos": 19175181828480.0, + "grad_norm": 1.7912391212808825, + "language_loss": 0.69168359, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.76841164, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.1072998, + "step": 13259, + "time_per_iteration": 2.5077145099639893 + }, + { + "auxiliary_loss_clip": 0.06399243, + "auxiliary_loss_mlp": 0.01264467, + "balance_loss_clip": 0.06270258, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7972343303772734, + "flos": 27133335682560.0, + "grad_norm": 1.8522631827723854, + "language_loss": 0.73832285, + "learning_rate": 4.158950331167641e-07, + "loss": 0.81495994, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09002686, + "step": 13260, + "time_per_iteration": 2.542802333831787 + }, + { + "auxiliary_loss_clip": 0.0640289, + "auxiliary_loss_mlp": 0.01260989, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01251559, + "epoch": 0.7972944536299413, + "flos": 21003056720640.0, + "grad_norm": 1.7849042953427723, + "language_loss": 0.78480017, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.86143899, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09423828, + "step": 13261, + "time_per_iteration": 3.9328079223632812 + }, + { + "auxiliary_loss_clip": 0.06398886, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7973545768826094, + "flos": 21586455573120.0, + "grad_norm": 1.5738375071778383, + "language_loss": 0.76378083, + "learning_rate": 4.154196571650501e-07, + "loss": 0.84039807, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.081604, + "step": 13262, + "time_per_iteration": 2.563962936401367 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7974147001352773, + "flos": 20564826266880.0, + "grad_norm": 2.3741111295907626, + "language_loss": 0.70724112, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7839784, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11126709, + "step": 13263, + "time_per_iteration": 2.4744935035705566 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7974748233879453, + "flos": 21003224428800.0, + "grad_norm": 1.8041636283725375, + "language_loss": 0.71434695, + "learning_rate": 4.149445215631153e-07, + "loss": 0.79112011, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11297607, + "step": 13264, + "time_per_iteration": 2.485276460647583 + }, + { + "auxiliary_loss_clip": 0.06398866, + "auxiliary_loss_mlp": 0.0126452, + "balance_loss_clip": 0.06270253, + "balance_loss_mlp": 0.01256187, + "epoch": 0.7975349466406133, + "flos": 22571803261440.0, + "grad_norm": 1.6689770527063423, + "language_loss": 0.77659208, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.85322595, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08331299, + "step": 13265, + "time_per_iteration": 2.50765061378479 + }, + { + "auxiliary_loss_clip": 0.06407373, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06273501, + "balance_loss_mlp": 0.01259609, + "epoch": 0.7975950698932812, + "flos": 21696013186560.0, + "grad_norm": 1.8504698542540234, + "language_loss": 0.76059192, + "learning_rate": 4.144696263830285e-07, + "loss": 0.83736098, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0993042, + "step": 13266, + "time_per_iteration": 2.5207157135009766 + }, + { + "auxiliary_loss_clip": 0.06402943, + "auxiliary_loss_mlp": 0.01264296, + "balance_loss_clip": 0.06272074, + "balance_loss_mlp": 0.01255183, + "epoch": 0.7976551931459492, + "flos": 19609806556800.0, + "grad_norm": 1.6112289211308914, + "language_loss": 0.83747739, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.91414976, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 13267, + "time_per_iteration": 2.523797035217285 + }, + { + "auxiliary_loss_clip": 0.06403189, + "auxiliary_loss_mlp": 0.01266238, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7977153163986171, + "flos": 21693749126400.0, + "grad_norm": 1.4537624263579578, + "language_loss": 0.76656401, + "learning_rate": 4.139949716968223e-07, + "loss": 0.84325826, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09503174, + "step": 13268, + "time_per_iteration": 2.50384783744812 + }, + { + "auxiliary_loss_clip": 0.06404118, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.06272426, + "balance_loss_mlp": 0.01256574, + "epoch": 0.7977754396512852, + "flos": 23483455683840.0, + "grad_norm": 1.5523298062662978, + "language_loss": 0.78092402, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.85762441, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09344482, + "step": 13269, + "time_per_iteration": 2.544590473175049 + }, + { + "auxiliary_loss_clip": 0.06399094, + "auxiliary_loss_mlp": 0.01261853, + "balance_loss_clip": 0.06270756, + "balance_loss_mlp": 0.01253043, + "epoch": 0.7978355629039531, + "flos": 22388718090240.0, + "grad_norm": 1.6478961708757416, + "language_loss": 0.82291299, + "learning_rate": 4.135205575764922e-07, + "loss": 0.89952242, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08813477, + "step": 13270, + "time_per_iteration": 2.4902870655059814 + }, + { + "auxiliary_loss_clip": 0.06401956, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.01255331, + "epoch": 0.7978956861566211, + "flos": 20272518898560.0, + "grad_norm": 2.1156464454549297, + "language_loss": 0.59938061, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.67604721, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09362793, + "step": 13271, + "time_per_iteration": 2.5591602325439453 + }, + { + "auxiliary_loss_clip": 0.06410769, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256037, + "epoch": 0.797955809409289, + "flos": 28120192744320.0, + "grad_norm": 1.4386088451054988, + "language_loss": 0.73758554, + "learning_rate": 4.130463840939975e-07, + "loss": 0.81434965, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09606934, + "step": 13272, + "time_per_iteration": 2.570200204849243 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06270777, + "balance_loss_mlp": 0.012558, + "epoch": 0.798015932661957, + "flos": 15564979537920.0, + "grad_norm": 2.1482391429317067, + "language_loss": 0.71803975, + "learning_rate": 4.128093876144161e-07, + "loss": 0.79471296, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09625244, + "step": 13273, + "time_per_iteration": 2.4748198986053467 + }, + { + "auxiliary_loss_clip": 0.0640889, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06274156, + "balance_loss_mlp": 0.012539, + "epoch": 0.7980760559146249, + "flos": 23957967755520.0, + "grad_norm": 1.5725586223842085, + "language_loss": 0.75832808, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.83505827, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10241699, + "step": 13274, + "time_per_iteration": 2.55397629737854 + }, + { + "auxiliary_loss_clip": 0.06394248, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06268619, + "balance_loss_mlp": 0.01253622, + "epoch": 0.798136179167293, + "flos": 28045617010560.0, + "grad_norm": 1.334626175327206, + "language_loss": 0.77871919, + "learning_rate": 4.12335575223518e-07, + "loss": 0.85528684, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08892822, + "step": 13275, + "time_per_iteration": 2.594181776046753 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01255189, + "epoch": 0.7981963024199609, + "flos": 35992157074560.0, + "grad_norm": 2.855483452086949, + "language_loss": 0.64085776, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.71757954, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10784912, + "step": 13276, + "time_per_iteration": 2.5930356979370117 + }, + { + "auxiliary_loss_clip": 0.06401938, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.06273316, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7982564256726289, + "flos": 25892004170880.0, + "grad_norm": 1.5904474642505515, + "language_loss": 0.61038435, + "learning_rate": 4.118620036501945e-07, + "loss": 0.68708122, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09692383, + "step": 13277, + "time_per_iteration": 2.5839786529541016 + }, + { + "auxiliary_loss_clip": 0.06411898, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.012561, + "epoch": 0.7983165489252969, + "flos": 25746248793600.0, + "grad_norm": 1.8327445572983765, + "language_loss": 0.79849744, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.87527025, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09283447, + "step": 13278, + "time_per_iteration": 2.5260982513427734 + }, + { + "auxiliary_loss_clip": 0.06405683, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01253518, + "epoch": 0.7983766721779648, + "flos": 21914667216000.0, + "grad_norm": 1.9889744564125917, + "language_loss": 0.63581717, + "learning_rate": 4.113886729662768e-07, + "loss": 0.71250772, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09844971, + "step": 13279, + "time_per_iteration": 2.5182244777679443 + }, + { + "auxiliary_loss_clip": 0.06394448, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01257925, + "epoch": 0.7984367954306328, + "flos": 29354480513280.0, + "grad_norm": 1.5743045282106698, + "language_loss": 0.71176022, + "learning_rate": 4.111520979802825e-07, + "loss": 0.78836685, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08282471, + "step": 13280, + "time_per_iteration": 2.575366258621216 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01266632, + "balance_loss_clip": 0.06273544, + "balance_loss_mlp": 0.01257149, + "epoch": 0.7984969186833007, + "flos": 31365775992960.0, + "grad_norm": 1.6558048262309357, + "language_loss": 0.62836027, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.70511883, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.0947876, + "step": 13281, + "time_per_iteration": 2.624361276626587 + }, + { + "auxiliary_loss_clip": 0.06407207, + "auxiliary_loss_mlp": 0.01265261, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01254807, + "epoch": 0.7985570419359688, + "flos": 24319232634240.0, + "grad_norm": 1.8833916192642874, + "language_loss": 0.79982495, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8765496, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10449219, + "step": 13282, + "time_per_iteration": 2.522733211517334 + }, + { + "auxiliary_loss_clip": 0.06405975, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01256339, + "epoch": 0.7986171651886367, + "flos": 15747687365760.0, + "grad_norm": 2.26715299858664, + "language_loss": 0.72620189, + "learning_rate": 4.10442734553802e-07, + "loss": 0.8029148, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.08972168, + "step": 13283, + "time_per_iteration": 3.8687400817871094 + }, + { + "auxiliary_loss_clip": 0.06398675, + "auxiliary_loss_mlp": 0.01262054, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01253072, + "epoch": 0.7986772884413047, + "flos": 11624175763200.0, + "grad_norm": 2.1421699909472474, + "language_loss": 0.73992294, + "learning_rate": 4.102064006186967e-07, + "loss": 0.81653023, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08984375, + "step": 13284, + "time_per_iteration": 2.464895486831665 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01263764, + "balance_loss_clip": 0.06270264, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7987374116939726, + "flos": 22097626606080.0, + "grad_norm": 1.6639585561146113, + "language_loss": 0.70836139, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.78501368, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08874512, + "step": 13285, + "time_per_iteration": 2.5129339694976807 + }, + { + "auxiliary_loss_clip": 0.06401065, + "auxiliary_loss_mlp": 0.01262275, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.01252982, + "epoch": 0.7987975349466406, + "flos": 17895807763200.0, + "grad_norm": 1.6553012923822499, + "language_loss": 0.73934168, + "learning_rate": 4.097339136128437e-07, + "loss": 0.81597507, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09295654, + "step": 13286, + "time_per_iteration": 2.4993607997894287 + }, + { + "auxiliary_loss_clip": 0.0640146, + "auxiliary_loss_mlp": 0.01262205, + "balance_loss_clip": 0.06270432, + "balance_loss_mlp": 0.01252859, + "epoch": 0.7988576581993085, + "flos": 19725359736960.0, + "grad_norm": 1.5989615606819938, + "language_loss": 0.75195587, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.82859248, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13287, + "time_per_iteration": 2.498539447784424 + }, + { + "auxiliary_loss_clip": 0.0640296, + "auxiliary_loss_mlp": 0.01263938, + "balance_loss_clip": 0.06271001, + "balance_loss_mlp": 0.01254598, + "epoch": 0.7989177814519766, + "flos": 28043604512640.0, + "grad_norm": 1.4032913596903045, + "language_loss": 0.62071377, + "learning_rate": 4.092616678191863e-07, + "loss": 0.69738275, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09338379, + "step": 13288, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.06401485, + "auxiliary_loss_mlp": 0.01264116, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01255122, + "epoch": 0.7989779047046445, + "flos": 28877662454400.0, + "grad_norm": 2.6038900989096705, + "language_loss": 0.70626175, + "learning_rate": 4.090256353993169e-07, + "loss": 0.78291774, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08996582, + "step": 13289, + "time_per_iteration": 2.5535638332366943 + }, + { + "auxiliary_loss_clip": 0.06396915, + "auxiliary_loss_mlp": 0.01263033, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01253771, + "epoch": 0.7990380279573125, + "flos": 18192769032960.0, + "grad_norm": 2.213156856555218, + "language_loss": 0.63382244, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.71042198, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09259033, + "step": 13290, + "time_per_iteration": 2.4844484329223633 + }, + { + "auxiliary_loss_clip": 0.06406233, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256458, + "epoch": 0.7990981512099805, + "flos": 20885113699200.0, + "grad_norm": 1.8461892272796565, + "language_loss": 0.71634483, + "learning_rate": 4.08553751558248e-07, + "loss": 0.79307342, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10168457, + "step": 13291, + "time_per_iteration": 2.526987314224243 + }, + { + "auxiliary_loss_clip": 0.06397383, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01255107, + "epoch": 0.7991582744626484, + "flos": 26106381642240.0, + "grad_norm": 1.5963617377533177, + "language_loss": 0.63653862, + "learning_rate": 4.083179001549422e-07, + "loss": 0.71315503, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09161377, + "step": 13292, + "time_per_iteration": 3.920006513595581 + }, + { + "auxiliary_loss_clip": 0.06398708, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06267934, + "balance_loss_mlp": 0.01254733, + "epoch": 0.7992183977153164, + "flos": 35304106072320.0, + "grad_norm": 1.797759826858067, + "language_loss": 0.56198502, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.63861531, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0958252, + "step": 13293, + "time_per_iteration": 2.625302314758301 + }, + { + "auxiliary_loss_clip": 0.06404014, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272873, + "balance_loss_mlp": 0.01255284, + "epoch": 0.7992785209679844, + "flos": 51863294632320.0, + "grad_norm": 2.2763572451506944, + "language_loss": 0.71341664, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.79010391, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09429932, + "step": 13294, + "time_per_iteration": 2.76823353767395 + }, + { + "auxiliary_loss_clip": 0.06401891, + "auxiliary_loss_mlp": 0.01262732, + "balance_loss_clip": 0.06269768, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7993386442206524, + "flos": 22571719407360.0, + "grad_norm": 1.8830431252935182, + "language_loss": 0.72672385, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.80337006, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10107422, + "step": 13295, + "time_per_iteration": 3.9486594200134277 + }, + { + "auxiliary_loss_clip": 0.06399785, + "auxiliary_loss_mlp": 0.01270961, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01262367, + "epoch": 0.7993987674733203, + "flos": 18805112271360.0, + "grad_norm": 1.8035732738246322, + "language_loss": 0.76883113, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.84553862, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.0859375, + "step": 13296, + "time_per_iteration": 2.5124893188476562 + }, + { + "auxiliary_loss_clip": 0.06317963, + "auxiliary_loss_mlp": 0.01251058, + "balance_loss_clip": 0.06262526, + "balance_loss_mlp": 0.0125003, + "epoch": 0.7994588907259883, + "flos": 69443747625600.0, + "grad_norm": 0.6778750345647286, + "language_loss": 0.60765332, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.68334353, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13297, + "time_per_iteration": 3.258441209793091 + }, + { + "auxiliary_loss_clip": 0.06401801, + "auxiliary_loss_mlp": 0.01265804, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01256398, + "epoch": 0.7995190139786562, + "flos": 13485439307520.0, + "grad_norm": 2.2443800001049645, + "language_loss": 0.70575351, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.78242958, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09399414, + "step": 13298, + "time_per_iteration": 2.4816195964813232 + }, + { + "auxiliary_loss_clip": 0.06406148, + "auxiliary_loss_mlp": 0.01264059, + "balance_loss_clip": 0.06270477, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7995791372313242, + "flos": 21659270371200.0, + "grad_norm": 1.914137701086928, + "language_loss": 0.76235688, + "learning_rate": 4.066686308212037e-07, + "loss": 0.839059, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10498047, + "step": 13299, + "time_per_iteration": 2.491387128829956 + }, + { + "auxiliary_loss_clip": 0.06396549, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06268974, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7996392604839921, + "flos": 26075382831360.0, + "grad_norm": 1.6376768390824803, + "language_loss": 0.77644742, + "learning_rate": 4.064332625220828e-07, + "loss": 0.85306615, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08544922, + "step": 13300, + "time_per_iteration": 3.941457986831665 + }, + { + "auxiliary_loss_clip": 0.06406416, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01255473, + "epoch": 0.7996993837366602, + "flos": 24613594427520.0, + "grad_norm": 1.7813390500304356, + "language_loss": 0.64086711, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.71757841, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09228516, + "step": 13301, + "time_per_iteration": 2.5052661895751953 + }, + { + "auxiliary_loss_clip": 0.06398593, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01255285, + "epoch": 0.7997595069893281, + "flos": 20997690059520.0, + "grad_norm": 1.5469395807720157, + "language_loss": 0.71982718, + "learning_rate": 4.059627072173928e-07, + "loss": 0.79645514, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08911133, + "step": 13302, + "time_per_iteration": 2.489457368850708 + }, + { + "auxiliary_loss_clip": 0.06408885, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.0627289, + "balance_loss_mlp": 0.01255967, + "epoch": 0.7998196302419961, + "flos": 24433528003200.0, + "grad_norm": 1.7910708704236549, + "language_loss": 0.83398485, + "learning_rate": 4.057275202296684e-07, + "loss": 0.91072816, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 13303, + "time_per_iteration": 2.5182011127471924 + }, + { + "auxiliary_loss_clip": 0.06399085, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271808, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7998797534946641, + "flos": 30272715480960.0, + "grad_norm": 1.579021550921295, + "language_loss": 0.58929861, + "learning_rate": 4.054923936969166e-07, + "loss": 0.66592586, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09210205, + "step": 13304, + "time_per_iteration": 2.584608316421509 + }, + { + "auxiliary_loss_clip": 0.06406042, + "auxiliary_loss_mlp": 0.01261222, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.0125202, + "epoch": 0.799939876747332, + "flos": 23520785477760.0, + "grad_norm": 1.5411018505136698, + "language_loss": 0.68989539, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.76656806, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09210205, + "step": 13305, + "time_per_iteration": 2.495842218399048 + }, + { + "auxiliary_loss_clip": 0.06398628, + "auxiliary_loss_mlp": 0.0126253, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01254028, + "epoch": 0.8, + "flos": 19324207514880.0, + "grad_norm": 1.5483879862096703, + "language_loss": 0.6919629, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.76857448, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08493042, + "step": 13306, + "time_per_iteration": 2.4815428256988525 + }, + { + "auxiliary_loss_clip": 0.06404909, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.01254813, + "epoch": 0.800060123252668, + "flos": 32420039264640.0, + "grad_norm": 1.3465720910639238, + "language_loss": 0.69548619, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.77217495, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09155273, + "step": 13307, + "time_per_iteration": 2.5902602672576904 + }, + { + "auxiliary_loss_clip": 0.06402986, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01255151, + "epoch": 0.800120246505336, + "flos": 20016702783360.0, + "grad_norm": 1.932839582685843, + "language_loss": 0.77209872, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.84877324, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09313965, + "step": 13308, + "time_per_iteration": 2.5227887630462646 + }, + { + "auxiliary_loss_clip": 0.06406727, + "auxiliary_loss_mlp": 0.01264112, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.0125395, + "epoch": 0.8001803697580039, + "flos": 31876318120320.0, + "grad_norm": 1.398024400765408, + "language_loss": 0.78861815, + "learning_rate": 4.0431766816972e-07, + "loss": 0.86532652, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10174561, + "step": 13309, + "time_per_iteration": 2.694766044616699 + }, + { + "auxiliary_loss_clip": 0.06317627, + "auxiliary_loss_mlp": 0.01253959, + "balance_loss_clip": 0.06261955, + "balance_loss_mlp": 0.01252847, + "epoch": 0.8002404930106719, + "flos": 63411496341120.0, + "grad_norm": 0.9515368521242993, + "language_loss": 0.64834917, + "learning_rate": 4.040829045539571e-07, + "loss": 0.72406501, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01114655, + "step": 13310, + "time_per_iteration": 3.0877020359039307 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06276361, + "balance_loss_mlp": 0.01258257, + "epoch": 0.8003006162633398, + "flos": 27862951109760.0, + "grad_norm": 1.8032558576679762, + "language_loss": 0.83180302, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.90857077, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0949707, + "step": 13311, + "time_per_iteration": 2.555682897567749 + }, + { + "auxiliary_loss_clip": 0.06402326, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01255944, + "epoch": 0.8003607395160078, + "flos": 18229218359040.0, + "grad_norm": 1.9156158973382509, + "language_loss": 0.6619851, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.73865891, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09118652, + "step": 13312, + "time_per_iteration": 2.4853975772857666 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 287185980, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1198325594259456e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/training_args.bin b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-13312/zero_to_fp32.py b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-13312/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/added_tokens.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/config.json new file mode 100644 index 0000000000000000000000000000000000000000..da3b0c65c0ef1d3a1c68ffdd7565996d4dd85a33 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/generation_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/latest b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/latest new file mode 100644 index 0000000000000000000000000000000000000000..e4087b037c4d90a88f08b57160ddc65e74a0c271 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/latest @@ -0,0 +1 @@ +global_step16632 \ No newline at end of file diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..798443627f1263899c866256fd59ce2fbe0df56e --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfb15cbd71963baf99565fd4d9fd14e98800e52fdf006178b86ce47ea734594 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73d57713990bf2eb68dce8f3c9b432b6de492551 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362553433cdf9d93f9a4467259536767eb6e05ac7c09ef282b394d2a64ba8130 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e75f96a46fc6b9dddc4cdc76f9b3a70117d39c36 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ca63b9e9d9ded2c4a3a328f34c7a554c0b0ad5d0ef0a874d9fb47cf24df3bd +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_0.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9231f69f5fd461899867106a669ce247e70c72c2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f23d807f0e704f4ca79670a6631cbff43189cf7f8ff4e1fc0a4330e636a798 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_1.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..19fe2dcc766f192ea5de79cec4dcff17172a10f7 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37f92f6aea5386e84d2d64a1a25d6ef96a10b3bbbfe63627981604c8934076 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_2.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfe492519c6b79b07a8d68b98c5f3d0c073667aa --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667ebf727735115f00a6bdbe090344e9846c726d11bb555cdc201c415f27ad85 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_3.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d42ad13e30851fdbd1d8801738a4106a9ce8b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d306f8c511cba8a225e3b723c5fa79d8a6ecc922f834da914ff0780c78b1fc +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer.model b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/trainer_state.json b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f9a3ff8f0538213f63c06f30805ee6f68e7c9bf --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/trainer_state.json @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + }, + { + "auxiliary_loss_clip": 0.06559211, + "auxiliary_loss_mlp": 0.01288822, + "balance_loss_clip": 0.06314486, + "balance_loss_mlp": 0.01265338, + "epoch": 0.20015030813166992, + "flos": 28119186495360.0, + "grad_norm": 1.7739956363125764, + "language_loss": 0.6938678, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.77234817, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23474121, + "step": 3329, + "time_per_iteration": 2.790318489074707 + }, + { + "auxiliary_loss_clip": 0.06562928, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 0.06310034, + "balance_loss_mlp": 0.01263396, + "epoch": 0.2002104313843379, + "flos": 26074250801280.0, + "grad_norm": 1.6222638892170962, + "language_loss": 0.81793886, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.896456, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.25415039, + "step": 3330, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.06561245, + "auxiliary_loss_mlp": 0.01293061, + "balance_loss_clip": 0.06310615, + "balance_loss_mlp": 0.01268874, + "epoch": 0.20027055463700585, + "flos": 22973332826880.0, + "grad_norm": 3.6220429921180877, + "language_loss": 0.7808395, + "learning_rate": 3.703502390349417e-06, + "loss": 0.85938263, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.24194336, + "step": 3331, + "time_per_iteration": 4.07051157951355 + }, + { + "auxiliary_loss_clip": 0.06564473, + "auxiliary_loss_mlp": 0.01290798, + "balance_loss_clip": 0.06310149, + "balance_loss_mlp": 0.01266014, + "epoch": 0.20033067788967382, + "flos": 17171433216000.0, + "grad_norm": 1.7477664730796658, + "language_loss": 0.79863441, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24780273, + "step": 3332, + "time_per_iteration": 2.5321452617645264 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01253551, + "epoch": 0.2003908011423418, + "flos": 60842476085760.0, + "grad_norm": 0.9021189232739572, + "language_loss": 0.61913729, + "learning_rate": 3.703094147020776e-06, + "loss": 0.69584543, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08105469, + "step": 3333, + "time_per_iteration": 4.713933706283569 + }, + { + "auxiliary_loss_clip": 0.06552575, + "auxiliary_loss_mlp": 0.0128469, + "balance_loss_clip": 0.06299093, + "balance_loss_mlp": 0.0126123, + "epoch": 0.20045092439500978, + "flos": 24212987256960.0, + "grad_norm": 1.8847951547254278, + "language_loss": 0.82181144, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.90018404, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.23461914, + "step": 3334, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.0654801, + "auxiliary_loss_mlp": 0.01282898, + "balance_loss_clip": 0.06293298, + "balance_loss_mlp": 0.01256874, + "epoch": 0.20051104764767774, + "flos": 29395290251520.0, + "grad_norm": 2.256626356817437, + "language_loss": 0.7536357, + "learning_rate": 3.702685645366134e-06, + "loss": 0.83194482, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26049805, + "step": 3335, + "time_per_iteration": 2.5860390663146973 + }, + { + "auxiliary_loss_clip": 0.06552432, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06300009, + "balance_loss_mlp": 0.0125632, + "epoch": 0.2005711709003457, + "flos": 23520575842560.0, + "grad_norm": 6.047041669068293, + "language_loss": 0.80452931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.88285786, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.24108887, + "step": 3336, + "time_per_iteration": 2.662705898284912 + }, + { + "auxiliary_loss_clip": 0.06555694, + "auxiliary_loss_mlp": 0.01283807, + "balance_loss_clip": 0.06297083, + "balance_loss_mlp": 0.01258045, + "epoch": 0.20063129415301367, + "flos": 22529106806400.0, + "grad_norm": 1.88296777376126, + "language_loss": 0.78839928, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.86679429, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25756836, + "step": 3337, + "time_per_iteration": 2.541239023208618 + }, + { + "auxiliary_loss_clip": 0.06548997, + "auxiliary_loss_mlp": 0.01282446, + "balance_loss_clip": 0.06296889, + "balance_loss_mlp": 0.01258389, + "epoch": 0.20069141740568164, + "flos": 25965405947520.0, + "grad_norm": 2.093788516709133, + "language_loss": 0.69608915, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77440357, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.24072266, + "step": 3338, + "time_per_iteration": 4.011674165725708 + }, + { + "auxiliary_loss_clip": 0.06553162, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06298589, + "balance_loss_mlp": 0.01261703, + "epoch": 0.2007515406583496, + "flos": 24797560066560.0, + "grad_norm": 2.5614555335728375, + "language_loss": 0.70278549, + "learning_rate": 3.701867867326735e-06, + "loss": 0.78117526, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3339, + "time_per_iteration": 4.021097183227539 + }, + { + "auxiliary_loss_clip": 0.06558233, + "auxiliary_loss_mlp": 0.01288707, + "balance_loss_clip": 0.06300814, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2008116639110176, + "flos": 37934746606080.0, + "grad_norm": 2.4782874615073265, + "language_loss": 0.67773008, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.75619948, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.24291992, + "step": 3340, + "time_per_iteration": 2.741156816482544 + }, + { + "auxiliary_loss_clip": 0.06555235, + "auxiliary_loss_mlp": 0.01284766, + "balance_loss_clip": 0.06297287, + "balance_loss_mlp": 0.01258122, + "epoch": 0.20087178716368556, + "flos": 20746779408000.0, + "grad_norm": 2.067820693237163, + "language_loss": 0.74698186, + "learning_rate": 3.701458591066019e-06, + "loss": 0.82538182, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26623535, + "step": 3341, + "time_per_iteration": 2.564480781555176 + }, + { + "auxiliary_loss_clip": 0.06547385, + "auxiliary_loss_mlp": 0.01280207, + "balance_loss_clip": 0.06298249, + "balance_loss_mlp": 0.01256532, + "epoch": 0.20093191041635353, + "flos": 23849122901760.0, + "grad_norm": 1.820842392943319, + "language_loss": 0.7265389, + "learning_rate": 3.70125385615256e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.23657227, + "step": 3342, + "time_per_iteration": 2.5828449726104736 + }, + { + "auxiliary_loss_clip": 0.065575, + "auxiliary_loss_mlp": 0.01288338, + "balance_loss_clip": 0.06302083, + "balance_loss_mlp": 0.01264174, + "epoch": 0.2009920336690215, + "flos": 21797395027200.0, + "grad_norm": 1.987813203177408, + "language_loss": 0.73357129, + "learning_rate": 3.701049056727384e-06, + "loss": 0.81202972, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.24169922, + "step": 3343, + "time_per_iteration": 2.547868490219116 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.012954, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01269865, + "epoch": 0.20105215692168946, + "flos": 26366390461440.0, + "grad_norm": 2.115251797604865, + "language_loss": 0.81433517, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.89283836, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25524902, + "step": 3344, + "time_per_iteration": 2.6067302227020264 + }, + { + "auxiliary_loss_clip": 0.06556335, + "auxiliary_loss_mlp": 0.01281302, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01258426, + "epoch": 0.20111228017435742, + "flos": 18813288044160.0, + "grad_norm": 4.0042293338609385, + "language_loss": 0.84618676, + "learning_rate": 3.700639264372948e-06, + "loss": 0.92456311, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.2286377, + "step": 3345, + "time_per_iteration": 2.554713726043701 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01295407, + "balance_loss_clip": 0.0629687, + "balance_loss_mlp": 0.01272697, + "epoch": 0.20117240342702541, + "flos": 19981301633280.0, + "grad_norm": 2.1108086187654025, + "language_loss": 0.68437809, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.76276147, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.22705078, + "step": 3346, + "time_per_iteration": 2.5748066902160645 + }, + { + "auxiliary_loss_clip": 0.06553109, + "auxiliary_loss_mlp": 0.01283392, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01258739, + "epoch": 0.20123252667969338, + "flos": 23148368006400.0, + "grad_norm": 1.9426154174848713, + "language_loss": 0.73952061, + "learning_rate": 3.70022921406487e-06, + "loss": 0.81788564, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24682617, + "step": 3347, + "time_per_iteration": 2.5353236198425293 + }, + { + "auxiliary_loss_clip": 0.06546339, + "auxiliary_loss_mlp": 0.01287781, + "balance_loss_clip": 0.0629671, + "balance_loss_mlp": 0.01263487, + "epoch": 0.20129264993236134, + "flos": 23228352328320.0, + "grad_norm": 1.557023243146552, + "language_loss": 0.87284029, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95118147, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.24316406, + "step": 3348, + "time_per_iteration": 2.5943105220794678 + }, + { + "auxiliary_loss_clip": 0.06550047, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01269034, + "epoch": 0.2013527731850293, + "flos": 21877882473600.0, + "grad_norm": 1.6966939322149492, + "language_loss": 0.71502012, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7934612, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25061035, + "step": 3349, + "time_per_iteration": 2.5671966075897217 + }, + { + "auxiliary_loss_clip": 0.06552055, + "auxiliary_loss_mlp": 0.01290022, + "balance_loss_clip": 0.06301533, + "balance_loss_mlp": 0.01263486, + "epoch": 0.20141289643769728, + "flos": 18046636312320.0, + "grad_norm": 1.7460886195435679, + "language_loss": 0.72473693, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80315775, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26501465, + "step": 3350, + "time_per_iteration": 2.558486223220825 + }, + { + "auxiliary_loss_clip": 0.06561922, + "auxiliary_loss_mlp": 0.01282894, + "balance_loss_clip": 0.0630732, + "balance_loss_mlp": 0.01256728, + "epoch": 0.20147301969036524, + "flos": 23958219317760.0, + "grad_norm": 2.4285458765514623, + "language_loss": 0.76773715, + "learning_rate": 3.69940833983661e-06, + "loss": 0.84618533, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26135254, + "step": 3351, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.0657143, + "auxiliary_loss_mlp": 0.01289916, + "balance_loss_clip": 0.06311074, + "balance_loss_mlp": 0.01260638, + "epoch": 0.2015331429430332, + "flos": 25594749411840.0, + "grad_norm": 1.6280311670130643, + "language_loss": 0.81367022, + "learning_rate": 3.699202960155748e-06, + "loss": 0.89228368, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.29248047, + "step": 3352, + "time_per_iteration": 2.603740692138672 + }, + { + "auxiliary_loss_clip": 0.06557955, + "auxiliary_loss_mlp": 0.01286544, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01258458, + "epoch": 0.2015932661957012, + "flos": 26732351168640.0, + "grad_norm": 2.001275007108419, + "language_loss": 0.81670761, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.89515263, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28063965, + "step": 3353, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.06555627, + "auxiliary_loss_mlp": 0.01278407, + "balance_loss_clip": 0.0630668, + "balance_loss_mlp": 0.01253206, + "epoch": 0.20165338944836916, + "flos": 15638632876800.0, + "grad_norm": 1.8574199324884482, + "language_loss": 0.9049592, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.98329961, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.2520752, + "step": 3354, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.06439115, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.06305242, + "balance_loss_mlp": 0.01268129, + "epoch": 0.20171351270103713, + "flos": 57929926089600.0, + "grad_norm": 0.8202677442032412, + "language_loss": 0.55840385, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.63554633, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07012939, + "step": 3355, + "time_per_iteration": 3.118603229522705 + }, + { + "auxiliary_loss_clip": 0.06557105, + "auxiliary_loss_mlp": 0.01281149, + "balance_loss_clip": 0.06309459, + "balance_loss_mlp": 0.01257474, + "epoch": 0.2017736359537051, + "flos": 20820768163200.0, + "grad_norm": 1.5861142309185163, + "language_loss": 0.84845644, + "learning_rate": 3.698380797170751e-06, + "loss": 0.92683893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.23669434, + "step": 3356, + "time_per_iteration": 2.5407068729400635 + }, + { + "auxiliary_loss_clip": 0.06578876, + "auxiliary_loss_mlp": 0.01283859, + "balance_loss_clip": 0.06314196, + "balance_loss_mlp": 0.01255344, + "epoch": 0.20183375920637306, + "flos": 17097696023040.0, + "grad_norm": 3.7689574240726147, + "language_loss": 0.71072245, + "learning_rate": 3.698175095398085e-06, + "loss": 0.78934979, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.28515625, + "step": 3357, + "time_per_iteration": 2.4921233654022217 + }, + { + "auxiliary_loss_clip": 0.065685, + "auxiliary_loss_mlp": 0.01288812, + "balance_loss_clip": 0.0631017, + "balance_loss_mlp": 0.01263206, + "epoch": 0.20189388245904102, + "flos": 18667323031680.0, + "grad_norm": 2.064581487792546, + "language_loss": 0.72707927, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.80565238, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25585938, + "step": 3358, + "time_per_iteration": 2.531280040740967 + }, + { + "auxiliary_loss_clip": 0.06550319, + "auxiliary_loss_mlp": 0.0128707, + "balance_loss_clip": 0.06304348, + "balance_loss_mlp": 0.01263633, + "epoch": 0.20195400571170902, + "flos": 16802705324160.0, + "grad_norm": 1.761827203655194, + "language_loss": 0.83542818, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91380209, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.234375, + "step": 3359, + "time_per_iteration": 2.5004122257232666 + }, + { + "auxiliary_loss_clip": 0.06415485, + "auxiliary_loss_mlp": 0.01275385, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01269109, + "epoch": 0.20201412896437698, + "flos": 67192792669440.0, + "grad_norm": 0.7763137973079639, + "language_loss": 0.58718604, + "learning_rate": 3.697557603741482e-06, + "loss": 0.66409475, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.06274414, + "step": 3360, + "time_per_iteration": 3.202280282974243 + }, + { + "auxiliary_loss_clip": 0.06567518, + "auxiliary_loss_mlp": 0.01281863, + "balance_loss_clip": 0.06312253, + "balance_loss_mlp": 0.01257055, + "epoch": 0.20207425221704495, + "flos": 21331477998720.0, + "grad_norm": 2.7701451368403767, + "language_loss": 0.63371557, + "learning_rate": 3.697351644435763e-06, + "loss": 0.71220934, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24841309, + "step": 3361, + "time_per_iteration": 2.591505527496338 + }, + { + "auxiliary_loss_clip": 0.06556661, + "auxiliary_loss_mlp": 0.01280295, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01257049, + "epoch": 0.2021343754697129, + "flos": 22533509145600.0, + "grad_norm": 1.837331842396403, + "language_loss": 0.76495373, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23254395, + "step": 3362, + "time_per_iteration": 2.5748798847198486 + }, + { + "auxiliary_loss_clip": 0.06552652, + "auxiliary_loss_mlp": 0.01281781, + "balance_loss_clip": 0.06300291, + "balance_loss_mlp": 0.01257379, + "epoch": 0.20219449872238088, + "flos": 19068852597120.0, + "grad_norm": 1.6506097934595576, + "language_loss": 0.77716577, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85551012, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.24365234, + "step": 3363, + "time_per_iteration": 2.5682361125946045 + }, + { + "auxiliary_loss_clip": 0.06556462, + "auxiliary_loss_mlp": 0.01285372, + "balance_loss_clip": 0.06303493, + "balance_loss_mlp": 0.01262198, + "epoch": 0.20225462197504884, + "flos": 24723864800640.0, + "grad_norm": 1.5662342973814338, + "language_loss": 0.75767177, + "learning_rate": 3.696733380367391e-06, + "loss": 0.83609009, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23181152, + "step": 3364, + "time_per_iteration": 2.620352029800415 + }, + { + "auxiliary_loss_clip": 0.06564072, + "auxiliary_loss_mlp": 0.01282858, + "balance_loss_clip": 0.06306748, + "balance_loss_mlp": 0.01259374, + "epoch": 0.2023147452277168, + "flos": 22024895662080.0, + "grad_norm": 2.684464985384485, + "language_loss": 0.72232616, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.80079544, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23474121, + "step": 3365, + "time_per_iteration": 2.6884727478027344 + }, + { + "auxiliary_loss_clip": 0.06551654, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.0629961, + "balance_loss_mlp": 0.01256336, + "epoch": 0.2023748684803848, + "flos": 17750555510400.0, + "grad_norm": 1.8865204005259733, + "language_loss": 0.86329257, + "learning_rate": 3.696320882607286e-06, + "loss": 0.94160658, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.23425293, + "step": 3366, + "time_per_iteration": 2.541398525238037 + }, + { + "auxiliary_loss_clip": 0.06552443, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01254698, + "epoch": 0.20243499173305277, + "flos": 31146912328320.0, + "grad_norm": 1.6069123477498997, + "language_loss": 0.69763649, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77593338, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.22558594, + "step": 3367, + "time_per_iteration": 2.674370527267456 + }, + { + "auxiliary_loss_clip": 0.06562914, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 0.06300482, + "balance_loss_mlp": 0.01257777, + "epoch": 0.20249511498572073, + "flos": 33847726256640.0, + "grad_norm": 1.76028679400595, + "language_loss": 0.69152057, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.27819824, + "step": 3368, + "time_per_iteration": 2.6662635803222656 + }, + { + "auxiliary_loss_clip": 0.06551345, + "auxiliary_loss_mlp": 0.0128738, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263657, + "epoch": 0.2025552382383887, + "flos": 21222088093440.0, + "grad_norm": 1.819755421756695, + "language_loss": 0.78064144, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.8590287, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23718262, + "step": 3369, + "time_per_iteration": 2.5846660137176514 + }, + { + "auxiliary_loss_clip": 0.06560668, + "auxiliary_loss_mlp": 0.01282514, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01257492, + "epoch": 0.20261536149105666, + "flos": 14652614355840.0, + "grad_norm": 3.2010156823618687, + "language_loss": 0.66533637, + "learning_rate": 3.695495115253795e-06, + "loss": 0.74376816, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25024414, + "step": 3370, + "time_per_iteration": 3.953664541244507 + }, + { + "auxiliary_loss_clip": 0.06420556, + "auxiliary_loss_mlp": 0.01256354, + "balance_loss_clip": 0.06284036, + "balance_loss_mlp": 0.01249797, + "epoch": 0.20267548474372463, + "flos": 66803380018560.0, + "grad_norm": 0.6606134365812599, + "language_loss": 0.58273321, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.65950233, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.06567383, + "step": 3371, + "time_per_iteration": 3.2517025470733643 + }, + { + "auxiliary_loss_clip": 0.06555597, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 0.06300298, + "balance_loss_mlp": 0.01257944, + "epoch": 0.2027356079963926, + "flos": 24687667036800.0, + "grad_norm": 1.6416079718190109, + "language_loss": 0.92020303, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99859619, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.25769043, + "step": 3372, + "time_per_iteration": 4.108370065689087 + }, + { + "auxiliary_loss_clip": 0.06555616, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 0.06298956, + "balance_loss_mlp": 0.01258672, + "epoch": 0.20279573124906058, + "flos": 26399443697280.0, + "grad_norm": 1.769817073167301, + "language_loss": 0.79293168, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87131846, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.24414062, + "step": 3373, + "time_per_iteration": 2.6076717376708984 + }, + { + "auxiliary_loss_clip": 0.06543471, + "auxiliary_loss_mlp": 0.01280674, + "balance_loss_clip": 0.06296648, + "balance_loss_mlp": 0.01256343, + "epoch": 0.20285585450172855, + "flos": 33808006621440.0, + "grad_norm": 3.4143342380796255, + "language_loss": 0.72364163, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.8018831, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.24328613, + "step": 3374, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.06419748, + "auxiliary_loss_mlp": 0.01258876, + "balance_loss_clip": 0.06284177, + "balance_loss_mlp": 0.01252266, + "epoch": 0.20291597775439651, + "flos": 71185768410240.0, + "grad_norm": 1.0120800133799934, + "language_loss": 0.62520474, + "learning_rate": 3.694461459520516e-06, + "loss": 0.70199096, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06622314, + "step": 3375, + "time_per_iteration": 3.159513473510742 + }, + { + "auxiliary_loss_clip": 0.06548455, + "auxiliary_loss_mlp": 0.01283408, + "balance_loss_clip": 0.06294296, + "balance_loss_mlp": 0.0125891, + "epoch": 0.20297610100706448, + "flos": 19499368475520.0, + "grad_norm": 1.6178559610323104, + "language_loss": 0.82908762, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.90740621, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24499512, + "step": 3376, + "time_per_iteration": 2.5366275310516357 + }, + { + "auxiliary_loss_clip": 0.06553418, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.0125854, + "epoch": 0.20303622425973245, + "flos": 25050944413440.0, + "grad_norm": 2.015544075965587, + "language_loss": 0.82464767, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90302449, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25720215, + "step": 3377, + "time_per_iteration": 2.579468250274658 + }, + { + "auxiliary_loss_clip": 0.06554671, + "auxiliary_loss_mlp": 0.01287763, + "balance_loss_clip": 0.06300091, + "balance_loss_mlp": 0.01261453, + "epoch": 0.2030963475124004, + "flos": 21986266129920.0, + "grad_norm": 1.7361857812490578, + "language_loss": 0.7745406, + "learning_rate": 3.69384049496805e-06, + "loss": 0.85296494, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.26306152, + "step": 3378, + "time_per_iteration": 3.999164342880249 + }, + { + "auxiliary_loss_clip": 0.06557525, + "auxiliary_loss_mlp": 0.01285912, + "balance_loss_clip": 0.06298093, + "balance_loss_mlp": 0.01259423, + "epoch": 0.2031564707650684, + "flos": 19506496072320.0, + "grad_norm": 1.7814270376711854, + "language_loss": 0.80552137, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.88395572, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.26525879, + "step": 3379, + "time_per_iteration": 3.94376277923584 + }, + { + "auxiliary_loss_clip": 0.06547987, + "auxiliary_loss_mlp": 0.01283987, + "balance_loss_clip": 0.06298195, + "balance_loss_mlp": 0.01259799, + "epoch": 0.20321659401773637, + "flos": 22753630621440.0, + "grad_norm": 1.8399421212903948, + "language_loss": 0.87578034, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.95410013, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24206543, + "step": 3380, + "time_per_iteration": 2.5826356410980225 + }, + { + "auxiliary_loss_clip": 0.06554954, + "auxiliary_loss_mlp": 0.01300173, + "balance_loss_clip": 0.06299303, + "balance_loss_mlp": 0.01274817, + "epoch": 0.20327671727040433, + "flos": 22462455283200.0, + "grad_norm": 2.147675917051705, + "language_loss": 0.75801265, + "learning_rate": 3.693218952340186e-06, + "loss": 0.83656389, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.2532959, + "step": 3381, + "time_per_iteration": 2.580035924911499 + }, + { + "auxiliary_loss_clip": 0.06559204, + "auxiliary_loss_mlp": 0.0128659, + "balance_loss_clip": 0.06297147, + "balance_loss_mlp": 0.01260198, + "epoch": 0.2033368405230723, + "flos": 19540807119360.0, + "grad_norm": 1.8225171591496117, + "language_loss": 0.79701936, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.87547731, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.26391602, + "step": 3382, + "time_per_iteration": 2.743842601776123 + }, + { + "auxiliary_loss_clip": 0.06551235, + "auxiliary_loss_mlp": 0.01283934, + "balance_loss_clip": 0.06293041, + "balance_loss_mlp": 0.01258745, + "epoch": 0.20339696377574026, + "flos": 13814489491200.0, + "grad_norm": 1.712325191768153, + "language_loss": 0.80308962, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8814413, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25195312, + "step": 3383, + "time_per_iteration": 2.6428067684173584 + }, + { + "auxiliary_loss_clip": 0.06548008, + "auxiliary_loss_mlp": 0.01285433, + "balance_loss_clip": 0.06295451, + "balance_loss_mlp": 0.01259541, + "epoch": 0.20345708702840823, + "flos": 20345627185920.0, + "grad_norm": 1.7809184522678074, + "language_loss": 0.75199848, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.83033288, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.25891113, + "step": 3384, + "time_per_iteration": 2.5601112842559814 + }, + { + "auxiliary_loss_clip": 0.06573269, + "auxiliary_loss_mlp": 0.01282943, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01256229, + "epoch": 0.2035172102810762, + "flos": 20339254275840.0, + "grad_norm": 2.5841350087074852, + "language_loss": 0.77226508, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.85082722, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.26745605, + "step": 3385, + "time_per_iteration": 2.527583122253418 + }, + { + "auxiliary_loss_clip": 0.06553946, + "auxiliary_loss_mlp": 0.01288968, + "balance_loss_clip": 0.06300423, + "balance_loss_mlp": 0.01263934, + "epoch": 0.2035773335337442, + "flos": 23337658379520.0, + "grad_norm": 1.6683994830989402, + "language_loss": 0.70000219, + "learning_rate": 3.692181763924639e-06, + "loss": 0.7784313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25048828, + "step": 3386, + "time_per_iteration": 2.583940029144287 + }, + { + "auxiliary_loss_clip": 0.06550556, + "auxiliary_loss_mlp": 0.01289862, + "balance_loss_clip": 0.0629431, + "balance_loss_mlp": 0.01265495, + "epoch": 0.20363745678641215, + "flos": 28337924378880.0, + "grad_norm": 1.2744067098921972, + "language_loss": 0.81998229, + "learning_rate": 3.691974133706947e-06, + "loss": 0.89838648, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.24365234, + "step": 3387, + "time_per_iteration": 2.624765634536743 + }, + { + "auxiliary_loss_clip": 0.06543861, + "auxiliary_loss_mlp": 0.01285642, + "balance_loss_clip": 0.06297304, + "balance_loss_mlp": 0.01261705, + "epoch": 0.20369758003908012, + "flos": 18921503992320.0, + "grad_norm": 2.338231566069276, + "language_loss": 0.80333674, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.88163185, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23925781, + "step": 3388, + "time_per_iteration": 2.565795421600342 + }, + { + "auxiliary_loss_clip": 0.06553982, + "auxiliary_loss_mlp": 0.01281213, + "balance_loss_clip": 0.06297579, + "balance_loss_mlp": 0.0125693, + "epoch": 0.20375770329174808, + "flos": 19212218133120.0, + "grad_norm": 1.8814817968190891, + "language_loss": 0.72894287, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.80729485, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.24279785, + "step": 3389, + "time_per_iteration": 2.5263590812683105 + }, + { + "auxiliary_loss_clip": 0.06544612, + "auxiliary_loss_mlp": 0.01286594, + "balance_loss_clip": 0.06296231, + "balance_loss_mlp": 0.01262204, + "epoch": 0.20381782654441605, + "flos": 19397106167040.0, + "grad_norm": 2.5524619095037626, + "language_loss": 0.88214552, + "learning_rate": 3.691350858126404e-06, + "loss": 0.96045768, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3390, + "time_per_iteration": 2.5450997352600098 + }, + { + "auxiliary_loss_clip": 0.06546676, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06297011, + "balance_loss_mlp": 0.01260683, + "epoch": 0.203877949797084, + "flos": 24834764079360.0, + "grad_norm": 2.430374095532116, + "language_loss": 0.71690643, + "learning_rate": 3.691142971316662e-06, + "loss": 0.79521036, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 3391, + "time_per_iteration": 2.5983424186706543 + }, + { + "auxiliary_loss_clip": 0.06548478, + "auxiliary_loss_mlp": 0.01287319, + "balance_loss_clip": 0.06300271, + "balance_loss_mlp": 0.01263799, + "epoch": 0.20393807304975198, + "flos": 18009432299520.0, + "grad_norm": 3.271459971820983, + "language_loss": 0.87029123, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.94864917, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2355957, + "step": 3392, + "time_per_iteration": 2.5094432830810547 + }, + { + "auxiliary_loss_clip": 0.06555735, + "auxiliary_loss_mlp": 0.01288889, + "balance_loss_clip": 0.06302007, + "balance_loss_mlp": 0.0126432, + "epoch": 0.20399819630241997, + "flos": 24213867724800.0, + "grad_norm": 1.4298747009925739, + "language_loss": 0.8143822, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8928284, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 3393, + "time_per_iteration": 2.674898147583008 + }, + { + "auxiliary_loss_clip": 0.06555712, + "auxiliary_loss_mlp": 0.01283361, + "balance_loss_clip": 0.0630876, + "balance_loss_mlp": 0.01260747, + "epoch": 0.20405831955508794, + "flos": 20783396442240.0, + "grad_norm": 2.2973425083766377, + "language_loss": 0.87181509, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.9502058, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.22619629, + "step": 3394, + "time_per_iteration": 2.5489470958709717 + }, + { + "auxiliary_loss_clip": 0.06548424, + "auxiliary_loss_mlp": 0.0128548, + "balance_loss_clip": 0.06299029, + "balance_loss_mlp": 0.01262448, + "epoch": 0.2041184428077559, + "flos": 15492332448000.0, + "grad_norm": 2.1306464149991027, + "language_loss": 0.8456347, + "learning_rate": 3.69031078287345e-06, + "loss": 0.92397374, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23034668, + "step": 3395, + "time_per_iteration": 2.5297558307647705 + }, + { + "auxiliary_loss_clip": 0.06554371, + "auxiliary_loss_mlp": 0.01288203, + "balance_loss_clip": 0.06299008, + "balance_loss_mlp": 0.0126448, + "epoch": 0.20417856606042387, + "flos": 15592582258560.0, + "grad_norm": 1.9297262637725432, + "language_loss": 0.84104818, + "learning_rate": 3.690102575501033e-06, + "loss": 0.91947389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23730469, + "step": 3396, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0654766, + "auxiliary_loss_mlp": 0.01296047, + "balance_loss_clip": 0.06301443, + "balance_loss_mlp": 0.01272706, + "epoch": 0.20423868931309183, + "flos": 24286137471360.0, + "grad_norm": 2.084884773893835, + "language_loss": 0.7751056, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.85354269, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2331543, + "step": 3397, + "time_per_iteration": 2.5621836185455322 + }, + { + "auxiliary_loss_clip": 0.06547033, + "auxiliary_loss_mlp": 0.01291146, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01268067, + "epoch": 0.2042988125657598, + "flos": 18619176061440.0, + "grad_norm": 3.401004534017878, + "language_loss": 0.88746947, + "learning_rate": 3.689685968497518e-06, + "loss": 0.96585131, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23083496, + "step": 3398, + "time_per_iteration": 2.4821889400482178 + }, + { + "auxiliary_loss_clip": 0.06555858, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 0.06305312, + "balance_loss_mlp": 0.01263361, + "epoch": 0.2043589358184278, + "flos": 17855836565760.0, + "grad_norm": 2.044777021305177, + "language_loss": 0.79053116, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8689605, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23706055, + "step": 3399, + "time_per_iteration": 2.5007028579711914 + }, + { + "auxiliary_loss_clip": 0.06554085, + "auxiliary_loss_mlp": 0.01288353, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01264678, + "epoch": 0.20441905907109575, + "flos": 21441832225920.0, + "grad_norm": 3.4484144890832327, + "language_loss": 0.77263522, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85105962, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23669434, + "step": 3400, + "time_per_iteration": 2.524930715560913 + }, + { + "auxiliary_loss_clip": 0.06546277, + "auxiliary_loss_mlp": 0.0128369, + "balance_loss_clip": 0.0630067, + "balance_loss_mlp": 0.01262423, + "epoch": 0.20447918232376372, + "flos": 27714847818240.0, + "grad_norm": 1.566944783994086, + "language_loss": 0.7976017, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87590134, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21264648, + "step": 3401, + "time_per_iteration": 2.5868172645568848 + }, + { + "auxiliary_loss_clip": 0.06547564, + "auxiliary_loss_mlp": 0.01287222, + "balance_loss_clip": 0.06297088, + "balance_loss_mlp": 0.01263833, + "epoch": 0.20453930557643168, + "flos": 30533017789440.0, + "grad_norm": 1.6743436404675067, + "language_loss": 0.69998658, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7783345, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23400879, + "step": 3402, + "time_per_iteration": 2.664961099624634 + }, + { + "auxiliary_loss_clip": 0.06561718, + "auxiliary_loss_mlp": 0.01282309, + "balance_loss_clip": 0.06309628, + "balance_loss_mlp": 0.01259981, + "epoch": 0.20459942882909965, + "flos": 18993480249600.0, + "grad_norm": 2.0207590642868736, + "language_loss": 0.82498461, + "learning_rate": 3.688643329848496e-06, + "loss": 0.90342486, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2232666, + "step": 3403, + "time_per_iteration": 2.527240514755249 + }, + { + "auxiliary_loss_clip": 0.0655287, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 0.06304024, + "balance_loss_mlp": 0.01260256, + "epoch": 0.20465955208176762, + "flos": 20345207915520.0, + "grad_norm": 1.870475930372837, + "language_loss": 0.83792305, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.91628289, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.22900391, + "step": 3404, + "time_per_iteration": 2.5108580589294434 + }, + { + "auxiliary_loss_clip": 0.06555478, + "auxiliary_loss_mlp": 0.01280254, + "balance_loss_clip": 0.06302839, + "balance_loss_mlp": 0.0125671, + "epoch": 0.20471967533443558, + "flos": 21257615024640.0, + "grad_norm": 1.9668153962924477, + "language_loss": 0.86568373, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.94404107, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2355957, + "step": 3405, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.06551084, + "auxiliary_loss_mlp": 0.0128024, + "balance_loss_clip": 0.06302287, + "balance_loss_mlp": 0.01257256, + "epoch": 0.20477979858710357, + "flos": 14506775124480.0, + "grad_norm": 2.695451734790842, + "language_loss": 0.85318458, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93149781, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.22973633, + "step": 3406, + "time_per_iteration": 2.490360975265503 + }, + { + "auxiliary_loss_clip": 0.06551544, + "auxiliary_loss_mlp": 0.01279954, + "balance_loss_clip": 0.06302837, + "balance_loss_mlp": 0.01256768, + "epoch": 0.20483992183977154, + "flos": 11405018609280.0, + "grad_norm": 8.923539759508978, + "language_loss": 0.69000643, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.76832145, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23193359, + "step": 3407, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06549555, + "auxiliary_loss_mlp": 0.01280964, + "balance_loss_clip": 0.06303824, + "balance_loss_mlp": 0.01258374, + "epoch": 0.2049000450924395, + "flos": 19065917704320.0, + "grad_norm": 2.112423962078429, + "language_loss": 0.85367447, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93197966, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.22583008, + "step": 3408, + "time_per_iteration": 2.5491087436676025 + }, + { + "auxiliary_loss_clip": 0.06564584, + "auxiliary_loss_mlp": 0.0128728, + "balance_loss_clip": 0.06310433, + "balance_loss_mlp": 0.0126314, + "epoch": 0.20496016834510747, + "flos": 14579799557760.0, + "grad_norm": 2.4221013711544876, + "language_loss": 0.65169537, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.730214, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3409, + "time_per_iteration": 2.5570828914642334 + }, + { + "auxiliary_loss_clip": 0.06553619, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06302843, + "balance_loss_mlp": 0.01259029, + "epoch": 0.20502029159777543, + "flos": 22133069683200.0, + "grad_norm": 1.5677004994493864, + "language_loss": 0.81331646, + "learning_rate": 3.687180946553745e-06, + "loss": 0.89167136, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.22851562, + "step": 3410, + "time_per_iteration": 3.9941341876983643 + }, + { + "auxiliary_loss_clip": 0.06562116, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 0.06316169, + "balance_loss_mlp": 0.01256252, + "epoch": 0.2050804148504434, + "flos": 25373873249280.0, + "grad_norm": 2.231323409005704, + "language_loss": 0.76898587, + "learning_rate": 3.686971778678803e-06, + "loss": 0.84738749, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21801758, + "step": 3411, + "time_per_iteration": 2.557502031326294 + }, + { + "auxiliary_loss_clip": 0.06566584, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 0.06318649, + "balance_loss_mlp": 0.01260567, + "epoch": 0.2051405381031114, + "flos": 23626443876480.0, + "grad_norm": 1.9814328821552187, + "language_loss": 0.73997778, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81847459, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.22521973, + "step": 3412, + "time_per_iteration": 4.038960695266724 + }, + { + "auxiliary_loss_clip": 0.06568237, + "auxiliary_loss_mlp": 0.01280941, + "balance_loss_clip": 0.06316938, + "balance_loss_mlp": 0.01257183, + "epoch": 0.20520066135577936, + "flos": 19570338483840.0, + "grad_norm": 2.4438525241528963, + "language_loss": 0.79063112, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.86912292, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23754883, + "step": 3413, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.0655475, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06315412, + "balance_loss_mlp": 0.01259423, + "epoch": 0.20526078460844732, + "flos": 17682184978560.0, + "grad_norm": 1.8594099787920526, + "language_loss": 0.85324407, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.93161035, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2244873, + "step": 3414, + "time_per_iteration": 2.51891827583313 + }, + { + "auxiliary_loss_clip": 0.06556672, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 0.0631127, + "balance_loss_mlp": 0.01261451, + "epoch": 0.2053209078611153, + "flos": 21505632710400.0, + "grad_norm": 1.8989416463636506, + "language_loss": 0.8139196, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.89232612, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22521973, + "step": 3415, + "time_per_iteration": 2.534064769744873 + }, + { + "auxiliary_loss_clip": 0.06545444, + "auxiliary_loss_mlp": 0.01280017, + "balance_loss_clip": 0.06300274, + "balance_loss_mlp": 0.01259048, + "epoch": 0.20538103111378325, + "flos": 25670163686400.0, + "grad_norm": 1.9272907146050138, + "language_loss": 0.73450923, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.81276381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.20959473, + "step": 3416, + "time_per_iteration": 2.5862622261047363 + }, + { + "auxiliary_loss_clip": 0.06555279, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06309061, + "balance_loss_mlp": 0.01256342, + "epoch": 0.20544115436645122, + "flos": 23155663311360.0, + "grad_norm": 3.21470343355828, + "language_loss": 0.79731691, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87565553, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.22253418, + "step": 3417, + "time_per_iteration": 2.5488288402557373 + }, + { + "auxiliary_loss_clip": 0.06553051, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01258248, + "epoch": 0.20550127761911918, + "flos": 19396435334400.0, + "grad_norm": 3.2012221600430744, + "language_loss": 0.88593423, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96428442, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23681641, + "step": 3418, + "time_per_iteration": 5.385840177536011 + }, + { + "auxiliary_loss_clip": 0.06553373, + "auxiliary_loss_mlp": 0.01284895, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.0125998, + "epoch": 0.20556140087178718, + "flos": 22899721415040.0, + "grad_norm": 2.325256215928591, + "language_loss": 0.63040721, + "learning_rate": 3.685296133421035e-06, + "loss": 0.70878994, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24926758, + "step": 3419, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.06563735, + "auxiliary_loss_mlp": 0.01291649, + "balance_loss_clip": 0.06310479, + "balance_loss_mlp": 0.01265554, + "epoch": 0.20562152412445514, + "flos": 19795365423360.0, + "grad_norm": 1.7732270709951168, + "language_loss": 0.86988509, + "learning_rate": 3.685086390100674e-06, + "loss": 0.948439, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26098633, + "step": 3420, + "time_per_iteration": 2.5364928245544434 + }, + { + "auxiliary_loss_clip": 0.06546585, + "auxiliary_loss_mlp": 0.01284653, + "balance_loss_clip": 0.0630153, + "balance_loss_mlp": 0.01261109, + "epoch": 0.2056816473771231, + "flos": 31509728507520.0, + "grad_norm": 10.333340616962191, + "language_loss": 0.71886712, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79717946, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.2355957, + "step": 3421, + "time_per_iteration": 2.6350786685943604 + }, + { + "auxiliary_loss_clip": 0.06544094, + "auxiliary_loss_mlp": 0.01288814, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0126564, + "epoch": 0.20574177062979107, + "flos": 23265095143680.0, + "grad_norm": 2.122387036588777, + "language_loss": 0.72175372, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.8000828, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23168945, + "step": 3422, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.06409879, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.01263078, + "epoch": 0.20580189388245904, + "flos": 70331124291840.0, + "grad_norm": 0.7131964126658911, + "language_loss": 0.551377, + "learning_rate": 3.684456776779548e-06, + "loss": 0.62817442, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06799316, + "step": 3423, + "time_per_iteration": 3.2106337547302246 + }, + { + "auxiliary_loss_clip": 0.06548166, + "auxiliary_loss_mlp": 0.0128448, + "balance_loss_clip": 0.06301543, + "balance_loss_mlp": 0.01261091, + "epoch": 0.205862017135127, + "flos": 30745802033280.0, + "grad_norm": 1.8660135712145316, + "language_loss": 0.72238076, + "learning_rate": 3.684246777912353e-06, + "loss": 0.80070728, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23400879, + "step": 3424, + "time_per_iteration": 2.614389181137085 + }, + { + "auxiliary_loss_clip": 0.06544662, + "auxiliary_loss_mlp": 0.01287262, + "balance_loss_clip": 0.06303795, + "balance_loss_mlp": 0.01263229, + "epoch": 0.20592214038779497, + "flos": 21330932947200.0, + "grad_norm": 1.6926765615616197, + "language_loss": 0.75646138, + "learning_rate": 3.684036715178351e-06, + "loss": 0.83478063, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.24023438, + "step": 3425, + "time_per_iteration": 2.5351436138153076 + }, + { + "auxiliary_loss_clip": 0.06546403, + "auxiliary_loss_mlp": 0.01289796, + "balance_loss_clip": 0.06304145, + "balance_loss_mlp": 0.01266813, + "epoch": 0.20598226364046296, + "flos": 22898002406400.0, + "grad_norm": 1.848184132977354, + "language_loss": 0.88618112, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9645431, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22998047, + "step": 3426, + "time_per_iteration": 2.604752779006958 + }, + { + "auxiliary_loss_clip": 0.06551787, + "auxiliary_loss_mlp": 0.01284615, + "balance_loss_clip": 0.06311674, + "balance_loss_mlp": 0.01261226, + "epoch": 0.20604238689313092, + "flos": 23885362592640.0, + "grad_norm": 1.5517486951437824, + "language_loss": 0.77144063, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8498047, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.23376465, + "step": 3427, + "time_per_iteration": 2.5526115894317627 + }, + { + "auxiliary_loss_clip": 0.06556956, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 0.06309945, + "balance_loss_mlp": 0.01264661, + "epoch": 0.2061025101457989, + "flos": 22498024141440.0, + "grad_norm": 1.8896972045039995, + "language_loss": 0.74443614, + "learning_rate": 3.683406143855174e-06, + "loss": 0.822878, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3428, + "time_per_iteration": 2.5644474029541016 + }, + { + "auxiliary_loss_clip": 0.06552382, + "auxiliary_loss_mlp": 0.01283805, + "balance_loss_clip": 0.06304047, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20616263339846685, + "flos": 22784713286400.0, + "grad_norm": 1.96097325322206, + "language_loss": 0.74164659, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.82000846, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3429, + "time_per_iteration": 2.5337913036346436 + }, + { + "auxiliary_loss_clip": 0.06551956, + "auxiliary_loss_mlp": 0.01286455, + "balance_loss_clip": 0.06304303, + "balance_loss_mlp": 0.01263126, + "epoch": 0.20622275665113482, + "flos": 20887755102720.0, + "grad_norm": 2.9642283368918863, + "language_loss": 0.86220586, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.94058996, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.23327637, + "step": 3430, + "time_per_iteration": 2.5939443111419678 + }, + { + "auxiliary_loss_clip": 0.06546243, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06300765, + "balance_loss_mlp": 0.01257607, + "epoch": 0.20628287990380278, + "flos": 19360489132800.0, + "grad_norm": 1.6588894263331828, + "language_loss": 0.70011377, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77838504, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.23278809, + "step": 3431, + "time_per_iteration": 2.565840482711792 + }, + { + "auxiliary_loss_clip": 0.06410907, + "auxiliary_loss_mlp": 0.0126731, + "balance_loss_clip": 0.06280327, + "balance_loss_mlp": 0.01261215, + "epoch": 0.20634300315647078, + "flos": 71536970799360.0, + "grad_norm": 0.791675242165557, + "language_loss": 0.60400987, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.68079197, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0609436, + "step": 3432, + "time_per_iteration": 3.305082082748413 + }, + { + "auxiliary_loss_clip": 0.06552991, + "auxiliary_loss_mlp": 0.01280414, + "balance_loss_clip": 0.06308176, + "balance_loss_mlp": 0.01257561, + "epoch": 0.20640312640913874, + "flos": 21730072671360.0, + "grad_norm": 1.5897016059046762, + "language_loss": 0.72477019, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80310422, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.22875977, + "step": 3433, + "time_per_iteration": 2.564393997192383 + }, + { + "auxiliary_loss_clip": 0.06561184, + "auxiliary_loss_mlp": 0.01281531, + "balance_loss_clip": 0.06312474, + "balance_loss_mlp": 0.01258512, + "epoch": 0.2064632496618067, + "flos": 20560256219520.0, + "grad_norm": 1.7877531320590552, + "language_loss": 0.87141019, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.94983733, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23010254, + "step": 3434, + "time_per_iteration": 2.5466108322143555 + }, + { + "auxiliary_loss_clip": 0.06556005, + "auxiliary_loss_mlp": 0.01283316, + "balance_loss_clip": 0.06305495, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20652337291447467, + "flos": 29830669666560.0, + "grad_norm": 1.6526860814470912, + "language_loss": 0.6970489, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77544212, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2388916, + "step": 3435, + "time_per_iteration": 2.613896369934082 + }, + { + "auxiliary_loss_clip": 0.06545977, + "auxiliary_loss_mlp": 0.01289312, + "balance_loss_clip": 0.0630382, + "balance_loss_mlp": 0.01264325, + "epoch": 0.20658349616714264, + "flos": 26220844719360.0, + "grad_norm": 1.7674379542335852, + "language_loss": 0.89957321, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97792608, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.24975586, + "step": 3436, + "time_per_iteration": 2.590360641479492 + }, + { + "auxiliary_loss_clip": 0.06548543, + "auxiliary_loss_mlp": 0.01277538, + "balance_loss_clip": 0.06303848, + "balance_loss_mlp": 0.01254209, + "epoch": 0.2066436194198106, + "flos": 26001477930240.0, + "grad_norm": 1.7140409089026185, + "language_loss": 0.77244872, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.8507095, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.23339844, + "step": 3437, + "time_per_iteration": 2.6068568229675293 + }, + { + "auxiliary_loss_clip": 0.06548648, + "auxiliary_loss_mlp": 0.01280201, + "balance_loss_clip": 0.06300757, + "balance_loss_mlp": 0.01257682, + "epoch": 0.20670374267247857, + "flos": 21367466127360.0, + "grad_norm": 2.0146667208247355, + "language_loss": 0.78725338, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86554188, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.22521973, + "step": 3438, + "time_per_iteration": 2.567963123321533 + }, + { + "auxiliary_loss_clip": 0.06407821, + "auxiliary_loss_mlp": 0.01263014, + "balance_loss_clip": 0.06278364, + "balance_loss_mlp": 0.01257164, + "epoch": 0.20676386592514656, + "flos": 66403108264320.0, + "grad_norm": 0.8029327028802032, + "language_loss": 0.66817588, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.74488425, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05844116, + "step": 3439, + "time_per_iteration": 3.1231849193573 + }, + { + "auxiliary_loss_clip": 0.06557775, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 0.06302103, + "balance_loss_mlp": 0.01260423, + "epoch": 0.20682398917781453, + "flos": 17280278069760.0, + "grad_norm": 1.9287299109512155, + "language_loss": 0.8404541, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.91886795, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23168945, + "step": 3440, + "time_per_iteration": 2.496563196182251 + }, + { + "auxiliary_loss_clip": 0.06545421, + "auxiliary_loss_mlp": 0.01282262, + "balance_loss_clip": 0.06298509, + "balance_loss_mlp": 0.0126028, + "epoch": 0.2068841124304825, + "flos": 18083127565440.0, + "grad_norm": 3.100665935871663, + "language_loss": 0.85299611, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93127292, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2199707, + "step": 3441, + "time_per_iteration": 2.528823137283325 + }, + { + "auxiliary_loss_clip": 0.06546343, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06300771, + "balance_loss_mlp": 0.01258958, + "epoch": 0.20694423568315046, + "flos": 27354798823680.0, + "grad_norm": 1.6487564578537555, + "language_loss": 0.86298448, + "learning_rate": 3.680455884806959e-06, + "loss": 0.94127464, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.23693848, + "step": 3442, + "time_per_iteration": 2.5904433727264404 + }, + { + "auxiliary_loss_clip": 0.06553168, + "auxiliary_loss_mlp": 0.0128107, + "balance_loss_clip": 0.06302296, + "balance_loss_mlp": 0.01256298, + "epoch": 0.20700435893581842, + "flos": 20236027645440.0, + "grad_norm": 1.991917549605425, + "language_loss": 0.74110967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81945205, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24755859, + "step": 3443, + "time_per_iteration": 2.546297311782837 + }, + { + "auxiliary_loss_clip": 0.06540793, + "auxiliary_loss_mlp": 0.01282, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2070644821884864, + "flos": 20637347575680.0, + "grad_norm": 5.522598582225395, + "language_loss": 0.86263227, + "learning_rate": 3.680033399147797e-06, + "loss": 0.94086015, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22814941, + "step": 3444, + "time_per_iteration": 2.5644776821136475 + }, + { + "auxiliary_loss_clip": 0.06396829, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06267206, + "balance_loss_mlp": 0.01264399, + "epoch": 0.20712460544115438, + "flos": 65960098128000.0, + "grad_norm": 0.6752802627643808, + "language_loss": 0.56895542, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.64562953, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06185913, + "step": 3445, + "time_per_iteration": 3.133159637451172 + }, + { + "auxiliary_loss_clip": 0.06550106, + "auxiliary_loss_mlp": 0.0128273, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01259412, + "epoch": 0.20718472869382235, + "flos": 19431542995200.0, + "grad_norm": 1.845349461285762, + "language_loss": 0.78388685, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.86221522, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23327637, + "step": 3446, + "time_per_iteration": 2.5563149452209473 + }, + { + "auxiliary_loss_clip": 0.06562304, + "auxiliary_loss_mlp": 0.0128875, + "balance_loss_clip": 0.06302087, + "balance_loss_mlp": 0.01263215, + "epoch": 0.2072448519464903, + "flos": 24506007384960.0, + "grad_norm": 2.528724295630225, + "language_loss": 0.63215572, + "learning_rate": 3.679399192876334e-06, + "loss": 0.7106663, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 3447, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.06550243, + "auxiliary_loss_mlp": 0.01285454, + "balance_loss_clip": 0.06302016, + "balance_loss_mlp": 0.01261624, + "epoch": 0.20730497519915828, + "flos": 23082345388800.0, + "grad_norm": 1.7246458475869415, + "language_loss": 0.87330115, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95165813, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.23840332, + "step": 3448, + "time_per_iteration": 2.5367424488067627 + }, + { + "auxiliary_loss_clip": 0.06547908, + "auxiliary_loss_mlp": 0.01287375, + "balance_loss_clip": 0.06301224, + "balance_loss_mlp": 0.0126407, + "epoch": 0.20736509845182624, + "flos": 21075368394240.0, + "grad_norm": 2.238353970842136, + "language_loss": 0.75934261, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.83769548, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23291016, + "step": 3449, + "time_per_iteration": 3.94480562210083 + }, + { + "auxiliary_loss_clip": 0.06557415, + "auxiliary_loss_mlp": 0.01291462, + "balance_loss_clip": 0.06305711, + "balance_loss_mlp": 0.01267262, + "epoch": 0.2074252217044942, + "flos": 17638021077120.0, + "grad_norm": 1.9890451191355467, + "language_loss": 0.77508813, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.8535769, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24243164, + "step": 3450, + "time_per_iteration": 2.545430898666382 + }, + { + "auxiliary_loss_clip": 0.06561074, + "auxiliary_loss_mlp": 0.01294493, + "balance_loss_clip": 0.06309673, + "balance_loss_mlp": 0.01270579, + "epoch": 0.20748534495716217, + "flos": 23553209808000.0, + "grad_norm": 2.274256725147599, + "language_loss": 0.823879, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90243471, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23913574, + "step": 3451, + "time_per_iteration": 4.003388404846191 + }, + { + "auxiliary_loss_clip": 0.0640305, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 0.06273949, + "balance_loss_mlp": 0.01248494, + "epoch": 0.20754546820983016, + "flos": 52268666757120.0, + "grad_norm": 0.7675919354914552, + "language_loss": 0.56549037, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.64206523, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05941772, + "step": 3452, + "time_per_iteration": 3.0660083293914795 + }, + { + "auxiliary_loss_clip": 0.06557937, + "auxiliary_loss_mlp": 0.01287582, + "balance_loss_clip": 0.06309802, + "balance_loss_mlp": 0.01264956, + "epoch": 0.20760559146249813, + "flos": 20418609692160.0, + "grad_norm": 1.8872949255610445, + "language_loss": 0.88967919, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9681344, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.22619629, + "step": 3453, + "time_per_iteration": 2.581430673599243 + }, + { + "auxiliary_loss_clip": 0.06554953, + "auxiliary_loss_mlp": 0.01287205, + "balance_loss_clip": 0.06307904, + "balance_loss_mlp": 0.01263256, + "epoch": 0.2076657147151661, + "flos": 23192825397120.0, + "grad_norm": 1.4776896143180385, + "language_loss": 0.80720532, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88562691, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23962402, + "step": 3454, + "time_per_iteration": 2.5793018341064453 + }, + { + "auxiliary_loss_clip": 0.06549348, + "auxiliary_loss_mlp": 0.01286388, + "balance_loss_clip": 0.06301847, + "balance_loss_mlp": 0.01263476, + "epoch": 0.20772583796783406, + "flos": 18298595139840.0, + "grad_norm": 4.241833159654324, + "language_loss": 0.78446364, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.86282104, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.22912598, + "step": 3455, + "time_per_iteration": 2.5377535820007324 + }, + { + "auxiliary_loss_clip": 0.0654678, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01256547, + "epoch": 0.20778596122050202, + "flos": 17608531639680.0, + "grad_norm": 1.6321737814924744, + "language_loss": 0.81251496, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.89077407, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22595215, + "step": 3456, + "time_per_iteration": 2.5125768184661865 + }, + { + "auxiliary_loss_clip": 0.06554688, + "auxiliary_loss_mlp": 0.01282924, + "balance_loss_clip": 0.06304802, + "balance_loss_mlp": 0.01259893, + "epoch": 0.20784608447317, + "flos": 23812380086400.0, + "grad_norm": 2.3276439316102695, + "language_loss": 0.79071975, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.86909586, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.23022461, + "step": 3457, + "time_per_iteration": 5.41590428352356 + }, + { + "auxiliary_loss_clip": 0.06553855, + "auxiliary_loss_mlp": 0.01279092, + "balance_loss_clip": 0.0630386, + "balance_loss_mlp": 0.01255739, + "epoch": 0.20790620772583795, + "flos": 17645022892800.0, + "grad_norm": 1.9963286729709264, + "language_loss": 0.84664595, + "learning_rate": 3.677068867939333e-06, + "loss": 0.9249754, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23364258, + "step": 3458, + "time_per_iteration": 2.610107183456421 + }, + { + "auxiliary_loss_clip": 0.06541788, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06299603, + "balance_loss_mlp": 0.01254289, + "epoch": 0.20796633097850595, + "flos": 27680997968640.0, + "grad_norm": 1.7522329071194311, + "language_loss": 0.76853168, + "learning_rate": 3.676856638489272e-06, + "loss": 0.8467201, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.2277832, + "step": 3459, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.06543219, + "auxiliary_loss_mlp": 0.01279579, + "balance_loss_clip": 0.06299554, + "balance_loss_mlp": 0.01257024, + "epoch": 0.2080264542311739, + "flos": 19251770060160.0, + "grad_norm": 1.8057193688460893, + "language_loss": 0.77803749, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.85626543, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22570801, + "step": 3460, + "time_per_iteration": 2.5500359535217285 + }, + { + "auxiliary_loss_clip": 0.06544735, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06297737, + "balance_loss_mlp": 0.01255315, + "epoch": 0.20808657748384188, + "flos": 27533146239360.0, + "grad_norm": 1.865214089074118, + "language_loss": 0.76152873, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.8397454, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21618652, + "step": 3461, + "time_per_iteration": 2.575975179672241 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.01279751, + "balance_loss_clip": 0.06301013, + "balance_loss_mlp": 0.01256183, + "epoch": 0.20814670073650984, + "flos": 26914262382720.0, + "grad_norm": 2.229402903272821, + "language_loss": 0.89438462, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97273135, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23571777, + "step": 3462, + "time_per_iteration": 2.5732173919677734 + }, + { + "auxiliary_loss_clip": 0.06402825, + "auxiliary_loss_mlp": 0.01283843, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01278395, + "epoch": 0.2082068239891778, + "flos": 70195850674560.0, + "grad_norm": 0.9150130859854356, + "language_loss": 0.59001637, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.66688299, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.05456543, + "step": 3463, + "time_per_iteration": 3.269202709197998 + }, + { + "auxiliary_loss_clip": 0.06550549, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01257929, + "epoch": 0.20826694724184577, + "flos": 24614978019840.0, + "grad_norm": 2.6522237220698663, + "language_loss": 0.66949397, + "learning_rate": 3.675794537601429e-06, + "loss": 0.74782729, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.2487793, + "step": 3464, + "time_per_iteration": 2.5638158321380615 + }, + { + "auxiliary_loss_clip": 0.06556059, + "auxiliary_loss_mlp": 0.01287892, + "balance_loss_clip": 0.06307128, + "balance_loss_mlp": 0.01263299, + "epoch": 0.20832707049451377, + "flos": 12897218845440.0, + "grad_norm": 2.2476817474527913, + "language_loss": 0.84321886, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.9216584, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.24609375, + "step": 3465, + "time_per_iteration": 2.5794646739959717 + }, + { + "auxiliary_loss_clip": 0.06542073, + "auxiliary_loss_mlp": 0.01282156, + "balance_loss_clip": 0.06295872, + "balance_loss_mlp": 0.01258326, + "epoch": 0.20838719374718173, + "flos": 22205129794560.0, + "grad_norm": 3.281235222185926, + "language_loss": 0.82741451, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.90565681, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.23828125, + "step": 3466, + "time_per_iteration": 2.540011405944824 + }, + { + "auxiliary_loss_clip": 0.06540319, + "auxiliary_loss_mlp": 0.01287937, + "balance_loss_clip": 0.06300111, + "balance_loss_mlp": 0.01267243, + "epoch": 0.2084473169998497, + "flos": 15164036951040.0, + "grad_norm": 2.490655035944783, + "language_loss": 0.82892549, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90720803, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.20690918, + "step": 3467, + "time_per_iteration": 2.54622745513916 + }, + { + "auxiliary_loss_clip": 0.06540733, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 0.06303266, + "balance_loss_mlp": 0.01268167, + "epoch": 0.20850744025251766, + "flos": 17462482773120.0, + "grad_norm": 1.8114532422505003, + "language_loss": 0.82299387, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90129268, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2097168, + "step": 3468, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06553383, + "auxiliary_loss_mlp": 0.01280357, + "balance_loss_clip": 0.06302625, + "balance_loss_mlp": 0.01257158, + "epoch": 0.20856756350518563, + "flos": 25705439055360.0, + "grad_norm": 1.667306072143411, + "language_loss": 0.9042781, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98261553, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23217773, + "step": 3469, + "time_per_iteration": 2.6107866764068604 + }, + { + "auxiliary_loss_clip": 0.0655106, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 0.06308927, + "balance_loss_mlp": 0.01259872, + "epoch": 0.2086276867578536, + "flos": 37898213425920.0, + "grad_norm": 1.9476878714472061, + "language_loss": 0.77294397, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85127008, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21679688, + "step": 3470, + "time_per_iteration": 2.7083425521850586 + }, + { + "auxiliary_loss_clip": 0.06547298, + "auxiliary_loss_mlp": 0.01289218, + "balance_loss_clip": 0.06307482, + "balance_loss_mlp": 0.01266283, + "epoch": 0.20868781001052156, + "flos": 25564169871360.0, + "grad_norm": 1.8036684586339249, + "language_loss": 0.76289082, + "learning_rate": 3.674304927640011e-06, + "loss": 0.84125602, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.22937012, + "step": 3471, + "time_per_iteration": 2.589884042739868 + }, + { + "auxiliary_loss_clip": 0.06554438, + "auxiliary_loss_mlp": 0.01280867, + "balance_loss_clip": 0.06303854, + "balance_loss_mlp": 0.01259028, + "epoch": 0.20874793326318955, + "flos": 27536961600000.0, + "grad_norm": 1.6381609540737498, + "language_loss": 0.76341867, + "learning_rate": 3.67409187219312e-06, + "loss": 0.84177172, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.21813965, + "step": 3472, + "time_per_iteration": 2.610260009765625 + }, + { + "auxiliary_loss_clip": 0.06544036, + "auxiliary_loss_mlp": 0.01279562, + "balance_loss_clip": 0.06302247, + "balance_loss_mlp": 0.01259022, + "epoch": 0.20880805651585752, + "flos": 18554243546880.0, + "grad_norm": 2.073955911698539, + "language_loss": 0.85418117, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93241715, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.20532227, + "step": 3473, + "time_per_iteration": 2.5741372108459473 + }, + { + "auxiliary_loss_clip": 0.06431094, + "auxiliary_loss_mlp": 0.01255526, + "balance_loss_clip": 0.06305239, + "balance_loss_mlp": 0.01250132, + "epoch": 0.20886817976852548, + "flos": 65966596819200.0, + "grad_norm": 0.8661888314681573, + "language_loss": 0.63746876, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.71433502, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.05401611, + "step": 3474, + "time_per_iteration": 3.061617612838745 + }, + { + "auxiliary_loss_clip": 0.06545534, + "auxiliary_loss_mlp": 0.01278543, + "balance_loss_clip": 0.06299987, + "balance_loss_mlp": 0.01255751, + "epoch": 0.20892830302119345, + "flos": 36548120914560.0, + "grad_norm": 1.9594452651536962, + "language_loss": 0.70746702, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.78570777, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22790527, + "step": 3475, + "time_per_iteration": 2.7295854091644287 + }, + { + "auxiliary_loss_clip": 0.06544538, + "auxiliary_loss_mlp": 0.01277403, + "balance_loss_clip": 0.06299123, + "balance_loss_mlp": 0.01255754, + "epoch": 0.2089884262738614, + "flos": 20962582398720.0, + "grad_norm": 1.6086426160627472, + "language_loss": 0.70801485, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21643066, + "step": 3476, + "time_per_iteration": 2.6065874099731445 + }, + { + "auxiliary_loss_clip": 0.06538086, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06299278, + "balance_loss_mlp": 0.0125523, + "epoch": 0.20904854952652938, + "flos": 22790666926080.0, + "grad_norm": 1.9785394209574967, + "language_loss": 0.90003526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9781692, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.20080566, + "step": 3477, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.06542666, + "auxiliary_loss_mlp": 0.01278801, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257594, + "epoch": 0.20910867277919734, + "flos": 27309838308480.0, + "grad_norm": 2.554960999675803, + "language_loss": 0.69433093, + "learning_rate": 3.672812206678344e-06, + "loss": 0.77254558, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.21203613, + "step": 3478, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.0654031, + "auxiliary_loss_mlp": 0.01282288, + "balance_loss_clip": 0.06298592, + "balance_loss_mlp": 0.01260461, + "epoch": 0.20916879603186533, + "flos": 14324444640000.0, + "grad_norm": 1.9959140715838508, + "language_loss": 0.85550553, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93373156, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21813965, + "step": 3479, + "time_per_iteration": 2.5808637142181396 + }, + { + "auxiliary_loss_clip": 0.06542581, + "auxiliary_loss_mlp": 0.01279649, + "balance_loss_clip": 0.06299447, + "balance_loss_mlp": 0.01258072, + "epoch": 0.2092289192845333, + "flos": 22279537820160.0, + "grad_norm": 2.3833241848820372, + "language_loss": 0.75129831, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.82952058, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21569824, + "step": 3480, + "time_per_iteration": 2.519789218902588 + }, + { + "auxiliary_loss_clip": 0.06546038, + "auxiliary_loss_mlp": 0.01278892, + "balance_loss_clip": 0.06306421, + "balance_loss_mlp": 0.01258495, + "epoch": 0.20928904253720126, + "flos": 14836118797440.0, + "grad_norm": 2.1621149118450163, + "language_loss": 0.7689389, + "learning_rate": 3.67217151746346e-06, + "loss": 0.84718817, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20410156, + "step": 3481, + "time_per_iteration": 2.541019916534424 + }, + { + "auxiliary_loss_clip": 0.06542054, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.06299154, + "balance_loss_mlp": 0.01257718, + "epoch": 0.20934916578986923, + "flos": 23266017538560.0, + "grad_norm": 1.9029543431357738, + "language_loss": 0.85756385, + "learning_rate": 3.671957827563209e-06, + "loss": 0.93578184, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.22021484, + "step": 3482, + "time_per_iteration": 2.57550048828125 + }, + { + "auxiliary_loss_clip": 0.06538534, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 0.0629866, + "balance_loss_mlp": 0.01260237, + "epoch": 0.2094092890425372, + "flos": 32022492768000.0, + "grad_norm": 2.0122422455266076, + "language_loss": 0.71876764, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.79696846, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.21325684, + "step": 3483, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.06543796, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125567, + "epoch": 0.20946941229520516, + "flos": 20016744710400.0, + "grad_norm": 1.623254768822543, + "language_loss": 0.75620067, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.83441281, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21728516, + "step": 3484, + "time_per_iteration": 2.537745714187622 + }, + { + "auxiliary_loss_clip": 0.06537648, + "auxiliary_loss_mlp": 0.01274667, + "balance_loss_clip": 0.0629506, + "balance_loss_mlp": 0.01252733, + "epoch": 0.20952953554787315, + "flos": 30748401509760.0, + "grad_norm": 1.6710062021876058, + "language_loss": 0.71473777, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.79286093, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21936035, + "step": 3485, + "time_per_iteration": 2.6310439109802246 + }, + { + "auxiliary_loss_clip": 0.0654947, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06304678, + "balance_loss_mlp": 0.01258517, + "epoch": 0.20958965880054112, + "flos": 27055950837120.0, + "grad_norm": 1.7793136829828902, + "language_loss": 0.83105123, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90936482, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23376465, + "step": 3486, + "time_per_iteration": 2.5819222927093506 + }, + { + "auxiliary_loss_clip": 0.06539689, + "auxiliary_loss_mlp": 0.01279221, + "balance_loss_clip": 0.06297638, + "balance_loss_mlp": 0.01257978, + "epoch": 0.20964978205320908, + "flos": 34212680714880.0, + "grad_norm": 2.582218695391969, + "language_loss": 0.87821579, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.95640486, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21240234, + "step": 3487, + "time_per_iteration": 2.639369487762451 + }, + { + "auxiliary_loss_clip": 0.06538714, + "auxiliary_loss_mlp": 0.01279661, + "balance_loss_clip": 0.06298582, + "balance_loss_mlp": 0.01258227, + "epoch": 0.20970990530587705, + "flos": 23484168443520.0, + "grad_norm": 2.287931950731532, + "language_loss": 0.72719586, + "learning_rate": 3.670674357028504e-06, + "loss": 0.80537963, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21411133, + "step": 3488, + "time_per_iteration": 3.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.06540683, + "auxiliary_loss_mlp": 0.01275293, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01255123, + "epoch": 0.209770028558545, + "flos": 18557346147840.0, + "grad_norm": 2.67396224290917, + "language_loss": 0.81189376, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89005351, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20178223, + "step": 3489, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.0654545, + "auxiliary_loss_mlp": 0.01278304, + "balance_loss_clip": 0.06303608, + "balance_loss_mlp": 0.0125724, + "epoch": 0.20983015181121298, + "flos": 21623533804800.0, + "grad_norm": 2.0567102060198743, + "language_loss": 0.73407692, + "learning_rate": 3.670246026613266e-06, + "loss": 0.81231445, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21057129, + "step": 3490, + "time_per_iteration": 2.5622947216033936 + }, + { + "auxiliary_loss_clip": 0.06534347, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06300151, + "balance_loss_mlp": 0.01260128, + "epoch": 0.20989027506388094, + "flos": 16619787861120.0, + "grad_norm": 1.7677892351641744, + "language_loss": 0.71503973, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.7931931, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20849609, + "step": 3491, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.06542461, + "auxiliary_loss_mlp": 0.01283797, + "balance_loss_clip": 0.0629908, + "balance_loss_mlp": 0.01260957, + "epoch": 0.20995039831654894, + "flos": 23222692177920.0, + "grad_norm": 2.702657778988086, + "language_loss": 0.80329478, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88155735, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22839355, + "step": 3492, + "time_per_iteration": 2.5376975536346436 + }, + { + "auxiliary_loss_clip": 0.06546506, + "auxiliary_loss_mlp": 0.01283519, + "balance_loss_clip": 0.06307527, + "balance_loss_mlp": 0.01262741, + "epoch": 0.2100105215692169, + "flos": 18152881689600.0, + "grad_norm": 1.9319737068083613, + "language_loss": 0.87613726, + "learning_rate": 3.669603055991502e-06, + "loss": 0.95443749, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20800781, + "step": 3493, + "time_per_iteration": 2.5462660789489746 + }, + { + "auxiliary_loss_clip": 0.06538918, + "auxiliary_loss_mlp": 0.01283808, + "balance_loss_clip": 0.06303683, + "balance_loss_mlp": 0.01262673, + "epoch": 0.21007064482188487, + "flos": 15967179936000.0, + "grad_norm": 1.7380368048158776, + "language_loss": 0.69753766, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.77576494, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.21130371, + "step": 3494, + "time_per_iteration": 2.523575782775879 + }, + { + "auxiliary_loss_clip": 0.0654956, + "auxiliary_loss_mlp": 0.0128408, + "balance_loss_clip": 0.06306064, + "balance_loss_mlp": 0.01262598, + "epoch": 0.21013076807455283, + "flos": 32242614243840.0, + "grad_norm": 1.6795437076377473, + "language_loss": 0.79639518, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.87473154, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21472168, + "step": 3495, + "time_per_iteration": 2.679564952850342 + }, + { + "auxiliary_loss_clip": 0.06543255, + "auxiliary_loss_mlp": 0.01280683, + "balance_loss_clip": 0.06300748, + "balance_loss_mlp": 0.01258832, + "epoch": 0.2101908913272208, + "flos": 23703493305600.0, + "grad_norm": 2.110842443067005, + "language_loss": 0.77733672, + "learning_rate": 3.668959515566116e-06, + "loss": 0.85557616, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21862793, + "step": 3496, + "time_per_iteration": 2.5728261470794678 + }, + { + "auxiliary_loss_clip": 0.06546371, + "auxiliary_loss_mlp": 0.01280297, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257993, + "epoch": 0.21025101457988876, + "flos": 20381992657920.0, + "grad_norm": 2.1840810602746643, + "language_loss": 0.82214069, + "learning_rate": 3.668744875505915e-06, + "loss": 0.90040743, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22302246, + "step": 3497, + "time_per_iteration": 5.435751438140869 + }, + { + "auxiliary_loss_clip": 0.06554863, + "auxiliary_loss_mlp": 0.01281759, + "balance_loss_clip": 0.06307989, + "balance_loss_mlp": 0.01259205, + "epoch": 0.21031113783255675, + "flos": 25782740046720.0, + "grad_norm": 1.9653925911520136, + "language_loss": 0.68009126, + "learning_rate": 3.668530172166741e-06, + "loss": 0.75845742, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3498, + "time_per_iteration": 2.6047511100769043 + }, + { + "auxiliary_loss_clip": 0.06550896, + "auxiliary_loss_mlp": 0.01291723, + "balance_loss_clip": 0.06304521, + "balance_loss_mlp": 0.01269789, + "epoch": 0.21037126108522472, + "flos": 22024769880960.0, + "grad_norm": 1.5964372308761317, + "language_loss": 0.81248403, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.89091027, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21948242, + "step": 3499, + "time_per_iteration": 2.5279107093811035 + }, + { + "auxiliary_loss_clip": 0.06537838, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 0.06300277, + "balance_loss_mlp": 0.01257911, + "epoch": 0.21043138433789269, + "flos": 25340861940480.0, + "grad_norm": 2.3111316875342274, + "language_loss": 0.78733355, + "learning_rate": 3.668100575684043e-06, + "loss": 0.86549306, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20214844, + "step": 3500, + "time_per_iteration": 2.5789358615875244 + }, + { + "auxiliary_loss_clip": 0.06548081, + "auxiliary_loss_mlp": 0.01281815, + "balance_loss_clip": 0.06307902, + "balance_loss_mlp": 0.01259809, + "epoch": 0.21049150759056065, + "flos": 25563708673920.0, + "grad_norm": 1.5222387073827752, + "language_loss": 0.74519855, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82349753, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.22021484, + "step": 3501, + "time_per_iteration": 2.5740344524383545 + }, + { + "auxiliary_loss_clip": 0.06532234, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06293183, + "balance_loss_mlp": 0.01258521, + "epoch": 0.21055163084322862, + "flos": 24501982389120.0, + "grad_norm": 1.5726278305934103, + "language_loss": 0.75732303, + "learning_rate": 3.667670726183183e-06, + "loss": 0.83544195, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.21142578, + "step": 3502, + "time_per_iteration": 2.564650535583496 + }, + { + "auxiliary_loss_clip": 0.06532737, + "auxiliary_loss_mlp": 0.01282141, + "balance_loss_clip": 0.06294994, + "balance_loss_mlp": 0.01260731, + "epoch": 0.21061175409589658, + "flos": 25746123012480.0, + "grad_norm": 2.0578640076956165, + "language_loss": 0.78642297, + "learning_rate": 3.667455706571316e-06, + "loss": 0.86457181, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.21411133, + "step": 3503, + "time_per_iteration": 2.5651087760925293 + }, + { + "auxiliary_loss_clip": 0.06548393, + "auxiliary_loss_mlp": 0.01287579, + "balance_loss_clip": 0.06300595, + "balance_loss_mlp": 0.01262426, + "epoch": 0.21067187734856455, + "flos": 18995115404160.0, + "grad_norm": 2.3829290271278363, + "language_loss": 0.79109055, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.86945021, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.25134277, + "step": 3504, + "time_per_iteration": 2.5907576084136963 + }, + { + "auxiliary_loss_clip": 0.06540846, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 0.06295908, + "balance_loss_mlp": 0.012561, + "epoch": 0.21073200060123254, + "flos": 24688337869440.0, + "grad_norm": 2.6276986020802386, + "language_loss": 0.77414715, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.85233212, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.21557617, + "step": 3505, + "time_per_iteration": 2.564504861831665 + }, + { + "auxiliary_loss_clip": 0.06529057, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06294015, + "balance_loss_mlp": 0.01257186, + "epoch": 0.2107921238539005, + "flos": 28557039605760.0, + "grad_norm": 2.0513581673642434, + "language_loss": 0.64351165, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.721578, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.20397949, + "step": 3506, + "time_per_iteration": 2.641390323638916 + }, + { + "auxiliary_loss_clip": 0.06535215, + "auxiliary_loss_mlp": 0.01278768, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01257656, + "epoch": 0.21085224710656847, + "flos": 25893094273920.0, + "grad_norm": 2.3889311598286436, + "language_loss": 0.82716179, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90530163, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21105957, + "step": 3507, + "time_per_iteration": 2.5718142986297607 + }, + { + "auxiliary_loss_clip": 0.06534198, + "auxiliary_loss_mlp": 0.01280018, + "balance_loss_clip": 0.06294642, + "balance_loss_mlp": 0.0125769, + "epoch": 0.21091237035923643, + "flos": 14981664539520.0, + "grad_norm": 1.9856074738329712, + "language_loss": 0.76547742, + "learning_rate": 3.666379660223824e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22338867, + "step": 3508, + "time_per_iteration": 2.5104117393493652 + }, + { + "auxiliary_loss_clip": 0.06543706, + "auxiliary_loss_mlp": 0.01282498, + "balance_loss_clip": 0.06299506, + "balance_loss_mlp": 0.01261159, + "epoch": 0.2109724936119044, + "flos": 16368080595840.0, + "grad_norm": 2.529935640705384, + "language_loss": 0.86242574, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.94068778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.21325684, + "step": 3509, + "time_per_iteration": 2.508370876312256 + }, + { + "auxiliary_loss_clip": 0.06541994, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 0.06295836, + "balance_loss_mlp": 0.01258679, + "epoch": 0.21103261686457236, + "flos": 31510315486080.0, + "grad_norm": 1.7053981088389916, + "language_loss": 0.68853724, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.76676404, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22009277, + "step": 3510, + "time_per_iteration": 2.6452746391296387 + }, + { + "auxiliary_loss_clip": 0.06542882, + "auxiliary_loss_mlp": 0.01284418, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01263259, + "epoch": 0.21109274011724033, + "flos": 27351360806400.0, + "grad_norm": 1.7932280077203222, + "language_loss": 0.7352736, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.8135466, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.21154785, + "step": 3511, + "time_per_iteration": 2.6538095474243164 + }, + { + "auxiliary_loss_clip": 0.06553793, + "auxiliary_loss_mlp": 0.01288613, + "balance_loss_clip": 0.06308056, + "balance_loss_mlp": 0.01265546, + "epoch": 0.21115286336990832, + "flos": 17825927857920.0, + "grad_norm": 2.4490749473958577, + "language_loss": 0.70309734, + "learning_rate": 3.665517685689794e-06, + "loss": 0.78152132, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.23071289, + "step": 3512, + "time_per_iteration": 2.5178020000457764 + }, + { + "auxiliary_loss_clip": 0.06542063, + "auxiliary_loss_mlp": 0.01280138, + "balance_loss_clip": 0.06299283, + "balance_loss_mlp": 0.01257739, + "epoch": 0.2112129866225763, + "flos": 27205228085760.0, + "grad_norm": 1.580176351931222, + "language_loss": 0.73930323, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81752527, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22412109, + "step": 3513, + "time_per_iteration": 2.62662410736084 + }, + { + "auxiliary_loss_clip": 0.06537203, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 0.06301522, + "balance_loss_mlp": 0.01260303, + "epoch": 0.21127310987524425, + "flos": 23737846279680.0, + "grad_norm": 1.7494748899805272, + "language_loss": 0.75353736, + "learning_rate": 3.665086319450502e-06, + "loss": 0.8317222, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20983887, + "step": 3514, + "time_per_iteration": 2.584502696990967 + }, + { + "auxiliary_loss_clip": 0.06546184, + "auxiliary_loss_mlp": 0.01281455, + "balance_loss_clip": 0.06301809, + "balance_loss_mlp": 0.01261309, + "epoch": 0.21133323312791222, + "flos": 18338356702080.0, + "grad_norm": 1.6761924057980855, + "language_loss": 0.77322358, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.85149997, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20141602, + "step": 3515, + "time_per_iteration": 2.552231550216675 + }, + { + "auxiliary_loss_clip": 0.06544478, + "auxiliary_loss_mlp": 0.0128088, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.01260865, + "epoch": 0.21139335638058018, + "flos": 17936994844800.0, + "grad_norm": 2.0687526262765212, + "language_loss": 0.69083852, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.76909214, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19995117, + "step": 3516, + "time_per_iteration": 2.535282611846924 + }, + { + "auxiliary_loss_clip": 0.0654862, + "auxiliary_loss_mlp": 0.01279905, + "balance_loss_clip": 0.0630609, + "balance_loss_mlp": 0.01257756, + "epoch": 0.21145347963324815, + "flos": 24579073745280.0, + "grad_norm": 1.818548989117399, + "language_loss": 0.85523438, + "learning_rate": 3.664438796560225e-06, + "loss": 0.93351966, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.22155762, + "step": 3517, + "time_per_iteration": 2.5862202644348145 + }, + { + "auxiliary_loss_clip": 0.06554718, + "auxiliary_loss_mlp": 0.01280908, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01260368, + "epoch": 0.21151360288591614, + "flos": 35854787105280.0, + "grad_norm": 2.178791897783965, + "language_loss": 0.6333189, + "learning_rate": 3.664222829354512e-06, + "loss": 0.71167523, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.20532227, + "step": 3518, + "time_per_iteration": 2.6618587970733643 + }, + { + "auxiliary_loss_clip": 0.0654604, + "auxiliary_loss_mlp": 0.0129195, + "balance_loss_clip": 0.06306089, + "balance_loss_mlp": 0.01271625, + "epoch": 0.2115737261385841, + "flos": 24647989328640.0, + "grad_norm": 1.8588369306942552, + "language_loss": 0.90024757, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97862744, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20336914, + "step": 3519, + "time_per_iteration": 2.5962281227111816 + }, + { + "auxiliary_loss_clip": 0.06553498, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01268945, + "epoch": 0.21163384939125207, + "flos": 25233652241280.0, + "grad_norm": 1.74321759448714, + "language_loss": 0.81933582, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.89777905, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.21862793, + "step": 3520, + "time_per_iteration": 2.6036746501922607 + }, + { + "auxiliary_loss_clip": 0.06544603, + "auxiliary_loss_mlp": 0.0127827, + "balance_loss_clip": 0.0630887, + "balance_loss_mlp": 0.01257576, + "epoch": 0.21169397264392004, + "flos": 26074670071680.0, + "grad_norm": 1.5989262406015683, + "language_loss": 0.76731956, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.84554833, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20690918, + "step": 3521, + "time_per_iteration": 2.613945960998535 + }, + { + "auxiliary_loss_clip": 0.06548078, + "auxiliary_loss_mlp": 0.01281462, + "balance_loss_clip": 0.06310651, + "balance_loss_mlp": 0.01261364, + "epoch": 0.211754095896588, + "flos": 23114266594560.0, + "grad_norm": 2.104686387571933, + "language_loss": 0.75886559, + "learning_rate": 3.663358329538626e-06, + "loss": 0.83716094, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.20092773, + "step": 3522, + "time_per_iteration": 2.530388355255127 + }, + { + "auxiliary_loss_clip": 0.06550008, + "auxiliary_loss_mlp": 0.01276271, + "balance_loss_clip": 0.06309568, + "balance_loss_mlp": 0.01255994, + "epoch": 0.21181421914925597, + "flos": 27928806019200.0, + "grad_norm": 2.55069435165465, + "language_loss": 0.71218652, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79044926, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.20288086, + "step": 3523, + "time_per_iteration": 2.6448264122009277 + }, + { + "auxiliary_loss_clip": 0.06544726, + "auxiliary_loss_mlp": 0.01276969, + "balance_loss_clip": 0.06308427, + "balance_loss_mlp": 0.01256191, + "epoch": 0.21187434240192393, + "flos": 17134313057280.0, + "grad_norm": 2.0846198886990566, + "language_loss": 0.77930927, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8575263, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20788574, + "step": 3524, + "time_per_iteration": 2.527096748352051 + }, + { + "auxiliary_loss_clip": 0.06557429, + "auxiliary_loss_mlp": 0.01277075, + "balance_loss_clip": 0.0631334, + "balance_loss_mlp": 0.01255045, + "epoch": 0.21193446565459192, + "flos": 22354071626880.0, + "grad_norm": 2.138137470282545, + "language_loss": 0.82111794, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89946306, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22033691, + "step": 3525, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.06547971, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 0.06308704, + "balance_loss_mlp": 0.01254519, + "epoch": 0.2119945889072599, + "flos": 27206779386240.0, + "grad_norm": 1.7514877674009408, + "language_loss": 0.75671291, + "learning_rate": 3.662492820527356e-06, + "loss": 0.83494115, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20324707, + "step": 3526, + "time_per_iteration": 2.56286883354187 + }, + { + "auxiliary_loss_clip": 0.06556675, + "auxiliary_loss_mlp": 0.01279028, + "balance_loss_clip": 0.0631361, + "balance_loss_mlp": 0.01258107, + "epoch": 0.21205471215992786, + "flos": 20997480424320.0, + "grad_norm": 1.9989732630407808, + "language_loss": 0.77276337, + "learning_rate": 3.662276285649284e-06, + "loss": 0.85112035, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.20910645, + "step": 3527, + "time_per_iteration": 2.7162973880767822 + }, + { + "auxiliary_loss_clip": 0.06551696, + "auxiliary_loss_mlp": 0.01279873, + "balance_loss_clip": 0.06314081, + "balance_loss_mlp": 0.01258224, + "epoch": 0.21211483541259582, + "flos": 20784025347840.0, + "grad_norm": 2.0427089539116783, + "language_loss": 0.78184944, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86016512, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21643066, + "step": 3528, + "time_per_iteration": 3.990530490875244 + }, + { + "auxiliary_loss_clip": 0.06551792, + "auxiliary_loss_mlp": 0.01277875, + "balance_loss_clip": 0.06313196, + "balance_loss_mlp": 0.01257025, + "epoch": 0.21217495866526379, + "flos": 18996079726080.0, + "grad_norm": 1.942993331862389, + "language_loss": 0.82054245, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89883912, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20861816, + "step": 3529, + "time_per_iteration": 2.564383029937744 + }, + { + "auxiliary_loss_clip": 0.06555474, + "auxiliary_loss_mlp": 0.01278138, + "balance_loss_clip": 0.06313926, + "balance_loss_mlp": 0.01257134, + "epoch": 0.21223508191793175, + "flos": 20673503412480.0, + "grad_norm": 2.2777790477523236, + "language_loss": 0.77694297, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.85527909, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21008301, + "step": 3530, + "time_per_iteration": 2.576662540435791 + }, + { + "auxiliary_loss_clip": 0.06550869, + "auxiliary_loss_mlp": 0.01274157, + "balance_loss_clip": 0.06314521, + "balance_loss_mlp": 0.01254106, + "epoch": 0.21229520517059972, + "flos": 21622904899200.0, + "grad_norm": 2.3150689342230644, + "language_loss": 0.83926791, + "learning_rate": 3.661409515882308e-06, + "loss": 0.91751814, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20043945, + "step": 3531, + "time_per_iteration": 4.092180252075195 + }, + { + "auxiliary_loss_clip": 0.06553733, + "auxiliary_loss_mlp": 0.01280648, + "balance_loss_clip": 0.06313696, + "balance_loss_mlp": 0.0125888, + "epoch": 0.2123553284232677, + "flos": 13996232997120.0, + "grad_norm": 2.2553338764718145, + "language_loss": 0.74256229, + "learning_rate": 3.661192665917977e-06, + "loss": 0.82090604, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21777344, + "step": 3532, + "time_per_iteration": 2.5215070247650146 + }, + { + "auxiliary_loss_clip": 0.06549011, + "auxiliary_loss_mlp": 0.01276957, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.01255714, + "epoch": 0.21241545167593567, + "flos": 18302745916800.0, + "grad_norm": 1.8963653738624293, + "language_loss": 0.74378759, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82204729, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21252441, + "step": 3533, + "time_per_iteration": 2.5286645889282227 + }, + { + "auxiliary_loss_clip": 0.06554842, + "auxiliary_loss_mlp": 0.01279741, + "balance_loss_clip": 0.06312128, + "balance_loss_mlp": 0.01257341, + "epoch": 0.21247557492860364, + "flos": 34721461906560.0, + "grad_norm": 1.8118406193913599, + "language_loss": 0.71620667, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79455251, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22399902, + "step": 3534, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.06548804, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01262586, + "epoch": 0.2125356981812716, + "flos": 22060254885120.0, + "grad_norm": 2.3502862502903046, + "language_loss": 0.72866982, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.80699402, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21032715, + "step": 3535, + "time_per_iteration": 2.5843448638916016 + }, + { + "auxiliary_loss_clip": 0.06546953, + "auxiliary_loss_mlp": 0.01279722, + "balance_loss_clip": 0.06307133, + "balance_loss_mlp": 0.01257621, + "epoch": 0.21259582143393957, + "flos": 28555865648640.0, + "grad_norm": 2.199655139190772, + "language_loss": 0.70759106, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7858578, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22106934, + "step": 3536, + "time_per_iteration": 4.056318998336792 + }, + { + "auxiliary_loss_clip": 0.06557733, + "auxiliary_loss_mlp": 0.01286072, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.0126415, + "epoch": 0.21265594468660753, + "flos": 20127140864640.0, + "grad_norm": 2.2134041941920897, + "language_loss": 0.8820163, + "learning_rate": 3.660107471371981e-06, + "loss": 0.96045434, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.21911621, + "step": 3537, + "time_per_iteration": 2.6233468055725098 + }, + { + "auxiliary_loss_clip": 0.06541121, + "auxiliary_loss_mlp": 0.01278147, + "balance_loss_clip": 0.06304413, + "balance_loss_mlp": 0.01256094, + "epoch": 0.21271606793927553, + "flos": 23082890440320.0, + "grad_norm": 1.7848498720134809, + "language_loss": 0.81086004, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88905263, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22058105, + "step": 3538, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.06545715, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 0.06305592, + "balance_loss_mlp": 0.01263981, + "epoch": 0.2127761911919435, + "flos": 26394118963200.0, + "grad_norm": 2.023826748108625, + "language_loss": 0.87817419, + "learning_rate": 3.659672952835863e-06, + "loss": 0.95646858, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.19763184, + "step": 3539, + "time_per_iteration": 2.6115527153015137 + }, + { + "auxiliary_loss_clip": 0.06554011, + "auxiliary_loss_mlp": 0.01284638, + "balance_loss_clip": 0.06309317, + "balance_loss_mlp": 0.01264277, + "epoch": 0.21283631444461146, + "flos": 20234182855680.0, + "grad_norm": 3.1687626880856667, + "language_loss": 0.59144789, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66983438, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20361328, + "step": 3540, + "time_per_iteration": 2.525139570236206 + }, + { + "auxiliary_loss_clip": 0.06543202, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.0630211, + "balance_loss_mlp": 0.01256557, + "epoch": 0.21289643769727942, + "flos": 13522140195840.0, + "grad_norm": 1.940296770056649, + "language_loss": 0.7721082, + "learning_rate": 3.659238182559888e-06, + "loss": 0.85032547, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21972656, + "step": 3541, + "time_per_iteration": 2.563164234161377 + }, + { + "auxiliary_loss_clip": 0.06542824, + "auxiliary_loss_mlp": 0.01283205, + "balance_loss_clip": 0.06305471, + "balance_loss_mlp": 0.01262486, + "epoch": 0.2129565609499474, + "flos": 24833967465600.0, + "grad_norm": 1.7979798329536472, + "language_loss": 0.69596064, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.77422094, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20703125, + "step": 3542, + "time_per_iteration": 2.6213386058807373 + }, + { + "auxiliary_loss_clip": 0.06542216, + "auxiliary_loss_mlp": 0.01284362, + "balance_loss_clip": 0.0630642, + "balance_loss_mlp": 0.01264692, + "epoch": 0.21301668420261535, + "flos": 23665953876480.0, + "grad_norm": 1.8238030340304547, + "language_loss": 0.77012485, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84839058, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19677734, + "step": 3543, + "time_per_iteration": 2.5654232501983643 + }, + { + "auxiliary_loss_clip": 0.0654586, + "auxiliary_loss_mlp": 0.01282767, + "balance_loss_clip": 0.0630815, + "balance_loss_mlp": 0.01261488, + "epoch": 0.21307680745528332, + "flos": 16368416012160.0, + "grad_norm": 2.0315626098903468, + "language_loss": 0.67305464, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75134087, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2130127, + "step": 3544, + "time_per_iteration": 2.513288736343384 + }, + { + "auxiliary_loss_clip": 0.06542834, + "auxiliary_loss_mlp": 0.01284, + "balance_loss_clip": 0.06304078, + "balance_loss_mlp": 0.01264223, + "epoch": 0.2131369307079513, + "flos": 19105092288000.0, + "grad_norm": 1.7034786511890583, + "language_loss": 0.71322483, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.19763184, + "step": 3545, + "time_per_iteration": 2.5347442626953125 + }, + { + "auxiliary_loss_clip": 0.06549121, + "auxiliary_loss_mlp": 0.01288311, + "balance_loss_clip": 0.06306408, + "balance_loss_mlp": 0.01268224, + "epoch": 0.21319705396061928, + "flos": 30380050961280.0, + "grad_norm": 2.304335172733059, + "language_loss": 0.73178399, + "learning_rate": 3.658150155940946e-06, + "loss": 0.81015837, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.20092773, + "step": 3546, + "time_per_iteration": 2.6647720336914062 + }, + { + "auxiliary_loss_clip": 0.0655164, + "auxiliary_loss_mlp": 0.01278696, + "balance_loss_clip": 0.06310475, + "balance_loss_mlp": 0.01258609, + "epoch": 0.21325717721328724, + "flos": 21761616533760.0, + "grad_norm": 1.9338253687785023, + "language_loss": 0.81206107, + "learning_rate": 3.657932361952479e-06, + "loss": 0.89036447, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20092773, + "step": 3547, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06547703, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 0.06302875, + "balance_loss_mlp": 0.01259127, + "epoch": 0.2133173004659552, + "flos": 28738447695360.0, + "grad_norm": 3.206018032759459, + "language_loss": 0.74960929, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22521973, + "step": 3548, + "time_per_iteration": 2.605151414871216 + }, + { + "auxiliary_loss_clip": 0.06554648, + "auxiliary_loss_mlp": 0.01281207, + "balance_loss_clip": 0.06309359, + "balance_loss_mlp": 0.01259236, + "epoch": 0.21337742371862317, + "flos": 16842760375680.0, + "grad_norm": 2.056331081084102, + "language_loss": 0.74889886, + "learning_rate": 3.657496585376922e-06, + "loss": 0.82725745, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21972656, + "step": 3549, + "time_per_iteration": 2.518305540084839 + }, + { + "auxiliary_loss_clip": 0.06547625, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01261278, + "epoch": 0.21343754697129114, + "flos": 24431683213440.0, + "grad_norm": 1.7052192349692608, + "language_loss": 0.8095907, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88787764, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19787598, + "step": 3550, + "time_per_iteration": 2.621840715408325 + }, + { + "auxiliary_loss_clip": 0.06544942, + "auxiliary_loss_mlp": 0.01278049, + "balance_loss_clip": 0.06309815, + "balance_loss_mlp": 0.01258653, + "epoch": 0.21349767022395913, + "flos": 19283271995520.0, + "grad_norm": 1.8011583081598594, + "language_loss": 0.88582718, + "learning_rate": 3.657060557391621e-06, + "loss": 0.96405709, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.19384766, + "step": 3551, + "time_per_iteration": 2.5354909896850586 + }, + { + "auxiliary_loss_clip": 0.06541884, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06304973, + "balance_loss_mlp": 0.01256635, + "epoch": 0.2135577934766271, + "flos": 17353260576000.0, + "grad_norm": 1.8291964059748265, + "language_loss": 0.83669794, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91488564, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20275879, + "step": 3552, + "time_per_iteration": 2.5428099632263184 + }, + { + "auxiliary_loss_clip": 0.06543534, + "auxiliary_loss_mlp": 0.01282655, + "balance_loss_clip": 0.06305505, + "balance_loss_mlp": 0.01261329, + "epoch": 0.21361791672929506, + "flos": 24063416519040.0, + "grad_norm": 1.71251087169846, + "language_loss": 0.77181637, + "learning_rate": 3.656624278062713e-06, + "loss": 0.85007823, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21325684, + "step": 3553, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.06546006, + "auxiliary_loss_mlp": 0.01280965, + "balance_loss_clip": 0.06308904, + "balance_loss_mlp": 0.01260556, + "epoch": 0.21367803998196302, + "flos": 22168596614400.0, + "grad_norm": 1.6386548216082337, + "language_loss": 0.72918522, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.80745488, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20397949, + "step": 3554, + "time_per_iteration": 2.610447883605957 + }, + { + "auxiliary_loss_clip": 0.06543835, + "auxiliary_loss_mlp": 0.01296522, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01276483, + "epoch": 0.213738163234631, + "flos": 20893205617920.0, + "grad_norm": 2.167468133085416, + "language_loss": 0.6838634, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.76226699, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20043945, + "step": 3555, + "time_per_iteration": 2.6348068714141846 + }, + { + "auxiliary_loss_clip": 0.06544648, + "auxiliary_loss_mlp": 0.01283651, + "balance_loss_clip": 0.06303324, + "balance_loss_mlp": 0.01262861, + "epoch": 0.21379828648729896, + "flos": 28410739176960.0, + "grad_norm": 1.8068010568670265, + "language_loss": 0.6581043, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73638725, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.20800781, + "step": 3556, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.06542179, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.01269905, + "epoch": 0.21385840973996692, + "flos": 25486030339200.0, + "grad_norm": 1.6965425102308196, + "language_loss": 0.73263884, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20617676, + "step": 3557, + "time_per_iteration": 2.5850143432617188 + }, + { + "auxiliary_loss_clip": 0.06555384, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06310774, + "balance_loss_mlp": 0.01260814, + "epoch": 0.2139185329926349, + "flos": 28081772847360.0, + "grad_norm": 1.6861756161591135, + "language_loss": 0.67894918, + "learning_rate": 3.655532480546528e-06, + "loss": 0.75732636, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.21508789, + "step": 3558, + "time_per_iteration": 2.6937482357025146 + }, + { + "auxiliary_loss_clip": 0.06554736, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06306359, + "balance_loss_mlp": 0.0125905, + "epoch": 0.21397865624530288, + "flos": 19614628166400.0, + "grad_norm": 2.1418574307637575, + "language_loss": 0.81358159, + "learning_rate": 3.655313932676286e-06, + "loss": 0.89191854, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.19909668, + "step": 3559, + "time_per_iteration": 2.5145814418792725 + }, + { + "auxiliary_loss_clip": 0.06551723, + "auxiliary_loss_mlp": 0.01281472, + "balance_loss_clip": 0.06314635, + "balance_loss_mlp": 0.01262899, + "epoch": 0.21403877949797084, + "flos": 24688463650560.0, + "grad_norm": 1.6715073288493136, + "language_loss": 0.68710625, + "learning_rate": 3.655095322036373e-06, + "loss": 0.7654382, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.18554688, + "step": 3560, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06554615, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 0.0631121, + "balance_loss_mlp": 0.01259313, + "epoch": 0.2140989027506388, + "flos": 19866628920960.0, + "grad_norm": 1.9885830979576231, + "language_loss": 0.73618603, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81452787, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.20263672, + "step": 3561, + "time_per_iteration": 2.5286123752593994 + }, + { + "auxiliary_loss_clip": 0.06553814, + "auxiliary_loss_mlp": 0.01282143, + "balance_loss_clip": 0.06311779, + "balance_loss_mlp": 0.01262402, + "epoch": 0.21415902600330677, + "flos": 19141331978880.0, + "grad_norm": 2.350872095274855, + "language_loss": 0.78756285, + "learning_rate": 3.654657912480698e-06, + "loss": 0.86592233, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.19763184, + "step": 3562, + "time_per_iteration": 2.608041286468506 + }, + { + "auxiliary_loss_clip": 0.06546983, + "auxiliary_loss_mlp": 0.01281911, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01261788, + "epoch": 0.21421914925597474, + "flos": 22279076622720.0, + "grad_norm": 1.5018972458321598, + "language_loss": 0.85257983, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.93086874, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20117188, + "step": 3563, + "time_per_iteration": 2.5593912601470947 + }, + { + "auxiliary_loss_clip": 0.06548097, + "auxiliary_loss_mlp": 0.01281509, + "balance_loss_clip": 0.06308593, + "balance_loss_mlp": 0.01262531, + "epoch": 0.2142792725086427, + "flos": 33883504750080.0, + "grad_norm": 1.9248219523503745, + "language_loss": 0.76925778, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.84755385, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.18981934, + "step": 3564, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.06542072, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305549, + "balance_loss_mlp": 0.01261181, + "epoch": 0.2143393957613107, + "flos": 19865538817920.0, + "grad_norm": 1.690691453330226, + "language_loss": 0.89139843, + "learning_rate": 3.654001327581981e-06, + "loss": 0.9696207, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.18969727, + "step": 3565, + "time_per_iteration": 2.660306215286255 + }, + { + "auxiliary_loss_clip": 0.06436334, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 0.06303974, + "balance_loss_mlp": 0.01279924, + "epoch": 0.21439951901397866, + "flos": 68549300017920.0, + "grad_norm": 0.8225285981700966, + "language_loss": 0.52211988, + "learning_rate": 3.653782340498215e-06, + "loss": 0.59934968, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.06738281, + "step": 3566, + "time_per_iteration": 3.0845720767974854 + }, + { + "auxiliary_loss_clip": 0.06539588, + "auxiliary_loss_mlp": 0.01284533, + "balance_loss_clip": 0.06306818, + "balance_loss_mlp": 0.0126478, + "epoch": 0.21445964226664663, + "flos": 19689161973120.0, + "grad_norm": 1.8060006281631265, + "language_loss": 0.68295264, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.76119387, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19775391, + "step": 3567, + "time_per_iteration": 2.5250415802001953 + }, + { + "auxiliary_loss_clip": 0.06543978, + "auxiliary_loss_mlp": 0.01283364, + "balance_loss_clip": 0.06310168, + "balance_loss_mlp": 0.012641, + "epoch": 0.2145197655193146, + "flos": 31116039298560.0, + "grad_norm": 2.0548954423707753, + "language_loss": 0.75150776, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.82978123, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19250488, + "step": 3568, + "time_per_iteration": 4.018412113189697 + }, + { + "auxiliary_loss_clip": 0.06538366, + "auxiliary_loss_mlp": 0.01282205, + "balance_loss_clip": 0.063043, + "balance_loss_mlp": 0.01261773, + "epoch": 0.21457988877198256, + "flos": 20127015083520.0, + "grad_norm": 2.3975687399079284, + "language_loss": 0.78487438, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.86308008, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20446777, + "step": 3569, + "time_per_iteration": 2.6051042079925537 + }, + { + "auxiliary_loss_clip": 0.06554128, + "auxiliary_loss_mlp": 0.01283223, + "balance_loss_clip": 0.06309038, + "balance_loss_mlp": 0.01262183, + "epoch": 0.21464001202465052, + "flos": 18593963182080.0, + "grad_norm": 2.5916710851503173, + "language_loss": 0.7048617, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.78323519, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21032715, + "step": 3570, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.06548594, + "auxiliary_loss_mlp": 0.01293921, + "balance_loss_clip": 0.06305287, + "balance_loss_mlp": 0.01274621, + "epoch": 0.21470013527731852, + "flos": 21841600855680.0, + "grad_norm": 3.519297534980699, + "language_loss": 0.79412138, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.87254649, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.19311523, + "step": 3571, + "time_per_iteration": 3.984830141067505 + }, + { + "auxiliary_loss_clip": 0.06547887, + "auxiliary_loss_mlp": 0.01283536, + "balance_loss_clip": 0.06306981, + "balance_loss_mlp": 0.01263413, + "epoch": 0.21476025852998648, + "flos": 17608992837120.0, + "grad_norm": 2.1137138833129114, + "language_loss": 0.83417559, + "learning_rate": 3.652467101342991e-06, + "loss": 0.91248989, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20117188, + "step": 3572, + "time_per_iteration": 2.550900459289551 + }, + { + "auxiliary_loss_clip": 0.06544446, + "auxiliary_loss_mlp": 0.01290796, + "balance_loss_clip": 0.06300403, + "balance_loss_mlp": 0.01271114, + "epoch": 0.21482038178265445, + "flos": 24835267203840.0, + "grad_norm": 5.91831897424108, + "language_loss": 0.6534397, + "learning_rate": 3.652247675452598e-06, + "loss": 0.73179209, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.19677734, + "step": 3573, + "time_per_iteration": 2.574037551879883 + }, + { + "auxiliary_loss_clip": 0.06536618, + "auxiliary_loss_mlp": 0.01287357, + "balance_loss_clip": 0.06305118, + "balance_loss_mlp": 0.0126814, + "epoch": 0.2148805050353224, + "flos": 23264927435520.0, + "grad_norm": 1.8228372560216166, + "language_loss": 0.76129293, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83953267, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.1920166, + "step": 3574, + "time_per_iteration": 2.610541343688965 + }, + { + "auxiliary_loss_clip": 0.06537417, + "auxiliary_loss_mlp": 0.01280783, + "balance_loss_clip": 0.06298707, + "balance_loss_mlp": 0.0126066, + "epoch": 0.21494062828799038, + "flos": 21326907951360.0, + "grad_norm": 2.0935140233911644, + "language_loss": 0.72909325, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.8072753, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.20117188, + "step": 3575, + "time_per_iteration": 2.581932306289673 + }, + { + "auxiliary_loss_clip": 0.06537387, + "auxiliary_loss_mlp": 0.01288909, + "balance_loss_clip": 0.06302074, + "balance_loss_mlp": 0.01269657, + "epoch": 0.21500075154065834, + "flos": 18849276172800.0, + "grad_norm": 2.2103119968131986, + "language_loss": 0.6923548, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.77061772, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.19262695, + "step": 3576, + "time_per_iteration": 5.394233703613281 + }, + { + "auxiliary_loss_clip": 0.06547244, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 0.06304461, + "balance_loss_mlp": 0.0126069, + "epoch": 0.2150608747933263, + "flos": 18447872388480.0, + "grad_norm": 1.9274083971527407, + "language_loss": 0.89371777, + "learning_rate": 3.651369345440292e-06, + "loss": 0.97201031, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21337891, + "step": 3577, + "time_per_iteration": 2.5629777908325195 + }, + { + "auxiliary_loss_clip": 0.06425267, + "auxiliary_loss_mlp": 0.01303124, + "balance_loss_clip": 0.06298774, + "balance_loss_mlp": 0.01297548, + "epoch": 0.2151209980459943, + "flos": 66617443808640.0, + "grad_norm": 0.7978427219987446, + "language_loss": 0.56304139, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.64032531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.05581665, + "step": 3578, + "time_per_iteration": 3.0982370376586914 + }, + { + "auxiliary_loss_clip": 0.06546376, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 0.06309081, + "balance_loss_mlp": 0.0126729, + "epoch": 0.21518112129866226, + "flos": 21581633963520.0, + "grad_norm": 1.7619248126111737, + "language_loss": 0.89097106, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.19555664, + "step": 3579, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.06544919, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06304899, + "balance_loss_mlp": 0.01260498, + "epoch": 0.21524124455133023, + "flos": 20053822942080.0, + "grad_norm": 1.8548300822509616, + "language_loss": 0.78671825, + "learning_rate": 3.650709940390972e-06, + "loss": 0.86497748, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20507812, + "step": 3580, + "time_per_iteration": 2.538740634918213 + }, + { + "auxiliary_loss_clip": 0.06547832, + "auxiliary_loss_mlp": 0.01284221, + "balance_loss_clip": 0.06311843, + "balance_loss_mlp": 0.01265279, + "epoch": 0.2153013678039982, + "flos": 23958680515200.0, + "grad_norm": 2.0040984242528905, + "language_loss": 0.73520374, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.81352425, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.18933105, + "step": 3581, + "time_per_iteration": 2.5783493518829346 + }, + { + "auxiliary_loss_clip": 0.06544261, + "auxiliary_loss_mlp": 0.01283002, + "balance_loss_clip": 0.06307264, + "balance_loss_mlp": 0.01262438, + "epoch": 0.21536149105666616, + "flos": 20601107884800.0, + "grad_norm": 2.9043222851567574, + "language_loss": 0.71477044, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.79304302, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20556641, + "step": 3582, + "time_per_iteration": 2.5253281593322754 + }, + { + "auxiliary_loss_clip": 0.06553562, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06315581, + "balance_loss_mlp": 0.01262209, + "epoch": 0.21542161430933413, + "flos": 12865046077440.0, + "grad_norm": 2.5916269023447795, + "language_loss": 0.85900396, + "learning_rate": 3.650049971985889e-06, + "loss": 0.93736756, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20568848, + "step": 3583, + "time_per_iteration": 2.580411434173584 + }, + { + "auxiliary_loss_clip": 0.0655268, + "auxiliary_loss_mlp": 0.01295505, + "balance_loss_clip": 0.06312086, + "balance_loss_mlp": 0.01275561, + "epoch": 0.21548173756200212, + "flos": 26111077470720.0, + "grad_norm": 2.720923149453336, + "language_loss": 0.83510441, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.91358626, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19934082, + "step": 3584, + "time_per_iteration": 2.587843179702759 + }, + { + "auxiliary_loss_clip": 0.06549002, + "auxiliary_loss_mlp": 0.01288111, + "balance_loss_clip": 0.06314336, + "balance_loss_mlp": 0.01267667, + "epoch": 0.21554186081467008, + "flos": 22170315623040.0, + "grad_norm": 2.7712372256622357, + "language_loss": 0.91010725, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9884783, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20446777, + "step": 3585, + "time_per_iteration": 2.5638017654418945 + }, + { + "auxiliary_loss_clip": 0.06548285, + "auxiliary_loss_mlp": 0.0129374, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.012745, + "epoch": 0.21560198406733805, + "flos": 22973458608000.0, + "grad_norm": 2.0799258962001548, + "language_loss": 0.75285476, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83127499, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.19238281, + "step": 3586, + "time_per_iteration": 2.5816385746002197 + }, + { + "auxiliary_loss_clip": 0.06560329, + "auxiliary_loss_mlp": 0.01301548, + "balance_loss_clip": 0.06317623, + "balance_loss_mlp": 0.012817, + "epoch": 0.215662107320006, + "flos": 22790708853120.0, + "grad_norm": 1.7819627104594034, + "language_loss": 0.83628035, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.91489911, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.19848633, + "step": 3587, + "time_per_iteration": 2.5768468379974365 + }, + { + "auxiliary_loss_clip": 0.06549525, + "auxiliary_loss_mlp": 0.01284104, + "balance_loss_clip": 0.06311873, + "balance_loss_mlp": 0.01265114, + "epoch": 0.21572223057267398, + "flos": 30891850899840.0, + "grad_norm": 2.819752743062096, + "language_loss": 0.764575, + "learning_rate": 3.648948773354224e-06, + "loss": 0.8429113, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.18981934, + "step": 3588, + "time_per_iteration": 2.6578357219696045 + }, + { + "auxiliary_loss_clip": 0.06557232, + "auxiliary_loss_mlp": 0.01294163, + "balance_loss_clip": 0.06316121, + "balance_loss_mlp": 0.01274494, + "epoch": 0.21578235382534194, + "flos": 26918413159680.0, + "grad_norm": 3.674353356251158, + "language_loss": 0.8181411, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89665502, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.19689941, + "step": 3589, + "time_per_iteration": 2.6730964183807373 + }, + { + "auxiliary_loss_clip": 0.06560542, + "auxiliary_loss_mlp": 0.01287343, + "balance_loss_clip": 0.06321919, + "balance_loss_mlp": 0.01267959, + "epoch": 0.2158424770780099, + "flos": 24432605608320.0, + "grad_norm": 2.119721317496626, + "language_loss": 0.73323047, + "learning_rate": 3.648507856144961e-06, + "loss": 0.81170928, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.19384766, + "step": 3590, + "time_per_iteration": 2.5885848999023438 + }, + { + "auxiliary_loss_clip": 0.06554762, + "auxiliary_loss_mlp": 0.0128494, + "balance_loss_clip": 0.06310897, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2159026003306779, + "flos": 23956542236160.0, + "grad_norm": 2.0666561712978813, + "language_loss": 0.84929311, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92769015, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20544434, + "step": 3591, + "time_per_iteration": 2.5598154067993164 + }, + { + "auxiliary_loss_clip": 0.0656037, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01272776, + "epoch": 0.21596272358334587, + "flos": 30048191665920.0, + "grad_norm": 1.8943006547331833, + "language_loss": 0.69118065, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.76972699, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.21496582, + "step": 3592, + "time_per_iteration": 2.623124599456787 + }, + { + "auxiliary_loss_clip": 0.06558264, + "auxiliary_loss_mlp": 0.01284651, + "balance_loss_clip": 0.06314576, + "balance_loss_mlp": 0.01264218, + "epoch": 0.21602284683601383, + "flos": 20382495782400.0, + "grad_norm": 3.2836833125469753, + "language_loss": 0.84947151, + "learning_rate": 3.647846011515108e-06, + "loss": 0.92790061, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2043457, + "step": 3593, + "time_per_iteration": 2.5159051418304443 + }, + { + "auxiliary_loss_clip": 0.06551524, + "auxiliary_loss_mlp": 0.01289729, + "balance_loss_clip": 0.06309479, + "balance_loss_mlp": 0.01267615, + "epoch": 0.2160829700886818, + "flos": 20783648004480.0, + "grad_norm": 2.6962087820066567, + "language_loss": 0.76424301, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.84265554, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.22119141, + "step": 3594, + "time_per_iteration": 2.530874490737915 + }, + { + "auxiliary_loss_clip": 0.06549954, + "auxiliary_loss_mlp": 0.01280574, + "balance_loss_clip": 0.06313863, + "balance_loss_mlp": 0.01260189, + "epoch": 0.21614309334134976, + "flos": 22316322562560.0, + "grad_norm": 1.5622924015328905, + "language_loss": 0.80828846, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.88659382, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20385742, + "step": 3595, + "time_per_iteration": 2.5720436573028564 + }, + { + "auxiliary_loss_clip": 0.0655812, + "auxiliary_loss_mlp": 0.01282788, + "balance_loss_clip": 0.06310599, + "balance_loss_mlp": 0.01261962, + "epoch": 0.21620321659401773, + "flos": 19615592488320.0, + "grad_norm": 2.071968351759389, + "language_loss": 0.79120421, + "learning_rate": 3.647183604506897e-06, + "loss": 0.86961329, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.20825195, + "step": 3596, + "time_per_iteration": 2.529978036880493 + }, + { + "auxiliary_loss_clip": 0.06547653, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 0.06309111, + "balance_loss_mlp": 0.01258615, + "epoch": 0.2162633398466857, + "flos": 18850701692160.0, + "grad_norm": 1.8098333997433065, + "language_loss": 0.83728772, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.91556245, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.2121582, + "step": 3597, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.06559294, + "auxiliary_loss_mlp": 0.01284022, + "balance_loss_clip": 0.06315802, + "balance_loss_mlp": 0.01262421, + "epoch": 0.21632346309935369, + "flos": 18774490803840.0, + "grad_norm": 2.0845397374343655, + "language_loss": 0.81213892, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.89057213, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21606445, + "step": 3598, + "time_per_iteration": 2.517596960067749 + }, + { + "auxiliary_loss_clip": 0.06554621, + "auxiliary_loss_mlp": 0.01287936, + "balance_loss_clip": 0.06312433, + "balance_loss_mlp": 0.01265072, + "epoch": 0.21638358635202165, + "flos": 26331576289920.0, + "grad_norm": 1.6266226591192001, + "language_loss": 0.82318664, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.90161228, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22851562, + "step": 3599, + "time_per_iteration": 2.567528486251831 + }, + { + "auxiliary_loss_clip": 0.06553015, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06314674, + "balance_loss_mlp": 0.01263107, + "epoch": 0.21644370960468962, + "flos": 20747156751360.0, + "grad_norm": 2.0891036476830585, + "language_loss": 0.76652539, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.84490293, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21618652, + "step": 3600, + "time_per_iteration": 2.5642178058624268 + }, + { + "auxiliary_loss_clip": 0.06555548, + "auxiliary_loss_mlp": 0.01287253, + "balance_loss_clip": 0.06316924, + "balance_loss_mlp": 0.01267512, + "epoch": 0.21650383285735758, + "flos": 23959183639680.0, + "grad_norm": 1.8375873098897355, + "language_loss": 0.80812716, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.88655519, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.19726562, + "step": 3601, + "time_per_iteration": 2.536790132522583 + }, + { + "auxiliary_loss_clip": 0.06558496, + "auxiliary_loss_mlp": 0.01286287, + "balance_loss_clip": 0.06317312, + "balance_loss_mlp": 0.01265783, + "epoch": 0.21656395611002555, + "flos": 23702864400000.0, + "grad_norm": 1.8593805820505158, + "language_loss": 0.84205902, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2052002, + "step": 3602, + "time_per_iteration": 2.5919816493988037 + }, + { + "auxiliary_loss_clip": 0.06553967, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06313825, + "balance_loss_mlp": 0.01265371, + "epoch": 0.2166240793626935, + "flos": 20672035966080.0, + "grad_norm": 1.6537912100509087, + "language_loss": 0.75107038, + "learning_rate": 3.645635802397693e-06, + "loss": 0.82946962, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.20581055, + "step": 3603, + "time_per_iteration": 2.5602827072143555 + }, + { + "auxiliary_loss_clip": 0.06545025, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06314509, + "balance_loss_mlp": 0.0125996, + "epoch": 0.2166842026153615, + "flos": 21586916770560.0, + "grad_norm": 1.9607230977514314, + "language_loss": 0.75016356, + "learning_rate": 3.645414438132855e-06, + "loss": 0.82841063, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.1973877, + "step": 3604, + "time_per_iteration": 2.7099287509918213 + }, + { + "auxiliary_loss_clip": 0.06550605, + "auxiliary_loss_mlp": 0.01283396, + "balance_loss_clip": 0.06315283, + "balance_loss_mlp": 0.01263881, + "epoch": 0.21674432586802947, + "flos": 25637068523520.0, + "grad_norm": 1.5948705207891358, + "language_loss": 0.80732697, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.88566697, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19506836, + "step": 3605, + "time_per_iteration": 2.601269483566284 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01314575, + "balance_loss_clip": 0.0633797, + "balance_loss_mlp": 0.01307596, + "epoch": 0.21680444912069743, + "flos": 56435126376960.0, + "grad_norm": 0.68181157035555, + "language_loss": 0.58316016, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.66095698, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.06994629, + "step": 3606, + "time_per_iteration": 3.2531886100769043 + }, + { + "auxiliary_loss_clip": 0.06547002, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 0.06303971, + "balance_loss_mlp": 0.01264502, + "epoch": 0.2168645723733654, + "flos": 23885823790080.0, + "grad_norm": 1.8693102201830953, + "language_loss": 0.73682618, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22155762, + "step": 3607, + "time_per_iteration": 4.0285868644714355 + }, + { + "auxiliary_loss_clip": 0.06548688, + "auxiliary_loss_mlp": 0.01281672, + "balance_loss_clip": 0.06306184, + "balance_loss_mlp": 0.01259595, + "epoch": 0.21692469562603336, + "flos": 16951814864640.0, + "grad_norm": 1.845726065350227, + "language_loss": 0.78116572, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85946935, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22070312, + "step": 3608, + "time_per_iteration": 2.4997665882110596 + }, + { + "auxiliary_loss_clip": 0.06549841, + "auxiliary_loss_mlp": 0.01279583, + "balance_loss_clip": 0.06307275, + "balance_loss_mlp": 0.01260248, + "epoch": 0.21698481887870133, + "flos": 25126065198720.0, + "grad_norm": 2.052249511327834, + "language_loss": 0.74638152, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.82467568, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.19335938, + "step": 3609, + "time_per_iteration": 2.5834193229675293 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 0.06301089, + "balance_loss_mlp": 0.01258221, + "epoch": 0.2170449421313693, + "flos": 17900461664640.0, + "grad_norm": 2.066668805909691, + "language_loss": 0.8888129, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96701467, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21374512, + "step": 3610, + "time_per_iteration": 3.978980302810669 + }, + { + "auxiliary_loss_clip": 0.06540407, + "auxiliary_loss_mlp": 0.01284961, + "balance_loss_clip": 0.06302356, + "balance_loss_mlp": 0.01264457, + "epoch": 0.2171050653840373, + "flos": 22645121184000.0, + "grad_norm": 2.4524698956279978, + "language_loss": 0.78034103, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.85859472, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20507812, + "step": 3611, + "time_per_iteration": 2.537783622741699 + }, + { + "auxiliary_loss_clip": 0.06539893, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06301216, + "balance_loss_mlp": 0.0125619, + "epoch": 0.21716518863670525, + "flos": 19506034874880.0, + "grad_norm": 1.9372172398113192, + "language_loss": 0.63866782, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71684164, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21313477, + "step": 3612, + "time_per_iteration": 2.5200283527374268 + }, + { + "auxiliary_loss_clip": 0.06543254, + "auxiliary_loss_mlp": 0.01280194, + "balance_loss_clip": 0.06303414, + "balance_loss_mlp": 0.01259761, + "epoch": 0.21722531188937322, + "flos": 19798132608000.0, + "grad_norm": 1.7866878621114652, + "language_loss": 0.76463711, + "learning_rate": 3.643419353014776e-06, + "loss": 0.84287155, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2043457, + "step": 3613, + "time_per_iteration": 2.536395311355591 + }, + { + "auxiliary_loss_clip": 0.06540725, + "auxiliary_loss_mlp": 0.01277778, + "balance_loss_clip": 0.06303174, + "balance_loss_mlp": 0.01256165, + "epoch": 0.21728543514204118, + "flos": 13339474295040.0, + "grad_norm": 1.8023674067133515, + "language_loss": 0.72213733, + "learning_rate": 3.643197365185261e-06, + "loss": 0.80032235, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21582031, + "step": 3614, + "time_per_iteration": 2.5000360012054443 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 0.06304483, + "balance_loss_mlp": 0.01256973, + "epoch": 0.21734555839470915, + "flos": 15237312946560.0, + "grad_norm": 2.7303590898197463, + "language_loss": 0.73928845, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81749594, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.20849609, + "step": 3615, + "time_per_iteration": 3.924616813659668 + }, + { + "auxiliary_loss_clip": 0.0654763, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_clip": 0.06301322, + "balance_loss_mlp": 0.0125694, + "epoch": 0.2174056816473771, + "flos": 19980043822080.0, + "grad_norm": 2.1391350951981467, + "language_loss": 0.913239, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.99150848, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22387695, + "step": 3616, + "time_per_iteration": 3.9379403591156006 + }, + { + "auxiliary_loss_clip": 0.06540038, + "auxiliary_loss_mlp": 0.01284656, + "balance_loss_clip": 0.06298746, + "balance_loss_mlp": 0.01263163, + "epoch": 0.21746580490004508, + "flos": 16692309169920.0, + "grad_norm": 2.057861674488091, + "language_loss": 0.81572813, + "learning_rate": 3.642531027869148e-06, + "loss": 0.89397502, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21496582, + "step": 3617, + "time_per_iteration": 2.5517330169677734 + }, + { + "auxiliary_loss_clip": 0.06543958, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06300673, + "balance_loss_mlp": 0.01258881, + "epoch": 0.21752592815271307, + "flos": 25778840832000.0, + "grad_norm": 1.7475820668036919, + "language_loss": 0.76030993, + "learning_rate": 3.642308790849329e-06, + "loss": 0.83855915, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2208252, + "step": 3618, + "time_per_iteration": 2.5874650478363037 + }, + { + "auxiliary_loss_clip": 0.06542084, + "auxiliary_loss_mlp": 0.01277743, + "balance_loss_clip": 0.06299525, + "balance_loss_mlp": 0.01255928, + "epoch": 0.21758605140538104, + "flos": 11259430940160.0, + "grad_norm": 1.9309868599682727, + "language_loss": 0.69592559, + "learning_rate": 3.642086491552996e-06, + "loss": 0.77412391, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21826172, + "step": 3619, + "time_per_iteration": 2.5259079933166504 + }, + { + "auxiliary_loss_clip": 0.06549741, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06307657, + "balance_loss_mlp": 0.01264906, + "epoch": 0.217646174658049, + "flos": 19248290115840.0, + "grad_norm": 1.6696593228851853, + "language_loss": 0.78744078, + "learning_rate": 3.641864129988579e-06, + "loss": 0.86581242, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22521973, + "step": 3620, + "time_per_iteration": 2.5225844383239746 + }, + { + "auxiliary_loss_clip": 0.06542689, + "auxiliary_loss_mlp": 0.01283495, + "balance_loss_clip": 0.06306273, + "balance_loss_mlp": 0.01263349, + "epoch": 0.21770629791071697, + "flos": 21951619666560.0, + "grad_norm": 1.6751510482296663, + "language_loss": 0.80184436, + "learning_rate": 3.641641706164509e-06, + "loss": 0.88010621, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20141602, + "step": 3621, + "time_per_iteration": 2.5528457164764404 + }, + { + "auxiliary_loss_clip": 0.0654473, + "auxiliary_loss_mlp": 0.01278712, + "balance_loss_clip": 0.06305254, + "balance_loss_mlp": 0.012594, + "epoch": 0.21776642116338493, + "flos": 24943776641280.0, + "grad_norm": 1.5217586163816694, + "language_loss": 0.87951142, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95774585, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.19299316, + "step": 3622, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.06559718, + "auxiliary_loss_mlp": 0.01277107, + "balance_loss_clip": 0.06313318, + "balance_loss_mlp": 0.01254445, + "epoch": 0.2178265444160529, + "flos": 17827017960960.0, + "grad_norm": 3.34018590012949, + "language_loss": 0.77879506, + "learning_rate": 3.641196671771152e-06, + "loss": 0.85716331, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22668457, + "step": 3623, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.0655373, + "auxiliary_loss_mlp": 0.01283267, + "balance_loss_clip": 0.06310436, + "balance_loss_mlp": 0.0126132, + "epoch": 0.2178866676687209, + "flos": 17718760085760.0, + "grad_norm": 2.118806527220675, + "language_loss": 0.85078007, + "learning_rate": 3.640974061218741e-06, + "loss": 0.92914999, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21936035, + "step": 3624, + "time_per_iteration": 2.4991443157196045 + }, + { + "auxiliary_loss_clip": 0.06544428, + "auxiliary_loss_mlp": 0.01281962, + "balance_loss_clip": 0.06301346, + "balance_loss_mlp": 0.01259014, + "epoch": 0.21794679092138886, + "flos": 16951437521280.0, + "grad_norm": 2.3785715622769357, + "language_loss": 0.7814458, + "learning_rate": 3.640751388440429e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22961426, + "step": 3625, + "time_per_iteration": 2.5113301277160645 + }, + { + "auxiliary_loss_clip": 0.06435797, + "auxiliary_loss_mlp": 0.01281105, + "balance_loss_clip": 0.0630773, + "balance_loss_mlp": 0.01275631, + "epoch": 0.21800691417405682, + "flos": 63737737413120.0, + "grad_norm": 0.7732492376258139, + "language_loss": 0.60674119, + "learning_rate": 3.64052865344466e-06, + "loss": 0.68391013, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.05477905, + "step": 3626, + "time_per_iteration": 3.230576992034912 + }, + { + "auxiliary_loss_clip": 0.06551459, + "auxiliary_loss_mlp": 0.01275255, + "balance_loss_clip": 0.06306285, + "balance_loss_mlp": 0.01252271, + "epoch": 0.21806703742672479, + "flos": 21622821045120.0, + "grad_norm": 2.0426080259896664, + "language_loss": 0.91217983, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.99044704, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22961426, + "step": 3627, + "time_per_iteration": 2.571704149246216 + }, + { + "auxiliary_loss_clip": 0.06549745, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06307864, + "balance_loss_mlp": 0.01257313, + "epoch": 0.21812716067939275, + "flos": 19361034184320.0, + "grad_norm": 1.8240036323551578, + "language_loss": 0.74830574, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82659948, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.2232666, + "step": 3628, + "time_per_iteration": 2.5547990798950195 + }, + { + "auxiliary_loss_clip": 0.06543273, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06304347, + "balance_loss_mlp": 0.01257039, + "epoch": 0.21818728393206072, + "flos": 23554467619200.0, + "grad_norm": 1.7805187473711719, + "language_loss": 0.77940357, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.85763204, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2253418, + "step": 3629, + "time_per_iteration": 2.5777294635772705 + }, + { + "auxiliary_loss_clip": 0.06540327, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.06302765, + "balance_loss_mlp": 0.01257822, + "epoch": 0.21824740718472868, + "flos": 30233289335040.0, + "grad_norm": 1.6105707802077895, + "language_loss": 0.72294879, + "learning_rate": 3.63963709145597e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20703125, + "step": 3630, + "time_per_iteration": 2.6015560626983643 + }, + { + "auxiliary_loss_clip": 0.06535304, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06303381, + "balance_loss_mlp": 0.01259364, + "epoch": 0.21830753043739667, + "flos": 26140860397440.0, + "grad_norm": 1.9295675894773927, + "language_loss": 0.77031553, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8484655, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.203125, + "step": 3631, + "time_per_iteration": 2.5712599754333496 + }, + { + "auxiliary_loss_clip": 0.06546577, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 0.06304416, + "balance_loss_mlp": 0.01265274, + "epoch": 0.21836765369006464, + "flos": 21726299237760.0, + "grad_norm": 24.58992261392957, + "language_loss": 0.76358086, + "learning_rate": 3.639190937376594e-06, + "loss": 0.84191024, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21081543, + "step": 3632, + "time_per_iteration": 2.5312108993530273 + }, + { + "auxiliary_loss_clip": 0.06541382, + "auxiliary_loss_mlp": 0.01277975, + "balance_loss_clip": 0.06306228, + "balance_loss_mlp": 0.01258008, + "epoch": 0.2184277769427326, + "flos": 19943678350080.0, + "grad_norm": 2.014902514553352, + "language_loss": 0.8455261, + "learning_rate": 3.638967767095249e-06, + "loss": 0.9237197, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19958496, + "step": 3633, + "time_per_iteration": 2.5392541885375977 + }, + { + "auxiliary_loss_clip": 0.06536385, + "auxiliary_loss_mlp": 0.01279679, + "balance_loss_clip": 0.06300621, + "balance_loss_mlp": 0.0125821, + "epoch": 0.21848790019540057, + "flos": 20346591507840.0, + "grad_norm": 2.269088705731375, + "language_loss": 0.82069844, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.89885902, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.21484375, + "step": 3634, + "time_per_iteration": 2.5536303520202637 + }, + { + "auxiliary_loss_clip": 0.06544928, + "auxiliary_loss_mlp": 0.01275115, + "balance_loss_clip": 0.063034, + "balance_loss_mlp": 0.01254063, + "epoch": 0.21854802344806853, + "flos": 15456302392320.0, + "grad_norm": 2.1744892406337133, + "language_loss": 0.75276726, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83096772, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21044922, + "step": 3635, + "time_per_iteration": 2.5158851146698 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 0.06301719, + "balance_loss_mlp": 0.01259018, + "epoch": 0.2186081467007365, + "flos": 16325384140800.0, + "grad_norm": 1.9753193728837781, + "language_loss": 0.88470638, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.96285218, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19836426, + "step": 3636, + "time_per_iteration": 2.5056772232055664 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01285934, + "balance_loss_clip": 0.06300446, + "balance_loss_mlp": 0.01264798, + "epoch": 0.2186682699534045, + "flos": 21695677770240.0, + "grad_norm": 1.933426681732421, + "language_loss": 0.76219505, + "learning_rate": 3.638074464556311e-06, + "loss": 0.84042412, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21118164, + "step": 3637, + "time_per_iteration": 2.5159406661987305 + }, + { + "auxiliary_loss_clip": 0.06547473, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.06303671, + "balance_loss_mlp": 0.0125726, + "epoch": 0.21872839320607246, + "flos": 17743427913600.0, + "grad_norm": 3.0066644559057867, + "language_loss": 0.90341294, + "learning_rate": 3.63785098361053e-06, + "loss": 0.98168921, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22888184, + "step": 3638, + "time_per_iteration": 2.475271224975586 + }, + { + "auxiliary_loss_clip": 0.06535378, + "auxiliary_loss_mlp": 0.01286586, + "balance_loss_clip": 0.06297417, + "balance_loss_mlp": 0.01264318, + "epoch": 0.21878851645874042, + "flos": 18656757417600.0, + "grad_norm": 3.417327747399998, + "language_loss": 0.90034223, + "learning_rate": 3.637627440557275e-06, + "loss": 0.97856188, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22265625, + "step": 3639, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.06531254, + "auxiliary_loss_mlp": 0.01281993, + "balance_loss_clip": 0.06296734, + "balance_loss_mlp": 0.01262264, + "epoch": 0.2188486397114084, + "flos": 25564463360640.0, + "grad_norm": 1.6695470201966474, + "language_loss": 0.7997371, + "learning_rate": 3.637403835405024e-06, + "loss": 0.87786961, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.19726562, + "step": 3640, + "time_per_iteration": 2.5905494689941406 + }, + { + "auxiliary_loss_clip": 0.06541579, + "auxiliary_loss_mlp": 0.01284166, + "balance_loss_clip": 0.06302525, + "balance_loss_mlp": 0.01260074, + "epoch": 0.21890876296407635, + "flos": 17897400990720.0, + "grad_norm": 8.732271245188107, + "language_loss": 0.72940969, + "learning_rate": 3.637180168162255e-06, + "loss": 0.80766714, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.24084473, + "step": 3641, + "time_per_iteration": 2.5452075004577637 + }, + { + "auxiliary_loss_clip": 0.06541288, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.06304857, + "balance_loss_mlp": 0.01259619, + "epoch": 0.21896888621674432, + "flos": 17754915922560.0, + "grad_norm": 1.8801395061290727, + "language_loss": 0.81693721, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89515489, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20874023, + "step": 3642, + "time_per_iteration": 2.5234179496765137 + }, + { + "auxiliary_loss_clip": 0.06550857, + "auxiliary_loss_mlp": 0.01284985, + "balance_loss_clip": 0.06311135, + "balance_loss_mlp": 0.01262204, + "epoch": 0.21902900946941228, + "flos": 23082890440320.0, + "grad_norm": 1.5963488152753738, + "language_loss": 0.71952182, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.79788017, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2277832, + "step": 3643, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.06535246, + "auxiliary_loss_mlp": 0.01285725, + "balance_loss_clip": 0.06298445, + "balance_loss_mlp": 0.01264506, + "epoch": 0.21908913272208028, + "flos": 48189501492480.0, + "grad_norm": 1.9271022520918928, + "language_loss": 0.69055694, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.76876664, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.21228027, + "step": 3644, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.06548485, + "auxiliary_loss_mlp": 0.01283418, + "balance_loss_clip": 0.06302129, + "balance_loss_mlp": 0.01261531, + "epoch": 0.21914925597474824, + "flos": 22243298129280.0, + "grad_norm": 2.4423330778710937, + "language_loss": 0.78728521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.86560422, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21911621, + "step": 3645, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06531754, + "auxiliary_loss_mlp": 0.01275201, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01254936, + "epoch": 0.2192093792274162, + "flos": 22131853799040.0, + "grad_norm": 1.5020846701532837, + "language_loss": 0.82847381, + "learning_rate": 3.636060900887582e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20263672, + "step": 3646, + "time_per_iteration": 2.569216012954712 + }, + { + "auxiliary_loss_clip": 0.06536786, + "auxiliary_loss_mlp": 0.01283667, + "balance_loss_clip": 0.06302559, + "balance_loss_mlp": 0.01263449, + "epoch": 0.21926950248008417, + "flos": 15674914494720.0, + "grad_norm": 1.6949719683005162, + "language_loss": 0.83080441, + "learning_rate": 3.635836861279901e-06, + "loss": 0.90900892, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20227051, + "step": 3647, + "time_per_iteration": 3.9349160194396973 + }, + { + "auxiliary_loss_clip": 0.06534994, + "auxiliary_loss_mlp": 0.01281644, + "balance_loss_clip": 0.06301765, + "balance_loss_mlp": 0.01261105, + "epoch": 0.21932962573275214, + "flos": 30270199858560.0, + "grad_norm": 1.587891801710132, + "language_loss": 0.7257458, + "learning_rate": 3.635612759641123e-06, + "loss": 0.80391216, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20532227, + "step": 3648, + "time_per_iteration": 2.6465656757354736 + }, + { + "auxiliary_loss_clip": 0.06545104, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 0.06304809, + "balance_loss_mlp": 0.01263434, + "epoch": 0.2193897489854201, + "flos": 10784751160320.0, + "grad_norm": 3.088861131276654, + "language_loss": 0.74724281, + "learning_rate": 3.635388595979745e-06, + "loss": 0.8255477, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21960449, + "step": 3649, + "time_per_iteration": 2.510040283203125 + }, + { + "auxiliary_loss_clip": 0.06531087, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 0.06299826, + "balance_loss_mlp": 0.01274752, + "epoch": 0.21944987223808807, + "flos": 19138984064640.0, + "grad_norm": 4.303407628828735, + "language_loss": 0.86915123, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94741207, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20251465, + "step": 3650, + "time_per_iteration": 3.93752384185791 + }, + { + "auxiliary_loss_clip": 0.06543732, + "auxiliary_loss_mlp": 0.01294843, + "balance_loss_clip": 0.06307691, + "balance_loss_mlp": 0.01273422, + "epoch": 0.21950999549075606, + "flos": 22717726346880.0, + "grad_norm": 2.457938069648898, + "language_loss": 0.8456791, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.92406487, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2142334, + "step": 3651, + "time_per_iteration": 2.7058322429656982 + }, + { + "auxiliary_loss_clip": 0.06539044, + "auxiliary_loss_mlp": 0.01290725, + "balance_loss_clip": 0.06304742, + "balance_loss_mlp": 0.01270257, + "epoch": 0.21957011874342403, + "flos": 10565929422720.0, + "grad_norm": 1.8310150193660448, + "language_loss": 0.74885792, + "learning_rate": 3.634715732945027e-06, + "loss": 0.82715559, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20458984, + "step": 3652, + "time_per_iteration": 2.512620210647583 + }, + { + "auxiliary_loss_clip": 0.06458014, + "auxiliary_loss_mlp": 0.01487979, + "balance_loss_clip": 0.06335165, + "balance_loss_mlp": 0.01477775, + "epoch": 0.219630241996092, + "flos": 65765105677440.0, + "grad_norm": 0.8085744951241601, + "language_loss": 0.51588702, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.59534693, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.10205078, + "step": 3653, + "time_per_iteration": 3.156705617904663 + }, + { + "auxiliary_loss_clip": 0.06532414, + "auxiliary_loss_mlp": 0.01292976, + "balance_loss_clip": 0.06300488, + "balance_loss_mlp": 0.01271685, + "epoch": 0.21969036524875996, + "flos": 23703367524480.0, + "grad_norm": 2.2498105533123467, + "language_loss": 0.7598449, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.83809876, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21289062, + "step": 3654, + "time_per_iteration": 2.5549349784851074 + }, + { + "auxiliary_loss_clip": 0.06539033, + "auxiliary_loss_mlp": 0.01287688, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265277, + "epoch": 0.21975048850142792, + "flos": 19646130101760.0, + "grad_norm": 1.856190016757107, + "language_loss": 0.72937429, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80764157, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.22412109, + "step": 3655, + "time_per_iteration": 5.397899866104126 + }, + { + "auxiliary_loss_clip": 0.06537225, + "auxiliary_loss_mlp": 0.01285968, + "balance_loss_clip": 0.06301227, + "balance_loss_mlp": 0.01265667, + "epoch": 0.21981061175409589, + "flos": 22453944094080.0, + "grad_norm": 1.6446350088012902, + "language_loss": 0.81351042, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.89174235, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20300293, + "step": 3656, + "time_per_iteration": 2.53308367729187 + }, + { + "auxiliary_loss_clip": 0.06536204, + "auxiliary_loss_mlp": 0.01286139, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01265027, + "epoch": 0.21987073500676388, + "flos": 18157032466560.0, + "grad_norm": 2.081609460517537, + "language_loss": 0.86280632, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94102979, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21105957, + "step": 3657, + "time_per_iteration": 2.5165464878082275 + }, + { + "auxiliary_loss_clip": 0.06534712, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 0.0629717, + "balance_loss_mlp": 0.01263439, + "epoch": 0.21993085825943184, + "flos": 25632666184320.0, + "grad_norm": 1.606816904846988, + "language_loss": 0.80728716, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.88547069, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20202637, + "step": 3658, + "time_per_iteration": 2.5528533458709717 + }, + { + "auxiliary_loss_clip": 0.06407537, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.0628604, + "balance_loss_mlp": 0.01250839, + "epoch": 0.2199909815120998, + "flos": 70946429621760.0, + "grad_norm": 0.7593962827668586, + "language_loss": 0.58126092, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06103516, + "step": 3659, + "time_per_iteration": 3.237276077270508 + }, + { + "auxiliary_loss_clip": 0.06524363, + "auxiliary_loss_mlp": 0.01284023, + "balance_loss_clip": 0.06293888, + "balance_loss_mlp": 0.01264091, + "epoch": 0.22005110476476777, + "flos": 21549964320000.0, + "grad_norm": 2.05919214646248, + "language_loss": 0.75117528, + "learning_rate": 3.632918704645772e-06, + "loss": 0.82925916, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19946289, + "step": 3660, + "time_per_iteration": 2.5259556770324707 + }, + { + "auxiliary_loss_clip": 0.06528022, + "auxiliary_loss_mlp": 0.01287991, + "balance_loss_clip": 0.06292684, + "balance_loss_mlp": 0.01267976, + "epoch": 0.22011122801743574, + "flos": 22061051498880.0, + "grad_norm": 2.4805712407940645, + "language_loss": 0.81579179, + "learning_rate": 3.632693797376893e-06, + "loss": 0.89395189, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.20019531, + "step": 3661, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.06527096, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.06295218, + "balance_loss_mlp": 0.01264039, + "epoch": 0.2201713512701037, + "flos": 26694811739520.0, + "grad_norm": 2.4209612671003993, + "language_loss": 0.73935246, + "learning_rate": 3.632468828196102e-06, + "loss": 0.81745958, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.19567871, + "step": 3662, + "time_per_iteration": 2.594336986541748 + }, + { + "auxiliary_loss_clip": 0.06524752, + "auxiliary_loss_mlp": 0.01286026, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01266976, + "epoch": 0.22023147452277167, + "flos": 22168470833280.0, + "grad_norm": 1.5979135918213576, + "language_loss": 0.79490995, + "learning_rate": 3.632243797111929e-06, + "loss": 0.87301779, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19042969, + "step": 3663, + "time_per_iteration": 2.6437172889709473 + }, + { + "auxiliary_loss_clip": 0.06536885, + "auxiliary_loss_mlp": 0.01285417, + "balance_loss_clip": 0.06298422, + "balance_loss_mlp": 0.01264627, + "epoch": 0.22029159777543966, + "flos": 22528981025280.0, + "grad_norm": 1.9228872111745317, + "language_loss": 0.81154871, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8897717, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20800781, + "step": 3664, + "time_per_iteration": 2.551218271255493 + }, + { + "auxiliary_loss_clip": 0.06543128, + "auxiliary_loss_mlp": 0.01279618, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257457, + "epoch": 0.22035172102810763, + "flos": 13047502343040.0, + "grad_norm": 2.388837963421245, + "language_loss": 0.77563322, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.22167969, + "step": 3665, + "time_per_iteration": 2.5317838191986084 + }, + { + "auxiliary_loss_clip": 0.06533245, + "auxiliary_loss_mlp": 0.0128412, + "balance_loss_clip": 0.06298798, + "balance_loss_mlp": 0.01263616, + "epoch": 0.2204118442807756, + "flos": 12170538311040.0, + "grad_norm": 5.328131395204355, + "language_loss": 0.98459631, + "learning_rate": 3.631568332524466e-06, + "loss": 1.06277001, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.20507812, + "step": 3666, + "time_per_iteration": 2.500293254852295 + }, + { + "auxiliary_loss_clip": 0.06531642, + "auxiliary_loss_mlp": 0.01281342, + "balance_loss_clip": 0.06297208, + "balance_loss_mlp": 0.01260767, + "epoch": 0.22047196753344356, + "flos": 40117345758720.0, + "grad_norm": 2.0087807452217143, + "language_loss": 0.81544572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20568848, + "step": 3667, + "time_per_iteration": 2.7539899349212646 + }, + { + "auxiliary_loss_clip": 0.06542197, + "auxiliary_loss_mlp": 0.0128155, + "balance_loss_clip": 0.06300189, + "balance_loss_mlp": 0.01258363, + "epoch": 0.22053209078611152, + "flos": 20706892064640.0, + "grad_norm": 2.631241235852179, + "language_loss": 0.77648765, + "learning_rate": 3.631117713439087e-06, + "loss": 0.85472512, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.23168945, + "step": 3668, + "time_per_iteration": 2.524740695953369 + }, + { + "auxiliary_loss_clip": 0.06534266, + "auxiliary_loss_mlp": 0.01279226, + "balance_loss_clip": 0.06300663, + "balance_loss_mlp": 0.01258758, + "epoch": 0.2205922140387795, + "flos": 24723026259840.0, + "grad_norm": 2.1996350177899386, + "language_loss": 0.72024125, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7983762, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.20471191, + "step": 3669, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.06540591, + "auxiliary_loss_mlp": 0.01281842, + "balance_loss_clip": 0.06304247, + "balance_loss_mlp": 0.01261398, + "epoch": 0.22065233729144745, + "flos": 23484000735360.0, + "grad_norm": 1.708018932230371, + "language_loss": 0.85830641, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.93653071, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20422363, + "step": 3670, + "time_per_iteration": 2.6102726459503174 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01279884, + "balance_loss_clip": 0.06300244, + "balance_loss_mlp": 0.01259678, + "epoch": 0.22071246054411545, + "flos": 35234268094080.0, + "grad_norm": 1.8596418583208814, + "language_loss": 0.77398729, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85218084, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20202637, + "step": 3671, + "time_per_iteration": 2.6463472843170166 + }, + { + "auxiliary_loss_clip": 0.06536315, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06302021, + "balance_loss_mlp": 0.01256934, + "epoch": 0.2207725837967834, + "flos": 18156151998720.0, + "grad_norm": 3.3605951725525807, + "language_loss": 0.81071377, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.88883519, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18896484, + "step": 3672, + "time_per_iteration": 2.522409200668335 + }, + { + "auxiliary_loss_clip": 0.06541845, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.01262086, + "epoch": 0.22083270704945138, + "flos": 20484967726080.0, + "grad_norm": 2.0276751679318905, + "language_loss": 0.74039209, + "learning_rate": 3.629990083462682e-06, + "loss": 0.8186394, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20800781, + "step": 3673, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.06537451, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258154, + "epoch": 0.22089283030211934, + "flos": 34133451079680.0, + "grad_norm": 2.1113123853963223, + "language_loss": 0.77576697, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.85393184, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2088623, + "step": 3674, + "time_per_iteration": 2.6212525367736816 + }, + { + "auxiliary_loss_clip": 0.06539989, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06303889, + "balance_loss_mlp": 0.01255349, + "epoch": 0.2209529535547873, + "flos": 18083043711360.0, + "grad_norm": 2.9913121905850213, + "language_loss": 0.7632584, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.84143209, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22033691, + "step": 3675, + "time_per_iteration": 2.529346466064453 + }, + { + "auxiliary_loss_clip": 0.06540923, + "auxiliary_loss_mlp": 0.01279311, + "balance_loss_clip": 0.06303286, + "balance_loss_mlp": 0.01258592, + "epoch": 0.22101307680745527, + "flos": 27242725587840.0, + "grad_norm": 1.8493496269427605, + "language_loss": 0.8074736, + "learning_rate": 3.629312763695772e-06, + "loss": 0.88567591, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20727539, + "step": 3676, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.06539683, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06299066, + "balance_loss_mlp": 0.01260637, + "epoch": 0.22107320006012326, + "flos": 16548566290560.0, + "grad_norm": 2.695197102889201, + "language_loss": 0.76204234, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.84025168, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2064209, + "step": 3677, + "time_per_iteration": 2.5165653228759766 + }, + { + "auxiliary_loss_clip": 0.0653778, + "auxiliary_loss_mlp": 0.01277642, + "balance_loss_clip": 0.06301221, + "balance_loss_mlp": 0.01257889, + "epoch": 0.22113332331279123, + "flos": 22061009571840.0, + "grad_norm": 1.9269573452829223, + "language_loss": 0.84673274, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92488694, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.19750977, + "step": 3678, + "time_per_iteration": 2.5460638999938965 + }, + { + "auxiliary_loss_clip": 0.06537814, + "auxiliary_loss_mlp": 0.01282989, + "balance_loss_clip": 0.06304095, + "balance_loss_mlp": 0.01262354, + "epoch": 0.2211934465654592, + "flos": 26619690954240.0, + "grad_norm": 2.1729831488916327, + "language_loss": 0.89362311, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.9718312, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20629883, + "step": 3679, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.06542142, + "auxiliary_loss_mlp": 0.01291632, + "balance_loss_clip": 0.06301068, + "balance_loss_mlp": 0.01269817, + "epoch": 0.22125356981812716, + "flos": 16365564973440.0, + "grad_norm": 3.197923457760992, + "language_loss": 0.87311327, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.95145106, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21801758, + "step": 3680, + "time_per_iteration": 2.507798433303833 + }, + { + "auxiliary_loss_clip": 0.06534758, + "auxiliary_loss_mlp": 0.01279239, + "balance_loss_clip": 0.06302372, + "balance_loss_mlp": 0.01258211, + "epoch": 0.22131369307079513, + "flos": 21657257873280.0, + "grad_norm": 1.8058433539562604, + "language_loss": 0.81643963, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89457959, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.21032715, + "step": 3681, + "time_per_iteration": 2.536559820175171 + }, + { + "auxiliary_loss_clip": 0.06530598, + "auxiliary_loss_mlp": 0.01283453, + "balance_loss_clip": 0.06302136, + "balance_loss_mlp": 0.01264344, + "epoch": 0.2213738163234631, + "flos": 19615592488320.0, + "grad_norm": 3.0843961282743138, + "language_loss": 0.80613208, + "learning_rate": 3.62795645623335e-06, + "loss": 0.88427258, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.19116211, + "step": 3682, + "time_per_iteration": 2.5523715019226074 + }, + { + "auxiliary_loss_clip": 0.06540116, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06302039, + "balance_loss_mlp": 0.01261933, + "epoch": 0.22143393957613106, + "flos": 23630217310080.0, + "grad_norm": 1.560467578099588, + "language_loss": 0.78323001, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86147785, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22729492, + "step": 3683, + "time_per_iteration": 2.563915491104126 + }, + { + "auxiliary_loss_clip": 0.06546305, + "auxiliary_loss_mlp": 0.01292128, + "balance_loss_clip": 0.06304266, + "balance_loss_mlp": 0.01270801, + "epoch": 0.22149406282879905, + "flos": 26185108152960.0, + "grad_norm": 2.3659446396904276, + "language_loss": 0.73827177, + "learning_rate": 3.627503859796234e-06, + "loss": 0.81665611, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21337891, + "step": 3684, + "time_per_iteration": 2.5829403400421143 + }, + { + "auxiliary_loss_clip": 0.06539842, + "auxiliary_loss_mlp": 0.01288295, + "balance_loss_clip": 0.06303138, + "balance_loss_mlp": 0.01266396, + "epoch": 0.221554186081467, + "flos": 14544104918400.0, + "grad_norm": 1.9346272357304948, + "language_loss": 0.81055164, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.88883299, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21899414, + "step": 3685, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.06531791, + "auxiliary_loss_mlp": 0.0128599, + "balance_loss_clip": 0.06302623, + "balance_loss_mlp": 0.01266607, + "epoch": 0.22161430933413498, + "flos": 22245059064960.0, + "grad_norm": 1.5947500054188823, + "language_loss": 0.87523818, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.95341599, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 3686, + "time_per_iteration": 4.0018064975738525 + }, + { + "auxiliary_loss_clip": 0.06530964, + "auxiliary_loss_mlp": 0.01294037, + "balance_loss_clip": 0.06297237, + "balance_loss_mlp": 0.01272198, + "epoch": 0.22167443258680294, + "flos": 23483162194560.0, + "grad_norm": 2.0272053301197186, + "language_loss": 0.78420949, + "learning_rate": 3.626824502298707e-06, + "loss": 0.86245942, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21826172, + "step": 3687, + "time_per_iteration": 2.543321132659912 + }, + { + "auxiliary_loss_clip": 0.06551681, + "auxiliary_loss_mlp": 0.01283958, + "balance_loss_clip": 0.0630649, + "balance_loss_mlp": 0.01261177, + "epoch": 0.2217345558394709, + "flos": 23227723422720.0, + "grad_norm": 1.7957197826329643, + "language_loss": 0.85492283, + "learning_rate": 3.626597926409383e-06, + "loss": 0.93327922, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.2277832, + "step": 3688, + "time_per_iteration": 2.5456702709198 + }, + { + "auxiliary_loss_clip": 0.06557921, + "auxiliary_loss_mlp": 0.01283081, + "balance_loss_clip": 0.0631456, + "balance_loss_mlp": 0.01260812, + "epoch": 0.22179467909213887, + "flos": 20017247834880.0, + "grad_norm": 1.8193279444648072, + "language_loss": 0.81821239, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89662236, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.22265625, + "step": 3689, + "time_per_iteration": 4.073091506958008 + }, + { + "auxiliary_loss_clip": 0.06540284, + "auxiliary_loss_mlp": 0.0128456, + "balance_loss_clip": 0.06304172, + "balance_loss_mlp": 0.01263269, + "epoch": 0.22185480234480687, + "flos": 19689203900160.0, + "grad_norm": 2.302195520769192, + "language_loss": 0.70934272, + "learning_rate": 3.626144589597061e-06, + "loss": 0.7875911, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2130127, + "step": 3690, + "time_per_iteration": 2.5177161693573 + }, + { + "auxiliary_loss_clip": 0.06548303, + "auxiliary_loss_mlp": 0.01286756, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01264416, + "epoch": 0.22191492559747483, + "flos": 21987817430400.0, + "grad_norm": 2.3084892961245576, + "language_loss": 0.7285862, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.80693686, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.22338867, + "step": 3691, + "time_per_iteration": 2.545271873474121 + }, + { + "auxiliary_loss_clip": 0.0654895, + "auxiliary_loss_mlp": 0.01283693, + "balance_loss_clip": 0.06313456, + "balance_loss_mlp": 0.01261771, + "epoch": 0.2219750488501428, + "flos": 23228813525760.0, + "grad_norm": 2.0680633952732195, + "language_loss": 0.71962094, + "learning_rate": 3.625691006130477e-06, + "loss": 0.79794735, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21936035, + "step": 3692, + "time_per_iteration": 2.543306350708008 + }, + { + "auxiliary_loss_clip": 0.06558576, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_clip": 0.06317012, + "balance_loss_mlp": 0.01258394, + "epoch": 0.22203517210281076, + "flos": 22459939660800.0, + "grad_norm": 1.9780142392305156, + "language_loss": 0.87528688, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95367974, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.22338867, + "step": 3693, + "time_per_iteration": 2.571045398712158 + }, + { + "auxiliary_loss_clip": 0.06534213, + "auxiliary_loss_mlp": 0.01282043, + "balance_loss_clip": 0.06303744, + "balance_loss_mlp": 0.01261122, + "epoch": 0.22209529535547873, + "flos": 17569985961600.0, + "grad_norm": 2.4004359049860824, + "language_loss": 0.86418116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.94234371, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20922852, + "step": 3694, + "time_per_iteration": 4.03299617767334 + }, + { + "auxiliary_loss_clip": 0.06554222, + "auxiliary_loss_mlp": 0.0127962, + "balance_loss_clip": 0.06307386, + "balance_loss_mlp": 0.0125815, + "epoch": 0.2221554186081467, + "flos": 21475178951040.0, + "grad_norm": 1.7692850214061204, + "language_loss": 0.69924927, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.77758765, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21472168, + "step": 3695, + "time_per_iteration": 3.989173412322998 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01283487, + "balance_loss_clip": 0.0630603, + "balance_loss_mlp": 0.01262781, + "epoch": 0.22221554186081466, + "flos": 27680956041600.0, + "grad_norm": 1.7088419756312998, + "language_loss": 0.72215462, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.80035925, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20715332, + "step": 3696, + "time_per_iteration": 2.6339590549468994 + }, + { + "auxiliary_loss_clip": 0.06543445, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06307454, + "balance_loss_mlp": 0.01257825, + "epoch": 0.22227566511348265, + "flos": 25966202561280.0, + "grad_norm": 1.8417969407055101, + "language_loss": 0.88068652, + "learning_rate": 3.624555968803217e-06, + "loss": 0.95891678, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21740723, + "step": 3697, + "time_per_iteration": 2.5599191188812256 + }, + { + "auxiliary_loss_clip": 0.06533489, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01265042, + "epoch": 0.22233578836615062, + "flos": 39213240203520.0, + "grad_norm": 2.5935528152985867, + "language_loss": 0.6687606, + "learning_rate": 3.624328776493346e-06, + "loss": 0.74694455, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.1986084, + "step": 3698, + "time_per_iteration": 2.812140703201294 + }, + { + "auxiliary_loss_clip": 0.06546268, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 0.06307642, + "balance_loss_mlp": 0.01260216, + "epoch": 0.22239591161881858, + "flos": 36292682142720.0, + "grad_norm": 1.853195446284453, + "language_loss": 0.82990527, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.90819019, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22009277, + "step": 3699, + "time_per_iteration": 2.667423725128174 + }, + { + "auxiliary_loss_clip": 0.06537004, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 0.06302205, + "balance_loss_mlp": 0.01260014, + "epoch": 0.22245603487148655, + "flos": 19725779007360.0, + "grad_norm": 1.45021308141165, + "language_loss": 0.80335897, + "learning_rate": 3.62387420709809e-06, + "loss": 0.88154227, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21313477, + "step": 3700, + "time_per_iteration": 2.5526716709136963 + }, + { + "auxiliary_loss_clip": 0.06548695, + "auxiliary_loss_mlp": 0.01279557, + "balance_loss_clip": 0.06306358, + "balance_loss_mlp": 0.01257885, + "epoch": 0.2225161581241545, + "flos": 46290950081280.0, + "grad_norm": 3.047641549556173, + "language_loss": 0.73186177, + "learning_rate": 3.623646830029943e-06, + "loss": 0.81014431, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21655273, + "step": 3701, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.06535295, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06300849, + "balance_loss_mlp": 0.01259734, + "epoch": 0.22257628137682248, + "flos": 23702990181120.0, + "grad_norm": 4.404280219854046, + "language_loss": 0.80455184, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.88270885, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20666504, + "step": 3702, + "time_per_iteration": 2.5657999515533447 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01274253, + "balance_loss_clip": 0.06297488, + "balance_loss_mlp": 0.01253331, + "epoch": 0.22263640462949044, + "flos": 19359986008320.0, + "grad_norm": 3.4101413472023405, + "language_loss": 0.78629804, + "learning_rate": 3.623191891195723e-06, + "loss": 0.86428618, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20910645, + "step": 3703, + "time_per_iteration": 2.550189971923828 + }, + { + "auxiliary_loss_clip": 0.06541737, + "auxiliary_loss_mlp": 0.01279602, + "balance_loss_clip": 0.06300878, + "balance_loss_mlp": 0.01257084, + "epoch": 0.22269652788215843, + "flos": 20782138631040.0, + "grad_norm": 2.0986231414271828, + "language_loss": 0.75210625, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.83031964, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.22509766, + "step": 3704, + "time_per_iteration": 2.5540754795074463 + }, + { + "auxiliary_loss_clip": 0.06527826, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06299336, + "balance_loss_mlp": 0.01268682, + "epoch": 0.2227566511348264, + "flos": 47969631578880.0, + "grad_norm": 1.891044771341396, + "language_loss": 0.65108556, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.72925317, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20239258, + "step": 3705, + "time_per_iteration": 2.8109097480773926 + }, + { + "auxiliary_loss_clip": 0.06438605, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.0631493, + "balance_loss_mlp": 0.012611, + "epoch": 0.22281677438749437, + "flos": 66235676607360.0, + "grad_norm": 1.322453387614222, + "language_loss": 0.65218806, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.72923827, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.05322266, + "step": 3706, + "time_per_iteration": 3.059636354446411 + }, + { + "auxiliary_loss_clip": 0.06534128, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 0.06297205, + "balance_loss_mlp": 0.01266274, + "epoch": 0.22287689764016233, + "flos": 21878050181760.0, + "grad_norm": 2.374246987916323, + "language_loss": 0.80905002, + "learning_rate": 3.622281274977141e-06, + "loss": 0.88725626, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20202637, + "step": 3707, + "time_per_iteration": 2.5891129970550537 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01280313, + "balance_loss_clip": 0.06298505, + "balance_loss_mlp": 0.01257854, + "epoch": 0.2229370208928303, + "flos": 27679824011520.0, + "grad_norm": 1.802742500055583, + "language_loss": 0.79219007, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.87031698, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2244873, + "step": 3708, + "time_per_iteration": 2.5907180309295654 + }, + { + "auxiliary_loss_clip": 0.06539932, + "auxiliary_loss_mlp": 0.01293698, + "balance_loss_clip": 0.06300655, + "balance_loss_mlp": 0.01271525, + "epoch": 0.22299714414549826, + "flos": 30162612816000.0, + "grad_norm": 1.9019649120082793, + "language_loss": 0.81583631, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.89417267, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.22167969, + "step": 3709, + "time_per_iteration": 2.658768892288208 + }, + { + "auxiliary_loss_clip": 0.06540084, + "auxiliary_loss_mlp": 0.01295766, + "balance_loss_clip": 0.0630019, + "balance_loss_mlp": 0.01274464, + "epoch": 0.22305726739816625, + "flos": 23148871130880.0, + "grad_norm": 2.9556041497723236, + "language_loss": 0.69413233, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.77249086, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 3710, + "time_per_iteration": 2.603476047515869 + }, + { + "auxiliary_loss_clip": 0.06536471, + "auxiliary_loss_mlp": 0.01286054, + "balance_loss_clip": 0.06297636, + "balance_loss_mlp": 0.01264429, + "epoch": 0.22311739065083422, + "flos": 19178116721280.0, + "grad_norm": 2.184897161331363, + "language_loss": 0.91282266, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.99104792, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.21606445, + "step": 3711, + "time_per_iteration": 2.6093854904174805 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01298084, + "balance_loss_clip": 0.06302293, + "balance_loss_mlp": 0.01275911, + "epoch": 0.22317751390350218, + "flos": 13621467611520.0, + "grad_norm": 2.3638705243519142, + "language_loss": 0.89271343, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.97108901, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.22192383, + "step": 3712, + "time_per_iteration": 2.5170199871063232 + }, + { + "auxiliary_loss_clip": 0.06530519, + "auxiliary_loss_mlp": 0.01292247, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01271481, + "epoch": 0.22323763715617015, + "flos": 11032643064960.0, + "grad_norm": 2.927785991832361, + "language_loss": 0.74880064, + "learning_rate": 3.620913505310117e-06, + "loss": 0.82702827, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2076416, + "step": 3713, + "time_per_iteration": 2.521813154220581 + }, + { + "auxiliary_loss_clip": 0.06534518, + "auxiliary_loss_mlp": 0.0130023, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.01277556, + "epoch": 0.22329776040883811, + "flos": 41360647841280.0, + "grad_norm": 2.458794372685298, + "language_loss": 0.62675929, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.70510674, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22668457, + "step": 3714, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.06529912, + "auxiliary_loss_mlp": 0.01289936, + "balance_loss_clip": 0.06295826, + "balance_loss_mlp": 0.01267906, + "epoch": 0.22335788366150608, + "flos": 25126568323200.0, + "grad_norm": 1.757427072944695, + "language_loss": 0.79499549, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.87319398, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22009277, + "step": 3715, + "time_per_iteration": 2.571711301803589 + }, + { + "auxiliary_loss_clip": 0.06527971, + "auxiliary_loss_mlp": 0.01294287, + "balance_loss_clip": 0.06293058, + "balance_loss_mlp": 0.0127302, + "epoch": 0.22341800691417404, + "flos": 16989144658560.0, + "grad_norm": 1.5961840175356918, + "language_loss": 0.77329421, + "learning_rate": 3.620228790579645e-06, + "loss": 0.85151684, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21276855, + "step": 3716, + "time_per_iteration": 2.502037286758423 + }, + { + "auxiliary_loss_clip": 0.06529684, + "auxiliary_loss_mlp": 0.0129404, + "balance_loss_clip": 0.06297298, + "balance_loss_mlp": 0.01273977, + "epoch": 0.22347813016684204, + "flos": 14141904520320.0, + "grad_norm": 2.4369226344025665, + "language_loss": 0.80004126, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.87827849, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20068359, + "step": 3717, + "time_per_iteration": 2.5208563804626465 + }, + { + "auxiliary_loss_clip": 0.065373, + "auxiliary_loss_mlp": 0.01297317, + "balance_loss_clip": 0.06298472, + "balance_loss_mlp": 0.01275215, + "epoch": 0.22353825341951, + "flos": 23589323717760.0, + "grad_norm": 2.564573329936102, + "language_loss": 0.68781847, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.76616466, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22106934, + "step": 3718, + "time_per_iteration": 2.6491305828094482 + }, + { + "auxiliary_loss_clip": 0.06536659, + "auxiliary_loss_mlp": 0.01296292, + "balance_loss_clip": 0.06298986, + "balance_loss_mlp": 0.01271187, + "epoch": 0.22359837667217797, + "flos": 29831759769600.0, + "grad_norm": 1.515297493499622, + "language_loss": 0.80957985, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88790929, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25085449, + "step": 3719, + "time_per_iteration": 2.6334550380706787 + }, + { + "auxiliary_loss_clip": 0.06540611, + "auxiliary_loss_mlp": 0.01300766, + "balance_loss_clip": 0.06299402, + "balance_loss_mlp": 0.01276793, + "epoch": 0.22365849992484593, + "flos": 17608867056000.0, + "grad_norm": 2.352033480486632, + "language_loss": 0.87360144, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.95201522, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.23962402, + "step": 3720, + "time_per_iteration": 2.5415003299713135 + }, + { + "auxiliary_loss_clip": 0.06526608, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 0.06295964, + "balance_loss_mlp": 0.01271672, + "epoch": 0.2237186231775139, + "flos": 22717558638720.0, + "grad_norm": 1.8478771577440833, + "language_loss": 0.75151736, + "learning_rate": 3.619086370692945e-06, + "loss": 0.8297134, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21325684, + "step": 3721, + "time_per_iteration": 2.548450469970703 + }, + { + "auxiliary_loss_clip": 0.06540586, + "auxiliary_loss_mlp": 0.0129148, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01269105, + "epoch": 0.22377874643018186, + "flos": 13376720234880.0, + "grad_norm": 2.2094798322640736, + "language_loss": 0.79352558, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.87184626, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.22375488, + "step": 3722, + "time_per_iteration": 2.519277572631836 + }, + { + "auxiliary_loss_clip": 0.06531984, + "auxiliary_loss_mlp": 0.01288897, + "balance_loss_clip": 0.06299505, + "balance_loss_mlp": 0.01267964, + "epoch": 0.22383886968284986, + "flos": 17900797080960.0, + "grad_norm": 2.2930078409484196, + "language_loss": 0.83410442, + "learning_rate": 3.618628972906178e-06, + "loss": 0.91231328, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20922852, + "step": 3723, + "time_per_iteration": 2.5086076259613037 + }, + { + "auxiliary_loss_clip": 0.06544059, + "auxiliary_loss_mlp": 0.01285781, + "balance_loss_clip": 0.06305651, + "balance_loss_mlp": 0.01263834, + "epoch": 0.22389899293551782, + "flos": 23886033425280.0, + "grad_norm": 4.429276920778782, + "language_loss": 0.84606177, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.92436016, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.21960449, + "step": 3724, + "time_per_iteration": 2.574178695678711 + }, + { + "auxiliary_loss_clip": 0.06534179, + "auxiliary_loss_mlp": 0.01287846, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01267211, + "epoch": 0.2239591161881858, + "flos": 27279929600640.0, + "grad_norm": 1.978846940821608, + "language_loss": 0.79885381, + "learning_rate": 3.618171329605121e-06, + "loss": 0.87707412, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.20617676, + "step": 3725, + "time_per_iteration": 2.589184522628784 + }, + { + "auxiliary_loss_clip": 0.06541407, + "auxiliary_loss_mlp": 0.01289084, + "balance_loss_clip": 0.06307919, + "balance_loss_mlp": 0.01267197, + "epoch": 0.22401923944085375, + "flos": 22243423910400.0, + "grad_norm": 1.7178260071510263, + "language_loss": 0.78001326, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.85831815, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21875, + "step": 3726, + "time_per_iteration": 3.980494976043701 + }, + { + "auxiliary_loss_clip": 0.06552388, + "auxiliary_loss_mlp": 0.01297244, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.01272175, + "epoch": 0.22407936269352172, + "flos": 12057920023680.0, + "grad_norm": 3.478702992871699, + "language_loss": 0.73437679, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.81287301, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25097656, + "step": 3727, + "time_per_iteration": 2.4799015522003174 + }, + { + "auxiliary_loss_clip": 0.06549139, + "auxiliary_loss_mlp": 0.01296668, + "balance_loss_clip": 0.06309944, + "balance_loss_mlp": 0.0127341, + "epoch": 0.22413948594618968, + "flos": 19359482883840.0, + "grad_norm": 2.179866459674304, + "language_loss": 0.8799302, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.95838827, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23254395, + "step": 3728, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.0653842, + "auxiliary_loss_mlp": 0.01294185, + "balance_loss_clip": 0.06303863, + "balance_loss_mlp": 0.0126989, + "epoch": 0.22419960919885765, + "flos": 24176789493120.0, + "grad_norm": 1.9160734665449493, + "language_loss": 0.80446088, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.88278687, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24328613, + "step": 3729, + "time_per_iteration": 4.021615266799927 + }, + { + "auxiliary_loss_clip": 0.06533324, + "auxiliary_loss_mlp": 0.01292111, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01271237, + "epoch": 0.22425973245152564, + "flos": 27386007269760.0, + "grad_norm": 1.6841051152750983, + "language_loss": 0.87170112, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.94995546, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2088623, + "step": 3730, + "time_per_iteration": 2.598576307296753 + }, + { + "auxiliary_loss_clip": 0.0653019, + "auxiliary_loss_mlp": 0.01298076, + "balance_loss_clip": 0.06300467, + "balance_loss_mlp": 0.01276403, + "epoch": 0.2243198557041936, + "flos": 13740794225280.0, + "grad_norm": 2.088554635044429, + "language_loss": 0.73449922, + "learning_rate": 3.616796927310559e-06, + "loss": 0.81278187, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21655273, + "step": 3731, + "time_per_iteration": 2.5361716747283936 + }, + { + "auxiliary_loss_clip": 0.06541456, + "auxiliary_loss_mlp": 0.01292681, + "balance_loss_clip": 0.06301124, + "balance_loss_mlp": 0.01267933, + "epoch": 0.22437997895686157, + "flos": 19535775874560.0, + "grad_norm": 5.172507402775724, + "language_loss": 0.75803339, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.83637482, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.24755859, + "step": 3732, + "time_per_iteration": 2.5423076152801514 + }, + { + "auxiliary_loss_clip": 0.06533462, + "auxiliary_loss_mlp": 0.01296517, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01273664, + "epoch": 0.22444010220952954, + "flos": 23703032108160.0, + "grad_norm": 1.6752991374876018, + "language_loss": 0.89338291, + "learning_rate": 3.616338302646873e-06, + "loss": 0.97168273, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2286377, + "step": 3733, + "time_per_iteration": 4.021088123321533 + }, + { + "auxiliary_loss_clip": 0.065323, + "auxiliary_loss_mlp": 0.01294952, + "balance_loss_clip": 0.06298727, + "balance_loss_mlp": 0.01270193, + "epoch": 0.2245002254621975, + "flos": 22389514704000.0, + "grad_norm": 1.4651206016819107, + "language_loss": 0.85422146, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.93249398, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.24780273, + "step": 3734, + "time_per_iteration": 2.5562949180603027 + }, + { + "auxiliary_loss_clip": 0.06539299, + "auxiliary_loss_mlp": 0.01283537, + "balance_loss_clip": 0.06303868, + "balance_loss_mlp": 0.01261113, + "epoch": 0.22456034871486547, + "flos": 26949453897600.0, + "grad_norm": 1.579737554219585, + "language_loss": 0.77332962, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.85155803, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22436523, + "step": 3735, + "time_per_iteration": 4.016703367233276 + }, + { + "auxiliary_loss_clip": 0.06526705, + "auxiliary_loss_mlp": 0.01290552, + "balance_loss_clip": 0.06298478, + "balance_loss_mlp": 0.01269559, + "epoch": 0.22462047196753343, + "flos": 28990700012160.0, + "grad_norm": 1.885472064442235, + "language_loss": 0.84907603, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92724866, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.21008301, + "step": 3736, + "time_per_iteration": 2.6118290424346924 + }, + { + "auxiliary_loss_clip": 0.06536424, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.0630133, + "balance_loss_mlp": 0.01261922, + "epoch": 0.22468059522020142, + "flos": 20017541324160.0, + "grad_norm": 1.5290746464045628, + "language_loss": 0.87103891, + "learning_rate": 3.615420317888586e-06, + "loss": 0.94926155, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23913574, + "step": 3737, + "time_per_iteration": 2.5211808681488037 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288351, + "balance_loss_clip": 0.06294889, + "balance_loss_mlp": 0.01263949, + "epoch": 0.2247407184728694, + "flos": 29321846547840.0, + "grad_norm": 1.8581473098744326, + "language_loss": 0.80131769, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87954295, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24389648, + "step": 3738, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.06530435, + "auxiliary_loss_mlp": 0.01285051, + "balance_loss_clip": 0.06293893, + "balance_loss_mlp": 0.01263295, + "epoch": 0.22480084172553735, + "flos": 22317035322240.0, + "grad_norm": 1.7432458267253939, + "language_loss": 0.77190316, + "learning_rate": 3.614960957933224e-06, + "loss": 0.85005802, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.21728516, + "step": 3739, + "time_per_iteration": 2.540266275405884 + }, + { + "auxiliary_loss_clip": 0.06531328, + "auxiliary_loss_mlp": 0.01283134, + "balance_loss_clip": 0.06295189, + "balance_loss_mlp": 0.01260091, + "epoch": 0.22486096497820532, + "flos": 25598019720960.0, + "grad_norm": 4.441094103460663, + "language_loss": 0.74799633, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.82614094, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.23022461, + "step": 3740, + "time_per_iteration": 2.640592575073242 + }, + { + "auxiliary_loss_clip": 0.06520827, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06289122, + "balance_loss_mlp": 0.01256681, + "epoch": 0.22492108823087328, + "flos": 17645651798400.0, + "grad_norm": 2.0040821388775285, + "language_loss": 0.75983584, + "learning_rate": 3.614501353019939e-06, + "loss": 0.83783156, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.22070312, + "step": 3741, + "time_per_iteration": 2.513965129852295 + }, + { + "auxiliary_loss_clip": 0.06526901, + "auxiliary_loss_mlp": 0.01283674, + "balance_loss_clip": 0.06296658, + "balance_loss_mlp": 0.0126224, + "epoch": 0.22498121148354125, + "flos": 16040246296320.0, + "grad_norm": 1.702368757801579, + "language_loss": 0.87747514, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.95558089, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21435547, + "step": 3742, + "time_per_iteration": 2.5164167881011963 + }, + { + "auxiliary_loss_clip": 0.0652426, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01259389, + "epoch": 0.22504133473620924, + "flos": 24030489064320.0, + "grad_norm": 1.7109022824395175, + "language_loss": 0.82010657, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.22473145, + "step": 3743, + "time_per_iteration": 2.5486276149749756 + }, + { + "auxiliary_loss_clip": 0.06524298, + "auxiliary_loss_mlp": 0.0127565, + "balance_loss_clip": 0.06291372, + "balance_loss_mlp": 0.01254562, + "epoch": 0.2251014579888772, + "flos": 16769610161280.0, + "grad_norm": 2.126207867209009, + "language_loss": 0.64185399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7198534, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2109375, + "step": 3744, + "time_per_iteration": 2.535020351409912 + }, + { + "auxiliary_loss_clip": 0.06527244, + "auxiliary_loss_mlp": 0.01277496, + "balance_loss_clip": 0.06293654, + "balance_loss_mlp": 0.01256372, + "epoch": 0.22516158124154517, + "flos": 13996191070080.0, + "grad_norm": 3.1643825534304684, + "language_loss": 0.76886272, + "learning_rate": 3.613581408598489e-06, + "loss": 0.84691012, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21105957, + "step": 3745, + "time_per_iteration": 2.5233495235443115 + }, + { + "auxiliary_loss_clip": 0.06522205, + "auxiliary_loss_mlp": 0.01281406, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01260675, + "epoch": 0.22522170449421314, + "flos": 14394869596800.0, + "grad_norm": 1.6969236990578618, + "language_loss": 0.80721819, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88525426, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20739746, + "step": 3746, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.06533524, + "auxiliary_loss_mlp": 0.01280566, + "balance_loss_clip": 0.06296681, + "balance_loss_mlp": 0.0125881, + "epoch": 0.2252818277468811, + "flos": 23812338159360.0, + "grad_norm": 2.077776202364112, + "language_loss": 0.86226261, + "learning_rate": 3.613121069229862e-06, + "loss": 0.94040346, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21765137, + "step": 3747, + "time_per_iteration": 2.5834550857543945 + }, + { + "auxiliary_loss_clip": 0.06530412, + "auxiliary_loss_mlp": 0.01275087, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01255095, + "epoch": 0.22534195099954907, + "flos": 24725038757760.0, + "grad_norm": 1.8595393434505574, + "language_loss": 0.76982796, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.84788299, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.1998291, + "step": 3748, + "time_per_iteration": 2.5877788066864014 + }, + { + "auxiliary_loss_clip": 0.0652978, + "auxiliary_loss_mlp": 0.0128313, + "balance_loss_clip": 0.06296694, + "balance_loss_mlp": 0.01261768, + "epoch": 0.22540207425221703, + "flos": 21038625578880.0, + "grad_norm": 1.5282192474331018, + "language_loss": 0.80547005, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.88359916, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.21374512, + "step": 3749, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06526259, + "auxiliary_loss_mlp": 0.01273546, + "balance_loss_clip": 0.06298405, + "balance_loss_mlp": 0.01253698, + "epoch": 0.22546219750488503, + "flos": 19396351480320.0, + "grad_norm": 1.5225090015602234, + "language_loss": 0.80070651, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.87870455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19848633, + "step": 3750, + "time_per_iteration": 2.524614095687866 + }, + { + "auxiliary_loss_clip": 0.06532078, + "auxiliary_loss_mlp": 0.01279372, + "balance_loss_clip": 0.06297495, + "balance_loss_mlp": 0.01258117, + "epoch": 0.225522320757553, + "flos": 25199760464640.0, + "grad_norm": 5.336084937176506, + "language_loss": 0.8300491, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.90816361, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21264648, + "step": 3751, + "time_per_iteration": 2.5638771057128906 + }, + { + "auxiliary_loss_clip": 0.06527963, + "auxiliary_loss_mlp": 0.01280546, + "balance_loss_clip": 0.06296829, + "balance_loss_mlp": 0.01260149, + "epoch": 0.22558244401022096, + "flos": 17168456396160.0, + "grad_norm": 1.7246902184661286, + "language_loss": 0.8427825, + "learning_rate": 3.611969150491165e-06, + "loss": 0.92086762, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20385742, + "step": 3752, + "time_per_iteration": 2.5650362968444824 + }, + { + "auxiliary_loss_clip": 0.06527157, + "auxiliary_loss_mlp": 0.01275092, + "balance_loss_clip": 0.06298538, + "balance_loss_mlp": 0.01254839, + "epoch": 0.22564256726288892, + "flos": 15236306697600.0, + "grad_norm": 1.7312534305272433, + "language_loss": 0.78620666, + "learning_rate": 3.611738583330375e-06, + "loss": 0.8642292, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20251465, + "step": 3753, + "time_per_iteration": 2.510344982147217 + }, + { + "auxiliary_loss_clip": 0.06525348, + "auxiliary_loss_mlp": 0.01279816, + "balance_loss_clip": 0.06296748, + "balance_loss_mlp": 0.01257869, + "epoch": 0.2257026905155569, + "flos": 34577215902720.0, + "grad_norm": 1.9706921359503449, + "language_loss": 0.79448152, + "learning_rate": 3.611507955052295e-06, + "loss": 0.8725332, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21948242, + "step": 3754, + "time_per_iteration": 2.6429665088653564 + }, + { + "auxiliary_loss_clip": 0.06526577, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06299241, + "balance_loss_mlp": 0.01259835, + "epoch": 0.22576281376822485, + "flos": 19944642672000.0, + "grad_norm": 1.7667035857085684, + "language_loss": 0.70640147, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.78447914, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.21374512, + "step": 3755, + "time_per_iteration": 2.5482447147369385 + }, + { + "auxiliary_loss_clip": 0.06530152, + "auxiliary_loss_mlp": 0.01282078, + "balance_loss_clip": 0.06295566, + "balance_loss_mlp": 0.01261085, + "epoch": 0.22582293702089282, + "flos": 24607892350080.0, + "grad_norm": 2.6955819116528588, + "language_loss": 0.77899122, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.85711348, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21008301, + "step": 3756, + "time_per_iteration": 2.573639392852783 + }, + { + "auxiliary_loss_clip": 0.06536651, + "auxiliary_loss_mlp": 0.01278842, + "balance_loss_clip": 0.0629873, + "balance_loss_mlp": 0.01255394, + "epoch": 0.2258830602735608, + "flos": 23041451796480.0, + "grad_norm": 2.9460656412940405, + "language_loss": 0.82867002, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.90682495, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23461914, + "step": 3757, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06538612, + "auxiliary_loss_mlp": 0.01279229, + "balance_loss_clip": 0.06302969, + "balance_loss_mlp": 0.01257164, + "epoch": 0.22594318352622877, + "flos": 22164068494080.0, + "grad_norm": 3.099441845199118, + "language_loss": 0.73941171, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.81759018, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2208252, + "step": 3758, + "time_per_iteration": 2.506148099899292 + }, + { + "auxiliary_loss_clip": 0.06531477, + "auxiliary_loss_mlp": 0.01288595, + "balance_loss_clip": 0.06296086, + "balance_loss_mlp": 0.01266816, + "epoch": 0.22600330677889674, + "flos": 20600478979200.0, + "grad_norm": 2.4125098710516117, + "language_loss": 0.77881908, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.85701978, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21777344, + "step": 3759, + "time_per_iteration": 2.5171775817871094 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288917, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01266267, + "epoch": 0.2260634300315647, + "flos": 35667970427520.0, + "grad_norm": 1.6851914496917324, + "language_loss": 0.7921207, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.87035167, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.22644043, + "step": 3760, + "time_per_iteration": 2.6410677433013916 + }, + { + "auxiliary_loss_clip": 0.06433272, + "auxiliary_loss_mlp": 0.01258557, + "balance_loss_clip": 0.06311189, + "balance_loss_mlp": 0.01252156, + "epoch": 0.22612355328423267, + "flos": 72107707685760.0, + "grad_norm": 0.875668320300708, + "language_loss": 0.60230321, + "learning_rate": 3.609891846556569e-06, + "loss": 0.67922151, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06408691, + "step": 3761, + "time_per_iteration": 3.1083786487579346 + }, + { + "auxiliary_loss_clip": 0.06545433, + "auxiliary_loss_mlp": 0.01288291, + "balance_loss_clip": 0.06303856, + "balance_loss_mlp": 0.01267012, + "epoch": 0.22618367653690064, + "flos": 22790373436800.0, + "grad_norm": 3.0022983434583783, + "language_loss": 0.77876461, + "learning_rate": 3.609660729655211e-06, + "loss": 0.8571018, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21289062, + "step": 3762, + "time_per_iteration": 2.5256128311157227 + }, + { + "auxiliary_loss_clip": 0.06531228, + "auxiliary_loss_mlp": 0.01280361, + "balance_loss_clip": 0.06294668, + "balance_loss_mlp": 0.01258343, + "epoch": 0.22624379978956863, + "flos": 20454388185600.0, + "grad_norm": 1.959767281760525, + "language_loss": 0.79828411, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.87639999, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22033691, + "step": 3763, + "time_per_iteration": 2.528965950012207 + }, + { + "auxiliary_loss_clip": 0.06540731, + "auxiliary_loss_mlp": 0.01291635, + "balance_loss_clip": 0.06300753, + "balance_loss_mlp": 0.01268949, + "epoch": 0.2263039230422366, + "flos": 17500189910400.0, + "grad_norm": 1.5800574189561347, + "language_loss": 0.91907668, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99740022, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22705078, + "step": 3764, + "time_per_iteration": 2.5012450218200684 + }, + { + "auxiliary_loss_clip": 0.06527007, + "auxiliary_loss_mlp": 0.01291683, + "balance_loss_clip": 0.06295396, + "balance_loss_mlp": 0.01271001, + "epoch": 0.22636404629490456, + "flos": 28337295473280.0, + "grad_norm": 3.379650672619254, + "language_loss": 0.75542498, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.83361191, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20690918, + "step": 3765, + "time_per_iteration": 2.6149775981903076 + }, + { + "auxiliary_loss_clip": 0.06519896, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01268256, + "epoch": 0.22642416954757252, + "flos": 17494152416640.0, + "grad_norm": 2.1325205607667526, + "language_loss": 0.90732884, + "learning_rate": 3.608735651752494e-06, + "loss": 0.98543364, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22338867, + "step": 3766, + "time_per_iteration": 3.925321340560913 + }, + { + "auxiliary_loss_clip": 0.06520344, + "auxiliary_loss_mlp": 0.01279841, + "balance_loss_clip": 0.0629393, + "balance_loss_mlp": 0.0125756, + "epoch": 0.2264842928002405, + "flos": 24390621912960.0, + "grad_norm": 1.5335844294501488, + "language_loss": 0.74866152, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22290039, + "step": 3767, + "time_per_iteration": 2.585827589035034 + }, + { + "auxiliary_loss_clip": 0.06526411, + "auxiliary_loss_mlp": 0.01285323, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01262816, + "epoch": 0.22654441605290845, + "flos": 19836971775360.0, + "grad_norm": 1.5156609478299474, + "language_loss": 0.72064531, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.79876268, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.22521973, + "step": 3768, + "time_per_iteration": 3.9932377338409424 + }, + { + "auxiliary_loss_clip": 0.06525982, + "auxiliary_loss_mlp": 0.01291355, + "balance_loss_clip": 0.06294759, + "balance_loss_mlp": 0.01268347, + "epoch": 0.22660453930557642, + "flos": 27462050449920.0, + "grad_norm": 1.8227506475765343, + "language_loss": 0.78781188, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.86598527, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.22998047, + "step": 3769, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.06531481, + "auxiliary_loss_mlp": 0.01287446, + "balance_loss_clip": 0.06292526, + "balance_loss_mlp": 0.01265428, + "epoch": 0.2266646625582444, + "flos": 23995004060160.0, + "grad_norm": 2.604534401291856, + "language_loss": 0.69374454, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22021484, + "step": 3770, + "time_per_iteration": 2.6160407066345215 + }, + { + "auxiliary_loss_clip": 0.065291, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06292273, + "balance_loss_mlp": 0.01269077, + "epoch": 0.22672478581091238, + "flos": 26034698874240.0, + "grad_norm": 1.4830972618629188, + "language_loss": 0.8083868, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88657784, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20922852, + "step": 3771, + "time_per_iteration": 2.576948642730713 + }, + { + "auxiliary_loss_clip": 0.06521479, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012613, + "epoch": 0.22678490906358034, + "flos": 23848577850240.0, + "grad_norm": 1.5694676435300003, + "language_loss": 0.79189658, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86994874, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22436523, + "step": 3772, + "time_per_iteration": 4.012827396392822 + }, + { + "auxiliary_loss_clip": 0.06410234, + "auxiliary_loss_mlp": 0.01286376, + "balance_loss_clip": 0.06287075, + "balance_loss_mlp": 0.01280571, + "epoch": 0.2268450323162483, + "flos": 65070163912320.0, + "grad_norm": 0.6415690360853892, + "language_loss": 0.53899318, + "learning_rate": 3.607114417129261e-06, + "loss": 0.61595929, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.0579834, + "step": 3773, + "time_per_iteration": 3.249551773071289 + }, + { + "auxiliary_loss_clip": 0.06526346, + "auxiliary_loss_mlp": 0.01287624, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01266238, + "epoch": 0.22690515556891627, + "flos": 22532251334400.0, + "grad_norm": 1.8359701531623327, + "language_loss": 0.70997107, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.78811073, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21386719, + "step": 3774, + "time_per_iteration": 2.558279275894165 + }, + { + "auxiliary_loss_clip": 0.06521672, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06291246, + "balance_loss_mlp": 0.01266857, + "epoch": 0.22696527882158424, + "flos": 18229344140160.0, + "grad_norm": 2.047907778931267, + "language_loss": 0.75449002, + "learning_rate": 3.606650658627658e-06, + "loss": 0.83258545, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21008301, + "step": 3775, + "time_per_iteration": 3.928666353225708 + }, + { + "auxiliary_loss_clip": 0.06524701, + "auxiliary_loss_mlp": 0.01286732, + "balance_loss_clip": 0.06292307, + "balance_loss_mlp": 0.01266168, + "epoch": 0.22702540207425223, + "flos": 17024923152000.0, + "grad_norm": 2.031895062113734, + "language_loss": 0.82818532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.90629965, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20581055, + "step": 3776, + "time_per_iteration": 2.5941483974456787 + }, + { + "auxiliary_loss_clip": 0.06528914, + "auxiliary_loss_mlp": 0.01279846, + "balance_loss_clip": 0.06293055, + "balance_loss_mlp": 0.01259222, + "epoch": 0.2270855253269202, + "flos": 21332316539520.0, + "grad_norm": 1.645158938946052, + "language_loss": 0.83362442, + "learning_rate": 3.606186656428641e-06, + "loss": 0.91171205, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20617676, + "step": 3777, + "time_per_iteration": 2.5177228450775146 + }, + { + "auxiliary_loss_clip": 0.06532624, + "auxiliary_loss_mlp": 0.01278936, + "balance_loss_clip": 0.06296799, + "balance_loss_mlp": 0.01257002, + "epoch": 0.22714564857958816, + "flos": 23557276730880.0, + "grad_norm": 1.8837878269403912, + "language_loss": 0.73246169, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.81057739, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21948242, + "step": 3778, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.06530988, + "auxiliary_loss_mlp": 0.01275867, + "balance_loss_clip": 0.06293572, + "balance_loss_mlp": 0.01255673, + "epoch": 0.22720577183225613, + "flos": 25996237050240.0, + "grad_norm": 2.9659284448048555, + "language_loss": 0.65779513, + "learning_rate": 3.605722410602591e-06, + "loss": 0.73586369, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20178223, + "step": 3779, + "time_per_iteration": 2.543818950653076 + }, + { + "auxiliary_loss_clip": 0.06525169, + "auxiliary_loss_mlp": 0.01276701, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.01255982, + "epoch": 0.2272658950849241, + "flos": 20820432746880.0, + "grad_norm": 1.7825989229768946, + "language_loss": 0.70823693, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7862556, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20703125, + "step": 3780, + "time_per_iteration": 2.558850049972534 + }, + { + "auxiliary_loss_clip": 0.06528573, + "auxiliary_loss_mlp": 0.01280577, + "balance_loss_clip": 0.06296494, + "balance_loss_mlp": 0.01257927, + "epoch": 0.22732601833759206, + "flos": 23915187446400.0, + "grad_norm": 1.6463040629853982, + "language_loss": 0.89639765, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97448915, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2265625, + "step": 3781, + "time_per_iteration": 2.527230739593506 + }, + { + "auxiliary_loss_clip": 0.06532317, + "auxiliary_loss_mlp": 0.01280346, + "balance_loss_clip": 0.06296034, + "balance_loss_mlp": 0.01257672, + "epoch": 0.22738614159026002, + "flos": 15929850142080.0, + "grad_norm": 2.4692396393453016, + "language_loss": 0.75309098, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.83121765, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2265625, + "step": 3782, + "time_per_iteration": 2.4901020526885986 + }, + { + "auxiliary_loss_clip": 0.06532567, + "auxiliary_loss_mlp": 0.01278379, + "balance_loss_clip": 0.06300219, + "balance_loss_mlp": 0.01257959, + "epoch": 0.22744626484292801, + "flos": 24212148716160.0, + "grad_norm": 1.7681967435875452, + "language_loss": 0.8314634, + "learning_rate": 3.604793188351095e-06, + "loss": 0.90957284, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20422363, + "step": 3783, + "time_per_iteration": 2.559361696243286 + }, + { + "auxiliary_loss_clip": 0.06539755, + "auxiliary_loss_mlp": 0.0128451, + "balance_loss_clip": 0.06305835, + "balance_loss_mlp": 0.01262266, + "epoch": 0.22750638809559598, + "flos": 24798734023680.0, + "grad_norm": 1.794476113807414, + "language_loss": 0.76757884, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8458215, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22229004, + "step": 3784, + "time_per_iteration": 2.6693339347839355 + }, + { + "auxiliary_loss_clip": 0.06533188, + "auxiliary_loss_mlp": 0.012806, + "balance_loss_clip": 0.06299379, + "balance_loss_mlp": 0.01257998, + "epoch": 0.22756651134826394, + "flos": 22243004640000.0, + "grad_norm": 1.5985438146538498, + "language_loss": 0.71667248, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79481035, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22583008, + "step": 3785, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.06421004, + "auxiliary_loss_mlp": 0.0127133, + "balance_loss_clip": 0.0629871, + "balance_loss_mlp": 0.01265915, + "epoch": 0.2276266346009319, + "flos": 62728225021440.0, + "grad_norm": 1.545506426452605, + "language_loss": 0.63058448, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.70750785, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05422974, + "step": 3786, + "time_per_iteration": 3.1247661113739014 + }, + { + "auxiliary_loss_clip": 0.06538717, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06302891, + "balance_loss_mlp": 0.01254299, + "epoch": 0.22768675785359987, + "flos": 18618085958400.0, + "grad_norm": 2.466113986800572, + "language_loss": 0.8751514, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.95331335, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23156738, + "step": 3787, + "time_per_iteration": 2.488539457321167 + }, + { + "auxiliary_loss_clip": 0.06537791, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305036, + "balance_loss_mlp": 0.01259488, + "epoch": 0.22774688110626784, + "flos": 26877477640320.0, + "grad_norm": 2.053207704033697, + "language_loss": 0.73054254, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.80872202, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20678711, + "step": 3788, + "time_per_iteration": 2.5763657093048096 + }, + { + "auxiliary_loss_clip": 0.06534025, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 0.06303776, + "balance_loss_mlp": 0.01260971, + "epoch": 0.2278070043589358, + "flos": 15557977722240.0, + "grad_norm": 4.57361945380841, + "language_loss": 0.68007839, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.75824702, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21850586, + "step": 3789, + "time_per_iteration": 2.4907443523406982 + }, + { + "auxiliary_loss_clip": 0.0653897, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 0.06308074, + "balance_loss_mlp": 0.0126115, + "epoch": 0.2278671276116038, + "flos": 22422987210240.0, + "grad_norm": 2.4388022002275243, + "language_loss": 0.76775718, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.84598166, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22338867, + "step": 3790, + "time_per_iteration": 2.5787651538848877 + }, + { + "auxiliary_loss_clip": 0.06540109, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06309578, + "balance_loss_mlp": 0.01259252, + "epoch": 0.22792725086427176, + "flos": 20637641064960.0, + "grad_norm": 1.9300771626575046, + "language_loss": 0.91910696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.99733061, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.23010254, + "step": 3791, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.06538808, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 0.06302848, + "balance_loss_mlp": 0.01256893, + "epoch": 0.22798737411693973, + "flos": 31436662147200.0, + "grad_norm": 1.9637481556258098, + "language_loss": 0.83064067, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.9088037, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20617676, + "step": 3792, + "time_per_iteration": 2.6190388202667236 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 0.06289717, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2280474973696077, + "flos": 52412074220160.0, + "grad_norm": 1.1033671526650368, + "language_loss": 0.65792358, + "learning_rate": 3.602465874182981e-06, + "loss": 0.73471832, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05432129, + "step": 3793, + "time_per_iteration": 2.9110665321350098 + }, + { + "auxiliary_loss_clip": 0.0654863, + "auxiliary_loss_mlp": 0.01287304, + "balance_loss_clip": 0.06306019, + "balance_loss_mlp": 0.01261889, + "epoch": 0.22810762062227566, + "flos": 26403300984960.0, + "grad_norm": 1.9908643306499119, + "language_loss": 0.78207439, + "learning_rate": 3.602232808409293e-06, + "loss": 0.8604337, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.25415039, + "step": 3794, + "time_per_iteration": 2.5911734104156494 + }, + { + "auxiliary_loss_clip": 0.06544799, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 0.06310074, + "balance_loss_mlp": 0.01262412, + "epoch": 0.22816774387494362, + "flos": 25637445866880.0, + "grad_norm": 3.443157636284035, + "language_loss": 0.81285226, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89115357, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22912598, + "step": 3795, + "time_per_iteration": 2.6825528144836426 + }, + { + "auxiliary_loss_clip": 0.06536914, + "auxiliary_loss_mlp": 0.0128896, + "balance_loss_clip": 0.06306744, + "balance_loss_mlp": 0.01267586, + "epoch": 0.22822786712761162, + "flos": 22457507892480.0, + "grad_norm": 1.703568435651106, + "language_loss": 0.77948368, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.85774243, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21362305, + "step": 3796, + "time_per_iteration": 2.5418922901153564 + }, + { + "auxiliary_loss_clip": 0.06535624, + "auxiliary_loss_mlp": 0.01278994, + "balance_loss_clip": 0.06302401, + "balance_loss_mlp": 0.01258692, + "epoch": 0.22828799038027958, + "flos": 12207323053440.0, + "grad_norm": 2.5041816771456076, + "language_loss": 0.96305406, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.04120016, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20324707, + "step": 3797, + "time_per_iteration": 2.5794107913970947 + }, + { + "auxiliary_loss_clip": 0.06537494, + "auxiliary_loss_mlp": 0.01281478, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01260057, + "epoch": 0.22834811363294755, + "flos": 22091379477120.0, + "grad_norm": 1.517581709018558, + "language_loss": 0.82277977, + "learning_rate": 3.601299937834666e-06, + "loss": 0.90096951, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2142334, + "step": 3798, + "time_per_iteration": 2.618784189224243 + }, + { + "auxiliary_loss_clip": 0.06536907, + "auxiliary_loss_mlp": 0.01279844, + "balance_loss_clip": 0.06300005, + "balance_loss_mlp": 0.01257146, + "epoch": 0.2284082368856155, + "flos": 24867104555520.0, + "grad_norm": 1.8603662335211264, + "language_loss": 0.79381669, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.87198418, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22705078, + "step": 3799, + "time_per_iteration": 2.591053009033203 + }, + { + "auxiliary_loss_clip": 0.06534393, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06300979, + "balance_loss_mlp": 0.01258646, + "epoch": 0.22846836013828348, + "flos": 23299280409600.0, + "grad_norm": 1.5152328596048934, + "language_loss": 0.75782096, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.83597749, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22619629, + "step": 3800, + "time_per_iteration": 2.5370395183563232 + }, + { + "auxiliary_loss_clip": 0.06535068, + "auxiliary_loss_mlp": 0.01279113, + "balance_loss_clip": 0.06302812, + "balance_loss_mlp": 0.01258001, + "epoch": 0.22852848339095144, + "flos": 27423462844800.0, + "grad_norm": 1.9420817073182375, + "language_loss": 0.64685607, + "learning_rate": 3.600599647297484e-06, + "loss": 0.72499788, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21105957, + "step": 3801, + "time_per_iteration": 2.6190593242645264 + }, + { + "auxiliary_loss_clip": 0.06524718, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06296816, + "balance_loss_mlp": 0.01257835, + "epoch": 0.2285886066436194, + "flos": 26328054418560.0, + "grad_norm": 1.6808395254049295, + "language_loss": 0.81957126, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89760411, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20727539, + "step": 3802, + "time_per_iteration": 2.554079055786133 + }, + { + "auxiliary_loss_clip": 0.06534229, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06299631, + "balance_loss_mlp": 0.0126415, + "epoch": 0.2286487298962874, + "flos": 29724298508160.0, + "grad_norm": 1.6760491170738747, + "language_loss": 0.79838073, + "learning_rate": 3.600132483450114e-06, + "loss": 0.87659228, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22790527, + "step": 3803, + "time_per_iteration": 2.6287641525268555 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01279074, + "balance_loss_clip": 0.06296768, + "balance_loss_mlp": 0.012559, + "epoch": 0.22870885314895537, + "flos": 21293435445120.0, + "grad_norm": 1.7238152987334623, + "language_loss": 0.86273003, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94087803, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.23168945, + "step": 3804, + "time_per_iteration": 2.511462450027466 + }, + { + "auxiliary_loss_clip": 0.06539486, + "auxiliary_loss_mlp": 0.01279472, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01257537, + "epoch": 0.22876897640162333, + "flos": 14944754016000.0, + "grad_norm": 1.89266353651555, + "language_loss": 0.76854289, + "learning_rate": 3.59966507689401e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21923828, + "step": 3805, + "time_per_iteration": 3.929358959197998 + }, + { + "auxiliary_loss_clip": 0.0654166, + "auxiliary_loss_mlp": 0.01280204, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257542, + "epoch": 0.2288290996542913, + "flos": 18119786526720.0, + "grad_norm": 2.0123502787071073, + "language_loss": 0.79403114, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.87224978, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.22680664, + "step": 3806, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.06540429, + "auxiliary_loss_mlp": 0.01282432, + "balance_loss_clip": 0.06303287, + "balance_loss_mlp": 0.01259878, + "epoch": 0.22888922290695926, + "flos": 39864296828160.0, + "grad_norm": 1.8839046523975558, + "language_loss": 0.70310783, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.78133643, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22546387, + "step": 3807, + "time_per_iteration": 4.134840488433838 + }, + { + "auxiliary_loss_clip": 0.06550615, + "auxiliary_loss_mlp": 0.01290274, + "balance_loss_clip": 0.06307966, + "balance_loss_mlp": 0.01265121, + "epoch": 0.22894934615962723, + "flos": 23410431250560.0, + "grad_norm": 2.1946772997431103, + "language_loss": 0.65960705, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.73801601, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25183105, + "step": 3808, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.06539108, + "auxiliary_loss_mlp": 0.01281064, + "balance_loss_clip": 0.06300798, + "balance_loss_mlp": 0.01259154, + "epoch": 0.22900946941229522, + "flos": 18848898829440.0, + "grad_norm": 1.7761632941249064, + "language_loss": 0.75198555, + "learning_rate": 3.598729535939222e-06, + "loss": 0.83018732, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21899414, + "step": 3809, + "time_per_iteration": 2.490895986557007 + }, + { + "auxiliary_loss_clip": 0.06533305, + "auxiliary_loss_mlp": 0.0127892, + "balance_loss_clip": 0.06299955, + "balance_loss_mlp": 0.01257331, + "epoch": 0.22906959266496318, + "flos": 22935961105920.0, + "grad_norm": 1.4656596651362013, + "language_loss": 0.82576305, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.90388525, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21606445, + "step": 3810, + "time_per_iteration": 2.5684924125671387 + }, + { + "auxiliary_loss_clip": 0.06535805, + "auxiliary_loss_mlp": 0.01278794, + "balance_loss_clip": 0.06303711, + "balance_loss_mlp": 0.01259041, + "epoch": 0.22912971591763115, + "flos": 19360614913920.0, + "grad_norm": 1.8664104481323773, + "language_loss": 0.79914212, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8772881, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19750977, + "step": 3811, + "time_per_iteration": 3.9766526222229004 + }, + { + "auxiliary_loss_clip": 0.0653518, + "auxiliary_loss_mlp": 0.01280553, + "balance_loss_clip": 0.06300636, + "balance_loss_mlp": 0.01258976, + "epoch": 0.22918983917029911, + "flos": 19938940594560.0, + "grad_norm": 1.7476175457386653, + "language_loss": 0.83391893, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.91207623, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21569824, + "step": 3812, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.0655017, + "auxiliary_loss_mlp": 0.01288002, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01264673, + "epoch": 0.22924996242296708, + "flos": 16696501873920.0, + "grad_norm": 2.3839142545709886, + "language_loss": 0.8400377, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.91841948, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2331543, + "step": 3813, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06538843, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301966, + "balance_loss_mlp": 0.01255456, + "epoch": 0.22931008567563504, + "flos": 33044457490560.0, + "grad_norm": 1.6858267943586043, + "language_loss": 0.70580167, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78395313, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20861816, + "step": 3814, + "time_per_iteration": 2.6764509677886963 + }, + { + "auxiliary_loss_clip": 0.06536946, + "auxiliary_loss_mlp": 0.01276372, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01256786, + "epoch": 0.229370208928303, + "flos": 23337322963200.0, + "grad_norm": 2.8831118113675114, + "language_loss": 0.67954975, + "learning_rate": 3.597324405965139e-06, + "loss": 0.75768292, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.19604492, + "step": 3815, + "time_per_iteration": 3.9759562015533447 + }, + { + "auxiliary_loss_clip": 0.06547147, + "auxiliary_loss_mlp": 0.01282792, + "balance_loss_clip": 0.06311129, + "balance_loss_mlp": 0.01259952, + "epoch": 0.229430332180971, + "flos": 28624068472320.0, + "grad_norm": 1.7261339214380451, + "language_loss": 0.83511633, + "learning_rate": 3.597090005586848e-06, + "loss": 0.91341567, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.22839355, + "step": 3816, + "time_per_iteration": 2.6059420108795166 + }, + { + "auxiliary_loss_clip": 0.06539545, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06303526, + "balance_loss_mlp": 0.01253302, + "epoch": 0.22949045543363897, + "flos": 17243912597760.0, + "grad_norm": 2.759151157832335, + "language_loss": 0.87850988, + "learning_rate": 3.596855544646742e-06, + "loss": 0.95666116, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22290039, + "step": 3817, + "time_per_iteration": 2.4830808639526367 + }, + { + "auxiliary_loss_clip": 0.06543944, + "auxiliary_loss_mlp": 0.01278311, + "balance_loss_clip": 0.06306894, + "balance_loss_mlp": 0.01256412, + "epoch": 0.22955057868630693, + "flos": 27496654986240.0, + "grad_norm": 1.6534336608142677, + "language_loss": 0.75343978, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.83166242, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.21899414, + "step": 3818, + "time_per_iteration": 2.634387969970703 + }, + { + "auxiliary_loss_clip": 0.06541272, + "auxiliary_loss_mlp": 0.01278617, + "balance_loss_clip": 0.06305389, + "balance_loss_mlp": 0.0125524, + "epoch": 0.2296107019389749, + "flos": 23483036413440.0, + "grad_norm": 1.7338201278327374, + "language_loss": 0.75486314, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83306205, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.23376465, + "step": 3819, + "time_per_iteration": 2.593780279159546 + }, + { + "auxiliary_loss_clip": 0.06542156, + "auxiliary_loss_mlp": 0.01283095, + "balance_loss_clip": 0.06305272, + "balance_loss_mlp": 0.01263009, + "epoch": 0.22967082519164286, + "flos": 31293212757120.0, + "grad_norm": 1.753994919034331, + "language_loss": 0.8208195, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.89907205, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20092773, + "step": 3820, + "time_per_iteration": 2.6047699451446533 + }, + { + "auxiliary_loss_clip": 0.06548945, + "auxiliary_loss_mlp": 0.0128207, + "balance_loss_clip": 0.06306617, + "balance_loss_mlp": 0.0125892, + "epoch": 0.22973094844431083, + "flos": 14647415402880.0, + "grad_norm": 4.329935521611207, + "language_loss": 0.70069146, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77900159, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23156738, + "step": 3821, + "time_per_iteration": 2.479454517364502 + }, + { + "auxiliary_loss_clip": 0.06540461, + "auxiliary_loss_mlp": 0.01284444, + "balance_loss_clip": 0.06305948, + "balance_loss_mlp": 0.0126177, + "epoch": 0.2297910716969788, + "flos": 22831057393920.0, + "grad_norm": 2.1026243527938897, + "language_loss": 0.83607674, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.91432583, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22668457, + "step": 3822, + "time_per_iteration": 2.6070644855499268 + }, + { + "auxiliary_loss_clip": 0.06532617, + "auxiliary_loss_mlp": 0.01279894, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01256637, + "epoch": 0.2298511949496468, + "flos": 23045644500480.0, + "grad_norm": 1.4679532921797136, + "language_loss": 0.66860032, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74672538, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23266602, + "step": 3823, + "time_per_iteration": 2.5421886444091797 + }, + { + "auxiliary_loss_clip": 0.06414426, + "auxiliary_loss_mlp": 0.01282472, + "balance_loss_clip": 0.062925, + "balance_loss_mlp": 0.01277524, + "epoch": 0.22991131820231475, + "flos": 66910296228480.0, + "grad_norm": 0.7674542175482253, + "language_loss": 0.56982124, + "learning_rate": 3.595212623082357e-06, + "loss": 0.64679027, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.04943848, + "step": 3824, + "time_per_iteration": 3.2466728687286377 + }, + { + "auxiliary_loss_clip": 0.06530097, + "auxiliary_loss_mlp": 0.0127961, + "balance_loss_clip": 0.06299412, + "balance_loss_mlp": 0.01258975, + "epoch": 0.22997144145498272, + "flos": 17891782767360.0, + "grad_norm": 2.0818696062092643, + "language_loss": 0.73658061, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81467766, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2064209, + "step": 3825, + "time_per_iteration": 2.4705512523651123 + }, + { + "auxiliary_loss_clip": 0.06534772, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06299614, + "balance_loss_mlp": 0.01257432, + "epoch": 0.23003156470765068, + "flos": 24683055062400.0, + "grad_norm": 2.356013632504241, + "language_loss": 0.88289648, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96104205, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22351074, + "step": 3826, + "time_per_iteration": 2.5636119842529297 + }, + { + "auxiliary_loss_clip": 0.06540347, + "auxiliary_loss_mlp": 0.01282145, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0125897, + "epoch": 0.23009168796031865, + "flos": 15819412060800.0, + "grad_norm": 2.476820030154751, + "language_loss": 0.81866372, + "learning_rate": 3.594507606303083e-06, + "loss": 0.89688861, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.23181152, + "step": 3827, + "time_per_iteration": 2.4817094802856445 + }, + { + "auxiliary_loss_clip": 0.06527712, + "auxiliary_loss_mlp": 0.01278643, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2301518112129866, + "flos": 16217755171200.0, + "grad_norm": 1.7308897820243296, + "language_loss": 0.87303799, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95110154, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21716309, + "step": 3828, + "time_per_iteration": 2.517916202545166 + }, + { + "auxiliary_loss_clip": 0.06537049, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06300969, + "balance_loss_mlp": 0.01260686, + "epoch": 0.2302119344656546, + "flos": 20601820644480.0, + "grad_norm": 2.1621841127041668, + "language_loss": 0.71223086, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79042029, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21191406, + "step": 3829, + "time_per_iteration": 2.5232293605804443 + }, + { + "auxiliary_loss_clip": 0.06527743, + "auxiliary_loss_mlp": 0.01278561, + "balance_loss_clip": 0.06299868, + "balance_loss_mlp": 0.01258629, + "epoch": 0.23027205771832257, + "flos": 26804117790720.0, + "grad_norm": 1.5730479724984117, + "language_loss": 0.84944689, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19934082, + "step": 3830, + "time_per_iteration": 2.6153595447540283 + }, + { + "auxiliary_loss_clip": 0.0653088, + "auxiliary_loss_mlp": 0.01278488, + "balance_loss_clip": 0.06299009, + "balance_loss_mlp": 0.01256863, + "epoch": 0.23033218097099054, + "flos": 43883365916160.0, + "grad_norm": 2.1076872960056834, + "language_loss": 0.67121679, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.74931049, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21630859, + "step": 3831, + "time_per_iteration": 2.7302401065826416 + }, + { + "auxiliary_loss_clip": 0.06528492, + "auxiliary_loss_mlp": 0.0127826, + "balance_loss_clip": 0.06295311, + "balance_loss_mlp": 0.01255944, + "epoch": 0.2303923042236585, + "flos": 26074837779840.0, + "grad_norm": 2.0679638399971525, + "language_loss": 0.7580992, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.83616674, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2232666, + "step": 3832, + "time_per_iteration": 2.5789363384246826 + }, + { + "auxiliary_loss_clip": 0.06538022, + "auxiliary_loss_mlp": 0.01277154, + "balance_loss_clip": 0.06301656, + "balance_loss_mlp": 0.01254731, + "epoch": 0.23045242747632647, + "flos": 18302284719360.0, + "grad_norm": 1.9809188001289737, + "language_loss": 0.88229948, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96045125, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 3833, + "time_per_iteration": 2.4890406131744385 + }, + { + "auxiliary_loss_clip": 0.06526786, + "auxiliary_loss_mlp": 0.01275622, + "balance_loss_clip": 0.06291149, + "balance_loss_mlp": 0.01253295, + "epoch": 0.23051255072899443, + "flos": 25527636691200.0, + "grad_norm": 1.751792699614105, + "language_loss": 0.75447762, + "learning_rate": 3.592860451331624e-06, + "loss": 0.83250165, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2232666, + "step": 3834, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.06528607, + "auxiliary_loss_mlp": 0.0128462, + "balance_loss_clip": 0.06295913, + "balance_loss_mlp": 0.01262089, + "epoch": 0.2305726739816624, + "flos": 21221584968960.0, + "grad_norm": 2.065687600185831, + "language_loss": 0.86859775, + "learning_rate": 3.592624901801432e-06, + "loss": 0.94673002, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2253418, + "step": 3835, + "time_per_iteration": 2.5243782997131348 + }, + { + "auxiliary_loss_clip": 0.06531255, + "auxiliary_loss_mlp": 0.01277066, + "balance_loss_clip": 0.06292518, + "balance_loss_mlp": 0.01255489, + "epoch": 0.2306327972343304, + "flos": 23337826087680.0, + "grad_norm": 2.699164056519065, + "language_loss": 0.8346436, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.91272676, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21594238, + "step": 3836, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06530184, + "auxiliary_loss_mlp": 0.01278505, + "balance_loss_clip": 0.0629724, + "balance_loss_mlp": 0.01257918, + "epoch": 0.23069292048699835, + "flos": 20672832579840.0, + "grad_norm": 1.5308621387149557, + "language_loss": 0.80123997, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.87932694, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20581055, + "step": 3837, + "time_per_iteration": 2.5265891551971436 + }, + { + "auxiliary_loss_clip": 0.06398934, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06276935, + "balance_loss_mlp": 0.01257871, + "epoch": 0.23075304373966632, + "flos": 70472854673280.0, + "grad_norm": 0.8661269137999401, + "language_loss": 0.65425092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.73087507, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05606079, + "step": 3838, + "time_per_iteration": 3.0690691471099854 + }, + { + "auxiliary_loss_clip": 0.06529964, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 0.0629662, + "balance_loss_mlp": 0.01260592, + "epoch": 0.23081316699233428, + "flos": 16623603221760.0, + "grad_norm": 1.9712307402798914, + "language_loss": 0.76919234, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84731126, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21337891, + "step": 3839, + "time_per_iteration": 2.507899522781372 + }, + { + "auxiliary_loss_clip": 0.06539556, + "auxiliary_loss_mlp": 0.01283771, + "balance_loss_clip": 0.06303147, + "balance_loss_mlp": 0.01261873, + "epoch": 0.23087329024500225, + "flos": 13303192677120.0, + "grad_norm": 1.9535711626830803, + "language_loss": 0.6973604, + "learning_rate": 3.591446248441752e-06, + "loss": 0.77559364, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21899414, + "step": 3840, + "time_per_iteration": 2.507403612136841 + }, + { + "auxiliary_loss_clip": 0.06524121, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01261994, + "epoch": 0.23093341349767021, + "flos": 17791574883840.0, + "grad_norm": 2.1010490795203967, + "language_loss": 0.79679501, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87487352, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21740723, + "step": 3841, + "time_per_iteration": 2.542506456375122 + }, + { + "auxiliary_loss_clip": 0.06525128, + "auxiliary_loss_mlp": 0.0128577, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.0126591, + "epoch": 0.23099353675033818, + "flos": 23994920206080.0, + "grad_norm": 2.202794692504719, + "language_loss": 0.83472121, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91283023, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.19873047, + "step": 3842, + "time_per_iteration": 2.5885045528411865 + }, + { + "auxiliary_loss_clip": 0.06525495, + "auxiliary_loss_mlp": 0.01277864, + "balance_loss_clip": 0.06294134, + "balance_loss_mlp": 0.01256251, + "epoch": 0.23105366000300617, + "flos": 36004567478400.0, + "grad_norm": 1.5198018897685672, + "language_loss": 0.66582537, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.74385899, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.21630859, + "step": 3843, + "time_per_iteration": 2.7418570518493652 + }, + { + "auxiliary_loss_clip": 0.06517389, + "auxiliary_loss_mlp": 0.01282302, + "balance_loss_clip": 0.06289946, + "balance_loss_mlp": 0.01261667, + "epoch": 0.23111378325567414, + "flos": 31252822289280.0, + "grad_norm": 2.0273673860648613, + "language_loss": 0.77953953, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85753644, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2064209, + "step": 3844, + "time_per_iteration": 2.697105884552002 + }, + { + "auxiliary_loss_clip": 0.0652685, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.0629425, + "balance_loss_mlp": 0.01258618, + "epoch": 0.2311739065083421, + "flos": 19214230631040.0, + "grad_norm": 1.5733936305181, + "language_loss": 0.78526026, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86331779, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20275879, + "step": 3845, + "time_per_iteration": 3.9081645011901855 + }, + { + "auxiliary_loss_clip": 0.06512116, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06288872, + "balance_loss_mlp": 0.01256323, + "epoch": 0.23123402976101007, + "flos": 23365638443520.0, + "grad_norm": 2.144369954512039, + "language_loss": 0.7696318, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84750825, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.1920166, + "step": 3846, + "time_per_iteration": 2.5204334259033203 + }, + { + "auxiliary_loss_clip": 0.06530652, + "auxiliary_loss_mlp": 0.01280785, + "balance_loss_clip": 0.06296441, + "balance_loss_mlp": 0.01258946, + "epoch": 0.23129415301367803, + "flos": 13740458808960.0, + "grad_norm": 2.058546116129278, + "language_loss": 0.70736533, + "learning_rate": 3.589793599381304e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21862793, + "step": 3847, + "time_per_iteration": 3.955061197280884 + }, + { + "auxiliary_loss_clip": 0.06395237, + "auxiliary_loss_mlp": 0.01270099, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01264553, + "epoch": 0.231354276266346, + "flos": 69756907461120.0, + "grad_norm": 0.7764718422559022, + "language_loss": 0.60909712, + "learning_rate": 3.589557265446198e-06, + "loss": 0.68575048, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.05557251, + "step": 3848, + "time_per_iteration": 3.0406246185302734 + }, + { + "auxiliary_loss_clip": 0.0652846, + "auxiliary_loss_mlp": 0.0128118, + "balance_loss_clip": 0.06295802, + "balance_loss_mlp": 0.01259925, + "epoch": 0.231414399519014, + "flos": 18840597275520.0, + "grad_norm": 2.051565204924659, + "language_loss": 0.79345453, + "learning_rate": 3.589320871234923e-06, + "loss": 0.87155092, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21252441, + "step": 3849, + "time_per_iteration": 2.508357048034668 + }, + { + "auxiliary_loss_clip": 0.06525768, + "auxiliary_loss_mlp": 0.01279584, + "balance_loss_clip": 0.06294318, + "balance_loss_mlp": 0.01257995, + "epoch": 0.23147452277168196, + "flos": 36143949945600.0, + "grad_norm": 1.9799304996672493, + "language_loss": 0.72033536, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7983889, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.21594238, + "step": 3850, + "time_per_iteration": 2.6283209323883057 + }, + { + "auxiliary_loss_clip": 0.06522007, + "auxiliary_loss_mlp": 0.012814, + "balance_loss_clip": 0.06293751, + "balance_loss_mlp": 0.01260562, + "epoch": 0.23153464602434992, + "flos": 20819091081600.0, + "grad_norm": 2.1585980033328216, + "language_loss": 0.76770389, + "learning_rate": 3.588847902019718e-06, + "loss": 0.84573799, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20825195, + "step": 3851, + "time_per_iteration": 3.9542527198791504 + }, + { + "auxiliary_loss_clip": 0.06522575, + "auxiliary_loss_mlp": 0.01285563, + "balance_loss_clip": 0.06294242, + "balance_loss_mlp": 0.01264367, + "epoch": 0.2315947692770179, + "flos": 19945606993920.0, + "grad_norm": 4.396515099862161, + "language_loss": 0.70780337, + "learning_rate": 3.588611327033723e-06, + "loss": 0.78588474, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21191406, + "step": 3852, + "time_per_iteration": 2.5292365550994873 + }, + { + "auxiliary_loss_clip": 0.06530476, + "auxiliary_loss_mlp": 0.01287483, + "balance_loss_clip": 0.0629744, + "balance_loss_mlp": 0.01267027, + "epoch": 0.23165489252968585, + "flos": 12859805197440.0, + "grad_norm": 2.0519661349019906, + "language_loss": 0.68142366, + "learning_rate": 3.588374691807428e-06, + "loss": 0.75960326, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20471191, + "step": 3853, + "time_per_iteration": 2.524214267730713 + }, + { + "auxiliary_loss_clip": 0.06532255, + "auxiliary_loss_mlp": 0.0127975, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.01258579, + "epoch": 0.23171501578235382, + "flos": 30636202492800.0, + "grad_norm": 2.067759569090495, + "language_loss": 0.80620718, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88432729, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21166992, + "step": 3854, + "time_per_iteration": 3.9913628101348877 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.0128392, + "balance_loss_clip": 0.06299743, + "balance_loss_mlp": 0.0126201, + "epoch": 0.23177513903502178, + "flos": 23849709880320.0, + "grad_norm": 1.9679065377847755, + "language_loss": 0.66096866, + "learning_rate": 3.587901240669831e-06, + "loss": 0.73921382, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.21899414, + "step": 3855, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.06526054, + "auxiliary_loss_mlp": 0.0129156, + "balance_loss_clip": 0.06295231, + "balance_loss_mlp": 0.0126972, + "epoch": 0.23183526228768978, + "flos": 29578040006400.0, + "grad_norm": 1.903884891832667, + "language_loss": 0.71179903, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.78997517, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21838379, + "step": 3856, + "time_per_iteration": 2.602130174636841 + }, + { + "auxiliary_loss_clip": 0.06526691, + "auxiliary_loss_mlp": 0.01281572, + "balance_loss_clip": 0.06295416, + "balance_loss_mlp": 0.01261032, + "epoch": 0.23189538554035774, + "flos": 34467155164800.0, + "grad_norm": 1.5724941960823864, + "language_loss": 0.77830631, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85638893, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20532227, + "step": 3857, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.06534412, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06299518, + "balance_loss_mlp": 0.01259813, + "epoch": 0.2319555087930257, + "flos": 18009558080640.0, + "grad_norm": 2.2572913357008804, + "language_loss": 0.91563249, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99379921, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2244873, + "step": 3858, + "time_per_iteration": 2.532270908355713 + }, + { + "auxiliary_loss_clip": 0.06524485, + "auxiliary_loss_mlp": 0.01281992, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01261833, + "epoch": 0.23201563204569367, + "flos": 23149709671680.0, + "grad_norm": 2.204043049012761, + "language_loss": 0.77328205, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85134679, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20153809, + "step": 3859, + "time_per_iteration": 2.539982318878174 + }, + { + "auxiliary_loss_clip": 0.06526206, + "auxiliary_loss_mlp": 0.01282174, + "balance_loss_clip": 0.0629694, + "balance_loss_mlp": 0.01261098, + "epoch": 0.23207575529836164, + "flos": 20674300026240.0, + "grad_norm": 1.845949683873727, + "language_loss": 0.84980345, + "learning_rate": 3.58671655924898e-06, + "loss": 0.9278872, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21057129, + "step": 3860, + "time_per_iteration": 2.5464277267456055 + }, + { + "auxiliary_loss_clip": 0.06522566, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 0.06296555, + "balance_loss_mlp": 0.01254927, + "epoch": 0.2321358785510296, + "flos": 16477805917440.0, + "grad_norm": 2.2860023761203423, + "language_loss": 0.83316106, + "learning_rate": 3.586479442423508e-06, + "loss": 0.91114187, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.20581055, + "step": 3861, + "time_per_iteration": 2.611527681350708 + }, + { + "auxiliary_loss_clip": 0.06526297, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06296666, + "balance_loss_mlp": 0.01261198, + "epoch": 0.2321960018036976, + "flos": 21622737191040.0, + "grad_norm": 1.932164160561112, + "language_loss": 0.86100018, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93908012, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2052002, + "step": 3862, + "time_per_iteration": 2.599078893661499 + }, + { + "auxiliary_loss_clip": 0.06517789, + "auxiliary_loss_mlp": 0.01277863, + "balance_loss_clip": 0.0629621, + "balance_loss_mlp": 0.0125898, + "epoch": 0.23225612505636556, + "flos": 22277734957440.0, + "grad_norm": 1.8279700206037066, + "language_loss": 0.75524014, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.83319664, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18884277, + "step": 3863, + "time_per_iteration": 2.5592801570892334 + }, + { + "auxiliary_loss_clip": 0.06518993, + "auxiliary_loss_mlp": 0.01279608, + "balance_loss_clip": 0.06295245, + "balance_loss_mlp": 0.01260237, + "epoch": 0.23231624830903352, + "flos": 17057431336320.0, + "grad_norm": 1.8656538002376628, + "language_loss": 0.7504397, + "learning_rate": 3.58576773102631e-06, + "loss": 0.82842577, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.19372559, + "step": 3864, + "time_per_iteration": 2.549480438232422 + }, + { + "auxiliary_loss_clip": 0.06521947, + "auxiliary_loss_mlp": 0.01276148, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255572, + "epoch": 0.2323763715617015, + "flos": 34648353619200.0, + "grad_norm": 2.1960138476201023, + "language_loss": 0.70505309, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.78303403, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20568848, + "step": 3865, + "time_per_iteration": 2.6358752250671387 + }, + { + "auxiliary_loss_clip": 0.06539118, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06299968, + "balance_loss_mlp": 0.01256464, + "epoch": 0.23243649481436945, + "flos": 25557922742400.0, + "grad_norm": 1.8533317501805489, + "language_loss": 0.95648015, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.03467083, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23510742, + "step": 3866, + "time_per_iteration": 2.5805771350860596 + }, + { + "auxiliary_loss_clip": 0.06523386, + "auxiliary_loss_mlp": 0.0128215, + "balance_loss_clip": 0.06294955, + "balance_loss_mlp": 0.01260561, + "epoch": 0.23249661806703742, + "flos": 20489411992320.0, + "grad_norm": 3.3036871554572285, + "language_loss": 0.74161094, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.81966627, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21569824, + "step": 3867, + "time_per_iteration": 2.485872268676758 + }, + { + "auxiliary_loss_clip": 0.06527717, + "auxiliary_loss_mlp": 0.01278812, + "balance_loss_clip": 0.06298171, + "balance_loss_mlp": 0.01257271, + "epoch": 0.23255674131970538, + "flos": 20382956979840.0, + "grad_norm": 1.7596317335066716, + "language_loss": 0.82912898, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90719432, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2154541, + "step": 3868, + "time_per_iteration": 2.5404841899871826 + }, + { + "auxiliary_loss_clip": 0.06518516, + "auxiliary_loss_mlp": 0.01279395, + "balance_loss_clip": 0.0629604, + "balance_loss_mlp": 0.01260321, + "epoch": 0.23261686457237338, + "flos": 17061833675520.0, + "grad_norm": 1.6597028261056146, + "language_loss": 0.73686016, + "learning_rate": 3.58458034283495e-06, + "loss": 0.81483924, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.1907959, + "step": 3869, + "time_per_iteration": 2.4850685596466064 + }, + { + "auxiliary_loss_clip": 0.06524374, + "auxiliary_loss_mlp": 0.01289937, + "balance_loss_clip": 0.06296247, + "balance_loss_mlp": 0.01268241, + "epoch": 0.23267698782504134, + "flos": 29177726325120.0, + "grad_norm": 1.8030595092782438, + "language_loss": 0.8079325, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.88607562, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21716309, + "step": 3870, + "time_per_iteration": 2.5915870666503906 + }, + { + "auxiliary_loss_clip": 0.06532744, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.0126178, + "epoch": 0.2327371110777093, + "flos": 21180355960320.0, + "grad_norm": 1.9640097574691695, + "language_loss": 0.71693742, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.79509664, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21411133, + "step": 3871, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.065286, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 0.06295659, + "balance_loss_mlp": 0.01260034, + "epoch": 0.23279723433037727, + "flos": 24869997521280.0, + "grad_norm": 2.5352867939179484, + "language_loss": 0.69289309, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.77098656, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20715332, + "step": 3872, + "time_per_iteration": 2.5636072158813477 + }, + { + "auxiliary_loss_clip": 0.06535204, + "auxiliary_loss_mlp": 0.01285984, + "balance_loss_clip": 0.06299452, + "balance_loss_mlp": 0.01263894, + "epoch": 0.23285735758304524, + "flos": 38809823921280.0, + "grad_norm": 2.0709139139802497, + "language_loss": 0.78303361, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.86124545, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.22094727, + "step": 3873, + "time_per_iteration": 2.671551465988159 + }, + { + "auxiliary_loss_clip": 0.06419215, + "auxiliary_loss_mlp": 0.01286246, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01280601, + "epoch": 0.2329174808357132, + "flos": 53962274280960.0, + "grad_norm": 0.8377063316545934, + "language_loss": 0.60286367, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.67991829, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05636597, + "step": 3874, + "time_per_iteration": 3.087822675704956 + }, + { + "auxiliary_loss_clip": 0.06525364, + "auxiliary_loss_mlp": 0.01281697, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.012608, + "epoch": 0.23297760408838117, + "flos": 21222549290880.0, + "grad_norm": 2.3064833177652773, + "language_loss": 0.81324208, + "learning_rate": 3.583153494218927e-06, + "loss": 0.89131272, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.20898438, + "step": 3875, + "time_per_iteration": 2.560511589050293 + }, + { + "auxiliary_loss_clip": 0.06520373, + "auxiliary_loss_mlp": 0.01275593, + "balance_loss_clip": 0.06294609, + "balance_loss_mlp": 0.01255983, + "epoch": 0.23303772734104916, + "flos": 28410613395840.0, + "grad_norm": 2.285945976693144, + "language_loss": 0.62077069, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.69873035, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19628906, + "step": 3876, + "time_per_iteration": 2.63901948928833 + }, + { + "auxiliary_loss_clip": 0.06525883, + "auxiliary_loss_mlp": 0.01277799, + "balance_loss_clip": 0.06296121, + "balance_loss_mlp": 0.01258034, + "epoch": 0.23309785059371713, + "flos": 24321328986240.0, + "grad_norm": 1.9984006432494335, + "language_loss": 0.71087664, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.78891349, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19787598, + "step": 3877, + "time_per_iteration": 2.533858299255371 + }, + { + "auxiliary_loss_clip": 0.06524412, + "auxiliary_loss_mlp": 0.01274037, + "balance_loss_clip": 0.06297307, + "balance_loss_mlp": 0.01253485, + "epoch": 0.2331579738463851, + "flos": 15997633695360.0, + "grad_norm": 2.4085120625047143, + "language_loss": 0.81286502, + "learning_rate": 3.582439259339073e-06, + "loss": 0.89084947, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20556641, + "step": 3878, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06534204, + "auxiliary_loss_mlp": 0.01280932, + "balance_loss_clip": 0.06299698, + "balance_loss_mlp": 0.0126013, + "epoch": 0.23321809709905306, + "flos": 36435418773120.0, + "grad_norm": 2.3738521781051207, + "language_loss": 0.75046253, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82861388, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20788574, + "step": 3879, + "time_per_iteration": 2.6389944553375244 + }, + { + "auxiliary_loss_clip": 0.06528227, + "auxiliary_loss_mlp": 0.01279465, + "balance_loss_clip": 0.06299725, + "balance_loss_mlp": 0.01257972, + "epoch": 0.23327822035172102, + "flos": 21331184509440.0, + "grad_norm": 4.081669167605711, + "language_loss": 0.90526301, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.98333991, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.21496582, + "step": 3880, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.06530303, + "auxiliary_loss_mlp": 0.01278258, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125841, + "epoch": 0.233338343604389, + "flos": 19177907086080.0, + "grad_norm": 1.8856968798779488, + "language_loss": 0.72716117, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80524671, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.19848633, + "step": 3881, + "time_per_iteration": 2.528083324432373 + }, + { + "auxiliary_loss_clip": 0.0653114, + "auxiliary_loss_mlp": 0.01278184, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0125805, + "epoch": 0.23339846685705698, + "flos": 26915939464320.0, + "grad_norm": 1.6578041146422486, + "language_loss": 0.68699455, + "learning_rate": 3.581486106120537e-06, + "loss": 0.76508778, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20129395, + "step": 3882, + "time_per_iteration": 2.575275182723999 + }, + { + "auxiliary_loss_clip": 0.06529698, + "auxiliary_loss_mlp": 0.0127867, + "balance_loss_clip": 0.0629693, + "balance_loss_mlp": 0.01258226, + "epoch": 0.23345859010972494, + "flos": 32351375243520.0, + "grad_norm": 2.0584115637368767, + "language_loss": 0.77458596, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.8526696, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 3883, + "time_per_iteration": 2.626533269882202 + }, + { + "auxiliary_loss_clip": 0.06405331, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01257663, + "epoch": 0.2335187133623929, + "flos": 58505805273600.0, + "grad_norm": 0.7704933603606158, + "language_loss": 0.59193355, + "learning_rate": 3.58100916965445e-06, + "loss": 0.66861278, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.04925537, + "step": 3884, + "time_per_iteration": 4.6365087032318115 + }, + { + "auxiliary_loss_clip": 0.06533933, + "auxiliary_loss_mlp": 0.01280044, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01260017, + "epoch": 0.23357883661506088, + "flos": 24509822745600.0, + "grad_norm": 1.6610169782824564, + "language_loss": 0.80755335, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.88569313, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.20031738, + "step": 3885, + "time_per_iteration": 2.6180286407470703 + }, + { + "auxiliary_loss_clip": 0.06523974, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 0.06296945, + "balance_loss_mlp": 0.01265687, + "epoch": 0.23363895986772884, + "flos": 18953760614400.0, + "grad_norm": 2.3207575064623613, + "language_loss": 0.88500953, + "learning_rate": 3.580531993380261e-06, + "loss": 0.96311754, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21130371, + "step": 3886, + "time_per_iteration": 2.5116477012634277 + }, + { + "auxiliary_loss_clip": 0.06532702, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 0.06302926, + "balance_loss_mlp": 0.01262518, + "epoch": 0.2336990831203968, + "flos": 31694993884800.0, + "grad_norm": 1.8877154320423692, + "language_loss": 0.74203557, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.82019114, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20336914, + "step": 3887, + "time_per_iteration": 4.024793863296509 + }, + { + "auxiliary_loss_clip": 0.06531121, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01261206, + "epoch": 0.23375920637306477, + "flos": 27717237659520.0, + "grad_norm": 1.8176198265631485, + "language_loss": 0.84478307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.92290735, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20092773, + "step": 3888, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01283639, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01263934, + "epoch": 0.23381932962573276, + "flos": 17681346437760.0, + "grad_norm": 2.056965631559896, + "language_loss": 0.88319886, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.96128076, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19689941, + "step": 3889, + "time_per_iteration": 2.524937152862549 + }, + { + "auxiliary_loss_clip": 0.06524722, + "auxiliary_loss_mlp": 0.01282198, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01262708, + "epoch": 0.23387945287840073, + "flos": 14395833918720.0, + "grad_norm": 2.5361674913720487, + "language_loss": 0.7777229, + "learning_rate": 3.579576921697125e-06, + "loss": 0.85579211, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19470215, + "step": 3890, + "time_per_iteration": 4.02982497215271 + }, + { + "auxiliary_loss_clip": 0.06526545, + "auxiliary_loss_mlp": 0.01284178, + "balance_loss_clip": 0.06297928, + "balance_loss_mlp": 0.01264008, + "epoch": 0.2339395761310687, + "flos": 46108451888640.0, + "grad_norm": 1.897831891943022, + "language_loss": 0.74213481, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82024205, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20166016, + "step": 3891, + "time_per_iteration": 2.7951042652130127 + }, + { + "auxiliary_loss_clip": 0.06524959, + "auxiliary_loss_mlp": 0.01281513, + "balance_loss_clip": 0.06301059, + "balance_loss_mlp": 0.01262821, + "epoch": 0.23399969938373666, + "flos": 22388508455040.0, + "grad_norm": 1.6273389699862264, + "language_loss": 0.82863498, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.90669972, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18688965, + "step": 3892, + "time_per_iteration": 2.530782461166382 + }, + { + "auxiliary_loss_clip": 0.06531358, + "auxiliary_loss_mlp": 0.01281181, + "balance_loss_clip": 0.06301633, + "balance_loss_mlp": 0.01260951, + "epoch": 0.23405982263640462, + "flos": 43518746874240.0, + "grad_norm": 1.4575042253356143, + "language_loss": 0.65593249, + "learning_rate": 3.578859988977082e-06, + "loss": 0.7340579, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20227051, + "step": 3893, + "time_per_iteration": 4.212572813034058 + }, + { + "auxiliary_loss_clip": 0.06519544, + "auxiliary_loss_mlp": 0.01283369, + "balance_loss_clip": 0.06297972, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2341199458890726, + "flos": 22571216282880.0, + "grad_norm": 2.0084649252152564, + "language_loss": 0.79620147, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.87423062, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.20117188, + "step": 3894, + "time_per_iteration": 2.580109119415283 + }, + { + "auxiliary_loss_clip": 0.06524212, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 0.06300013, + "balance_loss_mlp": 0.01257763, + "epoch": 0.23418006914174055, + "flos": 25641764352000.0, + "grad_norm": 1.5130292757453454, + "language_loss": 0.82681906, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.90482563, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18664551, + "step": 3895, + "time_per_iteration": 2.583759069442749 + }, + { + "auxiliary_loss_clip": 0.06520028, + "auxiliary_loss_mlp": 0.01278233, + "balance_loss_clip": 0.06295593, + "balance_loss_mlp": 0.0125885, + "epoch": 0.23424019239440855, + "flos": 13549826770560.0, + "grad_norm": 2.4592405022159496, + "language_loss": 0.81334293, + "learning_rate": 3.578142517422292e-06, + "loss": 0.89132559, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.19372559, + "step": 3896, + "time_per_iteration": 2.536252021789551 + }, + { + "auxiliary_loss_clip": 0.06530771, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06299435, + "balance_loss_mlp": 0.01264253, + "epoch": 0.2343003156470765, + "flos": 22426131738240.0, + "grad_norm": 3.0940729647414598, + "language_loss": 0.83988011, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91805482, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 3897, + "time_per_iteration": 2.572230577468872 + }, + { + "auxiliary_loss_clip": 0.06528857, + "auxiliary_loss_mlp": 0.01279177, + "balance_loss_clip": 0.06296414, + "balance_loss_mlp": 0.01258626, + "epoch": 0.23436043889974448, + "flos": 14795644475520.0, + "grad_norm": 2.317273344502078, + "language_loss": 0.79819012, + "learning_rate": 3.577663903820705e-06, + "loss": 0.87627041, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20544434, + "step": 3898, + "time_per_iteration": 2.5207583904266357 + }, + { + "auxiliary_loss_clip": 0.0651897, + "auxiliary_loss_mlp": 0.01278878, + "balance_loss_clip": 0.06297988, + "balance_loss_mlp": 0.0126021, + "epoch": 0.23442056215241244, + "flos": 22972242723840.0, + "grad_norm": 1.88849810547605, + "language_loss": 0.7476474, + "learning_rate": 3.577424507277614e-06, + "loss": 0.82562584, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18676758, + "step": 3899, + "time_per_iteration": 2.535256862640381 + }, + { + "auxiliary_loss_clip": 0.06525272, + "auxiliary_loss_mlp": 0.01280019, + "balance_loss_clip": 0.06296974, + "balance_loss_mlp": 0.01259515, + "epoch": 0.2344806854050804, + "flos": 23077901122560.0, + "grad_norm": 1.7218865416029, + "language_loss": 0.75599915, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.83405209, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20507812, + "step": 3900, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.06524841, + "auxiliary_loss_mlp": 0.01281356, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01260959, + "epoch": 0.23454080865774837, + "flos": 16332805226880.0, + "grad_norm": 2.155964713283421, + "language_loss": 0.67468774, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75274968, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20410156, + "step": 3901, + "time_per_iteration": 2.536736249923706 + }, + { + "auxiliary_loss_clip": 0.06415819, + "auxiliary_loss_mlp": 0.01256149, + "balance_loss_clip": 0.06299057, + "balance_loss_mlp": 0.01251181, + "epoch": 0.23460093191041637, + "flos": 67779545685120.0, + "grad_norm": 0.7514179301091559, + "language_loss": 0.58278525, + "learning_rate": 3.576705958788091e-06, + "loss": 0.65950489, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.0496521, + "step": 3902, + "time_per_iteration": 3.134718894958496 + }, + { + "auxiliary_loss_clip": 0.06519462, + "auxiliary_loss_mlp": 0.01278211, + "balance_loss_clip": 0.06292997, + "balance_loss_mlp": 0.01258375, + "epoch": 0.23466105516308433, + "flos": 20082725400960.0, + "grad_norm": 4.781089560028637, + "language_loss": 0.80931306, + "learning_rate": 3.576466323035108e-06, + "loss": 0.88728976, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19836426, + "step": 3903, + "time_per_iteration": 2.525059938430786 + }, + { + "auxiliary_loss_clip": 0.06522641, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06295069, + "balance_loss_mlp": 0.01258955, + "epoch": 0.2347211784157523, + "flos": 24542708273280.0, + "grad_norm": 1.8578223556950417, + "language_loss": 0.82988703, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90790236, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.19909668, + "step": 3904, + "time_per_iteration": 2.5903875827789307 + }, + { + "auxiliary_loss_clip": 0.0652332, + "auxiliary_loss_mlp": 0.01285911, + "balance_loss_clip": 0.06295672, + "balance_loss_mlp": 0.01265562, + "epoch": 0.23478130166842026, + "flos": 23811751180800.0, + "grad_norm": 1.985666710181995, + "language_loss": 0.7223646, + "learning_rate": 3.57598687219895e-06, + "loss": 0.80045688, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20361328, + "step": 3905, + "time_per_iteration": 2.5441884994506836 + }, + { + "auxiliary_loss_clip": 0.06517074, + "auxiliary_loss_mlp": 0.01274876, + "balance_loss_clip": 0.06294023, + "balance_loss_mlp": 0.01255564, + "epoch": 0.23484142492108823, + "flos": 24099823918080.0, + "grad_norm": 2.433861192511871, + "language_loss": 0.71703601, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.19311523, + "step": 3906, + "time_per_iteration": 2.698309898376465 + }, + { + "auxiliary_loss_clip": 0.06533175, + "auxiliary_loss_mlp": 0.01285298, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01264341, + "epoch": 0.2349015481737562, + "flos": 29103486007680.0, + "grad_norm": 2.7858195598302014, + "language_loss": 0.74089986, + "learning_rate": 3.575507182316473e-06, + "loss": 0.81908458, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20959473, + "step": 3907, + "time_per_iteration": 2.578900098800659 + }, + { + "auxiliary_loss_clip": 0.06524273, + "auxiliary_loss_mlp": 0.01280946, + "balance_loss_clip": 0.06294693, + "balance_loss_mlp": 0.01260418, + "epoch": 0.23496167142642416, + "flos": 18922258679040.0, + "grad_norm": 2.1308722973133385, + "language_loss": 0.73705935, + "learning_rate": 3.575267247755601e-06, + "loss": 0.81511152, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2052002, + "step": 3908, + "time_per_iteration": 2.599888801574707 + }, + { + "auxiliary_loss_clip": 0.06415461, + "auxiliary_loss_mlp": 0.01265268, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01259901, + "epoch": 0.23502179467909215, + "flos": 55884906541440.0, + "grad_norm": 1.2475277524680826, + "language_loss": 0.73364127, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.81044865, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05374146, + "step": 3909, + "time_per_iteration": 2.9221227169036865 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 0.06297419, + "balance_loss_mlp": 0.01265013, + "epoch": 0.23508191793176011, + "flos": 23408083336320.0, + "grad_norm": 1.6005271399570604, + "language_loss": 0.88581395, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9639076, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20288086, + "step": 3910, + "time_per_iteration": 2.571974277496338 + }, + { + "auxiliary_loss_clip": 0.06520193, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01258658, + "epoch": 0.23514204118442808, + "flos": 20053864869120.0, + "grad_norm": 1.9643755437340527, + "language_loss": 0.76589572, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.84388608, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2019043, + "step": 3911, + "time_per_iteration": 2.5159506797790527 + }, + { + "auxiliary_loss_clip": 0.06514487, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 0.06293596, + "balance_loss_mlp": 0.01272568, + "epoch": 0.23520216443709605, + "flos": 21587126405760.0, + "grad_norm": 1.5390832092388007, + "language_loss": 0.82200038, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.90005672, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 3912, + "time_per_iteration": 2.53330135345459 + }, + { + "auxiliary_loss_clip": 0.06515642, + "auxiliary_loss_mlp": 0.01288785, + "balance_loss_clip": 0.06294793, + "balance_loss_mlp": 0.01269604, + "epoch": 0.235262287689764, + "flos": 23192573834880.0, + "grad_norm": 1.8330232089961167, + "language_loss": 0.72023201, + "learning_rate": 3.574066679118909e-06, + "loss": 0.79827625, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19177246, + "step": 3913, + "time_per_iteration": 2.5643818378448486 + }, + { + "auxiliary_loss_clip": 0.06528541, + "auxiliary_loss_mlp": 0.01277731, + "balance_loss_clip": 0.0629672, + "balance_loss_mlp": 0.01257238, + "epoch": 0.23532241094243198, + "flos": 23191903002240.0, + "grad_norm": 1.784539383466316, + "language_loss": 0.76976919, + "learning_rate": 3.57382638628884e-06, + "loss": 0.84783185, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20483398, + "step": 3914, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.06525879, + "auxiliary_loss_mlp": 0.01279953, + "balance_loss_clip": 0.06294835, + "balance_loss_mlp": 0.01259759, + "epoch": 0.23538253419509997, + "flos": 17025007006080.0, + "grad_norm": 2.4875564397369745, + "language_loss": 0.90170735, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.97976559, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2019043, + "step": 3915, + "time_per_iteration": 2.563430070877075 + }, + { + "auxiliary_loss_clip": 0.06418007, + "auxiliary_loss_mlp": 0.01258116, + "balance_loss_clip": 0.06301998, + "balance_loss_mlp": 0.0125336, + "epoch": 0.23544265744776793, + "flos": 63465276263040.0, + "grad_norm": 0.7933859009920101, + "language_loss": 0.59378946, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6705507, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04748535, + "step": 3916, + "time_per_iteration": 3.0965490341186523 + }, + { + "auxiliary_loss_clip": 0.06410776, + "auxiliary_loss_mlp": 0.01260488, + "balance_loss_clip": 0.06295535, + "balance_loss_mlp": 0.01255756, + "epoch": 0.2355027807004359, + "flos": 70537395116160.0, + "grad_norm": 0.7426668339088592, + "language_loss": 0.49443412, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.57114673, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04724121, + "step": 3917, + "time_per_iteration": 3.180136203765869 + }, + { + "auxiliary_loss_clip": 0.06525698, + "auxiliary_loss_mlp": 0.01279416, + "balance_loss_clip": 0.06297344, + "balance_loss_mlp": 0.01259687, + "epoch": 0.23556290395310386, + "flos": 21440742122880.0, + "grad_norm": 2.189382839240281, + "language_loss": 0.77017808, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.84822929, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19714355, + "step": 3918, + "time_per_iteration": 2.546833038330078 + }, + { + "auxiliary_loss_clip": 0.0652653, + "auxiliary_loss_mlp": 0.01274201, + "balance_loss_clip": 0.06294574, + "balance_loss_mlp": 0.01254353, + "epoch": 0.23562302720577183, + "flos": 18192223981440.0, + "grad_norm": 2.402769767514051, + "language_loss": 0.70165813, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.77966547, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.1986084, + "step": 3919, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.06516096, + "auxiliary_loss_mlp": 0.01279326, + "balance_loss_clip": 0.06294449, + "balance_loss_mlp": 0.0125999, + "epoch": 0.2356831504584398, + "flos": 33739091038080.0, + "grad_norm": 1.6359966895302622, + "language_loss": 0.71094656, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.78890085, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19335938, + "step": 3920, + "time_per_iteration": 2.672703504562378 + }, + { + "auxiliary_loss_clip": 0.065192, + "auxiliary_loss_mlp": 0.0127625, + "balance_loss_clip": 0.06295229, + "balance_loss_mlp": 0.0125707, + "epoch": 0.23574327371110776, + "flos": 24939122739840.0, + "grad_norm": 1.9300596293530992, + "language_loss": 0.77833009, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85628462, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.19189453, + "step": 3921, + "time_per_iteration": 2.5823934078216553 + }, + { + "auxiliary_loss_clip": 0.06519832, + "auxiliary_loss_mlp": 0.01273471, + "balance_loss_clip": 0.06293498, + "balance_loss_mlp": 0.01254898, + "epoch": 0.23580339696377575, + "flos": 17827940355840.0, + "grad_norm": 2.282195745019935, + "language_loss": 0.76750088, + "learning_rate": 3.571901895946612e-06, + "loss": 0.84543383, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18566895, + "step": 3922, + "time_per_iteration": 2.5005834102630615 + }, + { + "auxiliary_loss_clip": 0.06518443, + "auxiliary_loss_mlp": 0.01276376, + "balance_loss_clip": 0.06292558, + "balance_loss_mlp": 0.01257255, + "epoch": 0.23586352021644372, + "flos": 26293827225600.0, + "grad_norm": 2.0102031772622277, + "language_loss": 0.80626559, + "learning_rate": 3.571661066327956e-06, + "loss": 0.88421381, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19128418, + "step": 3923, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0652013, + "auxiliary_loss_mlp": 0.01275781, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01256326, + "epoch": 0.23592364346911168, + "flos": 14251965258240.0, + "grad_norm": 1.780788070615976, + "language_loss": 0.7507394, + "learning_rate": 3.571420177111754e-06, + "loss": 0.82869852, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3924, + "time_per_iteration": 3.9297289848327637 + }, + { + "auxiliary_loss_clip": 0.06516001, + "auxiliary_loss_mlp": 0.01276934, + "balance_loss_clip": 0.06293369, + "balance_loss_mlp": 0.01258039, + "epoch": 0.23598376672177965, + "flos": 18593837400960.0, + "grad_norm": 1.7528516859224217, + "language_loss": 0.83231425, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.91024363, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 3925, + "time_per_iteration": 2.5267770290374756 + }, + { + "auxiliary_loss_clip": 0.06520985, + "auxiliary_loss_mlp": 0.01279855, + "balance_loss_clip": 0.06293195, + "balance_loss_mlp": 0.01259673, + "epoch": 0.2360438899744476, + "flos": 22682325196800.0, + "grad_norm": 1.753261892654821, + "language_loss": 0.60038519, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20178223, + "step": 3926, + "time_per_iteration": 4.023118257522583 + }, + { + "auxiliary_loss_clip": 0.06514051, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06293727, + "balance_loss_mlp": 0.01257735, + "epoch": 0.23610401322711558, + "flos": 29577872298240.0, + "grad_norm": 1.9607796947198142, + "language_loss": 0.72402066, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80192792, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1895752, + "step": 3927, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.06515504, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01256373, + "epoch": 0.23616413647978354, + "flos": 17864347754880.0, + "grad_norm": 2.08357001670468, + "language_loss": 0.75570691, + "learning_rate": 3.570456024454221e-06, + "loss": 0.83361489, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18920898, + "step": 3928, + "time_per_iteration": 2.601884365081787 + }, + { + "auxiliary_loss_clip": 0.06522287, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06293722, + "balance_loss_mlp": 0.01260338, + "epoch": 0.23622425973245154, + "flos": 11039393318400.0, + "grad_norm": 3.3378461006384788, + "language_loss": 0.82518888, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.903216, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20080566, + "step": 3929, + "time_per_iteration": 3.9035136699676514 + }, + { + "auxiliary_loss_clip": 0.0652993, + "auxiliary_loss_mlp": 0.01281554, + "balance_loss_clip": 0.06295136, + "balance_loss_mlp": 0.01261228, + "epoch": 0.2362843829851195, + "flos": 23410766666880.0, + "grad_norm": 2.0127268398029607, + "language_loss": 0.7229315, + "learning_rate": 3.569973590777789e-06, + "loss": 0.80104637, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.203125, + "step": 3930, + "time_per_iteration": 2.5537455081939697 + }, + { + "auxiliary_loss_clip": 0.06516138, + "auxiliary_loss_mlp": 0.01275778, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01257312, + "epoch": 0.23634450623778747, + "flos": 39539103932160.0, + "grad_norm": 1.8975533795335693, + "language_loss": 0.74476141, + "learning_rate": 3.569732284634665e-06, + "loss": 0.82268059, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.18444824, + "step": 3931, + "time_per_iteration": 2.6975677013397217 + }, + { + "auxiliary_loss_clip": 0.06517775, + "auxiliary_loss_mlp": 0.01279269, + "balance_loss_clip": 0.06291172, + "balance_loss_mlp": 0.01260208, + "epoch": 0.23640462949045543, + "flos": 24214077360000.0, + "grad_norm": 2.102820580807434, + "language_loss": 0.8105433, + "learning_rate": 3.569490918967136e-06, + "loss": 0.88851368, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19055176, + "step": 3932, + "time_per_iteration": 2.539280652999878 + }, + { + "auxiliary_loss_clip": 0.06510118, + "auxiliary_loss_mlp": 0.01272436, + "balance_loss_clip": 0.06289183, + "balance_loss_mlp": 0.01254949, + "epoch": 0.2364647527431234, + "flos": 26184898517760.0, + "grad_norm": 1.6370407311570319, + "language_loss": 0.85819322, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.93601882, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.17480469, + "step": 3933, + "time_per_iteration": 4.0140979290008545 + }, + { + "auxiliary_loss_clip": 0.06528582, + "auxiliary_loss_mlp": 0.01277532, + "balance_loss_clip": 0.06296912, + "balance_loss_mlp": 0.01257314, + "epoch": 0.23652487599579136, + "flos": 22643444102400.0, + "grad_norm": 3.233125821654351, + "language_loss": 0.83709848, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.91515964, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.20214844, + "step": 3934, + "time_per_iteration": 2.542692184448242 + }, + { + "auxiliary_loss_clip": 0.06519171, + "auxiliary_loss_mlp": 0.01281493, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01262896, + "epoch": 0.23658499924845935, + "flos": 21768702203520.0, + "grad_norm": 1.7174434370199074, + "language_loss": 0.7898351, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.86784172, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.18615723, + "step": 3935, + "time_per_iteration": 2.5311288833618164 + }, + { + "auxiliary_loss_clip": 0.0651848, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06296465, + "balance_loss_mlp": 0.01258533, + "epoch": 0.23664512250112732, + "flos": 21805486945920.0, + "grad_norm": 1.7511193987533888, + "language_loss": 0.80239666, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.88034987, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1829834, + "step": 3936, + "time_per_iteration": 2.5497477054595947 + }, + { + "auxiliary_loss_clip": 0.06513149, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06288509, + "balance_loss_mlp": 0.01256593, + "epoch": 0.23670524575379528, + "flos": 22644450351360.0, + "grad_norm": 1.4782770271817958, + "language_loss": 0.79820013, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8760916, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19396973, + "step": 3937, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.06515164, + "auxiliary_loss_mlp": 0.0127913, + "balance_loss_clip": 0.06294726, + "balance_loss_mlp": 0.01261487, + "epoch": 0.23676536900646325, + "flos": 16730225942400.0, + "grad_norm": 2.2850190898814686, + "language_loss": 0.85810506, + "learning_rate": 3.568041475462147e-06, + "loss": 0.93604803, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.1763916, + "step": 3938, + "time_per_iteration": 2.568195343017578 + }, + { + "auxiliary_loss_clip": 0.06509314, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06288411, + "balance_loss_mlp": 0.01259393, + "epoch": 0.23682549225913122, + "flos": 11138720734080.0, + "grad_norm": 3.1023600205020876, + "language_loss": 0.94564033, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.02351999, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19287109, + "step": 3939, + "time_per_iteration": 2.4615180492401123 + }, + { + "auxiliary_loss_clip": 0.0652378, + "auxiliary_loss_mlp": 0.01277473, + "balance_loss_clip": 0.06294175, + "balance_loss_mlp": 0.0125803, + "epoch": 0.23688561551179918, + "flos": 22564843372800.0, + "grad_norm": 5.475058210638743, + "language_loss": 0.82803464, + "learning_rate": 3.567557851847088e-06, + "loss": 0.90604717, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19445801, + "step": 3940, + "time_per_iteration": 2.573552131652832 + }, + { + "auxiliary_loss_clip": 0.06531326, + "auxiliary_loss_mlp": 0.01276996, + "balance_loss_clip": 0.06295921, + "balance_loss_mlp": 0.0125679, + "epoch": 0.23694573876446715, + "flos": 18520771040640.0, + "grad_norm": 2.098492916494123, + "language_loss": 0.8946867, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.97276992, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2019043, + "step": 3941, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.06529268, + "auxiliary_loss_mlp": 0.01286958, + "balance_loss_clip": 0.06297106, + "balance_loss_mlp": 0.01267503, + "epoch": 0.23700586201713514, + "flos": 15340246087680.0, + "grad_norm": 1.8886698836060631, + "language_loss": 0.84989077, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.92805308, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19458008, + "step": 3942, + "time_per_iteration": 2.56052827835083 + }, + { + "auxiliary_loss_clip": 0.06538361, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 0.06307331, + "balance_loss_mlp": 0.01265492, + "epoch": 0.2370659852698031, + "flos": 23953775051520.0, + "grad_norm": 2.0845511028002197, + "language_loss": 0.81156456, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.88980681, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20373535, + "step": 3943, + "time_per_iteration": 2.539264678955078 + }, + { + "auxiliary_loss_clip": 0.06543057, + "auxiliary_loss_mlp": 0.01292355, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01271494, + "epoch": 0.23712610852247107, + "flos": 15336514581120.0, + "grad_norm": 2.5863771047568926, + "language_loss": 0.682428, + "learning_rate": 3.566589891386959e-06, + "loss": 0.76078212, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20861816, + "step": 3944, + "time_per_iteration": 2.520453929901123 + }, + { + "auxiliary_loss_clip": 0.06529288, + "auxiliary_loss_mlp": 0.01297026, + "balance_loss_clip": 0.06299931, + "balance_loss_mlp": 0.01276963, + "epoch": 0.23718623177513903, + "flos": 19688658848640.0, + "grad_norm": 1.6926271274644824, + "language_loss": 0.76068223, + "learning_rate": 3.566347752735866e-06, + "loss": 0.83894539, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.20043945, + "step": 3945, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.06535566, + "auxiliary_loss_mlp": 0.01288141, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.0126859, + "epoch": 0.237246355027807, + "flos": 24980351748480.0, + "grad_norm": 1.7408538946114391, + "language_loss": 0.63962567, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.71786278, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19555664, + "step": 3946, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.06535441, + "auxiliary_loss_mlp": 0.01289697, + "balance_loss_clip": 0.06306995, + "balance_loss_mlp": 0.01269324, + "epoch": 0.23730647828047496, + "flos": 15382816761600.0, + "grad_norm": 3.1254224655104252, + "language_loss": 0.77114201, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.84939343, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20385742, + "step": 3947, + "time_per_iteration": 2.495837926864624 + }, + { + "auxiliary_loss_clip": 0.06540522, + "auxiliary_loss_mlp": 0.01290208, + "balance_loss_clip": 0.06311937, + "balance_loss_mlp": 0.01270431, + "epoch": 0.23736660153314296, + "flos": 28158738422400.0, + "grad_norm": 1.595292591120463, + "language_loss": 0.80941439, + "learning_rate": 3.565620980442944e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19775391, + "step": 3948, + "time_per_iteration": 2.6460211277008057 + }, + { + "auxiliary_loss_clip": 0.06542704, + "auxiliary_loss_mlp": 0.01297731, + "balance_loss_clip": 0.06312679, + "balance_loss_mlp": 0.01277025, + "epoch": 0.23742672478581092, + "flos": 22092385726080.0, + "grad_norm": 1.753357741589714, + "language_loss": 0.80419362, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.88259804, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.20715332, + "step": 3949, + "time_per_iteration": 2.5428664684295654 + }, + { + "auxiliary_loss_clip": 0.06549721, + "auxiliary_loss_mlp": 0.01294419, + "balance_loss_clip": 0.06317213, + "balance_loss_mlp": 0.012732, + "epoch": 0.2374868480384789, + "flos": 19543238887680.0, + "grad_norm": 1.6923054699564082, + "language_loss": 0.73375976, + "learning_rate": 3.565136168723163e-06, + "loss": 0.81220114, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2121582, + "step": 3950, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.06527583, + "auxiliary_loss_mlp": 0.01288007, + "balance_loss_clip": 0.06302388, + "balance_loss_mlp": 0.01268957, + "epoch": 0.23754697129114685, + "flos": 19427769561600.0, + "grad_norm": 1.893051910973559, + "language_loss": 0.73254943, + "learning_rate": 3.564893673833495e-06, + "loss": 0.8107053, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.1907959, + "step": 3951, + "time_per_iteration": 2.501091957092285 + }, + { + "auxiliary_loss_clip": 0.06543966, + "auxiliary_loss_mlp": 0.01301622, + "balance_loss_clip": 0.06315006, + "balance_loss_mlp": 0.01280332, + "epoch": 0.23760709454381482, + "flos": 19507208832000.0, + "grad_norm": 1.727887568846887, + "language_loss": 0.7427932, + "learning_rate": 3.564651119602903e-06, + "loss": 0.82124901, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2130127, + "step": 3952, + "time_per_iteration": 2.5467019081115723 + }, + { + "auxiliary_loss_clip": 0.06536686, + "auxiliary_loss_mlp": 0.01292988, + "balance_loss_clip": 0.0630881, + "balance_loss_mlp": 0.01273379, + "epoch": 0.23766721779648278, + "flos": 27644045518080.0, + "grad_norm": 3.105577179216311, + "language_loss": 0.71633041, + "learning_rate": 3.564408506040583e-06, + "loss": 0.79462719, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.19604492, + "step": 3953, + "time_per_iteration": 2.599946975708008 + }, + { + "auxiliary_loss_clip": 0.06537458, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06305911, + "balance_loss_mlp": 0.01272673, + "epoch": 0.23772734104915075, + "flos": 23411102083200.0, + "grad_norm": 6.547469437533346, + "language_loss": 0.82534778, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.90365064, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20166016, + "step": 3954, + "time_per_iteration": 2.595163583755493 + }, + { + "auxiliary_loss_clip": 0.06538694, + "auxiliary_loss_mlp": 0.01291334, + "balance_loss_clip": 0.0630859, + "balance_loss_mlp": 0.01271486, + "epoch": 0.23778746430181874, + "flos": 15710902623360.0, + "grad_norm": 2.2065720754909606, + "language_loss": 0.66202033, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.74032056, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.19848633, + "step": 3955, + "time_per_iteration": 2.5345511436462402 + }, + { + "auxiliary_loss_clip": 0.06527859, + "auxiliary_loss_mlp": 0.01285762, + "balance_loss_clip": 0.06301668, + "balance_loss_mlp": 0.01266081, + "epoch": 0.2378475875544867, + "flos": 19432381536000.0, + "grad_norm": 1.4478942147045952, + "language_loss": 0.84203303, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.92016923, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19689941, + "step": 3956, + "time_per_iteration": 2.5458483695983887 + }, + { + "auxiliary_loss_clip": 0.06526335, + "auxiliary_loss_mlp": 0.01287929, + "balance_loss_clip": 0.06303546, + "balance_loss_mlp": 0.01268438, + "epoch": 0.23790771080715467, + "flos": 22274338867200.0, + "grad_norm": 2.194064451149358, + "language_loss": 0.8561964, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.93433905, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.19494629, + "step": 3957, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.06532466, + "auxiliary_loss_mlp": 0.01283677, + "balance_loss_clip": 0.0630599, + "balance_loss_mlp": 0.01264008, + "epoch": 0.23796783405982264, + "flos": 20053445598720.0, + "grad_norm": 2.4454692262909856, + "language_loss": 0.7073434, + "learning_rate": 3.563194548575151e-06, + "loss": 0.78550482, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19665527, + "step": 3958, + "time_per_iteration": 2.556201219558716 + }, + { + "auxiliary_loss_clip": 0.06533751, + "auxiliary_loss_mlp": 0.01277914, + "balance_loss_clip": 0.06301822, + "balance_loss_mlp": 0.01257303, + "epoch": 0.2380279573124906, + "flos": 14251084790400.0, + "grad_norm": 4.548053192599961, + "language_loss": 0.66760004, + "learning_rate": 3.562951579215745e-06, + "loss": 0.74571669, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.20617676, + "step": 3959, + "time_per_iteration": 2.491999626159668 + }, + { + "auxiliary_loss_clip": 0.06529753, + "auxiliary_loss_mlp": 0.01278003, + "balance_loss_clip": 0.06303047, + "balance_loss_mlp": 0.01259228, + "epoch": 0.23808808056515857, + "flos": 21185638767360.0, + "grad_norm": 1.7806564555446132, + "language_loss": 0.72341377, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.80149138, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18774414, + "step": 3960, + "time_per_iteration": 2.523761034011841 + }, + { + "auxiliary_loss_clip": 0.0652384, + "auxiliary_loss_mlp": 0.0127522, + "balance_loss_clip": 0.06296217, + "balance_loss_mlp": 0.01255169, + "epoch": 0.23814820381782653, + "flos": 22534850810880.0, + "grad_norm": 1.610971251516654, + "language_loss": 0.7476449, + "learning_rate": 3.562465462704307e-06, + "loss": 0.82563543, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20031738, + "step": 3961, + "time_per_iteration": 2.5350120067596436 + }, + { + "auxiliary_loss_clip": 0.06528293, + "auxiliary_loss_mlp": 0.01283237, + "balance_loss_clip": 0.06297825, + "balance_loss_mlp": 0.01261505, + "epoch": 0.23820832707049452, + "flos": 22309991579520.0, + "grad_norm": 2.008938617955162, + "language_loss": 0.66267157, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.74078679, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.21728516, + "step": 3962, + "time_per_iteration": 2.554936170578003 + }, + { + "auxiliary_loss_clip": 0.06522447, + "auxiliary_loss_mlp": 0.01279056, + "balance_loss_clip": 0.0629696, + "balance_loss_mlp": 0.0126009, + "epoch": 0.2382684503231625, + "flos": 24871297259520.0, + "grad_norm": 1.868964177707197, + "language_loss": 0.75134146, + "learning_rate": 3.561979109197483e-06, + "loss": 0.82935649, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18969727, + "step": 3963, + "time_per_iteration": 3.9841935634613037 + }, + { + "auxiliary_loss_clip": 0.0652955, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.01257428, + "epoch": 0.23832857357583045, + "flos": 21878050181760.0, + "grad_norm": 2.083636930734351, + "language_loss": 0.77508426, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.85316432, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.21032715, + "step": 3964, + "time_per_iteration": 2.546093463897705 + }, + { + "auxiliary_loss_clip": 0.06513681, + "auxiliary_loss_mlp": 0.01275741, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01256275, + "epoch": 0.23838869682849842, + "flos": 21294441694080.0, + "grad_norm": 2.0070777911568207, + "language_loss": 0.72507781, + "learning_rate": 3.561492518769045e-06, + "loss": 0.80297208, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3965, + "time_per_iteration": 2.605717182159424 + }, + { + "auxiliary_loss_clip": 0.06518564, + "auxiliary_loss_mlp": 0.012776, + "balance_loss_clip": 0.06293208, + "balance_loss_mlp": 0.01258181, + "epoch": 0.23844882008116638, + "flos": 16186211308800.0, + "grad_norm": 2.069567415104782, + "language_loss": 0.79030257, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8682642, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.19396973, + "step": 3966, + "time_per_iteration": 3.980722427368164 + }, + { + "auxiliary_loss_clip": 0.06517511, + "auxiliary_loss_mlp": 0.01283232, + "balance_loss_clip": 0.06290257, + "balance_loss_mlp": 0.01264647, + "epoch": 0.23850894333383435, + "flos": 21076165008000.0, + "grad_norm": 3.0015774693629433, + "language_loss": 0.69417417, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77218163, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.18579102, + "step": 3967, + "time_per_iteration": 2.542595386505127 + }, + { + "auxiliary_loss_clip": 0.06523537, + "auxiliary_loss_mlp": 0.01278611, + "balance_loss_clip": 0.0629587, + "balance_loss_mlp": 0.01257821, + "epoch": 0.23856906658650234, + "flos": 17207295563520.0, + "grad_norm": 1.9959497275253817, + "language_loss": 0.68410718, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.76212859, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.20800781, + "step": 3968, + "time_per_iteration": 2.5275728702545166 + }, + { + "auxiliary_loss_clip": 0.06526159, + "auxiliary_loss_mlp": 0.01279655, + "balance_loss_clip": 0.0629804, + "balance_loss_mlp": 0.01261392, + "epoch": 0.2386291898391703, + "flos": 29501451774720.0, + "grad_norm": 2.0078802263631994, + "language_loss": 0.77147222, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.84953034, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.18261719, + "step": 3969, + "time_per_iteration": 4.006864547729492 + }, + { + "auxiliary_loss_clip": 0.06514208, + "auxiliary_loss_mlp": 0.01292793, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01274602, + "epoch": 0.23868931309183827, + "flos": 21148854024960.0, + "grad_norm": 1.9717404660495825, + "language_loss": 0.76892555, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.84699559, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.18188477, + "step": 3970, + "time_per_iteration": 2.558915615081787 + }, + { + "auxiliary_loss_clip": 0.06523073, + "auxiliary_loss_mlp": 0.0128602, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01265969, + "epoch": 0.23874943634450624, + "flos": 25665342076800.0, + "grad_norm": 2.212795121423013, + "language_loss": 0.85452002, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.93261099, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20043945, + "step": 3971, + "time_per_iteration": 2.5621652603149414 + }, + { + "auxiliary_loss_clip": 0.06391954, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06279661, + "balance_loss_mlp": 0.01251122, + "epoch": 0.2388095595971742, + "flos": 59006871889920.0, + "grad_norm": 0.7183517633018239, + "language_loss": 0.62744105, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.70391893, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04696655, + "step": 3972, + "time_per_iteration": 4.643376350402832 + }, + { + "auxiliary_loss_clip": 0.06515118, + "auxiliary_loss_mlp": 0.01277926, + "balance_loss_clip": 0.06290536, + "balance_loss_mlp": 0.01258399, + "epoch": 0.23886968284984217, + "flos": 16805975633280.0, + "grad_norm": 3.0192177240020976, + "language_loss": 0.81866533, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.89659578, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19543457, + "step": 3973, + "time_per_iteration": 2.5597283840179443 + }, + { + "auxiliary_loss_clip": 0.06517763, + "auxiliary_loss_mlp": 0.01283675, + "balance_loss_clip": 0.06291795, + "balance_loss_mlp": 0.01265162, + "epoch": 0.23892980610251013, + "flos": 22389221214720.0, + "grad_norm": 1.829209898292947, + "language_loss": 0.79696077, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8749752, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.18518066, + "step": 3974, + "time_per_iteration": 2.5331227779388428 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01291591, + "balance_loss_clip": 0.06296244, + "balance_loss_mlp": 0.01272279, + "epoch": 0.23898992935517813, + "flos": 12828135553920.0, + "grad_norm": 6.773745042238101, + "language_loss": 0.85156423, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.92972875, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19311523, + "step": 3975, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.06513388, + "auxiliary_loss_mlp": 0.01278416, + "balance_loss_clip": 0.06290747, + "balance_loss_mlp": 0.01260117, + "epoch": 0.2390500526078461, + "flos": 22352142983040.0, + "grad_norm": 3.375355565005516, + "language_loss": 0.84191501, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.91983294, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1829834, + "step": 3976, + "time_per_iteration": 2.5339527130126953 + }, + { + "auxiliary_loss_clip": 0.06511909, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06290296, + "balance_loss_mlp": 0.01264111, + "epoch": 0.23911017586051406, + "flos": 22641263896320.0, + "grad_norm": 3.0704844059493497, + "language_loss": 0.74960983, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.82755029, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18029785, + "step": 3977, + "time_per_iteration": 2.5528597831726074 + }, + { + "auxiliary_loss_clip": 0.06524444, + "auxiliary_loss_mlp": 0.01281803, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01261478, + "epoch": 0.23917029911318202, + "flos": 23658993987840.0, + "grad_norm": 3.246082679368102, + "language_loss": 0.7235828, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80164528, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.203125, + "step": 3978, + "time_per_iteration": 2.548459768295288 + }, + { + "auxiliary_loss_clip": 0.06536747, + "auxiliary_loss_mlp": 0.01279264, + "balance_loss_clip": 0.06306014, + "balance_loss_mlp": 0.0125994, + "epoch": 0.23923042236585, + "flos": 22790163801600.0, + "grad_norm": 2.3394422136849875, + "language_loss": 0.79264927, + "learning_rate": 3.558079758168997e-06, + "loss": 0.87080932, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.1932373, + "step": 3979, + "time_per_iteration": 2.5696120262145996 + }, + { + "auxiliary_loss_clip": 0.06521225, + "auxiliary_loss_mlp": 0.01282521, + "balance_loss_clip": 0.06295727, + "balance_loss_mlp": 0.01263185, + "epoch": 0.23929054561851795, + "flos": 28155300405120.0, + "grad_norm": 1.7900268576070866, + "language_loss": 0.81971824, + "learning_rate": 3.557835546134977e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.1932373, + "step": 3980, + "time_per_iteration": 2.587286949157715 + }, + { + "auxiliary_loss_clip": 0.06519361, + "auxiliary_loss_mlp": 0.01281001, + "balance_loss_clip": 0.06296664, + "balance_loss_mlp": 0.01261891, + "epoch": 0.23935066887118592, + "flos": 21692491315200.0, + "grad_norm": 1.7930077111492302, + "language_loss": 0.84270984, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92071348, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19091797, + "step": 3981, + "time_per_iteration": 2.550725221633911 + }, + { + "auxiliary_loss_clip": 0.06535558, + "auxiliary_loss_mlp": 0.01280601, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01260669, + "epoch": 0.2394107921238539, + "flos": 32130121737600.0, + "grad_norm": 2.0248039039910393, + "language_loss": 0.77712274, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.85528433, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.19934082, + "step": 3982, + "time_per_iteration": 2.594698667526245 + }, + { + "auxiliary_loss_clip": 0.06530322, + "auxiliary_loss_mlp": 0.01280321, + "balance_loss_clip": 0.06304529, + "balance_loss_mlp": 0.01261307, + "epoch": 0.23947091537652188, + "flos": 17024839297920.0, + "grad_norm": 1.9623565914246572, + "language_loss": 0.7809152, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.85902166, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19006348, + "step": 3983, + "time_per_iteration": 2.537132740020752 + }, + { + "auxiliary_loss_clip": 0.06527262, + "auxiliary_loss_mlp": 0.01280803, + "balance_loss_clip": 0.0630171, + "balance_loss_mlp": 0.01261956, + "epoch": 0.23953103862918984, + "flos": 20599640438400.0, + "grad_norm": 2.137172968887566, + "language_loss": 0.73945713, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81753772, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18835449, + "step": 3984, + "time_per_iteration": 2.538221836090088 + }, + { + "auxiliary_loss_clip": 0.06531888, + "auxiliary_loss_mlp": 0.01281613, + "balance_loss_clip": 0.06302323, + "balance_loss_mlp": 0.01262587, + "epoch": 0.2395911618818578, + "flos": 20710707425280.0, + "grad_norm": 1.9765684717262704, + "language_loss": 0.7965889, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.87472391, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19030762, + "step": 3985, + "time_per_iteration": 2.551649570465088 + }, + { + "auxiliary_loss_clip": 0.06532246, + "auxiliary_loss_mlp": 0.0127953, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01259265, + "epoch": 0.23965128513452577, + "flos": 27060982081920.0, + "grad_norm": 1.916737509209056, + "language_loss": 0.73610401, + "learning_rate": 3.556369033716254e-06, + "loss": 0.8142218, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20263672, + "step": 3986, + "time_per_iteration": 2.710397481918335 + }, + { + "auxiliary_loss_clip": 0.06540911, + "auxiliary_loss_mlp": 0.01281338, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.01261, + "epoch": 0.23971140838719374, + "flos": 23150254723200.0, + "grad_norm": 1.785192597796332, + "language_loss": 0.88325328, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96147585, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20336914, + "step": 3987, + "time_per_iteration": 2.6331911087036133 + }, + { + "auxiliary_loss_clip": 0.06529854, + "auxiliary_loss_mlp": 0.01278502, + "balance_loss_clip": 0.06312454, + "balance_loss_mlp": 0.0126043, + "epoch": 0.23977153163986173, + "flos": 18039341007360.0, + "grad_norm": 2.2552133940915224, + "language_loss": 0.84056735, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.91865093, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18078613, + "step": 3988, + "time_per_iteration": 2.5413994789123535 + }, + { + "auxiliary_loss_clip": 0.06533512, + "auxiliary_loss_mlp": 0.01288032, + "balance_loss_clip": 0.06306052, + "balance_loss_mlp": 0.01267052, + "epoch": 0.2398316548925297, + "flos": 18119157621120.0, + "grad_norm": 1.6232739060807335, + "language_loss": 0.85473406, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.93294942, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2097168, + "step": 3989, + "time_per_iteration": 2.528348207473755 + }, + { + "auxiliary_loss_clip": 0.06527147, + "auxiliary_loss_mlp": 0.01286562, + "balance_loss_clip": 0.06305796, + "balance_loss_mlp": 0.01266642, + "epoch": 0.23989177814519766, + "flos": 12572612928000.0, + "grad_norm": 2.695913709141839, + "language_loss": 0.8517406, + "learning_rate": 3.555390178293477e-06, + "loss": 0.92987764, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19909668, + "step": 3990, + "time_per_iteration": 2.52915358543396 + }, + { + "auxiliary_loss_clip": 0.06527729, + "auxiliary_loss_mlp": 0.01283435, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.01264064, + "epoch": 0.23995190139786562, + "flos": 25271569013760.0, + "grad_norm": 1.4267230320219149, + "language_loss": 0.76345301, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.84156466, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.19372559, + "step": 3991, + "time_per_iteration": 2.556820869445801 + }, + { + "auxiliary_loss_clip": 0.06413993, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06298733, + "balance_loss_mlp": 0.01262789, + "epoch": 0.2400120246505336, + "flos": 61978107271680.0, + "grad_norm": 0.8724678757997124, + "language_loss": 0.6358996, + "learning_rate": 3.554900396661656e-06, + "loss": 0.71272099, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.05368042, + "step": 3992, + "time_per_iteration": 3.0817418098449707 + }, + { + "auxiliary_loss_clip": 0.06411353, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 0.06297012, + "balance_loss_mlp": 0.01259121, + "epoch": 0.24007214790320155, + "flos": 66727923816960.0, + "grad_norm": 0.7394753945990321, + "language_loss": 0.62864375, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.70539963, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.05117798, + "step": 3993, + "time_per_iteration": 3.2552971839904785 + }, + { + "auxiliary_loss_clip": 0.0652933, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 0.062997, + "balance_loss_mlp": 0.0125886, + "epoch": 0.24013227115586952, + "flos": 25815667501440.0, + "grad_norm": 1.8775036450716396, + "language_loss": 0.77610862, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.85420227, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.21154785, + "step": 3994, + "time_per_iteration": 2.6225738525390625 + }, + { + "auxiliary_loss_clip": 0.06526788, + "auxiliary_loss_mlp": 0.01288387, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.01266822, + "epoch": 0.2401923944085375, + "flos": 25564672995840.0, + "grad_norm": 1.626402048760673, + "language_loss": 0.78733414, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.86548591, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21557617, + "step": 3995, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.06395802, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01260685, + "epoch": 0.24025251766120548, + "flos": 54961457892480.0, + "grad_norm": 0.8928130340410044, + "language_loss": 0.63566971, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.71228325, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.04858398, + "step": 3996, + "time_per_iteration": 3.232227087020874 + }, + { + "auxiliary_loss_clip": 0.06522241, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06290409, + "balance_loss_mlp": 0.0126328, + "epoch": 0.24031264091387344, + "flos": 20637305648640.0, + "grad_norm": 2.8724335092069864, + "language_loss": 0.71121502, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.78926873, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19848633, + "step": 3997, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.06510898, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 0.06285729, + "balance_loss_mlp": 0.01265473, + "epoch": 0.2403727641665414, + "flos": 20892492858240.0, + "grad_norm": 1.7909711234465908, + "language_loss": 0.87516266, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.9531287, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20227051, + "step": 3998, + "time_per_iteration": 2.563215970993042 + }, + { + "auxiliary_loss_clip": 0.06526193, + "auxiliary_loss_mlp": 0.01279159, + "balance_loss_clip": 0.06292593, + "balance_loss_mlp": 0.01258762, + "epoch": 0.24043288741920937, + "flos": 22826613127680.0, + "grad_norm": 1.593528116777893, + "language_loss": 0.76414531, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.84219879, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.20410156, + "step": 3999, + "time_per_iteration": 2.5577592849731445 + }, + { + "auxiliary_loss_clip": 0.06511137, + "auxiliary_loss_mlp": 0.01275527, + "balance_loss_clip": 0.0628795, + "balance_loss_mlp": 0.01256716, + "epoch": 0.24049301067187734, + "flos": 27966261594240.0, + "grad_norm": 2.3407253335254086, + "language_loss": 0.73292184, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81078851, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.18823242, + "step": 4000, + "time_per_iteration": 2.583524703979492 + }, + { + "auxiliary_loss_clip": 0.06528921, + "auxiliary_loss_mlp": 0.01283655, + "balance_loss_clip": 0.06293923, + "balance_loss_mlp": 0.01261935, + "epoch": 0.24055313392454533, + "flos": 27458360870400.0, + "grad_norm": 2.671051655318694, + "language_loss": 0.67159665, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74972242, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21728516, + "step": 4001, + "time_per_iteration": 2.6188552379608154 + }, + { + "auxiliary_loss_clip": 0.06522354, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.06293849, + "balance_loss_mlp": 0.01257703, + "epoch": 0.2406132571772133, + "flos": 25563666746880.0, + "grad_norm": 5.034242823707272, + "language_loss": 0.83152658, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.90954471, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21740723, + "step": 4002, + "time_per_iteration": 3.9769785404205322 + }, + { + "auxiliary_loss_clip": 0.06519094, + "auxiliary_loss_mlp": 0.01282536, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01262032, + "epoch": 0.24067338042988126, + "flos": 24798482461440.0, + "grad_norm": 2.0463487498067323, + "language_loss": 0.83599687, + "learning_rate": 3.552202383898897e-06, + "loss": 0.91401321, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20483398, + "step": 4003, + "time_per_iteration": 2.581669569015503 + }, + { + "auxiliary_loss_clip": 0.06526292, + "auxiliary_loss_mlp": 0.01281725, + "balance_loss_clip": 0.06295015, + "balance_loss_mlp": 0.01261412, + "epoch": 0.24073350368254923, + "flos": 21184171320960.0, + "grad_norm": 2.0670244348036646, + "language_loss": 0.87907362, + "learning_rate": 3.551956756667215e-06, + "loss": 0.9571538, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20300293, + "step": 4004, + "time_per_iteration": 2.514268636703491 + }, + { + "auxiliary_loss_clip": 0.06526911, + "auxiliary_loss_mlp": 0.01282868, + "balance_loss_clip": 0.06294513, + "balance_loss_mlp": 0.01261815, + "epoch": 0.2407936269352172, + "flos": 22501252523520.0, + "grad_norm": 3.538522770409821, + "language_loss": 0.78168321, + "learning_rate": 3.551711070585177e-06, + "loss": 0.85978097, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21057129, + "step": 4005, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.0651572, + "auxiliary_loss_mlp": 0.01283457, + "balance_loss_clip": 0.06293365, + "balance_loss_mlp": 0.01263084, + "epoch": 0.24085375018788516, + "flos": 18556968804480.0, + "grad_norm": 2.371719422478697, + "language_loss": 0.79360878, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.87160051, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.20373535, + "step": 4006, + "time_per_iteration": 4.034858465194702 + }, + { + "auxiliary_loss_clip": 0.0653493, + "auxiliary_loss_mlp": 0.01283621, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01260709, + "epoch": 0.24091387344055312, + "flos": 24177418398720.0, + "grad_norm": 1.8737477168573817, + "language_loss": 0.71813238, + "learning_rate": 3.551219521907302e-06, + "loss": 0.79631788, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22912598, + "step": 4007, + "time_per_iteration": 2.5730202198028564 + }, + { + "auxiliary_loss_clip": 0.06518448, + "auxiliary_loss_mlp": 0.01300708, + "balance_loss_clip": 0.06295364, + "balance_loss_mlp": 0.01278773, + "epoch": 0.24097399669322112, + "flos": 11041112327040.0, + "grad_norm": 6.473369852788927, + "language_loss": 0.76978099, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84797251, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21936035, + "step": 4008, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.06518552, + "auxiliary_loss_mlp": 0.01286303, + "balance_loss_clip": 0.062894, + "balance_loss_mlp": 0.01264928, + "epoch": 0.24103411994588908, + "flos": 17170762383360.0, + "grad_norm": 2.1979472110907556, + "language_loss": 0.75080305, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.82885164, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21398926, + "step": 4009, + "time_per_iteration": 3.957920551300049 + }, + { + "auxiliary_loss_clip": 0.06521554, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 0.06293823, + "balance_loss_mlp": 0.01279869, + "epoch": 0.24109424319855705, + "flos": 20674258099200.0, + "grad_norm": 1.5898496231384156, + "language_loss": 0.80111217, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8793391, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21264648, + "step": 4010, + "time_per_iteration": 2.5475916862487793 + }, + { + "auxiliary_loss_clip": 0.06527252, + "auxiliary_loss_mlp": 0.01291864, + "balance_loss_clip": 0.06297424, + "balance_loss_mlp": 0.01268964, + "epoch": 0.241154366451225, + "flos": 28188982546560.0, + "grad_norm": 2.0856120841249366, + "language_loss": 0.70933908, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78753024, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.22912598, + "step": 4011, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.06528456, + "auxiliary_loss_mlp": 0.0128714, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01265766, + "epoch": 0.24121448970389298, + "flos": 21696222821760.0, + "grad_norm": 1.7418824634594252, + "language_loss": 0.694484, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.21362305, + "step": 4012, + "time_per_iteration": 3.988281726837158 + }, + { + "auxiliary_loss_clip": 0.06528036, + "auxiliary_loss_mlp": 0.01287792, + "balance_loss_clip": 0.06296879, + "balance_loss_mlp": 0.01264391, + "epoch": 0.24127461295656094, + "flos": 39685530142080.0, + "grad_norm": 1.5971840931497265, + "language_loss": 0.74512959, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23388672, + "step": 4013, + "time_per_iteration": 2.7159719467163086 + }, + { + "auxiliary_loss_clip": 0.06531674, + "auxiliary_loss_mlp": 0.01283711, + "balance_loss_clip": 0.0630402, + "balance_loss_mlp": 0.01263231, + "epoch": 0.2413347362092289, + "flos": 19141960884480.0, + "grad_norm": 1.667652232266074, + "language_loss": 0.89031768, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.96847153, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20483398, + "step": 4014, + "time_per_iteration": 2.5638303756713867 + }, + { + "auxiliary_loss_clip": 0.06538786, + "auxiliary_loss_mlp": 0.01289681, + "balance_loss_clip": 0.06304225, + "balance_loss_mlp": 0.01268831, + "epoch": 0.2413948594618969, + "flos": 26946099734400.0, + "grad_norm": 1.9521080560444544, + "language_loss": 0.95043075, + "learning_rate": 3.549250975045952e-06, + "loss": 1.02871537, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20849609, + "step": 4015, + "time_per_iteration": 2.5697052478790283 + }, + { + "auxiliary_loss_clip": 0.0653477, + "auxiliary_loss_mlp": 0.01278309, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01257781, + "epoch": 0.24145498271456486, + "flos": 25235077760640.0, + "grad_norm": 1.8045004389175856, + "language_loss": 0.83243644, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.91056728, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2052002, + "step": 4016, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.06532364, + "auxiliary_loss_mlp": 0.01285254, + "balance_loss_clip": 0.06311545, + "balance_loss_mlp": 0.0126463, + "epoch": 0.24151510596723283, + "flos": 40671339027840.0, + "grad_norm": 2.079467312298135, + "language_loss": 0.69439638, + "learning_rate": 3.54875825066639e-06, + "loss": 0.77257252, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20617676, + "step": 4017, + "time_per_iteration": 2.6893186569213867 + }, + { + "auxiliary_loss_clip": 0.06536807, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06305309, + "balance_loss_mlp": 0.01266286, + "epoch": 0.2415752292199008, + "flos": 18151917367680.0, + "grad_norm": 1.6840714927615923, + "language_loss": 0.84970623, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.92796361, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2265625, + "step": 4018, + "time_per_iteration": 2.521129608154297 + }, + { + "auxiliary_loss_clip": 0.06448493, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 0.06334345, + "balance_loss_mlp": 0.01253335, + "epoch": 0.24163535247256876, + "flos": 67307213819520.0, + "grad_norm": 1.2396896293086193, + "language_loss": 0.6054306, + "learning_rate": 3.548265291370558e-06, + "loss": 0.68249303, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04425049, + "step": 4019, + "time_per_iteration": 3.2191333770751953 + }, + { + "auxiliary_loss_clip": 0.06539527, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06310145, + "balance_loss_mlp": 0.01253983, + "epoch": 0.24169547572523672, + "flos": 24935810503680.0, + "grad_norm": 1.839335570686334, + "language_loss": 0.73635018, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81447685, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19140625, + "step": 4020, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.06547633, + "auxiliary_loss_mlp": 0.01279706, + "balance_loss_clip": 0.06321433, + "balance_loss_mlp": 0.01259094, + "epoch": 0.24175559897790472, + "flos": 18733303722240.0, + "grad_norm": 1.757855043925666, + "language_loss": 0.81927264, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.89754599, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.20617676, + "step": 4021, + "time_per_iteration": 2.516295909881592 + }, + { + "auxiliary_loss_clip": 0.06542306, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 0.06314138, + "balance_loss_mlp": 0.01255201, + "epoch": 0.24181572223057268, + "flos": 23045937989760.0, + "grad_norm": 1.9677245364232816, + "language_loss": 0.76831293, + "learning_rate": 3.547525412122378e-06, + "loss": 0.84652191, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23388672, + "step": 4022, + "time_per_iteration": 2.560833692550659 + }, + { + "auxiliary_loss_clip": 0.0655847, + "auxiliary_loss_mlp": 0.01279281, + "balance_loss_clip": 0.06321847, + "balance_loss_mlp": 0.01257477, + "epoch": 0.24187584548324065, + "flos": 20382411928320.0, + "grad_norm": 1.7589452517035808, + "language_loss": 0.75334597, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.83172357, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21789551, + "step": 4023, + "time_per_iteration": 2.5414137840270996 + }, + { + "auxiliary_loss_clip": 0.06554291, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06325305, + "balance_loss_mlp": 0.01258466, + "epoch": 0.2419359687359086, + "flos": 21403915453440.0, + "grad_norm": 1.837159559636974, + "language_loss": 0.82581335, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.90414816, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20751953, + "step": 4024, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.06544912, + "auxiliary_loss_mlp": 0.01281053, + "balance_loss_clip": 0.06319256, + "balance_loss_mlp": 0.01260394, + "epoch": 0.24199609198857658, + "flos": 18375309152640.0, + "grad_norm": 1.8763334718563411, + "language_loss": 0.86724782, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.94550753, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20654297, + "step": 4025, + "time_per_iteration": 2.507725715637207 + }, + { + "auxiliary_loss_clip": 0.0654591, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06312732, + "balance_loss_mlp": 0.01261905, + "epoch": 0.24205621524124454, + "flos": 19469962892160.0, + "grad_norm": 2.105058685916829, + "language_loss": 0.72386706, + "learning_rate": 3.546538084949365e-06, + "loss": 0.80214572, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.20068359, + "step": 4026, + "time_per_iteration": 2.573822498321533 + }, + { + "auxiliary_loss_clip": 0.06536272, + "auxiliary_loss_mlp": 0.01278576, + "balance_loss_clip": 0.06314979, + "balance_loss_mlp": 0.01258191, + "epoch": 0.2421163384939125, + "flos": 14981706466560.0, + "grad_norm": 5.331027510747572, + "language_loss": 0.64474452, + "learning_rate": 3.546291106520509e-06, + "loss": 0.722893, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20397949, + "step": 4027, + "time_per_iteration": 2.5038652420043945 + }, + { + "auxiliary_loss_clip": 0.06553975, + "auxiliary_loss_mlp": 0.01291382, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01271069, + "epoch": 0.2421764617465805, + "flos": 18668161572480.0, + "grad_norm": 2.149571528027882, + "language_loss": 0.70816404, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.78661758, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.203125, + "step": 4028, + "time_per_iteration": 2.5707366466522217 + }, + { + "auxiliary_loss_clip": 0.06448589, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 0.06335288, + "balance_loss_mlp": 0.01254865, + "epoch": 0.24223658499924847, + "flos": 64368025424640.0, + "grad_norm": 0.8397041896242922, + "language_loss": 0.55315495, + "learning_rate": 3.545796973765623e-06, + "loss": 0.63025129, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.06170654, + "step": 4029, + "time_per_iteration": 3.149601936340332 + }, + { + "auxiliary_loss_clip": 0.06557409, + "auxiliary_loss_mlp": 0.01307587, + "balance_loss_clip": 0.06331506, + "balance_loss_mlp": 0.01284615, + "epoch": 0.24229670825191643, + "flos": 25782278849280.0, + "grad_norm": 2.2612571716693664, + "language_loss": 0.75111073, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82976073, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.22998047, + "step": 4030, + "time_per_iteration": 2.5939297676086426 + }, + { + "auxiliary_loss_clip": 0.0654521, + "auxiliary_loss_mlp": 0.0130894, + "balance_loss_clip": 0.06321512, + "balance_loss_mlp": 0.01287733, + "epoch": 0.2423568315045844, + "flos": 20673251850240.0, + "grad_norm": 1.8607136485921192, + "language_loss": 0.77126729, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.84980875, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2121582, + "step": 4031, + "time_per_iteration": 2.5886638164520264 + }, + { + "auxiliary_loss_clip": 0.06556953, + "auxiliary_loss_mlp": 0.01312472, + "balance_loss_clip": 0.06323709, + "balance_loss_mlp": 0.01290252, + "epoch": 0.24241695475725236, + "flos": 22422987210240.0, + "grad_norm": 1.956173023936914, + "language_loss": 0.66108859, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.22216797, + "step": 4032, + "time_per_iteration": 2.5665037631988525 + }, + { + "auxiliary_loss_clip": 0.06539695, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 0.06316876, + "balance_loss_mlp": 0.0128751, + "epoch": 0.24247707800992033, + "flos": 17134732327680.0, + "grad_norm": 3.4494454498841725, + "language_loss": 0.81464761, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.89313877, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21911621, + "step": 4033, + "time_per_iteration": 2.5237317085266113 + }, + { + "auxiliary_loss_clip": 0.06538171, + "auxiliary_loss_mlp": 0.01328283, + "balance_loss_clip": 0.06318024, + "balance_loss_mlp": 0.01305359, + "epoch": 0.2425372012625883, + "flos": 31621885597440.0, + "grad_norm": 1.909836856098088, + "language_loss": 0.69935066, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7780152, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22900391, + "step": 4034, + "time_per_iteration": 2.713991641998291 + }, + { + "auxiliary_loss_clip": 0.06546839, + "auxiliary_loss_mlp": 0.01319063, + "balance_loss_clip": 0.06324256, + "balance_loss_mlp": 0.01298273, + "epoch": 0.24259732451525629, + "flos": 16331589342720.0, + "grad_norm": 2.1729941621503532, + "language_loss": 0.96340013, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04205918, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.20776367, + "step": 4035, + "time_per_iteration": 2.532848596572876 + }, + { + "auxiliary_loss_clip": 0.06537193, + "auxiliary_loss_mlp": 0.01327475, + "balance_loss_clip": 0.06319901, + "balance_loss_mlp": 0.01307447, + "epoch": 0.24265744776792425, + "flos": 22863230161920.0, + "grad_norm": 1.6992215283488847, + "language_loss": 0.78653824, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20019531, + "step": 4036, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.06539825, + "auxiliary_loss_mlp": 0.01304693, + "balance_loss_clip": 0.06315397, + "balance_loss_mlp": 0.01282806, + "epoch": 0.24271757102059222, + "flos": 21878008254720.0, + "grad_norm": 1.624872867937933, + "language_loss": 0.74970233, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82814753, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.21887207, + "step": 4037, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06526245, + "auxiliary_loss_mlp": 0.01308805, + "balance_loss_clip": 0.06302498, + "balance_loss_mlp": 0.01287539, + "epoch": 0.24277769427326018, + "flos": 19214649901440.0, + "grad_norm": 4.15075765155633, + "language_loss": 0.76952362, + "learning_rate": 3.543570475921171e-06, + "loss": 0.84787416, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.21276855, + "step": 4038, + "time_per_iteration": 2.514899492263794 + }, + { + "auxiliary_loss_clip": 0.06539176, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 0.06314565, + "balance_loss_mlp": 0.01272992, + "epoch": 0.24283781752592815, + "flos": 19505909093760.0, + "grad_norm": 2.116114626089979, + "language_loss": 0.72802031, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.22167969, + "step": 4039, + "time_per_iteration": 2.603787422180176 + }, + { + "auxiliary_loss_clip": 0.06537706, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 0.06312682, + "balance_loss_mlp": 0.01269372, + "epoch": 0.2428979407785961, + "flos": 19908444908160.0, + "grad_norm": 1.7691638050154863, + "language_loss": 0.78818536, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86647218, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.21606445, + "step": 4040, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.06530759, + "auxiliary_loss_mlp": 0.01283615, + "balance_loss_clip": 0.06313588, + "balance_loss_mlp": 0.01265162, + "epoch": 0.2429580640312641, + "flos": 24722523135360.0, + "grad_norm": 1.6907745152184719, + "language_loss": 0.81039703, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8885408, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18444824, + "step": 4041, + "time_per_iteration": 2.5693795680999756 + }, + { + "auxiliary_loss_clip": 0.06532191, + "auxiliary_loss_mlp": 0.01286793, + "balance_loss_clip": 0.06311769, + "balance_loss_mlp": 0.01267529, + "epoch": 0.24301818728393207, + "flos": 25637529720960.0, + "grad_norm": 3.2457124561568, + "language_loss": 0.77433085, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8525207, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19262695, + "step": 4042, + "time_per_iteration": 3.9626972675323486 + }, + { + "auxiliary_loss_clip": 0.0653407, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06316316, + "balance_loss_mlp": 0.01260652, + "epoch": 0.24307831053660003, + "flos": 26148700753920.0, + "grad_norm": 1.8532279658121147, + "language_loss": 0.82188201, + "learning_rate": 3.542331483604246e-06, + "loss": 0.90002131, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19226074, + "step": 4043, + "time_per_iteration": 2.598202705383301 + }, + { + "auxiliary_loss_clip": 0.06538229, + "auxiliary_loss_mlp": 0.0127841, + "balance_loss_clip": 0.06309159, + "balance_loss_mlp": 0.01256594, + "epoch": 0.243138433789268, + "flos": 14977136419200.0, + "grad_norm": 2.775508644952731, + "language_loss": 0.73897892, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.81714529, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21801758, + "step": 4044, + "time_per_iteration": 2.483752489089966 + }, + { + "auxiliary_loss_clip": 0.06534028, + "auxiliary_loss_mlp": 0.01284645, + "balance_loss_clip": 0.0631184, + "balance_loss_mlp": 0.01263629, + "epoch": 0.24319855704193596, + "flos": 25198670361600.0, + "grad_norm": 2.3685654829247227, + "language_loss": 0.83778739, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.91597402, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.21020508, + "step": 4045, + "time_per_iteration": 2.60435152053833 + }, + { + "auxiliary_loss_clip": 0.06529962, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 0.06307946, + "balance_loss_mlp": 0.01260323, + "epoch": 0.24325868029460393, + "flos": 22133740515840.0, + "grad_norm": 1.834350653864789, + "language_loss": 0.87040859, + "learning_rate": 3.541587386314541e-06, + "loss": 0.94850671, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19519043, + "step": 4046, + "time_per_iteration": 3.990011692047119 + }, + { + "auxiliary_loss_clip": 0.0652798, + "auxiliary_loss_mlp": 0.01281438, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01260922, + "epoch": 0.2433188035472719, + "flos": 23588107833600.0, + "grad_norm": 2.274532821816236, + "language_loss": 0.72945291, + "learning_rate": 3.5413392369578e-06, + "loss": 0.80754709, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.20495605, + "step": 4047, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.06530058, + "auxiliary_loss_mlp": 0.01284969, + "balance_loss_clip": 0.06306041, + "balance_loss_mlp": 0.01263666, + "epoch": 0.2433789267999399, + "flos": 24469809621120.0, + "grad_norm": 3.993347012147321, + "language_loss": 0.74453223, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.21325684, + "step": 4048, + "time_per_iteration": 4.027734279632568 + }, + { + "auxiliary_loss_clip": 0.06529407, + "auxiliary_loss_mlp": 0.01275879, + "balance_loss_clip": 0.06309648, + "balance_loss_mlp": 0.0125671, + "epoch": 0.24343905005260785, + "flos": 16733622032640.0, + "grad_norm": 2.185429514920852, + "language_loss": 0.73832756, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.81638038, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19165039, + "step": 4049, + "time_per_iteration": 2.5527403354644775 + }, + { + "auxiliary_loss_clip": 0.06525055, + "auxiliary_loss_mlp": 0.01275563, + "balance_loss_clip": 0.06306046, + "balance_loss_mlp": 0.01256084, + "epoch": 0.24349917330527582, + "flos": 20049294821760.0, + "grad_norm": 1.6558681415401064, + "language_loss": 0.74824917, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82625538, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19482422, + "step": 4050, + "time_per_iteration": 2.517671585083008 + }, + { + "auxiliary_loss_clip": 0.06520879, + "auxiliary_loss_mlp": 0.0127856, + "balance_loss_clip": 0.06303313, + "balance_loss_mlp": 0.01258187, + "epoch": 0.24355929655794378, + "flos": 17426285009280.0, + "grad_norm": 2.447710360159803, + "language_loss": 0.75780261, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.83579695, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20361328, + "step": 4051, + "time_per_iteration": 3.961841583251953 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01277824, + "balance_loss_clip": 0.06310124, + "balance_loss_mlp": 0.01257343, + "epoch": 0.24361941981061175, + "flos": 25417995223680.0, + "grad_norm": 2.289221862828171, + "language_loss": 0.71344352, + "learning_rate": 3.540097613646296e-06, + "loss": 0.79154545, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20471191, + "step": 4052, + "time_per_iteration": 2.5851869583129883 + }, + { + "auxiliary_loss_clip": 0.06524909, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.06306259, + "balance_loss_mlp": 0.01258583, + "epoch": 0.2436795430632797, + "flos": 22827493595520.0, + "grad_norm": 1.7731467261886882, + "language_loss": 0.82073057, + "learning_rate": 3.539849113744351e-06, + "loss": 0.89876068, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.6217734813690186 + }, + { + "auxiliary_loss_clip": 0.06533736, + "auxiliary_loss_mlp": 0.01278722, + "balance_loss_clip": 0.06309207, + "balance_loss_mlp": 0.01260126, + "epoch": 0.2437396663159477, + "flos": 15163030702080.0, + "grad_norm": 1.5690390746940162, + "language_loss": 0.78588867, + "learning_rate": 3.539600555451172e-06, + "loss": 0.86401325, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.18615723, + "step": 4054, + "time_per_iteration": 2.513720750808716 + }, + { + "auxiliary_loss_clip": 0.06529565, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06307493, + "balance_loss_mlp": 0.01263111, + "epoch": 0.24379978956861567, + "flos": 22097710460160.0, + "grad_norm": 1.7039269278884617, + "language_loss": 0.84417951, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92229491, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1887207, + "step": 4055, + "time_per_iteration": 2.557584524154663 + }, + { + "auxiliary_loss_clip": 0.06542832, + "auxiliary_loss_mlp": 0.01280691, + "balance_loss_clip": 0.06312343, + "balance_loss_mlp": 0.01259508, + "epoch": 0.24385991282128364, + "flos": 31475878657920.0, + "grad_norm": 2.786051029634521, + "language_loss": 0.56684959, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.6450848, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21179199, + "step": 4056, + "time_per_iteration": 2.6548893451690674 + }, + { + "auxiliary_loss_clip": 0.06533613, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 0.06307291, + "balance_loss_mlp": 0.01262321, + "epoch": 0.2439200360739516, + "flos": 23845055978880.0, + "grad_norm": 2.215401064957846, + "language_loss": 0.80586845, + "learning_rate": 3.538854530318506e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.21520996, + "step": 4057, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.06533922, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 0.06311886, + "balance_loss_mlp": 0.01261009, + "epoch": 0.24398015932661957, + "flos": 19175684952960.0, + "grad_norm": 1.7331406857586058, + "language_loss": 0.79934907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87748623, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18786621, + "step": 4058, + "time_per_iteration": 2.5552098751068115 + }, + { + "auxiliary_loss_clip": 0.06541391, + "auxiliary_loss_mlp": 0.01280168, + "balance_loss_clip": 0.06312001, + "balance_loss_mlp": 0.01259772, + "epoch": 0.24404028257928753, + "flos": 25269095318400.0, + "grad_norm": 1.7324044437804977, + "language_loss": 0.86104828, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93926388, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20410156, + "step": 4059, + "time_per_iteration": 2.575345754623413 + }, + { + "auxiliary_loss_clip": 0.06538763, + "auxiliary_loss_mlp": 0.01274337, + "balance_loss_clip": 0.06318676, + "balance_loss_mlp": 0.01255621, + "epoch": 0.2441004058319555, + "flos": 26474606409600.0, + "grad_norm": 1.5285193147278118, + "language_loss": 0.74698234, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.8251133, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18713379, + "step": 4060, + "time_per_iteration": 2.6277999877929688 + }, + { + "auxiliary_loss_clip": 0.06560756, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 0.06327853, + "balance_loss_mlp": 0.01259469, + "epoch": 0.2441605290846235, + "flos": 26767752318720.0, + "grad_norm": 1.6858410849727605, + "language_loss": 0.73894358, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.81735957, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.21374512, + "step": 4061, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.06538899, + "auxiliary_loss_mlp": 0.01273593, + "balance_loss_clip": 0.0631846, + "balance_loss_mlp": 0.01254103, + "epoch": 0.24422065233729146, + "flos": 21112236990720.0, + "grad_norm": 1.7809128746808311, + "language_loss": 0.76782405, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84594905, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19494629, + "step": 4062, + "time_per_iteration": 2.5655109882354736 + }, + { + "auxiliary_loss_clip": 0.06538436, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 0.06319936, + "balance_loss_mlp": 0.01258019, + "epoch": 0.24428077558995942, + "flos": 25269891932160.0, + "grad_norm": 1.624722619478305, + "language_loss": 0.84975201, + "learning_rate": 3.537360904763011e-06, + "loss": 0.92791933, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.20263672, + "step": 4063, + "time_per_iteration": 2.569420576095581 + }, + { + "auxiliary_loss_clip": 0.06559969, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06327148, + "balance_loss_mlp": 0.01254459, + "epoch": 0.24434089884262739, + "flos": 20491508344320.0, + "grad_norm": 2.099790248638241, + "language_loss": 0.68837494, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.76673138, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2121582, + "step": 4064, + "time_per_iteration": 2.560065984725952 + }, + { + "auxiliary_loss_clip": 0.06547809, + "auxiliary_loss_mlp": 0.01276127, + "balance_loss_clip": 0.06317605, + "balance_loss_mlp": 0.01255349, + "epoch": 0.24440102209529535, + "flos": 23628456374400.0, + "grad_norm": 1.7607893449036869, + "language_loss": 0.70700729, + "learning_rate": 3.536862563102088e-06, + "loss": 0.78524667, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20788574, + "step": 4065, + "time_per_iteration": 2.5619614124298096 + }, + { + "auxiliary_loss_clip": 0.06554856, + "auxiliary_loss_mlp": 0.0127847, + "balance_loss_clip": 0.06322616, + "balance_loss_mlp": 0.01256726, + "epoch": 0.24446114534796332, + "flos": 20560382000640.0, + "grad_norm": 2.0639555504298372, + "language_loss": 0.84639663, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.92472994, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21765137, + "step": 4066, + "time_per_iteration": 2.5640382766723633 + }, + { + "auxiliary_loss_clip": 0.0647334, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06356817, + "balance_loss_mlp": 0.01260456, + "epoch": 0.24452126860063128, + "flos": 60406719327360.0, + "grad_norm": 0.7224646734980834, + "language_loss": 0.52123713, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.59863508, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.05990601, + "step": 4067, + "time_per_iteration": 3.067857503890991 + }, + { + "auxiliary_loss_clip": 0.06549152, + "auxiliary_loss_mlp": 0.01275932, + "balance_loss_clip": 0.063198, + "balance_loss_mlp": 0.01255106, + "epoch": 0.24458139185329927, + "flos": 15126958719360.0, + "grad_norm": 4.582785635832698, + "language_loss": 0.72625411, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.80450499, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20825195, + "step": 4068, + "time_per_iteration": 2.5490705966949463 + }, + { + "auxiliary_loss_clip": 0.06542531, + "auxiliary_loss_mlp": 0.0127677, + "balance_loss_clip": 0.06318012, + "balance_loss_mlp": 0.01256111, + "epoch": 0.24464151510596724, + "flos": 28005771594240.0, + "grad_norm": 1.4744908303961997, + "language_loss": 0.7839663, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86215931, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.20654297, + "step": 4069, + "time_per_iteration": 2.6064302921295166 + }, + { + "auxiliary_loss_clip": 0.06535528, + "auxiliary_loss_mlp": 0.01277448, + "balance_loss_clip": 0.06312935, + "balance_loss_mlp": 0.01257493, + "epoch": 0.2447016383586352, + "flos": 19799138856960.0, + "grad_norm": 1.9167348410225946, + "language_loss": 0.80741036, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88554007, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19946289, + "step": 4070, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.06543916, + "auxiliary_loss_mlp": 0.0127809, + "balance_loss_clip": 0.06317008, + "balance_loss_mlp": 0.01258825, + "epoch": 0.24476176161130317, + "flos": 26074460436480.0, + "grad_norm": 1.476613235331205, + "language_loss": 0.8444066, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.92262667, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19262695, + "step": 4071, + "time_per_iteration": 2.6165285110473633 + }, + { + "auxiliary_loss_clip": 0.06545337, + "auxiliary_loss_mlp": 0.01275719, + "balance_loss_clip": 0.06310376, + "balance_loss_mlp": 0.01254679, + "epoch": 0.24482188486397113, + "flos": 18849527735040.0, + "grad_norm": 2.1913275656577857, + "language_loss": 0.8027429, + "learning_rate": 3.535116532028798e-06, + "loss": 0.88095343, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21032715, + "step": 4072, + "time_per_iteration": 2.580077648162842 + }, + { + "auxiliary_loss_clip": 0.06531823, + "auxiliary_loss_mlp": 0.01275557, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01257031, + "epoch": 0.2448820081166391, + "flos": 21258202003200.0, + "grad_norm": 1.4781582217057618, + "language_loss": 0.7076053, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18505859, + "step": 4073, + "time_per_iteration": 2.5430707931518555 + }, + { + "auxiliary_loss_clip": 0.06525481, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 0.06303517, + "balance_loss_mlp": 0.01260921, + "epoch": 0.2449421313693071, + "flos": 23957254995840.0, + "grad_norm": 2.412576467354098, + "language_loss": 0.67577648, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.75382745, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.18676758, + "step": 4074, + "time_per_iteration": 2.5616037845611572 + }, + { + "auxiliary_loss_clip": 0.06435025, + "auxiliary_loss_mlp": 0.01257107, + "balance_loss_clip": 0.06320108, + "balance_loss_mlp": 0.01251907, + "epoch": 0.24500225462197506, + "flos": 60705902730240.0, + "grad_norm": 0.8764237694402175, + "language_loss": 0.68656927, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.76349056, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.05203247, + "step": 4075, + "time_per_iteration": 3.2623581886291504 + }, + { + "auxiliary_loss_clip": 0.06527948, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06305515, + "balance_loss_mlp": 0.01257414, + "epoch": 0.24506237787464302, + "flos": 26291018113920.0, + "grad_norm": 2.301278269127432, + "language_loss": 0.79781568, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.87586164, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19226074, + "step": 4076, + "time_per_iteration": 2.6342012882232666 + }, + { + "auxiliary_loss_clip": 0.06535772, + "auxiliary_loss_mlp": 0.01280909, + "balance_loss_clip": 0.06304428, + "balance_loss_mlp": 0.01258462, + "epoch": 0.245122501127311, + "flos": 20557530961920.0, + "grad_norm": 1.9232761502629154, + "language_loss": 0.82461953, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90278631, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 4077, + "time_per_iteration": 2.5863101482391357 + }, + { + "auxiliary_loss_clip": 0.06532669, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06305817, + "balance_loss_mlp": 0.01257774, + "epoch": 0.24518262437997895, + "flos": 29140312677120.0, + "grad_norm": 2.8377644839815357, + "language_loss": 0.63268852, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71080685, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21398926, + "step": 4078, + "time_per_iteration": 2.6045711040496826 + }, + { + "auxiliary_loss_clip": 0.06522519, + "auxiliary_loss_mlp": 0.01277179, + "balance_loss_clip": 0.06301752, + "balance_loss_mlp": 0.01258249, + "epoch": 0.24524274763264692, + "flos": 23483623392000.0, + "grad_norm": 1.4700896000405594, + "language_loss": 0.75762683, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.8356238, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18933105, + "step": 4079, + "time_per_iteration": 2.6327531337738037 + }, + { + "auxiliary_loss_clip": 0.06521107, + "auxiliary_loss_mlp": 0.01276139, + "balance_loss_clip": 0.06297373, + "balance_loss_mlp": 0.01256171, + "epoch": 0.24530287088531488, + "flos": 17206792439040.0, + "grad_norm": 1.743597814486786, + "language_loss": 0.75652814, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.1998291, + "step": 4080, + "time_per_iteration": 2.5027806758880615 + }, + { + "auxiliary_loss_clip": 0.06517033, + "auxiliary_loss_mlp": 0.01282693, + "balance_loss_clip": 0.06296979, + "balance_loss_mlp": 0.01262129, + "epoch": 0.24536299413798288, + "flos": 14872903539840.0, + "grad_norm": 1.7999885027482954, + "language_loss": 0.83532149, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91331875, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20544434, + "step": 4081, + "time_per_iteration": 3.9672679901123047 + }, + { + "auxiliary_loss_clip": 0.06524678, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06300613, + "balance_loss_mlp": 0.01257458, + "epoch": 0.24542311739065084, + "flos": 35270759347200.0, + "grad_norm": 2.0934334924975797, + "language_loss": 0.7376107, + "learning_rate": 3.532617254729267e-06, + "loss": 0.81562507, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19311523, + "step": 4082, + "time_per_iteration": 2.687596559524536 + }, + { + "auxiliary_loss_clip": 0.06520141, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06301866, + "balance_loss_mlp": 0.01254334, + "epoch": 0.2454832406433188, + "flos": 21508903019520.0, + "grad_norm": 4.081398895882933, + "language_loss": 0.72681344, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.8047362, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.17810059, + "step": 4083, + "time_per_iteration": 2.5715560913085938 + }, + { + "auxiliary_loss_clip": 0.06531677, + "auxiliary_loss_mlp": 0.01285124, + "balance_loss_clip": 0.06304878, + "balance_loss_mlp": 0.01263404, + "epoch": 0.24554336389598677, + "flos": 14761878480000.0, + "grad_norm": 2.078496591548884, + "language_loss": 0.75461411, + "learning_rate": 3.532116701561919e-06, + "loss": 0.83278215, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21704102, + "step": 4084, + "time_per_iteration": 2.527059316635132 + }, + { + "auxiliary_loss_clip": 0.06521569, + "auxiliary_loss_mlp": 0.01278312, + "balance_loss_clip": 0.06299873, + "balance_loss_mlp": 0.01259238, + "epoch": 0.24560348714865474, + "flos": 14981790320640.0, + "grad_norm": 1.9240939687866982, + "language_loss": 0.85311353, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93111229, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19055176, + "step": 4085, + "time_per_iteration": 4.107008695602417 + }, + { + "auxiliary_loss_clip": 0.06523392, + "auxiliary_loss_mlp": 0.01277742, + "balance_loss_clip": 0.06299591, + "balance_loss_mlp": 0.0125725, + "epoch": 0.2456636104013227, + "flos": 22682073634560.0, + "grad_norm": 1.671481131781836, + "language_loss": 0.79073685, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.86874819, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20495605, + "step": 4086, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.06519614, + "auxiliary_loss_mlp": 0.01278477, + "balance_loss_clip": 0.06300113, + "balance_loss_mlp": 0.01260107, + "epoch": 0.2457237336539907, + "flos": 27425307634560.0, + "grad_norm": 1.6115503736345718, + "language_loss": 0.75352013, + "learning_rate": 3.531365436099496e-06, + "loss": 0.83150113, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18359375, + "step": 4087, + "time_per_iteration": 4.046957015991211 + }, + { + "auxiliary_loss_clip": 0.06525059, + "auxiliary_loss_mlp": 0.01276774, + "balance_loss_clip": 0.06299827, + "balance_loss_mlp": 0.0125633, + "epoch": 0.24578385690665866, + "flos": 20418609692160.0, + "grad_norm": 2.7081304915573914, + "language_loss": 0.79987848, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.87789685, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20458984, + "step": 4088, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.06519316, + "auxiliary_loss_mlp": 0.01276403, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01258152, + "epoch": 0.24584398015932662, + "flos": 23922273116160.0, + "grad_norm": 2.802199957042034, + "language_loss": 0.77758735, + "learning_rate": 3.5308643020944e-06, + "loss": 0.85554451, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18249512, + "step": 4089, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.06525148, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01261021, + "epoch": 0.2459041034119946, + "flos": 41505313115520.0, + "grad_norm": 1.8031915906993192, + "language_loss": 0.81701422, + "learning_rate": 3.530613648011309e-06, + "loss": 0.89507812, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20214844, + "step": 4090, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.065328, + "auxiliary_loss_mlp": 0.01279305, + "balance_loss_clip": 0.06309135, + "balance_loss_mlp": 0.01258861, + "epoch": 0.24596422666466256, + "flos": 19942755955200.0, + "grad_norm": 2.438516046551743, + "language_loss": 0.73629344, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.8144145, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.20446777, + "step": 4091, + "time_per_iteration": 3.961276054382324 + }, + { + "auxiliary_loss_clip": 0.06539448, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 0.06316313, + "balance_loss_mlp": 0.0126148, + "epoch": 0.24602434991733052, + "flos": 21550970568960.0, + "grad_norm": 2.2480658521871897, + "language_loss": 0.77723873, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85543197, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18408203, + "step": 4092, + "time_per_iteration": 2.5494375228881836 + }, + { + "auxiliary_loss_clip": 0.06537454, + "auxiliary_loss_mlp": 0.01278374, + "balance_loss_clip": 0.06307742, + "balance_loss_mlp": 0.01258907, + "epoch": 0.24608447316999849, + "flos": 23191735294080.0, + "grad_norm": 2.380112015735871, + "language_loss": 0.82381165, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.90196991, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.19470215, + "step": 4093, + "time_per_iteration": 2.5551040172576904 + }, + { + "auxiliary_loss_clip": 0.06532703, + "auxiliary_loss_mlp": 0.01285, + "balance_loss_clip": 0.06305315, + "balance_loss_mlp": 0.01264412, + "epoch": 0.24614459642266648, + "flos": 19647345985920.0, + "grad_norm": 21.11973952887688, + "language_loss": 0.87671578, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95489287, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20581055, + "step": 4094, + "time_per_iteration": 2.534127712249756 + }, + { + "auxiliary_loss_clip": 0.06404499, + "auxiliary_loss_mlp": 0.01293713, + "balance_loss_clip": 0.06291573, + "balance_loss_mlp": 0.01289332, + "epoch": 0.24620471967533444, + "flos": 61757231109120.0, + "grad_norm": 0.7533459551406883, + "language_loss": 0.57023478, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.64721692, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04388428, + "step": 4095, + "time_per_iteration": 3.238482713699341 + }, + { + "auxiliary_loss_clip": 0.06404348, + "auxiliary_loss_mlp": 0.01286038, + "balance_loss_clip": 0.06290346, + "balance_loss_mlp": 0.01281767, + "epoch": 0.2462648429280024, + "flos": 69174431003520.0, + "grad_norm": 0.6365745764429788, + "language_loss": 0.56240451, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.63930833, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04275513, + "step": 4096, + "time_per_iteration": 3.3192596435546875 + }, + { + "auxiliary_loss_clip": 0.06545975, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06318395, + "balance_loss_mlp": 0.01262143, + "epoch": 0.24632496618067037, + "flos": 29467140727680.0, + "grad_norm": 1.505356285132213, + "language_loss": 0.78075927, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85903859, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19812012, + "step": 4097, + "time_per_iteration": 2.617108106613159 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01279842, + "balance_loss_clip": 0.06315026, + "balance_loss_mlp": 0.01259993, + "epoch": 0.24638508943333834, + "flos": 24323341484160.0, + "grad_norm": 2.0372573834811267, + "language_loss": 0.77321315, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.85148549, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.1986084, + "step": 4098, + "time_per_iteration": 2.6069419384002686 + }, + { + "auxiliary_loss_clip": 0.06542017, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.0631687, + "balance_loss_mlp": 0.01257341, + "epoch": 0.2464452126860063, + "flos": 26620236005760.0, + "grad_norm": 2.17921698337753, + "language_loss": 0.69183016, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.1817627, + "step": 4099, + "time_per_iteration": 2.655956506729126 + }, + { + "auxiliary_loss_clip": 0.06525709, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01256062, + "epoch": 0.24650533593867427, + "flos": 31220481813120.0, + "grad_norm": 2.2743270797915076, + "language_loss": 0.67268491, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.75068748, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18493652, + "step": 4100, + "time_per_iteration": 2.6497559547424316 + }, + { + "auxiliary_loss_clip": 0.0641202, + "auxiliary_loss_mlp": 0.01258309, + "balance_loss_clip": 0.06296985, + "balance_loss_mlp": 0.01253758, + "epoch": 0.24656545919134226, + "flos": 68513269962240.0, + "grad_norm": 0.6889590379062642, + "language_loss": 0.61607081, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.69277412, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.04559326, + "step": 4101, + "time_per_iteration": 3.2961082458496094 + }, + { + "auxiliary_loss_clip": 0.06538613, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06317261, + "balance_loss_mlp": 0.01259, + "epoch": 0.24662558244401023, + "flos": 20090398049280.0, + "grad_norm": 1.6193028382456236, + "language_loss": 0.73591036, + "learning_rate": 3.527601274535012e-06, + "loss": 0.81407589, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18945312, + "step": 4102, + "time_per_iteration": 2.542275905609131 + }, + { + "auxiliary_loss_clip": 0.0654332, + "auxiliary_loss_mlp": 0.01273749, + "balance_loss_clip": 0.06317908, + "balance_loss_mlp": 0.01255152, + "epoch": 0.2466857056966782, + "flos": 30709310780160.0, + "grad_norm": 2.0137613654817854, + "language_loss": 0.76325667, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.84142733, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18603516, + "step": 4103, + "time_per_iteration": 2.6544189453125 + }, + { + "auxiliary_loss_clip": 0.06542745, + "auxiliary_loss_mlp": 0.01273413, + "balance_loss_clip": 0.06315098, + "balance_loss_mlp": 0.01253159, + "epoch": 0.24674582894934616, + "flos": 22535102373120.0, + "grad_norm": 2.0816413841430697, + "language_loss": 0.79265451, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87081611, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20275879, + "step": 4104, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.06525403, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06306183, + "balance_loss_mlp": 0.01257251, + "epoch": 0.24680595220201412, + "flos": 20710581644160.0, + "grad_norm": 1.7450607123984514, + "language_loss": 0.83681756, + "learning_rate": 3.526846877170133e-06, + "loss": 0.9148404, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19641113, + "step": 4105, + "time_per_iteration": 2.553579330444336 + }, + { + "auxiliary_loss_clip": 0.06533727, + "auxiliary_loss_mlp": 0.01273598, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01255371, + "epoch": 0.2468660754546821, + "flos": 21836946954240.0, + "grad_norm": 1.9208859898797113, + "language_loss": 0.77469373, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85276699, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18212891, + "step": 4106, + "time_per_iteration": 2.5389256477355957 + }, + { + "auxiliary_loss_clip": 0.06534247, + "auxiliary_loss_mlp": 0.01276275, + "balance_loss_clip": 0.06310344, + "balance_loss_mlp": 0.01257463, + "epoch": 0.24692619870735008, + "flos": 15273049512960.0, + "grad_norm": 2.4615103155960485, + "language_loss": 0.73436344, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.81246865, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18798828, + "step": 4107, + "time_per_iteration": 2.5545566082000732 + }, + { + "auxiliary_loss_clip": 0.06538644, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06314194, + "balance_loss_mlp": 0.01256745, + "epoch": 0.24698632196001805, + "flos": 29687933036160.0, + "grad_norm": 2.1377324014009504, + "language_loss": 0.66432422, + "learning_rate": 3.526091958721587e-06, + "loss": 0.7424612, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18322754, + "step": 4108, + "time_per_iteration": 2.6196486949920654 + }, + { + "auxiliary_loss_clip": 0.06540007, + "auxiliary_loss_mlp": 0.01277779, + "balance_loss_clip": 0.06313555, + "balance_loss_mlp": 0.01259623, + "epoch": 0.247046445212686, + "flos": 39174736452480.0, + "grad_norm": 2.010829594577025, + "language_loss": 0.73608756, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18151855, + "step": 4109, + "time_per_iteration": 2.764406442642212 + }, + { + "auxiliary_loss_clip": 0.06534623, + "auxiliary_loss_mlp": 0.01277352, + "balance_loss_clip": 0.06311052, + "balance_loss_mlp": 0.01259077, + "epoch": 0.24710656846535398, + "flos": 23004834762240.0, + "grad_norm": 1.68605601916547, + "language_loss": 0.79419786, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.87231761, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.18249512, + "step": 4110, + "time_per_iteration": 2.5460774898529053 + }, + { + "auxiliary_loss_clip": 0.06540776, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 0.06313831, + "balance_loss_mlp": 0.01257032, + "epoch": 0.24716669171802194, + "flos": 26440085727360.0, + "grad_norm": 2.6454329848736604, + "language_loss": 0.81789577, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.89607012, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.19628906, + "step": 4111, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.06537174, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06311068, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2472268149706899, + "flos": 23336358641280.0, + "grad_norm": 1.983709335436533, + "language_loss": 0.75390071, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.83201408, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18115234, + "step": 4112, + "time_per_iteration": 2.5546083450317383 + }, + { + "auxiliary_loss_clip": 0.06533875, + "auxiliary_loss_mlp": 0.01274467, + "balance_loss_clip": 0.06308994, + "balance_loss_mlp": 0.01255548, + "epoch": 0.24728693822335787, + "flos": 23775469562880.0, + "grad_norm": 2.380234182887367, + "language_loss": 0.83472633, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91280973, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.18920898, + "step": 4113, + "time_per_iteration": 2.6223254203796387 + }, + { + "auxiliary_loss_clip": 0.06540644, + "auxiliary_loss_mlp": 0.01276865, + "balance_loss_clip": 0.06315883, + "balance_loss_mlp": 0.01257279, + "epoch": 0.24734706147602586, + "flos": 19323494755200.0, + "grad_norm": 2.0367731486494636, + "language_loss": 0.87924093, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.95741606, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19580078, + "step": 4114, + "time_per_iteration": 2.5495545864105225 + }, + { + "auxiliary_loss_clip": 0.06532501, + "auxiliary_loss_mlp": 0.01273212, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01255474, + "epoch": 0.24740718472869383, + "flos": 28044275345280.0, + "grad_norm": 1.9170399047542779, + "language_loss": 0.75640035, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.17736816, + "step": 4115, + "time_per_iteration": 2.6333982944488525 + }, + { + "auxiliary_loss_clip": 0.0642873, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06315603, + "balance_loss_mlp": 0.01259151, + "epoch": 0.2474673079813618, + "flos": 68129265899520.0, + "grad_norm": 0.63897767002188, + "language_loss": 0.58004332, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.65697974, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.05752563, + "step": 4116, + "time_per_iteration": 3.251235246658325 + }, + { + "auxiliary_loss_clip": 0.06532618, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 0.063094, + "balance_loss_mlp": 0.01257022, + "epoch": 0.24752743123402976, + "flos": 29470075620480.0, + "grad_norm": 1.407143363910891, + "language_loss": 0.8425988, + "learning_rate": 3.523824079451235e-06, + "loss": 0.92068678, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19152832, + "step": 4117, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.06425081, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0631275, + "balance_loss_mlp": 0.01262089, + "epoch": 0.24758755448669773, + "flos": 58367946908160.0, + "grad_norm": 0.8764773034828885, + "language_loss": 0.63508207, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.71200383, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.05001831, + "step": 4118, + "time_per_iteration": 3.052507162094116 + }, + { + "auxiliary_loss_clip": 0.0652981, + "auxiliary_loss_mlp": 0.01277419, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01258203, + "epoch": 0.2476476777393657, + "flos": 20490502095360.0, + "grad_norm": 1.7262960547494681, + "language_loss": 0.80051601, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87858826, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.19238281, + "step": 4119, + "time_per_iteration": 2.554318428039551 + }, + { + "auxiliary_loss_clip": 0.06530587, + "auxiliary_loss_mlp": 0.01282865, + "balance_loss_clip": 0.06310613, + "balance_loss_mlp": 0.01265198, + "epoch": 0.24770780099203366, + "flos": 20492179176960.0, + "grad_norm": 2.4192345138137386, + "language_loss": 0.74556476, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.8236993, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.17663574, + "step": 4120, + "time_per_iteration": 3.996234655380249 + }, + { + "auxiliary_loss_clip": 0.06531808, + "auxiliary_loss_mlp": 0.01276043, + "balance_loss_clip": 0.06307146, + "balance_loss_mlp": 0.01256362, + "epoch": 0.24776792424470165, + "flos": 15157915603200.0, + "grad_norm": 2.13486110959629, + "language_loss": 0.89734054, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97541904, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19689941, + "step": 4121, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.06540959, + "auxiliary_loss_mlp": 0.01278306, + "balance_loss_clip": 0.06314932, + "balance_loss_mlp": 0.01258744, + "epoch": 0.2478280474973696, + "flos": 21731833607040.0, + "grad_norm": 2.0829104418917646, + "language_loss": 0.69792116, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.77611381, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19580078, + "step": 4122, + "time_per_iteration": 2.5857455730438232 + }, + { + "auxiliary_loss_clip": 0.06535036, + "auxiliary_loss_mlp": 0.01273779, + "balance_loss_clip": 0.0630946, + "balance_loss_mlp": 0.01254729, + "epoch": 0.24788817075003758, + "flos": 20418400056960.0, + "grad_norm": 2.5894895086667264, + "language_loss": 0.80832231, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88641047, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.533696174621582 + }, + { + "auxiliary_loss_clip": 0.06528741, + "auxiliary_loss_mlp": 0.01276684, + "balance_loss_clip": 0.06306656, + "balance_loss_mlp": 0.01259625, + "epoch": 0.24794829400270554, + "flos": 22599867179520.0, + "grad_norm": 2.45373622595604, + "language_loss": 0.75091624, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.82897043, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1706543, + "step": 4124, + "time_per_iteration": 2.5478947162628174 + }, + { + "auxiliary_loss_clip": 0.06523614, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01258708, + "epoch": 0.2480084172553735, + "flos": 39685362433920.0, + "grad_norm": 1.4066224864196382, + "language_loss": 0.74510413, + "learning_rate": 3.521804257268357e-06, + "loss": 0.82310236, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.17504883, + "step": 4125, + "time_per_iteration": 4.164500951766968 + }, + { + "auxiliary_loss_clip": 0.06546921, + "auxiliary_loss_mlp": 0.01279637, + "balance_loss_clip": 0.06313127, + "balance_loss_mlp": 0.01260599, + "epoch": 0.24806854050804147, + "flos": 22060129104000.0, + "grad_norm": 1.9518521214536066, + "language_loss": 0.69807184, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.77633739, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.19030762, + "step": 4126, + "time_per_iteration": 2.520550489425659 + }, + { + "auxiliary_loss_clip": 0.06526291, + "auxiliary_loss_mlp": 0.0127589, + "balance_loss_clip": 0.06304894, + "balance_loss_mlp": 0.01257281, + "epoch": 0.24812866376070947, + "flos": 15492164739840.0, + "grad_norm": 2.6036079521490834, + "language_loss": 0.81805199, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.89607382, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18615723, + "step": 4127, + "time_per_iteration": 4.052755832672119 + }, + { + "auxiliary_loss_clip": 0.06533966, + "auxiliary_loss_mlp": 0.012739, + "balance_loss_clip": 0.06306454, + "balance_loss_mlp": 0.01255494, + "epoch": 0.24818878701337743, + "flos": 14762758947840.0, + "grad_norm": 2.4130643839940746, + "language_loss": 0.85122234, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.92930102, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.18395996, + "step": 4128, + "time_per_iteration": 2.5801029205322266 + }, + { + "auxiliary_loss_clip": 0.06541854, + "auxiliary_loss_mlp": 0.01278965, + "balance_loss_clip": 0.06316209, + "balance_loss_mlp": 0.01260821, + "epoch": 0.2482489102660454, + "flos": 27096886356480.0, + "grad_norm": 2.0112959815575713, + "language_loss": 0.66149813, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.73970628, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18151855, + "step": 4129, + "time_per_iteration": 2.5865726470947266 + }, + { + "auxiliary_loss_clip": 0.06528358, + "auxiliary_loss_mlp": 0.01276243, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.01257444, + "epoch": 0.24830903351871336, + "flos": 26474522555520.0, + "grad_norm": 1.7021812681223303, + "language_loss": 0.75761282, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83565885, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18811035, + "step": 4130, + "time_per_iteration": 2.6659512519836426 + }, + { + "auxiliary_loss_clip": 0.06526491, + "auxiliary_loss_mlp": 0.01274514, + "balance_loss_clip": 0.06302534, + "balance_loss_mlp": 0.01255, + "epoch": 0.24836915677138133, + "flos": 10232225337600.0, + "grad_norm": 2.0871707802719004, + "language_loss": 0.77625716, + "learning_rate": 3.520286966670535e-06, + "loss": 0.85426718, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.19519043, + "step": 4131, + "time_per_iteration": 3.906522274017334 + }, + { + "auxiliary_loss_clip": 0.06519566, + "auxiliary_loss_mlp": 0.01270892, + "balance_loss_clip": 0.0630278, + "balance_loss_mlp": 0.01253582, + "epoch": 0.2484292800240493, + "flos": 30088162863360.0, + "grad_norm": 1.7622390062278706, + "language_loss": 0.84475207, + "learning_rate": 3.520033883075255e-06, + "loss": 0.92265671, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.17297363, + "step": 4132, + "time_per_iteration": 2.6436057090759277 + }, + { + "auxiliary_loss_clip": 0.06525066, + "auxiliary_loss_mlp": 0.01275924, + "balance_loss_clip": 0.06302708, + "balance_loss_mlp": 0.01256779, + "epoch": 0.24848940327671726, + "flos": 13447899878400.0, + "grad_norm": 1.545647189211169, + "language_loss": 0.71393758, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19152832, + "step": 4133, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.06542444, + "auxiliary_loss_mlp": 0.01275489, + "balance_loss_clip": 0.06309851, + "balance_loss_mlp": 0.01255116, + "epoch": 0.24854952652938525, + "flos": 19975683409920.0, + "grad_norm": 2.3352452144714513, + "language_loss": 0.6286931, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.70687246, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20373535, + "step": 4134, + "time_per_iteration": 2.571525812149048 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01258883, + "epoch": 0.24860964978205322, + "flos": 18156026217600.0, + "grad_norm": 1.960513817978903, + "language_loss": 0.79140246, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86942399, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18383789, + "step": 4135, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06524552, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06303368, + "balance_loss_mlp": 0.01256294, + "epoch": 0.24866977303472118, + "flos": 11733397960320.0, + "grad_norm": 2.2852251503119234, + "language_loss": 0.8410641, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.9190594, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18676758, + "step": 4136, + "time_per_iteration": 2.497654676437378 + }, + { + "auxiliary_loss_clip": 0.06524116, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06297501, + "balance_loss_mlp": 0.01254521, + "epoch": 0.24872989628738915, + "flos": 34832109623040.0, + "grad_norm": 1.7046352309858128, + "language_loss": 0.71601558, + "learning_rate": 3.518767600693314e-06, + "loss": 0.79399109, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18908691, + "step": 4137, + "time_per_iteration": 2.732480764389038 + }, + { + "auxiliary_loss_clip": 0.06525281, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 0.06299166, + "balance_loss_mlp": 0.01255549, + "epoch": 0.2487900195400571, + "flos": 13704512607360.0, + "grad_norm": 2.5230361612400296, + "language_loss": 0.67583597, + "learning_rate": 3.518514171403042e-06, + "loss": 0.7538265, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.18212891, + "step": 4138, + "time_per_iteration": 2.531855583190918 + }, + { + "auxiliary_loss_clip": 0.06519014, + "auxiliary_loss_mlp": 0.01272692, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.01254501, + "epoch": 0.24885014279272508, + "flos": 25344845009280.0, + "grad_norm": 1.9341473695701388, + "language_loss": 0.83479851, + "learning_rate": 3.51826068453056e-06, + "loss": 0.91271555, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.18188477, + "step": 4139, + "time_per_iteration": 2.6051557064056396 + }, + { + "auxiliary_loss_clip": 0.06528804, + "auxiliary_loss_mlp": 0.01275882, + "balance_loss_clip": 0.06300579, + "balance_loss_mlp": 0.01255711, + "epoch": 0.24891026604539307, + "flos": 20637724919040.0, + "grad_norm": 1.6977646822397727, + "language_loss": 0.79297662, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20178223, + "step": 4140, + "time_per_iteration": 2.5448291301727295 + }, + { + "auxiliary_loss_clip": 0.0641291, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06303305, + "balance_loss_mlp": 0.0126555, + "epoch": 0.24897038929806103, + "flos": 66979086030720.0, + "grad_norm": 0.8107945435966392, + "language_loss": 0.60717231, + "learning_rate": 3.51775353807742e-06, + "loss": 0.68400407, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.04705811, + "step": 4141, + "time_per_iteration": 3.2685940265655518 + }, + { + "auxiliary_loss_clip": 0.06525983, + "auxiliary_loss_mlp": 0.01275717, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01256894, + "epoch": 0.249030512550729, + "flos": 36401359288320.0, + "grad_norm": 1.7802793710753735, + "language_loss": 0.72871864, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.80673563, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18823242, + "step": 4142, + "time_per_iteration": 2.6564056873321533 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01276725, + "balance_loss_clip": 0.06302793, + "balance_loss_mlp": 0.0125789, + "epoch": 0.24909063580339696, + "flos": 20160361808640.0, + "grad_norm": 1.9535741137498925, + "language_loss": 0.81280798, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8908, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18823242, + "step": 4143, + "time_per_iteration": 2.5795881748199463 + }, + { + "auxiliary_loss_clip": 0.06522508, + "auxiliary_loss_mlp": 0.01275624, + "balance_loss_clip": 0.06301625, + "balance_loss_mlp": 0.01257039, + "epoch": 0.24915075905606493, + "flos": 26403887963520.0, + "grad_norm": 1.964912825826696, + "language_loss": 0.59448719, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.67246854, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18579102, + "step": 4144, + "time_per_iteration": 2.5888898372650146 + }, + { + "auxiliary_loss_clip": 0.06520054, + "auxiliary_loss_mlp": 0.01279478, + "balance_loss_clip": 0.06300642, + "balance_loss_mlp": 0.01260608, + "epoch": 0.2492108823087329, + "flos": 27534655612800.0, + "grad_norm": 2.2926576094039253, + "language_loss": 0.79198605, + "learning_rate": 3.516738554607708e-06, + "loss": 0.86998141, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18859863, + "step": 4145, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.06539698, + "auxiliary_loss_mlp": 0.01282889, + "balance_loss_clip": 0.06307465, + "balance_loss_mlp": 0.01262587, + "epoch": 0.24927100556140086, + "flos": 16697088852480.0, + "grad_norm": 2.388513156986414, + "language_loss": 0.65914291, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.73736882, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20300293, + "step": 4146, + "time_per_iteration": 2.550225019454956 + }, + { + "auxiliary_loss_clip": 0.06418058, + "auxiliary_loss_mlp": 0.01257626, + "balance_loss_clip": 0.06307501, + "balance_loss_mlp": 0.01252389, + "epoch": 0.24933112881406885, + "flos": 62791899724800.0, + "grad_norm": 0.9255702942051489, + "language_loss": 0.67495543, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.75171226, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05239868, + "step": 4147, + "time_per_iteration": 3.2676596641540527 + }, + { + "auxiliary_loss_clip": 0.06525366, + "auxiliary_loss_mlp": 0.01281982, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01261764, + "epoch": 0.24939125206673682, + "flos": 26659242881280.0, + "grad_norm": 1.678024692441642, + "language_loss": 0.89250457, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.97057807, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.20214844, + "step": 4148, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06529681, + "auxiliary_loss_mlp": 0.01281757, + "balance_loss_clip": 0.06300169, + "balance_loss_mlp": 0.0125968, + "epoch": 0.24945137531940478, + "flos": 20710623571200.0, + "grad_norm": 1.8952521518004763, + "language_loss": 0.68350649, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.76162088, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22070312, + "step": 4149, + "time_per_iteration": 2.52567720413208 + }, + { + "auxiliary_loss_clip": 0.06528307, + "auxiliary_loss_mlp": 0.0128627, + "balance_loss_clip": 0.06306647, + "balance_loss_mlp": 0.01266398, + "epoch": 0.24951149857207275, + "flos": 23775385708800.0, + "grad_norm": 1.639238516163445, + "language_loss": 0.71759897, + "learning_rate": 3.515468531258095e-06, + "loss": 0.79574472, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1986084, + "step": 4150, + "time_per_iteration": 2.580000877380371 + }, + { + "auxiliary_loss_clip": 0.06529218, + "auxiliary_loss_mlp": 0.01284871, + "balance_loss_clip": 0.06303831, + "balance_loss_mlp": 0.01264129, + "epoch": 0.2495716218247407, + "flos": 15669589760640.0, + "grad_norm": 1.939767404293352, + "language_loss": 0.73002028, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80816114, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20739746, + "step": 4151, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.06534886, + "auxiliary_loss_mlp": 0.01281273, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01261055, + "epoch": 0.24963174507740868, + "flos": 24057924076800.0, + "grad_norm": 4.265592628376469, + "language_loss": 0.64070994, + "learning_rate": 3.514960119583781e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20227051, + "step": 4152, + "time_per_iteration": 2.5687365531921387 + }, + { + "auxiliary_loss_clip": 0.06516105, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 0.06296911, + "balance_loss_mlp": 0.01259979, + "epoch": 0.24969186833007664, + "flos": 21806073924480.0, + "grad_norm": 2.335025994250793, + "language_loss": 0.7798419, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85780108, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19812012, + "step": 4153, + "time_per_iteration": 2.5565860271453857 + }, + { + "auxiliary_loss_clip": 0.06523906, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06304043, + "balance_loss_mlp": 0.01257806, + "epoch": 0.24975199158274464, + "flos": 19944307255680.0, + "grad_norm": 2.3946475317027978, + "language_loss": 0.77287221, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19152832, + "step": 4154, + "time_per_iteration": 2.5327064990997314 + }, + { + "auxiliary_loss_clip": 0.06533594, + "auxiliary_loss_mlp": 0.0128089, + "balance_loss_clip": 0.06299926, + "balance_loss_mlp": 0.01258145, + "epoch": 0.2498121148354126, + "flos": 25345515841920.0, + "grad_norm": 1.7912237432514402, + "language_loss": 0.71052945, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78867429, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22766113, + "step": 4155, + "time_per_iteration": 2.566044330596924 + }, + { + "auxiliary_loss_clip": 0.06528749, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06300025, + "balance_loss_mlp": 0.01257809, + "epoch": 0.24987223808808057, + "flos": 20565119756160.0, + "grad_norm": 1.6974291352944781, + "language_loss": 0.75592315, + "learning_rate": 3.513942606943036e-06, + "loss": 0.83399028, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20141602, + "step": 4156, + "time_per_iteration": 2.5388355255126953 + }, + { + "auxiliary_loss_clip": 0.06524897, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.0125842, + "epoch": 0.24993236134074853, + "flos": 19754052560640.0, + "grad_norm": 3.125892113983293, + "language_loss": 0.77757698, + "learning_rate": 3.513688085236591e-06, + "loss": 0.85561097, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.20068359, + "step": 4157, + "time_per_iteration": 2.5327329635620117 + }, + { + "auxiliary_loss_clip": 0.06527505, + "auxiliary_loss_mlp": 0.012775, + "balance_loss_clip": 0.06301083, + "balance_loss_mlp": 0.01257068, + "epoch": 0.2499924845934165, + "flos": 18776209812480.0, + "grad_norm": 1.8891569690037928, + "language_loss": 0.82203197, + "learning_rate": 3.513433506130942e-06, + "loss": 0.90008199, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20422363, + "step": 4158, + "time_per_iteration": 2.5894827842712402 + }, + { + "auxiliary_loss_clip": 0.06518973, + "auxiliary_loss_mlp": 0.01272913, + "balance_loss_clip": 0.06295922, + "balance_loss_mlp": 0.012544, + "epoch": 0.25005260784608446, + "flos": 16877658401280.0, + "grad_norm": 2.206587551308884, + "language_loss": 0.75718945, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.83510834, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18505859, + "step": 4159, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.06529576, + "auxiliary_loss_mlp": 0.01278956, + "balance_loss_clip": 0.06300279, + "balance_loss_mlp": 0.01258142, + "epoch": 0.2501127310987524, + "flos": 22131057185280.0, + "grad_norm": 2.1699031495969354, + "language_loss": 0.71598893, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7940743, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.20812988, + "step": 4160, + "time_per_iteration": 3.9746532440185547 + }, + { + "auxiliary_loss_clip": 0.06424317, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06313459, + "balance_loss_mlp": 0.01263326, + "epoch": 0.2501728543514204, + "flos": 69480071170560.0, + "grad_norm": 0.7438462037708533, + "language_loss": 0.56844532, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.64536446, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.04278564, + "step": 4161, + "time_per_iteration": 3.233760356903076 + }, + { + "auxiliary_loss_clip": 0.06530809, + "auxiliary_loss_mlp": 0.01282686, + "balance_loss_clip": 0.06298731, + "balance_loss_mlp": 0.01261848, + "epoch": 0.25023297760408836, + "flos": 16295601214080.0, + "grad_norm": 2.49700797922569, + "language_loss": 0.8179751, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.89611006, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20849609, + "step": 4162, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.0652239, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 0.06294353, + "balance_loss_mlp": 0.01260358, + "epoch": 0.2502931008567563, + "flos": 12242598422400.0, + "grad_norm": 2.2503072324763616, + "language_loss": 0.88019562, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.95822597, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.203125, + "step": 4163, + "time_per_iteration": 2.531467914581299 + }, + { + "auxiliary_loss_clip": 0.06520548, + "auxiliary_loss_mlp": 0.01277405, + "balance_loss_clip": 0.06293885, + "balance_loss_mlp": 0.01257092, + "epoch": 0.25035322410942434, + "flos": 23188003787520.0, + "grad_norm": 1.6365124228332002, + "language_loss": 0.83867121, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91665077, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20300293, + "step": 4164, + "time_per_iteration": 4.068189382553101 + }, + { + "auxiliary_loss_clip": 0.06509531, + "auxiliary_loss_mlp": 0.01280667, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01262106, + "epoch": 0.2504133473620923, + "flos": 20922904690560.0, + "grad_norm": 1.788160941639295, + "language_loss": 0.7460506, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18579102, + "step": 4165, + "time_per_iteration": 2.568701982498169 + }, + { + "auxiliary_loss_clip": 0.06526586, + "auxiliary_loss_mlp": 0.01278077, + "balance_loss_clip": 0.06293961, + "balance_loss_mlp": 0.01257883, + "epoch": 0.2504734706147603, + "flos": 20782725609600.0, + "grad_norm": 1.8100288551258081, + "language_loss": 0.74429101, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.82233763, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2019043, + "step": 4166, + "time_per_iteration": 3.989368438720703 + }, + { + "auxiliary_loss_clip": 0.065147, + "auxiliary_loss_mlp": 0.01277163, + "balance_loss_clip": 0.06293219, + "balance_loss_mlp": 0.0125778, + "epoch": 0.25053359386742824, + "flos": 24355681960320.0, + "grad_norm": 1.5960764456675967, + "language_loss": 0.82469785, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.90261644, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19384766, + "step": 4167, + "time_per_iteration": 2.554733991622925 + }, + { + "auxiliary_loss_clip": 0.06513357, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 0.06292276, + "balance_loss_mlp": 0.01260614, + "epoch": 0.2505937171200962, + "flos": 21220578720000.0, + "grad_norm": 1.9887592956808484, + "language_loss": 0.80394876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.88188636, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19799805, + "step": 4168, + "time_per_iteration": 2.576969623565674 + }, + { + "auxiliary_loss_clip": 0.06531397, + "auxiliary_loss_mlp": 0.01277594, + "balance_loss_clip": 0.06300385, + "balance_loss_mlp": 0.01256196, + "epoch": 0.25065384037276417, + "flos": 41436816802560.0, + "grad_norm": 4.930314721126017, + "language_loss": 0.69985271, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7779426, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21386719, + "step": 4169, + "time_per_iteration": 2.709149122238159 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01277868, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.0125827, + "epoch": 0.25071396362543213, + "flos": 26109274608000.0, + "grad_norm": 1.904216953279787, + "language_loss": 0.77927327, + "learning_rate": 3.510374083241361e-06, + "loss": 0.85716957, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19592285, + "step": 4170, + "time_per_iteration": 4.016170024871826 + }, + { + "auxiliary_loss_clip": 0.0651409, + "auxiliary_loss_mlp": 0.01278168, + "balance_loss_clip": 0.06291165, + "balance_loss_mlp": 0.01258975, + "epoch": 0.2507740868781001, + "flos": 19105008433920.0, + "grad_norm": 2.5077494433812966, + "language_loss": 0.76900339, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1920166, + "step": 4171, + "time_per_iteration": 2.5651609897613525 + }, + { + "auxiliary_loss_clip": 0.06406491, + "auxiliary_loss_mlp": 0.01262132, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01257083, + "epoch": 0.25083421013076806, + "flos": 64361652514560.0, + "grad_norm": 0.8214086964760371, + "language_loss": 0.6006844, + "learning_rate": 3.509863377145458e-06, + "loss": 0.67737067, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05047607, + "step": 4172, + "time_per_iteration": 3.1837103366851807 + }, + { + "auxiliary_loss_clip": 0.06520402, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.012603, + "epoch": 0.25089433338343603, + "flos": 24286430960640.0, + "grad_norm": 1.3489665028935822, + "language_loss": 0.79424238, + "learning_rate": 3.509607938211409e-06, + "loss": 0.87225777, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20849609, + "step": 4173, + "time_per_iteration": 2.6214826107025146 + }, + { + "auxiliary_loss_clip": 0.06513289, + "auxiliary_loss_mlp": 0.01273745, + "balance_loss_clip": 0.06291197, + "balance_loss_mlp": 0.01254398, + "epoch": 0.250954456636104, + "flos": 14726896600320.0, + "grad_norm": 1.8312177549547823, + "language_loss": 0.83930022, + "learning_rate": 3.509352442032875e-06, + "loss": 0.91717052, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19360352, + "step": 4174, + "time_per_iteration": 2.5973377227783203 + }, + { + "auxiliary_loss_clip": 0.06519122, + "auxiliary_loss_mlp": 0.0127901, + "balance_loss_clip": 0.0629285, + "balance_loss_mlp": 0.01259341, + "epoch": 0.25101457988877196, + "flos": 22280208652800.0, + "grad_norm": 2.088546315652338, + "language_loss": 0.71558678, + "learning_rate": 3.509096888619545e-06, + "loss": 0.79356813, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19665527, + "step": 4175, + "time_per_iteration": 2.6718719005584717 + }, + { + "auxiliary_loss_clip": 0.06522886, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06295571, + "balance_loss_mlp": 0.01256502, + "epoch": 0.2510747031414399, + "flos": 25195441979520.0, + "grad_norm": 1.9595604726907228, + "language_loss": 0.81335604, + "learning_rate": 3.50884127798111e-06, + "loss": 0.891361, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2109375, + "step": 4176, + "time_per_iteration": 2.5455691814422607 + }, + { + "auxiliary_loss_clip": 0.06515132, + "auxiliary_loss_mlp": 0.01279504, + "balance_loss_clip": 0.06292217, + "balance_loss_mlp": 0.01257319, + "epoch": 0.25113482639410795, + "flos": 20710455863040.0, + "grad_norm": 1.8805810902271358, + "language_loss": 0.83346581, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.91141224, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.22167969, + "step": 4177, + "time_per_iteration": 2.5471949577331543 + }, + { + "auxiliary_loss_clip": 0.06520942, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 0.06300486, + "balance_loss_mlp": 0.01256375, + "epoch": 0.2511949496467759, + "flos": 21513347285760.0, + "grad_norm": 2.081094632338002, + "language_loss": 0.83410418, + "learning_rate": 3.508329885067698e-06, + "loss": 0.91207987, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20251465, + "step": 4178, + "time_per_iteration": 2.5352370738983154 + }, + { + "auxiliary_loss_clip": 0.06514454, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01255949, + "epoch": 0.2512550728994439, + "flos": 20707898313600.0, + "grad_norm": 2.160080340734635, + "language_loss": 0.75744665, + "learning_rate": 3.508074102812112e-06, + "loss": 0.83533603, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.18554688, + "step": 4179, + "time_per_iteration": 2.560995578765869 + }, + { + "auxiliary_loss_clip": 0.0652363, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06298499, + "balance_loss_mlp": 0.01261053, + "epoch": 0.25131519615211184, + "flos": 18484531349760.0, + "grad_norm": 2.0850842878171347, + "language_loss": 0.70515448, + "learning_rate": 3.507818263370206e-06, + "loss": 0.78321338, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2121582, + "step": 4180, + "time_per_iteration": 2.510233163833618 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01275296, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.0125565, + "epoch": 0.2513753194047798, + "flos": 20491131000960.0, + "grad_norm": 1.8144815234901748, + "language_loss": 0.86591852, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.94378912, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19628906, + "step": 4181, + "time_per_iteration": 2.546736240386963 + }, + { + "auxiliary_loss_clip": 0.06519435, + "auxiliary_loss_mlp": 0.01276165, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01256555, + "epoch": 0.25143544265744777, + "flos": 37679182053120.0, + "grad_norm": 1.8572714108551465, + "language_loss": 0.68626046, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76421642, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19616699, + "step": 4182, + "time_per_iteration": 2.6632721424102783 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.0630056, + "balance_loss_mlp": 0.012679, + "epoch": 0.25149556591011574, + "flos": 69386502487680.0, + "grad_norm": 0.837431587640593, + "language_loss": 0.70118701, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.77799207, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.03881836, + "step": 4183, + "time_per_iteration": 3.194293737411499 + }, + { + "auxiliary_loss_clip": 0.0651418, + "auxiliary_loss_mlp": 0.01278526, + "balance_loss_clip": 0.06292195, + "balance_loss_mlp": 0.01258725, + "epoch": 0.2515556891627837, + "flos": 13995478310400.0, + "grad_norm": 2.4106350957321805, + "language_loss": 0.74627292, + "learning_rate": 3.506794333933431e-06, + "loss": 0.82419991, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.19799805, + "step": 4184, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.0652144, + "auxiliary_loss_mlp": 0.01279322, + "balance_loss_clip": 0.06299628, + "balance_loss_mlp": 0.01258496, + "epoch": 0.25161581241545167, + "flos": 22170022133760.0, + "grad_norm": 2.9216799071507964, + "language_loss": 0.83484751, + "learning_rate": 3.506538208705484e-06, + "loss": 0.91285515, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.20837402, + "step": 4185, + "time_per_iteration": 2.5535552501678467 + }, + { + "auxiliary_loss_clip": 0.06393237, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06284703, + "balance_loss_mlp": 0.01258632, + "epoch": 0.25167593566811963, + "flos": 69375936873600.0, + "grad_norm": 0.7619629684954553, + "language_loss": 0.61517715, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.69173163, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.03567505, + "step": 4186, + "time_per_iteration": 3.0749270915985107 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06296861, + "balance_loss_mlp": 0.01256946, + "epoch": 0.2517360589207876, + "flos": 13266533715840.0, + "grad_norm": 1.9855339768496567, + "language_loss": 0.79795682, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.87589443, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.1940918, + "step": 4187, + "time_per_iteration": 2.507354974746704 + }, + { + "auxiliary_loss_clip": 0.06517795, + "auxiliary_loss_mlp": 0.01276527, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01257001, + "epoch": 0.25179618217345556, + "flos": 20383208542080.0, + "grad_norm": 1.642205422551737, + "language_loss": 0.80147833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87942159, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4188, + "time_per_iteration": 2.5763680934906006 + }, + { + "auxiliary_loss_clip": 0.06512115, + "auxiliary_loss_mlp": 0.01281194, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261405, + "epoch": 0.25185630542612353, + "flos": 27670767770880.0, + "grad_norm": 1.9118309511671905, + "language_loss": 0.75198257, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.8299157, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 4189, + "time_per_iteration": 2.5764901638031006 + }, + { + "auxiliary_loss_clip": 0.06511948, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06296545, + "balance_loss_mlp": 0.01255253, + "epoch": 0.25191642867879155, + "flos": 21002805158400.0, + "grad_norm": 1.9652552730181423, + "language_loss": 0.84938216, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92722976, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17565918, + "step": 4190, + "time_per_iteration": 2.592289447784424 + }, + { + "auxiliary_loss_clip": 0.06519347, + "auxiliary_loss_mlp": 0.01277887, + "balance_loss_clip": 0.0629743, + "balance_loss_mlp": 0.01256513, + "epoch": 0.2519765519314595, + "flos": 21112027355520.0, + "grad_norm": 3.618444667756858, + "language_loss": 0.7581113, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.83608365, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21386719, + "step": 4191, + "time_per_iteration": 2.526263952255249 + }, + { + "auxiliary_loss_clip": 0.06391463, + "auxiliary_loss_mlp": 0.01256383, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252372, + "epoch": 0.2520366751841275, + "flos": 62765932158720.0, + "grad_norm": 0.7119135795788611, + "language_loss": 0.56952, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.64599848, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0401001, + "step": 4192, + "time_per_iteration": 3.271810531616211 + }, + { + "auxiliary_loss_clip": 0.06513695, + "auxiliary_loss_mlp": 0.01277171, + "balance_loss_clip": 0.06298056, + "balance_loss_mlp": 0.01257835, + "epoch": 0.25209679843679544, + "flos": 22236254386560.0, + "grad_norm": 1.9003966807864532, + "language_loss": 0.77017993, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84808856, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19335938, + "step": 4193, + "time_per_iteration": 2.57377028465271 + }, + { + "auxiliary_loss_clip": 0.06516427, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06290127, + "balance_loss_mlp": 0.01254573, + "epoch": 0.2521569216894634, + "flos": 12171502632960.0, + "grad_norm": 10.029516736128722, + "language_loss": 0.84954166, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.92744958, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19787598, + "step": 4194, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.06517825, + "auxiliary_loss_mlp": 0.01277837, + "balance_loss_clip": 0.06293463, + "balance_loss_mlp": 0.01258668, + "epoch": 0.2522170449421314, + "flos": 23707182885120.0, + "grad_norm": 1.454284137617771, + "language_loss": 0.88584, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.96379662, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19165039, + "step": 4195, + "time_per_iteration": 2.576735734939575 + }, + { + "auxiliary_loss_clip": 0.06516481, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01258258, + "epoch": 0.25227716819479934, + "flos": 20961073025280.0, + "grad_norm": 2.023401186655312, + "language_loss": 0.86073804, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93870831, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.22290039, + "step": 4196, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.06519768, + "auxiliary_loss_mlp": 0.01277786, + "balance_loss_clip": 0.06297043, + "balance_loss_mlp": 0.01258486, + "epoch": 0.2523372914474673, + "flos": 23338077649920.0, + "grad_norm": 1.7735111095668046, + "language_loss": 0.8382597, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.91623521, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19299316, + "step": 4197, + "time_per_iteration": 2.606966018676758 + }, + { + "auxiliary_loss_clip": 0.06523669, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06298, + "balance_loss_mlp": 0.01260898, + "epoch": 0.25239741470013527, + "flos": 36978217522560.0, + "grad_norm": 2.239450775339409, + "language_loss": 0.72922301, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.80727994, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.21130371, + "step": 4198, + "time_per_iteration": 2.6708526611328125 + }, + { + "auxiliary_loss_clip": 0.06527208, + "auxiliary_loss_mlp": 0.012804, + "balance_loss_clip": 0.06297485, + "balance_loss_mlp": 0.01258967, + "epoch": 0.25245753795280323, + "flos": 18521777289600.0, + "grad_norm": 2.0891954597653055, + "language_loss": 0.77475321, + "learning_rate": 3.50294646148888e-06, + "loss": 0.85282922, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.21447754, + "step": 4199, + "time_per_iteration": 3.9535269737243652 + }, + { + "auxiliary_loss_clip": 0.06522667, + "auxiliary_loss_mlp": 0.01277202, + "balance_loss_clip": 0.06296766, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2525176612054712, + "flos": 32353387741440.0, + "grad_norm": 1.7804914051128766, + "language_loss": 0.74169135, + "learning_rate": 3.502689480360739e-06, + "loss": 0.81969011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19714355, + "step": 4200, + "time_per_iteration": 2.637592315673828 + }, + { + "auxiliary_loss_clip": 0.06517747, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06294595, + "balance_loss_mlp": 0.01255602, + "epoch": 0.25257778445813917, + "flos": 45268440307200.0, + "grad_norm": 1.5897560976370495, + "language_loss": 0.82704282, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.90497398, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19775391, + "step": 4201, + "time_per_iteration": 2.740555763244629 + }, + { + "auxiliary_loss_clip": 0.06520839, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.06295383, + "balance_loss_mlp": 0.01259048, + "epoch": 0.25263790771080713, + "flos": 23374526976000.0, + "grad_norm": 1.712909977397354, + "language_loss": 0.75193971, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20361328, + "step": 4202, + "time_per_iteration": 2.55350661277771 + }, + { + "auxiliary_loss_clip": 0.06512797, + "auxiliary_loss_mlp": 0.01277812, + "balance_loss_clip": 0.06294158, + "balance_loss_mlp": 0.01258226, + "epoch": 0.25269803096347515, + "flos": 18520938748800.0, + "grad_norm": 3.10045167794265, + "language_loss": 0.73924601, + "learning_rate": 3.501918195122491e-06, + "loss": 0.81715208, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19592285, + "step": 4203, + "time_per_iteration": 2.539475917816162 + }, + { + "auxiliary_loss_clip": 0.06523657, + "auxiliary_loss_mlp": 0.01272979, + "balance_loss_clip": 0.0629805, + "balance_loss_mlp": 0.01252964, + "epoch": 0.2527581542161431, + "flos": 24617870985600.0, + "grad_norm": 1.4931409888350198, + "language_loss": 0.78306639, + "learning_rate": 3.501660986124297e-06, + "loss": 0.86103272, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20007324, + "step": 4204, + "time_per_iteration": 4.058368682861328 + }, + { + "auxiliary_loss_clip": 0.0651952, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06294288, + "balance_loss_mlp": 0.01258427, + "epoch": 0.2528182774688111, + "flos": 12646266266880.0, + "grad_norm": 2.5678524165435928, + "language_loss": 0.72629768, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.80427349, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19616699, + "step": 4205, + "time_per_iteration": 2.503054618835449 + }, + { + "auxiliary_loss_clip": 0.06508891, + "auxiliary_loss_mlp": 0.01281235, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01264164, + "epoch": 0.25287840072147905, + "flos": 46947331440000.0, + "grad_norm": 1.3326329418173375, + "language_loss": 0.76355231, + "learning_rate": 3.50114639730826e-06, + "loss": 0.84145361, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.17077637, + "step": 4206, + "time_per_iteration": 4.097341537475586 + }, + { + "auxiliary_loss_clip": 0.06516857, + "auxiliary_loss_mlp": 0.01278993, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.0126042, + "epoch": 0.252938523974147, + "flos": 18885641644800.0, + "grad_norm": 1.8849973173990275, + "language_loss": 0.79775047, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.875709, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18579102, + "step": 4207, + "time_per_iteration": 2.545203447341919 + }, + { + "auxiliary_loss_clip": 0.06511112, + "auxiliary_loss_mlp": 0.01280475, + "balance_loss_clip": 0.06293532, + "balance_loss_mlp": 0.01261628, + "epoch": 0.252998647226815, + "flos": 21441245247360.0, + "grad_norm": 1.449056492648579, + "language_loss": 0.76862776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84654361, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18859863, + "step": 4208, + "time_per_iteration": 2.540531873703003 + }, + { + "auxiliary_loss_clip": 0.06512551, + "auxiliary_loss_mlp": 0.01282266, + "balance_loss_clip": 0.06295963, + "balance_loss_mlp": 0.01264098, + "epoch": 0.25305877047948294, + "flos": 25448365128960.0, + "grad_norm": 1.8025422596027827, + "language_loss": 0.70108622, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77903438, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.1817627, + "step": 4209, + "time_per_iteration": 2.586179256439209 + }, + { + "auxiliary_loss_clip": 0.06401253, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06294125, + "balance_loss_mlp": 0.01251663, + "epoch": 0.2531188937321509, + "flos": 60205213457280.0, + "grad_norm": 0.7328516672129679, + "language_loss": 0.55096745, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.62754166, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.0451355, + "step": 4210, + "time_per_iteration": 4.676252841949463 + }, + { + "auxiliary_loss_clip": 0.06515378, + "auxiliary_loss_mlp": 0.01285614, + "balance_loss_clip": 0.06294395, + "balance_loss_mlp": 0.01265861, + "epoch": 0.25317901698481887, + "flos": 19688449213440.0, + "grad_norm": 2.0935195986224837, + "language_loss": 0.81166065, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.88967055, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19763184, + "step": 4211, + "time_per_iteration": 2.5251474380493164 + }, + { + "auxiliary_loss_clip": 0.06513076, + "auxiliary_loss_mlp": 0.01275756, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01258554, + "epoch": 0.25323914023748684, + "flos": 24431012380800.0, + "grad_norm": 1.7184165713115493, + "language_loss": 0.78543985, + "learning_rate": 3.499601265005622e-06, + "loss": 0.86332822, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4212, + "time_per_iteration": 2.609750986099243 + }, + { + "auxiliary_loss_clip": 0.06514729, + "auxiliary_loss_mlp": 0.01278491, + "balance_loss_clip": 0.06293602, + "balance_loss_mlp": 0.0125912, + "epoch": 0.2532992634901548, + "flos": 25454528403840.0, + "grad_norm": 1.862422609084939, + "language_loss": 0.53407073, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.61200291, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19384766, + "step": 4213, + "time_per_iteration": 2.5825159549713135 + }, + { + "auxiliary_loss_clip": 0.06517738, + "auxiliary_loss_mlp": 0.01278881, + "balance_loss_clip": 0.06296406, + "balance_loss_mlp": 0.01259832, + "epoch": 0.25335938674282277, + "flos": 18886605966720.0, + "grad_norm": 2.428420926128805, + "language_loss": 0.65041012, + "learning_rate": 3.499085765880308e-06, + "loss": 0.72837627, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19030762, + "step": 4214, + "time_per_iteration": 2.567539930343628 + }, + { + "auxiliary_loss_clip": 0.06391697, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01253702, + "epoch": 0.25341950999549073, + "flos": 53079692025600.0, + "grad_norm": 0.8253897319773601, + "language_loss": 0.57886475, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.65535849, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.03970337, + "step": 4215, + "time_per_iteration": 2.941021680831909 + }, + { + "auxiliary_loss_clip": 0.06512114, + "auxiliary_loss_mlp": 0.01274398, + "balance_loss_clip": 0.0629489, + "balance_loss_mlp": 0.0125604, + "epoch": 0.2534796332481587, + "flos": 39029609980800.0, + "grad_norm": 1.6071125602920209, + "language_loss": 0.84078032, + "learning_rate": 3.498570039373066e-06, + "loss": 0.9186455, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18359375, + "step": 4216, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.06509562, + "auxiliary_loss_mlp": 0.0127764, + "balance_loss_clip": 0.06290903, + "balance_loss_mlp": 0.01259294, + "epoch": 0.2535397565008267, + "flos": 23593809911040.0, + "grad_norm": 1.7865601815504963, + "language_loss": 0.81036615, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88823819, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.18371582, + "step": 4217, + "time_per_iteration": 2.5606398582458496 + }, + { + "auxiliary_loss_clip": 0.06514265, + "auxiliary_loss_mlp": 0.01279769, + "balance_loss_clip": 0.06294704, + "balance_loss_mlp": 0.01260255, + "epoch": 0.2535998797534947, + "flos": 19287422772480.0, + "grad_norm": 2.529157470409933, + "language_loss": 0.761132, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.83907235, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19519043, + "step": 4218, + "time_per_iteration": 2.623429298400879 + }, + { + "auxiliary_loss_clip": 0.06516235, + "auxiliary_loss_mlp": 0.01282224, + "balance_loss_clip": 0.06296211, + "balance_loss_mlp": 0.01262757, + "epoch": 0.25366000300616265, + "flos": 24031201824000.0, + "grad_norm": 1.721807278316132, + "language_loss": 0.75063616, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.82862079, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19482422, + "step": 4219, + "time_per_iteration": 2.564220428466797 + }, + { + "auxiliary_loss_clip": 0.06520407, + "auxiliary_loss_mlp": 0.0127968, + "balance_loss_clip": 0.06298073, + "balance_loss_mlp": 0.01259713, + "epoch": 0.2537201262588306, + "flos": 16294888454400.0, + "grad_norm": 1.6804083546431516, + "language_loss": 0.81834626, + "learning_rate": 3.497537904525736e-06, + "loss": 0.89634717, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19970703, + "step": 4220, + "time_per_iteration": 2.576335906982422 + }, + { + "auxiliary_loss_clip": 0.0652357, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.06301299, + "balance_loss_mlp": 0.01256936, + "epoch": 0.2537802495114986, + "flos": 23301376761600.0, + "grad_norm": 2.4535775533256796, + "language_loss": 0.71752739, + "learning_rate": 3.497279728822468e-06, + "loss": 0.79551834, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18579102, + "step": 4221, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.06528511, + "auxiliary_loss_mlp": 0.01279389, + "balance_loss_clip": 0.0630452, + "balance_loss_mlp": 0.01259148, + "epoch": 0.25384037276416654, + "flos": 17644855184640.0, + "grad_norm": 1.5017476973585115, + "language_loss": 0.62507772, + "learning_rate": 3.497021496342202e-06, + "loss": 0.70315671, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20239258, + "step": 4222, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.06520825, + "auxiliary_loss_mlp": 0.01278393, + "balance_loss_clip": 0.06297866, + "balance_loss_mlp": 0.0125864, + "epoch": 0.2539004960168345, + "flos": 21513473066880.0, + "grad_norm": 1.6064438591236823, + "language_loss": 0.75066334, + "learning_rate": 3.496763207094731e-06, + "loss": 0.82865554, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19763184, + "step": 4223, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.06514867, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06297616, + "balance_loss_mlp": 0.01260101, + "epoch": 0.2539606192695025, + "flos": 23957632339200.0, + "grad_norm": 1.753259760034452, + "language_loss": 0.80341679, + "learning_rate": 3.49650486108985e-06, + "loss": 0.88134897, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18261719, + "step": 4224, + "time_per_iteration": 2.6002583503723145 + }, + { + "auxiliary_loss_clip": 0.06515887, + "auxiliary_loss_mlp": 0.01281311, + "balance_loss_clip": 0.0629767, + "balance_loss_mlp": 0.01261999, + "epoch": 0.25402074252217044, + "flos": 24176537930880.0, + "grad_norm": 1.4707313275482783, + "language_loss": 0.78211224, + "learning_rate": 3.496246458337354e-06, + "loss": 0.8600843, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19299316, + "step": 4225, + "time_per_iteration": 2.5527138710021973 + }, + { + "auxiliary_loss_clip": 0.06521728, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01263013, + "epoch": 0.2540808657748384, + "flos": 22309320746880.0, + "grad_norm": 1.6188569007516582, + "language_loss": 0.85543132, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.93347526, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.1965332, + "step": 4226, + "time_per_iteration": 2.5676872730255127 + }, + { + "auxiliary_loss_clip": 0.06515788, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 0.06296097, + "balance_loss_mlp": 0.01258883, + "epoch": 0.25414098902750637, + "flos": 27606883432320.0, + "grad_norm": 1.6805883261517605, + "language_loss": 0.71414381, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79207766, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18713379, + "step": 4227, + "time_per_iteration": 2.5918691158294678 + }, + { + "auxiliary_loss_clip": 0.06387169, + "auxiliary_loss_mlp": 0.01261576, + "balance_loss_clip": 0.06279954, + "balance_loss_mlp": 0.01257166, + "epoch": 0.25420111228017434, + "flos": 58188760951680.0, + "grad_norm": 0.9697801274632529, + "language_loss": 0.61857057, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.69505799, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04418945, + "step": 4228, + "time_per_iteration": 3.01169490814209 + }, + { + "auxiliary_loss_clip": 0.06514917, + "auxiliary_loss_mlp": 0.01279347, + "balance_loss_clip": 0.0629469, + "balance_loss_mlp": 0.01258235, + "epoch": 0.2542612355328423, + "flos": 11467645136640.0, + "grad_norm": 2.3876652287650577, + "language_loss": 0.8721081, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.95005071, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21130371, + "step": 4229, + "time_per_iteration": 2.5960769653320312 + }, + { + "auxiliary_loss_clip": 0.06519967, + "auxiliary_loss_mlp": 0.01277589, + "balance_loss_clip": 0.06299049, + "balance_loss_mlp": 0.01257836, + "epoch": 0.2543213587855103, + "flos": 22972452359040.0, + "grad_norm": 2.100172466954555, + "language_loss": 0.78119314, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85916877, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19775391, + "step": 4230, + "time_per_iteration": 2.5483899116516113 + }, + { + "auxiliary_loss_clip": 0.06511904, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 0.06292608, + "balance_loss_mlp": 0.01257622, + "epoch": 0.2543814820381783, + "flos": 18257953109760.0, + "grad_norm": 2.00545114565419, + "language_loss": 0.75687885, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.83477509, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4231, + "time_per_iteration": 2.566326379776001 + }, + { + "auxiliary_loss_clip": 0.06520282, + "auxiliary_loss_mlp": 0.01278584, + "balance_loss_clip": 0.06300422, + "balance_loss_mlp": 0.01259761, + "epoch": 0.25444160529084625, + "flos": 15638129752320.0, + "grad_norm": 1.7887257039808522, + "language_loss": 0.74637282, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82436144, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18823242, + "step": 4232, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0652221, + "auxiliary_loss_mlp": 0.01293975, + "balance_loss_clip": 0.06303085, + "balance_loss_mlp": 0.01272947, + "epoch": 0.2545017285435142, + "flos": 24607431152640.0, + "grad_norm": 1.8617746927090988, + "language_loss": 0.87183899, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.95000088, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21032715, + "step": 4233, + "time_per_iteration": 2.6281485557556152 + }, + { + "auxiliary_loss_clip": 0.06505871, + "auxiliary_loss_mlp": 0.01278753, + "balance_loss_clip": 0.06294682, + "balance_loss_mlp": 0.01260442, + "epoch": 0.2545618517961822, + "flos": 24685654538880.0, + "grad_norm": 1.601433299567329, + "language_loss": 0.75604707, + "learning_rate": 3.493918281539737e-06, + "loss": 0.8338933, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18322754, + "step": 4234, + "time_per_iteration": 2.596642017364502 + }, + { + "auxiliary_loss_clip": 0.06514844, + "auxiliary_loss_mlp": 0.01287463, + "balance_loss_clip": 0.06292339, + "balance_loss_mlp": 0.01268938, + "epoch": 0.25462197504885015, + "flos": 23921937699840.0, + "grad_norm": 1.4560099290474922, + "language_loss": 0.75372213, + "learning_rate": 3.493659311850379e-06, + "loss": 0.83174521, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18518066, + "step": 4235, + "time_per_iteration": 2.592942953109741 + }, + { + "auxiliary_loss_clip": 0.06532556, + "auxiliary_loss_mlp": 0.01283911, + "balance_loss_clip": 0.06299181, + "balance_loss_mlp": 0.01261797, + "epoch": 0.2546820983015181, + "flos": 24796134547200.0, + "grad_norm": 1.9414760170646592, + "language_loss": 0.65519691, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.73336154, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22131348, + "step": 4236, + "time_per_iteration": 2.5583407878875732 + }, + { + "auxiliary_loss_clip": 0.06512251, + "auxiliary_loss_mlp": 0.01281938, + "balance_loss_clip": 0.06294776, + "balance_loss_mlp": 0.01262984, + "epoch": 0.2547422215541861, + "flos": 18740095902720.0, + "grad_norm": 1.5016735811799797, + "language_loss": 0.678509, + "learning_rate": 3.493141202562354e-06, + "loss": 0.75645095, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18945312, + "step": 4237, + "time_per_iteration": 2.5650389194488525 + }, + { + "auxiliary_loss_clip": 0.0651492, + "auxiliary_loss_mlp": 0.01282053, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01261394, + "epoch": 0.25480234480685404, + "flos": 21038751360000.0, + "grad_norm": 2.061881611294133, + "language_loss": 0.75628269, + "learning_rate": 3.492882062983333e-06, + "loss": 0.83425242, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20654297, + "step": 4238, + "time_per_iteration": 2.529883861541748 + }, + { + "auxiliary_loss_clip": 0.06513957, + "auxiliary_loss_mlp": 0.0127785, + "balance_loss_clip": 0.06292559, + "balance_loss_mlp": 0.01258287, + "epoch": 0.254862468059522, + "flos": 25089112748160.0, + "grad_norm": 1.8905919191970875, + "language_loss": 0.81253731, + "learning_rate": 3.492622866794074e-06, + "loss": 0.89045537, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19555664, + "step": 4239, + "time_per_iteration": 4.02100944519043 + }, + { + "auxiliary_loss_clip": 0.06508629, + "auxiliary_loss_mlp": 0.01294237, + "balance_loss_clip": 0.06291452, + "balance_loss_mlp": 0.01273471, + "epoch": 0.25492259131219, + "flos": 20564658558720.0, + "grad_norm": 1.7183169382614727, + "language_loss": 0.7800405, + "learning_rate": 3.492363614004407e-06, + "loss": 0.85806918, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2076416, + "step": 4240, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.06515411, + "auxiliary_loss_mlp": 0.01282684, + "balance_loss_clip": 0.06290809, + "balance_loss_mlp": 0.01262037, + "epoch": 0.25498271456485794, + "flos": 25048889988480.0, + "grad_norm": 1.7684080721058644, + "language_loss": 0.83764112, + "learning_rate": 3.492104304624162e-06, + "loss": 0.915622, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.20629883, + "step": 4241, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.06511963, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 0.06292334, + "balance_loss_mlp": 0.01262676, + "epoch": 0.2550428378175259, + "flos": 26185820912640.0, + "grad_norm": 1.7847215082139707, + "language_loss": 0.73873413, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.81667781, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4242, + "time_per_iteration": 2.6289515495300293 + }, + { + "auxiliary_loss_clip": 0.06517772, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 0.06296564, + "balance_loss_mlp": 0.01260398, + "epoch": 0.2551029610701939, + "flos": 15272420607360.0, + "grad_norm": 2.4567533637161896, + "language_loss": 0.72771823, + "learning_rate": 3.491585516131273e-06, + "loss": 0.80569565, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19567871, + "step": 4243, + "time_per_iteration": 3.9432499408721924 + }, + { + "auxiliary_loss_clip": 0.06515735, + "auxiliary_loss_mlp": 0.0127996, + "balance_loss_clip": 0.06295779, + "balance_loss_mlp": 0.01260195, + "epoch": 0.2551630843228619, + "flos": 18117774028800.0, + "grad_norm": 1.7474968125895491, + "language_loss": 0.82239074, + "learning_rate": 3.491326037038301e-06, + "loss": 0.90034771, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4244, + "time_per_iteration": 2.6024672985076904 + }, + { + "auxiliary_loss_clip": 0.06397872, + "auxiliary_loss_mlp": 0.01258297, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01253388, + "epoch": 0.25522320757552985, + "flos": 70543055266560.0, + "grad_norm": 0.6771353060664416, + "language_loss": 0.57579219, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.65235388, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04904175, + "step": 4245, + "time_per_iteration": 4.687421083450317 + }, + { + "auxiliary_loss_clip": 0.06516664, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 0.06290803, + "balance_loss_mlp": 0.01256628, + "epoch": 0.2552833308281978, + "flos": 22899679488000.0, + "grad_norm": 2.827648139992037, + "language_loss": 0.65781415, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20593262, + "step": 4246, + "time_per_iteration": 2.542945384979248 + }, + { + "auxiliary_loss_clip": 0.06504452, + "auxiliary_loss_mlp": 0.01278422, + "balance_loss_clip": 0.06290503, + "balance_loss_mlp": 0.01258455, + "epoch": 0.2553434540808658, + "flos": 22060003322880.0, + "grad_norm": 2.2137811054544003, + "language_loss": 0.82470047, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.90252924, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19970703, + "step": 4247, + "time_per_iteration": 2.5786685943603516 + }, + { + "auxiliary_loss_clip": 0.06521233, + "auxiliary_loss_mlp": 0.01279993, + "balance_loss_clip": 0.062906, + "balance_loss_mlp": 0.01257271, + "epoch": 0.25540357733353375, + "flos": 16549656393600.0, + "grad_norm": 2.135954108256579, + "language_loss": 0.83991635, + "learning_rate": 3.490287555252514e-06, + "loss": 0.91792852, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.22729492, + "step": 4248, + "time_per_iteration": 2.5408127307891846 + }, + { + "auxiliary_loss_clip": 0.06511332, + "auxiliary_loss_mlp": 0.01273979, + "balance_loss_clip": 0.062884, + "balance_loss_mlp": 0.01253773, + "epoch": 0.2554637005862017, + "flos": 17570531013120.0, + "grad_norm": 2.3193810219262585, + "language_loss": 0.84631854, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92417163, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.20202637, + "step": 4249, + "time_per_iteration": 4.003984212875366 + }, + { + "auxiliary_loss_clip": 0.06380495, + "auxiliary_loss_mlp": 0.01253384, + "balance_loss_clip": 0.06276014, + "balance_loss_mlp": 0.01249388, + "epoch": 0.2555238238388697, + "flos": 72263441698560.0, + "grad_norm": 0.7365466774710785, + "language_loss": 0.56168175, + "learning_rate": 3.489767975249115e-06, + "loss": 0.63802058, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03994751, + "step": 4250, + "time_per_iteration": 3.169614553451538 + }, + { + "auxiliary_loss_clip": 0.06511974, + "auxiliary_loss_mlp": 0.01277356, + "balance_loss_clip": 0.06289789, + "balance_loss_mlp": 0.01255433, + "epoch": 0.25558394709153764, + "flos": 24396323990400.0, + "grad_norm": 2.4378887831258527, + "language_loss": 0.81129342, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.88918668, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21936035, + "step": 4251, + "time_per_iteration": 2.576631784439087 + }, + { + "auxiliary_loss_clip": 0.06382731, + "auxiliary_loss_mlp": 0.01258719, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01254794, + "epoch": 0.2556440703442056, + "flos": 69251857776000.0, + "grad_norm": 0.7756464213587903, + "language_loss": 0.66132653, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.73774105, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03921509, + "step": 4252, + "time_per_iteration": 3.2080140113830566 + }, + { + "auxiliary_loss_clip": 0.06505658, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 0.06288829, + "balance_loss_mlp": 0.01255922, + "epoch": 0.2557041935968736, + "flos": 24870919916160.0, + "grad_norm": 1.8769862610793295, + "language_loss": 0.74028432, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.81808746, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18737793, + "step": 4253, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.06509089, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.01261746, + "epoch": 0.25576431684954154, + "flos": 22498694974080.0, + "grad_norm": 4.507455095580577, + "language_loss": 0.742535, + "learning_rate": 3.488728137415357e-06, + "loss": 0.82045132, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20800781, + "step": 4254, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.0651402, + "auxiliary_loss_mlp": 0.0127796, + "balance_loss_clip": 0.06292839, + "balance_loss_mlp": 0.01257253, + "epoch": 0.2558244401022095, + "flos": 19832569436160.0, + "grad_norm": 1.7853658258569405, + "language_loss": 0.81599152, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20703125, + "step": 4255, + "time_per_iteration": 2.5198400020599365 + }, + { + "auxiliary_loss_clip": 0.06507239, + "auxiliary_loss_mlp": 0.01282593, + "balance_loss_clip": 0.06290218, + "balance_loss_mlp": 0.01262304, + "epoch": 0.2558845633548775, + "flos": 23226968736000.0, + "grad_norm": 1.3889535500711463, + "language_loss": 0.85781598, + "learning_rate": 3.488207879742721e-06, + "loss": 0.93571424, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20275879, + "step": 4256, + "time_per_iteration": 2.6466193199157715 + }, + { + "auxiliary_loss_clip": 0.06518268, + "auxiliary_loss_mlp": 0.01279996, + "balance_loss_clip": 0.06292354, + "balance_loss_mlp": 0.01259432, + "epoch": 0.2559446866075455, + "flos": 16843682770560.0, + "grad_norm": 2.0395659723156814, + "language_loss": 0.75505483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.83303738, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20556641, + "step": 4257, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.06380453, + "auxiliary_loss_mlp": 0.01254162, + "balance_loss_clip": 0.06277193, + "balance_loss_mlp": 0.01249772, + "epoch": 0.25600480986021346, + "flos": 57612741258240.0, + "grad_norm": 0.7838298602570629, + "language_loss": 0.65205377, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.72839993, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04397583, + "step": 4258, + "time_per_iteration": 3.1310055255889893 + }, + { + "auxiliary_loss_clip": 0.06504042, + "auxiliary_loss_mlp": 0.01278745, + "balance_loss_clip": 0.06291071, + "balance_loss_mlp": 0.01257192, + "epoch": 0.2560649331128814, + "flos": 27827088762240.0, + "grad_norm": 1.6413095395992356, + "language_loss": 0.76769841, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2154541, + "step": 4259, + "time_per_iteration": 2.6200387477874756 + }, + { + "auxiliary_loss_clip": 0.06386054, + "auxiliary_loss_mlp": 0.01255029, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01250824, + "epoch": 0.2561250563655494, + "flos": 70972187552640.0, + "grad_norm": 0.7732791072218576, + "language_loss": 0.58378285, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.66019368, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04208374, + "step": 4260, + "time_per_iteration": 3.2671031951904297 + }, + { + "auxiliary_loss_clip": 0.06510498, + "auxiliary_loss_mlp": 0.01277826, + "balance_loss_clip": 0.06290598, + "balance_loss_mlp": 0.0125824, + "epoch": 0.25618517961821735, + "flos": 27018998386560.0, + "grad_norm": 1.6762593333812295, + "language_loss": 0.77063274, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84851599, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19580078, + "step": 4261, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.06510883, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261534, + "epoch": 0.2562453028708853, + "flos": 23073708418560.0, + "grad_norm": 1.5026397479094624, + "language_loss": 0.83196223, + "learning_rate": 3.486645752648842e-06, + "loss": 0.90988725, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20080566, + "step": 4262, + "time_per_iteration": 2.606386661529541 + }, + { + "auxiliary_loss_clip": 0.06520962, + "auxiliary_loss_mlp": 0.01278022, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.0125778, + "epoch": 0.2563054261235533, + "flos": 15126120178560.0, + "grad_norm": 2.976746783245639, + "language_loss": 0.7460134, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.82400322, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20239258, + "step": 4263, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.06511976, + "auxiliary_loss_mlp": 0.01275308, + "balance_loss_clip": 0.0629802, + "balance_loss_mlp": 0.01256008, + "epoch": 0.25636554937622125, + "flos": 27862238350080.0, + "grad_norm": 1.7189236473805392, + "language_loss": 0.83209884, + "learning_rate": 3.486124592522163e-06, + "loss": 0.90997171, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19299316, + "step": 4264, + "time_per_iteration": 2.5768978595733643 + }, + { + "auxiliary_loss_clip": 0.06522107, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06300539, + "balance_loss_mlp": 0.01255403, + "epoch": 0.2564256726288892, + "flos": 28912979750400.0, + "grad_norm": 2.7518222985569247, + "language_loss": 0.75264466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20202637, + "step": 4265, + "time_per_iteration": 2.6022770404815674 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01276084, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01256701, + "epoch": 0.2564857958815572, + "flos": 18520812967680.0, + "grad_norm": 2.7205564726060754, + "language_loss": 0.82059085, + "learning_rate": 3.485603206979513e-06, + "loss": 0.89849925, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19396973, + "step": 4266, + "time_per_iteration": 2.5768039226531982 + }, + { + "auxiliary_loss_clip": 0.06513181, + "auxiliary_loss_mlp": 0.01282165, + "balance_loss_clip": 0.06295994, + "balance_loss_mlp": 0.01263199, + "epoch": 0.25654591913422514, + "flos": 25814745106560.0, + "grad_norm": 2.256505464235654, + "language_loss": 0.79590619, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8738597, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.1895752, + "step": 4267, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.06512932, + "auxiliary_loss_mlp": 0.01282882, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01263439, + "epoch": 0.2566060423868931, + "flos": 19105805047680.0, + "grad_norm": 1.7450924080459818, + "language_loss": 0.79543281, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.87339091, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19421387, + "step": 4268, + "time_per_iteration": 2.532245635986328 + }, + { + "auxiliary_loss_clip": 0.06515032, + "auxiliary_loss_mlp": 0.01281336, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01261166, + "epoch": 0.25666616563956113, + "flos": 23849584099200.0, + "grad_norm": 1.6329297187056233, + "language_loss": 0.69106698, + "learning_rate": 3.484820706183595e-06, + "loss": 0.76903057, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.20153809, + "step": 4269, + "time_per_iteration": 2.7064032554626465 + }, + { + "auxiliary_loss_clip": 0.06520134, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01259016, + "epoch": 0.2567262888922291, + "flos": 14608366600320.0, + "grad_norm": 2.976489070793836, + "language_loss": 0.79361498, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8716023, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19580078, + "step": 4270, + "time_per_iteration": 2.5247366428375244 + }, + { + "auxiliary_loss_clip": 0.06528008, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.0630113, + "balance_loss_mlp": 0.0125899, + "epoch": 0.25678641214489706, + "flos": 32930791027200.0, + "grad_norm": 2.0785991894062104, + "language_loss": 0.68438745, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.76248461, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22717285, + "step": 4271, + "time_per_iteration": 2.6327364444732666 + }, + { + "auxiliary_loss_clip": 0.06521121, + "auxiliary_loss_mlp": 0.01277495, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01256395, + "epoch": 0.256846535397565, + "flos": 24106029120000.0, + "grad_norm": 1.3298745054932861, + "language_loss": 0.87827712, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.9562633, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2109375, + "step": 4272, + "time_per_iteration": 2.5886576175689697 + }, + { + "auxiliary_loss_clip": 0.06520741, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 0.06299604, + "balance_loss_mlp": 0.01256204, + "epoch": 0.256906658650233, + "flos": 19724437342080.0, + "grad_norm": 1.6471317846086577, + "language_loss": 0.8228811, + "learning_rate": 3.483776583571541e-06, + "loss": 0.90087312, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.22253418, + "step": 4273, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.06513067, + "auxiliary_loss_mlp": 0.0127658, + "balance_loss_clip": 0.06299708, + "balance_loss_mlp": 0.01257638, + "epoch": 0.25696678190290095, + "flos": 22932019964160.0, + "grad_norm": 1.4706338186359442, + "language_loss": 0.77439249, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.85228896, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18933105, + "step": 4274, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06508841, + "auxiliary_loss_mlp": 0.01274973, + "balance_loss_clip": 0.06295496, + "balance_loss_mlp": 0.0125435, + "epoch": 0.2570269051555689, + "flos": 27315163042560.0, + "grad_norm": 1.5809391622925344, + "language_loss": 0.84101403, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91885215, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20629883, + "step": 4275, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.0652002, + "auxiliary_loss_mlp": 0.01273541, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01252965, + "epoch": 0.2570870284082369, + "flos": 27570811449600.0, + "grad_norm": 2.3295240533415016, + "language_loss": 0.78590673, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.86384231, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4276, + "time_per_iteration": 2.631866216659546 + }, + { + "auxiliary_loss_clip": 0.06515533, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06298599, + "balance_loss_mlp": 0.01260237, + "epoch": 0.25714715166090485, + "flos": 28738405768320.0, + "grad_norm": 1.6396366021430353, + "language_loss": 0.79803967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8759945, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19714355, + "step": 4277, + "time_per_iteration": 2.5990161895751953 + }, + { + "auxiliary_loss_clip": 0.06513472, + "auxiliary_loss_mlp": 0.01272259, + "balance_loss_clip": 0.06296529, + "balance_loss_mlp": 0.01254377, + "epoch": 0.2572072749135728, + "flos": 20121606495360.0, + "grad_norm": 1.9596681746733369, + "language_loss": 0.78998482, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17883301, + "step": 4278, + "time_per_iteration": 4.02304744720459 + }, + { + "auxiliary_loss_clip": 0.06522302, + "auxiliary_loss_mlp": 0.01278536, + "balance_loss_clip": 0.06301469, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2572673981662408, + "flos": 26037969183360.0, + "grad_norm": 2.3063853220673067, + "language_loss": 0.75400203, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83201039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21618652, + "step": 4279, + "time_per_iteration": 2.5523123741149902 + }, + { + "auxiliary_loss_clip": 0.06516609, + "auxiliary_loss_mlp": 0.0128394, + "balance_loss_clip": 0.06297271, + "balance_loss_mlp": 0.01262721, + "epoch": 0.25732752141890874, + "flos": 16112054845440.0, + "grad_norm": 3.423283610494841, + "language_loss": 0.85997081, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.9379763, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2121582, + "step": 4280, + "time_per_iteration": 2.5104546546936035 + }, + { + "auxiliary_loss_clip": 0.06517641, + "auxiliary_loss_mlp": 0.01282108, + "balance_loss_clip": 0.06295675, + "balance_loss_mlp": 0.0126133, + "epoch": 0.2573876446715767, + "flos": 22530322690560.0, + "grad_norm": 2.5830483171875955, + "language_loss": 0.78735828, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.86535579, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20788574, + "step": 4281, + "time_per_iteration": 2.511723279953003 + }, + { + "auxiliary_loss_clip": 0.06512952, + "auxiliary_loss_mlp": 0.01285256, + "balance_loss_clip": 0.06294534, + "balance_loss_mlp": 0.01264048, + "epoch": 0.2574477679242447, + "flos": 23957548485120.0, + "grad_norm": 1.8266556980022217, + "language_loss": 0.87782013, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.9558022, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.21203613, + "step": 4282, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06509817, + "auxiliary_loss_mlp": 0.0128236, + "balance_loss_clip": 0.06291438, + "balance_loss_mlp": 0.01262905, + "epoch": 0.2575078911769127, + "flos": 21988278627840.0, + "grad_norm": 1.3881538001933933, + "language_loss": 0.71042287, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.78834462, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19470215, + "step": 4283, + "time_per_iteration": 3.9826109409332275 + }, + { + "auxiliary_loss_clip": 0.06500088, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 0.06290558, + "balance_loss_mlp": 0.01271051, + "epoch": 0.25756801442958066, + "flos": 21951997009920.0, + "grad_norm": 1.9398744879334104, + "language_loss": 0.80991805, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.88781703, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18762207, + "step": 4284, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.06508928, + "auxiliary_loss_mlp": 0.01294414, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01274923, + "epoch": 0.2576281376822486, + "flos": 35270675493120.0, + "grad_norm": 2.158245566426343, + "language_loss": 0.70814562, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.78617907, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19494629, + "step": 4285, + "time_per_iteration": 4.088344097137451 + }, + { + "auxiliary_loss_clip": 0.06504595, + "auxiliary_loss_mlp": 0.0128171, + "balance_loss_clip": 0.06288387, + "balance_loss_mlp": 0.01262505, + "epoch": 0.2576882609349166, + "flos": 14136705567360.0, + "grad_norm": 1.771877130646751, + "language_loss": 0.58818436, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.66604745, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.1920166, + "step": 4286, + "time_per_iteration": 2.5344176292419434 + }, + { + "auxiliary_loss_clip": 0.0650837, + "auxiliary_loss_mlp": 0.01278621, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01259118, + "epoch": 0.25774838418758456, + "flos": 23265053216640.0, + "grad_norm": 2.057811055203196, + "language_loss": 0.6464054, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72427529, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19494629, + "step": 4287, + "time_per_iteration": 2.5958328247070312 + }, + { + "auxiliary_loss_clip": 0.0650748, + "auxiliary_loss_mlp": 0.01286721, + "balance_loss_clip": 0.06287187, + "balance_loss_mlp": 0.01265824, + "epoch": 0.2578085074402525, + "flos": 22608378368640.0, + "grad_norm": 1.9946373780944937, + "language_loss": 0.7222265, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.80016851, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2088623, + "step": 4288, + "time_per_iteration": 2.5767109394073486 + }, + { + "auxiliary_loss_clip": 0.06504134, + "auxiliary_loss_mlp": 0.01288175, + "balance_loss_clip": 0.06288374, + "balance_loss_mlp": 0.01268851, + "epoch": 0.2578686306929205, + "flos": 24578780256000.0, + "grad_norm": 1.4737569046844996, + "language_loss": 0.77657092, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.85449398, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1932373, + "step": 4289, + "time_per_iteration": 3.9734480381011963 + }, + { + "auxiliary_loss_clip": 0.0651005, + "auxiliary_loss_mlp": 0.01285951, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012641, + "epoch": 0.25792875394558845, + "flos": 18119828453760.0, + "grad_norm": 2.192134211179858, + "language_loss": 0.8580482, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.93600821, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.21838379, + "step": 4290, + "time_per_iteration": 2.5564229488372803 + }, + { + "auxiliary_loss_clip": 0.0651224, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 0.06293762, + "balance_loss_mlp": 0.01263573, + "epoch": 0.2579888771982564, + "flos": 17718760085760.0, + "grad_norm": 2.0247866667145344, + "language_loss": 0.73390263, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81186378, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.20300293, + "step": 4291, + "time_per_iteration": 2.497671365737915 + }, + { + "auxiliary_loss_clip": 0.06508101, + "auxiliary_loss_mlp": 0.01275245, + "balance_loss_clip": 0.06287237, + "balance_loss_mlp": 0.01255647, + "epoch": 0.2580490004509244, + "flos": 16440350342400.0, + "grad_norm": 2.23272675200871, + "language_loss": 0.82139969, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.8992331, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19604492, + "step": 4292, + "time_per_iteration": 2.5467498302459717 + }, + { + "auxiliary_loss_clip": 0.06505652, + "auxiliary_loss_mlp": 0.01282583, + "balance_loss_clip": 0.06289525, + "balance_loss_mlp": 0.01262532, + "epoch": 0.25810912370359235, + "flos": 33842946574080.0, + "grad_norm": 1.9023591833174374, + "language_loss": 0.67644775, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.7543301, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20043945, + "step": 4293, + "time_per_iteration": 2.626880168914795 + }, + { + "auxiliary_loss_clip": 0.06507371, + "auxiliary_loss_mlp": 0.01275889, + "balance_loss_clip": 0.06295517, + "balance_loss_mlp": 0.01257244, + "epoch": 0.2581692469562603, + "flos": 25199257340160.0, + "grad_norm": 2.9603548878770387, + "language_loss": 0.76158464, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83941722, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18640137, + "step": 4294, + "time_per_iteration": 2.5711581707000732 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06293358, + "balance_loss_mlp": 0.01257866, + "epoch": 0.2582293702089283, + "flos": 34940619060480.0, + "grad_norm": 2.382767918587226, + "language_loss": 0.81769538, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8955487, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1920166, + "step": 4295, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.06519823, + "auxiliary_loss_mlp": 0.01276702, + "balance_loss_clip": 0.06295969, + "balance_loss_mlp": 0.01256496, + "epoch": 0.2582894934615963, + "flos": 26841028314240.0, + "grad_norm": 1.964012337767824, + "language_loss": 0.72949934, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80746454, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.20214844, + "step": 4296, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06514452, + "auxiliary_loss_mlp": 0.01277621, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.0125732, + "epoch": 0.25834961671426426, + "flos": 23522252924160.0, + "grad_norm": 1.7245670135783875, + "language_loss": 0.87440747, + "learning_rate": 3.477492965085067e-06, + "loss": 0.95232815, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20288086, + "step": 4297, + "time_per_iteration": 2.5871896743774414 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01260558, + "epoch": 0.25840973996693223, + "flos": 22456837059840.0, + "grad_norm": 2.9037965134923076, + "language_loss": 0.84894854, + "learning_rate": 3.477230446361943e-06, + "loss": 0.9268465, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.18469238, + "step": 4298, + "time_per_iteration": 2.5290613174438477 + }, + { + "auxiliary_loss_clip": 0.06510766, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 0.06292143, + "balance_loss_mlp": 0.01256158, + "epoch": 0.2584698632196002, + "flos": 11295544849920.0, + "grad_norm": 2.12928453409433, + "language_loss": 0.83727312, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.91514087, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.1986084, + "step": 4299, + "time_per_iteration": 2.5314571857452393 + }, + { + "auxiliary_loss_clip": 0.06506392, + "auxiliary_loss_mlp": 0.01272204, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01253214, + "epoch": 0.25852998647226816, + "flos": 17935569325440.0, + "grad_norm": 2.08690605682093, + "language_loss": 0.83303946, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91082543, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18981934, + "step": 4300, + "time_per_iteration": 2.494170904159546 + }, + { + "auxiliary_loss_clip": 0.06507458, + "auxiliary_loss_mlp": 0.01272704, + "balance_loss_clip": 0.06287713, + "balance_loss_mlp": 0.01254012, + "epoch": 0.2585901097249361, + "flos": 33264620893440.0, + "grad_norm": 3.3706811216639307, + "language_loss": 0.67941749, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.75721914, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18688965, + "step": 4301, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.06512292, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 0.06289004, + "balance_loss_mlp": 0.01257009, + "epoch": 0.2586502329776041, + "flos": 18447033847680.0, + "grad_norm": 2.7819934823512282, + "language_loss": 0.83073664, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.90861952, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18994141, + "step": 4302, + "time_per_iteration": 2.5102365016937256 + }, + { + "auxiliary_loss_clip": 0.06508462, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01257358, + "epoch": 0.25871035623027205, + "flos": 17973989222400.0, + "grad_norm": 1.7107484291097332, + "language_loss": 0.91874599, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99659652, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.19238281, + "step": 4303, + "time_per_iteration": 2.5386602878570557 + }, + { + "auxiliary_loss_clip": 0.06508803, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.0629281, + "balance_loss_mlp": 0.01258569, + "epoch": 0.25877047948294, + "flos": 27784392307200.0, + "grad_norm": 1.7938003883067368, + "language_loss": 0.67601281, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75387681, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19018555, + "step": 4304, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06507856, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 0.06286401, + "balance_loss_mlp": 0.01257477, + "epoch": 0.258830602735608, + "flos": 27133209901440.0, + "grad_norm": 2.1929382614593242, + "language_loss": 0.73436916, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1940918, + "step": 4305, + "time_per_iteration": 2.5877888202667236 + }, + { + "auxiliary_loss_clip": 0.06515621, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01255992, + "epoch": 0.25889072598827595, + "flos": 17896730158080.0, + "grad_norm": 1.8662067033328453, + "language_loss": 0.76418924, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.84211433, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20898438, + "step": 4306, + "time_per_iteration": 2.482933282852173 + }, + { + "auxiliary_loss_clip": 0.06403579, + "auxiliary_loss_mlp": 0.01258203, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01253566, + "epoch": 0.2589508492409439, + "flos": 53951582885760.0, + "grad_norm": 0.8023409981232837, + "language_loss": 0.56592381, + "learning_rate": 3.474865258296403e-06, + "loss": 0.64254159, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.04629517, + "step": 4307, + "time_per_iteration": 3.1265084743499756 + }, + { + "auxiliary_loss_clip": 0.06500413, + "auxiliary_loss_mlp": 0.0127407, + "balance_loss_clip": 0.06289256, + "balance_loss_mlp": 0.01256105, + "epoch": 0.2590109724936119, + "flos": 22132063434240.0, + "grad_norm": 1.735104377472534, + "language_loss": 0.71851504, + "learning_rate": 3.474602179854327e-06, + "loss": 0.79625988, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17956543, + "step": 4308, + "time_per_iteration": 2.5442304611206055 + }, + { + "auxiliary_loss_clip": 0.06513858, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01258993, + "epoch": 0.2590710957462799, + "flos": 13478395564800.0, + "grad_norm": 2.8033587428294657, + "language_loss": 0.84278727, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.92071199, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19628906, + "step": 4309, + "time_per_iteration": 2.546034336090088 + }, + { + "auxiliary_loss_clip": 0.06504438, + "auxiliary_loss_mlp": 0.01276588, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.01258814, + "epoch": 0.25913121899894787, + "flos": 22313219961600.0, + "grad_norm": 1.5400127324827177, + "language_loss": 0.84972912, + "learning_rate": 3.474075855228966e-06, + "loss": 0.92753935, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.17785645, + "step": 4310, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.06511362, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.06293052, + "balance_loss_mlp": 0.01254533, + "epoch": 0.25919134225161583, + "flos": 25818770102400.0, + "grad_norm": 1.8118221315599161, + "language_loss": 0.78088975, + "learning_rate": 3.473812609065639e-06, + "loss": 0.85874081, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19213867, + "step": 4311, + "time_per_iteration": 2.6044604778289795 + }, + { + "auxiliary_loss_clip": 0.06511068, + "auxiliary_loss_mlp": 0.01275144, + "balance_loss_clip": 0.06293963, + "balance_loss_mlp": 0.01256666, + "epoch": 0.2592514655042838, + "flos": 31220314104960.0, + "grad_norm": 4.381167674093932, + "language_loss": 0.73062587, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.80848801, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18469238, + "step": 4312, + "time_per_iteration": 2.587942600250244 + }, + { + "auxiliary_loss_clip": 0.06508243, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 0.06291987, + "balance_loss_mlp": 0.012569, + "epoch": 0.25931158875695176, + "flos": 18480296718720.0, + "grad_norm": 1.7543304647253515, + "language_loss": 0.70305753, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78089976, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.19091797, + "step": 4313, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.06508952, + "auxiliary_loss_mlp": 0.01278616, + "balance_loss_clip": 0.06293979, + "balance_loss_mlp": 0.0125971, + "epoch": 0.2593717120096197, + "flos": 19213895214720.0, + "grad_norm": 1.751562510714179, + "language_loss": 0.81158572, + "learning_rate": 3.473022535292867e-06, + "loss": 0.8894614, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.18908691, + "step": 4314, + "time_per_iteration": 2.5584335327148438 + }, + { + "auxiliary_loss_clip": 0.06515148, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 0.06292658, + "balance_loss_mlp": 0.01257359, + "epoch": 0.2594318352622877, + "flos": 31256050671360.0, + "grad_norm": 1.9178095473181331, + "language_loss": 0.67283171, + "learning_rate": 3.472759065640968e-06, + "loss": 0.7507664, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20959473, + "step": 4315, + "time_per_iteration": 2.6295278072357178 + }, + { + "auxiliary_loss_clip": 0.06506292, + "auxiliary_loss_mlp": 0.01277654, + "balance_loss_clip": 0.06292329, + "balance_loss_mlp": 0.01259463, + "epoch": 0.25949195851495566, + "flos": 22243759326720.0, + "grad_norm": 1.412764147956583, + "language_loss": 0.80242419, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.88026369, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18212891, + "step": 4316, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.06510989, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06290686, + "balance_loss_mlp": 0.01256781, + "epoch": 0.2595520817676236, + "flos": 28083449928960.0, + "grad_norm": 1.6660208675023864, + "language_loss": 0.78127223, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.85915792, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20812988, + "step": 4317, + "time_per_iteration": 2.6210665702819824 + }, + { + "auxiliary_loss_clip": 0.06507257, + "auxiliary_loss_mlp": 0.01281581, + "balance_loss_clip": 0.06291957, + "balance_loss_mlp": 0.01262054, + "epoch": 0.2596122050202916, + "flos": 20196727280640.0, + "grad_norm": 2.4040812102587377, + "language_loss": 0.78420109, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.86208946, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19519043, + "step": 4318, + "time_per_iteration": 3.9600155353546143 + }, + { + "auxiliary_loss_clip": 0.06505568, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 0.06290057, + "balance_loss_mlp": 0.01256637, + "epoch": 0.25967232827295955, + "flos": 22534431540480.0, + "grad_norm": 2.66294558684285, + "language_loss": 0.77022719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.84805143, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20227051, + "step": 4319, + "time_per_iteration": 2.544752836227417 + }, + { + "auxiliary_loss_clip": 0.0650554, + "auxiliary_loss_mlp": 0.01280509, + "balance_loss_clip": 0.06290743, + "balance_loss_mlp": 0.01261555, + "epoch": 0.2597324515256275, + "flos": 21074445999360.0, + "grad_norm": 1.7925219732685136, + "language_loss": 0.77426791, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.85212845, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.18945312, + "step": 4320, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.06508496, + "auxiliary_loss_mlp": 0.01273671, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01254634, + "epoch": 0.2597925747782955, + "flos": 22055810618880.0, + "grad_norm": 1.593385908573569, + "language_loss": 0.71533716, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79315877, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19042969, + "step": 4321, + "time_per_iteration": 2.5314829349517822 + }, + { + "auxiliary_loss_clip": 0.0650996, + "auxiliary_loss_mlp": 0.01274348, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01254011, + "epoch": 0.2598526980309635, + "flos": 19543071179520.0, + "grad_norm": 2.282331155451991, + "language_loss": 0.75262189, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.83046496, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20336914, + "step": 4322, + "time_per_iteration": 2.525724411010742 + }, + { + "auxiliary_loss_clip": 0.06509394, + "auxiliary_loss_mlp": 0.01275417, + "balance_loss_clip": 0.06289983, + "balance_loss_mlp": 0.0125533, + "epoch": 0.25991282128363147, + "flos": 24501521191680.0, + "grad_norm": 2.623736611083137, + "language_loss": 0.7442928, + "learning_rate": 3.470649298767278e-06, + "loss": 0.82214087, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4323, + "time_per_iteration": 3.957674026489258 + }, + { + "auxiliary_loss_clip": 0.06515582, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01258893, + "epoch": 0.25997294453629943, + "flos": 24207410960640.0, + "grad_norm": 1.7976461796423409, + "language_loss": 0.68052149, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.75847143, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20495605, + "step": 4324, + "time_per_iteration": 4.001135349273682 + }, + { + "auxiliary_loss_clip": 0.06505544, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01254608, + "epoch": 0.2600330677889674, + "flos": 31439597040000.0, + "grad_norm": 1.7946989584541546, + "language_loss": 0.71402133, + "learning_rate": 3.470121299177082e-06, + "loss": 0.79180264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1796875, + "step": 4325, + "time_per_iteration": 2.6213603019714355 + }, + { + "auxiliary_loss_clip": 0.06501837, + "auxiliary_loss_mlp": 0.01274613, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01255004, + "epoch": 0.26009319104163536, + "flos": 32274116179200.0, + "grad_norm": 1.826124228611905, + "language_loss": 0.73262805, + "learning_rate": 3.469857215756257e-06, + "loss": 0.81039256, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4326, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.06500994, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06288173, + "balance_loss_mlp": 0.01258051, + "epoch": 0.26015331429430333, + "flos": 26293994933760.0, + "grad_norm": 1.858424121782002, + "language_loss": 0.8722446, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.95002341, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18835449, + "step": 4327, + "time_per_iteration": 2.5950510501861572 + }, + { + "auxiliary_loss_clip": 0.06508228, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06287643, + "balance_loss_mlp": 0.01254271, + "epoch": 0.2602134375469713, + "flos": 21148728243840.0, + "grad_norm": 1.765295937421399, + "language_loss": 0.8100785, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.88790172, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19824219, + "step": 4328, + "time_per_iteration": 3.923682928085327 + }, + { + "auxiliary_loss_clip": 0.06502862, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01258704, + "epoch": 0.26027356079963926, + "flos": 25928411569920.0, + "grad_norm": 1.3948699622732248, + "language_loss": 0.88172936, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.95952845, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18347168, + "step": 4329, + "time_per_iteration": 2.5685267448425293 + }, + { + "auxiliary_loss_clip": 0.06502585, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.0125327, + "epoch": 0.2603336840523072, + "flos": 26366390461440.0, + "grad_norm": 1.8811175805050973, + "language_loss": 0.77705932, + "learning_rate": 3.468800324801802e-06, + "loss": 0.85479975, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18188477, + "step": 4330, + "time_per_iteration": 2.6185224056243896 + }, + { + "auxiliary_loss_clip": 0.06508863, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 0.06289242, + "balance_loss_mlp": 0.0125826, + "epoch": 0.2603938073049752, + "flos": 23520408134400.0, + "grad_norm": 1.5596482888270802, + "language_loss": 0.76200908, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.8398701, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18981934, + "step": 4331, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.06507871, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06292268, + "balance_loss_mlp": 0.01254527, + "epoch": 0.26045393055764315, + "flos": 25381336262400.0, + "grad_norm": 1.426884348550376, + "language_loss": 0.69540298, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.77320385, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.17700195, + "step": 4332, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.06511752, + "auxiliary_loss_mlp": 0.01275479, + "balance_loss_clip": 0.0629351, + "balance_loss_mlp": 0.0125693, + "epoch": 0.2605140538103111, + "flos": 27642494217600.0, + "grad_norm": 1.8844860211449586, + "language_loss": 0.79951644, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.87738872, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.1854248, + "step": 4333, + "time_per_iteration": 2.5523998737335205 + }, + { + "auxiliary_loss_clip": 0.06501235, + "auxiliary_loss_mlp": 0.01272154, + "balance_loss_clip": 0.06290703, + "balance_loss_mlp": 0.01254714, + "epoch": 0.2605741770629791, + "flos": 13774602147840.0, + "grad_norm": 1.6726919145500945, + "language_loss": 0.81128466, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8890186, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.17443848, + "step": 4334, + "time_per_iteration": 2.522210121154785 + }, + { + "auxiliary_loss_clip": 0.06510483, + "auxiliary_loss_mlp": 0.01278802, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01259859, + "epoch": 0.26063430031564705, + "flos": 26038933505280.0, + "grad_norm": 1.7438742011205015, + "language_loss": 0.80170292, + "learning_rate": 3.46747795800024e-06, + "loss": 0.87959582, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18945312, + "step": 4335, + "time_per_iteration": 2.582817792892456 + }, + { + "auxiliary_loss_clip": 0.06403506, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 0.06297, + "balance_loss_mlp": 0.01252544, + "epoch": 0.26069442356831507, + "flos": 62463143030400.0, + "grad_norm": 0.8284851894367303, + "language_loss": 0.60816151, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6847688, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04672241, + "step": 4336, + "time_per_iteration": 3.2036406993865967 + }, + { + "auxiliary_loss_clip": 0.0651319, + "auxiliary_loss_mlp": 0.0127574, + "balance_loss_clip": 0.06294517, + "balance_loss_mlp": 0.01257405, + "epoch": 0.26075454682098304, + "flos": 13631530101120.0, + "grad_norm": 1.8662385080657846, + "language_loss": 0.78028893, + "learning_rate": 3.46694862168102e-06, + "loss": 0.85817826, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18322754, + "step": 4337, + "time_per_iteration": 2.4899747371673584 + }, + { + "auxiliary_loss_clip": 0.06515083, + "auxiliary_loss_mlp": 0.01276173, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01256289, + "epoch": 0.260814670073651, + "flos": 12130776748800.0, + "grad_norm": 2.165940638299647, + "language_loss": 0.74851859, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.82643116, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19897461, + "step": 4338, + "time_per_iteration": 2.5323259830474854 + }, + { + "auxiliary_loss_clip": 0.06522977, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 0.0629933, + "balance_loss_mlp": 0.01255039, + "epoch": 0.26087479332631897, + "flos": 15127964968320.0, + "grad_norm": 2.9662822483112388, + "language_loss": 0.81419933, + "learning_rate": 3.466419062854447e-06, + "loss": 0.89217252, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19287109, + "step": 4339, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.06514673, + "auxiliary_loss_mlp": 0.0127648, + "balance_loss_clip": 0.06300991, + "balance_loss_mlp": 0.01259278, + "epoch": 0.26093491657898693, + "flos": 24687834744960.0, + "grad_norm": 1.5467473582016638, + "language_loss": 0.77106607, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.84897768, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4340, + "time_per_iteration": 2.570777416229248 + }, + { + "auxiliary_loss_clip": 0.06513949, + "auxiliary_loss_mlp": 0.01274956, + "balance_loss_clip": 0.062961, + "balance_loss_mlp": 0.01255788, + "epoch": 0.2609950398316549, + "flos": 25122669108480.0, + "grad_norm": 1.4533527138525517, + "language_loss": 0.82740015, + "learning_rate": 3.465889281600845e-06, + "loss": 0.90528917, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19177246, + "step": 4341, + "time_per_iteration": 2.5946342945098877 + }, + { + "auxiliary_loss_clip": 0.06519589, + "auxiliary_loss_mlp": 0.01282035, + "balance_loss_clip": 0.06303687, + "balance_loss_mlp": 0.01261794, + "epoch": 0.26105516308432286, + "flos": 28556159137920.0, + "grad_norm": 1.7858700463590271, + "language_loss": 0.77163744, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84965372, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20251465, + "step": 4342, + "time_per_iteration": 2.5742342472076416 + }, + { + "auxiliary_loss_clip": 0.06521034, + "auxiliary_loss_mlp": 0.01277248, + "balance_loss_clip": 0.06303718, + "balance_loss_mlp": 0.01258115, + "epoch": 0.2611152863369908, + "flos": 39539984400000.0, + "grad_norm": 1.7100835603344944, + "language_loss": 0.66681403, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.74479687, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19140625, + "step": 4343, + "time_per_iteration": 2.662271738052368 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.0630408, + "balance_loss_mlp": 0.01261917, + "epoch": 0.2611754095896588, + "flos": 13740416881920.0, + "grad_norm": 1.8127929734390111, + "language_loss": 0.74220115, + "learning_rate": 3.465094192845553e-06, + "loss": 0.82024956, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18200684, + "step": 4344, + "time_per_iteration": 2.5201361179351807 + }, + { + "auxiliary_loss_clip": 0.06524797, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06307752, + "balance_loss_mlp": 0.01264484, + "epoch": 0.26123553284232676, + "flos": 21513011869440.0, + "grad_norm": 2.1854473316742338, + "language_loss": 0.8696478, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94774491, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20422363, + "step": 4345, + "time_per_iteration": 2.510000228881836 + }, + { + "auxiliary_loss_clip": 0.06521724, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 0.06307776, + "balance_loss_mlp": 0.01258293, + "epoch": 0.2612956560949947, + "flos": 21145751424000.0, + "grad_norm": 2.0739898036059095, + "language_loss": 0.76897335, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.18371582, + "step": 4346, + "time_per_iteration": 2.5322000980377197 + }, + { + "auxiliary_loss_clip": 0.06522055, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06305227, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2613557793476627, + "flos": 25126023271680.0, + "grad_norm": 1.5562871556893731, + "language_loss": 0.76140273, + "learning_rate": 3.464298604081606e-06, + "loss": 0.83937496, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.18249512, + "step": 4347, + "time_per_iteration": 2.557077169418335 + }, + { + "auxiliary_loss_clip": 0.06522661, + "auxiliary_loss_mlp": 0.01286127, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01267208, + "epoch": 0.26141590260033065, + "flos": 26074879706880.0, + "grad_norm": 1.3369896368920637, + "language_loss": 0.7377249, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81581283, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.18920898, + "step": 4348, + "time_per_iteration": 2.5915603637695312 + }, + { + "auxiliary_loss_clip": 0.06527912, + "auxiliary_loss_mlp": 0.01280562, + "balance_loss_clip": 0.06309946, + "balance_loss_mlp": 0.01260881, + "epoch": 0.2614760258529987, + "flos": 25708415875200.0, + "grad_norm": 1.876318754691465, + "language_loss": 0.9123491, + "learning_rate": 3.463767933923799e-06, + "loss": 0.99043381, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19689941, + "step": 4349, + "time_per_iteration": 2.594332218170166 + }, + { + "auxiliary_loss_clip": 0.06524529, + "auxiliary_loss_mlp": 0.01276126, + "balance_loss_clip": 0.0631379, + "balance_loss_mlp": 0.01256695, + "epoch": 0.26153614910566664, + "flos": 17462902043520.0, + "grad_norm": 1.601755901803269, + "language_loss": 0.80459869, + "learning_rate": 3.463502515580524e-06, + "loss": 0.8826052, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19433594, + "step": 4350, + "time_per_iteration": 2.509274482727051 + }, + { + "auxiliary_loss_clip": 0.06520928, + "auxiliary_loss_mlp": 0.01277683, + "balance_loss_clip": 0.0631097, + "balance_loss_mlp": 0.01259063, + "epoch": 0.2615962723583346, + "flos": 17718676231680.0, + "grad_norm": 1.8928977658247819, + "language_loss": 0.62482548, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.7028116, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18615723, + "step": 4351, + "time_per_iteration": 2.522862672805786 + }, + { + "auxiliary_loss_clip": 0.06526107, + "auxiliary_loss_mlp": 0.01278827, + "balance_loss_clip": 0.06308405, + "balance_loss_mlp": 0.01259396, + "epoch": 0.26165639561100257, + "flos": 23264340456960.0, + "grad_norm": 2.4783042039829546, + "language_loss": 0.84264326, + "learning_rate": 3.462971512415555e-06, + "loss": 0.92069256, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19445801, + "step": 4352, + "time_per_iteration": 2.5326311588287354 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01261209, + "balance_loss_clip": 0.06294002, + "balance_loss_mlp": 0.01256817, + "epoch": 0.26171651886367053, + "flos": 66756155443200.0, + "grad_norm": 0.7669563885543124, + "language_loss": 0.7057451, + "learning_rate": 3.462705927613996e-06, + "loss": 0.78234154, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04397583, + "step": 4353, + "time_per_iteration": 3.093543529510498 + }, + { + "auxiliary_loss_clip": 0.06517833, + "auxiliary_loss_mlp": 0.01279039, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01259619, + "epoch": 0.2617766421163385, + "flos": 22356713030400.0, + "grad_norm": 1.943198757771125, + "language_loss": 0.77770078, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8556695, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19433594, + "step": 4354, + "time_per_iteration": 2.5782573223114014 + }, + { + "auxiliary_loss_clip": 0.06522856, + "auxiliary_loss_mlp": 0.01279183, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01259907, + "epoch": 0.26183676536900646, + "flos": 26074208874240.0, + "grad_norm": 2.16382169558429, + "language_loss": 0.68941987, + "learning_rate": 3.462174591623085e-06, + "loss": 0.7674402, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19274902, + "step": 4355, + "time_per_iteration": 2.608482599258423 + }, + { + "auxiliary_loss_clip": 0.06517249, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06301509, + "balance_loss_mlp": 0.01260889, + "epoch": 0.26189688862167443, + "flos": 21002847085440.0, + "grad_norm": 2.1598133279644554, + "language_loss": 0.68533909, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.76333642, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.21594238, + "step": 4356, + "time_per_iteration": 2.526376247406006 + }, + { + "auxiliary_loss_clip": 0.06398848, + "auxiliary_loss_mlp": 0.01254107, + "balance_loss_clip": 0.06295048, + "balance_loss_mlp": 0.01249723, + "epoch": 0.2619570118743424, + "flos": 65817780768000.0, + "grad_norm": 0.6753767209108164, + "language_loss": 0.5316326, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.60816211, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04391479, + "step": 4357, + "time_per_iteration": 4.58653450012207 + }, + { + "auxiliary_loss_clip": 0.065238, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01261183, + "epoch": 0.26201713512701036, + "flos": 28774310042880.0, + "grad_norm": 1.9589657113609436, + "language_loss": 0.85308599, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.93112528, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18933105, + "step": 4358, + "time_per_iteration": 2.65427303314209 + }, + { + "auxiliary_loss_clip": 0.0652793, + "auxiliary_loss_mlp": 0.0127535, + "balance_loss_clip": 0.06300082, + "balance_loss_mlp": 0.01254917, + "epoch": 0.2620772583796783, + "flos": 26439750311040.0, + "grad_norm": 2.2013035586341663, + "language_loss": 0.68206531, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7600981, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20422363, + "step": 4359, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.06517753, + "auxiliary_loss_mlp": 0.01278599, + "balance_loss_clip": 0.06299832, + "balance_loss_mlp": 0.01258763, + "epoch": 0.2621373816323463, + "flos": 20162667795840.0, + "grad_norm": 1.9413360196767273, + "language_loss": 0.7857362, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.86369967, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19848633, + "step": 4360, + "time_per_iteration": 2.5442395210266113 + }, + { + "auxiliary_loss_clip": 0.06513859, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06305451, + "balance_loss_mlp": 0.01262839, + "epoch": 0.26219750488501425, + "flos": 28628764300800.0, + "grad_norm": 1.9016418571028826, + "language_loss": 0.68632245, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.76428491, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.19519043, + "step": 4361, + "time_per_iteration": 2.5506739616394043 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01277506, + "balance_loss_clip": 0.06302515, + "balance_loss_mlp": 0.01256298, + "epoch": 0.2622576281376823, + "flos": 15046806689280.0, + "grad_norm": 1.72568625675014, + "language_loss": 0.84433615, + "learning_rate": 3.46031316964119e-06, + "loss": 0.92233592, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21179199, + "step": 4362, + "time_per_iteration": 3.9455041885375977 + }, + { + "auxiliary_loss_clip": 0.06516212, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01254914, + "epoch": 0.26231775139035024, + "flos": 26403426766080.0, + "grad_norm": 1.7310155723144771, + "language_loss": 0.65182602, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.72972858, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19140625, + "step": 4363, + "time_per_iteration": 2.5710229873657227 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01270336, + "balance_loss_clip": 0.06313097, + "balance_loss_mlp": 0.01263804, + "epoch": 0.2623778746430182, + "flos": 65430380615040.0, + "grad_norm": 0.9022976396731897, + "language_loss": 0.61189461, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.68877506, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06542969, + "step": 4364, + "time_per_iteration": 4.728578805923462 + }, + { + "auxiliary_loss_clip": 0.06528256, + "auxiliary_loss_mlp": 0.01280703, + "balance_loss_clip": 0.06308191, + "balance_loss_mlp": 0.01260402, + "epoch": 0.26243799789568617, + "flos": 12609104181120.0, + "grad_norm": 2.531531320883944, + "language_loss": 0.72247571, + "learning_rate": 3.459514586533184e-06, + "loss": 0.80056524, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20300293, + "step": 4365, + "time_per_iteration": 2.5567469596862793 + }, + { + "auxiliary_loss_clip": 0.06519997, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06307054, + "balance_loss_mlp": 0.01257146, + "epoch": 0.26249812114835414, + "flos": 28631783047680.0, + "grad_norm": 1.7351756990107399, + "language_loss": 0.78023124, + "learning_rate": 3.459248281460509e-06, + "loss": 0.85819209, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18945312, + "step": 4366, + "time_per_iteration": 2.6212668418884277 + }, + { + "auxiliary_loss_clip": 0.06522524, + "auxiliary_loss_mlp": 0.01276459, + "balance_loss_clip": 0.06305946, + "balance_loss_mlp": 0.01258351, + "epoch": 0.2625582444010221, + "flos": 14470661214720.0, + "grad_norm": 1.579355851615032, + "language_loss": 0.77007079, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.84806067, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18103027, + "step": 4367, + "time_per_iteration": 2.602072238922119 + }, + { + "auxiliary_loss_clip": 0.06517363, + "auxiliary_loss_mlp": 0.01271186, + "balance_loss_clip": 0.06304537, + "balance_loss_mlp": 0.01253471, + "epoch": 0.26261836765369007, + "flos": 16617984998400.0, + "grad_norm": 1.5269013949985815, + "language_loss": 0.70157337, + "learning_rate": 3.458715505320736e-06, + "loss": 0.77945888, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.17700195, + "step": 4368, + "time_per_iteration": 4.012764930725098 + }, + { + "auxiliary_loss_clip": 0.06516206, + "auxiliary_loss_mlp": 0.01278713, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01256635, + "epoch": 0.26267849090635803, + "flos": 20525861318400.0, + "grad_norm": 1.916794033771568, + "language_loss": 0.79240829, + "learning_rate": 3.458449034273841e-06, + "loss": 0.87035751, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22070312, + "step": 4369, + "time_per_iteration": 2.51906418800354 + }, + { + "auxiliary_loss_clip": 0.06514631, + "auxiliary_loss_mlp": 0.01276005, + "balance_loss_clip": 0.06301987, + "balance_loss_mlp": 0.01256883, + "epoch": 0.262738614159026, + "flos": 21330220187520.0, + "grad_norm": 3.2285566965587873, + "language_loss": 0.83905816, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.91696453, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19116211, + "step": 4370, + "time_per_iteration": 2.562302589416504 + }, + { + "auxiliary_loss_clip": 0.06520583, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01253972, + "epoch": 0.26279873741169396, + "flos": 17609454034560.0, + "grad_norm": 1.7096089610285066, + "language_loss": 0.71678042, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.79473758, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21179199, + "step": 4371, + "time_per_iteration": 2.4965152740478516 + }, + { + "auxiliary_loss_clip": 0.06398421, + "auxiliary_loss_mlp": 0.01256739, + "balance_loss_clip": 0.0629326, + "balance_loss_mlp": 0.01252516, + "epoch": 0.2628588606643619, + "flos": 60969139931520.0, + "grad_norm": 0.666639264120038, + "language_loss": 0.56056166, + "learning_rate": 3.457649289346384e-06, + "loss": 0.63711321, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04226685, + "step": 4372, + "time_per_iteration": 3.2867443561553955 + }, + { + "auxiliary_loss_clip": 0.06512036, + "auxiliary_loss_mlp": 0.01277679, + "balance_loss_clip": 0.06298684, + "balance_loss_mlp": 0.01259178, + "epoch": 0.2629189839170299, + "flos": 27023652288000.0, + "grad_norm": 1.5439358769508327, + "language_loss": 0.78190762, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85980475, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18505859, + "step": 4373, + "time_per_iteration": 2.577479362487793 + }, + { + "auxiliary_loss_clip": 0.06510606, + "auxiliary_loss_mlp": 0.01278833, + "balance_loss_clip": 0.06297645, + "balance_loss_mlp": 0.01260427, + "epoch": 0.26297910716969786, + "flos": 17025635911680.0, + "grad_norm": 2.1443132622279664, + "language_loss": 0.723768, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.80166239, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18383789, + "step": 4374, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.06517059, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.0630156, + "balance_loss_mlp": 0.01258505, + "epoch": 0.2630392304223659, + "flos": 24903889297920.0, + "grad_norm": 2.1190930293084933, + "language_loss": 0.81199759, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.88995719, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20373535, + "step": 4375, + "time_per_iteration": 2.591381311416626 + }, + { + "auxiliary_loss_clip": 0.0651055, + "auxiliary_loss_mlp": 0.01275326, + "balance_loss_clip": 0.0629838, + "balance_loss_mlp": 0.01257289, + "epoch": 0.26309935367503384, + "flos": 32862336641280.0, + "grad_norm": 1.9139045559413268, + "language_loss": 0.66626596, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74412477, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18041992, + "step": 4376, + "time_per_iteration": 2.643944025039673 + }, + { + "auxiliary_loss_clip": 0.06515232, + "auxiliary_loss_mlp": 0.01276237, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2631594769277018, + "flos": 15893400816000.0, + "grad_norm": 1.6251454157029055, + "language_loss": 0.70145154, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77936625, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.1875, + "step": 4377, + "time_per_iteration": 2.5593788623809814 + }, + { + "auxiliary_loss_clip": 0.06513406, + "auxiliary_loss_mlp": 0.01274994, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255408, + "epoch": 0.2632196001803698, + "flos": 50816242811520.0, + "grad_norm": 1.6666327452584295, + "language_loss": 0.80235565, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.88023967, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4378, + "time_per_iteration": 2.794290065765381 + }, + { + "auxiliary_loss_clip": 0.0651051, + "auxiliary_loss_mlp": 0.01272396, + "balance_loss_clip": 0.06297652, + "balance_loss_mlp": 0.0125492, + "epoch": 0.26327972343303774, + "flos": 13737733551360.0, + "grad_norm": 2.7188396998417548, + "language_loss": 0.77230549, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17480469, + "step": 4379, + "time_per_iteration": 2.542442560195923 + }, + { + "auxiliary_loss_clip": 0.06519607, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06299821, + "balance_loss_mlp": 0.01255084, + "epoch": 0.2633398466857057, + "flos": 23775846906240.0, + "grad_norm": 1.9724368576120554, + "language_loss": 0.78418016, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.86212587, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19897461, + "step": 4380, + "time_per_iteration": 2.529573440551758 + }, + { + "auxiliary_loss_clip": 0.06516172, + "auxiliary_loss_mlp": 0.012759, + "balance_loss_clip": 0.06296928, + "balance_loss_mlp": 0.01257518, + "epoch": 0.26339996993837367, + "flos": 27607680046080.0, + "grad_norm": 1.9046534185934374, + "language_loss": 0.6460917, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.72401243, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18383789, + "step": 4381, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.06511073, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06295128, + "balance_loss_mlp": 0.01255394, + "epoch": 0.26346009319104163, + "flos": 16951982572800.0, + "grad_norm": 1.8115834165165374, + "language_loss": 0.8293367, + "learning_rate": 3.454979881632595e-06, + "loss": 0.90718591, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18444824, + "step": 4382, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06526808, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06304507, + "balance_loss_mlp": 0.0126196, + "epoch": 0.2635202164437096, + "flos": 37241245088640.0, + "grad_norm": 2.8611377763647363, + "language_loss": 0.70728219, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78537577, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4383, + "time_per_iteration": 2.7256851196289062 + }, + { + "auxiliary_loss_clip": 0.06511825, + "auxiliary_loss_mlp": 0.01278143, + "balance_loss_clip": 0.0629648, + "balance_loss_mlp": 0.01260214, + "epoch": 0.26358033969637756, + "flos": 21002721304320.0, + "grad_norm": 1.8636489890531567, + "language_loss": 0.69725919, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.77515888, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 4384, + "time_per_iteration": 2.526306629180908 + }, + { + "auxiliary_loss_clip": 0.06514609, + "auxiliary_loss_mlp": 0.01274952, + "balance_loss_clip": 0.06301568, + "balance_loss_mlp": 0.01256355, + "epoch": 0.26364046294904553, + "flos": 27753561204480.0, + "grad_norm": 2.704228439938978, + "language_loss": 0.70769042, + "learning_rate": 3.45417798298451e-06, + "loss": 0.785586, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 4385, + "time_per_iteration": 2.6091294288635254 + }, + { + "auxiliary_loss_clip": 0.06510788, + "auxiliary_loss_mlp": 0.01275036, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01255903, + "epoch": 0.2637005862017135, + "flos": 22899679488000.0, + "grad_norm": 1.8400483569046413, + "language_loss": 0.85200071, + "learning_rate": 3.453910573136482e-06, + "loss": 0.92985892, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19116211, + "step": 4386, + "time_per_iteration": 2.5284476280212402 + }, + { + "auxiliary_loss_clip": 0.06516191, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 0.06302508, + "balance_loss_mlp": 0.01255759, + "epoch": 0.26376070945438146, + "flos": 15054143921280.0, + "grad_norm": 1.9881194524454247, + "language_loss": 0.77597183, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.85388696, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19567871, + "step": 4387, + "time_per_iteration": 2.522135019302368 + }, + { + "auxiliary_loss_clip": 0.0651316, + "auxiliary_loss_mlp": 0.01278261, + "balance_loss_clip": 0.06301039, + "balance_loss_mlp": 0.01259378, + "epoch": 0.2638208327070494, + "flos": 21148141265280.0, + "grad_norm": 2.1303107819849316, + "language_loss": 0.76193964, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83985388, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1887207, + "step": 4388, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.06516623, + "auxiliary_loss_mlp": 0.01271478, + "balance_loss_clip": 0.06302176, + "balance_loss_mlp": 0.01253681, + "epoch": 0.26388095595971744, + "flos": 21732001315200.0, + "grad_norm": 2.125202232596161, + "language_loss": 0.86967361, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94755471, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.17785645, + "step": 4389, + "time_per_iteration": 2.570643901824951 + }, + { + "auxiliary_loss_clip": 0.06416489, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.0630957, + "balance_loss_mlp": 0.01263464, + "epoch": 0.2639410792123854, + "flos": 65536542138240.0, + "grad_norm": 0.8199197454978128, + "language_loss": 0.60138249, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6782288, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04666138, + "step": 4390, + "time_per_iteration": 3.174226999282837 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01274153, + "balance_loss_clip": 0.06302064, + "balance_loss_mlp": 0.01255008, + "epoch": 0.2640012024650534, + "flos": 23954907081600.0, + "grad_norm": 1.739207981028, + "language_loss": 0.77995527, + "learning_rate": 3.4525726971127e-06, + "loss": 0.85793746, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19152832, + "step": 4391, + "time_per_iteration": 2.5869362354278564 + }, + { + "auxiliary_loss_clip": 0.06415629, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06309642, + "balance_loss_mlp": 0.0126082, + "epoch": 0.26406132571772134, + "flos": 56462420880000.0, + "grad_norm": 0.8885893091984226, + "language_loss": 0.58835375, + "learning_rate": 3.45230495662224e-06, + "loss": 0.66516447, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04620361, + "step": 4392, + "time_per_iteration": 3.1856343746185303 + }, + { + "auxiliary_loss_clip": 0.0652501, + "auxiliary_loss_mlp": 0.0127481, + "balance_loss_clip": 0.06303259, + "balance_loss_mlp": 0.01256631, + "epoch": 0.2641214489703893, + "flos": 22097039627520.0, + "grad_norm": 1.7095674260711007, + "language_loss": 0.69284153, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77083969, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.1817627, + "step": 4393, + "time_per_iteration": 2.5519895553588867 + }, + { + "auxiliary_loss_clip": 0.06526117, + "auxiliary_loss_mlp": 0.01277548, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01255959, + "epoch": 0.26418157222305727, + "flos": 16550327226240.0, + "grad_norm": 2.304177456685855, + "language_loss": 0.84805501, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.92609167, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.21594238, + "step": 4394, + "time_per_iteration": 2.5253031253814697 + }, + { + "auxiliary_loss_clip": 0.06528334, + "auxiliary_loss_mlp": 0.01280976, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01260066, + "epoch": 0.26424169547572524, + "flos": 18008006780160.0, + "grad_norm": 1.9555526734650441, + "language_loss": 0.70342916, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.78152227, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.20910645, + "step": 4395, + "time_per_iteration": 2.5117664337158203 + }, + { + "auxiliary_loss_clip": 0.06512758, + "auxiliary_loss_mlp": 0.01272399, + "balance_loss_clip": 0.06300145, + "balance_loss_mlp": 0.01253171, + "epoch": 0.2643018187283932, + "flos": 16988893096320.0, + "grad_norm": 1.791387622967983, + "language_loss": 0.87312353, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19238281, + "step": 4396, + "time_per_iteration": 2.566774368286133 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01257464, + "balance_loss_clip": 0.06300922, + "balance_loss_mlp": 0.01252997, + "epoch": 0.26436194198106117, + "flos": 59682135144960.0, + "grad_norm": 0.7723405564107855, + "language_loss": 0.54990101, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.62652469, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04473877, + "step": 4397, + "time_per_iteration": 4.373678684234619 + }, + { + "auxiliary_loss_clip": 0.06510547, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 0.06297219, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26442206523372913, + "flos": 32928694675200.0, + "grad_norm": 2.4292177107300224, + "language_loss": 0.78606653, + "learning_rate": 3.450697357532435e-06, + "loss": 0.86391467, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1862793, + "step": 4398, + "time_per_iteration": 2.6890292167663574 + }, + { + "auxiliary_loss_clip": 0.06511252, + "auxiliary_loss_mlp": 0.01279415, + "balance_loss_clip": 0.06294377, + "balance_loss_mlp": 0.01259244, + "epoch": 0.2644821884863971, + "flos": 21037409694720.0, + "grad_norm": 1.6698754866149341, + "language_loss": 0.67733896, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.75524557, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20178223, + "step": 4399, + "time_per_iteration": 2.5403761863708496 + }, + { + "auxiliary_loss_clip": 0.06507229, + "auxiliary_loss_mlp": 0.01274507, + "balance_loss_clip": 0.06301808, + "balance_loss_mlp": 0.01256841, + "epoch": 0.26454231173906506, + "flos": 20783019098880.0, + "grad_norm": 1.5093240378212085, + "language_loss": 0.8695311, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.94734848, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.17675781, + "step": 4400, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.06514899, + "auxiliary_loss_mlp": 0.01275157, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01255404, + "epoch": 0.264602434991733, + "flos": 16624399835520.0, + "grad_norm": 2.9592381962347076, + "language_loss": 0.77008456, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.84798515, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19750977, + "step": 4401, + "time_per_iteration": 4.000045537948608 + }, + { + "auxiliary_loss_clip": 0.06515318, + "auxiliary_loss_mlp": 0.01277892, + "balance_loss_clip": 0.0629567, + "balance_loss_mlp": 0.01257149, + "epoch": 0.26466255824440105, + "flos": 19068726816000.0, + "grad_norm": 1.7667226788610035, + "language_loss": 0.88791883, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96585095, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20739746, + "step": 4402, + "time_per_iteration": 2.504951000213623 + }, + { + "auxiliary_loss_clip": 0.06514971, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 0.06299384, + "balance_loss_mlp": 0.01261203, + "epoch": 0.264722681497069, + "flos": 22645246965120.0, + "grad_norm": 2.1016866817380944, + "language_loss": 0.78604829, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.86399865, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18884277, + "step": 4403, + "time_per_iteration": 3.9830996990203857 + }, + { + "auxiliary_loss_clip": 0.06513863, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 0.0629956, + "balance_loss_mlp": 0.01254322, + "epoch": 0.264782804749737, + "flos": 22498862682240.0, + "grad_norm": 2.2718142403423887, + "language_loss": 0.88776851, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.96563816, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18774414, + "step": 4404, + "time_per_iteration": 2.5655670166015625 + }, + { + "auxiliary_loss_clip": 0.06512003, + "auxiliary_loss_mlp": 0.01272083, + "balance_loss_clip": 0.06294957, + "balance_loss_mlp": 0.01253666, + "epoch": 0.26484292800240494, + "flos": 16805891779200.0, + "grad_norm": 1.6853243703943699, + "language_loss": 0.77144921, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84929001, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18408203, + "step": 4405, + "time_per_iteration": 2.5151660442352295 + }, + { + "auxiliary_loss_clip": 0.06518488, + "auxiliary_loss_mlp": 0.01280263, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.0126113, + "epoch": 0.2649030512550729, + "flos": 20455939486080.0, + "grad_norm": 1.6552463254663874, + "language_loss": 0.70570582, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78369337, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19152832, + "step": 4406, + "time_per_iteration": 2.5817081928253174 + }, + { + "auxiliary_loss_clip": 0.06515051, + "auxiliary_loss_mlp": 0.01282775, + "balance_loss_clip": 0.06304015, + "balance_loss_mlp": 0.01264071, + "epoch": 0.2649631745077409, + "flos": 22422190596480.0, + "grad_norm": 1.6043271976664373, + "language_loss": 0.84213567, + "learning_rate": 3.448282246369912e-06, + "loss": 0.92011392, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18701172, + "step": 4407, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.06506669, + "auxiliary_loss_mlp": 0.01274017, + "balance_loss_clip": 0.06294346, + "balance_loss_mlp": 0.01255384, + "epoch": 0.26502329776040884, + "flos": 35124794334720.0, + "grad_norm": 1.8863485028384246, + "language_loss": 0.76080608, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83861291, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18615723, + "step": 4408, + "time_per_iteration": 4.144388675689697 + }, + { + "auxiliary_loss_clip": 0.06504838, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06293095, + "balance_loss_mlp": 0.01259765, + "epoch": 0.2650834210130768, + "flos": 38696073603840.0, + "grad_norm": 1.6572856868324277, + "language_loss": 0.71237993, + "learning_rate": 3.447744950630084e-06, + "loss": 0.79021394, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18786621, + "step": 4409, + "time_per_iteration": 2.6830790042877197 + }, + { + "auxiliary_loss_clip": 0.06513892, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06296389, + "balance_loss_mlp": 0.01258513, + "epoch": 0.26514354426574477, + "flos": 24723655165440.0, + "grad_norm": 1.9985850932403133, + "language_loss": 0.74335337, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.82127184, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19445801, + "step": 4410, + "time_per_iteration": 2.5640783309936523 + }, + { + "auxiliary_loss_clip": 0.06510055, + "auxiliary_loss_mlp": 0.01275315, + "balance_loss_clip": 0.06293881, + "balance_loss_mlp": 0.01256873, + "epoch": 0.26520366751841273, + "flos": 20346381872640.0, + "grad_norm": 1.7362440314024254, + "language_loss": 0.74604267, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.82389635, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18457031, + "step": 4411, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.06503807, + "auxiliary_loss_mlp": 0.0127974, + "balance_loss_clip": 0.06292095, + "balance_loss_mlp": 0.01260941, + "epoch": 0.2652637907710807, + "flos": 22350046631040.0, + "grad_norm": 1.9068391403977176, + "language_loss": 0.83043784, + "learning_rate": 3.446938595306071e-06, + "loss": 0.90827328, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18798828, + "step": 4412, + "time_per_iteration": 2.570462942123413 + }, + { + "auxiliary_loss_clip": 0.06509882, + "auxiliary_loss_mlp": 0.01280008, + "balance_loss_clip": 0.0629638, + "balance_loss_mlp": 0.01260327, + "epoch": 0.26532391402374866, + "flos": 19360279497600.0, + "grad_norm": 1.6015505507863077, + "language_loss": 0.75010121, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.82800013, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19677734, + "step": 4413, + "time_per_iteration": 2.5575060844421387 + }, + { + "auxiliary_loss_clip": 0.06392879, + "auxiliary_loss_mlp": 0.01259819, + "balance_loss_clip": 0.06288524, + "balance_loss_mlp": 0.01255307, + "epoch": 0.26538403727641663, + "flos": 44804479121280.0, + "grad_norm": 0.9088609657061584, + "language_loss": 0.57055008, + "learning_rate": 3.446400750732793e-06, + "loss": 0.64707708, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04522705, + "step": 4414, + "time_per_iteration": 3.090242624282837 + }, + { + "auxiliary_loss_clip": 0.06501576, + "auxiliary_loss_mlp": 0.01278206, + "balance_loss_clip": 0.06294522, + "balance_loss_mlp": 0.01260587, + "epoch": 0.26544416052908465, + "flos": 28189359889920.0, + "grad_norm": 1.5322949912702364, + "language_loss": 0.74997067, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.82776845, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17626953, + "step": 4415, + "time_per_iteration": 2.6143665313720703 + }, + { + "auxiliary_loss_clip": 0.06505995, + "auxiliary_loss_mlp": 0.01278176, + "balance_loss_clip": 0.06289595, + "balance_loss_mlp": 0.0125791, + "epoch": 0.2655042837817526, + "flos": 17570824502400.0, + "grad_norm": 4.108925661978825, + "language_loss": 0.87716872, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95501041, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20263672, + "step": 4416, + "time_per_iteration": 2.4974279403686523 + }, + { + "auxiliary_loss_clip": 0.06510112, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0629703, + "balance_loss_mlp": 0.0126094, + "epoch": 0.2655644070344206, + "flos": 23411437499520.0, + "grad_norm": 1.4955026126411677, + "language_loss": 0.77089638, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84879971, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19274902, + "step": 4417, + "time_per_iteration": 2.576826572418213 + }, + { + "auxiliary_loss_clip": 0.0650158, + "auxiliary_loss_mlp": 0.01274734, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.01255946, + "epoch": 0.26562453028708854, + "flos": 26475612658560.0, + "grad_norm": 1.3751463134954343, + "language_loss": 0.80062425, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87838733, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 4418, + "time_per_iteration": 2.573490619659424 + }, + { + "auxiliary_loss_clip": 0.06510676, + "auxiliary_loss_mlp": 0.01274316, + "balance_loss_clip": 0.06295326, + "balance_loss_mlp": 0.01254945, + "epoch": 0.2656846535397565, + "flos": 19213475944320.0, + "grad_norm": 2.092556142181657, + "language_loss": 0.67613918, + "learning_rate": 3.445055179644071e-06, + "loss": 0.7539891, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19372559, + "step": 4419, + "time_per_iteration": 2.5705552101135254 + }, + { + "auxiliary_loss_clip": 0.06507199, + "auxiliary_loss_mlp": 0.01281966, + "balance_loss_clip": 0.06293494, + "balance_loss_mlp": 0.01262153, + "epoch": 0.2657447767924245, + "flos": 30558566085120.0, + "grad_norm": 1.8356097714997412, + "language_loss": 0.79905182, + "learning_rate": 3.444785900995585e-06, + "loss": 0.87694353, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19799805, + "step": 4420, + "time_per_iteration": 2.5966663360595703 + }, + { + "auxiliary_loss_clip": 0.06514539, + "auxiliary_loss_mlp": 0.01276693, + "balance_loss_clip": 0.06294198, + "balance_loss_mlp": 0.01256367, + "epoch": 0.26580490004509244, + "flos": 20928984111360.0, + "grad_norm": 2.015825119850129, + "language_loss": 0.81966692, + "learning_rate": 3.444516567560673e-06, + "loss": 0.89757919, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.20324707, + "step": 4421, + "time_per_iteration": 2.5285565853118896 + }, + { + "auxiliary_loss_clip": 0.06503608, + "auxiliary_loss_mlp": 0.01277509, + "balance_loss_clip": 0.06293386, + "balance_loss_mlp": 0.01259341, + "epoch": 0.2658650232977604, + "flos": 43955845297920.0, + "grad_norm": 1.6494646012937118, + "language_loss": 0.66448712, + "learning_rate": 3.444247179349548e-06, + "loss": 0.74229831, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1817627, + "step": 4422, + "time_per_iteration": 2.715272903442383 + }, + { + "auxiliary_loss_clip": 0.0650918, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 0.06296968, + "balance_loss_mlp": 0.01257011, + "epoch": 0.26592514655042837, + "flos": 29724256581120.0, + "grad_norm": 6.571308072686312, + "language_loss": 0.75332773, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18029785, + "step": 4423, + "time_per_iteration": 2.5891942977905273 + }, + { + "auxiliary_loss_clip": 0.06514621, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 0.06297594, + "balance_loss_mlp": 0.01257619, + "epoch": 0.26598526980309634, + "flos": 46687616110080.0, + "grad_norm": 1.5716819541281883, + "language_loss": 0.78054529, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85846502, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19726562, + "step": 4424, + "time_per_iteration": 2.731308698654175 + }, + { + "auxiliary_loss_clip": 0.06513417, + "auxiliary_loss_mlp": 0.01282972, + "balance_loss_clip": 0.06298374, + "balance_loss_mlp": 0.01263147, + "epoch": 0.2660453930557643, + "flos": 11514115025280.0, + "grad_norm": 1.8953438163908696, + "language_loss": 0.7980895, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.87605333, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19824219, + "step": 4425, + "time_per_iteration": 2.536639928817749 + }, + { + "auxiliary_loss_clip": 0.0650531, + "auxiliary_loss_mlp": 0.01275945, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01257837, + "epoch": 0.26610551630843227, + "flos": 24798692096640.0, + "grad_norm": 1.624984400061838, + "language_loss": 0.81150436, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.88931698, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4426, + "time_per_iteration": 2.55570912361145 + }, + { + "auxiliary_loss_clip": 0.06512492, + "auxiliary_loss_mlp": 0.01281328, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01262302, + "epoch": 0.26616563956110023, + "flos": 27643793955840.0, + "grad_norm": 1.6446869519549492, + "language_loss": 0.77695107, + "learning_rate": 3.442899417008333e-06, + "loss": 0.85488927, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19042969, + "step": 4427, + "time_per_iteration": 2.609236001968384 + }, + { + "auxiliary_loss_clip": 0.06512281, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06306126, + "balance_loss_mlp": 0.01257588, + "epoch": 0.26622576281376825, + "flos": 28369887511680.0, + "grad_norm": 1.5754757805335664, + "language_loss": 0.77615106, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.85402417, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17443848, + "step": 4428, + "time_per_iteration": 2.5886542797088623 + }, + { + "auxiliary_loss_clip": 0.06507164, + "auxiliary_loss_mlp": 0.01273818, + "balance_loss_clip": 0.06292614, + "balance_loss_mlp": 0.0125627, + "epoch": 0.2662858860664362, + "flos": 18047265217920.0, + "grad_norm": 1.9210496781424948, + "language_loss": 0.83184117, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.90965092, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.17541504, + "step": 4429, + "time_per_iteration": 2.5387768745422363 + }, + { + "auxiliary_loss_clip": 0.06512052, + "auxiliary_loss_mlp": 0.01276801, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01256762, + "epoch": 0.2663460093191042, + "flos": 22752163175040.0, + "grad_norm": 1.799497911690532, + "language_loss": 0.73120302, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80909157, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.20043945, + "step": 4430, + "time_per_iteration": 2.6026084423065186 + }, + { + "auxiliary_loss_clip": 0.06508531, + "auxiliary_loss_mlp": 0.0127429, + "balance_loss_clip": 0.06296858, + "balance_loss_mlp": 0.012548, + "epoch": 0.26640613257177215, + "flos": 16514422951680.0, + "grad_norm": 2.040164300856009, + "language_loss": 0.83262235, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91045058, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19482422, + "step": 4431, + "time_per_iteration": 2.5464959144592285 + }, + { + "auxiliary_loss_clip": 0.0651544, + "auxiliary_loss_mlp": 0.01281122, + "balance_loss_clip": 0.06296271, + "balance_loss_mlp": 0.01261488, + "epoch": 0.2664662558244401, + "flos": 23082638878080.0, + "grad_norm": 2.4012085548553537, + "language_loss": 0.76319212, + "learning_rate": 3.44155028679496e-06, + "loss": 0.84115773, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19641113, + "step": 4432, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.06513382, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 0.0629918, + "balance_loss_mlp": 0.01259011, + "epoch": 0.2665263790771081, + "flos": 23776098468480.0, + "grad_norm": 1.7645797084145118, + "language_loss": 0.8352288, + "learning_rate": 3.441280296720154e-06, + "loss": 0.91315603, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.20324707, + "step": 4433, + "time_per_iteration": 2.5431323051452637 + }, + { + "auxiliary_loss_clip": 0.06506403, + "auxiliary_loss_mlp": 0.01279917, + "balance_loss_clip": 0.06294529, + "balance_loss_mlp": 0.01260248, + "epoch": 0.26658650232977604, + "flos": 28008748414080.0, + "grad_norm": 2.0130085710694097, + "language_loss": 0.77006185, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84792507, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.19677734, + "step": 4434, + "time_per_iteration": 2.626286268234253 + }, + { + "auxiliary_loss_clip": 0.06505096, + "auxiliary_loss_mlp": 0.01274565, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255563, + "epoch": 0.266646625582444, + "flos": 22170147914880.0, + "grad_norm": 1.9216331890087734, + "language_loss": 0.82643783, + "learning_rate": 3.440740152620301e-06, + "loss": 0.90423441, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.18994141, + "step": 4435, + "time_per_iteration": 2.519731283187866 + }, + { + "auxiliary_loss_clip": 0.06515168, + "auxiliary_loss_mlp": 0.01287569, + "balance_loss_clip": 0.06296054, + "balance_loss_mlp": 0.01267065, + "epoch": 0.266706748835112, + "flos": 27860687049600.0, + "grad_norm": 2.5550616111147257, + "language_loss": 0.88173652, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95976388, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2052002, + "step": 4436, + "time_per_iteration": 2.5790481567382812 + }, + { + "auxiliary_loss_clip": 0.0650726, + "auxiliary_loss_mlp": 0.01276794, + "balance_loss_clip": 0.0629128, + "balance_loss_mlp": 0.01258507, + "epoch": 0.26676687208777994, + "flos": 25819231299840.0, + "grad_norm": 5.920609689832761, + "language_loss": 0.79025435, + "learning_rate": 3.440199789988407e-06, + "loss": 0.86809486, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1829834, + "step": 4437, + "time_per_iteration": 3.9761762619018555 + }, + { + "auxiliary_loss_clip": 0.06508271, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06295269, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2668269953404479, + "flos": 36073399207680.0, + "grad_norm": 3.5501154130665333, + "language_loss": 0.64866304, + "learning_rate": 3.439929526748556e-06, + "loss": 0.72648954, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18322754, + "step": 4438, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.0650841, + "auxiliary_loss_mlp": 0.01282243, + "balance_loss_clip": 0.0629243, + "balance_loss_mlp": 0.01263015, + "epoch": 0.26688711859311587, + "flos": 26576994499200.0, + "grad_norm": 1.9779853569110368, + "language_loss": 0.76120412, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83911061, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1920166, + "step": 4439, + "time_per_iteration": 2.5468099117279053 + }, + { + "auxiliary_loss_clip": 0.06509372, + "auxiliary_loss_mlp": 0.01279302, + "balance_loss_clip": 0.06293344, + "balance_loss_mlp": 0.01259156, + "epoch": 0.26694724184578383, + "flos": 26768968202880.0, + "grad_norm": 1.7452542153948158, + "language_loss": 0.71747917, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.79536593, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20141602, + "step": 4440, + "time_per_iteration": 2.5845727920532227 + }, + { + "auxiliary_loss_clip": 0.06513558, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06297302, + "balance_loss_mlp": 0.01258003, + "epoch": 0.2670073650984518, + "flos": 20965894634880.0, + "grad_norm": 2.018310090260772, + "language_loss": 0.67180222, + "learning_rate": 3.439118409456376e-06, + "loss": 0.74972624, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.20837402, + "step": 4441, + "time_per_iteration": 4.018662691116333 + }, + { + "auxiliary_loss_clip": 0.06511593, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06295494, + "balance_loss_mlp": 0.01260692, + "epoch": 0.2670674883511198, + "flos": 28373577091200.0, + "grad_norm": 1.7028334543675463, + "language_loss": 0.77360296, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.8515327, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20690918, + "step": 4442, + "time_per_iteration": 2.613529682159424 + }, + { + "auxiliary_loss_clip": 0.06397913, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06295023, + "balance_loss_mlp": 0.01259818, + "epoch": 0.2671276116037878, + "flos": 58989010970880.0, + "grad_norm": 0.9159689493293411, + "language_loss": 0.61561328, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.6922372, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04653931, + "step": 4443, + "time_per_iteration": 4.460381031036377 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01278926, + "balance_loss_clip": 0.06294855, + "balance_loss_mlp": 0.0126021, + "epoch": 0.26718773485645575, + "flos": 43955132538240.0, + "grad_norm": 8.593795125602613, + "language_loss": 0.76795793, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.845855, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.18725586, + "step": 4444, + "time_per_iteration": 2.7442104816436768 + }, + { + "auxiliary_loss_clip": 0.06512623, + "auxiliary_loss_mlp": 0.0127732, + "balance_loss_clip": 0.06297334, + "balance_loss_mlp": 0.01256255, + "epoch": 0.2672478581091237, + "flos": 25235329322880.0, + "grad_norm": 2.0392997213265867, + "language_loss": 0.81111336, + "learning_rate": 3.438036155780158e-06, + "loss": 0.88901269, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21057129, + "step": 4445, + "time_per_iteration": 2.5493359565734863 + }, + { + "auxiliary_loss_clip": 0.06511448, + "auxiliary_loss_mlp": 0.01275318, + "balance_loss_clip": 0.0629541, + "balance_loss_mlp": 0.01256054, + "epoch": 0.2673079813617917, + "flos": 15273594564480.0, + "grad_norm": 1.8279407549944744, + "language_loss": 0.89906365, + "learning_rate": 3.43776545600926e-06, + "loss": 0.97693127, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19262695, + "step": 4446, + "time_per_iteration": 2.536916971206665 + }, + { + "auxiliary_loss_clip": 0.06512347, + "auxiliary_loss_mlp": 0.01275408, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256894, + "epoch": 0.26736810461445965, + "flos": 25819944059520.0, + "grad_norm": 1.8969857257431861, + "language_loss": 0.68977708, + "learning_rate": 3.437494701718153e-06, + "loss": 0.76765466, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18518066, + "step": 4447, + "time_per_iteration": 4.071701526641846 + }, + { + "auxiliary_loss_clip": 0.06511723, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 0.06295793, + "balance_loss_mlp": 0.01259116, + "epoch": 0.2674282278671276, + "flos": 24318981072000.0, + "grad_norm": 1.8615578685879888, + "language_loss": 0.83522677, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.91313618, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2010498, + "step": 4448, + "time_per_iteration": 2.581207036972046 + }, + { + "auxiliary_loss_clip": 0.06506026, + "auxiliary_loss_mlp": 0.0127612, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2674883511197956, + "flos": 22821330320640.0, + "grad_norm": 1.5806903023960923, + "language_loss": 0.84385109, + "learning_rate": 3.436953029616378e-06, + "loss": 0.92167258, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19262695, + "step": 4449, + "time_per_iteration": 2.556368827819824 + }, + { + "auxiliary_loss_clip": 0.06523807, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 0.06298804, + "balance_loss_mlp": 0.01256679, + "epoch": 0.26754847437246354, + "flos": 25376514652800.0, + "grad_norm": 2.5106466446094275, + "language_loss": 0.84170121, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.91972435, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 4450, + "time_per_iteration": 2.540792465209961 + }, + { + "auxiliary_loss_clip": 0.06503032, + "auxiliary_loss_mlp": 0.01274274, + "balance_loss_clip": 0.06293193, + "balance_loss_mlp": 0.01255248, + "epoch": 0.2676085976251315, + "flos": 20236698478080.0, + "grad_norm": 1.7838817445044992, + "language_loss": 0.81239712, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8901701, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19042969, + "step": 4451, + "time_per_iteration": 2.552764892578125 + }, + { + "auxiliary_loss_clip": 0.06515267, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06304526, + "balance_loss_mlp": 0.01257324, + "epoch": 0.26766872087779947, + "flos": 28045784718720.0, + "grad_norm": 1.859886698365648, + "language_loss": 0.87156057, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94947314, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 4452, + "time_per_iteration": 2.580838918685913 + }, + { + "auxiliary_loss_clip": 0.06515863, + "auxiliary_loss_mlp": 0.01278142, + "balance_loss_clip": 0.06301846, + "balance_loss_mlp": 0.01258377, + "epoch": 0.26772884413046744, + "flos": 18329803585920.0, + "grad_norm": 2.0572254627861577, + "language_loss": 0.84003425, + "learning_rate": 3.435869031622194e-06, + "loss": 0.91797435, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19775391, + "step": 4453, + "time_per_iteration": 2.5120368003845215 + }, + { + "auxiliary_loss_clip": 0.06513035, + "auxiliary_loss_mlp": 0.01281566, + "balance_loss_clip": 0.06298169, + "balance_loss_mlp": 0.01261992, + "epoch": 0.2677889673831354, + "flos": 22134075932160.0, + "grad_norm": 1.66096029715733, + "language_loss": 0.79950684, + "learning_rate": 3.435597895977208e-06, + "loss": 0.87745285, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19580078, + "step": 4454, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.06518991, + "auxiliary_loss_mlp": 0.0127963, + "balance_loss_clip": 0.0630191, + "balance_loss_mlp": 0.01259949, + "epoch": 0.2678490906358034, + "flos": 23736001489920.0, + "grad_norm": 1.4726826789128313, + "language_loss": 0.72626883, + "learning_rate": 3.435326705894206e-06, + "loss": 0.80425501, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.19689941, + "step": 4455, + "time_per_iteration": 2.600341558456421 + }, + { + "auxiliary_loss_clip": 0.0650526, + "auxiliary_loss_mlp": 0.01280807, + "balance_loss_clip": 0.06295176, + "balance_loss_mlp": 0.01262675, + "epoch": 0.2679092138884714, + "flos": 21769414963200.0, + "grad_norm": 1.6724393178855028, + "language_loss": 0.74066579, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81852639, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18139648, + "step": 4456, + "time_per_iteration": 2.5469894409179688 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01278452, + "balance_loss_clip": 0.06300029, + "balance_loss_mlp": 0.01258127, + "epoch": 0.26796933714113935, + "flos": 19866670848000.0, + "grad_norm": 2.417277333537857, + "language_loss": 0.71260488, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.79059041, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20324707, + "step": 4457, + "time_per_iteration": 2.592397451400757 + }, + { + "auxiliary_loss_clip": 0.06517951, + "auxiliary_loss_mlp": 0.01279854, + "balance_loss_clip": 0.06301091, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2680294603938073, + "flos": 20054116431360.0, + "grad_norm": 2.0107664890053143, + "language_loss": 0.79466271, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87264079, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20666504, + "step": 4458, + "time_per_iteration": 2.5134661197662354 + }, + { + "auxiliary_loss_clip": 0.06383923, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06281242, + "balance_loss_mlp": 0.01258718, + "epoch": 0.2680895836464753, + "flos": 72134918334720.0, + "grad_norm": 0.8734266993254428, + "language_loss": 0.5870322, + "learning_rate": 3.434241401387739e-06, + "loss": 0.66350281, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.04437256, + "step": 4459, + "time_per_iteration": 3.2277050018310547 + }, + { + "auxiliary_loss_clip": 0.06506394, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06292672, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26814970689914325, + "flos": 20455310580480.0, + "grad_norm": 1.8403982609946155, + "language_loss": 0.85477257, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.93258202, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.18920898, + "step": 4460, + "time_per_iteration": 2.513317346572876 + }, + { + "auxiliary_loss_clip": 0.06504844, + "auxiliary_loss_mlp": 0.01281285, + "balance_loss_clip": 0.06292892, + "balance_loss_mlp": 0.01261866, + "epoch": 0.2682098301518112, + "flos": 17572459656960.0, + "grad_norm": 1.8133404743184358, + "language_loss": 0.69389015, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 4461, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.06506921, + "auxiliary_loss_mlp": 0.01281085, + "balance_loss_clip": 0.06293105, + "balance_loss_mlp": 0.01260152, + "epoch": 0.2682699534044792, + "flos": 18339237169920.0, + "grad_norm": 1.6584506269914416, + "language_loss": 0.67031932, + "learning_rate": 3.43342685191282e-06, + "loss": 0.74819934, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.20935059, + "step": 4462, + "time_per_iteration": 2.5427775382995605 + }, + { + "auxiliary_loss_clip": 0.06508102, + "auxiliary_loss_mlp": 0.01282385, + "balance_loss_clip": 0.0629629, + "balance_loss_mlp": 0.01263287, + "epoch": 0.26833007665714714, + "flos": 25308311829120.0, + "grad_norm": 1.7808644454945033, + "language_loss": 0.69747704, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.77538192, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19116211, + "step": 4463, + "time_per_iteration": 2.6194493770599365 + }, + { + "auxiliary_loss_clip": 0.06508362, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06291216, + "balance_loss_mlp": 0.0126092, + "epoch": 0.2683901999098151, + "flos": 16104046780800.0, + "grad_norm": 2.9245690778148465, + "language_loss": 0.78600121, + "learning_rate": 3.432883547133931e-06, + "loss": 0.86389446, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20056152, + "step": 4464, + "time_per_iteration": 2.463418483734131 + }, + { + "auxiliary_loss_clip": 0.06508331, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06294504, + "balance_loss_mlp": 0.01262154, + "epoch": 0.2684503231624831, + "flos": 27315414604800.0, + "grad_norm": 1.7531136867378412, + "language_loss": 0.71091688, + "learning_rate": 3.432611813236704e-06, + "loss": 0.78881842, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19665527, + "step": 4465, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.06379254, + "auxiliary_loss_mlp": 0.01259677, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01255094, + "epoch": 0.26851044641515104, + "flos": 71879060292480.0, + "grad_norm": 0.6551429372657154, + "language_loss": 0.52683848, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.60322779, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.04577637, + "step": 4466, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.06507096, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_clip": 0.06291512, + "balance_loss_mlp": 0.01263105, + "epoch": 0.268570569667819, + "flos": 18739676632320.0, + "grad_norm": 10.994589827837663, + "language_loss": 0.74195564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81986099, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20324707, + "step": 4467, + "time_per_iteration": 2.4971463680267334 + }, + { + "auxiliary_loss_clip": 0.06517448, + "auxiliary_loss_mlp": 0.01283031, + "balance_loss_clip": 0.06297839, + "balance_loss_mlp": 0.01264005, + "epoch": 0.268630692920487, + "flos": 18182832324480.0, + "grad_norm": 2.2391086352503504, + "language_loss": 0.81577581, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.89378059, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19042969, + "step": 4468, + "time_per_iteration": 2.547626256942749 + }, + { + "auxiliary_loss_clip": 0.06377872, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06275174, + "balance_loss_mlp": 0.01259552, + "epoch": 0.268690816173155, + "flos": 68754229176960.0, + "grad_norm": 0.8279608156690638, + "language_loss": 0.59413958, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.67056012, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.0461731, + "step": 4469, + "time_per_iteration": 3.2565419673919678 + }, + { + "auxiliary_loss_clip": 0.06507242, + "auxiliary_loss_mlp": 0.01284548, + "balance_loss_clip": 0.06292132, + "balance_loss_mlp": 0.01263304, + "epoch": 0.26875093942582295, + "flos": 23300160877440.0, + "grad_norm": 1.9707129205098373, + "language_loss": 0.8163017, + "learning_rate": 3.431252329084972e-06, + "loss": 0.89421958, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21240234, + "step": 4470, + "time_per_iteration": 2.542893171310425 + }, + { + "auxiliary_loss_clip": 0.06497125, + "auxiliary_loss_mlp": 0.0128145, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.012619, + "epoch": 0.2688110626784909, + "flos": 21549880465920.0, + "grad_norm": 1.5945085425671264, + "language_loss": 0.83326346, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.91104919, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19555664, + "step": 4471, + "time_per_iteration": 2.5213489532470703 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01284463, + "balance_loss_clip": 0.06289607, + "balance_loss_mlp": 0.01264365, + "epoch": 0.2688711859311589, + "flos": 28407804284160.0, + "grad_norm": 1.9607526414443455, + "language_loss": 0.70046443, + "learning_rate": 3.43070815543947e-06, + "loss": 0.77828562, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.20092773, + "step": 4472, + "time_per_iteration": 2.6251678466796875 + }, + { + "auxiliary_loss_clip": 0.06504884, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 0.06293008, + "balance_loss_mlp": 0.01263112, + "epoch": 0.26893130918382685, + "flos": 26002148762880.0, + "grad_norm": 1.9293915951077794, + "language_loss": 0.68364072, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.76151299, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.19213867, + "step": 4473, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.06499921, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01259467, + "epoch": 0.2689914324364948, + "flos": 20345878748160.0, + "grad_norm": 1.608174101079712, + "language_loss": 0.83682281, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91461158, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.19470215, + "step": 4474, + "time_per_iteration": 2.554151773452759 + }, + { + "auxiliary_loss_clip": 0.06502855, + "auxiliary_loss_mlp": 0.01275806, + "balance_loss_clip": 0.06296148, + "balance_loss_mlp": 0.01256482, + "epoch": 0.2690515556891628, + "flos": 19470759505920.0, + "grad_norm": 1.847749203594977, + "language_loss": 0.70725596, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78504252, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.19348145, + "step": 4475, + "time_per_iteration": 2.5116677284240723 + }, + { + "auxiliary_loss_clip": 0.06503256, + "auxiliary_loss_mlp": 0.01277275, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01257188, + "epoch": 0.26911167894183075, + "flos": 18151875440640.0, + "grad_norm": 2.2814450019498236, + "language_loss": 0.73125452, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.80905986, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20092773, + "step": 4476, + "time_per_iteration": 3.923501968383789 + }, + { + "auxiliary_loss_clip": 0.0650249, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06291398, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2691718021944987, + "flos": 19981385487360.0, + "grad_norm": 1.4862356596427981, + "language_loss": 0.80676347, + "learning_rate": 3.429346772085922e-06, + "loss": 0.88453096, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18762207, + "step": 4477, + "time_per_iteration": 2.562681198120117 + }, + { + "auxiliary_loss_clip": 0.06506729, + "auxiliary_loss_mlp": 0.01275723, + "balance_loss_clip": 0.06289821, + "balance_loss_mlp": 0.01254873, + "epoch": 0.2692319254471667, + "flos": 37455622560000.0, + "grad_norm": 1.8507584096301994, + "language_loss": 0.65612036, + "learning_rate": 3.429074332770984e-06, + "loss": 0.73394483, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20861816, + "step": 4478, + "time_per_iteration": 2.6743321418762207 + }, + { + "auxiliary_loss_clip": 0.06505084, + "auxiliary_loss_mlp": 0.01278495, + "balance_loss_clip": 0.06291381, + "balance_loss_mlp": 0.01259242, + "epoch": 0.26929204869983464, + "flos": 22134411348480.0, + "grad_norm": 2.2415663972983864, + "language_loss": 0.81841063, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89624637, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19250488, + "step": 4479, + "time_per_iteration": 2.563365936279297 + }, + { + "auxiliary_loss_clip": 0.06510025, + "auxiliary_loss_mlp": 0.01277354, + "balance_loss_clip": 0.06295313, + "balance_loss_mlp": 0.01258305, + "epoch": 0.2693521719525026, + "flos": 19799055002880.0, + "grad_norm": 1.97047433874797, + "language_loss": 0.81362212, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.89149588, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.19055176, + "step": 4480, + "time_per_iteration": 2.505098342895508 + }, + { + "auxiliary_loss_clip": 0.06504171, + "auxiliary_loss_mlp": 0.01276381, + "balance_loss_clip": 0.06296593, + "balance_loss_mlp": 0.01257677, + "epoch": 0.2694122952051706, + "flos": 21000415317120.0, + "grad_norm": 1.6210366032838512, + "language_loss": 0.7826978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.86050338, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18713379, + "step": 4481, + "time_per_iteration": 4.100890874862671 + }, + { + "auxiliary_loss_clip": 0.06511036, + "auxiliary_loss_mlp": 0.01275006, + "balance_loss_clip": 0.06298155, + "balance_loss_mlp": 0.01254192, + "epoch": 0.2694724184578386, + "flos": 25856519166720.0, + "grad_norm": 1.8924674974759383, + "language_loss": 0.74293458, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.820795, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.20788574, + "step": 4482, + "time_per_iteration": 4.145740747451782 + }, + { + "auxiliary_loss_clip": 0.06511661, + "auxiliary_loss_mlp": 0.01276613, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01256836, + "epoch": 0.26953254171050656, + "flos": 21733594542720.0, + "grad_norm": 2.48131981073459, + "language_loss": 0.72700799, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.80489069, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19763184, + "step": 4483, + "time_per_iteration": 2.5375680923461914 + }, + { + "auxiliary_loss_clip": 0.06523035, + "auxiliary_loss_mlp": 0.01278438, + "balance_loss_clip": 0.0630566, + "balance_loss_mlp": 0.01257994, + "epoch": 0.2695926649631745, + "flos": 19689078119040.0, + "grad_norm": 2.054691934345778, + "language_loss": 0.87485874, + "learning_rate": 3.427438559239605e-06, + "loss": 0.95287347, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20446777, + "step": 4484, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.06515766, + "auxiliary_loss_mlp": 0.01278738, + "balance_loss_clip": 0.06300886, + "balance_loss_mlp": 0.01259474, + "epoch": 0.2696527882158425, + "flos": 32894257847040.0, + "grad_norm": 2.0183728032076966, + "language_loss": 0.66971946, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74766451, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19262695, + "step": 4485, + "time_per_iteration": 2.623896598815918 + }, + { + "auxiliary_loss_clip": 0.06514997, + "auxiliary_loss_mlp": 0.01282999, + "balance_loss_clip": 0.06301111, + "balance_loss_mlp": 0.01262877, + "epoch": 0.26971291146851045, + "flos": 12128806177920.0, + "grad_norm": 3.3281733059389498, + "language_loss": 0.74281263, + "learning_rate": 3.426892868256604e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2010498, + "step": 4486, + "time_per_iteration": 2.525820016860962 + }, + { + "auxiliary_loss_clip": 0.06519947, + "auxiliary_loss_mlp": 0.01289409, + "balance_loss_clip": 0.06302445, + "balance_loss_mlp": 0.01268846, + "epoch": 0.2697730347211784, + "flos": 22640467282560.0, + "grad_norm": 2.8316541967285183, + "language_loss": 0.84592897, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.92402256, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20556641, + "step": 4487, + "time_per_iteration": 3.936244249343872 + }, + { + "auxiliary_loss_clip": 0.06520635, + "auxiliary_loss_mlp": 0.01285695, + "balance_loss_clip": 0.06303369, + "balance_loss_mlp": 0.01264845, + "epoch": 0.2698331579738464, + "flos": 23519695374720.0, + "grad_norm": 2.431656191901387, + "language_loss": 0.73194599, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.81000936, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20861816, + "step": 4488, + "time_per_iteration": 2.522861957550049 + }, + { + "auxiliary_loss_clip": 0.06516892, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 0.06303044, + "balance_loss_mlp": 0.01258681, + "epoch": 0.26989328122651435, + "flos": 24647360423040.0, + "grad_norm": 1.6427618857215789, + "language_loss": 0.84162384, + "learning_rate": 3.426073925998578e-06, + "loss": 0.91957808, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.1986084, + "step": 4489, + "time_per_iteration": 2.558133602142334 + }, + { + "auxiliary_loss_clip": 0.06523076, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01265821, + "epoch": 0.2699534044791823, + "flos": 10775904554880.0, + "grad_norm": 2.0847356564254014, + "language_loss": 0.90199494, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.98009604, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.21228027, + "step": 4490, + "time_per_iteration": 2.461840867996216 + }, + { + "auxiliary_loss_clip": 0.06505966, + "auxiliary_loss_mlp": 0.01275421, + "balance_loss_clip": 0.06297465, + "balance_loss_mlp": 0.01256288, + "epoch": 0.2700135277318503, + "flos": 36180021928320.0, + "grad_norm": 2.13129158363681, + "language_loss": 0.73836827, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.81618214, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19128418, + "step": 4491, + "time_per_iteration": 2.6479640007019043 + }, + { + "auxiliary_loss_clip": 0.06516409, + "auxiliary_loss_mlp": 0.01284517, + "balance_loss_clip": 0.06303698, + "balance_loss_mlp": 0.01264788, + "epoch": 0.27007365098451824, + "flos": 17424020949120.0, + "grad_norm": 2.8438546283757793, + "language_loss": 0.74296927, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.82097852, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 4492, + "time_per_iteration": 2.462226629257202 + }, + { + "auxiliary_loss_clip": 0.06510016, + "auxiliary_loss_mlp": 0.01279369, + "balance_loss_clip": 0.06300159, + "balance_loss_mlp": 0.01259926, + "epoch": 0.2701337742371862, + "flos": 23192448053760.0, + "grad_norm": 1.7359009481863723, + "language_loss": 0.88954818, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.96744204, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19445801, + "step": 4493, + "time_per_iteration": 2.5385639667510986 + }, + { + "auxiliary_loss_clip": 0.06509903, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 0.06296834, + "balance_loss_mlp": 0.01265201, + "epoch": 0.2701938974898542, + "flos": 24396365917440.0, + "grad_norm": 1.3961943163888275, + "language_loss": 0.71571529, + "learning_rate": 3.424707940835998e-06, + "loss": 0.79365045, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1842041, + "step": 4494, + "time_per_iteration": 2.542644500732422 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01282381, + "balance_loss_clip": 0.0629191, + "balance_loss_mlp": 0.01263713, + "epoch": 0.2702540207425222, + "flos": 26221641333120.0, + "grad_norm": 2.6689304552375366, + "language_loss": 0.8697859, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94760156, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.18652344, + "step": 4495, + "time_per_iteration": 2.6052844524383545 + }, + { + "auxiliary_loss_clip": 0.06507061, + "auxiliary_loss_mlp": 0.01284126, + "balance_loss_clip": 0.06293719, + "balance_loss_mlp": 0.01263944, + "epoch": 0.27031414399519016, + "flos": 22932439234560.0, + "grad_norm": 1.7866659337876034, + "language_loss": 0.76608586, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84399772, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20166016, + "step": 4496, + "time_per_iteration": 2.5191855430603027 + }, + { + "auxiliary_loss_clip": 0.06445029, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06340651, + "balance_loss_mlp": 0.01257498, + "epoch": 0.2703742672478581, + "flos": 63037904912640.0, + "grad_norm": 0.6591771406427821, + "language_loss": 0.49976462, + "learning_rate": 3.423887701354754e-06, + "loss": 0.57683551, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.0456543, + "step": 4497, + "time_per_iteration": 3.2403736114501953 + }, + { + "auxiliary_loss_clip": 0.06506558, + "auxiliary_loss_mlp": 0.01283587, + "balance_loss_clip": 0.06295481, + "balance_loss_mlp": 0.01266039, + "epoch": 0.2704343905005261, + "flos": 18846341280000.0, + "grad_norm": 2.8639988273107657, + "language_loss": 0.72431815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80221957, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17553711, + "step": 4498, + "time_per_iteration": 2.509298086166382 + }, + { + "auxiliary_loss_clip": 0.06432115, + "auxiliary_loss_mlp": 0.01259251, + "balance_loss_clip": 0.06327531, + "balance_loss_mlp": 0.01254679, + "epoch": 0.27049451375319405, + "flos": 71253635817600.0, + "grad_norm": 0.9422572009255263, + "language_loss": 0.5900467, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.66696036, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04577637, + "step": 4499, + "time_per_iteration": 3.2116270065307617 + }, + { + "auxiliary_loss_clip": 0.06502165, + "auxiliary_loss_mlp": 0.01281307, + "balance_loss_clip": 0.06292122, + "balance_loss_mlp": 0.01261422, + "epoch": 0.270554637005862, + "flos": 24285257003520.0, + "grad_norm": 2.589715304320551, + "language_loss": 0.73975158, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19897461, + "step": 4500, + "time_per_iteration": 2.537710189819336 + }, + { + "auxiliary_loss_clip": 0.06501484, + "auxiliary_loss_mlp": 0.01276741, + "balance_loss_clip": 0.06289591, + "balance_loss_mlp": 0.01257965, + "epoch": 0.27061476025853, + "flos": 17636889047040.0, + "grad_norm": 2.788947169536346, + "language_loss": 0.81470346, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.89248574, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18774414, + "step": 4501, + "time_per_iteration": 2.5423648357391357 + }, + { + "auxiliary_loss_clip": 0.06510358, + "auxiliary_loss_mlp": 0.01287368, + "balance_loss_clip": 0.06294559, + "balance_loss_mlp": 0.01267579, + "epoch": 0.27067488351119795, + "flos": 22716594316800.0, + "grad_norm": 1.5278818221734496, + "language_loss": 0.7303015, + "learning_rate": 3.422519555811735e-06, + "loss": 0.8082788, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.19775391, + "step": 4502, + "time_per_iteration": 2.5804011821746826 + }, + { + "auxiliary_loss_clip": 0.06507368, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06289332, + "balance_loss_mlp": 0.01258576, + "epoch": 0.2707350067638659, + "flos": 41729333806080.0, + "grad_norm": 1.6949775973694576, + "language_loss": 0.69090897, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.76876605, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19763184, + "step": 4503, + "time_per_iteration": 2.740292549133301 + }, + { + "auxiliary_loss_clip": 0.06502387, + "auxiliary_loss_mlp": 0.0128307, + "balance_loss_clip": 0.06290283, + "balance_loss_mlp": 0.01263746, + "epoch": 0.2707951300165339, + "flos": 20199159048960.0, + "grad_norm": 1.9752400870870641, + "language_loss": 0.69172543, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.76958001, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1932373, + "step": 4504, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.06502561, + "auxiliary_loss_mlp": 0.0128216, + "balance_loss_clip": 0.06291538, + "balance_loss_mlp": 0.01263492, + "epoch": 0.27085525326920185, + "flos": 21440364779520.0, + "grad_norm": 2.9855030089462993, + "language_loss": 0.76122642, + "learning_rate": 3.421698021097902e-06, + "loss": 0.8390736, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18652344, + "step": 4505, + "time_per_iteration": 2.527165651321411 + }, + { + "auxiliary_loss_clip": 0.06505956, + "auxiliary_loss_mlp": 0.0128432, + "balance_loss_clip": 0.06289993, + "balance_loss_mlp": 0.01264459, + "epoch": 0.2709153765218698, + "flos": 17680885240320.0, + "grad_norm": 2.0693026918396487, + "language_loss": 0.73959178, + "learning_rate": 3.42142406835758e-06, + "loss": 0.81749451, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1986084, + "step": 4506, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.0650361, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 0.06290495, + "balance_loss_mlp": 0.01258595, + "epoch": 0.2709754997745378, + "flos": 24462136972800.0, + "grad_norm": 1.8128724600792683, + "language_loss": 0.81647539, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89429414, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1965332, + "step": 4507, + "time_per_iteration": 2.684535503387451 + }, + { + "auxiliary_loss_clip": 0.06395597, + "auxiliary_loss_mlp": 0.01254395, + "balance_loss_clip": 0.0629042, + "balance_loss_mlp": 0.01250205, + "epoch": 0.2710356230272058, + "flos": 65229602232960.0, + "grad_norm": 0.712447813073055, + "language_loss": 0.50718415, + "learning_rate": 3.420876001185698e-06, + "loss": 0.58368409, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04193115, + "step": 4508, + "time_per_iteration": 3.111752986907959 + }, + { + "auxiliary_loss_clip": 0.0649793, + "auxiliary_loss_mlp": 0.01272465, + "balance_loss_clip": 0.06289998, + "balance_loss_mlp": 0.01255263, + "epoch": 0.27109574627987376, + "flos": 25491606635520.0, + "grad_norm": 2.0258218163980213, + "language_loss": 0.75015354, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.82785749, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.171875, + "step": 4509, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.06495094, + "auxiliary_loss_mlp": 0.01275639, + "balance_loss_clip": 0.06289092, + "balance_loss_mlp": 0.01256947, + "epoch": 0.2711558695325417, + "flos": 19688910410880.0, + "grad_norm": 2.3712253737099767, + "language_loss": 0.71864915, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.79635644, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18688965, + "step": 4510, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06499062, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.0629103, + "balance_loss_mlp": 0.012608, + "epoch": 0.2712159927852097, + "flos": 18593627765760.0, + "grad_norm": 2.5496745820614515, + "language_loss": 0.71357799, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.791363, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.1862793, + "step": 4511, + "time_per_iteration": 2.483739137649536 + }, + { + "auxiliary_loss_clip": 0.06505338, + "auxiliary_loss_mlp": 0.01274141, + "balance_loss_clip": 0.06292383, + "balance_loss_mlp": 0.01254817, + "epoch": 0.27127611603787766, + "flos": 25637403939840.0, + "grad_norm": 1.9202075405224084, + "language_loss": 0.81604505, + "learning_rate": 3.419779220367979e-06, + "loss": 0.89383984, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1932373, + "step": 4512, + "time_per_iteration": 2.593388795852661 + }, + { + "auxiliary_loss_clip": 0.06503928, + "auxiliary_loss_mlp": 0.01273233, + "balance_loss_clip": 0.06296667, + "balance_loss_mlp": 0.01255554, + "epoch": 0.2713362392905456, + "flos": 23155663311360.0, + "grad_norm": 1.8072498717910284, + "language_loss": 0.809147, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88691866, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.17663574, + "step": 4513, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.06501831, + "auxiliary_loss_mlp": 0.01278947, + "balance_loss_clip": 0.0628939, + "balance_loss_mlp": 0.01261018, + "epoch": 0.2713963625432136, + "flos": 18371409937920.0, + "grad_norm": 3.81368034370299, + "language_loss": 0.88867396, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.96648169, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17932129, + "step": 4514, + "time_per_iteration": 2.54484224319458 + }, + { + "auxiliary_loss_clip": 0.06502509, + "auxiliary_loss_mlp": 0.01277056, + "balance_loss_clip": 0.06292502, + "balance_loss_mlp": 0.01258709, + "epoch": 0.27145648579588155, + "flos": 22498275703680.0, + "grad_norm": 1.610354502574947, + "language_loss": 0.92402363, + "learning_rate": 3.418956069417517e-06, + "loss": 1.00181937, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18347168, + "step": 4515, + "time_per_iteration": 2.5121350288391113 + }, + { + "auxiliary_loss_clip": 0.06511631, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06296228, + "balance_loss_mlp": 0.01259669, + "epoch": 0.2715166090485495, + "flos": 19244265120000.0, + "grad_norm": 2.423654901761582, + "language_loss": 0.73979908, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.81772685, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21435547, + "step": 4516, + "time_per_iteration": 3.917318344116211 + }, + { + "auxiliary_loss_clip": 0.06498563, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289151, + "balance_loss_mlp": 0.01253627, + "epoch": 0.2715767323012175, + "flos": 17714902798080.0, + "grad_norm": 1.854313921742246, + "language_loss": 0.76927733, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84699214, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19287109, + "step": 4517, + "time_per_iteration": 2.576723098754883 + }, + { + "auxiliary_loss_clip": 0.06500702, + "auxiliary_loss_mlp": 0.01276287, + "balance_loss_clip": 0.06291518, + "balance_loss_mlp": 0.01256701, + "epoch": 0.27163685555388545, + "flos": 22389430849920.0, + "grad_norm": 2.0334929641517956, + "language_loss": 0.7833634, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.86113334, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19592285, + "step": 4518, + "time_per_iteration": 2.5335004329681396 + }, + { + "auxiliary_loss_clip": 0.06502728, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06292961, + "balance_loss_mlp": 0.0125925, + "epoch": 0.2716969788065534, + "flos": 22353358867200.0, + "grad_norm": 1.6261203259974584, + "language_loss": 0.68873644, + "learning_rate": 3.41785778156811e-06, + "loss": 0.76653063, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17443848, + "step": 4519, + "time_per_iteration": 2.60939359664917 + }, + { + "auxiliary_loss_clip": 0.06500532, + "auxiliary_loss_mlp": 0.0127723, + "balance_loss_clip": 0.06291862, + "balance_loss_mlp": 0.01260302, + "epoch": 0.2717571020592214, + "flos": 25235497031040.0, + "grad_norm": 1.9620818548787327, + "language_loss": 0.75925875, + "learning_rate": 3.417583075166451e-06, + "loss": 0.83703637, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16931152, + "step": 4520, + "time_per_iteration": 3.988518238067627 + }, + { + "auxiliary_loss_clip": 0.06503896, + "auxiliary_loss_mlp": 0.012736, + "balance_loss_clip": 0.06291716, + "balance_loss_mlp": 0.01253942, + "epoch": 0.2718172253118894, + "flos": 20195343688320.0, + "grad_norm": 3.05783023991908, + "language_loss": 0.76690799, + "learning_rate": 3.4173083150099e-06, + "loss": 0.84468293, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1965332, + "step": 4521, + "time_per_iteration": 3.9463987350463867 + }, + { + "auxiliary_loss_clip": 0.0650706, + "auxiliary_loss_mlp": 0.0127528, + "balance_loss_clip": 0.06291709, + "balance_loss_mlp": 0.01255432, + "epoch": 0.27187734856455736, + "flos": 14324318858880.0, + "grad_norm": 2.0792585055499435, + "language_loss": 0.74927616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.82709956, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19824219, + "step": 4522, + "time_per_iteration": 2.576792001724243 + }, + { + "auxiliary_loss_clip": 0.06503602, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06291734, + "balance_loss_mlp": 0.01258884, + "epoch": 0.27193747181722533, + "flos": 21114375269760.0, + "grad_norm": 1.7974712998396492, + "language_loss": 0.73055947, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80836433, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17993164, + "step": 4523, + "time_per_iteration": 2.5116758346557617 + }, + { + "auxiliary_loss_clip": 0.06493908, + "auxiliary_loss_mlp": 0.01278011, + "balance_loss_clip": 0.06286807, + "balance_loss_mlp": 0.01259665, + "epoch": 0.2719975950698933, + "flos": 19688910410880.0, + "grad_norm": 1.3231652709358832, + "language_loss": 0.74779463, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82551384, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.18334961, + "step": 4524, + "time_per_iteration": 2.5318901538848877 + }, + { + "auxiliary_loss_clip": 0.06503987, + "auxiliary_loss_mlp": 0.01277059, + "balance_loss_clip": 0.06291917, + "balance_loss_mlp": 0.01258248, + "epoch": 0.27205771832256126, + "flos": 24761488083840.0, + "grad_norm": 2.222226091972884, + "language_loss": 0.76783192, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.84564239, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18811035, + "step": 4525, + "time_per_iteration": 2.594209909439087 + }, + { + "auxiliary_loss_clip": 0.06492639, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.0628486, + "balance_loss_mlp": 0.01254712, + "epoch": 0.2721178415752292, + "flos": 21760903774080.0, + "grad_norm": 1.8877793172534498, + "language_loss": 0.82166058, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.89930463, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17041016, + "step": 4526, + "time_per_iteration": 3.9739785194396973 + }, + { + "auxiliary_loss_clip": 0.06510428, + "auxiliary_loss_mlp": 0.01273954, + "balance_loss_clip": 0.06292043, + "balance_loss_mlp": 0.01254189, + "epoch": 0.2721779648278972, + "flos": 12681667416960.0, + "grad_norm": 2.608637418907724, + "language_loss": 0.77407986, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.8519237, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19763184, + "step": 4527, + "time_per_iteration": 2.5017969608306885 + }, + { + "auxiliary_loss_clip": 0.06502572, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.0629287, + "balance_loss_mlp": 0.01260194, + "epoch": 0.27223808808056515, + "flos": 16258774544640.0, + "grad_norm": 2.1231016049423608, + "language_loss": 0.82676923, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90457952, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18249512, + "step": 4528, + "time_per_iteration": 2.5011186599731445 + }, + { + "auxiliary_loss_clip": 0.06500327, + "auxiliary_loss_mlp": 0.012781, + "balance_loss_clip": 0.06293638, + "balance_loss_mlp": 0.01260064, + "epoch": 0.2722982113332331, + "flos": 27753225788160.0, + "grad_norm": 1.6573852241711216, + "language_loss": 0.77553773, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85332191, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18041992, + "step": 4529, + "time_per_iteration": 2.5810396671295166 + }, + { + "auxiliary_loss_clip": 0.06499013, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01259055, + "epoch": 0.2723583345859011, + "flos": 21732756001920.0, + "grad_norm": 2.1115027178358354, + "language_loss": 0.82665265, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.90441489, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18164062, + "step": 4530, + "time_per_iteration": 2.586454391479492 + }, + { + "auxiliary_loss_clip": 0.06502904, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06295159, + "balance_loss_mlp": 0.0126379, + "epoch": 0.27241845783856905, + "flos": 17352925159680.0, + "grad_norm": 2.154635693147181, + "language_loss": 0.92694783, + "learning_rate": 3.4145577592184838e-06, + "loss": 1.0048002, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18530273, + "step": 4531, + "time_per_iteration": 2.5160703659057617 + }, + { + "auxiliary_loss_clip": 0.06501545, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01257928, + "epoch": 0.272478581091237, + "flos": 24761278448640.0, + "grad_norm": 1.903467624841223, + "language_loss": 0.76781744, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84559143, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17919922, + "step": 4532, + "time_per_iteration": 2.568319082260132 + }, + { + "auxiliary_loss_clip": 0.06500092, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.0125448, + "epoch": 0.272538704343905, + "flos": 17895723909120.0, + "grad_norm": 2.5230523304945685, + "language_loss": 0.89717656, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97489792, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17565918, + "step": 4533, + "time_per_iteration": 2.538637399673462 + }, + { + "auxiliary_loss_clip": 0.06497633, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06294405, + "balance_loss_mlp": 0.01255559, + "epoch": 0.272598827596573, + "flos": 22939021779840.0, + "grad_norm": 1.9282389689502992, + "language_loss": 0.72213519, + "learning_rate": 3.413731546022929e-06, + "loss": 0.79983306, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16589355, + "step": 4534, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.06500763, + "auxiliary_loss_mlp": 0.01275564, + "balance_loss_clip": 0.06290451, + "balance_loss_mlp": 0.01255847, + "epoch": 0.27265895084924097, + "flos": 24244447265280.0, + "grad_norm": 1.8514773269853142, + "language_loss": 0.91784394, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99560714, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.19702148, + "step": 4535, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.06506651, + "auxiliary_loss_mlp": 0.01276542, + "balance_loss_clip": 0.06297188, + "balance_loss_mlp": 0.01258768, + "epoch": 0.27271907410190893, + "flos": 27019962708480.0, + "grad_norm": 1.7799258806344853, + "language_loss": 0.73195565, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80978757, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.17773438, + "step": 4536, + "time_per_iteration": 2.5590782165527344 + }, + { + "auxiliary_loss_clip": 0.06502935, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.0629502, + "balance_loss_mlp": 0.01257351, + "epoch": 0.2727791973545769, + "flos": 34460027568000.0, + "grad_norm": 1.8462150885541477, + "language_loss": 0.72167033, + "learning_rate": 3.41290485034781e-06, + "loss": 0.79945225, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17907715, + "step": 4537, + "time_per_iteration": 2.680515766143799 + }, + { + "auxiliary_loss_clip": 0.06501988, + "auxiliary_loss_mlp": 0.01276469, + "balance_loss_clip": 0.06293489, + "balance_loss_mlp": 0.0125829, + "epoch": 0.27283932060724486, + "flos": 15045842367360.0, + "grad_norm": 2.3888098238231503, + "language_loss": 0.78421736, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8620019, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.1817627, + "step": 4538, + "time_per_iteration": 2.4626059532165527 + }, + { + "auxiliary_loss_clip": 0.06506806, + "auxiliary_loss_mlp": 0.01275863, + "balance_loss_clip": 0.06298484, + "balance_loss_mlp": 0.01258566, + "epoch": 0.2728994438599128, + "flos": 21658767246720.0, + "grad_norm": 1.6357140094020364, + "language_loss": 0.90640903, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9842357, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17297363, + "step": 4539, + "time_per_iteration": 2.5629584789276123 + }, + { + "auxiliary_loss_clip": 0.06501281, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06294584, + "balance_loss_mlp": 0.01253778, + "epoch": 0.2729595671125808, + "flos": 17493313875840.0, + "grad_norm": 1.7229738452441967, + "language_loss": 0.88610893, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.96385098, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.19140625, + "step": 4540, + "time_per_iteration": 2.4959304332733154 + }, + { + "auxiliary_loss_clip": 0.06504017, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06294081, + "balance_loss_mlp": 0.0125744, + "epoch": 0.27301969036524876, + "flos": 19324249441920.0, + "grad_norm": 2.2191409784662, + "language_loss": 0.8242712, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.17053223, + "step": 4541, + "time_per_iteration": 2.550239086151123 + }, + { + "auxiliary_loss_clip": 0.06500127, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06291916, + "balance_loss_mlp": 0.01260431, + "epoch": 0.2730798136179167, + "flos": 21071427252480.0, + "grad_norm": 2.3060281935178795, + "language_loss": 0.80131608, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18664551, + "step": 4542, + "time_per_iteration": 2.519717216491699 + }, + { + "auxiliary_loss_clip": 0.06509651, + "auxiliary_loss_mlp": 0.01276731, + "balance_loss_clip": 0.06301565, + "balance_loss_mlp": 0.01258599, + "epoch": 0.2731399368705847, + "flos": 19177739377920.0, + "grad_norm": 1.9524817452008785, + "language_loss": 0.89606124, + "learning_rate": 3.411250012687582e-06, + "loss": 0.97392499, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18139648, + "step": 4543, + "time_per_iteration": 2.5182156562805176 + }, + { + "auxiliary_loss_clip": 0.06509942, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06297313, + "balance_loss_mlp": 0.012604, + "epoch": 0.27320006012325265, + "flos": 18294989414400.0, + "grad_norm": 2.101118642115193, + "language_loss": 0.64112943, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7190212, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.18823242, + "step": 4544, + "time_per_iteration": 2.482348918914795 + }, + { + "auxiliary_loss_clip": 0.06504791, + "auxiliary_loss_mlp": 0.01282982, + "balance_loss_clip": 0.06296986, + "balance_loss_mlp": 0.01264231, + "epoch": 0.2732601833759206, + "flos": 34869607125120.0, + "grad_norm": 1.6845842729353224, + "language_loss": 0.70290005, + "learning_rate": 3.410697971904651e-06, + "loss": 0.78077781, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.1875, + "step": 4545, + "time_per_iteration": 2.6779940128326416 + }, + { + "auxiliary_loss_clip": 0.06375119, + "auxiliary_loss_mlp": 0.01256033, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01252296, + "epoch": 0.2733203066285886, + "flos": 53929514534400.0, + "grad_norm": 0.7176798913576009, + "language_loss": 0.61676908, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6930806, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03729248, + "step": 4546, + "time_per_iteration": 3.1508243083953857 + }, + { + "auxiliary_loss_clip": 0.06510071, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06301852, + "balance_loss_mlp": 0.01258843, + "epoch": 0.2733804298812566, + "flos": 20665411493760.0, + "grad_norm": 1.9095347334938924, + "language_loss": 0.65170372, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72958136, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.1887207, + "step": 4547, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.06498976, + "auxiliary_loss_mlp": 0.0127425, + "balance_loss_clip": 0.06296893, + "balance_loss_mlp": 0.01257799, + "epoch": 0.27344055313392457, + "flos": 25891333338240.0, + "grad_norm": 2.438857151480637, + "language_loss": 0.78365928, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.86139154, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.16455078, + "step": 4548, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.0650417, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06295689, + "balance_loss_mlp": 0.01259785, + "epoch": 0.27350067638659253, + "flos": 22936380376320.0, + "grad_norm": 2.3129649243249157, + "language_loss": 0.83350241, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91131258, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17053223, + "step": 4549, + "time_per_iteration": 2.560349941253662 + }, + { + "auxiliary_loss_clip": 0.06503863, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06292209, + "balance_loss_mlp": 0.0125707, + "epoch": 0.2735607996392605, + "flos": 16579313539200.0, + "grad_norm": 2.1355332193902568, + "language_loss": 0.71687186, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.79468852, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.20727539, + "step": 4550, + "time_per_iteration": 2.4829771518707275 + }, + { + "auxiliary_loss_clip": 0.06503724, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06298332, + "balance_loss_mlp": 0.01253435, + "epoch": 0.27362092289192846, + "flos": 19651245200640.0, + "grad_norm": 2.4590448673698546, + "language_loss": 0.79561722, + "learning_rate": 3.409040566039563e-06, + "loss": 0.87337267, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.18371582, + "step": 4551, + "time_per_iteration": 2.5074269771575928 + }, + { + "auxiliary_loss_clip": 0.06500211, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 0.06290769, + "balance_loss_mlp": 0.01263565, + "epoch": 0.27368104614459643, + "flos": 17644855184640.0, + "grad_norm": 2.2858009613836465, + "language_loss": 0.71362597, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.79144663, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.18286133, + "step": 4552, + "time_per_iteration": 2.478208541870117 + }, + { + "auxiliary_loss_clip": 0.0650662, + "auxiliary_loss_mlp": 0.01277463, + "balance_loss_clip": 0.06295393, + "balance_loss_mlp": 0.01258759, + "epoch": 0.2737411693972644, + "flos": 21586455573120.0, + "grad_norm": 1.8660820035104149, + "language_loss": 0.71756262, + "learning_rate": 3.408487669858431e-06, + "loss": 0.79540348, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18701172, + "step": 4553, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.0650337, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06293483, + "balance_loss_mlp": 0.01255738, + "epoch": 0.27380129264993236, + "flos": 25491145438080.0, + "grad_norm": 1.7561499880950933, + "language_loss": 0.60065031, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.67843306, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.19177246, + "step": 4554, + "time_per_iteration": 2.5836522579193115 + }, + { + "auxiliary_loss_clip": 0.06509934, + "auxiliary_loss_mlp": 0.01281174, + "balance_loss_clip": 0.06291255, + "balance_loss_mlp": 0.01261838, + "epoch": 0.2738614159026003, + "flos": 18667155323520.0, + "grad_norm": 1.5632450212680145, + "language_loss": 0.74850649, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.82641757, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1932373, + "step": 4555, + "time_per_iteration": 3.9590039253234863 + }, + { + "auxiliary_loss_clip": 0.06511028, + "auxiliary_loss_mlp": 0.01279514, + "balance_loss_clip": 0.0629926, + "balance_loss_mlp": 0.0125982, + "epoch": 0.2739215391552683, + "flos": 23483874954240.0, + "grad_norm": 6.994475758797384, + "language_loss": 0.7822473, + "learning_rate": 3.407657925038002e-06, + "loss": 0.86015272, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19677734, + "step": 4556, + "time_per_iteration": 2.5688674449920654 + }, + { + "auxiliary_loss_clip": 0.06517123, + "auxiliary_loss_mlp": 0.01280796, + "balance_loss_clip": 0.06293104, + "balance_loss_mlp": 0.01260125, + "epoch": 0.27398166240793626, + "flos": 17134313057280.0, + "grad_norm": 1.8677949115203087, + "language_loss": 0.83077759, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.90875673, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.20690918, + "step": 4557, + "time_per_iteration": 2.490562915802002 + }, + { + "auxiliary_loss_clip": 0.06504503, + "auxiliary_loss_mlp": 0.01276773, + "balance_loss_clip": 0.06292793, + "balance_loss_mlp": 0.01256292, + "epoch": 0.2740417856606042, + "flos": 23411563280640.0, + "grad_norm": 1.9738441909854203, + "language_loss": 0.73066616, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.80847895, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.20483398, + "step": 4558, + "time_per_iteration": 2.5761232376098633 + }, + { + "auxiliary_loss_clip": 0.06504066, + "auxiliary_loss_mlp": 0.01276845, + "balance_loss_clip": 0.06292865, + "balance_loss_mlp": 0.01256651, + "epoch": 0.2741019089132722, + "flos": 12784307068800.0, + "grad_norm": 2.149984670873407, + "language_loss": 0.68751299, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76532209, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.2019043, + "step": 4559, + "time_per_iteration": 2.4976439476013184 + }, + { + "auxiliary_loss_clip": 0.06501673, + "auxiliary_loss_mlp": 0.01278249, + "balance_loss_clip": 0.0629222, + "balance_loss_mlp": 0.01259676, + "epoch": 0.27416203216594015, + "flos": 20637850700160.0, + "grad_norm": 1.7403202614473876, + "language_loss": 0.72741163, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.80521083, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18566895, + "step": 4560, + "time_per_iteration": 4.005557537078857 + }, + { + "auxiliary_loss_clip": 0.06501405, + "auxiliary_loss_mlp": 0.01278052, + "balance_loss_clip": 0.06289977, + "balance_loss_mlp": 0.01259718, + "epoch": 0.27422215541860817, + "flos": 26548762872960.0, + "grad_norm": 1.7791790627265829, + "language_loss": 0.82245278, + "learning_rate": 3.406273949573303e-06, + "loss": 0.90024734, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18334961, + "step": 4561, + "time_per_iteration": 4.059048652648926 + }, + { + "auxiliary_loss_clip": 0.06510133, + "auxiliary_loss_mlp": 0.01276094, + "balance_loss_clip": 0.06296331, + "balance_loss_mlp": 0.012564, + "epoch": 0.27428227867127614, + "flos": 23337868014720.0, + "grad_norm": 1.9098162884662422, + "language_loss": 0.75760031, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83546257, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19702148, + "step": 4562, + "time_per_iteration": 2.558397054672241 + }, + { + "auxiliary_loss_clip": 0.06506505, + "auxiliary_loss_mlp": 0.01277189, + "balance_loss_clip": 0.06293164, + "balance_loss_mlp": 0.01258092, + "epoch": 0.2743424019239441, + "flos": 23041074453120.0, + "grad_norm": 1.577834756327151, + "language_loss": 0.75198597, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.8298229, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19091797, + "step": 4563, + "time_per_iteration": 2.5698354244232178 + }, + { + "auxiliary_loss_clip": 0.06524341, + "auxiliary_loss_mlp": 0.01283879, + "balance_loss_clip": 0.06305183, + "balance_loss_mlp": 0.01262302, + "epoch": 0.27440252517661207, + "flos": 21987565868160.0, + "grad_norm": 2.0193615345580085, + "language_loss": 0.6348893, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.71297145, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21569824, + "step": 4564, + "time_per_iteration": 2.545741558074951 + }, + { + "auxiliary_loss_clip": 0.06513885, + "auxiliary_loss_mlp": 0.01280066, + "balance_loss_clip": 0.06299828, + "balance_loss_mlp": 0.01260647, + "epoch": 0.27446264842928003, + "flos": 40196952737280.0, + "grad_norm": 2.2005709679787153, + "language_loss": 0.7878077, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.86574721, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 4565, + "time_per_iteration": 2.7061169147491455 + }, + { + "auxiliary_loss_clip": 0.0650921, + "auxiliary_loss_mlp": 0.01277346, + "balance_loss_clip": 0.06296623, + "balance_loss_mlp": 0.01257903, + "epoch": 0.274522771681948, + "flos": 13484684620800.0, + "grad_norm": 1.9604173340299715, + "language_loss": 0.69729757, + "learning_rate": 3.404888640957477e-06, + "loss": 0.77516317, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19458008, + "step": 4566, + "time_per_iteration": 3.9156126976013184 + }, + { + "auxiliary_loss_clip": 0.06511474, + "auxiliary_loss_mlp": 0.0128161, + "balance_loss_clip": 0.06300822, + "balance_loss_mlp": 0.0126318, + "epoch": 0.27458289493461596, + "flos": 28629812476800.0, + "grad_norm": 1.605297231279352, + "language_loss": 0.61699307, + "learning_rate": 3.404611419371723e-06, + "loss": 0.69492388, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18432617, + "step": 4567, + "time_per_iteration": 2.5721306800842285 + }, + { + "auxiliary_loss_clip": 0.06514515, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06299441, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2746430181872839, + "flos": 20125883053440.0, + "grad_norm": 1.9422441687055725, + "language_loss": 0.83055782, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.90845764, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19970703, + "step": 4568, + "time_per_iteration": 2.5616700649261475 + }, + { + "auxiliary_loss_clip": 0.06521738, + "auxiliary_loss_mlp": 0.01275653, + "balance_loss_clip": 0.06304733, + "balance_loss_mlp": 0.01255709, + "epoch": 0.2747031414399519, + "flos": 20199662173440.0, + "grad_norm": 2.1285143693034367, + "language_loss": 0.6896143, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.76758814, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19934082, + "step": 4569, + "time_per_iteration": 2.531096935272217 + }, + { + "auxiliary_loss_clip": 0.06517979, + "auxiliary_loss_mlp": 0.01281496, + "balance_loss_clip": 0.06303072, + "balance_loss_mlp": 0.0126216, + "epoch": 0.27476326469261986, + "flos": 13521385509120.0, + "grad_norm": 2.4613635331126926, + "language_loss": 0.71897286, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.79696763, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19360352, + "step": 4570, + "time_per_iteration": 2.5235774517059326 + }, + { + "auxiliary_loss_clip": 0.06414898, + "auxiliary_loss_mlp": 0.01257276, + "balance_loss_clip": 0.06312878, + "balance_loss_mlp": 0.01253897, + "epoch": 0.2748233879452878, + "flos": 65955486153600.0, + "grad_norm": 0.6977768363268191, + "language_loss": 0.5577414, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.63446319, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03387451, + "step": 4571, + "time_per_iteration": 3.234433889389038 + }, + { + "auxiliary_loss_clip": 0.06526154, + "auxiliary_loss_mlp": 0.01279423, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01260326, + "epoch": 0.2748835111979558, + "flos": 17389961464320.0, + "grad_norm": 2.165338105639142, + "language_loss": 0.78105313, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.85910892, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19104004, + "step": 4572, + "time_per_iteration": 2.562450647354126 + }, + { + "auxiliary_loss_clip": 0.06506811, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01261711, + "epoch": 0.27494363445062375, + "flos": 23594480743680.0, + "grad_norm": 2.0912194071895014, + "language_loss": 0.81855798, + "learning_rate": 3.402946971702147e-06, + "loss": 0.89641118, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.16809082, + "step": 4573, + "time_per_iteration": 2.575467824935913 + }, + { + "auxiliary_loss_clip": 0.06512269, + "auxiliary_loss_mlp": 0.01277933, + "balance_loss_clip": 0.06303579, + "balance_loss_mlp": 0.01258585, + "epoch": 0.2750037577032918, + "flos": 17170175404800.0, + "grad_norm": 1.5550185346959569, + "language_loss": 0.79688454, + "learning_rate": 3.402669377496223e-06, + "loss": 0.87478662, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19360352, + "step": 4574, + "time_per_iteration": 2.522381067276001 + }, + { + "auxiliary_loss_clip": 0.06514049, + "auxiliary_loss_mlp": 0.012813, + "balance_loss_clip": 0.06300252, + "balance_loss_mlp": 0.01263383, + "epoch": 0.27506388095595974, + "flos": 24497663904000.0, + "grad_norm": 1.9638366231768782, + "language_loss": 0.75217533, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83012879, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.17907715, + "step": 4575, + "time_per_iteration": 2.564023971557617 + }, + { + "auxiliary_loss_clip": 0.06513455, + "auxiliary_loss_mlp": 0.01285217, + "balance_loss_clip": 0.06304657, + "balance_loss_mlp": 0.01267562, + "epoch": 0.2751240042086277, + "flos": 38774003500800.0, + "grad_norm": 1.5894976166299741, + "language_loss": 0.71788073, + "learning_rate": 3.402114029526814e-06, + "loss": 0.79586744, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17663574, + "step": 4576, + "time_per_iteration": 2.6856141090393066 + }, + { + "auxiliary_loss_clip": 0.06515673, + "auxiliary_loss_mlp": 0.01294199, + "balance_loss_clip": 0.06304252, + "balance_loss_mlp": 0.0127447, + "epoch": 0.27518412746129567, + "flos": 26914388163840.0, + "grad_norm": 1.693116107866749, + "language_loss": 0.73358452, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81168324, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19726562, + "step": 4577, + "time_per_iteration": 2.5795719623565674 + }, + { + "auxiliary_loss_clip": 0.06517484, + "auxiliary_loss_mlp": 0.01279945, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01260514, + "epoch": 0.27524425071396363, + "flos": 24907578877440.0, + "grad_norm": 1.9498672791378742, + "language_loss": 0.76234132, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84031564, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19433594, + "step": 4578, + "time_per_iteration": 2.5547378063201904 + }, + { + "auxiliary_loss_clip": 0.06518476, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01265255, + "epoch": 0.2753043739666316, + "flos": 26295504307200.0, + "grad_norm": 1.3718100748583155, + "language_loss": 0.66504484, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.74309289, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21069336, + "step": 4579, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.06516613, + "auxiliary_loss_mlp": 0.01291851, + "balance_loss_clip": 0.06301446, + "balance_loss_mlp": 0.01271753, + "epoch": 0.27536449721929956, + "flos": 24213616162560.0, + "grad_norm": 3.1986582184359853, + "language_loss": 0.80722374, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88530838, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2010498, + "step": 4580, + "time_per_iteration": 2.571364164352417 + }, + { + "auxiliary_loss_clip": 0.06513728, + "auxiliary_loss_mlp": 0.01285107, + "balance_loss_clip": 0.06304168, + "balance_loss_mlp": 0.01264305, + "epoch": 0.27542462047196753, + "flos": 19543448522880.0, + "grad_norm": 1.580662182314359, + "language_loss": 0.68234229, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76033062, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.20788574, + "step": 4581, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.06515522, + "auxiliary_loss_mlp": 0.01276377, + "balance_loss_clip": 0.06298342, + "balance_loss_mlp": 0.01258448, + "epoch": 0.2754847437246355, + "flos": 14324360785920.0, + "grad_norm": 1.5474830525473977, + "language_loss": 0.78408682, + "learning_rate": 3.400446709916392e-06, + "loss": 0.86200583, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17919922, + "step": 4582, + "time_per_iteration": 2.511134624481201 + }, + { + "auxiliary_loss_clip": 0.06505451, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06298563, + "balance_loss_mlp": 0.01266605, + "epoch": 0.27554486697730346, + "flos": 18843951438720.0, + "grad_norm": 1.627014419094476, + "language_loss": 0.84829235, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92618936, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17663574, + "step": 4583, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.065156, + "auxiliary_loss_mlp": 0.01295136, + "balance_loss_clip": 0.0629985, + "balance_loss_mlp": 0.01274799, + "epoch": 0.2756049902299714, + "flos": 22388801944320.0, + "grad_norm": 2.5216327683147104, + "language_loss": 0.67592049, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.75402784, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20349121, + "step": 4584, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.06508277, + "auxiliary_loss_mlp": 0.01286302, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01268385, + "epoch": 0.2756651134826394, + "flos": 19580107484160.0, + "grad_norm": 1.7056038485870715, + "language_loss": 0.77640843, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8543542, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17919922, + "step": 4585, + "time_per_iteration": 2.5581910610198975 + }, + { + "auxiliary_loss_clip": 0.06520131, + "auxiliary_loss_mlp": 0.01290999, + "balance_loss_clip": 0.06302814, + "balance_loss_mlp": 0.01271151, + "epoch": 0.27572523673530736, + "flos": 23593306786560.0, + "grad_norm": 1.6012607614221503, + "language_loss": 0.72652835, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8046397, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.1986084, + "step": 4586, + "time_per_iteration": 2.5581955909729004 + }, + { + "auxiliary_loss_clip": 0.06512299, + "auxiliary_loss_mlp": 0.01283131, + "balance_loss_clip": 0.06300563, + "balance_loss_mlp": 0.01264475, + "epoch": 0.2757853599879754, + "flos": 22826696981760.0, + "grad_norm": 1.4211606049909042, + "language_loss": 0.8102116, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88816595, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18664551, + "step": 4587, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.0651072, + "auxiliary_loss_mlp": 0.01292397, + "balance_loss_clip": 0.06300361, + "balance_loss_mlp": 0.01273037, + "epoch": 0.27584548324064334, + "flos": 18557639637120.0, + "grad_norm": 2.3677019636161716, + "language_loss": 0.83699477, + "learning_rate": 3.398777478523316e-06, + "loss": 0.91502589, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.19348145, + "step": 4588, + "time_per_iteration": 2.5100526809692383 + }, + { + "auxiliary_loss_clip": 0.06502403, + "auxiliary_loss_mlp": 0.01287014, + "balance_loss_clip": 0.06294176, + "balance_loss_mlp": 0.0126856, + "epoch": 0.2759056064933113, + "flos": 23776811228160.0, + "grad_norm": 1.8520309888563375, + "language_loss": 0.76066566, + "learning_rate": 3.398499087583342e-06, + "loss": 0.83855987, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.18457031, + "step": 4589, + "time_per_iteration": 2.5906028747558594 + }, + { + "auxiliary_loss_clip": 0.06503198, + "auxiliary_loss_mlp": 0.01281135, + "balance_loss_clip": 0.06293473, + "balance_loss_mlp": 0.01261703, + "epoch": 0.27596572974597927, + "flos": 24289114291200.0, + "grad_norm": 1.7619688929899446, + "language_loss": 0.88857687, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96642017, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19421387, + "step": 4590, + "time_per_iteration": 2.5526933670043945 + }, + { + "auxiliary_loss_clip": 0.0650104, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06291595, + "balance_loss_mlp": 0.01261041, + "epoch": 0.27602585299864724, + "flos": 35049296206080.0, + "grad_norm": 1.573202994920717, + "language_loss": 0.71835011, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79615998, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.18908691, + "step": 4591, + "time_per_iteration": 2.659573554992676 + }, + { + "auxiliary_loss_clip": 0.06502488, + "auxiliary_loss_mlp": 0.01277501, + "balance_loss_clip": 0.06290874, + "balance_loss_mlp": 0.01258964, + "epoch": 0.2760859762513152, + "flos": 24315123784320.0, + "grad_norm": 2.0980893762293866, + "language_loss": 0.80327255, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.8810724, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.18530273, + "step": 4592, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06302959, + "balance_loss_mlp": 0.0126841, + "epoch": 0.27614609950398317, + "flos": 71279435675520.0, + "grad_norm": 0.6848268802880488, + "language_loss": 0.6162945, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.69306767, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03717041, + "step": 4593, + "time_per_iteration": 3.127192735671997 + }, + { + "auxiliary_loss_clip": 0.06506699, + "auxiliary_loss_mlp": 0.01276217, + "balance_loss_clip": 0.0629646, + "balance_loss_mlp": 0.01256881, + "epoch": 0.27620622275665113, + "flos": 29681811688320.0, + "grad_norm": 2.6081053554454363, + "language_loss": 0.77380788, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.85163713, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1932373, + "step": 4594, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06503148, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06295307, + "balance_loss_mlp": 0.01255138, + "epoch": 0.2762663460093191, + "flos": 15383571448320.0, + "grad_norm": 1.4453472339612206, + "language_loss": 0.9229176, + "learning_rate": 3.3968276286573866e-06, + "loss": 1.00068069, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18029785, + "step": 4595, + "time_per_iteration": 3.9466536045074463 + }, + { + "auxiliary_loss_clip": 0.06509015, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.06294905, + "balance_loss_mlp": 0.01261592, + "epoch": 0.27632646926198706, + "flos": 20710330081920.0, + "grad_norm": 1.8151181533722092, + "language_loss": 0.69491673, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.77282476, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2019043, + "step": 4596, + "time_per_iteration": 2.552893877029419 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.0629788, + "balance_loss_mlp": 0.0125382, + "epoch": 0.276386592514655, + "flos": 32820981851520.0, + "grad_norm": 1.6734752779014743, + "language_loss": 0.64091378, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.71881258, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.18652344, + "step": 4597, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.06500123, + "auxiliary_loss_mlp": 0.01279427, + "balance_loss_clip": 0.0629456, + "balance_loss_mlp": 0.01260616, + "epoch": 0.276446715767323, + "flos": 18557639637120.0, + "grad_norm": 1.8925825739150304, + "language_loss": 0.86690855, + "learning_rate": 3.395991183985887e-06, + "loss": 0.94470406, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18835449, + "step": 4598, + "time_per_iteration": 2.5411598682403564 + }, + { + "auxiliary_loss_clip": 0.0650408, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.06291056, + "balance_loss_mlp": 0.01256554, + "epoch": 0.27650683901999096, + "flos": 22826110003200.0, + "grad_norm": 2.378506410601605, + "language_loss": 0.79588032, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8736738, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18725586, + "step": 4599, + "time_per_iteration": 2.515411138534546 + }, + { + "auxiliary_loss_clip": 0.06518425, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06301137, + "balance_loss_mlp": 0.01259756, + "epoch": 0.276566962272659, + "flos": 21368011178880.0, + "grad_norm": 2.1602669865212487, + "language_loss": 0.80043805, + "learning_rate": 3.395433289506639e-06, + "loss": 0.87841463, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.19482422, + "step": 4600, + "time_per_iteration": 5.317862033843994 + }, + { + "auxiliary_loss_clip": 0.06511359, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06296661, + "balance_loss_mlp": 0.01258843, + "epoch": 0.27662708552532694, + "flos": 17716076755200.0, + "grad_norm": 12.932121146702709, + "language_loss": 0.73461431, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.81249541, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.17907715, + "step": 4601, + "time_per_iteration": 2.5192854404449463 + }, + { + "auxiliary_loss_clip": 0.0650773, + "auxiliary_loss_mlp": 0.01282643, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01263676, + "epoch": 0.2766872087779949, + "flos": 21259292106240.0, + "grad_norm": 1.833059055741047, + "language_loss": 0.8051585, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.88306224, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18981934, + "step": 4602, + "time_per_iteration": 2.635265350341797 + }, + { + "auxiliary_loss_clip": 0.06517955, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06297721, + "balance_loss_mlp": 0.01259749, + "epoch": 0.2767473320306629, + "flos": 12936728845440.0, + "grad_norm": 2.082735068257359, + "language_loss": 0.7691201, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.8470962, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.19921875, + "step": 4603, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.06506386, + "auxiliary_loss_mlp": 0.01276601, + "balance_loss_clip": 0.06300791, + "balance_loss_mlp": 0.01259017, + "epoch": 0.27680745528333084, + "flos": 15018239646720.0, + "grad_norm": 1.5173997695974415, + "language_loss": 0.81704807, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89487797, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17578125, + "step": 4604, + "time_per_iteration": 2.5022366046905518 + }, + { + "auxiliary_loss_clip": 0.06510165, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 0.06295862, + "balance_loss_mlp": 0.01261367, + "epoch": 0.2768675785359988, + "flos": 22644408424320.0, + "grad_norm": 1.8407701121062605, + "language_loss": 0.70736969, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.78526795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.18310547, + "step": 4605, + "time_per_iteration": 4.068409442901611 + }, + { + "auxiliary_loss_clip": 0.06402105, + "auxiliary_loss_mlp": 0.01269906, + "balance_loss_clip": 0.0629937, + "balance_loss_mlp": 0.01266097, + "epoch": 0.27692770178866677, + "flos": 66150772093440.0, + "grad_norm": 0.7075303746126435, + "language_loss": 0.57218695, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.64890707, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.0380249, + "step": 4606, + "time_per_iteration": 3.269275426864624 + }, + { + "auxiliary_loss_clip": 0.06516754, + "auxiliary_loss_mlp": 0.01286288, + "balance_loss_clip": 0.06299627, + "balance_loss_mlp": 0.01266118, + "epoch": 0.27698782504133473, + "flos": 26471545735680.0, + "grad_norm": 1.9632725808751148, + "language_loss": 0.69427574, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77230614, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20153809, + "step": 4607, + "time_per_iteration": 2.566908836364746 + }, + { + "auxiliary_loss_clip": 0.06512889, + "auxiliary_loss_mlp": 0.01276778, + "balance_loss_clip": 0.06304939, + "balance_loss_mlp": 0.01258849, + "epoch": 0.2770479482940027, + "flos": 25891878389760.0, + "grad_norm": 1.6636880421304368, + "language_loss": 0.70338356, + "learning_rate": 3.393199595837555e-06, + "loss": 0.78128028, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17919922, + "step": 4608, + "time_per_iteration": 2.709989309310913 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 0.06298438, + "balance_loss_mlp": 0.01260781, + "epoch": 0.27710807154667066, + "flos": 22863942921600.0, + "grad_norm": 1.8326330841759049, + "language_loss": 0.73323762, + "learning_rate": 3.392920146281499e-06, + "loss": 0.81118023, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.18725586, + "step": 4609, + "time_per_iteration": 2.530625581741333 + }, + { + "auxiliary_loss_clip": 0.06522895, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.0125749, + "epoch": 0.27716819479933863, + "flos": 17716621806720.0, + "grad_norm": 2.1915868475112714, + "language_loss": 0.84688777, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92488557, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19396973, + "step": 4610, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.06521606, + "auxiliary_loss_mlp": 0.01280928, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260054, + "epoch": 0.2772283180520066, + "flos": 19652125668480.0, + "grad_norm": 1.9738462991775114, + "language_loss": 0.69718874, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.77521408, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20874023, + "step": 4611, + "time_per_iteration": 2.5499660968780518 + }, + { + "auxiliary_loss_clip": 0.0651576, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06309414, + "balance_loss_mlp": 0.01254997, + "epoch": 0.27728844130467456, + "flos": 21038960995200.0, + "grad_norm": 1.8677227151172762, + "language_loss": 0.74507141, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82296044, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18151855, + "step": 4612, + "time_per_iteration": 2.567218065261841 + }, + { + "auxiliary_loss_clip": 0.06522087, + "auxiliary_loss_mlp": 0.01282319, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01263067, + "epoch": 0.2773485645573425, + "flos": 18995157331200.0, + "grad_norm": 2.3882423035535063, + "language_loss": 0.67084455, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.19250488, + "step": 4613, + "time_per_iteration": 2.5458126068115234 + }, + { + "auxiliary_loss_clip": 0.06515062, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06304698, + "balance_loss_mlp": 0.0125577, + "epoch": 0.27740868781001055, + "flos": 21474508118400.0, + "grad_norm": 1.6100748666203144, + "language_loss": 0.79936564, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87727129, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19750977, + "step": 4614, + "time_per_iteration": 2.5586962699890137 + }, + { + "auxiliary_loss_clip": 0.06520429, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01257884, + "epoch": 0.2774688110626785, + "flos": 19833827247360.0, + "grad_norm": 2.249482091575283, + "language_loss": 0.81082475, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.88881981, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21179199, + "step": 4615, + "time_per_iteration": 2.5192136764526367 + }, + { + "auxiliary_loss_clip": 0.0652476, + "auxiliary_loss_mlp": 0.0127518, + "balance_loss_clip": 0.06306368, + "balance_loss_mlp": 0.01256655, + "epoch": 0.2775289343153465, + "flos": 18220916805120.0, + "grad_norm": 2.6879454427381715, + "language_loss": 0.64382082, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.72182024, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.18518066, + "step": 4616, + "time_per_iteration": 2.528766393661499 + }, + { + "auxiliary_loss_clip": 0.06523173, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 0.06308753, + "balance_loss_mlp": 0.0126377, + "epoch": 0.27758905756801444, + "flos": 16478141333760.0, + "grad_norm": 2.0768832102625296, + "language_loss": 0.82857239, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.90664852, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.20678711, + "step": 4617, + "time_per_iteration": 2.5130555629730225 + }, + { + "auxiliary_loss_clip": 0.06522305, + "auxiliary_loss_mlp": 0.01278739, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01260059, + "epoch": 0.2776491808206824, + "flos": 18733219868160.0, + "grad_norm": 2.583119020836192, + "language_loss": 0.77338278, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85139322, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18676758, + "step": 4618, + "time_per_iteration": 2.5491156578063965 + }, + { + "auxiliary_loss_clip": 0.06524394, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 0.06309742, + "balance_loss_mlp": 0.01260191, + "epoch": 0.27770930407335037, + "flos": 28045742791680.0, + "grad_norm": 1.764934716544716, + "language_loss": 0.85733759, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93535626, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.17297363, + "step": 4619, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.06514929, + "auxiliary_loss_mlp": 0.01285121, + "balance_loss_clip": 0.06308962, + "balance_loss_mlp": 0.01266798, + "epoch": 0.27776942732601834, + "flos": 23556522044160.0, + "grad_norm": 1.4813387132666624, + "language_loss": 0.77092409, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84892452, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18322754, + "step": 4620, + "time_per_iteration": 2.690934658050537 + }, + { + "auxiliary_loss_clip": 0.0651743, + "auxiliary_loss_mlp": 0.01277569, + "balance_loss_clip": 0.06309397, + "balance_loss_mlp": 0.0125821, + "epoch": 0.2778295505786863, + "flos": 23914474686720.0, + "grad_norm": 1.8907472710416175, + "language_loss": 0.78585863, + "learning_rate": 3.389562634707122e-06, + "loss": 0.86380863, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.19360352, + "step": 4621, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.06522836, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.0630835, + "balance_loss_mlp": 0.01259701, + "epoch": 0.27788967383135427, + "flos": 25561276905600.0, + "grad_norm": 2.170367430288875, + "language_loss": 0.88217753, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96019584, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.1932373, + "step": 4622, + "time_per_iteration": 2.6036407947540283 + }, + { + "auxiliary_loss_clip": 0.06512653, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06299745, + "balance_loss_mlp": 0.01254919, + "epoch": 0.27794979708402223, + "flos": 16258103712000.0, + "grad_norm": 2.5896700244630018, + "language_loss": 0.81515396, + "learning_rate": 3.389002311256369e-06, + "loss": 0.89301395, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18432617, + "step": 4623, + "time_per_iteration": 2.539655923843384 + }, + { + "auxiliary_loss_clip": 0.06518189, + "auxiliary_loss_mlp": 0.01278229, + "balance_loss_clip": 0.06306686, + "balance_loss_mlp": 0.01258941, + "epoch": 0.2780099203366902, + "flos": 20673880755840.0, + "grad_norm": 1.9609752985345037, + "language_loss": 0.82099682, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.89896095, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.19274902, + "step": 4624, + "time_per_iteration": 2.5662107467651367 + }, + { + "auxiliary_loss_clip": 0.06512089, + "auxiliary_loss_mlp": 0.01276338, + "balance_loss_clip": 0.06303106, + "balance_loss_mlp": 0.01258004, + "epoch": 0.27807004358935816, + "flos": 17743805256960.0, + "grad_norm": 3.013190567677447, + "language_loss": 0.77269506, + "learning_rate": 3.388441777121191e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.18322754, + "step": 4625, + "time_per_iteration": 2.5685927867889404 + }, + { + "auxiliary_loss_clip": 0.06507699, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06299223, + "balance_loss_mlp": 0.01253658, + "epoch": 0.2781301668420261, + "flos": 16732699637760.0, + "grad_norm": 1.9769276375727096, + "language_loss": 0.70884871, + "learning_rate": 3.388161431073511e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17883301, + "step": 4626, + "time_per_iteration": 2.527975559234619 + }, + { + "auxiliary_loss_clip": 0.06520554, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06304689, + "balance_loss_mlp": 0.01254798, + "epoch": 0.27819029009469415, + "flos": 13849848714240.0, + "grad_norm": 2.4481240639566013, + "language_loss": 0.93016249, + "learning_rate": 3.38788103238661e-06, + "loss": 1.00810015, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.18432617, + "step": 4627, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.06514014, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06298277, + "balance_loss_mlp": 0.01258364, + "epoch": 0.2782504133473621, + "flos": 27096634794240.0, + "grad_norm": 1.6603793888564844, + "language_loss": 0.85558021, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93348801, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1842041, + "step": 4628, + "time_per_iteration": 2.56680965423584 + }, + { + "auxiliary_loss_clip": 0.06511193, + "auxiliary_loss_mlp": 0.01275379, + "balance_loss_clip": 0.06301076, + "balance_loss_mlp": 0.01257569, + "epoch": 0.2783105366000301, + "flos": 21075116832000.0, + "grad_norm": 1.7183700627805243, + "language_loss": 0.79370463, + "learning_rate": 3.387320077137679e-06, + "loss": 0.87157035, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17810059, + "step": 4629, + "time_per_iteration": 2.579024076461792 + }, + { + "auxiliary_loss_clip": 0.06504764, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06300465, + "balance_loss_mlp": 0.01259699, + "epoch": 0.27837065985269804, + "flos": 26508456259200.0, + "grad_norm": 2.4632649346037856, + "language_loss": 0.84664094, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.92446071, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17529297, + "step": 4630, + "time_per_iteration": 2.568190336227417 + }, + { + "auxiliary_loss_clip": 0.06516108, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 0.06302783, + "balance_loss_mlp": 0.01253395, + "epoch": 0.278430783105366, + "flos": 20228271143040.0, + "grad_norm": 1.8872458968592738, + "language_loss": 0.80858278, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8864556, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17773438, + "step": 4631, + "time_per_iteration": 2.5658912658691406 + }, + { + "auxiliary_loss_clip": 0.06512441, + "auxiliary_loss_mlp": 0.01275522, + "balance_loss_clip": 0.06299636, + "balance_loss_mlp": 0.01256866, + "epoch": 0.278490906358034, + "flos": 25599906437760.0, + "grad_norm": 2.407277572133289, + "language_loss": 0.715128, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.79300761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18652344, + "step": 4632, + "time_per_iteration": 2.620729446411133 + }, + { + "auxiliary_loss_clip": 0.06502309, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296511, + "balance_loss_mlp": 0.01253502, + "epoch": 0.27855102961070194, + "flos": 16175645694720.0, + "grad_norm": 1.8302171024684264, + "language_loss": 0.82394838, + "learning_rate": 3.386197535437145e-06, + "loss": 0.9016794, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17297363, + "step": 4633, + "time_per_iteration": 2.513705015182495 + }, + { + "auxiliary_loss_clip": 0.06511516, + "auxiliary_loss_mlp": 0.01278904, + "balance_loss_clip": 0.06299913, + "balance_loss_mlp": 0.012597, + "epoch": 0.2786111528633699, + "flos": 22933864753920.0, + "grad_norm": 1.5843012688553681, + "language_loss": 0.8872478, + "learning_rate": 3.385916768573529e-06, + "loss": 0.96515197, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19213867, + "step": 4634, + "time_per_iteration": 2.5471088886260986 + }, + { + "auxiliary_loss_clip": 0.06514788, + "auxiliary_loss_mlp": 0.01276007, + "balance_loss_clip": 0.06301814, + "balance_loss_mlp": 0.01256588, + "epoch": 0.27867127611603787, + "flos": 23410934375040.0, + "grad_norm": 1.5369483246730489, + "language_loss": 0.77466059, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85256851, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19433594, + "step": 4635, + "time_per_iteration": 3.9016311168670654 + }, + { + "auxiliary_loss_clip": 0.06508552, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06295648, + "balance_loss_mlp": 0.01254859, + "epoch": 0.27873139936870583, + "flos": 19835210839680.0, + "grad_norm": 1.7801998538005617, + "language_loss": 0.66571766, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74353385, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18188477, + "step": 4636, + "time_per_iteration": 2.5264599323272705 + }, + { + "auxiliary_loss_clip": 0.06519878, + "auxiliary_loss_mlp": 0.01275894, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01256392, + "epoch": 0.2787915226213738, + "flos": 17712638737920.0, + "grad_norm": 2.933733922484583, + "language_loss": 0.83255613, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.91051382, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19506836, + "step": 4637, + "time_per_iteration": 2.5344014167785645 + }, + { + "auxiliary_loss_clip": 0.06505676, + "auxiliary_loss_mlp": 0.01276787, + "balance_loss_clip": 0.06297021, + "balance_loss_mlp": 0.01258918, + "epoch": 0.27885164587404176, + "flos": 22097039627520.0, + "grad_norm": 1.4932909871395708, + "language_loss": 0.76038569, + "learning_rate": 3.384793175684533e-06, + "loss": 0.83821034, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17871094, + "step": 4638, + "time_per_iteration": 2.544187068939209 + }, + { + "auxiliary_loss_clip": 0.06510019, + "auxiliary_loss_mlp": 0.01280274, + "balance_loss_clip": 0.06297282, + "balance_loss_mlp": 0.01262511, + "epoch": 0.27891176912670973, + "flos": 19213601725440.0, + "grad_norm": 2.235877812045319, + "language_loss": 0.72492748, + "learning_rate": 3.38451214615691e-06, + "loss": 0.8028304, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17749023, + "step": 4639, + "time_per_iteration": 4.002680063247681 + }, + { + "auxiliary_loss_clip": 0.06515414, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.01254813, + "epoch": 0.27897189237937775, + "flos": 27607428483840.0, + "grad_norm": 1.8877142592522154, + "language_loss": 0.66217673, + "learning_rate": 3.384231064128447e-06, + "loss": 0.74006808, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.18896484, + "step": 4640, + "time_per_iteration": 4.054874420166016 + }, + { + "auxiliary_loss_clip": 0.0651349, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06301108, + "balance_loss_mlp": 0.01254654, + "epoch": 0.2790320156320457, + "flos": 21184506737280.0, + "grad_norm": 2.077527470737851, + "language_loss": 0.72818768, + "learning_rate": 3.383949929609804e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.1796875, + "step": 4641, + "time_per_iteration": 2.566758155822754 + }, + { + "auxiliary_loss_clip": 0.06517549, + "auxiliary_loss_mlp": 0.01276062, + "balance_loss_clip": 0.06298883, + "balance_loss_mlp": 0.01256488, + "epoch": 0.2790921388847137, + "flos": 22790541144960.0, + "grad_norm": 1.8548696214163785, + "language_loss": 0.75277239, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8307085, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19567871, + "step": 4642, + "time_per_iteration": 2.5531389713287354 + }, + { + "auxiliary_loss_clip": 0.0651103, + "auxiliary_loss_mlp": 0.01281312, + "balance_loss_clip": 0.06296819, + "balance_loss_mlp": 0.01261631, + "epoch": 0.27915226213738165, + "flos": 23406783598080.0, + "grad_norm": 1.8301300365045747, + "language_loss": 0.85787475, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93579817, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19689941, + "step": 4643, + "time_per_iteration": 2.561692714691162 + }, + { + "auxiliary_loss_clip": 0.06505755, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06292956, + "balance_loss_mlp": 0.01262572, + "epoch": 0.2792123853900496, + "flos": 22754469162240.0, + "grad_norm": 2.128449816262669, + "language_loss": 0.83027583, + "learning_rate": 3.383106211219407e-06, + "loss": 0.9081434, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1842041, + "step": 4644, + "time_per_iteration": 2.5298962593078613 + }, + { + "auxiliary_loss_clip": 0.06505448, + "auxiliary_loss_mlp": 0.01273805, + "balance_loss_clip": 0.0629155, + "balance_loss_mlp": 0.01256174, + "epoch": 0.2792725086427176, + "flos": 15054772826880.0, + "grad_norm": 1.7497246062339578, + "language_loss": 0.79546082, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.87325335, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.17626953, + "step": 4645, + "time_per_iteration": 3.9172677993774414 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01254208, + "balance_loss_clip": 0.0631457, + "balance_loss_mlp": 0.0125017, + "epoch": 0.27933263189538554, + "flos": 62562805862400.0, + "grad_norm": 0.7707831229317741, + "language_loss": 0.62136066, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.6980933, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04037476, + "step": 4646, + "time_per_iteration": 3.1527390480041504 + }, + { + "auxiliary_loss_clip": 0.06500821, + "auxiliary_loss_mlp": 0.01275319, + "balance_loss_clip": 0.0629313, + "balance_loss_mlp": 0.01257581, + "epoch": 0.2793927551480535, + "flos": 25125268584960.0, + "grad_norm": 1.6018723981737446, + "language_loss": 0.89582062, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.97358203, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17736816, + "step": 4647, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06509704, + "auxiliary_loss_mlp": 0.01277108, + "balance_loss_clip": 0.06292088, + "balance_loss_mlp": 0.01258142, + "epoch": 0.27945287840072147, + "flos": 21330974874240.0, + "grad_norm": 1.6381839497334347, + "language_loss": 0.87525821, + "learning_rate": 3.381980519149988e-06, + "loss": 0.95312631, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.1895752, + "step": 4648, + "time_per_iteration": 2.5516953468322754 + }, + { + "auxiliary_loss_clip": 0.06507549, + "auxiliary_loss_mlp": 0.01274847, + "balance_loss_clip": 0.06291072, + "balance_loss_mlp": 0.01256643, + "epoch": 0.27951300165338944, + "flos": 27457354621440.0, + "grad_norm": 2.652634800411286, + "language_loss": 0.73020303, + "learning_rate": 3.38169896509385e-06, + "loss": 0.80802703, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18212891, + "step": 4649, + "time_per_iteration": 2.5767719745635986 + }, + { + "auxiliary_loss_clip": 0.06508242, + "auxiliary_loss_mlp": 0.01277361, + "balance_loss_clip": 0.0629622, + "balance_loss_mlp": 0.01259003, + "epoch": 0.2795731249060574, + "flos": 15164456221440.0, + "grad_norm": 2.110277953429804, + "language_loss": 0.81314564, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8910017, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18347168, + "step": 4650, + "time_per_iteration": 2.663588285446167 + }, + { + "auxiliary_loss_clip": 0.06406052, + "auxiliary_loss_mlp": 0.01252705, + "balance_loss_clip": 0.06303374, + "balance_loss_mlp": 0.01248944, + "epoch": 0.27963324815872537, + "flos": 60140951775360.0, + "grad_norm": 0.800089640521837, + "language_loss": 0.5874877, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.66407531, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03753662, + "step": 4651, + "time_per_iteration": 3.205563545227051 + }, + { + "auxiliary_loss_clip": 0.06513405, + "auxiliary_loss_mlp": 0.01276159, + "balance_loss_clip": 0.06293929, + "balance_loss_mlp": 0.01257205, + "epoch": 0.27969337141139333, + "flos": 21773020688640.0, + "grad_norm": 1.70848848544609, + "language_loss": 0.74928713, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82718277, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18945312, + "step": 4652, + "time_per_iteration": 2.620284080505371 + }, + { + "auxiliary_loss_clip": 0.06513481, + "auxiliary_loss_mlp": 0.01277362, + "balance_loss_clip": 0.06297033, + "balance_loss_mlp": 0.01259517, + "epoch": 0.27975349466406135, + "flos": 39859559072640.0, + "grad_norm": 2.257859492249039, + "language_loss": 0.81193566, + "learning_rate": 3.380572225034461e-06, + "loss": 0.88984406, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.17834473, + "step": 4653, + "time_per_iteration": 2.6902103424072266 + }, + { + "auxiliary_loss_clip": 0.06505801, + "auxiliary_loss_mlp": 0.01275903, + "balance_loss_clip": 0.06293398, + "balance_loss_mlp": 0.01257939, + "epoch": 0.2798136179167293, + "flos": 21586204010880.0, + "grad_norm": 2.2005279612587647, + "language_loss": 0.78939915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.86721623, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17956543, + "step": 4654, + "time_per_iteration": 2.5862321853637695 + }, + { + "auxiliary_loss_clip": 0.06514826, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06294681, + "balance_loss_mlp": 0.01256457, + "epoch": 0.2798737411693973, + "flos": 21543130212480.0, + "grad_norm": 2.786817882874951, + "language_loss": 0.81491858, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.89283288, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20153809, + "step": 4655, + "time_per_iteration": 2.5335962772369385 + }, + { + "auxiliary_loss_clip": 0.06503223, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06287771, + "balance_loss_mlp": 0.0125778, + "epoch": 0.27993386442206525, + "flos": 26988586554240.0, + "grad_norm": 1.7572759264995625, + "language_loss": 0.82015479, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.89795309, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4656, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0650457, + "auxiliary_loss_mlp": 0.01280726, + "balance_loss_clip": 0.06291523, + "balance_loss_mlp": 0.01261319, + "epoch": 0.2799939876747332, + "flos": 24356268938880.0, + "grad_norm": 1.602501989097996, + "language_loss": 0.83292782, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.91078079, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19396973, + "step": 4657, + "time_per_iteration": 2.546698808670044 + }, + { + "auxiliary_loss_clip": 0.06501682, + "auxiliary_loss_mlp": 0.01283943, + "balance_loss_clip": 0.06287715, + "balance_loss_mlp": 0.01265847, + "epoch": 0.2800541109274012, + "flos": 33665479626240.0, + "grad_norm": 2.056920585114217, + "language_loss": 0.64474404, + "learning_rate": 3.379162622133105e-06, + "loss": 0.72260022, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18103027, + "step": 4658, + "time_per_iteration": 2.633352041244507 + }, + { + "auxiliary_loss_clip": 0.0650496, + "auxiliary_loss_mlp": 0.01278289, + "balance_loss_clip": 0.06292152, + "balance_loss_mlp": 0.01258298, + "epoch": 0.28011423418006914, + "flos": 21620515057920.0, + "grad_norm": 1.9139831777919125, + "language_loss": 0.78200769, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.85984015, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19995117, + "step": 4659, + "time_per_iteration": 2.5146000385284424 + }, + { + "auxiliary_loss_clip": 0.06512548, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 0.06298335, + "balance_loss_mlp": 0.01260582, + "epoch": 0.2801743574327371, + "flos": 23119130131200.0, + "grad_norm": 1.8180566150817747, + "language_loss": 0.79711032, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.87503254, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.1907959, + "step": 4660, + "time_per_iteration": 2.5558273792266846 + }, + { + "auxiliary_loss_clip": 0.06502102, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06293646, + "balance_loss_mlp": 0.01257732, + "epoch": 0.2802344806854051, + "flos": 12646433975040.0, + "grad_norm": 2.0195446081970685, + "language_loss": 0.8127892, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.89057004, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18237305, + "step": 4661, + "time_per_iteration": 2.475562572479248 + }, + { + "auxiliary_loss_clip": 0.06508808, + "auxiliary_loss_mlp": 0.01277709, + "balance_loss_clip": 0.06296618, + "balance_loss_mlp": 0.01258898, + "epoch": 0.28029460393807304, + "flos": 37276772019840.0, + "grad_norm": 2.0240330571158904, + "language_loss": 0.79226935, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87013447, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18823242, + "step": 4662, + "time_per_iteration": 2.6644277572631836 + }, + { + "auxiliary_loss_clip": 0.06515819, + "auxiliary_loss_mlp": 0.01277387, + "balance_loss_clip": 0.06296565, + "balance_loss_mlp": 0.01258349, + "epoch": 0.280354727190741, + "flos": 20747450240640.0, + "grad_norm": 1.722651872041065, + "language_loss": 0.70744783, + "learning_rate": 3.377751711782227e-06, + "loss": 0.78537989, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.19042969, + "step": 4663, + "time_per_iteration": 2.5365068912506104 + }, + { + "auxiliary_loss_clip": 0.06510712, + "auxiliary_loss_mlp": 0.01280818, + "balance_loss_clip": 0.06293653, + "balance_loss_mlp": 0.01259312, + "epoch": 0.28041485044340897, + "flos": 21477526865280.0, + "grad_norm": 1.8007469711633386, + "language_loss": 0.77919745, + "learning_rate": 3.377469372935791e-06, + "loss": 0.85711277, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.21520996, + "step": 4664, + "time_per_iteration": 2.578552484512329 + }, + { + "auxiliary_loss_clip": 0.06500383, + "auxiliary_loss_mlp": 0.01277041, + "balance_loss_clip": 0.06293675, + "balance_loss_mlp": 0.01259374, + "epoch": 0.28047497369607693, + "flos": 14799669471360.0, + "grad_norm": 1.9758280924180103, + "language_loss": 0.80386382, + "learning_rate": 3.377186981855578e-06, + "loss": 0.88163805, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17675781, + "step": 4665, + "time_per_iteration": 2.5088212490081787 + }, + { + "auxiliary_loss_clip": 0.06506059, + "auxiliary_loss_mlp": 0.01274647, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01257397, + "epoch": 0.2805350969487449, + "flos": 23076559457280.0, + "grad_norm": 2.052054159073397, + "language_loss": 0.81109238, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.88889945, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17236328, + "step": 4666, + "time_per_iteration": 2.5765438079833984 + }, + { + "auxiliary_loss_clip": 0.06505027, + "auxiliary_loss_mlp": 0.01282246, + "balance_loss_clip": 0.0629367, + "balance_loss_mlp": 0.01263149, + "epoch": 0.2805952202014129, + "flos": 20485177361280.0, + "grad_norm": 2.1346617464039395, + "language_loss": 0.84940714, + "learning_rate": 3.376622043036658e-06, + "loss": 0.92727995, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19104004, + "step": 4667, + "time_per_iteration": 2.536466360092163 + }, + { + "auxiliary_loss_clip": 0.06510031, + "auxiliary_loss_mlp": 0.01284991, + "balance_loss_clip": 0.0629562, + "balance_loss_mlp": 0.0126581, + "epoch": 0.2806553434540809, + "flos": 27424678728960.0, + "grad_norm": 1.8168022919289022, + "language_loss": 0.80077279, + "learning_rate": 3.376339495319373e-06, + "loss": 0.87872303, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.19177246, + "step": 4668, + "time_per_iteration": 2.620793581008911 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01279574, + "balance_loss_clip": 0.06290744, + "balance_loss_mlp": 0.0126124, + "epoch": 0.28071546670674885, + "flos": 26512187765760.0, + "grad_norm": 1.3575587104794173, + "language_loss": 0.76748574, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.84536183, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18334961, + "step": 4669, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01281258, + "balance_loss_clip": 0.06298456, + "balance_loss_mlp": 0.01263376, + "epoch": 0.2807755899594168, + "flos": 20564993975040.0, + "grad_norm": 1.8976620486576934, + "language_loss": 0.79953671, + "learning_rate": 3.375774243322725e-06, + "loss": 0.87746012, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17883301, + "step": 4670, + "time_per_iteration": 2.630960702896118 + }, + { + "auxiliary_loss_clip": 0.06512859, + "auxiliary_loss_mlp": 0.0128758, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.0126859, + "epoch": 0.2808357132120848, + "flos": 24319693831680.0, + "grad_norm": 2.1242803821214915, + "language_loss": 0.79548872, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.87349308, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 4671, + "time_per_iteration": 2.5943963527679443 + }, + { + "auxiliary_loss_clip": 0.06499608, + "auxiliary_loss_mlp": 0.01282791, + "balance_loss_clip": 0.06293108, + "balance_loss_mlp": 0.01265124, + "epoch": 0.28089583646475275, + "flos": 26439624529920.0, + "grad_norm": 1.773606658736433, + "language_loss": 0.75789028, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83571434, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17663574, + "step": 4672, + "time_per_iteration": 2.5819919109344482 + }, + { + "auxiliary_loss_clip": 0.06515782, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06299746, + "balance_loss_mlp": 0.01260412, + "epoch": 0.2809559597174207, + "flos": 23118417371520.0, + "grad_norm": 2.723902952009536, + "language_loss": 0.76012361, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.83808959, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20410156, + "step": 4673, + "time_per_iteration": 2.579460859298706 + }, + { + "auxiliary_loss_clip": 0.06510463, + "auxiliary_loss_mlp": 0.01285315, + "balance_loss_clip": 0.06297876, + "balance_loss_mlp": 0.0126704, + "epoch": 0.2810160829700887, + "flos": 20929864579200.0, + "grad_norm": 1.8153863613356214, + "language_loss": 0.72824192, + "learning_rate": 3.374643113381237e-06, + "loss": 0.80619967, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18261719, + "step": 4674, + "time_per_iteration": 4.0586278438568115 + }, + { + "auxiliary_loss_clip": 0.06522093, + "auxiliary_loss_mlp": 0.01283708, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.0126405, + "epoch": 0.28107620622275664, + "flos": 14361145528320.0, + "grad_norm": 1.8954321480679195, + "language_loss": 0.77875817, + "learning_rate": 3.374360200552541e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1965332, + "step": 4675, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.06512761, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06296991, + "balance_loss_mlp": 0.01269531, + "epoch": 0.2811363294754246, + "flos": 20924707553280.0, + "grad_norm": 3.9789590396078784, + "language_loss": 0.70705891, + "learning_rate": 3.374077235607968e-06, + "loss": 0.78507614, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19433594, + "step": 4676, + "time_per_iteration": 2.519028425216675 + }, + { + "auxiliary_loss_clip": 0.06504105, + "auxiliary_loss_mlp": 0.01278874, + "balance_loss_clip": 0.0629884, + "balance_loss_mlp": 0.01260611, + "epoch": 0.28119645272809257, + "flos": 20601107884800.0, + "grad_norm": 1.5779309471284284, + "language_loss": 0.70529211, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.78312188, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18286133, + "step": 4677, + "time_per_iteration": 2.5834195613861084 + }, + { + "auxiliary_loss_clip": 0.06516379, + "auxiliary_loss_mlp": 0.01281791, + "balance_loss_clip": 0.06302937, + "balance_loss_mlp": 0.0126193, + "epoch": 0.28125657598076054, + "flos": 25344383811840.0, + "grad_norm": 1.5021857900224345, + "language_loss": 0.64105308, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.71903479, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1986084, + "step": 4678, + "time_per_iteration": 2.618948221206665 + }, + { + "auxiliary_loss_clip": 0.06517099, + "auxiliary_loss_mlp": 0.01278079, + "balance_loss_clip": 0.06306246, + "balance_loss_mlp": 0.01259947, + "epoch": 0.2813166992334285, + "flos": 24834051319680.0, + "grad_norm": 1.437486997447774, + "language_loss": 0.71167207, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18139648, + "step": 4679, + "time_per_iteration": 5.466668128967285 + }, + { + "auxiliary_loss_clip": 0.06520079, + "auxiliary_loss_mlp": 0.0127734, + "balance_loss_clip": 0.06306013, + "balance_loss_mlp": 0.01257491, + "epoch": 0.2813768224860965, + "flos": 21766941267840.0, + "grad_norm": 1.8819388160659554, + "language_loss": 0.75122017, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.82919437, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19848633, + "step": 4680, + "time_per_iteration": 2.5146636962890625 + }, + { + "auxiliary_loss_clip": 0.06519224, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 0.06307293, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2814369457387645, + "flos": 24323760754560.0, + "grad_norm": 2.4475033368931984, + "language_loss": 0.77670574, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.8546586, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18103027, + "step": 4681, + "time_per_iteration": 2.576263189315796 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06309941, + "balance_loss_mlp": 0.01259208, + "epoch": 0.28149706899143245, + "flos": 18521274165120.0, + "grad_norm": 2.513172937911882, + "language_loss": 0.7420646, + "learning_rate": 3.372378352108146e-06, + "loss": 0.82008791, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18383789, + "step": 4682, + "time_per_iteration": 2.5019047260284424 + }, + { + "auxiliary_loss_clip": 0.06516165, + "auxiliary_loss_mlp": 0.01280522, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01262879, + "epoch": 0.2815571922441004, + "flos": 24870165229440.0, + "grad_norm": 1.4634735151261165, + "language_loss": 0.81619561, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89416242, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17626953, + "step": 4683, + "time_per_iteration": 2.6108040809631348 + }, + { + "auxiliary_loss_clip": 0.06511167, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 0.06297079, + "balance_loss_mlp": 0.01258771, + "epoch": 0.2816173154967684, + "flos": 19907774075520.0, + "grad_norm": 1.6126473409715323, + "language_loss": 0.76514447, + "learning_rate": 3.371811641167852e-06, + "loss": 0.8430298, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18579102, + "step": 4684, + "time_per_iteration": 3.9593515396118164 + }, + { + "auxiliary_loss_clip": 0.06509569, + "auxiliary_loss_mlp": 0.0127644, + "balance_loss_clip": 0.06298888, + "balance_loss_mlp": 0.01257474, + "epoch": 0.28167743874943635, + "flos": 17496709966080.0, + "grad_norm": 1.741664239740996, + "language_loss": 0.76634955, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.84420967, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4685, + "time_per_iteration": 2.533033847808838 + }, + { + "auxiliary_loss_clip": 0.06512235, + "auxiliary_loss_mlp": 0.01277016, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.01258002, + "epoch": 0.2817375620021043, + "flos": 25309276151040.0, + "grad_norm": 1.5379443905684582, + "language_loss": 0.76075816, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.8386507, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19006348, + "step": 4686, + "time_per_iteration": 2.5632452964782715 + }, + { + "auxiliary_loss_clip": 0.0651376, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 0.06298173, + "balance_loss_mlp": 0.01262705, + "epoch": 0.2817976852547723, + "flos": 18698447623680.0, + "grad_norm": 3.4763910689128945, + "language_loss": 0.63974833, + "learning_rate": 3.370961184640025e-06, + "loss": 0.71771336, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.20043945, + "step": 4687, + "time_per_iteration": 2.5520877838134766 + }, + { + "auxiliary_loss_clip": 0.0651626, + "auxiliary_loss_mlp": 0.01278308, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01258889, + "epoch": 0.28185780850744024, + "flos": 22748012398080.0, + "grad_norm": 2.5451270798344208, + "language_loss": 0.76514482, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.84309042, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1940918, + "step": 4688, + "time_per_iteration": 2.5427582263946533 + }, + { + "auxiliary_loss_clip": 0.06506021, + "auxiliary_loss_mlp": 0.01276039, + "balance_loss_clip": 0.06297493, + "balance_loss_mlp": 0.01258622, + "epoch": 0.2819179317601082, + "flos": 14938297251840.0, + "grad_norm": 2.0673048339937394, + "language_loss": 0.79160047, + "learning_rate": 3.37039395366863e-06, + "loss": 0.86942106, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17407227, + "step": 4689, + "time_per_iteration": 2.514857769012451 + }, + { + "auxiliary_loss_clip": 0.06505655, + "auxiliary_loss_mlp": 0.01279731, + "balance_loss_clip": 0.06295724, + "balance_loss_mlp": 0.0126098, + "epoch": 0.2819780550127762, + "flos": 23151428680320.0, + "grad_norm": 2.0480677905828664, + "language_loss": 0.78403682, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86189067, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18762207, + "step": 4690, + "time_per_iteration": 2.5567362308502197 + }, + { + "auxiliary_loss_clip": 0.06514366, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06301816, + "balance_loss_mlp": 0.01256981, + "epoch": 0.28203817826544414, + "flos": 21622779118080.0, + "grad_norm": 2.5530247222146976, + "language_loss": 0.87619591, + "learning_rate": 3.369826514835332e-06, + "loss": 0.95409369, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18432617, + "step": 4691, + "time_per_iteration": 2.5987935066223145 + }, + { + "auxiliary_loss_clip": 0.0651565, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.0629878, + "balance_loss_mlp": 0.01258787, + "epoch": 0.2820983015181121, + "flos": 24034010935680.0, + "grad_norm": 1.7719901211447804, + "language_loss": 0.82443225, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.90235984, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18322754, + "step": 4692, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.06515577, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06304249, + "balance_loss_mlp": 0.01259921, + "epoch": 0.2821584247707801, + "flos": 30015725408640.0, + "grad_norm": 1.5203777397001885, + "language_loss": 0.74437934, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82232404, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.1895752, + "step": 4693, + "time_per_iteration": 2.6104559898376465 + }, + { + "auxiliary_loss_clip": 0.06512225, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06298921, + "balance_loss_mlp": 0.01255593, + "epoch": 0.2822185480234481, + "flos": 21403034985600.0, + "grad_norm": 1.7641787467317929, + "language_loss": 0.77641487, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.85428035, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 4694, + "time_per_iteration": 2.5619184970855713 + }, + { + "auxiliary_loss_clip": 0.06513312, + "auxiliary_loss_mlp": 0.01274888, + "balance_loss_clip": 0.0630666, + "balance_loss_mlp": 0.01255898, + "epoch": 0.28227867127611606, + "flos": 27459996024960.0, + "grad_norm": 2.064814820064932, + "language_loss": 0.67270994, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.75059193, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18969727, + "step": 4695, + "time_per_iteration": 2.5849459171295166 + }, + { + "auxiliary_loss_clip": 0.06524754, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06312457, + "balance_loss_mlp": 0.01255914, + "epoch": 0.282338794528784, + "flos": 22599028638720.0, + "grad_norm": 2.3022925444863747, + "language_loss": 0.75992346, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.83794391, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.21362305, + "step": 4696, + "time_per_iteration": 2.5599312782287598 + }, + { + "auxiliary_loss_clip": 0.06528555, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06319815, + "balance_loss_mlp": 0.01257915, + "epoch": 0.282398917781452, + "flos": 42020592998400.0, + "grad_norm": 1.6923608864022255, + "language_loss": 0.62607121, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70412022, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.1842041, + "step": 4697, + "time_per_iteration": 2.719783067703247 + }, + { + "auxiliary_loss_clip": 0.0651894, + "auxiliary_loss_mlp": 0.01278397, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.01260564, + "epoch": 0.28245904103411995, + "flos": 23231916126720.0, + "grad_norm": 1.330125700327103, + "language_loss": 0.73835146, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.81632483, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17834473, + "step": 4698, + "time_per_iteration": 2.671154260635376 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01274177, + "balance_loss_clip": 0.06314629, + "balance_loss_mlp": 0.01255699, + "epoch": 0.2825191642867879, + "flos": 25381713605760.0, + "grad_norm": 1.8806904568543696, + "language_loss": 0.75498992, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.83293265, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.18481445, + "step": 4699, + "time_per_iteration": 2.749073028564453 + }, + { + "auxiliary_loss_clip": 0.06532586, + "auxiliary_loss_mlp": 0.0127858, + "balance_loss_clip": 0.06318063, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2825792875394559, + "flos": 17242277443200.0, + "grad_norm": 2.5468251061801697, + "language_loss": 0.80103695, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.87914866, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.20617676, + "step": 4700, + "time_per_iteration": 2.539794683456421 + }, + { + "auxiliary_loss_clip": 0.06516679, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06314512, + "balance_loss_mlp": 0.01257006, + "epoch": 0.28263941079212385, + "flos": 26731177211520.0, + "grad_norm": 2.1068022199140213, + "language_loss": 0.8243857, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.90229392, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17114258, + "step": 4701, + "time_per_iteration": 2.5763485431671143 + }, + { + "auxiliary_loss_clip": 0.06520683, + "auxiliary_loss_mlp": 0.01274057, + "balance_loss_clip": 0.06312392, + "balance_loss_mlp": 0.01256116, + "epoch": 0.2826995340447918, + "flos": 25928411569920.0, + "grad_norm": 2.2990609650841276, + "language_loss": 0.73153478, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.80948216, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17956543, + "step": 4702, + "time_per_iteration": 2.5968289375305176 + }, + { + "auxiliary_loss_clip": 0.06520355, + "auxiliary_loss_mlp": 0.01275823, + "balance_loss_clip": 0.06316096, + "balance_loss_mlp": 0.01258848, + "epoch": 0.2827596572974598, + "flos": 22385783197440.0, + "grad_norm": 1.6603391807745085, + "language_loss": 0.78883457, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86679637, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1697998, + "step": 4703, + "time_per_iteration": 2.56088924407959 + }, + { + "auxiliary_loss_clip": 0.06518066, + "auxiliary_loss_mlp": 0.01281519, + "balance_loss_clip": 0.06307587, + "balance_loss_mlp": 0.01261885, + "epoch": 0.28281978055012774, + "flos": 33555544669440.0, + "grad_norm": 1.530922589206002, + "language_loss": 0.69937778, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.19628906, + "step": 4704, + "time_per_iteration": 2.725234031677246 + }, + { + "auxiliary_loss_clip": 0.0652602, + "auxiliary_loss_mlp": 0.01283133, + "balance_loss_clip": 0.06319317, + "balance_loss_mlp": 0.01264119, + "epoch": 0.2828799038027957, + "flos": 23447635263360.0, + "grad_norm": 1.9265232828394878, + "language_loss": 0.70927215, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.78736377, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.19006348, + "step": 4705, + "time_per_iteration": 2.5391383171081543 + }, + { + "auxiliary_loss_clip": 0.06482799, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06378852, + "balance_loss_mlp": 0.01263947, + "epoch": 0.2829400270554637, + "flos": 69892055297280.0, + "grad_norm": 0.9159756060868983, + "language_loss": 0.59201139, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.66952819, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04928589, + "step": 4706, + "time_per_iteration": 3.219618797302246 + }, + { + "auxiliary_loss_clip": 0.06512764, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 0.06312177, + "balance_loss_mlp": 0.01260547, + "epoch": 0.2830001503081317, + "flos": 24795715276800.0, + "grad_norm": 1.373077415158703, + "language_loss": 0.82380199, + "learning_rate": 3.365279531475407e-06, + "loss": 0.90170658, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.17138672, + "step": 4707, + "time_per_iteration": 2.5680840015411377 + }, + { + "auxiliary_loss_clip": 0.06518079, + "auxiliary_loss_mlp": 0.01276357, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01257391, + "epoch": 0.28306027356079966, + "flos": 27676218286080.0, + "grad_norm": 1.5569970524845527, + "language_loss": 0.81077999, + "learning_rate": 3.36499490449902e-06, + "loss": 0.88872433, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18969727, + "step": 4708, + "time_per_iteration": 2.643389940261841 + }, + { + "auxiliary_loss_clip": 0.06443536, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 0.06339511, + "balance_loss_mlp": 0.01264025, + "epoch": 0.2831203968134676, + "flos": 60543837734400.0, + "grad_norm": 0.8586282544888121, + "language_loss": 0.62812036, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.7052421, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.04608154, + "step": 4709, + "time_per_iteration": 3.0554397106170654 + }, + { + "auxiliary_loss_clip": 0.06507774, + "auxiliary_loss_mlp": 0.01270408, + "balance_loss_clip": 0.06301016, + "balance_loss_mlp": 0.01253015, + "epoch": 0.2831805200661356, + "flos": 22061386915200.0, + "grad_norm": 1.4201642822404892, + "language_loss": 0.74412584, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.82190764, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1739502, + "step": 4710, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.06514937, + "auxiliary_loss_mlp": 0.01275331, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01256627, + "epoch": 0.28324064331880355, + "flos": 22607120557440.0, + "grad_norm": 1.9767009095982746, + "language_loss": 0.8018595, + "learning_rate": 3.364140713048579e-06, + "loss": 0.87976217, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18713379, + "step": 4711, + "time_per_iteration": 2.610027313232422 + }, + { + "auxiliary_loss_clip": 0.06509729, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06300638, + "balance_loss_mlp": 0.01260385, + "epoch": 0.2833007665714715, + "flos": 30411133626240.0, + "grad_norm": 1.982526263820073, + "language_loss": 0.70604694, + "learning_rate": 3.363855879093996e-06, + "loss": 0.78392917, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4712, + "time_per_iteration": 2.602795124053955 + }, + { + "auxiliary_loss_clip": 0.06508194, + "auxiliary_loss_mlp": 0.01282495, + "balance_loss_clip": 0.06299947, + "balance_loss_mlp": 0.01262992, + "epoch": 0.2833608898241395, + "flos": 23556144700800.0, + "grad_norm": 1.7823239687069516, + "language_loss": 0.8193841, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.89729095, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19494629, + "step": 4713, + "time_per_iteration": 2.6088523864746094 + }, + { + "auxiliary_loss_clip": 0.06512519, + "auxiliary_loss_mlp": 0.01275048, + "balance_loss_clip": 0.06304006, + "balance_loss_mlp": 0.01255236, + "epoch": 0.28342101307680745, + "flos": 20272980096000.0, + "grad_norm": 2.6212370689858493, + "language_loss": 0.75431275, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.83218849, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19799805, + "step": 4714, + "time_per_iteration": 3.986696243286133 + }, + { + "auxiliary_loss_clip": 0.06505996, + "auxiliary_loss_mlp": 0.01276776, + "balance_loss_clip": 0.06297115, + "balance_loss_mlp": 0.01259324, + "epoch": 0.2834811363294754, + "flos": 30854982303360.0, + "grad_norm": 1.3268888753773178, + "language_loss": 0.78198218, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.85980994, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17468262, + "step": 4715, + "time_per_iteration": 2.652470111846924 + }, + { + "auxiliary_loss_clip": 0.06506517, + "auxiliary_loss_mlp": 0.01277278, + "balance_loss_clip": 0.06300199, + "balance_loss_mlp": 0.01260088, + "epoch": 0.2835412595821434, + "flos": 22717642492800.0, + "grad_norm": 1.6173599581374518, + "language_loss": 0.74551272, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.82335067, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17175293, + "step": 4716, + "time_per_iteration": 2.597083806991577 + }, + { + "auxiliary_loss_clip": 0.06516325, + "auxiliary_loss_mlp": 0.01281584, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.0126189, + "epoch": 0.28360138283481134, + "flos": 18083630689920.0, + "grad_norm": 2.1150039301458112, + "language_loss": 0.75477433, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.83275348, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.19702148, + "step": 4717, + "time_per_iteration": 2.5648136138916016 + }, + { + "auxiliary_loss_clip": 0.06514253, + "auxiliary_loss_mlp": 0.01277656, + "balance_loss_clip": 0.06302426, + "balance_loss_mlp": 0.01258606, + "epoch": 0.2836615060874793, + "flos": 17859987342720.0, + "grad_norm": 1.540618458402471, + "language_loss": 0.67445159, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75237072, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19042969, + "step": 4718, + "time_per_iteration": 3.962265968322754 + }, + { + "auxiliary_loss_clip": 0.06507722, + "auxiliary_loss_mlp": 0.01278787, + "balance_loss_clip": 0.06295013, + "balance_loss_mlp": 0.01258772, + "epoch": 0.2837216293401473, + "flos": 25747590458880.0, + "grad_norm": 1.8038295919740834, + "language_loss": 0.73164374, + "learning_rate": 3.361860593925566e-06, + "loss": 0.8095088, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20007324, + "step": 4719, + "time_per_iteration": 4.095008134841919 + }, + { + "auxiliary_loss_clip": 0.0650832, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 0.06301163, + "balance_loss_mlp": 0.01259386, + "epoch": 0.2837817525928153, + "flos": 20929906506240.0, + "grad_norm": 1.8981156672354917, + "language_loss": 0.80600828, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.88386989, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18444824, + "step": 4720, + "time_per_iteration": 2.53869366645813 + }, + { + "auxiliary_loss_clip": 0.06515027, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06304276, + "balance_loss_mlp": 0.01261687, + "epoch": 0.28384187584548326, + "flos": 18922719876480.0, + "grad_norm": 1.7940545446838874, + "language_loss": 0.7966662, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.87462288, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18945312, + "step": 4721, + "time_per_iteration": 2.5736734867095947 + }, + { + "auxiliary_loss_clip": 0.06507237, + "auxiliary_loss_mlp": 0.01272866, + "balance_loss_clip": 0.06298702, + "balance_loss_mlp": 0.01254996, + "epoch": 0.2839019990981512, + "flos": 27351235025280.0, + "grad_norm": 1.8504915753410351, + "language_loss": 0.83238685, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17871094, + "step": 4722, + "time_per_iteration": 2.5798823833465576 + }, + { + "auxiliary_loss_clip": 0.06511718, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01255547, + "epoch": 0.2839621223508192, + "flos": 18120247724160.0, + "grad_norm": 1.9056364243243222, + "language_loss": 0.71157932, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18225098, + "step": 4723, + "time_per_iteration": 2.5472381114959717 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01279225, + "balance_loss_clip": 0.06299602, + "balance_loss_mlp": 0.01259937, + "epoch": 0.28402224560348716, + "flos": 26365384212480.0, + "grad_norm": 1.5487216964387416, + "language_loss": 0.7882036, + "learning_rate": 3.360433840760998e-06, + "loss": 0.86608005, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.19299316, + "step": 4724, + "time_per_iteration": 4.039300203323364 + }, + { + "auxiliary_loss_clip": 0.0650482, + "auxiliary_loss_mlp": 0.01275588, + "balance_loss_clip": 0.06294143, + "balance_loss_mlp": 0.0125754, + "epoch": 0.2840823688561551, + "flos": 24067609223040.0, + "grad_norm": 1.5786087270385247, + "language_loss": 0.92781484, + "learning_rate": 3.36014833532143e-06, + "loss": 1.00561893, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18066406, + "step": 4725, + "time_per_iteration": 2.5839502811431885 + }, + { + "auxiliary_loss_clip": 0.06504668, + "auxiliary_loss_mlp": 0.01283756, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01263097, + "epoch": 0.2841424921088231, + "flos": 29467392289920.0, + "grad_norm": 1.5513315701194426, + "language_loss": 0.89446843, + "learning_rate": 3.3598627783049e-06, + "loss": 0.97235262, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20666504, + "step": 4726, + "time_per_iteration": 2.617002010345459 + }, + { + "auxiliary_loss_clip": 0.06507252, + "auxiliary_loss_mlp": 0.01284138, + "balance_loss_clip": 0.0629679, + "balance_loss_mlp": 0.01264409, + "epoch": 0.28420261536149105, + "flos": 48110439565440.0, + "grad_norm": 2.259876030173266, + "language_loss": 0.79337573, + "learning_rate": 3.359577169722238e-06, + "loss": 0.87128961, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19763184, + "step": 4727, + "time_per_iteration": 2.774508476257324 + }, + { + "auxiliary_loss_clip": 0.06499238, + "auxiliary_loss_mlp": 0.01275292, + "balance_loss_clip": 0.06294493, + "balance_loss_mlp": 0.01257483, + "epoch": 0.284262738614159, + "flos": 25673224360320.0, + "grad_norm": 2.051338722061539, + "language_loss": 0.67073631, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.74848163, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17810059, + "step": 4728, + "time_per_iteration": 2.614614725112915 + }, + { + "auxiliary_loss_clip": 0.06494898, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06287634, + "balance_loss_mlp": 0.01255702, + "epoch": 0.284322861866827, + "flos": 19725066247680.0, + "grad_norm": 2.0236031999203132, + "language_loss": 0.76682353, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84451514, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.18554688, + "step": 4729, + "time_per_iteration": 2.542400360107422 + }, + { + "auxiliary_loss_clip": 0.06505589, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.0125414, + "epoch": 0.28438298511949495, + "flos": 23922105408000.0, + "grad_norm": 1.7626205541686495, + "language_loss": 0.67443657, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.75222254, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1887207, + "step": 4730, + "time_per_iteration": 2.6005139350891113 + }, + { + "auxiliary_loss_clip": 0.06503962, + "auxiliary_loss_mlp": 0.01275972, + "balance_loss_clip": 0.06292562, + "balance_loss_mlp": 0.01256219, + "epoch": 0.2844431083721629, + "flos": 26074460436480.0, + "grad_norm": 1.9951841893982447, + "language_loss": 0.74777246, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.82557184, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.1973877, + "step": 4731, + "time_per_iteration": 2.571259021759033 + }, + { + "auxiliary_loss_clip": 0.06501718, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06291741, + "balance_loss_mlp": 0.01257384, + "epoch": 0.2845032316248309, + "flos": 25817260728960.0, + "grad_norm": 1.5216025808612688, + "language_loss": 0.8435545, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.92132688, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18139648, + "step": 4732, + "time_per_iteration": 2.604717254638672 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277146, + "balance_loss_clip": 0.06295733, + "balance_loss_mlp": 0.01256082, + "epoch": 0.2845633548774989, + "flos": 19828418659200.0, + "grad_norm": 1.722472955192697, + "language_loss": 0.79522747, + "learning_rate": 3.357862435944109e-06, + "loss": 0.87308168, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.21069336, + "step": 4733, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06511072, + "auxiliary_loss_mlp": 0.01275761, + "balance_loss_clip": 0.06296709, + "balance_loss_mlp": 0.01256878, + "epoch": 0.28462347813016686, + "flos": 23189093890560.0, + "grad_norm": 2.336729990473161, + "language_loss": 0.72093451, + "learning_rate": 3.357576466701875e-06, + "loss": 0.79880273, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.1887207, + "step": 4734, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.06501292, + "auxiliary_loss_mlp": 0.01274129, + "balance_loss_clip": 0.06292972, + "balance_loss_mlp": 0.01256283, + "epoch": 0.2846836013828348, + "flos": 18666316782720.0, + "grad_norm": 1.7839237241912007, + "language_loss": 0.74739748, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.1784668, + "step": 4735, + "time_per_iteration": 2.5192623138427734 + }, + { + "auxiliary_loss_clip": 0.06500865, + "auxiliary_loss_mlp": 0.01274478, + "balance_loss_clip": 0.06291883, + "balance_loss_mlp": 0.01256096, + "epoch": 0.2847437246355028, + "flos": 14178731189760.0, + "grad_norm": 1.8549790130823454, + "language_loss": 0.81047934, + "learning_rate": 3.357004373789946e-06, + "loss": 0.88823277, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18383789, + "step": 4736, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.06503595, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01256285, + "epoch": 0.28480384788817076, + "flos": 29286068054400.0, + "grad_norm": 3.1700593253391895, + "language_loss": 0.60580242, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.68358433, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18310547, + "step": 4737, + "time_per_iteration": 2.591672897338867 + }, + { + "auxiliary_loss_clip": 0.06501776, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06295541, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2848639711408387, + "flos": 22607875244160.0, + "grad_norm": 1.8212806326874897, + "language_loss": 0.86685491, + "learning_rate": 3.356432075047052e-06, + "loss": 0.94461757, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.1763916, + "step": 4738, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.06504256, + "auxiliary_loss_mlp": 0.01280924, + "balance_loss_clip": 0.06291994, + "balance_loss_mlp": 0.01260575, + "epoch": 0.2849240943935067, + "flos": 17604632424960.0, + "grad_norm": 2.187311269731562, + "language_loss": 0.90640962, + "learning_rate": 3.356145848516118e-06, + "loss": 0.98426139, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20336914, + "step": 4739, + "time_per_iteration": 2.491391897201538 + }, + { + "auxiliary_loss_clip": 0.06502014, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06294325, + "balance_loss_mlp": 0.01254363, + "epoch": 0.28498421764617465, + "flos": 24869368615680.0, + "grad_norm": 1.2838984451042732, + "language_loss": 0.72652215, + "learning_rate": 3.355859570559998e-06, + "loss": 0.80426115, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17529297, + "step": 4740, + "time_per_iteration": 2.628420352935791 + }, + { + "auxiliary_loss_clip": 0.06497836, + "auxiliary_loss_mlp": 0.01273023, + "balance_loss_clip": 0.06293581, + "balance_loss_mlp": 0.01254069, + "epoch": 0.2850443408988426, + "flos": 22788947917440.0, + "grad_norm": 1.7372555552312992, + "language_loss": 0.77982342, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.85753202, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1895752, + "step": 4741, + "time_per_iteration": 2.5205776691436768 + }, + { + "auxiliary_loss_clip": 0.06505083, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06290049, + "balance_loss_mlp": 0.01260278, + "epoch": 0.2851044641515106, + "flos": 18850114713600.0, + "grad_norm": 2.3624012556043246, + "language_loss": 0.7702412, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.84808373, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18896484, + "step": 4742, + "time_per_iteration": 2.5852768421173096 + }, + { + "auxiliary_loss_clip": 0.06507465, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06292667, + "balance_loss_mlp": 0.01260252, + "epoch": 0.28516458740417855, + "flos": 18886564039680.0, + "grad_norm": 2.066213096861692, + "language_loss": 0.57976151, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65764809, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.20959473, + "step": 4743, + "time_per_iteration": 2.562298059463501 + }, + { + "auxiliary_loss_clip": 0.06507643, + "auxiliary_loss_mlp": 0.01278324, + "balance_loss_clip": 0.06297275, + "balance_loss_mlp": 0.01259787, + "epoch": 0.2852247106568465, + "flos": 25306592820480.0, + "grad_norm": 1.602300087654556, + "language_loss": 0.75013685, + "learning_rate": 3.354713944700797e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1854248, + "step": 4744, + "time_per_iteration": 2.610302209854126 + }, + { + "auxiliary_loss_clip": 0.06500175, + "auxiliary_loss_mlp": 0.01276557, + "balance_loss_clip": 0.06292172, + "balance_loss_mlp": 0.01258794, + "epoch": 0.2852848339095145, + "flos": 11660080037760.0, + "grad_norm": 2.2644691376510844, + "language_loss": 0.78515136, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86291873, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17749023, + "step": 4745, + "time_per_iteration": 2.5170419216156006 + }, + { + "auxiliary_loss_clip": 0.06491117, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 0.06290857, + "balance_loss_mlp": 0.01254836, + "epoch": 0.2853449571621825, + "flos": 12938280145920.0, + "grad_norm": 1.7221704990089022, + "language_loss": 0.83220983, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.9098506, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18127441, + "step": 4746, + "time_per_iteration": 2.6257071495056152 + }, + { + "auxiliary_loss_clip": 0.06514393, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.06295399, + "balance_loss_mlp": 0.01257943, + "epoch": 0.28540508041485046, + "flos": 20016660856320.0, + "grad_norm": 1.8084134515670756, + "language_loss": 0.80507863, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.88300824, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20617676, + "step": 4747, + "time_per_iteration": 2.5699074268341064 + }, + { + "auxiliary_loss_clip": 0.06375369, + "auxiliary_loss_mlp": 0.0127529, + "balance_loss_clip": 0.0627491, + "balance_loss_mlp": 0.01269043, + "epoch": 0.28546520366751843, + "flos": 68160264710400.0, + "grad_norm": 0.7514031277524565, + "language_loss": 0.60153103, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.67803764, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.06237793, + "step": 4748, + "time_per_iteration": 3.1155877113342285 + }, + { + "auxiliary_loss_clip": 0.06492989, + "auxiliary_loss_mlp": 0.01272874, + "balance_loss_clip": 0.06285426, + "balance_loss_mlp": 0.01255791, + "epoch": 0.2855253269201864, + "flos": 13254961852800.0, + "grad_norm": 2.1744647780903352, + "language_loss": 0.80643219, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.88409078, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17089844, + "step": 4749, + "time_per_iteration": 2.5422439575195312 + }, + { + "auxiliary_loss_clip": 0.06506198, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 0.06295547, + "balance_loss_mlp": 0.0126011, + "epoch": 0.28558545017285436, + "flos": 28628345030400.0, + "grad_norm": 1.9900791940744995, + "language_loss": 0.70889151, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78674042, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18579102, + "step": 4750, + "time_per_iteration": 2.6223177909851074 + }, + { + "auxiliary_loss_clip": 0.06498066, + "auxiliary_loss_mlp": 0.01278692, + "balance_loss_clip": 0.06294224, + "balance_loss_mlp": 0.01261562, + "epoch": 0.2856455734255223, + "flos": 34138901594880.0, + "grad_norm": 1.523200352045364, + "language_loss": 0.82438904, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.90215659, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17138672, + "step": 4751, + "time_per_iteration": 2.710822582244873 + }, + { + "auxiliary_loss_clip": 0.06498431, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01258424, + "epoch": 0.2857056966781903, + "flos": 39795590880000.0, + "grad_norm": 1.6833478059847915, + "language_loss": 0.80598158, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88373208, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1817627, + "step": 4752, + "time_per_iteration": 2.685669422149658 + }, + { + "auxiliary_loss_clip": 0.0649987, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06292621, + "balance_loss_mlp": 0.01254223, + "epoch": 0.28576581993085826, + "flos": 21878846795520.0, + "grad_norm": 1.793038640961372, + "language_loss": 0.79062063, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86834359, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18200684, + "step": 4753, + "time_per_iteration": 2.612639904022217 + }, + { + "auxiliary_loss_clip": 0.06511062, + "auxiliary_loss_mlp": 0.01278051, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01257523, + "epoch": 0.2858259431835262, + "flos": 19096455317760.0, + "grad_norm": 2.5775982542053963, + "language_loss": 0.89774185, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.97563303, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.20532227, + "step": 4754, + "time_per_iteration": 3.914802312850952 + }, + { + "auxiliary_loss_clip": 0.06494384, + "auxiliary_loss_mlp": 0.01278048, + "balance_loss_clip": 0.06293342, + "balance_loss_mlp": 0.01259988, + "epoch": 0.2858860664361942, + "flos": 20339673546240.0, + "grad_norm": 1.9874166310668562, + "language_loss": 0.82672411, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.90444839, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18066406, + "step": 4755, + "time_per_iteration": 2.673158884048462 + }, + { + "auxiliary_loss_clip": 0.06498866, + "auxiliary_loss_mlp": 0.01274185, + "balance_loss_clip": 0.06291682, + "balance_loss_mlp": 0.0125721, + "epoch": 0.28594618968886215, + "flos": 24468551809920.0, + "grad_norm": 1.6562500913369433, + "language_loss": 0.83843541, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91616589, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.16992188, + "step": 4756, + "time_per_iteration": 2.6029391288757324 + }, + { + "auxiliary_loss_clip": 0.06377822, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01256002, + "epoch": 0.2860063129415301, + "flos": 71676170830080.0, + "grad_norm": 1.4612509113917642, + "language_loss": 0.6086607, + "learning_rate": 3.350984987779142e-06, + "loss": 0.68506116, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.06207275, + "step": 4757, + "time_per_iteration": 3.326833963394165 + }, + { + "auxiliary_loss_clip": 0.0650306, + "auxiliary_loss_mlp": 0.01277184, + "balance_loss_clip": 0.06298901, + "balance_loss_mlp": 0.01260459, + "epoch": 0.2860664361941981, + "flos": 20564993975040.0, + "grad_norm": 2.5468639815388996, + "language_loss": 0.66759324, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.74539566, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1673584, + "step": 4758, + "time_per_iteration": 5.454218626022339 + }, + { + "auxiliary_loss_clip": 0.06503905, + "auxiliary_loss_mlp": 0.01277556, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01258817, + "epoch": 0.2861265594468661, + "flos": 36005992997760.0, + "grad_norm": 1.4420872105733484, + "language_loss": 0.63405287, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.71186751, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.1875, + "step": 4759, + "time_per_iteration": 2.745704174041748 + }, + { + "auxiliary_loss_clip": 0.06510226, + "auxiliary_loss_mlp": 0.01276918, + "balance_loss_clip": 0.06302258, + "balance_loss_mlp": 0.01257892, + "epoch": 0.28618668269953407, + "flos": 20053571379840.0, + "grad_norm": 2.14199936751817, + "language_loss": 0.74684435, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82471573, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19030762, + "step": 4760, + "time_per_iteration": 2.541759490966797 + }, + { + "auxiliary_loss_clip": 0.06496474, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01260482, + "epoch": 0.28624680595220203, + "flos": 24978632739840.0, + "grad_norm": 1.8333731861449165, + "language_loss": 0.72652757, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80425525, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.15795898, + "step": 4761, + "time_per_iteration": 2.57940673828125 + }, + { + "auxiliary_loss_clip": 0.06509258, + "auxiliary_loss_mlp": 0.01273154, + "balance_loss_clip": 0.06299996, + "balance_loss_mlp": 0.01256095, + "epoch": 0.28630692920487, + "flos": 22498862682240.0, + "grad_norm": 1.9183655494362113, + "language_loss": 0.74669504, + "learning_rate": 3.349548466945793e-06, + "loss": 0.82451922, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1706543, + "step": 4762, + "time_per_iteration": 2.5321590900421143 + }, + { + "auxiliary_loss_clip": 0.06505883, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 0.06301434, + "balance_loss_mlp": 0.0125694, + "epoch": 0.28636705245753796, + "flos": 21255979870080.0, + "grad_norm": 2.6303759088840413, + "language_loss": 0.76297629, + "learning_rate": 3.349261009210496e-06, + "loss": 0.84077883, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17443848, + "step": 4763, + "time_per_iteration": 3.979782819747925 + }, + { + "auxiliary_loss_clip": 0.06506684, + "auxiliary_loss_mlp": 0.01275654, + "balance_loss_clip": 0.06298703, + "balance_loss_mlp": 0.012572, + "epoch": 0.28642717571020593, + "flos": 24102339540480.0, + "grad_norm": 1.7484925103151405, + "language_loss": 0.77499843, + "learning_rate": 3.348973500311086e-06, + "loss": 0.85282177, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18444824, + "step": 4764, + "time_per_iteration": 2.6036336421966553 + }, + { + "auxiliary_loss_clip": 0.0651267, + "auxiliary_loss_mlp": 0.01277486, + "balance_loss_clip": 0.06302905, + "balance_loss_mlp": 0.01257829, + "epoch": 0.2864872989628739, + "flos": 22607959098240.0, + "grad_norm": 5.154577786286556, + "language_loss": 0.71671587, + "learning_rate": 3.348685940258466e-06, + "loss": 0.79461741, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1965332, + "step": 4765, + "time_per_iteration": 2.5488131046295166 + }, + { + "auxiliary_loss_clip": 0.0651048, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01255684, + "epoch": 0.28654742221554186, + "flos": 32753449860480.0, + "grad_norm": 1.504395922922802, + "language_loss": 0.7630865, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.84091872, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17053223, + "step": 4766, + "time_per_iteration": 2.659499406814575 + }, + { + "auxiliary_loss_clip": 0.0650377, + "auxiliary_loss_mlp": 0.01271145, + "balance_loss_clip": 0.0630042, + "balance_loss_mlp": 0.01254277, + "epoch": 0.2866075454682098, + "flos": 26989257386880.0, + "grad_norm": 2.0841406955827075, + "language_loss": 0.78443938, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86218858, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.16870117, + "step": 4767, + "time_per_iteration": 2.5891125202178955 + }, + { + "auxiliary_loss_clip": 0.06511022, + "auxiliary_loss_mlp": 0.01279425, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01261746, + "epoch": 0.2866676687208778, + "flos": 23259812336640.0, + "grad_norm": 2.0448044221544737, + "language_loss": 0.65430236, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.73220682, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17675781, + "step": 4768, + "time_per_iteration": 2.572230815887451 + }, + { + "auxiliary_loss_clip": 0.0651636, + "auxiliary_loss_mlp": 0.01271508, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01253782, + "epoch": 0.28672779197354575, + "flos": 21586120156800.0, + "grad_norm": 1.6016626643500549, + "language_loss": 0.71173406, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.78961271, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17724609, + "step": 4769, + "time_per_iteration": 2.5180304050445557 + }, + { + "auxiliary_loss_clip": 0.06513099, + "auxiliary_loss_mlp": 0.01273812, + "balance_loss_clip": 0.06304821, + "balance_loss_mlp": 0.01256562, + "epoch": 0.2867879152262137, + "flos": 19871785946880.0, + "grad_norm": 1.7128041826885096, + "language_loss": 0.75347042, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83133948, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17248535, + "step": 4770, + "time_per_iteration": 2.575993537902832 + }, + { + "auxiliary_loss_clip": 0.06514675, + "auxiliary_loss_mlp": 0.01275884, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257967, + "epoch": 0.2868480384788817, + "flos": 28219687868160.0, + "grad_norm": 4.606069071133779, + "language_loss": 0.68064034, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17907715, + "step": 4771, + "time_per_iteration": 2.5533907413482666 + }, + { + "auxiliary_loss_clip": 0.06411134, + "auxiliary_loss_mlp": 0.0125763, + "balance_loss_clip": 0.06311508, + "balance_loss_mlp": 0.01253345, + "epoch": 0.2869081617315497, + "flos": 65442218768640.0, + "grad_norm": 0.7478629548239109, + "language_loss": 0.56696546, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.64365304, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.04290771, + "step": 4772, + "time_per_iteration": 3.1295437812805176 + }, + { + "auxiliary_loss_clip": 0.06515288, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06305212, + "balance_loss_mlp": 0.01256165, + "epoch": 0.28696828498421767, + "flos": 18666610272000.0, + "grad_norm": 3.729070810615603, + "language_loss": 0.84013474, + "learning_rate": 3.346383619630856e-06, + "loss": 0.91803479, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1854248, + "step": 4773, + "time_per_iteration": 2.5181708335876465 + }, + { + "auxiliary_loss_clip": 0.06518447, + "auxiliary_loss_mlp": 0.01274166, + "balance_loss_clip": 0.06306095, + "balance_loss_mlp": 0.01254985, + "epoch": 0.28702840823688563, + "flos": 23666540855040.0, + "grad_norm": 2.856350636496585, + "language_loss": 0.78241181, + "learning_rate": 3.34609559969027e-06, + "loss": 0.86033797, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19177246, + "step": 4774, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06519175, + "auxiliary_loss_mlp": 0.01275468, + "balance_loss_clip": 0.06307949, + "balance_loss_mlp": 0.01255703, + "epoch": 0.2870885314895536, + "flos": 13809248611200.0, + "grad_norm": 1.8762920881530476, + "language_loss": 0.74056339, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.81850982, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.19763184, + "step": 4775, + "time_per_iteration": 2.505293369293213 + }, + { + "auxiliary_loss_clip": 0.06520346, + "auxiliary_loss_mlp": 0.01275844, + "balance_loss_clip": 0.06309157, + "balance_loss_mlp": 0.01258142, + "epoch": 0.28714865474222157, + "flos": 17792790768000.0, + "grad_norm": 1.8823617406689648, + "language_loss": 0.88338864, + "learning_rate": 3.34551940668778e-06, + "loss": 0.96135056, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17687988, + "step": 4776, + "time_per_iteration": 2.5638997554779053 + }, + { + "auxiliary_loss_clip": 0.06511634, + "auxiliary_loss_mlp": 0.01275769, + "balance_loss_clip": 0.06302971, + "balance_loss_mlp": 0.01258269, + "epoch": 0.28720877799488953, + "flos": 16002958429440.0, + "grad_norm": 2.648093963017482, + "language_loss": 0.74451852, + "learning_rate": 3.345231233647726e-06, + "loss": 0.82239252, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17492676, + "step": 4777, + "time_per_iteration": 2.5142223834991455 + }, + { + "auxiliary_loss_clip": 0.06527238, + "auxiliary_loss_mlp": 0.01280106, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01259924, + "epoch": 0.2872689012475575, + "flos": 20929445308800.0, + "grad_norm": 2.200879096052639, + "language_loss": 0.80539143, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.88346487, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20202637, + "step": 4778, + "time_per_iteration": 2.563994884490967 + }, + { + "auxiliary_loss_clip": 0.06511427, + "auxiliary_loss_mlp": 0.01281129, + "balance_loss_clip": 0.06304548, + "balance_loss_mlp": 0.01263223, + "epoch": 0.28732902450022546, + "flos": 21331603779840.0, + "grad_norm": 1.7996465112645923, + "language_loss": 0.73886508, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.8167907, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17895508, + "step": 4779, + "time_per_iteration": 2.5394158363342285 + }, + { + "auxiliary_loss_clip": 0.06518923, + "auxiliary_loss_mlp": 0.01275383, + "balance_loss_clip": 0.06307982, + "balance_loss_mlp": 0.01255379, + "epoch": 0.2873891477528934, + "flos": 20856714364800.0, + "grad_norm": 1.509851280453794, + "language_loss": 0.76844704, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.84639007, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19995117, + "step": 4780, + "time_per_iteration": 2.5928425788879395 + }, + { + "auxiliary_loss_clip": 0.06507713, + "auxiliary_loss_mlp": 0.01271777, + "balance_loss_clip": 0.06302975, + "balance_loss_mlp": 0.01254014, + "epoch": 0.2874492710055614, + "flos": 17425698030720.0, + "grad_norm": 1.6471362454858889, + "language_loss": 0.81874287, + "learning_rate": 3.344078031483784e-06, + "loss": 0.89653778, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17773438, + "step": 4781, + "time_per_iteration": 2.6121537685394287 + }, + { + "auxiliary_loss_clip": 0.06521222, + "auxiliary_loss_mlp": 0.0127902, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01257002, + "epoch": 0.28750939425822936, + "flos": 13411827895680.0, + "grad_norm": 2.0671181517724966, + "language_loss": 0.86987036, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.94787276, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22009277, + "step": 4782, + "time_per_iteration": 2.554326057434082 + }, + { + "auxiliary_loss_clip": 0.06525762, + "auxiliary_loss_mlp": 0.01282396, + "balance_loss_clip": 0.06310341, + "balance_loss_mlp": 0.01262238, + "epoch": 0.2875695175108973, + "flos": 21876205392000.0, + "grad_norm": 1.4282255381090248, + "language_loss": 0.71525908, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20153809, + "step": 4783, + "time_per_iteration": 2.5632100105285645 + }, + { + "auxiliary_loss_clip": 0.06514136, + "auxiliary_loss_mlp": 0.01279499, + "balance_loss_clip": 0.06305264, + "balance_loss_mlp": 0.01259186, + "epoch": 0.2876296407635653, + "flos": 26251885457280.0, + "grad_norm": 1.5568964680804804, + "language_loss": 0.77152872, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84946513, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.203125, + "step": 4784, + "time_per_iteration": 2.589073657989502 + }, + { + "auxiliary_loss_clip": 0.06506136, + "auxiliary_loss_mlp": 0.01278073, + "balance_loss_clip": 0.06301259, + "balance_loss_mlp": 0.01257914, + "epoch": 0.28768976401623325, + "flos": 25380581575680.0, + "grad_norm": 1.5725877671574655, + "language_loss": 0.76106405, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.83890617, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.20153809, + "step": 4785, + "time_per_iteration": 2.6051061153411865 + }, + { + "auxiliary_loss_clip": 0.06513079, + "auxiliary_loss_mlp": 0.0127873, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01259394, + "epoch": 0.28774988726890127, + "flos": 30672232548480.0, + "grad_norm": 2.246179731229797, + "language_loss": 0.83339965, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91131771, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19348145, + "step": 4786, + "time_per_iteration": 2.6064071655273438 + }, + { + "auxiliary_loss_clip": 0.06512371, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06303188, + "balance_loss_mlp": 0.01258934, + "epoch": 0.28781001052156924, + "flos": 20601820644480.0, + "grad_norm": 2.4876341958211037, + "language_loss": 0.80607671, + "learning_rate": 3.342346699429516e-06, + "loss": 0.88398409, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19433594, + "step": 4787, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.06516974, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.0630367, + "balance_loss_mlp": 0.01260191, + "epoch": 0.2878701337742372, + "flos": 26549643340800.0, + "grad_norm": 1.713934654291453, + "language_loss": 0.84188497, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.91985947, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.20288086, + "step": 4788, + "time_per_iteration": 2.610520362854004 + }, + { + "auxiliary_loss_clip": 0.06528202, + "auxiliary_loss_mlp": 0.01278372, + "balance_loss_clip": 0.06311956, + "balance_loss_mlp": 0.01257594, + "epoch": 0.28793025702690517, + "flos": 28154294156160.0, + "grad_norm": 1.8819133496848792, + "language_loss": 0.73887986, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.81694555, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2076416, + "step": 4789, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.06504419, + "auxiliary_loss_mlp": 0.0127649, + "balance_loss_clip": 0.06300576, + "balance_loss_mlp": 0.01259014, + "epoch": 0.28799038027957313, + "flos": 23812254305280.0, + "grad_norm": 1.6484379512289788, + "language_loss": 0.84411776, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92192692, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17492676, + "step": 4790, + "time_per_iteration": 2.5587222576141357 + }, + { + "auxiliary_loss_clip": 0.06518544, + "auxiliary_loss_mlp": 0.01278217, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01259728, + "epoch": 0.2880505035322411, + "flos": 22350340120320.0, + "grad_norm": 1.9872780385985664, + "language_loss": 0.78222489, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.86019248, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18481445, + "step": 4791, + "time_per_iteration": 2.624457359313965 + }, + { + "auxiliary_loss_clip": 0.06518695, + "auxiliary_loss_mlp": 0.01277015, + "balance_loss_clip": 0.06302316, + "balance_loss_mlp": 0.01257286, + "epoch": 0.28811062678490906, + "flos": 18010061205120.0, + "grad_norm": 3.7561845310327002, + "language_loss": 0.71278274, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.79073977, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19726562, + "step": 4792, + "time_per_iteration": 2.5208675861358643 + }, + { + "auxiliary_loss_clip": 0.06512474, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06301394, + "balance_loss_mlp": 0.01258391, + "epoch": 0.28817075003757703, + "flos": 22097416970880.0, + "grad_norm": 1.8001054572072859, + "language_loss": 0.80413318, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.88202471, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18286133, + "step": 4793, + "time_per_iteration": 4.170284271240234 + }, + { + "auxiliary_loss_clip": 0.06499149, + "auxiliary_loss_mlp": 0.01283104, + "balance_loss_clip": 0.06297339, + "balance_loss_mlp": 0.01264484, + "epoch": 0.288230873290245, + "flos": 41692842552960.0, + "grad_norm": 1.6709200510021447, + "language_loss": 0.78107667, + "learning_rate": 3.340324496161797e-06, + "loss": 0.85889918, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.18603516, + "step": 4794, + "time_per_iteration": 2.8557510375976562 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 0.06298079, + "balance_loss_mlp": 0.01260882, + "epoch": 0.28829099654291296, + "flos": 18630328654080.0, + "grad_norm": 2.1208293695579608, + "language_loss": 0.83245766, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91035557, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18652344, + "step": 4795, + "time_per_iteration": 2.535163164138794 + }, + { + "auxiliary_loss_clip": 0.06498718, + "auxiliary_loss_mlp": 0.0128311, + "balance_loss_clip": 0.06297053, + "balance_loss_mlp": 0.01266099, + "epoch": 0.2883511197955809, + "flos": 24680707148160.0, + "grad_norm": 2.078774389913416, + "language_loss": 0.75219119, + "learning_rate": 3.339746266208074e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17004395, + "step": 4796, + "time_per_iteration": 2.567488670349121 + }, + { + "auxiliary_loss_clip": 0.06509424, + "auxiliary_loss_mlp": 0.01276979, + "balance_loss_clip": 0.06296358, + "balance_loss_mlp": 0.01257798, + "epoch": 0.2884112430482489, + "flos": 23118794714880.0, + "grad_norm": 2.1968759883463513, + "language_loss": 0.73290622, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.81077027, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.19189453, + "step": 4797, + "time_per_iteration": 3.975389242172241 + }, + { + "auxiliary_loss_clip": 0.06507025, + "auxiliary_loss_mlp": 0.01273799, + "balance_loss_clip": 0.0629791, + "balance_loss_mlp": 0.0125556, + "epoch": 0.28847136630091685, + "flos": 16879000066560.0, + "grad_norm": 2.2937655739300373, + "language_loss": 0.74862409, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.82643229, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.18212891, + "step": 4798, + "time_per_iteration": 3.9849729537963867 + }, + { + "auxiliary_loss_clip": 0.06517179, + "auxiliary_loss_mlp": 0.01285883, + "balance_loss_clip": 0.06306559, + "balance_loss_mlp": 0.01266381, + "epoch": 0.2885314895535849, + "flos": 25663161870720.0, + "grad_norm": 2.626807334731923, + "language_loss": 0.65891635, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.736947, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19494629, + "step": 4799, + "time_per_iteration": 2.6063008308410645 + }, + { + "auxiliary_loss_clip": 0.06513311, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06300591, + "balance_loss_mlp": 0.01260013, + "epoch": 0.28859161280625284, + "flos": 21113872145280.0, + "grad_norm": 1.5942901452973643, + "language_loss": 0.82659006, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.9045099, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18664551, + "step": 4800, + "time_per_iteration": 2.5522704124450684 + }, + { + "auxiliary_loss_clip": 0.06498213, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 0.06294428, + "balance_loss_mlp": 0.01260609, + "epoch": 0.2886517360589208, + "flos": 26476870469760.0, + "grad_norm": 1.7957021177556654, + "language_loss": 0.91005886, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98781872, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17175293, + "step": 4801, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.06509861, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01260722, + "epoch": 0.28871185931158877, + "flos": 25272365627520.0, + "grad_norm": 1.8432796050129874, + "language_loss": 0.74294543, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82083023, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17895508, + "step": 4802, + "time_per_iteration": 2.5519795417785645 + }, + { + "auxiliary_loss_clip": 0.0639186, + "auxiliary_loss_mlp": 0.01290861, + "balance_loss_clip": 0.06293292, + "balance_loss_mlp": 0.01286456, + "epoch": 0.28877198256425674, + "flos": 66683676061440.0, + "grad_norm": 0.7742675136744124, + "language_loss": 0.62925327, + "learning_rate": 3.337720861641558e-06, + "loss": 0.70608056, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04412842, + "step": 4803, + "time_per_iteration": 4.557742595672607 + }, + { + "auxiliary_loss_clip": 0.06504417, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 0.06297504, + "balance_loss_mlp": 0.01256721, + "epoch": 0.2888321058169247, + "flos": 20309261713920.0, + "grad_norm": 2.312081796144873, + "language_loss": 0.71418971, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.79197359, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17248535, + "step": 4804, + "time_per_iteration": 2.5679221153259277 + }, + { + "auxiliary_loss_clip": 0.06511839, + "auxiliary_loss_mlp": 0.01276786, + "balance_loss_clip": 0.06299883, + "balance_loss_mlp": 0.01258892, + "epoch": 0.28889222906959267, + "flos": 25523192424960.0, + "grad_norm": 2.035708939634364, + "language_loss": 0.68254268, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76042891, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17907715, + "step": 4805, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06510667, + "auxiliary_loss_mlp": 0.01276264, + "balance_loss_clip": 0.06300112, + "balance_loss_mlp": 0.01258955, + "epoch": 0.28895235232226063, + "flos": 32679544959360.0, + "grad_norm": 1.67836402891337, + "language_loss": 0.69622278, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.77409214, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1730957, + "step": 4806, + "time_per_iteration": 2.6661036014556885 + }, + { + "auxiliary_loss_clip": 0.06499489, + "auxiliary_loss_mlp": 0.01273073, + "balance_loss_clip": 0.06297253, + "balance_loss_mlp": 0.01256133, + "epoch": 0.2890124755749286, + "flos": 29722202156160.0, + "grad_norm": 1.5048672267596763, + "language_loss": 0.71718901, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7949146, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16931152, + "step": 4807, + "time_per_iteration": 2.6082210540771484 + }, + { + "auxiliary_loss_clip": 0.06506096, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06298453, + "balance_loss_mlp": 0.01255769, + "epoch": 0.28907259882759656, + "flos": 22681067385600.0, + "grad_norm": 1.6103433555287536, + "language_loss": 0.8189373, + "learning_rate": 3.336272622079382e-06, + "loss": 0.89672995, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17407227, + "step": 4808, + "time_per_iteration": 2.575005292892456 + }, + { + "auxiliary_loss_clip": 0.0649471, + "auxiliary_loss_mlp": 0.01279377, + "balance_loss_clip": 0.06293811, + "balance_loss_mlp": 0.01261543, + "epoch": 0.2891327220802645, + "flos": 22572809510400.0, + "grad_norm": 1.6658984409983257, + "language_loss": 0.79128641, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86902726, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17834473, + "step": 4809, + "time_per_iteration": 2.563202142715454 + }, + { + "auxiliary_loss_clip": 0.06509645, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.06294866, + "balance_loss_mlp": 0.01256411, + "epoch": 0.2891928453329325, + "flos": 21659228444160.0, + "grad_norm": 1.9154470794900575, + "language_loss": 0.79370517, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8715474, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18151855, + "step": 4810, + "time_per_iteration": 2.555290460586548 + }, + { + "auxiliary_loss_clip": 0.06499892, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06295595, + "balance_loss_mlp": 0.01259259, + "epoch": 0.28925296858560046, + "flos": 23228855452800.0, + "grad_norm": 1.5886971021791327, + "language_loss": 0.77595514, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.85371131, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.16467285, + "step": 4811, + "time_per_iteration": 2.5522642135620117 + }, + { + "auxiliary_loss_clip": 0.06509165, + "auxiliary_loss_mlp": 0.01277164, + "balance_loss_clip": 0.06302579, + "balance_loss_mlp": 0.01259497, + "epoch": 0.2893130918382685, + "flos": 28629267425280.0, + "grad_norm": 1.704164513062304, + "language_loss": 0.78002596, + "learning_rate": 3.335113118275117e-06, + "loss": 0.85788929, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17675781, + "step": 4812, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.06384769, + "auxiliary_loss_mlp": 0.01270413, + "balance_loss_clip": 0.06288065, + "balance_loss_mlp": 0.01266965, + "epoch": 0.28937321509093644, + "flos": 72323328240000.0, + "grad_norm": 0.7614773045430072, + "language_loss": 0.60086656, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.67741829, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03457642, + "step": 4813, + "time_per_iteration": 3.3377795219421387 + }, + { + "auxiliary_loss_clip": 0.06503347, + "auxiliary_loss_mlp": 0.01279669, + "balance_loss_clip": 0.0629978, + "balance_loss_mlp": 0.01262253, + "epoch": 0.2894333383436044, + "flos": 16221905948160.0, + "grad_norm": 2.095142654160917, + "language_loss": 0.83059847, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.90842861, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.17407227, + "step": 4814, + "time_per_iteration": 2.519822120666504 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.0128276, + "balance_loss_clip": 0.06297985, + "balance_loss_mlp": 0.01264389, + "epoch": 0.2894934615962724, + "flos": 24835434912000.0, + "grad_norm": 1.4921373382431753, + "language_loss": 0.72583377, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.80376399, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18371582, + "step": 4815, + "time_per_iteration": 2.613424301147461 + }, + { + "auxiliary_loss_clip": 0.06496876, + "auxiliary_loss_mlp": 0.01270189, + "balance_loss_clip": 0.06299625, + "balance_loss_mlp": 0.01253858, + "epoch": 0.28955358484894034, + "flos": 20456400683520.0, + "grad_norm": 1.478095248571898, + "language_loss": 0.71455014, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79222083, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16345215, + "step": 4816, + "time_per_iteration": 2.523789644241333 + }, + { + "auxiliary_loss_clip": 0.0651416, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06301913, + "balance_loss_mlp": 0.01256007, + "epoch": 0.2896137081016083, + "flos": 22571803261440.0, + "grad_norm": 2.1886400582799643, + "language_loss": 0.75928313, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.83716327, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.17834473, + "step": 4817, + "time_per_iteration": 2.5829625129699707 + }, + { + "auxiliary_loss_clip": 0.0650699, + "auxiliary_loss_mlp": 0.0127444, + "balance_loss_clip": 0.06299114, + "balance_loss_mlp": 0.01255486, + "epoch": 0.28967383135427627, + "flos": 26695231009920.0, + "grad_norm": 2.009148210409016, + "language_loss": 0.77384543, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85165972, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18933105, + "step": 4818, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.06511898, + "auxiliary_loss_mlp": 0.01274642, + "balance_loss_clip": 0.063049, + "balance_loss_mlp": 0.01257833, + "epoch": 0.28973395460694423, + "flos": 15563428237440.0, + "grad_norm": 1.8180363278883531, + "language_loss": 0.80166686, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.87953222, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16833496, + "step": 4819, + "time_per_iteration": 2.58598256111145 + }, + { + "auxiliary_loss_clip": 0.06512412, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 0.06301294, + "balance_loss_mlp": 0.01256543, + "epoch": 0.2897940778596122, + "flos": 18703395014400.0, + "grad_norm": 1.8889731698350438, + "language_loss": 0.79784238, + "learning_rate": 3.332791681244776e-06, + "loss": 0.87571859, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18664551, + "step": 4820, + "time_per_iteration": 2.514738082885742 + }, + { + "auxiliary_loss_clip": 0.06519003, + "auxiliary_loss_mlp": 0.01272112, + "balance_loss_clip": 0.06309246, + "balance_loss_mlp": 0.01254612, + "epoch": 0.28985420111228016, + "flos": 18776209812480.0, + "grad_norm": 1.948801074603747, + "language_loss": 0.73537958, + "learning_rate": 3.332501274072231e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17492676, + "step": 4821, + "time_per_iteration": 2.6552352905273438 + }, + { + "auxiliary_loss_clip": 0.06509826, + "auxiliary_loss_mlp": 0.01279091, + "balance_loss_clip": 0.06303322, + "balance_loss_mlp": 0.01260733, + "epoch": 0.28991432436494813, + "flos": 23075511281280.0, + "grad_norm": 1.9415887628712303, + "language_loss": 0.7256397, + "learning_rate": 3.332210816371104e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18347168, + "step": 4822, + "time_per_iteration": 2.5311806201934814 + }, + { + "auxiliary_loss_clip": 0.06508678, + "auxiliary_loss_mlp": 0.0127532, + "balance_loss_clip": 0.06304502, + "balance_loss_mlp": 0.01258237, + "epoch": 0.2899744476176161, + "flos": 17608992837120.0, + "grad_norm": 1.6868082855094653, + "language_loss": 0.66498971, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17077637, + "step": 4823, + "time_per_iteration": 2.5582497119903564 + }, + { + "auxiliary_loss_clip": 0.06507877, + "auxiliary_loss_mlp": 0.0127093, + "balance_loss_clip": 0.06303018, + "balance_loss_mlp": 0.01253728, + "epoch": 0.29003457087028406, + "flos": 22315861365120.0, + "grad_norm": 2.007628710478466, + "language_loss": 0.81589168, + "learning_rate": 3.331629749427164e-06, + "loss": 0.89367974, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.171875, + "step": 4824, + "time_per_iteration": 2.5258595943450928 + }, + { + "auxiliary_loss_clip": 0.06510833, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301483, + "balance_loss_mlp": 0.01258376, + "epoch": 0.2900946941229521, + "flos": 21951493885440.0, + "grad_norm": 1.837693758429887, + "language_loss": 0.73192668, + "learning_rate": 3.331339140206385e-06, + "loss": 0.80979806, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.17932129, + "step": 4825, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.0651435, + "auxiliary_loss_mlp": 0.01275324, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01257049, + "epoch": 0.29015481737562004, + "flos": 17938126874880.0, + "grad_norm": 2.202818652908599, + "language_loss": 0.7426061, + "learning_rate": 3.331048480501092e-06, + "loss": 0.82050288, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18273926, + "step": 4826, + "time_per_iteration": 2.497711420059204 + }, + { + "auxiliary_loss_clip": 0.06516986, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01262141, + "epoch": 0.290214940628288, + "flos": 22790079947520.0, + "grad_norm": 1.934932602801083, + "language_loss": 0.69077051, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.76872945, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.16748047, + "step": 4827, + "time_per_iteration": 2.5729641914367676 + }, + { + "auxiliary_loss_clip": 0.06517433, + "auxiliary_loss_mlp": 0.0127379, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01255646, + "epoch": 0.290275063880956, + "flos": 20011881173760.0, + "grad_norm": 1.8047855406998587, + "language_loss": 0.80766201, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88557422, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.18151855, + "step": 4828, + "time_per_iteration": 2.5190348625183105 + }, + { + "auxiliary_loss_clip": 0.0651058, + "auxiliary_loss_mlp": 0.01278642, + "balance_loss_clip": 0.06308287, + "balance_loss_mlp": 0.01260809, + "epoch": 0.29033518713362394, + "flos": 22060003322880.0, + "grad_norm": 1.646725141321262, + "language_loss": 0.80908686, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8869791, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17822266, + "step": 4829, + "time_per_iteration": 2.564837694168091 + }, + { + "auxiliary_loss_clip": 0.06503877, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.0126059, + "epoch": 0.2903953103862919, + "flos": 25637194304640.0, + "grad_norm": 1.4271698228137566, + "language_loss": 0.82616186, + "learning_rate": 3.329885337055249e-06, + "loss": 0.90397674, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 4830, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.0652103, + "auxiliary_loss_mlp": 0.01280335, + "balance_loss_clip": 0.06313583, + "balance_loss_mlp": 0.01262036, + "epoch": 0.29045543363895987, + "flos": 16951437521280.0, + "grad_norm": 2.247105417787089, + "language_loss": 0.79901475, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.87702841, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18310547, + "step": 4831, + "time_per_iteration": 2.5306637287139893 + }, + { + "auxiliary_loss_clip": 0.06507042, + "auxiliary_loss_mlp": 0.01277723, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01261392, + "epoch": 0.29051555689162784, + "flos": 26402630152320.0, + "grad_norm": 2.3059080747570775, + "language_loss": 0.75331926, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.83116686, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16333008, + "step": 4832, + "time_per_iteration": 2.5603439807891846 + }, + { + "auxiliary_loss_clip": 0.06503655, + "auxiliary_loss_mlp": 0.01283448, + "balance_loss_clip": 0.06302731, + "balance_loss_mlp": 0.01267271, + "epoch": 0.2905756801442958, + "flos": 21109931003520.0, + "grad_norm": 1.626645949157208, + "language_loss": 0.76312864, + "learning_rate": 3.329012449923736e-06, + "loss": 0.8409996, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16162109, + "step": 4833, + "time_per_iteration": 4.029958963394165 + }, + { + "auxiliary_loss_clip": 0.06504881, + "auxiliary_loss_mlp": 0.01280243, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01263363, + "epoch": 0.29063580339696377, + "flos": 15711573456000.0, + "grad_norm": 1.645904053352059, + "language_loss": 0.65383506, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.73168635, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.16882324, + "step": 4834, + "time_per_iteration": 2.5233187675476074 + }, + { + "auxiliary_loss_clip": 0.06499655, + "auxiliary_loss_mlp": 0.01274915, + "balance_loss_clip": 0.06299647, + "balance_loss_mlp": 0.01258893, + "epoch": 0.29069592664963173, + "flos": 24651972397440.0, + "grad_norm": 1.808411103531711, + "language_loss": 0.71914709, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.79689276, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16027832, + "step": 4835, + "time_per_iteration": 2.555670738220215 + }, + { + "auxiliary_loss_clip": 0.06500543, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06299368, + "balance_loss_mlp": 0.01259536, + "epoch": 0.2907560499022997, + "flos": 24980854872960.0, + "grad_norm": 1.750724607078226, + "language_loss": 0.80319953, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88096082, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16052246, + "step": 4836, + "time_per_iteration": 3.9953579902648926 + }, + { + "auxiliary_loss_clip": 0.0650623, + "auxiliary_loss_mlp": 0.01276306, + "balance_loss_clip": 0.06305872, + "balance_loss_mlp": 0.01260236, + "epoch": 0.29081617315496766, + "flos": 18662836838400.0, + "grad_norm": 1.8282626295265978, + "language_loss": 0.81337535, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.89120078, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16064453, + "step": 4837, + "time_per_iteration": 3.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.06508449, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.06305645, + "balance_loss_mlp": 0.01257362, + "epoch": 0.2908762964076356, + "flos": 35339087952000.0, + "grad_norm": 1.819350457328488, + "language_loss": 0.67809796, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75593495, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17895508, + "step": 4838, + "time_per_iteration": 2.6575772762298584 + }, + { + "auxiliary_loss_clip": 0.06511781, + "auxiliary_loss_mlp": 0.01274117, + "balance_loss_clip": 0.06305051, + "balance_loss_mlp": 0.01256688, + "epoch": 0.29093641966030365, + "flos": 23083058148480.0, + "grad_norm": 2.3112745331966185, + "language_loss": 0.71775508, + "learning_rate": 3.327265315259095e-06, + "loss": 0.79561406, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17419434, + "step": 4839, + "time_per_iteration": 2.6057844161987305 + }, + { + "auxiliary_loss_clip": 0.06504601, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258071, + "epoch": 0.2909965429129716, + "flos": 35964260864640.0, + "grad_norm": 1.8988017352340443, + "language_loss": 0.75792682, + "learning_rate": 3.326973949928776e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16503906, + "step": 4840, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.06503059, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06299757, + "balance_loss_mlp": 0.01255417, + "epoch": 0.2910566661656396, + "flos": 30887616268800.0, + "grad_norm": 1.8129671702232821, + "language_loss": 0.60949063, + "learning_rate": 3.326682534279471e-06, + "loss": 0.68724, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16479492, + "step": 4841, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.06506652, + "auxiliary_loss_mlp": 0.01272342, + "balance_loss_clip": 0.06303366, + "balance_loss_mlp": 0.01255021, + "epoch": 0.29111678941830754, + "flos": 30018366812160.0, + "grad_norm": 1.3487344136639734, + "language_loss": 0.71762401, + "learning_rate": 3.326391068322232e-06, + "loss": 0.79541385, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17333984, + "step": 4842, + "time_per_iteration": 4.036385774612427 + }, + { + "auxiliary_loss_clip": 0.06507391, + "auxiliary_loss_mlp": 0.01271836, + "balance_loss_clip": 0.06304808, + "balance_loss_mlp": 0.01256423, + "epoch": 0.2911769126709755, + "flos": 22864110629760.0, + "grad_norm": 1.4808705717301018, + "language_loss": 0.74052906, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.81832135, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.1541748, + "step": 4843, + "time_per_iteration": 2.565093755722046 + }, + { + "auxiliary_loss_clip": 0.06510359, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.06305443, + "balance_loss_mlp": 0.01256742, + "epoch": 0.2912370359236435, + "flos": 21656545113600.0, + "grad_norm": 3.6041214714298806, + "language_loss": 0.5879783, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.66580796, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.15856934, + "step": 4844, + "time_per_iteration": 2.636667490005493 + }, + { + "auxiliary_loss_clip": 0.06518383, + "auxiliary_loss_mlp": 0.01278792, + "balance_loss_clip": 0.06309091, + "balance_loss_mlp": 0.01261566, + "epoch": 0.29129715917631144, + "flos": 22899972977280.0, + "grad_norm": 1.9195914149996331, + "language_loss": 0.86846137, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.94643313, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.17224121, + "step": 4845, + "time_per_iteration": 2.549297571182251 + }, + { + "auxiliary_loss_clip": 0.06508736, + "auxiliary_loss_mlp": 0.01273322, + "balance_loss_clip": 0.06304652, + "balance_loss_mlp": 0.01256144, + "epoch": 0.2913572824289794, + "flos": 22681067385600.0, + "grad_norm": 1.8711717874469986, + "language_loss": 0.67698014, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75480074, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17175293, + "step": 4846, + "time_per_iteration": 2.607025146484375 + }, + { + "auxiliary_loss_clip": 0.06502484, + "auxiliary_loss_mlp": 0.01275425, + "balance_loss_clip": 0.06301165, + "balance_loss_mlp": 0.01258771, + "epoch": 0.29141740568164737, + "flos": 23113260345600.0, + "grad_norm": 4.990917175371688, + "language_loss": 0.708718, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.78649712, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16674805, + "step": 4847, + "time_per_iteration": 2.5293991565704346 + }, + { + "auxiliary_loss_clip": 0.06504785, + "auxiliary_loss_mlp": 0.01278673, + "balance_loss_clip": 0.06301495, + "balance_loss_mlp": 0.01261877, + "epoch": 0.29147752893431533, + "flos": 23593851838080.0, + "grad_norm": 1.4565796817402286, + "language_loss": 0.74258435, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82041889, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16796875, + "step": 4848, + "time_per_iteration": 2.585296630859375 + }, + { + "auxiliary_loss_clip": 0.06502895, + "auxiliary_loss_mlp": 0.01276049, + "balance_loss_clip": 0.06298006, + "balance_loss_mlp": 0.01259729, + "epoch": 0.2915376521869833, + "flos": 20597753721600.0, + "grad_norm": 2.1223800155182624, + "language_loss": 0.77561575, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.85340518, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.16333008, + "step": 4849, + "time_per_iteration": 2.4936819076538086 + }, + { + "auxiliary_loss_clip": 0.06514408, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01257723, + "epoch": 0.29159777543965126, + "flos": 20817414000000.0, + "grad_norm": 1.652469266745217, + "language_loss": 0.79415965, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87204546, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16442871, + "step": 4850, + "time_per_iteration": 2.55340313911438 + }, + { + "auxiliary_loss_clip": 0.06494947, + "auxiliary_loss_mlp": 0.0127524, + "balance_loss_clip": 0.06295137, + "balance_loss_mlp": 0.01258479, + "epoch": 0.29165789869231923, + "flos": 24251155591680.0, + "grad_norm": 1.7747423674847125, + "language_loss": 0.76365012, + "learning_rate": 3.323765612674296e-06, + "loss": 0.84135199, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16748047, + "step": 4851, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.06499958, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06300404, + "balance_loss_mlp": 0.01256929, + "epoch": 0.29171802194498725, + "flos": 28957562922240.0, + "grad_norm": 1.3481127708223366, + "language_loss": 0.7781775, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.85590267, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.15612793, + "step": 4852, + "time_per_iteration": 2.6266329288482666 + }, + { + "auxiliary_loss_clip": 0.06501517, + "auxiliary_loss_mlp": 0.0127959, + "balance_loss_clip": 0.06297216, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2917781451976552, + "flos": 22604269518720.0, + "grad_norm": 1.5006442804531215, + "language_loss": 0.78676021, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.86457133, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17883301, + "step": 4853, + "time_per_iteration": 2.5417568683624268 + }, + { + "auxiliary_loss_clip": 0.06501997, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06296347, + "balance_loss_mlp": 0.01253818, + "epoch": 0.2918382684503232, + "flos": 21579956881920.0, + "grad_norm": 4.190137743849971, + "language_loss": 0.88580358, + "learning_rate": 3.322889556841445e-06, + "loss": 0.96353114, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.16943359, + "step": 4854, + "time_per_iteration": 2.537247896194458 + }, + { + "auxiliary_loss_clip": 0.06492339, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01255517, + "epoch": 0.29189839170299114, + "flos": 24360503569920.0, + "grad_norm": 1.79615422427109, + "language_loss": 0.86863208, + "learning_rate": 3.322597437887519e-06, + "loss": 0.94629866, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18798828, + "step": 4855, + "time_per_iteration": 2.5408217906951904 + }, + { + "auxiliary_loss_clip": 0.06394155, + "auxiliary_loss_mlp": 0.01254999, + "balance_loss_clip": 0.0629582, + "balance_loss_mlp": 0.01250765, + "epoch": 0.2919585149556591, + "flos": 71338693311360.0, + "grad_norm": 0.8469602753394808, + "language_loss": 0.60232264, + "learning_rate": 3.322305268780566e-06, + "loss": 0.67881417, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.04238892, + "step": 4856, + "time_per_iteration": 3.245720863342285 + }, + { + "auxiliary_loss_clip": 0.06496054, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 0.06293447, + "balance_loss_mlp": 0.01254966, + "epoch": 0.2920186382083271, + "flos": 15638716730880.0, + "grad_norm": 1.9340338412348166, + "language_loss": 0.69134986, + "learning_rate": 3.322013049531664e-06, + "loss": 0.76902497, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.16479492, + "step": 4857, + "time_per_iteration": 2.492515802383423 + }, + { + "auxiliary_loss_clip": 0.0649875, + "auxiliary_loss_mlp": 0.01275648, + "balance_loss_clip": 0.06298544, + "balance_loss_mlp": 0.01258863, + "epoch": 0.29207876146099504, + "flos": 28373535164160.0, + "grad_norm": 2.0544380804392346, + "language_loss": 0.84425288, + "learning_rate": 3.321720780151895e-06, + "loss": 0.92199689, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16772461, + "step": 4858, + "time_per_iteration": 2.596036434173584 + }, + { + "auxiliary_loss_clip": 0.06500848, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06300872, + "balance_loss_mlp": 0.01257879, + "epoch": 0.292138884713663, + "flos": 21877295495040.0, + "grad_norm": 1.6880642207641439, + "language_loss": 0.781169, + "learning_rate": 3.321428460652342e-06, + "loss": 0.85892725, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17102051, + "step": 4859, + "time_per_iteration": 2.5885818004608154 + }, + { + "auxiliary_loss_clip": 0.06508546, + "auxiliary_loss_mlp": 0.01274065, + "balance_loss_clip": 0.06301034, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29219900796633097, + "flos": 20998277038080.0, + "grad_norm": 2.276956308498861, + "language_loss": 0.68823123, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.76605731, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17224121, + "step": 4860, + "time_per_iteration": 2.6006133556365967 + }, + { + "auxiliary_loss_clip": 0.06497137, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01256743, + "epoch": 0.29225913121899894, + "flos": 35012930734080.0, + "grad_norm": 1.9621079535677741, + "language_loss": 0.75927335, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83697826, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16625977, + "step": 4861, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06498605, + "auxiliary_loss_mlp": 0.01278705, + "balance_loss_clip": 0.06298269, + "balance_loss_mlp": 0.0126229, + "epoch": 0.2923192544716669, + "flos": 13520588895360.0, + "grad_norm": 2.4944662876521027, + "language_loss": 0.91953582, + "learning_rate": 3.320551201545832e-06, + "loss": 0.99730897, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16418457, + "step": 4862, + "time_per_iteration": 2.523393392562866 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01258325, + "epoch": 0.29237937772433487, + "flos": 19469543621760.0, + "grad_norm": 2.367835349845546, + "language_loss": 0.74302417, + "learning_rate": 3.320258681678008e-06, + "loss": 0.82076436, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16809082, + "step": 4863, + "time_per_iteration": 2.5615665912628174 + }, + { + "auxiliary_loss_clip": 0.06495367, + "auxiliary_loss_mlp": 0.01274458, + "balance_loss_clip": 0.06298485, + "balance_loss_mlp": 0.01257041, + "epoch": 0.29243950097700283, + "flos": 20856965927040.0, + "grad_norm": 1.6096808438714836, + "language_loss": 0.78180861, + "learning_rate": 3.319966111745842e-06, + "loss": 0.85950685, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.17419434, + "step": 4864, + "time_per_iteration": 2.543239116668701 + }, + { + "auxiliary_loss_clip": 0.06506015, + "auxiliary_loss_mlp": 0.01278091, + "balance_loss_clip": 0.06299396, + "balance_loss_mlp": 0.01260127, + "epoch": 0.29249962422967085, + "flos": 23590581528960.0, + "grad_norm": 1.7200803595236853, + "language_loss": 0.82166076, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8995018, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1796875, + "step": 4865, + "time_per_iteration": 2.6162562370300293 + }, + { + "auxiliary_loss_clip": 0.06504746, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06300808, + "balance_loss_mlp": 0.01258783, + "epoch": 0.2925597474823388, + "flos": 22279915163520.0, + "grad_norm": 1.8207973709117147, + "language_loss": 0.85861242, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.93643779, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18994141, + "step": 4866, + "time_per_iteration": 2.5991125106811523 + }, + { + "auxiliary_loss_clip": 0.06498669, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06298468, + "balance_loss_mlp": 0.0126005, + "epoch": 0.2926198707350068, + "flos": 34464136417920.0, + "grad_norm": 1.677629799943763, + "language_loss": 0.76065934, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17578125, + "step": 4867, + "time_per_iteration": 2.652083396911621 + }, + { + "auxiliary_loss_clip": 0.06508122, + "auxiliary_loss_mlp": 0.01277995, + "balance_loss_clip": 0.06302974, + "balance_loss_mlp": 0.01260483, + "epoch": 0.29267999398767475, + "flos": 20710413936000.0, + "grad_norm": 2.5581846543962197, + "language_loss": 0.73412025, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81198144, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.1751709, + "step": 4868, + "time_per_iteration": 2.5104074478149414 + }, + { + "auxiliary_loss_clip": 0.06504919, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 0.06304781, + "balance_loss_mlp": 0.01256558, + "epoch": 0.2927401172403427, + "flos": 18374470611840.0, + "grad_norm": 1.376823387605754, + "language_loss": 0.74768585, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.17260742, + "step": 4869, + "time_per_iteration": 2.517545461654663 + }, + { + "auxiliary_loss_clip": 0.06509744, + "auxiliary_loss_mlp": 0.01275578, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01258627, + "epoch": 0.2928002404930107, + "flos": 26111203251840.0, + "grad_norm": 1.453461002371515, + "language_loss": 0.76538026, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84323347, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.16931152, + "step": 4870, + "time_per_iteration": 2.571554183959961 + }, + { + "auxiliary_loss_clip": 0.06512202, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06304315, + "balance_loss_mlp": 0.01255967, + "epoch": 0.29286036374567864, + "flos": 21331142582400.0, + "grad_norm": 3.1299518178223726, + "language_loss": 0.67793286, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.75579637, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18188477, + "step": 4871, + "time_per_iteration": 2.5867390632629395 + }, + { + "auxiliary_loss_clip": 0.06504084, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 0.0630291, + "balance_loss_mlp": 0.01256973, + "epoch": 0.2929204869983466, + "flos": 29577117611520.0, + "grad_norm": 1.7840080197301964, + "language_loss": 0.78071094, + "learning_rate": 3.317623751303933e-06, + "loss": 0.85847723, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.15588379, + "step": 4872, + "time_per_iteration": 2.598357915878296 + }, + { + "auxiliary_loss_clip": 0.06511893, + "auxiliary_loss_mlp": 0.01279899, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260313, + "epoch": 0.2929806102510146, + "flos": 19063569790080.0, + "grad_norm": 1.7763964443019538, + "language_loss": 0.72879624, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.19580078, + "step": 4873, + "time_per_iteration": 3.9404540061950684 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01274077, + "balance_loss_clip": 0.06303495, + "balance_loss_mlp": 0.01256386, + "epoch": 0.29304073350368254, + "flos": 21950613417600.0, + "grad_norm": 1.85182595241139, + "language_loss": 0.79023468, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86808634, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17675781, + "step": 4874, + "time_per_iteration": 2.523942470550537 + }, + { + "auxiliary_loss_clip": 0.06517696, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 0.06305568, + "balance_loss_mlp": 0.01255315, + "epoch": 0.2931008567563505, + "flos": 15456302392320.0, + "grad_norm": 2.3441988108556377, + "language_loss": 0.7791701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.85707539, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17504883, + "step": 4875, + "time_per_iteration": 2.4990556240081787 + }, + { + "auxiliary_loss_clip": 0.06506883, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 0.06301031, + "balance_loss_mlp": 0.01263252, + "epoch": 0.29316098000901847, + "flos": 16988893096320.0, + "grad_norm": 1.859745338516673, + "language_loss": 0.70031023, + "learning_rate": 3.316451371581431e-06, + "loss": 0.77818477, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17321777, + "step": 4876, + "time_per_iteration": 5.4681243896484375 + }, + { + "auxiliary_loss_clip": 0.06504045, + "auxiliary_loss_mlp": 0.01275518, + "balance_loss_clip": 0.06302452, + "balance_loss_mlp": 0.01259174, + "epoch": 0.29322110326168643, + "flos": 16362462372480.0, + "grad_norm": 1.8247622937841679, + "language_loss": 0.82480925, + "learning_rate": 3.316158151823096e-06, + "loss": 0.90260488, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16345215, + "step": 4877, + "time_per_iteration": 2.5517635345458984 + }, + { + "auxiliary_loss_clip": 0.06509132, + "auxiliary_loss_mlp": 0.01278665, + "balance_loss_clip": 0.06299806, + "balance_loss_mlp": 0.0126064, + "epoch": 0.29328122651435445, + "flos": 13996023361920.0, + "grad_norm": 2.6416558700601334, + "language_loss": 0.6810987, + "learning_rate": 3.315864882155911e-06, + "loss": 0.75897658, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18017578, + "step": 4878, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.0649902, + "auxiliary_loss_mlp": 0.01275226, + "balance_loss_clip": 0.06298085, + "balance_loss_mlp": 0.01257697, + "epoch": 0.2933413497670224, + "flos": 25271569013760.0, + "grad_norm": 1.8820124674491874, + "language_loss": 0.74030542, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.81804794, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17510986, + "step": 4879, + "time_per_iteration": 2.6044318675994873 + }, + { + "auxiliary_loss_clip": 0.06501681, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.0629803, + "balance_loss_mlp": 0.01259187, + "epoch": 0.2934014730196904, + "flos": 32131840746240.0, + "grad_norm": 2.9151820016542183, + "language_loss": 0.67178017, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.7495712, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18237305, + "step": 4880, + "time_per_iteration": 2.603761672973633 + }, + { + "auxiliary_loss_clip": 0.06503071, + "auxiliary_loss_mlp": 0.01271949, + "balance_loss_clip": 0.0629775, + "balance_loss_mlp": 0.01255367, + "epoch": 0.29346159627235835, + "flos": 24359329612800.0, + "grad_norm": 2.6105900749093633, + "language_loss": 0.71260536, + "learning_rate": 3.314984773812481e-06, + "loss": 0.79035556, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.16577148, + "step": 4881, + "time_per_iteration": 2.593226432800293 + }, + { + "auxiliary_loss_clip": 0.06502824, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298223, + "balance_loss_mlp": 0.01256603, + "epoch": 0.2935217195250263, + "flos": 22753253278080.0, + "grad_norm": 1.6618295774620153, + "language_loss": 0.83893931, + "learning_rate": 3.314691304621127e-06, + "loss": 0.91672039, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18688965, + "step": 4882, + "time_per_iteration": 3.9488399028778076 + }, + { + "auxiliary_loss_clip": 0.06502259, + "auxiliary_loss_mlp": 0.01273532, + "balance_loss_clip": 0.06293593, + "balance_loss_mlp": 0.01255961, + "epoch": 0.2935818427776943, + "flos": 21731959388160.0, + "grad_norm": 4.210124979545191, + "language_loss": 0.72920972, + "learning_rate": 3.314397785576548e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17565918, + "step": 4883, + "time_per_iteration": 2.557283878326416 + }, + { + "auxiliary_loss_clip": 0.06496279, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 0.06292833, + "balance_loss_mlp": 0.01257103, + "epoch": 0.29364196603036224, + "flos": 23811667326720.0, + "grad_norm": 2.0649535872154217, + "language_loss": 0.93051624, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.00822163, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.17150879, + "step": 4884, + "time_per_iteration": 2.5359458923339844 + }, + { + "auxiliary_loss_clip": 0.06506841, + "auxiliary_loss_mlp": 0.01273123, + "balance_loss_clip": 0.06302871, + "balance_loss_mlp": 0.01255409, + "epoch": 0.2937020892830302, + "flos": 23475615327360.0, + "grad_norm": 2.6201562161688017, + "language_loss": 0.73813069, + "learning_rate": 3.313810597972234e-06, + "loss": 0.81593031, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17712402, + "step": 4885, + "time_per_iteration": 2.547731637954712 + }, + { + "auxiliary_loss_clip": 0.06506574, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01253936, + "epoch": 0.2937622125356982, + "flos": 24278422896000.0, + "grad_norm": 2.0067568315745907, + "language_loss": 0.8568837, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93466175, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.1730957, + "step": 4886, + "time_per_iteration": 2.5345749855041504 + }, + { + "auxiliary_loss_clip": 0.06516494, + "auxiliary_loss_mlp": 0.01282352, + "balance_loss_clip": 0.06309356, + "balance_loss_mlp": 0.01266223, + "epoch": 0.29382233578836614, + "flos": 20667843262080.0, + "grad_norm": 2.2972144011917863, + "language_loss": 0.7819618, + "learning_rate": 3.313223211088603e-06, + "loss": 0.85995024, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16137695, + "step": 4887, + "time_per_iteration": 2.5718464851379395 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01281343, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263962, + "epoch": 0.2938824590410341, + "flos": 16550662642560.0, + "grad_norm": 2.5346543108244366, + "language_loss": 0.80135798, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.87925565, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.1739502, + "step": 4888, + "time_per_iteration": 2.5823678970336914 + }, + { + "auxiliary_loss_clip": 0.06512221, + "auxiliary_loss_mlp": 0.01274662, + "balance_loss_clip": 0.06308408, + "balance_loss_mlp": 0.01257878, + "epoch": 0.29394258229370207, + "flos": 37934620824960.0, + "grad_norm": 1.521834171262281, + "language_loss": 0.55984998, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63771886, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16784668, + "step": 4889, + "time_per_iteration": 2.6925320625305176 + }, + { + "auxiliary_loss_clip": 0.06519246, + "auxiliary_loss_mlp": 0.01278013, + "balance_loss_clip": 0.06313413, + "balance_loss_mlp": 0.0126056, + "epoch": 0.29400270554637004, + "flos": 20050384924800.0, + "grad_norm": 1.7589662768394465, + "language_loss": 0.85257453, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93054712, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17443848, + "step": 4890, + "time_per_iteration": 2.546391010284424 + }, + { + "auxiliary_loss_clip": 0.06513973, + "auxiliary_loss_mlp": 0.01284253, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01266288, + "epoch": 0.294062828799038, + "flos": 15271498212480.0, + "grad_norm": 1.9077501912209676, + "language_loss": 0.73679662, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.81477886, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.17956543, + "step": 4891, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.06519526, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_clip": 0.06312989, + "balance_loss_mlp": 0.01267468, + "epoch": 0.294122952051706, + "flos": 22753714475520.0, + "grad_norm": 1.802215562222595, + "language_loss": 0.77636111, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.85441071, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17974854, + "step": 4892, + "time_per_iteration": 2.556626796722412 + }, + { + "auxiliary_loss_clip": 0.06508264, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06305899, + "balance_loss_mlp": 0.01257096, + "epoch": 0.294183075304374, + "flos": 24979848624000.0, + "grad_norm": 1.857019535889917, + "language_loss": 0.78546309, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.86329335, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.17675781, + "step": 4893, + "time_per_iteration": 2.5583088397979736 + }, + { + "auxiliary_loss_clip": 0.06512541, + "auxiliary_loss_mlp": 0.01279131, + "balance_loss_clip": 0.06308632, + "balance_loss_mlp": 0.01262764, + "epoch": 0.29424319855704195, + "flos": 30960347212800.0, + "grad_norm": 7.778949224672863, + "language_loss": 0.85594332, + "learning_rate": 3.311165788957864e-06, + "loss": 0.93386006, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16357422, + "step": 4894, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.06515005, + "auxiliary_loss_mlp": 0.01277674, + "balance_loss_clip": 0.06308285, + "balance_loss_mlp": 0.01260639, + "epoch": 0.2943033218097099, + "flos": 15236977530240.0, + "grad_norm": 2.7328127009682617, + "language_loss": 0.91485763, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99278444, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17028809, + "step": 4895, + "time_per_iteration": 2.499884605407715 + }, + { + "auxiliary_loss_clip": 0.06521617, + "auxiliary_loss_mlp": 0.01275591, + "balance_loss_clip": 0.06309959, + "balance_loss_mlp": 0.01257519, + "epoch": 0.2943634450623779, + "flos": 21732336731520.0, + "grad_norm": 1.9156960384195119, + "language_loss": 0.86768568, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.94565773, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18078613, + "step": 4896, + "time_per_iteration": 2.5482704639434814 + }, + { + "auxiliary_loss_clip": 0.06512056, + "auxiliary_loss_mlp": 0.01275376, + "balance_loss_clip": 0.06306215, + "balance_loss_mlp": 0.01257996, + "epoch": 0.29442356831504585, + "flos": 22608797639040.0, + "grad_norm": 2.0283086901116354, + "language_loss": 0.73915696, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.81703126, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.17382812, + "step": 4897, + "time_per_iteration": 2.5434658527374268 + }, + { + "auxiliary_loss_clip": 0.0652054, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06307404, + "balance_loss_mlp": 0.01262378, + "epoch": 0.2944836915677138, + "flos": 20017625178240.0, + "grad_norm": 1.9321922101744466, + "language_loss": 0.74697995, + "learning_rate": 3.309989025093813e-06, + "loss": 0.82499176, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18261719, + "step": 4898, + "time_per_iteration": 2.5770161151885986 + }, + { + "auxiliary_loss_clip": 0.06516017, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 0.06305353, + "balance_loss_mlp": 0.01259586, + "epoch": 0.2945438148203818, + "flos": 20051768517120.0, + "grad_norm": 2.462097706840479, + "language_loss": 0.71617198, + "learning_rate": 3.309694709912618e-06, + "loss": 0.79411781, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4899, + "time_per_iteration": 2.5297536849975586 + }, + { + "auxiliary_loss_clip": 0.06510775, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06304912, + "balance_loss_mlp": 0.01259727, + "epoch": 0.29460393807304974, + "flos": 23740487683200.0, + "grad_norm": 9.70716698994663, + "language_loss": 0.79828262, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.87617099, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18322754, + "step": 4900, + "time_per_iteration": 2.589350461959839 + }, + { + "auxiliary_loss_clip": 0.06501958, + "auxiliary_loss_mlp": 0.01277561, + "balance_loss_clip": 0.06297968, + "balance_loss_mlp": 0.01260025, + "epoch": 0.2946640613257177, + "flos": 14981412977280.0, + "grad_norm": 1.6788003410312407, + "language_loss": 0.81419849, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89199364, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.1751709, + "step": 4901, + "time_per_iteration": 2.4958457946777344 + }, + { + "auxiliary_loss_clip": 0.06498285, + "auxiliary_loss_mlp": 0.01276891, + "balance_loss_clip": 0.0630265, + "balance_loss_mlp": 0.01261095, + "epoch": 0.2947241845783857, + "flos": 24250862102400.0, + "grad_norm": 2.051988062923015, + "language_loss": 0.58211619, + "learning_rate": 3.308811466431157e-06, + "loss": 0.659868, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.15783691, + "step": 4902, + "time_per_iteration": 2.5867393016815186 + }, + { + "auxiliary_loss_clip": 0.06509895, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 0.06304582, + "balance_loss_mlp": 0.01261825, + "epoch": 0.29478430783105364, + "flos": 19944600744960.0, + "grad_norm": 1.670035021285574, + "language_loss": 0.75883406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.83671534, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.16418457, + "step": 4903, + "time_per_iteration": 2.5120930671691895 + }, + { + "auxiliary_loss_clip": 0.06499215, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01255612, + "epoch": 0.2948444310837216, + "flos": 27388774454400.0, + "grad_norm": 1.8166217426315454, + "language_loss": 0.6305517, + "learning_rate": 3.3082223892736e-06, + "loss": 0.7082777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17773438, + "step": 4904, + "time_per_iteration": 2.610600709915161 + }, + { + "auxiliary_loss_clip": 0.06509106, + "auxiliary_loss_mlp": 0.01272684, + "balance_loss_clip": 0.06301488, + "balance_loss_mlp": 0.01255983, + "epoch": 0.2949045543363896, + "flos": 23412401821440.0, + "grad_norm": 1.721115639485294, + "language_loss": 0.73724848, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.8150664, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.16711426, + "step": 4905, + "time_per_iteration": 2.5330429077148438 + }, + { + "auxiliary_loss_clip": 0.06501255, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06297939, + "balance_loss_mlp": 0.01252508, + "epoch": 0.2949646775890576, + "flos": 23958303171840.0, + "grad_norm": 1.607284793713989, + "language_loss": 0.81930244, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.89701641, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17614746, + "step": 4906, + "time_per_iteration": 2.5717568397521973 + }, + { + "auxiliary_loss_clip": 0.06499709, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06300811, + "balance_loss_mlp": 0.01254051, + "epoch": 0.29502480084172555, + "flos": 22791002342400.0, + "grad_norm": 1.8767623479937394, + "language_loss": 0.88041449, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95811397, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 4907, + "time_per_iteration": 2.532233238220215 + }, + { + "auxiliary_loss_clip": 0.06504819, + "auxiliary_loss_mlp": 0.01277393, + "balance_loss_clip": 0.06294614, + "balance_loss_mlp": 0.01257592, + "epoch": 0.2950849240943935, + "flos": 19652838428160.0, + "grad_norm": 2.2863974346720837, + "language_loss": 0.82530308, + "learning_rate": 3.307043639752782e-06, + "loss": 0.90312517, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.19812012, + "step": 4908, + "time_per_iteration": 2.6338536739349365 + }, + { + "auxiliary_loss_clip": 0.06393203, + "auxiliary_loss_mlp": 0.01256311, + "balance_loss_clip": 0.06296152, + "balance_loss_mlp": 0.01251251, + "epoch": 0.2951450473470615, + "flos": 71021062010880.0, + "grad_norm": 0.749349843123412, + "language_loss": 0.57384133, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.65033644, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.05059814, + "step": 4909, + "time_per_iteration": 3.0084846019744873 + }, + { + "auxiliary_loss_clip": 0.06500423, + "auxiliary_loss_mlp": 0.01279147, + "balance_loss_clip": 0.06298146, + "balance_loss_mlp": 0.0126278, + "epoch": 0.29520517059972945, + "flos": 22972955483520.0, + "grad_norm": 1.5167904233162786, + "language_loss": 0.87274551, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.9505412, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16381836, + "step": 4910, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.06494174, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06294993, + "balance_loss_mlp": 0.01255017, + "epoch": 0.2952652938523974, + "flos": 20491969541760.0, + "grad_norm": 1.9871602841434197, + "language_loss": 0.72998595, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.80764621, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.16821289, + "step": 4911, + "time_per_iteration": 2.5274527072906494 + }, + { + "auxiliary_loss_clip": 0.06493053, + "auxiliary_loss_mlp": 0.01276167, + "balance_loss_clip": 0.06295265, + "balance_loss_mlp": 0.01260122, + "epoch": 0.2953254171050654, + "flos": 19652754574080.0, + "grad_norm": 1.8153147203758204, + "language_loss": 0.90350848, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.98120075, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16040039, + "step": 4912, + "time_per_iteration": 4.015045881271362 + }, + { + "auxiliary_loss_clip": 0.06500725, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06298609, + "balance_loss_mlp": 0.01256474, + "epoch": 0.29538554035773334, + "flos": 22754678797440.0, + "grad_norm": 1.456675217678442, + "language_loss": 0.83491737, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.91266304, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17370605, + "step": 4913, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01271149, + "balance_loss_clip": 0.06297807, + "balance_loss_mlp": 0.01255163, + "epoch": 0.2954456636104013, + "flos": 21878343671040.0, + "grad_norm": 1.7751266266229593, + "language_loss": 0.77296054, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85066384, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.15991211, + "step": 4914, + "time_per_iteration": 2.5379679203033447 + }, + { + "auxiliary_loss_clip": 0.06494316, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01259563, + "epoch": 0.2955057868630693, + "flos": 40452056092800.0, + "grad_norm": 1.8412710776020966, + "language_loss": 0.81848276, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89618844, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16674805, + "step": 4915, + "time_per_iteration": 4.123507261276245 + }, + { + "auxiliary_loss_clip": 0.06504083, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01260707, + "epoch": 0.29556591011573724, + "flos": 22571006647680.0, + "grad_norm": 1.7265680083109098, + "language_loss": 0.85337454, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.93119645, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1739502, + "step": 4916, + "time_per_iteration": 3.964902400970459 + }, + { + "auxiliary_loss_clip": 0.06496175, + "auxiliary_loss_mlp": 0.01273483, + "balance_loss_clip": 0.06292706, + "balance_loss_mlp": 0.01257187, + "epoch": 0.2956260333684052, + "flos": 22095572181120.0, + "grad_norm": 2.6877460244099254, + "language_loss": 0.71410239, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.79179895, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16296387, + "step": 4917, + "time_per_iteration": 2.510061502456665 + }, + { + "auxiliary_loss_clip": 0.06495264, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01255771, + "epoch": 0.2956861566210732, + "flos": 16441063102080.0, + "grad_norm": 1.9904514264943383, + "language_loss": 0.9154985, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.99318182, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.1730957, + "step": 4918, + "time_per_iteration": 2.5177812576293945 + }, + { + "auxiliary_loss_clip": 0.06500694, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.0629639, + "balance_loss_mlp": 0.01252887, + "epoch": 0.2957462798737412, + "flos": 25819189372800.0, + "grad_norm": 2.9632565132584587, + "language_loss": 0.73171133, + "learning_rate": 3.303797991757425e-06, + "loss": 0.80942631, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.17919922, + "step": 4919, + "time_per_iteration": 2.548271656036377 + }, + { + "auxiliary_loss_clip": 0.06494663, + "auxiliary_loss_mlp": 0.01276246, + "balance_loss_clip": 0.062939, + "balance_loss_mlp": 0.01259104, + "epoch": 0.29580640312640916, + "flos": 16696459946880.0, + "grad_norm": 2.067015346809242, + "language_loss": 0.76653767, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84424675, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17138672, + "step": 4920, + "time_per_iteration": 2.5283315181732178 + }, + { + "auxiliary_loss_clip": 0.06505087, + "auxiliary_loss_mlp": 0.01280613, + "balance_loss_clip": 0.06298134, + "balance_loss_mlp": 0.01262886, + "epoch": 0.2958665263790771, + "flos": 23951427137280.0, + "grad_norm": 2.1683803944953786, + "language_loss": 0.69314063, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.77099764, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17724609, + "step": 4921, + "time_per_iteration": 3.9904286861419678 + }, + { + "auxiliary_loss_clip": 0.06507339, + "auxiliary_loss_mlp": 0.01279047, + "balance_loss_clip": 0.06297763, + "balance_loss_mlp": 0.01261023, + "epoch": 0.2959266496317451, + "flos": 18484279787520.0, + "grad_norm": 1.8551497184563221, + "language_loss": 0.75478184, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.83264565, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18017578, + "step": 4922, + "time_per_iteration": 2.5025644302368164 + }, + { + "auxiliary_loss_clip": 0.06508595, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06298192, + "balance_loss_mlp": 0.01258051, + "epoch": 0.29598677288441305, + "flos": 25964525479680.0, + "grad_norm": 1.7877276864194063, + "language_loss": 0.77317607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85103309, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19067383, + "step": 4923, + "time_per_iteration": 2.57328462600708 + }, + { + "auxiliary_loss_clip": 0.06498858, + "auxiliary_loss_mlp": 0.01279587, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01262016, + "epoch": 0.296046896137081, + "flos": 25163101503360.0, + "grad_norm": 2.2992847921393174, + "language_loss": 0.8687042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94648862, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17565918, + "step": 4924, + "time_per_iteration": 2.569819450378418 + }, + { + "auxiliary_loss_clip": 0.06495638, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 0.06293976, + "balance_loss_mlp": 0.01256891, + "epoch": 0.296107019389749, + "flos": 21767402465280.0, + "grad_norm": 1.4490170840920502, + "language_loss": 0.823627, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.90132689, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17468262, + "step": 4925, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.06496158, + "auxiliary_loss_mlp": 0.01278426, + "balance_loss_clip": 0.06294197, + "balance_loss_mlp": 0.01261415, + "epoch": 0.29616714264241695, + "flos": 17964555638400.0, + "grad_norm": 3.115838377994743, + "language_loss": 0.87332439, + "learning_rate": 3.301729463727452e-06, + "loss": 0.95107025, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17016602, + "step": 4926, + "time_per_iteration": 2.480851411819458 + }, + { + "auxiliary_loss_clip": 0.06502646, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06295682, + "balance_loss_mlp": 0.0125995, + "epoch": 0.2962272658950849, + "flos": 15018155792640.0, + "grad_norm": 2.5897634799766296, + "language_loss": 0.86097062, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.93876898, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17236328, + "step": 4927, + "time_per_iteration": 2.524277687072754 + }, + { + "auxiliary_loss_clip": 0.06496821, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06295302, + "balance_loss_mlp": 0.01256545, + "epoch": 0.2962873891477529, + "flos": 14726183840640.0, + "grad_norm": 1.628327768422068, + "language_loss": 0.80864251, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.88634396, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16772461, + "step": 4928, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06510531, + "auxiliary_loss_mlp": 0.01280378, + "balance_loss_clip": 0.0629655, + "balance_loss_mlp": 0.012609, + "epoch": 0.29634751240042084, + "flos": 26730967576320.0, + "grad_norm": 3.186979474193142, + "language_loss": 0.72557974, + "learning_rate": 3.300842211064773e-06, + "loss": 0.80348885, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19482422, + "step": 4929, + "time_per_iteration": 2.5845630168914795 + }, + { + "auxiliary_loss_clip": 0.06503193, + "auxiliary_loss_mlp": 0.01287506, + "balance_loss_clip": 0.06293295, + "balance_loss_mlp": 0.01268456, + "epoch": 0.2964076356530888, + "flos": 14575984197120.0, + "grad_norm": 2.811052251549286, + "language_loss": 0.73200721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.80991417, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19042969, + "step": 4930, + "time_per_iteration": 2.488785982131958 + }, + { + "auxiliary_loss_clip": 0.06387739, + "auxiliary_loss_mlp": 0.01269345, + "balance_loss_clip": 0.06290003, + "balance_loss_mlp": 0.0126519, + "epoch": 0.29646775890575683, + "flos": 63124387925760.0, + "grad_norm": 0.773484435694784, + "language_loss": 0.60626972, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.68284053, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.04156494, + "step": 4931, + "time_per_iteration": 3.1399567127227783 + }, + { + "auxiliary_loss_clip": 0.06390411, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06293079, + "balance_loss_mlp": 0.0126054, + "epoch": 0.2965278821584248, + "flos": 63087728964480.0, + "grad_norm": 0.7260178151779769, + "language_loss": 0.52335358, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.59990156, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.03839111, + "step": 4932, + "time_per_iteration": 3.0242393016815186 + }, + { + "auxiliary_loss_clip": 0.06496995, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06294326, + "balance_loss_mlp": 0.01260368, + "epoch": 0.29658800541109276, + "flos": 23775469562880.0, + "grad_norm": 1.6744964780290639, + "language_loss": 0.82042706, + "learning_rate": 3.299658516973972e-06, + "loss": 0.89817077, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17028809, + "step": 4933, + "time_per_iteration": 2.5955240726470947 + }, + { + "auxiliary_loss_clip": 0.06493178, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06293809, + "balance_loss_mlp": 0.01256377, + "epoch": 0.2966481286637607, + "flos": 23995465257600.0, + "grad_norm": 1.8381459517159284, + "language_loss": 0.75639498, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83405566, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.16503906, + "step": 4934, + "time_per_iteration": 2.5714681148529053 + }, + { + "auxiliary_loss_clip": 0.06508597, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06299804, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2967082519164287, + "flos": 17170846237440.0, + "grad_norm": 1.723450067314057, + "language_loss": 0.63127494, + "learning_rate": 3.299066374184594e-06, + "loss": 0.70916504, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.18713379, + "step": 4935, + "time_per_iteration": 2.513557195663452 + }, + { + "auxiliary_loss_clip": 0.06500618, + "auxiliary_loss_mlp": 0.01281806, + "balance_loss_clip": 0.06298316, + "balance_loss_mlp": 0.01263424, + "epoch": 0.29676837516909665, + "flos": 29395416032640.0, + "grad_norm": 1.6887254989691298, + "language_loss": 0.80239189, + "learning_rate": 3.2987702288932e-06, + "loss": 0.88021612, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.18383789, + "step": 4936, + "time_per_iteration": 2.6222426891326904 + }, + { + "auxiliary_loss_clip": 0.06510909, + "auxiliary_loss_mlp": 0.0128109, + "balance_loss_clip": 0.06301413, + "balance_loss_mlp": 0.01261444, + "epoch": 0.2968284984217646, + "flos": 34759839876480.0, + "grad_norm": 1.4826285887608224, + "language_loss": 0.74831104, + "learning_rate": 3.298474034352309e-06, + "loss": 0.826231, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19665527, + "step": 4937, + "time_per_iteration": 2.7231242656707764 + }, + { + "auxiliary_loss_clip": 0.06501779, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06297591, + "balance_loss_mlp": 0.01256768, + "epoch": 0.2968886216744326, + "flos": 21550635152640.0, + "grad_norm": 1.507706154697653, + "language_loss": 0.78372371, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.86148536, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17614746, + "step": 4938, + "time_per_iteration": 2.564958095550537 + }, + { + "auxiliary_loss_clip": 0.06506119, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.06296918, + "balance_loss_mlp": 0.01260643, + "epoch": 0.29694874492710055, + "flos": 12792357060480.0, + "grad_norm": 3.019574533594622, + "language_loss": 0.76788878, + "learning_rate": 3.297881497566964e-06, + "loss": 0.84574002, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18359375, + "step": 4939, + "time_per_iteration": 2.514143943786621 + }, + { + "auxiliary_loss_clip": 0.06509334, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06296703, + "balance_loss_mlp": 0.01254259, + "epoch": 0.2970088681797685, + "flos": 24576600049920.0, + "grad_norm": 1.687046897883716, + "language_loss": 0.78335512, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86116844, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17736816, + "step": 4940, + "time_per_iteration": 2.570279359817505 + }, + { + "auxiliary_loss_clip": 0.06508817, + "auxiliary_loss_mlp": 0.01275865, + "balance_loss_clip": 0.06300067, + "balance_loss_mlp": 0.01257113, + "epoch": 0.2970689914324365, + "flos": 23665870022400.0, + "grad_norm": 1.5281741947741105, + "language_loss": 0.75415564, + "learning_rate": 3.297288763918435e-06, + "loss": 0.8320024, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.1875, + "step": 4941, + "time_per_iteration": 2.549976348876953 + }, + { + "auxiliary_loss_clip": 0.06509985, + "auxiliary_loss_mlp": 0.01274098, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01254667, + "epoch": 0.29712911468510445, + "flos": 39678654107520.0, + "grad_norm": 2.245999939669129, + "language_loss": 0.74959898, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.82743979, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19445801, + "step": 4942, + "time_per_iteration": 2.7199416160583496 + }, + { + "auxiliary_loss_clip": 0.0651295, + "auxiliary_loss_mlp": 0.01282177, + "balance_loss_clip": 0.06299168, + "balance_loss_mlp": 0.01261744, + "epoch": 0.2971892379377724, + "flos": 26402420517120.0, + "grad_norm": 1.727137408051059, + "language_loss": 0.70931113, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.78726244, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2043457, + "step": 4943, + "time_per_iteration": 2.5410006046295166 + }, + { + "auxiliary_loss_clip": 0.06508674, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06296329, + "balance_loss_mlp": 0.01255599, + "epoch": 0.2972493611904404, + "flos": 17608992837120.0, + "grad_norm": 2.280832061666768, + "language_loss": 0.8012532, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.87908292, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.18725586, + "step": 4944, + "time_per_iteration": 2.5628697872161865 + }, + { + "auxiliary_loss_clip": 0.06495067, + "auxiliary_loss_mlp": 0.01272551, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01255194, + "epoch": 0.2973094844431084, + "flos": 20419070889600.0, + "grad_norm": 2.0196449856406704, + "language_loss": 0.83490258, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.91257876, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17346191, + "step": 4945, + "time_per_iteration": 2.5184381008148193 + }, + { + "auxiliary_loss_clip": 0.06494735, + "auxiliary_loss_mlp": 0.01274271, + "balance_loss_clip": 0.0629338, + "balance_loss_mlp": 0.01257081, + "epoch": 0.29736960769577636, + "flos": 17499225588480.0, + "grad_norm": 1.8481246337269472, + "language_loss": 0.67665654, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.75434661, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.171875, + "step": 4946, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.06500807, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06294695, + "balance_loss_mlp": 0.01255462, + "epoch": 0.2974297309484443, + "flos": 26111119397760.0, + "grad_norm": 1.9041348906467674, + "language_loss": 0.74493206, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82266927, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17443848, + "step": 4947, + "time_per_iteration": 2.55096435546875 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.01255396, + "epoch": 0.2974898542011123, + "flos": 25673559776640.0, + "grad_norm": 5.5840313105791894, + "language_loss": 0.73332673, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.81115007, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18896484, + "step": 4948, + "time_per_iteration": 2.604213237762451 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06292598, + "balance_loss_mlp": 0.01258687, + "epoch": 0.29754997745378026, + "flos": 18667323031680.0, + "grad_norm": 1.916403484704169, + "language_loss": 0.84057009, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91826856, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.1661377, + "step": 4949, + "time_per_iteration": 2.4725756645202637 + }, + { + "auxiliary_loss_clip": 0.06495193, + "auxiliary_loss_mlp": 0.01276752, + "balance_loss_clip": 0.06291104, + "balance_loss_mlp": 0.01258692, + "epoch": 0.2976101007064482, + "flos": 22281382609920.0, + "grad_norm": 2.0864257908602464, + "language_loss": 0.71227181, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.78999126, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18078613, + "step": 4950, + "time_per_iteration": 2.5644164085388184 + }, + { + "auxiliary_loss_clip": 0.06486266, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06290439, + "balance_loss_mlp": 0.01256308, + "epoch": 0.2976702239591162, + "flos": 21952290499200.0, + "grad_norm": 2.1576156011429597, + "language_loss": 0.83112931, + "learning_rate": 3.294322145875789e-06, + "loss": 0.9087199, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.16467285, + "step": 4951, + "time_per_iteration": 2.5149009227752686 + }, + { + "auxiliary_loss_clip": 0.06493516, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 0.06287138, + "balance_loss_mlp": 0.01257248, + "epoch": 0.29773034721178415, + "flos": 24642874229760.0, + "grad_norm": 2.538162384222029, + "language_loss": 0.73777694, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.81545866, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.17407227, + "step": 4952, + "time_per_iteration": 3.9977774620056152 + }, + { + "auxiliary_loss_clip": 0.06494328, + "auxiliary_loss_mlp": 0.01279914, + "balance_loss_clip": 0.06291338, + "balance_loss_mlp": 0.01261472, + "epoch": 0.2977904704644521, + "flos": 20563694236800.0, + "grad_norm": 1.830993802630573, + "language_loss": 0.8420608, + "learning_rate": 3.293728232937228e-06, + "loss": 0.91980314, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.18444824, + "step": 4953, + "time_per_iteration": 2.556278944015503 + }, + { + "auxiliary_loss_clip": 0.0649702, + "auxiliary_loss_mlp": 0.01271138, + "balance_loss_clip": 0.06289494, + "balance_loss_mlp": 0.01254246, + "epoch": 0.2978505937171201, + "flos": 18922426387200.0, + "grad_norm": 2.0824874332629113, + "language_loss": 0.74276727, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.82044888, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.16894531, + "step": 4954, + "time_per_iteration": 3.9108667373657227 + }, + { + "auxiliary_loss_clip": 0.06489201, + "auxiliary_loss_mlp": 0.01275174, + "balance_loss_clip": 0.06286507, + "balance_loss_mlp": 0.01259164, + "epoch": 0.29791071696978805, + "flos": 19323788244480.0, + "grad_norm": 1.865430683209025, + "language_loss": 0.75582623, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83346999, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.16003418, + "step": 4955, + "time_per_iteration": 4.034101724624634 + }, + { + "auxiliary_loss_clip": 0.06493168, + "auxiliary_loss_mlp": 0.01273359, + "balance_loss_clip": 0.06285557, + "balance_loss_mlp": 0.0125593, + "epoch": 0.297970840222456, + "flos": 18812742992640.0, + "grad_norm": 1.8893942834003292, + "language_loss": 0.72569048, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80335575, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17419434, + "step": 4956, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01272155, + "balance_loss_clip": 0.06287451, + "balance_loss_mlp": 0.01253141, + "epoch": 0.298030963475124, + "flos": 22858702041600.0, + "grad_norm": 1.7093127439145954, + "language_loss": 0.79588521, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.87359571, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19006348, + "step": 4957, + "time_per_iteration": 2.5350780487060547 + }, + { + "auxiliary_loss_clip": 0.0648672, + "auxiliary_loss_mlp": 0.01278155, + "balance_loss_clip": 0.06281397, + "balance_loss_mlp": 0.01261084, + "epoch": 0.298091086727792, + "flos": 21874402529280.0, + "grad_norm": 1.5033412482034976, + "language_loss": 0.70601791, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.78366661, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.17077637, + "step": 4958, + "time_per_iteration": 2.52998948097229 + }, + { + "auxiliary_loss_clip": 0.06484255, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06283475, + "balance_loss_mlp": 0.01256954, + "epoch": 0.29815120998045996, + "flos": 21180775230720.0, + "grad_norm": 1.4471916983062794, + "language_loss": 0.78955591, + "learning_rate": 3.291945317082743e-06, + "loss": 0.86715317, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18505859, + "step": 4959, + "time_per_iteration": 2.5247116088867188 + }, + { + "auxiliary_loss_clip": 0.06484501, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_clip": 0.06281502, + "balance_loss_mlp": 0.01258183, + "epoch": 0.29821133323312793, + "flos": 19901526946560.0, + "grad_norm": 1.8097637226237389, + "language_loss": 0.79637736, + "learning_rate": 3.291647992907147e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17578125, + "step": 4960, + "time_per_iteration": 2.544517755508423 + }, + { + "auxiliary_loss_clip": 0.06493803, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06284714, + "balance_loss_mlp": 0.01254483, + "epoch": 0.2982714564857959, + "flos": 12755781953280.0, + "grad_norm": 2.226713674353186, + "language_loss": 0.74493575, + "learning_rate": 3.291350619752129e-06, + "loss": 0.82260078, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.18225098, + "step": 4961, + "time_per_iteration": 3.9662065505981445 + }, + { + "auxiliary_loss_clip": 0.06486452, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.062804, + "balance_loss_mlp": 0.01256756, + "epoch": 0.29833157973846386, + "flos": 22278238081920.0, + "grad_norm": 2.8000667311611167, + "language_loss": 0.62968349, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70729387, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.17810059, + "step": 4962, + "time_per_iteration": 2.533984661102295 + }, + { + "auxiliary_loss_clip": 0.06485053, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06281514, + "balance_loss_mlp": 0.01259596, + "epoch": 0.2983917029911318, + "flos": 15377659735680.0, + "grad_norm": 1.6706058401186525, + "language_loss": 0.83686638, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.91448379, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17102051, + "step": 4963, + "time_per_iteration": 2.524486780166626 + }, + { + "auxiliary_loss_clip": 0.0648464, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 0.06283776, + "balance_loss_mlp": 0.01259572, + "epoch": 0.2984518262437998, + "flos": 15383068323840.0, + "grad_norm": 2.213795741630968, + "language_loss": 0.66932309, + "learning_rate": 3.290458206523322e-06, + "loss": 0.74693739, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17224121, + "step": 4964, + "time_per_iteration": 2.5100491046905518 + }, + { + "auxiliary_loss_clip": 0.06485043, + "auxiliary_loss_mlp": 0.01273472, + "balance_loss_clip": 0.06283367, + "balance_loss_mlp": 0.01257701, + "epoch": 0.29851194949646775, + "flos": 18113413616640.0, + "grad_norm": 1.8232440195867097, + "language_loss": 0.72163451, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79921961, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15771484, + "step": 4965, + "time_per_iteration": 2.5180373191833496 + }, + { + "auxiliary_loss_clip": 0.06490128, + "auxiliary_loss_mlp": 0.01278877, + "balance_loss_clip": 0.06284484, + "balance_loss_mlp": 0.01261139, + "epoch": 0.2985720727491357, + "flos": 22024811808000.0, + "grad_norm": 1.7919900337102326, + "language_loss": 0.66928089, + "learning_rate": 3.289863019680461e-06, + "loss": 0.74697095, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17724609, + "step": 4966, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06492805, + "auxiliary_loss_mlp": 0.01279859, + "balance_loss_clip": 0.06288783, + "balance_loss_mlp": 0.01262026, + "epoch": 0.2986321960018037, + "flos": 13046202604800.0, + "grad_norm": 2.9983208236286862, + "language_loss": 0.74761832, + "learning_rate": 3.289565352885785e-06, + "loss": 0.82534492, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17822266, + "step": 4967, + "time_per_iteration": 2.5119001865386963 + }, + { + "auxiliary_loss_clip": 0.06492577, + "auxiliary_loss_mlp": 0.01276602, + "balance_loss_clip": 0.06288804, + "balance_loss_mlp": 0.01260294, + "epoch": 0.29869231925447165, + "flos": 14470241944320.0, + "grad_norm": 1.9901449284839132, + "language_loss": 0.72232509, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80001682, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16308594, + "step": 4968, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.06497695, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 0.06290321, + "balance_loss_mlp": 0.01261007, + "epoch": 0.2987524425071396, + "flos": 31658376850560.0, + "grad_norm": 1.780098836704026, + "language_loss": 0.76775402, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84551913, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.17810059, + "step": 4969, + "time_per_iteration": 2.677133321762085 + }, + { + "auxiliary_loss_clip": 0.0649517, + "auxiliary_loss_mlp": 0.01279823, + "balance_loss_clip": 0.06290856, + "balance_loss_mlp": 0.0126355, + "epoch": 0.2988125657598076, + "flos": 21439735873920.0, + "grad_norm": 1.6530964666677603, + "language_loss": 0.702811, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.78056097, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.16271973, + "step": 4970, + "time_per_iteration": 2.542041301727295 + }, + { + "auxiliary_loss_clip": 0.06501894, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06289935, + "balance_loss_mlp": 0.01260336, + "epoch": 0.2988726890124756, + "flos": 18082750222080.0, + "grad_norm": 2.836679638175962, + "language_loss": 0.84790057, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.92571044, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.18737793, + "step": 4971, + "time_per_iteration": 2.5460052490234375 + }, + { + "auxiliary_loss_clip": 0.06490934, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06292243, + "balance_loss_mlp": 0.01257691, + "epoch": 0.29893281226514357, + "flos": 21760987628160.0, + "grad_norm": 1.7104631490326472, + "language_loss": 0.79530191, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16314697, + "step": 4972, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.0650093, + "auxiliary_loss_mlp": 0.01282709, + "balance_loss_clip": 0.06297094, + "balance_loss_mlp": 0.01266234, + "epoch": 0.29899293551781153, + "flos": 16842341105280.0, + "grad_norm": 1.7682293865220609, + "language_loss": 0.85643351, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93426991, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16467285, + "step": 4973, + "time_per_iteration": 2.546552896499634 + }, + { + "auxiliary_loss_clip": 0.06486042, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291717, + "balance_loss_mlp": 0.01263539, + "epoch": 0.2990530587704795, + "flos": 11734068792960.0, + "grad_norm": 1.5403026658154284, + "language_loss": 0.78163445, + "learning_rate": 3.287480316742863e-06, + "loss": 0.85930026, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17004395, + "step": 4974, + "time_per_iteration": 2.519416093826294 + }, + { + "auxiliary_loss_clip": 0.06492939, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06288281, + "balance_loss_mlp": 0.01257001, + "epoch": 0.29911318202314746, + "flos": 28047713362560.0, + "grad_norm": 1.767842246111843, + "language_loss": 0.73036933, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80804002, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17126465, + "step": 4975, + "time_per_iteration": 2.6099252700805664 + }, + { + "auxiliary_loss_clip": 0.0649198, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 0.06288506, + "balance_loss_mlp": 0.0126163, + "epoch": 0.2991733052758154, + "flos": 18739425070080.0, + "grad_norm": 3.7568061887968374, + "language_loss": 0.76564699, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84335011, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16711426, + "step": 4976, + "time_per_iteration": 2.4865057468414307 + }, + { + "auxiliary_loss_clip": 0.0649081, + "auxiliary_loss_mlp": 0.01274025, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257574, + "epoch": 0.2992334285284834, + "flos": 15564476413440.0, + "grad_norm": 2.0027584051633256, + "language_loss": 0.86547983, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.94312823, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 4977, + "time_per_iteration": 2.5564377307891846 + }, + { + "auxiliary_loss_clip": 0.06492308, + "auxiliary_loss_mlp": 0.01273791, + "balance_loss_clip": 0.06289831, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29929355178115136, + "flos": 21803809864320.0, + "grad_norm": 1.498415139231663, + "language_loss": 0.69035208, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.76801312, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.16943359, + "step": 4978, + "time_per_iteration": 2.519927978515625 + }, + { + "auxiliary_loss_clip": 0.06498158, + "auxiliary_loss_mlp": 0.01273756, + "balance_loss_clip": 0.06295491, + "balance_loss_mlp": 0.0125634, + "epoch": 0.2993536750338193, + "flos": 21184884080640.0, + "grad_norm": 2.2981139003330924, + "language_loss": 0.76821494, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.84593409, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17407227, + "step": 4979, + "time_per_iteration": 2.5783658027648926 + }, + { + "auxiliary_loss_clip": 0.06495501, + "auxiliary_loss_mlp": 0.01275001, + "balance_loss_clip": 0.06288472, + "balance_loss_mlp": 0.0125762, + "epoch": 0.2994137982864873, + "flos": 32129954029440.0, + "grad_norm": 1.9038495469030372, + "language_loss": 0.69286489, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77056986, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17382812, + "step": 4980, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.06490306, + "auxiliary_loss_mlp": 0.01274236, + "balance_loss_clip": 0.06288646, + "balance_loss_mlp": 0.01257177, + "epoch": 0.29947392153915525, + "flos": 21111733866240.0, + "grad_norm": 1.7308746684442236, + "language_loss": 0.74001658, + "learning_rate": 3.285392888352555e-06, + "loss": 0.817662, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17053223, + "step": 4981, + "time_per_iteration": 2.580580711364746 + }, + { + "auxiliary_loss_clip": 0.06490904, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.0125635, + "epoch": 0.2995340447918232, + "flos": 21548916144000.0, + "grad_norm": 1.9422940804684126, + "language_loss": 0.86877131, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.94642013, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17626953, + "step": 4982, + "time_per_iteration": 2.4962990283966064 + }, + { + "auxiliary_loss_clip": 0.06497963, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06287588, + "balance_loss_mlp": 0.01257241, + "epoch": 0.2995941680444912, + "flos": 16730393650560.0, + "grad_norm": 2.5640920256819886, + "language_loss": 0.87797368, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95569938, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17382812, + "step": 4983, + "time_per_iteration": 2.5295448303222656 + }, + { + "auxiliary_loss_clip": 0.0649021, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 0.06287163, + "balance_loss_mlp": 0.012569, + "epoch": 0.2996542912971592, + "flos": 20929864579200.0, + "grad_norm": 2.1931631477553943, + "language_loss": 0.78985476, + "learning_rate": 3.284497544825668e-06, + "loss": 0.86749053, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16467285, + "step": 4984, + "time_per_iteration": 2.510861873626709 + }, + { + "auxiliary_loss_clip": 0.06490169, + "auxiliary_loss_mlp": 0.01276988, + "balance_loss_clip": 0.06284384, + "balance_loss_mlp": 0.01259702, + "epoch": 0.29971441454982717, + "flos": 25086429417600.0, + "grad_norm": 1.6549542244227224, + "language_loss": 0.78558743, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86325896, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17285156, + "step": 4985, + "time_per_iteration": 2.6011219024658203 + }, + { + "auxiliary_loss_clip": 0.06501257, + "auxiliary_loss_mlp": 0.01278562, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.0125968, + "epoch": 0.29977453780249513, + "flos": 52567445617920.0, + "grad_norm": 2.1128232330624757, + "language_loss": 0.71929544, + "learning_rate": 3.283900405580837e-06, + "loss": 0.79709363, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1887207, + "step": 4986, + "time_per_iteration": 2.8261890411376953 + }, + { + "auxiliary_loss_clip": 0.06496918, + "auxiliary_loss_mlp": 0.01277715, + "balance_loss_clip": 0.06288348, + "balance_loss_mlp": 0.0125981, + "epoch": 0.2998346610551631, + "flos": 22243759326720.0, + "grad_norm": 2.0495005677193703, + "language_loss": 0.73353851, + "learning_rate": 3.283601762924312e-06, + "loss": 0.81128478, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17907715, + "step": 4987, + "time_per_iteration": 2.5969009399414062 + }, + { + "auxiliary_loss_clip": 0.06487568, + "auxiliary_loss_mlp": 0.01277048, + "balance_loss_clip": 0.06283796, + "balance_loss_mlp": 0.01260561, + "epoch": 0.29989478430783106, + "flos": 16878832358400.0, + "grad_norm": 1.677350703029162, + "language_loss": 0.80982405, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88747025, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16479492, + "step": 4988, + "time_per_iteration": 2.4802756309509277 + }, + { + "auxiliary_loss_clip": 0.06489251, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06285515, + "balance_loss_mlp": 0.0125759, + "epoch": 0.29995490756049903, + "flos": 23775637271040.0, + "grad_norm": 1.830625198484136, + "language_loss": 0.7097913, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7874254, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16577148, + "step": 4989, + "time_per_iteration": 2.5968902111053467 + }, + { + "auxiliary_loss_clip": 0.06498987, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01264948, + "epoch": 0.300015030813167, + "flos": 14470577360640.0, + "grad_norm": 2.8004651200920576, + "language_loss": 0.85787904, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93570256, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18408203, + "step": 4990, + "time_per_iteration": 2.4837355613708496 + }, + { + "auxiliary_loss_clip": 0.06499861, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06287368, + "balance_loss_mlp": 0.01260204, + "epoch": 0.30007515406583496, + "flos": 25199005777920.0, + "grad_norm": 1.6608247288012334, + "language_loss": 0.67339301, + "learning_rate": 3.28240670566841e-06, + "loss": 0.75117278, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17919922, + "step": 4991, + "time_per_iteration": 4.060553312301636 + }, + { + "auxiliary_loss_clip": 0.0649571, + "auxiliary_loss_mlp": 0.01277369, + "balance_loss_clip": 0.06284688, + "balance_loss_mlp": 0.01259022, + "epoch": 0.3001352773185029, + "flos": 19397315802240.0, + "grad_norm": 1.7545259775845383, + "language_loss": 0.79479051, + "learning_rate": 3.28210781975363e-06, + "loss": 0.87252128, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18347168, + "step": 4992, + "time_per_iteration": 2.5394246578216553 + }, + { + "auxiliary_loss_clip": 0.06496455, + "auxiliary_loss_mlp": 0.01272727, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01255061, + "epoch": 0.3001954005711709, + "flos": 21550341663360.0, + "grad_norm": 1.8174225064451806, + "language_loss": 0.83191693, + "learning_rate": 3.281808885221193e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17675781, + "step": 4993, + "time_per_iteration": 2.536900520324707 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.0127659, + "balance_loss_clip": 0.06290129, + "balance_loss_mlp": 0.01257051, + "epoch": 0.30025552382383885, + "flos": 17390087245440.0, + "grad_norm": 2.3964724385856955, + "language_loss": 0.8713994, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.94919133, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.1953125, + "step": 4994, + "time_per_iteration": 5.451568603515625 + }, + { + "auxiliary_loss_clip": 0.06500117, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06293428, + "balance_loss_mlp": 0.01255696, + "epoch": 0.3003156470765068, + "flos": 29541003701760.0, + "grad_norm": 1.492375768993242, + "language_loss": 0.81277597, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.89050424, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17016602, + "step": 4995, + "time_per_iteration": 2.6498701572418213 + }, + { + "auxiliary_loss_clip": 0.06495272, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01257818, + "epoch": 0.3003757703291748, + "flos": 43655278302720.0, + "grad_norm": 1.561088997277918, + "language_loss": 0.67591625, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.75363255, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.18530273, + "step": 4996, + "time_per_iteration": 2.6940386295318604 + }, + { + "auxiliary_loss_clip": 0.06490915, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 0.06287466, + "balance_loss_mlp": 0.0125985, + "epoch": 0.30043589358184275, + "flos": 22534934664960.0, + "grad_norm": 1.8202769971321224, + "language_loss": 0.76585484, + "learning_rate": 3.280612661141615e-06, + "loss": 0.84354323, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18054199, + "step": 4997, + "time_per_iteration": 2.551025629043579 + }, + { + "auxiliary_loss_clip": 0.06488951, + "auxiliary_loss_mlp": 0.01282226, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01264785, + "epoch": 0.30049601683451077, + "flos": 21002176252800.0, + "grad_norm": 1.7136041248753544, + "language_loss": 0.78929758, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.86700928, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17443848, + "step": 4998, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.06495959, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06296599, + "balance_loss_mlp": 0.0126104, + "epoch": 0.30055614008717874, + "flos": 23922985875840.0, + "grad_norm": 1.6408959445510187, + "language_loss": 0.73985869, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81759465, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.16589355, + "step": 4999, + "time_per_iteration": 2.565272331237793 + }, + { + "auxiliary_loss_clip": 0.06497648, + "auxiliary_loss_mlp": 0.01276599, + "balance_loss_clip": 0.06290608, + "balance_loss_mlp": 0.01258837, + "epoch": 0.3006162633398467, + "flos": 19175475317760.0, + "grad_norm": 1.6585129963537202, + "language_loss": 0.76246512, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84020758, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.1776123, + "step": 5000, + "time_per_iteration": 3.978001117706299 + }, + { + "auxiliary_loss_clip": 0.06488875, + "auxiliary_loss_mlp": 0.01280464, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.0126244, + "epoch": 0.30067638659251467, + "flos": 14683697020800.0, + "grad_norm": 1.838860389970219, + "language_loss": 0.81972182, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.89741528, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.18041992, + "step": 5001, + "time_per_iteration": 2.4995031356811523 + }, + { + "auxiliary_loss_clip": 0.06495227, + "auxiliary_loss_mlp": 0.01279132, + "balance_loss_clip": 0.06291329, + "balance_loss_mlp": 0.01261322, + "epoch": 0.30073650984518263, + "flos": 23374778538240.0, + "grad_norm": 1.6002838962292127, + "language_loss": 0.81160742, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.88935101, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17797852, + "step": 5002, + "time_per_iteration": 2.549882650375366 + }, + { + "auxiliary_loss_clip": 0.06502556, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01255728, + "epoch": 0.3007966330978506, + "flos": 22973332826880.0, + "grad_norm": 1.7018817575326768, + "language_loss": 0.71524274, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.79300046, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17504883, + "step": 5003, + "time_per_iteration": 2.537760019302368 + }, + { + "auxiliary_loss_clip": 0.06502316, + "auxiliary_loss_mlp": 0.01275597, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01257441, + "epoch": 0.30085675635051856, + "flos": 27825830951040.0, + "grad_norm": 1.9954765529899763, + "language_loss": 0.706792, + "learning_rate": 3.27851739984233e-06, + "loss": 0.78457117, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18151855, + "step": 5004, + "time_per_iteration": 2.6357674598693848 + }, + { + "auxiliary_loss_clip": 0.06504735, + "auxiliary_loss_mlp": 0.01282861, + "balance_loss_clip": 0.06296123, + "balance_loss_mlp": 0.01263513, + "epoch": 0.3009168796031865, + "flos": 10886216855040.0, + "grad_norm": 2.7451882694975662, + "language_loss": 0.81914413, + "learning_rate": 3.278217882782715e-06, + "loss": 0.89702016, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19335938, + "step": 5005, + "time_per_iteration": 2.4386463165283203 + }, + { + "auxiliary_loss_clip": 0.06497307, + "auxiliary_loss_mlp": 0.01278667, + "balance_loss_clip": 0.06293161, + "balance_loss_mlp": 0.01261179, + "epoch": 0.3009770028558545, + "flos": 23812170451200.0, + "grad_norm": 3.689468326241579, + "language_loss": 0.74513727, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.82289702, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17492676, + "step": 5006, + "time_per_iteration": 2.6309902667999268 + }, + { + "auxiliary_loss_clip": 0.06490835, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06288077, + "balance_loss_mlp": 0.01255247, + "epoch": 0.30103712610852246, + "flos": 26475319169280.0, + "grad_norm": 1.9837745378518294, + "language_loss": 0.71514297, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.79279143, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.18762207, + "step": 5007, + "time_per_iteration": 2.5425140857696533 + }, + { + "auxiliary_loss_clip": 0.06499007, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01258961, + "epoch": 0.3010972493611904, + "flos": 22863020526720.0, + "grad_norm": 2.135948160193648, + "language_loss": 0.76715112, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.84491682, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18579102, + "step": 5008, + "time_per_iteration": 2.560136556625366 + }, + { + "auxiliary_loss_clip": 0.06498778, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.06291865, + "balance_loss_mlp": 0.01258959, + "epoch": 0.3011573726138584, + "flos": 24059307669120.0, + "grad_norm": 1.8647165617813573, + "language_loss": 0.85181898, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.92957842, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18200684, + "step": 5009, + "time_per_iteration": 2.5235841274261475 + }, + { + "auxiliary_loss_clip": 0.06506295, + "auxiliary_loss_mlp": 0.01281474, + "balance_loss_clip": 0.06291408, + "balance_loss_mlp": 0.0126041, + "epoch": 0.30121749586652635, + "flos": 20264762396160.0, + "grad_norm": 1.8315766872525614, + "language_loss": 0.84202898, + "learning_rate": 3.276719570659604e-06, + "loss": 0.91990662, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21069336, + "step": 5010, + "time_per_iteration": 2.5768747329711914 + }, + { + "auxiliary_loss_clip": 0.06499103, + "auxiliary_loss_mlp": 0.01276454, + "balance_loss_clip": 0.06292678, + "balance_loss_mlp": 0.01258728, + "epoch": 0.3012776191191944, + "flos": 26950334365440.0, + "grad_norm": 2.3479091749479593, + "language_loss": 0.85299456, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.93075019, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17724609, + "step": 5011, + "time_per_iteration": 2.5496773719787598 + }, + { + "auxiliary_loss_clip": 0.06498772, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06287067, + "balance_loss_mlp": 0.01258472, + "epoch": 0.30133774237186234, + "flos": 20418525838080.0, + "grad_norm": 2.2969937551574615, + "language_loss": 0.73043567, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18017578, + "step": 5012, + "time_per_iteration": 2.5352632999420166 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.01275987, + "balance_loss_clip": 0.06294451, + "balance_loss_mlp": 0.01257581, + "epoch": 0.3013978656245303, + "flos": 19798635732480.0, + "grad_norm": 2.0714365992737247, + "language_loss": 0.88282806, + "learning_rate": 3.275820002334819e-06, + "loss": 0.96061397, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.1842041, + "step": 5013, + "time_per_iteration": 2.5217273235321045 + }, + { + "auxiliary_loss_clip": 0.06510235, + "auxiliary_loss_mlp": 0.01281959, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01261956, + "epoch": 0.30145798887719827, + "flos": 16254623767680.0, + "grad_norm": 2.0397198762739253, + "language_loss": 0.8413021, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.91922402, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 5014, + "time_per_iteration": 2.543929100036621 + }, + { + "auxiliary_loss_clip": 0.06496109, + "auxiliary_loss_mlp": 0.01278136, + "balance_loss_clip": 0.06295025, + "balance_loss_mlp": 0.01260934, + "epoch": 0.30151811212986623, + "flos": 24578654474880.0, + "grad_norm": 1.6793816963153507, + "language_loss": 0.68929201, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76703441, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17199707, + "step": 5015, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.06498226, + "auxiliary_loss_mlp": 0.01282599, + "balance_loss_clip": 0.06293575, + "balance_loss_mlp": 0.01262989, + "epoch": 0.3015782353825342, + "flos": 21878595233280.0, + "grad_norm": 2.19954780338382, + "language_loss": 0.75070626, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.82851446, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.19604492, + "step": 5016, + "time_per_iteration": 2.6430094242095947 + }, + { + "auxiliary_loss_clip": 0.06498955, + "auxiliary_loss_mlp": 0.01278069, + "balance_loss_clip": 0.06290609, + "balance_loss_mlp": 0.01260009, + "epoch": 0.30163835863520216, + "flos": 28777244935680.0, + "grad_norm": 1.487936670829871, + "language_loss": 0.657938, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.73570824, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18041992, + "step": 5017, + "time_per_iteration": 2.62882661819458 + }, + { + "auxiliary_loss_clip": 0.06504996, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06297189, + "balance_loss_mlp": 0.01258019, + "epoch": 0.30169848188787013, + "flos": 22972829702400.0, + "grad_norm": 1.7163502989136974, + "language_loss": 0.68538272, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76318979, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17675781, + "step": 5018, + "time_per_iteration": 2.5743629932403564 + }, + { + "auxiliary_loss_clip": 0.06490742, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01263102, + "epoch": 0.3017586051405381, + "flos": 21841726636800.0, + "grad_norm": 1.8632302123292983, + "language_loss": 0.79424834, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.87196445, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 5019, + "time_per_iteration": 2.490190029144287 + }, + { + "auxiliary_loss_clip": 0.06497257, + "auxiliary_loss_mlp": 0.01272585, + "balance_loss_clip": 0.06291286, + "balance_loss_mlp": 0.01255932, + "epoch": 0.30181872839320606, + "flos": 22166374481280.0, + "grad_norm": 1.9171916392208899, + "language_loss": 0.70839167, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78609014, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.16650391, + "step": 5020, + "time_per_iteration": 2.5635480880737305 + }, + { + "auxiliary_loss_clip": 0.06504546, + "auxiliary_loss_mlp": 0.01281398, + "balance_loss_clip": 0.06293903, + "balance_loss_mlp": 0.01263063, + "epoch": 0.301878851645874, + "flos": 18120080016000.0, + "grad_norm": 1.792157390717078, + "language_loss": 0.78276378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86062324, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18347168, + "step": 5021, + "time_per_iteration": 2.4956390857696533 + }, + { + "auxiliary_loss_clip": 0.06497782, + "auxiliary_loss_mlp": 0.01276425, + "balance_loss_clip": 0.06289995, + "balance_loss_mlp": 0.01258758, + "epoch": 0.301938974898542, + "flos": 17607860807040.0, + "grad_norm": 2.1405998927344774, + "language_loss": 0.77019519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.84793723, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17663574, + "step": 5022, + "time_per_iteration": 2.5157957077026367 + }, + { + "auxiliary_loss_clip": 0.06495966, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.0628897, + "balance_loss_mlp": 0.01258766, + "epoch": 0.30199909815120995, + "flos": 11185861455360.0, + "grad_norm": 1.768248661027107, + "language_loss": 0.70051187, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.77823544, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17626953, + "step": 5023, + "time_per_iteration": 2.466554641723633 + }, + { + "auxiliary_loss_clip": 0.06500031, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.0628899, + "balance_loss_mlp": 0.0125586, + "epoch": 0.302059221403878, + "flos": 21914247945600.0, + "grad_norm": 1.9915350532209553, + "language_loss": 0.72159773, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7993241, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.16748047, + "step": 5024, + "time_per_iteration": 2.550529956817627 + }, + { + "auxiliary_loss_clip": 0.06490807, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 0.06288145, + "balance_loss_mlp": 0.01259068, + "epoch": 0.30211934465654594, + "flos": 26403678328320.0, + "grad_norm": 1.894121412902458, + "language_loss": 0.74805325, + "learning_rate": 3.272217377978061e-06, + "loss": 0.8257302, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17822266, + "step": 5025, + "time_per_iteration": 2.566805124282837 + }, + { + "auxiliary_loss_clip": 0.06489006, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06288895, + "balance_loss_mlp": 0.01260649, + "epoch": 0.3021794679092139, + "flos": 23406573962880.0, + "grad_norm": 1.5421556017832176, + "language_loss": 0.67708206, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.75474703, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16845703, + "step": 5026, + "time_per_iteration": 2.5388495922088623 + }, + { + "auxiliary_loss_clip": 0.06496219, + "auxiliary_loss_mlp": 0.01276315, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.0125829, + "epoch": 0.30223959116188187, + "flos": 20266271769600.0, + "grad_norm": 1.7822947119811494, + "language_loss": 0.851165, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.92889023, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.18017578, + "step": 5027, + "time_per_iteration": 2.4944281578063965 + }, + { + "auxiliary_loss_clip": 0.06486274, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06286463, + "balance_loss_mlp": 0.012555, + "epoch": 0.30229971441454984, + "flos": 26695105228800.0, + "grad_norm": 1.4959542036115716, + "language_loss": 0.79103637, + "learning_rate": 3.271315635661351e-06, + "loss": 0.86862409, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17004395, + "step": 5028, + "time_per_iteration": 2.559110403060913 + }, + { + "auxiliary_loss_clip": 0.06488896, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06286621, + "balance_loss_mlp": 0.01255114, + "epoch": 0.3023598376672178, + "flos": 34353111358080.0, + "grad_norm": 2.034560710438702, + "language_loss": 0.777421, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.8550368, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17553711, + "step": 5029, + "time_per_iteration": 2.616746187210083 + }, + { + "auxiliary_loss_clip": 0.06491397, + "auxiliary_loss_mlp": 0.012793, + "balance_loss_clip": 0.06285096, + "balance_loss_mlp": 0.0126112, + "epoch": 0.30241996091988577, + "flos": 23118794714880.0, + "grad_norm": 1.8709670039612754, + "language_loss": 0.83096594, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.90867293, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.1817627, + "step": 5030, + "time_per_iteration": 2.56754994392395 + }, + { + "auxiliary_loss_clip": 0.06496526, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06289787, + "balance_loss_mlp": 0.01252817, + "epoch": 0.30248008417255373, + "flos": 19395932209920.0, + "grad_norm": 1.6009792224367259, + "language_loss": 0.70107001, + "learning_rate": 3.270413459468905e-06, + "loss": 0.77873379, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17028809, + "step": 5031, + "time_per_iteration": 3.9598355293273926 + }, + { + "auxiliary_loss_clip": 0.06489968, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06286315, + "balance_loss_mlp": 0.01254843, + "epoch": 0.3025402074252217, + "flos": 23776601592960.0, + "grad_norm": 1.6577801639127376, + "language_loss": 0.83241403, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.91004276, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.18066406, + "step": 5032, + "time_per_iteration": 2.5589263439178467 + }, + { + "auxiliary_loss_clip": 0.064991, + "auxiliary_loss_mlp": 0.01275787, + "balance_loss_clip": 0.06290475, + "balance_loss_mlp": 0.01257846, + "epoch": 0.30260033067788966, + "flos": 26001184440960.0, + "grad_norm": 2.284722647008976, + "language_loss": 0.73521686, + "learning_rate": 3.269811767783906e-06, + "loss": 0.81296575, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17956543, + "step": 5033, + "time_per_iteration": 4.029735088348389 + }, + { + "auxiliary_loss_clip": 0.06487451, + "auxiliary_loss_mlp": 0.01273985, + "balance_loss_clip": 0.06287168, + "balance_loss_mlp": 0.01257201, + "epoch": 0.3026604539305576, + "flos": 25381629751680.0, + "grad_norm": 1.972268943863271, + "language_loss": 0.74434245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16784668, + "step": 5034, + "time_per_iteration": 4.0717785358428955 + }, + { + "auxiliary_loss_clip": 0.06489293, + "auxiliary_loss_mlp": 0.01272883, + "balance_loss_clip": 0.06285236, + "balance_loss_mlp": 0.01253785, + "epoch": 0.3027205771832256, + "flos": 25819944059520.0, + "grad_norm": 2.1341895685230434, + "language_loss": 0.72872615, + "learning_rate": 3.269209883493352e-06, + "loss": 0.80634785, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.19104004, + "step": 5035, + "time_per_iteration": 2.552910804748535 + }, + { + "auxiliary_loss_clip": 0.06487517, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06287874, + "balance_loss_mlp": 0.01255545, + "epoch": 0.30278070043589356, + "flos": 27351905857920.0, + "grad_norm": 2.3429469920607384, + "language_loss": 0.87837774, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95597875, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17041016, + "step": 5036, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.06487815, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06288295, + "balance_loss_mlp": 0.0125574, + "epoch": 0.3028408236885616, + "flos": 24792444967680.0, + "grad_norm": 1.4626052772561229, + "language_loss": 0.77969307, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85730845, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.17980957, + "step": 5037, + "time_per_iteration": 2.556859016418457 + }, + { + "auxiliary_loss_clip": 0.06492691, + "auxiliary_loss_mlp": 0.01276846, + "balance_loss_clip": 0.06287664, + "balance_loss_mlp": 0.01258381, + "epoch": 0.30290094694122954, + "flos": 12937399678080.0, + "grad_norm": 2.1717737457337236, + "language_loss": 0.78095227, + "learning_rate": 3.268306696121816e-06, + "loss": 0.85864764, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18469238, + "step": 5038, + "time_per_iteration": 2.534095525741577 + }, + { + "auxiliary_loss_clip": 0.06487858, + "auxiliary_loss_mlp": 0.01274285, + "balance_loss_clip": 0.06289861, + "balance_loss_mlp": 0.01257631, + "epoch": 0.3029610701938975, + "flos": 25922709492480.0, + "grad_norm": 1.6864855803341283, + "language_loss": 0.74257523, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82019669, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16650391, + "step": 5039, + "time_per_iteration": 3.9620656967163086 + }, + { + "auxiliary_loss_clip": 0.06482661, + "auxiliary_loss_mlp": 0.01275025, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.0125923, + "epoch": 0.3030211934465655, + "flos": 21987440087040.0, + "grad_norm": 1.8054159725903498, + "language_loss": 0.80141723, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87899411, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.15795898, + "step": 5040, + "time_per_iteration": 2.5038623809814453 + }, + { + "auxiliary_loss_clip": 0.06493679, + "auxiliary_loss_mlp": 0.01273287, + "balance_loss_clip": 0.06295684, + "balance_loss_mlp": 0.01256705, + "epoch": 0.30308131669923344, + "flos": 20997606205440.0, + "grad_norm": 1.5545793881611087, + "language_loss": 0.82498085, + "learning_rate": 3.267403075901438e-06, + "loss": 0.90265048, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 5041, + "time_per_iteration": 2.5619800090789795 + }, + { + "auxiliary_loss_clip": 0.06388037, + "auxiliary_loss_mlp": 0.01273694, + "balance_loss_clip": 0.062912, + "balance_loss_mlp": 0.012703, + "epoch": 0.3031414399519014, + "flos": 60568281198720.0, + "grad_norm": 0.7609258494567089, + "language_loss": 0.59132683, + "learning_rate": 3.267101773025978e-06, + "loss": 0.66794419, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.0340271, + "step": 5042, + "time_per_iteration": 3.2389016151428223 + }, + { + "auxiliary_loss_clip": 0.06493344, + "auxiliary_loss_mlp": 0.01274817, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.0125808, + "epoch": 0.30320156320456937, + "flos": 21914038310400.0, + "grad_norm": 1.8743682054895758, + "language_loss": 0.71638298, + "learning_rate": 3.266800422101892e-06, + "loss": 0.79406464, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.1673584, + "step": 5043, + "time_per_iteration": 2.5684726238250732 + }, + { + "auxiliary_loss_clip": 0.06492111, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06289819, + "balance_loss_mlp": 0.01258121, + "epoch": 0.30326168645723733, + "flos": 21659186517120.0, + "grad_norm": 1.7052050019212173, + "language_loss": 0.70087332, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7785424, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16699219, + "step": 5044, + "time_per_iteration": 2.517548084259033 + }, + { + "auxiliary_loss_clip": 0.06487354, + "auxiliary_loss_mlp": 0.01273722, + "balance_loss_clip": 0.06289065, + "balance_loss_mlp": 0.01257641, + "epoch": 0.3033218097099053, + "flos": 21877672838400.0, + "grad_norm": 1.4072868323237386, + "language_loss": 0.77798641, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.85559714, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.16088867, + "step": 5045, + "time_per_iteration": 2.5525407791137695 + }, + { + "auxiliary_loss_clip": 0.06487602, + "auxiliary_loss_mlp": 0.01277286, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01260096, + "epoch": 0.30338193296257326, + "flos": 27097137918720.0, + "grad_norm": 1.6677605508610576, + "language_loss": 0.72664404, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80429292, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.5747427940368652 + }, + { + "auxiliary_loss_clip": 0.06495762, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125897, + "epoch": 0.30344205621524123, + "flos": 19540052432640.0, + "grad_norm": 1.932306391246397, + "language_loss": 0.81483316, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89255798, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.1776123, + "step": 5047, + "time_per_iteration": 2.5763392448425293 + }, + { + "auxiliary_loss_clip": 0.0648682, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06287121, + "balance_loss_mlp": 0.01255568, + "epoch": 0.3035021794679092, + "flos": 23917116090240.0, + "grad_norm": 1.635585540948891, + "language_loss": 0.72204739, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7996307, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.15942383, + "step": 5048, + "time_per_iteration": 2.5134665966033936 + }, + { + "auxiliary_loss_clip": 0.06488065, + "auxiliary_loss_mlp": 0.01279017, + "balance_loss_clip": 0.0628863, + "balance_loss_mlp": 0.0126296, + "epoch": 0.30356230272057716, + "flos": 16149133077120.0, + "grad_norm": 2.0386560470204804, + "language_loss": 0.75622666, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83389747, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16052246, + "step": 5049, + "time_per_iteration": 2.516463279724121 + }, + { + "auxiliary_loss_clip": 0.06494351, + "auxiliary_loss_mlp": 0.01274287, + "balance_loss_clip": 0.06289351, + "balance_loss_mlp": 0.01257597, + "epoch": 0.3036224259732452, + "flos": 28922539115520.0, + "grad_norm": 1.525083803020086, + "language_loss": 0.82698894, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.90467536, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.16687012, + "step": 5050, + "time_per_iteration": 2.558199405670166 + }, + { + "auxiliary_loss_clip": 0.0649763, + "auxiliary_loss_mlp": 0.01273759, + "balance_loss_clip": 0.06295735, + "balance_loss_mlp": 0.01256617, + "epoch": 0.30368254922591315, + "flos": 21111943501440.0, + "grad_norm": 2.311701267026144, + "language_loss": 0.74346399, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82117784, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17150879, + "step": 5051, + "time_per_iteration": 2.530457019805908 + }, + { + "auxiliary_loss_clip": 0.06494159, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06292571, + "balance_loss_mlp": 0.01260339, + "epoch": 0.3037426724785811, + "flos": 23008859758080.0, + "grad_norm": 1.7255753861859113, + "language_loss": 0.76444, + "learning_rate": 3.264086103483033e-06, + "loss": 0.84215784, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17297363, + "step": 5052, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.06501957, + "auxiliary_loss_mlp": 0.01280226, + "balance_loss_clip": 0.06295583, + "balance_loss_mlp": 0.01262332, + "epoch": 0.3038027957312491, + "flos": 15638129752320.0, + "grad_norm": 1.9820354931454651, + "language_loss": 0.83096367, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.90878546, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17871094, + "step": 5053, + "time_per_iteration": 2.5384886264801025 + }, + { + "auxiliary_loss_clip": 0.06489826, + "auxiliary_loss_mlp": 0.0127909, + "balance_loss_clip": 0.06288566, + "balance_loss_mlp": 0.01262174, + "epoch": 0.30386291898391704, + "flos": 12718955283840.0, + "grad_norm": 1.6755872357210637, + "language_loss": 0.7197504, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.79743958, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16906738, + "step": 5054, + "time_per_iteration": 2.4787559509277344 + }, + { + "auxiliary_loss_clip": 0.06500221, + "auxiliary_loss_mlp": 0.01282757, + "balance_loss_clip": 0.06298432, + "balance_loss_mlp": 0.01265805, + "epoch": 0.303923042236585, + "flos": 26366642023680.0, + "grad_norm": 1.8480883425842163, + "language_loss": 0.70137346, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.77920318, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16943359, + "step": 5055, + "time_per_iteration": 2.5929152965545654 + }, + { + "auxiliary_loss_clip": 0.06494389, + "auxiliary_loss_mlp": 0.01279452, + "balance_loss_clip": 0.0629337, + "balance_loss_mlp": 0.01262488, + "epoch": 0.30398316548925297, + "flos": 19725359736960.0, + "grad_norm": 2.1405790356583516, + "language_loss": 0.68347496, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.7612133, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16955566, + "step": 5056, + "time_per_iteration": 2.531677007675171 + }, + { + "auxiliary_loss_clip": 0.06490116, + "auxiliary_loss_mlp": 0.01281162, + "balance_loss_clip": 0.06292629, + "balance_loss_mlp": 0.01264377, + "epoch": 0.30404328874192094, + "flos": 24246124346880.0, + "grad_norm": 1.6503197514246037, + "language_loss": 0.83083463, + "learning_rate": 3.262576470461507e-06, + "loss": 0.9085474, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16796875, + "step": 5057, + "time_per_iteration": 2.5836069583892822 + }, + { + "auxiliary_loss_clip": 0.06484263, + "auxiliary_loss_mlp": 0.01272995, + "balance_loss_clip": 0.06286788, + "balance_loss_mlp": 0.01256603, + "epoch": 0.3041034119945889, + "flos": 24505881603840.0, + "grad_norm": 1.6860023663091837, + "language_loss": 0.89784855, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.97542113, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 5058, + "time_per_iteration": 2.589932918548584 + }, + { + "auxiliary_loss_clip": 0.06495658, + "auxiliary_loss_mlp": 0.01274369, + "balance_loss_clip": 0.06294262, + "balance_loss_mlp": 0.01256524, + "epoch": 0.30416353524725687, + "flos": 28295689121280.0, + "grad_norm": 2.5117349508823392, + "language_loss": 0.71471179, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.79241204, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17858887, + "step": 5059, + "time_per_iteration": 2.5827505588531494 + }, + { + "auxiliary_loss_clip": 0.06486548, + "auxiliary_loss_mlp": 0.01273567, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01257367, + "epoch": 0.30422365849992483, + "flos": 23667295541760.0, + "grad_norm": 1.868956784724377, + "language_loss": 0.73344606, + "learning_rate": 3.26167011603268e-06, + "loss": 0.8110472, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16174316, + "step": 5060, + "time_per_iteration": 2.624408006668091 + }, + { + "auxiliary_loss_clip": 0.06490071, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06289257, + "balance_loss_mlp": 0.01257451, + "epoch": 0.3042837817525928, + "flos": 23004750908160.0, + "grad_norm": 1.75217091558972, + "language_loss": 0.7751621, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.85279948, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.16210938, + "step": 5061, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.06496524, + "auxiliary_loss_mlp": 0.01274148, + "balance_loss_clip": 0.06292392, + "balance_loss_mlp": 0.01256362, + "epoch": 0.30434390500526076, + "flos": 22087438335360.0, + "grad_norm": 2.647933932315435, + "language_loss": 0.8275395, + "learning_rate": 3.261065640514415e-06, + "loss": 0.90524626, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17773438, + "step": 5062, + "time_per_iteration": 2.5313212871551514 + }, + { + "auxiliary_loss_clip": 0.06485732, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06286077, + "balance_loss_mlp": 0.01253689, + "epoch": 0.3044040282579287, + "flos": 25490516532480.0, + "grad_norm": 1.803893214603413, + "language_loss": 0.74348861, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.82104707, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16394043, + "step": 5063, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.0649004, + "auxiliary_loss_mlp": 0.01274813, + "balance_loss_clip": 0.06291289, + "balance_loss_mlp": 0.01256753, + "epoch": 0.30446415151059675, + "flos": 21952080864000.0, + "grad_norm": 1.6090072895521823, + "language_loss": 0.84824491, + "learning_rate": 3.26046097371721e-06, + "loss": 0.92589343, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.18066406, + "step": 5064, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.06490266, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 0.06290541, + "balance_loss_mlp": 0.0125644, + "epoch": 0.3045242747632647, + "flos": 16440979248000.0, + "grad_norm": 2.1763674367183965, + "language_loss": 0.76565492, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17578125, + "step": 5065, + "time_per_iteration": 2.50644588470459 + }, + { + "auxiliary_loss_clip": 0.06490786, + "auxiliary_loss_mlp": 0.01279051, + "balance_loss_clip": 0.06288782, + "balance_loss_mlp": 0.01260586, + "epoch": 0.3045843980159327, + "flos": 31548399966720.0, + "grad_norm": 1.8114152917186497, + "language_loss": 0.62859941, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.70629776, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.18469238, + "step": 5066, + "time_per_iteration": 2.6319751739501953 + }, + { + "auxiliary_loss_clip": 0.06499436, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 0.0629437, + "balance_loss_mlp": 0.01255602, + "epoch": 0.30464452126860064, + "flos": 17858645677440.0, + "grad_norm": 2.0549933694905653, + "language_loss": 0.82941914, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.90714514, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17565918, + "step": 5067, + "time_per_iteration": 2.483863592147827 + }, + { + "auxiliary_loss_clip": 0.06485019, + "auxiliary_loss_mlp": 0.0127176, + "balance_loss_clip": 0.06289113, + "balance_loss_mlp": 0.01255643, + "epoch": 0.3047046445212686, + "flos": 20637682992000.0, + "grad_norm": 1.9234738451458053, + "language_loss": 0.63749218, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71506, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.16113281, + "step": 5068, + "time_per_iteration": 2.5133988857269287 + }, + { + "auxiliary_loss_clip": 0.06487909, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 0.06291264, + "balance_loss_mlp": 0.01256884, + "epoch": 0.3047647677739366, + "flos": 21293896642560.0, + "grad_norm": 1.767828765686575, + "language_loss": 0.75521863, + "learning_rate": 3.258948470480793e-06, + "loss": 0.8328287, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.1619873, + "step": 5069, + "time_per_iteration": 2.5039985179901123 + }, + { + "auxiliary_loss_clip": 0.06492448, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01255047, + "epoch": 0.30482489102660454, + "flos": 21002218179840.0, + "grad_norm": 2.053197356954631, + "language_loss": 0.76551294, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84314346, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.15551758, + "step": 5070, + "time_per_iteration": 2.56703519821167 + }, + { + "auxiliary_loss_clip": 0.06501058, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 0.06296416, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3048850142792725, + "flos": 26298732689280.0, + "grad_norm": 1.581704774716999, + "language_loss": 0.82567108, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.90344059, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.18139648, + "step": 5071, + "time_per_iteration": 3.9534900188446045 + }, + { + "auxiliary_loss_clip": 0.06502657, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06296133, + "balance_loss_mlp": 0.01253374, + "epoch": 0.30494513753194047, + "flos": 22352813815680.0, + "grad_norm": 1.6603887086526505, + "language_loss": 0.76386344, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.84159869, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17492676, + "step": 5072, + "time_per_iteration": 3.9736859798431396 + }, + { + "auxiliary_loss_clip": 0.06492919, + "auxiliary_loss_mlp": 0.01277102, + "balance_loss_clip": 0.06293403, + "balance_loss_mlp": 0.01260544, + "epoch": 0.30500526078460843, + "flos": 19543909720320.0, + "grad_norm": 1.870095200943675, + "language_loss": 0.71741343, + "learning_rate": 3.257737608512723e-06, + "loss": 0.79511362, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16564941, + "step": 5073, + "time_per_iteration": 3.961787700653076 + }, + { + "auxiliary_loss_clip": 0.064973, + "auxiliary_loss_mlp": 0.01276358, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259752, + "epoch": 0.3050653840372764, + "flos": 14470577360640.0, + "grad_norm": 2.0196062448027843, + "language_loss": 0.76699424, + "learning_rate": 3.257434773758163e-06, + "loss": 0.84473085, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16601562, + "step": 5074, + "time_per_iteration": 2.498986005783081 + }, + { + "auxiliary_loss_clip": 0.06498405, + "auxiliary_loss_mlp": 0.01271199, + "balance_loss_clip": 0.06298129, + "balance_loss_mlp": 0.01254534, + "epoch": 0.30512550728994436, + "flos": 24250736321280.0, + "grad_norm": 2.0830863268570496, + "language_loss": 0.75075227, + "learning_rate": 3.25713189132155e-06, + "loss": 0.8284483, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16662598, + "step": 5075, + "time_per_iteration": 2.586857557296753 + }, + { + "auxiliary_loss_clip": 0.06500411, + "auxiliary_loss_mlp": 0.01274386, + "balance_loss_clip": 0.06294686, + "balance_loss_mlp": 0.01256004, + "epoch": 0.30518563054261233, + "flos": 16365774608640.0, + "grad_norm": 1.8100237719305525, + "language_loss": 0.75655556, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.8343035, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.18371582, + "step": 5076, + "time_per_iteration": 2.4945309162139893 + }, + { + "auxiliary_loss_clip": 0.06496741, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06296699, + "balance_loss_mlp": 0.01252712, + "epoch": 0.30524575379528035, + "flos": 21585952448640.0, + "grad_norm": 4.173383760279569, + "language_loss": 0.79782987, + "learning_rate": 3.25652598344811e-06, + "loss": 0.87550437, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17993164, + "step": 5077, + "time_per_iteration": 2.534932851791382 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01270916, + "balance_loss_clip": 0.06295882, + "balance_loss_mlp": 0.01254012, + "epoch": 0.3053058770479483, + "flos": 16550872277760.0, + "grad_norm": 2.5701417949840146, + "language_loss": 0.7555238, + "learning_rate": 3.256222958034259e-06, + "loss": 0.83312857, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16894531, + "step": 5078, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.06495726, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06297612, + "balance_loss_mlp": 0.01262487, + "epoch": 0.3053660003006163, + "flos": 12317844988800.0, + "grad_norm": 1.8416681282179364, + "language_loss": 0.67517591, + "learning_rate": 3.255919884984307e-06, + "loss": 0.75292945, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.17126465, + "step": 5079, + "time_per_iteration": 3.8981266021728516 + }, + { + "auxiliary_loss_clip": 0.06496017, + "auxiliary_loss_mlp": 0.01271448, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.01253757, + "epoch": 0.30542612355328425, + "flos": 23118962423040.0, + "grad_norm": 1.7235884914338329, + "language_loss": 0.8044346, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.88210917, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17687988, + "step": 5080, + "time_per_iteration": 2.562946081161499 + }, + { + "auxiliary_loss_clip": 0.06497588, + "auxiliary_loss_mlp": 0.01276495, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.01259377, + "epoch": 0.3054862468059522, + "flos": 24396365917440.0, + "grad_norm": 2.5665035909877725, + "language_loss": 0.81653202, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89427292, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17114258, + "step": 5081, + "time_per_iteration": 2.6026763916015625 + }, + { + "auxiliary_loss_clip": 0.06490453, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01253962, + "epoch": 0.3055463700586202, + "flos": 29393529315840.0, + "grad_norm": 1.580638075296793, + "language_loss": 0.72516012, + "learning_rate": 3.255010380132783e-06, + "loss": 0.80277044, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16619873, + "step": 5082, + "time_per_iteration": 2.650310516357422 + }, + { + "auxiliary_loss_clip": 0.06499462, + "auxiliary_loss_mlp": 0.01274957, + "balance_loss_clip": 0.06293429, + "balance_loss_mlp": 0.01257159, + "epoch": 0.30560649331128814, + "flos": 25598606699520.0, + "grad_norm": 2.3807589086926533, + "language_loss": 0.73733467, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.81507885, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.595439910888672 + }, + { + "auxiliary_loss_clip": 0.06488115, + "auxiliary_loss_mlp": 0.01272372, + "balance_loss_clip": 0.0628676, + "balance_loss_mlp": 0.01254729, + "epoch": 0.3056666165639561, + "flos": 19133156206080.0, + "grad_norm": 1.8141392710911106, + "language_loss": 0.71165347, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78925836, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17626953, + "step": 5084, + "time_per_iteration": 2.499873161315918 + }, + { + "auxiliary_loss_clip": 0.06505337, + "auxiliary_loss_mlp": 0.01276239, + "balance_loss_clip": 0.063004, + "balance_loss_mlp": 0.01260194, + "epoch": 0.30572673981662407, + "flos": 15529368752640.0, + "grad_norm": 2.0821129981034567, + "language_loss": 0.79337353, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.87118936, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.16027832, + "step": 5085, + "time_per_iteration": 2.479790449142456 + }, + { + "auxiliary_loss_clip": 0.06486039, + "auxiliary_loss_mlp": 0.01278912, + "balance_loss_clip": 0.06289506, + "balance_loss_mlp": 0.01260602, + "epoch": 0.30578686306929204, + "flos": 21512886088320.0, + "grad_norm": 2.123366644532801, + "language_loss": 0.78524947, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.86289901, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.18310547, + "step": 5086, + "time_per_iteration": 2.5372772216796875 + }, + { + "auxiliary_loss_clip": 0.06487311, + "auxiliary_loss_mlp": 0.01277834, + "balance_loss_clip": 0.06289313, + "balance_loss_mlp": 0.01259797, + "epoch": 0.30584698632196, + "flos": 20959689432960.0, + "grad_norm": 1.7535206397091907, + "language_loss": 0.77160186, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8492533, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18041992, + "step": 5087, + "time_per_iteration": 2.4971578121185303 + }, + { + "auxiliary_loss_clip": 0.06492934, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06288779, + "balance_loss_mlp": 0.01258154, + "epoch": 0.30590710957462797, + "flos": 24688044380160.0, + "grad_norm": 1.802467786704899, + "language_loss": 0.7266196, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.80432141, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.19091797, + "step": 5088, + "time_per_iteration": 2.5416259765625 + }, + { + "auxiliary_loss_clip": 0.06501624, + "auxiliary_loss_mlp": 0.0127311, + "balance_loss_clip": 0.06292014, + "balance_loss_mlp": 0.01253893, + "epoch": 0.30596723282729593, + "flos": 17091700456320.0, + "grad_norm": 2.3226252492467037, + "language_loss": 0.79702371, + "learning_rate": 3.252886537028521e-06, + "loss": 0.874771, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19226074, + "step": 5089, + "time_per_iteration": 2.4745559692382812 + }, + { + "auxiliary_loss_clip": 0.06491631, + "auxiliary_loss_mlp": 0.01275196, + "balance_loss_clip": 0.06291364, + "balance_loss_mlp": 0.01256981, + "epoch": 0.30602735607996395, + "flos": 22863775213440.0, + "grad_norm": 6.857787253608019, + "language_loss": 0.77299303, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.85066134, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 5090, + "time_per_iteration": 2.5330631732940674 + }, + { + "auxiliary_loss_clip": 0.06500913, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.06295903, + "balance_loss_mlp": 0.01260773, + "epoch": 0.3060874793326319, + "flos": 29869173417600.0, + "grad_norm": 1.854909004407163, + "language_loss": 0.76970392, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.84750324, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18237305, + "step": 5091, + "time_per_iteration": 2.561894178390503 + }, + { + "auxiliary_loss_clip": 0.06491988, + "auxiliary_loss_mlp": 0.01272552, + "balance_loss_clip": 0.06287533, + "balance_loss_mlp": 0.01254551, + "epoch": 0.3061476025852999, + "flos": 20454765528960.0, + "grad_norm": 1.7300285931862276, + "language_loss": 0.72878456, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18005371, + "step": 5092, + "time_per_iteration": 2.5661561489105225 + }, + { + "auxiliary_loss_clip": 0.06495406, + "auxiliary_loss_mlp": 0.01276172, + "balance_loss_clip": 0.06294402, + "balance_loss_mlp": 0.01258696, + "epoch": 0.30620772583796785, + "flos": 19397651218560.0, + "grad_norm": 1.8286917674158676, + "language_loss": 0.83293521, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.91065109, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.17468262, + "step": 5093, + "time_per_iteration": 2.49686336517334 + }, + { + "auxiliary_loss_clip": 0.06495437, + "auxiliary_loss_mlp": 0.01277069, + "balance_loss_clip": 0.06295857, + "balance_loss_mlp": 0.01259652, + "epoch": 0.3062678490906358, + "flos": 24031411459200.0, + "grad_norm": 1.7386581048181018, + "language_loss": 0.74963737, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82736242, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17419434, + "step": 5094, + "time_per_iteration": 2.5497004985809326 + }, + { + "auxiliary_loss_clip": 0.06491575, + "auxiliary_loss_mlp": 0.01272234, + "balance_loss_clip": 0.06293601, + "balance_loss_mlp": 0.01255735, + "epoch": 0.3063279723433038, + "flos": 19760593178880.0, + "grad_norm": 1.8971341227661025, + "language_loss": 0.76389223, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84153032, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16503906, + "step": 5095, + "time_per_iteration": 2.493479013442993 + }, + { + "auxiliary_loss_clip": 0.06485657, + "auxiliary_loss_mlp": 0.0128124, + "balance_loss_clip": 0.06288686, + "balance_loss_mlp": 0.01262727, + "epoch": 0.30638809559597174, + "flos": 22455663102720.0, + "grad_norm": 1.6310889817091494, + "language_loss": 0.81246006, + "learning_rate": 3.250760365955042e-06, + "loss": 0.89012897, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.18518066, + "step": 5096, + "time_per_iteration": 2.606100559234619 + }, + { + "auxiliary_loss_clip": 0.06500001, + "auxiliary_loss_mlp": 0.01286183, + "balance_loss_clip": 0.06297529, + "balance_loss_mlp": 0.01269947, + "epoch": 0.3064482188486397, + "flos": 17170846237440.0, + "grad_norm": 2.1701963694762862, + "language_loss": 0.81871414, + "learning_rate": 3.250456437422258e-06, + "loss": 0.89657605, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.16235352, + "step": 5097, + "time_per_iteration": 2.506908893585205 + }, + { + "auxiliary_loss_clip": 0.06498241, + "auxiliary_loss_mlp": 0.01288982, + "balance_loss_clip": 0.06297113, + "balance_loss_mlp": 0.01269647, + "epoch": 0.3065083421013077, + "flos": 23775176073600.0, + "grad_norm": 2.1266024193404385, + "language_loss": 0.7855283, + "learning_rate": 3.250152461472041e-06, + "loss": 0.86340058, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.19335938, + "step": 5098, + "time_per_iteration": 2.546875238418579 + }, + { + "auxiliary_loss_clip": 0.06494713, + "auxiliary_loss_mlp": 0.01291897, + "balance_loss_clip": 0.06296527, + "balance_loss_mlp": 0.0127367, + "epoch": 0.30656846535397564, + "flos": 26438953697280.0, + "grad_norm": 1.8261556885246946, + "language_loss": 0.84430897, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92217511, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.18225098, + "step": 5099, + "time_per_iteration": 2.5726583003997803 + }, + { + "auxiliary_loss_clip": 0.06498358, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06295489, + "balance_loss_mlp": 0.01268434, + "epoch": 0.3066285886066436, + "flos": 26659117100160.0, + "grad_norm": 1.588615118025773, + "language_loss": 0.86241573, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.94027227, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.18859863, + "step": 5100, + "time_per_iteration": 2.5711421966552734 + }, + { + "auxiliary_loss_clip": 0.06496789, + "auxiliary_loss_mlp": 0.01283562, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01264345, + "epoch": 0.30668871185931157, + "flos": 15055443659520.0, + "grad_norm": 1.7244173580954059, + "language_loss": 0.79369497, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87149858, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.19226074, + "step": 5101, + "time_per_iteration": 2.539132833480835 + }, + { + "auxiliary_loss_clip": 0.0650195, + "auxiliary_loss_mlp": 0.01287055, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01268172, + "epoch": 0.30674883511197953, + "flos": 20087966280960.0, + "grad_norm": 1.7739241542858428, + "language_loss": 0.80435872, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.88224876, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.1887207, + "step": 5102, + "time_per_iteration": 2.5558016300201416 + }, + { + "auxiliary_loss_clip": 0.06503183, + "auxiliary_loss_mlp": 0.01284648, + "balance_loss_clip": 0.06301928, + "balance_loss_mlp": 0.01265253, + "epoch": 0.30680895836464755, + "flos": 22900518028800.0, + "grad_norm": 1.6865927559982214, + "language_loss": 0.89335668, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.97123504, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19396973, + "step": 5103, + "time_per_iteration": 2.542555570602417 + }, + { + "auxiliary_loss_clip": 0.06501935, + "auxiliary_loss_mlp": 0.01286618, + "balance_loss_clip": 0.06302223, + "balance_loss_mlp": 0.0126876, + "epoch": 0.3068690816173155, + "flos": 23702948254080.0, + "grad_norm": 2.119732369805114, + "language_loss": 0.74448419, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82236969, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17871094, + "step": 5104, + "time_per_iteration": 2.560253143310547 + }, + { + "auxiliary_loss_clip": 0.06502049, + "auxiliary_loss_mlp": 0.01274873, + "balance_loss_clip": 0.06295487, + "balance_loss_mlp": 0.01257552, + "epoch": 0.3069292048699835, + "flos": 23557947563520.0, + "grad_norm": 1.7334515387821061, + "language_loss": 0.72909176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80686092, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17321777, + "step": 5105, + "time_per_iteration": 2.5751454830169678 + }, + { + "auxiliary_loss_clip": 0.06498945, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 0.06297372, + "balance_loss_mlp": 0.01263907, + "epoch": 0.30698932812265145, + "flos": 24537970517760.0, + "grad_norm": 2.0977567017321608, + "language_loss": 0.87578112, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.95359075, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18103027, + "step": 5106, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.06503764, + "auxiliary_loss_mlp": 0.01279082, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01261189, + "epoch": 0.3070494513753194, + "flos": 21002805158400.0, + "grad_norm": 2.310425767564757, + "language_loss": 0.72092319, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.79875165, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17883301, + "step": 5107, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.06493405, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06294269, + "balance_loss_mlp": 0.01256735, + "epoch": 0.3071095746279874, + "flos": 19031942073600.0, + "grad_norm": 1.99593781887154, + "language_loss": 0.72653455, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80422449, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.18847656, + "step": 5108, + "time_per_iteration": 2.497788190841675 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 0.06297708, + "balance_loss_mlp": 0.01259533, + "epoch": 0.30716969788065535, + "flos": 21221962312320.0, + "grad_norm": 1.48656392648579, + "language_loss": 0.86441541, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94217712, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17578125, + "step": 5109, + "time_per_iteration": 2.563480854034424 + }, + { + "auxiliary_loss_clip": 0.06501789, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.063005, + "balance_loss_mlp": 0.01260541, + "epoch": 0.3072298211333233, + "flos": 25779385883520.0, + "grad_norm": 1.8235353484155168, + "language_loss": 0.67904091, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.75684446, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18029785, + "step": 5110, + "time_per_iteration": 3.9785540103912354 + }, + { + "auxiliary_loss_clip": 0.06493396, + "auxiliary_loss_mlp": 0.01273369, + "balance_loss_clip": 0.06295427, + "balance_loss_mlp": 0.01256727, + "epoch": 0.3072899443859913, + "flos": 25856099896320.0, + "grad_norm": 1.4123986071879864, + "language_loss": 0.76984161, + "learning_rate": 3.246196464379919e-06, + "loss": 0.84750926, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16638184, + "step": 5111, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.06498265, + "auxiliary_loss_mlp": 0.01277301, + "balance_loss_clip": 0.06293567, + "balance_loss_mlp": 0.01258585, + "epoch": 0.30735006763865924, + "flos": 25930130578560.0, + "grad_norm": 2.349951455822933, + "language_loss": 0.67755288, + "learning_rate": 3.245891825796765e-06, + "loss": 0.75530857, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 5112, + "time_per_iteration": 3.963136672973633 + }, + { + "auxiliary_loss_clip": 0.0650286, + "auxiliary_loss_mlp": 0.01277737, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01257614, + "epoch": 0.3074101908913272, + "flos": 30924442938240.0, + "grad_norm": 2.270303220058131, + "language_loss": 0.79939896, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.87720484, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.20117188, + "step": 5113, + "time_per_iteration": 4.084795236587524 + }, + { + "auxiliary_loss_clip": 0.06502695, + "auxiliary_loss_mlp": 0.01276516, + "balance_loss_clip": 0.06297943, + "balance_loss_mlp": 0.01258599, + "epoch": 0.30747031414399517, + "flos": 18406182182400.0, + "grad_norm": 2.072714063381377, + "language_loss": 0.77269047, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.85048258, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17919922, + "step": 5114, + "time_per_iteration": 2.4906773567199707 + }, + { + "auxiliary_loss_clip": 0.06498024, + "auxiliary_loss_mlp": 0.01283612, + "balance_loss_clip": 0.06298083, + "balance_loss_mlp": 0.01265087, + "epoch": 0.30753043739666314, + "flos": 22638957909120.0, + "grad_norm": 1.8131309248321845, + "language_loss": 0.62640405, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70422041, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.18530273, + "step": 5115, + "time_per_iteration": 2.5328574180603027 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06295817, + "balance_loss_mlp": 0.0125513, + "epoch": 0.3075905606493311, + "flos": 27351360806400.0, + "grad_norm": 1.7894066300170501, + "language_loss": 0.83589995, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91363406, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.19213867, + "step": 5116, + "time_per_iteration": 2.562014102935791 + }, + { + "auxiliary_loss_clip": 0.06500115, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06298394, + "balance_loss_mlp": 0.0125512, + "epoch": 0.3076506839019991, + "flos": 22097333116800.0, + "grad_norm": 1.8649453582041782, + "language_loss": 0.76016742, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.18322754, + "step": 5117, + "time_per_iteration": 2.5509209632873535 + }, + { + "auxiliary_loss_clip": 0.06498168, + "auxiliary_loss_mlp": 0.01274202, + "balance_loss_clip": 0.0629583, + "balance_loss_mlp": 0.01256142, + "epoch": 0.3077108071546671, + "flos": 21296160702720.0, + "grad_norm": 2.167097847201453, + "language_loss": 0.72108531, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79880905, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.18054199, + "step": 5118, + "time_per_iteration": 2.5190913677215576 + }, + { + "auxiliary_loss_clip": 0.06502286, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01258198, + "epoch": 0.30777093040733505, + "flos": 21436884835200.0, + "grad_norm": 2.760855389686565, + "language_loss": 0.74956095, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82734126, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17553711, + "step": 5119, + "time_per_iteration": 3.973721981048584 + }, + { + "auxiliary_loss_clip": 0.06494488, + "auxiliary_loss_mlp": 0.01279388, + "balance_loss_clip": 0.06289928, + "balance_loss_mlp": 0.01259814, + "epoch": 0.307831053660003, + "flos": 23156040654720.0, + "grad_norm": 1.7924264386276263, + "language_loss": 0.80264926, + "learning_rate": 3.243453017305926e-06, + "loss": 0.88038802, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 5120, + "time_per_iteration": 2.54705548286438 + }, + { + "auxiliary_loss_clip": 0.06492078, + "auxiliary_loss_mlp": 0.01273208, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01255445, + "epoch": 0.307891176912671, + "flos": 17025510130560.0, + "grad_norm": 1.642273509687288, + "language_loss": 0.80521786, + "learning_rate": 3.24314795393977e-06, + "loss": 0.88287073, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 5121, + "time_per_iteration": 2.515054702758789 + }, + { + "auxiliary_loss_clip": 0.06496292, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06298114, + "balance_loss_mlp": 0.01256875, + "epoch": 0.30795130016533895, + "flos": 27711745217280.0, + "grad_norm": 1.3913461280715187, + "language_loss": 0.82847351, + "learning_rate": 3.242842843433319e-06, + "loss": 0.90618169, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17651367, + "step": 5122, + "time_per_iteration": 2.5832252502441406 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.0632116, + "balance_loss_mlp": 0.01249526, + "epoch": 0.3080114234180069, + "flos": 69080973373440.0, + "grad_norm": 0.7221499072225652, + "language_loss": 0.58650029, + "learning_rate": 3.242537685798143e-06, + "loss": 0.66319263, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.03341675, + "step": 5123, + "time_per_iteration": 3.3316402435302734 + }, + { + "auxiliary_loss_clip": 0.06503562, + "auxiliary_loss_mlp": 0.01279925, + "balance_loss_clip": 0.06296872, + "balance_loss_mlp": 0.01260744, + "epoch": 0.3080715466706749, + "flos": 24066938390400.0, + "grad_norm": 1.6584153298959496, + "language_loss": 0.83586073, + "learning_rate": 3.242232481045813e-06, + "loss": 0.91369557, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1920166, + "step": 5124, + "time_per_iteration": 2.589906930923462 + }, + { + "auxiliary_loss_clip": 0.06498908, + "auxiliary_loss_mlp": 0.01271737, + "balance_loss_clip": 0.06294107, + "balance_loss_mlp": 0.01253629, + "epoch": 0.30813166992334284, + "flos": 25855806407040.0, + "grad_norm": 2.061271988083176, + "language_loss": 0.79248756, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.87019402, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.1809082, + "step": 5125, + "time_per_iteration": 2.550884485244751 + }, + { + "auxiliary_loss_clip": 0.06501068, + "auxiliary_loss_mlp": 0.012774, + "balance_loss_clip": 0.06292764, + "balance_loss_mlp": 0.01258374, + "epoch": 0.3081917931760108, + "flos": 20455981413120.0, + "grad_norm": 2.085029494567846, + "language_loss": 0.64930958, + "learning_rate": 3.241621930235989e-06, + "loss": 0.72709423, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.19018555, + "step": 5126, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.06490224, + "auxiliary_loss_mlp": 0.01277045, + "balance_loss_clip": 0.06294391, + "balance_loss_mlp": 0.01259533, + "epoch": 0.3082519164286788, + "flos": 22173208588800.0, + "grad_norm": 1.5681866965441809, + "language_loss": 0.87117672, + "learning_rate": 3.241316584201646e-06, + "loss": 0.94884944, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.17504883, + "step": 5127, + "time_per_iteration": 2.567615270614624 + }, + { + "auxiliary_loss_clip": 0.0649047, + "auxiliary_loss_mlp": 0.01273562, + "balance_loss_clip": 0.06291968, + "balance_loss_mlp": 0.0125593, + "epoch": 0.30831203968134674, + "flos": 28921029742080.0, + "grad_norm": 1.4544126326452276, + "language_loss": 0.69282925, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.77046961, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.1763916, + "step": 5128, + "time_per_iteration": 2.6129322052001953 + }, + { + "auxiliary_loss_clip": 0.06499469, + "auxiliary_loss_mlp": 0.01276178, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01257843, + "epoch": 0.3083721629340147, + "flos": 25675069150080.0, + "grad_norm": 2.0282558045061396, + "language_loss": 0.7195785, + "learning_rate": 3.240705750931993e-06, + "loss": 0.79733503, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 5129, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.06388761, + "auxiliary_loss_mlp": 0.01275431, + "balance_loss_clip": 0.06292662, + "balance_loss_mlp": 0.01271816, + "epoch": 0.3084322861866827, + "flos": 68233666487040.0, + "grad_norm": 0.8077979927321801, + "language_loss": 0.58935201, + "learning_rate": 3.240400263719846e-06, + "loss": 0.66599393, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03607178, + "step": 5130, + "time_per_iteration": 3.2353098392486572 + }, + { + "auxiliary_loss_clip": 0.06498231, + "auxiliary_loss_mlp": 0.012758, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01258443, + "epoch": 0.3084924094393507, + "flos": 20301630992640.0, + "grad_norm": 2.071340626605126, + "language_loss": 0.73298538, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.81072569, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17370605, + "step": 5131, + "time_per_iteration": 2.523510456085205 + }, + { + "auxiliary_loss_clip": 0.06487547, + "auxiliary_loss_mlp": 0.01274811, + "balance_loss_clip": 0.06290068, + "balance_loss_mlp": 0.01257728, + "epoch": 0.30855253269201866, + "flos": 23956374528000.0, + "grad_norm": 1.6208223340220833, + "language_loss": 0.71358359, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.79120713, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17077637, + "step": 5132, + "time_per_iteration": 2.581470012664795 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01273323, + "balance_loss_clip": 0.06290212, + "balance_loss_mlp": 0.01255262, + "epoch": 0.3086126559446866, + "flos": 19288009751040.0, + "grad_norm": 1.7801590489825803, + "language_loss": 0.90374929, + "learning_rate": 3.239483519913136e-06, + "loss": 0.98135513, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.18066406, + "step": 5133, + "time_per_iteration": 2.5197763442993164 + }, + { + "auxiliary_loss_clip": 0.06499831, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06295495, + "balance_loss_mlp": 0.01257105, + "epoch": 0.3086727791973546, + "flos": 33768328913280.0, + "grad_norm": 1.8524807236065886, + "language_loss": 0.67443442, + "learning_rate": 3.239177844626102e-06, + "loss": 0.75218379, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 5134, + "time_per_iteration": 2.664303779602051 + }, + { + "auxiliary_loss_clip": 0.06498815, + "auxiliary_loss_mlp": 0.01275704, + "balance_loss_clip": 0.06293166, + "balance_loss_mlp": 0.01257167, + "epoch": 0.30873290245002255, + "flos": 16039659317760.0, + "grad_norm": 1.8927812104332384, + "language_loss": 0.83517784, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91292304, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18518066, + "step": 5135, + "time_per_iteration": 2.505138397216797 + }, + { + "auxiliary_loss_clip": 0.06377634, + "auxiliary_loss_mlp": 0.01258895, + "balance_loss_clip": 0.06282344, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3087930257026905, + "flos": 65070415474560.0, + "grad_norm": 0.6863645266912056, + "language_loss": 0.55337238, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.62973773, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.0329895, + "step": 5136, + "time_per_iteration": 3.179166555404663 + }, + { + "auxiliary_loss_clip": 0.06488921, + "auxiliary_loss_mlp": 0.01274465, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.01257085, + "epoch": 0.3088531489553585, + "flos": 74754001733760.0, + "grad_norm": 1.8635236180899502, + "language_loss": 0.76610464, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.8437385, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.1739502, + "step": 5137, + "time_per_iteration": 2.9993999004364014 + }, + { + "auxiliary_loss_clip": 0.06489644, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01255458, + "epoch": 0.30891327220802645, + "flos": 21148686316800.0, + "grad_norm": 1.7480087539569926, + "language_loss": 0.80450445, + "learning_rate": 3.237954673696424e-06, + "loss": 0.882128, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17248535, + "step": 5138, + "time_per_iteration": 2.531916856765747 + }, + { + "auxiliary_loss_clip": 0.06496161, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06294001, + "balance_loss_mlp": 0.01258896, + "epoch": 0.3089733954606944, + "flos": 25671295716480.0, + "grad_norm": 1.629930216805369, + "language_loss": 0.81626344, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.89398789, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.1739502, + "step": 5139, + "time_per_iteration": 2.585380792617798 + }, + { + "auxiliary_loss_clip": 0.06501773, + "auxiliary_loss_mlp": 0.01277306, + "balance_loss_clip": 0.06292425, + "balance_loss_mlp": 0.01258817, + "epoch": 0.3090335187133624, + "flos": 19433429712000.0, + "grad_norm": 2.0033599705043854, + "language_loss": 0.77724934, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.85504013, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18481445, + "step": 5140, + "time_per_iteration": 2.504387617111206 + }, + { + "auxiliary_loss_clip": 0.06482549, + "auxiliary_loss_mlp": 0.01272919, + "balance_loss_clip": 0.06290817, + "balance_loss_mlp": 0.0125741, + "epoch": 0.30909364196603034, + "flos": 20017541324160.0, + "grad_norm": 1.9132937458234096, + "language_loss": 0.78916645, + "learning_rate": 3.237036802553252e-06, + "loss": 0.86672109, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 5141, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.06494773, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 0.06291379, + "balance_loss_mlp": 0.01260543, + "epoch": 0.3091537652186983, + "flos": 19682830990080.0, + "grad_norm": 2.2087235088394728, + "language_loss": 0.8789897, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.95671201, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16906738, + "step": 5142, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06498981, + "auxiliary_loss_mlp": 0.01276818, + "balance_loss_clip": 0.06294474, + "balance_loss_mlp": 0.01259438, + "epoch": 0.3092138884713663, + "flos": 17025845546880.0, + "grad_norm": 2.3473661014686984, + "language_loss": 0.7985431, + "learning_rate": 3.23642465389567e-06, + "loss": 0.87630117, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.17382812, + "step": 5143, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.06489455, + "auxiliary_loss_mlp": 0.01277055, + "balance_loss_clip": 0.06291586, + "balance_loss_mlp": 0.01260378, + "epoch": 0.3092740117240343, + "flos": 25017052636800.0, + "grad_norm": 1.6187717199492768, + "language_loss": 0.72479737, + "learning_rate": 3.236118509233055e-06, + "loss": 0.8024624, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16662598, + "step": 5144, + "time_per_iteration": 2.547358989715576 + }, + { + "auxiliary_loss_clip": 0.06496169, + "auxiliary_loss_mlp": 0.01272398, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01256138, + "epoch": 0.30933413497670226, + "flos": 25597013472000.0, + "grad_norm": 2.2714150562550466, + "language_loss": 0.74676621, + "learning_rate": 3.235812317696702e-06, + "loss": 0.82445192, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16271973, + "step": 5145, + "time_per_iteration": 2.6273365020751953 + }, + { + "auxiliary_loss_clip": 0.06490701, + "auxiliary_loss_mlp": 0.01273039, + "balance_loss_clip": 0.06289125, + "balance_loss_mlp": 0.01256296, + "epoch": 0.3093942582293702, + "flos": 24396617479680.0, + "grad_norm": 1.731689317121935, + "language_loss": 0.76830649, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.84594393, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.16729736, + "step": 5146, + "time_per_iteration": 2.5352702140808105 + }, + { + "auxiliary_loss_clip": 0.06485911, + "auxiliary_loss_mlp": 0.01273533, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.0125707, + "epoch": 0.3094543814820382, + "flos": 19652586865920.0, + "grad_norm": 1.8011449994622988, + "language_loss": 0.66675043, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.74434483, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16467285, + "step": 5147, + "time_per_iteration": 2.545940637588501 + }, + { + "auxiliary_loss_clip": 0.06492072, + "auxiliary_loss_mlp": 0.01271267, + "balance_loss_clip": 0.0628895, + "balance_loss_mlp": 0.01253731, + "epoch": 0.30951450473470615, + "flos": 25670499102720.0, + "grad_norm": 1.8580519203508368, + "language_loss": 0.74971956, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.82735288, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17529297, + "step": 5148, + "time_per_iteration": 2.5673537254333496 + }, + { + "auxiliary_loss_clip": 0.06501722, + "auxiliary_loss_mlp": 0.01278545, + "balance_loss_clip": 0.06290632, + "balance_loss_mlp": 0.01260342, + "epoch": 0.3095746279873741, + "flos": 12025202204160.0, + "grad_norm": 2.1335435485893166, + "language_loss": 0.73367, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.18212891, + "step": 5149, + "time_per_iteration": 2.682609796524048 + }, + { + "auxiliary_loss_clip": 0.06497431, + "auxiliary_loss_mlp": 0.01277143, + "balance_loss_clip": 0.06292653, + "balance_loss_mlp": 0.01258534, + "epoch": 0.3096347512400421, + "flos": 23629798039680.0, + "grad_norm": 1.913638713978071, + "language_loss": 0.85296845, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.93071413, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.18591309, + "step": 5150, + "time_per_iteration": 3.9813008308410645 + }, + { + "auxiliary_loss_clip": 0.06483387, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06285527, + "balance_loss_mlp": 0.01256815, + "epoch": 0.30969487449271005, + "flos": 22536024768000.0, + "grad_norm": 1.8960829077128427, + "language_loss": 0.79181123, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86938894, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.17565918, + "step": 5151, + "time_per_iteration": 2.5336477756500244 + }, + { + "auxiliary_loss_clip": 0.06493182, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.06291731, + "balance_loss_mlp": 0.01257426, + "epoch": 0.309754997745378, + "flos": 15273301075200.0, + "grad_norm": 2.079664023782487, + "language_loss": 0.67843604, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.75611162, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16931152, + "step": 5152, + "time_per_iteration": 5.332815647125244 + }, + { + "auxiliary_loss_clip": 0.06492282, + "auxiliary_loss_mlp": 0.01278303, + "balance_loss_clip": 0.06293005, + "balance_loss_mlp": 0.01261888, + "epoch": 0.309815120998046, + "flos": 26986532129280.0, + "grad_norm": 1.9990242894688834, + "language_loss": 0.83170605, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.90941191, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16394043, + "step": 5153, + "time_per_iteration": 2.5944862365722656 + }, + { + "auxiliary_loss_clip": 0.06488585, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.0125709, + "epoch": 0.30987524425071394, + "flos": 21149692565760.0, + "grad_norm": 1.7708804151784365, + "language_loss": 0.74136615, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.81899732, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.529526948928833 + }, + { + "auxiliary_loss_clip": 0.0648791, + "auxiliary_loss_mlp": 0.01284436, + "balance_loss_clip": 0.06292189, + "balance_loss_mlp": 0.01267544, + "epoch": 0.3099353675033819, + "flos": 15273720345600.0, + "grad_norm": 2.7515131151360763, + "language_loss": 0.76419097, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84191442, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16882324, + "step": 5155, + "time_per_iteration": 2.5338993072509766 + }, + { + "auxiliary_loss_clip": 0.06490543, + "auxiliary_loss_mlp": 0.01273122, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01256373, + "epoch": 0.30999549075604993, + "flos": 15419182233600.0, + "grad_norm": 1.684257178792462, + "language_loss": 0.79886794, + "learning_rate": 3.232441120452094e-06, + "loss": 0.87650466, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1673584, + "step": 5156, + "time_per_iteration": 2.5190272331237793 + }, + { + "auxiliary_loss_clip": 0.06493768, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 0.06290715, + "balance_loss_mlp": 0.01264821, + "epoch": 0.3100556140087179, + "flos": 23191106388480.0, + "grad_norm": 2.1803769191775197, + "language_loss": 0.74967813, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82743037, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16625977, + "step": 5157, + "time_per_iteration": 2.59045147895813 + }, + { + "auxiliary_loss_clip": 0.06486322, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06289537, + "balance_loss_mlp": 0.01258921, + "epoch": 0.31011573726138586, + "flos": 25749770664960.0, + "grad_norm": 2.4337865277632065, + "language_loss": 0.69860423, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7762109, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1541748, + "step": 5158, + "time_per_iteration": 4.041999578475952 + }, + { + "auxiliary_loss_clip": 0.06488799, + "auxiliary_loss_mlp": 0.0127365, + "balance_loss_clip": 0.0629247, + "balance_loss_mlp": 0.0125795, + "epoch": 0.3101758605140538, + "flos": 20017541324160.0, + "grad_norm": 2.0387737109261477, + "language_loss": 0.84883308, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92645758, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.15673828, + "step": 5159, + "time_per_iteration": 2.5081369876861572 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127455, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01257002, + "epoch": 0.3102359837667218, + "flos": 19141751249280.0, + "grad_norm": 1.926707434190644, + "language_loss": 0.85498118, + "learning_rate": 3.231213827702462e-06, + "loss": 0.93264508, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17529297, + "step": 5160, + "time_per_iteration": 2.5466468334198 + }, + { + "auxiliary_loss_clip": 0.06486624, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.06291263, + "balance_loss_mlp": 0.01253945, + "epoch": 0.31029610701938976, + "flos": 22270649287680.0, + "grad_norm": 1.6869427612303989, + "language_loss": 0.75787026, + "learning_rate": 3.230906887766584e-06, + "loss": 0.83543712, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.16113281, + "step": 5161, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.06491208, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 0.06289751, + "balance_loss_mlp": 0.01256915, + "epoch": 0.3103562302720577, + "flos": 20810244476160.0, + "grad_norm": 2.463900279304932, + "language_loss": 0.8222912, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.89995265, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.18029785, + "step": 5162, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.06485277, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06289959, + "balance_loss_mlp": 0.01253594, + "epoch": 0.3104163535247257, + "flos": 22350382047360.0, + "grad_norm": 1.4717884967200954, + "language_loss": 0.83087295, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.90841573, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.15423584, + "step": 5163, + "time_per_iteration": 2.542052745819092 + }, + { + "auxiliary_loss_clip": 0.06490193, + "auxiliary_loss_mlp": 0.01271791, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125559, + "epoch": 0.31047647677739365, + "flos": 21695803551360.0, + "grad_norm": 1.756895513371669, + "language_loss": 0.76630449, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84392428, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16186523, + "step": 5164, + "time_per_iteration": 2.5616652965545654 + }, + { + "auxiliary_loss_clip": 0.06486434, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.0628885, + "balance_loss_mlp": 0.01258331, + "epoch": 0.3105366000300616, + "flos": 18923390709120.0, + "grad_norm": 1.866784827400394, + "language_loss": 0.75307393, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83068419, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16271973, + "step": 5165, + "time_per_iteration": 2.5190699100494385 + }, + { + "auxiliary_loss_clip": 0.06483215, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.062862, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3105967232827296, + "flos": 18266380444800.0, + "grad_norm": 1.5432274368627708, + "language_loss": 0.76476973, + "learning_rate": 3.229371488178348e-06, + "loss": 0.84231985, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.16699219, + "step": 5166, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.06486712, + "auxiliary_loss_mlp": 0.01273485, + "balance_loss_clip": 0.06287863, + "balance_loss_mlp": 0.01256796, + "epoch": 0.31065684653539755, + "flos": 17677279514880.0, + "grad_norm": 2.119255684006569, + "language_loss": 0.74129677, + "learning_rate": 3.229064268360444e-06, + "loss": 0.81889874, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.16687012, + "step": 5167, + "time_per_iteration": 2.5039737224578857 + }, + { + "auxiliary_loss_clip": 0.06378125, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06284033, + "balance_loss_mlp": 0.01258356, + "epoch": 0.3107169697880655, + "flos": 68551522151040.0, + "grad_norm": 0.7172817016896729, + "language_loss": 0.53065968, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.60705864, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.03417969, + "step": 5168, + "time_per_iteration": 3.211498737335205 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127061, + "balance_loss_clip": 0.06290184, + "balance_loss_mlp": 0.01254052, + "epoch": 0.3107770930407335, + "flos": 13193844698880.0, + "grad_norm": 1.7226101243088363, + "language_loss": 0.79536855, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87299311, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16552734, + "step": 5169, + "time_per_iteration": 2.526906728744507 + }, + { + "auxiliary_loss_clip": 0.06491011, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.01254328, + "epoch": 0.3108372162934015, + "flos": 31589587048320.0, + "grad_norm": 1.7384868970357352, + "language_loss": 0.6439994, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.17077637, + "step": 5170, + "time_per_iteration": 2.659008264541626 + }, + { + "auxiliary_loss_clip": 0.06488822, + "auxiliary_loss_mlp": 0.01276189, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01258927, + "epoch": 0.31089733954606946, + "flos": 28737231811200.0, + "grad_norm": 2.2754975952460086, + "language_loss": 0.77238673, + "learning_rate": 3.22783492314295e-06, + "loss": 0.8500368, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17260742, + "step": 5171, + "time_per_iteration": 2.5726847648620605 + }, + { + "auxiliary_loss_clip": 0.06489364, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 0.06290348, + "balance_loss_mlp": 0.01258294, + "epoch": 0.3109574627987374, + "flos": 19689455462400.0, + "grad_norm": 1.774750718996553, + "language_loss": 0.84023309, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.91787583, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16625977, + "step": 5172, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.06485899, + "auxiliary_loss_mlp": 0.01271683, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3110175860514054, + "flos": 14689231390080.0, + "grad_norm": 2.444929493076507, + "language_loss": 0.8466565, + "learning_rate": 3.227219971129842e-06, + "loss": 0.92423236, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17199707, + "step": 5173, + "time_per_iteration": 2.477851629257202 + }, + { + "auxiliary_loss_clip": 0.06478094, + "auxiliary_loss_mlp": 0.01270979, + "balance_loss_clip": 0.06285643, + "balance_loss_mlp": 0.01255279, + "epoch": 0.31107770930407336, + "flos": 25746835772160.0, + "grad_norm": 1.6684709759498597, + "language_loss": 0.83928138, + "learning_rate": 3.226912425313001e-06, + "loss": 0.91677213, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 5174, + "time_per_iteration": 2.6188318729400635 + }, + { + "auxiliary_loss_clip": 0.06483682, + "auxiliary_loss_mlp": 0.0127308, + "balance_loss_clip": 0.06284115, + "balance_loss_mlp": 0.01256057, + "epoch": 0.3111378325567413, + "flos": 19214272558080.0, + "grad_norm": 2.0188284806938945, + "language_loss": 0.85820258, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.93577021, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 5175, + "time_per_iteration": 2.489356756210327 + }, + { + "auxiliary_loss_clip": 0.06477995, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3111979558094093, + "flos": 23703199816320.0, + "grad_norm": 1.907748003287586, + "language_loss": 0.84357607, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92110729, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17773438, + "step": 5176, + "time_per_iteration": 2.599229574203491 + }, + { + "auxiliary_loss_clip": 0.06476277, + "auxiliary_loss_mlp": 0.01273206, + "balance_loss_clip": 0.06279132, + "balance_loss_mlp": 0.01255468, + "epoch": 0.31125807906207725, + "flos": 21039422192640.0, + "grad_norm": 2.9714078029027977, + "language_loss": 0.80720133, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.88469613, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.17736816, + "step": 5177, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.06482373, + "auxiliary_loss_mlp": 0.01272639, + "balance_loss_clip": 0.06283157, + "balance_loss_mlp": 0.01255353, + "epoch": 0.3113182023147452, + "flos": 23083435491840.0, + "grad_norm": 1.9531801027744504, + "language_loss": 0.81037831, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.88792837, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17285156, + "step": 5178, + "time_per_iteration": 2.6086864471435547 + }, + { + "auxiliary_loss_clip": 0.06483644, + "auxiliary_loss_mlp": 0.01276661, + "balance_loss_clip": 0.06283852, + "balance_loss_mlp": 0.01259316, + "epoch": 0.3113783255674132, + "flos": 11843919895680.0, + "grad_norm": 1.9055325557306373, + "language_loss": 0.81524587, + "learning_rate": 3.225373998592471e-06, + "loss": 0.89284897, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.17346191, + "step": 5179, + "time_per_iteration": 2.4582295417785645 + }, + { + "auxiliary_loss_clip": 0.06482498, + "auxiliary_loss_mlp": 0.01272412, + "balance_loss_clip": 0.06285708, + "balance_loss_mlp": 0.01255926, + "epoch": 0.31143844882008115, + "flos": 16295098089600.0, + "grad_norm": 1.625598326664227, + "language_loss": 0.78714401, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.86469316, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.16491699, + "step": 5180, + "time_per_iteration": 2.4980807304382324 + }, + { + "auxiliary_loss_clip": 0.06486566, + "auxiliary_loss_mlp": 0.01274849, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3114985720727491, + "flos": 23223824208000.0, + "grad_norm": 4.8505374097148595, + "language_loss": 0.83649975, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91411394, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.17102051, + "step": 5181, + "time_per_iteration": 2.519810438156128 + }, + { + "auxiliary_loss_clip": 0.0648061, + "auxiliary_loss_mlp": 0.01273344, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01258348, + "epoch": 0.3115586953254171, + "flos": 30052468224000.0, + "grad_norm": 1.6592506395593873, + "language_loss": 0.74442661, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.82196611, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.15002441, + "step": 5182, + "time_per_iteration": 2.6227729320526123 + }, + { + "auxiliary_loss_clip": 0.06490366, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06288615, + "balance_loss_mlp": 0.01254362, + "epoch": 0.3116188185780851, + "flos": 25673433995520.0, + "grad_norm": 2.0195817263542852, + "language_loss": 0.70974112, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.78735352, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16503906, + "step": 5183, + "time_per_iteration": 2.5801775455474854 + }, + { + "auxiliary_loss_clip": 0.06369011, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.0126376, + "epoch": 0.31167894183075306, + "flos": 69528568285440.0, + "grad_norm": 0.9410725627351464, + "language_loss": 0.59133947, + "learning_rate": 3.223834410214408e-06, + "loss": 0.66769892, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.03182983, + "step": 5184, + "time_per_iteration": 3.1446807384490967 + }, + { + "auxiliary_loss_clip": 0.06488199, + "auxiliary_loss_mlp": 0.01277241, + "balance_loss_clip": 0.06288702, + "balance_loss_mlp": 0.01260206, + "epoch": 0.31173906508342103, + "flos": 14945215213440.0, + "grad_norm": 2.5697318046341424, + "language_loss": 0.69689488, + "learning_rate": 3.223526353268311e-06, + "loss": 0.77454925, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17041016, + "step": 5185, + "time_per_iteration": 2.51505446434021 + }, + { + "auxiliary_loss_clip": 0.06492566, + "auxiliary_loss_mlp": 0.01273506, + "balance_loss_clip": 0.06291321, + "balance_loss_mlp": 0.01256507, + "epoch": 0.311799188336089, + "flos": 16180886574720.0, + "grad_norm": 2.500262239817252, + "language_loss": 0.63946617, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.71712691, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.17004395, + "step": 5186, + "time_per_iteration": 2.505030870437622 + }, + { + "auxiliary_loss_clip": 0.06492127, + "auxiliary_loss_mlp": 0.01277284, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01258592, + "epoch": 0.31185931158875696, + "flos": 25016633366400.0, + "grad_norm": 2.1681671670490603, + "language_loss": 0.86641979, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.94411391, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18688965, + "step": 5187, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.06487665, + "auxiliary_loss_mlp": 0.01281669, + "balance_loss_clip": 0.06287494, + "balance_loss_mlp": 0.01264527, + "epoch": 0.3119194348414249, + "flos": 37242041702400.0, + "grad_norm": 1.4465041932602023, + "language_loss": 0.6305244, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70821768, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 5188, + "time_per_iteration": 2.7036139965057373 + }, + { + "auxiliary_loss_clip": 0.06486794, + "auxiliary_loss_mlp": 0.01278194, + "balance_loss_clip": 0.06286722, + "balance_loss_mlp": 0.01261397, + "epoch": 0.3119795580940929, + "flos": 15018155792640.0, + "grad_norm": 2.1005201528303683, + "language_loss": 0.83722234, + "learning_rate": 3.222293661638346e-06, + "loss": 0.91487223, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 5189, + "time_per_iteration": 3.933061361312866 + }, + { + "auxiliary_loss_clip": 0.06481164, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01262602, + "epoch": 0.31203968134676086, + "flos": 16003755043200.0, + "grad_norm": 2.4405990352060862, + "language_loss": 0.79429829, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87189662, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.16064453, + "step": 5190, + "time_per_iteration": 2.479335308074951 + }, + { + "auxiliary_loss_clip": 0.0648755, + "auxiliary_loss_mlp": 0.01275874, + "balance_loss_clip": 0.06287287, + "balance_loss_mlp": 0.01259292, + "epoch": 0.3120998045994288, + "flos": 23843378897280.0, + "grad_norm": 1.451249914697294, + "language_loss": 0.75502658, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.83266091, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16589355, + "step": 5191, + "time_per_iteration": 3.997621536254883 + }, + { + "auxiliary_loss_clip": 0.06364973, + "auxiliary_loss_mlp": 0.01267778, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01264178, + "epoch": 0.3121599278520968, + "flos": 69203081900160.0, + "grad_norm": 0.8286054534369729, + "language_loss": 0.63964236, + "learning_rate": 3.221368656205247e-06, + "loss": 0.71596992, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.03594971, + "step": 5192, + "time_per_iteration": 4.631687879562378 + }, + { + "auxiliary_loss_clip": 0.06487048, + "auxiliary_loss_mlp": 0.01274026, + "balance_loss_clip": 0.06284614, + "balance_loss_mlp": 0.01254916, + "epoch": 0.31222005110476475, + "flos": 23813302481280.0, + "grad_norm": 1.6272414578256373, + "language_loss": 0.80280936, + "learning_rate": 3.221060228416446e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.19116211, + "step": 5193, + "time_per_iteration": 2.5469777584075928 + }, + { + "auxiliary_loss_clip": 0.06487141, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06286725, + "balance_loss_mlp": 0.01255244, + "epoch": 0.3122801743574327, + "flos": 25232771773440.0, + "grad_norm": 1.8740192083695482, + "language_loss": 0.72266662, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.80028057, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.19006348, + "step": 5194, + "time_per_iteration": 2.5416929721832275 + }, + { + "auxiliary_loss_clip": 0.06483766, + "auxiliary_loss_mlp": 0.01273792, + "balance_loss_clip": 0.06285778, + "balance_loss_mlp": 0.01257604, + "epoch": 0.3123402976101007, + "flos": 22973165118720.0, + "grad_norm": 1.4810805631902553, + "language_loss": 0.77076054, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.8483361, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16186523, + "step": 5195, + "time_per_iteration": 2.5890305042266846 + }, + { + "auxiliary_loss_clip": 0.06489303, + "auxiliary_loss_mlp": 0.01273064, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256268, + "epoch": 0.3124004208627687, + "flos": 25199131559040.0, + "grad_norm": 1.3828607146804377, + "language_loss": 0.78218812, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85981178, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16796875, + "step": 5196, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.06360652, + "auxiliary_loss_mlp": 0.0126022, + "balance_loss_clip": 0.06268834, + "balance_loss_mlp": 0.01256831, + "epoch": 0.31246054411543667, + "flos": 67506398974080.0, + "grad_norm": 0.7576873975695796, + "language_loss": 0.54860902, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.62481773, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.03396606, + "step": 5197, + "time_per_iteration": 4.588749170303345 + }, + { + "auxiliary_loss_clip": 0.06482677, + "auxiliary_loss_mlp": 0.0127766, + "balance_loss_clip": 0.06286696, + "balance_loss_mlp": 0.01261424, + "epoch": 0.31252066736810463, + "flos": 17864347754880.0, + "grad_norm": 1.7824095594325715, + "language_loss": 0.67078102, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74838442, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.16247559, + "step": 5198, + "time_per_iteration": 2.5304651260375977 + }, + { + "auxiliary_loss_clip": 0.06490927, + "auxiliary_loss_mlp": 0.01280145, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261596, + "epoch": 0.3125807906207726, + "flos": 18480338645760.0, + "grad_norm": 2.4146329055675264, + "language_loss": 0.70401263, + "learning_rate": 3.219208689735857e-06, + "loss": 0.78172338, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 2.5358517169952393 + }, + { + "auxiliary_loss_clip": 0.06486207, + "auxiliary_loss_mlp": 0.01275953, + "balance_loss_clip": 0.06286721, + "balance_loss_mlp": 0.01258751, + "epoch": 0.31264091387344056, + "flos": 18951454627200.0, + "grad_norm": 1.7917967449154466, + "language_loss": 0.79258394, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.87020558, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.17211914, + "step": 5200, + "time_per_iteration": 2.5519278049468994 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 0.06284697, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3127010371261085, + "flos": 21474591972480.0, + "grad_norm": 1.8808343302197998, + "language_loss": 0.83758473, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.91515636, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.16711426, + "step": 5201, + "time_per_iteration": 2.509331226348877 + }, + { + "auxiliary_loss_clip": 0.06487838, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 0.06288306, + "balance_loss_mlp": 0.01262006, + "epoch": 0.3127611603787765, + "flos": 15340623431040.0, + "grad_norm": 2.173524859167814, + "language_loss": 0.69690537, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17224121, + "step": 5202, + "time_per_iteration": 2.52652907371521 + }, + { + "auxiliary_loss_clip": 0.06486704, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06286184, + "balance_loss_mlp": 0.01261257, + "epoch": 0.31282128363144446, + "flos": 17608741274880.0, + "grad_norm": 2.6038382996561604, + "language_loss": 0.83874559, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.91639626, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.17077637, + "step": 5203, + "time_per_iteration": 2.502721071243286 + }, + { + "auxiliary_loss_clip": 0.06488604, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 0.06287186, + "balance_loss_mlp": 0.01256604, + "epoch": 0.3128814068841124, + "flos": 26763349979520.0, + "grad_norm": 2.412675439541041, + "language_loss": 0.61310971, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69073772, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17602539, + "step": 5204, + "time_per_iteration": 2.62591814994812 + }, + { + "auxiliary_loss_clip": 0.06482827, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 0.0628654, + "balance_loss_mlp": 0.01259553, + "epoch": 0.3129415301367804, + "flos": 22278783133440.0, + "grad_norm": 1.7324044566720012, + "language_loss": 0.66418731, + "learning_rate": 3.217355486684887e-06, + "loss": 0.74176717, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.15582275, + "step": 5205, + "time_per_iteration": 2.512777328491211 + }, + { + "auxiliary_loss_clip": 0.06487758, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 0.06287788, + "balance_loss_mlp": 0.01260021, + "epoch": 0.31300165338944835, + "flos": 26471461881600.0, + "grad_norm": 1.8344199627772577, + "language_loss": 0.77298087, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.85063475, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17614746, + "step": 5206, + "time_per_iteration": 2.5712244510650635 + }, + { + "auxiliary_loss_clip": 0.06485735, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06288184, + "balance_loss_mlp": 0.01255488, + "epoch": 0.3130617766421163, + "flos": 21951116542080.0, + "grad_norm": 2.0121384013718226, + "language_loss": 0.83184564, + "learning_rate": 3.216737382911672e-06, + "loss": 0.90941995, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16210938, + "step": 5207, + "time_per_iteration": 2.5004825592041016 + }, + { + "auxiliary_loss_clip": 0.06481713, + "auxiliary_loss_mlp": 0.01271341, + "balance_loss_clip": 0.06286129, + "balance_loss_mlp": 0.0125489, + "epoch": 0.3131218998947843, + "flos": 23299154628480.0, + "grad_norm": 2.0890442442793478, + "language_loss": 0.71795774, + "learning_rate": 3.216428261810999e-06, + "loss": 0.79548824, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 5208, + "time_per_iteration": 2.5763585567474365 + }, + { + "auxiliary_loss_clip": 0.06485837, + "auxiliary_loss_mlp": 0.01275661, + "balance_loss_clip": 0.06287587, + "balance_loss_mlp": 0.0125927, + "epoch": 0.3131820231474523, + "flos": 21145583715840.0, + "grad_norm": 1.890905451265213, + "language_loss": 0.74832964, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82594466, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.1640625, + "step": 5209, + "time_per_iteration": 2.510582685470581 + }, + { + "auxiliary_loss_clip": 0.06483819, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06284019, + "balance_loss_mlp": 0.01255678, + "epoch": 0.31324214640012027, + "flos": 23915816352000.0, + "grad_norm": 1.8368712630160764, + "language_loss": 0.77846575, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.85602105, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16027832, + "step": 5210, + "time_per_iteration": 2.5457394123077393 + }, + { + "auxiliary_loss_clip": 0.06472643, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01262963, + "epoch": 0.31330226965278823, + "flos": 22243507764480.0, + "grad_norm": 1.7690758446531836, + "language_loss": 0.79563594, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.87314838, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15643311, + "step": 5211, + "time_per_iteration": 2.5383517742156982 + }, + { + "auxiliary_loss_clip": 0.0648172, + "auxiliary_loss_mlp": 0.01270065, + "balance_loss_clip": 0.06285914, + "balance_loss_mlp": 0.01254699, + "epoch": 0.3133623929054562, + "flos": 19759838492160.0, + "grad_norm": 1.6892345584465767, + "language_loss": 0.79993588, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.87745374, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.15368652, + "step": 5212, + "time_per_iteration": 2.5550856590270996 + }, + { + "auxiliary_loss_clip": 0.06489062, + "auxiliary_loss_mlp": 0.01276168, + "balance_loss_clip": 0.06287421, + "balance_loss_mlp": 0.01258919, + "epoch": 0.31342251615812416, + "flos": 27169617300480.0, + "grad_norm": 2.030797991853156, + "language_loss": 0.71651685, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.79416913, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.17248535, + "step": 5213, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.06486979, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06285015, + "balance_loss_mlp": 0.01258763, + "epoch": 0.31348263941079213, + "flos": 20235985718400.0, + "grad_norm": 2.164105834219518, + "language_loss": 0.77949297, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.85711956, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16918945, + "step": 5214, + "time_per_iteration": 2.539149761199951 + }, + { + "auxiliary_loss_clip": 0.06478322, + "auxiliary_loss_mlp": 0.0127674, + "balance_loss_clip": 0.06285194, + "balance_loss_mlp": 0.01261267, + "epoch": 0.3135427626634601, + "flos": 24614474895360.0, + "grad_norm": 1.5354860146289633, + "language_loss": 0.82935429, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90690494, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.15472412, + "step": 5215, + "time_per_iteration": 2.541269302368164 + }, + { + "auxiliary_loss_clip": 0.06486098, + "auxiliary_loss_mlp": 0.01273565, + "balance_loss_clip": 0.06288007, + "balance_loss_mlp": 0.01257186, + "epoch": 0.31360288591612806, + "flos": 20966230051200.0, + "grad_norm": 1.8278899125375987, + "language_loss": 0.79790628, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87550294, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16369629, + "step": 5216, + "time_per_iteration": 2.5465261936187744 + }, + { + "auxiliary_loss_clip": 0.06489767, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.06286536, + "balance_loss_mlp": 0.01258722, + "epoch": 0.313663009168796, + "flos": 26987957648640.0, + "grad_norm": 1.8964979694160957, + "language_loss": 0.68953168, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76720947, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.19299316, + "step": 5217, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.06480299, + "auxiliary_loss_mlp": 0.01275451, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01259298, + "epoch": 0.313723132421464, + "flos": 18046762093440.0, + "grad_norm": 1.6389262097165689, + "language_loss": 0.80772746, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.88528496, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16149902, + "step": 5218, + "time_per_iteration": 2.5255727767944336 + }, + { + "auxiliary_loss_clip": 0.06485314, + "auxiliary_loss_mlp": 0.0127641, + "balance_loss_clip": 0.06285116, + "balance_loss_mlp": 0.01259363, + "epoch": 0.31378325567413196, + "flos": 22494963467520.0, + "grad_norm": 2.253901481236794, + "language_loss": 0.70057523, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.17047119, + "step": 5219, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.06483484, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06284904, + "balance_loss_mlp": 0.01259181, + "epoch": 0.3138433789267999, + "flos": 22425838248960.0, + "grad_norm": 1.9320324134388631, + "language_loss": 0.80156839, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.17736816, + "step": 5220, + "time_per_iteration": 2.5364530086517334 + }, + { + "auxiliary_loss_clip": 0.06484166, + "auxiliary_loss_mlp": 0.01276534, + "balance_loss_clip": 0.06287254, + "balance_loss_mlp": 0.01260751, + "epoch": 0.3139035021794679, + "flos": 13010927235840.0, + "grad_norm": 1.8390249578816682, + "language_loss": 0.73235905, + "learning_rate": 3.212405494206986e-06, + "loss": 0.80996603, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.15771484, + "step": 5221, + "time_per_iteration": 2.477369546890259 + }, + { + "auxiliary_loss_clip": 0.06480553, + "auxiliary_loss_mlp": 0.0127616, + "balance_loss_clip": 0.0628504, + "balance_loss_mlp": 0.0125996, + "epoch": 0.31396362543213585, + "flos": 16951605229440.0, + "grad_norm": 1.9354629264259422, + "language_loss": 0.81906354, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.89663064, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16223145, + "step": 5222, + "time_per_iteration": 2.5057129859924316 + }, + { + "auxiliary_loss_clip": 0.06490297, + "auxiliary_loss_mlp": 0.01284294, + "balance_loss_clip": 0.06288279, + "balance_loss_mlp": 0.01266555, + "epoch": 0.31402374868480387, + "flos": 20162877431040.0, + "grad_norm": 1.9084075298763516, + "language_loss": 0.70490289, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.78264874, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17749023, + "step": 5223, + "time_per_iteration": 2.4747233390808105 + }, + { + "auxiliary_loss_clip": 0.06484593, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06287414, + "balance_loss_mlp": 0.01259718, + "epoch": 0.31408387193747184, + "flos": 21257363462400.0, + "grad_norm": 1.5262001080385015, + "language_loss": 0.80608702, + "learning_rate": 3.211476058893379e-06, + "loss": 0.88369542, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.1652832, + "step": 5224, + "time_per_iteration": 2.576864004135132 + }, + { + "auxiliary_loss_clip": 0.06497495, + "auxiliary_loss_mlp": 0.01279621, + "balance_loss_clip": 0.06291461, + "balance_loss_mlp": 0.01261632, + "epoch": 0.3141439951901398, + "flos": 27490617492480.0, + "grad_norm": 2.962077450034062, + "language_loss": 0.58624607, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66401726, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17993164, + "step": 5225, + "time_per_iteration": 2.558159828186035 + }, + { + "auxiliary_loss_clip": 0.06482717, + "auxiliary_loss_mlp": 0.0128044, + "balance_loss_clip": 0.06289019, + "balance_loss_mlp": 0.0126505, + "epoch": 0.31420411844280777, + "flos": 17857010522880.0, + "grad_norm": 1.7568792542410607, + "language_loss": 0.81975454, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.89738619, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.15380859, + "step": 5226, + "time_per_iteration": 2.5197925567626953 + }, + { + "auxiliary_loss_clip": 0.06493273, + "auxiliary_loss_mlp": 0.01283534, + "balance_loss_clip": 0.0629416, + "balance_loss_mlp": 0.01265998, + "epoch": 0.31426424169547573, + "flos": 21623491877760.0, + "grad_norm": 1.9094319640845634, + "language_loss": 0.74358761, + "learning_rate": 3.210546210126141e-06, + "loss": 0.8213557, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17529297, + "step": 5227, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.06493893, + "auxiliary_loss_mlp": 0.01287677, + "balance_loss_clip": 0.0629607, + "balance_loss_mlp": 0.01270392, + "epoch": 0.3143243649481437, + "flos": 30928677569280.0, + "grad_norm": 1.9492252245216757, + "language_loss": 0.68802202, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76583767, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.17297363, + "step": 5228, + "time_per_iteration": 2.724705934524536 + }, + { + "auxiliary_loss_clip": 0.06488988, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01265044, + "epoch": 0.31438448820081166, + "flos": 22828206355200.0, + "grad_norm": 1.7089427628420442, + "language_loss": 0.80276144, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88046199, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16015625, + "step": 5229, + "time_per_iteration": 4.091265678405762 + }, + { + "auxiliary_loss_clip": 0.06481495, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.01259428, + "epoch": 0.3144446114534796, + "flos": 23298399941760.0, + "grad_norm": 1.658320923858175, + "language_loss": 0.70112014, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7786932, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.16381836, + "step": 5230, + "time_per_iteration": 2.5652499198913574 + }, + { + "auxiliary_loss_clip": 0.06489812, + "auxiliary_loss_mlp": 0.01281571, + "balance_loss_clip": 0.06291179, + "balance_loss_mlp": 0.01264572, + "epoch": 0.3145047347061476, + "flos": 31363679640960.0, + "grad_norm": 2.930398163442548, + "language_loss": 0.80236816, + "learning_rate": 3.209305769168239e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.17004395, + "step": 5231, + "time_per_iteration": 5.461926698684692 + }, + { + "auxiliary_loss_clip": 0.06483024, + "auxiliary_loss_mlp": 0.01279077, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01262912, + "epoch": 0.31456485795881556, + "flos": 10894182992640.0, + "grad_norm": 3.377505802107346, + "language_loss": 0.85102671, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92864776, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16149902, + "step": 5232, + "time_per_iteration": 2.549555778503418 + }, + { + "auxiliary_loss_clip": 0.06479923, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01269779, + "epoch": 0.3146249812114835, + "flos": 17098157220480.0, + "grad_norm": 1.5771176865385883, + "language_loss": 0.80666757, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.88433212, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5233, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.06496342, + "auxiliary_loss_mlp": 0.01276742, + "balance_loss_clip": 0.06294576, + "balance_loss_mlp": 0.01260768, + "epoch": 0.3146851044641515, + "flos": 55303283352960.0, + "grad_norm": 1.6501859452394316, + "language_loss": 0.71124518, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.78897607, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15966797, + "step": 5234, + "time_per_iteration": 2.8301026821136475 + }, + { + "auxiliary_loss_clip": 0.06491733, + "auxiliary_loss_mlp": 0.01276589, + "balance_loss_clip": 0.06292239, + "balance_loss_mlp": 0.01259566, + "epoch": 0.31474522771681945, + "flos": 27023149163520.0, + "grad_norm": 1.9231261360365097, + "language_loss": 0.73437119, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8120544, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17004395, + "step": 5235, + "time_per_iteration": 2.543799638748169 + }, + { + "auxiliary_loss_clip": 0.0648193, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.0628682, + "balance_loss_mlp": 0.01259308, + "epoch": 0.3148053509694875, + "flos": 21258369711360.0, + "grad_norm": 1.9283939280374622, + "language_loss": 0.79554284, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.87311482, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.15942383, + "step": 5236, + "time_per_iteration": 2.5356431007385254 + }, + { + "auxiliary_loss_clip": 0.06493077, + "auxiliary_loss_mlp": 0.01277667, + "balance_loss_clip": 0.06288847, + "balance_loss_mlp": 0.01260942, + "epoch": 0.31486547422215544, + "flos": 31256721504000.0, + "grad_norm": 2.880510555000243, + "language_loss": 0.76337612, + "learning_rate": 3.207443732256881e-06, + "loss": 0.84108353, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16723633, + "step": 5237, + "time_per_iteration": 4.129598379135132 + }, + { + "auxiliary_loss_clip": 0.0648271, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06291585, + "balance_loss_mlp": 0.01262843, + "epoch": 0.3149255974748234, + "flos": 19834749642240.0, + "grad_norm": 1.6736027402410734, + "language_loss": 0.7951014, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.87270594, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.14916992, + "step": 5238, + "time_per_iteration": 2.504612445831299 + }, + { + "auxiliary_loss_clip": 0.06376656, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06282751, + "balance_loss_mlp": 0.01263604, + "epoch": 0.31498572072749137, + "flos": 67701867350400.0, + "grad_norm": 0.8276402478045692, + "language_loss": 0.68007928, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.75652325, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.04141235, + "step": 5239, + "time_per_iteration": 3.174287796020508 + }, + { + "auxiliary_loss_clip": 0.06498836, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 0.06292844, + "balance_loss_mlp": 0.01256376, + "epoch": 0.31504584398015933, + "flos": 19799432346240.0, + "grad_norm": 2.176171670908613, + "language_loss": 0.82951081, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.9072417, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17883301, + "step": 5240, + "time_per_iteration": 2.509793996810913 + }, + { + "auxiliary_loss_clip": 0.06485248, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06288239, + "balance_loss_mlp": 0.01262125, + "epoch": 0.3151059672328273, + "flos": 26622751628160.0, + "grad_norm": 1.8077188253124041, + "language_loss": 0.81193888, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88957721, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.16455078, + "step": 5241, + "time_per_iteration": 2.571192502975464 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01277268, + "balance_loss_clip": 0.06291743, + "balance_loss_mlp": 0.01260912, + "epoch": 0.31516609048549526, + "flos": 24210890904960.0, + "grad_norm": 1.4478120037649602, + "language_loss": 0.74484038, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82243454, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16357422, + "step": 5242, + "time_per_iteration": 2.526357650756836 + }, + { + "auxiliary_loss_clip": 0.06487267, + "auxiliary_loss_mlp": 0.01275494, + "balance_loss_clip": 0.06292535, + "balance_loss_mlp": 0.01259163, + "epoch": 0.31522621373816323, + "flos": 25965950999040.0, + "grad_norm": 1.6442244241642663, + "language_loss": 0.73668325, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81431091, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.16320801, + "step": 5243, + "time_per_iteration": 2.606276273727417 + }, + { + "auxiliary_loss_clip": 0.06485401, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.0628818, + "balance_loss_mlp": 0.0125713, + "epoch": 0.3152863369908312, + "flos": 21915379975680.0, + "grad_norm": 1.7357669101009914, + "language_loss": 0.64914608, + "learning_rate": 3.205269272758513e-06, + "loss": 0.72673857, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16711426, + "step": 5244, + "time_per_iteration": 2.5950305461883545 + }, + { + "auxiliary_loss_clip": 0.06492754, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 0.06292984, + "balance_loss_mlp": 0.01257743, + "epoch": 0.31534646024349916, + "flos": 16285203308160.0, + "grad_norm": 2.8540583379791005, + "language_loss": 0.91357732, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.99124765, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16540527, + "step": 5245, + "time_per_iteration": 2.510085105895996 + }, + { + "auxiliary_loss_clip": 0.06488977, + "auxiliary_loss_mlp": 0.01277309, + "balance_loss_clip": 0.06291293, + "balance_loss_mlp": 0.01260596, + "epoch": 0.3154065834961671, + "flos": 24724116362880.0, + "grad_norm": 1.9445780779956967, + "language_loss": 0.75699973, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.83466256, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.1673584, + "step": 5246, + "time_per_iteration": 2.543600559234619 + }, + { + "auxiliary_loss_clip": 0.06488622, + "auxiliary_loss_mlp": 0.01279725, + "balance_loss_clip": 0.06290317, + "balance_loss_mlp": 0.01262833, + "epoch": 0.3154667067488351, + "flos": 35379813836160.0, + "grad_norm": 1.6152414177037249, + "language_loss": 0.61608225, + "learning_rate": 3.204336675750321e-06, + "loss": 0.69376576, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16882324, + "step": 5247, + "time_per_iteration": 2.6849827766418457 + }, + { + "auxiliary_loss_clip": 0.06491058, + "auxiliary_loss_mlp": 0.01281873, + "balance_loss_clip": 0.06290263, + "balance_loss_mlp": 0.0126417, + "epoch": 0.31552683000150306, + "flos": 17462105429760.0, + "grad_norm": 2.6938697298202667, + "language_loss": 0.82848823, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.90621758, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.17687988, + "step": 5248, + "time_per_iteration": 2.4956586360931396 + }, + { + "auxiliary_loss_clip": 0.06488842, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06291078, + "balance_loss_mlp": 0.01264121, + "epoch": 0.3155869532541711, + "flos": 18411674624640.0, + "grad_norm": 4.654519722073602, + "language_loss": 0.85721719, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.93492711, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.18029785, + "step": 5249, + "time_per_iteration": 2.568054437637329 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 0.06290483, + "balance_loss_mlp": 0.01261198, + "epoch": 0.31564707650683904, + "flos": 21586162083840.0, + "grad_norm": 1.7795262086342007, + "language_loss": 0.86067384, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.93837023, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1887207, + "step": 5250, + "time_per_iteration": 2.508528709411621 + }, + { + "auxiliary_loss_clip": 0.06486481, + "auxiliary_loss_mlp": 0.01279989, + "balance_loss_clip": 0.06289366, + "balance_loss_mlp": 0.01262334, + "epoch": 0.315707199759507, + "flos": 21037032351360.0, + "grad_norm": 2.1261014211455063, + "language_loss": 0.6942147, + "learning_rate": 3.203092573767835e-06, + "loss": 0.77187943, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1763916, + "step": 5251, + "time_per_iteration": 2.526685953140259 + }, + { + "auxiliary_loss_clip": 0.06487083, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06288725, + "balance_loss_mlp": 0.01255586, + "epoch": 0.31576732301217497, + "flos": 26835326236800.0, + "grad_norm": 2.019211823887184, + "language_loss": 0.78895354, + "learning_rate": 3.202781434189246e-06, + "loss": 0.86655623, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17602539, + "step": 5252, + "time_per_iteration": 2.570160150527954 + }, + { + "auxiliary_loss_clip": 0.06486022, + "auxiliary_loss_mlp": 0.01277329, + "balance_loss_clip": 0.06289184, + "balance_loss_mlp": 0.01261664, + "epoch": 0.31582744626484294, + "flos": 22717810200960.0, + "grad_norm": 1.5436537660689573, + "language_loss": 0.74377203, + "learning_rate": 3.202470249001066e-06, + "loss": 0.82140553, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.15661621, + "step": 5253, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.06489179, + "auxiliary_loss_mlp": 0.01281773, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01264309, + "epoch": 0.3158875695175109, + "flos": 23958806296320.0, + "grad_norm": 1.6773864910066614, + "language_loss": 0.73971915, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 5254, + "time_per_iteration": 2.588543653488159 + }, + { + "auxiliary_loss_clip": 0.06491473, + "auxiliary_loss_mlp": 0.01275265, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01257408, + "epoch": 0.31594769277017887, + "flos": 13267036840320.0, + "grad_norm": 2.7381317978754933, + "language_loss": 0.78115344, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85882092, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17858887, + "step": 5255, + "time_per_iteration": 2.5159435272216797 + }, + { + "auxiliary_loss_clip": 0.0648552, + "auxiliary_loss_mlp": 0.01275031, + "balance_loss_clip": 0.06288838, + "balance_loss_mlp": 0.01255921, + "epoch": 0.31600781602284683, + "flos": 23375072027520.0, + "grad_norm": 2.9601180138118286, + "language_loss": 0.78838313, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.86598861, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.19104004, + "step": 5256, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.06480406, + "auxiliary_loss_mlp": 0.01272902, + "balance_loss_clip": 0.06291319, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3160679392755148, + "flos": 19834707715200.0, + "grad_norm": 1.443888473305352, + "language_loss": 0.71476674, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.79229981, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15533447, + "step": 5257, + "time_per_iteration": 2.515044927597046 + }, + { + "auxiliary_loss_clip": 0.06490695, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 0.06291541, + "balance_loss_mlp": 0.01257787, + "epoch": 0.31612806252818276, + "flos": 20199368684160.0, + "grad_norm": 3.1125237193001967, + "language_loss": 0.77181315, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.84947205, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17419434, + "step": 5258, + "time_per_iteration": 2.544926166534424 + }, + { + "auxiliary_loss_clip": 0.06484105, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 0.06286652, + "balance_loss_mlp": 0.01258624, + "epoch": 0.31618818578085073, + "flos": 24241596226560.0, + "grad_norm": 2.554871248122792, + "language_loss": 0.73012489, + "learning_rate": 3.200602180731467e-06, + "loss": 0.80772901, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.17675781, + "step": 5259, + "time_per_iteration": 2.5244109630584717 + }, + { + "auxiliary_loss_clip": 0.06490766, + "auxiliary_loss_mlp": 0.01272581, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01256106, + "epoch": 0.3162483090335187, + "flos": 25088735404800.0, + "grad_norm": 2.502439629336286, + "language_loss": 0.66774327, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74537671, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16455078, + "step": 5260, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.06481651, + "auxiliary_loss_mlp": 0.01272837, + "balance_loss_clip": 0.06285223, + "balance_loss_mlp": 0.01256386, + "epoch": 0.31630843228618666, + "flos": 26330653895040.0, + "grad_norm": 2.0766337978972023, + "language_loss": 0.72817439, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80571926, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16455078, + "step": 5261, + "time_per_iteration": 2.559112548828125 + }, + { + "auxiliary_loss_clip": 0.06366719, + "auxiliary_loss_mlp": 0.01254616, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01250792, + "epoch": 0.3163685555388547, + "flos": 66780053856000.0, + "grad_norm": 0.7132570662369885, + "language_loss": 0.50697625, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.58318961, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03817749, + "step": 5262, + "time_per_iteration": 3.1381468772888184 + }, + { + "auxiliary_loss_clip": 0.06487425, + "auxiliary_loss_mlp": 0.01273056, + "balance_loss_clip": 0.06289163, + "balance_loss_mlp": 0.01256224, + "epoch": 0.31642867879152264, + "flos": 26002987303680.0, + "grad_norm": 1.713052875923359, + "language_loss": 0.85966682, + "learning_rate": 3.19935589118856e-06, + "loss": 0.9372716, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.16833496, + "step": 5263, + "time_per_iteration": 2.5385844707489014 + }, + { + "auxiliary_loss_clip": 0.0647549, + "auxiliary_loss_mlp": 0.01273956, + "balance_loss_clip": 0.06283621, + "balance_loss_mlp": 0.01257695, + "epoch": 0.3164888020441906, + "flos": 25781943432960.0, + "grad_norm": 1.4697461293234868, + "language_loss": 0.82077682, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.89827132, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.16247559, + "step": 5264, + "time_per_iteration": 2.558708429336548 + }, + { + "auxiliary_loss_clip": 0.06488511, + "auxiliary_loss_mlp": 0.01271533, + "balance_loss_clip": 0.06288397, + "balance_loss_mlp": 0.01254117, + "epoch": 0.3165489252968586, + "flos": 19762437968640.0, + "grad_norm": 1.8601211050375244, + "language_loss": 0.80259931, + "learning_rate": 3.19873247349167e-06, + "loss": 0.88019973, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17419434, + "step": 5265, + "time_per_iteration": 2.492342948913574 + }, + { + "auxiliary_loss_clip": 0.06481829, + "auxiliary_loss_mlp": 0.01275233, + "balance_loss_clip": 0.06283312, + "balance_loss_mlp": 0.01257148, + "epoch": 0.31660904854952654, + "flos": 23190393628800.0, + "grad_norm": 2.032053662698869, + "language_loss": 0.75410831, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.83167893, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1809082, + "step": 5266, + "time_per_iteration": 2.5563931465148926 + }, + { + "auxiliary_loss_clip": 0.06488708, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 0.06287502, + "balance_loss_mlp": 0.01258308, + "epoch": 0.3166691718021945, + "flos": 20414081571840.0, + "grad_norm": 2.020882594632444, + "language_loss": 0.79489279, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.87254804, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.18518066, + "step": 5267, + "time_per_iteration": 2.509413242340088 + }, + { + "auxiliary_loss_clip": 0.06371635, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01251359, + "epoch": 0.31672929505486247, + "flos": 70165816185600.0, + "grad_norm": 1.145238273522293, + "language_loss": 0.57623893, + "learning_rate": 3.197797006055478e-06, + "loss": 0.65250397, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03518677, + "step": 5268, + "time_per_iteration": 4.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.06486145, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06287054, + "balance_loss_mlp": 0.01253884, + "epoch": 0.31678941830753043, + "flos": 14360977820160.0, + "grad_norm": 2.2953322915245784, + "language_loss": 0.73492396, + "learning_rate": 3.197485092719815e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.17651367, + "step": 5269, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.06490922, + "auxiliary_loss_mlp": 0.01279355, + "balance_loss_clip": 0.06295022, + "balance_loss_mlp": 0.01261652, + "epoch": 0.3168495415601984, + "flos": 22754385308160.0, + "grad_norm": 1.8930521062253438, + "language_loss": 0.80391312, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.88161588, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.17700195, + "step": 5270, + "time_per_iteration": 4.030852794647217 + }, + { + "auxiliary_loss_clip": 0.0648749, + "auxiliary_loss_mlp": 0.01275027, + "balance_loss_clip": 0.06288311, + "balance_loss_mlp": 0.01257742, + "epoch": 0.31690966481286637, + "flos": 20120558319360.0, + "grad_norm": 2.0275703030815744, + "language_loss": 0.79860884, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87623405, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17285156, + "step": 5271, + "time_per_iteration": 3.963491201400757 + }, + { + "auxiliary_loss_clip": 0.06485552, + "auxiliary_loss_mlp": 0.01274595, + "balance_loss_clip": 0.06286864, + "balance_loss_mlp": 0.01256344, + "epoch": 0.31696978806553433, + "flos": 21185345278080.0, + "grad_norm": 2.0532864997035616, + "language_loss": 0.7348994, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.18237305, + "step": 5272, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01275072, + "balance_loss_clip": 0.06294467, + "balance_loss_mlp": 0.01255629, + "epoch": 0.3170299113182023, + "flos": 43007030789760.0, + "grad_norm": 2.3636013379780083, + "language_loss": 0.69916022, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.19458008, + "step": 5273, + "time_per_iteration": 2.8313193321228027 + }, + { + "auxiliary_loss_clip": 0.0648469, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06288255, + "balance_loss_mlp": 0.01255954, + "epoch": 0.31709003457087026, + "flos": 24466707020160.0, + "grad_norm": 3.373298123766896, + "language_loss": 0.68486917, + "learning_rate": 3.195924845146795e-06, + "loss": 0.76244098, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 5274, + "time_per_iteration": 2.5647053718566895 + }, + { + "auxiliary_loss_clip": 0.06486842, + "auxiliary_loss_mlp": 0.01272159, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.01256114, + "epoch": 0.3171501578235382, + "flos": 24142394592000.0, + "grad_norm": 1.437173314012816, + "language_loss": 0.8105545, + "learning_rate": 3.195612659536081e-06, + "loss": 0.88814449, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.16052246, + "step": 5275, + "time_per_iteration": 2.545689821243286 + }, + { + "auxiliary_loss_clip": 0.06496362, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.0629561, + "balance_loss_mlp": 0.01254296, + "epoch": 0.31721028107620625, + "flos": 18885641644800.0, + "grad_norm": 1.7797970991839078, + "language_loss": 0.73459136, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81228, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18212891, + "step": 5276, + "time_per_iteration": 3.978994131088257 + }, + { + "auxiliary_loss_clip": 0.06480486, + "auxiliary_loss_mlp": 0.01276369, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01259811, + "epoch": 0.3172704043288742, + "flos": 23154405500160.0, + "grad_norm": 1.4192945576637652, + "language_loss": 0.78409082, + "learning_rate": 3.194988152313236e-06, + "loss": 0.86165935, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.16552734, + "step": 5277, + "time_per_iteration": 2.6181840896606445 + }, + { + "auxiliary_loss_clip": 0.06493685, + "auxiliary_loss_mlp": 0.01273951, + "balance_loss_clip": 0.06294833, + "balance_loss_mlp": 0.01256653, + "epoch": 0.3173305275815422, + "flos": 17864347754880.0, + "grad_norm": 1.9934204528772321, + "language_loss": 0.79709554, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87477195, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17297363, + "step": 5278, + "time_per_iteration": 2.4955894947052 + }, + { + "auxiliary_loss_clip": 0.06380783, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01260886, + "epoch": 0.31739065083421014, + "flos": 59988083529600.0, + "grad_norm": 0.841903886868049, + "language_loss": 0.62797457, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.7044335, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.04226685, + "step": 5279, + "time_per_iteration": 2.920987367630005 + }, + { + "auxiliary_loss_clip": 0.06489395, + "auxiliary_loss_mlp": 0.01285376, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01265265, + "epoch": 0.3174507740868781, + "flos": 23807013425280.0, + "grad_norm": 2.0709232065681475, + "language_loss": 0.81487882, + "learning_rate": 3.194051051653053e-06, + "loss": 0.89262652, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.2010498, + "step": 5280, + "time_per_iteration": 2.537612199783325 + }, + { + "auxiliary_loss_clip": 0.06483282, + "auxiliary_loss_mlp": 0.01281645, + "balance_loss_clip": 0.06291374, + "balance_loss_mlp": 0.01264276, + "epoch": 0.31751089733954607, + "flos": 27646728848640.0, + "grad_norm": 1.437826441265799, + "language_loss": 0.78464299, + "learning_rate": 3.19373859419346e-06, + "loss": 0.86229229, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.17358398, + "step": 5281, + "time_per_iteration": 2.6482186317443848 + }, + { + "auxiliary_loss_clip": 0.06485789, + "auxiliary_loss_mlp": 0.01283007, + "balance_loss_clip": 0.06290175, + "balance_loss_mlp": 0.01265424, + "epoch": 0.31757102059221404, + "flos": 23776098468480.0, + "grad_norm": 1.5338111796323235, + "language_loss": 0.78882301, + "learning_rate": 3.193426091467179e-06, + "loss": 0.86651099, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17590332, + "step": 5282, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.06494205, + "auxiliary_loss_mlp": 0.01276135, + "balance_loss_clip": 0.0629286, + "balance_loss_mlp": 0.01258373, + "epoch": 0.317631143844882, + "flos": 25271485159680.0, + "grad_norm": 2.0006947857157753, + "language_loss": 0.67952389, + "learning_rate": 3.193113543486061e-06, + "loss": 0.7572273, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1776123, + "step": 5283, + "time_per_iteration": 2.565925359725952 + }, + { + "auxiliary_loss_clip": 0.06373101, + "auxiliary_loss_mlp": 0.01271528, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01267352, + "epoch": 0.31769126709754997, + "flos": 55841832743040.0, + "grad_norm": 0.7241871595116953, + "language_loss": 0.52631503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.60276127, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04177856, + "step": 5284, + "time_per_iteration": 3.1037213802337646 + }, + { + "auxiliary_loss_clip": 0.0649649, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 0.06291351, + "balance_loss_mlp": 0.01257225, + "epoch": 0.31775139035021793, + "flos": 16696124530560.0, + "grad_norm": 2.2460762000689294, + "language_loss": 0.70842284, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.78613091, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.17102051, + "step": 5285, + "time_per_iteration": 2.5407655239105225 + }, + { + "auxiliary_loss_clip": 0.06366412, + "auxiliary_loss_mlp": 0.01262401, + "balance_loss_clip": 0.06274283, + "balance_loss_mlp": 0.01258384, + "epoch": 0.3178115136028859, + "flos": 64246141261440.0, + "grad_norm": 1.0137073922687154, + "language_loss": 0.60545647, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.68174458, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04016113, + "step": 5286, + "time_per_iteration": 3.1833202838897705 + }, + { + "auxiliary_loss_clip": 0.06498363, + "auxiliary_loss_mlp": 0.01284909, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01267051, + "epoch": 0.31787163685555386, + "flos": 18703395014400.0, + "grad_norm": 1.7319286904547555, + "language_loss": 0.72404122, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80187392, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17871094, + "step": 5287, + "time_per_iteration": 2.50571608543396 + }, + { + "auxiliary_loss_clip": 0.06495041, + "auxiliary_loss_mlp": 0.01276683, + "balance_loss_clip": 0.06292516, + "balance_loss_mlp": 0.012578, + "epoch": 0.31793176010822183, + "flos": 21331184509440.0, + "grad_norm": 1.978321388726588, + "language_loss": 0.76231503, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84003228, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.18884277, + "step": 5288, + "time_per_iteration": 2.5568416118621826 + }, + { + "auxiliary_loss_clip": 0.06485806, + "auxiliary_loss_mlp": 0.01283528, + "balance_loss_clip": 0.06293501, + "balance_loss_mlp": 0.01267816, + "epoch": 0.31799188336088985, + "flos": 20964846458880.0, + "grad_norm": 1.7076221862053031, + "language_loss": 0.88265222, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.96034551, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.15710449, + "step": 5289, + "time_per_iteration": 2.5359349250793457 + }, + { + "auxiliary_loss_clip": 0.06488061, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 0.06295781, + "balance_loss_mlp": 0.01269724, + "epoch": 0.3180520066135578, + "flos": 22498485338880.0, + "grad_norm": 1.4069348748047803, + "language_loss": 0.68210149, + "learning_rate": 3.190924441478572e-06, + "loss": 0.75984859, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16906738, + "step": 5290, + "time_per_iteration": 2.5393311977386475 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.0128386, + "balance_loss_clip": 0.06290419, + "balance_loss_mlp": 0.01265788, + "epoch": 0.3181121298662258, + "flos": 27242725587840.0, + "grad_norm": 3.4346413288346, + "language_loss": 0.79944348, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.87722754, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18066406, + "step": 5291, + "time_per_iteration": 2.564091444015503 + }, + { + "auxiliary_loss_clip": 0.06485635, + "auxiliary_loss_mlp": 0.01278435, + "balance_loss_clip": 0.06287642, + "balance_loss_mlp": 0.01259361, + "epoch": 0.31817225311889374, + "flos": 23185991289600.0, + "grad_norm": 2.0451390273410004, + "language_loss": 0.79931051, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.87695122, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.19067383, + "step": 5292, + "time_per_iteration": 2.743156671524048 + }, + { + "auxiliary_loss_clip": 0.06476898, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06287324, + "balance_loss_mlp": 0.01258044, + "epoch": 0.3182323763715617, + "flos": 23265598268160.0, + "grad_norm": 1.819133879513315, + "language_loss": 0.75602406, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8335436, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17004395, + "step": 5293, + "time_per_iteration": 2.523386001586914 + }, + { + "auxiliary_loss_clip": 0.06482453, + "auxiliary_loss_mlp": 0.01276012, + "balance_loss_clip": 0.06290737, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3182924996242297, + "flos": 29023292050560.0, + "grad_norm": 2.0524562129349526, + "language_loss": 0.75145984, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82904446, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15808105, + "step": 5294, + "time_per_iteration": 2.607849597930908 + }, + { + "auxiliary_loss_clip": 0.06489888, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259201, + "epoch": 0.31835262287689764, + "flos": 20455478288640.0, + "grad_norm": 2.029675905915872, + "language_loss": 0.76497674, + "learning_rate": 3.189359442151152e-06, + "loss": 0.84265351, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.18591309, + "step": 5295, + "time_per_iteration": 2.4980461597442627 + }, + { + "auxiliary_loss_clip": 0.06494178, + "auxiliary_loss_mlp": 0.01278535, + "balance_loss_clip": 0.06293284, + "balance_loss_mlp": 0.01261166, + "epoch": 0.3184127461295656, + "flos": 25126568323200.0, + "grad_norm": 2.03182891885516, + "language_loss": 0.70142519, + "learning_rate": 3.189046306936296e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17358398, + "step": 5296, + "time_per_iteration": 2.610671043395996 + }, + { + "auxiliary_loss_clip": 0.06483515, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 0.0628704, + "balance_loss_mlp": 0.01258371, + "epoch": 0.31847286938223357, + "flos": 25557377690880.0, + "grad_norm": 1.5251920176335134, + "language_loss": 0.77957898, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85716307, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 5297, + "time_per_iteration": 2.539649486541748 + }, + { + "auxiliary_loss_clip": 0.06479752, + "auxiliary_loss_mlp": 0.01272766, + "balance_loss_clip": 0.06283344, + "balance_loss_mlp": 0.01255516, + "epoch": 0.31853299263490154, + "flos": 27789926676480.0, + "grad_norm": 1.8177911904554251, + "language_loss": 0.80074358, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.87826872, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17248535, + "step": 5298, + "time_per_iteration": 2.6127634048461914 + }, + { + "auxiliary_loss_clip": 0.06487016, + "auxiliary_loss_mlp": 0.0127216, + "balance_loss_clip": 0.06284906, + "balance_loss_mlp": 0.01254815, + "epoch": 0.3185931158875695, + "flos": 22712653175040.0, + "grad_norm": 1.6158824069779534, + "language_loss": 0.74615932, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.82375109, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.17346191, + "step": 5299, + "time_per_iteration": 2.570178508758545 + }, + { + "auxiliary_loss_clip": 0.06491919, + "auxiliary_loss_mlp": 0.01275355, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.01258249, + "epoch": 0.31865323914023747, + "flos": 24578402912640.0, + "grad_norm": 1.9760141697724851, + "language_loss": 0.78568625, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86335897, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17102051, + "step": 5300, + "time_per_iteration": 2.7260777950286865 + }, + { + "auxiliary_loss_clip": 0.06483838, + "auxiliary_loss_mlp": 0.01272854, + "balance_loss_clip": 0.06287212, + "balance_loss_mlp": 0.01254495, + "epoch": 0.31871336239290543, + "flos": 18192391689600.0, + "grad_norm": 2.1538981188283195, + "language_loss": 0.84250915, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92007607, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.18347168, + "step": 5301, + "time_per_iteration": 2.485152244567871 + }, + { + "auxiliary_loss_clip": 0.06484723, + "auxiliary_loss_mlp": 0.01274861, + "balance_loss_clip": 0.06291914, + "balance_loss_mlp": 0.01256777, + "epoch": 0.31877348564557345, + "flos": 21831789928320.0, + "grad_norm": 2.0482094969798696, + "language_loss": 0.7812382, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85883403, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.18066406, + "step": 5302, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01275814, + "balance_loss_clip": 0.06290714, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188336088982414, + "flos": 22021331863680.0, + "grad_norm": 1.6144767194600491, + "language_loss": 0.79736584, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8749572, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17651367, + "step": 5303, + "time_per_iteration": 2.5235095024108887 + }, + { + "auxiliary_loss_clip": 0.06497993, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290174, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188937321509094, + "flos": 20054116431360.0, + "grad_norm": 1.7320090718032515, + "language_loss": 0.73529422, + "learning_rate": 3.186539603020047e-06, + "loss": 0.81304312, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18737793, + "step": 5304, + "time_per_iteration": 2.5141329765319824 + }, + { + "auxiliary_loss_clip": 0.06481734, + "auxiliary_loss_mlp": 0.01278154, + "balance_loss_clip": 0.06290816, + "balance_loss_mlp": 0.01260928, + "epoch": 0.31895385540357735, + "flos": 25855135574400.0, + "grad_norm": 1.8091269764667626, + "language_loss": 0.72548914, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80308801, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.17236328, + "step": 5305, + "time_per_iteration": 2.5648975372314453 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06292576, + "balance_loss_mlp": 0.01254603, + "epoch": 0.3190139786562453, + "flos": 23484545786880.0, + "grad_norm": 2.116447005947582, + "language_loss": 0.64815247, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.72573221, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.16113281, + "step": 5306, + "time_per_iteration": 2.5745668411254883 + }, + { + "auxiliary_loss_clip": 0.06483987, + "auxiliary_loss_mlp": 0.01282676, + "balance_loss_clip": 0.0628574, + "balance_loss_mlp": 0.01264413, + "epoch": 0.3190741019089133, + "flos": 29103150591360.0, + "grad_norm": 2.0084949709877726, + "language_loss": 0.79260421, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.87027091, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18273926, + "step": 5307, + "time_per_iteration": 2.557509183883667 + }, + { + "auxiliary_loss_clip": 0.06481419, + "auxiliary_loss_mlp": 0.01278653, + "balance_loss_clip": 0.06289747, + "balance_loss_mlp": 0.01260736, + "epoch": 0.31913422516158124, + "flos": 17135361233280.0, + "grad_norm": 3.9021838038471097, + "language_loss": 0.78660965, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86421037, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17907715, + "step": 5308, + "time_per_iteration": 3.906280994415283 + }, + { + "auxiliary_loss_clip": 0.06493698, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06287338, + "balance_loss_mlp": 0.01259408, + "epoch": 0.3191943484142492, + "flos": 16075228176000.0, + "grad_norm": 3.1945469837170215, + "language_loss": 0.74758154, + "learning_rate": 3.184971450390961e-06, + "loss": 0.82530349, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.19091797, + "step": 5309, + "time_per_iteration": 2.4796438217163086 + }, + { + "auxiliary_loss_clip": 0.06480245, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06283399, + "balance_loss_mlp": 0.01257954, + "epoch": 0.3192544716669172, + "flos": 22972787775360.0, + "grad_norm": 1.6995242114780418, + "language_loss": 0.83242565, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90997577, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.16809082, + "step": 5310, + "time_per_iteration": 5.470219373703003 + }, + { + "auxiliary_loss_clip": 0.06475915, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06281388, + "balance_loss_mlp": 0.01255868, + "epoch": 0.31931459491958514, + "flos": 26877645348480.0, + "grad_norm": 1.407923936832892, + "language_loss": 0.78906345, + "learning_rate": 3.184343874716412e-06, + "loss": 0.86654651, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.1652832, + "step": 5311, + "time_per_iteration": 2.546112298965454 + }, + { + "auxiliary_loss_clip": 0.06477334, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.06282097, + "balance_loss_mlp": 0.01255254, + "epoch": 0.3193747181722531, + "flos": 21843194083200.0, + "grad_norm": 1.8192899238067177, + "language_loss": 0.84889889, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92639416, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16943359, + "step": 5312, + "time_per_iteration": 2.5534987449645996 + }, + { + "auxiliary_loss_clip": 0.06489489, + "auxiliary_loss_mlp": 0.01274677, + "balance_loss_clip": 0.06284228, + "balance_loss_mlp": 0.012567, + "epoch": 0.31943484142492107, + "flos": 18329593950720.0, + "grad_norm": 3.1557419136729536, + "language_loss": 0.79280984, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.87045145, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17980957, + "step": 5313, + "time_per_iteration": 2.47098445892334 + }, + { + "auxiliary_loss_clip": 0.06477478, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 0.06281047, + "balance_loss_mlp": 0.01256618, + "epoch": 0.31949496467758903, + "flos": 21622150212480.0, + "grad_norm": 2.7721598847405584, + "language_loss": 0.86245549, + "learning_rate": 3.183402174406057e-06, + "loss": 0.93997484, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17834473, + "step": 5314, + "time_per_iteration": 2.531196117401123 + }, + { + "auxiliary_loss_clip": 0.0647811, + "auxiliary_loss_mlp": 0.0127239, + "balance_loss_clip": 0.06281686, + "balance_loss_mlp": 0.01255188, + "epoch": 0.31955508793025705, + "flos": 21766312362240.0, + "grad_norm": 1.712027342879292, + "language_loss": 0.80238831, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8798933, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17199707, + "step": 5315, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.06485026, + "auxiliary_loss_mlp": 0.01283831, + "balance_loss_clip": 0.06286455, + "balance_loss_mlp": 0.01265854, + "epoch": 0.319615211182925, + "flos": 17169881915520.0, + "grad_norm": 2.687676993792702, + "language_loss": 0.67569852, + "learning_rate": 3.18277414980567e-06, + "loss": 0.75338709, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17980957, + "step": 5316, + "time_per_iteration": 3.943110942840576 + }, + { + "auxiliary_loss_clip": 0.0648303, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01255566, + "epoch": 0.319675334435593, + "flos": 28120653941760.0, + "grad_norm": 1.5692381446514811, + "language_loss": 0.69637752, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.77392983, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16650391, + "step": 5317, + "time_per_iteration": 2.642251491546631 + }, + { + "auxiliary_loss_clip": 0.06377298, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06285109, + "balance_loss_mlp": 0.01288716, + "epoch": 0.31973545768826095, + "flos": 69524235072000.0, + "grad_norm": 0.7198160842036254, + "language_loss": 0.5281924, + "learning_rate": 3.182145945801628e-06, + "loss": 0.60489094, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.03839111, + "step": 5318, + "time_per_iteration": 3.2718679904937744 + }, + { + "auxiliary_loss_clip": 0.06479475, + "auxiliary_loss_mlp": 0.01271921, + "balance_loss_clip": 0.0628712, + "balance_loss_mlp": 0.01254969, + "epoch": 0.3197955809409289, + "flos": 13704344899200.0, + "grad_norm": 1.5995609143402318, + "language_loss": 0.84504628, + "learning_rate": 3.181831776553012e-06, + "loss": 0.92256021, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.16955566, + "step": 5319, + "time_per_iteration": 2.5372629165649414 + }, + { + "auxiliary_loss_clip": 0.06480815, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 0.06286162, + "balance_loss_mlp": 0.01261199, + "epoch": 0.3198557041935969, + "flos": 33226368704640.0, + "grad_norm": 1.6136244255626262, + "language_loss": 0.64208525, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71968812, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.18273926, + "step": 5320, + "time_per_iteration": 2.675477981567383 + }, + { + "auxiliary_loss_clip": 0.0648189, + "auxiliary_loss_mlp": 0.01271878, + "balance_loss_clip": 0.06280586, + "balance_loss_mlp": 0.01254402, + "epoch": 0.31991582744626484, + "flos": 23738726747520.0, + "grad_norm": 1.9696222638037655, + "language_loss": 0.71059012, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.78812778, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.17480469, + "step": 5321, + "time_per_iteration": 2.6383230686187744 + }, + { + "auxiliary_loss_clip": 0.06491005, + "auxiliary_loss_mlp": 0.01288903, + "balance_loss_clip": 0.06286187, + "balance_loss_mlp": 0.01270318, + "epoch": 0.3199759506989328, + "flos": 18556633388160.0, + "grad_norm": 2.30981924299517, + "language_loss": 0.86988461, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94768369, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.18591309, + "step": 5322, + "time_per_iteration": 2.4862442016601562 + }, + { + "auxiliary_loss_clip": 0.0648296, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06285054, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3200360739516008, + "flos": 22425418978560.0, + "grad_norm": 1.6041292280722281, + "language_loss": 0.83380175, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.91136217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16369629, + "step": 5323, + "time_per_iteration": 2.5262420177459717 + }, + { + "auxiliary_loss_clip": 0.06476378, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01258529, + "epoch": 0.32009619720426874, + "flos": 20601569082240.0, + "grad_norm": 1.775654796490425, + "language_loss": 0.78471839, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.86226195, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.19433594, + "step": 5324, + "time_per_iteration": 2.492380380630493 + }, + { + "auxiliary_loss_clip": 0.06478705, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06283212, + "balance_loss_mlp": 0.01256042, + "epoch": 0.3201563204569367, + "flos": 18153049397760.0, + "grad_norm": 1.7224742254360714, + "language_loss": 0.80742848, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.88495719, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.18139648, + "step": 5325, + "time_per_iteration": 2.4962642192840576 + }, + { + "auxiliary_loss_clip": 0.06478769, + "auxiliary_loss_mlp": 0.01277308, + "balance_loss_clip": 0.06280222, + "balance_loss_mlp": 0.01259701, + "epoch": 0.32021644370960467, + "flos": 31691975137920.0, + "grad_norm": 1.8321318923341703, + "language_loss": 0.75898254, + "learning_rate": 3.179631337655037e-06, + "loss": 0.83654332, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17602539, + "step": 5326, + "time_per_iteration": 2.5752692222595215 + }, + { + "auxiliary_loss_clip": 0.06472234, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.01260918, + "epoch": 0.32027656696227264, + "flos": 26872488322560.0, + "grad_norm": 1.458996564995821, + "language_loss": 0.81400204, + "learning_rate": 3.179316810218701e-06, + "loss": 0.89150548, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.171875, + "step": 5327, + "time_per_iteration": 2.5635383129119873 + }, + { + "auxiliary_loss_clip": 0.06486546, + "auxiliary_loss_mlp": 0.01273421, + "balance_loss_clip": 0.062847, + "balance_loss_mlp": 0.01256207, + "epoch": 0.32033669021494066, + "flos": 24176705639040.0, + "grad_norm": 1.3787000535244864, + "language_loss": 0.77910948, + "learning_rate": 3.179002238062554e-06, + "loss": 0.85670912, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17211914, + "step": 5328, + "time_per_iteration": 2.514646053314209 + }, + { + "auxiliary_loss_clip": 0.06484267, + "auxiliary_loss_mlp": 0.01278516, + "balance_loss_clip": 0.06287045, + "balance_loss_mlp": 0.0125992, + "epoch": 0.3203968134676086, + "flos": 24467419779840.0, + "grad_norm": 1.5501370939230803, + "language_loss": 0.74267161, + "learning_rate": 3.178687621198524e-06, + "loss": 0.82029939, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.18591309, + "step": 5329, + "time_per_iteration": 2.5436654090881348 + }, + { + "auxiliary_loss_clip": 0.06471072, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06282842, + "balance_loss_mlp": 0.01262434, + "epoch": 0.3204569367202766, + "flos": 18010606256640.0, + "grad_norm": 1.7046636031855489, + "language_loss": 0.71222955, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.78972626, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16162109, + "step": 5330, + "time_per_iteration": 2.479647397994995 + }, + { + "auxiliary_loss_clip": 0.06485157, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.0125791, + "epoch": 0.32051705997294455, + "flos": 30597237544320.0, + "grad_norm": 1.705143811074938, + "language_loss": 0.80496192, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.88258511, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.19250488, + "step": 5331, + "time_per_iteration": 2.5741958618164062 + }, + { + "auxiliary_loss_clip": 0.06384323, + "auxiliary_loss_mlp": 0.0125803, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.01253741, + "epoch": 0.3205771832256125, + "flos": 68436723657600.0, + "grad_norm": 0.7949538218297083, + "language_loss": 0.5776577, + "learning_rate": 3.177743502478447e-06, + "loss": 0.65408123, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04293823, + "step": 5332, + "time_per_iteration": 3.084747314453125 + }, + { + "auxiliary_loss_clip": 0.06488422, + "auxiliary_loss_mlp": 0.01272523, + "balance_loss_clip": 0.06286052, + "balance_loss_mlp": 0.01255154, + "epoch": 0.3206373064782805, + "flos": 30451524094080.0, + "grad_norm": 1.5377704746044631, + "language_loss": 0.73702615, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81463563, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17358398, + "step": 5333, + "time_per_iteration": 2.6130683422088623 + }, + { + "auxiliary_loss_clip": 0.06480561, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 0.06284031, + "balance_loss_mlp": 0.01256246, + "epoch": 0.32069742973094845, + "flos": 22061051498880.0, + "grad_norm": 1.6882238799892797, + "language_loss": 0.70957875, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.78712052, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17382812, + "step": 5334, + "time_per_iteration": 2.5501654148101807 + }, + { + "auxiliary_loss_clip": 0.06476508, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06281763, + "balance_loss_mlp": 0.01257947, + "epoch": 0.3207575529836164, + "flos": 22060464520320.0, + "grad_norm": 1.723674002448169, + "language_loss": 0.77349097, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.85101908, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.18347168, + "step": 5335, + "time_per_iteration": 2.5194711685180664 + }, + { + "auxiliary_loss_clip": 0.06479798, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06285612, + "balance_loss_mlp": 0.0125889, + "epoch": 0.3208176762362844, + "flos": 34065961015680.0, + "grad_norm": 1.52521333905674, + "language_loss": 0.68891776, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.76647282, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.16809082, + "step": 5336, + "time_per_iteration": 2.6550848484039307 + }, + { + "auxiliary_loss_clip": 0.06481949, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06286713, + "balance_loss_mlp": 0.01268343, + "epoch": 0.32087779948895234, + "flos": 21805151529600.0, + "grad_norm": 1.6666772631518172, + "language_loss": 0.79367507, + "learning_rate": 3.176169078234487e-06, + "loss": 0.87135273, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17480469, + "step": 5337, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.06473362, + "auxiliary_loss_mlp": 0.01277197, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01260865, + "epoch": 0.3209379227416203, + "flos": 21440532487680.0, + "grad_norm": 1.6244255970978692, + "language_loss": 0.75145769, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.82896328, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16320801, + "step": 5338, + "time_per_iteration": 2.526841402053833 + }, + { + "auxiliary_loss_clip": 0.06482957, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 0.06285477, + "balance_loss_mlp": 0.01260216, + "epoch": 0.3209980459942883, + "flos": 25856267604480.0, + "grad_norm": 1.7965894601451369, + "language_loss": 0.63241929, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.7100262, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17504883, + "step": 5339, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.06482022, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06283947, + "balance_loss_mlp": 0.01255151, + "epoch": 0.32105816924695624, + "flos": 19105218069120.0, + "grad_norm": 2.418138513897033, + "language_loss": 0.81912339, + "learning_rate": 3.175223888387192e-06, + "loss": 0.89666009, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16491699, + "step": 5340, + "time_per_iteration": 2.5764145851135254 + }, + { + "auxiliary_loss_clip": 0.06475554, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06281976, + "balance_loss_mlp": 0.01254774, + "epoch": 0.3211182924996242, + "flos": 16587531239040.0, + "grad_norm": 1.7719401771551753, + "language_loss": 0.76604897, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.84352368, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.17150879, + "step": 5341, + "time_per_iteration": 2.505668878555298 + }, + { + "auxiliary_loss_clip": 0.06474154, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.0628191, + "balance_loss_mlp": 0.01255969, + "epoch": 0.3211784157522922, + "flos": 22678425982080.0, + "grad_norm": 1.4764530250267398, + "language_loss": 0.79422891, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.87169659, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16662598, + "step": 5342, + "time_per_iteration": 2.5391595363616943 + }, + { + "auxiliary_loss_clip": 0.06483465, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01260141, + "epoch": 0.3212385390049602, + "flos": 20565119756160.0, + "grad_norm": 2.45787142613039, + "language_loss": 0.75074786, + "learning_rate": 3.174278297458438e-06, + "loss": 0.82835722, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17321777, + "step": 5343, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.06479985, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01255043, + "epoch": 0.32129866225762815, + "flos": 24798188972160.0, + "grad_norm": 1.5494427093400844, + "language_loss": 0.82596725, + "learning_rate": 3.173963011408748e-06, + "loss": 0.9034878, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5344, + "time_per_iteration": 2.5672519207000732 + }, + { + "auxiliary_loss_clip": 0.06478736, + "auxiliary_loss_mlp": 0.01273821, + "balance_loss_clip": 0.06282513, + "balance_loss_mlp": 0.0125731, + "epoch": 0.3213587855102961, + "flos": 18372374259840.0, + "grad_norm": 1.9111940233558649, + "language_loss": 0.80321491, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8807404, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.16516113, + "step": 5345, + "time_per_iteration": 2.479442834854126 + }, + { + "auxiliary_loss_clip": 0.06478975, + "auxiliary_loss_mlp": 0.01271046, + "balance_loss_clip": 0.06283471, + "balance_loss_mlp": 0.01254321, + "epoch": 0.3214189087629641, + "flos": 27023274944640.0, + "grad_norm": 1.7019036305222461, + "language_loss": 0.83604348, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.9135437, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.16723633, + "step": 5346, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.0648382, + "auxiliary_loss_mlp": 0.01272196, + "balance_loss_clip": 0.06285056, + "balance_loss_mlp": 0.0125528, + "epoch": 0.32147903201563205, + "flos": 23154866697600.0, + "grad_norm": 1.4545038816344273, + "language_loss": 0.81656283, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.89412296, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16918945, + "step": 5347, + "time_per_iteration": 2.5096054077148438 + }, + { + "auxiliary_loss_clip": 0.06480029, + "auxiliary_loss_mlp": 0.01274054, + "balance_loss_clip": 0.06286772, + "balance_loss_mlp": 0.01256673, + "epoch": 0.3215391552683, + "flos": 16586231500800.0, + "grad_norm": 2.536962878441814, + "language_loss": 0.80386555, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.88140643, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.1739502, + "step": 5348, + "time_per_iteration": 3.9639015197753906 + }, + { + "auxiliary_loss_clip": 0.06474565, + "auxiliary_loss_mlp": 0.01276371, + "balance_loss_clip": 0.06280862, + "balance_loss_mlp": 0.01259431, + "epoch": 0.321599278520968, + "flos": 17827604939520.0, + "grad_norm": 2.026618804026968, + "language_loss": 0.85758352, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93509287, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.16943359, + "step": 5349, + "time_per_iteration": 3.8848202228546143 + }, + { + "auxiliary_loss_clip": 0.06481349, + "auxiliary_loss_mlp": 0.01274724, + "balance_loss_clip": 0.06286412, + "balance_loss_mlp": 0.01257022, + "epoch": 0.32165940177363594, + "flos": 16257097463040.0, + "grad_norm": 1.7607877661370477, + "language_loss": 0.8123306, + "learning_rate": 3.172070360676475e-06, + "loss": 0.88989133, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17700195, + "step": 5350, + "time_per_iteration": 3.9589500427246094 + }, + { + "auxiliary_loss_clip": 0.06471309, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06282239, + "balance_loss_mlp": 0.01255055, + "epoch": 0.3217195250263039, + "flos": 27607302702720.0, + "grad_norm": 1.8529018663543275, + "language_loss": 0.80116528, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.87858802, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.562232732772827 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06284767, + "balance_loss_mlp": 0.01256668, + "epoch": 0.3217796482789719, + "flos": 21477023740800.0, + "grad_norm": 2.0321110975992562, + "language_loss": 0.7641573, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84167361, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.18249512, + "step": 5352, + "time_per_iteration": 2.5320773124694824 + }, + { + "auxiliary_loss_clip": 0.0648407, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06291708, + "balance_loss_mlp": 0.0126133, + "epoch": 0.32183977153163984, + "flos": 21222046166400.0, + "grad_norm": 1.9188598206640457, + "language_loss": 0.82159722, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.89922154, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.17028809, + "step": 5353, + "time_per_iteration": 2.5061802864074707 + }, + { + "auxiliary_loss_clip": 0.06480308, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125837, + "epoch": 0.3218998947843078, + "flos": 24615103800960.0, + "grad_norm": 1.8505936463490174, + "language_loss": 0.74125177, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.81881344, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.17480469, + "step": 5354, + "time_per_iteration": 2.5725185871124268 + }, + { + "auxiliary_loss_clip": 0.06479903, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259951, + "epoch": 0.3219600180369758, + "flos": 22276686781440.0, + "grad_norm": 2.612968571970558, + "language_loss": 0.83769405, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.91526389, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5355, + "time_per_iteration": 3.985846757888794 + }, + { + "auxiliary_loss_clip": 0.0647967, + "auxiliary_loss_mlp": 0.01272253, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01255528, + "epoch": 0.3220201412896438, + "flos": 14944376672640.0, + "grad_norm": 1.8959584470465125, + "language_loss": 0.71344721, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.79096651, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.1673584, + "step": 5356, + "time_per_iteration": 2.5644400119781494 + }, + { + "auxiliary_loss_clip": 0.06494904, + "auxiliary_loss_mlp": 0.01280986, + "balance_loss_clip": 0.0629259, + "balance_loss_mlp": 0.01263367, + "epoch": 0.32208026454231176, + "flos": 22672807758720.0, + "grad_norm": 2.5335154176231525, + "language_loss": 0.67879629, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.7565552, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.546654224395752 + }, + { + "auxiliary_loss_clip": 0.06384487, + "auxiliary_loss_mlp": 0.01261366, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257649, + "epoch": 0.3221403877949797, + "flos": 64626273308160.0, + "grad_norm": 0.6824166316331671, + "language_loss": 0.58314437, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.65960288, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.03707886, + "step": 5358, + "time_per_iteration": 3.2290756702423096 + }, + { + "auxiliary_loss_clip": 0.06481851, + "auxiliary_loss_mlp": 0.01282518, + "balance_loss_clip": 0.06287378, + "balance_loss_mlp": 0.01264839, + "epoch": 0.3222005110476477, + "flos": 20163212847360.0, + "grad_norm": 1.9186908993809755, + "language_loss": 0.84190667, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.91955042, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.17675781, + "step": 5359, + "time_per_iteration": 2.531033754348755 + }, + { + "auxiliary_loss_clip": 0.06480163, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 0.06287846, + "balance_loss_mlp": 0.01260051, + "epoch": 0.32226063430031565, + "flos": 22680731969280.0, + "grad_norm": 1.6695480137557102, + "language_loss": 0.79997146, + "learning_rate": 3.168912388464595e-06, + "loss": 0.87754452, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17077637, + "step": 5360, + "time_per_iteration": 2.544461727142334 + }, + { + "auxiliary_loss_clip": 0.06382456, + "auxiliary_loss_mlp": 0.01256795, + "balance_loss_clip": 0.06292457, + "balance_loss_mlp": 0.01253353, + "epoch": 0.3223207575529836, + "flos": 63847798151040.0, + "grad_norm": 0.6356253914940931, + "language_loss": 0.56731617, + "learning_rate": 3.168596347256737e-06, + "loss": 0.64370871, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03451538, + "step": 5361, + "time_per_iteration": 3.0336568355560303 + }, + { + "auxiliary_loss_clip": 0.06478466, + "auxiliary_loss_mlp": 0.01277797, + "balance_loss_clip": 0.06288562, + "balance_loss_mlp": 0.01261346, + "epoch": 0.3223808808056516, + "flos": 26877393786240.0, + "grad_norm": 2.167930910708006, + "language_loss": 0.71792114, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79548371, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.16442871, + "step": 5362, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.06483887, + "auxiliary_loss_mlp": 0.01279203, + "balance_loss_clip": 0.06293412, + "balance_loss_mlp": 0.01262692, + "epoch": 0.32244100405831955, + "flos": 26768716640640.0, + "grad_norm": 1.5327886568658977, + "language_loss": 0.73854291, + "learning_rate": 3.167964131913135e-06, + "loss": 0.81617379, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.16503906, + "step": 5363, + "time_per_iteration": 2.583064556121826 + }, + { + "auxiliary_loss_clip": 0.06489229, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06291971, + "balance_loss_mlp": 0.01258717, + "epoch": 0.3225011273109875, + "flos": 23809403266560.0, + "grad_norm": 2.354374584633167, + "language_loss": 0.76664144, + "learning_rate": 3.167647957801365e-06, + "loss": 0.84428835, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16748047, + "step": 5364, + "time_per_iteration": 2.5177268981933594 + }, + { + "auxiliary_loss_clip": 0.06479897, + "auxiliary_loss_mlp": 0.01275674, + "balance_loss_clip": 0.06290577, + "balance_loss_mlp": 0.01259473, + "epoch": 0.3225612505636555, + "flos": 17280194215680.0, + "grad_norm": 2.1891061142162327, + "language_loss": 0.7715044, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.84906018, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.1619873, + "step": 5365, + "time_per_iteration": 2.5122928619384766 + }, + { + "auxiliary_loss_clip": 0.06484331, + "auxiliary_loss_mlp": 0.01277663, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.01260711, + "epoch": 0.32262137381632344, + "flos": 23372724113280.0, + "grad_norm": 2.314444268247813, + "language_loss": 0.77153468, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.84915465, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.16955566, + "step": 5366, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.06481092, + "auxiliary_loss_mlp": 0.01280366, + "balance_loss_clip": 0.0629226, + "balance_loss_mlp": 0.0126388, + "epoch": 0.3226814970689914, + "flos": 23265598268160.0, + "grad_norm": 1.8642315088319754, + "language_loss": 0.72423649, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80185115, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.16491699, + "step": 5367, + "time_per_iteration": 2.544145345687866 + }, + { + "auxiliary_loss_clip": 0.06480073, + "auxiliary_loss_mlp": 0.01278287, + "balance_loss_clip": 0.06290721, + "balance_loss_mlp": 0.01262248, + "epoch": 0.32274162032165943, + "flos": 16400127582720.0, + "grad_norm": 1.9542840286813894, + "language_loss": 0.74559301, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82317662, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16033936, + "step": 5368, + "time_per_iteration": 2.4653942584991455 + }, + { + "auxiliary_loss_clip": 0.06481207, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01255874, + "epoch": 0.3228017435743274, + "flos": 27862489912320.0, + "grad_norm": 2.016369988637382, + "language_loss": 0.79033995, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86786628, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.15533447, + "step": 5369, + "time_per_iteration": 2.6923141479492188 + }, + { + "auxiliary_loss_clip": 0.06471382, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.0628759, + "balance_loss_mlp": 0.01264567, + "epoch": 0.32286186682699536, + "flos": 19614712020480.0, + "grad_norm": 1.8619928029866217, + "language_loss": 0.83607441, + "learning_rate": 3.16574998372661e-06, + "loss": 0.91358972, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15576172, + "step": 5370, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.06481104, + "auxiliary_loss_mlp": 0.01278081, + "balance_loss_clip": 0.062904, + "balance_loss_mlp": 0.01262703, + "epoch": 0.3229219900796633, + "flos": 24140885218560.0, + "grad_norm": 2.7780356443351146, + "language_loss": 0.83346975, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.91106164, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15368652, + "step": 5371, + "time_per_iteration": 2.554034948348999 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 0.0629211, + "balance_loss_mlp": 0.01260434, + "epoch": 0.3229821133323313, + "flos": 17754454725120.0, + "grad_norm": 2.279534384310274, + "language_loss": 0.89153087, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96917808, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17663574, + "step": 5372, + "time_per_iteration": 2.468693971633911 + }, + { + "auxiliary_loss_clip": 0.06478924, + "auxiliary_loss_mlp": 0.01278448, + "balance_loss_clip": 0.06288313, + "balance_loss_mlp": 0.01261843, + "epoch": 0.32304223658499925, + "flos": 22352562253440.0, + "grad_norm": 1.986067660558338, + "language_loss": 0.730793, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.80836678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16601562, + "step": 5373, + "time_per_iteration": 2.5757906436920166 + }, + { + "auxiliary_loss_clip": 0.06476311, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06293686, + "balance_loss_mlp": 0.01258227, + "epoch": 0.3231023598376672, + "flos": 18484154006400.0, + "grad_norm": 2.1970042176000963, + "language_loss": 0.82592154, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.90342778, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.1607666, + "step": 5374, + "time_per_iteration": 2.4853713512420654 + }, + { + "auxiliary_loss_clip": 0.06474404, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06291121, + "balance_loss_mlp": 0.0125544, + "epoch": 0.3231624830903352, + "flos": 27643710101760.0, + "grad_norm": 1.9120740622639463, + "language_loss": 0.88405079, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.96150708, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15783691, + "step": 5375, + "time_per_iteration": 2.58644700050354 + }, + { + "auxiliary_loss_clip": 0.06483716, + "auxiliary_loss_mlp": 0.01275166, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01258799, + "epoch": 0.32322260634300315, + "flos": 21732965637120.0, + "grad_norm": 2.2884949024183983, + "language_loss": 0.76224899, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.83983773, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.16345215, + "step": 5376, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.0647772, + "auxiliary_loss_mlp": 0.01272254, + "balance_loss_clip": 0.06289793, + "balance_loss_mlp": 0.01256649, + "epoch": 0.3232827295956711, + "flos": 22644198789120.0, + "grad_norm": 1.5259481118475857, + "language_loss": 0.67275858, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.75025833, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.15594482, + "step": 5377, + "time_per_iteration": 2.592737913131714 + }, + { + "auxiliary_loss_clip": 0.06482306, + "auxiliary_loss_mlp": 0.01279693, + "balance_loss_clip": 0.06294581, + "balance_loss_mlp": 0.01262158, + "epoch": 0.3233428528483391, + "flos": 26329731500160.0, + "grad_norm": 1.747214931760967, + "language_loss": 0.73022175, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.80784178, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17541504, + "step": 5378, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.06476232, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 0.06289409, + "balance_loss_mlp": 0.01256598, + "epoch": 0.32340297610100704, + "flos": 28592818099200.0, + "grad_norm": 2.0362074337070832, + "language_loss": 0.82332939, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90081334, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5379, + "time_per_iteration": 2.661787986755371 + }, + { + "auxiliary_loss_clip": 0.06481552, + "auxiliary_loss_mlp": 0.01276474, + "balance_loss_clip": 0.06288823, + "balance_loss_mlp": 0.01260548, + "epoch": 0.323463099353675, + "flos": 30781664380800.0, + "grad_norm": 1.6212615798097256, + "language_loss": 0.78942055, + "learning_rate": 3.162583158454388e-06, + "loss": 0.86700082, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15942383, + "step": 5380, + "time_per_iteration": 2.593618631362915 + }, + { + "auxiliary_loss_clip": 0.06489569, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 0.06298643, + "balance_loss_mlp": 0.01255368, + "epoch": 0.32352322260634303, + "flos": 25235664739200.0, + "grad_norm": 1.685322069138263, + "language_loss": 0.77853882, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.85615522, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16699219, + "step": 5381, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01269308, + "balance_loss_clip": 0.06292967, + "balance_loss_mlp": 0.01255438, + "epoch": 0.323583345859011, + "flos": 23337071400960.0, + "grad_norm": 1.9004028984655497, + "language_loss": 0.72391021, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80136859, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.13848877, + "step": 5382, + "time_per_iteration": 2.5095293521881104 + }, + { + "auxiliary_loss_clip": 0.06488711, + "auxiliary_loss_mlp": 0.01277606, + "balance_loss_clip": 0.06295708, + "balance_loss_mlp": 0.01262157, + "epoch": 0.32364346911167896, + "flos": 26213675195520.0, + "grad_norm": 2.3447859303702883, + "language_loss": 0.71528596, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.15441895, + "step": 5383, + "time_per_iteration": 2.5806562900543213 + }, + { + "auxiliary_loss_clip": 0.06476977, + "auxiliary_loss_mlp": 0.01276799, + "balance_loss_clip": 0.06292375, + "balance_loss_mlp": 0.01261564, + "epoch": 0.3237035923643469, + "flos": 23702487056640.0, + "grad_norm": 1.948915226701978, + "language_loss": 0.78857487, + "learning_rate": 3.161315193285283e-06, + "loss": 0.86611259, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.15234375, + "step": 5384, + "time_per_iteration": 2.548797369003296 + }, + { + "auxiliary_loss_clip": 0.06481218, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06288576, + "balance_loss_mlp": 0.0125793, + "epoch": 0.3237637156170149, + "flos": 14433960326400.0, + "grad_norm": 1.885180362402172, + "language_loss": 0.75034815, + "learning_rate": 3.16099809186998e-06, + "loss": 0.82790792, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16821289, + "step": 5385, + "time_per_iteration": 2.577547073364258 + }, + { + "auxiliary_loss_clip": 0.06486371, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01255032, + "epoch": 0.32382383886968286, + "flos": 31070449877760.0, + "grad_norm": 1.8174179211363362, + "language_loss": 0.72224641, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.79981083, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.15032959, + "step": 5386, + "time_per_iteration": 2.585822820663452 + }, + { + "auxiliary_loss_clip": 0.06485418, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.06292341, + "balance_loss_mlp": 0.01256803, + "epoch": 0.3238839621223508, + "flos": 23263418062080.0, + "grad_norm": 3.182973165751226, + "language_loss": 0.95573068, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.03331804, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16503906, + "step": 5387, + "time_per_iteration": 4.075104236602783 + }, + { + "auxiliary_loss_clip": 0.06490889, + "auxiliary_loss_mlp": 0.01270509, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.0125376, + "epoch": 0.3239440853750188, + "flos": 22971026839680.0, + "grad_norm": 2.142304582151843, + "language_loss": 0.78141761, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.85903162, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5388, + "time_per_iteration": 2.623976707458496 + }, + { + "auxiliary_loss_clip": 0.06478786, + "auxiliary_loss_mlp": 0.01276501, + "balance_loss_clip": 0.06289905, + "balance_loss_mlp": 0.01259704, + "epoch": 0.32400420862768675, + "flos": 36255394275840.0, + "grad_norm": 1.9954909505528162, + "language_loss": 0.71735168, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79490453, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16796875, + "step": 5389, + "time_per_iteration": 4.133269309997559 + }, + { + "auxiliary_loss_clip": 0.06479806, + "auxiliary_loss_mlp": 0.01273464, + "balance_loss_clip": 0.06294239, + "balance_loss_mlp": 0.01257872, + "epoch": 0.3240643318803547, + "flos": 21622946826240.0, + "grad_norm": 1.7464997421167434, + "language_loss": 0.81443554, + "learning_rate": 3.159411924656557e-06, + "loss": 0.89196825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15588379, + "step": 5390, + "time_per_iteration": 3.9378364086151123 + }, + { + "auxiliary_loss_clip": 0.06491944, + "auxiliary_loss_mlp": 0.01278594, + "balance_loss_clip": 0.06301276, + "balance_loss_mlp": 0.01261296, + "epoch": 0.3241244551330227, + "flos": 23302466864640.0, + "grad_norm": 1.9807661160762629, + "language_loss": 0.73182476, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80953014, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.1730957, + "step": 5391, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.06482222, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.06294864, + "balance_loss_mlp": 0.0126241, + "epoch": 0.32418457838569065, + "flos": 14101891395840.0, + "grad_norm": 1.5457442510257688, + "language_loss": 0.77541089, + "learning_rate": 3.158777149931855e-06, + "loss": 0.85302216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16491699, + "step": 5392, + "time_per_iteration": 2.486161470413208 + }, + { + "auxiliary_loss_clip": 0.06490408, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.0629712, + "balance_loss_mlp": 0.01261411, + "epoch": 0.3242447016383586, + "flos": 29760454344960.0, + "grad_norm": 1.849936210081937, + "language_loss": 0.63213563, + "learning_rate": 3.158459696652067e-06, + "loss": 0.70982158, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.16760254, + "step": 5393, + "time_per_iteration": 2.5853707790374756 + }, + { + "auxiliary_loss_clip": 0.06489256, + "auxiliary_loss_mlp": 0.01282677, + "balance_loss_clip": 0.06301466, + "balance_loss_mlp": 0.01266011, + "epoch": 0.3243048248910266, + "flos": 24357820239360.0, + "grad_norm": 1.7023503315224988, + "language_loss": 0.82889545, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90661478, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16674805, + "step": 5394, + "time_per_iteration": 3.946955680847168 + }, + { + "auxiliary_loss_clip": 0.06480435, + "auxiliary_loss_mlp": 0.01285084, + "balance_loss_clip": 0.06298714, + "balance_loss_mlp": 0.01269825, + "epoch": 0.3243649481436946, + "flos": 24359958518400.0, + "grad_norm": 2.1573093021253333, + "language_loss": 0.82280314, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90045834, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15270996, + "step": 5395, + "time_per_iteration": 2.537313222885132 + }, + { + "auxiliary_loss_clip": 0.06480338, + "auxiliary_loss_mlp": 0.01292267, + "balance_loss_clip": 0.06300412, + "balance_loss_mlp": 0.01276424, + "epoch": 0.32442507139636256, + "flos": 22931097569280.0, + "grad_norm": 1.7302006802896392, + "language_loss": 0.839818, + "learning_rate": 3.157507073287417e-06, + "loss": 0.91754401, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15844727, + "step": 5396, + "time_per_iteration": 2.6440067291259766 + }, + { + "auxiliary_loss_clip": 0.06491997, + "auxiliary_loss_mlp": 0.01291538, + "balance_loss_clip": 0.06299315, + "balance_loss_mlp": 0.01274121, + "epoch": 0.32448519464903053, + "flos": 22206723022080.0, + "grad_norm": 1.8684779143202024, + "language_loss": 0.76113403, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.83896935, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.17419434, + "step": 5397, + "time_per_iteration": 2.506601095199585 + }, + { + "auxiliary_loss_clip": 0.06473789, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06290997, + "balance_loss_mlp": 0.0127387, + "epoch": 0.3245453179016985, + "flos": 18843574095360.0, + "grad_norm": 2.304762567896747, + "language_loss": 0.67975587, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.75739866, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1661377, + "step": 5398, + "time_per_iteration": 2.50168514251709 + }, + { + "auxiliary_loss_clip": 0.06478744, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06293125, + "balance_loss_mlp": 0.01272189, + "epoch": 0.32460544115436646, + "flos": 21184716372480.0, + "grad_norm": 1.3685049489713428, + "language_loss": 0.73232323, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16748047, + "step": 5399, + "time_per_iteration": 2.5114216804504395 + }, + { + "auxiliary_loss_clip": 0.0648094, + "auxiliary_loss_mlp": 0.01289931, + "balance_loss_clip": 0.06293677, + "balance_loss_mlp": 0.01273241, + "epoch": 0.3246655644070344, + "flos": 21987607795200.0, + "grad_norm": 2.072173153822147, + "language_loss": 0.71044981, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.78815848, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16687012, + "step": 5400, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.06480449, + "auxiliary_loss_mlp": 0.01279651, + "balance_loss_clip": 0.06289301, + "balance_loss_mlp": 0.01263355, + "epoch": 0.3247256876597024, + "flos": 32167745020800.0, + "grad_norm": 2.104371315429844, + "language_loss": 0.80626661, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88386756, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16296387, + "step": 5401, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.06483636, + "auxiliary_loss_mlp": 0.01281263, + "balance_loss_clip": 0.06294005, + "balance_loss_mlp": 0.01264073, + "epoch": 0.32478581091237035, + "flos": 21004104896640.0, + "grad_norm": 1.4796090680940444, + "language_loss": 0.87935805, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.95700705, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.17175293, + "step": 5402, + "time_per_iteration": 2.5548956394195557 + }, + { + "auxiliary_loss_clip": 0.06474966, + "auxiliary_loss_mlp": 0.0127368, + "balance_loss_clip": 0.06291528, + "balance_loss_mlp": 0.01258767, + "epoch": 0.3248459341650383, + "flos": 17929741466880.0, + "grad_norm": 2.584856005153906, + "language_loss": 0.85243386, + "learning_rate": 3.155282749751332e-06, + "loss": 0.92992032, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14904785, + "step": 5403, + "time_per_iteration": 2.479205369949341 + }, + { + "auxiliary_loss_clip": 0.06468324, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01262667, + "epoch": 0.3249060574177063, + "flos": 24542582492160.0, + "grad_norm": 2.1052258035485214, + "language_loss": 0.8828373, + "learning_rate": 3.154964813916007e-06, + "loss": 0.96029389, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14672852, + "step": 5404, + "time_per_iteration": 2.5845093727111816 + }, + { + "auxiliary_loss_clip": 0.06473936, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01259368, + "epoch": 0.32496618067037425, + "flos": 26001939127680.0, + "grad_norm": 1.6833557203411496, + "language_loss": 0.72900558, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80650264, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1640625, + "step": 5405, + "time_per_iteration": 2.542433500289917 + }, + { + "auxiliary_loss_clip": 0.06474283, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06290333, + "balance_loss_mlp": 0.01258264, + "epoch": 0.3250263039230422, + "flos": 19579939776000.0, + "grad_norm": 1.7320098663924197, + "language_loss": 0.83355331, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.91103297, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15435791, + "step": 5406, + "time_per_iteration": 2.591207265853882 + }, + { + "auxiliary_loss_clip": 0.06474167, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06290454, + "balance_loss_mlp": 0.01254949, + "epoch": 0.3250864271757102, + "flos": 16769232817920.0, + "grad_norm": 2.13827452533593, + "language_loss": 0.87879711, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.95623994, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15161133, + "step": 5407, + "time_per_iteration": 2.4856173992156982 + }, + { + "auxiliary_loss_clip": 0.06469748, + "auxiliary_loss_mlp": 0.01276836, + "balance_loss_clip": 0.06284758, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3251465504283782, + "flos": 27827004908160.0, + "grad_norm": 2.430972813034592, + "language_loss": 0.69975567, + "learning_rate": 3.153692632731479e-06, + "loss": 0.77722144, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15649414, + "step": 5408, + "time_per_iteration": 2.5838799476623535 + }, + { + "auxiliary_loss_clip": 0.06481153, + "auxiliary_loss_mlp": 0.01282988, + "balance_loss_clip": 0.06286341, + "balance_loss_mlp": 0.01267396, + "epoch": 0.32520667368104617, + "flos": 19069271867520.0, + "grad_norm": 3.909403651515765, + "language_loss": 0.78053123, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85817266, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.15588379, + "step": 5409, + "time_per_iteration": 2.5178377628326416 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01272582, + "balance_loss_clip": 0.06286227, + "balance_loss_mlp": 0.01256202, + "epoch": 0.32526679693371413, + "flos": 29388917341440.0, + "grad_norm": 1.8050072916987376, + "language_loss": 0.83473468, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.91219985, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16381836, + "step": 5410, + "time_per_iteration": 2.5948092937469482 + }, + { + "auxiliary_loss_clip": 0.06466505, + "auxiliary_loss_mlp": 0.01275621, + "balance_loss_clip": 0.06286819, + "balance_loss_mlp": 0.01261274, + "epoch": 0.3253269201863821, + "flos": 20710833206400.0, + "grad_norm": 1.580323990141508, + "language_loss": 0.72005814, + "learning_rate": 3.152738037445405e-06, + "loss": 0.79747939, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14349365, + "step": 5411, + "time_per_iteration": 2.515542507171631 + }, + { + "auxiliary_loss_clip": 0.06472497, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06287136, + "balance_loss_mlp": 0.01261632, + "epoch": 0.32538704343905006, + "flos": 29101515436800.0, + "grad_norm": 1.470162471805647, + "language_loss": 0.83496881, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.91246504, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15490723, + "step": 5412, + "time_per_iteration": 2.55008602142334 + }, + { + "auxiliary_loss_clip": 0.06476887, + "auxiliary_loss_mlp": 0.01277617, + "balance_loss_clip": 0.06287435, + "balance_loss_mlp": 0.01260904, + "epoch": 0.325447166691718, + "flos": 24682216521600.0, + "grad_norm": 1.5504273053971407, + "language_loss": 0.8129071, + "learning_rate": 3.152101422008203e-06, + "loss": 0.89045215, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16711426, + "step": 5413, + "time_per_iteration": 2.54195499420166 + }, + { + "auxiliary_loss_clip": 0.06477104, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.0628976, + "balance_loss_mlp": 0.01261643, + "epoch": 0.325507289944386, + "flos": 21549503122560.0, + "grad_norm": 1.5527044192655586, + "language_loss": 0.76985061, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84740174, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16363525, + "step": 5414, + "time_per_iteration": 2.5435919761657715 + }, + { + "auxiliary_loss_clip": 0.063807, + "auxiliary_loss_mlp": 0.01284661, + "balance_loss_clip": 0.06291388, + "balance_loss_mlp": 0.01280793, + "epoch": 0.32556741319705396, + "flos": 71537893194240.0, + "grad_norm": 0.9015335749308697, + "language_loss": 0.64095414, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.71760774, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.03863525, + "step": 5415, + "time_per_iteration": 3.0875957012176514 + }, + { + "auxiliary_loss_clip": 0.0647157, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06285933, + "balance_loss_mlp": 0.01258845, + "epoch": 0.3256275364497219, + "flos": 23739187944960.0, + "grad_norm": 1.4815485577141352, + "language_loss": 0.74123245, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16235352, + "step": 5416, + "time_per_iteration": 2.5792665481567383 + }, + { + "auxiliary_loss_clip": 0.06381539, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01262769, + "epoch": 0.3256876597023899, + "flos": 67308136214400.0, + "grad_norm": 0.7704887993649999, + "language_loss": 0.57850802, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.65498912, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.03793335, + "step": 5417, + "time_per_iteration": 3.2770884037017822 + }, + { + "auxiliary_loss_clip": 0.06373264, + "auxiliary_loss_mlp": 0.01258837, + "balance_loss_clip": 0.06284805, + "balance_loss_mlp": 0.01254933, + "epoch": 0.32574778295505785, + "flos": 71304633826560.0, + "grad_norm": 0.8775074523137479, + "language_loss": 0.63674986, + "learning_rate": 3.150509119089975e-06, + "loss": 0.71307087, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03900146, + "step": 5418, + "time_per_iteration": 3.315948724746704 + }, + { + "auxiliary_loss_clip": 0.06476019, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 0.06290952, + "balance_loss_mlp": 0.01258111, + "epoch": 0.3258079062077258, + "flos": 20782515974400.0, + "grad_norm": 1.8847025208507953, + "language_loss": 0.6957128, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.77320766, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.15344238, + "step": 5419, + "time_per_iteration": 2.5722780227661133 + }, + { + "auxiliary_loss_clip": 0.06480842, + "auxiliary_loss_mlp": 0.01275789, + "balance_loss_clip": 0.06291591, + "balance_loss_mlp": 0.01260006, + "epoch": 0.3258680294603938, + "flos": 22241788755840.0, + "grad_norm": 2.023173952709465, + "language_loss": 0.77398664, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.85155296, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.15783691, + "step": 5420, + "time_per_iteration": 2.5199873447418213 + }, + { + "auxiliary_loss_clip": 0.06478356, + "auxiliary_loss_mlp": 0.0127343, + "balance_loss_clip": 0.06290038, + "balance_loss_mlp": 0.0125798, + "epoch": 0.3259281527130618, + "flos": 26987328743040.0, + "grad_norm": 1.5124533627457746, + "language_loss": 0.80826706, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.88578492, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15441895, + "step": 5421, + "time_per_iteration": 2.6014363765716553 + }, + { + "auxiliary_loss_clip": 0.06476312, + "auxiliary_loss_mlp": 0.0127337, + "balance_loss_clip": 0.06293876, + "balance_loss_mlp": 0.01258982, + "epoch": 0.32598827596572977, + "flos": 26221557479040.0, + "grad_norm": 1.4846059645471, + "language_loss": 0.76098251, + "learning_rate": 3.149234491389381e-06, + "loss": 0.8384794, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1439209, + "step": 5422, + "time_per_iteration": 2.5738978385925293 + }, + { + "auxiliary_loss_clip": 0.06480287, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06288645, + "balance_loss_mlp": 0.01255095, + "epoch": 0.32604839921839773, + "flos": 17645567944320.0, + "grad_norm": 2.282982793788361, + "language_loss": 0.63826233, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.71577179, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 5423, + "time_per_iteration": 2.5513644218444824 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01273816, + "balance_loss_clip": 0.06290927, + "balance_loss_mlp": 0.01258748, + "epoch": 0.3261085224710657, + "flos": 23629420696320.0, + "grad_norm": 1.6690467832946037, + "language_loss": 0.75170749, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82914186, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1505127, + "step": 5424, + "time_per_iteration": 2.546074151992798 + }, + { + "auxiliary_loss_clip": 0.06470636, + "auxiliary_loss_mlp": 0.01274311, + "balance_loss_clip": 0.06288706, + "balance_loss_mlp": 0.01258945, + "epoch": 0.32616864572373366, + "flos": 23267526912000.0, + "grad_norm": 1.6415169459291201, + "language_loss": 0.7718606, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.84931004, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15368652, + "step": 5425, + "time_per_iteration": 2.5883710384368896 + }, + { + "auxiliary_loss_clip": 0.06476015, + "auxiliary_loss_mlp": 0.01273254, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32622876897640163, + "flos": 25600535343360.0, + "grad_norm": 2.4681515054731924, + "language_loss": 0.78599709, + "learning_rate": 3.147959166423428e-06, + "loss": 0.86348987, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.16650391, + "step": 5426, + "time_per_iteration": 2.569566488265991 + }, + { + "auxiliary_loss_clip": 0.06473041, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06286261, + "balance_loss_mlp": 0.0125749, + "epoch": 0.3262888922290696, + "flos": 22425544759680.0, + "grad_norm": 1.6671872965592953, + "language_loss": 0.74719262, + "learning_rate": 3.147640226324893e-06, + "loss": 0.82465363, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5427, + "time_per_iteration": 3.941770315170288 + }, + { + "auxiliary_loss_clip": 0.06474692, + "auxiliary_loss_mlp": 0.0127251, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256154, + "epoch": 0.32634901548173756, + "flos": 19724982393600.0, + "grad_norm": 2.0508761677602965, + "language_loss": 0.79472262, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.87219465, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16357422, + "step": 5428, + "time_per_iteration": 3.9950850009918213 + }, + { + "auxiliary_loss_clip": 0.06475013, + "auxiliary_loss_mlp": 0.01275116, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01259309, + "epoch": 0.3264091387344055, + "flos": 16148336463360.0, + "grad_norm": 1.5445825374219135, + "language_loss": 0.71770716, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79520845, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15808105, + "step": 5429, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06468233, + "auxiliary_loss_mlp": 0.01269844, + "balance_loss_clip": 0.06283497, + "balance_loss_mlp": 0.01254466, + "epoch": 0.3264692619870735, + "flos": 16404655703040.0, + "grad_norm": 1.5791835311639297, + "language_loss": 0.78689212, + "learning_rate": 3.146683144965881e-06, + "loss": 0.86427283, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5430, + "time_per_iteration": 2.4873790740966797 + }, + { + "auxiliary_loss_clip": 0.06468185, + "auxiliary_loss_mlp": 0.0127668, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01259561, + "epoch": 0.32652938523974145, + "flos": 22388843871360.0, + "grad_norm": 1.9481749952405665, + "language_loss": 0.84556186, + "learning_rate": 3.146364030865399e-06, + "loss": 0.92301053, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17126465, + "step": 5431, + "time_per_iteration": 2.522075653076172 + }, + { + "auxiliary_loss_clip": 0.06468672, + "auxiliary_loss_mlp": 0.01274085, + "balance_loss_clip": 0.06286903, + "balance_loss_mlp": 0.01259327, + "epoch": 0.3265895084924094, + "flos": 21914499507840.0, + "grad_norm": 1.6266920997971765, + "language_loss": 0.71123517, + "learning_rate": 3.146044873294678e-06, + "loss": 0.78866279, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14758301, + "step": 5432, + "time_per_iteration": 2.513209104537964 + }, + { + "auxiliary_loss_clip": 0.06469099, + "auxiliary_loss_mlp": 0.01272277, + "balance_loss_clip": 0.06282821, + "balance_loss_mlp": 0.01257424, + "epoch": 0.3266496317450774, + "flos": 16072083648000.0, + "grad_norm": 1.3982751613904698, + "language_loss": 0.84207368, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91948748, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.14855957, + "step": 5433, + "time_per_iteration": 2.5324172973632812 + }, + { + "auxiliary_loss_clip": 0.06463822, + "auxiliary_loss_mlp": 0.01279207, + "balance_loss_clip": 0.06283711, + "balance_loss_mlp": 0.01264049, + "epoch": 0.3267097549977454, + "flos": 22534766956800.0, + "grad_norm": 1.4562075652627795, + "language_loss": 0.85916972, + "learning_rate": 3.145406427790931e-06, + "loss": 0.93660003, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15155029, + "step": 5434, + "time_per_iteration": 3.9434614181518555 + }, + { + "auxiliary_loss_clip": 0.06468898, + "auxiliary_loss_mlp": 0.01277076, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.0126134, + "epoch": 0.32676987825041337, + "flos": 27277581686400.0, + "grad_norm": 1.6909362765146225, + "language_loss": 0.88470823, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.96216792, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1574707, + "step": 5435, + "time_per_iteration": 2.5430006980895996 + }, + { + "auxiliary_loss_clip": 0.06469613, + "auxiliary_loss_mlp": 0.01271625, + "balance_loss_clip": 0.06283396, + "balance_loss_mlp": 0.01256306, + "epoch": 0.32683000150308134, + "flos": 11512731432960.0, + "grad_norm": 2.3091497119382733, + "language_loss": 0.77129918, + "learning_rate": 3.144767808551479e-06, + "loss": 0.84871155, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 5436, + "time_per_iteration": 2.486003875732422 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01277236, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01261977, + "epoch": 0.3268901247557493, + "flos": 25637362012800.0, + "grad_norm": 1.5303988762112921, + "language_loss": 0.72448635, + "learning_rate": 3.144448433811134e-06, + "loss": 0.80190074, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15270996, + "step": 5437, + "time_per_iteration": 2.545548915863037 + }, + { + "auxiliary_loss_clip": 0.06472606, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06282267, + "balance_loss_mlp": 0.01258253, + "epoch": 0.32695024800841727, + "flos": 24867356117760.0, + "grad_norm": 1.604360978002023, + "language_loss": 0.64194709, + "learning_rate": 3.144129015673189e-06, + "loss": 0.71942323, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16760254, + "step": 5438, + "time_per_iteration": 2.5657694339752197 + }, + { + "auxiliary_loss_clip": 0.06462848, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01257246, + "epoch": 0.32701037126108523, + "flos": 28846663643520.0, + "grad_norm": 1.637174889107761, + "language_loss": 0.74795192, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.82531083, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15795898, + "step": 5439, + "time_per_iteration": 2.5655689239501953 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01257087, + "epoch": 0.3270704945137532, + "flos": 27972592577280.0, + "grad_norm": 1.745503595629167, + "language_loss": 0.74950606, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1541748, + "step": 5440, + "time_per_iteration": 2.601821184158325 + }, + { + "auxiliary_loss_clip": 0.06460315, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06277528, + "balance_loss_mlp": 0.01254947, + "epoch": 0.32713061776642116, + "flos": 23696575344000.0, + "grad_norm": 1.95462638600934, + "language_loss": 0.84695202, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.92425048, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.14599609, + "step": 5441, + "time_per_iteration": 2.5020570755004883 + }, + { + "auxiliary_loss_clip": 0.06466734, + "auxiliary_loss_mlp": 0.01272021, + "balance_loss_clip": 0.06280614, + "balance_loss_mlp": 0.01256798, + "epoch": 0.3271907410190891, + "flos": 22462203720960.0, + "grad_norm": 1.9620532707625304, + "language_loss": 0.86928713, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.9466747, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15222168, + "step": 5442, + "time_per_iteration": 2.5388059616088867 + }, + { + "auxiliary_loss_clip": 0.06470812, + "auxiliary_loss_mlp": 0.0126936, + "balance_loss_clip": 0.06282146, + "balance_loss_mlp": 0.01254399, + "epoch": 0.3272508642717571, + "flos": 22826696981760.0, + "grad_norm": 1.5979656279548642, + "language_loss": 0.77388418, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.85128593, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.1496582, + "step": 5443, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.0646731, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06280384, + "balance_loss_mlp": 0.01255518, + "epoch": 0.32731098752442506, + "flos": 11806086977280.0, + "grad_norm": 2.2200780771744073, + "language_loss": 0.82818562, + "learning_rate": 3.142211596174343e-06, + "loss": 0.90556955, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15551758, + "step": 5444, + "time_per_iteration": 2.5514841079711914 + }, + { + "auxiliary_loss_clip": 0.06468201, + "auxiliary_loss_mlp": 0.01274937, + "balance_loss_clip": 0.06282412, + "balance_loss_mlp": 0.01258295, + "epoch": 0.327371110777093, + "flos": 21033300844800.0, + "grad_norm": 2.365977713323657, + "language_loss": 0.59248179, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66991317, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16638184, + "step": 5445, + "time_per_iteration": 2.5325539112091064 + }, + { + "auxiliary_loss_clip": 0.06469189, + "auxiliary_loss_mlp": 0.01278146, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01261278, + "epoch": 0.327431234029761, + "flos": 19068055983360.0, + "grad_norm": 2.7570820492615886, + "language_loss": 0.89260846, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.97008175, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.16870117, + "step": 5446, + "time_per_iteration": 2.576833724975586 + }, + { + "auxiliary_loss_clip": 0.06476346, + "auxiliary_loss_mlp": 0.01274903, + "balance_loss_clip": 0.06282137, + "balance_loss_mlp": 0.01257403, + "epoch": 0.32749135728242895, + "flos": 25856435312640.0, + "grad_norm": 1.9641165872810087, + "language_loss": 0.79404771, + "learning_rate": 3.141252301538802e-06, + "loss": 0.87156022, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 5447, + "time_per_iteration": 2.5539090633392334 + }, + { + "auxiliary_loss_clip": 0.06462374, + "auxiliary_loss_mlp": 0.01278273, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263277, + "epoch": 0.327551480535097, + "flos": 20126721594240.0, + "grad_norm": 1.953936246680755, + "language_loss": 0.73150277, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80890924, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.14990234, + "step": 5448, + "time_per_iteration": 2.633612871170044 + }, + { + "auxiliary_loss_clip": 0.06464307, + "auxiliary_loss_mlp": 0.01272265, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01256291, + "epoch": 0.32761160378776494, + "flos": 28811094785280.0, + "grad_norm": 1.3623614976773524, + "language_loss": 0.67002481, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.74739063, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15966797, + "step": 5449, + "time_per_iteration": 2.5777859687805176 + }, + { + "auxiliary_loss_clip": 0.0647198, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3276717270404329, + "flos": 26944171090560.0, + "grad_norm": 1.378619651715801, + "language_loss": 0.65736711, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.73478758, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15576172, + "step": 5450, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.06468028, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06280884, + "balance_loss_mlp": 0.01258509, + "epoch": 0.32773185029310087, + "flos": 25345557768960.0, + "grad_norm": 7.041147023955008, + "language_loss": 0.77832162, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.85575354, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16662598, + "step": 5451, + "time_per_iteration": 2.572112560272217 + }, + { + "auxiliary_loss_clip": 0.06472664, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06283467, + "balance_loss_mlp": 0.01262042, + "epoch": 0.32779197354576883, + "flos": 26398227813120.0, + "grad_norm": 1.9495025825112327, + "language_loss": 0.70696288, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78447533, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16540527, + "step": 5452, + "time_per_iteration": 2.6081676483154297 + }, + { + "auxiliary_loss_clip": 0.0646618, + "auxiliary_loss_mlp": 0.01272924, + "balance_loss_clip": 0.06283787, + "balance_loss_mlp": 0.01256938, + "epoch": 0.3278520967984368, + "flos": 24906237212160.0, + "grad_norm": 1.6132254933408041, + "language_loss": 0.7924304, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.86982143, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15979004, + "step": 5453, + "time_per_iteration": 2.5893869400024414 + }, + { + "auxiliary_loss_clip": 0.06469721, + "auxiliary_loss_mlp": 0.01274795, + "balance_loss_clip": 0.06282013, + "balance_loss_mlp": 0.01259309, + "epoch": 0.32791222005110476, + "flos": 29760831688320.0, + "grad_norm": 2.0442879632543476, + "language_loss": 0.758448, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.83589315, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.15478516, + "step": 5454, + "time_per_iteration": 2.590080499649048 + }, + { + "auxiliary_loss_clip": 0.06461332, + "auxiliary_loss_mlp": 0.01271865, + "balance_loss_clip": 0.06280516, + "balance_loss_mlp": 0.01257536, + "epoch": 0.32797234330377273, + "flos": 16513584410880.0, + "grad_norm": 2.183253633037468, + "language_loss": 0.77119774, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.8485297, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14318848, + "step": 5455, + "time_per_iteration": 2.4873318672180176 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01285817, + "balance_loss_clip": 0.06290287, + "balance_loss_mlp": 0.01268377, + "epoch": 0.3280324665564407, + "flos": 26585086417920.0, + "grad_norm": 1.6915080932551223, + "language_loss": 0.74407738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.82175708, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.17443848, + "step": 5456, + "time_per_iteration": 2.593258857727051 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01277637, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.01261306, + "epoch": 0.32809258980910866, + "flos": 22936631938560.0, + "grad_norm": 1.4862092693082851, + "language_loss": 0.78666067, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.8641206, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16345215, + "step": 5457, + "time_per_iteration": 2.523540496826172 + }, + { + "auxiliary_loss_clip": 0.06473868, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06281006, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3281527130617766, + "flos": 22790457290880.0, + "grad_norm": 2.0769759307730644, + "language_loss": 0.78958774, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.86707151, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.15795898, + "step": 5458, + "time_per_iteration": 2.552680015563965 + }, + { + "auxiliary_loss_clip": 0.06469774, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06284518, + "balance_loss_mlp": 0.01258215, + "epoch": 0.3282128363144446, + "flos": 21256902264960.0, + "grad_norm": 1.5512978296749391, + "language_loss": 0.73655844, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.8140012, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.1628418, + "step": 5459, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.0647283, + "auxiliary_loss_mlp": 0.01274033, + "balance_loss_clip": 0.0628351, + "balance_loss_mlp": 0.01257761, + "epoch": 0.32827295956711255, + "flos": 30850328401920.0, + "grad_norm": 2.2277675097031993, + "language_loss": 0.84476066, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.92222929, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.16271973, + "step": 5460, + "time_per_iteration": 2.6067721843719482 + }, + { + "auxiliary_loss_clip": 0.06469227, + "auxiliary_loss_mlp": 0.01276293, + "balance_loss_clip": 0.06282166, + "balance_loss_mlp": 0.01260319, + "epoch": 0.3283330828197806, + "flos": 25921032410880.0, + "grad_norm": 2.3722751928185297, + "language_loss": 0.78114808, + "learning_rate": 3.136770448642288e-06, + "loss": 0.8586033, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15991211, + "step": 5461, + "time_per_iteration": 2.550417184829712 + }, + { + "auxiliary_loss_clip": 0.06469681, + "auxiliary_loss_mlp": 0.01279493, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01261361, + "epoch": 0.32839320607244854, + "flos": 38591295672960.0, + "grad_norm": 1.5965953358146812, + "language_loss": 0.62925887, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.70675063, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.18115234, + "step": 5462, + "time_per_iteration": 2.7004194259643555 + }, + { + "auxiliary_loss_clip": 0.06467308, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 0.06284478, + "balance_loss_mlp": 0.01265077, + "epoch": 0.3284533293251165, + "flos": 26658068924160.0, + "grad_norm": 1.3126719376538145, + "language_loss": 0.78502059, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.86250222, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15783691, + "step": 5463, + "time_per_iteration": 2.6072070598602295 + }, + { + "auxiliary_loss_clip": 0.0647091, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06283993, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32851345257778447, + "flos": 15309498839040.0, + "grad_norm": 1.727782559794916, + "language_loss": 0.70068884, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77812445, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.16064453, + "step": 5464, + "time_per_iteration": 2.519319534301758 + }, + { + "auxiliary_loss_clip": 0.06466094, + "auxiliary_loss_mlp": 0.01275271, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01257938, + "epoch": 0.32857357583045244, + "flos": 23520491988480.0, + "grad_norm": 1.6619431416557902, + "language_loss": 0.72759986, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.80501354, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.17333984, + "step": 5465, + "time_per_iteration": 2.573444366455078 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01265509, + "epoch": 0.3286336990831204, + "flos": 21001379639040.0, + "grad_norm": 1.5232981833560715, + "language_loss": 0.82967317, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.90722907, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16271973, + "step": 5466, + "time_per_iteration": 4.012515306472778 + }, + { + "auxiliary_loss_clip": 0.0647275, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.06285034, + "balance_loss_mlp": 0.01254932, + "epoch": 0.32869382233578837, + "flos": 23665450752000.0, + "grad_norm": 1.6606265994221874, + "language_loss": 0.79192597, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86936402, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5467, + "time_per_iteration": 4.000247955322266 + }, + { + "auxiliary_loss_clip": 0.06467809, + "auxiliary_loss_mlp": 0.01271951, + "balance_loss_clip": 0.06279044, + "balance_loss_mlp": 0.01255333, + "epoch": 0.32875394558845633, + "flos": 25343335635840.0, + "grad_norm": 1.5510134892276737, + "language_loss": 0.74865687, + "learning_rate": 3.134526351787587e-06, + "loss": 0.82605445, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.16601562, + "step": 5468, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.06474267, + "auxiliary_loss_mlp": 0.01276703, + "balance_loss_clip": 0.0628129, + "balance_loss_mlp": 0.01259108, + "epoch": 0.3288140688411243, + "flos": 14908430471040.0, + "grad_norm": 1.672146103500693, + "language_loss": 0.78728724, + "learning_rate": 3.134205594339942e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.17614746, + "step": 5469, + "time_per_iteration": 3.955373525619507 + }, + { + "auxiliary_loss_clip": 0.06466976, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01257224, + "epoch": 0.32887419209379226, + "flos": 18557220366720.0, + "grad_norm": 1.6018901390748483, + "language_loss": 0.82183433, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89923656, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16015625, + "step": 5470, + "time_per_iteration": 2.5481319427490234 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01271427, + "balance_loss_clip": 0.06279681, + "balance_loss_mlp": 0.01254869, + "epoch": 0.3289343153464602, + "flos": 48116560913280.0, + "grad_norm": 1.6166643495117736, + "language_loss": 0.68441176, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76180226, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.16564941, + "step": 5471, + "time_per_iteration": 2.780454158782959 + }, + { + "auxiliary_loss_clip": 0.06479289, + "auxiliary_loss_mlp": 0.012789, + "balance_loss_clip": 0.06285035, + "balance_loss_mlp": 0.01260637, + "epoch": 0.3289944385991282, + "flos": 27607763900160.0, + "grad_norm": 1.5078842371471577, + "language_loss": 0.65564525, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.73322713, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.18273926, + "step": 5472, + "time_per_iteration": 2.580644369125366 + }, + { + "auxiliary_loss_clip": 0.06472386, + "auxiliary_loss_mlp": 0.01277133, + "balance_loss_clip": 0.06281875, + "balance_loss_mlp": 0.01259144, + "epoch": 0.32905456185179616, + "flos": 20126470032000.0, + "grad_norm": 1.614198879205061, + "language_loss": 0.88538003, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96287525, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17993164, + "step": 5473, + "time_per_iteration": 4.021254062652588 + }, + { + "auxiliary_loss_clip": 0.06475069, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285396, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3291146851044642, + "flos": 23186075143680.0, + "grad_norm": 1.7643015597930078, + "language_loss": 0.78719336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86464679, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16552734, + "step": 5474, + "time_per_iteration": 2.5416688919067383 + }, + { + "auxiliary_loss_clip": 0.06379573, + "auxiliary_loss_mlp": 0.0134405, + "balance_loss_clip": 0.06291323, + "balance_loss_mlp": 0.01340224, + "epoch": 0.32917480835713214, + "flos": 67641630664320.0, + "grad_norm": 0.8577160187921843, + "language_loss": 0.60258645, + "learning_rate": 3.132280146886911e-06, + "loss": 0.67982268, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03820801, + "step": 5475, + "time_per_iteration": 3.1267805099487305 + }, + { + "auxiliary_loss_clip": 0.06479369, + "auxiliary_loss_mlp": 0.01279647, + "balance_loss_clip": 0.06284596, + "balance_loss_mlp": 0.01261599, + "epoch": 0.3292349316098001, + "flos": 27971963671680.0, + "grad_norm": 3.252822648856248, + "language_loss": 0.7712574, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84884757, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.18041992, + "step": 5476, + "time_per_iteration": 2.5819692611694336 + }, + { + "auxiliary_loss_clip": 0.06469015, + "auxiliary_loss_mlp": 0.01275163, + "balance_loss_clip": 0.06282525, + "balance_loss_mlp": 0.01258956, + "epoch": 0.3292950548624681, + "flos": 20269416297600.0, + "grad_norm": 1.7333439092472165, + "language_loss": 0.7556808, + "learning_rate": 3.131637987449997e-06, + "loss": 0.83312255, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1619873, + "step": 5477, + "time_per_iteration": 2.532106637954712 + }, + { + "auxiliary_loss_clip": 0.06470291, + "auxiliary_loss_mlp": 0.01275718, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01259541, + "epoch": 0.32935517811513604, + "flos": 20819174935680.0, + "grad_norm": 2.104456143380591, + "language_loss": 0.75728148, + "learning_rate": 3.131316843357713e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16174316, + "step": 5478, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.06470281, + "auxiliary_loss_mlp": 0.01278094, + "balance_loss_clip": 0.06287058, + "balance_loss_mlp": 0.01261631, + "epoch": 0.329415301367804, + "flos": 18447704680320.0, + "grad_norm": 2.368560120299576, + "language_loss": 0.80772918, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8852129, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16455078, + "step": 5479, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.06362775, + "auxiliary_loss_mlp": 0.01272199, + "balance_loss_clip": 0.06275004, + "balance_loss_mlp": 0.01268579, + "epoch": 0.32947542462047197, + "flos": 66344967930240.0, + "grad_norm": 0.7366188072531391, + "language_loss": 0.56333017, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.63967991, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.03616333, + "step": 5480, + "time_per_iteration": 3.2369706630706787 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.01278618, + "balance_loss_clip": 0.06290235, + "balance_loss_mlp": 0.01262179, + "epoch": 0.32953554787313993, + "flos": 23228268474240.0, + "grad_norm": 1.631877255513098, + "language_loss": 0.7736274, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.85118574, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16442871, + "step": 5481, + "time_per_iteration": 2.5206968784332275 + }, + { + "auxiliary_loss_clip": 0.06479073, + "auxiliary_loss_mlp": 0.01277292, + "balance_loss_clip": 0.0628771, + "balance_loss_mlp": 0.01260686, + "epoch": 0.3295956711258079, + "flos": 27015686150400.0, + "grad_norm": 1.3752047504599005, + "language_loss": 0.78639877, + "learning_rate": 3.130031838113899e-06, + "loss": 0.86396235, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.16601562, + "step": 5482, + "time_per_iteration": 2.604720115661621 + }, + { + "auxiliary_loss_clip": 0.06475698, + "auxiliary_loss_mlp": 0.01274916, + "balance_loss_clip": 0.06286834, + "balance_loss_mlp": 0.01258274, + "epoch": 0.32965579437847586, + "flos": 19177697450880.0, + "grad_norm": 2.0027782692889358, + "language_loss": 0.74399549, + "learning_rate": 3.129710479645185e-06, + "loss": 0.82150161, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16662598, + "step": 5483, + "time_per_iteration": 2.5124409198760986 + }, + { + "auxiliary_loss_clip": 0.06472629, + "auxiliary_loss_mlp": 0.01273838, + "balance_loss_clip": 0.06286867, + "balance_loss_mlp": 0.01258472, + "epoch": 0.32971591763114383, + "flos": 30490447115520.0, + "grad_norm": 1.7640387903996015, + "language_loss": 0.7588225, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83628714, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15368652, + "step": 5484, + "time_per_iteration": 2.64021635055542 + }, + { + "auxiliary_loss_clip": 0.06469439, + "auxiliary_loss_mlp": 0.01274788, + "balance_loss_clip": 0.06284587, + "balance_loss_mlp": 0.01259232, + "epoch": 0.3297760408838118, + "flos": 16295140016640.0, + "grad_norm": 1.7787654746377481, + "language_loss": 0.72680974, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15563965, + "step": 5485, + "time_per_iteration": 2.516080379486084 + }, + { + "auxiliary_loss_clip": 0.06466281, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06281459, + "balance_loss_mlp": 0.0125991, + "epoch": 0.32983616413647976, + "flos": 29538194590080.0, + "grad_norm": 2.336444213272706, + "language_loss": 0.80720758, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8846184, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5486, + "time_per_iteration": 2.633730173110962 + }, + { + "auxiliary_loss_clip": 0.06467714, + "auxiliary_loss_mlp": 0.01276658, + "balance_loss_clip": 0.06283799, + "balance_loss_mlp": 0.01261828, + "epoch": 0.3298962873891478, + "flos": 20637682992000.0, + "grad_norm": 1.9361428819205904, + "language_loss": 0.84726417, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.92470789, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14819336, + "step": 5487, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.06473765, + "auxiliary_loss_mlp": 0.01275689, + "balance_loss_clip": 0.06283425, + "balance_loss_mlp": 0.01258845, + "epoch": 0.32995641064181574, + "flos": 14981329123200.0, + "grad_norm": 2.0510786453666707, + "language_loss": 0.74805683, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.82555139, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16833496, + "step": 5488, + "time_per_iteration": 2.5195999145507812 + }, + { + "auxiliary_loss_clip": 0.06475645, + "auxiliary_loss_mlp": 0.01276585, + "balance_loss_clip": 0.06288432, + "balance_loss_mlp": 0.012611, + "epoch": 0.3300165338944837, + "flos": 18667448812800.0, + "grad_norm": 2.2567239989743912, + "language_loss": 0.73048651, + "learning_rate": 3.127781429646098e-06, + "loss": 0.80800879, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.15466309, + "step": 5489, + "time_per_iteration": 2.489529609680176 + }, + { + "auxiliary_loss_clip": 0.06468415, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 0.06282636, + "balance_loss_mlp": 0.01260987, + "epoch": 0.3300766571471517, + "flos": 25589215042560.0, + "grad_norm": 2.1838257682132256, + "language_loss": 0.89381063, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97125351, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.14898682, + "step": 5490, + "time_per_iteration": 2.583505153656006 + }, + { + "auxiliary_loss_clip": 0.06470391, + "auxiliary_loss_mlp": 0.01273693, + "balance_loss_clip": 0.06285221, + "balance_loss_mlp": 0.01258339, + "epoch": 0.33013678039981964, + "flos": 11368150012800.0, + "grad_norm": 1.8708534793530802, + "language_loss": 0.82974613, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90718699, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15344238, + "step": 5491, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.06473103, + "auxiliary_loss_mlp": 0.01274646, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.01258589, + "epoch": 0.3301969036524876, + "flos": 24827175285120.0, + "grad_norm": 1.8609460693795263, + "language_loss": 0.77910721, + "learning_rate": 3.126816327146554e-06, + "loss": 0.85658479, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16052246, + "step": 5492, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.06478797, + "auxiliary_loss_mlp": 0.01277822, + "balance_loss_clip": 0.06287751, + "balance_loss_mlp": 0.01261324, + "epoch": 0.33025702690515557, + "flos": 15966634884480.0, + "grad_norm": 2.4722908606070875, + "language_loss": 0.75614154, + "learning_rate": 3.12649454083913e-06, + "loss": 0.83370769, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16503906, + "step": 5493, + "time_per_iteration": 2.489143133163452 + }, + { + "auxiliary_loss_clip": 0.06366986, + "auxiliary_loss_mlp": 0.01258616, + "balance_loss_clip": 0.06280049, + "balance_loss_mlp": 0.0125515, + "epoch": 0.33031715015782354, + "flos": 59435794540800.0, + "grad_norm": 0.7878547289977352, + "language_loss": 0.54030049, + "learning_rate": 3.12617271181492e-06, + "loss": 0.61655653, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.03475952, + "step": 5494, + "time_per_iteration": 3.0869832038879395 + }, + { + "auxiliary_loss_clip": 0.06482484, + "auxiliary_loss_mlp": 0.01281394, + "balance_loss_clip": 0.0629174, + "balance_loss_mlp": 0.01264753, + "epoch": 0.3303772734104915, + "flos": 23190896753280.0, + "grad_norm": 1.4215593277180028, + "language_loss": 0.87367666, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9513154, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16625977, + "step": 5495, + "time_per_iteration": 2.5188820362091064 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01260038, + "epoch": 0.33043739666315947, + "flos": 33080068275840.0, + "grad_norm": 2.0083800771900995, + "language_loss": 0.74168754, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.81923461, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17443848, + "step": 5496, + "time_per_iteration": 2.6151347160339355 + }, + { + "auxiliary_loss_clip": 0.06470463, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.0628539, + "balance_loss_mlp": 0.01256434, + "epoch": 0.33049751991582743, + "flos": 24901625237760.0, + "grad_norm": 1.9468549986980455, + "language_loss": 0.72676557, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80419219, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15759277, + "step": 5497, + "time_per_iteration": 2.51874041557312 + }, + { + "auxiliary_loss_clip": 0.06472345, + "auxiliary_loss_mlp": 0.0127459, + "balance_loss_clip": 0.06286049, + "balance_loss_mlp": 0.01259343, + "epoch": 0.3305576431684954, + "flos": 29468272757760.0, + "grad_norm": 1.8137955115189202, + "language_loss": 0.80825889, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88572824, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 5498, + "time_per_iteration": 2.6010656356811523 + }, + { + "auxiliary_loss_clip": 0.06476308, + "auxiliary_loss_mlp": 0.0127559, + "balance_loss_clip": 0.0628619, + "balance_loss_mlp": 0.01258281, + "epoch": 0.33061776642116336, + "flos": 22637951660160.0, + "grad_norm": 1.8227647554707032, + "language_loss": 0.76843095, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84594989, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.1730957, + "step": 5499, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06475572, + "auxiliary_loss_mlp": 0.01277032, + "balance_loss_clip": 0.06287447, + "balance_loss_mlp": 0.01260832, + "epoch": 0.3306778896738313, + "flos": 25783536660480.0, + "grad_norm": 1.5377855738322084, + "language_loss": 0.79203349, + "learning_rate": 3.124240841300681e-06, + "loss": 0.86955953, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16186523, + "step": 5500, + "time_per_iteration": 2.5970370769500732 + }, + { + "auxiliary_loss_clip": 0.0648918, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298861, + "balance_loss_mlp": 0.01257544, + "epoch": 0.33073801292649935, + "flos": 36949566625920.0, + "grad_norm": 1.9211086255091194, + "language_loss": 0.66916561, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7468102, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17724609, + "step": 5501, + "time_per_iteration": 2.687847375869751 + }, + { + "auxiliary_loss_clip": 0.06481969, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06291866, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3307981361791673, + "flos": 12972465411840.0, + "grad_norm": 2.0893698607967957, + "language_loss": 0.77978551, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.85733795, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.17504883, + "step": 5502, + "time_per_iteration": 2.500303268432617 + }, + { + "auxiliary_loss_clip": 0.06481159, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06290131, + "balance_loss_mlp": 0.01256424, + "epoch": 0.3308582594318353, + "flos": 25381420116480.0, + "grad_norm": 1.7450780858535315, + "language_loss": 0.72841054, + "learning_rate": 3.123274330355824e-06, + "loss": 0.80596423, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.17773438, + "step": 5503, + "time_per_iteration": 2.5851874351501465 + }, + { + "auxiliary_loss_clip": 0.06475106, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.06287622, + "balance_loss_mlp": 0.01257769, + "epoch": 0.33091838268450324, + "flos": 26475738439680.0, + "grad_norm": 1.4901464435255347, + "language_loss": 0.7565586, + "learning_rate": 3.12295207483523e-06, + "loss": 0.83405411, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16674805, + "step": 5504, + "time_per_iteration": 2.5670559406280518 + }, + { + "auxiliary_loss_clip": 0.06476955, + "auxiliary_loss_mlp": 0.01276594, + "balance_loss_clip": 0.06289346, + "balance_loss_mlp": 0.01261025, + "epoch": 0.3309785059371712, + "flos": 24977836126080.0, + "grad_norm": 1.5646403370775293, + "language_loss": 0.70214427, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.77967972, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15545654, + "step": 5505, + "time_per_iteration": 2.628267288208008 + }, + { + "auxiliary_loss_clip": 0.06474259, + "auxiliary_loss_mlp": 0.01275018, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01258543, + "epoch": 0.3310386291898392, + "flos": 20452585322880.0, + "grad_norm": 1.7982072656373813, + "language_loss": 0.8240785, + "learning_rate": 3.122307436058899e-06, + "loss": 0.90157127, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.16467285, + "step": 5506, + "time_per_iteration": 4.10949444770813 + }, + { + "auxiliary_loss_clip": 0.06476486, + "auxiliary_loss_mlp": 0.01275135, + "balance_loss_clip": 0.0628888, + "balance_loss_mlp": 0.01258428, + "epoch": 0.33109875244250714, + "flos": 23188926182400.0, + "grad_norm": 1.740251919086934, + "language_loss": 0.79860532, + "learning_rate": 3.121985052827606e-06, + "loss": 0.87612152, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16705322, + "step": 5507, + "time_per_iteration": 4.12217903137207 + }, + { + "auxiliary_loss_clip": 0.06468768, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06281893, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3311588756951751, + "flos": 24174902776320.0, + "grad_norm": 1.6433149866128014, + "language_loss": 0.71967649, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.79713166, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.1628418, + "step": 5508, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.06468692, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 0.06284875, + "balance_loss_mlp": 0.01256788, + "epoch": 0.33121899894784307, + "flos": 28152994417920.0, + "grad_norm": 1.6757523088462936, + "language_loss": 0.71588784, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79329687, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15429688, + "step": 5509, + "time_per_iteration": 3.976996660232544 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06286702, + "balance_loss_mlp": 0.01264396, + "epoch": 0.33127912220051103, + "flos": 29574979332480.0, + "grad_norm": 1.5753317257606638, + "language_loss": 0.73806137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.81557631, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15460205, + "step": 5510, + "time_per_iteration": 2.576838731765747 + }, + { + "auxiliary_loss_clip": 0.06473264, + "auxiliary_loss_mlp": 0.01276647, + "balance_loss_clip": 0.06286872, + "balance_loss_mlp": 0.01261019, + "epoch": 0.331339245453179, + "flos": 14434086107520.0, + "grad_norm": 2.529546935928515, + "language_loss": 0.88507652, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96257567, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15612793, + "step": 5511, + "time_per_iteration": 2.550442695617676 + }, + { + "auxiliary_loss_clip": 0.06464168, + "auxiliary_loss_mlp": 0.01275515, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01260494, + "epoch": 0.33139936870584696, + "flos": 20893499107200.0, + "grad_norm": 1.6341387009287651, + "language_loss": 0.73559558, + "learning_rate": 3.12037249872891e-06, + "loss": 0.81299245, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15026855, + "step": 5512, + "time_per_iteration": 2.5596871376037598 + }, + { + "auxiliary_loss_clip": 0.06468001, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06286225, + "balance_loss_mlp": 0.01262438, + "epoch": 0.33145949195851493, + "flos": 36293352975360.0, + "grad_norm": 1.8738374179289, + "language_loss": 0.72677827, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.80424166, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15905762, + "step": 5513, + "time_per_iteration": 4.148774147033691 + }, + { + "auxiliary_loss_clip": 0.06472933, + "auxiliary_loss_mlp": 0.01275876, + "balance_loss_clip": 0.06284368, + "balance_loss_mlp": 0.0125958, + "epoch": 0.33151961521118295, + "flos": 14284431515520.0, + "grad_norm": 1.8311253656567958, + "language_loss": 0.69026303, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.7677511, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16296387, + "step": 5514, + "time_per_iteration": 2.486818313598633 + }, + { + "auxiliary_loss_clip": 0.06477968, + "auxiliary_loss_mlp": 0.0127816, + "balance_loss_clip": 0.06291951, + "balance_loss_mlp": 0.01261089, + "epoch": 0.3315797384638509, + "flos": 20780126133120.0, + "grad_norm": 1.9656560392088134, + "language_loss": 0.66393441, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.74149573, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.17053223, + "step": 5515, + "time_per_iteration": 2.531658411026001 + }, + { + "auxiliary_loss_clip": 0.06473279, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01258885, + "epoch": 0.3316398617165189, + "flos": 24686115736320.0, + "grad_norm": 3.8914339391091732, + "language_loss": 0.69369388, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.77117789, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16235352, + "step": 5516, + "time_per_iteration": 2.5392425060272217 + }, + { + "auxiliary_loss_clip": 0.06476592, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.0125959, + "epoch": 0.33169998496918685, + "flos": 18593879328000.0, + "grad_norm": 2.757231582138207, + "language_loss": 0.80914545, + "learning_rate": 3.118758882514359e-06, + "loss": 0.88666099, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.15368652, + "step": 5517, + "time_per_iteration": 2.4851818084716797 + }, + { + "auxiliary_loss_clip": 0.06465174, + "auxiliary_loss_mlp": 0.01279818, + "balance_loss_clip": 0.06284687, + "balance_loss_mlp": 0.01264142, + "epoch": 0.3317601082218548, + "flos": 20199871808640.0, + "grad_norm": 1.6705032998917397, + "language_loss": 0.74656814, + "learning_rate": 3.118436031952143e-06, + "loss": 0.82401806, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15686035, + "step": 5518, + "time_per_iteration": 2.518036127090454 + }, + { + "auxiliary_loss_clip": 0.06372921, + "auxiliary_loss_mlp": 0.01283465, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01279764, + "epoch": 0.3318202314745228, + "flos": 68995119265920.0, + "grad_norm": 0.7149144856696655, + "language_loss": 0.54263318, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.61919701, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03692627, + "step": 5519, + "time_per_iteration": 3.246586322784424 + }, + { + "auxiliary_loss_clip": 0.06472577, + "auxiliary_loss_mlp": 0.01276695, + "balance_loss_clip": 0.06288108, + "balance_loss_mlp": 0.0125966, + "epoch": 0.33188035472719074, + "flos": 21505381148160.0, + "grad_norm": 2.182658812554146, + "language_loss": 0.79452467, + "learning_rate": 3.117790203606336e-06, + "loss": 0.87201744, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.17028809, + "step": 5520, + "time_per_iteration": 2.517853260040283 + }, + { + "auxiliary_loss_clip": 0.06465811, + "auxiliary_loss_mlp": 0.01271287, + "balance_loss_clip": 0.06283027, + "balance_loss_mlp": 0.01256279, + "epoch": 0.3319404779798587, + "flos": 28877033548800.0, + "grad_norm": 1.8300903967069966, + "language_loss": 0.77067709, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.84804809, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15002441, + "step": 5521, + "time_per_iteration": 2.555697441101074 + }, + { + "auxiliary_loss_clip": 0.06478226, + "auxiliary_loss_mlp": 0.01278256, + "balance_loss_clip": 0.06288885, + "balance_loss_mlp": 0.01261542, + "epoch": 0.33200060123252667, + "flos": 23083770908160.0, + "grad_norm": 1.9119948906690396, + "language_loss": 0.70441258, + "learning_rate": 3.117144205713664e-06, + "loss": 0.78197736, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16699219, + "step": 5522, + "time_per_iteration": 2.5673933029174805 + }, + { + "auxiliary_loss_clip": 0.06474358, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06290573, + "balance_loss_mlp": 0.01255255, + "epoch": 0.33206072448519464, + "flos": 21148895952000.0, + "grad_norm": 1.6906348218339255, + "language_loss": 0.74640656, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82386148, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 5523, + "time_per_iteration": 2.516275405883789 + }, + { + "auxiliary_loss_clip": 0.06473421, + "auxiliary_loss_mlp": 0.01271212, + "balance_loss_clip": 0.06292297, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3321208477378626, + "flos": 13084161304320.0, + "grad_norm": 2.1726495268835024, + "language_loss": 0.82172406, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8991704, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15454102, + "step": 5524, + "time_per_iteration": 2.557941198348999 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289522, + "balance_loss_mlp": 0.01257251, + "epoch": 0.33218097099053057, + "flos": 21221836531200.0, + "grad_norm": 1.6566666481357326, + "language_loss": 0.83100772, + "learning_rate": 3.116174891188636e-06, + "loss": 0.90847051, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15649414, + "step": 5525, + "time_per_iteration": 2.527944564819336 + }, + { + "auxiliary_loss_clip": 0.06379532, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06292765, + "balance_loss_mlp": 0.01261484, + "epoch": 0.33224109424319853, + "flos": 64369954068480.0, + "grad_norm": 0.7407224947932968, + "language_loss": 0.52533764, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.60178727, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03945923, + "step": 5526, + "time_per_iteration": 3.1679162979125977 + }, + { + "auxiliary_loss_clip": 0.0647909, + "auxiliary_loss_mlp": 0.01274604, + "balance_loss_clip": 0.06291543, + "balance_loss_mlp": 0.01258391, + "epoch": 0.33230121749586655, + "flos": 17351457713280.0, + "grad_norm": 1.970764365513445, + "language_loss": 0.79041827, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86795521, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 5527, + "time_per_iteration": 2.5327274799346924 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01278538, + "balance_loss_clip": 0.06294803, + "balance_loss_mlp": 0.01263458, + "epoch": 0.3323613407485345, + "flos": 21003517918080.0, + "grad_norm": 1.6591522480418575, + "language_loss": 0.72383821, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80139363, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15063477, + "step": 5528, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.06477713, + "auxiliary_loss_mlp": 0.01274869, + "balance_loss_clip": 0.06292165, + "balance_loss_mlp": 0.01259396, + "epoch": 0.3324214640012025, + "flos": 13157688862080.0, + "grad_norm": 1.8543805866880412, + "language_loss": 0.8336091, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.91113496, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.15466309, + "step": 5529, + "time_per_iteration": 2.5001087188720703 + }, + { + "auxiliary_loss_clip": 0.06479646, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254587, + "epoch": 0.33248158725387045, + "flos": 22280124798720.0, + "grad_norm": 1.7380748666321508, + "language_loss": 0.70133483, + "learning_rate": 3.114558520634423e-06, + "loss": 0.77883273, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.15551758, + "step": 5530, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01275357, + "balance_loss_clip": 0.06291899, + "balance_loss_mlp": 0.01258751, + "epoch": 0.3325417105065384, + "flos": 20747324459520.0, + "grad_norm": 2.7342028000668552, + "language_loss": 0.77694213, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.85449082, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16589355, + "step": 5531, + "time_per_iteration": 2.5307323932647705 + }, + { + "auxiliary_loss_clip": 0.06477839, + "auxiliary_loss_mlp": 0.01280766, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01263552, + "epoch": 0.3326018337592064, + "flos": 24797476212480.0, + "grad_norm": 1.9473942094883194, + "language_loss": 0.73779702, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.81538308, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17211914, + "step": 5532, + "time_per_iteration": 2.5989890098571777 + }, + { + "auxiliary_loss_clip": 0.06472681, + "auxiliary_loss_mlp": 0.01278728, + "balance_loss_clip": 0.06288014, + "balance_loss_mlp": 0.01263147, + "epoch": 0.33266195701187434, + "flos": 14506942832640.0, + "grad_norm": 1.825417572799306, + "language_loss": 0.66042602, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.73794013, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15576172, + "step": 5533, + "time_per_iteration": 2.47566294670105 + }, + { + "auxiliary_loss_clip": 0.06474279, + "auxiliary_loss_mlp": 0.012755, + "balance_loss_clip": 0.06289338, + "balance_loss_mlp": 0.01258954, + "epoch": 0.3327220802645423, + "flos": 15309792328320.0, + "grad_norm": 1.6677538876536442, + "language_loss": 0.71568084, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79317868, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16552734, + "step": 5534, + "time_per_iteration": 2.5140762329101562 + }, + { + "auxiliary_loss_clip": 0.06474573, + "auxiliary_loss_mlp": 0.01273002, + "balance_loss_clip": 0.06290095, + "balance_loss_mlp": 0.01257088, + "epoch": 0.3327822035172103, + "flos": 23484336151680.0, + "grad_norm": 1.635346823223845, + "language_loss": 0.67885029, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.75632608, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15917969, + "step": 5535, + "time_per_iteration": 2.522270917892456 + }, + { + "auxiliary_loss_clip": 0.0647034, + "auxiliary_loss_mlp": 0.01273438, + "balance_loss_clip": 0.06284929, + "balance_loss_mlp": 0.01257547, + "epoch": 0.33284232676987824, + "flos": 25381587824640.0, + "grad_norm": 2.3715726564419155, + "language_loss": 0.72782886, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80526668, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5536, + "time_per_iteration": 2.5831825733184814 + }, + { + "auxiliary_loss_clip": 0.06470598, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01255464, + "epoch": 0.3329024500225462, + "flos": 23700851902080.0, + "grad_norm": 1.6831469867631554, + "language_loss": 0.81958938, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89700401, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15405273, + "step": 5537, + "time_per_iteration": 2.520211935043335 + }, + { + "auxiliary_loss_clip": 0.06473641, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06284811, + "balance_loss_mlp": 0.01253799, + "epoch": 0.33296257327521417, + "flos": 31731317429760.0, + "grad_norm": 1.8576028267218818, + "language_loss": 0.71933794, + "learning_rate": 3.111970130648789e-06, + "loss": 0.79677737, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16491699, + "step": 5538, + "time_per_iteration": 2.6061229705810547 + }, + { + "auxiliary_loss_clip": 0.06466128, + "auxiliary_loss_mlp": 0.01271828, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01256784, + "epoch": 0.33302269652788213, + "flos": 22750863436800.0, + "grad_norm": 1.8542539639588682, + "language_loss": 0.75063813, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82801771, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15039062, + "step": 5539, + "time_per_iteration": 2.5176634788513184 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06284824, + "balance_loss_mlp": 0.01255739, + "epoch": 0.33308281978055015, + "flos": 11478546167040.0, + "grad_norm": 1.8040392528519402, + "language_loss": 0.71489209, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79235446, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16699219, + "step": 5540, + "time_per_iteration": 2.536752939224243 + }, + { + "auxiliary_loss_clip": 0.06462967, + "auxiliary_loss_mlp": 0.01271775, + "balance_loss_clip": 0.06280267, + "balance_loss_mlp": 0.01256576, + "epoch": 0.3331429430332181, + "flos": 38222274291840.0, + "grad_norm": 3.095851444688792, + "language_loss": 0.60970843, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.68705589, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15197754, + "step": 5541, + "time_per_iteration": 2.6592354774475098 + }, + { + "auxiliary_loss_clip": 0.06472225, + "auxiliary_loss_mlp": 0.01276024, + "balance_loss_clip": 0.06284402, + "balance_loss_mlp": 0.01259872, + "epoch": 0.3332030662858861, + "flos": 22535270081280.0, + "grad_norm": 1.770287690308821, + "language_loss": 0.69711685, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77459931, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16149902, + "step": 5542, + "time_per_iteration": 2.5427184104919434 + }, + { + "auxiliary_loss_clip": 0.06473213, + "auxiliary_loss_mlp": 0.01276881, + "balance_loss_clip": 0.06286451, + "balance_loss_mlp": 0.01261658, + "epoch": 0.33326318953855405, + "flos": 16003293845760.0, + "grad_norm": 1.6729265705607443, + "language_loss": 0.75927889, + "learning_rate": 3.110351016113414e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15222168, + "step": 5543, + "time_per_iteration": 2.4745616912841797 + }, + { + "auxiliary_loss_clip": 0.06475509, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06287046, + "balance_loss_mlp": 0.01260281, + "epoch": 0.333323312791222, + "flos": 25600661124480.0, + "grad_norm": 1.7242995092969657, + "language_loss": 0.75332278, + "learning_rate": 3.110027066843348e-06, + "loss": 0.83084685, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.16601562, + "step": 5544, + "time_per_iteration": 2.565572738647461 + }, + { + "auxiliary_loss_clip": 0.06467521, + "auxiliary_loss_mlp": 0.01270286, + "balance_loss_clip": 0.06283619, + "balance_loss_mlp": 0.01254848, + "epoch": 0.33338343604389, + "flos": 25126652177280.0, + "grad_norm": 1.4364166263140996, + "language_loss": 0.71556139, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.1541748, + "step": 5545, + "time_per_iteration": 3.9951117038726807 + }, + { + "auxiliary_loss_clip": 0.0646642, + "auxiliary_loss_mlp": 0.01275763, + "balance_loss_clip": 0.0628425, + "balance_loss_mlp": 0.01260063, + "epoch": 0.33344355929655795, + "flos": 16953114602880.0, + "grad_norm": 1.5928525652704049, + "language_loss": 0.69892073, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.77634251, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15722656, + "step": 5546, + "time_per_iteration": 4.069552659988403 + }, + { + "auxiliary_loss_clip": 0.06469481, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 0.06280591, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3335036825492259, + "flos": 27896675178240.0, + "grad_norm": 1.5973320112543803, + "language_loss": 0.65030676, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.72773933, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16455078, + "step": 5547, + "time_per_iteration": 2.578320026397705 + }, + { + "auxiliary_loss_clip": 0.06468174, + "auxiliary_loss_mlp": 0.01274769, + "balance_loss_clip": 0.06284153, + "balance_loss_mlp": 0.01258736, + "epoch": 0.3335638058018939, + "flos": 16184995424640.0, + "grad_norm": 1.9789366990729325, + "language_loss": 0.85645819, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.9338876, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.16040039, + "step": 5548, + "time_per_iteration": 3.917346477508545 + }, + { + "auxiliary_loss_clip": 0.06473708, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06283803, + "balance_loss_mlp": 0.01259264, + "epoch": 0.33362392905456184, + "flos": 39905651617920.0, + "grad_norm": 1.927393858225298, + "language_loss": 0.74956143, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.82705271, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16149902, + "step": 5549, + "time_per_iteration": 2.662152051925659 + }, + { + "auxiliary_loss_clip": 0.0647629, + "auxiliary_loss_mlp": 0.01276829, + "balance_loss_clip": 0.06287523, + "balance_loss_mlp": 0.0125946, + "epoch": 0.3336840523072298, + "flos": 44280954339840.0, + "grad_norm": 3.284743863263659, + "language_loss": 0.68874133, + "learning_rate": 3.108082487713921e-06, + "loss": 0.76627254, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.17370605, + "step": 5550, + "time_per_iteration": 2.703099250793457 + }, + { + "auxiliary_loss_clip": 0.06476407, + "auxiliary_loss_mlp": 0.01275354, + "balance_loss_clip": 0.06290508, + "balance_loss_mlp": 0.01259488, + "epoch": 0.33374417555989777, + "flos": 15091054444800.0, + "grad_norm": 2.6465919002896436, + "language_loss": 0.60992151, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6874392, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5551, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.06471356, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06287605, + "balance_loss_mlp": 0.01259985, + "epoch": 0.33380429881256574, + "flos": 15854226232320.0, + "grad_norm": 1.6170207033712265, + "language_loss": 0.71155131, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.78901786, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15307617, + "step": 5552, + "time_per_iteration": 4.0786826610565186 + }, + { + "auxiliary_loss_clip": 0.06476602, + "auxiliary_loss_mlp": 0.01270143, + "balance_loss_clip": 0.06291272, + "balance_loss_mlp": 0.01255182, + "epoch": 0.33386442206523376, + "flos": 13485439307520.0, + "grad_norm": 2.244029622012826, + "language_loss": 0.83864999, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91611743, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.1496582, + "step": 5553, + "time_per_iteration": 2.603986978530884 + }, + { + "auxiliary_loss_clip": 0.06474789, + "auxiliary_loss_mlp": 0.0127187, + "balance_loss_clip": 0.06288507, + "balance_loss_mlp": 0.01255562, + "epoch": 0.3339245453179017, + "flos": 16696250311680.0, + "grad_norm": 2.098616423404285, + "language_loss": 0.81424135, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89170802, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16320801, + "step": 5554, + "time_per_iteration": 2.4884121417999268 + }, + { + "auxiliary_loss_clip": 0.06477922, + "auxiliary_loss_mlp": 0.01277907, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01261647, + "epoch": 0.3339846685705697, + "flos": 24617954839680.0, + "grad_norm": 1.4369599322997015, + "language_loss": 0.81866252, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89622086, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.16259766, + "step": 5555, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.06478396, + "auxiliary_loss_mlp": 0.01271619, + "balance_loss_clip": 0.06292441, + "balance_loss_mlp": 0.01256325, + "epoch": 0.33404479182323765, + "flos": 30961311534720.0, + "grad_norm": 1.7387044564853729, + "language_loss": 0.74836755, + "learning_rate": 3.106136395915099e-06, + "loss": 0.82586771, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.1529541, + "step": 5556, + "time_per_iteration": 2.5936899185180664 + }, + { + "auxiliary_loss_clip": 0.06476042, + "auxiliary_loss_mlp": 0.01275785, + "balance_loss_clip": 0.06293188, + "balance_loss_mlp": 0.01260562, + "epoch": 0.3341049150759056, + "flos": 23519988864000.0, + "grad_norm": 1.3815052276914728, + "language_loss": 0.82545519, + "learning_rate": 3.105811900403391e-06, + "loss": 0.90297353, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15222168, + "step": 5557, + "time_per_iteration": 2.5862598419189453 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01279505, + "balance_loss_clip": 0.0629133, + "balance_loss_mlp": 0.01264067, + "epoch": 0.3341650383285736, + "flos": 24034052862720.0, + "grad_norm": 2.760917503655681, + "language_loss": 0.80188966, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.87946206, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.15429688, + "step": 5558, + "time_per_iteration": 2.596344232559204 + }, + { + "auxiliary_loss_clip": 0.06475051, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06287208, + "balance_loss_mlp": 0.01267646, + "epoch": 0.33422516158124155, + "flos": 24909255959040.0, + "grad_norm": 1.7423955567809428, + "language_loss": 0.81954122, + "learning_rate": 3.105162783594788e-06, + "loss": 0.8971197, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1517334, + "step": 5559, + "time_per_iteration": 2.587005376815796 + }, + { + "auxiliary_loss_clip": 0.06467593, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 0.06286522, + "balance_loss_mlp": 0.01265224, + "epoch": 0.3342852848339095, + "flos": 18339404878080.0, + "grad_norm": 2.1220335034517093, + "language_loss": 0.72058392, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.79805756, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.14550781, + "step": 5560, + "time_per_iteration": 2.536546230316162 + }, + { + "auxiliary_loss_clip": 0.06481705, + "auxiliary_loss_mlp": 0.01285397, + "balance_loss_clip": 0.06292065, + "balance_loss_mlp": 0.01269458, + "epoch": 0.3343454080865775, + "flos": 30054690357120.0, + "grad_norm": 1.596178779859494, + "language_loss": 0.75386882, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.83153981, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.15930176, + "step": 5561, + "time_per_iteration": 2.672700881958008 + }, + { + "auxiliary_loss_clip": 0.06477022, + "auxiliary_loss_mlp": 0.01277798, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01262551, + "epoch": 0.33440553133924544, + "flos": 16404362213760.0, + "grad_norm": 1.6462526862455489, + "language_loss": 0.70108986, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77863806, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15246582, + "step": 5562, + "time_per_iteration": 2.501317024230957 + }, + { + "auxiliary_loss_clip": 0.06472157, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 0.06287345, + "balance_loss_mlp": 0.01265396, + "epoch": 0.3344656545919134, + "flos": 24248723823360.0, + "grad_norm": 1.5361546803562123, + "language_loss": 0.65648419, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.7340101, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15026855, + "step": 5563, + "time_per_iteration": 2.5564165115356445 + }, + { + "auxiliary_loss_clip": 0.06477885, + "auxiliary_loss_mlp": 0.01282181, + "balance_loss_clip": 0.06290222, + "balance_loss_mlp": 0.01264752, + "epoch": 0.3345257778445814, + "flos": 52130431048320.0, + "grad_norm": 1.3531042812140452, + "language_loss": 0.74246049, + "learning_rate": 3.103539258400766e-06, + "loss": 0.82006115, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.17431641, + "step": 5564, + "time_per_iteration": 2.810534715652466 + }, + { + "auxiliary_loss_clip": 0.06356741, + "auxiliary_loss_mlp": 0.01295627, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01291562, + "epoch": 0.33458590109724934, + "flos": 68066528319360.0, + "grad_norm": 0.78222915395806, + "language_loss": 0.55275309, + "learning_rate": 3.103214427773745e-06, + "loss": 0.62927675, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04064941, + "step": 5565, + "time_per_iteration": 3.1279821395874023 + }, + { + "auxiliary_loss_clip": 0.06471252, + "auxiliary_loss_mlp": 0.01279791, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01264163, + "epoch": 0.3346460243499173, + "flos": 37423869062400.0, + "grad_norm": 1.705115292174207, + "language_loss": 0.65565574, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73316622, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15625, + "step": 5566, + "time_per_iteration": 2.712435245513916 + }, + { + "auxiliary_loss_clip": 0.0647177, + "auxiliary_loss_mlp": 0.01282122, + "balance_loss_clip": 0.06289912, + "balance_loss_mlp": 0.01266529, + "epoch": 0.3347061476025853, + "flos": 18703269233280.0, + "grad_norm": 1.6655571733561654, + "language_loss": 0.77372861, + "learning_rate": 3.102564641030016e-06, + "loss": 0.85126758, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.15588379, + "step": 5567, + "time_per_iteration": 2.4871251583099365 + }, + { + "auxiliary_loss_clip": 0.06471208, + "auxiliary_loss_mlp": 0.01275703, + "balance_loss_clip": 0.06285998, + "balance_loss_mlp": 0.01259491, + "epoch": 0.3347662708552533, + "flos": 13922957001600.0, + "grad_norm": 1.6558873666299474, + "language_loss": 0.77099127, + "learning_rate": 3.102239684937949e-06, + "loss": 0.84846038, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16223145, + "step": 5568, + "time_per_iteration": 2.5343427658081055 + }, + { + "auxiliary_loss_clip": 0.06472506, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06286565, + "balance_loss_mlp": 0.01262136, + "epoch": 0.33482639410792125, + "flos": 19755645788160.0, + "grad_norm": 1.9310298365294178, + "language_loss": 0.71334505, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7908479, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15643311, + "step": 5569, + "time_per_iteration": 2.5091118812561035 + }, + { + "auxiliary_loss_clip": 0.06473939, + "auxiliary_loss_mlp": 0.01271857, + "balance_loss_clip": 0.06285448, + "balance_loss_mlp": 0.01256479, + "epoch": 0.3348865173605892, + "flos": 16107820214400.0, + "grad_norm": 1.931700529164995, + "language_loss": 0.90211284, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97957081, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15380859, + "step": 5570, + "time_per_iteration": 2.6067447662353516 + }, + { + "auxiliary_loss_clip": 0.06465288, + "auxiliary_loss_mlp": 0.01272678, + "balance_loss_clip": 0.06282274, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3349466406132572, + "flos": 25015836752640.0, + "grad_norm": 1.5216158426421846, + "language_loss": 0.79890078, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87628049, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15405273, + "step": 5571, + "time_per_iteration": 2.5423781871795654 + }, + { + "auxiliary_loss_clip": 0.06342317, + "auxiliary_loss_mlp": 0.01254883, + "balance_loss_clip": 0.06257176, + "balance_loss_mlp": 0.01251411, + "epoch": 0.33500676386592515, + "flos": 54340058413440.0, + "grad_norm": 0.8278358272998855, + "language_loss": 0.55695772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.63292974, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.03482056, + "step": 5572, + "time_per_iteration": 3.1027615070343018 + }, + { + "auxiliary_loss_clip": 0.06472763, + "auxiliary_loss_mlp": 0.0127696, + "balance_loss_clip": 0.06287524, + "balance_loss_mlp": 0.01261677, + "epoch": 0.3350668871185931, + "flos": 26804620915200.0, + "grad_norm": 1.9863197052332227, + "language_loss": 0.78856999, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.86606717, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15283203, + "step": 5573, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.06473139, + "auxiliary_loss_mlp": 0.01274748, + "balance_loss_clip": 0.06286675, + "balance_loss_mlp": 0.01257999, + "epoch": 0.3351270103712611, + "flos": 33518885708160.0, + "grad_norm": 2.2174625445936256, + "language_loss": 0.72959399, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.80707288, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16723633, + "step": 5574, + "time_per_iteration": 2.660301923751831 + }, + { + "auxiliary_loss_clip": 0.06465638, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284496, + "balance_loss_mlp": 0.01256042, + "epoch": 0.33518713362392905, + "flos": 26513613285120.0, + "grad_norm": 1.6818935039401424, + "language_loss": 0.88364851, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.96102208, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15661621, + "step": 5575, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.0648465, + "auxiliary_loss_mlp": 0.01276363, + "balance_loss_clip": 0.06290504, + "balance_loss_mlp": 0.01259316, + "epoch": 0.335247256876597, + "flos": 17237078490240.0, + "grad_norm": 1.9893319880263207, + "language_loss": 0.83043218, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.90804225, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17053223, + "step": 5576, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.06478332, + "auxiliary_loss_mlp": 0.01275534, + "balance_loss_clip": 0.06288211, + "balance_loss_mlp": 0.01259095, + "epoch": 0.335307380129265, + "flos": 25636397690880.0, + "grad_norm": 2.0001339744496622, + "language_loss": 0.73279572, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.16442871, + "step": 5577, + "time_per_iteration": 2.575026750564575 + }, + { + "auxiliary_loss_clip": 0.06475031, + "auxiliary_loss_mlp": 0.01274987, + "balance_loss_clip": 0.0628825, + "balance_loss_mlp": 0.01257689, + "epoch": 0.33536750338193294, + "flos": 19685765882880.0, + "grad_norm": 1.6019428598408136, + "language_loss": 0.82233781, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.89983797, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17297363, + "step": 5578, + "time_per_iteration": 2.544978380203247 + }, + { + "auxiliary_loss_clip": 0.06461956, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01262907, + "epoch": 0.3354276266346009, + "flos": 18338482483200.0, + "grad_norm": 1.788420802177993, + "language_loss": 0.72050315, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.79790771, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15582275, + "step": 5579, + "time_per_iteration": 2.50080943107605 + }, + { + "auxiliary_loss_clip": 0.06478497, + "auxiliary_loss_mlp": 0.01282646, + "balance_loss_clip": 0.06290549, + "balance_loss_mlp": 0.01266898, + "epoch": 0.3354877498872689, + "flos": 17864389681920.0, + "grad_norm": 2.052679713623706, + "language_loss": 0.81401342, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.89162487, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15734863, + "step": 5580, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.06473458, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 0.06284851, + "balance_loss_mlp": 0.01263691, + "epoch": 0.3355478731399369, + "flos": 24724703341440.0, + "grad_norm": 1.6024353673136869, + "language_loss": 0.78190315, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.85943961, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.16485596, + "step": 5581, + "time_per_iteration": 2.539208173751831 + }, + { + "auxiliary_loss_clip": 0.06482114, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06289735, + "balance_loss_mlp": 0.01259084, + "epoch": 0.33560799639260486, + "flos": 16879628972160.0, + "grad_norm": 2.359779356701633, + "language_loss": 0.74923486, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.8268224, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.17565918, + "step": 5582, + "time_per_iteration": 2.5489563941955566 + }, + { + "auxiliary_loss_clip": 0.06478906, + "auxiliary_loss_mlp": 0.01276582, + "balance_loss_clip": 0.06287926, + "balance_loss_mlp": 0.01260191, + "epoch": 0.3356681196452728, + "flos": 18339530659200.0, + "grad_norm": 1.5985505462491367, + "language_loss": 0.82591236, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90346718, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.16369629, + "step": 5583, + "time_per_iteration": 2.4985439777374268 + }, + { + "auxiliary_loss_clip": 0.06466989, + "auxiliary_loss_mlp": 0.01276424, + "balance_loss_clip": 0.06282677, + "balance_loss_mlp": 0.01260664, + "epoch": 0.3357282428979408, + "flos": 34759127116800.0, + "grad_norm": 1.8261350586664176, + "language_loss": 0.77844834, + "learning_rate": 3.097034711451581e-06, + "loss": 0.85588253, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15771484, + "step": 5584, + "time_per_iteration": 2.649090051651001 + }, + { + "auxiliary_loss_clip": 0.06475179, + "auxiliary_loss_mlp": 0.01274752, + "balance_loss_clip": 0.06285385, + "balance_loss_mlp": 0.01259427, + "epoch": 0.33578836615060875, + "flos": 21586539427200.0, + "grad_norm": 1.6814695059799305, + "language_loss": 0.76339197, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84089124, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.15313721, + "step": 5585, + "time_per_iteration": 5.408076763153076 + }, + { + "auxiliary_loss_clip": 0.06463687, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.0126054, + "epoch": 0.3358484894032767, + "flos": 24536377290240.0, + "grad_norm": 1.7085225722674646, + "language_loss": 0.78121984, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.85862964, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16760254, + "step": 5586, + "time_per_iteration": 2.5785536766052246 + }, + { + "auxiliary_loss_clip": 0.06482486, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06290784, + "balance_loss_mlp": 0.01254902, + "epoch": 0.3359086126559447, + "flos": 22462161793920.0, + "grad_norm": 1.9607494340110725, + "language_loss": 0.81952178, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.89705908, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.16357422, + "step": 5587, + "time_per_iteration": 3.9456732273101807 + }, + { + "auxiliary_loss_clip": 0.06460288, + "auxiliary_loss_mlp": 0.01274939, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01259692, + "epoch": 0.33596873590861265, + "flos": 16549069415040.0, + "grad_norm": 1.7386991231776667, + "language_loss": 0.67118108, + "learning_rate": 3.095731802118677e-06, + "loss": 0.74853337, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 5588, + "time_per_iteration": 2.6328773498535156 + }, + { + "auxiliary_loss_clip": 0.06471635, + "auxiliary_loss_mlp": 0.01272286, + "balance_loss_clip": 0.0628484, + "balance_loss_mlp": 0.01255215, + "epoch": 0.3360288591612806, + "flos": 31183864778880.0, + "grad_norm": 2.547244730124186, + "language_loss": 0.70319438, + "learning_rate": 3.095405970878919e-06, + "loss": 0.78063357, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17077637, + "step": 5589, + "time_per_iteration": 2.631972074508667 + }, + { + "auxiliary_loss_clip": 0.06473772, + "auxiliary_loss_mlp": 0.01270331, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3360889824139486, + "flos": 23703828721920.0, + "grad_norm": 1.7722032929069027, + "language_loss": 0.67818141, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.75562239, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15258789, + "step": 5590, + "time_per_iteration": 2.582160711288452 + }, + { + "auxiliary_loss_clip": 0.0646477, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 0.06283349, + "balance_loss_mlp": 0.01257972, + "epoch": 0.33614910566661654, + "flos": 19324207514880.0, + "grad_norm": 1.8733623292805037, + "language_loss": 0.73821473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.81559563, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15344238, + "step": 5591, + "time_per_iteration": 2.5325355529785156 + }, + { + "auxiliary_loss_clip": 0.06462986, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06280106, + "balance_loss_mlp": 0.01254945, + "epoch": 0.3362092289192845, + "flos": 16477889771520.0, + "grad_norm": 3.0838875929044036, + "language_loss": 0.70195794, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.77929366, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15637207, + "step": 5592, + "time_per_iteration": 3.919609546661377 + }, + { + "auxiliary_loss_clip": 0.06466913, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06283789, + "balance_loss_mlp": 0.01257014, + "epoch": 0.33626935217195253, + "flos": 24250484759040.0, + "grad_norm": 2.017741256836838, + "language_loss": 0.76621854, + "learning_rate": 3.094102230664423e-06, + "loss": 0.8436048, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14697266, + "step": 5593, + "time_per_iteration": 2.582902431488037 + }, + { + "auxiliary_loss_clip": 0.06476289, + "auxiliary_loss_mlp": 0.01272909, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3363294754246205, + "flos": 19724814685440.0, + "grad_norm": 3.212319882003512, + "language_loss": 0.72710228, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80459422, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.17456055, + "step": 5594, + "time_per_iteration": 2.495196580886841 + }, + { + "auxiliary_loss_clip": 0.06477273, + "auxiliary_loss_mlp": 0.01272377, + "balance_loss_clip": 0.06289684, + "balance_loss_mlp": 0.01256379, + "epoch": 0.33638959867728846, + "flos": 22602005458560.0, + "grad_norm": 1.7565144487218112, + "language_loss": 0.8009572, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.87845373, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16003418, + "step": 5595, + "time_per_iteration": 2.5639891624450684 + }, + { + "auxiliary_loss_clip": 0.06468762, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06285411, + "balance_loss_mlp": 0.01256691, + "epoch": 0.3364497219299564, + "flos": 21000834587520.0, + "grad_norm": 1.6187307873664143, + "language_loss": 0.81718135, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.89458185, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.14587402, + "step": 5596, + "time_per_iteration": 2.579089403152466 + }, + { + "auxiliary_loss_clip": 0.06470582, + "auxiliary_loss_mlp": 0.01270351, + "balance_loss_clip": 0.06285384, + "balance_loss_mlp": 0.01256034, + "epoch": 0.3365098451826244, + "flos": 25235664739200.0, + "grad_norm": 1.5539796133352632, + "language_loss": 0.76225436, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.83966368, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.14318848, + "step": 5597, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.06473622, + "auxiliary_loss_mlp": 0.01271725, + "balance_loss_clip": 0.06290761, + "balance_loss_mlp": 0.01257206, + "epoch": 0.33656996843529235, + "flos": 24578612547840.0, + "grad_norm": 1.67554812607641, + "language_loss": 0.78886169, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86631513, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14520264, + "step": 5598, + "time_per_iteration": 2.54971981048584 + }, + { + "auxiliary_loss_clip": 0.06487022, + "auxiliary_loss_mlp": 0.01275679, + "balance_loss_clip": 0.0629402, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3366300916879603, + "flos": 44101223331840.0, + "grad_norm": 1.966389459711274, + "language_loss": 0.64792764, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.7255547, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.16326904, + "step": 5599, + "time_per_iteration": 2.741544723510742 + }, + { + "auxiliary_loss_clip": 0.06483869, + "auxiliary_loss_mlp": 0.01276046, + "balance_loss_clip": 0.06290758, + "balance_loss_mlp": 0.01259118, + "epoch": 0.3366902149406283, + "flos": 13884746739840.0, + "grad_norm": 2.857086104177812, + "language_loss": 0.82787466, + "learning_rate": 3.091819088459249e-06, + "loss": 0.90547383, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.16906738, + "step": 5600, + "time_per_iteration": 2.4761526584625244 + }, + { + "auxiliary_loss_clip": 0.06480727, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289887, + "balance_loss_mlp": 0.01257255, + "epoch": 0.33675033819329625, + "flos": 16258648763520.0, + "grad_norm": 2.1921833677853853, + "language_loss": 0.83268821, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.91022456, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15649414, + "step": 5601, + "time_per_iteration": 2.5205788612365723 + }, + { + "auxiliary_loss_clip": 0.06469133, + "auxiliary_loss_mlp": 0.01269312, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01255382, + "epoch": 0.3368104614459642, + "flos": 17061498259200.0, + "grad_norm": 1.6270640398275205, + "language_loss": 0.83791035, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91529477, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.1394043, + "step": 5602, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.06479525, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06294133, + "balance_loss_mlp": 0.01258645, + "epoch": 0.3368705846986322, + "flos": 17864473536000.0, + "grad_norm": 2.666791314538914, + "language_loss": 0.69934028, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77687562, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15380859, + "step": 5603, + "time_per_iteration": 2.5512561798095703 + }, + { + "auxiliary_loss_clip": 0.0648806, + "auxiliary_loss_mlp": 0.01271029, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01255376, + "epoch": 0.33693070795130015, + "flos": 22936086887040.0, + "grad_norm": 1.5393691582180518, + "language_loss": 0.83336604, + "learning_rate": 3.090513524656898e-06, + "loss": 0.91095686, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.15661621, + "step": 5604, + "time_per_iteration": 2.542419910430908 + }, + { + "auxiliary_loss_clip": 0.06487563, + "auxiliary_loss_mlp": 0.01271201, + "balance_loss_clip": 0.06296179, + "balance_loss_mlp": 0.01255, + "epoch": 0.3369908312039681, + "flos": 22023889413120.0, + "grad_norm": 1.7290560496085086, + "language_loss": 0.74166059, + "learning_rate": 3.090187030294409e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.1619873, + "step": 5605, + "time_per_iteration": 2.551250696182251 + }, + { + "auxiliary_loss_clip": 0.0648852, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 0.06295876, + "balance_loss_mlp": 0.01253347, + "epoch": 0.33705095445663613, + "flos": 11806799736960.0, + "grad_norm": 2.683910051705504, + "language_loss": 0.84068418, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91825807, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.15515137, + "step": 5606, + "time_per_iteration": 2.4841489791870117 + }, + { + "auxiliary_loss_clip": 0.0647673, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254721, + "epoch": 0.3371110777093041, + "flos": 25053460035840.0, + "grad_norm": 1.669780314791874, + "language_loss": 0.68210214, + "learning_rate": 3.089533917561809e-06, + "loss": 0.7595638, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.14709473, + "step": 5607, + "time_per_iteration": 2.6018009185791016 + }, + { + "auxiliary_loss_clip": 0.0648887, + "auxiliary_loss_mlp": 0.01274582, + "balance_loss_clip": 0.06295381, + "balance_loss_mlp": 0.01258131, + "epoch": 0.33717120096197206, + "flos": 26586386156160.0, + "grad_norm": 1.643709475435958, + "language_loss": 0.71566343, + "learning_rate": 3.089207299216464e-06, + "loss": 0.79329789, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5608, + "time_per_iteration": 2.5980639457702637 + }, + { + "auxiliary_loss_clip": 0.06479236, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 0.06291037, + "balance_loss_mlp": 0.01258712, + "epoch": 0.33723132421464, + "flos": 15163911169920.0, + "grad_norm": 1.8781248289320855, + "language_loss": 0.79662472, + "learning_rate": 3.088880639568269e-06, + "loss": 0.87416643, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16223145, + "step": 5609, + "time_per_iteration": 2.6196935176849365 + }, + { + "auxiliary_loss_clip": 0.06480544, + "auxiliary_loss_mlp": 0.01274048, + "balance_loss_clip": 0.06290779, + "balance_loss_mlp": 0.01256262, + "epoch": 0.337291447467308, + "flos": 23442058967040.0, + "grad_norm": 1.7293742366408622, + "language_loss": 0.83075953, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.90830547, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17785645, + "step": 5610, + "time_per_iteration": 2.53485369682312 + }, + { + "auxiliary_loss_clip": 0.06471263, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06288794, + "balance_loss_mlp": 0.01254097, + "epoch": 0.33735157071997596, + "flos": 17243870670720.0, + "grad_norm": 1.916021570377688, + "language_loss": 0.82657987, + "learning_rate": 3.088227196412879e-06, + "loss": 0.90398765, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1541748, + "step": 5611, + "time_per_iteration": 2.5164084434509277 + }, + { + "auxiliary_loss_clip": 0.06478009, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.0629037, + "balance_loss_mlp": 0.01260005, + "epoch": 0.3374116939726439, + "flos": 28265025726720.0, + "grad_norm": 3.0042840390827106, + "language_loss": 0.79815799, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.87571925, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.18084717, + "step": 5612, + "time_per_iteration": 2.582742929458618 + }, + { + "auxiliary_loss_clip": 0.06476334, + "auxiliary_loss_mlp": 0.0127707, + "balance_loss_clip": 0.06288031, + "balance_loss_mlp": 0.01261597, + "epoch": 0.3374718172253119, + "flos": 35928314663040.0, + "grad_norm": 2.3711016444568003, + "language_loss": 0.69757682, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7751109, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15466309, + "step": 5613, + "time_per_iteration": 2.6553308963775635 + }, + { + "auxiliary_loss_clip": 0.06477948, + "auxiliary_loss_mlp": 0.01274833, + "balance_loss_clip": 0.06288674, + "balance_loss_mlp": 0.01259181, + "epoch": 0.33753194047797985, + "flos": 18192517470720.0, + "grad_norm": 1.7341744507496721, + "language_loss": 0.80043244, + "learning_rate": 3.087246722218144e-06, + "loss": 0.87796032, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15673828, + "step": 5614, + "time_per_iteration": 2.5162055492401123 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01274123, + "balance_loss_clip": 0.06289384, + "balance_loss_mlp": 0.01257684, + "epoch": 0.3375920637306478, + "flos": 23155621384320.0, + "grad_norm": 1.8737965791301845, + "language_loss": 0.91138643, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98892087, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16430664, + "step": 5615, + "time_per_iteration": 2.5491819381713867 + }, + { + "auxiliary_loss_clip": 0.0646698, + "auxiliary_loss_mlp": 0.01277747, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01261857, + "epoch": 0.3376521869833158, + "flos": 23118878568960.0, + "grad_norm": 1.8899714235087088, + "language_loss": 0.81227732, + "learning_rate": 3.086592866591809e-06, + "loss": 0.88972461, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.15881348, + "step": 5616, + "time_per_iteration": 2.551891803741455 + }, + { + "auxiliary_loss_clip": 0.0647929, + "auxiliary_loss_mlp": 0.01281624, + "balance_loss_clip": 0.06285349, + "balance_loss_mlp": 0.01263576, + "epoch": 0.33771231023598375, + "flos": 19279498561920.0, + "grad_norm": 1.7280186066143421, + "language_loss": 0.84097004, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91857922, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.18054199, + "step": 5617, + "time_per_iteration": 2.532703161239624 + }, + { + "auxiliary_loss_clip": 0.06466082, + "auxiliary_loss_mlp": 0.01273548, + "balance_loss_clip": 0.06279126, + "balance_loss_mlp": 0.01257073, + "epoch": 0.3377724334886517, + "flos": 18156026217600.0, + "grad_norm": 1.631465963150073, + "language_loss": 0.80857313, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8859694, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.16467285, + "step": 5618, + "time_per_iteration": 2.5592081546783447 + }, + { + "auxiliary_loss_clip": 0.06473768, + "auxiliary_loss_mlp": 0.01275311, + "balance_loss_clip": 0.06286047, + "balance_loss_mlp": 0.01258514, + "epoch": 0.3378325567413197, + "flos": 25783159317120.0, + "grad_norm": 2.0305417192076267, + "language_loss": 0.71181929, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7893101, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16809082, + "step": 5619, + "time_per_iteration": 2.5726358890533447 + }, + { + "auxiliary_loss_clip": 0.06476114, + "auxiliary_loss_mlp": 0.01271613, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01256688, + "epoch": 0.3378926799939877, + "flos": 21322254049920.0, + "grad_norm": 2.6280659122339496, + "language_loss": 0.70615005, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78362733, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.14929199, + "step": 5620, + "time_per_iteration": 2.604161500930786 + }, + { + "auxiliary_loss_clip": 0.06467394, + "auxiliary_loss_mlp": 0.0127348, + "balance_loss_clip": 0.0628472, + "balance_loss_mlp": 0.01258054, + "epoch": 0.33795280324665566, + "flos": 24906991898880.0, + "grad_norm": 2.3940060195146384, + "language_loss": 0.6847257, + "learning_rate": 3.084957506678058e-06, + "loss": 0.76213443, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1541748, + "step": 5621, + "time_per_iteration": 2.559730052947998 + }, + { + "auxiliary_loss_clip": 0.06469798, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06287812, + "balance_loss_mlp": 0.0125914, + "epoch": 0.33801292649932363, + "flos": 24760859178240.0, + "grad_norm": 1.8671152624425502, + "language_loss": 0.82685888, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.90429658, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.1484375, + "step": 5622, + "time_per_iteration": 2.5722928047180176 + }, + { + "auxiliary_loss_clip": 0.06466316, + "auxiliary_loss_mlp": 0.01274625, + "balance_loss_clip": 0.06284748, + "balance_loss_mlp": 0.01260564, + "epoch": 0.3380730497519916, + "flos": 26731177211520.0, + "grad_norm": 1.4865849557607265, + "language_loss": 0.74114043, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.81854987, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14056396, + "step": 5623, + "time_per_iteration": 2.5830907821655273 + }, + { + "auxiliary_loss_clip": 0.06389539, + "auxiliary_loss_mlp": 0.01273334, + "balance_loss_clip": 0.06299451, + "balance_loss_mlp": 0.01265943, + "epoch": 0.33813317300465956, + "flos": 70056845550720.0, + "grad_norm": 0.7132848624035326, + "language_loss": 0.54856884, + "learning_rate": 3.083975796930215e-06, + "loss": 0.62519753, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.07373047, + "step": 5624, + "time_per_iteration": 4.680114030838013 + }, + { + "auxiliary_loss_clip": 0.06475174, + "auxiliary_loss_mlp": 0.01272775, + "balance_loss_clip": 0.06285602, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3381932962573275, + "flos": 24104142403200.0, + "grad_norm": 3.6042241236842267, + "language_loss": 0.73496938, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81244886, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16687012, + "step": 5625, + "time_per_iteration": 4.002846956253052 + }, + { + "auxiliary_loss_clip": 0.06480759, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06288841, + "balance_loss_mlp": 0.01257021, + "epoch": 0.3382534195099955, + "flos": 19283775120000.0, + "grad_norm": 1.9831743515273117, + "language_loss": 0.7176404, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.79519677, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17858887, + "step": 5626, + "time_per_iteration": 2.4999427795410156 + }, + { + "auxiliary_loss_clip": 0.06468458, + "auxiliary_loss_mlp": 0.01272986, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01257739, + "epoch": 0.33831354276266346, + "flos": 25232897554560.0, + "grad_norm": 2.987617225478933, + "language_loss": 0.81275499, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.8901695, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15246582, + "step": 5627, + "time_per_iteration": 3.951984405517578 + }, + { + "auxiliary_loss_clip": 0.06478465, + "auxiliary_loss_mlp": 0.01272976, + "balance_loss_clip": 0.06288861, + "balance_loss_mlp": 0.0125668, + "epoch": 0.3383736660153314, + "flos": 23118627006720.0, + "grad_norm": 1.844905449272807, + "language_loss": 0.80405974, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.88157415, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16296387, + "step": 5628, + "time_per_iteration": 2.5670697689056396 + }, + { + "auxiliary_loss_clip": 0.06477988, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06288996, + "balance_loss_mlp": 0.0125457, + "epoch": 0.3384337892679994, + "flos": 23483874954240.0, + "grad_norm": 2.662319374226008, + "language_loss": 0.77757806, + "learning_rate": 3.082338792093254e-06, + "loss": 0.85506529, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16174316, + "step": 5629, + "time_per_iteration": 2.5463128089904785 + }, + { + "auxiliary_loss_clip": 0.06482605, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06291752, + "balance_loss_mlp": 0.01262758, + "epoch": 0.33849391252066735, + "flos": 19431626849280.0, + "grad_norm": 1.826421419331283, + "language_loss": 0.85789764, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.9355278, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.17663574, + "step": 5630, + "time_per_iteration": 2.5818262100219727 + }, + { + "auxiliary_loss_clip": 0.06476109, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06290477, + "balance_loss_mlp": 0.01260073, + "epoch": 0.3385540357733353, + "flos": 21070462930560.0, + "grad_norm": 2.179516256809373, + "language_loss": 0.72520673, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80271661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.14807129, + "step": 5631, + "time_per_iteration": 3.9340498447418213 + }, + { + "auxiliary_loss_clip": 0.06388511, + "auxiliary_loss_mlp": 0.01280567, + "balance_loss_clip": 0.06298131, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3386141590260033, + "flos": 69224772908160.0, + "grad_norm": 0.8339652565495183, + "language_loss": 0.56105018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.63774097, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.08361816, + "step": 5632, + "time_per_iteration": 3.215395450592041 + }, + { + "auxiliary_loss_clip": 0.06477562, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 0.06290288, + "balance_loss_mlp": 0.01256573, + "epoch": 0.3386742822786713, + "flos": 25526420807040.0, + "grad_norm": 3.459768837753136, + "language_loss": 0.81030583, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.88779831, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15112305, + "step": 5633, + "time_per_iteration": 2.6278936862945557 + }, + { + "auxiliary_loss_clip": 0.06473435, + "auxiliary_loss_mlp": 0.01274796, + "balance_loss_clip": 0.06287597, + "balance_loss_mlp": 0.01258942, + "epoch": 0.33873440553133927, + "flos": 23629881893760.0, + "grad_norm": 2.634738846372382, + "language_loss": 0.59410667, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.67158902, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5634, + "time_per_iteration": 2.565622091293335 + }, + { + "auxiliary_loss_clip": 0.06475686, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.01255216, + "epoch": 0.33879452878400723, + "flos": 17094006443520.0, + "grad_norm": 1.81394172090833, + "language_loss": 0.92877531, + "learning_rate": 3.080373032026589e-06, + "loss": 1.00623596, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15161133, + "step": 5635, + "time_per_iteration": 2.539051055908203 + }, + { + "auxiliary_loss_clip": 0.06470082, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 0.0629005, + "balance_loss_mlp": 0.01257457, + "epoch": 0.3388546520366752, + "flos": 15747477730560.0, + "grad_norm": 1.8703432540182672, + "language_loss": 0.75823128, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.83566296, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15618896, + "step": 5636, + "time_per_iteration": 2.4998726844787598 + }, + { + "auxiliary_loss_clip": 0.064714, + "auxiliary_loss_mlp": 0.01275037, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.01258848, + "epoch": 0.33891477528934316, + "flos": 22425251270400.0, + "grad_norm": 1.6981405891584176, + "language_loss": 0.83775222, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91521657, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1619873, + "step": 5637, + "time_per_iteration": 2.551074981689453 + }, + { + "auxiliary_loss_clip": 0.06474115, + "auxiliary_loss_mlp": 0.01272331, + "balance_loss_clip": 0.06286962, + "balance_loss_mlp": 0.01254736, + "epoch": 0.3389748985420111, + "flos": 17280571559040.0, + "grad_norm": 1.787045955061502, + "language_loss": 0.70609659, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78356105, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.17590332, + "step": 5638, + "time_per_iteration": 2.5479955673217773 + }, + { + "auxiliary_loss_clip": 0.06478329, + "auxiliary_loss_mlp": 0.01289332, + "balance_loss_clip": 0.06293231, + "balance_loss_mlp": 0.01272404, + "epoch": 0.3390350217946791, + "flos": 27752261466240.0, + "grad_norm": 1.7018866339003167, + "language_loss": 0.81276166, + "learning_rate": 3.079061705792765e-06, + "loss": 0.89043832, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16931152, + "step": 5639, + "time_per_iteration": 2.614819288253784 + }, + { + "auxiliary_loss_clip": 0.06487049, + "auxiliary_loss_mlp": 0.01288743, + "balance_loss_clip": 0.06296147, + "balance_loss_mlp": 0.01270635, + "epoch": 0.33909514504734706, + "flos": 20346088383360.0, + "grad_norm": 6.449374256721531, + "language_loss": 0.68149316, + "learning_rate": 3.078733771907907e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.18103027, + "step": 5640, + "time_per_iteration": 2.496300220489502 + }, + { + "auxiliary_loss_clip": 0.06471096, + "auxiliary_loss_mlp": 0.01277542, + "balance_loss_clip": 0.06286727, + "balance_loss_mlp": 0.0125978, + "epoch": 0.339155268300015, + "flos": 14835322183680.0, + "grad_norm": 1.7549267997867504, + "language_loss": 0.70165765, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77914405, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.1776123, + "step": 5641, + "time_per_iteration": 2.524548053741455 + }, + { + "auxiliary_loss_clip": 0.0647646, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01264611, + "epoch": 0.339215391552683, + "flos": 26075173196160.0, + "grad_norm": 2.2643311920206592, + "language_loss": 0.88204467, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95961982, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16430664, + "step": 5642, + "time_per_iteration": 2.551790237426758 + }, + { + "auxiliary_loss_clip": 0.06466684, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06289211, + "balance_loss_mlp": 0.01258195, + "epoch": 0.33927551480535095, + "flos": 14579967265920.0, + "grad_norm": 2.023061860440481, + "language_loss": 0.84285331, + "learning_rate": 3.077749724868924e-06, + "loss": 0.92024505, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1428833, + "step": 5643, + "time_per_iteration": 2.542921304702759 + }, + { + "auxiliary_loss_clip": 0.06468654, + "auxiliary_loss_mlp": 0.01272873, + "balance_loss_clip": 0.06285787, + "balance_loss_mlp": 0.01256708, + "epoch": 0.3393356380580189, + "flos": 23812380086400.0, + "grad_norm": 6.736940029896959, + "language_loss": 0.77634799, + "learning_rate": 3.077421627435922e-06, + "loss": 0.85376322, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.16162109, + "step": 5644, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.06472027, + "auxiliary_loss_mlp": 0.01274584, + "balance_loss_clip": 0.06288091, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3393957613106869, + "flos": 17353637919360.0, + "grad_norm": 2.9654561398927752, + "language_loss": 0.6324017, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.70986784, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15856934, + "step": 5645, + "time_per_iteration": 2.51273775100708 + }, + { + "auxiliary_loss_clip": 0.06466414, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256284, + "epoch": 0.3394558845633549, + "flos": 28440647884800.0, + "grad_norm": 2.089100449350665, + "language_loss": 0.77295536, + "learning_rate": 3.076765310014552e-06, + "loss": 0.8503449, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16247559, + "step": 5646, + "time_per_iteration": 2.5461859703063965 + }, + { + "auxiliary_loss_clip": 0.06477356, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01257568, + "epoch": 0.33951600781602287, + "flos": 22092804996480.0, + "grad_norm": 2.533529984962848, + "language_loss": 0.79702288, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87454283, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17077637, + "step": 5647, + "time_per_iteration": 2.5699684619903564 + }, + { + "auxiliary_loss_clip": 0.0647471, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06288452, + "balance_loss_mlp": 0.01256067, + "epoch": 0.33957613106869083, + "flos": 23885027176320.0, + "grad_norm": 2.1454269075726535, + "language_loss": 0.78001738, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.85749137, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16625977, + "step": 5648, + "time_per_iteration": 2.5294926166534424 + }, + { + "auxiliary_loss_clip": 0.063921, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0630298, + "balance_loss_mlp": 0.01254759, + "epoch": 0.3396362543213588, + "flos": 71264411066880.0, + "grad_norm": 0.7604552176896413, + "language_loss": 0.56109136, + "learning_rate": 3.075780527680754e-06, + "loss": 0.63763207, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.07196045, + "step": 5649, + "time_per_iteration": 3.2003703117370605 + }, + { + "auxiliary_loss_clip": 0.06473398, + "auxiliary_loss_mlp": 0.01280094, + "balance_loss_clip": 0.06287606, + "balance_loss_mlp": 0.01263274, + "epoch": 0.33969637757402676, + "flos": 25928746986240.0, + "grad_norm": 1.4812234353432667, + "language_loss": 0.85783911, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.93537402, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.16821289, + "step": 5650, + "time_per_iteration": 2.551633834838867 + }, + { + "auxiliary_loss_clip": 0.06475022, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06292272, + "balance_loss_mlp": 0.01261841, + "epoch": 0.33975650082669473, + "flos": 35270382003840.0, + "grad_norm": 3.382903843955623, + "language_loss": 0.71404934, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.79157567, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15759277, + "step": 5651, + "time_per_iteration": 2.665083885192871 + }, + { + "auxiliary_loss_clip": 0.06471914, + "auxiliary_loss_mlp": 0.01278706, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261922, + "epoch": 0.3398166240793627, + "flos": 16651373650560.0, + "grad_norm": 4.478617872089092, + "language_loss": 0.81850624, + "learning_rate": 3.074795378203616e-06, + "loss": 0.89601243, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16772461, + "step": 5652, + "time_per_iteration": 2.5136160850524902 + }, + { + "auxiliary_loss_clip": 0.06483054, + "auxiliary_loss_mlp": 0.01281024, + "balance_loss_clip": 0.06293614, + "balance_loss_mlp": 0.0126344, + "epoch": 0.33987674733203066, + "flos": 24069244377600.0, + "grad_norm": 3.0225456344203088, + "language_loss": 0.77707815, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.85471892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.17590332, + "step": 5653, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.06478614, + "auxiliary_loss_mlp": 0.01275428, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01259788, + "epoch": 0.3399368705846986, + "flos": 13253955603840.0, + "grad_norm": 4.6454995512067745, + "language_loss": 0.86809218, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.94563264, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15625, + "step": 5654, + "time_per_iteration": 2.4661965370178223 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01267584, + "epoch": 0.3399969938373666, + "flos": 27019585365120.0, + "grad_norm": 2.782601809339298, + "language_loss": 0.65974486, + "learning_rate": 3.073809861919351e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16369629, + "step": 5655, + "time_per_iteration": 2.555647611618042 + }, + { + "auxiliary_loss_clip": 0.06478781, + "auxiliary_loss_mlp": 0.01275484, + "balance_loss_clip": 0.06293027, + "balance_loss_mlp": 0.01259558, + "epoch": 0.34005711709003456, + "flos": 28557920073600.0, + "grad_norm": 1.4106761603755547, + "language_loss": 0.76612461, + "learning_rate": 3.073481275036697e-06, + "loss": 0.84366733, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15917969, + "step": 5656, + "time_per_iteration": 2.644866466522217 + }, + { + "auxiliary_loss_clip": 0.06484362, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06293096, + "balance_loss_mlp": 0.01260436, + "epoch": 0.3401172403427025, + "flos": 21623533804800.0, + "grad_norm": 1.950261924987131, + "language_loss": 0.83422613, + "learning_rate": 3.073152647447525e-06, + "loss": 0.9118408, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16674805, + "step": 5657, + "time_per_iteration": 2.701688051223755 + }, + { + "auxiliary_loss_clip": 0.06477939, + "auxiliary_loss_mlp": 0.01276671, + "balance_loss_clip": 0.06292981, + "balance_loss_mlp": 0.01259851, + "epoch": 0.3401773635953705, + "flos": 25893010419840.0, + "grad_norm": 5.064784702806917, + "language_loss": 0.86277437, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.94032043, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.16833496, + "step": 5658, + "time_per_iteration": 2.622107744216919 + }, + { + "auxiliary_loss_clip": 0.06400045, + "auxiliary_loss_mlp": 0.01275632, + "balance_loss_clip": 0.06310016, + "balance_loss_mlp": 0.01268671, + "epoch": 0.3402374868480385, + "flos": 65527737459840.0, + "grad_norm": 0.8082747939523138, + "language_loss": 0.59960568, + "learning_rate": 3.072495270199477e-06, + "loss": 0.67636251, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.06970215, + "step": 5659, + "time_per_iteration": 3.1002566814422607 + }, + { + "auxiliary_loss_clip": 0.0647618, + "auxiliary_loss_mlp": 0.01281423, + "balance_loss_clip": 0.06294397, + "balance_loss_mlp": 0.01264591, + "epoch": 0.34029761010070647, + "flos": 24067357660800.0, + "grad_norm": 2.7764582815625514, + "language_loss": 0.68693221, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76450825, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16821289, + "step": 5660, + "time_per_iteration": 2.620135545730591 + }, + { + "auxiliary_loss_clip": 0.06473149, + "auxiliary_loss_mlp": 0.01278369, + "balance_loss_clip": 0.06289428, + "balance_loss_mlp": 0.01262157, + "epoch": 0.34035773335337444, + "flos": 27607093067520.0, + "grad_norm": 2.0682817387265477, + "language_loss": 0.6727913, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16210938, + "step": 5661, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06469939, + "auxiliary_loss_mlp": 0.01280149, + "balance_loss_clip": 0.06289508, + "balance_loss_mlp": 0.01264175, + "epoch": 0.3404178566060424, + "flos": 20818923373440.0, + "grad_norm": 1.802665197928241, + "language_loss": 0.79380333, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87130427, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15966797, + "step": 5662, + "time_per_iteration": 2.552755832672119 + }, + { + "auxiliary_loss_clip": 0.06474, + "auxiliary_loss_mlp": 0.01278156, + "balance_loss_clip": 0.06290844, + "balance_loss_mlp": 0.01260454, + "epoch": 0.34047797985871037, + "flos": 26840818679040.0, + "grad_norm": 2.1558050020889894, + "language_loss": 0.73809367, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.8156153, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.17700195, + "step": 5663, + "time_per_iteration": 2.5490622520446777 + }, + { + "auxiliary_loss_clip": 0.06470126, + "auxiliary_loss_mlp": 0.01281986, + "balance_loss_clip": 0.06290488, + "balance_loss_mlp": 0.01265714, + "epoch": 0.34053810311137833, + "flos": 19688742702720.0, + "grad_norm": 1.852400144955729, + "language_loss": 0.86839676, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.94591784, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16259766, + "step": 5664, + "time_per_iteration": 5.419060707092285 + }, + { + "auxiliary_loss_clip": 0.06483276, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06295361, + "balance_loss_mlp": 0.01257423, + "epoch": 0.3405982263640463, + "flos": 21732169023360.0, + "grad_norm": 1.8640809787797845, + "language_loss": 0.69509971, + "learning_rate": 3.070522162795235e-06, + "loss": 0.77267611, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16943359, + "step": 5665, + "time_per_iteration": 2.547194719314575 + }, + { + "auxiliary_loss_clip": 0.06482168, + "auxiliary_loss_mlp": 0.01274659, + "balance_loss_clip": 0.0629427, + "balance_loss_mlp": 0.01257648, + "epoch": 0.34065834961671426, + "flos": 18047600634240.0, + "grad_norm": 2.6257214905883237, + "language_loss": 0.73526829, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.81283653, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.17016602, + "step": 5666, + "time_per_iteration": 2.527994155883789 + }, + { + "auxiliary_loss_clip": 0.06482688, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255373, + "epoch": 0.3407184728693822, + "flos": 21403705818240.0, + "grad_norm": 1.661941695135435, + "language_loss": 0.74005675, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.81760579, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.1685791, + "step": 5667, + "time_per_iteration": 4.029574155807495 + }, + { + "auxiliary_loss_clip": 0.06378959, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06290369, + "balance_loss_mlp": 0.01260898, + "epoch": 0.3407785961220502, + "flos": 68709352515840.0, + "grad_norm": 0.8062084259911544, + "language_loss": 0.63318539, + "learning_rate": 3.069535060901597e-06, + "loss": 0.70965815, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.07397461, + "step": 5668, + "time_per_iteration": 3.3641560077667236 + }, + { + "auxiliary_loss_clip": 0.06472414, + "auxiliary_loss_mlp": 0.01272754, + "balance_loss_clip": 0.0628752, + "balance_loss_mlp": 0.01256863, + "epoch": 0.34083871937471816, + "flos": 14069634773760.0, + "grad_norm": 2.007810831329869, + "language_loss": 0.73127198, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.80872369, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15893555, + "step": 5669, + "time_per_iteration": 2.4918038845062256 + }, + { + "auxiliary_loss_clip": 0.06479842, + "auxiliary_loss_mlp": 0.0127954, + "balance_loss_clip": 0.06292197, + "balance_loss_mlp": 0.01263423, + "epoch": 0.3408988426273861, + "flos": 17089981447680.0, + "grad_norm": 2.0642744441347287, + "language_loss": 0.80626565, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.88385952, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5670, + "time_per_iteration": 2.5270040035247803 + }, + { + "auxiliary_loss_clip": 0.06481062, + "auxiliary_loss_mlp": 0.01275164, + "balance_loss_clip": 0.06291522, + "balance_loss_mlp": 0.0125838, + "epoch": 0.3409589658800541, + "flos": 24031411459200.0, + "grad_norm": 1.863009265742361, + "language_loss": 0.77916187, + "learning_rate": 3.068547593996078e-06, + "loss": 0.85672414, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16784668, + "step": 5671, + "time_per_iteration": 4.039815664291382 + }, + { + "auxiliary_loss_clip": 0.06473973, + "auxiliary_loss_mlp": 0.01276984, + "balance_loss_clip": 0.06289308, + "balance_loss_mlp": 0.01260712, + "epoch": 0.34101908913272205, + "flos": 21148350900480.0, + "grad_norm": 1.9142883162018633, + "language_loss": 0.74626315, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.82377267, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16259766, + "step": 5672, + "time_per_iteration": 2.564887762069702 + }, + { + "auxiliary_loss_clip": 0.06475951, + "auxiliary_loss_mlp": 0.01275656, + "balance_loss_clip": 0.06287946, + "balance_loss_mlp": 0.01259265, + "epoch": 0.3410792123853901, + "flos": 15706835700480.0, + "grad_norm": 1.714309741158987, + "language_loss": 0.73791027, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81542635, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16394043, + "step": 5673, + "time_per_iteration": 2.540194511413574 + }, + { + "auxiliary_loss_clip": 0.06466323, + "auxiliary_loss_mlp": 0.01283225, + "balance_loss_clip": 0.06284231, + "balance_loss_mlp": 0.01266584, + "epoch": 0.34113933563805804, + "flos": 23042122629120.0, + "grad_norm": 1.8379615104267257, + "language_loss": 0.7978701, + "learning_rate": 3.067559762415682e-06, + "loss": 0.87536556, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16638184, + "step": 5674, + "time_per_iteration": 2.5462148189544678 + }, + { + "auxiliary_loss_clip": 0.06364837, + "auxiliary_loss_mlp": 0.01262017, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255442, + "epoch": 0.341199458890726, + "flos": 69631878769920.0, + "grad_norm": 0.7752872762952348, + "language_loss": 0.56147063, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.63773918, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.06585693, + "step": 5675, + "time_per_iteration": 3.370281457901001 + }, + { + "auxiliary_loss_clip": 0.0645988, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06281768, + "balance_loss_mlp": 0.01257398, + "epoch": 0.34125958214339397, + "flos": 22352939596800.0, + "grad_norm": 2.600205708544321, + "language_loss": 0.79689062, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.87422335, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16003418, + "step": 5676, + "time_per_iteration": 2.5312321186065674 + }, + { + "auxiliary_loss_clip": 0.06470488, + "auxiliary_loss_mlp": 0.01271752, + "balance_loss_clip": 0.06286064, + "balance_loss_mlp": 0.01255051, + "epoch": 0.34131970539606193, + "flos": 21878427525120.0, + "grad_norm": 2.203551534393157, + "language_loss": 0.8601976, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.93761992, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.16699219, + "step": 5677, + "time_per_iteration": 2.555037260055542 + }, + { + "auxiliary_loss_clip": 0.06463757, + "auxiliary_loss_mlp": 0.01274207, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3413798286487299, + "flos": 24942560757120.0, + "grad_norm": 2.786164717546535, + "language_loss": 0.80252033, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87989998, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16955566, + "step": 5678, + "time_per_iteration": 2.6321489810943604 + }, + { + "auxiliary_loss_clip": 0.06467854, + "auxiliary_loss_mlp": 0.01270663, + "balance_loss_clip": 0.06282793, + "balance_loss_mlp": 0.01255404, + "epoch": 0.34143995190139786, + "flos": 25381420116480.0, + "grad_norm": 1.8772848902338297, + "language_loss": 0.75927806, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.83666325, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15246582, + "step": 5679, + "time_per_iteration": 2.5981781482696533 + }, + { + "auxiliary_loss_clip": 0.06365222, + "auxiliary_loss_mlp": 0.01260685, + "balance_loss_clip": 0.06278291, + "balance_loss_mlp": 0.01253538, + "epoch": 0.34150007515406583, + "flos": 67804785763200.0, + "grad_norm": 0.7019635675964923, + "language_loss": 0.59521842, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.67147756, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.0713501, + "step": 5680, + "time_per_iteration": 3.2768852710723877 + }, + { + "auxiliary_loss_clip": 0.06464119, + "auxiliary_loss_mlp": 0.01271493, + "balance_loss_clip": 0.06282759, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3415601984067338, + "flos": 20308548954240.0, + "grad_norm": 1.756785442101194, + "language_loss": 0.72804415, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80540025, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15881348, + "step": 5681, + "time_per_iteration": 2.540839195251465 + }, + { + "auxiliary_loss_clip": 0.06462204, + "auxiliary_loss_mlp": 0.01272244, + "balance_loss_clip": 0.06283034, + "balance_loss_mlp": 0.01256806, + "epoch": 0.34162032165940176, + "flos": 26038346526720.0, + "grad_norm": 5.204332383129175, + "language_loss": 0.71220171, + "learning_rate": 3.064923764577233e-06, + "loss": 0.78954625, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15454102, + "step": 5682, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.06466864, + "auxiliary_loss_mlp": 0.0127503, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01258711, + "epoch": 0.3416804449120697, + "flos": 28810843223040.0, + "grad_norm": 1.4703350638010875, + "language_loss": 0.83879244, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.91621137, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.16320801, + "step": 5683, + "time_per_iteration": 2.595921277999878 + }, + { + "auxiliary_loss_clip": 0.06468399, + "auxiliary_loss_mlp": 0.01274924, + "balance_loss_clip": 0.06284815, + "balance_loss_mlp": 0.01258354, + "epoch": 0.3417405681647377, + "flos": 22608210660480.0, + "grad_norm": 1.8188343464074745, + "language_loss": 0.71334541, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.79077864, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 5684, + "time_per_iteration": 2.5821194648742676 + }, + { + "auxiliary_loss_clip": 0.06462076, + "auxiliary_loss_mlp": 0.01268234, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01253816, + "epoch": 0.34180069141740566, + "flos": 24722942405760.0, + "grad_norm": 1.4943065575919134, + "language_loss": 0.75352108, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.8308242, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.144104, + "step": 5685, + "time_per_iteration": 2.545419216156006 + }, + { + "auxiliary_loss_clip": 0.06457227, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06281762, + "balance_loss_mlp": 0.0125501, + "epoch": 0.3418608146700737, + "flos": 30526644879360.0, + "grad_norm": 1.8907916568784255, + "language_loss": 0.70833004, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7856074, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.1550293, + "step": 5686, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.06467415, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06284459, + "balance_loss_mlp": 0.01253407, + "epoch": 0.34192093792274164, + "flos": 15127755333120.0, + "grad_norm": 2.1973050683231303, + "language_loss": 0.77864039, + "learning_rate": 3.06327495310661e-06, + "loss": 0.85600907, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.16052246, + "step": 5687, + "time_per_iteration": 2.501957654953003 + }, + { + "auxiliary_loss_clip": 0.06462508, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01257435, + "epoch": 0.3419810611754096, + "flos": 13192754595840.0, + "grad_norm": 1.8198375176693335, + "language_loss": 0.87159389, + "learning_rate": 3.062945069803981e-06, + "loss": 0.94895893, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.16552734, + "step": 5688, + "time_per_iteration": 2.514558792114258 + }, + { + "auxiliary_loss_clip": 0.06470017, + "auxiliary_loss_mlp": 0.01272882, + "balance_loss_clip": 0.06283651, + "balance_loss_mlp": 0.01255025, + "epoch": 0.34204118442807757, + "flos": 19542274565760.0, + "grad_norm": 1.9150705307332732, + "language_loss": 0.80177575, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87920475, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.17858887, + "step": 5689, + "time_per_iteration": 2.4941842555999756 + }, + { + "auxiliary_loss_clip": 0.06471369, + "auxiliary_loss_mlp": 0.01270545, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01254476, + "epoch": 0.34210130768074554, + "flos": 15200192787840.0, + "grad_norm": 1.8413075326603192, + "language_loss": 0.74004579, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81746483, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.1607666, + "step": 5690, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.06470567, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06288044, + "balance_loss_mlp": 0.01254854, + "epoch": 0.3421614309334135, + "flos": 24943147735680.0, + "grad_norm": 2.8439157619722666, + "language_loss": 0.76563686, + "learning_rate": 3.061955178104237e-06, + "loss": 0.84305, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15893555, + "step": 5691, + "time_per_iteration": 2.5346477031707764 + }, + { + "auxiliary_loss_clip": 0.06465675, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06286939, + "balance_loss_mlp": 0.01254395, + "epoch": 0.34222155418608147, + "flos": 21915170340480.0, + "grad_norm": 1.7269103068173344, + "language_loss": 0.6888957, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1472168, + "step": 5692, + "time_per_iteration": 2.544475793838501 + }, + { + "auxiliary_loss_clip": 0.06469652, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06286649, + "balance_loss_mlp": 0.01259069, + "epoch": 0.34228167743874943, + "flos": 18119954234880.0, + "grad_norm": 2.5543870280075494, + "language_loss": 0.72691154, + "learning_rate": 3.06129504893632e-06, + "loss": 0.80436993, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.17126465, + "step": 5693, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.06469734, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06291726, + "balance_loss_mlp": 0.01253049, + "epoch": 0.3423418006914174, + "flos": 21295070599680.0, + "grad_norm": 1.6526919771326485, + "language_loss": 0.76433146, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.84170949, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15008545, + "step": 5694, + "time_per_iteration": 2.5759999752044678 + }, + { + "auxiliary_loss_clip": 0.06469683, + "auxiliary_loss_mlp": 0.01269733, + "balance_loss_clip": 0.06292015, + "balance_loss_mlp": 0.01254498, + "epoch": 0.34240192394408536, + "flos": 19828754075520.0, + "grad_norm": 1.7073290043069882, + "language_loss": 0.80359411, + "learning_rate": 3.060634758790747e-06, + "loss": 0.88098824, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15222168, + "step": 5695, + "time_per_iteration": 2.53019118309021 + }, + { + "auxiliary_loss_clip": 0.06473886, + "auxiliary_loss_mlp": 0.01274215, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01257335, + "epoch": 0.3424620471967533, + "flos": 24542498638080.0, + "grad_norm": 2.150928833794339, + "language_loss": 0.74189723, + "learning_rate": 3.060304553382635e-06, + "loss": 0.81937826, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16882324, + "step": 5696, + "time_per_iteration": 2.6046504974365234 + }, + { + "auxiliary_loss_clip": 0.06472932, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.062935, + "balance_loss_mlp": 0.0125786, + "epoch": 0.3425221704494213, + "flos": 25856057969280.0, + "grad_norm": 1.9268953245740004, + "language_loss": 0.71419311, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.79166162, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.16052246, + "step": 5697, + "time_per_iteration": 2.565295696258545 + }, + { + "auxiliary_loss_clip": 0.06469944, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06292768, + "balance_loss_mlp": 0.01254293, + "epoch": 0.34258229370208926, + "flos": 21546442448640.0, + "grad_norm": 1.77565898086167, + "language_loss": 0.82456839, + "learning_rate": 3.05964402195837e-06, + "loss": 0.90196872, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15795898, + "step": 5698, + "time_per_iteration": 2.636547327041626 + }, + { + "auxiliary_loss_clip": 0.06476933, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06293021, + "balance_loss_mlp": 0.01260573, + "epoch": 0.3426424169547573, + "flos": 23658407009280.0, + "grad_norm": 1.9460205950694964, + "language_loss": 0.69722092, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.77476966, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.17358398, + "step": 5699, + "time_per_iteration": 2.523766040802002 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.0127405, + "balance_loss_clip": 0.06289239, + "balance_loss_mlp": 0.01257719, + "epoch": 0.34270254020742524, + "flos": 24651846616320.0, + "grad_norm": 2.105384484263751, + "language_loss": 0.72511256, + "learning_rate": 3.058983329806877e-06, + "loss": 0.80255234, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 5700, + "time_per_iteration": 2.57511568069458 + }, + { + "auxiliary_loss_clip": 0.06467311, + "auxiliary_loss_mlp": 0.01271093, + "balance_loss_clip": 0.06288276, + "balance_loss_mlp": 0.01254273, + "epoch": 0.3427626634600932, + "flos": 21003182501760.0, + "grad_norm": 2.114283139984186, + "language_loss": 0.82378924, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.90117323, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.16821289, + "step": 5701, + "time_per_iteration": 2.496392250061035 + }, + { + "auxiliary_loss_clip": 0.06469429, + "auxiliary_loss_mlp": 0.0127326, + "balance_loss_clip": 0.06287375, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3428227867127612, + "flos": 21440155144320.0, + "grad_norm": 1.6330699344557849, + "language_loss": 0.71898985, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.79641676, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16821289, + "step": 5702, + "time_per_iteration": 2.566856861114502 + }, + { + "auxiliary_loss_clip": 0.06377172, + "auxiliary_loss_mlp": 0.01259818, + "balance_loss_clip": 0.06290582, + "balance_loss_mlp": 0.01252552, + "epoch": 0.34288290996542914, + "flos": 55750219902720.0, + "grad_norm": 0.7671857510805999, + "language_loss": 0.56708395, + "learning_rate": 3.057991990435309e-06, + "loss": 0.64345384, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.07244873, + "step": 5703, + "time_per_iteration": 4.447732925415039 + }, + { + "auxiliary_loss_clip": 0.06465772, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06283242, + "balance_loss_mlp": 0.01255207, + "epoch": 0.3429430332180971, + "flos": 20162961285120.0, + "grad_norm": 1.88810633796735, + "language_loss": 0.74954486, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82692933, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.17468262, + "step": 5704, + "time_per_iteration": 4.062070608139038 + }, + { + "auxiliary_loss_clip": 0.06463447, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259716, + "epoch": 0.34300315647076507, + "flos": 17971347818880.0, + "grad_norm": 2.0890845856962565, + "language_loss": 0.73438597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.81177545, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15795898, + "step": 5705, + "time_per_iteration": 2.5125277042388916 + }, + { + "auxiliary_loss_clip": 0.06466857, + "auxiliary_loss_mlp": 0.01271633, + "balance_loss_clip": 0.0628458, + "balance_loss_mlp": 0.01255194, + "epoch": 0.34306327972343303, + "flos": 22092679215360.0, + "grad_norm": 2.3658652894382075, + "language_loss": 0.80144984, + "learning_rate": 3.057000289991289e-06, + "loss": 0.87883472, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16430664, + "step": 5706, + "time_per_iteration": 2.524531364440918 + }, + { + "auxiliary_loss_clip": 0.06468605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06282079, + "balance_loss_mlp": 0.0125493, + "epoch": 0.343123402976101, + "flos": 18448669002240.0, + "grad_norm": 1.9272208577124825, + "language_loss": 0.83210528, + "learning_rate": 3.056669642996787e-06, + "loss": 0.90951264, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17199707, + "step": 5707, + "time_per_iteration": 4.017935514450073 + }, + { + "auxiliary_loss_clip": 0.06464301, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06283538, + "balance_loss_mlp": 0.01259544, + "epoch": 0.34318352622876896, + "flos": 17169127228800.0, + "grad_norm": 1.5274992455100316, + "language_loss": 0.74774885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16442871, + "step": 5708, + "time_per_iteration": 2.6189568042755127 + }, + { + "auxiliary_loss_clip": 0.06460952, + "auxiliary_loss_mlp": 0.01273078, + "balance_loss_clip": 0.06282704, + "balance_loss_mlp": 0.01256365, + "epoch": 0.34324364948143693, + "flos": 26695482572160.0, + "grad_norm": 1.5717787719434457, + "language_loss": 0.80904007, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88638043, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.16711426, + "step": 5709, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06471742, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 0.06286193, + "balance_loss_mlp": 0.01260685, + "epoch": 0.3433037727341049, + "flos": 21257950440960.0, + "grad_norm": 2.571520261591023, + "language_loss": 0.79460347, + "learning_rate": 3.055677461649329e-06, + "loss": 0.87212193, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.1940918, + "step": 5710, + "time_per_iteration": 2.5515291690826416 + }, + { + "auxiliary_loss_clip": 0.06468266, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 0.06282788, + "balance_loss_mlp": 0.01254334, + "epoch": 0.34336389598677286, + "flos": 20635377004800.0, + "grad_norm": 1.916674758610419, + "language_loss": 0.70532334, + "learning_rate": 3.055346654453996e-06, + "loss": 0.78272408, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.17468262, + "step": 5711, + "time_per_iteration": 3.958890914916992 + }, + { + "auxiliary_loss_clip": 0.06467056, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 0.0628437, + "balance_loss_mlp": 0.01256909, + "epoch": 0.3434240192394409, + "flos": 14543895283200.0, + "grad_norm": 2.810027228242578, + "language_loss": 0.67786914, + "learning_rate": 3.055015807239812e-06, + "loss": 0.75527865, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16992188, + "step": 5712, + "time_per_iteration": 2.4752726554870605 + }, + { + "auxiliary_loss_clip": 0.06366295, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01254685, + "epoch": 0.34348414249210885, + "flos": 58067799183360.0, + "grad_norm": 0.8383081559544242, + "language_loss": 0.58214718, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.65843868, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.08172607, + "step": 5713, + "time_per_iteration": 3.11580491065979 + }, + { + "auxiliary_loss_clip": 0.06465655, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06281169, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3435442657447768, + "flos": 20710749352320.0, + "grad_norm": 1.8141637433077298, + "language_loss": 0.81045675, + "learning_rate": 3.054353992805076e-06, + "loss": 0.88785917, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.17272949, + "step": 5714, + "time_per_iteration": 2.510929822921753 + }, + { + "auxiliary_loss_clip": 0.0646632, + "auxiliary_loss_mlp": 0.01276019, + "balance_loss_clip": 0.06283875, + "balance_loss_mlp": 0.01260045, + "epoch": 0.3436043889974448, + "flos": 22936967354880.0, + "grad_norm": 2.602776673257047, + "language_loss": 0.72001171, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79743505, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15991211, + "step": 5715, + "time_per_iteration": 2.5680224895477295 + }, + { + "auxiliary_loss_clip": 0.06365244, + "auxiliary_loss_mlp": 0.01259148, + "balance_loss_clip": 0.06280053, + "balance_loss_mlp": 0.01251191, + "epoch": 0.34366451225011274, + "flos": 58423514964480.0, + "grad_norm": 0.8879413605742031, + "language_loss": 0.65628481, + "learning_rate": 3.053692018445505e-06, + "loss": 0.73252875, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.07952881, + "step": 5716, + "time_per_iteration": 3.184952735900879 + }, + { + "auxiliary_loss_clip": 0.06463662, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.0628469, + "balance_loss_mlp": 0.01264509, + "epoch": 0.3437246355027807, + "flos": 15601722353280.0, + "grad_norm": 1.9800950186090778, + "language_loss": 0.74289393, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15252686, + "step": 5717, + "time_per_iteration": 2.5220494270324707 + }, + { + "auxiliary_loss_clip": 0.06466433, + "auxiliary_loss_mlp": 0.01278824, + "balance_loss_clip": 0.0628383, + "balance_loss_mlp": 0.01262946, + "epoch": 0.34378475875544867, + "flos": 27679572449280.0, + "grad_norm": 1.8348085520910409, + "language_loss": 0.75694019, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.83439279, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15869141, + "step": 5718, + "time_per_iteration": 2.5983147621154785 + }, + { + "auxiliary_loss_clip": 0.06468937, + "auxiliary_loss_mlp": 0.01273829, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01256829, + "epoch": 0.34384488200811664, + "flos": 31439638967040.0, + "grad_norm": 1.8816683210791167, + "language_loss": 0.6437763, + "learning_rate": 3.052698757266734e-06, + "loss": 0.72120392, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.17004395, + "step": 5719, + "time_per_iteration": 2.7075517177581787 + }, + { + "auxiliary_loss_clip": 0.06472047, + "auxiliary_loss_mlp": 0.0127673, + "balance_loss_clip": 0.06285335, + "balance_loss_mlp": 0.012596, + "epoch": 0.3439050052607846, + "flos": 24906866117760.0, + "grad_norm": 1.6709560385881974, + "language_loss": 0.73730874, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.81479651, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.17150879, + "step": 5720, + "time_per_iteration": 2.5936295986175537 + }, + { + "auxiliary_loss_clip": 0.06469208, + "auxiliary_loss_mlp": 0.01280833, + "balance_loss_clip": 0.06286804, + "balance_loss_mlp": 0.01264072, + "epoch": 0.34396512851345257, + "flos": 18155900436480.0, + "grad_norm": 1.8909667336437188, + "language_loss": 0.74550021, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82300061, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16748047, + "step": 5721, + "time_per_iteration": 2.5109763145446777 + }, + { + "auxiliary_loss_clip": 0.06468637, + "auxiliary_loss_mlp": 0.01276688, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.01260208, + "epoch": 0.34402525176612053, + "flos": 16039994734080.0, + "grad_norm": 3.7669546448597497, + "language_loss": 0.80102623, + "learning_rate": 3.051705136821992e-06, + "loss": 0.87847948, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 5722, + "time_per_iteration": 2.5231471061706543 + }, + { + "auxiliary_loss_clip": 0.06467631, + "auxiliary_loss_mlp": 0.01281232, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01265806, + "epoch": 0.3440853750187885, + "flos": 21185009861760.0, + "grad_norm": 1.9591310013999468, + "language_loss": 0.82034022, + "learning_rate": 3.051373850228801e-06, + "loss": 0.89782888, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.1541748, + "step": 5723, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.06471531, + "auxiliary_loss_mlp": 0.01281521, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.0126588, + "epoch": 0.34414549827145646, + "flos": 12682883301120.0, + "grad_norm": 1.867182825140108, + "language_loss": 0.8172524, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8947829, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15661621, + "step": 5724, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.06476942, + "auxiliary_loss_mlp": 0.01281282, + "balance_loss_clip": 0.06292838, + "balance_loss_mlp": 0.01265237, + "epoch": 0.3442056215241244, + "flos": 31292458070400.0, + "grad_norm": 1.852126712281853, + "language_loss": 0.69186389, + "learning_rate": 3.05071115745038e-06, + "loss": 0.76944625, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.16040039, + "step": 5725, + "time_per_iteration": 2.6253697872161865 + }, + { + "auxiliary_loss_clip": 0.06482734, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06293113, + "balance_loss_mlp": 0.01266462, + "epoch": 0.34426574477679245, + "flos": 23373939997440.0, + "grad_norm": 1.5373453518160676, + "language_loss": 0.69532049, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.77299035, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.17785645, + "step": 5726, + "time_per_iteration": 2.5495173931121826 + }, + { + "auxiliary_loss_clip": 0.06477433, + "auxiliary_loss_mlp": 0.01281684, + "balance_loss_clip": 0.06292193, + "balance_loss_mlp": 0.01265948, + "epoch": 0.3443258680294604, + "flos": 24542372856960.0, + "grad_norm": 3.3735616171284453, + "language_loss": 0.73631704, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81390822, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15734863, + "step": 5727, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.06474276, + "auxiliary_loss_mlp": 0.01274594, + "balance_loss_clip": 0.06292102, + "balance_loss_mlp": 0.01259049, + "epoch": 0.3443859912821284, + "flos": 20236363061760.0, + "grad_norm": 1.756953821036591, + "language_loss": 0.88303459, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.96052337, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15551758, + "step": 5728, + "time_per_iteration": 2.5943620204925537 + }, + { + "auxiliary_loss_clip": 0.06472028, + "auxiliary_loss_mlp": 0.01275786, + "balance_loss_clip": 0.06289984, + "balance_loss_mlp": 0.01259382, + "epoch": 0.34444611453479634, + "flos": 24323425338240.0, + "grad_norm": 1.9801243778486481, + "language_loss": 0.70532095, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.78279907, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.1640625, + "step": 5729, + "time_per_iteration": 2.5122504234313965 + }, + { + "auxiliary_loss_clip": 0.06472413, + "auxiliary_loss_mlp": 0.01278834, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01263123, + "epoch": 0.3445062377874643, + "flos": 16989186585600.0, + "grad_norm": 2.065738946159642, + "language_loss": 0.74902749, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.82653993, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15710449, + "step": 5730, + "time_per_iteration": 2.4971024990081787 + }, + { + "auxiliary_loss_clip": 0.06477457, + "auxiliary_loss_mlp": 0.01272788, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.01256921, + "epoch": 0.3445663610401323, + "flos": 20308884370560.0, + "grad_norm": 2.25692333978076, + "language_loss": 0.79881716, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87631959, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.15869141, + "step": 5731, + "time_per_iteration": 2.5055606365203857 + }, + { + "auxiliary_loss_clip": 0.0647382, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06289574, + "balance_loss_mlp": 0.01256532, + "epoch": 0.34462648429280024, + "flos": 15893568524160.0, + "grad_norm": 2.0529883798711586, + "language_loss": 0.78536034, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.86281943, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15545654, + "step": 5732, + "time_per_iteration": 2.58428692817688 + }, + { + "auxiliary_loss_clip": 0.06393671, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06309536, + "balance_loss_mlp": 0.01263571, + "epoch": 0.3446866075454682, + "flos": 59330681193600.0, + "grad_norm": 0.7296400398421587, + "language_loss": 0.53166986, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.60830045, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.05813599, + "step": 5733, + "time_per_iteration": 3.1921679973602295 + }, + { + "auxiliary_loss_clip": 0.06473544, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 0.06292105, + "balance_loss_mlp": 0.01259248, + "epoch": 0.34474673079813617, + "flos": 22349962776960.0, + "grad_norm": 1.6143563972241732, + "language_loss": 0.83787543, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91536903, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16564941, + "step": 5734, + "time_per_iteration": 2.5630810260772705 + }, + { + "auxiliary_loss_clip": 0.06472072, + "auxiliary_loss_mlp": 0.01278915, + "balance_loss_clip": 0.0628967, + "balance_loss_mlp": 0.01262834, + "epoch": 0.34480685405080413, + "flos": 27677098753920.0, + "grad_norm": 1.7144738343554842, + "language_loss": 0.93389094, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.01140082, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.1607666, + "step": 5735, + "time_per_iteration": 2.5621798038482666 + }, + { + "auxiliary_loss_clip": 0.06471383, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01259273, + "epoch": 0.3448669773034721, + "flos": 22462664918400.0, + "grad_norm": 1.7840822264419087, + "language_loss": 0.77095437, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84843659, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.17578125, + "step": 5736, + "time_per_iteration": 2.5377349853515625 + }, + { + "auxiliary_loss_clip": 0.06471781, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01255326, + "epoch": 0.34492710055614006, + "flos": 24943105808640.0, + "grad_norm": 1.6287034776462515, + "language_loss": 0.79113513, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.86855936, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15319824, + "step": 5737, + "time_per_iteration": 2.5471904277801514 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06285917, + "balance_loss_mlp": 0.0125976, + "epoch": 0.34498722380880803, + "flos": 20127057010560.0, + "grad_norm": 2.191814396638409, + "language_loss": 0.72072059, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.79821849, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16723633, + "step": 5738, + "time_per_iteration": 2.540442943572998 + }, + { + "auxiliary_loss_clip": 0.06471272, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01255821, + "epoch": 0.34504734706147605, + "flos": 28445511421440.0, + "grad_norm": 1.9413212194180998, + "language_loss": 0.82238245, + "learning_rate": 3.046067851209389e-06, + "loss": 0.89982325, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16992188, + "step": 5739, + "time_per_iteration": 2.57327938079834 + }, + { + "auxiliary_loss_clip": 0.06469989, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06284826, + "balance_loss_mlp": 0.01261862, + "epoch": 0.345107470314144, + "flos": 22681067385600.0, + "grad_norm": 1.914547064909644, + "language_loss": 0.83564734, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.91313767, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.171875, + "step": 5740, + "time_per_iteration": 2.5514895915985107 + }, + { + "auxiliary_loss_clip": 0.06466584, + "auxiliary_loss_mlp": 0.01275646, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01258921, + "epoch": 0.345167593566812, + "flos": 20636886378240.0, + "grad_norm": 2.1474795597791734, + "language_loss": 0.76802379, + "learning_rate": 3.045403886269181e-06, + "loss": 0.84544611, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16723633, + "step": 5741, + "time_per_iteration": 2.511997699737549 + }, + { + "auxiliary_loss_clip": 0.06466299, + "auxiliary_loss_mlp": 0.0127053, + "balance_loss_clip": 0.06279384, + "balance_loss_mlp": 0.01254544, + "epoch": 0.34522771681947995, + "flos": 26221683260160.0, + "grad_norm": 1.6006732343467382, + "language_loss": 0.77803171, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85540009, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15966797, + "step": 5742, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06281982, + "balance_loss_mlp": 0.01256074, + "epoch": 0.3452878400721479, + "flos": 19068349472640.0, + "grad_norm": 2.2544306863162538, + "language_loss": 0.76459014, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.84196126, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16821289, + "step": 5743, + "time_per_iteration": 3.996267557144165 + }, + { + "auxiliary_loss_clip": 0.06462429, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06281956, + "balance_loss_mlp": 0.01255118, + "epoch": 0.3453479633248159, + "flos": 27937442989440.0, + "grad_norm": 1.578255214465821, + "language_loss": 0.7080915, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.78541422, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14727783, + "step": 5744, + "time_per_iteration": 2.5594234466552734 + }, + { + "auxiliary_loss_clip": 0.06462625, + "auxiliary_loss_mlp": 0.01272389, + "balance_loss_clip": 0.0628416, + "balance_loss_mlp": 0.01256523, + "epoch": 0.34540808657748384, + "flos": 19611609419520.0, + "grad_norm": 1.8945383960499247, + "language_loss": 0.79877782, + "learning_rate": 3.044075480787665e-06, + "loss": 0.87612802, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15881348, + "step": 5745, + "time_per_iteration": 2.5577902793884277 + }, + { + "auxiliary_loss_clip": 0.0646376, + "auxiliary_loss_mlp": 0.0127446, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01258343, + "epoch": 0.3454682098301518, + "flos": 20417771151360.0, + "grad_norm": 2.2215207406176063, + "language_loss": 0.90027881, + "learning_rate": 3.043743280407182e-06, + "loss": 0.97766101, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16113281, + "step": 5746, + "time_per_iteration": 4.126953840255737 + }, + { + "auxiliary_loss_clip": 0.06469168, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01254648, + "epoch": 0.34552833308281977, + "flos": 21331603779840.0, + "grad_norm": 1.8420175913064167, + "language_loss": 0.65233189, + "learning_rate": 3.043411040447849e-06, + "loss": 0.72973943, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16931152, + "step": 5747, + "time_per_iteration": 2.6445960998535156 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01274425, + "balance_loss_clip": 0.06279166, + "balance_loss_mlp": 0.01259166, + "epoch": 0.34558845633548774, + "flos": 36251914331520.0, + "grad_norm": 1.6152983170909512, + "language_loss": 0.72912234, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15246582, + "step": 5748, + "time_per_iteration": 2.668628692626953 + }, + { + "auxiliary_loss_clip": 0.0646018, + "auxiliary_loss_mlp": 0.01271906, + "balance_loss_clip": 0.06281725, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3456485795881557, + "flos": 22456292008320.0, + "grad_norm": 2.139365243179929, + "language_loss": 0.75935584, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83667672, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.14648438, + "step": 5749, + "time_per_iteration": 2.533357620239258 + }, + { + "auxiliary_loss_clip": 0.06372777, + "auxiliary_loss_mlp": 0.01259534, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.0125392, + "epoch": 0.34570870284082367, + "flos": 62023277422080.0, + "grad_norm": 0.8741398929973155, + "language_loss": 0.62861037, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.70493352, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.05612183, + "step": 5750, + "time_per_iteration": 4.42021369934082 + }, + { + "auxiliary_loss_clip": 0.06455849, + "auxiliary_loss_mlp": 0.0126761, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01253383, + "epoch": 0.34576882609349163, + "flos": 22788528647040.0, + "grad_norm": 2.5604939014714043, + "language_loss": 0.80745482, + "learning_rate": 3.042081685074012e-06, + "loss": 0.88468945, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14227295, + "step": 5751, + "time_per_iteration": 2.610229730606079 + }, + { + "auxiliary_loss_clip": 0.06461278, + "auxiliary_loss_mlp": 0.01273124, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.01258199, + "epoch": 0.34582894934615965, + "flos": 12353665409280.0, + "grad_norm": 2.333174149642167, + "language_loss": 0.85112172, + "learning_rate": 3.041749247409439e-06, + "loss": 0.92846578, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14904785, + "step": 5752, + "time_per_iteration": 2.49895977973938 + }, + { + "auxiliary_loss_clip": 0.06379203, + "auxiliary_loss_mlp": 0.01260282, + "balance_loss_clip": 0.06296635, + "balance_loss_mlp": 0.01254092, + "epoch": 0.3458890725988276, + "flos": 70186459017600.0, + "grad_norm": 0.7233537791569425, + "language_loss": 0.63163221, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.70802706, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06185913, + "step": 5753, + "time_per_iteration": 3.0605263710021973 + }, + { + "auxiliary_loss_clip": 0.06463367, + "auxiliary_loss_mlp": 0.01274407, + "balance_loss_clip": 0.06282756, + "balance_loss_mlp": 0.01258498, + "epoch": 0.3459491958514956, + "flos": 17098324928640.0, + "grad_norm": 2.0282181813946116, + "language_loss": 0.71483171, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.79220951, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15899658, + "step": 5754, + "time_per_iteration": 2.499213457107544 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01271896, + "balance_loss_clip": 0.06282809, + "balance_loss_mlp": 0.01255898, + "epoch": 0.34600931910416355, + "flos": 16655985624960.0, + "grad_norm": 2.0834630321372534, + "language_loss": 0.7328862, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.81031251, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15979004, + "step": 5755, + "time_per_iteration": 2.540292263031006 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.01272619, + "balance_loss_clip": 0.06280342, + "balance_loss_mlp": 0.01257801, + "epoch": 0.3460694423568315, + "flos": 38555517179520.0, + "grad_norm": 1.432388080922509, + "language_loss": 0.7255426, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80286932, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14813232, + "step": 5756, + "time_per_iteration": 2.679203510284424 + }, + { + "auxiliary_loss_clip": 0.06371044, + "auxiliary_loss_mlp": 0.01257585, + "balance_loss_clip": 0.06288835, + "balance_loss_mlp": 0.01251058, + "epoch": 0.3461295656094995, + "flos": 72103332545280.0, + "grad_norm": 0.6902951700774806, + "language_loss": 0.62318385, + "learning_rate": 3.040086466790207e-06, + "loss": 0.69947016, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.06536865, + "step": 5757, + "time_per_iteration": 3.209688901901245 + }, + { + "auxiliary_loss_clip": 0.06363717, + "auxiliary_loss_mlp": 0.01259824, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01253244, + "epoch": 0.34618968886216744, + "flos": 65477913408000.0, + "grad_norm": 0.8114970964410039, + "language_loss": 0.59130025, + "learning_rate": 3.039753792295362e-06, + "loss": 0.66753566, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06591797, + "step": 5758, + "time_per_iteration": 3.139495372772217 + }, + { + "auxiliary_loss_clip": 0.06467785, + "auxiliary_loss_mlp": 0.01274731, + "balance_loss_clip": 0.06288655, + "balance_loss_mlp": 0.01259747, + "epoch": 0.3462498121148354, + "flos": 23478508293120.0, + "grad_norm": 1.7665020183034759, + "language_loss": 0.72321635, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.80064148, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5759, + "time_per_iteration": 2.575479745864868 + }, + { + "auxiliary_loss_clip": 0.06456805, + "auxiliary_loss_mlp": 0.01274415, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01258632, + "epoch": 0.3463099353675034, + "flos": 24177711888000.0, + "grad_norm": 1.8760422141660649, + "language_loss": 0.83568478, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.91299695, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15771484, + "step": 5760, + "time_per_iteration": 2.5610272884368896 + }, + { + "auxiliary_loss_clip": 0.06358143, + "auxiliary_loss_mlp": 0.01257449, + "balance_loss_clip": 0.06276596, + "balance_loss_mlp": 0.0125125, + "epoch": 0.34637005862017134, + "flos": 63716773893120.0, + "grad_norm": 0.8043642187655193, + "language_loss": 0.56576806, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.64192402, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.06195068, + "step": 5761, + "time_per_iteration": 3.2343695163726807 + }, + { + "auxiliary_loss_clip": 0.06453449, + "auxiliary_loss_mlp": 0.01270941, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01256457, + "epoch": 0.3464301818728393, + "flos": 13149513089280.0, + "grad_norm": 1.936786863895872, + "language_loss": 0.9549523, + "learning_rate": 3.038422700166474e-06, + "loss": 1.03219616, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14477539, + "step": 5762, + "time_per_iteration": 2.496039390563965 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01276759, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01260928, + "epoch": 0.34649030512550727, + "flos": 29322936650880.0, + "grad_norm": 1.870020160295256, + "language_loss": 0.69913763, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77657849, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.15820312, + "step": 5763, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.06466965, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06278971, + "balance_loss_mlp": 0.01253922, + "epoch": 0.34655042837817523, + "flos": 23737385082240.0, + "grad_norm": 1.7922805842181977, + "language_loss": 0.83863467, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.9160139, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17028809, + "step": 5764, + "time_per_iteration": 2.634692668914795 + }, + { + "auxiliary_loss_clip": 0.06459094, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06279744, + "balance_loss_mlp": 0.01259263, + "epoch": 0.34661055163084326, + "flos": 22060716082560.0, + "grad_norm": 2.9007104109569943, + "language_loss": 0.67647815, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.75381392, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15234375, + "step": 5765, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.06460512, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06280708, + "balance_loss_mlp": 0.01262233, + "epoch": 0.3466706748835112, + "flos": 21805738508160.0, + "grad_norm": 3.5961884004183426, + "language_loss": 0.77947313, + "learning_rate": 3.03709097800413e-06, + "loss": 0.85684741, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.14709473, + "step": 5766, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06460432, + "auxiliary_loss_mlp": 0.01274096, + "balance_loss_clip": 0.06278767, + "balance_loss_mlp": 0.01260614, + "epoch": 0.3467307981361792, + "flos": 19467405342720.0, + "grad_norm": 1.5497773141022704, + "language_loss": 0.73886019, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.8162055, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.13500977, + "step": 5767, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.06461183, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06279645, + "balance_loss_mlp": 0.01260107, + "epoch": 0.34679092138884715, + "flos": 24834470590080.0, + "grad_norm": 2.0350854996297696, + "language_loss": 0.78955162, + "learning_rate": 3.036424880912893e-06, + "loss": 0.86692369, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15917969, + "step": 5768, + "time_per_iteration": 2.5747995376586914 + }, + { + "auxiliary_loss_clip": 0.06369781, + "auxiliary_loss_mlp": 0.01257254, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.01251723, + "epoch": 0.3468510446415151, + "flos": 63253791757440.0, + "grad_norm": 0.7431238132649503, + "language_loss": 0.57319033, + "learning_rate": 3.036091773408956e-06, + "loss": 0.64946061, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.05535889, + "step": 5769, + "time_per_iteration": 3.176074981689453 + }, + { + "auxiliary_loss_clip": 0.06479758, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 0.06285711, + "balance_loss_mlp": 0.01260212, + "epoch": 0.3469111678941831, + "flos": 12123984568320.0, + "grad_norm": 2.4016361546378158, + "language_loss": 0.85419703, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.93176699, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5770, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.06372644, + "auxiliary_loss_mlp": 0.01258777, + "balance_loss_clip": 0.0629043, + "balance_loss_mlp": 0.01253087, + "epoch": 0.34697129114685105, + "flos": 65951964282240.0, + "grad_norm": 0.7493725348793998, + "language_loss": 0.59862447, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.67493868, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.05685425, + "step": 5771, + "time_per_iteration": 2.938957691192627 + }, + { + "auxiliary_loss_clip": 0.0646434, + "auxiliary_loss_mlp": 0.012787, + "balance_loss_clip": 0.06282143, + "balance_loss_mlp": 0.01263572, + "epoch": 0.347031414399519, + "flos": 34461914284800.0, + "grad_norm": 1.9396999801577832, + "language_loss": 0.72527683, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80270731, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15136719, + "step": 5772, + "time_per_iteration": 2.6529078483581543 + }, + { + "auxiliary_loss_clip": 0.06462972, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06281382, + "balance_loss_mlp": 0.01256246, + "epoch": 0.347091537652187, + "flos": 26951592176640.0, + "grad_norm": 1.5709710398058576, + "language_loss": 0.76695967, + "learning_rate": 3.034758950632507e-06, + "loss": 0.84431112, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15924072, + "step": 5773, + "time_per_iteration": 2.5785317420959473 + }, + { + "auxiliary_loss_clip": 0.06466497, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.06280655, + "balance_loss_mlp": 0.01255366, + "epoch": 0.34715166090485494, + "flos": 21148602462720.0, + "grad_norm": 2.4326309651076463, + "language_loss": 0.70796078, + "learning_rate": 3.034425646811396e-06, + "loss": 0.78533834, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15893555, + "step": 5774, + "time_per_iteration": 2.5585873126983643 + }, + { + "auxiliary_loss_clip": 0.06458526, + "auxiliary_loss_mlp": 0.01271942, + "balance_loss_clip": 0.06278332, + "balance_loss_mlp": 0.01256707, + "epoch": 0.3472117841575229, + "flos": 23484881203200.0, + "grad_norm": 2.2084812675777474, + "language_loss": 0.76485682, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.84216148, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15228271, + "step": 5775, + "time_per_iteration": 2.5899477005004883 + }, + { + "auxiliary_loss_clip": 0.06472419, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281743, + "balance_loss_mlp": 0.01255163, + "epoch": 0.34727190741019087, + "flos": 17498428974720.0, + "grad_norm": 2.2070819655775282, + "language_loss": 0.7869916, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.86442757, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16009521, + "step": 5776, + "time_per_iteration": 2.5874037742614746 + }, + { + "auxiliary_loss_clip": 0.0636313, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06280468, + "balance_loss_mlp": 0.0125983, + "epoch": 0.34733203066285884, + "flos": 65287350495360.0, + "grad_norm": 0.8333293277096808, + "language_loss": 0.63448966, + "learning_rate": 3.033425500045478e-06, + "loss": 0.710774, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.05477905, + "step": 5777, + "time_per_iteration": 3.168325185775757 + }, + { + "auxiliary_loss_clip": 0.0646584, + "auxiliary_loss_mlp": 0.01270867, + "balance_loss_clip": 0.06279471, + "balance_loss_mlp": 0.01255048, + "epoch": 0.3473921539155268, + "flos": 28666429511040.0, + "grad_norm": 3.258496862714712, + "language_loss": 0.65075529, + "learning_rate": 3.033092039398119e-06, + "loss": 0.72812235, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15808105, + "step": 5778, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.06467149, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 0.06278305, + "balance_loss_mlp": 0.0125633, + "epoch": 0.3474522771681948, + "flos": 40845284104320.0, + "grad_norm": 1.7195764072446118, + "language_loss": 0.722601, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.79998595, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.15008545, + "step": 5779, + "time_per_iteration": 2.6901330947875977 + }, + { + "auxiliary_loss_clip": 0.06474127, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06282836, + "balance_loss_mlp": 0.01259092, + "epoch": 0.3475124004208628, + "flos": 24615564998400.0, + "grad_norm": 2.601451729132101, + "language_loss": 0.62399209, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.70149052, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.1663208, + "step": 5780, + "time_per_iteration": 2.5493476390838623 + }, + { + "auxiliary_loss_clip": 0.0647147, + "auxiliary_loss_mlp": 0.01271785, + "balance_loss_clip": 0.06285025, + "balance_loss_mlp": 0.01256264, + "epoch": 0.34757252367353075, + "flos": 22717977909120.0, + "grad_norm": 3.4183593986527043, + "language_loss": 0.72164977, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.79908228, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.15515137, + "step": 5781, + "time_per_iteration": 2.610198974609375 + }, + { + "auxiliary_loss_clip": 0.06471756, + "auxiliary_loss_mlp": 0.01273476, + "balance_loss_clip": 0.06282213, + "balance_loss_mlp": 0.01257228, + "epoch": 0.3476326469261987, + "flos": 19834246517760.0, + "grad_norm": 2.4264406265191325, + "language_loss": 0.77686667, + "learning_rate": 3.031757805185612e-06, + "loss": 0.85431898, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16235352, + "step": 5782, + "time_per_iteration": 3.918602705001831 + }, + { + "auxiliary_loss_clip": 0.06470296, + "auxiliary_loss_mlp": 0.01277549, + "balance_loss_clip": 0.0628626, + "balance_loss_mlp": 0.01262695, + "epoch": 0.3476927701788667, + "flos": 19944265328640.0, + "grad_norm": 2.639685157679876, + "language_loss": 0.63410383, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7115823, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14855957, + "step": 5783, + "time_per_iteration": 4.021190881729126 + }, + { + "auxiliary_loss_clip": 0.06469369, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 0.06290524, + "balance_loss_mlp": 0.01264121, + "epoch": 0.34775289343153465, + "flos": 20740448424960.0, + "grad_norm": 1.686879732071426, + "language_loss": 0.89054763, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9680202, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 5784, + "time_per_iteration": 2.553847074508667 + }, + { + "auxiliary_loss_clip": 0.06470798, + "auxiliary_loss_mlp": 0.01275566, + "balance_loss_clip": 0.06289466, + "balance_loss_mlp": 0.01260903, + "epoch": 0.3478130166842026, + "flos": 19360992257280.0, + "grad_norm": 1.643062521609265, + "language_loss": 0.82068878, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89815247, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.14672852, + "step": 5785, + "time_per_iteration": 2.5452024936676025 + }, + { + "auxiliary_loss_clip": 0.06472684, + "auxiliary_loss_mlp": 0.01281071, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01267004, + "epoch": 0.3478731399368706, + "flos": 22057194211200.0, + "grad_norm": 1.6654216237849466, + "language_loss": 0.80731958, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.88485718, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.14074707, + "step": 5786, + "time_per_iteration": 4.040801286697388 + }, + { + "auxiliary_loss_clip": 0.06468868, + "auxiliary_loss_mlp": 0.01275893, + "balance_loss_clip": 0.06289011, + "balance_loss_mlp": 0.01260515, + "epoch": 0.34793326318953854, + "flos": 18047390999040.0, + "grad_norm": 1.5833193798509506, + "language_loss": 0.75743961, + "learning_rate": 3.030089132216836e-06, + "loss": 0.83488721, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15368652, + "step": 5787, + "time_per_iteration": 2.5231845378875732 + }, + { + "auxiliary_loss_clip": 0.06470607, + "auxiliary_loss_mlp": 0.01273428, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01259111, + "epoch": 0.3479933864422065, + "flos": 29322349672320.0, + "grad_norm": 1.5447805606313796, + "language_loss": 0.81661141, + "learning_rate": 3.029755280389203e-06, + "loss": 0.89405167, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14312744, + "step": 5788, + "time_per_iteration": 2.5828304290771484 + }, + { + "auxiliary_loss_clip": 0.064804, + "auxiliary_loss_mlp": 0.01277805, + "balance_loss_clip": 0.06290662, + "balance_loss_mlp": 0.01261831, + "epoch": 0.3480535096948745, + "flos": 20126931229440.0, + "grad_norm": 1.9688082680528027, + "language_loss": 0.85984367, + "learning_rate": 3.029421389513147e-06, + "loss": 0.93742573, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.15979004, + "step": 5789, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127695, + "balance_loss_clip": 0.06292568, + "balance_loss_mlp": 0.0126178, + "epoch": 0.34811363294754244, + "flos": 18554453182080.0, + "grad_norm": 1.6869236803506542, + "language_loss": 0.84773821, + "learning_rate": 3.029087459601328e-06, + "loss": 0.92530012, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15185547, + "step": 5790, + "time_per_iteration": 3.942929983139038 + }, + { + "auxiliary_loss_clip": 0.06469919, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.0628828, + "balance_loss_mlp": 0.01259465, + "epoch": 0.3481737562002104, + "flos": 26877603421440.0, + "grad_norm": 1.9257745343225423, + "language_loss": 0.81410027, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.89154327, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14904785, + "step": 5791, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.06478444, + "auxiliary_loss_mlp": 0.01278573, + "balance_loss_clip": 0.0629065, + "balance_loss_mlp": 0.01263356, + "epoch": 0.3482338794528784, + "flos": 28915495372800.0, + "grad_norm": 1.656722788090249, + "language_loss": 0.78119808, + "learning_rate": 3.028419482721056e-06, + "loss": 0.85876822, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.15216064, + "step": 5792, + "time_per_iteration": 2.5784294605255127 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01255989, + "epoch": 0.3482940027055464, + "flos": 22207393854720.0, + "grad_norm": 1.5928062225109956, + "language_loss": 0.82187879, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89930081, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.14575195, + "step": 5793, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.06472721, + "auxiliary_loss_mlp": 0.01275633, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01259438, + "epoch": 0.34835412595821436, + "flos": 20308884370560.0, + "grad_norm": 1.8552979095996294, + "language_loss": 0.7616328, + "learning_rate": 3.027751349849706e-06, + "loss": 0.83911633, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.1618042, + "step": 5794, + "time_per_iteration": 2.548841953277588 + }, + { + "auxiliary_loss_clip": 0.06468202, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 0.06286102, + "balance_loss_mlp": 0.01262271, + "epoch": 0.3484142492108823, + "flos": 20456065267200.0, + "grad_norm": 2.5979910850639336, + "language_loss": 0.57406038, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65151387, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.14868164, + "step": 5795, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.06465806, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 0.06285395, + "balance_loss_mlp": 0.01257469, + "epoch": 0.3484743724635503, + "flos": 24359832737280.0, + "grad_norm": 1.8988060542741243, + "language_loss": 0.83093596, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90830439, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.13580322, + "step": 5796, + "time_per_iteration": 2.5901992321014404 + }, + { + "auxiliary_loss_clip": 0.06459932, + "auxiliary_loss_mlp": 0.01272067, + "balance_loss_clip": 0.06285086, + "balance_loss_mlp": 0.01258692, + "epoch": 0.34853449571621825, + "flos": 24359916591360.0, + "grad_norm": 1.6441838604480552, + "language_loss": 0.83544898, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.91276896, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13378906, + "step": 5797, + "time_per_iteration": 2.5595455169677734 + }, + { + "auxiliary_loss_clip": 0.06466283, + "auxiliary_loss_mlp": 0.01269705, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01256055, + "epoch": 0.3485946189688862, + "flos": 27274395231360.0, + "grad_norm": 1.5517160717894904, + "language_loss": 0.73727238, + "learning_rate": 3.026414616539167e-06, + "loss": 0.81463224, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13647461, + "step": 5798, + "time_per_iteration": 2.716830015182495 + }, + { + "auxiliary_loss_clip": 0.06466942, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.0125618, + "epoch": 0.3486547422215542, + "flos": 20162835504000.0, + "grad_norm": 1.8098383323780278, + "language_loss": 0.76806593, + "learning_rate": 3.026080335875485e-06, + "loss": 0.84544736, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15014648, + "step": 5799, + "time_per_iteration": 2.550356149673462 + }, + { + "auxiliary_loss_clip": 0.06464861, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06284796, + "balance_loss_mlp": 0.01253735, + "epoch": 0.34871486547422215, + "flos": 20236614624000.0, + "grad_norm": 2.6888551620055363, + "language_loss": 0.75880742, + "learning_rate": 3.025746016302734e-06, + "loss": 0.83612871, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 5800, + "time_per_iteration": 2.559406042098999 + }, + { + "auxiliary_loss_clip": 0.06468332, + "auxiliary_loss_mlp": 0.01272895, + "balance_loss_clip": 0.06284243, + "balance_loss_mlp": 0.01258375, + "epoch": 0.3487749887268901, + "flos": 44063096924160.0, + "grad_norm": 1.6752863637060063, + "language_loss": 0.67620414, + "learning_rate": 3.025411657833591e-06, + "loss": 0.75361645, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14538574, + "step": 5801, + "time_per_iteration": 2.7286293506622314 + }, + { + "auxiliary_loss_clip": 0.064619, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01253406, + "epoch": 0.3488351119795581, + "flos": 23301921813120.0, + "grad_norm": 1.7427843167651098, + "language_loss": 0.76900619, + "learning_rate": 3.025077260480735e-06, + "loss": 0.84630978, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15075684, + "step": 5802, + "time_per_iteration": 2.5632455348968506 + }, + { + "auxiliary_loss_clip": 0.0645422, + "auxiliary_loss_mlp": 0.01273067, + "balance_loss_clip": 0.06281535, + "balance_loss_mlp": 0.01260109, + "epoch": 0.34889523523222604, + "flos": 19940449968000.0, + "grad_norm": 1.7168444943641856, + "language_loss": 0.79347479, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87074769, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12957764, + "step": 5803, + "time_per_iteration": 2.5202274322509766 + }, + { + "auxiliary_loss_clip": 0.06462935, + "auxiliary_loss_mlp": 0.01269017, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01255212, + "epoch": 0.348955358484894, + "flos": 30454123570560.0, + "grad_norm": 2.672940484210586, + "language_loss": 0.67680007, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.75411958, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.13812256, + "step": 5804, + "time_per_iteration": 2.636371374130249 + }, + { + "auxiliary_loss_clip": 0.06455779, + "auxiliary_loss_mlp": 0.01267233, + "balance_loss_clip": 0.06282568, + "balance_loss_mlp": 0.01253989, + "epoch": 0.349015481737562, + "flos": 18005071887360.0, + "grad_norm": 1.776416664420285, + "language_loss": 0.76608741, + "learning_rate": 3.024073835246702e-06, + "loss": 0.84331751, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 5805, + "time_per_iteration": 2.4746642112731934 + }, + { + "auxiliary_loss_clip": 0.06461459, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06281143, + "balance_loss_mlp": 0.0125568, + "epoch": 0.34907560499023, + "flos": 27205815064320.0, + "grad_norm": 2.094620432718779, + "language_loss": 0.67626035, + "learning_rate": 3.023739282485814e-06, + "loss": 0.7535736, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14178467, + "step": 5806, + "time_per_iteration": 2.6109619140625 + }, + { + "auxiliary_loss_clip": 0.06461781, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254596, + "epoch": 0.34913572824289796, + "flos": 30234714854400.0, + "grad_norm": 1.7462714312606824, + "language_loss": 0.71972066, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7970227, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1383667, + "step": 5807, + "time_per_iteration": 2.6023621559143066 + }, + { + "auxiliary_loss_clip": 0.06464535, + "auxiliary_loss_mlp": 0.01272433, + "balance_loss_clip": 0.06279333, + "balance_loss_mlp": 0.01257425, + "epoch": 0.3491958514955659, + "flos": 29979779207040.0, + "grad_norm": 2.0002365662223727, + "language_loss": 0.74799109, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15002441, + "step": 5808, + "time_per_iteration": 2.661327362060547 + }, + { + "auxiliary_loss_clip": 0.0645329, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06278954, + "balance_loss_mlp": 0.0125828, + "epoch": 0.3492559747482339, + "flos": 22789786458240.0, + "grad_norm": 1.539446612060682, + "language_loss": 0.84555626, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.92281115, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.13922119, + "step": 5809, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.06454454, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.0125755, + "epoch": 0.34931609800090185, + "flos": 26075257050240.0, + "grad_norm": 1.9706347482771516, + "language_loss": 0.80724359, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.88449275, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.12921143, + "step": 5810, + "time_per_iteration": 2.583709955215454 + }, + { + "auxiliary_loss_clip": 0.06460047, + "auxiliary_loss_mlp": 0.01274437, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3493762212535698, + "flos": 29249744509440.0, + "grad_norm": 1.580057936247994, + "language_loss": 0.75975537, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.83710015, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.1395874, + "step": 5811, + "time_per_iteration": 2.6304807662963867 + }, + { + "auxiliary_loss_clip": 0.06459605, + "auxiliary_loss_mlp": 0.01268711, + "balance_loss_clip": 0.06280548, + "balance_loss_mlp": 0.01254746, + "epoch": 0.3494363445062378, + "flos": 27133461463680.0, + "grad_norm": 1.6291603050336358, + "language_loss": 0.80527401, + "learning_rate": 3.021731151138386e-06, + "loss": 0.88255721, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.1395874, + "step": 5812, + "time_per_iteration": 2.657989025115967 + }, + { + "auxiliary_loss_clip": 0.06462281, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.0628228, + "balance_loss_mlp": 0.01257179, + "epoch": 0.34949646775890575, + "flos": 12281102173440.0, + "grad_norm": 2.0118644405033463, + "language_loss": 0.701132, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7784636, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.137146, + "step": 5813, + "time_per_iteration": 2.47231388092041 + }, + { + "auxiliary_loss_clip": 0.06457584, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06281666, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3495565910115737, + "flos": 17171265507840.0, + "grad_norm": 1.9224367307793844, + "language_loss": 0.76310062, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.8403852, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.13482666, + "step": 5814, + "time_per_iteration": 2.4967095851898193 + }, + { + "auxiliary_loss_clip": 0.06471042, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.06288652, + "balance_loss_mlp": 0.01257076, + "epoch": 0.3496167142642417, + "flos": 26472342349440.0, + "grad_norm": 1.8186936331307002, + "language_loss": 0.85099685, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92842519, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1472168, + "step": 5815, + "time_per_iteration": 2.597399950027466 + }, + { + "auxiliary_loss_clip": 0.06466906, + "auxiliary_loss_mlp": 0.01275707, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01261712, + "epoch": 0.34967683751690964, + "flos": 17419618609920.0, + "grad_norm": 2.3640337842934565, + "language_loss": 0.78006089, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85748702, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 5816, + "time_per_iteration": 2.5164036750793457 + }, + { + "auxiliary_loss_clip": 0.0646984, + "auxiliary_loss_mlp": 0.01273456, + "balance_loss_clip": 0.06286636, + "balance_loss_mlp": 0.01258692, + "epoch": 0.3497369607695776, + "flos": 22606365870720.0, + "grad_norm": 1.8515414586733512, + "language_loss": 0.59787703, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6753099, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.14764404, + "step": 5817, + "time_per_iteration": 2.542877674102783 + }, + { + "auxiliary_loss_clip": 0.06358884, + "auxiliary_loss_mlp": 0.01261904, + "balance_loss_clip": 0.06277611, + "balance_loss_mlp": 0.01257669, + "epoch": 0.34979708402224563, + "flos": 68548461477120.0, + "grad_norm": 0.858700346008579, + "language_loss": 0.59824663, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.67445457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04238892, + "step": 5818, + "time_per_iteration": 3.1992976665496826 + }, + { + "auxiliary_loss_clip": 0.06459703, + "auxiliary_loss_mlp": 0.01271152, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01257109, + "epoch": 0.3498572072749136, + "flos": 18995660455680.0, + "grad_norm": 1.926998914600137, + "language_loss": 0.83806789, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91537642, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14031982, + "step": 5819, + "time_per_iteration": 2.5241613388061523 + }, + { + "auxiliary_loss_clip": 0.06466879, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.0125493, + "epoch": 0.34991733052758156, + "flos": 27826334075520.0, + "grad_norm": 2.092302610514248, + "language_loss": 0.71273863, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.79009914, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14245605, + "step": 5820, + "time_per_iteration": 2.569838762283325 + }, + { + "auxiliary_loss_clip": 0.06470378, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 0.06288413, + "balance_loss_mlp": 0.01256292, + "epoch": 0.3499774537802495, + "flos": 33592706755200.0, + "grad_norm": 2.4345068466865083, + "language_loss": 0.70581877, + "learning_rate": 3.018716339744759e-06, + "loss": 0.78322828, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14294434, + "step": 5821, + "time_per_iteration": 2.6535534858703613 + }, + { + "auxiliary_loss_clip": 0.06479154, + "auxiliary_loss_mlp": 0.0127118, + "balance_loss_clip": 0.06291604, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3500375770329175, + "flos": 23483413756800.0, + "grad_norm": 1.9533795991074365, + "language_loss": 0.74227631, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.81977963, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16650391, + "step": 5822, + "time_per_iteration": 5.406672716140747 + }, + { + "auxiliary_loss_clip": 0.06470097, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 0.06285684, + "balance_loss_mlp": 0.01257588, + "epoch": 0.35009770028558546, + "flos": 19032067854720.0, + "grad_norm": 2.646032233627204, + "language_loss": 0.7905609, + "learning_rate": 3.018045956403094e-06, + "loss": 0.86799276, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15496826, + "step": 5823, + "time_per_iteration": 2.5048515796661377 + }, + { + "auxiliary_loss_clip": 0.06353101, + "auxiliary_loss_mlp": 0.01254576, + "balance_loss_clip": 0.06271273, + "balance_loss_mlp": 0.01249748, + "epoch": 0.3501578235382534, + "flos": 68371749216000.0, + "grad_norm": 0.6915411290730273, + "language_loss": 0.58945203, + "learning_rate": 3.017710706819298e-06, + "loss": 0.66552877, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.04821777, + "step": 5824, + "time_per_iteration": 3.209726333618164 + }, + { + "auxiliary_loss_clip": 0.06465952, + "auxiliary_loss_mlp": 0.01274281, + "balance_loss_clip": 0.06284555, + "balance_loss_mlp": 0.01258045, + "epoch": 0.3502179467909214, + "flos": 21257153827200.0, + "grad_norm": 3.0621504018438164, + "language_loss": 0.85168576, + "learning_rate": 3.017375418643811e-06, + "loss": 0.92908812, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16223145, + "step": 5825, + "time_per_iteration": 2.513498067855835 + }, + { + "auxiliary_loss_clip": 0.06462917, + "auxiliary_loss_mlp": 0.01268842, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01254275, + "epoch": 0.35027807004358935, + "flos": 11946978817920.0, + "grad_norm": 2.498923152973308, + "language_loss": 0.83643848, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.91375613, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14556885, + "step": 5826, + "time_per_iteration": 3.9313511848449707 + }, + { + "auxiliary_loss_clip": 0.06470059, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284411, + "balance_loss_mlp": 0.01254956, + "epoch": 0.3503381932962573, + "flos": 21477401084160.0, + "grad_norm": 2.100708343809493, + "language_loss": 0.81216669, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88958883, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.17211914, + "step": 5827, + "time_per_iteration": 2.556704044342041 + }, + { + "auxiliary_loss_clip": 0.06462219, + "auxiliary_loss_mlp": 0.01272255, + "balance_loss_clip": 0.06283772, + "balance_loss_mlp": 0.01257473, + "epoch": 0.3503983165489253, + "flos": 21257405389440.0, + "grad_norm": 2.0166313071454858, + "language_loss": 0.71145403, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.78879881, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.14794922, + "step": 5828, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.06467165, + "auxiliary_loss_mlp": 0.01274622, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01257539, + "epoch": 0.35045843980159325, + "flos": 27822644496000.0, + "grad_norm": 1.678964319221545, + "language_loss": 0.79897165, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8763895, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.17077637, + "step": 5829, + "time_per_iteration": 4.086450099945068 + }, + { + "auxiliary_loss_clip": 0.06475446, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 0.06286699, + "balance_loss_mlp": 0.01257988, + "epoch": 0.3505185630542612, + "flos": 25928201934720.0, + "grad_norm": 1.7428196933402165, + "language_loss": 0.72440839, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.80191517, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.17248535, + "step": 5830, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.06461293, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06283247, + "balance_loss_mlp": 0.01259633, + "epoch": 0.35057868630692923, + "flos": 20527999597440.0, + "grad_norm": 2.5118715805025884, + "language_loss": 0.88613749, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.96348894, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14221191, + "step": 5831, + "time_per_iteration": 2.577260732650757 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01258672, + "epoch": 0.3506388095595972, + "flos": 20454849383040.0, + "grad_norm": 2.013142681723478, + "language_loss": 0.78719735, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.86459637, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14868164, + "step": 5832, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.06470136, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06284127, + "balance_loss_mlp": 0.01258536, + "epoch": 0.35069893281226516, + "flos": 23115901749120.0, + "grad_norm": 3.869403317005625, + "language_loss": 0.71628016, + "learning_rate": 3.014691725465008e-06, + "loss": 0.79373109, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.1640625, + "step": 5833, + "time_per_iteration": 2.559213161468506 + }, + { + "auxiliary_loss_clip": 0.06462866, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.01256291, + "epoch": 0.35075905606493313, + "flos": 27279426476160.0, + "grad_norm": 2.081089463640026, + "language_loss": 0.80963689, + "learning_rate": 3.014356090536606e-06, + "loss": 0.88697743, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14892578, + "step": 5834, + "time_per_iteration": 2.6462955474853516 + }, + { + "auxiliary_loss_clip": 0.06469317, + "auxiliary_loss_mlp": 0.0127505, + "balance_loss_clip": 0.06288308, + "balance_loss_mlp": 0.01258634, + "epoch": 0.3508191793176011, + "flos": 19133491622400.0, + "grad_norm": 2.5340357013843566, + "language_loss": 0.84608614, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.92352986, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.1640625, + "step": 5835, + "time_per_iteration": 2.5068061351776123 + }, + { + "auxiliary_loss_clip": 0.06463549, + "auxiliary_loss_mlp": 0.01274357, + "balance_loss_clip": 0.0628426, + "balance_loss_mlp": 0.01259122, + "epoch": 0.35087930257026906, + "flos": 25564798776960.0, + "grad_norm": 1.6798272602016127, + "language_loss": 0.77162683, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.84900588, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15234375, + "step": 5836, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.06462973, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.06285001, + "balance_loss_mlp": 0.01268767, + "epoch": 0.350939425822937, + "flos": 18010061205120.0, + "grad_norm": 1.7914903677000888, + "language_loss": 0.7777887, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.85525942, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15325928, + "step": 5837, + "time_per_iteration": 2.4906866550445557 + }, + { + "auxiliary_loss_clip": 0.06464779, + "auxiliary_loss_mlp": 0.0127724, + "balance_loss_clip": 0.0628402, + "balance_loss_mlp": 0.01261575, + "epoch": 0.350999549075605, + "flos": 22279747455360.0, + "grad_norm": 2.3774474075228995, + "language_loss": 0.68712002, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7645402, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15649414, + "step": 5838, + "time_per_iteration": 2.616330862045288 + }, + { + "auxiliary_loss_clip": 0.06463079, + "auxiliary_loss_mlp": 0.01275242, + "balance_loss_clip": 0.0628327, + "balance_loss_mlp": 0.01259554, + "epoch": 0.35105967232827295, + "flos": 14397511000320.0, + "grad_norm": 2.135026117356547, + "language_loss": 0.83941519, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.91679841, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15686035, + "step": 5839, + "time_per_iteration": 2.475210428237915 + }, + { + "auxiliary_loss_clip": 0.06472797, + "auxiliary_loss_mlp": 0.01274732, + "balance_loss_clip": 0.06285894, + "balance_loss_mlp": 0.01258376, + "epoch": 0.3511197955809409, + "flos": 25089322383360.0, + "grad_norm": 2.313381638226651, + "language_loss": 0.58970249, + "learning_rate": 3.012341473657572e-06, + "loss": 0.6671778, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.5654497146606445 + }, + { + "auxiliary_loss_clip": 0.06465258, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06280696, + "balance_loss_mlp": 0.0126174, + "epoch": 0.3511799188336089, + "flos": 25891123703040.0, + "grad_norm": 2.5798747861510254, + "language_loss": 0.87567091, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.9531014, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.16040039, + "step": 5841, + "time_per_iteration": 2.5275204181671143 + }, + { + "auxiliary_loss_clip": 0.06473795, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.0628502, + "balance_loss_mlp": 0.01261038, + "epoch": 0.35124004208627685, + "flos": 20089852997760.0, + "grad_norm": 1.7442007932185601, + "language_loss": 0.7546367, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.83215564, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.17077637, + "step": 5842, + "time_per_iteration": 2.5876784324645996 + }, + { + "auxiliary_loss_clip": 0.06465417, + "auxiliary_loss_mlp": 0.01280375, + "balance_loss_clip": 0.06280544, + "balance_loss_mlp": 0.01265105, + "epoch": 0.3513001653389448, + "flos": 17788891553280.0, + "grad_norm": 2.704982383226077, + "language_loss": 0.68951106, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.76696897, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15258789, + "step": 5843, + "time_per_iteration": 2.4710304737091064 + }, + { + "auxiliary_loss_clip": 0.06466319, + "auxiliary_loss_mlp": 0.01285229, + "balance_loss_clip": 0.0628369, + "balance_loss_mlp": 0.01268892, + "epoch": 0.3513602885916128, + "flos": 29394745200000.0, + "grad_norm": 2.1140022916881525, + "language_loss": 0.66181982, + "learning_rate": 3.010997627806655e-06, + "loss": 0.7393353, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.16326904, + "step": 5844, + "time_per_iteration": 2.585793972015381 + }, + { + "auxiliary_loss_clip": 0.06472903, + "auxiliary_loss_mlp": 0.01282408, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01265761, + "epoch": 0.3514204118442808, + "flos": 16185372768000.0, + "grad_norm": 2.0590361589883206, + "language_loss": 0.75743866, + "learning_rate": 3.010661570469245e-06, + "loss": 0.83499175, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.1663208, + "step": 5845, + "time_per_iteration": 2.50748348236084 + }, + { + "auxiliary_loss_clip": 0.06463686, + "auxiliary_loss_mlp": 0.01285129, + "balance_loss_clip": 0.06284383, + "balance_loss_mlp": 0.01270102, + "epoch": 0.35148053509694877, + "flos": 23840234369280.0, + "grad_norm": 5.020955850717412, + "language_loss": 0.73988718, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.8173753, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15032959, + "step": 5846, + "time_per_iteration": 2.626898765563965 + }, + { + "auxiliary_loss_clip": 0.06470932, + "auxiliary_loss_mlp": 0.01280544, + "balance_loss_clip": 0.06285631, + "balance_loss_mlp": 0.01265482, + "epoch": 0.35154065834961673, + "flos": 20996809591680.0, + "grad_norm": 1.7410870567887373, + "language_loss": 0.75501883, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.8325336, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1506958, + "step": 5847, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.06472816, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284919, + "balance_loss_mlp": 0.01257316, + "epoch": 0.3516007816022847, + "flos": 33263866206720.0, + "grad_norm": 1.8955744454716683, + "language_loss": 0.72774404, + "learning_rate": 3.009653168561666e-06, + "loss": 0.80519378, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1484375, + "step": 5848, + "time_per_iteration": 2.6645965576171875 + }, + { + "auxiliary_loss_clip": 0.06467354, + "auxiliary_loss_mlp": 0.01280776, + "balance_loss_clip": 0.06280826, + "balance_loss_mlp": 0.01265124, + "epoch": 0.35166090485495266, + "flos": 11731427389440.0, + "grad_norm": 2.1922530808110983, + "language_loss": 0.90064394, + "learning_rate": 3.009316958003178e-06, + "loss": 0.97812521, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15655518, + "step": 5849, + "time_per_iteration": 2.4567575454711914 + }, + { + "auxiliary_loss_clip": 0.06464183, + "auxiliary_loss_mlp": 0.01272929, + "balance_loss_clip": 0.06281896, + "balance_loss_mlp": 0.01257461, + "epoch": 0.3517210281076206, + "flos": 22645121184000.0, + "grad_norm": 2.4964624006606946, + "language_loss": 0.75405449, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.83142555, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15472412, + "step": 5850, + "time_per_iteration": 2.5980029106140137 + }, + { + "auxiliary_loss_clip": 0.06463099, + "auxiliary_loss_mlp": 0.01274678, + "balance_loss_clip": 0.06282984, + "balance_loss_mlp": 0.01259842, + "epoch": 0.3517811513602886, + "flos": 21328836595200.0, + "grad_norm": 2.0250770904548303, + "language_loss": 0.76385641, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.14825439, + "step": 5851, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.06463097, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06281513, + "balance_loss_mlp": 0.01258933, + "epoch": 0.35184127461295656, + "flos": 21039254484480.0, + "grad_norm": 1.95256002439052, + "language_loss": 0.88133335, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.95871449, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.1607666, + "step": 5852, + "time_per_iteration": 2.571439266204834 + }, + { + "auxiliary_loss_clip": 0.06461711, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01258087, + "epoch": 0.3519013978656245, + "flos": 22461784450560.0, + "grad_norm": 2.1690150127965038, + "language_loss": 0.68480182, + "learning_rate": 3.007971733162737e-06, + "loss": 0.76214981, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5853, + "time_per_iteration": 2.5121214389801025 + }, + { + "auxiliary_loss_clip": 0.06466305, + "auxiliary_loss_mlp": 0.0127272, + "balance_loss_clip": 0.06282477, + "balance_loss_mlp": 0.01256972, + "epoch": 0.3519615211182925, + "flos": 13120317141120.0, + "grad_norm": 2.1084516189193403, + "language_loss": 0.81284809, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.89023829, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15734863, + "step": 5854, + "time_per_iteration": 2.644672155380249 + }, + { + "auxiliary_loss_clip": 0.06456967, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01253481, + "epoch": 0.35202164437096045, + "flos": 19141122343680.0, + "grad_norm": 1.5283351736697255, + "language_loss": 0.73366165, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.13970947, + "step": 5855, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.06458069, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01256717, + "epoch": 0.3520817676236284, + "flos": 26549475632640.0, + "grad_norm": 1.8023400431296785, + "language_loss": 0.71055883, + "learning_rate": 3.006962413152691e-06, + "loss": 0.78785008, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14337158, + "step": 5856, + "time_per_iteration": 2.5643463134765625 + }, + { + "auxiliary_loss_clip": 0.064651, + "auxiliary_loss_mlp": 0.01271649, + "balance_loss_clip": 0.062787, + "balance_loss_mlp": 0.01255663, + "epoch": 0.3521418908762964, + "flos": 44903653557120.0, + "grad_norm": 1.9243906825553334, + "language_loss": 0.61456323, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.69193071, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16003418, + "step": 5857, + "time_per_iteration": 2.723026752471924 + }, + { + "auxiliary_loss_clip": 0.06463988, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 0.06281644, + "balance_loss_mlp": 0.01253569, + "epoch": 0.3522020141289644, + "flos": 20192576503680.0, + "grad_norm": 1.9490734994800325, + "language_loss": 0.73682863, + "learning_rate": 3.006289342204152e-06, + "loss": 0.8141619, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15765381, + "step": 5858, + "time_per_iteration": 2.5245583057403564 + }, + { + "auxiliary_loss_clip": 0.0646653, + "auxiliary_loss_mlp": 0.01270245, + "balance_loss_clip": 0.06283493, + "balance_loss_mlp": 0.01255368, + "epoch": 0.35226213738163237, + "flos": 27571398428160.0, + "grad_norm": 1.5191641480211209, + "language_loss": 0.76385832, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.8412261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.14880371, + "step": 5859, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.06474233, + "auxiliary_loss_mlp": 0.01272168, + "balance_loss_clip": 0.06283402, + "balance_loss_mlp": 0.01256862, + "epoch": 0.35232226063430033, + "flos": 22972955483520.0, + "grad_norm": 2.0210321352313305, + "language_loss": 0.72436023, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.80182427, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.15307617, + "step": 5860, + "time_per_iteration": 2.557419776916504 + }, + { + "auxiliary_loss_clip": 0.06468037, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06280215, + "balance_loss_mlp": 0.01257304, + "epoch": 0.3523823838869683, + "flos": 19173714382080.0, + "grad_norm": 2.1675794505809076, + "language_loss": 0.66646308, + "learning_rate": 3.005279449623811e-06, + "loss": 0.74387354, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.15704346, + "step": 5861, + "time_per_iteration": 5.330287218093872 + }, + { + "auxiliary_loss_clip": 0.06464717, + "auxiliary_loss_mlp": 0.01272322, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01257331, + "epoch": 0.35244250713963626, + "flos": 17936743282560.0, + "grad_norm": 1.8073030876467324, + "language_loss": 0.67339319, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.7507636, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.15002441, + "step": 5862, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.06465253, + "auxiliary_loss_mlp": 0.01277428, + "balance_loss_clip": 0.06279148, + "balance_loss_mlp": 0.01260775, + "epoch": 0.35250263039230423, + "flos": 21438687697920.0, + "grad_norm": 2.06594301339393, + "language_loss": 0.76956195, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.8469888, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16650391, + "step": 5863, + "time_per_iteration": 2.5614800453186035 + }, + { + "auxiliary_loss_clip": 0.06466909, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 0.06283094, + "balance_loss_mlp": 0.01255846, + "epoch": 0.3525627536449722, + "flos": 27424133677440.0, + "grad_norm": 1.7204880099735786, + "language_loss": 0.75455201, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83192563, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.14605713, + "step": 5864, + "time_per_iteration": 2.590428113937378 + }, + { + "auxiliary_loss_clip": 0.06465425, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06283714, + "balance_loss_mlp": 0.01257306, + "epoch": 0.35262287689764016, + "flos": 24796637671680.0, + "grad_norm": 2.274548371802061, + "language_loss": 0.79325253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.87062526, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14550781, + "step": 5865, + "time_per_iteration": 4.090251922607422 + }, + { + "auxiliary_loss_clip": 0.06479216, + "auxiliary_loss_mlp": 0.01273849, + "balance_loss_clip": 0.06290671, + "balance_loss_mlp": 0.01257935, + "epoch": 0.3526830001503081, + "flos": 17827353377280.0, + "grad_norm": 3.6346687905375155, + "language_loss": 0.81561065, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.89314139, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15917969, + "step": 5866, + "time_per_iteration": 2.5417611598968506 + }, + { + "auxiliary_loss_clip": 0.06481875, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06287797, + "balance_loss_mlp": 0.01255226, + "epoch": 0.3527431234029761, + "flos": 18084091887360.0, + "grad_norm": 2.1275369997353692, + "language_loss": 0.84947896, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9270227, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17272949, + "step": 5867, + "time_per_iteration": 2.487138509750366 + }, + { + "auxiliary_loss_clip": 0.06472977, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 0.06285943, + "balance_loss_mlp": 0.01254431, + "epoch": 0.35280324665564405, + "flos": 19433429712000.0, + "grad_norm": 2.157782607866355, + "language_loss": 0.74828005, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82571352, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15942383, + "step": 5868, + "time_per_iteration": 2.536522150039673 + }, + { + "auxiliary_loss_clip": 0.06471637, + "auxiliary_loss_mlp": 0.01277122, + "balance_loss_clip": 0.06284134, + "balance_loss_mlp": 0.01260277, + "epoch": 0.352863369908312, + "flos": 21509951195520.0, + "grad_norm": 2.023756469283546, + "language_loss": 0.6153, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.69278765, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16845703, + "step": 5869, + "time_per_iteration": 3.977250099182129 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06282457, + "balance_loss_mlp": 0.01259985, + "epoch": 0.35292349316098, + "flos": 22316029073280.0, + "grad_norm": 3.8155591266042173, + "language_loss": 0.75253737, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.1541748, + "step": 5870, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.06466261, + "auxiliary_loss_mlp": 0.01271259, + "balance_loss_clip": 0.06282211, + "balance_loss_mlp": 0.01255964, + "epoch": 0.352983616413648, + "flos": 33118152756480.0, + "grad_norm": 1.8217533687724534, + "language_loss": 0.72204906, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79942429, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.1529541, + "step": 5871, + "time_per_iteration": 2.660351037979126 + }, + { + "auxiliary_loss_clip": 0.06463222, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.0628562, + "balance_loss_mlp": 0.01257389, + "epoch": 0.35304373966631597, + "flos": 18702388765440.0, + "grad_norm": 1.8432981727531608, + "language_loss": 0.73899144, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.81633162, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.13415527, + "step": 5872, + "time_per_iteration": 2.501868724822998 + }, + { + "auxiliary_loss_clip": 0.06467956, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255163, + "epoch": 0.35310386291898394, + "flos": 23371214739840.0, + "grad_norm": 1.6596154000518588, + "language_loss": 0.83059716, + "learning_rate": 3.001236451924089e-06, + "loss": 0.90797222, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.14404297, + "step": 5873, + "time_per_iteration": 2.6044130325317383 + }, + { + "auxiliary_loss_clip": 0.06475792, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 0.06285458, + "balance_loss_mlp": 0.0125879, + "epoch": 0.3531639861716519, + "flos": 24468803372160.0, + "grad_norm": 2.6977932070351183, + "language_loss": 0.65726781, + "learning_rate": 3.000899288359104e-06, + "loss": 0.73477674, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16308594, + "step": 5874, + "time_per_iteration": 2.558915138244629 + }, + { + "auxiliary_loss_clip": 0.06370112, + "auxiliary_loss_mlp": 0.01273024, + "balance_loss_clip": 0.06287491, + "balance_loss_mlp": 0.01268941, + "epoch": 0.35322410942431987, + "flos": 70331040437760.0, + "grad_norm": 0.7490717453474699, + "language_loss": 0.616135, + "learning_rate": 3.000562086839917e-06, + "loss": 0.69256639, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.04083252, + "step": 5875, + "time_per_iteration": 3.1286721229553223 + }, + { + "auxiliary_loss_clip": 0.06475496, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.06289661, + "balance_loss_mlp": 0.01262086, + "epoch": 0.35328423267698783, + "flos": 19825735328640.0, + "grad_norm": 2.073373185113386, + "language_loss": 0.8042345, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.88176548, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15509033, + "step": 5876, + "time_per_iteration": 2.5174875259399414 + }, + { + "auxiliary_loss_clip": 0.063563, + "auxiliary_loss_mlp": 0.01261292, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.01257364, + "epoch": 0.3533443559296558, + "flos": 60843398480640.0, + "grad_norm": 0.6578323239794136, + "language_loss": 0.56720114, + "learning_rate": 2.999887569990088e-06, + "loss": 0.64337707, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.03924561, + "step": 5877, + "time_per_iteration": 3.239800214767456 + }, + { + "auxiliary_loss_clip": 0.0647119, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06286252, + "balance_loss_mlp": 0.01259301, + "epoch": 0.35340447918232376, + "flos": 24762997457280.0, + "grad_norm": 1.7728898292153, + "language_loss": 0.72425848, + "learning_rate": 2.999550254685024e-06, + "loss": 0.80172646, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16308594, + "step": 5878, + "time_per_iteration": 2.576354742050171 + }, + { + "auxiliary_loss_clip": 0.06470102, + "auxiliary_loss_mlp": 0.01272441, + "balance_loss_clip": 0.06286008, + "balance_loss_mlp": 0.01256789, + "epoch": 0.3534646024349917, + "flos": 21802342417920.0, + "grad_norm": 2.4353464978664494, + "language_loss": 0.78682542, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.86425084, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.15649414, + "step": 5879, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.06481053, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 0.0628894, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3535247256876597, + "flos": 20018463719040.0, + "grad_norm": 2.0590866059314035, + "language_loss": 0.63551295, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.71304053, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.17980957, + "step": 5880, + "time_per_iteration": 2.5576937198638916 + }, + { + "auxiliary_loss_clip": 0.06481048, + "auxiliary_loss_mlp": 0.01274855, + "balance_loss_clip": 0.06292346, + "balance_loss_mlp": 0.01258035, + "epoch": 0.35358484894032766, + "flos": 18193984917120.0, + "grad_norm": 2.6506562916801273, + "language_loss": 0.66346908, + "learning_rate": 2.998538081402727e-06, + "loss": 0.74102807, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16821289, + "step": 5881, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06465093, + "auxiliary_loss_mlp": 0.01272514, + "balance_loss_clip": 0.06285467, + "balance_loss_mlp": 0.0125818, + "epoch": 0.3536449721929956, + "flos": 22826990471040.0, + "grad_norm": 1.7415962616346485, + "language_loss": 0.75838578, + "learning_rate": 2.998200614562239e-06, + "loss": 0.8357619, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14337158, + "step": 5882, + "time_per_iteration": 2.546163558959961 + }, + { + "auxiliary_loss_clip": 0.06472618, + "auxiliary_loss_mlp": 0.01271877, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01256189, + "epoch": 0.3537050954456636, + "flos": 26439540675840.0, + "grad_norm": 2.210270342508568, + "language_loss": 0.70790988, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78535485, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.15692139, + "step": 5883, + "time_per_iteration": 2.5813896656036377 + }, + { + "auxiliary_loss_clip": 0.06481725, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01258105, + "epoch": 0.3537652186983316, + "flos": 17202096610560.0, + "grad_norm": 3.5308447991949348, + "language_loss": 0.7912811, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.86884505, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.16564941, + "step": 5884, + "time_per_iteration": 2.564178228378296 + }, + { + "auxiliary_loss_clip": 0.06469014, + "auxiliary_loss_mlp": 0.01273424, + "balance_loss_clip": 0.06285414, + "balance_loss_mlp": 0.01258142, + "epoch": 0.3538253419509996, + "flos": 19542861544320.0, + "grad_norm": 3.0890260502514173, + "language_loss": 0.76079619, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83822054, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15283203, + "step": 5885, + "time_per_iteration": 2.5860350131988525 + }, + { + "auxiliary_loss_clip": 0.06473316, + "auxiliary_loss_mlp": 0.01274145, + "balance_loss_clip": 0.06285691, + "balance_loss_mlp": 0.01257766, + "epoch": 0.35388546520366754, + "flos": 12133166590080.0, + "grad_norm": 4.983567417880078, + "language_loss": 0.83563066, + "learning_rate": 2.996850368809606e-06, + "loss": 0.91310525, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16357422, + "step": 5886, + "time_per_iteration": 2.549227714538574 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.0125851, + "epoch": 0.3539455884563355, + "flos": 19683501822720.0, + "grad_norm": 3.219387216821374, + "language_loss": 0.78429639, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.86168945, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16088867, + "step": 5887, + "time_per_iteration": 2.523743152618408 + }, + { + "auxiliary_loss_clip": 0.0646676, + "auxiliary_loss_mlp": 0.0127383, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01258631, + "epoch": 0.35400571170900347, + "flos": 18077006217600.0, + "grad_norm": 1.8956957640615841, + "language_loss": 0.66116667, + "learning_rate": 2.996175019078089e-06, + "loss": 0.7385726, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15197754, + "step": 5888, + "time_per_iteration": 2.5279300212860107 + }, + { + "auxiliary_loss_clip": 0.06467725, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01256185, + "epoch": 0.35406583496167143, + "flos": 26075298977280.0, + "grad_norm": 2.3097601077816443, + "language_loss": 0.76721621, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.84461069, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15527344, + "step": 5889, + "time_per_iteration": 2.564761161804199 + }, + { + "auxiliary_loss_clip": 0.06465457, + "auxiliary_loss_mlp": 0.01270164, + "balance_loss_clip": 0.06283142, + "balance_loss_mlp": 0.01254357, + "epoch": 0.3541259582143394, + "flos": 19798635732480.0, + "grad_norm": 2.1640548649274116, + "language_loss": 0.81408846, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.89144462, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15808105, + "step": 5890, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.06466024, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06285816, + "balance_loss_mlp": 0.01256094, + "epoch": 0.35418608146700736, + "flos": 24028518493440.0, + "grad_norm": 1.6495661544524922, + "language_loss": 0.80017459, + "learning_rate": 2.99516171119991e-06, + "loss": 0.87753654, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.14074707, + "step": 5891, + "time_per_iteration": 2.553158760070801 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01282427, + "balance_loss_clip": 0.06289162, + "balance_loss_mlp": 0.01265928, + "epoch": 0.35424620471967533, + "flos": 12390701713920.0, + "grad_norm": 1.7694155250203176, + "language_loss": 0.73450041, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81204116, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16516113, + "step": 5892, + "time_per_iteration": 2.529136896133423 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01274401, + "balance_loss_clip": 0.06286078, + "balance_loss_mlp": 0.01259059, + "epoch": 0.3543063279723433, + "flos": 19678219015680.0, + "grad_norm": 3.019670501918518, + "language_loss": 0.67408991, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.75154132, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15344238, + "step": 5893, + "time_per_iteration": 2.507456064224243 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.01274247, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01257545, + "epoch": 0.35436645122501126, + "flos": 21915841173120.0, + "grad_norm": 1.8801549379271045, + "language_loss": 0.70079887, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.77824062, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16711426, + "step": 5894, + "time_per_iteration": 2.5596466064453125 + }, + { + "auxiliary_loss_clip": 0.0646911, + "auxiliary_loss_mlp": 0.0127714, + "balance_loss_clip": 0.06291118, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3544265744776792, + "flos": 21724915645440.0, + "grad_norm": 1.8040348457355686, + "language_loss": 0.74516678, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.82262927, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14569092, + "step": 5895, + "time_per_iteration": 2.602884531021118 + }, + { + "auxiliary_loss_clip": 0.06476314, + "auxiliary_loss_mlp": 0.01274747, + "balance_loss_clip": 0.06292941, + "balance_loss_mlp": 0.01259643, + "epoch": 0.3544866977303472, + "flos": 21219278981760.0, + "grad_norm": 1.7647167527567422, + "language_loss": 0.83600783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.91351843, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.15100098, + "step": 5896, + "time_per_iteration": 2.5642035007476807 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01278933, + "balance_loss_clip": 0.06292751, + "balance_loss_mlp": 0.01261576, + "epoch": 0.35454682098301515, + "flos": 29318534311680.0, + "grad_norm": 1.8515152904238923, + "language_loss": 0.70294917, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7804631, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.17346191, + "step": 5897, + "time_per_iteration": 2.613032341003418 + }, + { + "auxiliary_loss_clip": 0.06475735, + "auxiliary_loss_mlp": 0.01274261, + "balance_loss_clip": 0.06293957, + "balance_loss_mlp": 0.01259205, + "epoch": 0.3546069442356832, + "flos": 24323509192320.0, + "grad_norm": 1.6960731630978507, + "language_loss": 0.81964374, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89714372, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15063477, + "step": 5898, + "time_per_iteration": 2.6033098697662354 + }, + { + "auxiliary_loss_clip": 0.06471986, + "auxiliary_loss_mlp": 0.01279895, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.01265173, + "epoch": 0.35466706748835114, + "flos": 22863984848640.0, + "grad_norm": 1.4933011631381068, + "language_loss": 0.74405515, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.82157397, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14727783, + "step": 5899, + "time_per_iteration": 2.5492894649505615 + }, + { + "auxiliary_loss_clip": 0.0647797, + "auxiliary_loss_mlp": 0.01272872, + "balance_loss_clip": 0.06294148, + "balance_loss_mlp": 0.01257196, + "epoch": 0.3547271907410191, + "flos": 28337714743680.0, + "grad_norm": 3.4583325446366673, + "language_loss": 0.80211669, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.87962508, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15698242, + "step": 5900, + "time_per_iteration": 5.435121774673462 + }, + { + "auxiliary_loss_clip": 0.06478105, + "auxiliary_loss_mlp": 0.01279951, + "balance_loss_clip": 0.06296446, + "balance_loss_mlp": 0.01263607, + "epoch": 0.35478731399368707, + "flos": 23520911258880.0, + "grad_norm": 2.0942596894242533, + "language_loss": 0.8216058, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89918637, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16357422, + "step": 5901, + "time_per_iteration": 2.603769540786743 + }, + { + "auxiliary_loss_clip": 0.06480999, + "auxiliary_loss_mlp": 0.01277169, + "balance_loss_clip": 0.06295676, + "balance_loss_mlp": 0.01261899, + "epoch": 0.35484743724635504, + "flos": 18630202872960.0, + "grad_norm": 2.2545917554681663, + "language_loss": 0.75979805, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.83737969, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.152771, + "step": 5902, + "time_per_iteration": 2.5356359481811523 + }, + { + "auxiliary_loss_clip": 0.06482422, + "auxiliary_loss_mlp": 0.01280542, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265331, + "epoch": 0.354907560499023, + "flos": 17390296880640.0, + "grad_norm": 1.6908684001073404, + "language_loss": 0.70729327, + "learning_rate": 2.991105086850381e-06, + "loss": 0.78492296, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15209961, + "step": 5903, + "time_per_iteration": 2.52494478225708 + }, + { + "auxiliary_loss_clip": 0.06482972, + "auxiliary_loss_mlp": 0.01276075, + "balance_loss_clip": 0.06297173, + "balance_loss_mlp": 0.0125929, + "epoch": 0.35496768375169097, + "flos": 19214607974400.0, + "grad_norm": 2.9744492269587153, + "language_loss": 0.75001359, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.82760406, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16784668, + "step": 5904, + "time_per_iteration": 2.5316994190216064 + }, + { + "auxiliary_loss_clip": 0.0648163, + "auxiliary_loss_mlp": 0.01277137, + "balance_loss_clip": 0.06297497, + "balance_loss_mlp": 0.01261902, + "epoch": 0.35502780700435893, + "flos": 18338692118400.0, + "grad_norm": 2.2144866791488536, + "language_loss": 0.78981996, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.86740756, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15246582, + "step": 5905, + "time_per_iteration": 3.9867374897003174 + }, + { + "auxiliary_loss_clip": 0.06472038, + "auxiliary_loss_mlp": 0.01276232, + "balance_loss_clip": 0.06301226, + "balance_loss_mlp": 0.01262249, + "epoch": 0.3550879302570269, + "flos": 15453660988800.0, + "grad_norm": 1.8340819850757704, + "language_loss": 0.72531646, + "learning_rate": 2.990090084284356e-06, + "loss": 0.80279917, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13977051, + "step": 5906, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.06491787, + "auxiliary_loss_mlp": 0.01272032, + "balance_loss_clip": 0.06306198, + "balance_loss_mlp": 0.01256046, + "epoch": 0.35514805350969486, + "flos": 21985343735040.0, + "grad_norm": 1.9483914182465616, + "language_loss": 0.75052631, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.82816458, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15991211, + "step": 5907, + "time_per_iteration": 2.526137113571167 + }, + { + "auxiliary_loss_clip": 0.06486456, + "auxiliary_loss_mlp": 0.01280245, + "balance_loss_clip": 0.06305459, + "balance_loss_mlp": 0.01264271, + "epoch": 0.3552081767623628, + "flos": 29869718469120.0, + "grad_norm": 2.2786495725258424, + "language_loss": 0.76563632, + "learning_rate": 2.989413228164047e-06, + "loss": 0.84330332, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15966797, + "step": 5908, + "time_per_iteration": 4.063998222351074 + }, + { + "auxiliary_loss_clip": 0.06491728, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06310974, + "balance_loss_mlp": 0.0126146, + "epoch": 0.3552683000150308, + "flos": 26439456821760.0, + "grad_norm": 2.352503484530038, + "language_loss": 0.68572766, + "learning_rate": 2.989074743819502e-06, + "loss": 0.76341379, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15429688, + "step": 5909, + "time_per_iteration": 2.6902143955230713 + }, + { + "auxiliary_loss_clip": 0.0648414, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06310885, + "balance_loss_mlp": 0.01268061, + "epoch": 0.35532842326769876, + "flos": 19791088865280.0, + "grad_norm": 1.9680680199916993, + "language_loss": 0.79103023, + "learning_rate": 2.988736221969144e-06, + "loss": 0.86869311, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14086914, + "step": 5910, + "time_per_iteration": 2.535050630569458 + }, + { + "auxiliary_loss_clip": 0.06495271, + "auxiliary_loss_mlp": 0.01274944, + "balance_loss_clip": 0.06310071, + "balance_loss_mlp": 0.0125841, + "epoch": 0.3553885465203668, + "flos": 17245170408960.0, + "grad_norm": 1.607302447744311, + "language_loss": 0.7130779, + "learning_rate": 2.98839766262581e-06, + "loss": 0.79078007, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1652832, + "step": 5911, + "time_per_iteration": 2.572942018508911 + }, + { + "auxiliary_loss_clip": 0.06485709, + "auxiliary_loss_mlp": 0.01272785, + "balance_loss_clip": 0.06309631, + "balance_loss_mlp": 0.01258313, + "epoch": 0.35544866977303474, + "flos": 14938800376320.0, + "grad_norm": 2.1423891041027514, + "language_loss": 0.87973344, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.95731837, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14477539, + "step": 5912, + "time_per_iteration": 2.4826059341430664 + }, + { + "auxiliary_loss_clip": 0.0648666, + "auxiliary_loss_mlp": 0.01278679, + "balance_loss_clip": 0.0630875, + "balance_loss_mlp": 0.0126441, + "epoch": 0.3555087930257027, + "flos": 19762228333440.0, + "grad_norm": 2.0928412919366477, + "language_loss": 0.77506435, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.8527177, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14251709, + "step": 5913, + "time_per_iteration": 2.577362060546875 + }, + { + "auxiliary_loss_clip": 0.06486008, + "auxiliary_loss_mlp": 0.01273445, + "balance_loss_clip": 0.06311025, + "balance_loss_mlp": 0.01258789, + "epoch": 0.3555689162783707, + "flos": 21074445999360.0, + "grad_norm": 5.920108951080063, + "language_loss": 0.82525283, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.90284735, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14660645, + "step": 5914, + "time_per_iteration": 2.521756649017334 + }, + { + "auxiliary_loss_clip": 0.06490604, + "auxiliary_loss_mlp": 0.01268632, + "balance_loss_clip": 0.06310836, + "balance_loss_mlp": 0.01254118, + "epoch": 0.35562903953103864, + "flos": 33077426872320.0, + "grad_norm": 3.2692214801304686, + "language_loss": 0.7113682, + "learning_rate": 2.98704305057949e-06, + "loss": 0.78896052, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14508057, + "step": 5915, + "time_per_iteration": 2.6931562423706055 + }, + { + "auxiliary_loss_clip": 0.06477264, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06297429, + "balance_loss_mlp": 0.01254814, + "epoch": 0.3556891627837066, + "flos": 20564029653120.0, + "grad_norm": 4.458093980019367, + "language_loss": 0.76718718, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84465492, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14697266, + "step": 5916, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.06487325, + "auxiliary_loss_mlp": 0.01272059, + "balance_loss_clip": 0.06307879, + "balance_loss_mlp": 0.01256651, + "epoch": 0.35574928603637457, + "flos": 20709449614080.0, + "grad_norm": 1.674174142445476, + "language_loss": 0.88208687, + "learning_rate": 2.986365519932332e-06, + "loss": 0.95968074, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.1539917, + "step": 5917, + "time_per_iteration": 2.6043195724487305 + }, + { + "auxiliary_loss_clip": 0.0649041, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.0631107, + "balance_loss_mlp": 0.01254289, + "epoch": 0.35580940928904253, + "flos": 15199899298560.0, + "grad_norm": 3.6980401889874086, + "language_loss": 0.75538862, + "learning_rate": 2.98602669849771e-06, + "loss": 0.83299077, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15515137, + "step": 5918, + "time_per_iteration": 2.5186190605163574 + }, + { + "auxiliary_loss_clip": 0.06461592, + "auxiliary_loss_mlp": 0.01285001, + "balance_loss_clip": 0.06381316, + "balance_loss_mlp": 0.01279086, + "epoch": 0.3558695325417105, + "flos": 58656145426560.0, + "grad_norm": 0.8458689331650495, + "language_loss": 0.63255095, + "learning_rate": 2.985687839672857e-06, + "loss": 0.71001691, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.05911255, + "step": 5919, + "time_per_iteration": 2.9552297592163086 + }, + { + "auxiliary_loss_clip": 0.06485933, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 0.06302524, + "balance_loss_mlp": 0.01255998, + "epoch": 0.35592965579437846, + "flos": 22024811808000.0, + "grad_norm": 2.2679396062128188, + "language_loss": 0.74402696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.82160461, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.1583252, + "step": 5920, + "time_per_iteration": 2.54848313331604 + }, + { + "auxiliary_loss_clip": 0.06483243, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01262638, + "epoch": 0.35598977904704643, + "flos": 23374401194880.0, + "grad_norm": 3.1552684799501733, + "language_loss": 0.77735227, + "learning_rate": 2.985010009903857e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15332031, + "step": 5921, + "time_per_iteration": 2.6517810821533203 + }, + { + "auxiliary_loss_clip": 0.06490617, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 0.06309058, + "balance_loss_mlp": 0.01261329, + "epoch": 0.3560499022997144, + "flos": 17791113686400.0, + "grad_norm": 2.349487021583332, + "language_loss": 0.6770314, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.75470436, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15332031, + "step": 5922, + "time_per_iteration": 2.525566577911377 + }, + { + "auxiliary_loss_clip": 0.06484485, + "auxiliary_loss_mlp": 0.0127389, + "balance_loss_clip": 0.06306913, + "balance_loss_mlp": 0.01258524, + "epoch": 0.35611002555238236, + "flos": 20746695553920.0, + "grad_norm": 2.231194122260979, + "language_loss": 0.79304701, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.87063074, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15380859, + "step": 5923, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.06479051, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 0.06301268, + "balance_loss_mlp": 0.01257579, + "epoch": 0.3561701488050504, + "flos": 19468034248320.0, + "grad_norm": 1.61778925366919, + "language_loss": 0.8543126, + "learning_rate": 2.983992985144908e-06, + "loss": 0.93183035, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15148926, + "step": 5924, + "time_per_iteration": 2.524949312210083 + }, + { + "auxiliary_loss_clip": 0.06478724, + "auxiliary_loss_mlp": 0.01271843, + "balance_loss_clip": 0.06301951, + "balance_loss_mlp": 0.01255797, + "epoch": 0.35623027205771834, + "flos": 30783006046080.0, + "grad_norm": 1.9504196686726267, + "language_loss": 0.77609557, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.85360122, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.16033936, + "step": 5925, + "time_per_iteration": 2.6268069744110107 + }, + { + "auxiliary_loss_clip": 0.06472521, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06292735, + "balance_loss_mlp": 0.01258291, + "epoch": 0.3562903953103863, + "flos": 16986461328000.0, + "grad_norm": 1.8072288436418724, + "language_loss": 0.76488966, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.15441895, + "step": 5926, + "time_per_iteration": 2.492009401321411 + }, + { + "auxiliary_loss_clip": 0.064781, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 0.06293385, + "balance_loss_mlp": 0.01255478, + "epoch": 0.3563505185630543, + "flos": 23846271863040.0, + "grad_norm": 2.038892178711472, + "language_loss": 0.69665909, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.77415526, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16046143, + "step": 5927, + "time_per_iteration": 2.555192708969116 + }, + { + "auxiliary_loss_clip": 0.06471409, + "auxiliary_loss_mlp": 0.01273845, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.0125889, + "epoch": 0.35641064181572224, + "flos": 22280040944640.0, + "grad_norm": 1.7768317666214009, + "language_loss": 0.79454333, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.87199581, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.1494751, + "step": 5928, + "time_per_iteration": 2.5192928314208984 + }, + { + "auxiliary_loss_clip": 0.06473258, + "auxiliary_loss_mlp": 0.01271381, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3564707650683902, + "flos": 23007643873920.0, + "grad_norm": 1.230692465633979, + "language_loss": 0.8197661, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89721251, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1494751, + "step": 5929, + "time_per_iteration": 2.6044368743896484 + }, + { + "auxiliary_loss_clip": 0.0646459, + "auxiliary_loss_mlp": 0.01268428, + "balance_loss_clip": 0.06289564, + "balance_loss_mlp": 0.01253765, + "epoch": 0.35653088832105817, + "flos": 14689566806400.0, + "grad_norm": 1.5209281639747478, + "language_loss": 0.70385516, + "learning_rate": 2.981957928520201e-06, + "loss": 0.78118533, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14685059, + "step": 5930, + "time_per_iteration": 2.498253107070923 + }, + { + "auxiliary_loss_clip": 0.06473252, + "auxiliary_loss_mlp": 0.01273096, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01256943, + "epoch": 0.35659101157372614, + "flos": 23483791100160.0, + "grad_norm": 2.174064041384607, + "language_loss": 0.68760598, + "learning_rate": 2.981618622015244e-06, + "loss": 0.76506943, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16162109, + "step": 5931, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.06463969, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06288578, + "balance_loss_mlp": 0.01253788, + "epoch": 0.3566511348263941, + "flos": 26585966885760.0, + "grad_norm": 1.5444695234240167, + "language_loss": 0.68331707, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76064122, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06465189, + "auxiliary_loss_mlp": 0.01272147, + "balance_loss_clip": 0.06290227, + "balance_loss_mlp": 0.01257854, + "epoch": 0.35671125807906207, + "flos": 13119981724800.0, + "grad_norm": 2.4744838507658917, + "language_loss": 0.79635656, + "learning_rate": 2.980939897348969e-06, + "loss": 0.87372994, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14294434, + "step": 5933, + "time_per_iteration": 2.573812961578369 + }, + { + "auxiliary_loss_clip": 0.06470121, + "auxiliary_loss_mlp": 0.01270309, + "balance_loss_clip": 0.06288668, + "balance_loss_mlp": 0.01255372, + "epoch": 0.35677138133173003, + "flos": 33009014413440.0, + "grad_norm": 1.4096936090904761, + "language_loss": 0.69970256, + "learning_rate": 2.980600479213388e-06, + "loss": 0.77710688, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14929199, + "step": 5934, + "time_per_iteration": 2.6381173133850098 + }, + { + "auxiliary_loss_clip": 0.06481285, + "auxiliary_loss_mlp": 0.01277705, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01260741, + "epoch": 0.356831504584398, + "flos": 20784234983040.0, + "grad_norm": 2.103415594097178, + "language_loss": 0.72006869, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.79765862, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16967773, + "step": 5935, + "time_per_iteration": 2.620471954345703 + }, + { + "auxiliary_loss_clip": 0.06467808, + "auxiliary_loss_mlp": 0.01275583, + "balance_loss_clip": 0.06287988, + "balance_loss_mlp": 0.01261004, + "epoch": 0.35689162783706596, + "flos": 12170244821760.0, + "grad_norm": 2.011082803426264, + "language_loss": 0.78423738, + "learning_rate": 2.979921531401692e-06, + "loss": 0.86167133, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14569092, + "step": 5936, + "time_per_iteration": 2.4827091693878174 + }, + { + "auxiliary_loss_clip": 0.06466486, + "auxiliary_loss_mlp": 0.01273239, + "balance_loss_clip": 0.06289199, + "balance_loss_mlp": 0.01258147, + "epoch": 0.356951751089734, + "flos": 23848200506880.0, + "grad_norm": 1.8250890312079233, + "language_loss": 0.64893055, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.72632784, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15100098, + "step": 5937, + "time_per_iteration": 2.5968148708343506 + }, + { + "auxiliary_loss_clip": 0.06470716, + "auxiliary_loss_mlp": 0.01277052, + "balance_loss_clip": 0.06291182, + "balance_loss_mlp": 0.01261644, + "epoch": 0.35701187434240195, + "flos": 11725851093120.0, + "grad_norm": 3.2825373138133633, + "language_loss": 0.79029787, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.86777556, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15429688, + "step": 5938, + "time_per_iteration": 2.4724228382110596 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.01275118, + "balance_loss_clip": 0.06289655, + "balance_loss_mlp": 0.01259835, + "epoch": 0.3570719975950699, + "flos": 24905650233600.0, + "grad_norm": 2.3707612213619624, + "language_loss": 0.80684471, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88429582, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 5939, + "time_per_iteration": 4.067660331726074 + }, + { + "auxiliary_loss_clip": 0.06474897, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.01263357, + "epoch": 0.3571321208477379, + "flos": 26002022981760.0, + "grad_norm": 1.7209958005115653, + "language_loss": 0.79509544, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.8726303, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15228271, + "step": 5940, + "time_per_iteration": 3.961956262588501 + }, + { + "auxiliary_loss_clip": 0.06472583, + "auxiliary_loss_mlp": 0.01274024, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01258223, + "epoch": 0.35719224410040584, + "flos": 14506900905600.0, + "grad_norm": 2.455654522420387, + "language_loss": 0.72918689, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.80665296, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15802002, + "step": 5941, + "time_per_iteration": 2.529376745223999 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06291723, + "balance_loss_mlp": 0.01258577, + "epoch": 0.3572523673530738, + "flos": 31183445508480.0, + "grad_norm": 1.9522398224767823, + "language_loss": 0.64961332, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.72705185, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15667725, + "step": 5942, + "time_per_iteration": 2.6694955825805664 + }, + { + "auxiliary_loss_clip": 0.06470639, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06291504, + "balance_loss_mlp": 0.01257124, + "epoch": 0.3573124906057418, + "flos": 15857496541440.0, + "grad_norm": 1.9232266262089555, + "language_loss": 0.7463761, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.82381314, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.1595459, + "step": 5943, + "time_per_iteration": 2.5988807678222656 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01259877, + "balance_loss_clip": 0.06336363, + "balance_loss_mlp": 0.01254631, + "epoch": 0.35737261385840974, + "flos": 60839163849600.0, + "grad_norm": 0.8122274991603828, + "language_loss": 0.60684133, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.68360829, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.05249023, + "step": 5944, + "time_per_iteration": 3.2639529705047607 + }, + { + "auxiliary_loss_clip": 0.06467592, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06290887, + "balance_loss_mlp": 0.01259464, + "epoch": 0.3574327371110777, + "flos": 18849779297280.0, + "grad_norm": 1.8477550360079977, + "language_loss": 0.7280755, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80549395, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14782715, + "step": 5945, + "time_per_iteration": 3.974971294403076 + }, + { + "auxiliary_loss_clip": 0.06464474, + "auxiliary_loss_mlp": 0.01274521, + "balance_loss_clip": 0.06288721, + "balance_loss_mlp": 0.01259619, + "epoch": 0.35749286036374567, + "flos": 23556354336000.0, + "grad_norm": 1.6530257311602492, + "language_loss": 0.8152287, + "learning_rate": 2.976524564880326e-06, + "loss": 0.89261866, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14892578, + "step": 5946, + "time_per_iteration": 2.567702531814575 + }, + { + "auxiliary_loss_clip": 0.06472433, + "auxiliary_loss_mlp": 0.01275229, + "balance_loss_clip": 0.06292298, + "balance_loss_mlp": 0.01260036, + "epoch": 0.35755298361641363, + "flos": 21111817720320.0, + "grad_norm": 1.4004407917222146, + "language_loss": 0.69023073, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76770723, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.15209961, + "step": 5947, + "time_per_iteration": 2.531938076019287 + }, + { + "auxiliary_loss_clip": 0.06458312, + "auxiliary_loss_mlp": 0.01270008, + "balance_loss_clip": 0.06284653, + "balance_loss_mlp": 0.01256109, + "epoch": 0.3576131068690816, + "flos": 19251099227520.0, + "grad_norm": 2.059659188145791, + "language_loss": 0.75891036, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83619356, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13916016, + "step": 5948, + "time_per_iteration": 3.9236361980438232 + }, + { + "auxiliary_loss_clip": 0.06466205, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 0.06287337, + "balance_loss_mlp": 0.01261462, + "epoch": 0.35767323012174956, + "flos": 28661733682560.0, + "grad_norm": 1.6908098548641093, + "language_loss": 0.71228039, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.78970701, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15002441, + "step": 5949, + "time_per_iteration": 2.56809663772583 + }, + { + "auxiliary_loss_clip": 0.06464282, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06286816, + "balance_loss_mlp": 0.01266995, + "epoch": 0.35773335337441753, + "flos": 17089897593600.0, + "grad_norm": 1.7763817610233048, + "language_loss": 0.77821207, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.85567343, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1484375, + "step": 5950, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.06465182, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.0628643, + "balance_loss_mlp": 0.01261331, + "epoch": 0.35779347662708555, + "flos": 15894155502720.0, + "grad_norm": 2.1549260339424725, + "language_loss": 0.73109937, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.80851334, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14892578, + "step": 5951, + "time_per_iteration": 2.5201168060302734 + }, + { + "auxiliary_loss_clip": 0.06470691, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 0.06287189, + "balance_loss_mlp": 0.01262181, + "epoch": 0.3578535998797535, + "flos": 28666555292160.0, + "grad_norm": 1.9784791605149854, + "language_loss": 0.7026071, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.78009284, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15698242, + "step": 5952, + "time_per_iteration": 2.5931434631347656 + }, + { + "auxiliary_loss_clip": 0.0646029, + "auxiliary_loss_mlp": 0.01277333, + "balance_loss_clip": 0.06284408, + "balance_loss_mlp": 0.01263069, + "epoch": 0.3579137231324215, + "flos": 37861554464640.0, + "grad_norm": 1.6267089711440414, + "language_loss": 0.69578886, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77316511, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14276123, + "step": 5953, + "time_per_iteration": 2.668464422225952 + }, + { + "auxiliary_loss_clip": 0.0645823, + "auxiliary_loss_mlp": 0.01275685, + "balance_loss_clip": 0.06282876, + "balance_loss_mlp": 0.01261117, + "epoch": 0.35797384638508944, + "flos": 22353526575360.0, + "grad_norm": 1.5719996722989455, + "language_loss": 0.67333478, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.75067389, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14562988, + "step": 5954, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.06459846, + "auxiliary_loss_mlp": 0.01278708, + "balance_loss_clip": 0.06287006, + "balance_loss_mlp": 0.0126414, + "epoch": 0.3580339696377574, + "flos": 13594829212800.0, + "grad_norm": 1.8066455981447187, + "language_loss": 0.75335681, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.83074236, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14556885, + "step": 5955, + "time_per_iteration": 2.5049943923950195 + }, + { + "auxiliary_loss_clip": 0.06458074, + "auxiliary_loss_mlp": 0.01270596, + "balance_loss_clip": 0.06286005, + "balance_loss_mlp": 0.01256595, + "epoch": 0.3580940928904254, + "flos": 23774882584320.0, + "grad_norm": 1.7018331496498176, + "language_loss": 0.76155579, + "learning_rate": 2.973123895369182e-06, + "loss": 0.83884245, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14007568, + "step": 5956, + "time_per_iteration": 2.565455675125122 + }, + { + "auxiliary_loss_clip": 0.06456999, + "auxiliary_loss_mlp": 0.01278066, + "balance_loss_clip": 0.06286499, + "balance_loss_mlp": 0.01263415, + "epoch": 0.35815421614309334, + "flos": 19469962892160.0, + "grad_norm": 1.5319401259692025, + "language_loss": 0.73558611, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81293678, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14642334, + "step": 5957, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.064648, + "auxiliary_loss_mlp": 0.01274688, + "balance_loss_clip": 0.06291045, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3582143393957613, + "flos": 23374988173440.0, + "grad_norm": 2.1285308943055727, + "language_loss": 0.71748459, + "learning_rate": 2.972443318242726e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14477539, + "step": 5958, + "time_per_iteration": 2.566181182861328 + }, + { + "auxiliary_loss_clip": 0.06459813, + "auxiliary_loss_mlp": 0.01267621, + "balance_loss_clip": 0.06289116, + "balance_loss_mlp": 0.0125415, + "epoch": 0.35827446264842927, + "flos": 26330528113920.0, + "grad_norm": 1.6357791647016078, + "language_loss": 0.88725436, + "learning_rate": 2.972102974360324e-06, + "loss": 0.96452874, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13452148, + "step": 5959, + "time_per_iteration": 2.6218011379241943 + }, + { + "auxiliary_loss_clip": 0.06463417, + "auxiliary_loss_mlp": 0.01271505, + "balance_loss_clip": 0.06288788, + "balance_loss_mlp": 0.0125816, + "epoch": 0.35833458590109724, + "flos": 30454626695040.0, + "grad_norm": 1.5143701220572547, + "language_loss": 0.58769095, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66504014, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.13348389, + "step": 5960, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.06462947, + "auxiliary_loss_mlp": 0.01269103, + "balance_loss_clip": 0.06286879, + "balance_loss_mlp": 0.01253469, + "epoch": 0.3583947091537652, + "flos": 14835154475520.0, + "grad_norm": 2.541265940729937, + "language_loss": 0.76686686, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.84418738, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15631104, + "step": 5961, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.06464821, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06287968, + "balance_loss_mlp": 0.01261324, + "epoch": 0.35845483240643317, + "flos": 34249213895040.0, + "grad_norm": 1.6475679018941416, + "language_loss": 0.70478481, + "learning_rate": 2.971081721591294e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14099121, + "step": 5962, + "time_per_iteration": 2.6199357509613037 + }, + { + "auxiliary_loss_clip": 0.06464063, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01255207, + "epoch": 0.35851495565910113, + "flos": 20966481613440.0, + "grad_norm": 1.6496872805273144, + "language_loss": 0.75120842, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.82854319, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14221191, + "step": 5963, + "time_per_iteration": 2.5526950359344482 + }, + { + "auxiliary_loss_clip": 0.06467253, + "auxiliary_loss_mlp": 0.01271151, + "balance_loss_clip": 0.06291784, + "balance_loss_mlp": 0.01256322, + "epoch": 0.35857507891176915, + "flos": 22316448343680.0, + "grad_norm": 1.675466861885377, + "language_loss": 0.78945208, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.86683613, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14831543, + "step": 5964, + "time_per_iteration": 2.5257983207702637 + }, + { + "auxiliary_loss_clip": 0.0647264, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06292663, + "balance_loss_mlp": 0.0125726, + "epoch": 0.3586352021644371, + "flos": 23374610830080.0, + "grad_norm": 3.2898914726182684, + "language_loss": 0.667786, + "learning_rate": 2.970060137410626e-06, + "loss": 0.74523282, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.14764404, + "step": 5965, + "time_per_iteration": 2.5664315223693848 + }, + { + "auxiliary_loss_clip": 0.06463271, + "auxiliary_loss_mlp": 0.01271526, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.01256773, + "epoch": 0.3586953254171051, + "flos": 27855655804800.0, + "grad_norm": 1.5935311272675807, + "language_loss": 0.79428947, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87163734, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14746094, + "step": 5966, + "time_per_iteration": 2.576537609100342 + }, + { + "auxiliary_loss_clip": 0.06467331, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01254717, + "epoch": 0.35875544866977305, + "flos": 19506621853440.0, + "grad_norm": 2.077713447457672, + "language_loss": 0.91477883, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.99213958, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14044189, + "step": 5967, + "time_per_iteration": 2.553084135055542 + }, + { + "auxiliary_loss_clip": 0.06466691, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06288824, + "balance_loss_mlp": 0.01257261, + "epoch": 0.358815571922441, + "flos": 21477652646400.0, + "grad_norm": 1.8463229992001005, + "language_loss": 0.80835712, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.88575101, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15435791, + "step": 5968, + "time_per_iteration": 2.526298761367798 + }, + { + "auxiliary_loss_clip": 0.06467028, + "auxiliary_loss_mlp": 0.0127428, + "balance_loss_clip": 0.06287041, + "balance_loss_mlp": 0.012587, + "epoch": 0.358875695175109, + "flos": 21841894344960.0, + "grad_norm": 1.8179824378655614, + "language_loss": 0.84621, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.92362314, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15588379, + "step": 5969, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.0646342, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06288599, + "balance_loss_mlp": 0.01258664, + "epoch": 0.35893581842777694, + "flos": 32019264385920.0, + "grad_norm": 1.8505987075691241, + "language_loss": 0.72233456, + "learning_rate": 2.968356761586202e-06, + "loss": 0.79968911, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13366699, + "step": 5970, + "time_per_iteration": 2.581071615219116 + }, + { + "auxiliary_loss_clip": 0.06468321, + "auxiliary_loss_mlp": 0.01272468, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01258056, + "epoch": 0.3589959416804449, + "flos": 20492137249920.0, + "grad_norm": 1.5610077365233734, + "language_loss": 0.79753757, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.87494546, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14422607, + "step": 5971, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.0646906, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06288019, + "balance_loss_mlp": 0.01255006, + "epoch": 0.3590560649331129, + "flos": 16186295162880.0, + "grad_norm": 1.6291573791515084, + "language_loss": 0.78869599, + "learning_rate": 2.967675154124696e-06, + "loss": 0.86608684, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15026855, + "step": 5972, + "time_per_iteration": 2.4778740406036377 + }, + { + "auxiliary_loss_clip": 0.06465904, + "auxiliary_loss_mlp": 0.01274602, + "balance_loss_clip": 0.06286226, + "balance_loss_mlp": 0.01260201, + "epoch": 0.35911618818578084, + "flos": 20381531460480.0, + "grad_norm": 2.0141455740295875, + "language_loss": 0.81742013, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.89482516, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1439209, + "step": 5973, + "time_per_iteration": 2.532027006149292 + }, + { + "auxiliary_loss_clip": 0.06404248, + "auxiliary_loss_mlp": 0.01258065, + "balance_loss_clip": 0.06324309, + "balance_loss_mlp": 0.01254096, + "epoch": 0.3591763114384488, + "flos": 41250991645440.0, + "grad_norm": 0.9082562918021452, + "language_loss": 0.56514442, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.64176756, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03967285, + "step": 5974, + "time_per_iteration": 3.0029375553131104 + }, + { + "auxiliary_loss_clip": 0.06464389, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06286667, + "balance_loss_mlp": 0.01257781, + "epoch": 0.35923643469111677, + "flos": 18701047100160.0, + "grad_norm": 1.9591615340661908, + "language_loss": 0.69342583, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77078998, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.14227295, + "step": 5975, + "time_per_iteration": 2.5330698490142822 + }, + { + "auxiliary_loss_clip": 0.06462636, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06286036, + "balance_loss_mlp": 0.0125325, + "epoch": 0.35929655794378473, + "flos": 25017010709760.0, + "grad_norm": 1.597565036747504, + "language_loss": 0.8049522, + "learning_rate": 2.96631149897303e-06, + "loss": 0.88225687, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14575195, + "step": 5976, + "time_per_iteration": 2.5599968433380127 + }, + { + "auxiliary_loss_clip": 0.0646351, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06286681, + "balance_loss_mlp": 0.01253489, + "epoch": 0.35935668119645275, + "flos": 14980825998720.0, + "grad_norm": 1.8019140268476472, + "language_loss": 0.79171205, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.86903155, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1496582, + "step": 5977, + "time_per_iteration": 2.4876949787139893 + }, + { + "auxiliary_loss_clip": 0.06459211, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 0.0628271, + "balance_loss_mlp": 0.0125324, + "epoch": 0.3594168044491207, + "flos": 21184422883200.0, + "grad_norm": 1.897291031169604, + "language_loss": 0.80843097, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13342285, + "step": 5978, + "time_per_iteration": 2.5270771980285645 + }, + { + "auxiliary_loss_clip": 0.06458849, + "auxiliary_loss_mlp": 0.01272545, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01257703, + "epoch": 0.3594769277017887, + "flos": 27679446668160.0, + "grad_norm": 1.6570486295636508, + "language_loss": 0.67797875, + "learning_rate": 2.965288372816436e-06, + "loss": 0.75529265, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14819336, + "step": 5979, + "time_per_iteration": 5.427239179611206 + }, + { + "auxiliary_loss_clip": 0.06460471, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06282781, + "balance_loss_mlp": 0.01256323, + "epoch": 0.35953705095445665, + "flos": 23008901685120.0, + "grad_norm": 2.1534655116077928, + "language_loss": 0.67667198, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.75397921, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.13928223, + "step": 5980, + "time_per_iteration": 2.538149833679199 + }, + { + "auxiliary_loss_clip": 0.0647162, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06285568, + "balance_loss_mlp": 0.01257146, + "epoch": 0.3595971742071246, + "flos": 25520005969920.0, + "grad_norm": 2.2162969460708597, + "language_loss": 0.71122372, + "learning_rate": 2.964606105671327e-06, + "loss": 0.78867209, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16064453, + "step": 5981, + "time_per_iteration": 2.5711326599121094 + }, + { + "auxiliary_loss_clip": 0.06464566, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 0.06283125, + "balance_loss_mlp": 0.01256709, + "epoch": 0.3596572974597926, + "flos": 29870431228800.0, + "grad_norm": 2.0278025655936958, + "language_loss": 0.71914935, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.7965194, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.1572876, + "step": 5982, + "time_per_iteration": 2.6292126178741455 + }, + { + "auxiliary_loss_clip": 0.06458835, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 0.06286852, + "balance_loss_mlp": 0.0125428, + "epoch": 0.35971742071246054, + "flos": 23119255912320.0, + "grad_norm": 1.6791573126106523, + "language_loss": 0.7649492, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.84221637, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13598633, + "step": 5983, + "time_per_iteration": 2.540801763534546 + }, + { + "auxiliary_loss_clip": 0.06468493, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01257206, + "epoch": 0.3597775439651285, + "flos": 16730645212800.0, + "grad_norm": 1.651729152091261, + "language_loss": 0.77260226, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85001981, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16052246, + "step": 5984, + "time_per_iteration": 2.5278737545013428 + }, + { + "auxiliary_loss_clip": 0.06458455, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.0628411, + "balance_loss_mlp": 0.01256155, + "epoch": 0.3598376672177965, + "flos": 19725653226240.0, + "grad_norm": 2.0268922239891163, + "language_loss": 0.87093443, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.94822395, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.14355469, + "step": 5985, + "time_per_iteration": 3.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.06461216, + "auxiliary_loss_mlp": 0.01272807, + "balance_loss_clip": 0.06284203, + "balance_loss_mlp": 0.01258109, + "epoch": 0.35989779047046444, + "flos": 17317314374400.0, + "grad_norm": 1.4939910635791536, + "language_loss": 0.72980917, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80714941, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14709473, + "step": 5986, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.06469383, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06283881, + "balance_loss_mlp": 0.01254761, + "epoch": 0.3599579137231324, + "flos": 22717894055040.0, + "grad_norm": 2.903112824764454, + "language_loss": 0.73792106, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.81531143, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.14892578, + "step": 5987, + "time_per_iteration": 3.961486339569092 + }, + { + "auxiliary_loss_clip": 0.06467381, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06286356, + "balance_loss_mlp": 0.01255347, + "epoch": 0.36001803697580037, + "flos": 20966230051200.0, + "grad_norm": 1.8945086710394061, + "language_loss": 0.69721663, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.77459043, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.14648438, + "step": 5988, + "time_per_iteration": 2.5483100414276123 + }, + { + "auxiliary_loss_clip": 0.0647547, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_clip": 0.06292704, + "balance_loss_mlp": 0.01258209, + "epoch": 0.36007816022846834, + "flos": 20491843760640.0, + "grad_norm": 1.7927951606002523, + "language_loss": 0.7305057, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80799592, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15344238, + "step": 5989, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.06467338, + "auxiliary_loss_mlp": 0.01268061, + "balance_loss_clip": 0.06289014, + "balance_loss_mlp": 0.01254173, + "epoch": 0.36013828348113636, + "flos": 28008706487040.0, + "grad_norm": 1.4999082498201763, + "language_loss": 0.80117184, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87852585, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.13897705, + "step": 5990, + "time_per_iteration": 2.6733410358428955 + }, + { + "auxiliary_loss_clip": 0.06464024, + "auxiliary_loss_mlp": 0.01270971, + "balance_loss_clip": 0.0628631, + "balance_loss_mlp": 0.01255938, + "epoch": 0.3601984067338043, + "flos": 20088050135040.0, + "grad_norm": 1.799909646769202, + "language_loss": 0.84338784, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92073774, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15032959, + "step": 5991, + "time_per_iteration": 2.518554925918579 + }, + { + "auxiliary_loss_clip": 0.06474696, + "auxiliary_loss_mlp": 0.01276578, + "balance_loss_clip": 0.06292041, + "balance_loss_mlp": 0.01261367, + "epoch": 0.3602585299864723, + "flos": 18622362516480.0, + "grad_norm": 1.891276760716041, + "language_loss": 0.76406145, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.84157419, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1519165, + "step": 5992, + "time_per_iteration": 2.5224106311798096 + }, + { + "auxiliary_loss_clip": 0.06471405, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06294376, + "balance_loss_mlp": 0.01258496, + "epoch": 0.36031865323914025, + "flos": 19579059308160.0, + "grad_norm": 2.086772991356176, + "language_loss": 0.78120929, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8586548, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14648438, + "step": 5993, + "time_per_iteration": 2.5155129432678223 + }, + { + "auxiliary_loss_clip": 0.06474859, + "auxiliary_loss_mlp": 0.01271898, + "balance_loss_clip": 0.06293729, + "balance_loss_mlp": 0.01257807, + "epoch": 0.3603787764918082, + "flos": 17495871425280.0, + "grad_norm": 1.6487847999674183, + "language_loss": 0.74534261, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.82281017, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14086914, + "step": 5994, + "time_per_iteration": 2.647794723510742 + }, + { + "auxiliary_loss_clip": 0.06474246, + "auxiliary_loss_mlp": 0.01268785, + "balance_loss_clip": 0.06290799, + "balance_loss_mlp": 0.01254415, + "epoch": 0.3604388997444762, + "flos": 15528823701120.0, + "grad_norm": 1.8873654318884407, + "language_loss": 0.69500113, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14361572, + "step": 5995, + "time_per_iteration": 2.501981019973755 + }, + { + "auxiliary_loss_clip": 0.06479774, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06295834, + "balance_loss_mlp": 0.01261688, + "epoch": 0.36049902299714415, + "flos": 17316559687680.0, + "grad_norm": 1.8201062799427143, + "language_loss": 0.8309989, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.90856004, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14642334, + "step": 5996, + "time_per_iteration": 2.551095962524414 + }, + { + "auxiliary_loss_clip": 0.06472808, + "auxiliary_loss_mlp": 0.01275418, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01260493, + "epoch": 0.3605591462498121, + "flos": 17061749821440.0, + "grad_norm": 2.2503529028172804, + "language_loss": 0.73762429, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81510657, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14910889, + "step": 5997, + "time_per_iteration": 2.493100881576538 + }, + { + "auxiliary_loss_clip": 0.06465439, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06288476, + "balance_loss_mlp": 0.0125944, + "epoch": 0.3606192695024801, + "flos": 16842508813440.0, + "grad_norm": 2.0075843423569326, + "language_loss": 0.69582814, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.77322465, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14758301, + "step": 5998, + "time_per_iteration": 2.54227352142334 + }, + { + "auxiliary_loss_clip": 0.06468997, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06292363, + "balance_loss_mlp": 0.01258243, + "epoch": 0.36067939275514804, + "flos": 12134424401280.0, + "grad_norm": 2.607888629955908, + "language_loss": 0.77566224, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.8530767, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 5999, + "time_per_iteration": 2.456887722015381 + }, + { + "auxiliary_loss_clip": 0.06474666, + "auxiliary_loss_mlp": 0.01272087, + "balance_loss_clip": 0.06294585, + "balance_loss_mlp": 0.01257776, + "epoch": 0.360739516007816, + "flos": 18047390999040.0, + "grad_norm": 1.725953097254869, + "language_loss": 0.78777629, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.86524385, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14300537, + "step": 6000, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.06471578, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06294253, + "balance_loss_mlp": 0.01257854, + "epoch": 0.360799639260484, + "flos": 18555417504000.0, + "grad_norm": 1.7389483603698193, + "language_loss": 0.78602117, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86345226, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13671875, + "step": 6001, + "time_per_iteration": 2.4887304306030273 + }, + { + "auxiliary_loss_clip": 0.06462014, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.012549, + "epoch": 0.36085976251315194, + "flos": 19688029943040.0, + "grad_norm": 2.5640130860082206, + "language_loss": 0.83264118, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.90995204, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14178467, + "step": 6002, + "time_per_iteration": 2.523263931274414 + }, + { + "auxiliary_loss_clip": 0.06462792, + "auxiliary_loss_mlp": 0.01274754, + "balance_loss_clip": 0.06293326, + "balance_loss_mlp": 0.01262034, + "epoch": 0.3609198857658199, + "flos": 24204476067840.0, + "grad_norm": 2.058215255218527, + "language_loss": 0.91365647, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.991032, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.12713623, + "step": 6003, + "time_per_iteration": 2.5147922039031982 + }, + { + "auxiliary_loss_clip": 0.06424739, + "auxiliary_loss_mlp": 0.01257394, + "balance_loss_clip": 0.06345953, + "balance_loss_mlp": 0.01254351, + "epoch": 0.3609800090184879, + "flos": 57134288044800.0, + "grad_norm": 0.8495896975763515, + "language_loss": 0.53457719, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.61139846, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03041077, + "step": 6004, + "time_per_iteration": 3.1006038188934326 + }, + { + "auxiliary_loss_clip": 0.06473242, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06291834, + "balance_loss_mlp": 0.0125549, + "epoch": 0.3610401322711559, + "flos": 20817120510720.0, + "grad_norm": 1.7032625156204924, + "language_loss": 0.78291458, + "learning_rate": 2.956407517225883e-06, + "loss": 0.86035228, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15026855, + "step": 6005, + "time_per_iteration": 2.507681369781494 + }, + { + "auxiliary_loss_clip": 0.06466124, + "auxiliary_loss_mlp": 0.01274708, + "balance_loss_clip": 0.06289654, + "balance_loss_mlp": 0.01260373, + "epoch": 0.36110025552382385, + "flos": 13704302972160.0, + "grad_norm": 1.9788670063291258, + "language_loss": 0.79365236, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87106061, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14349365, + "step": 6006, + "time_per_iteration": 2.6221675872802734 + }, + { + "auxiliary_loss_clip": 0.06467897, + "auxiliary_loss_mlp": 0.01276481, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01260317, + "epoch": 0.3611603787764918, + "flos": 22461490961280.0, + "grad_norm": 1.8947484153914913, + "language_loss": 0.84532005, + "learning_rate": 2.955723356106876e-06, + "loss": 0.92276382, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.16162109, + "step": 6007, + "time_per_iteration": 2.5697944164276123 + }, + { + "auxiliary_loss_clip": 0.06477423, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06289505, + "balance_loss_mlp": 0.0126018, + "epoch": 0.3612205020291598, + "flos": 20892954055680.0, + "grad_norm": 2.2451481952848953, + "language_loss": 0.73192191, + "learning_rate": 2.955381221179198e-06, + "loss": 0.80945194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.1541748, + "step": 6008, + "time_per_iteration": 2.5410661697387695 + }, + { + "auxiliary_loss_clip": 0.06468849, + "auxiliary_loss_mlp": 0.01276747, + "balance_loss_clip": 0.06288531, + "balance_loss_mlp": 0.01262036, + "epoch": 0.36128062528182775, + "flos": 15747393876480.0, + "grad_norm": 2.0636796050179194, + "language_loss": 0.83194089, + "learning_rate": 2.955039050023368e-06, + "loss": 0.90939683, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1472168, + "step": 6009, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.06467466, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.012553, + "epoch": 0.3613407485344957, + "flos": 16770239066880.0, + "grad_norm": 1.996577445690206, + "language_loss": 0.7613554, + "learning_rate": 2.954696842652362e-06, + "loss": 0.83873594, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15258789, + "step": 6010, + "time_per_iteration": 2.501328468322754 + }, + { + "auxiliary_loss_clip": 0.064712, + "auxiliary_loss_mlp": 0.0127317, + "balance_loss_clip": 0.06292284, + "balance_loss_mlp": 0.01258734, + "epoch": 0.3614008717871637, + "flos": 20376625996800.0, + "grad_norm": 1.7565456089129825, + "language_loss": 0.8353886, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.91283226, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14440918, + "step": 6011, + "time_per_iteration": 2.5080785751342773 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01273027, + "balance_loss_clip": 0.06288376, + "balance_loss_mlp": 0.0125784, + "epoch": 0.36146099503983165, + "flos": 22782071882880.0, + "grad_norm": 2.5852128775447536, + "language_loss": 0.62982023, + "learning_rate": 2.954012319316727e-06, + "loss": 0.70728415, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15185547, + "step": 6012, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.06468817, + "auxiliary_loss_mlp": 0.01279391, + "balance_loss_clip": 0.06292222, + "balance_loss_mlp": 0.01264728, + "epoch": 0.3615211182924996, + "flos": 23002277212800.0, + "grad_norm": 2.060645495819417, + "language_loss": 0.83850408, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.91598618, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14648438, + "step": 6013, + "time_per_iteration": 2.511187791824341 + }, + { + "auxiliary_loss_clip": 0.06469796, + "auxiliary_loss_mlp": 0.01276155, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01259501, + "epoch": 0.3615812415451676, + "flos": 16652631461760.0, + "grad_norm": 1.9072870373759168, + "language_loss": 0.92107058, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.99853015, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.16638184, + "step": 6014, + "time_per_iteration": 2.498011350631714 + }, + { + "auxiliary_loss_clip": 0.06466013, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01260097, + "epoch": 0.36164136479783554, + "flos": 21325733994240.0, + "grad_norm": 8.045361949377702, + "language_loss": 0.73973721, + "learning_rate": 2.95298526302391e-06, + "loss": 0.81715214, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15393066, + "step": 6015, + "time_per_iteration": 2.5139665603637695 + }, + { + "auxiliary_loss_clip": 0.0646963, + "auxiliary_loss_mlp": 0.01277804, + "balance_loss_clip": 0.06291166, + "balance_loss_mlp": 0.01262151, + "epoch": 0.3617014880505035, + "flos": 24176286368640.0, + "grad_norm": 1.9455925595590893, + "language_loss": 0.65181047, + "learning_rate": 2.9526428386344e-06, + "loss": 0.72928476, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15637207, + "step": 6016, + "time_per_iteration": 2.5485315322875977 + }, + { + "auxiliary_loss_clip": 0.06469464, + "auxiliary_loss_mlp": 0.01276058, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.01259261, + "epoch": 0.3617616113031715, + "flos": 39023278997760.0, + "grad_norm": 1.6846943976812254, + "language_loss": 0.72102833, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.79848349, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16784668, + "step": 6017, + "time_per_iteration": 2.6685996055603027 + }, + { + "auxiliary_loss_clip": 0.06470844, + "auxiliary_loss_mlp": 0.01272479, + "balance_loss_clip": 0.06287402, + "balance_loss_mlp": 0.01256886, + "epoch": 0.3618217345558395, + "flos": 12135807993600.0, + "grad_norm": 2.3155685522099962, + "language_loss": 0.74387789, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15600586, + "step": 6018, + "time_per_iteration": 3.93249249458313 + }, + { + "auxiliary_loss_clip": 0.06458628, + "auxiliary_loss_mlp": 0.01273986, + "balance_loss_clip": 0.06287278, + "balance_loss_mlp": 0.0125856, + "epoch": 0.36188185780850746, + "flos": 24941722216320.0, + "grad_norm": 2.406612181934337, + "language_loss": 0.69554305, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.77286923, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1541748, + "step": 6019, + "time_per_iteration": 4.000872373580933 + }, + { + "auxiliary_loss_clip": 0.06472806, + "auxiliary_loss_mlp": 0.01271681, + "balance_loss_clip": 0.0628852, + "balance_loss_mlp": 0.01255815, + "epoch": 0.3619419810611754, + "flos": 20965014167040.0, + "grad_norm": 2.953778610066193, + "language_loss": 0.76874363, + "learning_rate": 2.95127277996311e-06, + "loss": 0.84618843, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15869141, + "step": 6020, + "time_per_iteration": 2.5465614795684814 + }, + { + "auxiliary_loss_clip": 0.06471147, + "auxiliary_loss_mlp": 0.01273965, + "balance_loss_clip": 0.06288891, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3620021043138434, + "flos": 22535521643520.0, + "grad_norm": 2.2311166939070097, + "language_loss": 0.74090236, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81835353, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16687012, + "step": 6021, + "time_per_iteration": 2.57817006111145 + }, + { + "auxiliary_loss_clip": 0.06467178, + "auxiliary_loss_mlp": 0.01270658, + "balance_loss_clip": 0.0628859, + "balance_loss_mlp": 0.01255685, + "epoch": 0.36206222756651135, + "flos": 15602183550720.0, + "grad_norm": 5.238961551513005, + "language_loss": 0.81591839, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.89329672, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1496582, + "step": 6022, + "time_per_iteration": 2.5385305881500244 + }, + { + "auxiliary_loss_clip": 0.06457289, + "auxiliary_loss_mlp": 0.0127544, + "balance_loss_clip": 0.06285636, + "balance_loss_mlp": 0.01260349, + "epoch": 0.3621223508191793, + "flos": 23594019546240.0, + "grad_norm": 2.318322058767841, + "language_loss": 0.81707698, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89440429, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15075684, + "step": 6023, + "time_per_iteration": 2.604048013687134 + }, + { + "auxiliary_loss_clip": 0.0647051, + "auxiliary_loss_mlp": 0.01276448, + "balance_loss_clip": 0.06288643, + "balance_loss_mlp": 0.01259795, + "epoch": 0.3621824740718473, + "flos": 22316490270720.0, + "grad_norm": 2.4056275848880038, + "language_loss": 0.80008531, + "learning_rate": 2.9499021441341e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16650391, + "step": 6024, + "time_per_iteration": 3.9998557567596436 + }, + { + "auxiliary_loss_clip": 0.06462081, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06288754, + "balance_loss_mlp": 0.01258599, + "epoch": 0.36224259732451525, + "flos": 16769232817920.0, + "grad_norm": 2.2201652107227354, + "language_loss": 0.75149572, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.82885349, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15112305, + "step": 6025, + "time_per_iteration": 2.5139317512512207 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01274175, + "balance_loss_clip": 0.06290425, + "balance_loss_mlp": 0.01260198, + "epoch": 0.3623027205771832, + "flos": 23156585706240.0, + "grad_norm": 1.704945166995659, + "language_loss": 0.72471905, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.80212557, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13989258, + "step": 6026, + "time_per_iteration": 3.974848985671997 + }, + { + "auxiliary_loss_clip": 0.06476888, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01260233, + "epoch": 0.3623628438298512, + "flos": 28556829970560.0, + "grad_norm": 1.945563554904942, + "language_loss": 0.79502189, + "learning_rate": 2.948873789002833e-06, + "loss": 0.87255979, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16662598, + "step": 6027, + "time_per_iteration": 2.614713430404663 + }, + { + "auxiliary_loss_clip": 0.06469107, + "auxiliary_loss_mlp": 0.01272818, + "balance_loss_clip": 0.06288799, + "balance_loss_mlp": 0.01256427, + "epoch": 0.36242296708251914, + "flos": 25492193614080.0, + "grad_norm": 4.95803648299326, + "language_loss": 0.68042505, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.75784421, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16381836, + "step": 6028, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.06467344, + "auxiliary_loss_mlp": 0.01275782, + "balance_loss_clip": 0.0629041, + "balance_loss_mlp": 0.01260273, + "epoch": 0.3624830903351871, + "flos": 16296062411520.0, + "grad_norm": 2.2968183263714983, + "language_loss": 0.85463655, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.93206775, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1550293, + "step": 6029, + "time_per_iteration": 2.519960403442383 + }, + { + "auxiliary_loss_clip": 0.06462874, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.0628645, + "balance_loss_mlp": 0.01255107, + "epoch": 0.36254321358785513, + "flos": 18302200865280.0, + "grad_norm": 1.7460468862336926, + "language_loss": 0.72888201, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.80621189, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15008545, + "step": 6030, + "time_per_iteration": 2.480053663253784 + }, + { + "auxiliary_loss_clip": 0.06476077, + "auxiliary_loss_mlp": 0.01274605, + "balance_loss_clip": 0.06291036, + "balance_loss_mlp": 0.01257558, + "epoch": 0.3626033368405231, + "flos": 14870387917440.0, + "grad_norm": 3.30241855147188, + "language_loss": 0.75249928, + "learning_rate": 2.94750214514905e-06, + "loss": 0.83000606, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.17041016, + "step": 6031, + "time_per_iteration": 2.4887540340423584 + }, + { + "auxiliary_loss_clip": 0.06465365, + "auxiliary_loss_mlp": 0.01279599, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01264245, + "epoch": 0.36266346009319106, + "flos": 22312632983040.0, + "grad_norm": 2.377019393957944, + "language_loss": 0.73490477, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81235439, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15344238, + "step": 6032, + "time_per_iteration": 2.5194106101989746 + }, + { + "auxiliary_loss_clip": 0.06471337, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06290144, + "balance_loss_mlp": 0.01258776, + "epoch": 0.362723583345859, + "flos": 18228044401920.0, + "grad_norm": 1.8908046818451942, + "language_loss": 0.78089464, + "learning_rate": 2.946816107593884e-06, + "loss": 0.85834849, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15270996, + "step": 6033, + "time_per_iteration": 2.6062612533569336 + }, + { + "auxiliary_loss_clip": 0.06434236, + "auxiliary_loss_mlp": 0.01267532, + "balance_loss_clip": 0.06350702, + "balance_loss_mlp": 0.01264055, + "epoch": 0.362783706598527, + "flos": 68519307456000.0, + "grad_norm": 0.7613876705351186, + "language_loss": 0.64809752, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.72511524, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.03485107, + "step": 6034, + "time_per_iteration": 3.216454267501831 + }, + { + "auxiliary_loss_clip": 0.06466131, + "auxiliary_loss_mlp": 0.01276184, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01260222, + "epoch": 0.36284382985119495, + "flos": 26583535117440.0, + "grad_norm": 2.053623051898619, + "language_loss": 0.89456552, + "learning_rate": 2.946129926425273e-06, + "loss": 0.97198874, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.15966797, + "step": 6035, + "time_per_iteration": 2.5606629848480225 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01272395, + "balance_loss_clip": 0.06295764, + "balance_loss_mlp": 0.0125592, + "epoch": 0.3629039531038629, + "flos": 20162919358080.0, + "grad_norm": 1.7740824971358589, + "language_loss": 0.73855877, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.81607592, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.16455078, + "step": 6036, + "time_per_iteration": 2.5144500732421875 + }, + { + "auxiliary_loss_clip": 0.06482191, + "auxiliary_loss_mlp": 0.01272832, + "balance_loss_clip": 0.06296846, + "balance_loss_mlp": 0.01256823, + "epoch": 0.3629640763565309, + "flos": 18631838027520.0, + "grad_norm": 1.8050884717083873, + "language_loss": 0.76438695, + "learning_rate": 2.945443601747297e-06, + "loss": 0.84193718, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16015625, + "step": 6037, + "time_per_iteration": 2.5286643505096436 + }, + { + "auxiliary_loss_clip": 0.06467965, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06292737, + "balance_loss_mlp": 0.01262546, + "epoch": 0.36302419960919885, + "flos": 19577256445440.0, + "grad_norm": 1.633141884703147, + "language_loss": 0.78871524, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86617458, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1541748, + "step": 6038, + "time_per_iteration": 2.5062947273254395 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 0.06318134, + "balance_loss_mlp": 0.01257723, + "epoch": 0.3630843228618668, + "flos": 63817805589120.0, + "grad_norm": 0.8140528620617334, + "language_loss": 0.63225597, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.70887518, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.03512573, + "step": 6039, + "time_per_iteration": 3.269761323928833 + }, + { + "auxiliary_loss_clip": 0.06467007, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06289599, + "balance_loss_mlp": 0.01253932, + "epoch": 0.3631444461145348, + "flos": 21841600855680.0, + "grad_norm": 2.592040544468795, + "language_loss": 0.71409321, + "learning_rate": 2.944413845878002e-06, + "loss": 0.79146034, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15783691, + "step": 6040, + "time_per_iteration": 2.5549709796905518 + }, + { + "auxiliary_loss_clip": 0.06477243, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 0.06293249, + "balance_loss_mlp": 0.01260277, + "epoch": 0.36320456936720275, + "flos": 21727850538240.0, + "grad_norm": 1.6745525965006305, + "language_loss": 0.81387192, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89140832, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.16113281, + "step": 6041, + "time_per_iteration": 2.529555320739746 + }, + { + "auxiliary_loss_clip": 0.06473525, + "auxiliary_loss_mlp": 0.01278326, + "balance_loss_clip": 0.0629223, + "balance_loss_mlp": 0.01261291, + "epoch": 0.3632646926198707, + "flos": 17024713516800.0, + "grad_norm": 3.0330286867158547, + "language_loss": 0.8477391, + "learning_rate": 2.943727162882107e-06, + "loss": 0.92525762, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.17016602, + "step": 6042, + "time_per_iteration": 2.52242112159729 + }, + { + "auxiliary_loss_clip": 0.06469671, + "auxiliary_loss_mlp": 0.01277961, + "balance_loss_clip": 0.06290909, + "balance_loss_mlp": 0.01261892, + "epoch": 0.36332481587253873, + "flos": 23337868014720.0, + "grad_norm": 1.7311470578574424, + "language_loss": 0.78563523, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.86311156, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.16088867, + "step": 6043, + "time_per_iteration": 2.5507187843322754 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.0127573, + "balance_loss_clip": 0.06289753, + "balance_loss_mlp": 0.01258755, + "epoch": 0.3633849391252067, + "flos": 10748134126080.0, + "grad_norm": 2.0752100798218245, + "language_loss": 0.66141021, + "learning_rate": 2.943040336741298e-06, + "loss": 0.73882145, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16967773, + "step": 6044, + "time_per_iteration": 2.5431315898895264 + }, + { + "auxiliary_loss_clip": 0.06470387, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06293066, + "balance_loss_mlp": 0.01258794, + "epoch": 0.36344506237787466, + "flos": 25856351458560.0, + "grad_norm": 1.7019744870222642, + "language_loss": 0.81317604, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89061964, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15185547, + "step": 6045, + "time_per_iteration": 2.578608274459839 + }, + { + "auxiliary_loss_clip": 0.06471765, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 0.06291001, + "balance_loss_mlp": 0.01260977, + "epoch": 0.3635051856305426, + "flos": 30161900056320.0, + "grad_norm": 1.9031490691130954, + "language_loss": 0.64869618, + "learning_rate": 2.942353367559755e-06, + "loss": 0.72618413, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16064453, + "step": 6046, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.06469898, + "auxiliary_loss_mlp": 0.01279877, + "balance_loss_clip": 0.06291277, + "balance_loss_mlp": 0.01264082, + "epoch": 0.3635653088832106, + "flos": 22204626670080.0, + "grad_norm": 1.4883910134219482, + "language_loss": 0.77790976, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.85540754, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15783691, + "step": 6047, + "time_per_iteration": 2.59384822845459 + }, + { + "auxiliary_loss_clip": 0.06482202, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 0.0629375, + "balance_loss_mlp": 0.01259794, + "epoch": 0.36362543213587856, + "flos": 24793409289600.0, + "grad_norm": 2.402065763679051, + "language_loss": 0.79315472, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87075114, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.1763916, + "step": 6048, + "time_per_iteration": 2.586355447769165 + }, + { + "auxiliary_loss_clip": 0.06388409, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06308184, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3636855553885465, + "flos": 62547320056320.0, + "grad_norm": 0.756250652706744, + "language_loss": 0.52505761, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.6017015, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.03759766, + "step": 6049, + "time_per_iteration": 3.1991608142852783 + }, + { + "auxiliary_loss_clip": 0.06471006, + "auxiliary_loss_mlp": 0.01281005, + "balance_loss_clip": 0.06289691, + "balance_loss_mlp": 0.01264518, + "epoch": 0.3637456786412145, + "flos": 24067441514880.0, + "grad_norm": 1.9518715754512581, + "language_loss": 0.8677333, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.94525343, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16467285, + "step": 6050, + "time_per_iteration": 2.619880437850952 + }, + { + "auxiliary_loss_clip": 0.06465575, + "auxiliary_loss_mlp": 0.01288294, + "balance_loss_clip": 0.06288004, + "balance_loss_mlp": 0.01271784, + "epoch": 0.36380580189388245, + "flos": 16697214633600.0, + "grad_norm": 2.0514222430242937, + "language_loss": 0.78671187, + "learning_rate": 2.940635319486546e-06, + "loss": 0.86425054, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.16491699, + "step": 6051, + "time_per_iteration": 2.5192694664001465 + }, + { + "auxiliary_loss_clip": 0.064697, + "auxiliary_loss_mlp": 0.0128748, + "balance_loss_clip": 0.06289212, + "balance_loss_mlp": 0.01271315, + "epoch": 0.3638659251465504, + "flos": 25120279267200.0, + "grad_norm": 2.1218426019343943, + "language_loss": 0.82423818, + "learning_rate": 2.940291602812822e-06, + "loss": 0.90180993, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16174316, + "step": 6052, + "time_per_iteration": 2.6190178394317627 + }, + { + "auxiliary_loss_clip": 0.06462704, + "auxiliary_loss_mlp": 0.01293914, + "balance_loss_clip": 0.06289209, + "balance_loss_mlp": 0.0127831, + "epoch": 0.3639260483992184, + "flos": 23009698298880.0, + "grad_norm": 1.6976848198598335, + "language_loss": 0.72702307, + "learning_rate": 2.939947850483145e-06, + "loss": 0.80458927, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.15588379, + "step": 6053, + "time_per_iteration": 2.5632545948028564 + }, + { + "auxiliary_loss_clip": 0.0637124, + "auxiliary_loss_mlp": 0.0126271, + "balance_loss_clip": 0.06291765, + "balance_loss_mlp": 0.01258046, + "epoch": 0.36398617165188635, + "flos": 70735043698560.0, + "grad_norm": 0.7367280535398725, + "language_loss": 0.61109686, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.68743634, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.04656982, + "step": 6054, + "time_per_iteration": 3.1670703887939453 + }, + { + "auxiliary_loss_clip": 0.06468257, + "auxiliary_loss_mlp": 0.01284514, + "balance_loss_clip": 0.06288631, + "balance_loss_mlp": 0.01267062, + "epoch": 0.3640462949045543, + "flos": 22241788755840.0, + "grad_norm": 2.4941401517388795, + "language_loss": 0.76399368, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.84152138, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.17456055, + "step": 6055, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.06463572, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01264023, + "epoch": 0.3641064181572223, + "flos": 21549964320000.0, + "grad_norm": 1.5003458585655993, + "language_loss": 0.75247842, + "learning_rate": 2.938916379688765e-06, + "loss": 0.82992232, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16784668, + "step": 6056, + "time_per_iteration": 2.548563241958618 + }, + { + "auxiliary_loss_clip": 0.06463505, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 0.06286436, + "balance_loss_mlp": 0.01271805, + "epoch": 0.3641665414098903, + "flos": 22279873236480.0, + "grad_norm": 1.8427248639079936, + "language_loss": 0.80231911, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.87983549, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16320801, + "step": 6057, + "time_per_iteration": 2.590890645980835 + }, + { + "auxiliary_loss_clip": 0.06463237, + "auxiliary_loss_mlp": 0.01288366, + "balance_loss_clip": 0.06286855, + "balance_loss_mlp": 0.01271259, + "epoch": 0.36422666466255826, + "flos": 28337211619200.0, + "grad_norm": 2.0267495677395106, + "language_loss": 0.80895132, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.88646734, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.17114258, + "step": 6058, + "time_per_iteration": 3.9912350177764893 + }, + { + "auxiliary_loss_clip": 0.06462751, + "auxiliary_loss_mlp": 0.01284352, + "balance_loss_clip": 0.06282878, + "balance_loss_mlp": 0.01267794, + "epoch": 0.36428678791522623, + "flos": 24177376471680.0, + "grad_norm": 1.829086801108262, + "language_loss": 0.84467566, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.9221468, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16552734, + "step": 6059, + "time_per_iteration": 3.9484288692474365 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01290231, + "balance_loss_clip": 0.06284287, + "balance_loss_mlp": 0.01274006, + "epoch": 0.3643469111678942, + "flos": 22535018519040.0, + "grad_norm": 1.8662633122766634, + "language_loss": 0.88296366, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96050501, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16223145, + "step": 6060, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.06469811, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.06287585, + "balance_loss_mlp": 0.01260611, + "epoch": 0.36440703442056216, + "flos": 19432549244160.0, + "grad_norm": 2.050716636944588, + "language_loss": 0.66968513, + "learning_rate": 2.937196549795971e-06, + "loss": 0.74716496, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.17578125, + "step": 6061, + "time_per_iteration": 2.4934303760528564 + }, + { + "auxiliary_loss_clip": 0.06472699, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06290831, + "balance_loss_mlp": 0.01259283, + "epoch": 0.3644671576732301, + "flos": 18046300896000.0, + "grad_norm": 2.6099029342135838, + "language_loss": 0.76223081, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.83971971, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16918945, + "step": 6062, + "time_per_iteration": 2.5342442989349365 + }, + { + "auxiliary_loss_clip": 0.06462175, + "auxiliary_loss_mlp": 0.01277866, + "balance_loss_clip": 0.06284274, + "balance_loss_mlp": 0.01261844, + "epoch": 0.3645272809258981, + "flos": 21549125779200.0, + "grad_norm": 1.679264330509425, + "language_loss": 0.7250427, + "learning_rate": 2.936508368977432e-06, + "loss": 0.80244315, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16027832, + "step": 6063, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.06463223, + "auxiliary_loss_mlp": 0.01278838, + "balance_loss_clip": 0.0628884, + "balance_loss_mlp": 0.0126256, + "epoch": 0.36458740417856605, + "flos": 22753379059200.0, + "grad_norm": 1.9927269992491163, + "language_loss": 0.67982519, + "learning_rate": 2.936164225292901e-06, + "loss": 0.75724578, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.16265869, + "step": 6064, + "time_per_iteration": 4.001475095748901 + }, + { + "auxiliary_loss_clip": 0.06469691, + "auxiliary_loss_mlp": 0.01281677, + "balance_loss_clip": 0.06288914, + "balance_loss_mlp": 0.01265131, + "epoch": 0.364647527431234, + "flos": 26147862213120.0, + "grad_norm": 2.2981357468080725, + "language_loss": 0.75006247, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.82757616, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16540527, + "step": 6065, + "time_per_iteration": 2.557175397872925 + }, + { + "auxiliary_loss_clip": 0.06475934, + "auxiliary_loss_mlp": 0.01274844, + "balance_loss_clip": 0.06292161, + "balance_loss_mlp": 0.01257487, + "epoch": 0.364707650683902, + "flos": 31037941693440.0, + "grad_norm": 1.8804228270875918, + "language_loss": 0.75913531, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.8366431, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.17370605, + "step": 6066, + "time_per_iteration": 4.028696537017822 + }, + { + "auxiliary_loss_clip": 0.06465262, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06290717, + "balance_loss_mlp": 0.01260124, + "epoch": 0.36476777393656995, + "flos": 19578933527040.0, + "grad_norm": 2.1324188585544293, + "language_loss": 0.77645338, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.85385728, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15014648, + "step": 6067, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.06463823, + "auxiliary_loss_mlp": 0.01273764, + "balance_loss_clip": 0.06289702, + "balance_loss_mlp": 0.01258684, + "epoch": 0.3648278971892379, + "flos": 17754622433280.0, + "grad_norm": 1.930394247385299, + "language_loss": 0.71678597, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7941618, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15063477, + "step": 6068, + "time_per_iteration": 2.4845492839813232 + }, + { + "auxiliary_loss_clip": 0.06473656, + "auxiliary_loss_mlp": 0.0127485, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.01258005, + "epoch": 0.3648880204419059, + "flos": 17936952917760.0, + "grad_norm": 1.8532098574136342, + "language_loss": 0.73989958, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.8173846, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16845703, + "step": 6069, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.06469753, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 0.06287999, + "balance_loss_mlp": 0.01261684, + "epoch": 0.3649481436945739, + "flos": 22644911548800.0, + "grad_norm": 1.9157179359535086, + "language_loss": 0.66736126, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.74483597, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.16027832, + "step": 6070, + "time_per_iteration": 2.516735076904297 + }, + { + "auxiliary_loss_clip": 0.06467332, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06291667, + "balance_loss_mlp": 0.01259169, + "epoch": 0.36500826694724187, + "flos": 21586036302720.0, + "grad_norm": 1.8858284323375742, + "language_loss": 0.7453323, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.82274926, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.1519165, + "step": 6071, + "time_per_iteration": 2.566274642944336 + }, + { + "auxiliary_loss_clip": 0.06468312, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.0629068, + "balance_loss_mlp": 0.0125703, + "epoch": 0.36506839019990983, + "flos": 13777746675840.0, + "grad_norm": 1.7184690359068113, + "language_loss": 0.88681865, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.96422982, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15771484, + "step": 6072, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.06471045, + "auxiliary_loss_mlp": 0.01276068, + "balance_loss_clip": 0.06292107, + "balance_loss_mlp": 0.01260285, + "epoch": 0.3651285134525778, + "flos": 17280739267200.0, + "grad_norm": 2.591250971390436, + "language_loss": 0.72601849, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.80348963, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15771484, + "step": 6073, + "time_per_iteration": 2.5448079109191895 + }, + { + "auxiliary_loss_clip": 0.06476631, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06296042, + "balance_loss_mlp": 0.0125422, + "epoch": 0.36518863670524576, + "flos": 21914415653760.0, + "grad_norm": 2.188049192517554, + "language_loss": 0.66876209, + "learning_rate": 2.932720838132236e-06, + "loss": 0.74623442, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16394043, + "step": 6074, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.06466351, + "auxiliary_loss_mlp": 0.01270864, + "balance_loss_clip": 0.06289779, + "balance_loss_mlp": 0.01255319, + "epoch": 0.3652487599579137, + "flos": 27128933343360.0, + "grad_norm": 1.455377552522792, + "language_loss": 0.73552799, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.81290013, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15551758, + "step": 6075, + "time_per_iteration": 2.5611414909362793 + }, + { + "auxiliary_loss_clip": 0.06476435, + "auxiliary_loss_mlp": 0.01270879, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01255107, + "epoch": 0.3653088832105817, + "flos": 19761683281920.0, + "grad_norm": 3.551310730384351, + "language_loss": 0.89872956, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97620273, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 6076, + "time_per_iteration": 2.491070508956909 + }, + { + "auxiliary_loss_clip": 0.06471214, + "auxiliary_loss_mlp": 0.01269524, + "balance_loss_clip": 0.06294619, + "balance_loss_mlp": 0.01253782, + "epoch": 0.36536900646324966, + "flos": 13119981724800.0, + "grad_norm": 1.9522812947590364, + "language_loss": 0.69894624, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7763536, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15740967, + "step": 6077, + "time_per_iteration": 2.5298445224761963 + }, + { + "auxiliary_loss_clip": 0.06367216, + "auxiliary_loss_mlp": 0.01255974, + "balance_loss_clip": 0.06288684, + "balance_loss_mlp": 0.0125196, + "epoch": 0.3654291297159176, + "flos": 71122848393600.0, + "grad_norm": 0.715882721223993, + "language_loss": 0.61670828, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.69294018, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.04013062, + "step": 6078, + "time_per_iteration": 3.245680093765259 + }, + { + "auxiliary_loss_clip": 0.06468864, + "auxiliary_loss_mlp": 0.01269715, + "balance_loss_clip": 0.0628942, + "balance_loss_mlp": 0.01254217, + "epoch": 0.3654892529685856, + "flos": 23623299348480.0, + "grad_norm": 2.6954686860737427, + "language_loss": 0.78565228, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86303806, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1550293, + "step": 6079, + "time_per_iteration": 2.5243916511535645 + }, + { + "auxiliary_loss_clip": 0.06474455, + "auxiliary_loss_mlp": 0.0127227, + "balance_loss_clip": 0.06293908, + "balance_loss_mlp": 0.01255557, + "epoch": 0.36554937622125355, + "flos": 43480788174720.0, + "grad_norm": 2.827080544182906, + "language_loss": 0.62854588, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.70601308, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16711426, + "step": 6080, + "time_per_iteration": 2.755979299545288 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06292675, + "balance_loss_mlp": 0.012568, + "epoch": 0.3656094994739215, + "flos": 23301334834560.0, + "grad_norm": 2.0380719718304046, + "language_loss": 0.68215913, + "learning_rate": 2.930308361895352e-06, + "loss": 0.75963295, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16674805, + "step": 6081, + "time_per_iteration": 2.5318713188171387 + }, + { + "auxiliary_loss_clip": 0.06476995, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 0.06289314, + "balance_loss_mlp": 0.01267021, + "epoch": 0.3656696227265895, + "flos": 24578947964160.0, + "grad_norm": 1.6214502004720641, + "language_loss": 0.75242162, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.83002377, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 6082, + "time_per_iteration": 2.614473819732666 + }, + { + "auxiliary_loss_clip": 0.06474194, + "auxiliary_loss_mlp": 0.0127049, + "balance_loss_clip": 0.06295186, + "balance_loss_mlp": 0.01255851, + "epoch": 0.3657297459792575, + "flos": 27935849761920.0, + "grad_norm": 4.519769037138984, + "language_loss": 0.83192384, + "learning_rate": 2.929618765277987e-06, + "loss": 0.90937066, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.14660645, + "step": 6083, + "time_per_iteration": 2.569382429122925 + }, + { + "auxiliary_loss_clip": 0.06373743, + "auxiliary_loss_mlp": 0.01258609, + "balance_loss_clip": 0.06293802, + "balance_loss_mlp": 0.01254855, + "epoch": 0.36578986923192547, + "flos": 67410566231040.0, + "grad_norm": 0.7897440828264927, + "language_loss": 0.59315842, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.66948193, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03747559, + "step": 6084, + "time_per_iteration": 3.2453150749206543 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06292025, + "balance_loss_mlp": 0.01258801, + "epoch": 0.36584999248459343, + "flos": 20233302387840.0, + "grad_norm": 1.9605927592145687, + "language_loss": 0.73469806, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81214333, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15734863, + "step": 6085, + "time_per_iteration": 2.5149080753326416 + }, + { + "auxiliary_loss_clip": 0.06475443, + "auxiliary_loss_mlp": 0.01272781, + "balance_loss_clip": 0.06296027, + "balance_loss_mlp": 0.01256974, + "epoch": 0.3659101157372614, + "flos": 19068475253760.0, + "grad_norm": 1.7755618246241633, + "language_loss": 0.78367889, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86116111, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15802002, + "step": 6086, + "time_per_iteration": 2.6959855556488037 + }, + { + "auxiliary_loss_clip": 0.06460601, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.0628686, + "balance_loss_mlp": 0.01262449, + "epoch": 0.36597023898992936, + "flos": 30818658758400.0, + "grad_norm": 2.7333963743808387, + "language_loss": 0.77419388, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.85157609, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15185547, + "step": 6087, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.06470397, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06288096, + "balance_loss_mlp": 0.0126543, + "epoch": 0.36603036224259733, + "flos": 20528041524480.0, + "grad_norm": 2.0856395013908005, + "language_loss": 0.70779794, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.78531569, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15948486, + "step": 6088, + "time_per_iteration": 2.5904111862182617 + }, + { + "auxiliary_loss_clip": 0.064822, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.06290494, + "balance_loss_mlp": 0.01258835, + "epoch": 0.3660904854952653, + "flos": 38339043356160.0, + "grad_norm": 1.5018444157956148, + "language_loss": 0.8073988, + "learning_rate": 2.92754912981472e-06, + "loss": 0.88499188, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.18273926, + "step": 6089, + "time_per_iteration": 2.695387125015259 + }, + { + "auxiliary_loss_clip": 0.06466638, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06289521, + "balance_loss_mlp": 0.01254065, + "epoch": 0.36615060874793326, + "flos": 21842062053120.0, + "grad_norm": 1.783943984741075, + "language_loss": 0.71745276, + "learning_rate": 2.927204067389884e-06, + "loss": 0.79480195, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14208984, + "step": 6090, + "time_per_iteration": 2.5730583667755127 + }, + { + "auxiliary_loss_clip": 0.06467035, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06292006, + "balance_loss_mlp": 0.01254585, + "epoch": 0.3662107320006012, + "flos": 16587153895680.0, + "grad_norm": 1.8168526275922985, + "language_loss": 0.74269617, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82006675, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.1541748, + "step": 6091, + "time_per_iteration": 2.5094668865203857 + }, + { + "auxiliary_loss_clip": 0.06470925, + "auxiliary_loss_mlp": 0.01271934, + "balance_loss_clip": 0.062924, + "balance_loss_mlp": 0.01256699, + "epoch": 0.3662708552532692, + "flos": 20964469115520.0, + "grad_norm": 2.9410218249320796, + "language_loss": 0.72888803, + "learning_rate": 2.926513837074284e-06, + "loss": 0.80631661, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15234375, + "step": 6092, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.06472248, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01260833, + "epoch": 0.36633097850593715, + "flos": 21908252378880.0, + "grad_norm": 2.382181592286333, + "language_loss": 0.78829455, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.86578685, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.16174316, + "step": 6093, + "time_per_iteration": 2.519925355911255 + }, + { + "auxiliary_loss_clip": 0.06470528, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06288898, + "balance_loss_mlp": 0.0125743, + "epoch": 0.3663911017586051, + "flos": 32862462422400.0, + "grad_norm": 1.6789792555665461, + "language_loss": 0.74561131, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82304573, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15478516, + "step": 6094, + "time_per_iteration": 2.6374077796936035 + }, + { + "auxiliary_loss_clip": 0.06470601, + "auxiliary_loss_mlp": 0.01277645, + "balance_loss_clip": 0.06289363, + "balance_loss_mlp": 0.01261421, + "epoch": 0.3664512250112731, + "flos": 27279132986880.0, + "grad_norm": 1.6273421100585188, + "language_loss": 0.7975142, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87499666, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16223145, + "step": 6095, + "time_per_iteration": 2.565009117126465 + }, + { + "auxiliary_loss_clip": 0.06480707, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06295107, + "balance_loss_mlp": 0.01258552, + "epoch": 0.3665113482639411, + "flos": 17790065510400.0, + "grad_norm": 2.4875649346087725, + "language_loss": 0.73963505, + "learning_rate": 2.925132954945834e-06, + "loss": 0.81719339, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16577148, + "step": 6096, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06474067, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 0.06288943, + "balance_loss_mlp": 0.01255901, + "epoch": 0.36657147151660907, + "flos": 27861944860800.0, + "grad_norm": 1.9533584433338151, + "language_loss": 0.67592847, + "learning_rate": 2.924787646678155e-06, + "loss": 0.75338453, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15649414, + "step": 6097, + "time_per_iteration": 4.085919618606567 + }, + { + "auxiliary_loss_clip": 0.06474558, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06292384, + "balance_loss_mlp": 0.01257204, + "epoch": 0.36663159476927704, + "flos": 25381000846080.0, + "grad_norm": 1.4284875999183062, + "language_loss": 0.77924675, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85672289, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15856934, + "step": 6098, + "time_per_iteration": 4.075935363769531 + }, + { + "auxiliary_loss_clip": 0.06469452, + "auxiliary_loss_mlp": 0.01270135, + "balance_loss_clip": 0.06291129, + "balance_loss_mlp": 0.01254751, + "epoch": 0.366691718021945, + "flos": 21362979934080.0, + "grad_norm": 2.6338542151665862, + "language_loss": 0.73907244, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15386963, + "step": 6099, + "time_per_iteration": 2.5343947410583496 + }, + { + "auxiliary_loss_clip": 0.06462912, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06286579, + "balance_loss_mlp": 0.01256695, + "epoch": 0.36675184127461297, + "flos": 16806017560320.0, + "grad_norm": 1.7024924966611934, + "language_loss": 0.84795189, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.92529464, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14660645, + "step": 6100, + "time_per_iteration": 2.5503897666931152 + }, + { + "auxiliary_loss_clip": 0.06478457, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06293124, + "balance_loss_mlp": 0.0125216, + "epoch": 0.36681196452728093, + "flos": 21912696645120.0, + "grad_norm": 2.268106387872694, + "language_loss": 0.712331, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.78979969, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.16235352, + "step": 6101, + "time_per_iteration": 2.5698294639587402 + }, + { + "auxiliary_loss_clip": 0.06474541, + "auxiliary_loss_mlp": 0.01273553, + "balance_loss_clip": 0.0629383, + "balance_loss_mlp": 0.01257137, + "epoch": 0.3668720877799489, + "flos": 17718215034240.0, + "grad_norm": 2.179497141372214, + "language_loss": 0.76701671, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.84449768, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16418457, + "step": 6102, + "time_per_iteration": 2.653047561645508 + }, + { + "auxiliary_loss_clip": 0.06477299, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.06290299, + "balance_loss_mlp": 0.01262099, + "epoch": 0.36693221103261686, + "flos": 47055882804480.0, + "grad_norm": 1.641444039565929, + "language_loss": 0.70188046, + "learning_rate": 2.922715061101625e-06, + "loss": 0.77944791, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17333984, + "step": 6103, + "time_per_iteration": 2.7502424716949463 + }, + { + "auxiliary_loss_clip": 0.06472746, + "auxiliary_loss_mlp": 0.01272056, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.01255581, + "epoch": 0.3669923342852848, + "flos": 15966383322240.0, + "grad_norm": 1.6662921664183201, + "language_loss": 0.71920598, + "learning_rate": 2.922369507632716e-06, + "loss": 0.79665399, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.16467285, + "step": 6104, + "time_per_iteration": 3.993805408477783 + }, + { + "auxiliary_loss_clip": 0.0647142, + "auxiliary_loss_mlp": 0.01272456, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01256494, + "epoch": 0.3670524575379528, + "flos": 19980630800640.0, + "grad_norm": 1.7978052174853272, + "language_loss": 0.81448174, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.89192045, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15966797, + "step": 6105, + "time_per_iteration": 3.907820463180542 + }, + { + "auxiliary_loss_clip": 0.06477002, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06288886, + "balance_loss_mlp": 0.01254896, + "epoch": 0.36711258079062076, + "flos": 25710092956800.0, + "grad_norm": 1.7139492182529468, + "language_loss": 0.81421959, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89171767, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17919922, + "step": 6106, + "time_per_iteration": 2.5623860359191895 + }, + { + "auxiliary_loss_clip": 0.06422871, + "auxiliary_loss_mlp": 0.01259281, + "balance_loss_clip": 0.06342293, + "balance_loss_mlp": 0.01254903, + "epoch": 0.3671727040432887, + "flos": 60793014648960.0, + "grad_norm": 0.6928078159632836, + "language_loss": 0.59215379, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.66897523, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04385376, + "step": 6107, + "time_per_iteration": 3.2451207637786865 + }, + { + "auxiliary_loss_clip": 0.06468046, + "auxiliary_loss_mlp": 0.01273048, + "balance_loss_clip": 0.06291793, + "balance_loss_mlp": 0.01257396, + "epoch": 0.3672328272959567, + "flos": 18667281104640.0, + "grad_norm": 1.5826982165866754, + "language_loss": 0.74750638, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82491726, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15631104, + "step": 6108, + "time_per_iteration": 2.5317509174346924 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06289458, + "balance_loss_mlp": 0.01260482, + "epoch": 0.3672929505486247, + "flos": 15054395483520.0, + "grad_norm": 2.0251921146130547, + "language_loss": 0.74524188, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.82272649, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.15490723, + "step": 6109, + "time_per_iteration": 2.530214309692383 + }, + { + "auxiliary_loss_clip": 0.06464404, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 0.06286883, + "balance_loss_mlp": 0.0125503, + "epoch": 0.3673530738012927, + "flos": 20594693047680.0, + "grad_norm": 1.6431777634434088, + "language_loss": 0.53560948, + "learning_rate": 2.920295452774744e-06, + "loss": 0.61295497, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15112305, + "step": 6110, + "time_per_iteration": 2.5247035026550293 + }, + { + "auxiliary_loss_clip": 0.06459565, + "auxiliary_loss_mlp": 0.01275062, + "balance_loss_clip": 0.06284792, + "balance_loss_mlp": 0.01258957, + "epoch": 0.36741319705396064, + "flos": 21696348602880.0, + "grad_norm": 1.814369900920369, + "language_loss": 0.80767608, + "learning_rate": 2.919949654746672e-06, + "loss": 0.8850224, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.16088867, + "step": 6111, + "time_per_iteration": 2.6213719844818115 + }, + { + "auxiliary_loss_clip": 0.06459287, + "auxiliary_loss_mlp": 0.01273038, + "balance_loss_clip": 0.06284556, + "balance_loss_mlp": 0.01256861, + "epoch": 0.3674733203066286, + "flos": 29870011958400.0, + "grad_norm": 1.7131296557309772, + "language_loss": 0.72860467, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80592787, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.16174316, + "step": 6112, + "time_per_iteration": 2.656101703643799 + }, + { + "auxiliary_loss_clip": 0.06459092, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06283998, + "balance_loss_mlp": 0.01257866, + "epoch": 0.36753344355929657, + "flos": 18262439303040.0, + "grad_norm": 1.5099687925303509, + "language_loss": 0.85667342, + "learning_rate": 2.919257954049892e-06, + "loss": 0.93399429, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15124512, + "step": 6113, + "time_per_iteration": 2.5230536460876465 + }, + { + "auxiliary_loss_clip": 0.06460717, + "auxiliary_loss_mlp": 0.01276985, + "balance_loss_clip": 0.06281444, + "balance_loss_mlp": 0.01260439, + "epoch": 0.36759356681196453, + "flos": 25308144120960.0, + "grad_norm": 1.9025835930032806, + "language_loss": 0.78706479, + "learning_rate": 2.918912051407413e-06, + "loss": 0.86444181, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.16540527, + "step": 6114, + "time_per_iteration": 2.6091229915618896 + }, + { + "auxiliary_loss_clip": 0.06466475, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.0628548, + "balance_loss_mlp": 0.01255725, + "epoch": 0.3676536900646325, + "flos": 21039338338560.0, + "grad_norm": 1.6305517572579116, + "language_loss": 0.67626929, + "learning_rate": 2.918566113919698e-06, + "loss": 0.75366318, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.17199707, + "step": 6115, + "time_per_iteration": 2.5226221084594727 + }, + { + "auxiliary_loss_clip": 0.06454025, + "auxiliary_loss_mlp": 0.01272139, + "balance_loss_clip": 0.06280309, + "balance_loss_mlp": 0.01257077, + "epoch": 0.36771381331730046, + "flos": 16293882205440.0, + "grad_norm": 2.2835896682412105, + "language_loss": 0.76996851, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.84723008, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15063477, + "step": 6116, + "time_per_iteration": 2.504951238632202 + }, + { + "auxiliary_loss_clip": 0.06459618, + "auxiliary_loss_mlp": 0.01274615, + "balance_loss_clip": 0.06282905, + "balance_loss_mlp": 0.01259153, + "epoch": 0.36777393656996843, + "flos": 22316574124800.0, + "grad_norm": 1.8264539284878285, + "language_loss": 0.62890095, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.70624328, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15454102, + "step": 6117, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.06458353, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06285255, + "balance_loss_mlp": 0.01254749, + "epoch": 0.3678340598226364, + "flos": 26841405657600.0, + "grad_norm": 1.7359331247938332, + "language_loss": 0.73532575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.81261057, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6118, + "time_per_iteration": 2.6261374950408936 + }, + { + "auxiliary_loss_clip": 0.06469986, + "auxiliary_loss_mlp": 0.01276003, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01259707, + "epoch": 0.36789418307530436, + "flos": 21768073297920.0, + "grad_norm": 1.5781425493049515, + "language_loss": 0.73047614, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.80793607, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 6119, + "time_per_iteration": 2.5320048332214355 + }, + { + "auxiliary_loss_clip": 0.06466002, + "auxiliary_loss_mlp": 0.0127303, + "balance_loss_clip": 0.06290065, + "balance_loss_mlp": 0.0125789, + "epoch": 0.3679543063279723, + "flos": 15929598579840.0, + "grad_norm": 2.0565678381587307, + "language_loss": 0.8018201, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.87921047, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15136719, + "step": 6120, + "time_per_iteration": 2.5085418224334717 + }, + { + "auxiliary_loss_clip": 0.06467941, + "auxiliary_loss_mlp": 0.01276389, + "balance_loss_clip": 0.0629365, + "balance_loss_mlp": 0.01260868, + "epoch": 0.3680144295806403, + "flos": 24281693205120.0, + "grad_norm": 2.0719591239633703, + "language_loss": 0.64803445, + "learning_rate": 2.916489757978126e-06, + "loss": 0.72547781, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15515137, + "step": 6121, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.06466727, + "auxiliary_loss_mlp": 0.01268749, + "balance_loss_clip": 0.06293779, + "balance_loss_mlp": 0.01254527, + "epoch": 0.36807455283330826, + "flos": 26111329032960.0, + "grad_norm": 1.9648479350594452, + "language_loss": 0.71416938, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.79152405, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14221191, + "step": 6122, + "time_per_iteration": 2.5836074352264404 + }, + { + "auxiliary_loss_clip": 0.06461313, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.0125831, + "epoch": 0.3681346760859763, + "flos": 24651972397440.0, + "grad_norm": 1.8972357597085572, + "language_loss": 0.69858962, + "learning_rate": 2.915797361163875e-06, + "loss": 0.77593577, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15002441, + "step": 6123, + "time_per_iteration": 2.5574307441711426 + }, + { + "auxiliary_loss_clip": 0.06474412, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06293641, + "balance_loss_mlp": 0.01256094, + "epoch": 0.36819479933864424, + "flos": 23885152957440.0, + "grad_norm": 2.796866262853862, + "language_loss": 0.74766016, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.8251307, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.16540527, + "step": 6124, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.06470435, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.01258116, + "epoch": 0.3682549225913122, + "flos": 25560606072960.0, + "grad_norm": 3.2532876436035236, + "language_loss": 0.74467599, + "learning_rate": 2.915104825441114e-06, + "loss": 0.82212794, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16625977, + "step": 6125, + "time_per_iteration": 2.5822880268096924 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296605, + "balance_loss_mlp": 0.01253967, + "epoch": 0.36831504584398017, + "flos": 16952317989120.0, + "grad_norm": 1.938795434914092, + "language_loss": 0.7843706, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.86184579, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16809082, + "step": 6126, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.06483818, + "auxiliary_loss_mlp": 0.01275366, + "balance_loss_clip": 0.06301596, + "balance_loss_mlp": 0.01257413, + "epoch": 0.36837516909664814, + "flos": 19871198968320.0, + "grad_norm": 2.3034543329783173, + "language_loss": 0.66139042, + "learning_rate": 2.914412150914888e-06, + "loss": 0.73898232, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.17980957, + "step": 6127, + "time_per_iteration": 2.5208253860473633 + }, + { + "auxiliary_loss_clip": 0.06475674, + "auxiliary_loss_mlp": 0.01272228, + "balance_loss_clip": 0.06294744, + "balance_loss_mlp": 0.01256409, + "epoch": 0.3684352923493161, + "flos": 37634976224640.0, + "grad_norm": 1.7597572196634643, + "language_loss": 0.70472896, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.78220791, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.15808105, + "step": 6128, + "time_per_iteration": 2.6984474658966064 + }, + { + "auxiliary_loss_clip": 0.06467833, + "auxiliary_loss_mlp": 0.01270944, + "balance_loss_clip": 0.06293194, + "balance_loss_mlp": 0.01255613, + "epoch": 0.36849541560198407, + "flos": 14470786995840.0, + "grad_norm": 1.6868142680460214, + "language_loss": 0.7591843, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.83657211, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15332031, + "step": 6129, + "time_per_iteration": 2.49924898147583 + }, + { + "auxiliary_loss_clip": 0.06473218, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 0.06296876, + "balance_loss_mlp": 0.01255844, + "epoch": 0.36855553885465203, + "flos": 25777037969280.0, + "grad_norm": 1.6502765336301308, + "language_loss": 0.85087365, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.92831397, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.1496582, + "step": 6130, + "time_per_iteration": 2.604851484298706 + }, + { + "auxiliary_loss_clip": 0.06391466, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06313837, + "balance_loss_mlp": 0.01261091, + "epoch": 0.36861566210732, + "flos": 65071715212800.0, + "grad_norm": 0.7916436629428728, + "language_loss": 0.60275888, + "learning_rate": 2.913026385872321e-06, + "loss": 0.67931175, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02740479, + "step": 6131, + "time_per_iteration": 3.228571891784668 + }, + { + "auxiliary_loss_clip": 0.0647023, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06296837, + "balance_loss_mlp": 0.01255332, + "epoch": 0.36867578535998796, + "flos": 30962108148480.0, + "grad_norm": 1.7580055354180455, + "language_loss": 0.73204952, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.8094579, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.152771, + "step": 6132, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.06478602, + "auxiliary_loss_mlp": 0.01273616, + "balance_loss_clip": 0.0629575, + "balance_loss_mlp": 0.0125738, + "epoch": 0.3687359086126559, + "flos": 28845154270080.0, + "grad_norm": 1.8077518075699008, + "language_loss": 0.7455107, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.82303286, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16235352, + "step": 6133, + "time_per_iteration": 2.6024398803710938 + }, + { + "auxiliary_loss_clip": 0.06463782, + "auxiliary_loss_mlp": 0.0127464, + "balance_loss_clip": 0.06292324, + "balance_loss_mlp": 0.01258618, + "epoch": 0.3687960318653239, + "flos": 21403076912640.0, + "grad_norm": 1.7721182564640174, + "language_loss": 0.7199074, + "learning_rate": 2.911986698512874e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.16027832, + "step": 6134, + "time_per_iteration": 2.646097421646118 + }, + { + "auxiliary_loss_clip": 0.0646476, + "auxiliary_loss_mlp": 0.0126875, + "balance_loss_clip": 0.06289706, + "balance_loss_mlp": 0.01252288, + "epoch": 0.36885615511799186, + "flos": 20272183482240.0, + "grad_norm": 4.124945820193244, + "language_loss": 0.7570188, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83435392, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.16455078, + "step": 6135, + "time_per_iteration": 2.6019539833068848 + }, + { + "auxiliary_loss_clip": 0.06382909, + "auxiliary_loss_mlp": 0.01256883, + "balance_loss_clip": 0.06304377, + "balance_loss_mlp": 0.0125392, + "epoch": 0.3689162783706599, + "flos": 63106317371520.0, + "grad_norm": 0.7816734524389999, + "language_loss": 0.58664352, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.66304147, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.02960205, + "step": 6136, + "time_per_iteration": 3.139789342880249 + }, + { + "auxiliary_loss_clip": 0.06465235, + "auxiliary_loss_mlp": 0.01270986, + "balance_loss_clip": 0.06292487, + "balance_loss_mlp": 0.012563, + "epoch": 0.36897640162332784, + "flos": 10966536593280.0, + "grad_norm": 2.7370945268269806, + "language_loss": 0.79547632, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8728385, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14678955, + "step": 6137, + "time_per_iteration": 3.937328577041626 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271273, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01255764, + "epoch": 0.3690365248759958, + "flos": 20710581644160.0, + "grad_norm": 1.9257362559650297, + "language_loss": 0.74479491, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.82222939, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15515137, + "step": 6138, + "time_per_iteration": 4.004723072052002 + }, + { + "auxiliary_loss_clip": 0.06475753, + "auxiliary_loss_mlp": 0.01270871, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01255827, + "epoch": 0.3690966481286638, + "flos": 31833495884160.0, + "grad_norm": 1.986271481109943, + "language_loss": 0.65762347, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.73508972, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1505127, + "step": 6139, + "time_per_iteration": 2.621832847595215 + }, + { + "auxiliary_loss_clip": 0.06460394, + "auxiliary_loss_mlp": 0.01271698, + "balance_loss_clip": 0.06290884, + "balance_loss_mlp": 0.0125626, + "epoch": 0.36915677138133174, + "flos": 13119897870720.0, + "grad_norm": 1.9334180469367421, + "language_loss": 0.72060692, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7979278, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15429688, + "step": 6140, + "time_per_iteration": 2.542410135269165 + }, + { + "auxiliary_loss_clip": 0.06370358, + "auxiliary_loss_mlp": 0.01255246, + "balance_loss_clip": 0.06292184, + "balance_loss_mlp": 0.01252388, + "epoch": 0.3692168946339997, + "flos": 68707926996480.0, + "grad_norm": 0.7297912869343693, + "language_loss": 0.59210759, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.66836369, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02853394, + "step": 6141, + "time_per_iteration": 3.242342710494995 + }, + { + "auxiliary_loss_clip": 0.06465677, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06289662, + "balance_loss_mlp": 0.01256336, + "epoch": 0.36927701788666767, + "flos": 22024392537600.0, + "grad_norm": 1.6449420117919953, + "language_loss": 0.75489783, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.16149902, + "step": 6142, + "time_per_iteration": 2.552541732788086 + }, + { + "auxiliary_loss_clip": 0.06459697, + "auxiliary_loss_mlp": 0.01271426, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01256883, + "epoch": 0.36933714113933563, + "flos": 21842103980160.0, + "grad_norm": 2.1834908331499694, + "language_loss": 0.77180201, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84911323, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14544678, + "step": 6143, + "time_per_iteration": 3.990859031677246 + }, + { + "auxiliary_loss_clip": 0.06461622, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06289461, + "balance_loss_mlp": 0.01251565, + "epoch": 0.3693972643920036, + "flos": 23697749301120.0, + "grad_norm": 1.9416354027972629, + "language_loss": 0.82307315, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.9003436, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13867188, + "step": 6144, + "time_per_iteration": 2.5504705905914307 + }, + { + "auxiliary_loss_clip": 0.06462898, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06287374, + "balance_loss_mlp": 0.01255586, + "epoch": 0.36945738764467156, + "flos": 22863355943040.0, + "grad_norm": 2.172105123479451, + "language_loss": 0.78995448, + "learning_rate": 2.908171851365593e-06, + "loss": 0.86728209, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14282227, + "step": 6145, + "time_per_iteration": 3.9733781814575195 + }, + { + "auxiliary_loss_clip": 0.06468924, + "auxiliary_loss_mlp": 0.01271457, + "balance_loss_clip": 0.06291068, + "balance_loss_mlp": 0.01256067, + "epoch": 0.36951751089733953, + "flos": 16621213380480.0, + "grad_norm": 1.6722610276638135, + "language_loss": 0.77129662, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8487004, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15380859, + "step": 6146, + "time_per_iteration": 2.5411174297332764 + }, + { + "auxiliary_loss_clip": 0.06466483, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 0.06289164, + "balance_loss_mlp": 0.01260419, + "epoch": 0.3695776341500075, + "flos": 18920204254080.0, + "grad_norm": 1.6293394058894772, + "language_loss": 0.81346822, + "learning_rate": 2.907477794586761e-06, + "loss": 0.89089251, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1550293, + "step": 6147, + "time_per_iteration": 2.5456924438476562 + }, + { + "auxiliary_loss_clip": 0.06463629, + "auxiliary_loss_mlp": 0.01275917, + "balance_loss_clip": 0.06286413, + "balance_loss_mlp": 0.01261684, + "epoch": 0.36963775740267546, + "flos": 20813892128640.0, + "grad_norm": 1.8090658573318705, + "language_loss": 0.83484954, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14227295, + "step": 6148, + "time_per_iteration": 2.6318178176879883 + }, + { + "auxiliary_loss_clip": 0.06458767, + "auxiliary_loss_mlp": 0.01266964, + "balance_loss_clip": 0.06284354, + "balance_loss_mlp": 0.01252814, + "epoch": 0.3696978806553435, + "flos": 26068087526400.0, + "grad_norm": 2.191330684134815, + "language_loss": 0.74277508, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.82003242, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14147949, + "step": 6149, + "time_per_iteration": 2.6037940979003906 + }, + { + "auxiliary_loss_clip": 0.06464496, + "auxiliary_loss_mlp": 0.01271867, + "balance_loss_clip": 0.06287233, + "balance_loss_mlp": 0.01256203, + "epoch": 0.36975800390801145, + "flos": 26841237949440.0, + "grad_norm": 2.856714094904378, + "language_loss": 0.71066409, + "learning_rate": 2.906436451364054e-06, + "loss": 0.78802776, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15667725, + "step": 6150, + "time_per_iteration": 2.612860918045044 + }, + { + "auxiliary_loss_clip": 0.06457143, + "auxiliary_loss_mlp": 0.01270306, + "balance_loss_clip": 0.06283612, + "balance_loss_mlp": 0.01256341, + "epoch": 0.3698181271606794, + "flos": 21149063660160.0, + "grad_norm": 1.8423166255946122, + "language_loss": 0.81970799, + "learning_rate": 2.906089268194611e-06, + "loss": 0.89698249, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1395874, + "step": 6151, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.0635625, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06277541, + "balance_loss_mlp": 0.01262752, + "epoch": 0.3698782504133474, + "flos": 66761605958400.0, + "grad_norm": 0.7660918799950965, + "language_loss": 0.63089043, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.70711315, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03274536, + "step": 6152, + "time_per_iteration": 3.27481746673584 + }, + { + "auxiliary_loss_clip": 0.06456928, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289765, + "balance_loss_mlp": 0.01256709, + "epoch": 0.36993837366601534, + "flos": 24317597479680.0, + "grad_norm": 2.4460843976292455, + "language_loss": 0.7067228, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.78398836, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.12921143, + "step": 6153, + "time_per_iteration": 2.561366319656372 + }, + { + "auxiliary_loss_clip": 0.06461591, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.0125796, + "epoch": 0.3699984969186833, + "flos": 24355472325120.0, + "grad_norm": 1.7390512131477307, + "language_loss": 0.72820848, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.80554867, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14459229, + "step": 6154, + "time_per_iteration": 2.6359784603118896 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06290819, + "balance_loss_mlp": 0.01256468, + "epoch": 0.37005862017135127, + "flos": 19835378547840.0, + "grad_norm": 1.7720975153034155, + "language_loss": 0.68251342, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.75985944, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1361084, + "step": 6155, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.06462097, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06290478, + "balance_loss_mlp": 0.01261551, + "epoch": 0.37011874342401924, + "flos": 19579981703040.0, + "grad_norm": 1.763175663447542, + "language_loss": 0.68228447, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.75965828, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13745117, + "step": 6156, + "time_per_iteration": 2.5805797576904297 + }, + { + "auxiliary_loss_clip": 0.06460856, + "auxiliary_loss_mlp": 0.01276122, + "balance_loss_clip": 0.06292138, + "balance_loss_mlp": 0.01263051, + "epoch": 0.3701788666766872, + "flos": 20380315576320.0, + "grad_norm": 2.4756712581972673, + "language_loss": 0.82280111, + "learning_rate": 2.904005448099916e-06, + "loss": 0.9001708, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13061523, + "step": 6157, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.06472905, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 0.06294029, + "balance_loss_mlp": 0.0126136, + "epoch": 0.37023898992935517, + "flos": 15346325508480.0, + "grad_norm": 2.1879647979069055, + "language_loss": 0.77007514, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15142822, + "step": 6158, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.06472066, + "auxiliary_loss_mlp": 0.01273585, + "balance_loss_clip": 0.0629342, + "balance_loss_mlp": 0.0125872, + "epoch": 0.37029911318202313, + "flos": 19580149411200.0, + "grad_norm": 1.9796058392103062, + "language_loss": 0.68833315, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.76578963, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14880371, + "step": 6159, + "time_per_iteration": 2.4941582679748535 + }, + { + "auxiliary_loss_clip": 0.06464109, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 0.06292266, + "balance_loss_mlp": 0.01261986, + "epoch": 0.3703592364346911, + "flos": 26220509303040.0, + "grad_norm": 1.9367461088396363, + "language_loss": 0.71322787, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.79061961, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13079834, + "step": 6160, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.06465742, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06295532, + "balance_loss_mlp": 0.0125958, + "epoch": 0.37041935968735906, + "flos": 20054619555840.0, + "grad_norm": 1.6534007301448785, + "language_loss": 0.78978807, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86717302, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1317749, + "step": 6161, + "time_per_iteration": 2.5337588787078857 + }, + { + "auxiliary_loss_clip": 0.06465232, + "auxiliary_loss_mlp": 0.01270423, + "balance_loss_clip": 0.06291839, + "balance_loss_mlp": 0.01255837, + "epoch": 0.3704794829400271, + "flos": 24140633656320.0, + "grad_norm": 1.7631614273732186, + "language_loss": 0.79746109, + "learning_rate": 2.902267988534295e-06, + "loss": 0.87481761, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14587402, + "step": 6162, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.06466715, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 0.06292939, + "balance_loss_mlp": 0.01260717, + "epoch": 0.37053960619269505, + "flos": 14872232707200.0, + "grad_norm": 1.8866019587111915, + "language_loss": 0.80318987, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.88060015, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13580322, + "step": 6163, + "time_per_iteration": 2.501971483230591 + }, + { + "auxiliary_loss_clip": 0.06466764, + "auxiliary_loss_mlp": 0.01273928, + "balance_loss_clip": 0.0629348, + "balance_loss_mlp": 0.01260315, + "epoch": 0.370599729445363, + "flos": 21367969251840.0, + "grad_norm": 1.81392406825425, + "language_loss": 0.68857837, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.76598537, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13598633, + "step": 6164, + "time_per_iteration": 2.557870388031006 + }, + { + "auxiliary_loss_clip": 0.06463528, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.0126064, + "epoch": 0.370659852698031, + "flos": 26835535872000.0, + "grad_norm": 2.3609289004256984, + "language_loss": 0.83364576, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.91103643, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14916992, + "step": 6165, + "time_per_iteration": 2.5597267150878906 + }, + { + "auxiliary_loss_clip": 0.06475651, + "auxiliary_loss_mlp": 0.01276631, + "balance_loss_clip": 0.06294797, + "balance_loss_mlp": 0.01261086, + "epoch": 0.37071997595069894, + "flos": 19105050360960.0, + "grad_norm": 1.8212520052796557, + "language_loss": 0.69703627, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.77455908, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15551758, + "step": 6166, + "time_per_iteration": 2.7443737983703613 + }, + { + "auxiliary_loss_clip": 0.06351966, + "auxiliary_loss_mlp": 0.01259396, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01255936, + "epoch": 0.3707800992033669, + "flos": 52193839461120.0, + "grad_norm": 0.7767712005900987, + "language_loss": 0.55992532, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.6360389, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.03469849, + "step": 6167, + "time_per_iteration": 3.122786045074463 + }, + { + "auxiliary_loss_clip": 0.06470326, + "auxiliary_loss_mlp": 0.01270542, + "balance_loss_clip": 0.06298738, + "balance_loss_mlp": 0.01256553, + "epoch": 0.3708402224560349, + "flos": 19908025637760.0, + "grad_norm": 1.887650816435161, + "language_loss": 0.75851792, + "learning_rate": 2.900181908135584e-06, + "loss": 0.83592659, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13983154, + "step": 6168, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.06462339, + "auxiliary_loss_mlp": 0.01269774, + "balance_loss_clip": 0.0628986, + "balance_loss_mlp": 0.01255833, + "epoch": 0.37090034570870284, + "flos": 20013222839040.0, + "grad_norm": 1.688087532093935, + "language_loss": 0.74697542, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82429659, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13946533, + "step": 6169, + "time_per_iteration": 2.571059226989746 + }, + { + "auxiliary_loss_clip": 0.06462043, + "auxiliary_loss_mlp": 0.01269285, + "balance_loss_clip": 0.06291892, + "balance_loss_mlp": 0.0125526, + "epoch": 0.3709604689613708, + "flos": 24141681832320.0, + "grad_norm": 1.6120375976718775, + "language_loss": 0.79462636, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87193966, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14007568, + "step": 6170, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.06461793, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06289523, + "balance_loss_mlp": 0.01254183, + "epoch": 0.37102059221403877, + "flos": 23882469626880.0, + "grad_norm": 1.7170622011660002, + "language_loss": 0.76363444, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.84094131, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14703369, + "step": 6171, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.06464403, + "auxiliary_loss_mlp": 0.01269741, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.0125568, + "epoch": 0.37108071546670673, + "flos": 14506439708160.0, + "grad_norm": 2.2434941236901222, + "language_loss": 0.80974334, + "learning_rate": 2.898790504994232e-06, + "loss": 0.88708472, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.140625, + "step": 6172, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.06468061, + "auxiliary_loss_mlp": 0.01272991, + "balance_loss_clip": 0.06291698, + "balance_loss_mlp": 0.01258352, + "epoch": 0.3711408387193747, + "flos": 34570172160000.0, + "grad_norm": 1.701200983183655, + "language_loss": 0.59536189, + "learning_rate": 2.89844256897035e-06, + "loss": 0.67277241, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14648438, + "step": 6173, + "time_per_iteration": 2.68860125541687 + }, + { + "auxiliary_loss_clip": 0.06465948, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06291407, + "balance_loss_mlp": 0.01252825, + "epoch": 0.37120096197204266, + "flos": 17316350052480.0, + "grad_norm": 3.482738270256764, + "language_loss": 0.81161231, + "learning_rate": 2.898094598877435e-06, + "loss": 0.88894391, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1439209, + "step": 6174, + "time_per_iteration": 2.498631238937378 + }, + { + "auxiliary_loss_clip": 0.06459825, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06290745, + "balance_loss_mlp": 0.01253826, + "epoch": 0.37126108522471063, + "flos": 30671855205120.0, + "grad_norm": 1.7762050826086826, + "language_loss": 0.79733562, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.87460476, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13275146, + "step": 6175, + "time_per_iteration": 2.6155989170074463 + }, + { + "auxiliary_loss_clip": 0.06469794, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06296568, + "balance_loss_mlp": 0.01253926, + "epoch": 0.37132120847737865, + "flos": 25162682232960.0, + "grad_norm": 2.183025760433602, + "language_loss": 0.8886646, + "learning_rate": 2.89739855653729e-06, + "loss": 0.96605068, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14880371, + "step": 6176, + "time_per_iteration": 3.9855380058288574 + }, + { + "auxiliary_loss_clip": 0.06463525, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.01252331, + "epoch": 0.3713813317300466, + "flos": 21219572471040.0, + "grad_norm": 1.8377156327305517, + "language_loss": 0.73693877, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.8142367, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13952637, + "step": 6177, + "time_per_iteration": 2.584007501602173 + }, + { + "auxiliary_loss_clip": 0.06460603, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3714414549827146, + "flos": 21623114534400.0, + "grad_norm": 3.348536242845292, + "language_loss": 0.75657964, + "learning_rate": 2.896702378079374e-06, + "loss": 0.83389515, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14227295, + "step": 6178, + "time_per_iteration": 4.047810077667236 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01253796, + "epoch": 0.37150157823538255, + "flos": 19978073251200.0, + "grad_norm": 1.677068577007521, + "language_loss": 0.7243154, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14465332, + "step": 6179, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.06464912, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06289817, + "balance_loss_mlp": 0.01254506, + "epoch": 0.3715617014880505, + "flos": 24867020701440.0, + "grad_norm": 1.5744290711880986, + "language_loss": 0.70164317, + "learning_rate": 2.896006063609283e-06, + "loss": 0.77898097, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14361572, + "step": 6180, + "time_per_iteration": 2.564251661300659 + }, + { + "auxiliary_loss_clip": 0.06459807, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.0628929, + "balance_loss_mlp": 0.01255173, + "epoch": 0.3716218247407185, + "flos": 20455352507520.0, + "grad_norm": 1.6669585833251956, + "language_loss": 0.78357702, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.86087286, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6181, + "time_per_iteration": 2.5857934951782227 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 0.06286183, + "balance_loss_mlp": 0.01254195, + "epoch": 0.37168194799338644, + "flos": 24140256312960.0, + "grad_norm": 1.7806049549646892, + "language_loss": 0.78926349, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.86651719, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14520264, + "step": 6182, + "time_per_iteration": 2.572563409805298 + }, + { + "auxiliary_loss_clip": 0.0637676, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 0.06297279, + "balance_loss_mlp": 0.01253508, + "epoch": 0.3717420712460544, + "flos": 67429601107200.0, + "grad_norm": 0.7782169453066291, + "language_loss": 0.57265592, + "learning_rate": 2.894961337112362e-06, + "loss": 0.64899027, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.03170776, + "step": 6183, + "time_per_iteration": 4.616533279418945 + }, + { + "auxiliary_loss_clip": 0.06460768, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01258059, + "epoch": 0.37180219449872237, + "flos": 22382512888320.0, + "grad_norm": 2.288371354177028, + "language_loss": 0.77116179, + "learning_rate": 2.894613027055066e-06, + "loss": 0.84849966, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.1496582, + "step": 6184, + "time_per_iteration": 2.5182292461395264 + }, + { + "auxiliary_loss_clip": 0.06457444, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.0628842, + "balance_loss_mlp": 0.01255739, + "epoch": 0.37186231775139034, + "flos": 21876037683840.0, + "grad_norm": 2.2342830987852023, + "language_loss": 0.72608167, + "learning_rate": 2.894264683073954e-06, + "loss": 0.80335367, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14007568, + "step": 6185, + "time_per_iteration": 3.928272247314453 + }, + { + "auxiliary_loss_clip": 0.06453837, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06286646, + "balance_loss_mlp": 0.01253075, + "epoch": 0.3719224410040583, + "flos": 22421142420480.0, + "grad_norm": 1.6056881027286982, + "language_loss": 0.77329034, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85050094, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14160156, + "step": 6186, + "time_per_iteration": 2.549499988555908 + }, + { + "auxiliary_loss_clip": 0.0646092, + "auxiliary_loss_mlp": 0.01274226, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01258121, + "epoch": 0.37198256425672627, + "flos": 25157525207040.0, + "grad_norm": 1.8763954627941488, + "language_loss": 0.84227252, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.91962403, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.16101074, + "step": 6187, + "time_per_iteration": 2.542978048324585 + }, + { + "auxiliary_loss_clip": 0.06456143, + "auxiliary_loss_mlp": 0.01269651, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255919, + "epoch": 0.37204268750939423, + "flos": 21144032415360.0, + "grad_norm": 2.100791898470326, + "language_loss": 0.84696567, + "learning_rate": 2.893219447719824e-06, + "loss": 0.9242236, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13726807, + "step": 6188, + "time_per_iteration": 2.626126766204834 + }, + { + "auxiliary_loss_clip": 0.06458837, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06288396, + "balance_loss_mlp": 0.01256232, + "epoch": 0.37210281076206225, + "flos": 21513221504640.0, + "grad_norm": 2.2586863759616564, + "language_loss": 0.66390121, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.74118853, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13653564, + "step": 6189, + "time_per_iteration": 2.5793135166168213 + }, + { + "auxiliary_loss_clip": 0.06460261, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.0628726, + "balance_loss_mlp": 0.01255926, + "epoch": 0.3721629340147302, + "flos": 17353595992320.0, + "grad_norm": 2.971940637043147, + "language_loss": 0.84218514, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.91950166, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15466309, + "step": 6190, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.06464738, + "auxiliary_loss_mlp": 0.01270544, + "balance_loss_clip": 0.06287063, + "balance_loss_mlp": 0.01255905, + "epoch": 0.3722230572673982, + "flos": 16437457376640.0, + "grad_norm": 2.7368484374177076, + "language_loss": 0.89274895, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.97010183, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.14648438, + "step": 6191, + "time_per_iteration": 2.4786319732666016 + }, + { + "auxiliary_loss_clip": 0.06463645, + "auxiliary_loss_mlp": 0.01271285, + "balance_loss_clip": 0.06286322, + "balance_loss_mlp": 0.01254465, + "epoch": 0.37228318052006615, + "flos": 22681360874880.0, + "grad_norm": 2.1321020045013577, + "language_loss": 0.74374199, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82109123, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.16790771, + "step": 6192, + "time_per_iteration": 2.6107547283172607 + }, + { + "auxiliary_loss_clip": 0.06461145, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06288278, + "balance_loss_mlp": 0.0125493, + "epoch": 0.3723433037727341, + "flos": 25272617189760.0, + "grad_norm": 2.3785606336548124, + "language_loss": 0.79934001, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.87664223, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14154053, + "step": 6193, + "time_per_iteration": 2.5584514141082764 + }, + { + "auxiliary_loss_clip": 0.06469596, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 0.06293128, + "balance_loss_mlp": 0.01255594, + "epoch": 0.3724034270254021, + "flos": 10529228534400.0, + "grad_norm": 1.7620775512614164, + "language_loss": 0.84889179, + "learning_rate": 2.891128062852194e-06, + "loss": 0.92628884, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14526367, + "step": 6194, + "time_per_iteration": 2.5419061183929443 + }, + { + "auxiliary_loss_clip": 0.06460975, + "auxiliary_loss_mlp": 0.01266847, + "balance_loss_clip": 0.06288271, + "balance_loss_mlp": 0.01253317, + "epoch": 0.37246355027807004, + "flos": 20272393117440.0, + "grad_norm": 2.226391461709797, + "language_loss": 0.78030515, + "learning_rate": 2.890779380359646e-06, + "loss": 0.85758334, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13543701, + "step": 6195, + "time_per_iteration": 2.51361346244812 + }, + { + "auxiliary_loss_clip": 0.06459115, + "auxiliary_loss_mlp": 0.01274112, + "balance_loss_clip": 0.06288831, + "balance_loss_mlp": 0.01258955, + "epoch": 0.372523673530738, + "flos": 19506705707520.0, + "grad_norm": 1.8216220923823887, + "language_loss": 0.79924363, + "learning_rate": 2.890430664088655e-06, + "loss": 0.87657595, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15155029, + "step": 6196, + "time_per_iteration": 2.6005568504333496 + }, + { + "auxiliary_loss_clip": 0.06458211, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01256888, + "epoch": 0.372583796783406, + "flos": 16769945577600.0, + "grad_norm": 2.2795878215352396, + "language_loss": 0.84059894, + "learning_rate": 2.890081914052443e-06, + "loss": 0.91788948, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13952637, + "step": 6197, + "time_per_iteration": 2.538058042526245 + }, + { + "auxiliary_loss_clip": 0.06456813, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06289704, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37264392003607394, + "flos": 22644576132480.0, + "grad_norm": 1.7143100919816474, + "language_loss": 0.64964151, + "learning_rate": 2.889733130264237e-06, + "loss": 0.72691035, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14971924, + "step": 6198, + "time_per_iteration": 2.5891072750091553 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 0.0628581, + "balance_loss_mlp": 0.01258367, + "epoch": 0.3727040432887419, + "flos": 19979037573120.0, + "grad_norm": 1.4303592099178044, + "language_loss": 0.74534631, + "learning_rate": 2.889384312737261e-06, + "loss": 0.82261217, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13977051, + "step": 6199, + "time_per_iteration": 2.5612289905548096 + }, + { + "auxiliary_loss_clip": 0.06453978, + "auxiliary_loss_mlp": 0.01269323, + "balance_loss_clip": 0.06284302, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37276416654140987, + "flos": 63911906853120.0, + "grad_norm": 1.6001689252403943, + "language_loss": 0.81250614, + "learning_rate": 2.889035461484742e-06, + "loss": 0.88973916, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14227295, + "step": 6200, + "time_per_iteration": 2.9802377223968506 + }, + { + "auxiliary_loss_clip": 0.06452343, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.0125907, + "epoch": 0.37282428979407783, + "flos": 39795381244800.0, + "grad_norm": 2.0282879733455776, + "language_loss": 0.61128068, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68853581, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.6786048412323 + }, + { + "auxiliary_loss_clip": 0.06460309, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.06284842, + "balance_loss_mlp": 0.01257166, + "epoch": 0.37288441304674586, + "flos": 22715336505600.0, + "grad_norm": 1.562126243298772, + "language_loss": 0.73424393, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.81156611, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14746094, + "step": 6202, + "time_per_iteration": 2.5774593353271484 + }, + { + "auxiliary_loss_clip": 0.06450565, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01253697, + "epoch": 0.3729445362994138, + "flos": 18776209812480.0, + "grad_norm": 3.8476229642649895, + "language_loss": 0.73690808, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.81410116, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1505127, + "step": 6203, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.06448745, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01253402, + "epoch": 0.3730046595520818, + "flos": 22462874553600.0, + "grad_norm": 1.6222639611717555, + "language_loss": 0.82113981, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.89829516, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13391113, + "step": 6204, + "time_per_iteration": 2.5474419593811035 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01267649, + "balance_loss_clip": 0.06282973, + "balance_loss_mlp": 0.01253094, + "epoch": 0.37306478280474975, + "flos": 24323257630080.0, + "grad_norm": 1.5013454609640156, + "language_loss": 0.75699729, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14562988, + "step": 6205, + "time_per_iteration": 2.5284838676452637 + }, + { + "auxiliary_loss_clip": 0.06453846, + "auxiliary_loss_mlp": 0.01269403, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3731249060574177, + "flos": 15820627944960.0, + "grad_norm": 2.409990557003708, + "language_loss": 0.78042793, + "learning_rate": 2.886941646474128e-06, + "loss": 0.85766041, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14868164, + "step": 6206, + "time_per_iteration": 2.5130996704101562 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01253085, + "epoch": 0.3731850293100857, + "flos": 19834120736640.0, + "grad_norm": 3.8358433201526334, + "language_loss": 0.93966329, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01690984, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15734863, + "step": 6207, + "time_per_iteration": 2.4994020462036133 + }, + { + "auxiliary_loss_clip": 0.06459471, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06283936, + "balance_loss_mlp": 0.01254561, + "epoch": 0.37324515256275365, + "flos": 19068349472640.0, + "grad_norm": 2.1400449567396826, + "language_loss": 0.82643408, + "learning_rate": 2.886243438932759e-06, + "loss": 0.90372002, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14575195, + "step": 6208, + "time_per_iteration": 2.5359628200531006 + }, + { + "auxiliary_loss_clip": 0.06460227, + "auxiliary_loss_mlp": 0.01272188, + "balance_loss_clip": 0.06285752, + "balance_loss_mlp": 0.01255904, + "epoch": 0.3733052758154216, + "flos": 20710623571200.0, + "grad_norm": 2.148305950788212, + "language_loss": 0.73528939, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.81261349, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1628418, + "step": 6209, + "time_per_iteration": 2.499209403991699 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01273959, + "balance_loss_clip": 0.06285547, + "balance_loss_mlp": 0.01258593, + "epoch": 0.3733653990680896, + "flos": 20199704100480.0, + "grad_norm": 2.014449395888949, + "language_loss": 0.71212471, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.78942245, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.15368652, + "step": 6210, + "time_per_iteration": 2.5324270725250244 + }, + { + "auxiliary_loss_clip": 0.06468424, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.06295058, + "balance_loss_mlp": 0.01253631, + "epoch": 0.37342552232075754, + "flos": 20345920675200.0, + "grad_norm": 1.543701660359285, + "language_loss": 0.7823801, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.85975003, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.1494751, + "step": 6211, + "time_per_iteration": 2.5388078689575195 + }, + { + "auxiliary_loss_clip": 0.06464606, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06291494, + "balance_loss_mlp": 0.0125347, + "epoch": 0.3734856455734255, + "flos": 35526701243520.0, + "grad_norm": 1.6765525733287814, + "language_loss": 0.73612988, + "learning_rate": 2.884846620678668e-06, + "loss": 0.81346345, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15283203, + "step": 6212, + "time_per_iteration": 2.663950204849243 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06294222, + "balance_loss_mlp": 0.01256345, + "epoch": 0.37354576882609347, + "flos": 21148686316800.0, + "grad_norm": 1.865900947954382, + "language_loss": 0.82430422, + "learning_rate": 2.884497332198356e-06, + "loss": 0.90180945, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16455078, + "step": 6213, + "time_per_iteration": 2.541431427001953 + }, + { + "auxiliary_loss_clip": 0.06467836, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.01255623, + "epoch": 0.37360589207876144, + "flos": 21513179577600.0, + "grad_norm": 2.345206885791162, + "language_loss": 0.7896657, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.86705506, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15466309, + "step": 6214, + "time_per_iteration": 2.545792579650879 + }, + { + "auxiliary_loss_clip": 0.06466322, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06297071, + "balance_loss_mlp": 0.01255981, + "epoch": 0.37366601533142946, + "flos": 38444953317120.0, + "grad_norm": 1.6116656191599898, + "language_loss": 0.85112274, + "learning_rate": 2.883798654630296e-06, + "loss": 0.92849338, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14758301, + "step": 6215, + "time_per_iteration": 2.70700740814209 + }, + { + "auxiliary_loss_clip": 0.06472297, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06296762, + "balance_loss_mlp": 0.01254044, + "epoch": 0.3737261385840974, + "flos": 18446908066560.0, + "grad_norm": 1.6510257786225762, + "language_loss": 0.6833967, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.76082057, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16040039, + "step": 6216, + "time_per_iteration": 3.941821575164795 + }, + { + "auxiliary_loss_clip": 0.06466141, + "auxiliary_loss_mlp": 0.01276294, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01260224, + "epoch": 0.3737862618367654, + "flos": 22936506157440.0, + "grad_norm": 2.1208446300989983, + "language_loss": 0.6621505, + "learning_rate": 2.883099843007303e-06, + "loss": 0.73957485, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.1607666, + "step": 6217, + "time_per_iteration": 4.067852258682251 + }, + { + "auxiliary_loss_clip": 0.06468368, + "auxiliary_loss_mlp": 0.01272371, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.0125772, + "epoch": 0.37384638508943335, + "flos": 15414360624000.0, + "grad_norm": 1.5564133784357135, + "language_loss": 0.80760753, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.1463623, + "step": 6218, + "time_per_iteration": 2.5253372192382812 + }, + { + "auxiliary_loss_clip": 0.06465785, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06298652, + "balance_loss_mlp": 0.01256661, + "epoch": 0.3739065083421013, + "flos": 24287856480000.0, + "grad_norm": 2.4835018506755566, + "language_loss": 0.79185957, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86923778, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.15380859, + "step": 6219, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.06464131, + "auxiliary_loss_mlp": 0.01274727, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260177, + "epoch": 0.3739666315947693, + "flos": 23009488663680.0, + "grad_norm": 2.098390778414135, + "language_loss": 0.77614415, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.85353279, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14538574, + "step": 6220, + "time_per_iteration": 2.5899298191070557 + }, + { + "auxiliary_loss_clip": 0.06466513, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.0125541, + "epoch": 0.37402675484743725, + "flos": 19397231948160.0, + "grad_norm": 1.5821121915867322, + "language_loss": 0.83564717, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91301888, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15240479, + "step": 6221, + "time_per_iteration": 2.540102481842041 + }, + { + "auxiliary_loss_clip": 0.06464627, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06293692, + "balance_loss_mlp": 0.01262647, + "epoch": 0.3740868781001052, + "flos": 17131420091520.0, + "grad_norm": 1.6401420513761291, + "language_loss": 0.76738596, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84480345, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14477539, + "step": 6222, + "time_per_iteration": 4.020254850387573 + }, + { + "auxiliary_loss_clip": 0.06466988, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06296736, + "balance_loss_mlp": 0.01263467, + "epoch": 0.3741470013527732, + "flos": 20049001332480.0, + "grad_norm": 1.799306271558528, + "language_loss": 0.70768011, + "learning_rate": 2.881002604868789e-06, + "loss": 0.785128, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14349365, + "step": 6223, + "time_per_iteration": 2.6146726608276367 + }, + { + "auxiliary_loss_clip": 0.0646846, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01258954, + "epoch": 0.37420712460544114, + "flos": 36905151162240.0, + "grad_norm": 1.9191598081110601, + "language_loss": 0.69292819, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77033412, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1317749, + "step": 6224, + "time_per_iteration": 4.144296407699585 + }, + { + "auxiliary_loss_clip": 0.06463895, + "auxiliary_loss_mlp": 0.01274949, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260126, + "epoch": 0.3742672478581091, + "flos": 22207896979200.0, + "grad_norm": 1.811742579086715, + "language_loss": 0.70166373, + "learning_rate": 2.880303258086228e-06, + "loss": 0.77905214, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14819336, + "step": 6225, + "time_per_iteration": 2.562023162841797 + }, + { + "auxiliary_loss_clip": 0.06462345, + "auxiliary_loss_mlp": 0.0127698, + "balance_loss_clip": 0.06296264, + "balance_loss_mlp": 0.01262257, + "epoch": 0.3743273711107771, + "flos": 24688547504640.0, + "grad_norm": 2.0306145345851614, + "language_loss": 0.79386592, + "learning_rate": 2.879953534616536e-06, + "loss": 0.87125921, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14715576, + "step": 6226, + "time_per_iteration": 2.5372707843780518 + }, + { + "auxiliary_loss_clip": 0.06464548, + "auxiliary_loss_mlp": 0.01273743, + "balance_loss_clip": 0.0629389, + "balance_loss_mlp": 0.01259021, + "epoch": 0.37438749436344504, + "flos": 24466078114560.0, + "grad_norm": 1.6346435650910545, + "language_loss": 0.68240035, + "learning_rate": 2.879603777778917e-06, + "loss": 0.75978327, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14733887, + "step": 6227, + "time_per_iteration": 2.5752079486846924 + }, + { + "auxiliary_loss_clip": 0.06464467, + "auxiliary_loss_mlp": 0.01270066, + "balance_loss_clip": 0.06297411, + "balance_loss_mlp": 0.0125588, + "epoch": 0.374447617616113, + "flos": 21805193456640.0, + "grad_norm": 1.6298548281431393, + "language_loss": 0.83520573, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91255105, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14190674, + "step": 6228, + "time_per_iteration": 2.605607748031616 + }, + { + "auxiliary_loss_clip": 0.06458256, + "auxiliary_loss_mlp": 0.01270458, + "balance_loss_clip": 0.06288552, + "balance_loss_mlp": 0.01256033, + "epoch": 0.374507740868781, + "flos": 17974073076480.0, + "grad_norm": 1.5343038876343353, + "language_loss": 0.75450277, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83178985, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14428711, + "step": 6229, + "time_per_iteration": 2.607506036758423 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.012714, + "balance_loss_clip": 0.06293011, + "balance_loss_mlp": 0.01256249, + "epoch": 0.374567864121449, + "flos": 16111132450560.0, + "grad_norm": 3.0205318355467083, + "language_loss": 0.84065855, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.91801792, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15142822, + "step": 6230, + "time_per_iteration": 2.4964523315429688 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01275239, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01259569, + "epoch": 0.37462798737411696, + "flos": 25779847080960.0, + "grad_norm": 1.7178487844900587, + "language_loss": 0.73793018, + "learning_rate": 2.878204417014456e-06, + "loss": 0.81532168, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15667725, + "step": 6231, + "time_per_iteration": 2.589771270751953 + }, + { + "auxiliary_loss_clip": 0.06465879, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01255298, + "epoch": 0.3746881106267849, + "flos": 16660136401920.0, + "grad_norm": 1.8762806294571872, + "language_loss": 0.74086344, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.81822443, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14929199, + "step": 6232, + "time_per_iteration": 2.483219861984253 + }, + { + "auxiliary_loss_clip": 0.06463014, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.0125605, + "epoch": 0.3747482338794529, + "flos": 26185317788160.0, + "grad_norm": 1.743409558247901, + "language_loss": 0.77404612, + "learning_rate": 2.877504536769561e-06, + "loss": 0.85138428, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14758301, + "step": 6233, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.06463634, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06292734, + "balance_loss_mlp": 0.01255432, + "epoch": 0.37480835713212085, + "flos": 12025956890880.0, + "grad_norm": 1.7958128584553208, + "language_loss": 0.69650698, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.77383471, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13690186, + "step": 6234, + "time_per_iteration": 2.524226188659668 + }, + { + "auxiliary_loss_clip": 0.06464471, + "auxiliary_loss_mlp": 0.01267248, + "balance_loss_clip": 0.06295948, + "balance_loss_mlp": 0.0125311, + "epoch": 0.3748684803847888, + "flos": 19684801560960.0, + "grad_norm": 2.1537876510353597, + "language_loss": 0.83551729, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91283447, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14135742, + "step": 6235, + "time_per_iteration": 2.5380606651306152 + }, + { + "auxiliary_loss_clip": 0.06462481, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06289958, + "balance_loss_mlp": 0.0125222, + "epoch": 0.3749286036374568, + "flos": 20527328764800.0, + "grad_norm": 1.8434440291752416, + "language_loss": 0.78213942, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8594358, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14941406, + "step": 6236, + "time_per_iteration": 2.507180690765381 + }, + { + "auxiliary_loss_clip": 0.06465082, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 0.06288011, + "balance_loss_mlp": 0.0125616, + "epoch": 0.37498872689012475, + "flos": 20710958987520.0, + "grad_norm": 1.9437086154972172, + "language_loss": 0.73305297, + "learning_rate": 2.876104377085234e-06, + "loss": 0.81043607, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.17077637, + "step": 6237, + "time_per_iteration": 2.5545706748962402 + }, + { + "auxiliary_loss_clip": 0.06460923, + "auxiliary_loss_mlp": 0.01271336, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01256548, + "epoch": 0.3750488501427927, + "flos": 21580418079360.0, + "grad_norm": 2.5847168840400787, + "language_loss": 0.93616223, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01348472, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14788818, + "step": 6238, + "time_per_iteration": 2.544524669647217 + }, + { + "auxiliary_loss_clip": 0.06457306, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06286643, + "balance_loss_mlp": 0.01257127, + "epoch": 0.3751089733954607, + "flos": 15929221236480.0, + "grad_norm": 2.2437121352489093, + "language_loss": 0.71661341, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79390883, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15106201, + "step": 6239, + "time_per_iteration": 2.519807815551758 + }, + { + "auxiliary_loss_clip": 0.06461261, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 0.06287319, + "balance_loss_mlp": 0.01256485, + "epoch": 0.37516909664812864, + "flos": 36293688391680.0, + "grad_norm": 1.5212724151961043, + "language_loss": 0.65758455, + "learning_rate": 2.875053908444895e-06, + "loss": 0.73491299, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15118408, + "step": 6240, + "time_per_iteration": 2.6838748455047607 + }, + { + "auxiliary_loss_clip": 0.06461462, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06288624, + "balance_loss_mlp": 0.01251258, + "epoch": 0.3752292199007966, + "flos": 13520882384640.0, + "grad_norm": 2.454894337240739, + "language_loss": 0.76209545, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.83936143, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13867188, + "step": 6241, + "time_per_iteration": 2.498286008834839 + }, + { + "auxiliary_loss_clip": 0.06461808, + "auxiliary_loss_mlp": 0.01268507, + "balance_loss_clip": 0.06289176, + "balance_loss_mlp": 0.01253206, + "epoch": 0.3752893431534646, + "flos": 27205353866880.0, + "grad_norm": 2.0832931967812853, + "language_loss": 0.84671998, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92402315, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15313721, + "step": 6242, + "time_per_iteration": 2.6289877891540527 + }, + { + "auxiliary_loss_clip": 0.06457841, + "auxiliary_loss_mlp": 0.01272178, + "balance_loss_clip": 0.06285247, + "balance_loss_mlp": 0.01257379, + "epoch": 0.3753494664061326, + "flos": 30015431919360.0, + "grad_norm": 2.6434313807577112, + "language_loss": 0.68551457, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.76281476, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14813232, + "step": 6243, + "time_per_iteration": 2.7211153507232666 + }, + { + "auxiliary_loss_clip": 0.0645824, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01254482, + "epoch": 0.37540958965880056, + "flos": 24468803372160.0, + "grad_norm": 1.7478523324296555, + "language_loss": 0.8397631, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.91704839, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15808105, + "step": 6244, + "time_per_iteration": 2.5738887786865234 + }, + { + "auxiliary_loss_clip": 0.0645659, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01252842, + "epoch": 0.3754697129114685, + "flos": 16513961754240.0, + "grad_norm": 3.8447339818169257, + "language_loss": 0.83823436, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.91546631, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13751221, + "step": 6245, + "time_per_iteration": 2.5320816040039062 + }, + { + "auxiliary_loss_clip": 0.06453504, + "auxiliary_loss_mlp": 0.0127263, + "balance_loss_clip": 0.06282875, + "balance_loss_mlp": 0.01257633, + "epoch": 0.3755298361641365, + "flos": 19396980385920.0, + "grad_norm": 2.4621620681348295, + "language_loss": 0.64685225, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.72411358, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14990234, + "step": 6246, + "time_per_iteration": 2.58577561378479 + }, + { + "auxiliary_loss_clip": 0.06466524, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01262428, + "epoch": 0.37558995941680445, + "flos": 14725638789120.0, + "grad_norm": 2.3474335464279648, + "language_loss": 0.75348055, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.83092844, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.1583252, + "step": 6247, + "time_per_iteration": 2.47930908203125 + }, + { + "auxiliary_loss_clip": 0.06456453, + "auxiliary_loss_mlp": 0.012715, + "balance_loss_clip": 0.06282347, + "balance_loss_mlp": 0.01255503, + "epoch": 0.3756500826694724, + "flos": 21696432456960.0, + "grad_norm": 3.5646784592424017, + "language_loss": 0.55380279, + "learning_rate": 2.872251199697598e-06, + "loss": 0.6310823, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.16003418, + "step": 6248, + "time_per_iteration": 2.5266313552856445 + }, + { + "auxiliary_loss_clip": 0.06453443, + "auxiliary_loss_mlp": 0.01268535, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01253109, + "epoch": 0.3757102059221404, + "flos": 26512942452480.0, + "grad_norm": 1.7302245846967215, + "language_loss": 0.84781861, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92503834, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.15429688, + "step": 6249, + "time_per_iteration": 2.5590078830718994 + }, + { + "auxiliary_loss_clip": 0.06456596, + "auxiliary_loss_mlp": 0.01267858, + "balance_loss_clip": 0.0628508, + "balance_loss_mlp": 0.01253481, + "epoch": 0.37577032917480835, + "flos": 37346526144000.0, + "grad_norm": 1.6299752789251518, + "language_loss": 0.68482721, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.76207179, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14361572, + "step": 6250, + "time_per_iteration": 2.6926450729370117 + }, + { + "auxiliary_loss_clip": 0.06454285, + "auxiliary_loss_mlp": 0.01268088, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01254099, + "epoch": 0.3758304524274763, + "flos": 21915128413440.0, + "grad_norm": 2.0147801854845895, + "language_loss": 0.78550422, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.862728, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13995361, + "step": 6251, + "time_per_iteration": 2.5072193145751953 + }, + { + "auxiliary_loss_clip": 0.06455163, + "auxiliary_loss_mlp": 0.01271265, + "balance_loss_clip": 0.06285167, + "balance_loss_mlp": 0.01257139, + "epoch": 0.3758905756801443, + "flos": 36577233008640.0, + "grad_norm": 2.2428429985343543, + "language_loss": 0.58560276, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.66286701, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14111328, + "step": 6252, + "time_per_iteration": 2.684899091720581 + }, + { + "auxiliary_loss_clip": 0.06456266, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.01255649, + "epoch": 0.37595069893281224, + "flos": 24534616354560.0, + "grad_norm": 1.5871699178816958, + "language_loss": 0.8998009, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.97707891, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15869141, + "step": 6253, + "time_per_iteration": 2.539088010787964 + }, + { + "auxiliary_loss_clip": 0.0645566, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06288448, + "balance_loss_mlp": 0.01255523, + "epoch": 0.3760108221854802, + "flos": 16440518050560.0, + "grad_norm": 2.3821241740713086, + "language_loss": 0.77027023, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.84752858, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.14648438, + "step": 6254, + "time_per_iteration": 2.545330047607422 + }, + { + "auxiliary_loss_clip": 0.06454843, + "auxiliary_loss_mlp": 0.01270718, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01254386, + "epoch": 0.37607094543814823, + "flos": 13776824280960.0, + "grad_norm": 2.2494955117694007, + "language_loss": 0.62504637, + "learning_rate": 2.869797092829169e-06, + "loss": 0.70230198, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.16333008, + "step": 6255, + "time_per_iteration": 3.937791109085083 + }, + { + "auxiliary_loss_clip": 0.06456207, + "auxiliary_loss_mlp": 0.0127009, + "balance_loss_clip": 0.06282066, + "balance_loss_mlp": 0.01253758, + "epoch": 0.3761310686908162, + "flos": 19862855487360.0, + "grad_norm": 2.2501042164391634, + "language_loss": 0.74801397, + "learning_rate": 2.869446374096135e-06, + "loss": 0.82527697, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16345215, + "step": 6256, + "time_per_iteration": 2.52768611907959 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01270671, + "balance_loss_clip": 0.06281887, + "balance_loss_mlp": 0.01254637, + "epoch": 0.37619119194348416, + "flos": 12755823880320.0, + "grad_norm": 1.8167076240371511, + "language_loss": 0.70818299, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.78545058, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16040039, + "step": 6257, + "time_per_iteration": 4.052328824996948 + }, + { + "auxiliary_loss_clip": 0.06452011, + "auxiliary_loss_mlp": 0.01268418, + "balance_loss_clip": 0.0628053, + "balance_loss_mlp": 0.01253743, + "epoch": 0.3762513151961521, + "flos": 17536387674240.0, + "grad_norm": 1.6926603581335775, + "language_loss": 0.85114312, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92834735, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14672852, + "step": 6258, + "time_per_iteration": 2.50252366065979 + }, + { + "auxiliary_loss_clip": 0.06455131, + "auxiliary_loss_mlp": 0.0127104, + "balance_loss_clip": 0.06282814, + "balance_loss_mlp": 0.01256503, + "epoch": 0.3763114384488201, + "flos": 23623215494400.0, + "grad_norm": 1.3678719492617617, + "language_loss": 0.81156051, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8888222, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14532471, + "step": 6259, + "time_per_iteration": 2.5430314540863037 + }, + { + "auxiliary_loss_clip": 0.06458686, + "auxiliary_loss_mlp": 0.01274293, + "balance_loss_clip": 0.06282908, + "balance_loss_mlp": 0.0125696, + "epoch": 0.37637156170148806, + "flos": 25413383249280.0, + "grad_norm": 1.809326583941318, + "language_loss": 0.71774137, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.79507113, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.17321777, + "step": 6260, + "time_per_iteration": 2.566267490386963 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.0127871, + "balance_loss_clip": 0.06279852, + "balance_loss_mlp": 0.01262128, + "epoch": 0.376431684954156, + "flos": 23447677190400.0, + "grad_norm": 1.8475234283885087, + "language_loss": 0.78925788, + "learning_rate": 2.867692286154594e-06, + "loss": 0.86660182, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.16589355, + "step": 6261, + "time_per_iteration": 2.5848124027252197 + }, + { + "auxiliary_loss_clip": 0.06455033, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06278862, + "balance_loss_mlp": 0.01257607, + "epoch": 0.376491808206824, + "flos": 34213099985280.0, + "grad_norm": 2.1653724604475255, + "language_loss": 0.80626601, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88354641, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15405273, + "step": 6262, + "time_per_iteration": 4.146479368209839 + }, + { + "auxiliary_loss_clip": 0.06453078, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 0.06282018, + "balance_loss_mlp": 0.01253799, + "epoch": 0.37655193145949195, + "flos": 35193793772160.0, + "grad_norm": 1.6953841761456194, + "language_loss": 0.81274903, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88996559, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14794922, + "step": 6263, + "time_per_iteration": 2.6529650688171387 + }, + { + "auxiliary_loss_clip": 0.06460523, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 0.06286405, + "balance_loss_mlp": 0.01261172, + "epoch": 0.3766120547121599, + "flos": 16767136465920.0, + "grad_norm": 1.8888627452248796, + "language_loss": 0.79794824, + "learning_rate": 2.866639438447501e-06, + "loss": 0.87531358, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14831543, + "step": 6264, + "time_per_iteration": 3.9715349674224854 + }, + { + "auxiliary_loss_clip": 0.06455237, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06284397, + "balance_loss_mlp": 0.0125396, + "epoch": 0.3766721779648279, + "flos": 23557150949760.0, + "grad_norm": 1.690336708132248, + "language_loss": 0.7363869, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.81363189, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6265, + "time_per_iteration": 2.5544657707214355 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06283864, + "balance_loss_mlp": 0.01262486, + "epoch": 0.37673230121749585, + "flos": 29136329608320.0, + "grad_norm": 1.6256668529315172, + "language_loss": 0.6925773, + "learning_rate": 2.865937375638654e-06, + "loss": 0.76985407, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1361084, + "step": 6266, + "time_per_iteration": 2.5735552310943604 + }, + { + "auxiliary_loss_clip": 0.06456051, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06279004, + "balance_loss_mlp": 0.01258825, + "epoch": 0.3767924244701638, + "flos": 28154210302080.0, + "grad_norm": 2.361518747365002, + "language_loss": 0.63358176, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.7108832, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15270996, + "step": 6267, + "time_per_iteration": 2.6408746242523193 + }, + { + "auxiliary_loss_clip": 0.0637848, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.0630175, + "balance_loss_mlp": 0.01263043, + "epoch": 0.37685254772283183, + "flos": 60815460343680.0, + "grad_norm": 0.7019670976586264, + "language_loss": 0.58932841, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.66576976, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02612305, + "step": 6268, + "time_per_iteration": 3.3041250705718994 + }, + { + "auxiliary_loss_clip": 0.06448595, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277184, + "balance_loss_mlp": 0.01256756, + "epoch": 0.3769126709754998, + "flos": 26039939754240.0, + "grad_norm": 1.4401012750228117, + "language_loss": 0.65166855, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72888005, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15795898, + "step": 6269, + "time_per_iteration": 2.654707670211792 + }, + { + "auxiliary_loss_clip": 0.06454687, + "auxiliary_loss_mlp": 0.01276662, + "balance_loss_clip": 0.06286559, + "balance_loss_mlp": 0.01261296, + "epoch": 0.37697279422816776, + "flos": 23585508357120.0, + "grad_norm": 1.4576669810179597, + "language_loss": 0.71144199, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.78875554, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15362549, + "step": 6270, + "time_per_iteration": 2.5369231700897217 + }, + { + "auxiliary_loss_clip": 0.06374384, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01266305, + "epoch": 0.3770329174808357, + "flos": 64766242753920.0, + "grad_norm": 0.6950430831807741, + "language_loss": 0.56232381, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.63876635, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03561401, + "step": 6271, + "time_per_iteration": 3.1599924564361572 + }, + { + "auxiliary_loss_clip": 0.06448443, + "auxiliary_loss_mlp": 0.01272708, + "balance_loss_clip": 0.06279441, + "balance_loss_mlp": 0.0125696, + "epoch": 0.3770930407335037, + "flos": 21841768563840.0, + "grad_norm": 1.6801171250404496, + "language_loss": 0.80461442, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.88182592, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1574707, + "step": 6272, + "time_per_iteration": 2.524846076965332 + }, + { + "auxiliary_loss_clip": 0.06450769, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06283743, + "balance_loss_mlp": 0.01258329, + "epoch": 0.37715316398617166, + "flos": 22754594943360.0, + "grad_norm": 1.6672783573066894, + "language_loss": 0.74972034, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82696146, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15026855, + "step": 6273, + "time_per_iteration": 2.5571129322052 + }, + { + "auxiliary_loss_clip": 0.06449255, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257148, + "epoch": 0.3772132872388396, + "flos": 18920246181120.0, + "grad_norm": 1.32773283576084, + "language_loss": 0.72241038, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79962015, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14569092, + "step": 6274, + "time_per_iteration": 2.4966516494750977 + }, + { + "auxiliary_loss_clip": 0.06454083, + "auxiliary_loss_mlp": 0.01271444, + "balance_loss_clip": 0.06282286, + "balance_loss_mlp": 0.01257467, + "epoch": 0.3772734104915076, + "flos": 17351709275520.0, + "grad_norm": 1.8983068498635614, + "language_loss": 0.84638643, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.92364168, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13983154, + "step": 6275, + "time_per_iteration": 2.534308910369873 + }, + { + "auxiliary_loss_clip": 0.06448515, + "auxiliary_loss_mlp": 0.01272502, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01258865, + "epoch": 0.37733353374417555, + "flos": 32350452848640.0, + "grad_norm": 1.3669254528099, + "language_loss": 0.75387293, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.83108306, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13641357, + "step": 6276, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.06453335, + "auxiliary_loss_mlp": 0.0127286, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.0125803, + "epoch": 0.3773936569968435, + "flos": 23366225422080.0, + "grad_norm": 1.9054341571687776, + "language_loss": 0.86016738, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93742937, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1484375, + "step": 6277, + "time_per_iteration": 2.6153500080108643 + }, + { + "auxiliary_loss_clip": 0.06448077, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.0628462, + "balance_loss_mlp": 0.01257488, + "epoch": 0.3774537802495115, + "flos": 21472579474560.0, + "grad_norm": 1.5956300393708251, + "language_loss": 0.78636366, + "learning_rate": 2.861722244253818e-06, + "loss": 0.86356354, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14428711, + "step": 6278, + "time_per_iteration": 2.564234495162964 + }, + { + "auxiliary_loss_clip": 0.06459187, + "auxiliary_loss_mlp": 0.01270608, + "balance_loss_clip": 0.06284142, + "balance_loss_mlp": 0.01255075, + "epoch": 0.37751390350217945, + "flos": 24980812945920.0, + "grad_norm": 1.8067410295121689, + "language_loss": 0.8371948, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.91449273, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.15527344, + "step": 6279, + "time_per_iteration": 2.6134567260742188 + }, + { + "auxiliary_loss_clip": 0.06454675, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.06282948, + "balance_loss_mlp": 0.01257117, + "epoch": 0.3775740267548474, + "flos": 27826585637760.0, + "grad_norm": 1.84994794715845, + "language_loss": 0.74995327, + "learning_rate": 2.861019264262269e-06, + "loss": 0.82721412, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1428833, + "step": 6280, + "time_per_iteration": 2.6029937267303467 + }, + { + "auxiliary_loss_clip": 0.06448464, + "auxiliary_loss_mlp": 0.01272763, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01259156, + "epoch": 0.3776341500075154, + "flos": 22571845188480.0, + "grad_norm": 1.3018494364650444, + "language_loss": 0.76205039, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.83926266, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13592529, + "step": 6281, + "time_per_iteration": 2.524489641189575 + }, + { + "auxiliary_loss_clip": 0.06448536, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.0125718, + "epoch": 0.3776942732601834, + "flos": 23084148251520.0, + "grad_norm": 1.5306913056637732, + "language_loss": 0.84658033, + "learning_rate": 2.860316153670974e-06, + "loss": 0.92377913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14160156, + "step": 6282, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.06449918, + "auxiliary_loss_mlp": 0.01269426, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.0125555, + "epoch": 0.37775439651285136, + "flos": 21730617722880.0, + "grad_norm": 1.840636786741823, + "language_loss": 0.70143461, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.77862805, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13879395, + "step": 6283, + "time_per_iteration": 2.555816411972046 + }, + { + "auxiliary_loss_clip": 0.06452499, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 0.06285429, + "balance_loss_mlp": 0.01259957, + "epoch": 0.37781451976551933, + "flos": 23994542862720.0, + "grad_norm": 1.743481736886233, + "language_loss": 0.76856482, + "learning_rate": 2.859612912586581e-06, + "loss": 0.8458361, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14648438, + "step": 6284, + "time_per_iteration": 2.560770034790039 + }, + { + "auxiliary_loss_clip": 0.06464045, + "auxiliary_loss_mlp": 0.01271283, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01254725, + "epoch": 0.3778746430181873, + "flos": 13731821838720.0, + "grad_norm": 2.746966655353194, + "language_loss": 0.85536617, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.93271947, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.16564941, + "step": 6285, + "time_per_iteration": 2.5006392002105713 + }, + { + "auxiliary_loss_clip": 0.06451872, + "auxiliary_loss_mlp": 0.01271139, + "balance_loss_clip": 0.06279811, + "balance_loss_mlp": 0.01256065, + "epoch": 0.37793476627085526, + "flos": 19466021750400.0, + "grad_norm": 1.7632018529100697, + "language_loss": 0.84913701, + "learning_rate": 2.858909541115758e-06, + "loss": 0.9263671, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1506958, + "step": 6286, + "time_per_iteration": 2.566092014312744 + }, + { + "auxiliary_loss_clip": 0.06452557, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06281806, + "balance_loss_mlp": 0.01254182, + "epoch": 0.3779948895235232, + "flos": 10711600945920.0, + "grad_norm": 1.9010574176879877, + "language_loss": 0.823708, + "learning_rate": 2.858557806518775e-06, + "loss": 0.90092808, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15258789, + "step": 6287, + "time_per_iteration": 2.4892444610595703 + }, + { + "auxiliary_loss_clip": 0.06454234, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01258408, + "epoch": 0.3780550127761912, + "flos": 22316616051840.0, + "grad_norm": 2.1030531862013584, + "language_loss": 0.7330361, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15679932, + "step": 6288, + "time_per_iteration": 2.5415592193603516 + }, + { + "auxiliary_loss_clip": 0.06453485, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01254359, + "epoch": 0.37811513602885916, + "flos": 28958401463040.0, + "grad_norm": 1.6277535048544236, + "language_loss": 0.75782627, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83505249, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14770508, + "step": 6289, + "time_per_iteration": 2.5579047203063965 + }, + { + "auxiliary_loss_clip": 0.06454412, + "auxiliary_loss_mlp": 0.01273518, + "balance_loss_clip": 0.06284275, + "balance_loss_mlp": 0.01257925, + "epoch": 0.3781752592815271, + "flos": 23119717109760.0, + "grad_norm": 1.945372772068441, + "language_loss": 0.74155736, + "learning_rate": 2.857502407441593e-06, + "loss": 0.81883669, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15588379, + "step": 6290, + "time_per_iteration": 2.5697786808013916 + }, + { + "auxiliary_loss_clip": 0.06458094, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06281058, + "balance_loss_mlp": 0.0125653, + "epoch": 0.3782353825341951, + "flos": 19762102552320.0, + "grad_norm": 2.4066647483264596, + "language_loss": 0.80529308, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.88260764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16833496, + "step": 6291, + "time_per_iteration": 2.4970998764038086 + }, + { + "auxiliary_loss_clip": 0.06456125, + "auxiliary_loss_mlp": 0.01270776, + "balance_loss_clip": 0.06283687, + "balance_loss_mlp": 0.01254933, + "epoch": 0.37829550578686305, + "flos": 22056774940800.0, + "grad_norm": 1.7419894192909393, + "language_loss": 0.76369846, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.84096742, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1583252, + "step": 6292, + "time_per_iteration": 2.572916030883789 + }, + { + "auxiliary_loss_clip": 0.06452248, + "auxiliary_loss_mlp": 0.0127064, + "balance_loss_clip": 0.06281239, + "balance_loss_mlp": 0.01255631, + "epoch": 0.378355629039531, + "flos": 16475667638400.0, + "grad_norm": 1.682972265329385, + "language_loss": 0.70006013, + "learning_rate": 2.856446715715224e-06, + "loss": 0.77728903, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.15014648, + "step": 6293, + "time_per_iteration": 2.5161240100860596 + }, + { + "auxiliary_loss_clip": 0.06449296, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06281447, + "balance_loss_mlp": 0.01255934, + "epoch": 0.378415752292199, + "flos": 19981050071040.0, + "grad_norm": 1.9898859900525039, + "language_loss": 0.7173214, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.79452682, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.15332031, + "step": 6294, + "time_per_iteration": 3.9304022789001465 + }, + { + "auxiliary_loss_clip": 0.06465693, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 0.06285857, + "balance_loss_mlp": 0.01264068, + "epoch": 0.378475875544867, + "flos": 14652614355840.0, + "grad_norm": 2.57033704665896, + "language_loss": 0.83215445, + "learning_rate": 2.855742758826011e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.15655518, + "step": 6295, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.06459963, + "auxiliary_loss_mlp": 0.01268811, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01253255, + "epoch": 0.37853599879753497, + "flos": 26658194705280.0, + "grad_norm": 1.6154959379599871, + "language_loss": 0.71442378, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.79171151, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15563965, + "step": 6296, + "time_per_iteration": 4.0578773021698 + }, + { + "auxiliary_loss_clip": 0.06454356, + "auxiliary_loss_mlp": 0.01274534, + "balance_loss_clip": 0.06287888, + "balance_loss_mlp": 0.01260455, + "epoch": 0.37859612205020293, + "flos": 17317817498880.0, + "grad_norm": 1.7695984237012152, + "language_loss": 0.77514613, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85243499, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14074707, + "step": 6297, + "time_per_iteration": 2.54968523979187 + }, + { + "auxiliary_loss_clip": 0.06462398, + "auxiliary_loss_mlp": 0.01275228, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01259481, + "epoch": 0.3786562453028709, + "flos": 18225780341760.0, + "grad_norm": 1.977165612519376, + "language_loss": 0.80132794, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87870419, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1574707, + "step": 6298, + "time_per_iteration": 2.5013349056243896 + }, + { + "auxiliary_loss_clip": 0.06454945, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06285203, + "balance_loss_mlp": 0.01255711, + "epoch": 0.37871636855553886, + "flos": 21221207625600.0, + "grad_norm": 1.480969598733767, + "language_loss": 0.8501091, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92736673, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15087891, + "step": 6299, + "time_per_iteration": 2.5749709606170654 + }, + { + "auxiliary_loss_clip": 0.06460874, + "auxiliary_loss_mlp": 0.01272586, + "balance_loss_clip": 0.06288288, + "balance_loss_mlp": 0.01256844, + "epoch": 0.3787764918082068, + "flos": 20957886570240.0, + "grad_norm": 2.4357425027716895, + "language_loss": 0.77022231, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.84755683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15740967, + "step": 6300, + "time_per_iteration": 2.521772623062134 + }, + { + "auxiliary_loss_clip": 0.06472084, + "auxiliary_loss_mlp": 0.01275415, + "balance_loss_clip": 0.06293886, + "balance_loss_mlp": 0.01258177, + "epoch": 0.3788366150608748, + "flos": 17313205524480.0, + "grad_norm": 1.8143586204861406, + "language_loss": 0.83141446, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.90888953, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 3.982780933380127 + }, + { + "auxiliary_loss_clip": 0.0646001, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06287184, + "balance_loss_mlp": 0.0125428, + "epoch": 0.37889673831354276, + "flos": 24317094355200.0, + "grad_norm": 1.8203378599779103, + "language_loss": 0.68096328, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.75826812, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.16186523, + "step": 6302, + "time_per_iteration": 2.5983002185821533 + }, + { + "auxiliary_loss_clip": 0.06455475, + "auxiliary_loss_mlp": 0.01270441, + "balance_loss_clip": 0.06284864, + "balance_loss_mlp": 0.01255718, + "epoch": 0.3789568615662107, + "flos": 26690157838080.0, + "grad_norm": 2.521279180058548, + "language_loss": 0.68357861, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76083779, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1472168, + "step": 6303, + "time_per_iteration": 2.5610175132751465 + }, + { + "auxiliary_loss_clip": 0.06458124, + "auxiliary_loss_mlp": 0.01272095, + "balance_loss_clip": 0.06285581, + "balance_loss_mlp": 0.01257265, + "epoch": 0.3790169848188787, + "flos": 23591713559040.0, + "grad_norm": 1.604251878296904, + "language_loss": 0.78095663, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.85825884, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14807129, + "step": 6304, + "time_per_iteration": 3.994072437286377 + }, + { + "auxiliary_loss_clip": 0.06468576, + "auxiliary_loss_mlp": 0.01269708, + "balance_loss_clip": 0.06292479, + "balance_loss_mlp": 0.01253806, + "epoch": 0.37907710807154665, + "flos": 18442547654400.0, + "grad_norm": 1.8924180649319282, + "language_loss": 0.80524492, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.88262779, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15881348, + "step": 6305, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.06370047, + "auxiliary_loss_mlp": 0.01262008, + "balance_loss_clip": 0.06291789, + "balance_loss_mlp": 0.01258527, + "epoch": 0.3791372313242146, + "flos": 50123690887680.0, + "grad_norm": 0.9538902579511545, + "language_loss": 0.64400995, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.72033048, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03491211, + "step": 6306, + "time_per_iteration": 3.106515645980835 + }, + { + "auxiliary_loss_clip": 0.06464424, + "auxiliary_loss_mlp": 0.01273174, + "balance_loss_clip": 0.06292081, + "balance_loss_mlp": 0.01257683, + "epoch": 0.3791973545768826, + "flos": 24323467265280.0, + "grad_norm": 1.5167178412192643, + "language_loss": 0.73534656, + "learning_rate": 2.851516295441817e-06, + "loss": 0.8127225, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15484619, + "step": 6307, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.06462627, + "auxiliary_loss_mlp": 0.01270499, + "balance_loss_clip": 0.06287986, + "balance_loss_mlp": 0.0125505, + "epoch": 0.3792574778295506, + "flos": 21586329792000.0, + "grad_norm": 1.8539993286062635, + "language_loss": 0.78603798, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86336923, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15441895, + "step": 6308, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.06459265, + "auxiliary_loss_mlp": 0.01272841, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01257028, + "epoch": 0.37931760108221857, + "flos": 22279202403840.0, + "grad_norm": 4.0253147283534, + "language_loss": 0.73503512, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.81235617, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.15814209, + "step": 6309, + "time_per_iteration": 2.539158344268799 + }, + { + "auxiliary_loss_clip": 0.06457806, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.0125963, + "epoch": 0.37937772433488653, + "flos": 19689161973120.0, + "grad_norm": 1.3654110952225158, + "language_loss": 0.79184294, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.86916614, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14886475, + "step": 6310, + "time_per_iteration": 2.4997847080230713 + }, + { + "auxiliary_loss_clip": 0.06457442, + "auxiliary_loss_mlp": 0.01268809, + "balance_loss_clip": 0.06285986, + "balance_loss_mlp": 0.01253586, + "epoch": 0.3794378475875545, + "flos": 19105469631360.0, + "grad_norm": 1.8573579951480166, + "language_loss": 0.76741791, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.84468043, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15222168, + "step": 6311, + "time_per_iteration": 2.5216546058654785 + }, + { + "auxiliary_loss_clip": 0.06457929, + "auxiliary_loss_mlp": 0.01276784, + "balance_loss_clip": 0.06287444, + "balance_loss_mlp": 0.01261746, + "epoch": 0.37949797084022246, + "flos": 20345920675200.0, + "grad_norm": 1.4012846072012495, + "language_loss": 0.71063423, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78798139, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15032959, + "step": 6312, + "time_per_iteration": 2.4909064769744873 + }, + { + "auxiliary_loss_clip": 0.06361144, + "auxiliary_loss_mlp": 0.01254908, + "balance_loss_clip": 0.06283364, + "balance_loss_mlp": 0.01251185, + "epoch": 0.37955809409289043, + "flos": 63991121760000.0, + "grad_norm": 0.7457914665340521, + "language_loss": 0.55941355, + "learning_rate": 2.849401318669608e-06, + "loss": 0.63557404, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03713989, + "step": 6313, + "time_per_iteration": 3.1312170028686523 + }, + { + "auxiliary_loss_clip": 0.06457204, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06285529, + "balance_loss_mlp": 0.01258211, + "epoch": 0.3796182173455584, + "flos": 31548777310080.0, + "grad_norm": 1.7202421351204062, + "language_loss": 0.71222353, + "learning_rate": 2.849048709730083e-06, + "loss": 0.78952008, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14233398, + "step": 6314, + "time_per_iteration": 2.5876691341400146 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.01270992, + "balance_loss_clip": 0.06290812, + "balance_loss_mlp": 0.01254922, + "epoch": 0.37967834059822636, + "flos": 12135766066560.0, + "grad_norm": 2.8019471516683985, + "language_loss": 0.74203241, + "learning_rate": 2.848696068594545e-06, + "loss": 0.81939626, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.16064453, + "step": 6315, + "time_per_iteration": 2.5312654972076416 + }, + { + "auxiliary_loss_clip": 0.06455735, + "auxiliary_loss_mlp": 0.01269414, + "balance_loss_clip": 0.0628659, + "balance_loss_mlp": 0.01253512, + "epoch": 0.3797384638508943, + "flos": 39357989331840.0, + "grad_norm": 5.544256779510487, + "language_loss": 0.7095021, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.78675354, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15905762, + "step": 6316, + "time_per_iteration": 2.642946481704712 + }, + { + "auxiliary_loss_clip": 0.06458603, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06288237, + "balance_loss_mlp": 0.01255991, + "epoch": 0.3797985871035623, + "flos": 34061852165760.0, + "grad_norm": 2.4477129072331656, + "language_loss": 0.65612113, + "learning_rate": 2.847990689788923e-06, + "loss": 0.7334165, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1496582, + "step": 6317, + "time_per_iteration": 2.634066104888916 + }, + { + "auxiliary_loss_clip": 0.0645286, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06285463, + "balance_loss_mlp": 0.0125702, + "epoch": 0.37985871035623026, + "flos": 23228939306880.0, + "grad_norm": 1.9893651635894969, + "language_loss": 0.86348939, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.94072783, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13964844, + "step": 6318, + "time_per_iteration": 2.50665545463562 + }, + { + "auxiliary_loss_clip": 0.06460046, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.06287004, + "balance_loss_mlp": 0.01257675, + "epoch": 0.3799188336088982, + "flos": 18121002410880.0, + "grad_norm": 2.356531700065532, + "language_loss": 0.76647675, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.84380764, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6319, + "time_per_iteration": 2.50382137298584 + }, + { + "auxiliary_loss_clip": 0.06453398, + "auxiliary_loss_mlp": 0.01272745, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01258082, + "epoch": 0.3799789568615662, + "flos": 21878385598080.0, + "grad_norm": 6.804259628026359, + "language_loss": 0.6451484, + "learning_rate": 2.846932380444744e-06, + "loss": 0.72240984, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14660645, + "step": 6320, + "time_per_iteration": 2.516150712966919 + }, + { + "auxiliary_loss_clip": 0.06456275, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06285265, + "balance_loss_mlp": 0.01252846, + "epoch": 0.3800390801142342, + "flos": 32971181495040.0, + "grad_norm": 1.7343317020382172, + "language_loss": 0.71855223, + "learning_rate": 2.846579546413992e-06, + "loss": 0.79579961, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.15612793, + "step": 6321, + "time_per_iteration": 2.6204988956451416 + }, + { + "auxiliary_loss_clip": 0.06458073, + "auxiliary_loss_mlp": 0.01268703, + "balance_loss_clip": 0.06285845, + "balance_loss_mlp": 0.01253784, + "epoch": 0.38009920336690217, + "flos": 26914430090880.0, + "grad_norm": 1.8398392312515923, + "language_loss": 0.75578612, + "learning_rate": 2.846226680280859e-06, + "loss": 0.83305389, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14923096, + "step": 6322, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01271033, + "balance_loss_clip": 0.06285781, + "balance_loss_mlp": 0.01256823, + "epoch": 0.38015932661957014, + "flos": 22494963467520.0, + "grad_norm": 1.8201003599281902, + "language_loss": 0.85709381, + "learning_rate": 2.845873782058725e-06, + "loss": 0.93435031, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14215088, + "step": 6323, + "time_per_iteration": 2.4927124977111816 + }, + { + "auxiliary_loss_clip": 0.06458908, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06286593, + "balance_loss_mlp": 0.01254596, + "epoch": 0.3802194498722381, + "flos": 21987440087040.0, + "grad_norm": 2.2452863694907426, + "language_loss": 0.73932886, + "learning_rate": 2.845520851760973e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.16027832, + "step": 6324, + "time_per_iteration": 2.4913861751556396 + }, + { + "auxiliary_loss_clip": 0.06464465, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06288414, + "balance_loss_mlp": 0.01257724, + "epoch": 0.38027957312490607, + "flos": 21331310290560.0, + "grad_norm": 1.7884051563809298, + "language_loss": 0.84122628, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.91860014, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15203857, + "step": 6325, + "time_per_iteration": 2.6119046211242676 + }, + { + "auxiliary_loss_clip": 0.06455745, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06285073, + "balance_loss_mlp": 0.01252712, + "epoch": 0.38033969637757403, + "flos": 16696921144320.0, + "grad_norm": 2.2200302984742915, + "language_loss": 0.79868543, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.87591028, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14019775, + "step": 6326, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06286497, + "balance_loss_mlp": 0.01255242, + "epoch": 0.380399819630242, + "flos": 36219741563520.0, + "grad_norm": 3.3742704435112025, + "language_loss": 0.73389304, + "learning_rate": 2.844461868547842e-06, + "loss": 0.81115204, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14978027, + "step": 6327, + "time_per_iteration": 2.649383783340454 + }, + { + "auxiliary_loss_clip": 0.06459647, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06290785, + "balance_loss_mlp": 0.01255145, + "epoch": 0.38045994288290996, + "flos": 21295364088960.0, + "grad_norm": 1.4936601975654378, + "language_loss": 0.83229524, + "learning_rate": 2.844108810081459e-06, + "loss": 0.90958202, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13867188, + "step": 6328, + "time_per_iteration": 2.527261972427368 + }, + { + "auxiliary_loss_clip": 0.06452741, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06281206, + "balance_loss_mlp": 0.01253755, + "epoch": 0.38052006613557793, + "flos": 20929151819520.0, + "grad_norm": 1.5056942690240434, + "language_loss": 0.61757982, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69479483, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.15008545, + "step": 6329, + "time_per_iteration": 2.54025936126709 + }, + { + "auxiliary_loss_clip": 0.0645529, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06283917, + "balance_loss_mlp": 0.01254037, + "epoch": 0.3805801893882459, + "flos": 20996138759040.0, + "grad_norm": 2.0488191193117316, + "language_loss": 0.56127822, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.63851297, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14160156, + "step": 6330, + "time_per_iteration": 2.4913628101348877 + }, + { + "auxiliary_loss_clip": 0.06449446, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06282543, + "balance_loss_mlp": 0.01255781, + "epoch": 0.38064031264091386, + "flos": 25565972734080.0, + "grad_norm": 1.4483276491856993, + "language_loss": 0.65912807, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73631942, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13903809, + "step": 6331, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.0645493, + "auxiliary_loss_mlp": 0.01269934, + "balance_loss_clip": 0.06284193, + "balance_loss_mlp": 0.01254312, + "epoch": 0.3807004358935818, + "flos": 15091264080000.0, + "grad_norm": 1.528944840420101, + "language_loss": 0.7597304, + "learning_rate": 2.842696256262919e-06, + "loss": 0.83697909, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15618896, + "step": 6332, + "time_per_iteration": 2.4808928966522217 + }, + { + "auxiliary_loss_clip": 0.06456427, + "auxiliary_loss_mlp": 0.01273089, + "balance_loss_clip": 0.06283183, + "balance_loss_mlp": 0.01257943, + "epoch": 0.3807605591462498, + "flos": 16405033046400.0, + "grad_norm": 2.2042220893600226, + "language_loss": 0.82397389, + "learning_rate": 2.842343037886987e-06, + "loss": 0.90126908, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15142822, + "step": 6333, + "time_per_iteration": 2.5033013820648193 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254655, + "epoch": 0.3808206823989178, + "flos": 29064353351040.0, + "grad_norm": 1.4831969327294916, + "language_loss": 0.86723578, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.9444741, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.14538574, + "step": 6334, + "time_per_iteration": 4.024240493774414 + }, + { + "auxiliary_loss_clip": 0.06455058, + "auxiliary_loss_mlp": 0.01270467, + "balance_loss_clip": 0.06282362, + "balance_loss_mlp": 0.01255155, + "epoch": 0.3808808056515858, + "flos": 15711321893760.0, + "grad_norm": 2.3448311359770795, + "language_loss": 0.79450226, + "learning_rate": 2.841636505323321e-06, + "loss": 0.87175757, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15301514, + "step": 6335, + "time_per_iteration": 2.4698357582092285 + }, + { + "auxiliary_loss_clip": 0.06453745, + "auxiliary_loss_mlp": 0.0127096, + "balance_loss_clip": 0.06281872, + "balance_loss_mlp": 0.0125517, + "epoch": 0.38094092890425374, + "flos": 20710917060480.0, + "grad_norm": 1.9128487431319638, + "language_loss": 0.72795898, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15802002, + "step": 6336, + "time_per_iteration": 3.9780919551849365 + }, + { + "auxiliary_loss_clip": 0.06449959, + "auxiliary_loss_mlp": 0.01267203, + "balance_loss_clip": 0.06281384, + "balance_loss_mlp": 0.01252826, + "epoch": 0.3810010521569217, + "flos": 20674258099200.0, + "grad_norm": 2.2277206975915362, + "language_loss": 0.69756234, + "learning_rate": 2.840929845099894e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14373779, + "step": 6337, + "time_per_iteration": 2.5475378036499023 + }, + { + "auxiliary_loss_clip": 0.06454941, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06282912, + "balance_loss_mlp": 0.012579, + "epoch": 0.38106117540958967, + "flos": 31834963330560.0, + "grad_norm": 1.987280020069696, + "language_loss": 0.64026022, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71754032, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1517334, + "step": 6338, + "time_per_iteration": 2.5795555114746094 + }, + { + "auxiliary_loss_clip": 0.06456137, + "auxiliary_loss_mlp": 0.01271603, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01255772, + "epoch": 0.38112129866225763, + "flos": 16907231692800.0, + "grad_norm": 1.6550535893348008, + "language_loss": 0.69685936, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.77413678, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15856934, + "step": 6339, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06281533, + "balance_loss_mlp": 0.01253913, + "epoch": 0.3811814219149256, + "flos": 20893624888320.0, + "grad_norm": 2.252585455539085, + "language_loss": 0.68345773, + "learning_rate": 2.839869615637177e-06, + "loss": 0.76065207, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13519287, + "step": 6340, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.06456652, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06282599, + "balance_loss_mlp": 0.01260083, + "epoch": 0.38124154516759357, + "flos": 16696418019840.0, + "grad_norm": 2.4997436549257754, + "language_loss": 0.89721388, + "learning_rate": 2.839516142102522e-06, + "loss": 0.97453463, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15332031, + "step": 6341, + "time_per_iteration": 4.08266806602478 + }, + { + "auxiliary_loss_clip": 0.06461132, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06284279, + "balance_loss_mlp": 0.01255427, + "epoch": 0.38130166842026153, + "flos": 19687946088960.0, + "grad_norm": 1.4891162994718032, + "language_loss": 0.75298452, + "learning_rate": 2.83916263673333e-06, + "loss": 0.83032143, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.17138672, + "step": 6342, + "time_per_iteration": 2.496697425842285 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.0125646, + "epoch": 0.3813617916729295, + "flos": 22204668597120.0, + "grad_norm": 1.7145643847071266, + "language_loss": 0.83785719, + "learning_rate": 2.838809099543007e-06, + "loss": 0.91510159, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14599609, + "step": 6343, + "time_per_iteration": 4.049302339553833 + }, + { + "auxiliary_loss_clip": 0.0645491, + "auxiliary_loss_mlp": 0.01269585, + "balance_loss_clip": 0.06281073, + "balance_loss_mlp": 0.01254905, + "epoch": 0.38142191492559746, + "flos": 19102576665600.0, + "grad_norm": 1.619462393744454, + "language_loss": 0.77529186, + "learning_rate": 2.838455530544959e-06, + "loss": 0.8525368, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14678955, + "step": 6344, + "time_per_iteration": 2.579394817352295 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01271203, + "balance_loss_clip": 0.06285504, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3814820381782654, + "flos": 24104645527680.0, + "grad_norm": 1.8871239884396722, + "language_loss": 0.74166036, + "learning_rate": 2.838101929752593e-06, + "loss": 0.81893921, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14587402, + "step": 6345, + "time_per_iteration": 2.5367093086242676 + }, + { + "auxiliary_loss_clip": 0.06457509, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01257765, + "epoch": 0.3815421614309334, + "flos": 15783927056640.0, + "grad_norm": 1.7118462514914357, + "language_loss": 0.69868183, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7759757, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14111328, + "step": 6346, + "time_per_iteration": 2.5815930366516113 + }, + { + "auxiliary_loss_clip": 0.06466204, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06290555, + "balance_loss_mlp": 0.01257236, + "epoch": 0.38160228468360136, + "flos": 19905593869440.0, + "grad_norm": 1.781545419456976, + "language_loss": 0.7611326, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.83852088, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15374756, + "step": 6347, + "time_per_iteration": 2.5027284622192383 + }, + { + "auxiliary_loss_clip": 0.06456521, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.012553, + "epoch": 0.3816624079362694, + "flos": 19287045429120.0, + "grad_norm": 1.488288802844173, + "language_loss": 0.75192666, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13702393, + "step": 6348, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01270391, + "balance_loss_clip": 0.06286097, + "balance_loss_mlp": 0.01256599, + "epoch": 0.38172253118893734, + "flos": 21183752050560.0, + "grad_norm": 1.729316797973715, + "language_loss": 0.88237411, + "learning_rate": 2.836687208908142e-06, + "loss": 0.95967764, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6349, + "time_per_iteration": 2.525542974472046 + }, + { + "auxiliary_loss_clip": 0.06453095, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06281723, + "balance_loss_mlp": 0.01255149, + "epoch": 0.3817826544416053, + "flos": 17534836373760.0, + "grad_norm": 1.7576595366031973, + "language_loss": 0.76939785, + "learning_rate": 2.836333449345341e-06, + "loss": 0.84662628, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14593506, + "step": 6350, + "time_per_iteration": 2.532376289367676 + }, + { + "auxiliary_loss_clip": 0.06458531, + "auxiliary_loss_mlp": 0.01273484, + "balance_loss_clip": 0.06286063, + "balance_loss_mlp": 0.01258231, + "epoch": 0.38184277769427327, + "flos": 16332176321280.0, + "grad_norm": 2.21296257119241, + "language_loss": 0.77054518, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.84786528, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15264893, + "step": 6351, + "time_per_iteration": 2.4930031299591064 + }, + { + "auxiliary_loss_clip": 0.06457832, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.0628476, + "balance_loss_mlp": 0.012577, + "epoch": 0.38190290094694124, + "flos": 30450937115520.0, + "grad_norm": 2.2550067272061254, + "language_loss": 0.74895489, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.82626581, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15563965, + "step": 6352, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.01270341, + "balance_loss_clip": 0.06283389, + "balance_loss_mlp": 0.0125659, + "epoch": 0.3819630241996092, + "flos": 14215138588800.0, + "grad_norm": 2.0554991668998777, + "language_loss": 0.63961715, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.71684647, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6353, + "time_per_iteration": 2.476759433746338 + }, + { + "auxiliary_loss_clip": 0.06456264, + "auxiliary_loss_mlp": 0.01279815, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01266112, + "epoch": 0.38202314745227717, + "flos": 25016717220480.0, + "grad_norm": 1.720129608989886, + "language_loss": 0.83556378, + "learning_rate": 2.834918094089816e-06, + "loss": 0.91292459, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13702393, + "step": 6354, + "time_per_iteration": 2.5726418495178223 + }, + { + "auxiliary_loss_clip": 0.06456912, + "auxiliary_loss_mlp": 0.01271961, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125911, + "epoch": 0.38208327070494513, + "flos": 20820935871360.0, + "grad_norm": 1.6482101436629937, + "language_loss": 0.81480742, + "learning_rate": 2.834564176091943e-06, + "loss": 0.89209616, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.12854004, + "step": 6355, + "time_per_iteration": 2.5225114822387695 + }, + { + "auxiliary_loss_clip": 0.06459523, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06289364, + "balance_loss_mlp": 0.01259179, + "epoch": 0.3821433939576131, + "flos": 22644282643200.0, + "grad_norm": 1.8808367718392982, + "language_loss": 0.75647783, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.83380532, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 6356, + "time_per_iteration": 2.5584537982940674 + }, + { + "auxiliary_loss_clip": 0.0646046, + "auxiliary_loss_mlp": 0.01272045, + "balance_loss_clip": 0.06287301, + "balance_loss_mlp": 0.01257645, + "epoch": 0.38220351721028106, + "flos": 26877100296960.0, + "grad_norm": 1.8976132208861074, + "language_loss": 0.82161039, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89893544, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14398193, + "step": 6357, + "time_per_iteration": 2.546190023422241 + }, + { + "auxiliary_loss_clip": 0.06463508, + "auxiliary_loss_mlp": 0.01275628, + "balance_loss_clip": 0.0629019, + "balance_loss_mlp": 0.01260035, + "epoch": 0.38226364046294903, + "flos": 23374149632640.0, + "grad_norm": 1.7334885634957151, + "language_loss": 0.78531659, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.86270791, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15612793, + "step": 6358, + "time_per_iteration": 2.5330071449279785 + }, + { + "auxiliary_loss_clip": 0.06462916, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01256086, + "epoch": 0.382323763715617, + "flos": 19652335303680.0, + "grad_norm": 1.9007754709735623, + "language_loss": 0.79191673, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.86925954, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15270996, + "step": 6359, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.06457044, + "auxiliary_loss_mlp": 0.01275796, + "balance_loss_clip": 0.06287733, + "balance_loss_mlp": 0.01261884, + "epoch": 0.38238388696828496, + "flos": 54136527575040.0, + "grad_norm": 1.6591220194179586, + "language_loss": 0.70001733, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.77734572, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13903809, + "step": 6360, + "time_per_iteration": 2.8067054748535156 + }, + { + "auxiliary_loss_clip": 0.06461466, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01262923, + "epoch": 0.382444010220953, + "flos": 24943105808640.0, + "grad_norm": 1.5737902616354833, + "language_loss": 0.79093289, + "learning_rate": 2.83244000399261e-06, + "loss": 0.86832535, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14849854, + "step": 6361, + "time_per_iteration": 2.558579683303833 + }, + { + "auxiliary_loss_clip": 0.0645285, + "auxiliary_loss_mlp": 0.01272146, + "balance_loss_clip": 0.06286099, + "balance_loss_mlp": 0.01257996, + "epoch": 0.38250413347362094, + "flos": 42346750216320.0, + "grad_norm": 1.4645255919949542, + "language_loss": 0.65580732, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73305726, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14154053, + "step": 6362, + "time_per_iteration": 2.709390878677368 + }, + { + "auxiliary_loss_clip": 0.06459438, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06287294, + "balance_loss_mlp": 0.01255415, + "epoch": 0.3825642567262889, + "flos": 16294720746240.0, + "grad_norm": 1.6166481183320216, + "language_loss": 0.8211807, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.89848268, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15332031, + "step": 6363, + "time_per_iteration": 2.468846559524536 + }, + { + "auxiliary_loss_clip": 0.06453779, + "auxiliary_loss_mlp": 0.01274743, + "balance_loss_clip": 0.06286556, + "balance_loss_mlp": 0.01259401, + "epoch": 0.3826243799789569, + "flos": 45664267795200.0, + "grad_norm": 1.6258867054195516, + "language_loss": 0.59107661, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.6683619, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15356445, + "step": 6364, + "time_per_iteration": 2.745589256286621 + }, + { + "auxiliary_loss_clip": 0.06465845, + "auxiliary_loss_mlp": 0.0127531, + "balance_loss_clip": 0.06290866, + "balance_loss_mlp": 0.01261058, + "epoch": 0.38268450323162484, + "flos": 25308647245440.0, + "grad_norm": 2.2940920681906873, + "language_loss": 0.6951021, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.77251363, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14257812, + "step": 6365, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.06461614, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06285347, + "balance_loss_mlp": 0.0125451, + "epoch": 0.3827446264842928, + "flos": 21842607104640.0, + "grad_norm": 2.2040506714686208, + "language_loss": 0.73211187, + "learning_rate": 2.830668992382758e-06, + "loss": 0.8094269, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15374756, + "step": 6366, + "time_per_iteration": 2.527252435684204 + }, + { + "auxiliary_loss_clip": 0.06455328, + "auxiliary_loss_mlp": 0.01270912, + "balance_loss_clip": 0.06284537, + "balance_loss_mlp": 0.0125703, + "epoch": 0.38280474973696077, + "flos": 25740924059520.0, + "grad_norm": 2.537372436592335, + "language_loss": 0.69208872, + "learning_rate": 2.830314695509902e-06, + "loss": 0.76935112, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13885498, + "step": 6367, + "time_per_iteration": 2.563174247741699 + }, + { + "auxiliary_loss_clip": 0.06445135, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06281811, + "balance_loss_mlp": 0.01253482, + "epoch": 0.38286487298962874, + "flos": 24902212216320.0, + "grad_norm": 2.529219827632029, + "language_loss": 0.64519894, + "learning_rate": 2.82996036715143e-06, + "loss": 0.72232389, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13897705, + "step": 6368, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.0644632, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01255111, + "epoch": 0.3829249962422967, + "flos": 28550457060480.0, + "grad_norm": 1.3073196657605344, + "language_loss": 0.68441451, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76156569, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13677979, + "step": 6369, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.06452611, + "auxiliary_loss_mlp": 0.01268713, + "balance_loss_clip": 0.0628352, + "balance_loss_mlp": 0.01254724, + "epoch": 0.38298511949496467, + "flos": 21477736500480.0, + "grad_norm": 1.6896603918496267, + "language_loss": 0.79100078, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86821401, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13995361, + "step": 6370, + "time_per_iteration": 2.5265746116638184 + }, + { + "auxiliary_loss_clip": 0.06451623, + "auxiliary_loss_mlp": 0.0127085, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01256265, + "epoch": 0.38304524274763263, + "flos": 31687027747200.0, + "grad_norm": 2.908092380852583, + "language_loss": 0.651667, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.72889173, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.06459577, + "auxiliary_loss_mlp": 0.01272301, + "balance_loss_clip": 0.06283382, + "balance_loss_mlp": 0.01257543, + "epoch": 0.3831053660003006, + "flos": 25082865619200.0, + "grad_norm": 2.362243450203488, + "language_loss": 0.73142469, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.80874348, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14746094, + "step": 6372, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01266707, + "balance_loss_clip": 0.06282556, + "balance_loss_mlp": 0.01252485, + "epoch": 0.38316548925296856, + "flos": 23265849830400.0, + "grad_norm": 1.5439174716844835, + "language_loss": 0.85255867, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92977273, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14221191, + "step": 6373, + "time_per_iteration": 4.056765794754028 + }, + { + "auxiliary_loss_clip": 0.0645606, + "auxiliary_loss_mlp": 0.01272183, + "balance_loss_clip": 0.06281903, + "balance_loss_mlp": 0.01257431, + "epoch": 0.3832256125056366, + "flos": 34432131358080.0, + "grad_norm": 8.29118461423438, + "language_loss": 0.75127506, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.82855743, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14758301, + "step": 6374, + "time_per_iteration": 2.739825963973999 + }, + { + "auxiliary_loss_clip": 0.06457414, + "auxiliary_loss_mlp": 0.01272454, + "balance_loss_clip": 0.0628335, + "balance_loss_mlp": 0.01258042, + "epoch": 0.38328573575830455, + "flos": 21769289182080.0, + "grad_norm": 1.9434329018980874, + "language_loss": 0.76033717, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.83763582, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14416504, + "step": 6375, + "time_per_iteration": 2.521092176437378 + }, + { + "auxiliary_loss_clip": 0.06457017, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06283681, + "balance_loss_mlp": 0.01252541, + "epoch": 0.3833458590109725, + "flos": 17385056000640.0, + "grad_norm": 2.081333613596134, + "language_loss": 0.73067588, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.80791855, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1472168, + "step": 6376, + "time_per_iteration": 3.913828134536743 + }, + { + "auxiliary_loss_clip": 0.06451094, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06282462, + "balance_loss_mlp": 0.01251294, + "epoch": 0.3834059822636405, + "flos": 29432326556160.0, + "grad_norm": 1.6469866452188906, + "language_loss": 0.68444526, + "learning_rate": 2.826769997289796e-06, + "loss": 0.76161826, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14916992, + "step": 6377, + "time_per_iteration": 2.552703857421875 + }, + { + "auxiliary_loss_clip": 0.0646103, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01253413, + "epoch": 0.38346610551630844, + "flos": 21477191448960.0, + "grad_norm": 1.937210921117629, + "language_loss": 0.73608565, + "learning_rate": 2.826415354814344e-06, + "loss": 0.8133859, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15582275, + "step": 6378, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.06455162, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283469, + "balance_loss_mlp": 0.01257661, + "epoch": 0.3835262287689764, + "flos": 27568253900160.0, + "grad_norm": 1.6187724503548255, + "language_loss": 0.69142127, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76869053, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14099121, + "step": 6379, + "time_per_iteration": 2.540184736251831 + }, + { + "auxiliary_loss_clip": 0.06449591, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01258209, + "epoch": 0.3835863520216444, + "flos": 15529201044480.0, + "grad_norm": 1.7677581121541173, + "language_loss": 0.8420229, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.91923743, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13659668, + "step": 6380, + "time_per_iteration": 3.9425628185272217 + }, + { + "auxiliary_loss_clip": 0.06454644, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06286694, + "balance_loss_mlp": 0.01255786, + "epoch": 0.38364647527431234, + "flos": 21910851855360.0, + "grad_norm": 1.4264464063638025, + "language_loss": 0.81255281, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.88980293, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14569092, + "step": 6381, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.06363897, + "auxiliary_loss_mlp": 0.0126892, + "balance_loss_clip": 0.06286111, + "balance_loss_mlp": 0.01265082, + "epoch": 0.3837065985269803, + "flos": 65553076120320.0, + "grad_norm": 0.8198763586735168, + "language_loss": 0.60085058, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.67717874, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 6382, + "time_per_iteration": 3.1118690967559814 + }, + { + "auxiliary_loss_clip": 0.06458844, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06285119, + "balance_loss_mlp": 0.01257375, + "epoch": 0.38376672177964827, + "flos": 28264103331840.0, + "grad_norm": 2.361672223919581, + "language_loss": 0.67004663, + "learning_rate": 2.824641672639794e-06, + "loss": 0.74736154, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15264893, + "step": 6383, + "time_per_iteration": 3.949587345123291 + }, + { + "auxiliary_loss_clip": 0.06458098, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285569, + "balance_loss_mlp": 0.01255919, + "epoch": 0.38382684503231623, + "flos": 20637641064960.0, + "grad_norm": 1.580160930907899, + "language_loss": 0.75169957, + "learning_rate": 2.824286842339587e-06, + "loss": 0.82898319, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14355469, + "step": 6384, + "time_per_iteration": 2.5578341484069824 + }, + { + "auxiliary_loss_clip": 0.0645394, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06286485, + "balance_loss_mlp": 0.01259819, + "epoch": 0.3838869682849842, + "flos": 19611274003200.0, + "grad_norm": 1.4416039952500834, + "language_loss": 0.76348937, + "learning_rate": 2.823931980782341e-06, + "loss": 0.84075809, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6385, + "time_per_iteration": 2.5225770473480225 + }, + { + "auxiliary_loss_clip": 0.06357871, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06280675, + "balance_loss_mlp": 0.01261296, + "epoch": 0.38394709153765216, + "flos": 56572202856960.0, + "grad_norm": 1.1093406194632214, + "language_loss": 0.67841589, + "learning_rate": 2.82357708798151e-06, + "loss": 0.75464916, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.04168701, + "step": 6386, + "time_per_iteration": 3.0481390953063965 + }, + { + "auxiliary_loss_clip": 0.06453113, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06286535, + "balance_loss_mlp": 0.01254777, + "epoch": 0.3840072147903202, + "flos": 15894323210880.0, + "grad_norm": 1.5665063027995272, + "language_loss": 0.72740716, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.80462623, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6387, + "time_per_iteration": 2.514692783355713 + }, + { + "auxiliary_loss_clip": 0.06447147, + "auxiliary_loss_mlp": 0.01275854, + "balance_loss_clip": 0.06283197, + "balance_loss_mlp": 0.0126187, + "epoch": 0.38406733804298815, + "flos": 28225180310400.0, + "grad_norm": 2.2869557055676095, + "language_loss": 0.81707162, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89430165, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13989258, + "step": 6388, + "time_per_iteration": 2.6592257022857666 + }, + { + "auxiliary_loss_clip": 0.06454118, + "auxiliary_loss_mlp": 0.01267752, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01253888, + "epoch": 0.3841274612956561, + "flos": 18229511848320.0, + "grad_norm": 1.6912658906890043, + "language_loss": 0.76762819, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84484684, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.13873291, + "step": 6389, + "time_per_iteration": 2.5315403938293457 + }, + { + "auxiliary_loss_clip": 0.06454799, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06281878, + "balance_loss_mlp": 0.01254847, + "epoch": 0.3841875845483241, + "flos": 19799138856960.0, + "grad_norm": 1.6723623276481432, + "language_loss": 0.76991975, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.84717548, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15905762, + "step": 6390, + "time_per_iteration": 2.5315029621124268 + }, + { + "auxiliary_loss_clip": 0.0646126, + "auxiliary_loss_mlp": 0.01271779, + "balance_loss_clip": 0.06286746, + "balance_loss_mlp": 0.01255572, + "epoch": 0.38424770780099204, + "flos": 29906670919680.0, + "grad_norm": 1.876202489708209, + "language_loss": 0.70321602, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1618042, + "step": 6391, + "time_per_iteration": 2.6110270023345947 + }, + { + "auxiliary_loss_clip": 0.06455616, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.06284156, + "balance_loss_mlp": 0.01258499, + "epoch": 0.38430783105366, + "flos": 20820013476480.0, + "grad_norm": 1.8135855175826887, + "language_loss": 0.83923954, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.91652524, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14459229, + "step": 6392, + "time_per_iteration": 2.5735576152801514 + }, + { + "auxiliary_loss_clip": 0.06461488, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01255162, + "epoch": 0.384367954306328, + "flos": 11003153627520.0, + "grad_norm": 1.9242234625767662, + "language_loss": 0.61454862, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.69185179, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13677979, + "step": 6393, + "time_per_iteration": 2.4626450538635254 + }, + { + "auxiliary_loss_clip": 0.06467697, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06290497, + "balance_loss_mlp": 0.01256071, + "epoch": 0.38442807755899594, + "flos": 25345096571520.0, + "grad_norm": 2.1306446802295325, + "language_loss": 0.71410203, + "learning_rate": 2.820736822421029e-06, + "loss": 0.79149896, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15905762, + "step": 6394, + "time_per_iteration": 2.5997071266174316 + }, + { + "auxiliary_loss_clip": 0.06463788, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0628664, + "balance_loss_mlp": 0.01254082, + "epoch": 0.3844882008116639, + "flos": 21076206935040.0, + "grad_norm": 1.9216116882295546, + "language_loss": 0.82087183, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.89820337, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1529541, + "step": 6395, + "time_per_iteration": 2.517411470413208 + }, + { + "auxiliary_loss_clip": 0.06460339, + "auxiliary_loss_mlp": 0.01275993, + "balance_loss_clip": 0.06287727, + "balance_loss_mlp": 0.01261831, + "epoch": 0.38454832406433187, + "flos": 17968287144960.0, + "grad_norm": 2.112818402600052, + "language_loss": 0.70801687, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.78538024, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14160156, + "step": 6396, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.06365301, + "auxiliary_loss_mlp": 0.01257609, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01253767, + "epoch": 0.38460844731699984, + "flos": 67946641925760.0, + "grad_norm": 0.873922952794391, + "language_loss": 0.59863293, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.67486203, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0383606, + "step": 6397, + "time_per_iteration": 3.206678628921509 + }, + { + "auxiliary_loss_clip": 0.06450997, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06284742, + "balance_loss_mlp": 0.0126187, + "epoch": 0.3846685705696678, + "flos": 25856267604480.0, + "grad_norm": 1.772406293141946, + "language_loss": 0.85227352, + "learning_rate": 2.819315942271794e-06, + "loss": 0.92954701, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14477539, + "step": 6398, + "time_per_iteration": 2.5761947631835938 + }, + { + "auxiliary_loss_clip": 0.06453151, + "auxiliary_loss_mlp": 0.01277177, + "balance_loss_clip": 0.06285614, + "balance_loss_mlp": 0.01262467, + "epoch": 0.38472869382233577, + "flos": 16295852776320.0, + "grad_norm": 2.386881726324987, + "language_loss": 0.80489028, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.88219357, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14715576, + "step": 6399, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.06455526, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06283697, + "balance_loss_mlp": 0.01263592, + "epoch": 0.38478881707500373, + "flos": 19358979759360.0, + "grad_norm": 1.8772073039605681, + "language_loss": 0.67565721, + "learning_rate": 2.818605315732038e-06, + "loss": 0.75300437, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15588379, + "step": 6400, + "time_per_iteration": 2.5162830352783203 + }, + { + "auxiliary_loss_clip": 0.06460319, + "auxiliary_loss_mlp": 0.01269914, + "balance_loss_clip": 0.06288355, + "balance_loss_mlp": 0.01255454, + "epoch": 0.38484894032767175, + "flos": 24867356117760.0, + "grad_norm": 1.6933093627789975, + "language_loss": 0.7382642, + "learning_rate": 2.81824995589303e-06, + "loss": 0.81556654, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14459229, + "step": 6401, + "time_per_iteration": 2.5274739265441895 + }, + { + "auxiliary_loss_clip": 0.06457724, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 0.06285743, + "balance_loss_mlp": 0.01262296, + "epoch": 0.3849090635803397, + "flos": 14507068613760.0, + "grad_norm": 1.836175131611194, + "language_loss": 0.72368169, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.80103827, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15637207, + "step": 6402, + "time_per_iteration": 2.509624481201172 + }, + { + "auxiliary_loss_clip": 0.06455728, + "auxiliary_loss_mlp": 0.01275333, + "balance_loss_clip": 0.06288305, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3849691868330077, + "flos": 18521903070720.0, + "grad_norm": 1.8063322577059318, + "language_loss": 0.83321881, + "learning_rate": 2.817539143144128e-06, + "loss": 0.91052943, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14147949, + "step": 6403, + "time_per_iteration": 2.469576835632324 + }, + { + "auxiliary_loss_clip": 0.06451748, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06283461, + "balance_loss_mlp": 0.01259813, + "epoch": 0.38502931008567565, + "flos": 21622821045120.0, + "grad_norm": 1.901744090638215, + "language_loss": 0.83685166, + "learning_rate": 2.817183690261189e-06, + "loss": 0.91411054, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14331055, + "step": 6404, + "time_per_iteration": 2.53399920463562 + }, + { + "auxiliary_loss_clip": 0.06460617, + "auxiliary_loss_mlp": 0.01279935, + "balance_loss_clip": 0.06287636, + "balance_loss_mlp": 0.01265844, + "epoch": 0.3850894333383436, + "flos": 25423152249600.0, + "grad_norm": 1.4804001380923333, + "language_loss": 0.70053053, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77793604, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14105225, + "step": 6405, + "time_per_iteration": 2.577394485473633 + }, + { + "auxiliary_loss_clip": 0.06446706, + "auxiliary_loss_mlp": 0.01276604, + "balance_loss_clip": 0.06280848, + "balance_loss_mlp": 0.01263628, + "epoch": 0.3851495565910116, + "flos": 20233721658240.0, + "grad_norm": 1.9002503642999313, + "language_loss": 0.7926501, + "learning_rate": 2.816472691545729e-06, + "loss": 0.86988324, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12976074, + "step": 6406, + "time_per_iteration": 2.491785764694214 + }, + { + "auxiliary_loss_clip": 0.06454885, + "auxiliary_loss_mlp": 0.01271692, + "balance_loss_clip": 0.06282916, + "balance_loss_mlp": 0.01256516, + "epoch": 0.38520967984367954, + "flos": 16514045608320.0, + "grad_norm": 2.2453520034380463, + "language_loss": 0.84628403, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.92354977, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1517334, + "step": 6407, + "time_per_iteration": 2.461927890777588 + }, + { + "auxiliary_loss_clip": 0.06351051, + "auxiliary_loss_mlp": 0.01274061, + "balance_loss_clip": 0.06273395, + "balance_loss_mlp": 0.01270625, + "epoch": 0.3852698030963475, + "flos": 61333088140800.0, + "grad_norm": 0.7518927461814024, + "language_loss": 0.64829391, + "learning_rate": 2.815761568987365e-06, + "loss": 0.72454506, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03445435, + "step": 6408, + "time_per_iteration": 3.195535659790039 + }, + { + "auxiliary_loss_clip": 0.06454469, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06283102, + "balance_loss_mlp": 0.01256383, + "epoch": 0.3853299263490155, + "flos": 22899595633920.0, + "grad_norm": 1.3862214198415879, + "language_loss": 0.73785079, + "learning_rate": 2.8154059613008e-06, + "loss": 0.8151083, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14904785, + "step": 6409, + "time_per_iteration": 2.5463829040527344 + }, + { + "auxiliary_loss_clip": 0.06465833, + "auxiliary_loss_mlp": 0.01272782, + "balance_loss_clip": 0.06287792, + "balance_loss_mlp": 0.01257667, + "epoch": 0.38539004960168344, + "flos": 20053655233920.0, + "grad_norm": 2.2638026574615076, + "language_loss": 0.70597708, + "learning_rate": 2.81505032269396e-06, + "loss": 0.78336322, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15100098, + "step": 6410, + "time_per_iteration": 2.4989383220672607 + }, + { + "auxiliary_loss_clip": 0.06347367, + "auxiliary_loss_mlp": 0.01259072, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01255689, + "epoch": 0.3854501728543514, + "flos": 68752971365760.0, + "grad_norm": 0.6472142759451909, + "language_loss": 0.6009953, + "learning_rate": 2.81469465318033e-06, + "loss": 0.67705965, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03390503, + "step": 6411, + "time_per_iteration": 3.221977472305298 + }, + { + "auxiliary_loss_clip": 0.06456396, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06285078, + "balance_loss_mlp": 0.01257266, + "epoch": 0.38551029610701937, + "flos": 20491214855040.0, + "grad_norm": 1.7976443608036217, + "language_loss": 0.78197634, + "learning_rate": 2.814338952773397e-06, + "loss": 0.85925543, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.14245605, + "step": 6412, + "time_per_iteration": 2.5103437900543213 + }, + { + "auxiliary_loss_clip": 0.06460511, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 0.06287103, + "balance_loss_mlp": 0.01255267, + "epoch": 0.38557041935968733, + "flos": 23477627825280.0, + "grad_norm": 1.8586112834781277, + "language_loss": 0.78031844, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.85764652, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.17041016, + "step": 6413, + "time_per_iteration": 3.933619499206543 + }, + { + "auxiliary_loss_clip": 0.06342902, + "auxiliary_loss_mlp": 0.01258937, + "balance_loss_clip": 0.06265719, + "balance_loss_mlp": 0.01255421, + "epoch": 0.38563054261235535, + "flos": 63984623068800.0, + "grad_norm": 0.7920557210391271, + "language_loss": 0.61310911, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6891275, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03527832, + "step": 6414, + "time_per_iteration": 3.063016891479492 + }, + { + "auxiliary_loss_clip": 0.06460327, + "auxiliary_loss_mlp": 0.0126994, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01255552, + "epoch": 0.3856906658650233, + "flos": 23994584789760.0, + "grad_norm": 1.981122511442252, + "language_loss": 0.78303337, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.86033607, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14379883, + "step": 6415, + "time_per_iteration": 3.915883779525757 + }, + { + "auxiliary_loss_clip": 0.06448652, + "auxiliary_loss_mlp": 0.0126708, + "balance_loss_clip": 0.06285002, + "balance_loss_mlp": 0.01253842, + "epoch": 0.3857507891176913, + "flos": 25014075816960.0, + "grad_norm": 1.7132059772930233, + "language_loss": 0.8030045, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.88016176, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13244629, + "step": 6416, + "time_per_iteration": 2.5699849128723145 + }, + { + "auxiliary_loss_clip": 0.06451176, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01256353, + "epoch": 0.38581091237035925, + "flos": 21542082036480.0, + "grad_norm": 1.7425936217489657, + "language_loss": 0.79650658, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.87372106, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13909912, + "step": 6417, + "time_per_iteration": 2.490114450454712 + }, + { + "auxiliary_loss_clip": 0.06448381, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01252602, + "epoch": 0.3858710356230272, + "flos": 17389584120960.0, + "grad_norm": 1.6880082960892822, + "language_loss": 0.80518526, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.88233447, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13922119, + "step": 6418, + "time_per_iteration": 2.5246312618255615 + }, + { + "auxiliary_loss_clip": 0.06443715, + "auxiliary_loss_mlp": 0.01268216, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01254662, + "epoch": 0.3859311588756952, + "flos": 20345836821120.0, + "grad_norm": 1.685120659988575, + "language_loss": 0.79909503, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.87621439, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13568115, + "step": 6419, + "time_per_iteration": 3.9288835525512695 + }, + { + "auxiliary_loss_clip": 0.06446663, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06280138, + "balance_loss_mlp": 0.01254745, + "epoch": 0.38599128212836314, + "flos": 26328054418560.0, + "grad_norm": 1.9252922162684358, + "language_loss": 0.67831242, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.75548029, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15362549, + "step": 6420, + "time_per_iteration": 2.5568132400512695 + }, + { + "auxiliary_loss_clip": 0.06447464, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06282772, + "balance_loss_mlp": 0.01260267, + "epoch": 0.3860514053810311, + "flos": 13559050719360.0, + "grad_norm": 1.8138727093850848, + "language_loss": 0.81903851, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89625287, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13690186, + "step": 6421, + "time_per_iteration": 2.6095190048217773 + }, + { + "auxiliary_loss_clip": 0.06448883, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01254654, + "epoch": 0.3861115286336991, + "flos": 20959689432960.0, + "grad_norm": 1.9472147710185277, + "language_loss": 0.72463268, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.80182374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15576172, + "step": 6422, + "time_per_iteration": 3.9032654762268066 + }, + { + "auxiliary_loss_clip": 0.06443937, + "auxiliary_loss_mlp": 0.01268443, + "balance_loss_clip": 0.06280221, + "balance_loss_mlp": 0.01254925, + "epoch": 0.38617165188636704, + "flos": 16368290231040.0, + "grad_norm": 1.6312257254810183, + "language_loss": 0.66935605, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.74647987, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13531494, + "step": 6423, + "time_per_iteration": 2.4858603477478027 + }, + { + "auxiliary_loss_clip": 0.06452656, + "auxiliary_loss_mlp": 0.01269446, + "balance_loss_clip": 0.06281117, + "balance_loss_mlp": 0.01254771, + "epoch": 0.386231775139035, + "flos": 34795828005120.0, + "grad_norm": 1.7836916741722195, + "language_loss": 0.69448572, + "learning_rate": 2.810068143123449e-06, + "loss": 0.77170676, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14685059, + "step": 6424, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.06446116, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 0.0628031, + "balance_loss_mlp": 0.0125616, + "epoch": 0.38629189839170297, + "flos": 21732672147840.0, + "grad_norm": 1.4876753960050375, + "language_loss": 0.72829968, + "learning_rate": 2.809712042331429e-06, + "loss": 0.80545902, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13677979, + "step": 6425, + "time_per_iteration": 2.520872116088867 + }, + { + "auxiliary_loss_clip": 0.06454374, + "auxiliary_loss_mlp": 0.01269159, + "balance_loss_clip": 0.06279134, + "balance_loss_mlp": 0.01254383, + "epoch": 0.38635202164437094, + "flos": 27930315392640.0, + "grad_norm": 3.253764220801107, + "language_loss": 0.8113848, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88862014, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14752197, + "step": 6426, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.06458677, + "auxiliary_loss_mlp": 0.01277199, + "balance_loss_clip": 0.06288534, + "balance_loss_mlp": 0.01261797, + "epoch": 0.38641214489703896, + "flos": 23593390640640.0, + "grad_norm": 1.9966810796758758, + "language_loss": 0.75299263, + "learning_rate": 2.80899974864781e-06, + "loss": 0.83035141, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15393066, + "step": 6427, + "time_per_iteration": 2.538494825363159 + }, + { + "auxiliary_loss_clip": 0.06449243, + "auxiliary_loss_mlp": 0.01269948, + "balance_loss_clip": 0.0627961, + "balance_loss_mlp": 0.01255512, + "epoch": 0.3864722681497069, + "flos": 12646224339840.0, + "grad_norm": 1.7399599530073546, + "language_loss": 0.70451963, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78171146, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 6428, + "time_per_iteration": 2.501620292663574 + }, + { + "auxiliary_loss_clip": 0.06450263, + "auxiliary_loss_mlp": 0.01273584, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.01259517, + "epoch": 0.3865323914023749, + "flos": 17604003519360.0, + "grad_norm": 1.9791686977360912, + "language_loss": 0.84605539, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.92329377, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14074707, + "step": 6429, + "time_per_iteration": 2.4769797325134277 + }, + { + "auxiliary_loss_clip": 0.06453393, + "auxiliary_loss_mlp": 0.01272687, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01258679, + "epoch": 0.38659251465504285, + "flos": 18484908693120.0, + "grad_norm": 1.8799663311521415, + "language_loss": 0.81149292, + "learning_rate": 2.807931078076015e-06, + "loss": 0.88875371, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13995361, + "step": 6430, + "time_per_iteration": 2.552243232727051 + }, + { + "auxiliary_loss_clip": 0.06342202, + "auxiliary_loss_mlp": 0.0126596, + "balance_loss_clip": 0.06266356, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3866526379077108, + "flos": 64186533480960.0, + "grad_norm": 0.7018569193916078, + "language_loss": 0.58841789, + "learning_rate": 2.807574793260416e-06, + "loss": 0.66449958, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03408813, + "step": 6431, + "time_per_iteration": 3.1865365505218506 + }, + { + "auxiliary_loss_clip": 0.06457522, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06283836, + "balance_loss_mlp": 0.01253464, + "epoch": 0.3867127611603788, + "flos": 14392857098880.0, + "grad_norm": 1.8389423140015868, + "language_loss": 0.79719216, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.87445116, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14910889, + "step": 6432, + "time_per_iteration": 2.5060834884643555 + }, + { + "auxiliary_loss_clip": 0.06456694, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 0.06279335, + "balance_loss_mlp": 0.01259217, + "epoch": 0.38677288441304675, + "flos": 20016870491520.0, + "grad_norm": 2.041684818915054, + "language_loss": 0.80982423, + "learning_rate": 2.806862131772779e-06, + "loss": 0.88713682, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15350342, + "step": 6433, + "time_per_iteration": 2.4978644847869873 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01268045, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01251725, + "epoch": 0.3868330076657147, + "flos": 22243465837440.0, + "grad_norm": 1.5518308416482827, + "language_loss": 0.71316475, + "learning_rate": 2.806505755127765e-06, + "loss": 0.79036534, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.16308594, + "step": 6434, + "time_per_iteration": 2.5623676776885986 + }, + { + "auxiliary_loss_clip": 0.06457677, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01254547, + "epoch": 0.3868931309183827, + "flos": 16733076981120.0, + "grad_norm": 1.5292505515468358, + "language_loss": 0.77740347, + "learning_rate": 2.806149347899972e-06, + "loss": 0.85467923, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15350342, + "step": 6435, + "time_per_iteration": 2.4930777549743652 + }, + { + "auxiliary_loss_clip": 0.06446007, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 0.0627854, + "balance_loss_mlp": 0.01257594, + "epoch": 0.38695325417105064, + "flos": 22681360874880.0, + "grad_norm": 2.334489182765127, + "language_loss": 0.79902756, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87621707, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15362549, + "step": 6436, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.06446151, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628051, + "balance_loss_mlp": 0.01255312, + "epoch": 0.3870133774237186, + "flos": 23118668933760.0, + "grad_norm": 1.736913277816888, + "language_loss": 0.77232099, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.84947503, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13934326, + "step": 6437, + "time_per_iteration": 2.6555299758911133 + }, + { + "auxiliary_loss_clip": 0.064465, + "auxiliary_loss_mlp": 0.01272869, + "balance_loss_clip": 0.06279578, + "balance_loss_mlp": 0.01259422, + "epoch": 0.3870735006763866, + "flos": 17681430291840.0, + "grad_norm": 2.573442514460841, + "language_loss": 0.81961322, + "learning_rate": 2.805079942855074e-06, + "loss": 0.89680696, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13452148, + "step": 6438, + "time_per_iteration": 2.55658221244812 + }, + { + "auxiliary_loss_clip": 0.06449786, + "auxiliary_loss_mlp": 0.01268651, + "balance_loss_clip": 0.06278464, + "balance_loss_mlp": 0.01253869, + "epoch": 0.38713362392905454, + "flos": 23302676499840.0, + "grad_norm": 1.3535213690135137, + "language_loss": 0.75684851, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83403289, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14782715, + "step": 6439, + "time_per_iteration": 2.5023999214172363 + }, + { + "auxiliary_loss_clip": 0.06452194, + "auxiliary_loss_mlp": 0.01275332, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.0126083, + "epoch": 0.38719374718172256, + "flos": 21037283913600.0, + "grad_norm": 2.8624272787557556, + "language_loss": 0.74227071, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81954598, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1449585, + "step": 6440, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.06454886, + "auxiliary_loss_mlp": 0.01272767, + "balance_loss_clip": 0.06279822, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3872538704343905, + "flos": 19615885977600.0, + "grad_norm": 1.8472167429080706, + "language_loss": 0.82205182, + "learning_rate": 2.804010263051774e-06, + "loss": 0.89932835, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15368652, + "step": 6441, + "time_per_iteration": 2.4829154014587402 + }, + { + "auxiliary_loss_clip": 0.06449816, + "auxiliary_loss_mlp": 0.01273448, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01258833, + "epoch": 0.3873139936870585, + "flos": 17535800695680.0, + "grad_norm": 2.061540845511299, + "language_loss": 0.80687004, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8841027, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14593506, + "step": 6442, + "time_per_iteration": 2.5348403453826904 + }, + { + "auxiliary_loss_clip": 0.0645024, + "auxiliary_loss_mlp": 0.01274941, + "balance_loss_clip": 0.0628161, + "balance_loss_mlp": 0.01260302, + "epoch": 0.38737411693972645, + "flos": 17792539205760.0, + "grad_norm": 1.5850563005203315, + "language_loss": 0.84242606, + "learning_rate": 2.803296990719624e-06, + "loss": 0.91967785, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14642334, + "step": 6443, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.06346577, + "auxiliary_loss_mlp": 0.01257136, + "balance_loss_clip": 0.06270638, + "balance_loss_mlp": 0.01253804, + "epoch": 0.3874342401923944, + "flos": 58320554624640.0, + "grad_norm": 0.7460963165264183, + "language_loss": 0.5025984, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.57863545, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03338623, + "step": 6444, + "time_per_iteration": 3.146993398666382 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01267857, + "balance_loss_clip": 0.0627708, + "balance_loss_mlp": 0.01254088, + "epoch": 0.3874943634450624, + "flos": 17717628055680.0, + "grad_norm": 1.4103476418524727, + "language_loss": 0.79081571, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86789179, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 6445, + "time_per_iteration": 2.4769954681396484 + }, + { + "auxiliary_loss_clip": 0.06442489, + "auxiliary_loss_mlp": 0.01275349, + "balance_loss_clip": 0.06277544, + "balance_loss_mlp": 0.01261497, + "epoch": 0.38755448669773035, + "flos": 19250889592320.0, + "grad_norm": 1.890349589911811, + "language_loss": 0.81530821, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.89248657, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13861084, + "step": 6446, + "time_per_iteration": 2.5224525928497314 + }, + { + "auxiliary_loss_clip": 0.06442682, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01262489, + "epoch": 0.3876146099503983, + "flos": 20600437052160.0, + "grad_norm": 2.019397578580159, + "language_loss": 0.77555805, + "learning_rate": 2.801870080630306e-06, + "loss": 0.85275555, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14575195, + "step": 6447, + "time_per_iteration": 2.4808783531188965 + }, + { + "auxiliary_loss_clip": 0.06441282, + "auxiliary_loss_mlp": 0.01273458, + "balance_loss_clip": 0.06277911, + "balance_loss_mlp": 0.01259355, + "epoch": 0.3876747332030663, + "flos": 19287129283200.0, + "grad_norm": 1.5926200346390118, + "language_loss": 0.76299512, + "learning_rate": 2.801513277056671e-06, + "loss": 0.84014249, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14099121, + "step": 6448, + "time_per_iteration": 2.532101631164551 + }, + { + "auxiliary_loss_clip": 0.06445228, + "auxiliary_loss_mlp": 0.01276025, + "balance_loss_clip": 0.06280892, + "balance_loss_mlp": 0.01262363, + "epoch": 0.38773485645573424, + "flos": 18950699940480.0, + "grad_norm": 1.5288018173805344, + "language_loss": 0.76734072, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.84455323, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13647461, + "step": 6449, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.06448871, + "auxiliary_loss_mlp": 0.01273884, + "balance_loss_clip": 0.0627744, + "balance_loss_mlp": 0.01258673, + "epoch": 0.3877949797084022, + "flos": 23077272216960.0, + "grad_norm": 1.7542495709483765, + "language_loss": 0.78832948, + "learning_rate": 2.800799578742542e-06, + "loss": 0.86555696, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15209961, + "step": 6450, + "time_per_iteration": 2.5662050247192383 + }, + { + "auxiliary_loss_clip": 0.06452119, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.06276712, + "balance_loss_mlp": 0.01261317, + "epoch": 0.3878551029610702, + "flos": 29103150591360.0, + "grad_norm": 2.1638461576043095, + "language_loss": 0.78188771, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8591727, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.15063477, + "step": 6451, + "time_per_iteration": 2.5734686851501465 + }, + { + "auxiliary_loss_clip": 0.06442447, + "auxiliary_loss_mlp": 0.01277813, + "balance_loss_clip": 0.06278168, + "balance_loss_mlp": 0.01263967, + "epoch": 0.38791522621373814, + "flos": 21002763231360.0, + "grad_norm": 1.7745661107883532, + "language_loss": 0.76657486, + "learning_rate": 2.800085758962812e-06, + "loss": 0.84377748, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13842773, + "step": 6452, + "time_per_iteration": 4.083965301513672 + }, + { + "auxiliary_loss_clip": 0.06445795, + "auxiliary_loss_mlp": 0.01272941, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01258457, + "epoch": 0.3879753494664061, + "flos": 15492248593920.0, + "grad_norm": 1.5775897118958155, + "language_loss": 0.80075014, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87793756, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14483643, + "step": 6453, + "time_per_iteration": 2.5186924934387207 + }, + { + "auxiliary_loss_clip": 0.06452494, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06277925, + "balance_loss_mlp": 0.01258472, + "epoch": 0.3880354727190741, + "flos": 22060422593280.0, + "grad_norm": 1.7271767654368522, + "language_loss": 0.71748114, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.79473794, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14697266, + "step": 6454, + "time_per_iteration": 2.516023635864258 + }, + { + "auxiliary_loss_clip": 0.0645522, + "auxiliary_loss_mlp": 0.01280556, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01263986, + "epoch": 0.3880955959717421, + "flos": 20346675361920.0, + "grad_norm": 2.0562500360548452, + "language_loss": 0.77941358, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.85677135, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.16577148, + "step": 6455, + "time_per_iteration": 3.9251530170440674 + }, + { + "auxiliary_loss_clip": 0.0644723, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 0.062791, + "balance_loss_mlp": 0.01257804, + "epoch": 0.38815571922441006, + "flos": 23082009972480.0, + "grad_norm": 1.5355571660803105, + "language_loss": 0.76081556, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14196777, + "step": 6456, + "time_per_iteration": 2.5377979278564453 + }, + { + "auxiliary_loss_clip": 0.064498, + "auxiliary_loss_mlp": 0.01279611, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01264811, + "epoch": 0.388215842477078, + "flos": 20783186807040.0, + "grad_norm": 2.2521174172947838, + "language_loss": 0.60975528, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.68704933, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14801025, + "step": 6457, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.06447765, + "auxiliary_loss_mlp": 0.01274853, + "balance_loss_clip": 0.06275971, + "balance_loss_mlp": 0.01259308, + "epoch": 0.388275965729746, + "flos": 20454304331520.0, + "grad_norm": 3.4499577756661384, + "language_loss": 0.80527538, + "learning_rate": 2.797943571912841e-06, + "loss": 0.88250154, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15551758, + "step": 6458, + "time_per_iteration": 2.5349881649017334 + }, + { + "auxiliary_loss_clip": 0.06448271, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06278434, + "balance_loss_mlp": 0.0125938, + "epoch": 0.38833608898241395, + "flos": 27899945487360.0, + "grad_norm": 3.532155031934189, + "language_loss": 0.8156774, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89290321, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14941406, + "step": 6459, + "time_per_iteration": 4.015187978744507 + }, + { + "auxiliary_loss_clip": 0.0644253, + "auxiliary_loss_mlp": 0.01277266, + "balance_loss_clip": 0.06278129, + "balance_loss_mlp": 0.01263789, + "epoch": 0.3883962122350819, + "flos": 18082079389440.0, + "grad_norm": 1.6405749509561738, + "language_loss": 0.62564123, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.7028392, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13470459, + "step": 6460, + "time_per_iteration": 2.497053861618042 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01273315, + "balance_loss_clip": 0.06277992, + "balance_loss_mlp": 0.01259374, + "epoch": 0.3884563354877499, + "flos": 23628875644800.0, + "grad_norm": 1.560750838950793, + "language_loss": 0.86785483, + "learning_rate": 2.796872069720717e-06, + "loss": 0.94503951, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1394043, + "step": 6461, + "time_per_iteration": 2.5308427810668945 + }, + { + "auxiliary_loss_clip": 0.06442384, + "auxiliary_loss_mlp": 0.01273139, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01258369, + "epoch": 0.38851645874041785, + "flos": 27460834565760.0, + "grad_norm": 2.5738865735247285, + "language_loss": 0.71770304, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.79485828, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14782715, + "step": 6462, + "time_per_iteration": 3.942819833755493 + }, + { + "auxiliary_loss_clip": 0.06442184, + "auxiliary_loss_mlp": 0.01271045, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256036, + "epoch": 0.3885765819930858, + "flos": 25235035833600.0, + "grad_norm": 2.2250707690072886, + "language_loss": 0.76693827, + "learning_rate": 2.796157583816052e-06, + "loss": 0.84407055, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15014648, + "step": 6463, + "time_per_iteration": 2.577254056930542 + }, + { + "auxiliary_loss_clip": 0.06458563, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01259441, + "epoch": 0.3886367052457538, + "flos": 16952317989120.0, + "grad_norm": 2.5235079856597196, + "language_loss": 0.70838499, + "learning_rate": 2.795800295571382e-06, + "loss": 0.78572428, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15930176, + "step": 6464, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.06442419, + "auxiliary_loss_mlp": 0.01270994, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01255699, + "epoch": 0.38869682849842174, + "flos": 27160141789440.0, + "grad_norm": 1.8571499226781363, + "language_loss": 0.69473737, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77187151, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.15301514, + "step": 6465, + "time_per_iteration": 2.6060595512390137 + }, + { + "auxiliary_loss_clip": 0.06446355, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06276145, + "balance_loss_mlp": 0.01257271, + "epoch": 0.3887569517510897, + "flos": 21069037411200.0, + "grad_norm": 2.3078416168388243, + "language_loss": 0.78628361, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.86347771, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.15771484, + "step": 6466, + "time_per_iteration": 2.503218650817871 + }, + { + "auxiliary_loss_clip": 0.06447446, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01255, + "epoch": 0.38881707500375773, + "flos": 29505141354240.0, + "grad_norm": 1.7748655394270907, + "language_loss": 0.695912, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77307892, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1427002, + "step": 6467, + "time_per_iteration": 2.6156952381134033 + }, + { + "auxiliary_loss_clip": 0.0644877, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277345, + "balance_loss_mlp": 0.01255403, + "epoch": 0.3888771982564257, + "flos": 17493146167680.0, + "grad_norm": 2.2278384059050285, + "language_loss": 0.83988351, + "learning_rate": 2.794370840959936e-06, + "loss": 0.91706932, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14404297, + "step": 6468, + "time_per_iteration": 2.446979522705078 + }, + { + "auxiliary_loss_clip": 0.0644114, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06273733, + "balance_loss_mlp": 0.01254628, + "epoch": 0.38893732150909366, + "flos": 21948517065600.0, + "grad_norm": 2.4269891965149837, + "language_loss": 0.84667963, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92377871, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14141846, + "step": 6469, + "time_per_iteration": 2.6123251914978027 + }, + { + "auxiliary_loss_clip": 0.06445388, + "auxiliary_loss_mlp": 0.01267071, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01252575, + "epoch": 0.3889974447617616, + "flos": 24282657527040.0, + "grad_norm": 1.7885497899924685, + "language_loss": 0.75114912, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82827377, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 6470, + "time_per_iteration": 2.5293121337890625 + }, + { + "auxiliary_loss_clip": 0.06447375, + "auxiliary_loss_mlp": 0.01272376, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01257785, + "epoch": 0.3890575680144296, + "flos": 25674356390400.0, + "grad_norm": 2.975621998510204, + "language_loss": 0.75126278, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.8284604, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14575195, + "step": 6471, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.0644885, + "auxiliary_loss_mlp": 0.01268799, + "balance_loss_clip": 0.06277963, + "balance_loss_mlp": 0.01254291, + "epoch": 0.38911769126709755, + "flos": 22861636934400.0, + "grad_norm": 1.6871762941495017, + "language_loss": 0.68158531, + "learning_rate": 2.792940904386562e-06, + "loss": 0.75876176, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1451416, + "step": 6472, + "time_per_iteration": 2.5192203521728516 + }, + { + "auxiliary_loss_clip": 0.06449802, + "auxiliary_loss_mlp": 0.01271384, + "balance_loss_clip": 0.06278318, + "balance_loss_mlp": 0.01256739, + "epoch": 0.3891778145197655, + "flos": 25454612257920.0, + "grad_norm": 1.6537492711017865, + "language_loss": 0.76761287, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84482473, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14654541, + "step": 6473, + "time_per_iteration": 2.588179349899292 + }, + { + "auxiliary_loss_clip": 0.06451473, + "auxiliary_loss_mlp": 0.01269072, + "balance_loss_clip": 0.0627984, + "balance_loss_mlp": 0.01254803, + "epoch": 0.3892379377724335, + "flos": 14033227374720.0, + "grad_norm": 1.8453216957475485, + "language_loss": 0.71886337, + "learning_rate": 2.792225755635257e-06, + "loss": 0.79606879, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1427002, + "step": 6474, + "time_per_iteration": 2.5054657459259033 + }, + { + "auxiliary_loss_clip": 0.06452703, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.06280853, + "balance_loss_mlp": 0.01252945, + "epoch": 0.38929806102510145, + "flos": 20163715971840.0, + "grad_norm": 1.4152146042292184, + "language_loss": 0.68943882, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76663172, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1362915, + "step": 6475, + "time_per_iteration": 2.5646328926086426 + }, + { + "auxiliary_loss_clip": 0.06459899, + "auxiliary_loss_mlp": 0.01272247, + "balance_loss_clip": 0.06281739, + "balance_loss_mlp": 0.01257107, + "epoch": 0.3893581842777694, + "flos": 22170525258240.0, + "grad_norm": 1.7897820076570896, + "language_loss": 0.75474584, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15142822, + "step": 6476, + "time_per_iteration": 2.515145778656006 + }, + { + "auxiliary_loss_clip": 0.06356712, + "auxiliary_loss_mlp": 0.01262119, + "balance_loss_clip": 0.06275933, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3894183075304374, + "flos": 67322936459520.0, + "grad_norm": 0.7612569916112396, + "language_loss": 0.58157814, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.65776634, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.0276947, + "step": 6477, + "time_per_iteration": 3.147226572036743 + }, + { + "auxiliary_loss_clip": 0.06461065, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 0.06287047, + "balance_loss_mlp": 0.01258711, + "epoch": 0.38947843078310534, + "flos": 18552734173440.0, + "grad_norm": 2.207057593016708, + "language_loss": 0.77832031, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.85566759, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14953613, + "step": 6478, + "time_per_iteration": 2.5238850116729736 + }, + { + "auxiliary_loss_clip": 0.06450923, + "auxiliary_loss_mlp": 0.01273895, + "balance_loss_clip": 0.06281843, + "balance_loss_mlp": 0.01260162, + "epoch": 0.3895385540357733, + "flos": 14610253317120.0, + "grad_norm": 2.187508322407885, + "language_loss": 0.83306336, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6479, + "time_per_iteration": 2.5355920791625977 + }, + { + "auxiliary_loss_clip": 0.06451993, + "auxiliary_loss_mlp": 0.0126931, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.01254414, + "epoch": 0.38959867728844133, + "flos": 19981469341440.0, + "grad_norm": 1.7759645272954405, + "language_loss": 0.80297941, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8801924, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14892578, + "step": 6480, + "time_per_iteration": 2.51645565032959 + }, + { + "auxiliary_loss_clip": 0.06447603, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06278986, + "balance_loss_mlp": 0.01256924, + "epoch": 0.3896588005411093, + "flos": 22678342128000.0, + "grad_norm": 1.6438066173178132, + "language_loss": 0.83259583, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.90978175, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.140625, + "step": 6481, + "time_per_iteration": 2.542642116546631 + }, + { + "auxiliary_loss_clip": 0.06446713, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.0628217, + "balance_loss_mlp": 0.01257204, + "epoch": 0.38971892379377726, + "flos": 21002343960960.0, + "grad_norm": 1.5951406272778517, + "language_loss": 0.75640547, + "learning_rate": 2.789363960063863e-06, + "loss": 0.83357906, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13458252, + "step": 6482, + "time_per_iteration": 2.5500056743621826 + }, + { + "auxiliary_loss_clip": 0.06452929, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06281099, + "balance_loss_mlp": 0.01254853, + "epoch": 0.3897790470464452, + "flos": 22535060446080.0, + "grad_norm": 1.9197222218969183, + "language_loss": 0.78993875, + "learning_rate": 2.78900610077756e-06, + "loss": 0.86715591, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6483, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.06452915, + "auxiliary_loss_mlp": 0.01271475, + "balance_loss_clip": 0.06281908, + "balance_loss_mlp": 0.01256157, + "epoch": 0.3898391702991132, + "flos": 26216484307200.0, + "grad_norm": 1.4915682478636534, + "language_loss": 0.80430162, + "learning_rate": 2.788648211572067e-06, + "loss": 0.88154554, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6484, + "time_per_iteration": 2.582933187484741 + }, + { + "auxiliary_loss_clip": 0.06455952, + "auxiliary_loss_mlp": 0.01270999, + "balance_loss_clip": 0.06285131, + "balance_loss_mlp": 0.01255347, + "epoch": 0.38989929355178116, + "flos": 21071301471360.0, + "grad_norm": 1.959559170578303, + "language_loss": 0.7792083, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8564778, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15637207, + "step": 6485, + "time_per_iteration": 2.532944917678833 + }, + { + "auxiliary_loss_clip": 0.06453831, + "auxiliary_loss_mlp": 0.01268339, + "balance_loss_clip": 0.06280229, + "balance_loss_mlp": 0.01253444, + "epoch": 0.3899594168044491, + "flos": 25491229292160.0, + "grad_norm": 2.289645436499478, + "language_loss": 0.84979439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.92701602, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14898682, + "step": 6486, + "time_per_iteration": 2.5743820667266846 + }, + { + "auxiliary_loss_clip": 0.06453397, + "auxiliary_loss_mlp": 0.01267827, + "balance_loss_clip": 0.06278502, + "balance_loss_mlp": 0.01253141, + "epoch": 0.3900195400571171, + "flos": 31147415452800.0, + "grad_norm": 1.9273192838933928, + "language_loss": 0.85622168, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.93343389, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14672852, + "step": 6487, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.06449067, + "auxiliary_loss_mlp": 0.01273707, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01259121, + "epoch": 0.39007966330978505, + "flos": 20236111499520.0, + "grad_norm": 1.468779525903349, + "language_loss": 0.73436427, + "learning_rate": 2.787216355829633e-06, + "loss": 0.81159198, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14569092, + "step": 6488, + "time_per_iteration": 2.54925274848938 + }, + { + "auxiliary_loss_clip": 0.06455337, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06281433, + "balance_loss_mlp": 0.01255072, + "epoch": 0.390139786562453, + "flos": 22535353935360.0, + "grad_norm": 1.7339556546984902, + "language_loss": 0.68455738, + "learning_rate": 2.786858317231779e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 2.529337167739868 + }, + { + "auxiliary_loss_clip": 0.06445001, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01256079, + "epoch": 0.390199909815121, + "flos": 26440211508480.0, + "grad_norm": 1.5752653046558913, + "language_loss": 0.81221771, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88936543, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13690186, + "step": 6490, + "time_per_iteration": 2.580287218093872 + }, + { + "auxiliary_loss_clip": 0.06445351, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255784, + "epoch": 0.39026003306778895, + "flos": 17280278069760.0, + "grad_norm": 1.8612382479767444, + "language_loss": 0.89715946, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.97431856, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14782715, + "step": 6491, + "time_per_iteration": 2.476393461227417 + }, + { + "auxiliary_loss_clip": 0.06446734, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01256325, + "epoch": 0.3903201563204569, + "flos": 24539354110080.0, + "grad_norm": 1.7715634168525083, + "language_loss": 0.78570807, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.86288601, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 3.918022871017456 + }, + { + "auxiliary_loss_clip": 0.06448489, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06278895, + "balance_loss_mlp": 0.01255528, + "epoch": 0.39038027957312493, + "flos": 23774547168000.0, + "grad_norm": 1.9649032306705667, + "language_loss": 0.74995399, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.82713962, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14544678, + "step": 6493, + "time_per_iteration": 2.5337636470794678 + }, + { + "auxiliary_loss_clip": 0.06457585, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06280027, + "balance_loss_mlp": 0.0125341, + "epoch": 0.3904404028257929, + "flos": 14105832537600.0, + "grad_norm": 2.4323863844033498, + "language_loss": 0.76480663, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.84207416, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15771484, + "step": 6494, + "time_per_iteration": 3.9828202724456787 + }, + { + "auxiliary_loss_clip": 0.06461826, + "auxiliary_loss_mlp": 0.01272307, + "balance_loss_clip": 0.06279928, + "balance_loss_mlp": 0.01255582, + "epoch": 0.39050052607846086, + "flos": 16915742881920.0, + "grad_norm": 1.9306711407360488, + "language_loss": 0.74818373, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.82552505, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.16723633, + "step": 6495, + "time_per_iteration": 2.5104000568389893 + }, + { + "auxiliary_loss_clip": 0.06450078, + "auxiliary_loss_mlp": 0.01273142, + "balance_loss_clip": 0.06281738, + "balance_loss_mlp": 0.01257358, + "epoch": 0.39056064933112883, + "flos": 25921912878720.0, + "grad_norm": 2.748187950361319, + "language_loss": 0.68202364, + "learning_rate": 2.784351212350352e-06, + "loss": 0.75925589, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15783691, + "step": 6496, + "time_per_iteration": 2.550957202911377 + }, + { + "auxiliary_loss_clip": 0.0637021, + "auxiliary_loss_mlp": 0.01254222, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01251394, + "epoch": 0.3906207725837968, + "flos": 60046125281280.0, + "grad_norm": 0.6447698339715318, + "language_loss": 0.53706288, + "learning_rate": 2.783992935430775e-06, + "loss": 0.61330724, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02824402, + "step": 6497, + "time_per_iteration": 3.2988505363464355 + }, + { + "auxiliary_loss_clip": 0.06453034, + "auxiliary_loss_mlp": 0.01276113, + "balance_loss_clip": 0.06281406, + "balance_loss_mlp": 0.01261265, + "epoch": 0.39068089583646476, + "flos": 21074949123840.0, + "grad_norm": 2.0090604178847795, + "language_loss": 0.68947327, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.76676476, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14837646, + "step": 6498, + "time_per_iteration": 3.9722609519958496 + }, + { + "auxiliary_loss_clip": 0.06365327, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01252178, + "epoch": 0.3907410190891327, + "flos": 70468269897600.0, + "grad_norm": 0.719858085665683, + "language_loss": 0.51721394, + "learning_rate": 2.783276292417936e-06, + "loss": 0.59341711, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02807617, + "step": 6499, + "time_per_iteration": 3.209885835647583 + }, + { + "auxiliary_loss_clip": 0.06452541, + "auxiliary_loss_mlp": 0.01273785, + "balance_loss_clip": 0.06277416, + "balance_loss_mlp": 0.0125681, + "epoch": 0.3908011423418007, + "flos": 27969531903360.0, + "grad_norm": 1.5964691032272669, + "language_loss": 0.7347858, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81204903, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16992188, + "step": 6500, + "time_per_iteration": 2.5915534496307373 + }, + { + "auxiliary_loss_clip": 0.06456988, + "auxiliary_loss_mlp": 0.01269402, + "balance_loss_clip": 0.06284038, + "balance_loss_mlp": 0.01254728, + "epoch": 0.39086126559446865, + "flos": 24468971080320.0, + "grad_norm": 2.170342944486325, + "language_loss": 0.68858671, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7658506, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14691162, + "step": 6501, + "time_per_iteration": 3.948155164718628 + }, + { + "auxiliary_loss_clip": 0.06445958, + "auxiliary_loss_mlp": 0.01271431, + "balance_loss_clip": 0.06277448, + "balance_loss_mlp": 0.01256327, + "epoch": 0.3909213888471366, + "flos": 16946406276480.0, + "grad_norm": 1.631531331045391, + "language_loss": 0.78994954, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86712337, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15100098, + "step": 6502, + "time_per_iteration": 2.505021810531616 + }, + { + "auxiliary_loss_clip": 0.06451446, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.01259133, + "epoch": 0.3909815120998046, + "flos": 29286109981440.0, + "grad_norm": 4.8026818588998115, + "language_loss": 0.80286908, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.88011116, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13623047, + "step": 6503, + "time_per_iteration": 2.6041667461395264 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06278107, + "balance_loss_mlp": 0.01253574, + "epoch": 0.39104163535247255, + "flos": 18956947069440.0, + "grad_norm": 1.8714653526076386, + "language_loss": 0.71717298, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.79429626, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14379883, + "step": 6504, + "time_per_iteration": 2.499645471572876 + }, + { + "auxiliary_loss_clip": 0.06449269, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06279607, + "balance_loss_mlp": 0.0125379, + "epoch": 0.3911017586051405, + "flos": 26330611968000.0, + "grad_norm": 1.7094242767760466, + "language_loss": 0.83403468, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.91120219, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.137146, + "step": 6505, + "time_per_iteration": 2.5698060989379883 + }, + { + "auxiliary_loss_clip": 0.06447234, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06281015, + "balance_loss_mlp": 0.01253022, + "epoch": 0.3911618818578085, + "flos": 21842313615360.0, + "grad_norm": 2.3254017668705083, + "language_loss": 0.71427596, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.7914232, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14465332, + "step": 6506, + "time_per_iteration": 2.4988996982574463 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 0.0628104, + "balance_loss_mlp": 0.01258149, + "epoch": 0.3912220051104765, + "flos": 16364768359680.0, + "grad_norm": 2.639532414168514, + "language_loss": 0.75588799, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.83303547, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13348389, + "step": 6507, + "time_per_iteration": 2.506723403930664 + }, + { + "auxiliary_loss_clip": 0.06355534, + "auxiliary_loss_mlp": 0.01255368, + "balance_loss_clip": 0.0627788, + "balance_loss_mlp": 0.01252429, + "epoch": 0.39128212836314447, + "flos": 71071179552000.0, + "grad_norm": 0.751869236178363, + "language_loss": 0.56649405, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.64260316, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02935791, + "step": 6508, + "time_per_iteration": 3.282604455947876 + }, + { + "auxiliary_loss_clip": 0.06448714, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01254294, + "epoch": 0.39134225161581243, + "flos": 20336948288640.0, + "grad_norm": 1.8618605672003898, + "language_loss": 0.76758552, + "learning_rate": 2.779691297413471e-06, + "loss": 0.84475839, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14276123, + "step": 6509, + "time_per_iteration": 2.5330445766448975 + }, + { + "auxiliary_loss_clip": 0.0644654, + "auxiliary_loss_mlp": 0.01272023, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01256073, + "epoch": 0.3914023748684804, + "flos": 17023916903040.0, + "grad_norm": 3.0317271524647427, + "language_loss": 0.83418059, + "learning_rate": 2.779332635075825e-06, + "loss": 0.91136616, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1595459, + "step": 6510, + "time_per_iteration": 2.484149217605591 + }, + { + "auxiliary_loss_clip": 0.06450167, + "auxiliary_loss_mlp": 0.01268149, + "balance_loss_clip": 0.06277542, + "balance_loss_mlp": 0.01254463, + "epoch": 0.39146249812114836, + "flos": 18411045719040.0, + "grad_norm": 1.8343195842354416, + "language_loss": 0.77659726, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.85378045, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13684082, + "step": 6511, + "time_per_iteration": 2.493088722229004 + }, + { + "auxiliary_loss_clip": 0.06343137, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06266295, + "balance_loss_mlp": 0.01258513, + "epoch": 0.3915226213738163, + "flos": 67659659291520.0, + "grad_norm": 0.7080449531762238, + "language_loss": 0.57720256, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.65324628, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02726746, + "step": 6512, + "time_per_iteration": 3.217658042907715 + }, + { + "auxiliary_loss_clip": 0.06445479, + "auxiliary_loss_mlp": 0.01273045, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257452, + "epoch": 0.3915827446264843, + "flos": 26366516242560.0, + "grad_norm": 1.5252758876056967, + "language_loss": 0.69950658, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.77669179, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15600586, + "step": 6513, + "time_per_iteration": 2.560802936553955 + }, + { + "auxiliary_loss_clip": 0.06451759, + "auxiliary_loss_mlp": 0.01273121, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01258208, + "epoch": 0.39164286787915226, + "flos": 21950236074240.0, + "grad_norm": 2.7587511630204777, + "language_loss": 0.76322639, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.8404752, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14916992, + "step": 6514, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.0644438, + "auxiliary_loss_mlp": 0.01269565, + "balance_loss_clip": 0.06276566, + "balance_loss_mlp": 0.0125619, + "epoch": 0.3917029911318202, + "flos": 16405536170880.0, + "grad_norm": 1.811906351936664, + "language_loss": 0.782359, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.8594985, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13378906, + "step": 6515, + "time_per_iteration": 2.5104947090148926 + }, + { + "auxiliary_loss_clip": 0.06443886, + "auxiliary_loss_mlp": 0.01270163, + "balance_loss_clip": 0.06277545, + "balance_loss_mlp": 0.0125705, + "epoch": 0.3917631143844882, + "flos": 26218580659200.0, + "grad_norm": 1.4298617884300358, + "language_loss": 0.79790455, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.87504506, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13122559, + "step": 6516, + "time_per_iteration": 2.5912764072418213 + }, + { + "auxiliary_loss_clip": 0.06446922, + "auxiliary_loss_mlp": 0.0126951, + "balance_loss_clip": 0.06278265, + "balance_loss_mlp": 0.0125511, + "epoch": 0.39182323763715615, + "flos": 18553740422400.0, + "grad_norm": 1.8457537699229483, + "language_loss": 0.70234001, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7795043, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14404297, + "step": 6517, + "time_per_iteration": 2.630155324935913 + }, + { + "auxiliary_loss_clip": 0.06449963, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06279542, + "balance_loss_mlp": 0.01254905, + "epoch": 0.3918833608898241, + "flos": 34322112547200.0, + "grad_norm": 1.6944592538331644, + "language_loss": 0.72209281, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79928982, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1484375, + "step": 6518, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.06453219, + "auxiliary_loss_mlp": 0.0127268, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.0125751, + "epoch": 0.3919434841424921, + "flos": 36948434595840.0, + "grad_norm": 1.7409198797741048, + "language_loss": 0.62180024, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.69905925, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15179443, + "step": 6519, + "time_per_iteration": 2.6407580375671387 + }, + { + "auxiliary_loss_clip": 0.06457552, + "auxiliary_loss_mlp": 0.01269986, + "balance_loss_clip": 0.06280086, + "balance_loss_mlp": 0.01253535, + "epoch": 0.3920036073951601, + "flos": 23514915692160.0, + "grad_norm": 2.3243103288051485, + "language_loss": 0.6728406, + "learning_rate": 2.775744388563563e-06, + "loss": 0.75011599, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16442871, + "step": 6520, + "time_per_iteration": 2.557736396789551 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01272672, + "balance_loss_clip": 0.06281003, + "balance_loss_mlp": 0.0125845, + "epoch": 0.39206373064782807, + "flos": 18412051968000.0, + "grad_norm": 5.792319014223258, + "language_loss": 0.79119205, + "learning_rate": 2.775385401898104e-06, + "loss": 0.86843884, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14233398, + "step": 6521, + "time_per_iteration": 2.487144947052002 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06282392, + "balance_loss_mlp": 0.01255297, + "epoch": 0.39212385390049603, + "flos": 12318012696960.0, + "grad_norm": 2.63137671789129, + "language_loss": 0.70893902, + "learning_rate": 2.775026385829952e-06, + "loss": 0.78623831, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.16473389, + "step": 6522, + "time_per_iteration": 2.501777410507202 + }, + { + "auxiliary_loss_clip": 0.06455532, + "auxiliary_loss_mlp": 0.01272148, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.01257693, + "epoch": 0.392183977153164, + "flos": 19725275882880.0, + "grad_norm": 2.1277990565539087, + "language_loss": 0.77424598, + "learning_rate": 2.774667340372722e-06, + "loss": 0.8515228, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14453125, + "step": 6523, + "time_per_iteration": 2.494900941848755 + }, + { + "auxiliary_loss_clip": 0.0645543, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06282179, + "balance_loss_mlp": 0.01258769, + "epoch": 0.39224410040583196, + "flos": 33153092709120.0, + "grad_norm": 2.7826558407508855, + "language_loss": 0.62314886, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70043033, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13964844, + "step": 6524, + "time_per_iteration": 2.6380085945129395 + }, + { + "auxiliary_loss_clip": 0.06452876, + "auxiliary_loss_mlp": 0.01268165, + "balance_loss_clip": 0.06281661, + "balance_loss_mlp": 0.01252895, + "epoch": 0.39230422365849993, + "flos": 27789884749440.0, + "grad_norm": 1.7105729654368218, + "language_loss": 0.74638754, + "learning_rate": 2.773949161345489e-06, + "loss": 0.82359803, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15264893, + "step": 6525, + "time_per_iteration": 2.5430080890655518 + }, + { + "auxiliary_loss_clip": 0.06454577, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06280737, + "balance_loss_mlp": 0.01253863, + "epoch": 0.3923643469111679, + "flos": 17937497969280.0, + "grad_norm": 2.1060109606385673, + "language_loss": 0.8182255, + "learning_rate": 2.773590027802719e-06, + "loss": 0.89545369, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14367676, + "step": 6526, + "time_per_iteration": 2.4994354248046875 + }, + { + "auxiliary_loss_clip": 0.06454204, + "auxiliary_loss_mlp": 0.01269978, + "balance_loss_clip": 0.06281518, + "balance_loss_mlp": 0.01255482, + "epoch": 0.39242447016383586, + "flos": 24066141776640.0, + "grad_norm": 1.5927090967738864, + "language_loss": 0.70157206, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77881384, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14501953, + "step": 6527, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.06452368, + "auxiliary_loss_mlp": 0.01268854, + "balance_loss_clip": 0.06281934, + "balance_loss_mlp": 0.01254245, + "epoch": 0.3924845934165038, + "flos": 10667562825600.0, + "grad_norm": 3.256824520755738, + "language_loss": 0.82039493, + "learning_rate": 2.772871672726965e-06, + "loss": 0.89760715, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6528, + "time_per_iteration": 2.498852014541626 + }, + { + "auxiliary_loss_clip": 0.06450985, + "auxiliary_loss_mlp": 0.0127277, + "balance_loss_clip": 0.06284485, + "balance_loss_mlp": 0.01258048, + "epoch": 0.3925447166691718, + "flos": 31253493121920.0, + "grad_norm": 1.712128770360143, + "language_loss": 0.68666142, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14733887, + "step": 6529, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.06454393, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01252213, + "epoch": 0.39260483992183975, + "flos": 29421215890560.0, + "grad_norm": 2.512935177473184, + "language_loss": 0.80622673, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8834424, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14959717, + "step": 6530, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.06449011, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.0125252, + "epoch": 0.3926649631745077, + "flos": 22864571827200.0, + "grad_norm": 1.8446830755174628, + "language_loss": 0.76176864, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.83893287, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14892578, + "step": 6531, + "time_per_iteration": 3.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.06348795, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01253434, + "epoch": 0.3927250864271757, + "flos": 63911892124800.0, + "grad_norm": 0.7987882767963658, + "language_loss": 0.6030035, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.67905223, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.02648926, + "step": 6532, + "time_per_iteration": 3.023615598678589 + }, + { + "auxiliary_loss_clip": 0.06346735, + "auxiliary_loss_mlp": 0.01258162, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3927852096798437, + "flos": 68931486489600.0, + "grad_norm": 0.7618686105615924, + "language_loss": 0.55496854, + "learning_rate": 2.771075272396981e-06, + "loss": 0.63101745, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02720642, + "step": 6533, + "time_per_iteration": 3.2504148483276367 + }, + { + "auxiliary_loss_clip": 0.06452841, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06277935, + "balance_loss_mlp": 0.01254557, + "epoch": 0.39284533293251167, + "flos": 29723711529600.0, + "grad_norm": 1.823371664681604, + "language_loss": 0.76552856, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.84275657, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.15405273, + "step": 6534, + "time_per_iteration": 4.098775148391724 + }, + { + "auxiliary_loss_clip": 0.06459314, + "auxiliary_loss_mlp": 0.01269352, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.01253974, + "epoch": 0.39290545618517964, + "flos": 18558016980480.0, + "grad_norm": 2.2164588420846267, + "language_loss": 0.78656316, + "learning_rate": 2.770356507494851e-06, + "loss": 0.86384982, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15380859, + "step": 6535, + "time_per_iteration": 2.4923341274261475 + }, + { + "auxiliary_loss_clip": 0.06449763, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01253592, + "epoch": 0.3929655794378476, + "flos": 26256581285760.0, + "grad_norm": 2.2738959430224326, + "language_loss": 0.69076276, + "learning_rate": 2.769997081218978e-06, + "loss": 0.76792771, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1315918, + "step": 6536, + "time_per_iteration": 2.5980727672576904 + }, + { + "auxiliary_loss_clip": 0.06448898, + "auxiliary_loss_mlp": 0.0127095, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01257265, + "epoch": 0.39302570269051557, + "flos": 29285564929920.0, + "grad_norm": 1.8741537429596062, + "language_loss": 0.69716197, + "learning_rate": 2.769637625744738e-06, + "loss": 0.77436042, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13684082, + "step": 6537, + "time_per_iteration": 4.096014499664307 + }, + { + "auxiliary_loss_clip": 0.064602, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06288625, + "balance_loss_mlp": 0.01255432, + "epoch": 0.39308582594318353, + "flos": 17353134794880.0, + "grad_norm": 1.7942703591990323, + "language_loss": 0.79606509, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8733629, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14129639, + "step": 6538, + "time_per_iteration": 2.578815221786499 + }, + { + "auxiliary_loss_clip": 0.06359898, + "auxiliary_loss_mlp": 0.01255927, + "balance_loss_clip": 0.06283404, + "balance_loss_mlp": 0.0125297, + "epoch": 0.3931459491958515, + "flos": 61023884175360.0, + "grad_norm": 0.7947880980854773, + "language_loss": 0.61826062, + "learning_rate": 2.768918627255683e-06, + "loss": 0.69441885, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02955627, + "step": 6539, + "time_per_iteration": 2.9553403854370117 + }, + { + "auxiliary_loss_clip": 0.06458268, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06289513, + "balance_loss_mlp": 0.01257339, + "epoch": 0.39320607244851946, + "flos": 39024662590080.0, + "grad_norm": 2.4294685123961295, + "language_loss": 0.68263721, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.75994635, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15307617, + "step": 6540, + "time_per_iteration": 2.732541799545288 + }, + { + "auxiliary_loss_clip": 0.06455955, + "auxiliary_loss_mlp": 0.01271651, + "balance_loss_clip": 0.06287128, + "balance_loss_mlp": 0.0125613, + "epoch": 0.3932661957011874, + "flos": 24686451152640.0, + "grad_norm": 1.7600019176005988, + "language_loss": 0.72681171, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.80408776, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15527344, + "step": 6541, + "time_per_iteration": 4.03834342956543 + }, + { + "auxiliary_loss_clip": 0.06358681, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06282184, + "balance_loss_mlp": 0.01259297, + "epoch": 0.3933263189538554, + "flos": 70115614790400.0, + "grad_norm": 0.7938144397826515, + "language_loss": 0.60408866, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6802969, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02844238, + "step": 6542, + "time_per_iteration": 3.0015151500701904 + }, + { + "auxiliary_loss_clip": 0.06453243, + "auxiliary_loss_mlp": 0.01279318, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.01265305, + "epoch": 0.39338644220652336, + "flos": 22935583762560.0, + "grad_norm": 1.4413337304531033, + "language_loss": 0.82278919, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90011483, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14013672, + "step": 6543, + "time_per_iteration": 2.6289048194885254 + }, + { + "auxiliary_loss_clip": 0.06454003, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06284549, + "balance_loss_mlp": 0.01255768, + "epoch": 0.3934465654591913, + "flos": 30856282041600.0, + "grad_norm": 1.7408174737933344, + "language_loss": 0.69224536, + "learning_rate": 2.767120621015908e-06, + "loss": 0.76948798, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14489746, + "step": 6544, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01274712, + "balance_loss_clip": 0.06291823, + "balance_loss_mlp": 0.01258524, + "epoch": 0.3935066887118593, + "flos": 29243329672320.0, + "grad_norm": 2.0329338261061887, + "language_loss": 0.75462705, + "learning_rate": 2.76676093244553e-06, + "loss": 0.83203781, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1619873, + "step": 6545, + "time_per_iteration": 2.606234312057495 + }, + { + "auxiliary_loss_clip": 0.06446254, + "auxiliary_loss_mlp": 0.01275344, + "balance_loss_clip": 0.06285709, + "balance_loss_mlp": 0.01262309, + "epoch": 0.3935668119645273, + "flos": 19141290051840.0, + "grad_norm": 1.4467327313094591, + "language_loss": 0.75122333, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82843935, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13043213, + "step": 6546, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.06461848, + "auxiliary_loss_mlp": 0.01270617, + "balance_loss_clip": 0.06285486, + "balance_loss_mlp": 0.01254822, + "epoch": 0.3936269352171953, + "flos": 18522196560000.0, + "grad_norm": 2.187625212538507, + "language_loss": 0.82285661, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90018129, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15783691, + "step": 6547, + "time_per_iteration": 2.536921501159668 + }, + { + "auxiliary_loss_clip": 0.06454909, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01259685, + "epoch": 0.39368705846986324, + "flos": 15638255533440.0, + "grad_norm": 1.8611217813328955, + "language_loss": 0.84309554, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.92037535, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1340332, + "step": 6548, + "time_per_iteration": 2.586596727371216 + }, + { + "auxiliary_loss_clip": 0.06451154, + "auxiliary_loss_mlp": 0.01275141, + "balance_loss_clip": 0.06285168, + "balance_loss_mlp": 0.01261325, + "epoch": 0.3937471817225312, + "flos": 21332442320640.0, + "grad_norm": 1.5541020214417252, + "language_loss": 0.7306931, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.8079561, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13842773, + "step": 6549, + "time_per_iteration": 2.5176355838775635 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01258398, + "epoch": 0.39380730497519917, + "flos": 20782893317760.0, + "grad_norm": 1.443831260247086, + "language_loss": 0.77958995, + "learning_rate": 2.764962053731699e-06, + "loss": 0.85687554, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.16204834, + "step": 6550, + "time_per_iteration": 2.5665266513824463 + }, + { + "auxiliary_loss_clip": 0.06449334, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.0628082, + "balance_loss_mlp": 0.01254455, + "epoch": 0.39386742822786713, + "flos": 21615106469760.0, + "grad_norm": 1.5479702434138036, + "language_loss": 0.81395853, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.89113748, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14129639, + "step": 6551, + "time_per_iteration": 2.509472370147705 + }, + { + "auxiliary_loss_clip": 0.06452134, + "auxiliary_loss_mlp": 0.01274621, + "balance_loss_clip": 0.06282679, + "balance_loss_mlp": 0.01259791, + "epoch": 0.3939275514805351, + "flos": 12418304434560.0, + "grad_norm": 2.3772322810911892, + "language_loss": 0.80163503, + "learning_rate": 2.764242299098596e-06, + "loss": 0.87890255, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14825439, + "step": 6552, + "time_per_iteration": 2.512632369995117 + }, + { + "auxiliary_loss_clip": 0.06458388, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06285821, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39398767473320306, + "flos": 18558016980480.0, + "grad_norm": 1.9836463121020687, + "language_loss": 0.71468151, + "learning_rate": 2.763882378305003e-06, + "loss": 0.79198349, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14996338, + "step": 6553, + "time_per_iteration": 2.4973459243774414 + }, + { + "auxiliary_loss_clip": 0.06447914, + "auxiliary_loss_mlp": 0.01269169, + "balance_loss_clip": 0.06280744, + "balance_loss_mlp": 0.0125422, + "epoch": 0.39404779798587103, + "flos": 29315599418880.0, + "grad_norm": 1.8230931816174483, + "language_loss": 0.64176017, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71893102, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14941406, + "step": 6554, + "time_per_iteration": 2.6340816020965576 + }, + { + "auxiliary_loss_clip": 0.06448209, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281387, + "balance_loss_mlp": 0.0125561, + "epoch": 0.394107921238539, + "flos": 34905679107840.0, + "grad_norm": 1.8577413865682035, + "language_loss": 0.79801202, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8751896, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13934326, + "step": 6555, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.06451041, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06280783, + "balance_loss_mlp": 0.01252748, + "epoch": 0.39416804449120696, + "flos": 25088232280320.0, + "grad_norm": 1.8326733466575391, + "language_loss": 0.72028196, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.79746938, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1496582, + "step": 6556, + "time_per_iteration": 2.572880744934082 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01268731, + "balance_loss_clip": 0.06281175, + "balance_loss_mlp": 0.01254348, + "epoch": 0.3942281677438749, + "flos": 32314842063360.0, + "grad_norm": 2.2262653228658666, + "language_loss": 0.83903825, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.91621351, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14373779, + "step": 6557, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01272636, + "balance_loss_clip": 0.06281336, + "balance_loss_mlp": 0.01258671, + "epoch": 0.3942882909965429, + "flos": 24943608933120.0, + "grad_norm": 2.1784611950300605, + "language_loss": 0.80248392, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.87968874, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1395874, + "step": 6558, + "time_per_iteration": 2.5902092456817627 + }, + { + "auxiliary_loss_clip": 0.06445447, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01253816, + "epoch": 0.39434841424921085, + "flos": 11879614535040.0, + "grad_norm": 2.1357186014692546, + "language_loss": 0.71689725, + "learning_rate": 2.761722245724792e-06, + "loss": 0.79402852, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13873291, + "step": 6559, + "time_per_iteration": 2.4894917011260986 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628094, + "balance_loss_mlp": 0.01254622, + "epoch": 0.3944085375018789, + "flos": 16367032419840.0, + "grad_norm": 2.0841749511208705, + "language_loss": 0.81285572, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.89011705, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14630127, + "step": 6560, + "time_per_iteration": 2.522434711456299 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06282307, + "balance_loss_mlp": 0.01254078, + "epoch": 0.39446866075454684, + "flos": 10637821825920.0, + "grad_norm": 3.641985825462619, + "language_loss": 0.83127379, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.90848899, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15386963, + "step": 6561, + "time_per_iteration": 2.4804983139038086 + }, + { + "auxiliary_loss_clip": 0.06450383, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 0.06283262, + "balance_loss_mlp": 0.01257102, + "epoch": 0.3945287840072148, + "flos": 18193481792640.0, + "grad_norm": 2.043086634933395, + "language_loss": 0.80616236, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.88336933, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13208008, + "step": 6562, + "time_per_iteration": 2.5335006713867188 + }, + { + "auxiliary_loss_clip": 0.06448314, + "auxiliary_loss_mlp": 0.01268686, + "balance_loss_clip": 0.06283693, + "balance_loss_mlp": 0.01254476, + "epoch": 0.39458890725988277, + "flos": 23046650749440.0, + "grad_norm": 1.5717146465742573, + "language_loss": 0.81509531, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.89226532, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14215088, + "step": 6563, + "time_per_iteration": 2.5315918922424316 + }, + { + "auxiliary_loss_clip": 0.06453238, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.0628344, + "balance_loss_mlp": 0.0125608, + "epoch": 0.39464903051255074, + "flos": 17163718640640.0, + "grad_norm": 1.8608988788141587, + "language_loss": 0.70080984, + "learning_rate": 2.759921340790127e-06, + "loss": 0.77804577, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14257812, + "step": 6564, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.06449583, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06281252, + "balance_loss_mlp": 0.01254648, + "epoch": 0.3947091537652187, + "flos": 15894616700160.0, + "grad_norm": 2.288586168499947, + "language_loss": 0.83967394, + "learning_rate": 2.759561073299676e-06, + "loss": 0.91686368, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14746094, + "step": 6565, + "time_per_iteration": 2.5438666343688965 + }, + { + "auxiliary_loss_clip": 0.06447474, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06280743, + "balance_loss_mlp": 0.01255229, + "epoch": 0.39476927701788667, + "flos": 18550386259200.0, + "grad_norm": 2.0020652066074285, + "language_loss": 0.83519006, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.91235834, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14129639, + "step": 6566, + "time_per_iteration": 2.550548791885376 + }, + { + "auxiliary_loss_clip": 0.06459671, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06282969, + "balance_loss_mlp": 0.01255072, + "epoch": 0.39482940027055463, + "flos": 22282682348160.0, + "grad_norm": 1.770017298907609, + "language_loss": 0.77499187, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.85229909, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.15979004, + "step": 6567, + "time_per_iteration": 2.535980463027954 + }, + { + "auxiliary_loss_clip": 0.0644526, + "auxiliary_loss_mlp": 0.01270792, + "balance_loss_clip": 0.06283294, + "balance_loss_mlp": 0.01257851, + "epoch": 0.3948895235232226, + "flos": 14763010510080.0, + "grad_norm": 1.9280900707618294, + "language_loss": 0.80259991, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87976044, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.12945557, + "step": 6568, + "time_per_iteration": 2.56528639793396 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06283959, + "balance_loss_mlp": 0.01258356, + "epoch": 0.39494964677589056, + "flos": 22572474094080.0, + "grad_norm": 2.8189067544408166, + "language_loss": 0.84836519, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.9256081, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1451416, + "step": 6569, + "time_per_iteration": 2.512678623199463 + }, + { + "auxiliary_loss_clip": 0.06448043, + "auxiliary_loss_mlp": 0.01269688, + "balance_loss_clip": 0.06284526, + "balance_loss_mlp": 0.01255538, + "epoch": 0.3950097700285585, + "flos": 22969307831040.0, + "grad_norm": 1.7602858722639216, + "language_loss": 0.74665594, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.82383323, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.14147949, + "step": 6570, + "time_per_iteration": 2.611072063446045 + }, + { + "auxiliary_loss_clip": 0.06447589, + "auxiliary_loss_mlp": 0.01270515, + "balance_loss_clip": 0.06279834, + "balance_loss_mlp": 0.01256305, + "epoch": 0.3950698932812265, + "flos": 20601569082240.0, + "grad_norm": 1.9769080404363342, + "language_loss": 0.80472994, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88191104, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14196777, + "step": 6571, + "time_per_iteration": 4.037761688232422 + }, + { + "auxiliary_loss_clip": 0.06446905, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01257022, + "epoch": 0.39513001653389446, + "flos": 20381992657920.0, + "grad_norm": 1.599556952476494, + "language_loss": 0.78081018, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8579852, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13574219, + "step": 6572, + "time_per_iteration": 2.542388439178467 + }, + { + "auxiliary_loss_clip": 0.06450671, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06281148, + "balance_loss_mlp": 0.01253991, + "epoch": 0.3951901397865625, + "flos": 26469994435200.0, + "grad_norm": 1.9679034095416588, + "language_loss": 0.74861181, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.8258028, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14434814, + "step": 6573, + "time_per_iteration": 3.9954564571380615 + }, + { + "auxiliary_loss_clip": 0.06447303, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.0125492, + "epoch": 0.39525026303923044, + "flos": 43848845233920.0, + "grad_norm": 1.4348738267970096, + "language_loss": 0.67874503, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.75589502, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.12799072, + "step": 6574, + "time_per_iteration": 2.75056791305542 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01270185, + "balance_loss_clip": 0.06284595, + "balance_loss_mlp": 0.01255832, + "epoch": 0.3953103862918984, + "flos": 18046636312320.0, + "grad_norm": 3.0759560063082736, + "language_loss": 0.72770178, + "learning_rate": 2.755956816505072e-06, + "loss": 0.80492353, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14355469, + "step": 6575, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.06452627, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.0628259, + "balance_loss_mlp": 0.01256015, + "epoch": 0.3953705095445664, + "flos": 16980549615360.0, + "grad_norm": 2.3956956088423382, + "language_loss": 0.73929548, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.816526, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1439209, + "step": 6576, + "time_per_iteration": 2.4877238273620605 + }, + { + "auxiliary_loss_clip": 0.06453596, + "auxiliary_loss_mlp": 0.01269813, + "balance_loss_clip": 0.06286615, + "balance_loss_mlp": 0.0125704, + "epoch": 0.39543063279723434, + "flos": 17415300124800.0, + "grad_norm": 2.3089155525157397, + "language_loss": 0.8424108, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.91964483, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12786865, + "step": 6577, + "time_per_iteration": 3.9026546478271484 + }, + { + "auxiliary_loss_clip": 0.06447916, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06283568, + "balance_loss_mlp": 0.01255788, + "epoch": 0.3954907560499023, + "flos": 22790876561280.0, + "grad_norm": 2.6090797034217603, + "language_loss": 0.90399998, + "learning_rate": 2.75487497985853e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1338501, + "step": 6578, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.06451896, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06281315, + "balance_loss_mlp": 0.01254284, + "epoch": 0.39555087930257027, + "flos": 21950823052800.0, + "grad_norm": 1.8247592517251146, + "language_loss": 0.78543842, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.86265075, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15063477, + "step": 6579, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.06456701, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01258492, + "epoch": 0.39561100255523823, + "flos": 20409553451520.0, + "grad_norm": 2.1653293739232753, + "language_loss": 0.68659246, + "learning_rate": 2.754153612280037e-06, + "loss": 0.76389658, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15216064, + "step": 6580, + "time_per_iteration": 4.038321495056152 + }, + { + "auxiliary_loss_clip": 0.06448758, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3956711258079062, + "flos": 27972005598720.0, + "grad_norm": 1.867170796056586, + "language_loss": 0.58577931, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.6629765, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14318848, + "step": 6581, + "time_per_iteration": 2.618917942047119 + }, + { + "auxiliary_loss_clip": 0.0645448, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 0.06288571, + "balance_loss_mlp": 0.01256413, + "epoch": 0.39573124906057416, + "flos": 14433457201920.0, + "grad_norm": 2.002939068333409, + "language_loss": 0.69910431, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.77636254, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14929199, + "step": 6582, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.06451949, + "auxiliary_loss_mlp": 0.01273006, + "balance_loss_clip": 0.06283893, + "balance_loss_mlp": 0.01258546, + "epoch": 0.39579137231324213, + "flos": 18739592778240.0, + "grad_norm": 2.2302551557868457, + "language_loss": 0.76587689, + "learning_rate": 2.753071346464642e-06, + "loss": 0.84312642, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14453125, + "step": 6583, + "time_per_iteration": 2.5276317596435547 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.0127002, + "balance_loss_clip": 0.06284047, + "balance_loss_mlp": 0.01256562, + "epoch": 0.3958514955659101, + "flos": 17682268832640.0, + "grad_norm": 1.926047340176765, + "language_loss": 0.66262352, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.73984963, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13458252, + "step": 6584, + "time_per_iteration": 2.501209259033203 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01270923, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39591161881857806, + "flos": 29315850981120.0, + "grad_norm": 1.992954295318491, + "language_loss": 0.72398281, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.8012588, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 6585, + "time_per_iteration": 2.617694616317749 + }, + { + "auxiliary_loss_clip": 0.06457305, + "auxiliary_loss_mlp": 0.0127182, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01257336, + "epoch": 0.3959717420712461, + "flos": 25778295780480.0, + "grad_norm": 1.6889684303793513, + "language_loss": 0.73472714, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.81201839, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14477539, + "step": 6586, + "time_per_iteration": 2.565883159637451 + }, + { + "auxiliary_loss_clip": 0.06454571, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06286268, + "balance_loss_mlp": 0.01252969, + "epoch": 0.39603186532391405, + "flos": 20930199995520.0, + "grad_norm": 1.6150585752618039, + "language_loss": 0.71662915, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79384637, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14160156, + "step": 6587, + "time_per_iteration": 2.5788414478302 + }, + { + "auxiliary_loss_clip": 0.06362241, + "auxiliary_loss_mlp": 0.01254401, + "balance_loss_clip": 0.06286076, + "balance_loss_mlp": 0.01251419, + "epoch": 0.396091988576582, + "flos": 54897336720000.0, + "grad_norm": 0.8108180128275717, + "language_loss": 0.60705078, + "learning_rate": 2.751266999157285e-06, + "loss": 0.68321717, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.02980042, + "step": 6588, + "time_per_iteration": 2.973475217819214 + }, + { + "auxiliary_loss_clip": 0.06457016, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06285909, + "balance_loss_mlp": 0.01251873, + "epoch": 0.39615211182925, + "flos": 20708946489600.0, + "grad_norm": 1.752385405351709, + "language_loss": 0.81335068, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.89058518, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14575195, + "step": 6589, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.06456019, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06286196, + "balance_loss_mlp": 0.01254431, + "epoch": 0.39621223508191794, + "flos": 21000331463040.0, + "grad_norm": 1.8508577793480634, + "language_loss": 0.71167219, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7889303, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15368652, + "step": 6590, + "time_per_iteration": 2.5155017375946045 + }, + { + "auxiliary_loss_clip": 0.06451933, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06285245, + "balance_loss_mlp": 0.01253284, + "epoch": 0.3962723583345859, + "flos": 23375742860160.0, + "grad_norm": 1.6853348593397999, + "language_loss": 0.75984478, + "learning_rate": 2.750184048805956e-06, + "loss": 0.83702791, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13098145, + "step": 6591, + "time_per_iteration": 2.569958448410034 + }, + { + "auxiliary_loss_clip": 0.06454425, + "auxiliary_loss_mlp": 0.01268025, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01254215, + "epoch": 0.39633248158725387, + "flos": 25122040202880.0, + "grad_norm": 1.5542594066551045, + "language_loss": 0.78422546, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8614499, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13806152, + "step": 6592, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06448938, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.062861, + "balance_loss_mlp": 0.01256615, + "epoch": 0.39639260483992184, + "flos": 39797309888640.0, + "grad_norm": 1.716432087396327, + "language_loss": 0.69405383, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.77124685, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13751221, + "step": 6593, + "time_per_iteration": 2.742421865463257 + }, + { + "auxiliary_loss_clip": 0.06455009, + "auxiliary_loss_mlp": 0.01268833, + "balance_loss_clip": 0.06285039, + "balance_loss_mlp": 0.01253896, + "epoch": 0.3964527280925898, + "flos": 17352673597440.0, + "grad_norm": 2.6756229463225134, + "language_loss": 0.78082192, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.85806036, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14929199, + "step": 6594, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.06345355, + "auxiliary_loss_mlp": 0.01253278, + "balance_loss_clip": 0.06269702, + "balance_loss_mlp": 0.0125056, + "epoch": 0.39651285134525777, + "flos": 71739845533440.0, + "grad_norm": 0.9367359782969226, + "language_loss": 0.6293599, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.70534623, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.195411205291748 + }, + { + "auxiliary_loss_clip": 0.06455558, + "auxiliary_loss_mlp": 0.01273293, + "balance_loss_clip": 0.0628309, + "balance_loss_mlp": 0.0125714, + "epoch": 0.39657297459792573, + "flos": 25782823900800.0, + "grad_norm": 2.0629727816625656, + "language_loss": 0.63503623, + "learning_rate": 2.748378562795223e-06, + "loss": 0.71232474, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16149902, + "step": 6596, + "time_per_iteration": 2.564436197280884 + }, + { + "auxiliary_loss_clip": 0.06445512, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.0628349, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3966330978505937, + "flos": 20272267336320.0, + "grad_norm": 3.0845696935228646, + "language_loss": 0.79033494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.86749279, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.14202881, + "step": 6597, + "time_per_iteration": 2.5187220573425293 + }, + { + "auxiliary_loss_clip": 0.0645806, + "auxiliary_loss_mlp": 0.01272047, + "balance_loss_clip": 0.06285266, + "balance_loss_mlp": 0.01257259, + "epoch": 0.39669322110326166, + "flos": 20637431429760.0, + "grad_norm": 1.9127598273467419, + "language_loss": 0.67675543, + "learning_rate": 2.747656169644941e-06, + "loss": 0.75405657, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14776611, + "step": 6598, + "time_per_iteration": 2.5287654399871826 + }, + { + "auxiliary_loss_clip": 0.06448894, + "auxiliary_loss_mlp": 0.01270917, + "balance_loss_clip": 0.06280929, + "balance_loss_mlp": 0.01257643, + "epoch": 0.3967533443559297, + "flos": 21732546366720.0, + "grad_norm": 1.6941457063111416, + "language_loss": 0.79130334, + "learning_rate": 2.747294930536157e-06, + "loss": 0.86850142, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13269043, + "step": 6599, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.06447926, + "auxiliary_loss_mlp": 0.01270436, + "balance_loss_clip": 0.06279482, + "balance_loss_mlp": 0.01254289, + "epoch": 0.39681346760859765, + "flos": 25491271219200.0, + "grad_norm": 1.7355689440790156, + "language_loss": 0.72895992, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.80614352, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.16149902, + "step": 6600, + "time_per_iteration": 2.6141197681427 + }, + { + "auxiliary_loss_clip": 0.06448444, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06280382, + "balance_loss_mlp": 0.01261045, + "epoch": 0.3968735908612656, + "flos": 20965894634880.0, + "grad_norm": 1.918502465070546, + "language_loss": 0.85902363, + "learning_rate": 2.746572367319791e-06, + "loss": 0.9362576, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13909912, + "step": 6601, + "time_per_iteration": 2.539337396621704 + }, + { + "auxiliary_loss_clip": 0.06455625, + "auxiliary_loss_mlp": 0.01273924, + "balance_loss_clip": 0.06281834, + "balance_loss_mlp": 0.0125773, + "epoch": 0.3969337141139336, + "flos": 10711684800000.0, + "grad_norm": 2.4177834123100412, + "language_loss": 0.70406669, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.78136218, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16192627, + "step": 6602, + "time_per_iteration": 2.5344958305358887 + }, + { + "auxiliary_loss_clip": 0.06450728, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.01257583, + "epoch": 0.39699383736660154, + "flos": 17597924098560.0, + "grad_norm": 4.3880896635048865, + "language_loss": 0.84332073, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.92054927, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14550781, + "step": 6603, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.06445679, + "auxiliary_loss_mlp": 0.01276756, + "balance_loss_clip": 0.06278397, + "balance_loss_mlp": 0.01263017, + "epoch": 0.3970539606192695, + "flos": 17791826446080.0, + "grad_norm": 1.5258003920697418, + "language_loss": 0.7302916, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13751221, + "step": 6604, + "time_per_iteration": 2.525475025177002 + }, + { + "auxiliary_loss_clip": 0.06437713, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.0125609, + "epoch": 0.3971140838719375, + "flos": 24796260328320.0, + "grad_norm": 1.5312177971095886, + "language_loss": 0.82809514, + "learning_rate": 2.745126901275491e-06, + "loss": 0.90516913, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.13598633, + "step": 6605, + "time_per_iteration": 2.5601069927215576 + }, + { + "auxiliary_loss_clip": 0.06439412, + "auxiliary_loss_mlp": 0.01269635, + "balance_loss_clip": 0.06274941, + "balance_loss_mlp": 0.01256337, + "epoch": 0.39717420712460544, + "flos": 24250484759040.0, + "grad_norm": 1.721474173213711, + "language_loss": 0.74617773, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.82326818, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13293457, + "step": 6606, + "time_per_iteration": 2.570338726043701 + }, + { + "auxiliary_loss_clip": 0.06450282, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06279129, + "balance_loss_mlp": 0.01255343, + "epoch": 0.3972343303772734, + "flos": 25891752608640.0, + "grad_norm": 1.7826498780228273, + "language_loss": 0.74625784, + "learning_rate": 2.744403998666805e-06, + "loss": 0.8234452, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13122559, + "step": 6607, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.0628166, + "balance_loss_mlp": 0.01257366, + "epoch": 0.39729445362994137, + "flos": 45634107525120.0, + "grad_norm": 2.013518755058626, + "language_loss": 0.68503535, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76226741, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1427002, + "step": 6608, + "time_per_iteration": 2.814741611480713 + }, + { + "auxiliary_loss_clip": 0.06453016, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06280445, + "balance_loss_mlp": 0.01256496, + "epoch": 0.39735457688260933, + "flos": 20200249152000.0, + "grad_norm": 2.238404873213265, + "language_loss": 0.74168068, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.818919, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14318848, + "step": 6609, + "time_per_iteration": 2.549020767211914 + }, + { + "auxiliary_loss_clip": 0.06450722, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06281993, + "balance_loss_mlp": 0.0125424, + "epoch": 0.3974147001352773, + "flos": 23337868014720.0, + "grad_norm": 1.4758458837885644, + "language_loss": 0.71468556, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.79187685, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14154053, + "step": 6610, + "time_per_iteration": 3.985957622528076 + }, + { + "auxiliary_loss_clip": 0.06440872, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256559, + "epoch": 0.39747482338794526, + "flos": 21694965010560.0, + "grad_norm": 1.555692262156073, + "language_loss": 0.7854501, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.86256385, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13934326, + "step": 6611, + "time_per_iteration": 2.5972208976745605 + }, + { + "auxiliary_loss_clip": 0.06447503, + "auxiliary_loss_mlp": 0.01268941, + "balance_loss_clip": 0.06280762, + "balance_loss_mlp": 0.01255065, + "epoch": 0.3975349466406133, + "flos": 30995957998080.0, + "grad_norm": 2.19308398220208, + "language_loss": 0.79606485, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.87322932, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13891602, + "step": 6612, + "time_per_iteration": 2.6106274127960205 + }, + { + "auxiliary_loss_clip": 0.0634682, + "auxiliary_loss_mlp": 0.01253265, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01250469, + "epoch": 0.39759506989328125, + "flos": 63703426366080.0, + "grad_norm": 0.8245936024085626, + "language_loss": 0.6463905, + "learning_rate": 2.742234613810459e-06, + "loss": 0.72239137, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02796936, + "step": 6613, + "time_per_iteration": 4.473678112030029 + }, + { + "auxiliary_loss_clip": 0.06450668, + "auxiliary_loss_mlp": 0.01269678, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01255367, + "epoch": 0.3976551931459492, + "flos": 23702570910720.0, + "grad_norm": 2.448614415916545, + "language_loss": 0.72596258, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80316603, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14312744, + "step": 6614, + "time_per_iteration": 2.5691444873809814 + }, + { + "auxiliary_loss_clip": 0.06449673, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256051, + "epoch": 0.3977153163986172, + "flos": 15675166056960.0, + "grad_norm": 2.2284862441621995, + "language_loss": 0.81666011, + "learning_rate": 2.741511260213862e-06, + "loss": 0.89385748, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14013672, + "step": 6615, + "time_per_iteration": 2.55078387260437 + }, + { + "auxiliary_loss_clip": 0.06452717, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01255679, + "epoch": 0.39777543965128515, + "flos": 14070012117120.0, + "grad_norm": 1.96274897748641, + "language_loss": 0.67687142, + "learning_rate": 2.741149541231434e-06, + "loss": 0.75409389, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13842773, + "step": 6616, + "time_per_iteration": 2.533982992172241 + }, + { + "auxiliary_loss_clip": 0.06455097, + "auxiliary_loss_mlp": 0.0126897, + "balance_loss_clip": 0.06281532, + "balance_loss_mlp": 0.01253986, + "epoch": 0.3978355629039531, + "flos": 23374149632640.0, + "grad_norm": 2.1811174101900552, + "language_loss": 0.8396368, + "learning_rate": 2.740787794144541e-06, + "loss": 0.91687751, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14978027, + "step": 6617, + "time_per_iteration": 3.9742090702056885 + }, + { + "auxiliary_loss_clip": 0.06446042, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06283504, + "balance_loss_mlp": 0.01255556, + "epoch": 0.3978956861566211, + "flos": 19068852597120.0, + "grad_norm": 1.7253210008214133, + "language_loss": 0.73000187, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80714333, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12536621, + "step": 6618, + "time_per_iteration": 2.562913179397583 + }, + { + "auxiliary_loss_clip": 0.06454587, + "auxiliary_loss_mlp": 0.01274299, + "balance_loss_clip": 0.06285769, + "balance_loss_mlp": 0.01258576, + "epoch": 0.39795580940928904, + "flos": 30235679176320.0, + "grad_norm": 1.6365941861062427, + "language_loss": 0.65343797, + "learning_rate": 2.740064215712231e-06, + "loss": 0.73072684, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15710449, + "step": 6619, + "time_per_iteration": 2.598667860031128 + }, + { + "auxiliary_loss_clip": 0.06341819, + "auxiliary_loss_mlp": 0.01254465, + "balance_loss_clip": 0.06266081, + "balance_loss_mlp": 0.01251738, + "epoch": 0.398015932661957, + "flos": 69867261688320.0, + "grad_norm": 0.7579483566665592, + "language_loss": 0.582268, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.65823084, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02731323, + "step": 6620, + "time_per_iteration": 4.528149604797363 + }, + { + "auxiliary_loss_clip": 0.06446633, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06280729, + "balance_loss_mlp": 0.01256858, + "epoch": 0.39807605591462497, + "flos": 20164093315200.0, + "grad_norm": 1.5024608902652035, + "language_loss": 0.79499102, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87215811, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13232422, + "step": 6621, + "time_per_iteration": 2.559305191040039 + }, + { + "auxiliary_loss_clip": 0.06445563, + "auxiliary_loss_mlp": 0.01270989, + "balance_loss_clip": 0.06279579, + "balance_loss_mlp": 0.01257435, + "epoch": 0.39813617916729294, + "flos": 21148057411200.0, + "grad_norm": 1.7591122738615637, + "language_loss": 0.78347874, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86064428, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13568115, + "step": 6622, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.06444648, + "auxiliary_loss_mlp": 0.01270694, + "balance_loss_clip": 0.06278688, + "balance_loss_mlp": 0.01255948, + "epoch": 0.3981963024199609, + "flos": 18994318790400.0, + "grad_norm": 9.51473607747463, + "language_loss": 0.75430334, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83145678, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14733887, + "step": 6623, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.06449074, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01260551, + "epoch": 0.39825642567262887, + "flos": 16579648955520.0, + "grad_norm": 1.7143371951380526, + "language_loss": 0.79926246, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.87649894, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6624, + "time_per_iteration": 2.509500026702881 + }, + { + "auxiliary_loss_clip": 0.06454292, + "auxiliary_loss_mlp": 0.01269994, + "balance_loss_clip": 0.06280515, + "balance_loss_mlp": 0.01254234, + "epoch": 0.39831654892529683, + "flos": 22206303751680.0, + "grad_norm": 2.195062259081814, + "language_loss": 0.84314877, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.92039162, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15759277, + "step": 6625, + "time_per_iteration": 2.5617175102233887 + }, + { + "auxiliary_loss_clip": 0.06446299, + "auxiliary_loss_mlp": 0.01272387, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01258517, + "epoch": 0.39837667217796485, + "flos": 10492485719040.0, + "grad_norm": 1.8250293636172175, + "language_loss": 0.8709324, + "learning_rate": 2.737530807925321e-06, + "loss": 0.94811928, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13867188, + "step": 6626, + "time_per_iteration": 2.72031307220459 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01271086, + "balance_loss_clip": 0.0627908, + "balance_loss_mlp": 0.01256531, + "epoch": 0.3984367954306328, + "flos": 17970676986240.0, + "grad_norm": 2.760632977827581, + "language_loss": 0.84402627, + "learning_rate": 2.737168780548417e-06, + "loss": 0.9212113, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14575195, + "step": 6627, + "time_per_iteration": 2.6228654384613037 + }, + { + "auxiliary_loss_clip": 0.06445234, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.0627917, + "balance_loss_mlp": 0.01255443, + "epoch": 0.3984969186833008, + "flos": 22717684419840.0, + "grad_norm": 3.2429830324928095, + "language_loss": 0.83402491, + "learning_rate": 2.736806725217998e-06, + "loss": 0.91116416, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13250732, + "step": 6628, + "time_per_iteration": 2.6287484169006348 + }, + { + "auxiliary_loss_clip": 0.06449139, + "auxiliary_loss_mlp": 0.01271852, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01256981, + "epoch": 0.39855704193596875, + "flos": 23412779164800.0, + "grad_norm": 1.5731823007903518, + "language_loss": 0.71793973, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.79514968, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14868164, + "step": 6629, + "time_per_iteration": 2.5752875804901123 + }, + { + "auxiliary_loss_clip": 0.06441505, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06280406, + "balance_loss_mlp": 0.01254834, + "epoch": 0.3986171651886367, + "flos": 21258369711360.0, + "grad_norm": 2.035566678796665, + "language_loss": 0.80905473, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.88615453, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1362915, + "step": 6630, + "time_per_iteration": 2.5329513549804688 + }, + { + "auxiliary_loss_clip": 0.06445715, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06276714, + "balance_loss_mlp": 0.01255693, + "epoch": 0.3986772884413047, + "flos": 12463642293120.0, + "grad_norm": 2.1251751047068783, + "language_loss": 0.75146663, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.82862258, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14190674, + "step": 6631, + "time_per_iteration": 2.5500082969665527 + }, + { + "auxiliary_loss_clip": 0.06448178, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279311, + "balance_loss_mlp": 0.0125505, + "epoch": 0.39873741169397264, + "flos": 19652209522560.0, + "grad_norm": 1.6915315525927903, + "language_loss": 0.71496904, + "learning_rate": 2.735358224635783e-06, + "loss": 0.79214191, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.140625, + "step": 6632, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.06444843, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255955, + "epoch": 0.3987975349466406, + "flos": 21690436890240.0, + "grad_norm": 1.8116978167005697, + "language_loss": 0.75623924, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83338219, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13494873, + "step": 6633, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.06449188, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06280442, + "balance_loss_mlp": 0.0125846, + "epoch": 0.3988576581993086, + "flos": 23920721815680.0, + "grad_norm": 1.9002609831735993, + "language_loss": 0.81678545, + "learning_rate": 2.7346338069806e-06, + "loss": 0.89400202, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14001465, + "step": 6634, + "time_per_iteration": 2.539128065109253 + }, + { + "auxiliary_loss_clip": 0.06453361, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06283009, + "balance_loss_mlp": 0.01255449, + "epoch": 0.39891778145197654, + "flos": 18155690801280.0, + "grad_norm": 1.9946050359209588, + "language_loss": 0.7547667, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.83199799, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14306641, + "step": 6635, + "time_per_iteration": 2.5426242351531982 + }, + { + "auxiliary_loss_clip": 0.06468328, + "auxiliary_loss_mlp": 0.01272826, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01256053, + "epoch": 0.3989779047046445, + "flos": 22600831501440.0, + "grad_norm": 1.9740114535883675, + "language_loss": 0.66474432, + "learning_rate": 2.733909277895868e-06, + "loss": 0.74215585, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.16760254, + "step": 6636, + "time_per_iteration": 2.5290956497192383 + }, + { + "auxiliary_loss_clip": 0.06452767, + "auxiliary_loss_mlp": 0.01270258, + "balance_loss_clip": 0.06285115, + "balance_loss_mlp": 0.01255012, + "epoch": 0.39903802795731247, + "flos": 18083043711360.0, + "grad_norm": 1.6936131920640751, + "language_loss": 0.82211542, + "learning_rate": 2.733546971601763e-06, + "loss": 0.89934564, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.15246582, + "step": 6637, + "time_per_iteration": 2.516279458999634 + }, + { + "auxiliary_loss_clip": 0.06353697, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 0.06278069, + "balance_loss_mlp": 0.01250418, + "epoch": 0.39909815120998043, + "flos": 70463238652800.0, + "grad_norm": 0.7262189478909644, + "language_loss": 0.531524, + "learning_rate": 2.733184637491484e-06, + "loss": 0.60758889, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.0236969, + "step": 6638, + "time_per_iteration": 3.2179603576660156 + }, + { + "auxiliary_loss_clip": 0.06449973, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06279011, + "balance_loss_mlp": 0.0126304, + "epoch": 0.39915827446264845, + "flos": 18554788598400.0, + "grad_norm": 1.4980640352775056, + "language_loss": 0.75670731, + "learning_rate": 2.732822275578769e-06, + "loss": 0.83398449, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14715576, + "step": 6639, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.06442601, + "auxiliary_loss_mlp": 0.01272751, + "balance_loss_clip": 0.0627881, + "balance_loss_mlp": 0.01258249, + "epoch": 0.3992183977153164, + "flos": 29904826129920.0, + "grad_norm": 2.014095124557279, + "language_loss": 0.76376802, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84092152, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1451416, + "step": 6640, + "time_per_iteration": 2.642223834991455 + }, + { + "auxiliary_loss_clip": 0.06449724, + "auxiliary_loss_mlp": 0.01270265, + "balance_loss_clip": 0.06280393, + "balance_loss_mlp": 0.01255757, + "epoch": 0.3992785209679844, + "flos": 22571677480320.0, + "grad_norm": 2.238528881986372, + "language_loss": 0.8211664, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.89836633, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14501953, + "step": 6641, + "time_per_iteration": 2.530189275741577 + }, + { + "auxiliary_loss_clip": 0.06456075, + "auxiliary_loss_mlp": 0.01270045, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01254971, + "epoch": 0.39933864422065235, + "flos": 19688784629760.0, + "grad_norm": 1.8306704082742173, + "language_loss": 0.77208257, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.84934378, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15081787, + "step": 6642, + "time_per_iteration": 2.5495219230651855 + }, + { + "auxiliary_loss_clip": 0.06453043, + "auxiliary_loss_mlp": 0.01270555, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01255564, + "epoch": 0.3993987674733203, + "flos": 23045015594880.0, + "grad_norm": 2.242078242091602, + "language_loss": 0.72883618, + "learning_rate": 2.731372550178393e-06, + "loss": 0.80607212, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.14984131, + "step": 6643, + "time_per_iteration": 2.521857500076294 + }, + { + "auxiliary_loss_clip": 0.06456347, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01259317, + "epoch": 0.3994588907259883, + "flos": 19396896531840.0, + "grad_norm": 1.7649027305896348, + "language_loss": 0.66785717, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74516022, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14642334, + "step": 6644, + "time_per_iteration": 2.571690320968628 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.0127806, + "balance_loss_clip": 0.06282313, + "balance_loss_mlp": 0.01263737, + "epoch": 0.39951901397865625, + "flos": 13739326778880.0, + "grad_norm": 1.9095077452421072, + "language_loss": 0.78757256, + "learning_rate": 2.730647521020907e-06, + "loss": 0.86489946, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14312744, + "step": 6645, + "time_per_iteration": 2.499361753463745 + }, + { + "auxiliary_loss_clip": 0.06458238, + "auxiliary_loss_mlp": 0.01274341, + "balance_loss_clip": 0.06283879, + "balance_loss_mlp": 0.01259321, + "epoch": 0.3995791372313242, + "flos": 23593181005440.0, + "grad_norm": 1.5926569767996783, + "language_loss": 0.7044934, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78181922, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15026855, + "step": 6646, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.06456489, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06284152, + "balance_loss_mlp": 0.01257103, + "epoch": 0.3996392604839922, + "flos": 21361428633600.0, + "grad_norm": 2.2667385155288917, + "language_loss": 0.72035694, + "learning_rate": 2.729922381038513e-06, + "loss": 0.79763949, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14660645, + "step": 6647, + "time_per_iteration": 2.58251953125 + }, + { + "auxiliary_loss_clip": 0.06449988, + "auxiliary_loss_mlp": 0.01272061, + "balance_loss_clip": 0.06284988, + "balance_loss_mlp": 0.01257195, + "epoch": 0.39969938373666014, + "flos": 26039604337920.0, + "grad_norm": 1.4692875023338006, + "language_loss": 0.74830031, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.82552081, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14849854, + "step": 6648, + "time_per_iteration": 2.7020201683044434 + }, + { + "auxiliary_loss_clip": 0.06453955, + "auxiliary_loss_mlp": 0.0126884, + "balance_loss_clip": 0.06283584, + "balance_loss_mlp": 0.0125472, + "epoch": 0.3997595069893281, + "flos": 20121858057600.0, + "grad_norm": 2.0106261298514907, + "language_loss": 0.65986454, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.73709244, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14117432, + "step": 6649, + "time_per_iteration": 3.9323928356170654 + }, + { + "auxiliary_loss_clip": 0.06463098, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06290667, + "balance_loss_mlp": 0.01260774, + "epoch": 0.39981963024199607, + "flos": 27791016779520.0, + "grad_norm": 1.831691866077207, + "language_loss": 0.75774682, + "learning_rate": 2.728834463508826e-06, + "loss": 0.83514905, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16357422, + "step": 6650, + "time_per_iteration": 2.6374714374542236 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01257782, + "epoch": 0.39987975349466404, + "flos": 21950864979840.0, + "grad_norm": 1.4608995971033776, + "language_loss": 0.7199676, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79724216, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14831543, + "step": 6651, + "time_per_iteration": 2.5789706707000732 + }, + { + "auxiliary_loss_clip": 0.06457064, + "auxiliary_loss_mlp": 0.01269592, + "balance_loss_clip": 0.06283179, + "balance_loss_mlp": 0.01255245, + "epoch": 0.39993987674733206, + "flos": 20710707425280.0, + "grad_norm": 1.930350074981486, + "language_loss": 0.73724478, + "learning_rate": 2.728109046945403e-06, + "loss": 0.8145113, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14331055, + "step": 6652, + "time_per_iteration": 3.9592838287353516 + }, + { + "auxiliary_loss_clip": 0.06347093, + "auxiliary_loss_mlp": 0.01255075, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01252878, + "epoch": 0.4, + "flos": 61543566397440.0, + "grad_norm": 0.8159851457251004, + "language_loss": 0.60542929, + "learning_rate": 2.727746297241862e-06, + "loss": 0.68145096, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.02201843, + "step": 6653, + "time_per_iteration": 3.0700466632843018 + }, + { + "auxiliary_loss_clip": 0.06454087, + "auxiliary_loss_mlp": 0.01272182, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.01257698, + "epoch": 0.400060123252668, + "flos": 14507655592320.0, + "grad_norm": 1.9278074838902122, + "language_loss": 0.66929328, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.74655592, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14477539, + "step": 6654, + "time_per_iteration": 2.5292413234710693 + }, + { + "auxiliary_loss_clip": 0.06457023, + "auxiliary_loss_mlp": 0.01271182, + "balance_loss_clip": 0.06287654, + "balance_loss_mlp": 0.01257396, + "epoch": 0.40012024650533595, + "flos": 19098383961600.0, + "grad_norm": 1.998304088554008, + "language_loss": 0.90550762, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98278964, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13775635, + "step": 6655, + "time_per_iteration": 2.529496192932129 + }, + { + "auxiliary_loss_clip": 0.06450539, + "auxiliary_loss_mlp": 0.012675, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01254899, + "epoch": 0.4001803697580039, + "flos": 29358673217280.0, + "grad_norm": 1.6559902316252946, + "language_loss": 0.73729336, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.81447375, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1260376, + "step": 6656, + "time_per_iteration": 4.062687158584595 + }, + { + "auxiliary_loss_clip": 0.0645894, + "auxiliary_loss_mlp": 0.01271003, + "balance_loss_clip": 0.06288408, + "balance_loss_mlp": 0.01255696, + "epoch": 0.4002404930106719, + "flos": 20925839583360.0, + "grad_norm": 1.4738199157728433, + "language_loss": 0.73207194, + "learning_rate": 2.726295022603144e-06, + "loss": 0.80937135, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15307617, + "step": 6657, + "time_per_iteration": 2.5996904373168945 + }, + { + "auxiliary_loss_clip": 0.06458808, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06288153, + "balance_loss_mlp": 0.01256506, + "epoch": 0.40030061626333985, + "flos": 28413799850880.0, + "grad_norm": 1.489557881553797, + "language_loss": 0.79247761, + "learning_rate": 2.725932135056117e-06, + "loss": 0.86978424, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15350342, + "step": 6658, + "time_per_iteration": 2.7172279357910156 + }, + { + "auxiliary_loss_clip": 0.06459276, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06289512, + "balance_loss_mlp": 0.01264084, + "epoch": 0.4003607395160078, + "flos": 25928746986240.0, + "grad_norm": 2.1209995886317956, + "language_loss": 0.77640641, + "learning_rate": 2.72556921998167e-06, + "loss": 0.85378039, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14050293, + "step": 6659, + "time_per_iteration": 4.3210484981536865 + }, + { + "auxiliary_loss_clip": 0.06450686, + "auxiliary_loss_mlp": 0.01279792, + "balance_loss_clip": 0.06291049, + "balance_loss_mlp": 0.01267442, + "epoch": 0.4004208627686758, + "flos": 20773501660800.0, + "grad_norm": 1.7380110296153854, + "language_loss": 0.73432875, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.81163359, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.12359619, + "step": 6660, + "time_per_iteration": 2.668088436126709 + }, + { + "auxiliary_loss_clip": 0.06457424, + "auxiliary_loss_mlp": 0.01270844, + "balance_loss_clip": 0.06287603, + "balance_loss_mlp": 0.01258077, + "epoch": 0.40048098602134374, + "flos": 24688170161280.0, + "grad_norm": 2.131845423391088, + "language_loss": 0.71318859, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.79047126, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.12786865, + "step": 6661, + "time_per_iteration": 2.5673065185546875 + }, + { + "auxiliary_loss_clip": 0.06462744, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_clip": 0.06291083, + "balance_loss_mlp": 0.01257889, + "epoch": 0.4005411092740117, + "flos": 23192448053760.0, + "grad_norm": 1.7831816831822005, + "language_loss": 0.75751495, + "learning_rate": 2.724480309731437e-06, + "loss": 0.83486485, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14361572, + "step": 6662, + "time_per_iteration": 2.5870559215545654 + }, + { + "auxiliary_loss_clip": 0.06461672, + "auxiliary_loss_mlp": 0.01271183, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4006012325266797, + "flos": 17526786382080.0, + "grad_norm": 2.241735466255753, + "language_loss": 0.66247231, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.73980081, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15014648, + "step": 6663, + "time_per_iteration": 2.5879623889923096 + }, + { + "auxiliary_loss_clip": 0.06461117, + "auxiliary_loss_mlp": 0.01271573, + "balance_loss_clip": 0.06290103, + "balance_loss_mlp": 0.01257316, + "epoch": 0.40066135577934764, + "flos": 19862016946560.0, + "grad_norm": 2.129058070747091, + "language_loss": 0.86377645, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.94110334, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14251709, + "step": 6664, + "time_per_iteration": 2.580240249633789 + }, + { + "auxiliary_loss_clip": 0.06459028, + "auxiliary_loss_mlp": 0.01272821, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01259064, + "epoch": 0.40072147903201566, + "flos": 18155816582400.0, + "grad_norm": 1.9805392577959038, + "language_loss": 0.84895325, + "learning_rate": 2.723391152229917e-06, + "loss": 0.92627168, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13751221, + "step": 6665, + "time_per_iteration": 2.50386381149292 + }, + { + "auxiliary_loss_clip": 0.06457423, + "auxiliary_loss_mlp": 0.01268968, + "balance_loss_clip": 0.06286919, + "balance_loss_mlp": 0.0125458, + "epoch": 0.4007816022846836, + "flos": 18667239177600.0, + "grad_norm": 1.826402815553393, + "language_loss": 0.78598213, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.86324608, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14404297, + "step": 6666, + "time_per_iteration": 2.5133461952209473 + }, + { + "auxiliary_loss_clip": 0.06465514, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06295928, + "balance_loss_mlp": 0.01253834, + "epoch": 0.4008417255373516, + "flos": 25710344519040.0, + "grad_norm": 1.8943268651740763, + "language_loss": 0.74139559, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.81873906, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14990234, + "step": 6667, + "time_per_iteration": 2.635195732116699 + }, + { + "auxiliary_loss_clip": 0.06460091, + "auxiliary_loss_mlp": 0.01273802, + "balance_loss_clip": 0.06287248, + "balance_loss_mlp": 0.01258519, + "epoch": 0.40090184879001955, + "flos": 22865536149120.0, + "grad_norm": 1.4912552700664468, + "language_loss": 0.75818384, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.83552277, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15270996, + "step": 6668, + "time_per_iteration": 2.567748546600342 + }, + { + "auxiliary_loss_clip": 0.06454465, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0628936, + "balance_loss_mlp": 0.01258572, + "epoch": 0.4009619720426875, + "flos": 29067581733120.0, + "grad_norm": 1.8066450616757106, + "language_loss": 0.82171971, + "learning_rate": 2.721938558257248e-06, + "loss": 0.89899051, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14050293, + "step": 6669, + "time_per_iteration": 2.614875555038452 + }, + { + "auxiliary_loss_clip": 0.06349576, + "auxiliary_loss_mlp": 0.01259788, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01257549, + "epoch": 0.4010220952953555, + "flos": 66080347136640.0, + "grad_norm": 0.6837113267664942, + "language_loss": 0.53268963, + "learning_rate": 2.721575341289695e-06, + "loss": 0.60878325, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.02243042, + "step": 6670, + "time_per_iteration": 3.2985219955444336 + }, + { + "auxiliary_loss_clip": 0.06453651, + "auxiliary_loss_mlp": 0.01274966, + "balance_loss_clip": 0.06286684, + "balance_loss_mlp": 0.01260405, + "epoch": 0.40108221854802345, + "flos": 29650519388160.0, + "grad_norm": 1.6370315093264123, + "language_loss": 0.88528681, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.96257305, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14556885, + "step": 6671, + "time_per_iteration": 2.6268246173858643 + }, + { + "auxiliary_loss_clip": 0.06460971, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06289764, + "balance_loss_mlp": 0.01256861, + "epoch": 0.4011423418006914, + "flos": 19934286693120.0, + "grad_norm": 1.7015153377224497, + "language_loss": 0.78868973, + "learning_rate": 2.720848825281736e-06, + "loss": 0.86602008, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.1519165, + "step": 6672, + "time_per_iteration": 2.4949698448181152 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06290099, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4012024650533594, + "flos": 20090523830400.0, + "grad_norm": 2.076088840896174, + "language_loss": 0.63474464, + "learning_rate": 2.72048552626888e-06, + "loss": 0.71204633, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1418457, + "step": 6673, + "time_per_iteration": 2.644050121307373 + }, + { + "auxiliary_loss_clip": 0.06458048, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01259827, + "epoch": 0.40126258830602735, + "flos": 21703224637440.0, + "grad_norm": 1.4478595936596839, + "language_loss": 0.80581552, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.88313353, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.13903809, + "step": 6674, + "time_per_iteration": 2.559034824371338 + }, + { + "auxiliary_loss_clip": 0.0646532, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_clip": 0.06289816, + "balance_loss_mlp": 0.01258269, + "epoch": 0.4013227115586953, + "flos": 12025160277120.0, + "grad_norm": 2.4455561687367195, + "language_loss": 0.82561237, + "learning_rate": 2.719758846294294e-06, + "loss": 0.90299457, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14624023, + "step": 6675, + "time_per_iteration": 2.5448951721191406 + }, + { + "auxiliary_loss_clip": 0.06465134, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01254106, + "epoch": 0.4013828348113633, + "flos": 25454612257920.0, + "grad_norm": 1.6408733853472015, + "language_loss": 0.93777156, + "learning_rate": 2.71939546536012e-06, + "loss": 1.01511002, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14581299, + "step": 6676, + "time_per_iteration": 2.5721349716186523 + }, + { + "auxiliary_loss_clip": 0.06469207, + "auxiliary_loss_mlp": 0.01274451, + "balance_loss_clip": 0.06291738, + "balance_loss_mlp": 0.01258274, + "epoch": 0.40144295806403124, + "flos": 18588009542400.0, + "grad_norm": 2.5026106137632222, + "language_loss": 0.80060673, + "learning_rate": 2.719032057146399e-06, + "loss": 0.87804335, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16186523, + "step": 6677, + "time_per_iteration": 2.5438191890716553 + }, + { + "auxiliary_loss_clip": 0.06455022, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.0628567, + "balance_loss_mlp": 0.01256934, + "epoch": 0.4015030813166992, + "flos": 22936925427840.0, + "grad_norm": 1.8567640541952835, + "language_loss": 0.83925951, + "learning_rate": 2.71866862166691e-06, + "loss": 0.9165169, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13793945, + "step": 6678, + "time_per_iteration": 2.5458457469940186 + }, + { + "auxiliary_loss_clip": 0.06455562, + "auxiliary_loss_mlp": 0.0127344, + "balance_loss_clip": 0.06287661, + "balance_loss_mlp": 0.01258325, + "epoch": 0.4015632045693672, + "flos": 20601359447040.0, + "grad_norm": 2.2595275456436767, + "language_loss": 0.6400671, + "learning_rate": 2.718305158935434e-06, + "loss": 0.7173571, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15124512, + "step": 6679, + "time_per_iteration": 2.553312063217163 + }, + { + "auxiliary_loss_clip": 0.0645475, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.06285992, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4016233278220352, + "flos": 23445371203200.0, + "grad_norm": 1.525723625053638, + "language_loss": 0.78686285, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.86411297, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14001465, + "step": 6680, + "time_per_iteration": 2.5376389026641846 + }, + { + "auxiliary_loss_clip": 0.0646753, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06289258, + "balance_loss_mlp": 0.01258008, + "epoch": 0.40168345107470316, + "flos": 21436968689280.0, + "grad_norm": 1.5038657697958466, + "language_loss": 0.76059246, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.83800501, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15710449, + "step": 6681, + "time_per_iteration": 2.532668352127075 + }, + { + "auxiliary_loss_clip": 0.06461542, + "auxiliary_loss_mlp": 0.01268459, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01254285, + "epoch": 0.4017435743273711, + "flos": 22863900994560.0, + "grad_norm": 2.212326324471445, + "language_loss": 0.6446861, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.72198606, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1416626, + "step": 6682, + "time_per_iteration": 2.585963010787964 + }, + { + "auxiliary_loss_clip": 0.06452938, + "auxiliary_loss_mlp": 0.01271302, + "balance_loss_clip": 0.06279296, + "balance_loss_mlp": 0.01257288, + "epoch": 0.4018036975800391, + "flos": 28630022112000.0, + "grad_norm": 1.839007150843812, + "language_loss": 0.73340857, + "learning_rate": 2.716851035765337e-06, + "loss": 0.81065094, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14013672, + "step": 6683, + "time_per_iteration": 2.5977652072906494 + }, + { + "auxiliary_loss_clip": 0.06452199, + "auxiliary_loss_mlp": 0.01270902, + "balance_loss_clip": 0.0628196, + "balance_loss_mlp": 0.01257252, + "epoch": 0.40186382083270705, + "flos": 26658446267520.0, + "grad_norm": 1.545951486041889, + "language_loss": 0.73326242, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.81049347, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13671875, + "step": 6684, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06341122, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06265609, + "balance_loss_mlp": 0.01260683, + "epoch": 0.401923944085375, + "flos": 59277167562240.0, + "grad_norm": 0.7966859396902427, + "language_loss": 0.60515714, + "learning_rate": 2.716123811026767e-06, + "loss": 0.68120408, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.02891541, + "step": 6685, + "time_per_iteration": 3.2738587856292725 + }, + { + "auxiliary_loss_clip": 0.06456321, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 0.06278493, + "balance_loss_mlp": 0.01255291, + "epoch": 0.401984067338043, + "flos": 16988473825920.0, + "grad_norm": 1.7615677724791905, + "language_loss": 0.70125616, + "learning_rate": 2.715760157917357e-06, + "loss": 0.77851576, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14343262, + "step": 6686, + "time_per_iteration": 2.565185070037842 + }, + { + "auxiliary_loss_clip": 0.06450202, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125554, + "epoch": 0.40204419059071095, + "flos": 24979387426560.0, + "grad_norm": 1.3440220766592053, + "language_loss": 0.74867636, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.82586932, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13549805, + "step": 6687, + "time_per_iteration": 2.6009433269500732 + }, + { + "auxiliary_loss_clip": 0.06451625, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06281097, + "balance_loss_mlp": 0.01261164, + "epoch": 0.4021043138433789, + "flos": 23484252297600.0, + "grad_norm": 1.7565801002117698, + "language_loss": 0.71198428, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.78925073, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13873291, + "step": 6688, + "time_per_iteration": 3.9550609588623047 + }, + { + "auxiliary_loss_clip": 0.06455014, + "auxiliary_loss_mlp": 0.01278979, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263506, + "epoch": 0.4021644370960469, + "flos": 26003155011840.0, + "grad_norm": 1.6503070586239919, + "language_loss": 0.64854121, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.7258811, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15478516, + "step": 6689, + "time_per_iteration": 2.552058458328247 + }, + { + "auxiliary_loss_clip": 0.06450799, + "auxiliary_loss_mlp": 0.01267992, + "balance_loss_clip": 0.06276366, + "balance_loss_mlp": 0.0125417, + "epoch": 0.40222456034871484, + "flos": 13592816714880.0, + "grad_norm": 1.9543405887805447, + "language_loss": 0.73594153, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.81312943, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13824463, + "step": 6690, + "time_per_iteration": 2.5484251976013184 + }, + { + "auxiliary_loss_clip": 0.06448495, + "auxiliary_loss_mlp": 0.0127057, + "balance_loss_clip": 0.06278096, + "balance_loss_mlp": 0.01256682, + "epoch": 0.4022846836013828, + "flos": 24284586170880.0, + "grad_norm": 1.722227920192768, + "language_loss": 0.74861401, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.82580471, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13903809, + "step": 6691, + "time_per_iteration": 3.9708051681518555 + }, + { + "auxiliary_loss_clip": 0.06451076, + "auxiliary_loss_mlp": 0.01277672, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01262151, + "epoch": 0.40234480685405083, + "flos": 20156881864320.0, + "grad_norm": 1.7761891830354823, + "language_loss": 0.72677463, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.80406213, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15515137, + "step": 6692, + "time_per_iteration": 2.5179357528686523 + }, + { + "auxiliary_loss_clip": 0.06447224, + "auxiliary_loss_mlp": 0.01270814, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.0125664, + "epoch": 0.4024049301067188, + "flos": 22936925427840.0, + "grad_norm": 1.7625804596819372, + "language_loss": 0.8401857, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.91736615, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1418457, + "step": 6693, + "time_per_iteration": 2.707941770553589 + }, + { + "auxiliary_loss_clip": 0.06452498, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06281643, + "balance_loss_mlp": 0.01252865, + "epoch": 0.40246505335938676, + "flos": 36037285297920.0, + "grad_norm": 1.8844808694168769, + "language_loss": 0.70966387, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.78685182, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13433838, + "step": 6694, + "time_per_iteration": 2.637481927871704 + }, + { + "auxiliary_loss_clip": 0.06444509, + "auxiliary_loss_mlp": 0.01272964, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01258552, + "epoch": 0.4025251766120547, + "flos": 20600478979200.0, + "grad_norm": 1.9746374404018712, + "language_loss": 0.68475246, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.76192719, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14428711, + "step": 6695, + "time_per_iteration": 3.9740405082702637 + }, + { + "auxiliary_loss_clip": 0.06446315, + "auxiliary_loss_mlp": 0.01270396, + "balance_loss_clip": 0.06276862, + "balance_loss_mlp": 0.01256484, + "epoch": 0.4025852998647227, + "flos": 64537582890240.0, + "grad_norm": 2.0865884556399363, + "language_loss": 0.79765463, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.87482178, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13897705, + "step": 6696, + "time_per_iteration": 3.0413708686828613 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.01269123, + "balance_loss_clip": 0.06281278, + "balance_loss_mlp": 0.01254473, + "epoch": 0.40264542311739066, + "flos": 20892534785280.0, + "grad_norm": 1.7976365729577468, + "language_loss": 0.71608603, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.79331958, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14660645, + "step": 6697, + "time_per_iteration": 2.5200350284576416 + }, + { + "auxiliary_loss_clip": 0.06445032, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06276169, + "balance_loss_mlp": 0.0125658, + "epoch": 0.4027055463700586, + "flos": 26257419826560.0, + "grad_norm": 1.9918981514977272, + "language_loss": 0.61230171, + "learning_rate": 2.711394207496984e-06, + "loss": 0.68945277, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13464355, + "step": 6698, + "time_per_iteration": 2.576472520828247 + }, + { + "auxiliary_loss_clip": 0.06449181, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06276856, + "balance_loss_mlp": 0.01252849, + "epoch": 0.4027656696227266, + "flos": 20637682992000.0, + "grad_norm": 2.0070875825685266, + "language_loss": 0.77479243, + "learning_rate": 2.711030202621491e-06, + "loss": 0.85195273, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14001465, + "step": 6699, + "time_per_iteration": 3.937375545501709 + }, + { + "auxiliary_loss_clip": 0.0644554, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01253977, + "epoch": 0.40282579287539455, + "flos": 22352855742720.0, + "grad_norm": 1.735185416550665, + "language_loss": 0.80698907, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.88412201, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13793945, + "step": 6700, + "time_per_iteration": 2.535510540008545 + }, + { + "auxiliary_loss_clip": 0.06459837, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01253157, + "epoch": 0.4028859161280625, + "flos": 29282126912640.0, + "grad_norm": 1.7653471156752092, + "language_loss": 0.74938649, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.82666814, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.1517334, + "step": 6701, + "time_per_iteration": 2.6509363651275635 + }, + { + "auxiliary_loss_clip": 0.06451308, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01257329, + "epoch": 0.4029460393807305, + "flos": 28630022112000.0, + "grad_norm": 1.48917022125432, + "language_loss": 0.66283298, + "learning_rate": 2.709938026276208e-06, + "loss": 0.74005556, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13641357, + "step": 6702, + "time_per_iteration": 2.6183536052703857 + }, + { + "auxiliary_loss_clip": 0.06460792, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06286055, + "balance_loss_mlp": 0.01259117, + "epoch": 0.40300616263339845, + "flos": 22608588003840.0, + "grad_norm": 1.5996325972429297, + "language_loss": 0.66632348, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.74367112, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.14849854, + "step": 6703, + "time_per_iteration": 2.583040237426758 + }, + { + "auxiliary_loss_clip": 0.06456298, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.012584, + "epoch": 0.4030662858860664, + "flos": 25527385128960.0, + "grad_norm": 1.7345540067512994, + "language_loss": 0.82398093, + "learning_rate": 2.709209774085071e-06, + "loss": 0.90127754, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14959717, + "step": 6704, + "time_per_iteration": 2.564052104949951 + }, + { + "auxiliary_loss_clip": 0.06457714, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01258332, + "epoch": 0.40312640913873443, + "flos": 23593474494720.0, + "grad_norm": 1.6434462448941187, + "language_loss": 0.73919153, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.81649286, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 6705, + "time_per_iteration": 2.609738349914551 + }, + { + "auxiliary_loss_clip": 0.06450006, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06282469, + "balance_loss_mlp": 0.01256481, + "epoch": 0.4031865323914024, + "flos": 20017205907840.0, + "grad_norm": 1.6242014521871173, + "language_loss": 0.66795284, + "learning_rate": 2.708481414320713e-06, + "loss": 0.74515378, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1361084, + "step": 6706, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.06452154, + "auxiliary_loss_mlp": 0.01268976, + "balance_loss_clip": 0.06282388, + "balance_loss_mlp": 0.0125513, + "epoch": 0.40324665564407036, + "flos": 21877840546560.0, + "grad_norm": 1.6449246324910813, + "language_loss": 0.71481538, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.79202664, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13842773, + "step": 6707, + "time_per_iteration": 2.5762581825256348 + }, + { + "auxiliary_loss_clip": 0.0644149, + "auxiliary_loss_mlp": 0.01271296, + "balance_loss_clip": 0.06278867, + "balance_loss_mlp": 0.01258379, + "epoch": 0.4033067788967383, + "flos": 23885572227840.0, + "grad_norm": 1.6148090336243837, + "language_loss": 0.80062628, + "learning_rate": 2.707752947093611e-06, + "loss": 0.87775409, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12908936, + "step": 6708, + "time_per_iteration": 2.5509586334228516 + }, + { + "auxiliary_loss_clip": 0.06459241, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.0628079, + "balance_loss_mlp": 0.01256133, + "epoch": 0.4033669021494063, + "flos": 17425530322560.0, + "grad_norm": 2.5431099630067435, + "language_loss": 0.8334195, + "learning_rate": 2.70738867321606e-06, + "loss": 0.91072428, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15100098, + "step": 6709, + "time_per_iteration": 2.5844790935516357 + }, + { + "auxiliary_loss_clip": 0.06454608, + "auxiliary_loss_mlp": 0.01274744, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01259211, + "epoch": 0.40342702540207426, + "flos": 29607277881600.0, + "grad_norm": 1.5307534200842645, + "language_loss": 0.71642667, + "learning_rate": 2.70702437251426e-06, + "loss": 0.79372019, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15527344, + "step": 6710, + "time_per_iteration": 2.5950214862823486 + }, + { + "auxiliary_loss_clip": 0.06448973, + "auxiliary_loss_mlp": 0.01270551, + "balance_loss_clip": 0.06280518, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4034871486547422, + "flos": 11288249544960.0, + "grad_norm": 5.632076524924719, + "language_loss": 0.85771239, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.93490767, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1451416, + "step": 6711, + "time_per_iteration": 2.530691146850586 + }, + { + "auxiliary_loss_clip": 0.06457499, + "auxiliary_loss_mlp": 0.01273198, + "balance_loss_clip": 0.0628542, + "balance_loss_mlp": 0.01258732, + "epoch": 0.4035472719074102, + "flos": 15557097254400.0, + "grad_norm": 2.360012043566648, + "language_loss": 0.76516247, + "learning_rate": 2.706295690693168e-06, + "loss": 0.84246945, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14471436, + "step": 6712, + "time_per_iteration": 2.485973358154297 + }, + { + "auxiliary_loss_clip": 0.06453355, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06282951, + "balance_loss_mlp": 0.01256249, + "epoch": 0.40360739516007815, + "flos": 24680162096640.0, + "grad_norm": 2.2673991582834803, + "language_loss": 0.80280489, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.88004464, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14379883, + "step": 6713, + "time_per_iteration": 2.604844093322754 + }, + { + "auxiliary_loss_clip": 0.06452335, + "auxiliary_loss_mlp": 0.01272867, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01258824, + "epoch": 0.4036675184127461, + "flos": 17308635477120.0, + "grad_norm": 2.487123438751718, + "language_loss": 0.88458717, + "learning_rate": 2.705566901740865e-06, + "loss": 0.9618392, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14038086, + "step": 6714, + "time_per_iteration": 2.4827568531036377 + }, + { + "auxiliary_loss_clip": 0.06454237, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06281483, + "balance_loss_mlp": 0.01254011, + "epoch": 0.4037276416654141, + "flos": 19869983084160.0, + "grad_norm": 1.5212273970247687, + "language_loss": 0.69752967, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.77475452, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14233398, + "step": 6715, + "time_per_iteration": 2.5602893829345703 + }, + { + "auxiliary_loss_clip": 0.06458366, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.0125541, + "epoch": 0.40378776491808205, + "flos": 18302158938240.0, + "grad_norm": 1.8718399277124913, + "language_loss": 0.78095776, + "learning_rate": 2.704838005767892e-06, + "loss": 0.85823905, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14367676, + "step": 6716, + "time_per_iteration": 2.4911210536956787 + }, + { + "auxiliary_loss_clip": 0.06449929, + "auxiliary_loss_mlp": 0.01275524, + "balance_loss_clip": 0.0628348, + "balance_loss_mlp": 0.01262185, + "epoch": 0.40384788817075, + "flos": 15054772826880.0, + "grad_norm": 1.8985450182353327, + "language_loss": 0.76491797, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.8421725, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 6717, + "time_per_iteration": 2.5457956790924072 + }, + { + "auxiliary_loss_clip": 0.0634857, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01254165, + "epoch": 0.40390801142341803, + "flos": 61948659761280.0, + "grad_norm": 0.8842261639057883, + "language_loss": 0.60140264, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.67745787, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02790833, + "step": 6718, + "time_per_iteration": 2.9733822345733643 + }, + { + "auxiliary_loss_clip": 0.06457312, + "auxiliary_loss_mlp": 0.0127584, + "balance_loss_clip": 0.06279647, + "balance_loss_mlp": 0.01260366, + "epoch": 0.403968134676086, + "flos": 22743945475200.0, + "grad_norm": 1.799198719667369, + "language_loss": 0.75286412, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.83019567, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15490723, + "step": 6719, + "time_per_iteration": 2.5417115688323975 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01269672, + "balance_loss_clip": 0.06282561, + "balance_loss_mlp": 0.01254592, + "epoch": 0.40402825792875396, + "flos": 19789244075520.0, + "grad_norm": 2.1951890128687257, + "language_loss": 0.81351668, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.89075512, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15100098, + "step": 6720, + "time_per_iteration": 2.4906880855560303 + }, + { + "auxiliary_loss_clip": 0.06453006, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01254742, + "epoch": 0.40408838118142193, + "flos": 19615298999040.0, + "grad_norm": 1.8273574705972042, + "language_loss": 0.77227581, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.84950233, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14904785, + "step": 6721, + "time_per_iteration": 2.5645196437835693 + }, + { + "auxiliary_loss_clip": 0.06447627, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06279851, + "balance_loss_mlp": 0.01251931, + "epoch": 0.4041485044340899, + "flos": 24432982951680.0, + "grad_norm": 1.7503779333013576, + "language_loss": 0.72784024, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.80496466, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.12884521, + "step": 6722, + "time_per_iteration": 2.5520758628845215 + }, + { + "auxiliary_loss_clip": 0.06450947, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01256972, + "epoch": 0.40420862768675786, + "flos": 16765207822080.0, + "grad_norm": 1.6533819858806273, + "language_loss": 0.65986466, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.73707551, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13165283, + "step": 6723, + "time_per_iteration": 2.5385141372680664 + }, + { + "auxiliary_loss_clip": 0.06456833, + "auxiliary_loss_mlp": 0.01276273, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01262039, + "epoch": 0.4042687509394258, + "flos": 22498066068480.0, + "grad_norm": 1.4281101192387737, + "language_loss": 0.74082482, + "learning_rate": 2.701921353880734e-06, + "loss": 0.81815588, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14227295, + "step": 6724, + "time_per_iteration": 2.5705087184906006 + }, + { + "auxiliary_loss_clip": 0.06445859, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06280795, + "balance_loss_mlp": 0.01256226, + "epoch": 0.4043288741920938, + "flos": 30343978978560.0, + "grad_norm": 1.716107680872733, + "language_loss": 0.75255632, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.8297112, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13409424, + "step": 6725, + "time_per_iteration": 2.6433653831481934 + }, + { + "auxiliary_loss_clip": 0.06451583, + "auxiliary_loss_mlp": 0.01271794, + "balance_loss_clip": 0.06282748, + "balance_loss_mlp": 0.01257054, + "epoch": 0.40438899744476176, + "flos": 46357978947840.0, + "grad_norm": 1.593616701788039, + "language_loss": 0.77198207, + "learning_rate": 2.701191924463126e-06, + "loss": 0.84921581, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14733887, + "step": 6726, + "time_per_iteration": 2.8469409942626953 + }, + { + "auxiliary_loss_clip": 0.06452948, + "auxiliary_loss_mlp": 0.0127047, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01256058, + "epoch": 0.4044491206974297, + "flos": 13338468046080.0, + "grad_norm": 2.072990787427281, + "language_loss": 0.82297921, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.90021348, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14404297, + "step": 6727, + "time_per_iteration": 2.5381619930267334 + }, + { + "auxiliary_loss_clip": 0.06453642, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06281026, + "balance_loss_mlp": 0.01252413, + "epoch": 0.4045092439500977, + "flos": 12098603980800.0, + "grad_norm": 2.0199249210029055, + "language_loss": 0.86119437, + "learning_rate": 2.700462388688447e-06, + "loss": 0.93839324, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13830566, + "step": 6728, + "time_per_iteration": 3.903547763824463 + }, + { + "auxiliary_loss_clip": 0.06450571, + "auxiliary_loss_mlp": 0.01275259, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01260567, + "epoch": 0.40456936720276565, + "flos": 21186225745920.0, + "grad_norm": 1.6307737524107195, + "language_loss": 0.82346553, + "learning_rate": 2.700097580951786e-06, + "loss": 0.90072381, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14697266, + "step": 6729, + "time_per_iteration": 2.5673158168792725 + }, + { + "auxiliary_loss_clip": 0.06454299, + "auxiliary_loss_mlp": 0.01268394, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4046294904554336, + "flos": 23922147335040.0, + "grad_norm": 1.7857320211804986, + "language_loss": 0.73840159, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.81562853, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14630127, + "step": 6730, + "time_per_iteration": 4.11122727394104 + }, + { + "auxiliary_loss_clip": 0.0645189, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06282154, + "balance_loss_mlp": 0.01254767, + "epoch": 0.4046896137081016, + "flos": 38080376202240.0, + "grad_norm": 1.7383158082611918, + "language_loss": 0.67290312, + "learning_rate": 2.699367885848985e-06, + "loss": 0.75010884, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13922119, + "step": 6731, + "time_per_iteration": 2.8046634197235107 + }, + { + "auxiliary_loss_clip": 0.06450266, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.0628126, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4047497369607696, + "flos": 23623047786240.0, + "grad_norm": 1.7716081402001673, + "language_loss": 0.74489558, + "learning_rate": 2.699002998510517e-06, + "loss": 0.8220998, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13977051, + "step": 6732, + "time_per_iteration": 2.608191728591919 + }, + { + "auxiliary_loss_clip": 0.06450449, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.06283008, + "balance_loss_mlp": 0.01255978, + "epoch": 0.40480986021343757, + "flos": 12828596751360.0, + "grad_norm": 1.6538752037468725, + "language_loss": 0.77253687, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.84973502, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13391113, + "step": 6733, + "time_per_iteration": 2.525399923324585 + }, + { + "auxiliary_loss_clip": 0.06457898, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.06280859, + "balance_loss_mlp": 0.01255176, + "epoch": 0.40486998346610553, + "flos": 23775511489920.0, + "grad_norm": 4.637374264151728, + "language_loss": 0.76891112, + "learning_rate": 2.698273144328627e-06, + "loss": 0.84619832, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15661621, + "step": 6734, + "time_per_iteration": 4.040409564971924 + }, + { + "auxiliary_loss_clip": 0.06455547, + "auxiliary_loss_mlp": 0.01267949, + "balance_loss_clip": 0.0627891, + "balance_loss_mlp": 0.0125421, + "epoch": 0.4049301067187735, + "flos": 22863439797120.0, + "grad_norm": 2.24732512167567, + "language_loss": 0.64935613, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.72659111, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.13745117, + "step": 6735, + "time_per_iteration": 2.5326993465423584 + }, + { + "auxiliary_loss_clip": 0.06448689, + "auxiliary_loss_mlp": 0.01271873, + "balance_loss_clip": 0.06279301, + "balance_loss_mlp": 0.01258849, + "epoch": 0.40499022997144146, + "flos": 22790624999040.0, + "grad_norm": 1.962844708798157, + "language_loss": 0.83769405, + "learning_rate": 2.697543184232387e-06, + "loss": 0.91489971, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13024902, + "step": 6736, + "time_per_iteration": 2.5863215923309326 + }, + { + "auxiliary_loss_clip": 0.06454039, + "auxiliary_loss_mlp": 0.01271412, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01256832, + "epoch": 0.4050503532241094, + "flos": 23046021843840.0, + "grad_norm": 1.714368942149708, + "language_loss": 0.75428641, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.83154088, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14569092, + "step": 6737, + "time_per_iteration": 2.6163716316223145 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01271121, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01257644, + "epoch": 0.4051104764767774, + "flos": 16652254118400.0, + "grad_norm": 4.810644037565116, + "language_loss": 0.72306561, + "learning_rate": 2.696813118332519e-06, + "loss": 0.80026174, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13470459, + "step": 6738, + "time_per_iteration": 4.0618274211883545 + }, + { + "auxiliary_loss_clip": 0.06449332, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.06280854, + "balance_loss_mlp": 0.01257399, + "epoch": 0.40517059972944536, + "flos": 16363929818880.0, + "grad_norm": 1.8147061411614016, + "language_loss": 0.75123262, + "learning_rate": 2.696448045740828e-06, + "loss": 0.82842994, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13000488, + "step": 6739, + "time_per_iteration": 2.489001512527466 + }, + { + "auxiliary_loss_clip": 0.06454495, + "auxiliary_loss_mlp": 0.0126968, + "balance_loss_clip": 0.06282163, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4052307229821133, + "flos": 28810885150080.0, + "grad_norm": 1.87280601387568, + "language_loss": 0.74278009, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.82002187, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14257812, + "step": 6740, + "time_per_iteration": 2.616560220718384 + }, + { + "auxiliary_loss_clip": 0.0644789, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01257076, + "epoch": 0.4052908462347813, + "flos": 21404334723840.0, + "grad_norm": 1.6527814212000655, + "language_loss": 0.77083528, + "learning_rate": 2.695717821343153e-06, + "loss": 0.84802353, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1385498, + "step": 6741, + "time_per_iteration": 2.5236477851867676 + }, + { + "auxiliary_loss_clip": 0.06449165, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 0.06278783, + "balance_loss_mlp": 0.01259606, + "epoch": 0.40535096948744925, + "flos": 22425628613760.0, + "grad_norm": 1.6285650306233073, + "language_loss": 0.7166388, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.79387373, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1472168, + "step": 6742, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.06454468, + "auxiliary_loss_mlp": 0.01273335, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01258016, + "epoch": 0.4054110927401172, + "flos": 17015028370560.0, + "grad_norm": 2.751799665484638, + "language_loss": 0.73206228, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.80934024, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15332031, + "step": 6743, + "time_per_iteration": 2.519907236099243 + }, + { + "auxiliary_loss_clip": 0.0645441, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06280394, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4054712159927852, + "flos": 21621018182400.0, + "grad_norm": 2.0068914143371623, + "language_loss": 0.7128458, + "learning_rate": 2.694622286918588e-06, + "loss": 0.79011208, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14733887, + "step": 6744, + "time_per_iteration": 2.641242742538452 + }, + { + "auxiliary_loss_clip": 0.06447047, + "auxiliary_loss_mlp": 0.01269556, + "balance_loss_clip": 0.06280165, + "balance_loss_mlp": 0.01255722, + "epoch": 0.4055313392454532, + "flos": 25819734424320.0, + "grad_norm": 1.5431481906112547, + "language_loss": 0.80460721, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.88177323, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13830566, + "step": 6745, + "time_per_iteration": 2.563445806503296 + }, + { + "auxiliary_loss_clip": 0.06450857, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06282623, + "balance_loss_mlp": 0.01255009, + "epoch": 0.40559146249812117, + "flos": 14142323790720.0, + "grad_norm": 1.9690336991849304, + "language_loss": 0.67176485, + "learning_rate": 2.693891798911731e-06, + "loss": 0.74896801, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14465332, + "step": 6746, + "time_per_iteration": 2.532186508178711 + }, + { + "auxiliary_loss_clip": 0.064533, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06283557, + "balance_loss_mlp": 0.01253272, + "epoch": 0.40565158575078913, + "flos": 41365259815680.0, + "grad_norm": 1.4380414737187444, + "language_loss": 0.57222033, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.64941883, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1328125, + "step": 6747, + "time_per_iteration": 2.7487149238586426 + }, + { + "auxiliary_loss_clip": 0.06454123, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06282702, + "balance_loss_mlp": 0.01255319, + "epoch": 0.4057117090034571, + "flos": 28551421382400.0, + "grad_norm": 2.093705794925994, + "language_loss": 0.84795344, + "learning_rate": 2.693161205655089e-06, + "loss": 0.92518532, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.13739014, + "step": 6748, + "time_per_iteration": 2.5967648029327393 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.01269749, + "balance_loss_clip": 0.06281549, + "balance_loss_mlp": 0.01254794, + "epoch": 0.40577183225612506, + "flos": 18009851569920.0, + "grad_norm": 1.9056349360303495, + "language_loss": 0.81943792, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.89667493, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14953613, + "step": 6749, + "time_per_iteration": 2.546419143676758 + }, + { + "auxiliary_loss_clip": 0.06450339, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01256762, + "epoch": 0.40583195550879303, + "flos": 19542819617280.0, + "grad_norm": 1.7354001752331154, + "language_loss": 0.75251377, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.82972294, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13824463, + "step": 6750, + "time_per_iteration": 2.633349895477295 + }, + { + "auxiliary_loss_clip": 0.06461279, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06282868, + "balance_loss_mlp": 0.01256441, + "epoch": 0.405892078761461, + "flos": 22315987146240.0, + "grad_norm": 2.3215315740209026, + "language_loss": 0.73715317, + "learning_rate": 2.692065118669195e-06, + "loss": 0.81447506, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.14459229, + "step": 6751, + "time_per_iteration": 2.579233169555664 + }, + { + "auxiliary_loss_clip": 0.06456044, + "auxiliary_loss_mlp": 0.01276434, + "balance_loss_clip": 0.06282923, + "balance_loss_mlp": 0.01261622, + "epoch": 0.40595220201412896, + "flos": 25491564708480.0, + "grad_norm": 1.5288716905414277, + "language_loss": 0.66520017, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.74252492, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14788818, + "step": 6752, + "time_per_iteration": 2.5768818855285645 + }, + { + "auxiliary_loss_clip": 0.06457777, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281942, + "balance_loss_mlp": 0.01259025, + "epoch": 0.4060123252667969, + "flos": 49867092887040.0, + "grad_norm": 1.7025851849816316, + "language_loss": 0.71210098, + "learning_rate": 2.691334262772948e-06, + "loss": 0.78942096, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15197754, + "step": 6753, + "time_per_iteration": 2.807713031768799 + }, + { + "auxiliary_loss_clip": 0.06455305, + "auxiliary_loss_mlp": 0.01268505, + "balance_loss_clip": 0.06281379, + "balance_loss_mlp": 0.01254736, + "epoch": 0.4060724485194649, + "flos": 21140720179200.0, + "grad_norm": 2.0551663576230657, + "language_loss": 0.72102135, + "learning_rate": 2.690968795494699e-06, + "loss": 0.7982595, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13763428, + "step": 6754, + "time_per_iteration": 2.5342867374420166 + }, + { + "auxiliary_loss_clip": 0.0645773, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06283537, + "balance_loss_mlp": 0.0125931, + "epoch": 0.40613257177213286, + "flos": 21763796739840.0, + "grad_norm": 1.762365568083109, + "language_loss": 0.83186102, + "learning_rate": 2.690603302014844e-06, + "loss": 0.90917671, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14520264, + "step": 6755, + "time_per_iteration": 2.6024997234344482 + }, + { + "auxiliary_loss_clip": 0.06461492, + "auxiliary_loss_mlp": 0.01268966, + "balance_loss_clip": 0.06283044, + "balance_loss_mlp": 0.01254047, + "epoch": 0.4061926950248008, + "flos": 25561863884160.0, + "grad_norm": 1.6099502444653784, + "language_loss": 0.71436989, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.79167449, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14923096, + "step": 6756, + "time_per_iteration": 2.5427916049957275 + }, + { + "auxiliary_loss_clip": 0.06455702, + "auxiliary_loss_mlp": 0.01272698, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01257726, + "epoch": 0.4062528182774688, + "flos": 23702528983680.0, + "grad_norm": 1.686471122095966, + "language_loss": 0.79134113, + "learning_rate": 2.689872236505755e-06, + "loss": 0.86862516, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14990234, + "step": 6757, + "time_per_iteration": 2.573546886444092 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01275677, + "balance_loss_clip": 0.0627944, + "balance_loss_mlp": 0.01260561, + "epoch": 0.4063129415301368, + "flos": 21732504439680.0, + "grad_norm": 1.6631673854083442, + "language_loss": 0.78665155, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.86392242, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15100098, + "step": 6758, + "time_per_iteration": 2.5283167362213135 + }, + { + "auxiliary_loss_clip": 0.06450847, + "auxiliary_loss_mlp": 0.01276876, + "balance_loss_clip": 0.06280972, + "balance_loss_mlp": 0.0126331, + "epoch": 0.40637306478280477, + "flos": 12792650549760.0, + "grad_norm": 2.0123521464099183, + "language_loss": 0.89116049, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.96843767, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13568115, + "step": 6759, + "time_per_iteration": 2.5211679935455322 + }, + { + "auxiliary_loss_clip": 0.06457647, + "auxiliary_loss_mlp": 0.01273439, + "balance_loss_clip": 0.06284226, + "balance_loss_mlp": 0.01259742, + "epoch": 0.40643318803547274, + "flos": 24031327605120.0, + "grad_norm": 2.379594130925159, + "language_loss": 0.64235389, + "learning_rate": 2.688775442076598e-06, + "loss": 0.71966481, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13690186, + "step": 6760, + "time_per_iteration": 2.546807050704956 + }, + { + "auxiliary_loss_clip": 0.0645775, + "auxiliary_loss_mlp": 0.01275543, + "balance_loss_clip": 0.06282319, + "balance_loss_mlp": 0.01260856, + "epoch": 0.4064933112881407, + "flos": 25599361386240.0, + "grad_norm": 1.4617486076979092, + "language_loss": 0.75530171, + "learning_rate": 2.688409791678193e-06, + "loss": 0.83263463, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14666748, + "step": 6761, + "time_per_iteration": 2.635345935821533 + }, + { + "auxiliary_loss_clip": 0.0645279, + "auxiliary_loss_mlp": 0.01275826, + "balance_loss_clip": 0.06285599, + "balance_loss_mlp": 0.01262183, + "epoch": 0.40655343454080867, + "flos": 22060841863680.0, + "grad_norm": 1.3772427401241372, + "language_loss": 0.70268184, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.77996796, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1362915, + "step": 6762, + "time_per_iteration": 2.5381741523742676 + }, + { + "auxiliary_loss_clip": 0.06454535, + "auxiliary_loss_mlp": 0.01269241, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01255532, + "epoch": 0.40661355779347663, + "flos": 26476115783040.0, + "grad_norm": 2.097586218934523, + "language_loss": 0.74072015, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.81795788, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13708496, + "step": 6763, + "time_per_iteration": 2.6068081855773926 + }, + { + "auxiliary_loss_clip": 0.06460483, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01257946, + "epoch": 0.4066736810461446, + "flos": 13266156372480.0, + "grad_norm": 1.6908157420926835, + "language_loss": 0.69497877, + "learning_rate": 2.687312683911033e-06, + "loss": 0.77231026, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14703369, + "step": 6764, + "time_per_iteration": 2.511901378631592 + }, + { + "auxiliary_loss_clip": 0.06461611, + "auxiliary_loss_mlp": 0.01272386, + "balance_loss_clip": 0.06284289, + "balance_loss_mlp": 0.01255995, + "epoch": 0.40673380429881256, + "flos": 28811178639360.0, + "grad_norm": 2.09874166778498, + "language_loss": 0.91354716, + "learning_rate": 2.686946929177557e-06, + "loss": 0.99088717, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16381836, + "step": 6765, + "time_per_iteration": 2.614131450653076 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_clip": 0.06289016, + "balance_loss_mlp": 0.01256959, + "epoch": 0.4067939275514805, + "flos": 12500301254400.0, + "grad_norm": 2.6861779086384945, + "language_loss": 0.7896508, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.86703956, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14599609, + "step": 6766, + "time_per_iteration": 2.5117299556732178 + }, + { + "auxiliary_loss_clip": 0.06462067, + "auxiliary_loss_mlp": 0.01273332, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.01258306, + "epoch": 0.4068540508041485, + "flos": 18776461374720.0, + "grad_norm": 40.22612567694579, + "language_loss": 0.77094513, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.84829921, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15020752, + "step": 6767, + "time_per_iteration": 2.5433967113494873 + }, + { + "auxiliary_loss_clip": 0.06456982, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06286283, + "balance_loss_mlp": 0.01260784, + "epoch": 0.40691417405681646, + "flos": 28520506425600.0, + "grad_norm": 1.6477494711234055, + "language_loss": 0.77846849, + "learning_rate": 2.685849508738034e-06, + "loss": 0.85578549, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1394043, + "step": 6768, + "time_per_iteration": 4.049299478530884 + }, + { + "auxiliary_loss_clip": 0.06460279, + "auxiliary_loss_mlp": 0.0127197, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01258213, + "epoch": 0.4069742973094844, + "flos": 20820390819840.0, + "grad_norm": 1.9557468193178857, + "language_loss": 0.87631512, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.9536376, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13757324, + "step": 6769, + "time_per_iteration": 2.540104389190674 + }, + { + "auxiliary_loss_clip": 0.06461371, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01259504, + "epoch": 0.4070344205621524, + "flos": 21476646397440.0, + "grad_norm": 2.001246026688969, + "language_loss": 0.80859989, + "learning_rate": 2.685117765051156e-06, + "loss": 0.88594282, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13421631, + "step": 6770, + "time_per_iteration": 3.9851884841918945 + }, + { + "auxiliary_loss_clip": 0.06465216, + "auxiliary_loss_mlp": 0.01270985, + "balance_loss_clip": 0.06288273, + "balance_loss_mlp": 0.01256203, + "epoch": 0.4070945438148204, + "flos": 26836709829120.0, + "grad_norm": 1.8007492597774561, + "language_loss": 0.80221689, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.87957895, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14770508, + "step": 6771, + "time_per_iteration": 2.5747835636138916 + }, + { + "auxiliary_loss_clip": 0.06460344, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06287014, + "balance_loss_mlp": 0.01256926, + "epoch": 0.4071546670674884, + "flos": 26360478748800.0, + "grad_norm": 1.364923552922522, + "language_loss": 0.7623316, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.83964121, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13696289, + "step": 6772, + "time_per_iteration": 2.628304958343506 + }, + { + "auxiliary_loss_clip": 0.06461407, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06287165, + "balance_loss_mlp": 0.01255471, + "epoch": 0.40721479032015634, + "flos": 17901300205440.0, + "grad_norm": 1.7629352970283074, + "language_loss": 0.81345379, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.89077097, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1484375, + "step": 6773, + "time_per_iteration": 2.5225751399993896 + }, + { + "auxiliary_loss_clip": 0.06368425, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06289985, + "balance_loss_mlp": 0.01259653, + "epoch": 0.4072749135728243, + "flos": 49871522424960.0, + "grad_norm": 0.8094154348681942, + "language_loss": 0.64365125, + "learning_rate": 2.683653966031597e-06, + "loss": 0.71997166, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03961182, + "step": 6774, + "time_per_iteration": 4.446218967437744 + }, + { + "auxiliary_loss_clip": 0.06460027, + "auxiliary_loss_mlp": 0.01268161, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01254481, + "epoch": 0.40733503682549227, + "flos": 27571063011840.0, + "grad_norm": 1.7398483222375367, + "language_loss": 0.7269184, + "learning_rate": 2.683287951431446e-06, + "loss": 0.80420029, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13659668, + "step": 6775, + "time_per_iteration": 2.599534511566162 + }, + { + "auxiliary_loss_clip": 0.0645956, + "auxiliary_loss_mlp": 0.01271281, + "balance_loss_clip": 0.06285449, + "balance_loss_mlp": 0.01257328, + "epoch": 0.40739516007816023, + "flos": 22133447026560.0, + "grad_norm": 1.36694346344043, + "language_loss": 0.78053248, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.8578409, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13946533, + "step": 6776, + "time_per_iteration": 2.6111807823181152 + }, + { + "auxiliary_loss_clip": 0.06466034, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06288318, + "balance_loss_mlp": 0.01254358, + "epoch": 0.4074552833308282, + "flos": 23849080974720.0, + "grad_norm": 2.6992343713036933, + "language_loss": 0.79444098, + "learning_rate": 2.682555844513981e-06, + "loss": 0.87178552, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14056396, + "step": 6777, + "time_per_iteration": 2.6968321800231934 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01254556, + "balance_loss_clip": 0.06276868, + "balance_loss_mlp": 0.01251499, + "epoch": 0.40751540658349616, + "flos": 58019847120000.0, + "grad_norm": 0.6740608536307336, + "language_loss": 0.53006828, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.60617012, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.0305481, + "step": 6778, + "time_per_iteration": 4.5793616771698 + }, + { + "auxiliary_loss_clip": 0.0645799, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01257996, + "epoch": 0.40757552983616413, + "flos": 21220956063360.0, + "grad_norm": 2.166644010842874, + "language_loss": 0.8325671, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.90987039, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14349365, + "step": 6779, + "time_per_iteration": 2.5122289657592773 + }, + { + "auxiliary_loss_clip": 0.06459656, + "auxiliary_loss_mlp": 0.01270176, + "balance_loss_clip": 0.06286415, + "balance_loss_mlp": 0.01255752, + "epoch": 0.4076356530888321, + "flos": 26840776752000.0, + "grad_norm": 1.555798351548063, + "language_loss": 0.76392281, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.84122109, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14440918, + "step": 6780, + "time_per_iteration": 2.5635926723480225 + }, + { + "auxiliary_loss_clip": 0.06453321, + "auxiliary_loss_mlp": 0.01268481, + "balance_loss_clip": 0.06285319, + "balance_loss_mlp": 0.01255964, + "epoch": 0.40769577634150006, + "flos": 12207868104960.0, + "grad_norm": 2.3318684771465388, + "language_loss": 0.66762495, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.74484301, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.12512207, + "step": 6781, + "time_per_iteration": 2.4998953342437744 + }, + { + "auxiliary_loss_clip": 0.06457075, + "auxiliary_loss_mlp": 0.01270756, + "balance_loss_clip": 0.06285501, + "balance_loss_mlp": 0.01257005, + "epoch": 0.407755899594168, + "flos": 33663467128320.0, + "grad_norm": 1.4801990709986605, + "language_loss": 0.71833825, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.79561651, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13757324, + "step": 6782, + "time_per_iteration": 2.6407761573791504 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01265619, + "balance_loss_clip": 0.06282325, + "balance_loss_mlp": 0.01252804, + "epoch": 0.407816022846836, + "flos": 20163590190720.0, + "grad_norm": 1.6531823939859909, + "language_loss": 0.82546687, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.90268028, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.12823486, + "step": 6783, + "time_per_iteration": 2.521007776260376 + }, + { + "auxiliary_loss_clip": 0.06456424, + "auxiliary_loss_mlp": 0.01272041, + "balance_loss_clip": 0.06284439, + "balance_loss_mlp": 0.01258504, + "epoch": 0.40787614609950396, + "flos": 21185219496960.0, + "grad_norm": 3.105146861858365, + "language_loss": 0.80980694, + "learning_rate": 2.679992655730283e-06, + "loss": 0.88709158, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13549805, + "step": 6784, + "time_per_iteration": 2.555502414703369 + }, + { + "auxiliary_loss_clip": 0.06462008, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.01254888, + "epoch": 0.407936269352172, + "flos": 20526699859200.0, + "grad_norm": 1.8248584482375538, + "language_loss": 0.65994555, + "learning_rate": 2.679626382651386e-06, + "loss": 0.73727089, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15661621, + "step": 6785, + "time_per_iteration": 2.5122246742248535 + }, + { + "auxiliary_loss_clip": 0.06453374, + "auxiliary_loss_mlp": 0.01270477, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01256505, + "epoch": 0.40799639260483994, + "flos": 20124709096320.0, + "grad_norm": 2.5052548980669487, + "language_loss": 0.80350053, + "learning_rate": 2.679260083800989e-06, + "loss": 0.88073903, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13970947, + "step": 6786, + "time_per_iteration": 2.554553985595703 + }, + { + "auxiliary_loss_clip": 0.0645851, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286281, + "balance_loss_mlp": 0.01258874, + "epoch": 0.4080565158575079, + "flos": 21003853334400.0, + "grad_norm": 1.5530341827396597, + "language_loss": 0.81621969, + "learning_rate": 2.678893759192982e-06, + "loss": 0.89353013, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13665771, + "step": 6787, + "time_per_iteration": 2.536215305328369 + }, + { + "auxiliary_loss_clip": 0.06458452, + "auxiliary_loss_mlp": 0.01268932, + "balance_loss_clip": 0.0628721, + "balance_loss_mlp": 0.01255623, + "epoch": 0.40811663911017587, + "flos": 19323746317440.0, + "grad_norm": 1.9049170263972377, + "language_loss": 0.6798445, + "learning_rate": 2.678527408841255e-06, + "loss": 0.75711828, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13323975, + "step": 6788, + "time_per_iteration": 2.533457040786743 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272482, + "balance_loss_clip": 0.06284444, + "balance_loss_mlp": 0.01258952, + "epoch": 0.40817676236284384, + "flos": 40634973555840.0, + "grad_norm": 1.8916550457168047, + "language_loss": 0.66478348, + "learning_rate": 2.678161032759701e-06, + "loss": 0.74207389, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.13537598, + "step": 6789, + "time_per_iteration": 2.726292371749878 + }, + { + "auxiliary_loss_clip": 0.06456382, + "auxiliary_loss_mlp": 0.01270282, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01256383, + "epoch": 0.4082368856155118, + "flos": 20528376940800.0, + "grad_norm": 1.5670896359254076, + "language_loss": 0.61192298, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.68918967, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13885498, + "step": 6790, + "time_per_iteration": 2.5437731742858887 + }, + { + "auxiliary_loss_clip": 0.06455828, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.062863, + "balance_loss_mlp": 0.01257928, + "epoch": 0.40829700886817977, + "flos": 11430944248320.0, + "grad_norm": 3.0698605132878076, + "language_loss": 0.69964224, + "learning_rate": 2.677428203462683e-06, + "loss": 0.77691442, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13452148, + "step": 6791, + "time_per_iteration": 2.4941210746765137 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01262815, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01259486, + "epoch": 0.40835713212084773, + "flos": 67350455326080.0, + "grad_norm": 0.7295736549212738, + "language_loss": 0.59295797, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.66914248, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03335571, + "step": 6792, + "time_per_iteration": 3.153479814529419 + }, + { + "auxiliary_loss_clip": 0.06459208, + "auxiliary_loss_mlp": 0.01270498, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01256193, + "epoch": 0.4084172553735157, + "flos": 21768408714240.0, + "grad_norm": 1.6689878199369865, + "language_loss": 0.80186534, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.87916243, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14306641, + "step": 6793, + "time_per_iteration": 2.562311887741089 + }, + { + "auxiliary_loss_clip": 0.06458702, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.06283591, + "balance_loss_mlp": 0.01258237, + "epoch": 0.40847737862618366, + "flos": 27424594874880.0, + "grad_norm": 3.9059129474249, + "language_loss": 0.85597503, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.93328679, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14227295, + "step": 6794, + "time_per_iteration": 2.558554172515869 + }, + { + "auxiliary_loss_clip": 0.06457786, + "auxiliary_loss_mlp": 0.01274296, + "balance_loss_clip": 0.0628652, + "balance_loss_mlp": 0.01259991, + "epoch": 0.4085375018788516, + "flos": 18593040787200.0, + "grad_norm": 1.7852935587618148, + "language_loss": 0.80216181, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.87948263, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14306641, + "step": 6795, + "time_per_iteration": 2.540349006652832 + }, + { + "auxiliary_loss_clip": 0.06465046, + "auxiliary_loss_mlp": 0.01270762, + "balance_loss_clip": 0.0628596, + "balance_loss_mlp": 0.01255718, + "epoch": 0.4085976251315196, + "flos": 15416834319360.0, + "grad_norm": 2.647671549267762, + "language_loss": 0.70204669, + "learning_rate": 2.675595680920792e-06, + "loss": 0.77940476, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15057373, + "step": 6796, + "time_per_iteration": 2.483670711517334 + }, + { + "auxiliary_loss_clip": 0.06458762, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06285367, + "balance_loss_mlp": 0.01256558, + "epoch": 0.40865774838418756, + "flos": 21258705127680.0, + "grad_norm": 1.5727118215642113, + "language_loss": 0.78255171, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.85983676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13189697, + "step": 6797, + "time_per_iteration": 2.580595016479492 + }, + { + "auxiliary_loss_clip": 0.06459324, + "auxiliary_loss_mlp": 0.01274053, + "balance_loss_clip": 0.06286809, + "balance_loss_mlp": 0.01260183, + "epoch": 0.4087178716368556, + "flos": 13777411259520.0, + "grad_norm": 1.8045279385790254, + "language_loss": 0.86005986, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.93739361, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13885498, + "step": 6798, + "time_per_iteration": 2.525223970413208 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271081, + "balance_loss_clip": 0.06287363, + "balance_loss_mlp": 0.01258308, + "epoch": 0.40877799488952354, + "flos": 23628288666240.0, + "grad_norm": 1.532136532380416, + "language_loss": 0.84202659, + "learning_rate": 2.674495859860601e-06, + "loss": 0.91931903, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.12774658, + "step": 6799, + "time_per_iteration": 2.5898637771606445 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06284514, + "balance_loss_mlp": 0.01256695, + "epoch": 0.4088381181421915, + "flos": 20924372136960.0, + "grad_norm": 3.2861641598601516, + "language_loss": 0.83725351, + "learning_rate": 2.6741292016681e-06, + "loss": 0.91453052, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14129639, + "step": 6800, + "time_per_iteration": 2.5050573348999023 + }, + { + "auxiliary_loss_clip": 0.06460495, + "auxiliary_loss_mlp": 0.0127488, + "balance_loss_clip": 0.06284706, + "balance_loss_mlp": 0.01260324, + "epoch": 0.4088982413948595, + "flos": 13302605698560.0, + "grad_norm": 2.1402246624759225, + "language_loss": 0.74944514, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.82679886, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14532471, + "step": 6801, + "time_per_iteration": 2.546226978302002 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06286253, + "balance_loss_mlp": 0.01256358, + "epoch": 0.40895836464752744, + "flos": 15273007585920.0, + "grad_norm": 2.8712837575861316, + "language_loss": 0.80348778, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8807894, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6802, + "time_per_iteration": 2.4804327487945557 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01256813, + "epoch": 0.4090184879001954, + "flos": 14506607416320.0, + "grad_norm": 2.1610413406346147, + "language_loss": 0.7616486, + "learning_rate": 2.673029073767934e-06, + "loss": 0.83899677, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14660645, + "step": 6803, + "time_per_iteration": 2.5792553424835205 + }, + { + "auxiliary_loss_clip": 0.06459032, + "auxiliary_loss_mlp": 0.01268618, + "balance_loss_clip": 0.06286538, + "balance_loss_mlp": 0.01255017, + "epoch": 0.40907861115286337, + "flos": 13886759237760.0, + "grad_norm": 1.7652651103072021, + "language_loss": 0.79160619, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.86888266, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.1361084, + "step": 6804, + "time_per_iteration": 2.489569902420044 + }, + { + "auxiliary_loss_clip": 0.06464031, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.0125919, + "epoch": 0.40913873440553133, + "flos": 28045071959040.0, + "grad_norm": 1.8644340771163777, + "language_loss": 0.75315928, + "learning_rate": 2.672295527537998e-06, + "loss": 0.83053064, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13909912, + "step": 6805, + "time_per_iteration": 2.6142778396606445 + }, + { + "auxiliary_loss_clip": 0.06465782, + "auxiliary_loss_mlp": 0.01272786, + "balance_loss_clip": 0.06288569, + "balance_loss_mlp": 0.01257957, + "epoch": 0.4091988576581993, + "flos": 21624917397120.0, + "grad_norm": 1.7712960163929097, + "language_loss": 0.7965951, + "learning_rate": 2.671928716175804e-06, + "loss": 0.87398076, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14825439, + "step": 6806, + "time_per_iteration": 2.567579984664917 + }, + { + "auxiliary_loss_clip": 0.06464592, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01254609, + "epoch": 0.40925898091086726, + "flos": 25230381932160.0, + "grad_norm": 1.8487150493759184, + "language_loss": 0.725999, + "learning_rate": 2.671561879334007e-06, + "loss": 0.80333263, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14147949, + "step": 6807, + "time_per_iteration": 4.0469160079956055 + }, + { + "auxiliary_loss_clip": 0.06359696, + "auxiliary_loss_mlp": 0.012552, + "balance_loss_clip": 0.06279803, + "balance_loss_mlp": 0.01251397, + "epoch": 0.40931910416353523, + "flos": 68949697553280.0, + "grad_norm": 0.8076862955861985, + "language_loss": 0.5884732, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.66462219, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.03796387, + "step": 6808, + "time_per_iteration": 3.236466407775879 + }, + { + "auxiliary_loss_clip": 0.0646228, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.0125511, + "epoch": 0.4093792274162032, + "flos": 20195092126080.0, + "grad_norm": 2.068974912031903, + "language_loss": 0.54879391, + "learning_rate": 2.670828129267242e-06, + "loss": 0.62610114, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13342285, + "step": 6809, + "time_per_iteration": 4.028552055358887 + }, + { + "auxiliary_loss_clip": 0.06460767, + "auxiliary_loss_mlp": 0.01271891, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.0125805, + "epoch": 0.40943935066887116, + "flos": 25235832447360.0, + "grad_norm": 1.6877735836202645, + "language_loss": 0.83297133, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.91029787, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13830566, + "step": 6810, + "time_per_iteration": 2.5688657760620117 + }, + { + "auxiliary_loss_clip": 0.06467541, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 0.06291755, + "balance_loss_mlp": 0.01260376, + "epoch": 0.4094994739215392, + "flos": 23261531345280.0, + "grad_norm": 2.1410482965152475, + "language_loss": 0.78002244, + "learning_rate": 2.670094277448999e-06, + "loss": 0.85744703, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14526367, + "step": 6811, + "time_per_iteration": 2.5859668254852295 + }, + { + "auxiliary_loss_clip": 0.06461761, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06286068, + "balance_loss_mlp": 0.01255705, + "epoch": 0.40955959717420715, + "flos": 17387571623040.0, + "grad_norm": 1.532323288412775, + "language_loss": 0.70159924, + "learning_rate": 2.669727313417857e-06, + "loss": 0.77892125, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.1472168, + "step": 6812, + "time_per_iteration": 2.5128583908081055 + }, + { + "auxiliary_loss_clip": 0.06459609, + "auxiliary_loss_mlp": 0.01271673, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01257689, + "epoch": 0.4096197204268751, + "flos": 25089406237440.0, + "grad_norm": 1.5016829758663763, + "language_loss": 0.6657182, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13989258, + "step": 6813, + "time_per_iteration": 4.086791515350342 + }, + { + "auxiliary_loss_clip": 0.06457571, + "auxiliary_loss_mlp": 0.01273443, + "balance_loss_clip": 0.06284814, + "balance_loss_mlp": 0.01259186, + "epoch": 0.4096798436795431, + "flos": 30593841454080.0, + "grad_norm": 3.468085127477164, + "language_loss": 0.74528515, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.82259536, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14282227, + "step": 6814, + "time_per_iteration": 2.6079764366149902 + }, + { + "auxiliary_loss_clip": 0.06469103, + "auxiliary_loss_mlp": 0.0126922, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01254927, + "epoch": 0.40973996693221104, + "flos": 24140424021120.0, + "grad_norm": 2.1723549744151573, + "language_loss": 0.66418713, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.74157035, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14276123, + "step": 6815, + "time_per_iteration": 2.574538469314575 + }, + { + "auxiliary_loss_clip": 0.06459038, + "auxiliary_loss_mlp": 0.01277533, + "balance_loss_clip": 0.06290913, + "balance_loss_mlp": 0.01264116, + "epoch": 0.409800090184879, + "flos": 23995968382080.0, + "grad_norm": 1.5545179592453178, + "language_loss": 0.76523387, + "learning_rate": 2.668259203471188e-06, + "loss": 0.84259957, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.13433838, + "step": 6816, + "time_per_iteration": 2.5691564083099365 + }, + { + "auxiliary_loss_clip": 0.06462897, + "auxiliary_loss_mlp": 0.01272633, + "balance_loss_clip": 0.06288977, + "balance_loss_mlp": 0.01258834, + "epoch": 0.40986021343754697, + "flos": 16149216931200.0, + "grad_norm": 2.0573498340626957, + "language_loss": 0.82244468, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8998, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13812256, + "step": 6817, + "time_per_iteration": 3.992452621459961 + }, + { + "auxiliary_loss_clip": 0.06471414, + "auxiliary_loss_mlp": 0.0127126, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.0125556, + "epoch": 0.40992033669021494, + "flos": 24797811628800.0, + "grad_norm": 1.5933135055943601, + "language_loss": 0.80022383, + "learning_rate": 2.667524996399444e-06, + "loss": 0.87765062, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15698242, + "step": 6818, + "time_per_iteration": 2.6226916313171387 + }, + { + "auxiliary_loss_clip": 0.06458658, + "auxiliary_loss_mlp": 0.01265615, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01252609, + "epoch": 0.4099804599428829, + "flos": 29649429285120.0, + "grad_norm": 1.5014418509343528, + "language_loss": 0.66358954, + "learning_rate": 2.66715785488769e-06, + "loss": 0.74083227, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13006592, + "step": 6819, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.06472912, + "auxiliary_loss_mlp": 0.01275099, + "balance_loss_clip": 0.06290931, + "balance_loss_mlp": 0.01259566, + "epoch": 0.41004058319555087, + "flos": 24833464341120.0, + "grad_norm": 1.4779477588129932, + "language_loss": 0.85265613, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.9301362, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15527344, + "step": 6820, + "time_per_iteration": 2.5997445583343506 + }, + { + "auxiliary_loss_clip": 0.06459977, + "auxiliary_loss_mlp": 0.01274929, + "balance_loss_clip": 0.06289133, + "balance_loss_mlp": 0.01261571, + "epoch": 0.41010070644821883, + "flos": 25744278222720.0, + "grad_norm": 1.6716831778372079, + "language_loss": 0.71520668, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.79255575, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13360596, + "step": 6821, + "time_per_iteration": 2.5686511993408203 + }, + { + "auxiliary_loss_clip": 0.06462038, + "auxiliary_loss_mlp": 0.01275085, + "balance_loss_clip": 0.06288444, + "balance_loss_mlp": 0.01262037, + "epoch": 0.4101608297008868, + "flos": 22352604180480.0, + "grad_norm": 1.920651769082741, + "language_loss": 0.74875939, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.82613057, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13049316, + "step": 6822, + "time_per_iteration": 2.5453121662139893 + }, + { + "auxiliary_loss_clip": 0.0646743, + "auxiliary_loss_mlp": 0.01270606, + "balance_loss_clip": 0.06293608, + "balance_loss_mlp": 0.01256408, + "epoch": 0.41022095295355476, + "flos": 21951619666560.0, + "grad_norm": 2.1329933375936045, + "language_loss": 0.75859648, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.83597684, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14208984, + "step": 6823, + "time_per_iteration": 2.514934539794922 + }, + { + "auxiliary_loss_clip": 0.06469562, + "auxiliary_loss_mlp": 0.01272535, + "balance_loss_clip": 0.06289219, + "balance_loss_mlp": 0.01257276, + "epoch": 0.4102810762062228, + "flos": 27457312694400.0, + "grad_norm": 5.1897859223278004, + "language_loss": 0.74005461, + "learning_rate": 2.665321768127001e-06, + "loss": 0.81747556, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15258789, + "step": 6824, + "time_per_iteration": 2.645362615585327 + }, + { + "auxiliary_loss_clip": 0.06472579, + "auxiliary_loss_mlp": 0.01268406, + "balance_loss_clip": 0.06292652, + "balance_loss_mlp": 0.01253589, + "epoch": 0.41034119945889075, + "flos": 24506258947200.0, + "grad_norm": 2.0548664701913215, + "language_loss": 0.72348672, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.80089658, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14788818, + "step": 6825, + "time_per_iteration": 2.5779926776885986 + }, + { + "auxiliary_loss_clip": 0.0646458, + "auxiliary_loss_mlp": 0.01269358, + "balance_loss_clip": 0.06292018, + "balance_loss_mlp": 0.01255822, + "epoch": 0.4104013227115587, + "flos": 24359497320960.0, + "grad_norm": 2.1141131447671, + "language_loss": 0.85571408, + "learning_rate": 2.664587156721768e-06, + "loss": 0.93305349, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13537598, + "step": 6826, + "time_per_iteration": 2.556445598602295 + }, + { + "auxiliary_loss_clip": 0.06462094, + "auxiliary_loss_mlp": 0.01278764, + "balance_loss_clip": 0.0629297, + "balance_loss_mlp": 0.0126468, + "epoch": 0.4104614459642267, + "flos": 23735582219520.0, + "grad_norm": 2.6430290167775037, + "language_loss": 0.6714378, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.74884635, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14080811, + "step": 6827, + "time_per_iteration": 2.55556058883667 + }, + { + "auxiliary_loss_clip": 0.06463977, + "auxiliary_loss_mlp": 0.01267684, + "balance_loss_clip": 0.06292337, + "balance_loss_mlp": 0.01254017, + "epoch": 0.41052156921689464, + "flos": 22134620983680.0, + "grad_norm": 1.346138162541555, + "language_loss": 0.72310138, + "learning_rate": 2.663852444511689e-06, + "loss": 0.80041802, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13665771, + "step": 6828, + "time_per_iteration": 2.6050894260406494 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01275424, + "balance_loss_clip": 0.06296174, + "balance_loss_mlp": 0.01259855, + "epoch": 0.4105816924695626, + "flos": 20090607684480.0, + "grad_norm": 2.1527229818824196, + "language_loss": 0.84003794, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.91756219, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15588379, + "step": 6829, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.06466494, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.0629379, + "balance_loss_mlp": 0.01259789, + "epoch": 0.4106418157222306, + "flos": 18082540586880.0, + "grad_norm": 1.474811924806309, + "language_loss": 0.90568459, + "learning_rate": 2.663117631608206e-06, + "loss": 0.98308516, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13781738, + "step": 6830, + "time_per_iteration": 2.5749125480651855 + }, + { + "auxiliary_loss_clip": 0.06471005, + "auxiliary_loss_mlp": 0.01271813, + "balance_loss_clip": 0.06296638, + "balance_loss_mlp": 0.01257729, + "epoch": 0.41070193897489854, + "flos": 21653442512640.0, + "grad_norm": 1.8339460976388509, + "language_loss": 0.6606307, + "learning_rate": 2.662750187431268e-06, + "loss": 0.73805887, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14080811, + "step": 6831, + "time_per_iteration": 2.5448153018951416 + }, + { + "auxiliary_loss_clip": 0.06473927, + "auxiliary_loss_mlp": 0.01269964, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256613, + "epoch": 0.4107620622275665, + "flos": 26654924396160.0, + "grad_norm": 2.1106075691496766, + "language_loss": 0.69853723, + "learning_rate": 2.662382718122776e-06, + "loss": 0.77597612, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13360596, + "step": 6832, + "time_per_iteration": 2.61200213432312 + }, + { + "auxiliary_loss_clip": 0.06467804, + "auxiliary_loss_mlp": 0.01274675, + "balance_loss_clip": 0.06296351, + "balance_loss_mlp": 0.01261586, + "epoch": 0.41082218548023447, + "flos": 18740305537920.0, + "grad_norm": 3.2749058883058177, + "language_loss": 0.73955101, + "learning_rate": 2.662015223696666e-06, + "loss": 0.81697583, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13092041, + "step": 6833, + "time_per_iteration": 2.5293643474578857 + }, + { + "auxiliary_loss_clip": 0.06477401, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06301869, + "balance_loss_mlp": 0.01256334, + "epoch": 0.41088230873290243, + "flos": 22900476101760.0, + "grad_norm": 1.6362019789175348, + "language_loss": 0.72870773, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.80619049, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14532471, + "step": 6834, + "time_per_iteration": 2.5534543991088867 + }, + { + "auxiliary_loss_clip": 0.06479818, + "auxiliary_loss_mlp": 0.01271417, + "balance_loss_clip": 0.0630189, + "balance_loss_mlp": 0.01257601, + "epoch": 0.4109424319855704, + "flos": 24283370286720.0, + "grad_norm": 2.482567827780577, + "language_loss": 0.71274042, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7902528, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.13824463, + "step": 6835, + "time_per_iteration": 2.6012609004974365 + }, + { + "auxiliary_loss_clip": 0.06481166, + "auxiliary_loss_mlp": 0.012697, + "balance_loss_clip": 0.06306168, + "balance_loss_mlp": 0.01255318, + "epoch": 0.41100255523823837, + "flos": 12974100566400.0, + "grad_norm": 1.7690004377507398, + "language_loss": 0.87590879, + "learning_rate": 2.660912589851978e-06, + "loss": 0.95341742, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14373779, + "step": 6836, + "time_per_iteration": 2.5210461616516113 + }, + { + "auxiliary_loss_clip": 0.06475058, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 0.06304475, + "balance_loss_mlp": 0.01259937, + "epoch": 0.4110626784909064, + "flos": 23151806023680.0, + "grad_norm": 1.7062413123689164, + "language_loss": 0.69134921, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.76883554, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13641357, + "step": 6837, + "time_per_iteration": 2.58320689201355 + }, + { + "auxiliary_loss_clip": 0.06479225, + "auxiliary_loss_mlp": 0.01273179, + "balance_loss_clip": 0.06301909, + "balance_loss_mlp": 0.01258248, + "epoch": 0.41112280174357435, + "flos": 22754007964800.0, + "grad_norm": 1.9797600155486905, + "language_loss": 0.7565136, + "learning_rate": 2.660177375289599e-06, + "loss": 0.83403766, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1494751, + "step": 6838, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.06478335, + "auxiliary_loss_mlp": 0.01273659, + "balance_loss_clip": 0.06305958, + "balance_loss_mlp": 0.01259318, + "epoch": 0.4111829249962423, + "flos": 21108211994880.0, + "grad_norm": 2.0771476339041635, + "language_loss": 0.82403398, + "learning_rate": 2.659809730450451e-06, + "loss": 0.90155393, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14343262, + "step": 6839, + "time_per_iteration": 2.596498489379883 + }, + { + "auxiliary_loss_clip": 0.06477809, + "auxiliary_loss_mlp": 0.01273131, + "balance_loss_clip": 0.06305793, + "balance_loss_mlp": 0.01259404, + "epoch": 0.4112430482489103, + "flos": 21512005620480.0, + "grad_norm": 1.908617135949294, + "language_loss": 0.8080616, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.885571, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13726807, + "step": 6840, + "time_per_iteration": 2.575131893157959 + }, + { + "auxiliary_loss_clip": 0.06480156, + "auxiliary_loss_mlp": 0.01275329, + "balance_loss_clip": 0.06307412, + "balance_loss_mlp": 0.01262639, + "epoch": 0.41130317150157825, + "flos": 19575579363840.0, + "grad_norm": 1.874526459917051, + "language_loss": 0.67950094, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.75705582, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12701416, + "step": 6841, + "time_per_iteration": 2.5642948150634766 + }, + { + "auxiliary_loss_clip": 0.06386833, + "auxiliary_loss_mlp": 0.01258898, + "balance_loss_clip": 0.06308911, + "balance_loss_mlp": 0.01256092, + "epoch": 0.4113632947542462, + "flos": 62404541498880.0, + "grad_norm": 0.7544179812034518, + "language_loss": 0.59557825, + "learning_rate": 2.65870664586847e-06, + "loss": 0.67203557, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 6842, + "time_per_iteration": 3.2257192134857178 + }, + { + "auxiliary_loss_clip": 0.06472278, + "auxiliary_loss_mlp": 0.01271531, + "balance_loss_clip": 0.06304677, + "balance_loss_mlp": 0.01257977, + "epoch": 0.4114234180069142, + "flos": 13923879396480.0, + "grad_norm": 2.0142050293437803, + "language_loss": 0.70280814, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.78024626, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13562012, + "step": 6843, + "time_per_iteration": 2.565969944000244 + }, + { + "auxiliary_loss_clip": 0.06380486, + "auxiliary_loss_mlp": 0.01256868, + "balance_loss_clip": 0.06302112, + "balance_loss_mlp": 0.01253599, + "epoch": 0.41148354125958214, + "flos": 64948866727680.0, + "grad_norm": 0.7130365683812196, + "language_loss": 0.53645009, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.61282361, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.03274536, + "step": 6844, + "time_per_iteration": 3.16054105758667 + }, + { + "auxiliary_loss_clip": 0.06475421, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06304798, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4115436645122501, + "flos": 18733848773760.0, + "grad_norm": 1.6055019254999645, + "language_loss": 0.66105658, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.73847538, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13317871, + "step": 6845, + "time_per_iteration": 2.5785298347473145 + }, + { + "auxiliary_loss_clip": 0.06478415, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06307876, + "balance_loss_mlp": 0.01254176, + "epoch": 0.41160378776491807, + "flos": 16258439128320.0, + "grad_norm": 2.0979946916750594, + "language_loss": 0.70201457, + "learning_rate": 2.657235516795808e-06, + "loss": 0.77947497, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13446045, + "step": 6846, + "time_per_iteration": 2.510215997695923 + }, + { + "auxiliary_loss_clip": 0.06481081, + "auxiliary_loss_mlp": 0.01271315, + "balance_loss_clip": 0.06309364, + "balance_loss_mlp": 0.01257391, + "epoch": 0.41166391101758604, + "flos": 27978378508800.0, + "grad_norm": 1.4002739744354715, + "language_loss": 0.65459704, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.73212105, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6847, + "time_per_iteration": 4.048614025115967 + }, + { + "auxiliary_loss_clip": 0.06476664, + "auxiliary_loss_mlp": 0.01270454, + "balance_loss_clip": 0.06304531, + "balance_loss_mlp": 0.01256459, + "epoch": 0.411724034270254, + "flos": 34139865916800.0, + "grad_norm": 1.3666484547506623, + "language_loss": 0.7086308, + "learning_rate": 2.656499802669069e-06, + "loss": 0.78610194, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13983154, + "step": 6848, + "time_per_iteration": 4.219269037246704 + }, + { + "auxiliary_loss_clip": 0.06375948, + "auxiliary_loss_mlp": 0.01253417, + "balance_loss_clip": 0.06298448, + "balance_loss_mlp": 0.01250777, + "epoch": 0.41178415752292197, + "flos": 67945090625280.0, + "grad_norm": 0.8791919044020794, + "language_loss": 0.56300032, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.63929397, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02642822, + "step": 6849, + "time_per_iteration": 3.226757287979126 + }, + { + "auxiliary_loss_clip": 0.06472921, + "auxiliary_loss_mlp": 0.0127066, + "balance_loss_clip": 0.06303038, + "balance_loss_mlp": 0.0125707, + "epoch": 0.41184428077558993, + "flos": 34322573744640.0, + "grad_norm": 1.830210581648694, + "language_loss": 0.76533353, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.84276927, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13598633, + "step": 6850, + "time_per_iteration": 2.653665542602539 + }, + { + "auxiliary_loss_clip": 0.06475841, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06303935, + "balance_loss_mlp": 0.0125484, + "epoch": 0.41190440402825795, + "flos": 35452796342400.0, + "grad_norm": 1.6037978840830116, + "language_loss": 0.68379039, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.76123631, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13909912, + "step": 6851, + "time_per_iteration": 2.72273588180542 + }, + { + "auxiliary_loss_clip": 0.06482952, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06306773, + "balance_loss_mlp": 0.01256437, + "epoch": 0.4119645272809259, + "flos": 20856127386240.0, + "grad_norm": 2.4937650031840275, + "language_loss": 0.80344605, + "learning_rate": 2.655028075792743e-06, + "loss": 0.88100129, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.16162109, + "step": 6852, + "time_per_iteration": 2.563422679901123 + }, + { + "auxiliary_loss_clip": 0.06490047, + "auxiliary_loss_mlp": 0.01270823, + "balance_loss_clip": 0.06310906, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4120246505335939, + "flos": 27569218222080.0, + "grad_norm": 2.025784739879877, + "language_loss": 0.77943873, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.8570475, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14538574, + "step": 6853, + "time_per_iteration": 4.108957290649414 + }, + { + "auxiliary_loss_clip": 0.06493531, + "auxiliary_loss_mlp": 0.0127083, + "balance_loss_clip": 0.06310283, + "balance_loss_mlp": 0.01254618, + "epoch": 0.41208477378626185, + "flos": 37824476232960.0, + "grad_norm": 1.7138113243533049, + "language_loss": 0.66213286, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.73977649, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16223145, + "step": 6854, + "time_per_iteration": 2.706514596939087 + }, + { + "auxiliary_loss_clip": 0.06481706, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06308492, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4121448970389298, + "flos": 23447509482240.0, + "grad_norm": 1.8819465084993465, + "language_loss": 0.83935457, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.9168666, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 6855, + "time_per_iteration": 2.6131205558776855 + }, + { + "auxiliary_loss_clip": 0.06478727, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06308559, + "balance_loss_mlp": 0.01258524, + "epoch": 0.4122050202915978, + "flos": 21331813415040.0, + "grad_norm": 1.6556690578140216, + "language_loss": 0.79642534, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.87393928, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14129639, + "step": 6856, + "time_per_iteration": 2.6186776161193848 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06312534, + "balance_loss_mlp": 0.01257383, + "epoch": 0.41226514354426574, + "flos": 17311193026560.0, + "grad_norm": 2.5768867092656516, + "language_loss": 0.80543911, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.88301665, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13739014, + "step": 6857, + "time_per_iteration": 4.0222320556640625 + }, + { + "auxiliary_loss_clip": 0.06484015, + "auxiliary_loss_mlp": 0.01273092, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01259168, + "epoch": 0.4123252667969337, + "flos": 17644519768320.0, + "grad_norm": 1.8891533513627916, + "language_loss": 0.71074593, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.78831697, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13934326, + "step": 6858, + "time_per_iteration": 2.598215341567993 + }, + { + "auxiliary_loss_clip": 0.06484012, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06310833, + "balance_loss_mlp": 0.01256109, + "epoch": 0.4123853900496017, + "flos": 46435070304000.0, + "grad_norm": 1.791293678645808, + "language_loss": 0.59712768, + "learning_rate": 2.652451598005391e-06, + "loss": 0.67467248, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14361572, + "step": 6859, + "time_per_iteration": 2.818535804748535 + }, + { + "auxiliary_loss_clip": 0.0648525, + "auxiliary_loss_mlp": 0.01269281, + "balance_loss_clip": 0.06306802, + "balance_loss_mlp": 0.01255423, + "epoch": 0.41244551330226964, + "flos": 17680801386240.0, + "grad_norm": 3.190643468711074, + "language_loss": 0.73818636, + "learning_rate": 2.652083430674264e-06, + "loss": 0.81573164, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.13861084, + "step": 6860, + "time_per_iteration": 2.559460163116455 + }, + { + "auxiliary_loss_clip": 0.06473921, + "auxiliary_loss_mlp": 0.01270813, + "balance_loss_clip": 0.06301314, + "balance_loss_mlp": 0.01257706, + "epoch": 0.4125056365549376, + "flos": 18699034602240.0, + "grad_norm": 1.5713730110506565, + "language_loss": 0.74087375, + "learning_rate": 2.651715238616068e-06, + "loss": 0.81832111, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13110352, + "step": 6861, + "time_per_iteration": 2.563107967376709 + }, + { + "auxiliary_loss_clip": 0.06476536, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06306636, + "balance_loss_mlp": 0.01253425, + "epoch": 0.41256575980760557, + "flos": 17901174424320.0, + "grad_norm": 2.040837827964215, + "language_loss": 0.8021872, + "learning_rate": 2.651347021844765e-06, + "loss": 0.87962043, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13354492, + "step": 6862, + "time_per_iteration": 2.4968619346618652 + }, + { + "auxiliary_loss_clip": 0.06481781, + "auxiliary_loss_mlp": 0.01269578, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01255881, + "epoch": 0.41262588306027354, + "flos": 21987817430400.0, + "grad_norm": 2.204342418200638, + "language_loss": 0.767263, + "learning_rate": 2.650978780374318e-06, + "loss": 0.84477663, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13708496, + "step": 6863, + "time_per_iteration": 2.5787971019744873 + }, + { + "auxiliary_loss_clip": 0.06377177, + "auxiliary_loss_mlp": 0.01254592, + "balance_loss_clip": 0.06300335, + "balance_loss_mlp": 0.01252135, + "epoch": 0.41268600631294156, + "flos": 53366339243520.0, + "grad_norm": 0.6821216328900507, + "language_loss": 0.52583742, + "learning_rate": 2.650610514218691e-06, + "loss": 0.60215503, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.02455139, + "step": 6864, + "time_per_iteration": 3.1086013317108154 + }, + { + "auxiliary_loss_clip": 0.06480177, + "auxiliary_loss_mlp": 0.01271204, + "balance_loss_clip": 0.06300756, + "balance_loss_mlp": 0.01256714, + "epoch": 0.4127461295656095, + "flos": 24391586234880.0, + "grad_norm": 1.7134572277425464, + "language_loss": 0.72468507, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.80219889, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14489746, + "step": 6865, + "time_per_iteration": 2.6081020832061768 + }, + { + "auxiliary_loss_clip": 0.06375298, + "auxiliary_loss_mlp": 0.01255641, + "balance_loss_clip": 0.06298722, + "balance_loss_mlp": 0.01252579, + "epoch": 0.4128062528182775, + "flos": 71725129142400.0, + "grad_norm": 0.9099190790692077, + "language_loss": 0.66497219, + "learning_rate": 2.649873907907753e-06, + "loss": 0.74128163, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.03059387, + "step": 6866, + "time_per_iteration": 3.0357213020324707 + }, + { + "auxiliary_loss_clip": 0.06476509, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06301893, + "balance_loss_mlp": 0.01255799, + "epoch": 0.41286637607094545, + "flos": 17853362870400.0, + "grad_norm": 2.1198776843792357, + "language_loss": 0.81617618, + "learning_rate": 2.649505567780375e-06, + "loss": 0.89363438, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13500977, + "step": 6867, + "time_per_iteration": 2.6095240116119385 + }, + { + "auxiliary_loss_clip": 0.06482062, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06303717, + "balance_loss_mlp": 0.01256657, + "epoch": 0.4129264993236134, + "flos": 25555407120000.0, + "grad_norm": 2.8405529060711006, + "language_loss": 0.78333044, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.86085904, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14147949, + "step": 6868, + "time_per_iteration": 2.558155059814453 + }, + { + "auxiliary_loss_clip": 0.06374986, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06298015, + "balance_loss_mlp": 0.01251991, + "epoch": 0.4129866225762814, + "flos": 65430730759680.0, + "grad_norm": 0.8212939455862347, + "language_loss": 0.57654673, + "learning_rate": 2.64876881365164e-06, + "loss": 0.65284705, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.03051758, + "step": 6869, + "time_per_iteration": 2.9284112453460693 + }, + { + "auxiliary_loss_clip": 0.06481783, + "auxiliary_loss_mlp": 0.01277222, + "balance_loss_clip": 0.06310707, + "balance_loss_mlp": 0.01263472, + "epoch": 0.41304674582894935, + "flos": 28884622343040.0, + "grad_norm": 2.4401499988028594, + "language_loss": 0.75528967, + "learning_rate": 2.64840039967822e-06, + "loss": 0.83287978, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13763428, + "step": 6870, + "time_per_iteration": 2.6844911575317383 + }, + { + "auxiliary_loss_clip": 0.0647882, + "auxiliary_loss_mlp": 0.01278278, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0126414, + "epoch": 0.4131068690816173, + "flos": 22898379749760.0, + "grad_norm": 1.5575458850844177, + "language_loss": 0.83697838, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.91454935, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14135742, + "step": 6871, + "time_per_iteration": 2.636808156967163 + }, + { + "auxiliary_loss_clip": 0.06479517, + "auxiliary_loss_mlp": 0.0126964, + "balance_loss_clip": 0.06303998, + "balance_loss_mlp": 0.01256033, + "epoch": 0.4131669923342853, + "flos": 26071944814080.0, + "grad_norm": 2.2227773400911732, + "language_loss": 0.69246161, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.76995325, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.1361084, + "step": 6872, + "time_per_iteration": 2.6492373943328857 + }, + { + "auxiliary_loss_clip": 0.06480041, + "auxiliary_loss_mlp": 0.01273197, + "balance_loss_clip": 0.06303592, + "balance_loss_mlp": 0.01259494, + "epoch": 0.41322711558695324, + "flos": 19250554176000.0, + "grad_norm": 1.8563624048188305, + "language_loss": 0.76261687, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.84014916, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13696289, + "step": 6873, + "time_per_iteration": 2.5294342041015625 + }, + { + "auxiliary_loss_clip": 0.06480598, + "auxiliary_loss_mlp": 0.01273623, + "balance_loss_clip": 0.06302338, + "balance_loss_mlp": 0.0125958, + "epoch": 0.4132872388396212, + "flos": 22681067385600.0, + "grad_norm": 1.8281818605346505, + "language_loss": 0.83432305, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.91186529, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14031982, + "step": 6874, + "time_per_iteration": 2.6135475635528564 + }, + { + "auxiliary_loss_clip": 0.06483124, + "auxiliary_loss_mlp": 0.01273525, + "balance_loss_clip": 0.06306563, + "balance_loss_mlp": 0.01258498, + "epoch": 0.4133473620922892, + "flos": 20155246709760.0, + "grad_norm": 1.7886089381127788, + "language_loss": 0.72210878, + "learning_rate": 2.646557961279436e-06, + "loss": 0.79967523, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15020752, + "step": 6875, + "time_per_iteration": 2.535613536834717 + }, + { + "auxiliary_loss_clip": 0.06467389, + "auxiliary_loss_mlp": 0.01270264, + "balance_loss_clip": 0.06301813, + "balance_loss_mlp": 0.01257151, + "epoch": 0.41340748534495714, + "flos": 24249520437120.0, + "grad_norm": 1.4522680677637643, + "language_loss": 0.82662565, + "learning_rate": 2.646189399991154e-06, + "loss": 0.90400219, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13098145, + "step": 6876, + "time_per_iteration": 2.631683111190796 + }, + { + "auxiliary_loss_clip": 0.06476636, + "auxiliary_loss_mlp": 0.0126976, + "balance_loss_clip": 0.06298597, + "balance_loss_mlp": 0.01255198, + "epoch": 0.41346760859762516, + "flos": 14397385219200.0, + "grad_norm": 2.4272621941749044, + "language_loss": 0.65427208, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.73173606, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14556885, + "step": 6877, + "time_per_iteration": 2.5211727619171143 + }, + { + "auxiliary_loss_clip": 0.06477489, + "auxiliary_loss_mlp": 0.01272334, + "balance_loss_clip": 0.06304673, + "balance_loss_mlp": 0.0125853, + "epoch": 0.4135277318502931, + "flos": 22498569192960.0, + "grad_norm": 1.7887587996629348, + "language_loss": 0.77271414, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.85021234, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13800049, + "step": 6878, + "time_per_iteration": 2.591952085494995 + }, + { + "auxiliary_loss_clip": 0.06478719, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06303747, + "balance_loss_mlp": 0.01258525, + "epoch": 0.4135878551029611, + "flos": 22425251270400.0, + "grad_norm": 1.9381355665838014, + "language_loss": 0.8049022, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.88240814, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13354492, + "step": 6879, + "time_per_iteration": 2.565875291824341 + }, + { + "auxiliary_loss_clip": 0.06476135, + "auxiliary_loss_mlp": 0.0127254, + "balance_loss_clip": 0.06301241, + "balance_loss_mlp": 0.01258688, + "epoch": 0.41364797835562905, + "flos": 27060646665600.0, + "grad_norm": 1.8294611042748399, + "language_loss": 0.8543402, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.93182689, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13861084, + "step": 6880, + "time_per_iteration": 2.6438286304473877 + }, + { + "auxiliary_loss_clip": 0.06478438, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256258, + "epoch": 0.413708101608297, + "flos": 22974464856960.0, + "grad_norm": 2.0767525842165413, + "language_loss": 0.70694637, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.78443456, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14129639, + "step": 6881, + "time_per_iteration": 2.57663893699646 + }, + { + "auxiliary_loss_clip": 0.06468567, + "auxiliary_loss_mlp": 0.01269061, + "balance_loss_clip": 0.06300917, + "balance_loss_mlp": 0.01255978, + "epoch": 0.413768224860965, + "flos": 13339013097600.0, + "grad_norm": 1.7206029499163673, + "language_loss": 0.81694102, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.89431733, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6882, + "time_per_iteration": 2.572300672531128 + }, + { + "auxiliary_loss_clip": 0.06484764, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06306723, + "balance_loss_mlp": 0.0125776, + "epoch": 0.41382834811363295, + "flos": 20820306965760.0, + "grad_norm": 2.0204096459019176, + "language_loss": 0.69182575, + "learning_rate": 2.643608785656077e-06, + "loss": 0.76941192, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16088867, + "step": 6883, + "time_per_iteration": 2.5611510276794434 + }, + { + "auxiliary_loss_clip": 0.06472149, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06297622, + "balance_loss_mlp": 0.01255061, + "epoch": 0.4138884713663009, + "flos": 20673293777280.0, + "grad_norm": 2.0786241324697, + "language_loss": 0.75945485, + "learning_rate": 2.643240028730663e-06, + "loss": 0.83685786, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13092041, + "step": 6884, + "time_per_iteration": 2.5788567066192627 + }, + { + "auxiliary_loss_clip": 0.06477202, + "auxiliary_loss_mlp": 0.01273717, + "balance_loss_clip": 0.06298974, + "balance_loss_mlp": 0.01260008, + "epoch": 0.4139485946189689, + "flos": 29063808299520.0, + "grad_norm": 3.0401310083666444, + "language_loss": 0.76198518, + "learning_rate": 2.642871247413523e-06, + "loss": 0.83949435, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13720703, + "step": 6885, + "time_per_iteration": 2.5964529514312744 + }, + { + "auxiliary_loss_clip": 0.06475228, + "auxiliary_loss_mlp": 0.01270635, + "balance_loss_clip": 0.06299268, + "balance_loss_mlp": 0.01256187, + "epoch": 0.41400871787163684, + "flos": 24432605608320.0, + "grad_norm": 1.9051304938208142, + "language_loss": 0.70031226, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14447021, + "step": 6886, + "time_per_iteration": 4.101384878158569 + }, + { + "auxiliary_loss_clip": 0.06475122, + "auxiliary_loss_mlp": 0.01275658, + "balance_loss_clip": 0.06297341, + "balance_loss_mlp": 0.01260423, + "epoch": 0.4140688411243048, + "flos": 19470172527360.0, + "grad_norm": 1.459976196778311, + "language_loss": 0.75538456, + "learning_rate": 2.642133611660002e-06, + "loss": 0.83289236, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15234375, + "step": 6887, + "time_per_iteration": 2.5979294776916504 + }, + { + "auxiliary_loss_clip": 0.06468056, + "auxiliary_loss_mlp": 0.01273257, + "balance_loss_clip": 0.06294202, + "balance_loss_mlp": 0.0125916, + "epoch": 0.4141289643769728, + "flos": 19319008561920.0, + "grad_norm": 2.153365375528394, + "language_loss": 0.70707798, + "learning_rate": 2.641764757251592e-06, + "loss": 0.78449106, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14099121, + "step": 6888, + "time_per_iteration": 4.008386850357056 + }, + { + "auxiliary_loss_clip": 0.06466109, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.0629206, + "balance_loss_mlp": 0.0125863, + "epoch": 0.41418908762964074, + "flos": 16732448075520.0, + "grad_norm": 2.015209624353795, + "language_loss": 0.76631236, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.84370446, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14477539, + "step": 6889, + "time_per_iteration": 2.5270447731018066 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.012722, + "balance_loss_clip": 0.06294381, + "balance_loss_mlp": 0.01258628, + "epoch": 0.41424921088230876, + "flos": 25303112876160.0, + "grad_norm": 1.5878983493356928, + "language_loss": 0.80245477, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.87983751, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13568115, + "step": 6890, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.06465066, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01258592, + "epoch": 0.4143093341349767, + "flos": 20966984737920.0, + "grad_norm": 1.4631338633868025, + "language_loss": 0.74175858, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.81914544, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15014648, + "step": 6891, + "time_per_iteration": 2.5313403606414795 + }, + { + "auxiliary_loss_clip": 0.06475316, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01253283, + "epoch": 0.4143694573876447, + "flos": 22024182902400.0, + "grad_norm": 2.801103384820577, + "language_loss": 0.84378529, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.92123371, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16247559, + "step": 6892, + "time_per_iteration": 3.9777607917785645 + }, + { + "auxiliary_loss_clip": 0.06465086, + "auxiliary_loss_mlp": 0.01270368, + "balance_loss_clip": 0.06295982, + "balance_loss_mlp": 0.01257339, + "epoch": 0.41442958064031266, + "flos": 35705761418880.0, + "grad_norm": 1.735816743811137, + "language_loss": 0.70161885, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7789734, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13037109, + "step": 6893, + "time_per_iteration": 2.6909854412078857 + }, + { + "auxiliary_loss_clip": 0.06467048, + "auxiliary_loss_mlp": 0.01267192, + "balance_loss_clip": 0.0629535, + "balance_loss_mlp": 0.01253799, + "epoch": 0.4144897038929806, + "flos": 28301391198720.0, + "grad_norm": 1.3940088969507989, + "language_loss": 0.73223269, + "learning_rate": 2.639551120239279e-06, + "loss": 0.80957508, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13378906, + "step": 6894, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06476665, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.0125867, + "epoch": 0.4145498271456486, + "flos": 11651568848640.0, + "grad_norm": 2.440609351676066, + "language_loss": 0.62663507, + "learning_rate": 2.63918209577416e-06, + "loss": 0.7041353, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14697266, + "step": 6895, + "time_per_iteration": 2.471320390701294 + }, + { + "auxiliary_loss_clip": 0.0646576, + "auxiliary_loss_mlp": 0.01272394, + "balance_loss_clip": 0.06296334, + "balance_loss_mlp": 0.01258589, + "epoch": 0.41460995039831655, + "flos": 27243061004160.0, + "grad_norm": 3.24758428503537, + "language_loss": 0.70684588, + "learning_rate": 2.638813047071192e-06, + "loss": 0.78422737, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13806152, + "step": 6896, + "time_per_iteration": 2.5871524810791016 + }, + { + "auxiliary_loss_clip": 0.06475289, + "auxiliary_loss_mlp": 0.01275214, + "balance_loss_clip": 0.06299431, + "balance_loss_mlp": 0.01260164, + "epoch": 0.4146700736509845, + "flos": 25929627454080.0, + "grad_norm": 1.8920871134817128, + "language_loss": 0.73144394, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.80894893, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15057373, + "step": 6897, + "time_per_iteration": 4.0778656005859375 + }, + { + "auxiliary_loss_clip": 0.0646714, + "auxiliary_loss_mlp": 0.01271778, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01257371, + "epoch": 0.4147301969036525, + "flos": 26840441335680.0, + "grad_norm": 6.247593775216772, + "language_loss": 0.84715986, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.92454904, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14404297, + "step": 6898, + "time_per_iteration": 2.5603139400482178 + }, + { + "auxiliary_loss_clip": 0.06469397, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06293289, + "balance_loss_mlp": 0.01253678, + "epoch": 0.41479032015632045, + "flos": 20303727344640.0, + "grad_norm": 2.0378276609946098, + "language_loss": 0.74898899, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.82635784, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13812256, + "step": 6899, + "time_per_iteration": 2.53822660446167 + }, + { + "auxiliary_loss_clip": 0.06477535, + "auxiliary_loss_mlp": 0.01273796, + "balance_loss_clip": 0.06297705, + "balance_loss_mlp": 0.01258239, + "epoch": 0.4148504434089884, + "flos": 25272030211200.0, + "grad_norm": 2.0370175779228465, + "language_loss": 0.75786376, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.83537704, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15563965, + "step": 6900, + "time_per_iteration": 2.5547776222229004 + }, + { + "auxiliary_loss_clip": 0.06470095, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06298018, + "balance_loss_mlp": 0.01260057, + "epoch": 0.4149105666616564, + "flos": 12827087377920.0, + "grad_norm": 3.426788101109298, + "language_loss": 0.80153453, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.87899375, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15783691, + "step": 6901, + "time_per_iteration": 2.5724570751190186 + }, + { + "auxiliary_loss_clip": 0.06464257, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06292327, + "balance_loss_mlp": 0.01258791, + "epoch": 0.41497068991432434, + "flos": 16769526307200.0, + "grad_norm": 2.2871359145608507, + "language_loss": 0.70271528, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.78009164, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14593506, + "step": 6902, + "time_per_iteration": 2.518018960952759 + }, + { + "auxiliary_loss_clip": 0.06463319, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01255706, + "epoch": 0.4150308131669923, + "flos": 18006161990400.0, + "grad_norm": 2.0523680752477906, + "language_loss": 0.8405019, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.91784132, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14923096, + "step": 6903, + "time_per_iteration": 2.719252586364746 + }, + { + "auxiliary_loss_clip": 0.06478511, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06298795, + "balance_loss_mlp": 0.01254282, + "epoch": 0.41509093641966033, + "flos": 30052635932160.0, + "grad_norm": 2.3513516306772826, + "language_loss": 0.67960835, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.75710285, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16674805, + "step": 6904, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.06473922, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06295053, + "balance_loss_mlp": 0.01252678, + "epoch": 0.4151510596723283, + "flos": 24286892158080.0, + "grad_norm": 1.8668907258080212, + "language_loss": 0.77697861, + "learning_rate": 2.635490520350643e-06, + "loss": 0.85438967, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14508057, + "step": 6905, + "time_per_iteration": 2.6073246002197266 + }, + { + "auxiliary_loss_clip": 0.06477012, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06300149, + "balance_loss_mlp": 0.01255391, + "epoch": 0.41521118292499626, + "flos": 23482784851200.0, + "grad_norm": 2.106489831039321, + "language_loss": 0.68546331, + "learning_rate": 2.635121230039025e-06, + "loss": 0.76293135, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1439209, + "step": 6906, + "time_per_iteration": 2.5378260612487793 + }, + { + "auxiliary_loss_clip": 0.06470662, + "auxiliary_loss_mlp": 0.01269025, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01254839, + "epoch": 0.4152713061776642, + "flos": 22131728017920.0, + "grad_norm": 2.406599601104124, + "language_loss": 0.68275452, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.76015139, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14196777, + "step": 6907, + "time_per_iteration": 2.548020124435425 + }, + { + "auxiliary_loss_clip": 0.06477083, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.06301615, + "balance_loss_mlp": 0.01256342, + "epoch": 0.4153314294303322, + "flos": 21257740805760.0, + "grad_norm": 2.5393224991434398, + "language_loss": 0.77004838, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.84752274, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14013672, + "step": 6908, + "time_per_iteration": 2.52205753326416 + }, + { + "auxiliary_loss_clip": 0.0635362, + "auxiliary_loss_mlp": 0.01259834, + "balance_loss_clip": 0.06277395, + "balance_loss_mlp": 0.01256612, + "epoch": 0.41539155268300015, + "flos": 57939443527680.0, + "grad_norm": 0.769240592375345, + "language_loss": 0.64804208, + "learning_rate": 2.634013214657026e-06, + "loss": 0.72417659, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.03225708, + "step": 6909, + "time_per_iteration": 3.109095573425293 + }, + { + "auxiliary_loss_clip": 0.06469519, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4154516759356681, + "flos": 21909384408960.0, + "grad_norm": 1.4248669333769037, + "language_loss": 0.87550539, + "learning_rate": 2.633643828093996e-06, + "loss": 0.95291519, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13989258, + "step": 6910, + "time_per_iteration": 2.5253639221191406 + }, + { + "auxiliary_loss_clip": 0.06354217, + "auxiliary_loss_mlp": 0.01257534, + "balance_loss_clip": 0.0627715, + "balance_loss_mlp": 0.01254598, + "epoch": 0.4155117991883361, + "flos": 67852234702080.0, + "grad_norm": 0.8147918233574727, + "language_loss": 0.62098897, + "learning_rate": 2.633274417503128e-06, + "loss": 0.69710648, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02932739, + "step": 6911, + "time_per_iteration": 3.1515297889709473 + }, + { + "auxiliary_loss_clip": 0.06486405, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06302486, + "balance_loss_mlp": 0.01254393, + "epoch": 0.41557192244100405, + "flos": 14287869532800.0, + "grad_norm": 2.853367345352451, + "language_loss": 0.88092077, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.95848417, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15551758, + "step": 6912, + "time_per_iteration": 2.5334529876708984 + }, + { + "auxiliary_loss_clip": 0.06480967, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06303312, + "balance_loss_mlp": 0.01253451, + "epoch": 0.415632045693672, + "flos": 24468803372160.0, + "grad_norm": 2.9756004279328945, + "language_loss": 0.63331664, + "learning_rate": 2.632535524293914e-06, + "loss": 0.71080673, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14581299, + "step": 6913, + "time_per_iteration": 2.547567129135132 + }, + { + "auxiliary_loss_clip": 0.06471419, + "auxiliary_loss_mlp": 0.01270035, + "balance_loss_clip": 0.06297998, + "balance_loss_mlp": 0.01256249, + "epoch": 0.41569216894634, + "flos": 20120600246400.0, + "grad_norm": 1.832366261637427, + "language_loss": 0.75605875, + "learning_rate": 2.632166041703586e-06, + "loss": 0.83347332, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13787842, + "step": 6914, + "time_per_iteration": 2.5624208450317383 + }, + { + "auxiliary_loss_clip": 0.06479953, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.06302451, + "balance_loss_mlp": 0.01257897, + "epoch": 0.41575229219900794, + "flos": 23804497802880.0, + "grad_norm": 2.012818087979969, + "language_loss": 0.87586981, + "learning_rate": 2.631796535141458e-06, + "loss": 0.95340854, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16015625, + "step": 6915, + "time_per_iteration": 2.545825481414795 + }, + { + "auxiliary_loss_clip": 0.06478707, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06302266, + "balance_loss_mlp": 0.01259273, + "epoch": 0.4158124154516759, + "flos": 23114224667520.0, + "grad_norm": 2.419843437778294, + "language_loss": 0.71605122, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.79356909, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13818359, + "step": 6916, + "time_per_iteration": 2.59429669380188 + }, + { + "auxiliary_loss_clip": 0.06477056, + "auxiliary_loss_mlp": 0.01267217, + "balance_loss_clip": 0.06298968, + "balance_loss_mlp": 0.01252208, + "epoch": 0.41587253870434393, + "flos": 24249771999360.0, + "grad_norm": 1.4428572529082921, + "language_loss": 0.71931446, + "learning_rate": 2.631057450157852e-06, + "loss": 0.7967571, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.15002441, + "step": 6917, + "time_per_iteration": 2.56001877784729 + }, + { + "auxiliary_loss_clip": 0.06469631, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06294615, + "balance_loss_mlp": 0.01253089, + "epoch": 0.4159326619570119, + "flos": 23888926391040.0, + "grad_norm": 4.142003179261072, + "language_loss": 0.80924189, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.88661504, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14599609, + "step": 6918, + "time_per_iteration": 2.6182031631469727 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06299014, + "balance_loss_mlp": 0.01255, + "epoch": 0.41599278520967986, + "flos": 40636315221120.0, + "grad_norm": 1.446116397311604, + "language_loss": 0.70620072, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.78370392, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.1583252, + "step": 6919, + "time_per_iteration": 2.7974801063537598 + }, + { + "auxiliary_loss_clip": 0.06470604, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.06293205, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4160529084623478, + "flos": 18228757161600.0, + "grad_norm": 1.8139422387612383, + "language_loss": 0.81669927, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.89411485, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15258789, + "step": 6920, + "time_per_iteration": 2.652277708053589 + }, + { + "auxiliary_loss_clip": 0.06476951, + "auxiliary_loss_mlp": 0.01273828, + "balance_loss_clip": 0.06298292, + "balance_loss_mlp": 0.01258724, + "epoch": 0.4161130317150158, + "flos": 13666973178240.0, + "grad_norm": 2.775667367204969, + "language_loss": 0.65528631, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.73279405, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15100098, + "step": 6921, + "time_per_iteration": 2.543761968612671 + }, + { + "auxiliary_loss_clip": 0.0647813, + "auxiliary_loss_mlp": 0.01273522, + "balance_loss_clip": 0.06301805, + "balance_loss_mlp": 0.01258168, + "epoch": 0.41617315496768376, + "flos": 16183779540480.0, + "grad_norm": 2.038581093377189, + "language_loss": 0.80900288, + "learning_rate": 2.629209319173274e-06, + "loss": 0.88651937, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15368652, + "step": 6922, + "time_per_iteration": 2.5606656074523926 + }, + { + "auxiliary_loss_clip": 0.06480581, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.06301428, + "balance_loss_mlp": 0.01255163, + "epoch": 0.4162332782203517, + "flos": 26220467376000.0, + "grad_norm": 1.63600266107907, + "language_loss": 0.6809119, + "learning_rate": 2.628839621341247e-06, + "loss": 0.7584219, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15258789, + "step": 6923, + "time_per_iteration": 2.5789952278137207 + }, + { + "auxiliary_loss_clip": 0.06474873, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06299335, + "balance_loss_mlp": 0.01254152, + "epoch": 0.4162934014730197, + "flos": 28191540096000.0, + "grad_norm": 1.91165548300248, + "language_loss": 0.76249051, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.83993888, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15795898, + "step": 6924, + "time_per_iteration": 2.6209194660186768 + }, + { + "auxiliary_loss_clip": 0.06473987, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.06295989, + "balance_loss_mlp": 0.01257759, + "epoch": 0.41635352472568765, + "flos": 19871492457600.0, + "grad_norm": 1.5667233765254498, + "language_loss": 0.73101473, + "learning_rate": 2.62810015415423e-06, + "loss": 0.80847669, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14465332, + "step": 6925, + "time_per_iteration": 2.5133748054504395 + }, + { + "auxiliary_loss_clip": 0.0646892, + "auxiliary_loss_mlp": 0.01268263, + "balance_loss_clip": 0.06293461, + "balance_loss_mlp": 0.0125391, + "epoch": 0.4164136479783556, + "flos": 14939974333440.0, + "grad_norm": 2.1337011873068445, + "language_loss": 0.84242827, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.91980004, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14361572, + "step": 6926, + "time_per_iteration": 3.923924446105957 + }, + { + "auxiliary_loss_clip": 0.06465639, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06292935, + "balance_loss_mlp": 0.01254574, + "epoch": 0.4164737712310236, + "flos": 21763251688320.0, + "grad_norm": 1.56658623429888, + "language_loss": 0.86570489, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.94304395, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13696289, + "step": 6927, + "time_per_iteration": 3.9643561840057373 + }, + { + "auxiliary_loss_clip": 0.06468353, + "auxiliary_loss_mlp": 0.01275736, + "balance_loss_clip": 0.06293458, + "balance_loss_mlp": 0.01260287, + "epoch": 0.41653389448369155, + "flos": 20746318210560.0, + "grad_norm": 2.3770101780600976, + "language_loss": 0.72583216, + "learning_rate": 2.626990774776604e-06, + "loss": 0.80327296, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15447998, + "step": 6928, + "time_per_iteration": 2.5111186504364014 + }, + { + "auxiliary_loss_clip": 0.06468435, + "auxiliary_loss_mlp": 0.01272442, + "balance_loss_clip": 0.062929, + "balance_loss_mlp": 0.0125735, + "epoch": 0.4165940177363595, + "flos": 24979848624000.0, + "grad_norm": 1.9381497388164433, + "language_loss": 0.78399348, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.86140227, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15087891, + "step": 6929, + "time_per_iteration": 2.6066014766693115 + }, + { + "auxiliary_loss_clip": 0.0646543, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06291193, + "balance_loss_mlp": 0.01253842, + "epoch": 0.41665414098902753, + "flos": 20527957670400.0, + "grad_norm": 1.8432748306405895, + "language_loss": 0.71154583, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.78888059, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14190674, + "step": 6930, + "time_per_iteration": 2.5052478313446045 + }, + { + "auxiliary_loss_clip": 0.06468388, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06291626, + "balance_loss_mlp": 0.01255067, + "epoch": 0.4167142642416955, + "flos": 19689078119040.0, + "grad_norm": 1.7731266468983917, + "language_loss": 0.81487417, + "learning_rate": 2.625881181419007e-06, + "loss": 0.89225209, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14355469, + "step": 6931, + "time_per_iteration": 2.555651903152466 + }, + { + "auxiliary_loss_clip": 0.0646255, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.06289293, + "balance_loss_mlp": 0.01255233, + "epoch": 0.41677438749436346, + "flos": 23769641704320.0, + "grad_norm": 2.211036345176988, + "language_loss": 0.79310054, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.87043214, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15362549, + "step": 6932, + "time_per_iteration": 4.05314040184021 + }, + { + "auxiliary_loss_clip": 0.06464541, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06289106, + "balance_loss_mlp": 0.01254752, + "epoch": 0.41683451074703143, + "flos": 30418051587840.0, + "grad_norm": 2.244908394273299, + "language_loss": 0.82220912, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.89954913, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14727783, + "step": 6933, + "time_per_iteration": 2.715542793273926 + }, + { + "auxiliary_loss_clip": 0.06467043, + "auxiliary_loss_mlp": 0.01272262, + "balance_loss_clip": 0.06287256, + "balance_loss_mlp": 0.01257963, + "epoch": 0.4168946339996994, + "flos": 21513137650560.0, + "grad_norm": 1.8583396237684835, + "language_loss": 0.76938605, + "learning_rate": 2.624771374460121e-06, + "loss": 0.84677911, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14300537, + "step": 6934, + "time_per_iteration": 2.630192279815674 + }, + { + "auxiliary_loss_clip": 0.06469443, + "auxiliary_loss_mlp": 0.0126919, + "balance_loss_clip": 0.06293288, + "balance_loss_mlp": 0.01254586, + "epoch": 0.41695475725236736, + "flos": 17644310133120.0, + "grad_norm": 2.110423315639561, + "language_loss": 0.67164314, + "learning_rate": 2.624401391405668e-06, + "loss": 0.74902946, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14599609, + "step": 6935, + "time_per_iteration": 2.484464168548584 + }, + { + "auxiliary_loss_clip": 0.0646461, + "auxiliary_loss_mlp": 0.01269491, + "balance_loss_clip": 0.06289718, + "balance_loss_mlp": 0.01254458, + "epoch": 0.4170148805050353, + "flos": 15674285589120.0, + "grad_norm": 2.4566205528754033, + "language_loss": 0.7383365, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.81567752, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15039062, + "step": 6936, + "time_per_iteration": 3.9171254634857178 + }, + { + "auxiliary_loss_clip": 0.06457968, + "auxiliary_loss_mlp": 0.01275405, + "balance_loss_clip": 0.06285361, + "balance_loss_mlp": 0.01262184, + "epoch": 0.4170750037577033, + "flos": 15164623929600.0, + "grad_norm": 4.126334603160969, + "language_loss": 0.74596691, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.8233006, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.13226318, + "step": 6937, + "time_per_iteration": 2.5286996364593506 + }, + { + "auxiliary_loss_clip": 0.06462386, + "auxiliary_loss_mlp": 0.01273752, + "balance_loss_clip": 0.06289354, + "balance_loss_mlp": 0.01259727, + "epoch": 0.41713512701037125, + "flos": 28776029051520.0, + "grad_norm": 1.4497703642581674, + "language_loss": 0.84985441, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.92721575, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14031982, + "step": 6938, + "time_per_iteration": 2.594024419784546 + }, + { + "auxiliary_loss_clip": 0.06468149, + "auxiliary_loss_mlp": 0.01274736, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01259114, + "epoch": 0.4171952502630392, + "flos": 28264564529280.0, + "grad_norm": 1.8332960409763566, + "language_loss": 0.74288213, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.82031095, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15618896, + "step": 6939, + "time_per_iteration": 2.628620147705078 + }, + { + "auxiliary_loss_clip": 0.06462568, + "auxiliary_loss_mlp": 0.01269134, + "balance_loss_clip": 0.06289193, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4172553735157072, + "flos": 24578612547840.0, + "grad_norm": 1.6044361894616455, + "language_loss": 0.75275123, + "learning_rate": 2.622551121253579e-06, + "loss": 0.83006829, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14331055, + "step": 6940, + "time_per_iteration": 2.55566143989563 + }, + { + "auxiliary_loss_clip": 0.06464436, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.0628769, + "balance_loss_mlp": 0.01255338, + "epoch": 0.41731549676837515, + "flos": 27051967768320.0, + "grad_norm": 1.7023568307679129, + "language_loss": 0.71513987, + "learning_rate": 2.622180996345424e-06, + "loss": 0.79247934, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1416626, + "step": 6941, + "time_per_iteration": 2.628779649734497 + }, + { + "auxiliary_loss_clip": 0.06464395, + "auxiliary_loss_mlp": 0.0127035, + "balance_loss_clip": 0.06285797, + "balance_loss_mlp": 0.01255342, + "epoch": 0.4173756200210431, + "flos": 28400173562880.0, + "grad_norm": 3.007655990717308, + "language_loss": 0.73701853, + "learning_rate": 2.621810847844104e-06, + "loss": 0.81436592, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15008545, + "step": 6942, + "time_per_iteration": 2.579085350036621 + }, + { + "auxiliary_loss_clip": 0.06469673, + "auxiliary_loss_mlp": 0.01269256, + "balance_loss_clip": 0.06289446, + "balance_loss_mlp": 0.01254587, + "epoch": 0.41743574327371114, + "flos": 22526968527360.0, + "grad_norm": 2.366625341311562, + "language_loss": 0.73327738, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.81066668, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14672852, + "step": 6943, + "time_per_iteration": 2.5890767574310303 + }, + { + "auxiliary_loss_clip": 0.06466928, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06290001, + "balance_loss_mlp": 0.01252998, + "epoch": 0.4174958665263791, + "flos": 30120587193600.0, + "grad_norm": 2.3204117950268817, + "language_loss": 0.63901597, + "learning_rate": 2.621070480118111e-06, + "loss": 0.71635759, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14245605, + "step": 6944, + "time_per_iteration": 2.586949586868286 + }, + { + "auxiliary_loss_clip": 0.06466375, + "auxiliary_loss_mlp": 0.01271741, + "balance_loss_clip": 0.0628995, + "balance_loss_mlp": 0.0125684, + "epoch": 0.41755598977904707, + "flos": 25270227348480.0, + "grad_norm": 11.202050930016789, + "language_loss": 0.70295048, + "learning_rate": 2.620700260921513e-06, + "loss": 0.78033161, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14898682, + "step": 6945, + "time_per_iteration": 2.6323587894439697 + }, + { + "auxiliary_loss_clip": 0.06460019, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.06285217, + "balance_loss_mlp": 0.01255219, + "epoch": 0.41761611303171503, + "flos": 19834707715200.0, + "grad_norm": 1.6201275470111005, + "language_loss": 0.8079865, + "learning_rate": 2.620330018187899e-06, + "loss": 0.88529164, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.152771, + "step": 6946, + "time_per_iteration": 2.5303776264190674 + }, + { + "auxiliary_loss_clip": 0.064612, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06288694, + "balance_loss_mlp": 0.0125569, + "epoch": 0.417676236284383, + "flos": 15528655992960.0, + "grad_norm": 2.2948583781036027, + "language_loss": 0.77726543, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.85457456, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14038086, + "step": 6947, + "time_per_iteration": 2.5844216346740723 + }, + { + "auxiliary_loss_clip": 0.06465282, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06289726, + "balance_loss_mlp": 0.01252844, + "epoch": 0.41773635953705096, + "flos": 32532531770880.0, + "grad_norm": 1.6041388362904736, + "language_loss": 0.71914941, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.79648077, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15014648, + "step": 6948, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.06460577, + "auxiliary_loss_mlp": 0.01271252, + "balance_loss_clip": 0.06288102, + "balance_loss_mlp": 0.01256303, + "epoch": 0.4177964827897189, + "flos": 23447719117440.0, + "grad_norm": 1.868509756028272, + "language_loss": 0.76914591, + "learning_rate": 2.619219148905362e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14941406, + "step": 6949, + "time_per_iteration": 2.5791566371917725 + }, + { + "auxiliary_loss_clip": 0.06466889, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06288934, + "balance_loss_mlp": 0.01255476, + "epoch": 0.4178566060423869, + "flos": 22755768900480.0, + "grad_norm": 1.6605109484051197, + "language_loss": 0.81921285, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.89658785, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15148926, + "step": 6950, + "time_per_iteration": 2.550705909729004 + }, + { + "auxiliary_loss_clip": 0.06457172, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.062898, + "balance_loss_mlp": 0.01253319, + "epoch": 0.41791672929505486, + "flos": 26040233243520.0, + "grad_norm": 1.3162845057727355, + "language_loss": 0.76396811, + "learning_rate": 2.618478451956007e-06, + "loss": 0.84120584, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13275146, + "step": 6951, + "time_per_iteration": 2.6047768592834473 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271966, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01256988, + "epoch": 0.4179768525477228, + "flos": 19574028063360.0, + "grad_norm": 1.8780871701618023, + "language_loss": 0.72956991, + "learning_rate": 2.61810806829516e-06, + "loss": 0.80701125, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.14978027, + "step": 6952, + "time_per_iteration": 2.498915910720825 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01270698, + "balance_loss_clip": 0.06290505, + "balance_loss_mlp": 0.01256286, + "epoch": 0.4180369758003908, + "flos": 17789352750720.0, + "grad_norm": 3.5208466342014444, + "language_loss": 0.72192442, + "learning_rate": 2.617737661195593e-06, + "loss": 0.79930753, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14428711, + "step": 6953, + "time_per_iteration": 2.5105345249176025 + }, + { + "auxiliary_loss_clip": 0.06460451, + "auxiliary_loss_mlp": 0.01269376, + "balance_loss_clip": 0.0629045, + "balance_loss_mlp": 0.01255143, + "epoch": 0.41809709905305875, + "flos": 20967152446080.0, + "grad_norm": 1.9107321624636409, + "language_loss": 0.76574248, + "learning_rate": 2.617367230671353e-06, + "loss": 0.8430407, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14233398, + "step": 6954, + "time_per_iteration": 2.5424091815948486 + }, + { + "auxiliary_loss_clip": 0.06461184, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.06286837, + "balance_loss_mlp": 0.01255866, + "epoch": 0.4181572223057267, + "flos": 22024099048320.0, + "grad_norm": 2.2757291119189693, + "language_loss": 0.84719867, + "learning_rate": 2.616996776736485e-06, + "loss": 0.92452419, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15490723, + "step": 6955, + "time_per_iteration": 2.5423128604888916 + }, + { + "auxiliary_loss_clip": 0.06460696, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06289047, + "balance_loss_mlp": 0.01255001, + "epoch": 0.4182173455583947, + "flos": 26251969311360.0, + "grad_norm": 1.5480485879739414, + "language_loss": 0.83159053, + "learning_rate": 2.616626299405037e-06, + "loss": 0.90889192, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14453125, + "step": 6956, + "time_per_iteration": 2.5377910137176514 + }, + { + "auxiliary_loss_clip": 0.06470253, + "auxiliary_loss_mlp": 0.01272951, + "balance_loss_clip": 0.06292067, + "balance_loss_mlp": 0.01258163, + "epoch": 0.4182774688110627, + "flos": 14796566870400.0, + "grad_norm": 2.2161530875987205, + "language_loss": 0.72170293, + "learning_rate": 2.616255798691059e-06, + "loss": 0.79913497, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14801025, + "step": 6957, + "time_per_iteration": 2.5512890815734863 + }, + { + "auxiliary_loss_clip": 0.06465964, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01258745, + "epoch": 0.41833759206373067, + "flos": 20418190421760.0, + "grad_norm": 1.9534240722910163, + "language_loss": 0.75827634, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.83566499, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14147949, + "step": 6958, + "time_per_iteration": 2.5025634765625 + }, + { + "auxiliary_loss_clip": 0.06461923, + "auxiliary_loss_mlp": 0.01277567, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01262505, + "epoch": 0.41839771531639863, + "flos": 23662557786240.0, + "grad_norm": 1.62032760192947, + "language_loss": 0.77450699, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.85190189, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15075684, + "step": 6959, + "time_per_iteration": 2.5644967555999756 + }, + { + "auxiliary_loss_clip": 0.06462178, + "auxiliary_loss_mlp": 0.01275343, + "balance_loss_clip": 0.06288128, + "balance_loss_mlp": 0.01259423, + "epoch": 0.4184578385690666, + "flos": 19760006200320.0, + "grad_norm": 1.8483570445524284, + "language_loss": 0.77022827, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.84760344, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15924072, + "step": 6960, + "time_per_iteration": 2.5269885063171387 + }, + { + "auxiliary_loss_clip": 0.06453702, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01255552, + "epoch": 0.41851796182173456, + "flos": 20199578319360.0, + "grad_norm": 2.3993036704472717, + "language_loss": 0.75495946, + "learning_rate": 2.614773562290835e-06, + "loss": 0.83218956, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13769531, + "step": 6961, + "time_per_iteration": 2.571563243865967 + }, + { + "auxiliary_loss_clip": 0.06367883, + "auxiliary_loss_mlp": 0.0126221, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01259577, + "epoch": 0.41857808507440253, + "flos": 59038331898240.0, + "grad_norm": 0.8546546360875583, + "language_loss": 0.54730451, + "learning_rate": 2.61440294487496e-06, + "loss": 0.62360549, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02635193, + "step": 6962, + "time_per_iteration": 3.0928165912628174 + }, + { + "auxiliary_loss_clip": 0.06468143, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01256423, + "epoch": 0.4186382083270705, + "flos": 18484740984960.0, + "grad_norm": 2.146654503648622, + "language_loss": 0.8523612, + "learning_rate": 2.614032304160864e-06, + "loss": 0.92974788, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14093018, + "step": 6963, + "time_per_iteration": 2.4891340732574463 + }, + { + "auxiliary_loss_clip": 0.06465001, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 0.06290912, + "balance_loss_mlp": 0.01256453, + "epoch": 0.41869833157973846, + "flos": 21584988126720.0, + "grad_norm": 1.5636714712462336, + "language_loss": 0.70520425, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.78256667, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14788818, + "step": 6964, + "time_per_iteration": 2.6037514209747314 + }, + { + "auxiliary_loss_clip": 0.06460649, + "auxiliary_loss_mlp": 0.01270666, + "balance_loss_clip": 0.06289357, + "balance_loss_mlp": 0.01257034, + "epoch": 0.4187584548324064, + "flos": 35526156192000.0, + "grad_norm": 2.108688626905877, + "language_loss": 0.71782613, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.79513931, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1362915, + "step": 6965, + "time_per_iteration": 4.077980279922485 + }, + { + "auxiliary_loss_clip": 0.06453691, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06286767, + "balance_loss_mlp": 0.01257173, + "epoch": 0.4188185780850744, + "flos": 18660950121600.0, + "grad_norm": 1.7018758391145836, + "language_loss": 0.72080678, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.79804349, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.12792969, + "step": 6966, + "time_per_iteration": 2.5740551948547363 + }, + { + "auxiliary_loss_clip": 0.06466748, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06288405, + "balance_loss_mlp": 0.0125625, + "epoch": 0.41887870133774235, + "flos": 40342959676800.0, + "grad_norm": 4.506306240026155, + "language_loss": 0.71212667, + "learning_rate": 2.612549508603375e-06, + "loss": 0.78950995, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15338135, + "step": 6967, + "time_per_iteration": 4.179578065872192 + }, + { + "auxiliary_loss_clip": 0.0636977, + "auxiliary_loss_mlp": 0.01256477, + "balance_loss_clip": 0.06291805, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4189388245904103, + "flos": 61388083946880.0, + "grad_norm": 0.6570416522373307, + "language_loss": 0.45988834, + "learning_rate": 2.612178751609011e-06, + "loss": 0.53615081, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02500916, + "step": 6968, + "time_per_iteration": 3.1288843154907227 + }, + { + "auxiliary_loss_clip": 0.06467855, + "auxiliary_loss_mlp": 0.01273397, + "balance_loss_clip": 0.06290668, + "balance_loss_mlp": 0.01257685, + "epoch": 0.4189989478430783, + "flos": 28222371198720.0, + "grad_norm": 1.7081344299750898, + "language_loss": 0.75350499, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.8309176, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15710449, + "step": 6969, + "time_per_iteration": 2.5936050415039062 + }, + { + "auxiliary_loss_clip": 0.06460407, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06287546, + "balance_loss_mlp": 0.01258365, + "epoch": 0.4190590710957463, + "flos": 24571820367360.0, + "grad_norm": 1.8003201263588986, + "language_loss": 0.80904478, + "learning_rate": 2.611437167992705e-06, + "loss": 0.88637358, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14099121, + "step": 6970, + "time_per_iteration": 2.5366463661193848 + }, + { + "auxiliary_loss_clip": 0.06461529, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06291033, + "balance_loss_mlp": 0.01257594, + "epoch": 0.41911919434841427, + "flos": 21732504439680.0, + "grad_norm": 2.0427263912189098, + "language_loss": 0.83781362, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.91514409, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13922119, + "step": 6971, + "time_per_iteration": 4.038029909133911 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01277453, + "balance_loss_clip": 0.06292501, + "balance_loss_mlp": 0.01262766, + "epoch": 0.41917931760108224, + "flos": 17607064193280.0, + "grad_norm": 1.8913036217137231, + "language_loss": 0.74956995, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.82693458, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14685059, + "step": 6972, + "time_per_iteration": 2.5450055599212646 + }, + { + "auxiliary_loss_clip": 0.06463002, + "auxiliary_loss_mlp": 0.01269114, + "balance_loss_clip": 0.06289829, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4192394408537502, + "flos": 37825943679360.0, + "grad_norm": 1.6425528401757075, + "language_loss": 0.73133683, + "learning_rate": 2.610324618710212e-06, + "loss": 0.808658, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13684082, + "step": 6973, + "time_per_iteration": 2.6852450370788574 + }, + { + "auxiliary_loss_clip": 0.06474721, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06293075, + "balance_loss_mlp": 0.01257272, + "epoch": 0.41929956410641817, + "flos": 23113637688960.0, + "grad_norm": 1.8862458299453466, + "language_loss": 0.74830127, + "learning_rate": 2.609953722643489e-06, + "loss": 0.82576567, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14453125, + "step": 6974, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06460831, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0628831, + "balance_loss_mlp": 0.01252744, + "epoch": 0.41935968735908613, + "flos": 22530448471680.0, + "grad_norm": 1.902296645802657, + "language_loss": 0.73513019, + "learning_rate": 2.609582803447259e-06, + "loss": 0.81240016, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13421631, + "step": 6975, + "time_per_iteration": 2.4907052516937256 + }, + { + "auxiliary_loss_clip": 0.06461257, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06293045, + "balance_loss_mlp": 0.01256172, + "epoch": 0.4194198106117541, + "flos": 26877771129600.0, + "grad_norm": 1.432926445179704, + "language_loss": 0.80820251, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.8855176, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14086914, + "step": 6976, + "time_per_iteration": 4.015337705612183 + }, + { + "auxiliary_loss_clip": 0.06465544, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06291896, + "balance_loss_mlp": 0.01252174, + "epoch": 0.41947993386442206, + "flos": 19908696470400.0, + "grad_norm": 6.530638917868016, + "language_loss": 0.67613435, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.75344729, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13592529, + "step": 6977, + "time_per_iteration": 2.5907933712005615 + }, + { + "auxiliary_loss_clip": 0.06466645, + "auxiliary_loss_mlp": 0.012707, + "balance_loss_clip": 0.06291468, + "balance_loss_mlp": 0.01257104, + "epoch": 0.41954005711709, + "flos": 17389584120960.0, + "grad_norm": 2.431968733580352, + "language_loss": 0.8152501, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.89262354, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.13604736, + "step": 6978, + "time_per_iteration": 2.5534939765930176 + }, + { + "auxiliary_loss_clip": 0.06466036, + "auxiliary_loss_mlp": 0.01269917, + "balance_loss_clip": 0.06288658, + "balance_loss_mlp": 0.012561, + "epoch": 0.419600180369758, + "flos": 25009254207360.0, + "grad_norm": 1.7617066668945498, + "language_loss": 0.83044857, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.90780807, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.13824463, + "step": 6979, + "time_per_iteration": 2.5991194248199463 + }, + { + "auxiliary_loss_clip": 0.06464113, + "auxiliary_loss_mlp": 0.01266396, + "balance_loss_clip": 0.0629217, + "balance_loss_mlp": 0.01253313, + "epoch": 0.41966030362242596, + "flos": 17389458339840.0, + "grad_norm": 2.43413237172065, + "language_loss": 0.83727056, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.9145757, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13079834, + "step": 6980, + "time_per_iteration": 2.4868295192718506 + }, + { + "auxiliary_loss_clip": 0.06469644, + "auxiliary_loss_mlp": 0.01274217, + "balance_loss_clip": 0.06293017, + "balance_loss_mlp": 0.01260061, + "epoch": 0.4197204268750939, + "flos": 22161427090560.0, + "grad_norm": 2.953064628504675, + "language_loss": 0.79802233, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.87546098, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14172363, + "step": 6981, + "time_per_iteration": 2.572671890258789 + }, + { + "auxiliary_loss_clip": 0.06461273, + "auxiliary_loss_mlp": 0.01268979, + "balance_loss_clip": 0.06293882, + "balance_loss_mlp": 0.01256152, + "epoch": 0.4197805501277619, + "flos": 22089534687360.0, + "grad_norm": 1.8874441419731374, + "language_loss": 0.84437835, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.92168081, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.12823486, + "step": 6982, + "time_per_iteration": 2.515719413757324 + }, + { + "auxiliary_loss_clip": 0.06468281, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.0125844, + "epoch": 0.4198406733804299, + "flos": 26439372967680.0, + "grad_norm": 2.198770889515785, + "language_loss": 0.57229298, + "learning_rate": 2.606614618903214e-06, + "loss": 0.64970195, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1418457, + "step": 6983, + "time_per_iteration": 2.589905023574829 + }, + { + "auxiliary_loss_clip": 0.06459898, + "auxiliary_loss_mlp": 0.01268511, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01255922, + "epoch": 0.4199007966330979, + "flos": 12535870112640.0, + "grad_norm": 1.9546340544122036, + "language_loss": 0.82430601, + "learning_rate": 2.606243492174471e-06, + "loss": 0.90159011, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1260376, + "step": 6984, + "time_per_iteration": 2.4837801456451416 + }, + { + "auxiliary_loss_clip": 0.06465998, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.06293395, + "balance_loss_mlp": 0.01257698, + "epoch": 0.41996091988576584, + "flos": 21769498817280.0, + "grad_norm": 1.6572496297875159, + "language_loss": 0.79565531, + "learning_rate": 2.605872342456914e-06, + "loss": 0.87302184, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.12963867, + "step": 6985, + "time_per_iteration": 2.558382511138916 + }, + { + "auxiliary_loss_clip": 0.06471538, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06292171, + "balance_loss_mlp": 0.01254425, + "epoch": 0.4200210431384338, + "flos": 26549182143360.0, + "grad_norm": 1.7232010674189546, + "language_loss": 0.78413719, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.86154521, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14831543, + "step": 6986, + "time_per_iteration": 2.557201385498047 + }, + { + "auxiliary_loss_clip": 0.06457713, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06290729, + "balance_loss_mlp": 0.0125171, + "epoch": 0.42008116639110177, + "flos": 26802859979520.0, + "grad_norm": 1.5119871943534449, + "language_loss": 0.72772801, + "learning_rate": 2.605129974111655e-06, + "loss": 0.80494547, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.12310791, + "step": 6987, + "time_per_iteration": 2.590758800506592 + }, + { + "auxiliary_loss_clip": 0.06464639, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06291942, + "balance_loss_mlp": 0.01256994, + "epoch": 0.42014128964376973, + "flos": 32095433347200.0, + "grad_norm": 1.493413355723003, + "language_loss": 0.75077468, + "learning_rate": 2.604758755512104e-06, + "loss": 0.82812625, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13519287, + "step": 6988, + "time_per_iteration": 2.6159229278564453 + }, + { + "auxiliary_loss_clip": 0.064705, + "auxiliary_loss_mlp": 0.01272645, + "balance_loss_clip": 0.06293759, + "balance_loss_mlp": 0.01258256, + "epoch": 0.4202014128964377, + "flos": 26474061358080.0, + "grad_norm": 1.4960604967721163, + "language_loss": 0.7416907, + "learning_rate": 2.60438751398004e-06, + "loss": 0.81912208, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14385986, + "step": 6989, + "time_per_iteration": 2.6082265377044678 + }, + { + "auxiliary_loss_clip": 0.06467222, + "auxiliary_loss_mlp": 0.01268972, + "balance_loss_clip": 0.06291176, + "balance_loss_mlp": 0.0125413, + "epoch": 0.42026153614910566, + "flos": 13405287277440.0, + "grad_norm": 2.240751664581705, + "language_loss": 0.70939904, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.78676105, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14831543, + "step": 6990, + "time_per_iteration": 2.5301413536071777 + }, + { + "auxiliary_loss_clip": 0.06372039, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.06294142, + "balance_loss_mlp": 0.01259734, + "epoch": 0.42032165940177363, + "flos": 60268720452480.0, + "grad_norm": 0.7958876139316734, + "language_loss": 0.6024788, + "learning_rate": 2.603644962174685e-06, + "loss": 0.67882204, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02546692, + "step": 6991, + "time_per_iteration": 3.036398410797119 + }, + { + "auxiliary_loss_clip": 0.06468751, + "auxiliary_loss_mlp": 0.0127226, + "balance_loss_clip": 0.06294238, + "balance_loss_mlp": 0.01257251, + "epoch": 0.4203817826544416, + "flos": 24542121294720.0, + "grad_norm": 1.5524019758451273, + "language_loss": 0.83787376, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.91528386, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15014648, + "step": 6992, + "time_per_iteration": 2.5513317584991455 + }, + { + "auxiliary_loss_clip": 0.06374694, + "auxiliary_loss_mlp": 0.01259872, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.01257284, + "epoch": 0.42044190590710956, + "flos": 58837679297280.0, + "grad_norm": 0.7870388441722128, + "language_loss": 0.65295899, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.72930467, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.02589417, + "step": 6993, + "time_per_iteration": 3.139356851577759 + }, + { + "auxiliary_loss_clip": 0.06475414, + "auxiliary_loss_mlp": 0.01273103, + "balance_loss_clip": 0.06293732, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4205020291597775, + "flos": 16441733934720.0, + "grad_norm": 2.0884817814411307, + "language_loss": 0.83771634, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.91520149, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15576172, + "step": 6994, + "time_per_iteration": 2.5307908058166504 + }, + { + "auxiliary_loss_clip": 0.06461746, + "auxiliary_loss_mlp": 0.01269563, + "balance_loss_clip": 0.06292755, + "balance_loss_mlp": 0.01255544, + "epoch": 0.4205621524124455, + "flos": 18411548843520.0, + "grad_norm": 1.728991128313806, + "language_loss": 0.79243588, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.86974895, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14013672, + "step": 6995, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.06461824, + "auxiliary_loss_mlp": 0.0126885, + "balance_loss_clip": 0.06293637, + "balance_loss_mlp": 0.01255433, + "epoch": 0.4206222756651135, + "flos": 25527133566720.0, + "grad_norm": 1.491511685078805, + "language_loss": 0.80235636, + "learning_rate": 2.60178818232786e-06, + "loss": 0.87966311, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13409424, + "step": 6996, + "time_per_iteration": 2.6613996028900146 + }, + { + "auxiliary_loss_clip": 0.06466329, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06293097, + "balance_loss_mlp": 0.01254466, + "epoch": 0.4206823989177815, + "flos": 15309708474240.0, + "grad_norm": 2.3637588948298998, + "language_loss": 0.76051879, + "learning_rate": 2.601416757842559e-06, + "loss": 0.83786368, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13690186, + "step": 6997, + "time_per_iteration": 2.484876871109009 + }, + { + "auxiliary_loss_clip": 0.06463061, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06288689, + "balance_loss_mlp": 0.01253789, + "epoch": 0.42074252217044944, + "flos": 15558564700800.0, + "grad_norm": 2.0514206793414345, + "language_loss": 0.76478076, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.84209514, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14599609, + "step": 6998, + "time_per_iteration": 2.5640127658843994 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01275488, + "balance_loss_clip": 0.06289443, + "balance_loss_mlp": 0.01260587, + "epoch": 0.4208026454231174, + "flos": 26153941633920.0, + "grad_norm": 1.581279992496262, + "language_loss": 0.76102519, + "learning_rate": 2.60067384046869e-06, + "loss": 0.83844483, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14892578, + "step": 6999, + "time_per_iteration": 2.6406025886535645 + }, + { + "auxiliary_loss_clip": 0.06461642, + "auxiliary_loss_mlp": 0.01267644, + "balance_loss_clip": 0.06291209, + "balance_loss_mlp": 0.01254382, + "epoch": 0.42086276867578537, + "flos": 23556857460480.0, + "grad_norm": 1.988296138175356, + "language_loss": 0.64461291, + "learning_rate": 2.600302347608295e-06, + "loss": 0.72190583, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13244629, + "step": 7000, + "time_per_iteration": 2.6081695556640625 + }, + { + "auxiliary_loss_clip": 0.06469343, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06294516, + "balance_loss_mlp": 0.01256076, + "epoch": 0.42092289192845334, + "flos": 18119199548160.0, + "grad_norm": 1.6363851387704167, + "language_loss": 0.77022576, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.84762329, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14318848, + "step": 7001, + "time_per_iteration": 2.5761475563049316 + }, + { + "auxiliary_loss_clip": 0.06461353, + "auxiliary_loss_mlp": 0.01268364, + "balance_loss_clip": 0.06290751, + "balance_loss_mlp": 0.01254882, + "epoch": 0.4209830151811213, + "flos": 20012006954880.0, + "grad_norm": 1.5030484792833017, + "language_loss": 0.86740428, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.94470143, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13482666, + "step": 7002, + "time_per_iteration": 2.585397958755493 + }, + { + "auxiliary_loss_clip": 0.06461627, + "auxiliary_loss_mlp": 0.01271644, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01258251, + "epoch": 0.42104313843378927, + "flos": 21985050245760.0, + "grad_norm": 2.152971198745627, + "language_loss": 0.68539977, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.76273245, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.1338501, + "step": 7003, + "time_per_iteration": 2.5039963722229004 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01271214, + "balance_loss_clip": 0.06293743, + "balance_loss_mlp": 0.01255747, + "epoch": 0.42110326168645723, + "flos": 25450461480960.0, + "grad_norm": 1.8015075946869743, + "language_loss": 0.77306843, + "learning_rate": 2.598816148672344e-06, + "loss": 0.85047305, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15472412, + "step": 7004, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06462541, + "auxiliary_loss_mlp": 0.01273285, + "balance_loss_clip": 0.06294234, + "balance_loss_mlp": 0.0125873, + "epoch": 0.4211633849391252, + "flos": 17828485407360.0, + "grad_norm": 1.7810886301824922, + "language_loss": 0.68804276, + "learning_rate": 2.59844454213521e-06, + "loss": 0.76540101, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14562988, + "step": 7005, + "time_per_iteration": 3.888760566711426 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01269773, + "balance_loss_clip": 0.0629124, + "balance_loss_mlp": 0.01255593, + "epoch": 0.42122350819179316, + "flos": 16286796535680.0, + "grad_norm": 1.8605985429595449, + "language_loss": 0.72998816, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.80733699, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14178467, + "step": 7006, + "time_per_iteration": 3.991835832595825 + }, + { + "auxiliary_loss_clip": 0.06464688, + "auxiliary_loss_mlp": 0.01266849, + "balance_loss_clip": 0.06289375, + "balance_loss_mlp": 0.01252424, + "epoch": 0.4212836314444611, + "flos": 19651916033280.0, + "grad_norm": 1.623062925912009, + "language_loss": 0.7118417, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.78915709, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14416504, + "step": 7007, + "time_per_iteration": 2.5425753593444824 + }, + { + "auxiliary_loss_clip": 0.06463595, + "auxiliary_loss_mlp": 0.01271642, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01257581, + "epoch": 0.4213437546971291, + "flos": 18374889882240.0, + "grad_norm": 2.097779928402724, + "language_loss": 0.82573175, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.90308416, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 7008, + "time_per_iteration": 2.492260456085205 + }, + { + "auxiliary_loss_clip": 0.0646316, + "auxiliary_loss_mlp": 0.01269434, + "balance_loss_clip": 0.06289843, + "balance_loss_mlp": 0.01255129, + "epoch": 0.42140387794979706, + "flos": 27711116311680.0, + "grad_norm": 1.9580680041192111, + "language_loss": 0.72638381, + "learning_rate": 2.596957889196831e-06, + "loss": 0.80370975, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14318848, + "step": 7009, + "time_per_iteration": 2.6216533184051514 + }, + { + "auxiliary_loss_clip": 0.06466616, + "auxiliary_loss_mlp": 0.0126722, + "balance_loss_clip": 0.06289244, + "balance_loss_mlp": 0.01253338, + "epoch": 0.4214640012024651, + "flos": 28154545718400.0, + "grad_norm": 2.5692415195563543, + "language_loss": 0.66926241, + "learning_rate": 2.596586169335243e-06, + "loss": 0.74660075, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.13873291, + "step": 7010, + "time_per_iteration": 2.606501579284668 + }, + { + "auxiliary_loss_clip": 0.06462754, + "auxiliary_loss_mlp": 0.01271396, + "balance_loss_clip": 0.06290238, + "balance_loss_mlp": 0.01256662, + "epoch": 0.42152412445513304, + "flos": 23002989972480.0, + "grad_norm": 1.6839098151972378, + "language_loss": 0.7266804, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.80402195, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14727783, + "step": 7011, + "time_per_iteration": 4.0488903522491455 + }, + { + "auxiliary_loss_clip": 0.06363396, + "auxiliary_loss_mlp": 0.01255682, + "balance_loss_clip": 0.06285673, + "balance_loss_mlp": 0.01253149, + "epoch": 0.421584247707801, + "flos": 63767855756160.0, + "grad_norm": 0.7737758086067837, + "language_loss": 0.54255652, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.61874723, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.02532959, + "step": 7012, + "time_per_iteration": 3.0473456382751465 + }, + { + "auxiliary_loss_clip": 0.06465481, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06289969, + "balance_loss_mlp": 0.01256656, + "epoch": 0.421644370960469, + "flos": 24321203205120.0, + "grad_norm": 1.3531523641491952, + "language_loss": 0.78821653, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.86559272, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.15472412, + "step": 7013, + "time_per_iteration": 2.5436811447143555 + }, + { + "auxiliary_loss_clip": 0.06463543, + "auxiliary_loss_mlp": 0.0127162, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01256516, + "epoch": 0.42170449421313694, + "flos": 23447425628160.0, + "grad_norm": 1.8634561108800796, + "language_loss": 0.81284738, + "learning_rate": 2.595099063803787e-06, + "loss": 0.89019895, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15100098, + "step": 7014, + "time_per_iteration": 2.6464757919311523 + }, + { + "auxiliary_loss_clip": 0.06460524, + "auxiliary_loss_mlp": 0.01273083, + "balance_loss_clip": 0.06287747, + "balance_loss_mlp": 0.01259225, + "epoch": 0.4217646174658049, + "flos": 23702151640320.0, + "grad_norm": 1.4680948866945018, + "language_loss": 0.77888769, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.85622376, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1385498, + "step": 7015, + "time_per_iteration": 4.043898582458496 + }, + { + "auxiliary_loss_clip": 0.06464352, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06287283, + "balance_loss_mlp": 0.01253394, + "epoch": 0.42182474071847287, + "flos": 24978297323520.0, + "grad_norm": 1.853408702102599, + "language_loss": 0.82096922, + "learning_rate": 2.594355375584368e-06, + "loss": 0.89829755, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15075684, + "step": 7016, + "time_per_iteration": 2.5523900985717773 + }, + { + "auxiliary_loss_clip": 0.06465739, + "auxiliary_loss_mlp": 0.01271643, + "balance_loss_clip": 0.06291386, + "balance_loss_mlp": 0.01256527, + "epoch": 0.42188486397114083, + "flos": 22863230161920.0, + "grad_norm": 2.845700477826224, + "language_loss": 0.6853466, + "learning_rate": 2.593983497660586e-06, + "loss": 0.76272047, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15112305, + "step": 7017, + "time_per_iteration": 2.57027530670166 + }, + { + "auxiliary_loss_clip": 0.0636536, + "auxiliary_loss_mlp": 0.01255401, + "balance_loss_clip": 0.06287346, + "balance_loss_mlp": 0.01252595, + "epoch": 0.4219449872238088, + "flos": 66997072730880.0, + "grad_norm": 0.6666550742113542, + "language_loss": 0.59442866, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.67063624, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 7018, + "time_per_iteration": 3.1860194206237793 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01271161, + "balance_loss_clip": 0.0628873, + "balance_loss_mlp": 0.0125617, + "epoch": 0.42200511047647676, + "flos": 13120400995200.0, + "grad_norm": 1.8819765217055724, + "language_loss": 0.75926054, + "learning_rate": 2.593239674255382e-06, + "loss": 0.83665562, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14990234, + "step": 7019, + "time_per_iteration": 2.542468309402466 + }, + { + "auxiliary_loss_clip": 0.06462015, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.06287961, + "balance_loss_mlp": 0.01257864, + "epoch": 0.42206523372914473, + "flos": 13996400705280.0, + "grad_norm": 1.899626408213008, + "language_loss": 0.69618917, + "learning_rate": 2.592867728802166e-06, + "loss": 0.77354079, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15283203, + "step": 7020, + "time_per_iteration": 2.4884140491485596 + }, + { + "auxiliary_loss_clip": 0.06459437, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.06290746, + "balance_loss_mlp": 0.01258347, + "epoch": 0.4221253569818127, + "flos": 21948391284480.0, + "grad_norm": 1.6760812445081854, + "language_loss": 0.81457055, + "learning_rate": 2.592495760867347e-06, + "loss": 0.89188963, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14135742, + "step": 7021, + "time_per_iteration": 2.60335111618042 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06286098, + "balance_loss_mlp": 0.01253869, + "epoch": 0.42218548023448066, + "flos": 32200001642880.0, + "grad_norm": 1.5750279801473723, + "language_loss": 0.70101392, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.77830255, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14355469, + "step": 7022, + "time_per_iteration": 2.605795383453369 + }, + { + "auxiliary_loss_clip": 0.06450655, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06284072, + "balance_loss_mlp": 0.01258788, + "epoch": 0.4222456034871487, + "flos": 30127043957760.0, + "grad_norm": 1.5974321201389856, + "language_loss": 0.67428911, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.75152111, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13751221, + "step": 7023, + "time_per_iteration": 2.6615898609161377 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06287459, + "balance_loss_mlp": 0.01255508, + "epoch": 0.42230572673981664, + "flos": 22134537129600.0, + "grad_norm": 1.6408413231786074, + "language_loss": 0.69710904, + "learning_rate": 2.591379722314322e-06, + "loss": 0.77437586, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15356445, + "step": 7024, + "time_per_iteration": 2.531874895095825 + }, + { + "auxiliary_loss_clip": 0.06457987, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06283922, + "balance_loss_mlp": 0.01255598, + "epoch": 0.4223658499924846, + "flos": 22061722331520.0, + "grad_norm": 2.1972757713163102, + "language_loss": 0.76880538, + "learning_rate": 2.591007664594147e-06, + "loss": 0.84608328, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14196777, + "step": 7025, + "time_per_iteration": 2.568814754486084 + }, + { + "auxiliary_loss_clip": 0.06457998, + "auxiliary_loss_mlp": 0.01277209, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01263017, + "epoch": 0.4224259732451526, + "flos": 20416681048320.0, + "grad_norm": 1.910881237925828, + "language_loss": 0.80124468, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.87859672, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14208984, + "step": 7026, + "time_per_iteration": 2.4988901615142822 + }, + { + "auxiliary_loss_clip": 0.06353324, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01250106, + "epoch": 0.42248609649782054, + "flos": 62866307750400.0, + "grad_norm": 0.7325438580667073, + "language_loss": 0.62037623, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.69643718, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0266571, + "step": 7027, + "time_per_iteration": 3.230607748031616 + }, + { + "auxiliary_loss_clip": 0.06460012, + "auxiliary_loss_mlp": 0.01272089, + "balance_loss_clip": 0.06290331, + "balance_loss_mlp": 0.01257456, + "epoch": 0.4225462197504885, + "flos": 26257126337280.0, + "grad_norm": 2.572422824646089, + "language_loss": 0.71053827, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.78785932, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14642334, + "step": 7028, + "time_per_iteration": 2.5667781829833984 + }, + { + "auxiliary_loss_clip": 0.06463138, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06289553, + "balance_loss_mlp": 0.01255437, + "epoch": 0.42260634300315647, + "flos": 20528209232640.0, + "grad_norm": 1.948126664005559, + "language_loss": 0.82621461, + "learning_rate": 2.589519209743846e-06, + "loss": 0.90353954, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13928223, + "step": 7029, + "time_per_iteration": 2.5936038494110107 + }, + { + "auxiliary_loss_clip": 0.06468205, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06289516, + "balance_loss_mlp": 0.01258441, + "epoch": 0.42266646625582444, + "flos": 24323676900480.0, + "grad_norm": 1.8377333901506168, + "language_loss": 0.75193119, + "learning_rate": 2.589147040109424e-06, + "loss": 0.82935727, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15966797, + "step": 7030, + "time_per_iteration": 2.6162269115448 + }, + { + "auxiliary_loss_clip": 0.06462294, + "auxiliary_loss_mlp": 0.01267502, + "balance_loss_clip": 0.06287964, + "balance_loss_mlp": 0.01251421, + "epoch": 0.4227265895084924, + "flos": 24210555488640.0, + "grad_norm": 1.9734407814648771, + "language_loss": 0.86909479, + "learning_rate": 2.588774848134486e-06, + "loss": 0.94639277, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1607666, + "step": 7031, + "time_per_iteration": 2.5292763710021973 + }, + { + "auxiliary_loss_clip": 0.06460671, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06286174, + "balance_loss_mlp": 0.01255171, + "epoch": 0.42278671276116037, + "flos": 16915407465600.0, + "grad_norm": 1.893963671956315, + "language_loss": 0.73803562, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.81533462, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.140625, + "step": 7032, + "time_per_iteration": 2.5382707118988037 + }, + { + "auxiliary_loss_clip": 0.06463667, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.06286915, + "balance_loss_mlp": 0.01254874, + "epoch": 0.42284683601382833, + "flos": 25418162931840.0, + "grad_norm": 1.9439146678532522, + "language_loss": 0.70438349, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.78171825, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1496582, + "step": 7033, + "time_per_iteration": 2.5798444747924805 + }, + { + "auxiliary_loss_clip": 0.06464536, + "auxiliary_loss_mlp": 0.01270969, + "balance_loss_clip": 0.06288149, + "balance_loss_mlp": 0.01256282, + "epoch": 0.4229069592664963, + "flos": 23047153873920.0, + "grad_norm": 1.8861418032064503, + "language_loss": 0.90879869, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.98615378, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14685059, + "step": 7034, + "time_per_iteration": 2.5370678901672363 + }, + { + "auxiliary_loss_clip": 0.06455763, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06283915, + "balance_loss_mlp": 0.01256676, + "epoch": 0.42296708251916426, + "flos": 26074586217600.0, + "grad_norm": 1.9962240812191803, + "language_loss": 0.77578306, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.85304844, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14080811, + "step": 7035, + "time_per_iteration": 2.542121648788452 + }, + { + "auxiliary_loss_clip": 0.06464495, + "auxiliary_loss_mlp": 0.01274418, + "balance_loss_clip": 0.06287753, + "balance_loss_mlp": 0.01259863, + "epoch": 0.4230272057718323, + "flos": 19463548055040.0, + "grad_norm": 2.323654021784471, + "language_loss": 0.83016878, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.90755796, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14538574, + "step": 7036, + "time_per_iteration": 2.5446789264678955 + }, + { + "auxiliary_loss_clip": 0.06461224, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06292447, + "balance_loss_mlp": 0.01256859, + "epoch": 0.42308732902450025, + "flos": 22389975901440.0, + "grad_norm": 1.9007003646753964, + "language_loss": 0.70561719, + "learning_rate": 2.58654122792447e-06, + "loss": 0.78293824, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14031982, + "step": 7037, + "time_per_iteration": 2.5331337451934814 + }, + { + "auxiliary_loss_clip": 0.06462964, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06289166, + "balance_loss_mlp": 0.01253923, + "epoch": 0.4231474522771682, + "flos": 21001631201280.0, + "grad_norm": 1.6547666669933128, + "language_loss": 0.77886164, + "learning_rate": 2.586168879961155e-06, + "loss": 0.85618538, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1550293, + "step": 7038, + "time_per_iteration": 2.547067165374756 + }, + { + "auxiliary_loss_clip": 0.06470759, + "auxiliary_loss_mlp": 0.01270751, + "balance_loss_clip": 0.06292742, + "balance_loss_mlp": 0.01255432, + "epoch": 0.4232075755298362, + "flos": 14981161415040.0, + "grad_norm": 2.6561544689274714, + "language_loss": 0.67851424, + "learning_rate": 2.585796509770259e-06, + "loss": 0.75592935, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15301514, + "step": 7039, + "time_per_iteration": 2.5148706436157227 + }, + { + "auxiliary_loss_clip": 0.06471442, + "auxiliary_loss_mlp": 0.01274269, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.01258962, + "epoch": 0.42326769878250414, + "flos": 24539144474880.0, + "grad_norm": 1.5526791387199284, + "language_loss": 0.75859225, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.83604932, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15307617, + "step": 7040, + "time_per_iteration": 2.6170670986175537 + }, + { + "auxiliary_loss_clip": 0.0646336, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06288165, + "balance_loss_mlp": 0.01253199, + "epoch": 0.4233278220351721, + "flos": 26877603421440.0, + "grad_norm": 2.185572961013026, + "language_loss": 0.65619481, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.73350751, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14715576, + "step": 7041, + "time_per_iteration": 2.5701920986175537 + }, + { + "auxiliary_loss_clip": 0.06470653, + "auxiliary_loss_mlp": 0.01271372, + "balance_loss_clip": 0.06294046, + "balance_loss_mlp": 0.01256626, + "epoch": 0.4233879452878401, + "flos": 42824951867520.0, + "grad_norm": 2.182989579985364, + "language_loss": 0.73763824, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.81505847, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14752197, + "step": 7042, + "time_per_iteration": 2.7377729415893555 + }, + { + "auxiliary_loss_clip": 0.06466709, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06294659, + "balance_loss_mlp": 0.01256119, + "epoch": 0.42344806854050804, + "flos": 25236125936640.0, + "grad_norm": 1.357775127981886, + "language_loss": 0.82479644, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.90216863, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14379883, + "step": 7043, + "time_per_iteration": 2.6002635955810547 + }, + { + "auxiliary_loss_clip": 0.06466006, + "auxiliary_loss_mlp": 0.01268509, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01252749, + "epoch": 0.423508191793176, + "flos": 22784587505280.0, + "grad_norm": 2.981661405110402, + "language_loss": 0.65042412, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.1574707, + "step": 7044, + "time_per_iteration": 4.032661437988281 + }, + { + "auxiliary_loss_clip": 0.06473978, + "auxiliary_loss_mlp": 0.01277434, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01261793, + "epoch": 0.42356831504584397, + "flos": 34645376799360.0, + "grad_norm": 1.8091896069955142, + "language_loss": 0.74864423, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.82615834, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15649414, + "step": 7045, + "time_per_iteration": 2.6634554862976074 + }, + { + "auxiliary_loss_clip": 0.06458761, + "auxiliary_loss_mlp": 0.01272071, + "balance_loss_clip": 0.06289783, + "balance_loss_mlp": 0.01258177, + "epoch": 0.42362843829851193, + "flos": 17601487896960.0, + "grad_norm": 2.434331790625752, + "language_loss": 0.8101598, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.88746816, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13885498, + "step": 7046, + "time_per_iteration": 3.8471035957336426 + }, + { + "auxiliary_loss_clip": 0.06470428, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.01255635, + "epoch": 0.4236885615511799, + "flos": 22572390240000.0, + "grad_norm": 1.5654922866483163, + "language_loss": 0.77272886, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.8501339, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14416504, + "step": 7047, + "time_per_iteration": 2.5323123931884766 + }, + { + "auxiliary_loss_clip": 0.06461948, + "auxiliary_loss_mlp": 0.01271728, + "balance_loss_clip": 0.06291857, + "balance_loss_mlp": 0.01258245, + "epoch": 0.42374868480384786, + "flos": 26476493126400.0, + "grad_norm": 1.7230664508561655, + "language_loss": 0.68109751, + "learning_rate": 2.582444180141098e-06, + "loss": 0.75843424, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13482666, + "step": 7048, + "time_per_iteration": 2.5632970333099365 + }, + { + "auxiliary_loss_clip": 0.06464637, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06289657, + "balance_loss_mlp": 0.01253263, + "epoch": 0.4238088080565159, + "flos": 20375493966720.0, + "grad_norm": 1.6594147848364105, + "language_loss": 0.78005636, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.85738766, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.15234375, + "step": 7049, + "time_per_iteration": 2.5366568565368652 + }, + { + "auxiliary_loss_clip": 0.06468852, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06292627, + "balance_loss_mlp": 0.01256067, + "epoch": 0.42386893130918385, + "flos": 21177379140480.0, + "grad_norm": 1.886460992095426, + "language_loss": 0.83185136, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.90924776, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.1472168, + "step": 7050, + "time_per_iteration": 2.5130441188812256 + }, + { + "auxiliary_loss_clip": 0.06460265, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06286017, + "balance_loss_mlp": 0.01255738, + "epoch": 0.4239290545618518, + "flos": 17681346437760.0, + "grad_norm": 2.0965482043088968, + "language_loss": 0.73218369, + "learning_rate": 2.581326338868687e-06, + "loss": 0.80949646, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15283203, + "step": 7051, + "time_per_iteration": 3.92645263671875 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06291503, + "balance_loss_mlp": 0.01254595, + "epoch": 0.4239891778145198, + "flos": 24321077424000.0, + "grad_norm": 1.57175281695923, + "language_loss": 0.86744994, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.94478583, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.1427002, + "step": 7052, + "time_per_iteration": 2.584425210952759 + }, + { + "auxiliary_loss_clip": 0.06467065, + "auxiliary_loss_mlp": 0.01277353, + "balance_loss_clip": 0.06289236, + "balance_loss_mlp": 0.01262559, + "epoch": 0.42404930106718774, + "flos": 20564700485760.0, + "grad_norm": 1.3965954512003949, + "language_loss": 0.72571224, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.80315644, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14794922, + "step": 7053, + "time_per_iteration": 2.5454976558685303 + }, + { + "auxiliary_loss_clip": 0.06462884, + "auxiliary_loss_mlp": 0.01267759, + "balance_loss_clip": 0.06288673, + "balance_loss_mlp": 0.01253251, + "epoch": 0.4241094243198557, + "flos": 22314351991680.0, + "grad_norm": 1.5249079777591508, + "language_loss": 0.82902604, + "learning_rate": 2.580208299200704e-06, + "loss": 0.90633249, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14508057, + "step": 7054, + "time_per_iteration": 4.019419193267822 + }, + { + "auxiliary_loss_clip": 0.06381379, + "auxiliary_loss_mlp": 0.01253973, + "balance_loss_clip": 0.06300146, + "balance_loss_mlp": 0.01250773, + "epoch": 0.4241695475725237, + "flos": 70632445973760.0, + "grad_norm": 0.7904217901105888, + "language_loss": 0.60280955, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.6791631, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03204346, + "step": 7055, + "time_per_iteration": 3.152217388153076 + }, + { + "auxiliary_loss_clip": 0.06467455, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06290264, + "balance_loss_mlp": 0.01252717, + "epoch": 0.42422967082519164, + "flos": 14032640396160.0, + "grad_norm": 2.414100924234879, + "language_loss": 0.77460873, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.85195827, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.14782715, + "step": 7056, + "time_per_iteration": 2.469475746154785 + }, + { + "auxiliary_loss_clip": 0.06476917, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06295634, + "balance_loss_mlp": 0.01259013, + "epoch": 0.4242897940778596, + "flos": 22351975274880.0, + "grad_norm": 2.3823515442172187, + "language_loss": 0.84773225, + "learning_rate": 2.579090061518714e-06, + "loss": 0.92525554, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.1640625, + "step": 7057, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.06472223, + "auxiliary_loss_mlp": 0.01277699, + "balance_loss_clip": 0.06293373, + "balance_loss_mlp": 0.01262202, + "epoch": 0.42434991733052757, + "flos": 22601502334080.0, + "grad_norm": 3.5122040291641583, + "language_loss": 0.83485544, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.91235471, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15490723, + "step": 7058, + "time_per_iteration": 2.4998161792755127 + }, + { + "auxiliary_loss_clip": 0.06459209, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06288499, + "balance_loss_mlp": 0.01256205, + "epoch": 0.42441004058319554, + "flos": 20017667105280.0, + "grad_norm": 2.0122152391379498, + "language_loss": 0.80975556, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.88705409, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14440918, + "step": 7059, + "time_per_iteration": 2.581310987472534 + }, + { + "auxiliary_loss_clip": 0.06467164, + "auxiliary_loss_mlp": 0.0127411, + "balance_loss_clip": 0.06288522, + "balance_loss_mlp": 0.01258053, + "epoch": 0.4244701638358635, + "flos": 11149663691520.0, + "grad_norm": 2.3594129001130963, + "language_loss": 0.70608068, + "learning_rate": 2.57797162620435e-06, + "loss": 0.7834934, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.16064453, + "step": 7060, + "time_per_iteration": 2.485072612762451 + }, + { + "auxiliary_loss_clip": 0.06469266, + "auxiliary_loss_mlp": 0.01274664, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01260317, + "epoch": 0.42453028708853147, + "flos": 23994542862720.0, + "grad_norm": 1.485543893241047, + "language_loss": 0.76297516, + "learning_rate": 2.577598770580562e-06, + "loss": 0.84041446, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14324951, + "step": 7061, + "time_per_iteration": 2.594430685043335 + }, + { + "auxiliary_loss_clip": 0.06469865, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06291063, + "balance_loss_mlp": 0.01256643, + "epoch": 0.42459041034119943, + "flos": 18412345457280.0, + "grad_norm": 1.9822246970542112, + "language_loss": 0.72630441, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.80371881, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14935303, + "step": 7062, + "time_per_iteration": 2.64372181892395 + }, + { + "auxiliary_loss_clip": 0.06460352, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06284757, + "balance_loss_mlp": 0.01262215, + "epoch": 0.42465053359386745, + "flos": 20964049845120.0, + "grad_norm": 2.6818567528078923, + "language_loss": 0.66330427, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.74067968, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.1496582, + "step": 7063, + "time_per_iteration": 2.5413248538970947 + }, + { + "auxiliary_loss_clip": 0.06452604, + "auxiliary_loss_mlp": 0.01267624, + "balance_loss_clip": 0.062814, + "balance_loss_mlp": 0.01254195, + "epoch": 0.4247106568465354, + "flos": 33114001979520.0, + "grad_norm": 1.5147527354116395, + "language_loss": 0.78917265, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.86637491, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13446045, + "step": 7064, + "time_per_iteration": 2.610231876373291 + }, + { + "auxiliary_loss_clip": 0.06469544, + "auxiliary_loss_mlp": 0.01271013, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4247707800992034, + "flos": 20052984401280.0, + "grad_norm": 1.8682780470126852, + "language_loss": 0.75125778, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.82866335, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14733887, + "step": 7065, + "time_per_iteration": 2.583846092224121 + }, + { + "auxiliary_loss_clip": 0.06463289, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06289071, + "balance_loss_mlp": 0.01256971, + "epoch": 0.42483090335187135, + "flos": 22392114180480.0, + "grad_norm": 1.5143179334948575, + "language_loss": 0.72187293, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.79922605, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1505127, + "step": 7066, + "time_per_iteration": 2.5569074153900146 + }, + { + "auxiliary_loss_clip": 0.06467879, + "auxiliary_loss_mlp": 0.01269525, + "balance_loss_clip": 0.06290474, + "balance_loss_mlp": 0.01254231, + "epoch": 0.4248910266045393, + "flos": 21362518736640.0, + "grad_norm": 2.6158792173392484, + "language_loss": 0.79757857, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.87495261, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15289307, + "step": 7067, + "time_per_iteration": 2.5845797061920166 + }, + { + "auxiliary_loss_clip": 0.06384341, + "auxiliary_loss_mlp": 0.01254549, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01250746, + "epoch": 0.4249511498572073, + "flos": 64026942180480.0, + "grad_norm": 1.3506219442036578, + "language_loss": 0.63354319, + "learning_rate": 2.574988168733022e-06, + "loss": 0.70993209, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03796387, + "step": 7068, + "time_per_iteration": 3.082864284515381 + }, + { + "auxiliary_loss_clip": 0.06464778, + "auxiliary_loss_mlp": 0.0127101, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01255155, + "epoch": 0.42501127310987524, + "flos": 19612699522560.0, + "grad_norm": 2.0360912712095875, + "language_loss": 0.72778141, + "learning_rate": 2.574615138284361e-06, + "loss": 0.8051393, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15844727, + "step": 7069, + "time_per_iteration": 2.560899257659912 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.01271316, + "balance_loss_clip": 0.06289013, + "balance_loss_mlp": 0.01255378, + "epoch": 0.4250713963625432, + "flos": 19468160029440.0, + "grad_norm": 2.1627827730841074, + "language_loss": 0.79640651, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.87378043, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15930176, + "step": 7070, + "time_per_iteration": 2.507615327835083 + }, + { + "auxiliary_loss_clip": 0.06461551, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06285524, + "balance_loss_mlp": 0.01255117, + "epoch": 0.4251315196152112, + "flos": 25344719228160.0, + "grad_norm": 1.9437385428250697, + "language_loss": 0.70912981, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7864511, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15454102, + "step": 7071, + "time_per_iteration": 2.5730371475219727 + }, + { + "auxiliary_loss_clip": 0.06465049, + "auxiliary_loss_mlp": 0.01271451, + "balance_loss_clip": 0.06289509, + "balance_loss_mlp": 0.01256896, + "epoch": 0.42519164286787914, + "flos": 26366348534400.0, + "grad_norm": 2.618295142810269, + "language_loss": 0.71212989, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.78949487, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14544678, + "step": 7072, + "time_per_iteration": 2.5560264587402344 + }, + { + "auxiliary_loss_clip": 0.06469329, + "auxiliary_loss_mlp": 0.01270547, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01256182, + "epoch": 0.4252517661205471, + "flos": 26038220745600.0, + "grad_norm": 1.647981639391401, + "language_loss": 0.81448823, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.89188695, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14385986, + "step": 7073, + "time_per_iteration": 2.5955123901367188 + }, + { + "auxiliary_loss_clip": 0.06462769, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06288294, + "balance_loss_mlp": 0.01259204, + "epoch": 0.42531188937321507, + "flos": 12718536013440.0, + "grad_norm": 2.653395632366352, + "language_loss": 0.91860557, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.99596488, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1395874, + "step": 7074, + "time_per_iteration": 2.4894237518310547 + }, + { + "auxiliary_loss_clip": 0.06467288, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06287881, + "balance_loss_mlp": 0.0125827, + "epoch": 0.42537201262588303, + "flos": 22098339365760.0, + "grad_norm": 1.877755960639547, + "language_loss": 0.64814276, + "learning_rate": 2.572376498508805e-06, + "loss": 0.72554648, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14807129, + "step": 7075, + "time_per_iteration": 2.598754644393921 + }, + { + "auxiliary_loss_clip": 0.06455241, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06284718, + "balance_loss_mlp": 0.01255246, + "epoch": 0.42543213587855105, + "flos": 23009824080000.0, + "grad_norm": 2.0883967049140666, + "language_loss": 0.74251705, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.81976461, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1427002, + "step": 7076, + "time_per_iteration": 2.537986993789673 + }, + { + "auxiliary_loss_clip": 0.0646292, + "auxiliary_loss_mlp": 0.01270865, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256334, + "epoch": 0.425492259131219, + "flos": 25089448164480.0, + "grad_norm": 3.3689754116422335, + "language_loss": 0.79212517, + "learning_rate": 2.571630111462766e-06, + "loss": 0.86946297, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14520264, + "step": 7077, + "time_per_iteration": 2.6490280628204346 + }, + { + "auxiliary_loss_clip": 0.06455311, + "auxiliary_loss_mlp": 0.01267846, + "balance_loss_clip": 0.06287791, + "balance_loss_mlp": 0.01254721, + "epoch": 0.425552382383887, + "flos": 22822881621120.0, + "grad_norm": 1.7167135286528112, + "language_loss": 0.7317155, + "learning_rate": 2.571256885418265e-06, + "loss": 0.80894709, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13116455, + "step": 7078, + "time_per_iteration": 2.5729281902313232 + }, + { + "auxiliary_loss_clip": 0.06459501, + "auxiliary_loss_mlp": 0.01269381, + "balance_loss_clip": 0.06290293, + "balance_loss_mlp": 0.01256173, + "epoch": 0.42561250563655495, + "flos": 13558757230080.0, + "grad_norm": 1.6803598980459025, + "language_loss": 0.80183727, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.87912607, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13201904, + "step": 7079, + "time_per_iteration": 2.4937188625335693 + }, + { + "auxiliary_loss_clip": 0.06460771, + "auxiliary_loss_mlp": 0.0127097, + "balance_loss_clip": 0.06287594, + "balance_loss_mlp": 0.01257481, + "epoch": 0.4256726288892229, + "flos": 46989692478720.0, + "grad_norm": 1.4689183555154843, + "language_loss": 0.71987867, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.79719609, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13500977, + "step": 7080, + "time_per_iteration": 2.774247884750366 + }, + { + "auxiliary_loss_clip": 0.06462272, + "auxiliary_loss_mlp": 0.01269683, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01256505, + "epoch": 0.4257327521418909, + "flos": 23593181005440.0, + "grad_norm": 1.9610396393278133, + "language_loss": 0.80520535, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.88252497, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13165283, + "step": 7081, + "time_per_iteration": 2.53387451171875 + }, + { + "auxiliary_loss_clip": 0.06452817, + "auxiliary_loss_mlp": 0.01271536, + "balance_loss_clip": 0.06286353, + "balance_loss_mlp": 0.01257844, + "epoch": 0.42579287539455885, + "flos": 18996079726080.0, + "grad_norm": 1.496926936820616, + "language_loss": 0.81558168, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.89282513, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13702393, + "step": 7082, + "time_per_iteration": 2.50972580909729 + }, + { + "auxiliary_loss_clip": 0.06462308, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06289167, + "balance_loss_mlp": 0.0125745, + "epoch": 0.4258529986472268, + "flos": 25198921923840.0, + "grad_norm": 1.6583429285627758, + "language_loss": 0.70258069, + "learning_rate": 2.569390430547065e-06, + "loss": 0.77992082, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14251709, + "step": 7083, + "time_per_iteration": 2.543390989303589 + }, + { + "auxiliary_loss_clip": 0.06373302, + "auxiliary_loss_mlp": 0.01258345, + "balance_loss_clip": 0.06290752, + "balance_loss_mlp": 0.01254316, + "epoch": 0.4259131218998948, + "flos": 69990277881600.0, + "grad_norm": 0.8555028711944374, + "language_loss": 0.67011017, + "learning_rate": 2.569017074742173e-06, + "loss": 0.74642664, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.0402832, + "step": 7084, + "time_per_iteration": 4.592621803283691 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01259348, + "epoch": 0.42597324515256274, + "flos": 18010899745920.0, + "grad_norm": 6.078178213614668, + "language_loss": 0.78467649, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.86201096, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14135742, + "step": 7085, + "time_per_iteration": 4.053593635559082 + }, + { + "auxiliary_loss_clip": 0.0647409, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.0629435, + "balance_loss_mlp": 0.01262158, + "epoch": 0.4260333684052307, + "flos": 15164204659200.0, + "grad_norm": 2.149155774842141, + "language_loss": 0.7699095, + "learning_rate": 2.568270298414995e-06, + "loss": 0.84742153, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.1496582, + "step": 7086, + "time_per_iteration": 2.480053424835205 + }, + { + "auxiliary_loss_clip": 0.06458418, + "auxiliary_loss_mlp": 0.01275137, + "balance_loss_clip": 0.06286179, + "balance_loss_mlp": 0.01260129, + "epoch": 0.42609349165789867, + "flos": 14944628234880.0, + "grad_norm": 1.8417550415955477, + "language_loss": 0.80286872, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.88020432, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15026855, + "step": 7087, + "time_per_iteration": 2.5487940311431885 + }, + { + "auxiliary_loss_clip": 0.06464538, + "auxiliary_loss_mlp": 0.01271303, + "balance_loss_clip": 0.06291935, + "balance_loss_mlp": 0.01257183, + "epoch": 0.42615361491056664, + "flos": 23738642893440.0, + "grad_norm": 2.1069826106325213, + "language_loss": 0.66537511, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.7427336, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14111328, + "step": 7088, + "time_per_iteration": 2.5807759761810303 + }, + { + "auxiliary_loss_clip": 0.06470972, + "auxiliary_loss_mlp": 0.01274052, + "balance_loss_clip": 0.06293773, + "balance_loss_mlp": 0.01260402, + "epoch": 0.42621373816323466, + "flos": 24943399297920.0, + "grad_norm": 2.133950232933384, + "language_loss": 0.69013214, + "learning_rate": 2.56714997234313e-06, + "loss": 0.76758242, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.13665771, + "step": 7089, + "time_per_iteration": 2.5817432403564453 + }, + { + "auxiliary_loss_clip": 0.06463064, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.0125598, + "epoch": 0.4262738614159026, + "flos": 13558044470400.0, + "grad_norm": 4.212045379455766, + "language_loss": 0.74597216, + "learning_rate": 2.566776487287525e-06, + "loss": 0.82330406, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14141846, + "step": 7090, + "time_per_iteration": 3.9426205158233643 + }, + { + "auxiliary_loss_clip": 0.06464858, + "auxiliary_loss_mlp": 0.01272944, + "balance_loss_clip": 0.06287836, + "balance_loss_mlp": 0.01259211, + "epoch": 0.4263339846685706, + "flos": 29755926224640.0, + "grad_norm": 2.684790824023287, + "language_loss": 0.75386477, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.8312428, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13745117, + "step": 7091, + "time_per_iteration": 2.563892126083374 + }, + { + "auxiliary_loss_clip": 0.0645293, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01257278, + "epoch": 0.42639410792123855, + "flos": 16839406212480.0, + "grad_norm": 1.8445868770478253, + "language_loss": 0.82496071, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.90218395, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.12121582, + "step": 7092, + "time_per_iteration": 2.55583119392395 + }, + { + "auxiliary_loss_clip": 0.06467807, + "auxiliary_loss_mlp": 0.01271484, + "balance_loss_clip": 0.06288138, + "balance_loss_mlp": 0.01257567, + "epoch": 0.4264542311739065, + "flos": 28769991557760.0, + "grad_norm": 1.5226511822280566, + "language_loss": 0.73850381, + "learning_rate": 2.565655903224038e-06, + "loss": 0.81589675, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.13922119, + "step": 7093, + "time_per_iteration": 4.021864414215088 + }, + { + "auxiliary_loss_clip": 0.06460725, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06287876, + "balance_loss_mlp": 0.01254512, + "epoch": 0.4265143544265745, + "flos": 24719881731840.0, + "grad_norm": 2.2430846112789617, + "language_loss": 0.70883787, + "learning_rate": 2.565282332284532e-06, + "loss": 0.78613305, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14300537, + "step": 7094, + "time_per_iteration": 2.5826168060302734 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01268246, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.0125381, + "epoch": 0.42657447767924245, + "flos": 21871467636480.0, + "grad_norm": 1.4959257312535472, + "language_loss": 0.81979394, + "learning_rate": 2.564908739909464e-06, + "loss": 0.89709127, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14428711, + "step": 7095, + "time_per_iteration": 2.5714282989501953 + }, + { + "auxiliary_loss_clip": 0.06464021, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06287175, + "balance_loss_mlp": 0.01255831, + "epoch": 0.4266346009319104, + "flos": 21476604470400.0, + "grad_norm": 2.7630559086257533, + "language_loss": 0.80476701, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.88211161, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1461792, + "step": 7096, + "time_per_iteration": 2.52101731300354 + }, + { + "auxiliary_loss_clip": 0.06471846, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06290311, + "balance_loss_mlp": 0.01253946, + "epoch": 0.4266947241845784, + "flos": 25526295025920.0, + "grad_norm": 2.003429077322888, + "language_loss": 0.65857691, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.73597825, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14331055, + "step": 7097, + "time_per_iteration": 2.6010050773620605 + }, + { + "auxiliary_loss_clip": 0.0645384, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01259601, + "epoch": 0.42675484743724634, + "flos": 26548343602560.0, + "grad_norm": 1.7498935394273216, + "language_loss": 0.75170088, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.82896858, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13317871, + "step": 7098, + "time_per_iteration": 2.5674946308135986 + }, + { + "auxiliary_loss_clip": 0.06458846, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01260033, + "epoch": 0.4268149706899143, + "flos": 23119465547520.0, + "grad_norm": 1.6850998762786562, + "language_loss": 0.75184697, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.82917988, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 7099, + "time_per_iteration": 2.5784735679626465 + }, + { + "auxiliary_loss_clip": 0.06459826, + "auxiliary_loss_mlp": 0.01273278, + "balance_loss_clip": 0.06283994, + "balance_loss_mlp": 0.01259116, + "epoch": 0.4268750939425823, + "flos": 22712401612800.0, + "grad_norm": 2.0765509228592802, + "language_loss": 0.83059096, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.90792197, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14172363, + "step": 7100, + "time_per_iteration": 2.520923614501953 + }, + { + "auxiliary_loss_clip": 0.06459752, + "auxiliary_loss_mlp": 0.01269142, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01255839, + "epoch": 0.42693521719525024, + "flos": 25382007095040.0, + "grad_norm": 1.4351436052366604, + "language_loss": 0.82259512, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8998841, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.13311768, + "step": 7101, + "time_per_iteration": 2.595768451690674 + }, + { + "auxiliary_loss_clip": 0.06466523, + "auxiliary_loss_mlp": 0.01273606, + "balance_loss_clip": 0.06287891, + "balance_loss_mlp": 0.01259099, + "epoch": 0.42699534044791826, + "flos": 18156613196160.0, + "grad_norm": 2.266580923573967, + "language_loss": 0.72800845, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.80540979, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14501953, + "step": 7102, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.06457532, + "auxiliary_loss_mlp": 0.0127168, + "balance_loss_clip": 0.06287985, + "balance_loss_mlp": 0.01257935, + "epoch": 0.4270554637005862, + "flos": 13703422504320.0, + "grad_norm": 2.1781975733094936, + "language_loss": 0.83514953, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.91244167, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13751221, + "step": 7103, + "time_per_iteration": 2.506204128265381 + }, + { + "auxiliary_loss_clip": 0.06465043, + "auxiliary_loss_mlp": 0.01274672, + "balance_loss_clip": 0.0628773, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4271155869532542, + "flos": 17499351369600.0, + "grad_norm": 2.042502996026563, + "language_loss": 0.73773789, + "learning_rate": 2.561545446271294e-06, + "loss": 0.815135, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14916992, + "step": 7104, + "time_per_iteration": 2.5006070137023926 + }, + { + "auxiliary_loss_clip": 0.06459317, + "auxiliary_loss_mlp": 0.01274322, + "balance_loss_clip": 0.0628491, + "balance_loss_mlp": 0.01260494, + "epoch": 0.42717571020592215, + "flos": 32460471659520.0, + "grad_norm": 3.22189729136274, + "language_loss": 0.75052768, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.82786405, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13830566, + "step": 7105, + "time_per_iteration": 2.607759475708008 + }, + { + "auxiliary_loss_clip": 0.06461999, + "auxiliary_loss_mlp": 0.01274519, + "balance_loss_clip": 0.06286199, + "balance_loss_mlp": 0.01261168, + "epoch": 0.4272358334585901, + "flos": 16258606836480.0, + "grad_norm": 17.703344591331568, + "language_loss": 0.77349067, + "learning_rate": 2.560797813088819e-06, + "loss": 0.85085583, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.13354492, + "step": 7106, + "time_per_iteration": 2.4834203720092773 + }, + { + "auxiliary_loss_clip": 0.06461152, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.06287872, + "balance_loss_mlp": 0.01262499, + "epoch": 0.4272959567112581, + "flos": 24205817733120.0, + "grad_norm": 1.9445558892844073, + "language_loss": 0.8013317, + "learning_rate": 2.560423964592229e-06, + "loss": 0.87871039, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14233398, + "step": 7107, + "time_per_iteration": 2.5639657974243164 + }, + { + "auxiliary_loss_clip": 0.06454289, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.06283173, + "balance_loss_mlp": 0.01253424, + "epoch": 0.42735607996392605, + "flos": 27970747787520.0, + "grad_norm": 1.710799907332892, + "language_loss": 0.68469441, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.76191515, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14349365, + "step": 7108, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.06460684, + "auxiliary_loss_mlp": 0.01273244, + "balance_loss_clip": 0.06285615, + "balance_loss_mlp": 0.01258712, + "epoch": 0.427416203216594, + "flos": 20300582816640.0, + "grad_norm": 2.1700047707431342, + "language_loss": 0.72192961, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.79926884, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14520264, + "step": 7109, + "time_per_iteration": 2.5418453216552734 + }, + { + "auxiliary_loss_clip": 0.06462875, + "auxiliary_loss_mlp": 0.01279728, + "balance_loss_clip": 0.06288399, + "balance_loss_mlp": 0.01264159, + "epoch": 0.427476326469262, + "flos": 26951382541440.0, + "grad_norm": 2.7192306397859034, + "language_loss": 0.64651388, + "learning_rate": 2.559302291651174e-06, + "loss": 0.7239399, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15551758, + "step": 7110, + "time_per_iteration": 2.6708264350891113 + }, + { + "auxiliary_loss_clip": 0.06457267, + "auxiliary_loss_mlp": 0.01278945, + "balance_loss_clip": 0.06284395, + "balance_loss_mlp": 0.01264056, + "epoch": 0.42753644972192995, + "flos": 25709967175680.0, + "grad_norm": 2.127603657525877, + "language_loss": 0.76798368, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.84534585, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14880371, + "step": 7111, + "time_per_iteration": 2.678954601287842 + }, + { + "auxiliary_loss_clip": 0.0646024, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01255352, + "epoch": 0.4275965729745979, + "flos": 18772855649280.0, + "grad_norm": 1.9451066993795918, + "language_loss": 0.73479104, + "learning_rate": 2.558554403622845e-06, + "loss": 0.81209064, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.1439209, + "step": 7112, + "time_per_iteration": 2.4913687705993652 + }, + { + "auxiliary_loss_clip": 0.06453889, + "auxiliary_loss_mlp": 0.01274214, + "balance_loss_clip": 0.06283249, + "balance_loss_mlp": 0.01260248, + "epoch": 0.4276566962272659, + "flos": 23770438318080.0, + "grad_norm": 1.6965987454612683, + "language_loss": 0.71646041, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.79374146, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13964844, + "step": 7113, + "time_per_iteration": 2.567722797393799 + }, + { + "auxiliary_loss_clip": 0.06462316, + "auxiliary_loss_mlp": 0.01277106, + "balance_loss_clip": 0.06286302, + "balance_loss_mlp": 0.01262157, + "epoch": 0.42771681947993384, + "flos": 22499156171520.0, + "grad_norm": 1.507728091462329, + "language_loss": 0.61987239, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.69726658, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14953613, + "step": 7114, + "time_per_iteration": 2.5800352096557617 + }, + { + "auxiliary_loss_clip": 0.06466354, + "auxiliary_loss_mlp": 0.01281834, + "balance_loss_clip": 0.06284335, + "balance_loss_mlp": 0.01264895, + "epoch": 0.42777694273260186, + "flos": 25051489464960.0, + "grad_norm": 1.9424022728130763, + "language_loss": 0.64557558, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.72305751, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16943359, + "step": 7115, + "time_per_iteration": 2.625234603881836 + }, + { + "auxiliary_loss_clip": 0.06458592, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01257668, + "epoch": 0.4278370659852698, + "flos": 18667532666880.0, + "grad_norm": 1.4802584121928888, + "language_loss": 0.73841792, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.81572187, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14141846, + "step": 7116, + "time_per_iteration": 2.517512798309326 + }, + { + "auxiliary_loss_clip": 0.06453552, + "auxiliary_loss_mlp": 0.0127651, + "balance_loss_clip": 0.06284202, + "balance_loss_mlp": 0.01262461, + "epoch": 0.4278971892379378, + "flos": 27315666167040.0, + "grad_norm": 1.6819154869474044, + "language_loss": 0.69691694, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.77421755, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14044189, + "step": 7117, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06455907, + "auxiliary_loss_mlp": 0.01274379, + "balance_loss_clip": 0.06285148, + "balance_loss_mlp": 0.0126008, + "epoch": 0.42795731249060576, + "flos": 12892397235840.0, + "grad_norm": 2.190420439429125, + "language_loss": 0.69763142, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.77493429, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14306641, + "step": 7118, + "time_per_iteration": 2.480435609817505 + }, + { + "auxiliary_loss_clip": 0.06457028, + "auxiliary_loss_mlp": 0.01277321, + "balance_loss_clip": 0.06285428, + "balance_loss_mlp": 0.01262109, + "epoch": 0.4280174357432737, + "flos": 33409873146240.0, + "grad_norm": 2.392758427844577, + "language_loss": 0.74691743, + "learning_rate": 2.55593612908444e-06, + "loss": 0.82426095, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.15197754, + "step": 7119, + "time_per_iteration": 2.633418083190918 + }, + { + "auxiliary_loss_clip": 0.06453852, + "auxiliary_loss_mlp": 0.01276265, + "balance_loss_clip": 0.06282485, + "balance_loss_mlp": 0.0126134, + "epoch": 0.4280775589959417, + "flos": 18264871071360.0, + "grad_norm": 2.26485992413173, + "language_loss": 0.75017536, + "learning_rate": 2.555562005426573e-06, + "loss": 0.8274765, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14916992, + "step": 7120, + "time_per_iteration": 2.4857230186462402 + }, + { + "auxiliary_loss_clip": 0.06459665, + "auxiliary_loss_mlp": 0.01279872, + "balance_loss_clip": 0.062869, + "balance_loss_mlp": 0.01265883, + "epoch": 0.42813768224860965, + "flos": 21477820354560.0, + "grad_norm": 1.904077899556691, + "language_loss": 0.77223492, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.8496303, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13989258, + "step": 7121, + "time_per_iteration": 2.547011375427246 + }, + { + "auxiliary_loss_clip": 0.06450777, + "auxiliary_loss_mlp": 0.01281298, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01267255, + "epoch": 0.4281978055012776, + "flos": 15674704859520.0, + "grad_norm": 1.7733631777850345, + "language_loss": 0.85767531, + "learning_rate": 2.554813694924126e-06, + "loss": 0.93499613, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14056396, + "step": 7122, + "time_per_iteration": 2.488633155822754 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01275392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01261022, + "epoch": 0.4282579287539456, + "flos": 17717711909760.0, + "grad_norm": 2.3186837977879886, + "language_loss": 0.8157897, + "learning_rate": 2.554439508107921e-06, + "loss": 0.89309216, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14355469, + "step": 7123, + "time_per_iteration": 3.969069719314575 + }, + { + "auxiliary_loss_clip": 0.06453736, + "auxiliary_loss_mlp": 0.01276304, + "balance_loss_clip": 0.06284729, + "balance_loss_mlp": 0.01262034, + "epoch": 0.42831805200661355, + "flos": 19287171210240.0, + "grad_norm": 1.594767030772038, + "language_loss": 0.80927598, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.88657635, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14257812, + "step": 7124, + "time_per_iteration": 3.901512861251831 + }, + { + "auxiliary_loss_clip": 0.06454194, + "auxiliary_loss_mlp": 0.01273804, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4283781752592815, + "flos": 19798845367680.0, + "grad_norm": 1.7493536594312618, + "language_loss": 0.81056678, + "learning_rate": 2.553691071416498e-06, + "loss": 0.88784677, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.15484619, + "step": 7125, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06453275, + "auxiliary_loss_mlp": 0.0127252, + "balance_loss_clip": 0.06283629, + "balance_loss_mlp": 0.01259467, + "epoch": 0.4284382985119495, + "flos": 16513584410880.0, + "grad_norm": 2.012470201752393, + "language_loss": 0.75256401, + "learning_rate": 2.553316821569659e-06, + "loss": 0.829822, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13037109, + "step": 7126, + "time_per_iteration": 2.550835371017456 + }, + { + "auxiliary_loss_clip": 0.06454661, + "auxiliary_loss_mlp": 0.01269423, + "balance_loss_clip": 0.06280357, + "balance_loss_mlp": 0.01255518, + "epoch": 0.42849842176461744, + "flos": 23337406817280.0, + "grad_norm": 1.7018740006461155, + "language_loss": 0.81619167, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.8934325, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13916016, + "step": 7127, + "time_per_iteration": 2.512833833694458 + }, + { + "auxiliary_loss_clip": 0.06455937, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06282341, + "balance_loss_mlp": 0.01254659, + "epoch": 0.4285585450172854, + "flos": 17280110361600.0, + "grad_norm": 1.7733778395824964, + "language_loss": 0.76877725, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.84603173, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14837646, + "step": 7128, + "time_per_iteration": 2.54837703704834 + }, + { + "auxiliary_loss_clip": 0.06458156, + "auxiliary_loss_mlp": 0.01271641, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01255726, + "epoch": 0.42861866826995343, + "flos": 24286430960640.0, + "grad_norm": 1.8449893243882522, + "language_loss": 0.74647015, + "learning_rate": 2.552193946194937e-06, + "loss": 0.82376814, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15917969, + "step": 7129, + "time_per_iteration": 2.5513017177581787 + }, + { + "auxiliary_loss_clip": 0.06454159, + "auxiliary_loss_mlp": 0.0127295, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.01258949, + "epoch": 0.4286787915226214, + "flos": 24360042372480.0, + "grad_norm": 1.8999084688655365, + "language_loss": 0.7830866, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.86035764, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14007568, + "step": 7130, + "time_per_iteration": 3.9916892051696777 + }, + { + "auxiliary_loss_clip": 0.06456774, + "auxiliary_loss_mlp": 0.01278579, + "balance_loss_clip": 0.06282126, + "balance_loss_mlp": 0.01263618, + "epoch": 0.42873891477528936, + "flos": 15455338070400.0, + "grad_norm": 2.1626861971351263, + "language_loss": 0.73881406, + "learning_rate": 2.551445257891886e-06, + "loss": 0.81616759, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.1496582, + "step": 7131, + "time_per_iteration": 2.504786252975464 + }, + { + "auxiliary_loss_clip": 0.06455156, + "auxiliary_loss_mlp": 0.01273453, + "balance_loss_clip": 0.06282241, + "balance_loss_mlp": 0.01258183, + "epoch": 0.4287990380279573, + "flos": 17645358309120.0, + "grad_norm": 2.0546861067047533, + "language_loss": 0.77884281, + "learning_rate": 2.551070882366973e-06, + "loss": 0.85612893, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15270996, + "step": 7132, + "time_per_iteration": 2.5048811435699463 + }, + { + "auxiliary_loss_clip": 0.06456134, + "auxiliary_loss_mlp": 0.01270516, + "balance_loss_clip": 0.06281912, + "balance_loss_mlp": 0.01254542, + "epoch": 0.4288591612806253, + "flos": 27169701154560.0, + "grad_norm": 1.7726331897563596, + "language_loss": 0.78733218, + "learning_rate": 2.550696485945397e-06, + "loss": 0.86459869, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.1595459, + "step": 7133, + "time_per_iteration": 4.068531036376953 + }, + { + "auxiliary_loss_clip": 0.06450784, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06277733, + "balance_loss_mlp": 0.01254785, + "epoch": 0.42891928453329325, + "flos": 17168540250240.0, + "grad_norm": 1.7118267088696246, + "language_loss": 0.7483775, + "learning_rate": 2.550322068641355e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14068604, + "step": 7134, + "time_per_iteration": 2.504011631011963 + }, + { + "auxiliary_loss_clip": 0.06450233, + "auxiliary_loss_mlp": 0.01272762, + "balance_loss_clip": 0.06279828, + "balance_loss_mlp": 0.0125882, + "epoch": 0.4289794077859612, + "flos": 18192936741120.0, + "grad_norm": 1.9195667435408965, + "language_loss": 0.84458339, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.92181337, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13946533, + "step": 7135, + "time_per_iteration": 2.4924819469451904 + }, + { + "auxiliary_loss_clip": 0.06447092, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279005, + "balance_loss_mlp": 0.01253949, + "epoch": 0.4290395310386292, + "flos": 28264438748160.0, + "grad_norm": 2.116473983113214, + "language_loss": 0.754601, + "learning_rate": 2.549573171442666e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14099121, + "step": 7136, + "time_per_iteration": 2.579450845718384 + }, + { + "auxiliary_loss_clip": 0.06453092, + "auxiliary_loss_mlp": 0.01272367, + "balance_loss_clip": 0.06277236, + "balance_loss_mlp": 0.01257895, + "epoch": 0.42909965429129715, + "flos": 16221528604800.0, + "grad_norm": 1.8728665886520197, + "language_loss": 0.79211873, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.86937326, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14465332, + "step": 7137, + "time_per_iteration": 2.485880136489868 + }, + { + "auxiliary_loss_clip": 0.06452384, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06279657, + "balance_loss_mlp": 0.01257359, + "epoch": 0.4291597775439651, + "flos": 23119633255680.0, + "grad_norm": 1.8713356259191796, + "language_loss": 0.76152903, + "learning_rate": 2.548824190884499e-06, + "loss": 0.83877248, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14605713, + "step": 7138, + "time_per_iteration": 2.5630223751068115 + }, + { + "auxiliary_loss_clip": 0.06367285, + "auxiliary_loss_mlp": 0.01254388, + "balance_loss_clip": 0.06288805, + "balance_loss_mlp": 0.01250711, + "epoch": 0.4292199007966331, + "flos": 67565461703040.0, + "grad_norm": 0.7609122933706777, + "language_loss": 0.5608238, + "learning_rate": 2.548449669381113e-06, + "loss": 0.63704056, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03668213, + "step": 7139, + "time_per_iteration": 3.0345327854156494 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06282055, + "balance_loss_mlp": 0.01256861, + "epoch": 0.42928002404930105, + "flos": 23006008719360.0, + "grad_norm": 1.7405631209015646, + "language_loss": 0.81563902, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.89282477, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13049316, + "step": 7140, + "time_per_iteration": 2.5697882175445557 + }, + { + "auxiliary_loss_clip": 0.06455392, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.01252543, + "epoch": 0.429340147301969, + "flos": 11549432321280.0, + "grad_norm": 1.8011940744465647, + "language_loss": 0.82215559, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.89938176, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14678955, + "step": 7141, + "time_per_iteration": 2.4844813346862793 + }, + { + "auxiliary_loss_clip": 0.0646215, + "auxiliary_loss_mlp": 0.0128237, + "balance_loss_clip": 0.06283965, + "balance_loss_mlp": 0.01266336, + "epoch": 0.42940027055463703, + "flos": 25272030211200.0, + "grad_norm": 2.0081644747821947, + "language_loss": 0.86468136, + "learning_rate": 2.547325980144166e-06, + "loss": 0.94212657, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.16027832, + "step": 7142, + "time_per_iteration": 2.570967674255371 + }, + { + "auxiliary_loss_clip": 0.0645667, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06288485, + "balance_loss_mlp": 0.01255596, + "epoch": 0.429460393807305, + "flos": 23811709253760.0, + "grad_norm": 2.010483035293097, + "language_loss": 0.78394985, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.86120784, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13549805, + "step": 7143, + "time_per_iteration": 2.5245959758758545 + }, + { + "auxiliary_loss_clip": 0.06458203, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257689, + "epoch": 0.42952051705997296, + "flos": 13923502053120.0, + "grad_norm": 1.8646185905931467, + "language_loss": 0.77133417, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.84863412, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14117432, + "step": 7144, + "time_per_iteration": 2.5442261695861816 + }, + { + "auxiliary_loss_clip": 0.0645657, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06283006, + "balance_loss_mlp": 0.0125973, + "epoch": 0.4295806403126409, + "flos": 26767584610560.0, + "grad_norm": 1.5670382727140026, + "language_loss": 0.74293256, + "learning_rate": 2.54620210411532e-06, + "loss": 0.8202396, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14404297, + "step": 7145, + "time_per_iteration": 2.5812947750091553 + }, + { + "auxiliary_loss_clip": 0.06458145, + "auxiliary_loss_mlp": 0.01276391, + "balance_loss_clip": 0.06281675, + "balance_loss_mlp": 0.01261585, + "epoch": 0.4296407635653089, + "flos": 20957760789120.0, + "grad_norm": 2.084760622121642, + "language_loss": 0.79444236, + "learning_rate": 2.545827437329352e-06, + "loss": 0.87178773, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14807129, + "step": 7146, + "time_per_iteration": 2.5411908626556396 + }, + { + "auxiliary_loss_clip": 0.0645076, + "auxiliary_loss_mlp": 0.01276231, + "balance_loss_clip": 0.06280234, + "balance_loss_mlp": 0.01262373, + "epoch": 0.42970088681797686, + "flos": 15857915811840.0, + "grad_norm": 1.9977945232207481, + "language_loss": 0.83012491, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.90739477, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13867188, + "step": 7147, + "time_per_iteration": 2.4752652645111084 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01274227, + "balance_loss_clip": 0.06283284, + "balance_loss_mlp": 0.01258622, + "epoch": 0.4297610100706448, + "flos": 22389179287680.0, + "grad_norm": 1.9494252458685553, + "language_loss": 0.87818855, + "learning_rate": 2.545078041678131e-06, + "loss": 0.95549762, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15612793, + "step": 7148, + "time_per_iteration": 2.5504684448242188 + }, + { + "auxiliary_loss_clip": 0.06459592, + "auxiliary_loss_mlp": 0.0127006, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01255689, + "epoch": 0.4298211333233128, + "flos": 27932705233920.0, + "grad_norm": 1.7901480630114543, + "language_loss": 0.78474885, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.86204541, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14373779, + "step": 7149, + "time_per_iteration": 2.5467026233673096 + }, + { + "auxiliary_loss_clip": 0.06454438, + "auxiliary_loss_mlp": 0.01275691, + "balance_loss_clip": 0.06285315, + "balance_loss_mlp": 0.01261153, + "epoch": 0.42988125657598075, + "flos": 24432479827200.0, + "grad_norm": 1.6909372302648806, + "language_loss": 0.79794931, + "learning_rate": 2.544328563349256e-06, + "loss": 0.87525058, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14550781, + "step": 7150, + "time_per_iteration": 2.5642549991607666 + }, + { + "auxiliary_loss_clip": 0.06463797, + "auxiliary_loss_mlp": 0.01273266, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01256636, + "epoch": 0.4299413798286487, + "flos": 15855400189440.0, + "grad_norm": 1.6104667865383644, + "language_loss": 0.75438166, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16638184, + "step": 7151, + "time_per_iteration": 2.47206711769104 + }, + { + "auxiliary_loss_clip": 0.06463672, + "auxiliary_loss_mlp": 0.01271158, + "balance_loss_clip": 0.06284998, + "balance_loss_mlp": 0.01256179, + "epoch": 0.4300015030813167, + "flos": 22316029073280.0, + "grad_norm": 1.9504143763164294, + "language_loss": 0.70926738, + "learning_rate": 2.543579002456406e-06, + "loss": 0.78661567, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.14984131, + "step": 7152, + "time_per_iteration": 2.541208267211914 + }, + { + "auxiliary_loss_clip": 0.06452823, + "auxiliary_loss_mlp": 0.01271847, + "balance_loss_clip": 0.06279409, + "balance_loss_mlp": 0.01257482, + "epoch": 0.43006162633398465, + "flos": 34906391867520.0, + "grad_norm": 1.81395768481921, + "language_loss": 0.7223562, + "learning_rate": 2.54320419108402e-06, + "loss": 0.79960287, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14361572, + "step": 7153, + "time_per_iteration": 2.6242926120758057 + }, + { + "auxiliary_loss_clip": 0.064519, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06279962, + "balance_loss_mlp": 0.01257018, + "epoch": 0.4301217495866526, + "flos": 15967138008960.0, + "grad_norm": 2.006134184464422, + "language_loss": 0.78977376, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8670066, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14367676, + "step": 7154, + "time_per_iteration": 2.5568442344665527 + }, + { + "auxiliary_loss_clip": 0.06457433, + "auxiliary_loss_mlp": 0.01273105, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01258943, + "epoch": 0.43018187283932063, + "flos": 18776293666560.0, + "grad_norm": 1.5037130128548426, + "language_loss": 0.78947407, + "learning_rate": 2.542454506558389e-06, + "loss": 0.86677945, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14172363, + "step": 7155, + "time_per_iteration": 2.5090463161468506 + }, + { + "auxiliary_loss_clip": 0.06448177, + "auxiliary_loss_mlp": 0.01271989, + "balance_loss_clip": 0.06280203, + "balance_loss_mlp": 0.01258613, + "epoch": 0.4302419960919886, + "flos": 20157007645440.0, + "grad_norm": 4.525310176173048, + "language_loss": 0.89197671, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.96917844, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13397217, + "step": 7156, + "time_per_iteration": 2.5620951652526855 + }, + { + "auxiliary_loss_clip": 0.0645663, + "auxiliary_loss_mlp": 0.01274773, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01259836, + "epoch": 0.43030211934465656, + "flos": 26440001873280.0, + "grad_norm": 2.4796677358200423, + "language_loss": 0.82988536, + "learning_rate": 2.541704739753042e-06, + "loss": 0.90719938, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14929199, + "step": 7157, + "time_per_iteration": 2.5528175830841064 + }, + { + "auxiliary_loss_clip": 0.06457967, + "auxiliary_loss_mlp": 0.01275139, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01258974, + "epoch": 0.43036224259732453, + "flos": 24396114355200.0, + "grad_norm": 1.7333061296854189, + "language_loss": 0.71840358, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.79573464, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16162109, + "step": 7158, + "time_per_iteration": 2.540012836456299 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.01275077, + "balance_loss_clip": 0.06278417, + "balance_loss_mlp": 0.01260355, + "epoch": 0.4304223658499925, + "flos": 17207421344640.0, + "grad_norm": 2.0047997442662684, + "language_loss": 0.82936633, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.9066118, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14709473, + "step": 7159, + "time_per_iteration": 2.550978183746338 + }, + { + "auxiliary_loss_clip": 0.0645431, + "auxiliary_loss_mlp": 0.01270347, + "balance_loss_clip": 0.06281546, + "balance_loss_mlp": 0.01256048, + "epoch": 0.43048248910266046, + "flos": 14908304689920.0, + "grad_norm": 2.57539664943107, + "language_loss": 0.82999021, + "learning_rate": 2.54057993551933e-06, + "loss": 0.90723681, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.1428833, + "step": 7160, + "time_per_iteration": 2.525343894958496 + }, + { + "auxiliary_loss_clip": 0.0645951, + "auxiliary_loss_mlp": 0.01269507, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01252675, + "epoch": 0.4305426123553284, + "flos": 21586245937920.0, + "grad_norm": 3.3699216716451046, + "language_loss": 0.77364504, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.85093522, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16845703, + "step": 7161, + "time_per_iteration": 2.5307719707489014 + }, + { + "auxiliary_loss_clip": 0.06452791, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06280292, + "balance_loss_mlp": 0.01256449, + "epoch": 0.4306027356079964, + "flos": 22607833317120.0, + "grad_norm": 2.044056208596942, + "language_loss": 0.73045391, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.80768597, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13964844, + "step": 7162, + "time_per_iteration": 2.53442645072937 + }, + { + "auxiliary_loss_clip": 0.06358678, + "auxiliary_loss_mlp": 0.01256162, + "balance_loss_clip": 0.06279682, + "balance_loss_mlp": 0.01252738, + "epoch": 0.43066285886066435, + "flos": 70689873548160.0, + "grad_norm": 0.805422068373614, + "language_loss": 0.58694339, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.66309178, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.03433228, + "step": 7163, + "time_per_iteration": 4.420603036880493 + }, + { + "auxiliary_loss_clip": 0.06450315, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06279671, + "balance_loss_mlp": 0.01257298, + "epoch": 0.4307229821133323, + "flos": 26727236069760.0, + "grad_norm": 1.7043821860128514, + "language_loss": 0.79015797, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.86737275, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13842773, + "step": 7164, + "time_per_iteration": 4.077051162719727 + }, + { + "auxiliary_loss_clip": 0.0645581, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06279337, + "balance_loss_mlp": 0.01254222, + "epoch": 0.4307831053660003, + "flos": 26184311539200.0, + "grad_norm": 1.6263476545367235, + "language_loss": 0.68622434, + "learning_rate": 2.538704852009177e-06, + "loss": 0.76347512, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.1505127, + "step": 7165, + "time_per_iteration": 2.5447044372558594 + }, + { + "auxiliary_loss_clip": 0.06454252, + "auxiliary_loss_mlp": 0.01269461, + "balance_loss_clip": 0.06280573, + "balance_loss_mlp": 0.01254733, + "epoch": 0.43084322861866825, + "flos": 18915298790400.0, + "grad_norm": 2.036386887615401, + "language_loss": 0.75601453, + "learning_rate": 2.538329773967034e-06, + "loss": 0.83325171, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14758301, + "step": 7166, + "time_per_iteration": 2.5380423069000244 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06278174, + "balance_loss_mlp": 0.0125401, + "epoch": 0.4309033518713362, + "flos": 26440211508480.0, + "grad_norm": 1.6055464610704053, + "language_loss": 0.72472453, + "learning_rate": 2.537954675511372e-06, + "loss": 0.80187303, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13415527, + "step": 7167, + "time_per_iteration": 2.581911563873291 + }, + { + "auxiliary_loss_clip": 0.06445278, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01253398, + "epoch": 0.43096347512400424, + "flos": 21219362835840.0, + "grad_norm": 1.5535022771303773, + "language_loss": 0.78678393, + "learning_rate": 2.537579556656414e-06, + "loss": 0.86391199, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14135742, + "step": 7168, + "time_per_iteration": 2.5395426750183105 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.0127075, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01257095, + "epoch": 0.4310235983766722, + "flos": 16544918638080.0, + "grad_norm": 2.3704233546720936, + "language_loss": 0.82314277, + "learning_rate": 2.537204417416387e-06, + "loss": 0.90034759, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13647461, + "step": 7169, + "time_per_iteration": 3.8934504985809326 + }, + { + "auxiliary_loss_clip": 0.06353073, + "auxiliary_loss_mlp": 0.01255187, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01251897, + "epoch": 0.43108372162934017, + "flos": 64794893650560.0, + "grad_norm": 0.6586067859139012, + "language_loss": 0.60826671, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6843493, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03295898, + "step": 7170, + "time_per_iteration": 3.303295612335205 + }, + { + "auxiliary_loss_clip": 0.06446448, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01253841, + "epoch": 0.43114384488200813, + "flos": 13449241543680.0, + "grad_norm": 1.7965809828184895, + "language_loss": 0.76463991, + "learning_rate": 2.536454077838021e-06, + "loss": 0.84178072, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13787842, + "step": 7171, + "time_per_iteration": 2.4991650581359863 + }, + { + "auxiliary_loss_clip": 0.06446211, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01253592, + "epoch": 0.4312039681346761, + "flos": 26293911079680.0, + "grad_norm": 1.4736819236139371, + "language_loss": 0.77570975, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.8528471, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13934326, + "step": 7172, + "time_per_iteration": 2.540095567703247 + }, + { + "auxiliary_loss_clip": 0.06448045, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256449, + "epoch": 0.43126409138734406, + "flos": 20383040833920.0, + "grad_norm": 1.8735364024745536, + "language_loss": 0.76837397, + "learning_rate": 2.535703656890086e-06, + "loss": 0.84556675, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14776611, + "step": 7173, + "time_per_iteration": 3.998828887939453 + }, + { + "auxiliary_loss_clip": 0.06449778, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06280752, + "balance_loss_mlp": 0.0125529, + "epoch": 0.431324214640012, + "flos": 22128918906240.0, + "grad_norm": 1.4124937065278635, + "language_loss": 0.76940411, + "learning_rate": 2.5353284159381e-06, + "loss": 0.84659261, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13800049, + "step": 7174, + "time_per_iteration": 2.510742425918579 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 0.06275856, + "balance_loss_mlp": 0.01256477, + "epoch": 0.43138433789268, + "flos": 15236306697600.0, + "grad_norm": 1.9136821796322663, + "language_loss": 0.82178259, + "learning_rate": 2.534953154686407e-06, + "loss": 0.89898002, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.15185547, + "step": 7175, + "time_per_iteration": 2.5317423343658447 + }, + { + "auxiliary_loss_clip": 0.06456869, + "auxiliary_loss_mlp": 0.01274036, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01256935, + "epoch": 0.43144446114534796, + "flos": 18156151998720.0, + "grad_norm": 2.207412358761708, + "language_loss": 0.74869847, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.82600749, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.17095947, + "step": 7176, + "time_per_iteration": 2.4871389865875244 + }, + { + "auxiliary_loss_clip": 0.0645103, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01255565, + "epoch": 0.4315045843980159, + "flos": 22936506157440.0, + "grad_norm": 1.949576719813971, + "language_loss": 0.73992217, + "learning_rate": 2.534202571340819e-06, + "loss": 0.81713092, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14294434, + "step": 7177, + "time_per_iteration": 2.5317373275756836 + }, + { + "auxiliary_loss_clip": 0.06461225, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06277613, + "balance_loss_mlp": 0.01253667, + "epoch": 0.4315647076506839, + "flos": 22133321245440.0, + "grad_norm": 1.7707547745548928, + "language_loss": 0.81576592, + "learning_rate": 2.533827249275387e-06, + "loss": 0.89307833, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16357422, + "step": 7178, + "time_per_iteration": 2.5210797786712646 + }, + { + "auxiliary_loss_clip": 0.06445872, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01257962, + "epoch": 0.43162483090335185, + "flos": 26878567743360.0, + "grad_norm": 1.4959775860860902, + "language_loss": 0.84818423, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.92535609, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13360596, + "step": 7179, + "time_per_iteration": 2.6229355335235596 + }, + { + "auxiliary_loss_clip": 0.06446353, + "auxiliary_loss_mlp": 0.01270616, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01256096, + "epoch": 0.4316849541560198, + "flos": 13917464559360.0, + "grad_norm": 1.6356598233983888, + "language_loss": 0.75595218, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.83312184, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7180, + "time_per_iteration": 2.4882874488830566 + }, + { + "auxiliary_loss_clip": 0.06450133, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01251023, + "epoch": 0.4317450774086878, + "flos": 16440685758720.0, + "grad_norm": 1.8060434620212955, + "language_loss": 0.81820869, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.89537263, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15252686, + "step": 7181, + "time_per_iteration": 2.534747838973999 + }, + { + "auxiliary_loss_clip": 0.0644898, + "auxiliary_loss_mlp": 0.0127112, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256189, + "epoch": 0.4318052006613558, + "flos": 20560675489920.0, + "grad_norm": 1.632078496987146, + "language_loss": 0.88980561, + "learning_rate": 2.532325758728165e-06, + "loss": 0.96700662, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14923096, + "step": 7182, + "time_per_iteration": 2.493427038192749 + }, + { + "auxiliary_loss_clip": 0.06446697, + "auxiliary_loss_mlp": 0.01267064, + "balance_loss_clip": 0.06278539, + "balance_loss_mlp": 0.01254052, + "epoch": 0.43186532391402377, + "flos": 22826613127680.0, + "grad_norm": 1.9212724157627075, + "language_loss": 0.75858486, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.83572245, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13012695, + "step": 7183, + "time_per_iteration": 2.552116870880127 + }, + { + "auxiliary_loss_clip": 0.06451686, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06278371, + "balance_loss_mlp": 0.01253923, + "epoch": 0.43192544716669173, + "flos": 25563624819840.0, + "grad_norm": 1.5103875784905794, + "language_loss": 0.77652711, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.85371935, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13604736, + "step": 7184, + "time_per_iteration": 2.5299277305603027 + }, + { + "auxiliary_loss_clip": 0.06444119, + "auxiliary_loss_mlp": 0.01269203, + "balance_loss_clip": 0.06279948, + "balance_loss_mlp": 0.01255494, + "epoch": 0.4319855704193597, + "flos": 30962317783680.0, + "grad_norm": 1.4924548432613554, + "language_loss": 0.73502755, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.81216079, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.13720703, + "step": 7185, + "time_per_iteration": 2.5939247608184814 + }, + { + "auxiliary_loss_clip": 0.06455707, + "auxiliary_loss_mlp": 0.01271443, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.0125684, + "epoch": 0.43204569367202766, + "flos": 24244824608640.0, + "grad_norm": 2.4112385113933015, + "language_loss": 0.75683951, + "learning_rate": 2.530823945207421e-06, + "loss": 0.83411103, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14611816, + "step": 7186, + "time_per_iteration": 2.543679714202881 + }, + { + "auxiliary_loss_clip": 0.06451818, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06278853, + "balance_loss_mlp": 0.01259068, + "epoch": 0.43210581692469563, + "flos": 18413058216960.0, + "grad_norm": 2.2976206703160065, + "language_loss": 0.76516449, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.84241354, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14038086, + "step": 7187, + "time_per_iteration": 2.530064105987549 + }, + { + "auxiliary_loss_clip": 0.06368419, + "auxiliary_loss_mlp": 0.01252589, + "balance_loss_clip": 0.06291005, + "balance_loss_mlp": 0.01249776, + "epoch": 0.4321659401773636, + "flos": 49851718133760.0, + "grad_norm": 0.8382360401327144, + "language_loss": 0.68072379, + "learning_rate": 2.530072917616714e-06, + "loss": 0.75693387, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02812195, + "step": 7188, + "time_per_iteration": 3.1670610904693604 + }, + { + "auxiliary_loss_clip": 0.06446176, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43222606343003156, + "flos": 17134229203200.0, + "grad_norm": 1.9056972558163987, + "language_loss": 0.7844317, + "learning_rate": 2.529697373663614e-06, + "loss": 0.86159372, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13110352, + "step": 7189, + "time_per_iteration": 2.491743564605713 + }, + { + "auxiliary_loss_clip": 0.06457567, + "auxiliary_loss_mlp": 0.01270927, + "balance_loss_clip": 0.06278813, + "balance_loss_mlp": 0.01255906, + "epoch": 0.4322861866826995, + "flos": 22756984784640.0, + "grad_norm": 1.8601510823080152, + "language_loss": 0.72126836, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.79855329, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15020752, + "step": 7190, + "time_per_iteration": 2.5745973587036133 + }, + { + "auxiliary_loss_clip": 0.06452946, + "auxiliary_loss_mlp": 0.01274284, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.0125992, + "epoch": 0.4323463099353675, + "flos": 27899400435840.0, + "grad_norm": 1.5852812804273753, + "language_loss": 0.79949737, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.87676966, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14355469, + "step": 7191, + "time_per_iteration": 2.5719873905181885 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01255694, + "epoch": 0.43240643318803546, + "flos": 21620892401280.0, + "grad_norm": 3.0880415359088467, + "language_loss": 0.75279927, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.82998139, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14056396, + "step": 7192, + "time_per_iteration": 2.536587715148926 + }, + { + "auxiliary_loss_clip": 0.0644784, + "auxiliary_loss_mlp": 0.01276118, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01260883, + "epoch": 0.4324665564407034, + "flos": 17562774510720.0, + "grad_norm": 2.069328799544239, + "language_loss": 0.79199994, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.86923951, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15216064, + "step": 7193, + "time_per_iteration": 2.483978033065796 + }, + { + "auxiliary_loss_clip": 0.06449077, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.01263212, + "epoch": 0.4325266796933714, + "flos": 18407775409920.0, + "grad_norm": 2.329186427032778, + "language_loss": 0.76053572, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.83780271, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14404297, + "step": 7194, + "time_per_iteration": 2.5057263374328613 + }, + { + "auxiliary_loss_clip": 0.06451394, + "auxiliary_loss_mlp": 0.01275378, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01260847, + "epoch": 0.4325868029460394, + "flos": 22571342064000.0, + "grad_norm": 1.9582306658700896, + "language_loss": 0.60073519, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.67800295, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14532471, + "step": 7195, + "time_per_iteration": 2.5116991996765137 + }, + { + "auxiliary_loss_clip": 0.06458029, + "auxiliary_loss_mlp": 0.01275051, + "balance_loss_clip": 0.06281463, + "balance_loss_mlp": 0.01259989, + "epoch": 0.43264692619870737, + "flos": 14609834046720.0, + "grad_norm": 1.968403141706004, + "language_loss": 0.65685856, + "learning_rate": 2.527068004376515e-06, + "loss": 0.73418939, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1506958, + "step": 7196, + "time_per_iteration": 2.5037827491760254 + }, + { + "auxiliary_loss_clip": 0.06456476, + "auxiliary_loss_mlp": 0.01272338, + "balance_loss_clip": 0.06280259, + "balance_loss_mlp": 0.01257151, + "epoch": 0.43270704945137534, + "flos": 21507184010880.0, + "grad_norm": 2.17558250449299, + "language_loss": 0.72638965, + "learning_rate": 2.526692300132797e-06, + "loss": 0.8036778, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15197754, + "step": 7197, + "time_per_iteration": 2.4931299686431885 + }, + { + "auxiliary_loss_clip": 0.0645181, + "auxiliary_loss_mlp": 0.01280731, + "balance_loss_clip": 0.06284913, + "balance_loss_mlp": 0.01265627, + "epoch": 0.4327671727040433, + "flos": 25162975722240.0, + "grad_norm": 1.6800922175899422, + "language_loss": 0.72821289, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.8055383, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15100098, + "step": 7198, + "time_per_iteration": 2.574894428253174 + }, + { + "auxiliary_loss_clip": 0.06448364, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01254969, + "epoch": 0.43282729595671127, + "flos": 25454192987520.0, + "grad_norm": 1.3407856907116962, + "language_loss": 0.8128798, + "learning_rate": 2.525940831742934e-06, + "loss": 0.89005339, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14013672, + "step": 7199, + "time_per_iteration": 2.5314407348632812 + }, + { + "auxiliary_loss_clip": 0.06450363, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.01255918, + "epoch": 0.43288741920937923, + "flos": 24131661269760.0, + "grad_norm": 2.374744791798318, + "language_loss": 0.68757379, + "learning_rate": 2.525565067625286e-06, + "loss": 0.76477665, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14013672, + "step": 7200, + "time_per_iteration": 2.5569095611572266 + }, + { + "auxiliary_loss_clip": 0.06449814, + "auxiliary_loss_mlp": 0.01269719, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01254925, + "epoch": 0.4329475424620472, + "flos": 19210415270400.0, + "grad_norm": 1.7756006077325563, + "language_loss": 0.87039292, + "learning_rate": 2.525189283578157e-06, + "loss": 0.94758821, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14807129, + "step": 7201, + "time_per_iteration": 2.4946835041046143 + }, + { + "auxiliary_loss_clip": 0.06464264, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06283499, + "balance_loss_mlp": 0.01255016, + "epoch": 0.43300766571471516, + "flos": 22645037329920.0, + "grad_norm": 5.903168179153311, + "language_loss": 0.64564252, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.72300375, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.16845703, + "step": 7202, + "time_per_iteration": 2.5667803287506104 + }, + { + "auxiliary_loss_clip": 0.06448028, + "auxiliary_loss_mlp": 0.01268297, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4330677889673831, + "flos": 22126570992000.0, + "grad_norm": 2.072135817395126, + "language_loss": 0.8230809, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.90024418, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13470459, + "step": 7203, + "time_per_iteration": 5.375681161880493 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01268927, + "balance_loss_clip": 0.06282033, + "balance_loss_mlp": 0.01254169, + "epoch": 0.4331279122200511, + "flos": 23228184620160.0, + "grad_norm": 2.3968905297379024, + "language_loss": 0.81134045, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.88861251, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14764404, + "step": 7204, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06450962, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06281083, + "balance_loss_mlp": 0.0125691, + "epoch": 0.43318803547271906, + "flos": 18265625758080.0, + "grad_norm": 2.088854485199162, + "language_loss": 0.7413221, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.81853694, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.13604736, + "step": 7205, + "time_per_iteration": 2.519554376602173 + }, + { + "auxiliary_loss_clip": 0.0644919, + "auxiliary_loss_mlp": 0.01273515, + "balance_loss_clip": 0.06284859, + "balance_loss_mlp": 0.01259908, + "epoch": 0.433248158725387, + "flos": 27425936540160.0, + "grad_norm": 1.5872196628882773, + "language_loss": 0.75603741, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.83326447, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13598633, + "step": 7206, + "time_per_iteration": 2.5732641220092773 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01254728, + "epoch": 0.433308281978055, + "flos": 23224075770240.0, + "grad_norm": 1.828436296505125, + "language_loss": 0.78923273, + "learning_rate": 2.522934161574342e-06, + "loss": 0.86642796, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1496582, + "step": 7207, + "time_per_iteration": 2.6846628189086914 + }, + { + "auxiliary_loss_clip": 0.06456017, + "auxiliary_loss_mlp": 0.01270448, + "balance_loss_clip": 0.06279423, + "balance_loss_mlp": 0.0125513, + "epoch": 0.433368405230723, + "flos": 15857999665920.0, + "grad_norm": 2.196810095173743, + "language_loss": 0.81095958, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.8882243, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15307617, + "step": 7208, + "time_per_iteration": 2.4724419116973877 + }, + { + "auxiliary_loss_clip": 0.0645436, + "auxiliary_loss_mlp": 0.01269383, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.0125481, + "epoch": 0.433428528483391, + "flos": 19032109781760.0, + "grad_norm": 2.1243132825557107, + "language_loss": 0.71321076, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.79044819, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14581299, + "step": 7209, + "time_per_iteration": 3.9143481254577637 + }, + { + "auxiliary_loss_clip": 0.06450495, + "auxiliary_loss_mlp": 0.01271038, + "balance_loss_clip": 0.06281973, + "balance_loss_mlp": 0.01255517, + "epoch": 0.43348865173605894, + "flos": 24725290320000.0, + "grad_norm": 1.4388803928851785, + "language_loss": 0.8148647, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.89208007, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15515137, + "step": 7210, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06451392, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06281275, + "balance_loss_mlp": 0.01261045, + "epoch": 0.4335487749887269, + "flos": 22097165408640.0, + "grad_norm": 1.8576931130518815, + "language_loss": 0.82474005, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.90199542, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13110352, + "step": 7211, + "time_per_iteration": 2.491514205932617 + }, + { + "auxiliary_loss_clip": 0.06452142, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06280628, + "balance_loss_mlp": 0.01258362, + "epoch": 0.43360889824139487, + "flos": 22389556631040.0, + "grad_norm": 12.106558391415842, + "language_loss": 0.7536357, + "learning_rate": 2.521054347790029e-06, + "loss": 0.83087522, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13452148, + "step": 7212, + "time_per_iteration": 2.551093816757202 + }, + { + "auxiliary_loss_clip": 0.06452519, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06284005, + "balance_loss_mlp": 0.01259517, + "epoch": 0.43366902149406283, + "flos": 17533746270720.0, + "grad_norm": 1.8081714291238689, + "language_loss": 0.77247733, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.84972358, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1260376, + "step": 7213, + "time_per_iteration": 3.8823790550231934 + }, + { + "auxiliary_loss_clip": 0.06452443, + "auxiliary_loss_mlp": 0.01274704, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01261245, + "epoch": 0.4337291447467308, + "flos": 19028126712960.0, + "grad_norm": 1.4293111519880635, + "language_loss": 0.65090191, + "learning_rate": 2.520302283867471e-06, + "loss": 0.72817338, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13446045, + "step": 7214, + "time_per_iteration": 2.512341260910034 + }, + { + "auxiliary_loss_clip": 0.0644484, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01255319, + "epoch": 0.43378926799939876, + "flos": 27241216214400.0, + "grad_norm": 1.6847650033402397, + "language_loss": 0.7180531, + "learning_rate": 2.519926222304191e-06, + "loss": 0.79518223, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.12750244, + "step": 7215, + "time_per_iteration": 2.5413544178009033 + }, + { + "auxiliary_loss_clip": 0.06451561, + "auxiliary_loss_mlp": 0.01271937, + "balance_loss_clip": 0.06284516, + "balance_loss_mlp": 0.01258365, + "epoch": 0.43384939125206673, + "flos": 15966592957440.0, + "grad_norm": 1.7641597528508168, + "language_loss": 0.75291193, + "learning_rate": 2.519550141025255e-06, + "loss": 0.83014691, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13574219, + "step": 7216, + "time_per_iteration": 2.539677143096924 + }, + { + "auxiliary_loss_clip": 0.06459753, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06280532, + "balance_loss_mlp": 0.01254256, + "epoch": 0.4339095145047347, + "flos": 21798736692480.0, + "grad_norm": 2.367070732862923, + "language_loss": 0.7623983, + "learning_rate": 2.519174040044927e-06, + "loss": 0.8396852, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14685059, + "step": 7217, + "time_per_iteration": 2.491522789001465 + }, + { + "auxiliary_loss_clip": 0.06451164, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.0628095, + "balance_loss_mlp": 0.01254389, + "epoch": 0.43396963775740266, + "flos": 14215054734720.0, + "grad_norm": 2.758270274773255, + "language_loss": 0.74231893, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.81950986, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13531494, + "step": 7218, + "time_per_iteration": 2.5123910903930664 + }, + { + "auxiliary_loss_clip": 0.06450492, + "auxiliary_loss_mlp": 0.01270563, + "balance_loss_clip": 0.06277994, + "balance_loss_mlp": 0.01256443, + "epoch": 0.4340297610100706, + "flos": 19725150101760.0, + "grad_norm": 1.5975368135070402, + "language_loss": 0.69353253, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.77074307, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14117432, + "step": 7219, + "time_per_iteration": 2.502150297164917 + }, + { + "auxiliary_loss_clip": 0.06450121, + "auxiliary_loss_mlp": 0.01273865, + "balance_loss_clip": 0.06280973, + "balance_loss_mlp": 0.01259482, + "epoch": 0.4340898842627386, + "flos": 18959588472960.0, + "grad_norm": 2.696483499139917, + "language_loss": 0.77797616, + "learning_rate": 2.518045619038202e-06, + "loss": 0.85521603, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1439209, + "step": 7220, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.06449743, + "auxiliary_loss_mlp": 0.01270897, + "balance_loss_clip": 0.06280366, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4341500075154066, + "flos": 22024895662080.0, + "grad_norm": 2.140213938529436, + "language_loss": 0.69858402, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.77579045, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13562012, + "step": 7221, + "time_per_iteration": 2.556913137435913 + }, + { + "auxiliary_loss_clip": 0.06448823, + "auxiliary_loss_mlp": 0.01267968, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01254188, + "epoch": 0.4342101307680746, + "flos": 23588527104000.0, + "grad_norm": 1.6725579163220456, + "language_loss": 0.65062654, + "learning_rate": 2.51729324012157e-06, + "loss": 0.72779441, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13793945, + "step": 7222, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0644563, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06277044, + "balance_loss_mlp": 0.01254912, + "epoch": 0.43427025402074254, + "flos": 17973821514240.0, + "grad_norm": 2.158287657708821, + "language_loss": 0.73335516, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.81050307, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14257812, + "step": 7223, + "time_per_iteration": 2.5124166011810303 + }, + { + "auxiliary_loss_clip": 0.06448437, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06275682, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4343303772734105, + "flos": 26293575663360.0, + "grad_norm": 1.9810355285503365, + "language_loss": 0.94283241, + "learning_rate": 2.516540782741694e-06, + "loss": 1.02002597, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13458252, + "step": 7224, + "time_per_iteration": 2.5581512451171875 + }, + { + "auxiliary_loss_clip": 0.06445128, + "auxiliary_loss_mlp": 0.01270275, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01257383, + "epoch": 0.43439050052607847, + "flos": 26841279876480.0, + "grad_norm": 2.0217716161026624, + "language_loss": 0.61832798, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.69548196, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.12890625, + "step": 7225, + "time_per_iteration": 2.5797905921936035 + }, + { + "auxiliary_loss_clip": 0.06447432, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01255895, + "epoch": 0.43445062377874644, + "flos": 21404083161600.0, + "grad_norm": 2.452465231522654, + "language_loss": 0.77966076, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.8568306, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13653564, + "step": 7226, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01273195, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01260553, + "epoch": 0.4345107470314144, + "flos": 19908151418880.0, + "grad_norm": 1.6845072318289191, + "language_loss": 0.84942114, + "learning_rate": 2.515411949802964e-06, + "loss": 0.92659688, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12652588, + "step": 7227, + "time_per_iteration": 2.525317430496216 + }, + { + "auxiliary_loss_clip": 0.06449986, + "auxiliary_loss_mlp": 0.01270041, + "balance_loss_clip": 0.06281552, + "balance_loss_mlp": 0.0125601, + "epoch": 0.43457087028408237, + "flos": 26439876092160.0, + "grad_norm": 2.0880007397823714, + "language_loss": 0.77098775, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.84818804, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14025879, + "step": 7228, + "time_per_iteration": 2.5491206645965576 + }, + { + "auxiliary_loss_clip": 0.06447831, + "auxiliary_loss_mlp": 0.01269154, + "balance_loss_clip": 0.06281967, + "balance_loss_mlp": 0.0125486, + "epoch": 0.43463099353675033, + "flos": 31876947025920.0, + "grad_norm": 1.527689344505128, + "language_loss": 0.80533445, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.88250422, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14294434, + "step": 7229, + "time_per_iteration": 2.6139633655548096 + }, + { + "auxiliary_loss_clip": 0.06448658, + "auxiliary_loss_mlp": 0.01272316, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01258208, + "epoch": 0.4346911167894183, + "flos": 24578109423360.0, + "grad_norm": 1.897670481755329, + "language_loss": 0.8187139, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.89592373, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14117432, + "step": 7230, + "time_per_iteration": 2.535597085952759 + }, + { + "auxiliary_loss_clip": 0.06454149, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06280425, + "balance_loss_mlp": 0.01258849, + "epoch": 0.43475124004208626, + "flos": 17096102795520.0, + "grad_norm": 2.6326033188165012, + "language_loss": 0.77091682, + "learning_rate": 2.513906565661973e-06, + "loss": 0.84818828, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14135742, + "step": 7231, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.064488, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01262162, + "epoch": 0.4348113632947542, + "flos": 26111874084480.0, + "grad_norm": 2.1662461953899044, + "language_loss": 0.69288278, + "learning_rate": 2.513530170872575e-06, + "loss": 0.77011836, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1260376, + "step": 7232, + "time_per_iteration": 2.547469139099121 + }, + { + "auxiliary_loss_clip": 0.0645097, + "auxiliary_loss_mlp": 0.01271517, + "balance_loss_clip": 0.06279375, + "balance_loss_mlp": 0.01256431, + "epoch": 0.4348714865474222, + "flos": 34208446083840.0, + "grad_norm": 2.030594980717477, + "language_loss": 0.72046328, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.79768813, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15075684, + "step": 7233, + "time_per_iteration": 2.633953332901001 + }, + { + "auxiliary_loss_clip": 0.06453332, + "auxiliary_loss_mlp": 0.01271348, + "balance_loss_clip": 0.06279553, + "balance_loss_mlp": 0.01257466, + "epoch": 0.43493160980009016, + "flos": 31545045803520.0, + "grad_norm": 1.5667863682634524, + "language_loss": 0.75517476, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.83242154, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.13885498, + "step": 7234, + "time_per_iteration": 2.592467784881592 + }, + { + "auxiliary_loss_clip": 0.06464201, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286918, + "balance_loss_mlp": 0.01258003, + "epoch": 0.4349917330527582, + "flos": 24068238128640.0, + "grad_norm": 2.6345915143615284, + "language_loss": 0.5890404, + "learning_rate": 2.512400869722782e-06, + "loss": 0.6664077, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14520264, + "step": 7235, + "time_per_iteration": 2.5652947425842285 + }, + { + "auxiliary_loss_clip": 0.06449015, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01257754, + "epoch": 0.43505185630542614, + "flos": 30527315712000.0, + "grad_norm": 1.3439257210534017, + "language_loss": 0.77555895, + "learning_rate": 2.512024397126566e-06, + "loss": 0.85276687, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14019775, + "step": 7236, + "time_per_iteration": 2.600897789001465 + }, + { + "auxiliary_loss_clip": 0.06450135, + "auxiliary_loss_mlp": 0.01275561, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01260833, + "epoch": 0.4351119795580941, + "flos": 15739427738880.0, + "grad_norm": 1.5753739577535406, + "language_loss": 0.81058431, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.88784134, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.14733887, + "step": 7237, + "time_per_iteration": 2.515153169631958 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627768, + "balance_loss_mlp": 0.0125607, + "epoch": 0.4351721028107621, + "flos": 18737328718080.0, + "grad_norm": 1.5657016421471992, + "language_loss": 0.63616467, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.71332717, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14129639, + "step": 7238, + "time_per_iteration": 2.4845099449157715 + }, + { + "auxiliary_loss_clip": 0.06448185, + "auxiliary_loss_mlp": 0.01273501, + "balance_loss_clip": 0.06281941, + "balance_loss_mlp": 0.01260162, + "epoch": 0.43523222606343004, + "flos": 25233652241280.0, + "grad_norm": 1.9152472058436172, + "language_loss": 0.85898602, + "learning_rate": 2.510894862898928e-06, + "loss": 0.93620288, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 7239, + "time_per_iteration": 2.579202175140381 + }, + { + "auxiliary_loss_clip": 0.06452584, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06283215, + "balance_loss_mlp": 0.01253987, + "epoch": 0.435292349316098, + "flos": 22715504213760.0, + "grad_norm": 1.439066736410537, + "language_loss": 0.72456282, + "learning_rate": 2.510518312724309e-06, + "loss": 0.80176651, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13793945, + "step": 7240, + "time_per_iteration": 2.5192179679870605 + }, + { + "auxiliary_loss_clip": 0.06454788, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06282151, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43535247256876597, + "flos": 25783033536000.0, + "grad_norm": 2.0220617163145485, + "language_loss": 0.81900156, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.89625818, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1394043, + "step": 7241, + "time_per_iteration": 2.5792059898376465 + }, + { + "auxiliary_loss_clip": 0.06460294, + "auxiliary_loss_mlp": 0.01275581, + "balance_loss_clip": 0.0628238, + "balance_loss_mlp": 0.01260346, + "epoch": 0.43541259582143393, + "flos": 17533578562560.0, + "grad_norm": 2.581589278543144, + "language_loss": 0.79383838, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8711971, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15246582, + "step": 7242, + "time_per_iteration": 3.918156623840332 + }, + { + "auxiliary_loss_clip": 0.06452459, + "auxiliary_loss_mlp": 0.01271144, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01257405, + "epoch": 0.4354727190741019, + "flos": 15200612058240.0, + "grad_norm": 2.430343835688426, + "language_loss": 0.69088292, + "learning_rate": 2.509388546104138e-06, + "loss": 0.76811898, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.13745117, + "step": 7243, + "time_per_iteration": 3.900606632232666 + }, + { + "auxiliary_loss_clip": 0.06444837, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258655, + "epoch": 0.43553284232676986, + "flos": 16654015054080.0, + "grad_norm": 1.5901355562967736, + "language_loss": 0.81475091, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.89191759, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1317749, + "step": 7244, + "time_per_iteration": 2.581033229827881 + }, + { + "auxiliary_loss_clip": 0.06446069, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.01255596, + "epoch": 0.43559296557943783, + "flos": 23407035160320.0, + "grad_norm": 1.5978807757182665, + "language_loss": 0.73241115, + "learning_rate": 2.508635271753234e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.12554932, + "step": 7245, + "time_per_iteration": 2.5589826107025146 + }, + { + "auxiliary_loss_clip": 0.06452223, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06282671, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4356530888321058, + "flos": 22425628613760.0, + "grad_norm": 1.6720109050482812, + "language_loss": 0.77539527, + "learning_rate": 2.508258605639389e-06, + "loss": 0.85261637, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7246, + "time_per_iteration": 2.593538999557495 + }, + { + "auxiliary_loss_clip": 0.06448724, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06280839, + "balance_loss_mlp": 0.01254033, + "epoch": 0.43571321208477376, + "flos": 21622527555840.0, + "grad_norm": 3.3071750834647426, + "language_loss": 0.86156344, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.93872631, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13531494, + "step": 7247, + "time_per_iteration": 2.5369882583618164 + }, + { + "auxiliary_loss_clip": 0.06446265, + "auxiliary_loss_mlp": 0.01269788, + "balance_loss_clip": 0.06277846, + "balance_loss_mlp": 0.01257194, + "epoch": 0.4357733353374418, + "flos": 23994081665280.0, + "grad_norm": 1.7467086672612386, + "language_loss": 0.73132598, + "learning_rate": 2.507505215606333e-06, + "loss": 0.80848658, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.12597656, + "step": 7248, + "time_per_iteration": 3.9830687046051025 + }, + { + "auxiliary_loss_clip": 0.06447548, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06279291, + "balance_loss_mlp": 0.01254022, + "epoch": 0.43583345859010975, + "flos": 25271736721920.0, + "grad_norm": 1.509350817375945, + "language_loss": 0.87227005, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.94941938, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13378906, + "step": 7249, + "time_per_iteration": 2.565516948699951 + }, + { + "auxiliary_loss_clip": 0.06451611, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01254223, + "epoch": 0.4358935818427777, + "flos": 23703115962240.0, + "grad_norm": 1.8925784396827436, + "language_loss": 0.8199448, + "learning_rate": 2.506751748594683e-06, + "loss": 0.89714003, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13690186, + "step": 7250, + "time_per_iteration": 2.5410354137420654 + }, + { + "auxiliary_loss_clip": 0.06454265, + "auxiliary_loss_mlp": 0.01273165, + "balance_loss_clip": 0.06283678, + "balance_loss_mlp": 0.01258901, + "epoch": 0.4359537050954457, + "flos": 29540416723200.0, + "grad_norm": 2.0613712873147723, + "language_loss": 0.85409963, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.93137395, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14251709, + "step": 7251, + "time_per_iteration": 2.5893919467926025 + }, + { + "auxiliary_loss_clip": 0.06448197, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06280132, + "balance_loss_mlp": 0.01257431, + "epoch": 0.43601382834811364, + "flos": 22717935982080.0, + "grad_norm": 1.9454057009257966, + "language_loss": 0.69792974, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.77511865, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13262939, + "step": 7252, + "time_per_iteration": 2.518423080444336 + }, + { + "auxiliary_loss_clip": 0.06442783, + "auxiliary_loss_mlp": 0.01269502, + "balance_loss_clip": 0.06278728, + "balance_loss_mlp": 0.01256336, + "epoch": 0.4360739516007816, + "flos": 19104714944640.0, + "grad_norm": 1.67696041016681, + "language_loss": 0.83826983, + "learning_rate": 2.505621403992348e-06, + "loss": 0.91539264, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13146973, + "step": 7253, + "time_per_iteration": 3.929287910461426 + }, + { + "auxiliary_loss_clip": 0.06446494, + "auxiliary_loss_mlp": 0.01271781, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01257095, + "epoch": 0.43613407485344957, + "flos": 23411185937280.0, + "grad_norm": 1.865330471105, + "language_loss": 0.7061553, + "learning_rate": 2.505244584092757e-06, + "loss": 0.78333807, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14678955, + "step": 7254, + "time_per_iteration": 2.5348615646362305 + }, + { + "auxiliary_loss_clip": 0.06446688, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.0628084, + "balance_loss_mlp": 0.01257249, + "epoch": 0.43619419810611754, + "flos": 22644366497280.0, + "grad_norm": 1.8869772682878516, + "language_loss": 0.81010306, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.88727921, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13671875, + "step": 7255, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.06450298, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06279971, + "balance_loss_mlp": 0.01254772, + "epoch": 0.4362543213587855, + "flos": 20054200285440.0, + "grad_norm": 1.8086691858124306, + "language_loss": 0.78106731, + "learning_rate": 2.504490886831089e-06, + "loss": 0.85824955, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13165283, + "step": 7256, + "time_per_iteration": 2.5364508628845215 + }, + { + "auxiliary_loss_clip": 0.06446915, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01256122, + "epoch": 0.43631444461145347, + "flos": 21367759616640.0, + "grad_norm": 1.5279282177598472, + "language_loss": 0.75952047, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.83668512, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13452148, + "step": 7257, + "time_per_iteration": 2.5156846046447754 + }, + { + "auxiliary_loss_clip": 0.06452259, + "auxiliary_loss_mlp": 0.01269452, + "balance_loss_clip": 0.06281701, + "balance_loss_mlp": 0.01255123, + "epoch": 0.43637456786412143, + "flos": 22424999708160.0, + "grad_norm": 1.7230532534800784, + "language_loss": 0.73248196, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.80969918, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14324951, + "step": 7258, + "time_per_iteration": 2.6132447719573975 + }, + { + "auxiliary_loss_clip": 0.06453618, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06282197, + "balance_loss_mlp": 0.01253725, + "epoch": 0.4364346911167894, + "flos": 28556452627200.0, + "grad_norm": 1.8100021880336497, + "language_loss": 0.77633202, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.85353959, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13415527, + "step": 7259, + "time_per_iteration": 2.589134931564331 + }, + { + "auxiliary_loss_clip": 0.06393245, + "auxiliary_loss_mlp": 0.01278627, + "balance_loss_clip": 0.0631365, + "balance_loss_mlp": 0.01275647, + "epoch": 0.43649481436945736, + "flos": 62678149407360.0, + "grad_norm": 0.7458705100033151, + "language_loss": 0.56939262, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.64611137, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.02978516, + "step": 7260, + "time_per_iteration": 3.11572265625 + }, + { + "auxiliary_loss_clip": 0.06454421, + "auxiliary_loss_mlp": 0.01272288, + "balance_loss_clip": 0.06285764, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4365549376221254, + "flos": 30600088583040.0, + "grad_norm": 1.806363539403124, + "language_loss": 0.71915948, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.79642659, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14794922, + "step": 7261, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.06453972, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06284794, + "balance_loss_mlp": 0.0125836, + "epoch": 0.43661506087479335, + "flos": 17171684778240.0, + "grad_norm": 2.033659544742114, + "language_loss": 0.69274759, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.77000701, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13604736, + "step": 7262, + "time_per_iteration": 2.556318521499634 + }, + { + "auxiliary_loss_clip": 0.0644339, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01253345, + "epoch": 0.4366751841274613, + "flos": 22052875726080.0, + "grad_norm": 1.6437752521732585, + "language_loss": 0.80115777, + "learning_rate": 2.501852344559726e-06, + "loss": 0.87825286, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.12780762, + "step": 7263, + "time_per_iteration": 2.509807825088501 + }, + { + "auxiliary_loss_clip": 0.06448945, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06281485, + "balance_loss_mlp": 0.01254076, + "epoch": 0.4367353073801293, + "flos": 16002748794240.0, + "grad_norm": 1.6772415302555446, + "language_loss": 0.76036841, + "learning_rate": 2.50147533371401e-06, + "loss": 0.83753204, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13354492, + "step": 7264, + "time_per_iteration": 2.523973226547241 + }, + { + "auxiliary_loss_clip": 0.06444526, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01253997, + "epoch": 0.43679543063279724, + "flos": 38226760485120.0, + "grad_norm": 2.1479145935669615, + "language_loss": 0.61845875, + "learning_rate": 2.501098303852298e-06, + "loss": 0.69558173, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13787842, + "step": 7265, + "time_per_iteration": 2.6696202754974365 + }, + { + "auxiliary_loss_clip": 0.06447139, + "auxiliary_loss_mlp": 0.01269097, + "balance_loss_clip": 0.06282498, + "balance_loss_mlp": 0.01256211, + "epoch": 0.4368555538854652, + "flos": 15198306071040.0, + "grad_norm": 1.934873925186605, + "language_loss": 0.73721504, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.81437743, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12896729, + "step": 7266, + "time_per_iteration": 2.5559945106506348 + }, + { + "auxiliary_loss_clip": 0.0644975, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06282988, + "balance_loss_mlp": 0.01260432, + "epoch": 0.4369156771381332, + "flos": 23074630813440.0, + "grad_norm": 2.1253877681457904, + "language_loss": 0.82184762, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.899077, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12762451, + "step": 7267, + "time_per_iteration": 2.534639358520508 + }, + { + "auxiliary_loss_clip": 0.06444408, + "auxiliary_loss_mlp": 0.01269536, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256459, + "epoch": 0.43697580039080114, + "flos": 23447886825600.0, + "grad_norm": 2.09966668439896, + "language_loss": 0.75195235, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.82909179, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13085938, + "step": 7268, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06451406, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125451, + "epoch": 0.4370359236434691, + "flos": 18520519478400.0, + "grad_norm": 3.050341004743464, + "language_loss": 0.79660171, + "learning_rate": 2.499589994531454e-06, + "loss": 0.87380207, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14099121, + "step": 7269, + "time_per_iteration": 2.516211986541748 + }, + { + "auxiliary_loss_clip": 0.06446489, + "auxiliary_loss_mlp": 0.01273185, + "balance_loss_clip": 0.06281964, + "balance_loss_mlp": 0.01260174, + "epoch": 0.43709604689613707, + "flos": 23229316650240.0, + "grad_norm": 1.8886828014681587, + "language_loss": 0.75057715, + "learning_rate": 2.499212869804237e-06, + "loss": 0.82777393, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13024902, + "step": 7270, + "time_per_iteration": 2.5755550861358643 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01268284, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01255064, + "epoch": 0.43715617014880503, + "flos": 23810199880320.0, + "grad_norm": 1.808972971243201, + "language_loss": 0.79453981, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.87169278, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13220215, + "step": 7271, + "time_per_iteration": 2.564471960067749 + }, + { + "auxiliary_loss_clip": 0.06369642, + "auxiliary_loss_mlp": 0.01258814, + "balance_loss_clip": 0.0629034, + "balance_loss_mlp": 0.01255858, + "epoch": 0.437216293401473, + "flos": 61961824851840.0, + "grad_norm": 0.6886560925106296, + "language_loss": 0.54733157, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.62361616, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.02954102, + "step": 7272, + "time_per_iteration": 3.208707332611084 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01270794, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01256757, + "epoch": 0.43727641665414096, + "flos": 21988907533440.0, + "grad_norm": 1.571184799437717, + "language_loss": 0.70994467, + "learning_rate": 2.498081382098581e-06, + "loss": 0.78716844, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14031982, + "step": 7273, + "time_per_iteration": 2.540081262588501 + }, + { + "auxiliary_loss_clip": 0.06448624, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06279367, + "balance_loss_mlp": 0.01255271, + "epoch": 0.437336539906809, + "flos": 39540277889280.0, + "grad_norm": 1.8107596290780341, + "language_loss": 0.7551834, + "learning_rate": 2.497704181736367e-06, + "loss": 0.83236134, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13903809, + "step": 7274, + "time_per_iteration": 2.6836495399475098 + }, + { + "auxiliary_loss_clip": 0.06441884, + "auxiliary_loss_mlp": 0.01265059, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01252703, + "epoch": 0.43739666315947695, + "flos": 17462902043520.0, + "grad_norm": 1.9085211858375455, + "language_loss": 0.80314881, + "learning_rate": 2.49732696250116e-06, + "loss": 0.88021827, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.12353516, + "step": 7275, + "time_per_iteration": 2.5408823490142822 + }, + { + "auxiliary_loss_clip": 0.06450746, + "auxiliary_loss_mlp": 0.01272848, + "balance_loss_clip": 0.06284586, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4374567864121449, + "flos": 16363678256640.0, + "grad_norm": 1.98644372860744, + "language_loss": 0.81298435, + "learning_rate": 2.496949724407266e-06, + "loss": 0.89022022, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13092041, + "step": 7276, + "time_per_iteration": 2.4871010780334473 + }, + { + "auxiliary_loss_clip": 0.06454313, + "auxiliary_loss_mlp": 0.01266955, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01253013, + "epoch": 0.4375169096648129, + "flos": 30594721921920.0, + "grad_norm": 1.9320579241517422, + "language_loss": 0.73048055, + "learning_rate": 2.496572467468988e-06, + "loss": 0.8076933, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1394043, + "step": 7277, + "time_per_iteration": 2.6151673793792725 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01272648, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01258939, + "epoch": 0.43757703291748085, + "flos": 30563555402880.0, + "grad_norm": 1.9557335242574223, + "language_loss": 0.72527206, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.80245006, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13696289, + "step": 7278, + "time_per_iteration": 2.583293914794922 + }, + { + "auxiliary_loss_clip": 0.06440841, + "auxiliary_loss_mlp": 0.01270709, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.01258371, + "epoch": 0.4376371561701488, + "flos": 21403747745280.0, + "grad_norm": 1.4778175335443475, + "language_loss": 0.65870327, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.73581874, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12329102, + "step": 7279, + "time_per_iteration": 2.5419130325317383 + }, + { + "auxiliary_loss_clip": 0.06451775, + "auxiliary_loss_mlp": 0.01270137, + "balance_loss_clip": 0.06279162, + "balance_loss_mlp": 0.01256559, + "epoch": 0.4376972794228168, + "flos": 23411144010240.0, + "grad_norm": 1.7454635588007905, + "language_loss": 0.8264519, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.90367103, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13568115, + "step": 7280, + "time_per_iteration": 2.5270493030548096 + }, + { + "auxiliary_loss_clip": 0.06438784, + "auxiliary_loss_mlp": 0.01272842, + "balance_loss_clip": 0.06277376, + "balance_loss_mlp": 0.01259848, + "epoch": 0.43775740267548474, + "flos": 22899511779840.0, + "grad_norm": 1.6085189920631162, + "language_loss": 0.7756325, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.85274875, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13000488, + "step": 7281, + "time_per_iteration": 2.614102602005005 + }, + { + "auxiliary_loss_clip": 0.0644282, + "auxiliary_loss_mlp": 0.01275956, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01263028, + "epoch": 0.4378175259281527, + "flos": 23301041345280.0, + "grad_norm": 1.8125010794319167, + "language_loss": 0.7622053, + "learning_rate": 2.494685900612569e-06, + "loss": 0.83939308, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12915039, + "step": 7282, + "time_per_iteration": 3.9149930477142334 + }, + { + "auxiliary_loss_clip": 0.06446523, + "auxiliary_loss_mlp": 0.01267087, + "balance_loss_clip": 0.06279582, + "balance_loss_mlp": 0.01254438, + "epoch": 0.43787764918082067, + "flos": 23883433948800.0, + "grad_norm": 2.0076194716834874, + "language_loss": 0.85396934, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.93110549, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12652588, + "step": 7283, + "time_per_iteration": 3.9656553268432617 + }, + { + "auxiliary_loss_clip": 0.0644891, + "auxiliary_loss_mlp": 0.01268213, + "balance_loss_clip": 0.06279234, + "balance_loss_mlp": 0.01254999, + "epoch": 0.43793777243348864, + "flos": 23995004060160.0, + "grad_norm": 1.8602515290448327, + "language_loss": 0.8091675, + "learning_rate": 2.49393114246007e-06, + "loss": 0.88633871, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13214111, + "step": 7284, + "time_per_iteration": 2.566521167755127 + }, + { + "auxiliary_loss_clip": 0.06443676, + "auxiliary_loss_mlp": 0.0127107, + "balance_loss_clip": 0.06278057, + "balance_loss_mlp": 0.01258774, + "epoch": 0.4379978956861566, + "flos": 18629909383680.0, + "grad_norm": 1.7731724137458924, + "language_loss": 0.80635571, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8835032, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.12310791, + "step": 7285, + "time_per_iteration": 2.5004618167877197 + }, + { + "auxiliary_loss_clip": 0.0643899, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06274976, + "balance_loss_mlp": 0.01256642, + "epoch": 0.43805801893882457, + "flos": 21987901284480.0, + "grad_norm": 1.9005617879541583, + "language_loss": 0.75070119, + "learning_rate": 2.493176309387897e-06, + "loss": 0.82778776, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13031006, + "step": 7286, + "time_per_iteration": 2.5617265701293945 + }, + { + "auxiliary_loss_clip": 0.0644343, + "auxiliary_loss_mlp": 0.01269982, + "balance_loss_clip": 0.06274993, + "balance_loss_mlp": 0.01257239, + "epoch": 0.43811814219149253, + "flos": 26400114529920.0, + "grad_norm": 2.124374396883661, + "language_loss": 0.73769003, + "learning_rate": 2.492798864792712e-06, + "loss": 0.81482422, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.12738037, + "step": 7287, + "time_per_iteration": 2.5709421634674072 + }, + { + "auxiliary_loss_clip": 0.06442735, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06276426, + "balance_loss_mlp": 0.01259115, + "epoch": 0.43817826544416055, + "flos": 17499015953280.0, + "grad_norm": 1.6607447345750057, + "language_loss": 0.82538438, + "learning_rate": 2.492421401510545e-06, + "loss": 0.90254092, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13812256, + "step": 7288, + "time_per_iteration": 3.92202091217041 + }, + { + "auxiliary_loss_clip": 0.06447385, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06276591, + "balance_loss_mlp": 0.01254888, + "epoch": 0.4382383886968285, + "flos": 21587629530240.0, + "grad_norm": 1.4460149141548964, + "language_loss": 0.84252048, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.9196828, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1395874, + "step": 7289, + "time_per_iteration": 2.557433843612671 + }, + { + "auxiliary_loss_clip": 0.06446871, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06274465, + "balance_loss_mlp": 0.01254912, + "epoch": 0.4382985119494965, + "flos": 27930441173760.0, + "grad_norm": 2.36337419111835, + "language_loss": 0.78573066, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.86287904, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13067627, + "step": 7290, + "time_per_iteration": 2.5970215797424316 + }, + { + "auxiliary_loss_clip": 0.06439934, + "auxiliary_loss_mlp": 0.01272143, + "balance_loss_clip": 0.06275328, + "balance_loss_mlp": 0.0125903, + "epoch": 0.43835863520216445, + "flos": 24943860495360.0, + "grad_norm": 1.8528017599911322, + "language_loss": 0.7800144, + "learning_rate": 2.491288899685288e-06, + "loss": 0.85713518, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13110352, + "step": 7291, + "time_per_iteration": 2.5944950580596924 + }, + { + "auxiliary_loss_clip": 0.06443708, + "auxiliary_loss_mlp": 0.01274453, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0126106, + "epoch": 0.4384187584548324, + "flos": 33518634145920.0, + "grad_norm": 1.8972630881774872, + "language_loss": 0.64874315, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.72592473, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13391113, + "step": 7292, + "time_per_iteration": 2.628173351287842 + }, + { + "auxiliary_loss_clip": 0.06447129, + "auxiliary_loss_mlp": 0.01269671, + "balance_loss_clip": 0.06278794, + "balance_loss_mlp": 0.01256653, + "epoch": 0.4384788817075004, + "flos": 23957800047360.0, + "grad_norm": 1.5925770854238166, + "language_loss": 0.74671286, + "learning_rate": 2.49053380529597e-06, + "loss": 0.82388091, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13031006, + "step": 7293, + "time_per_iteration": 3.9379074573516846 + }, + { + "auxiliary_loss_clip": 0.06446324, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06279649, + "balance_loss_mlp": 0.0125668, + "epoch": 0.43853900496016834, + "flos": 19104463382400.0, + "grad_norm": 4.9627482836353165, + "language_loss": 0.7920171, + "learning_rate": 2.490156230192516e-06, + "loss": 0.86918819, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14099121, + "step": 7294, + "time_per_iteration": 2.4718902111053467 + }, + { + "auxiliary_loss_clip": 0.06450905, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06283231, + "balance_loss_mlp": 0.01256252, + "epoch": 0.4385991282128363, + "flos": 13230503660160.0, + "grad_norm": 1.631074893492929, + "language_loss": 0.73162925, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.80883634, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13574219, + "step": 7295, + "time_per_iteration": 2.531641721725464 + }, + { + "auxiliary_loss_clip": 0.06452312, + "auxiliary_loss_mlp": 0.01270937, + "balance_loss_clip": 0.06283045, + "balance_loss_mlp": 0.01256298, + "epoch": 0.4386592514655043, + "flos": 14325199326720.0, + "grad_norm": 2.435451861079371, + "language_loss": 0.75030828, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.8275407, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14648438, + "step": 7296, + "time_per_iteration": 2.4799978733062744 + }, + { + "auxiliary_loss_clip": 0.06443385, + "auxiliary_loss_mlp": 0.01270746, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01257598, + "epoch": 0.43871937471817224, + "flos": 22791128123520.0, + "grad_norm": 1.513671798105688, + "language_loss": 0.69379568, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.77093697, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13146973, + "step": 7297, + "time_per_iteration": 2.5378599166870117 + }, + { + "auxiliary_loss_clip": 0.06447895, + "auxiliary_loss_mlp": 0.01268794, + "balance_loss_clip": 0.06281355, + "balance_loss_mlp": 0.01255878, + "epoch": 0.4387794979708402, + "flos": 28079466860160.0, + "grad_norm": 1.3753147611046208, + "language_loss": 0.70496702, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.78213394, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.12902832, + "step": 7298, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06442846, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258023, + "epoch": 0.43883962122350817, + "flos": 26256665139840.0, + "grad_norm": 1.5271246100670304, + "language_loss": 0.72762883, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12976074, + "step": 7299, + "time_per_iteration": 2.567258834838867 + }, + { + "auxiliary_loss_clip": 0.06449576, + "auxiliary_loss_mlp": 0.012749, + "balance_loss_clip": 0.06281091, + "balance_loss_mlp": 0.01260142, + "epoch": 0.43889974447617613, + "flos": 25890662505600.0, + "grad_norm": 1.7549107290593968, + "language_loss": 0.76878119, + "learning_rate": 2.487890389750719e-06, + "loss": 0.84602594, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14758301, + "step": 7300, + "time_per_iteration": 2.541740655899048 + }, + { + "auxiliary_loss_clip": 0.06448291, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254346, + "epoch": 0.43895986772884416, + "flos": 25053711598080.0, + "grad_norm": 2.544712476821277, + "language_loss": 0.71268392, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.78984845, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13824463, + "step": 7301, + "time_per_iteration": 2.547846794128418 + }, + { + "auxiliary_loss_clip": 0.06445279, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06277898, + "balance_loss_mlp": 0.01254434, + "epoch": 0.4390199909815121, + "flos": 26001729492480.0, + "grad_norm": 4.607507625532986, + "language_loss": 0.71274817, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.78989553, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.15026855, + "step": 7302, + "time_per_iteration": 2.531633138656616 + }, + { + "auxiliary_loss_clip": 0.06444067, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06280646, + "balance_loss_mlp": 0.01254618, + "epoch": 0.4390801142341801, + "flos": 29029790741760.0, + "grad_norm": 1.545722029471357, + "language_loss": 0.82388735, + "learning_rate": 2.486757219574983e-06, + "loss": 0.90100312, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12908936, + "step": 7303, + "time_per_iteration": 2.6841824054718018 + }, + { + "auxiliary_loss_clip": 0.06456171, + "auxiliary_loss_mlp": 0.01271253, + "balance_loss_clip": 0.06284264, + "balance_loss_mlp": 0.01256649, + "epoch": 0.43914023748684805, + "flos": 33447077159040.0, + "grad_norm": 2.3091286506484034, + "language_loss": 0.69152826, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.76880252, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.1461792, + "step": 7304, + "time_per_iteration": 2.6893982887268066 + }, + { + "auxiliary_loss_clip": 0.06439492, + "auxiliary_loss_mlp": 0.01269095, + "balance_loss_clip": 0.06278437, + "balance_loss_mlp": 0.01256507, + "epoch": 0.439200360739516, + "flos": 34540347306240.0, + "grad_norm": 1.5007015420493954, + "language_loss": 0.78744507, + "learning_rate": 2.486001680477873e-06, + "loss": 0.86453092, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12573242, + "step": 7305, + "time_per_iteration": 2.6403284072875977 + }, + { + "auxiliary_loss_clip": 0.06446742, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255019, + "epoch": 0.439260483992184, + "flos": 21914247945600.0, + "grad_norm": 1.7423010107893722, + "language_loss": 0.68937683, + "learning_rate": 2.485623883278308e-06, + "loss": 0.76653659, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14221191, + "step": 7306, + "time_per_iteration": 2.5665781497955322 + }, + { + "auxiliary_loss_clip": 0.06446797, + "auxiliary_loss_mlp": 0.01272443, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01258877, + "epoch": 0.43932060724485195, + "flos": 21002805158400.0, + "grad_norm": 1.5749593715316206, + "language_loss": 0.63249755, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.70968997, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13562012, + "step": 7307, + "time_per_iteration": 2.5204410552978516 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.0126805, + "balance_loss_clip": 0.06279462, + "balance_loss_mlp": 0.01254305, + "epoch": 0.4393807304975199, + "flos": 17752526081280.0, + "grad_norm": 1.900088770074622, + "language_loss": 0.72216207, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.79933721, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13745117, + "step": 7308, + "time_per_iteration": 2.4988410472869873 + }, + { + "auxiliary_loss_clip": 0.06445662, + "auxiliary_loss_mlp": 0.01268116, + "balance_loss_clip": 0.06277111, + "balance_loss_mlp": 0.01254669, + "epoch": 0.4394408537501879, + "flos": 22535102373120.0, + "grad_norm": 2.200318468716899, + "language_loss": 0.76911771, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.84625548, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13458252, + "step": 7309, + "time_per_iteration": 2.521385431289673 + }, + { + "auxiliary_loss_clip": 0.06438792, + "auxiliary_loss_mlp": 0.01270246, + "balance_loss_clip": 0.06277418, + "balance_loss_mlp": 0.01257908, + "epoch": 0.43950097700285584, + "flos": 23447383701120.0, + "grad_norm": 3.092354645663241, + "language_loss": 0.71101463, + "learning_rate": 2.484112510474251e-06, + "loss": 0.78810501, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12335205, + "step": 7310, + "time_per_iteration": 2.609769344329834 + }, + { + "auxiliary_loss_clip": 0.06452246, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.0125624, + "epoch": 0.4395611002555238, + "flos": 23186620195200.0, + "grad_norm": 3.6443795998554744, + "language_loss": 0.76179528, + "learning_rate": 2.483734621343429e-06, + "loss": 0.83900821, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.12817383, + "step": 7311, + "time_per_iteration": 2.5347063541412354 + }, + { + "auxiliary_loss_clip": 0.06451476, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01258043, + "epoch": 0.43962122350819177, + "flos": 22133908224000.0, + "grad_norm": 1.9101034753519561, + "language_loss": 0.81546378, + "learning_rate": 2.483356713869341e-06, + "loss": 0.89268786, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.12890625, + "step": 7312, + "time_per_iteration": 2.5744950771331787 + }, + { + "auxiliary_loss_clip": 0.06441756, + "auxiliary_loss_mlp": 0.01268695, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255713, + "epoch": 0.43968134676085974, + "flos": 17426285009280.0, + "grad_norm": 1.9172183853591918, + "language_loss": 0.86001694, + "learning_rate": 2.482978788066318e-06, + "loss": 0.93712139, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.12982178, + "step": 7313, + "time_per_iteration": 2.536870241165161 + }, + { + "auxiliary_loss_clip": 0.06445049, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01258184, + "epoch": 0.43974147001352776, + "flos": 18958582224000.0, + "grad_norm": 6.24702313006486, + "language_loss": 0.679317, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.75647992, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13061523, + "step": 7314, + "time_per_iteration": 2.5457370281219482 + }, + { + "auxiliary_loss_clip": 0.06448518, + "auxiliary_loss_mlp": 0.01271322, + "balance_loss_clip": 0.06279253, + "balance_loss_mlp": 0.01258209, + "epoch": 0.4398015932661957, + "flos": 18959588472960.0, + "grad_norm": 1.6336273312910292, + "language_loss": 0.76986659, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.84706497, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13098145, + "step": 7315, + "time_per_iteration": 2.5225329399108887 + }, + { + "auxiliary_loss_clip": 0.06442133, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06276898, + "balance_loss_mlp": 0.01255447, + "epoch": 0.4398617165188637, + "flos": 24205608097920.0, + "grad_norm": 2.1993234427936637, + "language_loss": 0.74934149, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.8264451, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.12780762, + "step": 7316, + "time_per_iteration": 2.5561742782592773 + }, + { + "auxiliary_loss_clip": 0.06444536, + "auxiliary_loss_mlp": 0.01271979, + "balance_loss_clip": 0.06280385, + "balance_loss_mlp": 0.01259289, + "epoch": 0.43992183977153165, + "flos": 22243214275200.0, + "grad_norm": 2.7598614180807814, + "language_loss": 0.65349543, + "learning_rate": 2.481466901851506e-06, + "loss": 0.73066062, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.12695312, + "step": 7317, + "time_per_iteration": 2.5142266750335693 + }, + { + "auxiliary_loss_clip": 0.06450248, + "auxiliary_loss_mlp": 0.01270442, + "balance_loss_clip": 0.06283192, + "balance_loss_mlp": 0.01256929, + "epoch": 0.4399819630241996, + "flos": 18703395014400.0, + "grad_norm": 1.826408349581849, + "language_loss": 0.80062312, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.87783003, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13519287, + "step": 7318, + "time_per_iteration": 2.519906520843506 + }, + { + "auxiliary_loss_clip": 0.06445621, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01255725, + "epoch": 0.4400420862768676, + "flos": 23886326914560.0, + "grad_norm": 1.6582419144412086, + "language_loss": 0.79880667, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.87595713, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13708496, + "step": 7319, + "time_per_iteration": 2.593442440032959 + }, + { + "auxiliary_loss_clip": 0.06445733, + "auxiliary_loss_mlp": 0.01274619, + "balance_loss_clip": 0.06279506, + "balance_loss_mlp": 0.01260547, + "epoch": 0.44010220952953555, + "flos": 28045071959040.0, + "grad_norm": 2.6685359162637172, + "language_loss": 0.80292428, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.88012779, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14074707, + "step": 7320, + "time_per_iteration": 2.576824188232422 + }, + { + "auxiliary_loss_clip": 0.06443729, + "auxiliary_loss_mlp": 0.01271309, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01258333, + "epoch": 0.4401623327822035, + "flos": 23775763052160.0, + "grad_norm": 3.573791590582856, + "language_loss": 0.69620574, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.77335614, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.12969971, + "step": 7321, + "time_per_iteration": 4.008130311965942 + }, + { + "auxiliary_loss_clip": 0.06352215, + "auxiliary_loss_mlp": 0.01268902, + "balance_loss_clip": 0.06277325, + "balance_loss_mlp": 0.01265612, + "epoch": 0.4402224560348715, + "flos": 70797320081280.0, + "grad_norm": 0.8902034574652531, + "language_loss": 0.56966496, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.64587617, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03295898, + "step": 7322, + "time_per_iteration": 4.591723680496216 + }, + { + "auxiliary_loss_clip": 0.06443685, + "auxiliary_loss_mlp": 0.01271286, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01258787, + "epoch": 0.44028257928753944, + "flos": 22898170114560.0, + "grad_norm": 1.423216656342095, + "language_loss": 0.76491451, + "learning_rate": 2.479198525097822e-06, + "loss": 0.8420642, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12493896, + "step": 7323, + "time_per_iteration": 2.5367372035980225 + }, + { + "auxiliary_loss_clip": 0.06449594, + "auxiliary_loss_mlp": 0.01277882, + "balance_loss_clip": 0.06282798, + "balance_loss_mlp": 0.01265216, + "epoch": 0.4403427025402074, + "flos": 17901719475840.0, + "grad_norm": 1.6412485345287482, + "language_loss": 0.80679965, + "learning_rate": 2.478820398622511e-06, + "loss": 0.88407433, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12670898, + "step": 7324, + "time_per_iteration": 2.496735095977783 + }, + { + "auxiliary_loss_clip": 0.0634661, + "auxiliary_loss_mlp": 0.01259308, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01255979, + "epoch": 0.4404028257928754, + "flos": 69583717071360.0, + "grad_norm": 0.6517122364434149, + "language_loss": 0.54482663, + "learning_rate": 2.478442253990283e-06, + "loss": 0.62088585, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03335571, + "step": 7325, + "time_per_iteration": 3.1927096843719482 + }, + { + "auxiliary_loss_clip": 0.06445315, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281503, + "balance_loss_mlp": 0.01253981, + "epoch": 0.44046294904554334, + "flos": 20930074214400.0, + "grad_norm": 1.5304533021700073, + "language_loss": 0.69945073, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.77656674, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.12298584, + "step": 7326, + "time_per_iteration": 2.5716168880462646 + }, + { + "auxiliary_loss_clip": 0.06441578, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06279023, + "balance_loss_mlp": 0.01254402, + "epoch": 0.44052307229821136, + "flos": 23630301164160.0, + "grad_norm": 1.488040619087652, + "language_loss": 0.76529855, + "learning_rate": 2.477685910312432e-06, + "loss": 0.84238315, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.12481689, + "step": 7327, + "time_per_iteration": 3.997654676437378 + }, + { + "auxiliary_loss_clip": 0.06439877, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277373, + "balance_loss_mlp": 0.01256744, + "epoch": 0.4405831955508793, + "flos": 17602536072960.0, + "grad_norm": 2.6410067735498512, + "language_loss": 0.83833683, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.91543245, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1295166, + "step": 7328, + "time_per_iteration": 2.520899534225464 + }, + { + "auxiliary_loss_clip": 0.06445633, + "auxiliary_loss_mlp": 0.01268864, + "balance_loss_clip": 0.06283547, + "balance_loss_mlp": 0.01255703, + "epoch": 0.4406433188035473, + "flos": 21468596405760.0, + "grad_norm": 3.134642090151518, + "language_loss": 0.77723283, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.85437775, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13165283, + "step": 7329, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01272671, + "balance_loss_clip": 0.06280035, + "balance_loss_mlp": 0.01259176, + "epoch": 0.44070344205621526, + "flos": 22680019209600.0, + "grad_norm": 1.6769566948090702, + "language_loss": 0.74290001, + "learning_rate": 2.476551258977278e-06, + "loss": 0.82010818, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1348877, + "step": 7330, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.06448483, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.06283589, + "balance_loss_mlp": 0.01258127, + "epoch": 0.4407635653088832, + "flos": 23448012606720.0, + "grad_norm": 1.699983061814717, + "language_loss": 0.74538559, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.82257915, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12762451, + "step": 7331, + "time_per_iteration": 2.5442659854888916 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06279509, + "balance_loss_mlp": 0.01256667, + "epoch": 0.4408236885615512, + "flos": 24027596098560.0, + "grad_norm": 1.6889636086213913, + "language_loss": 0.76643395, + "learning_rate": 2.475794734375581e-06, + "loss": 0.84356534, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13659668, + "step": 7332, + "time_per_iteration": 2.5714762210845947 + }, + { + "auxiliary_loss_clip": 0.06442308, + "auxiliary_loss_mlp": 0.01271754, + "balance_loss_clip": 0.06277508, + "balance_loss_mlp": 0.01258272, + "epoch": 0.44088381181421915, + "flos": 12681667416960.0, + "grad_norm": 1.845933322464005, + "language_loss": 0.73768836, + "learning_rate": 2.475416445004285e-06, + "loss": 0.81482899, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1348877, + "step": 7333, + "time_per_iteration": 3.9176201820373535 + }, + { + "auxiliary_loss_clip": 0.06439593, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06280486, + "balance_loss_mlp": 0.01253486, + "epoch": 0.4409439350668871, + "flos": 24576474268800.0, + "grad_norm": 1.6297964144317614, + "language_loss": 0.79249531, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.8695479, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12200928, + "step": 7334, + "time_per_iteration": 2.530762195587158 + }, + { + "auxiliary_loss_clip": 0.06456793, + "auxiliary_loss_mlp": 0.01269696, + "balance_loss_clip": 0.06281539, + "balance_loss_mlp": 0.01254747, + "epoch": 0.4410040583195551, + "flos": 22674191351040.0, + "grad_norm": 7.845487214918662, + "language_loss": 0.7603153, + "learning_rate": 2.47465981219252e-06, + "loss": 0.83758014, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1496582, + "step": 7335, + "time_per_iteration": 2.5146994590759277 + }, + { + "auxiliary_loss_clip": 0.06445056, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01254942, + "epoch": 0.44106418157222305, + "flos": 10857062833920.0, + "grad_norm": 1.9701535584859973, + "language_loss": 0.72720182, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.80434465, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14263916, + "step": 7336, + "time_per_iteration": 2.470501661300659 + }, + { + "auxiliary_loss_clip": 0.06448875, + "auxiliary_loss_mlp": 0.01272884, + "balance_loss_clip": 0.06281201, + "balance_loss_mlp": 0.01259079, + "epoch": 0.441124304824891, + "flos": 21733301053440.0, + "grad_norm": 2.690720747597236, + "language_loss": 0.62764168, + "learning_rate": 2.473903107384165e-06, + "loss": 0.70485932, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13812256, + "step": 7337, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06339368, + "auxiliary_loss_mlp": 0.01255392, + "balance_loss_clip": 0.06265444, + "balance_loss_mlp": 0.01252635, + "epoch": 0.441184428077559, + "flos": 63241702041600.0, + "grad_norm": 0.7296971987367982, + "language_loss": 0.52622962, + "learning_rate": 2.473524728017134e-06, + "loss": 0.60217726, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.02761841, + "step": 7338, + "time_per_iteration": 3.1634135246276855 + }, + { + "auxiliary_loss_clip": 0.06451306, + "auxiliary_loss_mlp": 0.0127376, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01259133, + "epoch": 0.44124455133022694, + "flos": 21184213248000.0, + "grad_norm": 2.888450189779477, + "language_loss": 0.71053195, + "learning_rate": 2.473146330693997e-06, + "loss": 0.78778255, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14611816, + "step": 7339, + "time_per_iteration": 2.526179552078247 + }, + { + "auxiliary_loss_clip": 0.06437125, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.06279349, + "balance_loss_mlp": 0.01252833, + "epoch": 0.4413046745828949, + "flos": 17463740584320.0, + "grad_norm": 1.6365123651784117, + "language_loss": 0.70282859, + "learning_rate": 2.472767915429105e-06, + "loss": 0.77985364, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12554932, + "step": 7340, + "time_per_iteration": 2.4790234565734863 + }, + { + "auxiliary_loss_clip": 0.06342094, + "auxiliary_loss_mlp": 0.01254424, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.01251767, + "epoch": 0.4413647978355629, + "flos": 61602251783040.0, + "grad_norm": 0.8821319445569078, + "language_loss": 0.64009017, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.71605539, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7341, + "time_per_iteration": 2.9593453407287598 + }, + { + "auxiliary_loss_clip": 0.06446001, + "auxiliary_loss_mlp": 0.0127129, + "balance_loss_clip": 0.06280506, + "balance_loss_mlp": 0.01257992, + "epoch": 0.4414249210882309, + "flos": 27534404050560.0, + "grad_norm": 1.9827417031820809, + "language_loss": 0.73812068, + "learning_rate": 2.47201103113145e-06, + "loss": 0.81529361, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13299561, + "step": 7342, + "time_per_iteration": 2.5592381954193115 + }, + { + "auxiliary_loss_clip": 0.06443819, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258497, + "epoch": 0.44148504434089886, + "flos": 23520785477760.0, + "grad_norm": 1.7847903417039304, + "language_loss": 0.80326116, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.88042319, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13885498, + "step": 7343, + "time_per_iteration": 2.567669630050659 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01268371, + "balance_loss_clip": 0.06281629, + "balance_loss_mlp": 0.01254382, + "epoch": 0.4415451675935668, + "flos": 21587126405760.0, + "grad_norm": 1.6274174275387656, + "language_loss": 0.7678231, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.84496725, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14001465, + "step": 7344, + "time_per_iteration": 2.50498628616333 + }, + { + "auxiliary_loss_clip": 0.06331868, + "auxiliary_loss_mlp": 0.01254509, + "balance_loss_clip": 0.06258254, + "balance_loss_mlp": 0.01251979, + "epoch": 0.4416052908462348, + "flos": 59023825142400.0, + "grad_norm": 0.9594048262741005, + "language_loss": 0.63725042, + "learning_rate": 2.470875570480556e-06, + "loss": 0.71311414, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02529907, + "step": 7345, + "time_per_iteration": 2.9305789470672607 + }, + { + "auxiliary_loss_clip": 0.06448534, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06281187, + "balance_loss_mlp": 0.01255386, + "epoch": 0.44166541409890275, + "flos": 26364545671680.0, + "grad_norm": 1.5861169822925434, + "language_loss": 0.86231661, + "learning_rate": 2.470497047866489e-06, + "loss": 0.9394989, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14306641, + "step": 7346, + "time_per_iteration": 2.566326141357422 + }, + { + "auxiliary_loss_clip": 0.06448992, + "auxiliary_loss_mlp": 0.01268131, + "balance_loss_clip": 0.06282933, + "balance_loss_mlp": 0.01253909, + "epoch": 0.4417255373515707, + "flos": 20198739778560.0, + "grad_norm": 1.9006247897038917, + "language_loss": 0.80872411, + "learning_rate": 2.470118507411128e-06, + "loss": 0.88589537, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14221191, + "step": 7347, + "time_per_iteration": 2.4968490600585938 + }, + { + "auxiliary_loss_clip": 0.06445403, + "auxiliary_loss_mlp": 0.01269031, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01254166, + "epoch": 0.4417856606042387, + "flos": 17892537454080.0, + "grad_norm": 1.9280841383218132, + "language_loss": 0.83507645, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.91222078, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14868164, + "step": 7348, + "time_per_iteration": 2.5483500957489014 + }, + { + "auxiliary_loss_clip": 0.06451687, + "auxiliary_loss_mlp": 0.01270301, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.0125571, + "epoch": 0.44184578385690665, + "flos": 27971376693120.0, + "grad_norm": 2.209333058456871, + "language_loss": 0.70229864, + "learning_rate": 2.469361373033938e-06, + "loss": 0.77951854, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14593506, + "step": 7349, + "time_per_iteration": 2.5552031993865967 + }, + { + "auxiliary_loss_clip": 0.06448848, + "auxiliary_loss_mlp": 0.01269717, + "balance_loss_clip": 0.06281149, + "balance_loss_mlp": 0.01254858, + "epoch": 0.4419059071095746, + "flos": 23374652757120.0, + "grad_norm": 1.8931524120790788, + "language_loss": 0.74732667, + "learning_rate": 2.468982779140819e-06, + "loss": 0.82451236, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14855957, + "step": 7350, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06449752, + "auxiliary_loss_mlp": 0.01269052, + "balance_loss_clip": 0.06283528, + "balance_loss_mlp": 0.01254591, + "epoch": 0.4419660303622426, + "flos": 15017443032960.0, + "grad_norm": 2.6211867622298626, + "language_loss": 0.81412131, + "learning_rate": 2.468604167463827e-06, + "loss": 0.89130938, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14453125, + "step": 7351, + "time_per_iteration": 2.5310895442962646 + }, + { + "auxiliary_loss_clip": 0.06439559, + "auxiliary_loss_mlp": 0.01271292, + "balance_loss_clip": 0.06278528, + "balance_loss_mlp": 0.01258537, + "epoch": 0.44202615361491054, + "flos": 25378359442560.0, + "grad_norm": 1.998249332467298, + "language_loss": 0.73669267, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.81380117, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12774658, + "step": 7352, + "time_per_iteration": 2.6823537349700928 + }, + { + "auxiliary_loss_clip": 0.06450884, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06284234, + "balance_loss_mlp": 0.01253584, + "epoch": 0.4420862768675785, + "flos": 24688044380160.0, + "grad_norm": 1.9707834429969424, + "language_loss": 0.87580955, + "learning_rate": 2.467846890815649e-06, + "loss": 0.95299494, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14086914, + "step": 7353, + "time_per_iteration": 2.531208038330078 + }, + { + "auxiliary_loss_clip": 0.06445745, + "auxiliary_loss_mlp": 0.01274404, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01260659, + "epoch": 0.44214640012024653, + "flos": 19533134471040.0, + "grad_norm": 2.5061219192509676, + "language_loss": 0.76425511, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.84145659, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13751221, + "step": 7354, + "time_per_iteration": 2.5208046436309814 + }, + { + "auxiliary_loss_clip": 0.06442366, + "auxiliary_loss_mlp": 0.01269638, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01256894, + "epoch": 0.4422065233729145, + "flos": 47568143940480.0, + "grad_norm": 2.32689870132585, + "language_loss": 0.65273595, + "learning_rate": 2.467089543204268e-06, + "loss": 0.72985595, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12768555, + "step": 7355, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.06452843, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01257225, + "epoch": 0.44226664662558246, + "flos": 19287045429120.0, + "grad_norm": 1.8090120162092156, + "language_loss": 0.78513968, + "learning_rate": 2.466710842823274e-06, + "loss": 0.86239338, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15307617, + "step": 7356, + "time_per_iteration": 2.5535836219787598 + }, + { + "auxiliary_loss_clip": 0.0645135, + "auxiliary_loss_mlp": 0.01270574, + "balance_loss_clip": 0.0628085, + "balance_loss_mlp": 0.01255184, + "epoch": 0.4423267698782504, + "flos": 17827604939520.0, + "grad_norm": 1.5923292427452285, + "language_loss": 0.77331412, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.85053337, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1539917, + "step": 7357, + "time_per_iteration": 2.472616195678711 + }, + { + "auxiliary_loss_clip": 0.06444242, + "auxiliary_loss_mlp": 0.0127409, + "balance_loss_clip": 0.06277513, + "balance_loss_mlp": 0.01259112, + "epoch": 0.4423868931309184, + "flos": 29211953518080.0, + "grad_norm": 1.4316006976636513, + "language_loss": 0.73656726, + "learning_rate": 2.465953388982481e-06, + "loss": 0.81375057, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14971924, + "step": 7358, + "time_per_iteration": 2.596794366836548 + }, + { + "auxiliary_loss_clip": 0.06449263, + "auxiliary_loss_mlp": 0.01268513, + "balance_loss_clip": 0.06281863, + "balance_loss_mlp": 0.01255131, + "epoch": 0.44244701638358636, + "flos": 29720399293440.0, + "grad_norm": 1.5482043588344903, + "language_loss": 0.75746959, + "learning_rate": 2.465574635551405e-06, + "loss": 0.83464736, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13378906, + "step": 7359, + "time_per_iteration": 2.565152168273926 + }, + { + "auxiliary_loss_clip": 0.06449427, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06282771, + "balance_loss_mlp": 0.01258907, + "epoch": 0.4425071396362543, + "flos": 22936715792640.0, + "grad_norm": 1.7006216058888692, + "language_loss": 0.70234901, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.77957749, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14526367, + "step": 7360, + "time_per_iteration": 3.9516735076904297 + }, + { + "auxiliary_loss_clip": 0.06450445, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06282296, + "balance_loss_mlp": 0.01262028, + "epoch": 0.4425672628889223, + "flos": 19798509951360.0, + "grad_norm": 2.334645337647824, + "language_loss": 0.69802427, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.77529514, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14599609, + "step": 7361, + "time_per_iteration": 3.9590420722961426 + }, + { + "auxiliary_loss_clip": 0.06448395, + "auxiliary_loss_mlp": 0.01271063, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01256287, + "epoch": 0.44262738614159025, + "flos": 13667266667520.0, + "grad_norm": 1.9889994262633817, + "language_loss": 0.82882756, + "learning_rate": 2.464438269387809e-06, + "loss": 0.90602213, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14770508, + "step": 7362, + "time_per_iteration": 2.4627645015716553 + }, + { + "auxiliary_loss_clip": 0.06458044, + "auxiliary_loss_mlp": 0.01274491, + "balance_loss_clip": 0.06284538, + "balance_loss_mlp": 0.01258111, + "epoch": 0.4426875093942582, + "flos": 14215474005120.0, + "grad_norm": 1.7592716332344263, + "language_loss": 0.75051332, + "learning_rate": 2.464059445424366e-06, + "loss": 0.82783866, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.16381836, + "step": 7363, + "time_per_iteration": 2.526925802230835 + }, + { + "auxiliary_loss_clip": 0.0633463, + "auxiliary_loss_mlp": 0.01256608, + "balance_loss_clip": 0.06260501, + "balance_loss_mlp": 0.01253844, + "epoch": 0.4427476326469262, + "flos": 70140100181760.0, + "grad_norm": 0.6687771463902197, + "language_loss": 0.55581295, + "learning_rate": 2.463680603863743e-06, + "loss": 0.63172531, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02767944, + "step": 7364, + "time_per_iteration": 3.2234084606170654 + }, + { + "auxiliary_loss_clip": 0.06445954, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06280937, + "balance_loss_mlp": 0.01255479, + "epoch": 0.44280775589959415, + "flos": 25451761219200.0, + "grad_norm": 6.076987981061014, + "language_loss": 0.75066888, + "learning_rate": 2.463301744720305e-06, + "loss": 0.82782239, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13928223, + "step": 7365, + "time_per_iteration": 2.606168746948242 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06282686, + "balance_loss_mlp": 0.01253724, + "epoch": 0.4428678791522621, + "flos": 22863900994560.0, + "grad_norm": 1.5120042705282817, + "language_loss": 0.74655497, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.82372636, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1473999, + "step": 7366, + "time_per_iteration": 2.5269834995269775 + }, + { + "auxiliary_loss_clip": 0.06449491, + "auxiliary_loss_mlp": 0.0127034, + "balance_loss_clip": 0.06283636, + "balance_loss_mlp": 0.01255438, + "epoch": 0.44292800240493013, + "flos": 25819608643200.0, + "grad_norm": 2.3253747528787447, + "language_loss": 0.7339704, + "learning_rate": 2.46254397374245e-06, + "loss": 0.81116873, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14904785, + "step": 7367, + "time_per_iteration": 4.017570495605469 + }, + { + "auxiliary_loss_clip": 0.06453082, + "auxiliary_loss_mlp": 0.01276023, + "balance_loss_clip": 0.06286091, + "balance_loss_mlp": 0.01260979, + "epoch": 0.4429881256575981, + "flos": 32425238217600.0, + "grad_norm": 1.584590811661976, + "language_loss": 0.73953557, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.81682664, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15057373, + "step": 7368, + "time_per_iteration": 2.6219804286956787 + }, + { + "auxiliary_loss_clip": 0.06446074, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06281151, + "balance_loss_mlp": 0.01256007, + "epoch": 0.44304824891026606, + "flos": 22170231768960.0, + "grad_norm": 1.6442785623938219, + "language_loss": 0.79845673, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.8756206, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14306641, + "step": 7369, + "time_per_iteration": 2.5048859119415283 + }, + { + "auxiliary_loss_clip": 0.06445719, + "auxiliary_loss_mlp": 0.01268056, + "balance_loss_clip": 0.0628242, + "balance_loss_mlp": 0.01253524, + "epoch": 0.443108372162934, + "flos": 25345725477120.0, + "grad_norm": 1.8080912741875748, + "language_loss": 0.72226167, + "learning_rate": 2.461407185763737e-06, + "loss": 0.79939938, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.14538574, + "step": 7370, + "time_per_iteration": 2.59167218208313 + }, + { + "auxiliary_loss_clip": 0.06444093, + "auxiliary_loss_mlp": 0.01274154, + "balance_loss_clip": 0.06279977, + "balance_loss_mlp": 0.01259741, + "epoch": 0.443168495415602, + "flos": 23337616452480.0, + "grad_norm": 2.642683672552081, + "language_loss": 0.70957971, + "learning_rate": 2.461028221425126e-06, + "loss": 0.78676224, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14428711, + "step": 7371, + "time_per_iteration": 2.5119266510009766 + }, + { + "auxiliary_loss_clip": 0.0644391, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.01255288, + "epoch": 0.44322861866826996, + "flos": 21877924400640.0, + "grad_norm": 2.5641722247612977, + "language_loss": 0.69211292, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.76923823, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13330078, + "step": 7372, + "time_per_iteration": 2.575803518295288 + }, + { + "auxiliary_loss_clip": 0.06450622, + "auxiliary_loss_mlp": 0.01273627, + "balance_loss_clip": 0.06281562, + "balance_loss_mlp": 0.01257855, + "epoch": 0.4432887419209379, + "flos": 20090649611520.0, + "grad_norm": 1.7339006835744544, + "language_loss": 0.83742619, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.91466868, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15765381, + "step": 7373, + "time_per_iteration": 4.006488084793091 + }, + { + "auxiliary_loss_clip": 0.06340961, + "auxiliary_loss_mlp": 0.01252329, + "balance_loss_clip": 0.06267951, + "balance_loss_mlp": 0.01249765, + "epoch": 0.4433488651736059, + "flos": 70056593988480.0, + "grad_norm": 0.7566866942124226, + "language_loss": 0.55204445, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.62797731, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02565002, + "step": 7374, + "time_per_iteration": 3.1780457496643066 + }, + { + "auxiliary_loss_clip": 0.06443411, + "auxiliary_loss_mlp": 0.01275671, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01260198, + "epoch": 0.44340898842627385, + "flos": 16286838462720.0, + "grad_norm": 2.3260457628480617, + "language_loss": 0.82868445, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.90587527, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.15478516, + "step": 7375, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.0644948, + "auxiliary_loss_mlp": 0.01269753, + "balance_loss_clip": 0.06282064, + "balance_loss_mlp": 0.01255388, + "epoch": 0.4434691116789418, + "flos": 16616601406080.0, + "grad_norm": 2.217281539940859, + "language_loss": 0.83904636, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.91623867, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1439209, + "step": 7376, + "time_per_iteration": 2.4960668087005615 + }, + { + "auxiliary_loss_clip": 0.06447101, + "auxiliary_loss_mlp": 0.01271986, + "balance_loss_clip": 0.06282647, + "balance_loss_mlp": 0.01257573, + "epoch": 0.4435292349316098, + "flos": 19069397648640.0, + "grad_norm": 1.7110647715019258, + "language_loss": 0.77357483, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.85076571, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.14422607, + "step": 7377, + "time_per_iteration": 2.5489466190338135 + }, + { + "auxiliary_loss_clip": 0.064443, + "auxiliary_loss_mlp": 0.01269165, + "balance_loss_clip": 0.06284986, + "balance_loss_mlp": 0.01255396, + "epoch": 0.44358935818427775, + "flos": 21257656951680.0, + "grad_norm": 1.7746716431943175, + "language_loss": 0.75928617, + "learning_rate": 2.458374982357057e-06, + "loss": 0.83642089, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13763428, + "step": 7378, + "time_per_iteration": 2.498782157897949 + }, + { + "auxiliary_loss_clip": 0.06446375, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06281648, + "balance_loss_mlp": 0.01255106, + "epoch": 0.4436494814369457, + "flos": 12500259327360.0, + "grad_norm": 1.8740687903376234, + "language_loss": 0.69627756, + "learning_rate": 2.457995878562982e-06, + "loss": 0.77344066, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.14825439, + "step": 7379, + "time_per_iteration": 2.5212602615356445 + }, + { + "auxiliary_loss_clip": 0.0645185, + "auxiliary_loss_mlp": 0.01266938, + "balance_loss_clip": 0.0628576, + "balance_loss_mlp": 0.01252556, + "epoch": 0.44370960468961373, + "flos": 23666666636160.0, + "grad_norm": 2.508566876625721, + "language_loss": 0.73565447, + "learning_rate": 2.457616757401656e-06, + "loss": 0.81284231, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1439209, + "step": 7380, + "time_per_iteration": 2.500859260559082 + }, + { + "auxiliary_loss_clip": 0.06449685, + "auxiliary_loss_mlp": 0.01268804, + "balance_loss_clip": 0.06285541, + "balance_loss_mlp": 0.01255452, + "epoch": 0.4437697279422817, + "flos": 32425196290560.0, + "grad_norm": 1.7107220322970214, + "language_loss": 0.65104783, + "learning_rate": 2.457237618887458e-06, + "loss": 0.72823262, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13336182, + "step": 7381, + "time_per_iteration": 2.618229627609253 + }, + { + "auxiliary_loss_clip": 0.06454551, + "auxiliary_loss_mlp": 0.01272971, + "balance_loss_clip": 0.06288015, + "balance_loss_mlp": 0.01258773, + "epoch": 0.44382985119494966, + "flos": 18118570642560.0, + "grad_norm": 2.331874867497661, + "language_loss": 0.80543017, + "learning_rate": 2.456858463034763e-06, + "loss": 0.88270545, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14190674, + "step": 7382, + "time_per_iteration": 2.4738404750823975 + }, + { + "auxiliary_loss_clip": 0.06452931, + "auxiliary_loss_mlp": 0.01272481, + "balance_loss_clip": 0.06287742, + "balance_loss_mlp": 0.01258486, + "epoch": 0.44388997444761763, + "flos": 30782083651200.0, + "grad_norm": 1.5922456749371714, + "language_loss": 0.65226638, + "learning_rate": 2.456479289857949e-06, + "loss": 0.72952044, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13983154, + "step": 7383, + "time_per_iteration": 2.614912986755371 + }, + { + "auxiliary_loss_clip": 0.0645685, + "auxiliary_loss_mlp": 0.01272667, + "balance_loss_clip": 0.0628838, + "balance_loss_mlp": 0.01258088, + "epoch": 0.4439500977002856, + "flos": 20345333696640.0, + "grad_norm": 2.064556949518224, + "language_loss": 0.76699257, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.84428775, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14587402, + "step": 7384, + "time_per_iteration": 2.4842731952667236 + }, + { + "auxiliary_loss_clip": 0.06456664, + "auxiliary_loss_mlp": 0.012692, + "balance_loss_clip": 0.06288753, + "balance_loss_mlp": 0.01254442, + "epoch": 0.44401022095295356, + "flos": 20376667923840.0, + "grad_norm": 2.2924078267975605, + "language_loss": 0.80810666, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.88536537, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14758301, + "step": 7385, + "time_per_iteration": 2.5268380641937256 + }, + { + "auxiliary_loss_clip": 0.0645503, + "auxiliary_loss_mlp": 0.01272748, + "balance_loss_clip": 0.06290472, + "balance_loss_mlp": 0.01257013, + "epoch": 0.4440703442056215, + "flos": 20236950040320.0, + "grad_norm": 1.6897241264536553, + "language_loss": 0.82179439, + "learning_rate": 2.455341666526582e-06, + "loss": 0.89907217, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.15734863, + "step": 7386, + "time_per_iteration": 2.497891426086426 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01273049, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4441304674582895, + "flos": 39504163979520.0, + "grad_norm": 2.9557468241194624, + "language_loss": 0.70275033, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.78011411, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15356445, + "step": 7387, + "time_per_iteration": 2.6782705783843994 + }, + { + "auxiliary_loss_clip": 0.06455649, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06289866, + "balance_loss_mlp": 0.01258206, + "epoch": 0.44419059071095746, + "flos": 14834902913280.0, + "grad_norm": 1.9684531060003607, + "language_loss": 0.72165161, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.79893732, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14715576, + "step": 7388, + "time_per_iteration": 2.5119476318359375 + }, + { + "auxiliary_loss_clip": 0.06464041, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06293739, + "balance_loss_mlp": 0.01255113, + "epoch": 0.4442507139636254, + "flos": 22644408424320.0, + "grad_norm": 1.566920019209845, + "language_loss": 0.69646138, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.77380753, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15454102, + "step": 7389, + "time_per_iteration": 2.671290874481201 + }, + { + "auxiliary_loss_clip": 0.06455444, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289597, + "balance_loss_mlp": 0.01255487, + "epoch": 0.4443108372162934, + "flos": 38299994553600.0, + "grad_norm": 1.918848783354648, + "language_loss": 0.74912727, + "learning_rate": 2.453824593752788e-06, + "loss": 0.82637799, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14135742, + "step": 7390, + "time_per_iteration": 2.6656923294067383 + }, + { + "auxiliary_loss_clip": 0.06453501, + "auxiliary_loss_mlp": 0.01269903, + "balance_loss_clip": 0.06290193, + "balance_loss_mlp": 0.0125657, + "epoch": 0.44437096046896135, + "flos": 17754790141440.0, + "grad_norm": 1.7902511429273704, + "language_loss": 0.82203722, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.89927119, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13323975, + "step": 7391, + "time_per_iteration": 2.5425097942352295 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06289234, + "balance_loss_mlp": 0.01254547, + "epoch": 0.4444310837216293, + "flos": 13736936937600.0, + "grad_norm": 1.5949305897923123, + "language_loss": 0.73880637, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.81601214, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14044189, + "step": 7392, + "time_per_iteration": 2.509695053100586 + }, + { + "auxiliary_loss_clip": 0.06450866, + "auxiliary_loss_mlp": 0.01269173, + "balance_loss_clip": 0.06287552, + "balance_loss_mlp": 0.01256424, + "epoch": 0.44449120697429734, + "flos": 25017346126080.0, + "grad_norm": 1.7319744549950544, + "language_loss": 0.79953551, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.87673593, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12744141, + "step": 7393, + "time_per_iteration": 2.6058006286621094 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06291801, + "balance_loss_mlp": 0.01255276, + "epoch": 0.4445513302269653, + "flos": 32680006156800.0, + "grad_norm": 1.76893741086752, + "language_loss": 0.8113097, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.88862437, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15551758, + "step": 7394, + "time_per_iteration": 2.6408586502075195 + }, + { + "auxiliary_loss_clip": 0.06448914, + "auxiliary_loss_mlp": 0.01267892, + "balance_loss_clip": 0.06286056, + "balance_loss_mlp": 0.01254796, + "epoch": 0.44461145347963327, + "flos": 11660583162240.0, + "grad_norm": 2.0227503675909646, + "language_loss": 0.79471397, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.87188208, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13092041, + "step": 7395, + "time_per_iteration": 2.482771158218384 + }, + { + "auxiliary_loss_clip": 0.06457528, + "auxiliary_loss_mlp": 0.01269923, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01255838, + "epoch": 0.44467157673230123, + "flos": 20893079836800.0, + "grad_norm": 1.8465254869377097, + "language_loss": 0.68925393, + "learning_rate": 2.451548468607584e-06, + "loss": 0.76652849, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14074707, + "step": 7396, + "time_per_iteration": 2.526031017303467 + }, + { + "auxiliary_loss_clip": 0.06458125, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06290217, + "balance_loss_mlp": 0.0125299, + "epoch": 0.4447316999849692, + "flos": 18551140945920.0, + "grad_norm": 2.1703937468753964, + "language_loss": 0.80956584, + "learning_rate": 2.451169054403126e-06, + "loss": 0.88681042, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13342285, + "step": 7397, + "time_per_iteration": 2.482004404067993 + }, + { + "auxiliary_loss_clip": 0.06453413, + "auxiliary_loss_mlp": 0.01269867, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01256814, + "epoch": 0.44479182323763716, + "flos": 23775846906240.0, + "grad_norm": 2.7975733901761672, + "language_loss": 0.67842102, + "learning_rate": 2.450789623090293e-06, + "loss": 0.75565386, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13067627, + "step": 7398, + "time_per_iteration": 2.579227924346924 + }, + { + "auxiliary_loss_clip": 0.06451767, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06290427, + "balance_loss_mlp": 0.01256097, + "epoch": 0.44485194649030513, + "flos": 16549237123200.0, + "grad_norm": 1.6886298033370946, + "language_loss": 0.70454216, + "learning_rate": 2.450410174683472e-06, + "loss": 0.78174973, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12896729, + "step": 7399, + "time_per_iteration": 2.491422653198242 + }, + { + "auxiliary_loss_clip": 0.06448349, + "auxiliary_loss_mlp": 0.01267519, + "balance_loss_clip": 0.06287403, + "balance_loss_mlp": 0.01254543, + "epoch": 0.4449120697429731, + "flos": 22607455973760.0, + "grad_norm": 1.7365156462421643, + "language_loss": 0.72588718, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.80304587, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12963867, + "step": 7400, + "time_per_iteration": 3.9914138317108154 + }, + { + "auxiliary_loss_clip": 0.06451382, + "auxiliary_loss_mlp": 0.01270619, + "balance_loss_clip": 0.06288703, + "balance_loss_mlp": 0.0125738, + "epoch": 0.44497219299564106, + "flos": 20009994456960.0, + "grad_norm": 1.5547932465186114, + "language_loss": 0.85223019, + "learning_rate": 2.449651226645422e-06, + "loss": 0.92945021, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13250732, + "step": 7401, + "time_per_iteration": 3.972844123840332 + }, + { + "auxiliary_loss_clip": 0.0644277, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01254099, + "epoch": 0.445032316248309, + "flos": 25601499665280.0, + "grad_norm": 1.7738805367720483, + "language_loss": 0.8345179, + "learning_rate": 2.449271727042973e-06, + "loss": 0.91160637, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.11968994, + "step": 7402, + "time_per_iteration": 2.546557664871216 + }, + { + "auxiliary_loss_clip": 0.06449325, + "auxiliary_loss_mlp": 0.0126916, + "balance_loss_clip": 0.06285563, + "balance_loss_mlp": 0.01255898, + "epoch": 0.445092439500977, + "flos": 21256608775680.0, + "grad_norm": 1.6765614973905527, + "language_loss": 0.77230763, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.84949255, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13275146, + "step": 7403, + "time_per_iteration": 2.540351152420044 + }, + { + "auxiliary_loss_clip": 0.06362203, + "auxiliary_loss_mlp": 0.01255762, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01252394, + "epoch": 0.44515256275364495, + "flos": 57781990506240.0, + "grad_norm": 0.751382178532419, + "language_loss": 0.60078514, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.67696476, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.03375244, + "step": 7404, + "time_per_iteration": 3.1188013553619385 + }, + { + "auxiliary_loss_clip": 0.06455964, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06287853, + "balance_loss_mlp": 0.01258462, + "epoch": 0.4452126860063129, + "flos": 15601386936960.0, + "grad_norm": 1.4877710129276585, + "language_loss": 0.82279229, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.90007967, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14312744, + "step": 7405, + "time_per_iteration": 2.5388095378875732 + }, + { + "auxiliary_loss_clip": 0.06447265, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06283686, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4452728092589809, + "flos": 21623995002240.0, + "grad_norm": 1.5786988713847923, + "language_loss": 0.75529754, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.83244896, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.12744141, + "step": 7406, + "time_per_iteration": 2.5249385833740234 + }, + { + "auxiliary_loss_clip": 0.06440533, + "auxiliary_loss_mlp": 0.01271164, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259094, + "epoch": 0.4453329325116489, + "flos": 29505267135360.0, + "grad_norm": 1.6524917293298949, + "language_loss": 0.65847838, + "learning_rate": 2.447373973772129e-06, + "loss": 0.73559535, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12060547, + "step": 7407, + "time_per_iteration": 3.998326063156128 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06284529, + "balance_loss_mlp": 0.01256777, + "epoch": 0.44539305576431687, + "flos": 21367549981440.0, + "grad_norm": 1.547450204556426, + "language_loss": 0.68216872, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.75936574, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13018799, + "step": 7408, + "time_per_iteration": 2.5295586585998535 + }, + { + "auxiliary_loss_clip": 0.06449315, + "auxiliary_loss_mlp": 0.01269644, + "balance_loss_clip": 0.06285807, + "balance_loss_mlp": 0.01256508, + "epoch": 0.44545317901698483, + "flos": 41437278000000.0, + "grad_norm": 2.0427525389439443, + "language_loss": 0.720608, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.79779756, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13134766, + "step": 7409, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06448312, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06284307, + "balance_loss_mlp": 0.01257045, + "epoch": 0.4455133022696528, + "flos": 22061638477440.0, + "grad_norm": 1.7184461657241017, + "language_loss": 0.65940762, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.73659933, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13818359, + "step": 7410, + "time_per_iteration": 2.5486950874328613 + }, + { + "auxiliary_loss_clip": 0.06453686, + "auxiliary_loss_mlp": 0.01268565, + "balance_loss_clip": 0.06284985, + "balance_loss_mlp": 0.0125522, + "epoch": 0.44557342552232077, + "flos": 23483665319040.0, + "grad_norm": 3.696220183147237, + "language_loss": 0.74690163, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.82412422, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13360596, + "step": 7411, + "time_per_iteration": 2.5290050506591797 + }, + { + "auxiliary_loss_clip": 0.0644176, + "auxiliary_loss_mlp": 0.01268016, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256166, + "epoch": 0.44563354877498873, + "flos": 19140577292160.0, + "grad_norm": 2.065063291172047, + "language_loss": 0.7906481, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.86774588, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11859131, + "step": 7412, + "time_per_iteration": 2.5156190395355225 + }, + { + "auxiliary_loss_clip": 0.0645022, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.06282784, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4456936720276567, + "flos": 13625744169600.0, + "grad_norm": 2.15802472542835, + "language_loss": 0.80199099, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.87918305, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13195801, + "step": 7413, + "time_per_iteration": 3.9694504737854004 + }, + { + "auxiliary_loss_clip": 0.06443125, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0628258, + "balance_loss_mlp": 0.01254037, + "epoch": 0.44575379528032466, + "flos": 14717840359680.0, + "grad_norm": 1.9357576200238034, + "language_loss": 0.76531088, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.8424021, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.11956787, + "step": 7414, + "time_per_iteration": 2.515110731124878 + }, + { + "auxiliary_loss_clip": 0.06447163, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01257177, + "epoch": 0.4458139185329926, + "flos": 24177586106880.0, + "grad_norm": 1.4166090983539044, + "language_loss": 0.84000552, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.91717345, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12457275, + "step": 7415, + "time_per_iteration": 2.528939723968506 + }, + { + "auxiliary_loss_clip": 0.06442896, + "auxiliary_loss_mlp": 0.01267494, + "balance_loss_clip": 0.06282021, + "balance_loss_mlp": 0.01254733, + "epoch": 0.4458740417856606, + "flos": 21768660276480.0, + "grad_norm": 1.9578275078246672, + "language_loss": 0.84485269, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.92195654, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12756348, + "step": 7416, + "time_per_iteration": 2.57027268409729 + }, + { + "auxiliary_loss_clip": 0.06451635, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06285699, + "balance_loss_mlp": 0.01255298, + "epoch": 0.44593416503832856, + "flos": 21075074904960.0, + "grad_norm": 1.7085615846271827, + "language_loss": 0.81362593, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.89082199, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.12670898, + "step": 7417, + "time_per_iteration": 2.547837734222412 + }, + { + "auxiliary_loss_clip": 0.06448114, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06283562, + "balance_loss_mlp": 0.01255601, + "epoch": 0.4459942882909965, + "flos": 22606910922240.0, + "grad_norm": 1.8801354401717048, + "language_loss": 0.81286234, + "learning_rate": 2.443197426237077e-06, + "loss": 0.89001989, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.12042236, + "step": 7418, + "time_per_iteration": 2.5529236793518066 + }, + { + "auxiliary_loss_clip": 0.06449951, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.06284475, + "balance_loss_mlp": 0.01255652, + "epoch": 0.4460544115436645, + "flos": 26512732817280.0, + "grad_norm": 1.8068813549808598, + "language_loss": 0.77866399, + "learning_rate": 2.442817638972991e-06, + "loss": 0.85584641, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.12646484, + "step": 7419, + "time_per_iteration": 2.637568235397339 + }, + { + "auxiliary_loss_clip": 0.06446308, + "auxiliary_loss_mlp": 0.01271146, + "balance_loss_clip": 0.06283416, + "balance_loss_mlp": 0.01258349, + "epoch": 0.4461145347963325, + "flos": 17609957159040.0, + "grad_norm": 3.5469346323262068, + "language_loss": 0.73053217, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.80770659, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12805176, + "step": 7420, + "time_per_iteration": 2.4839932918548584 + }, + { + "auxiliary_loss_clip": 0.06441851, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06283888, + "balance_loss_mlp": 0.01255176, + "epoch": 0.44617465804900047, + "flos": 27274982209920.0, + "grad_norm": 1.4177043979342248, + "language_loss": 0.75314558, + "learning_rate": 2.442058014084156e-06, + "loss": 0.83024418, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12823486, + "step": 7421, + "time_per_iteration": 2.6001040935516357 + }, + { + "auxiliary_loss_clip": 0.06439819, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06281345, + "balance_loss_mlp": 0.01254073, + "epoch": 0.44623478130166844, + "flos": 17792371497600.0, + "grad_norm": 1.9155365450665858, + "language_loss": 0.75864565, + "learning_rate": 2.44167817648821e-06, + "loss": 0.83570993, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12536621, + "step": 7422, + "time_per_iteration": 2.481241226196289 + }, + { + "auxiliary_loss_clip": 0.06447253, + "auxiliary_loss_mlp": 0.01267362, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01254804, + "epoch": 0.4462949045543364, + "flos": 23009698298880.0, + "grad_norm": 1.7347835392128452, + "language_loss": 0.65679651, + "learning_rate": 2.441298322143784e-06, + "loss": 0.73394263, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.12573242, + "step": 7423, + "time_per_iteration": 2.539268732070923 + }, + { + "auxiliary_loss_clip": 0.06440745, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06283564, + "balance_loss_mlp": 0.01256591, + "epoch": 0.44635502780700437, + "flos": 17825592441600.0, + "grad_norm": 1.4381231336851048, + "language_loss": 0.79473054, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.87182289, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.11901855, + "step": 7424, + "time_per_iteration": 2.488111972808838 + }, + { + "auxiliary_loss_clip": 0.06437074, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01256148, + "epoch": 0.44641515105967233, + "flos": 26695314864000.0, + "grad_norm": 1.3471148592694158, + "language_loss": 0.8055563, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.88260639, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.11791992, + "step": 7425, + "time_per_iteration": 2.598731756210327 + }, + { + "auxiliary_loss_clip": 0.06439465, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06279327, + "balance_loss_mlp": 0.01259536, + "epoch": 0.4464752743123403, + "flos": 18918778734720.0, + "grad_norm": 1.4143607287110962, + "language_loss": 0.77488291, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.85199511, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12207031, + "step": 7426, + "time_per_iteration": 2.494330406188965 + }, + { + "auxiliary_loss_clip": 0.06445856, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06281333, + "balance_loss_mlp": 0.01253773, + "epoch": 0.44653539756500826, + "flos": 29578081933440.0, + "grad_norm": 1.9924998088803147, + "language_loss": 0.64776599, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.72489762, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13513184, + "step": 7427, + "time_per_iteration": 2.611482858657837 + }, + { + "auxiliary_loss_clip": 0.06441574, + "auxiliary_loss_mlp": 0.01275968, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.0126372, + "epoch": 0.44659552081767623, + "flos": 21475137024000.0, + "grad_norm": 1.5780428941103348, + "language_loss": 0.75530696, + "learning_rate": 2.439398799698608e-06, + "loss": 0.8324824, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12268066, + "step": 7428, + "time_per_iteration": 2.505094051361084 + }, + { + "auxiliary_loss_clip": 0.06441561, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.0125843, + "epoch": 0.4466556440703442, + "flos": 17937791458560.0, + "grad_norm": 1.912744298925221, + "language_loss": 0.78478271, + "learning_rate": 2.439018845165806e-06, + "loss": 0.86190987, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12731934, + "step": 7429, + "time_per_iteration": 2.5107972621917725 + }, + { + "auxiliary_loss_clip": 0.06447433, + "auxiliary_loss_mlp": 0.0127403, + "balance_loss_clip": 0.06283738, + "balance_loss_mlp": 0.01260667, + "epoch": 0.44671576732301216, + "flos": 21114081780480.0, + "grad_norm": 1.7694096542013318, + "language_loss": 0.91354167, + "learning_rate": 2.438638873985366e-06, + "loss": 0.99075633, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13366699, + "step": 7430, + "time_per_iteration": 2.537428140640259 + }, + { + "auxiliary_loss_clip": 0.06451312, + "auxiliary_loss_mlp": 0.01271269, + "balance_loss_clip": 0.06282946, + "balance_loss_mlp": 0.01257792, + "epoch": 0.4467758905756801, + "flos": 23514873765120.0, + "grad_norm": 1.610238873942938, + "language_loss": 0.80143106, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.87865686, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1348877, + "step": 7431, + "time_per_iteration": 2.5611300468444824 + }, + { + "auxiliary_loss_clip": 0.06447126, + "auxiliary_loss_mlp": 0.01271916, + "balance_loss_clip": 0.06282945, + "balance_loss_mlp": 0.01258374, + "epoch": 0.4468360138283481, + "flos": 18739970121600.0, + "grad_norm": 1.9551980798487134, + "language_loss": 0.80273902, + "learning_rate": 2.437878881739204e-06, + "loss": 0.87992942, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13543701, + "step": 7432, + "time_per_iteration": 2.500554084777832 + }, + { + "auxiliary_loss_clip": 0.06450094, + "auxiliary_loss_mlp": 0.01273992, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.0126073, + "epoch": 0.4468961370810161, + "flos": 23483874954240.0, + "grad_norm": 1.835454334349629, + "language_loss": 0.76644909, + "learning_rate": 2.437498860702301e-06, + "loss": 0.84368992, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13269043, + "step": 7433, + "time_per_iteration": 2.5840916633605957 + }, + { + "auxiliary_loss_clip": 0.06435596, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06279343, + "balance_loss_mlp": 0.01260047, + "epoch": 0.4469562603336841, + "flos": 30081873807360.0, + "grad_norm": 1.6012992804544768, + "language_loss": 0.77581275, + "learning_rate": 2.437118823075398e-06, + "loss": 0.85288417, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1151123, + "step": 7434, + "time_per_iteration": 2.579667329788208 + }, + { + "auxiliary_loss_clip": 0.06443198, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278063, + "balance_loss_mlp": 0.01257439, + "epoch": 0.44701638358635204, + "flos": 22463126115840.0, + "grad_norm": 1.683412458990524, + "language_loss": 0.63887638, + "learning_rate": 2.436738768872905e-06, + "loss": 0.71601021, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.12750244, + "step": 7435, + "time_per_iteration": 2.5773611068725586 + }, + { + "auxiliary_loss_clip": 0.06444404, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01258714, + "epoch": 0.44707650683902, + "flos": 24064171205760.0, + "grad_norm": 1.5617494879233198, + "language_loss": 0.83911443, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.91628319, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13763428, + "step": 7436, + "time_per_iteration": 2.5204451084136963 + }, + { + "auxiliary_loss_clip": 0.0644998, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.0628316, + "balance_loss_mlp": 0.01254226, + "epoch": 0.44713663009168797, + "flos": 23773373210880.0, + "grad_norm": 1.7812959316100008, + "language_loss": 0.79632622, + "learning_rate": 2.435978610798798e-06, + "loss": 0.87351644, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.14819336, + "step": 7437, + "time_per_iteration": 2.564180374145508 + }, + { + "auxiliary_loss_clip": 0.0644551, + "auxiliary_loss_mlp": 0.01269936, + "balance_loss_clip": 0.06279416, + "balance_loss_mlp": 0.01256829, + "epoch": 0.44719675334435594, + "flos": 24506258947200.0, + "grad_norm": 1.814975751419929, + "language_loss": 0.72632974, + "learning_rate": 2.435598506956009e-06, + "loss": 0.8034842, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13116455, + "step": 7438, + "time_per_iteration": 2.601855993270874 + }, + { + "auxiliary_loss_clip": 0.06445266, + "auxiliary_loss_mlp": 0.01270946, + "balance_loss_clip": 0.06279082, + "balance_loss_mlp": 0.01257046, + "epoch": 0.4472568765970239, + "flos": 29788308627840.0, + "grad_norm": 3.3026679320519716, + "language_loss": 0.67660618, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.75376832, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13903809, + "step": 7439, + "time_per_iteration": 2.6503498554229736 + }, + { + "auxiliary_loss_clip": 0.06447087, + "auxiliary_loss_mlp": 0.01272251, + "balance_loss_clip": 0.06280239, + "balance_loss_mlp": 0.01257648, + "epoch": 0.44731699984969187, + "flos": 24649792191360.0, + "grad_norm": 1.6003212894552636, + "language_loss": 0.73896551, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.81615895, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14605713, + "step": 7440, + "time_per_iteration": 4.026291608810425 + }, + { + "auxiliary_loss_clip": 0.06441355, + "auxiliary_loss_mlp": 0.01270172, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.0125722, + "epoch": 0.44737712310235983, + "flos": 29462570680320.0, + "grad_norm": 1.5530123963175664, + "language_loss": 0.74356592, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.82068115, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12963867, + "step": 7441, + "time_per_iteration": 2.5968191623687744 + }, + { + "auxiliary_loss_clip": 0.06443278, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01254983, + "epoch": 0.4474372463550278, + "flos": 24903260392320.0, + "grad_norm": 2.4580446492601014, + "language_loss": 0.75523049, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.83234674, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13378906, + "step": 7442, + "time_per_iteration": 2.6050899028778076 + }, + { + "auxiliary_loss_clip": 0.0645077, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06281515, + "balance_loss_mlp": 0.01262644, + "epoch": 0.44749736960769576, + "flos": 33189835524480.0, + "grad_norm": 1.8304580376547321, + "language_loss": 0.74504036, + "learning_rate": 2.433697740261273e-06, + "loss": 0.82231408, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13952637, + "step": 7443, + "time_per_iteration": 2.590211868286133 + }, + { + "auxiliary_loss_clip": 0.06441949, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06278961, + "balance_loss_mlp": 0.01256605, + "epoch": 0.4475574928603637, + "flos": 21078596776320.0, + "grad_norm": 1.7164366382085705, + "language_loss": 0.78287792, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.86000234, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13891602, + "step": 7444, + "time_per_iteration": 2.554215669631958 + }, + { + "auxiliary_loss_clip": 0.06437638, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06276217, + "balance_loss_mlp": 0.01263664, + "epoch": 0.4476176161130317, + "flos": 21867442640640.0, + "grad_norm": 2.3488437532538735, + "language_loss": 0.85014707, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.9272933, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13317871, + "step": 7445, + "time_per_iteration": 2.463123321533203 + }, + { + "auxiliary_loss_clip": 0.06446601, + "auxiliary_loss_mlp": 0.0127394, + "balance_loss_clip": 0.06279677, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4476777393656997, + "flos": 22535270081280.0, + "grad_norm": 2.2137135091267135, + "language_loss": 0.64567178, + "learning_rate": 2.432557082778765e-06, + "loss": 0.72287714, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15631104, + "step": 7446, + "time_per_iteration": 3.9910571575164795 + }, + { + "auxiliary_loss_clip": 0.06349403, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4477378626183677, + "flos": 49034236101120.0, + "grad_norm": 0.7348354325841562, + "language_loss": 0.49922079, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.57527786, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.0231781, + "step": 7447, + "time_per_iteration": 3.0209667682647705 + }, + { + "auxiliary_loss_clip": 0.06344398, + "auxiliary_loss_mlp": 0.01262514, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01260019, + "epoch": 0.44779798587103564, + "flos": 56562041784960.0, + "grad_norm": 0.8026230684928909, + "language_loss": 0.59334445, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.66941357, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.02493286, + "step": 7448, + "time_per_iteration": 3.2380871772766113 + }, + { + "auxiliary_loss_clip": 0.06443155, + "auxiliary_loss_mlp": 0.01270524, + "balance_loss_clip": 0.06277426, + "balance_loss_mlp": 0.01256994, + "epoch": 0.4478581091237036, + "flos": 46508933278080.0, + "grad_norm": 1.7384627548967189, + "language_loss": 0.59131092, + "learning_rate": 2.431416277672789e-06, + "loss": 0.66844773, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13537598, + "step": 7449, + "time_per_iteration": 2.7783467769622803 + }, + { + "auxiliary_loss_clip": 0.06440828, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06277853, + "balance_loss_mlp": 0.01258868, + "epoch": 0.4479182323763716, + "flos": 20820768163200.0, + "grad_norm": 1.956040680672474, + "language_loss": 0.81008971, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.88721895, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13220215, + "step": 7450, + "time_per_iteration": 2.488323450088501 + }, + { + "auxiliary_loss_clip": 0.06442301, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06277788, + "balance_loss_mlp": 0.01259172, + "epoch": 0.44797835562903954, + "flos": 14251126717440.0, + "grad_norm": 2.5451576111358136, + "language_loss": 0.79348361, + "learning_rate": 2.430655659114697e-06, + "loss": 0.87063718, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13885498, + "step": 7451, + "time_per_iteration": 2.4923946857452393 + }, + { + "auxiliary_loss_clip": 0.06344576, + "auxiliary_loss_mlp": 0.0125349, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.0125126, + "epoch": 0.4480384788817075, + "flos": 63553436357760.0, + "grad_norm": 0.7850742570611701, + "language_loss": 0.62791413, + "learning_rate": 2.430275325332681e-06, + "loss": 0.70389479, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02233887, + "step": 7452, + "time_per_iteration": 3.2259254455566406 + }, + { + "auxiliary_loss_clip": 0.06441975, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06277539, + "balance_loss_mlp": 0.01258874, + "epoch": 0.44809860213437547, + "flos": 21659018808960.0, + "grad_norm": 1.8053672901244522, + "language_loss": 0.62585479, + "learning_rate": 2.429894975234582e-06, + "loss": 0.70299876, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13537598, + "step": 7453, + "time_per_iteration": 3.928234577178955 + }, + { + "auxiliary_loss_clip": 0.06345223, + "auxiliary_loss_mlp": 0.01256622, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254279, + "epoch": 0.44815872538704343, + "flos": 69210586840320.0, + "grad_norm": 0.747363028090033, + "language_loss": 0.5699693, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.64598775, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02339172, + "step": 7454, + "time_per_iteration": 3.0569918155670166 + }, + { + "auxiliary_loss_clip": 0.06447325, + "auxiliary_loss_mlp": 0.01268938, + "balance_loss_clip": 0.06281178, + "balance_loss_mlp": 0.01255705, + "epoch": 0.4482188486397114, + "flos": 12602186219520.0, + "grad_norm": 1.9501180256269237, + "language_loss": 0.75448847, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.83165109, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13220215, + "step": 7455, + "time_per_iteration": 2.4410433769226074 + }, + { + "auxiliary_loss_clip": 0.06442874, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06278916, + "balance_loss_mlp": 0.01254932, + "epoch": 0.44827897189237936, + "flos": 34066715702400.0, + "grad_norm": 1.6532992970231903, + "language_loss": 0.76341856, + "learning_rate": 2.428753827188016e-06, + "loss": 0.84053606, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1394043, + "step": 7456, + "time_per_iteration": 2.6695046424865723 + }, + { + "auxiliary_loss_clip": 0.06443818, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06283055, + "balance_loss_mlp": 0.01261087, + "epoch": 0.44833909514504733, + "flos": 25153080940800.0, + "grad_norm": 1.8332154029673087, + "language_loss": 0.7703625, + "learning_rate": 2.428373411969818e-06, + "loss": 0.84754294, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13122559, + "step": 7457, + "time_per_iteration": 2.4982032775878906 + }, + { + "auxiliary_loss_clip": 0.06449621, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06282188, + "balance_loss_mlp": 0.01253269, + "epoch": 0.4483992183977153, + "flos": 16185498549120.0, + "grad_norm": 2.4281328609676254, + "language_loss": 0.68744391, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.7646122, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1394043, + "step": 7458, + "time_per_iteration": 2.4979610443115234 + }, + { + "auxiliary_loss_clip": 0.06448827, + "auxiliary_loss_mlp": 0.01274875, + "balance_loss_clip": 0.06280437, + "balance_loss_mlp": 0.01259592, + "epoch": 0.44845934165038326, + "flos": 17751352124160.0, + "grad_norm": 1.539492966179865, + "language_loss": 0.71756333, + "learning_rate": 2.427612532815961e-06, + "loss": 0.79480034, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.15283203, + "step": 7459, + "time_per_iteration": 2.482675075531006 + }, + { + "auxiliary_loss_clip": 0.06445904, + "auxiliary_loss_mlp": 0.01268873, + "balance_loss_clip": 0.06281781, + "balance_loss_mlp": 0.01255343, + "epoch": 0.4485194649030513, + "flos": 21842481323520.0, + "grad_norm": 1.7620296739852843, + "language_loss": 0.69945031, + "learning_rate": 2.427232068909154e-06, + "loss": 0.7765981, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13525391, + "step": 7460, + "time_per_iteration": 2.548891067504883 + }, + { + "auxiliary_loss_clip": 0.06446661, + "auxiliary_loss_mlp": 0.01267799, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01253744, + "epoch": 0.44857958815571924, + "flos": 20090775392640.0, + "grad_norm": 2.1567039258492637, + "language_loss": 0.77558124, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.85272586, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14068604, + "step": 7461, + "time_per_iteration": 2.488675832748413 + }, + { + "auxiliary_loss_clip": 0.0644468, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.0627977, + "balance_loss_mlp": 0.01252514, + "epoch": 0.4486397114083872, + "flos": 27060982081920.0, + "grad_norm": 1.6449935173844783, + "language_loss": 0.68081152, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.75792718, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14367676, + "step": 7462, + "time_per_iteration": 2.5873477458953857 + }, + { + "auxiliary_loss_clip": 0.06346884, + "auxiliary_loss_mlp": 0.01259781, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01257521, + "epoch": 0.4486998346610552, + "flos": 67339386587520.0, + "grad_norm": 0.7371865357722727, + "language_loss": 0.54459572, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.62066233, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.0226593, + "step": 7463, + "time_per_iteration": 3.135831594467163 + }, + { + "auxiliary_loss_clip": 0.06446455, + "auxiliary_loss_mlp": 0.01271071, + "balance_loss_clip": 0.06283797, + "balance_loss_mlp": 0.01257344, + "epoch": 0.44875995791372314, + "flos": 27644297080320.0, + "grad_norm": 1.768714620285087, + "language_loss": 0.76698768, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.844163, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13726807, + "step": 7464, + "time_per_iteration": 2.5624353885650635 + }, + { + "auxiliary_loss_clip": 0.06442145, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06281784, + "balance_loss_mlp": 0.01257063, + "epoch": 0.4488200811663911, + "flos": 13010969162880.0, + "grad_norm": 1.8955897931068166, + "language_loss": 0.74468267, + "learning_rate": 2.425329506653441e-06, + "loss": 0.82180536, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.13043213, + "step": 7465, + "time_per_iteration": 2.4702823162078857 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272918, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01257391, + "epoch": 0.44888020441905907, + "flos": 27497283891840.0, + "grad_norm": 2.0464026275546314, + "language_loss": 0.80248308, + "learning_rate": 2.424948945758966e-06, + "loss": 0.87977397, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1552124, + "step": 7466, + "time_per_iteration": 2.542721748352051 + }, + { + "auxiliary_loss_clip": 0.06448439, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01255735, + "epoch": 0.44894032767172704, + "flos": 18265541904000.0, + "grad_norm": 2.2890338528416416, + "language_loss": 0.80875736, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.88593197, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13293457, + "step": 7467, + "time_per_iteration": 2.4503378868103027 + }, + { + "auxiliary_loss_clip": 0.06442044, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.06284908, + "balance_loss_mlp": 0.01256465, + "epoch": 0.449000450924395, + "flos": 21586245937920.0, + "grad_norm": 2.2421166338055762, + "language_loss": 0.75738609, + "learning_rate": 2.424187775642129e-06, + "loss": 0.83449709, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12597656, + "step": 7468, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.06448267, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06286301, + "balance_loss_mlp": 0.01257993, + "epoch": 0.44906057417706297, + "flos": 17973737660160.0, + "grad_norm": 2.1198815882874626, + "language_loss": 0.71292973, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.79011655, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.12414551, + "step": 7469, + "time_per_iteration": 2.4725160598754883 + }, + { + "auxiliary_loss_clip": 0.06450349, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.0125427, + "epoch": 0.44912069742973093, + "flos": 20053487525760.0, + "grad_norm": 1.6969020049584582, + "language_loss": 0.7254343, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.80261958, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13903809, + "step": 7470, + "time_per_iteration": 2.5212604999542236 + }, + { + "auxiliary_loss_clip": 0.06447989, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.0628368, + "balance_loss_mlp": 0.01255951, + "epoch": 0.4491808206823989, + "flos": 21040009171200.0, + "grad_norm": 2.607168963621531, + "language_loss": 0.77266711, + "learning_rate": 2.423045899863634e-06, + "loss": 0.84984034, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13397217, + "step": 7471, + "time_per_iteration": 2.4833462238311768 + }, + { + "auxiliary_loss_clip": 0.0644739, + "auxiliary_loss_mlp": 0.01274961, + "balance_loss_clip": 0.06286953, + "balance_loss_mlp": 0.01261579, + "epoch": 0.44924094393506686, + "flos": 22973919805440.0, + "grad_norm": 1.613716342828386, + "language_loss": 0.69996417, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.77718765, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1338501, + "step": 7472, + "time_per_iteration": 2.5575385093688965 + }, + { + "auxiliary_loss_clip": 0.06348881, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.0627597, + "balance_loss_mlp": 0.01260363, + "epoch": 0.4493010671877349, + "flos": 59252332026240.0, + "grad_norm": 0.7278471165666979, + "language_loss": 0.61657208, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.69269097, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.02650452, + "step": 7473, + "time_per_iteration": 3.1560816764831543 + }, + { + "auxiliary_loss_clip": 0.06448925, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.0628556, + "balance_loss_mlp": 0.01256417, + "epoch": 0.44936119044040285, + "flos": 18010815891840.0, + "grad_norm": 2.7240719920550873, + "language_loss": 0.77420998, + "learning_rate": 2.421903879707657e-06, + "loss": 0.85140175, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13830566, + "step": 7474, + "time_per_iteration": 2.4717578887939453 + }, + { + "auxiliary_loss_clip": 0.06442197, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.0126264, + "epoch": 0.4494213136930708, + "flos": 21258243930240.0, + "grad_norm": 2.650117553560035, + "language_loss": 0.72072601, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.79790819, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7475, + "time_per_iteration": 2.513819456100464 + }, + { + "auxiliary_loss_clip": 0.06442311, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256954, + "epoch": 0.4494814369457388, + "flos": 27426271956480.0, + "grad_norm": 1.759412456892788, + "language_loss": 0.77338856, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.8505106, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1295166, + "step": 7476, + "time_per_iteration": 2.5318853855133057 + }, + { + "auxiliary_loss_clip": 0.06449737, + "auxiliary_loss_mlp": 0.01271172, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01256754, + "epoch": 0.44954156019840674, + "flos": 22860211415040.0, + "grad_norm": 1.712065897066968, + "language_loss": 0.71606135, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.79327047, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.144104, + "step": 7477, + "time_per_iteration": 2.532437324523926 + }, + { + "auxiliary_loss_clip": 0.06452323, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.01257457, + "epoch": 0.4496016834510747, + "flos": 17207253636480.0, + "grad_norm": 8.505711381360525, + "language_loss": 0.68249893, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.75973988, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14331055, + "step": 7478, + "time_per_iteration": 2.4901106357574463 + }, + { + "auxiliary_loss_clip": 0.06438291, + "auxiliary_loss_mlp": 0.01274211, + "balance_loss_clip": 0.06278055, + "balance_loss_mlp": 0.01261676, + "epoch": 0.4496618067037427, + "flos": 18922636022400.0, + "grad_norm": 1.7939017561082606, + "language_loss": 0.89897281, + "learning_rate": 2.420000193000779e-06, + "loss": 0.97609776, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12518311, + "step": 7479, + "time_per_iteration": 3.9324028491973877 + }, + { + "auxiliary_loss_clip": 0.06445809, + "auxiliary_loss_mlp": 0.01275156, + "balance_loss_clip": 0.06282537, + "balance_loss_mlp": 0.01261304, + "epoch": 0.44972192995641064, + "flos": 21037828965120.0, + "grad_norm": 1.5817445570827902, + "language_loss": 0.75620329, + "learning_rate": 2.419619407822302e-06, + "loss": 0.833413, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13861084, + "step": 7480, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.06450936, + "auxiliary_loss_mlp": 0.01270868, + "balance_loss_clip": 0.06283928, + "balance_loss_mlp": 0.01257033, + "epoch": 0.4497820532090786, + "flos": 20783354515200.0, + "grad_norm": 2.4818923045987233, + "language_loss": 0.79794782, + "learning_rate": 2.419238606731815e-06, + "loss": 0.87516582, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.1385498, + "step": 7481, + "time_per_iteration": 2.511104106903076 + }, + { + "auxiliary_loss_clip": 0.06439544, + "auxiliary_loss_mlp": 0.01274879, + "balance_loss_clip": 0.06280965, + "balance_loss_mlp": 0.01261003, + "epoch": 0.44984217646174657, + "flos": 33811067295360.0, + "grad_norm": 1.5325857273153378, + "language_loss": 0.68501163, + "learning_rate": 2.418857789743758e-06, + "loss": 0.76215583, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13873291, + "step": 7482, + "time_per_iteration": 2.6323177814483643 + }, + { + "auxiliary_loss_clip": 0.06449723, + "auxiliary_loss_mlp": 0.01275016, + "balance_loss_clip": 0.06284413, + "balance_loss_mlp": 0.01261236, + "epoch": 0.44990229971441453, + "flos": 15522953915520.0, + "grad_norm": 2.4692742165129347, + "language_loss": 0.85184467, + "learning_rate": 2.418476956872571e-06, + "loss": 0.92909217, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13775635, + "step": 7483, + "time_per_iteration": 2.5510005950927734 + }, + { + "auxiliary_loss_clip": 0.0644832, + "auxiliary_loss_mlp": 0.01272458, + "balance_loss_clip": 0.06278956, + "balance_loss_mlp": 0.01259017, + "epoch": 0.4499624229670825, + "flos": 29869676542080.0, + "grad_norm": 2.2555510336477362, + "language_loss": 0.81026614, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.88747394, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13439941, + "step": 7484, + "time_per_iteration": 2.5549514293670654 + }, + { + "auxiliary_loss_clip": 0.06454043, + "auxiliary_loss_mlp": 0.01271307, + "balance_loss_clip": 0.06282799, + "balance_loss_mlp": 0.01257133, + "epoch": 0.45002254621975046, + "flos": 18519345521280.0, + "grad_norm": 3.0066277785462296, + "language_loss": 0.75523663, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.83249015, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14172363, + "step": 7485, + "time_per_iteration": 2.5260515213012695 + }, + { + "auxiliary_loss_clip": 0.06353837, + "auxiliary_loss_mlp": 0.01254878, + "balance_loss_clip": 0.06280266, + "balance_loss_mlp": 0.01252054, + "epoch": 0.4500826694724185, + "flos": 70438753261440.0, + "grad_norm": 0.7710237062022668, + "language_loss": 0.58055162, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.65663874, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02819824, + "step": 7486, + "time_per_iteration": 4.631975173950195 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.0125523, + "epoch": 0.45014279272508645, + "flos": 15784388254080.0, + "grad_norm": 2.313810641491004, + "language_loss": 0.83291382, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.91005504, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13336182, + "step": 7487, + "time_per_iteration": 2.4474549293518066 + }, + { + "auxiliary_loss_clip": 0.06440553, + "auxiliary_loss_mlp": 0.01274868, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01260879, + "epoch": 0.4502029159777544, + "flos": 21806157778560.0, + "grad_norm": 1.8256144522955593, + "language_loss": 0.77817398, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.8553282, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13983154, + "step": 7488, + "time_per_iteration": 2.5497655868530273 + }, + { + "auxiliary_loss_clip": 0.0645895, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06284817, + "balance_loss_mlp": 0.01257773, + "epoch": 0.4502630392304224, + "flos": 28775651708160.0, + "grad_norm": 2.1057521417086194, + "language_loss": 0.72464138, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.80196273, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15405273, + "step": 7489, + "time_per_iteration": 2.536022186279297 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.01273963, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.012597, + "epoch": 0.45032316248309034, + "flos": 15848398373760.0, + "grad_norm": 2.178444480440472, + "language_loss": 0.70506239, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.78229928, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14263916, + "step": 7490, + "time_per_iteration": 2.5048370361328125 + }, + { + "auxiliary_loss_clip": 0.06351414, + "auxiliary_loss_mlp": 0.01254304, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01251552, + "epoch": 0.4503832857357583, + "flos": 57873337056000.0, + "grad_norm": 0.766905441156629, + "language_loss": 0.56608462, + "learning_rate": 2.415429723843495e-06, + "loss": 0.64214182, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02757263, + "step": 7491, + "time_per_iteration": 3.1021111011505127 + }, + { + "auxiliary_loss_clip": 0.06440033, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01257217, + "epoch": 0.4504434089884263, + "flos": 23884817541120.0, + "grad_norm": 1.940533812141729, + "language_loss": 0.79471588, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.87182283, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13446045, + "step": 7492, + "time_per_iteration": 3.906813144683838 + }, + { + "auxiliary_loss_clip": 0.06454505, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06282404, + "balance_loss_mlp": 0.01257925, + "epoch": 0.45050353224109424, + "flos": 17790820197120.0, + "grad_norm": 2.4926790281130566, + "language_loss": 0.92799652, + "learning_rate": 2.4146677577659573e-06, + "loss": 1.00526834, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14758301, + "step": 7493, + "time_per_iteration": 2.516523838043213 + }, + { + "auxiliary_loss_clip": 0.06351101, + "auxiliary_loss_mlp": 0.01253906, + "balance_loss_clip": 0.06277501, + "balance_loss_mlp": 0.01251232, + "epoch": 0.4505636554937622, + "flos": 65081960138880.0, + "grad_norm": 0.7917943169613642, + "language_loss": 0.62850708, + "learning_rate": 2.4142867511336e-06, + "loss": 0.70455718, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02676392, + "step": 7494, + "time_per_iteration": 3.200533866882324 + }, + { + "auxiliary_loss_clip": 0.06439039, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275568, + "balance_loss_mlp": 0.01255305, + "epoch": 0.45062377874643017, + "flos": 22206597240960.0, + "grad_norm": 1.3576432808579277, + "language_loss": 0.8187722, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.89584428, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12860107, + "step": 7495, + "time_per_iteration": 2.6740329265594482 + }, + { + "auxiliary_loss_clip": 0.06444755, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01253344, + "epoch": 0.45068390199909814, + "flos": 37679433615360.0, + "grad_norm": 3.4533684270887988, + "language_loss": 0.85559022, + "learning_rate": 2.41352469075395e-06, + "loss": 0.93270886, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13775635, + "step": 7496, + "time_per_iteration": 2.6514453887939453 + }, + { + "auxiliary_loss_clip": 0.06445448, + "auxiliary_loss_mlp": 0.01271465, + "balance_loss_clip": 0.06277982, + "balance_loss_mlp": 0.01258042, + "epoch": 0.4507440252517661, + "flos": 22307853300480.0, + "grad_norm": 2.147795774994512, + "language_loss": 0.76396865, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.84113777, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13427734, + "step": 7497, + "time_per_iteration": 2.5248610973358154 + }, + { + "auxiliary_loss_clip": 0.0644587, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01254189, + "epoch": 0.45080414850443407, + "flos": 13193425428480.0, + "grad_norm": 1.9297018893586142, + "language_loss": 0.75253481, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.82967794, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14245605, + "step": 7498, + "time_per_iteration": 2.482625722885132 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.06277958, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4508642717571021, + "flos": 21951451958400.0, + "grad_norm": 1.9463705761270829, + "language_loss": 0.70564914, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.78282535, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14093018, + "step": 7499, + "time_per_iteration": 2.5338642597198486 + }, + { + "auxiliary_loss_clip": 0.06449613, + "auxiliary_loss_mlp": 0.01268145, + "balance_loss_clip": 0.06278396, + "balance_loss_mlp": 0.0125412, + "epoch": 0.45092439500977005, + "flos": 23374149632640.0, + "grad_norm": 2.119825325087625, + "language_loss": 0.77484369, + "learning_rate": 2.412000381939477e-06, + "loss": 0.85202128, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14025879, + "step": 7500, + "time_per_iteration": 2.5290849208831787 + }, + { + "auxiliary_loss_clip": 0.06441833, + "auxiliary_loss_mlp": 0.01275038, + "balance_loss_clip": 0.06276967, + "balance_loss_mlp": 0.01262211, + "epoch": 0.450984518262438, + "flos": 20778532905600.0, + "grad_norm": 2.0513851791377014, + "language_loss": 0.62714708, + "learning_rate": 2.411619265641992e-06, + "loss": 0.70431578, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.12823486, + "step": 7501, + "time_per_iteration": 2.513014316558838 + }, + { + "auxiliary_loss_clip": 0.06447023, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.0627754, + "balance_loss_mlp": 0.01255251, + "epoch": 0.451044641515106, + "flos": 17712303321600.0, + "grad_norm": 1.7676077358786102, + "language_loss": 0.8475225, + "learning_rate": 2.411238133735863e-06, + "loss": 0.92468631, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7502, + "time_per_iteration": 2.502213954925537 + }, + { + "auxiliary_loss_clip": 0.06440664, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01256967, + "epoch": 0.45110476476777395, + "flos": 20600940176640.0, + "grad_norm": 1.2963550821027272, + "language_loss": 0.79440266, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8715173, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13824463, + "step": 7503, + "time_per_iteration": 2.539870023727417 + }, + { + "auxiliary_loss_clip": 0.0643944, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06278714, + "balance_loss_mlp": 0.01257213, + "epoch": 0.4511648880204419, + "flos": 16039533536640.0, + "grad_norm": 2.8864102182872746, + "language_loss": 0.80966014, + "learning_rate": 2.410475823155484e-06, + "loss": 0.88676035, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13354492, + "step": 7504, + "time_per_iteration": 2.4834609031677246 + }, + { + "auxiliary_loss_clip": 0.06439783, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06277721, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4512250112731099, + "flos": 23984103029760.0, + "grad_norm": 1.8935476867238503, + "language_loss": 0.63783783, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.71491182, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1350708, + "step": 7505, + "time_per_iteration": 2.5183863639831543 + }, + { + "auxiliary_loss_clip": 0.06338686, + "auxiliary_loss_mlp": 0.0125649, + "balance_loss_clip": 0.06265638, + "balance_loss_mlp": 0.01253881, + "epoch": 0.45128513452577784, + "flos": 71484239053440.0, + "grad_norm": 0.8179087732062593, + "language_loss": 0.58726048, + "learning_rate": 2.409713450313968e-06, + "loss": 0.66321218, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02610779, + "step": 7506, + "time_per_iteration": 3.2057392597198486 + }, + { + "auxiliary_loss_clip": 0.06438521, + "auxiliary_loss_mlp": 0.01269482, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01255987, + "epoch": 0.4513452577784458, + "flos": 22097375043840.0, + "grad_norm": 1.6199933066680872, + "language_loss": 0.79207951, + "learning_rate": 2.40933224058142e-06, + "loss": 0.86915958, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1348877, + "step": 7507, + "time_per_iteration": 2.485177993774414 + }, + { + "auxiliary_loss_clip": 0.0644455, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06277668, + "balance_loss_mlp": 0.01256543, + "epoch": 0.4514053810311138, + "flos": 24282699454080.0, + "grad_norm": 1.6041025363642085, + "language_loss": 0.74460357, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.82175899, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14440918, + "step": 7508, + "time_per_iteration": 2.5957343578338623 + }, + { + "auxiliary_loss_clip": 0.06439587, + "auxiliary_loss_mlp": 0.01271402, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01258552, + "epoch": 0.45146550428378174, + "flos": 17891237715840.0, + "grad_norm": 2.0541508842975946, + "language_loss": 0.79828942, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.87539923, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12841797, + "step": 7509, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01270525, + "balance_loss_clip": 0.06278946, + "balance_loss_mlp": 0.01257746, + "epoch": 0.4515256275364497, + "flos": 24250317050880.0, + "grad_norm": 1.7065874480024321, + "language_loss": 0.73257631, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.80969501, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12774658, + "step": 7510, + "time_per_iteration": 2.5448224544525146 + }, + { + "auxiliary_loss_clip": 0.06438527, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01255707, + "epoch": 0.45158575078911767, + "flos": 20637263721600.0, + "grad_norm": 1.688618785836195, + "language_loss": 0.77059448, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.8476727, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13598633, + "step": 7511, + "time_per_iteration": 2.48913311958313 + }, + { + "auxiliary_loss_clip": 0.06443627, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06277004, + "balance_loss_mlp": 0.0125543, + "epoch": 0.45164587404178563, + "flos": 23333884945920.0, + "grad_norm": 1.5549799825793658, + "language_loss": 0.79259372, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.86973357, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14929199, + "step": 7512, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06447546, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01252219, + "epoch": 0.45170599729445365, + "flos": 23812841283840.0, + "grad_norm": 2.088368619040166, + "language_loss": 0.87660837, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.95375133, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14538574, + "step": 7513, + "time_per_iteration": 2.50119686126709 + }, + { + "auxiliary_loss_clip": 0.06437154, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01259963, + "epoch": 0.4517661205471216, + "flos": 23519569593600.0, + "grad_norm": 1.9321046654640033, + "language_loss": 0.67692971, + "learning_rate": 2.406663338649419e-06, + "loss": 0.75402474, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1237793, + "step": 7514, + "time_per_iteration": 2.548349618911743 + }, + { + "auxiliary_loss_clip": 0.0644633, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01258017, + "epoch": 0.4518262437997896, + "flos": 23520743550720.0, + "grad_norm": 2.108913826152056, + "language_loss": 0.69738746, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7745769, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14587402, + "step": 7515, + "time_per_iteration": 2.5203166007995605 + }, + { + "auxiliary_loss_clip": 0.06448089, + "auxiliary_loss_mlp": 0.01273292, + "balance_loss_clip": 0.06278358, + "balance_loss_mlp": 0.01258379, + "epoch": 0.45188636705245755, + "flos": 14572210763520.0, + "grad_norm": 2.327142049261069, + "language_loss": 0.81245089, + "learning_rate": 2.405900656236963e-06, + "loss": 0.88966471, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14916992, + "step": 7516, + "time_per_iteration": 2.5070860385894775 + }, + { + "auxiliary_loss_clip": 0.06440821, + "auxiliary_loss_mlp": 0.01272469, + "balance_loss_clip": 0.0627999, + "balance_loss_mlp": 0.01259899, + "epoch": 0.4519464903051255, + "flos": 19907690221440.0, + "grad_norm": 1.8586788547852597, + "language_loss": 0.65825433, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.73538721, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12573242, + "step": 7517, + "time_per_iteration": 2.4824438095092773 + }, + { + "auxiliary_loss_clip": 0.06439231, + "auxiliary_loss_mlp": 0.01270445, + "balance_loss_clip": 0.06279515, + "balance_loss_mlp": 0.01257923, + "epoch": 0.4520066135577935, + "flos": 18850492056960.0, + "grad_norm": 1.7463164288041955, + "language_loss": 0.63218093, + "learning_rate": 2.405137912257333e-06, + "loss": 0.70927775, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12524414, + "step": 7518, + "time_per_iteration": 2.5339365005493164 + }, + { + "auxiliary_loss_clip": 0.0644324, + "auxiliary_loss_mlp": 0.01270416, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.0125713, + "epoch": 0.45206673681046144, + "flos": 48225279985920.0, + "grad_norm": 1.4167266474258036, + "language_loss": 0.59749353, + "learning_rate": 2.404756517215982e-06, + "loss": 0.67463017, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13287354, + "step": 7519, + "time_per_iteration": 4.238602876663208 + }, + { + "auxiliary_loss_clip": 0.06444496, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06278859, + "balance_loss_mlp": 0.0125789, + "epoch": 0.4521268600631294, + "flos": 23848997120640.0, + "grad_norm": 1.307309529899749, + "language_loss": 0.72893107, + "learning_rate": 2.404375106826223e-06, + "loss": 0.80609363, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13848877, + "step": 7520, + "time_per_iteration": 2.5295658111572266 + }, + { + "auxiliary_loss_clip": 0.06438812, + "auxiliary_loss_mlp": 0.01272031, + "balance_loss_clip": 0.062758, + "balance_loss_mlp": 0.01257875, + "epoch": 0.4521869833157974, + "flos": 18849611589120.0, + "grad_norm": 1.9694306251575102, + "language_loss": 0.75821477, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.83532321, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14147949, + "step": 7521, + "time_per_iteration": 2.51493763923645 + }, + { + "auxiliary_loss_clip": 0.06448258, + "auxiliary_loss_mlp": 0.01268765, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01255485, + "epoch": 0.45224710656846534, + "flos": 19793520633600.0, + "grad_norm": 2.0145516283749334, + "language_loss": 0.68112928, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.75829947, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1328125, + "step": 7522, + "time_per_iteration": 2.4986941814422607 + }, + { + "auxiliary_loss_clip": 0.06441501, + "auxiliary_loss_mlp": 0.0127253, + "balance_loss_clip": 0.06278691, + "balance_loss_mlp": 0.01258797, + "epoch": 0.4523072298211333, + "flos": 28263558280320.0, + "grad_norm": 1.4118666030005445, + "language_loss": 0.61165464, + "learning_rate": 2.403230783711134e-06, + "loss": 0.68879497, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13739014, + "step": 7523, + "time_per_iteration": 2.5918800830841064 + }, + { + "auxiliary_loss_clip": 0.06446532, + "auxiliary_loss_mlp": 0.01271231, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01256187, + "epoch": 0.45236735307380127, + "flos": 11185651820160.0, + "grad_norm": 1.7682897571754845, + "language_loss": 0.78361082, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.86078846, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.15057373, + "step": 7524, + "time_per_iteration": 2.4915785789489746 + }, + { + "auxiliary_loss_clip": 0.06441181, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06277032, + "balance_loss_mlp": 0.01259527, + "epoch": 0.45242747632646924, + "flos": 22607959098240.0, + "grad_norm": 1.5918865124670334, + "language_loss": 0.63704681, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.71418512, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13122559, + "step": 7525, + "time_per_iteration": 4.0678441524505615 + }, + { + "auxiliary_loss_clip": 0.06439088, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06279112, + "balance_loss_mlp": 0.01260153, + "epoch": 0.45248759957913726, + "flos": 18261558835200.0, + "grad_norm": 33.97196740045056, + "language_loss": 0.78961569, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8667345, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12664795, + "step": 7526, + "time_per_iteration": 2.4813144207000732 + }, + { + "auxiliary_loss_clip": 0.06437138, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.01257493, + "epoch": 0.4525477228318052, + "flos": 22455746956800.0, + "grad_norm": 1.6415997795559136, + "language_loss": 0.81301343, + "learning_rate": 2.40170480555747e-06, + "loss": 0.89009607, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13641357, + "step": 7527, + "time_per_iteration": 2.5056183338165283 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280501, + "balance_loss_mlp": 0.01258106, + "epoch": 0.4526078460844732, + "flos": 29652909229440.0, + "grad_norm": 1.731340365534577, + "language_loss": 0.65853465, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.73566198, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12866211, + "step": 7528, + "time_per_iteration": 2.6073391437530518 + }, + { + "auxiliary_loss_clip": 0.06439637, + "auxiliary_loss_mlp": 0.0127116, + "balance_loss_clip": 0.06280227, + "balance_loss_mlp": 0.01257296, + "epoch": 0.45266796933714115, + "flos": 23046483041280.0, + "grad_norm": 1.6874802957215247, + "language_loss": 0.75494301, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.83205104, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13867188, + "step": 7529, + "time_per_iteration": 2.5490171909332275 + }, + { + "auxiliary_loss_clip": 0.06443143, + "auxiliary_loss_mlp": 0.01270284, + "balance_loss_clip": 0.06278682, + "balance_loss_mlp": 0.0125614, + "epoch": 0.4527280925898091, + "flos": 14433582983040.0, + "grad_norm": 5.318026120447717, + "language_loss": 0.73199093, + "learning_rate": 2.400560161948384e-06, + "loss": 0.80912519, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7530, + "time_per_iteration": 2.4709434509277344 + }, + { + "auxiliary_loss_clip": 0.06441925, + "auxiliary_loss_mlp": 0.01267178, + "balance_loss_clip": 0.06279813, + "balance_loss_mlp": 0.01253857, + "epoch": 0.4527882158424771, + "flos": 22931432985600.0, + "grad_norm": 1.7055117614079858, + "language_loss": 0.76767921, + "learning_rate": 2.400178583680834e-06, + "loss": 0.84477019, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13336182, + "step": 7531, + "time_per_iteration": 3.9209694862365723 + }, + { + "auxiliary_loss_clip": 0.06439964, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01253018, + "epoch": 0.45284833909514505, + "flos": 25562157373440.0, + "grad_norm": 1.5452453614533965, + "language_loss": 0.67367595, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.75073636, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1305542, + "step": 7532, + "time_per_iteration": 2.5799813270568848 + }, + { + "auxiliary_loss_clip": 0.06441537, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280663, + "balance_loss_mlp": 0.01257206, + "epoch": 0.452908462347813, + "flos": 18155816582400.0, + "grad_norm": 2.362226158293886, + "language_loss": 0.78750062, + "learning_rate": 2.399415381635768e-06, + "loss": 0.86461282, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12481689, + "step": 7533, + "time_per_iteration": 2.4713315963745117 + }, + { + "auxiliary_loss_clip": 0.06451754, + "auxiliary_loss_mlp": 0.01272809, + "balance_loss_clip": 0.06279968, + "balance_loss_mlp": 0.01257849, + "epoch": 0.452968585600481, + "flos": 19068810670080.0, + "grad_norm": 1.7736608700696739, + "language_loss": 0.83544481, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.9126904, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1494751, + "step": 7534, + "time_per_iteration": 2.632647752761841 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01272735, + "balance_loss_clip": 0.06281491, + "balance_loss_mlp": 0.01258597, + "epoch": 0.45302870885314894, + "flos": 22057823116800.0, + "grad_norm": 1.5477368000033016, + "language_loss": 0.77199811, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.84919739, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14129639, + "step": 7535, + "time_per_iteration": 2.504075765609741 + }, + { + "auxiliary_loss_clip": 0.06444988, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01254453, + "epoch": 0.4530888321058169, + "flos": 20382495782400.0, + "grad_norm": 1.553658728431748, + "language_loss": 0.80988163, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.88700247, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12640381, + "step": 7536, + "time_per_iteration": 2.5701963901519775 + }, + { + "auxiliary_loss_clip": 0.06448273, + "auxiliary_loss_mlp": 0.01269034, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01255617, + "epoch": 0.4531489553584849, + "flos": 14835783381120.0, + "grad_norm": 1.8444336957712972, + "language_loss": 0.76206815, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.83924115, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13427734, + "step": 7537, + "time_per_iteration": 2.4535741806030273 + }, + { + "auxiliary_loss_clip": 0.06453362, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06287456, + "balance_loss_mlp": 0.0125526, + "epoch": 0.45320907861115284, + "flos": 21951493885440.0, + "grad_norm": 1.8251133101176713, + "language_loss": 0.75698435, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.83420891, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13830566, + "step": 7538, + "time_per_iteration": 2.5437614917755127 + }, + { + "auxiliary_loss_clip": 0.06342177, + "auxiliary_loss_mlp": 0.01255931, + "balance_loss_clip": 0.06267795, + "balance_loss_mlp": 0.01253302, + "epoch": 0.45326920186382086, + "flos": 66273620578560.0, + "grad_norm": 1.09487044177016, + "language_loss": 0.62420493, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.70018601, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02630615, + "step": 7539, + "time_per_iteration": 3.1658005714416504 + }, + { + "auxiliary_loss_clip": 0.06450586, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.06287818, + "balance_loss_mlp": 0.01256404, + "epoch": 0.4533293251164888, + "flos": 14689524879360.0, + "grad_norm": 1.7102983978579578, + "language_loss": 0.65674543, + "learning_rate": 2.396743698142872e-06, + "loss": 0.73395288, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13757324, + "step": 7540, + "time_per_iteration": 2.5642666816711426 + }, + { + "auxiliary_loss_clip": 0.06454974, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.06285828, + "balance_loss_mlp": 0.01254179, + "epoch": 0.4533894483691568, + "flos": 22607749463040.0, + "grad_norm": 2.019177110810713, + "language_loss": 0.84982491, + "learning_rate": 2.396361968778424e-06, + "loss": 0.92706484, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.1484375, + "step": 7541, + "time_per_iteration": 2.515012741088867 + }, + { + "auxiliary_loss_clip": 0.06444205, + "auxiliary_loss_mlp": 0.01270638, + "balance_loss_clip": 0.06281162, + "balance_loss_mlp": 0.01257853, + "epoch": 0.45344957162182475, + "flos": 34760301073920.0, + "grad_norm": 1.6772641382422697, + "language_loss": 0.77260393, + "learning_rate": 2.395980224383889e-06, + "loss": 0.84975231, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12780762, + "step": 7542, + "time_per_iteration": 2.6276772022247314 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06281827, + "balance_loss_mlp": 0.01252398, + "epoch": 0.4535096948744927, + "flos": 23556983241600.0, + "grad_norm": 1.679511772595701, + "language_loss": 0.80522043, + "learning_rate": 2.395598464973746e-06, + "loss": 0.88235873, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14233398, + "step": 7543, + "time_per_iteration": 2.5102038383483887 + }, + { + "auxiliary_loss_clip": 0.06448692, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01256339, + "epoch": 0.4535698181271607, + "flos": 25564756849920.0, + "grad_norm": 1.5595363191014409, + "language_loss": 0.76234162, + "learning_rate": 2.395216690562469e-06, + "loss": 0.83952641, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13446045, + "step": 7544, + "time_per_iteration": 2.613546371459961 + }, + { + "auxiliary_loss_clip": 0.06450664, + "auxiliary_loss_mlp": 0.0127145, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01257747, + "epoch": 0.45362994137982865, + "flos": 24871171478400.0, + "grad_norm": 1.656067150864753, + "language_loss": 0.75691646, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.83413762, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.137146, + "step": 7545, + "time_per_iteration": 2.5587077140808105 + }, + { + "auxiliary_loss_clip": 0.06444206, + "auxiliary_loss_mlp": 0.01276554, + "balance_loss_clip": 0.06279359, + "balance_loss_mlp": 0.01263161, + "epoch": 0.4536900646324966, + "flos": 30814088711040.0, + "grad_norm": 1.7013764448707542, + "language_loss": 0.72677243, + "learning_rate": 2.394453096794423e-06, + "loss": 0.80397999, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13391113, + "step": 7546, + "time_per_iteration": 2.582507371902466 + }, + { + "auxiliary_loss_clip": 0.06454303, + "auxiliary_loss_mlp": 0.01276587, + "balance_loss_clip": 0.06282242, + "balance_loss_mlp": 0.01261531, + "epoch": 0.4537501878851646, + "flos": 23411060156160.0, + "grad_norm": 1.4140833040204603, + "language_loss": 0.76407051, + "learning_rate": 2.394071277466609e-06, + "loss": 0.8413794, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1505127, + "step": 7547, + "time_per_iteration": 2.5376148223876953 + }, + { + "auxiliary_loss_clip": 0.06452849, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06284454, + "balance_loss_mlp": 0.01258086, + "epoch": 0.45381031113783254, + "flos": 18154978041600.0, + "grad_norm": 1.9572251150113926, + "language_loss": 0.70011902, + "learning_rate": 2.393689443195573e-06, + "loss": 0.777372, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14367676, + "step": 7548, + "time_per_iteration": 2.519615650177002 + }, + { + "auxiliary_loss_clip": 0.0644725, + "auxiliary_loss_mlp": 0.01271972, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01258638, + "epoch": 0.4538704343905005, + "flos": 25343503344000.0, + "grad_norm": 2.0312160927741933, + "language_loss": 0.72993481, + "learning_rate": 2.393307593995794e-06, + "loss": 0.80712706, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13342285, + "step": 7549, + "time_per_iteration": 2.57501482963562 + }, + { + "auxiliary_loss_clip": 0.06446082, + "auxiliary_loss_mlp": 0.01269972, + "balance_loss_clip": 0.06283575, + "balance_loss_mlp": 0.01257312, + "epoch": 0.4539305576431685, + "flos": 28739118528000.0, + "grad_norm": 1.441987244253853, + "language_loss": 0.65387678, + "learning_rate": 2.392925729881751e-06, + "loss": 0.73103732, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.12658691, + "step": 7550, + "time_per_iteration": 2.5835819244384766 + }, + { + "auxiliary_loss_clip": 0.06445216, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01258162, + "epoch": 0.45399068089583644, + "flos": 22499030390400.0, + "grad_norm": 1.5764003430967004, + "language_loss": 0.6906575, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.76782334, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.13189697, + "step": 7551, + "time_per_iteration": 2.562033176422119 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06276844, + "balance_loss_mlp": 0.01259504, + "epoch": 0.45405080414850446, + "flos": 12897889678080.0, + "grad_norm": 1.6874134559177159, + "language_loss": 0.79426885, + "learning_rate": 2.392161956968798e-06, + "loss": 0.87142253, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13409424, + "step": 7552, + "time_per_iteration": 2.4449541568756104 + }, + { + "auxiliary_loss_clip": 0.063404, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06265783, + "balance_loss_mlp": 0.01260128, + "epoch": 0.4541109274011724, + "flos": 59783558912640.0, + "grad_norm": 0.8094629177090237, + "language_loss": 0.57832247, + "learning_rate": 2.39178004819885e-06, + "loss": 0.65435266, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.02496338, + "step": 7553, + "time_per_iteration": 3.089684247970581 + }, + { + "auxiliary_loss_clip": 0.06443945, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01258946, + "epoch": 0.4541710506538404, + "flos": 28519248614400.0, + "grad_norm": 1.8062911390055711, + "language_loss": 0.76727033, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.84443438, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13494873, + "step": 7554, + "time_per_iteration": 2.541727066040039 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.0126986, + "balance_loss_clip": 0.06284112, + "balance_loss_mlp": 0.0125559, + "epoch": 0.45423117390650836, + "flos": 17681304510720.0, + "grad_norm": 3.221825223389834, + "language_loss": 0.76701951, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.84421712, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1427002, + "step": 7555, + "time_per_iteration": 2.5190746784210205 + }, + { + "auxiliary_loss_clip": 0.06447887, + "auxiliary_loss_mlp": 0.01270234, + "balance_loss_clip": 0.06284074, + "balance_loss_mlp": 0.01256292, + "epoch": 0.4542912971591763, + "flos": 28079760349440.0, + "grad_norm": 1.2938327471401587, + "language_loss": 0.7293222, + "learning_rate": 2.390634232808903e-06, + "loss": 0.80650342, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13946533, + "step": 7556, + "time_per_iteration": 2.559330940246582 + }, + { + "auxiliary_loss_clip": 0.06452744, + "auxiliary_loss_mlp": 0.0127062, + "balance_loss_clip": 0.06282438, + "balance_loss_mlp": 0.01256351, + "epoch": 0.4543514204118443, + "flos": 22677922857600.0, + "grad_norm": 1.9930550713200077, + "language_loss": 0.63614035, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.71337396, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7557, + "time_per_iteration": 2.555694580078125 + }, + { + "auxiliary_loss_clip": 0.06341553, + "auxiliary_loss_mlp": 0.01256007, + "balance_loss_clip": 0.06267436, + "balance_loss_mlp": 0.01253351, + "epoch": 0.45441154366451225, + "flos": 58236027454080.0, + "grad_norm": 0.6640379644801875, + "language_loss": 0.57562745, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.65160298, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02658081, + "step": 7558, + "time_per_iteration": 5.871712684631348 + }, + { + "auxiliary_loss_clip": 0.06449831, + "auxiliary_loss_mlp": 0.01270129, + "balance_loss_clip": 0.06282432, + "balance_loss_mlp": 0.01255216, + "epoch": 0.4544716669171802, + "flos": 16769987504640.0, + "grad_norm": 2.2880587940678927, + "language_loss": 0.56438738, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.64158702, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14904785, + "step": 7559, + "time_per_iteration": 2.4660634994506836 + }, + { + "auxiliary_loss_clip": 0.06446083, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06282272, + "balance_loss_mlp": 0.01257728, + "epoch": 0.4545317901698482, + "flos": 15930814464000.0, + "grad_norm": 1.794091833084443, + "language_loss": 0.72316611, + "learning_rate": 2.389106271642792e-06, + "loss": 0.80034077, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13671875, + "step": 7560, + "time_per_iteration": 2.497083902359009 + }, + { + "auxiliary_loss_clip": 0.06455533, + "auxiliary_loss_mlp": 0.01271449, + "balance_loss_clip": 0.0628465, + "balance_loss_mlp": 0.01257096, + "epoch": 0.45459191342251615, + "flos": 17645567944320.0, + "grad_norm": 2.9678955818231167, + "language_loss": 0.69120479, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.76847458, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14355469, + "step": 7561, + "time_per_iteration": 2.463433027267456 + }, + { + "auxiliary_loss_clip": 0.06447616, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06286462, + "balance_loss_mlp": 0.01256161, + "epoch": 0.4546520366751841, + "flos": 16181557407360.0, + "grad_norm": 2.3534128933362277, + "language_loss": 0.85417646, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.93134332, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12908936, + "step": 7562, + "time_per_iteration": 2.5475013256073 + }, + { + "auxiliary_loss_clip": 0.06445649, + "auxiliary_loss_mlp": 0.01271177, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01257504, + "epoch": 0.4547121599278521, + "flos": 19756861672320.0, + "grad_norm": 1.7772924752060992, + "language_loss": 0.89642298, + "learning_rate": 2.38796014579055e-06, + "loss": 0.97359127, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13684082, + "step": 7563, + "time_per_iteration": 2.489121675491333 + }, + { + "auxiliary_loss_clip": 0.06453149, + "auxiliary_loss_mlp": 0.01274815, + "balance_loss_clip": 0.06286659, + "balance_loss_mlp": 0.01260397, + "epoch": 0.45477228318052004, + "flos": 19943510641920.0, + "grad_norm": 1.9263110789996643, + "language_loss": 0.71668887, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.79396844, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14428711, + "step": 7564, + "time_per_iteration": 2.4964044094085693 + }, + { + "auxiliary_loss_clip": 0.06450239, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01253912, + "epoch": 0.454832406433188, + "flos": 21294735183360.0, + "grad_norm": 2.0561067408009994, + "language_loss": 0.68633133, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7635116, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13879395, + "step": 7565, + "time_per_iteration": 4.080512762069702 + }, + { + "auxiliary_loss_clip": 0.06446166, + "auxiliary_loss_mlp": 0.01274343, + "balance_loss_clip": 0.06282604, + "balance_loss_mlp": 0.01260247, + "epoch": 0.45489252968585603, + "flos": 24505630041600.0, + "grad_norm": 2.0436514367854413, + "language_loss": 0.802881, + "learning_rate": 2.386813887534922e-06, + "loss": 0.88008606, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14105225, + "step": 7566, + "time_per_iteration": 2.521056890487671 + }, + { + "auxiliary_loss_clip": 0.06452477, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06286022, + "balance_loss_mlp": 0.01257558, + "epoch": 0.454952652938524, + "flos": 17098199147520.0, + "grad_norm": 2.208842453595512, + "language_loss": 0.74317467, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.82043159, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15661621, + "step": 7567, + "time_per_iteration": 2.515658140182495 + }, + { + "auxiliary_loss_clip": 0.06459296, + "auxiliary_loss_mlp": 0.01271605, + "balance_loss_clip": 0.06291091, + "balance_loss_mlp": 0.0125801, + "epoch": 0.45501277619119196, + "flos": 27636792140160.0, + "grad_norm": 1.5215577708435108, + "language_loss": 0.80959934, + "learning_rate": 2.386049642000249e-06, + "loss": 0.88690829, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13604736, + "step": 7568, + "time_per_iteration": 2.558258533477783 + }, + { + "auxiliary_loss_clip": 0.06466229, + "auxiliary_loss_mlp": 0.01276365, + "balance_loss_clip": 0.06294216, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4550728994438599, + "flos": 19980840435840.0, + "grad_norm": 1.8148678559144198, + "language_loss": 0.80280846, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.88023436, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.16186523, + "step": 7569, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06458277, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.06287743, + "balance_loss_mlp": 0.01254176, + "epoch": 0.4551330226965279, + "flos": 26073915384960.0, + "grad_norm": 1.3474740501928035, + "language_loss": 0.75202894, + "learning_rate": 2.385285337909412e-06, + "loss": 0.82929879, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14538574, + "step": 7570, + "time_per_iteration": 2.543170690536499 + }, + { + "auxiliary_loss_clip": 0.06452256, + "auxiliary_loss_mlp": 0.01273702, + "balance_loss_clip": 0.06289603, + "balance_loss_mlp": 0.01259826, + "epoch": 0.45519314594919585, + "flos": 32789396062080.0, + "grad_norm": 1.7878922954829848, + "language_loss": 0.74832451, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.82558417, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13879395, + "step": 7571, + "time_per_iteration": 4.052931308746338 + }, + { + "auxiliary_loss_clip": 0.06451707, + "auxiliary_loss_mlp": 0.01275937, + "balance_loss_clip": 0.06292738, + "balance_loss_mlp": 0.01261954, + "epoch": 0.4552532692018638, + "flos": 19178829480960.0, + "grad_norm": 1.5879241198756615, + "language_loss": 0.81163442, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.88891089, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13983154, + "step": 7572, + "time_per_iteration": 2.511032819747925 + }, + { + "auxiliary_loss_clip": 0.06461887, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06292465, + "balance_loss_mlp": 0.01254306, + "epoch": 0.4553133924545318, + "flos": 26033650698240.0, + "grad_norm": 2.340526601051543, + "language_loss": 0.72866237, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.80597222, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14788818, + "step": 7573, + "time_per_iteration": 2.5469906330108643 + }, + { + "auxiliary_loss_clip": 0.06470129, + "auxiliary_loss_mlp": 0.0127089, + "balance_loss_clip": 0.06300491, + "balance_loss_mlp": 0.01255094, + "epoch": 0.45537351570719975, + "flos": 30668920312320.0, + "grad_norm": 1.9189620807456311, + "language_loss": 0.74504352, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.82245368, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.15783691, + "step": 7574, + "time_per_iteration": 2.6484622955322266 + }, + { + "auxiliary_loss_clip": 0.06463373, + "auxiliary_loss_mlp": 0.01271034, + "balance_loss_clip": 0.06294367, + "balance_loss_mlp": 0.0125661, + "epoch": 0.4554336389598677, + "flos": 24360377788800.0, + "grad_norm": 1.669597443611077, + "language_loss": 0.71544576, + "learning_rate": 2.383374322259915e-06, + "loss": 0.79278982, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14428711, + "step": 7575, + "time_per_iteration": 2.544975519180298 + }, + { + "auxiliary_loss_clip": 0.06456485, + "auxiliary_loss_mlp": 0.01268004, + "balance_loss_clip": 0.06290726, + "balance_loss_mlp": 0.01253794, + "epoch": 0.4554937622125357, + "flos": 20564113507200.0, + "grad_norm": 1.7578928676474412, + "language_loss": 0.7370066, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.81425148, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14202881, + "step": 7576, + "time_per_iteration": 2.534135580062866 + }, + { + "auxiliary_loss_clip": 0.06453636, + "auxiliary_loss_mlp": 0.0127588, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01261551, + "epoch": 0.45555388546520365, + "flos": 22827451668480.0, + "grad_norm": 2.007695048360481, + "language_loss": 0.66580224, + "learning_rate": 2.382609814135511e-06, + "loss": 0.74309736, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14312744, + "step": 7577, + "time_per_iteration": 2.5095431804656982 + }, + { + "auxiliary_loss_clip": 0.06452672, + "auxiliary_loss_mlp": 0.01272369, + "balance_loss_clip": 0.0628684, + "balance_loss_mlp": 0.01256538, + "epoch": 0.4556140087178716, + "flos": 21732462512640.0, + "grad_norm": 1.904316861437945, + "language_loss": 0.74386835, + "learning_rate": 2.382227538303157e-06, + "loss": 0.82111871, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.15820312, + "step": 7578, + "time_per_iteration": 2.5497546195983887 + }, + { + "auxiliary_loss_clip": 0.06453466, + "auxiliary_loss_mlp": 0.01270181, + "balance_loss_clip": 0.06290053, + "balance_loss_mlp": 0.01256645, + "epoch": 0.45567413197053963, + "flos": 26001645638400.0, + "grad_norm": 1.7724513927111563, + "language_loss": 0.70436674, + "learning_rate": 2.381845247976697e-06, + "loss": 0.78160322, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13531494, + "step": 7579, + "time_per_iteration": 2.5318000316619873 + }, + { + "auxiliary_loss_clip": 0.06449443, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06286655, + "balance_loss_mlp": 0.01257664, + "epoch": 0.4557342552232076, + "flos": 21543046358400.0, + "grad_norm": 1.8462396851301097, + "language_loss": 0.78760922, + "learning_rate": 2.381462943170627e-06, + "loss": 0.86480927, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12902832, + "step": 7580, + "time_per_iteration": 2.5358526706695557 + }, + { + "auxiliary_loss_clip": 0.06450854, + "auxiliary_loss_mlp": 0.0127087, + "balance_loss_clip": 0.06288584, + "balance_loss_mlp": 0.01257822, + "epoch": 0.45579437847587556, + "flos": 40010932673280.0, + "grad_norm": 1.6599136037597217, + "language_loss": 0.68708634, + "learning_rate": 2.381080623899444e-06, + "loss": 0.76430357, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13049316, + "step": 7581, + "time_per_iteration": 2.667543888092041 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06289542, + "balance_loss_mlp": 0.01258678, + "epoch": 0.4558545017285435, + "flos": 31146409203840.0, + "grad_norm": 1.6471906775179725, + "language_loss": 0.7358638, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.81307691, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1383667, + "step": 7582, + "time_per_iteration": 2.6570708751678467 + }, + { + "auxiliary_loss_clip": 0.06455518, + "auxiliary_loss_mlp": 0.01272969, + "balance_loss_clip": 0.06286626, + "balance_loss_mlp": 0.01257818, + "epoch": 0.4559146249812115, + "flos": 21732210950400.0, + "grad_norm": 1.8620959272942483, + "language_loss": 0.73187852, + "learning_rate": 2.380315942019729e-06, + "loss": 0.80916339, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.15148926, + "step": 7583, + "time_per_iteration": 2.510700225830078 + }, + { + "auxiliary_loss_clip": 0.06455322, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06287013, + "balance_loss_mlp": 0.01256202, + "epoch": 0.45597474823387946, + "flos": 23812841283840.0, + "grad_norm": 1.81949303768272, + "language_loss": 0.72839421, + "learning_rate": 2.379933579440195e-06, + "loss": 0.80566895, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.1595459, + "step": 7584, + "time_per_iteration": 2.5747973918914795 + }, + { + "auxiliary_loss_clip": 0.06447833, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.0628446, + "balance_loss_mlp": 0.01255357, + "epoch": 0.4560348714865474, + "flos": 31913857549440.0, + "grad_norm": 1.7864940938501939, + "language_loss": 0.67957801, + "learning_rate": 2.379551202453541e-06, + "loss": 0.75673771, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12792969, + "step": 7585, + "time_per_iteration": 2.6153225898742676 + }, + { + "auxiliary_loss_clip": 0.0645072, + "auxiliary_loss_mlp": 0.01268647, + "balance_loss_clip": 0.06284043, + "balance_loss_mlp": 0.01254449, + "epoch": 0.4560949947392154, + "flos": 22054427026560.0, + "grad_norm": 1.7083540410775564, + "language_loss": 0.76353097, + "learning_rate": 2.379168811074267e-06, + "loss": 0.84072465, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14190674, + "step": 7586, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.06448488, + "auxiliary_loss_mlp": 0.01267379, + "balance_loss_clip": 0.0628647, + "balance_loss_mlp": 0.01254182, + "epoch": 0.45615511799188335, + "flos": 24578738328960.0, + "grad_norm": 1.819670635232321, + "language_loss": 0.78360641, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.86076516, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13189697, + "step": 7587, + "time_per_iteration": 2.5558509826660156 + }, + { + "auxiliary_loss_clip": 0.06459979, + "auxiliary_loss_mlp": 0.01275995, + "balance_loss_clip": 0.06286488, + "balance_loss_mlp": 0.01260152, + "epoch": 0.4562152412445513, + "flos": 18336260350080.0, + "grad_norm": 1.7968748305561377, + "language_loss": 0.69667047, + "learning_rate": 2.378403985195863e-06, + "loss": 0.77403021, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1583252, + "step": 7588, + "time_per_iteration": 2.5365071296691895 + }, + { + "auxiliary_loss_clip": 0.06447656, + "auxiliary_loss_mlp": 0.01274434, + "balance_loss_clip": 0.06286096, + "balance_loss_mlp": 0.01261422, + "epoch": 0.4562753644972193, + "flos": 13521595144320.0, + "grad_norm": 1.6774091429175193, + "language_loss": 0.79575098, + "learning_rate": 2.378021550725735e-06, + "loss": 0.87297189, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13006592, + "step": 7589, + "time_per_iteration": 2.484713315963745 + }, + { + "auxiliary_loss_clip": 0.06452583, + "auxiliary_loss_mlp": 0.01271771, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.0125774, + "epoch": 0.45633548774988725, + "flos": 29646871735680.0, + "grad_norm": 2.003946782113331, + "language_loss": 0.62696528, + "learning_rate": 2.377639101920992e-06, + "loss": 0.70420885, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14044189, + "step": 7590, + "time_per_iteration": 2.609936475753784 + }, + { + "auxiliary_loss_clip": 0.06445528, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01257496, + "epoch": 0.4563956110025552, + "flos": 22239398914560.0, + "grad_norm": 1.8300596662255737, + "language_loss": 0.73085624, + "learning_rate": 2.377256638796135e-06, + "loss": 0.80802, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13330078, + "step": 7591, + "time_per_iteration": 2.47824764251709 + }, + { + "auxiliary_loss_clip": 0.06452768, + "auxiliary_loss_mlp": 0.01273962, + "balance_loss_clip": 0.0628728, + "balance_loss_mlp": 0.01260205, + "epoch": 0.45645573425522323, + "flos": 17097696023040.0, + "grad_norm": 1.9979722051509847, + "language_loss": 0.77518493, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.85245228, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13751221, + "step": 7592, + "time_per_iteration": 2.5239169597625732 + }, + { + "auxiliary_loss_clip": 0.06449406, + "auxiliary_loss_mlp": 0.01273175, + "balance_loss_clip": 0.06284081, + "balance_loss_mlp": 0.01259954, + "epoch": 0.4565158575078912, + "flos": 20337367559040.0, + "grad_norm": 2.421698823443505, + "language_loss": 0.6941641, + "learning_rate": 2.376491669644098e-06, + "loss": 0.77138984, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13232422, + "step": 7593, + "time_per_iteration": 2.5688788890838623 + }, + { + "auxiliary_loss_clip": 0.06437326, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06278698, + "balance_loss_mlp": 0.01256034, + "epoch": 0.45657598076055916, + "flos": 23989008493440.0, + "grad_norm": 2.02887277896486, + "language_loss": 0.8417384, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.91879439, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12237549, + "step": 7594, + "time_per_iteration": 2.5792298316955566 + }, + { + "auxiliary_loss_clip": 0.06341574, + "auxiliary_loss_mlp": 0.01258819, + "balance_loss_clip": 0.06267718, + "balance_loss_mlp": 0.0125595, + "epoch": 0.45663610401322713, + "flos": 69382812908160.0, + "grad_norm": 0.7684087429591354, + "language_loss": 0.52710819, + "learning_rate": 2.375726643385654e-06, + "loss": 0.60311204, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.02864075, + "step": 7595, + "time_per_iteration": 3.150902509689331 + }, + { + "auxiliary_loss_clip": 0.06451569, + "auxiliary_loss_mlp": 0.01268714, + "balance_loss_clip": 0.06282795, + "balance_loss_mlp": 0.0125491, + "epoch": 0.4566962272658951, + "flos": 15152884358400.0, + "grad_norm": 2.304862186673624, + "language_loss": 0.8729161, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.95011896, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13824463, + "step": 7596, + "time_per_iteration": 2.490346908569336 + }, + { + "auxiliary_loss_clip": 0.0644666, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01257324, + "epoch": 0.45675635051856306, + "flos": 18703395014400.0, + "grad_norm": 1.5857620712679525, + "language_loss": 0.77719533, + "learning_rate": 2.374961560136843e-06, + "loss": 0.85436308, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12792969, + "step": 7597, + "time_per_iteration": 2.5043859481811523 + }, + { + "auxiliary_loss_clip": 0.0644691, + "auxiliary_loss_mlp": 0.01271101, + "balance_loss_clip": 0.06280024, + "balance_loss_mlp": 0.01256587, + "epoch": 0.456816473771231, + "flos": 19104211820160.0, + "grad_norm": 1.619707981694153, + "language_loss": 0.78513646, + "learning_rate": 2.374578997177314e-06, + "loss": 0.86231661, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14501953, + "step": 7598, + "time_per_iteration": 3.9724912643432617 + }, + { + "auxiliary_loss_clip": 0.06447135, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06284773, + "balance_loss_mlp": 0.01255508, + "epoch": 0.456876597023899, + "flos": 28957730630400.0, + "grad_norm": 2.2287540067942957, + "language_loss": 0.72171777, + "learning_rate": 2.374196420013712e-06, + "loss": 0.79887861, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13458252, + "step": 7599, + "time_per_iteration": 2.594240188598633 + }, + { + "auxiliary_loss_clip": 0.06445186, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06281814, + "balance_loss_mlp": 0.01256021, + "epoch": 0.45693672027656695, + "flos": 23295297340800.0, + "grad_norm": 1.7934880288039583, + "language_loss": 0.70205128, + "learning_rate": 2.373813828660544e-06, + "loss": 0.77919793, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13439941, + "step": 7600, + "time_per_iteration": 2.5063295364379883 + }, + { + "auxiliary_loss_clip": 0.06449603, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01256571, + "epoch": 0.4569968435292349, + "flos": 20564448923520.0, + "grad_norm": 2.031833923402261, + "language_loss": 0.78985888, + "learning_rate": 2.373431223132319e-06, + "loss": 0.86705881, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13824463, + "step": 7601, + "time_per_iteration": 2.559072494506836 + }, + { + "auxiliary_loss_clip": 0.06449661, + "auxiliary_loss_mlp": 0.0127022, + "balance_loss_clip": 0.06283583, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4570569667819029, + "flos": 41292403090560.0, + "grad_norm": 1.9704151582810323, + "language_loss": 0.71676505, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.79396379, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13134766, + "step": 7602, + "time_per_iteration": 2.6897006034851074 + }, + { + "auxiliary_loss_clip": 0.06446967, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280911, + "balance_loss_mlp": 0.01255843, + "epoch": 0.45711709003457085, + "flos": 26038807724160.0, + "grad_norm": 1.8547506252317059, + "language_loss": 0.73479527, + "learning_rate": 2.372665969608729e-06, + "loss": 0.81197369, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15026855, + "step": 7603, + "time_per_iteration": 2.5908169746398926 + }, + { + "auxiliary_loss_clip": 0.06447335, + "auxiliary_loss_mlp": 0.01269467, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01254077, + "epoch": 0.4571772132872388, + "flos": 22163649223680.0, + "grad_norm": 1.7365999934209901, + "language_loss": 0.83048642, + "learning_rate": 2.372283321642383e-06, + "loss": 0.90765446, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.15374756, + "step": 7604, + "time_per_iteration": 2.462653636932373 + }, + { + "auxiliary_loss_clip": 0.0645724, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06285316, + "balance_loss_mlp": 0.01256456, + "epoch": 0.45723733653990684, + "flos": 23885739936000.0, + "grad_norm": 1.8384947858044167, + "language_loss": 0.86237913, + "learning_rate": 2.371900659559016e-06, + "loss": 0.93966818, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15209961, + "step": 7605, + "time_per_iteration": 3.9711341857910156 + }, + { + "auxiliary_loss_clip": 0.0645397, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01253686, + "epoch": 0.4572974597925748, + "flos": 16877197203840.0, + "grad_norm": 1.5621441730902494, + "language_loss": 0.73368603, + "learning_rate": 2.371517983373138e-06, + "loss": 0.81090587, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14343262, + "step": 7606, + "time_per_iteration": 2.53171968460083 + }, + { + "auxiliary_loss_clip": 0.06450876, + "auxiliary_loss_mlp": 0.01272472, + "balance_loss_clip": 0.06281146, + "balance_loss_mlp": 0.01257118, + "epoch": 0.45735758304524277, + "flos": 13776530791680.0, + "grad_norm": 2.9980100906386324, + "language_loss": 0.80445778, + "learning_rate": 2.371135293099262e-06, + "loss": 0.88169128, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15356445, + "step": 7607, + "time_per_iteration": 2.4730136394500732 + }, + { + "auxiliary_loss_clip": 0.06449468, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252216, + "epoch": 0.45741770629791073, + "flos": 21106283351040.0, + "grad_norm": 1.9890456967063905, + "language_loss": 0.80849135, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.88565969, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15130615, + "step": 7608, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06445852, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01254576, + "epoch": 0.4574778295505787, + "flos": 23119675182720.0, + "grad_norm": 1.6776975313937859, + "language_loss": 0.68550682, + "learning_rate": 2.370369870345559e-06, + "loss": 0.76264954, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1385498, + "step": 7609, + "time_per_iteration": 2.5249829292297363 + }, + { + "auxiliary_loss_clip": 0.06446596, + "auxiliary_loss_mlp": 0.01267793, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01253917, + "epoch": 0.45753795280324666, + "flos": 24359832737280.0, + "grad_norm": 4.839518120228961, + "language_loss": 0.81053591, + "learning_rate": 2.369987137894757e-06, + "loss": 0.88767982, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13879395, + "step": 7610, + "time_per_iteration": 3.9629292488098145 + }, + { + "auxiliary_loss_clip": 0.06456244, + "auxiliary_loss_mlp": 0.01272187, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01258359, + "epoch": 0.4575980760559146, + "flos": 16659297861120.0, + "grad_norm": 2.22162560638367, + "language_loss": 0.82538879, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.90267307, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13848877, + "step": 7611, + "time_per_iteration": 2.483184337615967 + }, + { + "auxiliary_loss_clip": 0.06450104, + "auxiliary_loss_mlp": 0.01268987, + "balance_loss_clip": 0.06284404, + "balance_loss_mlp": 0.01254753, + "epoch": 0.4576581993085826, + "flos": 35919006860160.0, + "grad_norm": 1.7486456420241998, + "language_loss": 0.73840886, + "learning_rate": 2.369221630917819e-06, + "loss": 0.81559974, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14239502, + "step": 7612, + "time_per_iteration": 2.629122734069824 + }, + { + "auxiliary_loss_clip": 0.06446031, + "auxiliary_loss_mlp": 0.0126785, + "balance_loss_clip": 0.06281702, + "balance_loss_mlp": 0.01253711, + "epoch": 0.45771832256125056, + "flos": 20085995710080.0, + "grad_norm": 1.498537690587119, + "language_loss": 0.85104787, + "learning_rate": 2.368838856420711e-06, + "loss": 0.92818671, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7613, + "time_per_iteration": 2.4995853900909424 + }, + { + "auxiliary_loss_clip": 0.06450839, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.062853, + "balance_loss_mlp": 0.01257458, + "epoch": 0.4577784458139185, + "flos": 10749056520960.0, + "grad_norm": 2.317250545042104, + "language_loss": 0.75818133, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.8354038, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13946533, + "step": 7614, + "time_per_iteration": 2.5512688159942627 + }, + { + "auxiliary_loss_clip": 0.06447698, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06284869, + "balance_loss_mlp": 0.01254513, + "epoch": 0.4578385690665865, + "flos": 21913577112960.0, + "grad_norm": 1.7278714332693421, + "language_loss": 0.7495364, + "learning_rate": 2.368073265481791e-06, + "loss": 0.82670438, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14587402, + "step": 7615, + "time_per_iteration": 2.4959964752197266 + }, + { + "auxiliary_loss_clip": 0.06341572, + "auxiliary_loss_mlp": 0.01260056, + "balance_loss_clip": 0.06266811, + "balance_loss_mlp": 0.01256924, + "epoch": 0.45789869231925445, + "flos": 64774559036160.0, + "grad_norm": 0.7564263714074747, + "language_loss": 0.57682395, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.65284026, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.03129578, + "step": 7616, + "time_per_iteration": 3.1225674152374268 + }, + { + "auxiliary_loss_clip": 0.06451499, + "auxiliary_loss_mlp": 0.01269699, + "balance_loss_clip": 0.06287209, + "balance_loss_mlp": 0.01255299, + "epoch": 0.4579588155719224, + "flos": 16149594274560.0, + "grad_norm": 2.222129623674548, + "language_loss": 0.71319497, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.790407, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.144104, + "step": 7617, + "time_per_iteration": 2.535795211791992 + }, + { + "auxiliary_loss_clip": 0.06453606, + "auxiliary_loss_mlp": 0.01272033, + "balance_loss_clip": 0.06288601, + "balance_loss_mlp": 0.0125749, + "epoch": 0.45801893882459044, + "flos": 21401609466240.0, + "grad_norm": 1.7708953304075432, + "language_loss": 0.7611897, + "learning_rate": 2.36692477442939e-06, + "loss": 0.83844614, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14538574, + "step": 7618, + "time_per_iteration": 2.486976146697998 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01269962, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01256778, + "epoch": 0.4580790620772584, + "flos": 19542609982080.0, + "grad_norm": 1.989312042597275, + "language_loss": 0.76642346, + "learning_rate": 2.366541916231585e-06, + "loss": 0.84365678, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13195801, + "step": 7619, + "time_per_iteration": 2.5505213737487793 + }, + { + "auxiliary_loss_clip": 0.06448688, + "auxiliary_loss_mlp": 0.01269236, + "balance_loss_clip": 0.06287201, + "balance_loss_mlp": 0.01256242, + "epoch": 0.45813918532992637, + "flos": 16586608844160.0, + "grad_norm": 1.7634638926548802, + "language_loss": 0.72444797, + "learning_rate": 2.366159044134473e-06, + "loss": 0.80162722, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13018799, + "step": 7620, + "time_per_iteration": 2.5020828247070312 + }, + { + "auxiliary_loss_clip": 0.06448015, + "auxiliary_loss_mlp": 0.0127207, + "balance_loss_clip": 0.06286486, + "balance_loss_mlp": 0.01259243, + "epoch": 0.45819930858259433, + "flos": 42240085568640.0, + "grad_norm": 2.4478513756868168, + "language_loss": 0.77894747, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8561483, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12835693, + "step": 7621, + "time_per_iteration": 2.7115588188171387 + }, + { + "auxiliary_loss_clip": 0.06339111, + "auxiliary_loss_mlp": 0.01257981, + "balance_loss_clip": 0.06264743, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4582594318352623, + "flos": 63733335073920.0, + "grad_norm": 0.7682856550602313, + "language_loss": 0.64809114, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.72406203, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7622, + "time_per_iteration": 3.13112473487854 + }, + { + "auxiliary_loss_clip": 0.06452725, + "auxiliary_loss_mlp": 0.01272617, + "balance_loss_clip": 0.06286744, + "balance_loss_mlp": 0.01258449, + "epoch": 0.45831955508793026, + "flos": 26877226078080.0, + "grad_norm": 1.7433537302254658, + "language_loss": 0.79958743, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.87684089, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1416626, + "step": 7623, + "time_per_iteration": 2.6407015323638916 + }, + { + "auxiliary_loss_clip": 0.0645254, + "auxiliary_loss_mlp": 0.0127269, + "balance_loss_clip": 0.06285348, + "balance_loss_mlp": 0.01258528, + "epoch": 0.45837967834059823, + "flos": 18739886267520.0, + "grad_norm": 2.305548200028626, + "language_loss": 0.71172595, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.78897822, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14160156, + "step": 7624, + "time_per_iteration": 2.4580042362213135 + }, + { + "auxiliary_loss_clip": 0.06451602, + "auxiliary_loss_mlp": 0.01273069, + "balance_loss_clip": 0.06285381, + "balance_loss_mlp": 0.012593, + "epoch": 0.4584398015932662, + "flos": 21184380956160.0, + "grad_norm": 1.776025787081333, + "language_loss": 0.73132861, + "learning_rate": 2.364244475667491e-06, + "loss": 0.80857527, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13763428, + "step": 7625, + "time_per_iteration": 2.5352139472961426 + }, + { + "auxiliary_loss_clip": 0.06452388, + "auxiliary_loss_mlp": 0.01273572, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.01259857, + "epoch": 0.45849992484593416, + "flos": 19795826620800.0, + "grad_norm": 3.130746647878431, + "language_loss": 0.78340298, + "learning_rate": 2.363861520479451e-06, + "loss": 0.86066258, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.137146, + "step": 7626, + "time_per_iteration": 2.4839165210723877 + }, + { + "auxiliary_loss_clip": 0.06454711, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06286182, + "balance_loss_mlp": 0.01257284, + "epoch": 0.4585600480986021, + "flos": 18229134504960.0, + "grad_norm": 1.6201293476115848, + "language_loss": 0.85071468, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.92797422, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1394043, + "step": 7627, + "time_per_iteration": 2.5822484493255615 + }, + { + "auxiliary_loss_clip": 0.06454201, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06285322, + "balance_loss_mlp": 0.01255634, + "epoch": 0.4586201713512701, + "flos": 29029748814720.0, + "grad_norm": 1.6524494424678404, + "language_loss": 0.69812655, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.77537024, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14544678, + "step": 7628, + "time_per_iteration": 2.5642716884613037 + }, + { + "auxiliary_loss_clip": 0.06450283, + "auxiliary_loss_mlp": 0.01272737, + "balance_loss_clip": 0.06287684, + "balance_loss_mlp": 0.01258492, + "epoch": 0.45868029460393805, + "flos": 23411395572480.0, + "grad_norm": 1.512396631295222, + "language_loss": 0.78590345, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.86313355, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.14245605, + "step": 7629, + "time_per_iteration": 2.5380680561065674 + }, + { + "auxiliary_loss_clip": 0.0645413, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06283213, + "balance_loss_mlp": 0.01258372, + "epoch": 0.458740417856606, + "flos": 18227625131520.0, + "grad_norm": 2.58579854057945, + "language_loss": 0.7964831, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.87376225, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1541748, + "step": 7630, + "time_per_iteration": 2.4736902713775635 + }, + { + "auxiliary_loss_clip": 0.0645593, + "auxiliary_loss_mlp": 0.01273082, + "balance_loss_clip": 0.06288286, + "balance_loss_mlp": 0.01258378, + "epoch": 0.458800541109274, + "flos": 34577341683840.0, + "grad_norm": 2.0263904819558243, + "language_loss": 0.72204614, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.79933631, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14715576, + "step": 7631, + "time_per_iteration": 2.8143060207366943 + }, + { + "auxiliary_loss_clip": 0.06451838, + "auxiliary_loss_mlp": 0.01269985, + "balance_loss_clip": 0.06285281, + "balance_loss_mlp": 0.0125565, + "epoch": 0.458860664361942, + "flos": 17717837690880.0, + "grad_norm": 2.417001672331849, + "language_loss": 0.71850061, + "learning_rate": 2.361563500108531e-06, + "loss": 0.79571879, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14324951, + "step": 7632, + "time_per_iteration": 2.616152048110962 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285533, + "balance_loss_mlp": 0.01258055, + "epoch": 0.45892078761460997, + "flos": 18447746607360.0, + "grad_norm": 2.3994338935229784, + "language_loss": 0.69457287, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.7718581, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7633, + "time_per_iteration": 2.544916868209839 + }, + { + "auxiliary_loss_clip": 0.06450637, + "auxiliary_loss_mlp": 0.01269265, + "balance_loss_clip": 0.06284192, + "balance_loss_mlp": 0.01255055, + "epoch": 0.45898091086727794, + "flos": 22679306449920.0, + "grad_norm": 1.6111707393144439, + "language_loss": 0.81188464, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.88908368, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14208984, + "step": 7634, + "time_per_iteration": 2.508498430252075 + }, + { + "auxiliary_loss_clip": 0.06458217, + "auxiliary_loss_mlp": 0.0127198, + "balance_loss_clip": 0.06285305, + "balance_loss_mlp": 0.01256995, + "epoch": 0.4590410341199459, + "flos": 21659396152320.0, + "grad_norm": 1.6788945577423258, + "language_loss": 0.8141619, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.89146382, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15002441, + "step": 7635, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06450347, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06285377, + "balance_loss_mlp": 0.01258095, + "epoch": 0.45910115737261387, + "flos": 36543676648320.0, + "grad_norm": 1.5202825589824251, + "language_loss": 0.65088654, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.72811085, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13995361, + "step": 7636, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.06449063, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06286588, + "balance_loss_mlp": 0.0125376, + "epoch": 0.45916128062528183, + "flos": 24425771500800.0, + "grad_norm": 1.3857173948582018, + "language_loss": 0.80552399, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.88268924, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13702393, + "step": 7637, + "time_per_iteration": 4.1112189292907715 + }, + { + "auxiliary_loss_clip": 0.06456389, + "auxiliary_loss_mlp": 0.0127208, + "balance_loss_clip": 0.06286228, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4592214038779498, + "flos": 23228687744640.0, + "grad_norm": 2.823234077565048, + "language_loss": 0.75517625, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.83246088, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14990234, + "step": 7638, + "time_per_iteration": 3.910426616668701 + }, + { + "auxiliary_loss_clip": 0.06446041, + "auxiliary_loss_mlp": 0.01269213, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01254824, + "epoch": 0.45928152713061776, + "flos": 19178200575360.0, + "grad_norm": 1.717868731304971, + "language_loss": 0.74023581, + "learning_rate": 2.358881852733989e-06, + "loss": 0.81738836, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14373779, + "step": 7639, + "time_per_iteration": 2.566300630569458 + }, + { + "auxiliary_loss_clip": 0.06454983, + "auxiliary_loss_mlp": 0.01270543, + "balance_loss_clip": 0.06286465, + "balance_loss_mlp": 0.01255165, + "epoch": 0.4593416503832857, + "flos": 22420513514880.0, + "grad_norm": 1.8698154023651474, + "language_loss": 0.683029, + "learning_rate": 2.358498705700346e-06, + "loss": 0.76028425, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15380859, + "step": 7640, + "time_per_iteration": 2.5371484756469727 + }, + { + "auxiliary_loss_clip": 0.06455723, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06285085, + "balance_loss_mlp": 0.01256454, + "epoch": 0.4594017736359537, + "flos": 18886228623360.0, + "grad_norm": 1.657871276405927, + "language_loss": 0.76190329, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.83916861, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14367676, + "step": 7641, + "time_per_iteration": 2.633190631866455 + }, + { + "auxiliary_loss_clip": 0.06450865, + "auxiliary_loss_mlp": 0.01271757, + "balance_loss_clip": 0.06281709, + "balance_loss_mlp": 0.01256749, + "epoch": 0.45946189688862166, + "flos": 20524268090880.0, + "grad_norm": 2.1109400166256753, + "language_loss": 0.75088501, + "learning_rate": 2.357732370864668e-06, + "loss": 0.82811123, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15008545, + "step": 7642, + "time_per_iteration": 2.497342824935913 + }, + { + "auxiliary_loss_clip": 0.06325873, + "auxiliary_loss_mlp": 0.01255986, + "balance_loss_clip": 0.06252096, + "balance_loss_mlp": 0.01253583, + "epoch": 0.4595220201412896, + "flos": 61422436920960.0, + "grad_norm": 0.8082143270085457, + "language_loss": 0.58238232, + "learning_rate": 2.357349183091694e-06, + "loss": 0.65820098, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.02400208, + "step": 7643, + "time_per_iteration": 2.9001851081848145 + }, + { + "auxiliary_loss_clip": 0.06454818, + "auxiliary_loss_mlp": 0.01269178, + "balance_loss_clip": 0.06279951, + "balance_loss_mlp": 0.01254467, + "epoch": 0.4595821433939576, + "flos": 23337616452480.0, + "grad_norm": 1.460564072578963, + "language_loss": 0.93123877, + "learning_rate": 2.3569659817680016e-06, + "loss": 1.00847864, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14709473, + "step": 7644, + "time_per_iteration": 3.956286668777466 + }, + { + "auxiliary_loss_clip": 0.06453376, + "auxiliary_loss_mlp": 0.01272616, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.01258591, + "epoch": 0.4596422666466256, + "flos": 14287492189440.0, + "grad_norm": 2.5856018073831954, + "language_loss": 0.82780254, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.90506244, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 7645, + "time_per_iteration": 2.5230045318603516 + }, + { + "auxiliary_loss_clip": 0.0632263, + "auxiliary_loss_mlp": 0.0125685, + "balance_loss_clip": 0.06249407, + "balance_loss_mlp": 0.01254095, + "epoch": 0.4597023898992936, + "flos": 65747188103040.0, + "grad_norm": 0.7461836102968291, + "language_loss": 0.59904981, + "learning_rate": 2.356199538526593e-06, + "loss": 0.67484462, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.02758789, + "step": 7646, + "time_per_iteration": 3.0677428245544434 + }, + { + "auxiliary_loss_clip": 0.06451902, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.01257931, + "epoch": 0.45976251315196154, + "flos": 26914430090880.0, + "grad_norm": 1.5401961064627432, + "language_loss": 0.72954202, + "learning_rate": 2.355816296637939e-06, + "loss": 0.80678499, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14465332, + "step": 7647, + "time_per_iteration": 2.5715911388397217 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01270403, + "balance_loss_clip": 0.06283608, + "balance_loss_mlp": 0.0125586, + "epoch": 0.4598226364046295, + "flos": 26625854229120.0, + "grad_norm": 1.5262276937698116, + "language_loss": 0.66966379, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.74692625, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14526367, + "step": 7648, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06453076, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.06283541, + "balance_loss_mlp": 0.01256562, + "epoch": 0.45988275965729747, + "flos": 24394395346560.0, + "grad_norm": 1.3937992948207578, + "language_loss": 0.78837889, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.86561614, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14093018, + "step": 7649, + "time_per_iteration": 3.961230754852295 + }, + { + "auxiliary_loss_clip": 0.06449774, + "auxiliary_loss_mlp": 0.01273295, + "balance_loss_clip": 0.06282938, + "balance_loss_mlp": 0.01258221, + "epoch": 0.45994288290996543, + "flos": 24542834054400.0, + "grad_norm": 2.427132979105608, + "language_loss": 0.694453, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.77168369, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15087891, + "step": 7650, + "time_per_iteration": 2.5870516300201416 + }, + { + "auxiliary_loss_clip": 0.06454967, + "auxiliary_loss_mlp": 0.01271386, + "balance_loss_clip": 0.06281558, + "balance_loss_mlp": 0.01255876, + "epoch": 0.4600030061626334, + "flos": 14835573745920.0, + "grad_norm": 2.508823744651641, + "language_loss": 0.84580773, + "learning_rate": 2.354283194302761e-06, + "loss": 0.92307127, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15515137, + "step": 7651, + "time_per_iteration": 2.4682910442352295 + }, + { + "auxiliary_loss_clip": 0.06447899, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.06282218, + "balance_loss_mlp": 0.01255567, + "epoch": 0.46006312941530136, + "flos": 18119702672640.0, + "grad_norm": 2.0398588051370536, + "language_loss": 0.75204146, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.82921767, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14160156, + "step": 7652, + "time_per_iteration": 2.533160448074341 + }, + { + "auxiliary_loss_clip": 0.06453463, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06283025, + "balance_loss_mlp": 0.01253803, + "epoch": 0.46012325266796933, + "flos": 21982157280000.0, + "grad_norm": 1.8219910575186118, + "language_loss": 0.76111704, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.83833146, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14154053, + "step": 7653, + "time_per_iteration": 2.607556104660034 + }, + { + "auxiliary_loss_clip": 0.06466014, + "auxiliary_loss_mlp": 0.01279742, + "balance_loss_clip": 0.06286691, + "balance_loss_mlp": 0.01262618, + "epoch": 0.4601833759206373, + "flos": 15273468783360.0, + "grad_norm": 1.9930521100890286, + "language_loss": 0.66339052, + "learning_rate": 2.353133226438741e-06, + "loss": 0.74084806, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.17132568, + "step": 7654, + "time_per_iteration": 2.5845115184783936 + }, + { + "auxiliary_loss_clip": 0.06450775, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01260524, + "epoch": 0.46024349917330526, + "flos": 27096299377920.0, + "grad_norm": 1.834954182024095, + "language_loss": 0.79552221, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.87276679, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1315918, + "step": 7655, + "time_per_iteration": 2.5619075298309326 + }, + { + "auxiliary_loss_clip": 0.06446843, + "auxiliary_loss_mlp": 0.01271784, + "balance_loss_clip": 0.06282479, + "balance_loss_mlp": 0.0125795, + "epoch": 0.4603036224259732, + "flos": 24469935402240.0, + "grad_norm": 1.525008853184554, + "language_loss": 0.68020397, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.7573902, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13824463, + "step": 7656, + "time_per_iteration": 2.534085988998413 + }, + { + "auxiliary_loss_clip": 0.06450829, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254249, + "epoch": 0.4603637456786412, + "flos": 28116545091840.0, + "grad_norm": 1.6883930229899933, + "language_loss": 0.81940675, + "learning_rate": 2.351983138057098e-06, + "loss": 0.89660037, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14300537, + "step": 7657, + "time_per_iteration": 2.6093909740448 + }, + { + "auxiliary_loss_clip": 0.06452166, + "auxiliary_loss_mlp": 0.01272452, + "balance_loss_clip": 0.06283732, + "balance_loss_mlp": 0.01257598, + "epoch": 0.4604238689313092, + "flos": 24355178835840.0, + "grad_norm": 1.9081069655960825, + "language_loss": 0.70684779, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.78409398, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1484375, + "step": 7658, + "time_per_iteration": 2.5257532596588135 + }, + { + "auxiliary_loss_clip": 0.06333129, + "auxiliary_loss_mlp": 0.01254207, + "balance_loss_clip": 0.06259783, + "balance_loss_mlp": 0.01251698, + "epoch": 0.4604839921839772, + "flos": 53622742337280.0, + "grad_norm": 1.3056028191134426, + "language_loss": 0.6180622, + "learning_rate": 2.351216345708928e-06, + "loss": 0.69393557, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02508545, + "step": 7659, + "time_per_iteration": 3.2051191329956055 + }, + { + "auxiliary_loss_clip": 0.06450778, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06284198, + "balance_loss_mlp": 0.01254692, + "epoch": 0.46054411543664514, + "flos": 31256428014720.0, + "grad_norm": 1.6821089703035916, + "language_loss": 0.68614, + "learning_rate": 2.350832929550336e-06, + "loss": 0.76335192, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1572876, + "step": 7660, + "time_per_iteration": 2.5768120288848877 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06285393, + "balance_loss_mlp": 0.01254843, + "epoch": 0.4606042386893131, + "flos": 24098943450240.0, + "grad_norm": 1.8024702284570222, + "language_loss": 0.76982367, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.84707713, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14782715, + "step": 7661, + "time_per_iteration": 2.5556533336639404 + }, + { + "auxiliary_loss_clip": 0.06448123, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06284644, + "balance_loss_mlp": 0.01257511, + "epoch": 0.46066436194198107, + "flos": 26585715323520.0, + "grad_norm": 1.64374674726695, + "language_loss": 0.75330603, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.8304925, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13000488, + "step": 7662, + "time_per_iteration": 2.5430636405944824 + }, + { + "auxiliary_loss_clip": 0.064645, + "auxiliary_loss_mlp": 0.01271435, + "balance_loss_clip": 0.06287506, + "balance_loss_mlp": 0.01255807, + "epoch": 0.46072448519464904, + "flos": 17779751458560.0, + "grad_norm": 2.8997354943734144, + "language_loss": 0.79542935, + "learning_rate": 2.349682601310998e-06, + "loss": 0.87278873, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15625, + "step": 7663, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.06451327, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.0628781, + "balance_loss_mlp": 0.01256344, + "epoch": 0.460784608447317, + "flos": 15091557569280.0, + "grad_norm": 1.9500633364095115, + "language_loss": 0.73664737, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.81386459, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.14050293, + "step": 7664, + "time_per_iteration": 2.5058319568634033 + }, + { + "auxiliary_loss_clip": 0.06454196, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06286658, + "balance_loss_mlp": 0.01255403, + "epoch": 0.46084473169998497, + "flos": 18594214744320.0, + "grad_norm": 1.4541358898310397, + "language_loss": 0.72731769, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.80455625, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14257812, + "step": 7665, + "time_per_iteration": 2.5651309490203857 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.01269476, + "balance_loss_clip": 0.06283794, + "balance_loss_mlp": 0.01255016, + "epoch": 0.46090485495265293, + "flos": 19499955454080.0, + "grad_norm": 1.6858212343920378, + "language_loss": 0.78057897, + "learning_rate": 2.348532153731669e-06, + "loss": 0.85779405, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14459229, + "step": 7666, + "time_per_iteration": 2.4884724617004395 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06288874, + "balance_loss_mlp": 0.01262982, + "epoch": 0.4609649782053209, + "flos": 33373339966080.0, + "grad_norm": 1.3323556356345916, + "language_loss": 0.7438637, + "learning_rate": 2.348148644753088e-06, + "loss": 0.82119334, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15270996, + "step": 7667, + "time_per_iteration": 2.6961426734924316 + }, + { + "auxiliary_loss_clip": 0.06450665, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06283414, + "balance_loss_mlp": 0.01253574, + "epoch": 0.46102510145798886, + "flos": 23775972687360.0, + "grad_norm": 1.463924526715157, + "language_loss": 0.76157856, + "learning_rate": 2.347765122572676e-06, + "loss": 0.83875835, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1373291, + "step": 7668, + "time_per_iteration": 2.517401933670044 + }, + { + "auxiliary_loss_clip": 0.06446877, + "auxiliary_loss_mlp": 0.0126819, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01254982, + "epoch": 0.4610852247106568, + "flos": 23301544469760.0, + "grad_norm": 1.5533292001822034, + "language_loss": 0.78315312, + "learning_rate": 2.347381587204975e-06, + "loss": 0.86030376, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13208008, + "step": 7669, + "time_per_iteration": 2.58445405960083 + }, + { + "auxiliary_loss_clip": 0.06450041, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06282575, + "balance_loss_mlp": 0.01251286, + "epoch": 0.4611453479633248, + "flos": 25454528403840.0, + "grad_norm": 1.739851036429443, + "language_loss": 0.83272684, + "learning_rate": 2.34699803866453e-06, + "loss": 0.90987396, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13391113, + "step": 7670, + "time_per_iteration": 2.5387001037597656 + }, + { + "auxiliary_loss_clip": 0.06451756, + "auxiliary_loss_mlp": 0.01270534, + "balance_loss_clip": 0.06288445, + "balance_loss_mlp": 0.01257129, + "epoch": 0.4612054712159928, + "flos": 21145541788800.0, + "grad_norm": 1.8274954721629995, + "language_loss": 0.63656652, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.7137894, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1340332, + "step": 7671, + "time_per_iteration": 2.5336413383483887 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251787, + "balance_loss_clip": 0.0626289, + "balance_loss_mlp": 0.01249119, + "epoch": 0.4612655944686608, + "flos": 69979754194560.0, + "grad_norm": 0.792480479203595, + "language_loss": 0.55791217, + "learning_rate": 2.346230902123583e-06, + "loss": 0.63378698, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.02670288, + "step": 7672, + "time_per_iteration": 3.2302184104919434 + }, + { + "auxiliary_loss_clip": 0.06453065, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06283592, + "balance_loss_mlp": 0.01253213, + "epoch": 0.46132571772132874, + "flos": 16842844229760.0, + "grad_norm": 2.026723370874256, + "language_loss": 0.71486014, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.79206014, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13720703, + "step": 7673, + "time_per_iteration": 2.5307891368865967 + }, + { + "auxiliary_loss_clip": 0.06444372, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06280223, + "balance_loss_mlp": 0.01254014, + "epoch": 0.4613858409739967, + "flos": 35817666946560.0, + "grad_norm": 1.6118988477871892, + "language_loss": 0.70779812, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7849164, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13446045, + "step": 7674, + "time_per_iteration": 2.67787766456604 + }, + { + "auxiliary_loss_clip": 0.06445141, + "auxiliary_loss_mlp": 0.01269162, + "balance_loss_clip": 0.06278897, + "balance_loss_mlp": 0.01255554, + "epoch": 0.4614459642266647, + "flos": 35276251789440.0, + "grad_norm": 1.4817902433092767, + "language_loss": 0.65456873, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.73171175, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1362915, + "step": 7675, + "time_per_iteration": 2.683043956756592 + }, + { + "auxiliary_loss_clip": 0.06330552, + "auxiliary_loss_mlp": 0.01253837, + "balance_loss_clip": 0.06257802, + "balance_loss_mlp": 0.01251083, + "epoch": 0.46150608747933264, + "flos": 66723311842560.0, + "grad_norm": 0.7159632658119685, + "language_loss": 0.58438665, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.66023052, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02758789, + "step": 7676, + "time_per_iteration": 3.2052080631256104 + }, + { + "auxiliary_loss_clip": 0.06331712, + "auxiliary_loss_mlp": 0.01253621, + "balance_loss_clip": 0.06258753, + "balance_loss_mlp": 0.01250806, + "epoch": 0.4615662107320006, + "flos": 55846780133760.0, + "grad_norm": 0.7666580083801284, + "language_loss": 0.62806678, + "learning_rate": 2.344312831266341e-06, + "loss": 0.70392013, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02810669, + "step": 7677, + "time_per_iteration": 5.753543853759766 + }, + { + "auxiliary_loss_clip": 0.06441256, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06278154, + "balance_loss_mlp": 0.012564, + "epoch": 0.46162633398466857, + "flos": 15488055889920.0, + "grad_norm": 2.0928007642005224, + "language_loss": 0.7694543, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.84655911, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12817383, + "step": 7678, + "time_per_iteration": 2.5979206562042236 + }, + { + "auxiliary_loss_clip": 0.06447493, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06279032, + "balance_loss_mlp": 0.01253672, + "epoch": 0.46168645723733653, + "flos": 20017667105280.0, + "grad_norm": 1.9130482273301792, + "language_loss": 0.66792345, + "learning_rate": 2.343545511426974e-06, + "loss": 0.74506873, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13360596, + "step": 7679, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06445532, + "auxiliary_loss_mlp": 0.0127232, + "balance_loss_clip": 0.06279338, + "balance_loss_mlp": 0.01259409, + "epoch": 0.4617465804900045, + "flos": 20304020833920.0, + "grad_norm": 2.6299917180378203, + "language_loss": 0.702595, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.77977353, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.12921143, + "step": 7680, + "time_per_iteration": 2.475419282913208 + }, + { + "auxiliary_loss_clip": 0.06449848, + "auxiliary_loss_mlp": 0.01274843, + "balance_loss_clip": 0.06279959, + "balance_loss_mlp": 0.01260454, + "epoch": 0.46180670374267246, + "flos": 22352897669760.0, + "grad_norm": 1.6539051623213383, + "language_loss": 0.63903129, + "learning_rate": 2.342778139478487e-06, + "loss": 0.7162782, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14398193, + "step": 7681, + "time_per_iteration": 2.518878698348999 + }, + { + "auxiliary_loss_clip": 0.06438938, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01255566, + "epoch": 0.46186682699534043, + "flos": 19900856113920.0, + "grad_norm": 1.5795449228659066, + "language_loss": 0.67458999, + "learning_rate": 2.342394433999697e-06, + "loss": 0.75165695, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12194824, + "step": 7682, + "time_per_iteration": 2.4734294414520264 + }, + { + "auxiliary_loss_clip": 0.06442823, + "auxiliary_loss_mlp": 0.01267731, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4619269502480084, + "flos": 31511573297280.0, + "grad_norm": 2.0778412213868025, + "language_loss": 0.74573362, + "learning_rate": 2.342010715537275e-06, + "loss": 0.82283914, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1317749, + "step": 7683, + "time_per_iteration": 2.5680744647979736 + }, + { + "auxiliary_loss_clip": 0.0644316, + "auxiliary_loss_mlp": 0.01269615, + "balance_loss_clip": 0.06278165, + "balance_loss_mlp": 0.01255995, + "epoch": 0.46198707350067636, + "flos": 25016465658240.0, + "grad_norm": 2.034673139361796, + "language_loss": 0.77701104, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.85413885, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13604736, + "step": 7684, + "time_per_iteration": 3.9865663051605225 + }, + { + "auxiliary_loss_clip": 0.06455924, + "auxiliary_loss_mlp": 0.01269534, + "balance_loss_clip": 0.06282193, + "balance_loss_mlp": 0.01255074, + "epoch": 0.4620471967533444, + "flos": 18297588890880.0, + "grad_norm": 1.7679070884814239, + "language_loss": 0.79849184, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.87574637, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14471436, + "step": 7685, + "time_per_iteration": 2.4874165058135986 + }, + { + "auxiliary_loss_clip": 0.06442665, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06282581, + "balance_loss_mlp": 0.01254151, + "epoch": 0.46210732000601235, + "flos": 33993607415040.0, + "grad_norm": 2.697729181890728, + "language_loss": 0.66966581, + "learning_rate": 2.340859482393731e-06, + "loss": 0.74678075, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14678955, + "step": 7686, + "time_per_iteration": 2.673029661178589 + }, + { + "auxiliary_loss_clip": 0.06450719, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06281859, + "balance_loss_mlp": 0.01255929, + "epoch": 0.4621674432586803, + "flos": 25016381804160.0, + "grad_norm": 1.8957956969587364, + "language_loss": 0.7416718, + "learning_rate": 2.340475712142296e-06, + "loss": 0.81888342, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 7687, + "time_per_iteration": 2.520526885986328 + }, + { + "auxiliary_loss_clip": 0.06441881, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254943, + "epoch": 0.4622275665113483, + "flos": 22019906344320.0, + "grad_norm": 2.1641165257521098, + "language_loss": 0.75034606, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.82745045, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13623047, + "step": 7688, + "time_per_iteration": 2.6087183952331543 + }, + { + "auxiliary_loss_clip": 0.06442745, + "auxiliary_loss_mlp": 0.01266791, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.0125375, + "epoch": 0.46228768976401624, + "flos": 24065303235840.0, + "grad_norm": 1.76695871159964, + "language_loss": 0.78822517, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.86532056, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13043213, + "step": 7689, + "time_per_iteration": 4.008488416671753 + }, + { + "auxiliary_loss_clip": 0.0644816, + "auxiliary_loss_mlp": 0.01269125, + "balance_loss_clip": 0.06278446, + "balance_loss_mlp": 0.01254116, + "epoch": 0.4623478130166842, + "flos": 26658655902720.0, + "grad_norm": 2.4003711776889936, + "language_loss": 0.56824899, + "learning_rate": 2.339324323980964e-06, + "loss": 0.6454218, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15020752, + "step": 7690, + "time_per_iteration": 2.586726665496826 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01270548, + "balance_loss_clip": 0.06281572, + "balance_loss_mlp": 0.01256421, + "epoch": 0.46240793626935217, + "flos": 20564700485760.0, + "grad_norm": 2.1153050114919387, + "language_loss": 0.83470464, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.91190875, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14135742, + "step": 7691, + "time_per_iteration": 2.5688517093658447 + }, + { + "auxiliary_loss_clip": 0.06446303, + "auxiliary_loss_mlp": 0.01268112, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01254528, + "epoch": 0.46246805952202014, + "flos": 22462706845440.0, + "grad_norm": 1.4394066258336355, + "language_loss": 0.75601387, + "learning_rate": 2.338556667513091e-06, + "loss": 0.83315802, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13604736, + "step": 7692, + "time_per_iteration": 2.537447929382324 + }, + { + "auxiliary_loss_clip": 0.06447245, + "auxiliary_loss_mlp": 0.01269367, + "balance_loss_clip": 0.06279314, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4625281827746881, + "flos": 35049673549440.0, + "grad_norm": 1.4816622996820314, + "language_loss": 0.74488908, + "learning_rate": 2.338172820014723e-06, + "loss": 0.82205522, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14038086, + "step": 7693, + "time_per_iteration": 2.655733823776245 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01269271, + "balance_loss_clip": 0.06283827, + "balance_loss_mlp": 0.01255496, + "epoch": 0.46258830602735607, + "flos": 21074907196800.0, + "grad_norm": 1.4111581138712515, + "language_loss": 0.85637844, + "learning_rate": 2.337788959692808e-06, + "loss": 0.93355894, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13781738, + "step": 7694, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06447286, + "auxiliary_loss_mlp": 0.01268569, + "balance_loss_clip": 0.06280261, + "balance_loss_mlp": 0.01254979, + "epoch": 0.46264842928002403, + "flos": 26184437320320.0, + "grad_norm": 2.8233556574725744, + "language_loss": 0.79577935, + "learning_rate": 2.337405086561902e-06, + "loss": 0.87293792, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13586426, + "step": 7695, + "time_per_iteration": 2.569974660873413 + }, + { + "auxiliary_loss_clip": 0.06442414, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.0628098, + "balance_loss_mlp": 0.01258432, + "epoch": 0.462708552532692, + "flos": 16769903650560.0, + "grad_norm": 1.6398131561505984, + "language_loss": 0.72464627, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.80177617, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12133789, + "step": 7696, + "time_per_iteration": 2.49324369430542 + }, + { + "auxiliary_loss_clip": 0.06448425, + "auxiliary_loss_mlp": 0.01269091, + "balance_loss_clip": 0.06281986, + "balance_loss_mlp": 0.01256139, + "epoch": 0.46276867578535996, + "flos": 15565985786880.0, + "grad_norm": 1.5682310460433448, + "language_loss": 0.69151074, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.76868594, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12945557, + "step": 7697, + "time_per_iteration": 2.5437402725219727 + }, + { + "auxiliary_loss_clip": 0.06445374, + "auxiliary_loss_mlp": 0.01272368, + "balance_loss_clip": 0.06278891, + "balance_loss_mlp": 0.01258903, + "epoch": 0.462828799038028, + "flos": 22421352055680.0, + "grad_norm": 2.477481810490018, + "language_loss": 0.84870285, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.92588031, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13470459, + "step": 7698, + "time_per_iteration": 2.5088558197021484 + }, + { + "auxiliary_loss_clip": 0.06449191, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06284188, + "balance_loss_mlp": 0.01255883, + "epoch": 0.46288892229069595, + "flos": 21075997299840.0, + "grad_norm": 1.5978854439043657, + "language_loss": 0.71711451, + "learning_rate": 2.335869466239502e-06, + "loss": 0.79430336, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13824463, + "step": 7699, + "time_per_iteration": 2.572908639907837 + }, + { + "auxiliary_loss_clip": 0.06453253, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06283245, + "balance_loss_mlp": 0.01253952, + "epoch": 0.4629490455433639, + "flos": 23192448053760.0, + "grad_norm": 3.9296940778908724, + "language_loss": 0.71994227, + "learning_rate": 2.335485529281996e-06, + "loss": 0.79715955, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7700, + "time_per_iteration": 2.5155210494995117 + }, + { + "auxiliary_loss_clip": 0.06446292, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.0628306, + "balance_loss_mlp": 0.01258608, + "epoch": 0.4630091687960319, + "flos": 18840178005120.0, + "grad_norm": 2.0219592023308297, + "language_loss": 0.72735655, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.80453324, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12780762, + "step": 7701, + "time_per_iteration": 2.5208041667938232 + }, + { + "auxiliary_loss_clip": 0.06455772, + "auxiliary_loss_mlp": 0.01272275, + "balance_loss_clip": 0.06285252, + "balance_loss_mlp": 0.01258768, + "epoch": 0.46306929204869984, + "flos": 38915733882240.0, + "grad_norm": 1.8677153728043454, + "language_loss": 0.64857763, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.72585809, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13519287, + "step": 7702, + "time_per_iteration": 2.6274476051330566 + }, + { + "auxiliary_loss_clip": 0.06443912, + "auxiliary_loss_mlp": 0.01267806, + "balance_loss_clip": 0.06281176, + "balance_loss_mlp": 0.01255181, + "epoch": 0.4631294153013678, + "flos": 19649945462400.0, + "grad_norm": 1.8702283374659314, + "language_loss": 0.73327863, + "learning_rate": 2.33433364213785e-06, + "loss": 0.81039578, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12640381, + "step": 7703, + "time_per_iteration": 2.505009651184082 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272434, + "balance_loss_clip": 0.0628607, + "balance_loss_mlp": 0.0125776, + "epoch": 0.4631895385540358, + "flos": 24615187655040.0, + "grad_norm": 1.7291559958554978, + "language_loss": 0.68770319, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.76499313, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14666748, + "step": 7704, + "time_per_iteration": 2.5337138175964355 + }, + { + "auxiliary_loss_clip": 0.06456052, + "auxiliary_loss_mlp": 0.01269056, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255693, + "epoch": 0.46324966180670374, + "flos": 26326838534400.0, + "grad_norm": 2.021774763699282, + "language_loss": 0.81483209, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.89208323, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13378906, + "step": 7705, + "time_per_iteration": 2.612663745880127 + }, + { + "auxiliary_loss_clip": 0.06459744, + "auxiliary_loss_mlp": 0.01269987, + "balance_loss_clip": 0.06288762, + "balance_loss_mlp": 0.01256313, + "epoch": 0.4633097850593717, + "flos": 19245816420480.0, + "grad_norm": 1.7146225700720175, + "language_loss": 0.77885628, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.85615361, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13684082, + "step": 7706, + "time_per_iteration": 2.508925437927246 + }, + { + "auxiliary_loss_clip": 0.06446654, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01256254, + "epoch": 0.46336990831203967, + "flos": 22789660677120.0, + "grad_norm": 1.8229249281456994, + "language_loss": 0.70008546, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.77725136, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13671875, + "step": 7707, + "time_per_iteration": 2.5517148971557617 + }, + { + "auxiliary_loss_clip": 0.06460145, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06290638, + "balance_loss_mlp": 0.01255716, + "epoch": 0.46343003156470763, + "flos": 38218668566400.0, + "grad_norm": 2.701141573629833, + "language_loss": 0.61044616, + "learning_rate": 2.332413576865791e-06, + "loss": 0.68774569, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14093018, + "step": 7708, + "time_per_iteration": 2.6566975116729736 + }, + { + "auxiliary_loss_clip": 0.06457859, + "auxiliary_loss_mlp": 0.01269726, + "balance_loss_clip": 0.06291145, + "balance_loss_mlp": 0.01255946, + "epoch": 0.4634901548173756, + "flos": 31946156098560.0, + "grad_norm": 2.0418964495503125, + "language_loss": 0.77915132, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.85642713, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13781738, + "step": 7709, + "time_per_iteration": 2.6596858501434326 + }, + { + "auxiliary_loss_clip": 0.06459823, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06291819, + "balance_loss_mlp": 0.01256756, + "epoch": 0.46355027807004356, + "flos": 20088469405440.0, + "grad_norm": 1.5745013311626586, + "language_loss": 0.77581245, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.85312593, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.14764404, + "step": 7710, + "time_per_iteration": 2.5101842880249023 + }, + { + "auxiliary_loss_clip": 0.06457606, + "auxiliary_loss_mlp": 0.01274408, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01260151, + "epoch": 0.4636104013227116, + "flos": 24068280055680.0, + "grad_norm": 2.3601088939338086, + "language_loss": 0.73606086, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.81338096, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14257812, + "step": 7711, + "time_per_iteration": 2.590855598449707 + }, + { + "auxiliary_loss_clip": 0.06459524, + "auxiliary_loss_mlp": 0.01272046, + "balance_loss_clip": 0.06293879, + "balance_loss_mlp": 0.01257354, + "epoch": 0.46367052457537955, + "flos": 23921392648320.0, + "grad_norm": 1.4235356855228358, + "language_loss": 0.71632046, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7936362, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14685059, + "step": 7712, + "time_per_iteration": 2.524653434753418 + }, + { + "auxiliary_loss_clip": 0.06464949, + "auxiliary_loss_mlp": 0.01272658, + "balance_loss_clip": 0.06290694, + "balance_loss_mlp": 0.01257709, + "epoch": 0.4637306478280475, + "flos": 26403846036480.0, + "grad_norm": 2.2505033505731493, + "language_loss": 0.73737693, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.81475306, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14941406, + "step": 7713, + "time_per_iteration": 2.5624618530273438 + }, + { + "auxiliary_loss_clip": 0.06466722, + "auxiliary_loss_mlp": 0.01276857, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01261372, + "epoch": 0.4637907710807155, + "flos": 21987104670720.0, + "grad_norm": 1.4954624193011212, + "language_loss": 0.58918363, + "learning_rate": 2.3301090827294e-06, + "loss": 0.66661942, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15466309, + "step": 7714, + "time_per_iteration": 2.510551929473877 + }, + { + "auxiliary_loss_clip": 0.06456332, + "auxiliary_loss_mlp": 0.01271959, + "balance_loss_clip": 0.06290398, + "balance_loss_mlp": 0.01257427, + "epoch": 0.46385089433338345, + "flos": 12427234894080.0, + "grad_norm": 2.7033660685293186, + "language_loss": 0.70470357, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.78198647, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14538574, + "step": 7715, + "time_per_iteration": 2.533158779144287 + }, + { + "auxiliary_loss_clip": 0.06470867, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06294338, + "balance_loss_mlp": 0.01255731, + "epoch": 0.4639110175860514, + "flos": 23922692386560.0, + "grad_norm": 1.7790063066577455, + "language_loss": 0.68472731, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.762137, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14355469, + "step": 7716, + "time_per_iteration": 4.020689249038696 + }, + { + "auxiliary_loss_clip": 0.06459275, + "auxiliary_loss_mlp": 0.01270908, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01255858, + "epoch": 0.4639711408387194, + "flos": 25307263653120.0, + "grad_norm": 1.603260424737227, + "language_loss": 0.81029081, + "learning_rate": 2.328956666474691e-06, + "loss": 0.88759267, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1505127, + "step": 7717, + "time_per_iteration": 3.932593584060669 + }, + { + "auxiliary_loss_clip": 0.06454346, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06284629, + "balance_loss_mlp": 0.01258127, + "epoch": 0.46403126409138734, + "flos": 21217643827200.0, + "grad_norm": 1.6983648240686933, + "language_loss": 0.73560178, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.81287599, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14929199, + "step": 7718, + "time_per_iteration": 2.567814350128174 + }, + { + "auxiliary_loss_clip": 0.06461985, + "auxiliary_loss_mlp": 0.0127191, + "balance_loss_clip": 0.06294554, + "balance_loss_mlp": 0.01257384, + "epoch": 0.4640913873440553, + "flos": 35854325907840.0, + "grad_norm": 1.9528130818693374, + "language_loss": 0.70908272, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.78642172, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14526367, + "step": 7719, + "time_per_iteration": 2.6412456035614014 + }, + { + "auxiliary_loss_clip": 0.06458225, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06287955, + "balance_loss_mlp": 0.01258793, + "epoch": 0.46415151059672327, + "flos": 19171282613760.0, + "grad_norm": 2.2400961683609473, + "language_loss": 0.86823237, + "learning_rate": 2.327804137953357e-06, + "loss": 0.94553995, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13745117, + "step": 7720, + "time_per_iteration": 2.5479180812835693 + }, + { + "auxiliary_loss_clip": 0.06346954, + "auxiliary_loss_mlp": 0.01257869, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.01255387, + "epoch": 0.46421163384939124, + "flos": 58932841207680.0, + "grad_norm": 0.7060507258277461, + "language_loss": 0.54935473, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.62540293, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02481079, + "step": 7721, + "time_per_iteration": 3.185922861099243 + }, + { + "auxiliary_loss_clip": 0.06453753, + "auxiliary_loss_mlp": 0.01271222, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01257227, + "epoch": 0.4642717571020592, + "flos": 20163590190720.0, + "grad_norm": 1.901448408880664, + "language_loss": 0.80108112, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.87833083, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13995361, + "step": 7722, + "time_per_iteration": 2.524707317352295 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06282455, + "balance_loss_mlp": 0.0125627, + "epoch": 0.46433188035472717, + "flos": 25053208473600.0, + "grad_norm": 1.90118065677523, + "language_loss": 0.78278601, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.86003315, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1427002, + "step": 7723, + "time_per_iteration": 3.9820849895477295 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01267351, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01253046, + "epoch": 0.4643920036073952, + "flos": 28083366074880.0, + "grad_norm": 1.6378874340525207, + "language_loss": 0.68861282, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7657671, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14306641, + "step": 7724, + "time_per_iteration": 2.550832748413086 + }, + { + "auxiliary_loss_clip": 0.06449208, + "auxiliary_loss_mlp": 0.01272875, + "balance_loss_clip": 0.06283656, + "balance_loss_mlp": 0.01259297, + "epoch": 0.46445212686006315, + "flos": 18375267225600.0, + "grad_norm": 2.354559005563411, + "language_loss": 0.67722934, + "learning_rate": 2.325883008671415e-06, + "loss": 0.7544502, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13592529, + "step": 7725, + "time_per_iteration": 2.534698009490967 + }, + { + "auxiliary_loss_clip": 0.0644237, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01258108, + "epoch": 0.4645122501127311, + "flos": 31729514567040.0, + "grad_norm": 1.5959059771038482, + "language_loss": 0.65303701, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.73016763, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12585449, + "step": 7726, + "time_per_iteration": 2.6071393489837646 + }, + { + "auxiliary_loss_clip": 0.06452325, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06286149, + "balance_loss_mlp": 0.01255312, + "epoch": 0.4645723733653991, + "flos": 23775553416960.0, + "grad_norm": 2.198219591713496, + "language_loss": 0.75535023, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.83256185, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13525391, + "step": 7727, + "time_per_iteration": 2.5323383808135986 + }, + { + "auxiliary_loss_clip": 0.06449004, + "auxiliary_loss_mlp": 0.01272292, + "balance_loss_clip": 0.06281407, + "balance_loss_mlp": 0.01258166, + "epoch": 0.46463249661806705, + "flos": 33153805468800.0, + "grad_norm": 1.912145195790545, + "language_loss": 0.78694946, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.86416245, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14147949, + "step": 7728, + "time_per_iteration": 3.998812437057495 + }, + { + "auxiliary_loss_clip": 0.06450211, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06282613, + "balance_loss_mlp": 0.0125658, + "epoch": 0.464692619870735, + "flos": 18301865448960.0, + "grad_norm": 2.3670866338465295, + "language_loss": 0.76134968, + "learning_rate": 2.324345882723155e-06, + "loss": 0.83855414, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13659668, + "step": 7729, + "time_per_iteration": 2.459913730621338 + }, + { + "auxiliary_loss_clip": 0.06449223, + "auxiliary_loss_mlp": 0.01270726, + "balance_loss_clip": 0.06283462, + "balance_loss_mlp": 0.01257339, + "epoch": 0.464752743123403, + "flos": 22644659986560.0, + "grad_norm": 1.7402612149106196, + "language_loss": 0.80316758, + "learning_rate": 2.323961570451588e-06, + "loss": 0.88036704, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13378906, + "step": 7730, + "time_per_iteration": 2.5472798347473145 + }, + { + "auxiliary_loss_clip": 0.06447513, + "auxiliary_loss_mlp": 0.01272657, + "balance_loss_clip": 0.06282953, + "balance_loss_mlp": 0.01258924, + "epoch": 0.46481286637607094, + "flos": 20418316202880.0, + "grad_norm": 1.544685409716396, + "language_loss": 0.77440143, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.85160315, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13726807, + "step": 7731, + "time_per_iteration": 2.539971351623535 + }, + { + "auxiliary_loss_clip": 0.06444095, + "auxiliary_loss_mlp": 0.01267001, + "balance_loss_clip": 0.06280014, + "balance_loss_mlp": 0.01253984, + "epoch": 0.4648729896287389, + "flos": 34283692650240.0, + "grad_norm": 1.8393249998070078, + "language_loss": 0.66022158, + "learning_rate": 2.323192909069061e-06, + "loss": 0.73733258, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13006592, + "step": 7732, + "time_per_iteration": 2.6860389709472656 + }, + { + "auxiliary_loss_clip": 0.0645274, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01254474, + "epoch": 0.4649331128814069, + "flos": 21327704565120.0, + "grad_norm": 2.1920635353287157, + "language_loss": 0.73225021, + "learning_rate": 2.32280855998725e-06, + "loss": 0.8094635, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14123535, + "step": 7733, + "time_per_iteration": 2.4875564575195312 + }, + { + "auxiliary_loss_clip": 0.06338679, + "auxiliary_loss_mlp": 0.01252754, + "balance_loss_clip": 0.0626616, + "balance_loss_mlp": 0.0124981, + "epoch": 0.46499323613407484, + "flos": 58325082744960.0, + "grad_norm": 1.3051386869973822, + "language_loss": 0.52022988, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.5961442, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02941895, + "step": 7734, + "time_per_iteration": 3.0869898796081543 + }, + { + "auxiliary_loss_clip": 0.0644846, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06283916, + "balance_loss_mlp": 0.01257856, + "epoch": 0.4650533593867428, + "flos": 10894308773760.0, + "grad_norm": 2.170877243914886, + "language_loss": 0.75776118, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.83495891, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13464355, + "step": 7735, + "time_per_iteration": 2.478837490081787 + }, + { + "auxiliary_loss_clip": 0.06441534, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.0627993, + "balance_loss_mlp": 0.01255009, + "epoch": 0.46511348263941077, + "flos": 19980756581760.0, + "grad_norm": 2.0032469234086507, + "language_loss": 0.6994068, + "learning_rate": 2.321655439354519e-06, + "loss": 0.77650702, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13482666, + "step": 7736, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.06442849, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.0628303, + "balance_loss_mlp": 0.01256237, + "epoch": 0.46517360589207873, + "flos": 19683795312000.0, + "grad_norm": 1.6634794649969447, + "language_loss": 0.72674608, + "learning_rate": 2.321271041396427e-06, + "loss": 0.80385697, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12005615, + "step": 7737, + "time_per_iteration": 2.5038952827453613 + }, + { + "auxiliary_loss_clip": 0.06449911, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01254603, + "epoch": 0.46523372914474675, + "flos": 16878203452800.0, + "grad_norm": 1.9711860161800356, + "language_loss": 0.84095049, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.91813183, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1361084, + "step": 7738, + "time_per_iteration": 2.5216240882873535 + }, + { + "auxiliary_loss_clip": 0.06338458, + "auxiliary_loss_mlp": 0.01253722, + "balance_loss_clip": 0.06265976, + "balance_loss_mlp": 0.01250617, + "epoch": 0.4652938523974147, + "flos": 53458188917760.0, + "grad_norm": 0.7399188166866549, + "language_loss": 0.57646966, + "learning_rate": 2.320502208946932e-06, + "loss": 0.65239149, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.03102112, + "step": 7739, + "time_per_iteration": 3.215662717819214 + }, + { + "auxiliary_loss_clip": 0.06450304, + "auxiliary_loss_mlp": 0.01271295, + "balance_loss_clip": 0.06285876, + "balance_loss_mlp": 0.01257299, + "epoch": 0.4653539756500827, + "flos": 15236642113920.0, + "grad_norm": 1.7449085109148506, + "language_loss": 0.85184145, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.92905748, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14013672, + "step": 7740, + "time_per_iteration": 2.4736168384552 + }, + { + "auxiliary_loss_clip": 0.0644415, + "auxiliary_loss_mlp": 0.01270653, + "balance_loss_clip": 0.06281894, + "balance_loss_mlp": 0.01256706, + "epoch": 0.46541409890275065, + "flos": 23738978309760.0, + "grad_norm": 1.5125636475233326, + "language_loss": 0.76338875, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.84053683, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1394043, + "step": 7741, + "time_per_iteration": 2.56061053276062 + }, + { + "auxiliary_loss_clip": 0.06456167, + "auxiliary_loss_mlp": 0.01268672, + "balance_loss_clip": 0.06284943, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4654742221554186, + "flos": 20853150566400.0, + "grad_norm": 1.6688490987186926, + "language_loss": 0.81291914, + "learning_rate": 2.319348869158064e-06, + "loss": 0.89016759, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13812256, + "step": 7742, + "time_per_iteration": 2.5372226238250732 + }, + { + "auxiliary_loss_clip": 0.06456183, + "auxiliary_loss_mlp": 0.01268485, + "balance_loss_clip": 0.06287557, + "balance_loss_mlp": 0.01254264, + "epoch": 0.4655343454080866, + "flos": 20711210549760.0, + "grad_norm": 1.6329017257985423, + "language_loss": 0.72620338, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.80345011, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14227295, + "step": 7743, + "time_per_iteration": 2.561323404312134 + }, + { + "auxiliary_loss_clip": 0.0644543, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06280947, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46559446866075455, + "flos": 18995912017920.0, + "grad_norm": 1.7294678893011792, + "language_loss": 0.71235406, + "learning_rate": 2.318579915392483e-06, + "loss": 0.78950727, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13842773, + "step": 7744, + "time_per_iteration": 2.491428852081299 + }, + { + "auxiliary_loss_clip": 0.06446386, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01252513, + "epoch": 0.4656545919134225, + "flos": 34505030010240.0, + "grad_norm": 1.6678897715471863, + "language_loss": 0.84893715, + "learning_rate": 2.31819542038153e-06, + "loss": 0.92605066, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12451172, + "step": 7745, + "time_per_iteration": 2.759547233581543 + }, + { + "auxiliary_loss_clip": 0.064444, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01255824, + "epoch": 0.4657147151660905, + "flos": 24316465449600.0, + "grad_norm": 1.3285756054685907, + "language_loss": 0.73465878, + "learning_rate": 2.317810913304574e-06, + "loss": 0.81178808, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.12701416, + "step": 7746, + "time_per_iteration": 2.5268633365631104 + }, + { + "auxiliary_loss_clip": 0.064431, + "auxiliary_loss_mlp": 0.01272209, + "balance_loss_clip": 0.06282558, + "balance_loss_mlp": 0.0125931, + "epoch": 0.46577483841875844, + "flos": 58807743390720.0, + "grad_norm": 1.6027404056917662, + "language_loss": 0.69721079, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.77436388, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12896729, + "step": 7747, + "time_per_iteration": 2.8772974014282227 + }, + { + "auxiliary_loss_clip": 0.06441785, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06279266, + "balance_loss_mlp": 0.01255631, + "epoch": 0.4658349616714264, + "flos": 31330081353600.0, + "grad_norm": 1.8250767057505617, + "language_loss": 0.68153578, + "learning_rate": 2.317041863010978e-06, + "loss": 0.75864553, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13543701, + "step": 7748, + "time_per_iteration": 2.576828956604004 + }, + { + "auxiliary_loss_clip": 0.06449303, + "auxiliary_loss_mlp": 0.01269068, + "balance_loss_clip": 0.06280029, + "balance_loss_mlp": 0.01254768, + "epoch": 0.46589508492409437, + "flos": 14864601985920.0, + "grad_norm": 2.1691376792383554, + "language_loss": 0.64591479, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.72309858, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14306641, + "step": 7749, + "time_per_iteration": 2.5408928394317627 + }, + { + "auxiliary_loss_clip": 0.06452534, + "auxiliary_loss_mlp": 0.01273929, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.01258795, + "epoch": 0.46595520817676234, + "flos": 12900908424960.0, + "grad_norm": 2.0171049134441237, + "language_loss": 0.74442625, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.82169086, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.15142822, + "step": 7750, + "time_per_iteration": 2.4698846340179443 + }, + { + "auxiliary_loss_clip": 0.06444734, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06276895, + "balance_loss_mlp": 0.01255811, + "epoch": 0.46601533142943036, + "flos": 32862504349440.0, + "grad_norm": 1.8980956421649817, + "language_loss": 0.7426213, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.81977308, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14624023, + "step": 7751, + "time_per_iteration": 2.6534221172332764 + }, + { + "auxiliary_loss_clip": 0.06449904, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06280084, + "balance_loss_mlp": 0.01253017, + "epoch": 0.4660754546820983, + "flos": 19972496954880.0, + "grad_norm": 1.7579709538150943, + "language_loss": 0.73910719, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.81627846, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14202881, + "step": 7752, + "time_per_iteration": 2.474492311477661 + }, + { + "auxiliary_loss_clip": 0.06447943, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06279718, + "balance_loss_mlp": 0.01254578, + "epoch": 0.4661355779347663, + "flos": 26695482572160.0, + "grad_norm": 2.190938043745359, + "language_loss": 0.69726032, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7744258, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14038086, + "step": 7753, + "time_per_iteration": 2.604612350463867 + }, + { + "auxiliary_loss_clip": 0.06438763, + "auxiliary_loss_mlp": 0.01269724, + "balance_loss_clip": 0.0627787, + "balance_loss_mlp": 0.01256777, + "epoch": 0.46619570118743425, + "flos": 20965726926720.0, + "grad_norm": 1.7706266197381177, + "language_loss": 0.73293746, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.81002235, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7754, + "time_per_iteration": 2.491225242614746 + }, + { + "auxiliary_loss_clip": 0.06444383, + "auxiliary_loss_mlp": 0.01271714, + "balance_loss_clip": 0.06278208, + "balance_loss_mlp": 0.01256855, + "epoch": 0.4662558244401022, + "flos": 24433024878720.0, + "grad_norm": 1.5728879839910523, + "language_loss": 0.79001075, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.8671717, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14868164, + "step": 7755, + "time_per_iteration": 2.562178134918213 + }, + { + "auxiliary_loss_clip": 0.06436031, + "auxiliary_loss_mlp": 0.01269294, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01256181, + "epoch": 0.4663159476927702, + "flos": 20601820644480.0, + "grad_norm": 1.5633103047544015, + "language_loss": 0.72593671, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.80299002, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13116455, + "step": 7756, + "time_per_iteration": 4.01608943939209 + }, + { + "auxiliary_loss_clip": 0.06436817, + "auxiliary_loss_mlp": 0.01269611, + "balance_loss_clip": 0.06276436, + "balance_loss_mlp": 0.01256897, + "epoch": 0.46637607094543815, + "flos": 25668235042560.0, + "grad_norm": 1.701604485790762, + "language_loss": 0.7836898, + "learning_rate": 2.313580543272274e-06, + "loss": 0.86075413, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12719727, + "step": 7757, + "time_per_iteration": 2.555097818374634 + }, + { + "auxiliary_loss_clip": 0.06441291, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06277295, + "balance_loss_mlp": 0.01261123, + "epoch": 0.4664361941981061, + "flos": 24279722634240.0, + "grad_norm": 1.9711907960618857, + "language_loss": 0.66213286, + "learning_rate": 2.313195892540705e-06, + "loss": 0.73928982, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13275146, + "step": 7758, + "time_per_iteration": 2.569962739944458 + }, + { + "auxiliary_loss_clip": 0.06442615, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.0627957, + "balance_loss_mlp": 0.01260629, + "epoch": 0.4664963174507741, + "flos": 18411800405760.0, + "grad_norm": 1.9738824417509344, + "language_loss": 0.74950838, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.826666, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12518311, + "step": 7759, + "time_per_iteration": 2.47729229927063 + }, + { + "auxiliary_loss_clip": 0.06440781, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06281125, + "balance_loss_mlp": 0.01259827, + "epoch": 0.46655644070344204, + "flos": 22461616742400.0, + "grad_norm": 3.1770723580201103, + "language_loss": 0.77710176, + "learning_rate": 2.312426555462893e-06, + "loss": 0.85423636, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12860107, + "step": 7760, + "time_per_iteration": 2.555143117904663 + }, + { + "auxiliary_loss_clip": 0.06438316, + "auxiliary_loss_mlp": 0.01270754, + "balance_loss_clip": 0.06279285, + "balance_loss_mlp": 0.01256675, + "epoch": 0.46661656395611, + "flos": 13813525169280.0, + "grad_norm": 1.6658245877843647, + "language_loss": 0.7447418, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.82183254, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.14099121, + "step": 7761, + "time_per_iteration": 2.493032217025757 + }, + { + "auxiliary_loss_clip": 0.06446707, + "auxiliary_loss_mlp": 0.01275728, + "balance_loss_clip": 0.06281132, + "balance_loss_mlp": 0.0126094, + "epoch": 0.466676687208778, + "flos": 21658473757440.0, + "grad_norm": 1.6817719059657052, + "language_loss": 0.78770381, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.86492819, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14788818, + "step": 7762, + "time_per_iteration": 2.5613081455230713 + }, + { + "auxiliary_loss_clip": 0.06338885, + "auxiliary_loss_mlp": 0.01268455, + "balance_loss_clip": 0.06268312, + "balance_loss_mlp": 0.01265552, + "epoch": 0.46673681046144594, + "flos": 68554163554560.0, + "grad_norm": 0.7818830178478652, + "language_loss": 0.59643799, + "learning_rate": 2.311272461028297e-06, + "loss": 0.67251134, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.0289917, + "step": 7763, + "time_per_iteration": 4.584456443786621 + }, + { + "auxiliary_loss_clip": 0.06446124, + "auxiliary_loss_mlp": 0.01269966, + "balance_loss_clip": 0.06278878, + "balance_loss_mlp": 0.01255559, + "epoch": 0.46679693371411396, + "flos": 15819789404160.0, + "grad_norm": 1.948864663001373, + "language_loss": 0.79278809, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.86994898, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14398193, + "step": 7764, + "time_per_iteration": 2.465179920196533 + }, + { + "auxiliary_loss_clip": 0.06441632, + "auxiliary_loss_mlp": 0.01267635, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01255035, + "epoch": 0.4668570569667819, + "flos": 18520393697280.0, + "grad_norm": 2.0437394229584123, + "language_loss": 0.72096646, + "learning_rate": 2.310503005696839e-06, + "loss": 0.79805923, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.12597656, + "step": 7765, + "time_per_iteration": 2.5701630115509033 + }, + { + "auxiliary_loss_clip": 0.06443523, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06278671, + "balance_loss_mlp": 0.01258141, + "epoch": 0.4669171802194499, + "flos": 19212385841280.0, + "grad_norm": 2.21059711365052, + "language_loss": 0.77947736, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.85663396, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.14001465, + "step": 7766, + "time_per_iteration": 2.481160879135132 + }, + { + "auxiliary_loss_clip": 0.06441876, + "auxiliary_loss_mlp": 0.01272138, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01258489, + "epoch": 0.46697730347211786, + "flos": 12281018319360.0, + "grad_norm": 2.232432946710323, + "language_loss": 0.65461195, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.73175204, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13653564, + "step": 7767, + "time_per_iteration": 2.5368387699127197 + }, + { + "auxiliary_loss_clip": 0.06442834, + "auxiliary_loss_mlp": 0.01272968, + "balance_loss_clip": 0.06280966, + "balance_loss_mlp": 0.01259307, + "epoch": 0.4670374267247858, + "flos": 23593516421760.0, + "grad_norm": 2.313152144280668, + "language_loss": 0.75071919, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.82787716, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13677979, + "step": 7768, + "time_per_iteration": 3.9271702766418457 + }, + { + "auxiliary_loss_clip": 0.06441817, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06279824, + "balance_loss_mlp": 0.01253697, + "epoch": 0.4670975499774538, + "flos": 15995495416320.0, + "grad_norm": 1.5695198160982793, + "language_loss": 0.71176434, + "learning_rate": 2.308963953858982e-06, + "loss": 0.7888546, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1350708, + "step": 7769, + "time_per_iteration": 2.5253636837005615 + }, + { + "auxiliary_loss_clip": 0.06441696, + "auxiliary_loss_mlp": 0.01271746, + "balance_loss_clip": 0.06279374, + "balance_loss_mlp": 0.01258305, + "epoch": 0.46715767323012175, + "flos": 15383026396800.0, + "grad_norm": 1.8223238330296296, + "language_loss": 0.81503379, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.89216816, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13446045, + "step": 7770, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.06346406, + "auxiliary_loss_mlp": 0.01251242, + "balance_loss_clip": 0.06275694, + "balance_loss_mlp": 0.01249068, + "epoch": 0.4672177964827897, + "flos": 60270774877440.0, + "grad_norm": 0.8490857527823061, + "language_loss": 0.55591935, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.63189584, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.02177429, + "step": 7771, + "time_per_iteration": 3.1719799041748047 + }, + { + "auxiliary_loss_clip": 0.064445, + "auxiliary_loss_mlp": 0.01269252, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01256234, + "epoch": 0.4672779197354577, + "flos": 27643500466560.0, + "grad_norm": 2.2149063838305363, + "language_loss": 0.65989488, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.73703241, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13024902, + "step": 7772, + "time_per_iteration": 2.616668939590454 + }, + { + "auxiliary_loss_clip": 0.06441614, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01257009, + "epoch": 0.46733804298812565, + "flos": 31402267246080.0, + "grad_norm": 2.671628135597842, + "language_loss": 0.64495057, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.72206295, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1260376, + "step": 7773, + "time_per_iteration": 2.5923900604248047 + }, + { + "auxiliary_loss_clip": 0.06442621, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01256457, + "epoch": 0.4673981662407936, + "flos": 19506747634560.0, + "grad_norm": 1.7164237292195044, + "language_loss": 0.80045915, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.87758458, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13464355, + "step": 7774, + "time_per_iteration": 2.577458620071411 + }, + { + "auxiliary_loss_clip": 0.06444994, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01254583, + "epoch": 0.4674582894934616, + "flos": 20528083451520.0, + "grad_norm": 1.5985457295090966, + "language_loss": 0.78042519, + "learning_rate": 2.306655024915726e-06, + "loss": 0.85755515, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13439941, + "step": 7775, + "time_per_iteration": 2.5538787841796875 + }, + { + "auxiliary_loss_clip": 0.06442325, + "auxiliary_loss_mlp": 0.0127297, + "balance_loss_clip": 0.06282222, + "balance_loss_mlp": 0.01259988, + "epoch": 0.46751841274612954, + "flos": 22097500824960.0, + "grad_norm": 1.8860444903676625, + "language_loss": 0.69909471, + "learning_rate": 2.306270162640694e-06, + "loss": 0.77624762, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12963867, + "step": 7776, + "time_per_iteration": 2.561692237854004 + }, + { + "auxiliary_loss_clip": 0.0644502, + "auxiliary_loss_mlp": 0.01270071, + "balance_loss_clip": 0.06284119, + "balance_loss_mlp": 0.01257244, + "epoch": 0.46757853599879756, + "flos": 26987454524160.0, + "grad_norm": 1.3861659298765134, + "language_loss": 0.74096608, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.81811702, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1282959, + "step": 7777, + "time_per_iteration": 2.536015510559082 + }, + { + "auxiliary_loss_clip": 0.06447745, + "auxiliary_loss_mlp": 0.01270612, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256921, + "epoch": 0.4676386592514655, + "flos": 24140927145600.0, + "grad_norm": 1.9470179218555579, + "language_loss": 0.69820189, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.77538544, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13690186, + "step": 7778, + "time_per_iteration": 2.548154354095459 + }, + { + "auxiliary_loss_clip": 0.06447626, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01253513, + "epoch": 0.4676987825041335, + "flos": 25490768094720.0, + "grad_norm": 1.4247023457023664, + "language_loss": 0.73440385, + "learning_rate": 2.305115506191206e-06, + "loss": 0.81155688, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14160156, + "step": 7779, + "time_per_iteration": 2.5291388034820557 + }, + { + "auxiliary_loss_clip": 0.06443821, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06285408, + "balance_loss_mlp": 0.01253379, + "epoch": 0.46775890575680146, + "flos": 21951871228800.0, + "grad_norm": 1.9613896423037807, + "language_loss": 0.72685552, + "learning_rate": 2.304730597548562e-06, + "loss": 0.80395079, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12353516, + "step": 7780, + "time_per_iteration": 2.5508480072021484 + }, + { + "auxiliary_loss_clip": 0.06447856, + "auxiliary_loss_mlp": 0.01269851, + "balance_loss_clip": 0.06280719, + "balance_loss_mlp": 0.01256273, + "epoch": 0.4678190290094694, + "flos": 25235413176960.0, + "grad_norm": 1.8471847442174032, + "language_loss": 0.74638426, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.82356131, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.13586426, + "step": 7781, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.06446712, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.0125528, + "epoch": 0.4678791522621374, + "flos": 32276254458240.0, + "grad_norm": 1.845752858447898, + "language_loss": 0.63050562, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.70766628, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.140625, + "step": 7782, + "time_per_iteration": 2.650505304336548 + }, + { + "auxiliary_loss_clip": 0.06445308, + "auxiliary_loss_mlp": 0.01268795, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.01255306, + "epoch": 0.46793927551480535, + "flos": 27052764382080.0, + "grad_norm": 2.229893941722145, + "language_loss": 0.63585413, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.71299517, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13494873, + "step": 7783, + "time_per_iteration": 2.5537588596343994 + }, + { + "auxiliary_loss_clip": 0.0645118, + "auxiliary_loss_mlp": 0.01271407, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257245, + "epoch": 0.4679993987674733, + "flos": 17463195532800.0, + "grad_norm": 2.4083561383098004, + "language_loss": 0.68662858, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7638545, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1418457, + "step": 7784, + "time_per_iteration": 2.560459613800049 + }, + { + "auxiliary_loss_clip": 0.06438549, + "auxiliary_loss_mlp": 0.01266567, + "balance_loss_clip": 0.06278238, + "balance_loss_mlp": 0.01254003, + "epoch": 0.4680595220201413, + "flos": 17170804310400.0, + "grad_norm": 1.9765250646873525, + "language_loss": 0.84616911, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.92322016, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12567139, + "step": 7785, + "time_per_iteration": 2.5567643642425537 + }, + { + "auxiliary_loss_clip": 0.06444662, + "auxiliary_loss_mlp": 0.01268089, + "balance_loss_clip": 0.06281722, + "balance_loss_mlp": 0.01254225, + "epoch": 0.46811964527280925, + "flos": 11332329592320.0, + "grad_norm": 1.9719414675879272, + "language_loss": 0.77991092, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.85703844, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13867188, + "step": 7786, + "time_per_iteration": 2.507206439971924 + }, + { + "auxiliary_loss_clip": 0.06440122, + "auxiliary_loss_mlp": 0.01265794, + "balance_loss_clip": 0.06281641, + "balance_loss_mlp": 0.01253897, + "epoch": 0.4681797685254772, + "flos": 24285508565760.0, + "grad_norm": 2.2497529795631817, + "language_loss": 0.74387538, + "learning_rate": 2.302035914315856e-06, + "loss": 0.82093459, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.11901855, + "step": 7787, + "time_per_iteration": 2.498021125793457 + }, + { + "auxiliary_loss_clip": 0.06439888, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06278901, + "balance_loss_mlp": 0.01258785, + "epoch": 0.4682398917781452, + "flos": 31658544558720.0, + "grad_norm": 1.7533783368280031, + "language_loss": 0.66132212, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.73844731, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1383667, + "step": 7788, + "time_per_iteration": 2.650092363357544 + }, + { + "auxiliary_loss_clip": 0.06441839, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46830001503081314, + "flos": 28118264100480.0, + "grad_norm": 1.5278727961877703, + "language_loss": 0.64315766, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.72025621, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.11987305, + "step": 7789, + "time_per_iteration": 2.5806198120117188 + }, + { + "auxiliary_loss_clip": 0.06338993, + "auxiliary_loss_mlp": 0.01252338, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01250063, + "epoch": 0.4683601382834811, + "flos": 57900059308800.0, + "grad_norm": 0.6904155708009142, + "language_loss": 0.61868596, + "learning_rate": 2.300880877982825e-06, + "loss": 0.69459921, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.02276611, + "step": 7790, + "time_per_iteration": 3.2271504402160645 + }, + { + "auxiliary_loss_clip": 0.06442016, + "auxiliary_loss_mlp": 0.01269711, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01257111, + "epoch": 0.46842026153614913, + "flos": 21878427525120.0, + "grad_norm": 1.6377280327187325, + "language_loss": 0.79426539, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.87138271, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12597656, + "step": 7791, + "time_per_iteration": 2.490171194076538 + }, + { + "auxiliary_loss_clip": 0.06441824, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01256899, + "epoch": 0.4684803847888171, + "flos": 24907914293760.0, + "grad_norm": 1.496703208223837, + "language_loss": 0.74930024, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.82641351, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.12573242, + "step": 7792, + "time_per_iteration": 2.5588057041168213 + }, + { + "auxiliary_loss_clip": 0.0643919, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06282478, + "balance_loss_mlp": 0.01255972, + "epoch": 0.46854050804148506, + "flos": 26259138835200.0, + "grad_norm": 1.9488467409065784, + "language_loss": 0.68353844, + "learning_rate": 2.299725738964898e-06, + "loss": 0.76060808, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.11804199, + "step": 7793, + "time_per_iteration": 2.543156147003174 + }, + { + "auxiliary_loss_clip": 0.06441274, + "auxiliary_loss_mlp": 0.01273582, + "balance_loss_clip": 0.0628298, + "balance_loss_mlp": 0.01261387, + "epoch": 0.468600631294153, + "flos": 21586204010880.0, + "grad_norm": 1.8535654365133143, + "language_loss": 0.74367434, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.82082289, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.12194824, + "step": 7794, + "time_per_iteration": 2.6082603931427 + }, + { + "auxiliary_loss_clip": 0.06445156, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255343, + "epoch": 0.468660754546821, + "flos": 25892842711680.0, + "grad_norm": 2.128212140250663, + "language_loss": 0.64027059, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.71741104, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13537598, + "step": 7795, + "time_per_iteration": 2.554871082305908 + }, + { + "auxiliary_loss_clip": 0.06440422, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06283793, + "balance_loss_mlp": 0.01253067, + "epoch": 0.46872087779948896, + "flos": 35482746977280.0, + "grad_norm": 1.4934025143707166, + "language_loss": 0.6791029, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7561695, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13171387, + "step": 7796, + "time_per_iteration": 4.070605754852295 + }, + { + "auxiliary_loss_clip": 0.06441301, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06280352, + "balance_loss_mlp": 0.0125435, + "epoch": 0.4687810010521569, + "flos": 26403720255360.0, + "grad_norm": 1.619506492510176, + "language_loss": 0.70710748, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.78419161, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12762451, + "step": 7797, + "time_per_iteration": 2.574291706085205 + }, + { + "auxiliary_loss_clip": 0.06443868, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.0628204, + "balance_loss_mlp": 0.01258472, + "epoch": 0.4688411243048249, + "flos": 19978618302720.0, + "grad_norm": 1.9026226114754317, + "language_loss": 0.67159688, + "learning_rate": 2.297800280150454e-06, + "loss": 0.74876028, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.14007568, + "step": 7798, + "time_per_iteration": 2.4703564643859863 + }, + { + "auxiliary_loss_clip": 0.06331287, + "auxiliary_loss_mlp": 0.01256102, + "balance_loss_clip": 0.06261373, + "balance_loss_mlp": 0.01253898, + "epoch": 0.46890124755749285, + "flos": 63996739983360.0, + "grad_norm": 0.926390069403038, + "language_loss": 0.64518279, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.7210567, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.02207947, + "step": 7799, + "time_per_iteration": 3.3128738403320312 + }, + { + "auxiliary_loss_clip": 0.06441961, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283548, + "balance_loss_mlp": 0.01258429, + "epoch": 0.4689613708101608, + "flos": 23775763052160.0, + "grad_norm": 1.2629628474735628, + "language_loss": 0.72331405, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.80045128, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13330078, + "step": 7800, + "time_per_iteration": 2.5339090824127197 + }, + { + "auxiliary_loss_clip": 0.06436972, + "auxiliary_loss_mlp": 0.01269738, + "balance_loss_clip": 0.06279731, + "balance_loss_mlp": 0.01257406, + "epoch": 0.4690214940628288, + "flos": 24795337933440.0, + "grad_norm": 2.7480307453946726, + "language_loss": 0.72682166, + "learning_rate": 2.296644869233568e-06, + "loss": 0.80388874, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12335205, + "step": 7801, + "time_per_iteration": 2.552154541015625 + }, + { + "auxiliary_loss_clip": 0.06449857, + "auxiliary_loss_mlp": 0.01274232, + "balance_loss_clip": 0.06283514, + "balance_loss_mlp": 0.01260094, + "epoch": 0.46908161731549675, + "flos": 18083169492480.0, + "grad_norm": 1.9453242658612842, + "language_loss": 0.62466741, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.70190829, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14135742, + "step": 7802, + "time_per_iteration": 3.9707396030426025 + }, + { + "auxiliary_loss_clip": 0.06437971, + "auxiliary_loss_mlp": 0.01270017, + "balance_loss_clip": 0.06277081, + "balance_loss_mlp": 0.01257459, + "epoch": 0.4691417405681647, + "flos": 25710554154240.0, + "grad_norm": 1.8844359624083942, + "language_loss": 0.73532665, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.81240654, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12554932, + "step": 7803, + "time_per_iteration": 2.554459810256958 + }, + { + "auxiliary_loss_clip": 0.06438211, + "auxiliary_loss_mlp": 0.01272362, + "balance_loss_clip": 0.06278156, + "balance_loss_mlp": 0.01259338, + "epoch": 0.46920186382083273, + "flos": 17462776262400.0, + "grad_norm": 1.58578754852504, + "language_loss": 0.77327907, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.85038471, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13012695, + "step": 7804, + "time_per_iteration": 2.543470621109009 + }, + { + "auxiliary_loss_clip": 0.06432682, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01254298, + "epoch": 0.4692619870735007, + "flos": 20345669112960.0, + "grad_norm": 1.787683586047485, + "language_loss": 0.77375299, + "learning_rate": 2.295104163929305e-06, + "loss": 0.8507452, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12231445, + "step": 7805, + "time_per_iteration": 2.501739740371704 + }, + { + "auxiliary_loss_clip": 0.0644381, + "auxiliary_loss_mlp": 0.01270681, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01257163, + "epoch": 0.46932211032616866, + "flos": 29504177032320.0, + "grad_norm": 1.522976757050157, + "language_loss": 0.83108258, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.90822744, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13519287, + "step": 7806, + "time_per_iteration": 2.6634225845336914 + }, + { + "auxiliary_loss_clip": 0.06437123, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01253496, + "epoch": 0.4693822335788366, + "flos": 36220202760960.0, + "grad_norm": 1.6923542734381007, + "language_loss": 0.77444482, + "learning_rate": 2.294333744076472e-06, + "loss": 0.8514812, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13006592, + "step": 7807, + "time_per_iteration": 4.0442986488342285 + }, + { + "auxiliary_loss_clip": 0.06438392, + "auxiliary_loss_mlp": 0.01270643, + "balance_loss_clip": 0.06276641, + "balance_loss_mlp": 0.01257024, + "epoch": 0.4694423568315046, + "flos": 20345124061440.0, + "grad_norm": 1.7839407979100135, + "language_loss": 0.51769608, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.59478641, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13635254, + "step": 7808, + "time_per_iteration": 2.4910712242126465 + }, + { + "auxiliary_loss_clip": 0.06328695, + "auxiliary_loss_mlp": 0.01252926, + "balance_loss_clip": 0.06259091, + "balance_loss_mlp": 0.01250451, + "epoch": 0.46950248008417256, + "flos": 64343540033280.0, + "grad_norm": 0.7688077124363479, + "language_loss": 0.57691324, + "learning_rate": 2.293563279578978e-06, + "loss": 0.65272945, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.0247345, + "step": 7809, + "time_per_iteration": 3.055589199066162 + }, + { + "auxiliary_loss_clip": 0.06439595, + "auxiliary_loss_mlp": 0.01268316, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01254845, + "epoch": 0.4695626033368405, + "flos": 19204755120000.0, + "grad_norm": 2.3576337237105425, + "language_loss": 0.71649069, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7935698, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13470459, + "step": 7810, + "time_per_iteration": 2.5001537799835205 + }, + { + "auxiliary_loss_clip": 0.06435918, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01259113, + "epoch": 0.4696227265895085, + "flos": 23009027466240.0, + "grad_norm": 3.6880824309964617, + "language_loss": 0.81146425, + "learning_rate": 2.29279277055369e-06, + "loss": 0.88855195, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.13726807, + "step": 7811, + "time_per_iteration": 2.5971217155456543 + }, + { + "auxiliary_loss_clip": 0.06437828, + "auxiliary_loss_mlp": 0.01267753, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.0125405, + "epoch": 0.46968284984217645, + "flos": 21877169713920.0, + "grad_norm": 1.5426371434141024, + "language_loss": 0.80606401, + "learning_rate": 2.292407499379644e-06, + "loss": 0.88311982, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13708496, + "step": 7812, + "time_per_iteration": 2.5140600204467773 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01271707, + "balance_loss_clip": 0.06277305, + "balance_loss_mlp": 0.01258445, + "epoch": 0.4697429730948444, + "flos": 19981217779200.0, + "grad_norm": 1.702985157553907, + "language_loss": 0.74653876, + "learning_rate": 2.292022217117477e-06, + "loss": 0.82360852, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13256836, + "step": 7813, + "time_per_iteration": 2.530773401260376 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01270357, + "balance_loss_clip": 0.06279637, + "balance_loss_mlp": 0.01256755, + "epoch": 0.4698030963475124, + "flos": 15161185912320.0, + "grad_norm": 2.103167897479233, + "language_loss": 0.84843278, + "learning_rate": 2.291636923781798e-06, + "loss": 0.92552245, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13604736, + "step": 7814, + "time_per_iteration": 2.550631046295166 + }, + { + "auxiliary_loss_clip": 0.06432581, + "auxiliary_loss_mlp": 0.01265742, + "balance_loss_clip": 0.06276342, + "balance_loss_mlp": 0.01252856, + "epoch": 0.46986321960018035, + "flos": 15155316126720.0, + "grad_norm": 2.71974016097947, + "language_loss": 0.82219559, + "learning_rate": 2.291251619387217e-06, + "loss": 0.89917886, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12896729, + "step": 7815, + "time_per_iteration": 2.508582592010498 + }, + { + "auxiliary_loss_clip": 0.06434117, + "auxiliary_loss_mlp": 0.01273411, + "balance_loss_clip": 0.06275953, + "balance_loss_mlp": 0.01259952, + "epoch": 0.4699233428528483, + "flos": 23115021281280.0, + "grad_norm": 2.356408218131492, + "language_loss": 0.77761489, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.85469019, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13452148, + "step": 7816, + "time_per_iteration": 2.505244493484497 + }, + { + "auxiliary_loss_clip": 0.06334539, + "auxiliary_loss_mlp": 0.01254323, + "balance_loss_clip": 0.06264929, + "balance_loss_mlp": 0.01251993, + "epoch": 0.46998346610551633, + "flos": 68126917985280.0, + "grad_norm": 0.8142436419344395, + "language_loss": 0.58616334, + "learning_rate": 2.290480977479796e-06, + "loss": 0.66205192, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02325439, + "step": 7817, + "time_per_iteration": 3.1171398162841797 + }, + { + "auxiliary_loss_clip": 0.0643587, + "auxiliary_loss_mlp": 0.01268626, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01255119, + "epoch": 0.4700435893581843, + "flos": 24135560484480.0, + "grad_norm": 1.6087842481989176, + "language_loss": 0.7922467, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8692916, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13513184, + "step": 7818, + "time_per_iteration": 2.5133657455444336 + }, + { + "auxiliary_loss_clip": 0.06435841, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06278426, + "balance_loss_mlp": 0.01257279, + "epoch": 0.47010371261085226, + "flos": 20155624053120.0, + "grad_norm": 1.9598217577618973, + "language_loss": 0.83629054, + "learning_rate": 2.289710291512104e-06, + "loss": 0.91334999, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12841797, + "step": 7819, + "time_per_iteration": 2.512434482574463 + }, + { + "auxiliary_loss_clip": 0.06440641, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01253519, + "epoch": 0.47016383586352023, + "flos": 15127587624960.0, + "grad_norm": 1.951811924314391, + "language_loss": 0.76718354, + "learning_rate": 2.289324932042186e-06, + "loss": 0.84427238, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1472168, + "step": 7820, + "time_per_iteration": 2.4596121311187744 + }, + { + "auxiliary_loss_clip": 0.06434815, + "auxiliary_loss_mlp": 0.01270743, + "balance_loss_clip": 0.06279559, + "balance_loss_mlp": 0.01257636, + "epoch": 0.4702239591161882, + "flos": 13558044470400.0, + "grad_norm": 1.9648943700675503, + "language_loss": 0.74081844, + "learning_rate": 2.288939561601039e-06, + "loss": 0.81787401, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13116455, + "step": 7821, + "time_per_iteration": 2.4793312549591064 + }, + { + "auxiliary_loss_clip": 0.06431578, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06276228, + "balance_loss_mlp": 0.01256658, + "epoch": 0.47028408236885616, + "flos": 24282825235200.0, + "grad_norm": 1.6413236035832721, + "language_loss": 0.89491117, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.97191548, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12207031, + "step": 7822, + "time_per_iteration": 2.5880398750305176 + }, + { + "auxiliary_loss_clip": 0.06433522, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06277143, + "balance_loss_mlp": 0.01254062, + "epoch": 0.4703442056215241, + "flos": 22863565578240.0, + "grad_norm": 1.438932852866735, + "language_loss": 0.79699898, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.87399733, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12255859, + "step": 7823, + "time_per_iteration": 2.5661919116973877 + }, + { + "auxiliary_loss_clip": 0.06324597, + "auxiliary_loss_mlp": 0.01253174, + "balance_loss_clip": 0.06255165, + "balance_loss_mlp": 0.01250784, + "epoch": 0.4704043288741921, + "flos": 69262381463040.0, + "grad_norm": 0.6854102840454825, + "language_loss": 0.56514406, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.64092177, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02386475, + "step": 7824, + "time_per_iteration": 3.223728656768799 + }, + { + "auxiliary_loss_clip": 0.06442541, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06281068, + "balance_loss_mlp": 0.01255837, + "epoch": 0.47046445212686006, + "flos": 18046971728640.0, + "grad_norm": 1.8116047863427858, + "language_loss": 0.81242847, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.88955039, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13818359, + "step": 7825, + "time_per_iteration": 2.4815890789031982 + }, + { + "auxiliary_loss_clip": 0.06441189, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01257443, + "epoch": 0.470524575379528, + "flos": 23958261244800.0, + "grad_norm": 2.19673184020816, + "language_loss": 0.67126369, + "learning_rate": 2.287012545338324e-06, + "loss": 0.74838167, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1317749, + "step": 7826, + "time_per_iteration": 2.5820834636688232 + }, + { + "auxiliary_loss_clip": 0.06443623, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01254824, + "epoch": 0.470584698632196, + "flos": 18119367256320.0, + "grad_norm": 1.7021383964965269, + "language_loss": 0.8395251, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.91664219, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13250732, + "step": 7827, + "time_per_iteration": 2.4966769218444824 + }, + { + "auxiliary_loss_clip": 0.06333943, + "auxiliary_loss_mlp": 0.01253247, + "balance_loss_clip": 0.06264865, + "balance_loss_mlp": 0.01250913, + "epoch": 0.47064482188486395, + "flos": 57268555413120.0, + "grad_norm": 0.786622619089935, + "language_loss": 0.55656797, + "learning_rate": 2.286241662546122e-06, + "loss": 0.63243991, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02328491, + "step": 7828, + "time_per_iteration": 3.1594009399414062 + }, + { + "auxiliary_loss_clip": 0.06439656, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01254743, + "epoch": 0.4707049451375319, + "flos": 17900922862080.0, + "grad_norm": 1.8377127056601934, + "language_loss": 0.80904895, + "learning_rate": 2.285856204861245e-06, + "loss": 0.88612556, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13256836, + "step": 7829, + "time_per_iteration": 2.485140800476074 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.0126024, + "epoch": 0.47076506839019994, + "flos": 25240402494720.0, + "grad_norm": 1.2696703606336757, + "language_loss": 0.76018727, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.83728784, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.12359619, + "step": 7830, + "time_per_iteration": 2.6114325523376465 + }, + { + "auxiliary_loss_clip": 0.06438384, + "auxiliary_loss_mlp": 0.01269492, + "balance_loss_clip": 0.06283822, + "balance_loss_mlp": 0.01257016, + "epoch": 0.4708251916428679, + "flos": 13484684620800.0, + "grad_norm": 2.037519777934202, + "language_loss": 0.78570348, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.86278224, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12463379, + "step": 7831, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01255365, + "epoch": 0.47088531489553587, + "flos": 30154646678400.0, + "grad_norm": 1.667499960909574, + "language_loss": 0.7574442, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.83460832, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.140625, + "step": 7832, + "time_per_iteration": 2.6298487186431885 + }, + { + "auxiliary_loss_clip": 0.06434175, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.0627791, + "balance_loss_mlp": 0.01256844, + "epoch": 0.47094543814820383, + "flos": 21804648405120.0, + "grad_norm": 1.2855995862723888, + "language_loss": 0.74791807, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.82493854, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1104126, + "step": 7833, + "time_per_iteration": 2.5464203357696533 + }, + { + "auxiliary_loss_clip": 0.06437977, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06281009, + "balance_loss_mlp": 0.01254118, + "epoch": 0.4710055614008718, + "flos": 23009698298880.0, + "grad_norm": 1.569702279619268, + "language_loss": 0.76145566, + "learning_rate": 2.283928754133762e-06, + "loss": 0.83849978, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12329102, + "step": 7834, + "time_per_iteration": 2.6125214099884033 + }, + { + "auxiliary_loss_clip": 0.06433094, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 0.06278115, + "balance_loss_mlp": 0.01254078, + "epoch": 0.47106568465353976, + "flos": 42751256601600.0, + "grad_norm": 1.4292072421609816, + "language_loss": 0.66957295, + "learning_rate": 2.283543231629972e-06, + "loss": 0.74656606, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12158203, + "step": 7835, + "time_per_iteration": 5.518744707107544 + }, + { + "auxiliary_loss_clip": 0.06330478, + "auxiliary_loss_mlp": 0.01256395, + "balance_loss_clip": 0.06261497, + "balance_loss_mlp": 0.01253791, + "epoch": 0.4711258079062077, + "flos": 68571116807040.0, + "grad_norm": 0.853960187866431, + "language_loss": 0.62259066, + "learning_rate": 2.283157698374194e-06, + "loss": 0.69845939, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.02604675, + "step": 7836, + "time_per_iteration": 3.1000564098358154 + }, + { + "auxiliary_loss_clip": 0.06439401, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254006, + "epoch": 0.4711859311588757, + "flos": 25453522154880.0, + "grad_norm": 1.6974399997165228, + "language_loss": 0.69606686, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.7731331, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13238525, + "step": 7837, + "time_per_iteration": 2.5282108783721924 + }, + { + "auxiliary_loss_clip": 0.06437849, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01255061, + "epoch": 0.47124605441154366, + "flos": 21988488263040.0, + "grad_norm": 1.9658270715858404, + "language_loss": 0.66562694, + "learning_rate": 2.282386599665153e-06, + "loss": 0.74268925, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13311768, + "step": 7838, + "time_per_iteration": 2.5846638679504395 + }, + { + "auxiliary_loss_clip": 0.06440166, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255082, + "epoch": 0.4713061776642116, + "flos": 25420049648640.0, + "grad_norm": 5.850528361960432, + "language_loss": 0.77699667, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.85408199, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1328125, + "step": 7839, + "time_per_iteration": 2.5414958000183105 + }, + { + "auxiliary_loss_clip": 0.06429788, + "auxiliary_loss_mlp": 0.01268311, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.0125592, + "epoch": 0.4713663009168796, + "flos": 26549559486720.0, + "grad_norm": 2.242315176037199, + "language_loss": 0.73086643, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.80784744, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12390137, + "step": 7840, + "time_per_iteration": 2.5519280433654785 + }, + { + "auxiliary_loss_clip": 0.06431505, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01251562, + "epoch": 0.47142642416954755, + "flos": 23630426945280.0, + "grad_norm": 1.566587637557085, + "language_loss": 0.75317335, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.83012575, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1217041, + "step": 7841, + "time_per_iteration": 2.552835702896118 + }, + { + "auxiliary_loss_clip": 0.06436779, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06277694, + "balance_loss_mlp": 0.01252947, + "epoch": 0.4714865474222155, + "flos": 22316783760000.0, + "grad_norm": 1.5550986710562988, + "language_loss": 0.70513815, + "learning_rate": 2.280844273866501e-06, + "loss": 0.78216577, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13049316, + "step": 7842, + "time_per_iteration": 3.933955192565918 + }, + { + "auxiliary_loss_clip": 0.06436103, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255891, + "epoch": 0.4715466706748835, + "flos": 17828317699200.0, + "grad_norm": 1.9804632158033957, + "language_loss": 0.79634649, + "learning_rate": 2.280458665756177e-06, + "loss": 0.87339324, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12677002, + "step": 7843, + "time_per_iteration": 2.4907753467559814 + }, + { + "auxiliary_loss_clip": 0.06434722, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.0125301, + "epoch": 0.4716067939275515, + "flos": 23666289292800.0, + "grad_norm": 1.6302002599700955, + "language_loss": 0.74402809, + "learning_rate": 2.280073047010832e-06, + "loss": 0.82102847, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12298584, + "step": 7844, + "time_per_iteration": 2.5746476650238037 + }, + { + "auxiliary_loss_clip": 0.06436022, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.0627865, + "balance_loss_mlp": 0.01257138, + "epoch": 0.47166691718021947, + "flos": 17935778960640.0, + "grad_norm": 2.158450508091108, + "language_loss": 0.78678179, + "learning_rate": 2.279687417645088e-06, + "loss": 0.86384571, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13238525, + "step": 7845, + "time_per_iteration": 2.4827558994293213 + }, + { + "auxiliary_loss_clip": 0.06430048, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06273912, + "balance_loss_mlp": 0.01254991, + "epoch": 0.47172704043288743, + "flos": 26621787306240.0, + "grad_norm": 1.2653259456946966, + "language_loss": 0.73458219, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.81154698, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.11450195, + "step": 7846, + "time_per_iteration": 2.586641550064087 + }, + { + "auxiliary_loss_clip": 0.06430165, + "auxiliary_loss_mlp": 0.01268985, + "balance_loss_clip": 0.06277196, + "balance_loss_mlp": 0.01256754, + "epoch": 0.4717871636855554, + "flos": 27929225289600.0, + "grad_norm": 1.2918573904220954, + "language_loss": 0.74434412, + "learning_rate": 2.2789161271109e-06, + "loss": 0.82133555, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12243652, + "step": 7847, + "time_per_iteration": 3.984661817550659 + }, + { + "auxiliary_loss_clip": 0.06434786, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06276622, + "balance_loss_mlp": 0.01258123, + "epoch": 0.47184728693822336, + "flos": 14507571738240.0, + "grad_norm": 1.68455833448323, + "language_loss": 0.81004, + "learning_rate": 2.278530465971703e-06, + "loss": 0.88708746, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1184082, + "step": 7848, + "time_per_iteration": 2.482759714126587 + }, + { + "auxiliary_loss_clip": 0.06438575, + "auxiliary_loss_mlp": 0.01265775, + "balance_loss_clip": 0.06279046, + "balance_loss_mlp": 0.01252394, + "epoch": 0.47190741019089133, + "flos": 17862041767680.0, + "grad_norm": 1.8089027190058555, + "language_loss": 0.70106918, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.77811265, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1338501, + "step": 7849, + "time_per_iteration": 2.5101277828216553 + }, + { + "auxiliary_loss_clip": 0.06444675, + "auxiliary_loss_mlp": 0.01269385, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.0125539, + "epoch": 0.4719675334435593, + "flos": 17901384059520.0, + "grad_norm": 1.915736246727948, + "language_loss": 0.69964916, + "learning_rate": 2.277759112022224e-06, + "loss": 0.77678978, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.14001465, + "step": 7850, + "time_per_iteration": 2.46455979347229 + }, + { + "auxiliary_loss_clip": 0.06441706, + "auxiliary_loss_mlp": 0.01269243, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01255951, + "epoch": 0.47202765669622726, + "flos": 20710665498240.0, + "grad_norm": 1.953909301983903, + "language_loss": 0.75806379, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.83517331, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13305664, + "step": 7851, + "time_per_iteration": 2.5298452377319336 + }, + { + "auxiliary_loss_clip": 0.06440549, + "auxiliary_loss_mlp": 0.01271731, + "balance_loss_clip": 0.06277989, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4720877799488952, + "flos": 16365439192320.0, + "grad_norm": 1.905541371588542, + "language_loss": 0.76767981, + "learning_rate": 2.276987715942132e-06, + "loss": 0.84480262, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.14196777, + "step": 7852, + "time_per_iteration": 2.473349094390869 + }, + { + "auxiliary_loss_clip": 0.06431545, + "auxiliary_loss_mlp": 0.01270384, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01257742, + "epoch": 0.4721479032015632, + "flos": 20674509661440.0, + "grad_norm": 2.394869083314355, + "language_loss": 0.69452804, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.77154732, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12658691, + "step": 7853, + "time_per_iteration": 2.537550210952759 + }, + { + "auxiliary_loss_clip": 0.06333929, + "auxiliary_loss_mlp": 0.01250651, + "balance_loss_clip": 0.06264801, + "balance_loss_mlp": 0.01248457, + "epoch": 0.47220802645423116, + "flos": 67773367681920.0, + "grad_norm": 0.6896509796832918, + "language_loss": 0.50247812, + "learning_rate": 2.276216277848432e-06, + "loss": 0.57832396, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.02197266, + "step": 7854, + "time_per_iteration": 3.2550642490386963 + }, + { + "auxiliary_loss_clip": 0.06436136, + "auxiliary_loss_mlp": 0.0126914, + "balance_loss_clip": 0.06276229, + "balance_loss_mlp": 0.0125583, + "epoch": 0.4722681497068991, + "flos": 20927474737920.0, + "grad_norm": 1.8228483302344913, + "language_loss": 0.63672256, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.71377528, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13317871, + "step": 7855, + "time_per_iteration": 2.5252599716186523 + }, + { + "auxiliary_loss_clip": 0.06439453, + "auxiliary_loss_mlp": 0.01268333, + "balance_loss_clip": 0.06280654, + "balance_loss_mlp": 0.01255715, + "epoch": 0.4723282729595671, + "flos": 28300594584960.0, + "grad_norm": 1.8174966086465816, + "language_loss": 0.76136196, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.83843982, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1262207, + "step": 7856, + "time_per_iteration": 2.560236692428589 + }, + { + "auxiliary_loss_clip": 0.06436295, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258284, + "epoch": 0.4723883962122351, + "flos": 27132287506560.0, + "grad_norm": 1.7138943667728106, + "language_loss": 0.750875, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.8279379, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 7857, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06436294, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.0628143, + "balance_loss_mlp": 0.01258946, + "epoch": 0.47244851946490307, + "flos": 31544794241280.0, + "grad_norm": 1.4694813046790665, + "language_loss": 0.64839488, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.72546607, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11889648, + "step": 7858, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06431169, + "auxiliary_loss_mlp": 0.01271908, + "balance_loss_clip": 0.06278542, + "balance_loss_mlp": 0.01259719, + "epoch": 0.47250864271757104, + "flos": 20892828274560.0, + "grad_norm": 1.741748713475879, + "language_loss": 0.71104157, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.78807235, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12194824, + "step": 7859, + "time_per_iteration": 2.541404962539673 + }, + { + "auxiliary_loss_clip": 0.06440333, + "auxiliary_loss_mlp": 0.01270209, + "balance_loss_clip": 0.06277637, + "balance_loss_mlp": 0.01257776, + "epoch": 0.472568765970239, + "flos": 20528376940800.0, + "grad_norm": 1.7364161900477437, + "language_loss": 0.62341475, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.70052016, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.12426758, + "step": 7860, + "time_per_iteration": 2.5165910720825195 + }, + { + "auxiliary_loss_clip": 0.06438711, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06280093, + "balance_loss_mlp": 0.01254914, + "epoch": 0.47262888922290697, + "flos": 35813306534400.0, + "grad_norm": 2.092826385669962, + "language_loss": 0.72540921, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.80247205, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12658691, + "step": 7861, + "time_per_iteration": 2.6575915813446045 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.0628088, + "balance_loss_mlp": 0.01254734, + "epoch": 0.47268901247557493, + "flos": 20674006536960.0, + "grad_norm": 2.2960282018232965, + "language_loss": 0.85134012, + "learning_rate": 2.273130107677896e-06, + "loss": 0.92842054, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13555908, + "step": 7862, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.06443156, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.012566, + "epoch": 0.4727491357282429, + "flos": 19579394724480.0, + "grad_norm": 1.7759944267926648, + "language_loss": 0.84885079, + "learning_rate": 2.272744289645927e-06, + "loss": 0.92597324, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12506104, + "step": 7863, + "time_per_iteration": 2.545445442199707 + }, + { + "auxiliary_loss_clip": 0.06435807, + "auxiliary_loss_mlp": 0.01268812, + "balance_loss_clip": 0.06279373, + "balance_loss_mlp": 0.01256873, + "epoch": 0.47280925898091086, + "flos": 18222090762240.0, + "grad_norm": 1.953539417417106, + "language_loss": 0.6582734, + "learning_rate": 2.272358461271467e-06, + "loss": 0.73531955, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11950684, + "step": 7864, + "time_per_iteration": 2.4730403423309326 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06280264, + "balance_loss_mlp": 0.01257619, + "epoch": 0.4728693822335788, + "flos": 17827604939520.0, + "grad_norm": 1.945688521953863, + "language_loss": 0.65635985, + "learning_rate": 2.271972622569147e-06, + "loss": 0.73344177, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.11962891, + "step": 7865, + "time_per_iteration": 2.498135805130005 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.01270111, + "balance_loss_clip": 0.06277367, + "balance_loss_mlp": 0.01257671, + "epoch": 0.4729295054862468, + "flos": 20601359447040.0, + "grad_norm": 2.5713138482446234, + "language_loss": 0.73970878, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.81671345, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12445068, + "step": 7866, + "time_per_iteration": 2.495232582092285 + }, + { + "auxiliary_loss_clip": 0.06437797, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254347, + "epoch": 0.47298962873891476, + "flos": 23374862392320.0, + "grad_norm": 2.8570557032751522, + "language_loss": 0.83387589, + "learning_rate": 2.271200914239451e-06, + "loss": 0.91091311, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.11578369, + "step": 7867, + "time_per_iteration": 2.565706968307495 + }, + { + "auxiliary_loss_clip": 0.06430209, + "auxiliary_loss_mlp": 0.01265413, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.01253391, + "epoch": 0.4730497519915827, + "flos": 22058410095360.0, + "grad_norm": 1.6535025871822049, + "language_loss": 0.79521739, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.87217355, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12036133, + "step": 7868, + "time_per_iteration": 2.549220561981201 + }, + { + "auxiliary_loss_clip": 0.06442262, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01255169, + "epoch": 0.4731098752442507, + "flos": 21076165008000.0, + "grad_norm": 1.8227151972017304, + "language_loss": 0.75178695, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.82889056, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12945557, + "step": 7869, + "time_per_iteration": 2.5188441276550293 + }, + { + "auxiliary_loss_clip": 0.06441551, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06282122, + "balance_loss_mlp": 0.01258014, + "epoch": 0.4731699984969187, + "flos": 22535395862400.0, + "grad_norm": 1.4513841331120019, + "language_loss": 0.73749697, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.81462824, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13555908, + "step": 7870, + "time_per_iteration": 2.520761251449585 + }, + { + "auxiliary_loss_clip": 0.0644481, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06280311, + "balance_loss_mlp": 0.01259231, + "epoch": 0.4732301217495867, + "flos": 24904769765760.0, + "grad_norm": 1.9907019842809281, + "language_loss": 0.81971508, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.89689231, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13684082, + "step": 7871, + "time_per_iteration": 2.7390120029449463 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06278443, + "balance_loss_mlp": 0.01254261, + "epoch": 0.47329024500225464, + "flos": 22791128123520.0, + "grad_norm": 1.7255093919697873, + "language_loss": 0.76232624, + "learning_rate": 2.269271463701879e-06, + "loss": 0.8393662, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13189697, + "step": 7872, + "time_per_iteration": 2.6356093883514404 + }, + { + "auxiliary_loss_clip": 0.06438267, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06279084, + "balance_loss_mlp": 0.01256847, + "epoch": 0.4733503682549226, + "flos": 38705884531200.0, + "grad_norm": 1.877318740282883, + "language_loss": 0.67809367, + "learning_rate": 2.268885542903428e-06, + "loss": 0.75517869, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7873, + "time_per_iteration": 2.7092511653900146 + }, + { + "auxiliary_loss_clip": 0.06434255, + "auxiliary_loss_mlp": 0.01269292, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.0125699, + "epoch": 0.47341049150759057, + "flos": 22973584389120.0, + "grad_norm": 1.442307420398724, + "language_loss": 0.72792107, + "learning_rate": 2.26849961190881e-06, + "loss": 0.80495656, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12298584, + "step": 7874, + "time_per_iteration": 3.9462826251983643 + }, + { + "auxiliary_loss_clip": 0.06440391, + "auxiliary_loss_mlp": 0.01271103, + "balance_loss_clip": 0.06281446, + "balance_loss_mlp": 0.01258431, + "epoch": 0.47347061476025853, + "flos": 14543769502080.0, + "grad_norm": 2.253933500743018, + "language_loss": 0.65938866, + "learning_rate": 2.26811367073266e-06, + "loss": 0.7365036, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12658691, + "step": 7875, + "time_per_iteration": 4.013593435287476 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01267762, + "balance_loss_clip": 0.06284615, + "balance_loss_mlp": 0.01254571, + "epoch": 0.4735307380129265, + "flos": 30271080326400.0, + "grad_norm": 2.373261357507393, + "language_loss": 0.80868709, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.88579601, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13183594, + "step": 7876, + "time_per_iteration": 2.577624797821045 + }, + { + "auxiliary_loss_clip": 0.06439028, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.0125583, + "epoch": 0.47359086126559446, + "flos": 19397148094080.0, + "grad_norm": 1.7113236821341018, + "language_loss": 0.792979, + "learning_rate": 2.267341757894304e-06, + "loss": 0.87005162, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12402344, + "step": 7877, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.06431633, + "auxiliary_loss_mlp": 0.01269276, + "balance_loss_clip": 0.0627646, + "balance_loss_mlp": 0.01256938, + "epoch": 0.47365098451826243, + "flos": 21944995194240.0, + "grad_norm": 1.9478135029908927, + "language_loss": 0.70673579, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.78374487, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12335205, + "step": 7878, + "time_per_iteration": 2.5023298263549805 + }, + { + "auxiliary_loss_clip": 0.06432398, + "auxiliary_loss_mlp": 0.01268548, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01256382, + "epoch": 0.4737111077709304, + "flos": 25851571776000.0, + "grad_norm": 1.6314467446120229, + "language_loss": 0.75137293, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.82838243, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1217041, + "step": 7879, + "time_per_iteration": 2.623811960220337 + }, + { + "auxiliary_loss_clip": 0.06320075, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06251323, + "balance_loss_mlp": 0.01262992, + "epoch": 0.47377123102359836, + "flos": 67779461831040.0, + "grad_norm": 0.7167002771941348, + "language_loss": 0.6131798, + "learning_rate": 2.266183812641164e-06, + "loss": 0.68903732, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02690125, + "step": 7880, + "time_per_iteration": 3.159388303756714 + }, + { + "auxiliary_loss_clip": 0.06434937, + "auxiliary_loss_mlp": 0.01268898, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256035, + "epoch": 0.4738313542762663, + "flos": 24322796432640.0, + "grad_norm": 1.5964233369580554, + "language_loss": 0.68369412, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.76073253, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12866211, + "step": 7881, + "time_per_iteration": 4.010294198989868 + }, + { + "auxiliary_loss_clip": 0.06434233, + "auxiliary_loss_mlp": 0.01267509, + "balance_loss_clip": 0.06279774, + "balance_loss_mlp": 0.01255964, + "epoch": 0.4738914775289343, + "flos": 20711797528320.0, + "grad_norm": 1.8204307046333812, + "language_loss": 0.77692872, + "learning_rate": 2.265411798646092e-06, + "loss": 0.85394609, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11541748, + "step": 7882, + "time_per_iteration": 2.5205814838409424 + }, + { + "auxiliary_loss_clip": 0.06437336, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.01257208, + "epoch": 0.4739516007816023, + "flos": 25453228665600.0, + "grad_norm": 1.3763225621826927, + "language_loss": 0.76357329, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.84064174, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12304688, + "step": 7883, + "time_per_iteration": 2.5500354766845703 + }, + { + "auxiliary_loss_clip": 0.0643235, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06278035, + "balance_loss_mlp": 0.01255101, + "epoch": 0.4740117240342703, + "flos": 19980463092480.0, + "grad_norm": 1.6935272320670107, + "language_loss": 0.72225314, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.79924023, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1126709, + "step": 7884, + "time_per_iteration": 2.5347273349761963 + }, + { + "auxiliary_loss_clip": 0.06443354, + "auxiliary_loss_mlp": 0.01266451, + "balance_loss_clip": 0.06279097, + "balance_loss_mlp": 0.01252944, + "epoch": 0.47407184728693824, + "flos": 15665229348480.0, + "grad_norm": 2.6351569696409314, + "language_loss": 0.82340348, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.90050149, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13513184, + "step": 7885, + "time_per_iteration": 2.482201099395752 + }, + { + "auxiliary_loss_clip": 0.06433931, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.01259262, + "epoch": 0.4741319705396062, + "flos": 18594843649920.0, + "grad_norm": 1.913533031103811, + "language_loss": 0.7349298, + "learning_rate": 2.263867649999751e-06, + "loss": 0.81198001, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.11816406, + "step": 7886, + "time_per_iteration": 3.95589017868042 + }, + { + "auxiliary_loss_clip": 0.06445764, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.0628106, + "balance_loss_mlp": 0.01256655, + "epoch": 0.47419209379227417, + "flos": 13266114445440.0, + "grad_norm": 1.8957247676006206, + "language_loss": 0.74131465, + "learning_rate": 2.263481587786849e-06, + "loss": 0.81846249, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12384033, + "step": 7887, + "time_per_iteration": 2.558175563812256 + }, + { + "auxiliary_loss_clip": 0.06431396, + "auxiliary_loss_mlp": 0.01269479, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01257499, + "epoch": 0.47425221704494214, + "flos": 20049630238080.0, + "grad_norm": 2.0468025330010016, + "language_loss": 0.7742272, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.85123587, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11987305, + "step": 7888, + "time_per_iteration": 2.5532913208007812 + }, + { + "auxiliary_loss_clip": 0.06440586, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06282103, + "balance_loss_mlp": 0.01255978, + "epoch": 0.4743123402976101, + "flos": 27279300695040.0, + "grad_norm": 1.7248476258859713, + "language_loss": 0.72833514, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1227417, + "step": 7889, + "time_per_iteration": 2.635697603225708 + }, + { + "auxiliary_loss_clip": 0.06323753, + "auxiliary_loss_mlp": 0.01252671, + "balance_loss_clip": 0.0625556, + "balance_loss_mlp": 0.01250217, + "epoch": 0.47437246355027807, + "flos": 55410771813120.0, + "grad_norm": 0.6980000025852627, + "language_loss": 0.55692458, + "learning_rate": 2.262323341259214e-06, + "loss": 0.63268882, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.02452087, + "step": 7890, + "time_per_iteration": 3.196005344390869 + }, + { + "auxiliary_loss_clip": 0.06440383, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.06280889, + "balance_loss_mlp": 0.01255929, + "epoch": 0.47443258680294603, + "flos": 23885278738560.0, + "grad_norm": 1.7863596191541609, + "language_loss": 0.65755105, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.73464775, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13366699, + "step": 7891, + "time_per_iteration": 2.5535497665405273 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01270649, + "balance_loss_clip": 0.06284909, + "balance_loss_mlp": 0.01256892, + "epoch": 0.474492710055614, + "flos": 21983666653440.0, + "grad_norm": 2.0785188787991133, + "language_loss": 0.70081401, + "learning_rate": 2.26155112714642e-06, + "loss": 0.77800196, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 7892, + "time_per_iteration": 2.512953519821167 + }, + { + "auxiliary_loss_clip": 0.06322581, + "auxiliary_loss_mlp": 0.01253797, + "balance_loss_clip": 0.06254438, + "balance_loss_mlp": 0.01251454, + "epoch": 0.47455283330828196, + "flos": 62577186837120.0, + "grad_norm": 0.7954751994073583, + "language_loss": 0.58515328, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.66091704, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.02337646, + "step": 7893, + "time_per_iteration": 3.2652807235717773 + }, + { + "auxiliary_loss_clip": 0.06435462, + "auxiliary_loss_mlp": 0.01271377, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01259498, + "epoch": 0.47461295656094993, + "flos": 12098478199680.0, + "grad_norm": 1.6548256161788057, + "language_loss": 0.77515912, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.85222745, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.11871338, + "step": 7894, + "time_per_iteration": 2.4962351322174072 + }, + { + "auxiliary_loss_clip": 0.06436545, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06277076, + "balance_loss_mlp": 0.01254883, + "epoch": 0.4746730798136179, + "flos": 20890522287360.0, + "grad_norm": 1.8932038979458137, + "language_loss": 0.75310624, + "learning_rate": 2.260392731628497e-06, + "loss": 0.83014762, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.1270752, + "step": 7895, + "time_per_iteration": 2.536651611328125 + }, + { + "auxiliary_loss_clip": 0.06438908, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06280944, + "balance_loss_mlp": 0.012559, + "epoch": 0.4747332030662859, + "flos": 19981008144000.0, + "grad_norm": 1.9186877339725528, + "language_loss": 0.824898, + "learning_rate": 2.260006580021429e-06, + "loss": 0.90196961, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12341309, + "step": 7896, + "time_per_iteration": 2.5451180934906006 + }, + { + "auxiliary_loss_clip": 0.06438936, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01254964, + "epoch": 0.4747933263189539, + "flos": 16039701244800.0, + "grad_norm": 4.910262672985542, + "language_loss": 0.76465023, + "learning_rate": 2.259620418554886e-06, + "loss": 0.84171617, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12689209, + "step": 7897, + "time_per_iteration": 2.529157876968384 + }, + { + "auxiliary_loss_clip": 0.06443989, + "auxiliary_loss_mlp": 0.012709, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257376, + "epoch": 0.47485344957162184, + "flos": 13960370649600.0, + "grad_norm": 1.9701771451271233, + "language_loss": 0.64411497, + "learning_rate": 2.25923424724351e-06, + "loss": 0.72126389, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13519287, + "step": 7898, + "time_per_iteration": 2.4861059188842773 + }, + { + "auxiliary_loss_clip": 0.06443477, + "auxiliary_loss_mlp": 0.01269988, + "balance_loss_clip": 0.0628337, + "balance_loss_mlp": 0.01256774, + "epoch": 0.4749135728242898, + "flos": 20455352507520.0, + "grad_norm": 2.55946780946792, + "language_loss": 0.70317411, + "learning_rate": 2.258848066101946e-06, + "loss": 0.78030878, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 7899, + "time_per_iteration": 2.5035181045532227 + }, + { + "auxiliary_loss_clip": 0.06438522, + "auxiliary_loss_mlp": 0.0127023, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4749736960769578, + "flos": 28957604849280.0, + "grad_norm": 1.797290129910965, + "language_loss": 0.68821597, + "learning_rate": 2.258461875144837e-06, + "loss": 0.76530349, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12536621, + "step": 7900, + "time_per_iteration": 2.638021469116211 + }, + { + "auxiliary_loss_clip": 0.06435557, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06277159, + "balance_loss_mlp": 0.01254216, + "epoch": 0.47503381932962574, + "flos": 31946407660800.0, + "grad_norm": 2.027602507157595, + "language_loss": 0.70583236, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.78287518, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.14501953, + "step": 7901, + "time_per_iteration": 2.6210362911224365 + }, + { + "auxiliary_loss_clip": 0.06438562, + "auxiliary_loss_mlp": 0.01269369, + "balance_loss_clip": 0.06280936, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4750939425822937, + "flos": 22133782442880.0, + "grad_norm": 1.48556411263083, + "language_loss": 0.73796129, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.81504059, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12542725, + "step": 7902, + "time_per_iteration": 2.5175282955169678 + }, + { + "auxiliary_loss_clip": 0.06431635, + "auxiliary_loss_mlp": 0.01269606, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.0125747, + "epoch": 0.47515406583496167, + "flos": 20856378948480.0, + "grad_norm": 3.332476837285125, + "language_loss": 0.69285202, + "learning_rate": 2.257303243526688e-06, + "loss": 0.76986444, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.12139893, + "step": 7903, + "time_per_iteration": 2.5292611122131348 + }, + { + "auxiliary_loss_clip": 0.06430157, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06276098, + "balance_loss_mlp": 0.01255015, + "epoch": 0.47521418908762963, + "flos": 17529679347840.0, + "grad_norm": 1.464561850634071, + "language_loss": 0.72526675, + "learning_rate": 2.256917013453848e-06, + "loss": 0.80223215, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1137085, + "step": 7904, + "time_per_iteration": 2.491152286529541 + }, + { + "auxiliary_loss_clip": 0.06430416, + "auxiliary_loss_mlp": 0.01265335, + "balance_loss_clip": 0.06276643, + "balance_loss_mlp": 0.01253706, + "epoch": 0.4752743123402976, + "flos": 20565874442880.0, + "grad_norm": 1.4968424405470007, + "language_loss": 0.86079156, + "learning_rate": 2.25653077363869e-06, + "loss": 0.93774903, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.11633301, + "step": 7905, + "time_per_iteration": 2.5502467155456543 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06274827, + "balance_loss_mlp": 0.01256146, + "epoch": 0.47533443559296557, + "flos": 26368025616000.0, + "grad_norm": 2.2485080153720425, + "language_loss": 0.82345891, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.90039825, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11749268, + "step": 7906, + "time_per_iteration": 2.5368199348449707 + }, + { + "auxiliary_loss_clip": 0.06321883, + "auxiliary_loss_mlp": 0.01254668, + "balance_loss_clip": 0.06254389, + "balance_loss_mlp": 0.01251897, + "epoch": 0.47539455884563353, + "flos": 65970118690560.0, + "grad_norm": 0.659791256047387, + "language_loss": 0.5900293, + "learning_rate": 2.255758264840002e-06, + "loss": 0.66579485, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.02775574, + "step": 7907, + "time_per_iteration": 3.279963254928589 + }, + { + "auxiliary_loss_clip": 0.06431986, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06276301, + "balance_loss_mlp": 0.01256721, + "epoch": 0.4754546820983015, + "flos": 17243828743680.0, + "grad_norm": 1.7704403118247245, + "language_loss": 0.81422615, + "learning_rate": 2.255371995885765e-06, + "loss": 0.89124084, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12756348, + "step": 7908, + "time_per_iteration": 2.5366125106811523 + }, + { + "auxiliary_loss_clip": 0.0643681, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01257258, + "epoch": 0.47551480535096946, + "flos": 19831563187200.0, + "grad_norm": 1.6522879253580633, + "language_loss": 0.74338585, + "learning_rate": 2.254985717247797e-06, + "loss": 0.82045496, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12841797, + "step": 7909, + "time_per_iteration": 2.5318603515625 + }, + { + "auxiliary_loss_clip": 0.06431618, + "auxiliary_loss_mlp": 0.01267166, + "balance_loss_clip": 0.0627422, + "balance_loss_mlp": 0.01255192, + "epoch": 0.4755749286036375, + "flos": 22170525258240.0, + "grad_norm": 1.5977935042114109, + "language_loss": 0.75628603, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.83327389, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11987305, + "step": 7910, + "time_per_iteration": 2.5529162883758545 + }, + { + "auxiliary_loss_clip": 0.0643287, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06276555, + "balance_loss_mlp": 0.01253488, + "epoch": 0.47563505185630545, + "flos": 21653945637120.0, + "grad_norm": 1.8732404582916444, + "language_loss": 0.7930491, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.8700273, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11474609, + "step": 7911, + "time_per_iteration": 2.5172598361968994 + }, + { + "auxiliary_loss_clip": 0.0643772, + "auxiliary_loss_mlp": 0.01268087, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.01253854, + "epoch": 0.4756951751089734, + "flos": 20634622318080.0, + "grad_norm": 1.775078995772379, + "language_loss": 0.76487613, + "learning_rate": 2.253826823377983e-06, + "loss": 0.8419342, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.14239502, + "step": 7912, + "time_per_iteration": 2.5627753734588623 + }, + { + "auxiliary_loss_clip": 0.06432701, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01260797, + "epoch": 0.4757552983616414, + "flos": 25855932188160.0, + "grad_norm": 1.3867905424321492, + "language_loss": 0.74749589, + "learning_rate": 2.253440506151569e-06, + "loss": 0.82455844, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12762451, + "step": 7913, + "time_per_iteration": 2.539555549621582 + }, + { + "auxiliary_loss_clip": 0.06434918, + "auxiliary_loss_mlp": 0.01269661, + "balance_loss_clip": 0.06277134, + "balance_loss_mlp": 0.01257418, + "epoch": 0.47581542161430934, + "flos": 18228841015680.0, + "grad_norm": 1.9858873239790236, + "language_loss": 0.72184181, + "learning_rate": 2.253054179314666e-06, + "loss": 0.79888761, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12249756, + "step": 7914, + "time_per_iteration": 3.9911863803863525 + }, + { + "auxiliary_loss_clip": 0.06440303, + "auxiliary_loss_mlp": 0.01270006, + "balance_loss_clip": 0.06281254, + "balance_loss_mlp": 0.0125737, + "epoch": 0.4758755448669773, + "flos": 21586162083840.0, + "grad_norm": 1.8571830642758371, + "language_loss": 0.65017748, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.72728062, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12628174, + "step": 7915, + "time_per_iteration": 3.94254207611084 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01257027, + "epoch": 0.47593566811964527, + "flos": 15236474405760.0, + "grad_norm": 1.6782618347522322, + "language_loss": 0.77118516, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.84816194, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 7916, + "time_per_iteration": 2.5071310997009277 + }, + { + "auxiliary_loss_clip": 0.0642941, + "auxiliary_loss_mlp": 0.01270125, + "balance_loss_clip": 0.06275692, + "balance_loss_mlp": 0.01258842, + "epoch": 0.47599579137231324, + "flos": 21549628903680.0, + "grad_norm": 2.1020342658546878, + "language_loss": 0.64506871, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.72206402, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.112854, + "step": 7917, + "time_per_iteration": 2.660997152328491 + }, + { + "auxiliary_loss_clip": 0.06322742, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06253887, + "balance_loss_mlp": 0.01264125, + "epoch": 0.4760559146249812, + "flos": 64573388582400.0, + "grad_norm": 0.81764582989578, + "language_loss": 0.65507567, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.73097479, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03042603, + "step": 7918, + "time_per_iteration": 3.185194492340088 + }, + { + "auxiliary_loss_clip": 0.06435688, + "auxiliary_loss_mlp": 0.01270072, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01257781, + "epoch": 0.47611603787764917, + "flos": 22239943966080.0, + "grad_norm": 1.5442115166230013, + "language_loss": 0.69113988, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.76819742, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12286377, + "step": 7919, + "time_per_iteration": 2.5625159740448 + }, + { + "auxiliary_loss_clip": 0.06440815, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06280257, + "balance_loss_mlp": 0.01254966, + "epoch": 0.47617616113031713, + "flos": 22785971097600.0, + "grad_norm": 1.4153562055419862, + "language_loss": 0.75135148, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.82842833, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.11914062, + "step": 7920, + "time_per_iteration": 2.606783866882324 + }, + { + "auxiliary_loss_clip": 0.06442747, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280643, + "balance_loss_mlp": 0.01255391, + "epoch": 0.4762362843829851, + "flos": 24140633656320.0, + "grad_norm": 1.5595930907743143, + "language_loss": 0.77291155, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.85002303, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13000488, + "step": 7921, + "time_per_iteration": 4.0331573486328125 + }, + { + "auxiliary_loss_clip": 0.06441253, + "auxiliary_loss_mlp": 0.01270198, + "balance_loss_clip": 0.06281719, + "balance_loss_mlp": 0.01256859, + "epoch": 0.47629640763565306, + "flos": 22458052944000.0, + "grad_norm": 1.5318798569312555, + "language_loss": 0.78402638, + "learning_rate": 2.249963220399845e-06, + "loss": 0.86114085, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13342285, + "step": 7922, + "time_per_iteration": 2.615656614303589 + }, + { + "auxiliary_loss_clip": 0.06443102, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06280392, + "balance_loss_mlp": 0.01253426, + "epoch": 0.4763565308883211, + "flos": 11186071090560.0, + "grad_norm": 1.9566034639967664, + "language_loss": 0.72915596, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.80625618, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1350708, + "step": 7923, + "time_per_iteration": 2.495023727416992 + }, + { + "auxiliary_loss_clip": 0.06440397, + "auxiliary_loss_mlp": 0.01267365, + "balance_loss_clip": 0.06280472, + "balance_loss_mlp": 0.01255772, + "epoch": 0.47641665414098905, + "flos": 22388634236160.0, + "grad_norm": 2.175648520453788, + "language_loss": 0.82023257, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.8973102, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.11590576, + "step": 7924, + "time_per_iteration": 2.5592448711395264 + }, + { + "auxiliary_loss_clip": 0.06449094, + "auxiliary_loss_mlp": 0.01271258, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01257191, + "epoch": 0.476476777393657, + "flos": 25053166546560.0, + "grad_norm": 1.6497722763363074, + "language_loss": 0.80566549, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.88286906, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14074707, + "step": 7925, + "time_per_iteration": 2.5462217330932617 + }, + { + "auxiliary_loss_clip": 0.06433398, + "auxiliary_loss_mlp": 0.01273204, + "balance_loss_clip": 0.06273591, + "balance_loss_mlp": 0.01259984, + "epoch": 0.476536900646325, + "flos": 27276994707840.0, + "grad_norm": 1.5163925310357687, + "language_loss": 0.72183931, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.79890537, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13214111, + "step": 7926, + "time_per_iteration": 4.022697448730469 + }, + { + "auxiliary_loss_clip": 0.06443252, + "auxiliary_loss_mlp": 0.01270757, + "balance_loss_clip": 0.062805, + "balance_loss_mlp": 0.01257304, + "epoch": 0.47659702389899294, + "flos": 25308437610240.0, + "grad_norm": 2.540030120332383, + "language_loss": 0.69248974, + "learning_rate": 2.248031062546432e-06, + "loss": 0.76962984, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13452148, + "step": 7927, + "time_per_iteration": 2.651005744934082 + }, + { + "auxiliary_loss_clip": 0.06432809, + "auxiliary_loss_mlp": 0.01274998, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01262928, + "epoch": 0.4766571471516609, + "flos": 25999716994560.0, + "grad_norm": 1.8555909912878064, + "language_loss": 0.68153882, + "learning_rate": 2.247644602701045e-06, + "loss": 0.75861686, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12072754, + "step": 7928, + "time_per_iteration": 2.6001169681549072 + }, + { + "auxiliary_loss_clip": 0.06439018, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277569, + "balance_loss_mlp": 0.01254497, + "epoch": 0.4767172704043289, + "flos": 16037395257600.0, + "grad_norm": 2.030081429010121, + "language_loss": 0.79402888, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.87108904, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12506104, + "step": 7929, + "time_per_iteration": 2.4979782104492188 + }, + { + "auxiliary_loss_clip": 0.06434054, + "auxiliary_loss_mlp": 0.0127525, + "balance_loss_clip": 0.06276359, + "balance_loss_mlp": 0.01263496, + "epoch": 0.47677739365699684, + "flos": 39244113233280.0, + "grad_norm": 1.8073767988538123, + "language_loss": 0.67109072, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.74818379, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11749268, + "step": 7930, + "time_per_iteration": 2.64865779876709 + }, + { + "auxiliary_loss_clip": 0.06440657, + "auxiliary_loss_mlp": 0.01272697, + "balance_loss_clip": 0.06280986, + "balance_loss_mlp": 0.01260484, + "epoch": 0.4768375169096648, + "flos": 24724745268480.0, + "grad_norm": 1.7506463735046407, + "language_loss": 0.79864836, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.87578189, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12207031, + "step": 7931, + "time_per_iteration": 2.5824391841888428 + }, + { + "auxiliary_loss_clip": 0.06435428, + "auxiliary_loss_mlp": 0.01273232, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.01260203, + "epoch": 0.47689764016233277, + "flos": 22535270081280.0, + "grad_norm": 2.3707401208689753, + "language_loss": 0.76826382, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13043213, + "step": 7932, + "time_per_iteration": 2.510439157485962 + }, + { + "auxiliary_loss_clip": 0.06434679, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06279778, + "balance_loss_mlp": 0.01266101, + "epoch": 0.47695776341500074, + "flos": 15125742835200.0, + "grad_norm": 3.7494408598150946, + "language_loss": 0.79909194, + "learning_rate": 2.245712162906593e-06, + "loss": 0.87623, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.13012695, + "step": 7933, + "time_per_iteration": 2.5868406295776367 + }, + { + "auxiliary_loss_clip": 0.06440616, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06276172, + "balance_loss_mlp": 0.01256889, + "epoch": 0.4770178866676687, + "flos": 14683319677440.0, + "grad_norm": 1.845903856635024, + "language_loss": 0.74363738, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.8207491, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13677979, + "step": 7934, + "time_per_iteration": 2.467625141143799 + }, + { + "auxiliary_loss_clip": 0.06439498, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278646, + "balance_loss_mlp": 0.01256213, + "epoch": 0.47707800992033667, + "flos": 22572264458880.0, + "grad_norm": 2.1751877197221847, + "language_loss": 0.80426806, + "learning_rate": 2.244939121664211e-06, + "loss": 0.88135481, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7935, + "time_per_iteration": 2.57150936126709 + }, + { + "auxiliary_loss_clip": 0.06443004, + "auxiliary_loss_mlp": 0.01271494, + "balance_loss_clip": 0.06275547, + "balance_loss_mlp": 0.01257249, + "epoch": 0.4771381331730047, + "flos": 30925868457600.0, + "grad_norm": 1.696374515888555, + "language_loss": 0.71442336, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7915684, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14245605, + "step": 7936, + "time_per_iteration": 2.577134609222412 + }, + { + "auxiliary_loss_clip": 0.06440726, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06278887, + "balance_loss_mlp": 0.01254593, + "epoch": 0.47719825642567265, + "flos": 25745955304320.0, + "grad_norm": 1.9394747057802306, + "language_loss": 0.68651855, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.76359951, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.12774658, + "step": 7937, + "time_per_iteration": 2.5523571968078613 + }, + { + "auxiliary_loss_clip": 0.06332788, + "auxiliary_loss_mlp": 0.01255518, + "balance_loss_clip": 0.06264147, + "balance_loss_mlp": 0.01252959, + "epoch": 0.4772583796783406, + "flos": 66376344084480.0, + "grad_norm": 0.7063710164794027, + "language_loss": 0.56256598, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.63844901, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02558899, + "step": 7938, + "time_per_iteration": 3.3101401329040527 + }, + { + "auxiliary_loss_clip": 0.06435397, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01252927, + "epoch": 0.4773185029310086, + "flos": 22057068430080.0, + "grad_norm": 1.5498541545702798, + "language_loss": 0.89232612, + "learning_rate": 2.243392927839317e-06, + "loss": 0.96934634, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.13702393, + "step": 7939, + "time_per_iteration": 2.559797525405884 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06277393, + "balance_loss_mlp": 0.01256239, + "epoch": 0.47737862618367655, + "flos": 16733496251520.0, + "grad_norm": 2.4258721196632456, + "language_loss": 0.77298427, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.85001838, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12249756, + "step": 7940, + "time_per_iteration": 2.5268869400024414 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01269812, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01257373, + "epoch": 0.4774387494363445, + "flos": 19615508634240.0, + "grad_norm": 1.6559533080399789, + "language_loss": 0.85386801, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.930875, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12432861, + "step": 7941, + "time_per_iteration": 2.547070264816284 + }, + { + "auxiliary_loss_clip": 0.06437483, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06277451, + "balance_loss_mlp": 0.01257965, + "epoch": 0.4774988726890125, + "flos": 16659507496320.0, + "grad_norm": 1.9070361015512296, + "language_loss": 0.76308775, + "learning_rate": 2.24223318550976e-06, + "loss": 0.84016657, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12420654, + "step": 7942, + "time_per_iteration": 2.4842329025268555 + }, + { + "auxiliary_loss_clip": 0.06440963, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06282113, + "balance_loss_mlp": 0.01253601, + "epoch": 0.47755899594168044, + "flos": 20491843760640.0, + "grad_norm": 1.6294214929971118, + "language_loss": 0.64313745, + "learning_rate": 2.241846586342682e-06, + "loss": 0.72020721, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12402344, + "step": 7943, + "time_per_iteration": 2.5384066104888916 + }, + { + "auxiliary_loss_clip": 0.06444484, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06280033, + "balance_loss_mlp": 0.01253493, + "epoch": 0.4776191191943484, + "flos": 21659228444160.0, + "grad_norm": 1.6943023581153507, + "language_loss": 0.73866045, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.8157779, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13781738, + "step": 7944, + "time_per_iteration": 2.5201148986816406 + }, + { + "auxiliary_loss_clip": 0.06447009, + "auxiliary_loss_mlp": 0.01271608, + "balance_loss_clip": 0.06287117, + "balance_loss_mlp": 0.01258459, + "epoch": 0.4776792424470164, + "flos": 18776125958400.0, + "grad_norm": 2.2429214657199257, + "language_loss": 0.68437827, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.76156443, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13153076, + "step": 7945, + "time_per_iteration": 2.5126469135284424 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_clip": 0.06280819, + "balance_loss_mlp": 0.01257577, + "epoch": 0.47773936569968434, + "flos": 29723543821440.0, + "grad_norm": 1.8191434389659598, + "language_loss": 0.75203103, + "learning_rate": 2.240686733875009e-06, + "loss": 0.8291347, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12628174, + "step": 7946, + "time_per_iteration": 2.5952818393707275 + }, + { + "auxiliary_loss_clip": 0.06450987, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06288904, + "balance_loss_mlp": 0.0125368, + "epoch": 0.4777994889523523, + "flos": 24798650169600.0, + "grad_norm": 2.1264871549136566, + "language_loss": 0.79598629, + "learning_rate": 2.240300098112506e-06, + "loss": 0.87316352, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.13043213, + "step": 7947, + "time_per_iteration": 2.561429023742676 + }, + { + "auxiliary_loss_clip": 0.06437, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06282562, + "balance_loss_mlp": 0.01255302, + "epoch": 0.47785961220502027, + "flos": 17863928484480.0, + "grad_norm": 1.6733844414372485, + "language_loss": 0.73571151, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.81276667, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13220215, + "step": 7948, + "time_per_iteration": 2.5309975147247314 + }, + { + "auxiliary_loss_clip": 0.06442553, + "auxiliary_loss_mlp": 0.01267736, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01253848, + "epoch": 0.4779197354576883, + "flos": 20272770460800.0, + "grad_norm": 2.2305312131568256, + "language_loss": 0.78282905, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.85993195, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13891602, + "step": 7949, + "time_per_iteration": 2.5135691165924072 + }, + { + "auxiliary_loss_clip": 0.06441014, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06285359, + "balance_loss_mlp": 0.01251849, + "epoch": 0.47797985871035625, + "flos": 17062420654080.0, + "grad_norm": 2.4211239692864686, + "language_loss": 0.75134766, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.82839787, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12164307, + "step": 7950, + "time_per_iteration": 2.5256588459014893 + }, + { + "auxiliary_loss_clip": 0.06439517, + "auxiliary_loss_mlp": 0.01271424, + "balance_loss_clip": 0.0628176, + "balance_loss_mlp": 0.01258668, + "epoch": 0.4780399819630242, + "flos": 31366530679680.0, + "grad_norm": 1.6557560470716002, + "language_loss": 0.744519, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.82162845, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12756348, + "step": 7951, + "time_per_iteration": 2.6257662773132324 + }, + { + "auxiliary_loss_clip": 0.0644564, + "auxiliary_loss_mlp": 0.0126871, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01255925, + "epoch": 0.4781001052156922, + "flos": 24906488774400.0, + "grad_norm": 2.0941094174335, + "language_loss": 0.80880862, + "learning_rate": 2.238366782910174e-06, + "loss": 0.88595212, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12786865, + "step": 7952, + "time_per_iteration": 2.6039650440216064 + }, + { + "auxiliary_loss_clip": 0.06449462, + "auxiliary_loss_mlp": 0.01273751, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01259684, + "epoch": 0.47816022846836015, + "flos": 18703688503680.0, + "grad_norm": 1.7383850677064194, + "language_loss": 0.78965735, + "learning_rate": 2.23798009269438e-06, + "loss": 0.86688948, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14068604, + "step": 7953, + "time_per_iteration": 3.9394986629486084 + }, + { + "auxiliary_loss_clip": 0.0644647, + "auxiliary_loss_mlp": 0.0126971, + "balance_loss_clip": 0.0628321, + "balance_loss_mlp": 0.01256793, + "epoch": 0.4782203517210281, + "flos": 11981289864960.0, + "grad_norm": 2.1105030234958733, + "language_loss": 0.84721971, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.92438149, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.12921143, + "step": 7954, + "time_per_iteration": 3.9196231365203857 + }, + { + "auxiliary_loss_clip": 0.06440185, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06283759, + "balance_loss_mlp": 0.01255282, + "epoch": 0.4782804749736961, + "flos": 20819761914240.0, + "grad_norm": 1.4881886911999394, + "language_loss": 0.70481235, + "learning_rate": 2.237206685204768e-06, + "loss": 0.78189409, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1270752, + "step": 7955, + "time_per_iteration": 2.5434484481811523 + }, + { + "auxiliary_loss_clip": 0.064454, + "auxiliary_loss_mlp": 0.01270242, + "balance_loss_clip": 0.06284527, + "balance_loss_mlp": 0.01257326, + "epoch": 0.47834059822636404, + "flos": 23846816914560.0, + "grad_norm": 1.553979149808007, + "language_loss": 0.823044, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.90020043, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12902832, + "step": 7956, + "time_per_iteration": 2.545602560043335 + }, + { + "auxiliary_loss_clip": 0.06441168, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06284995, + "balance_loss_mlp": 0.01255627, + "epoch": 0.478400721479032, + "flos": 22639670668800.0, + "grad_norm": 1.9591153371347299, + "language_loss": 0.85127819, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.92838925, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.14300537, + "step": 7957, + "time_per_iteration": 2.548643112182617 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01269143, + "balance_loss_clip": 0.06285611, + "balance_loss_mlp": 0.01257001, + "epoch": 0.4784608447317, + "flos": 19361118038400.0, + "grad_norm": 7.050300940807432, + "language_loss": 0.79869133, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.87579882, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12139893, + "step": 7958, + "time_per_iteration": 2.5078237056732178 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06283723, + "balance_loss_mlp": 0.0125534, + "epoch": 0.47852096798436794, + "flos": 24027386463360.0, + "grad_norm": 1.6951891176109464, + "language_loss": 0.82802176, + "learning_rate": 2.235659762404047e-06, + "loss": 0.90512896, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1361084, + "step": 7959, + "time_per_iteration": 2.565302610397339 + }, + { + "auxiliary_loss_clip": 0.06438372, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06285324, + "balance_loss_mlp": 0.01255615, + "epoch": 0.4785810912370359, + "flos": 25673559776640.0, + "grad_norm": 2.330976037710063, + "language_loss": 0.73464501, + "learning_rate": 2.235273009326599e-06, + "loss": 0.81169969, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1149292, + "step": 7960, + "time_per_iteration": 4.027269124984741 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01270036, + "balance_loss_clip": 0.0628148, + "balance_loss_mlp": 0.01258014, + "epoch": 0.47864121448970387, + "flos": 21438226500480.0, + "grad_norm": 3.172971837567245, + "language_loss": 0.77372915, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.85079503, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12036133, + "step": 7961, + "time_per_iteration": 2.5147969722747803 + }, + { + "auxiliary_loss_clip": 0.06435739, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01255269, + "epoch": 0.47870133774237184, + "flos": 16149468493440.0, + "grad_norm": 1.5337652867811775, + "language_loss": 0.78017688, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.85721302, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12597656, + "step": 7962, + "time_per_iteration": 2.513148307800293 + }, + { + "auxiliary_loss_clip": 0.06441396, + "auxiliary_loss_mlp": 0.01270097, + "balance_loss_clip": 0.06281849, + "balance_loss_mlp": 0.01257646, + "epoch": 0.47876146099503986, + "flos": 26914094674560.0, + "grad_norm": 1.8277818369463197, + "language_loss": 0.65211046, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.7292254, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12457275, + "step": 7963, + "time_per_iteration": 2.601811647415161 + }, + { + "auxiliary_loss_clip": 0.06439337, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.06280507, + "balance_loss_mlp": 0.01253621, + "epoch": 0.4788215842477078, + "flos": 45342470989440.0, + "grad_norm": 2.309935013710649, + "language_loss": 0.77810884, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.85516727, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12890625, + "step": 7964, + "time_per_iteration": 2.747879981994629 + }, + { + "auxiliary_loss_clip": 0.06446981, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06283239, + "balance_loss_mlp": 0.01257218, + "epoch": 0.4788817075003758, + "flos": 22243801253760.0, + "grad_norm": 1.6568781202078557, + "language_loss": 0.76541996, + "learning_rate": 2.233339110409044e-06, + "loss": 0.84260774, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14587402, + "step": 7965, + "time_per_iteration": 2.562894344329834 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06281182, + "balance_loss_mlp": 0.01256434, + "epoch": 0.47894183075304375, + "flos": 16476631960320.0, + "grad_norm": 1.6972134667517975, + "language_loss": 0.74819887, + "learning_rate": 2.232952304022137e-06, + "loss": 0.82530153, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12237549, + "step": 7966, + "time_per_iteration": 4.023793697357178 + }, + { + "auxiliary_loss_clip": 0.06437664, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06279117, + "balance_loss_mlp": 0.01253033, + "epoch": 0.4790019540057117, + "flos": 24290036686080.0, + "grad_norm": 1.5237416858661557, + "language_loss": 0.73335361, + "learning_rate": 2.232565488801655e-06, + "loss": 0.81038582, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12518311, + "step": 7967, + "time_per_iteration": 2.586228847503662 + }, + { + "auxiliary_loss_clip": 0.06429637, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.01254825, + "epoch": 0.4790620772583797, + "flos": 25673601703680.0, + "grad_norm": 2.2388113154567058, + "language_loss": 0.79254079, + "learning_rate": 2.232178664762267e-06, + "loss": 0.86951417, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12896729, + "step": 7968, + "time_per_iteration": 2.569835901260376 + }, + { + "auxiliary_loss_clip": 0.06330545, + "auxiliary_loss_mlp": 0.01255481, + "balance_loss_clip": 0.06260878, + "balance_loss_mlp": 0.01252947, + "epoch": 0.47912220051104765, + "flos": 69451168711680.0, + "grad_norm": 0.7701358383106056, + "language_loss": 0.62163401, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.69749427, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02534485, + "step": 7969, + "time_per_iteration": 3.2898826599121094 + }, + { + "auxiliary_loss_clip": 0.06435778, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06281342, + "balance_loss_mlp": 0.012529, + "epoch": 0.4791823237637156, + "flos": 24175531681920.0, + "grad_norm": 1.7909857243287752, + "language_loss": 0.77847564, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.85549259, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13006592, + "step": 7970, + "time_per_iteration": 2.5170607566833496 + }, + { + "auxiliary_loss_clip": 0.06435491, + "auxiliary_loss_mlp": 0.01267513, + "balance_loss_clip": 0.06276551, + "balance_loss_mlp": 0.0125384, + "epoch": 0.4792424470163836, + "flos": 24757966212480.0, + "grad_norm": 1.6160167990193877, + "language_loss": 0.71182537, + "learning_rate": 2.231018139877349e-06, + "loss": 0.78885543, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13665771, + "step": 7971, + "time_per_iteration": 2.572124719619751 + }, + { + "auxiliary_loss_clip": 0.06436221, + "auxiliary_loss_mlp": 0.01271919, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01258836, + "epoch": 0.47930257026905154, + "flos": 23264550092160.0, + "grad_norm": 1.2950674857674533, + "language_loss": 0.80144143, + "learning_rate": 2.230631280709021e-06, + "loss": 0.87852287, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1307373, + "step": 7972, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06281324, + "balance_loss_mlp": 0.01256392, + "epoch": 0.4793626935217195, + "flos": 14069299357440.0, + "grad_norm": 2.062531710859889, + "language_loss": 0.70572007, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7828514, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13934326, + "step": 7973, + "time_per_iteration": 2.5338237285614014 + }, + { + "auxiliary_loss_clip": 0.064371, + "auxiliary_loss_mlp": 0.01270261, + "balance_loss_clip": 0.06283109, + "balance_loss_mlp": 0.0125806, + "epoch": 0.4794228167743875, + "flos": 21805319237760.0, + "grad_norm": 1.7273933233655367, + "language_loss": 0.79198468, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.86905837, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12200928, + "step": 7974, + "time_per_iteration": 2.5069854259490967 + }, + { + "auxiliary_loss_clip": 0.06339005, + "auxiliary_loss_mlp": 0.01258702, + "balance_loss_clip": 0.06269643, + "balance_loss_mlp": 0.01255866, + "epoch": 0.47948294002705544, + "flos": 66989022739200.0, + "grad_norm": 0.7443790840370731, + "language_loss": 0.53920376, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.61518085, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02832031, + "step": 7975, + "time_per_iteration": 3.2263216972351074 + }, + { + "auxiliary_loss_clip": 0.06450166, + "auxiliary_loss_mlp": 0.01269981, + "balance_loss_clip": 0.06283702, + "balance_loss_mlp": 0.0125465, + "epoch": 0.47954306327972346, + "flos": 12427444529280.0, + "grad_norm": 1.9824704830592612, + "language_loss": 0.90397954, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.98118103, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.15313721, + "step": 7976, + "time_per_iteration": 2.5806965827941895 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06284519, + "balance_loss_mlp": 0.01257629, + "epoch": 0.4796031865323914, + "flos": 18366630255360.0, + "grad_norm": 3.7288296944586166, + "language_loss": 0.73905623, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.81627262, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.15209961, + "step": 7977, + "time_per_iteration": 2.5562849044799805 + }, + { + "auxiliary_loss_clip": 0.06437217, + "auxiliary_loss_mlp": 0.01268705, + "balance_loss_clip": 0.06283021, + "balance_loss_mlp": 0.01255741, + "epoch": 0.4796633097850594, + "flos": 21841517001600.0, + "grad_norm": 1.607227573724713, + "language_loss": 0.78873986, + "learning_rate": 2.228309942555734e-06, + "loss": 0.86579907, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12976074, + "step": 7978, + "time_per_iteration": 2.558842420578003 + }, + { + "auxiliary_loss_clip": 0.06440634, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01255214, + "epoch": 0.47972343303772735, + "flos": 23443526413440.0, + "grad_norm": 1.9276236664860738, + "language_loss": 0.89800453, + "learning_rate": 2.22792302247656e-06, + "loss": 0.97510386, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.14099121, + "step": 7979, + "time_per_iteration": 2.5952987670898438 + }, + { + "auxiliary_loss_clip": 0.06446249, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06283665, + "balance_loss_mlp": 0.01256378, + "epoch": 0.4797835562903953, + "flos": 24906698409600.0, + "grad_norm": 1.4562164603157606, + "language_loss": 0.7704469, + "learning_rate": 2.227536093754523e-06, + "loss": 0.8476193, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14605713, + "step": 7980, + "time_per_iteration": 2.5736522674560547 + }, + { + "auxiliary_loss_clip": 0.06447264, + "auxiliary_loss_mlp": 0.01273404, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01258938, + "epoch": 0.4798436795430633, + "flos": 35051644120320.0, + "grad_norm": 1.875578547391537, + "language_loss": 0.71508431, + "learning_rate": 2.227149156404295e-06, + "loss": 0.79229099, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14459229, + "step": 7981, + "time_per_iteration": 2.6367290019989014 + }, + { + "auxiliary_loss_clip": 0.06439552, + "auxiliary_loss_mlp": 0.01273941, + "balance_loss_clip": 0.06281938, + "balance_loss_mlp": 0.01258998, + "epoch": 0.47990380279573125, + "flos": 20595699296640.0, + "grad_norm": 1.7763359166784585, + "language_loss": 0.70155972, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.77869463, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.14935303, + "step": 7982, + "time_per_iteration": 2.5258874893188477 + }, + { + "auxiliary_loss_clip": 0.06432236, + "auxiliary_loss_mlp": 0.0126906, + "balance_loss_clip": 0.06278554, + "balance_loss_mlp": 0.01257079, + "epoch": 0.4799639260483992, + "flos": 26366600096640.0, + "grad_norm": 1.7437778110304778, + "language_loss": 0.71608925, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.79310226, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11975098, + "step": 7983, + "time_per_iteration": 2.568826913833618 + }, + { + "auxiliary_loss_clip": 0.06340544, + "auxiliary_loss_mlp": 0.01252804, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01249972, + "epoch": 0.4800240493010672, + "flos": 70999371002880.0, + "grad_norm": 0.765879442061108, + "language_loss": 0.59357727, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.66951072, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.02828979, + "step": 7984, + "time_per_iteration": 3.1084651947021484 + }, + { + "auxiliary_loss_clip": 0.0643955, + "auxiliary_loss_mlp": 0.01275134, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01262152, + "epoch": 0.48008417255373514, + "flos": 17091406967040.0, + "grad_norm": 1.5773823669430012, + "language_loss": 0.67127079, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.74841756, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12988281, + "step": 7985, + "time_per_iteration": 2.4906041622161865 + }, + { + "auxiliary_loss_clip": 0.06439713, + "auxiliary_loss_mlp": 0.01270507, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256458, + "epoch": 0.4801442958064031, + "flos": 15418762963200.0, + "grad_norm": 1.6902399231491212, + "language_loss": 0.70749509, + "learning_rate": 2.225214340743835e-06, + "loss": 0.78459728, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14056396, + "step": 7986, + "time_per_iteration": 2.52093243598938 + }, + { + "auxiliary_loss_clip": 0.06445119, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06282695, + "balance_loss_mlp": 0.0125972, + "epoch": 0.4802044190590711, + "flos": 11478546167040.0, + "grad_norm": 1.9459651571320913, + "language_loss": 0.79178715, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.86897534, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13983154, + "step": 7987, + "time_per_iteration": 2.498640537261963 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01274239, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01260029, + "epoch": 0.48026454231173904, + "flos": 20955874072320.0, + "grad_norm": 2.568897435463935, + "language_loss": 0.75366008, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.83082712, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.14215088, + "step": 7988, + "time_per_iteration": 2.516512632369995 + }, + { + "auxiliary_loss_clip": 0.0644449, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06284034, + "balance_loss_mlp": 0.01254651, + "epoch": 0.48032466556440706, + "flos": 20454220477440.0, + "grad_norm": 2.121657383550553, + "language_loss": 0.79781222, + "learning_rate": 2.224053348748365e-06, + "loss": 0.87493527, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13140869, + "step": 7989, + "time_per_iteration": 2.5021252632141113 + }, + { + "auxiliary_loss_clip": 0.06450642, + "auxiliary_loss_mlp": 0.01272628, + "balance_loss_clip": 0.0628516, + "balance_loss_mlp": 0.01259277, + "epoch": 0.480384788817075, + "flos": 37129507269120.0, + "grad_norm": 1.6027553338262992, + "language_loss": 0.73628318, + "learning_rate": 2.223666334404724e-06, + "loss": 0.81351584, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13360596, + "step": 7990, + "time_per_iteration": 2.678316593170166 + }, + { + "auxiliary_loss_clip": 0.06340674, + "auxiliary_loss_mlp": 0.01254539, + "balance_loss_clip": 0.06272323, + "balance_loss_mlp": 0.01252124, + "epoch": 0.480444912069743, + "flos": 69572103281280.0, + "grad_norm": 0.7463246314152452, + "language_loss": 0.59028065, + "learning_rate": 2.223279311579633e-06, + "loss": 0.66623276, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02412415, + "step": 7991, + "time_per_iteration": 3.2123708724975586 + }, + { + "auxiliary_loss_clip": 0.06440669, + "auxiliary_loss_mlp": 0.0127166, + "balance_loss_clip": 0.06280738, + "balance_loss_mlp": 0.01258493, + "epoch": 0.48050503532241096, + "flos": 29829453782400.0, + "grad_norm": 1.8077991766436714, + "language_loss": 0.67425305, + "learning_rate": 2.222892280287768e-06, + "loss": 0.75137639, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1317749, + "step": 7992, + "time_per_iteration": 4.022457599639893 + }, + { + "auxiliary_loss_clip": 0.06441684, + "auxiliary_loss_mlp": 0.01270903, + "balance_loss_clip": 0.06280079, + "balance_loss_mlp": 0.01257289, + "epoch": 0.4805651585750789, + "flos": 23954865154560.0, + "grad_norm": 1.520335815005364, + "language_loss": 0.76567221, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.84279805, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13616943, + "step": 7993, + "time_per_iteration": 2.5975513458251953 + }, + { + "auxiliary_loss_clip": 0.0643717, + "auxiliary_loss_mlp": 0.012705, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01257101, + "epoch": 0.4806252818277469, + "flos": 25672385819520.0, + "grad_norm": 1.5304271246014225, + "language_loss": 0.78575444, + "learning_rate": 2.222118192362422e-06, + "loss": 0.86283118, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1340332, + "step": 7994, + "time_per_iteration": 3.9770989418029785 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01268981, + "balance_loss_clip": 0.06282856, + "balance_loss_mlp": 0.01255284, + "epoch": 0.48068540508041485, + "flos": 13157059956480.0, + "grad_norm": 1.7612496141579397, + "language_loss": 0.80023497, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.87733817, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13702393, + "step": 7995, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.06436922, + "auxiliary_loss_mlp": 0.01271915, + "balance_loss_clip": 0.06281693, + "balance_loss_mlp": 0.01259499, + "epoch": 0.4807455283330828, + "flos": 21182787728640.0, + "grad_norm": 1.7014068364920145, + "language_loss": 0.82857656, + "learning_rate": 2.2213440707461e-06, + "loss": 0.90566498, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12408447, + "step": 7996, + "time_per_iteration": 2.5223636627197266 + }, + { + "auxiliary_loss_clip": 0.06437848, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.0126104, + "epoch": 0.4808056515857508, + "flos": 12280850611200.0, + "grad_norm": 2.0553444119055095, + "language_loss": 0.81048906, + "learning_rate": 2.220956997340516e-06, + "loss": 0.88760751, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12957764, + "step": 7997, + "time_per_iteration": 2.5387723445892334 + }, + { + "auxiliary_loss_clip": 0.06439243, + "auxiliary_loss_mlp": 0.01272881, + "balance_loss_clip": 0.06278609, + "balance_loss_mlp": 0.01258886, + "epoch": 0.48086577483841875, + "flos": 24832835435520.0, + "grad_norm": 1.673774189345091, + "language_loss": 0.72584945, + "learning_rate": 2.220569915556221e-06, + "loss": 0.80297071, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.13989258, + "step": 7998, + "time_per_iteration": 2.5332131385803223 + }, + { + "auxiliary_loss_clip": 0.06438513, + "auxiliary_loss_mlp": 0.0127211, + "balance_loss_clip": 0.06282588, + "balance_loss_mlp": 0.01258931, + "epoch": 0.4809258980910867, + "flos": 24472786440960.0, + "grad_norm": 1.7584112558628078, + "language_loss": 0.71207035, + "learning_rate": 2.220182825407892e-06, + "loss": 0.78917658, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1317749, + "step": 7999, + "time_per_iteration": 2.5675172805786133 + }, + { + "auxiliary_loss_clip": 0.06447413, + "auxiliary_loss_mlp": 0.01268559, + "balance_loss_clip": 0.06285158, + "balance_loss_mlp": 0.01254581, + "epoch": 0.4809860213437547, + "flos": 21222465436800.0, + "grad_norm": 1.5803850534596136, + "language_loss": 0.71622467, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.79338437, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13983154, + "step": 8000, + "time_per_iteration": 4.0574305057525635 + }, + { + "auxiliary_loss_clip": 0.06440975, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 0.06282955, + "balance_loss_mlp": 0.01253558, + "epoch": 0.48104614459642264, + "flos": 37640929864320.0, + "grad_norm": 1.3783876991224597, + "language_loss": 0.75060636, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.82768357, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.13189697, + "step": 8001, + "time_per_iteration": 2.6750619411468506 + }, + { + "auxiliary_loss_clip": 0.06444116, + "auxiliary_loss_mlp": 0.01269598, + "balance_loss_clip": 0.06285578, + "balance_loss_mlp": 0.0125667, + "epoch": 0.48110626784909066, + "flos": 18412093895040.0, + "grad_norm": 3.3850625220280066, + "language_loss": 0.81721932, + "learning_rate": 2.219021504925493e-06, + "loss": 0.89435649, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12921143, + "step": 8002, + "time_per_iteration": 2.537611961364746 + }, + { + "auxiliary_loss_clip": 0.06444092, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06282309, + "balance_loss_mlp": 0.0125232, + "epoch": 0.48116639110175863, + "flos": 28447481992320.0, + "grad_norm": 1.6717054522334394, + "language_loss": 0.71586967, + "learning_rate": 2.218634381467819e-06, + "loss": 0.79297119, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13739014, + "step": 8003, + "time_per_iteration": 2.586836576461792 + }, + { + "auxiliary_loss_clip": 0.06435338, + "auxiliary_loss_mlp": 0.01268946, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.01256375, + "epoch": 0.4812265143544266, + "flos": 21731582044800.0, + "grad_norm": 1.5740971137450945, + "language_loss": 0.82286322, + "learning_rate": 2.218247249719507e-06, + "loss": 0.89990604, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12561035, + "step": 8004, + "time_per_iteration": 2.5606155395507812 + }, + { + "auxiliary_loss_clip": 0.06454347, + "auxiliary_loss_mlp": 0.01272857, + "balance_loss_clip": 0.06285338, + "balance_loss_mlp": 0.01258004, + "epoch": 0.48128663760709456, + "flos": 13229707046400.0, + "grad_norm": 2.0390359670143465, + "language_loss": 0.77871376, + "learning_rate": 2.217860109695239e-06, + "loss": 0.85598582, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14837646, + "step": 8005, + "time_per_iteration": 2.47816801071167 + }, + { + "auxiliary_loss_clip": 0.06444031, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01252902, + "epoch": 0.4813467608597625, + "flos": 24250317050880.0, + "grad_norm": 8.997763816911675, + "language_loss": 0.71145892, + "learning_rate": 2.217472961409692e-06, + "loss": 0.78855699, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12866211, + "step": 8006, + "time_per_iteration": 3.998465061187744 + }, + { + "auxiliary_loss_clip": 0.06443979, + "auxiliary_loss_mlp": 0.0126724, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01253502, + "epoch": 0.4814068841124305, + "flos": 27486131299200.0, + "grad_norm": 1.774717747938, + "language_loss": 0.7057631, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.78287524, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13726807, + "step": 8007, + "time_per_iteration": 2.6010959148406982 + }, + { + "auxiliary_loss_clip": 0.06445048, + "auxiliary_loss_mlp": 0.01270091, + "balance_loss_clip": 0.06283326, + "balance_loss_mlp": 0.01256382, + "epoch": 0.48146700736509845, + "flos": 19578933527040.0, + "grad_norm": 1.7543289086675633, + "language_loss": 0.72215438, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.79930574, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.137146, + "step": 8008, + "time_per_iteration": 2.5119597911834717 + }, + { + "auxiliary_loss_clip": 0.064485, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06287649, + "balance_loss_mlp": 0.01256699, + "epoch": 0.4815271306177664, + "flos": 20633448360960.0, + "grad_norm": 2.3493781090087427, + "language_loss": 0.61680824, + "learning_rate": 2.216311467132199e-06, + "loss": 0.6939944, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13421631, + "step": 8009, + "time_per_iteration": 2.531614303588867 + }, + { + "auxiliary_loss_clip": 0.06337314, + "auxiliary_loss_mlp": 0.01256915, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01254566, + "epoch": 0.4815872538704344, + "flos": 67710168904320.0, + "grad_norm": 0.8824544242806498, + "language_loss": 0.61164761, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.68758988, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.0234375, + "step": 8010, + "time_per_iteration": 3.1565909385681152 + }, + { + "auxiliary_loss_clip": 0.06445675, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01256364, + "epoch": 0.48164737712310235, + "flos": 22827451668480.0, + "grad_norm": 1.6746394307020662, + "language_loss": 0.73637664, + "learning_rate": 2.215537096576639e-06, + "loss": 0.81353462, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1373291, + "step": 8011, + "time_per_iteration": 2.6046555042266846 + }, + { + "auxiliary_loss_clip": 0.0643819, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01257887, + "epoch": 0.4817075003757703, + "flos": 23740865026560.0, + "grad_norm": 1.8215201759984196, + "language_loss": 0.79494172, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.87202752, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.125, + "step": 8012, + "time_per_iteration": 2.5538861751556396 + }, + { + "auxiliary_loss_clip": 0.06444636, + "auxiliary_loss_mlp": 0.0127321, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01259501, + "epoch": 0.4817676236284383, + "flos": 28190282284800.0, + "grad_norm": 1.6047815948624113, + "language_loss": 0.73606604, + "learning_rate": 2.214762693328326e-06, + "loss": 0.81324452, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1373291, + "step": 8013, + "time_per_iteration": 2.6944220066070557 + }, + { + "auxiliary_loss_clip": 0.06441531, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06285915, + "balance_loss_mlp": 0.01253094, + "epoch": 0.48182774688110624, + "flos": 17097360606720.0, + "grad_norm": 1.8755216355849496, + "language_loss": 0.91141838, + "learning_rate": 2.214375479481094e-06, + "loss": 0.98848319, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.11859131, + "step": 8014, + "time_per_iteration": 2.501678466796875 + }, + { + "auxiliary_loss_clip": 0.06448989, + "auxiliary_loss_mlp": 0.0126993, + "balance_loss_clip": 0.06285382, + "balance_loss_mlp": 0.01256149, + "epoch": 0.4818878701337742, + "flos": 12572780636160.0, + "grad_norm": 2.068904383285823, + "language_loss": 0.75191212, + "learning_rate": 2.213988257504722e-06, + "loss": 0.82910132, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13775635, + "step": 8015, + "time_per_iteration": 2.574915885925293 + }, + { + "auxiliary_loss_clip": 0.06450102, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06285062, + "balance_loss_mlp": 0.01254942, + "epoch": 0.48194799338644223, + "flos": 24615481144320.0, + "grad_norm": 2.7940595212226693, + "language_loss": 0.80323374, + "learning_rate": 2.213601027413894e-06, + "loss": 0.88042033, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.13604736, + "step": 8016, + "time_per_iteration": 2.545562744140625 + }, + { + "auxiliary_loss_clip": 0.06441234, + "auxiliary_loss_mlp": 0.01268233, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01255996, + "epoch": 0.4820081166391102, + "flos": 21111482304000.0, + "grad_norm": 1.7856263642868424, + "language_loss": 0.77840865, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.85550332, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12237549, + "step": 8017, + "time_per_iteration": 2.548884153366089 + }, + { + "auxiliary_loss_clip": 0.06442289, + "auxiliary_loss_mlp": 0.01274842, + "balance_loss_clip": 0.06287417, + "balance_loss_mlp": 0.01261729, + "epoch": 0.48206823989177816, + "flos": 25271569013760.0, + "grad_norm": 1.8858588216369734, + "language_loss": 0.80356038, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8807317, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13098145, + "step": 8018, + "time_per_iteration": 2.5485877990722656 + }, + { + "auxiliary_loss_clip": 0.06443836, + "auxiliary_loss_mlp": 0.01268171, + "balance_loss_clip": 0.06283845, + "balance_loss_mlp": 0.01255177, + "epoch": 0.4821283631444461, + "flos": 24652056251520.0, + "grad_norm": 1.8013341989070415, + "language_loss": 0.76402384, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.84114391, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12988281, + "step": 8019, + "time_per_iteration": 2.583380937576294 + }, + { + "auxiliary_loss_clip": 0.06444359, + "auxiliary_loss_mlp": 0.01271658, + "balance_loss_clip": 0.06285813, + "balance_loss_mlp": 0.01258826, + "epoch": 0.4821884863971141, + "flos": 23959015931520.0, + "grad_norm": 1.6800720935629156, + "language_loss": 0.79355383, + "learning_rate": 2.212052026199701e-06, + "loss": 0.87071395, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12841797, + "step": 8020, + "time_per_iteration": 2.531282663345337 + }, + { + "auxiliary_loss_clip": 0.06436829, + "auxiliary_loss_mlp": 0.01270595, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01257655, + "epoch": 0.48224860964978206, + "flos": 17165605357440.0, + "grad_norm": 1.8962985695511603, + "language_loss": 0.70203435, + "learning_rate": 2.211664755756855e-06, + "loss": 0.77910858, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12945557, + "step": 8021, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.06448636, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06284462, + "balance_loss_mlp": 0.01253568, + "epoch": 0.48230873290245, + "flos": 23082513096960.0, + "grad_norm": 1.8444275684859448, + "language_loss": 0.63131356, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.70847559, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.14001465, + "step": 8022, + "time_per_iteration": 2.5153286457061768 + }, + { + "auxiliary_loss_clip": 0.06439438, + "auxiliary_loss_mlp": 0.0127221, + "balance_loss_clip": 0.06284659, + "balance_loss_mlp": 0.01259544, + "epoch": 0.482368856155118, + "flos": 19359440956800.0, + "grad_norm": 2.0552590280374625, + "language_loss": 0.67256629, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.74968272, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8023, + "time_per_iteration": 2.5504207611083984 + }, + { + "auxiliary_loss_clip": 0.06441902, + "auxiliary_loss_mlp": 0.01274331, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01261426, + "epoch": 0.48242897940778595, + "flos": 20084318628480.0, + "grad_norm": 1.5610336564699971, + "language_loss": 0.76933229, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.84649462, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12915039, + "step": 8024, + "time_per_iteration": 2.576347589492798 + }, + { + "auxiliary_loss_clip": 0.06441621, + "auxiliary_loss_mlp": 0.01268624, + "balance_loss_clip": 0.06283119, + "balance_loss_mlp": 0.01255553, + "epoch": 0.4824891026604539, + "flos": 23410682812800.0, + "grad_norm": 1.519749434932375, + "language_loss": 0.75555682, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.83265924, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13067627, + "step": 8025, + "time_per_iteration": 2.559722900390625 + }, + { + "auxiliary_loss_clip": 0.06445173, + "auxiliary_loss_mlp": 0.01270078, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01256536, + "epoch": 0.4825492259131219, + "flos": 20373691104000.0, + "grad_norm": 3.210842824131336, + "language_loss": 0.71099132, + "learning_rate": 2.209728283441112e-06, + "loss": 0.78814387, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13543701, + "step": 8026, + "time_per_iteration": 2.512563943862915 + }, + { + "auxiliary_loss_clip": 0.06450065, + "auxiliary_loss_mlp": 0.0127128, + "balance_loss_clip": 0.06287996, + "balance_loss_mlp": 0.01257094, + "epoch": 0.48260934916578985, + "flos": 14324193077760.0, + "grad_norm": 2.0787728376845385, + "language_loss": 0.74646676, + "learning_rate": 2.209340965060465e-06, + "loss": 0.82368022, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.14190674, + "step": 8027, + "time_per_iteration": 2.523252248764038 + }, + { + "auxiliary_loss_clip": 0.06445143, + "auxiliary_loss_mlp": 0.01269951, + "balance_loss_clip": 0.06285772, + "balance_loss_mlp": 0.01257166, + "epoch": 0.4826694724184578, + "flos": 22126654846080.0, + "grad_norm": 1.6924958309049165, + "language_loss": 0.67414463, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.75129557, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12792969, + "step": 8028, + "time_per_iteration": 2.5118508338928223 + }, + { + "auxiliary_loss_clip": 0.06443746, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06285068, + "balance_loss_mlp": 0.01254926, + "epoch": 0.48272959567112583, + "flos": 16186882141440.0, + "grad_norm": 1.4109383431826554, + "language_loss": 0.73031461, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.80743277, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13134766, + "step": 8029, + "time_per_iteration": 2.513986587524414 + }, + { + "auxiliary_loss_clip": 0.06447576, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0628765, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4827897189237938, + "flos": 23186326705920.0, + "grad_norm": 2.2851559020013994, + "language_loss": 0.84759653, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.92474234, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.13256836, + "step": 8030, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.0644383, + "auxiliary_loss_mlp": 0.0126632, + "balance_loss_clip": 0.06286349, + "balance_loss_mlp": 0.01253374, + "epoch": 0.48284984217646176, + "flos": 21659018808960.0, + "grad_norm": 2.6563677126547858, + "language_loss": 0.73703504, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.81413656, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12945557, + "step": 8031, + "time_per_iteration": 2.523465633392334 + }, + { + "auxiliary_loss_clip": 0.06451262, + "auxiliary_loss_mlp": 0.01268996, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01254548, + "epoch": 0.48290996542912973, + "flos": 31475501314560.0, + "grad_norm": 1.5957405541522132, + "language_loss": 0.71345282, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.79065537, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14434814, + "step": 8032, + "time_per_iteration": 4.084775924682617 + }, + { + "auxiliary_loss_clip": 0.06441716, + "auxiliary_loss_mlp": 0.01271696, + "balance_loss_clip": 0.06285156, + "balance_loss_mlp": 0.01259066, + "epoch": 0.4829700886817977, + "flos": 24468803372160.0, + "grad_norm": 1.3669631944631024, + "language_loss": 0.74361598, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.82075012, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12609863, + "step": 8033, + "time_per_iteration": 2.558655023574829 + }, + { + "auxiliary_loss_clip": 0.06455428, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06290704, + "balance_loss_mlp": 0.01255436, + "epoch": 0.48303021193446566, + "flos": 25709170561920.0, + "grad_norm": 1.5251236339326817, + "language_loss": 0.83579373, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.91304129, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13909912, + "step": 8034, + "time_per_iteration": 4.034566402435303 + }, + { + "auxiliary_loss_clip": 0.06441804, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01255162, + "epoch": 0.4830903351871336, + "flos": 20091613933440.0, + "grad_norm": 1.4995747649605073, + "language_loss": 0.80011666, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.87720799, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12176514, + "step": 8035, + "time_per_iteration": 2.560216188430786 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06284694, + "balance_loss_mlp": 0.01257996, + "epoch": 0.4831504584398016, + "flos": 39460670910720.0, + "grad_norm": 2.4180718513556196, + "language_loss": 0.69735384, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.77451038, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.14385986, + "step": 8036, + "time_per_iteration": 2.676248550415039 + }, + { + "auxiliary_loss_clip": 0.06441773, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01254638, + "epoch": 0.48321058169246955, + "flos": 20012006954880.0, + "grad_norm": 1.964916404489229, + "language_loss": 0.7269727, + "learning_rate": 2.205467347074847e-06, + "loss": 0.80406225, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12536621, + "step": 8037, + "time_per_iteration": 2.5361721515655518 + }, + { + "auxiliary_loss_clip": 0.06449978, + "auxiliary_loss_mlp": 0.01267952, + "balance_loss_clip": 0.06284893, + "balance_loss_mlp": 0.01254594, + "epoch": 0.4832707049451375, + "flos": 20747869511040.0, + "grad_norm": 2.294242093364334, + "language_loss": 0.69135344, + "learning_rate": 2.205079942181525e-06, + "loss": 0.76853275, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13366699, + "step": 8038, + "time_per_iteration": 2.5300488471984863 + }, + { + "auxiliary_loss_clip": 0.06441218, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01253161, + "epoch": 0.4833308281978055, + "flos": 33153889322880.0, + "grad_norm": 1.5080177559172256, + "language_loss": 0.79238868, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8694644, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13201904, + "step": 8039, + "time_per_iteration": 4.106697082519531 + }, + { + "auxiliary_loss_clip": 0.06443603, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06284612, + "balance_loss_mlp": 0.01257221, + "epoch": 0.48339095145047345, + "flos": 19105301923200.0, + "grad_norm": 2.5245127885531926, + "language_loss": 0.78196943, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.85910785, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13024902, + "step": 8040, + "time_per_iteration": 2.51356840133667 + }, + { + "auxiliary_loss_clip": 0.06449578, + "auxiliary_loss_mlp": 0.01268689, + "balance_loss_clip": 0.06287356, + "balance_loss_mlp": 0.01254342, + "epoch": 0.4834510747031414, + "flos": 34468035632640.0, + "grad_norm": 1.5686841461958603, + "language_loss": 0.75648201, + "learning_rate": 2.203917680900409e-06, + "loss": 0.83366466, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14337158, + "step": 8041, + "time_per_iteration": 2.6821110248565674 + }, + { + "auxiliary_loss_clip": 0.06444554, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06290209, + "balance_loss_mlp": 0.01261244, + "epoch": 0.48351119795580944, + "flos": 27388187475840.0, + "grad_norm": 1.655786729526556, + "language_loss": 0.66309774, + "learning_rate": 2.203530244988624e-06, + "loss": 0.74028337, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12756348, + "step": 8042, + "time_per_iteration": 2.587979316711426 + }, + { + "auxiliary_loss_clip": 0.0635567, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06287327, + "balance_loss_mlp": 0.012603, + "epoch": 0.4835713212084774, + "flos": 67162967815680.0, + "grad_norm": 0.683297043643475, + "language_loss": 0.58432257, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.66050708, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02485657, + "step": 8043, + "time_per_iteration": 3.240037441253662 + }, + { + "auxiliary_loss_clip": 0.06448962, + "auxiliary_loss_mlp": 0.01270561, + "balance_loss_clip": 0.06288527, + "balance_loss_mlp": 0.01256548, + "epoch": 0.48363144446114537, + "flos": 17973234535680.0, + "grad_norm": 8.666689726695457, + "language_loss": 0.71932065, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.79651588, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.14013672, + "step": 8044, + "time_per_iteration": 2.557222604751587 + }, + { + "auxiliary_loss_clip": 0.06443186, + "auxiliary_loss_mlp": 0.01271215, + "balance_loss_clip": 0.06287612, + "balance_loss_mlp": 0.01257667, + "epoch": 0.48369156771381333, + "flos": 20599556584320.0, + "grad_norm": 1.2792089170093015, + "language_loss": 0.76084363, + "learning_rate": 2.202367891004714e-06, + "loss": 0.83798766, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13549805, + "step": 8045, + "time_per_iteration": 3.9927117824554443 + }, + { + "auxiliary_loss_clip": 0.06452677, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01255274, + "epoch": 0.4837516909664813, + "flos": 22681780145280.0, + "grad_norm": 1.8159113209886955, + "language_loss": 0.69591677, + "learning_rate": 2.201980424309533e-06, + "loss": 0.77312469, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12854004, + "step": 8046, + "time_per_iteration": 2.563061237335205 + }, + { + "auxiliary_loss_clip": 0.06444287, + "auxiliary_loss_mlp": 0.01272531, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.01259674, + "epoch": 0.48381181421914926, + "flos": 25525414558080.0, + "grad_norm": 1.7918831202662233, + "language_loss": 0.83005214, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.90722024, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.12866211, + "step": 8047, + "time_per_iteration": 2.5624239444732666 + }, + { + "auxiliary_loss_clip": 0.06441472, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06286557, + "balance_loss_mlp": 0.01255522, + "epoch": 0.4838719374718172, + "flos": 24214454703360.0, + "grad_norm": 3.8503425220093273, + "language_loss": 0.8051095, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.88220614, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8048, + "time_per_iteration": 2.5535151958465576 + }, + { + "auxiliary_loss_clip": 0.06450336, + "auxiliary_loss_mlp": 0.01269587, + "balance_loss_clip": 0.06287669, + "balance_loss_mlp": 0.01255091, + "epoch": 0.4839320607244852, + "flos": 26731889971200.0, + "grad_norm": 1.601579819484506, + "language_loss": 0.8118276, + "learning_rate": 2.200817978328054e-06, + "loss": 0.88902682, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14477539, + "step": 8049, + "time_per_iteration": 2.576237440109253 + }, + { + "auxiliary_loss_clip": 0.0644124, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.0628837, + "balance_loss_mlp": 0.01254392, + "epoch": 0.48399218397715316, + "flos": 20455142872320.0, + "grad_norm": 1.6782620987313854, + "language_loss": 0.7275942, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.8046689, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.1184082, + "step": 8050, + "time_per_iteration": 2.5001842975616455 + }, + { + "auxiliary_loss_clip": 0.06348944, + "auxiliary_loss_mlp": 0.01254327, + "balance_loss_clip": 0.06280461, + "balance_loss_mlp": 0.01252052, + "epoch": 0.4840523072298211, + "flos": 67199626776960.0, + "grad_norm": 0.6876828937687306, + "language_loss": 0.56319511, + "learning_rate": 2.200042976240723e-06, + "loss": 0.63922787, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02278137, + "step": 8051, + "time_per_iteration": 3.1732234954833984 + }, + { + "auxiliary_loss_clip": 0.06445932, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06285888, + "balance_loss_mlp": 0.01254806, + "epoch": 0.4841124304824891, + "flos": 22416782008320.0, + "grad_norm": 1.9466323687223244, + "language_loss": 0.75329518, + "learning_rate": 2.199655463811236e-06, + "loss": 0.83042824, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12554932, + "step": 8052, + "time_per_iteration": 2.525742769241333 + }, + { + "auxiliary_loss_clip": 0.06445011, + "auxiliary_loss_mlp": 0.01268398, + "balance_loss_clip": 0.0628748, + "balance_loss_mlp": 0.01255797, + "epoch": 0.48417255373515705, + "flos": 13848926319360.0, + "grad_norm": 9.22847684329053, + "language_loss": 0.65932119, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.73645532, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1260376, + "step": 8053, + "time_per_iteration": 2.508634328842163 + }, + { + "auxiliary_loss_clip": 0.06439514, + "auxiliary_loss_mlp": 0.01270848, + "balance_loss_clip": 0.06286003, + "balance_loss_mlp": 0.01258242, + "epoch": 0.484232676987825, + "flos": 31657747944960.0, + "grad_norm": 1.9001102819500506, + "language_loss": 0.69764733, + "learning_rate": 2.198880416254091e-06, + "loss": 0.77475095, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12609863, + "step": 8054, + "time_per_iteration": 2.6046009063720703 + }, + { + "auxiliary_loss_clip": 0.06439343, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.062842, + "balance_loss_mlp": 0.01253578, + "epoch": 0.48429280024049304, + "flos": 24101878343040.0, + "grad_norm": 1.6288967613161636, + "language_loss": 0.69845426, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.77551031, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12683105, + "step": 8055, + "time_per_iteration": 2.5645036697387695 + }, + { + "auxiliary_loss_clip": 0.06441051, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01257621, + "epoch": 0.484352923493161, + "flos": 17535842622720.0, + "grad_norm": 2.1100630556312256, + "language_loss": 0.63363564, + "learning_rate": 2.198105338530685e-06, + "loss": 0.71074814, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12597656, + "step": 8056, + "time_per_iteration": 2.4887776374816895 + }, + { + "auxiliary_loss_clip": 0.06441829, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06283918, + "balance_loss_mlp": 0.0125639, + "epoch": 0.48441304674582897, + "flos": 29174204453760.0, + "grad_norm": 1.7583270452203597, + "language_loss": 0.67791545, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.75502926, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1315918, + "step": 8057, + "time_per_iteration": 2.6147687435150146 + }, + { + "auxiliary_loss_clip": 0.06438136, + "auxiliary_loss_mlp": 0.01270959, + "balance_loss_clip": 0.06284122, + "balance_loss_mlp": 0.0125933, + "epoch": 0.48447316999849693, + "flos": 15891933369600.0, + "grad_norm": 1.7129310149903716, + "language_loss": 0.81615114, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.89324206, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11627197, + "step": 8058, + "time_per_iteration": 2.499464273452759 + }, + { + "auxiliary_loss_clip": 0.06444308, + "auxiliary_loss_mlp": 0.01272607, + "balance_loss_clip": 0.06283933, + "balance_loss_mlp": 0.01259619, + "epoch": 0.4845332932511649, + "flos": 24386974260480.0, + "grad_norm": 1.694669299967896, + "language_loss": 0.79782939, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.87499857, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12988281, + "step": 8059, + "time_per_iteration": 2.5456764698028564 + }, + { + "auxiliary_loss_clip": 0.06445169, + "auxiliary_loss_mlp": 0.0126972, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01256065, + "epoch": 0.48459341650383286, + "flos": 37124434097280.0, + "grad_norm": 2.171534570518566, + "language_loss": 0.67115712, + "learning_rate": 2.196555093055352e-06, + "loss": 0.74830604, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13659668, + "step": 8060, + "time_per_iteration": 2.639552593231201 + }, + { + "auxiliary_loss_clip": 0.06448266, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.01255404, + "epoch": 0.48465353975650083, + "flos": 22973500535040.0, + "grad_norm": 1.9145476252385885, + "language_loss": 0.67691833, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.75407994, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12506104, + "step": 8061, + "time_per_iteration": 2.636291265487671 + }, + { + "auxiliary_loss_clip": 0.06440581, + "auxiliary_loss_mlp": 0.012731, + "balance_loss_clip": 0.06285343, + "balance_loss_mlp": 0.01259581, + "epoch": 0.4847136630091688, + "flos": 17712680664960.0, + "grad_norm": 1.8103717294603696, + "language_loss": 0.83217871, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.90931553, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13531494, + "step": 8062, + "time_per_iteration": 2.5335779190063477 + }, + { + "auxiliary_loss_clip": 0.06441268, + "auxiliary_loss_mlp": 0.01271147, + "balance_loss_clip": 0.06286018, + "balance_loss_mlp": 0.01259077, + "epoch": 0.48477378626183676, + "flos": 22024853735040.0, + "grad_norm": 1.4198166357723545, + "language_loss": 0.74425852, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.82138264, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1206665, + "step": 8063, + "time_per_iteration": 2.575752019882202 + }, + { + "auxiliary_loss_clip": 0.06438752, + "auxiliary_loss_mlp": 0.01268531, + "balance_loss_clip": 0.06282612, + "balance_loss_mlp": 0.01256276, + "epoch": 0.4848339095145047, + "flos": 27970118881920.0, + "grad_norm": 1.5830553745787852, + "language_loss": 0.79034185, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.86741465, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12255859, + "step": 8064, + "time_per_iteration": 2.601557731628418 + }, + { + "auxiliary_loss_clip": 0.06441826, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06290108, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4848940327671727, + "flos": 21695090791680.0, + "grad_norm": 1.71958305783472, + "language_loss": 0.795892, + "learning_rate": 2.194617118620173e-06, + "loss": 0.87297779, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1194458, + "step": 8065, + "time_per_iteration": 2.5325217247009277 + }, + { + "auxiliary_loss_clip": 0.06434904, + "auxiliary_loss_mlp": 0.0126868, + "balance_loss_clip": 0.06285697, + "balance_loss_mlp": 0.01256813, + "epoch": 0.48495415601984065, + "flos": 20637892627200.0, + "grad_norm": 1.7068711802888106, + "language_loss": 0.76162863, + "learning_rate": 2.194229501534644e-06, + "loss": 0.83866447, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11865234, + "step": 8066, + "time_per_iteration": 2.506598949432373 + }, + { + "auxiliary_loss_clip": 0.06438506, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06285724, + "balance_loss_mlp": 0.01257375, + "epoch": 0.4850142792725086, + "flos": 25634972171520.0, + "grad_norm": 1.302389197624331, + "language_loss": 0.72176784, + "learning_rate": 2.193841877083912e-06, + "loss": 0.79884112, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11444092, + "step": 8067, + "time_per_iteration": 2.5921640396118164 + }, + { + "auxiliary_loss_clip": 0.06438944, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06282091, + "balance_loss_mlp": 0.01255986, + "epoch": 0.4850744025251766, + "flos": 13777075843200.0, + "grad_norm": 2.2825284137915975, + "language_loss": 0.79257572, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.86964703, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12219238, + "step": 8068, + "time_per_iteration": 2.5287444591522217 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280828, + "balance_loss_mlp": 0.012565, + "epoch": 0.4851345257778446, + "flos": 20266691040000.0, + "grad_norm": 1.4034205816126453, + "language_loss": 0.84740359, + "learning_rate": 2.193066606145638e-06, + "loss": 0.92444146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.11901855, + "step": 8069, + "time_per_iteration": 2.548593044281006 + }, + { + "auxiliary_loss_clip": 0.06435016, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01254763, + "epoch": 0.48519464903051257, + "flos": 27097095991680.0, + "grad_norm": 1.771109080244907, + "language_loss": 0.78544027, + "learning_rate": 2.192678959687493e-06, + "loss": 0.86245352, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.11553955, + "step": 8070, + "time_per_iteration": 2.581026315689087 + }, + { + "auxiliary_loss_clip": 0.06432221, + "auxiliary_loss_mlp": 0.01268982, + "balance_loss_clip": 0.06279641, + "balance_loss_mlp": 0.01256239, + "epoch": 0.48525477228318054, + "flos": 17132677902720.0, + "grad_norm": 3.597843949572919, + "language_loss": 0.77929389, + "learning_rate": 2.192291305922943e-06, + "loss": 0.85630596, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12756348, + "step": 8071, + "time_per_iteration": 3.963555335998535 + }, + { + "auxiliary_loss_clip": 0.06438918, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282261, + "balance_loss_mlp": 0.01256777, + "epoch": 0.4853148955358485, + "flos": 28187263537920.0, + "grad_norm": 2.115731418126265, + "language_loss": 0.72008896, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7971788, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13299561, + "step": 8072, + "time_per_iteration": 2.6861536502838135 + }, + { + "auxiliary_loss_clip": 0.06439583, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06282715, + "balance_loss_mlp": 0.01253761, + "epoch": 0.48537501878851647, + "flos": 17499015953280.0, + "grad_norm": 1.8999559951356444, + "language_loss": 0.88288134, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.95994508, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8073, + "time_per_iteration": 2.4814834594726562 + }, + { + "auxiliary_loss_clip": 0.06432822, + "auxiliary_loss_mlp": 0.01269151, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01257731, + "epoch": 0.48543514204118443, + "flos": 28592398828800.0, + "grad_norm": 2.458004055687259, + "language_loss": 0.61317194, + "learning_rate": 2.19112830093786e-06, + "loss": 0.69019163, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11413574, + "step": 8074, + "time_per_iteration": 3.984229326248169 + }, + { + "auxiliary_loss_clip": 0.06435922, + "auxiliary_loss_mlp": 0.01265981, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01254024, + "epoch": 0.4854952652938524, + "flos": 20966355832320.0, + "grad_norm": 1.641968552330247, + "language_loss": 0.73514569, + "learning_rate": 2.19074061809469e-06, + "loss": 0.81216466, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.11962891, + "step": 8075, + "time_per_iteration": 2.5479941368103027 + }, + { + "auxiliary_loss_clip": 0.06429431, + "auxiliary_loss_mlp": 0.01268393, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01256704, + "epoch": 0.48555538854652036, + "flos": 66543344000640.0, + "grad_norm": 1.7202852105657789, + "language_loss": 0.81976241, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.89674067, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11676025, + "step": 8076, + "time_per_iteration": 2.9675233364105225 + }, + { + "auxiliary_loss_clip": 0.06435271, + "auxiliary_loss_mlp": 0.01273017, + "balance_loss_clip": 0.06280246, + "balance_loss_mlp": 0.01259242, + "epoch": 0.4856155117991883, + "flos": 15930520974720.0, + "grad_norm": 1.9409864090603182, + "language_loss": 0.86392474, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.94100761, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13781738, + "step": 8077, + "time_per_iteration": 2.5062685012817383 + }, + { + "auxiliary_loss_clip": 0.06325787, + "auxiliary_loss_mlp": 0.01252172, + "balance_loss_clip": 0.062584, + "balance_loss_mlp": 0.0125022, + "epoch": 0.4856756350518563, + "flos": 71066986848000.0, + "grad_norm": 0.9289783803731909, + "language_loss": 0.58378243, + "learning_rate": 2.189577526226564e-06, + "loss": 0.65956199, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01950073, + "step": 8078, + "time_per_iteration": 4.502991199493408 + }, + { + "auxiliary_loss_clip": 0.06440585, + "auxiliary_loss_mlp": 0.01268963, + "balance_loss_clip": 0.06280588, + "balance_loss_mlp": 0.01255886, + "epoch": 0.48573575830452426, + "flos": 29833478778240.0, + "grad_norm": 2.317528327629363, + "language_loss": 0.72874224, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.80583775, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1307373, + "step": 8079, + "time_per_iteration": 2.5839955806732178 + }, + { + "auxiliary_loss_clip": 0.06440279, + "auxiliary_loss_mlp": 0.01268912, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256925, + "epoch": 0.4857958815571922, + "flos": 17645274455040.0, + "grad_norm": 2.8950752184508843, + "language_loss": 0.80285943, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.87995136, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.11999512, + "step": 8080, + "time_per_iteration": 2.542607307434082 + }, + { + "auxiliary_loss_clip": 0.06436758, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06281016, + "balance_loss_mlp": 0.01252754, + "epoch": 0.4858560048098602, + "flos": 21111817720320.0, + "grad_norm": 1.934060586134842, + "language_loss": 0.84237295, + "learning_rate": 2.188414369659251e-06, + "loss": 0.9193939, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12579346, + "step": 8081, + "time_per_iteration": 2.523787021636963 + }, + { + "auxiliary_loss_clip": 0.06433021, + "auxiliary_loss_mlp": 0.01268596, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4859161280625282, + "flos": 22097375043840.0, + "grad_norm": 1.530246142437005, + "language_loss": 0.83824933, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.91526556, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13464355, + "step": 8082, + "time_per_iteration": 2.562739372253418 + }, + { + "auxiliary_loss_clip": 0.0643435, + "auxiliary_loss_mlp": 0.01268115, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01255849, + "epoch": 0.4859762513151962, + "flos": 17499183661440.0, + "grad_norm": 1.9064651850671037, + "language_loss": 0.87366831, + "learning_rate": 2.187638896199746e-06, + "loss": 0.95069289, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12268066, + "step": 8083, + "time_per_iteration": 2.5062954425811768 + }, + { + "auxiliary_loss_clip": 0.064337, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281679, + "balance_loss_mlp": 0.01255356, + "epoch": 0.48603637456786414, + "flos": 18010061205120.0, + "grad_norm": 1.6184381568123027, + "language_loss": 0.81531483, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.89233649, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.13110352, + "step": 8084, + "time_per_iteration": 3.9548635482788086 + }, + { + "auxiliary_loss_clip": 0.06438272, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01256645, + "epoch": 0.4860964978205321, + "flos": 22498611120000.0, + "grad_norm": 1.8856401579659385, + "language_loss": 0.68814772, + "learning_rate": 2.186863394279098e-06, + "loss": 0.76522183, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12475586, + "step": 8085, + "time_per_iteration": 2.525697708129883 + }, + { + "auxiliary_loss_clip": 0.06434157, + "auxiliary_loss_mlp": 0.01270175, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257158, + "epoch": 0.48615662107320007, + "flos": 23380061345280.0, + "grad_norm": 1.4159205206948002, + "language_loss": 0.77895916, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.85600245, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13024902, + "step": 8086, + "time_per_iteration": 2.5914857387542725 + }, + { + "auxiliary_loss_clip": 0.06433852, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06279776, + "balance_loss_mlp": 0.01253292, + "epoch": 0.48621674432586803, + "flos": 34426722769920.0, + "grad_norm": 1.8125320165569008, + "language_loss": 0.69750226, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.7744993, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12573242, + "step": 8087, + "time_per_iteration": 2.611724615097046 + }, + { + "auxiliary_loss_clip": 0.06440983, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06277409, + "balance_loss_mlp": 0.0125254, + "epoch": 0.486276867578536, + "flos": 33115595207040.0, + "grad_norm": 1.9401027694089865, + "language_loss": 0.73050213, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.80757201, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13482666, + "step": 8088, + "time_per_iteration": 2.6235716342926025 + }, + { + "auxiliary_loss_clip": 0.06434947, + "auxiliary_loss_mlp": 0.01270457, + "balance_loss_clip": 0.06279397, + "balance_loss_mlp": 0.01257982, + "epoch": 0.48633699083120396, + "flos": 21477149521920.0, + "grad_norm": 1.5117477196191362, + "language_loss": 0.75765258, + "learning_rate": 2.185312305524892e-06, + "loss": 0.83470654, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12481689, + "step": 8089, + "time_per_iteration": 2.522033214569092 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01266623, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01254702, + "epoch": 0.48639711408387193, + "flos": 20090565757440.0, + "grad_norm": 2.0719257974800307, + "language_loss": 0.84617764, + "learning_rate": 2.184924515731926e-06, + "loss": 0.92317104, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1192627, + "step": 8090, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01256362, + "epoch": 0.4864572373365399, + "flos": 20785450867200.0, + "grad_norm": 1.460241002220635, + "language_loss": 0.76103806, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.8380006, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11578369, + "step": 8091, + "time_per_iteration": 2.534083127975464 + }, + { + "auxiliary_loss_clip": 0.06434517, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.0627959, + "balance_loss_mlp": 0.01252651, + "epoch": 0.48651736058920786, + "flos": 26031554346240.0, + "grad_norm": 1.4698762569471817, + "language_loss": 0.8086524, + "learning_rate": 2.184148915123631e-06, + "loss": 0.88564396, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.11987305, + "step": 8092, + "time_per_iteration": 2.5732295513153076 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01268235, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01254711, + "epoch": 0.4865774838418758, + "flos": 20491885687680.0, + "grad_norm": 1.359461965274961, + "language_loss": 0.71901554, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.79604697, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13537598, + "step": 8093, + "time_per_iteration": 2.5315988063812256 + }, + { + "auxiliary_loss_clip": 0.06430057, + "auxiliary_loss_mlp": 0.01268667, + "balance_loss_clip": 0.06278083, + "balance_loss_mlp": 0.01256424, + "epoch": 0.4866376070945438, + "flos": 23554048348800.0, + "grad_norm": 1.746145283456106, + "language_loss": 0.68340707, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.76039433, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12237549, + "step": 8094, + "time_per_iteration": 2.5621020793914795 + }, + { + "auxiliary_loss_clip": 0.06439431, + "auxiliary_loss_mlp": 0.01276508, + "balance_loss_clip": 0.06280254, + "balance_loss_mlp": 0.01263502, + "epoch": 0.4866977303472118, + "flos": 16696166457600.0, + "grad_norm": 2.187009986392795, + "language_loss": 0.66443598, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.74159545, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.13006592, + "step": 8095, + "time_per_iteration": 2.4823923110961914 + }, + { + "auxiliary_loss_clip": 0.06436304, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01254444, + "epoch": 0.4867578535998798, + "flos": 17902012965120.0, + "grad_norm": 1.919238290363099, + "language_loss": 0.79046065, + "learning_rate": 2.182597630229345e-06, + "loss": 0.86749196, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12384033, + "step": 8096, + "time_per_iteration": 2.507293701171875 + }, + { + "auxiliary_loss_clip": 0.06432957, + "auxiliary_loss_mlp": 0.01269945, + "balance_loss_clip": 0.06279905, + "balance_loss_mlp": 0.01257154, + "epoch": 0.48681797685254774, + "flos": 22644366497280.0, + "grad_norm": 2.003337305767246, + "language_loss": 0.68162191, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.75865096, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12799072, + "step": 8097, + "time_per_iteration": 2.5473361015319824 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01272073, + "balance_loss_clip": 0.06279548, + "balance_loss_mlp": 0.01259944, + "epoch": 0.4868781001052157, + "flos": 20892283223040.0, + "grad_norm": 1.4401604045572658, + "language_loss": 0.71418583, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.79123378, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12127686, + "step": 8098, + "time_per_iteration": 2.5543363094329834 + }, + { + "auxiliary_loss_clip": 0.06441437, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06279659, + "balance_loss_mlp": 0.01255725, + "epoch": 0.48693822335788367, + "flos": 41984688723840.0, + "grad_norm": 1.4376447542768653, + "language_loss": 0.66435724, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.74146235, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13348389, + "step": 8099, + "time_per_iteration": 2.711822032928467 + }, + { + "auxiliary_loss_clip": 0.0643863, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06281494, + "balance_loss_mlp": 0.01259485, + "epoch": 0.48699834661055164, + "flos": 24250149342720.0, + "grad_norm": 1.5852242434455028, + "language_loss": 0.66993374, + "learning_rate": 2.181046234549138e-06, + "loss": 0.74703825, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12329102, + "step": 8100, + "time_per_iteration": 2.5218353271484375 + }, + { + "auxiliary_loss_clip": 0.0643635, + "auxiliary_loss_mlp": 0.0127283, + "balance_loss_clip": 0.06283123, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4870584698632196, + "flos": 25931388389760.0, + "grad_norm": 1.294146562327305, + "language_loss": 0.76505142, + "learning_rate": 2.180658368429088e-06, + "loss": 0.84214324, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12664795, + "step": 8101, + "time_per_iteration": 2.645095109939575 + }, + { + "auxiliary_loss_clip": 0.06345028, + "auxiliary_loss_mlp": 0.01254744, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.01252564, + "epoch": 0.48711859311588757, + "flos": 70232006511360.0, + "grad_norm": 0.6692636412141889, + "language_loss": 0.5212009, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.59719861, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02183533, + "step": 8102, + "time_per_iteration": 3.2782585620880127 + }, + { + "auxiliary_loss_clip": 0.06439511, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01253523, + "epoch": 0.48717871636855553, + "flos": 12346831301760.0, + "grad_norm": 2.023585148758525, + "language_loss": 0.7395249, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.81658924, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13415527, + "step": 8103, + "time_per_iteration": 2.5020487308502197 + }, + { + "auxiliary_loss_clip": 0.06437068, + "auxiliary_loss_mlp": 0.01271054, + "balance_loss_clip": 0.06280553, + "balance_loss_mlp": 0.01257059, + "epoch": 0.4872388396212235, + "flos": 23483874954240.0, + "grad_norm": 1.425095223977108, + "language_loss": 0.6284436, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.70552492, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13989258, + "step": 8104, + "time_per_iteration": 2.5457305908203125 + }, + { + "auxiliary_loss_clip": 0.06436496, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281868, + "balance_loss_mlp": 0.01255759, + "epoch": 0.48729896287389146, + "flos": 31435068919680.0, + "grad_norm": 2.8385892248494575, + "language_loss": 0.69637764, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.77343059, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13043213, + "step": 8105, + "time_per_iteration": 2.6453042030334473 + }, + { + "auxiliary_loss_clip": 0.0643308, + "auxiliary_loss_mlp": 0.01270898, + "balance_loss_clip": 0.06279837, + "balance_loss_mlp": 0.01258464, + "epoch": 0.4873590861265594, + "flos": 19063192446720.0, + "grad_norm": 1.510355754545757, + "language_loss": 0.73659271, + "learning_rate": 2.178718935364259e-06, + "loss": 0.81363249, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12451172, + "step": 8106, + "time_per_iteration": 2.4909706115722656 + }, + { + "auxiliary_loss_clip": 0.0644394, + "auxiliary_loss_mlp": 0.01272973, + "balance_loss_clip": 0.06283985, + "balance_loss_mlp": 0.01258888, + "epoch": 0.4874192093792274, + "flos": 24354424149120.0, + "grad_norm": 1.669305756095907, + "language_loss": 0.77040148, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.84757066, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14080811, + "step": 8107, + "time_per_iteration": 2.5784239768981934 + }, + { + "auxiliary_loss_clip": 0.06432547, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01254971, + "epoch": 0.4874793326318954, + "flos": 23119339766400.0, + "grad_norm": 3.7362093355788857, + "language_loss": 0.75508547, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.83207899, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.1182251, + "step": 8108, + "time_per_iteration": 2.51676607131958 + }, + { + "auxiliary_loss_clip": 0.06434841, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06281565, + "balance_loss_mlp": 0.01257522, + "epoch": 0.4875394558845634, + "flos": 19032193635840.0, + "grad_norm": 1.6826296910838767, + "language_loss": 0.73853874, + "learning_rate": 2.177555194083212e-06, + "loss": 0.81557322, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11090088, + "step": 8109, + "time_per_iteration": 2.594315767288208 + }, + { + "auxiliary_loss_clip": 0.06429494, + "auxiliary_loss_mlp": 0.01265982, + "balance_loss_clip": 0.0628022, + "balance_loss_mlp": 0.01253853, + "epoch": 0.48759957913723134, + "flos": 21439945509120.0, + "grad_norm": 1.7035668673577407, + "language_loss": 0.78900838, + "learning_rate": 2.177167266837428e-06, + "loss": 0.86596316, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12121582, + "step": 8110, + "time_per_iteration": 2.517711639404297 + }, + { + "auxiliary_loss_clip": 0.06435961, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_clip": 0.06281072, + "balance_loss_mlp": 0.01259265, + "epoch": 0.4876597023898993, + "flos": 17754412798080.0, + "grad_norm": 2.2958034596154238, + "language_loss": 0.72586286, + "learning_rate": 2.176779332873444e-06, + "loss": 0.80293739, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12231445, + "step": 8111, + "time_per_iteration": 3.939528465270996 + }, + { + "auxiliary_loss_clip": 0.06434079, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.06283166, + "balance_loss_mlp": 0.01257947, + "epoch": 0.4877198256425673, + "flos": 17025384349440.0, + "grad_norm": 1.699620610729742, + "language_loss": 0.76073879, + "learning_rate": 2.17639139220597e-06, + "loss": 0.83778763, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.128479, + "step": 8112, + "time_per_iteration": 2.614734172821045 + }, + { + "auxiliary_loss_clip": 0.06443445, + "auxiliary_loss_mlp": 0.01270845, + "balance_loss_clip": 0.06281452, + "balance_loss_mlp": 0.01257445, + "epoch": 0.48777994889523524, + "flos": 22390898296320.0, + "grad_norm": 1.829058055025175, + "language_loss": 0.756136, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.83327889, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13397217, + "step": 8113, + "time_per_iteration": 3.978013277053833 + }, + { + "auxiliary_loss_clip": 0.0633374, + "auxiliary_loss_mlp": 0.01252792, + "balance_loss_clip": 0.06267424, + "balance_loss_mlp": 0.0125078, + "epoch": 0.4878400721479032, + "flos": 61261237664640.0, + "grad_norm": 0.785084950627043, + "language_loss": 0.48805469, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.56391996, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02011108, + "step": 8114, + "time_per_iteration": 3.0476014614105225 + }, + { + "auxiliary_loss_clip": 0.06435857, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06280373, + "balance_loss_mlp": 0.01255507, + "epoch": 0.48790019540057117, + "flos": 24543756449280.0, + "grad_norm": 1.6081028897323706, + "language_loss": 0.77215505, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.84920216, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13348389, + "step": 8115, + "time_per_iteration": 2.615709066390991 + }, + { + "auxiliary_loss_clip": 0.06438144, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06279679, + "balance_loss_mlp": 0.01256858, + "epoch": 0.48796031865323913, + "flos": 21840175336320.0, + "grad_norm": 1.938320357328723, + "language_loss": 0.72471654, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.80180264, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13586426, + "step": 8116, + "time_per_iteration": 2.502880573272705 + }, + { + "auxiliary_loss_clip": 0.06428684, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06277922, + "balance_loss_mlp": 0.0125349, + "epoch": 0.4880204419059071, + "flos": 18594969431040.0, + "grad_norm": 1.5984683769851484, + "language_loss": 0.63217908, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.70912814, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12750244, + "step": 8117, + "time_per_iteration": 2.5082454681396484 + }, + { + "auxiliary_loss_clip": 0.06432296, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06278604, + "balance_loss_mlp": 0.0125558, + "epoch": 0.48808056515857506, + "flos": 19178242502400.0, + "grad_norm": 1.8182073979213524, + "language_loss": 0.79733717, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.87434226, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1262207, + "step": 8118, + "time_per_iteration": 3.925899028778076 + }, + { + "auxiliary_loss_clip": 0.06436172, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06280739, + "balance_loss_mlp": 0.01254669, + "epoch": 0.48814068841124303, + "flos": 20126679667200.0, + "grad_norm": 1.6934286727955359, + "language_loss": 0.63701898, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.71405882, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.13134766, + "step": 8119, + "time_per_iteration": 2.575894832611084 + }, + { + "auxiliary_loss_clip": 0.06432833, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01254698, + "epoch": 0.488200811663911, + "flos": 22972116942720.0, + "grad_norm": 1.6464989706708673, + "language_loss": 0.72632396, + "learning_rate": 2.173287627305878e-06, + "loss": 0.80332661, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12744141, + "step": 8120, + "time_per_iteration": 2.5209426879882812 + }, + { + "auxiliary_loss_clip": 0.06438597, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06279586, + "balance_loss_mlp": 0.01255297, + "epoch": 0.48826093491657896, + "flos": 33918947827200.0, + "grad_norm": 1.7374615150704595, + "language_loss": 0.63695973, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.71403223, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13336182, + "step": 8121, + "time_per_iteration": 2.619035005569458 + }, + { + "auxiliary_loss_clip": 0.0644285, + "auxiliary_loss_mlp": 0.01267435, + "balance_loss_clip": 0.06282102, + "balance_loss_mlp": 0.01253643, + "epoch": 0.488321058169247, + "flos": 23076056332800.0, + "grad_norm": 1.857577186148328, + "language_loss": 0.82684505, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.90394789, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.13800049, + "step": 8122, + "time_per_iteration": 2.5246660709381104 + }, + { + "auxiliary_loss_clip": 0.06440943, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06282523, + "balance_loss_mlp": 0.01255397, + "epoch": 0.48838118142191494, + "flos": 19323746317440.0, + "grad_norm": 1.8250600769951077, + "language_loss": 0.85500193, + "learning_rate": 2.172123606640866e-06, + "loss": 0.93209612, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13085938, + "step": 8123, + "time_per_iteration": 2.5317881107330322 + }, + { + "auxiliary_loss_clip": 0.06441107, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.0125934, + "epoch": 0.4884413046745829, + "flos": 25417701734400.0, + "grad_norm": 1.3930130047769251, + "language_loss": 0.85569358, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.93283355, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13549805, + "step": 8124, + "time_per_iteration": 4.062820196151733 + }, + { + "auxiliary_loss_clip": 0.0644336, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06285739, + "balance_loss_mlp": 0.01253769, + "epoch": 0.4885014279272509, + "flos": 20997103080960.0, + "grad_norm": 2.2053414232015363, + "language_loss": 0.80210352, + "learning_rate": 2.171347560204948e-06, + "loss": 0.87920684, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13201904, + "step": 8125, + "time_per_iteration": 2.5117287635803223 + }, + { + "auxiliary_loss_clip": 0.06437683, + "auxiliary_loss_mlp": 0.01269334, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01255976, + "epoch": 0.48856155117991884, + "flos": 13776656572800.0, + "grad_norm": 2.5222320452086016, + "language_loss": 0.72852308, + "learning_rate": 2.170959527233356e-06, + "loss": 0.80559325, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13348389, + "step": 8126, + "time_per_iteration": 2.5177037715911865 + }, + { + "auxiliary_loss_clip": 0.06445107, + "auxiliary_loss_mlp": 0.01269465, + "balance_loss_clip": 0.06285033, + "balance_loss_mlp": 0.01256113, + "epoch": 0.4886216744325868, + "flos": 32095936471680.0, + "grad_norm": 1.5739512034612657, + "language_loss": 0.68640763, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.76355338, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13372803, + "step": 8127, + "time_per_iteration": 2.606557846069336 + }, + { + "auxiliary_loss_clip": 0.06442467, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06283246, + "balance_loss_mlp": 0.01254972, + "epoch": 0.48868179768525477, + "flos": 19616221393920.0, + "grad_norm": 1.6528567440124056, + "language_loss": 0.7688967, + "learning_rate": 2.170183441856481e-06, + "loss": 0.84600174, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13085938, + "step": 8128, + "time_per_iteration": 2.564112901687622 + }, + { + "auxiliary_loss_clip": 0.06448022, + "auxiliary_loss_mlp": 0.01274106, + "balance_loss_clip": 0.06289175, + "balance_loss_mlp": 0.01260653, + "epoch": 0.48874192093792274, + "flos": 21293100028800.0, + "grad_norm": 1.6046032409788031, + "language_loss": 0.76479989, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.84202117, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13452148, + "step": 8129, + "time_per_iteration": 2.5374317169189453 + }, + { + "auxiliary_loss_clip": 0.06444047, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06286857, + "balance_loss_mlp": 0.01252944, + "epoch": 0.4888020441905907, + "flos": 14178647335680.0, + "grad_norm": 2.0974560904884867, + "language_loss": 0.65812773, + "learning_rate": 2.169407330666114e-06, + "loss": 0.735232, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13452148, + "step": 8130, + "time_per_iteration": 2.5409111976623535 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01269301, + "balance_loss_clip": 0.06286357, + "balance_loss_mlp": 0.01256528, + "epoch": 0.48886216744325867, + "flos": 24104813235840.0, + "grad_norm": 1.7915788803825166, + "language_loss": 0.72896582, + "learning_rate": 2.169019265427658e-06, + "loss": 0.80606037, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12768555, + "step": 8131, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.06451105, + "auxiliary_loss_mlp": 0.01270383, + "balance_loss_clip": 0.06289683, + "balance_loss_mlp": 0.01256811, + "epoch": 0.48892229069592663, + "flos": 38439838218240.0, + "grad_norm": 1.2588039875779695, + "language_loss": 0.69597721, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.77319217, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13586426, + "step": 8132, + "time_per_iteration": 2.70053768157959 + }, + { + "auxiliary_loss_clip": 0.06438366, + "auxiliary_loss_mlp": 0.01270585, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01257436, + "epoch": 0.4889824139485946, + "flos": 23850338785920.0, + "grad_norm": 2.3033814193981454, + "language_loss": 0.70031691, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.77740639, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13146973, + "step": 8133, + "time_per_iteration": 2.5559158325195312 + }, + { + "auxiliary_loss_clip": 0.06443258, + "auxiliary_loss_mlp": 0.01270512, + "balance_loss_clip": 0.0629006, + "balance_loss_mlp": 0.01257548, + "epoch": 0.48904253720126256, + "flos": 24432731389440.0, + "grad_norm": 1.67073327790382, + "language_loss": 0.71227533, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.78941303, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12957764, + "step": 8134, + "time_per_iteration": 2.5545125007629395 + }, + { + "auxiliary_loss_clip": 0.06444804, + "auxiliary_loss_mlp": 0.01271014, + "balance_loss_clip": 0.06283658, + "balance_loss_mlp": 0.01257055, + "epoch": 0.4891026604539306, + "flos": 24177586106880.0, + "grad_norm": 1.7998075455300961, + "language_loss": 0.80179673, + "learning_rate": 2.167466940528718e-06, + "loss": 0.87895489, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13977051, + "step": 8135, + "time_per_iteration": 2.54832124710083 + }, + { + "auxiliary_loss_clip": 0.06439205, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.06284894, + "balance_loss_mlp": 0.01255004, + "epoch": 0.48916278370659855, + "flos": 21477443011200.0, + "grad_norm": 1.5753098834035062, + "language_loss": 0.74565232, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.82271659, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12213135, + "step": 8136, + "time_per_iteration": 2.5225162506103516 + }, + { + "auxiliary_loss_clip": 0.06440099, + "auxiliary_loss_mlp": 0.01265964, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01253519, + "epoch": 0.4892229069592665, + "flos": 22316322562560.0, + "grad_norm": 1.5544220345156794, + "language_loss": 0.73698246, + "learning_rate": 2.166690739918204e-06, + "loss": 0.81404305, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12451172, + "step": 8137, + "time_per_iteration": 2.5138792991638184 + }, + { + "auxiliary_loss_clip": 0.06443799, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06287944, + "balance_loss_mlp": 0.01257673, + "epoch": 0.4892830302119345, + "flos": 12791812008960.0, + "grad_norm": 2.1813813764641448, + "language_loss": 0.75360358, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.83074719, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12890625, + "step": 8138, + "time_per_iteration": 2.52406644821167 + }, + { + "auxiliary_loss_clip": 0.06443107, + "auxiliary_loss_mlp": 0.01267703, + "balance_loss_clip": 0.06287149, + "balance_loss_mlp": 0.01255192, + "epoch": 0.48934315346460244, + "flos": 20820223111680.0, + "grad_norm": 1.5609881437350468, + "language_loss": 0.74361938, + "learning_rate": 2.165914514023972e-06, + "loss": 0.82072747, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12518311, + "step": 8139, + "time_per_iteration": 2.5139529705047607 + }, + { + "auxiliary_loss_clip": 0.0643822, + "auxiliary_loss_mlp": 0.01266126, + "balance_loss_clip": 0.06281914, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4894032767172704, + "flos": 19761641354880.0, + "grad_norm": 2.1585110635090388, + "language_loss": 0.62118167, + "learning_rate": 2.165526391632255e-06, + "loss": 0.69822514, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8140, + "time_per_iteration": 2.5321638584136963 + }, + { + "auxiliary_loss_clip": 0.06444136, + "auxiliary_loss_mlp": 0.01271459, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257506, + "epoch": 0.4894633999699384, + "flos": 17824292703360.0, + "grad_norm": 1.8580247423308633, + "language_loss": 0.82388717, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.90104312, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13946533, + "step": 8141, + "time_per_iteration": 2.4724786281585693 + }, + { + "auxiliary_loss_clip": 0.06448226, + "auxiliary_loss_mlp": 0.01272495, + "balance_loss_clip": 0.06290399, + "balance_loss_mlp": 0.01258279, + "epoch": 0.48952352322260634, + "flos": 25530781219200.0, + "grad_norm": 1.6913372633538968, + "language_loss": 0.72726512, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.80447233, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.14208984, + "step": 8142, + "time_per_iteration": 2.5858702659606934 + }, + { + "auxiliary_loss_clip": 0.06437673, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.062835, + "balance_loss_mlp": 0.01257624, + "epoch": 0.4895836464752743, + "flos": 29062508561280.0, + "grad_norm": 1.575435552323968, + "language_loss": 0.6727252, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.74980688, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12860107, + "step": 8143, + "time_per_iteration": 2.576084613800049 + }, + { + "auxiliary_loss_clip": 0.06441937, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06288718, + "balance_loss_mlp": 0.01254678, + "epoch": 0.48964376972794227, + "flos": 33555335034240.0, + "grad_norm": 1.550815752793646, + "language_loss": 0.75150239, + "learning_rate": 2.163973839444793e-06, + "loss": 0.82859099, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12243652, + "step": 8144, + "time_per_iteration": 2.641314744949341 + }, + { + "auxiliary_loss_clip": 0.06442292, + "auxiliary_loss_mlp": 0.01272411, + "balance_loss_clip": 0.06287357, + "balance_loss_mlp": 0.01259089, + "epoch": 0.48970389298061023, + "flos": 22060506447360.0, + "grad_norm": 1.55007225141579, + "language_loss": 0.75850821, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.83565521, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13311768, + "step": 8145, + "time_per_iteration": 2.5283498764038086 + }, + { + "auxiliary_loss_clip": 0.0644419, + "auxiliary_loss_mlp": 0.0126844, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4897640162332782, + "flos": 20090523830400.0, + "grad_norm": 1.8073715924768365, + "language_loss": 0.8057586, + "learning_rate": 2.163197525984761e-06, + "loss": 0.88288498, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13861084, + "step": 8146, + "time_per_iteration": 2.5433614253997803 + }, + { + "auxiliary_loss_clip": 0.06439323, + "auxiliary_loss_mlp": 0.01272664, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01260737, + "epoch": 0.48982413948594616, + "flos": 23813134773120.0, + "grad_norm": 1.5096911604618644, + "language_loss": 0.74847698, + "learning_rate": 2.162809359964687e-06, + "loss": 0.82559681, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11920166, + "step": 8147, + "time_per_iteration": 2.5623743534088135 + }, + { + "auxiliary_loss_clip": 0.06440282, + "auxiliary_loss_mlp": 0.01269967, + "balance_loss_clip": 0.06287088, + "balance_loss_mlp": 0.01256615, + "epoch": 0.4898842627386142, + "flos": 17645442163200.0, + "grad_norm": 1.9926710345073115, + "language_loss": 0.82984591, + "learning_rate": 2.162421187770864e-06, + "loss": 0.90694839, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.13360596, + "step": 8148, + "time_per_iteration": 2.5547962188720703 + }, + { + "auxiliary_loss_clip": 0.0644103, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.0629115, + "balance_loss_mlp": 0.01255363, + "epoch": 0.48994438599128215, + "flos": 16623519367680.0, + "grad_norm": 2.084842951303776, + "language_loss": 0.74672109, + "learning_rate": 2.162033009418015e-06, + "loss": 0.82380313, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11810303, + "step": 8149, + "time_per_iteration": 2.533867120742798 + }, + { + "auxiliary_loss_clip": 0.06448293, + "auxiliary_loss_mlp": 0.01270293, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.01256507, + "epoch": 0.4900045092439501, + "flos": 26622080795520.0, + "grad_norm": 1.692853589800977, + "language_loss": 0.76331913, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.840505, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13787842, + "step": 8150, + "time_per_iteration": 3.964707374572754 + }, + { + "auxiliary_loss_clip": 0.06450059, + "auxiliary_loss_mlp": 0.01271131, + "balance_loss_clip": 0.06294075, + "balance_loss_mlp": 0.01257833, + "epoch": 0.4900646324966181, + "flos": 19908361054080.0, + "grad_norm": 2.244817701974514, + "language_loss": 0.72999722, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.80720913, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13299561, + "step": 8151, + "time_per_iteration": 2.5549871921539307 + }, + { + "auxiliary_loss_clip": 0.06359711, + "auxiliary_loss_mlp": 0.01259283, + "balance_loss_clip": 0.06292651, + "balance_loss_mlp": 0.01257264, + "epoch": 0.49012475574928605, + "flos": 59207245729920.0, + "grad_norm": 0.8143029783085558, + "language_loss": 0.54076481, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.6169548, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02018738, + "step": 8152, + "time_per_iteration": 3.1047332286834717 + }, + { + "auxiliary_loss_clip": 0.06453663, + "auxiliary_loss_mlp": 0.01270304, + "balance_loss_clip": 0.06293964, + "balance_loss_mlp": 0.01257018, + "epoch": 0.490184879001954, + "flos": 45270285096960.0, + "grad_norm": 1.7665437022978014, + "language_loss": 0.6121304, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.68937004, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.13293457, + "step": 8153, + "time_per_iteration": 4.15813422203064 + }, + { + "auxiliary_loss_clip": 0.06445354, + "auxiliary_loss_mlp": 0.01267264, + "balance_loss_clip": 0.06291656, + "balance_loss_mlp": 0.01254074, + "epoch": 0.490245002254622, + "flos": 28009754663040.0, + "grad_norm": 1.583608688205754, + "language_loss": 0.76979434, + "learning_rate": 2.160092025783549e-06, + "loss": 0.84692061, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.13195801, + "step": 8154, + "time_per_iteration": 2.5994982719421387 + }, + { + "auxiliary_loss_clip": 0.06359019, + "auxiliary_loss_mlp": 0.01255517, + "balance_loss_clip": 0.06291451, + "balance_loss_mlp": 0.01253472, + "epoch": 0.49030512550728994, + "flos": 58971764229120.0, + "grad_norm": 1.0610708177187165, + "language_loss": 0.669397, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.74554235, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.02046204, + "step": 8155, + "time_per_iteration": 3.2433578968048096 + }, + { + "auxiliary_loss_clip": 0.06448951, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06294696, + "balance_loss_mlp": 0.0125743, + "epoch": 0.4903652487599579, + "flos": 19797922972800.0, + "grad_norm": 1.7256067083752205, + "language_loss": 0.77014565, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.84733009, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12072754, + "step": 8156, + "time_per_iteration": 2.5398688316345215 + }, + { + "auxiliary_loss_clip": 0.06449247, + "auxiliary_loss_mlp": 0.01273385, + "balance_loss_clip": 0.06294699, + "balance_loss_mlp": 0.01259384, + "epoch": 0.49042537201262587, + "flos": 21768492568320.0, + "grad_norm": 1.9286441434498818, + "language_loss": 0.84019762, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.91742396, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.14007568, + "step": 8157, + "time_per_iteration": 2.5673582553863525 + }, + { + "auxiliary_loss_clip": 0.06449863, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06293592, + "balance_loss_mlp": 0.01253701, + "epoch": 0.49048549526529384, + "flos": 18959043421440.0, + "grad_norm": 1.7147218979138201, + "language_loss": 0.79903084, + "learning_rate": 2.158539129514956e-06, + "loss": 0.87619579, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12927246, + "step": 8158, + "time_per_iteration": 3.982774496078491 + }, + { + "auxiliary_loss_clip": 0.0645184, + "auxiliary_loss_mlp": 0.01273348, + "balance_loss_clip": 0.06292954, + "balance_loss_mlp": 0.01259615, + "epoch": 0.4905456185179618, + "flos": 26913633477120.0, + "grad_norm": 1.6654114756309404, + "language_loss": 0.69551659, + "learning_rate": 2.158150890381454e-06, + "loss": 0.77276844, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1373291, + "step": 8159, + "time_per_iteration": 2.6114954948425293 + }, + { + "auxiliary_loss_clip": 0.06446424, + "auxiliary_loss_mlp": 0.01266602, + "balance_loss_clip": 0.06292199, + "balance_loss_mlp": 0.01253591, + "epoch": 0.49060574177062977, + "flos": 20418567765120.0, + "grad_norm": 1.7624184717579066, + "language_loss": 0.73495585, + "learning_rate": 2.157762645250854e-06, + "loss": 0.81208611, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13006592, + "step": 8160, + "time_per_iteration": 2.5310287475585938 + }, + { + "auxiliary_loss_clip": 0.06446327, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06286773, + "balance_loss_mlp": 0.01254718, + "epoch": 0.4906658650232978, + "flos": 17499477150720.0, + "grad_norm": 1.9303786573731354, + "language_loss": 0.71921647, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.79636657, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13952637, + "step": 8161, + "time_per_iteration": 2.548387050628662 + }, + { + "auxiliary_loss_clip": 0.06438495, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06285487, + "balance_loss_mlp": 0.01257102, + "epoch": 0.49072598827596575, + "flos": 26621619598080.0, + "grad_norm": 1.7423183419157489, + "language_loss": 0.68838918, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.76547247, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12738037, + "step": 8162, + "time_per_iteration": 2.5565345287323 + }, + { + "auxiliary_loss_clip": 0.06445014, + "auxiliary_loss_mlp": 0.01271543, + "balance_loss_clip": 0.06284854, + "balance_loss_mlp": 0.01258048, + "epoch": 0.4907861115286337, + "flos": 20418861254400.0, + "grad_norm": 1.5998221011516633, + "language_loss": 0.6369257, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.7140913, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1350708, + "step": 8163, + "time_per_iteration": 2.545926094055176 + }, + { + "auxiliary_loss_clip": 0.0643242, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01260471, + "epoch": 0.4908462347813017, + "flos": 14069508992640.0, + "grad_norm": 1.9421890992027433, + "language_loss": 0.77104688, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.84810019, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12438965, + "step": 8164, + "time_per_iteration": 3.93280029296875 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01271936, + "balance_loss_clip": 0.06285694, + "balance_loss_mlp": 0.01258382, + "epoch": 0.49090635803396965, + "flos": 18741227932800.0, + "grad_norm": 1.56961735096587, + "language_loss": 0.77229172, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.84944236, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.13562012, + "step": 8165, + "time_per_iteration": 2.493861436843872 + }, + { + "auxiliary_loss_clip": 0.06434909, + "auxiliary_loss_mlp": 0.01271922, + "balance_loss_clip": 0.06283913, + "balance_loss_mlp": 0.01258922, + "epoch": 0.4909664812866376, + "flos": 20564784339840.0, + "grad_norm": 2.2518376482371862, + "language_loss": 0.77749753, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.85456586, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.13006592, + "step": 8166, + "time_per_iteration": 2.578685760498047 + }, + { + "auxiliary_loss_clip": 0.06343444, + "auxiliary_loss_mlp": 0.01254597, + "balance_loss_clip": 0.06276363, + "balance_loss_mlp": 0.01252508, + "epoch": 0.4910266045393056, + "flos": 54704006622720.0, + "grad_norm": 0.7970989298383858, + "language_loss": 0.54202092, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.61800134, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02090454, + "step": 8167, + "time_per_iteration": 3.1805777549743652 + }, + { + "auxiliary_loss_clip": 0.06435132, + "auxiliary_loss_mlp": 0.01271015, + "balance_loss_clip": 0.06282446, + "balance_loss_mlp": 0.01257902, + "epoch": 0.49108672779197354, + "flos": 16250892261120.0, + "grad_norm": 1.7548504171286585, + "language_loss": 0.86375958, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.94082105, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13110352, + "step": 8168, + "time_per_iteration": 2.5346431732177734 + }, + { + "auxiliary_loss_clip": 0.06439523, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06287682, + "balance_loss_mlp": 0.01257667, + "epoch": 0.4911468510446415, + "flos": 19831018135680.0, + "grad_norm": 1.6618595444085258, + "language_loss": 0.73708379, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.81418014, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12438965, + "step": 8169, + "time_per_iteration": 2.519845962524414 + }, + { + "auxiliary_loss_clip": 0.06435073, + "auxiliary_loss_mlp": 0.01267032, + "balance_loss_clip": 0.06282359, + "balance_loss_mlp": 0.01254795, + "epoch": 0.4912069742973095, + "flos": 21218650076160.0, + "grad_norm": 1.7105636772686297, + "language_loss": 0.78364748, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.86066854, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12237549, + "step": 8170, + "time_per_iteration": 2.5751500129699707 + }, + { + "auxiliary_loss_clip": 0.06441889, + "auxiliary_loss_mlp": 0.01268553, + "balance_loss_clip": 0.06285594, + "balance_loss_mlp": 0.01255547, + "epoch": 0.49126709754997744, + "flos": 19543280814720.0, + "grad_norm": 2.6389457816540527, + "language_loss": 0.76311809, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.84022248, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8171, + "time_per_iteration": 2.5004677772521973 + }, + { + "auxiliary_loss_clip": 0.06443939, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06285004, + "balance_loss_mlp": 0.01255947, + "epoch": 0.4913272208026454, + "flos": 12244568993280.0, + "grad_norm": 2.2552468133898684, + "language_loss": 0.81709123, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.89421463, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12457275, + "step": 8172, + "time_per_iteration": 2.5347814559936523 + }, + { + "auxiliary_loss_clip": 0.06338271, + "auxiliary_loss_mlp": 0.01256316, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.0125441, + "epoch": 0.49138734405531337, + "flos": 65484663661440.0, + "grad_norm": 0.6802144154671269, + "language_loss": 0.5333854, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.60933125, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01902771, + "step": 8173, + "time_per_iteration": 3.1376869678497314 + }, + { + "auxiliary_loss_clip": 0.06444144, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.0628697, + "balance_loss_mlp": 0.01253663, + "epoch": 0.4914474673079814, + "flos": 18444434371200.0, + "grad_norm": 1.9185770389222636, + "language_loss": 0.6246022, + "learning_rate": 2.152326591972107e-06, + "loss": 0.70171648, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1361084, + "step": 8174, + "time_per_iteration": 2.5815811157226562 + }, + { + "auxiliary_loss_clip": 0.06439996, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.0628511, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49150759056064935, + "flos": 21690772306560.0, + "grad_norm": 2.0568306898238045, + "language_loss": 0.69594127, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.77307451, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1282959, + "step": 8175, + "time_per_iteration": 2.5219566822052 + }, + { + "auxiliary_loss_clip": 0.06442218, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06288453, + "balance_loss_mlp": 0.01255021, + "epoch": 0.4915677138133173, + "flos": 22388969652480.0, + "grad_norm": 1.5433299767806794, + "language_loss": 0.74403, + "learning_rate": 2.151549919570068e-06, + "loss": 0.82113051, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12805176, + "step": 8176, + "time_per_iteration": 2.5598292350769043 + }, + { + "auxiliary_loss_clip": 0.0643885, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.0628263, + "balance_loss_mlp": 0.01259977, + "epoch": 0.4916278370659853, + "flos": 18408320461440.0, + "grad_norm": 1.8239688366126487, + "language_loss": 0.70529395, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.78241211, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12988281, + "step": 8177, + "time_per_iteration": 2.5329604148864746 + }, + { + "auxiliary_loss_clip": 0.06340313, + "auxiliary_loss_mlp": 0.01256045, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01253889, + "epoch": 0.49168796031865325, + "flos": 66630147701760.0, + "grad_norm": 0.6656640602529083, + "language_loss": 0.46068031, + "learning_rate": 2.150773224180877e-06, + "loss": 0.53664386, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02159119, + "step": 8178, + "time_per_iteration": 3.170982837677002 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01259597, + "epoch": 0.4917480835713212, + "flos": 20965601145600.0, + "grad_norm": 2.2617000627187407, + "language_loss": 0.6597743, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.73695886, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13299561, + "step": 8179, + "time_per_iteration": 2.5594394207000732 + }, + { + "auxiliary_loss_clip": 0.06447062, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.06285466, + "balance_loss_mlp": 0.01254088, + "epoch": 0.4918082068239892, + "flos": 15777386438400.0, + "grad_norm": 2.2633588866978442, + "language_loss": 0.70069337, + "learning_rate": 2.149996505922343e-06, + "loss": 0.77783871, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1338501, + "step": 8180, + "time_per_iteration": 2.489649772644043 + }, + { + "auxiliary_loss_clip": 0.0643749, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.06285596, + "balance_loss_mlp": 0.01254406, + "epoch": 0.49186833007665715, + "flos": 24611162659200.0, + "grad_norm": 1.7052643417851399, + "language_loss": 0.84654552, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.92359537, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.13098145, + "step": 8181, + "time_per_iteration": 2.570831298828125 + }, + { + "auxiliary_loss_clip": 0.06432545, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01259843, + "epoch": 0.4919284533293251, + "flos": 22097039627520.0, + "grad_norm": 1.9771399001803804, + "language_loss": 0.73092818, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.80796945, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8182, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.06435409, + "auxiliary_loss_mlp": 0.01272985, + "balance_loss_clip": 0.06280539, + "balance_loss_mlp": 0.01260826, + "epoch": 0.4919885765819931, + "flos": 23374820465280.0, + "grad_norm": 1.9470010509475855, + "language_loss": 0.73167384, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.80875778, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.1217041, + "step": 8183, + "time_per_iteration": 2.5529325008392334 + }, + { + "auxiliary_loss_clip": 0.06441429, + "auxiliary_loss_mlp": 0.01268017, + "balance_loss_clip": 0.06279727, + "balance_loss_mlp": 0.01254523, + "epoch": 0.49204869983466104, + "flos": 21366795294720.0, + "grad_norm": 2.013163662705091, + "language_loss": 0.77443838, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.85153282, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1348877, + "step": 8184, + "time_per_iteration": 2.508230209350586 + }, + { + "auxiliary_loss_clip": 0.06435518, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06281742, + "balance_loss_mlp": 0.01254523, + "epoch": 0.492108823087329, + "flos": 21149147514240.0, + "grad_norm": 2.3088868689892674, + "language_loss": 0.71377504, + "learning_rate": 2.148054610995789e-06, + "loss": 0.79079902, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12347412, + "step": 8185, + "time_per_iteration": 2.545316219329834 + }, + { + "auxiliary_loss_clip": 0.06437825, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06280625, + "balance_loss_mlp": 0.01253074, + "epoch": 0.49216894633999697, + "flos": 25123214160000.0, + "grad_norm": 1.8318004423040046, + "language_loss": 0.75395268, + "learning_rate": 2.147666215108831e-06, + "loss": 0.8309986, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.13684082, + "step": 8186, + "time_per_iteration": 2.5238165855407715 + }, + { + "auxiliary_loss_clip": 0.06435218, + "auxiliary_loss_mlp": 0.01274022, + "balance_loss_clip": 0.06281888, + "balance_loss_mlp": 0.01261124, + "epoch": 0.49222906959266494, + "flos": 22644534205440.0, + "grad_norm": 2.2257308208746975, + "language_loss": 0.68571508, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.76280749, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12908936, + "step": 8187, + "time_per_iteration": 2.561488151550293 + }, + { + "auxiliary_loss_clip": 0.06434098, + "auxiliary_loss_mlp": 0.01272206, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01259653, + "epoch": 0.49228919284533296, + "flos": 20416471413120.0, + "grad_norm": 1.3887162782350388, + "language_loss": 0.67211652, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.7491796, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12567139, + "step": 8188, + "time_per_iteration": 2.5019164085388184 + }, + { + "auxiliary_loss_clip": 0.06437577, + "auxiliary_loss_mlp": 0.01267268, + "balance_loss_clip": 0.06282844, + "balance_loss_mlp": 0.012549, + "epoch": 0.4923493160980009, + "flos": 27129142978560.0, + "grad_norm": 1.6466242872646388, + "language_loss": 0.74921268, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.8262611, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12365723, + "step": 8189, + "time_per_iteration": 2.6160171031951904 + }, + { + "auxiliary_loss_clip": 0.06432211, + "auxiliary_loss_mlp": 0.01271904, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01259363, + "epoch": 0.4924094393506689, + "flos": 35745522981120.0, + "grad_norm": 1.6094215463667148, + "language_loss": 0.64780444, + "learning_rate": 2.146112575713104e-06, + "loss": 0.72484565, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12542725, + "step": 8190, + "time_per_iteration": 4.0641090869903564 + }, + { + "auxiliary_loss_clip": 0.06438321, + "auxiliary_loss_mlp": 0.01273117, + "balance_loss_clip": 0.06285122, + "balance_loss_mlp": 0.01260486, + "epoch": 0.49246956260333685, + "flos": 20418735473280.0, + "grad_norm": 1.8613448606205585, + "language_loss": 0.71446037, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.79157472, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12628174, + "step": 8191, + "time_per_iteration": 2.5388033390045166 + }, + { + "auxiliary_loss_clip": 0.06437817, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282701, + "balance_loss_mlp": 0.01253193, + "epoch": 0.4925296858560048, + "flos": 38985152590080.0, + "grad_norm": 1.8396866027790106, + "language_loss": 0.72404003, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.80107331, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12335205, + "step": 8192, + "time_per_iteration": 2.696115255355835 + }, + { + "auxiliary_loss_clip": 0.06334923, + "auxiliary_loss_mlp": 0.01254622, + "balance_loss_clip": 0.06267789, + "balance_loss_mlp": 0.01252217, + "epoch": 0.4925898091086728, + "flos": 64300367652480.0, + "grad_norm": 0.7283072322766662, + "language_loss": 0.51975358, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.59564906, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02401733, + "step": 8193, + "time_per_iteration": 4.540759086608887 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01259417, + "epoch": 0.49264993236134075, + "flos": 23042541899520.0, + "grad_norm": 1.3982393371006636, + "language_loss": 0.77103728, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.84810621, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12554932, + "step": 8194, + "time_per_iteration": 2.585632085800171 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257376, + "epoch": 0.4927100556140087, + "flos": 24725248392960.0, + "grad_norm": 2.1551580003064186, + "language_loss": 0.70539922, + "learning_rate": 2.144170401915341e-06, + "loss": 0.78244197, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12329102, + "step": 8195, + "time_per_iteration": 2.5881664752960205 + }, + { + "auxiliary_loss_clip": 0.06438025, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01257687, + "epoch": 0.4927701788666767, + "flos": 23510932623360.0, + "grad_norm": 2.3036054872688765, + "language_loss": 0.81165189, + "learning_rate": 2.143781950696001e-06, + "loss": 0.88872838, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11932373, + "step": 8196, + "time_per_iteration": 2.5550785064697266 + }, + { + "auxiliary_loss_clip": 0.06437081, + "auxiliary_loss_mlp": 0.01270899, + "balance_loss_clip": 0.06279114, + "balance_loss_mlp": 0.01258311, + "epoch": 0.49283030211934464, + "flos": 22935374127360.0, + "grad_norm": 1.9095456135696567, + "language_loss": 0.70909548, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.78617525, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12597656, + "step": 8197, + "time_per_iteration": 4.003530263900757 + }, + { + "auxiliary_loss_clip": 0.06434973, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.0628255, + "balance_loss_mlp": 0.01259699, + "epoch": 0.4928904253720126, + "flos": 16878622723200.0, + "grad_norm": 1.745870627956974, + "language_loss": 0.84271383, + "learning_rate": 2.143005031915374e-06, + "loss": 0.91977608, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.11553955, + "step": 8198, + "time_per_iteration": 2.498107671737671 + }, + { + "auxiliary_loss_clip": 0.06443786, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06287393, + "balance_loss_mlp": 0.01254521, + "epoch": 0.4929505486246806, + "flos": 14871855363840.0, + "grad_norm": 1.7338591596570678, + "language_loss": 0.76126587, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.83838832, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13946533, + "step": 8199, + "time_per_iteration": 2.5254313945770264 + }, + { + "auxiliary_loss_clip": 0.06436033, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06279432, + "balance_loss_mlp": 0.01252808, + "epoch": 0.49301067187734854, + "flos": 23849206755840.0, + "grad_norm": 1.3683337876027823, + "language_loss": 0.60070461, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.67772967, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13647461, + "step": 8200, + "time_per_iteration": 2.54241943359375 + }, + { + "auxiliary_loss_clip": 0.06429607, + "auxiliary_loss_mlp": 0.01273188, + "balance_loss_clip": 0.06281705, + "balance_loss_mlp": 0.01261541, + "epoch": 0.49307079513001656, + "flos": 22497730652160.0, + "grad_norm": 1.4845406915411774, + "language_loss": 0.79454738, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.87157536, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11657715, + "step": 8201, + "time_per_iteration": 2.590289831161499 + }, + { + "auxiliary_loss_clip": 0.0644393, + "auxiliary_loss_mlp": 0.01272695, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01259171, + "epoch": 0.4931309183826845, + "flos": 15930059777280.0, + "grad_norm": 1.9752291134223394, + "language_loss": 0.66993362, + "learning_rate": 2.141451129398785e-06, + "loss": 0.74709988, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13525391, + "step": 8202, + "time_per_iteration": 2.5706307888031006 + }, + { + "auxiliary_loss_clip": 0.06429332, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06277282, + "balance_loss_mlp": 0.01256055, + "epoch": 0.4931910416353525, + "flos": 27316588561920.0, + "grad_norm": 1.8969992308716948, + "language_loss": 0.75337243, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.83034456, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11834717, + "step": 8203, + "time_per_iteration": 4.0727972984313965 + }, + { + "auxiliary_loss_clip": 0.06434371, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01253626, + "epoch": 0.49325116488802045, + "flos": 20811166871040.0, + "grad_norm": 2.0494104605673935, + "language_loss": 0.80605292, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.8830539, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12103271, + "step": 8204, + "time_per_iteration": 2.6136350631713867 + }, + { + "auxiliary_loss_clip": 0.0643463, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06280951, + "balance_loss_mlp": 0.01254664, + "epoch": 0.4933112881406884, + "flos": 19872247144320.0, + "grad_norm": 1.7256783924705517, + "language_loss": 0.65881336, + "learning_rate": 2.140285646139455e-06, + "loss": 0.73583329, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12695312, + "step": 8205, + "time_per_iteration": 2.5172812938690186 + }, + { + "auxiliary_loss_clip": 0.06445079, + "auxiliary_loss_mlp": 0.01273568, + "balance_loss_clip": 0.06283986, + "balance_loss_mlp": 0.0125971, + "epoch": 0.4933714113933564, + "flos": 21833215447680.0, + "grad_norm": 1.6546444342030124, + "language_loss": 0.66620767, + "learning_rate": 2.139897141060744e-06, + "loss": 0.74339426, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13861084, + "step": 8206, + "time_per_iteration": 2.556596040725708 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01253539, + "epoch": 0.49343153464602435, + "flos": 27897304083840.0, + "grad_norm": 1.8364733010130068, + "language_loss": 0.77070463, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.84770155, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.11584473, + "step": 8207, + "time_per_iteration": 2.591074228286743 + }, + { + "auxiliary_loss_clip": 0.06430385, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01256174, + "epoch": 0.4934916578986923, + "flos": 24688002453120.0, + "grad_norm": 2.876199477758729, + "language_loss": 0.60526079, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.68224895, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12261963, + "step": 8208, + "time_per_iteration": 2.5641872882843018 + }, + { + "auxiliary_loss_clip": 0.06432977, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279885, + "balance_loss_mlp": 0.01256548, + "epoch": 0.4935517811513603, + "flos": 23412024478080.0, + "grad_norm": 2.3268226049750025, + "language_loss": 0.79136336, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.86838233, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12365723, + "step": 8209, + "time_per_iteration": 2.5345427989959717 + }, + { + "auxiliary_loss_clip": 0.06431048, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01253917, + "epoch": 0.49361190440402825, + "flos": 21950948833920.0, + "grad_norm": 3.2965997735856423, + "language_loss": 0.79514015, + "learning_rate": 2.138343067844089e-06, + "loss": 0.87211347, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12359619, + "step": 8210, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06438643, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06280634, + "balance_loss_mlp": 0.01256629, + "epoch": 0.4936720276566962, + "flos": 25122124056960.0, + "grad_norm": 2.539502696257949, + "language_loss": 0.81421793, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8912915, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12084961, + "step": 8211, + "time_per_iteration": 2.5667943954467773 + }, + { + "auxiliary_loss_clip": 0.06429391, + "auxiliary_loss_mlp": 0.0126729, + "balance_loss_clip": 0.06274866, + "balance_loss_mlp": 0.01254803, + "epoch": 0.4937321509093642, + "flos": 26366055045120.0, + "grad_norm": 2.1078758653058913, + "language_loss": 0.91783321, + "learning_rate": 2.137565999700933e-06, + "loss": 0.99480009, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12506104, + "step": 8212, + "time_per_iteration": 2.5892627239227295 + }, + { + "auxiliary_loss_clip": 0.06437102, + "auxiliary_loss_mlp": 0.01269581, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01257511, + "epoch": 0.49379227416203214, + "flos": 22967211479040.0, + "grad_norm": 1.9203573298750467, + "language_loss": 0.65474772, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.7318145, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1206665, + "step": 8213, + "time_per_iteration": 2.5766966342926025 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06281263, + "balance_loss_mlp": 0.01254957, + "epoch": 0.49385239741470016, + "flos": 32497340256000.0, + "grad_norm": 5.5178519689557435, + "language_loss": 0.76015925, + "learning_rate": 2.136788910691711e-06, + "loss": 0.83718324, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1217041, + "step": 8214, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.06435767, + "auxiliary_loss_mlp": 0.01267382, + "balance_loss_clip": 0.06282468, + "balance_loss_mlp": 0.0125508, + "epoch": 0.4939125206673681, + "flos": 22499575441920.0, + "grad_norm": 1.6727543381074526, + "language_loss": 0.84167933, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.91871083, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12298584, + "step": 8215, + "time_per_iteration": 2.6213715076446533 + }, + { + "auxiliary_loss_clip": 0.06426814, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06280927, + "balance_loss_mlp": 0.0125696, + "epoch": 0.4939726439200361, + "flos": 31184493684480.0, + "grad_norm": 1.9918722360209278, + "language_loss": 0.83712834, + "learning_rate": 2.136011800934292e-06, + "loss": 0.91407919, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11315918, + "step": 8216, + "time_per_iteration": 2.619922637939453 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06283373, + "balance_loss_mlp": 0.01260614, + "epoch": 0.49403276717270406, + "flos": 22680773896320.0, + "grad_norm": 1.6954468061355052, + "language_loss": 0.75099367, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.82805896, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11791992, + "step": 8217, + "time_per_iteration": 2.5473809242248535 + }, + { + "auxiliary_loss_clip": 0.06434639, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06285703, + "balance_loss_mlp": 0.01258422, + "epoch": 0.494092890425372, + "flos": 20747408313600.0, + "grad_norm": 1.6176152886760666, + "language_loss": 0.78781378, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.86487138, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12689209, + "step": 8218, + "time_per_iteration": 2.542994976043701 + }, + { + "auxiliary_loss_clip": 0.06433167, + "auxiliary_loss_mlp": 0.01265257, + "balance_loss_clip": 0.06283546, + "balance_loss_mlp": 0.01253628, + "epoch": 0.49415301367804, + "flos": 18374889882240.0, + "grad_norm": 2.39829798701753, + "language_loss": 0.77065396, + "learning_rate": 2.134846097653142e-06, + "loss": 0.84763819, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11639404, + "step": 8219, + "time_per_iteration": 2.5450475215911865 + }, + { + "auxiliary_loss_clip": 0.06439486, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.06285974, + "balance_loss_mlp": 0.01258321, + "epoch": 0.49421313693070795, + "flos": 17536471528320.0, + "grad_norm": 2.258549541306087, + "language_loss": 0.62705898, + "learning_rate": 2.134457519646357e-06, + "loss": 0.70415157, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11462402, + "step": 8220, + "time_per_iteration": 2.5296928882598877 + }, + { + "auxiliary_loss_clip": 0.06433114, + "auxiliary_loss_mlp": 0.01270633, + "balance_loss_clip": 0.06280304, + "balance_loss_mlp": 0.01259076, + "epoch": 0.4942732601833759, + "flos": 20818210613760.0, + "grad_norm": 1.8931623619102378, + "language_loss": 0.72802091, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.80505836, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11572266, + "step": 8221, + "time_per_iteration": 2.521430253982544 + }, + { + "auxiliary_loss_clip": 0.06441319, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06292681, + "balance_loss_mlp": 0.01258761, + "epoch": 0.4943333834360439, + "flos": 15054269702400.0, + "grad_norm": 1.6896047494674526, + "language_loss": 0.79253769, + "learning_rate": 2.133680348351595e-06, + "loss": 0.86965781, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11920166, + "step": 8222, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06434612, + "auxiliary_loss_mlp": 0.01272431, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49439350668871185, + "flos": 16075899008640.0, + "grad_norm": 6.490136916654426, + "language_loss": 0.72483402, + "learning_rate": 2.133291755093088e-06, + "loss": 0.80190444, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.1192627, + "step": 8223, + "time_per_iteration": 2.457361936569214 + }, + { + "auxiliary_loss_clip": 0.06444422, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06287469, + "balance_loss_mlp": 0.01257367, + "epoch": 0.4944536299413798, + "flos": 20885281407360.0, + "grad_norm": 1.6318042764148617, + "language_loss": 0.75256205, + "learning_rate": 2.132903156780144e-06, + "loss": 0.82971096, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.13122559, + "step": 8224, + "time_per_iteration": 2.5326499938964844 + }, + { + "auxiliary_loss_clip": 0.06441943, + "auxiliary_loss_mlp": 0.01267954, + "balance_loss_clip": 0.06287307, + "balance_loss_mlp": 0.01255646, + "epoch": 0.4945137531940478, + "flos": 26615162833920.0, + "grad_norm": 2.58625148433793, + "language_loss": 0.64002287, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.71712184, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12322998, + "step": 8225, + "time_per_iteration": 2.555088996887207 + }, + { + "auxiliary_loss_clip": 0.06438252, + "auxiliary_loss_mlp": 0.01269636, + "balance_loss_clip": 0.06283222, + "balance_loss_mlp": 0.01258007, + "epoch": 0.49457387644671574, + "flos": 23995004060160.0, + "grad_norm": 2.0569415863505554, + "language_loss": 0.77084112, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.84792, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11627197, + "step": 8226, + "time_per_iteration": 2.557900905609131 + }, + { + "auxiliary_loss_clip": 0.06436731, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06281079, + "balance_loss_mlp": 0.01256958, + "epoch": 0.49463399969938376, + "flos": 26983387601280.0, + "grad_norm": 1.6446627405679832, + "language_loss": 0.71402973, + "learning_rate": 2.131737331662051e-06, + "loss": 0.79110235, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13568115, + "step": 8227, + "time_per_iteration": 2.533468246459961 + }, + { + "auxiliary_loss_clip": 0.06441461, + "auxiliary_loss_mlp": 0.01270684, + "balance_loss_clip": 0.06282251, + "balance_loss_mlp": 0.01258477, + "epoch": 0.49469412295205173, + "flos": 29689610117760.0, + "grad_norm": 1.6469495440568809, + "language_loss": 0.7179364, + "learning_rate": 2.131348713278718e-06, + "loss": 0.79505783, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12213135, + "step": 8228, + "time_per_iteration": 2.621777296066284 + }, + { + "auxiliary_loss_clip": 0.06432875, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_clip": 0.06283268, + "balance_loss_mlp": 0.01259768, + "epoch": 0.4947542462047197, + "flos": 24138285742080.0, + "grad_norm": 1.3686875437171686, + "language_loss": 0.84044397, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.91748512, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1149292, + "step": 8229, + "time_per_iteration": 2.620849609375 + }, + { + "auxiliary_loss_clip": 0.06443636, + "auxiliary_loss_mlp": 0.01271474, + "balance_loss_clip": 0.0628624, + "balance_loss_mlp": 0.01258134, + "epoch": 0.49481436945738766, + "flos": 20050804195200.0, + "grad_norm": 2.3211713476829656, + "language_loss": 0.75208747, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.82923853, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13342285, + "step": 8230, + "time_per_iteration": 3.9126293659210205 + }, + { + "auxiliary_loss_clip": 0.06439002, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01256432, + "epoch": 0.4948744927100556, + "flos": 15675040275840.0, + "grad_norm": 1.9615207178823395, + "language_loss": 0.80548179, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.88256031, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1239624, + "step": 8231, + "time_per_iteration": 2.525049924850464 + }, + { + "auxiliary_loss_clip": 0.06329959, + "auxiliary_loss_mlp": 0.01257972, + "balance_loss_clip": 0.06262948, + "balance_loss_mlp": 0.0125556, + "epoch": 0.4949346159627236, + "flos": 68893611644160.0, + "grad_norm": 0.7512177245674743, + "language_loss": 0.60052431, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.67640364, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02409363, + "step": 8232, + "time_per_iteration": 4.674450159072876 + }, + { + "auxiliary_loss_clip": 0.06440374, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.0125631, + "epoch": 0.49499473921539155, + "flos": 24797182723200.0, + "grad_norm": 1.782814520641974, + "language_loss": 0.68933427, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.76643485, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13366699, + "step": 8233, + "time_per_iteration": 2.574759006500244 + }, + { + "auxiliary_loss_clip": 0.06426412, + "auxiliary_loss_mlp": 0.01270358, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01258508, + "epoch": 0.4950548624680595, + "flos": 32716161993600.0, + "grad_norm": 2.8586701341507355, + "language_loss": 0.6684472, + "learning_rate": 2.129016898898633e-06, + "loss": 0.74541491, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1184082, + "step": 8234, + "time_per_iteration": 2.653381824493408 + }, + { + "auxiliary_loss_clip": 0.06329186, + "auxiliary_loss_mlp": 0.0125637, + "balance_loss_clip": 0.06261852, + "balance_loss_mlp": 0.01254119, + "epoch": 0.4951149857207275, + "flos": 50100616287360.0, + "grad_norm": 0.7779673724008701, + "language_loss": 0.58149666, + "learning_rate": 2.128628245959482e-06, + "loss": 0.65735215, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02255249, + "step": 8235, + "time_per_iteration": 3.0858991146087646 + }, + { + "auxiliary_loss_clip": 0.06437027, + "auxiliary_loss_mlp": 0.01272544, + "balance_loss_clip": 0.06281243, + "balance_loss_mlp": 0.01259401, + "epoch": 0.49517510897339545, + "flos": 22243340056320.0, + "grad_norm": 1.7279160321905627, + "language_loss": 0.77504063, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.85213637, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13134766, + "step": 8236, + "time_per_iteration": 2.5753977298736572 + }, + { + "auxiliary_loss_clip": 0.06428996, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.06278376, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4952352322260634, + "flos": 25381126627200.0, + "grad_norm": 1.6842676088909172, + "language_loss": 0.72880518, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.80577087, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11529541, + "step": 8237, + "time_per_iteration": 4.036882400512695 + }, + { + "auxiliary_loss_clip": 0.06434725, + "auxiliary_loss_mlp": 0.01270554, + "balance_loss_clip": 0.06283747, + "balance_loss_mlp": 0.0125787, + "epoch": 0.4952953554787314, + "flos": 24615732706560.0, + "grad_norm": 2.2000126991913285, + "language_loss": 0.75703216, + "learning_rate": 2.127462257935406e-06, + "loss": 0.83408493, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12664795, + "step": 8238, + "time_per_iteration": 2.549431085586548 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257081, + "epoch": 0.49535547873139935, + "flos": 17317020885120.0, + "grad_norm": 2.278500195677925, + "language_loss": 0.74391794, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.82096863, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12615967, + "step": 8239, + "time_per_iteration": 2.571343183517456 + }, + { + "auxiliary_loss_clip": 0.06438212, + "auxiliary_loss_mlp": 0.01271609, + "balance_loss_clip": 0.06280148, + "balance_loss_mlp": 0.01257917, + "epoch": 0.4954156019840673, + "flos": 20746527845760.0, + "grad_norm": 2.0000035114581927, + "language_loss": 0.79093564, + "learning_rate": 2.126684908394552e-06, + "loss": 0.86803377, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13684082, + "step": 8240, + "time_per_iteration": 2.531712532043457 + }, + { + "auxiliary_loss_clip": 0.06430051, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06279683, + "balance_loss_mlp": 0.0125594, + "epoch": 0.49547572523673533, + "flos": 12825200661120.0, + "grad_norm": 2.1298693498085592, + "language_loss": 0.86484092, + "learning_rate": 2.126296226410898e-06, + "loss": 0.94181418, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11334229, + "step": 8241, + "time_per_iteration": 2.5414860248565674 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01270719, + "balance_loss_clip": 0.06279866, + "balance_loss_mlp": 0.01260003, + "epoch": 0.4955358484894033, + "flos": 15602602821120.0, + "grad_norm": 1.7100085929309539, + "language_loss": 0.77987742, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.85685694, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10723877, + "step": 8242, + "time_per_iteration": 2.500761032104492 + }, + { + "auxiliary_loss_clip": 0.06436419, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06284536, + "balance_loss_mlp": 0.0125308, + "epoch": 0.49559597174207126, + "flos": 26470832976000.0, + "grad_norm": 1.8102794432235507, + "language_loss": 0.67317849, + "learning_rate": 2.125518848090833e-06, + "loss": 0.75019407, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.1206665, + "step": 8243, + "time_per_iteration": 4.062270641326904 + }, + { + "auxiliary_loss_clip": 0.06430024, + "auxiliary_loss_mlp": 0.01269105, + "balance_loss_clip": 0.06279217, + "balance_loss_mlp": 0.0125722, + "epoch": 0.4956560949947392, + "flos": 23154824770560.0, + "grad_norm": 2.721585758888369, + "language_loss": 0.68786383, + "learning_rate": 2.125130151783901e-06, + "loss": 0.76485521, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11889648, + "step": 8244, + "time_per_iteration": 2.55732798576355 + }, + { + "auxiliary_loss_clip": 0.06434646, + "auxiliary_loss_mlp": 0.01266504, + "balance_loss_clip": 0.06280981, + "balance_loss_mlp": 0.01254541, + "epoch": 0.4957162182474072, + "flos": 20779119884160.0, + "grad_norm": 2.485823072522516, + "language_loss": 0.75575739, + "learning_rate": 2.12474145073202e-06, + "loss": 0.83276892, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.11962891, + "step": 8245, + "time_per_iteration": 2.5086231231689453 + }, + { + "auxiliary_loss_clip": 0.06428742, + "auxiliary_loss_mlp": 0.01268325, + "balance_loss_clip": 0.06280199, + "balance_loss_mlp": 0.01256762, + "epoch": 0.49577634150007516, + "flos": 18740179756800.0, + "grad_norm": 1.8890947976192427, + "language_loss": 0.81602311, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.89299381, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11572266, + "step": 8246, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.06440324, + "auxiliary_loss_mlp": 0.01268715, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01256347, + "epoch": 0.4958364647527431, + "flos": 25560815708160.0, + "grad_norm": 1.7539344008969155, + "language_loss": 0.84379256, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.92088294, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12359619, + "step": 8247, + "time_per_iteration": 2.5563809871673584 + }, + { + "auxiliary_loss_clip": 0.06436694, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06282616, + "balance_loss_mlp": 0.01257798, + "epoch": 0.4958965880054111, + "flos": 24432144410880.0, + "grad_norm": 2.2837128243369658, + "language_loss": 0.84184051, + "learning_rate": 2.123575319254087e-06, + "loss": 0.91890538, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12011719, + "step": 8248, + "time_per_iteration": 2.566392660140991 + }, + { + "auxiliary_loss_clip": 0.0643697, + "auxiliary_loss_mlp": 0.01268541, + "balance_loss_clip": 0.06282248, + "balance_loss_mlp": 0.01256024, + "epoch": 0.49595671125807905, + "flos": 25090622121600.0, + "grad_norm": 1.727142692455913, + "language_loss": 0.73609596, + "learning_rate": 2.123186599369812e-06, + "loss": 0.813151, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12518311, + "step": 8249, + "time_per_iteration": 2.548520088195801 + }, + { + "auxiliary_loss_clip": 0.06441288, + "auxiliary_loss_mlp": 0.01269234, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256365, + "epoch": 0.496016834510747, + "flos": 16441524299520.0, + "grad_norm": 2.7229998624345115, + "language_loss": 0.76506901, + "learning_rate": 2.122797874814289e-06, + "loss": 0.84217423, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12860107, + "step": 8250, + "time_per_iteration": 2.524714231491089 + }, + { + "auxiliary_loss_clip": 0.06438759, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06282068, + "balance_loss_mlp": 0.01256551, + "epoch": 0.496076957763415, + "flos": 23444197246080.0, + "grad_norm": 1.6959600873244032, + "language_loss": 0.7021333, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.77921373, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12738037, + "step": 8251, + "time_per_iteration": 2.531841516494751 + }, + { + "auxiliary_loss_clip": 0.06437311, + "auxiliary_loss_mlp": 0.01271839, + "balance_loss_clip": 0.06285296, + "balance_loss_mlp": 0.01259871, + "epoch": 0.49613708101608295, + "flos": 16915113976320.0, + "grad_norm": 1.8201441219473296, + "language_loss": 0.7993809, + "learning_rate": 2.122020411748461e-06, + "loss": 0.87647241, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11975098, + "step": 8252, + "time_per_iteration": 2.5806944370269775 + }, + { + "auxiliary_loss_clip": 0.06434863, + "auxiliary_loss_mlp": 0.01270348, + "balance_loss_clip": 0.06282027, + "balance_loss_mlp": 0.01255905, + "epoch": 0.4961972042687509, + "flos": 16623729002880.0, + "grad_norm": 1.8109031344325417, + "language_loss": 0.81898755, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.89603961, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.14447021, + "step": 8253, + "time_per_iteration": 2.4936153888702393 + }, + { + "auxiliary_loss_clip": 0.0643016, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06279143, + "balance_loss_mlp": 0.01253139, + "epoch": 0.49625732752141893, + "flos": 28965529059840.0, + "grad_norm": 1.4049535238306547, + "language_loss": 0.67659622, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.7535435, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11437988, + "step": 8254, + "time_per_iteration": 2.681328058242798 + }, + { + "auxiliary_loss_clip": 0.06436362, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.01254729, + "epoch": 0.4963174507740869, + "flos": 23119046277120.0, + "grad_norm": 6.04751780380752, + "language_loss": 0.74611968, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.82315457, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12384033, + "step": 8255, + "time_per_iteration": 2.5586442947387695 + }, + { + "auxiliary_loss_clip": 0.06430424, + "auxiliary_loss_mlp": 0.01268774, + "balance_loss_clip": 0.06278734, + "balance_loss_mlp": 0.01256972, + "epoch": 0.49637757402675486, + "flos": 13922998928640.0, + "grad_norm": 1.9051204382469373, + "language_loss": 0.81712639, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.89411843, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11798096, + "step": 8256, + "time_per_iteration": 2.525191307067871 + }, + { + "auxiliary_loss_clip": 0.06430264, + "auxiliary_loss_mlp": 0.01267515, + "balance_loss_clip": 0.06279526, + "balance_loss_mlp": 0.01256035, + "epoch": 0.49643769727942283, + "flos": 22315442094720.0, + "grad_norm": 1.4246388626256767, + "language_loss": 0.81285727, + "learning_rate": 2.120076673368901e-06, + "loss": 0.889835, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8257, + "time_per_iteration": 2.5366289615631104 + }, + { + "auxiliary_loss_clip": 0.06441522, + "auxiliary_loss_mlp": 0.01265551, + "balance_loss_clip": 0.06281207, + "balance_loss_mlp": 0.01253153, + "epoch": 0.4964978205320908, + "flos": 19506328364160.0, + "grad_norm": 1.7556989119603337, + "language_loss": 0.66651785, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.74358857, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1239624, + "step": 8258, + "time_per_iteration": 2.567802667617798 + }, + { + "auxiliary_loss_clip": 0.06427691, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06277505, + "balance_loss_mlp": 0.0125607, + "epoch": 0.49655794378475876, + "flos": 23442562091520.0, + "grad_norm": 1.5238866764667018, + "language_loss": 0.7778039, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.85474873, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.10723877, + "step": 8259, + "time_per_iteration": 2.5521552562713623 + }, + { + "auxiliary_loss_clip": 0.06430545, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06279439, + "balance_loss_mlp": 0.01254954, + "epoch": 0.4966180670374267, + "flos": 26837967640320.0, + "grad_norm": 1.4589343239403403, + "language_loss": 0.78972054, + "learning_rate": 2.1189103755834e-06, + "loss": 0.86669362, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11816406, + "step": 8260, + "time_per_iteration": 2.6012649536132812 + }, + { + "auxiliary_loss_clip": 0.06434717, + "auxiliary_loss_mlp": 0.01267655, + "balance_loss_clip": 0.06279895, + "balance_loss_mlp": 0.01255055, + "epoch": 0.4966781902900947, + "flos": 22014413902080.0, + "grad_norm": 2.8586716221878206, + "language_loss": 0.76515198, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12591553, + "step": 8261, + "time_per_iteration": 2.4737415313720703 + }, + { + "auxiliary_loss_clip": 0.06427643, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01255772, + "epoch": 0.49673831354276266, + "flos": 26220509303040.0, + "grad_norm": 1.7291004140234418, + "language_loss": 0.89456958, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.97151601, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11230469, + "step": 8262, + "time_per_iteration": 2.613236665725708 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01268648, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4967984367954306, + "flos": 23188464984960.0, + "grad_norm": 1.4347791599980126, + "language_loss": 0.73918176, + "learning_rate": 2.11774403721606e-06, + "loss": 0.81618452, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11834717, + "step": 8263, + "time_per_iteration": 2.595635414123535 + }, + { + "auxiliary_loss_clip": 0.06439725, + "auxiliary_loss_mlp": 0.01274389, + "balance_loss_clip": 0.06283052, + "balance_loss_mlp": 0.01260239, + "epoch": 0.4968585600480986, + "flos": 19287506626560.0, + "grad_norm": 2.258936930728745, + "language_loss": 0.69678748, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.77392858, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.14147949, + "step": 8264, + "time_per_iteration": 2.5913755893707275 + }, + { + "auxiliary_loss_clip": 0.06438377, + "auxiliary_loss_mlp": 0.01267325, + "balance_loss_clip": 0.06281792, + "balance_loss_mlp": 0.01255136, + "epoch": 0.49691868330076655, + "flos": 22535312008320.0, + "grad_norm": 1.388736059607974, + "language_loss": 0.65131235, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.72836947, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12188721, + "step": 8265, + "time_per_iteration": 2.528193473815918 + }, + { + "auxiliary_loss_clip": 0.06333993, + "auxiliary_loss_mlp": 0.01255399, + "balance_loss_clip": 0.06266748, + "balance_loss_mlp": 0.01253268, + "epoch": 0.4969788065534345, + "flos": 66598897328640.0, + "grad_norm": 0.8036364801041208, + "language_loss": 0.53402334, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.60991728, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02133179, + "step": 8266, + "time_per_iteration": 3.1838197708129883 + }, + { + "auxiliary_loss_clip": 0.06428756, + "auxiliary_loss_mlp": 0.01272627, + "balance_loss_clip": 0.06282037, + "balance_loss_mlp": 0.01260592, + "epoch": 0.49703892980610254, + "flos": 24066099849600.0, + "grad_norm": 1.4975664699088878, + "language_loss": 0.79899192, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.87600571, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12036133, + "step": 8267, + "time_per_iteration": 2.556995391845703 + }, + { + "auxiliary_loss_clip": 0.06434017, + "auxiliary_loss_mlp": 0.01269443, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.01256295, + "epoch": 0.4970990530587705, + "flos": 29132807736960.0, + "grad_norm": 3.0454644456900155, + "language_loss": 0.75843596, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.83547056, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.13146973, + "step": 8268, + "time_per_iteration": 2.6049721240997314 + }, + { + "auxiliary_loss_clip": 0.06435575, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01257185, + "epoch": 0.49715917631143847, + "flos": 46036811047680.0, + "grad_norm": 1.4862794016102487, + "language_loss": 0.68007714, + "learning_rate": 2.115411240328073e-06, + "loss": 0.75713372, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12902832, + "step": 8269, + "time_per_iteration": 4.128691911697388 + }, + { + "auxiliary_loss_clip": 0.06433591, + "auxiliary_loss_mlp": 0.01270109, + "balance_loss_clip": 0.06283623, + "balance_loss_mlp": 0.01258444, + "epoch": 0.49721929956410643, + "flos": 20197104624000.0, + "grad_norm": 1.5327488108804688, + "language_loss": 0.85668087, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.93371785, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11669922, + "step": 8270, + "time_per_iteration": 2.518367290496826 + }, + { + "auxiliary_loss_clip": 0.06438391, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.01258443, + "epoch": 0.4972794228167744, + "flos": 21660108912000.0, + "grad_norm": 1.8194061326909323, + "language_loss": 0.71364737, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7907263, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1105957, + "step": 8271, + "time_per_iteration": 2.5573620796203613 + }, + { + "auxiliary_loss_clip": 0.06437098, + "auxiliary_loss_mlp": 0.01269156, + "balance_loss_clip": 0.06284092, + "balance_loss_mlp": 0.0125646, + "epoch": 0.49733954606944236, + "flos": 24286598668800.0, + "grad_norm": 1.3024187792808712, + "language_loss": 0.78511107, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.86217368, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12677002, + "step": 8272, + "time_per_iteration": 4.061326742172241 + }, + { + "auxiliary_loss_clip": 0.06438889, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06285517, + "balance_loss_mlp": 0.01257548, + "epoch": 0.4973996693221103, + "flos": 37861722172800.0, + "grad_norm": 2.25975995369767, + "language_loss": 0.66725254, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.7443465, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12957764, + "step": 8273, + "time_per_iteration": 2.645908832550049 + }, + { + "auxiliary_loss_clip": 0.06436634, + "auxiliary_loss_mlp": 0.01276274, + "balance_loss_clip": 0.06285357, + "balance_loss_mlp": 0.01264109, + "epoch": 0.4974597925747783, + "flos": 21367885397760.0, + "grad_norm": 1.5281958400790516, + "language_loss": 0.78156513, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.8586942, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12182617, + "step": 8274, + "time_per_iteration": 2.535804271697998 + }, + { + "auxiliary_loss_clip": 0.06437881, + "auxiliary_loss_mlp": 0.0127292, + "balance_loss_clip": 0.06281041, + "balance_loss_mlp": 0.01259992, + "epoch": 0.49751991582744626, + "flos": 30746137449600.0, + "grad_norm": 1.6098675264323796, + "language_loss": 0.76012516, + "learning_rate": 2.113078285889493e-06, + "loss": 0.83723313, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12939453, + "step": 8275, + "time_per_iteration": 2.5787549018859863 + }, + { + "auxiliary_loss_clip": 0.06438003, + "auxiliary_loss_mlp": 0.01271635, + "balance_loss_clip": 0.06282246, + "balance_loss_mlp": 0.01257789, + "epoch": 0.4975800390801142, + "flos": 14105748683520.0, + "grad_norm": 1.8196816586022186, + "language_loss": 0.84079218, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.91788852, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1385498, + "step": 8276, + "time_per_iteration": 2.5156893730163574 + }, + { + "auxiliary_loss_clip": 0.06426419, + "auxiliary_loss_mlp": 0.01277009, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01265398, + "epoch": 0.4976401623327822, + "flos": 24214203141120.0, + "grad_norm": 1.3141436658277077, + "language_loss": 0.70087981, + "learning_rate": 2.112300599949172e-06, + "loss": 0.77791417, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.1161499, + "step": 8277, + "time_per_iteration": 3.9860711097717285 + }, + { + "auxiliary_loss_clip": 0.06429198, + "auxiliary_loss_mlp": 0.01270973, + "balance_loss_clip": 0.06280812, + "balance_loss_mlp": 0.01258754, + "epoch": 0.49770028558545015, + "flos": 21142229552640.0, + "grad_norm": 1.8219149953370526, + "language_loss": 0.82141137, + "learning_rate": 2.111911750583964e-06, + "loss": 0.89841306, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12231445, + "step": 8278, + "time_per_iteration": 2.5353100299835205 + }, + { + "auxiliary_loss_clip": 0.06435424, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06279474, + "balance_loss_mlp": 0.01256246, + "epoch": 0.4977604088381181, + "flos": 16769568234240.0, + "grad_norm": 1.8298360040603827, + "language_loss": 0.68205428, + "learning_rate": 2.111522896975052e-06, + "loss": 0.75909793, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12695312, + "step": 8279, + "time_per_iteration": 2.538273334503174 + }, + { + "auxiliary_loss_clip": 0.06430422, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01258129, + "epoch": 0.49782053209078614, + "flos": 15708596636160.0, + "grad_norm": 1.929140490148881, + "language_loss": 0.70948005, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.78650236, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13665771, + "step": 8280, + "time_per_iteration": 2.5344486236572266 + }, + { + "auxiliary_loss_clip": 0.06432884, + "auxiliary_loss_mlp": 0.01270682, + "balance_loss_clip": 0.06279922, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4978806553434541, + "flos": 24760565688960.0, + "grad_norm": 1.4498126802552027, + "language_loss": 0.6468308, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.72386646, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13366699, + "step": 8281, + "time_per_iteration": 2.5905003547668457 + }, + { + "auxiliary_loss_clip": 0.06432123, + "auxiliary_loss_mlp": 0.01269379, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01256141, + "epoch": 0.49794077859612207, + "flos": 13120820265600.0, + "grad_norm": 2.543831826961268, + "language_loss": 0.73404002, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.81105494, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13226318, + "step": 8282, + "time_per_iteration": 2.481513023376465 + }, + { + "auxiliary_loss_clip": 0.06433594, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.062822, + "balance_loss_mlp": 0.01260748, + "epoch": 0.49800090184879003, + "flos": 27532223844480.0, + "grad_norm": 1.4555237952962066, + "language_loss": 0.7312296, + "learning_rate": 2.109967440397263e-06, + "loss": 0.80828691, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.1138916, + "step": 8283, + "time_per_iteration": 4.015530824661255 + }, + { + "auxiliary_loss_clip": 0.06430134, + "auxiliary_loss_mlp": 0.01267653, + "balance_loss_clip": 0.06279625, + "balance_loss_mlp": 0.01254791, + "epoch": 0.498061025101458, + "flos": 19798677659520.0, + "grad_norm": 1.429490370630744, + "language_loss": 0.78535879, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8623367, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12860107, + "step": 8284, + "time_per_iteration": 2.4994332790374756 + }, + { + "auxiliary_loss_clip": 0.06437389, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06278685, + "balance_loss_mlp": 0.01255864, + "epoch": 0.49812114835412596, + "flos": 29900926915200.0, + "grad_norm": 1.711585124439885, + "language_loss": 0.7343573, + "learning_rate": 2.109189687029526e-06, + "loss": 0.81143022, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.14044189, + "step": 8285, + "time_per_iteration": 2.566572904586792 + }, + { + "auxiliary_loss_clip": 0.06430154, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.0627718, + "balance_loss_mlp": 0.01258404, + "epoch": 0.49818127160679393, + "flos": 23153441178240.0, + "grad_norm": 1.4871294259616603, + "language_loss": 0.74281567, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.81982332, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 8286, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.06434155, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06279751, + "balance_loss_mlp": 0.0125358, + "epoch": 0.4982413948594619, + "flos": 21659228444160.0, + "grad_norm": 1.6982664351725185, + "language_loss": 0.85701174, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.93401492, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12579346, + "step": 8287, + "time_per_iteration": 2.518136501312256 + }, + { + "auxiliary_loss_clip": 0.06432185, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06276216, + "balance_loss_mlp": 0.01256801, + "epoch": 0.49830151811212986, + "flos": 32494866560640.0, + "grad_norm": 1.6945408763753198, + "language_loss": 0.72708082, + "learning_rate": 2.108023025961159e-06, + "loss": 0.80410802, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.13745117, + "step": 8288, + "time_per_iteration": 2.590862512588501 + }, + { + "auxiliary_loss_clip": 0.06436619, + "auxiliary_loss_mlp": 0.01272174, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01258972, + "epoch": 0.4983616413647978, + "flos": 18146886122880.0, + "grad_norm": 4.0455531591406855, + "language_loss": 0.81054366, + "learning_rate": 2.10763413072622e-06, + "loss": 0.8876316, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.13201904, + "step": 8289, + "time_per_iteration": 2.504817008972168 + }, + { + "auxiliary_loss_clip": 0.06432903, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06279443, + "balance_loss_mlp": 0.01257074, + "epoch": 0.4984217646174658, + "flos": 19724898539520.0, + "grad_norm": 2.471620750065275, + "language_loss": 0.73847377, + "learning_rate": 2.107245231409784e-06, + "loss": 0.81550646, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13305664, + "step": 8290, + "time_per_iteration": 2.492176055908203 + }, + { + "auxiliary_loss_clip": 0.0643364, + "auxiliary_loss_mlp": 0.01275224, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01261157, + "epoch": 0.49848188787013376, + "flos": 24943525079040.0, + "grad_norm": 1.4456375643187662, + "language_loss": 0.84330356, + "learning_rate": 2.106856328026598e-06, + "loss": 0.92039216, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.140625, + "step": 8291, + "time_per_iteration": 2.5577101707458496 + }, + { + "auxiliary_loss_clip": 0.06438746, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06277075, + "balance_loss_mlp": 0.01257379, + "epoch": 0.4985420111228017, + "flos": 22388969652480.0, + "grad_norm": 1.8626179833436056, + "language_loss": 0.67868197, + "learning_rate": 2.106467420591409e-06, + "loss": 0.75577605, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13275146, + "step": 8292, + "time_per_iteration": 2.5227880477905273 + }, + { + "auxiliary_loss_clip": 0.06428275, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06275518, + "balance_loss_mlp": 0.01256977, + "epoch": 0.4986021343754697, + "flos": 16221989802240.0, + "grad_norm": 1.635019918785358, + "language_loss": 0.67247725, + "learning_rate": 2.106078509118965e-06, + "loss": 0.749448, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11798096, + "step": 8293, + "time_per_iteration": 2.5051913261413574 + }, + { + "auxiliary_loss_clip": 0.0643108, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275735, + "balance_loss_mlp": 0.01258891, + "epoch": 0.4986622576281377, + "flos": 23410221615360.0, + "grad_norm": 1.789605024821123, + "language_loss": 0.82488304, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.90189755, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.11480713, + "step": 8294, + "time_per_iteration": 2.5429139137268066 + }, + { + "auxiliary_loss_clip": 0.06432615, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277893, + "balance_loss_mlp": 0.01260315, + "epoch": 0.49872238088080567, + "flos": 19980714654720.0, + "grad_norm": 2.5766475970916285, + "language_loss": 0.73639232, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.81344408, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12249756, + "step": 8295, + "time_per_iteration": 2.535090923309326 + }, + { + "auxiliary_loss_clip": 0.06427556, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06276329, + "balance_loss_mlp": 0.01259911, + "epoch": 0.49878250413347364, + "flos": 22899595633920.0, + "grad_norm": 1.8257233918976585, + "language_loss": 0.68199098, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.75899148, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12591553, + "step": 8296, + "time_per_iteration": 2.5079848766326904 + }, + { + "auxiliary_loss_clip": 0.06433527, + "auxiliary_loss_mlp": 0.01272036, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.0125878, + "epoch": 0.4988426273861416, + "flos": 32606688234240.0, + "grad_norm": 1.801119189108274, + "language_loss": 0.64925557, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.72631121, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13256836, + "step": 8297, + "time_per_iteration": 2.6275887489318848 + }, + { + "auxiliary_loss_clip": 0.06427586, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06278079, + "balance_loss_mlp": 0.01258845, + "epoch": 0.49890275063880957, + "flos": 20929990360320.0, + "grad_norm": 1.5890674789628483, + "language_loss": 0.69987392, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.77685434, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11621094, + "step": 8298, + "time_per_iteration": 2.527082681655884 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01253668, + "epoch": 0.49896287389147753, + "flos": 18630370581120.0, + "grad_norm": 3.032196085375079, + "language_loss": 0.85047698, + "learning_rate": 2.103744956327814e-06, + "loss": 0.92741591, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11517334, + "step": 8299, + "time_per_iteration": 2.531541585922241 + }, + { + "auxiliary_loss_clip": 0.06429411, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06274673, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4990229971441455, + "flos": 24833422414080.0, + "grad_norm": 2.041795476236588, + "language_loss": 0.69284618, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.76981199, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13061523, + "step": 8300, + "time_per_iteration": 2.562002658843994 + }, + { + "auxiliary_loss_clip": 0.0633271, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06265618, + "balance_loss_mlp": 0.01267531, + "epoch": 0.49908312039681346, + "flos": 71405638323840.0, + "grad_norm": 0.7392878070409407, + "language_loss": 0.51101816, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.58704311, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02253723, + "step": 8301, + "time_per_iteration": 3.3210127353668213 + }, + { + "auxiliary_loss_clip": 0.06423864, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06275457, + "balance_loss_mlp": 0.01258173, + "epoch": 0.4991432436494814, + "flos": 19834791569280.0, + "grad_norm": 2.2486532521822302, + "language_loss": 0.84452468, + "learning_rate": 2.102578126623879e-06, + "loss": 0.921471, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12591553, + "step": 8302, + "time_per_iteration": 2.547562837600708 + }, + { + "auxiliary_loss_clip": 0.06428537, + "auxiliary_loss_mlp": 0.01271397, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.01259607, + "epoch": 0.4992033669021494, + "flos": 15127252208640.0, + "grad_norm": 1.6659174741740037, + "language_loss": 0.69610626, + "learning_rate": 2.102189175590024e-06, + "loss": 0.77310562, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11785889, + "step": 8303, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06429437, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.0627458, + "balance_loss_mlp": 0.01253851, + "epoch": 0.49926349015481736, + "flos": 31215282860160.0, + "grad_norm": 1.7036998151712766, + "language_loss": 0.72999942, + "learning_rate": 2.101800220681144e-06, + "loss": 0.80695617, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.1239624, + "step": 8304, + "time_per_iteration": 2.611502170562744 + }, + { + "auxiliary_loss_clip": 0.0642409, + "auxiliary_loss_mlp": 0.0126995, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4993236134074853, + "flos": 24907201534080.0, + "grad_norm": 2.0593873642803486, + "language_loss": 0.81677687, + "learning_rate": 2.10141126191199e-06, + "loss": 0.89371729, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1227417, + "step": 8305, + "time_per_iteration": 2.57425594329834 + }, + { + "auxiliary_loss_clip": 0.0632831, + "auxiliary_loss_mlp": 0.01255041, + "balance_loss_clip": 0.06261367, + "balance_loss_mlp": 0.01252826, + "epoch": 0.4993837366601533, + "flos": 70438962896640.0, + "grad_norm": 0.7837813432026206, + "language_loss": 0.56909657, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.64493006, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02220154, + "step": 8306, + "time_per_iteration": 3.2806143760681152 + }, + { + "auxiliary_loss_clip": 0.06430675, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06278585, + "balance_loss_mlp": 0.01255422, + "epoch": 0.4994438599128213, + "flos": 15966718738560.0, + "grad_norm": 1.7475082532303507, + "language_loss": 0.83157074, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.90857446, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.1427002, + "step": 8307, + "time_per_iteration": 2.4851419925689697 + }, + { + "auxiliary_loss_clip": 0.06426803, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 0.06277731, + "balance_loss_mlp": 0.01258458, + "epoch": 0.4995039831654893, + "flos": 27935765907840.0, + "grad_norm": 1.9977557260500436, + "language_loss": 0.61003512, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.68701947, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.13189697, + "step": 8308, + "time_per_iteration": 2.5943245887756348 + }, + { + "auxiliary_loss_clip": 0.06426641, + "auxiliary_loss_mlp": 0.01271422, + "balance_loss_clip": 0.06278297, + "balance_loss_mlp": 0.01259948, + "epoch": 0.49956410641815724, + "flos": 24211310175360.0, + "grad_norm": 1.573691211270805, + "language_loss": 0.74911636, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.82609695, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11468506, + "step": 8309, + "time_per_iteration": 3.9743635654449463 + }, + { + "auxiliary_loss_clip": 0.06430435, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.0125578, + "epoch": 0.4996242296708252, + "flos": 16185666257280.0, + "grad_norm": 2.033466484631739, + "language_loss": 0.80080384, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.87779051, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12438965, + "step": 8310, + "time_per_iteration": 2.475815534591675 + }, + { + "auxiliary_loss_clip": 0.06429116, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01254541, + "epoch": 0.49968435292349317, + "flos": 16879209701760.0, + "grad_norm": 1.5486293297173337, + "language_loss": 0.71370041, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.79066527, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12823486, + "step": 8311, + "time_per_iteration": 4.01245641708374 + }, + { + "auxiliary_loss_clip": 0.06428856, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06277557, + "balance_loss_mlp": 0.01254636, + "epoch": 0.49974447617616113, + "flos": 14944837870080.0, + "grad_norm": 1.8003339909908787, + "language_loss": 0.77129757, + "learning_rate": 2.098688443679187e-06, + "loss": 0.8482464, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11401367, + "step": 8312, + "time_per_iteration": 2.4761128425598145 + }, + { + "auxiliary_loss_clip": 0.0643132, + "auxiliary_loss_mlp": 0.01266437, + "balance_loss_clip": 0.06279029, + "balance_loss_mlp": 0.01254206, + "epoch": 0.4998045994288291, + "flos": 26658823610880.0, + "grad_norm": 1.6524127143489034, + "language_loss": 0.84981465, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.9267922, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12231445, + "step": 8313, + "time_per_iteration": 2.6057398319244385 + }, + { + "auxiliary_loss_clip": 0.06431891, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.06279939, + "balance_loss_mlp": 0.01256224, + "epoch": 0.49986472268149706, + "flos": 20959102454400.0, + "grad_norm": 1.6979548607445847, + "language_loss": 0.81193811, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8889358, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11657715, + "step": 8314, + "time_per_iteration": 2.5246880054473877 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01269627, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01256341, + "epoch": 0.49992484593416503, + "flos": 22790499217920.0, + "grad_norm": 1.7217224756504992, + "language_loss": 0.79857439, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.8755725, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.13305664, + "step": 8315, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.0125595, + "epoch": 0.499984969186833, + "flos": 46796838307200.0, + "grad_norm": 1.6656557215916168, + "language_loss": 0.74803257, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.82498288, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11749268, + "step": 8316, + "time_per_iteration": 4.178734540939331 + }, + { + "auxiliary_loss_clip": 0.06424455, + "auxiliary_loss_mlp": 0.01269425, + "balance_loss_clip": 0.0627817, + "balance_loss_mlp": 0.01258083, + "epoch": 0.500045092439501, + "flos": 25564086017280.0, + "grad_norm": 1.744541126829246, + "language_loss": 0.81478661, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.89172542, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11346436, + "step": 8317, + "time_per_iteration": 2.537320613861084 + }, + { + "auxiliary_loss_clip": 0.06427011, + "auxiliary_loss_mlp": 0.01270425, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5001052156921689, + "flos": 20711126695680.0, + "grad_norm": 1.5732702518161361, + "language_loss": 0.83390272, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.91087711, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12835693, + "step": 8318, + "time_per_iteration": 2.534135103225708 + }, + { + "auxiliary_loss_clip": 0.06428336, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06277522, + "balance_loss_mlp": 0.01257109, + "epoch": 0.500165338944837, + "flos": 21257405389440.0, + "grad_norm": 1.6807233025456896, + "language_loss": 0.82012349, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.89709824, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12030029, + "step": 8319, + "time_per_iteration": 2.515835762023926 + }, + { + "auxiliary_loss_clip": 0.06428086, + "auxiliary_loss_mlp": 0.01265652, + "balance_loss_clip": 0.0627624, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5002254621975049, + "flos": 27861693298560.0, + "grad_norm": 1.6360150103182107, + "language_loss": 0.72118968, + "learning_rate": 2.095576427171635e-06, + "loss": 0.79812706, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.10955811, + "step": 8320, + "time_per_iteration": 2.5796635150909424 + }, + { + "auxiliary_loss_clip": 0.06441814, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5002855854501729, + "flos": 15556049078400.0, + "grad_norm": 2.4313263695255696, + "language_loss": 0.76678413, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.84387517, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13439941, + "step": 8321, + "time_per_iteration": 2.4691002368927 + }, + { + "auxiliary_loss_clip": 0.06428922, + "auxiliary_loss_mlp": 0.01268744, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5003457087028408, + "flos": 16112977240320.0, + "grad_norm": 1.7492839336280708, + "language_loss": 0.82910907, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.90608579, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13183594, + "step": 8322, + "time_per_iteration": 2.515460252761841 + }, + { + "auxiliary_loss_clip": 0.06431515, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06279334, + "balance_loss_mlp": 0.01256973, + "epoch": 0.5004058319555088, + "flos": 22717055514240.0, + "grad_norm": 3.787468052495824, + "language_loss": 0.74021679, + "learning_rate": 2.094409360775228e-06, + "loss": 0.81722933, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12774658, + "step": 8323, + "time_per_iteration": 3.9577157497406006 + }, + { + "auxiliary_loss_clip": 0.06425107, + "auxiliary_loss_mlp": 0.01267421, + "balance_loss_clip": 0.06273489, + "balance_loss_mlp": 0.01254761, + "epoch": 0.5004659552081767, + "flos": 30125870000640.0, + "grad_norm": 1.569659839153646, + "language_loss": 0.69694078, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.77386606, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.12670898, + "step": 8324, + "time_per_iteration": 2.5927038192749023 + }, + { + "auxiliary_loss_clip": 0.06426285, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.012554, + "epoch": 0.5005260784608447, + "flos": 18630664070400.0, + "grad_norm": 1.9637621432589805, + "language_loss": 0.72455752, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.80149603, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12164307, + "step": 8325, + "time_per_iteration": 2.5748932361602783 + }, + { + "auxiliary_loss_clip": 0.06431422, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06278826, + "balance_loss_mlp": 0.01253069, + "epoch": 0.5005862017135126, + "flos": 24866349868800.0, + "grad_norm": 1.7160687334315328, + "language_loss": 0.73386943, + "learning_rate": 2.093242262158709e-06, + "loss": 0.8108452, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13085938, + "step": 8326, + "time_per_iteration": 2.5720608234405518 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01267135, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01255763, + "epoch": 0.5006463249661807, + "flos": 18740389392000.0, + "grad_norm": 1.5629486934520718, + "language_loss": 0.78059208, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.85753143, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11364746, + "step": 8327, + "time_per_iteration": 2.5033681392669678 + }, + { + "auxiliary_loss_clip": 0.06429915, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06277432, + "balance_loss_mlp": 0.01254533, + "epoch": 0.5007064482188487, + "flos": 13047124999680.0, + "grad_norm": 2.5584329331081253, + "language_loss": 0.88066995, + "learning_rate": 2.092464178710997e-06, + "loss": 0.95763773, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12329102, + "step": 8328, + "time_per_iteration": 2.469723701477051 + }, + { + "auxiliary_loss_clip": 0.06430298, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01254302, + "epoch": 0.5007665714715166, + "flos": 21295154453760.0, + "grad_norm": 2.120857663767784, + "language_loss": 0.74578768, + "learning_rate": 2.092075131720388e-06, + "loss": 0.82276416, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1305542, + "step": 8329, + "time_per_iteration": 2.527421236038208 + }, + { + "auxiliary_loss_clip": 0.06427623, + "auxiliary_loss_mlp": 0.01269321, + "balance_loss_clip": 0.06278372, + "balance_loss_mlp": 0.01257626, + "epoch": 0.5008266947241846, + "flos": 29762676478080.0, + "grad_norm": 1.5806360237517383, + "language_loss": 0.80007339, + "learning_rate": 2.091686081238281e-06, + "loss": 0.87704277, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11688232, + "step": 8330, + "time_per_iteration": 2.589132785797119 + }, + { + "auxiliary_loss_clip": 0.063256, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06259131, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5008868179768525, + "flos": 63574498460160.0, + "grad_norm": 0.7051231310601146, + "language_loss": 0.56005836, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.63587606, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01960754, + "step": 8331, + "time_per_iteration": 2.9798707962036133 + }, + { + "auxiliary_loss_clip": 0.06425481, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256125, + "epoch": 0.5009469412295205, + "flos": 27382108055040.0, + "grad_norm": 1.8793466545943338, + "language_loss": 0.65444684, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.73137867, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11584473, + "step": 8332, + "time_per_iteration": 2.548846483230591 + }, + { + "auxiliary_loss_clip": 0.06424412, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01253578, + "epoch": 0.5010070644821885, + "flos": 27385839561600.0, + "grad_norm": 1.4154143625456153, + "language_loss": 0.75122535, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.82812029, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1149292, + "step": 8333, + "time_per_iteration": 2.600377082824707 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01268641, + "balance_loss_clip": 0.06276083, + "balance_loss_mlp": 0.01256481, + "epoch": 0.5010671877348565, + "flos": 20668178678400.0, + "grad_norm": 1.9411742898612023, + "language_loss": 0.80806357, + "learning_rate": 2.090129844689929e-06, + "loss": 0.88504034, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12158203, + "step": 8334, + "time_per_iteration": 2.490330457687378 + }, + { + "auxiliary_loss_clip": 0.0633373, + "auxiliary_loss_mlp": 0.01254486, + "balance_loss_clip": 0.06267349, + "balance_loss_mlp": 0.01252466, + "epoch": 0.5011273109875244, + "flos": 59148266855040.0, + "grad_norm": 0.880609822046852, + "language_loss": 0.62818438, + "learning_rate": 2.089740776971626e-06, + "loss": 0.70406651, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02020264, + "step": 8335, + "time_per_iteration": 3.1081318855285645 + }, + { + "auxiliary_loss_clip": 0.06426011, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06278515, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5011874342401924, + "flos": 25343126000640.0, + "grad_norm": 1.3778270209342711, + "language_loss": 0.80092967, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.8778491, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.105896, + "step": 8336, + "time_per_iteration": 2.5390379428863525 + }, + { + "auxiliary_loss_clip": 0.06428748, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06278357, + "balance_loss_mlp": 0.01254923, + "epoch": 0.5012475574928603, + "flos": 20236153426560.0, + "grad_norm": 1.7537768303990948, + "language_loss": 0.81054461, + "learning_rate": 2.088962631340836e-06, + "loss": 0.88749969, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11834717, + "step": 8337, + "time_per_iteration": 2.5480427742004395 + }, + { + "auxiliary_loss_clip": 0.06436703, + "auxiliary_loss_mlp": 0.01267216, + "balance_loss_clip": 0.06279006, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5013076807455283, + "flos": 22716594316800.0, + "grad_norm": 1.7916878418610642, + "language_loss": 0.79506505, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.87210429, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12255859, + "step": 8338, + "time_per_iteration": 2.5164718627929688 + }, + { + "auxiliary_loss_clip": 0.0643065, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01253176, + "epoch": 0.5013678039981962, + "flos": 24252329548800.0, + "grad_norm": 1.5889596080337545, + "language_loss": 0.85034919, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.9273085, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12127686, + "step": 8339, + "time_per_iteration": 2.5785508155822754 + }, + { + "auxiliary_loss_clip": 0.06426719, + "auxiliary_loss_mlp": 0.01269107, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.0125814, + "epoch": 0.5014279272508643, + "flos": 26183808414720.0, + "grad_norm": 1.5165096284579775, + "language_loss": 0.71162677, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.78858501, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10968018, + "step": 8340, + "time_per_iteration": 2.5929582118988037 + }, + { + "auxiliary_loss_clip": 0.06433477, + "auxiliary_loss_mlp": 0.01270076, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.01256867, + "epoch": 0.5014880505035323, + "flos": 21436255929600.0, + "grad_norm": 2.442832877053188, + "language_loss": 0.7829324, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.85996789, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.13208008, + "step": 8341, + "time_per_iteration": 2.5200908184051514 + }, + { + "auxiliary_loss_clip": 0.06435034, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.062792, + "balance_loss_mlp": 0.01255407, + "epoch": 0.5015481737562002, + "flos": 15774870816000.0, + "grad_norm": 2.1824930872588917, + "language_loss": 0.89806843, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.97509372, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12091064, + "step": 8342, + "time_per_iteration": 2.502265691757202 + }, + { + "auxiliary_loss_clip": 0.06427857, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06276843, + "balance_loss_mlp": 0.0125275, + "epoch": 0.5016082970088682, + "flos": 26837590296960.0, + "grad_norm": 1.7003073455140034, + "language_loss": 0.76872855, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.84565264, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11804199, + "step": 8343, + "time_per_iteration": 2.5502099990844727 + }, + { + "auxiliary_loss_clip": 0.06426306, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0627844, + "balance_loss_mlp": 0.01256724, + "epoch": 0.5016684202615361, + "flos": 21477023740800.0, + "grad_norm": 3.7325470711422466, + "language_loss": 0.67772466, + "learning_rate": 2.086239016143293e-06, + "loss": 0.75466394, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10900879, + "step": 8344, + "time_per_iteration": 2.5443081855773926 + }, + { + "auxiliary_loss_clip": 0.06429319, + "auxiliary_loss_mlp": 0.01271563, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01259803, + "epoch": 0.5017285435142042, + "flos": 26253478684800.0, + "grad_norm": 2.15637603402593, + "language_loss": 0.75492197, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.83193076, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11767578, + "step": 8345, + "time_per_iteration": 2.5757455825805664 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01267207, + "balance_loss_clip": 0.06275543, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5017886667668721, + "flos": 20783899566720.0, + "grad_norm": 2.131359070350305, + "language_loss": 0.78573453, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.86266983, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12805176, + "step": 8346, + "time_per_iteration": 2.5463459491729736 + }, + { + "auxiliary_loss_clip": 0.06428749, + "auxiliary_loss_mlp": 0.012678, + "balance_loss_clip": 0.0627691, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5018487900195401, + "flos": 20162500087680.0, + "grad_norm": 1.4665059060371557, + "language_loss": 0.69395542, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.77092093, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11608887, + "step": 8347, + "time_per_iteration": 2.5277669429779053 + }, + { + "auxiliary_loss_clip": 0.06433204, + "auxiliary_loss_mlp": 0.0126827, + "balance_loss_clip": 0.06278361, + "balance_loss_mlp": 0.01256236, + "epoch": 0.501908913272208, + "flos": 18156613196160.0, + "grad_norm": 2.582566868470837, + "language_loss": 0.7215631, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.79857785, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12030029, + "step": 8348, + "time_per_iteration": 3.996784210205078 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06277803, + "balance_loss_mlp": 0.01258166, + "epoch": 0.501969036524876, + "flos": 23118962423040.0, + "grad_norm": 1.4308074213434065, + "language_loss": 0.74796462, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.82490146, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11346436, + "step": 8349, + "time_per_iteration": 2.5489115715026855 + }, + { + "auxiliary_loss_clip": 0.06429881, + "auxiliary_loss_mlp": 0.01269935, + "balance_loss_clip": 0.06276442, + "balance_loss_mlp": 0.01257442, + "epoch": 0.5020291597775439, + "flos": 11367814596480.0, + "grad_norm": 1.898459652208493, + "language_loss": 0.63674343, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.71374166, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12493896, + "step": 8350, + "time_per_iteration": 2.487217426300049 + }, + { + "auxiliary_loss_clip": 0.06323833, + "auxiliary_loss_mlp": 0.01259522, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01257642, + "epoch": 0.5020892830302119, + "flos": 64030422124800.0, + "grad_norm": 0.7586308907420236, + "language_loss": 0.59914774, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6749813, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01876831, + "step": 8351, + "time_per_iteration": 4.69463324546814 + }, + { + "auxiliary_loss_clip": 0.06434566, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06280354, + "balance_loss_mlp": 0.01258029, + "epoch": 0.5021494062828799, + "flos": 23739691069440.0, + "grad_norm": 1.6219034526425078, + "language_loss": 0.75496215, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.83200288, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11474609, + "step": 8352, + "time_per_iteration": 2.5164549350738525 + }, + { + "auxiliary_loss_clip": 0.06428628, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06277371, + "balance_loss_mlp": 0.01254845, + "epoch": 0.5022095295355479, + "flos": 21582640212480.0, + "grad_norm": 1.8174761726271038, + "language_loss": 0.71818656, + "learning_rate": 2.082736990429464e-06, + "loss": 0.795147, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12573242, + "step": 8353, + "time_per_iteration": 2.51479172706604 + }, + { + "auxiliary_loss_clip": 0.06434356, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06281401, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5022696527882159, + "flos": 21403580037120.0, + "grad_norm": 2.9144841273148154, + "language_loss": 0.74235505, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.81938022, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12060547, + "step": 8354, + "time_per_iteration": 2.5085036754608154 + }, + { + "auxiliary_loss_clip": 0.06431521, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06281638, + "balance_loss_mlp": 0.01256216, + "epoch": 0.5023297760408838, + "flos": 27167814437760.0, + "grad_norm": 1.5801517406711547, + "language_loss": 0.7257005, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.80269539, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11755371, + "step": 8355, + "time_per_iteration": 2.559136152267456 + }, + { + "auxiliary_loss_clip": 0.06435544, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06278937, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5023898992935518, + "flos": 26221054354560.0, + "grad_norm": 1.801551244152151, + "language_loss": 0.8142066, + "learning_rate": 2.081569591520548e-06, + "loss": 0.89124179, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1282959, + "step": 8356, + "time_per_iteration": 3.978407144546509 + }, + { + "auxiliary_loss_clip": 0.06435513, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06275411, + "balance_loss_mlp": 0.01255272, + "epoch": 0.5024500225462197, + "flos": 13444839204480.0, + "grad_norm": 2.072167033386685, + "language_loss": 0.7662456, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.84328556, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 8357, + "time_per_iteration": 2.488581657409668 + }, + { + "auxiliary_loss_clip": 0.06431419, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01258758, + "epoch": 0.5025101457988878, + "flos": 21585952448640.0, + "grad_norm": 1.5828459742560037, + "language_loss": 0.76457655, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.84161162, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13342285, + "step": 8358, + "time_per_iteration": 2.62697434425354 + }, + { + "auxiliary_loss_clip": 0.06429468, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276305, + "balance_loss_mlp": 0.01253877, + "epoch": 0.5025702690515557, + "flos": 24652140105600.0, + "grad_norm": 2.247340947262335, + "language_loss": 0.72276986, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.79972816, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12481689, + "step": 8359, + "time_per_iteration": 2.577232599258423 + }, + { + "auxiliary_loss_clip": 0.0642844, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06277584, + "balance_loss_mlp": 0.01255263, + "epoch": 0.5026303923042237, + "flos": 22096578430080.0, + "grad_norm": 1.7221298639434877, + "language_loss": 0.77017021, + "learning_rate": 2.080013016407077e-06, + "loss": 0.84713173, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12451172, + "step": 8360, + "time_per_iteration": 2.5449211597442627 + }, + { + "auxiliary_loss_clip": 0.0642498, + "auxiliary_loss_mlp": 0.01267029, + "balance_loss_clip": 0.06274442, + "balance_loss_mlp": 0.0125571, + "epoch": 0.5026905155568916, + "flos": 23704164138240.0, + "grad_norm": 3.319216273479951, + "language_loss": 0.76811969, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.84503973, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11322021, + "step": 8361, + "time_per_iteration": 2.5360496044158936 + }, + { + "auxiliary_loss_clip": 0.06433755, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06276754, + "balance_loss_mlp": 0.01258641, + "epoch": 0.5027506388095596, + "flos": 25819566716160.0, + "grad_norm": 1.6478894806212292, + "language_loss": 0.85182559, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.92888033, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13067627, + "step": 8362, + "time_per_iteration": 4.023087739944458 + }, + { + "auxiliary_loss_clip": 0.06433062, + "auxiliary_loss_mlp": 0.01266272, + "balance_loss_clip": 0.06277543, + "balance_loss_mlp": 0.01253851, + "epoch": 0.5028107620622275, + "flos": 27533942853120.0, + "grad_norm": 1.6676304720736304, + "language_loss": 0.79210544, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.86909878, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12420654, + "step": 8363, + "time_per_iteration": 2.610635757446289 + }, + { + "auxiliary_loss_clip": 0.0642155, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01255903, + "epoch": 0.5028708853148955, + "flos": 24541031191680.0, + "grad_norm": 2.470464307064636, + "language_loss": 0.76251006, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12493896, + "step": 8364, + "time_per_iteration": 2.510077953338623 + }, + { + "auxiliary_loss_clip": 0.06429755, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627771, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5029310085675635, + "flos": 20819887695360.0, + "grad_norm": 1.5150578704653515, + "language_loss": 0.69785869, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.77482712, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11529541, + "step": 8365, + "time_per_iteration": 2.523810386657715 + }, + { + "auxiliary_loss_clip": 0.064358, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.01254365, + "epoch": 0.5029911318202315, + "flos": 22348411476480.0, + "grad_norm": 1.5746180090110224, + "language_loss": 0.73351806, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.81055391, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.13439941, + "step": 8366, + "time_per_iteration": 2.538522481918335 + }, + { + "auxiliary_loss_clip": 0.06433431, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06282506, + "balance_loss_mlp": 0.01254324, + "epoch": 0.5030512550728995, + "flos": 24359581175040.0, + "grad_norm": 1.43168858878555, + "language_loss": 0.78766662, + "learning_rate": 2.077288893713735e-06, + "loss": 0.86466694, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12268066, + "step": 8367, + "time_per_iteration": 2.58542799949646 + }, + { + "auxiliary_loss_clip": 0.064292, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01255835, + "epoch": 0.5031113783255674, + "flos": 18265835393280.0, + "grad_norm": 1.7642536194953051, + "language_loss": 0.70319581, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.78016406, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11804199, + "step": 8368, + "time_per_iteration": 2.4808216094970703 + }, + { + "auxiliary_loss_clip": 0.06318872, + "auxiliary_loss_mlp": 0.01256661, + "balance_loss_clip": 0.06252527, + "balance_loss_mlp": 0.01254704, + "epoch": 0.5031715015782354, + "flos": 57270022859520.0, + "grad_norm": 0.9058846668072361, + "language_loss": 0.63429594, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.7100513, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01954651, + "step": 8369, + "time_per_iteration": 3.0813984870910645 + }, + { + "auxiliary_loss_clip": 0.06425582, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06275157, + "balance_loss_mlp": 0.01256873, + "epoch": 0.5032316248309033, + "flos": 27534823320960.0, + "grad_norm": 1.9780482072247232, + "language_loss": 0.60450232, + "learning_rate": 2.076121368302263e-06, + "loss": 0.68144017, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11328125, + "step": 8370, + "time_per_iteration": 2.6361827850341797 + }, + { + "auxiliary_loss_clip": 0.06429368, + "auxiliary_loss_mlp": 0.01269199, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01255901, + "epoch": 0.5032917480835714, + "flos": 34504401104640.0, + "grad_norm": 1.6209694165930644, + "language_loss": 0.68475735, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.76174301, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13293457, + "step": 8371, + "time_per_iteration": 2.6757090091705322 + }, + { + "auxiliary_loss_clip": 0.06428707, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01256158, + "epoch": 0.5033518713362393, + "flos": 33665228064000.0, + "grad_norm": 1.992355635042309, + "language_loss": 0.67781597, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.75479841, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13397217, + "step": 8372, + "time_per_iteration": 2.625875234603882 + }, + { + "auxiliary_loss_clip": 0.06429783, + "auxiliary_loss_mlp": 0.0126941, + "balance_loss_clip": 0.06275001, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5034119945889073, + "flos": 28193301031680.0, + "grad_norm": 1.502668832263038, + "language_loss": 0.67200899, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.74900091, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13604736, + "step": 8373, + "time_per_iteration": 2.605649709701538 + }, + { + "auxiliary_loss_clip": 0.06426984, + "auxiliary_loss_mlp": 0.01270724, + "balance_loss_clip": 0.06274835, + "balance_loss_mlp": 0.01258362, + "epoch": 0.5034721178415752, + "flos": 21364698942720.0, + "grad_norm": 1.6635937081301206, + "language_loss": 0.75186062, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.82883763, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12365723, + "step": 8374, + "time_per_iteration": 2.503739595413208 + }, + { + "auxiliary_loss_clip": 0.06431206, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01255945, + "epoch": 0.5035322410942432, + "flos": 22681486656000.0, + "grad_norm": 1.5469346618590563, + "language_loss": 0.68547672, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.76247704, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12878418, + "step": 8375, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.06436669, + "auxiliary_loss_mlp": 0.0127122, + "balance_loss_clip": 0.06277038, + "balance_loss_mlp": 0.01257285, + "epoch": 0.5035923643469111, + "flos": 19834875423360.0, + "grad_norm": 1.6007016499880733, + "language_loss": 0.78976023, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.86683917, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1394043, + "step": 8376, + "time_per_iteration": 2.480931520462036 + }, + { + "auxiliary_loss_clip": 0.06429401, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06272124, + "balance_loss_mlp": 0.01254722, + "epoch": 0.5036524875995791, + "flos": 30521823269760.0, + "grad_norm": 2.1513689232389686, + "language_loss": 0.59716964, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6741339, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.12304688, + "step": 8377, + "time_per_iteration": 2.5793137550354004 + }, + { + "auxiliary_loss_clip": 0.06430321, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5037126108522471, + "flos": 14725848424320.0, + "grad_norm": 1.9178870854351904, + "language_loss": 0.76377517, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.84075749, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.13305664, + "step": 8378, + "time_per_iteration": 2.4622483253479004 + }, + { + "auxiliary_loss_clip": 0.06432158, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06278415, + "balance_loss_mlp": 0.01254815, + "epoch": 0.5037727341049151, + "flos": 25304119125120.0, + "grad_norm": 1.5376418940503571, + "language_loss": 0.746418, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.82341218, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12457275, + "step": 8379, + "time_per_iteration": 2.55764102935791 + }, + { + "auxiliary_loss_clip": 0.06427328, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01260138, + "epoch": 0.5038328573575831, + "flos": 28548193000320.0, + "grad_norm": 1.8355606211356674, + "language_loss": 0.66636741, + "learning_rate": 2.072229431544548e-06, + "loss": 0.74337339, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13146973, + "step": 8380, + "time_per_iteration": 2.566993474960327 + }, + { + "auxiliary_loss_clip": 0.06426656, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01254259, + "epoch": 0.503892980610251, + "flos": 31657957580160.0, + "grad_norm": 1.8901892775526132, + "language_loss": 0.63646573, + "learning_rate": 2.071840222561051e-06, + "loss": 0.71339715, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12213135, + "step": 8381, + "time_per_iteration": 2.5915544033050537 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01268764, + "balance_loss_clip": 0.06275158, + "balance_loss_mlp": 0.01257087, + "epoch": 0.503953103862919, + "flos": 27096718648320.0, + "grad_norm": 1.5372847630358786, + "language_loss": 0.67925096, + "learning_rate": 2.071451010853365e-06, + "loss": 0.756212, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11676025, + "step": 8382, + "time_per_iteration": 2.553654432296753 + }, + { + "auxiliary_loss_clip": 0.06443429, + "auxiliary_loss_mlp": 0.01271028, + "balance_loss_clip": 0.06281322, + "balance_loss_mlp": 0.0125745, + "epoch": 0.5040132271155869, + "flos": 15638423241600.0, + "grad_norm": 1.8104420976136362, + "language_loss": 0.62072217, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.69786668, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13598633, + "step": 8383, + "time_per_iteration": 2.525148630142212 + }, + { + "auxiliary_loss_clip": 0.06426074, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01255609, + "epoch": 0.504073350368255, + "flos": 13595290410240.0, + "grad_norm": 1.7264517386370961, + "language_loss": 0.6736567, + "learning_rate": 2.070672579324465e-06, + "loss": 0.75059223, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11853027, + "step": 8384, + "time_per_iteration": 2.4712305068969727 + }, + { + "auxiliary_loss_clip": 0.064311, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06277114, + "balance_loss_mlp": 0.01255059, + "epoch": 0.5041334736209229, + "flos": 29065611162240.0, + "grad_norm": 1.6378210813415193, + "language_loss": 0.71431983, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.79130751, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12609863, + "step": 8385, + "time_per_iteration": 2.573953151702881 + }, + { + "auxiliary_loss_clip": 0.06426452, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.0627909, + "balance_loss_mlp": 0.01252916, + "epoch": 0.5041935968735909, + "flos": 24615313436160.0, + "grad_norm": 1.6953325653845304, + "language_loss": 0.83098906, + "learning_rate": 2.069894137075919e-06, + "loss": 0.90790039, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 8386, + "time_per_iteration": 2.5524075031280518 + }, + { + "auxiliary_loss_clip": 0.06431791, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256146, + "epoch": 0.5042537201262588, + "flos": 26294204568960.0, + "grad_norm": 1.4563010196783333, + "language_loss": 0.669891, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.74689829, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12780762, + "step": 8387, + "time_per_iteration": 3.9810335636138916 + }, + { + "auxiliary_loss_clip": 0.064284, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254608, + "epoch": 0.5043138433789268, + "flos": 22023805559040.0, + "grad_norm": 3.745410743833339, + "language_loss": 0.80531698, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.882267, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11999512, + "step": 8388, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.06426677, + "auxiliary_loss_mlp": 0.01268377, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256075, + "epoch": 0.5043739666315947, + "flos": 28774645459200.0, + "grad_norm": 1.9801629056940246, + "language_loss": 0.70134413, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.77829468, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12298584, + "step": 8389, + "time_per_iteration": 2.5604100227355957 + }, + { + "auxiliary_loss_clip": 0.06432408, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01256328, + "epoch": 0.5044340898842627, + "flos": 27606548016000.0, + "grad_norm": 1.4709504779743863, + "language_loss": 0.69360697, + "learning_rate": 2.068337220892191e-06, + "loss": 0.77062166, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12750244, + "step": 8390, + "time_per_iteration": 4.074434041976929 + }, + { + "auxiliary_loss_clip": 0.06327184, + "auxiliary_loss_mlp": 0.01253766, + "balance_loss_clip": 0.06261003, + "balance_loss_mlp": 0.01251581, + "epoch": 0.5044942131369307, + "flos": 67474744058880.0, + "grad_norm": 0.7911094819234682, + "language_loss": 0.52874231, + "learning_rate": 2.067947985330974e-06, + "loss": 0.60455179, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.0218811, + "step": 8391, + "time_per_iteration": 2.939533233642578 + }, + { + "auxiliary_loss_clip": 0.06334387, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06267701, + "balance_loss_mlp": 0.01251732, + "epoch": 0.5045543363895987, + "flos": 58646460280320.0, + "grad_norm": 0.8187125498801333, + "language_loss": 0.60630977, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.68219203, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02114868, + "step": 8392, + "time_per_iteration": 2.9839742183685303 + }, + { + "auxiliary_loss_clip": 0.06425072, + "auxiliary_loss_mlp": 0.01265494, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01252631, + "epoch": 0.5046144596422667, + "flos": 22532880240000.0, + "grad_norm": 1.6790063296091327, + "language_loss": 0.85000169, + "learning_rate": 2.067169506493517e-06, + "loss": 0.9269073, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12866211, + "step": 8393, + "time_per_iteration": 2.5764622688293457 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01258869, + "epoch": 0.5046745828949346, + "flos": 27461673106560.0, + "grad_norm": 1.8013259480756436, + "language_loss": 0.5139519, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.590967, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11834717, + "step": 8394, + "time_per_iteration": 2.5577075481414795 + }, + { + "auxiliary_loss_clip": 0.06430504, + "auxiliary_loss_mlp": 0.012693, + "balance_loss_clip": 0.06275499, + "balance_loss_mlp": 0.01256664, + "epoch": 0.5047347061476026, + "flos": 17280236142720.0, + "grad_norm": 1.62433976950566, + "language_loss": 0.75468862, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.83168674, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12628174, + "step": 8395, + "time_per_iteration": 4.00100040435791 + }, + { + "auxiliary_loss_clip": 0.06430663, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06276973, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5047948294002705, + "flos": 16654308543360.0, + "grad_norm": 3.1739634410128446, + "language_loss": 0.68759549, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.76455134, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.1206665, + "step": 8396, + "time_per_iteration": 2.5608737468719482 + }, + { + "auxiliary_loss_clip": 0.0643612, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282924, + "balance_loss_mlp": 0.01253235, + "epoch": 0.5048549526529386, + "flos": 26872236760320.0, + "grad_norm": 1.7251064316936986, + "language_loss": 0.7921707, + "learning_rate": 2.065612518371792e-06, + "loss": 0.869187, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12268066, + "step": 8397, + "time_per_iteration": 2.5829713344573975 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01271123, + "balance_loss_clip": 0.06278492, + "balance_loss_mlp": 0.01258571, + "epoch": 0.5049150759056065, + "flos": 21840175336320.0, + "grad_norm": 1.4916236371554883, + "language_loss": 0.66563869, + "learning_rate": 2.065223265084376e-06, + "loss": 0.7426517, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12554932, + "step": 8398, + "time_per_iteration": 2.5790011882781982 + }, + { + "auxiliary_loss_clip": 0.06432331, + "auxiliary_loss_mlp": 0.01272223, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01259688, + "epoch": 0.5049751991582745, + "flos": 21691652774400.0, + "grad_norm": 1.5799272085735376, + "language_loss": 0.72252852, + "learning_rate": 2.064834009323688e-06, + "loss": 0.79957408, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12524414, + "step": 8399, + "time_per_iteration": 2.5528035163879395 + }, + { + "auxiliary_loss_clip": 0.06433836, + "auxiliary_loss_mlp": 0.01270059, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5050353224109424, + "flos": 21365495556480.0, + "grad_norm": 1.7587629772693838, + "language_loss": 0.81515628, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.89219522, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12792969, + "step": 8400, + "time_per_iteration": 2.550828456878662 + }, + { + "auxiliary_loss_clip": 0.06428652, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01254852, + "epoch": 0.5050954456636104, + "flos": 22826655054720.0, + "grad_norm": 2.5272013560823403, + "language_loss": 0.79016161, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.86711431, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11779785, + "step": 8401, + "time_per_iteration": 2.525132894515991 + }, + { + "auxiliary_loss_clip": 0.06433861, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01252778, + "epoch": 0.5051555689162783, + "flos": 30456513411840.0, + "grad_norm": 1.509144939938127, + "language_loss": 0.70489848, + "learning_rate": 2.063666227349593e-06, + "loss": 0.7818898, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.125, + "step": 8402, + "time_per_iteration": 4.0306360721588135 + }, + { + "auxiliary_loss_clip": 0.06429238, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06274545, + "balance_loss_mlp": 0.01254915, + "epoch": 0.5052156921689464, + "flos": 21294315912960.0, + "grad_norm": 1.5960111955062717, + "language_loss": 0.6935674, + "learning_rate": 2.063276961843422e-06, + "loss": 0.77053005, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12121582, + "step": 8403, + "time_per_iteration": 2.558231830596924 + }, + { + "auxiliary_loss_clip": 0.06433211, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01255799, + "epoch": 0.5052758154216143, + "flos": 25088106499200.0, + "grad_norm": 1.463323664554185, + "language_loss": 0.86018717, + "learning_rate": 2.062887693937781e-06, + "loss": 0.93719262, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11547852, + "step": 8404, + "time_per_iteration": 2.618649959564209 + }, + { + "auxiliary_loss_clip": 0.06428184, + "auxiliary_loss_mlp": 0.01270079, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01258092, + "epoch": 0.5053359386742823, + "flos": 20891612390400.0, + "grad_norm": 1.5475179634828664, + "language_loss": 0.75802314, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.83500576, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11987305, + "step": 8405, + "time_per_iteration": 2.5067524909973145 + }, + { + "auxiliary_loss_clip": 0.0643079, + "auxiliary_loss_mlp": 0.01267126, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01253882, + "epoch": 0.5053960619269503, + "flos": 37752499975680.0, + "grad_norm": 1.6248618607930092, + "language_loss": 0.73678941, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.81376863, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13250732, + "step": 8406, + "time_per_iteration": 2.8841259479522705 + }, + { + "auxiliary_loss_clip": 0.06424634, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01254662, + "epoch": 0.5054561851796182, + "flos": 23520617769600.0, + "grad_norm": 1.7553784713680058, + "language_loss": 0.77329504, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.85021389, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12579346, + "step": 8407, + "time_per_iteration": 2.5749242305755615 + }, + { + "auxiliary_loss_clip": 0.06430455, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.01255434, + "epoch": 0.5055163084322862, + "flos": 30418261223040.0, + "grad_norm": 1.7587183909270583, + "language_loss": 0.63584411, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.71282065, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.11767578, + "step": 8408, + "time_per_iteration": 2.5872433185577393 + }, + { + "auxiliary_loss_clip": 0.06432275, + "auxiliary_loss_mlp": 0.01267048, + "balance_loss_clip": 0.06279387, + "balance_loss_mlp": 0.01253387, + "epoch": 0.5055764316849541, + "flos": 20264720469120.0, + "grad_norm": 2.4280351300793086, + "language_loss": 0.63813823, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.71513146, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.13659668, + "step": 8409, + "time_per_iteration": 2.5165858268737793 + }, + { + "auxiliary_loss_clip": 0.064235, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273322, + "balance_loss_mlp": 0.01254895, + "epoch": 0.5056365549376222, + "flos": 26078611213440.0, + "grad_norm": 1.3852804971458688, + "language_loss": 0.71039546, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.78729057, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11132812, + "step": 8410, + "time_per_iteration": 2.594809055328369 + }, + { + "auxiliary_loss_clip": 0.0643055, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.0627602, + "balance_loss_mlp": 0.01254437, + "epoch": 0.5056966781902901, + "flos": 19284739441920.0, + "grad_norm": 1.6144456520966346, + "language_loss": 0.79591584, + "learning_rate": 2.060162752653113e-06, + "loss": 0.87289482, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12921143, + "step": 8411, + "time_per_iteration": 2.53426194190979 + }, + { + "auxiliary_loss_clip": 0.06433219, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06276312, + "balance_loss_mlp": 0.01254979, + "epoch": 0.5057568014429581, + "flos": 21329507427840.0, + "grad_norm": 1.7389096144894618, + "language_loss": 0.81907368, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.89609325, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13757324, + "step": 8412, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06429601, + "auxiliary_loss_mlp": 0.01270568, + "balance_loss_clip": 0.0627761, + "balance_loss_mlp": 0.01258134, + "epoch": 0.505816924695626, + "flos": 17499351369600.0, + "grad_norm": 1.7713461187517285, + "language_loss": 0.80336094, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.88036257, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12438965, + "step": 8413, + "time_per_iteration": 2.524210214614868 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.01274079, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01260328, + "epoch": 0.505877047948294, + "flos": 21148434754560.0, + "grad_norm": 1.7829708596435327, + "language_loss": 0.80812234, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.885144, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1373291, + "step": 8414, + "time_per_iteration": 2.5200514793395996 + }, + { + "auxiliary_loss_clip": 0.06426316, + "auxiliary_loss_mlp": 0.01270081, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01257468, + "epoch": 0.5059371712009619, + "flos": 36357824292480.0, + "grad_norm": 2.3266509400680935, + "language_loss": 0.62741381, + "learning_rate": 2.058605592832528e-06, + "loss": 0.70437777, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12615967, + "step": 8415, + "time_per_iteration": 2.676204204559326 + }, + { + "auxiliary_loss_clip": 0.06428116, + "auxiliary_loss_mlp": 0.01272149, + "balance_loss_clip": 0.06274984, + "balance_loss_mlp": 0.01259882, + "epoch": 0.50599729445363, + "flos": 22679809574400.0, + "grad_norm": 1.4983327127759412, + "language_loss": 0.82398355, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.90098619, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12261963, + "step": 8416, + "time_per_iteration": 2.540487289428711 + }, + { + "auxiliary_loss_clip": 0.06427394, + "auxiliary_loss_mlp": 0.01269018, + "balance_loss_clip": 0.06278178, + "balance_loss_mlp": 0.01257705, + "epoch": 0.5060574177062979, + "flos": 22754553016320.0, + "grad_norm": 1.8321417063208305, + "language_loss": 0.79700905, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.87397313, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11328125, + "step": 8417, + "time_per_iteration": 2.5462777614593506 + }, + { + "auxiliary_loss_clip": 0.06425334, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06277245, + "balance_loss_mlp": 0.01256875, + "epoch": 0.5061175409589659, + "flos": 21659689641600.0, + "grad_norm": 1.7824010317095476, + "language_loss": 0.63313794, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.71007824, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11816406, + "step": 8418, + "time_per_iteration": 2.5203146934509277 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01270126, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01257877, + "epoch": 0.5061776642116339, + "flos": 21622653336960.0, + "grad_norm": 1.6210660838966935, + "language_loss": 0.77937323, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.85639775, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12249756, + "step": 8419, + "time_per_iteration": 2.549057722091675 + }, + { + "auxiliary_loss_clip": 0.06433055, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01259955, + "epoch": 0.5062377874643018, + "flos": 24433276440960.0, + "grad_norm": 1.7091767496398438, + "language_loss": 0.77142859, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.8484863, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12762451, + "step": 8420, + "time_per_iteration": 2.533263921737671 + }, + { + "auxiliary_loss_clip": 0.06430572, + "auxiliary_loss_mlp": 0.0127647, + "balance_loss_clip": 0.06276705, + "balance_loss_mlp": 0.01264311, + "epoch": 0.5062979107169698, + "flos": 22530322690560.0, + "grad_norm": 1.6514243222666503, + "language_loss": 0.77777469, + "learning_rate": 2.056269786726999e-06, + "loss": 0.85484511, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.121521, + "step": 8421, + "time_per_iteration": 2.535022497177124 + }, + { + "auxiliary_loss_clip": 0.06429385, + "auxiliary_loss_mlp": 0.01273249, + "balance_loss_clip": 0.06276778, + "balance_loss_mlp": 0.01261895, + "epoch": 0.5063580339696377, + "flos": 24578947964160.0, + "grad_norm": 1.4350674480860695, + "language_loss": 0.67189109, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.74891746, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11352539, + "step": 8422, + "time_per_iteration": 2.555051803588867 + }, + { + "auxiliary_loss_clip": 0.064266, + "auxiliary_loss_mlp": 0.01271001, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01259241, + "epoch": 0.5064181572223058, + "flos": 22601837750400.0, + "grad_norm": 1.5827559778751017, + "language_loss": 0.81783563, + "learning_rate": 2.05549116746431e-06, + "loss": 0.89481163, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11755371, + "step": 8423, + "time_per_iteration": 2.606844663619995 + }, + { + "auxiliary_loss_clip": 0.06427386, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.01256411, + "epoch": 0.5064782804749737, + "flos": 26002148762880.0, + "grad_norm": 2.1055931359181086, + "language_loss": 0.74535251, + "learning_rate": 2.055101854669237e-06, + "loss": 0.82231486, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12451172, + "step": 8424, + "time_per_iteration": 2.5353689193725586 + }, + { + "auxiliary_loss_clip": 0.06427233, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.0125268, + "epoch": 0.5065384037276417, + "flos": 28561358090880.0, + "grad_norm": 1.333495130602937, + "language_loss": 0.71332014, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.79024142, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12231445, + "step": 8425, + "time_per_iteration": 2.624431610107422 + }, + { + "auxiliary_loss_clip": 0.06429943, + "auxiliary_loss_mlp": 0.01268875, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01257187, + "epoch": 0.5065985269803096, + "flos": 22972620067200.0, + "grad_norm": 1.8777832339890803, + "language_loss": 0.78901541, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.86600357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11700439, + "step": 8426, + "time_per_iteration": 3.936661958694458 + }, + { + "auxiliary_loss_clip": 0.06432042, + "auxiliary_loss_mlp": 0.0127276, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01260768, + "epoch": 0.5066586502329776, + "flos": 21613680950400.0, + "grad_norm": 2.2511428758914325, + "language_loss": 0.7803759, + "learning_rate": 2.053933903806265e-06, + "loss": 0.85742396, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12005615, + "step": 8427, + "time_per_iteration": 2.5481557846069336 + }, + { + "auxiliary_loss_clip": 0.06424822, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01255268, + "epoch": 0.5067187734856455, + "flos": 20346214164480.0, + "grad_norm": 1.5242931798978783, + "language_loss": 0.719284, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.79620224, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11737061, + "step": 8428, + "time_per_iteration": 2.5370116233825684 + }, + { + "auxiliary_loss_clip": 0.06427782, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5067788967383136, + "flos": 28848801922560.0, + "grad_norm": 1.7598513800416933, + "language_loss": 0.83218622, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.90915114, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.10717773, + "step": 8429, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06435312, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.06276707, + "balance_loss_mlp": 0.01254013, + "epoch": 0.5068390199909815, + "flos": 32457997964160.0, + "grad_norm": 4.868596583088969, + "language_loss": 0.7373606, + "learning_rate": 2.052765934536682e-06, + "loss": 0.8143819, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12805176, + "step": 8430, + "time_per_iteration": 4.062525749206543 + }, + { + "auxiliary_loss_clip": 0.06428299, + "auxiliary_loss_mlp": 0.01270046, + "balance_loss_clip": 0.06275186, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5068991432436495, + "flos": 23152896126720.0, + "grad_norm": 1.801463516744859, + "language_loss": 0.76942408, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.84640753, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1171875, + "step": 8431, + "time_per_iteration": 2.535198211669922 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.0125488, + "epoch": 0.5069592664963174, + "flos": 19941917414400.0, + "grad_norm": 1.5385752235820749, + "language_loss": 0.72917402, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.80610371, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11901855, + "step": 8432, + "time_per_iteration": 2.5343048572540283 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06253257, + "balance_loss_mlp": 0.01250496, + "epoch": 0.5070193897489854, + "flos": 65812539888000.0, + "grad_norm": 0.7543358557352665, + "language_loss": 0.63621199, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.71192724, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.01824951, + "step": 8433, + "time_per_iteration": 3.1825270652770996 + }, + { + "auxiliary_loss_clip": 0.06432432, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06279546, + "balance_loss_mlp": 0.01254414, + "epoch": 0.5070795130016534, + "flos": 17281158537600.0, + "grad_norm": 2.2002665512489505, + "language_loss": 0.77719331, + "learning_rate": 2.051208614233681e-06, + "loss": 0.85418689, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12512207, + "step": 8434, + "time_per_iteration": 2.51298451423645 + }, + { + "auxiliary_loss_clip": 0.06435563, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01253047, + "epoch": 0.5071396362543213, + "flos": 21076416570240.0, + "grad_norm": 1.9257186196996396, + "language_loss": 0.7107513, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.78775942, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12207031, + "step": 8435, + "time_per_iteration": 3.9952967166900635 + }, + { + "auxiliary_loss_clip": 0.06431434, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06278223, + "balance_loss_mlp": 0.01256646, + "epoch": 0.5071997595069894, + "flos": 23150841701760.0, + "grad_norm": 1.974114732671287, + "language_loss": 0.72623628, + "learning_rate": 2.050429942372112e-06, + "loss": 0.80324566, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.128479, + "step": 8436, + "time_per_iteration": 2.5126936435699463 + }, + { + "auxiliary_loss_clip": 0.06431168, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06278354, + "balance_loss_mlp": 0.01253449, + "epoch": 0.5072598827596573, + "flos": 22753756402560.0, + "grad_norm": 2.390958224451536, + "language_loss": 0.84374195, + "learning_rate": 2.050040603565483e-06, + "loss": 0.92071497, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12701416, + "step": 8437, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.06423598, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06273607, + "balance_loss_mlp": 0.01254128, + "epoch": 0.5073200060123253, + "flos": 22573102999680.0, + "grad_norm": 1.4207198809320167, + "language_loss": 0.80947453, + "learning_rate": 2.049651262861309e-06, + "loss": 0.88636929, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11749268, + "step": 8438, + "time_per_iteration": 2.5992414951324463 + }, + { + "auxiliary_loss_clip": 0.06431951, + "auxiliary_loss_mlp": 0.01267455, + "balance_loss_clip": 0.06277303, + "balance_loss_mlp": 0.0125458, + "epoch": 0.5073801292649932, + "flos": 25812481046400.0, + "grad_norm": 1.639362892711676, + "language_loss": 0.7992267, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.87622082, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12872314, + "step": 8439, + "time_per_iteration": 2.5635995864868164 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06272503, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5074402525176612, + "flos": 25380916992000.0, + "grad_norm": 1.6123120964481592, + "language_loss": 0.71044374, + "learning_rate": 2.048872575819383e-06, + "loss": 0.78732479, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11560059, + "step": 8440, + "time_per_iteration": 2.54082989692688 + }, + { + "auxiliary_loss_clip": 0.0642738, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01254278, + "epoch": 0.5075003757703291, + "flos": 26071064346240.0, + "grad_norm": 1.625029424987906, + "language_loss": 0.71058178, + "learning_rate": 2.048483229511158e-06, + "loss": 0.78751576, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11743164, + "step": 8441, + "time_per_iteration": 2.5597851276397705 + }, + { + "auxiliary_loss_clip": 0.06432067, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01255608, + "epoch": 0.5075604990229972, + "flos": 21841936272000.0, + "grad_norm": 1.6251927502787415, + "language_loss": 0.64299369, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.71999681, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12634277, + "step": 8442, + "time_per_iteration": 3.9658992290496826 + }, + { + "auxiliary_loss_clip": 0.06421914, + "auxiliary_loss_mlp": 0.01270692, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01259475, + "epoch": 0.5076206222756651, + "flos": 31986923909760.0, + "grad_norm": 1.4468343781265969, + "language_loss": 0.71796834, + "learning_rate": 2.047704531394006e-06, + "loss": 0.7948944, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 8443, + "time_per_iteration": 2.6133296489715576 + }, + { + "auxiliary_loss_clip": 0.06430129, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5076807455283331, + "flos": 36913033445760.0, + "grad_norm": 1.2663152678698668, + "language_loss": 0.62379253, + "learning_rate": 2.047315179614607e-06, + "loss": 0.70077264, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12390137, + "step": 8444, + "time_per_iteration": 2.670844554901123 + }, + { + "auxiliary_loss_clip": 0.06426448, + "auxiliary_loss_mlp": 0.01266149, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01255158, + "epoch": 0.507740868781001, + "flos": 29870263520640.0, + "grad_norm": 1.5635527032998127, + "language_loss": 0.64163882, + "learning_rate": 2.046925826041012e-06, + "loss": 0.71856481, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.10992432, + "step": 8445, + "time_per_iteration": 2.564972162246704 + }, + { + "auxiliary_loss_clip": 0.06326441, + "auxiliary_loss_mlp": 0.01258393, + "balance_loss_clip": 0.06260093, + "balance_loss_mlp": 0.0125657, + "epoch": 0.507800992033669, + "flos": 61935872014080.0, + "grad_norm": 0.8045039829713045, + "language_loss": 0.61588788, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.69173622, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01817322, + "step": 8446, + "time_per_iteration": 3.1747779846191406 + }, + { + "auxiliary_loss_clip": 0.06424413, + "auxiliary_loss_mlp": 0.01266643, + "balance_loss_clip": 0.06272733, + "balance_loss_mlp": 0.01254394, + "epoch": 0.507861115286337, + "flos": 20706137377920.0, + "grad_norm": 4.618603604158377, + "language_loss": 0.80737472, + "learning_rate": 2.04614711357029e-06, + "loss": 0.88428527, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12243652, + "step": 8447, + "time_per_iteration": 2.510443687438965 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01255775, + "epoch": 0.507921238539005, + "flos": 30854982303360.0, + "grad_norm": 1.2702922663182385, + "language_loss": 0.70493698, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.78183186, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11456299, + "step": 8448, + "time_per_iteration": 2.6021034717559814 + }, + { + "auxiliary_loss_clip": 0.06427675, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.0627776, + "balance_loss_mlp": 0.0125745, + "epoch": 0.507981361791673, + "flos": 35709031728000.0, + "grad_norm": 1.3111664343686333, + "language_loss": 0.72171003, + "learning_rate": 2.045368394099955e-06, + "loss": 0.79867339, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11199951, + "step": 8449, + "time_per_iteration": 2.6752874851226807 + }, + { + "auxiliary_loss_clip": 0.06426987, + "auxiliary_loss_mlp": 0.01268113, + "balance_loss_clip": 0.06274859, + "balance_loss_mlp": 0.0125686, + "epoch": 0.5080414850443409, + "flos": 27168694905600.0, + "grad_norm": 1.3940572087719376, + "language_loss": 0.73039591, + "learning_rate": 2.044979031776844e-06, + "loss": 0.80734688, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11254883, + "step": 8450, + "time_per_iteration": 2.6428375244140625 + }, + { + "auxiliary_loss_clip": 0.06430449, + "auxiliary_loss_mlp": 0.0127298, + "balance_loss_clip": 0.06278583, + "balance_loss_mlp": 0.01261148, + "epoch": 0.5081016082970089, + "flos": 27091855111680.0, + "grad_norm": 1.6054602673211236, + "language_loss": 0.7744205, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.85145479, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1184082, + "step": 8451, + "time_per_iteration": 2.6066558361053467 + }, + { + "auxiliary_loss_clip": 0.06429529, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276423, + "balance_loss_mlp": 0.01254531, + "epoch": 0.5081617315496768, + "flos": 22863104380800.0, + "grad_norm": 1.825930217148951, + "language_loss": 0.85374677, + "learning_rate": 2.044200302028559e-06, + "loss": 0.930709, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12158203, + "step": 8452, + "time_per_iteration": 2.5062003135681152 + }, + { + "auxiliary_loss_clip": 0.06431726, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06276073, + "balance_loss_mlp": 0.01254716, + "epoch": 0.5082218548023448, + "flos": 16286167630080.0, + "grad_norm": 2.3752555926719343, + "language_loss": 0.77806371, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.85505283, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12463379, + "step": 8453, + "time_per_iteration": 2.4981954097747803 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.0125774, + "epoch": 0.5082819780550127, + "flos": 24467419779840.0, + "grad_norm": 1.5957908763151711, + "language_loss": 0.76932752, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.84632009, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1072998, + "step": 8454, + "time_per_iteration": 2.6134133338928223 + }, + { + "auxiliary_loss_clip": 0.06431732, + "auxiliary_loss_mlp": 0.01271277, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01259118, + "epoch": 0.5083421013076808, + "flos": 23409844272000.0, + "grad_norm": 1.4822981638740835, + "language_loss": 0.89621413, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.97324431, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.1217041, + "step": 8455, + "time_per_iteration": 2.6085920333862305 + }, + { + "auxiliary_loss_clip": 0.06434034, + "auxiliary_loss_mlp": 0.01274373, + "balance_loss_clip": 0.06275303, + "balance_loss_mlp": 0.01260831, + "epoch": 0.5084022245603487, + "flos": 23878528485120.0, + "grad_norm": 1.6442671341978696, + "language_loss": 0.62785953, + "learning_rate": 2.042642822537149e-06, + "loss": 0.7049436, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13555908, + "step": 8456, + "time_per_iteration": 2.5377745628356934 + }, + { + "auxiliary_loss_clip": 0.06329988, + "auxiliary_loss_mlp": 0.01255905, + "balance_loss_clip": 0.06263152, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5084623478130167, + "flos": 62891352921600.0, + "grad_norm": 0.8103581861082657, + "language_loss": 0.62548244, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.70134139, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.02194214, + "step": 8457, + "time_per_iteration": 3.0378763675689697 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01271319, + "balance_loss_clip": 0.06280852, + "balance_loss_mlp": 0.01258337, + "epoch": 0.5085224710656846, + "flos": 22352688034560.0, + "grad_norm": 1.5276658426580998, + "language_loss": 0.67559206, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.75267512, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12976074, + "step": 8458, + "time_per_iteration": 2.5329530239105225 + }, + { + "auxiliary_loss_clip": 0.06432781, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0627652, + "balance_loss_mlp": 0.01260015, + "epoch": 0.5085825943183526, + "flos": 26073202625280.0, + "grad_norm": 1.618055128351248, + "language_loss": 0.77449083, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.85154486, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1260376, + "step": 8459, + "time_per_iteration": 2.5590224266052246 + }, + { + "auxiliary_loss_clip": 0.06437792, + "auxiliary_loss_mlp": 0.01271084, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01258132, + "epoch": 0.5086427175710206, + "flos": 17426494644480.0, + "grad_norm": 2.2202109072156664, + "language_loss": 0.81101096, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.88809973, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12945557, + "step": 8460, + "time_per_iteration": 2.4797065258026123 + }, + { + "auxiliary_loss_clip": 0.06432672, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01259968, + "epoch": 0.5087028408236886, + "flos": 20638102262400.0, + "grad_norm": 1.6011145053716882, + "language_loss": 0.69150776, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.76856101, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12677002, + "step": 8461, + "time_per_iteration": 2.5423507690429688 + }, + { + "auxiliary_loss_clip": 0.06423958, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06275716, + "balance_loss_mlp": 0.01258052, + "epoch": 0.5087629640763566, + "flos": 25600996540800.0, + "grad_norm": 1.5704547594862186, + "language_loss": 0.76788783, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.84482986, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12200928, + "step": 8462, + "time_per_iteration": 2.5558974742889404 + }, + { + "auxiliary_loss_clip": 0.06431352, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06279621, + "balance_loss_mlp": 0.01255251, + "epoch": 0.5088230873290245, + "flos": 13266743351040.0, + "grad_norm": 1.98943246577739, + "language_loss": 0.81940925, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.89639473, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11938477, + "step": 8463, + "time_per_iteration": 2.5092854499816895 + }, + { + "auxiliary_loss_clip": 0.06429717, + "auxiliary_loss_mlp": 0.01268295, + "balance_loss_clip": 0.06277439, + "balance_loss_mlp": 0.01255974, + "epoch": 0.5088832105816925, + "flos": 20048959405440.0, + "grad_norm": 4.395577464341562, + "language_loss": 0.76639092, + "learning_rate": 2.039527786882341e-06, + "loss": 0.84337103, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12310791, + "step": 8464, + "time_per_iteration": 2.5100886821746826 + }, + { + "auxiliary_loss_clip": 0.06332754, + "auxiliary_loss_mlp": 0.01251908, + "balance_loss_clip": 0.06266724, + "balance_loss_mlp": 0.01250196, + "epoch": 0.5089433338343604, + "flos": 67445072184960.0, + "grad_norm": 0.674227101372006, + "language_loss": 0.59172922, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.66757584, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.01716614, + "step": 8465, + "time_per_iteration": 3.288703441619873 + }, + { + "auxiliary_loss_clip": 0.06429654, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06277246, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5090034570870284, + "flos": 22716845879040.0, + "grad_norm": 1.7766724873518385, + "language_loss": 0.80341208, + "learning_rate": 2.038749012684354e-06, + "loss": 0.88037896, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12005615, + "step": 8466, + "time_per_iteration": 3.9034652709960938 + }, + { + "auxiliary_loss_clip": 0.06428038, + "auxiliary_loss_mlp": 0.01262494, + "balance_loss_clip": 0.06276771, + "balance_loss_mlp": 0.01250603, + "epoch": 0.5090635803396963, + "flos": 20451537146880.0, + "grad_norm": 1.506058765425311, + "language_loss": 0.78925973, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.86616498, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11895752, + "step": 8467, + "time_per_iteration": 2.483701229095459 + }, + { + "auxiliary_loss_clip": 0.06425558, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01257565, + "epoch": 0.5091237035923644, + "flos": 23775637271040.0, + "grad_norm": 1.593164773968791, + "language_loss": 0.74572229, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.82266819, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11468506, + "step": 8468, + "time_per_iteration": 2.550657033920288 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01264118, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01252108, + "epoch": 0.5091838268450323, + "flos": 18332990040960.0, + "grad_norm": 1.7522760366327397, + "language_loss": 0.78574747, + "learning_rate": 2.03758084040404e-06, + "loss": 0.86264038, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12011719, + "step": 8469, + "time_per_iteration": 2.4776134490966797 + }, + { + "auxiliary_loss_clip": 0.06431125, + "auxiliary_loss_mlp": 0.012685, + "balance_loss_clip": 0.0627888, + "balance_loss_mlp": 0.01256526, + "epoch": 0.5092439500977003, + "flos": 29064982256640.0, + "grad_norm": 1.429622552318455, + "language_loss": 0.6959703, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7729665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11968994, + "step": 8470, + "time_per_iteration": 4.06356954574585 + }, + { + "auxiliary_loss_clip": 0.06432179, + "auxiliary_loss_mlp": 0.01268896, + "balance_loss_clip": 0.06276524, + "balance_loss_mlp": 0.01256278, + "epoch": 0.5093040733503682, + "flos": 13559134573440.0, + "grad_norm": 1.739958995441318, + "language_loss": 0.73736298, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.81437373, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12615967, + "step": 8471, + "time_per_iteration": 2.5252416133880615 + }, + { + "auxiliary_loss_clip": 0.06330768, + "auxiliary_loss_mlp": 0.01255323, + "balance_loss_clip": 0.06264758, + "balance_loss_mlp": 0.01253313, + "epoch": 0.5093641966030362, + "flos": 68927838837120.0, + "grad_norm": 0.738097810584446, + "language_loss": 0.58042324, + "learning_rate": 2.036412655298103e-06, + "loss": 0.65628415, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.02009583, + "step": 8472, + "time_per_iteration": 3.1610372066497803 + }, + { + "auxiliary_loss_clip": 0.06430018, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5094243198557042, + "flos": 21587545676160.0, + "grad_norm": 1.8344067804800992, + "language_loss": 0.69000626, + "learning_rate": 2.03602325748156e-06, + "loss": 0.76696956, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11358643, + "step": 8473, + "time_per_iteration": 2.5834267139434814 + }, + { + "auxiliary_loss_clip": 0.06430315, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.06279565, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5094844431083722, + "flos": 28848382652160.0, + "grad_norm": 2.5664905714857422, + "language_loss": 0.85103536, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.92801011, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12011719, + "step": 8474, + "time_per_iteration": 2.5577685832977295 + }, + { + "auxiliary_loss_clip": 0.06432322, + "auxiliary_loss_mlp": 0.0126557, + "balance_loss_clip": 0.06278027, + "balance_loss_mlp": 0.01253488, + "epoch": 0.5095445663610402, + "flos": 14981454904320.0, + "grad_norm": 1.910358455820602, + "language_loss": 0.64868319, + "learning_rate": 2.035244457765222e-06, + "loss": 0.72566211, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12091064, + "step": 8475, + "time_per_iteration": 3.9494359493255615 + }, + { + "auxiliary_loss_clip": 0.06435733, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5096046896137081, + "flos": 20783354515200.0, + "grad_norm": 2.1677913618760623, + "language_loss": 0.8248105, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.90185243, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12689209, + "step": 8476, + "time_per_iteration": 2.533986806869507 + }, + { + "auxiliary_loss_clip": 0.06432153, + "auxiliary_loss_mlp": 0.01267228, + "balance_loss_clip": 0.06275326, + "balance_loss_mlp": 0.01254628, + "epoch": 0.5096648128663761, + "flos": 23191735294080.0, + "grad_norm": 2.112211155301917, + "language_loss": 0.81339389, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.89038771, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12609863, + "step": 8477, + "time_per_iteration": 2.614363193511963 + }, + { + "auxiliary_loss_clip": 0.06429507, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.0627466, + "balance_loss_mlp": 0.01254741, + "epoch": 0.509724936119044, + "flos": 22315945219200.0, + "grad_norm": 1.7511302636686703, + "language_loss": 0.61918831, + "learning_rate": 2.034076248204082e-06, + "loss": 0.69616115, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13024902, + "step": 8478, + "time_per_iteration": 2.5054080486297607 + }, + { + "auxiliary_loss_clip": 0.06424017, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01256136, + "epoch": 0.509785059371712, + "flos": 26294372277120.0, + "grad_norm": 1.8013233320362476, + "language_loss": 0.66670853, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.74362785, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 8479, + "time_per_iteration": 2.5773558616638184 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01266645, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01254795, + "epoch": 0.50984518262438, + "flos": 22970942985600.0, + "grad_norm": 1.5048945656562989, + "language_loss": 0.69523573, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.77217555, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1184082, + "step": 8480, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06433358, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06277278, + "balance_loss_mlp": 0.01252908, + "epoch": 0.509905305877048, + "flos": 26220551230080.0, + "grad_norm": 1.695627830792001, + "language_loss": 0.79513025, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.87211168, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11865234, + "step": 8481, + "time_per_iteration": 3.9862852096557617 + }, + { + "auxiliary_loss_clip": 0.06423856, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06274414, + "balance_loss_mlp": 0.01255186, + "epoch": 0.5099654291297159, + "flos": 20346381872640.0, + "grad_norm": 1.4463685523965593, + "language_loss": 0.83447778, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.91138661, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1184082, + "step": 8482, + "time_per_iteration": 2.539057970046997 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257682, + "epoch": 0.5100255523823839, + "flos": 29061711947520.0, + "grad_norm": 1.7174746607832896, + "language_loss": 0.85923511, + "learning_rate": 2.032129206622238e-06, + "loss": 0.93625677, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12243652, + "step": 8483, + "time_per_iteration": 2.5567803382873535 + }, + { + "auxiliary_loss_clip": 0.06428108, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256352, + "epoch": 0.5100856756350518, + "flos": 22462539137280.0, + "grad_norm": 3.7192784343186367, + "language_loss": 0.83011222, + "learning_rate": 2.031739794591775e-06, + "loss": 0.90707278, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.11590576, + "step": 8484, + "time_per_iteration": 2.50913143157959 + }, + { + "auxiliary_loss_clip": 0.0642792, + "auxiliary_loss_mlp": 0.0126741, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5101457988877198, + "flos": 19176942764160.0, + "grad_norm": 1.8545423824290383, + "language_loss": 0.81929463, + "learning_rate": 2.031350381357736e-06, + "loss": 0.89624798, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12463379, + "step": 8485, + "time_per_iteration": 2.479165554046631 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266312, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01254522, + "epoch": 0.5102059221403878, + "flos": 14871645728640.0, + "grad_norm": 1.8580884452241668, + "language_loss": 0.73778898, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.81466365, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11791992, + "step": 8486, + "time_per_iteration": 2.502035140991211 + }, + { + "auxiliary_loss_clip": 0.06432486, + "auxiliary_loss_mlp": 0.01268204, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5102660453930558, + "flos": 22966876062720.0, + "grad_norm": 1.455931130318143, + "language_loss": 0.6993084, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.77631527, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13092041, + "step": 8487, + "time_per_iteration": 2.5022764205932617 + }, + { + "auxiliary_loss_clip": 0.06425266, + "auxiliary_loss_mlp": 0.01265042, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01252072, + "epoch": 0.5103261686457238, + "flos": 23156082581760.0, + "grad_norm": 2.025146562514191, + "language_loss": 0.72757244, + "learning_rate": 2.030182134581827e-06, + "loss": 0.80447549, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12963867, + "step": 8488, + "time_per_iteration": 2.5181195735931396 + }, + { + "auxiliary_loss_clip": 0.06435129, + "auxiliary_loss_mlp": 0.01271711, + "balance_loss_clip": 0.06278089, + "balance_loss_mlp": 0.01259861, + "epoch": 0.5103862918983917, + "flos": 14324444640000.0, + "grad_norm": 1.9274143081394266, + "language_loss": 0.69714773, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.77421612, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.11846924, + "step": 8489, + "time_per_iteration": 2.491626739501953 + }, + { + "auxiliary_loss_clip": 0.06427855, + "auxiliary_loss_mlp": 0.01262645, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01251231, + "epoch": 0.5104464151510597, + "flos": 25855638698880.0, + "grad_norm": 1.7641928011440773, + "language_loss": 0.73334658, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.81025159, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.11407471, + "step": 8490, + "time_per_iteration": 2.6192476749420166 + }, + { + "auxiliary_loss_clip": 0.06422485, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01256628, + "epoch": 0.5105065384037276, + "flos": 21659354225280.0, + "grad_norm": 1.995020059533993, + "language_loss": 0.8080864, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.8849923, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8491, + "time_per_iteration": 2.5444910526275635 + }, + { + "auxiliary_loss_clip": 0.0642098, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06274496, + "balance_loss_mlp": 0.01257089, + "epoch": 0.5105666616563956, + "flos": 22498066068480.0, + "grad_norm": 2.247071959069697, + "language_loss": 0.79263282, + "learning_rate": 2.028624456259728e-06, + "loss": 0.86953026, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11676025, + "step": 8492, + "time_per_iteration": 2.656888008117676 + }, + { + "auxiliary_loss_clip": 0.06433547, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01257838, + "epoch": 0.5106267849090635, + "flos": 22462371429120.0, + "grad_norm": 1.9309641209432507, + "language_loss": 0.77830237, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.85534871, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13256836, + "step": 8493, + "time_per_iteration": 2.550326347351074 + }, + { + "auxiliary_loss_clip": 0.06427996, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06275648, + "balance_loss_mlp": 0.01252879, + "epoch": 0.5106869081617316, + "flos": 23553335589120.0, + "grad_norm": 1.7342765336142327, + "language_loss": 0.84044284, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.91737616, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12457275, + "step": 8494, + "time_per_iteration": 2.582463026046753 + }, + { + "auxiliary_loss_clip": 0.06430838, + "auxiliary_loss_mlp": 0.0126671, + "balance_loss_clip": 0.0627555, + "balance_loss_mlp": 0.0125492, + "epoch": 0.5107470314143995, + "flos": 26799547743360.0, + "grad_norm": 2.0062643152671877, + "language_loss": 0.79773927, + "learning_rate": 2.027456186069326e-06, + "loss": 0.87471473, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11798096, + "step": 8495, + "time_per_iteration": 2.5472564697265625 + }, + { + "auxiliary_loss_clip": 0.06425454, + "auxiliary_loss_mlp": 0.01268533, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01256308, + "epoch": 0.5108071546670675, + "flos": 25746877699200.0, + "grad_norm": 1.417654874659872, + "language_loss": 0.78675163, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.86369145, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12231445, + "step": 8496, + "time_per_iteration": 2.5841569900512695 + }, + { + "auxiliary_loss_clip": 0.06422253, + "auxiliary_loss_mlp": 0.01267746, + "balance_loss_clip": 0.06273818, + "balance_loss_mlp": 0.01255998, + "epoch": 0.5108672779197354, + "flos": 18703478868480.0, + "grad_norm": 1.866540646775448, + "language_loss": 0.7912823, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8681823, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11755371, + "step": 8497, + "time_per_iteration": 2.5111966133117676 + }, + { + "auxiliary_loss_clip": 0.06429158, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06277271, + "balance_loss_mlp": 0.01252277, + "epoch": 0.5109274011724034, + "flos": 26695482572160.0, + "grad_norm": 1.6666059931479484, + "language_loss": 0.81941032, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.89634144, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11682129, + "step": 8498, + "time_per_iteration": 2.608631134033203 + }, + { + "auxiliary_loss_clip": 0.06424002, + "auxiliary_loss_mlp": 0.01271992, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01260267, + "epoch": 0.5109875244250714, + "flos": 22790666926080.0, + "grad_norm": 1.6923312462183162, + "language_loss": 0.71301198, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.78997189, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11724854, + "step": 8499, + "time_per_iteration": 2.5150094032287598 + }, + { + "auxiliary_loss_clip": 0.06427284, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255003, + "epoch": 0.5110476476777394, + "flos": 35596958492160.0, + "grad_norm": 1.3954443671639698, + "language_loss": 0.72611153, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.80305135, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11694336, + "step": 8500, + "time_per_iteration": 2.633239269256592 + }, + { + "auxiliary_loss_clip": 0.06435662, + "auxiliary_loss_mlp": 0.01270607, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01256803, + "epoch": 0.5111077709304074, + "flos": 19286751939840.0, + "grad_norm": 2.7349973685574973, + "language_loss": 0.63562721, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.71268988, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13800049, + "step": 8501, + "time_per_iteration": 2.5091230869293213 + }, + { + "auxiliary_loss_clip": 0.06434844, + "auxiliary_loss_mlp": 0.01273353, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01260848, + "epoch": 0.5111678941830753, + "flos": 20674551588480.0, + "grad_norm": 1.8816899756355796, + "language_loss": 0.88057411, + "learning_rate": 2.024730186540907e-06, + "loss": 0.95765609, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12506104, + "step": 8502, + "time_per_iteration": 2.517728090286255 + }, + { + "auxiliary_loss_clip": 0.06425811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01253663, + "epoch": 0.5112280174357433, + "flos": 26295336599040.0, + "grad_norm": 1.4524091598864723, + "language_loss": 0.82627225, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.90318477, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11779785, + "step": 8503, + "time_per_iteration": 2.711451768875122 + }, + { + "auxiliary_loss_clip": 0.06333953, + "auxiliary_loss_mlp": 0.01255603, + "balance_loss_clip": 0.06268184, + "balance_loss_mlp": 0.0125384, + "epoch": 0.5112881406884112, + "flos": 59490706492800.0, + "grad_norm": 0.8512772291593351, + "language_loss": 0.63800937, + "learning_rate": 2.023951320871339e-06, + "loss": 0.71390492, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.65966797, + "router_z_loss_mlp": 0.01766968, + "step": 8504, + "time_per_iteration": 3.1690919399261475 + }, + { + "auxiliary_loss_clip": 0.06425914, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06275845, + "balance_loss_mlp": 0.01253576, + "epoch": 0.5113482639410792, + "flos": 26476073856000.0, + "grad_norm": 1.7986544100736102, + "language_loss": 0.84377933, + "learning_rate": 2.023561886666816e-06, + "loss": 0.92069674, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12261963, + "step": 8505, + "time_per_iteration": 2.5755858421325684 + }, + { + "auxiliary_loss_clip": 0.0643035, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5114083871937471, + "flos": 29903190975360.0, + "grad_norm": 1.7295208629505698, + "language_loss": 0.75707996, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.83404166, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11590576, + "step": 8506, + "time_per_iteration": 3.947927713394165 + }, + { + "auxiliary_loss_clip": 0.0642788, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.06276722, + "balance_loss_mlp": 0.01255303, + "epoch": 0.5114685104464152, + "flos": 24321161278080.0, + "grad_norm": 1.7165713389532073, + "language_loss": 0.58250427, + "learning_rate": 2.022783015592131e-06, + "loss": 0.65946829, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.13214111, + "step": 8507, + "time_per_iteration": 2.5460915565490723 + }, + { + "auxiliary_loss_clip": 0.06432099, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01257023, + "epoch": 0.5115286336990831, + "flos": 17024965079040.0, + "grad_norm": 1.7959155859668763, + "language_loss": 0.8588531, + "learning_rate": 2.022393578751503e-06, + "loss": 0.93586934, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12475586, + "step": 8508, + "time_per_iteration": 2.501931667327881 + }, + { + "auxiliary_loss_clip": 0.06430113, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06279224, + "balance_loss_mlp": 0.012544, + "epoch": 0.5115887569517511, + "flos": 23666121584640.0, + "grad_norm": 1.985741338533524, + "language_loss": 0.72740698, + "learning_rate": 2.022004141061709e-06, + "loss": 0.80437851, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12640381, + "step": 8509, + "time_per_iteration": 3.9570322036743164 + }, + { + "auxiliary_loss_clip": 0.06425552, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06277531, + "balance_loss_mlp": 0.01254476, + "epoch": 0.511648880204419, + "flos": 16112725678080.0, + "grad_norm": 1.6522242028614569, + "language_loss": 0.76532018, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.84222525, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10479736, + "step": 8510, + "time_per_iteration": 2.5000293254852295 + }, + { + "auxiliary_loss_clip": 0.06424148, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.01256402, + "epoch": 0.511709003457087, + "flos": 32643221414400.0, + "grad_norm": 1.8483097722803792, + "language_loss": 0.71295965, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.78987575, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 8511, + "time_per_iteration": 2.5970981121063232 + }, + { + "auxiliary_loss_clip": 0.06426742, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125404, + "epoch": 0.511769126709755, + "flos": 21768492568320.0, + "grad_norm": 1.8966780464465567, + "language_loss": 0.67139721, + "learning_rate": 2.020835823045001e-06, + "loss": 0.74832094, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8512, + "time_per_iteration": 2.5369138717651367 + }, + { + "auxiliary_loss_clip": 0.06426971, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253588, + "epoch": 0.511829249962423, + "flos": 23922231189120.0, + "grad_norm": 1.7695600544803753, + "language_loss": 0.67171764, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.7486496, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12628174, + "step": 8513, + "time_per_iteration": 2.517648220062256 + }, + { + "auxiliary_loss_clip": 0.06423096, + "auxiliary_loss_mlp": 0.01268209, + "balance_loss_clip": 0.06275445, + "balance_loss_mlp": 0.01255948, + "epoch": 0.511889373215091, + "flos": 23732856961920.0, + "grad_norm": 1.8747309224946216, + "language_loss": 0.68931103, + "learning_rate": 2.0200569403921e-06, + "loss": 0.76622409, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1227417, + "step": 8514, + "time_per_iteration": 3.969726085662842 + }, + { + "auxiliary_loss_clip": 0.06422693, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01254357, + "epoch": 0.5119494964677589, + "flos": 28119144568320.0, + "grad_norm": 1.955376754159203, + "language_loss": 0.66104603, + "learning_rate": 2.019667497917424e-06, + "loss": 0.7379272, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11065674, + "step": 8515, + "time_per_iteration": 2.586984872817993 + }, + { + "auxiliary_loss_clip": 0.06415779, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254754, + "epoch": 0.5120096197204269, + "flos": 24980225967360.0, + "grad_norm": 1.8485741123105555, + "language_loss": 0.76016974, + "learning_rate": 2.019278054696955e-06, + "loss": 0.83698404, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10894775, + "step": 8516, + "time_per_iteration": 2.5933895111083984 + }, + { + "auxiliary_loss_clip": 0.06425153, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.0627657, + "balance_loss_mlp": 0.01254136, + "epoch": 0.5120697429730948, + "flos": 17973863441280.0, + "grad_norm": 1.9611042257937292, + "language_loss": 0.78053069, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.85744041, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 8517, + "time_per_iteration": 2.4962363243103027 + }, + { + "auxiliary_loss_clip": 0.06430522, + "auxiliary_loss_mlp": 0.01271394, + "balance_loss_clip": 0.06276728, + "balance_loss_mlp": 0.01259211, + "epoch": 0.5121298662257628, + "flos": 23298651504000.0, + "grad_norm": 1.7759167489555023, + "language_loss": 0.74719632, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.82421547, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12164307, + "step": 8518, + "time_per_iteration": 2.5037240982055664 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256529, + "epoch": 0.5121899894784308, + "flos": 17316769322880.0, + "grad_norm": 1.687169580100827, + "language_loss": 0.78467947, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.86161083, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11706543, + "step": 8519, + "time_per_iteration": 2.524724006652832 + }, + { + "auxiliary_loss_clip": 0.06422982, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253016, + "epoch": 0.5122501127310988, + "flos": 24935978211840.0, + "grad_norm": 1.6239003664198155, + "language_loss": 0.79446238, + "learning_rate": 2.017720274652497e-06, + "loss": 0.87134135, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11907959, + "step": 8520, + "time_per_iteration": 2.522068500518799 + }, + { + "auxiliary_loss_clip": 0.06431363, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06276108, + "balance_loss_mlp": 0.01256151, + "epoch": 0.5123102359837667, + "flos": 18448878637440.0, + "grad_norm": 1.8569595834923718, + "language_loss": 0.81725198, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.89426088, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13366699, + "step": 8521, + "time_per_iteration": 3.956547498703003 + }, + { + "auxiliary_loss_clip": 0.06422685, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.0627308, + "balance_loss_mlp": 0.01253383, + "epoch": 0.5123703592364347, + "flos": 26691625284480.0, + "grad_norm": 3.145804815574879, + "language_loss": 0.68764591, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.7645213, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11462402, + "step": 8522, + "time_per_iteration": 2.53696608543396 + }, + { + "auxiliary_loss_clip": 0.06430639, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06276414, + "balance_loss_mlp": 0.01256039, + "epoch": 0.5124304824891026, + "flos": 28811555982720.0, + "grad_norm": 1.853417160064295, + "language_loss": 0.622962, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.69996071, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13201904, + "step": 8523, + "time_per_iteration": 2.589885950088501 + }, + { + "auxiliary_loss_clip": 0.06424818, + "auxiliary_loss_mlp": 0.01265688, + "balance_loss_clip": 0.06274516, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5124906057417706, + "flos": 21768199079040.0, + "grad_norm": 1.9669486922935226, + "language_loss": 0.77939785, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.85630286, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11004639, + "step": 8524, + "time_per_iteration": 2.506647825241089 + }, + { + "auxiliary_loss_clip": 0.06424855, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06275764, + "balance_loss_mlp": 0.01255344, + "epoch": 0.5125507289944387, + "flos": 18886605966720.0, + "grad_norm": 1.985021925330002, + "language_loss": 0.74904448, + "learning_rate": 2.015773034588706e-06, + "loss": 0.82595634, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10986328, + "step": 8525, + "time_per_iteration": 2.509902000427246 + }, + { + "auxiliary_loss_clip": 0.06429298, + "auxiliary_loss_mlp": 0.01270559, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01258412, + "epoch": 0.5126108522471066, + "flos": 35636761981440.0, + "grad_norm": 1.5788283001431092, + "language_loss": 0.74868685, + "learning_rate": 2.015383584722531e-06, + "loss": 0.82568544, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12127686, + "step": 8526, + "time_per_iteration": 2.640554428100586 + }, + { + "auxiliary_loss_clip": 0.06428048, + "auxiliary_loss_mlp": 0.01267884, + "balance_loss_clip": 0.06275488, + "balance_loss_mlp": 0.01256613, + "epoch": 0.5126709754997746, + "flos": 20196685353600.0, + "grad_norm": 1.5376970768591331, + "language_loss": 0.658445, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.73540437, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.11279297, + "step": 8527, + "time_per_iteration": 2.5079874992370605 + }, + { + "auxiliary_loss_clip": 0.06421998, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01258644, + "epoch": 0.5127310987524425, + "flos": 18594550160640.0, + "grad_norm": 1.4224570841542155, + "language_loss": 0.74258637, + "learning_rate": 2.014604683254908e-06, + "loss": 0.81949556, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10290527, + "step": 8528, + "time_per_iteration": 2.5583620071411133 + }, + { + "auxiliary_loss_clip": 0.06424492, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06275051, + "balance_loss_mlp": 0.01254816, + "epoch": 0.5127912220051105, + "flos": 22461113617920.0, + "grad_norm": 1.747082224822374, + "language_loss": 0.83357608, + "learning_rate": 2.014215231682995e-06, + "loss": 0.91048539, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11621094, + "step": 8529, + "time_per_iteration": 2.5290021896362305 + }, + { + "auxiliary_loss_clip": 0.06427129, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06279376, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5128513452577784, + "flos": 19098845159040.0, + "grad_norm": 1.7753814294124612, + "language_loss": 0.7435441, + "learning_rate": 2.01382577957204e-06, + "loss": 0.82047611, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.10852051, + "step": 8530, + "time_per_iteration": 2.5009660720825195 + }, + { + "auxiliary_loss_clip": 0.06336609, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01263291, + "epoch": 0.5129114685104464, + "flos": 67914553011840.0, + "grad_norm": 0.7560442553547831, + "language_loss": 0.60794806, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.68396354, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.01651001, + "step": 8531, + "time_per_iteration": 3.2641408443450928 + }, + { + "auxiliary_loss_clip": 0.06436025, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.062833, + "balance_loss_mlp": 0.0125722, + "epoch": 0.5129715917631144, + "flos": 20455436361600.0, + "grad_norm": 1.5619116128751078, + "language_loss": 0.76922929, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.84627628, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11444092, + "step": 8532, + "time_per_iteration": 2.54885196685791 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.0627965, + "balance_loss_mlp": 0.0125747, + "epoch": 0.5130317150157824, + "flos": 35124836261760.0, + "grad_norm": 2.143443364581078, + "language_loss": 0.67464834, + "learning_rate": 2.012657420152597e-06, + "loss": 0.75163269, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11706543, + "step": 8533, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.06435291, + "auxiliary_loss_mlp": 0.01270583, + "balance_loss_clip": 0.06282294, + "balance_loss_mlp": 0.01257995, + "epoch": 0.5130918382684503, + "flos": 19797671410560.0, + "grad_norm": 2.0992969405941526, + "language_loss": 0.82022768, + "learning_rate": 2.01226796603315e-06, + "loss": 0.89728636, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12585449, + "step": 8534, + "time_per_iteration": 2.527186632156372 + }, + { + "auxiliary_loss_clip": 0.06432565, + "auxiliary_loss_mlp": 0.01272989, + "balance_loss_clip": 0.06280594, + "balance_loss_mlp": 0.0126077, + "epoch": 0.5131519615211183, + "flos": 26330318478720.0, + "grad_norm": 1.396585887996991, + "language_loss": 0.64072168, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.71777725, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12225342, + "step": 8535, + "time_per_iteration": 2.5608325004577637 + }, + { + "auxiliary_loss_clip": 0.06432404, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06282519, + "balance_loss_mlp": 0.01254036, + "epoch": 0.5132120847737862, + "flos": 19177949013120.0, + "grad_norm": 1.677219086168078, + "language_loss": 0.70047057, + "learning_rate": 2.011489056413418e-06, + "loss": 0.77745175, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11682129, + "step": 8536, + "time_per_iteration": 2.562103509902954 + }, + { + "auxiliary_loss_clip": 0.06443835, + "auxiliary_loss_mlp": 0.01273704, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01260359, + "epoch": 0.5132722080264542, + "flos": 20236698478080.0, + "grad_norm": 2.053357085489985, + "language_loss": 0.71648562, + "learning_rate": 2.011099600942669e-06, + "loss": 0.793661, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13348389, + "step": 8537, + "time_per_iteration": 2.5208451747894287 + }, + { + "auxiliary_loss_clip": 0.06435503, + "auxiliary_loss_mlp": 0.01264426, + "balance_loss_clip": 0.06282058, + "balance_loss_mlp": 0.01252559, + "epoch": 0.5133323312791223, + "flos": 16474619462400.0, + "grad_norm": 2.3096480270315487, + "language_loss": 0.80560482, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.88260412, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.11859131, + "step": 8538, + "time_per_iteration": 2.5136818885803223 + }, + { + "auxiliary_loss_clip": 0.06432489, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5133924545317902, + "flos": 26075340904320.0, + "grad_norm": 1.6767929293826078, + "language_loss": 0.78499532, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.86201918, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1171875, + "step": 8539, + "time_per_iteration": 2.5898549556732178 + }, + { + "auxiliary_loss_clip": 0.06434882, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06283914, + "balance_loss_mlp": 0.01255703, + "epoch": 0.5134525777844582, + "flos": 29138467887360.0, + "grad_norm": 1.6389084641418472, + "language_loss": 0.76422769, + "learning_rate": 2.009931232064105e-06, + "loss": 0.84125227, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8540, + "time_per_iteration": 2.695279359817505 + }, + { + "auxiliary_loss_clip": 0.06437706, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5135127010371261, + "flos": 17460134858880.0, + "grad_norm": 1.735384048528371, + "language_loss": 0.74720204, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.82429993, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.1328125, + "step": 8541, + "time_per_iteration": 2.5028650760650635 + }, + { + "auxiliary_loss_clip": 0.06433722, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01255905, + "epoch": 0.5135728242897941, + "flos": 21951493885440.0, + "grad_norm": 1.7658048645767805, + "language_loss": 0.71345925, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.79048049, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12493896, + "step": 8542, + "time_per_iteration": 2.55663800239563 + }, + { + "auxiliary_loss_clip": 0.06432796, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06282645, + "balance_loss_mlp": 0.01255939, + "epoch": 0.513632947542462, + "flos": 22681528583040.0, + "grad_norm": 1.8429175926110044, + "language_loss": 0.79735661, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.87436259, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11846924, + "step": 8543, + "time_per_iteration": 2.530942440032959 + }, + { + "auxiliary_loss_clip": 0.06431838, + "auxiliary_loss_mlp": 0.01268863, + "balance_loss_clip": 0.06281078, + "balance_loss_mlp": 0.0125693, + "epoch": 0.51369307079513, + "flos": 29464289688960.0, + "grad_norm": 1.9724623685644402, + "language_loss": 0.68434304, + "learning_rate": 2.008373401689299e-06, + "loss": 0.76135004, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11932373, + "step": 8544, + "time_per_iteration": 2.581965684890747 + }, + { + "auxiliary_loss_clip": 0.06435554, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01257314, + "epoch": 0.513753194047798, + "flos": 18995325039360.0, + "grad_norm": 1.9173308249452852, + "language_loss": 0.73101795, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.80806756, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12103271, + "step": 8545, + "time_per_iteration": 3.9112906455993652 + }, + { + "auxiliary_loss_clip": 0.06434133, + "auxiliary_loss_mlp": 0.01273161, + "balance_loss_clip": 0.06280358, + "balance_loss_mlp": 0.01260745, + "epoch": 0.513813317300466, + "flos": 17827646866560.0, + "grad_norm": 2.3149125381427322, + "language_loss": 0.82387555, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.90094852, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12408447, + "step": 8546, + "time_per_iteration": 2.4859204292297363 + }, + { + "auxiliary_loss_clip": 0.06431763, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01255101, + "epoch": 0.5138734405531339, + "flos": 24068070420480.0, + "grad_norm": 1.656069587269211, + "language_loss": 0.73464745, + "learning_rate": 2.007205025522544e-06, + "loss": 0.81163985, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12384033, + "step": 8547, + "time_per_iteration": 2.5682289600372314 + }, + { + "auxiliary_loss_clip": 0.0643255, + "auxiliary_loss_mlp": 0.01266832, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5139335638058019, + "flos": 26103279041280.0, + "grad_norm": 1.7029090715356687, + "language_loss": 0.7379564, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.81495023, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12121582, + "step": 8548, + "time_per_iteration": 2.534795045852661 + }, + { + "auxiliary_loss_clip": 0.06433449, + "auxiliary_loss_mlp": 0.01270968, + "balance_loss_clip": 0.06279913, + "balance_loss_mlp": 0.01259124, + "epoch": 0.5139936870584698, + "flos": 18923181073920.0, + "grad_norm": 1.5199417717256292, + "language_loss": 0.82597619, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.90302038, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11853027, + "step": 8549, + "time_per_iteration": 3.9844579696655273 + }, + { + "auxiliary_loss_clip": 0.06431821, + "auxiliary_loss_mlp": 0.01268578, + "balance_loss_clip": 0.06283253, + "balance_loss_mlp": 0.01256913, + "epoch": 0.5140538103111378, + "flos": 16149594274560.0, + "grad_norm": 1.7893333067818897, + "language_loss": 0.72460294, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.80160695, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11669922, + "step": 8550, + "time_per_iteration": 2.6143221855163574 + }, + { + "auxiliary_loss_clip": 0.06436016, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5141139335638057, + "flos": 22426886424960.0, + "grad_norm": 1.3843612466681816, + "language_loss": 0.7537846, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.83080363, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12536621, + "step": 8551, + "time_per_iteration": 2.563551664352417 + }, + { + "auxiliary_loss_clip": 0.06427439, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.01255233, + "epoch": 0.5141740568164738, + "flos": 27097054064640.0, + "grad_norm": 1.547590229430392, + "language_loss": 0.69192576, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.76886189, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10949707, + "step": 8552, + "time_per_iteration": 2.598309278488159 + }, + { + "auxiliary_loss_clip": 0.06434312, + "auxiliary_loss_mlp": 0.01271227, + "balance_loss_clip": 0.06280888, + "balance_loss_mlp": 0.01258972, + "epoch": 0.5142341800691418, + "flos": 24980267894400.0, + "grad_norm": 1.7162445999633908, + "language_loss": 0.75295067, + "learning_rate": 2.004868266210965e-06, + "loss": 0.830006, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12255859, + "step": 8553, + "time_per_iteration": 2.56817364692688 + }, + { + "auxiliary_loss_clip": 0.06427588, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.06277347, + "balance_loss_mlp": 0.01253642, + "epoch": 0.5142943033218097, + "flos": 20710833206400.0, + "grad_norm": 1.5512777085285745, + "language_loss": 0.68091589, + "learning_rate": 2.004478805593435e-06, + "loss": 0.75785089, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.1227417, + "step": 8554, + "time_per_iteration": 4.041098117828369 + }, + { + "auxiliary_loss_clip": 0.06434806, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.0627867, + "balance_loss_mlp": 0.0125514, + "epoch": 0.5143544265744777, + "flos": 22931391058560.0, + "grad_norm": 1.9544744043919176, + "language_loss": 0.73420155, + "learning_rate": 2.004089344806068e-06, + "loss": 0.81124151, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.14050293, + "step": 8555, + "time_per_iteration": 2.560406446456909 + }, + { + "auxiliary_loss_clip": 0.0643023, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06277946, + "balance_loss_mlp": 0.0125305, + "epoch": 0.5144145498271456, + "flos": 15926328270720.0, + "grad_norm": 3.1721710851325478, + "language_loss": 0.74827576, + "learning_rate": 2.003699883863633e-06, + "loss": 0.82522213, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11346436, + "step": 8556, + "time_per_iteration": 2.510631561279297 + }, + { + "auxiliary_loss_clip": 0.06426013, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01255374, + "epoch": 0.5144746730798136, + "flos": 19687107548160.0, + "grad_norm": 1.7802365486116365, + "language_loss": 0.86600292, + "learning_rate": 2.003310422780898e-06, + "loss": 0.9429279, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 8557, + "time_per_iteration": 2.4897682666778564 + }, + { + "auxiliary_loss_clip": 0.06427194, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06280152, + "balance_loss_mlp": 0.0125372, + "epoch": 0.5145347963324816, + "flos": 23921476502400.0, + "grad_norm": 1.7088292247190593, + "language_loss": 0.89943027, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.97635341, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11407471, + "step": 8558, + "time_per_iteration": 2.552520513534546 + }, + { + "auxiliary_loss_clip": 0.06426296, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06281744, + "balance_loss_mlp": 0.01259337, + "epoch": 0.5145949195851496, + "flos": 18265919247360.0, + "grad_norm": 1.814909546317071, + "language_loss": 0.65665084, + "learning_rate": 2.002531500253602e-06, + "loss": 0.73361778, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1105957, + "step": 8559, + "time_per_iteration": 2.5509958267211914 + }, + { + "auxiliary_loss_clip": 0.06428455, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.0628074, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5146550428378175, + "flos": 26220593157120.0, + "grad_norm": 1.5790337478872891, + "language_loss": 0.63388872, + "learning_rate": 2.002142038838577e-06, + "loss": 0.71083951, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11358643, + "step": 8560, + "time_per_iteration": 2.5824177265167236 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279366, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5147151660904855, + "flos": 22680731969280.0, + "grad_norm": 1.6548160663474087, + "language_loss": 0.70604181, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.78299701, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11975098, + "step": 8561, + "time_per_iteration": 4.051865816116333 + }, + { + "auxiliary_loss_clip": 0.06432293, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01254937, + "epoch": 0.5147752893431534, + "flos": 24979261645440.0, + "grad_norm": 1.5164557892601689, + "language_loss": 0.67091215, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.7478981, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.1137085, + "step": 8562, + "time_per_iteration": 2.587117910385132 + }, + { + "auxiliary_loss_clip": 0.06434688, + "auxiliary_loss_mlp": 0.0126818, + "balance_loss_clip": 0.06283362, + "balance_loss_mlp": 0.01256945, + "epoch": 0.5148354125958214, + "flos": 22750821509760.0, + "grad_norm": 1.6017474228640745, + "language_loss": 0.77982432, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.85685301, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11248779, + "step": 8563, + "time_per_iteration": 2.5995922088623047 + }, + { + "auxiliary_loss_clip": 0.06441233, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06284129, + "balance_loss_mlp": 0.01257235, + "epoch": 0.5148955358484893, + "flos": 23074253470080.0, + "grad_norm": 2.0871441030394426, + "language_loss": 0.83276081, + "learning_rate": 2.0005841925139e-06, + "loss": 0.90987211, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12658691, + "step": 8564, + "time_per_iteration": 2.5510189533233643 + }, + { + "auxiliary_loss_clip": 0.06436282, + "auxiliary_loss_mlp": 0.01266369, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01253918, + "epoch": 0.5149556591011574, + "flos": 20346465726720.0, + "grad_norm": 3.2981963875061915, + "language_loss": 0.73735076, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.81437725, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12451172, + "step": 8565, + "time_per_iteration": 2.565485715866089 + }, + { + "auxiliary_loss_clip": 0.06439919, + "auxiliary_loss_mlp": 0.01271905, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01259478, + "epoch": 0.5150157823538254, + "flos": 22644869621760.0, + "grad_norm": 2.0080537974138424, + "language_loss": 0.6841439, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.76126206, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12438965, + "step": 8566, + "time_per_iteration": 2.540060520172119 + }, + { + "auxiliary_loss_clip": 0.06439756, + "auxiliary_loss_mlp": 0.01270124, + "balance_loss_clip": 0.06282447, + "balance_loss_mlp": 0.0125828, + "epoch": 0.5150759056064933, + "flos": 26074795852800.0, + "grad_norm": 1.7193676063763261, + "language_loss": 0.78763425, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.86473316, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11834717, + "step": 8567, + "time_per_iteration": 2.610316276550293 + }, + { + "auxiliary_loss_clip": 0.06433998, + "auxiliary_loss_mlp": 0.0126364, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5151360288591613, + "flos": 25958865329280.0, + "grad_norm": 1.8031823951648205, + "language_loss": 0.79058564, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.86756206, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12091064, + "step": 8568, + "time_per_iteration": 2.5746078491210938 + }, + { + "auxiliary_loss_clip": 0.06425972, + "auxiliary_loss_mlp": 0.01263804, + "balance_loss_clip": 0.06277977, + "balance_loss_mlp": 0.01252705, + "epoch": 0.5151961521118292, + "flos": 18511840581120.0, + "grad_norm": 2.107330893228774, + "language_loss": 0.90881652, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.98571432, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11096191, + "step": 8569, + "time_per_iteration": 2.5259969234466553 + }, + { + "auxiliary_loss_clip": 0.06436515, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06282495, + "balance_loss_mlp": 0.01261225, + "epoch": 0.5152562753644973, + "flos": 22239734330880.0, + "grad_norm": 1.7160477900396784, + "language_loss": 0.77020866, + "learning_rate": 1.998247422657674e-06, + "loss": 0.84730774, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12188721, + "step": 8570, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.06435493, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01256817, + "epoch": 0.5153163986171652, + "flos": 38445833784960.0, + "grad_norm": 1.5069722692963965, + "language_loss": 0.73508942, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.81214285, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1305542, + "step": 8571, + "time_per_iteration": 2.6566643714904785 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01250073, + "epoch": 0.5153765218698332, + "flos": 66404533783680.0, + "grad_norm": 0.7650204220049751, + "language_loss": 0.52955389, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.60542989, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.64990234, + "router_z_loss_mlp": 0.01826477, + "step": 8572, + "time_per_iteration": 3.231537103652954 + }, + { + "auxiliary_loss_clip": 0.06429811, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06284182, + "balance_loss_mlp": 0.01257622, + "epoch": 0.5154366451225011, + "flos": 24031537240320.0, + "grad_norm": 1.6307698114257092, + "language_loss": 0.76929724, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.84628952, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11791992, + "step": 8573, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.06429262, + "auxiliary_loss_mlp": 0.01267008, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01255099, + "epoch": 0.5154967683751691, + "flos": 23474189808000.0, + "grad_norm": 2.3679054324331967, + "language_loss": 0.77109015, + "learning_rate": 1.996689577219102e-06, + "loss": 0.84805286, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11914062, + "step": 8574, + "time_per_iteration": 2.53300404548645 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01263951, + "balance_loss_clip": 0.06281316, + "balance_loss_mlp": 0.01252691, + "epoch": 0.515556891627837, + "flos": 23812463940480.0, + "grad_norm": 1.7644957150045186, + "language_loss": 0.85785985, + "learning_rate": 1.996300116136367e-06, + "loss": 0.93478966, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11248779, + "step": 8575, + "time_per_iteration": 2.577409029006958 + }, + { + "auxiliary_loss_clip": 0.06435408, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06283233, + "balance_loss_mlp": 0.01253859, + "epoch": 0.515617014880505, + "flos": 19834665788160.0, + "grad_norm": 1.5082721708333224, + "language_loss": 0.76947051, + "learning_rate": 1.995910655193932e-06, + "loss": 0.84648347, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1204834, + "step": 8576, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.06444222, + "auxiliary_loss_mlp": 0.01270832, + "balance_loss_clip": 0.06283684, + "balance_loss_mlp": 0.01258083, + "epoch": 0.515677138133173, + "flos": 14251042863360.0, + "grad_norm": 2.2995750246066406, + "language_loss": 0.75517124, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.83232176, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12762451, + "step": 8577, + "time_per_iteration": 2.518495559692383 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01257037, + "epoch": 0.515737261385841, + "flos": 28296653443200.0, + "grad_norm": 4.0524023742876345, + "language_loss": 0.81602645, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.89309716, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13049316, + "step": 8578, + "time_per_iteration": 2.5854508876800537 + }, + { + "auxiliary_loss_clip": 0.06431551, + "auxiliary_loss_mlp": 0.01266524, + "balance_loss_clip": 0.06281303, + "balance_loss_mlp": 0.01254746, + "epoch": 0.515797384638509, + "flos": 27899400435840.0, + "grad_norm": 1.724028071509101, + "language_loss": 0.7613306, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.83831137, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11785889, + "step": 8579, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.06434369, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01253053, + "epoch": 0.5158575078911769, + "flos": 23046860384640.0, + "grad_norm": 1.6181814769530192, + "language_loss": 0.79290402, + "learning_rate": 1.994352813122559e-06, + "loss": 0.86990273, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12457275, + "step": 8580, + "time_per_iteration": 2.5879290103912354 + }, + { + "auxiliary_loss_clip": 0.0643789, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06283616, + "balance_loss_mlp": 0.01254763, + "epoch": 0.5159176311438449, + "flos": 12646350120960.0, + "grad_norm": 1.9944005001089613, + "language_loss": 0.73488963, + "learning_rate": 1.99396335310315e-06, + "loss": 0.81195444, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1383667, + "step": 8581, + "time_per_iteration": 2.500063180923462 + }, + { + "auxiliary_loss_clip": 0.06434488, + "auxiliary_loss_mlp": 0.01266672, + "balance_loss_clip": 0.06284754, + "balance_loss_mlp": 0.01254781, + "epoch": 0.5159777543965128, + "flos": 15563302456320.0, + "grad_norm": 1.882801773214852, + "language_loss": 0.74207276, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.81908435, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11901855, + "step": 8582, + "time_per_iteration": 2.518564462661743 + }, + { + "auxiliary_loss_clip": 0.06429887, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06280613, + "balance_loss_mlp": 0.01254648, + "epoch": 0.5160378776491809, + "flos": 23228352328320.0, + "grad_norm": 1.8807127189493567, + "language_loss": 0.66238904, + "learning_rate": 1.99318443376583e-06, + "loss": 0.73934591, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11157227, + "step": 8583, + "time_per_iteration": 2.542539119720459 + }, + { + "auxiliary_loss_clip": 0.06437095, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01257404, + "epoch": 0.5160980009018488, + "flos": 21951074615040.0, + "grad_norm": 1.3417837681818925, + "language_loss": 0.760252, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.83732229, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12524414, + "step": 8584, + "time_per_iteration": 2.587082624435425 + }, + { + "auxiliary_loss_clip": 0.06437847, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01253579, + "epoch": 0.5161581241545168, + "flos": 22790708853120.0, + "grad_norm": 1.8159571462416286, + "language_loss": 0.78972226, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.866768, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13153076, + "step": 8585, + "time_per_iteration": 3.918409824371338 + }, + { + "auxiliary_loss_clip": 0.06432407, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06287332, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5162182474071847, + "flos": 19680273440640.0, + "grad_norm": 1.974004410778628, + "language_loss": 0.81013006, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.88714075, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11578369, + "step": 8586, + "time_per_iteration": 2.4944536685943604 + }, + { + "auxiliary_loss_clip": 0.06434685, + "auxiliary_loss_mlp": 0.01270978, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01258473, + "epoch": 0.5162783706598527, + "flos": 20052145860480.0, + "grad_norm": 2.892216813448522, + "language_loss": 0.71914274, + "learning_rate": 1.991626598310701e-06, + "loss": 0.79619938, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.125, + "step": 8587, + "time_per_iteration": 2.500964403152466 + }, + { + "auxiliary_loss_clip": 0.06328937, + "auxiliary_loss_mlp": 0.01260473, + "balance_loss_clip": 0.06264381, + "balance_loss_mlp": 0.01258639, + "epoch": 0.5163384939125206, + "flos": 69980089610880.0, + "grad_norm": 0.7154986672608752, + "language_loss": 0.57844335, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.65433741, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01829529, + "step": 8588, + "time_per_iteration": 4.569206476211548 + }, + { + "auxiliary_loss_clip": 0.06434999, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06281946, + "balance_loss_mlp": 0.01254618, + "epoch": 0.5163986171651886, + "flos": 17422176159360.0, + "grad_norm": 8.344302755834537, + "language_loss": 0.75224382, + "learning_rate": 1.990847682429185e-06, + "loss": 0.82927144, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13134766, + "step": 8589, + "time_per_iteration": 2.551936388015747 + }, + { + "auxiliary_loss_clip": 0.06436837, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0628375, + "balance_loss_mlp": 0.01254607, + "epoch": 0.5164587404178566, + "flos": 21328752741120.0, + "grad_norm": 1.4649655682055334, + "language_loss": 0.67921245, + "learning_rate": 1.990458225001627e-06, + "loss": 0.75623721, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11035156, + "step": 8590, + "time_per_iteration": 2.5104808807373047 + }, + { + "auxiliary_loss_clip": 0.06330067, + "auxiliary_loss_mlp": 0.01255277, + "balance_loss_clip": 0.06265621, + "balance_loss_mlp": 0.01253319, + "epoch": 0.5165188636705246, + "flos": 68076506954880.0, + "grad_norm": 0.7672531816981234, + "language_loss": 0.55843657, + "learning_rate": 1.990068767935895e-06, + "loss": 0.63428998, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.64550781, + "router_z_loss_mlp": 0.01956177, + "step": 8591, + "time_per_iteration": 3.0606987476348877 + }, + { + "auxiliary_loss_clip": 0.06426874, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01253261, + "epoch": 0.5165789869231926, + "flos": 19390859038080.0, + "grad_norm": 1.5432128891960295, + "language_loss": 0.81508362, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.89199233, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10736084, + "step": 8592, + "time_per_iteration": 2.5063397884368896 + }, + { + "auxiliary_loss_clip": 0.0642782, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06281757, + "balance_loss_mlp": 0.01251626, + "epoch": 0.5166391101758605, + "flos": 20966607394560.0, + "grad_norm": 1.7131386706837877, + "language_loss": 0.83462119, + "learning_rate": 1.989289854948979e-06, + "loss": 0.91152561, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11010742, + "step": 8593, + "time_per_iteration": 3.951284170150757 + }, + { + "auxiliary_loss_clip": 0.06431139, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06281991, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5166992334285285, + "flos": 29470411036800.0, + "grad_norm": 1.8647556534792968, + "language_loss": 0.69381714, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.77078462, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11761475, + "step": 8594, + "time_per_iteration": 2.600724220275879 + }, + { + "auxiliary_loss_clip": 0.06431773, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5167593566811964, + "flos": 20310813014400.0, + "grad_norm": 1.4700297891307748, + "language_loss": 0.77611995, + "learning_rate": 1.988510943586582e-06, + "loss": 0.85309899, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1171875, + "step": 8595, + "time_per_iteration": 2.5478954315185547 + }, + { + "auxiliary_loss_clip": 0.06431342, + "auxiliary_loss_mlp": 0.01266673, + "balance_loss_clip": 0.06281155, + "balance_loss_mlp": 0.01255563, + "epoch": 0.5168194799338645, + "flos": 14616668154240.0, + "grad_norm": 1.457832438333805, + "language_loss": 0.65828246, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.73526263, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11114502, + "step": 8596, + "time_per_iteration": 2.5720162391662598 + }, + { + "auxiliary_loss_clip": 0.06432624, + "auxiliary_loss_mlp": 0.01271477, + "balance_loss_clip": 0.06281975, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5168796031865324, + "flos": 25013866181760.0, + "grad_norm": 1.4915456509806782, + "language_loss": 0.75734007, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.8343811, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12988281, + "step": 8597, + "time_per_iteration": 2.5495989322662354 + }, + { + "auxiliary_loss_clip": 0.06427812, + "auxiliary_loss_mlp": 0.01266343, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5169397264392004, + "flos": 26946728640000.0, + "grad_norm": 1.7231987845025152, + "language_loss": 0.8152492, + "learning_rate": 1.987342579847403e-06, + "loss": 0.89219069, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11773682, + "step": 8598, + "time_per_iteration": 2.6746177673339844 + }, + { + "auxiliary_loss_clip": 0.06427282, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06279184, + "balance_loss_mlp": 0.0125523, + "epoch": 0.5169998496918683, + "flos": 25414347571200.0, + "grad_norm": 1.537627068096994, + "language_loss": 0.7597698, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.83671683, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12194824, + "step": 8599, + "time_per_iteration": 2.548478841781616 + }, + { + "auxiliary_loss_clip": 0.06428513, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01253291, + "epoch": 0.5170599729445363, + "flos": 24687667036800.0, + "grad_norm": 4.521028695007152, + "language_loss": 0.72775459, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.80468118, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10852051, + "step": 8600, + "time_per_iteration": 3.977342367172241 + }, + { + "auxiliary_loss_clip": 0.06427286, + "auxiliary_loss_mlp": 0.01268182, + "balance_loss_clip": 0.06278619, + "balance_loss_mlp": 0.01256732, + "epoch": 0.5171200961972042, + "flos": 21000499171200.0, + "grad_norm": 1.369345328324843, + "language_loss": 0.74472946, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.82168412, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11444092, + "step": 8601, + "time_per_iteration": 2.5409762859344482 + }, + { + "auxiliary_loss_clip": 0.06429532, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01258467, + "epoch": 0.5171802194498722, + "flos": 22751953539840.0, + "grad_norm": 1.8713669852223682, + "language_loss": 0.83940291, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.9164089, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12597656, + "step": 8602, + "time_per_iteration": 2.5086002349853516 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01254026, + "epoch": 0.5172403427025402, + "flos": 28183070833920.0, + "grad_norm": 1.835239532551919, + "language_loss": 0.74816436, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.82513469, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1192627, + "step": 8603, + "time_per_iteration": 2.628830909729004 + }, + { + "auxiliary_loss_clip": 0.06434101, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06281082, + "balance_loss_mlp": 0.01255566, + "epoch": 0.5173004659552082, + "flos": 20343782396160.0, + "grad_norm": 2.436721116583926, + "language_loss": 0.73165393, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.80867082, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12017822, + "step": 8604, + "time_per_iteration": 2.521681785583496 + }, + { + "auxiliary_loss_clip": 0.06440152, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06279815, + "balance_loss_mlp": 0.01254469, + "epoch": 0.5173605892078762, + "flos": 19069481502720.0, + "grad_norm": 1.6971244246662016, + "language_loss": 0.85418487, + "learning_rate": 1.984616415277469e-06, + "loss": 0.93127012, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13922119, + "step": 8605, + "time_per_iteration": 2.5182762145996094 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258893, + "epoch": 0.5174207124605441, + "flos": 28001620817280.0, + "grad_norm": 1.308601391892793, + "language_loss": 0.64964187, + "learning_rate": 1.984226965411294e-06, + "loss": 0.72665358, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1138916, + "step": 8606, + "time_per_iteration": 2.5762083530426025 + }, + { + "auxiliary_loss_clip": 0.06431288, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.0125362, + "epoch": 0.5174808357132121, + "flos": 19502135660160.0, + "grad_norm": 1.5729301555613031, + "language_loss": 0.78141046, + "learning_rate": 1.983837516143234e-06, + "loss": 0.85837877, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11914062, + "step": 8607, + "time_per_iteration": 2.5321435928344727 + }, + { + "auxiliary_loss_clip": 0.06431965, + "auxiliary_loss_mlp": 0.01271738, + "balance_loss_clip": 0.06280412, + "balance_loss_mlp": 0.01259049, + "epoch": 0.51754095896588, + "flos": 22790834634240.0, + "grad_norm": 1.7409540075434562, + "language_loss": 0.72313815, + "learning_rate": 1.983448067488057e-06, + "loss": 0.80017519, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 8608, + "time_per_iteration": 2.52758526802063 + }, + { + "auxiliary_loss_clip": 0.06435958, + "auxiliary_loss_mlp": 0.01273384, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01261046, + "epoch": 0.5176010822185481, + "flos": 22674987964800.0, + "grad_norm": 1.7194792439439102, + "language_loss": 0.86816031, + "learning_rate": 1.983058619460531e-06, + "loss": 0.94525373, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12341309, + "step": 8609, + "time_per_iteration": 2.538146495819092 + }, + { + "auxiliary_loss_clip": 0.06431948, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06280786, + "balance_loss_mlp": 0.01252201, + "epoch": 0.517661205471216, + "flos": 23957967755520.0, + "grad_norm": 2.0604849644666943, + "language_loss": 0.73853832, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.81549335, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11352539, + "step": 8610, + "time_per_iteration": 2.5313732624053955 + }, + { + "auxiliary_loss_clip": 0.064363, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06279181, + "balance_loss_mlp": 0.01258051, + "epoch": 0.517721328723884, + "flos": 15601470791040.0, + "grad_norm": 2.184245135297296, + "language_loss": 0.67738098, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.75445139, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12689209, + "step": 8611, + "time_per_iteration": 2.510500431060791 + }, + { + "auxiliary_loss_clip": 0.06427399, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01253153, + "epoch": 0.5177814519765519, + "flos": 20966607394560.0, + "grad_norm": 1.678614110348905, + "language_loss": 0.77387339, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.85080469, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12573242, + "step": 8612, + "time_per_iteration": 2.5206472873687744 + }, + { + "auxiliary_loss_clip": 0.064338, + "auxiliary_loss_mlp": 0.01269204, + "balance_loss_clip": 0.0628019, + "balance_loss_mlp": 0.01257641, + "epoch": 0.5178415752292199, + "flos": 17973653806080.0, + "grad_norm": 1.9437798274552756, + "language_loss": 0.82318223, + "learning_rate": 1.981500833922294e-06, + "loss": 0.90021223, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11560059, + "step": 8613, + "time_per_iteration": 2.4999184608459473 + }, + { + "auxiliary_loss_clip": 0.06431679, + "auxiliary_loss_mlp": 0.01268922, + "balance_loss_clip": 0.062784, + "balance_loss_mlp": 0.01255511, + "epoch": 0.5179016984818878, + "flos": 17827227596160.0, + "grad_norm": 2.2958122780571473, + "language_loss": 0.66944718, + "learning_rate": 1.981111389254541e-06, + "loss": 0.74645323, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.1340332, + "step": 8614, + "time_per_iteration": 2.480762004852295 + }, + { + "auxiliary_loss_clip": 0.06432712, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06278278, + "balance_loss_mlp": 0.01257465, + "epoch": 0.5179618217345558, + "flos": 17826011712000.0, + "grad_norm": 1.8941766649542733, + "language_loss": 0.87114352, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.94817036, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12493896, + "step": 8615, + "time_per_iteration": 2.500279188156128 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01270372, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01258731, + "epoch": 0.5180219449872238, + "flos": 22527639360000.0, + "grad_norm": 1.466896191984659, + "language_loss": 0.80947113, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8864857, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11639404, + "step": 8616, + "time_per_iteration": 2.523977279663086 + }, + { + "auxiliary_loss_clip": 0.06436383, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.0627937, + "balance_loss_mlp": 0.01257554, + "epoch": 0.5180820682398918, + "flos": 23922356970240.0, + "grad_norm": 2.681335053285678, + "language_loss": 0.75563776, + "learning_rate": 1.9799430596079e-06, + "loss": 0.83270454, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12744141, + "step": 8617, + "time_per_iteration": 2.5584635734558105 + }, + { + "auxiliary_loss_clip": 0.0643236, + "auxiliary_loss_mlp": 0.01270738, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01258215, + "epoch": 0.5181421914925598, + "flos": 16985119662720.0, + "grad_norm": 2.384459515549961, + "language_loss": 0.70321333, + "learning_rate": 1.979553617893785e-06, + "loss": 0.78024429, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12518311, + "step": 8618, + "time_per_iteration": 2.4864299297332764 + }, + { + "auxiliary_loss_clip": 0.06326556, + "auxiliary_loss_mlp": 0.01258187, + "balance_loss_clip": 0.0626248, + "balance_loss_mlp": 0.01256348, + "epoch": 0.5182023147452277, + "flos": 66080472917760.0, + "grad_norm": 0.9021946533901657, + "language_loss": 0.6731512, + "learning_rate": 1.979164176954999e-06, + "loss": 0.74899864, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01834106, + "step": 8619, + "time_per_iteration": 3.1113593578338623 + }, + { + "auxiliary_loss_clip": 0.06429242, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06279487, + "balance_loss_mlp": 0.01256235, + "epoch": 0.5182624379978957, + "flos": 18193775281920.0, + "grad_norm": 1.7875432352275369, + "language_loss": 0.79252517, + "learning_rate": 1.97877473680631e-06, + "loss": 0.86949891, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11883545, + "step": 8620, + "time_per_iteration": 2.490337371826172 + }, + { + "auxiliary_loss_clip": 0.06426805, + "auxiliary_loss_mlp": 0.01265045, + "balance_loss_clip": 0.06278054, + "balance_loss_mlp": 0.01253815, + "epoch": 0.5183225612505636, + "flos": 14031759928320.0, + "grad_norm": 2.0424555394318347, + "language_loss": 0.82670712, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.90362567, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11236572, + "step": 8621, + "time_per_iteration": 2.5358636379241943 + }, + { + "auxiliary_loss_clip": 0.06430708, + "auxiliary_loss_mlp": 0.01270453, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01257787, + "epoch": 0.5183826845032317, + "flos": 23666582782080.0, + "grad_norm": 3.572556492630201, + "language_loss": 0.65903664, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.73604816, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12664795, + "step": 8622, + "time_per_iteration": 2.5054616928100586 + }, + { + "auxiliary_loss_clip": 0.06440182, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06282417, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5184428077558996, + "flos": 15894155502720.0, + "grad_norm": 2.003886693767472, + "language_loss": 0.60810971, + "learning_rate": 1.977606421248497e-06, + "loss": 0.68520582, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12011719, + "step": 8623, + "time_per_iteration": 2.517026662826538 + }, + { + "auxiliary_loss_clip": 0.06431899, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278786, + "balance_loss_mlp": 0.01256766, + "epoch": 0.5185029310085676, + "flos": 21036864643200.0, + "grad_norm": 1.709310334319468, + "language_loss": 0.76342779, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.84043157, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11712646, + "step": 8624, + "time_per_iteration": 2.5128896236419678 + }, + { + "auxiliary_loss_clip": 0.0643063, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06277324, + "balance_loss_mlp": 0.01251684, + "epoch": 0.5185630542612355, + "flos": 26550062611200.0, + "grad_norm": 2.453361725716909, + "language_loss": 0.71663254, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.79358423, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12854004, + "step": 8625, + "time_per_iteration": 3.9488492012023926 + }, + { + "auxiliary_loss_clip": 0.06427859, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276631, + "balance_loss_mlp": 0.01255378, + "epoch": 0.5186231775139035, + "flos": 20674803150720.0, + "grad_norm": 1.8867804759418334, + "language_loss": 0.68206352, + "learning_rate": 1.976438113333184e-06, + "loss": 0.75901365, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11785889, + "step": 8626, + "time_per_iteration": 2.5555548667907715 + }, + { + "auxiliary_loss_clip": 0.06429964, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01257459, + "epoch": 0.5186833007665714, + "flos": 20891612390400.0, + "grad_norm": 1.918580922134282, + "language_loss": 0.70565557, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.78265989, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.13006592, + "step": 8627, + "time_per_iteration": 2.481426954269409 + }, + { + "auxiliary_loss_clip": 0.0643362, + "auxiliary_loss_mlp": 0.01266564, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01254399, + "epoch": 0.5187434240192395, + "flos": 20893247544960.0, + "grad_norm": 1.7293286755655957, + "language_loss": 0.73529112, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12164307, + "step": 8628, + "time_per_iteration": 3.9418892860412598 + }, + { + "auxiliary_loss_clip": 0.0642761, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.06276411, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5188035472719074, + "flos": 19865203401600.0, + "grad_norm": 1.86469754984735, + "language_loss": 0.77606678, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.85302424, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.1229248, + "step": 8629, + "time_per_iteration": 2.536813974380493 + }, + { + "auxiliary_loss_clip": 0.06431592, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.01255923, + "epoch": 0.5188636705245754, + "flos": 21144032415360.0, + "grad_norm": 2.295438438275443, + "language_loss": 0.74746907, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.82446957, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12536621, + "step": 8630, + "time_per_iteration": 2.5338122844696045 + }, + { + "auxiliary_loss_clip": 0.06432383, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06276915, + "balance_loss_mlp": 0.01253636, + "epoch": 0.5189237937772434, + "flos": 22426467154560.0, + "grad_norm": 1.6718033524216807, + "language_loss": 0.80433989, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.88134158, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.14141846, + "step": 8631, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.06431842, + "auxiliary_loss_mlp": 0.01268253, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01255187, + "epoch": 0.5189839170299113, + "flos": 25453647936000.0, + "grad_norm": 1.4304618482279687, + "language_loss": 0.74388516, + "learning_rate": 1.974101522024942e-06, + "loss": 0.82088614, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1305542, + "step": 8632, + "time_per_iteration": 2.5850229263305664 + }, + { + "auxiliary_loss_clip": 0.06424779, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06277869, + "balance_loss_mlp": 0.01255865, + "epoch": 0.5190440402825793, + "flos": 18593585838720.0, + "grad_norm": 1.7732237266140687, + "language_loss": 0.79105878, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.86799526, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.13018799, + "step": 8633, + "time_per_iteration": 3.944106340408325 + }, + { + "auxiliary_loss_clip": 0.06433854, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06279819, + "balance_loss_mlp": 0.01253492, + "epoch": 0.5191041635352472, + "flos": 21915170340480.0, + "grad_norm": 1.7747709828095277, + "language_loss": 0.80929339, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.88628888, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12200928, + "step": 8634, + "time_per_iteration": 2.4922289848327637 + }, + { + "auxiliary_loss_clip": 0.0643179, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06280308, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5191642867879153, + "flos": 27535536080640.0, + "grad_norm": 1.4623629686344204, + "language_loss": 0.69064617, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.76765239, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11846924, + "step": 8635, + "time_per_iteration": 2.5806636810302734 + }, + { + "auxiliary_loss_clip": 0.06433641, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06278556, + "balance_loss_mlp": 0.01257356, + "epoch": 0.5192244100405832, + "flos": 15711489601920.0, + "grad_norm": 1.5680222184402974, + "language_loss": 0.77829492, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.85532898, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.12414551, + "step": 8636, + "time_per_iteration": 2.5346691608428955 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01256122, + "epoch": 0.5192845332932512, + "flos": 12061903092480.0, + "grad_norm": 2.0443106284945016, + "language_loss": 0.72005326, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7971167, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.12768555, + "step": 8637, + "time_per_iteration": 2.5669779777526855 + }, + { + "auxiliary_loss_clip": 0.06428012, + "auxiliary_loss_mlp": 0.01270032, + "balance_loss_clip": 0.06279644, + "balance_loss_mlp": 0.01257724, + "epoch": 0.5193446565459191, + "flos": 18959211129600.0, + "grad_norm": 2.0277263511036625, + "language_loss": 0.76600313, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.8429836, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12298584, + "step": 8638, + "time_per_iteration": 2.4836151599884033 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.012673, + "balance_loss_clip": 0.06276545, + "balance_loss_mlp": 0.0125548, + "epoch": 0.5194047797985871, + "flos": 20381028336000.0, + "grad_norm": 1.8081920937255338, + "language_loss": 0.74863744, + "learning_rate": 1.971375543740272e-06, + "loss": 0.82558322, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11828613, + "step": 8639, + "time_per_iteration": 2.508589029312134 + }, + { + "auxiliary_loss_clip": 0.06432048, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06280512, + "balance_loss_mlp": 0.01258045, + "epoch": 0.519464903051255, + "flos": 24359916591360.0, + "grad_norm": 1.679129082437046, + "language_loss": 0.77792585, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.85495287, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12628174, + "step": 8640, + "time_per_iteration": 4.030183553695679 + }, + { + "auxiliary_loss_clip": 0.06430673, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01256482, + "epoch": 0.519525026303923, + "flos": 14066657953920.0, + "grad_norm": 1.8086687453592558, + "language_loss": 0.66518152, + "learning_rate": 1.97059670234927e-06, + "loss": 0.74217027, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11700439, + "step": 8641, + "time_per_iteration": 2.471047878265381 + }, + { + "auxiliary_loss_clip": 0.06427969, + "auxiliary_loss_mlp": 0.01270672, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01259228, + "epoch": 0.519585149556591, + "flos": 28842722501760.0, + "grad_norm": 1.7536948571823123, + "language_loss": 0.76330602, + "learning_rate": 1.97020728331885e-06, + "loss": 0.84029233, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11456299, + "step": 8642, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.06428998, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254374, + "epoch": 0.519645272809259, + "flos": 25379826888960.0, + "grad_norm": 21.827473826572724, + "language_loss": 0.83256245, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.90951395, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11767578, + "step": 8643, + "time_per_iteration": 2.547438621520996 + }, + { + "auxiliary_loss_clip": 0.06436369, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06280598, + "balance_loss_mlp": 0.01255508, + "epoch": 0.519705396061927, + "flos": 25379659180800.0, + "grad_norm": 1.5731350893002956, + "language_loss": 0.70531744, + "learning_rate": 1.969428448662004e-06, + "loss": 0.78236687, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13067627, + "step": 8644, + "time_per_iteration": 2.5876879692077637 + }, + { + "auxiliary_loss_clip": 0.06430183, + "auxiliary_loss_mlp": 0.01266621, + "balance_loss_clip": 0.0627798, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5197655193145949, + "flos": 28483889391360.0, + "grad_norm": 1.5934186274855324, + "language_loss": 0.80385697, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.88082504, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11804199, + "step": 8645, + "time_per_iteration": 2.574620246887207 + }, + { + "auxiliary_loss_clip": 0.06430401, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01258898, + "epoch": 0.5198256425672629, + "flos": 20014983774720.0, + "grad_norm": 1.690489867798711, + "language_loss": 0.78455305, + "learning_rate": 1.968649618642264e-06, + "loss": 0.86156821, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12207031, + "step": 8646, + "time_per_iteration": 2.6401519775390625 + }, + { + "auxiliary_loss_clip": 0.06429573, + "auxiliary_loss_mlp": 0.01268342, + "balance_loss_clip": 0.06279829, + "balance_loss_mlp": 0.01256243, + "epoch": 0.5198857658199308, + "flos": 19835043131520.0, + "grad_norm": 2.3656488760516132, + "language_loss": 0.66367847, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.74065757, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12091064, + "step": 8647, + "time_per_iteration": 2.599353551864624 + }, + { + "auxiliary_loss_clip": 0.06438218, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06282619, + "balance_loss_mlp": 0.0125462, + "epoch": 0.5199458890725989, + "flos": 24468761445120.0, + "grad_norm": 1.778197055342432, + "language_loss": 0.71491444, + "learning_rate": 1.967870793377763e-06, + "loss": 0.79197794, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13513184, + "step": 8648, + "time_per_iteration": 2.572368860244751 + }, + { + "auxiliary_loss_clip": 0.06438164, + "auxiliary_loss_mlp": 0.01268937, + "balance_loss_clip": 0.06285776, + "balance_loss_mlp": 0.01255884, + "epoch": 0.5200060123252668, + "flos": 23411605207680.0, + "grad_norm": 2.1583755088943875, + "language_loss": 0.64699459, + "learning_rate": 1.967481382565642e-06, + "loss": 0.72406554, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13031006, + "step": 8649, + "time_per_iteration": 2.5117433071136475 + }, + { + "auxiliary_loss_clip": 0.06439677, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281672, + "balance_loss_mlp": 0.01260778, + "epoch": 0.5200661355779348, + "flos": 17207002074240.0, + "grad_norm": 5.161359302041442, + "language_loss": 0.70409989, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.78123897, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.13446045, + "step": 8650, + "time_per_iteration": 2.5144400596618652 + }, + { + "auxiliary_loss_clip": 0.06431218, + "auxiliary_loss_mlp": 0.01268732, + "balance_loss_clip": 0.06279574, + "balance_loss_mlp": 0.01256936, + "epoch": 0.5201262588306027, + "flos": 18520980675840.0, + "grad_norm": 1.6145243882323275, + "language_loss": 0.78030795, + "learning_rate": 1.966702564655496e-06, + "loss": 0.85730743, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11791992, + "step": 8651, + "time_per_iteration": 2.467643976211548 + }, + { + "auxiliary_loss_clip": 0.06437017, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06283189, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5201863820832707, + "flos": 18624458868480.0, + "grad_norm": 1.6266187944599841, + "language_loss": 0.79176587, + "learning_rate": 1.966313157587003e-06, + "loss": 0.86880493, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13171387, + "step": 8652, + "time_per_iteration": 2.5569629669189453 + }, + { + "auxiliary_loss_clip": 0.06434878, + "auxiliary_loss_mlp": 0.01268954, + "balance_loss_clip": 0.0628317, + "balance_loss_mlp": 0.01255919, + "epoch": 0.5202465053359386, + "flos": 22863817140480.0, + "grad_norm": 1.9022927985659936, + "language_loss": 0.70460284, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.78164113, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13049316, + "step": 8653, + "time_per_iteration": 2.5013556480407715 + }, + { + "auxiliary_loss_clip": 0.06435711, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06279919, + "balance_loss_mlp": 0.01257124, + "epoch": 0.5203066285886067, + "flos": 21988068992640.0, + "grad_norm": 1.7386916801416297, + "language_loss": 0.78877962, + "learning_rate": 1.965534347297008e-06, + "loss": 0.86584258, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13452148, + "step": 8654, + "time_per_iteration": 2.5205516815185547 + }, + { + "auxiliary_loss_clip": 0.06439671, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06283241, + "balance_loss_mlp": 0.01258763, + "epoch": 0.5203667518412746, + "flos": 20240094568320.0, + "grad_norm": 1.7537160659546802, + "language_loss": 0.84438735, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.92150223, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13043213, + "step": 8655, + "time_per_iteration": 2.523545026779175 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264722, + "balance_loss_clip": 0.06279121, + "balance_loss_mlp": 0.01253027, + "epoch": 0.5204268750939426, + "flos": 15710860696320.0, + "grad_norm": 2.477748600032862, + "language_loss": 0.66631675, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.74324131, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11688232, + "step": 8656, + "time_per_iteration": 2.504314661026001 + }, + { + "auxiliary_loss_clip": 0.06430535, + "auxiliary_loss_mlp": 0.01266767, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01254203, + "epoch": 0.5204869983466105, + "flos": 27456096810240.0, + "grad_norm": 1.7743424381892883, + "language_loss": 0.73250526, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.80947828, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12567139, + "step": 8657, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06431027, + "auxiliary_loss_mlp": 0.01268378, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01255611, + "epoch": 0.5205471215992785, + "flos": 20601820644480.0, + "grad_norm": 1.9136699042437477, + "language_loss": 0.71553123, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.79252529, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12756348, + "step": 8658, + "time_per_iteration": 2.523796796798706 + }, + { + "auxiliary_loss_clip": 0.06426262, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06275812, + "balance_loss_mlp": 0.01255669, + "epoch": 0.5206072448519465, + "flos": 22134537129600.0, + "grad_norm": 1.8507369766537312, + "language_loss": 0.83638287, + "learning_rate": 1.963587344701897e-06, + "loss": 0.91332769, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12554932, + "step": 8659, + "time_per_iteration": 2.5169432163238525 + }, + { + "auxiliary_loss_clip": 0.06437267, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277223, + "balance_loss_mlp": 0.01255587, + "epoch": 0.5206673681046144, + "flos": 18335924933760.0, + "grad_norm": 2.050641453841446, + "language_loss": 0.75738013, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.83444965, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14093018, + "step": 8660, + "time_per_iteration": 2.557415723800659 + }, + { + "auxiliary_loss_clip": 0.06428091, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06277187, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5207274913572825, + "flos": 20236488842880.0, + "grad_norm": 1.6215362458867588, + "language_loss": 0.77692747, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.85389173, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12231445, + "step": 8661, + "time_per_iteration": 2.509428024291992 + }, + { + "auxiliary_loss_clip": 0.06431398, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01254112, + "epoch": 0.5207876146099504, + "flos": 22133530880640.0, + "grad_norm": 1.7321078317719976, + "language_loss": 0.70359308, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.78056741, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1192627, + "step": 8662, + "time_per_iteration": 2.5810325145721436 + }, + { + "auxiliary_loss_clip": 0.0642472, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01257169, + "epoch": 0.5208477378626184, + "flos": 23885781863040.0, + "grad_norm": 1.845579934529664, + "language_loss": 0.70074278, + "learning_rate": 1.962029767391098e-06, + "loss": 0.77769035, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12872314, + "step": 8663, + "time_per_iteration": 2.528122901916504 + }, + { + "auxiliary_loss_clip": 0.06433125, + "auxiliary_loss_mlp": 0.01272195, + "balance_loss_clip": 0.06282328, + "balance_loss_mlp": 0.01259619, + "epoch": 0.5209078611152863, + "flos": 20968158695040.0, + "grad_norm": 1.5162641399491859, + "language_loss": 0.77111858, + "learning_rate": 1.961640376626072e-06, + "loss": 0.84817183, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12591553, + "step": 8664, + "time_per_iteration": 3.9675118923187256 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01274545, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01261641, + "epoch": 0.5209679843679543, + "flos": 20674006536960.0, + "grad_norm": 1.9585914111684504, + "language_loss": 0.76477247, + "learning_rate": 1.961250987315646e-06, + "loss": 0.84178591, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12915039, + "step": 8665, + "time_per_iteration": 2.541412830352783 + }, + { + "auxiliary_loss_clip": 0.06427725, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06278466, + "balance_loss_mlp": 0.01260593, + "epoch": 0.5210281076206222, + "flos": 20233050825600.0, + "grad_norm": 1.6923585849410518, + "language_loss": 0.72734976, + "learning_rate": 1.960861599474586e-06, + "loss": 0.80435228, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11920166, + "step": 8666, + "time_per_iteration": 2.4996509552001953 + }, + { + "auxiliary_loss_clip": 0.06442789, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.0628055, + "balance_loss_mlp": 0.01256199, + "epoch": 0.5210882308732903, + "flos": 16075395884160.0, + "grad_norm": 2.8085912573953093, + "language_loss": 0.69292629, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.77006412, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.14794922, + "step": 8667, + "time_per_iteration": 3.966068744659424 + }, + { + "auxiliary_loss_clip": 0.06427799, + "auxiliary_loss_mlp": 0.0127319, + "balance_loss_clip": 0.06280097, + "balance_loss_mlp": 0.01261793, + "epoch": 0.5211483541259582, + "flos": 24831954967680.0, + "grad_norm": 1.4529640974986662, + "language_loss": 0.8142345, + "learning_rate": 1.960082828259629e-06, + "loss": 0.89124429, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11401367, + "step": 8668, + "time_per_iteration": 2.531757116317749 + }, + { + "auxiliary_loss_clip": 0.06428734, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06277529, + "balance_loss_mlp": 0.01253485, + "epoch": 0.5212084773786262, + "flos": 20375997091200.0, + "grad_norm": 2.3545461183864793, + "language_loss": 0.6399523, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.71689939, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12493896, + "step": 8669, + "time_per_iteration": 2.582458019256592 + }, + { + "auxiliary_loss_clip": 0.06433244, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06281579, + "balance_loss_mlp": 0.01257846, + "epoch": 0.5212686006312941, + "flos": 23151596388480.0, + "grad_norm": 1.5489696479352357, + "language_loss": 0.66586244, + "learning_rate": 1.959304063099325e-06, + "loss": 0.74289578, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12261963, + "step": 8670, + "time_per_iteration": 2.5730559825897217 + }, + { + "auxiliary_loss_clip": 0.0642543, + "auxiliary_loss_mlp": 0.01273699, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01262195, + "epoch": 0.5213287238839621, + "flos": 27780073822080.0, + "grad_norm": 2.549693242202028, + "language_loss": 0.76187384, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.83886516, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11505127, + "step": 8671, + "time_per_iteration": 2.5233168601989746 + }, + { + "auxiliary_loss_clip": 0.064327, + "auxiliary_loss_mlp": 0.01274872, + "balance_loss_clip": 0.06278658, + "balance_loss_mlp": 0.01262534, + "epoch": 0.5213888471366301, + "flos": 19943762204160.0, + "grad_norm": 1.8121341163261586, + "language_loss": 0.78893673, + "learning_rate": 1.958525304111796e-06, + "loss": 0.86601251, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12341309, + "step": 8672, + "time_per_iteration": 3.9492485523223877 + }, + { + "auxiliary_loss_clip": 0.06431769, + "auxiliary_loss_mlp": 0.01269371, + "balance_loss_clip": 0.06282303, + "balance_loss_mlp": 0.01257957, + "epoch": 0.521448970389298, + "flos": 16988389971840.0, + "grad_norm": 2.0794497937850327, + "language_loss": 0.72609621, + "learning_rate": 1.958135926969736e-06, + "loss": 0.80310762, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11425781, + "step": 8673, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.06430827, + "auxiliary_loss_mlp": 0.01267899, + "balance_loss_clip": 0.06280996, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5215090936419661, + "flos": 18995744309760.0, + "grad_norm": 1.6692646430310563, + "language_loss": 0.75224721, + "learning_rate": 1.957746551415166e-06, + "loss": 0.82923448, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11755371, + "step": 8674, + "time_per_iteration": 2.528323173522949 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01271657, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258812, + "epoch": 0.521569216894634, + "flos": 16148923441920.0, + "grad_norm": 2.0098628900715694, + "language_loss": 0.86161578, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.93865955, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.128479, + "step": 8675, + "time_per_iteration": 2.486656665802002 + }, + { + "auxiliary_loss_clip": 0.06328152, + "auxiliary_loss_mlp": 0.0125317, + "balance_loss_clip": 0.06263625, + "balance_loss_mlp": 0.01251218, + "epoch": 0.521629340147302, + "flos": 57596054296320.0, + "grad_norm": 0.8389911483177593, + "language_loss": 0.62711406, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.70292729, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01950073, + "step": 8676, + "time_per_iteration": 3.09920597076416 + }, + { + "auxiliary_loss_clip": 0.06427533, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06277495, + "balance_loss_mlp": 0.01252839, + "epoch": 0.5216894633999699, + "flos": 26804117790720.0, + "grad_norm": 1.458201451867465, + "language_loss": 0.69111204, + "learning_rate": 1.956578434424046e-06, + "loss": 0.7680313, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11572266, + "step": 8677, + "time_per_iteration": 2.5477073192596436 + }, + { + "auxiliary_loss_clip": 0.06427766, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01255127, + "epoch": 0.5217495866526379, + "flos": 26365803482880.0, + "grad_norm": 1.7210863244717929, + "language_loss": 0.65549737, + "learning_rate": 1.956189065367086e-06, + "loss": 0.73244393, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11749268, + "step": 8678, + "time_per_iteration": 2.566591739654541 + }, + { + "auxiliary_loss_clip": 0.06434263, + "auxiliary_loss_mlp": 0.01268698, + "balance_loss_clip": 0.06280728, + "balance_loss_mlp": 0.01255531, + "epoch": 0.5218097099053058, + "flos": 23590329966720.0, + "grad_norm": 2.9370978110790507, + "language_loss": 0.68504936, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.762079, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1317749, + "step": 8679, + "time_per_iteration": 2.510748863220215 + }, + { + "auxiliary_loss_clip": 0.06433919, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.06281881, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5218698331579739, + "flos": 18083253346560.0, + "grad_norm": 1.6397075137651071, + "language_loss": 0.67471087, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.7517339, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12036133, + "step": 8680, + "time_per_iteration": 3.9219276905059814 + }, + { + "auxiliary_loss_clip": 0.06433384, + "auxiliary_loss_mlp": 0.01271487, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01259595, + "epoch": 0.5219299564106418, + "flos": 19287129283200.0, + "grad_norm": 1.8649470617465917, + "language_loss": 0.83311534, + "learning_rate": 1.955020968223156e-06, + "loss": 0.91016412, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11889648, + "step": 8681, + "time_per_iteration": 2.516465663909912 + }, + { + "auxiliary_loss_clip": 0.06426493, + "auxiliary_loss_mlp": 0.0126523, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01253792, + "epoch": 0.5219900796633098, + "flos": 26658613975680.0, + "grad_norm": 1.6454147062415487, + "language_loss": 0.77514279, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.85205996, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11437988, + "step": 8682, + "time_per_iteration": 2.554325819015503 + }, + { + "auxiliary_loss_clip": 0.06427193, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01254949, + "epoch": 0.5220502029159777, + "flos": 34321148225280.0, + "grad_norm": 1.635540508166305, + "language_loss": 0.693317, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.77025378, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11529541, + "step": 8683, + "time_per_iteration": 2.6571457386016846 + }, + { + "auxiliary_loss_clip": 0.06430393, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06278116, + "balance_loss_mlp": 0.01257629, + "epoch": 0.5221103261686457, + "flos": 22161804433920.0, + "grad_norm": 1.5499745188789709, + "language_loss": 0.76029563, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.83729851, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12255859, + "step": 8684, + "time_per_iteration": 2.5611672401428223 + }, + { + "auxiliary_loss_clip": 0.06422482, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06276357, + "balance_loss_mlp": 0.01256123, + "epoch": 0.5221704494213137, + "flos": 19214440266240.0, + "grad_norm": 1.9689133598672337, + "language_loss": 0.75993264, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.83683455, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11590576, + "step": 8685, + "time_per_iteration": 2.592336416244507 + }, + { + "auxiliary_loss_clip": 0.06433201, + "auxiliary_loss_mlp": 0.01267661, + "balance_loss_clip": 0.06280906, + "balance_loss_mlp": 0.01255549, + "epoch": 0.5222305726739817, + "flos": 19360069862400.0, + "grad_norm": 1.8592295664699974, + "language_loss": 0.81054503, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.88755369, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12103271, + "step": 8686, + "time_per_iteration": 2.529801845550537 + }, + { + "auxiliary_loss_clip": 0.06419135, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06276063, + "balance_loss_mlp": 0.01255021, + "epoch": 0.5222906959266497, + "flos": 27821554392960.0, + "grad_norm": 1.7724306724007597, + "language_loss": 0.7060039, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.78286076, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11535645, + "step": 8687, + "time_per_iteration": 2.580845594406128 + }, + { + "auxiliary_loss_clip": 0.06421649, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06277607, + "balance_loss_mlp": 0.01256297, + "epoch": 0.5223508191793176, + "flos": 12717781326720.0, + "grad_norm": 2.573153086937961, + "language_loss": 0.82975262, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.90663946, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10736084, + "step": 8688, + "time_per_iteration": 2.479219436645508 + }, + { + "auxiliary_loss_clip": 0.06427407, + "auxiliary_loss_mlp": 0.01268772, + "balance_loss_clip": 0.06280096, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5224109424319856, + "flos": 15637584700800.0, + "grad_norm": 2.221621058495187, + "language_loss": 0.74186772, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.81882954, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12426758, + "step": 8689, + "time_per_iteration": 2.519578456878662 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01264867, + "balance_loss_clip": 0.06277696, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5224710656846535, + "flos": 15747687365760.0, + "grad_norm": 1.8795858532487468, + "language_loss": 0.8292582, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.90614116, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11022949, + "step": 8690, + "time_per_iteration": 2.4795632362365723 + }, + { + "auxiliary_loss_clip": 0.06425175, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.06276759, + "balance_loss_mlp": 0.0125542, + "epoch": 0.5225311889373215, + "flos": 26038136891520.0, + "grad_norm": 1.8859654188369186, + "language_loss": 0.79290485, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.86983275, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12200928, + "step": 8691, + "time_per_iteration": 2.554316520690918 + }, + { + "auxiliary_loss_clip": 0.06425714, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01255044, + "epoch": 0.5225913121899894, + "flos": 18375183371520.0, + "grad_norm": 2.097465391576973, + "language_loss": 0.76909935, + "learning_rate": 1.950738079725646e-06, + "loss": 0.84603524, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12835693, + "step": 8692, + "time_per_iteration": 2.508985757827759 + }, + { + "auxiliary_loss_clip": 0.06422729, + "auxiliary_loss_mlp": 0.01266471, + "balance_loss_clip": 0.06279368, + "balance_loss_mlp": 0.01254872, + "epoch": 0.5226514354426575, + "flos": 29280407904000.0, + "grad_norm": 1.831817200061648, + "language_loss": 0.73045087, + "learning_rate": 1.950348737138691e-06, + "loss": 0.80734289, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11608887, + "step": 8693, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06430539, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.06276198, + "balance_loss_mlp": 0.01252802, + "epoch": 0.5227115586953254, + "flos": 22859330947200.0, + "grad_norm": 2.034375584307348, + "language_loss": 0.8244431, + "learning_rate": 1.949959396434517e-06, + "loss": 0.90140283, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12640381, + "step": 8694, + "time_per_iteration": 2.511063814163208 + }, + { + "auxiliary_loss_clip": 0.06334698, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06270603, + "balance_loss_mlp": 0.01262187, + "epoch": 0.5227716819479934, + "flos": 57491695635840.0, + "grad_norm": 0.936740482735722, + "language_loss": 0.55577236, + "learning_rate": 1.949570057627888e-06, + "loss": 0.63175929, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01809692, + "step": 8695, + "time_per_iteration": 3.201383113861084 + }, + { + "auxiliary_loss_clip": 0.06426679, + "auxiliary_loss_mlp": 0.01263614, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01252074, + "epoch": 0.5228318052006613, + "flos": 13813357461120.0, + "grad_norm": 1.622631737546212, + "language_loss": 0.73801219, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.81491518, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11547852, + "step": 8696, + "time_per_iteration": 2.542386770248413 + }, + { + "auxiliary_loss_clip": 0.06429457, + "auxiliary_loss_mlp": 0.01266915, + "balance_loss_clip": 0.06279002, + "balance_loss_mlp": 0.01254589, + "epoch": 0.5228919284533293, + "flos": 15601596572160.0, + "grad_norm": 1.5536675741091566, + "language_loss": 0.71410191, + "learning_rate": 1.948791385766319e-06, + "loss": 0.79106563, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12341309, + "step": 8697, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.06423891, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.0627815, + "balance_loss_mlp": 0.0125453, + "epoch": 0.5229520517059973, + "flos": 22497982214400.0, + "grad_norm": 1.650008991843684, + "language_loss": 0.80845451, + "learning_rate": 1.948402052740906e-06, + "loss": 0.88535196, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11328125, + "step": 8698, + "time_per_iteration": 2.5636022090911865 + }, + { + "auxiliary_loss_clip": 0.06426111, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06278659, + "balance_loss_mlp": 0.01254908, + "epoch": 0.5230121749586653, + "flos": 22097416970880.0, + "grad_norm": 3.7708298280456023, + "language_loss": 0.74449289, + "learning_rate": 1.948012721672093e-06, + "loss": 0.82142115, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1182251, + "step": 8699, + "time_per_iteration": 2.531606912612915 + }, + { + "auxiliary_loss_clip": 0.06432469, + "auxiliary_loss_mlp": 0.0126789, + "balance_loss_clip": 0.06277843, + "balance_loss_mlp": 0.01255325, + "epoch": 0.5230722982113333, + "flos": 22133656661760.0, + "grad_norm": 1.5875927962566738, + "language_loss": 0.73680252, + "learning_rate": 1.947623392574642e-06, + "loss": 0.81380606, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12561035, + "step": 8700, + "time_per_iteration": 2.542734146118164 + }, + { + "auxiliary_loss_clip": 0.06429377, + "auxiliary_loss_mlp": 0.01275322, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01263127, + "epoch": 0.5231324214640012, + "flos": 25016214096000.0, + "grad_norm": 1.8967545071734793, + "language_loss": 0.67123276, + "learning_rate": 1.947234065463318e-06, + "loss": 0.74827981, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12207031, + "step": 8701, + "time_per_iteration": 2.543332815170288 + }, + { + "auxiliary_loss_clip": 0.06421816, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06274643, + "balance_loss_mlp": 0.01254696, + "epoch": 0.5231925447166692, + "flos": 25747842021120.0, + "grad_norm": 1.6886589098280236, + "language_loss": 0.66874444, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.74562299, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11340332, + "step": 8702, + "time_per_iteration": 2.5511581897735596 + }, + { + "auxiliary_loss_clip": 0.06426294, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06277906, + "balance_loss_mlp": 0.01255906, + "epoch": 0.5232526679693371, + "flos": 21440322852480.0, + "grad_norm": 3.970152828937024, + "language_loss": 0.76360488, + "learning_rate": 1.946455417258101e-06, + "loss": 0.84055138, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12457275, + "step": 8703, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.06434231, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01255471, + "epoch": 0.5233127912220051, + "flos": 35307082892160.0, + "grad_norm": 2.0695890072195344, + "language_loss": 0.77554905, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.85257214, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1260376, + "step": 8704, + "time_per_iteration": 4.093170642852783 + }, + { + "auxiliary_loss_clip": 0.06425636, + "auxiliary_loss_mlp": 0.01277604, + "balance_loss_clip": 0.06278675, + "balance_loss_mlp": 0.012665, + "epoch": 0.523372914474673, + "flos": 17056257379200.0, + "grad_norm": 1.7488135640398956, + "language_loss": 0.78527272, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.86230516, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11108398, + "step": 8705, + "time_per_iteration": 2.487792730331421 + }, + { + "auxiliary_loss_clip": 0.06433457, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06280416, + "balance_loss_mlp": 0.0125221, + "epoch": 0.5234330377273411, + "flos": 18412303530240.0, + "grad_norm": 1.822089906899261, + "language_loss": 0.69768077, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.77466154, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12408447, + "step": 8706, + "time_per_iteration": 2.52415132522583 + }, + { + "auxiliary_loss_clip": 0.06339821, + "auxiliary_loss_mlp": 0.01262622, + "balance_loss_clip": 0.06275055, + "balance_loss_mlp": 0.01260974, + "epoch": 0.523493160980009, + "flos": 65872426429440.0, + "grad_norm": 0.668265925718786, + "language_loss": 0.52398658, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.60001105, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01651001, + "step": 8707, + "time_per_iteration": 4.596412658691406 + }, + { + "auxiliary_loss_clip": 0.06431062, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06282815, + "balance_loss_mlp": 0.01255829, + "epoch": 0.523553284232677, + "flos": 21878595233280.0, + "grad_norm": 1.763620445487087, + "language_loss": 0.75447237, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.83145583, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11450195, + "step": 8708, + "time_per_iteration": 2.515388011932373 + }, + { + "auxiliary_loss_clip": 0.06425884, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.06279897, + "balance_loss_mlp": 0.01258252, + "epoch": 0.5236134074853449, + "flos": 20854156815360.0, + "grad_norm": 1.5562083670602136, + "language_loss": 0.78041285, + "learning_rate": 1.944119521844849e-06, + "loss": 0.85736358, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.109375, + "step": 8709, + "time_per_iteration": 2.569312810897827 + }, + { + "auxiliary_loss_clip": 0.06434496, + "auxiliary_loss_mlp": 0.01269997, + "balance_loss_clip": 0.062785, + "balance_loss_mlp": 0.01256872, + "epoch": 0.5236735307380129, + "flos": 25527510910080.0, + "grad_norm": 1.8691534112354709, + "language_loss": 0.83896649, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.91601145, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13128662, + "step": 8710, + "time_per_iteration": 2.5364856719970703 + }, + { + "auxiliary_loss_clip": 0.06424439, + "auxiliary_loss_mlp": 0.01271523, + "balance_loss_clip": 0.06278566, + "balance_loss_mlp": 0.01260347, + "epoch": 0.523733653990681, + "flos": 23589281790720.0, + "grad_norm": 1.796806294076298, + "language_loss": 0.69453466, + "learning_rate": 1.943340906834908e-06, + "loss": 0.77149427, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11181641, + "step": 8711, + "time_per_iteration": 2.5488204956054688 + }, + { + "auxiliary_loss_clip": 0.06423855, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06275582, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5237937772433489, + "flos": 21112698188160.0, + "grad_norm": 1.676774757059823, + "language_loss": 0.82997072, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.90688783, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11322021, + "step": 8712, + "time_per_iteration": 4.064100980758667 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06279981, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5238539004960169, + "flos": 19179081043200.0, + "grad_norm": 1.8094880941691576, + "language_loss": 0.6993227, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.77635783, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.121521, + "step": 8713, + "time_per_iteration": 2.544586420059204 + }, + { + "auxiliary_loss_clip": 0.06435391, + "auxiliary_loss_mlp": 0.01268239, + "balance_loss_clip": 0.06280154, + "balance_loss_mlp": 0.01254834, + "epoch": 0.5239140237486848, + "flos": 17892914797440.0, + "grad_norm": 2.8365689324721597, + "language_loss": 0.76947498, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.84651124, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13409424, + "step": 8714, + "time_per_iteration": 2.5225958824157715 + }, + { + "auxiliary_loss_clip": 0.06430446, + "auxiliary_loss_mlp": 0.01267137, + "balance_loss_clip": 0.06279821, + "balance_loss_mlp": 0.01255085, + "epoch": 0.5239741470013528, + "flos": 17936072449920.0, + "grad_norm": 1.8206248729771282, + "language_loss": 0.76218581, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.83916163, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12060547, + "step": 8715, + "time_per_iteration": 2.479482650756836 + }, + { + "auxiliary_loss_clip": 0.06428694, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01253537, + "epoch": 0.5240342702540207, + "flos": 31001408513280.0, + "grad_norm": 1.518077309755953, + "language_loss": 0.71405065, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.79099017, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1171875, + "step": 8716, + "time_per_iteration": 2.6313345432281494 + }, + { + "auxiliary_loss_clip": 0.06429261, + "auxiliary_loss_mlp": 0.01264727, + "balance_loss_clip": 0.06279399, + "balance_loss_mlp": 0.012541, + "epoch": 0.5240943935066887, + "flos": 25011308632320.0, + "grad_norm": 2.053994478361076, + "language_loss": 0.87371016, + "learning_rate": 1.941005113841926e-06, + "loss": 0.95065004, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10626221, + "step": 8717, + "time_per_iteration": 2.5242137908935547 + }, + { + "auxiliary_loss_clip": 0.06427871, + "auxiliary_loss_mlp": 0.01272314, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01260184, + "epoch": 0.5241545167593566, + "flos": 23665786168320.0, + "grad_norm": 1.9379813616750423, + "language_loss": 0.62001824, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.69702005, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12139893, + "step": 8718, + "time_per_iteration": 2.5543830394744873 + }, + { + "auxiliary_loss_clip": 0.06436223, + "auxiliary_loss_mlp": 0.01271154, + "balance_loss_clip": 0.06282552, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5242146400120247, + "flos": 23406490108800.0, + "grad_norm": 1.965252740565909, + "language_loss": 0.72457337, + "learning_rate": 1.940226533916872e-06, + "loss": 0.80164713, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12145996, + "step": 8719, + "time_per_iteration": 3.9948794841766357 + }, + { + "auxiliary_loss_clip": 0.06428128, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.0628122, + "balance_loss_mlp": 0.01256983, + "epoch": 0.5242747632646926, + "flos": 17754873995520.0, + "grad_norm": 2.179080036180393, + "language_loss": 0.73360658, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.81056702, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10919189, + "step": 8720, + "time_per_iteration": 2.561491012573242 + }, + { + "auxiliary_loss_clip": 0.06431387, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06281313, + "balance_loss_mlp": 0.01256227, + "epoch": 0.5243348865173606, + "flos": 32605849693440.0, + "grad_norm": 1.7043415367979953, + "language_loss": 0.70633399, + "learning_rate": 1.939447963058281e-06, + "loss": 0.78333569, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12561035, + "step": 8721, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.06427501, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06277889, + "balance_loss_mlp": 0.01258008, + "epoch": 0.5243950097700285, + "flos": 25491229292160.0, + "grad_norm": 1.669973954204285, + "language_loss": 0.86888224, + "learning_rate": 1.939058681065813e-06, + "loss": 0.94585228, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.1151123, + "step": 8722, + "time_per_iteration": 2.532735586166382 + }, + { + "auxiliary_loss_clip": 0.06423786, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276488, + "balance_loss_mlp": 0.01259041, + "epoch": 0.5244551330226965, + "flos": 15273846126720.0, + "grad_norm": 1.6547564845342364, + "language_loss": 0.80303264, + "learning_rate": 1.938669401384247e-06, + "loss": 0.87997842, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 8723, + "time_per_iteration": 2.519230842590332 + }, + { + "auxiliary_loss_clip": 0.06433833, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256286, + "epoch": 0.5245152562753645, + "flos": 22243717399680.0, + "grad_norm": 1.8110090728616772, + "language_loss": 0.75572187, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.83275086, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12780762, + "step": 8724, + "time_per_iteration": 2.503331422805786 + }, + { + "auxiliary_loss_clip": 0.06439602, + "auxiliary_loss_mlp": 0.01267267, + "balance_loss_clip": 0.06280126, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5245753795280325, + "flos": 29434548689280.0, + "grad_norm": 1.6762764466906133, + "language_loss": 0.70858645, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.78565514, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.12835693, + "step": 8725, + "time_per_iteration": 2.6268577575683594 + }, + { + "auxiliary_loss_clip": 0.06331155, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252628, + "epoch": 0.5246355027807005, + "flos": 58853569645440.0, + "grad_norm": 0.7398874669792804, + "language_loss": 0.55689812, + "learning_rate": 1.937501576352568e-06, + "loss": 0.63275951, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.64111328, + "router_z_loss_mlp": 0.02354431, + "step": 8726, + "time_per_iteration": 3.1253981590270996 + }, + { + "auxiliary_loss_clip": 0.06326637, + "auxiliary_loss_mlp": 0.01254365, + "balance_loss_clip": 0.06262497, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5246956260333684, + "flos": 64546792110720.0, + "grad_norm": 0.7865731844335093, + "language_loss": 0.58442128, + "learning_rate": 1.937112306062219e-06, + "loss": 0.66023123, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.02062988, + "step": 8727, + "time_per_iteration": 3.176279306411743 + }, + { + "auxiliary_loss_clip": 0.06432917, + "auxiliary_loss_mlp": 0.01270503, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.01258118, + "epoch": 0.5247557492860364, + "flos": 24540276504960.0, + "grad_norm": 1.4599497814344178, + "language_loss": 0.70513123, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.78216541, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12390137, + "step": 8728, + "time_per_iteration": 2.635087728500366 + }, + { + "auxiliary_loss_clip": 0.06426623, + "auxiliary_loss_mlp": 0.01271129, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258815, + "epoch": 0.5248158725387043, + "flos": 18811946378880.0, + "grad_norm": 1.5300920869777792, + "language_loss": 0.69649124, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.77346873, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12310791, + "step": 8729, + "time_per_iteration": 2.5286824703216553 + }, + { + "auxiliary_loss_clip": 0.06429707, + "auxiliary_loss_mlp": 0.01272402, + "balance_loss_clip": 0.06276232, + "balance_loss_mlp": 0.01260112, + "epoch": 0.5248759957913723, + "flos": 20961534222720.0, + "grad_norm": 1.931767440888087, + "language_loss": 0.83841878, + "learning_rate": 1.935944509558464e-06, + "loss": 0.91543984, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12280273, + "step": 8730, + "time_per_iteration": 2.50693678855896 + }, + { + "auxiliary_loss_clip": 0.06424531, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01253301, + "epoch": 0.5249361190440403, + "flos": 18666903761280.0, + "grad_norm": 2.7205788659727634, + "language_loss": 0.79795074, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.87484777, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8731, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.06421249, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06275119, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5249962422967083, + "flos": 24870249083520.0, + "grad_norm": 2.282421292997204, + "language_loss": 0.83455729, + "learning_rate": 1.935165990676312e-06, + "loss": 0.91145802, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12182617, + "step": 8732, + "time_per_iteration": 2.5442264080047607 + }, + { + "auxiliary_loss_clip": 0.06426094, + "auxiliary_loss_mlp": 0.01271634, + "balance_loss_clip": 0.06276669, + "balance_loss_mlp": 0.01259654, + "epoch": 0.5250563655493762, + "flos": 15267179727360.0, + "grad_norm": 1.5246135300121169, + "language_loss": 0.77770185, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.85467911, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11975098, + "step": 8733, + "time_per_iteration": 2.5826051235198975 + }, + { + "auxiliary_loss_clip": 0.0643189, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.0627751, + "balance_loss_mlp": 0.01253655, + "epoch": 0.5251164888020442, + "flos": 18631209121920.0, + "grad_norm": 3.9739558224943683, + "language_loss": 0.81671995, + "learning_rate": 1.934387481628208e-06, + "loss": 0.89369977, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12445068, + "step": 8734, + "time_per_iteration": 2.496502637863159 + }, + { + "auxiliary_loss_clip": 0.0642469, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01253041, + "epoch": 0.5251766120547121, + "flos": 29717632108800.0, + "grad_norm": 1.407036688227265, + "language_loss": 0.77114183, + "learning_rate": 1.933998230828826e-06, + "loss": 0.84803545, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11627197, + "step": 8735, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06423082, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01253632, + "epoch": 0.5252367353073801, + "flos": 23446964430720.0, + "grad_norm": 1.5621679512535565, + "language_loss": 0.80604559, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.88292682, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11419678, + "step": 8736, + "time_per_iteration": 2.5257420539855957 + }, + { + "auxiliary_loss_clip": 0.06425665, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06277201, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5252968585600482, + "flos": 30818658758400.0, + "grad_norm": 2.1177707386756697, + "language_loss": 0.70240873, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.77936983, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12097168, + "step": 8737, + "time_per_iteration": 2.5996742248535156 + }, + { + "auxiliary_loss_clip": 0.06423551, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01256564, + "epoch": 0.5253569818127161, + "flos": 20634035339520.0, + "grad_norm": 1.5486622918302246, + "language_loss": 0.7715745, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.84849167, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11608887, + "step": 8738, + "time_per_iteration": 2.5352158546447754 + }, + { + "auxiliary_loss_clip": 0.06323943, + "auxiliary_loss_mlp": 0.01255398, + "balance_loss_clip": 0.06260057, + "balance_loss_mlp": 0.01253626, + "epoch": 0.5254171050653841, + "flos": 63448155302400.0, + "grad_norm": 0.7261228489339219, + "language_loss": 0.54416603, + "learning_rate": 1.932441252806837e-06, + "loss": 0.61995941, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01774597, + "step": 8739, + "time_per_iteration": 3.1277644634246826 + }, + { + "auxiliary_loss_clip": 0.06426128, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276017, + "balance_loss_mlp": 0.01255457, + "epoch": 0.525477228318052, + "flos": 34678136545920.0, + "grad_norm": 1.6647555558701046, + "language_loss": 0.84639645, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.92333221, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11993408, + "step": 8740, + "time_per_iteration": 2.658111572265625 + }, + { + "auxiliary_loss_clip": 0.06423901, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06275214, + "balance_loss_mlp": 0.01251843, + "epoch": 0.52553735157072, + "flos": 17936575574400.0, + "grad_norm": 2.0969213447662156, + "language_loss": 0.69862366, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.77550066, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11938477, + "step": 8741, + "time_per_iteration": 2.4757626056671143 + }, + { + "auxiliary_loss_clip": 0.0642582, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06271701, + "balance_loss_mlp": 0.01254378, + "epoch": 0.5255974748233879, + "flos": 9945326557440.0, + "grad_norm": 2.083494644749303, + "language_loss": 0.66346633, + "learning_rate": 1.931273546137947e-06, + "loss": 0.74039018, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12188721, + "step": 8742, + "time_per_iteration": 2.4912760257720947 + }, + { + "auxiliary_loss_clip": 0.06430671, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254592, + "epoch": 0.5256575980760559, + "flos": 16873256062080.0, + "grad_norm": 2.278792899782439, + "language_loss": 0.62974113, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.7067256, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13195801, + "step": 8743, + "time_per_iteration": 3.8745810985565186 + }, + { + "auxiliary_loss_clip": 0.06328367, + "auxiliary_loss_mlp": 0.01251768, + "balance_loss_clip": 0.06264926, + "balance_loss_mlp": 0.01249956, + "epoch": 0.5257177213287239, + "flos": 62408105297280.0, + "grad_norm": 0.7594186151089873, + "language_loss": 0.54170012, + "learning_rate": 1.930495088031323e-06, + "loss": 0.6175015, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.01808167, + "step": 8744, + "time_per_iteration": 3.2680962085723877 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.0627819, + "balance_loss_mlp": 0.01252635, + "epoch": 0.5257778445813919, + "flos": 20783144880000.0, + "grad_norm": 1.988296485781083, + "language_loss": 0.76358819, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.84060007, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.13653564, + "step": 8745, + "time_per_iteration": 2.5416345596313477 + }, + { + "auxiliary_loss_clip": 0.06422935, + "auxiliary_loss_mlp": 0.01269048, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125733, + "epoch": 0.5258379678340598, + "flos": 17024168465280.0, + "grad_norm": 2.2863222877599703, + "language_loss": 0.81917781, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.8960976, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.1171875, + "step": 8746, + "time_per_iteration": 3.8924081325531006 + }, + { + "auxiliary_loss_clip": 0.06420557, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_clip": 0.06274772, + "balance_loss_mlp": 0.01257011, + "epoch": 0.5258980910867278, + "flos": 21075032977920.0, + "grad_norm": 1.8269554832422097, + "language_loss": 0.76250327, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.83939064, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11157227, + "step": 8747, + "time_per_iteration": 2.5338385105133057 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01266781, + "balance_loss_clip": 0.06273648, + "balance_loss_mlp": 0.01254443, + "epoch": 0.5259582143393957, + "flos": 18010312767360.0, + "grad_norm": 1.781184467493656, + "language_loss": 0.82852685, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.90538716, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12353516, + "step": 8748, + "time_per_iteration": 2.4989612102508545 + }, + { + "auxiliary_loss_clip": 0.06428373, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06276021, + "balance_loss_mlp": 0.01255803, + "epoch": 0.5260183375920637, + "flos": 22790457290880.0, + "grad_norm": 2.0798716741461862, + "language_loss": 0.81033522, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.88730466, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12756348, + "step": 8749, + "time_per_iteration": 2.541492462158203 + }, + { + "auxiliary_loss_clip": 0.06426647, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.06276764, + "balance_loss_mlp": 0.01257857, + "epoch": 0.5260784608447318, + "flos": 27059682343680.0, + "grad_norm": 1.8461671999009361, + "language_loss": 0.72827047, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.80523431, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11877441, + "step": 8750, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06428036, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06278102, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5261385840973997, + "flos": 20668262532480.0, + "grad_norm": 1.3256906405876772, + "language_loss": 0.76755565, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.8444941, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11883545, + "step": 8751, + "time_per_iteration": 3.989189624786377 + }, + { + "auxiliary_loss_clip": 0.06427495, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.0627936, + "balance_loss_mlp": 0.01255286, + "epoch": 0.5261987073500677, + "flos": 23629336842240.0, + "grad_norm": 1.3401050149591014, + "language_loss": 0.76360512, + "learning_rate": 1.927381362210902e-06, + "loss": 0.84054899, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11608887, + "step": 8752, + "time_per_iteration": 2.6008472442626953 + }, + { + "auxiliary_loss_clip": 0.06432231, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06278201, + "balance_loss_mlp": 0.01253487, + "epoch": 0.5262588306027356, + "flos": 27643626247680.0, + "grad_norm": 1.396446170400335, + "language_loss": 0.68317235, + "learning_rate": 1.926992158720058e-06, + "loss": 0.76016164, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13208008, + "step": 8753, + "time_per_iteration": 2.5851571559906006 + }, + { + "auxiliary_loss_clip": 0.06430234, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281005, + "balance_loss_mlp": 0.01257142, + "epoch": 0.5263189538554036, + "flos": 21765725383680.0, + "grad_norm": 1.5666571832863774, + "language_loss": 0.8392294, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.91622722, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12384033, + "step": 8754, + "time_per_iteration": 2.552424907684326 + }, + { + "auxiliary_loss_clip": 0.06431299, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06278868, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5263790771080715, + "flos": 14280490373760.0, + "grad_norm": 9.005791031911038, + "language_loss": 0.87464845, + "learning_rate": 1.926213760058522e-06, + "loss": 0.95163268, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12139893, + "step": 8755, + "time_per_iteration": 2.4848403930664062 + }, + { + "auxiliary_loss_clip": 0.06329039, + "auxiliary_loss_mlp": 0.01251879, + "balance_loss_clip": 0.06265183, + "balance_loss_mlp": 0.01250204, + "epoch": 0.5264392003607395, + "flos": 65827298206080.0, + "grad_norm": 0.7019882104343015, + "language_loss": 0.5870319, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.66284108, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01678467, + "step": 8756, + "time_per_iteration": 3.275596857070923 + }, + { + "auxiliary_loss_clip": 0.06435139, + "auxiliary_loss_mlp": 0.0126978, + "balance_loss_clip": 0.06280214, + "balance_loss_mlp": 0.01257001, + "epoch": 0.5264993236134075, + "flos": 21038709432960.0, + "grad_norm": 1.5391071607522773, + "language_loss": 0.70246553, + "learning_rate": 1.925435372588913e-06, + "loss": 0.77951479, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12762451, + "step": 8757, + "time_per_iteration": 2.5078463554382324 + }, + { + "auxiliary_loss_clip": 0.06425242, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06274789, + "balance_loss_mlp": 0.01259015, + "epoch": 0.5265594468660755, + "flos": 16623854784000.0, + "grad_norm": 1.5949031044885071, + "language_loss": 0.88366896, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.96063495, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12341309, + "step": 8758, + "time_per_iteration": 2.503643751144409 + }, + { + "auxiliary_loss_clip": 0.06431897, + "auxiliary_loss_mlp": 0.01273559, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01260165, + "epoch": 0.5266195701187434, + "flos": 24141010999680.0, + "grad_norm": 1.3529199811462889, + "language_loss": 0.76677716, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.84383172, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13391113, + "step": 8759, + "time_per_iteration": 4.0746564865112305 + }, + { + "auxiliary_loss_clip": 0.06426352, + "auxiliary_loss_mlp": 0.01272091, + "balance_loss_clip": 0.06278519, + "balance_loss_mlp": 0.01258603, + "epoch": 0.5266796933714114, + "flos": 15848314519680.0, + "grad_norm": 1.866695897182309, + "language_loss": 0.72062105, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.79760551, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1348877, + "step": 8760, + "time_per_iteration": 2.4678292274475098 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253152, + "epoch": 0.5267398166240793, + "flos": 20956377196800.0, + "grad_norm": 2.1261739839163263, + "language_loss": 0.76520377, + "learning_rate": 1.923878631697736e-06, + "loss": 0.84220791, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13140869, + "step": 8761, + "time_per_iteration": 2.5250892639160156 + }, + { + "auxiliary_loss_clip": 0.06431311, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06277812, + "balance_loss_mlp": 0.01256696, + "epoch": 0.5267999398767473, + "flos": 21002763231360.0, + "grad_norm": 1.6289028393625449, + "language_loss": 0.7137605, + "learning_rate": 1.923489453654373e-06, + "loss": 0.79075569, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1151123, + "step": 8762, + "time_per_iteration": 2.50102162361145 + }, + { + "auxiliary_loss_clip": 0.06330161, + "auxiliary_loss_mlp": 0.01253956, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5268600631294152, + "flos": 66867935189760.0, + "grad_norm": 0.9166133094312116, + "language_loss": 0.65129638, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.72713745, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01655579, + "step": 8763, + "time_per_iteration": 3.076136827468872 + }, + { + "auxiliary_loss_clip": 0.06428451, + "auxiliary_loss_mlp": 0.01268489, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01255918, + "epoch": 0.5269201863820833, + "flos": 17171307434880.0, + "grad_norm": 1.6120731347351738, + "language_loss": 0.71481144, + "learning_rate": 1.922711106286265e-06, + "loss": 0.79178083, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12579346, + "step": 8764, + "time_per_iteration": 2.5250110626220703 + }, + { + "auxiliary_loss_clip": 0.06431142, + "auxiliary_loss_mlp": 0.01269659, + "balance_loss_clip": 0.06278007, + "balance_loss_mlp": 0.01256141, + "epoch": 0.5269803096347513, + "flos": 20528963919360.0, + "grad_norm": 1.6456726211241999, + "language_loss": 0.74125087, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.81825888, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13531494, + "step": 8765, + "time_per_iteration": 2.552011251449585 + }, + { + "auxiliary_loss_clip": 0.06432463, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01253076, + "epoch": 0.5270404328874192, + "flos": 27237652416000.0, + "grad_norm": 1.4730640837864142, + "language_loss": 0.8564899, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.9334718, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12640381, + "step": 8766, + "time_per_iteration": 2.5471248626708984 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06278689, + "balance_loss_mlp": 0.01257812, + "epoch": 0.5271005561400872, + "flos": 23116866071040.0, + "grad_norm": 1.6309488802468612, + "language_loss": 0.79294145, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8699789, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.13690186, + "step": 8767, + "time_per_iteration": 2.5700509548187256 + }, + { + "auxiliary_loss_clip": 0.06431086, + "auxiliary_loss_mlp": 0.01269174, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01256532, + "epoch": 0.5271606793927551, + "flos": 22571342064000.0, + "grad_norm": 1.7993411408437945, + "language_loss": 0.73931158, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.81631416, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12646484, + "step": 8768, + "time_per_iteration": 2.5251431465148926 + }, + { + "auxiliary_loss_clip": 0.06428067, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01257174, + "epoch": 0.5272208026454231, + "flos": 18769166069760.0, + "grad_norm": 1.6856667564577028, + "language_loss": 0.74105024, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.81802148, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11883545, + "step": 8769, + "time_per_iteration": 2.518446683883667 + }, + { + "auxiliary_loss_clip": 0.06431002, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06279421, + "balance_loss_mlp": 0.01255172, + "epoch": 0.5272809258980911, + "flos": 20418358129920.0, + "grad_norm": 1.672714058447801, + "language_loss": 0.74041271, + "learning_rate": 1.920376134993436e-06, + "loss": 0.81739843, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1239624, + "step": 8770, + "time_per_iteration": 2.5188913345336914 + }, + { + "auxiliary_loss_clip": 0.06428713, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01259085, + "epoch": 0.5273410491507591, + "flos": 28264271040000.0, + "grad_norm": 1.8244918854449486, + "language_loss": 0.68641269, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.76341033, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11987305, + "step": 8771, + "time_per_iteration": 2.5867247581481934 + }, + { + "auxiliary_loss_clip": 0.06424269, + "auxiliary_loss_mlp": 0.01271661, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01259704, + "epoch": 0.527401172403427, + "flos": 22461658669440.0, + "grad_norm": 11.676913645943259, + "language_loss": 0.7669906, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.84394991, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11938477, + "step": 8772, + "time_per_iteration": 2.5199668407440186 + }, + { + "auxiliary_loss_clip": 0.06429616, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.0627689, + "balance_loss_mlp": 0.01255599, + "epoch": 0.527461295656095, + "flos": 21037158132480.0, + "grad_norm": 2.161876297932061, + "language_loss": 0.66294622, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.73992014, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12176514, + "step": 8773, + "time_per_iteration": 2.5476229190826416 + }, + { + "auxiliary_loss_clip": 0.06430208, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01256643, + "epoch": 0.5275214189087629, + "flos": 26329060667520.0, + "grad_norm": 1.7199176113539936, + "language_loss": 0.86321867, + "learning_rate": 1.91881954765502e-06, + "loss": 0.94019973, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11254883, + "step": 8774, + "time_per_iteration": 2.545171022415161 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271648, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01259525, + "epoch": 0.5275815421614309, + "flos": 20053110182400.0, + "grad_norm": 1.6744248524719214, + "language_loss": 0.80195713, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.87894905, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12121582, + "step": 8775, + "time_per_iteration": 2.544409990310669 + }, + { + "auxiliary_loss_clip": 0.06422298, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01257968, + "epoch": 0.5276416654140988, + "flos": 21438310354560.0, + "grad_norm": 1.5933640173688606, + "language_loss": 0.83310181, + "learning_rate": 1.918041272397012e-06, + "loss": 0.91002852, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1239624, + "step": 8776, + "time_per_iteration": 2.5175352096557617 + }, + { + "auxiliary_loss_clip": 0.06428739, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.06277907, + "balance_loss_mlp": 0.0125867, + "epoch": 0.5277017886667669, + "flos": 17170762383360.0, + "grad_norm": 1.5849666431846519, + "language_loss": 0.67932826, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.7563237, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12127686, + "step": 8777, + "time_per_iteration": 2.5778138637542725 + }, + { + "auxiliary_loss_clip": 0.06429909, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06281164, + "balance_loss_mlp": 0.01253935, + "epoch": 0.5277619119194349, + "flos": 20454262404480.0, + "grad_norm": 1.855602906151282, + "language_loss": 0.82547855, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.90243274, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8778, + "time_per_iteration": 2.571700096130371 + }, + { + "auxiliary_loss_clip": 0.06433128, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06280521, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5278220351721028, + "flos": 24067944639360.0, + "grad_norm": 1.9512823836083997, + "language_loss": 0.79944891, + "learning_rate": 1.916873882856013e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.1217041, + "step": 8779, + "time_per_iteration": 2.562757968902588 + }, + { + "auxiliary_loss_clip": 0.06427805, + "auxiliary_loss_mlp": 0.01263718, + "balance_loss_clip": 0.06278832, + "balance_loss_mlp": 0.01252429, + "epoch": 0.5278821584247708, + "flos": 24649540629120.0, + "grad_norm": 2.3350915047762957, + "language_loss": 0.77251387, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.84942913, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11291504, + "step": 8780, + "time_per_iteration": 2.517606258392334 + }, + { + "auxiliary_loss_clip": 0.0643455, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254507, + "epoch": 0.5279422816774387, + "flos": 35417017848960.0, + "grad_norm": 1.6574386864631518, + "language_loss": 0.69489729, + "learning_rate": 1.916095638898174e-06, + "loss": 0.77191794, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13018799, + "step": 8781, + "time_per_iteration": 2.693525791168213 + }, + { + "auxiliary_loss_clip": 0.06421035, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5280024049301068, + "flos": 22973794024320.0, + "grad_norm": 1.4417281394316688, + "language_loss": 0.7270093, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.80392265, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11254883, + "step": 8782, + "time_per_iteration": 2.5421454906463623 + }, + { + "auxiliary_loss_clip": 0.06428084, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06279479, + "balance_loss_mlp": 0.01255314, + "epoch": 0.5280625281827747, + "flos": 21514143899520.0, + "grad_norm": 1.839654531053583, + "language_loss": 0.68914783, + "learning_rate": 1.915317407666982e-06, + "loss": 0.76610112, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1192627, + "step": 8783, + "time_per_iteration": 4.037707328796387 + }, + { + "auxiliary_loss_clip": 0.06440329, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282043, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5281226514354427, + "flos": 31215534422400.0, + "grad_norm": 1.947626233704344, + "language_loss": 0.69763857, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.77474254, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13793945, + "step": 8784, + "time_per_iteration": 2.6415882110595703 + }, + { + "auxiliary_loss_clip": 0.06436743, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06277036, + "balance_loss_mlp": 0.01256393, + "epoch": 0.5281827746881106, + "flos": 25084039576320.0, + "grad_norm": 1.9575438568521135, + "language_loss": 0.75138849, + "learning_rate": 1.91453918928048e-06, + "loss": 0.82845432, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13458252, + "step": 8785, + "time_per_iteration": 2.5360119342803955 + }, + { + "auxiliary_loss_clip": 0.06430692, + "auxiliary_loss_mlp": 0.01270335, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01257806, + "epoch": 0.5282428979407786, + "flos": 20637515283840.0, + "grad_norm": 2.81532856062796, + "language_loss": 0.83379281, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.91080302, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12518311, + "step": 8786, + "time_per_iteration": 3.923038959503174 + }, + { + "auxiliary_loss_clip": 0.06426571, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.0628151, + "balance_loss_mlp": 0.01255248, + "epoch": 0.5283030211934465, + "flos": 22426005957120.0, + "grad_norm": 2.0503071903036134, + "language_loss": 0.82639015, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.90331495, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10650635, + "step": 8787, + "time_per_iteration": 2.549422025680542 + }, + { + "auxiliary_loss_clip": 0.06423321, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01259932, + "epoch": 0.5283631444461145, + "flos": 23620951434240.0, + "grad_norm": 1.6336970157139816, + "language_loss": 0.83324271, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11260986, + "step": 8788, + "time_per_iteration": 2.4937057495117188 + }, + { + "auxiliary_loss_clip": 0.06426245, + "auxiliary_loss_mlp": 0.01271299, + "balance_loss_clip": 0.06279786, + "balance_loss_mlp": 0.0125886, + "epoch": 0.5284232676987825, + "flos": 32680341573120.0, + "grad_norm": 1.675322731323109, + "language_loss": 0.75004017, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.82701558, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12451172, + "step": 8789, + "time_per_iteration": 2.6138312816619873 + }, + { + "auxiliary_loss_clip": 0.06430633, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06280988, + "balance_loss_mlp": 0.01254139, + "epoch": 0.5284833909514505, + "flos": 26768213516160.0, + "grad_norm": 1.5707088647426293, + "language_loss": 0.70574284, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.78270793, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8790, + "time_per_iteration": 2.5883655548095703 + }, + { + "auxiliary_loss_clip": 0.06427436, + "auxiliary_loss_mlp": 0.01266819, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5285435142041185, + "flos": 22097207335680.0, + "grad_norm": 1.512627214826232, + "language_loss": 0.79474425, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.87168682, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11309814, + "step": 8791, + "time_per_iteration": 4.033270835876465 + }, + { + "auxiliary_loss_clip": 0.06429024, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06280901, + "balance_loss_mlp": 0.01255205, + "epoch": 0.5286036374567864, + "flos": 20381615314560.0, + "grad_norm": 2.07521505612664, + "language_loss": 0.65493345, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.73189247, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11676025, + "step": 8792, + "time_per_iteration": 2.521308183670044 + }, + { + "auxiliary_loss_clip": 0.06423797, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01253415, + "epoch": 0.5286637607094544, + "flos": 24358952269440.0, + "grad_norm": 2.076646851589869, + "language_loss": 0.79861224, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.87549216, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10778809, + "step": 8793, + "time_per_iteration": 2.5511038303375244 + }, + { + "auxiliary_loss_clip": 0.06422493, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256168, + "epoch": 0.5287238839621223, + "flos": 17276295000960.0, + "grad_norm": 2.078436862745294, + "language_loss": 0.85337698, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.93028271, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11901855, + "step": 8794, + "time_per_iteration": 2.4898123741149902 + }, + { + "auxiliary_loss_clip": 0.06434184, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01255284, + "epoch": 0.5287840072147904, + "flos": 17572711219200.0, + "grad_norm": 2.1545808018265427, + "language_loss": 0.67890751, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.75593209, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12982178, + "step": 8795, + "time_per_iteration": 2.5213987827301025 + }, + { + "auxiliary_loss_clip": 0.0642955, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.01257714, + "epoch": 0.5288441304674583, + "flos": 18558100834560.0, + "grad_norm": 1.7521680482784363, + "language_loss": 0.80681872, + "learning_rate": 1.910259223028374e-06, + "loss": 0.88381112, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11968994, + "step": 8796, + "time_per_iteration": 2.4875407218933105 + }, + { + "auxiliary_loss_clip": 0.06428242, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06279264, + "balance_loss_mlp": 0.01255656, + "epoch": 0.5289042537201263, + "flos": 20820935871360.0, + "grad_norm": 1.952583587455058, + "language_loss": 0.69353104, + "learning_rate": 1.909870155310071e-06, + "loss": 0.770491, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12097168, + "step": 8797, + "time_per_iteration": 2.5311903953552246 + }, + { + "auxiliary_loss_clip": 0.06424771, + "auxiliary_loss_mlp": 0.01268361, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01256857, + "epoch": 0.5289643769727942, + "flos": 15739553520000.0, + "grad_norm": 1.4672049002002021, + "language_loss": 0.82371795, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.90064925, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11499023, + "step": 8798, + "time_per_iteration": 3.947748899459839 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01268372, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255181, + "epoch": 0.5290245002254622, + "flos": 19543490449920.0, + "grad_norm": 2.0391495748491133, + "language_loss": 0.71206701, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.78905261, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.13201904, + "step": 8799, + "time_per_iteration": 2.5031862258911133 + }, + { + "auxiliary_loss_clip": 0.06420026, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06277078, + "balance_loss_mlp": 0.01256124, + "epoch": 0.5290846234781301, + "flos": 15820586017920.0, + "grad_norm": 1.9322407735459124, + "language_loss": 0.69337815, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.77025622, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11657715, + "step": 8800, + "time_per_iteration": 2.5130701065063477 + }, + { + "auxiliary_loss_clip": 0.06335981, + "auxiliary_loss_mlp": 0.01252268, + "balance_loss_clip": 0.06272759, + "balance_loss_mlp": 0.01250352, + "epoch": 0.5291447467307981, + "flos": 70076272498560.0, + "grad_norm": 0.8722049049478691, + "language_loss": 0.5706265, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.64650893, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01913452, + "step": 8801, + "time_per_iteration": 3.0075480937957764 + }, + { + "auxiliary_loss_clip": 0.06425781, + "auxiliary_loss_mlp": 0.01269363, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257978, + "epoch": 0.529204869983466, + "flos": 28371396885120.0, + "grad_norm": 1.559087936128458, + "language_loss": 0.64462554, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.72157693, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1138916, + "step": 8802, + "time_per_iteration": 2.568263053894043 + }, + { + "auxiliary_loss_clip": 0.06423493, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06277072, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5292649932361341, + "flos": 33766064853120.0, + "grad_norm": 1.9436732858799899, + "language_loss": 0.69115645, + "learning_rate": 1.907535821289003e-06, + "loss": 0.76808089, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.10980225, + "step": 8803, + "time_per_iteration": 2.637096881866455 + }, + { + "auxiliary_loss_clip": 0.06421783, + "auxiliary_loss_mlp": 0.01270558, + "balance_loss_clip": 0.0627604, + "balance_loss_mlp": 0.01258596, + "epoch": 0.5293251164888021, + "flos": 20453717352960.0, + "grad_norm": 1.815171914881367, + "language_loss": 0.75997305, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.83689642, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11962891, + "step": 8804, + "time_per_iteration": 2.5163068771362305 + }, + { + "auxiliary_loss_clip": 0.0632845, + "auxiliary_loss_mlp": 0.01252381, + "balance_loss_clip": 0.06265265, + "balance_loss_mlp": 0.01250461, + "epoch": 0.52938523974147, + "flos": 66567856590720.0, + "grad_norm": 0.7410273965373205, + "language_loss": 0.52945232, + "learning_rate": 1.906757737841291e-06, + "loss": 0.60526061, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01916504, + "step": 8805, + "time_per_iteration": 3.24060320854187 + }, + { + "auxiliary_loss_clip": 0.06328098, + "auxiliary_loss_mlp": 0.01252617, + "balance_loss_clip": 0.06265187, + "balance_loss_mlp": 0.01250968, + "epoch": 0.529445362994138, + "flos": 67172065983360.0, + "grad_norm": 1.018872897712542, + "language_loss": 0.63735455, + "learning_rate": 1.906368701413693e-06, + "loss": 0.71316171, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01652527, + "step": 8806, + "time_per_iteration": 3.1444826126098633 + }, + { + "auxiliary_loss_clip": 0.06429877, + "auxiliary_loss_mlp": 0.01268417, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01256073, + "epoch": 0.5295054862468059, + "flos": 17755167484800.0, + "grad_norm": 1.837636262170248, + "language_loss": 0.7251606, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.80214357, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12335205, + "step": 8807, + "time_per_iteration": 2.513139247894287 + }, + { + "auxiliary_loss_clip": 0.06424799, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06278958, + "balance_loss_mlp": 0.01257241, + "epoch": 0.529565609499474, + "flos": 11401622519040.0, + "grad_norm": 2.5266289150801295, + "language_loss": 0.69956362, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.77648908, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1050415, + "step": 8808, + "time_per_iteration": 2.472822666168213 + }, + { + "auxiliary_loss_clip": 0.06422195, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06274572, + "balance_loss_mlp": 0.01258861, + "epoch": 0.5296257327521419, + "flos": 17201174215680.0, + "grad_norm": 2.036831994826339, + "language_loss": 0.87141514, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.94833171, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10614014, + "step": 8809, + "time_per_iteration": 2.5245158672332764 + }, + { + "auxiliary_loss_clip": 0.06436493, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01257138, + "epoch": 0.5296858560048099, + "flos": 39972806265600.0, + "grad_norm": 1.6505081453472243, + "language_loss": 0.64378583, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.72085232, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8810, + "time_per_iteration": 2.6857082843780518 + }, + { + "auxiliary_loss_clip": 0.06422746, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06277126, + "balance_loss_mlp": 0.01259012, + "epoch": 0.5297459792574778, + "flos": 20968032913920.0, + "grad_norm": 1.5863211204070509, + "language_loss": 0.68117309, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.75810677, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11608887, + "step": 8811, + "time_per_iteration": 2.5947864055633545 + }, + { + "auxiliary_loss_clip": 0.06326769, + "auxiliary_loss_mlp": 0.01252115, + "balance_loss_clip": 0.06264065, + "balance_loss_mlp": 0.0125052, + "epoch": 0.5298061025101458, + "flos": 66542532658560.0, + "grad_norm": 0.6560344299955198, + "language_loss": 0.53324163, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.60903049, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01597595, + "step": 8812, + "time_per_iteration": 3.2503774166107178 + }, + { + "auxiliary_loss_clip": 0.06327102, + "auxiliary_loss_mlp": 0.01252134, + "balance_loss_clip": 0.06264044, + "balance_loss_mlp": 0.01250548, + "epoch": 0.5298662257628137, + "flos": 67683488578560.0, + "grad_norm": 0.7118690065629296, + "language_loss": 0.56452167, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.64031398, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01586151, + "step": 8813, + "time_per_iteration": 3.211704730987549 + }, + { + "auxiliary_loss_clip": 0.06420116, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01252223, + "epoch": 0.5299263490154817, + "flos": 19652544938880.0, + "grad_norm": 1.6476785970765333, + "language_loss": 0.82062042, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.89745033, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10656738, + "step": 8814, + "time_per_iteration": 2.5407004356384277 + }, + { + "auxiliary_loss_clip": 0.06433088, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.062782, + "balance_loss_mlp": 0.01255646, + "epoch": 0.5299864722681497, + "flos": 22061638477440.0, + "grad_norm": 1.5146312250557674, + "language_loss": 0.85424864, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.93124914, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11322021, + "step": 8815, + "time_per_iteration": 2.511718273162842 + }, + { + "auxiliary_loss_clip": 0.06421779, + "auxiliary_loss_mlp": 0.01265999, + "balance_loss_clip": 0.0627707, + "balance_loss_mlp": 0.01254573, + "epoch": 0.5300465955208177, + "flos": 21770379285120.0, + "grad_norm": 2.2057457770846947, + "language_loss": 0.67210793, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.74898565, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11431885, + "step": 8816, + "time_per_iteration": 2.564680576324463 + }, + { + "auxiliary_loss_clip": 0.06425485, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06278205, + "balance_loss_mlp": 0.01258106, + "epoch": 0.5301067187734857, + "flos": 43006401884160.0, + "grad_norm": 1.5302739112082, + "language_loss": 0.72652006, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.80347115, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1151123, + "step": 8817, + "time_per_iteration": 2.719486951828003 + }, + { + "auxiliary_loss_clip": 0.06425378, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01256957, + "epoch": 0.5301668420261536, + "flos": 20559878876160.0, + "grad_norm": 1.5998738611170542, + "language_loss": 0.65166581, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.72860169, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11242676, + "step": 8818, + "time_per_iteration": 2.573202610015869 + }, + { + "auxiliary_loss_clip": 0.06425599, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06275538, + "balance_loss_mlp": 0.0125378, + "epoch": 0.5302269652788216, + "flos": 17491259450880.0, + "grad_norm": 1.7883158874481297, + "language_loss": 0.75112927, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.82804549, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12249756, + "step": 8819, + "time_per_iteration": 2.4882779121398926 + }, + { + "auxiliary_loss_clip": 0.06426901, + "auxiliary_loss_mlp": 0.01268351, + "balance_loss_clip": 0.06273513, + "balance_loss_mlp": 0.01255995, + "epoch": 0.5302870885314895, + "flos": 14579380287360.0, + "grad_norm": 2.7239673645734905, + "language_loss": 0.82232261, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.89927506, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12353516, + "step": 8820, + "time_per_iteration": 2.5082767009735107 + }, + { + "auxiliary_loss_clip": 0.06421572, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01257437, + "epoch": 0.5303472117841576, + "flos": 23444323027200.0, + "grad_norm": 1.7959737859178544, + "language_loss": 0.72743207, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.80432689, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.10479736, + "step": 8821, + "time_per_iteration": 2.5132317543029785 + }, + { + "auxiliary_loss_clip": 0.06418677, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01255643, + "epoch": 0.5304073350368255, + "flos": 22715294578560.0, + "grad_norm": 1.486709371307985, + "language_loss": 0.74618089, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.82303441, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11035156, + "step": 8822, + "time_per_iteration": 2.528388261795044 + }, + { + "auxiliary_loss_clip": 0.06422541, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06275284, + "balance_loss_mlp": 0.01255094, + "epoch": 0.5304674582894935, + "flos": 27936059397120.0, + "grad_norm": 1.8362514047395362, + "language_loss": 0.67618608, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.75307631, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11401367, + "step": 8823, + "time_per_iteration": 3.9042444229125977 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269944, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5305275815421614, + "flos": 21256860337920.0, + "grad_norm": 1.7650443733670647, + "language_loss": 0.69634396, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.77329719, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11968994, + "step": 8824, + "time_per_iteration": 2.5146212577819824 + }, + { + "auxiliary_loss_clip": 0.06418572, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06275523, + "balance_loss_mlp": 0.01256292, + "epoch": 0.5305877047948294, + "flos": 17608867056000.0, + "grad_norm": 1.7570108593506664, + "language_loss": 0.76559019, + "learning_rate": 1.898977700702689e-06, + "loss": 0.84244382, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1050415, + "step": 8825, + "time_per_iteration": 2.4815242290496826 + }, + { + "auxiliary_loss_clip": 0.06420843, + "auxiliary_loss_mlp": 0.01268607, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01257335, + "epoch": 0.5306478280474973, + "flos": 15200947474560.0, + "grad_norm": 2.5706419514423526, + "language_loss": 0.85959315, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.93648767, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11279297, + "step": 8826, + "time_per_iteration": 3.921194076538086 + }, + { + "auxiliary_loss_clip": 0.06417906, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.01253759, + "epoch": 0.5307079513001653, + "flos": 15346660924800.0, + "grad_norm": 1.4506860249913964, + "language_loss": 0.64565361, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.72248203, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11181641, + "step": 8827, + "time_per_iteration": 2.4920613765716553 + }, + { + "auxiliary_loss_clip": 0.06420277, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01256961, + "epoch": 0.5307680745528333, + "flos": 43554567294720.0, + "grad_norm": 1.8307336922940562, + "language_loss": 0.59537661, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6722641, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11499023, + "step": 8828, + "time_per_iteration": 2.7917306423187256 + }, + { + "auxiliary_loss_clip": 0.06423927, + "auxiliary_loss_mlp": 0.012663, + "balance_loss_clip": 0.06272669, + "balance_loss_mlp": 0.01254725, + "epoch": 0.5308281978055013, + "flos": 20055332315520.0, + "grad_norm": 1.5709125682754386, + "language_loss": 0.81926584, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.89616817, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11578369, + "step": 8829, + "time_per_iteration": 2.606851100921631 + }, + { + "auxiliary_loss_clip": 0.06417149, + "auxiliary_loss_mlp": 0.01263824, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01253316, + "epoch": 0.5308883210581693, + "flos": 20710162373760.0, + "grad_norm": 1.3864012566435717, + "language_loss": 0.78353059, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.86034036, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1050415, + "step": 8830, + "time_per_iteration": 3.954951286315918 + }, + { + "auxiliary_loss_clip": 0.06420083, + "auxiliary_loss_mlp": 0.01268446, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.01256924, + "epoch": 0.5309484443108372, + "flos": 14360684330880.0, + "grad_norm": 2.11171769837039, + "language_loss": 0.81423479, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.89112008, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 8831, + "time_per_iteration": 2.469822883605957 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01266871, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5310085675635052, + "flos": 20016577002240.0, + "grad_norm": 1.695592927900533, + "language_loss": 0.73638004, + "learning_rate": 1.896255043672186e-06, + "loss": 0.81320393, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11071777, + "step": 8832, + "time_per_iteration": 2.527545213699341 + }, + { + "auxiliary_loss_clip": 0.06424195, + "auxiliary_loss_mlp": 0.01266175, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5310686908161731, + "flos": 22133824369920.0, + "grad_norm": 1.9494235860340738, + "language_loss": 0.75823116, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.83513486, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12341309, + "step": 8833, + "time_per_iteration": 2.497962236404419 + }, + { + "auxiliary_loss_clip": 0.06426589, + "auxiliary_loss_mlp": 0.01264835, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01252861, + "epoch": 0.5311288140688412, + "flos": 24724871049600.0, + "grad_norm": 1.6156023907192425, + "language_loss": 0.7400462, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.81696039, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11975098, + "step": 8834, + "time_per_iteration": 2.5790417194366455 + }, + { + "auxiliary_loss_clip": 0.06429796, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01254322, + "epoch": 0.5311889373215091, + "flos": 24104603600640.0, + "grad_norm": 1.6077843194652517, + "language_loss": 0.77900589, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.85597509, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12817383, + "step": 8835, + "time_per_iteration": 2.5299718379974365 + }, + { + "auxiliary_loss_clip": 0.06422241, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5312490605741771, + "flos": 22023386288640.0, + "grad_norm": 1.8854276384026003, + "language_loss": 0.72502893, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.80190396, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12115479, + "step": 8836, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06424102, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01255815, + "epoch": 0.531309183826845, + "flos": 19396561115520.0, + "grad_norm": 1.819661501339542, + "language_loss": 0.81157684, + "learning_rate": 1.894310406375987e-06, + "loss": 0.88850057, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12463379, + "step": 8837, + "time_per_iteration": 2.484968662261963 + }, + { + "auxiliary_loss_clip": 0.06418987, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06274254, + "balance_loss_mlp": 0.01255778, + "epoch": 0.531369307079513, + "flos": 20195679104640.0, + "grad_norm": 1.8987589865078431, + "language_loss": 0.86269474, + "learning_rate": 1.893921490881035e-06, + "loss": 0.93956232, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11981201, + "step": 8838, + "time_per_iteration": 3.9265315532684326 + }, + { + "auxiliary_loss_clip": 0.06418579, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253584, + "epoch": 0.5314294303321809, + "flos": 18886144769280.0, + "grad_norm": 1.6029216559450563, + "language_loss": 0.73087633, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.8077088, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11077881, + "step": 8839, + "time_per_iteration": 2.595414876937866 + }, + { + "auxiliary_loss_clip": 0.06421834, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01253551, + "epoch": 0.531489553584849, + "flos": 23046818457600.0, + "grad_norm": 1.6603149015146987, + "language_loss": 0.76847923, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.84535015, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11712646, + "step": 8840, + "time_per_iteration": 2.543708086013794 + }, + { + "auxiliary_loss_clip": 0.06426372, + "auxiliary_loss_mlp": 0.01267236, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01255291, + "epoch": 0.5315496768375169, + "flos": 19796329745280.0, + "grad_norm": 3.0684588696132553, + "language_loss": 0.7743901, + "learning_rate": 1.892754768590216e-06, + "loss": 0.85132617, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11932373, + "step": 8841, + "time_per_iteration": 2.5301966667175293 + }, + { + "auxiliary_loss_clip": 0.0631949, + "auxiliary_loss_mlp": 0.01253613, + "balance_loss_clip": 0.06256352, + "balance_loss_mlp": 0.01251976, + "epoch": 0.5316098000901849, + "flos": 71044876569600.0, + "grad_norm": 0.6765052539549429, + "language_loss": 0.56618965, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.64192069, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0164032, + "step": 8842, + "time_per_iteration": 3.2740724086761475 + }, + { + "auxiliary_loss_clip": 0.06425814, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06272734, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5316699233428529, + "flos": 16441146956160.0, + "grad_norm": 1.7388474755658287, + "language_loss": 0.73801279, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.81493276, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13317871, + "step": 8843, + "time_per_iteration": 2.5188851356506348 + }, + { + "auxiliary_loss_clip": 0.06319, + "auxiliary_loss_mlp": 0.01253092, + "balance_loss_clip": 0.06256077, + "balance_loss_mlp": 0.01251205, + "epoch": 0.5317300465955208, + "flos": 67443478957440.0, + "grad_norm": 0.8484317442594647, + "language_loss": 0.60991502, + "learning_rate": 1.891588082900145e-06, + "loss": 0.68563592, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01882935, + "step": 8844, + "time_per_iteration": 3.1943981647491455 + }, + { + "auxiliary_loss_clip": 0.06316474, + "auxiliary_loss_mlp": 0.01252227, + "balance_loss_clip": 0.06253788, + "balance_loss_mlp": 0.01250519, + "epoch": 0.5317901698481888, + "flos": 59524095144960.0, + "grad_norm": 0.8355266908782794, + "language_loss": 0.62249273, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.69817972, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.62744141, + "router_z_loss_mlp": 0.01712036, + "step": 8845, + "time_per_iteration": 3.149904727935791 + }, + { + "auxiliary_loss_clip": 0.06421602, + "auxiliary_loss_mlp": 0.01271191, + "balance_loss_clip": 0.06273656, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5318502931008567, + "flos": 19134204382080.0, + "grad_norm": 1.8837935046538667, + "language_loss": 0.7569865, + "learning_rate": 1.890810312970474e-06, + "loss": 0.8339144, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12774658, + "step": 8846, + "time_per_iteration": 2.5158872604370117 + }, + { + "auxiliary_loss_clip": 0.0642429, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06273554, + "balance_loss_mlp": 0.01256838, + "epoch": 0.5319104163535248, + "flos": 24687960526080.0, + "grad_norm": 1.6867562646607668, + "language_loss": 0.75546432, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.83238477, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10913086, + "step": 8847, + "time_per_iteration": 2.5634870529174805 + }, + { + "auxiliary_loss_clip": 0.06415805, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269352, + "balance_loss_mlp": 0.01254823, + "epoch": 0.5319705396061927, + "flos": 19390691329920.0, + "grad_norm": 1.5354205561883685, + "language_loss": 0.87653261, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.95335042, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1114502, + "step": 8848, + "time_per_iteration": 2.4771876335144043 + }, + { + "auxiliary_loss_clip": 0.06423473, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01261564, + "epoch": 0.5320306628588607, + "flos": 18265122633600.0, + "grad_norm": 1.744694135662772, + "language_loss": 0.74510658, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.82208717, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13012695, + "step": 8849, + "time_per_iteration": 2.5036580562591553 + }, + { + "auxiliary_loss_clip": 0.06429593, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06274542, + "balance_loss_mlp": 0.01253761, + "epoch": 0.5320907861115286, + "flos": 23739062163840.0, + "grad_norm": 1.9586489533772713, + "language_loss": 0.79968703, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.87663901, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11853027, + "step": 8850, + "time_per_iteration": 2.5143027305603027 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06276459, + "balance_loss_mlp": 0.01254086, + "epoch": 0.5321509093641966, + "flos": 34503730272000.0, + "grad_norm": 1.273724424531188, + "language_loss": 0.55058682, + "learning_rate": 1.888865960862821e-06, + "loss": 0.62749517, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.1071167, + "step": 8851, + "time_per_iteration": 2.6221299171447754 + }, + { + "auxiliary_loss_clip": 0.06426491, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255844, + "epoch": 0.5322110326168645, + "flos": 20017080126720.0, + "grad_norm": 1.7230657412679744, + "language_loss": 0.69354177, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.77048028, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11517334, + "step": 8852, + "time_per_iteration": 2.483614206314087 + }, + { + "auxiliary_loss_clip": 0.06316812, + "auxiliary_loss_mlp": 0.01252104, + "balance_loss_clip": 0.06254005, + "balance_loss_mlp": 0.01250446, + "epoch": 0.5322711558695326, + "flos": 64650563792640.0, + "grad_norm": 0.7839220079179184, + "language_loss": 0.62548178, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.70117098, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01661682, + "step": 8853, + "time_per_iteration": 3.085580587387085 + }, + { + "auxiliary_loss_clip": 0.06429263, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06274428, + "balance_loss_mlp": 0.01256364, + "epoch": 0.5323312791222005, + "flos": 14944628234880.0, + "grad_norm": 2.314845805246822, + "language_loss": 0.79806542, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.87503386, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.11212158, + "step": 8854, + "time_per_iteration": 2.5530436038970947 + }, + { + "auxiliary_loss_clip": 0.06415577, + "auxiliary_loss_mlp": 0.01266542, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.0125663, + "epoch": 0.5323914023748685, + "flos": 23447593336320.0, + "grad_norm": 2.5938972527955038, + "language_loss": 0.74205482, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.81887597, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.09912109, + "step": 8855, + "time_per_iteration": 2.527981996536255 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.01263629, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5324515256275365, + "flos": 26293324101120.0, + "grad_norm": 4.18366969320272, + "language_loss": 0.64945328, + "learning_rate": 1.886921714110507e-06, + "loss": 0.72628403, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.10266113, + "step": 8856, + "time_per_iteration": 2.5942611694335938 + }, + { + "auxiliary_loss_clip": 0.06428003, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274043, + "balance_loss_mlp": 0.01255177, + "epoch": 0.5325116488802044, + "flos": 26878316181120.0, + "grad_norm": 1.8445625051613121, + "language_loss": 0.77944165, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.85639572, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12231445, + "step": 8857, + "time_per_iteration": 2.551980972290039 + }, + { + "auxiliary_loss_clip": 0.06420985, + "auxiliary_loss_mlp": 0.01266182, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5325717721328724, + "flos": 25891794535680.0, + "grad_norm": 1.6903303041385833, + "language_loss": 0.71116436, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.78803611, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11749268, + "step": 8858, + "time_per_iteration": 2.564082384109497 + }, + { + "auxiliary_loss_clip": 0.0642374, + "auxiliary_loss_mlp": 0.01268133, + "balance_loss_clip": 0.06274494, + "balance_loss_mlp": 0.01255968, + "epoch": 0.5326318953855403, + "flos": 21805864289280.0, + "grad_norm": 3.8992078644613217, + "language_loss": 0.69476694, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.77168566, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12158203, + "step": 8859, + "time_per_iteration": 2.5558056831359863 + }, + { + "auxiliary_loss_clip": 0.06418291, + "auxiliary_loss_mlp": 0.01266588, + "balance_loss_clip": 0.06275187, + "balance_loss_mlp": 0.0125624, + "epoch": 0.5326920186382084, + "flos": 20929193746560.0, + "grad_norm": 1.4322040270296341, + "language_loss": 0.69681478, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.77366364, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10339355, + "step": 8860, + "time_per_iteration": 2.5150671005249023 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01266208, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01255259, + "epoch": 0.5327521418908763, + "flos": 21439735873920.0, + "grad_norm": 1.9652920134152139, + "language_loss": 0.77936381, + "learning_rate": 1.884977574556683e-06, + "loss": 0.85622478, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10955811, + "step": 8861, + "time_per_iteration": 2.527064561843872 + }, + { + "auxiliary_loss_clip": 0.06428909, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.012579, + "epoch": 0.5328122651435443, + "flos": 21766354289280.0, + "grad_norm": 1.487259241409864, + "language_loss": 0.8585394, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.93552685, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11938477, + "step": 8862, + "time_per_iteration": 4.031865358352661 + }, + { + "auxiliary_loss_clip": 0.06431703, + "auxiliary_loss_mlp": 0.01269915, + "balance_loss_clip": 0.06279312, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5328723883962122, + "flos": 18302410500480.0, + "grad_norm": 1.6037650471474167, + "language_loss": 0.61557126, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.69258749, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12866211, + "step": 8863, + "time_per_iteration": 2.499657154083252 + }, + { + "auxiliary_loss_clip": 0.06422713, + "auxiliary_loss_mlp": 0.01268054, + "balance_loss_clip": 0.06278422, + "balance_loss_mlp": 0.01257736, + "epoch": 0.5329325116488802, + "flos": 25382049022080.0, + "grad_norm": 1.8448114340212167, + "language_loss": 0.73693913, + "learning_rate": 1.883811143046377e-06, + "loss": 0.81384677, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10314941, + "step": 8864, + "time_per_iteration": 2.549104928970337 + }, + { + "auxiliary_loss_clip": 0.06424475, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06276639, + "balance_loss_mlp": 0.0125636, + "epoch": 0.5329926349015481, + "flos": 25598984042880.0, + "grad_norm": 1.865165386122464, + "language_loss": 0.64464402, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.72156298, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11065674, + "step": 8865, + "time_per_iteration": 4.099254608154297 + }, + { + "auxiliary_loss_clip": 0.0642702, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06277309, + "balance_loss_mlp": 0.01257874, + "epoch": 0.5330527581542162, + "flos": 22895612565120.0, + "grad_norm": 1.6799514905357744, + "language_loss": 0.78778207, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.86474454, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11346436, + "step": 8866, + "time_per_iteration": 2.505974531173706 + }, + { + "auxiliary_loss_clip": 0.06424611, + "auxiliary_loss_mlp": 0.01266962, + "balance_loss_clip": 0.06276287, + "balance_loss_mlp": 0.01255333, + "epoch": 0.5331128814068841, + "flos": 16031022347520.0, + "grad_norm": 1.850684934112151, + "language_loss": 0.74175781, + "learning_rate": 1.882644751189108e-06, + "loss": 0.81867361, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11633301, + "step": 8867, + "time_per_iteration": 2.5437192916870117 + }, + { + "auxiliary_loss_clip": 0.0642608, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06276974, + "balance_loss_mlp": 0.01254204, + "epoch": 0.5331730046595521, + "flos": 39353461211520.0, + "grad_norm": 1.4678278533937592, + "language_loss": 0.72377831, + "learning_rate": 1.88225596278394e-06, + "loss": 0.80070472, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12353516, + "step": 8868, + "time_per_iteration": 2.6680116653442383 + }, + { + "auxiliary_loss_clip": 0.06425264, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01258345, + "epoch": 0.5332331279122201, + "flos": 24031201824000.0, + "grad_norm": 1.7262272651388555, + "language_loss": 0.78884375, + "learning_rate": 1.881867178843637e-06, + "loss": 0.86578989, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11016846, + "step": 8869, + "time_per_iteration": 3.9937024116516113 + }, + { + "auxiliary_loss_clip": 0.06438692, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06282986, + "balance_loss_mlp": 0.01255434, + "epoch": 0.533293251164888, + "flos": 17135109671040.0, + "grad_norm": 2.017265080243192, + "language_loss": 0.7622692, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.83933091, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1204834, + "step": 8870, + "time_per_iteration": 2.520585536956787 + }, + { + "auxiliary_loss_clip": 0.06435512, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06280903, + "balance_loss_mlp": 0.01260366, + "epoch": 0.533353374417556, + "flos": 22132734266880.0, + "grad_norm": 2.1166188019250316, + "language_loss": 0.76185441, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.83894014, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12713623, + "step": 8871, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.06427529, + "auxiliary_loss_mlp": 0.01272588, + "balance_loss_clip": 0.06279083, + "balance_loss_mlp": 0.01261383, + "epoch": 0.533413497670224, + "flos": 15016185221760.0, + "grad_norm": 1.8709318225271354, + "language_loss": 0.72608036, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11206055, + "step": 8872, + "time_per_iteration": 2.486344337463379 + }, + { + "auxiliary_loss_clip": 0.06426945, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258925, + "epoch": 0.533473620922892, + "flos": 19616095612800.0, + "grad_norm": 1.6405410033387824, + "language_loss": 0.65059078, + "learning_rate": 1.880312088025936e-06, + "loss": 0.72756892, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11956787, + "step": 8873, + "time_per_iteration": 2.4989571571350098 + }, + { + "auxiliary_loss_clip": 0.06430013, + "auxiliary_loss_mlp": 0.01270669, + "balance_loss_clip": 0.06281542, + "balance_loss_mlp": 0.01260113, + "epoch": 0.5335337441755599, + "flos": 14287827605760.0, + "grad_norm": 2.154155286859053, + "language_loss": 0.80397201, + "learning_rate": 1.879923326631099e-06, + "loss": 0.88097882, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.10559082, + "step": 8874, + "time_per_iteration": 2.5248029232025146 + }, + { + "auxiliary_loss_clip": 0.06429289, + "auxiliary_loss_mlp": 0.01270488, + "balance_loss_clip": 0.06281012, + "balance_loss_mlp": 0.01259306, + "epoch": 0.5335938674282279, + "flos": 20821313214720.0, + "grad_norm": 1.9252791788754828, + "language_loss": 0.70199001, + "learning_rate": 1.879534569789582e-06, + "loss": 0.77898782, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11181641, + "step": 8875, + "time_per_iteration": 2.514606475830078 + }, + { + "auxiliary_loss_clip": 0.06327371, + "auxiliary_loss_mlp": 0.01252854, + "balance_loss_clip": 0.06264151, + "balance_loss_mlp": 0.01251167, + "epoch": 0.5336539906808958, + "flos": 71419558101120.0, + "grad_norm": 0.7076326652144627, + "language_loss": 0.59621203, + "learning_rate": 1.879145817516126e-06, + "loss": 0.6720143, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01690674, + "step": 8876, + "time_per_iteration": 3.2623958587646484 + }, + { + "auxiliary_loss_clip": 0.06431912, + "auxiliary_loss_mlp": 0.0127027, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01259833, + "epoch": 0.5337141139335638, + "flos": 20158517018880.0, + "grad_norm": 1.761940945107411, + "language_loss": 0.75235462, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.8293764, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10437012, + "step": 8877, + "time_per_iteration": 4.019563674926758 + }, + { + "auxiliary_loss_clip": 0.06329054, + "auxiliary_loss_mlp": 0.01254827, + "balance_loss_clip": 0.06265914, + "balance_loss_mlp": 0.01253019, + "epoch": 0.5337742371862317, + "flos": 67747624479360.0, + "grad_norm": 0.7353643225564799, + "language_loss": 0.57172877, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.64756757, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01803589, + "step": 8878, + "time_per_iteration": 3.0581912994384766 + }, + { + "auxiliary_loss_clip": 0.06440037, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01260573, + "epoch": 0.5338343604388998, + "flos": 25015794825600.0, + "grad_norm": 1.5270572668187339, + "language_loss": 0.7260288, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.80315328, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11834717, + "step": 8879, + "time_per_iteration": 2.594075918197632 + }, + { + "auxiliary_loss_clip": 0.06432897, + "auxiliary_loss_mlp": 0.01271434, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01259644, + "epoch": 0.5338944836915677, + "flos": 17606728776960.0, + "grad_norm": 2.8683921774089445, + "language_loss": 0.84095323, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.91799653, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11785889, + "step": 8880, + "time_per_iteration": 2.4828426837921143 + }, + { + "auxiliary_loss_clip": 0.06424058, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06279065, + "balance_loss_mlp": 0.01262277, + "epoch": 0.5339546069442357, + "flos": 21730282306560.0, + "grad_norm": 1.3465483600758703, + "language_loss": 0.79582727, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.87279797, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1072998, + "step": 8881, + "time_per_iteration": 2.5683958530426025 + }, + { + "auxiliary_loss_clip": 0.06324948, + "auxiliary_loss_mlp": 0.01252734, + "balance_loss_clip": 0.06261811, + "balance_loss_mlp": 0.01251199, + "epoch": 0.5340147301969036, + "flos": 69741226748160.0, + "grad_norm": 0.7871410050477539, + "language_loss": 0.5924378, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.66821468, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01533508, + "step": 8882, + "time_per_iteration": 3.0768346786499023 + }, + { + "auxiliary_loss_clip": 0.06325522, + "auxiliary_loss_mlp": 0.01253695, + "balance_loss_clip": 0.06262392, + "balance_loss_mlp": 0.01252035, + "epoch": 0.5340748534495716, + "flos": 63896504901120.0, + "grad_norm": 0.885852476410532, + "language_loss": 0.63786471, + "learning_rate": 1.876424680745913e-06, + "loss": 0.7136569, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01663208, + "step": 8883, + "time_per_iteration": 2.967287063598633 + }, + { + "auxiliary_loss_clip": 0.06432307, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5341349767022396, + "flos": 28701872588160.0, + "grad_norm": 2.199844959316804, + "language_loss": 0.82043612, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.89743072, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12200928, + "step": 8884, + "time_per_iteration": 2.5675361156463623 + }, + { + "auxiliary_loss_clip": 0.06425676, + "auxiliary_loss_mlp": 0.01268668, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01257873, + "epoch": 0.5341950999549075, + "flos": 16295265797760.0, + "grad_norm": 1.5488539614491517, + "language_loss": 0.72820723, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.80515063, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10784912, + "step": 8885, + "time_per_iteration": 2.5164196491241455 + }, + { + "auxiliary_loss_clip": 0.06432982, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06277923, + "balance_loss_mlp": 0.01254525, + "epoch": 0.5342552232075756, + "flos": 14360852039040.0, + "grad_norm": 1.8494222651114738, + "language_loss": 0.78934276, + "learning_rate": 1.87525854926798e-06, + "loss": 0.86633611, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11834717, + "step": 8886, + "time_per_iteration": 2.524366855621338 + }, + { + "auxiliary_loss_clip": 0.06429981, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06279354, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5343153464602435, + "flos": 30305517154560.0, + "grad_norm": 1.3913460534471052, + "language_loss": 0.75135863, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.82834035, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12579346, + "step": 8887, + "time_per_iteration": 2.6564323902130127 + }, + { + "auxiliary_loss_clip": 0.06427558, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06278411, + "balance_loss_mlp": 0.0125401, + "epoch": 0.5343754697129115, + "flos": 15601722353280.0, + "grad_norm": 2.357980716065106, + "language_loss": 0.69295096, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.76988232, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11560059, + "step": 8888, + "time_per_iteration": 2.4917025566101074 + }, + { + "auxiliary_loss_clip": 0.06442724, + "auxiliary_loss_mlp": 0.01272933, + "balance_loss_clip": 0.06283408, + "balance_loss_mlp": 0.01260935, + "epoch": 0.5344355929655794, + "flos": 16915239757440.0, + "grad_norm": 1.9387999695924976, + "language_loss": 0.78584576, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.8630023, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12005615, + "step": 8889, + "time_per_iteration": 2.5028741359710693 + }, + { + "auxiliary_loss_clip": 0.06424284, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06276136, + "balance_loss_mlp": 0.01256431, + "epoch": 0.5344957162182474, + "flos": 16803460010880.0, + "grad_norm": 1.9089962398127316, + "language_loss": 0.69733131, + "learning_rate": 1.873703773589102e-06, + "loss": 0.7742579, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 8890, + "time_per_iteration": 2.4705469608306885 + }, + { + "auxiliary_loss_clip": 0.06430273, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01255635, + "epoch": 0.5345558394709153, + "flos": 12709144356480.0, + "grad_norm": 3.2953855429591536, + "language_loss": 0.77688992, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.85387087, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12182617, + "step": 8891, + "time_per_iteration": 2.500333547592163 + }, + { + "auxiliary_loss_clip": 0.06428199, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06281698, + "balance_loss_mlp": 0.01257486, + "epoch": 0.5346159627235834, + "flos": 22461532888320.0, + "grad_norm": 1.516620120390114, + "language_loss": 0.74519014, + "learning_rate": 1.872926414425699e-06, + "loss": 0.82215786, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 8892, + "time_per_iteration": 2.4968128204345703 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01253566, + "epoch": 0.5346760859762513, + "flos": 22421771326080.0, + "grad_norm": 1.6631056082688196, + "language_loss": 0.87902844, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.95594442, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.10742188, + "step": 8893, + "time_per_iteration": 2.5580215454101562 + }, + { + "auxiliary_loss_clip": 0.06429157, + "auxiliary_loss_mlp": 0.01263801, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01253155, + "epoch": 0.5347362092289193, + "flos": 22822043080320.0, + "grad_norm": 1.612055893952936, + "language_loss": 0.72799695, + "learning_rate": 1.872149074536869e-06, + "loss": 0.80492651, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10650635, + "step": 8894, + "time_per_iteration": 2.54834246635437 + }, + { + "auxiliary_loss_clip": 0.06422012, + "auxiliary_loss_mlp": 0.01266432, + "balance_loss_clip": 0.06275687, + "balance_loss_mlp": 0.01254571, + "epoch": 0.5347963324815872, + "flos": 23225794778880.0, + "grad_norm": 1.4320398201671862, + "language_loss": 0.75047934, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.82736373, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11865234, + "step": 8895, + "time_per_iteration": 2.5309391021728516 + }, + { + "auxiliary_loss_clip": 0.06432986, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5348564557342552, + "flos": 22607917171200.0, + "grad_norm": 1.7183644079473714, + "language_loss": 0.77449572, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.8514936, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11181641, + "step": 8896, + "time_per_iteration": 2.5175390243530273 + }, + { + "auxiliary_loss_clip": 0.06424737, + "auxiliary_loss_mlp": 0.01267928, + "balance_loss_clip": 0.06278285, + "balance_loss_mlp": 0.01256639, + "epoch": 0.5349165789869232, + "flos": 18007880999040.0, + "grad_norm": 1.7578614055599853, + "language_loss": 0.79043764, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.86736429, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11297607, + "step": 8897, + "time_per_iteration": 2.5068724155426025 + }, + { + "auxiliary_loss_clip": 0.06429999, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253365, + "epoch": 0.5349767022395912, + "flos": 17164557181440.0, + "grad_norm": 1.7104987912832146, + "language_loss": 0.76011693, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.83706623, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11566162, + "step": 8898, + "time_per_iteration": 2.5468573570251465 + }, + { + "auxiliary_loss_clip": 0.06323466, + "auxiliary_loss_mlp": 0.01262304, + "balance_loss_clip": 0.06260733, + "balance_loss_mlp": 0.01260944, + "epoch": 0.5350368254922592, + "flos": 71014590518400.0, + "grad_norm": 0.8026406428525971, + "language_loss": 0.57916105, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.65501881, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01361847, + "step": 8899, + "time_per_iteration": 3.354367256164551 + }, + { + "auxiliary_loss_clip": 0.06428243, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01255857, + "epoch": 0.5350969487449271, + "flos": 27425265707520.0, + "grad_norm": 1.5056303351191316, + "language_loss": 0.70071346, + "learning_rate": 1.869817171696868e-06, + "loss": 0.77766323, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10882568, + "step": 8900, + "time_per_iteration": 2.596675395965576 + }, + { + "auxiliary_loss_clip": 0.0643241, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06280074, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5351570719975951, + "flos": 19321901527680.0, + "grad_norm": 1.5148336766284718, + "language_loss": 0.71324182, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.79025364, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11346436, + "step": 8901, + "time_per_iteration": 2.526811122894287 + }, + { + "auxiliary_loss_clip": 0.06432061, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06280375, + "balance_loss_mlp": 0.01257377, + "epoch": 0.535217195250263, + "flos": 19834707715200.0, + "grad_norm": 1.961594084549487, + "language_loss": 0.77373689, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.85075164, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1204834, + "step": 8902, + "time_per_iteration": 3.931328773498535 + }, + { + "auxiliary_loss_clip": 0.06422594, + "auxiliary_loss_mlp": 0.01261364, + "balance_loss_clip": 0.0627951, + "balance_loss_mlp": 0.01250188, + "epoch": 0.535277318502931, + "flos": 22134495202560.0, + "grad_norm": 1.5214881410098744, + "language_loss": 0.7052539, + "learning_rate": 1.868651286721281e-06, + "loss": 0.78209347, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1116333, + "step": 8903, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06433277, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.01255426, + "epoch": 0.5353374417555989, + "flos": 25052873057280.0, + "grad_norm": 1.5307499252390009, + "language_loss": 0.72374737, + "learning_rate": 1.86826266833795e-06, + "loss": 0.80075729, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12304688, + "step": 8904, + "time_per_iteration": 3.979325294494629 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06280231, + "balance_loss_mlp": 0.01257961, + "epoch": 0.535397565008267, + "flos": 19394422836480.0, + "grad_norm": 1.7887132092295748, + "language_loss": 0.73359382, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.81059849, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.121521, + "step": 8905, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.06426303, + "auxiliary_loss_mlp": 0.01263381, + "balance_loss_clip": 0.06282683, + "balance_loss_mlp": 0.01252402, + "epoch": 0.5354576882609349, + "flos": 21477736500480.0, + "grad_norm": 1.458955847450215, + "language_loss": 0.83904094, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.91593778, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10980225, + "step": 8906, + "time_per_iteration": 2.5199477672576904 + }, + { + "auxiliary_loss_clip": 0.06430362, + "auxiliary_loss_mlp": 0.01270808, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01258416, + "epoch": 0.5355178115136029, + "flos": 20783857639680.0, + "grad_norm": 1.893504710630849, + "language_loss": 0.74486792, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.82187963, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.1237793, + "step": 8907, + "time_per_iteration": 2.5200021266937256 + }, + { + "auxiliary_loss_clip": 0.06428273, + "auxiliary_loss_mlp": 0.01264992, + "balance_loss_clip": 0.06280483, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5355779347662708, + "flos": 23520827404800.0, + "grad_norm": 1.6955230805298804, + "language_loss": 0.76706243, + "learning_rate": 1.866708244906912e-06, + "loss": 0.84399509, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10772705, + "step": 8908, + "time_per_iteration": 4.040110349655151 + }, + { + "auxiliary_loss_clip": 0.06432807, + "auxiliary_loss_mlp": 0.01271179, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01258835, + "epoch": 0.5356380580189388, + "flos": 20309471349120.0, + "grad_norm": 2.626231250487559, + "language_loss": 0.74318033, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.82022017, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12347412, + "step": 8909, + "time_per_iteration": 2.503324031829834 + }, + { + "auxiliary_loss_clip": 0.06428281, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06279926, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5356981812716068, + "flos": 21368136960000.0, + "grad_norm": 2.2429477917403435, + "language_loss": 0.84013373, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.91709697, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10803223, + "step": 8910, + "time_per_iteration": 2.532768726348877 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01267044, + "balance_loss_clip": 0.06278617, + "balance_loss_mlp": 0.01255152, + "epoch": 0.5357583045242748, + "flos": 23117746538880.0, + "grad_norm": 1.5068539432144845, + "language_loss": 0.82170522, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.89866459, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11895752, + "step": 8911, + "time_per_iteration": 2.530242681503296 + }, + { + "auxiliary_loss_clip": 0.06427851, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01256794, + "epoch": 0.5358184277769428, + "flos": 21148057411200.0, + "grad_norm": 1.7566097539058134, + "language_loss": 0.6953544, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.7723152, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11425781, + "step": 8912, + "time_per_iteration": 2.52546763420105 + }, + { + "auxiliary_loss_clip": 0.06429117, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06281352, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5358785510296107, + "flos": 16286754608640.0, + "grad_norm": 1.7988140692342254, + "language_loss": 0.71504682, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.79199886, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10858154, + "step": 8913, + "time_per_iteration": 2.4723551273345947 + }, + { + "auxiliary_loss_clip": 0.06437049, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06283163, + "balance_loss_mlp": 0.01257883, + "epoch": 0.5359386742822787, + "flos": 16981555864320.0, + "grad_norm": 1.6333944745256754, + "language_loss": 0.72038394, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7974509, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11761475, + "step": 8914, + "time_per_iteration": 2.5807461738586426 + }, + { + "auxiliary_loss_clip": 0.06438086, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01259327, + "epoch": 0.5359987975349466, + "flos": 20819091081600.0, + "grad_norm": 1.7157890571158112, + "language_loss": 0.706487, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.7835896, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12841797, + "step": 8915, + "time_per_iteration": 2.542787790298462 + }, + { + "auxiliary_loss_clip": 0.06428587, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06281634, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5360589207876146, + "flos": 22206429532800.0, + "grad_norm": 1.674776865577312, + "language_loss": 0.75600839, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.83298731, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11383057, + "step": 8916, + "time_per_iteration": 2.5621731281280518 + }, + { + "auxiliary_loss_clip": 0.06429151, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01256995, + "epoch": 0.5361190440402825, + "flos": 31402393027200.0, + "grad_norm": 2.5448267428400655, + "language_loss": 0.72810572, + "learning_rate": 1.863211089308289e-06, + "loss": 0.80508238, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.1151123, + "step": 8917, + "time_per_iteration": 4.027824401855469 + }, + { + "auxiliary_loss_clip": 0.06433325, + "auxiliary_loss_mlp": 0.01268717, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01257195, + "epoch": 0.5361791672929506, + "flos": 16075270103040.0, + "grad_norm": 1.844905450054995, + "language_loss": 0.71658254, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.793603, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11529541, + "step": 8918, + "time_per_iteration": 2.5032598972320557 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06282899, + "balance_loss_mlp": 0.01258933, + "epoch": 0.5362392905456185, + "flos": 20747240605440.0, + "grad_norm": 1.4549229797282903, + "language_loss": 0.75235254, + "learning_rate": 1.862434000299067e-06, + "loss": 0.82937205, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11383057, + "step": 8919, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06430984, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06280042, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5362994137982865, + "flos": 17344539751680.0, + "grad_norm": 10.323313850773834, + "language_loss": 0.71843415, + "learning_rate": 1.862045463611864e-06, + "loss": 0.79540908, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11254883, + "step": 8920, + "time_per_iteration": 2.481144666671753 + }, + { + "auxiliary_loss_clip": 0.06425787, + "auxiliary_loss_mlp": 0.0126502, + "balance_loss_clip": 0.06276651, + "balance_loss_mlp": 0.01253659, + "epoch": 0.5363595370509544, + "flos": 42823819837440.0, + "grad_norm": 1.3389140049198536, + "language_loss": 0.68970168, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.76660967, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11352539, + "step": 8921, + "time_per_iteration": 2.7377495765686035 + }, + { + "auxiliary_loss_clip": 0.06429093, + "auxiliary_loss_mlp": 0.01267258, + "balance_loss_clip": 0.06280531, + "balance_loss_mlp": 0.01255575, + "epoch": 0.5364196603036224, + "flos": 19177990940160.0, + "grad_norm": 2.2769865828018516, + "language_loss": 0.81912661, + "learning_rate": 1.86126840594594e-06, + "loss": 0.89609009, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11676025, + "step": 8922, + "time_per_iteration": 2.491041660308838 + }, + { + "auxiliary_loss_clip": 0.06431051, + "auxiliary_loss_mlp": 0.01267721, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5364797835562904, + "flos": 17936827136640.0, + "grad_norm": 1.913279005224502, + "language_loss": 0.76818264, + "learning_rate": 1.860879884996686e-06, + "loss": 0.84517032, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11175537, + "step": 8923, + "time_per_iteration": 2.502797842025757 + }, + { + "auxiliary_loss_clip": 0.06430578, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06277579, + "balance_loss_mlp": 0.01257052, + "epoch": 0.5365399068089584, + "flos": 30236098446720.0, + "grad_norm": 1.4167756526815838, + "language_loss": 0.70506531, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.78205955, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11791992, + "step": 8924, + "time_per_iteration": 2.5783135890960693 + }, + { + "auxiliary_loss_clip": 0.06433783, + "auxiliary_loss_mlp": 0.01269029, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5366000300616264, + "flos": 24897264825600.0, + "grad_norm": 2.5342740284522516, + "language_loss": 0.87064564, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.9476738, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12231445, + "step": 8925, + "time_per_iteration": 2.555947780609131 + }, + { + "auxiliary_loss_clip": 0.0643315, + "auxiliary_loss_mlp": 0.012686, + "balance_loss_clip": 0.06278683, + "balance_loss_mlp": 0.01256911, + "epoch": 0.5366601533142943, + "flos": 29834610808320.0, + "grad_norm": 1.6615305931190325, + "language_loss": 0.78511882, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.86213624, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11694336, + "step": 8926, + "time_per_iteration": 2.575540781021118 + }, + { + "auxiliary_loss_clip": 0.06420288, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06276788, + "balance_loss_mlp": 0.0125437, + "epoch": 0.5367202765669623, + "flos": 27206821313280.0, + "grad_norm": 1.3335091711279083, + "language_loss": 0.66572356, + "learning_rate": 1.85932585410148e-06, + "loss": 0.74258018, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11004639, + "step": 8927, + "time_per_iteration": 2.574263572692871 + }, + { + "auxiliary_loss_clip": 0.06429082, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125309, + "epoch": 0.5367803998196302, + "flos": 20236153426560.0, + "grad_norm": 1.7727091217622297, + "language_loss": 0.73473167, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.81166756, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11413574, + "step": 8928, + "time_per_iteration": 2.4792275428771973 + }, + { + "auxiliary_loss_clip": 0.06429128, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06278329, + "balance_loss_mlp": 0.01254609, + "epoch": 0.5368405230722982, + "flos": 32161791381120.0, + "grad_norm": 1.7479222402462038, + "language_loss": 0.62972343, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.70666999, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10919189, + "step": 8929, + "time_per_iteration": 2.622292995452881 + }, + { + "auxiliary_loss_clip": 0.06432647, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.0628202, + "balance_loss_mlp": 0.01254433, + "epoch": 0.5369006463249661, + "flos": 26254778423040.0, + "grad_norm": 1.591710131173975, + "language_loss": 0.66400939, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.74098849, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10845947, + "step": 8930, + "time_per_iteration": 2.543949604034424 + }, + { + "auxiliary_loss_clip": 0.06424774, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06279226, + "balance_loss_mlp": 0.01253299, + "epoch": 0.5369607695776342, + "flos": 26218119461760.0, + "grad_norm": 1.4676781117198738, + "language_loss": 0.67308921, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.74998057, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1105957, + "step": 8931, + "time_per_iteration": 2.5630295276641846 + }, + { + "auxiliary_loss_clip": 0.06432625, + "auxiliary_loss_mlp": 0.01268662, + "balance_loss_clip": 0.0628577, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5370208928303021, + "flos": 25015920606720.0, + "grad_norm": 1.565512656212007, + "language_loss": 0.76494187, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.84195477, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12072754, + "step": 8932, + "time_per_iteration": 2.5423011779785156 + }, + { + "auxiliary_loss_clip": 0.0642775, + "auxiliary_loss_mlp": 0.01267942, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.01255723, + "epoch": 0.5370810160829701, + "flos": 31799646034560.0, + "grad_norm": 1.681669184165067, + "language_loss": 0.66588402, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.74284095, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12219238, + "step": 8933, + "time_per_iteration": 2.6461243629455566 + }, + { + "auxiliary_loss_clip": 0.0642833, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.01256515, + "epoch": 0.537141139335638, + "flos": 23849500245120.0, + "grad_norm": 1.5934461108199862, + "language_loss": 0.83294082, + "learning_rate": 1.856606505975565e-06, + "loss": 0.90990818, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 8934, + "time_per_iteration": 2.5241549015045166 + }, + { + "auxiliary_loss_clip": 0.06428687, + "auxiliary_loss_mlp": 0.01267543, + "balance_loss_clip": 0.06283442, + "balance_loss_mlp": 0.01256033, + "epoch": 0.537201262588306, + "flos": 18513685370880.0, + "grad_norm": 1.6222709830765285, + "language_loss": 0.7995823, + "learning_rate": 1.856218049303999e-06, + "loss": 0.87654459, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11517334, + "step": 8935, + "time_per_iteration": 2.5692355632781982 + }, + { + "auxiliary_loss_clip": 0.06432107, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01253556, + "epoch": 0.537261385840974, + "flos": 25669492853760.0, + "grad_norm": 4.395420873174801, + "language_loss": 0.83744997, + "learning_rate": 1.855829598084659e-06, + "loss": 0.91442859, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12200928, + "step": 8936, + "time_per_iteration": 2.53723406791687 + }, + { + "auxiliary_loss_clip": 0.06430986, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06284051, + "balance_loss_mlp": 0.01255458, + "epoch": 0.537321509093642, + "flos": 40744656950400.0, + "grad_norm": 1.238966659536207, + "language_loss": 0.73065245, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.8076278, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11096191, + "step": 8937, + "time_per_iteration": 2.7185041904449463 + }, + { + "auxiliary_loss_clip": 0.06432244, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.01257591, + "epoch": 0.53738163234631, + "flos": 17244248014080.0, + "grad_norm": 2.3423795733880506, + "language_loss": 0.82399505, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.90100974, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11645508, + "step": 8938, + "time_per_iteration": 2.497788906097412 + }, + { + "auxiliary_loss_clip": 0.06440363, + "auxiliary_loss_mlp": 0.01269336, + "balance_loss_clip": 0.06284846, + "balance_loss_mlp": 0.01257505, + "epoch": 0.5374417555989779, + "flos": 12826710034560.0, + "grad_norm": 2.237788663184982, + "language_loss": 0.80566859, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.88276565, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.1184082, + "step": 8939, + "time_per_iteration": 2.506603479385376 + }, + { + "auxiliary_loss_clip": 0.06330699, + "auxiliary_loss_mlp": 0.01256495, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5375018788516459, + "flos": 67275502248960.0, + "grad_norm": 0.6889137998662954, + "language_loss": 0.5233649, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.59923685, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01609802, + "step": 8940, + "time_per_iteration": 3.1455881595611572 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06280527, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5375620021043138, + "flos": 18120080016000.0, + "grad_norm": 1.7572331791906293, + "language_loss": 0.71456778, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.7914663, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1083374, + "step": 8941, + "time_per_iteration": 3.9169673919677734 + }, + { + "auxiliary_loss_clip": 0.06423429, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01256554, + "epoch": 0.5376221253569818, + "flos": 23156166435840.0, + "grad_norm": 1.5985240277338788, + "language_loss": 0.79660439, + "learning_rate": 1.853499006090237e-06, + "loss": 0.87350607, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10174561, + "step": 8942, + "time_per_iteration": 2.5441763401031494 + }, + { + "auxiliary_loss_clip": 0.06433077, + "auxiliary_loss_mlp": 0.01269882, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01258229, + "epoch": 0.5376822486096497, + "flos": 29980240404480.0, + "grad_norm": 1.695957968467341, + "language_loss": 0.7061829, + "learning_rate": 1.853110593448911e-06, + "loss": 0.78321248, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11645508, + "step": 8943, + "time_per_iteration": 2.5876903533935547 + }, + { + "auxiliary_loss_clip": 0.06327454, + "auxiliary_loss_mlp": 0.01255314, + "balance_loss_clip": 0.06264913, + "balance_loss_mlp": 0.0125356, + "epoch": 0.5377423718623178, + "flos": 54188139761280.0, + "grad_norm": 0.7834151101556619, + "language_loss": 0.59688759, + "learning_rate": 1.852722186377645e-06, + "loss": 0.67271525, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01757812, + "step": 8944, + "time_per_iteration": 4.5469114780426025 + }, + { + "auxiliary_loss_clip": 0.06439775, + "auxiliary_loss_mlp": 0.01267766, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5378024951149857, + "flos": 23263585770240.0, + "grad_norm": 2.6705245070619754, + "language_loss": 0.776173, + "learning_rate": 1.852333784891169e-06, + "loss": 0.85324842, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 8945, + "time_per_iteration": 2.61606502532959 + }, + { + "auxiliary_loss_clip": 0.06428292, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06278516, + "balance_loss_mlp": 0.01252883, + "epoch": 0.5378626183676537, + "flos": 24030866407680.0, + "grad_norm": 1.7469475045380867, + "language_loss": 0.68958521, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.76650584, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10888672, + "step": 8946, + "time_per_iteration": 2.6660590171813965 + }, + { + "auxiliary_loss_clip": 0.06427687, + "auxiliary_loss_mlp": 0.0126763, + "balance_loss_clip": 0.06282603, + "balance_loss_mlp": 0.01256704, + "epoch": 0.5379227416203216, + "flos": 27169072248960.0, + "grad_norm": 1.5118478086705984, + "language_loss": 0.77489585, + "learning_rate": 1.851556998731498e-06, + "loss": 0.85184896, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10925293, + "step": 8947, + "time_per_iteration": 2.618797779083252 + }, + { + "auxiliary_loss_clip": 0.06429853, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06282403, + "balance_loss_mlp": 0.01257688, + "epoch": 0.5379828648729896, + "flos": 24688631358720.0, + "grad_norm": 1.962883252611848, + "language_loss": 0.60299599, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.6799823, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11090088, + "step": 8948, + "time_per_iteration": 3.99113392829895 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06282011, + "balance_loss_mlp": 0.01254629, + "epoch": 0.5380429881256577, + "flos": 22528981025280.0, + "grad_norm": 1.6036817147437437, + "language_loss": 0.7965849, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.87354112, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.10803223, + "step": 8949, + "time_per_iteration": 2.5306220054626465 + }, + { + "auxiliary_loss_clip": 0.06424635, + "auxiliary_loss_mlp": 0.01267697, + "balance_loss_clip": 0.06281022, + "balance_loss_mlp": 0.01256796, + "epoch": 0.5381031113783256, + "flos": 26986825618560.0, + "grad_norm": 1.5758786571118277, + "language_loss": 0.78447008, + "learning_rate": 1.850391861746111e-06, + "loss": 0.86139345, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10900879, + "step": 8950, + "time_per_iteration": 2.5665290355682373 + }, + { + "auxiliary_loss_clip": 0.0642289, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01258793, + "epoch": 0.5381632346309936, + "flos": 24761026886400.0, + "grad_norm": 1.6449806756094487, + "language_loss": 0.72907847, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.80599785, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10253906, + "step": 8951, + "time_per_iteration": 2.5389561653137207 + }, + { + "auxiliary_loss_clip": 0.0643057, + "auxiliary_loss_mlp": 0.01265397, + "balance_loss_clip": 0.06280816, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5382233578836615, + "flos": 15565524589440.0, + "grad_norm": 1.8886102084278436, + "language_loss": 0.75767493, + "learning_rate": 1.849615132097085e-06, + "loss": 0.83463454, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10760498, + "step": 8952, + "time_per_iteration": 2.5009233951568604 + }, + { + "auxiliary_loss_clip": 0.06423527, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01254384, + "epoch": 0.5382834811363295, + "flos": 25091838005760.0, + "grad_norm": 1.352822721598185, + "language_loss": 0.79742837, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.87432194, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11456299, + "step": 8953, + "time_per_iteration": 2.5382277965545654 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01253193, + "epoch": 0.5383436043889974, + "flos": 13302983041920.0, + "grad_norm": 1.682075048645487, + "language_loss": 0.80507964, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.88193631, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10473633, + "step": 8954, + "time_per_iteration": 2.5006446838378906 + }, + { + "auxiliary_loss_clip": 0.06425533, + "auxiliary_loss_mlp": 0.01268977, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01258123, + "epoch": 0.5384037276416654, + "flos": 23046063770880.0, + "grad_norm": 2.297323300751636, + "language_loss": 0.77060652, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.84755164, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10858154, + "step": 8955, + "time_per_iteration": 2.5469982624053955 + }, + { + "auxiliary_loss_clip": 0.06422862, + "auxiliary_loss_mlp": 0.01268692, + "balance_loss_clip": 0.06278117, + "balance_loss_mlp": 0.01257624, + "epoch": 0.5384638508943334, + "flos": 20637389502720.0, + "grad_norm": 1.4766809485278785, + "language_loss": 0.78634906, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.86326456, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11071777, + "step": 8956, + "time_per_iteration": 3.9486958980560303 + }, + { + "auxiliary_loss_clip": 0.06328554, + "auxiliary_loss_mlp": 0.01254386, + "balance_loss_clip": 0.0626571, + "balance_loss_mlp": 0.01252584, + "epoch": 0.5385239741470014, + "flos": 66755820026880.0, + "grad_norm": 0.8475755828975666, + "language_loss": 0.63483834, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.71066773, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01797485, + "step": 8957, + "time_per_iteration": 3.0589206218719482 + }, + { + "auxiliary_loss_clip": 0.06326501, + "auxiliary_loss_mlp": 0.01256038, + "balance_loss_clip": 0.06263363, + "balance_loss_mlp": 0.01254215, + "epoch": 0.5385840973996693, + "flos": 64737466076160.0, + "grad_norm": 0.6942778211869604, + "language_loss": 0.51190817, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.58773351, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01818848, + "step": 8958, + "time_per_iteration": 3.1954948902130127 + }, + { + "auxiliary_loss_clip": 0.06433147, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06283388, + "balance_loss_mlp": 0.01255189, + "epoch": 0.5386442206523373, + "flos": 26149161951360.0, + "grad_norm": 1.5085241385719446, + "language_loss": 0.77482343, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.85182357, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11669922, + "step": 8959, + "time_per_iteration": 2.595390558242798 + }, + { + "auxiliary_loss_clip": 0.06429408, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06280766, + "balance_loss_mlp": 0.01255269, + "epoch": 0.5387043439050052, + "flos": 18256401809280.0, + "grad_norm": 2.0832623304514373, + "language_loss": 0.84442693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.92138815, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11437988, + "step": 8960, + "time_per_iteration": 2.459411382675171 + }, + { + "auxiliary_loss_clip": 0.0642896, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06281836, + "balance_loss_mlp": 0.01254495, + "epoch": 0.5387644671576732, + "flos": 29795939349120.0, + "grad_norm": 1.5299241540989073, + "language_loss": 0.78738272, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.86432457, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1072998, + "step": 8961, + "time_per_iteration": 2.6379730701446533 + }, + { + "auxiliary_loss_clip": 0.06425574, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06280299, + "balance_loss_mlp": 0.01254106, + "epoch": 0.5388245904103413, + "flos": 22379661849600.0, + "grad_norm": 1.7063822520278231, + "language_loss": 0.85018182, + "learning_rate": 1.845731828364681e-06, + "loss": 0.92708838, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10980225, + "step": 8962, + "time_per_iteration": 2.495314359664917 + }, + { + "auxiliary_loss_clip": 0.06324032, + "auxiliary_loss_mlp": 0.01253937, + "balance_loss_clip": 0.06261306, + "balance_loss_mlp": 0.01252085, + "epoch": 0.5388847136630092, + "flos": 69827332417920.0, + "grad_norm": 0.7252434381461927, + "language_loss": 0.54196495, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.61774462, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.62792969, + "router_z_loss_mlp": 0.01847839, + "step": 8963, + "time_per_iteration": 3.0685930252075195 + }, + { + "auxiliary_loss_clip": 0.06319527, + "auxiliary_loss_mlp": 0.01253383, + "balance_loss_clip": 0.0625699, + "balance_loss_mlp": 0.01251595, + "epoch": 0.5389448369156772, + "flos": 69844270942080.0, + "grad_norm": 0.7817796987422422, + "language_loss": 0.62972116, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.7054503, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01786804, + "step": 8964, + "time_per_iteration": 3.2163538932800293 + }, + { + "auxiliary_loss_clip": 0.0643357, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06280617, + "balance_loss_mlp": 0.01255462, + "epoch": 0.5390049601683451, + "flos": 31730478888960.0, + "grad_norm": 1.575337207693627, + "language_loss": 0.70121396, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.77821916, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11499023, + "step": 8965, + "time_per_iteration": 2.6127662658691406 + }, + { + "auxiliary_loss_clip": 0.06431293, + "auxiliary_loss_mlp": 0.01269597, + "balance_loss_clip": 0.06281815, + "balance_loss_mlp": 0.01258546, + "epoch": 0.5390650834210131, + "flos": 18119283402240.0, + "grad_norm": 2.027850604452939, + "language_loss": 0.82445288, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.90146178, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11047363, + "step": 8966, + "time_per_iteration": 2.472459554672241 + }, + { + "auxiliary_loss_clip": 0.06426321, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.06281838, + "balance_loss_mlp": 0.01256326, + "epoch": 0.539125206673681, + "flos": 17421798816000.0, + "grad_norm": 2.5704499610569282, + "language_loss": 0.72936428, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.80630052, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10980225, + "step": 8967, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.06424848, + "auxiliary_loss_mlp": 0.01264578, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01254493, + "epoch": 0.539185329926349, + "flos": 22205255575680.0, + "grad_norm": 1.5589784366040595, + "language_loss": 0.81895125, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.89584547, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10083008, + "step": 8968, + "time_per_iteration": 2.5401480197906494 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.0628034, + "balance_loss_mlp": 0.01254118, + "epoch": 0.539245453179017, + "flos": 21440867904000.0, + "grad_norm": 1.4575649765742498, + "language_loss": 0.74243855, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.81938505, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11633301, + "step": 8969, + "time_per_iteration": 2.553879976272583 + }, + { + "auxiliary_loss_clip": 0.06430885, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06278199, + "balance_loss_mlp": 0.01254214, + "epoch": 0.539305576431685, + "flos": 20740322643840.0, + "grad_norm": 2.1595830648072347, + "language_loss": 0.827712, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.90467674, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.1137085, + "step": 8970, + "time_per_iteration": 2.478726863861084 + }, + { + "auxiliary_loss_clip": 0.06422678, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06278254, + "balance_loss_mlp": 0.01253185, + "epoch": 0.5393656996843529, + "flos": 30928467934080.0, + "grad_norm": 1.400352356553148, + "language_loss": 0.75607336, + "learning_rate": 1.842237354749146e-06, + "loss": 0.83293688, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1048584, + "step": 8971, + "time_per_iteration": 2.5901689529418945 + }, + { + "auxiliary_loss_clip": 0.06318198, + "auxiliary_loss_mlp": 0.01253533, + "balance_loss_clip": 0.06255443, + "balance_loss_mlp": 0.0125168, + "epoch": 0.5394258229370209, + "flos": 50332953260160.0, + "grad_norm": 0.8588377208931133, + "language_loss": 0.60451257, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.68022978, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.62841797, + "router_z_loss_mlp": 0.01847839, + "step": 8972, + "time_per_iteration": 3.1413605213165283 + }, + { + "auxiliary_loss_clip": 0.06426257, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5394859461896888, + "flos": 25419169180800.0, + "grad_norm": 1.5980875117754325, + "language_loss": 0.787233, + "learning_rate": 1.841460870485045e-06, + "loss": 0.8641873, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.1126709, + "step": 8973, + "time_per_iteration": 2.5336296558380127 + }, + { + "auxiliary_loss_clip": 0.06433228, + "auxiliary_loss_mlp": 0.01267524, + "balance_loss_clip": 0.06279569, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5395460694423568, + "flos": 25484646746880.0, + "grad_norm": 1.7949926655699973, + "language_loss": 0.7381959, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.81520343, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12078857, + "step": 8974, + "time_per_iteration": 2.5483648777008057 + }, + { + "auxiliary_loss_clip": 0.06318444, + "auxiliary_loss_mlp": 0.01253276, + "balance_loss_clip": 0.06255525, + "balance_loss_mlp": 0.01251373, + "epoch": 0.5396061926950249, + "flos": 53267305317120.0, + "grad_norm": 0.7276638901828621, + "language_loss": 0.50946128, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.58517849, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01899719, + "step": 8975, + "time_per_iteration": 3.125056028366089 + }, + { + "auxiliary_loss_clip": 0.06423691, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06277017, + "balance_loss_mlp": 0.01253215, + "epoch": 0.5396663159476928, + "flos": 26732476949760.0, + "grad_norm": 1.546051077066994, + "language_loss": 0.72722358, + "learning_rate": 1.840296189214344e-06, + "loss": 0.80410993, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11724854, + "step": 8976, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06424834, + "auxiliary_loss_mlp": 0.01268763, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01257999, + "epoch": 0.5397264392003608, + "flos": 23259267285120.0, + "grad_norm": 1.9541916066514684, + "language_loss": 0.70649612, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.78343207, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10766602, + "step": 8977, + "time_per_iteration": 2.5443131923675537 + }, + { + "auxiliary_loss_clip": 0.06428454, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256691, + "epoch": 0.5397865624530287, + "flos": 18299727169920.0, + "grad_norm": 1.8457096410810847, + "language_loss": 0.72901827, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.80597985, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11016846, + "step": 8978, + "time_per_iteration": 2.511715888977051 + }, + { + "auxiliary_loss_clip": 0.06434547, + "auxiliary_loss_mlp": 0.01269171, + "balance_loss_clip": 0.0627895, + "balance_loss_mlp": 0.01256821, + "epoch": 0.5398466857056967, + "flos": 15301742336640.0, + "grad_norm": 1.7083695222951265, + "language_loss": 0.74513042, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.82216758, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12347412, + "step": 8979, + "time_per_iteration": 2.4654295444488525 + }, + { + "auxiliary_loss_clip": 0.06435215, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.062815, + "balance_loss_mlp": 0.0125551, + "epoch": 0.5399068089583646, + "flos": 17827521085440.0, + "grad_norm": 2.1729763122828567, + "language_loss": 0.77298462, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.85001791, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12609863, + "step": 8980, + "time_per_iteration": 2.5131070613861084 + }, + { + "auxiliary_loss_clip": 0.06428653, + "auxiliary_loss_mlp": 0.01266817, + "balance_loss_clip": 0.06278711, + "balance_loss_mlp": 0.01256202, + "epoch": 0.5399669322110326, + "flos": 27389109870720.0, + "grad_norm": 1.7146505379249901, + "language_loss": 0.82213032, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.89908504, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10626221, + "step": 8981, + "time_per_iteration": 4.00026273727417 + }, + { + "auxiliary_loss_clip": 0.06430832, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06279931, + "balance_loss_mlp": 0.01255292, + "epoch": 0.5400270554637006, + "flos": 20455394434560.0, + "grad_norm": 1.8197401655909293, + "language_loss": 0.67626458, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.75323975, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11395264, + "step": 8982, + "time_per_iteration": 2.7018609046936035 + }, + { + "auxiliary_loss_clip": 0.06430931, + "auxiliary_loss_mlp": 0.01272335, + "balance_loss_clip": 0.06282471, + "balance_loss_mlp": 0.0126123, + "epoch": 0.5400871787163686, + "flos": 21696055113600.0, + "grad_norm": 1.5105940902505235, + "language_loss": 0.82925522, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.90628791, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11108398, + "step": 8983, + "time_per_iteration": 4.0147035121917725 + }, + { + "auxiliary_loss_clip": 0.06427681, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5401473019690365, + "flos": 19210163708160.0, + "grad_norm": 2.5381589556683752, + "language_loss": 0.70748949, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.78442466, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11608887, + "step": 8984, + "time_per_iteration": 2.485203742980957 + }, + { + "auxiliary_loss_clip": 0.06436664, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.06283301, + "balance_loss_mlp": 0.01258702, + "epoch": 0.5402074252217045, + "flos": 20632987163520.0, + "grad_norm": 1.6283776116809212, + "language_loss": 0.80336136, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.88043296, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11798096, + "step": 8985, + "time_per_iteration": 2.5176138877868652 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06279361, + "balance_loss_mlp": 0.01255497, + "epoch": 0.5402675484743724, + "flos": 24980519456640.0, + "grad_norm": 1.4261046169392377, + "language_loss": 0.79538441, + "learning_rate": 1.83641431418363e-06, + "loss": 0.87226146, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11053467, + "step": 8986, + "time_per_iteration": 2.528057098388672 + }, + { + "auxiliary_loss_clip": 0.06426872, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06277602, + "balance_loss_mlp": 0.01258636, + "epoch": 0.5403276717270404, + "flos": 19464302741760.0, + "grad_norm": 1.7453745991771563, + "language_loss": 0.77310205, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.85006386, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.10681152, + "step": 8987, + "time_per_iteration": 3.9355413913726807 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01265394, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254147, + "epoch": 0.5403877949797083, + "flos": 18448040096640.0, + "grad_norm": 1.594164869128485, + "language_loss": 0.70988709, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.78680897, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11248779, + "step": 8988, + "time_per_iteration": 2.529665470123291 + }, + { + "auxiliary_loss_clip": 0.06432524, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06283048, + "balance_loss_mlp": 0.0125528, + "epoch": 0.5404479182323764, + "flos": 28300343022720.0, + "grad_norm": 2.353153070088846, + "language_loss": 0.68308997, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.76008058, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11260986, + "step": 8989, + "time_per_iteration": 2.541705846786499 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06277242, + "balance_loss_mlp": 0.01255071, + "epoch": 0.5405080414850444, + "flos": 23373981924480.0, + "grad_norm": 1.5774927452360248, + "language_loss": 0.77866185, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.85559022, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12091064, + "step": 8990, + "time_per_iteration": 2.570016384124756 + }, + { + "auxiliary_loss_clip": 0.06423812, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01254517, + "epoch": 0.5405681647377123, + "flos": 21112907823360.0, + "grad_norm": 1.4794826200904196, + "language_loss": 0.69081038, + "learning_rate": 1.834473608367745e-06, + "loss": 0.76769722, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10357666, + "step": 8991, + "time_per_iteration": 2.491284132003784 + }, + { + "auxiliary_loss_clip": 0.06430428, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06280528, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5406282879903803, + "flos": 20455478288640.0, + "grad_norm": 1.6151673604367662, + "language_loss": 0.76260269, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.83958906, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.11621094, + "step": 8992, + "time_per_iteration": 2.506131649017334 + }, + { + "auxiliary_loss_clip": 0.06429817, + "auxiliary_loss_mlp": 0.01266516, + "balance_loss_clip": 0.06278399, + "balance_loss_mlp": 0.01255871, + "epoch": 0.5406884112430482, + "flos": 14214635464320.0, + "grad_norm": 2.867003800231527, + "language_loss": 0.7616564, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.83861977, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.10644531, + "step": 8993, + "time_per_iteration": 2.5104384422302246 + }, + { + "auxiliary_loss_clip": 0.06425033, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278533, + "balance_loss_mlp": 0.01259024, + "epoch": 0.5407485344957162, + "flos": 23881882648320.0, + "grad_norm": 1.5714876378286171, + "language_loss": 0.70600474, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.78295696, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11151123, + "step": 8994, + "time_per_iteration": 2.557224988937378 + }, + { + "auxiliary_loss_clip": 0.06430587, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06279735, + "balance_loss_mlp": 0.01255397, + "epoch": 0.5408086577483842, + "flos": 23155118259840.0, + "grad_norm": 1.7868138082728735, + "language_loss": 0.7559076, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.83288407, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11657715, + "step": 8995, + "time_per_iteration": 4.038757085800171 + }, + { + "auxiliary_loss_clip": 0.06426084, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06281247, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5408687810010522, + "flos": 18777090280320.0, + "grad_norm": 1.7506118703188027, + "language_loss": 0.73407996, + "learning_rate": 1.832533059471282e-06, + "loss": 0.81100416, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.105896, + "step": 8996, + "time_per_iteration": 2.4787185192108154 + }, + { + "auxiliary_loss_clip": 0.06423852, + "auxiliary_loss_mlp": 0.01266299, + "balance_loss_clip": 0.06280176, + "balance_loss_mlp": 0.01254801, + "epoch": 0.5409289042537201, + "flos": 13886717310720.0, + "grad_norm": 1.8157411884483814, + "language_loss": 0.73422438, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.81112587, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11499023, + "step": 8997, + "time_per_iteration": 2.5067830085754395 + }, + { + "auxiliary_loss_clip": 0.0643085, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06281897, + "balance_loss_mlp": 0.01256802, + "epoch": 0.5409890275063881, + "flos": 14470619287680.0, + "grad_norm": 2.2163933004413625, + "language_loss": 0.72107315, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.79805827, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.10870361, + "step": 8998, + "time_per_iteration": 2.499892234802246 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.0627818, + "balance_loss_mlp": 0.01255281, + "epoch": 0.541049150759056, + "flos": 48987906721920.0, + "grad_norm": 1.4223172525448995, + "language_loss": 0.7060768, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.78298652, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11004639, + "step": 8999, + "time_per_iteration": 2.75883412361145 + }, + { + "auxiliary_loss_clip": 0.06424989, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06280144, + "balance_loss_mlp": 0.01255818, + "epoch": 0.541109274011724, + "flos": 18153007470720.0, + "grad_norm": 3.0241903502045884, + "language_loss": 0.8099103, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.88683468, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11639404, + "step": 9000, + "time_per_iteration": 2.4591987133026123 + }, + { + "auxiliary_loss_clip": 0.06425589, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0628029, + "balance_loss_mlp": 0.01256438, + "epoch": 0.541169397264392, + "flos": 20528921992320.0, + "grad_norm": 1.444857324942775, + "language_loss": 0.73542678, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.81235898, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11193848, + "step": 9001, + "time_per_iteration": 2.5392372608184814 + }, + { + "auxiliary_loss_clip": 0.06428811, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.0627747, + "balance_loss_mlp": 0.01256489, + "epoch": 0.54122952051706, + "flos": 20049630238080.0, + "grad_norm": 2.1661909625933675, + "language_loss": 0.85214329, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.92911184, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11560059, + "step": 9002, + "time_per_iteration": 2.4666826725006104 + }, + { + "auxiliary_loss_clip": 0.06425083, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06280569, + "balance_loss_mlp": 0.01253792, + "epoch": 0.541289643769728, + "flos": 19068223691520.0, + "grad_norm": 1.8644067392145132, + "language_loss": 0.78467226, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.86156201, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10101318, + "step": 9003, + "time_per_iteration": 2.536766767501831 + }, + { + "auxiliary_loss_clip": 0.06424496, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06279116, + "balance_loss_mlp": 0.01253005, + "epoch": 0.5413497670223959, + "flos": 22388801944320.0, + "grad_norm": 1.7504010601062234, + "language_loss": 0.69487125, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.77175444, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1081543, + "step": 9004, + "time_per_iteration": 2.522757053375244 + }, + { + "auxiliary_loss_clip": 0.06323519, + "auxiliary_loss_mlp": 0.01256562, + "balance_loss_clip": 0.0626113, + "balance_loss_mlp": 0.01254622, + "epoch": 0.5414098902750639, + "flos": 70052149722240.0, + "grad_norm": 0.9317133774182984, + "language_loss": 0.58728683, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.66308761, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01937866, + "step": 9005, + "time_per_iteration": 3.227922201156616 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06276372, + "balance_loss_mlp": 0.01254477, + "epoch": 0.5414700135277318, + "flos": 21805445018880.0, + "grad_norm": 2.0206216562473416, + "language_loss": 0.78202778, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.85894328, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.10510254, + "step": 9006, + "time_per_iteration": 2.557199001312256 + }, + { + "auxiliary_loss_clip": 0.06423091, + "auxiliary_loss_mlp": 0.01269943, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01259965, + "epoch": 0.5415301367803999, + "flos": 16913269186560.0, + "grad_norm": 3.052189299631263, + "language_loss": 0.8345896, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.91152, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.09979248, + "step": 9007, + "time_per_iteration": 2.5309536457061768 + }, + { + "auxiliary_loss_clip": 0.06427018, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06280112, + "balance_loss_mlp": 0.01254089, + "epoch": 0.5415902600330678, + "flos": 25711518476160.0, + "grad_norm": 1.8242309219870276, + "language_loss": 0.67383778, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.750763, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11401367, + "step": 9008, + "time_per_iteration": 2.5476038455963135 + }, + { + "auxiliary_loss_clip": 0.0643273, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06281075, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5416503832857358, + "flos": 19214146776960.0, + "grad_norm": 1.9758514689639541, + "language_loss": 0.7415235, + "learning_rate": 1.827488379924234e-06, + "loss": 0.81856364, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11846924, + "step": 9009, + "time_per_iteration": 2.519923448562622 + }, + { + "auxiliary_loss_clip": 0.06433536, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.0628282, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5417105065384037, + "flos": 12718619867520.0, + "grad_norm": 2.008927815850951, + "language_loss": 0.88025904, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.95727038, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11907959, + "step": 9010, + "time_per_iteration": 2.4986653327941895 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266313, + "balance_loss_clip": 0.06279215, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5417706297910717, + "flos": 30343727416320.0, + "grad_norm": 1.9869037800658418, + "language_loss": 0.64700162, + "learning_rate": 1.826712372694122e-06, + "loss": 0.72391802, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10693359, + "step": 9011, + "time_per_iteration": 2.639526605606079 + }, + { + "auxiliary_loss_clip": 0.06426919, + "auxiliary_loss_mlp": 0.0126718, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5418307530437396, + "flos": 29028323295360.0, + "grad_norm": 2.488283502034593, + "language_loss": 0.79704046, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.87398142, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1081543, + "step": 9012, + "time_per_iteration": 2.546048641204834 + }, + { + "auxiliary_loss_clip": 0.06429458, + "auxiliary_loss_mlp": 0.01265294, + "balance_loss_clip": 0.06280975, + "balance_loss_mlp": 0.01254464, + "epoch": 0.5418908762964076, + "flos": 16879125847680.0, + "grad_norm": 2.3471098958204712, + "language_loss": 0.74353266, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.82048023, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10839844, + "step": 9013, + "time_per_iteration": 2.544989585876465 + }, + { + "auxiliary_loss_clip": 0.06429175, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01254806, + "epoch": 0.5419509995490756, + "flos": 18955144206720.0, + "grad_norm": 2.592240526053277, + "language_loss": 0.72416294, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.80111116, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.10852051, + "step": 9014, + "time_per_iteration": 2.4757673740386963 + }, + { + "auxiliary_loss_clip": 0.06427553, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06280749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.5420111228017436, + "flos": 18083630689920.0, + "grad_norm": 1.4576837239395228, + "language_loss": 0.80686474, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.88381469, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11010742, + "step": 9015, + "time_per_iteration": 2.50618839263916 + }, + { + "auxiliary_loss_clip": 0.06436689, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06286176, + "balance_loss_mlp": 0.01259061, + "epoch": 0.5420712460544116, + "flos": 19067678640000.0, + "grad_norm": 2.2120132338352105, + "language_loss": 0.81892127, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.8959893, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11053467, + "step": 9016, + "time_per_iteration": 2.475426197052002 + }, + { + "auxiliary_loss_clip": 0.06424853, + "auxiliary_loss_mlp": 0.01269653, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01259258, + "epoch": 0.5421313693070795, + "flos": 18193020595200.0, + "grad_norm": 1.7396358642065415, + "language_loss": 0.81981838, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.89676344, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10388184, + "step": 9017, + "time_per_iteration": 2.4966297149658203 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5421914925597475, + "flos": 13010969162880.0, + "grad_norm": 1.7307795983641447, + "language_loss": 0.77940953, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.85629702, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11193848, + "step": 9018, + "time_per_iteration": 2.4861438274383545 + }, + { + "auxiliary_loss_clip": 0.0643111, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.06279995, + "balance_loss_mlp": 0.01253557, + "epoch": 0.5422516158124154, + "flos": 46769654856960.0, + "grad_norm": 1.436078593305458, + "language_loss": 0.66629684, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.7432512, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.10772705, + "step": 9019, + "time_per_iteration": 2.793942928314209 + }, + { + "auxiliary_loss_clip": 0.06420586, + "auxiliary_loss_mlp": 0.01266098, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01256627, + "epoch": 0.5423117390650835, + "flos": 31766634725760.0, + "grad_norm": 1.5531318778473993, + "language_loss": 0.69972849, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.77659535, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.0947876, + "step": 9020, + "time_per_iteration": 3.977450132369995 + }, + { + "auxiliary_loss_clip": 0.0642193, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627913, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5423718623177514, + "flos": 27209881987200.0, + "grad_norm": 1.41400284004279, + "language_loss": 0.80270976, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.87961137, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10650635, + "step": 9021, + "time_per_iteration": 2.5875015258789062 + }, + { + "auxiliary_loss_clip": 0.06426784, + "auxiliary_loss_mlp": 0.0126779, + "balance_loss_clip": 0.0628023, + "balance_loss_mlp": 0.01257162, + "epoch": 0.5424319855704194, + "flos": 23552580902400.0, + "grad_norm": 2.7424242746142298, + "language_loss": 0.78868818, + "learning_rate": 1.822444805916788e-06, + "loss": 0.86563396, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10626221, + "step": 9022, + "time_per_iteration": 2.6569435596466064 + }, + { + "auxiliary_loss_clip": 0.06421105, + "auxiliary_loss_mlp": 0.01267956, + "balance_loss_clip": 0.06275026, + "balance_loss_mlp": 0.01257132, + "epoch": 0.5424921088230873, + "flos": 26623003190400.0, + "grad_norm": 2.014349133750916, + "language_loss": 0.82876647, + "learning_rate": 1.822056885403915e-06, + "loss": 0.90565705, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10827637, + "step": 9023, + "time_per_iteration": 4.035135746002197 + }, + { + "auxiliary_loss_clip": 0.06427208, + "auxiliary_loss_mlp": 0.01266773, + "balance_loss_clip": 0.06280831, + "balance_loss_mlp": 0.01256718, + "epoch": 0.5425522320757553, + "flos": 23593600275840.0, + "grad_norm": 1.5793438869499181, + "language_loss": 0.71421236, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.79115218, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10058594, + "step": 9024, + "time_per_iteration": 2.540205717086792 + }, + { + "auxiliary_loss_clip": 0.06424701, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5426123553284232, + "flos": 30600256291200.0, + "grad_norm": 1.6177082091395079, + "language_loss": 0.65074164, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.72763383, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.10601807, + "step": 9025, + "time_per_iteration": 2.6120383739471436 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06278306, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5426724785810912, + "flos": 12500049692160.0, + "grad_norm": 9.095866287209772, + "language_loss": 0.73753297, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.81451309, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10681152, + "step": 9026, + "time_per_iteration": 2.47986102104187 + }, + { + "auxiliary_loss_clip": 0.06430142, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06282182, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5427326018337592, + "flos": 26071273981440.0, + "grad_norm": 2.23504413576904, + "language_loss": 0.78765059, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.8646462, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12432861, + "step": 9027, + "time_per_iteration": 3.9859650135040283 + }, + { + "auxiliary_loss_clip": 0.06320234, + "auxiliary_loss_mlp": 0.01252608, + "balance_loss_clip": 0.06257887, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5427927250864272, + "flos": 66004974789120.0, + "grad_norm": 0.7416092139326844, + "language_loss": 0.56562424, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.64135265, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01921082, + "step": 9028, + "time_per_iteration": 3.155468702316284 + }, + { + "auxiliary_loss_clip": 0.06432774, + "auxiliary_loss_mlp": 0.01272049, + "balance_loss_clip": 0.06283672, + "balance_loss_mlp": 0.01260158, + "epoch": 0.5428528483390952, + "flos": 19981678976640.0, + "grad_norm": 2.1493249613849015, + "language_loss": 0.78262091, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.85966909, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11883545, + "step": 9029, + "time_per_iteration": 2.59745192527771 + }, + { + "auxiliary_loss_clip": 0.06422626, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01260108, + "epoch": 0.5429129715917631, + "flos": 21838288619520.0, + "grad_norm": 1.5330300742008836, + "language_loss": 0.83522928, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.9121654, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10870361, + "step": 9030, + "time_per_iteration": 2.579742670059204 + }, + { + "auxiliary_loss_clip": 0.06426223, + "auxiliary_loss_mlp": 0.01263686, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01252903, + "epoch": 0.5429730948444311, + "flos": 27790178238720.0, + "grad_norm": 1.5430505390577234, + "language_loss": 0.75487745, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.8317765, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10784912, + "step": 9031, + "time_per_iteration": 2.5645737648010254 + }, + { + "auxiliary_loss_clip": 0.06421311, + "auxiliary_loss_mlp": 0.01265953, + "balance_loss_clip": 0.0628026, + "balance_loss_mlp": 0.01256226, + "epoch": 0.543033218097099, + "flos": 26767668464640.0, + "grad_norm": 1.6242541501700514, + "language_loss": 0.85659242, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.933465, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.097229, + "step": 9032, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.06434417, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06282632, + "balance_loss_mlp": 0.01260815, + "epoch": 0.5430933413497671, + "flos": 22681989780480.0, + "grad_norm": 1.5840496509982642, + "language_loss": 0.74130201, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.81836969, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11535645, + "step": 9033, + "time_per_iteration": 2.546196937561035 + }, + { + "auxiliary_loss_clip": 0.06426211, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06281157, + "balance_loss_mlp": 0.01256569, + "epoch": 0.543153464602435, + "flos": 24614307187200.0, + "grad_norm": 1.5750334880362715, + "language_loss": 0.76250172, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.83944499, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11535645, + "step": 9034, + "time_per_iteration": 2.5637965202331543 + }, + { + "auxiliary_loss_clip": 0.0642693, + "auxiliary_loss_mlp": 0.0126457, + "balance_loss_clip": 0.06282238, + "balance_loss_mlp": 0.01254002, + "epoch": 0.543213587855103, + "flos": 19031690511360.0, + "grad_norm": 1.6968779523598936, + "language_loss": 0.84307218, + "learning_rate": 1.817402369770655e-06, + "loss": 0.91998708, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10583496, + "step": 9035, + "time_per_iteration": 4.028722524642944 + }, + { + "auxiliary_loss_clip": 0.063224, + "auxiliary_loss_mlp": 0.01251692, + "balance_loss_clip": 0.06260421, + "balance_loss_mlp": 0.01250003, + "epoch": 0.5432737111077709, + "flos": 65705539824000.0, + "grad_norm": 0.6842717349937131, + "language_loss": 0.55272961, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.62847054, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.61816406, + "router_z_loss_mlp": 0.01693726, + "step": 9036, + "time_per_iteration": 3.117825746536255 + }, + { + "auxiliary_loss_clip": 0.06427496, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06278114, + "balance_loss_mlp": 0.0125423, + "epoch": 0.5433338343604389, + "flos": 22098339365760.0, + "grad_norm": 1.6522952339212897, + "language_loss": 0.75599706, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.83293271, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1184082, + "step": 9037, + "time_per_iteration": 2.520371913909912 + }, + { + "auxiliary_loss_clip": 0.06428872, + "auxiliary_loss_mlp": 0.01263373, + "balance_loss_clip": 0.06282881, + "balance_loss_mlp": 0.01252561, + "epoch": 0.5433939576131068, + "flos": 34680316752000.0, + "grad_norm": 1.5920545337485463, + "language_loss": 0.66775727, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.74467969, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1081543, + "step": 9038, + "time_per_iteration": 2.6492366790771484 + }, + { + "auxiliary_loss_clip": 0.06424891, + "auxiliary_loss_mlp": 0.01265017, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254395, + "epoch": 0.5434540808657748, + "flos": 20309639057280.0, + "grad_norm": 2.8075357913922687, + "language_loss": 0.78373635, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.8606354, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10626221, + "step": 9039, + "time_per_iteration": 2.528156280517578 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06281251, + "balance_loss_mlp": 0.01258677, + "epoch": 0.5435142041184428, + "flos": 23119549401600.0, + "grad_norm": 1.7481925172590123, + "language_loss": 0.76885521, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11029053, + "step": 9040, + "time_per_iteration": 2.5517256259918213 + }, + { + "auxiliary_loss_clip": 0.06319717, + "auxiliary_loss_mlp": 0.01257021, + "balance_loss_clip": 0.06257772, + "balance_loss_mlp": 0.01255075, + "epoch": 0.5435743273711108, + "flos": 64032350768640.0, + "grad_norm": 0.6699998863594594, + "language_loss": 0.52323502, + "learning_rate": 1.815075484268074e-06, + "loss": 0.59900236, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.61865234, + "router_z_loss_mlp": 0.0194397, + "step": 9041, + "time_per_iteration": 3.166306972503662 + }, + { + "auxiliary_loss_clip": 0.06428386, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01254687, + "epoch": 0.5436344506237788, + "flos": 25125897490560.0, + "grad_norm": 1.7575616905304456, + "language_loss": 0.762761, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.83969998, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10821533, + "step": 9042, + "time_per_iteration": 2.5450282096862793 + }, + { + "auxiliary_loss_clip": 0.0642225, + "auxiliary_loss_mlp": 0.01265245, + "balance_loss_clip": 0.06278253, + "balance_loss_mlp": 0.01254176, + "epoch": 0.5436945738764467, + "flos": 19579017381120.0, + "grad_norm": 2.3576554691894054, + "language_loss": 0.6770978, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.75397277, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11065674, + "step": 9043, + "time_per_iteration": 2.5310070514678955 + }, + { + "auxiliary_loss_clip": 0.06421092, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01256065, + "epoch": 0.5437546971291147, + "flos": 21148937879040.0, + "grad_norm": 1.5176966924106092, + "language_loss": 0.84091616, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.91779459, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10699463, + "step": 9044, + "time_per_iteration": 2.4937691688537598 + }, + { + "auxiliary_loss_clip": 0.06427783, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01256056, + "epoch": 0.5438148203817826, + "flos": 25125645928320.0, + "grad_norm": 1.559720453478778, + "language_loss": 0.62531364, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.70227116, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11914062, + "step": 9045, + "time_per_iteration": 2.558842182159424 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01267999, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01257312, + "epoch": 0.5438749436344507, + "flos": 23009614444800.0, + "grad_norm": 1.4475609839642107, + "language_loss": 0.70189548, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.77882719, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10687256, + "step": 9046, + "time_per_iteration": 2.546400785446167 + }, + { + "auxiliary_loss_clip": 0.06422587, + "auxiliary_loss_mlp": 0.01263416, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01253087, + "epoch": 0.5439350668871186, + "flos": 15492458229120.0, + "grad_norm": 1.7829079763234368, + "language_loss": 0.77310658, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.84996659, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10333252, + "step": 9047, + "time_per_iteration": 2.5223042964935303 + }, + { + "auxiliary_loss_clip": 0.06424624, + "auxiliary_loss_mlp": 0.01269137, + "balance_loss_clip": 0.06278106, + "balance_loss_mlp": 0.01257598, + "epoch": 0.5439951901397866, + "flos": 17244164160000.0, + "grad_norm": 2.1796692597227363, + "language_loss": 0.73181236, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.80874991, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11547852, + "step": 9048, + "time_per_iteration": 2.4901275634765625 + }, + { + "auxiliary_loss_clip": 0.06419719, + "auxiliary_loss_mlp": 0.01268414, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256773, + "epoch": 0.5440553133924545, + "flos": 18666945688320.0, + "grad_norm": 2.2913555210162535, + "language_loss": 0.93342638, + "learning_rate": 1.8119733635055076e-06, + "loss": 1.01030767, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11639404, + "step": 9049, + "time_per_iteration": 2.5185091495513916 + }, + { + "auxiliary_loss_clip": 0.0641875, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01257155, + "epoch": 0.5441154366451225, + "flos": 27129813811200.0, + "grad_norm": 1.6778604645700708, + "language_loss": 0.74161297, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.81847489, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10284424, + "step": 9050, + "time_per_iteration": 2.551227331161499 + }, + { + "auxiliary_loss_clip": 0.06424956, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01257684, + "epoch": 0.5441755598977904, + "flos": 26000890951680.0, + "grad_norm": 1.7704942450323604, + "language_loss": 0.67003465, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.74696958, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10852051, + "step": 9051, + "time_per_iteration": 2.586360454559326 + }, + { + "auxiliary_loss_clip": 0.06422283, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06277864, + "balance_loss_mlp": 0.01253629, + "epoch": 0.5442356831504584, + "flos": 32388327694080.0, + "grad_norm": 1.6805683860476124, + "language_loss": 0.68003166, + "learning_rate": 1.810810185460011e-06, + "loss": 0.75689662, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10583496, + "step": 9052, + "time_per_iteration": 2.595308303833008 + }, + { + "auxiliary_loss_clip": 0.0642236, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06275343, + "balance_loss_mlp": 0.01255413, + "epoch": 0.5442958064031264, + "flos": 24170123093760.0, + "grad_norm": 1.9713868762163456, + "language_loss": 0.93283188, + "learning_rate": 1.810422473773436e-06, + "loss": 1.0097276, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9053, + "time_per_iteration": 2.5700409412384033 + }, + { + "auxiliary_loss_clip": 0.06427357, + "auxiliary_loss_mlp": 0.0127068, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01258509, + "epoch": 0.5443559296557944, + "flos": 18769669194240.0, + "grad_norm": 1.9808667763978582, + "language_loss": 0.83683395, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.91381431, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1217041, + "step": 9054, + "time_per_iteration": 2.4873886108398438 + }, + { + "auxiliary_loss_clip": 0.06424912, + "auxiliary_loss_mlp": 0.01271948, + "balance_loss_clip": 0.06277627, + "balance_loss_mlp": 0.01260021, + "epoch": 0.5444160529084624, + "flos": 22638245149440.0, + "grad_norm": 1.9496494567304603, + "language_loss": 0.68541598, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.76238453, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11920166, + "step": 9055, + "time_per_iteration": 2.5629093647003174 + }, + { + "auxiliary_loss_clip": 0.06326497, + "auxiliary_loss_mlp": 0.01261063, + "balance_loss_clip": 0.06264114, + "balance_loss_mlp": 0.01259381, + "epoch": 0.5444761761611303, + "flos": 69693106976640.0, + "grad_norm": 0.7193405715621726, + "language_loss": 0.57599837, + "learning_rate": 1.80925938190531e-06, + "loss": 0.65187401, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01686096, + "step": 9056, + "time_per_iteration": 3.1249008178710938 + }, + { + "auxiliary_loss_clip": 0.06428131, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01255676, + "epoch": 0.5445362994137983, + "flos": 14282922142080.0, + "grad_norm": 1.7879789013056906, + "language_loss": 0.69611216, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.77306819, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11798096, + "step": 9057, + "time_per_iteration": 2.498568296432495 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5445964226664662, + "flos": 28993802613120.0, + "grad_norm": 1.9346963255645138, + "language_loss": 0.75279379, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.8297199, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11578369, + "step": 9058, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06324711, + "auxiliary_loss_mlp": 0.01255513, + "balance_loss_clip": 0.06262248, + "balance_loss_mlp": 0.01253708, + "epoch": 0.5446565459191343, + "flos": 68642323649280.0, + "grad_norm": 0.781118187376451, + "language_loss": 0.62576413, + "learning_rate": 1.808096355133312e-06, + "loss": 0.7015664, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01800537, + "step": 9059, + "time_per_iteration": 4.5610737800598145 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06278148, + "balance_loss_mlp": 0.01257993, + "epoch": 0.5447166691718022, + "flos": 16221989802240.0, + "grad_norm": 1.8006783567998876, + "language_loss": 0.79601544, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.87291771, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10961914, + "step": 9060, + "time_per_iteration": 2.511836290359497 + }, + { + "auxiliary_loss_clip": 0.06426552, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.0125454, + "epoch": 0.5447767924244702, + "flos": 25856225677440.0, + "grad_norm": 1.542760917466334, + "language_loss": 0.80138546, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.87831336, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11700439, + "step": 9061, + "time_per_iteration": 2.5398924350738525 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06280909, + "balance_loss_mlp": 0.01255221, + "epoch": 0.5448369156771381, + "flos": 19682998698240.0, + "grad_norm": 1.6196021204279303, + "language_loss": 0.87203825, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.94895482, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10772705, + "step": 9062, + "time_per_iteration": 4.0366997718811035 + }, + { + "auxiliary_loss_clip": 0.06433238, + "auxiliary_loss_mlp": 0.01268748, + "balance_loss_clip": 0.0628314, + "balance_loss_mlp": 0.01256392, + "epoch": 0.5448970389298061, + "flos": 19287925896960.0, + "grad_norm": 1.7163800985020743, + "language_loss": 0.82674021, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.90376008, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12359619, + "step": 9063, + "time_per_iteration": 2.5397801399230957 + }, + { + "auxiliary_loss_clip": 0.06429115, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 0.0628127, + "balance_loss_mlp": 0.01257264, + "epoch": 0.544957162182474, + "flos": 20997270789120.0, + "grad_norm": 1.590898869425655, + "language_loss": 0.63855612, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.71554273, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1227417, + "step": 9064, + "time_per_iteration": 2.511350631713867 + }, + { + "auxiliary_loss_clip": 0.06432661, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06282693, + "balance_loss_mlp": 0.01251863, + "epoch": 0.545017285435142, + "flos": 25381671678720.0, + "grad_norm": 1.596100575558465, + "language_loss": 0.80746907, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.88443542, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12115479, + "step": 9065, + "time_per_iteration": 2.589707136154175 + }, + { + "auxiliary_loss_clip": 0.06425799, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06280494, + "balance_loss_mlp": 0.01255916, + "epoch": 0.54507740868781, + "flos": 19140661146240.0, + "grad_norm": 1.9404249818077939, + "language_loss": 0.78152055, + "learning_rate": 1.805382881379827e-06, + "loss": 0.85844183, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10412598, + "step": 9066, + "time_per_iteration": 2.5037317276000977 + }, + { + "auxiliary_loss_clip": 0.06434928, + "auxiliary_loss_mlp": 0.01268701, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256714, + "epoch": 0.545137531940478, + "flos": 26256958629120.0, + "grad_norm": 1.5302055737642422, + "language_loss": 0.76331961, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.84035593, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11993408, + "step": 9067, + "time_per_iteration": 4.019241571426392 + }, + { + "auxiliary_loss_clip": 0.06438933, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06285474, + "balance_loss_mlp": 0.01255685, + "epoch": 0.545197655193146, + "flos": 37563880435200.0, + "grad_norm": 1.8087199149855477, + "language_loss": 0.62992573, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.70699894, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12701416, + "step": 9068, + "time_per_iteration": 2.6678848266601562 + }, + { + "auxiliary_loss_clip": 0.06424262, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06280495, + "balance_loss_mlp": 0.01256163, + "epoch": 0.5452577784458139, + "flos": 26038430380800.0, + "grad_norm": 1.5391820181686233, + "language_loss": 0.72328687, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.80020058, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10949707, + "step": 9069, + "time_per_iteration": 2.555837631225586 + }, + { + "auxiliary_loss_clip": 0.06424727, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01256723, + "epoch": 0.5453179016984819, + "flos": 17644729403520.0, + "grad_norm": 1.699483734463513, + "language_loss": 0.74651837, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.82343948, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10662842, + "step": 9070, + "time_per_iteration": 2.493806838989258 + }, + { + "auxiliary_loss_clip": 0.06424981, + "auxiliary_loss_mlp": 0.01264741, + "balance_loss_clip": 0.06277809, + "balance_loss_mlp": 0.01253839, + "epoch": 0.5453780249511498, + "flos": 23222524469760.0, + "grad_norm": 1.8987434929949667, + "language_loss": 0.61238426, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.68928152, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10900879, + "step": 9071, + "time_per_iteration": 2.522620677947998 + }, + { + "auxiliary_loss_clip": 0.06331067, + "auxiliary_loss_mlp": 0.01252658, + "balance_loss_clip": 0.06269144, + "balance_loss_mlp": 0.01250867, + "epoch": 0.5454381482038179, + "flos": 68719163443200.0, + "grad_norm": 0.6892933067721945, + "language_loss": 0.57065922, + "learning_rate": 1.80305733435899e-06, + "loss": 0.64649647, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.01786804, + "step": 9072, + "time_per_iteration": 3.235288381576538 + }, + { + "auxiliary_loss_clip": 0.06422395, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06280763, + "balance_loss_mlp": 0.01257424, + "epoch": 0.5454982714564858, + "flos": 13265569393920.0, + "grad_norm": 1.8411374110080903, + "language_loss": 0.69644904, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.77335626, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10906982, + "step": 9073, + "time_per_iteration": 2.476053237915039 + }, + { + "auxiliary_loss_clip": 0.06421326, + "auxiliary_loss_mlp": 0.01272164, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01261477, + "epoch": 0.5455583947091538, + "flos": 21842439396480.0, + "grad_norm": 1.836952800264558, + "language_loss": 0.71413183, + "learning_rate": 1.802282211606627e-06, + "loss": 0.79106677, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10687256, + "step": 9074, + "time_per_iteration": 3.981220006942749 + }, + { + "auxiliary_loss_clip": 0.06424403, + "auxiliary_loss_mlp": 0.01266647, + "balance_loss_clip": 0.06278551, + "balance_loss_mlp": 0.01255364, + "epoch": 0.5456185179618217, + "flos": 17822489840640.0, + "grad_norm": 1.975994190229167, + "language_loss": 0.68697762, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.76388818, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.112854, + "step": 9075, + "time_per_iteration": 2.506155490875244 + }, + { + "auxiliary_loss_clip": 0.06425694, + "auxiliary_loss_mlp": 0.01265713, + "balance_loss_clip": 0.06281726, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5456786412144897, + "flos": 21075787664640.0, + "grad_norm": 1.6135772994791406, + "language_loss": 0.80784404, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.88475811, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10778809, + "step": 9076, + "time_per_iteration": 2.538940906524658 + }, + { + "auxiliary_loss_clip": 0.06430642, + "auxiliary_loss_mlp": 0.01272688, + "balance_loss_clip": 0.06283286, + "balance_loss_mlp": 0.01261005, + "epoch": 0.5457387644671576, + "flos": 23301712177920.0, + "grad_norm": 1.7804219771063188, + "language_loss": 0.80408549, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.88111883, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11682129, + "step": 9077, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.06424201, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06278477, + "balance_loss_mlp": 0.0125698, + "epoch": 0.5457988877198257, + "flos": 21623575731840.0, + "grad_norm": 1.8316897806182997, + "language_loss": 0.67871404, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.75563186, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1060791, + "step": 9078, + "time_per_iteration": 2.5634307861328125 + }, + { + "auxiliary_loss_clip": 0.06428619, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.0628078, + "balance_loss_mlp": 0.01256722, + "epoch": 0.5458590109724936, + "flos": 23768174257920.0, + "grad_norm": 2.0367985655242116, + "language_loss": 0.81582344, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8927964, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9079, + "time_per_iteration": 2.563260078430176 + }, + { + "auxiliary_loss_clip": 0.06434448, + "auxiliary_loss_mlp": 0.01267346, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01255341, + "epoch": 0.5459191342251616, + "flos": 24430928526720.0, + "grad_norm": 1.7111364231373303, + "language_loss": 0.76216662, + "learning_rate": 1.799957023759277e-06, + "loss": 0.83918452, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12011719, + "step": 9080, + "time_per_iteration": 2.538072347640991 + }, + { + "auxiliary_loss_clip": 0.06429628, + "auxiliary_loss_mlp": 0.0126983, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.0125816, + "epoch": 0.5459792574778296, + "flos": 23629756112640.0, + "grad_norm": 1.9762884364861095, + "language_loss": 0.83489871, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.91189325, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11669922, + "step": 9081, + "time_per_iteration": 2.583111047744751 + }, + { + "auxiliary_loss_clip": 0.06430145, + "auxiliary_loss_mlp": 0.01267495, + "balance_loss_clip": 0.0628006, + "balance_loss_mlp": 0.01256552, + "epoch": 0.5460393807304975, + "flos": 19141583541120.0, + "grad_norm": 2.327386206353707, + "language_loss": 0.70079756, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.77777398, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.10943604, + "step": 9082, + "time_per_iteration": 2.5038371086120605 + }, + { + "auxiliary_loss_clip": 0.06421287, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06277952, + "balance_loss_mlp": 0.01253959, + "epoch": 0.5460995039831655, + "flos": 35927308414080.0, + "grad_norm": 1.8952773157154152, + "language_loss": 0.66865891, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.74552357, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11224365, + "step": 9083, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.06418573, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256019, + "epoch": 0.5461596272358334, + "flos": 26766242945280.0, + "grad_norm": 1.5423197483893423, + "language_loss": 0.7895304, + "learning_rate": 1.798407050044766e-06, + "loss": 0.86638033, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10406494, + "step": 9084, + "time_per_iteration": 2.5392911434173584 + }, + { + "auxiliary_loss_clip": 0.06427852, + "auxiliary_loss_mlp": 0.01262899, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01252004, + "epoch": 0.5462197504885015, + "flos": 20892870201600.0, + "grad_norm": 1.8818428979315067, + "language_loss": 0.75159836, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.82850587, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.10900879, + "step": 9085, + "time_per_iteration": 2.5238590240478516 + }, + { + "auxiliary_loss_clip": 0.06428534, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01252995, + "epoch": 0.5462798737411694, + "flos": 25810887818880.0, + "grad_norm": 1.69825848629267, + "language_loss": 0.74606055, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.82299185, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1161499, + "step": 9086, + "time_per_iteration": 2.5416669845581055 + }, + { + "auxiliary_loss_clip": 0.06424639, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06277122, + "balance_loss_mlp": 0.01255834, + "epoch": 0.5463399969938374, + "flos": 25782027287040.0, + "grad_norm": 1.4075791244754594, + "language_loss": 0.76979077, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.84671181, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9087, + "time_per_iteration": 2.5764284133911133 + }, + { + "auxiliary_loss_clip": 0.0642488, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_clip": 0.06278133, + "balance_loss_mlp": 0.01258088, + "epoch": 0.5464001202465053, + "flos": 18849234245760.0, + "grad_norm": 1.6014949266825944, + "language_loss": 0.77368462, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.85064179, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12744141, + "step": 9088, + "time_per_iteration": 2.4971888065338135 + }, + { + "auxiliary_loss_clip": 0.06317829, + "auxiliary_loss_mlp": 0.01258554, + "balance_loss_clip": 0.062563, + "balance_loss_mlp": 0.0125685, + "epoch": 0.5464602434991733, + "flos": 69070281978240.0, + "grad_norm": 0.7120973935253039, + "language_loss": 0.57630938, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.6520732, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01708984, + "step": 9089, + "time_per_iteration": 3.251268148422241 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01270687, + "balance_loss_clip": 0.06279282, + "balance_loss_mlp": 0.01258945, + "epoch": 0.5465203667518412, + "flos": 27566870307840.0, + "grad_norm": 1.7671189132091156, + "language_loss": 0.77121699, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.84822339, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11743164, + "step": 9090, + "time_per_iteration": 2.5513298511505127 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06277205, + "balance_loss_mlp": 0.01257268, + "epoch": 0.5465804900045093, + "flos": 21215757110400.0, + "grad_norm": 1.8390444270451474, + "language_loss": 0.73801088, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.81499445, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12670898, + "step": 9091, + "time_per_iteration": 2.5593018531799316 + }, + { + "auxiliary_loss_clip": 0.06426038, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06278463, + "balance_loss_mlp": 0.01255948, + "epoch": 0.5466406132571772, + "flos": 22495005394560.0, + "grad_norm": 3.020884161734631, + "language_loss": 0.77827132, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.85521269, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12164307, + "step": 9092, + "time_per_iteration": 2.5000102519989014 + }, + { + "auxiliary_loss_clip": 0.06431385, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06280962, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5467007365098452, + "flos": 17681598000000.0, + "grad_norm": 2.033807673433485, + "language_loss": 0.75258666, + "learning_rate": 1.794920057818476e-06, + "loss": 0.82956254, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11627197, + "step": 9093, + "time_per_iteration": 2.5118560791015625 + }, + { + "auxiliary_loss_clip": 0.06426246, + "auxiliary_loss_mlp": 0.01271687, + "balance_loss_clip": 0.06277527, + "balance_loss_mlp": 0.01258634, + "epoch": 0.5467608597625132, + "flos": 15703146120960.0, + "grad_norm": 3.7072671758327993, + "language_loss": 0.69514894, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.77212822, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.13067627, + "step": 9094, + "time_per_iteration": 2.471296787261963 + }, + { + "auxiliary_loss_clip": 0.06427498, + "auxiliary_loss_mlp": 0.01268457, + "balance_loss_clip": 0.06281194, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5468209830151811, + "flos": 24319106853120.0, + "grad_norm": 3.067574771902978, + "language_loss": 0.68405867, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.76101816, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10467529, + "step": 9095, + "time_per_iteration": 2.559969186782837 + }, + { + "auxiliary_loss_clip": 0.06427877, + "auxiliary_loss_mlp": 0.01266121, + "balance_loss_clip": 0.06280283, + "balance_loss_mlp": 0.01255058, + "epoch": 0.5468811062678491, + "flos": 29173575548160.0, + "grad_norm": 1.4017188918581747, + "language_loss": 0.67021394, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.747154, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11065674, + "step": 9096, + "time_per_iteration": 2.5755646228790283 + }, + { + "auxiliary_loss_clip": 0.06321621, + "auxiliary_loss_mlp": 0.01252605, + "balance_loss_clip": 0.06259765, + "balance_loss_mlp": 0.01250808, + "epoch": 0.546941229520517, + "flos": 67885078302720.0, + "grad_norm": 0.7312259601273227, + "language_loss": 0.57564938, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.65139174, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01792908, + "step": 9097, + "time_per_iteration": 3.239208698272705 + }, + { + "auxiliary_loss_clip": 0.06323195, + "auxiliary_loss_mlp": 0.01252523, + "balance_loss_clip": 0.06261444, + "balance_loss_mlp": 0.01250845, + "epoch": 0.5470013527731851, + "flos": 58286578993920.0, + "grad_norm": 0.8922489191245683, + "language_loss": 0.64733016, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.72308731, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01681519, + "step": 9098, + "time_per_iteration": 4.485429763793945 + }, + { + "auxiliary_loss_clip": 0.06427541, + "auxiliary_loss_mlp": 0.01271404, + "balance_loss_clip": 0.06279691, + "balance_loss_mlp": 0.0125937, + "epoch": 0.547061476025853, + "flos": 22972494286080.0, + "grad_norm": 1.4988253633991158, + "language_loss": 0.73256373, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.80955321, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12042236, + "step": 9099, + "time_per_iteration": 2.5771172046661377 + }, + { + "auxiliary_loss_clip": 0.06428638, + "auxiliary_loss_mlp": 0.01265011, + "balance_loss_clip": 0.06282665, + "balance_loss_mlp": 0.01254712, + "epoch": 0.547121599278521, + "flos": 29975502648960.0, + "grad_norm": 1.9003011025398133, + "language_loss": 0.73232269, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.80925912, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10296631, + "step": 9100, + "time_per_iteration": 2.613353967666626 + }, + { + "auxiliary_loss_clip": 0.06426845, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06282172, + "balance_loss_mlp": 0.01256376, + "epoch": 0.5471817225311889, + "flos": 36543760502400.0, + "grad_norm": 3.16405552040578, + "language_loss": 0.68177283, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.75872165, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11663818, + "step": 9101, + "time_per_iteration": 2.645268440246582 + }, + { + "auxiliary_loss_clip": 0.06429439, + "auxiliary_loss_mlp": 0.01267587, + "balance_loss_clip": 0.06282283, + "balance_loss_mlp": 0.01256482, + "epoch": 0.5472418457838569, + "flos": 25782278849280.0, + "grad_norm": 1.6236525701759785, + "language_loss": 0.78028667, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.85725689, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 9102, + "time_per_iteration": 4.018383264541626 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01265935, + "balance_loss_clip": 0.06284064, + "balance_loss_mlp": 0.01255659, + "epoch": 0.5473019690365248, + "flos": 27894453045120.0, + "grad_norm": 1.4050316255430886, + "language_loss": 0.72370696, + "learning_rate": 1.791046361258413e-06, + "loss": 0.80061954, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1027832, + "step": 9103, + "time_per_iteration": 2.613557815551758 + }, + { + "auxiliary_loss_clip": 0.06427938, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06282217, + "balance_loss_mlp": 0.01257237, + "epoch": 0.5473620922891929, + "flos": 57644551411200.0, + "grad_norm": 1.2696818989696173, + "language_loss": 0.65471172, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.73167711, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11352539, + "step": 9104, + "time_per_iteration": 2.8648996353149414 + }, + { + "auxiliary_loss_clip": 0.0643408, + "auxiliary_loss_mlp": 0.01271697, + "balance_loss_clip": 0.06284557, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5474222155418608, + "flos": 19360069862400.0, + "grad_norm": 1.73787664165883, + "language_loss": 0.8214826, + "learning_rate": 1.790271716558888e-06, + "loss": 0.89854038, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12536621, + "step": 9105, + "time_per_iteration": 2.5110819339752197 + }, + { + "auxiliary_loss_clip": 0.06424334, + "auxiliary_loss_mlp": 0.01267412, + "balance_loss_clip": 0.06280238, + "balance_loss_mlp": 0.01256474, + "epoch": 0.5474823387945288, + "flos": 25127700353280.0, + "grad_norm": 1.5738849579324676, + "language_loss": 0.80505264, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.88197005, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10943604, + "step": 9106, + "time_per_iteration": 2.545797824859619 + }, + { + "auxiliary_loss_clip": 0.0642664, + "auxiliary_loss_mlp": 0.01267343, + "balance_loss_clip": 0.06280842, + "balance_loss_mlp": 0.01256334, + "epoch": 0.5475424620471967, + "flos": 18009977351040.0, + "grad_norm": 1.8936776188065845, + "language_loss": 0.69983113, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.77677101, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11010742, + "step": 9107, + "time_per_iteration": 3.930511474609375 + }, + { + "auxiliary_loss_clip": 0.06431143, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06281775, + "balance_loss_mlp": 0.01252438, + "epoch": 0.5476025852998647, + "flos": 22315819438080.0, + "grad_norm": 1.6441057037047366, + "language_loss": 0.63668221, + "learning_rate": 1.789109809193197e-06, + "loss": 0.71363103, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11309814, + "step": 9108, + "time_per_iteration": 2.548469305038452 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06281575, + "balance_loss_mlp": 0.01254632, + "epoch": 0.5476627085525327, + "flos": 20126679667200.0, + "grad_norm": 1.6544017163405356, + "language_loss": 0.75096864, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.82789409, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10174561, + "step": 9109, + "time_per_iteration": 2.505537748336792 + }, + { + "auxiliary_loss_clip": 0.06426554, + "auxiliary_loss_mlp": 0.01271245, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259235, + "epoch": 0.5477228318052006, + "flos": 17718382742400.0, + "grad_norm": 1.7609925306613563, + "language_loss": 0.78101015, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.85798812, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.12005615, + "step": 9110, + "time_per_iteration": 2.5898001194000244 + }, + { + "auxiliary_loss_clip": 0.0642444, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01253948, + "epoch": 0.5477829550578687, + "flos": 25856057969280.0, + "grad_norm": 1.4117567478996924, + "language_loss": 0.71281165, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.78970265, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10699463, + "step": 9111, + "time_per_iteration": 2.5514800548553467 + }, + { + "auxiliary_loss_clip": 0.06428348, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06282744, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5478430783105366, + "flos": 23046399187200.0, + "grad_norm": 1.7318252125729088, + "language_loss": 0.71129775, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.7882387, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1171875, + "step": 9112, + "time_per_iteration": 2.5733911991119385 + }, + { + "auxiliary_loss_clip": 0.06428306, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.0125412, + "epoch": 0.5479032015632046, + "flos": 16076821403520.0, + "grad_norm": 1.865243038866792, + "language_loss": 0.88150853, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.95844346, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1105957, + "step": 9113, + "time_per_iteration": 4.03569483757019 + }, + { + "auxiliary_loss_clip": 0.06427854, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.0628054, + "balance_loss_mlp": 0.01254171, + "epoch": 0.5479633248158725, + "flos": 24285382784640.0, + "grad_norm": 1.9056802782338742, + "language_loss": 0.73404038, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.81097698, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9114, + "time_per_iteration": 2.552778959274292 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5480234480685405, + "flos": 26365216504320.0, + "grad_norm": 1.4540698273743113, + "language_loss": 0.72457099, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.80148405, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10662842, + "step": 9115, + "time_per_iteration": 2.5838403701782227 + }, + { + "auxiliary_loss_clip": 0.06436512, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06284098, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5480835713212084, + "flos": 22061722331520.0, + "grad_norm": 1.7541916767056687, + "language_loss": 0.72373956, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.80078137, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.1161499, + "step": 9116, + "time_per_iteration": 2.5292439460754395 + }, + { + "auxiliary_loss_clip": 0.06426133, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5481436945738765, + "flos": 25308018339840.0, + "grad_norm": 1.941043285146296, + "language_loss": 0.76906073, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.84599322, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9117, + "time_per_iteration": 2.5854122638702393 + }, + { + "auxiliary_loss_clip": 0.06421119, + "auxiliary_loss_mlp": 0.01264207, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01253532, + "epoch": 0.5482038178265444, + "flos": 33588807540480.0, + "grad_norm": 1.613198613591587, + "language_loss": 0.62954283, + "learning_rate": 1.785237306671674e-06, + "loss": 0.7063961, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10675049, + "step": 9118, + "time_per_iteration": 2.61136531829834 + }, + { + "auxiliary_loss_clip": 0.06429429, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.0125436, + "epoch": 0.5482639410792124, + "flos": 19032235562880.0, + "grad_norm": 1.6774564392555322, + "language_loss": 0.79138243, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.86833954, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11920166, + "step": 9119, + "time_per_iteration": 2.5309953689575195 + }, + { + "auxiliary_loss_clip": 0.06425598, + "auxiliary_loss_mlp": 0.01271106, + "balance_loss_clip": 0.06281713, + "balance_loss_mlp": 0.0126033, + "epoch": 0.5483240643318803, + "flos": 25417282464000.0, + "grad_norm": 1.5630724809093546, + "language_loss": 0.82719064, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.9041577, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10772705, + "step": 9120, + "time_per_iteration": 2.551790952682495 + }, + { + "auxiliary_loss_clip": 0.06432922, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01253292, + "epoch": 0.5483841875845483, + "flos": 21472705255680.0, + "grad_norm": 1.7308751336861314, + "language_loss": 0.80248237, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.87946028, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11566162, + "step": 9121, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.06429829, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06280297, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5484443108372163, + "flos": 24753060748800.0, + "grad_norm": 1.8214688446413962, + "language_loss": 0.6171329, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.69410121, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11352539, + "step": 9122, + "time_per_iteration": 2.536548614501953 + }, + { + "auxiliary_loss_clip": 0.06426375, + "auxiliary_loss_mlp": 0.01268013, + "balance_loss_clip": 0.06283108, + "balance_loss_mlp": 0.0125729, + "epoch": 0.5485044340898843, + "flos": 25382594073600.0, + "grad_norm": 1.6758320366866328, + "language_loss": 0.71812153, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7950654, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1071167, + "step": 9123, + "time_per_iteration": 2.563128709793091 + }, + { + "auxiliary_loss_clip": 0.06422795, + "auxiliary_loss_mlp": 0.01264644, + "balance_loss_clip": 0.06277866, + "balance_loss_mlp": 0.01254839, + "epoch": 0.5485645573425523, + "flos": 12646140485760.0, + "grad_norm": 2.0499300220900367, + "language_loss": 0.83466411, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.91153848, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.09802246, + "step": 9124, + "time_per_iteration": 2.4774932861328125 + }, + { + "auxiliary_loss_clip": 0.06423289, + "auxiliary_loss_mlp": 0.01272789, + "balance_loss_clip": 0.06280372, + "balance_loss_mlp": 0.01262054, + "epoch": 0.5486246805952202, + "flos": 28336918129920.0, + "grad_norm": 1.5704023496451165, + "language_loss": 0.80787551, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.88483626, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10736084, + "step": 9125, + "time_per_iteration": 2.6640827655792236 + }, + { + "auxiliary_loss_clip": 0.06429766, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.0125558, + "epoch": 0.5486848038478882, + "flos": 16805598289920.0, + "grad_norm": 1.778522251586277, + "language_loss": 0.74475932, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.82172436, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1114502, + "step": 9126, + "time_per_iteration": 2.4920494556427 + }, + { + "auxiliary_loss_clip": 0.0643461, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06284419, + "balance_loss_mlp": 0.01260383, + "epoch": 0.5487449271005561, + "flos": 17241606610560.0, + "grad_norm": 2.5065680491325217, + "language_loss": 0.66843152, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.74549675, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11535645, + "step": 9127, + "time_per_iteration": 2.498995304107666 + }, + { + "auxiliary_loss_clip": 0.0642729, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06281507, + "balance_loss_mlp": 0.01257072, + "epoch": 0.5488050503532241, + "flos": 17345462146560.0, + "grad_norm": 1.8347258108428224, + "language_loss": 0.83430481, + "learning_rate": 1.781365618532181e-06, + "loss": 0.91127241, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1239624, + "step": 9128, + "time_per_iteration": 2.4851553440093994 + }, + { + "auxiliary_loss_clip": 0.06423862, + "auxiliary_loss_mlp": 0.01267411, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01256032, + "epoch": 0.548865173605892, + "flos": 17245044627840.0, + "grad_norm": 1.9721748285442382, + "language_loss": 0.73992771, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.81684041, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1138916, + "step": 9129, + "time_per_iteration": 2.5088050365448 + }, + { + "auxiliary_loss_clip": 0.06436306, + "auxiliary_loss_mlp": 0.0126816, + "balance_loss_clip": 0.0628598, + "balance_loss_mlp": 0.01256108, + "epoch": 0.5489252968585601, + "flos": 17462398919040.0, + "grad_norm": 2.1982698674747745, + "language_loss": 0.63327444, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.7103191, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12054443, + "step": 9130, + "time_per_iteration": 2.4861414432525635 + }, + { + "auxiliary_loss_clip": 0.06431893, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06281481, + "balance_loss_mlp": 0.01255046, + "epoch": 0.548985420111228, + "flos": 26330653895040.0, + "grad_norm": 1.729948569228587, + "language_loss": 0.63358611, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.71057326, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 9131, + "time_per_iteration": 2.589580535888672 + }, + { + "auxiliary_loss_clip": 0.0643028, + "auxiliary_loss_mlp": 0.01268323, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01255955, + "epoch": 0.549045543363896, + "flos": 18699034602240.0, + "grad_norm": 1.7539544854272515, + "language_loss": 0.75148702, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.82847303, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12353516, + "step": 9132, + "time_per_iteration": 2.461970329284668 + }, + { + "auxiliary_loss_clip": 0.06429279, + "auxiliary_loss_mlp": 0.01266105, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5491056666165639, + "flos": 24724284071040.0, + "grad_norm": 2.6052413777049144, + "language_loss": 0.8162328, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.89318669, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.10870361, + "step": 9133, + "time_per_iteration": 2.5799684524536133 + }, + { + "auxiliary_loss_clip": 0.06426433, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06280407, + "balance_loss_mlp": 0.01259691, + "epoch": 0.5491657898692319, + "flos": 21582849847680.0, + "grad_norm": 1.8788464104374898, + "language_loss": 0.70385146, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.78082585, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11328125, + "step": 9134, + "time_per_iteration": 2.5116565227508545 + }, + { + "auxiliary_loss_clip": 0.06431407, + "auxiliary_loss_mlp": 0.01267106, + "balance_loss_clip": 0.06281983, + "balance_loss_mlp": 0.01256062, + "epoch": 0.5492259131219, + "flos": 50487653825280.0, + "grad_norm": 2.3217483044436955, + "language_loss": 0.61379695, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.69078213, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11035156, + "step": 9135, + "time_per_iteration": 2.8019859790802 + }, + { + "auxiliary_loss_clip": 0.06430922, + "auxiliary_loss_mlp": 0.01266434, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01254591, + "epoch": 0.5492860363745679, + "flos": 25126316760960.0, + "grad_norm": 1.8569102400294533, + "language_loss": 0.72833902, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.80531251, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11846924, + "step": 9136, + "time_per_iteration": 2.5313796997070312 + }, + { + "auxiliary_loss_clip": 0.06434008, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255551, + "epoch": 0.5493461596272359, + "flos": 22639670668800.0, + "grad_norm": 2.4335907064216302, + "language_loss": 0.6873585, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.76437736, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12329102, + "step": 9137, + "time_per_iteration": 2.606400489807129 + }, + { + "auxiliary_loss_clip": 0.06325421, + "auxiliary_loss_mlp": 0.01260391, + "balance_loss_clip": 0.06263588, + "balance_loss_mlp": 0.01258753, + "epoch": 0.5494062828799038, + "flos": 66169486281600.0, + "grad_norm": 0.7309885412732349, + "language_loss": 0.65176189, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.72762001, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.0164032, + "step": 9138, + "time_per_iteration": 4.603189945220947 + }, + { + "auxiliary_loss_clip": 0.06431855, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01253803, + "epoch": 0.5494664061325718, + "flos": 21112362771840.0, + "grad_norm": 1.7352131741027665, + "language_loss": 0.75659418, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.83356863, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11785889, + "step": 9139, + "time_per_iteration": 2.5063250064849854 + }, + { + "auxiliary_loss_clip": 0.06427477, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06281833, + "balance_loss_mlp": 0.01257599, + "epoch": 0.5495265293852397, + "flos": 14397846416640.0, + "grad_norm": 2.090947018102217, + "language_loss": 0.71453607, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.79149961, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11273193, + "step": 9140, + "time_per_iteration": 2.516493558883667 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06279987, + "balance_loss_mlp": 0.01254623, + "epoch": 0.5495866526379077, + "flos": 25554945922560.0, + "grad_norm": 1.591757169874098, + "language_loss": 0.76439172, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.84131408, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.1105957, + "step": 9141, + "time_per_iteration": 4.032621383666992 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06278077, + "balance_loss_mlp": 0.01257648, + "epoch": 0.5496467758905756, + "flos": 21322421758080.0, + "grad_norm": 1.9135284052459163, + "language_loss": 0.75301933, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.82990575, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10577393, + "step": 9142, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.06433351, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01259895, + "epoch": 0.5497068991432437, + "flos": 22239021571200.0, + "grad_norm": 1.7111366793556597, + "language_loss": 0.77014959, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.84720296, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12091064, + "step": 9143, + "time_per_iteration": 2.516505002975464 + }, + { + "auxiliary_loss_clip": 0.06424481, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06278251, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5497670223959116, + "flos": 18485076401280.0, + "grad_norm": 3.356687572137957, + "language_loss": 0.79973668, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.87666219, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11663818, + "step": 9144, + "time_per_iteration": 2.4832475185394287 + }, + { + "auxiliary_loss_clip": 0.0642961, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06281358, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5498271456485796, + "flos": 29212750131840.0, + "grad_norm": 1.7313830940317911, + "language_loss": 0.7154156, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.79239666, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11346436, + "step": 9145, + "time_per_iteration": 2.6261048316955566 + }, + { + "auxiliary_loss_clip": 0.06426725, + "auxiliary_loss_mlp": 0.01264568, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5498872689012475, + "flos": 34833032017920.0, + "grad_norm": 1.5682468167397778, + "language_loss": 0.70529747, + "learning_rate": 1.774398678985076e-06, + "loss": 0.78221035, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10369873, + "step": 9146, + "time_per_iteration": 4.087557315826416 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01264014, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01253923, + "epoch": 0.5499473921539155, + "flos": 25929124329600.0, + "grad_norm": 2.0128119517228305, + "language_loss": 0.64188051, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.71871173, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10095215, + "step": 9147, + "time_per_iteration": 2.5406603813171387 + }, + { + "auxiliary_loss_clip": 0.06424303, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06279408, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5500075154065835, + "flos": 22280334433920.0, + "grad_norm": 1.893989099652022, + "language_loss": 0.81534255, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.89224386, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1071167, + "step": 9148, + "time_per_iteration": 2.5051376819610596 + }, + { + "auxiliary_loss_clip": 0.06424436, + "auxiliary_loss_mlp": 0.01270935, + "balance_loss_clip": 0.06277981, + "balance_loss_mlp": 0.0125992, + "epoch": 0.5500676386592515, + "flos": 28044946177920.0, + "grad_norm": 1.7460739337347344, + "language_loss": 0.7916007, + "learning_rate": 1.773237789559453e-06, + "loss": 0.86855441, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9149, + "time_per_iteration": 2.5586931705474854 + }, + { + "auxiliary_loss_clip": 0.0642364, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01253852, + "epoch": 0.5501277619119195, + "flos": 23921602283520.0, + "grad_norm": 2.0079288501902965, + "language_loss": 0.7263124, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.80319625, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10888672, + "step": 9150, + "time_per_iteration": 2.5097196102142334 + }, + { + "auxiliary_loss_clip": 0.06428004, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06278474, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5501878851645874, + "flos": 20930199995520.0, + "grad_norm": 1.7516173490285718, + "language_loss": 0.74991822, + "learning_rate": 1.772463906245477e-06, + "loss": 0.82685369, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12103271, + "step": 9151, + "time_per_iteration": 2.4953532218933105 + }, + { + "auxiliary_loss_clip": 0.06421181, + "auxiliary_loss_mlp": 0.01264237, + "balance_loss_clip": 0.06275992, + "balance_loss_mlp": 0.01253317, + "epoch": 0.5502480084172554, + "flos": 20671155498240.0, + "grad_norm": 1.7180580365194615, + "language_loss": 0.76128006, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.83813429, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10925293, + "step": 9152, + "time_per_iteration": 2.5041630268096924 + }, + { + "auxiliary_loss_clip": 0.06418908, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06276076, + "balance_loss_mlp": 0.01254336, + "epoch": 0.5503081316699233, + "flos": 26439792238080.0, + "grad_norm": 3.86516963702514, + "language_loss": 0.82636946, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.90320837, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10650635, + "step": 9153, + "time_per_iteration": 4.000823259353638 + }, + { + "auxiliary_loss_clip": 0.06419568, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01254682, + "epoch": 0.5503682549225913, + "flos": 30637208741760.0, + "grad_norm": 1.7185020713354737, + "language_loss": 0.7442615, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.82112032, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11633301, + "step": 9154, + "time_per_iteration": 2.619478225708008 + }, + { + "auxiliary_loss_clip": 0.06431979, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01256273, + "epoch": 0.5504283781752592, + "flos": 22572096750720.0, + "grad_norm": 1.5448619232700234, + "language_loss": 0.73359931, + "learning_rate": 1.770916243273199e-06, + "loss": 0.81059402, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11212158, + "step": 9155, + "time_per_iteration": 2.5512940883636475 + }, + { + "auxiliary_loss_clip": 0.0632084, + "auxiliary_loss_mlp": 0.01252943, + "balance_loss_clip": 0.06258567, + "balance_loss_mlp": 0.01251311, + "epoch": 0.5504885014279273, + "flos": 67918634663040.0, + "grad_norm": 0.7176527357407121, + "language_loss": 0.5550307, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.63076854, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01634216, + "step": 9156, + "time_per_iteration": 3.3401191234588623 + }, + { + "auxiliary_loss_clip": 0.06423487, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06277417, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5505486246805952, + "flos": 22455705029760.0, + "grad_norm": 1.7228062733410818, + "language_loss": 0.82601535, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.90289015, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09881592, + "step": 9157, + "time_per_iteration": 2.5331945419311523 + }, + { + "auxiliary_loss_clip": 0.06433383, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06279938, + "balance_loss_mlp": 0.01255885, + "epoch": 0.5506087479332632, + "flos": 26914220455680.0, + "grad_norm": 2.384583042502796, + "language_loss": 0.7632947, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.84030461, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11743164, + "step": 9158, + "time_per_iteration": 2.5622854232788086 + }, + { + "auxiliary_loss_clip": 0.06422579, + "auxiliary_loss_mlp": 0.01265094, + "balance_loss_clip": 0.06281133, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5506688711859311, + "flos": 22936967354880.0, + "grad_norm": 1.858566635879154, + "language_loss": 0.70421213, + "learning_rate": 1.769368719290979e-06, + "loss": 0.78108883, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.09967041, + "step": 9159, + "time_per_iteration": 2.5299885272979736 + }, + { + "auxiliary_loss_clip": 0.06426555, + "auxiliary_loss_mlp": 0.01265176, + "balance_loss_clip": 0.06279982, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5507289944385991, + "flos": 29614111989120.0, + "grad_norm": 1.5102709537150474, + "language_loss": 0.68438101, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7612983, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11065674, + "step": 9160, + "time_per_iteration": 2.5797348022460938 + }, + { + "auxiliary_loss_clip": 0.06423666, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06278166, + "balance_loss_mlp": 0.01252774, + "epoch": 0.5507891176912671, + "flos": 15338736714240.0, + "grad_norm": 1.8978617290593418, + "language_loss": 0.7231009, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.79998016, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11474609, + "step": 9161, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.06425308, + "auxiliary_loss_mlp": 0.01270177, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01259472, + "epoch": 0.5508492409439351, + "flos": 26585547615360.0, + "grad_norm": 4.143741197260591, + "language_loss": 0.69514179, + "learning_rate": 1.768208168081359e-06, + "loss": 0.77209663, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10717773, + "step": 9162, + "time_per_iteration": 2.601036548614502 + }, + { + "auxiliary_loss_clip": 0.06422161, + "auxiliary_loss_mlp": 0.01271792, + "balance_loss_clip": 0.06278013, + "balance_loss_mlp": 0.01261164, + "epoch": 0.5509093641966031, + "flos": 25449832575360.0, + "grad_norm": 1.6789972101454846, + "language_loss": 0.85959709, + "learning_rate": 1.767821335237733e-06, + "loss": 0.93653667, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10638428, + "step": 9163, + "time_per_iteration": 2.539546489715576 + }, + { + "auxiliary_loss_clip": 0.06425934, + "auxiliary_loss_mlp": 0.0126949, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01258856, + "epoch": 0.550969487449271, + "flos": 18704652825600.0, + "grad_norm": 1.572244133846192, + "language_loss": 0.81101871, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.88797295, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10638428, + "step": 9164, + "time_per_iteration": 2.5266709327697754 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271715, + "balance_loss_clip": 0.06278498, + "balance_loss_mlp": 0.01260026, + "epoch": 0.551029610701939, + "flos": 22714959162240.0, + "grad_norm": 1.8760540237074659, + "language_loss": 0.73664248, + "learning_rate": 1.767047695977863e-06, + "loss": 0.81363511, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11688232, + "step": 9165, + "time_per_iteration": 2.511892318725586 + }, + { + "auxiliary_loss_clip": 0.06419477, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01258479, + "epoch": 0.5510897339546069, + "flos": 12425138542080.0, + "grad_norm": 2.0479120482719084, + "language_loss": 0.79496598, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.87185252, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10687256, + "step": 9166, + "time_per_iteration": 2.5217325687408447 + }, + { + "auxiliary_loss_clip": 0.06426241, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06279847, + "balance_loss_mlp": 0.01257232, + "epoch": 0.5511498572072749, + "flos": 18776545228800.0, + "grad_norm": 2.094065158330193, + "language_loss": 0.77047074, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.84742099, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11560059, + "step": 9167, + "time_per_iteration": 2.5210516452789307 + }, + { + "auxiliary_loss_clip": 0.06422734, + "auxiliary_loss_mlp": 0.01276612, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01264995, + "epoch": 0.5512099804599428, + "flos": 19579436651520.0, + "grad_norm": 1.8110306936777156, + "language_loss": 0.80698925, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.88398266, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11627197, + "step": 9168, + "time_per_iteration": 2.5044801235198975 + }, + { + "auxiliary_loss_clip": 0.06426235, + "auxiliary_loss_mlp": 0.01266078, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5512701037126109, + "flos": 26252053165440.0, + "grad_norm": 1.768039916500128, + "language_loss": 0.6941396, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.77106273, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10876465, + "step": 9169, + "time_per_iteration": 2.5712435245513916 + }, + { + "auxiliary_loss_clip": 0.06426435, + "auxiliary_loss_mlp": 0.01277267, + "balance_loss_clip": 0.06284146, + "balance_loss_mlp": 0.01267092, + "epoch": 0.5513302269652788, + "flos": 21951997009920.0, + "grad_norm": 1.7919633768432253, + "language_loss": 0.85238504, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.92942202, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10174561, + "step": 9170, + "time_per_iteration": 2.6517226696014404 + }, + { + "auxiliary_loss_clip": 0.06339835, + "auxiliary_loss_mlp": 0.01252247, + "balance_loss_clip": 0.06277715, + "balance_loss_mlp": 0.01250597, + "epoch": 0.5513903502179468, + "flos": 68254728589440.0, + "grad_norm": 0.7663699077680228, + "language_loss": 0.59884483, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.67476565, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01652527, + "step": 9171, + "time_per_iteration": 3.190981864929199 + }, + { + "auxiliary_loss_clip": 0.06426144, + "auxiliary_loss_mlp": 0.01271114, + "balance_loss_clip": 0.06280371, + "balance_loss_mlp": 0.01260159, + "epoch": 0.5514504734706147, + "flos": 18740221683840.0, + "grad_norm": 1.5861452481841698, + "language_loss": 0.7047599, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.78173256, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10961914, + "step": 9172, + "time_per_iteration": 2.5032176971435547 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01263218, + "epoch": 0.5515105967232827, + "flos": 22277147978880.0, + "grad_norm": 1.7175476935278873, + "language_loss": 0.76203263, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.8390317, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10498047, + "step": 9173, + "time_per_iteration": 2.577878713607788 + }, + { + "auxiliary_loss_clip": 0.06421756, + "auxiliary_loss_mlp": 0.01264421, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01253359, + "epoch": 0.5515707199759508, + "flos": 22563040510080.0, + "grad_norm": 1.5999460100016771, + "language_loss": 0.75182664, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.82868844, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11077881, + "step": 9174, + "time_per_iteration": 2.520578384399414 + }, + { + "auxiliary_loss_clip": 0.06429856, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06282729, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5516308432286187, + "flos": 28298246670720.0, + "grad_norm": 1.7068220971376928, + "language_loss": 0.72958624, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.80653572, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11206055, + "step": 9175, + "time_per_iteration": 2.5991220474243164 + }, + { + "auxiliary_loss_clip": 0.06423448, + "auxiliary_loss_mlp": 0.01272105, + "balance_loss_clip": 0.06278881, + "balance_loss_mlp": 0.01261192, + "epoch": 0.5516909664812867, + "flos": 18769417632000.0, + "grad_norm": 1.996679187528513, + "language_loss": 0.69295454, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.7699101, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10919189, + "step": 9176, + "time_per_iteration": 2.4903998374938965 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.01270885, + "balance_loss_clip": 0.06278497, + "balance_loss_mlp": 0.01260467, + "epoch": 0.5517510897339546, + "flos": 27746852878080.0, + "grad_norm": 1.714802927656724, + "language_loss": 0.71279752, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.78971648, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10418701, + "step": 9177, + "time_per_iteration": 3.9531290531158447 + }, + { + "auxiliary_loss_clip": 0.06428478, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06282966, + "balance_loss_mlp": 0.0125924, + "epoch": 0.5518112129866226, + "flos": 18410165251200.0, + "grad_norm": 1.801915682479776, + "language_loss": 0.80691963, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.8839004, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10369873, + "step": 9178, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06432515, + "auxiliary_loss_mlp": 0.01265625, + "balance_loss_clip": 0.06282209, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5518713362392905, + "flos": 25089699726720.0, + "grad_norm": 1.5622133019409348, + "language_loss": 0.7545979, + "learning_rate": 1.761633217089826e-06, + "loss": 0.83157933, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11395264, + "step": 9179, + "time_per_iteration": 2.598055124282837 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01259005, + "epoch": 0.5519314594919585, + "flos": 36547911279360.0, + "grad_norm": 1.6999645614086591, + "language_loss": 0.70073718, + "learning_rate": 1.761246535912924e-06, + "loss": 0.77768701, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1060791, + "step": 9180, + "time_per_iteration": 2.6791419982910156 + }, + { + "auxiliary_loss_clip": 0.06424871, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06279478, + "balance_loss_mlp": 0.01257121, + "epoch": 0.5519915827446265, + "flos": 20454807456000.0, + "grad_norm": 1.7661274413355668, + "language_loss": 0.67505682, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.75199056, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11376953, + "step": 9181, + "time_per_iteration": 4.004978656768799 + }, + { + "auxiliary_loss_clip": 0.06431428, + "auxiliary_loss_mlp": 0.01267631, + "balance_loss_clip": 0.06280805, + "balance_loss_mlp": 0.01256682, + "epoch": 0.5520517059972945, + "flos": 23774672949120.0, + "grad_norm": 1.9095811471330626, + "language_loss": 0.79281217, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.86980277, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10949707, + "step": 9182, + "time_per_iteration": 2.537867546081543 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.0126956, + "balance_loss_clip": 0.06281601, + "balance_loss_mlp": 0.01258259, + "epoch": 0.5521118292499624, + "flos": 22202362609920.0, + "grad_norm": 1.7640468757897252, + "language_loss": 0.83230162, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.9092862, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11303711, + "step": 9183, + "time_per_iteration": 2.5279808044433594 + }, + { + "auxiliary_loss_clip": 0.0642349, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.0627853, + "balance_loss_mlp": 0.01259632, + "epoch": 0.5521719525026304, + "flos": 23589491425920.0, + "grad_norm": 1.2800662076099543, + "language_loss": 0.67446053, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.75139618, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10437012, + "step": 9184, + "time_per_iteration": 2.684945821762085 + }, + { + "auxiliary_loss_clip": 0.06425154, + "auxiliary_loss_mlp": 0.01269673, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01258652, + "epoch": 0.5522320757552983, + "flos": 26144298414720.0, + "grad_norm": 1.5606033277911597, + "language_loss": 0.76214409, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.83909237, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11016846, + "step": 9185, + "time_per_iteration": 2.654999017715454 + }, + { + "auxiliary_loss_clip": 0.06428938, + "auxiliary_loss_mlp": 0.01270824, + "balance_loss_clip": 0.06280778, + "balance_loss_mlp": 0.01259661, + "epoch": 0.5522921990079663, + "flos": 24682258448640.0, + "grad_norm": 1.714573937603497, + "language_loss": 0.73903292, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.8160305, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1116333, + "step": 9186, + "time_per_iteration": 4.173564672470093 + }, + { + "auxiliary_loss_clip": 0.06430478, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01260032, + "epoch": 0.5523523222606344, + "flos": 22754888432640.0, + "grad_norm": 1.9890242222634391, + "language_loss": 0.66822404, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.74523699, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10784912, + "step": 9187, + "time_per_iteration": 2.5402488708496094 + }, + { + "auxiliary_loss_clip": 0.06424463, + "auxiliary_loss_mlp": 0.01272464, + "balance_loss_clip": 0.06278258, + "balance_loss_mlp": 0.01261663, + "epoch": 0.5524124455133023, + "flos": 19761976771200.0, + "grad_norm": 1.6249988598177185, + "language_loss": 0.77965587, + "learning_rate": 1.758153413657318e-06, + "loss": 0.85662508, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10803223, + "step": 9188, + "time_per_iteration": 2.4915547370910645 + }, + { + "auxiliary_loss_clip": 0.06426179, + "auxiliary_loss_mlp": 0.01274155, + "balance_loss_clip": 0.06280048, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5524725687659703, + "flos": 23301544469760.0, + "grad_norm": 1.615723789328545, + "language_loss": 0.81586993, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.89287329, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11572266, + "step": 9189, + "time_per_iteration": 2.540083885192871 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06276601, + "balance_loss_mlp": 0.0125776, + "epoch": 0.5525326920186382, + "flos": 24868907418240.0, + "grad_norm": 1.331008644060519, + "language_loss": 0.76847303, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.84535837, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1159668, + "step": 9190, + "time_per_iteration": 2.597717046737671 + }, + { + "auxiliary_loss_clip": 0.0643147, + "auxiliary_loss_mlp": 0.01272383, + "balance_loss_clip": 0.06278718, + "balance_loss_mlp": 0.01260438, + "epoch": 0.5525928152713062, + "flos": 13740710371200.0, + "grad_norm": 2.3910114977567787, + "language_loss": 0.79437977, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.87141836, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 9191, + "time_per_iteration": 2.547445774078369 + }, + { + "auxiliary_loss_clip": 0.06422585, + "auxiliary_loss_mlp": 0.01264097, + "balance_loss_clip": 0.06276913, + "balance_loss_mlp": 0.01253624, + "epoch": 0.5526529385239741, + "flos": 13075398552960.0, + "grad_norm": 2.207227027061606, + "language_loss": 0.6899271, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.76679391, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10473633, + "step": 9192, + "time_per_iteration": 2.4774858951568604 + }, + { + "auxiliary_loss_clip": 0.06421191, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257786, + "epoch": 0.5527130617766421, + "flos": 23154992478720.0, + "grad_norm": 1.5351732563488263, + "language_loss": 0.77348876, + "learning_rate": 1.756220509823588e-06, + "loss": 0.85038239, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10400391, + "step": 9193, + "time_per_iteration": 3.9115588665008545 + }, + { + "auxiliary_loss_clip": 0.06421337, + "auxiliary_loss_mlp": 0.01271193, + "balance_loss_clip": 0.06275223, + "balance_loss_mlp": 0.01260357, + "epoch": 0.55277318502931, + "flos": 21291506801280.0, + "grad_norm": 1.5126002389204065, + "language_loss": 0.79036456, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.8672899, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1083374, + "step": 9194, + "time_per_iteration": 2.5319602489471436 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06274066, + "balance_loss_mlp": 0.01258189, + "epoch": 0.5528333082819781, + "flos": 38333383205760.0, + "grad_norm": 1.8079647356103097, + "language_loss": 0.70506799, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.78203559, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11340332, + "step": 9195, + "time_per_iteration": 2.6384387016296387 + }, + { + "auxiliary_loss_clip": 0.06436112, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281462, + "balance_loss_mlp": 0.01253778, + "epoch": 0.552893431534646, + "flos": 13558799157120.0, + "grad_norm": 2.003941554047622, + "language_loss": 0.74570775, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.82273173, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12506104, + "step": 9196, + "time_per_iteration": 2.5033600330352783 + }, + { + "auxiliary_loss_clip": 0.06429259, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0628302, + "balance_loss_mlp": 0.01252656, + "epoch": 0.552953554787314, + "flos": 21944995194240.0, + "grad_norm": 1.6318385903460113, + "language_loss": 0.77179539, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.8487258, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11126709, + "step": 9197, + "time_per_iteration": 2.500624895095825 + }, + { + "auxiliary_loss_clip": 0.06421226, + "auxiliary_loss_mlp": 0.01269574, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01259316, + "epoch": 0.5530136780399819, + "flos": 43668820736640.0, + "grad_norm": 1.4562548285485233, + "language_loss": 0.76468647, + "learning_rate": 1.754287837093407e-06, + "loss": 0.84159452, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.1026001, + "step": 9198, + "time_per_iteration": 2.7432668209075928 + }, + { + "auxiliary_loss_clip": 0.06427757, + "auxiliary_loss_mlp": 0.0126746, + "balance_loss_clip": 0.06281044, + "balance_loss_mlp": 0.01256994, + "epoch": 0.5530738012926499, + "flos": 25052411859840.0, + "grad_norm": 1.5004430901507595, + "language_loss": 0.79301012, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.86996233, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10461426, + "step": 9199, + "time_per_iteration": 2.547755241394043 + }, + { + "auxiliary_loss_clip": 0.06422742, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06276976, + "balance_loss_mlp": 0.01255962, + "epoch": 0.553133924545318, + "flos": 16477680136320.0, + "grad_norm": 1.9305306774012563, + "language_loss": 0.63492346, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.71181637, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.10577393, + "step": 9200, + "time_per_iteration": 2.5127363204956055 + }, + { + "auxiliary_loss_clip": 0.06431345, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06280623, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5531940477979859, + "flos": 24612797813760.0, + "grad_norm": 1.757338852617271, + "language_loss": 0.66817963, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.74514735, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11560059, + "step": 9201, + "time_per_iteration": 2.5651068687438965 + }, + { + "auxiliary_loss_clip": 0.06425701, + "auxiliary_loss_mlp": 0.01270434, + "balance_loss_clip": 0.06278911, + "balance_loss_mlp": 0.0125871, + "epoch": 0.5532541710506539, + "flos": 22165410159360.0, + "grad_norm": 2.045638683899954, + "language_loss": 0.61266994, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.68963134, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11737061, + "step": 9202, + "time_per_iteration": 2.5841257572174072 + }, + { + "auxiliary_loss_clip": 0.06419975, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01253493, + "epoch": 0.5533142943033218, + "flos": 21403621964160.0, + "grad_norm": 1.6777411475808515, + "language_loss": 0.64766765, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.72451103, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9203, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01269086, + "balance_loss_clip": 0.06279255, + "balance_loss_mlp": 0.01258065, + "epoch": 0.5533744175559898, + "flos": 23557360584960.0, + "grad_norm": 1.630044734052438, + "language_loss": 0.63918829, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.71613109, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11022949, + "step": 9204, + "time_per_iteration": 2.5487308502197266 + }, + { + "auxiliary_loss_clip": 0.0642142, + "auxiliary_loss_mlp": 0.01264869, + "balance_loss_clip": 0.06278381, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5534345408086577, + "flos": 24068447763840.0, + "grad_norm": 1.4496742073495597, + "language_loss": 0.77449042, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.85135335, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10418701, + "step": 9205, + "time_per_iteration": 2.5445451736450195 + }, + { + "auxiliary_loss_clip": 0.06419459, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06277758, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5534946640613257, + "flos": 33781242441600.0, + "grad_norm": 1.38023808830968, + "language_loss": 0.72729224, + "learning_rate": 1.751196045993537e-06, + "loss": 0.80413151, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1036377, + "step": 9206, + "time_per_iteration": 2.7339117527008057 + }, + { + "auxiliary_loss_clip": 0.06421407, + "auxiliary_loss_mlp": 0.01265704, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01255005, + "epoch": 0.5535547873139937, + "flos": 15164707783680.0, + "grad_norm": 1.9977188658051825, + "language_loss": 0.7547437, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.83161485, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10699463, + "step": 9207, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06436527, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06285885, + "balance_loss_mlp": 0.01254493, + "epoch": 0.5536149105666617, + "flos": 16986209765760.0, + "grad_norm": 2.498092208232672, + "language_loss": 0.61888683, + "learning_rate": 1.750423192272189e-06, + "loss": 0.69591099, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.1138916, + "step": 9208, + "time_per_iteration": 2.493628740310669 + }, + { + "auxiliary_loss_clip": 0.06428279, + "auxiliary_loss_mlp": 0.01268207, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01256543, + "epoch": 0.5536750338193296, + "flos": 18155732728320.0, + "grad_norm": 2.094677241914043, + "language_loss": 0.64708155, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.72404641, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11663818, + "step": 9209, + "time_per_iteration": 2.4616804122924805 + }, + { + "auxiliary_loss_clip": 0.06424735, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06279891, + "balance_loss_mlp": 0.01260863, + "epoch": 0.5537351570719976, + "flos": 22754469162240.0, + "grad_norm": 1.8280568303571236, + "language_loss": 0.82967091, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.90663934, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11242676, + "step": 9210, + "time_per_iteration": 2.564713954925537 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.06275869, + "balance_loss_mlp": 0.01255381, + "epoch": 0.5537952803246655, + "flos": 26362658954880.0, + "grad_norm": 1.71176011345987, + "language_loss": 0.72960317, + "learning_rate": 1.74926398270663e-06, + "loss": 0.80644828, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10430908, + "step": 9211, + "time_per_iteration": 2.5312066078186035 + }, + { + "auxiliary_loss_clip": 0.06431179, + "auxiliary_loss_mlp": 0.01267507, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01256045, + "epoch": 0.5538554035773335, + "flos": 18042695170560.0, + "grad_norm": 2.3508559175952803, + "language_loss": 0.67497891, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.75196576, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11462402, + "step": 9212, + "time_per_iteration": 2.5141408443450928 + }, + { + "auxiliary_loss_clip": 0.06429373, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.0125554, + "epoch": 0.5539155268300014, + "flos": 31694323052160.0, + "grad_norm": 1.4365879651928444, + "language_loss": 0.5225575, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.59953463, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12792969, + "step": 9213, + "time_per_iteration": 2.5764448642730713 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06282363, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5539756500826695, + "flos": 15198934976640.0, + "grad_norm": 1.6892906357761146, + "language_loss": 0.85764515, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.93460202, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11303711, + "step": 9214, + "time_per_iteration": 2.5433578491210938 + }, + { + "auxiliary_loss_clip": 0.06422558, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01252333, + "epoch": 0.5540357733353375, + "flos": 26359262864640.0, + "grad_norm": 1.8961662277212366, + "language_loss": 0.70100081, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.77785456, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10491943, + "step": 9215, + "time_per_iteration": 2.548687696456909 + }, + { + "auxiliary_loss_clip": 0.06428155, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06279612, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5540958965880054, + "flos": 21329926698240.0, + "grad_norm": 1.6927060371572338, + "language_loss": 0.73713386, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.81407875, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1151123, + "step": 9216, + "time_per_iteration": 2.541210174560547 + }, + { + "auxiliary_loss_clip": 0.06421469, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01254471, + "epoch": 0.5541560198406734, + "flos": 25674020974080.0, + "grad_norm": 1.768513313341331, + "language_loss": 0.71651757, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.79338706, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11029053, + "step": 9217, + "time_per_iteration": 4.048692226409912 + }, + { + "auxiliary_loss_clip": 0.0642062, + "auxiliary_loss_mlp": 0.01262573, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01251993, + "epoch": 0.5542161430933413, + "flos": 21945246756480.0, + "grad_norm": 1.641855173543887, + "language_loss": 0.78896093, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.86579281, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10577393, + "step": 9218, + "time_per_iteration": 2.5090229511260986 + }, + { + "auxiliary_loss_clip": 0.06429659, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255023, + "epoch": 0.5542762663460093, + "flos": 19577256445440.0, + "grad_norm": 1.9145093316494244, + "language_loss": 0.72342837, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.80039406, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11889648, + "step": 9219, + "time_per_iteration": 2.6097207069396973 + }, + { + "auxiliary_loss_clip": 0.06423312, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06275792, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5543363895986773, + "flos": 19504944771840.0, + "grad_norm": 1.6265573389583097, + "language_loss": 0.7175796, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.79449117, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11035156, + "step": 9220, + "time_per_iteration": 3.953366756439209 + }, + { + "auxiliary_loss_clip": 0.0641966, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06276264, + "balance_loss_mlp": 0.01256154, + "epoch": 0.5543965128513453, + "flos": 22641808947840.0, + "grad_norm": 1.5837082117197903, + "language_loss": 0.79554594, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.8724097, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9221, + "time_per_iteration": 2.6012284755706787 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268367, + "balance_loss_clip": 0.06276818, + "balance_loss_mlp": 0.0125715, + "epoch": 0.5544566361040132, + "flos": 25996320904320.0, + "grad_norm": 1.7031606951897913, + "language_loss": 0.8378005, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.91468251, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11224365, + "step": 9222, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.06426205, + "auxiliary_loss_mlp": 0.01268401, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5545167593566812, + "flos": 28265235361920.0, + "grad_norm": 1.624171595552914, + "language_loss": 0.75644016, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.83338618, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1217041, + "step": 9223, + "time_per_iteration": 2.6189255714416504 + }, + { + "auxiliary_loss_clip": 0.06421085, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06275317, + "balance_loss_mlp": 0.01255168, + "epoch": 0.5545768826093491, + "flos": 28484266734720.0, + "grad_norm": 1.537609394832996, + "language_loss": 0.81879461, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.89567149, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11425781, + "step": 9224, + "time_per_iteration": 2.5794196128845215 + }, + { + "auxiliary_loss_clip": 0.06424309, + "auxiliary_loss_mlp": 0.01271127, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5546370058620171, + "flos": 18483860517120.0, + "grad_norm": 1.6794429489770297, + "language_loss": 0.57241935, + "learning_rate": 1.743855475904141e-06, + "loss": 0.64937371, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11688232, + "step": 9225, + "time_per_iteration": 3.9698383808135986 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.01267893, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01257009, + "epoch": 0.554697129114685, + "flos": 22937260844160.0, + "grad_norm": 1.5804786041677554, + "language_loss": 0.6778791, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.75478059, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10870361, + "step": 9226, + "time_per_iteration": 2.5307633876800537 + }, + { + "auxiliary_loss_clip": 0.06423603, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256002, + "epoch": 0.5547572523673531, + "flos": 21803348666880.0, + "grad_norm": 1.2977635143377364, + "language_loss": 0.74954712, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.82645351, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11047363, + "step": 9227, + "time_per_iteration": 2.5083706378936768 + }, + { + "auxiliary_loss_clip": 0.06423934, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06275739, + "balance_loss_mlp": 0.01254768, + "epoch": 0.5548173756200211, + "flos": 22348830746880.0, + "grad_norm": 1.524887798675916, + "language_loss": 0.73794919, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.81485081, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11444092, + "step": 9228, + "time_per_iteration": 2.555020809173584 + }, + { + "auxiliary_loss_clip": 0.06423147, + "auxiliary_loss_mlp": 0.01263866, + "balance_loss_clip": 0.06276013, + "balance_loss_mlp": 0.01253465, + "epoch": 0.554877498872689, + "flos": 17864599317120.0, + "grad_norm": 1.7043498128680434, + "language_loss": 0.76352561, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.84039581, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10400391, + "step": 9229, + "time_per_iteration": 2.4959444999694824 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01266918, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125464, + "epoch": 0.554937622125357, + "flos": 17244080305920.0, + "grad_norm": 1.4897541866361217, + "language_loss": 0.69068646, + "learning_rate": 1.741924325613172e-06, + "loss": 0.76758856, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12280273, + "step": 9230, + "time_per_iteration": 2.5090713500976562 + }, + { + "auxiliary_loss_clip": 0.06427252, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06276985, + "balance_loss_mlp": 0.01254587, + "epoch": 0.5549977453780249, + "flos": 25374082884480.0, + "grad_norm": 2.3665837136773047, + "language_loss": 0.68808627, + "learning_rate": 1.741538124855163e-06, + "loss": 0.76503003, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12554932, + "step": 9231, + "time_per_iteration": 2.5350747108459473 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01269438, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01256885, + "epoch": 0.555057868630693, + "flos": 25085548949760.0, + "grad_norm": 1.6698826084601515, + "language_loss": 0.78408533, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.86107397, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12542725, + "step": 9232, + "time_per_iteration": 4.055214881896973 + }, + { + "auxiliary_loss_clip": 0.06416719, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01255972, + "epoch": 0.5551179918833609, + "flos": 26111412887040.0, + "grad_norm": 1.627879634610194, + "language_loss": 0.83063745, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.90747154, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10723877, + "step": 9233, + "time_per_iteration": 2.6376969814300537 + }, + { + "auxiliary_loss_clip": 0.06430396, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06277359, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5551781151360289, + "flos": 19389810862080.0, + "grad_norm": 2.483522309942904, + "language_loss": 0.7549684, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.83193588, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11431885, + "step": 9234, + "time_per_iteration": 2.4859883785247803 + }, + { + "auxiliary_loss_clip": 0.06418739, + "auxiliary_loss_mlp": 0.01265554, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255129, + "epoch": 0.5552382383886968, + "flos": 21732420585600.0, + "grad_norm": 1.8065340969909298, + "language_loss": 0.64963275, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.72647566, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.10418701, + "step": 9235, + "time_per_iteration": 2.523128032684326 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06272598, + "balance_loss_mlp": 0.0125519, + "epoch": 0.5552983616413648, + "flos": 14361480944640.0, + "grad_norm": 1.6397834212981734, + "language_loss": 0.68087149, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.75775141, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11932373, + "step": 9236, + "time_per_iteration": 2.506023406982422 + }, + { + "auxiliary_loss_clip": 0.06416081, + "auxiliary_loss_mlp": 0.01266517, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01256068, + "epoch": 0.5553584848940327, + "flos": 25484730600960.0, + "grad_norm": 1.5459271274239896, + "language_loss": 0.86436939, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.94119537, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10449219, + "step": 9237, + "time_per_iteration": 2.580103874206543 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06273238, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5554186081467007, + "flos": 22170399477120.0, + "grad_norm": 1.8042242059193758, + "language_loss": 0.73774469, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.81458282, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11517334, + "step": 9238, + "time_per_iteration": 2.5031590461730957 + }, + { + "auxiliary_loss_clip": 0.0642554, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06275032, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5554787313993687, + "flos": 49757744908800.0, + "grad_norm": 1.5320503148177431, + "language_loss": 0.78384852, + "learning_rate": 1.73844887285358e-06, + "loss": 0.86077076, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10766602, + "step": 9239, + "time_per_iteration": 2.7739756107330322 + }, + { + "auxiliary_loss_clip": 0.06423195, + "auxiliary_loss_mlp": 0.01266863, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5555388546520367, + "flos": 22133908224000.0, + "grad_norm": 1.4777059666754715, + "language_loss": 0.80562818, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.88252878, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11401367, + "step": 9240, + "time_per_iteration": 2.5036380290985107 + }, + { + "auxiliary_loss_clip": 0.06419357, + "auxiliary_loss_mlp": 0.01266651, + "balance_loss_clip": 0.06273453, + "balance_loss_mlp": 0.01255142, + "epoch": 0.5555989779047047, + "flos": 24689218337280.0, + "grad_norm": 1.7126628457644222, + "language_loss": 0.65465248, + "learning_rate": 1.737676658740786e-06, + "loss": 0.73151255, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1151123, + "step": 9241, + "time_per_iteration": 2.5851833820343018 + }, + { + "auxiliary_loss_clip": 0.06422672, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06276439, + "balance_loss_mlp": 0.01252566, + "epoch": 0.5556591011573726, + "flos": 16111929064320.0, + "grad_norm": 1.8766289396676605, + "language_loss": 0.73123193, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.80809897, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11474609, + "step": 9242, + "time_per_iteration": 2.467933416366577 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01253022, + "epoch": 0.5557192244100406, + "flos": 12938825197440.0, + "grad_norm": 6.974019127266796, + "language_loss": 0.64053857, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.71743226, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12365723, + "step": 9243, + "time_per_iteration": 2.528529167175293 + }, + { + "auxiliary_loss_clip": 0.0642553, + "auxiliary_loss_mlp": 0.01269814, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.01258614, + "epoch": 0.5557793476627085, + "flos": 23118291590400.0, + "grad_norm": 3.1703508621435095, + "language_loss": 0.75212169, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.82907516, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11199951, + "step": 9244, + "time_per_iteration": 2.5159640312194824 + }, + { + "auxiliary_loss_clip": 0.06417421, + "auxiliary_loss_mlp": 0.01263368, + "balance_loss_clip": 0.06277108, + "balance_loss_mlp": 0.01252938, + "epoch": 0.5558394709153766, + "flos": 21433446817920.0, + "grad_norm": 2.161992759062338, + "language_loss": 0.74536991, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.82217783, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10437012, + "step": 9245, + "time_per_iteration": 2.5320873260498047 + }, + { + "auxiliary_loss_clip": 0.06425805, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255533, + "epoch": 0.5558995941680445, + "flos": 25084626554880.0, + "grad_norm": 2.1186554191459575, + "language_loss": 0.79345202, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.87039083, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12530518, + "step": 9246, + "time_per_iteration": 2.5617494583129883 + }, + { + "auxiliary_loss_clip": 0.06425521, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276709, + "balance_loss_mlp": 0.01258993, + "epoch": 0.5559597174207125, + "flos": 20017331688960.0, + "grad_norm": 1.8080775090170724, + "language_loss": 0.7423467, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.81930989, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11810303, + "step": 9247, + "time_per_iteration": 2.5472562313079834 + }, + { + "auxiliary_loss_clip": 0.06421669, + "auxiliary_loss_mlp": 0.01265666, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01254007, + "epoch": 0.5560198406733804, + "flos": 16841125221120.0, + "grad_norm": 2.9360607038713127, + "language_loss": 0.75686443, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.83373785, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11645508, + "step": 9248, + "time_per_iteration": 2.4991230964660645 + }, + { + "auxiliary_loss_clip": 0.06332292, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01250564, + "epoch": 0.5560799639260484, + "flos": 70719012840960.0, + "grad_norm": 0.8521249277155936, + "language_loss": 0.5948171, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.67066324, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01763916, + "step": 9249, + "time_per_iteration": 3.2450287342071533 + }, + { + "auxiliary_loss_clip": 0.06424973, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06276406, + "balance_loss_mlp": 0.01253943, + "epoch": 0.5561400871787163, + "flos": 23155244040960.0, + "grad_norm": 2.0335955894649036, + "language_loss": 0.79889202, + "learning_rate": 1.734202189316832e-06, + "loss": 0.87578869, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10748291, + "step": 9250, + "time_per_iteration": 2.5372138023376465 + }, + { + "auxiliary_loss_clip": 0.06427802, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06277002, + "balance_loss_mlp": 0.01257471, + "epoch": 0.5562002104313843, + "flos": 17572166167680.0, + "grad_norm": 3.4851408255327856, + "language_loss": 0.69400316, + "learning_rate": 1.733816187358836e-06, + "loss": 0.77097189, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11584473, + "step": 9251, + "time_per_iteration": 2.554487943649292 + }, + { + "auxiliary_loss_clip": 0.06422772, + "auxiliary_loss_mlp": 0.01265424, + "balance_loss_clip": 0.06275512, + "balance_loss_mlp": 0.01253676, + "epoch": 0.5562603336840523, + "flos": 25052328005760.0, + "grad_norm": 1.4438817767967254, + "language_loss": 0.75297302, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.82985497, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 9252, + "time_per_iteration": 2.554103374481201 + }, + { + "auxiliary_loss_clip": 0.06427599, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5563204569367203, + "flos": 29066617411200.0, + "grad_norm": 1.5076691298158018, + "language_loss": 0.72903025, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.80595708, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10980225, + "step": 9253, + "time_per_iteration": 2.5654473304748535 + }, + { + "auxiliary_loss_clip": 0.06422551, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01259161, + "epoch": 0.5563805801893883, + "flos": 22096913846400.0, + "grad_norm": 1.9717474280435598, + "language_loss": 0.83141911, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.90834075, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10455322, + "step": 9254, + "time_per_iteration": 2.5113630294799805 + }, + { + "auxiliary_loss_clip": 0.06332405, + "auxiliary_loss_mlp": 0.01255231, + "balance_loss_clip": 0.06269685, + "balance_loss_mlp": 0.01253453, + "epoch": 0.5564407034420562, + "flos": 58652623555200.0, + "grad_norm": 0.8548643960281289, + "language_loss": 0.64887053, + "learning_rate": 1.732272280610387e-06, + "loss": 0.72474694, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01777649, + "step": 9255, + "time_per_iteration": 2.980931043624878 + }, + { + "auxiliary_loss_clip": 0.06420524, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275329, + "balance_loss_mlp": 0.01254175, + "epoch": 0.5565008266947242, + "flos": 23119004350080.0, + "grad_norm": 1.731717948076331, + "language_loss": 0.69607276, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.77292871, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10900879, + "step": 9256, + "time_per_iteration": 3.9532642364501953 + }, + { + "auxiliary_loss_clip": 0.06418847, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01254551, + "epoch": 0.5565609499473921, + "flos": 21584568856320.0, + "grad_norm": 1.4749881970234011, + "language_loss": 0.76680368, + "learning_rate": 1.73150038809119e-06, + "loss": 0.84364206, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10443115, + "step": 9257, + "time_per_iteration": 2.4937705993652344 + }, + { + "auxiliary_loss_clip": 0.06425476, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.0627654, + "balance_loss_mlp": 0.01262735, + "epoch": 0.5566210732000602, + "flos": 18375602641920.0, + "grad_norm": 2.7130999997532563, + "language_loss": 0.61334699, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.69033802, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10894775, + "step": 9258, + "time_per_iteration": 2.5560710430145264 + }, + { + "auxiliary_loss_clip": 0.06420255, + "auxiliary_loss_mlp": 0.01266708, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01255431, + "epoch": 0.5566811964527281, + "flos": 25710554154240.0, + "grad_norm": 1.5983859944569927, + "language_loss": 0.79631943, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.87318903, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11279297, + "step": 9259, + "time_per_iteration": 2.582550525665283 + }, + { + "auxiliary_loss_clip": 0.06421982, + "auxiliary_loss_mlp": 0.01267837, + "balance_loss_clip": 0.06275143, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5567413197053961, + "flos": 26951424468480.0, + "grad_norm": 1.7768491917262519, + "language_loss": 0.81632483, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.89322305, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10821533, + "step": 9260, + "time_per_iteration": 3.994185209274292 + }, + { + "auxiliary_loss_clip": 0.0642475, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257598, + "epoch": 0.556801442958064, + "flos": 20856965927040.0, + "grad_norm": 1.6577209620324271, + "language_loss": 0.69569898, + "learning_rate": 1.729956725348256e-06, + "loss": 0.77264518, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1227417, + "step": 9261, + "time_per_iteration": 2.558511734008789 + }, + { + "auxiliary_loss_clip": 0.06317247, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06255186, + "balance_loss_mlp": 0.01252651, + "epoch": 0.556861566210732, + "flos": 70517395918080.0, + "grad_norm": 0.7170849600938061, + "language_loss": 0.61090672, + "learning_rate": 1.729570835226108e-06, + "loss": 0.68662429, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01856995, + "step": 9262, + "time_per_iteration": 3.134216070175171 + }, + { + "auxiliary_loss_clip": 0.06422806, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01259214, + "epoch": 0.5569216894633999, + "flos": 25344216103680.0, + "grad_norm": 1.5027402480240113, + "language_loss": 0.64822662, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.72516024, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11340332, + "step": 9263, + "time_per_iteration": 2.5533127784729004 + }, + { + "auxiliary_loss_clip": 0.06420417, + "auxiliary_loss_mlp": 0.01271706, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01260679, + "epoch": 0.556981812716068, + "flos": 22645456600320.0, + "grad_norm": 1.647856593864945, + "language_loss": 0.73077464, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.80769587, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11035156, + "step": 9264, + "time_per_iteration": 2.5055153369903564 + }, + { + "auxiliary_loss_clip": 0.06421056, + "auxiliary_loss_mlp": 0.01267322, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01255765, + "epoch": 0.5570419359687359, + "flos": 11040567275520.0, + "grad_norm": 1.7723772076526776, + "language_loss": 0.7667138, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.84359753, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11553955, + "step": 9265, + "time_per_iteration": 3.964038372039795 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01273186, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01262368, + "epoch": 0.5571020592214039, + "flos": 22830218853120.0, + "grad_norm": 1.7025735740351078, + "language_loss": 0.71389985, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.79079872, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1081543, + "step": 9266, + "time_per_iteration": 2.5572071075439453 + }, + { + "auxiliary_loss_clip": 0.06418756, + "auxiliary_loss_mlp": 0.01270352, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01259701, + "epoch": 0.5571621824740719, + "flos": 22934074389120.0, + "grad_norm": 1.5846567867344512, + "language_loss": 0.68614411, + "learning_rate": 1.727641538728533e-06, + "loss": 0.76303518, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10656738, + "step": 9267, + "time_per_iteration": 2.4949660301208496 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277707, + "balance_loss_mlp": 0.01255677, + "epoch": 0.5572223057267398, + "flos": 22973416680960.0, + "grad_norm": 2.0664301257613684, + "language_loss": 0.75132561, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.82818741, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11315918, + "step": 9268, + "time_per_iteration": 2.5834717750549316 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01252184, + "epoch": 0.5572824289794078, + "flos": 20966439686400.0, + "grad_norm": 2.076388090189787, + "language_loss": 0.75247812, + "learning_rate": 1.726869892322104e-06, + "loss": 0.8293134, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10803223, + "step": 9269, + "time_per_iteration": 2.6340525150299072 + }, + { + "auxiliary_loss_clip": 0.06420279, + "auxiliary_loss_mlp": 0.01268076, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5573425522320757, + "flos": 25048806134400.0, + "grad_norm": 1.9328220368280318, + "language_loss": 0.82704222, + "learning_rate": 1.726484084647256e-06, + "loss": 0.90392578, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10986328, + "step": 9270, + "time_per_iteration": 2.6455605030059814 + }, + { + "auxiliary_loss_clip": 0.06426194, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01255657, + "epoch": 0.5574026754847438, + "flos": 23666415073920.0, + "grad_norm": 1.8553396052443616, + "language_loss": 0.79884106, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.87577355, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1138916, + "step": 9271, + "time_per_iteration": 4.060855388641357 + }, + { + "auxiliary_loss_clip": 0.0642622, + "auxiliary_loss_mlp": 0.01265728, + "balance_loss_clip": 0.0627868, + "balance_loss_mlp": 0.01254153, + "epoch": 0.5574627987374117, + "flos": 24787791066240.0, + "grad_norm": 1.7644146130703546, + "language_loss": 0.90646034, + "learning_rate": 1.725712500427442e-06, + "loss": 0.9833799, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11572266, + "step": 9272, + "time_per_iteration": 2.534665107727051 + }, + { + "auxiliary_loss_clip": 0.0641982, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.0125446, + "epoch": 0.5575229219900797, + "flos": 21841349293440.0, + "grad_norm": 1.8989818213493146, + "language_loss": 0.84368634, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.92053914, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10992432, + "step": 9273, + "time_per_iteration": 2.5200788974761963 + }, + { + "auxiliary_loss_clip": 0.06423581, + "auxiliary_loss_mlp": 0.01268606, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5575830452427476, + "flos": 27821973663360.0, + "grad_norm": 1.9193499092419828, + "language_loss": 0.75017828, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.82710016, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12548828, + "step": 9274, + "time_per_iteration": 2.548865795135498 + }, + { + "auxiliary_loss_clip": 0.06435296, + "auxiliary_loss_mlp": 0.01273341, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01260806, + "epoch": 0.5576431684954156, + "flos": 17817081252480.0, + "grad_norm": 2.8160029917848397, + "language_loss": 0.78999293, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.86707926, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12530518, + "step": 9275, + "time_per_iteration": 2.503168821334839 + }, + { + "auxiliary_loss_clip": 0.06426495, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06279385, + "balance_loss_mlp": 0.01253372, + "epoch": 0.5577032917480835, + "flos": 15492290520960.0, + "grad_norm": 1.5722489245589244, + "language_loss": 0.75639874, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.83331323, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11584473, + "step": 9276, + "time_per_iteration": 2.466275215148926 + }, + { + "auxiliary_loss_clip": 0.06423229, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06277048, + "balance_loss_mlp": 0.01256379, + "epoch": 0.5577634150007516, + "flos": 21586162083840.0, + "grad_norm": 1.8200099839217898, + "language_loss": 0.75387412, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.83078039, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11022949, + "step": 9277, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.06420221, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01255535, + "epoch": 0.5578235382534195, + "flos": 21145709496960.0, + "grad_norm": 1.5944068660293211, + "language_loss": 0.7198559, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.79672027, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10681152, + "step": 9278, + "time_per_iteration": 2.4954776763916016 + }, + { + "auxiliary_loss_clip": 0.06425839, + "auxiliary_loss_mlp": 0.01267939, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01255166, + "epoch": 0.5578836615060875, + "flos": 26512397400960.0, + "grad_norm": 1.4623548994871365, + "language_loss": 0.75693482, + "learning_rate": 1.723012284057868e-06, + "loss": 0.83387262, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12786865, + "step": 9279, + "time_per_iteration": 2.5537941455841064 + }, + { + "auxiliary_loss_clip": 0.06422286, + "auxiliary_loss_mlp": 0.01267149, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255354, + "epoch": 0.5579437847587555, + "flos": 20159439413760.0, + "grad_norm": 1.637545301877737, + "language_loss": 0.67443848, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.75133282, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11791992, + "step": 9280, + "time_per_iteration": 2.489867925643921 + }, + { + "auxiliary_loss_clip": 0.06426547, + "auxiliary_loss_mlp": 0.01266943, + "balance_loss_clip": 0.06276332, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5580039080114234, + "flos": 26109148826880.0, + "grad_norm": 1.5394249927656036, + "language_loss": 0.7336756, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.81061053, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11090088, + "step": 9281, + "time_per_iteration": 2.693004846572876 + }, + { + "auxiliary_loss_clip": 0.06420805, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5580640312640914, + "flos": 13776740426880.0, + "grad_norm": 2.347269898773066, + "language_loss": 0.75313729, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.83000439, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10662842, + "step": 9282, + "time_per_iteration": 2.472775936126709 + }, + { + "auxiliary_loss_clip": 0.06421494, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01258871, + "epoch": 0.5581241545167593, + "flos": 17681765708160.0, + "grad_norm": 1.6208158464679243, + "language_loss": 0.66451746, + "learning_rate": 1.721469534028297e-06, + "loss": 0.74143445, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11334229, + "step": 9283, + "time_per_iteration": 2.495039224624634 + }, + { + "auxiliary_loss_clip": 0.06423882, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01257489, + "epoch": 0.5581842777694274, + "flos": 19574573114880.0, + "grad_norm": 1.8440828180500004, + "language_loss": 0.83265072, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.90957028, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10583496, + "step": 9284, + "time_per_iteration": 2.479743719100952 + }, + { + "auxiliary_loss_clip": 0.06423684, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01255412, + "epoch": 0.5582444010220953, + "flos": 20601485228160.0, + "grad_norm": 2.4189186360573407, + "language_loss": 0.86142218, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.93832451, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11132812, + "step": 9285, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.06422924, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01255818, + "epoch": 0.5583045242747633, + "flos": 19141541614080.0, + "grad_norm": 2.3862114712175013, + "language_loss": 0.74476177, + "learning_rate": 1.720312582354912e-06, + "loss": 0.82165694, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10772705, + "step": 9286, + "time_per_iteration": 2.502807378768921 + }, + { + "auxiliary_loss_clip": 0.06421416, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276793, + "balance_loss_mlp": 0.01256448, + "epoch": 0.5583646475274312, + "flos": 27462050449920.0, + "grad_norm": 1.681368685974995, + "language_loss": 0.74959427, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.82648808, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11529541, + "step": 9287, + "time_per_iteration": 2.5700645446777344 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06279745, + "balance_loss_mlp": 0.01254601, + "epoch": 0.5584247707800992, + "flos": 23659580966400.0, + "grad_norm": 1.4753035778898818, + "language_loss": 0.75157738, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.82854563, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12036133, + "step": 9288, + "time_per_iteration": 2.529250383377075 + }, + { + "auxiliary_loss_clip": 0.06424332, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01254847, + "epoch": 0.5584848940327671, + "flos": 13703967555840.0, + "grad_norm": 2.2558701039351696, + "language_loss": 0.78180242, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.85871768, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12329102, + "step": 9289, + "time_per_iteration": 2.5093841552734375 + }, + { + "auxiliary_loss_clip": 0.06428449, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01255921, + "epoch": 0.5585450172854352, + "flos": 27023526506880.0, + "grad_norm": 1.7277790144481269, + "language_loss": 0.61688149, + "learning_rate": 1.718770128672817e-06, + "loss": 0.69384426, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11914062, + "step": 9290, + "time_per_iteration": 2.5534214973449707 + }, + { + "auxiliary_loss_clip": 0.0642647, + "auxiliary_loss_mlp": 0.01268365, + "balance_loss_clip": 0.06277582, + "balance_loss_mlp": 0.01256581, + "epoch": 0.5586051405381031, + "flos": 23192406126720.0, + "grad_norm": 2.1760973422208965, + "language_loss": 0.67914414, + "learning_rate": 1.7183845418764e-06, + "loss": 0.75609255, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11767578, + "step": 9291, + "time_per_iteration": 2.5376763343811035 + }, + { + "auxiliary_loss_clip": 0.0642361, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5586652637907711, + "flos": 20781551652480.0, + "grad_norm": 1.760966459417108, + "language_loss": 0.84366935, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.92057884, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11578369, + "step": 9292, + "time_per_iteration": 2.5204405784606934 + }, + { + "auxiliary_loss_clip": 0.06422292, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01257848, + "epoch": 0.5587253870434391, + "flos": 28227360516480.0, + "grad_norm": 1.8754942991534513, + "language_loss": 0.7459076, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.82281709, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10821533, + "step": 9293, + "time_per_iteration": 2.6592154502868652 + }, + { + "auxiliary_loss_clip": 0.06418014, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01254809, + "epoch": 0.558785510296107, + "flos": 26623128971520.0, + "grad_norm": 1.7285534178917525, + "language_loss": 0.72416651, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.80100018, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10546875, + "step": 9294, + "time_per_iteration": 2.538320779800415 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276325, + "balance_loss_mlp": 0.01257208, + "epoch": 0.558845633548775, + "flos": 20162919358080.0, + "grad_norm": 2.7937117268116656, + "language_loss": 0.69210899, + "learning_rate": 1.716842301625806e-06, + "loss": 0.76900959, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.109375, + "step": 9295, + "time_per_iteration": 2.5218520164489746 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.0126519, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01253776, + "epoch": 0.5589057568014429, + "flos": 24357317114880.0, + "grad_norm": 1.5440712557728564, + "language_loss": 0.80893242, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.88577229, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11419678, + "step": 9296, + "time_per_iteration": 3.9467618465423584 + }, + { + "auxiliary_loss_clip": 0.06419219, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01255019, + "epoch": 0.558965880054111, + "flos": 21111440376960.0, + "grad_norm": 1.9869508208087105, + "language_loss": 0.65690488, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.73375666, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9297, + "time_per_iteration": 2.528181791305542 + }, + { + "auxiliary_loss_clip": 0.06424123, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5590260033067789, + "flos": 18440954426880.0, + "grad_norm": 1.490575561372924, + "language_loss": 0.75263643, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.82955098, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12054443, + "step": 9298, + "time_per_iteration": 2.5208308696746826 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01252986, + "balance_loss_clip": 0.06249566, + "balance_loss_mlp": 0.01251184, + "epoch": 0.5590861265594469, + "flos": 70597673729280.0, + "grad_norm": 0.6945904868111653, + "language_loss": 0.52248931, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.59813559, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.62158203, + "router_z_loss_mlp": 0.01797485, + "step": 9299, + "time_per_iteration": 4.702880144119263 + }, + { + "auxiliary_loss_clip": 0.06418106, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06276019, + "balance_loss_mlp": 0.01256905, + "epoch": 0.5591462498121148, + "flos": 30672274475520.0, + "grad_norm": 1.7758709427362191, + "language_loss": 0.68987107, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.76672685, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10571289, + "step": 9300, + "time_per_iteration": 2.6169886589050293 + }, + { + "auxiliary_loss_clip": 0.06428309, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.06278549, + "balance_loss_mlp": 0.01254727, + "epoch": 0.5592063730647828, + "flos": 18156319706880.0, + "grad_norm": 3.029569475440017, + "language_loss": 0.81908011, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.89602816, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11761475, + "step": 9301, + "time_per_iteration": 2.4880383014678955 + }, + { + "auxiliary_loss_clip": 0.06421784, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06274376, + "balance_loss_mlp": 0.01256101, + "epoch": 0.5592664963174507, + "flos": 24067148025600.0, + "grad_norm": 2.0495431587104216, + "language_loss": 0.67981839, + "learning_rate": 1.714143795138756e-06, + "loss": 0.75671041, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11315918, + "step": 9302, + "time_per_iteration": 2.5440263748168945 + }, + { + "auxiliary_loss_clip": 0.06427488, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0627801, + "balance_loss_mlp": 0.01254121, + "epoch": 0.5593266195701188, + "flos": 19833911101440.0, + "grad_norm": 1.543967288464222, + "language_loss": 0.70932961, + "learning_rate": 1.713758337453878e-06, + "loss": 0.78626627, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12042236, + "step": 9303, + "time_per_iteration": 2.52182674407959 + }, + { + "auxiliary_loss_clip": 0.06417537, + "auxiliary_loss_mlp": 0.01265621, + "balance_loss_clip": 0.06276484, + "balance_loss_mlp": 0.01255453, + "epoch": 0.5593867428227867, + "flos": 25307682923520.0, + "grad_norm": 1.5891501411536748, + "language_loss": 0.73189592, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.8087275, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10168457, + "step": 9304, + "time_per_iteration": 3.999878406524658 + }, + { + "auxiliary_loss_clip": 0.06421353, + "auxiliary_loss_mlp": 0.01266821, + "balance_loss_clip": 0.06276563, + "balance_loss_mlp": 0.01255693, + "epoch": 0.5594468660754547, + "flos": 12938028583680.0, + "grad_norm": 2.1417504305353563, + "language_loss": 0.78262866, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.85951042, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11132812, + "step": 9305, + "time_per_iteration": 2.5058751106262207 + }, + { + "auxiliary_loss_clip": 0.06419225, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06278518, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5595069893281227, + "flos": 19068768743040.0, + "grad_norm": 1.6214418695958237, + "language_loss": 0.69748855, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7743212, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10705566, + "step": 9306, + "time_per_iteration": 2.5216495990753174 + }, + { + "auxiliary_loss_clip": 0.06329086, + "auxiliary_loss_mlp": 0.01251264, + "balance_loss_clip": 0.06266434, + "balance_loss_mlp": 0.01249626, + "epoch": 0.5595671125807906, + "flos": 70291530437760.0, + "grad_norm": 0.8883282828550626, + "language_loss": 0.60321748, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.679021, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.0164032, + "step": 9307, + "time_per_iteration": 3.2440812587738037 + }, + { + "auxiliary_loss_clip": 0.06421244, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06278248, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5596272358334586, + "flos": 20671407060480.0, + "grad_norm": 1.5654652346016935, + "language_loss": 0.7418704, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.81875765, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10467529, + "step": 9308, + "time_per_iteration": 2.527722120285034 + }, + { + "auxiliary_loss_clip": 0.06423165, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06275736, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5596873590861265, + "flos": 25047170979840.0, + "grad_norm": 1.7977154981427412, + "language_loss": 0.70390081, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.78078693, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12072754, + "step": 9309, + "time_per_iteration": 2.5592753887176514 + }, + { + "auxiliary_loss_clip": 0.06425751, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06278521, + "balance_loss_mlp": 0.01255889, + "epoch": 0.5597474823387946, + "flos": 25965573655680.0, + "grad_norm": 1.826608872454741, + "language_loss": 0.7546587, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.83160329, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12835693, + "step": 9310, + "time_per_iteration": 2.5775809288024902 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.01266019, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125343, + "epoch": 0.5598076055914625, + "flos": 26184688882560.0, + "grad_norm": 2.287225356977705, + "language_loss": 0.70149207, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.77844125, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12579346, + "step": 9311, + "time_per_iteration": 3.9833383560180664 + }, + { + "auxiliary_loss_clip": 0.06422099, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06277782, + "balance_loss_mlp": 0.01254061, + "epoch": 0.5598677288441305, + "flos": 11660541235200.0, + "grad_norm": 2.2749325214124605, + "language_loss": 0.72917002, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.80604798, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11645508, + "step": 9312, + "time_per_iteration": 2.5323050022125244 + }, + { + "auxiliary_loss_clip": 0.06420854, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06276432, + "balance_loss_mlp": 0.01255772, + "epoch": 0.5599278520967984, + "flos": 22973290899840.0, + "grad_norm": 1.8427769518341257, + "language_loss": 0.89498973, + "learning_rate": 1.709904360003822e-06, + "loss": 0.97187102, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1151123, + "step": 9313, + "time_per_iteration": 2.5141191482543945 + }, + { + "auxiliary_loss_clip": 0.06423395, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5599879753494664, + "flos": 21222004239360.0, + "grad_norm": 1.3323867384007686, + "language_loss": 0.7802453, + "learning_rate": 1.709519022520204e-06, + "loss": 0.85715961, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11242676, + "step": 9314, + "time_per_iteration": 2.587451934814453 + }, + { + "auxiliary_loss_clip": 0.06420899, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06276683, + "balance_loss_mlp": 0.01254109, + "epoch": 0.5600480986021343, + "flos": 31911006510720.0, + "grad_norm": 1.5829567025911722, + "language_loss": 0.70587456, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.78273547, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11083984, + "step": 9315, + "time_per_iteration": 2.585667371749878 + }, + { + "auxiliary_loss_clip": 0.06425041, + "auxiliary_loss_mlp": 0.01268206, + "balance_loss_clip": 0.06275864, + "balance_loss_mlp": 0.01256571, + "epoch": 0.5601082218548024, + "flos": 28483679756160.0, + "grad_norm": 1.7585144874491871, + "language_loss": 0.67066777, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.7476002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11645508, + "step": 9316, + "time_per_iteration": 2.5536792278289795 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.06276462, + "balance_loss_mlp": 0.01253324, + "epoch": 0.5601683451074703, + "flos": 24103974695040.0, + "grad_norm": 1.9270955506174936, + "language_loss": 0.87415564, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.95101541, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11236572, + "step": 9317, + "time_per_iteration": 2.6297550201416016 + }, + { + "auxiliary_loss_clip": 0.06425779, + "auxiliary_loss_mlp": 0.01267741, + "balance_loss_clip": 0.06277692, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5602284683601383, + "flos": 26362868590080.0, + "grad_norm": 1.81541721599753, + "language_loss": 0.77282947, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.84976465, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1260376, + "step": 9318, + "time_per_iteration": 2.558359146118164 + }, + { + "auxiliary_loss_clip": 0.06418364, + "auxiliary_loss_mlp": 0.01266654, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5602885916128063, + "flos": 24502904784000.0, + "grad_norm": 1.570238706906967, + "language_loss": 0.76465648, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.84150666, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10357666, + "step": 9319, + "time_per_iteration": 2.526543617248535 + }, + { + "auxiliary_loss_clip": 0.06418289, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06273629, + "balance_loss_mlp": 0.01253427, + "epoch": 0.5603487148654742, + "flos": 27352450909440.0, + "grad_norm": 1.3333617188310043, + "language_loss": 0.85846102, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.93529117, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11309814, + "step": 9320, + "time_per_iteration": 2.5673651695251465 + }, + { + "auxiliary_loss_clip": 0.06334086, + "auxiliary_loss_mlp": 0.01252081, + "balance_loss_clip": 0.06272272, + "balance_loss_mlp": 0.01250187, + "epoch": 0.5604088381181422, + "flos": 54105555962880.0, + "grad_norm": 0.7541324814402665, + "language_loss": 0.52607638, + "learning_rate": 1.706821969374996e-06, + "loss": 0.60193801, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01890564, + "step": 9321, + "time_per_iteration": 2.977881908416748 + }, + { + "auxiliary_loss_clip": 0.06418586, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06276635, + "balance_loss_mlp": 0.01254208, + "epoch": 0.5604689613708101, + "flos": 22242878858880.0, + "grad_norm": 1.3667787345793438, + "language_loss": 0.7480129, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.82485151, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1105957, + "step": 9322, + "time_per_iteration": 2.532274007797241 + }, + { + "auxiliary_loss_clip": 0.06422681, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06276275, + "balance_loss_mlp": 0.01258842, + "epoch": 0.5605290846234782, + "flos": 35306370132480.0, + "grad_norm": 1.7253794934771503, + "language_loss": 0.73680359, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.81374425, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12542725, + "step": 9323, + "time_per_iteration": 2.6399970054626465 + }, + { + "auxiliary_loss_clip": 0.06425279, + "auxiliary_loss_mlp": 0.01266665, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254505, + "epoch": 0.5605892078761461, + "flos": 20268997027200.0, + "grad_norm": 1.5398366577575928, + "language_loss": 0.62584162, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.70276111, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12164307, + "step": 9324, + "time_per_iteration": 2.5179386138916016 + }, + { + "auxiliary_loss_clip": 0.06420085, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01255055, + "epoch": 0.5606493311288141, + "flos": 17313582867840.0, + "grad_norm": 2.467078298144656, + "language_loss": 0.88032669, + "learning_rate": 1.705281040409226e-06, + "loss": 0.95720887, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.13092041, + "step": 9325, + "time_per_iteration": 2.5009984970092773 + }, + { + "auxiliary_loss_clip": 0.06425651, + "auxiliary_loss_mlp": 0.01271739, + "balance_loss_clip": 0.0627806, + "balance_loss_mlp": 0.01259454, + "epoch": 0.560709454381482, + "flos": 21659438079360.0, + "grad_norm": 1.5802994463075606, + "language_loss": 0.74048662, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.81746054, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1229248, + "step": 9326, + "time_per_iteration": 2.53534197807312 + }, + { + "auxiliary_loss_clip": 0.06427591, + "auxiliary_loss_mlp": 0.0127498, + "balance_loss_clip": 0.06276761, + "balance_loss_mlp": 0.01262648, + "epoch": 0.56076957763415, + "flos": 20309639057280.0, + "grad_norm": 1.7151684776487535, + "language_loss": 0.79090071, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.86792642, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12329102, + "step": 9327, + "time_per_iteration": 2.505734920501709 + }, + { + "auxiliary_loss_clip": 0.06422938, + "auxiliary_loss_mlp": 0.01268373, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5608297008868179, + "flos": 25052873057280.0, + "grad_norm": 1.3540928387883675, + "language_loss": 0.7848016, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.86171472, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.12176514, + "step": 9328, + "time_per_iteration": 2.5479724407196045 + }, + { + "auxiliary_loss_clip": 0.06421052, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06277333, + "balance_loss_mlp": 0.01255023, + "epoch": 0.560889824139486, + "flos": 19873253393280.0, + "grad_norm": 1.4144017329991472, + "language_loss": 0.7383225, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.8151924, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10913086, + "step": 9329, + "time_per_iteration": 2.665193796157837 + }, + { + "auxiliary_loss_clip": 0.06430677, + "auxiliary_loss_mlp": 0.01265446, + "balance_loss_clip": 0.06278004, + "balance_loss_mlp": 0.01253269, + "epoch": 0.5609499473921539, + "flos": 22935961105920.0, + "grad_norm": 1.4811079467360542, + "language_loss": 0.83903289, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.91599417, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12176514, + "step": 9330, + "time_per_iteration": 2.574812650680542 + }, + { + "auxiliary_loss_clip": 0.06343255, + "auxiliary_loss_mlp": 0.01254504, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01252853, + "epoch": 0.5610100706448219, + "flos": 53054479146240.0, + "grad_norm": 0.7010589280292991, + "language_loss": 0.57785869, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.65383625, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01654053, + "step": 9331, + "time_per_iteration": 3.16204833984375 + }, + { + "auxiliary_loss_clip": 0.06429492, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06280065, + "balance_loss_mlp": 0.01254723, + "epoch": 0.5610701938974898, + "flos": 21841349293440.0, + "grad_norm": 1.62115536838187, + "language_loss": 0.81915009, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.89610904, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11682129, + "step": 9332, + "time_per_iteration": 2.503162145614624 + }, + { + "auxiliary_loss_clip": 0.06436246, + "auxiliary_loss_mlp": 0.01268376, + "balance_loss_clip": 0.06285603, + "balance_loss_mlp": 0.01255936, + "epoch": 0.5611303171501578, + "flos": 17462943970560.0, + "grad_norm": 2.4447262023658314, + "language_loss": 0.8238855, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.90093172, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12451172, + "step": 9333, + "time_per_iteration": 2.5434911251068115 + }, + { + "auxiliary_loss_clip": 0.06429712, + "auxiliary_loss_mlp": 0.01266007, + "balance_loss_clip": 0.062811, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5611904404028258, + "flos": 22644366497280.0, + "grad_norm": 1.7517485290647843, + "language_loss": 0.73036361, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.80732077, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11157227, + "step": 9334, + "time_per_iteration": 2.5099892616271973 + }, + { + "auxiliary_loss_clip": 0.06427494, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281948, + "balance_loss_mlp": 0.01259771, + "epoch": 0.5612505636554938, + "flos": 14321048549760.0, + "grad_norm": 1.6258746678295788, + "language_loss": 0.71251893, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.7895056, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11395264, + "step": 9335, + "time_per_iteration": 3.8910462856292725 + }, + { + "auxiliary_loss_clip": 0.06430685, + "auxiliary_loss_mlp": 0.01266094, + "balance_loss_clip": 0.06283418, + "balance_loss_mlp": 0.01254149, + "epoch": 0.5613106869081618, + "flos": 16513835973120.0, + "grad_norm": 1.6562270786725333, + "language_loss": 0.7703501, + "learning_rate": 1.701044410566205e-06, + "loss": 0.84731793, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11932373, + "step": 9336, + "time_per_iteration": 2.5473687648773193 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06282386, + "balance_loss_mlp": 0.0125489, + "epoch": 0.5613708101608297, + "flos": 24065009746560.0, + "grad_norm": 2.1630350478443625, + "language_loss": 0.64571506, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.72262907, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10766602, + "step": 9337, + "time_per_iteration": 2.5193097591400146 + }, + { + "auxiliary_loss_clip": 0.06341661, + "auxiliary_loss_mlp": 0.01252845, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01251057, + "epoch": 0.5614309334134977, + "flos": 64922284984320.0, + "grad_norm": 0.883081868959654, + "language_loss": 0.62614578, + "learning_rate": 1.700274261035102e-06, + "loss": 0.7020908, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.62353516, + "router_z_loss_mlp": 0.01785278, + "step": 9338, + "time_per_iteration": 3.115088939666748 + }, + { + "auxiliary_loss_clip": 0.06430536, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5614910566661656, + "flos": 32926975666560.0, + "grad_norm": 1.7643724476932883, + "language_loss": 0.66069186, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.73765635, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11474609, + "step": 9339, + "time_per_iteration": 4.156280040740967 + }, + { + "auxiliary_loss_clip": 0.06427112, + "auxiliary_loss_mlp": 0.01266835, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01254055, + "epoch": 0.5615511799188336, + "flos": 18594927504000.0, + "grad_norm": 1.6693116386089952, + "language_loss": 0.69893128, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.77587074, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.12774658, + "step": 9340, + "time_per_iteration": 2.4951670169830322 + }, + { + "auxiliary_loss_clip": 0.06425936, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06286716, + "balance_loss_mlp": 0.01257168, + "epoch": 0.5616113031715015, + "flos": 22826571200640.0, + "grad_norm": 1.554264314492227, + "language_loss": 0.77897537, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.85592192, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 9341, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06432091, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06284195, + "balance_loss_mlp": 0.01256776, + "epoch": 0.5616714264241696, + "flos": 22352184910080.0, + "grad_norm": 1.797407374183417, + "language_loss": 0.80132401, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.87833536, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12261963, + "step": 9342, + "time_per_iteration": 2.5441479682922363 + }, + { + "auxiliary_loss_clip": 0.06439396, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06290646, + "balance_loss_mlp": 0.01257325, + "epoch": 0.5617315496768375, + "flos": 18813875022720.0, + "grad_norm": 2.3951377685236346, + "language_loss": 0.75757158, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.83465594, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1171875, + "step": 9343, + "time_per_iteration": 2.552783489227295 + }, + { + "auxiliary_loss_clip": 0.06435137, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06290908, + "balance_loss_mlp": 0.0125656, + "epoch": 0.5617916729295055, + "flos": 18375225298560.0, + "grad_norm": 1.7365132961619254, + "language_loss": 0.69429743, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.77133292, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11853027, + "step": 9344, + "time_per_iteration": 3.940319061279297 + }, + { + "auxiliary_loss_clip": 0.06436205, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06290596, + "balance_loss_mlp": 0.01254048, + "epoch": 0.5618517961821734, + "flos": 28186844267520.0, + "grad_norm": 2.084209166838754, + "language_loss": 0.66667032, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.74368846, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11560059, + "step": 9345, + "time_per_iteration": 2.5695786476135254 + }, + { + "auxiliary_loss_clip": 0.06434141, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.01257683, + "epoch": 0.5619119194348414, + "flos": 15492290520960.0, + "grad_norm": 1.7418235878832828, + "language_loss": 0.88078266, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.9578141, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11328125, + "step": 9346, + "time_per_iteration": 2.470212697982788 + }, + { + "auxiliary_loss_clip": 0.06433322, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06289656, + "balance_loss_mlp": 0.01261257, + "epoch": 0.5619720426875094, + "flos": 29135700702720.0, + "grad_norm": 2.0124429779516335, + "language_loss": 0.5980221, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.67508924, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.12139893, + "step": 9347, + "time_per_iteration": 2.5825982093811035 + }, + { + "auxiliary_loss_clip": 0.06435403, + "auxiliary_loss_mlp": 0.01270938, + "balance_loss_clip": 0.06288013, + "balance_loss_mlp": 0.01258349, + "epoch": 0.5620321659401774, + "flos": 18009474226560.0, + "grad_norm": 2.2126455504112066, + "language_loss": 0.69822383, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.77528727, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12609863, + "step": 9348, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.0644159, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06289469, + "balance_loss_mlp": 0.01254037, + "epoch": 0.5620922891928454, + "flos": 20600730541440.0, + "grad_norm": 3.445873194626742, + "language_loss": 0.79441649, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.87149316, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12036133, + "step": 9349, + "time_per_iteration": 2.5519816875457764 + }, + { + "auxiliary_loss_clip": 0.06431362, + "auxiliary_loss_mlp": 0.01269513, + "balance_loss_clip": 0.06285249, + "balance_loss_mlp": 0.01257014, + "epoch": 0.5621524124455133, + "flos": 26294288423040.0, + "grad_norm": 2.015932955485816, + "language_loss": 0.67743355, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.75444239, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.12493896, + "step": 9350, + "time_per_iteration": 4.01330304145813 + }, + { + "auxiliary_loss_clip": 0.06434298, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5622125356981813, + "flos": 12755236901760.0, + "grad_norm": 2.011118504157059, + "language_loss": 0.78970456, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.86672854, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11834717, + "step": 9351, + "time_per_iteration": 2.502434015274048 + }, + { + "auxiliary_loss_clip": 0.06430681, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01252894, + "epoch": 0.5622726589508492, + "flos": 23812086597120.0, + "grad_norm": 1.4860121982116354, + "language_loss": 0.59339732, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.67035985, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 9352, + "time_per_iteration": 2.5574684143066406 + }, + { + "auxiliary_loss_clip": 0.06420172, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.062802, + "balance_loss_mlp": 0.01255041, + "epoch": 0.5623327822035172, + "flos": 24725248392960.0, + "grad_norm": 2.450009031651053, + "language_loss": 0.72177416, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.7986325, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 9353, + "time_per_iteration": 2.5429112911224365 + }, + { + "auxiliary_loss_clip": 0.06427602, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06280185, + "balance_loss_mlp": 0.01255207, + "epoch": 0.5623929054561851, + "flos": 14023081031040.0, + "grad_norm": 3.091375667054191, + "language_loss": 0.7687071, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.84564734, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 9354, + "time_per_iteration": 2.511843204498291 + }, + { + "auxiliary_loss_clip": 0.0643307, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.0628096, + "balance_loss_mlp": 0.01256672, + "epoch": 0.5624530287088532, + "flos": 20710707425280.0, + "grad_norm": 1.9243574999426976, + "language_loss": 0.72663665, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.80364901, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1149292, + "step": 9355, + "time_per_iteration": 2.5472323894500732 + }, + { + "auxiliary_loss_clip": 0.06422609, + "auxiliary_loss_mlp": 0.01264166, + "balance_loss_clip": 0.06276853, + "balance_loss_mlp": 0.01252638, + "epoch": 0.5625131519615211, + "flos": 21477401084160.0, + "grad_norm": 1.4661709593952188, + "language_loss": 0.73949313, + "learning_rate": 1.693344975084274e-06, + "loss": 0.81636083, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11535645, + "step": 9356, + "time_per_iteration": 2.5417375564575195 + }, + { + "auxiliary_loss_clip": 0.06421204, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.0627971, + "balance_loss_mlp": 0.01254043, + "epoch": 0.5625732752141891, + "flos": 18704023920000.0, + "grad_norm": 1.8811670281572186, + "language_loss": 0.83384252, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.9107098, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11480713, + "step": 9357, + "time_per_iteration": 2.4678521156311035 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06279635, + "balance_loss_mlp": 0.01255705, + "epoch": 0.562633398466857, + "flos": 16222492926720.0, + "grad_norm": 2.0645024289256293, + "language_loss": 0.7263062, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.80322981, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1060791, + "step": 9358, + "time_per_iteration": 2.5186126232147217 + }, + { + "auxiliary_loss_clip": 0.06416523, + "auxiliary_loss_mlp": 0.01266054, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.0125408, + "epoch": 0.562693521719525, + "flos": 22498485338880.0, + "grad_norm": 1.808809546066597, + "language_loss": 0.78313565, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.85996139, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11981201, + "step": 9359, + "time_per_iteration": 2.4950146675109863 + }, + { + "auxiliary_loss_clip": 0.06422278, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01254123, + "epoch": 0.562753644972193, + "flos": 25337088506880.0, + "grad_norm": 1.6393117198147682, + "language_loss": 0.70198202, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.77886516, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11920166, + "step": 9360, + "time_per_iteration": 2.5677337646484375 + }, + { + "auxiliary_loss_clip": 0.06333196, + "auxiliary_loss_mlp": 0.01259618, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01258209, + "epoch": 0.562813768224861, + "flos": 67410566231040.0, + "grad_norm": 0.7608015706194778, + "language_loss": 0.55599511, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.63192326, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.0140686, + "step": 9361, + "time_per_iteration": 3.047746419906616 + }, + { + "auxiliary_loss_clip": 0.06421309, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06278641, + "balance_loss_mlp": 0.01260271, + "epoch": 0.562873891477529, + "flos": 23337868014720.0, + "grad_norm": 1.4415772957289732, + "language_loss": 0.82031697, + "learning_rate": 1.691036046141018e-06, + "loss": 0.89723963, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10687256, + "step": 9362, + "time_per_iteration": 2.5085341930389404 + }, + { + "auxiliary_loss_clip": 0.06425183, + "auxiliary_loss_mlp": 0.01265052, + "balance_loss_clip": 0.06282046, + "balance_loss_mlp": 0.01254067, + "epoch": 0.5629340147301969, + "flos": 38482073475840.0, + "grad_norm": 1.5514506959778531, + "language_loss": 0.74991751, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.8268199, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10992432, + "step": 9363, + "time_per_iteration": 2.6483652591705322 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01257573, + "epoch": 0.5629941379828649, + "flos": 29249744509440.0, + "grad_norm": 1.527132274705304, + "language_loss": 0.82966727, + "learning_rate": 1.690266496731839e-06, + "loss": 0.90664279, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11962891, + "step": 9364, + "time_per_iteration": 2.585028648376465 + }, + { + "auxiliary_loss_clip": 0.06420554, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.01253207, + "epoch": 0.5630542612355328, + "flos": 19425882844800.0, + "grad_norm": 1.9441356766600106, + "language_loss": 0.65449685, + "learning_rate": 1.689881739637642e-06, + "loss": 0.7313447, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11022949, + "step": 9365, + "time_per_iteration": 2.5320210456848145 + }, + { + "auxiliary_loss_clip": 0.06432588, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06279749, + "balance_loss_mlp": 0.0125841, + "epoch": 0.5631143844882008, + "flos": 22271697463680.0, + "grad_norm": 2.4081978900655114, + "language_loss": 0.81779563, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.89482784, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12213135, + "step": 9366, + "time_per_iteration": 2.5602293014526367 + }, + { + "auxiliary_loss_clip": 0.06419416, + "auxiliary_loss_mlp": 0.01263434, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01253033, + "epoch": 0.5631745077408687, + "flos": 22971781526400.0, + "grad_norm": 1.4555155937951827, + "language_loss": 0.73903221, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.81586075, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10400391, + "step": 9367, + "time_per_iteration": 2.5222184658050537 + }, + { + "auxiliary_loss_clip": 0.0633425, + "auxiliary_loss_mlp": 0.01256933, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01255295, + "epoch": 0.5632346309935368, + "flos": 65101917409920.0, + "grad_norm": 0.6175920076853201, + "language_loss": 0.5334087, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.60932058, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.61669922, + "router_z_loss_mlp": 0.0164032, + "step": 9368, + "time_per_iteration": 3.3093104362487793 + }, + { + "auxiliary_loss_clip": 0.06421301, + "auxiliary_loss_mlp": 0.0127307, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01261757, + "epoch": 0.5632947542462047, + "flos": 23009572517760.0, + "grad_norm": 1.6075197920052449, + "language_loss": 0.69183493, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.76877862, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11315918, + "step": 9369, + "time_per_iteration": 2.5406625270843506 + }, + { + "auxiliary_loss_clip": 0.06420332, + "auxiliary_loss_mlp": 0.01269293, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01258105, + "epoch": 0.5633548774988727, + "flos": 30490530969600.0, + "grad_norm": 1.6779781841725052, + "language_loss": 0.76048809, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.83738434, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11175537, + "step": 9370, + "time_per_iteration": 2.591212272644043 + }, + { + "auxiliary_loss_clip": 0.06424968, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5634150007515406, + "flos": 18520938748800.0, + "grad_norm": 1.8374331787518619, + "language_loss": 0.76029092, + "learning_rate": 1.687573444537108e-06, + "loss": 0.83720207, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.12237549, + "step": 9371, + "time_per_iteration": 2.5327818393707275 + }, + { + "auxiliary_loss_clip": 0.06421979, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06277596, + "balance_loss_mlp": 0.01256739, + "epoch": 0.5634751240042086, + "flos": 19250679957120.0, + "grad_norm": 1.7360135917661768, + "language_loss": 0.762514, + "learning_rate": 1.687188770067285e-06, + "loss": 0.83941567, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11456299, + "step": 9372, + "time_per_iteration": 2.519404411315918 + }, + { + "auxiliary_loss_clip": 0.06422761, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255016, + "epoch": 0.5635352472568766, + "flos": 12025453766400.0, + "grad_norm": 1.884768041604824, + "language_loss": 0.71853095, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.79542208, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11334229, + "step": 9373, + "time_per_iteration": 2.5053837299346924 + }, + { + "auxiliary_loss_clip": 0.06422034, + "auxiliary_loss_mlp": 0.01268801, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01256367, + "epoch": 0.5635953705095446, + "flos": 21878092108800.0, + "grad_norm": 1.841933865019323, + "language_loss": 0.83263683, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.90954518, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12438965, + "step": 9374, + "time_per_iteration": 3.904900074005127 + }, + { + "auxiliary_loss_clip": 0.06420377, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06277412, + "balance_loss_mlp": 0.01256131, + "epoch": 0.5636554937622126, + "flos": 27133587244800.0, + "grad_norm": 2.5670866003984583, + "language_loss": 0.66696084, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.74383336, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10754395, + "step": 9375, + "time_per_iteration": 2.581921339035034 + }, + { + "auxiliary_loss_clip": 0.06426428, + "auxiliary_loss_mlp": 0.01265809, + "balance_loss_clip": 0.06279501, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5637156170148805, + "flos": 12930314008320.0, + "grad_norm": 12.279905367602915, + "language_loss": 0.81403673, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.89095908, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11474609, + "step": 9376, + "time_per_iteration": 2.5271008014678955 + }, + { + "auxiliary_loss_clip": 0.06430367, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06280433, + "balance_loss_mlp": 0.01253974, + "epoch": 0.5637757402675485, + "flos": 45561460435200.0, + "grad_norm": 1.3765625381603785, + "language_loss": 0.69569075, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.77264911, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1149292, + "step": 9377, + "time_per_iteration": 2.7878713607788086 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06278635, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5638358635202164, + "flos": 20892241296000.0, + "grad_norm": 1.4815499035204616, + "language_loss": 0.75006419, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.82690734, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10668945, + "step": 9378, + "time_per_iteration": 2.5742552280426025 + }, + { + "auxiliary_loss_clip": 0.06432593, + "auxiliary_loss_mlp": 0.01271419, + "balance_loss_clip": 0.06279133, + "balance_loss_mlp": 0.01258837, + "epoch": 0.5638959867728844, + "flos": 18812449503360.0, + "grad_norm": 2.3058329321149555, + "language_loss": 0.81874716, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.8957873, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12585449, + "step": 9379, + "time_per_iteration": 3.9022350311279297 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.01256933, + "epoch": 0.5639561100255523, + "flos": 27497703162240.0, + "grad_norm": 1.9515300720121755, + "language_loss": 0.71783185, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.79480064, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11859131, + "step": 9380, + "time_per_iteration": 2.6338086128234863 + }, + { + "auxiliary_loss_clip": 0.0642691, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06279925, + "balance_loss_mlp": 0.01254857, + "epoch": 0.5640162332782204, + "flos": 18082289024640.0, + "grad_norm": 2.0751114915079687, + "language_loss": 0.75207865, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.82901412, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11779785, + "step": 9381, + "time_per_iteration": 2.4637959003448486 + }, + { + "auxiliary_loss_clip": 0.06430316, + "auxiliary_loss_mlp": 0.01273879, + "balance_loss_clip": 0.06282466, + "balance_loss_mlp": 0.01262822, + "epoch": 0.5640763565308883, + "flos": 20890857703680.0, + "grad_norm": 2.2840815632275846, + "language_loss": 0.72823429, + "learning_rate": 1.683342680176499e-06, + "loss": 0.80527627, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 9382, + "time_per_iteration": 2.6038217544555664 + }, + { + "auxiliary_loss_clip": 0.0632898, + "auxiliary_loss_mlp": 0.01252773, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01251134, + "epoch": 0.5641364797835563, + "flos": 64467143205120.0, + "grad_norm": 0.7593633930380659, + "language_loss": 0.54457784, + "learning_rate": 1.682958136989022e-06, + "loss": 0.62039542, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01641846, + "step": 9383, + "time_per_iteration": 4.702574253082275 + }, + { + "auxiliary_loss_clip": 0.06430694, + "auxiliary_loss_mlp": 0.01271925, + "balance_loss_clip": 0.06278884, + "balance_loss_mlp": 0.01260129, + "epoch": 0.5641966030362242, + "flos": 18666861834240.0, + "grad_norm": 1.6723183303987958, + "language_loss": 0.71441197, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.79143822, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11798096, + "step": 9384, + "time_per_iteration": 2.4753105640411377 + }, + { + "auxiliary_loss_clip": 0.06421386, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.0627472, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5642567262888922, + "flos": 22498946536320.0, + "grad_norm": 1.9187169203117838, + "language_loss": 0.76415217, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.84103185, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1194458, + "step": 9385, + "time_per_iteration": 2.5245208740234375 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01255028, + "epoch": 0.5643168495415603, + "flos": 13008663175680.0, + "grad_norm": 1.914249541829808, + "language_loss": 0.82386243, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.90069962, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10748291, + "step": 9386, + "time_per_iteration": 2.4669172763824463 + }, + { + "auxiliary_loss_clip": 0.06427868, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277144, + "balance_loss_mlp": 0.01255014, + "epoch": 0.5643769727942282, + "flos": 18594256671360.0, + "grad_norm": 1.9656567849197715, + "language_loss": 0.70471108, + "learning_rate": 1.681420084607516e-06, + "loss": 0.78165275, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.112854, + "step": 9387, + "time_per_iteration": 2.5076122283935547 + }, + { + "auxiliary_loss_clip": 0.0642679, + "auxiliary_loss_mlp": 0.01267525, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01255348, + "epoch": 0.5644370960468962, + "flos": 33815343853440.0, + "grad_norm": 1.4623673546412521, + "language_loss": 0.75064629, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.82758939, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.12176514, + "step": 9388, + "time_per_iteration": 2.651616096496582 + }, + { + "auxiliary_loss_clip": 0.06417996, + "auxiliary_loss_mlp": 0.01267245, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256892, + "epoch": 0.5644972192995641, + "flos": 21221249552640.0, + "grad_norm": 1.4874039445981817, + "language_loss": 0.82212514, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.89897752, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10357666, + "step": 9389, + "time_per_iteration": 2.5609359741210938 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01270818, + "balance_loss_clip": 0.06278206, + "balance_loss_mlp": 0.01258468, + "epoch": 0.5645573425522321, + "flos": 18593585838720.0, + "grad_norm": 2.1560569688057036, + "language_loss": 0.64486635, + "learning_rate": 1.680266672116467e-06, + "loss": 0.72183776, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12329102, + "step": 9390, + "time_per_iteration": 3.8905534744262695 + }, + { + "auxiliary_loss_clip": 0.06417844, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5646174658049, + "flos": 18119660745600.0, + "grad_norm": 1.743379462466535, + "language_loss": 0.92393249, + "learning_rate": 1.6798822255153192e-06, + "loss": 1.00077093, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10662842, + "step": 9391, + "time_per_iteration": 2.4846012592315674 + }, + { + "auxiliary_loss_clip": 0.06426747, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06274952, + "balance_loss_mlp": 0.0125751, + "epoch": 0.564677589057568, + "flos": 28337547035520.0, + "grad_norm": 2.079245602273352, + "language_loss": 0.60616773, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.68313313, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12286377, + "step": 9392, + "time_per_iteration": 2.5709118843078613 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01266956, + "balance_loss_clip": 0.06274032, + "balance_loss_mlp": 0.01255619, + "epoch": 0.564737712310236, + "flos": 22170273696000.0, + "grad_norm": 2.32400153493691, + "language_loss": 0.81762815, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8944844, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11334229, + "step": 9393, + "time_per_iteration": 2.49820613861084 + }, + { + "auxiliary_loss_clip": 0.06420048, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01257434, + "epoch": 0.564797835562904, + "flos": 20965223802240.0, + "grad_norm": 1.8189771095125196, + "language_loss": 0.87738705, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.95427704, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11523438, + "step": 9394, + "time_per_iteration": 2.5385193824768066 + }, + { + "auxiliary_loss_clip": 0.06421189, + "auxiliary_loss_mlp": 0.01271733, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01261135, + "epoch": 0.5648579588155719, + "flos": 17425991520000.0, + "grad_norm": 1.7000053900358165, + "language_loss": 0.84579873, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.92272794, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1060791, + "step": 9395, + "time_per_iteration": 2.470017433166504 + }, + { + "auxiliary_loss_clip": 0.06326792, + "auxiliary_loss_mlp": 0.01253109, + "balance_loss_clip": 0.06265698, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5649180820682399, + "flos": 69951187152000.0, + "grad_norm": 0.7657809500788333, + "language_loss": 0.57918489, + "learning_rate": 1.677960174884597e-06, + "loss": 0.65498388, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.61132812, + "router_z_loss_mlp": 0.01573944, + "step": 9396, + "time_per_iteration": 3.1468727588653564 + }, + { + "auxiliary_loss_clip": 0.06423569, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5649782053209078, + "flos": 24980058259200.0, + "grad_norm": 1.9294071175656426, + "language_loss": 0.70135093, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.77826023, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11157227, + "step": 9397, + "time_per_iteration": 2.5551769733428955 + }, + { + "auxiliary_loss_clip": 0.06421924, + "auxiliary_loss_mlp": 0.01267113, + "balance_loss_clip": 0.06274733, + "balance_loss_mlp": 0.01256277, + "epoch": 0.5650383285735758, + "flos": 21733175272320.0, + "grad_norm": 3.1535749018048094, + "language_loss": 0.67165595, + "learning_rate": 1.67719144001275e-06, + "loss": 0.74854636, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10839844, + "step": 9398, + "time_per_iteration": 2.5690701007843018 + }, + { + "auxiliary_loss_clip": 0.06324084, + "auxiliary_loss_mlp": 0.01251867, + "balance_loss_clip": 0.06263297, + "balance_loss_mlp": 0.01250375, + "epoch": 0.5650984518262439, + "flos": 65923481093760.0, + "grad_norm": 0.7518933539640298, + "language_loss": 0.58143103, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.65719062, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01491547, + "step": 9399, + "time_per_iteration": 3.073493719100952 + }, + { + "auxiliary_loss_clip": 0.06425194, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06275368, + "balance_loss_mlp": 0.01257158, + "epoch": 0.5651585750789118, + "flos": 21038919068160.0, + "grad_norm": 2.9284187471842213, + "language_loss": 0.73483676, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.8117801, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.11987305, + "step": 9400, + "time_per_iteration": 2.5129287242889404 + }, + { + "auxiliary_loss_clip": 0.06431332, + "auxiliary_loss_mlp": 0.01270587, + "balance_loss_clip": 0.06281202, + "balance_loss_mlp": 0.01258267, + "epoch": 0.5652186983315798, + "flos": 18557891199360.0, + "grad_norm": 1.781312568353633, + "language_loss": 0.61062682, + "learning_rate": 1.676038429548412e-06, + "loss": 0.68764603, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12322998, + "step": 9401, + "time_per_iteration": 2.484562397003174 + }, + { + "auxiliary_loss_clip": 0.06419288, + "auxiliary_loss_mlp": 0.01272594, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01261859, + "epoch": 0.5652788215842477, + "flos": 18484573276800.0, + "grad_norm": 1.8682667341725439, + "language_loss": 0.81175613, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.88867497, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10736084, + "step": 9402, + "time_per_iteration": 2.5402467250823975 + }, + { + "auxiliary_loss_clip": 0.0641814, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.0125898, + "epoch": 0.5653389448369157, + "flos": 30051797391360.0, + "grad_norm": 1.3435358668606565, + "language_loss": 0.77710259, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.85398287, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10900879, + "step": 9403, + "time_per_iteration": 2.5728204250335693 + }, + { + "auxiliary_loss_clip": 0.06421928, + "auxiliary_loss_mlp": 0.01268633, + "balance_loss_clip": 0.06276687, + "balance_loss_mlp": 0.01257458, + "epoch": 0.5653990680895836, + "flos": 16733202762240.0, + "grad_norm": 1.6255859835861872, + "language_loss": 0.69364876, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.7705543, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11187744, + "step": 9404, + "time_per_iteration": 2.5076894760131836 + }, + { + "auxiliary_loss_clip": 0.06414986, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01258156, + "epoch": 0.5654591913422516, + "flos": 14543517939840.0, + "grad_norm": 1.937007916536723, + "language_loss": 0.6753332, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.75217164, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 9405, + "time_per_iteration": 2.4678986072540283 + }, + { + "auxiliary_loss_clip": 0.06417301, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.0627932, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5655193145949196, + "flos": 26216484307200.0, + "grad_norm": 1.7078210782531607, + "language_loss": 0.74488431, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.82174826, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10180664, + "step": 9406, + "time_per_iteration": 2.5344419479370117 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06274547, + "balance_loss_mlp": 0.01258101, + "epoch": 0.5655794378475876, + "flos": 25053669671040.0, + "grad_norm": 1.6572482823915473, + "language_loss": 0.80165344, + "learning_rate": 1.673732740698882e-06, + "loss": 0.87858582, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11901855, + "step": 9407, + "time_per_iteration": 2.5318515300750732 + }, + { + "auxiliary_loss_clip": 0.06414818, + "auxiliary_loss_mlp": 0.01281674, + "balance_loss_clip": 0.06276679, + "balance_loss_mlp": 0.01270641, + "epoch": 0.5656395611002555, + "flos": 31041379710720.0, + "grad_norm": 1.3106223538314048, + "language_loss": 0.71445584, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.79142082, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1104126, + "step": 9408, + "time_per_iteration": 2.6315321922302246 + }, + { + "auxiliary_loss_clip": 0.06416275, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06275165, + "balance_loss_mlp": 0.01262151, + "epoch": 0.5656996843529235, + "flos": 20235650302080.0, + "grad_norm": 1.8647463769564316, + "language_loss": 0.81496549, + "learning_rate": 1.672964276570308e-06, + "loss": 0.89185899, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.109375, + "step": 9409, + "time_per_iteration": 2.4874367713928223 + }, + { + "auxiliary_loss_clip": 0.06420213, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01261953, + "epoch": 0.5657598076055914, + "flos": 21002595523200.0, + "grad_norm": 1.5982364261864173, + "language_loss": 0.78488803, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.86182165, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11187744, + "step": 9410, + "time_per_iteration": 2.568018913269043 + }, + { + "auxiliary_loss_clip": 0.06420635, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06277133, + "balance_loss_mlp": 0.01258607, + "epoch": 0.5658199308582594, + "flos": 11550690132480.0, + "grad_norm": 1.9303419986806551, + "language_loss": 0.83679706, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.91369963, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11016846, + "step": 9411, + "time_per_iteration": 2.4616551399230957 + }, + { + "auxiliary_loss_clip": 0.06428169, + "auxiliary_loss_mlp": 0.01269272, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01256725, + "epoch": 0.5658800541109275, + "flos": 14177137962240.0, + "grad_norm": 2.370687982223235, + "language_loss": 0.67829227, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.75526661, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12548828, + "step": 9412, + "time_per_iteration": 2.5216641426086426 + }, + { + "auxiliary_loss_clip": 0.06415425, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06277046, + "balance_loss_mlp": 0.01258488, + "epoch": 0.5659401773635954, + "flos": 27311934660480.0, + "grad_norm": 1.581889394574198, + "language_loss": 0.58742762, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.6642642, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09741211, + "step": 9413, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.06415551, + "auxiliary_loss_mlp": 0.01265095, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01254294, + "epoch": 0.5660003006162634, + "flos": 16733957448960.0, + "grad_norm": 2.47913455673049, + "language_loss": 0.69196904, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.76877546, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10791016, + "step": 9414, + "time_per_iteration": 3.924028158187866 + }, + { + "auxiliary_loss_clip": 0.0641676, + "auxiliary_loss_mlp": 0.01269168, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01258475, + "epoch": 0.5660604238689313, + "flos": 21659983130880.0, + "grad_norm": 1.6269222060357784, + "language_loss": 0.78177273, + "learning_rate": 1.670659182280247e-06, + "loss": 0.85863203, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10693359, + "step": 9415, + "time_per_iteration": 2.5426433086395264 + }, + { + "auxiliary_loss_clip": 0.06321331, + "auxiliary_loss_mlp": 0.01255911, + "balance_loss_clip": 0.06260875, + "balance_loss_mlp": 0.01254426, + "epoch": 0.5661205471215993, + "flos": 68843619884160.0, + "grad_norm": 0.6697066651048145, + "language_loss": 0.48973382, + "learning_rate": 1.670275043523822e-06, + "loss": 0.56550622, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0148468, + "step": 9416, + "time_per_iteration": 3.2625491619110107 + }, + { + "auxiliary_loss_clip": 0.06421995, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06277312, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5661806703742672, + "flos": 28629393206400.0, + "grad_norm": 1.9136616805420137, + "language_loss": 0.63439846, + "learning_rate": 1.6698909172706e-06, + "loss": 0.7112996, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11706543, + "step": 9417, + "time_per_iteration": 2.5860400199890137 + }, + { + "auxiliary_loss_clip": 0.06423697, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5662407936269352, + "flos": 21404418577920.0, + "grad_norm": 2.3766145169256485, + "language_loss": 0.6936692, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.77059871, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.1184082, + "step": 9418, + "time_per_iteration": 3.955557346343994 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06276925, + "balance_loss_mlp": 0.01255261, + "epoch": 0.5663009168796032, + "flos": 25666054836480.0, + "grad_norm": 1.7349550199621107, + "language_loss": 0.65210938, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.72899818, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.12219238, + "step": 9419, + "time_per_iteration": 2.5426688194274902 + }, + { + "auxiliary_loss_clip": 0.06328249, + "auxiliary_loss_mlp": 0.01252694, + "balance_loss_clip": 0.06267616, + "balance_loss_mlp": 0.01251344, + "epoch": 0.5663610401322712, + "flos": 67953014835840.0, + "grad_norm": 0.7058455662611458, + "language_loss": 0.59640646, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.67221588, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01351929, + "step": 9420, + "time_per_iteration": 3.2174880504608154 + }, + { + "auxiliary_loss_clip": 0.064177, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255235, + "epoch": 0.5664211633849391, + "flos": 24616487393280.0, + "grad_norm": 1.6106095517088517, + "language_loss": 0.74370563, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.82053804, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10308838, + "step": 9421, + "time_per_iteration": 2.5415146350860596 + }, + { + "auxiliary_loss_clip": 0.06425875, + "auxiliary_loss_mlp": 0.0127111, + "balance_loss_clip": 0.0627939, + "balance_loss_mlp": 0.0125941, + "epoch": 0.5664812866376071, + "flos": 11652407389440.0, + "grad_norm": 1.8136120935488778, + "language_loss": 0.73536521, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.81233501, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11700439, + "step": 9422, + "time_per_iteration": 2.4822769165039062 + }, + { + "auxiliary_loss_clip": 0.06420115, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06278713, + "balance_loss_mlp": 0.01253355, + "epoch": 0.566541409890275, + "flos": 24650798440320.0, + "grad_norm": 1.7038149529307767, + "language_loss": 0.8178972, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.89473832, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10656738, + "step": 9423, + "time_per_iteration": 4.039041519165039 + }, + { + "auxiliary_loss_clip": 0.06420702, + "auxiliary_loss_mlp": 0.01272474, + "balance_loss_clip": 0.06276573, + "balance_loss_mlp": 0.01260392, + "epoch": 0.566601533142943, + "flos": 22276686781440.0, + "grad_norm": 2.1916345423108092, + "language_loss": 0.81182116, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.88875294, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.12091064, + "step": 9424, + "time_per_iteration": 2.6186363697052 + }, + { + "auxiliary_loss_clip": 0.06424181, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01254788, + "epoch": 0.5666616563956111, + "flos": 29979485717760.0, + "grad_norm": 1.8421028893936136, + "language_loss": 0.79108143, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.86799419, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1229248, + "step": 9425, + "time_per_iteration": 2.6103405952453613 + }, + { + "auxiliary_loss_clip": 0.06423585, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06280398, + "balance_loss_mlp": 0.01254958, + "epoch": 0.566721779648279, + "flos": 17786585566080.0, + "grad_norm": 1.8792171756054583, + "language_loss": 0.59002221, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.66692609, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11853027, + "step": 9426, + "time_per_iteration": 2.5017449855804443 + }, + { + "auxiliary_loss_clip": 0.06425668, + "auxiliary_loss_mlp": 0.01271587, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01259881, + "epoch": 0.566781902900947, + "flos": 21039967244160.0, + "grad_norm": 1.8634987355301997, + "language_loss": 0.82228333, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.89925593, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1171875, + "step": 9427, + "time_per_iteration": 2.565479040145874 + }, + { + "auxiliary_loss_clip": 0.06418218, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257853, + "epoch": 0.5668420261536149, + "flos": 23155244040960.0, + "grad_norm": 1.8170517561621367, + "language_loss": 0.86107284, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.93794018, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10662842, + "step": 9428, + "time_per_iteration": 2.5440726280212402 + }, + { + "auxiliary_loss_clip": 0.06425078, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01254549, + "epoch": 0.5669021494062829, + "flos": 22608210660480.0, + "grad_norm": 1.979218692390264, + "language_loss": 0.74058932, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.81750262, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11700439, + "step": 9429, + "time_per_iteration": 2.5536460876464844 + }, + { + "auxiliary_loss_clip": 0.06425272, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06277645, + "balance_loss_mlp": 0.01252943, + "epoch": 0.5669622726589508, + "flos": 17386481520000.0, + "grad_norm": 1.7940156011993331, + "language_loss": 0.75663137, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.8335309, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11724854, + "step": 9430, + "time_per_iteration": 3.9432384967803955 + }, + { + "auxiliary_loss_clip": 0.06418042, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01254498, + "epoch": 0.5670223959116188, + "flos": 18767992112640.0, + "grad_norm": 1.7725274526585868, + "language_loss": 0.73046589, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.80729836, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10705566, + "step": 9431, + "time_per_iteration": 2.4891881942749023 + }, + { + "auxiliary_loss_clip": 0.06413169, + "auxiliary_loss_mlp": 0.01269495, + "balance_loss_clip": 0.06278919, + "balance_loss_mlp": 0.0125907, + "epoch": 0.5670825191642868, + "flos": 13558463740800.0, + "grad_norm": 1.5232840780961514, + "language_loss": 0.7352109, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.81203753, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10424805, + "step": 9432, + "time_per_iteration": 2.539503812789917 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.0627542, + "balance_loss_mlp": 0.01254914, + "epoch": 0.5671426424169548, + "flos": 22060506447360.0, + "grad_norm": 1.4799006758092328, + "language_loss": 0.78516906, + "learning_rate": 1.663746609539197e-06, + "loss": 0.86202025, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11169434, + "step": 9433, + "time_per_iteration": 2.5004031658172607 + }, + { + "auxiliary_loss_clip": 0.06427075, + "auxiliary_loss_mlp": 0.01270712, + "balance_loss_clip": 0.06279536, + "balance_loss_mlp": 0.01257569, + "epoch": 0.5672027656696227, + "flos": 21330262114560.0, + "grad_norm": 1.7709414309866778, + "language_loss": 0.63719839, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.71417624, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.13134766, + "step": 9434, + "time_per_iteration": 2.5424575805664062 + }, + { + "auxiliary_loss_clip": 0.06413743, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274401, + "balance_loss_mlp": 0.01257188, + "epoch": 0.5672628889222907, + "flos": 23520869331840.0, + "grad_norm": 1.9335938837076005, + "language_loss": 0.66754067, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.74435163, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10162354, + "step": 9435, + "time_per_iteration": 2.5177414417266846 + }, + { + "auxiliary_loss_clip": 0.06416117, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.01257333, + "epoch": 0.5673230121749586, + "flos": 27128639854080.0, + "grad_norm": 1.3319121805553942, + "language_loss": 0.71799958, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.79484463, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11053467, + "step": 9436, + "time_per_iteration": 2.6037702560424805 + }, + { + "auxiliary_loss_clip": 0.06424177, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276658, + "balance_loss_mlp": 0.01254548, + "epoch": 0.5673831354276266, + "flos": 31150476126720.0, + "grad_norm": 1.399584944388347, + "language_loss": 0.7441892, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.82109791, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12145996, + "step": 9437, + "time_per_iteration": 2.5982627868652344 + }, + { + "auxiliary_loss_clip": 0.0642609, + "auxiliary_loss_mlp": 0.01270521, + "balance_loss_clip": 0.06280209, + "balance_loss_mlp": 0.01258719, + "epoch": 0.5674432586802945, + "flos": 27680662552320.0, + "grad_norm": 1.8153515221603815, + "language_loss": 0.61647224, + "learning_rate": 1.661827179985277e-06, + "loss": 0.69343835, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11798096, + "step": 9438, + "time_per_iteration": 2.6188385486602783 + }, + { + "auxiliary_loss_clip": 0.0642384, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01252935, + "epoch": 0.5675033819329626, + "flos": 26622458138880.0, + "grad_norm": 1.4984637138093548, + "language_loss": 0.75628054, + "learning_rate": 1.661443332486909e-06, + "loss": 0.83315879, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11053467, + "step": 9439, + "time_per_iteration": 2.5383174419403076 + }, + { + "auxiliary_loss_clip": 0.06420992, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06280455, + "balance_loss_mlp": 0.0125798, + "epoch": 0.5675635051856306, + "flos": 19104295674240.0, + "grad_norm": 1.7526345830300347, + "language_loss": 0.8402319, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.91714221, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1206665, + "step": 9440, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06425986, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06275898, + "balance_loss_mlp": 0.01255393, + "epoch": 0.5676236284382985, + "flos": 17572040386560.0, + "grad_norm": 2.304829714160468, + "language_loss": 0.75825876, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.83519483, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12231445, + "step": 9441, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.0641818, + "auxiliary_loss_mlp": 0.0126441, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5676837516909665, + "flos": 15958375257600.0, + "grad_norm": 1.9240949658540871, + "language_loss": 0.83086008, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.907686, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9442, + "time_per_iteration": 2.53488826751709 + }, + { + "auxiliary_loss_clip": 0.06416862, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279622, + "balance_loss_mlp": 0.01258543, + "epoch": 0.5677438749436344, + "flos": 18301739667840.0, + "grad_norm": 1.8387898612646743, + "language_loss": 0.74695265, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.82381237, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10571289, + "step": 9443, + "time_per_iteration": 2.4844577312469482 + }, + { + "auxiliary_loss_clip": 0.06418682, + "auxiliary_loss_mlp": 0.01270397, + "balance_loss_clip": 0.06275757, + "balance_loss_mlp": 0.01258947, + "epoch": 0.5678039981963025, + "flos": 17937120625920.0, + "grad_norm": 2.224999400227568, + "language_loss": 0.77901411, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.85590482, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11450195, + "step": 9444, + "time_per_iteration": 2.5525596141815186 + }, + { + "auxiliary_loss_clip": 0.06428226, + "auxiliary_loss_mlp": 0.01266607, + "balance_loss_clip": 0.06281613, + "balance_loss_mlp": 0.01255228, + "epoch": 0.5678641214489704, + "flos": 19322153089920.0, + "grad_norm": 1.7258632756557413, + "language_loss": 0.81218302, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.88913137, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11376953, + "step": 9445, + "time_per_iteration": 2.501241683959961 + }, + { + "auxiliary_loss_clip": 0.06419222, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06275924, + "balance_loss_mlp": 0.01255548, + "epoch": 0.5679242447016384, + "flos": 27759389063040.0, + "grad_norm": 1.2498061463372896, + "language_loss": 0.71243447, + "learning_rate": 1.658756760280259e-06, + "loss": 0.78928661, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10443115, + "step": 9446, + "time_per_iteration": 2.6276121139526367 + }, + { + "auxiliary_loss_clip": 0.06425235, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06276199, + "balance_loss_mlp": 0.01257277, + "epoch": 0.5679843679543063, + "flos": 23775888833280.0, + "grad_norm": 1.7407480451238082, + "language_loss": 0.73674792, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.81369138, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11828613, + "step": 9447, + "time_per_iteration": 2.5189285278320312 + }, + { + "auxiliary_loss_clip": 0.06428251, + "auxiliary_loss_mlp": 0.01269652, + "balance_loss_clip": 0.06280248, + "balance_loss_mlp": 0.01257272, + "epoch": 0.5680444912069743, + "flos": 25598732480640.0, + "grad_norm": 1.8734928972182148, + "language_loss": 0.75381124, + "learning_rate": 1.657989284462725e-06, + "loss": 0.83079028, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1239624, + "step": 9448, + "time_per_iteration": 2.5984859466552734 + }, + { + "auxiliary_loss_clip": 0.06428179, + "auxiliary_loss_mlp": 0.01269794, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01258415, + "epoch": 0.5681046144596422, + "flos": 23702528983680.0, + "grad_norm": 2.0524228921166556, + "language_loss": 0.76618403, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.84316373, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1137085, + "step": 9449, + "time_per_iteration": 2.515456438064575 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5681647377123102, + "flos": 28008161435520.0, + "grad_norm": 1.4260887566171934, + "language_loss": 0.74914038, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.82607877, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11425781, + "step": 9450, + "time_per_iteration": 2.5997612476348877 + }, + { + "auxiliary_loss_clip": 0.06425043, + "auxiliary_loss_mlp": 0.01263493, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01252526, + "epoch": 0.5682248609649782, + "flos": 22754427235200.0, + "grad_norm": 1.6712621343134006, + "language_loss": 0.66650134, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.74338675, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10974121, + "step": 9451, + "time_per_iteration": 2.5041069984436035 + }, + { + "auxiliary_loss_clip": 0.06437647, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06282589, + "balance_loss_mlp": 0.01255126, + "epoch": 0.5682849842176462, + "flos": 21295070599680.0, + "grad_norm": 1.8399857372619135, + "language_loss": 0.72354877, + "learning_rate": 1.656454488573026e-06, + "loss": 0.80061138, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1348877, + "step": 9452, + "time_per_iteration": 2.529772996902466 + }, + { + "auxiliary_loss_clip": 0.06419612, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5683451074703142, + "flos": 21147973557120.0, + "grad_norm": 1.3918203076927713, + "language_loss": 0.70862073, + "learning_rate": 1.656070822132428e-06, + "loss": 0.78546906, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11419678, + "step": 9453, + "time_per_iteration": 3.975252151489258 + }, + { + "auxiliary_loss_clip": 0.06420393, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255001, + "epoch": 0.5684052307229821, + "flos": 22350759390720.0, + "grad_norm": 1.7444047953592532, + "language_loss": 0.70346195, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.78032023, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10443115, + "step": 9454, + "time_per_iteration": 2.530397415161133 + }, + { + "auxiliary_loss_clip": 0.06417777, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06276377, + "balance_loss_mlp": 0.01255572, + "epoch": 0.5684653539756501, + "flos": 21805067675520.0, + "grad_norm": 2.3221034941278256, + "language_loss": 0.6090889, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.68592238, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10003662, + "step": 9455, + "time_per_iteration": 2.5284998416900635 + }, + { + "auxiliary_loss_clip": 0.06432047, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06281373, + "balance_loss_mlp": 0.01259144, + "epoch": 0.568525477228318, + "flos": 23005757157120.0, + "grad_norm": 1.7024948062012655, + "language_loss": 0.73315781, + "learning_rate": 1.6549199011198e-06, + "loss": 0.81018651, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11669922, + "step": 9456, + "time_per_iteration": 2.5266809463500977 + }, + { + "auxiliary_loss_clip": 0.06419168, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_clip": 0.06275652, + "balance_loss_mlp": 0.01254771, + "epoch": 0.568585600480986, + "flos": 21398045667840.0, + "grad_norm": 1.7476092517075434, + "language_loss": 0.77197653, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.84882128, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10534668, + "step": 9457, + "time_per_iteration": 2.6098482608795166 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01255969, + "epoch": 0.568645723733654, + "flos": 30015054576000.0, + "grad_norm": 1.8479320449106564, + "language_loss": 0.6697377, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.74666172, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11749268, + "step": 9458, + "time_per_iteration": 4.003401756286621 + }, + { + "auxiliary_loss_clip": 0.06424286, + "auxiliary_loss_mlp": 0.01264614, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01253295, + "epoch": 0.568705846986322, + "flos": 20418945108480.0, + "grad_norm": 2.1992346625709427, + "language_loss": 0.68311954, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.76000857, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11315918, + "step": 9459, + "time_per_iteration": 2.5213470458984375 + }, + { + "auxiliary_loss_clip": 0.06427266, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01253756, + "epoch": 0.5687659702389899, + "flos": 17462440846080.0, + "grad_norm": 2.588089844490271, + "language_loss": 0.77003014, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.84695148, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11102295, + "step": 9460, + "time_per_iteration": 2.5016860961914062 + }, + { + "auxiliary_loss_clip": 0.06424034, + "auxiliary_loss_mlp": 0.01270464, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258865, + "epoch": 0.5688260934916579, + "flos": 25412335073280.0, + "grad_norm": 1.5686079353810067, + "language_loss": 0.72504562, + "learning_rate": 1.65300196133547e-06, + "loss": 0.80199063, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9461, + "time_per_iteration": 2.652650833129883 + }, + { + "auxiliary_loss_clip": 0.06420281, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06276302, + "balance_loss_mlp": 0.01254707, + "epoch": 0.5688862167443258, + "flos": 21613052044800.0, + "grad_norm": 1.8456676032626356, + "language_loss": 0.73588586, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.81274414, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10839844, + "step": 9462, + "time_per_iteration": 3.9915239810943604 + }, + { + "auxiliary_loss_clip": 0.06414893, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06275715, + "balance_loss_mlp": 0.01254715, + "epoch": 0.5689463399969938, + "flos": 22425544759680.0, + "grad_norm": 2.0067901163228212, + "language_loss": 0.72924364, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.80604076, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10107422, + "step": 9463, + "time_per_iteration": 2.5026743412017822 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01266249, + "balance_loss_clip": 0.06272251, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5690064632496618, + "flos": 18302787843840.0, + "grad_norm": 1.7796234570298675, + "language_loss": 0.7436375, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.82046998, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11126709, + "step": 9464, + "time_per_iteration": 2.5418522357940674 + }, + { + "auxiliary_loss_clip": 0.06420638, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01253169, + "epoch": 0.5690665865023298, + "flos": 21585575105280.0, + "grad_norm": 1.531985348456469, + "language_loss": 0.84518385, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.92204237, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12060547, + "step": 9465, + "time_per_iteration": 2.501640558242798 + }, + { + "auxiliary_loss_clip": 0.06416291, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01251954, + "epoch": 0.5691267097549978, + "flos": 24427616290560.0, + "grad_norm": 1.5399864144711508, + "language_loss": 0.72636294, + "learning_rate": 1.651084350506125e-06, + "loss": 0.80315161, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10638428, + "step": 9466, + "time_per_iteration": 2.5872812271118164 + }, + { + "auxiliary_loss_clip": 0.06322309, + "auxiliary_loss_mlp": 0.01252779, + "balance_loss_clip": 0.06261392, + "balance_loss_mlp": 0.01251253, + "epoch": 0.5691868330076657, + "flos": 61679915389440.0, + "grad_norm": 0.706168287542021, + "language_loss": 0.55225098, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.62800181, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01525879, + "step": 9467, + "time_per_iteration": 3.1809115409851074 + }, + { + "auxiliary_loss_clip": 0.06421535, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06275938, + "balance_loss_mlp": 0.01253471, + "epoch": 0.5692469562603337, + "flos": 21331687633920.0, + "grad_norm": 1.821723086609738, + "language_loss": 0.64103729, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.717906, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11865234, + "step": 9468, + "time_per_iteration": 2.5419483184814453 + }, + { + "auxiliary_loss_clip": 0.06420718, + "auxiliary_loss_mlp": 0.01268612, + "balance_loss_clip": 0.06276828, + "balance_loss_mlp": 0.01257508, + "epoch": 0.5693070795130016, + "flos": 23374652757120.0, + "grad_norm": 2.0216455322076885, + "language_loss": 0.79510915, + "learning_rate": 1.64993394266317e-06, + "loss": 0.87200236, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11102295, + "step": 9469, + "time_per_iteration": 3.974965810775757 + }, + { + "auxiliary_loss_clip": 0.06424933, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275818, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5693672027656697, + "flos": 18703143452160.0, + "grad_norm": 1.8253898689046395, + "language_loss": 0.69934285, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.77626961, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11608887, + "step": 9470, + "time_per_iteration": 2.490144729614258 + }, + { + "auxiliary_loss_clip": 0.06418116, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254391, + "epoch": 0.5694273260183376, + "flos": 20455478288640.0, + "grad_norm": 2.1472118271494574, + "language_loss": 0.75247335, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.82931614, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11785889, + "step": 9471, + "time_per_iteration": 2.5518500804901123 + }, + { + "auxiliary_loss_clip": 0.06417546, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01255616, + "epoch": 0.5694874492710056, + "flos": 17608992837120.0, + "grad_norm": 1.6827496814774499, + "language_loss": 0.57877314, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.65561181, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10705566, + "step": 9472, + "time_per_iteration": 2.535846710205078 + }, + { + "auxiliary_loss_clip": 0.06416848, + "auxiliary_loss_mlp": 0.01268789, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.01257411, + "epoch": 0.5695475725236735, + "flos": 13375923621120.0, + "grad_norm": 1.7815747768820038, + "language_loss": 0.73987466, + "learning_rate": 1.648400251450638e-06, + "loss": 0.81673104, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11376953, + "step": 9473, + "time_per_iteration": 2.4858133792877197 + }, + { + "auxiliary_loss_clip": 0.06327727, + "auxiliary_loss_mlp": 0.01252353, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01250914, + "epoch": 0.5696076957763415, + "flos": 68195078881920.0, + "grad_norm": 0.6484051468543478, + "language_loss": 0.57388628, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.64968711, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01437378, + "step": 9474, + "time_per_iteration": 3.1554436683654785 + }, + { + "auxiliary_loss_clip": 0.06415011, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06274811, + "balance_loss_mlp": 0.01254111, + "epoch": 0.5696678190290094, + "flos": 33846636153600.0, + "grad_norm": 1.6105466561987234, + "language_loss": 0.54358017, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.62037987, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 9475, + "time_per_iteration": 2.6193020343780518 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.01252234, + "epoch": 0.5697279422816774, + "flos": 26363329787520.0, + "grad_norm": 2.008545727860435, + "language_loss": 0.79765999, + "learning_rate": 1.647250122983675e-06, + "loss": 0.87448931, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11383057, + "step": 9476, + "time_per_iteration": 2.543100595474243 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01260209, + "epoch": 0.5697880655343454, + "flos": 22937260844160.0, + "grad_norm": 1.735529425276041, + "language_loss": 0.66121185, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.73820853, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11407471, + "step": 9477, + "time_per_iteration": 2.5366005897521973 + }, + { + "auxiliary_loss_clip": 0.06423311, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06277082, + "balance_loss_mlp": 0.0125553, + "epoch": 0.5698481887870134, + "flos": 26768674713600.0, + "grad_norm": 1.6190739346076362, + "language_loss": 0.71115196, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.78804839, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1081543, + "step": 9478, + "time_per_iteration": 2.5513012409210205 + }, + { + "auxiliary_loss_clip": 0.06415288, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06277218, + "balance_loss_mlp": 0.01255718, + "epoch": 0.5699083120396814, + "flos": 15747729292800.0, + "grad_norm": 1.4794360727515914, + "language_loss": 0.69306439, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.76988363, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10913086, + "step": 9479, + "time_per_iteration": 2.5828471183776855 + }, + { + "auxiliary_loss_clip": 0.06413876, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.0125734, + "epoch": 0.5699684352923493, + "flos": 19543448522880.0, + "grad_norm": 1.5013072139655574, + "language_loss": 0.71621788, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.79303229, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10223389, + "step": 9480, + "time_per_iteration": 2.5247299671173096 + }, + { + "auxiliary_loss_clip": 0.06418922, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01255319, + "epoch": 0.5700285585450173, + "flos": 16258942252800.0, + "grad_norm": 4.885605743124815, + "language_loss": 0.72444856, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.80130869, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11779785, + "step": 9481, + "time_per_iteration": 2.508589506149292 + }, + { + "auxiliary_loss_clip": 0.06421519, + "auxiliary_loss_mlp": 0.01270221, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01258461, + "epoch": 0.5700886817976852, + "flos": 19871115114240.0, + "grad_norm": 1.897422682992244, + "language_loss": 0.78625083, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.86316824, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11761475, + "step": 9482, + "time_per_iteration": 2.5139269828796387 + }, + { + "auxiliary_loss_clip": 0.06417527, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01253242, + "epoch": 0.5701488050503533, + "flos": 23848452069120.0, + "grad_norm": 2.496783055499815, + "language_loss": 0.78338385, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.86019731, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 9483, + "time_per_iteration": 2.547522783279419 + }, + { + "auxiliary_loss_clip": 0.06420138, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06276282, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5702089283030212, + "flos": 23666457000960.0, + "grad_norm": 1.5289248173251733, + "language_loss": 0.81642497, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.89326739, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10424805, + "step": 9484, + "time_per_iteration": 2.546597719192505 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06277504, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5702690515556892, + "flos": 27898519968000.0, + "grad_norm": 1.8682928794178455, + "language_loss": 0.61101806, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.68790221, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11560059, + "step": 9485, + "time_per_iteration": 2.5931575298309326 + }, + { + "auxiliary_loss_clip": 0.06421611, + "auxiliary_loss_mlp": 0.01267401, + "balance_loss_clip": 0.06277725, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5703291748083571, + "flos": 24030698699520.0, + "grad_norm": 1.7282499785723824, + "language_loss": 0.65970731, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.73659742, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11645508, + "step": 9486, + "time_per_iteration": 2.546604871749878 + }, + { + "auxiliary_loss_clip": 0.06330933, + "auxiliary_loss_mlp": 0.01257137, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01255769, + "epoch": 0.5703892980610251, + "flos": 57044478067200.0, + "grad_norm": 0.6556389442355417, + "language_loss": 0.47978726, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.55566794, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01370239, + "step": 9487, + "time_per_iteration": 3.216449499130249 + }, + { + "auxiliary_loss_clip": 0.06419921, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.06275571, + "balance_loss_mlp": 0.01255212, + "epoch": 0.570449421313693, + "flos": 24357610604160.0, + "grad_norm": 1.4009858057112485, + "language_loss": 0.8597424, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.93660462, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11102295, + "step": 9488, + "time_per_iteration": 2.5608506202697754 + }, + { + "auxiliary_loss_clip": 0.06428364, + "auxiliary_loss_mlp": 0.01270308, + "balance_loss_clip": 0.06281118, + "balance_loss_mlp": 0.01259055, + "epoch": 0.570509544566361, + "flos": 24835770328320.0, + "grad_norm": 1.8825828159705935, + "language_loss": 0.79195142, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.86893809, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11260986, + "step": 9489, + "time_per_iteration": 2.553471088409424 + }, + { + "auxiliary_loss_clip": 0.06419341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06276694, + "balance_loss_mlp": 0.01253646, + "epoch": 0.570569667819029, + "flos": 21403663891200.0, + "grad_norm": 1.6360729178743676, + "language_loss": 0.7047472, + "learning_rate": 1.641884454927604e-06, + "loss": 0.78158057, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10345459, + "step": 9490, + "time_per_iteration": 2.5905275344848633 + }, + { + "auxiliary_loss_clip": 0.06421432, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06279342, + "balance_loss_mlp": 0.01257803, + "epoch": 0.570629791071697, + "flos": 23222608323840.0, + "grad_norm": 1.4492809017584538, + "language_loss": 0.76252091, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.83942628, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11291504, + "step": 9491, + "time_per_iteration": 2.523472309112549 + }, + { + "auxiliary_loss_clip": 0.06328943, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06268945, + "balance_loss_mlp": 0.01261694, + "epoch": 0.570689914324365, + "flos": 65303632915200.0, + "grad_norm": 0.7890932915341226, + "language_loss": 0.57371008, + "learning_rate": 1.641118147266011e-06, + "loss": 0.64963466, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01817322, + "step": 9492, + "time_per_iteration": 4.556811571121216 + }, + { + "auxiliary_loss_clip": 0.06420883, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.0627829, + "balance_loss_mlp": 0.01255809, + "epoch": 0.5707500375770329, + "flos": 21148225119360.0, + "grad_norm": 2.4823752626433357, + "language_loss": 0.71714401, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.79402137, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.1104126, + "step": 9493, + "time_per_iteration": 2.5404999256134033 + }, + { + "auxiliary_loss_clip": 0.06425234, + "auxiliary_loss_mlp": 0.01270244, + "balance_loss_clip": 0.06277438, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5708101608297009, + "flos": 20818881446400.0, + "grad_norm": 1.6649189140980358, + "language_loss": 0.77940559, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.85636032, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11431885, + "step": 9494, + "time_per_iteration": 2.5486340522766113 + }, + { + "auxiliary_loss_clip": 0.06427161, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276955, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5708702840823688, + "flos": 25819482862080.0, + "grad_norm": 2.058789415113096, + "language_loss": 0.80377084, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.88071406, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12127686, + "step": 9495, + "time_per_iteration": 2.5960187911987305 + }, + { + "auxiliary_loss_clip": 0.06429706, + "auxiliary_loss_mlp": 0.01275013, + "balance_loss_clip": 0.06277497, + "balance_loss_mlp": 0.01261567, + "epoch": 0.5709304073350369, + "flos": 23657400760320.0, + "grad_norm": 1.9375866549540641, + "language_loss": 0.66475153, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.74179876, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13446045, + "step": 9496, + "time_per_iteration": 2.536844253540039 + }, + { + "auxiliary_loss_clip": 0.06424591, + "auxiliary_loss_mlp": 0.0126837, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256485, + "epoch": 0.5709905305877048, + "flos": 16113144948480.0, + "grad_norm": 2.1097086993227068, + "language_loss": 0.70119512, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.77812475, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11877441, + "step": 9497, + "time_per_iteration": 2.5001566410064697 + }, + { + "auxiliary_loss_clip": 0.06421457, + "auxiliary_loss_mlp": 0.01273203, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01261455, + "epoch": 0.5710506538403728, + "flos": 24757211525760.0, + "grad_norm": 5.203790092819982, + "language_loss": 0.81695306, + "learning_rate": 1.638819551358182e-06, + "loss": 0.89389962, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11755371, + "step": 9498, + "time_per_iteration": 3.979785203933716 + }, + { + "auxiliary_loss_clip": 0.06421061, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.0125707, + "epoch": 0.5711107770930407, + "flos": 21988907533440.0, + "grad_norm": 1.778867640796668, + "language_loss": 0.66763413, + "learning_rate": 1.638436499891469e-06, + "loss": 0.74453306, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 9499, + "time_per_iteration": 2.560131788253784 + }, + { + "auxiliary_loss_clip": 0.06422064, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255432, + "epoch": 0.5711709003457087, + "flos": 19580233265280.0, + "grad_norm": 1.5461706893268885, + "language_loss": 0.71884078, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.79573303, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11743164, + "step": 9500, + "time_per_iteration": 2.51857852935791 + }, + { + "auxiliary_loss_clip": 0.06426705, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06277592, + "balance_loss_mlp": 0.01257893, + "epoch": 0.5712310235983766, + "flos": 24249436583040.0, + "grad_norm": 1.9132916799477426, + "language_loss": 0.76773643, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.8447088, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.12640381, + "step": 9501, + "time_per_iteration": 2.585303544998169 + }, + { + "auxiliary_loss_clip": 0.06424866, + "auxiliary_loss_mlp": 0.01265647, + "balance_loss_clip": 0.06278552, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5712911468510447, + "flos": 21002469742080.0, + "grad_norm": 1.6366629976038132, + "language_loss": 0.75004148, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.82694662, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11419678, + "step": 9502, + "time_per_iteration": 3.9893364906311035 + }, + { + "auxiliary_loss_clip": 0.06420161, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0627653, + "balance_loss_mlp": 0.01256561, + "epoch": 0.5713512701037126, + "flos": 18923055292800.0, + "grad_norm": 1.7156142062685982, + "language_loss": 0.82350051, + "learning_rate": 1.636904431275105e-06, + "loss": 0.90037596, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10827637, + "step": 9503, + "time_per_iteration": 2.5289459228515625 + }, + { + "auxiliary_loss_clip": 0.06420251, + "auxiliary_loss_mlp": 0.01271521, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5714113933563806, + "flos": 17417983455360.0, + "grad_norm": 2.1350982520901827, + "language_loss": 0.86264861, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.93956631, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1114502, + "step": 9504, + "time_per_iteration": 2.5180015563964844 + }, + { + "auxiliary_loss_clip": 0.06417073, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5714715166090486, + "flos": 20199536392320.0, + "grad_norm": 2.0316869593340265, + "language_loss": 0.75480437, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.83164144, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10980225, + "step": 9505, + "time_per_iteration": 2.497009754180908 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256419, + "epoch": 0.5715316398617165, + "flos": 18557597710080.0, + "grad_norm": 1.6474042198541896, + "language_loss": 0.82215714, + "learning_rate": 1.635755524332509e-06, + "loss": 0.89902395, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1151123, + "step": 9506, + "time_per_iteration": 2.5657498836517334 + }, + { + "auxiliary_loss_clip": 0.06418438, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01254568, + "epoch": 0.5715917631143845, + "flos": 18484028225280.0, + "grad_norm": 1.482727560680873, + "language_loss": 0.77285796, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.84969354, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10546875, + "step": 9507, + "time_per_iteration": 2.485496997833252 + }, + { + "auxiliary_loss_clip": 0.06422855, + "auxiliary_loss_mlp": 0.01269089, + "balance_loss_clip": 0.06276034, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5716518863670524, + "flos": 24026128652160.0, + "grad_norm": 1.4323391248104125, + "language_loss": 0.68799454, + "learning_rate": 1.63498965540751e-06, + "loss": 0.76491398, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12005615, + "step": 9508, + "time_per_iteration": 2.5643258094787598 + }, + { + "auxiliary_loss_clip": 0.06422228, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.012529, + "epoch": 0.5717120096197205, + "flos": 17824879681920.0, + "grad_norm": 2.05386002816889, + "language_loss": 0.80054557, + "learning_rate": 1.634606741699593e-06, + "loss": 0.87741685, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11987305, + "step": 9509, + "time_per_iteration": 3.8947436809539795 + }, + { + "auxiliary_loss_clip": 0.06415324, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5717721328723884, + "flos": 21871551490560.0, + "grad_norm": 1.798702817725972, + "language_loss": 0.72265553, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.79946876, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9510, + "time_per_iteration": 2.496246099472046 + }, + { + "auxiliary_loss_clip": 0.06419715, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.0627699, + "balance_loss_mlp": 0.01255946, + "epoch": 0.5718322561250564, + "flos": 28444924442880.0, + "grad_norm": 1.3126461366590796, + "language_loss": 0.69652188, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.77338743, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10906982, + "step": 9511, + "time_per_iteration": 2.5713541507720947 + }, + { + "auxiliary_loss_clip": 0.06420782, + "auxiliary_loss_mlp": 0.01268426, + "balance_loss_clip": 0.06277648, + "balance_loss_mlp": 0.01257136, + "epoch": 0.5718923793777243, + "flos": 13556702805120.0, + "grad_norm": 2.0681515910732715, + "language_loss": 0.61827439, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.69516647, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.112854, + "step": 9512, + "time_per_iteration": 2.49580454826355 + }, + { + "auxiliary_loss_clip": 0.06421502, + "auxiliary_loss_mlp": 0.01268423, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01257498, + "epoch": 0.5719525026303923, + "flos": 17827856501760.0, + "grad_norm": 2.3676523534955685, + "language_loss": 0.76396298, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.84086221, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10919189, + "step": 9513, + "time_per_iteration": 2.500870704650879 + }, + { + "auxiliary_loss_clip": 0.06326592, + "auxiliary_loss_mlp": 0.01253708, + "balance_loss_clip": 0.06266873, + "balance_loss_mlp": 0.01252076, + "epoch": 0.5720126258830602, + "flos": 61314724097280.0, + "grad_norm": 0.891161207726192, + "language_loss": 0.66879886, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.74460191, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01634216, + "step": 9514, + "time_per_iteration": 3.1455137729644775 + }, + { + "auxiliary_loss_clip": 0.06430741, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01258941, + "epoch": 0.5720727491357283, + "flos": 23994878279040.0, + "grad_norm": 2.149685980416527, + "language_loss": 0.81938076, + "learning_rate": 1.63230955093099e-06, + "loss": 0.89639759, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12005615, + "step": 9515, + "time_per_iteration": 2.5996580123901367 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01267881, + "balance_loss_clip": 0.06274894, + "balance_loss_mlp": 0.01257259, + "epoch": 0.5721328723883962, + "flos": 23412359894400.0, + "grad_norm": 1.6126279146943563, + "language_loss": 0.86095083, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.93775928, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10620117, + "step": 9516, + "time_per_iteration": 2.5553810596466064 + }, + { + "auxiliary_loss_clip": 0.06417726, + "auxiliary_loss_mlp": 0.01271814, + "balance_loss_clip": 0.06275768, + "balance_loss_mlp": 0.01260572, + "epoch": 0.5721929956410642, + "flos": 18810520859520.0, + "grad_norm": 2.197571780359881, + "language_loss": 0.87770617, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.95460165, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11242676, + "step": 9517, + "time_per_iteration": 2.5858652591705322 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5722531188937322, + "flos": 27203676785280.0, + "grad_norm": 1.5341934137919409, + "language_loss": 0.85065883, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.92748272, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11273193, + "step": 9518, + "time_per_iteration": 2.5850136280059814 + }, + { + "auxiliary_loss_clip": 0.06417416, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01256044, + "epoch": 0.5723132421464001, + "flos": 15201157109760.0, + "grad_norm": 1.5672659775495308, + "language_loss": 0.78797317, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.86481655, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 9519, + "time_per_iteration": 2.5459818840026855 + }, + { + "auxiliary_loss_clip": 0.06418845, + "auxiliary_loss_mlp": 0.01271535, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01260675, + "epoch": 0.5723733653990681, + "flos": 27606757651200.0, + "grad_norm": 1.4075514987328583, + "language_loss": 0.83134615, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.90824991, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10864258, + "step": 9520, + "time_per_iteration": 2.66892671585083 + }, + { + "auxiliary_loss_clip": 0.06426139, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06279796, + "balance_loss_mlp": 0.01253022, + "epoch": 0.572433488651736, + "flos": 18228673307520.0, + "grad_norm": 1.9996427544433133, + "language_loss": 0.73064411, + "learning_rate": 1.630012862105243e-06, + "loss": 0.80754966, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11383057, + "step": 9521, + "time_per_iteration": 2.5980701446533203 + }, + { + "auxiliary_loss_clip": 0.06419297, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06276461, + "balance_loss_mlp": 0.01259073, + "epoch": 0.5724936119044041, + "flos": 31257224628480.0, + "grad_norm": 1.5867052207792396, + "language_loss": 0.77991247, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.85680634, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11022949, + "step": 9522, + "time_per_iteration": 2.5890755653381348 + }, + { + "auxiliary_loss_clip": 0.06416851, + "auxiliary_loss_mlp": 0.01267889, + "balance_loss_clip": 0.06278282, + "balance_loss_mlp": 0.01257649, + "epoch": 0.572553735157072, + "flos": 19207186888320.0, + "grad_norm": 1.441878230551161, + "language_loss": 0.72110128, + "learning_rate": 1.629247411248102e-06, + "loss": 0.79794878, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10235596, + "step": 9523, + "time_per_iteration": 2.511115789413452 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01257025, + "epoch": 0.57261385840974, + "flos": 21221249552640.0, + "grad_norm": 1.7953059857975224, + "language_loss": 0.70372975, + "learning_rate": 1.628864706900738e-06, + "loss": 0.78058219, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10339355, + "step": 9524, + "time_per_iteration": 2.507387161254883 + }, + { + "auxiliary_loss_clip": 0.0641823, + "auxiliary_loss_mlp": 0.0127028, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01259188, + "epoch": 0.5726739816624079, + "flos": 33992936582400.0, + "grad_norm": 1.3727338087163001, + "language_loss": 0.6519655, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.7288506, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11096191, + "step": 9525, + "time_per_iteration": 2.6264822483062744 + }, + { + "auxiliary_loss_clip": 0.0641274, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06272839, + "balance_loss_mlp": 0.01255842, + "epoch": 0.5727341049150759, + "flos": 24282196329600.0, + "grad_norm": 1.6388418597669483, + "language_loss": 0.72797775, + "learning_rate": 1.628099340440984e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10351562, + "step": 9526, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.06418388, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01257897, + "epoch": 0.5727942281677438, + "flos": 28407762357120.0, + "grad_norm": 1.5546981496666945, + "language_loss": 0.80170763, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.87857693, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10650635, + "step": 9527, + "time_per_iteration": 2.6143245697021484 + }, + { + "auxiliary_loss_clip": 0.06413873, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06275712, + "balance_loss_mlp": 0.01258983, + "epoch": 0.5728543514204119, + "flos": 19542861544320.0, + "grad_norm": 2.5128112924339585, + "language_loss": 0.72641492, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.8032524, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10894775, + "step": 9528, + "time_per_iteration": 2.4896552562713623 + }, + { + "auxiliary_loss_clip": 0.06418886, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.0125577, + "epoch": 0.5729144746730798, + "flos": 21513137650560.0, + "grad_norm": 1.7938485336826149, + "language_loss": 0.85978115, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.93664181, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11407471, + "step": 9529, + "time_per_iteration": 2.539447784423828 + }, + { + "auxiliary_loss_clip": 0.063314, + "auxiliary_loss_mlp": 0.01256121, + "balance_loss_clip": 0.06271826, + "balance_loss_mlp": 0.0125448, + "epoch": 0.5729745979257478, + "flos": 58699638495360.0, + "grad_norm": 0.750499003321047, + "language_loss": 0.55969286, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.63556802, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01643372, + "step": 9530, + "time_per_iteration": 3.007678747177124 + }, + { + "auxiliary_loss_clip": 0.06425051, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01254276, + "epoch": 0.5730347211784158, + "flos": 18558100834560.0, + "grad_norm": 1.9102815745402744, + "language_loss": 0.66843903, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.74534607, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1137085, + "step": 9531, + "time_per_iteration": 3.9059529304504395 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06274498, + "balance_loss_mlp": 0.01257966, + "epoch": 0.5730948444310837, + "flos": 38040069588480.0, + "grad_norm": 1.9862057863273674, + "language_loss": 0.75881588, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.83567762, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11169434, + "step": 9532, + "time_per_iteration": 2.640389919281006 + }, + { + "auxiliary_loss_clip": 0.06421025, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06278558, + "balance_loss_mlp": 0.01255794, + "epoch": 0.5731549676837517, + "flos": 25233861876480.0, + "grad_norm": 1.2592580925122039, + "language_loss": 0.79252976, + "learning_rate": 1.625421002822686e-06, + "loss": 0.86941075, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11291504, + "step": 9533, + "time_per_iteration": 2.559293508529663 + }, + { + "auxiliary_loss_clip": 0.06417587, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278279, + "balance_loss_mlp": 0.01256067, + "epoch": 0.5732150909364196, + "flos": 23375030100480.0, + "grad_norm": 3.634749275276224, + "language_loss": 0.8597486, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.93658984, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10467529, + "step": 9534, + "time_per_iteration": 2.539487838745117 + }, + { + "auxiliary_loss_clip": 0.06421855, + "auxiliary_loss_mlp": 0.01269069, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01257625, + "epoch": 0.5732752141890877, + "flos": 23086621946880.0, + "grad_norm": 1.944302626791885, + "language_loss": 0.75668436, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.83359355, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9535, + "time_per_iteration": 2.5488839149475098 + }, + { + "auxiliary_loss_clip": 0.06425361, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256288, + "epoch": 0.5733353374417556, + "flos": 24359078050560.0, + "grad_norm": 1.5155376410848522, + "language_loss": 0.71395552, + "learning_rate": 1.624273356614346e-06, + "loss": 0.79089081, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 9536, + "time_per_iteration": 2.553239345550537 + }, + { + "auxiliary_loss_clip": 0.06416988, + "auxiliary_loss_mlp": 0.01269432, + "balance_loss_clip": 0.06275923, + "balance_loss_mlp": 0.01258244, + "epoch": 0.5733954606944236, + "flos": 27206234334720.0, + "grad_norm": 1.742372783929404, + "language_loss": 0.70031548, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.77717972, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11187744, + "step": 9537, + "time_per_iteration": 2.5490598678588867 + }, + { + "auxiliary_loss_clip": 0.06419763, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01257317, + "epoch": 0.5734555839470915, + "flos": 28772339472000.0, + "grad_norm": 2.334146865026381, + "language_loss": 0.63052773, + "learning_rate": 1.623508330355902e-06, + "loss": 0.70740581, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10723877, + "step": 9538, + "time_per_iteration": 4.013959169387817 + }, + { + "auxiliary_loss_clip": 0.0641904, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06277157, + "balance_loss_mlp": 0.0126136, + "epoch": 0.5735157071997595, + "flos": 22973542462080.0, + "grad_norm": 1.806157803076428, + "language_loss": 0.82720077, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.90412778, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.12310791, + "step": 9539, + "time_per_iteration": 2.554189682006836 + }, + { + "auxiliary_loss_clip": 0.06422378, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06276339, + "balance_loss_mlp": 0.01253115, + "epoch": 0.5735758304524274, + "flos": 18995450820480.0, + "grad_norm": 2.0055639259958107, + "language_loss": 0.73150325, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.80837095, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11279297, + "step": 9540, + "time_per_iteration": 2.500077486038208 + }, + { + "auxiliary_loss_clip": 0.0641907, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01255039, + "epoch": 0.5736359537050955, + "flos": 28404701683200.0, + "grad_norm": 2.024476848130698, + "language_loss": 0.80249465, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.87934107, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10528564, + "step": 9541, + "time_per_iteration": 4.051165342330933 + }, + { + "auxiliary_loss_clip": 0.06425047, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01253998, + "epoch": 0.5736960769577634, + "flos": 15631714915200.0, + "grad_norm": 2.008860171144918, + "language_loss": 0.64482939, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.72173679, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11700439, + "step": 9542, + "time_per_iteration": 2.5055642127990723 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01254691, + "epoch": 0.5737562002104314, + "flos": 18009767715840.0, + "grad_norm": 2.2598183554381146, + "language_loss": 0.83200055, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.90883142, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10638428, + "step": 9543, + "time_per_iteration": 2.4916088581085205 + }, + { + "auxiliary_loss_clip": 0.06426359, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5738163234630994, + "flos": 20703454047360.0, + "grad_norm": 1.617850922862876, + "language_loss": 0.74024302, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.81716919, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.1184082, + "step": 9544, + "time_per_iteration": 2.536583662033081 + }, + { + "auxiliary_loss_clip": 0.06424204, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278355, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5738764467157673, + "flos": 23156082581760.0, + "grad_norm": 3.1974440280178595, + "language_loss": 0.76412272, + "learning_rate": 1.620831188925733e-06, + "loss": 0.84104949, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11676025, + "step": 9545, + "time_per_iteration": 2.5427141189575195 + }, + { + "auxiliary_loss_clip": 0.06423136, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5739365699684353, + "flos": 29499942401280.0, + "grad_norm": 2.3578945444753447, + "language_loss": 0.56573224, + "learning_rate": 1.620448797546459e-06, + "loss": 0.64263856, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11334229, + "step": 9546, + "time_per_iteration": 2.608128309249878 + }, + { + "auxiliary_loss_clip": 0.06422536, + "auxiliary_loss_mlp": 0.01268737, + "balance_loss_clip": 0.0627693, + "balance_loss_mlp": 0.01257746, + "epoch": 0.5739966932211032, + "flos": 14032388833920.0, + "grad_norm": 2.2022917684402996, + "language_loss": 0.76728261, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.84419537, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10980225, + "step": 9547, + "time_per_iteration": 2.5017452239990234 + }, + { + "auxiliary_loss_clip": 0.06421655, + "auxiliary_loss_mlp": 0.01268546, + "balance_loss_clip": 0.06277436, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5740568164737713, + "flos": 19067972129280.0, + "grad_norm": 1.9505887412268983, + "language_loss": 0.7442795, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.82118154, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11444092, + "step": 9548, + "time_per_iteration": 2.549558639526367 + }, + { + "auxiliary_loss_clip": 0.06418206, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01255064, + "epoch": 0.5741169397264392, + "flos": 22134453275520.0, + "grad_norm": 2.3791642109865228, + "language_loss": 0.69704068, + "learning_rate": 1.619301709822355e-06, + "loss": 0.77388746, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11419678, + "step": 9549, + "time_per_iteration": 3.933781147003174 + }, + { + "auxiliary_loss_clip": 0.06420065, + "auxiliary_loss_mlp": 0.01265483, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.01254611, + "epoch": 0.5741770629791072, + "flos": 24943860495360.0, + "grad_norm": 1.461228472430463, + "language_loss": 0.79521686, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.87207234, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10870361, + "step": 9550, + "time_per_iteration": 2.577768087387085 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01254832, + "epoch": 0.5742371862317751, + "flos": 18806495863680.0, + "grad_norm": 2.119345289493334, + "language_loss": 0.68877375, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.76562458, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10845947, + "step": 9551, + "time_per_iteration": 2.480468273162842 + }, + { + "auxiliary_loss_clip": 0.06424205, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06276421, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5742973094844431, + "flos": 24467293998720.0, + "grad_norm": 1.5487820488887025, + "language_loss": 0.72033125, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.79724622, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11505127, + "step": 9552, + "time_per_iteration": 2.5759360790252686 + }, + { + "auxiliary_loss_clip": 0.06417461, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.0125469, + "epoch": 0.574357432737111, + "flos": 21659186517120.0, + "grad_norm": 3.0495771997900163, + "language_loss": 0.79982221, + "learning_rate": 1.617772461696843e-06, + "loss": 0.87665033, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10662842, + "step": 9553, + "time_per_iteration": 2.49290132522583 + }, + { + "auxiliary_loss_clip": 0.06423397, + "auxiliary_loss_mlp": 0.01264041, + "balance_loss_clip": 0.06275378, + "balance_loss_mlp": 0.0125333, + "epoch": 0.5744175559897791, + "flos": 16550285299200.0, + "grad_norm": 2.1324379432349425, + "language_loss": 0.83817756, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.91505194, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1071167, + "step": 9554, + "time_per_iteration": 2.5118370056152344 + }, + { + "auxiliary_loss_clip": 0.06422277, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.0125575, + "epoch": 0.574477679242447, + "flos": 24214580484480.0, + "grad_norm": 1.3861221814355518, + "language_loss": 0.71406233, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.79095531, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11254883, + "step": 9555, + "time_per_iteration": 2.5466480255126953 + }, + { + "auxiliary_loss_clip": 0.06421511, + "auxiliary_loss_mlp": 0.01268077, + "balance_loss_clip": 0.06277835, + "balance_loss_mlp": 0.01256478, + "epoch": 0.574537802495115, + "flos": 14908304689920.0, + "grad_norm": 2.185347344801511, + "language_loss": 0.73004574, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.80694163, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1159668, + "step": 9556, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.0641879, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01253406, + "epoch": 0.5745979257477829, + "flos": 24941680289280.0, + "grad_norm": 1.5306662340422301, + "language_loss": 0.74479866, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.82163835, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11761475, + "step": 9557, + "time_per_iteration": 2.576296329498291 + }, + { + "auxiliary_loss_clip": 0.06420197, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06277291, + "balance_loss_mlp": 0.01252572, + "epoch": 0.5746580490004509, + "flos": 17241061559040.0, + "grad_norm": 1.5775139248237169, + "language_loss": 0.68007201, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.75691128, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11157227, + "step": 9558, + "time_per_iteration": 2.531812906265259 + }, + { + "auxiliary_loss_clip": 0.06424935, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01253779, + "epoch": 0.5747181722531189, + "flos": 13192838449920.0, + "grad_norm": 2.425506842460266, + "language_loss": 0.71628273, + "learning_rate": 1.615479024621659e-06, + "loss": 0.79320455, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13482666, + "step": 9559, + "time_per_iteration": 2.473419189453125 + }, + { + "auxiliary_loss_clip": 0.06419484, + "auxiliary_loss_mlp": 0.01266983, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256921, + "epoch": 0.5747782955057869, + "flos": 22969098195840.0, + "grad_norm": 1.5670628486073652, + "language_loss": 0.79416776, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.87103242, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10064697, + "step": 9560, + "time_per_iteration": 2.532862663269043 + }, + { + "auxiliary_loss_clip": 0.06421925, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06276737, + "balance_loss_mlp": 0.01255581, + "epoch": 0.5748384187584549, + "flos": 23409802344960.0, + "grad_norm": 1.793006683486937, + "language_loss": 0.64777875, + "learning_rate": 1.614714662090588e-06, + "loss": 0.72466803, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11431885, + "step": 9561, + "time_per_iteration": 2.5111758708953857 + }, + { + "auxiliary_loss_clip": 0.06426983, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01256369, + "epoch": 0.5748985420111228, + "flos": 17791323321600.0, + "grad_norm": 1.4966227163397983, + "language_loss": 0.7114228, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.78837311, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11682129, + "step": 9562, + "time_per_iteration": 2.5162081718444824 + }, + { + "auxiliary_loss_clip": 0.06425486, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06280454, + "balance_loss_mlp": 0.01256081, + "epoch": 0.5749586652637908, + "flos": 19872582560640.0, + "grad_norm": 1.4328664867345224, + "language_loss": 0.84269559, + "learning_rate": 1.613950357999751e-06, + "loss": 0.91962022, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10888672, + "step": 9563, + "time_per_iteration": 2.5183188915252686 + }, + { + "auxiliary_loss_clip": 0.06421089, + "auxiliary_loss_mlp": 0.01268857, + "balance_loss_clip": 0.06273992, + "balance_loss_mlp": 0.01256733, + "epoch": 0.5750187885164587, + "flos": 21293477372160.0, + "grad_norm": 2.089685167133714, + "language_loss": 0.57297182, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.64987123, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.12127686, + "step": 9564, + "time_per_iteration": 2.5219571590423584 + }, + { + "auxiliary_loss_clip": 0.06414357, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06276927, + "balance_loss_mlp": 0.0125389, + "epoch": 0.5750789117691267, + "flos": 18810227370240.0, + "grad_norm": 1.5824685354584669, + "language_loss": 0.76484299, + "learning_rate": 1.613186112465078e-06, + "loss": 0.84163225, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10681152, + "step": 9565, + "time_per_iteration": 2.4752280712127686 + }, + { + "auxiliary_loss_clip": 0.06321105, + "auxiliary_loss_mlp": 0.01250694, + "balance_loss_clip": 0.06260607, + "balance_loss_mlp": 0.01249219, + "epoch": 0.5751390350217946, + "flos": 70685624188800.0, + "grad_norm": 0.721103953507815, + "language_loss": 0.6068033, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.68252128, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01473999, + "step": 9566, + "time_per_iteration": 3.222144603729248 + }, + { + "auxiliary_loss_clip": 0.06420306, + "auxiliary_loss_mlp": 0.01268432, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257673, + "epoch": 0.5751991582744627, + "flos": 14251545987840.0, + "grad_norm": 2.0959328312792467, + "language_loss": 0.75654471, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.83343208, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10760498, + "step": 9567, + "time_per_iteration": 2.4892570972442627 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5752592815271306, + "flos": 18333283530240.0, + "grad_norm": 1.4488652909067903, + "language_loss": 0.75253701, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.82938665, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11322021, + "step": 9568, + "time_per_iteration": 2.473475217819214 + }, + { + "auxiliary_loss_clip": 0.06419896, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.01254349, + "epoch": 0.5753194047797986, + "flos": 20928984111360.0, + "grad_norm": 1.5107907301615, + "language_loss": 0.71293747, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.78978956, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10968018, + "step": 9569, + "time_per_iteration": 2.6541481018066406 + }, + { + "auxiliary_loss_clip": 0.06420765, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06275727, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5753795280324665, + "flos": 19287925896960.0, + "grad_norm": 2.027519323892087, + "language_loss": 0.56120193, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.63808417, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11694336, + "step": 9570, + "time_per_iteration": 2.5568745136260986 + }, + { + "auxiliary_loss_clip": 0.0641574, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06274444, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5754396512851345, + "flos": 21659312298240.0, + "grad_norm": 3.8103947749492355, + "language_loss": 0.64502007, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.72182131, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10675049, + "step": 9571, + "time_per_iteration": 3.9861292839050293 + }, + { + "auxiliary_loss_clip": 0.06417111, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5754997745378025, + "flos": 51032674707840.0, + "grad_norm": 1.44401056534108, + "language_loss": 0.67167187, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.74852264, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10821533, + "step": 9572, + "time_per_iteration": 2.775322198867798 + }, + { + "auxiliary_loss_clip": 0.06417632, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01252378, + "epoch": 0.5755598977904705, + "flos": 22863523651200.0, + "grad_norm": 1.9643261986613603, + "language_loss": 0.72534865, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.80216646, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11767578, + "step": 9573, + "time_per_iteration": 2.504248857498169 + }, + { + "auxiliary_loss_clip": 0.06413124, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.0125495, + "epoch": 0.5756200210431385, + "flos": 38482073475840.0, + "grad_norm": 1.6390607800794645, + "language_loss": 0.76527274, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.84205294, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09954834, + "step": 9574, + "time_per_iteration": 2.675445079803467 + }, + { + "auxiliary_loss_clip": 0.06426176, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01252865, + "epoch": 0.5756801442958064, + "flos": 23915984060160.0, + "grad_norm": 3.486560074307127, + "language_loss": 0.67186499, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.74877405, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11877441, + "step": 9575, + "time_per_iteration": 2.5086028575897217 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01253899, + "epoch": 0.5757402675484744, + "flos": 21111566158080.0, + "grad_norm": 1.4184952738773886, + "language_loss": 0.80574554, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.88252765, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1005249, + "step": 9576, + "time_per_iteration": 2.502372980117798 + }, + { + "auxiliary_loss_clip": 0.06413178, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01256439, + "epoch": 0.5758003908011423, + "flos": 20565497099520.0, + "grad_norm": 1.5791511975506907, + "language_loss": 0.69807208, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.77487338, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10510254, + "step": 9577, + "time_per_iteration": 4.000526428222656 + }, + { + "auxiliary_loss_clip": 0.06420817, + "auxiliary_loss_mlp": 0.0126492, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.012544, + "epoch": 0.5758605140538103, + "flos": 16478770239360.0, + "grad_norm": 1.7483336770936004, + "language_loss": 0.66710907, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.74396646, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10510254, + "step": 9578, + "time_per_iteration": 2.495589256286621 + }, + { + "auxiliary_loss_clip": 0.06417773, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276586, + "balance_loss_mlp": 0.01254274, + "epoch": 0.5759206373064782, + "flos": 21293854715520.0, + "grad_norm": 1.4632151435184575, + "language_loss": 0.72808439, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.80490887, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10388184, + "step": 9579, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.06426738, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06278113, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5759807605591463, + "flos": 26075089342080.0, + "grad_norm": 2.9637416190029597, + "language_loss": 0.64800644, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.72493923, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.12072754, + "step": 9580, + "time_per_iteration": 2.532273292541504 + }, + { + "auxiliary_loss_clip": 0.06420532, + "auxiliary_loss_mlp": 0.01266688, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01255554, + "epoch": 0.5760408838118142, + "flos": 18877885142400.0, + "grad_norm": 1.6521602857434026, + "language_loss": 0.85497582, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.93184799, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11132812, + "step": 9581, + "time_per_iteration": 3.9159321784973145 + }, + { + "auxiliary_loss_clip": 0.06425697, + "auxiliary_loss_mlp": 0.01268939, + "balance_loss_clip": 0.06276281, + "balance_loss_mlp": 0.01257483, + "epoch": 0.5761010070644822, + "flos": 15383655302400.0, + "grad_norm": 2.053627577895993, + "language_loss": 0.67847329, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.75541961, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11450195, + "step": 9582, + "time_per_iteration": 2.468289613723755 + }, + { + "auxiliary_loss_clip": 0.06323063, + "auxiliary_loss_mlp": 0.0125238, + "balance_loss_clip": 0.06262786, + "balance_loss_mlp": 0.01250932, + "epoch": 0.5761611303171501, + "flos": 71495475500160.0, + "grad_norm": 0.6295597289579254, + "language_loss": 0.5722791, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.64803356, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.60253906, + "router_z_loss_mlp": 0.0144577, + "step": 9583, + "time_per_iteration": 3.280832052230835 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01253437, + "epoch": 0.5762212535698181, + "flos": 16250556844800.0, + "grad_norm": 1.895482028357212, + "language_loss": 0.82933408, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.90613544, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10693359, + "step": 9584, + "time_per_iteration": 2.473771333694458 + }, + { + "auxiliary_loss_clip": 0.06325932, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06265227, + "balance_loss_mlp": 0.01250696, + "epoch": 0.5762813768224861, + "flos": 70207254829440.0, + "grad_norm": 0.6148723792494001, + "language_loss": 0.49547607, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.57125711, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.0147171, + "step": 9585, + "time_per_iteration": 3.220283031463623 + }, + { + "auxiliary_loss_clip": 0.06417918, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.01256446, + "epoch": 0.5763415000751541, + "flos": 20523639185280.0, + "grad_norm": 1.396891707955096, + "language_loss": 0.84832788, + "learning_rate": 1.605165098835465e-06, + "loss": 0.92518032, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10876465, + "step": 9586, + "time_per_iteration": 2.5044658184051514 + }, + { + "auxiliary_loss_clip": 0.0641425, + "auxiliary_loss_mlp": 0.01268611, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01257584, + "epoch": 0.5764016233278221, + "flos": 15821047215360.0, + "grad_norm": 1.5476594832750246, + "language_loss": 0.80150878, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.87833744, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11035156, + "step": 9587, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06421454, + "auxiliary_loss_mlp": 0.01267229, + "balance_loss_clip": 0.06277972, + "balance_loss_mlp": 0.01256184, + "epoch": 0.57646174658049, + "flos": 20777778218880.0, + "grad_norm": 1.3785070074858572, + "language_loss": 0.6626485, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.73953533, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11047363, + "step": 9588, + "time_per_iteration": 3.990769863128662 + }, + { + "auxiliary_loss_clip": 0.06420319, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01256491, + "epoch": 0.576521869833158, + "flos": 23556647825280.0, + "grad_norm": 1.8252792275452514, + "language_loss": 0.79050291, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.86739457, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1237793, + "step": 9589, + "time_per_iteration": 2.5151610374450684 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06275681, + "balance_loss_mlp": 0.01254652, + "epoch": 0.5765819930858259, + "flos": 20272812387840.0, + "grad_norm": 1.9044444718181142, + "language_loss": 0.79799986, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.87479138, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10229492, + "step": 9590, + "time_per_iteration": 2.502588987350464 + }, + { + "auxiliary_loss_clip": 0.06424554, + "auxiliary_loss_mlp": 0.01266306, + "balance_loss_clip": 0.06279668, + "balance_loss_mlp": 0.01256096, + "epoch": 0.5766421163384939, + "flos": 23155453676160.0, + "grad_norm": 1.9323149052957644, + "language_loss": 0.63195986, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.7088685, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10217285, + "step": 9591, + "time_per_iteration": 2.5217199325561523 + }, + { + "auxiliary_loss_clip": 0.0641837, + "auxiliary_loss_mlp": 0.0126852, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5767022395911618, + "flos": 25856057969280.0, + "grad_norm": 1.7751118346977903, + "language_loss": 0.78161305, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.85848188, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10473633, + "step": 9592, + "time_per_iteration": 2.586398124694824 + }, + { + "auxiliary_loss_clip": 0.06325077, + "auxiliary_loss_mlp": 0.0125376, + "balance_loss_clip": 0.06264462, + "balance_loss_mlp": 0.01252203, + "epoch": 0.5767623628438299, + "flos": 68315579452800.0, + "grad_norm": 0.723864489522512, + "language_loss": 0.59626555, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.67205393, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.60888672, + "router_z_loss_mlp": 0.01555634, + "step": 9593, + "time_per_iteration": 3.245339870452881 + }, + { + "auxiliary_loss_clip": 0.06419121, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272256, + "balance_loss_mlp": 0.01254432, + "epoch": 0.5768224860964978, + "flos": 30195959541120.0, + "grad_norm": 1.4712512924104606, + "language_loss": 0.70970887, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.78656393, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11956787, + "step": 9594, + "time_per_iteration": 2.575716018676758 + }, + { + "auxiliary_loss_clip": 0.06417293, + "auxiliary_loss_mlp": 0.01269346, + "balance_loss_clip": 0.0627408, + "balance_loss_mlp": 0.01259237, + "epoch": 0.5768826093491658, + "flos": 17900880935040.0, + "grad_norm": 1.6705807126416699, + "language_loss": 0.71305418, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.78992057, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10101318, + "step": 9595, + "time_per_iteration": 2.492614269256592 + }, + { + "auxiliary_loss_clip": 0.06416321, + "auxiliary_loss_mlp": 0.01269009, + "balance_loss_clip": 0.06273369, + "balance_loss_mlp": 0.01257481, + "epoch": 0.5769427326018337, + "flos": 17462943970560.0, + "grad_norm": 1.9433978950195214, + "language_loss": 0.69787997, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.77473325, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11529541, + "step": 9596, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.06425576, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06275462, + "balance_loss_mlp": 0.01254558, + "epoch": 0.5770028558545017, + "flos": 39431181473280.0, + "grad_norm": 1.7020557646527, + "language_loss": 0.67913234, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.75606167, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12792969, + "step": 9597, + "time_per_iteration": 2.6754841804504395 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273974, + "balance_loss_mlp": 0.01255322, + "epoch": 0.5770629791071697, + "flos": 21541620839040.0, + "grad_norm": 1.8412029810529236, + "language_loss": 0.82291842, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.89974791, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.105896, + "step": 9598, + "time_per_iteration": 2.510817527770996 + }, + { + "auxiliary_loss_clip": 0.06420396, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01256511, + "epoch": 0.5771231023598377, + "flos": 20893121763840.0, + "grad_norm": 1.43847663479929, + "language_loss": 0.73386133, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.81074691, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11639404, + "step": 9599, + "time_per_iteration": 2.492751121520996 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.01254772, + "epoch": 0.5771832256125057, + "flos": 18083043711360.0, + "grad_norm": 1.7867114623476337, + "language_loss": 0.78284144, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.85961294, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10369873, + "step": 9600, + "time_per_iteration": 2.4890565872192383 + }, + { + "auxiliary_loss_clip": 0.06422748, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.01256893, + "epoch": 0.5772433488651736, + "flos": 26366222753280.0, + "grad_norm": 1.8856132517408855, + "language_loss": 0.72472572, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.80163646, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11450195, + "step": 9601, + "time_per_iteration": 2.536994218826294 + }, + { + "auxiliary_loss_clip": 0.06415705, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01253506, + "epoch": 0.5773034721178416, + "flos": 19686814058880.0, + "grad_norm": 1.49916876372247, + "language_loss": 0.68989396, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7666986, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11254883, + "step": 9602, + "time_per_iteration": 2.4855434894561768 + }, + { + "auxiliary_loss_clip": 0.06409699, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06272521, + "balance_loss_mlp": 0.01257287, + "epoch": 0.5773635953705095, + "flos": 25089951288960.0, + "grad_norm": 1.4178586949074146, + "language_loss": 0.73199558, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.80876672, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10125732, + "step": 9603, + "time_per_iteration": 2.5496528148651123 + }, + { + "auxiliary_loss_clip": 0.06418322, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06276152, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5774237186231775, + "flos": 21039380265600.0, + "grad_norm": 1.5159674911644692, + "language_loss": 0.76686621, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.84372133, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11035156, + "step": 9604, + "time_per_iteration": 2.522033452987671 + }, + { + "auxiliary_loss_clip": 0.06420808, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06277063, + "balance_loss_mlp": 0.01259373, + "epoch": 0.5774838418758454, + "flos": 15237145238400.0, + "grad_norm": 2.0065352138527808, + "language_loss": 0.83384192, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.91076463, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12097168, + "step": 9605, + "time_per_iteration": 2.4643824100494385 + }, + { + "auxiliary_loss_clip": 0.0643101, + "auxiliary_loss_mlp": 0.01267132, + "balance_loss_clip": 0.06278086, + "balance_loss_mlp": 0.01254913, + "epoch": 0.5775439651285135, + "flos": 23588694812160.0, + "grad_norm": 1.6400067603153077, + "language_loss": 0.78330255, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.86028397, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 9606, + "time_per_iteration": 2.5217928886413574 + }, + { + "auxiliary_loss_clip": 0.06417712, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01252687, + "epoch": 0.5776040883811814, + "flos": 18046300896000.0, + "grad_norm": 1.7192315062710783, + "language_loss": 0.73891246, + "learning_rate": 1.597150687927619e-06, + "loss": 0.81573272, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11633301, + "step": 9607, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.01268528, + "balance_loss_clip": 0.06277244, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5776642116338494, + "flos": 18630580216320.0, + "grad_norm": 1.602339688767026, + "language_loss": 0.69749868, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.121521, + "step": 9608, + "time_per_iteration": 2.5238630771636963 + }, + { + "auxiliary_loss_clip": 0.06419271, + "auxiliary_loss_mlp": 0.01267568, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255814, + "epoch": 0.5777243348865173, + "flos": 28410068344320.0, + "grad_norm": 1.9615645043462706, + "language_loss": 0.76945466, + "learning_rate": 1.596387759940665e-06, + "loss": 0.84632301, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11749268, + "step": 9609, + "time_per_iteration": 2.549933671951294 + }, + { + "auxiliary_loss_clip": 0.0642001, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01255084, + "epoch": 0.5777844581391853, + "flos": 24031579167360.0, + "grad_norm": 1.544459178362984, + "language_loss": 0.77057648, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.84744948, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12200928, + "step": 9610, + "time_per_iteration": 2.5409657955169678 + }, + { + "auxiliary_loss_clip": 0.06419136, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01261104, + "epoch": 0.5778445813918534, + "flos": 17781805883520.0, + "grad_norm": 2.0334076468596463, + "language_loss": 0.69377804, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.77070266, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.12231445, + "step": 9611, + "time_per_iteration": 3.8771145343780518 + }, + { + "auxiliary_loss_clip": 0.06415454, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01254795, + "epoch": 0.5779047046445213, + "flos": 22239147352320.0, + "grad_norm": 1.7756554406320284, + "language_loss": 0.84048247, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.91729373, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10876465, + "step": 9612, + "time_per_iteration": 2.4897758960723877 + }, + { + "auxiliary_loss_clip": 0.06417899, + "auxiliary_loss_mlp": 0.01267936, + "balance_loss_clip": 0.06275887, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5779648278971893, + "flos": 21440825976960.0, + "grad_norm": 1.4853190478070708, + "language_loss": 0.80038643, + "learning_rate": 1.594862087742667e-06, + "loss": 0.87724483, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10894775, + "step": 9613, + "time_per_iteration": 2.512202501296997 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01265916, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01254996, + "epoch": 0.5780249511498572, + "flos": 19032151708800.0, + "grad_norm": 1.6718641196950235, + "language_loss": 0.7774657, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.85430139, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10925293, + "step": 9614, + "time_per_iteration": 2.4882118701934814 + }, + { + "auxiliary_loss_clip": 0.06421545, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01254543, + "epoch": 0.5780850744025252, + "flos": 12128596542720.0, + "grad_norm": 2.0494146854902175, + "language_loss": 0.82224047, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.89911503, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1137085, + "step": 9615, + "time_per_iteration": 2.472621440887451 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06274787, + "balance_loss_mlp": 0.01255552, + "epoch": 0.5781451976551931, + "flos": 25051154048640.0, + "grad_norm": 1.4669220513135932, + "language_loss": 0.67472255, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.75161308, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12097168, + "step": 9616, + "time_per_iteration": 2.534846782684326 + }, + { + "auxiliary_loss_clip": 0.06417294, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01257528, + "epoch": 0.5782053209078611, + "flos": 19251770060160.0, + "grad_norm": 1.8155832257801603, + "language_loss": 0.77963018, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.85649514, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11669922, + "step": 9617, + "time_per_iteration": 4.014554977416992 + }, + { + "auxiliary_loss_clip": 0.064207, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06277206, + "balance_loss_mlp": 0.012578, + "epoch": 0.578265444160529, + "flos": 26000849024640.0, + "grad_norm": 1.3678407791087424, + "language_loss": 0.75333905, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.83024538, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.12139893, + "step": 9618, + "time_per_iteration": 2.5390572547912598 + }, + { + "auxiliary_loss_clip": 0.06416163, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06273779, + "balance_loss_mlp": 0.01254355, + "epoch": 0.5783255674131971, + "flos": 21805025748480.0, + "grad_norm": 1.6109172194310035, + "language_loss": 0.81657064, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.89339048, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11468506, + "step": 9619, + "time_per_iteration": 2.505831718444824 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06275585, + "balance_loss_mlp": 0.01253972, + "epoch": 0.578385690665865, + "flos": 24796553817600.0, + "grad_norm": 1.540190718879446, + "language_loss": 0.72668874, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.80354631, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11248779, + "step": 9620, + "time_per_iteration": 3.9673268795013428 + }, + { + "auxiliary_loss_clip": 0.06423381, + "auxiliary_loss_mlp": 0.01270714, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01258942, + "epoch": 0.578445813918533, + "flos": 21218859711360.0, + "grad_norm": 1.6605075192862409, + "language_loss": 0.77349472, + "learning_rate": 1.591811481689916e-06, + "loss": 0.85043567, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 9621, + "time_per_iteration": 2.5077648162841797 + }, + { + "auxiliary_loss_clip": 0.06420489, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01252477, + "epoch": 0.5785059371712009, + "flos": 25053921233280.0, + "grad_norm": 1.4404835359445094, + "language_loss": 0.7094593, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.78630757, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11859131, + "step": 9622, + "time_per_iteration": 2.5468451976776123 + }, + { + "auxiliary_loss_clip": 0.06311069, + "auxiliary_loss_mlp": 0.01252444, + "balance_loss_clip": 0.06251176, + "balance_loss_mlp": 0.01250508, + "epoch": 0.5785660604238689, + "flos": 70865187488640.0, + "grad_norm": 0.7596176351080388, + "language_loss": 0.55852556, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.6341607, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01933289, + "step": 9623, + "time_per_iteration": 3.153353452682495 + }, + { + "auxiliary_loss_clip": 0.06425077, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06277281, + "balance_loss_mlp": 0.01255233, + "epoch": 0.578626183676537, + "flos": 31658083361280.0, + "grad_norm": 2.2034040135587936, + "language_loss": 0.71319884, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.79012132, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 9624, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06420659, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06275962, + "balance_loss_mlp": 0.01258222, + "epoch": 0.5786863069292049, + "flos": 21870545241600.0, + "grad_norm": 1.7015470008848133, + "language_loss": 0.82409322, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.90100557, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12353516, + "step": 9625, + "time_per_iteration": 2.5166807174682617 + }, + { + "auxiliary_loss_clip": 0.06417123, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01253647, + "epoch": 0.5787464301818729, + "flos": 23371214739840.0, + "grad_norm": 1.4015207824111633, + "language_loss": 0.70712119, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.78395265, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12384033, + "step": 9626, + "time_per_iteration": 2.5232555866241455 + }, + { + "auxiliary_loss_clip": 0.06417292, + "auxiliary_loss_mlp": 0.01266097, + "balance_loss_clip": 0.06275232, + "balance_loss_mlp": 0.01255278, + "epoch": 0.5788065534345408, + "flos": 30011155361280.0, + "grad_norm": 1.650883867076693, + "language_loss": 0.71934295, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.79617685, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10827637, + "step": 9627, + "time_per_iteration": 2.5862505435943604 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01268778, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257643, + "epoch": 0.5788666766872088, + "flos": 24533526251520.0, + "grad_norm": 1.6845581870111699, + "language_loss": 0.84154361, + "learning_rate": 1.589143013764458e-06, + "loss": 0.91842461, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9628, + "time_per_iteration": 4.011742830276489 + }, + { + "auxiliary_loss_clip": 0.06420035, + "auxiliary_loss_mlp": 0.01267996, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255443, + "epoch": 0.5789267999398767, + "flos": 23739649142400.0, + "grad_norm": 1.4211285900013286, + "language_loss": 0.72366357, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12548828, + "step": 9629, + "time_per_iteration": 2.535161018371582 + }, + { + "auxiliary_loss_clip": 0.06419079, + "auxiliary_loss_mlp": 0.01266785, + "balance_loss_clip": 0.06275524, + "balance_loss_mlp": 0.01254894, + "epoch": 0.5789869231925447, + "flos": 21140217054720.0, + "grad_norm": 1.8234862135922645, + "language_loss": 0.74396068, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.82081938, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11883545, + "step": 9630, + "time_per_iteration": 2.4906413555145264 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06270717, + "balance_loss_mlp": 0.0125344, + "epoch": 0.5790470464452127, + "flos": 21215086277760.0, + "grad_norm": 1.5521366007555986, + "language_loss": 0.78864127, + "learning_rate": 1.587999618060523e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11395264, + "step": 9631, + "time_per_iteration": 2.500326633453369 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01264538, + "balance_loss_clip": 0.06272215, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5791071696978807, + "flos": 23411144010240.0, + "grad_norm": 1.6622191818478913, + "language_loss": 0.7546376, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.83146071, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1138916, + "step": 9632, + "time_per_iteration": 2.5060648918151855 + }, + { + "auxiliary_loss_clip": 0.06419455, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06274837, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5791672929505486, + "flos": 24213322673280.0, + "grad_norm": 1.7292582736877316, + "language_loss": 0.79532528, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.8721962, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12011719, + "step": 9633, + "time_per_iteration": 2.516359567642212 + }, + { + "auxiliary_loss_clip": 0.0643272, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_clip": 0.06278707, + "balance_loss_mlp": 0.01256635, + "epoch": 0.5792274162032166, + "flos": 24355094981760.0, + "grad_norm": 1.6340208840931036, + "language_loss": 0.7790345, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.85605538, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1272583, + "step": 9634, + "time_per_iteration": 2.541090488433838 + }, + { + "auxiliary_loss_clip": 0.06422533, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01255406, + "epoch": 0.5792875394558845, + "flos": 20455729850880.0, + "grad_norm": 1.975369322400224, + "language_loss": 0.64063549, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.71754158, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12652588, + "step": 9635, + "time_per_iteration": 2.4916157722473145 + }, + { + "auxiliary_loss_clip": 0.06417014, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01253327, + "epoch": 0.5793476627085525, + "flos": 24067064171520.0, + "grad_norm": 1.4766518541506428, + "language_loss": 0.77494228, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.85176682, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12115479, + "step": 9636, + "time_per_iteration": 2.516622304916382 + }, + { + "auxiliary_loss_clip": 0.06411137, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01258226, + "epoch": 0.5794077859612206, + "flos": 22060799936640.0, + "grad_norm": 1.6556351940576073, + "language_loss": 0.68772542, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.76452249, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10333252, + "step": 9637, + "time_per_iteration": 2.509833812713623 + }, + { + "auxiliary_loss_clip": 0.06421766, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.01256784, + "epoch": 0.5794679092138885, + "flos": 11439245802240.0, + "grad_norm": 2.540580609640148, + "language_loss": 0.72712755, + "learning_rate": 1.585332242234043e-06, + "loss": 0.80403578, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12261963, + "step": 9638, + "time_per_iteration": 2.4528071880340576 + }, + { + "auxiliary_loss_clip": 0.06416277, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.0125521, + "epoch": 0.5795280324665565, + "flos": 18886228623360.0, + "grad_norm": 1.607875789180523, + "language_loss": 0.72792935, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.80475545, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11120605, + "step": 9639, + "time_per_iteration": 2.510347604751587 + }, + { + "auxiliary_loss_clip": 0.06418437, + "auxiliary_loss_mlp": 0.01269692, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.01258332, + "epoch": 0.5795881557192244, + "flos": 13010969162880.0, + "grad_norm": 1.751039086833101, + "language_loss": 0.69813907, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.7750203, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11364746, + "step": 9640, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.06430758, + "auxiliary_loss_mlp": 0.01271889, + "balance_loss_clip": 0.0627775, + "balance_loss_mlp": 0.01259509, + "epoch": 0.5796482789718924, + "flos": 19937598929280.0, + "grad_norm": 2.3188274360648298, + "language_loss": 0.78378308, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8608095, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12371826, + "step": 9641, + "time_per_iteration": 2.487333059310913 + }, + { + "auxiliary_loss_clip": 0.06416615, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.01256685, + "epoch": 0.5797084022245603, + "flos": 21656880529920.0, + "grad_norm": 2.422042135441505, + "language_loss": 0.74201375, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.81886506, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1182251, + "step": 9642, + "time_per_iteration": 2.4917688369750977 + }, + { + "auxiliary_loss_clip": 0.06419542, + "auxiliary_loss_mlp": 0.01264152, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01252582, + "epoch": 0.5797685254772283, + "flos": 26038807724160.0, + "grad_norm": 1.4983613319397562, + "language_loss": 0.73538697, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.81222391, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11566162, + "step": 9643, + "time_per_iteration": 2.5357465744018555 + }, + { + "auxiliary_loss_clip": 0.06417159, + "auxiliary_loss_mlp": 0.01264721, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01253175, + "epoch": 0.5798286487298963, + "flos": 22710808385280.0, + "grad_norm": 1.6774180539317567, + "language_loss": 0.67605746, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.75287628, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11547852, + "step": 9644, + "time_per_iteration": 2.485366106033325 + }, + { + "auxiliary_loss_clip": 0.06425455, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06276956, + "balance_loss_mlp": 0.01256078, + "epoch": 0.5798887719825643, + "flos": 23155705238400.0, + "grad_norm": 2.0120452642465865, + "language_loss": 0.85497642, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.93191713, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12542725, + "step": 9645, + "time_per_iteration": 2.505467414855957 + }, + { + "auxiliary_loss_clip": 0.06418729, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06275458, + "balance_loss_mlp": 0.01255774, + "epoch": 0.5799488952352322, + "flos": 24432982951680.0, + "grad_norm": 1.7616171208033915, + "language_loss": 0.75737381, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.83422971, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11102295, + "step": 9646, + "time_per_iteration": 2.527848958969116 + }, + { + "auxiliary_loss_clip": 0.06425247, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256461, + "epoch": 0.5800090184879002, + "flos": 38404478995200.0, + "grad_norm": 1.7871006843554935, + "language_loss": 0.59099573, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6679371, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12426758, + "step": 9647, + "time_per_iteration": 2.643890142440796 + }, + { + "auxiliary_loss_clip": 0.06425125, + "auxiliary_loss_mlp": 0.01271805, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01259276, + "epoch": 0.5800691417405681, + "flos": 19789747200000.0, + "grad_norm": 1.4917917867847632, + "language_loss": 0.84483784, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.92180717, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12524414, + "step": 9648, + "time_per_iteration": 2.48917818069458 + }, + { + "auxiliary_loss_clip": 0.06311809, + "auxiliary_loss_mlp": 0.01252996, + "balance_loss_clip": 0.06251512, + "balance_loss_mlp": 0.01251245, + "epoch": 0.5801292649932361, + "flos": 70333514133120.0, + "grad_norm": 0.8366168453621474, + "language_loss": 0.63013005, + "learning_rate": 1.581142210256242e-06, + "loss": 0.70577806, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01756287, + "step": 9649, + "time_per_iteration": 3.167630434036255 + }, + { + "auxiliary_loss_clip": 0.064106, + "auxiliary_loss_mlp": 0.01264864, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253903, + "epoch": 0.5801893882459042, + "flos": 18740892516480.0, + "grad_norm": 1.6385207780550837, + "language_loss": 0.82320833, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.89996296, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10968018, + "step": 9650, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.06424958, + "auxiliary_loss_mlp": 0.01267787, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.01256194, + "epoch": 0.5802495114985721, + "flos": 15601973915520.0, + "grad_norm": 2.051158244012986, + "language_loss": 0.77640611, + "learning_rate": 1.580380592177698e-06, + "loss": 0.85333359, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11584473, + "step": 9651, + "time_per_iteration": 3.9003303050994873 + }, + { + "auxiliary_loss_clip": 0.06421195, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01258627, + "epoch": 0.5803096347512401, + "flos": 18260552586240.0, + "grad_norm": 1.678926948492491, + "language_loss": 0.74017727, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.81709743, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12207031, + "step": 9652, + "time_per_iteration": 2.5226869583129883 + }, + { + "auxiliary_loss_clip": 0.0642662, + "auxiliary_loss_mlp": 0.01267654, + "balance_loss_clip": 0.06278314, + "balance_loss_mlp": 0.012559, + "epoch": 0.580369758003908, + "flos": 22899763342080.0, + "grad_norm": 1.9284827518212118, + "language_loss": 0.77118474, + "learning_rate": 1.579619037747193e-06, + "loss": 0.84812748, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11749268, + "step": 9653, + "time_per_iteration": 2.5736207962036133 + }, + { + "auxiliary_loss_clip": 0.06425463, + "auxiliary_loss_mlp": 0.01265074, + "balance_loss_clip": 0.06277624, + "balance_loss_mlp": 0.01252789, + "epoch": 0.580429881256576, + "flos": 18703646576640.0, + "grad_norm": 1.9366371532767657, + "language_loss": 0.75627828, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.83318365, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1229248, + "step": 9654, + "time_per_iteration": 2.667048931121826 + }, + { + "auxiliary_loss_clip": 0.06413651, + "auxiliary_loss_mlp": 0.01265944, + "balance_loss_clip": 0.062739, + "balance_loss_mlp": 0.01254959, + "epoch": 0.5804900045092439, + "flos": 24689050629120.0, + "grad_norm": 1.638178903008904, + "language_loss": 0.70858634, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.78538227, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10992432, + "step": 9655, + "time_per_iteration": 2.5496294498443604 + }, + { + "auxiliary_loss_clip": 0.06424456, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273113, + "balance_loss_mlp": 0.0125378, + "epoch": 0.580550127761912, + "flos": 23119549401600.0, + "grad_norm": 2.0310142592924314, + "language_loss": 0.70043373, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.77733833, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12231445, + "step": 9656, + "time_per_iteration": 4.0007078647613525 + }, + { + "auxiliary_loss_clip": 0.06411725, + "auxiliary_loss_mlp": 0.01265789, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01255203, + "epoch": 0.5806102510145799, + "flos": 18481093332480.0, + "grad_norm": 1.6851014534608593, + "language_loss": 0.71761322, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.79438841, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 9657, + "time_per_iteration": 2.52081298828125 + }, + { + "auxiliary_loss_clip": 0.06426618, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06275696, + "balance_loss_mlp": 0.01252843, + "epoch": 0.5806703742672479, + "flos": 23922566605440.0, + "grad_norm": 1.7911249599131025, + "language_loss": 0.70450497, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.78142452, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12506104, + "step": 9658, + "time_per_iteration": 2.509723424911499 + }, + { + "auxiliary_loss_clip": 0.06307676, + "auxiliary_loss_mlp": 0.01252681, + "balance_loss_clip": 0.06247197, + "balance_loss_mlp": 0.01250939, + "epoch": 0.5807304975199158, + "flos": 66332096328960.0, + "grad_norm": 0.6445385314606554, + "language_loss": 0.53559077, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.61119437, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01747131, + "step": 9659, + "time_per_iteration": 3.164217233657837 + }, + { + "auxiliary_loss_clip": 0.0642177, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01254191, + "epoch": 0.5807906207725838, + "flos": 31730478888960.0, + "grad_norm": 1.678223545722946, + "language_loss": 0.62300181, + "learning_rate": 1.576954100136366e-06, + "loss": 0.69988132, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11987305, + "step": 9660, + "time_per_iteration": 4.055291175842285 + }, + { + "auxiliary_loss_clip": 0.06418584, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06270796, + "balance_loss_mlp": 0.01256443, + "epoch": 0.5808507440252517, + "flos": 23807223060480.0, + "grad_norm": 1.5142376676823694, + "language_loss": 0.65793735, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.73480284, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11541748, + "step": 9661, + "time_per_iteration": 2.50545334815979 + }, + { + "auxiliary_loss_clip": 0.06409734, + "auxiliary_loss_mlp": 0.01265632, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01255565, + "epoch": 0.5809108672779197, + "flos": 13703464431360.0, + "grad_norm": 1.88238902360882, + "language_loss": 0.74297959, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.81973332, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10070801, + "step": 9662, + "time_per_iteration": 2.4924473762512207 + }, + { + "auxiliary_loss_clip": 0.06306686, + "auxiliary_loss_mlp": 0.01251122, + "balance_loss_clip": 0.06246165, + "balance_loss_mlp": 0.0124951, + "epoch": 0.5809709905305876, + "flos": 69157687386240.0, + "grad_norm": 0.8243605057954629, + "language_loss": 0.58189029, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.65746832, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0161438, + "step": 9663, + "time_per_iteration": 3.215336799621582 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265807, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01254392, + "epoch": 0.5810311137832557, + "flos": 19833491831040.0, + "grad_norm": 2.48301510503896, + "language_loss": 0.82404405, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.90084743, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11413574, + "step": 9664, + "time_per_iteration": 2.663583278656006 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01263414, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01252423, + "epoch": 0.5810912370359237, + "flos": 29245635659520.0, + "grad_norm": 1.676690255308112, + "language_loss": 0.81861937, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.89544368, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10992432, + "step": 9665, + "time_per_iteration": 2.5936458110809326 + }, + { + "auxiliary_loss_clip": 0.06425443, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.0627546, + "balance_loss_mlp": 0.01257098, + "epoch": 0.5811513602885916, + "flos": 22792469788800.0, + "grad_norm": 1.7928396623098657, + "language_loss": 0.80963171, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.88657987, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12268066, + "step": 9666, + "time_per_iteration": 2.556262969970703 + }, + { + "auxiliary_loss_clip": 0.06412445, + "auxiliary_loss_mlp": 0.01266794, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5812114835412596, + "flos": 18740347464960.0, + "grad_norm": 1.6774912146747003, + "language_loss": 0.79895651, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.87574893, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.1072998, + "step": 9667, + "time_per_iteration": 3.980412483215332 + }, + { + "auxiliary_loss_clip": 0.06427534, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01252791, + "epoch": 0.5812716067939275, + "flos": 26438324791680.0, + "grad_norm": 1.482922365624984, + "language_loss": 0.79118401, + "learning_rate": 1.573909419957653e-06, + "loss": 0.86811268, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12536621, + "step": 9668, + "time_per_iteration": 2.565986156463623 + }, + { + "auxiliary_loss_clip": 0.06418585, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.0125872, + "epoch": 0.5813317300465956, + "flos": 43407847595520.0, + "grad_norm": 1.832859625901051, + "language_loss": 0.64703673, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.72392619, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11657715, + "step": 9669, + "time_per_iteration": 2.804957151412964 + }, + { + "auxiliary_loss_clip": 0.06415828, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.0627243, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5813918532992635, + "flos": 24791564499840.0, + "grad_norm": 1.4489654033865982, + "language_loss": 0.73791713, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.81473929, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11425781, + "step": 9670, + "time_per_iteration": 2.54849910736084 + }, + { + "auxiliary_loss_clip": 0.0641885, + "auxiliary_loss_mlp": 0.01269355, + "balance_loss_clip": 0.06272031, + "balance_loss_mlp": 0.0125822, + "epoch": 0.5814519765519315, + "flos": 22864068702720.0, + "grad_norm": 1.8471376195746119, + "language_loss": 0.79354227, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.87042427, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11138916, + "step": 9671, + "time_per_iteration": 2.553971529006958 + }, + { + "auxiliary_loss_clip": 0.06426669, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06274676, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5815120998045994, + "flos": 24067651150080.0, + "grad_norm": 2.0867956489424495, + "language_loss": 0.61609662, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6930325, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12219238, + "step": 9672, + "time_per_iteration": 2.5135464668273926 + }, + { + "auxiliary_loss_clip": 0.06413487, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06271096, + "balance_loss_mlp": 0.01256735, + "epoch": 0.5815722230572674, + "flos": 24286305179520.0, + "grad_norm": 2.966012751852424, + "language_loss": 0.81724179, + "learning_rate": 1.572007019492342e-06, + "loss": 0.89405441, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1104126, + "step": 9673, + "time_per_iteration": 2.531637668609619 + }, + { + "auxiliary_loss_clip": 0.06422119, + "auxiliary_loss_mlp": 0.01271004, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01258976, + "epoch": 0.5816323463099353, + "flos": 22206932657280.0, + "grad_norm": 1.7930668974507213, + "language_loss": 0.88784432, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.9647755, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12030029, + "step": 9674, + "time_per_iteration": 2.490135908126831 + }, + { + "auxiliary_loss_clip": 0.06420779, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5816924695626033, + "flos": 24141388343040.0, + "grad_norm": 1.4439307600636533, + "language_loss": 0.78848791, + "learning_rate": 1.571246172811984e-06, + "loss": 0.86534023, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10656738, + "step": 9675, + "time_per_iteration": 2.570401191711426 + }, + { + "auxiliary_loss_clip": 0.06415851, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01252901, + "epoch": 0.5817525928152713, + "flos": 21330555603840.0, + "grad_norm": 2.1244098418378234, + "language_loss": 0.70489943, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.78169978, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11279297, + "step": 9676, + "time_per_iteration": 2.5234405994415283 + }, + { + "auxiliary_loss_clip": 0.06419084, + "auxiliary_loss_mlp": 0.01273498, + "balance_loss_clip": 0.06272397, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5818127160679393, + "flos": 26940355729920.0, + "grad_norm": 2.3696751764318478, + "language_loss": 0.63762164, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.71454746, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9677, + "time_per_iteration": 2.5408287048339844 + }, + { + "auxiliary_loss_clip": 0.06307964, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06247746, + "balance_loss_mlp": 0.01262844, + "epoch": 0.5818728393206073, + "flos": 63940779855360.0, + "grad_norm": 0.7897947317556949, + "language_loss": 0.54107881, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.61680651, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01963806, + "step": 9678, + "time_per_iteration": 3.1962106227874756 + }, + { + "auxiliary_loss_clip": 0.0631143, + "auxiliary_loss_mlp": 0.0126129, + "balance_loss_clip": 0.06251128, + "balance_loss_mlp": 0.01259724, + "epoch": 0.5819329625732752, + "flos": 64972654087680.0, + "grad_norm": 0.717265543619072, + "language_loss": 0.56126428, + "learning_rate": 1.569724674667319e-06, + "loss": 0.6369915, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01565552, + "step": 9679, + "time_per_iteration": 3.0475993156433105 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01260386, + "epoch": 0.5819930858259432, + "flos": 21221668823040.0, + "grad_norm": 1.5334769221386826, + "language_loss": 0.65937847, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.73629761, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11102295, + "step": 9680, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.06418791, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01255909, + "epoch": 0.5820532090786111, + "flos": 19463715763200.0, + "grad_norm": 1.789175734331282, + "language_loss": 0.84067512, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.91752815, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10601807, + "step": 9681, + "time_per_iteration": 2.4850056171417236 + }, + { + "auxiliary_loss_clip": 0.06416699, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272473, + "balance_loss_mlp": 0.01255908, + "epoch": 0.5821133323312792, + "flos": 17718424669440.0, + "grad_norm": 2.261651210831951, + "language_loss": 0.76110494, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.83794284, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11181641, + "step": 9682, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01270705, + "balance_loss_clip": 0.06278756, + "balance_loss_mlp": 0.01258951, + "epoch": 0.5821734555839471, + "flos": 24578738328960.0, + "grad_norm": 2.1342093378293785, + "language_loss": 0.75805819, + "learning_rate": 1.568203437579977e-06, + "loss": 0.83506703, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11749268, + "step": 9683, + "time_per_iteration": 2.5426952838897705 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06278548, + "balance_loss_mlp": 0.0126283, + "epoch": 0.5822335788366151, + "flos": 22388760017280.0, + "grad_norm": 1.6377653311732083, + "language_loss": 0.74168241, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.81873143, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12646484, + "step": 9684, + "time_per_iteration": 2.521773338317871 + }, + { + "auxiliary_loss_clip": 0.06424329, + "auxiliary_loss_mlp": 0.01273987, + "balance_loss_clip": 0.06276318, + "balance_loss_mlp": 0.01262114, + "epoch": 0.582293702089283, + "flos": 26729458202880.0, + "grad_norm": 2.7880175036552446, + "language_loss": 0.78406078, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.86104393, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11871338, + "step": 9685, + "time_per_iteration": 2.53759503364563 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01274993, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0126337, + "epoch": 0.582353825341951, + "flos": 17354560314240.0, + "grad_norm": 1.6209571199936617, + "language_loss": 0.75622851, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.83321142, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11627197, + "step": 9686, + "time_per_iteration": 2.5203354358673096 + }, + { + "auxiliary_loss_clip": 0.06317171, + "auxiliary_loss_mlp": 0.01254478, + "balance_loss_clip": 0.06256813, + "balance_loss_mlp": 0.012529, + "epoch": 0.5824139485946189, + "flos": 55491133478400.0, + "grad_norm": 0.7976004724910164, + "language_loss": 0.57134593, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.64706242, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01578522, + "step": 9687, + "time_per_iteration": 2.9669835567474365 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01255221, + "epoch": 0.582474071847287, + "flos": 20309261713920.0, + "grad_norm": 1.877177452165203, + "language_loss": 0.70002449, + "learning_rate": 1.566302259738727e-06, + "loss": 0.77692491, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.12109375, + "step": 9688, + "time_per_iteration": 2.506741762161255 + }, + { + "auxiliary_loss_clip": 0.06417575, + "auxiliary_loss_mlp": 0.01265264, + "balance_loss_clip": 0.0627282, + "balance_loss_mlp": 0.01254673, + "epoch": 0.5825341950999549, + "flos": 23884733687040.0, + "grad_norm": 2.896352551150335, + "language_loss": 0.65452719, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.73135561, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10595703, + "step": 9689, + "time_per_iteration": 2.506406784057617 + }, + { + "auxiliary_loss_clip": 0.06415856, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06272023, + "balance_loss_mlp": 0.0126126, + "epoch": 0.5825943183526229, + "flos": 23119842890880.0, + "grad_norm": 1.995545981005341, + "language_loss": 0.73637474, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.81326556, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11956787, + "step": 9690, + "time_per_iteration": 3.9373486042022705 + }, + { + "auxiliary_loss_clip": 0.0642629, + "auxiliary_loss_mlp": 0.01267094, + "balance_loss_clip": 0.06275761, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5826544416052909, + "flos": 22864152556800.0, + "grad_norm": 1.6091940048024238, + "language_loss": 0.76358879, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.84052265, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12207031, + "step": 9691, + "time_per_iteration": 2.5036911964416504 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01270283, + "balance_loss_clip": 0.06274154, + "balance_loss_mlp": 0.0125906, + "epoch": 0.5827145648579588, + "flos": 31509560799360.0, + "grad_norm": 1.692225094183595, + "language_loss": 0.80700606, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.88393039, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11218262, + "step": 9692, + "time_per_iteration": 2.588819980621338 + }, + { + "auxiliary_loss_clip": 0.06307849, + "auxiliary_loss_mlp": 0.01251158, + "balance_loss_clip": 0.06247954, + "balance_loss_mlp": 0.01249412, + "epoch": 0.5827746881106268, + "flos": 69832028246400.0, + "grad_norm": 0.7844854120913538, + "language_loss": 0.5681411, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.64373118, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01751709, + "step": 9693, + "time_per_iteration": 3.1347033977508545 + }, + { + "auxiliary_loss_clip": 0.0641888, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06273088, + "balance_loss_mlp": 0.0125815, + "epoch": 0.5828348113632947, + "flos": 23119088204160.0, + "grad_norm": 1.522522739802819, + "language_loss": 0.78923696, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.86611056, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10327148, + "step": 9694, + "time_per_iteration": 2.5068466663360596 + }, + { + "auxiliary_loss_clip": 0.06411383, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06271289, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5828949346159628, + "flos": 21879769190400.0, + "grad_norm": 1.3653324202123376, + "language_loss": 0.76330042, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.84004748, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10021973, + "step": 9695, + "time_per_iteration": 2.556309700012207 + }, + { + "auxiliary_loss_clip": 0.06315481, + "auxiliary_loss_mlp": 0.01251352, + "balance_loss_clip": 0.06255624, + "balance_loss_mlp": 0.01249797, + "epoch": 0.5829550578686307, + "flos": 65985170497920.0, + "grad_norm": 0.7496740614083074, + "language_loss": 0.54866987, + "learning_rate": 1.563261231127095e-06, + "loss": 0.62433827, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01553345, + "step": 9696, + "time_per_iteration": 4.669760704040527 + }, + { + "auxiliary_loss_clip": 0.06418857, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01252893, + "epoch": 0.5830151811212987, + "flos": 16295391578880.0, + "grad_norm": 1.8785254946392194, + "language_loss": 0.76464188, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.84147352, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11413574, + "step": 9697, + "time_per_iteration": 2.5041255950927734 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272788, + "balance_loss_mlp": 0.01256668, + "epoch": 0.5830753043739666, + "flos": 24175447827840.0, + "grad_norm": 1.6024364882265518, + "language_loss": 0.77965522, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.85656625, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12188721, + "step": 9698, + "time_per_iteration": 2.5902624130249023 + }, + { + "auxiliary_loss_clip": 0.06415899, + "auxiliary_loss_mlp": 0.01273709, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01260894, + "epoch": 0.5831354276266346, + "flos": 27067438846080.0, + "grad_norm": 1.5547381527883266, + "language_loss": 0.84016132, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.91705739, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12817383, + "step": 9699, + "time_per_iteration": 2.6469032764434814 + }, + { + "auxiliary_loss_clip": 0.0642215, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06274705, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5831955508793025, + "flos": 23630301164160.0, + "grad_norm": 1.933998465104238, + "language_loss": 0.65971506, + "learning_rate": 1.561741113828305e-06, + "loss": 0.73659378, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1161499, + "step": 9700, + "time_per_iteration": 3.9589943885803223 + }, + { + "auxiliary_loss_clip": 0.06417754, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.0627218, + "balance_loss_mlp": 0.01256086, + "epoch": 0.5832556741319705, + "flos": 24980267894400.0, + "grad_norm": 1.7460823027462598, + "language_loss": 0.71739107, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.79424536, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1159668, + "step": 9701, + "time_per_iteration": 2.591634511947632 + }, + { + "auxiliary_loss_clip": 0.06415233, + "auxiliary_loss_mlp": 0.01264901, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01253278, + "epoch": 0.5833157973846385, + "flos": 23228226547200.0, + "grad_norm": 1.7061750612547373, + "language_loss": 0.85686189, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.93366319, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11627197, + "step": 9702, + "time_per_iteration": 2.552055835723877 + }, + { + "auxiliary_loss_clip": 0.0641585, + "auxiliary_loss_mlp": 0.01263882, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01253511, + "epoch": 0.5833759206373065, + "flos": 21983876288640.0, + "grad_norm": 1.4269240656932136, + "language_loss": 0.78200948, + "learning_rate": 1.560601200301392e-06, + "loss": 0.85880685, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10369873, + "step": 9703, + "time_per_iteration": 2.500241279602051 + }, + { + "auxiliary_loss_clip": 0.06420664, + "auxiliary_loss_mlp": 0.01264639, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01252831, + "epoch": 0.5834360438899745, + "flos": 21768869911680.0, + "grad_norm": 1.5504614474031426, + "language_loss": 0.71309936, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.78995246, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11816406, + "step": 9704, + "time_per_iteration": 2.5374741554260254 + }, + { + "auxiliary_loss_clip": 0.06421441, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01256919, + "epoch": 0.5834961671426424, + "flos": 15997214424960.0, + "grad_norm": 1.6199693671180324, + "language_loss": 0.81965989, + "learning_rate": 1.559841341236335e-06, + "loss": 0.89654684, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10339355, + "step": 9705, + "time_per_iteration": 2.5450189113616943 + }, + { + "auxiliary_loss_clip": 0.06418713, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.01253379, + "epoch": 0.5835562903953104, + "flos": 22824600629760.0, + "grad_norm": 1.6206416307327924, + "language_loss": 0.80445373, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.88128448, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10986328, + "step": 9706, + "time_per_iteration": 2.5352673530578613 + }, + { + "auxiliary_loss_clip": 0.06415439, + "auxiliary_loss_mlp": 0.01273281, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261003, + "epoch": 0.5836164136479783, + "flos": 48478664332800.0, + "grad_norm": 1.6746295019388222, + "language_loss": 0.74755418, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.82444143, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1227417, + "step": 9707, + "time_per_iteration": 4.184760808944702 + }, + { + "auxiliary_loss_clip": 0.06414578, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01253463, + "epoch": 0.5836765369006464, + "flos": 26913172279680.0, + "grad_norm": 1.726633366654796, + "language_loss": 0.81783116, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.89461732, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10571289, + "step": 9708, + "time_per_iteration": 2.5494630336761475 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01267312, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01256404, + "epoch": 0.5837366601533143, + "flos": 20090230341120.0, + "grad_norm": 1.3928808196753693, + "language_loss": 0.78363276, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.86046088, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10906982, + "step": 9709, + "time_per_iteration": 2.54146409034729 + }, + { + "auxiliary_loss_clip": 0.06313366, + "auxiliary_loss_mlp": 0.01252195, + "balance_loss_clip": 0.06253533, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5837967834059823, + "flos": 65383910726400.0, + "grad_norm": 0.7481338178050596, + "language_loss": 0.5665468, + "learning_rate": 1.557941985915844e-06, + "loss": 0.64220238, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.0151062, + "step": 9710, + "time_per_iteration": 3.130523443222046 + }, + { + "auxiliary_loss_clip": 0.06414168, + "auxiliary_loss_mlp": 0.01266687, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01256495, + "epoch": 0.5838569066586502, + "flos": 25345809331200.0, + "grad_norm": 1.5024705126599753, + "language_loss": 0.65656877, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.73337734, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10198975, + "step": 9711, + "time_per_iteration": 2.558560609817505 + }, + { + "auxiliary_loss_clip": 0.06425221, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5839170299113182, + "flos": 22234535377920.0, + "grad_norm": 1.9299970772651502, + "language_loss": 0.79264128, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.86955917, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12640381, + "step": 9712, + "time_per_iteration": 2.571164131164551 + }, + { + "auxiliary_loss_clip": 0.06417041, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5839771531639861, + "flos": 22206513386880.0, + "grad_norm": 1.5054581881557743, + "language_loss": 0.73669749, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.81351602, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10784912, + "step": 9713, + "time_per_iteration": 2.5475780963897705 + }, + { + "auxiliary_loss_clip": 0.06424147, + "auxiliary_loss_mlp": 0.01265979, + "balance_loss_clip": 0.06274505, + "balance_loss_mlp": 0.01252932, + "epoch": 0.5840372764166541, + "flos": 22425964030080.0, + "grad_norm": 1.9255335004661567, + "language_loss": 0.70002109, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.77692235, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13049316, + "step": 9714, + "time_per_iteration": 2.523638963699341 + }, + { + "auxiliary_loss_clip": 0.06419174, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.012543, + "epoch": 0.5840973996693221, + "flos": 19834330371840.0, + "grad_norm": 1.8598920078622099, + "language_loss": 0.80627859, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.88313133, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11798096, + "step": 9715, + "time_per_iteration": 2.5382297039031982 + }, + { + "auxiliary_loss_clip": 0.06417744, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.0627513, + "balance_loss_mlp": 0.01254194, + "epoch": 0.5841575229219901, + "flos": 21149482930560.0, + "grad_norm": 1.9876848107590372, + "language_loss": 0.73826301, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.81509537, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11303711, + "step": 9716, + "time_per_iteration": 2.5080726146698 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01254202, + "epoch": 0.5842176461746581, + "flos": 24646521882240.0, + "grad_norm": 2.3723983049620876, + "language_loss": 0.75045407, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.82723433, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10467529, + "step": 9717, + "time_per_iteration": 2.5569300651550293 + }, + { + "auxiliary_loss_clip": 0.06420394, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06276444, + "balance_loss_mlp": 0.01254759, + "epoch": 0.584277769427326, + "flos": 19136468442240.0, + "grad_norm": 2.2457444336667343, + "language_loss": 0.80242944, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.87929225, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11132812, + "step": 9718, + "time_per_iteration": 2.5623273849487305 + }, + { + "auxiliary_loss_clip": 0.06421262, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.0627823, + "balance_loss_mlp": 0.01253117, + "epoch": 0.584337892679994, + "flos": 22681822072320.0, + "grad_norm": 1.5991831303569484, + "language_loss": 0.67348599, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.75034833, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11853027, + "step": 9719, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0641831, + "auxiliary_loss_mlp": 0.01263454, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01252022, + "epoch": 0.5843980159326619, + "flos": 31291954945920.0, + "grad_norm": 1.728104183061379, + "language_loss": 0.75697351, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.83379114, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11425781, + "step": 9720, + "time_per_iteration": 2.6132402420043945 + }, + { + "auxiliary_loss_clip": 0.06421956, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.06276225, + "balance_loss_mlp": 0.01255799, + "epoch": 0.58445813918533, + "flos": 22754846505600.0, + "grad_norm": 1.447216358863969, + "language_loss": 0.83020425, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.90709275, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11090088, + "step": 9721, + "time_per_iteration": 2.5127675533294678 + }, + { + "auxiliary_loss_clip": 0.06310159, + "auxiliary_loss_mlp": 0.01253726, + "balance_loss_clip": 0.06250554, + "balance_loss_mlp": 0.01252051, + "epoch": 0.5845182624379979, + "flos": 60704602992000.0, + "grad_norm": 0.9150346622366115, + "language_loss": 0.71186364, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.78750253, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01678467, + "step": 9722, + "time_per_iteration": 3.1494555473327637 + }, + { + "auxiliary_loss_clip": 0.06417061, + "auxiliary_loss_mlp": 0.01268389, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.01257255, + "epoch": 0.5845783856906659, + "flos": 16367996741760.0, + "grad_norm": 1.9087918582550145, + "language_loss": 0.8944329, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.97128743, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11132812, + "step": 9723, + "time_per_iteration": 2.4576761722564697 + }, + { + "auxiliary_loss_clip": 0.06417491, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01256922, + "epoch": 0.5846385089433338, + "flos": 20089475654400.0, + "grad_norm": 1.3439404505357262, + "language_loss": 0.68925285, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.76610565, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10870361, + "step": 9724, + "time_per_iteration": 2.5088019371032715 + }, + { + "auxiliary_loss_clip": 0.06417604, + "auxiliary_loss_mlp": 0.01265081, + "balance_loss_clip": 0.06271344, + "balance_loss_mlp": 0.01252922, + "epoch": 0.5846986321960018, + "flos": 17316769322880.0, + "grad_norm": 2.3711774156816188, + "language_loss": 0.86716926, + "learning_rate": 1.552246441587197e-06, + "loss": 0.94399607, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.121521, + "step": 9725, + "time_per_iteration": 2.4511706829071045 + }, + { + "auxiliary_loss_clip": 0.06423703, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06276515, + "balance_loss_mlp": 0.01258995, + "epoch": 0.5847587554486697, + "flos": 17202977078400.0, + "grad_norm": 1.45457124956925, + "language_loss": 0.8335436, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.91048884, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1182251, + "step": 9726, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.06418396, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01252957, + "epoch": 0.5848188787013378, + "flos": 24534993697920.0, + "grad_norm": 1.7434091697787477, + "language_loss": 0.67301726, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10675049, + "step": 9727, + "time_per_iteration": 2.5283849239349365 + }, + { + "auxiliary_loss_clip": 0.06419774, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06275018, + "balance_loss_mlp": 0.0126054, + "epoch": 0.5848790019540057, + "flos": 20634161120640.0, + "grad_norm": 1.6131340234861964, + "language_loss": 0.82272881, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.89965248, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12060547, + "step": 9728, + "time_per_iteration": 2.5226187705993652 + }, + { + "auxiliary_loss_clip": 0.06412318, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01260198, + "epoch": 0.5849391252066737, + "flos": 22425377051520.0, + "grad_norm": 1.6963428440366448, + "language_loss": 0.78290164, + "learning_rate": 1.550728272957027e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10241699, + "step": 9729, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06418414, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272924, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5849992484593417, + "flos": 25417995223680.0, + "grad_norm": 1.7817091958189777, + "language_loss": 0.71144295, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.78828371, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11419678, + "step": 9730, + "time_per_iteration": 2.5403687953948975 + }, + { + "auxiliary_loss_clip": 0.06422406, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01256188, + "epoch": 0.5850593717120096, + "flos": 21070840273920.0, + "grad_norm": 1.6620919701985222, + "language_loss": 0.78394347, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.86084819, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11883545, + "step": 9731, + "time_per_iteration": 2.5166611671447754 + }, + { + "auxiliary_loss_clip": 0.06415913, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.0627268, + "balance_loss_mlp": 0.01256088, + "epoch": 0.5851194949646776, + "flos": 25308605318400.0, + "grad_norm": 2.100344301849282, + "language_loss": 0.70174819, + "learning_rate": 1.549589825316528e-06, + "loss": 0.77858174, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11352539, + "step": 9732, + "time_per_iteration": 2.538188934326172 + }, + { + "auxiliary_loss_clip": 0.06423078, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01256707, + "epoch": 0.5851796182173455, + "flos": 23594103400320.0, + "grad_norm": 2.4062469566098685, + "language_loss": 0.53286588, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.60979199, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12823486, + "step": 9733, + "time_per_iteration": 2.511302947998047 + }, + { + "auxiliary_loss_clip": 0.06417008, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01256657, + "epoch": 0.5852397414700136, + "flos": 24828936220800.0, + "grad_norm": 2.0225140710518184, + "language_loss": 0.87949061, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.95634717, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12005615, + "step": 9734, + "time_per_iteration": 2.538619041442871 + }, + { + "auxiliary_loss_clip": 0.06415038, + "auxiliary_loss_mlp": 0.01266318, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01255667, + "epoch": 0.5852998647226815, + "flos": 19943887985280.0, + "grad_norm": 1.4699537388912873, + "language_loss": 0.72430563, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.80111921, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10656738, + "step": 9735, + "time_per_iteration": 3.9566004276275635 + }, + { + "auxiliary_loss_clip": 0.06418768, + "auxiliary_loss_mlp": 0.01267652, + "balance_loss_clip": 0.06273651, + "balance_loss_mlp": 0.0125563, + "epoch": 0.5853599879753495, + "flos": 16724817354240.0, + "grad_norm": 2.1987965595401135, + "language_loss": 0.7462939, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.82315814, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12017822, + "step": 9736, + "time_per_iteration": 2.4270691871643066 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06274146, + "balance_loss_mlp": 0.0125241, + "epoch": 0.5854201112280174, + "flos": 44466848622720.0, + "grad_norm": 1.4975519288318198, + "language_loss": 0.7076987, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.78450084, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10791016, + "step": 9737, + "time_per_iteration": 2.744206190109253 + }, + { + "auxiliary_loss_clip": 0.06416388, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06274648, + "balance_loss_mlp": 0.01259556, + "epoch": 0.5854802344806854, + "flos": 20345375623680.0, + "grad_norm": 1.6871127807078519, + "language_loss": 0.82840961, + "learning_rate": 1.547313391573169e-06, + "loss": 0.90528059, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11151123, + "step": 9738, + "time_per_iteration": 2.4849019050598145 + }, + { + "auxiliary_loss_clip": 0.06422549, + "auxiliary_loss_mlp": 0.01269287, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01257431, + "epoch": 0.5855403577333533, + "flos": 20927013540480.0, + "grad_norm": 1.6194676695443784, + "language_loss": 0.69157064, + "learning_rate": 1.546934045946082e-06, + "loss": 0.768489, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11846924, + "step": 9739, + "time_per_iteration": 3.941681146621704 + }, + { + "auxiliary_loss_clip": 0.0641816, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272583, + "balance_loss_mlp": 0.01255796, + "epoch": 0.5856004809860214, + "flos": 20454849383040.0, + "grad_norm": 2.1509507460713038, + "language_loss": 0.59265625, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.66951436, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11859131, + "step": 9740, + "time_per_iteration": 2.5459988117218018 + }, + { + "auxiliary_loss_clip": 0.06417701, + "auxiliary_loss_mlp": 0.01265897, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.0125487, + "epoch": 0.5856606042386893, + "flos": 19645962393600.0, + "grad_norm": 1.6784070122461718, + "language_loss": 0.75433791, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.83117396, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11029053, + "step": 9741, + "time_per_iteration": 2.488905668258667 + }, + { + "auxiliary_loss_clip": 0.06418155, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06272431, + "balance_loss_mlp": 0.01251857, + "epoch": 0.5857207274913573, + "flos": 21692072044800.0, + "grad_norm": 1.4885669249171192, + "language_loss": 0.76157856, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.83839613, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11743164, + "step": 9742, + "time_per_iteration": 2.5480451583862305 + }, + { + "auxiliary_loss_clip": 0.06415333, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.01254737, + "epoch": 0.5857808507440253, + "flos": 23188968109440.0, + "grad_norm": 1.7165353954706328, + "language_loss": 0.75240624, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.82922137, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11444092, + "step": 9743, + "time_per_iteration": 2.503702163696289 + }, + { + "auxiliary_loss_clip": 0.0641541, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01254944, + "epoch": 0.5858409739966932, + "flos": 27242683660800.0, + "grad_norm": 1.53753206771929, + "language_loss": 0.81320727, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.8900184, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10760498, + "step": 9744, + "time_per_iteration": 2.5923476219177246 + }, + { + "auxiliary_loss_clip": 0.06429034, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256847, + "epoch": 0.5859010972493612, + "flos": 27862993036800.0, + "grad_norm": 1.7800190043611435, + "language_loss": 0.71494257, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.79192197, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12072754, + "step": 9745, + "time_per_iteration": 2.5417301654815674 + }, + { + "auxiliary_loss_clip": 0.06318981, + "auxiliary_loss_mlp": 0.01251832, + "balance_loss_clip": 0.06258826, + "balance_loss_mlp": 0.01250336, + "epoch": 0.5859612205020291, + "flos": 70029452465280.0, + "grad_norm": 0.7182748841957548, + "language_loss": 0.53236032, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.60806841, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01495361, + "step": 9746, + "time_per_iteration": 4.6102893352508545 + }, + { + "auxiliary_loss_clip": 0.06421819, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01253907, + "epoch": 0.5860213437546972, + "flos": 24062032926720.0, + "grad_norm": 1.805241505686608, + "language_loss": 0.7322374, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.80910903, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11437988, + "step": 9747, + "time_per_iteration": 2.5299086570739746 + }, + { + "auxiliary_loss_clip": 0.06420729, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275554, + "balance_loss_mlp": 0.01253806, + "epoch": 0.5860814670073651, + "flos": 18952670511360.0, + "grad_norm": 1.7528078306488855, + "language_loss": 0.81229597, + "learning_rate": 1.543520710142051e-06, + "loss": 0.88915294, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.1116333, + "step": 9748, + "time_per_iteration": 2.5070362091064453 + }, + { + "auxiliary_loss_clip": 0.06422453, + "auxiliary_loss_mlp": 0.01268094, + "balance_loss_clip": 0.06275974, + "balance_loss_mlp": 0.01256674, + "epoch": 0.5861415902600331, + "flos": 22567904046720.0, + "grad_norm": 2.1315206911445217, + "language_loss": 0.72122687, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.7981323, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11419678, + "step": 9749, + "time_per_iteration": 2.5568935871124268 + }, + { + "auxiliary_loss_clip": 0.06413895, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01254765, + "epoch": 0.586201713512701, + "flos": 14397217511040.0, + "grad_norm": 2.3126679183899608, + "language_loss": 0.75373948, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.8305366, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11053467, + "step": 9750, + "time_per_iteration": 2.456709623336792 + }, + { + "auxiliary_loss_clip": 0.06418054, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06274709, + "balance_loss_mlp": 0.01256091, + "epoch": 0.586261836765369, + "flos": 19504357793280.0, + "grad_norm": 1.5048801591853769, + "language_loss": 0.70914859, + "learning_rate": 1.542383242598344e-06, + "loss": 0.78600496, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11505127, + "step": 9751, + "time_per_iteration": 2.516965389251709 + }, + { + "auxiliary_loss_clip": 0.06427741, + "auxiliary_loss_mlp": 0.01267026, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01254748, + "epoch": 0.5863219600180369, + "flos": 20707688678400.0, + "grad_norm": 2.2695397417566134, + "language_loss": 0.74817115, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.82511884, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12280273, + "step": 9752, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.06419428, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06275827, + "balance_loss_mlp": 0.01255026, + "epoch": 0.586382083270705, + "flos": 19798258389120.0, + "grad_norm": 1.7375633359019997, + "language_loss": 0.77788973, + "learning_rate": 1.541625017642943e-06, + "loss": 0.85475028, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1159668, + "step": 9753, + "time_per_iteration": 2.5376296043395996 + }, + { + "auxiliary_loss_clip": 0.06415142, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06275599, + "balance_loss_mlp": 0.01256478, + "epoch": 0.5864422065233729, + "flos": 16504821659520.0, + "grad_norm": 1.5941521516898884, + "language_loss": 0.71418774, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.79100442, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 9754, + "time_per_iteration": 2.482060670852661 + }, + { + "auxiliary_loss_clip": 0.06418964, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01254706, + "epoch": 0.5865023297760409, + "flos": 20419657868160.0, + "grad_norm": 1.5122611907827943, + "language_loss": 0.72473872, + "learning_rate": 1.540866862214043e-06, + "loss": 0.80158961, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11407471, + "step": 9755, + "time_per_iteration": 2.5370032787323 + }, + { + "auxiliary_loss_clip": 0.06317496, + "auxiliary_loss_mlp": 0.01251101, + "balance_loss_clip": 0.06257688, + "balance_loss_mlp": 0.01249532, + "epoch": 0.5865624530287089, + "flos": 63369386864640.0, + "grad_norm": 0.7287908319651881, + "language_loss": 0.56949997, + "learning_rate": 1.540487810607967e-06, + "loss": 0.64518595, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01570129, + "step": 9756, + "time_per_iteration": 3.10322904586792 + }, + { + "auxiliary_loss_clip": 0.06418074, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5866225762813768, + "flos": 27023610360960.0, + "grad_norm": 1.7386050489235434, + "language_loss": 0.76836097, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.84522557, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10333252, + "step": 9757, + "time_per_iteration": 2.5645911693573 + }, + { + "auxiliary_loss_clip": 0.06316153, + "auxiliary_loss_mlp": 0.01253974, + "balance_loss_clip": 0.06255822, + "balance_loss_mlp": 0.01252219, + "epoch": 0.5866826995340448, + "flos": 73007941224960.0, + "grad_norm": 0.8367731636564993, + "language_loss": 0.60245061, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.67815191, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01760864, + "step": 9758, + "time_per_iteration": 3.129420042037964 + }, + { + "auxiliary_loss_clip": 0.06425761, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.0125824, + "epoch": 0.5867428227867127, + "flos": 21291716436480.0, + "grad_norm": 2.341889353580635, + "language_loss": 0.7231499, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.80010581, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9759, + "time_per_iteration": 2.5044219493865967 + }, + { + "auxiliary_loss_clip": 0.06416983, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274659, + "balance_loss_mlp": 0.01254924, + "epoch": 0.5868029460393808, + "flos": 33476356961280.0, + "grad_norm": 1.459885556596891, + "language_loss": 0.73556709, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.8123973, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11114502, + "step": 9760, + "time_per_iteration": 2.662318229675293 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06273922, + "balance_loss_mlp": 0.01252944, + "epoch": 0.5868630692920487, + "flos": 17894382243840.0, + "grad_norm": 1.6271911446451897, + "language_loss": 0.7251972, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.80200839, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11352539, + "step": 9761, + "time_per_iteration": 2.635671377182007 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265487, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253274, + "epoch": 0.5869231925447167, + "flos": 21041770106880.0, + "grad_norm": 1.8098960680000724, + "language_loss": 0.74938971, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.8262558, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12200928, + "step": 9762, + "time_per_iteration": 2.511338472366333 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01255766, + "epoch": 0.5869833157973846, + "flos": 74753288974080.0, + "grad_norm": 1.2323244190692502, + "language_loss": 0.72678411, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.80359328, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10723877, + "step": 9763, + "time_per_iteration": 2.966012716293335 + }, + { + "auxiliary_loss_clip": 0.06416167, + "auxiliary_loss_mlp": 0.01264221, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01253296, + "epoch": 0.5870434390500526, + "flos": 17644687476480.0, + "grad_norm": 1.6070407244149296, + "language_loss": 0.79883134, + "learning_rate": 1.53745602625755e-06, + "loss": 0.87563521, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10931396, + "step": 9764, + "time_per_iteration": 2.5360097885131836 + }, + { + "auxiliary_loss_clip": 0.06420099, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01258342, + "epoch": 0.5871035623027205, + "flos": 21512424890880.0, + "grad_norm": 2.0596306569779967, + "language_loss": 0.79149717, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.86839771, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1161499, + "step": 9765, + "time_per_iteration": 2.523232936859131 + }, + { + "auxiliary_loss_clip": 0.06413256, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06272542, + "balance_loss_mlp": 0.01254427, + "epoch": 0.5871636855553886, + "flos": 13556744732160.0, + "grad_norm": 1.6377752901078153, + "language_loss": 0.83660257, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.91338348, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10406494, + "step": 9766, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06423902, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01257639, + "epoch": 0.5872238088080565, + "flos": 26220006178560.0, + "grad_norm": 1.5173362705755495, + "language_loss": 0.69876915, + "learning_rate": 1.536319396136257e-06, + "loss": 0.77569771, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11322021, + "step": 9767, + "time_per_iteration": 2.53935170173645 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5872839320607245, + "flos": 30673196870400.0, + "grad_norm": 6.458419959703109, + "language_loss": 0.64030594, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.71715188, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11151123, + "step": 9768, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.06324692, + "auxiliary_loss_mlp": 0.01254391, + "balance_loss_clip": 0.06264571, + "balance_loss_mlp": 0.01252818, + "epoch": 0.5873440553133924, + "flos": 60324623925120.0, + "grad_norm": 0.7185710562845293, + "language_loss": 0.53754711, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.61333793, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01573944, + "step": 9769, + "time_per_iteration": 4.53153133392334 + }, + { + "auxiliary_loss_clip": 0.06416724, + "auxiliary_loss_mlp": 0.01267359, + "balance_loss_clip": 0.0627375, + "balance_loss_mlp": 0.01256409, + "epoch": 0.5874041785660604, + "flos": 21545016929280.0, + "grad_norm": 1.3491952646211745, + "language_loss": 0.70993185, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.78677267, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10949707, + "step": 9770, + "time_per_iteration": 2.5152831077575684 + }, + { + "auxiliary_loss_clip": 0.06416201, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274108, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5874643018187284, + "flos": 24395778938880.0, + "grad_norm": 1.9550841164663295, + "language_loss": 0.67880088, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.75564533, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9771, + "time_per_iteration": 2.518069267272949 + }, + { + "auxiliary_loss_clip": 0.06421787, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.0125531, + "epoch": 0.5875244250713964, + "flos": 28155300405120.0, + "grad_norm": 1.4791048602495522, + "language_loss": 0.66491324, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.74181026, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.1260376, + "step": 9772, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.0642426, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06276119, + "balance_loss_mlp": 0.01258866, + "epoch": 0.5875845483240644, + "flos": 25819566716160.0, + "grad_norm": 1.5545187987766196, + "language_loss": 0.7466417, + "learning_rate": 1.534046611017519e-06, + "loss": 0.82359904, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.12615967, + "step": 9773, + "time_per_iteration": 2.533243179321289 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5876446715767323, + "flos": 26913843112320.0, + "grad_norm": 1.8911636717759477, + "language_loss": 0.54071677, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.61759812, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11138916, + "step": 9774, + "time_per_iteration": 2.5565576553344727 + }, + { + "auxiliary_loss_clip": 0.06419463, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06276506, + "balance_loss_mlp": 0.01253192, + "epoch": 0.5877047948294003, + "flos": 36693750510720.0, + "grad_norm": 2.5652883668591886, + "language_loss": 0.65881801, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.73565692, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11242676, + "step": 9775, + "time_per_iteration": 4.102318525314331 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01267575, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01256459, + "epoch": 0.5877649180820682, + "flos": 26732057679360.0, + "grad_norm": 1.541611587459476, + "language_loss": 0.73877925, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.81564349, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11114502, + "step": 9776, + "time_per_iteration": 2.534105062484741 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01267161, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.0125586, + "epoch": 0.5878250413347362, + "flos": 21038457870720.0, + "grad_norm": 1.5037279013590201, + "language_loss": 0.7431531, + "learning_rate": 1.532531774126821e-06, + "loss": 0.81998503, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11315918, + "step": 9777, + "time_per_iteration": 2.501791000366211 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01267719, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257407, + "epoch": 0.5878851645874041, + "flos": 25491397000320.0, + "grad_norm": 1.389592011343503, + "language_loss": 0.74136406, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.81816691, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10302734, + "step": 9778, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06416066, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261293, + "epoch": 0.5879452878400722, + "flos": 23775930760320.0, + "grad_norm": 1.6684393614308786, + "language_loss": 0.70061487, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.77750337, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11486816, + "step": 9779, + "time_per_iteration": 3.9999070167541504 + }, + { + "auxiliary_loss_clip": 0.06419669, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06273909, + "balance_loss_mlp": 0.0125331, + "epoch": 0.5880054110927401, + "flos": 17830749467520.0, + "grad_norm": 1.9325071243234666, + "language_loss": 0.67414713, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.75099313, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11633301, + "step": 9780, + "time_per_iteration": 2.525421142578125 + }, + { + "auxiliary_loss_clip": 0.06422442, + "auxiliary_loss_mlp": 0.01271374, + "balance_loss_clip": 0.0627559, + "balance_loss_mlp": 0.0125981, + "epoch": 0.5880655343454081, + "flos": 19469417840640.0, + "grad_norm": 1.9086155780635632, + "language_loss": 0.73100537, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.80794352, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11572266, + "step": 9781, + "time_per_iteration": 2.4647257328033447 + }, + { + "auxiliary_loss_clip": 0.06415875, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.06273176, + "balance_loss_mlp": 0.01258731, + "epoch": 0.588125657598076, + "flos": 21403999307520.0, + "grad_norm": 1.283507981192047, + "language_loss": 0.7022016, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.77905786, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11016846, + "step": 9782, + "time_per_iteration": 2.531780481338501 + }, + { + "auxiliary_loss_clip": 0.06420694, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274669, + "balance_loss_mlp": 0.01256314, + "epoch": 0.588185780850744, + "flos": 16040246296320.0, + "grad_norm": 2.020771184042221, + "language_loss": 0.71036118, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.78724945, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11804199, + "step": 9783, + "time_per_iteration": 2.452061176300049 + }, + { + "auxiliary_loss_clip": 0.06426281, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06277394, + "balance_loss_mlp": 0.01255538, + "epoch": 0.588245904103412, + "flos": 23734282481280.0, + "grad_norm": 1.861465214251895, + "language_loss": 0.69312334, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.77006149, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12005615, + "step": 9784, + "time_per_iteration": 2.552767515182495 + }, + { + "auxiliary_loss_clip": 0.06421058, + "auxiliary_loss_mlp": 0.01265879, + "balance_loss_clip": 0.06273105, + "balance_loss_mlp": 0.01254596, + "epoch": 0.58830602735608, + "flos": 33810983441280.0, + "grad_norm": 1.7066395827536198, + "language_loss": 0.69576097, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.77263039, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.112854, + "step": 9785, + "time_per_iteration": 3.9847395420074463 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01255352, + "epoch": 0.588366150608748, + "flos": 17096144722560.0, + "grad_norm": 1.8665479354272698, + "language_loss": 0.78022271, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.85703707, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10369873, + "step": 9786, + "time_per_iteration": 2.4842867851257324 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.01266691, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01255128, + "epoch": 0.5884262738614159, + "flos": 22133698588800.0, + "grad_norm": 1.4734886628165487, + "language_loss": 0.78796208, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.86486876, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11566162, + "step": 9787, + "time_per_iteration": 2.497192144393921 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06275064, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5884863971140839, + "flos": 21038038600320.0, + "grad_norm": 1.5088398107909506, + "language_loss": 0.66488671, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.74172926, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10943604, + "step": 9788, + "time_per_iteration": 2.5208425521850586 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01255212, + "epoch": 0.5885465203667518, + "flos": 23811835034880.0, + "grad_norm": 2.124690797246634, + "language_loss": 0.8100794, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.88691187, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11651611, + "step": 9789, + "time_per_iteration": 2.497751235961914 + }, + { + "auxiliary_loss_clip": 0.06413969, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257432, + "epoch": 0.5886066436194198, + "flos": 18886647893760.0, + "grad_norm": 1.5219157367370164, + "language_loss": 0.69998693, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.77680737, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10638428, + "step": 9790, + "time_per_iteration": 2.5238122940063477 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06275025, + "balance_loss_mlp": 0.01254484, + "epoch": 0.5886667668720877, + "flos": 24797015015040.0, + "grad_norm": 1.9547129753533632, + "language_loss": 0.83327186, + "learning_rate": 1.527232084570895e-06, + "loss": 0.91010225, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11651611, + "step": 9791, + "time_per_iteration": 2.518833637237549 + }, + { + "auxiliary_loss_clip": 0.06420578, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01259297, + "epoch": 0.5887268901247558, + "flos": 21620473130880.0, + "grad_norm": 1.5293641441028467, + "language_loss": 0.76486295, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.84176975, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1081543, + "step": 9792, + "time_per_iteration": 2.5101959705352783 + }, + { + "auxiliary_loss_clip": 0.06421857, + "auxiliary_loss_mlp": 0.01269547, + "balance_loss_clip": 0.06273879, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5887870133774237, + "flos": 20487357567360.0, + "grad_norm": 2.1847202997614477, + "language_loss": 0.69169068, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.76860476, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11114502, + "step": 9793, + "time_per_iteration": 2.4927995204925537 + }, + { + "auxiliary_loss_clip": 0.06418081, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06276278, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5888471366300917, + "flos": 19211966570880.0, + "grad_norm": 1.7416997591947727, + "language_loss": 0.60439771, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.68127453, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11132812, + "step": 9794, + "time_per_iteration": 2.543231248855591 + }, + { + "auxiliary_loss_clip": 0.06420963, + "auxiliary_loss_mlp": 0.01267396, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01256113, + "epoch": 0.5889072598827596, + "flos": 19978786010880.0, + "grad_norm": 1.5723031838894885, + "language_loss": 0.65483499, + "learning_rate": 1.525718531219257e-06, + "loss": 0.73171854, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11291504, + "step": 9795, + "time_per_iteration": 2.502537965774536 + }, + { + "auxiliary_loss_clip": 0.06414207, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06274657, + "balance_loss_mlp": 0.01255197, + "epoch": 0.5889673831354276, + "flos": 20747617948800.0, + "grad_norm": 1.4841948976653832, + "language_loss": 0.74256188, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.81936008, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10418701, + "step": 9796, + "time_per_iteration": 2.496511220932007 + }, + { + "auxiliary_loss_clip": 0.06417978, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01256238, + "epoch": 0.5890275063880956, + "flos": 25307892558720.0, + "grad_norm": 2.3243895650299566, + "language_loss": 0.83142781, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.90828037, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11047363, + "step": 9797, + "time_per_iteration": 2.5991365909576416 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01250806, + "epoch": 0.5890876296407636, + "flos": 11770182702720.0, + "grad_norm": 1.5626242229143896, + "language_loss": 0.79473782, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.87152421, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1083374, + "step": 9798, + "time_per_iteration": 2.5399045944213867 + }, + { + "auxiliary_loss_clip": 0.06414175, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06274281, + "balance_loss_mlp": 0.01254584, + "epoch": 0.5891477528934316, + "flos": 13594535723520.0, + "grad_norm": 2.254418827792415, + "language_loss": 0.75000322, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.82679403, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10321045, + "step": 9799, + "time_per_iteration": 2.4642131328582764 + }, + { + "auxiliary_loss_clip": 0.06420485, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01254798, + "epoch": 0.5892078761460995, + "flos": 15054563191680.0, + "grad_norm": 1.9320779180150096, + "language_loss": 0.76666486, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.84353948, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12182617, + "step": 9800, + "time_per_iteration": 2.5170304775238037 + }, + { + "auxiliary_loss_clip": 0.06421179, + "auxiliary_loss_mlp": 0.01264846, + "balance_loss_clip": 0.06275316, + "balance_loss_mlp": 0.0125361, + "epoch": 0.5892679993987675, + "flos": 15783591640320.0, + "grad_norm": 1.6350760782373632, + "language_loss": 0.79415876, + "learning_rate": 1.523448741022722e-06, + "loss": 0.87101901, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11242676, + "step": 9801, + "time_per_iteration": 2.4804494380950928 + }, + { + "auxiliary_loss_clip": 0.06421967, + "auxiliary_loss_mlp": 0.01265274, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01253467, + "epoch": 0.5893281226514354, + "flos": 25272281773440.0, + "grad_norm": 1.6257193775599612, + "language_loss": 0.6664654, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.74333781, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11804199, + "step": 9802, + "time_per_iteration": 2.536524534225464 + }, + { + "auxiliary_loss_clip": 0.06417859, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01256475, + "epoch": 0.5893882459041034, + "flos": 19463380346880.0, + "grad_norm": 2.7221530495776953, + "language_loss": 0.78339422, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.86024731, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10986328, + "step": 9803, + "time_per_iteration": 2.4658396244049072 + }, + { + "auxiliary_loss_clip": 0.06422158, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5894483691567713, + "flos": 20640785592960.0, + "grad_norm": 1.3509589673333673, + "language_loss": 0.73070806, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.80759096, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10986328, + "step": 9804, + "time_per_iteration": 2.5561769008636475 + }, + { + "auxiliary_loss_clip": 0.06421436, + "auxiliary_loss_mlp": 0.01267021, + "balance_loss_clip": 0.06279321, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5895084924094394, + "flos": 17782812132480.0, + "grad_norm": 4.893575785915148, + "language_loss": 0.74802667, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.82491124, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11254883, + "step": 9805, + "time_per_iteration": 2.4777255058288574 + }, + { + "auxiliary_loss_clip": 0.06430615, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254542, + "epoch": 0.5895686156621073, + "flos": 20127350499840.0, + "grad_norm": 1.9675390106462767, + "language_loss": 0.78339982, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8603704, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11901855, + "step": 9806, + "time_per_iteration": 2.556187868118286 + }, + { + "auxiliary_loss_clip": 0.06426841, + "auxiliary_loss_mlp": 0.01268335, + "balance_loss_clip": 0.06283563, + "balance_loss_mlp": 0.01256813, + "epoch": 0.5896287389147753, + "flos": 20856337021440.0, + "grad_norm": 1.8953677951134942, + "language_loss": 0.77413982, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.85109162, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11523438, + "step": 9807, + "time_per_iteration": 2.519200325012207 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01268029, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.01256341, + "epoch": 0.5896888621674432, + "flos": 14543098669440.0, + "grad_norm": 1.5805632295861456, + "language_loss": 0.75183058, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.82876456, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 9808, + "time_per_iteration": 3.908586025238037 + }, + { + "auxiliary_loss_clip": 0.06422409, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06277257, + "balance_loss_mlp": 0.01253912, + "epoch": 0.5897489854201112, + "flos": 20893079836800.0, + "grad_norm": 1.9290339931200338, + "language_loss": 0.71909666, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.79598099, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12103271, + "step": 9809, + "time_per_iteration": 2.5768144130706787 + }, + { + "auxiliary_loss_clip": 0.06423716, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.0627635, + "balance_loss_mlp": 0.01254272, + "epoch": 0.5898091086727792, + "flos": 20017331688960.0, + "grad_norm": 2.0062119760557473, + "language_loss": 0.82969332, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.90659165, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1184082, + "step": 9810, + "time_per_iteration": 2.5024096965789795 + }, + { + "auxiliary_loss_clip": 0.06418087, + "auxiliary_loss_mlp": 0.01268409, + "balance_loss_clip": 0.06278655, + "balance_loss_mlp": 0.01257394, + "epoch": 0.5898692319254472, + "flos": 16258816471680.0, + "grad_norm": 2.656719323590735, + "language_loss": 0.81247234, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8893373, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11016846, + "step": 9811, + "time_per_iteration": 2.5079774856567383 + }, + { + "auxiliary_loss_clip": 0.06424809, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01254442, + "epoch": 0.5899293551781152, + "flos": 20454723601920.0, + "grad_norm": 1.7175276958807264, + "language_loss": 0.7698791, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.84679055, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11883545, + "step": 9812, + "time_per_iteration": 2.4813108444213867 + }, + { + "auxiliary_loss_clip": 0.06419283, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.0627578, + "balance_loss_mlp": 0.01253992, + "epoch": 0.5899894784307831, + "flos": 13886885018880.0, + "grad_norm": 1.6786934004730485, + "language_loss": 0.71137106, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.78820813, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10437012, + "step": 9813, + "time_per_iteration": 2.5212063789367676 + }, + { + "auxiliary_loss_clip": 0.0641876, + "auxiliary_loss_mlp": 0.01270874, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01259394, + "epoch": 0.5900496016834511, + "flos": 20089936851840.0, + "grad_norm": 1.420675326684763, + "language_loss": 0.7244218, + "learning_rate": 1.518533098148494e-06, + "loss": 0.80131817, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11468506, + "step": 9814, + "time_per_iteration": 2.4773387908935547 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01256768, + "epoch": 0.590109724936119, + "flos": 20264133490560.0, + "grad_norm": 1.7152732807584992, + "language_loss": 0.7885775, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.86546993, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 9815, + "time_per_iteration": 3.939445972442627 + }, + { + "auxiliary_loss_clip": 0.06427211, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01255142, + "epoch": 0.590169848188787, + "flos": 24240548050560.0, + "grad_norm": 1.7218203048390952, + "language_loss": 0.76316988, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.84011579, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12243652, + "step": 9816, + "time_per_iteration": 2.5245048999786377 + }, + { + "auxiliary_loss_clip": 0.06419881, + "auxiliary_loss_mlp": 0.01267479, + "balance_loss_clip": 0.06277047, + "balance_loss_mlp": 0.01255725, + "epoch": 0.590229971441455, + "flos": 17790400926720.0, + "grad_norm": 1.8371364848215923, + "language_loss": 0.81572855, + "learning_rate": 1.517399156051309e-06, + "loss": 0.89260209, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11743164, + "step": 9817, + "time_per_iteration": 2.4621410369873047 + }, + { + "auxiliary_loss_clip": 0.06418833, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01257544, + "epoch": 0.590290094694123, + "flos": 22243465837440.0, + "grad_norm": 1.5541077044812335, + "language_loss": 0.76864719, + "learning_rate": 1.517021211933682e-06, + "loss": 0.84551811, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10717773, + "step": 9818, + "time_per_iteration": 2.5125410556793213 + }, + { + "auxiliary_loss_clip": 0.06416667, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255501, + "epoch": 0.5903502179467909, + "flos": 19104589163520.0, + "grad_norm": 1.8321116335564553, + "language_loss": 0.67227435, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.74909973, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10369873, + "step": 9819, + "time_per_iteration": 4.011074066162109 + }, + { + "auxiliary_loss_clip": 0.06420997, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01255819, + "epoch": 0.5904103411994589, + "flos": 24241051175040.0, + "grad_norm": 1.4923193447304384, + "language_loss": 0.7829935, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.85986888, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10723877, + "step": 9820, + "time_per_iteration": 2.5523388385772705 + }, + { + "auxiliary_loss_clip": 0.06318125, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06258737, + "balance_loss_mlp": 0.01253092, + "epoch": 0.5904704644521268, + "flos": 64894388774400.0, + "grad_norm": 0.9340841048050909, + "language_loss": 0.65183949, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.72756588, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01417542, + "step": 9821, + "time_per_iteration": 3.1619784832000732 + }, + { + "auxiliary_loss_clip": 0.06416959, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01251935, + "epoch": 0.5905305877047948, + "flos": 19616137539840.0, + "grad_norm": 2.101599923194391, + "language_loss": 0.6190716, + "learning_rate": 1.515509618752521e-06, + "loss": 0.69586486, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10412598, + "step": 9822, + "time_per_iteration": 2.519482374191284 + }, + { + "auxiliary_loss_clip": 0.06419894, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06275024, + "balance_loss_mlp": 0.01257365, + "epoch": 0.5905907109574628, + "flos": 18995660455680.0, + "grad_norm": 1.8507285157055846, + "language_loss": 0.82910419, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.90599167, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1149292, + "step": 9823, + "time_per_iteration": 2.5134451389312744 + }, + { + "auxiliary_loss_clip": 0.06417045, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01256546, + "epoch": 0.5906508342101308, + "flos": 22206974584320.0, + "grad_norm": 1.8772651852061113, + "language_loss": 0.73388183, + "learning_rate": 1.514753932336165e-06, + "loss": 0.81072783, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11004639, + "step": 9824, + "time_per_iteration": 3.8841147422790527 + }, + { + "auxiliary_loss_clip": 0.064331, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255013, + "epoch": 0.5907109574627988, + "flos": 20892995982720.0, + "grad_norm": 1.9523854086350827, + "language_loss": 0.82938302, + "learning_rate": 1.514376116721693e-06, + "loss": 0.90639031, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12609863, + "step": 9825, + "time_per_iteration": 2.527808427810669 + }, + { + "auxiliary_loss_clip": 0.06417271, + "auxiliary_loss_mlp": 0.01264281, + "balance_loss_clip": 0.06277614, + "balance_loss_mlp": 0.0125422, + "epoch": 0.5907710807154667, + "flos": 21513011869440.0, + "grad_norm": 1.8272335212588457, + "language_loss": 0.76679188, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.84360743, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10058594, + "step": 9826, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.06416261, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06274769, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5908312039681347, + "flos": 22024979516160.0, + "grad_norm": 1.5050840799955296, + "language_loss": 0.7292102, + "learning_rate": 1.513620540751793e-06, + "loss": 0.80599833, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10131836, + "step": 9827, + "time_per_iteration": 2.5261569023132324 + }, + { + "auxiliary_loss_clip": 0.06419525, + "auxiliary_loss_mlp": 0.01266997, + "balance_loss_clip": 0.0627335, + "balance_loss_mlp": 0.0125588, + "epoch": 0.5908913272208026, + "flos": 18485579525760.0, + "grad_norm": 1.8170415974974599, + "language_loss": 0.80223072, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.87909591, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11120605, + "step": 9828, + "time_per_iteration": 2.4725866317749023 + }, + { + "auxiliary_loss_clip": 0.06421993, + "auxiliary_loss_mlp": 0.01272492, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5909514504734706, + "flos": 12317006448000.0, + "grad_norm": 1.8455350152663679, + "language_loss": 0.88620806, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.96315295, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12133789, + "step": 9829, + "time_per_iteration": 2.4783804416656494 + }, + { + "auxiliary_loss_clip": 0.06324679, + "auxiliary_loss_mlp": 0.01254341, + "balance_loss_clip": 0.06265787, + "balance_loss_mlp": 0.01252693, + "epoch": 0.5910115737261386, + "flos": 70233557811840.0, + "grad_norm": 0.7549892406299625, + "language_loss": 0.57903004, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.6548202, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01651001, + "step": 9830, + "time_per_iteration": 3.0390307903289795 + }, + { + "auxiliary_loss_clip": 0.0643173, + "auxiliary_loss_mlp": 0.01269908, + "balance_loss_clip": 0.06281478, + "balance_loss_mlp": 0.01257308, + "epoch": 0.5910716969788066, + "flos": 22024266756480.0, + "grad_norm": 2.1560619163105965, + "language_loss": 0.75963652, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.83665287, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12591553, + "step": 9831, + "time_per_iteration": 2.5367510318756104 + }, + { + "auxiliary_loss_clip": 0.06409759, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01252124, + "epoch": 0.5911318202314745, + "flos": 21258034295040.0, + "grad_norm": 1.5753423885742641, + "language_loss": 0.77885556, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.85558021, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10583496, + "step": 9832, + "time_per_iteration": 2.504584789276123 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01265662, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01254797, + "epoch": 0.5911919434841425, + "flos": 17827353377280.0, + "grad_norm": 1.6998910709640538, + "language_loss": 0.83265263, + "learning_rate": 1.511354255945847e-06, + "loss": 0.90947747, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9833, + "time_per_iteration": 2.508920192718506 + }, + { + "auxiliary_loss_clip": 0.06420296, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5912520667368104, + "flos": 20380818700800.0, + "grad_norm": 1.4145847544307324, + "language_loss": 0.74488783, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.82178807, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10723877, + "step": 9834, + "time_per_iteration": 2.515340566635132 + }, + { + "auxiliary_loss_clip": 0.06420908, + "auxiliary_loss_mlp": 0.0126652, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01255308, + "epoch": 0.5913121899894784, + "flos": 17936240158080.0, + "grad_norm": 2.2554155860211296, + "language_loss": 0.78118962, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.85806394, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11212158, + "step": 9835, + "time_per_iteration": 2.516449213027954 + }, + { + "auxiliary_loss_clip": 0.06422424, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5913723132421465, + "flos": 22133405099520.0, + "grad_norm": 1.7910918924229287, + "language_loss": 0.74562353, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.82253206, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11187744, + "step": 9836, + "time_per_iteration": 2.4944818019866943 + }, + { + "auxiliary_loss_clip": 0.06421088, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06274953, + "balance_loss_mlp": 0.01252396, + "epoch": 0.5914324364948144, + "flos": 15702056017920.0, + "grad_norm": 1.9466597288818261, + "language_loss": 0.82267582, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.89952636, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11572266, + "step": 9837, + "time_per_iteration": 2.5073657035827637 + }, + { + "auxiliary_loss_clip": 0.06423111, + "auxiliary_loss_mlp": 0.01265723, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01253665, + "epoch": 0.5914925597474824, + "flos": 22753924110720.0, + "grad_norm": 1.6146002375859378, + "language_loss": 0.7983368, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.87522513, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1206665, + "step": 9838, + "time_per_iteration": 2.5024936199188232 + }, + { + "auxiliary_loss_clip": 0.06421801, + "auxiliary_loss_mlp": 0.01267887, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01256503, + "epoch": 0.5915526830001503, + "flos": 18298092015360.0, + "grad_norm": 1.7930328536333848, + "language_loss": 0.70194936, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.77884626, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11383057, + "step": 9839, + "time_per_iteration": 2.5000133514404297 + }, + { + "auxiliary_loss_clip": 0.06421608, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5916128062528183, + "flos": 17024713516800.0, + "grad_norm": 2.2460586823912254, + "language_loss": 0.65840614, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.73527294, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10614014, + "step": 9840, + "time_per_iteration": 2.472325325012207 + }, + { + "auxiliary_loss_clip": 0.06421183, + "auxiliary_loss_mlp": 0.01269035, + "balance_loss_clip": 0.06273993, + "balance_loss_mlp": 0.01257019, + "epoch": 0.5916729295054862, + "flos": 24761194594560.0, + "grad_norm": 7.488465580129743, + "language_loss": 0.82013118, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.89703333, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12023926, + "step": 9841, + "time_per_iteration": 2.539569139480591 + }, + { + "auxiliary_loss_clip": 0.06417108, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01255782, + "epoch": 0.5917330527581542, + "flos": 15963196867200.0, + "grad_norm": 1.7355438933283587, + "language_loss": 0.69817364, + "learning_rate": 1.507956080444291e-06, + "loss": 0.77500588, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10333252, + "step": 9842, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06423896, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01256332, + "epoch": 0.5917931760108222, + "flos": 23806719936000.0, + "grad_norm": 2.0642371985300105, + "language_loss": 0.83243513, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.90935493, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11755371, + "step": 9843, + "time_per_iteration": 2.5579354763031006 + }, + { + "auxiliary_loss_clip": 0.06423706, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_clip": 0.06277691, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5918532992634902, + "flos": 23254864945920.0, + "grad_norm": 2.21208381325965, + "language_loss": 0.81869078, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.89556968, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11773682, + "step": 9844, + "time_per_iteration": 2.4732062816619873 + }, + { + "auxiliary_loss_clip": 0.06423113, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01253496, + "epoch": 0.5919134225161581, + "flos": 19505867166720.0, + "grad_norm": 2.0396261684123966, + "language_loss": 0.74979722, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.8266741, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11077881, + "step": 9845, + "time_per_iteration": 2.5498902797698975 + }, + { + "auxiliary_loss_clip": 0.0642004, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01255215, + "epoch": 0.5919735457688261, + "flos": 38810201264640.0, + "grad_norm": 1.7793580681254029, + "language_loss": 0.64624578, + "learning_rate": 1.506446264718213e-06, + "loss": 0.72311807, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11962891, + "step": 9846, + "time_per_iteration": 2.6562187671661377 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01268591, + "balance_loss_clip": 0.06275991, + "balance_loss_mlp": 0.01258851, + "epoch": 0.592033669021494, + "flos": 22170567185280.0, + "grad_norm": 1.5989871653678733, + "language_loss": 0.76435882, + "learning_rate": 1.506068857539931e-06, + "loss": 0.84116036, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09735107, + "step": 9847, + "time_per_iteration": 2.5877273082733154 + }, + { + "auxiliary_loss_clip": 0.06420001, + "auxiliary_loss_mlp": 0.01267428, + "balance_loss_clip": 0.06274936, + "balance_loss_mlp": 0.01255477, + "epoch": 0.592093792274162, + "flos": 22717600565760.0, + "grad_norm": 1.9085044692476394, + "language_loss": 0.62601185, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.70288616, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11956787, + "step": 9848, + "time_per_iteration": 3.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.06422321, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01252959, + "epoch": 0.59215391552683, + "flos": 22535605497600.0, + "grad_norm": 2.0066393042716855, + "language_loss": 0.76503384, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.84189683, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11022949, + "step": 9849, + "time_per_iteration": 2.5015931129455566 + }, + { + "auxiliary_loss_clip": 0.06421839, + "auxiliary_loss_mlp": 0.01268681, + "balance_loss_clip": 0.06277264, + "balance_loss_mlp": 0.01256671, + "epoch": 0.592214038779498, + "flos": 24505965457920.0, + "grad_norm": 1.745648722955103, + "language_loss": 0.75836027, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.8352654, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12005615, + "step": 9850, + "time_per_iteration": 2.600179672241211 + }, + { + "auxiliary_loss_clip": 0.06417172, + "auxiliary_loss_mlp": 0.01268411, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01257367, + "epoch": 0.592274162032166, + "flos": 21837156589440.0, + "grad_norm": 1.6508975523953922, + "language_loss": 0.75545883, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.83231473, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1105957, + "step": 9851, + "time_per_iteration": 2.4818735122680664 + }, + { + "auxiliary_loss_clip": 0.06419359, + "auxiliary_loss_mlp": 0.01266702, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01254918, + "epoch": 0.5923342852848339, + "flos": 24615061873920.0, + "grad_norm": 1.7463946887344501, + "language_loss": 0.70506394, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.78192449, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11779785, + "step": 9852, + "time_per_iteration": 2.587822675704956 + }, + { + "auxiliary_loss_clip": 0.06423963, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01261043, + "epoch": 0.5923944085375019, + "flos": 19944307255680.0, + "grad_norm": 1.582534152024796, + "language_loss": 0.80272847, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.87970185, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12347412, + "step": 9853, + "time_per_iteration": 2.4834022521972656 + }, + { + "auxiliary_loss_clip": 0.06412584, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01253985, + "epoch": 0.5924545317901698, + "flos": 28666177948800.0, + "grad_norm": 1.4145056961897013, + "language_loss": 0.67743915, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.75421, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1050415, + "step": 9854, + "time_per_iteration": 3.9716901779174805 + }, + { + "auxiliary_loss_clip": 0.06417395, + "auxiliary_loss_mlp": 0.01268291, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01257514, + "epoch": 0.5925146550428378, + "flos": 19870989333120.0, + "grad_norm": 1.7006302713228023, + "language_loss": 0.89085132, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.96770817, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10772705, + "step": 9855, + "time_per_iteration": 2.54018235206604 + }, + { + "auxiliary_loss_clip": 0.06414687, + "auxiliary_loss_mlp": 0.01266215, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.0125585, + "epoch": 0.5925747782955058, + "flos": 15128510019840.0, + "grad_norm": 1.7501100927117066, + "language_loss": 0.86997199, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.94678098, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10369873, + "step": 9856, + "time_per_iteration": 2.5016441345214844 + }, + { + "auxiliary_loss_clip": 0.06422357, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06275797, + "balance_loss_mlp": 0.01254177, + "epoch": 0.5926349015481738, + "flos": 18411297281280.0, + "grad_norm": 1.7487529922228526, + "language_loss": 0.77790916, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.85478473, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9857, + "time_per_iteration": 2.5232088565826416 + }, + { + "auxiliary_loss_clip": 0.06421745, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06278913, + "balance_loss_mlp": 0.01254689, + "epoch": 0.5926950248008417, + "flos": 23117620757760.0, + "grad_norm": 2.3581492349261524, + "language_loss": 0.65045798, + "learning_rate": 1.501918617901419e-06, + "loss": 0.72733665, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11431885, + "step": 9858, + "time_per_iteration": 4.080450773239136 + }, + { + "auxiliary_loss_clip": 0.06418257, + "auxiliary_loss_mlp": 0.01268065, + "balance_loss_clip": 0.06277932, + "balance_loss_mlp": 0.01256662, + "epoch": 0.5927551480535097, + "flos": 28040753473920.0, + "grad_norm": 1.620046821031832, + "language_loss": 0.77013564, + "learning_rate": 1.501541436426501e-06, + "loss": 0.84699887, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 9859, + "time_per_iteration": 2.5496175289154053 + }, + { + "auxiliary_loss_clip": 0.06422819, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.01260217, + "epoch": 0.5928152713061776, + "flos": 21805109602560.0, + "grad_norm": 2.0806402016169914, + "language_loss": 0.75381404, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.8307631, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11865234, + "step": 9860, + "time_per_iteration": 2.4913806915283203 + }, + { + "auxiliary_loss_clip": 0.06419, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06277152, + "balance_loss_mlp": 0.01257557, + "epoch": 0.5928753945588456, + "flos": 24323802681600.0, + "grad_norm": 1.5719426663731493, + "language_loss": 0.7657429, + "learning_rate": 1.500787130195763e-06, + "loss": 0.84261084, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10235596, + "step": 9861, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.06416907, + "auxiliary_loss_mlp": 0.01266144, + "balance_loss_clip": 0.0627644, + "balance_loss_mlp": 0.01255355, + "epoch": 0.5929355178115137, + "flos": 26471126465280.0, + "grad_norm": 1.7884263747312634, + "language_loss": 0.70557332, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.78240383, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10797119, + "step": 9862, + "time_per_iteration": 2.5269577503204346 + }, + { + "auxiliary_loss_clip": 0.06422247, + "auxiliary_loss_mlp": 0.01262904, + "balance_loss_clip": 0.06279124, + "balance_loss_mlp": 0.01252455, + "epoch": 0.5929956410641816, + "flos": 24971798632320.0, + "grad_norm": 1.7042567790148921, + "language_loss": 0.7816, + "learning_rate": 1.500032899685832e-06, + "loss": 0.85845149, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10449219, + "step": 9863, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06423997, + "auxiliary_loss_mlp": 0.01269473, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01258917, + "epoch": 0.5930557643168496, + "flos": 26214639517440.0, + "grad_norm": 1.987432864542063, + "language_loss": 0.71297693, + "learning_rate": 1.499655812861921e-06, + "loss": 0.78991163, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9864, + "time_per_iteration": 4.022796869277954 + }, + { + "auxiliary_loss_clip": 0.0642028, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06276219, + "balance_loss_mlp": 0.01256578, + "epoch": 0.5931158875695175, + "flos": 27862322204160.0, + "grad_norm": 2.045271412380321, + "language_loss": 0.67615211, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11816406, + "step": 9865, + "time_per_iteration": 2.542477607727051 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06278679, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5931760108221855, + "flos": 15419014525440.0, + "grad_norm": 2.0467341556470906, + "language_loss": 0.78422129, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.86111438, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11535645, + "step": 9866, + "time_per_iteration": 2.5601937770843506 + }, + { + "auxiliary_loss_clip": 0.06417245, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06280121, + "balance_loss_mlp": 0.01256114, + "epoch": 0.5932361340748534, + "flos": 30196043395200.0, + "grad_norm": 1.6991427361252174, + "language_loss": 0.72385359, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.80069637, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10919189, + "step": 9867, + "time_per_iteration": 2.582200527191162 + }, + { + "auxiliary_loss_clip": 0.06421208, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06280105, + "balance_loss_mlp": 0.0125589, + "epoch": 0.5932962573275214, + "flos": 20163841752960.0, + "grad_norm": 1.4126147288957658, + "language_loss": 0.6694321, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.74632645, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.12335205, + "step": 9868, + "time_per_iteration": 2.515268087387085 + }, + { + "auxiliary_loss_clip": 0.06420252, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06275701, + "balance_loss_mlp": 0.01255046, + "epoch": 0.5933563805801894, + "flos": 25452725541120.0, + "grad_norm": 1.59033500525529, + "language_loss": 0.75624323, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.83311105, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11474609, + "step": 9869, + "time_per_iteration": 2.5264642238616943 + }, + { + "auxiliary_loss_clip": 0.06425707, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01254779, + "epoch": 0.5934165038328574, + "flos": 60007971674880.0, + "grad_norm": 1.9233451977688907, + "language_loss": 0.74787021, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.82478619, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11114502, + "step": 9870, + "time_per_iteration": 2.8604302406311035 + }, + { + "auxiliary_loss_clip": 0.06422332, + "auxiliary_loss_mlp": 0.01265883, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01254719, + "epoch": 0.5934766270855253, + "flos": 24426568114560.0, + "grad_norm": 2.4352017906666226, + "language_loss": 0.72491121, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.80179334, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11169434, + "step": 9871, + "time_per_iteration": 2.504990577697754 + }, + { + "auxiliary_loss_clip": 0.06424776, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.0627915, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5935367503381933, + "flos": 23519821155840.0, + "grad_norm": 2.2688315988077736, + "language_loss": 0.74858117, + "learning_rate": 1.496639802503271e-06, + "loss": 0.82548994, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11254883, + "step": 9872, + "time_per_iteration": 2.5957329273223877 + }, + { + "auxiliary_loss_clip": 0.06431574, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06283869, + "balance_loss_mlp": 0.01255517, + "epoch": 0.5935968735908612, + "flos": 18953550979200.0, + "grad_norm": 11.679124704717912, + "language_loss": 0.79073173, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.86772209, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9873, + "time_per_iteration": 2.4669687747955322 + }, + { + "auxiliary_loss_clip": 0.064208, + "auxiliary_loss_mlp": 0.01267302, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5936569968435292, + "flos": 25490432678400.0, + "grad_norm": 1.6349451241448802, + "language_loss": 0.85223055, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.9291116, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11853027, + "step": 9874, + "time_per_iteration": 2.5542490482330322 + }, + { + "auxiliary_loss_clip": 0.06322969, + "auxiliary_loss_mlp": 0.01256968, + "balance_loss_clip": 0.0626381, + "balance_loss_mlp": 0.01255485, + "epoch": 0.5937171200961973, + "flos": 66397364259840.0, + "grad_norm": 0.7006393782995821, + "language_loss": 0.59778833, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.67358768, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01482391, + "step": 9875, + "time_per_iteration": 3.2118613719940186 + }, + { + "auxiliary_loss_clip": 0.06429566, + "auxiliary_loss_mlp": 0.01269748, + "balance_loss_clip": 0.06278439, + "balance_loss_mlp": 0.01257302, + "epoch": 0.5937772433488652, + "flos": 14908849741440.0, + "grad_norm": 2.56951836872527, + "language_loss": 0.78072035, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.85771352, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12451172, + "step": 9876, + "time_per_iteration": 2.488849401473999 + }, + { + "auxiliary_loss_clip": 0.06411201, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01253764, + "epoch": 0.5938373666015332, + "flos": 22567484776320.0, + "grad_norm": 1.5512644369371444, + "language_loss": 0.7603606, + "learning_rate": 1.494755415907243e-06, + "loss": 0.83711803, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10772705, + "step": 9877, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06419433, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5938974898542011, + "flos": 18446572650240.0, + "grad_norm": 2.5934425226299243, + "language_loss": 0.81566256, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.8925426, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11779785, + "step": 9878, + "time_per_iteration": 2.498063802719116 + }, + { + "auxiliary_loss_clip": 0.0642112, + "auxiliary_loss_mlp": 0.0126802, + "balance_loss_clip": 0.06274901, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5939576131068691, + "flos": 45597029293440.0, + "grad_norm": 1.6161422600744055, + "language_loss": 0.71359301, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.79048443, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11029053, + "step": 9879, + "time_per_iteration": 2.7588438987731934 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06272938, + "balance_loss_mlp": 0.01254166, + "epoch": 0.594017736359537, + "flos": 23594648451840.0, + "grad_norm": 1.558347600048505, + "language_loss": 0.57834136, + "learning_rate": 1.493625013742401e-06, + "loss": 0.65514064, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11431885, + "step": 9880, + "time_per_iteration": 2.5477280616760254 + }, + { + "auxiliary_loss_clip": 0.0641728, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258751, + "epoch": 0.594077859612205, + "flos": 29464373543040.0, + "grad_norm": 1.9254284711947285, + "language_loss": 0.78115642, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.85803521, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11846924, + "step": 9881, + "time_per_iteration": 2.596902847290039 + }, + { + "auxiliary_loss_clip": 0.06421138, + "auxiliary_loss_mlp": 0.0126373, + "balance_loss_clip": 0.06276222, + "balance_loss_mlp": 0.01252882, + "epoch": 0.594137982864873, + "flos": 16805682144000.0, + "grad_norm": 2.173471904433077, + "language_loss": 0.83138072, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.90822935, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10839844, + "step": 9882, + "time_per_iteration": 2.483264446258545 + }, + { + "auxiliary_loss_clip": 0.06420217, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06276472, + "balance_loss_mlp": 0.01260318, + "epoch": 0.594198106117541, + "flos": 12755194974720.0, + "grad_norm": 2.093124407330454, + "language_loss": 0.79720157, + "learning_rate": 1.492494784393667e-06, + "loss": 0.87411857, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11175537, + "step": 9883, + "time_per_iteration": 2.5007734298706055 + }, + { + "auxiliary_loss_clip": 0.06424005, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01258097, + "epoch": 0.5942582293702089, + "flos": 21002930939520.0, + "grad_norm": 1.7867915832733556, + "language_loss": 0.7479161, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.82485354, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11645508, + "step": 9884, + "time_per_iteration": 2.5044338703155518 + }, + { + "auxiliary_loss_clip": 0.06419083, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01253549, + "epoch": 0.5943183526228769, + "flos": 28298665941120.0, + "grad_norm": 2.661403390475952, + "language_loss": 0.6670655, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.7439115, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11975098, + "step": 9885, + "time_per_iteration": 2.592233180999756 + }, + { + "auxiliary_loss_clip": 0.06417437, + "auxiliary_loss_mlp": 0.01268066, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256884, + "epoch": 0.5943784758755448, + "flos": 26621829233280.0, + "grad_norm": 2.23147400779812, + "language_loss": 0.76914746, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.84600246, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11181641, + "step": 9886, + "time_per_iteration": 2.5211451053619385 + }, + { + "auxiliary_loss_clip": 0.06318811, + "auxiliary_loss_mlp": 0.01252302, + "balance_loss_clip": 0.06259875, + "balance_loss_mlp": 0.01250785, + "epoch": 0.5944385991282128, + "flos": 64209859643520.0, + "grad_norm": 0.8085761446732002, + "language_loss": 0.64425516, + "learning_rate": 1.490988081420423e-06, + "loss": 0.71996629, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01515961, + "step": 9887, + "time_per_iteration": 4.4216148853302 + }, + { + "auxiliary_loss_clip": 0.06419201, + "auxiliary_loss_mlp": 0.01265936, + "balance_loss_clip": 0.06275857, + "balance_loss_mlp": 0.01254307, + "epoch": 0.5944987223808808, + "flos": 19577885351040.0, + "grad_norm": 1.7443994329425772, + "language_loss": 0.691764, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.76861531, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11633301, + "step": 9888, + "time_per_iteration": 2.558119058609009 + }, + { + "auxiliary_loss_clip": 0.06419526, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06276903, + "balance_loss_mlp": 0.01258773, + "epoch": 0.5945588456335488, + "flos": 26184856590720.0, + "grad_norm": 1.5028057851776446, + "language_loss": 0.7952224, + "learning_rate": 1.490234845687366e-06, + "loss": 0.87211674, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9889, + "time_per_iteration": 2.556455612182617 + }, + { + "auxiliary_loss_clip": 0.06416804, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06273508, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5946189688862168, + "flos": 20452333760640.0, + "grad_norm": 1.5171149074997012, + "language_loss": 0.70987219, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.7867161, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1050415, + "step": 9890, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.06420811, + "auxiliary_loss_mlp": 0.01269509, + "balance_loss_clip": 0.06275058, + "balance_loss_mlp": 0.01258041, + "epoch": 0.5946790921388847, + "flos": 13441568895360.0, + "grad_norm": 1.9815921383050485, + "language_loss": 0.697523, + "learning_rate": 1.489481687275691e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11468506, + "step": 9891, + "time_per_iteration": 2.474308729171753 + }, + { + "auxiliary_loss_clip": 0.06419806, + "auxiliary_loss_mlp": 0.01266103, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01255839, + "epoch": 0.5947392153915527, + "flos": 20418483911040.0, + "grad_norm": 1.7485359350265648, + "language_loss": 0.53498697, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.61184609, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10266113, + "step": 9892, + "time_per_iteration": 2.534221649169922 + }, + { + "auxiliary_loss_clip": 0.06313733, + "auxiliary_loss_mlp": 0.01253007, + "balance_loss_clip": 0.06254771, + "balance_loss_mlp": 0.01251455, + "epoch": 0.5947993386442206, + "flos": 65639181790080.0, + "grad_norm": 0.6531062006914405, + "language_loss": 0.54571462, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.621382, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01551056, + "step": 9893, + "time_per_iteration": 3.1853702068328857 + }, + { + "auxiliary_loss_clip": 0.064126, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01254841, + "epoch": 0.5948594618968887, + "flos": 23189429306880.0, + "grad_norm": 1.6806512476713673, + "language_loss": 0.75017619, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.82695538, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10473633, + "step": 9894, + "time_per_iteration": 4.046506643295288 + }, + { + "auxiliary_loss_clip": 0.06415449, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06273435, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5949195851495566, + "flos": 13631991298560.0, + "grad_norm": 1.844376504699444, + "language_loss": 0.77997828, + "learning_rate": 1.487975602873434e-06, + "loss": 0.8567856, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11444092, + "step": 9895, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.06421571, + "auxiliary_loss_mlp": 0.01264682, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252862, + "epoch": 0.5949797084022246, + "flos": 19756358547840.0, + "grad_norm": 2.034072439962686, + "language_loss": 0.79318964, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.8700521, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11816406, + "step": 9896, + "time_per_iteration": 2.496610164642334 + }, + { + "auxiliary_loss_clip": 0.06420637, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5950398316548925, + "flos": 25780685621760.0, + "grad_norm": 1.4418973411464253, + "language_loss": 0.8331461, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.91000593, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11645508, + "step": 9897, + "time_per_iteration": 2.6055963039398193 + }, + { + "auxiliary_loss_clip": 0.06422365, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06278124, + "balance_loss_mlp": 0.01254012, + "epoch": 0.5950999549075605, + "flos": 23045644500480.0, + "grad_norm": 2.157917564883112, + "language_loss": 0.71089602, + "learning_rate": 1.486846243389939e-06, + "loss": 0.78776848, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10882568, + "step": 9898, + "time_per_iteration": 3.95219087600708 + }, + { + "auxiliary_loss_clip": 0.06426959, + "auxiliary_loss_mlp": 0.01267336, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254897, + "epoch": 0.5951600781602284, + "flos": 32453553697920.0, + "grad_norm": 2.106705884146929, + "language_loss": 0.63699448, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.71393746, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12451172, + "step": 9899, + "time_per_iteration": 2.597721576690674 + }, + { + "auxiliary_loss_clip": 0.06419618, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01259999, + "epoch": 0.5952202014128964, + "flos": 23806887644160.0, + "grad_norm": 1.5164228353921223, + "language_loss": 0.72182071, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.79872268, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10571289, + "step": 9900, + "time_per_iteration": 2.5579535961151123 + }, + { + "auxiliary_loss_clip": 0.06414567, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01255484, + "epoch": 0.5952803246655644, + "flos": 22498778828160.0, + "grad_norm": 1.774545476213964, + "language_loss": 0.84691358, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.9237293, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11523438, + "step": 9901, + "time_per_iteration": 2.532650947570801 + }, + { + "auxiliary_loss_clip": 0.06311554, + "auxiliary_loss_mlp": 0.01252152, + "balance_loss_clip": 0.06252782, + "balance_loss_mlp": 0.01250599, + "epoch": 0.5953404479182324, + "flos": 51250810884480.0, + "grad_norm": 0.7741789718205083, + "language_loss": 0.58204901, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.65768605, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01550293, + "step": 9902, + "time_per_iteration": 2.995508909225464 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01268252, + "balance_loss_clip": 0.06274737, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5954005711709004, + "flos": 23119423620480.0, + "grad_norm": 1.8631652775155525, + "language_loss": 0.77643347, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.85333747, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11608887, + "step": 9903, + "time_per_iteration": 2.526265859603882 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01253691, + "epoch": 0.5954606944235683, + "flos": 35963464250880.0, + "grad_norm": 1.7611381352056217, + "language_loss": 0.78137469, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.85821557, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1138916, + "step": 9904, + "time_per_iteration": 4.04362940788269 + }, + { + "auxiliary_loss_clip": 0.0642558, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06275237, + "balance_loss_mlp": 0.01252619, + "epoch": 0.5955208176762363, + "flos": 30451188677760.0, + "grad_norm": 1.2800711014437993, + "language_loss": 0.72963494, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.80653274, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11566162, + "step": 9905, + "time_per_iteration": 2.630237340927124 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01267213, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255942, + "epoch": 0.5955809409289042, + "flos": 17645987214720.0, + "grad_norm": 2.1926975812717524, + "language_loss": 0.70104027, + "learning_rate": 1.483835475336295e-06, + "loss": 0.77788991, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11273193, + "step": 9906, + "time_per_iteration": 2.5136594772338867 + }, + { + "auxiliary_loss_clip": 0.06423035, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.06276789, + "balance_loss_mlp": 0.01254316, + "epoch": 0.5956410641815723, + "flos": 24286766376960.0, + "grad_norm": 1.7055783949352592, + "language_loss": 0.74976909, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.82666361, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12103271, + "step": 9907, + "time_per_iteration": 2.5186941623687744 + }, + { + "auxiliary_loss_clip": 0.06419441, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.5957011874342402, + "flos": 35742713869440.0, + "grad_norm": 1.9121613205115942, + "language_loss": 0.67437243, + "learning_rate": 1.483082978767595e-06, + "loss": 0.75124806, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11505127, + "step": 9908, + "time_per_iteration": 2.641977310180664 + }, + { + "auxiliary_loss_clip": 0.06417987, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.0627388, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5957613106869082, + "flos": 21250277792640.0, + "grad_norm": 1.9262426125407, + "language_loss": 0.7637223, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.84056735, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1105957, + "step": 9909, + "time_per_iteration": 2.4708259105682373 + }, + { + "auxiliary_loss_clip": 0.06309633, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06250934, + "balance_loss_mlp": 0.01251702, + "epoch": 0.5958214339395761, + "flos": 65959972346880.0, + "grad_norm": 0.8925366465224025, + "language_loss": 0.73392916, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.80955869, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01618958, + "step": 9910, + "time_per_iteration": 3.2132058143615723 + }, + { + "auxiliary_loss_clip": 0.06420797, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01253838, + "epoch": 0.5958815571922441, + "flos": 23224872384000.0, + "grad_norm": 1.906132958424511, + "language_loss": 0.69966662, + "learning_rate": 1.481954380961799e-06, + "loss": 0.77653486, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12194824, + "step": 9911, + "time_per_iteration": 2.5891547203063965 + }, + { + "auxiliary_loss_clip": 0.06430559, + "auxiliary_loss_mlp": 0.01269185, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01256471, + "epoch": 0.595941680444912, + "flos": 16543157702400.0, + "grad_norm": 1.8117496085568294, + "language_loss": 0.65995622, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.73695368, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12713623, + "step": 9912, + "time_per_iteration": 2.5106897354125977 + }, + { + "auxiliary_loss_clip": 0.06418723, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01257681, + "epoch": 0.59600180369758, + "flos": 27826334075520.0, + "grad_norm": 1.8937269812557305, + "language_loss": 0.73603946, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.81292516, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12145996, + "step": 9913, + "time_per_iteration": 2.5845842361450195 + }, + { + "auxiliary_loss_clip": 0.06422256, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254316, + "epoch": 0.596061926950248, + "flos": 29498349173760.0, + "grad_norm": 2.1687664822630692, + "language_loss": 0.79983938, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.87672126, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1161499, + "step": 9914, + "time_per_iteration": 2.677943229675293 + }, + { + "auxiliary_loss_clip": 0.06418366, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01256233, + "epoch": 0.596122050202916, + "flos": 16842424959360.0, + "grad_norm": 1.662988077903936, + "language_loss": 0.67750293, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.75436401, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1149292, + "step": 9915, + "time_per_iteration": 2.527804374694824 + }, + { + "auxiliary_loss_clip": 0.06422138, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274668, + "balance_loss_mlp": 0.01254888, + "epoch": 0.596182173455584, + "flos": 21003056720640.0, + "grad_norm": 1.4119869222981658, + "language_loss": 0.7862711, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.86315531, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11395264, + "step": 9916, + "time_per_iteration": 2.5146098136901855 + }, + { + "auxiliary_loss_clip": 0.06422624, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06273377, + "balance_loss_mlp": 0.01254808, + "epoch": 0.5962422967082519, + "flos": 16070364639360.0, + "grad_norm": 1.8279133386942186, + "language_loss": 0.83302379, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.90991473, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11657715, + "step": 9917, + "time_per_iteration": 2.5148332118988037 + }, + { + "auxiliary_loss_clip": 0.06418853, + "auxiliary_loss_mlp": 0.0126709, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5963024199609199, + "flos": 12171879976320.0, + "grad_norm": 1.6879177929284592, + "language_loss": 0.77521312, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.85207248, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10852051, + "step": 9918, + "time_per_iteration": 2.4897613525390625 + }, + { + "auxiliary_loss_clip": 0.06419399, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274245, + "balance_loss_mlp": 0.01256661, + "epoch": 0.5963625432135878, + "flos": 28081772847360.0, + "grad_norm": 1.5296515450402863, + "language_loss": 0.7930398, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10821533, + "step": 9919, + "time_per_iteration": 2.6023364067077637 + }, + { + "auxiliary_loss_clip": 0.06424099, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06277885, + "balance_loss_mlp": 0.01253434, + "epoch": 0.5964226664662559, + "flos": 19865664599040.0, + "grad_norm": 2.0582572283345537, + "language_loss": 0.77598941, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.85288125, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11645508, + "step": 9920, + "time_per_iteration": 2.499610424041748 + }, + { + "auxiliary_loss_clip": 0.06428593, + "auxiliary_loss_mlp": 0.01269926, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01258124, + "epoch": 0.5964827897189238, + "flos": 12937567386240.0, + "grad_norm": 2.9535163377991647, + "language_loss": 0.8317768, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.90876198, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11798096, + "step": 9921, + "time_per_iteration": 2.5134449005126953 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01268083, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01256389, + "epoch": 0.5965429129715918, + "flos": 18156738977280.0, + "grad_norm": 1.8928045831706461, + "language_loss": 0.80601788, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.88286257, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11694336, + "step": 9922, + "time_per_iteration": 2.4813597202301025 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01269772, + "balance_loss_clip": 0.06271716, + "balance_loss_mlp": 0.01258828, + "epoch": 0.5966030362242597, + "flos": 21769834233600.0, + "grad_norm": 3.055273537118157, + "language_loss": 0.7726593, + "learning_rate": 1.477441761580111e-06, + "loss": 0.84952813, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10949707, + "step": 9923, + "time_per_iteration": 2.5638489723205566 + }, + { + "auxiliary_loss_clip": 0.06424043, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06273048, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5966631594769277, + "flos": 18813204190080.0, + "grad_norm": 1.8922524994378742, + "language_loss": 0.76095831, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.83788568, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.13720703, + "step": 9924, + "time_per_iteration": 2.4999732971191406 + }, + { + "auxiliary_loss_clip": 0.06413831, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01256633, + "epoch": 0.5967232827295956, + "flos": 14069383211520.0, + "grad_norm": 1.7112851014893713, + "language_loss": 0.66830564, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.74512935, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11920166, + "step": 9925, + "time_per_iteration": 2.5139551162719727 + }, + { + "auxiliary_loss_clip": 0.06421202, + "auxiliary_loss_mlp": 0.0126999, + "balance_loss_clip": 0.06279947, + "balance_loss_mlp": 0.01258409, + "epoch": 0.5967834059822636, + "flos": 17243954524800.0, + "grad_norm": 1.861204364539265, + "language_loss": 0.72200316, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.79891503, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11584473, + "step": 9926, + "time_per_iteration": 3.9693188667297363 + }, + { + "auxiliary_loss_clip": 0.06422362, + "auxiliary_loss_mlp": 0.01270656, + "balance_loss_clip": 0.06274919, + "balance_loss_mlp": 0.01258556, + "epoch": 0.5968435292349316, + "flos": 42529751533440.0, + "grad_norm": 1.9299553445847866, + "language_loss": 0.70147216, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.77840233, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12103271, + "step": 9927, + "time_per_iteration": 2.7299752235412598 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06277983, + "balance_loss_mlp": 0.0125467, + "epoch": 0.5969036524875996, + "flos": 37639546272000.0, + "grad_norm": 1.5668113041571725, + "language_loss": 0.63611758, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.71307898, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12213135, + "step": 9928, + "time_per_iteration": 2.7166144847869873 + }, + { + "auxiliary_loss_clip": 0.06418041, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06274209, + "balance_loss_mlp": 0.01254454, + "epoch": 0.5969637757402676, + "flos": 23154992478720.0, + "grad_norm": 2.1979213221977596, + "language_loss": 0.69668317, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.77351892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1105957, + "step": 9929, + "time_per_iteration": 2.51379656791687 + }, + { + "auxiliary_loss_clip": 0.0641327, + "auxiliary_loss_mlp": 0.01270831, + "balance_loss_clip": 0.06274718, + "balance_loss_mlp": 0.01259697, + "epoch": 0.5970238989929355, + "flos": 24027176828160.0, + "grad_norm": 1.690473988948275, + "language_loss": 0.7685796, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.8454206, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11138916, + "step": 9930, + "time_per_iteration": 2.590068817138672 + }, + { + "auxiliary_loss_clip": 0.06427103, + "auxiliary_loss_mlp": 0.01271306, + "balance_loss_clip": 0.06277532, + "balance_loss_mlp": 0.01259206, + "epoch": 0.5970840222456035, + "flos": 19432884660480.0, + "grad_norm": 1.4319660868037594, + "language_loss": 0.69073558, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.76771963, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12097168, + "step": 9931, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.06314774, + "auxiliary_loss_mlp": 0.01252398, + "balance_loss_clip": 0.06255934, + "balance_loss_mlp": 0.01250752, + "epoch": 0.5971441454982714, + "flos": 62993615230080.0, + "grad_norm": 0.8560146868595252, + "language_loss": 0.64260876, + "learning_rate": 1.474059168257065e-06, + "loss": 0.71828043, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01649475, + "step": 9932, + "time_per_iteration": 3.0806198120117188 + }, + { + "auxiliary_loss_clip": 0.06415366, + "auxiliary_loss_mlp": 0.01270842, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01259976, + "epoch": 0.5972042687509395, + "flos": 20272393117440.0, + "grad_norm": 1.7768464871728415, + "language_loss": 0.74403048, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.82089257, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10864258, + "step": 9933, + "time_per_iteration": 3.9164891242980957 + }, + { + "auxiliary_loss_clip": 0.06316046, + "auxiliary_loss_mlp": 0.01258623, + "balance_loss_clip": 0.06257492, + "balance_loss_mlp": 0.01256835, + "epoch": 0.5972643920036074, + "flos": 71675625778560.0, + "grad_norm": 0.666650666050939, + "language_loss": 0.51957405, + "learning_rate": 1.473307699867203e-06, + "loss": 0.59532076, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01785278, + "step": 9934, + "time_per_iteration": 3.263599157333374 + }, + { + "auxiliary_loss_clip": 0.06320157, + "auxiliary_loss_mlp": 0.01253316, + "balance_loss_clip": 0.06261201, + "balance_loss_mlp": 0.01251523, + "epoch": 0.5973245152562754, + "flos": 56910225427200.0, + "grad_norm": 0.8129555240105609, + "language_loss": 0.54121673, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.61695147, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.0178833, + "step": 9935, + "time_per_iteration": 3.13610577583313 + }, + { + "auxiliary_loss_clip": 0.0641949, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06273362, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5973846385089433, + "flos": 24170206947840.0, + "grad_norm": 1.6283043946182527, + "language_loss": 0.65934885, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.7362048, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11883545, + "step": 9936, + "time_per_iteration": 2.5317225456237793 + }, + { + "auxiliary_loss_clip": 0.06426519, + "auxiliary_loss_mlp": 0.01266905, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01256027, + "epoch": 0.5974447617616113, + "flos": 17675476652160.0, + "grad_norm": 1.977673103112211, + "language_loss": 0.67786443, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10882568, + "step": 9937, + "time_per_iteration": 2.51056170463562 + }, + { + "auxiliary_loss_clip": 0.0642201, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06274251, + "balance_loss_mlp": 0.01260073, + "epoch": 0.5975048850142792, + "flos": 22899008655360.0, + "grad_norm": 2.0510739773646853, + "language_loss": 0.77639204, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.85333794, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.12512207, + "step": 9938, + "time_per_iteration": 3.988826274871826 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266797, + "balance_loss_clip": 0.06278642, + "balance_loss_mlp": 0.01255145, + "epoch": 0.5975650082669473, + "flos": 24360042372480.0, + "grad_norm": 1.4729050693859964, + "language_loss": 0.76065636, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.83757758, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11645508, + "step": 9939, + "time_per_iteration": 2.556417226791382 + }, + { + "auxiliary_loss_clip": 0.06427339, + "auxiliary_loss_mlp": 0.01268522, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5976251315196152, + "flos": 20929696871040.0, + "grad_norm": 2.2639919876209498, + "language_loss": 0.68839771, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7653563, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13323975, + "step": 9940, + "time_per_iteration": 2.5342793464660645 + }, + { + "auxiliary_loss_clip": 0.06417148, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01259602, + "epoch": 0.5976852547722832, + "flos": 35853193877760.0, + "grad_norm": 1.2345186889810322, + "language_loss": 0.69966424, + "learning_rate": 1.470678190375664e-06, + "loss": 0.77654147, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 9941, + "time_per_iteration": 2.6775453090667725 + }, + { + "auxiliary_loss_clip": 0.06416304, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06272396, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5977453780249512, + "flos": 12860266394880.0, + "grad_norm": 1.7893879951427467, + "language_loss": 0.77519101, + "learning_rate": 1.470302626336386e-06, + "loss": 0.85200953, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11224365, + "step": 9942, + "time_per_iteration": 2.5630502700805664 + }, + { + "auxiliary_loss_clip": 0.06422595, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.0125478, + "epoch": 0.5978055012776191, + "flos": 20965391510400.0, + "grad_norm": 1.999196380936964, + "language_loss": 0.76118851, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.83808935, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12713623, + "step": 9943, + "time_per_iteration": 3.9001221656799316 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01266022, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01255728, + "epoch": 0.5978656245302871, + "flos": 34066506067200.0, + "grad_norm": 1.9908445339246823, + "language_loss": 0.62211335, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.69895315, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10296631, + "step": 9944, + "time_per_iteration": 2.6546871662139893 + }, + { + "auxiliary_loss_clip": 0.06420632, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06276795, + "balance_loss_mlp": 0.01255333, + "epoch": 0.597925747782955, + "flos": 37381508023680.0, + "grad_norm": 1.6358533401507223, + "language_loss": 0.72854936, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11450195, + "step": 9945, + "time_per_iteration": 2.631753444671631 + }, + { + "auxiliary_loss_clip": 0.06419382, + "auxiliary_loss_mlp": 0.01270411, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01258997, + "epoch": 0.5979858710356231, + "flos": 25381923240960.0, + "grad_norm": 1.7624660559370904, + "language_loss": 0.67425656, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.75115454, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9946, + "time_per_iteration": 2.5964295864105225 + }, + { + "auxiliary_loss_clip": 0.06427635, + "auxiliary_loss_mlp": 0.01269885, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01257678, + "epoch": 0.598045994288291, + "flos": 13703422504320.0, + "grad_norm": 1.825350503307894, + "language_loss": 0.88689518, + "learning_rate": 1.468425107717461e-06, + "loss": 0.96387035, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12194824, + "step": 9947, + "time_per_iteration": 2.47194766998291 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01263948, + "balance_loss_clip": 0.06274778, + "balance_loss_mlp": 0.01253409, + "epoch": 0.598106117540959, + "flos": 21987859357440.0, + "grad_norm": 1.5868690486029033, + "language_loss": 0.71892309, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.79568821, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10540771, + "step": 9948, + "time_per_iteration": 2.519465446472168 + }, + { + "auxiliary_loss_clip": 0.06424625, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06277405, + "balance_loss_mlp": 0.01255015, + "epoch": 0.5981662407936269, + "flos": 20565790588800.0, + "grad_norm": 1.9625714193598658, + "language_loss": 0.89521587, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.97213024, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9949, + "time_per_iteration": 2.512617588043213 + }, + { + "auxiliary_loss_clip": 0.0641937, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06276002, + "balance_loss_mlp": 0.0125524, + "epoch": 0.5982263640462949, + "flos": 14069005868160.0, + "grad_norm": 2.2044341220338484, + "language_loss": 0.70866632, + "learning_rate": 1.467298838320673e-06, + "loss": 0.78552365, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11126709, + "step": 9950, + "time_per_iteration": 2.4983901977539062 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276861, + "balance_loss_mlp": 0.01254103, + "epoch": 0.5982864872989628, + "flos": 17712135613440.0, + "grad_norm": 1.7147951868971159, + "language_loss": 0.7865026, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.86338896, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 9951, + "time_per_iteration": 2.5179500579833984 + }, + { + "auxiliary_loss_clip": 0.06422336, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256215, + "epoch": 0.5983466105516309, + "flos": 16770574483200.0, + "grad_norm": 2.724642744329358, + "language_loss": 0.73936313, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.81627548, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12689209, + "step": 9952, + "time_per_iteration": 2.5671274662017822 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01266742, + "balance_loss_clip": 0.06275067, + "balance_loss_mlp": 0.01254243, + "epoch": 0.5984067338042988, + "flos": 20048078937600.0, + "grad_norm": 1.9086154248374307, + "language_loss": 0.79033399, + "learning_rate": 1.466172750724613e-06, + "loss": 0.86721003, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.12512207, + "step": 9953, + "time_per_iteration": 2.5575039386749268 + }, + { + "auxiliary_loss_clip": 0.06419245, + "auxiliary_loss_mlp": 0.01268437, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5984668570569668, + "flos": 26326586972160.0, + "grad_norm": 1.3586799739820394, + "language_loss": 0.69871485, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.77559167, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1116333, + "step": 9954, + "time_per_iteration": 2.5664639472961426 + }, + { + "auxiliary_loss_clip": 0.06421678, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01253953, + "epoch": 0.5985269803096348, + "flos": 20599808146560.0, + "grad_norm": 3.504460387705041, + "language_loss": 0.73099947, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.80786395, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10803223, + "step": 9955, + "time_per_iteration": 2.5450916290283203 + }, + { + "auxiliary_loss_clip": 0.06417805, + "auxiliary_loss_mlp": 0.01264034, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252632, + "epoch": 0.5985871035623027, + "flos": 26871859416960.0, + "grad_norm": 1.7558609344018261, + "language_loss": 0.68993962, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.76675797, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9956, + "time_per_iteration": 2.596081256866455 + }, + { + "auxiliary_loss_clip": 0.06423829, + "auxiliary_loss_mlp": 0.01264045, + "balance_loss_clip": 0.06278121, + "balance_loss_mlp": 0.01253346, + "epoch": 0.5986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.031153762409854, + "language_loss": 0.74002242, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.81690115, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10699463, + "step": 9957, + "time_per_iteration": 2.5518100261688232 + }, + { + "auxiliary_loss_clip": 0.06412163, + "auxiliary_loss_mlp": 0.01266872, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01256071, + "epoch": 0.5987073500676386, + "flos": 21800371847040.0, + "grad_norm": 1.7255020808995434, + "language_loss": 0.84429491, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.92108524, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10803223, + "step": 9958, + "time_per_iteration": 2.5053975582122803 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267847, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256594, + "epoch": 0.5987674733203067, + "flos": 24320909715840.0, + "grad_norm": 1.676255529467866, + "language_loss": 0.66404957, + "learning_rate": 1.463921122471864e-06, + "loss": 0.74093723, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11248779, + "step": 9959, + "time_per_iteration": 2.577558994293213 + }, + { + "auxiliary_loss_clip": 0.06423216, + "auxiliary_loss_mlp": 0.01263705, + "balance_loss_clip": 0.06278974, + "balance_loss_mlp": 0.01253418, + "epoch": 0.5988275965729746, + "flos": 21325859775360.0, + "grad_norm": 1.5343309289681366, + "language_loss": 0.83860743, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.91547662, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10296631, + "step": 9960, + "time_per_iteration": 2.5171096324920654 + }, + { + "auxiliary_loss_clip": 0.06416292, + "auxiliary_loss_mlp": 0.01266192, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5988877198256426, + "flos": 25124891241600.0, + "grad_norm": 1.3977520489587403, + "language_loss": 0.79645187, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.87327671, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11260986, + "step": 9961, + "time_per_iteration": 2.5664830207824707 + }, + { + "auxiliary_loss_clip": 0.06418522, + "auxiliary_loss_mlp": 0.01263845, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01253337, + "epoch": 0.5989478430783105, + "flos": 26435767242240.0, + "grad_norm": 1.8145848373023497, + "language_loss": 0.67511421, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.75193793, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10516357, + "step": 9962, + "time_per_iteration": 2.5658552646636963 + }, + { + "auxiliary_loss_clip": 0.06419411, + "auxiliary_loss_mlp": 0.01269677, + "balance_loss_clip": 0.06275185, + "balance_loss_mlp": 0.01258698, + "epoch": 0.5990079663309785, + "flos": 25786010355840.0, + "grad_norm": 1.2715525883777674, + "language_loss": 0.74696618, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.82385707, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10980225, + "step": 9963, + "time_per_iteration": 2.5959842205047607 + }, + { + "auxiliary_loss_clip": 0.06414087, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273367, + "balance_loss_mlp": 0.01255494, + "epoch": 0.5990680895836464, + "flos": 36840889480320.0, + "grad_norm": 1.7000475586235915, + "language_loss": 0.68318057, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.75998235, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10601807, + "step": 9964, + "time_per_iteration": 2.652066230773926 + }, + { + "auxiliary_loss_clip": 0.06415234, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01256219, + "epoch": 0.5991282128363145, + "flos": 24140340167040.0, + "grad_norm": 1.9446201927807645, + "language_loss": 0.77307773, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.84989786, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10559082, + "step": 9965, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.0641766, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254076, + "epoch": 0.5991883360889824, + "flos": 10308310444800.0, + "grad_norm": 2.43508720605834, + "language_loss": 0.77253437, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.8493613, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10955811, + "step": 9966, + "time_per_iteration": 3.8983960151672363 + }, + { + "auxiliary_loss_clip": 0.06418956, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06277221, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5992484593416504, + "flos": 23957967755520.0, + "grad_norm": 1.382537362814459, + "language_loss": 0.73829538, + "learning_rate": 1.460920090376422e-06, + "loss": 0.81513047, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10095215, + "step": 9967, + "time_per_iteration": 2.55789852142334 + }, + { + "auxiliary_loss_clip": 0.06430869, + "auxiliary_loss_mlp": 0.01269853, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5993085825943184, + "flos": 11948320483200.0, + "grad_norm": 2.02451624384261, + "language_loss": 0.69043863, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.76744592, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11669922, + "step": 9968, + "time_per_iteration": 2.4782519340515137 + }, + { + "auxiliary_loss_clip": 0.06417669, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06270653, + "balance_loss_mlp": 0.01253926, + "epoch": 0.5993687058469863, + "flos": 19032990249600.0, + "grad_norm": 1.5128271497944086, + "language_loss": 0.79284239, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.86967438, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11608887, + "step": 9969, + "time_per_iteration": 2.5151612758636475 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5994288290996543, + "flos": 14288204949120.0, + "grad_norm": 1.5374697799261579, + "language_loss": 0.81015587, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.88697076, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11383057, + "step": 9970, + "time_per_iteration": 2.5037295818328857 + }, + { + "auxiliary_loss_clip": 0.06425726, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06278643, + "balance_loss_mlp": 0.01253136, + "epoch": 0.5994889523523222, + "flos": 19212385841280.0, + "grad_norm": 1.7784771847806544, + "language_loss": 0.6253432, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.70225984, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.12805176, + "step": 9971, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01252121, + "epoch": 0.5995490756049903, + "flos": 28044401126400.0, + "grad_norm": 1.5809560666799003, + "language_loss": 0.79321986, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.86999381, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 9972, + "time_per_iteration": 2.5908236503601074 + }, + { + "auxiliary_loss_clip": 0.06426332, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06275315, + "balance_loss_mlp": 0.01256595, + "epoch": 0.5996091988576582, + "flos": 29059531741440.0, + "grad_norm": 2.0347749890566957, + "language_loss": 0.76122165, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.83816767, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11663818, + "step": 9973, + "time_per_iteration": 4.03744912147522 + }, + { + "auxiliary_loss_clip": 0.06415765, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5996693221103262, + "flos": 20820306965760.0, + "grad_norm": 8.14230844682113, + "language_loss": 0.65456331, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.73141098, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10961914, + "step": 9974, + "time_per_iteration": 2.545727491378784 + }, + { + "auxiliary_loss_clip": 0.06421987, + "auxiliary_loss_mlp": 0.01267073, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0125607, + "epoch": 0.5997294453629941, + "flos": 23775679198080.0, + "grad_norm": 1.6348808694128185, + "language_loss": 0.74560261, + "learning_rate": 1.457920366566428e-06, + "loss": 0.8224932, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11010742, + "step": 9975, + "time_per_iteration": 2.515960931777954 + }, + { + "auxiliary_loss_clip": 0.06416074, + "auxiliary_loss_mlp": 0.01267839, + "balance_loss_clip": 0.06272042, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5997895686156621, + "flos": 20966397759360.0, + "grad_norm": 1.627086760059136, + "language_loss": 0.77381539, + "learning_rate": 1.457545493441611e-06, + "loss": 0.85065448, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10864258, + "step": 9976, + "time_per_iteration": 2.5143842697143555 + }, + { + "auxiliary_loss_clip": 0.06419265, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06276691, + "balance_loss_mlp": 0.01255162, + "epoch": 0.59984969186833, + "flos": 28372864331520.0, + "grad_norm": 2.2336999868815837, + "language_loss": 0.75166976, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.82852209, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 9977, + "time_per_iteration": 2.5434179306030273 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01258358, + "epoch": 0.5999098151209981, + "flos": 22572641802240.0, + "grad_norm": 1.5140714638849335, + "language_loss": 0.69135988, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.76823664, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11499023, + "step": 9978, + "time_per_iteration": 3.9952354431152344 + }, + { + "auxiliary_loss_clip": 0.06421594, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06274537, + "balance_loss_mlp": 0.01254977, + "epoch": 0.599969938373666, + "flos": 18774365022720.0, + "grad_norm": 1.8838130799328623, + "language_loss": 0.81737733, + "learning_rate": 1.456420997543594e-06, + "loss": 0.89425546, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11236572, + "step": 9979, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06274675, + "balance_loss_mlp": 0.01257239, + "epoch": 0.600030061626334, + "flos": 11331910321920.0, + "grad_norm": 1.7106471218945785, + "language_loss": 0.70199746, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.77879798, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10314941, + "step": 9980, + "time_per_iteration": 2.4757728576660156 + }, + { + "auxiliary_loss_clip": 0.06423149, + "auxiliary_loss_mlp": 0.01269991, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01258463, + "epoch": 0.600090184879002, + "flos": 16583799732480.0, + "grad_norm": 2.417469697653489, + "language_loss": 0.690139, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.76707041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11523438, + "step": 9981, + "time_per_iteration": 2.4791438579559326 + }, + { + "auxiliary_loss_clip": 0.0641709, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.0627474, + "balance_loss_mlp": 0.01255342, + "epoch": 0.6001503081316699, + "flos": 23624641013760.0, + "grad_norm": 3.5503488009813275, + "language_loss": 0.78682542, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.86365318, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10345459, + "step": 9982, + "time_per_iteration": 2.517265796661377 + }, + { + "auxiliary_loss_clip": 0.06418465, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06276916, + "balance_loss_mlp": 0.01258852, + "epoch": 0.6002104313843379, + "flos": 20673922682880.0, + "grad_norm": 1.4834511581102687, + "language_loss": 0.72993171, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.80681169, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10681152, + "step": 9983, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06419442, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.0627455, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6002705546370058, + "flos": 22461742523520.0, + "grad_norm": 1.817313812044092, + "language_loss": 0.77973288, + "learning_rate": 1.454547250154447e-06, + "loss": 0.85658008, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10998535, + "step": 9984, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.06414619, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06271429, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6003306778896739, + "flos": 25199005777920.0, + "grad_norm": 1.5215747487142872, + "language_loss": 0.83512825, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.91195202, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11199951, + "step": 9985, + "time_per_iteration": 2.575650691986084 + }, + { + "auxiliary_loss_clip": 0.06417745, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06274939, + "balance_loss_mlp": 0.01260666, + "epoch": 0.6003908011423418, + "flos": 26694979447680.0, + "grad_norm": 1.7185413261664646, + "language_loss": 0.71617854, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.79306406, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10144043, + "step": 9986, + "time_per_iteration": 2.603126287460327 + }, + { + "auxiliary_loss_clip": 0.06418968, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01259451, + "epoch": 0.6004509243950098, + "flos": 22571677480320.0, + "grad_norm": 1.4916160282529034, + "language_loss": 0.72118956, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.79808438, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11077881, + "step": 9987, + "time_per_iteration": 2.5536653995513916 + }, + { + "auxiliary_loss_clip": 0.06410448, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01257741, + "epoch": 0.6005110476476777, + "flos": 19725443591040.0, + "grad_norm": 1.6002442710001008, + "language_loss": 0.85169375, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.92847788, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10223389, + "step": 9988, + "time_per_iteration": 2.676584482192993 + }, + { + "auxiliary_loss_clip": 0.06413879, + "auxiliary_loss_mlp": 0.01268869, + "balance_loss_clip": 0.06271169, + "balance_loss_mlp": 0.0125783, + "epoch": 0.6005711709003457, + "flos": 17718340815360.0, + "grad_norm": 1.8176771569563623, + "language_loss": 0.66009402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.73692149, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1104126, + "step": 9989, + "time_per_iteration": 2.486422300338745 + }, + { + "auxiliary_loss_clip": 0.06419196, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01256288, + "epoch": 0.6006312941530136, + "flos": 18520267916160.0, + "grad_norm": 1.406905965203465, + "language_loss": 0.80891693, + "learning_rate": 1.452299436003257e-06, + "loss": 0.88577515, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10327148, + "step": 9990, + "time_per_iteration": 2.535477876663208 + }, + { + "auxiliary_loss_clip": 0.06421524, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06275146, + "balance_loss_mlp": 0.01261829, + "epoch": 0.6006914174056817, + "flos": 21396117024000.0, + "grad_norm": 2.6934120952656557, + "language_loss": 0.82880741, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.9057526, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11157227, + "step": 9991, + "time_per_iteration": 2.518101215362549 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06272052, + "balance_loss_mlp": 0.01256012, + "epoch": 0.6007515406583496, + "flos": 12755488464000.0, + "grad_norm": 1.8815822669797526, + "language_loss": 0.83029675, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.90708888, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11096191, + "step": 9992, + "time_per_iteration": 2.521474599838257 + }, + { + "auxiliary_loss_clip": 0.06415074, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06272477, + "balance_loss_mlp": 0.01256098, + "epoch": 0.6008116639110176, + "flos": 19212679330560.0, + "grad_norm": 1.7865103371256597, + "language_loss": 0.66380614, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.74063051, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11260986, + "step": 9993, + "time_per_iteration": 2.4865942001342773 + }, + { + "auxiliary_loss_clip": 0.0641458, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.0125633, + "epoch": 0.6008717871636855, + "flos": 17060953207680.0, + "grad_norm": 2.3852752129116115, + "language_loss": 0.81380951, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.89062685, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1083374, + "step": 9994, + "time_per_iteration": 2.500990390777588 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009319104163535, + "flos": 20304188542080.0, + "grad_norm": 1.763050873993328, + "language_loss": 0.72585195, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.8026247, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10229492, + "step": 9995, + "time_per_iteration": 2.482269287109375 + }, + { + "auxiliary_loss_clip": 0.06416491, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06272282, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009920336690215, + "flos": 21843487572480.0, + "grad_norm": 1.6604568353476683, + "language_loss": 0.81016338, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.88700801, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10736084, + "step": 9996, + "time_per_iteration": 2.5466809272766113 + }, + { + "auxiliary_loss_clip": 0.06416655, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.06274925, + "balance_loss_mlp": 0.01259772, + "epoch": 0.6010521569216895, + "flos": 22601795823360.0, + "grad_norm": 1.669746646683285, + "language_loss": 0.79055232, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.86742181, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10522461, + "step": 9997, + "time_per_iteration": 2.489703416824341 + }, + { + "auxiliary_loss_clip": 0.06421417, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.0125496, + "epoch": 0.6011122801743575, + "flos": 19177697450880.0, + "grad_norm": 1.7167006806270684, + "language_loss": 0.72813851, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.80501544, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11315918, + "step": 9998, + "time_per_iteration": 2.5477771759033203 + }, + { + "auxiliary_loss_clip": 0.06413899, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6011724034270254, + "flos": 25017094563840.0, + "grad_norm": 1.4177411729498055, + "language_loss": 0.72547859, + "learning_rate": 1.448929117633027e-06, + "loss": 0.80226737, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9999, + "time_per_iteration": 2.658071517944336 + }, + { + "auxiliary_loss_clip": 0.06419925, + "auxiliary_loss_mlp": 0.0126529, + "balance_loss_clip": 0.06273222, + "balance_loss_mlp": 0.01253948, + "epoch": 0.6012325266796934, + "flos": 21803935645440.0, + "grad_norm": 1.3735035595460474, + "language_loss": 0.78419137, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.86104351, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11346436, + "step": 10000, + "time_per_iteration": 2.6216328144073486 + }, + { + "auxiliary_loss_clip": 0.06423375, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01253974, + "epoch": 0.6012926499323613, + "flos": 19579059308160.0, + "grad_norm": 2.6942443051056797, + "language_loss": 0.77449071, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.85138798, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1239624, + "step": 10001, + "time_per_iteration": 2.4916481971740723 + }, + { + "auxiliary_loss_clip": 0.06419365, + "auxiliary_loss_mlp": 0.01264494, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01253479, + "epoch": 0.6013527731850293, + "flos": 34869439416960.0, + "grad_norm": 2.005983259780714, + "language_loss": 0.59280682, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.66964543, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11004639, + "step": 10002, + "time_per_iteration": 2.6645169258117676 + }, + { + "auxiliary_loss_clip": 0.06426313, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.06280068, + "balance_loss_mlp": 0.01255636, + "epoch": 0.6014128964376972, + "flos": 23298190306560.0, + "grad_norm": 1.4832163301855164, + "language_loss": 0.78208435, + "learning_rate": 1.447431741055314e-06, + "loss": 0.85901594, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11199951, + "step": 10003, + "time_per_iteration": 2.5180611610412598 + }, + { + "auxiliary_loss_clip": 0.0641861, + "auxiliary_loss_mlp": 0.01265947, + "balance_loss_clip": 0.06273924, + "balance_loss_mlp": 0.01254503, + "epoch": 0.6014730196903653, + "flos": 24826839868800.0, + "grad_norm": 2.3891485516500857, + "language_loss": 0.77473211, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.8515777, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11437988, + "step": 10004, + "time_per_iteration": 2.6330173015594482 + }, + { + "auxiliary_loss_clip": 0.06419056, + "auxiliary_loss_mlp": 0.01264798, + "balance_loss_clip": 0.06274185, + "balance_loss_mlp": 0.01253622, + "epoch": 0.6015331429430332, + "flos": 23119046277120.0, + "grad_norm": 1.439097178617253, + "language_loss": 0.72748709, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.80432558, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11187744, + "step": 10005, + "time_per_iteration": 3.9784722328186035 + }, + { + "auxiliary_loss_clip": 0.06408843, + "auxiliary_loss_mlp": 0.01267392, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01257415, + "epoch": 0.6015932661957012, + "flos": 19206222566400.0, + "grad_norm": 2.0810783182593453, + "language_loss": 0.75111496, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.82787728, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10006, + "time_per_iteration": 2.479973793029785 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06270643, + "balance_loss_mlp": 0.01255659, + "epoch": 0.6016533894483691, + "flos": 18119451110400.0, + "grad_norm": 1.7404924752402045, + "language_loss": 0.74258769, + "learning_rate": 1.445934699732685e-06, + "loss": 0.8193953, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1114502, + "step": 10007, + "time_per_iteration": 2.514868974685669 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01265594, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01254161, + "epoch": 0.6017135127010371, + "flos": 16222492926720.0, + "grad_norm": 1.6904603378944318, + "language_loss": 0.70442504, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.78122854, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11425781, + "step": 10008, + "time_per_iteration": 2.491718053817749 + }, + { + "auxiliary_loss_clip": 0.0641681, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01254291, + "epoch": 0.6017736359537051, + "flos": 23451576405120.0, + "grad_norm": 1.626126690886893, + "language_loss": 0.7634151, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.84022784, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10186768, + "step": 10009, + "time_per_iteration": 2.599497079849243 + }, + { + "auxiliary_loss_clip": 0.06414296, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.01256455, + "epoch": 0.601833759206373, + "flos": 23520869331840.0, + "grad_norm": 2.016447610820272, + "language_loss": 0.73958981, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.8164103, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11279297, + "step": 10010, + "time_per_iteration": 2.542102098464966 + }, + { + "auxiliary_loss_clip": 0.06320257, + "auxiliary_loss_mlp": 0.01264863, + "balance_loss_clip": 0.06261265, + "balance_loss_mlp": 0.01263333, + "epoch": 0.6018938824590411, + "flos": 64013846215680.0, + "grad_norm": 0.9512553520354263, + "language_loss": 0.55134046, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.6271916, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.59130859, + "router_z_loss_mlp": 0.01529694, + "step": 10011, + "time_per_iteration": 3.219438076019287 + }, + { + "auxiliary_loss_clip": 0.064165, + "auxiliary_loss_mlp": 0.01266395, + "balance_loss_clip": 0.06272937, + "balance_loss_mlp": 0.01256233, + "epoch": 0.601954005711709, + "flos": 34648311692160.0, + "grad_norm": 1.3620910382501825, + "language_loss": 0.6241864, + "learning_rate": 1.44406387091556e-06, + "loss": 0.70101535, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1015625, + "step": 10012, + "time_per_iteration": 4.187492609024048 + }, + { + "auxiliary_loss_clip": 0.06412341, + "auxiliary_loss_mlp": 0.01261432, + "balance_loss_clip": 0.06271702, + "balance_loss_mlp": 0.0125155, + "epoch": 0.602014128964377, + "flos": 19433094295680.0, + "grad_norm": 1.6346863878236784, + "language_loss": 0.75188845, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.82862616, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09881592, + "step": 10013, + "time_per_iteration": 2.4897818565368652 + }, + { + "auxiliary_loss_clip": 0.06409096, + "auxiliary_loss_mlp": 0.01262449, + "balance_loss_clip": 0.06273073, + "balance_loss_mlp": 0.01252823, + "epoch": 0.6020742522170449, + "flos": 28336876202880.0, + "grad_norm": 1.4752372512859242, + "language_loss": 0.81565046, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.89236587, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09625244, + "step": 10014, + "time_per_iteration": 2.5903513431549072 + }, + { + "auxiliary_loss_clip": 0.06408108, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01252617, + "epoch": 0.6021343754697129, + "flos": 22753588694400.0, + "grad_norm": 1.6084117246958012, + "language_loss": 0.72432387, + "learning_rate": 1.442941626485624e-06, + "loss": 0.80103159, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10046387, + "step": 10015, + "time_per_iteration": 2.5320956707000732 + }, + { + "auxiliary_loss_clip": 0.06313504, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06254423, + "balance_loss_mlp": 0.01269587, + "epoch": 0.6021944987223808, + "flos": 65769885360000.0, + "grad_norm": 0.8212846281484271, + "language_loss": 0.54902303, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.62486923, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01528168, + "step": 10016, + "time_per_iteration": 3.0691990852355957 + }, + { + "auxiliary_loss_clip": 0.06413935, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06274504, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6022546219750489, + "flos": 16110377763840.0, + "grad_norm": 1.6476177539901398, + "language_loss": 0.82975459, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.90655655, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10498047, + "step": 10017, + "time_per_iteration": 4.000306606292725 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06276649, + "balance_loss_mlp": 0.01257465, + "epoch": 0.6023147452277168, + "flos": 25518328888320.0, + "grad_norm": 1.7212842530240955, + "language_loss": 0.83736604, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.91417325, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10198975, + "step": 10018, + "time_per_iteration": 2.5354957580566406 + }, + { + "auxiliary_loss_clip": 0.06423128, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6023748684803848, + "flos": 22642353999360.0, + "grad_norm": 1.5941982193166335, + "language_loss": 0.78464353, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.86153316, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11096191, + "step": 10019, + "time_per_iteration": 2.534315586090088 + }, + { + "auxiliary_loss_clip": 0.06414038, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01253433, + "epoch": 0.6024349917330527, + "flos": 26217113212800.0, + "grad_norm": 1.7295998133508477, + "language_loss": 0.7397396, + "learning_rate": 1.441071641765681e-06, + "loss": 0.81652176, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10020, + "time_per_iteration": 2.5745153427124023 + }, + { + "auxiliary_loss_clip": 0.06419009, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06276802, + "balance_loss_mlp": 0.01256875, + "epoch": 0.6024951149857207, + "flos": 21258160076160.0, + "grad_norm": 1.6276524527254101, + "language_loss": 0.64517641, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.72203767, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10247803, + "step": 10021, + "time_per_iteration": 2.5457210540771484 + }, + { + "auxiliary_loss_clip": 0.06415432, + "auxiliary_loss_mlp": 0.01267969, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01256776, + "epoch": 0.6025552382383887, + "flos": 26950795562880.0, + "grad_norm": 1.4058190289621155, + "language_loss": 0.80931878, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11187744, + "step": 10022, + "time_per_iteration": 4.0118248462677 + }, + { + "auxiliary_loss_clip": 0.06419462, + "auxiliary_loss_mlp": 0.0126571, + "balance_loss_clip": 0.06273965, + "balance_loss_mlp": 0.01255089, + "epoch": 0.6026153614910567, + "flos": 31692142846080.0, + "grad_norm": 1.4147504892998892, + "language_loss": 0.66787559, + "learning_rate": 1.439949905155693e-06, + "loss": 0.74472731, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10620117, + "step": 10023, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0642107, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06277968, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6026754847437247, + "flos": 29320085612160.0, + "grad_norm": 1.6857710992723132, + "language_loss": 0.73865843, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.81555492, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11029053, + "step": 10024, + "time_per_iteration": 2.5943942070007324 + }, + { + "auxiliary_loss_clip": 0.06414223, + "auxiliary_loss_mlp": 0.01264046, + "balance_loss_clip": 0.06273946, + "balance_loss_mlp": 0.01253454, + "epoch": 0.6027356079963926, + "flos": 23593558348800.0, + "grad_norm": 1.5719504936966129, + "language_loss": 0.72838885, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.80517155, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.105896, + "step": 10025, + "time_per_iteration": 2.5456719398498535 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.0126511, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01253469, + "epoch": 0.6027957312490606, + "flos": 20820055403520.0, + "grad_norm": 2.0657942826528526, + "language_loss": 0.67852134, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.75539172, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11651611, + "step": 10026, + "time_per_iteration": 2.598649024963379 + }, + { + "auxiliary_loss_clip": 0.06409953, + "auxiliary_loss_mlp": 0.0126467, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01254794, + "epoch": 0.6028558545017285, + "flos": 19941540071040.0, + "grad_norm": 1.6702920817519378, + "language_loss": 0.80409044, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.88083661, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 10027, + "time_per_iteration": 2.4931211471557617 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265388, + "balance_loss_clip": 0.06276809, + "balance_loss_mlp": 0.01254516, + "epoch": 0.6029159777543965, + "flos": 22827535522560.0, + "grad_norm": 2.164274421178336, + "language_loss": 0.71328938, + "learning_rate": 1.438080769071171e-06, + "loss": 0.79015452, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10876465, + "step": 10028, + "time_per_iteration": 2.5468251705169678 + }, + { + "auxiliary_loss_clip": 0.06418602, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256911, + "epoch": 0.6029761010070644, + "flos": 23594103400320.0, + "grad_norm": 1.6575222347679248, + "language_loss": 0.84050506, + "learning_rate": 1.437707005721669e-06, + "loss": 0.91737038, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11016846, + "step": 10029, + "time_per_iteration": 2.529097557067871 + }, + { + "auxiliary_loss_clip": 0.06414534, + "auxiliary_loss_mlp": 0.01271064, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01261146, + "epoch": 0.6030362242597325, + "flos": 13667518229760.0, + "grad_norm": 1.639514659773033, + "language_loss": 0.800816, + "learning_rate": 1.437333263694373e-06, + "loss": 0.8776719, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09918213, + "step": 10030, + "time_per_iteration": 2.527984619140625 + }, + { + "auxiliary_loss_clip": 0.06420292, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277453, + "balance_loss_mlp": 0.01256595, + "epoch": 0.6030963475124004, + "flos": 24429293372160.0, + "grad_norm": 1.55352827539933, + "language_loss": 0.71218026, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.7890541, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.1050415, + "step": 10031, + "time_per_iteration": 2.5585272312164307 + }, + { + "auxiliary_loss_clip": 0.06422323, + "auxiliary_loss_mlp": 0.01265322, + "balance_loss_clip": 0.06275461, + "balance_loss_mlp": 0.01253592, + "epoch": 0.6031564707650684, + "flos": 29651944907520.0, + "grad_norm": 1.5252565411095604, + "language_loss": 0.73936534, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.81624174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11730957, + "step": 10032, + "time_per_iteration": 2.6043312549591064 + }, + { + "auxiliary_loss_clip": 0.06425112, + "auxiliary_loss_mlp": 0.01269372, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01258124, + "epoch": 0.6032165940177363, + "flos": 16624525616640.0, + "grad_norm": 1.652390402199518, + "language_loss": 0.68466848, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.76161331, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11242676, + "step": 10033, + "time_per_iteration": 2.4788658618927 + }, + { + "auxiliary_loss_clip": 0.06415801, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01256676, + "epoch": 0.6032767172704043, + "flos": 17493020386560.0, + "grad_norm": 2.062963272365632, + "language_loss": 0.76036859, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.83720237, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10900879, + "step": 10034, + "time_per_iteration": 2.5080766677856445 + }, + { + "auxiliary_loss_clip": 0.06421614, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01257668, + "epoch": 0.6033368405230723, + "flos": 26840105919360.0, + "grad_norm": 1.6546972875454138, + "language_loss": 0.74774975, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.82465017, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10754395, + "step": 10035, + "time_per_iteration": 2.563206434249878 + }, + { + "auxiliary_loss_clip": 0.06417766, + "auxiliary_loss_mlp": 0.0126329, + "balance_loss_clip": 0.06278257, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6033969637757403, + "flos": 16915575173760.0, + "grad_norm": 1.5348173305795916, + "language_loss": 0.86666334, + "learning_rate": 1.435091260090536e-06, + "loss": 0.94347388, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10036, + "time_per_iteration": 2.5237104892730713 + }, + { + "auxiliary_loss_clip": 0.06422649, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01253641, + "epoch": 0.6034570870284083, + "flos": 22936757719680.0, + "grad_norm": 1.8203362960867906, + "language_loss": 0.70372736, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.78060424, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11401367, + "step": 10037, + "time_per_iteration": 2.5395092964172363 + }, + { + "auxiliary_loss_clip": 0.06418501, + "auxiliary_loss_mlp": 0.0126923, + "balance_loss_clip": 0.06279185, + "balance_loss_mlp": 0.01258603, + "epoch": 0.6035172102810762, + "flos": 23372807967360.0, + "grad_norm": 1.59892513624744, + "language_loss": 0.85074937, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.92762661, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10626221, + "step": 10038, + "time_per_iteration": 2.5844480991363525 + }, + { + "auxiliary_loss_clip": 0.06419212, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01257786, + "epoch": 0.6035773335337442, + "flos": 20893457180160.0, + "grad_norm": 2.8819957775512757, + "language_loss": 0.77070892, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.8475827, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1038208, + "step": 10039, + "time_per_iteration": 2.5122628211975098 + }, + { + "auxiliary_loss_clip": 0.06415309, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01253981, + "epoch": 0.6036374567864121, + "flos": 24943231589760.0, + "grad_norm": 1.5604135097118987, + "language_loss": 0.71224856, + "learning_rate": 1.433597019260301e-06, + "loss": 0.78904456, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10296631, + "step": 10040, + "time_per_iteration": 2.571869373321533 + }, + { + "auxiliary_loss_clip": 0.06419596, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06274244, + "balance_loss_mlp": 0.01256627, + "epoch": 0.6036975800390801, + "flos": 23154866697600.0, + "grad_norm": 1.8943612239225145, + "language_loss": 0.7865687, + "learning_rate": 1.433223512712475e-06, + "loss": 0.86344838, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11749268, + "step": 10041, + "time_per_iteration": 2.4987337589263916 + }, + { + "auxiliary_loss_clip": 0.0641794, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.0627731, + "balance_loss_mlp": 0.01254026, + "epoch": 0.603757703291748, + "flos": 18666610272000.0, + "grad_norm": 4.973303913397253, + "language_loss": 0.75757015, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.83439338, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10357666, + "step": 10042, + "time_per_iteration": 2.5307700634002686 + }, + { + "auxiliary_loss_clip": 0.06414665, + "auxiliary_loss_mlp": 0.0126551, + "balance_loss_clip": 0.06273496, + "balance_loss_mlp": 0.01254477, + "epoch": 0.6038178265444161, + "flos": 19688700775680.0, + "grad_norm": 1.7644311631125091, + "language_loss": 0.84805411, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.92485589, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1104126, + "step": 10043, + "time_per_iteration": 2.483207941055298 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272442, + "balance_loss_mlp": 0.01256034, + "epoch": 0.603877949797084, + "flos": 22644869621760.0, + "grad_norm": 1.873589684997381, + "language_loss": 0.69873232, + "learning_rate": 1.432103122078974e-06, + "loss": 0.77559316, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1161499, + "step": 10044, + "time_per_iteration": 3.940486192703247 + }, + { + "auxiliary_loss_clip": 0.0642198, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01254168, + "epoch": 0.603938073049752, + "flos": 25455031528320.0, + "grad_norm": 2.2351691288080966, + "language_loss": 0.77851117, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.85538936, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11669922, + "step": 10045, + "time_per_iteration": 2.5411202907562256 + }, + { + "auxiliary_loss_clip": 0.06414884, + "auxiliary_loss_mlp": 0.01268718, + "balance_loss_clip": 0.06274995, + "balance_loss_mlp": 0.01257697, + "epoch": 0.6039981963024199, + "flos": 22345686218880.0, + "grad_norm": 1.7669017569149148, + "language_loss": 0.77354729, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.85038328, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11010742, + "step": 10046, + "time_per_iteration": 2.6118433475494385 + }, + { + "auxiliary_loss_clip": 0.064179, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06273997, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6040583195550879, + "flos": 20709239978880.0, + "grad_norm": 1.4772024450084065, + "language_loss": 0.87242824, + "learning_rate": 1.430982925257827e-06, + "loss": 0.94927108, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.09960938, + "step": 10047, + "time_per_iteration": 2.5964560508728027 + }, + { + "auxiliary_loss_clip": 0.06416798, + "auxiliary_loss_mlp": 0.01263003, + "balance_loss_clip": 0.06279427, + "balance_loss_mlp": 0.01252459, + "epoch": 0.604118442807756, + "flos": 27170623549440.0, + "grad_norm": 1.57099000963109, + "language_loss": 0.76137155, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.83816957, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10540771, + "step": 10048, + "time_per_iteration": 2.619131326675415 + }, + { + "auxiliary_loss_clip": 0.06423929, + "auxiliary_loss_mlp": 0.01267255, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125512, + "epoch": 0.6041785660604239, + "flos": 30889125642240.0, + "grad_norm": 2.0836935767176508, + "language_loss": 0.66702586, + "learning_rate": 1.430236235239386e-06, + "loss": 0.74393767, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12121582, + "step": 10049, + "time_per_iteration": 2.650125741958618 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01268699, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.0125769, + "epoch": 0.6042386893130919, + "flos": 19944391109760.0, + "grad_norm": 1.425076043351067, + "language_loss": 0.6651637, + "learning_rate": 1.429862922631336e-06, + "loss": 0.74199045, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10050, + "time_per_iteration": 2.523010015487671 + }, + { + "auxiliary_loss_clip": 0.06421351, + "auxiliary_loss_mlp": 0.01263894, + "balance_loss_clip": 0.06279106, + "balance_loss_mlp": 0.01252956, + "epoch": 0.6042988125657598, + "flos": 32424106187520.0, + "grad_norm": 1.5652221823172618, + "language_loss": 0.70055592, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.7774083, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10949707, + "step": 10051, + "time_per_iteration": 2.6328225135803223 + }, + { + "auxiliary_loss_clip": 0.06413503, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.062729, + "balance_loss_mlp": 0.01253167, + "epoch": 0.6043589358184278, + "flos": 17426578498560.0, + "grad_norm": 1.814191650563656, + "language_loss": 0.64989793, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.72668123, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11651611, + "step": 10052, + "time_per_iteration": 4.032447814941406 + }, + { + "auxiliary_loss_clip": 0.06422505, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06280071, + "balance_loss_mlp": 0.01259275, + "epoch": 0.6044190590710957, + "flos": 27680243281920.0, + "grad_norm": 1.5013537444726899, + "language_loss": 0.69046491, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.76739454, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11187744, + "step": 10053, + "time_per_iteration": 2.5837066173553467 + }, + { + "auxiliary_loss_clip": 0.06317958, + "auxiliary_loss_mlp": 0.01251886, + "balance_loss_clip": 0.06259381, + "balance_loss_mlp": 0.01250314, + "epoch": 0.6044791823237637, + "flos": 65334422090880.0, + "grad_norm": 0.7098963484594624, + "language_loss": 0.60469133, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.68038976, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01572418, + "step": 10054, + "time_per_iteration": 3.282451868057251 + }, + { + "auxiliary_loss_clip": 0.0641373, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06275851, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6045393055764317, + "flos": 24498208955520.0, + "grad_norm": 1.4963816601479185, + "language_loss": 0.85832298, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.93512046, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10998535, + "step": 10055, + "time_per_iteration": 2.5359747409820557 + }, + { + "auxiliary_loss_clip": 0.06417194, + "auxiliary_loss_mlp": 0.01268307, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6045994288290997, + "flos": 19058999742720.0, + "grad_norm": 2.4042532312332243, + "language_loss": 0.74155682, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.81841183, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11895752, + "step": 10056, + "time_per_iteration": 2.5254933834075928 + }, + { + "auxiliary_loss_clip": 0.06408785, + "auxiliary_loss_mlp": 0.01263059, + "balance_loss_clip": 0.06271578, + "balance_loss_mlp": 0.01252926, + "epoch": 0.6046595520817676, + "flos": 26583660898560.0, + "grad_norm": 1.6233300173420022, + "language_loss": 0.80582207, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.88254052, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10137939, + "step": 10057, + "time_per_iteration": 3.975159168243408 + }, + { + "auxiliary_loss_clip": 0.06411809, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06273948, + "balance_loss_mlp": 0.01256557, + "epoch": 0.6047196753344356, + "flos": 13586150315520.0, + "grad_norm": 2.1360006581590727, + "language_loss": 0.751284, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.82807666, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10900879, + "step": 10058, + "time_per_iteration": 2.519793748855591 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01253967, + "epoch": 0.6047797985871035, + "flos": 25527552837120.0, + "grad_norm": 1.8108696315105546, + "language_loss": 0.70813042, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.78491068, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11181641, + "step": 10059, + "time_per_iteration": 2.5327351093292236 + }, + { + "auxiliary_loss_clip": 0.06417379, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.01253538, + "epoch": 0.6048399218397715, + "flos": 20526112880640.0, + "grad_norm": 1.5165980047863354, + "language_loss": 0.76569366, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.84251177, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10888672, + "step": 10060, + "time_per_iteration": 2.5674891471862793 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06271071, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6049000450924396, + "flos": 20414416988160.0, + "grad_norm": 1.961791815817934, + "language_loss": 0.73817396, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.81496191, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10418701, + "step": 10061, + "time_per_iteration": 2.4917149543762207 + }, + { + "auxiliary_loss_clip": 0.06419303, + "auxiliary_loss_mlp": 0.0126307, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01252497, + "epoch": 0.6049601683451075, + "flos": 20747743729920.0, + "grad_norm": 1.6943031579927808, + "language_loss": 0.67628121, + "learning_rate": 1.425384861715639e-06, + "loss": 0.75310493, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10565186, + "step": 10062, + "time_per_iteration": 3.9096996784210205 + }, + { + "auxiliary_loss_clip": 0.06412483, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01254246, + "epoch": 0.6050202915977755, + "flos": 20089140238080.0, + "grad_norm": 1.9017616396263957, + "language_loss": 0.71490061, + "learning_rate": 1.425011831266978e-06, + "loss": 0.79168195, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 10063, + "time_per_iteration": 2.532278299331665 + }, + { + "auxiliary_loss_clip": 0.06410936, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01253858, + "epoch": 0.6050804148504434, + "flos": 15966257541120.0, + "grad_norm": 1.545014679780644, + "language_loss": 0.84818602, + "learning_rate": 1.424638822621926e-06, + "loss": 0.92493832, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10430908, + "step": 10064, + "time_per_iteration": 2.4977669715881348 + }, + { + "auxiliary_loss_clip": 0.06412817, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 0.06272112, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6051405381031114, + "flos": 17462315064960.0, + "grad_norm": 2.0946043423181293, + "language_loss": 0.801759, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.87853146, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10870361, + "step": 10065, + "time_per_iteration": 2.563521146774292 + }, + { + "auxiliary_loss_clip": 0.06424835, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06278696, + "balance_loss_mlp": 0.0125371, + "epoch": 0.6052006613557793, + "flos": 11404808974080.0, + "grad_norm": 1.8141288170700578, + "language_loss": 0.7897802, + "learning_rate": 1.423892870799226e-06, + "loss": 0.86667973, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11413574, + "step": 10066, + "time_per_iteration": 2.4816365242004395 + }, + { + "auxiliary_loss_clip": 0.0641356, + "auxiliary_loss_mlp": 0.0126889, + "balance_loss_clip": 0.06272712, + "balance_loss_mlp": 0.01257857, + "epoch": 0.6052607846084473, + "flos": 24757421160960.0, + "grad_norm": 1.6017965029602446, + "language_loss": 0.73526549, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.81208998, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1104126, + "step": 10067, + "time_per_iteration": 2.585381269454956 + }, + { + "auxiliary_loss_clip": 0.06416602, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06275155, + "balance_loss_mlp": 0.01255646, + "epoch": 0.6053209078611153, + "flos": 20747492167680.0, + "grad_norm": 1.2388364270447627, + "language_loss": 0.68978894, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.76662529, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.1138916, + "step": 10068, + "time_per_iteration": 2.533571243286133 + }, + { + "auxiliary_loss_clip": 0.06416383, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6053810311137833, + "flos": 18959169202560.0, + "grad_norm": 2.164785155160147, + "language_loss": 0.87104344, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.94786203, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.1060791, + "step": 10069, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274873, + "balance_loss_mlp": 0.01255259, + "epoch": 0.6054411543664512, + "flos": 23957883901440.0, + "grad_norm": 1.623757415978513, + "language_loss": 0.83496463, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.91176546, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10662842, + "step": 10070, + "time_per_iteration": 2.528780221939087 + }, + { + "auxiliary_loss_clip": 0.06416136, + "auxiliary_loss_mlp": 0.0126614, + "balance_loss_clip": 0.06271877, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6055012776191192, + "flos": 20600101635840.0, + "grad_norm": 1.4904746237370996, + "language_loss": 0.86489964, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.94172239, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10858154, + "step": 10071, + "time_per_iteration": 2.538874387741089 + }, + { + "auxiliary_loss_clip": 0.06422232, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01259129, + "epoch": 0.6055614008717871, + "flos": 30305768716800.0, + "grad_norm": 1.8258498039752344, + "language_loss": 0.77371645, + "learning_rate": 1.421655540088603e-06, + "loss": 0.85065246, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12231445, + "step": 10072, + "time_per_iteration": 2.5658671855926514 + }, + { + "auxiliary_loss_clip": 0.06419331, + "auxiliary_loss_mlp": 0.01267468, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01255523, + "epoch": 0.6056215241244551, + "flos": 27132245579520.0, + "grad_norm": 1.5250709401817175, + "language_loss": 0.74363017, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.82049823, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11962891, + "step": 10073, + "time_per_iteration": 2.5838263034820557 + }, + { + "auxiliary_loss_clip": 0.06330025, + "auxiliary_loss_mlp": 0.01255009, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6056816473771232, + "flos": 56023073124480.0, + "grad_norm": 0.7392641743542041, + "language_loss": 0.55267042, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.62852079, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.58740234, + "router_z_loss_mlp": 0.01560211, + "step": 10074, + "time_per_iteration": 3.192260503768921 + }, + { + "auxiliary_loss_clip": 0.06416894, + "auxiliary_loss_mlp": 0.01266981, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01256353, + "epoch": 0.6057417706297911, + "flos": 23556144700800.0, + "grad_norm": 1.6660379644056391, + "language_loss": 0.81972474, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.89656347, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10626221, + "step": 10075, + "time_per_iteration": 2.514631509780884 + }, + { + "auxiliary_loss_clip": 0.06414524, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06272351, + "balance_loss_mlp": 0.01255526, + "epoch": 0.6058018938824591, + "flos": 27751464852480.0, + "grad_norm": 1.6456827746682687, + "language_loss": 0.78334481, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.86015224, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10699463, + "step": 10076, + "time_per_iteration": 2.5620245933532715 + }, + { + "auxiliary_loss_clip": 0.06419735, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01252994, + "epoch": 0.605862017135127, + "flos": 22789912239360.0, + "grad_norm": 1.939163307933087, + "language_loss": 0.72597015, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.80280852, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11102295, + "step": 10077, + "time_per_iteration": 2.5249850749969482 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.0126711, + "balance_loss_clip": 0.06278025, + "balance_loss_mlp": 0.01256155, + "epoch": 0.605922140387795, + "flos": 21221375333760.0, + "grad_norm": 1.5785416430125656, + "language_loss": 0.55953008, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.63641137, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10961914, + "step": 10078, + "time_per_iteration": 2.5278408527374268 + }, + { + "auxiliary_loss_clip": 0.06424035, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06278145, + "balance_loss_mlp": 0.01259911, + "epoch": 0.6059822636404629, + "flos": 27275191845120.0, + "grad_norm": 1.4527216797355516, + "language_loss": 0.70788896, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.78484154, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11322021, + "step": 10079, + "time_per_iteration": 2.5871152877807617 + }, + { + "auxiliary_loss_clip": 0.06417212, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06276966, + "balance_loss_mlp": 0.01257991, + "epoch": 0.606042386893131, + "flos": 20637599137920.0, + "grad_norm": 1.8315516840845918, + "language_loss": 0.63098562, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.70784402, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10638428, + "step": 10080, + "time_per_iteration": 2.491398334503174 + }, + { + "auxiliary_loss_clip": 0.06417031, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01255289, + "epoch": 0.6061025101457989, + "flos": 23008859758080.0, + "grad_norm": 1.6961363468706865, + "language_loss": 0.71255064, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.78938705, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11322021, + "step": 10081, + "time_per_iteration": 2.512700080871582 + }, + { + "auxiliary_loss_clip": 0.06420416, + "auxiliary_loss_mlp": 0.01269117, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01258406, + "epoch": 0.6061626333984669, + "flos": 29906796700800.0, + "grad_norm": 1.5910736573937334, + "language_loss": 0.69392467, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.77082002, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10717773, + "step": 10082, + "time_per_iteration": 2.5597543716430664 + }, + { + "auxiliary_loss_clip": 0.06418272, + "auxiliary_loss_mlp": 0.01266999, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01256514, + "epoch": 0.6062227566511348, + "flos": 25016130241920.0, + "grad_norm": 1.2876460924932913, + "language_loss": 0.66258222, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.7394349, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1048584, + "step": 10083, + "time_per_iteration": 4.032879114151001 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01266697, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01256147, + "epoch": 0.6062828799038028, + "flos": 19470046746240.0, + "grad_norm": 1.984600644426631, + "language_loss": 0.74219275, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.81904829, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10546875, + "step": 10084, + "time_per_iteration": 2.549463987350464 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6063430031564707, + "flos": 13594661504640.0, + "grad_norm": 2.649456512280636, + "language_loss": 0.72717726, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.80401981, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10516357, + "step": 10085, + "time_per_iteration": 2.569584846496582 + }, + { + "auxiliary_loss_clip": 0.06415457, + "auxiliary_loss_mlp": 0.01267297, + "balance_loss_clip": 0.06275511, + "balance_loss_mlp": 0.01256771, + "epoch": 0.6064031264091387, + "flos": 23261740980480.0, + "grad_norm": 2.0482376544916057, + "language_loss": 0.76309711, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.83992463, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10522461, + "step": 10086, + "time_per_iteration": 2.5559799671173096 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01255231, + "epoch": 0.6064632496618068, + "flos": 22465515957120.0, + "grad_norm": 1.2564833731282572, + "language_loss": 0.72978222, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.80654591, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10070801, + "step": 10087, + "time_per_iteration": 2.5399293899536133 + }, + { + "auxiliary_loss_clip": 0.06412689, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01259555, + "epoch": 0.6065233729144747, + "flos": 25125604001280.0, + "grad_norm": 1.521602814132933, + "language_loss": 0.83829105, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.91511416, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10058594, + "step": 10088, + "time_per_iteration": 2.5622670650482178 + }, + { + "auxiliary_loss_clip": 0.06411251, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.06272328, + "balance_loss_mlp": 0.01257709, + "epoch": 0.6065834961671427, + "flos": 23484126516480.0, + "grad_norm": 1.9713789944159437, + "language_loss": 0.71166384, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.78845739, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10400391, + "step": 10089, + "time_per_iteration": 2.516352891921997 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 0.06277877, + "balance_loss_mlp": 0.01255835, + "epoch": 0.6066436194198106, + "flos": 17025090860160.0, + "grad_norm": 1.830033701594393, + "language_loss": 0.82651365, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.90336132, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10137939, + "step": 10090, + "time_per_iteration": 2.5144259929656982 + }, + { + "auxiliary_loss_clip": 0.06427157, + "auxiliary_loss_mlp": 0.01267358, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01255848, + "epoch": 0.6067037426724786, + "flos": 18520603332480.0, + "grad_norm": 2.204687443594168, + "language_loss": 0.76034927, + "learning_rate": 1.4145758826341e-06, + "loss": 0.83729446, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11505127, + "step": 10091, + "time_per_iteration": 2.4818389415740967 + }, + { + "auxiliary_loss_clip": 0.06416716, + "auxiliary_loss_mlp": 0.01268883, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01258041, + "epoch": 0.6067638659251465, + "flos": 22352520326400.0, + "grad_norm": 1.3588116701946646, + "language_loss": 0.7976529, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.87450886, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10858154, + "step": 10092, + "time_per_iteration": 4.102951765060425 + }, + { + "auxiliary_loss_clip": 0.06413257, + "auxiliary_loss_mlp": 0.01264393, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01253623, + "epoch": 0.6068239891778145, + "flos": 12454669906560.0, + "grad_norm": 1.7580568445861304, + "language_loss": 0.76897407, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.84575057, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10778809, + "step": 10093, + "time_per_iteration": 2.5497262477874756 + }, + { + "auxiliary_loss_clip": 0.06417312, + "auxiliary_loss_mlp": 0.01264272, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01254396, + "epoch": 0.6068841124304825, + "flos": 23192657688960.0, + "grad_norm": 1.756366452209319, + "language_loss": 0.87924957, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.95606542, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09887695, + "step": 10094, + "time_per_iteration": 2.5853447914123535 + }, + { + "auxiliary_loss_clip": 0.06414801, + "auxiliary_loss_mlp": 0.01269704, + "balance_loss_clip": 0.06274891, + "balance_loss_mlp": 0.01258891, + "epoch": 0.6069442356831505, + "flos": 18593795473920.0, + "grad_norm": 1.6037560799373654, + "language_loss": 0.72400463, + "learning_rate": 1.413086446353919e-06, + "loss": 0.80084968, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1081543, + "step": 10095, + "time_per_iteration": 2.522684335708618 + }, + { + "auxiliary_loss_clip": 0.06416344, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06275313, + "balance_loss_mlp": 0.01255202, + "epoch": 0.6070043589358184, + "flos": 20966775102720.0, + "grad_norm": 1.6943237110311855, + "language_loss": 0.76768452, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.8445034, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10333252, + "step": 10096, + "time_per_iteration": 3.974635362625122 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06276296, + "balance_loss_mlp": 0.01257018, + "epoch": 0.6070644821884864, + "flos": 11697242123520.0, + "grad_norm": 1.6709554759687573, + "language_loss": 0.80418944, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.8810569, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10668945, + "step": 10097, + "time_per_iteration": 2.5277743339538574 + }, + { + "auxiliary_loss_clip": 0.06411067, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.0627345, + "balance_loss_mlp": 0.01256888, + "epoch": 0.6071246054411543, + "flos": 19315402836480.0, + "grad_norm": 1.4624120271510725, + "language_loss": 0.6741221, + "learning_rate": 1.411969602780478e-06, + "loss": 0.75090361, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10198975, + "step": 10098, + "time_per_iteration": 2.476284980773926 + }, + { + "auxiliary_loss_clip": 0.06410795, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06272739, + "balance_loss_mlp": 0.01257695, + "epoch": 0.6071847286938223, + "flos": 17754832068480.0, + "grad_norm": 1.6528826990411218, + "language_loss": 0.80661249, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.8833968, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.0994873, + "step": 10099, + "time_per_iteration": 2.5101730823516846 + }, + { + "auxiliary_loss_clip": 0.06419415, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01256382, + "epoch": 0.6072448519464904, + "flos": 22644031080960.0, + "grad_norm": 1.7660509562429656, + "language_loss": 0.71092284, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.78779513, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11437988, + "step": 10100, + "time_per_iteration": 2.5284388065338135 + }, + { + "auxiliary_loss_clip": 0.06417382, + "auxiliary_loss_mlp": 0.012671, + "balance_loss_clip": 0.06275873, + "balance_loss_mlp": 0.01255072, + "epoch": 0.6073049751991583, + "flos": 19543490449920.0, + "grad_norm": 2.5847426043420807, + "language_loss": 0.71003377, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.78687859, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.12030029, + "step": 10101, + "time_per_iteration": 2.5114076137542725 + }, + { + "auxiliary_loss_clip": 0.06414101, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01254534, + "epoch": 0.6073650984518263, + "flos": 28301936250240.0, + "grad_norm": 1.5889760307817664, + "language_loss": 0.69726598, + "learning_rate": 1.410480790256154e-06, + "loss": 0.77405149, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09924316, + "step": 10102, + "time_per_iteration": 4.067505836486816 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06273274, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6074252217044942, + "flos": 25671211862400.0, + "grad_norm": 1.7072302673605428, + "language_loss": 0.73599881, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.81281507, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10394287, + "step": 10103, + "time_per_iteration": 2.5059690475463867 + }, + { + "auxiliary_loss_clip": 0.06429945, + "auxiliary_loss_mlp": 0.01270767, + "balance_loss_clip": 0.06280673, + "balance_loss_mlp": 0.01259215, + "epoch": 0.6074853449571622, + "flos": 22863775213440.0, + "grad_norm": 2.6623380378388943, + "language_loss": 0.76573825, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.84274542, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11560059, + "step": 10104, + "time_per_iteration": 2.5691661834716797 + }, + { + "auxiliary_loss_clip": 0.06324141, + "auxiliary_loss_mlp": 0.01255914, + "balance_loss_clip": 0.0626532, + "balance_loss_mlp": 0.01253873, + "epoch": 0.6075454682098301, + "flos": 67131088536960.0, + "grad_norm": 0.6977033795055727, + "language_loss": 0.55382067, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.62962115, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.02041626, + "step": 10105, + "time_per_iteration": 3.1780333518981934 + }, + { + "auxiliary_loss_clip": 0.06325028, + "auxiliary_loss_mlp": 0.0125398, + "balance_loss_clip": 0.06266589, + "balance_loss_mlp": 0.01252049, + "epoch": 0.6076055914624982, + "flos": 70730389797120.0, + "grad_norm": 1.0472762602622778, + "language_loss": 0.5682922, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.64408225, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01928711, + "step": 10106, + "time_per_iteration": 3.1282505989074707 + }, + { + "auxiliary_loss_clip": 0.06414115, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06276634, + "balance_loss_mlp": 0.01256042, + "epoch": 0.6076657147151661, + "flos": 28371816155520.0, + "grad_norm": 1.4629042426300594, + "language_loss": 0.69019145, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.76699257, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09954834, + "step": 10107, + "time_per_iteration": 2.6175951957702637 + }, + { + "auxiliary_loss_clip": 0.0642143, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_clip": 0.06277055, + "balance_loss_mlp": 0.01255297, + "epoch": 0.6077258379678341, + "flos": 15055234024320.0, + "grad_norm": 1.7550359653422893, + "language_loss": 0.80674279, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.88361514, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.1050415, + "step": 10108, + "time_per_iteration": 2.482895851135254 + }, + { + "auxiliary_loss_clip": 0.06424679, + "auxiliary_loss_mlp": 0.01267352, + "balance_loss_clip": 0.06279299, + "balance_loss_mlp": 0.01256223, + "epoch": 0.607785961220502, + "flos": 36174948756480.0, + "grad_norm": 1.6080944832957944, + "language_loss": 0.71795905, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11138916, + "step": 10109, + "time_per_iteration": 2.6855504512786865 + }, + { + "auxiliary_loss_clip": 0.06412528, + "auxiliary_loss_mlp": 0.01268721, + "balance_loss_clip": 0.06276727, + "balance_loss_mlp": 0.01259119, + "epoch": 0.60784608447317, + "flos": 22530113055360.0, + "grad_norm": 1.591486225286121, + "language_loss": 0.80463254, + "learning_rate": 1.407504239132653e-06, + "loss": 0.88144499, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09613037, + "step": 10110, + "time_per_iteration": 2.4970977306365967 + }, + { + "auxiliary_loss_clip": 0.06416238, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258052, + "epoch": 0.6079062077258379, + "flos": 23847823163520.0, + "grad_norm": 17.062743331014456, + "language_loss": 0.7053231, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.78217256, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10656738, + "step": 10111, + "time_per_iteration": 2.5446176528930664 + }, + { + "auxiliary_loss_clip": 0.0641928, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.06276086, + "balance_loss_mlp": 0.01255631, + "epoch": 0.6079663309785059, + "flos": 23373646508160.0, + "grad_norm": 1.767884967540518, + "language_loss": 0.64890563, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.72577429, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11950684, + "step": 10112, + "time_per_iteration": 2.5041110515594482 + }, + { + "auxiliary_loss_clip": 0.06319214, + "auxiliary_loss_mlp": 0.01254153, + "balance_loss_clip": 0.0626073, + "balance_loss_mlp": 0.01252635, + "epoch": 0.6080264542311739, + "flos": 71403709680000.0, + "grad_norm": 0.6188727131541597, + "language_loss": 0.49428421, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.57001793, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01517487, + "step": 10113, + "time_per_iteration": 3.2030844688415527 + }, + { + "auxiliary_loss_clip": 0.06322706, + "auxiliary_loss_mlp": 0.01253815, + "balance_loss_clip": 0.06264073, + "balance_loss_mlp": 0.01252375, + "epoch": 0.6080865774838419, + "flos": 66549786036480.0, + "grad_norm": 0.826261074954681, + "language_loss": 0.57000625, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.64577138, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01438141, + "step": 10114, + "time_per_iteration": 3.0561811923980713 + }, + { + "auxiliary_loss_clip": 0.06416565, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.0627362, + "balance_loss_mlp": 0.01255528, + "epoch": 0.6081467007365099, + "flos": 19213895214720.0, + "grad_norm": 2.9429969583310744, + "language_loss": 0.70665103, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.7834866, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11456299, + "step": 10115, + "time_per_iteration": 2.536123037338257 + }, + { + "auxiliary_loss_clip": 0.06416753, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06275412, + "balance_loss_mlp": 0.01254128, + "epoch": 0.6082068239891778, + "flos": 24174148089600.0, + "grad_norm": 2.2262194131188617, + "language_loss": 0.72516567, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.80198407, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10961914, + "step": 10116, + "time_per_iteration": 2.5744457244873047 + }, + { + "auxiliary_loss_clip": 0.06418931, + "auxiliary_loss_mlp": 0.0126628, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01254562, + "epoch": 0.6082669472418458, + "flos": 37422150053760.0, + "grad_norm": 1.8492666967546532, + "language_loss": 0.54224104, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.61909318, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1171875, + "step": 10117, + "time_per_iteration": 2.7010717391967773 + }, + { + "auxiliary_loss_clip": 0.06415669, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06273679, + "balance_loss_mlp": 0.01252431, + "epoch": 0.6083270704945137, + "flos": 15090886736640.0, + "grad_norm": 1.6926126638400165, + "language_loss": 0.70553619, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.78231865, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1015625, + "step": 10118, + "time_per_iteration": 2.5118987560272217 + }, + { + "auxiliary_loss_clip": 0.0641689, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255857, + "epoch": 0.6083871937471818, + "flos": 20674845077760.0, + "grad_norm": 1.454621938136119, + "language_loss": 0.75087917, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.82770652, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09979248, + "step": 10119, + "time_per_iteration": 2.5343713760375977 + }, + { + "auxiliary_loss_clip": 0.06418591, + "auxiliary_loss_mlp": 0.01266372, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6084473169998497, + "flos": 21513305358720.0, + "grad_norm": 1.7245965425427678, + "language_loss": 0.67339104, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.75024068, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10784912, + "step": 10120, + "time_per_iteration": 2.4992902278900146 + }, + { + "auxiliary_loss_clip": 0.06424947, + "auxiliary_loss_mlp": 0.01267829, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.0125673, + "epoch": 0.6085074402525177, + "flos": 26877309932160.0, + "grad_norm": 1.7168671771406325, + "language_loss": 0.74690855, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.82383633, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11096191, + "step": 10121, + "time_per_iteration": 2.552943468093872 + }, + { + "auxiliary_loss_clip": 0.06415446, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06275289, + "balance_loss_mlp": 0.01254844, + "epoch": 0.6085675635051856, + "flos": 10894518408960.0, + "grad_norm": 1.695682661500106, + "language_loss": 0.80907005, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.88587236, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09936523, + "step": 10122, + "time_per_iteration": 3.890413522720337 + }, + { + "auxiliary_loss_clip": 0.06419112, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01254483, + "epoch": 0.6086276867578536, + "flos": 34871074571520.0, + "grad_norm": 1.4621063194109842, + "language_loss": 0.55791676, + "learning_rate": 1.402670413578284e-06, + "loss": 0.63475281, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10015869, + "step": 10123, + "time_per_iteration": 2.6325483322143555 + }, + { + "auxiliary_loss_clip": 0.06419839, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06281708, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6086878100105215, + "flos": 20053906796160.0, + "grad_norm": 1.6808318536129285, + "language_loss": 0.74430656, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.82115179, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11114502, + "step": 10124, + "time_per_iteration": 2.5358493328094482 + }, + { + "auxiliary_loss_clip": 0.06421429, + "auxiliary_loss_mlp": 0.01269718, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01258393, + "epoch": 0.6087479332631895, + "flos": 18338314775040.0, + "grad_norm": 11.543954575524463, + "language_loss": 0.65884316, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.73575461, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11322021, + "step": 10125, + "time_per_iteration": 2.4864342212677 + }, + { + "auxiliary_loss_clip": 0.06421918, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06282578, + "balance_loss_mlp": 0.01253841, + "epoch": 0.6088080565158575, + "flos": 24499424839680.0, + "grad_norm": 2.2712886028305, + "language_loss": 0.76395416, + "learning_rate": 1.40155545786479e-06, + "loss": 0.84081715, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10126, + "time_per_iteration": 2.5664777755737305 + }, + { + "auxiliary_loss_clip": 0.06427297, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06280977, + "balance_loss_mlp": 0.0125524, + "epoch": 0.6088681797685255, + "flos": 10273496273280.0, + "grad_norm": 5.11214091408941, + "language_loss": 0.71820217, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.79513788, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1104126, + "step": 10127, + "time_per_iteration": 2.478034257888794 + }, + { + "auxiliary_loss_clip": 0.06430127, + "auxiliary_loss_mlp": 0.01266951, + "balance_loss_clip": 0.06284942, + "balance_loss_mlp": 0.01255465, + "epoch": 0.6089283030211935, + "flos": 21978928897920.0, + "grad_norm": 2.2629720759221996, + "language_loss": 0.72788715, + "learning_rate": 1.400812267497691e-06, + "loss": 0.80485797, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11486816, + "step": 10128, + "time_per_iteration": 2.553764820098877 + }, + { + "auxiliary_loss_clip": 0.06422316, + "auxiliary_loss_mlp": 0.0126747, + "balance_loss_clip": 0.06282373, + "balance_loss_mlp": 0.01257355, + "epoch": 0.6089884262738614, + "flos": 17790945978240.0, + "grad_norm": 1.9776728101481476, + "language_loss": 0.7314598, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.8083576, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10119629, + "step": 10129, + "time_per_iteration": 2.4939491748809814 + }, + { + "auxiliary_loss_clip": 0.06421769, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06280705, + "balance_loss_mlp": 0.01260764, + "epoch": 0.6090485495265294, + "flos": 36920496458880.0, + "grad_norm": 1.3316519758914749, + "language_loss": 0.65839994, + "learning_rate": 1.400069168015626e-06, + "loss": 0.73533046, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 10130, + "time_per_iteration": 2.7194180488586426 + }, + { + "auxiliary_loss_clip": 0.0641261, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06274526, + "balance_loss_mlp": 0.01254926, + "epoch": 0.6091086727791973, + "flos": 19904755328640.0, + "grad_norm": 1.5918133317154841, + "language_loss": 0.77794468, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.85471684, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09680176, + "step": 10131, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.0641945, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06279676, + "balance_loss_mlp": 0.01255071, + "epoch": 0.6091687960318654, + "flos": 22170147914880.0, + "grad_norm": 1.8790929127191944, + "language_loss": 0.77705514, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.85390049, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10003662, + "step": 10132, + "time_per_iteration": 3.9999635219573975 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01267619, + "balance_loss_clip": 0.06278821, + "balance_loss_mlp": 0.01257618, + "epoch": 0.6092289192845333, + "flos": 21470818538880.0, + "grad_norm": 2.2139477747978136, + "language_loss": 0.75865889, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.83548331, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10003662, + "step": 10133, + "time_per_iteration": 2.545747756958008 + }, + { + "auxiliary_loss_clip": 0.06417366, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01255973, + "epoch": 0.6092890425372013, + "flos": 28702585347840.0, + "grad_norm": 1.8044338362434222, + "language_loss": 0.64228314, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.71912241, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10583496, + "step": 10134, + "time_per_iteration": 2.563861131668091 + }, + { + "auxiliary_loss_clip": 0.06424356, + "auxiliary_loss_mlp": 0.01263619, + "balance_loss_clip": 0.06285493, + "balance_loss_mlp": 0.01253331, + "epoch": 0.6093491657898692, + "flos": 20819384570880.0, + "grad_norm": 1.7758601490441968, + "language_loss": 0.78973985, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.86661959, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10290527, + "step": 10135, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06420235, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.06279118, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6094092890425372, + "flos": 25453983352320.0, + "grad_norm": 1.626137919034545, + "language_loss": 0.72278392, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.79964805, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.09753418, + "step": 10136, + "time_per_iteration": 4.003901958465576 + }, + { + "auxiliary_loss_clip": 0.06420286, + "auxiliary_loss_mlp": 0.01265077, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.0125464, + "epoch": 0.6094694122952051, + "flos": 35629089333120.0, + "grad_norm": 1.6356074117681172, + "language_loss": 0.74919081, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.82604444, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10443115, + "step": 10137, + "time_per_iteration": 2.634158134460449 + }, + { + "auxiliary_loss_clip": 0.06417631, + "auxiliary_loss_mlp": 0.01266963, + "balance_loss_clip": 0.06275456, + "balance_loss_mlp": 0.01256246, + "epoch": 0.6095295355478731, + "flos": 24462975513600.0, + "grad_norm": 2.0845106182551163, + "language_loss": 0.80188054, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.87872648, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10717773, + "step": 10138, + "time_per_iteration": 2.5884156227111816 + }, + { + "auxiliary_loss_clip": 0.06410988, + "auxiliary_loss_mlp": 0.01265559, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255354, + "epoch": 0.6095896588005411, + "flos": 15638716730880.0, + "grad_norm": 1.5018300865324132, + "language_loss": 0.81360239, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.89036787, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10205078, + "step": 10139, + "time_per_iteration": 2.4757158756256104 + }, + { + "auxiliary_loss_clip": 0.06419017, + "auxiliary_loss_mlp": 0.01267763, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01255895, + "epoch": 0.6096497820532091, + "flos": 15554455850880.0, + "grad_norm": 1.944047007891517, + "language_loss": 0.83626902, + "learning_rate": 1.396355037825315e-06, + "loss": 0.91313678, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11865234, + "step": 10140, + "time_per_iteration": 2.5361695289611816 + }, + { + "auxiliary_loss_clip": 0.06419208, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06277294, + "balance_loss_mlp": 0.01258718, + "epoch": 0.6097099053058771, + "flos": 24210932832000.0, + "grad_norm": 1.8133263657959964, + "language_loss": 0.75536144, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.83225, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10925293, + "step": 10141, + "time_per_iteration": 3.9623372554779053 + }, + { + "auxiliary_loss_clip": 0.06413428, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 0.06275016, + "balance_loss_mlp": 0.01253358, + "epoch": 0.609770028558545, + "flos": 19575830926080.0, + "grad_norm": 2.621888589140599, + "language_loss": 0.76574522, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.842511, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09790039, + "step": 10142, + "time_per_iteration": 2.5719213485717773 + }, + { + "auxiliary_loss_clip": 0.06415378, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01255979, + "epoch": 0.609830151811213, + "flos": 23955619841280.0, + "grad_norm": 1.612746865863279, + "language_loss": 0.76346582, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.84028077, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10150146, + "step": 10143, + "time_per_iteration": 2.529778242111206 + }, + { + "auxiliary_loss_clip": 0.06417874, + "auxiliary_loss_mlp": 0.01264047, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01253467, + "epoch": 0.6098902750638809, + "flos": 16185205059840.0, + "grad_norm": 2.5594432881750104, + "language_loss": 0.7530098, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.82982898, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.105896, + "step": 10144, + "time_per_iteration": 2.526620864868164 + }, + { + "auxiliary_loss_clip": 0.06420074, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06276617, + "balance_loss_mlp": 0.01253634, + "epoch": 0.609950398316549, + "flos": 44536141549440.0, + "grad_norm": 2.1298130564389224, + "language_loss": 0.73869997, + "learning_rate": 1.394498830235383e-06, + "loss": 0.81554472, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10772705, + "step": 10145, + "time_per_iteration": 2.7241427898406982 + }, + { + "auxiliary_loss_clip": 0.06415195, + "auxiliary_loss_mlp": 0.01263159, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252156, + "epoch": 0.6100105215692169, + "flos": 23228436182400.0, + "grad_norm": 1.5962491809481525, + "language_loss": 0.69665307, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.77343661, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11004639, + "step": 10146, + "time_per_iteration": 2.557990312576294 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06276412, + "balance_loss_mlp": 0.01254865, + "epoch": 0.6100706448218849, + "flos": 15017904230400.0, + "grad_norm": 1.5284940617625797, + "language_loss": 0.76506376, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.84183586, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09936523, + "step": 10147, + "time_per_iteration": 2.5613648891448975 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01262446, + "balance_loss_clip": 0.0627313, + "balance_loss_mlp": 0.01252153, + "epoch": 0.6101307680745528, + "flos": 19645039998720.0, + "grad_norm": 1.6729040728987632, + "language_loss": 0.78694391, + "learning_rate": 1.393385381096786e-06, + "loss": 0.86368936, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10296631, + "step": 10148, + "time_per_iteration": 2.5073816776275635 + }, + { + "auxiliary_loss_clip": 0.06424719, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06278485, + "balance_loss_mlp": 0.01253672, + "epoch": 0.6101908913272208, + "flos": 29943455662080.0, + "grad_norm": 11.644498336945409, + "language_loss": 0.53887326, + "learning_rate": 1.39301427737093e-06, + "loss": 0.61577505, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11779785, + "step": 10149, + "time_per_iteration": 2.579378843307495 + }, + { + "auxiliary_loss_clip": 0.0641048, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.0125511, + "epoch": 0.6102510145798887, + "flos": 21805067675520.0, + "grad_norm": 1.6674264382808133, + "language_loss": 0.80347526, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.8802287, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09753418, + "step": 10150, + "time_per_iteration": 2.542039394378662 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6103111378325567, + "flos": 20712719923200.0, + "grad_norm": 1.6063484518637994, + "language_loss": 0.69615412, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.77300549, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11529541, + "step": 10151, + "time_per_iteration": 2.5254616737365723 + }, + { + "auxiliary_loss_clip": 0.06415872, + "auxiliary_loss_mlp": 0.01264029, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6103712610852247, + "flos": 29388330362880.0, + "grad_norm": 1.5395706469140102, + "language_loss": 0.71042097, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.78722, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.0980835, + "step": 10152, + "time_per_iteration": 2.565767288208008 + }, + { + "auxiliary_loss_clip": 0.06416918, + "auxiliary_loss_mlp": 0.01268582, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257883, + "epoch": 0.6104313843378927, + "flos": 20819216862720.0, + "grad_norm": 1.604020409534104, + "language_loss": 0.78784543, + "learning_rate": 1.391530092777811e-06, + "loss": 0.86470044, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10699463, + "step": 10153, + "time_per_iteration": 2.5230531692504883 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06273308, + "balance_loss_mlp": 0.01258873, + "epoch": 0.6104915075905607, + "flos": 26585715323520.0, + "grad_norm": 1.630222855772095, + "language_loss": 0.79992545, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8767544, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09906006, + "step": 10154, + "time_per_iteration": 2.5763237476348877 + }, + { + "auxiliary_loss_clip": 0.06417637, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01258937, + "epoch": 0.6105516308432286, + "flos": 23922734313600.0, + "grad_norm": 1.4598935838539129, + "language_loss": 0.70770371, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.78457403, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10455322, + "step": 10155, + "time_per_iteration": 2.5680413246154785 + }, + { + "auxiliary_loss_clip": 0.06418546, + "auxiliary_loss_mlp": 0.01266443, + "balance_loss_clip": 0.06278499, + "balance_loss_mlp": 0.0125569, + "epoch": 0.6106117540958966, + "flos": 31585520125440.0, + "grad_norm": 1.5387182092943745, + "language_loss": 0.71842468, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.79527456, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10742188, + "step": 10156, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.06412362, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06277083, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6106718773485645, + "flos": 19613999260800.0, + "grad_norm": 1.3880208824071523, + "language_loss": 0.67516112, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.75195158, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11254883, + "step": 10157, + "time_per_iteration": 2.533141613006592 + }, + { + "auxiliary_loss_clip": 0.06414488, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01253888, + "epoch": 0.6107320006012326, + "flos": 17128778688000.0, + "grad_norm": 1.7065905103759618, + "language_loss": 0.72894049, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.80572832, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10400391, + "step": 10158, + "time_per_iteration": 2.4852585792541504 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01257917, + "epoch": 0.6107921238539005, + "flos": 30155107875840.0, + "grad_norm": 1.7026117107079757, + "language_loss": 0.69434297, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7712034, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1015625, + "step": 10159, + "time_per_iteration": 2.6481263637542725 + }, + { + "auxiliary_loss_clip": 0.06416903, + "auxiliary_loss_mlp": 0.01266619, + "balance_loss_clip": 0.06276091, + "balance_loss_mlp": 0.01255747, + "epoch": 0.6108522471065685, + "flos": 18445859890560.0, + "grad_norm": 1.7469967655501557, + "language_loss": 0.79027724, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.86711246, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10876465, + "step": 10160, + "time_per_iteration": 2.5056142807006836 + }, + { + "auxiliary_loss_clip": 0.06325343, + "auxiliary_loss_mlp": 0.01260291, + "balance_loss_clip": 0.06266694, + "balance_loss_mlp": 0.01258597, + "epoch": 0.6109123703592364, + "flos": 64157295605760.0, + "grad_norm": 0.797024648042973, + "language_loss": 0.61520749, + "learning_rate": 1.388562832007295e-06, + "loss": 0.69106382, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01698303, + "step": 10161, + "time_per_iteration": 3.325639486312866 + }, + { + "auxiliary_loss_clip": 0.06418448, + "auxiliary_loss_mlp": 0.01268382, + "balance_loss_clip": 0.06276111, + "balance_loss_mlp": 0.01257099, + "epoch": 0.6109724936119044, + "flos": 20674132318080.0, + "grad_norm": 2.3454759388543316, + "language_loss": 0.76444739, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.84131569, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.112854, + "step": 10162, + "time_per_iteration": 4.040041446685791 + }, + { + "auxiliary_loss_clip": 0.06414326, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253669, + "epoch": 0.6110326168645723, + "flos": 31358899958400.0, + "grad_norm": 1.528039199186958, + "language_loss": 0.71962601, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.79641795, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11218262, + "step": 10163, + "time_per_iteration": 2.5920441150665283 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01257903, + "epoch": 0.6110927401172404, + "flos": 25009338061440.0, + "grad_norm": 1.7630876229655692, + "language_loss": 0.60071069, + "learning_rate": 1.387450491396625e-06, + "loss": 0.67747843, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09210205, + "step": 10164, + "time_per_iteration": 2.559441328048706 + }, + { + "auxiliary_loss_clip": 0.06414106, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.0125975, + "epoch": 0.6111528633699083, + "flos": 26254946131200.0, + "grad_norm": 1.466434652755145, + "language_loss": 0.75936824, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.83620799, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10119629, + "step": 10165, + "time_per_iteration": 2.521923542022705 + }, + { + "auxiliary_loss_clip": 0.0641854, + "auxiliary_loss_mlp": 0.01268441, + "balance_loss_clip": 0.06282263, + "balance_loss_mlp": 0.0125807, + "epoch": 0.6112129866225763, + "flos": 22389011579520.0, + "grad_norm": 1.518231620716018, + "language_loss": 0.79607749, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.87294728, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10375977, + "step": 10166, + "time_per_iteration": 2.5410702228546143 + }, + { + "auxiliary_loss_clip": 0.06416941, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06276624, + "balance_loss_mlp": 0.01254949, + "epoch": 0.6112731098752443, + "flos": 25234826198400.0, + "grad_norm": 7.9003095632563385, + "language_loss": 0.68483454, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.76166224, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10876465, + "step": 10167, + "time_per_iteration": 2.5295464992523193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01256586, + "epoch": 0.6113332331279122, + "flos": 22899763342080.0, + "grad_norm": 1.6873056368761516, + "language_loss": 0.7915386, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.86832243, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09381104, + "step": 10168, + "time_per_iteration": 2.53782320022583 + }, + { + "auxiliary_loss_clip": 0.06426514, + "auxiliary_loss_mlp": 0.0126727, + "balance_loss_clip": 0.06277908, + "balance_loss_mlp": 0.01254991, + "epoch": 0.6113933563805802, + "flos": 18625548971520.0, + "grad_norm": 2.2514835469058405, + "language_loss": 0.86128104, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.93821883, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12286377, + "step": 10169, + "time_per_iteration": 2.4681122303009033 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01254871, + "epoch": 0.6114534796332481, + "flos": 41876137359360.0, + "grad_norm": 1.5861355547500362, + "language_loss": 0.79530609, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.87210482, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09661865, + "step": 10170, + "time_per_iteration": 2.707791566848755 + }, + { + "auxiliary_loss_clip": 0.06423808, + "auxiliary_loss_mlp": 0.01264285, + "balance_loss_clip": 0.06277203, + "balance_loss_mlp": 0.01252359, + "epoch": 0.6115136028859162, + "flos": 21914960705280.0, + "grad_norm": 2.240444553593937, + "language_loss": 0.6873374, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.76421833, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1192627, + "step": 10171, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.06420024, + "auxiliary_loss_mlp": 0.01266591, + "balance_loss_clip": 0.06277289, + "balance_loss_mlp": 0.01254634, + "epoch": 0.6115737261385841, + "flos": 28812604158720.0, + "grad_norm": 6.231678075331036, + "language_loss": 0.79464412, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.87151027, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11962891, + "step": 10172, + "time_per_iteration": 4.057689666748047 + }, + { + "auxiliary_loss_clip": 0.06425016, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06279068, + "balance_loss_mlp": 0.01257222, + "epoch": 0.6116338493912521, + "flos": 21257824659840.0, + "grad_norm": 1.6337666078989976, + "language_loss": 0.67181307, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.74874651, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11108398, + "step": 10173, + "time_per_iteration": 2.5301437377929688 + }, + { + "auxiliary_loss_clip": 0.06418002, + "auxiliary_loss_mlp": 0.01270854, + "balance_loss_clip": 0.06275578, + "balance_loss_mlp": 0.01259261, + "epoch": 0.61169397264392, + "flos": 17535968403840.0, + "grad_norm": 1.769252328158937, + "language_loss": 0.56344169, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.64033026, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1159668, + "step": 10174, + "time_per_iteration": 2.530437707901001 + }, + { + "auxiliary_loss_clip": 0.064185, + "auxiliary_loss_mlp": 0.01267148, + "balance_loss_clip": 0.06277028, + "balance_loss_mlp": 0.01255931, + "epoch": 0.611754095896588, + "flos": 23958387025920.0, + "grad_norm": 1.6825013036462741, + "language_loss": 0.66233337, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.73918986, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11212158, + "step": 10175, + "time_per_iteration": 4.048693656921387 + }, + { + "auxiliary_loss_clip": 0.06415173, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01253019, + "epoch": 0.6118142191492559, + "flos": 26002064908800.0, + "grad_norm": 1.985962827753808, + "language_loss": 0.82859969, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.90538198, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10040283, + "step": 10176, + "time_per_iteration": 2.5558836460113525 + }, + { + "auxiliary_loss_clip": 0.06419128, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06277899, + "balance_loss_mlp": 0.01259491, + "epoch": 0.611874342401924, + "flos": 24609275942400.0, + "grad_norm": 1.5904100346197647, + "language_loss": 0.77812099, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.85502738, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.12011719, + "step": 10177, + "time_per_iteration": 2.5346739292144775 + }, + { + "auxiliary_loss_clip": 0.06416818, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01257924, + "epoch": 0.6119344656545919, + "flos": 15892436494080.0, + "grad_norm": 2.6097925851891755, + "language_loss": 0.75949138, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.8363508, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10178, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06418636, + "auxiliary_loss_mlp": 0.01267998, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256435, + "epoch": 0.6119945889072599, + "flos": 21659312298240.0, + "grad_norm": 1.5720284026291744, + "language_loss": 0.67318261, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.75004888, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11553955, + "step": 10179, + "time_per_iteration": 2.5297069549560547 + }, + { + "auxiliary_loss_clip": 0.06419764, + "auxiliary_loss_mlp": 0.01264087, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01253179, + "epoch": 0.6120547121599279, + "flos": 13777746675840.0, + "grad_norm": 1.9709040238374929, + "language_loss": 0.83888078, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.91571933, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10906982, + "step": 10180, + "time_per_iteration": 3.9827919006347656 + }, + { + "auxiliary_loss_clip": 0.06417181, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06276719, + "balance_loss_mlp": 0.01256683, + "epoch": 0.6121148354125958, + "flos": 20084528263680.0, + "grad_norm": 1.549982980411044, + "language_loss": 0.77731764, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.8541739, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11761475, + "step": 10181, + "time_per_iteration": 2.4853463172912598 + }, + { + "auxiliary_loss_clip": 0.06420098, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06277204, + "balance_loss_mlp": 0.01258565, + "epoch": 0.6121749586652638, + "flos": 13474915620480.0, + "grad_norm": 2.0089243925599973, + "language_loss": 0.8071022, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.88400126, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11248779, + "step": 10182, + "time_per_iteration": 2.4935574531555176 + }, + { + "auxiliary_loss_clip": 0.06411545, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.0627587, + "balance_loss_mlp": 0.0125805, + "epoch": 0.6122350819179317, + "flos": 20126721594240.0, + "grad_norm": 1.501667213386016, + "language_loss": 0.83102655, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.90781319, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09069824, + "step": 10183, + "time_per_iteration": 2.5836997032165527 + }, + { + "auxiliary_loss_clip": 0.06327992, + "auxiliary_loss_mlp": 0.01253825, + "balance_loss_clip": 0.06268366, + "balance_loss_mlp": 0.01252147, + "epoch": 0.6122952051705998, + "flos": 65448004700160.0, + "grad_norm": 0.7149962337899693, + "language_loss": 0.62764937, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.70346749, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01681519, + "step": 10184, + "time_per_iteration": 3.3003170490264893 + }, + { + "auxiliary_loss_clip": 0.06420484, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01253857, + "epoch": 0.6123553284232677, + "flos": 20382537709440.0, + "grad_norm": 1.6441224641064962, + "language_loss": 0.82408071, + "learning_rate": 1.379669981812101e-06, + "loss": 0.90092349, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09936523, + "step": 10185, + "time_per_iteration": 2.5150225162506104 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6124154516759357, + "flos": 23994417081600.0, + "grad_norm": 1.7366290964606979, + "language_loss": 0.75121021, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.82812846, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11151123, + "step": 10186, + "time_per_iteration": 2.627387046813965 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.01262607, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252599, + "epoch": 0.6124755749286036, + "flos": 21474927388800.0, + "grad_norm": 1.4642741872217127, + "language_loss": 0.78637451, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.8631596, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10003662, + "step": 10187, + "time_per_iteration": 2.49202561378479 + }, + { + "auxiliary_loss_clip": 0.06414475, + "auxiliary_loss_mlp": 0.01265646, + "balance_loss_clip": 0.06274372, + "balance_loss_mlp": 0.0125472, + "epoch": 0.6125356981812716, + "flos": 23886117279360.0, + "grad_norm": 1.4743912854017487, + "language_loss": 0.83344066, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.91024196, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10925293, + "step": 10188, + "time_per_iteration": 2.555687427520752 + }, + { + "auxiliary_loss_clip": 0.06417944, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01255168, + "epoch": 0.6125958214339395, + "flos": 14430312673920.0, + "grad_norm": 1.6601752905069214, + "language_loss": 0.75527823, + "learning_rate": 1.378189152155896e-06, + "loss": 0.83211589, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10656738, + "step": 10189, + "time_per_iteration": 2.4994595050811768 + }, + { + "auxiliary_loss_clip": 0.06417951, + "auxiliary_loss_mlp": 0.01265327, + "balance_loss_clip": 0.06275356, + "balance_loss_mlp": 0.012543, + "epoch": 0.6126559446866076, + "flos": 23265933684480.0, + "grad_norm": 1.4192081343801892, + "language_loss": 0.74300897, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11016846, + "step": 10190, + "time_per_iteration": 2.6080024242401123 + }, + { + "auxiliary_loss_clip": 0.06418385, + "auxiliary_loss_mlp": 0.01266786, + "balance_loss_clip": 0.0627688, + "balance_loss_mlp": 0.01255044, + "epoch": 0.6127160679392755, + "flos": 26871188584320.0, + "grad_norm": 1.672928736412144, + "language_loss": 0.68484575, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.76169741, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11749268, + "step": 10191, + "time_per_iteration": 2.54805064201355 + }, + { + "auxiliary_loss_clip": 0.06419395, + "auxiliary_loss_mlp": 0.012717, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01259833, + "epoch": 0.6127761911919435, + "flos": 26403720255360.0, + "grad_norm": 1.7824154048725067, + "language_loss": 0.73771405, + "learning_rate": 1.377078777445467e-06, + "loss": 0.81462502, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11859131, + "step": 10192, + "time_per_iteration": 2.556392192840576 + }, + { + "auxiliary_loss_clip": 0.06413901, + "auxiliary_loss_mlp": 0.01263543, + "balance_loss_clip": 0.06275194, + "balance_loss_mlp": 0.01253225, + "epoch": 0.6128363144446115, + "flos": 22640802698880.0, + "grad_norm": 1.814520897334069, + "language_loss": 0.84227109, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.91904557, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10314941, + "step": 10193, + "time_per_iteration": 2.5000216960906982 + }, + { + "auxiliary_loss_clip": 0.06417094, + "auxiliary_loss_mlp": 0.01267497, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256625, + "epoch": 0.6128964376972794, + "flos": 26766033310080.0, + "grad_norm": 2.0280898056271255, + "language_loss": 0.707515, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.78436089, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10870361, + "step": 10194, + "time_per_iteration": 2.5357043743133545 + }, + { + "auxiliary_loss_clip": 0.06330009, + "auxiliary_loss_mlp": 0.01254574, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01252429, + "epoch": 0.6129565609499474, + "flos": 65585500450560.0, + "grad_norm": 0.7963949843311754, + "language_loss": 0.58648682, + "learning_rate": 1.375968615326149e-06, + "loss": 0.66233265, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02146912, + "step": 10195, + "time_per_iteration": 2.935722589492798 + }, + { + "auxiliary_loss_clip": 0.06416507, + "auxiliary_loss_mlp": 0.01269514, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01257873, + "epoch": 0.6130166842026153, + "flos": 16367577471360.0, + "grad_norm": 1.8676293874241905, + "language_loss": 0.69944096, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.77630115, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11639404, + "step": 10196, + "time_per_iteration": 2.522855520248413 + }, + { + "auxiliary_loss_clip": 0.06413607, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01255795, + "epoch": 0.6130768074552834, + "flos": 23658029665920.0, + "grad_norm": 1.6623431982713033, + "language_loss": 0.7114116, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.78820676, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10113525, + "step": 10197, + "time_per_iteration": 2.576241970062256 + }, + { + "auxiliary_loss_clip": 0.06418445, + "auxiliary_loss_mlp": 0.01271491, + "balance_loss_clip": 0.06275209, + "balance_loss_mlp": 0.01260828, + "epoch": 0.6131369307079513, + "flos": 20053613306880.0, + "grad_norm": 1.7635400810353365, + "language_loss": 0.78912157, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.86602092, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10675049, + "step": 10198, + "time_per_iteration": 2.5441195964813232 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01255138, + "epoch": 0.6131970539606193, + "flos": 22678384055040.0, + "grad_norm": 1.422407986186852, + "language_loss": 0.74737686, + "learning_rate": 1.374488730519181e-06, + "loss": 0.82423472, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11236572, + "step": 10199, + "time_per_iteration": 2.567636251449585 + }, + { + "auxiliary_loss_clip": 0.06417924, + "auxiliary_loss_mlp": 0.01269269, + "balance_loss_clip": 0.06272729, + "balance_loss_mlp": 0.01257735, + "epoch": 0.6132571772132872, + "flos": 26878316181120.0, + "grad_norm": 1.5670545162327942, + "language_loss": 0.62008464, + "learning_rate": 1.374118818580993e-06, + "loss": 0.69695652, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11541748, + "step": 10200, + "time_per_iteration": 2.561591863632202 + }, + { + "auxiliary_loss_clip": 0.06416481, + "auxiliary_loss_mlp": 0.01270085, + "balance_loss_clip": 0.06275273, + "balance_loss_mlp": 0.0125944, + "epoch": 0.6133173004659552, + "flos": 22899176363520.0, + "grad_norm": 1.7093296118249273, + "language_loss": 0.69054127, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.76740688, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10644531, + "step": 10201, + "time_per_iteration": 3.9431076049804688 + }, + { + "auxiliary_loss_clip": 0.06409751, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01257462, + "epoch": 0.6133774237186231, + "flos": 20491298709120.0, + "grad_norm": 2.3821613548396368, + "language_loss": 0.83898175, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.91576207, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10809326, + "step": 10202, + "time_per_iteration": 2.496201276779175 + }, + { + "auxiliary_loss_clip": 0.06332828, + "auxiliary_loss_mlp": 0.01255453, + "balance_loss_clip": 0.06274157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6134375469712912, + "flos": 69433643208960.0, + "grad_norm": 0.8530026378603166, + "language_loss": 0.66995066, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.74583346, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01924133, + "step": 10203, + "time_per_iteration": 3.1688590049743652 + }, + { + "auxiliary_loss_clip": 0.06417629, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.0125538, + "epoch": 0.6134976702239591, + "flos": 41291145279360.0, + "grad_norm": 1.6901163598507989, + "language_loss": 0.61053431, + "learning_rate": 1.37263940830327e-06, + "loss": 0.68737298, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10845947, + "step": 10204, + "time_per_iteration": 2.7038605213165283 + }, + { + "auxiliary_loss_clip": 0.06412404, + "auxiliary_loss_mlp": 0.01263093, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01252901, + "epoch": 0.6135577934766271, + "flos": 22353233086080.0, + "grad_norm": 1.6787218918093536, + "language_loss": 0.72929007, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.80604506, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10198975, + "step": 10205, + "time_per_iteration": 2.5766189098358154 + }, + { + "auxiliary_loss_clip": 0.06411709, + "auxiliary_loss_mlp": 0.01265007, + "balance_loss_clip": 0.06273441, + "balance_loss_mlp": 0.01253843, + "epoch": 0.6136179167292951, + "flos": 23734198627200.0, + "grad_norm": 1.5218154078879744, + "language_loss": 0.76180834, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.83857548, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1116333, + "step": 10206, + "time_per_iteration": 2.5717761516571045 + }, + { + "auxiliary_loss_clip": 0.0641268, + "auxiliary_loss_mlp": 0.01265782, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254595, + "epoch": 0.613678039981963, + "flos": 26030757732480.0, + "grad_norm": 2.128320629636919, + "language_loss": 0.7591306, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.83591521, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11181641, + "step": 10207, + "time_per_iteration": 2.5353450775146484 + }, + { + "auxiliary_loss_clip": 0.06418657, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06278594, + "balance_loss_mlp": 0.01253362, + "epoch": 0.613738163234631, + "flos": 9863078175360.0, + "grad_norm": 1.9702213064203427, + "language_loss": 0.82853335, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.90536106, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10748291, + "step": 10208, + "time_per_iteration": 2.4810874462127686 + }, + { + "auxiliary_loss_clip": 0.06422867, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255814, + "epoch": 0.613798286487299, + "flos": 33190380576000.0, + "grad_norm": 1.7610608340758167, + "language_loss": 0.72894984, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.8058551, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1184082, + "step": 10209, + "time_per_iteration": 2.6061112880706787 + }, + { + "auxiliary_loss_clip": 0.06413165, + "auxiliary_loss_mlp": 0.01267749, + "balance_loss_clip": 0.06273563, + "balance_loss_mlp": 0.01257157, + "epoch": 0.613858409739967, + "flos": 25634678682240.0, + "grad_norm": 1.6794559835324834, + "language_loss": 0.74641943, + "learning_rate": 1.37042100685438e-06, + "loss": 0.8232286, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10595703, + "step": 10210, + "time_per_iteration": 2.5699121952056885 + }, + { + "auxiliary_loss_clip": 0.06324588, + "auxiliary_loss_mlp": 0.01253647, + "balance_loss_clip": 0.06266326, + "balance_loss_mlp": 0.01251882, + "epoch": 0.6139185329926349, + "flos": 67213336919040.0, + "grad_norm": 0.8410650121869828, + "language_loss": 0.65019715, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.72597951, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01765442, + "step": 10211, + "time_per_iteration": 3.2996082305908203 + }, + { + "auxiliary_loss_clip": 0.06413533, + "auxiliary_loss_mlp": 0.01270005, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01258889, + "epoch": 0.6139786562453029, + "flos": 21550090101120.0, + "grad_norm": 1.5192132224806107, + "language_loss": 0.75830382, + "learning_rate": 1.369681730544801e-06, + "loss": 0.83513916, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11120605, + "step": 10212, + "time_per_iteration": 3.9495487213134766 + }, + { + "auxiliary_loss_clip": 0.06416361, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01262614, + "epoch": 0.6140387794979708, + "flos": 26075802101760.0, + "grad_norm": 1.4991601562707406, + "language_loss": 0.74122798, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.8181265, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10882568, + "step": 10213, + "time_per_iteration": 2.550542116165161 + }, + { + "auxiliary_loss_clip": 0.06420778, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06275892, + "balance_loss_mlp": 0.01253742, + "epoch": 0.6140989027506388, + "flos": 23701145391360.0, + "grad_norm": 1.8705312076501914, + "language_loss": 0.73641956, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.81327969, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11499023, + "step": 10214, + "time_per_iteration": 2.524115562438965 + }, + { + "auxiliary_loss_clip": 0.06416141, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01253289, + "epoch": 0.6141590260033067, + "flos": 22237428343680.0, + "grad_norm": 1.5033107567748507, + "language_loss": 0.74553859, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.82234401, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11108398, + "step": 10215, + "time_per_iteration": 3.9794795513153076 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01258719, + "epoch": 0.6142191492559748, + "flos": 23877312600960.0, + "grad_norm": 1.5966298517178832, + "language_loss": 0.78681469, + "learning_rate": 1.368203464858542e-06, + "loss": 0.86360973, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10742188, + "step": 10216, + "time_per_iteration": 2.5095551013946533 + }, + { + "auxiliary_loss_clip": 0.06413998, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273836, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6142792725086427, + "flos": 15046764762240.0, + "grad_norm": 2.0499714549796475, + "language_loss": 0.8017531, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.87857044, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10797119, + "step": 10217, + "time_per_iteration": 2.530963897705078 + }, + { + "auxiliary_loss_clip": 0.06415407, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01255789, + "epoch": 0.6143393957613107, + "flos": 23337616452480.0, + "grad_norm": 2.309819184905194, + "language_loss": 0.78097677, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.85779876, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10218, + "time_per_iteration": 2.5020768642425537 + }, + { + "auxiliary_loss_clip": 0.06413251, + "auxiliary_loss_mlp": 0.01268832, + "balance_loss_clip": 0.06275171, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6143995190139786, + "flos": 20122696598400.0, + "grad_norm": 1.7507364905585892, + "language_loss": 0.82176745, + "learning_rate": 1.367095017101569e-06, + "loss": 0.89858824, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10803223, + "step": 10219, + "time_per_iteration": 4.098464250564575 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01271094, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01259602, + "epoch": 0.6144596422666466, + "flos": 42313403491200.0, + "grad_norm": 1.6881627886326696, + "language_loss": 0.66870147, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.74555075, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1149292, + "step": 10220, + "time_per_iteration": 2.724275827407837 + }, + { + "auxiliary_loss_clip": 0.0641406, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06274959, + "balance_loss_mlp": 0.012584, + "epoch": 0.6145197655193146, + "flos": 21578992560000.0, + "grad_norm": 2.2248894315314454, + "language_loss": 0.72078216, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.79761338, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10656738, + "step": 10221, + "time_per_iteration": 2.5253100395202637 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01267039, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01256609, + "epoch": 0.6145798887719826, + "flos": 21477610719360.0, + "grad_norm": 1.6538985449457846, + "language_loss": 0.7942664, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.87105858, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10430908, + "step": 10222, + "time_per_iteration": 2.5524139404296875 + }, + { + "auxiliary_loss_clip": 0.06418169, + "auxiliary_loss_mlp": 0.01267247, + "balance_loss_clip": 0.06275628, + "balance_loss_mlp": 0.01256447, + "epoch": 0.6146400120246506, + "flos": 20783270661120.0, + "grad_norm": 1.750623742282724, + "language_loss": 0.76586866, + "learning_rate": 1.365617422821788e-06, + "loss": 0.84272277, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 10223, + "time_per_iteration": 2.507918119430542 + }, + { + "auxiliary_loss_clip": 0.06413615, + "auxiliary_loss_mlp": 0.01266598, + "balance_loss_clip": 0.06278135, + "balance_loss_mlp": 0.01255392, + "epoch": 0.6147001352773185, + "flos": 13886423821440.0, + "grad_norm": 2.0249480129984287, + "language_loss": 0.78430009, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.86110222, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11212158, + "step": 10224, + "time_per_iteration": 2.5212504863739014 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.0126517, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01255359, + "epoch": 0.6147602585299865, + "flos": 56653920915840.0, + "grad_norm": 1.2562846499273215, + "language_loss": 0.66504145, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.74179292, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09814453, + "step": 10225, + "time_per_iteration": 2.814272880554199 + }, + { + "auxiliary_loss_clip": 0.06418905, + "auxiliary_loss_mlp": 0.01269548, + "balance_loss_clip": 0.06276867, + "balance_loss_mlp": 0.01258884, + "epoch": 0.6148203817826544, + "flos": 32825468044800.0, + "grad_norm": 1.9241791753141533, + "language_loss": 0.6340794, + "learning_rate": 1.364509479649357e-06, + "loss": 0.71096396, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10675049, + "step": 10226, + "time_per_iteration": 2.629307270050049 + }, + { + "auxiliary_loss_clip": 0.06414378, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.0627353, + "balance_loss_mlp": 0.01255303, + "epoch": 0.6148805050353224, + "flos": 18337811650560.0, + "grad_norm": 1.8500325381447646, + "language_loss": 0.76063347, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.83743972, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10949707, + "step": 10227, + "time_per_iteration": 2.5072264671325684 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06274723, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6149406282879903, + "flos": 14069173576320.0, + "grad_norm": 4.1558900532043, + "language_loss": 0.62490618, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.70178151, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.13568115, + "step": 10228, + "time_per_iteration": 2.625681161880493 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01265474, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01254763, + "epoch": 0.6150007515406584, + "flos": 25196909425920.0, + "grad_norm": 1.4129638919460634, + "language_loss": 0.74878526, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.82556051, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1071167, + "step": 10229, + "time_per_iteration": 2.5437581539154053 + }, + { + "auxiliary_loss_clip": 0.06413749, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.0627471, + "balance_loss_mlp": 0.01256876, + "epoch": 0.6150608747933263, + "flos": 21951829301760.0, + "grad_norm": 1.6020000118574074, + "language_loss": 0.78397381, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.86078924, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10906982, + "step": 10230, + "time_per_iteration": 2.530174732208252 + }, + { + "auxiliary_loss_clip": 0.06413004, + "auxiliary_loss_mlp": 0.01266985, + "balance_loss_clip": 0.06270448, + "balance_loss_mlp": 0.01256149, + "epoch": 0.6151209980459943, + "flos": 30125283022080.0, + "grad_norm": 1.40012821108437, + "language_loss": 0.72963595, + "learning_rate": 1.36266338983927e-06, + "loss": 0.80643588, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10827637, + "step": 10231, + "time_per_iteration": 2.5843095779418945 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06271622, + "balance_loss_mlp": 0.01256434, + "epoch": 0.6151811212986622, + "flos": 30016228533120.0, + "grad_norm": 1.7264160083970947, + "language_loss": 0.70266879, + "learning_rate": 1.362294244324858e-06, + "loss": 0.77945286, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10626221, + "step": 10232, + "time_per_iteration": 2.5726914405822754 + }, + { + "auxiliary_loss_clip": 0.06409374, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6152412445513302, + "flos": 18877675507200.0, + "grad_norm": 2.1019570874525484, + "language_loss": 0.92268974, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.99946421, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09960938, + "step": 10233, + "time_per_iteration": 2.475142002105713 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6153013678039982, + "flos": 25710847643520.0, + "grad_norm": 1.7026564571899578, + "language_loss": 0.7220425, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.79882705, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10412598, + "step": 10234, + "time_per_iteration": 2.538825750350952 + }, + { + "auxiliary_loss_clip": 0.06412051, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06270387, + "balance_loss_mlp": 0.01255187, + "epoch": 0.6153614910566662, + "flos": 28517529605760.0, + "grad_norm": 1.8042716232808833, + "language_loss": 0.67118728, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.74796581, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10620117, + "step": 10235, + "time_per_iteration": 2.5539941787719727 + }, + { + "auxiliary_loss_clip": 0.06416909, + "auxiliary_loss_mlp": 0.01269314, + "balance_loss_clip": 0.06272343, + "balance_loss_mlp": 0.01258489, + "epoch": 0.6154216143093342, + "flos": 23556480117120.0, + "grad_norm": 1.5012129447427485, + "language_loss": 0.81535256, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.89221478, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10821533, + "step": 10236, + "time_per_iteration": 2.538961887359619 + }, + { + "auxiliary_loss_clip": 0.06413287, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01258281, + "epoch": 0.6154817375620021, + "flos": 22754804578560.0, + "grad_norm": 1.3960361226739142, + "language_loss": 0.8069132, + "learning_rate": 1.360448879760721e-06, + "loss": 0.88374025, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11138916, + "step": 10237, + "time_per_iteration": 2.5317978858947754 + }, + { + "auxiliary_loss_clip": 0.06410801, + "auxiliary_loss_mlp": 0.01271969, + "balance_loss_clip": 0.06272944, + "balance_loss_mlp": 0.01261198, + "epoch": 0.6155418608146701, + "flos": 27170455841280.0, + "grad_norm": 1.5039507372145677, + "language_loss": 0.76442957, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.84125727, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10772705, + "step": 10238, + "time_per_iteration": 2.5912821292877197 + }, + { + "auxiliary_loss_clip": 0.06320563, + "auxiliary_loss_mlp": 0.01256509, + "balance_loss_clip": 0.06262375, + "balance_loss_mlp": 0.01254774, + "epoch": 0.615601984067338, + "flos": 68828610003840.0, + "grad_norm": 1.135422984419524, + "language_loss": 0.57526618, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.65103698, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.01739502, + "step": 10239, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06415902, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06273024, + "balance_loss_mlp": 0.0125323, + "epoch": 0.615662107320006, + "flos": 15521528396160.0, + "grad_norm": 1.8815161483190883, + "language_loss": 0.77940285, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.8561992, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10498047, + "step": 10240, + "time_per_iteration": 2.4900901317596436 + }, + { + "auxiliary_loss_clip": 0.06418262, + "auxiliary_loss_mlp": 0.01272722, + "balance_loss_clip": 0.06275868, + "balance_loss_mlp": 0.01262017, + "epoch": 0.615722230572674, + "flos": 21069121265280.0, + "grad_norm": 2.263045257123095, + "language_loss": 0.72996962, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.80687952, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1071167, + "step": 10241, + "time_per_iteration": 3.901360511779785 + }, + { + "auxiliary_loss_clip": 0.06409363, + "auxiliary_loss_mlp": 0.01269863, + "balance_loss_clip": 0.0627209, + "balance_loss_mlp": 0.01259873, + "epoch": 0.615782353825342, + "flos": 23263250353920.0, + "grad_norm": 1.504543290987149, + "language_loss": 0.72248924, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.79928148, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09997559, + "step": 10242, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.06411266, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06271993, + "balance_loss_mlp": 0.01255066, + "epoch": 0.6158424770780099, + "flos": 21109972930560.0, + "grad_norm": 2.215067200442713, + "language_loss": 0.7281, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.80486894, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10565186, + "step": 10243, + "time_per_iteration": 2.540512800216675 + }, + { + "auxiliary_loss_clip": 0.06321675, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06263578, + "balance_loss_mlp": 0.01254183, + "epoch": 0.6159026003306779, + "flos": 70355358120960.0, + "grad_norm": 0.7449608811837395, + "language_loss": 0.56762981, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.64340484, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.58105469, + "router_z_loss_mlp": 0.01647949, + "step": 10244, + "time_per_iteration": 3.2194366455078125 + }, + { + "auxiliary_loss_clip": 0.06409553, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06271067, + "balance_loss_mlp": 0.01256855, + "epoch": 0.6159627235833458, + "flos": 33882624282240.0, + "grad_norm": 1.5482958097169006, + "language_loss": 0.63865972, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.71543062, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10675049, + "step": 10245, + "time_per_iteration": 2.640113353729248 + }, + { + "auxiliary_loss_clip": 0.06409854, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01257442, + "epoch": 0.6160228468360138, + "flos": 26582193452160.0, + "grad_norm": 1.6235599905950853, + "language_loss": 0.79032344, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.8670975, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10119629, + "step": 10246, + "time_per_iteration": 2.5686607360839844 + }, + { + "auxiliary_loss_clip": 0.0641896, + "auxiliary_loss_mlp": 0.0127079, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6160829700886818, + "flos": 17197568490240.0, + "grad_norm": 2.4844316843996825, + "language_loss": 0.88253343, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.95943093, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1184082, + "step": 10247, + "time_per_iteration": 2.450960397720337 + }, + { + "auxiliary_loss_clip": 0.06417046, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.06275311, + "balance_loss_mlp": 0.01258028, + "epoch": 0.6161430933413498, + "flos": 23630385018240.0, + "grad_norm": 1.598841912113341, + "language_loss": 0.80267406, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.87952548, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10076904, + "step": 10248, + "time_per_iteration": 2.5717732906341553 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6162032165940178, + "flos": 23009027466240.0, + "grad_norm": 1.6786182085700423, + "language_loss": 0.87678397, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.95355916, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10284424, + "step": 10249, + "time_per_iteration": 2.5637669563293457 + }, + { + "auxiliary_loss_clip": 0.06414458, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06273694, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6162633398466857, + "flos": 39431474962560.0, + "grad_norm": 2.372002019412244, + "language_loss": 0.70129162, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.7780953, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10668945, + "step": 10250, + "time_per_iteration": 2.700856924057007 + }, + { + "auxiliary_loss_clip": 0.06403701, + "auxiliary_loss_mlp": 0.01263182, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.0125386, + "epoch": 0.6163234630993537, + "flos": 19250679957120.0, + "grad_norm": 1.6751579708994577, + "language_loss": 0.74076283, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.81743157, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09320068, + "step": 10251, + "time_per_iteration": 3.9032137393951416 + }, + { + "auxiliary_loss_clip": 0.06412694, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06272181, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6163835863520216, + "flos": 15967389571200.0, + "grad_norm": 1.9695671027525665, + "language_loss": 0.69094777, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.76772505, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.09753418, + "step": 10252, + "time_per_iteration": 2.546041250228882 + }, + { + "auxiliary_loss_clip": 0.06321114, + "auxiliary_loss_mlp": 0.01253403, + "balance_loss_clip": 0.06262837, + "balance_loss_mlp": 0.01252003, + "epoch": 0.6164437096046896, + "flos": 68124905487360.0, + "grad_norm": 0.8614248496363994, + "language_loss": 0.57690394, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.6526491, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01400757, + "step": 10253, + "time_per_iteration": 3.1977267265319824 + }, + { + "auxiliary_loss_clip": 0.06417613, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06274711, + "balance_loss_mlp": 0.01257783, + "epoch": 0.6165038328573575, + "flos": 21367633835520.0, + "grad_norm": 1.503369483441608, + "language_loss": 0.79960692, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.876468, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1071167, + "step": 10254, + "time_per_iteration": 3.95928692817688 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.01264054, + "balance_loss_clip": 0.06276255, + "balance_loss_mlp": 0.01253128, + "epoch": 0.6165639561100256, + "flos": 21107708870400.0, + "grad_norm": 1.746255949432921, + "language_loss": 0.81143081, + "learning_rate": 1.353810600008846e-06, + "loss": 0.88826168, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10925293, + "step": 10255, + "time_per_iteration": 2.5300750732421875 + }, + { + "auxiliary_loss_clip": 0.06416211, + "auxiliary_loss_mlp": 0.01266666, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01255371, + "epoch": 0.6166240793626935, + "flos": 25345683550080.0, + "grad_norm": 1.880965378472566, + "language_loss": 0.65514123, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.73196995, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11291504, + "step": 10256, + "time_per_iteration": 2.539006233215332 + }, + { + "auxiliary_loss_clip": 0.06415517, + "auxiliary_loss_mlp": 0.01267871, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.0125806, + "epoch": 0.6166842026153615, + "flos": 19688742702720.0, + "grad_norm": 1.5659047978931129, + "language_loss": 0.72409272, + "learning_rate": 1.353073501949825e-06, + "loss": 0.80092663, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09802246, + "step": 10257, + "time_per_iteration": 2.5153865814208984 + }, + { + "auxiliary_loss_clip": 0.06416216, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01253788, + "epoch": 0.6167443258680294, + "flos": 19324501004160.0, + "grad_norm": 1.6557108650811327, + "language_loss": 0.71972775, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.79653382, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.1060791, + "step": 10258, + "time_per_iteration": 2.480304718017578 + }, + { + "auxiliary_loss_clip": 0.06417316, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275502, + "balance_loss_mlp": 0.01253222, + "epoch": 0.6168044491206974, + "flos": 25272323700480.0, + "grad_norm": 1.9257678582667488, + "language_loss": 0.63553512, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.71234685, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10638428, + "step": 10259, + "time_per_iteration": 4.02075719833374 + }, + { + "auxiliary_loss_clip": 0.06410451, + "auxiliary_loss_mlp": 0.0126865, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6168645723733654, + "flos": 13224130750080.0, + "grad_norm": 1.6228127894065456, + "language_loss": 0.71578032, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.79257131, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10748291, + "step": 10260, + "time_per_iteration": 2.4910624027252197 + }, + { + "auxiliary_loss_clip": 0.06424432, + "auxiliary_loss_mlp": 0.01268478, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01256492, + "epoch": 0.6169246956260334, + "flos": 26659410589440.0, + "grad_norm": 1.7088590339487795, + "language_loss": 0.68640685, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.76333594, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11981201, + "step": 10261, + "time_per_iteration": 2.5747649669647217 + }, + { + "auxiliary_loss_clip": 0.06414127, + "auxiliary_loss_mlp": 0.01264284, + "balance_loss_clip": 0.06274065, + "balance_loss_mlp": 0.01254151, + "epoch": 0.6169848188787014, + "flos": 23155034405760.0, + "grad_norm": 1.7119551141937153, + "language_loss": 0.71845949, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.79524362, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10131836, + "step": 10262, + "time_per_iteration": 2.560232162475586 + }, + { + "auxiliary_loss_clip": 0.06416971, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6170449421313693, + "flos": 23338748482560.0, + "grad_norm": 1.8792858261778465, + "language_loss": 0.70386994, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.7806955, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11157227, + "step": 10263, + "time_per_iteration": 2.5188369750976562 + }, + { + "auxiliary_loss_clip": 0.06418619, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6171050653840373, + "flos": 15857077271040.0, + "grad_norm": 2.3172465393141404, + "language_loss": 0.76572752, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.84258133, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10150146, + "step": 10264, + "time_per_iteration": 2.525599956512451 + }, + { + "auxiliary_loss_clip": 0.06414546, + "auxiliary_loss_mlp": 0.01266705, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01255726, + "epoch": 0.6171651886367052, + "flos": 20051349246720.0, + "grad_norm": 2.349171582745048, + "language_loss": 0.85150325, + "learning_rate": 1.350126092092247e-06, + "loss": 0.92831576, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10992432, + "step": 10265, + "time_per_iteration": 2.5084152221679688 + }, + { + "auxiliary_loss_clip": 0.06410134, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6172253118893732, + "flos": 26439959946240.0, + "grad_norm": 2.0102817715219112, + "language_loss": 0.64766055, + "learning_rate": 1.349757776608153e-06, + "loss": 0.72441077, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10791016, + "step": 10266, + "time_per_iteration": 2.5796725749969482 + }, + { + "auxiliary_loss_clip": 0.06410654, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06270823, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6172854351420412, + "flos": 22638622492800.0, + "grad_norm": 1.5096082169739153, + "language_loss": 0.76070148, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.83748215, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10150146, + "step": 10267, + "time_per_iteration": 2.5105693340301514 + }, + { + "auxiliary_loss_clip": 0.06419747, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01254066, + "epoch": 0.6173455583947092, + "flos": 21218943565440.0, + "grad_norm": 1.6454778934730863, + "language_loss": 0.7525773, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.82942522, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10974121, + "step": 10268, + "time_per_iteration": 2.587233543395996 + }, + { + "auxiliary_loss_clip": 0.06419453, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01256396, + "epoch": 0.6174056816473771, + "flos": 19506370291200.0, + "grad_norm": 1.5800856340056704, + "language_loss": 0.75772798, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.83459222, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10583496, + "step": 10269, + "time_per_iteration": 2.4955811500549316 + }, + { + "auxiliary_loss_clip": 0.06411718, + "auxiliary_loss_mlp": 0.01267212, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6174658049000451, + "flos": 16002790721280.0, + "grad_norm": 2.3324483712409685, + "language_loss": 0.76473081, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.84152013, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10662842, + "step": 10270, + "time_per_iteration": 2.5138041973114014 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01270386, + "balance_loss_clip": 0.06274129, + "balance_loss_mlp": 0.0125986, + "epoch": 0.617525928152713, + "flos": 21909635971200.0, + "grad_norm": 1.7440039477364133, + "language_loss": 0.82272917, + "learning_rate": 1.347916569325736e-06, + "loss": 0.89957708, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10522461, + "step": 10271, + "time_per_iteration": 2.488560676574707 + }, + { + "auxiliary_loss_clip": 0.06416266, + "auxiliary_loss_mlp": 0.01264784, + "balance_loss_clip": 0.06273527, + "balance_loss_mlp": 0.01254801, + "epoch": 0.617586051405381, + "flos": 21112362771840.0, + "grad_norm": 1.4517106193495921, + "language_loss": 0.77416623, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.85097671, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.09985352, + "step": 10272, + "time_per_iteration": 2.520111560821533 + }, + { + "auxiliary_loss_clip": 0.06312063, + "auxiliary_loss_mlp": 0.01254406, + "balance_loss_clip": 0.06253687, + "balance_loss_mlp": 0.01252749, + "epoch": 0.617646174658049, + "flos": 58629129684480.0, + "grad_norm": 0.7932568322885909, + "language_loss": 0.59031951, + "learning_rate": 1.347180259404513e-06, + "loss": 0.66598421, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01660156, + "step": 10273, + "time_per_iteration": 2.9967992305755615 + }, + { + "auxiliary_loss_clip": 0.0640862, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.01254274, + "epoch": 0.617706297910717, + "flos": 13883363147520.0, + "grad_norm": 2.2785278271278897, + "language_loss": 0.73286194, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.80959731, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10632324, + "step": 10274, + "time_per_iteration": 2.4770405292510986 + }, + { + "auxiliary_loss_clip": 0.06412372, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.01255713, + "epoch": 0.617766421163385, + "flos": 19214482193280.0, + "grad_norm": 1.605129158536194, + "language_loss": 0.77453375, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.85132062, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.1060791, + "step": 10275, + "time_per_iteration": 2.4878437519073486 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01271601, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01261134, + "epoch": 0.6178265444160529, + "flos": 22572725656320.0, + "grad_norm": 1.5524938527976675, + "language_loss": 0.79471135, + "learning_rate": 1.346075980219998e-06, + "loss": 0.87152702, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10461426, + "step": 10276, + "time_per_iteration": 2.644413709640503 + }, + { + "auxiliary_loss_clip": 0.06416178, + "auxiliary_loss_mlp": 0.0126935, + "balance_loss_clip": 0.06274026, + "balance_loss_mlp": 0.01258192, + "epoch": 0.6178866676687209, + "flos": 11989130221440.0, + "grad_norm": 2.611664280498841, + "language_loss": 0.81007028, + "learning_rate": 1.345707936733612e-06, + "loss": 0.88692558, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1114502, + "step": 10277, + "time_per_iteration": 2.497955799102783 + }, + { + "auxiliary_loss_clip": 0.06418674, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256381, + "epoch": 0.6179467909213888, + "flos": 20997061153920.0, + "grad_norm": 1.6653557744536012, + "language_loss": 0.81855345, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.89541304, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10925293, + "step": 10278, + "time_per_iteration": 2.529439687728882 + }, + { + "auxiliary_loss_clip": 0.06410799, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.0125394, + "epoch": 0.6180069141740568, + "flos": 25345180425600.0, + "grad_norm": 1.5510866303043802, + "language_loss": 0.74313521, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.81988013, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09753418, + "step": 10279, + "time_per_iteration": 2.5355474948883057 + }, + { + "auxiliary_loss_clip": 0.06408358, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06268996, + "balance_loss_mlp": 0.0125316, + "epoch": 0.6180670374267248, + "flos": 19651748325120.0, + "grad_norm": 1.3695497899575455, + "language_loss": 0.70764935, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.78436339, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09887695, + "step": 10280, + "time_per_iteration": 3.9792449474334717 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01267828, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01256873, + "epoch": 0.6181271606793928, + "flos": 19471136849280.0, + "grad_norm": 1.3977623720923391, + "language_loss": 0.73107064, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.8079195, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10943604, + "step": 10281, + "time_per_iteration": 2.515800952911377 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256927, + "epoch": 0.6181872839320607, + "flos": 25601541592320.0, + "grad_norm": 1.5934743777966283, + "language_loss": 0.76599932, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.84277344, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09924316, + "step": 10282, + "time_per_iteration": 2.5432822704315186 + }, + { + "auxiliary_loss_clip": 0.06415926, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01254884, + "epoch": 0.6182474071847287, + "flos": 25558048523520.0, + "grad_norm": 1.5342450755249748, + "language_loss": 0.69123679, + "learning_rate": 1.343500197330931e-06, + "loss": 0.76806307, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1182251, + "step": 10283, + "time_per_iteration": 2.588545322418213 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.0126698, + "balance_loss_clip": 0.06273957, + "balance_loss_mlp": 0.01255607, + "epoch": 0.6183075304373966, + "flos": 22129673592960.0, + "grad_norm": 1.473012438045687, + "language_loss": 0.75165606, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.82855296, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11364746, + "step": 10284, + "time_per_iteration": 2.4986348152160645 + }, + { + "auxiliary_loss_clip": 0.06405671, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01259034, + "epoch": 0.6183676536900646, + "flos": 22462161793920.0, + "grad_norm": 1.4548798471123576, + "language_loss": 0.75635868, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.83310193, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09619141, + "step": 10285, + "time_per_iteration": 2.585350513458252 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.0126635, + "balance_loss_clip": 0.06269899, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6184277769427327, + "flos": 23370250417920.0, + "grad_norm": 1.3734994412846095, + "language_loss": 0.72883123, + "learning_rate": 1.342396663517503e-06, + "loss": 0.80559498, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10286, + "time_per_iteration": 2.569110870361328 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01268421, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.01257311, + "epoch": 0.6184879001954006, + "flos": 22717684419840.0, + "grad_norm": 1.5486281180664692, + "language_loss": 0.76501298, + "learning_rate": 1.342028868767199e-06, + "loss": 0.84181046, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11108398, + "step": 10287, + "time_per_iteration": 2.5511634349823 + }, + { + "auxiliary_loss_clip": 0.06411948, + "auxiliary_loss_mlp": 0.01264572, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01253587, + "epoch": 0.6185480234480686, + "flos": 23848703631360.0, + "grad_norm": 1.5880408145773481, + "language_loss": 0.73586667, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.81263179, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10986328, + "step": 10288, + "time_per_iteration": 2.507291555404663 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01263119, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.0125264, + "epoch": 0.6186081467007365, + "flos": 45487932877440.0, + "grad_norm": 1.4570853227015406, + "language_loss": 0.73074299, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.80746555, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10473633, + "step": 10289, + "time_per_iteration": 2.7538769245147705 + }, + { + "auxiliary_loss_clip": 0.0641107, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06269012, + "balance_loss_mlp": 0.01257468, + "epoch": 0.6186682699534045, + "flos": 23557737928320.0, + "grad_norm": 1.4253961785396534, + "language_loss": 0.79380536, + "learning_rate": 1.340925634274056e-06, + "loss": 0.87060177, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11090088, + "step": 10290, + "time_per_iteration": 2.532860040664673 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06273635, + "balance_loss_mlp": 0.01258374, + "epoch": 0.6187283932060724, + "flos": 25781062965120.0, + "grad_norm": 1.5195693495374782, + "language_loss": 0.81756544, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.89443594, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11022949, + "step": 10291, + "time_per_iteration": 3.985360860824585 + }, + { + "auxiliary_loss_clip": 0.06414646, + "auxiliary_loss_mlp": 0.0126579, + "balance_loss_clip": 0.06274836, + "balance_loss_mlp": 0.01255967, + "epoch": 0.6187885164587404, + "flos": 25272281773440.0, + "grad_norm": 5.259543114674327, + "language_loss": 0.78044999, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.85725427, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.09820557, + "step": 10292, + "time_per_iteration": 2.5699048042297363 + }, + { + "auxiliary_loss_clip": 0.06421922, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06274973, + "balance_loss_mlp": 0.01257285, + "epoch": 0.6188486397114084, + "flos": 26258090659200.0, + "grad_norm": 2.757581205213687, + "language_loss": 0.73825526, + "learning_rate": 1.339822624710401e-06, + "loss": 0.81516558, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11816406, + "step": 10293, + "time_per_iteration": 4.005521774291992 + }, + { + "auxiliary_loss_clip": 0.06414802, + "auxiliary_loss_mlp": 0.01268302, + "balance_loss_clip": 0.06274456, + "balance_loss_mlp": 0.0125721, + "epoch": 0.6189087629640764, + "flos": 20929738798080.0, + "grad_norm": 1.751787926809697, + "language_loss": 0.83461618, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.91144723, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11096191, + "step": 10294, + "time_per_iteration": 2.5416274070739746 + }, + { + "auxiliary_loss_clip": 0.06413339, + "auxiliary_loss_mlp": 0.01271366, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01260434, + "epoch": 0.6189688862167443, + "flos": 14835070621440.0, + "grad_norm": 2.3983238935990525, + "language_loss": 0.70671308, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.7835601, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10931396, + "step": 10295, + "time_per_iteration": 2.474698781967163 + }, + { + "auxiliary_loss_clip": 0.06411821, + "auxiliary_loss_mlp": 0.01272777, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01261494, + "epoch": 0.6190290094694123, + "flos": 24292803870720.0, + "grad_norm": 1.4317659849997142, + "language_loss": 0.69952327, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.77636921, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11291504, + "step": 10296, + "time_per_iteration": 2.618892192840576 + }, + { + "auxiliary_loss_clip": 0.06412887, + "auxiliary_loss_mlp": 0.01267051, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.0125547, + "epoch": 0.6190891327220802, + "flos": 22536192476160.0, + "grad_norm": 1.9563521083429962, + "language_loss": 0.71887541, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.7956748, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11584473, + "step": 10297, + "time_per_iteration": 2.5115151405334473 + }, + { + "auxiliary_loss_clip": 0.0631431, + "auxiliary_loss_mlp": 0.01254184, + "balance_loss_clip": 0.0625589, + "balance_loss_mlp": 0.01252958, + "epoch": 0.6191492559747482, + "flos": 67748756509440.0, + "grad_norm": 0.8712851262632907, + "language_loss": 0.64291644, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.71860135, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01225281, + "step": 10298, + "time_per_iteration": 3.0254995822906494 + }, + { + "auxiliary_loss_clip": 0.06415632, + "auxiliary_loss_mlp": 0.01266663, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01255517, + "epoch": 0.6192093792274163, + "flos": 22353316940160.0, + "grad_norm": 1.6622389387462033, + "language_loss": 0.73995864, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.81678164, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11157227, + "step": 10299, + "time_per_iteration": 3.9369277954101562 + }, + { + "auxiliary_loss_clip": 0.06421331, + "auxiliary_loss_mlp": 0.01268355, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.01257054, + "epoch": 0.6192695024800842, + "flos": 13559176500480.0, + "grad_norm": 1.5604516058647369, + "language_loss": 0.68912721, + "learning_rate": 1.337249812568732e-06, + "loss": 0.76602411, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11297607, + "step": 10300, + "time_per_iteration": 2.462852716445923 + }, + { + "auxiliary_loss_clip": 0.06414428, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255241, + "epoch": 0.6193296257327522, + "flos": 17420163661440.0, + "grad_norm": 1.6482033452585196, + "language_loss": 0.67021179, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.74702382, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11529541, + "step": 10301, + "time_per_iteration": 2.496779680252075 + }, + { + "auxiliary_loss_clip": 0.06414926, + "auxiliary_loss_mlp": 0.01266961, + "balance_loss_clip": 0.06272815, + "balance_loss_mlp": 0.01256411, + "epoch": 0.6193897489854201, + "flos": 31108869774720.0, + "grad_norm": 1.608536765976836, + "language_loss": 0.72948015, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.80629897, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10546875, + "step": 10302, + "time_per_iteration": 2.5844531059265137 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.06273288, + "balance_loss_mlp": 0.01258038, + "epoch": 0.6194498722380881, + "flos": 19139822605440.0, + "grad_norm": 1.7442373384203957, + "language_loss": 0.81269908, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.88953209, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11303711, + "step": 10303, + "time_per_iteration": 2.527067184448242 + }, + { + "auxiliary_loss_clip": 0.06420361, + "auxiliary_loss_mlp": 0.01268221, + "balance_loss_clip": 0.06272827, + "balance_loss_mlp": 0.01255274, + "epoch": 0.619509995490756, + "flos": 21841517001600.0, + "grad_norm": 1.6019319576417599, + "language_loss": 0.76846468, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12957764, + "step": 10304, + "time_per_iteration": 2.4880640506744385 + }, + { + "auxiliary_loss_clip": 0.06424797, + "auxiliary_loss_mlp": 0.01268109, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.0125617, + "epoch": 0.619570118743424, + "flos": 23813512116480.0, + "grad_norm": 1.7485917713195505, + "language_loss": 0.77554089, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.85246998, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1194458, + "step": 10305, + "time_per_iteration": 2.5362794399261475 + }, + { + "auxiliary_loss_clip": 0.06418667, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.0125508, + "epoch": 0.619630241996092, + "flos": 21107289600000.0, + "grad_norm": 1.5608682149054525, + "language_loss": 0.79292911, + "learning_rate": 1.335045524968045e-06, + "loss": 0.86978668, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12005615, + "step": 10306, + "time_per_iteration": 2.5073060989379883 + }, + { + "auxiliary_loss_clip": 0.0640957, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06271125, + "balance_loss_mlp": 0.01258067, + "epoch": 0.61969036524876, + "flos": 27315666167040.0, + "grad_norm": 1.5979283875043302, + "language_loss": 0.80772972, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.88450187, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09576416, + "step": 10307, + "time_per_iteration": 2.576525926589966 + }, + { + "auxiliary_loss_clip": 0.06313084, + "auxiliary_loss_mlp": 0.01252494, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.01251256, + "epoch": 0.6197504885014279, + "flos": 51667308403200.0, + "grad_norm": 0.783320902533958, + "language_loss": 0.59562945, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.67128521, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01237488, + "step": 10308, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06410602, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254191, + "epoch": 0.6198106117540959, + "flos": 30565316338560.0, + "grad_norm": 1.6157907948964547, + "language_loss": 0.68128729, + "learning_rate": 1.333943721384037e-06, + "loss": 0.75804067, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10540771, + "step": 10309, + "time_per_iteration": 2.5872271060943604 + }, + { + "auxiliary_loss_clip": 0.06412695, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06273058, + "balance_loss_mlp": 0.01257108, + "epoch": 0.6198707350067638, + "flos": 18914586030720.0, + "grad_norm": 1.6991122803597551, + "language_loss": 0.725124, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.80193126, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10925293, + "step": 10310, + "time_per_iteration": 2.5339155197143555 + }, + { + "auxiliary_loss_clip": 0.0642102, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06275747, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6199308582594318, + "flos": 21440238998400.0, + "grad_norm": 1.796323815916351, + "language_loss": 0.78780711, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.86468887, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12176514, + "step": 10311, + "time_per_iteration": 2.5148420333862305 + }, + { + "auxiliary_loss_clip": 0.06414344, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6199909815120999, + "flos": 18413561341440.0, + "grad_norm": 2.1642456621818935, + "language_loss": 0.72494328, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.80176294, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11175537, + "step": 10312, + "time_per_iteration": 2.5287880897521973 + }, + { + "auxiliary_loss_clip": 0.0642011, + "auxiliary_loss_mlp": 0.01266003, + "balance_loss_clip": 0.06274375, + "balance_loss_mlp": 0.01254744, + "epoch": 0.6200511047647678, + "flos": 21472663328640.0, + "grad_norm": 5.562964449835012, + "language_loss": 0.72224271, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.1126709, + "step": 10313, + "time_per_iteration": 2.5028812885284424 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01266585, + "balance_loss_clip": 0.06275584, + "balance_loss_mlp": 0.01254521, + "epoch": 0.6201112280174358, + "flos": 18220539461760.0, + "grad_norm": 1.7747609453089435, + "language_loss": 0.78361583, + "learning_rate": 1.332107887401416e-06, + "loss": 0.86050892, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12054443, + "step": 10314, + "time_per_iteration": 2.5241122245788574 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06273148, + "balance_loss_mlp": 0.01253723, + "epoch": 0.6201713512701037, + "flos": 20017373616000.0, + "grad_norm": 1.7540334225503873, + "language_loss": 0.78008437, + "learning_rate": 1.331740796528812e-06, + "loss": 0.8568911, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10925293, + "step": 10315, + "time_per_iteration": 2.515916585922241 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06271202, + "balance_loss_mlp": 0.01257719, + "epoch": 0.6202314745227717, + "flos": 22493537948160.0, + "grad_norm": 2.219101181270965, + "language_loss": 0.76005399, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.83691716, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10766602, + "step": 10316, + "time_per_iteration": 2.5367636680603027 + }, + { + "auxiliary_loss_clip": 0.06417404, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06271914, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6202915977754396, + "flos": 26835116601600.0, + "grad_norm": 1.8483221587209677, + "language_loss": 0.77761883, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.8544724, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11004639, + "step": 10317, + "time_per_iteration": 2.5396320819854736 + }, + { + "auxiliary_loss_clip": 0.06315257, + "auxiliary_loss_mlp": 0.01256399, + "balance_loss_clip": 0.0625724, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6203517210281076, + "flos": 62763248828160.0, + "grad_norm": 0.6893904060556487, + "language_loss": 0.58856946, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.66428602, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01377869, + "step": 10318, + "time_per_iteration": 3.1691195964813232 + }, + { + "auxiliary_loss_clip": 0.06414767, + "auxiliary_loss_mlp": 0.01270191, + "balance_loss_clip": 0.06272453, + "balance_loss_mlp": 0.01258425, + "epoch": 0.6204118442807756, + "flos": 23411018229120.0, + "grad_norm": 1.7666446205430133, + "language_loss": 0.78163171, + "learning_rate": 1.330272686582143e-06, + "loss": 0.85848129, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11755371, + "step": 10319, + "time_per_iteration": 2.5313587188720703 + }, + { + "auxiliary_loss_clip": 0.06410229, + "auxiliary_loss_mlp": 0.01267722, + "balance_loss_clip": 0.06271461, + "balance_loss_mlp": 0.01257589, + "epoch": 0.6204719675334436, + "flos": 20199871808640.0, + "grad_norm": 1.5707406021720693, + "language_loss": 0.66525, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.74202955, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 10320, + "time_per_iteration": 3.8696272373199463 + }, + { + "auxiliary_loss_clip": 0.06407389, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06270511, + "balance_loss_mlp": 0.01255025, + "epoch": 0.6205320907861115, + "flos": 13193048085120.0, + "grad_norm": 1.6249727148286428, + "language_loss": 0.76339847, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.84012175, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09912109, + "step": 10321, + "time_per_iteration": 2.4867870807647705 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270664, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6205922140387795, + "flos": 20674761223680.0, + "grad_norm": 1.5610091783179405, + "language_loss": 0.74460745, + "learning_rate": 1.329171870732758e-06, + "loss": 0.82137096, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10693359, + "step": 10322, + "time_per_iteration": 2.506465196609497 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272407, + "balance_loss_mlp": 0.01255275, + "epoch": 0.6206523372914474, + "flos": 23884524051840.0, + "grad_norm": 1.6823894915828839, + "language_loss": 0.72711974, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.80387706, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09667969, + "step": 10323, + "time_per_iteration": 2.5490479469299316 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06274472, + "balance_loss_mlp": 0.01257322, + "epoch": 0.6207124605441154, + "flos": 13411576333440.0, + "grad_norm": 31.978129858103646, + "language_loss": 0.59017056, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.66709483, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10876465, + "step": 10324, + "time_per_iteration": 2.5541300773620605 + }, + { + "auxiliary_loss_clip": 0.0641806, + "auxiliary_loss_mlp": 0.01267454, + "balance_loss_clip": 0.06274732, + "balance_loss_mlp": 0.01255664, + "epoch": 0.6207725837967835, + "flos": 18922300606080.0, + "grad_norm": 1.723600813321157, + "language_loss": 0.76792443, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.84477955, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11791992, + "step": 10325, + "time_per_iteration": 2.5330686569213867 + }, + { + "auxiliary_loss_clip": 0.06421543, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06275088, + "balance_loss_mlp": 0.01256207, + "epoch": 0.6208327070494514, + "flos": 23985738184320.0, + "grad_norm": 1.8229064209367492, + "language_loss": 0.72747815, + "learning_rate": 1.327704472462003e-06, + "loss": 0.80436671, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11120605, + "step": 10326, + "time_per_iteration": 2.5343799591064453 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06274612, + "balance_loss_mlp": 0.0125687, + "epoch": 0.6208928303021194, + "flos": 22827032398080.0, + "grad_norm": 1.9354170249209526, + "language_loss": 0.73989004, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.81677705, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11950684, + "step": 10327, + "time_per_iteration": 2.555742025375366 + }, + { + "auxiliary_loss_clip": 0.06417272, + "auxiliary_loss_mlp": 0.0126664, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255261, + "epoch": 0.6209529535547873, + "flos": 17569944034560.0, + "grad_norm": 2.1609251311460493, + "language_loss": 0.80099189, + "learning_rate": 1.326970926232066e-06, + "loss": 0.8778311, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11364746, + "step": 10328, + "time_per_iteration": 2.4839911460876465 + }, + { + "auxiliary_loss_clip": 0.06413457, + "auxiliary_loss_mlp": 0.0126611, + "balance_loss_clip": 0.06270879, + "balance_loss_mlp": 0.01254791, + "epoch": 0.6210130768074553, + "flos": 22017432648960.0, + "grad_norm": 1.8104585499122046, + "language_loss": 0.78316593, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.85996157, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11322021, + "step": 10329, + "time_per_iteration": 2.551748514175415 + }, + { + "auxiliary_loss_clip": 0.06317136, + "auxiliary_loss_mlp": 0.01252093, + "balance_loss_clip": 0.0625931, + "balance_loss_mlp": 0.0125077, + "epoch": 0.6210732000601232, + "flos": 63695166739200.0, + "grad_norm": 0.8181079803134828, + "language_loss": 0.62296569, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.69865799, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.013237, + "step": 10330, + "time_per_iteration": 4.52486252784729 + }, + { + "auxiliary_loss_clip": 0.06422883, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06276384, + "balance_loss_mlp": 0.01256275, + "epoch": 0.6211333233127913, + "flos": 24250233196800.0, + "grad_norm": 2.0105352809521517, + "language_loss": 0.77933174, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.85624135, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11791992, + "step": 10331, + "time_per_iteration": 2.558311939239502 + }, + { + "auxiliary_loss_clip": 0.06423557, + "auxiliary_loss_mlp": 0.01267101, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6211934465654592, + "flos": 16949047680000.0, + "grad_norm": 2.3537089497540147, + "language_loss": 0.67977309, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.75667971, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11950684, + "step": 10332, + "time_per_iteration": 2.4883179664611816 + }, + { + "auxiliary_loss_clip": 0.06419694, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.01255677, + "epoch": 0.6212535698181272, + "flos": 15272672169600.0, + "grad_norm": 1.3382118578807503, + "language_loss": 0.76498306, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.84184092, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10418701, + "step": 10333, + "time_per_iteration": 3.9705252647399902 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01267678, + "balance_loss_clip": 0.06275988, + "balance_loss_mlp": 0.012563, + "epoch": 0.6213136930707951, + "flos": 13449073835520.0, + "grad_norm": 2.1789310130446227, + "language_loss": 0.70102298, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.77784514, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11383057, + "step": 10334, + "time_per_iteration": 2.5797176361083984 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06275611, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6213738163234631, + "flos": 18116641998720.0, + "grad_norm": 1.637338123067712, + "language_loss": 0.70408571, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.78087658, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10137939, + "step": 10335, + "time_per_iteration": 2.482482671737671 + }, + { + "auxiliary_loss_clip": 0.06413939, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.0627524, + "balance_loss_mlp": 0.01257185, + "epoch": 0.621433939576131, + "flos": 25344299957760.0, + "grad_norm": 1.5093006351890013, + "language_loss": 0.80123997, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.87805557, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10430908, + "step": 10336, + "time_per_iteration": 2.5523369312286377 + }, + { + "auxiliary_loss_clip": 0.06410298, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06271983, + "balance_loss_mlp": 0.0125848, + "epoch": 0.621494062828799, + "flos": 22572306385920.0, + "grad_norm": 1.6169920799644502, + "language_loss": 0.73330015, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.81009233, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 10337, + "time_per_iteration": 2.4964675903320312 + }, + { + "auxiliary_loss_clip": 0.0642301, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06278226, + "balance_loss_mlp": 0.012548, + "epoch": 0.621554186081467, + "flos": 27425433415680.0, + "grad_norm": 1.8853547327091988, + "language_loss": 0.63167447, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.70857, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11755371, + "step": 10338, + "time_per_iteration": 4.016883611679077 + }, + { + "auxiliary_loss_clip": 0.06417143, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01257787, + "epoch": 0.621614309334135, + "flos": 22353484648320.0, + "grad_norm": 1.7306917238363975, + "language_loss": 0.71876323, + "learning_rate": 1.322938249724991e-06, + "loss": 0.79561794, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10546875, + "step": 10339, + "time_per_iteration": 2.5129294395446777 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266092, + "balance_loss_clip": 0.06274111, + "balance_loss_mlp": 0.0125519, + "epoch": 0.621674432586803, + "flos": 19287255064320.0, + "grad_norm": 1.654477546235719, + "language_loss": 0.69824433, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.77501559, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10906982, + "step": 10340, + "time_per_iteration": 2.491989850997925 + }, + { + "auxiliary_loss_clip": 0.0641477, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06276464, + "balance_loss_mlp": 0.01255074, + "epoch": 0.6217345558394709, + "flos": 21614812980480.0, + "grad_norm": 1.760593238290477, + "language_loss": 0.68765497, + "learning_rate": 1.322205369037788e-06, + "loss": 0.76445758, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10430908, + "step": 10341, + "time_per_iteration": 2.6119179725646973 + }, + { + "auxiliary_loss_clip": 0.06421542, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06278797, + "balance_loss_mlp": 0.01256089, + "epoch": 0.6217946790921389, + "flos": 18009893496960.0, + "grad_norm": 2.3031674054515867, + "language_loss": 0.81059158, + "learning_rate": 1.321838967240299e-06, + "loss": 0.88748062, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11273193, + "step": 10342, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.0631469, + "auxiliary_loss_mlp": 0.0125491, + "balance_loss_clip": 0.0625717, + "balance_loss_mlp": 0.01253292, + "epoch": 0.6218548023448068, + "flos": 61993578349440.0, + "grad_norm": 0.8110464269458239, + "language_loss": 0.5724324, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.64812839, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01620483, + "step": 10343, + "time_per_iteration": 3.0396130084991455 + }, + { + "auxiliary_loss_clip": 0.06411558, + "auxiliary_loss_mlp": 0.01264969, + "balance_loss_clip": 0.06274949, + "balance_loss_mlp": 0.01254812, + "epoch": 0.6219149255974749, + "flos": 25746248793600.0, + "grad_norm": 1.838833235576279, + "language_loss": 0.73063612, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.80740142, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1015625, + "step": 10344, + "time_per_iteration": 2.5173933506011963 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06273273, + "balance_loss_mlp": 0.01253803, + "epoch": 0.6219750488501428, + "flos": 25418162931840.0, + "grad_norm": 2.137498021001217, + "language_loss": 0.60161531, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.67839766, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10699463, + "step": 10345, + "time_per_iteration": 2.5472302436828613 + }, + { + "auxiliary_loss_clip": 0.06417334, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253357, + "epoch": 0.6220351721028108, + "flos": 20053529452800.0, + "grad_norm": 2.827284227984571, + "language_loss": 0.78566015, + "learning_rate": 1.320373617348614e-06, + "loss": 0.86247778, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11083984, + "step": 10346, + "time_per_iteration": 2.487410068511963 + }, + { + "auxiliary_loss_clip": 0.06418615, + "auxiliary_loss_mlp": 0.01266577, + "balance_loss_clip": 0.06276032, + "balance_loss_mlp": 0.01255419, + "epoch": 0.6220952953554787, + "flos": 27495439102080.0, + "grad_norm": 1.506091245470688, + "language_loss": 0.71672869, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.79358065, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11151123, + "step": 10347, + "time_per_iteration": 2.589825391769409 + }, + { + "auxiliary_loss_clip": 0.06409717, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6221554186081467, + "flos": 19213517871360.0, + "grad_norm": 1.5983272943469429, + "language_loss": 0.7253015, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.80204135, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10144043, + "step": 10348, + "time_per_iteration": 2.497612953186035 + }, + { + "auxiliary_loss_clip": 0.06308477, + "auxiliary_loss_mlp": 0.01254968, + "balance_loss_clip": 0.06251626, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6222155418608146, + "flos": 62969744016000.0, + "grad_norm": 0.7906840461302661, + "language_loss": 0.54113448, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.61676896, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01483154, + "step": 10349, + "time_per_iteration": 3.123992681503296 + }, + { + "auxiliary_loss_clip": 0.06409817, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269394, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6222756651134826, + "flos": 22607623681920.0, + "grad_norm": 1.7328717856317462, + "language_loss": 0.69908136, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.77584934, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10894775, + "step": 10350, + "time_per_iteration": 2.5098471641540527 + }, + { + "auxiliary_loss_clip": 0.0641721, + "auxiliary_loss_mlp": 0.01269342, + "balance_loss_clip": 0.06275678, + "balance_loss_mlp": 0.01257946, + "epoch": 0.6223357883661506, + "flos": 21148602462720.0, + "grad_norm": 1.8273350624055802, + "language_loss": 0.57737762, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.65424317, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11395264, + "step": 10351, + "time_per_iteration": 2.508089780807495 + }, + { + "auxiliary_loss_clip": 0.06308511, + "auxiliary_loss_mlp": 0.01256508, + "balance_loss_clip": 0.06251398, + "balance_loss_mlp": 0.01254946, + "epoch": 0.6223959116188186, + "flos": 63785926310400.0, + "grad_norm": 0.780725998939495, + "language_loss": 0.61087048, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.6865207, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01560974, + "step": 10352, + "time_per_iteration": 3.1217076778411865 + }, + { + "auxiliary_loss_clip": 0.06412127, + "auxiliary_loss_mlp": 0.0126301, + "balance_loss_clip": 0.06274231, + "balance_loss_mlp": 0.01252866, + "epoch": 0.6224560348714866, + "flos": 22572432167040.0, + "grad_norm": 2.017492088511814, + "language_loss": 0.82234097, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.89909232, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10144043, + "step": 10353, + "time_per_iteration": 2.527926445007324 + }, + { + "auxiliary_loss_clip": 0.06406288, + "auxiliary_loss_mlp": 0.01271685, + "balance_loss_clip": 0.06271318, + "balance_loss_mlp": 0.01261474, + "epoch": 0.6225161581241545, + "flos": 24104645527680.0, + "grad_norm": 1.3564318500578532, + "language_loss": 0.75680768, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.83358729, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 10354, + "time_per_iteration": 2.577965021133423 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.0126369, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253659, + "epoch": 0.6225762813768225, + "flos": 20448853816320.0, + "grad_norm": 1.3905640818253433, + "language_loss": 0.7869665, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.8637228, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1003418, + "step": 10355, + "time_per_iteration": 2.520951986312866 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01267836, + "balance_loss_clip": 0.06272769, + "balance_loss_mlp": 0.01257757, + "epoch": 0.6226364046294904, + "flos": 27205395793920.0, + "grad_norm": 1.8039879302815294, + "language_loss": 0.78103602, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.85785455, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10083008, + "step": 10356, + "time_per_iteration": 2.595402956008911 + }, + { + "auxiliary_loss_clip": 0.06422309, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01257307, + "epoch": 0.6226965278821585, + "flos": 20451495219840.0, + "grad_norm": 2.2679706310330037, + "language_loss": 0.67886806, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.75578707, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1229248, + "step": 10357, + "time_per_iteration": 2.5113070011138916 + }, + { + "auxiliary_loss_clip": 0.06419406, + "auxiliary_loss_mlp": 0.01267785, + "balance_loss_clip": 0.06272604, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6227566511348264, + "flos": 22169099738880.0, + "grad_norm": 2.9791987901041788, + "language_loss": 0.76851863, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.84539044, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11999512, + "step": 10358, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266377, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01255922, + "epoch": 0.6228167743874944, + "flos": 18046720166400.0, + "grad_norm": 1.8844002351613314, + "language_loss": 0.82833385, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.9051615, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10455322, + "step": 10359, + "time_per_iteration": 3.914476156234741 + }, + { + "auxiliary_loss_clip": 0.06410404, + "auxiliary_loss_mlp": 0.01263862, + "balance_loss_clip": 0.06273699, + "balance_loss_mlp": 0.01253353, + "epoch": 0.6228768976401623, + "flos": 17747620617600.0, + "grad_norm": 2.053797228905972, + "language_loss": 0.73535556, + "learning_rate": 1.315248145768822e-06, + "loss": 0.81209821, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10510254, + "step": 10360, + "time_per_iteration": 2.476815700531006 + }, + { + "auxiliary_loss_clip": 0.06415363, + "auxiliary_loss_mlp": 0.01268466, + "balance_loss_clip": 0.06274994, + "balance_loss_mlp": 0.01257999, + "epoch": 0.6229370208928303, + "flos": 17900755153920.0, + "grad_norm": 2.156230361739645, + "language_loss": 0.77647728, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.85331559, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10461426, + "step": 10361, + "time_per_iteration": 2.4798471927642822 + }, + { + "auxiliary_loss_clip": 0.06413896, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.0627467, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6229971441454982, + "flos": 17353512138240.0, + "grad_norm": 1.5462012893965447, + "language_loss": 0.68078434, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.7575798, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 10362, + "time_per_iteration": 2.5225536823272705 + }, + { + "auxiliary_loss_clip": 0.06412376, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06272472, + "balance_loss_mlp": 0.0125466, + "epoch": 0.6230572673981662, + "flos": 29248989822720.0, + "grad_norm": 1.9753113738567412, + "language_loss": 0.67607152, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.75285697, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11505127, + "step": 10363, + "time_per_iteration": 2.5485036373138428 + }, + { + "auxiliary_loss_clip": 0.06417742, + "auxiliary_loss_mlp": 0.01267367, + "balance_loss_clip": 0.06273825, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6231173906508342, + "flos": 16331505488640.0, + "grad_norm": 1.8348569408777065, + "language_loss": 0.86522818, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.94207931, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11071777, + "step": 10364, + "time_per_iteration": 2.510781764984131 + }, + { + "auxiliary_loss_clip": 0.06305057, + "auxiliary_loss_mlp": 0.01252144, + "balance_loss_clip": 0.06248282, + "balance_loss_mlp": 0.01250801, + "epoch": 0.6231775139035022, + "flos": 68719513587840.0, + "grad_norm": 0.8659025027753965, + "language_loss": 0.60801929, + "learning_rate": 1.313418851605015e-06, + "loss": 0.68359125, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01345062, + "step": 10365, + "time_per_iteration": 3.1263084411621094 + }, + { + "auxiliary_loss_clip": 0.06424095, + "auxiliary_loss_mlp": 0.0127084, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01257948, + "epoch": 0.6232376371561702, + "flos": 19825903036800.0, + "grad_norm": 1.776687810821879, + "language_loss": 0.75874949, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.83569884, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.12884521, + "step": 10366, + "time_per_iteration": 2.522902488708496 + }, + { + "auxiliary_loss_clip": 0.06416557, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.01257372, + "epoch": 0.6232977604088381, + "flos": 23264969362560.0, + "grad_norm": 1.9573356945915528, + "language_loss": 0.77186829, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.84871918, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11157227, + "step": 10367, + "time_per_iteration": 2.538060426712036 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01268566, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.0125841, + "epoch": 0.6233578836615061, + "flos": 21112907823360.0, + "grad_norm": 1.357507759578204, + "language_loss": 0.78851044, + "learning_rate": 1.312321587418457e-06, + "loss": 0.86528963, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10162354, + "step": 10368, + "time_per_iteration": 2.525911569595337 + }, + { + "auxiliary_loss_clip": 0.06415667, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.0125693, + "epoch": 0.623418006914174, + "flos": 23776266176640.0, + "grad_norm": 1.7380644464591393, + "language_loss": 0.69022548, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.76706004, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10864258, + "step": 10369, + "time_per_iteration": 3.9844348430633545 + }, + { + "auxiliary_loss_clip": 0.06414494, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272612, + "balance_loss_mlp": 0.01253861, + "epoch": 0.6234781301668421, + "flos": 17895556200960.0, + "grad_norm": 1.8898374142824015, + "language_loss": 0.88083899, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.95763862, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1161499, + "step": 10370, + "time_per_iteration": 2.4602532386779785 + }, + { + "auxiliary_loss_clip": 0.06409945, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06271629, + "balance_loss_mlp": 0.01256217, + "epoch": 0.62353825341951, + "flos": 26182424822400.0, + "grad_norm": 1.435666838781933, + "language_loss": 0.66256654, + "learning_rate": 1.311224557923402e-06, + "loss": 0.73933315, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 10371, + "time_per_iteration": 2.585590124130249 + }, + { + "auxiliary_loss_clip": 0.06403823, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.06271943, + "balance_loss_mlp": 0.01254474, + "epoch": 0.623598376672178, + "flos": 31148044358400.0, + "grad_norm": 3.7034450225790962, + "language_loss": 0.77720612, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.85388303, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09405518, + "step": 10372, + "time_per_iteration": 4.1913182735443115 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01254011, + "epoch": 0.6236584999248459, + "flos": 23736588468480.0, + "grad_norm": 1.6658386756111663, + "language_loss": 0.78006816, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.85686696, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11553955, + "step": 10373, + "time_per_iteration": 2.5229697227478027 + }, + { + "auxiliary_loss_clip": 0.06407828, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06271695, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6237186231775139, + "flos": 21769289182080.0, + "grad_norm": 1.5443019053614775, + "language_loss": 0.69842112, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.77516615, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10174561, + "step": 10374, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.06416135, + "auxiliary_loss_mlp": 0.01266815, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256325, + "epoch": 0.6237787464301818, + "flos": 14944795943040.0, + "grad_norm": 1.644641658888945, + "language_loss": 0.77371937, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.85054886, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10491943, + "step": 10375, + "time_per_iteration": 2.4894163608551025 + }, + { + "auxiliary_loss_clip": 0.06411552, + "auxiliary_loss_mlp": 0.01264147, + "balance_loss_clip": 0.06274613, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6238388696828499, + "flos": 35599054844160.0, + "grad_norm": 1.2901779302370762, + "language_loss": 0.70425236, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.78100938, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09753418, + "step": 10376, + "time_per_iteration": 2.6778111457824707 + }, + { + "auxiliary_loss_clip": 0.06417015, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.0125405, + "epoch": 0.6238989929355178, + "flos": 23630343091200.0, + "grad_norm": 1.5935175737828453, + "language_loss": 0.76607609, + "learning_rate": 1.309031204505301e-06, + "loss": 0.84289968, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11315918, + "step": 10377, + "time_per_iteration": 4.115941524505615 + }, + { + "auxiliary_loss_clip": 0.06413018, + "auxiliary_loss_mlp": 0.01268384, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01258442, + "epoch": 0.6239591161881858, + "flos": 22093433902080.0, + "grad_norm": 1.8691726356193223, + "language_loss": 0.67910546, + "learning_rate": 1.308665737227052e-06, + "loss": 0.75591946, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09942627, + "step": 10378, + "time_per_iteration": 2.5460588932037354 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01265408, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01254572, + "epoch": 0.6240192394408538, + "flos": 24542959835520.0, + "grad_norm": 1.7661801800879762, + "language_loss": 0.7668879, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.84367645, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1083374, + "step": 10379, + "time_per_iteration": 2.594383955001831 + }, + { + "auxiliary_loss_clip": 0.06411, + "auxiliary_loss_mlp": 0.01266487, + "balance_loss_clip": 0.0627025, + "balance_loss_mlp": 0.01255723, + "epoch": 0.6240793626935217, + "flos": 27940000538880.0, + "grad_norm": 1.331820718073444, + "language_loss": 0.79390121, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.87067604, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10766602, + "step": 10380, + "time_per_iteration": 2.593872308731079 + }, + { + "auxiliary_loss_clip": 0.06410354, + "auxiliary_loss_mlp": 0.01264738, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6241394859461897, + "flos": 22899008655360.0, + "grad_norm": 1.5236398593874663, + "language_loss": 0.8010897, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.87784058, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1026001, + "step": 10381, + "time_per_iteration": 2.640678882598877 + }, + { + "auxiliary_loss_clip": 0.06414736, + "auxiliary_loss_mlp": 0.01267898, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01257079, + "epoch": 0.6241996091988576, + "flos": 12755781953280.0, + "grad_norm": 1.9060003648467456, + "language_loss": 0.74558902, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.82241541, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10821533, + "step": 10382, + "time_per_iteration": 2.479747772216797 + }, + { + "auxiliary_loss_clip": 0.06410253, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06273764, + "balance_loss_mlp": 0.01258243, + "epoch": 0.6242597324515257, + "flos": 25858867080960.0, + "grad_norm": 1.410036242187738, + "language_loss": 0.78590852, + "learning_rate": 1.306838794344911e-06, + "loss": 0.8626911, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09771729, + "step": 10383, + "time_per_iteration": 2.598404884338379 + }, + { + "auxiliary_loss_clip": 0.06411845, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 0.06273676, + "balance_loss_mlp": 0.01254236, + "epoch": 0.6243198557041936, + "flos": 19943804131200.0, + "grad_norm": 1.7487914543970622, + "language_loss": 0.75636935, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.83312905, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09899902, + "step": 10384, + "time_per_iteration": 2.493638038635254 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06271704, + "balance_loss_mlp": 0.01254353, + "epoch": 0.6243799789568616, + "flos": 18412177749120.0, + "grad_norm": 2.229109392374204, + "language_loss": 0.66725862, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.74403983, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11444092, + "step": 10385, + "time_per_iteration": 2.5185563564300537 + }, + { + "auxiliary_loss_clip": 0.06304897, + "auxiliary_loss_mlp": 0.01254771, + "balance_loss_clip": 0.06248314, + "balance_loss_mlp": 0.01253304, + "epoch": 0.6244401022095295, + "flos": 66048887128320.0, + "grad_norm": 0.7408334865403556, + "language_loss": 0.61911088, + "learning_rate": 1.305742943921692e-06, + "loss": 0.69470763, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01464844, + "step": 10386, + "time_per_iteration": 3.1636085510253906 + }, + { + "auxiliary_loss_clip": 0.06412023, + "auxiliary_loss_mlp": 0.01269919, + "balance_loss_clip": 0.06271843, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6245002254621975, + "flos": 24578109423360.0, + "grad_norm": 2.35418101440168, + "language_loss": 0.71798837, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.79480779, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10387, + "time_per_iteration": 2.5554144382476807 + }, + { + "auxiliary_loss_clip": 0.06417753, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01253399, + "epoch": 0.6245603487148654, + "flos": 29176510440960.0, + "grad_norm": 2.0504228233869886, + "language_loss": 0.65577459, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.73261279, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12664795, + "step": 10388, + "time_per_iteration": 2.5694010257720947 + }, + { + "auxiliary_loss_clip": 0.0641178, + "auxiliary_loss_mlp": 0.0126472, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01255505, + "epoch": 0.6246204719675335, + "flos": 14794805934720.0, + "grad_norm": 1.572723869665335, + "language_loss": 0.79661775, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.87338269, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09216309, + "step": 10389, + "time_per_iteration": 2.497745990753174 + }, + { + "auxiliary_loss_clip": 0.06407995, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06270336, + "balance_loss_mlp": 0.01253928, + "epoch": 0.6246805952202014, + "flos": 12498204902400.0, + "grad_norm": 2.3002980745210384, + "language_loss": 0.60729766, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.68401337, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09643555, + "step": 10390, + "time_per_iteration": 2.47084379196167 + }, + { + "auxiliary_loss_clip": 0.06418662, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06275147, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6247407184728694, + "flos": 12791602373760.0, + "grad_norm": 1.9019889358611486, + "language_loss": 0.77116674, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.84801072, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11181641, + "step": 10391, + "time_per_iteration": 2.5408506393432617 + }, + { + "auxiliary_loss_clip": 0.06416374, + "auxiliary_loss_mlp": 0.0126612, + "balance_loss_clip": 0.06274267, + "balance_loss_mlp": 0.01255165, + "epoch": 0.6248008417255374, + "flos": 40639417822080.0, + "grad_norm": 1.6390307551388046, + "language_loss": 0.64875287, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.72557783, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10943604, + "step": 10392, + "time_per_iteration": 2.7098827362060547 + }, + { + "auxiliary_loss_clip": 0.06416553, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6248609649782053, + "flos": 19908235272960.0, + "grad_norm": 1.9113748677122278, + "language_loss": 0.76920122, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.84603459, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11169434, + "step": 10393, + "time_per_iteration": 2.548680543899536 + }, + { + "auxiliary_loss_clip": 0.0641488, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06272462, + "balance_loss_mlp": 0.01255971, + "epoch": 0.6249210882308733, + "flos": 19688868483840.0, + "grad_norm": 1.752087282406205, + "language_loss": 0.82699966, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.90383279, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12451172, + "step": 10394, + "time_per_iteration": 2.5310568809509277 + }, + { + "auxiliary_loss_clip": 0.064147, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01254721, + "epoch": 0.6249812114835412, + "flos": 13995855653760.0, + "grad_norm": 1.7190801919243177, + "language_loss": 0.75490797, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.83170998, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10784912, + "step": 10395, + "time_per_iteration": 2.5296716690063477 + }, + { + "auxiliary_loss_clip": 0.06417533, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06273706, + "balance_loss_mlp": 0.01256536, + "epoch": 0.6250413347362093, + "flos": 14533916647680.0, + "grad_norm": 2.451423836023636, + "language_loss": 0.73157996, + "learning_rate": 1.302091822487119e-06, + "loss": 0.80843133, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11065674, + "step": 10396, + "time_per_iteration": 2.5183842182159424 + }, + { + "auxiliary_loss_clip": 0.06411869, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.06272602, + "balance_loss_mlp": 0.01255936, + "epoch": 0.6251014579888772, + "flos": 22969098195840.0, + "grad_norm": 1.6502966804998584, + "language_loss": 0.76563799, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.84241736, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10131836, + "step": 10397, + "time_per_iteration": 2.5712759494781494 + }, + { + "auxiliary_loss_clip": 0.06415206, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06274014, + "balance_loss_mlp": 0.0125718, + "epoch": 0.6251615812415452, + "flos": 28118809152000.0, + "grad_norm": 1.853529789472771, + "language_loss": 0.75433117, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.83116138, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10638428, + "step": 10398, + "time_per_iteration": 4.095698595046997 + }, + { + "auxiliary_loss_clip": 0.0641809, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.0627377, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6252217044942131, + "flos": 26731764190080.0, + "grad_norm": 1.615458357588448, + "language_loss": 0.74413693, + "learning_rate": 1.300997001489483e-06, + "loss": 0.82099664, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11419678, + "step": 10399, + "time_per_iteration": 2.5753824710845947 + }, + { + "auxiliary_loss_clip": 0.06412279, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01256287, + "epoch": 0.6252818277468811, + "flos": 20012216590080.0, + "grad_norm": 1.6187380573242784, + "language_loss": 0.74690026, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.82368767, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10180664, + "step": 10400, + "time_per_iteration": 2.5361061096191406 + }, + { + "auxiliary_loss_clip": 0.06307141, + "auxiliary_loss_mlp": 0.01253939, + "balance_loss_clip": 0.06249951, + "balance_loss_mlp": 0.01252542, + "epoch": 0.625341950999549, + "flos": 59298550352640.0, + "grad_norm": 0.8247682302462489, + "language_loss": 0.56403446, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.63964522, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.57373047, + "router_z_loss_mlp": 0.01399231, + "step": 10401, + "time_per_iteration": 3.2024521827697754 + }, + { + "auxiliary_loss_clip": 0.06411454, + "auxiliary_loss_mlp": 0.01264191, + "balance_loss_clip": 0.06270526, + "balance_loss_mlp": 0.01253135, + "epoch": 0.625402074252217, + "flos": 20163296701440.0, + "grad_norm": 1.9270860159318792, + "language_loss": 0.82986021, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.90661669, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1105957, + "step": 10402, + "time_per_iteration": 2.5365302562713623 + }, + { + "auxiliary_loss_clip": 0.06408338, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06270024, + "balance_loss_mlp": 0.0125751, + "epoch": 0.625462197504885, + "flos": 29140228823040.0, + "grad_norm": 1.8928346901761637, + "language_loss": 0.68982589, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.76659, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10565186, + "step": 10403, + "time_per_iteration": 2.582432985305786 + }, + { + "auxiliary_loss_clip": 0.06414935, + "auxiliary_loss_mlp": 0.01268099, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01255791, + "epoch": 0.625522320757553, + "flos": 26111664449280.0, + "grad_norm": 1.458072120324879, + "language_loss": 0.7191205, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.79595077, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.12310791, + "step": 10404, + "time_per_iteration": 2.561168909072876 + }, + { + "auxiliary_loss_clip": 0.06414899, + "auxiliary_loss_mlp": 0.01265432, + "balance_loss_clip": 0.06274525, + "balance_loss_mlp": 0.01254376, + "epoch": 0.625582444010221, + "flos": 20637179867520.0, + "grad_norm": 1.708836006791191, + "language_loss": 0.69769311, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.77449644, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11053467, + "step": 10405, + "time_per_iteration": 2.5165655612945557 + }, + { + "auxiliary_loss_clip": 0.06413669, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06275192, + "balance_loss_mlp": 0.01257332, + "epoch": 0.6256425672628889, + "flos": 20527706108160.0, + "grad_norm": 1.5616382463324912, + "language_loss": 0.79137939, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.86819649, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10693359, + "step": 10406, + "time_per_iteration": 2.526115894317627 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01268249, + "balance_loss_clip": 0.06273726, + "balance_loss_mlp": 0.01257598, + "epoch": 0.6257026905155569, + "flos": 29536182092160.0, + "grad_norm": 1.7875701803121953, + "language_loss": 0.69265002, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.76947975, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10650635, + "step": 10407, + "time_per_iteration": 2.58450984954834 + }, + { + "auxiliary_loss_clip": 0.06403035, + "auxiliary_loss_mlp": 0.01268168, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6257628137682248, + "flos": 24031788802560.0, + "grad_norm": 1.594681235705685, + "language_loss": 0.85355765, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.93026972, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10272217, + "step": 10408, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6258229370208929, + "flos": 20857385197440.0, + "grad_norm": 1.6518363285256767, + "language_loss": 0.7993108, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.87608778, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09997559, + "step": 10409, + "time_per_iteration": 4.006382465362549 + }, + { + "auxiliary_loss_clip": 0.06408045, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.0125475, + "epoch": 0.6258830602735608, + "flos": 22237218708480.0, + "grad_norm": 2.026280584027718, + "language_loss": 0.6951521, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.77188593, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10601807, + "step": 10410, + "time_per_iteration": 2.4960851669311523 + }, + { + "auxiliary_loss_clip": 0.06406428, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01256712, + "epoch": 0.6259431835262288, + "flos": 25082949473280.0, + "grad_norm": 1.7089284959721278, + "language_loss": 0.68380713, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.76053059, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09210205, + "step": 10411, + "time_per_iteration": 2.555173397064209 + }, + { + "auxiliary_loss_clip": 0.06413864, + "auxiliary_loss_mlp": 0.01267605, + "balance_loss_clip": 0.06273196, + "balance_loss_mlp": 0.01256489, + "epoch": 0.6260033067788967, + "flos": 28259072087040.0, + "grad_norm": 1.650436219337463, + "language_loss": 0.70024323, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.77705795, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11114502, + "step": 10412, + "time_per_iteration": 4.113879919052124 + }, + { + "auxiliary_loss_clip": 0.06406923, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01256152, + "epoch": 0.6260634300315647, + "flos": 23374107705600.0, + "grad_norm": 1.4649345950741752, + "language_loss": 0.69805682, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.77478617, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09869385, + "step": 10413, + "time_per_iteration": 2.519340753555298 + }, + { + "auxiliary_loss_clip": 0.06415603, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06269616, + "balance_loss_mlp": 0.01254101, + "epoch": 0.6261235532842326, + "flos": 18040221475200.0, + "grad_norm": 2.973303633857383, + "language_loss": 0.81012505, + "learning_rate": 1.295526482316796e-06, + "loss": 0.88694084, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11877441, + "step": 10414, + "time_per_iteration": 2.5359139442443848 + }, + { + "auxiliary_loss_clip": 0.06411665, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.012545, + "epoch": 0.6261836765369007, + "flos": 22016677962240.0, + "grad_norm": 1.921958755127535, + "language_loss": 0.74850363, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.82527107, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10577393, + "step": 10415, + "time_per_iteration": 2.529327630996704 + }, + { + "auxiliary_loss_clip": 0.06409019, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06273155, + "balance_loss_mlp": 0.01256993, + "epoch": 0.6262437997895686, + "flos": 24942896173440.0, + "grad_norm": 1.4283741323498855, + "language_loss": 0.74384236, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.82060367, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10119629, + "step": 10416, + "time_per_iteration": 2.626948595046997 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06271897, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6263039230422366, + "flos": 31615680395520.0, + "grad_norm": 1.6046151983772523, + "language_loss": 0.84637046, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.92307079, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09814453, + "step": 10417, + "time_per_iteration": 4.062727689743042 + }, + { + "auxiliary_loss_clip": 0.06414269, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_clip": 0.0627402, + "balance_loss_mlp": 0.01253713, + "epoch": 0.6263640462949046, + "flos": 17645232528000.0, + "grad_norm": 2.126036841621572, + "language_loss": 0.57267582, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.6494593, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10357666, + "step": 10418, + "time_per_iteration": 2.5384292602539062 + }, + { + "auxiliary_loss_clip": 0.06423989, + "auxiliary_loss_mlp": 0.01267395, + "balance_loss_clip": 0.06278068, + "balance_loss_mlp": 0.01255629, + "epoch": 0.6264241695475725, + "flos": 19981175852160.0, + "grad_norm": 2.5601033776039688, + "language_loss": 0.85281551, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.92972934, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 10419, + "time_per_iteration": 2.6254498958587646 + }, + { + "auxiliary_loss_clip": 0.0641915, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06276678, + "balance_loss_mlp": 0.01258644, + "epoch": 0.6264842928002405, + "flos": 27351654295680.0, + "grad_norm": 1.7349665783281947, + "language_loss": 0.64790374, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.72479212, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1104126, + "step": 10420, + "time_per_iteration": 2.6838459968566895 + }, + { + "auxiliary_loss_clip": 0.06413981, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.0627203, + "balance_loss_mlp": 0.01257056, + "epoch": 0.6265444160529084, + "flos": 23002989972480.0, + "grad_norm": 1.7751280230906503, + "language_loss": 0.85910356, + "learning_rate": 1.292975627485741e-06, + "loss": 0.93592852, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11462402, + "step": 10421, + "time_per_iteration": 2.502638101577759 + }, + { + "auxiliary_loss_clip": 0.06412976, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06274194, + "balance_loss_mlp": 0.01255454, + "epoch": 0.6266045393055765, + "flos": 19944516890880.0, + "grad_norm": 1.9594550321950581, + "language_loss": 0.79719132, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.87397969, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10406494, + "step": 10422, + "time_per_iteration": 2.506927013397217 + }, + { + "auxiliary_loss_clip": 0.06411508, + "auxiliary_loss_mlp": 0.01266347, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255845, + "epoch": 0.6266646625582444, + "flos": 24395946647040.0, + "grad_norm": 1.5344190640547188, + "language_loss": 0.74784446, + "learning_rate": 1.292247052906389e-06, + "loss": 0.82462305, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10510254, + "step": 10423, + "time_per_iteration": 2.5245227813720703 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01256186, + "epoch": 0.6267247858109124, + "flos": 14689021754880.0, + "grad_norm": 2.220018745384266, + "language_loss": 0.77700025, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.85382849, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10443115, + "step": 10424, + "time_per_iteration": 2.477313756942749 + }, + { + "auxiliary_loss_clip": 0.06416199, + "auxiliary_loss_mlp": 0.0126622, + "balance_loss_clip": 0.06277827, + "balance_loss_mlp": 0.01255139, + "epoch": 0.6267849090635803, + "flos": 24935852430720.0, + "grad_norm": 1.661217463389483, + "language_loss": 0.69195008, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.76877427, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11077881, + "step": 10425, + "time_per_iteration": 2.543240547180176 + }, + { + "auxiliary_loss_clip": 0.06407383, + "auxiliary_loss_mlp": 0.01264995, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.01255232, + "epoch": 0.6268450323162483, + "flos": 25344886936320.0, + "grad_norm": 1.5301783551006911, + "language_loss": 0.74874127, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.82546508, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09765625, + "step": 10426, + "time_per_iteration": 2.541133403778076 + }, + { + "auxiliary_loss_clip": 0.06415579, + "auxiliary_loss_mlp": 0.01266633, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01256268, + "epoch": 0.6269051555689162, + "flos": 26184521174400.0, + "grad_norm": 1.3173967967859561, + "language_loss": 0.80809879, + "learning_rate": 1.290790225914929e-06, + "loss": 0.88492095, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10369873, + "step": 10427, + "time_per_iteration": 2.582977294921875 + }, + { + "auxiliary_loss_clip": 0.06420124, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6269652788215843, + "flos": 18262271594880.0, + "grad_norm": 2.288264071636072, + "language_loss": 0.68539417, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.76226991, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10821533, + "step": 10428, + "time_per_iteration": 2.470303773880005 + }, + { + "auxiliary_loss_clip": 0.06415083, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06275322, + "balance_loss_mlp": 0.01255156, + "epoch": 0.6270254020742522, + "flos": 11770224629760.0, + "grad_norm": 1.7672728863863079, + "language_loss": 0.71438128, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.79118955, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 10429, + "time_per_iteration": 2.4885928630828857 + }, + { + "auxiliary_loss_clip": 0.0641719, + "auxiliary_loss_mlp": 0.01266586, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01254665, + "epoch": 0.6270855253269202, + "flos": 23482114018560.0, + "grad_norm": 1.4192780160361307, + "language_loss": 0.80064285, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.87748063, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11920166, + "step": 10430, + "time_per_iteration": 2.695157766342163 + }, + { + "auxiliary_loss_clip": 0.06316154, + "auxiliary_loss_mlp": 0.01261761, + "balance_loss_clip": 0.0625899, + "balance_loss_mlp": 0.01260201, + "epoch": 0.6271456485795882, + "flos": 70084322490240.0, + "grad_norm": 0.7576452894497838, + "language_loss": 0.59208155, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.66786075, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01556396, + "step": 10431, + "time_per_iteration": 3.2964041233062744 + }, + { + "auxiliary_loss_clip": 0.06312843, + "auxiliary_loss_mlp": 0.01258809, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01257378, + "epoch": 0.6272057718322561, + "flos": 65178673349760.0, + "grad_norm": 0.9858891279415538, + "language_loss": 0.63665617, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.71237266, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01428223, + "step": 10432, + "time_per_iteration": 3.2280328273773193 + }, + { + "auxiliary_loss_clip": 0.06412185, + "auxiliary_loss_mlp": 0.01266828, + "balance_loss_clip": 0.06274938, + "balance_loss_mlp": 0.01256952, + "epoch": 0.6272658950849241, + "flos": 24396240136320.0, + "grad_norm": 1.6010176873941773, + "language_loss": 0.65241134, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.72920156, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09875488, + "step": 10433, + "time_per_iteration": 2.6001501083374023 + }, + { + "auxiliary_loss_clip": 0.06421921, + "auxiliary_loss_mlp": 0.01264381, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01252985, + "epoch": 0.627326018337592, + "flos": 17971515527040.0, + "grad_norm": 2.0859900141473897, + "language_loss": 0.62490857, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.70177162, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1138916, + "step": 10434, + "time_per_iteration": 2.4881582260131836 + }, + { + "auxiliary_loss_clip": 0.06417267, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01255363, + "epoch": 0.6273861415902601, + "flos": 20236321134720.0, + "grad_norm": 1.4988303322096788, + "language_loss": 0.84577382, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.92260414, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10394287, + "step": 10435, + "time_per_iteration": 2.508821487426758 + }, + { + "auxiliary_loss_clip": 0.06310409, + "auxiliary_loss_mlp": 0.01254017, + "balance_loss_clip": 0.06253147, + "balance_loss_mlp": 0.0125247, + "epoch": 0.627446264842928, + "flos": 64971605911680.0, + "grad_norm": 0.7140995203776986, + "language_loss": 0.6143651, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.69000936, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.01544952, + "step": 10436, + "time_per_iteration": 3.1841728687286377 + }, + { + "auxiliary_loss_clip": 0.06415884, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.06275365, + "balance_loss_mlp": 0.01259635, + "epoch": 0.627506388095596, + "flos": 23590623456000.0, + "grad_norm": 1.4165717499809394, + "language_loss": 0.77800572, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.8548739, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11303711, + "step": 10437, + "time_per_iteration": 2.5377817153930664 + }, + { + "auxiliary_loss_clip": 0.06309696, + "auxiliary_loss_mlp": 0.01252859, + "balance_loss_clip": 0.06252521, + "balance_loss_mlp": 0.01251612, + "epoch": 0.6275665113482639, + "flos": 67603043059200.0, + "grad_norm": 0.7073778525823976, + "language_loss": 0.54094195, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.61656755, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.57226562, + "router_z_loss_mlp": 0.01247406, + "step": 10438, + "time_per_iteration": 4.560008764266968 + }, + { + "auxiliary_loss_clip": 0.06412268, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06273399, + "balance_loss_mlp": 0.01255569, + "epoch": 0.6276266346009319, + "flos": 27644422861440.0, + "grad_norm": 1.692810124153385, + "language_loss": 0.84027016, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.91705996, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11138916, + "step": 10439, + "time_per_iteration": 2.5736849308013916 + }, + { + "auxiliary_loss_clip": 0.06415922, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6276867578535998, + "flos": 22752540518400.0, + "grad_norm": 2.0302945438571047, + "language_loss": 0.80827779, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.88509905, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 10440, + "time_per_iteration": 2.5353291034698486 + }, + { + "auxiliary_loss_clip": 0.06405526, + "auxiliary_loss_mlp": 0.01265635, + "balance_loss_clip": 0.0627224, + "balance_loss_mlp": 0.01256241, + "epoch": 0.6277468811062679, + "flos": 24651050002560.0, + "grad_norm": 1.4466963642107937, + "language_loss": 0.74692273, + "learning_rate": 1.285694725799337e-06, + "loss": 0.82363433, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09393311, + "step": 10441, + "time_per_iteration": 2.5965688228607178 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01267405, + "balance_loss_clip": 0.06272199, + "balance_loss_mlp": 0.01256932, + "epoch": 0.6278070043589358, + "flos": 19684466144640.0, + "grad_norm": 1.738690700547975, + "language_loss": 0.72243971, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.79921579, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 10442, + "time_per_iteration": 2.5236124992370605 + }, + { + "auxiliary_loss_clip": 0.06413672, + "auxiliary_loss_mlp": 0.01264225, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254443, + "epoch": 0.6278671276116038, + "flos": 22127451459840.0, + "grad_norm": 1.5746919411428797, + "language_loss": 0.71842909, + "learning_rate": 1.284967229712762e-06, + "loss": 0.7952081, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09783936, + "step": 10443, + "time_per_iteration": 2.523799419403076 + }, + { + "auxiliary_loss_clip": 0.06411857, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272158, + "balance_loss_mlp": 0.01255412, + "epoch": 0.6279272508642717, + "flos": 23045099448960.0, + "grad_norm": 2.0032164077839787, + "language_loss": 0.73292875, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.80970454, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10302734, + "step": 10444, + "time_per_iteration": 2.557166337966919 + }, + { + "auxiliary_loss_clip": 0.06410734, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.0627318, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6279873741169397, + "flos": 19829466835200.0, + "grad_norm": 2.156521717901959, + "language_loss": 0.72276205, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.79953271, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 10445, + "time_per_iteration": 2.526127815246582 + }, + { + "auxiliary_loss_clip": 0.06412753, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01254601, + "epoch": 0.6280474973696077, + "flos": 23922273116160.0, + "grad_norm": 1.5888677783518865, + "language_loss": 0.69281161, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.76959556, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1104126, + "step": 10446, + "time_per_iteration": 2.530104637145996 + }, + { + "auxiliary_loss_clip": 0.06423883, + "auxiliary_loss_mlp": 0.01267771, + "balance_loss_clip": 0.06276697, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6281076206222757, + "flos": 17973821514240.0, + "grad_norm": 1.8539120492479848, + "language_loss": 0.73894954, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.81586611, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1114502, + "step": 10447, + "time_per_iteration": 2.4985270500183105 + }, + { + "auxiliary_loss_clip": 0.06304939, + "auxiliary_loss_mlp": 0.01257491, + "balance_loss_clip": 0.06248139, + "balance_loss_mlp": 0.01256266, + "epoch": 0.6281677438749437, + "flos": 66797216743680.0, + "grad_norm": 0.6871055611916008, + "language_loss": 0.51990867, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.59553301, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01223755, + "step": 10448, + "time_per_iteration": 4.437039136886597 + }, + { + "auxiliary_loss_clip": 0.0641938, + "auxiliary_loss_mlp": 0.01268052, + "balance_loss_clip": 0.0627671, + "balance_loss_mlp": 0.01256346, + "epoch": 0.6282278671276116, + "flos": 11661002432640.0, + "grad_norm": 1.9501627229016425, + "language_loss": 0.91483194, + "learning_rate": 1.282785392633079e-06, + "loss": 0.99170625, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1171875, + "step": 10449, + "time_per_iteration": 2.5085034370422363 + }, + { + "auxiliary_loss_clip": 0.06415906, + "auxiliary_loss_mlp": 0.01270346, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01260452, + "epoch": 0.6282879903802796, + "flos": 42751550090880.0, + "grad_norm": 1.4186227693043074, + "language_loss": 0.60281998, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.67968249, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09893799, + "step": 10450, + "time_per_iteration": 2.6810834407806396 + }, + { + "auxiliary_loss_clip": 0.06408551, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06272364, + "balance_loss_mlp": 0.01256269, + "epoch": 0.6283481136329475, + "flos": 20015067628800.0, + "grad_norm": 1.5189772221694435, + "language_loss": 0.77163285, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.8483901, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10906982, + "step": 10451, + "time_per_iteration": 2.5098116397857666 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06274851, + "balance_loss_mlp": 0.01254652, + "epoch": 0.6284082368856155, + "flos": 21910264876800.0, + "grad_norm": 1.4797334153303925, + "language_loss": 0.77516776, + "learning_rate": 1.281694841064566e-06, + "loss": 0.85199118, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11346436, + "step": 10452, + "time_per_iteration": 4.029058933258057 + }, + { + "auxiliary_loss_clip": 0.06413398, + "auxiliary_loss_mlp": 0.01268188, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01257173, + "epoch": 0.6284683601382834, + "flos": 25491313146240.0, + "grad_norm": 1.654591158178899, + "language_loss": 0.72948235, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.8062982, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11029053, + "step": 10453, + "time_per_iteration": 2.542074680328369 + }, + { + "auxiliary_loss_clip": 0.06415626, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.0627359, + "balance_loss_mlp": 0.01253268, + "epoch": 0.6285284833909515, + "flos": 16543241556480.0, + "grad_norm": 1.6231177337896328, + "language_loss": 0.80777168, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.88457304, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11236572, + "step": 10454, + "time_per_iteration": 2.5263936519622803 + }, + { + "auxiliary_loss_clip": 0.06409679, + "auxiliary_loss_mlp": 0.0127067, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01260728, + "epoch": 0.6285886066436194, + "flos": 22827367814400.0, + "grad_norm": 1.7338027562142968, + "language_loss": 0.82249027, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.89929378, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09936523, + "step": 10455, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.06415103, + "auxiliary_loss_mlp": 0.01264745, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254225, + "epoch": 0.6286487298962874, + "flos": 24722355427200.0, + "grad_norm": 1.4932136487879293, + "language_loss": 0.82079554, + "learning_rate": 1.280241153705706e-06, + "loss": 0.89759403, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10516357, + "step": 10456, + "time_per_iteration": 2.561309814453125 + }, + { + "auxiliary_loss_clip": 0.06420746, + "auxiliary_loss_mlp": 0.01268645, + "balance_loss_clip": 0.06275859, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6287088531489553, + "flos": 20747114824320.0, + "grad_norm": 1.4461153744951818, + "language_loss": 0.72119695, + "learning_rate": 1.27987780006486e-06, + "loss": 0.79809082, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11022949, + "step": 10457, + "time_per_iteration": 3.957395076751709 + }, + { + "auxiliary_loss_clip": 0.06422028, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01253316, + "epoch": 0.6287689764016233, + "flos": 23076433676160.0, + "grad_norm": 1.6277999457875445, + "language_loss": 0.79939413, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.8762598, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11224365, + "step": 10458, + "time_per_iteration": 2.5144598484039307 + }, + { + "auxiliary_loss_clip": 0.06420826, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06276783, + "balance_loss_mlp": 0.01261081, + "epoch": 0.6288290996542913, + "flos": 32241859557120.0, + "grad_norm": 1.5510176438747023, + "language_loss": 0.61428088, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.69121122, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11120605, + "step": 10459, + "time_per_iteration": 2.673271894454956 + }, + { + "auxiliary_loss_clip": 0.06418507, + "auxiliary_loss_mlp": 0.01266867, + "balance_loss_clip": 0.06276773, + "balance_loss_mlp": 0.01256066, + "epoch": 0.6288892229069593, + "flos": 24647695839360.0, + "grad_norm": 1.5279768291149622, + "language_loss": 0.79008341, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.86693716, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10809326, + "step": 10460, + "time_per_iteration": 2.5390427112579346 + }, + { + "auxiliary_loss_clip": 0.06411569, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6289493461596273, + "flos": 17864138119680.0, + "grad_norm": 1.9201849344746347, + "language_loss": 0.73887581, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.81565541, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10089111, + "step": 10461, + "time_per_iteration": 2.524601459503174 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01263734, + "balance_loss_clip": 0.06275996, + "balance_loss_mlp": 0.01253637, + "epoch": 0.6290094694122952, + "flos": 22351807566720.0, + "grad_norm": 1.8529909730554852, + "language_loss": 0.70305121, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.77980262, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10101318, + "step": 10462, + "time_per_iteration": 2.5161097049713135 + }, + { + "auxiliary_loss_clip": 0.06407323, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01254224, + "epoch": 0.6290695926649632, + "flos": 28409942563200.0, + "grad_norm": 1.9398923730208482, + "language_loss": 0.72176754, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.79847741, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09442139, + "step": 10463, + "time_per_iteration": 2.579223394393921 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271723, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01261751, + "epoch": 0.6291297159176311, + "flos": 21511628277120.0, + "grad_norm": 1.539324014350412, + "language_loss": 0.7288208, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.80563188, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09967041, + "step": 10464, + "time_per_iteration": 2.494276762008667 + }, + { + "auxiliary_loss_clip": 0.06412283, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01256969, + "epoch": 0.6291898391702991, + "flos": 12208203521280.0, + "grad_norm": 1.7590102978799784, + "language_loss": 0.69385099, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.77063811, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09466553, + "step": 10465, + "time_per_iteration": 2.5754034519195557 + }, + { + "auxiliary_loss_clip": 0.06303164, + "auxiliary_loss_mlp": 0.01258656, + "balance_loss_clip": 0.06246626, + "balance_loss_mlp": 0.01257341, + "epoch": 0.629249962422967, + "flos": 69319347840000.0, + "grad_norm": 0.6721611616517246, + "language_loss": 0.59656096, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.67217922, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01316833, + "step": 10466, + "time_per_iteration": 3.231010913848877 + }, + { + "auxiliary_loss_clip": 0.0640944, + "auxiliary_loss_mlp": 0.01262544, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.01253305, + "epoch": 0.6293100856756351, + "flos": 40087353196800.0, + "grad_norm": 2.1464377164547916, + "language_loss": 0.64920712, + "learning_rate": 1.276245767820154e-06, + "loss": 0.72592694, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09234619, + "step": 10467, + "time_per_iteration": 2.7820122241973877 + }, + { + "auxiliary_loss_clip": 0.06300975, + "auxiliary_loss_mlp": 0.01258806, + "balance_loss_clip": 0.06244308, + "balance_loss_mlp": 0.01257555, + "epoch": 0.629370208928303, + "flos": 67518907960320.0, + "grad_norm": 0.7784779642706487, + "language_loss": 0.56803113, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.64362895, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01250458, + "step": 10468, + "time_per_iteration": 2.934441089630127 + }, + { + "auxiliary_loss_clip": 0.06299016, + "auxiliary_loss_mlp": 0.0125297, + "balance_loss_clip": 0.06242396, + "balance_loss_mlp": 0.01251782, + "epoch": 0.629430332180971, + "flos": 60680228653440.0, + "grad_norm": 0.7475097067157215, + "language_loss": 0.57685459, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.65237445, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01186371, + "step": 10469, + "time_per_iteration": 3.097425699234009 + }, + { + "auxiliary_loss_clip": 0.06301235, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.0624446, + "balance_loss_mlp": 0.01250373, + "epoch": 0.6294904554336389, + "flos": 66891707821440.0, + "grad_norm": 0.675756451414952, + "language_loss": 0.5208174, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.59634632, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01277924, + "step": 10470, + "time_per_iteration": 3.224271774291992 + }, + { + "auxiliary_loss_clip": 0.06409313, + "auxiliary_loss_mlp": 0.01268407, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01258322, + "epoch": 0.6295505786863069, + "flos": 42532728353280.0, + "grad_norm": 1.628220195821946, + "language_loss": 0.75025994, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.8270371, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10076904, + "step": 10471, + "time_per_iteration": 2.7104806900024414 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01253689, + "epoch": 0.629610701938975, + "flos": 17389877610240.0, + "grad_norm": 1.7371618192940372, + "language_loss": 0.63321209, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.71001846, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10003662, + "step": 10472, + "time_per_iteration": 2.51810884475708 + }, + { + "auxiliary_loss_clip": 0.06414427, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06273856, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6296708251916429, + "flos": 24249730072320.0, + "grad_norm": 1.5892163482922788, + "language_loss": 0.69503713, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.77184302, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 10473, + "time_per_iteration": 2.5234594345092773 + }, + { + "auxiliary_loss_clip": 0.06411944, + "auxiliary_loss_mlp": 0.0126239, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6297309484443109, + "flos": 19284110536320.0, + "grad_norm": 1.4968676246915393, + "language_loss": 0.74922514, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.8259685, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09899902, + "step": 10474, + "time_per_iteration": 2.581749200820923 + }, + { + "auxiliary_loss_clip": 0.06412183, + "auxiliary_loss_mlp": 0.01264808, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6297910716969788, + "flos": 30670261977600.0, + "grad_norm": 1.6340326591826166, + "language_loss": 0.66562986, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.74239981, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10253906, + "step": 10475, + "time_per_iteration": 2.6167984008789062 + }, + { + "auxiliary_loss_clip": 0.06403632, + "auxiliary_loss_mlp": 0.01261865, + "balance_loss_clip": 0.06270278, + "balance_loss_mlp": 0.01252107, + "epoch": 0.6298511949496468, + "flos": 14427293927040.0, + "grad_norm": 1.8082220709351975, + "language_loss": 0.90615106, + "learning_rate": 1.272979284940101e-06, + "loss": 0.98280615, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09759521, + "step": 10476, + "time_per_iteration": 2.5575828552246094 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01271614, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01261285, + "epoch": 0.6299113182023147, + "flos": 23520995112960.0, + "grad_norm": 1.6129960695216716, + "language_loss": 0.75463134, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.83147454, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10321045, + "step": 10477, + "time_per_iteration": 4.080779314041138 + }, + { + "auxiliary_loss_clip": 0.06409407, + "auxiliary_loss_mlp": 0.01263638, + "balance_loss_clip": 0.06271356, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6299714414549827, + "flos": 22681109312640.0, + "grad_norm": 1.9893759064975287, + "language_loss": 0.70635891, + "learning_rate": 1.272253702758138e-06, + "loss": 0.7830894, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10717773, + "step": 10478, + "time_per_iteration": 2.526340961456299 + }, + { + "auxiliary_loss_clip": 0.06415921, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01256419, + "epoch": 0.6300315647076506, + "flos": 14506984759680.0, + "grad_norm": 2.55864896023097, + "language_loss": 0.6816293, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.75846004, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10742188, + "step": 10479, + "time_per_iteration": 2.5156965255737305 + }, + { + "auxiliary_loss_clip": 0.06411125, + "auxiliary_loss_mlp": 0.01264946, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01254188, + "epoch": 0.6300916879603187, + "flos": 21878134035840.0, + "grad_norm": 1.462422599280115, + "language_loss": 0.73846787, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.81522858, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10760498, + "step": 10480, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06412197, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01253141, + "epoch": 0.6301518112129866, + "flos": 21840301117440.0, + "grad_norm": 1.7175758648379602, + "language_loss": 0.78970373, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.86646283, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10571289, + "step": 10481, + "time_per_iteration": 2.60512638092041 + }, + { + "auxiliary_loss_clip": 0.06303924, + "auxiliary_loss_mlp": 0.01252426, + "balance_loss_clip": 0.06247687, + "balance_loss_mlp": 0.01251297, + "epoch": 0.6302119344656546, + "flos": 44348429675520.0, + "grad_norm": 0.8754005674495109, + "language_loss": 0.61759591, + "learning_rate": 1.2708028696588e-06, + "loss": 0.69315946, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01131439, + "step": 10482, + "time_per_iteration": 2.8790156841278076 + }, + { + "auxiliary_loss_clip": 0.06422234, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.0627502, + "balance_loss_mlp": 0.01259125, + "epoch": 0.6302720577183225, + "flos": 11222604270720.0, + "grad_norm": 1.8532441203732761, + "language_loss": 0.82836294, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.90529174, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 10483, + "time_per_iteration": 2.5396814346313477 + }, + { + "auxiliary_loss_clip": 0.06401882, + "auxiliary_loss_mlp": 0.01265558, + "balance_loss_clip": 0.06271434, + "balance_loss_mlp": 0.01255873, + "epoch": 0.6303321809709905, + "flos": 27972424869120.0, + "grad_norm": 1.7223788623313236, + "language_loss": 0.72617853, + "learning_rate": 1.270077618961487e-06, + "loss": 0.80285299, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09680176, + "step": 10484, + "time_per_iteration": 2.580455780029297 + }, + { + "auxiliary_loss_clip": 0.06412905, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06272406, + "balance_loss_mlp": 0.01254804, + "epoch": 0.6303923042236586, + "flos": 28228366765440.0, + "grad_norm": 1.5965857276488986, + "language_loss": 0.74397701, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.82075489, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10064697, + "step": 10485, + "time_per_iteration": 2.5941050052642822 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01266454, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255552, + "epoch": 0.6304524274763265, + "flos": 27637546826880.0, + "grad_norm": 2.046844751133349, + "language_loss": 0.81281161, + "learning_rate": 1.269352478979093e-06, + "loss": 0.88966042, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10906982, + "step": 10486, + "time_per_iteration": 2.558913469314575 + }, + { + "auxiliary_loss_clip": 0.06410582, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01254617, + "epoch": 0.6305125507289945, + "flos": 17317062812160.0, + "grad_norm": 2.0599224612771923, + "language_loss": 0.6412251, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.71797758, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1005249, + "step": 10487, + "time_per_iteration": 2.5042107105255127 + }, + { + "auxiliary_loss_clip": 0.06409851, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.01261816, + "epoch": 0.6305726739816624, + "flos": 25814745106560.0, + "grad_norm": 1.4604670858512163, + "language_loss": 0.67510849, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.75192171, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09655762, + "step": 10488, + "time_per_iteration": 4.039014101028442 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01266449, + "balance_loss_clip": 0.06271197, + "balance_loss_mlp": 0.01255827, + "epoch": 0.6306327972343304, + "flos": 21803684083200.0, + "grad_norm": 1.7399651792203026, + "language_loss": 0.67476416, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.75152779, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10620117, + "step": 10489, + "time_per_iteration": 2.522010564804077 + }, + { + "auxiliary_loss_clip": 0.06421866, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.0125256, + "epoch": 0.6306929204869983, + "flos": 20783689931520.0, + "grad_norm": 1.8067939569631877, + "language_loss": 0.69957733, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.77643645, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11486816, + "step": 10490, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01267822, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01257123, + "epoch": 0.6307530437396663, + "flos": 23660084090880.0, + "grad_norm": 1.7944305121470099, + "language_loss": 0.78453183, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.86131787, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10699463, + "step": 10491, + "time_per_iteration": 3.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264458, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253753, + "epoch": 0.6308131669923343, + "flos": 24726170787840.0, + "grad_norm": 2.4094216465826914, + "language_loss": 0.55782068, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.63457322, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10705566, + "step": 10492, + "time_per_iteration": 2.5553138256073 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06272843, + "balance_loss_mlp": 0.01254772, + "epoch": 0.6308732902450023, + "flos": 22572054823680.0, + "grad_norm": 2.1354270064325935, + "language_loss": 0.64787519, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.72465986, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10620117, + "step": 10493, + "time_per_iteration": 2.532414197921753 + }, + { + "auxiliary_loss_clip": 0.06409045, + "auxiliary_loss_mlp": 0.01266138, + "balance_loss_clip": 0.06271107, + "balance_loss_mlp": 0.01255797, + "epoch": 0.6309334134976702, + "flos": 24651050002560.0, + "grad_norm": 1.3969800101414371, + "language_loss": 0.82710558, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.90385741, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10345459, + "step": 10494, + "time_per_iteration": 2.5479516983032227 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01270884, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01260691, + "epoch": 0.6309935367503382, + "flos": 41437655343360.0, + "grad_norm": 1.6454448829725794, + "language_loss": 0.79526448, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.87208128, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10192871, + "step": 10495, + "time_per_iteration": 2.705066204071045 + }, + { + "auxiliary_loss_clip": 0.06412271, + "auxiliary_loss_mlp": 0.01267403, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01257019, + "epoch": 0.6310536600030061, + "flos": 15123772264320.0, + "grad_norm": 1.7689443425086426, + "language_loss": 0.70583153, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.78262818, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1038208, + "step": 10496, + "time_per_iteration": 2.4985408782958984 + }, + { + "auxiliary_loss_clip": 0.06412859, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06273797, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6311137832556741, + "flos": 15237019457280.0, + "grad_norm": 3.784046746171531, + "language_loss": 0.80308318, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.879884, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11016846, + "step": 10497, + "time_per_iteration": 3.934098243713379 + }, + { + "auxiliary_loss_clip": 0.06407946, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06270217, + "balance_loss_mlp": 0.01254495, + "epoch": 0.6311739065083422, + "flos": 22025314932480.0, + "grad_norm": 1.763173694901495, + "language_loss": 0.7404235, + "learning_rate": 1.265003970256247e-06, + "loss": 0.81713974, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09185791, + "step": 10498, + "time_per_iteration": 2.499866485595703 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01257174, + "epoch": 0.6312340297610101, + "flos": 22717349003520.0, + "grad_norm": 2.1933614541595543, + "language_loss": 0.70156991, + "learning_rate": 1.264641775364217e-06, + "loss": 0.77835166, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10308838, + "step": 10499, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06406461, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6312941530136781, + "flos": 24287017939200.0, + "grad_norm": 1.829578685045339, + "language_loss": 0.69904381, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.77578032, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 10500, + "time_per_iteration": 2.5188052654266357 + }, + { + "auxiliary_loss_clip": 0.06409658, + "auxiliary_loss_mlp": 0.01264556, + "balance_loss_clip": 0.06272549, + "balance_loss_mlp": 0.01254412, + "epoch": 0.631354276266346, + "flos": 21732420585600.0, + "grad_norm": 1.7241647945677354, + "language_loss": 0.74330127, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.82004339, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10144043, + "step": 10501, + "time_per_iteration": 2.5523152351379395 + }, + { + "auxiliary_loss_clip": 0.06406975, + "auxiliary_loss_mlp": 0.01265441, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254331, + "epoch": 0.631414399519014, + "flos": 24032040364800.0, + "grad_norm": 1.6086243864849348, + "language_loss": 0.75708318, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.83380735, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11102295, + "step": 10502, + "time_per_iteration": 2.531738519668579 + }, + { + "auxiliary_loss_clip": 0.06415547, + "auxiliary_loss_mlp": 0.01269255, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01258026, + "epoch": 0.6314745227716819, + "flos": 24322586797440.0, + "grad_norm": 1.857189484196882, + "language_loss": 0.85481834, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.93166631, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11230469, + "step": 10503, + "time_per_iteration": 2.552402973175049 + }, + { + "auxiliary_loss_clip": 0.06410381, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06270783, + "balance_loss_mlp": 0.01254061, + "epoch": 0.6315346460243499, + "flos": 23372891821440.0, + "grad_norm": 1.6307573056927078, + "language_loss": 0.86482477, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.94157171, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10253906, + "step": 10504, + "time_per_iteration": 2.5060269832611084 + }, + { + "auxiliary_loss_clip": 0.064176, + "auxiliary_loss_mlp": 0.01268121, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01257142, + "epoch": 0.6315947692770179, + "flos": 20265517082880.0, + "grad_norm": 1.678620058857516, + "language_loss": 0.76972538, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.84658259, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10974121, + "step": 10505, + "time_per_iteration": 2.5305702686309814 + }, + { + "auxiliary_loss_clip": 0.06409689, + "auxiliary_loss_mlp": 0.01264983, + "balance_loss_clip": 0.06271394, + "balance_loss_mlp": 0.01254647, + "epoch": 0.6316548925296859, + "flos": 25273036460160.0, + "grad_norm": 1.9130295201566025, + "language_loss": 0.82312322, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.89986992, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10339355, + "step": 10506, + "time_per_iteration": 2.5286946296691895 + }, + { + "auxiliary_loss_clip": 0.06409711, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.01253164, + "epoch": 0.6317150157823538, + "flos": 22937344698240.0, + "grad_norm": 1.904699510430935, + "language_loss": 0.74647379, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.82321376, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11120605, + "step": 10507, + "time_per_iteration": 2.5269975662231445 + }, + { + "auxiliary_loss_clip": 0.06414819, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01254876, + "epoch": 0.6317751390350218, + "flos": 22533383364480.0, + "grad_norm": 1.9107193302266279, + "language_loss": 0.68296039, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.75977188, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11456299, + "step": 10508, + "time_per_iteration": 2.522627830505371 + }, + { + "auxiliary_loss_clip": 0.06408058, + "auxiliary_loss_mlp": 0.01267063, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.0125662, + "epoch": 0.6318352622876897, + "flos": 23301460615680.0, + "grad_norm": 1.6343142360187424, + "language_loss": 0.70864749, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.78539872, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10449219, + "step": 10509, + "time_per_iteration": 2.542271614074707 + }, + { + "auxiliary_loss_clip": 0.06404234, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06269038, + "balance_loss_mlp": 0.01257901, + "epoch": 0.6318953855403577, + "flos": 20710330081920.0, + "grad_norm": 1.5692460316561092, + "language_loss": 0.79883605, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.87555748, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10003662, + "step": 10510, + "time_per_iteration": 2.5088951587677 + }, + { + "auxiliary_loss_clip": 0.06416003, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.0627503, + "balance_loss_mlp": 0.01255627, + "epoch": 0.6319555087930258, + "flos": 22826613127680.0, + "grad_norm": 1.472787804562701, + "language_loss": 0.71112996, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.78795499, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10870361, + "step": 10511, + "time_per_iteration": 2.5239315032958984 + }, + { + "auxiliary_loss_clip": 0.06404155, + "auxiliary_loss_mlp": 0.01264501, + "balance_loss_clip": 0.06270795, + "balance_loss_mlp": 0.01254863, + "epoch": 0.6320156320456937, + "flos": 19976480023680.0, + "grad_norm": 1.5136926076294552, + "language_loss": 0.80152798, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.87821454, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09631348, + "step": 10512, + "time_per_iteration": 2.4730801582336426 + }, + { + "auxiliary_loss_clip": 0.06412748, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06273789, + "balance_loss_mlp": 0.01258599, + "epoch": 0.6320757552983617, + "flos": 27020256197760.0, + "grad_norm": 1.640445181436539, + "language_loss": 0.71047747, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.7873019, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11090088, + "step": 10513, + "time_per_iteration": 2.554516077041626 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06272392, + "balance_loss_mlp": 0.01255527, + "epoch": 0.6321358785510296, + "flos": 23702696691840.0, + "grad_norm": 1.6086341634408383, + "language_loss": 0.67001855, + "learning_rate": 1.259212205855459e-06, + "loss": 0.74681789, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10394287, + "step": 10514, + "time_per_iteration": 2.519026517868042 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01256491, + "epoch": 0.6321960018036976, + "flos": 26002484179200.0, + "grad_norm": 1.6426182718028832, + "language_loss": 0.74301624, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.81975299, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09851074, + "step": 10515, + "time_per_iteration": 2.6021077632904053 + }, + { + "auxiliary_loss_clip": 0.06406167, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01256054, + "epoch": 0.6322561250563655, + "flos": 22827745157760.0, + "grad_norm": 1.6516346518134952, + "language_loss": 0.90002799, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.9767465, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09631348, + "step": 10516, + "time_per_iteration": 3.9120290279388428 + }, + { + "auxiliary_loss_clip": 0.0641951, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06273714, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6323162483090335, + "flos": 18994234936320.0, + "grad_norm": 1.6653274793264599, + "language_loss": 0.81976604, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.89662409, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12084961, + "step": 10517, + "time_per_iteration": 2.478886127471924 + }, + { + "auxiliary_loss_clip": 0.06409353, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06273272, + "balance_loss_mlp": 0.01254312, + "epoch": 0.6323763715617015, + "flos": 19871324749440.0, + "grad_norm": 1.77487902385547, + "language_loss": 0.77740157, + "learning_rate": 1.257765386189541e-06, + "loss": 0.8541342, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0960083, + "step": 10518, + "time_per_iteration": 2.529668092727661 + }, + { + "auxiliary_loss_clip": 0.06409025, + "auxiliary_loss_mlp": 0.01262964, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01253475, + "epoch": 0.6324364948143695, + "flos": 22789115625600.0, + "grad_norm": 1.399689960822604, + "language_loss": 0.85268837, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.92940825, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0949707, + "step": 10519, + "time_per_iteration": 2.5316224098205566 + }, + { + "auxiliary_loss_clip": 0.06407413, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06274519, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6324966180670374, + "flos": 22242333807360.0, + "grad_norm": 1.7591221317630206, + "language_loss": 0.7227571, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.79947662, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09454346, + "step": 10520, + "time_per_iteration": 2.593050479888916 + }, + { + "auxiliary_loss_clip": 0.06409709, + "auxiliary_loss_mlp": 0.01264525, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6325567413197054, + "flos": 21695593916160.0, + "grad_norm": 1.8135575738100813, + "language_loss": 0.71838474, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.79512703, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10119629, + "step": 10521, + "time_per_iteration": 2.5069823265075684 + }, + { + "auxiliary_loss_clip": 0.06414442, + "auxiliary_loss_mlp": 0.01265675, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01255018, + "epoch": 0.6326168645723733, + "flos": 19943133298560.0, + "grad_norm": 1.6828366730110347, + "language_loss": 0.7199434, + "learning_rate": 1.256319016853377e-06, + "loss": 0.79674459, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10662842, + "step": 10522, + "time_per_iteration": 2.6152310371398926 + }, + { + "auxiliary_loss_clip": 0.06406049, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256897, + "epoch": 0.6326769878250413, + "flos": 20236614624000.0, + "grad_norm": 1.7290468863072455, + "language_loss": 0.8156153, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.89234209, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09735107, + "step": 10523, + "time_per_iteration": 2.5101752281188965 + }, + { + "auxiliary_loss_clip": 0.06411799, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 0.06276074, + "balance_loss_mlp": 0.01256669, + "epoch": 0.6327371110777094, + "flos": 20781803214720.0, + "grad_norm": 1.7543720010709223, + "language_loss": 0.73841488, + "learning_rate": 1.255596001333195e-06, + "loss": 0.81519485, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09527588, + "step": 10524, + "time_per_iteration": 2.5357463359832764 + }, + { + "auxiliary_loss_clip": 0.06421272, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01258977, + "epoch": 0.6327972343303773, + "flos": 30344440176000.0, + "grad_norm": 2.100184187405554, + "language_loss": 0.84972739, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.92663497, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10510254, + "step": 10525, + "time_per_iteration": 2.579566478729248 + }, + { + "auxiliary_loss_clip": 0.06407693, + "auxiliary_loss_mlp": 0.01265026, + "balance_loss_clip": 0.06272401, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6328573575830453, + "flos": 17097947585280.0, + "grad_norm": 1.5662936390284432, + "language_loss": 0.67044812, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.74717528, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09393311, + "step": 10526, + "time_per_iteration": 2.6565749645233154 + }, + { + "auxiliary_loss_clip": 0.06418256, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06276698, + "balance_loss_mlp": 0.01254002, + "epoch": 0.6329174808357132, + "flos": 25054340503680.0, + "grad_norm": 1.744260985628437, + "language_loss": 0.73593014, + "learning_rate": 1.254511689796244e-06, + "loss": 0.81276095, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10821533, + "step": 10527, + "time_per_iteration": 4.000992298126221 + }, + { + "auxiliary_loss_clip": 0.06408013, + "auxiliary_loss_mlp": 0.01264369, + "balance_loss_clip": 0.062744, + "balance_loss_mlp": 0.01255124, + "epoch": 0.6329776040883812, + "flos": 16842466886400.0, + "grad_norm": 2.0238254127026347, + "language_loss": 0.72017205, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.79689586, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 10528, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06410971, + "auxiliary_loss_mlp": 0.01266595, + "balance_loss_clip": 0.0627386, + "balance_loss_mlp": 0.01256647, + "epoch": 0.6330377273410491, + "flos": 13521804779520.0, + "grad_norm": 2.0709634573058966, + "language_loss": 0.67286944, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.74964511, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.0994873, + "step": 10529, + "time_per_iteration": 2.506375551223755 + }, + { + "auxiliary_loss_clip": 0.06417675, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06276476, + "balance_loss_mlp": 0.0125486, + "epoch": 0.6330978505937171, + "flos": 21544471877760.0, + "grad_norm": 1.8153408645192133, + "language_loss": 0.75284207, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.82967925, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11181641, + "step": 10530, + "time_per_iteration": 4.016285419464111 + }, + { + "auxiliary_loss_clip": 0.06421702, + "auxiliary_loss_mlp": 0.01265839, + "balance_loss_clip": 0.06280397, + "balance_loss_mlp": 0.01255557, + "epoch": 0.6331579738463851, + "flos": 25016465658240.0, + "grad_norm": 1.412209042537855, + "language_loss": 0.74000126, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.81687671, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10284424, + "step": 10531, + "time_per_iteration": 2.5478739738464355 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01257093, + "epoch": 0.6332180970990531, + "flos": 14981329123200.0, + "grad_norm": 4.395160978524889, + "language_loss": 0.80356932, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.88033861, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0980835, + "step": 10532, + "time_per_iteration": 2.4629757404327393 + }, + { + "auxiliary_loss_clip": 0.06411614, + "auxiliary_loss_mlp": 0.01265113, + "balance_loss_clip": 0.06276565, + "balance_loss_mlp": 0.01256017, + "epoch": 0.633278220351721, + "flos": 22712904737280.0, + "grad_norm": 1.6509114242634397, + "language_loss": 0.75345361, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.83022094, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09094238, + "step": 10533, + "time_per_iteration": 2.5486817359924316 + }, + { + "auxiliary_loss_clip": 0.06421439, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.0627851, + "balance_loss_mlp": 0.01255586, + "epoch": 0.633338343604389, + "flos": 12607594807680.0, + "grad_norm": 2.155852114283844, + "language_loss": 0.7738024, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.850685, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11224365, + "step": 10534, + "time_per_iteration": 2.447556257247925 + }, + { + "auxiliary_loss_clip": 0.06414493, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06277296, + "balance_loss_mlp": 0.01255314, + "epoch": 0.6333984668570569, + "flos": 25967586153600.0, + "grad_norm": 8.614230799549778, + "language_loss": 0.85787749, + "learning_rate": 1.251621437204777e-06, + "loss": 0.93467951, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10400391, + "step": 10535, + "time_per_iteration": 2.564028739929199 + }, + { + "auxiliary_loss_clip": 0.06413931, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06276763, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6334585901097249, + "flos": 23665953876480.0, + "grad_norm": 1.7881941276129079, + "language_loss": 0.76803362, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.84483141, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10235596, + "step": 10536, + "time_per_iteration": 4.017718315124512 + }, + { + "auxiliary_loss_clip": 0.06411674, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01254848, + "epoch": 0.633518713362393, + "flos": 28766930883840.0, + "grad_norm": 1.5924161290871786, + "language_loss": 0.6050871, + "learning_rate": 1.250899157568855e-06, + "loss": 0.68185055, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09820557, + "step": 10537, + "time_per_iteration": 2.575690746307373 + }, + { + "auxiliary_loss_clip": 0.0632174, + "auxiliary_loss_mlp": 0.01257375, + "balance_loss_clip": 0.06265318, + "balance_loss_mlp": 0.01256043, + "epoch": 0.6335788366150609, + "flos": 70438669407360.0, + "grad_norm": 0.7645314683588974, + "language_loss": 0.5222913, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.59808248, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.56298828, + "router_z_loss_mlp": 0.01334381, + "step": 10538, + "time_per_iteration": 3.254763126373291 + }, + { + "auxiliary_loss_clip": 0.06417011, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06275439, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6336389598677289, + "flos": 23738768674560.0, + "grad_norm": 1.8043673999860153, + "language_loss": 0.83927584, + "learning_rate": 1.250176991556848e-06, + "loss": 0.91612864, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10888672, + "step": 10539, + "time_per_iteration": 2.533168315887451 + }, + { + "auxiliary_loss_clip": 0.06413823, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06273531, + "balance_loss_mlp": 0.01254526, + "epoch": 0.6336990831203968, + "flos": 29284097483520.0, + "grad_norm": 1.5633861305622094, + "language_loss": 0.87373441, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.95052767, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10980225, + "step": 10540, + "time_per_iteration": 2.5700464248657227 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01263695, + "balance_loss_clip": 0.06275897, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6337592063730648, + "flos": 29104659964800.0, + "grad_norm": 1.757260374288504, + "language_loss": 0.7308234, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.80756426, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.090271, + "step": 10541, + "time_per_iteration": 2.5605950355529785 + }, + { + "auxiliary_loss_clip": 0.06419826, + "auxiliary_loss_mlp": 0.01267808, + "balance_loss_clip": 0.06276362, + "balance_loss_mlp": 0.01255934, + "epoch": 0.6338193296257327, + "flos": 34713705404160.0, + "grad_norm": 3.0522247844622217, + "language_loss": 0.85394645, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.93082273, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11865234, + "step": 10542, + "time_per_iteration": 2.711641788482666 + }, + { + "auxiliary_loss_clip": 0.06413235, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01253477, + "epoch": 0.6338794528784008, + "flos": 16692644586240.0, + "grad_norm": 1.6414110705076674, + "language_loss": 0.77927899, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.85605824, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11224365, + "step": 10543, + "time_per_iteration": 2.4868364334106445 + }, + { + "auxiliary_loss_clip": 0.06406207, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01254049, + "epoch": 0.6339395761310687, + "flos": 22353233086080.0, + "grad_norm": 1.4561914884468037, + "language_loss": 0.73388422, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.81058121, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09442139, + "step": 10544, + "time_per_iteration": 2.5364322662353516 + }, + { + "auxiliary_loss_clip": 0.06420652, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06277749, + "balance_loss_mlp": 0.0125501, + "epoch": 0.6339996993837367, + "flos": 18557765418240.0, + "grad_norm": 2.1124884217915953, + "language_loss": 0.68196738, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.7588315, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10736084, + "step": 10545, + "time_per_iteration": 2.498805284500122 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06273287, + "balance_loss_mlp": 0.01254217, + "epoch": 0.6340598226364046, + "flos": 12974519836800.0, + "grad_norm": 1.9119054748089928, + "language_loss": 0.71463943, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.79135519, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09710693, + "step": 10546, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.012635, + "balance_loss_clip": 0.06273207, + "balance_loss_mlp": 0.01254214, + "epoch": 0.6341199458890726, + "flos": 26695272936960.0, + "grad_norm": 1.3275160208019028, + "language_loss": 0.78403944, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.86071861, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09283447, + "step": 10547, + "time_per_iteration": 2.565394639968872 + }, + { + "auxiliary_loss_clip": 0.06415725, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.0627535, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6341800691417405, + "flos": 18740263610880.0, + "grad_norm": 1.5896144863347355, + "language_loss": 0.63801014, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.71483326, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10131836, + "step": 10548, + "time_per_iteration": 2.531881332397461 + }, + { + "auxiliary_loss_clip": 0.06408647, + "auxiliary_loss_mlp": 0.01263438, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01253746, + "epoch": 0.6342401923944085, + "flos": 26256539358720.0, + "grad_norm": 1.5473137822842997, + "language_loss": 0.61999178, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.69671261, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 10549, + "time_per_iteration": 2.590090274810791 + }, + { + "auxiliary_loss_clip": 0.06413013, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06276116, + "balance_loss_mlp": 0.01256047, + "epoch": 0.6343003156470765, + "flos": 24687834744960.0, + "grad_norm": 1.5414529536537591, + "language_loss": 0.74040842, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.81719744, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09844971, + "step": 10550, + "time_per_iteration": 2.575768232345581 + }, + { + "auxiliary_loss_clip": 0.06314379, + "auxiliary_loss_mlp": 0.01254446, + "balance_loss_clip": 0.06258175, + "balance_loss_mlp": 0.01252981, + "epoch": 0.6343604388997445, + "flos": 69824481379200.0, + "grad_norm": 0.6831342981577847, + "language_loss": 0.57712334, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.65281159, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01463318, + "step": 10551, + "time_per_iteration": 3.169085741043091 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06273001, + "balance_loss_mlp": 0.01255257, + "epoch": 0.6344205621524125, + "flos": 21989117168640.0, + "grad_norm": 1.9821146557890166, + "language_loss": 0.67052966, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.74725866, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09454346, + "step": 10552, + "time_per_iteration": 2.51409649848938 + }, + { + "auxiliary_loss_clip": 0.06415403, + "auxiliary_loss_mlp": 0.01263367, + "balance_loss_clip": 0.0627457, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6344806854050804, + "flos": 20455100945280.0, + "grad_norm": 1.6854116098373486, + "language_loss": 0.82256383, + "learning_rate": 1.24512502014147e-06, + "loss": 0.89935154, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 10553, + "time_per_iteration": 2.5263893604278564 + }, + { + "auxiliary_loss_clip": 0.06412624, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.01256021, + "epoch": 0.6345408086577484, + "flos": 40519294594560.0, + "grad_norm": 1.7209630881675668, + "language_loss": 0.55282557, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.629614, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10205078, + "step": 10554, + "time_per_iteration": 2.6742208003997803 + }, + { + "auxiliary_loss_clip": 0.06412828, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06274945, + "balance_loss_mlp": 0.01255701, + "epoch": 0.6346009319104163, + "flos": 21367759616640.0, + "grad_norm": 1.6547697162667994, + "language_loss": 0.7092278, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.78601682, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10369873, + "step": 10555, + "time_per_iteration": 2.5059010982513428 + }, + { + "auxiliary_loss_clip": 0.06308, + "auxiliary_loss_mlp": 0.01256771, + "balance_loss_clip": 0.06252061, + "balance_loss_mlp": 0.01255482, + "epoch": 0.6346610551630844, + "flos": 71383333138560.0, + "grad_norm": 0.7594485734837986, + "language_loss": 0.5526008, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.62824851, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01290131, + "step": 10556, + "time_per_iteration": 4.480233669281006 + }, + { + "auxiliary_loss_clip": 0.0641848, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06276923, + "balance_loss_mlp": 0.01257227, + "epoch": 0.6347211784157523, + "flos": 25418666056320.0, + "grad_norm": 1.720664259353744, + "language_loss": 0.68248415, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.75934947, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10827637, + "step": 10557, + "time_per_iteration": 2.5347533226013184 + }, + { + "auxiliary_loss_clip": 0.06415346, + "auxiliary_loss_mlp": 0.01266286, + "balance_loss_clip": 0.06277986, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6347813016684203, + "flos": 15748274344320.0, + "grad_norm": 1.7185775847351308, + "language_loss": 0.7034533, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.78026962, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10211182, + "step": 10558, + "time_per_iteration": 2.5184271335601807 + }, + { + "auxiliary_loss_clip": 0.06415297, + "auxiliary_loss_mlp": 0.01267927, + "balance_loss_clip": 0.06277342, + "balance_loss_mlp": 0.01257812, + "epoch": 0.6348414249210882, + "flos": 21470231560320.0, + "grad_norm": 1.5690247234550625, + "language_loss": 0.78373873, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.86057091, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10113525, + "step": 10559, + "time_per_iteration": 2.5017571449279785 + }, + { + "auxiliary_loss_clip": 0.06413186, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.01257404, + "epoch": 0.6349015481737562, + "flos": 21659521933440.0, + "grad_norm": 1.6584174732731671, + "language_loss": 0.68334514, + "learning_rate": 1.242601136020078e-06, + "loss": 0.76015741, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10638428, + "step": 10560, + "time_per_iteration": 2.536973237991333 + }, + { + "auxiliary_loss_clip": 0.06413247, + "auxiliary_loss_mlp": 0.01267835, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01257679, + "epoch": 0.6349616714264241, + "flos": 22200643601280.0, + "grad_norm": 1.5868389258687317, + "language_loss": 0.77125943, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.84807026, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10150146, + "step": 10561, + "time_per_iteration": 2.5515172481536865 + }, + { + "auxiliary_loss_clip": 0.06412898, + "auxiliary_loss_mlp": 0.01265705, + "balance_loss_clip": 0.06273612, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6350217946790921, + "flos": 25417324391040.0, + "grad_norm": 1.8175837603303404, + "language_loss": 0.72219515, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.79898125, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11279297, + "step": 10562, + "time_per_iteration": 2.563812255859375 + }, + { + "auxiliary_loss_clip": 0.06418765, + "auxiliary_loss_mlp": 0.0126928, + "balance_loss_clip": 0.0627933, + "balance_loss_mlp": 0.01258808, + "epoch": 0.63508191793176, + "flos": 19725024320640.0, + "grad_norm": 1.9663518722420297, + "language_loss": 0.81324869, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.89012909, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10473633, + "step": 10563, + "time_per_iteration": 2.618112087249756 + }, + { + "auxiliary_loss_clip": 0.06424198, + "auxiliary_loss_mlp": 0.01272987, + "balance_loss_clip": 0.06281862, + "balance_loss_mlp": 0.01262092, + "epoch": 0.6351420411844281, + "flos": 18192810960000.0, + "grad_norm": 2.213984919304992, + "language_loss": 0.81394589, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.89091778, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10913086, + "step": 10564, + "time_per_iteration": 2.54693341255188 + }, + { + "auxiliary_loss_clip": 0.06417058, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06278841, + "balance_loss_mlp": 0.01256272, + "epoch": 0.6352021644370961, + "flos": 33734437136640.0, + "grad_norm": 2.2491852390349614, + "language_loss": 0.73082668, + "learning_rate": 1.240799222993407e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09796143, + "step": 10565, + "time_per_iteration": 2.6810452938079834 + }, + { + "auxiliary_loss_clip": 0.06416303, + "auxiliary_loss_mlp": 0.01267579, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01256093, + "epoch": 0.635262287689764, + "flos": 20380818700800.0, + "grad_norm": 2.01281164224499, + "language_loss": 0.68792611, + "learning_rate": 1.240438926700324e-06, + "loss": 0.7647649, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.1149292, + "step": 10566, + "time_per_iteration": 2.5485215187072754 + }, + { + "auxiliary_loss_clip": 0.06410012, + "auxiliary_loss_mlp": 0.01265054, + "balance_loss_clip": 0.06277308, + "balance_loss_mlp": 0.01255022, + "epoch": 0.635322410942432, + "flos": 27532559260800.0, + "grad_norm": 1.717445195940493, + "language_loss": 0.69661963, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.77337033, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.1003418, + "step": 10567, + "time_per_iteration": 4.017431974411011 + }, + { + "auxiliary_loss_clip": 0.064077, + "auxiliary_loss_mlp": 0.01265057, + "balance_loss_clip": 0.06274484, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6353825341950999, + "flos": 21550048174080.0, + "grad_norm": 1.9561940375454367, + "language_loss": 0.84912741, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.92585498, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09906006, + "step": 10568, + "time_per_iteration": 2.528050422668457 + }, + { + "auxiliary_loss_clip": 0.06414051, + "auxiliary_loss_mlp": 0.012645, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.01254427, + "epoch": 0.635442657447768, + "flos": 31767934464000.0, + "grad_norm": 1.8080598645215213, + "language_loss": 0.84412146, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.92090696, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10070801, + "step": 10569, + "time_per_iteration": 2.6543846130371094 + }, + { + "auxiliary_loss_clip": 0.06409843, + "auxiliary_loss_mlp": 0.01268445, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.0125811, + "epoch": 0.6355027807004359, + "flos": 19835001204480.0, + "grad_norm": 1.4845804125044393, + "language_loss": 0.69596767, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.77275056, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10327148, + "step": 10570, + "time_per_iteration": 3.903024435043335 + }, + { + "auxiliary_loss_clip": 0.06413252, + "auxiliary_loss_mlp": 0.01264199, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01253989, + "epoch": 0.6355629039531039, + "flos": 30380176742400.0, + "grad_norm": 1.6479967140904772, + "language_loss": 0.66236866, + "learning_rate": 1.2386378775476e-06, + "loss": 0.73914319, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10211182, + "step": 10571, + "time_per_iteration": 2.571477174758911 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266469, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256097, + "epoch": 0.6356230272057718, + "flos": 17938001093760.0, + "grad_norm": 1.5990791790465455, + "language_loss": 0.71629465, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.79312837, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10375977, + "step": 10572, + "time_per_iteration": 2.466371774673462 + }, + { + "auxiliary_loss_clip": 0.06409136, + "auxiliary_loss_mlp": 0.0126563, + "balance_loss_clip": 0.06273179, + "balance_loss_mlp": 0.01255623, + "epoch": 0.6356831504584398, + "flos": 25383139125120.0, + "grad_norm": 1.3707006156469355, + "language_loss": 0.81310254, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.88985026, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10015869, + "step": 10573, + "time_per_iteration": 2.5966269969940186 + }, + { + "auxiliary_loss_clip": 0.06417162, + "auxiliary_loss_mlp": 0.01267057, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255899, + "epoch": 0.6357432737111077, + "flos": 46511029630080.0, + "grad_norm": 1.745983210040395, + "language_loss": 0.68758935, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.76443154, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11151123, + "step": 10574, + "time_per_iteration": 2.7297935485839844 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01254513, + "epoch": 0.6358033969637757, + "flos": 17280026507520.0, + "grad_norm": 2.032779061466396, + "language_loss": 0.8712132, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.9479835, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10540771, + "step": 10575, + "time_per_iteration": 2.505861520767212 + }, + { + "auxiliary_loss_clip": 0.06410281, + "auxiliary_loss_mlp": 0.01265614, + "balance_loss_clip": 0.06273504, + "balance_loss_mlp": 0.01255946, + "epoch": 0.6358635202164437, + "flos": 27132832558080.0, + "grad_norm": 1.4971132099643523, + "language_loss": 0.72510445, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.80186343, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09667969, + "step": 10576, + "time_per_iteration": 3.991710901260376 + }, + { + "auxiliary_loss_clip": 0.06415755, + "auxiliary_loss_mlp": 0.01267596, + "balance_loss_clip": 0.06276268, + "balance_loss_mlp": 0.01257368, + "epoch": 0.6359236434691117, + "flos": 27532307698560.0, + "grad_norm": 1.4171583307321047, + "language_loss": 0.6902113, + "learning_rate": 1.236477571455085e-06, + "loss": 0.76704478, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10223389, + "step": 10577, + "time_per_iteration": 2.553823947906494 + }, + { + "auxiliary_loss_clip": 0.06410993, + "auxiliary_loss_mlp": 0.01267287, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01257613, + "epoch": 0.6359837667217797, + "flos": 39357653915520.0, + "grad_norm": 1.7634862953282429, + "language_loss": 0.72702098, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.8038038, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09674072, + "step": 10578, + "time_per_iteration": 2.7065927982330322 + }, + { + "auxiliary_loss_clip": 0.06310344, + "auxiliary_loss_mlp": 0.01250981, + "balance_loss_clip": 0.06254056, + "balance_loss_mlp": 0.0124968, + "epoch": 0.6360438899744476, + "flos": 56430472475520.0, + "grad_norm": 0.7091193353039391, + "language_loss": 0.54502332, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.62063658, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01301575, + "step": 10579, + "time_per_iteration": 3.198455333709717 + }, + { + "auxiliary_loss_clip": 0.06409089, + "auxiliary_loss_mlp": 0.01264424, + "balance_loss_clip": 0.06272582, + "balance_loss_mlp": 0.01254369, + "epoch": 0.6361040132271156, + "flos": 24980100186240.0, + "grad_norm": 1.5151266119166613, + "language_loss": 0.77508366, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.8518188, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1005249, + "step": 10580, + "time_per_iteration": 2.5561928749084473 + }, + { + "auxiliary_loss_clip": 0.06411447, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01259043, + "epoch": 0.6361641364797835, + "flos": 23266059465600.0, + "grad_norm": 1.9638125336396983, + "language_loss": 0.66766918, + "learning_rate": 1.235037946268301e-06, + "loss": 0.74447519, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10113525, + "step": 10581, + "time_per_iteration": 2.5164785385131836 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.06273356, + "balance_loss_mlp": 0.01254683, + "epoch": 0.6362242597324516, + "flos": 26001645638400.0, + "grad_norm": 1.4228320252439628, + "language_loss": 0.6843577, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.76110947, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09783936, + "step": 10582, + "time_per_iteration": 2.6015806198120117 + }, + { + "auxiliary_loss_clip": 0.06416672, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06276425, + "balance_loss_mlp": 0.01254203, + "epoch": 0.6362843829851195, + "flos": 25710428373120.0, + "grad_norm": 2.448331234664856, + "language_loss": 0.84422374, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.92103791, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10552979, + "step": 10583, + "time_per_iteration": 2.5657055377960205 + }, + { + "auxiliary_loss_clip": 0.06411825, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06278308, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6363445062377875, + "flos": 20529341262720.0, + "grad_norm": 1.5773260338409785, + "language_loss": 0.75534987, + "learning_rate": 1.233958531908538e-06, + "loss": 0.83213151, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 10584, + "time_per_iteration": 2.527031421661377 + }, + { + "auxiliary_loss_clip": 0.06414576, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.01256139, + "epoch": 0.6364046294904554, + "flos": 19469879038080.0, + "grad_norm": 1.7122506045265105, + "language_loss": 0.73591262, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.81273478, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11505127, + "step": 10585, + "time_per_iteration": 2.4975733757019043 + }, + { + "auxiliary_loss_clip": 0.06413724, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01254981, + "epoch": 0.6364647527431234, + "flos": 21002176252800.0, + "grad_norm": 1.805788279769041, + "language_loss": 0.83174026, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.9085263, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09906006, + "step": 10586, + "time_per_iteration": 2.531947612762451 + }, + { + "auxiliary_loss_clip": 0.06412107, + "auxiliary_loss_mlp": 0.0126422, + "balance_loss_clip": 0.06275982, + "balance_loss_mlp": 0.0125435, + "epoch": 0.6365248759957913, + "flos": 25777079896320.0, + "grad_norm": 1.5441547949198797, + "language_loss": 0.72916567, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.80592889, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09863281, + "step": 10587, + "time_per_iteration": 2.589169979095459 + }, + { + "auxiliary_loss_clip": 0.06412084, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.062725, + "balance_loss_mlp": 0.01254014, + "epoch": 0.6365849992484593, + "flos": 22462161793920.0, + "grad_norm": 2.0110608871651823, + "language_loss": 0.77360207, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.85036743, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10443115, + "step": 10588, + "time_per_iteration": 2.5107719898223877 + }, + { + "auxiliary_loss_clip": 0.06404337, + "auxiliary_loss_mlp": 0.01266834, + "balance_loss_clip": 0.06271751, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6366451225011273, + "flos": 19031648584320.0, + "grad_norm": 1.403923680448765, + "language_loss": 0.79945314, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.87616491, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10266113, + "step": 10589, + "time_per_iteration": 2.5198166370391846 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01266892, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01256616, + "epoch": 0.6367052457537953, + "flos": 25235413176960.0, + "grad_norm": 1.9669131634706534, + "language_loss": 0.67181933, + "learning_rate": 1.231800487863257e-06, + "loss": 0.74856544, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1027832, + "step": 10590, + "time_per_iteration": 2.5376667976379395 + }, + { + "auxiliary_loss_clip": 0.0642258, + "auxiliary_loss_mlp": 0.01266478, + "balance_loss_clip": 0.06278451, + "balance_loss_mlp": 0.01254945, + "epoch": 0.6367653690064633, + "flos": 19214482193280.0, + "grad_norm": 1.635127472973657, + "language_loss": 0.7910291, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.86791968, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11523438, + "step": 10591, + "time_per_iteration": 2.542515993118286 + }, + { + "auxiliary_loss_clip": 0.06405823, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06271368, + "balance_loss_mlp": 0.0125564, + "epoch": 0.6368254922591312, + "flos": 23553000172800.0, + "grad_norm": 1.3721943309197018, + "language_loss": 0.89071333, + "learning_rate": 1.231081372744317e-06, + "loss": 0.96742344, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09552002, + "step": 10592, + "time_per_iteration": 2.51094126701355 + }, + { + "auxiliary_loss_clip": 0.06405515, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06272536, + "balance_loss_mlp": 0.01256906, + "epoch": 0.6368856155117992, + "flos": 26474270993280.0, + "grad_norm": 1.3189503052137, + "language_loss": 0.68928409, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.76600361, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09533691, + "step": 10593, + "time_per_iteration": 2.5533511638641357 + }, + { + "auxiliary_loss_clip": 0.06408849, + "auxiliary_loss_mlp": 0.01266265, + "balance_loss_clip": 0.06273521, + "balance_loss_mlp": 0.01256329, + "epoch": 0.6369457387644671, + "flos": 33700754995200.0, + "grad_norm": 1.6851555086975611, + "language_loss": 0.6369772, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.71372831, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09942627, + "step": 10594, + "time_per_iteration": 2.6149699687957764 + }, + { + "auxiliary_loss_clip": 0.06308158, + "auxiliary_loss_mlp": 0.01250909, + "balance_loss_clip": 0.06252004, + "balance_loss_mlp": 0.01249539, + "epoch": 0.6370058620171352, + "flos": 70929365316480.0, + "grad_norm": 0.7572264790485472, + "language_loss": 0.54663223, + "learning_rate": 1.230002918781022e-06, + "loss": 0.6222229, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01372528, + "step": 10595, + "time_per_iteration": 4.630947589874268 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275225, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6370659852698031, + "flos": 21148267046400.0, + "grad_norm": 1.6750235845380184, + "language_loss": 0.66897941, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10858154, + "step": 10596, + "time_per_iteration": 2.550053834915161 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.01266417, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01256022, + "epoch": 0.6371261085224711, + "flos": 20199452538240.0, + "grad_norm": 4.2038058583126405, + "language_loss": 0.79555941, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.87233055, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10400391, + "step": 10597, + "time_per_iteration": 2.5332624912261963 + }, + { + "auxiliary_loss_clip": 0.06414443, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06275137, + "balance_loss_mlp": 0.01255446, + "epoch": 0.637186231775139, + "flos": 19689790878720.0, + "grad_norm": 1.6206633129115742, + "language_loss": 0.7509104, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.82771772, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10852051, + "step": 10598, + "time_per_iteration": 2.5732879638671875 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01263084, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253381, + "epoch": 0.637246355027807, + "flos": 13074937355520.0, + "grad_norm": 1.7290939316313776, + "language_loss": 0.68839526, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.76514107, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.0970459, + "step": 10599, + "time_per_iteration": 2.476140260696411 + }, + { + "auxiliary_loss_clip": 0.06417891, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.01254664, + "epoch": 0.6373064782804749, + "flos": 18228421745280.0, + "grad_norm": 1.9832548083292807, + "language_loss": 0.80652881, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.88337129, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11700439, + "step": 10600, + "time_per_iteration": 2.496344804763794 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06272307, + "balance_loss_mlp": 0.01257626, + "epoch": 0.637366601533143, + "flos": 24505336552320.0, + "grad_norm": 1.383513371134078, + "language_loss": 0.79706007, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.8738054, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09863281, + "step": 10601, + "time_per_iteration": 2.533555269241333 + }, + { + "auxiliary_loss_clip": 0.06416898, + "auxiliary_loss_mlp": 0.01263888, + "balance_loss_clip": 0.06276521, + "balance_loss_mlp": 0.01253332, + "epoch": 0.6374267247858109, + "flos": 26366180826240.0, + "grad_norm": 2.20794570441013, + "language_loss": 0.67092741, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.74773526, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10559082, + "step": 10602, + "time_per_iteration": 2.5890238285064697 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01266008, + "balance_loss_clip": 0.06275181, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6374868480384789, + "flos": 20377254902400.0, + "grad_norm": 1.5742012675871089, + "language_loss": 0.79736137, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.87416112, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10223389, + "step": 10603, + "time_per_iteration": 2.4978857040405273 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01255058, + "epoch": 0.6375469712911469, + "flos": 21002595523200.0, + "grad_norm": 2.075723287568445, + "language_loss": 0.76759392, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.84436482, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10626221, + "step": 10604, + "time_per_iteration": 2.5228052139282227 + }, + { + "auxiliary_loss_clip": 0.0641887, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.062751, + "balance_loss_mlp": 0.01253551, + "epoch": 0.6376070945438148, + "flos": 19721292814080.0, + "grad_norm": 2.969254888536146, + "language_loss": 0.77310598, + "learning_rate": 1.226409972197281e-06, + "loss": 0.84994626, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11608887, + "step": 10605, + "time_per_iteration": 2.4766769409179688 + }, + { + "auxiliary_loss_clip": 0.06417184, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.01254087, + "epoch": 0.6376672177964828, + "flos": 21513137650560.0, + "grad_norm": 1.8415567136743551, + "language_loss": 0.66146404, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.73829126, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11437988, + "step": 10606, + "time_per_iteration": 3.962454080581665 + }, + { + "auxiliary_loss_clip": 0.06409881, + "auxiliary_loss_mlp": 0.01267672, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01257891, + "epoch": 0.6377273410491507, + "flos": 18849905078400.0, + "grad_norm": 1.5392078588294233, + "language_loss": 0.75399411, + "learning_rate": 1.225691734459971e-06, + "loss": 0.8307696, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09777832, + "step": 10607, + "time_per_iteration": 2.481400489807129 + }, + { + "auxiliary_loss_clip": 0.06417431, + "auxiliary_loss_mlp": 0.01270028, + "balance_loss_clip": 0.06278283, + "balance_loss_mlp": 0.01259514, + "epoch": 0.6377874643018188, + "flos": 53073962749440.0, + "grad_norm": 1.6290224643321956, + "language_loss": 0.655065, + "learning_rate": 1.225332659627278e-06, + "loss": 0.73193955, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.1050415, + "step": 10608, + "time_per_iteration": 2.80210018157959 + }, + { + "auxiliary_loss_clip": 0.06314453, + "auxiliary_loss_mlp": 0.01252573, + "balance_loss_clip": 0.0625798, + "balance_loss_mlp": 0.01251221, + "epoch": 0.6378475875544867, + "flos": 65153349417600.0, + "grad_norm": 0.7210390428690479, + "language_loss": 0.5201869, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.59585714, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01354218, + "step": 10609, + "time_per_iteration": 4.542863368988037 + }, + { + "auxiliary_loss_clip": 0.06406713, + "auxiliary_loss_mlp": 0.01266217, + "balance_loss_clip": 0.06272352, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6379077108071547, + "flos": 23009404809600.0, + "grad_norm": 1.4796346735577246, + "language_loss": 0.74981046, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.82653975, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09100342, + "step": 10610, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.06314634, + "auxiliary_loss_mlp": 0.01251771, + "balance_loss_clip": 0.06257996, + "balance_loss_mlp": 0.01250523, + "epoch": 0.6379678340598226, + "flos": 67624425849600.0, + "grad_norm": 0.8350558513372389, + "language_loss": 0.62598002, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.70164406, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01247406, + "step": 10611, + "time_per_iteration": 3.208292245864868 + }, + { + "auxiliary_loss_clip": 0.06416688, + "auxiliary_loss_mlp": 0.01263819, + "balance_loss_clip": 0.06276392, + "balance_loss_mlp": 0.01253513, + "epoch": 0.6380279573124906, + "flos": 29687891109120.0, + "grad_norm": 2.188557109067727, + "language_loss": 0.72870415, + "learning_rate": 1.223896654187282e-06, + "loss": 0.80550921, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10308838, + "step": 10612, + "time_per_iteration": 2.5807394981384277 + }, + { + "auxiliary_loss_clip": 0.06312064, + "auxiliary_loss_mlp": 0.01253142, + "balance_loss_clip": 0.06255382, + "balance_loss_mlp": 0.01251885, + "epoch": 0.6380880805651585, + "flos": 66502435680000.0, + "grad_norm": 0.7266099968525627, + "language_loss": 0.57775903, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.65341103, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.56982422, + "router_z_loss_mlp": 0.01256561, + "step": 10613, + "time_per_iteration": 3.0924766063690186 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01264432, + "balance_loss_clip": 0.06280632, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6381482038178266, + "flos": 23921811918720.0, + "grad_norm": 1.7742162127346608, + "language_loss": 0.75586814, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.832735, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10949707, + "step": 10614, + "time_per_iteration": 2.5669398307800293 + }, + { + "auxiliary_loss_clip": 0.06413062, + "auxiliary_loss_mlp": 0.01263583, + "balance_loss_clip": 0.0627507, + "balance_loss_mlp": 0.01253277, + "epoch": 0.6382083270704945, + "flos": 24249855853440.0, + "grad_norm": 1.866062102155962, + "language_loss": 0.79879516, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.87556159, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10314941, + "step": 10615, + "time_per_iteration": 3.9333317279815674 + }, + { + "auxiliary_loss_clip": 0.06313558, + "auxiliary_loss_mlp": 0.01251207, + "balance_loss_clip": 0.0625703, + "balance_loss_mlp": 0.01249947, + "epoch": 0.6382684503231625, + "flos": 70798452111360.0, + "grad_norm": 0.6364915071256667, + "language_loss": 0.55039352, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.62604117, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01260376, + "step": 10616, + "time_per_iteration": 3.2114999294281006 + }, + { + "auxiliary_loss_clip": 0.06411368, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272632, + "balance_loss_mlp": 0.01252654, + "epoch": 0.6383285735758305, + "flos": 16550411080320.0, + "grad_norm": 1.6623229086008653, + "language_loss": 0.84516096, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.92190546, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10430908, + "step": 10617, + "time_per_iteration": 2.50490665435791 + }, + { + "auxiliary_loss_clip": 0.06414464, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.01255037, + "epoch": 0.6383886968284984, + "flos": 14432702515200.0, + "grad_norm": 1.7049012321551236, + "language_loss": 0.86996436, + "learning_rate": 1.221743529196936e-06, + "loss": 0.94676924, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10998535, + "step": 10618, + "time_per_iteration": 2.4782254695892334 + }, + { + "auxiliary_loss_clip": 0.06414133, + "auxiliary_loss_mlp": 0.01263472, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01253536, + "epoch": 0.6384488200811664, + "flos": 17935191982080.0, + "grad_norm": 1.660467856665914, + "language_loss": 0.73454595, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.81132197, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.09936523, + "step": 10619, + "time_per_iteration": 2.5073039531707764 + }, + { + "auxiliary_loss_clip": 0.06421836, + "auxiliary_loss_mlp": 0.01269484, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.01258475, + "epoch": 0.6385089433338343, + "flos": 18521567654400.0, + "grad_norm": 1.8426309945064288, + "language_loss": 0.7661649, + "learning_rate": 1.221026056814193e-06, + "loss": 0.84307802, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11010742, + "step": 10620, + "time_per_iteration": 2.5297937393188477 + }, + { + "auxiliary_loss_clip": 0.06419566, + "auxiliary_loss_mlp": 0.01267834, + "balance_loss_clip": 0.0628044, + "balance_loss_mlp": 0.01256963, + "epoch": 0.6385690665865024, + "flos": 24760481834880.0, + "grad_norm": 2.368652650522925, + "language_loss": 0.70688897, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.78376299, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 10621, + "time_per_iteration": 2.5605804920196533 + }, + { + "auxiliary_loss_clip": 0.0640726, + "auxiliary_loss_mlp": 0.01264019, + "balance_loss_clip": 0.06274956, + "balance_loss_mlp": 0.01254887, + "epoch": 0.6386291898391703, + "flos": 20126763521280.0, + "grad_norm": 1.5541804815340177, + "language_loss": 0.77669823, + "learning_rate": 1.220308702586529e-06, + "loss": 0.85341108, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09130859, + "step": 10622, + "time_per_iteration": 2.495631217956543 + }, + { + "auxiliary_loss_clip": 0.06408195, + "auxiliary_loss_mlp": 0.0126391, + "balance_loss_clip": 0.06273771, + "balance_loss_mlp": 0.01253903, + "epoch": 0.6386893130918383, + "flos": 16871914396800.0, + "grad_norm": 1.737894673487703, + "language_loss": 0.74773431, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.82445532, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10015869, + "step": 10623, + "time_per_iteration": 2.5214576721191406 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01264001, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254512, + "epoch": 0.6387494363445062, + "flos": 22972913556480.0, + "grad_norm": 1.3339080512049293, + "language_loss": 0.77151477, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.84824026, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09490967, + "step": 10624, + "time_per_iteration": 2.5108532905578613 + }, + { + "auxiliary_loss_clip": 0.064144, + "auxiliary_loss_mlp": 0.01262692, + "balance_loss_clip": 0.06276258, + "balance_loss_mlp": 0.01252637, + "epoch": 0.6388095595971742, + "flos": 22864487973120.0, + "grad_norm": 1.5899649446688702, + "language_loss": 0.80630493, + "learning_rate": 1.21923289302382e-06, + "loss": 0.88307583, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 10625, + "time_per_iteration": 2.5426197052001953 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.0126597, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01254842, + "epoch": 0.6388696828498421, + "flos": 17317314374400.0, + "grad_norm": 1.7136519687434957, + "language_loss": 0.72979832, + "learning_rate": 1.218874349031654e-06, + "loss": 0.80662179, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11126709, + "step": 10626, + "time_per_iteration": 2.494306802749634 + }, + { + "auxiliary_loss_clip": 0.06408393, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06270021, + "balance_loss_mlp": 0.01255015, + "epoch": 0.6389298061025102, + "flos": 17134313057280.0, + "grad_norm": 1.513972649351316, + "language_loss": 0.73141295, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.80815566, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10870361, + "step": 10627, + "time_per_iteration": 2.5244781970977783 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01255391, + "epoch": 0.6389899293551781, + "flos": 27718663178880.0, + "grad_norm": 1.6703880840860492, + "language_loss": 0.66923428, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.74610573, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11773682, + "step": 10628, + "time_per_iteration": 2.575000762939453 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.0627692, + "balance_loss_mlp": 0.01254197, + "epoch": 0.6390500526078461, + "flos": 21222171947520.0, + "grad_norm": 1.956585229435901, + "language_loss": 0.68194425, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.7586931, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10150146, + "step": 10629, + "time_per_iteration": 2.5807948112487793 + }, + { + "auxiliary_loss_clip": 0.06422858, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01258708, + "epoch": 0.6391101758605141, + "flos": 21587671457280.0, + "grad_norm": 1.5207801965767835, + "language_loss": 0.75444686, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.83139372, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.13116455, + "step": 10630, + "time_per_iteration": 2.5017268657684326 + }, + { + "auxiliary_loss_clip": 0.06408527, + "auxiliary_loss_mlp": 0.01264942, + "balance_loss_clip": 0.06272866, + "balance_loss_mlp": 0.01255, + "epoch": 0.639170299113182, + "flos": 19906432410240.0, + "grad_norm": 1.6356950234102068, + "language_loss": 0.70487773, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.78161246, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09942627, + "step": 10631, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06314358, + "auxiliary_loss_mlp": 0.01251531, + "balance_loss_clip": 0.06258033, + "balance_loss_mlp": 0.01250199, + "epoch": 0.63923042236585, + "flos": 69896625344640.0, + "grad_norm": 0.7602289508759135, + "language_loss": 0.62733555, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.70299447, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01333618, + "step": 10632, + "time_per_iteration": 3.190108060836792 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01266129, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01255948, + "epoch": 0.639290545618518, + "flos": 22681486656000.0, + "grad_norm": 2.160270989856127, + "language_loss": 0.66821963, + "learning_rate": 1.216365371217893e-06, + "loss": 0.74498516, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10174561, + "step": 10633, + "time_per_iteration": 2.552823543548584 + }, + { + "auxiliary_loss_clip": 0.06411168, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01256472, + "epoch": 0.639350668871186, + "flos": 19835420474880.0, + "grad_norm": 2.0078331211958638, + "language_loss": 0.82085246, + "learning_rate": 1.216007064569225e-06, + "loss": 0.89763421, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10540771, + "step": 10634, + "time_per_iteration": 3.9264204502105713 + }, + { + "auxiliary_loss_clip": 0.06411835, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06274228, + "balance_loss_mlp": 0.01258585, + "epoch": 0.6394107921238539, + "flos": 20558746846080.0, + "grad_norm": 1.4689992647467067, + "language_loss": 0.75053954, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.82735342, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10968018, + "step": 10635, + "time_per_iteration": 2.4891774654388428 + }, + { + "auxiliary_loss_clip": 0.06409803, + "auxiliary_loss_mlp": 0.01264504, + "balance_loss_clip": 0.06272061, + "balance_loss_mlp": 0.01254878, + "epoch": 0.6394709153765219, + "flos": 25781985360000.0, + "grad_norm": 1.6046642220248264, + "language_loss": 0.71619642, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09637451, + "step": 10636, + "time_per_iteration": 2.5812439918518066 + }, + { + "auxiliary_loss_clip": 0.06415339, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01255352, + "epoch": 0.6395310386291898, + "flos": 17535926476800.0, + "grad_norm": 2.1920700627694867, + "language_loss": 0.73530567, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.81212032, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10772705, + "step": 10637, + "time_per_iteration": 2.485643148422241 + }, + { + "auxiliary_loss_clip": 0.0641741, + "auxiliary_loss_mlp": 0.01266874, + "balance_loss_clip": 0.06276354, + "balance_loss_mlp": 0.01255871, + "epoch": 0.6395911618818578, + "flos": 18594172817280.0, + "grad_norm": 1.7577292466251317, + "language_loss": 0.78289723, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.85974002, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 10638, + "time_per_iteration": 2.482006549835205 + }, + { + "auxiliary_loss_clip": 0.06409052, + "auxiliary_loss_mlp": 0.01264378, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01253655, + "epoch": 0.6396512851345257, + "flos": 28374164069760.0, + "grad_norm": 1.4288466998721474, + "language_loss": 0.815153, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.89188731, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10717773, + "step": 10639, + "time_per_iteration": 2.553853750228882 + }, + { + "auxiliary_loss_clip": 0.06314266, + "auxiliary_loss_mlp": 0.01251751, + "balance_loss_clip": 0.06258021, + "balance_loss_mlp": 0.01250554, + "epoch": 0.6397114083871938, + "flos": 70744728844800.0, + "grad_norm": 0.7996184433796636, + "language_loss": 0.59009802, + "learning_rate": 1.21385784946359e-06, + "loss": 0.66575813, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01195526, + "step": 10640, + "time_per_iteration": 3.0804762840270996 + }, + { + "auxiliary_loss_clip": 0.0640569, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6397715316398617, + "flos": 18147095758080.0, + "grad_norm": 1.6659836554468106, + "language_loss": 0.78961474, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.8663274, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09619141, + "step": 10641, + "time_per_iteration": 2.470735788345337 + }, + { + "auxiliary_loss_clip": 0.06423657, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278598, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6398316548925297, + "flos": 25746668064000.0, + "grad_norm": 2.1982581134788672, + "language_loss": 0.63584703, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.712749, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11120605, + "step": 10642, + "time_per_iteration": 2.572493314743042 + }, + { + "auxiliary_loss_clip": 0.06314563, + "auxiliary_loss_mlp": 0.0125166, + "balance_loss_clip": 0.06258431, + "balance_loss_mlp": 0.01250544, + "epoch": 0.6398917781451977, + "flos": 71231246778240.0, + "grad_norm": 0.888550554325656, + "language_loss": 0.55987263, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.63553476, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01118469, + "step": 10643, + "time_per_iteration": 3.0916545391082764 + }, + { + "auxiliary_loss_clip": 0.06416592, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274326, + "balance_loss_mlp": 0.01255252, + "epoch": 0.6399519013978656, + "flos": 20528083451520.0, + "grad_norm": 1.8692423093064807, + "language_loss": 0.772012, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.84884077, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11022949, + "step": 10644, + "time_per_iteration": 2.523844003677368 + }, + { + "auxiliary_loss_clip": 0.06409791, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6400120246505336, + "flos": 24467503633920.0, + "grad_norm": 1.3560803021320431, + "language_loss": 0.82639438, + "learning_rate": 1.212067656542203e-06, + "loss": 0.90314567, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10675049, + "step": 10645, + "time_per_iteration": 2.546128749847412 + }, + { + "auxiliary_loss_clip": 0.06421367, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06277816, + "balance_loss_mlp": 0.01251997, + "epoch": 0.6400721479032015, + "flos": 28373619018240.0, + "grad_norm": 1.814178451427478, + "language_loss": 0.73952079, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.81637239, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11798096, + "step": 10646, + "time_per_iteration": 3.966240167617798 + }, + { + "auxiliary_loss_clip": 0.06412562, + "auxiliary_loss_mlp": 0.01268277, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01257167, + "epoch": 0.6401322711558696, + "flos": 17821441664640.0, + "grad_norm": 1.9335985649403467, + "language_loss": 0.80623794, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.88304639, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.11114502, + "step": 10647, + "time_per_iteration": 2.497234582901001 + }, + { + "auxiliary_loss_clip": 0.06410154, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01255094, + "epoch": 0.6401923944085375, + "flos": 26037969183360.0, + "grad_norm": 1.5109233302980645, + "language_loss": 0.75784671, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.83459949, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10028076, + "step": 10648, + "time_per_iteration": 2.5445501804351807 + }, + { + "auxiliary_loss_clip": 0.06407083, + "auxiliary_loss_mlp": 0.01263508, + "balance_loss_clip": 0.06269361, + "balance_loss_mlp": 0.01253525, + "epoch": 0.6402525176612055, + "flos": 23593181005440.0, + "grad_norm": 1.948589206417596, + "language_loss": 0.79203671, + "learning_rate": 1.210636039936138e-06, + "loss": 0.86874264, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10649, + "time_per_iteration": 3.9821319580078125 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.06272741, + "balance_loss_mlp": 0.01254222, + "epoch": 0.6403126409138734, + "flos": 18047349072000.0, + "grad_norm": 2.12746104130849, + "language_loss": 0.75310314, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.82986802, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10650, + "time_per_iteration": 2.488818883895874 + }, + { + "auxiliary_loss_clip": 0.06408805, + "auxiliary_loss_mlp": 0.01268267, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01256513, + "epoch": 0.6403727641665414, + "flos": 21985679151360.0, + "grad_norm": 1.3966136649863612, + "language_loss": 0.70929539, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.78606611, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11743164, + "step": 10651, + "time_per_iteration": 2.5219950675964355 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06274731, + "balance_loss_mlp": 0.0125824, + "epoch": 0.6404328874192093, + "flos": 24901751018880.0, + "grad_norm": 2.1293665277256624, + "language_loss": 0.64404488, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.72087055, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11273193, + "step": 10652, + "time_per_iteration": 2.5231480598449707 + }, + { + "auxiliary_loss_clip": 0.06411535, + "auxiliary_loss_mlp": 0.01262653, + "balance_loss_clip": 0.06274502, + "balance_loss_mlp": 0.01252509, + "epoch": 0.6404930106718774, + "flos": 17601991021440.0, + "grad_norm": 1.8908665793351147, + "language_loss": 0.79652649, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.87326837, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10150146, + "step": 10653, + "time_per_iteration": 2.5704574584960938 + }, + { + "auxiliary_loss_clip": 0.06425246, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06277368, + "balance_loss_mlp": 0.01259973, + "epoch": 0.6405531339245453, + "flos": 20164219096320.0, + "grad_norm": 2.6567000735134463, + "language_loss": 0.70885104, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.78582001, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11682129, + "step": 10654, + "time_per_iteration": 2.534069061279297 + }, + { + "auxiliary_loss_clip": 0.0641733, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06274031, + "balance_loss_mlp": 0.0125574, + "epoch": 0.6406132571772133, + "flos": 21948349357440.0, + "grad_norm": 1.5377239110005414, + "language_loss": 0.72583055, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.80267668, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11553955, + "step": 10655, + "time_per_iteration": 3.9230480194091797 + }, + { + "auxiliary_loss_clip": 0.06412716, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.06272289, + "balance_loss_mlp": 0.01257074, + "epoch": 0.6406733804298813, + "flos": 28775693635200.0, + "grad_norm": 1.9128350177290707, + "language_loss": 0.82931209, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.90612656, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11657715, + "step": 10656, + "time_per_iteration": 2.601238489151001 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01257817, + "epoch": 0.6407335036825492, + "flos": 17462943970560.0, + "grad_norm": 3.923220638478792, + "language_loss": 0.72232449, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.79911268, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10516357, + "step": 10657, + "time_per_iteration": 2.478569984436035 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.01261766, + "epoch": 0.6407936269352172, + "flos": 22131476455680.0, + "grad_norm": 1.5017144440006371, + "language_loss": 0.77455044, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.85138589, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10272217, + "step": 10658, + "time_per_iteration": 2.6262331008911133 + }, + { + "auxiliary_loss_clip": 0.06414957, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6408537501878852, + "flos": 23117033779200.0, + "grad_norm": 1.5568653096914684, + "language_loss": 0.76262242, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.83944625, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10943604, + "step": 10659, + "time_per_iteration": 2.5234532356262207 + }, + { + "auxiliary_loss_clip": 0.06413037, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01253719, + "epoch": 0.6409138734405532, + "flos": 16478099406720.0, + "grad_norm": 1.5970917751630926, + "language_loss": 0.77884215, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.85561204, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10229492, + "step": 10660, + "time_per_iteration": 2.522568941116333 + }, + { + "auxiliary_loss_clip": 0.0642052, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01253796, + "epoch": 0.6409739966932211, + "flos": 22783539329280.0, + "grad_norm": 1.8503290839739344, + "language_loss": 0.6901319, + "learning_rate": 1.206344067135727e-06, + "loss": 0.7669934, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11834717, + "step": 10661, + "time_per_iteration": 2.5030124187469482 + }, + { + "auxiliary_loss_clip": 0.06407891, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06273415, + "balance_loss_mlp": 0.01259017, + "epoch": 0.6410341199458891, + "flos": 25158489528960.0, + "grad_norm": 1.7100659203746285, + "language_loss": 0.7628997, + "learning_rate": 1.205986598033362e-06, + "loss": 0.83967084, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10205078, + "step": 10662, + "time_per_iteration": 2.5515527725219727 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265377, + "balance_loss_clip": 0.06272576, + "balance_loss_mlp": 0.01255507, + "epoch": 0.641094243198557, + "flos": 27052428965760.0, + "grad_norm": 1.7631594614441006, + "language_loss": 0.69671446, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.77348244, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09863281, + "step": 10663, + "time_per_iteration": 2.5377395153045654 + }, + { + "auxiliary_loss_clip": 0.06414999, + "auxiliary_loss_mlp": 0.01271226, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01258876, + "epoch": 0.641154366451225, + "flos": 25381629751680.0, + "grad_norm": 1.9040182096837255, + "language_loss": 0.68253797, + "learning_rate": 1.205271750169389e-06, + "loss": 0.75940025, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.12353516, + "step": 10664, + "time_per_iteration": 2.5686044692993164 + }, + { + "auxiliary_loss_clip": 0.06408753, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271468, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6412144897038929, + "flos": 25159998902400.0, + "grad_norm": 1.8980640494634613, + "language_loss": 0.66647685, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.74322122, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 10665, + "time_per_iteration": 2.5681324005126953 + }, + { + "auxiliary_loss_clip": 0.06406175, + "auxiliary_loss_mlp": 0.01263975, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01254027, + "epoch": 0.641274612956561, + "flos": 23447509482240.0, + "grad_norm": 1.7797122960809293, + "language_loss": 0.64406478, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.72076625, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 10666, + "time_per_iteration": 2.560159921646118 + }, + { + "auxiliary_loss_clip": 0.06411792, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06272641, + "balance_loss_mlp": 0.01256556, + "epoch": 0.6413347362092289, + "flos": 19433597420160.0, + "grad_norm": 1.633933286881918, + "language_loss": 0.70997214, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.78676403, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10839844, + "step": 10667, + "time_per_iteration": 2.478955030441284 + }, + { + "auxiliary_loss_clip": 0.06424954, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.0627383, + "balance_loss_mlp": 0.01258004, + "epoch": 0.6413948594618969, + "flos": 17201425777920.0, + "grad_norm": 2.6317109326582204, + "language_loss": 0.78275955, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.85971272, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12359619, + "step": 10668, + "time_per_iteration": 2.5198874473571777 + }, + { + "auxiliary_loss_clip": 0.06411108, + "auxiliary_loss_mlp": 0.01270624, + "balance_loss_clip": 0.06274307, + "balance_loss_mlp": 0.01259913, + "epoch": 0.6414549827145648, + "flos": 22275764386560.0, + "grad_norm": 1.497004648642511, + "language_loss": 0.67674375, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.75356108, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10705566, + "step": 10669, + "time_per_iteration": 2.589388132095337 + }, + { + "auxiliary_loss_clip": 0.06420371, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.0627445, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6415151059672328, + "flos": 19645291560960.0, + "grad_norm": 1.6345904804173623, + "language_loss": 0.7890048, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.86586452, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11224365, + "step": 10670, + "time_per_iteration": 2.539581537246704 + }, + { + "auxiliary_loss_clip": 0.06415358, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06272778, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6415752292199008, + "flos": 14871016823040.0, + "grad_norm": 2.295733548922842, + "language_loss": 0.88453639, + "learning_rate": 1.20277073264638e-06, + "loss": 0.96132886, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11138916, + "step": 10671, + "time_per_iteration": 2.477959632873535 + }, + { + "auxiliary_loss_clip": 0.06407315, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01253591, + "epoch": 0.6416353524725688, + "flos": 13740710371200.0, + "grad_norm": 1.4227697494992897, + "language_loss": 0.6938256, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.77053344, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09893799, + "step": 10672, + "time_per_iteration": 2.5083000659942627 + }, + { + "auxiliary_loss_clip": 0.06417342, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.0125343, + "epoch": 0.6416954757252368, + "flos": 24541785878400.0, + "grad_norm": 1.8997700971465656, + "language_loss": 0.74453592, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.82137227, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.128479, + "step": 10673, + "time_per_iteration": 3.9653780460357666 + }, + { + "auxiliary_loss_clip": 0.06409254, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6417555989779047, + "flos": 27717531148800.0, + "grad_norm": 1.5327640795153767, + "language_loss": 0.69868958, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.77546132, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10961914, + "step": 10674, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01264104, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253376, + "epoch": 0.6418157222305727, + "flos": 20562604133760.0, + "grad_norm": 1.803070032007693, + "language_loss": 0.67809439, + "learning_rate": 1.201342244560338e-06, + "loss": 0.75490659, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10736084, + "step": 10675, + "time_per_iteration": 2.508819580078125 + }, + { + "auxiliary_loss_clip": 0.06411684, + "auxiliary_loss_mlp": 0.01266305, + "balance_loss_clip": 0.06274499, + "balance_loss_mlp": 0.01255648, + "epoch": 0.6418758454832406, + "flos": 22608126806400.0, + "grad_norm": 1.6761966103099513, + "language_loss": 0.66968966, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.7464695, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10662842, + "step": 10676, + "time_per_iteration": 2.504427909851074 + }, + { + "auxiliary_loss_clip": 0.06413673, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06272808, + "balance_loss_mlp": 0.01255078, + "epoch": 0.6419359687359086, + "flos": 27381479149440.0, + "grad_norm": 1.8338510977392408, + "language_loss": 0.75681728, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.83362073, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11590576, + "step": 10677, + "time_per_iteration": 2.5891265869140625 + }, + { + "auxiliary_loss_clip": 0.06311014, + "auxiliary_loss_mlp": 0.01250224, + "balance_loss_clip": 0.06254409, + "balance_loss_mlp": 0.01249042, + "epoch": 0.6419960919885765, + "flos": 67270722566400.0, + "grad_norm": 0.7408362116441561, + "language_loss": 0.60777372, + "learning_rate": 1.200271196442818e-06, + "loss": 0.68338609, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01180267, + "step": 10678, + "time_per_iteration": 3.185296058654785 + }, + { + "auxiliary_loss_clip": 0.06408557, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06272914, + "balance_loss_mlp": 0.01255816, + "epoch": 0.6420562152412446, + "flos": 19908067564800.0, + "grad_norm": 2.4133916332472083, + "language_loss": 0.67507815, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.75182372, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10186768, + "step": 10679, + "time_per_iteration": 2.5243141651153564 + }, + { + "auxiliary_loss_clip": 0.06412959, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.0125395, + "epoch": 0.6421163384939125, + "flos": 24797056942080.0, + "grad_norm": 1.7795780158399093, + "language_loss": 0.73073864, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.8075152, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10680, + "time_per_iteration": 2.5331122875213623 + }, + { + "auxiliary_loss_clip": 0.06414793, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01256014, + "epoch": 0.6421764617465805, + "flos": 25599822583680.0, + "grad_norm": 2.391895628783687, + "language_loss": 0.68047994, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.75729114, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10308838, + "step": 10681, + "time_per_iteration": 2.53722882270813 + }, + { + "auxiliary_loss_clip": 0.06410016, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272537, + "balance_loss_mlp": 0.01253318, + "epoch": 0.6422365849992484, + "flos": 14139556606080.0, + "grad_norm": 1.5905545864535235, + "language_loss": 0.74707049, + "learning_rate": 1.198843556910427e-06, + "loss": 0.82380807, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 10682, + "time_per_iteration": 2.472856283187866 + }, + { + "auxiliary_loss_clip": 0.06400837, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06270464, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6422967082519164, + "flos": 22390688661120.0, + "grad_norm": 1.4486797107477571, + "language_loss": 0.79339921, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.87009233, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09985352, + "step": 10683, + "time_per_iteration": 2.5533552169799805 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06272833, + "balance_loss_mlp": 0.01254607, + "epoch": 0.6423568315045844, + "flos": 14653243261440.0, + "grad_norm": 1.9282526307042827, + "language_loss": 0.67605591, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.75284898, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11334229, + "step": 10684, + "time_per_iteration": 2.482949733734131 + }, + { + "auxiliary_loss_clip": 0.06413907, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.01255909, + "epoch": 0.6424169547572524, + "flos": 26841237949440.0, + "grad_norm": 1.917462680158283, + "language_loss": 0.71542668, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.79223859, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1137085, + "step": 10685, + "time_per_iteration": 3.9797728061676025 + }, + { + "auxiliary_loss_clip": 0.06408305, + "auxiliary_loss_mlp": 0.01271537, + "balance_loss_clip": 0.06272995, + "balance_loss_mlp": 0.01260451, + "epoch": 0.6424770780099204, + "flos": 22713449788800.0, + "grad_norm": 1.7465950797369785, + "language_loss": 0.75233316, + "learning_rate": 1.197416403456935e-06, + "loss": 0.8291316, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11083984, + "step": 10686, + "time_per_iteration": 2.5496456623077393 + }, + { + "auxiliary_loss_clip": 0.06415822, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01258501, + "epoch": 0.6425372012625883, + "flos": 28476049034880.0, + "grad_norm": 2.381729998669287, + "language_loss": 0.68881834, + "learning_rate": 1.197059691144867e-06, + "loss": 0.76567948, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11767578, + "step": 10687, + "time_per_iteration": 2.570040464401245 + }, + { + "auxiliary_loss_clip": 0.06416762, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06275085, + "balance_loss_mlp": 0.01254089, + "epoch": 0.6425973245152563, + "flos": 29359469831040.0, + "grad_norm": 1.9635514388954842, + "language_loss": 0.66698802, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.74380684, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11029053, + "step": 10688, + "time_per_iteration": 4.0477213859558105 + }, + { + "auxiliary_loss_clip": 0.06411983, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01255716, + "epoch": 0.6426574477679242, + "flos": 16435109462400.0, + "grad_norm": 1.9153737313813421, + "language_loss": 0.73537695, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.81216139, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10742188, + "step": 10689, + "time_per_iteration": 2.5043931007385254 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01252758, + "epoch": 0.6427175710205922, + "flos": 21842481323520.0, + "grad_norm": 2.0498755252573932, + "language_loss": 0.72094941, + "learning_rate": 1.195989736948226e-06, + "loss": 0.79763424, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10101318, + "step": 10690, + "time_per_iteration": 2.5244081020355225 + }, + { + "auxiliary_loss_clip": 0.06408664, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01256203, + "epoch": 0.6427776942732601, + "flos": 17792623059840.0, + "grad_norm": 2.705995899316003, + "language_loss": 0.78068197, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.85743421, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1036377, + "step": 10691, + "time_per_iteration": 2.530010461807251 + }, + { + "auxiliary_loss_clip": 0.0641586, + "auxiliary_loss_mlp": 0.0126902, + "balance_loss_clip": 0.06274788, + "balance_loss_mlp": 0.0125822, + "epoch": 0.6428378175259282, + "flos": 15091306007040.0, + "grad_norm": 1.6963645960197293, + "language_loss": 0.74278462, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.81963336, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10797119, + "step": 10692, + "time_per_iteration": 2.4988198280334473 + }, + { + "auxiliary_loss_clip": 0.06415784, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6428979407785961, + "flos": 23848535923200.0, + "grad_norm": 1.7731596560048748, + "language_loss": 0.61612236, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.69293106, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10693, + "time_per_iteration": 2.5508644580841064 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06272541, + "balance_loss_mlp": 0.01252258, + "epoch": 0.6429580640312641, + "flos": 32935151439360.0, + "grad_norm": 1.6308651969538634, + "language_loss": 0.59823889, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.67503107, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11669922, + "step": 10694, + "time_per_iteration": 3.998856544494629 + }, + { + "auxiliary_loss_clip": 0.0641511, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06274424, + "balance_loss_mlp": 0.01255645, + "epoch": 0.643018187283932, + "flos": 21074571780480.0, + "grad_norm": 1.333714526566846, + "language_loss": 0.79901004, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.87582707, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10961914, + "step": 10695, + "time_per_iteration": 2.5433716773986816 + }, + { + "auxiliary_loss_clip": 0.0641124, + "auxiliary_loss_mlp": 0.01265686, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01254904, + "epoch": 0.6430783105366, + "flos": 26731973825280.0, + "grad_norm": 1.5735391795945948, + "language_loss": 0.73628104, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.81305027, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10778809, + "step": 10696, + "time_per_iteration": 2.5438404083251953 + }, + { + "auxiliary_loss_clip": 0.06407514, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01255779, + "epoch": 0.643138433789268, + "flos": 23703744867840.0, + "grad_norm": 1.7384218375133755, + "language_loss": 0.75689638, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.83363152, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10211182, + "step": 10697, + "time_per_iteration": 2.538093090057373 + }, + { + "auxiliary_loss_clip": 0.06406935, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01255066, + "epoch": 0.643198557041936, + "flos": 34210416654720.0, + "grad_norm": 1.3977759922631694, + "language_loss": 0.65892148, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.73563969, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09820557, + "step": 10698, + "time_per_iteration": 2.598088026046753 + }, + { + "auxiliary_loss_clip": 0.06311838, + "auxiliary_loss_mlp": 0.01254343, + "balance_loss_clip": 0.06254914, + "balance_loss_mlp": 0.01253054, + "epoch": 0.643258680294604, + "flos": 67646955398400.0, + "grad_norm": 0.7781801094870626, + "language_loss": 0.63529652, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.71095836, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01290131, + "step": 10699, + "time_per_iteration": 3.115173101425171 + }, + { + "auxiliary_loss_clip": 0.06406387, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06272414, + "balance_loss_mlp": 0.01256397, + "epoch": 0.6433188035472719, + "flos": 25192003962240.0, + "grad_norm": 1.4785466380460042, + "language_loss": 0.69763827, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.77436155, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09545898, + "step": 10700, + "time_per_iteration": 2.5910451412200928 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6433789267999399, + "flos": 24980645237760.0, + "grad_norm": 1.528088543997644, + "language_loss": 0.73932713, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.81612635, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10827637, + "step": 10701, + "time_per_iteration": 2.544930934906006 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01266156, + "balance_loss_clip": 0.06274359, + "balance_loss_mlp": 0.012551, + "epoch": 0.6434390500526078, + "flos": 17571704970240.0, + "grad_norm": 2.0241741030403064, + "language_loss": 0.81973577, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.8965745, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1105957, + "step": 10702, + "time_per_iteration": 2.5270791053771973 + }, + { + "auxiliary_loss_clip": 0.06410103, + "auxiliary_loss_mlp": 0.01270083, + "balance_loss_clip": 0.06273524, + "balance_loss_mlp": 0.01259927, + "epoch": 0.6434991733052758, + "flos": 20848790154240.0, + "grad_norm": 1.961461723280124, + "language_loss": 0.74951881, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.82632065, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1015625, + "step": 10703, + "time_per_iteration": 2.490809917449951 + }, + { + "auxiliary_loss_clip": 0.06306668, + "auxiliary_loss_mlp": 0.01249951, + "balance_loss_clip": 0.06250144, + "balance_loss_mlp": 0.01248577, + "epoch": 0.6435592965579437, + "flos": 66114909745920.0, + "grad_norm": 0.6384717488493646, + "language_loss": 0.54610157, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.62166774, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01377106, + "step": 10704, + "time_per_iteration": 3.160659074783325 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06273799, + "balance_loss_mlp": 0.0125994, + "epoch": 0.6436194198106118, + "flos": 23775595344000.0, + "grad_norm": 1.7759265636720112, + "language_loss": 0.77319264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.85001761, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09814453, + "step": 10705, + "time_per_iteration": 2.543015718460083 + }, + { + "auxiliary_loss_clip": 0.06412525, + "auxiliary_loss_mlp": 0.01267692, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6436795430632797, + "flos": 20236572696960.0, + "grad_norm": 1.551816271189714, + "language_loss": 0.79286802, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.86967015, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10748291, + "step": 10706, + "time_per_iteration": 2.571018934249878 + }, + { + "auxiliary_loss_clip": 0.06408278, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06270924, + "balance_loss_mlp": 0.01253807, + "epoch": 0.6437396663159477, + "flos": 20307878121600.0, + "grad_norm": 1.8116162091626624, + "language_loss": 0.80532277, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.8820464, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10284424, + "step": 10707, + "time_per_iteration": 2.49252986907959 + }, + { + "auxiliary_loss_clip": 0.06408471, + "auxiliary_loss_mlp": 0.01266248, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01256151, + "epoch": 0.6437997895686156, + "flos": 23885404519680.0, + "grad_norm": 1.5335483275855415, + "language_loss": 0.85439938, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.93114662, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10101318, + "step": 10708, + "time_per_iteration": 2.554351806640625 + }, + { + "auxiliary_loss_clip": 0.0642588, + "auxiliary_loss_mlp": 0.01267773, + "balance_loss_clip": 0.06278181, + "balance_loss_mlp": 0.0125649, + "epoch": 0.6438599128212836, + "flos": 18995241185280.0, + "grad_norm": 2.1632531373454507, + "language_loss": 0.66272986, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.73966646, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11279297, + "step": 10709, + "time_per_iteration": 2.4882705211639404 + }, + { + "auxiliary_loss_clip": 0.06406571, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06270951, + "balance_loss_mlp": 0.01254793, + "epoch": 0.6439200360739517, + "flos": 24103010373120.0, + "grad_norm": 1.6506823259196688, + "language_loss": 0.80511576, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.88182747, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09802246, + "step": 10710, + "time_per_iteration": 2.56453537940979 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6439801593266196, + "flos": 31909748699520.0, + "grad_norm": 1.6423775297739596, + "language_loss": 0.66664886, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.74339652, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1015625, + "step": 10711, + "time_per_iteration": 2.5858142375946045 + }, + { + "auxiliary_loss_clip": 0.06416127, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06276122, + "balance_loss_mlp": 0.01257155, + "epoch": 0.6440402825792876, + "flos": 27133251828480.0, + "grad_norm": 1.4850866798945335, + "language_loss": 0.78739464, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.86423248, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10498047, + "step": 10712, + "time_per_iteration": 2.5875256061553955 + }, + { + "auxiliary_loss_clip": 0.06415762, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.01258264, + "epoch": 0.6441004058319555, + "flos": 20673964609920.0, + "grad_norm": 4.153275753738836, + "language_loss": 0.82697159, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.90381777, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.105896, + "step": 10713, + "time_per_iteration": 3.9446072578430176 + }, + { + "auxiliary_loss_clip": 0.06405178, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125691, + "epoch": 0.6441605290846235, + "flos": 26032309032960.0, + "grad_norm": 1.3361931407869754, + "language_loss": 0.78574234, + "learning_rate": 1.187440012188684e-06, + "loss": 0.86247128, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10809326, + "step": 10714, + "time_per_iteration": 2.530367612838745 + }, + { + "auxiliary_loss_clip": 0.06407861, + "auxiliary_loss_mlp": 0.01264356, + "balance_loss_clip": 0.0627133, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6442206523372914, + "flos": 24906362993280.0, + "grad_norm": 1.4535353305453917, + "language_loss": 0.81736881, + "learning_rate": 1.187084157517583e-06, + "loss": 0.89409101, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09619141, + "step": 10715, + "time_per_iteration": 2.563981294631958 + }, + { + "auxiliary_loss_clip": 0.06417291, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06276529, + "balance_loss_mlp": 0.01255812, + "epoch": 0.6442807755899594, + "flos": 25163478846720.0, + "grad_norm": 2.5611767206234335, + "language_loss": 0.81585336, + "learning_rate": 1.186728333672332e-06, + "loss": 0.89268947, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10498047, + "step": 10716, + "time_per_iteration": 2.54089617729187 + }, + { + "auxiliary_loss_clip": 0.06414896, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254931, + "epoch": 0.6443408988426274, + "flos": 27351863930880.0, + "grad_norm": 1.9349198900461007, + "language_loss": 0.783328, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8601352, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10894775, + "step": 10717, + "time_per_iteration": 2.726794719696045 + }, + { + "auxiliary_loss_clip": 0.06407352, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.06274462, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6444010220952954, + "flos": 27935807834880.0, + "grad_norm": 1.5112707746860563, + "language_loss": 0.68381333, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.76055682, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10058594, + "step": 10718, + "time_per_iteration": 2.610858201980591 + }, + { + "auxiliary_loss_clip": 0.0630646, + "auxiliary_loss_mlp": 0.01253706, + "balance_loss_clip": 0.06250188, + "balance_loss_mlp": 0.01252236, + "epoch": 0.6444611453479633, + "flos": 71232169173120.0, + "grad_norm": 0.7437918033374209, + "language_loss": 0.49586019, + "learning_rate": 1.185661047226603e-06, + "loss": 0.5714618, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01467896, + "step": 10719, + "time_per_iteration": 3.303040027618408 + }, + { + "auxiliary_loss_clip": 0.06416054, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01253598, + "epoch": 0.6445212686006313, + "flos": 22710766458240.0, + "grad_norm": 1.8616807218185105, + "language_loss": 0.77902591, + "learning_rate": 1.18530534681967e-06, + "loss": 0.8558346, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10720, + "time_per_iteration": 2.4988739490509033 + }, + { + "auxiliary_loss_clip": 0.06409489, + "auxiliary_loss_mlp": 0.01265868, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01255556, + "epoch": 0.6445813918532992, + "flos": 21185219496960.0, + "grad_norm": 1.7169707268636247, + "language_loss": 0.77512503, + "learning_rate": 1.18494967730604e-06, + "loss": 0.85187852, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10314941, + "step": 10721, + "time_per_iteration": 2.5300545692443848 + }, + { + "auxiliary_loss_clip": 0.06412297, + "auxiliary_loss_mlp": 0.01265332, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6446415151059672, + "flos": 25198921923840.0, + "grad_norm": 2.0971313720175253, + "language_loss": 0.72901034, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.80578673, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.1060791, + "step": 10722, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.06411985, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01257149, + "epoch": 0.6447016383586353, + "flos": 25309401932160.0, + "grad_norm": 1.4844277887266815, + "language_loss": 0.78381926, + "learning_rate": 1.184238431012635e-06, + "loss": 0.86060935, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09881592, + "step": 10723, + "time_per_iteration": 2.550785541534424 + }, + { + "auxiliary_loss_clip": 0.06412604, + "auxiliary_loss_mlp": 0.01264685, + "balance_loss_clip": 0.06270273, + "balance_loss_mlp": 0.01253825, + "epoch": 0.6447617616113032, + "flos": 27709523084160.0, + "grad_norm": 1.5774078355025598, + "language_loss": 0.58958089, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.66635382, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10858154, + "step": 10724, + "time_per_iteration": 2.54042387008667 + }, + { + "auxiliary_loss_clip": 0.06404805, + "auxiliary_loss_mlp": 0.012629, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125294, + "epoch": 0.6448218848639712, + "flos": 23045728354560.0, + "grad_norm": 1.8379385823931873, + "language_loss": 0.83613712, + "learning_rate": 1.183527308454271e-06, + "loss": 0.91281414, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 10725, + "time_per_iteration": 3.910567045211792 + }, + { + "auxiliary_loss_clip": 0.06409329, + "auxiliary_loss_mlp": 0.0126531, + "balance_loss_clip": 0.06272514, + "balance_loss_mlp": 0.01255123, + "epoch": 0.6448820081166391, + "flos": 24502569367680.0, + "grad_norm": 1.6966621719955104, + "language_loss": 0.82546258, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.90220898, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10186768, + "step": 10726, + "time_per_iteration": 2.5510244369506836 + }, + { + "auxiliary_loss_clip": 0.06413421, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06271534, + "balance_loss_mlp": 0.01254757, + "epoch": 0.6449421313693071, + "flos": 22425880176000.0, + "grad_norm": 1.8351379370292278, + "language_loss": 0.82230431, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.8990922, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10601807, + "step": 10727, + "time_per_iteration": 4.002009153366089 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01267298, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255908, + "epoch": 0.645002254621975, + "flos": 20231206035840.0, + "grad_norm": 1.8310574877771004, + "language_loss": 0.79621851, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.87308395, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1138916, + "step": 10728, + "time_per_iteration": 2.500166416168213 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253767, + "epoch": 0.645062377874643, + "flos": 27862909182720.0, + "grad_norm": 1.7840301112259453, + "language_loss": 0.7434454, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.82021105, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11578369, + "step": 10729, + "time_per_iteration": 2.5444576740264893 + }, + { + "auxiliary_loss_clip": 0.06416906, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06276138, + "balance_loss_mlp": 0.01258971, + "epoch": 0.645122501127311, + "flos": 25308563391360.0, + "grad_norm": 1.804382369686425, + "language_loss": 0.66694868, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.74381399, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10662842, + "step": 10730, + "time_per_iteration": 2.557570695877075 + }, + { + "auxiliary_loss_clip": 0.06414691, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.0627515, + "balance_loss_mlp": 0.01257823, + "epoch": 0.645182624379979, + "flos": 18813371898240.0, + "grad_norm": 1.7610800842195338, + "language_loss": 0.64359826, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.72043514, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11181641, + "step": 10731, + "time_per_iteration": 2.496885299682617 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01253507, + "epoch": 0.6452427476326469, + "flos": 18337979358720.0, + "grad_norm": 1.6539865973631505, + "language_loss": 0.68541694, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.76214296, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10620117, + "step": 10732, + "time_per_iteration": 2.5379278659820557 + }, + { + "auxiliary_loss_clip": 0.06405264, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01257392, + "epoch": 0.6453028708853149, + "flos": 22791505466880.0, + "grad_norm": 1.6003799317808598, + "language_loss": 0.75854611, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.83527917, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10638428, + "step": 10733, + "time_per_iteration": 2.5387895107269287 + }, + { + "auxiliary_loss_clip": 0.06419903, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6453629941379828, + "flos": 23951888334720.0, + "grad_norm": 1.8221527595961244, + "language_loss": 0.6735214, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.75041103, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1159668, + "step": 10734, + "time_per_iteration": 3.968029260635376 + }, + { + "auxiliary_loss_clip": 0.06404681, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01252694, + "epoch": 0.6454231173906508, + "flos": 17682226905600.0, + "grad_norm": 2.0600495273099377, + "language_loss": 0.7393254, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.81600797, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.10882568, + "step": 10735, + "time_per_iteration": 2.5028645992279053 + }, + { + "auxiliary_loss_clip": 0.06413495, + "auxiliary_loss_mlp": 0.01265876, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01255791, + "epoch": 0.6454832406433189, + "flos": 23299154628480.0, + "grad_norm": 1.713856204545893, + "language_loss": 0.75178444, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.82857811, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10083008, + "step": 10736, + "time_per_iteration": 2.52396821975708 + }, + { + "auxiliary_loss_clip": 0.06414569, + "auxiliary_loss_mlp": 0.01265141, + "balance_loss_clip": 0.06272043, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6455433638959868, + "flos": 20163422482560.0, + "grad_norm": 1.900325282027751, + "language_loss": 0.70704216, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.78383923, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1104126, + "step": 10737, + "time_per_iteration": 2.533444404602051 + }, + { + "auxiliary_loss_clip": 0.06321093, + "auxiliary_loss_mlp": 0.01260403, + "balance_loss_clip": 0.06264752, + "balance_loss_mlp": 0.01258907, + "epoch": 0.6456034871486548, + "flos": 66553391761920.0, + "grad_norm": 0.7654525046837665, + "language_loss": 0.58448923, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.66030419, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01495361, + "step": 10738, + "time_per_iteration": 3.180669069290161 + }, + { + "auxiliary_loss_clip": 0.06409475, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06273897, + "balance_loss_mlp": 0.0125478, + "epoch": 0.6456636104013227, + "flos": 24212819548800.0, + "grad_norm": 2.1666946936849434, + "language_loss": 0.74776822, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.82451332, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1026001, + "step": 10739, + "time_per_iteration": 2.556649923324585 + }, + { + "auxiliary_loss_clip": 0.06415305, + "auxiliary_loss_mlp": 0.01264707, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01254098, + "epoch": 0.6457237336539907, + "flos": 23631013923840.0, + "grad_norm": 1.691973671023819, + "language_loss": 0.71430027, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.79110038, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1060791, + "step": 10740, + "time_per_iteration": 2.5294902324676514 + }, + { + "auxiliary_loss_clip": 0.06311092, + "auxiliary_loss_mlp": 0.0125644, + "balance_loss_clip": 0.06254861, + "balance_loss_mlp": 0.01255001, + "epoch": 0.6457838569066586, + "flos": 65867437111680.0, + "grad_norm": 1.1432056527915397, + "language_loss": 0.55345345, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.62912881, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01437378, + "step": 10741, + "time_per_iteration": 3.1684045791625977 + }, + { + "auxiliary_loss_clip": 0.06412791, + "auxiliary_loss_mlp": 0.01265658, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255149, + "epoch": 0.6458439801593266, + "flos": 22388424600960.0, + "grad_norm": 1.6129388785112204, + "language_loss": 0.80396634, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.88075083, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1050415, + "step": 10742, + "time_per_iteration": 2.5326621532440186 + }, + { + "auxiliary_loss_clip": 0.06404757, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06273461, + "balance_loss_mlp": 0.01254643, + "epoch": 0.6459041034119946, + "flos": 24795966839040.0, + "grad_norm": 1.5649270887964326, + "language_loss": 0.81750703, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.89420575, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10467529, + "step": 10743, + "time_per_iteration": 2.525972366333008 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06271668, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6459642266646626, + "flos": 18330013221120.0, + "grad_norm": 1.6048937891157424, + "language_loss": 0.71681064, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.79352456, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 10744, + "time_per_iteration": 2.571387767791748 + }, + { + "auxiliary_loss_clip": 0.06408056, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01257753, + "epoch": 0.6460243499173305, + "flos": 43591561672320.0, + "grad_norm": 1.9454844326150766, + "language_loss": 0.67213976, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.74889499, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0970459, + "step": 10745, + "time_per_iteration": 2.6937074661254883 + }, + { + "auxiliary_loss_clip": 0.06406983, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.0627151, + "balance_loss_mlp": 0.01257711, + "epoch": 0.6460844731699985, + "flos": 19249925270400.0, + "grad_norm": 2.096395113743082, + "language_loss": 0.74313092, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.81988549, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10778809, + "step": 10746, + "time_per_iteration": 2.5105156898498535 + }, + { + "auxiliary_loss_clip": 0.06413017, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06273216, + "balance_loss_mlp": 0.01254649, + "epoch": 0.6461445964226664, + "flos": 27460624930560.0, + "grad_norm": 1.4939234449131917, + "language_loss": 0.67274344, + "learning_rate": 1.175713157660413e-06, + "loss": 0.74952662, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10644531, + "step": 10747, + "time_per_iteration": 2.5424420833587646 + }, + { + "auxiliary_loss_clip": 0.0641461, + "auxiliary_loss_mlp": 0.01265405, + "balance_loss_clip": 0.0627532, + "balance_loss_mlp": 0.01255272, + "epoch": 0.6462047196753344, + "flos": 20300457035520.0, + "grad_norm": 1.6454594650819265, + "language_loss": 0.67613244, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.75293255, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10137939, + "step": 10748, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01254764, + "epoch": 0.6462648429280025, + "flos": 22024937589120.0, + "grad_norm": 1.9564061615945416, + "language_loss": 0.76055253, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.83735275, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11517334, + "step": 10749, + "time_per_iteration": 2.5083682537078857 + }, + { + "auxiliary_loss_clip": 0.06411772, + "auxiliary_loss_mlp": 0.01264574, + "balance_loss_clip": 0.0627101, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6463249661806704, + "flos": 27788375376000.0, + "grad_norm": 1.4570564957131642, + "language_loss": 0.77334827, + "learning_rate": 1.17464876058473e-06, + "loss": 0.85011172, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10369873, + "step": 10750, + "time_per_iteration": 2.5812573432922363 + }, + { + "auxiliary_loss_clip": 0.06417309, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258268, + "epoch": 0.6463850894333384, + "flos": 22056481451520.0, + "grad_norm": 2.0670822566581437, + "language_loss": 0.6898241, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.76668882, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10900879, + "step": 10751, + "time_per_iteration": 2.4936625957489014 + }, + { + "auxiliary_loss_clip": 0.06414577, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.0125448, + "epoch": 0.6464452126860063, + "flos": 21112698188160.0, + "grad_norm": 1.7780067956451429, + "language_loss": 0.71182156, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.78861868, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10662842, + "step": 10752, + "time_per_iteration": 3.927877426147461 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01253661, + "epoch": 0.6465053359386743, + "flos": 16032531720960.0, + "grad_norm": 1.540910380020274, + "language_loss": 0.77855444, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.85537261, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.12823486, + "step": 10753, + "time_per_iteration": 2.4648597240448 + }, + { + "auxiliary_loss_clip": 0.06412196, + "auxiliary_loss_mlp": 0.01268464, + "balance_loss_clip": 0.0627618, + "balance_loss_mlp": 0.01256871, + "epoch": 0.6465654591913422, + "flos": 23404477610880.0, + "grad_norm": 1.596791967646976, + "language_loss": 0.85541224, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.93221891, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11584473, + "step": 10754, + "time_per_iteration": 2.5978291034698486 + }, + { + "auxiliary_loss_clip": 0.06414384, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01256374, + "epoch": 0.6466255824440102, + "flos": 15382649053440.0, + "grad_norm": 2.138696261718271, + "language_loss": 0.6015234, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.67834014, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10919189, + "step": 10755, + "time_per_iteration": 2.5456504821777344 + }, + { + "auxiliary_loss_clip": 0.06412394, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6466857056966782, + "flos": 16258355274240.0, + "grad_norm": 2.6815820423410845, + "language_loss": 0.68557096, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.76238149, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11633301, + "step": 10756, + "time_per_iteration": 2.4882616996765137 + }, + { + "auxiliary_loss_clip": 0.06423604, + "auxiliary_loss_mlp": 0.01266345, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01255199, + "epoch": 0.6467458289493462, + "flos": 21184548664320.0, + "grad_norm": 2.427580887606393, + "language_loss": 0.74556214, + "learning_rate": 1.172166263444844e-06, + "loss": 0.82246166, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1114502, + "step": 10757, + "time_per_iteration": 2.5800364017486572 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.01268605, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01257357, + "epoch": 0.6468059522020141, + "flos": 17974198857600.0, + "grad_norm": 1.6114695233803533, + "language_loss": 0.74794757, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.82467604, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.11248779, + "step": 10758, + "time_per_iteration": 2.537113666534424 + }, + { + "auxiliary_loss_clip": 0.06411805, + "auxiliary_loss_mlp": 0.0127172, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01259715, + "epoch": 0.6468660754546821, + "flos": 17895178857600.0, + "grad_norm": 1.7921091077439633, + "language_loss": 0.6853838, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.76221907, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11999512, + "step": 10759, + "time_per_iteration": 2.5501279830932617 + }, + { + "auxiliary_loss_clip": 0.06419058, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.0627493, + "balance_loss_mlp": 0.01257666, + "epoch": 0.64692619870735, + "flos": 22607497900800.0, + "grad_norm": 1.5782597023408493, + "language_loss": 0.75492609, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.831806, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11273193, + "step": 10760, + "time_per_iteration": 2.5426504611968994 + }, + { + "auxiliary_loss_clip": 0.06408913, + "auxiliary_loss_mlp": 0.0126904, + "balance_loss_clip": 0.06273125, + "balance_loss_mlp": 0.01258621, + "epoch": 0.646986321960018, + "flos": 49611863750400.0, + "grad_norm": 1.5088139829750542, + "language_loss": 0.65700191, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 10761, + "time_per_iteration": 2.8235716819763184 + }, + { + "auxiliary_loss_clip": 0.0641157, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06273905, + "balance_loss_mlp": 0.0125886, + "epoch": 0.6470464452126861, + "flos": 21914960705280.0, + "grad_norm": 4.087602702214583, + "language_loss": 0.70041698, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.77723515, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11376953, + "step": 10762, + "time_per_iteration": 2.4962708950042725 + }, + { + "auxiliary_loss_clip": 0.06415009, + "auxiliary_loss_mlp": 0.01270412, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01259039, + "epoch": 0.647106568465354, + "flos": 18110688359040.0, + "grad_norm": 2.044366921559264, + "language_loss": 0.82845706, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.90531123, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11364746, + "step": 10763, + "time_per_iteration": 2.5127148628234863 + }, + { + "auxiliary_loss_clip": 0.06315573, + "auxiliary_loss_mlp": 0.01250562, + "balance_loss_clip": 0.06259283, + "balance_loss_mlp": 0.01249394, + "epoch": 0.647166691718022, + "flos": 69499623899520.0, + "grad_norm": 0.6915624783517184, + "language_loss": 0.5774473, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.65310872, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01165009, + "step": 10764, + "time_per_iteration": 4.764317035675049 + }, + { + "auxiliary_loss_clip": 0.06411065, + "auxiliary_loss_mlp": 0.01264999, + "balance_loss_clip": 0.06273772, + "balance_loss_mlp": 0.01254532, + "epoch": 0.6472268149706899, + "flos": 34103793934080.0, + "grad_norm": 1.637421021891431, + "language_loss": 0.60742128, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.68418187, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10467529, + "step": 10765, + "time_per_iteration": 2.6306469440460205 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258924, + "epoch": 0.6472869382233579, + "flos": 28118809152000.0, + "grad_norm": 2.0826927975642273, + "language_loss": 0.63338971, + "learning_rate": 1.168976742243437e-06, + "loss": 0.71016824, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1027832, + "step": 10766, + "time_per_iteration": 2.608025074005127 + }, + { + "auxiliary_loss_clip": 0.06411771, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06273695, + "balance_loss_mlp": 0.01257616, + "epoch": 0.6473470614760258, + "flos": 22498736901120.0, + "grad_norm": 1.6916160768027213, + "language_loss": 0.75775635, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.83455759, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10736084, + "step": 10767, + "time_per_iteration": 3.9129326343536377 + }, + { + "auxiliary_loss_clip": 0.06410106, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01255613, + "epoch": 0.6474071847286939, + "flos": 14544314553600.0, + "grad_norm": 1.8076972632130168, + "language_loss": 0.77841228, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.85518134, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11187744, + "step": 10768, + "time_per_iteration": 2.5130937099456787 + }, + { + "auxiliary_loss_clip": 0.06411847, + "auxiliary_loss_mlp": 0.01266069, + "balance_loss_clip": 0.06274557, + "balance_loss_mlp": 0.01255894, + "epoch": 0.6474673079813618, + "flos": 24105190579200.0, + "grad_norm": 1.6392494709530092, + "language_loss": 0.71794009, + "learning_rate": 1.167914135250663e-06, + "loss": 0.79471928, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10162354, + "step": 10769, + "time_per_iteration": 2.5274879932403564 + }, + { + "auxiliary_loss_clip": 0.06409761, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06276036, + "balance_loss_mlp": 0.01256985, + "epoch": 0.6475274312340298, + "flos": 14981538758400.0, + "grad_norm": 1.8331179769777781, + "language_loss": 0.73102438, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.80779225, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10040283, + "step": 10770, + "time_per_iteration": 2.4902164936065674 + }, + { + "auxiliary_loss_clip": 0.0641522, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254357, + "epoch": 0.6475875544866977, + "flos": 25052202224640.0, + "grad_norm": 1.6464816515513445, + "language_loss": 0.73554993, + "learning_rate": 1.167205888330325e-06, + "loss": 0.81237221, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12646484, + "step": 10771, + "time_per_iteration": 2.5617709159851074 + }, + { + "auxiliary_loss_clip": 0.06412145, + "auxiliary_loss_mlp": 0.0126638, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01255324, + "epoch": 0.6476476777393657, + "flos": 16477763990400.0, + "grad_norm": 2.394956758167514, + "language_loss": 0.74415565, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.82094085, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1105957, + "step": 10772, + "time_per_iteration": 2.54032826423645 + }, + { + "auxiliary_loss_clip": 0.06408937, + "auxiliary_loss_mlp": 0.01268327, + "balance_loss_clip": 0.06275553, + "balance_loss_mlp": 0.01258987, + "epoch": 0.6477078009920336, + "flos": 25819399008000.0, + "grad_norm": 1.4893197324025274, + "language_loss": 0.82968116, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.90645373, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09338379, + "step": 10773, + "time_per_iteration": 3.9616613388061523 + }, + { + "auxiliary_loss_clip": 0.06405786, + "auxiliary_loss_mlp": 0.01267593, + "balance_loss_clip": 0.06272345, + "balance_loss_mlp": 0.01257776, + "epoch": 0.6477679242447016, + "flos": 17681933416320.0, + "grad_norm": 1.4328505723610274, + "language_loss": 0.78670597, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.8634398, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0982666, + "step": 10774, + "time_per_iteration": 2.471349000930786 + }, + { + "auxiliary_loss_clip": 0.06414998, + "auxiliary_loss_mlp": 0.01270742, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01259406, + "epoch": 0.6478280474973696, + "flos": 21038583651840.0, + "grad_norm": 2.0152385899029763, + "language_loss": 0.69592845, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.7727859, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11346436, + "step": 10775, + "time_per_iteration": 2.518340826034546 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256792, + "epoch": 0.6478881707500376, + "flos": 21623449950720.0, + "grad_norm": 1.6656343992417288, + "language_loss": 0.65808022, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.73492104, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10705566, + "step": 10776, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06413212, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06273791, + "balance_loss_mlp": 0.01257343, + "epoch": 0.6479482940027056, + "flos": 18448543221120.0, + "grad_norm": 2.2928682482209015, + "language_loss": 0.79598206, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.87279832, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11096191, + "step": 10777, + "time_per_iteration": 2.554004669189453 + }, + { + "auxiliary_loss_clip": 0.06412454, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06275424, + "balance_loss_mlp": 0.01256311, + "epoch": 0.6480084172553735, + "flos": 22170651039360.0, + "grad_norm": 1.8955877147463427, + "language_loss": 0.74017107, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.81696445, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10571289, + "step": 10778, + "time_per_iteration": 2.5087220668792725 + }, + { + "auxiliary_loss_clip": 0.06407086, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06272884, + "balance_loss_mlp": 0.01253694, + "epoch": 0.6480685405080415, + "flos": 24323089921920.0, + "grad_norm": 1.3775726820823926, + "language_loss": 0.78463447, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.86134601, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10388184, + "step": 10779, + "time_per_iteration": 2.5677905082702637 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01254794, + "balance_loss_clip": 0.06262461, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6481286637607094, + "flos": 59910348539520.0, + "grad_norm": 0.7063734620210058, + "language_loss": 0.59437895, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.67011881, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01268005, + "step": 10780, + "time_per_iteration": 3.11826229095459 + }, + { + "auxiliary_loss_clip": 0.06409959, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6481887870133775, + "flos": 25491313146240.0, + "grad_norm": 1.83776143864241, + "language_loss": 0.79705411, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.87380326, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 10781, + "time_per_iteration": 2.5406956672668457 + }, + { + "auxiliary_loss_clip": 0.06418487, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06278095, + "balance_loss_mlp": 0.01258636, + "epoch": 0.6482489102660454, + "flos": 19935041379840.0, + "grad_norm": 2.151495176949557, + "language_loss": 0.78676552, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.86365616, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11938477, + "step": 10782, + "time_per_iteration": 2.5015201568603516 + }, + { + "auxiliary_loss_clip": 0.06412151, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.06272621, + "balance_loss_mlp": 0.01257525, + "epoch": 0.6483090335187134, + "flos": 26986741764480.0, + "grad_norm": 3.0083350466584378, + "language_loss": 0.64055502, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.71735811, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10632324, + "step": 10783, + "time_per_iteration": 2.536803960800171 + }, + { + "auxiliary_loss_clip": 0.06416991, + "auxiliary_loss_mlp": 0.01269846, + "balance_loss_clip": 0.06275127, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6483691567713813, + "flos": 25084207284480.0, + "grad_norm": 1.8907849838824615, + "language_loss": 0.89016545, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.96703386, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11431885, + "step": 10784, + "time_per_iteration": 2.5452053546905518 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01254636, + "epoch": 0.6484292800240493, + "flos": 16111300158720.0, + "grad_norm": 2.486751490302504, + "language_loss": 0.73449266, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.81126773, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.11798096, + "step": 10785, + "time_per_iteration": 2.4847772121429443 + }, + { + "auxiliary_loss_clip": 0.06405519, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6484894032767172, + "flos": 28848005308800.0, + "grad_norm": 1.4322253483725718, + "language_loss": 0.69456708, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.77128685, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0994873, + "step": 10786, + "time_per_iteration": 2.585789918899536 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01259122, + "epoch": 0.6485495265293852, + "flos": 30234924489600.0, + "grad_norm": 2.0420211875900285, + "language_loss": 0.71877193, + "learning_rate": 1.161544469455041e-06, + "loss": 0.79556048, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10760498, + "step": 10787, + "time_per_iteration": 2.566206216812134 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01266479, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6486096497820532, + "flos": 20088050135040.0, + "grad_norm": 1.7621323533283269, + "language_loss": 0.84403133, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.92081404, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10882568, + "step": 10788, + "time_per_iteration": 2.482072353363037 + }, + { + "auxiliary_loss_clip": 0.06410778, + "auxiliary_loss_mlp": 0.01268935, + "balance_loss_clip": 0.06273876, + "balance_loss_mlp": 0.01258111, + "epoch": 0.6486697730347212, + "flos": 17134816181760.0, + "grad_norm": 2.2095301330311643, + "language_loss": 0.77364171, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.85043883, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10821533, + "step": 10789, + "time_per_iteration": 2.5368380546569824 + }, + { + "auxiliary_loss_clip": 0.06408279, + "auxiliary_loss_mlp": 0.01268929, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01258606, + "epoch": 0.6487298962873892, + "flos": 38921477886720.0, + "grad_norm": 1.570352466870208, + "language_loss": 0.76618487, + "learning_rate": 1.160483857897479e-06, + "loss": 0.8429569, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10327148, + "step": 10790, + "time_per_iteration": 2.6590943336486816 + }, + { + "auxiliary_loss_clip": 0.06408708, + "auxiliary_loss_mlp": 0.01266087, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256169, + "epoch": 0.6487900195400571, + "flos": 11952680895360.0, + "grad_norm": 2.134716405653686, + "language_loss": 0.59979677, + "learning_rate": 1.160130384362823e-06, + "loss": 0.67654467, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09924316, + "step": 10791, + "time_per_iteration": 3.963503360748291 + }, + { + "auxiliary_loss_clip": 0.06410848, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258646, + "epoch": 0.6488501427927251, + "flos": 22350717463680.0, + "grad_norm": 1.5491724826349689, + "language_loss": 0.8594861, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.93628347, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10253906, + "step": 10792, + "time_per_iteration": 2.555723190307617 + }, + { + "auxiliary_loss_clip": 0.06414551, + "auxiliary_loss_mlp": 0.01268197, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256419, + "epoch": 0.648910266045393, + "flos": 22242753077760.0, + "grad_norm": 1.7314529044761888, + "language_loss": 0.78069973, + "learning_rate": 1.159423532850735e-06, + "loss": 0.85752726, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11773682, + "step": 10793, + "time_per_iteration": 2.5019938945770264 + }, + { + "auxiliary_loss_clip": 0.06413871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01257367, + "epoch": 0.6489703892980611, + "flos": 25308269902080.0, + "grad_norm": 1.950729669882986, + "language_loss": 0.74567354, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.82249475, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10882568, + "step": 10794, + "time_per_iteration": 2.5795669555664062 + }, + { + "auxiliary_loss_clip": 0.06410497, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01254655, + "epoch": 0.649030512550729, + "flos": 24578864110080.0, + "grad_norm": 1.8148879038848986, + "language_loss": 0.699453, + "learning_rate": 1.158716808837621e-06, + "loss": 0.77621716, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11273193, + "step": 10795, + "time_per_iteration": 2.538400173187256 + }, + { + "auxiliary_loss_clip": 0.06416844, + "auxiliary_loss_mlp": 0.01273855, + "balance_loss_clip": 0.06276066, + "balance_loss_mlp": 0.01261964, + "epoch": 0.649090635803397, + "flos": 26251004989440.0, + "grad_norm": 1.9678382508243188, + "language_loss": 0.54238826, + "learning_rate": 1.158363494676679e-06, + "loss": 0.61929524, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11895752, + "step": 10796, + "time_per_iteration": 2.6402297019958496 + }, + { + "auxiliary_loss_clip": 0.06412029, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01254767, + "epoch": 0.6491507590560649, + "flos": 24944489400960.0, + "grad_norm": 1.676360773921332, + "language_loss": 0.77936971, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.85614228, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10467529, + "step": 10797, + "time_per_iteration": 2.5467689037323 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01255935, + "epoch": 0.6492108823087329, + "flos": 19505783312640.0, + "grad_norm": 3.2369805565604053, + "language_loss": 0.7037648, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.78047633, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09729004, + "step": 10798, + "time_per_iteration": 2.5187807083129883 + }, + { + "auxiliary_loss_clip": 0.06409095, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06272874, + "balance_loss_mlp": 0.01256493, + "epoch": 0.6492710055614008, + "flos": 19725443591040.0, + "grad_norm": 1.928025975497767, + "language_loss": 0.77484357, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.85159886, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09942627, + "step": 10799, + "time_per_iteration": 2.4996323585510254 + }, + { + "auxiliary_loss_clip": 0.06416353, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274813, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6493311288140688, + "flos": 24324012316800.0, + "grad_norm": 1.6859277521525557, + "language_loss": 0.72046328, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.79731631, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11480713, + "step": 10800, + "time_per_iteration": 2.5757715702056885 + }, + { + "auxiliary_loss_clip": 0.06306565, + "auxiliary_loss_mlp": 0.01256479, + "balance_loss_clip": 0.06250083, + "balance_loss_mlp": 0.01255134, + "epoch": 0.6493912520667368, + "flos": 70953655800960.0, + "grad_norm": 0.743676703722325, + "language_loss": 0.60158885, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.67721927, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01346588, + "step": 10801, + "time_per_iteration": 3.246039867401123 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01272232, + "balance_loss_clip": 0.06277107, + "balance_loss_mlp": 0.01260782, + "epoch": 0.6494513753194048, + "flos": 25344803082240.0, + "grad_norm": 1.7594241437691729, + "language_loss": 0.78884411, + "learning_rate": 1.156244280393614e-06, + "loss": 0.86572272, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11456299, + "step": 10802, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06407687, + "auxiliary_loss_mlp": 0.01265006, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01254385, + "epoch": 0.6495114985720728, + "flos": 24689050629120.0, + "grad_norm": 1.4701116877862836, + "language_loss": 0.7461825, + "learning_rate": 1.155891189918541e-06, + "loss": 0.82290947, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10620117, + "step": 10803, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.06410737, + "auxiliary_loss_mlp": 0.01268913, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6495716218247407, + "flos": 23656520292480.0, + "grad_norm": 2.024891036997784, + "language_loss": 0.6987229, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.77551937, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10803223, + "step": 10804, + "time_per_iteration": 3.998316526412964 + }, + { + "auxiliary_loss_clip": 0.06410199, + "auxiliary_loss_mlp": 0.01264742, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.01254019, + "epoch": 0.6496317450774087, + "flos": 22352729961600.0, + "grad_norm": 1.61833096357978, + "language_loss": 0.72940427, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.80615366, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1072998, + "step": 10805, + "time_per_iteration": 2.550152540206909 + }, + { + "auxiliary_loss_clip": 0.06408597, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06270424, + "balance_loss_mlp": 0.01258384, + "epoch": 0.6496918683300766, + "flos": 30526519098240.0, + "grad_norm": 1.9854028073217467, + "language_loss": 0.66420656, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.74097693, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 10806, + "time_per_iteration": 4.017642021179199 + }, + { + "auxiliary_loss_clip": 0.06412096, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06272469, + "balance_loss_mlp": 0.01259587, + "epoch": 0.6497519915827447, + "flos": 12463977709440.0, + "grad_norm": 2.120421469188937, + "language_loss": 0.79874885, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.87557387, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10821533, + "step": 10807, + "time_per_iteration": 2.47318959236145 + }, + { + "auxiliary_loss_clip": 0.06308749, + "auxiliary_loss_mlp": 0.01254009, + "balance_loss_clip": 0.06252696, + "balance_loss_mlp": 0.01252862, + "epoch": 0.6498121148354126, + "flos": 69115787544960.0, + "grad_norm": 0.7752767775633225, + "language_loss": 0.5892998, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.66492736, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01145935, + "step": 10808, + "time_per_iteration": 3.316317319869995 + }, + { + "auxiliary_loss_clip": 0.06407646, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01257043, + "epoch": 0.6498722380880806, + "flos": 36904983454080.0, + "grad_norm": 1.693655644054658, + "language_loss": 0.63518184, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.71192998, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10131836, + "step": 10809, + "time_per_iteration": 2.6661953926086426 + }, + { + "auxiliary_loss_clip": 0.06407648, + "auxiliary_loss_mlp": 0.01268298, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6499323613407485, + "flos": 29024549861760.0, + "grad_norm": 1.455455865849343, + "language_loss": 0.81994486, + "learning_rate": 1.153420453586008e-06, + "loss": 0.89670432, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09796143, + "step": 10810, + "time_per_iteration": 2.582893133163452 + }, + { + "auxiliary_loss_clip": 0.06403928, + "auxiliary_loss_mlp": 0.01273294, + "balance_loss_clip": 0.06272624, + "balance_loss_mlp": 0.01263382, + "epoch": 0.6499924845934165, + "flos": 20125212220800.0, + "grad_norm": 1.5531414073118446, + "language_loss": 0.71929145, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.79606366, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 10811, + "time_per_iteration": 2.5130205154418945 + }, + { + "auxiliary_loss_clip": 0.06403043, + "auxiliary_loss_mlp": 0.01269239, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01259273, + "epoch": 0.6500526078460844, + "flos": 24427490509440.0, + "grad_norm": 1.5864651817553501, + "language_loss": 0.78127778, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.85800058, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09967041, + "step": 10812, + "time_per_iteration": 2.5567028522491455 + }, + { + "auxiliary_loss_clip": 0.06411995, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06273413, + "balance_loss_mlp": 0.01258887, + "epoch": 0.6501127310987524, + "flos": 23337700306560.0, + "grad_norm": 1.8208092909693303, + "language_loss": 0.85530257, + "learning_rate": 1.152362047854413e-06, + "loss": 0.93212128, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10992432, + "step": 10813, + "time_per_iteration": 3.9791102409362793 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01257955, + "epoch": 0.6501728543514204, + "flos": 18703814284800.0, + "grad_norm": 1.7861415482224605, + "language_loss": 0.80307227, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.87985992, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10424805, + "step": 10814, + "time_per_iteration": 2.4790940284729004 + }, + { + "auxiliary_loss_clip": 0.06415637, + "auxiliary_loss_mlp": 0.01266919, + "balance_loss_clip": 0.06275604, + "balance_loss_mlp": 0.01256119, + "epoch": 0.6502329776040884, + "flos": 44209858550400.0, + "grad_norm": 1.5485248232594282, + "language_loss": 0.65536499, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.73219061, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10803223, + "step": 10815, + "time_per_iteration": 2.7446234226226807 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06274943, + "balance_loss_mlp": 0.01257667, + "epoch": 0.6502931008567564, + "flos": 14580009192960.0, + "grad_norm": 1.8474906541134053, + "language_loss": 0.75516546, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.83205009, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.12841797, + "step": 10816, + "time_per_iteration": 2.4595513343811035 + }, + { + "auxiliary_loss_clip": 0.06411922, + "auxiliary_loss_mlp": 0.01272269, + "balance_loss_clip": 0.06278138, + "balance_loss_mlp": 0.01261845, + "epoch": 0.6503532241094243, + "flos": 21400980560640.0, + "grad_norm": 1.6906297848786114, + "language_loss": 0.73428237, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.81112432, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 10817, + "time_per_iteration": 2.5484201908111572 + }, + { + "auxiliary_loss_clip": 0.06410678, + "auxiliary_loss_mlp": 0.01266458, + "balance_loss_clip": 0.06273761, + "balance_loss_mlp": 0.01255783, + "epoch": 0.6504133473620923, + "flos": 74756349648000.0, + "grad_norm": 1.454828626029086, + "language_loss": 0.71655715, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.79332852, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10675049, + "step": 10818, + "time_per_iteration": 2.908658504486084 + }, + { + "auxiliary_loss_clip": 0.06415702, + "auxiliary_loss_mlp": 0.01267764, + "balance_loss_clip": 0.06276265, + "balance_loss_mlp": 0.01257261, + "epoch": 0.6504734706147602, + "flos": 19718399848320.0, + "grad_norm": 2.191602402717942, + "language_loss": 0.64758539, + "learning_rate": 1.150246104600249e-06, + "loss": 0.72442001, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10510254, + "step": 10819, + "time_per_iteration": 2.5333735942840576 + }, + { + "auxiliary_loss_clip": 0.06412923, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01257849, + "epoch": 0.6505335938674283, + "flos": 25563960236160.0, + "grad_norm": 1.7905989506117173, + "language_loss": 0.83637512, + "learning_rate": 1.14989356009286e-06, + "loss": 0.91318899, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10614014, + "step": 10820, + "time_per_iteration": 2.5265371799468994 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06276121, + "balance_loss_mlp": 0.01256278, + "epoch": 0.6505937171200962, + "flos": 17827143742080.0, + "grad_norm": 2.110303525663697, + "language_loss": 0.78078735, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.85763657, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11755371, + "step": 10821, + "time_per_iteration": 2.5157594680786133 + }, + { + "auxiliary_loss_clip": 0.06407174, + "auxiliary_loss_mlp": 0.01267611, + "balance_loss_clip": 0.06274926, + "balance_loss_mlp": 0.01258193, + "epoch": 0.6506538403727642, + "flos": 20674467734400.0, + "grad_norm": 1.345963122833849, + "language_loss": 0.79950106, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.8762489, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09417725, + "step": 10822, + "time_per_iteration": 2.556008815765381 + }, + { + "auxiliary_loss_clip": 0.06409828, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254489, + "epoch": 0.6507139636254321, + "flos": 11724970625280.0, + "grad_norm": 1.7704738467059193, + "language_loss": 0.87903178, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.95578313, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1081543, + "step": 10823, + "time_per_iteration": 2.5153284072875977 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01252885, + "epoch": 0.6507740868781001, + "flos": 26769177838080.0, + "grad_norm": 1.5876907781405154, + "language_loss": 0.66698307, + "learning_rate": 1.148483704558183e-06, + "loss": 0.74372518, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 10824, + "time_per_iteration": 2.5415477752685547 + }, + { + "auxiliary_loss_clip": 0.06414588, + "auxiliary_loss_mlp": 0.01270098, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01259471, + "epoch": 0.650834210130768, + "flos": 16477260865920.0, + "grad_norm": 2.5628817527572365, + "language_loss": 0.88034272, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.95718956, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10632324, + "step": 10825, + "time_per_iteration": 2.5432024002075195 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01269359, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.012577, + "epoch": 0.650894333383436, + "flos": 17134354984320.0, + "grad_norm": 2.078178971450375, + "language_loss": 0.73451078, + "learning_rate": 1.147778970474885e-06, + "loss": 0.81136155, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11651611, + "step": 10826, + "time_per_iteration": 2.483405113220215 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255057, + "epoch": 0.650954456636104, + "flos": 18740221683840.0, + "grad_norm": 2.050300118391263, + "language_loss": 0.69847488, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.7752744, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10107422, + "step": 10827, + "time_per_iteration": 2.529306650161743 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.0627773, + "balance_loss_mlp": 0.01256479, + "epoch": 0.651014579888772, + "flos": 24533987448960.0, + "grad_norm": 2.390068067700356, + "language_loss": 0.77023715, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.84707546, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10828, + "time_per_iteration": 2.5035903453826904 + }, + { + "auxiliary_loss_clip": 0.06409818, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06275382, + "balance_loss_mlp": 0.01252961, + "epoch": 0.65107470314144, + "flos": 24067944639360.0, + "grad_norm": 1.7088923896554455, + "language_loss": 0.89246607, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.96919769, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10388184, + "step": 10829, + "time_per_iteration": 2.51090931892395 + }, + { + "auxiliary_loss_clip": 0.06314664, + "auxiliary_loss_mlp": 0.0125328, + "balance_loss_clip": 0.06258522, + "balance_loss_mlp": 0.01251908, + "epoch": 0.6511348263941079, + "flos": 72502304561280.0, + "grad_norm": 0.6366010219235949, + "language_loss": 0.55376649, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.62944591, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01374817, + "step": 10830, + "time_per_iteration": 3.2892563343048096 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01266709, + "balance_loss_clip": 0.06282428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6511949496467759, + "flos": 23374401194880.0, + "grad_norm": 2.1202653739592026, + "language_loss": 0.75132632, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.82824159, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11334229, + "step": 10831, + "time_per_iteration": 4.007694482803345 + }, + { + "auxiliary_loss_clip": 0.06315142, + "auxiliary_loss_mlp": 0.01253248, + "balance_loss_clip": 0.06259014, + "balance_loss_mlp": 0.01251801, + "epoch": 0.6512550728994438, + "flos": 67353390218880.0, + "grad_norm": 0.6347055670227107, + "language_loss": 0.51072258, + "learning_rate": 1.145665544243828e-06, + "loss": 0.58640647, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01445007, + "step": 10832, + "time_per_iteration": 3.2983696460723877 + }, + { + "auxiliary_loss_clip": 0.06417792, + "auxiliary_loss_mlp": 0.01264906, + "balance_loss_clip": 0.06276103, + "balance_loss_mlp": 0.01254195, + "epoch": 0.6513151961521119, + "flos": 21147973557120.0, + "grad_norm": 2.2140276605758693, + "language_loss": 0.8367548, + "learning_rate": 1.145313419848316e-06, + "loss": 0.91358173, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10699463, + "step": 10833, + "time_per_iteration": 2.511261463165283 + }, + { + "auxiliary_loss_clip": 0.06416205, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01255471, + "epoch": 0.6513753194047798, + "flos": 15164246586240.0, + "grad_norm": 10.86743731426701, + "language_loss": 0.84111547, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.9179405, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1083374, + "step": 10834, + "time_per_iteration": 2.4789986610412598 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06278732, + "balance_loss_mlp": 0.01257979, + "epoch": 0.6514354426574478, + "flos": 30234421365120.0, + "grad_norm": 1.7456774308536143, + "language_loss": 0.77525127, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.85209417, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09918213, + "step": 10835, + "time_per_iteration": 2.588974714279175 + }, + { + "auxiliary_loss_clip": 0.06414215, + "auxiliary_loss_mlp": 0.01268341, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6514955659101157, + "flos": 24212232570240.0, + "grad_norm": 5.683759297238724, + "language_loss": 0.77732491, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.85415047, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10717773, + "step": 10836, + "time_per_iteration": 2.5676357746124268 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01256552, + "epoch": 0.6515556891627837, + "flos": 12381351984000.0, + "grad_norm": 1.8169643503490496, + "language_loss": 0.82167637, + "learning_rate": 1.143905246497783e-06, + "loss": 0.8984952, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.1060791, + "step": 10837, + "time_per_iteration": 2.483123779296875 + }, + { + "auxiliary_loss_clip": 0.06414027, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01258753, + "epoch": 0.6516158124154516, + "flos": 49612366874880.0, + "grad_norm": 1.9745505880128194, + "language_loss": 0.59549761, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.67233551, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.11004639, + "step": 10838, + "time_per_iteration": 2.762786865234375 + }, + { + "auxiliary_loss_clip": 0.06317103, + "auxiliary_loss_mlp": 0.01253866, + "balance_loss_clip": 0.06261341, + "balance_loss_mlp": 0.01252529, + "epoch": 0.6516759356681197, + "flos": 59720848531200.0, + "grad_norm": 0.7135395932752281, + "language_loss": 0.60686612, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.68257582, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01338196, + "step": 10839, + "time_per_iteration": 3.223712921142578 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.01261941, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01252809, + "epoch": 0.6517360589207876, + "flos": 37459815264000.0, + "grad_norm": 1.5945463275519725, + "language_loss": 0.67963755, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.75638568, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09136963, + "step": 10840, + "time_per_iteration": 2.6288609504699707 + }, + { + "auxiliary_loss_clip": 0.06418526, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254264, + "epoch": 0.6517961821734556, + "flos": 25382049022080.0, + "grad_norm": 2.724184034803811, + "language_loss": 0.73645818, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10461426, + "step": 10841, + "time_per_iteration": 2.6020925045013428 + }, + { + "auxiliary_loss_clip": 0.06416935, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0627799, + "balance_loss_mlp": 0.01252632, + "epoch": 0.6518563054261236, + "flos": 28774519678080.0, + "grad_norm": 1.3493483862035613, + "language_loss": 0.6300385, + "learning_rate": 1.142145760331648e-06, + "loss": 0.7068457, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11151123, + "step": 10842, + "time_per_iteration": 2.550992012023926 + }, + { + "auxiliary_loss_clip": 0.06321006, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06265368, + "balance_loss_mlp": 0.01249527, + "epoch": 0.6519164286787915, + "flos": 68942905372800.0, + "grad_norm": 0.8268303815829595, + "language_loss": 0.56121087, + "learning_rate": 1.141793960634807e-06, + "loss": 0.6369288, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01258087, + "step": 10843, + "time_per_iteration": 4.4302709102630615 + }, + { + "auxiliary_loss_clip": 0.06418709, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01256844, + "epoch": 0.6519765519314595, + "flos": 20447009026560.0, + "grad_norm": 1.9018808017225726, + "language_loss": 0.83082736, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.90770137, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11846924, + "step": 10844, + "time_per_iteration": 2.600843906402588 + }, + { + "auxiliary_loss_clip": 0.06412451, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06274604, + "balance_loss_mlp": 0.01254598, + "epoch": 0.6520366751841274, + "flos": 28410571468800.0, + "grad_norm": 1.712600797448846, + "language_loss": 0.60434437, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.68112737, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11260986, + "step": 10845, + "time_per_iteration": 2.5539886951446533 + }, + { + "auxiliary_loss_clip": 0.0641913, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.0628117, + "balance_loss_mlp": 0.01256964, + "epoch": 0.6520967984367955, + "flos": 22279999017600.0, + "grad_norm": 1.7154837264423382, + "language_loss": 0.79721403, + "learning_rate": 1.140738756857194e-06, + "loss": 0.87408507, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11010742, + "step": 10846, + "time_per_iteration": 3.9483704566955566 + }, + { + "auxiliary_loss_clip": 0.06323321, + "auxiliary_loss_mlp": 0.01252083, + "balance_loss_clip": 0.06267467, + "balance_loss_mlp": 0.01250644, + "epoch": 0.6521569216894634, + "flos": 68940123459840.0, + "grad_norm": 0.9959560363450068, + "language_loss": 0.60117191, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.67692602, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01437378, + "step": 10847, + "time_per_iteration": 3.259263277053833 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255196, + "epoch": 0.6522170449421314, + "flos": 29137880908800.0, + "grad_norm": 1.6024469489184654, + "language_loss": 0.81200469, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.88886106, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11322021, + "step": 10848, + "time_per_iteration": 2.5693862438201904 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.0127236, + "balance_loss_clip": 0.06276944, + "balance_loss_mlp": 0.01262072, + "epoch": 0.6522771681947993, + "flos": 26659284808320.0, + "grad_norm": 2.0899993216020527, + "language_loss": 0.74621618, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.82307267, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10284424, + "step": 10849, + "time_per_iteration": 2.636046886444092 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01258129, + "epoch": 0.6523372914474673, + "flos": 25746961553280.0, + "grad_norm": 1.4470039882385268, + "language_loss": 0.68371421, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.76052451, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1072998, + "step": 10850, + "time_per_iteration": 2.52057147026062 + }, + { + "auxiliary_loss_clip": 0.0640863, + "auxiliary_loss_mlp": 0.01263783, + "balance_loss_clip": 0.06275396, + "balance_loss_mlp": 0.01253752, + "epoch": 0.6523974147001352, + "flos": 24834344808960.0, + "grad_norm": 1.562549828159254, + "language_loss": 0.67212379, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.7488479, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10046387, + "step": 10851, + "time_per_iteration": 2.5808029174804688 + }, + { + "auxiliary_loss_clip": 0.06416307, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01254386, + "epoch": 0.6524575379528033, + "flos": 26323945568640.0, + "grad_norm": 2.0070314818502695, + "language_loss": 0.7443608, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.8211745, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10687256, + "step": 10852, + "time_per_iteration": 4.013243675231934 + }, + { + "auxiliary_loss_clip": 0.0641986, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06278665, + "balance_loss_mlp": 0.01257191, + "epoch": 0.6525176612054712, + "flos": 19499200767360.0, + "grad_norm": 1.9187417240841533, + "language_loss": 0.67066777, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.74755299, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11468506, + "step": 10853, + "time_per_iteration": 2.506601572036743 + }, + { + "auxiliary_loss_clip": 0.06318477, + "auxiliary_loss_mlp": 0.01256063, + "balance_loss_clip": 0.06262536, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6525777844581392, + "flos": 71727057786240.0, + "grad_norm": 0.715298954462881, + "language_loss": 0.63038433, + "learning_rate": 1.137926314758634e-06, + "loss": 0.70612979, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01324463, + "step": 10854, + "time_per_iteration": 3.2700932025909424 + }, + { + "auxiliary_loss_clip": 0.06413402, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06275877, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6526379077108072, + "flos": 26660668400640.0, + "grad_norm": 1.6617688619573214, + "language_loss": 0.77541685, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.85223043, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11407471, + "step": 10855, + "time_per_iteration": 2.5642480850219727 + }, + { + "auxiliary_loss_clip": 0.06405862, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01255174, + "epoch": 0.6526980309634751, + "flos": 22826990471040.0, + "grad_norm": 1.7631241717885235, + "language_loss": 0.79621822, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.87293208, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10345459, + "step": 10856, + "time_per_iteration": 2.537353992462158 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256408, + "epoch": 0.6527581542161431, + "flos": 28372403134080.0, + "grad_norm": 1.6923564955573929, + "language_loss": 0.73936152, + "learning_rate": 1.136872187988815e-06, + "loss": 0.81612456, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11077881, + "step": 10857, + "time_per_iteration": 2.5426032543182373 + }, + { + "auxiliary_loss_clip": 0.06409546, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06273436, + "balance_loss_mlp": 0.01256195, + "epoch": 0.652818277468811, + "flos": 18375099517440.0, + "grad_norm": 2.1707425213383136, + "language_loss": 0.63389534, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.71065563, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 10858, + "time_per_iteration": 2.495542049407959 + }, + { + "auxiliary_loss_clip": 0.06408103, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01254784, + "epoch": 0.6528784007214791, + "flos": 18041227724160.0, + "grad_norm": 1.644037371034234, + "language_loss": 0.78852642, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.86525851, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10333252, + "step": 10859, + "time_per_iteration": 2.5497894287109375 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01263542, + "balance_loss_clip": 0.06273727, + "balance_loss_mlp": 0.01252611, + "epoch": 0.652938523974147, + "flos": 22388466528000.0, + "grad_norm": 1.5493254250566866, + "language_loss": 0.67967153, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.75645357, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10931396, + "step": 10860, + "time_per_iteration": 2.5913808345794678 + }, + { + "auxiliary_loss_clip": 0.06418759, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06276677, + "balance_loss_mlp": 0.01257426, + "epoch": 0.652998647226815, + "flos": 16769694015360.0, + "grad_norm": 1.8207811146767594, + "language_loss": 0.67290318, + "learning_rate": 1.135467143909712e-06, + "loss": 0.74977076, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10583496, + "step": 10861, + "time_per_iteration": 2.50136137008667 + }, + { + "auxiliary_loss_clip": 0.06415796, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06276291, + "balance_loss_mlp": 0.01254886, + "epoch": 0.6530587704794829, + "flos": 35781259547520.0, + "grad_norm": 2.0180062200449744, + "language_loss": 0.65632504, + "learning_rate": 1.135115964814572e-06, + "loss": 0.733145, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11309814, + "step": 10862, + "time_per_iteration": 2.7082483768463135 + }, + { + "auxiliary_loss_clip": 0.06413227, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06276508, + "balance_loss_mlp": 0.01256912, + "epoch": 0.6531188937321509, + "flos": 19321901527680.0, + "grad_norm": 1.7523951884589628, + "language_loss": 0.77599865, + "learning_rate": 1.13476481851592e-06, + "loss": 0.85280204, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10198975, + "step": 10863, + "time_per_iteration": 2.525467872619629 + }, + { + "auxiliary_loss_clip": 0.06412541, + "auxiliary_loss_mlp": 0.01266016, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6531790169848188, + "flos": 22900476101760.0, + "grad_norm": 1.5537645301307006, + "language_loss": 0.74952781, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.82631332, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10430908, + "step": 10864, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06410347, + "auxiliary_loss_mlp": 0.01267199, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6532391402374869, + "flos": 29570157722880.0, + "grad_norm": 1.9052418824081008, + "language_loss": 0.86169875, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.93847424, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1026001, + "step": 10865, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.0627698, + "balance_loss_mlp": 0.01258926, + "epoch": 0.6532992634901548, + "flos": 23110996285440.0, + "grad_norm": 1.6108799527314137, + "language_loss": 0.81515527, + "learning_rate": 1.133711576532051e-06, + "loss": 0.8920275, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10357666, + "step": 10866, + "time_per_iteration": 2.5684125423431396 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06275405, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6533593867428228, + "flos": 26074460436480.0, + "grad_norm": 1.6718467663998162, + "language_loss": 0.82545173, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.90221351, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10614014, + "step": 10867, + "time_per_iteration": 2.5475850105285645 + }, + { + "auxiliary_loss_clip": 0.06413805, + "auxiliary_loss_mlp": 0.01264816, + "balance_loss_clip": 0.06276451, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6534195099954908, + "flos": 21218398513920.0, + "grad_norm": 1.6506076303544417, + "language_loss": 0.81211448, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.88890064, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1050415, + "step": 10868, + "time_per_iteration": 2.5498743057250977 + }, + { + "auxiliary_loss_clip": 0.06418251, + "auxiliary_loss_mlp": 0.01266421, + "balance_loss_clip": 0.06277823, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6534796332481587, + "flos": 19652754574080.0, + "grad_norm": 1.774479415812712, + "language_loss": 0.7959047, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.87275141, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 10869, + "time_per_iteration": 2.5166242122650146 + }, + { + "auxiliary_loss_clip": 0.06413683, + "auxiliary_loss_mlp": 0.01266573, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256172, + "epoch": 0.6535397565008267, + "flos": 24028979690880.0, + "grad_norm": 2.0325113837901703, + "language_loss": 0.72014058, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.79694319, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10406494, + "step": 10870, + "time_per_iteration": 2.5486953258514404 + }, + { + "auxiliary_loss_clip": 0.06413276, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01254584, + "epoch": 0.6535998797534947, + "flos": 24608772817920.0, + "grad_norm": 1.9753517025590153, + "language_loss": 0.74408901, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.82087243, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10479736, + "step": 10871, + "time_per_iteration": 4.039932489395142 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01263739, + "balance_loss_clip": 0.06277861, + "balance_loss_mlp": 0.01253791, + "epoch": 0.6536600030061627, + "flos": 23370292344960.0, + "grad_norm": 1.4980578991412412, + "language_loss": 0.56041443, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.6371575, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0994873, + "step": 10872, + "time_per_iteration": 2.502490282058716 + }, + { + "auxiliary_loss_clip": 0.06416132, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6537201262588306, + "flos": 23885278738560.0, + "grad_norm": 1.5337992373700162, + "language_loss": 0.75344592, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.8302865, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11022949, + "step": 10873, + "time_per_iteration": 2.5598514080047607 + }, + { + "auxiliary_loss_clip": 0.06410979, + "auxiliary_loss_mlp": 0.012657, + "balance_loss_clip": 0.06274614, + "balance_loss_mlp": 0.01255585, + "epoch": 0.6537802495114986, + "flos": 24361971016320.0, + "grad_norm": 1.420531378230647, + "language_loss": 0.76059687, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.8373636, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10113525, + "step": 10874, + "time_per_iteration": 2.549380302429199 + }, + { + "auxiliary_loss_clip": 0.06415659, + "auxiliary_loss_mlp": 0.01268814, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6538403727641665, + "flos": 28003633315200.0, + "grad_norm": 1.5256219818178185, + "language_loss": 0.81805712, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.89490187, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10784912, + "step": 10875, + "time_per_iteration": 2.583240270614624 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06273378, + "balance_loss_mlp": 0.01256372, + "epoch": 0.6539004960168345, + "flos": 27571021084800.0, + "grad_norm": 1.6524409835803482, + "language_loss": 0.69961172, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.77639741, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10552979, + "step": 10876, + "time_per_iteration": 2.53607439994812 + }, + { + "auxiliary_loss_clip": 0.0641342, + "auxiliary_loss_mlp": 0.01265066, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01254177, + "epoch": 0.6539606192695024, + "flos": 14533958574720.0, + "grad_norm": 1.8504141345372043, + "language_loss": 0.79613322, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.87291813, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 10877, + "time_per_iteration": 2.482450246810913 + }, + { + "auxiliary_loss_clip": 0.0641083, + "auxiliary_loss_mlp": 0.01271317, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01260976, + "epoch": 0.6540207425221705, + "flos": 21622779118080.0, + "grad_norm": 2.1988791511764507, + "language_loss": 0.80130821, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.87812972, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10333252, + "step": 10878, + "time_per_iteration": 2.4935176372528076 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01256185, + "epoch": 0.6540808657748384, + "flos": 17673589935360.0, + "grad_norm": 2.582136269580718, + "language_loss": 0.8441155, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.92088807, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11309814, + "step": 10879, + "time_per_iteration": 2.478309392929077 + }, + { + "auxiliary_loss_clip": 0.06413597, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06273437, + "balance_loss_mlp": 0.01255937, + "epoch": 0.6541409890275064, + "flos": 14543559866880.0, + "grad_norm": 2.245673949677598, + "language_loss": 0.72627622, + "learning_rate": 1.128800362199601e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11004639, + "step": 10880, + "time_per_iteration": 2.448975086212158 + }, + { + "auxiliary_loss_clip": 0.06410271, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06275423, + "balance_loss_mlp": 0.01258899, + "epoch": 0.6542011122801744, + "flos": 17171013945600.0, + "grad_norm": 1.8546451564603688, + "language_loss": 0.84333724, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.92013222, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10333252, + "step": 10881, + "time_per_iteration": 2.5005478858947754 + }, + { + "auxiliary_loss_clip": 0.06415182, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01252995, + "epoch": 0.6542612355328423, + "flos": 18192433616640.0, + "grad_norm": 1.7673801500025483, + "language_loss": 0.78099298, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.85779178, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11700439, + "step": 10882, + "time_per_iteration": 2.4750256538391113 + }, + { + "auxiliary_loss_clip": 0.06413694, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01257837, + "epoch": 0.6543213587855103, + "flos": 19798635732480.0, + "grad_norm": 1.55805041018917, + "language_loss": 0.81790304, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.89472985, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 10883, + "time_per_iteration": 3.958979368209839 + }, + { + "auxiliary_loss_clip": 0.06415352, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06277536, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6543814820381783, + "flos": 21111356522880.0, + "grad_norm": 2.318256186808643, + "language_loss": 0.85692853, + "learning_rate": 1.127398345803988e-06, + "loss": 0.93375945, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11322021, + "step": 10884, + "time_per_iteration": 2.4991559982299805 + }, + { + "auxiliary_loss_clip": 0.06414054, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01252623, + "epoch": 0.6544416052908463, + "flos": 20200333006080.0, + "grad_norm": 2.0262705152465985, + "language_loss": 0.8030138, + "learning_rate": 1.127047924394715e-06, + "loss": 0.87978739, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10687256, + "step": 10885, + "time_per_iteration": 3.945915699005127 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01259468, + "epoch": 0.6545017285435142, + "flos": 23375072027520.0, + "grad_norm": 1.9399514462864902, + "language_loss": 0.72038162, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.79720581, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10339355, + "step": 10886, + "time_per_iteration": 2.592869520187378 + }, + { + "auxiliary_loss_clip": 0.06412855, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01252591, + "epoch": 0.6545618517961822, + "flos": 19140619219200.0, + "grad_norm": 1.841753490100957, + "language_loss": 0.78875196, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.86550403, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09765625, + "step": 10887, + "time_per_iteration": 2.4951751232147217 + }, + { + "auxiliary_loss_clip": 0.06415602, + "auxiliary_loss_mlp": 0.0126552, + "balance_loss_clip": 0.06278757, + "balance_loss_mlp": 0.01255346, + "epoch": 0.6546219750488501, + "flos": 14943789694080.0, + "grad_norm": 1.7286309451287045, + "language_loss": 0.791143, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.86795419, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10168457, + "step": 10888, + "time_per_iteration": 2.5363447666168213 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266895, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6546820983015181, + "flos": 36329466885120.0, + "grad_norm": 1.4489059834180797, + "language_loss": 0.66680413, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.7436139, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09869385, + "step": 10889, + "time_per_iteration": 2.631702184677124 + }, + { + "auxiliary_loss_clip": 0.06413323, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06274237, + "balance_loss_mlp": 0.01255359, + "epoch": 0.654742221554186, + "flos": 20417519589120.0, + "grad_norm": 1.4090787224296468, + "language_loss": 0.80175591, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.87856597, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.12322998, + "step": 10890, + "time_per_iteration": 2.553987503051758 + }, + { + "auxiliary_loss_clip": 0.06413622, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06272978, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6548023448068541, + "flos": 24870626426880.0, + "grad_norm": 1.9658735826984712, + "language_loss": 0.66080928, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.73761332, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10827637, + "step": 10891, + "time_per_iteration": 3.981126546859741 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01268584, + "balance_loss_clip": 0.06276606, + "balance_loss_mlp": 0.01258314, + "epoch": 0.654862468059522, + "flos": 21432901766400.0, + "grad_norm": 1.7619514062333756, + "language_loss": 0.80124283, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.87804967, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1027832, + "step": 10892, + "time_per_iteration": 2.487014055252075 + }, + { + "auxiliary_loss_clip": 0.06417862, + "auxiliary_loss_mlp": 0.01267184, + "balance_loss_clip": 0.06275848, + "balance_loss_mlp": 0.01256502, + "epoch": 0.65492259131219, + "flos": 26585002563840.0, + "grad_norm": 1.8517707324094554, + "language_loss": 0.78348118, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.86033165, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10675049, + "step": 10893, + "time_per_iteration": 2.5751121044158936 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01265779, + "balance_loss_clip": 0.06276494, + "balance_loss_mlp": 0.01254126, + "epoch": 0.6549827145648579, + "flos": 21506806667520.0, + "grad_norm": 1.5510106151766068, + "language_loss": 0.70386314, + "learning_rate": 1.123895622914766e-06, + "loss": 0.78068686, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11651611, + "step": 10894, + "time_per_iteration": 2.492877721786499 + }, + { + "auxiliary_loss_clip": 0.06416629, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06276509, + "balance_loss_mlp": 0.01252959, + "epoch": 0.6550428378175259, + "flos": 22599657544320.0, + "grad_norm": 2.852975580128828, + "language_loss": 0.62881947, + "learning_rate": 1.123545533127549e-06, + "loss": 0.70563233, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11712646, + "step": 10895, + "time_per_iteration": 2.508265733718872 + }, + { + "auxiliary_loss_clip": 0.06409365, + "auxiliary_loss_mlp": 0.01264591, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.0125487, + "epoch": 0.655102961070194, + "flos": 12828848313600.0, + "grad_norm": 1.7300998551667346, + "language_loss": 0.79205835, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.8687979, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.097229, + "step": 10896, + "time_per_iteration": 2.4711906909942627 + }, + { + "auxiliary_loss_clip": 0.06409965, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06276735, + "balance_loss_mlp": 0.01257417, + "epoch": 0.6551630843228619, + "flos": 24798105118080.0, + "grad_norm": 1.3882264371892772, + "language_loss": 0.70543504, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.78220963, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10076904, + "step": 10897, + "time_per_iteration": 2.6822469234466553 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06276685, + "balance_loss_mlp": 0.01257628, + "epoch": 0.6552232075755299, + "flos": 16729597036800.0, + "grad_norm": 1.5280933060289523, + "language_loss": 0.75582546, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.83268768, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10717773, + "step": 10898, + "time_per_iteration": 2.475172519683838 + }, + { + "auxiliary_loss_clip": 0.06413586, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.0125986, + "epoch": 0.6552833308281978, + "flos": 22022757383040.0, + "grad_norm": 2.1698837802172193, + "language_loss": 0.7396723, + "learning_rate": 1.122145506463827e-06, + "loss": 0.81650698, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10028076, + "step": 10899, + "time_per_iteration": 2.5430071353912354 + }, + { + "auxiliary_loss_clip": 0.06414597, + "auxiliary_loss_mlp": 0.0126991, + "balance_loss_clip": 0.06275821, + "balance_loss_mlp": 0.01259229, + "epoch": 0.6553434540808658, + "flos": 24870332937600.0, + "grad_norm": 2.0271227306533346, + "language_loss": 0.56131774, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.63816285, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10693359, + "step": 10900, + "time_per_iteration": 2.5413925647735596 + }, + { + "auxiliary_loss_clip": 0.06419879, + "auxiliary_loss_mlp": 0.01265514, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01254511, + "epoch": 0.6554035773335337, + "flos": 23227639568640.0, + "grad_norm": 1.632650390975927, + "language_loss": 0.77087748, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.84773135, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 10901, + "time_per_iteration": 2.5584566593170166 + }, + { + "auxiliary_loss_clip": 0.06417914, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06281441, + "balance_loss_mlp": 0.01259484, + "epoch": 0.6554637005862017, + "flos": 22790163801600.0, + "grad_norm": 1.6269884512414954, + "language_loss": 0.73415089, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.81102872, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10375977, + "step": 10902, + "time_per_iteration": 2.5149738788604736 + }, + { + "auxiliary_loss_clip": 0.06411173, + "auxiliary_loss_mlp": 0.01265501, + "balance_loss_clip": 0.0627598, + "balance_loss_mlp": 0.01255118, + "epoch": 0.6555238238388696, + "flos": 21513682702080.0, + "grad_norm": 2.0084891996216254, + "language_loss": 0.68054104, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.75730777, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10388184, + "step": 10903, + "time_per_iteration": 2.5427961349487305 + }, + { + "auxiliary_loss_clip": 0.06420846, + "auxiliary_loss_mlp": 0.01267584, + "balance_loss_clip": 0.06277949, + "balance_loss_mlp": 0.01255926, + "epoch": 0.6555839470915377, + "flos": 30527483420160.0, + "grad_norm": 1.6549904072812014, + "language_loss": 0.67021459, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.74709886, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11645508, + "step": 10904, + "time_per_iteration": 2.5631024837493896 + }, + { + "auxiliary_loss_clip": 0.06421356, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.0125327, + "epoch": 0.6556440703442056, + "flos": 24649582556160.0, + "grad_norm": 1.7705609323248692, + "language_loss": 0.90557879, + "learning_rate": 1.120046465383464e-06, + "loss": 0.98243713, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11212158, + "step": 10905, + "time_per_iteration": 2.551908493041992 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01265256, + "balance_loss_clip": 0.06275466, + "balance_loss_mlp": 0.01255194, + "epoch": 0.6557041935968736, + "flos": 23739229872000.0, + "grad_norm": 1.7103913409482634, + "language_loss": 0.75575101, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.83248651, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10058594, + "step": 10906, + "time_per_iteration": 2.5098323822021484 + }, + { + "auxiliary_loss_clip": 0.06419322, + "auxiliary_loss_mlp": 0.0126702, + "balance_loss_clip": 0.06278144, + "balance_loss_mlp": 0.01256094, + "epoch": 0.6557643168495415, + "flos": 11106464112000.0, + "grad_norm": 2.5310893479547385, + "language_loss": 0.75316978, + "learning_rate": 1.119347051825267e-06, + "loss": 0.83003318, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10925293, + "step": 10907, + "time_per_iteration": 2.5110371112823486 + }, + { + "auxiliary_loss_clip": 0.06413908, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01253585, + "epoch": 0.6558244401022095, + "flos": 30198978288000.0, + "grad_norm": 1.3099733417202022, + "language_loss": 0.7233519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.80013621, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.109375, + "step": 10908, + "time_per_iteration": 2.6000733375549316 + }, + { + "auxiliary_loss_clip": 0.06419864, + "auxiliary_loss_mlp": 0.01265366, + "balance_loss_clip": 0.06280993, + "balance_loss_mlp": 0.01254912, + "epoch": 0.6558845633548775, + "flos": 17936827136640.0, + "grad_norm": 2.2254285972113155, + "language_loss": 0.82226503, + "learning_rate": 1.118647771844861e-06, + "loss": 0.89911729, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10455322, + "step": 10909, + "time_per_iteration": 2.524258613586426 + }, + { + "auxiliary_loss_clip": 0.06420204, + "auxiliary_loss_mlp": 0.01267528, + "balance_loss_clip": 0.0627941, + "balance_loss_mlp": 0.01256567, + "epoch": 0.6559446866075455, + "flos": 21909929460480.0, + "grad_norm": 2.0664641654441334, + "language_loss": 0.64063025, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.71750748, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 10910, + "time_per_iteration": 4.0342183113098145 + }, + { + "auxiliary_loss_clip": 0.06428535, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01256501, + "epoch": 0.6560048098602135, + "flos": 14131674322560.0, + "grad_norm": 2.6155993780376408, + "language_loss": 0.76254046, + "learning_rate": 1.117948625548313e-06, + "loss": 0.8395068, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.1159668, + "step": 10911, + "time_per_iteration": 2.447054386138916 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01268286, + "balance_loss_clip": 0.0627694, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6560649331128814, + "flos": 18813623460480.0, + "grad_norm": 1.5982338886507241, + "language_loss": 0.756971, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.83377028, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10308838, + "step": 10912, + "time_per_iteration": 2.5681815147399902 + }, + { + "auxiliary_loss_clip": 0.06430128, + "auxiliary_loss_mlp": 0.01272614, + "balance_loss_clip": 0.0628223, + "balance_loss_mlp": 0.01260431, + "epoch": 0.6561250563655494, + "flos": 17058940709760.0, + "grad_norm": 1.6202794136024683, + "language_loss": 0.77903795, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.85606527, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12176514, + "step": 10913, + "time_per_iteration": 2.4939568042755127 + }, + { + "auxiliary_loss_clip": 0.0641174, + "auxiliary_loss_mlp": 0.01263849, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6561851796182173, + "flos": 22644198789120.0, + "grad_norm": 1.7766660084969559, + "language_loss": 0.71619821, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.79295409, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09423828, + "step": 10914, + "time_per_iteration": 2.569068431854248 + }, + { + "auxiliary_loss_clip": 0.06418359, + "auxiliary_loss_mlp": 0.01264819, + "balance_loss_clip": 0.06280423, + "balance_loss_mlp": 0.01254149, + "epoch": 0.6562453028708853, + "flos": 19244307047040.0, + "grad_norm": 1.8135755345317126, + "language_loss": 0.74166334, + "learning_rate": 1.116550734430958e-06, + "loss": 0.81849515, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10675049, + "step": 10915, + "time_per_iteration": 2.487908363342285 + }, + { + "auxiliary_loss_clip": 0.06413562, + "auxiliary_loss_mlp": 0.01266089, + "balance_loss_clip": 0.06277299, + "balance_loss_mlp": 0.01254823, + "epoch": 0.6563054261235532, + "flos": 23807390768640.0, + "grad_norm": 1.4909835290624114, + "language_loss": 0.79751885, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.87431538, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11254883, + "step": 10916, + "time_per_iteration": 2.5246381759643555 + }, + { + "auxiliary_loss_clip": 0.06414592, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06275203, + "balance_loss_mlp": 0.01255727, + "epoch": 0.6563655493762213, + "flos": 19245271368960.0, + "grad_norm": 1.7342152629791572, + "language_loss": 0.76458621, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.84139442, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10498047, + "step": 10917, + "time_per_iteration": 2.468027353286743 + }, + { + "auxiliary_loss_clip": 0.06412656, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277646, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6564256726288892, + "flos": 25563457111680.0, + "grad_norm": 1.7726258593528208, + "language_loss": 0.70893037, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.78572786, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 10918, + "time_per_iteration": 2.5601627826690674 + }, + { + "auxiliary_loss_clip": 0.06410314, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06277462, + "balance_loss_mlp": 0.01253806, + "epoch": 0.6564857958815572, + "flos": 22207226146560.0, + "grad_norm": 1.5162098354406723, + "language_loss": 0.76179051, + "learning_rate": 1.115153379321332e-06, + "loss": 0.83852965, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09802246, + "step": 10919, + "time_per_iteration": 2.515432357788086 + }, + { + "auxiliary_loss_clip": 0.06311788, + "auxiliary_loss_mlp": 0.01255206, + "balance_loss_clip": 0.06255645, + "balance_loss_mlp": 0.01254054, + "epoch": 0.6565459191342251, + "flos": 58139188462080.0, + "grad_norm": 0.7048888157954881, + "language_loss": 0.52975726, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.60542721, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01150513, + "step": 10920, + "time_per_iteration": 3.225492238998413 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254252, + "epoch": 0.6566060423868931, + "flos": 30817400947200.0, + "grad_norm": 2.612121109527078, + "language_loss": 0.66109598, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.73783767, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10516357, + "step": 10921, + "time_per_iteration": 2.5863046646118164 + }, + { + "auxiliary_loss_clip": 0.06413272, + "auxiliary_loss_mlp": 0.01265745, + "balance_loss_clip": 0.06276343, + "balance_loss_mlp": 0.01254086, + "epoch": 0.6566661656395612, + "flos": 23374107705600.0, + "grad_norm": 1.6764293200295557, + "language_loss": 0.81199658, + "learning_rate": 1.114105715254205e-06, + "loss": 0.88878673, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11657715, + "step": 10922, + "time_per_iteration": 3.958033800125122 + }, + { + "auxiliary_loss_clip": 0.06414749, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01258131, + "epoch": 0.6567262888922291, + "flos": 25742098016640.0, + "grad_norm": 1.8770672525164127, + "language_loss": 0.71403915, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.79087496, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1071167, + "step": 10923, + "time_per_iteration": 2.6299500465393066 + }, + { + "auxiliary_loss_clip": 0.06414993, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06276433, + "balance_loss_mlp": 0.0125629, + "epoch": 0.6567864121448971, + "flos": 17128569052800.0, + "grad_norm": 1.8445128185559154, + "language_loss": 0.80703431, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.88385069, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10357666, + "step": 10924, + "time_per_iteration": 2.474226713180542 + }, + { + "auxiliary_loss_clip": 0.06413686, + "auxiliary_loss_mlp": 0.01262003, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125187, + "epoch": 0.656846535397565, + "flos": 22425922103040.0, + "grad_norm": 2.0896707953815543, + "language_loss": 0.72634912, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.80310595, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10137939, + "step": 10925, + "time_per_iteration": 4.006798982620239 + }, + { + "auxiliary_loss_clip": 0.0641509, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 0.06276507, + "balance_loss_mlp": 0.01253768, + "epoch": 0.656906658650233, + "flos": 17708991085440.0, + "grad_norm": 2.4212353880000586, + "language_loss": 0.72549468, + "learning_rate": 1.112709300197942e-06, + "loss": 0.80228466, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10926, + "time_per_iteration": 2.470264434814453 + }, + { + "auxiliary_loss_clip": 0.06419797, + "auxiliary_loss_mlp": 0.01265954, + "balance_loss_clip": 0.06277547, + "balance_loss_mlp": 0.01254498, + "epoch": 0.6569667819029009, + "flos": 21180942938880.0, + "grad_norm": 1.9117955392450259, + "language_loss": 0.72684854, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.80370605, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11468506, + "step": 10927, + "time_per_iteration": 2.5509166717529297 + }, + { + "auxiliary_loss_clip": 0.06310604, + "auxiliary_loss_mlp": 0.01252717, + "balance_loss_clip": 0.06254312, + "balance_loss_mlp": 0.01251483, + "epoch": 0.6570269051555689, + "flos": 68783299344000.0, + "grad_norm": 0.7240640825769642, + "language_loss": 0.64406443, + "learning_rate": 1.112011294493775e-06, + "loss": 0.71969765, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.0123291, + "step": 10928, + "time_per_iteration": 3.1493797302246094 + }, + { + "auxiliary_loss_clip": 0.06413682, + "auxiliary_loss_mlp": 0.01270572, + "balance_loss_clip": 0.06277151, + "balance_loss_mlp": 0.01259354, + "epoch": 0.6570870284082369, + "flos": 26325874212480.0, + "grad_norm": 2.727605777521059, + "language_loss": 0.78076899, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.85761154, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11212158, + "step": 10929, + "time_per_iteration": 2.602822780609131 + }, + { + "auxiliary_loss_clip": 0.06411244, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06275063, + "balance_loss_mlp": 0.01253181, + "epoch": 0.6571471516609049, + "flos": 26181544354560.0, + "grad_norm": 1.645365805026195, + "language_loss": 0.65459454, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.73134756, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10876465, + "step": 10930, + "time_per_iteration": 3.964470863342285 + }, + { + "auxiliary_loss_clip": 0.06414342, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6572072749135728, + "flos": 20382537709440.0, + "grad_norm": 1.4804583724978688, + "language_loss": 0.71204734, + "learning_rate": 1.110964538515258e-06, + "loss": 0.78887701, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10888672, + "step": 10931, + "time_per_iteration": 2.4909491539001465 + }, + { + "auxiliary_loss_clip": 0.06417586, + "auxiliary_loss_mlp": 0.0127043, + "balance_loss_clip": 0.06275665, + "balance_loss_mlp": 0.01259784, + "epoch": 0.6572673981662408, + "flos": 17134438838400.0, + "grad_norm": 1.8915521473051504, + "language_loss": 0.68812561, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.76500577, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10644531, + "step": 10932, + "time_per_iteration": 2.5176515579223633 + }, + { + "auxiliary_loss_clip": 0.06412166, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257952, + "epoch": 0.6573275214189087, + "flos": 41283640339200.0, + "grad_norm": 1.6891496229276404, + "language_loss": 0.80723727, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.88404071, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10229492, + "step": 10933, + "time_per_iteration": 2.6675453186035156 + }, + { + "auxiliary_loss_clip": 0.06419124, + "auxiliary_loss_mlp": 0.01264988, + "balance_loss_clip": 0.06278023, + "balance_loss_mlp": 0.01254432, + "epoch": 0.6573876446715767, + "flos": 22896241470720.0, + "grad_norm": 1.753523075649994, + "language_loss": 0.73957497, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.81641608, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10552979, + "step": 10934, + "time_per_iteration": 2.555539131164551 + }, + { + "auxiliary_loss_clip": 0.0641007, + "auxiliary_loss_mlp": 0.01270037, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01259445, + "epoch": 0.6574477679242448, + "flos": 44028240825600.0, + "grad_norm": 1.5029164504422408, + "language_loss": 0.76213276, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.83893389, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10583496, + "step": 10935, + "time_per_iteration": 2.6976189613342285 + }, + { + "auxiliary_loss_clip": 0.06416147, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06277473, + "balance_loss_mlp": 0.01258967, + "epoch": 0.6575078911769127, + "flos": 24578402912640.0, + "grad_norm": 1.4839652411177968, + "language_loss": 0.78411627, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.86098289, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11553955, + "step": 10936, + "time_per_iteration": 2.518728494644165 + }, + { + "auxiliary_loss_clip": 0.06411346, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06275463, + "balance_loss_mlp": 0.01261301, + "epoch": 0.6575680144295807, + "flos": 20930493484800.0, + "grad_norm": 1.7706689890869223, + "language_loss": 0.68970346, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.76652682, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09692383, + "step": 10937, + "time_per_iteration": 2.5257480144500732 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01266555, + "balance_loss_clip": 0.06273675, + "balance_loss_mlp": 0.01255696, + "epoch": 0.6576281376822486, + "flos": 10930213048320.0, + "grad_norm": 2.6009314091519804, + "language_loss": 0.68779373, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.76456088, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10858154, + "step": 10938, + "time_per_iteration": 2.487494468688965 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06273697, + "balance_loss_mlp": 0.01254659, + "epoch": 0.6576882609349166, + "flos": 19287632407680.0, + "grad_norm": 1.7840896081065163, + "language_loss": 0.71399069, + "learning_rate": 1.108174673550927e-06, + "loss": 0.79076016, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10876465, + "step": 10939, + "time_per_iteration": 2.4861202239990234 + }, + { + "auxiliary_loss_clip": 0.0641602, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06275935, + "balance_loss_mlp": 0.01256199, + "epoch": 0.6577483841875845, + "flos": 20225168542080.0, + "grad_norm": 5.914491475263239, + "language_loss": 0.77965903, + "learning_rate": 1.107826092473037e-06, + "loss": 0.85649633, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11505127, + "step": 10940, + "time_per_iteration": 2.491938829421997 + }, + { + "auxiliary_loss_clip": 0.06417249, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01260253, + "epoch": 0.6578085074402525, + "flos": 34759672168320.0, + "grad_norm": 1.9394980575704135, + "language_loss": 0.69278842, + "learning_rate": 1.107477545226471e-06, + "loss": 0.76967466, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11132812, + "step": 10941, + "time_per_iteration": 2.6296122074127197 + }, + { + "auxiliary_loss_clip": 0.06406929, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01253934, + "epoch": 0.6578686306929205, + "flos": 23476705430400.0, + "grad_norm": 1.8720735918703966, + "language_loss": 0.68617851, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.76288623, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09906006, + "step": 10942, + "time_per_iteration": 2.5199849605560303 + }, + { + "auxiliary_loss_clip": 0.06417514, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.0627285, + "balance_loss_mlp": 0.0125391, + "epoch": 0.6579287539455885, + "flos": 18082876003200.0, + "grad_norm": 1.8863772080566783, + "language_loss": 0.71839166, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.7952258, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12005615, + "step": 10943, + "time_per_iteration": 2.4810752868652344 + }, + { + "auxiliary_loss_clip": 0.06409079, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01257327, + "epoch": 0.6579888771982564, + "flos": 28669532112000.0, + "grad_norm": 1.7035342930552537, + "language_loss": 0.59567684, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.67244786, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10687256, + "step": 10944, + "time_per_iteration": 2.593003273010254 + }, + { + "auxiliary_loss_clip": 0.06423099, + "auxiliary_loss_mlp": 0.01269429, + "balance_loss_clip": 0.06277057, + "balance_loss_mlp": 0.01257555, + "epoch": 0.6580490004509244, + "flos": 25053627744000.0, + "grad_norm": 1.4789836122868327, + "language_loss": 0.72602201, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.80294728, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11871338, + "step": 10945, + "time_per_iteration": 2.53983998298645 + }, + { + "auxiliary_loss_clip": 0.06410586, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01256321, + "epoch": 0.6581091237035923, + "flos": 43519040363520.0, + "grad_norm": 1.838349836001675, + "language_loss": 0.70316982, + "learning_rate": 1.105735316926046e-06, + "loss": 0.77994007, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10946, + "time_per_iteration": 2.798476219177246 + }, + { + "auxiliary_loss_clip": 0.06410632, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06272982, + "balance_loss_mlp": 0.01255514, + "epoch": 0.6581692469562603, + "flos": 22421352055680.0, + "grad_norm": 1.8876327732241813, + "language_loss": 0.82383513, + "learning_rate": 1.105386972944934e-06, + "loss": 0.90060103, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10443115, + "step": 10947, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06414369, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.0627495, + "balance_loss_mlp": 0.01253447, + "epoch": 0.6582293702089284, + "flos": 24866098306560.0, + "grad_norm": 1.5151980350674914, + "language_loss": 0.77415752, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.85094017, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10449219, + "step": 10948, + "time_per_iteration": 2.543790578842163 + }, + { + "auxiliary_loss_clip": 0.06411085, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6582894934615963, + "flos": 23046399187200.0, + "grad_norm": 1.478986900014917, + "language_loss": 0.79121858, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.86798447, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10223389, + "step": 10949, + "time_per_iteration": 2.535895824432373 + }, + { + "auxiliary_loss_clip": 0.06312477, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06256588, + "balance_loss_mlp": 0.01263514, + "epoch": 0.6583496167142643, + "flos": 72573274569600.0, + "grad_norm": 0.7232821189613112, + "language_loss": 0.61788374, + "learning_rate": 1.104342144597323e-06, + "loss": 0.69365644, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01284027, + "step": 10950, + "time_per_iteration": 4.580410957336426 + }, + { + "auxiliary_loss_clip": 0.06408125, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258778, + "epoch": 0.6584097399669322, + "flos": 13083867815040.0, + "grad_norm": 2.2244546266186354, + "language_loss": 0.6719563, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.74872345, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09820557, + "step": 10951, + "time_per_iteration": 2.510561466217041 + }, + { + "auxiliary_loss_clip": 0.06409305, + "auxiliary_loss_mlp": 0.01270102, + "balance_loss_clip": 0.06273426, + "balance_loss_mlp": 0.01259921, + "epoch": 0.6584698632196002, + "flos": 28700530922880.0, + "grad_norm": 1.3260041408046892, + "language_loss": 0.76428199, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.84107602, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10180664, + "step": 10952, + "time_per_iteration": 2.5918259620666504 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06275044, + "balance_loss_mlp": 0.01257954, + "epoch": 0.6585299864722681, + "flos": 14324486567040.0, + "grad_norm": 1.6835884668716123, + "language_loss": 0.73700249, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.81377816, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10083008, + "step": 10953, + "time_per_iteration": 2.5165388584136963 + }, + { + "auxiliary_loss_clip": 0.06410642, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01255427, + "epoch": 0.6585901097249361, + "flos": 26805291747840.0, + "grad_norm": 1.6924688741082035, + "language_loss": 0.79007798, + "learning_rate": 1.102949515683546e-06, + "loss": 0.86684537, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10681152, + "step": 10954, + "time_per_iteration": 2.564539909362793 + }, + { + "auxiliary_loss_clip": 0.06413999, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.06276879, + "balance_loss_mlp": 0.01257411, + "epoch": 0.658650232977604, + "flos": 18738921945600.0, + "grad_norm": 3.4725197474545215, + "language_loss": 0.69489324, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.77170783, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10058594, + "step": 10955, + "time_per_iteration": 2.495082139968872 + }, + { + "auxiliary_loss_clip": 0.06405246, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06274106, + "balance_loss_mlp": 0.01256398, + "epoch": 0.6587103562302721, + "flos": 24760272199680.0, + "grad_norm": 2.1168101225513056, + "language_loss": 0.81125724, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.88797009, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09637451, + "step": 10956, + "time_per_iteration": 2.636908531188965 + }, + { + "auxiliary_loss_clip": 0.06413392, + "auxiliary_loss_mlp": 0.0127424, + "balance_loss_clip": 0.06275264, + "balance_loss_mlp": 0.01262808, + "epoch": 0.65877047948294, + "flos": 22352688034560.0, + "grad_norm": 2.1582606979270462, + "language_loss": 0.81753582, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.89441204, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11431885, + "step": 10957, + "time_per_iteration": 2.6302380561828613 + }, + { + "auxiliary_loss_clip": 0.06405203, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01255513, + "epoch": 0.658830602735608, + "flos": 45189965358720.0, + "grad_norm": 1.6069945820528309, + "language_loss": 0.76651394, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.8432132, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09197998, + "step": 10958, + "time_per_iteration": 2.7235934734344482 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01266973, + "balance_loss_clip": 0.0627449, + "balance_loss_mlp": 0.01256811, + "epoch": 0.6588907259882759, + "flos": 19907774075520.0, + "grad_norm": 1.6704982273704214, + "language_loss": 0.75102574, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.82778907, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10168457, + "step": 10959, + "time_per_iteration": 2.4919495582580566 + }, + { + "auxiliary_loss_clip": 0.06411363, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01253609, + "epoch": 0.6589508492409439, + "flos": 24140591729280.0, + "grad_norm": 1.5345825682480954, + "language_loss": 0.65334243, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.73008978, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09759521, + "step": 10960, + "time_per_iteration": 2.539113998413086 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256305, + "epoch": 0.659010972493612, + "flos": 18228715234560.0, + "grad_norm": 1.960089741542263, + "language_loss": 0.81517863, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.89202076, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.1038208, + "step": 10961, + "time_per_iteration": 3.8582499027252197 + }, + { + "auxiliary_loss_clip": 0.0641351, + "auxiliary_loss_mlp": 0.01267598, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01257292, + "epoch": 0.6590710957462799, + "flos": 27607428483840.0, + "grad_norm": 1.7237322524813996, + "language_loss": 0.736247, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.81305802, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10302734, + "step": 10962, + "time_per_iteration": 2.542795419692993 + }, + { + "auxiliary_loss_clip": 0.06414889, + "auxiliary_loss_mlp": 0.01268579, + "balance_loss_clip": 0.06274842, + "balance_loss_mlp": 0.01257522, + "epoch": 0.6591312189989479, + "flos": 20309177859840.0, + "grad_norm": 1.8258870034084347, + "language_loss": 0.80250466, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.87933934, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11071777, + "step": 10963, + "time_per_iteration": 2.484524965286255 + }, + { + "auxiliary_loss_clip": 0.06407138, + "auxiliary_loss_mlp": 0.01266706, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.0125696, + "epoch": 0.6591913422516158, + "flos": 12317886915840.0, + "grad_norm": 1.5886018528393113, + "language_loss": 0.78204167, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.85878009, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09753418, + "step": 10964, + "time_per_iteration": 4.032490015029907 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6592514655042838, + "flos": 25891626827520.0, + "grad_norm": 1.653857660787362, + "language_loss": 0.7398777, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.81667888, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.09539795, + "step": 10965, + "time_per_iteration": 2.558753490447998 + }, + { + "auxiliary_loss_clip": 0.06415711, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.0627279, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6593115887569517, + "flos": 14068754305920.0, + "grad_norm": 2.292623636057082, + "language_loss": 0.74313521, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.81995344, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1083374, + "step": 10966, + "time_per_iteration": 2.4695546627044678 + }, + { + "auxiliary_loss_clip": 0.06410235, + "auxiliary_loss_mlp": 0.01265948, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.01255273, + "epoch": 0.6593717120096197, + "flos": 24724912976640.0, + "grad_norm": 1.5343869413599147, + "language_loss": 0.77172506, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.8484869, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10662842, + "step": 10967, + "time_per_iteration": 2.554844856262207 + }, + { + "auxiliary_loss_clip": 0.0630592, + "auxiliary_loss_mlp": 0.01258736, + "balance_loss_clip": 0.06250164, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6594318352622877, + "flos": 55577951907840.0, + "grad_norm": 0.6831964979389027, + "language_loss": 0.48237032, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.5580169, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01243591, + "step": 10968, + "time_per_iteration": 3.1279184818267822 + }, + { + "auxiliary_loss_clip": 0.06412826, + "auxiliary_loss_mlp": 0.01261785, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01251569, + "epoch": 0.6594919585149557, + "flos": 17462650481280.0, + "grad_norm": 1.6973549586156937, + "language_loss": 0.79805654, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.87480259, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10211182, + "step": 10969, + "time_per_iteration": 3.929111957550049 + }, + { + "auxiliary_loss_clip": 0.0641497, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.06276352, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6595520817676236, + "flos": 18229092577920.0, + "grad_norm": 1.9822858612354273, + "language_loss": 0.65968251, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.73648757, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10296631, + "step": 10970, + "time_per_iteration": 2.534639835357666 + }, + { + "auxiliary_loss_clip": 0.06411758, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01253368, + "epoch": 0.6596122050202916, + "flos": 22206219897600.0, + "grad_norm": 1.4827049257585125, + "language_loss": 0.76440203, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.84115398, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10064697, + "step": 10971, + "time_per_iteration": 2.518568515777588 + }, + { + "auxiliary_loss_clip": 0.06414073, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 0.06274455, + "balance_loss_mlp": 0.01253101, + "epoch": 0.6596723282729595, + "flos": 14179108533120.0, + "grad_norm": 2.58028286016492, + "language_loss": 0.70073628, + "learning_rate": 1.096689432978629e-06, + "loss": 0.77751178, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10388184, + "step": 10972, + "time_per_iteration": 2.5301804542541504 + }, + { + "auxiliary_loss_clip": 0.06411418, + "auxiliary_loss_mlp": 0.01263284, + "balance_loss_clip": 0.0627436, + "balance_loss_mlp": 0.01252931, + "epoch": 0.6597324515256275, + "flos": 30560746291200.0, + "grad_norm": 1.6494264278825825, + "language_loss": 0.55793309, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.63468015, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10351562, + "step": 10973, + "time_per_iteration": 2.5836968421936035 + }, + { + "auxiliary_loss_clip": 0.06425004, + "auxiliary_loss_mlp": 0.01265958, + "balance_loss_clip": 0.06279783, + "balance_loss_mlp": 0.0125579, + "epoch": 0.6597925747782956, + "flos": 17645693725440.0, + "grad_norm": 2.424477152178303, + "language_loss": 0.78669357, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.86360323, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10174561, + "step": 10974, + "time_per_iteration": 2.5438265800476074 + }, + { + "auxiliary_loss_clip": 0.06416789, + "auxiliary_loss_mlp": 0.01266385, + "balance_loss_clip": 0.06276938, + "balance_loss_mlp": 0.01255567, + "epoch": 0.6598526980309635, + "flos": 22825523024640.0, + "grad_norm": 2.75247163208804, + "language_loss": 0.69161505, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.7684468, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10821533, + "step": 10975, + "time_per_iteration": 2.517643690109253 + }, + { + "auxiliary_loss_clip": 0.06413519, + "auxiliary_loss_mlp": 0.01263226, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01252766, + "epoch": 0.6599128212836315, + "flos": 21074194437120.0, + "grad_norm": 1.6033931639433516, + "language_loss": 0.70794642, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.78471386, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10455322, + "step": 10976, + "time_per_iteration": 2.5318117141723633 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01257065, + "epoch": 0.6599729445362994, + "flos": 22170022133760.0, + "grad_norm": 1.5758270650588126, + "language_loss": 0.67691094, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.75369084, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10668945, + "step": 10977, + "time_per_iteration": 2.485891342163086 + }, + { + "auxiliary_loss_clip": 0.06420588, + "auxiliary_loss_mlp": 0.01267585, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6600330677889674, + "flos": 18155900436480.0, + "grad_norm": 2.2117923844530694, + "language_loss": 0.81200063, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.8888824, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11157227, + "step": 10978, + "time_per_iteration": 2.5422048568725586 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255461, + "epoch": 0.6600931910416353, + "flos": 18155942363520.0, + "grad_norm": 2.6619753374489767, + "language_loss": 0.67523986, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.75204611, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11096191, + "step": 10979, + "time_per_iteration": 2.5064504146575928 + }, + { + "auxiliary_loss_clip": 0.06413005, + "auxiliary_loss_mlp": 0.01265818, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6601533142943034, + "flos": 17426494644480.0, + "grad_norm": 2.8604366894108324, + "language_loss": 0.73473299, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.81152123, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10540771, + "step": 10980, + "time_per_iteration": 2.5004913806915283 + }, + { + "auxiliary_loss_clip": 0.06408733, + "auxiliary_loss_mlp": 0.01271257, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01261441, + "epoch": 0.6602134375469713, + "flos": 28226983173120.0, + "grad_norm": 1.584002725324806, + "language_loss": 0.72518432, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.80198425, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09814453, + "step": 10981, + "time_per_iteration": 2.552730083465576 + }, + { + "auxiliary_loss_clip": 0.0641138, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6602735607996393, + "flos": 29424737761920.0, + "grad_norm": 1.8532747935564327, + "language_loss": 0.69432831, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.77110291, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.09960938, + "step": 10982, + "time_per_iteration": 2.591977834701538 + }, + { + "auxiliary_loss_clip": 0.06413966, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06277901, + "balance_loss_mlp": 0.01254148, + "epoch": 0.6603336840523072, + "flos": 18593963182080.0, + "grad_norm": 1.4024673840301536, + "language_loss": 0.69806457, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.77485329, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10760498, + "step": 10983, + "time_per_iteration": 2.483527660369873 + }, + { + "auxiliary_loss_clip": 0.06413279, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6603938073049752, + "flos": 33263153447040.0, + "grad_norm": 1.5623815208568963, + "language_loss": 0.70765328, + "learning_rate": 1.092522205413239e-06, + "loss": 0.78446013, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10984, + "time_per_iteration": 2.6334474086761475 + }, + { + "auxiliary_loss_clip": 0.06408207, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06274273, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6604539305576431, + "flos": 17390045318400.0, + "grad_norm": 1.8218342593599246, + "language_loss": 0.84316599, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.9199127, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10394287, + "step": 10985, + "time_per_iteration": 2.4621846675872803 + }, + { + "auxiliary_loss_clip": 0.06415112, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275454, + "balance_loss_mlp": 0.01256779, + "epoch": 0.6605140538103111, + "flos": 21257447316480.0, + "grad_norm": 1.9945336241456124, + "language_loss": 0.74090636, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.81773293, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10778809, + "step": 10986, + "time_per_iteration": 2.5241971015930176 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01269603, + "balance_loss_clip": 0.06278964, + "balance_loss_mlp": 0.01259673, + "epoch": 0.6605741770629792, + "flos": 13886885018880.0, + "grad_norm": 1.8900199688101529, + "language_loss": 0.79989499, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.8767364, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09936523, + "step": 10987, + "time_per_iteration": 2.467759132385254 + }, + { + "auxiliary_loss_clip": 0.06315437, + "auxiliary_loss_mlp": 0.01250965, + "balance_loss_clip": 0.06259646, + "balance_loss_mlp": 0.0124932, + "epoch": 0.6606343003156471, + "flos": 69338885840640.0, + "grad_norm": 0.958585987636571, + "language_loss": 0.5413903, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.61705434, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.55566406, + "router_z_loss_mlp": 0.01647949, + "step": 10988, + "time_per_iteration": 3.2449100017547607 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6606944235683151, + "flos": 27279887673600.0, + "grad_norm": 1.4331259688792952, + "language_loss": 0.77265781, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.8494395, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10989, + "time_per_iteration": 2.565397262573242 + }, + { + "auxiliary_loss_clip": 0.06413271, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06277858, + "balance_loss_mlp": 0.01257796, + "epoch": 0.660754546820983, + "flos": 13778082092160.0, + "grad_norm": 1.981088082283497, + "language_loss": 0.77234143, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.84915674, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10461426, + "step": 10990, + "time_per_iteration": 3.8957126140594482 + }, + { + "auxiliary_loss_clip": 0.06422216, + "auxiliary_loss_mlp": 0.01267426, + "balance_loss_clip": 0.06283079, + "balance_loss_mlp": 0.0125693, + "epoch": 0.660814670073651, + "flos": 15710567207040.0, + "grad_norm": 2.3076268356000864, + "language_loss": 0.60737276, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.68426919, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10491943, + "step": 10991, + "time_per_iteration": 2.528184175491333 + }, + { + "auxiliary_loss_clip": 0.0641991, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280324, + "balance_loss_mlp": 0.012599, + "epoch": 0.6608747933263189, + "flos": 20856295094400.0, + "grad_norm": 2.771721604026619, + "language_loss": 0.67745811, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.75436699, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11083984, + "step": 10992, + "time_per_iteration": 2.5081818103790283 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06279188, + "balance_loss_mlp": 0.01254588, + "epoch": 0.660934916578987, + "flos": 20638521532800.0, + "grad_norm": 1.8747370045388403, + "language_loss": 0.87962919, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.95648551, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11053467, + "step": 10993, + "time_per_iteration": 2.5521185398101807 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6609950398316549, + "flos": 25119692288640.0, + "grad_norm": 1.7537930651875573, + "language_loss": 0.67272747, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.74965656, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11737061, + "step": 10994, + "time_per_iteration": 2.6144933700561523 + }, + { + "auxiliary_loss_clip": 0.06417718, + "auxiliary_loss_mlp": 0.01264904, + "balance_loss_clip": 0.06279863, + "balance_loss_mlp": 0.01253812, + "epoch": 0.6610551630843229, + "flos": 18667155323520.0, + "grad_norm": 1.5859648112701323, + "language_loss": 0.77035165, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.84717792, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11096191, + "step": 10995, + "time_per_iteration": 2.5111653804779053 + }, + { + "auxiliary_loss_clip": 0.06421737, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06282931, + "balance_loss_mlp": 0.01265868, + "epoch": 0.6611152863369908, + "flos": 23264885508480.0, + "grad_norm": 1.7748442712796604, + "language_loss": 0.74969876, + "learning_rate": 1.088359933123053e-06, + "loss": 0.82667613, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10996, + "time_per_iteration": 2.5098516941070557 + }, + { + "auxiliary_loss_clip": 0.06418104, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06280057, + "balance_loss_mlp": 0.0125562, + "epoch": 0.6611754095896588, + "flos": 22165577867520.0, + "grad_norm": 1.6113039426712623, + "language_loss": 0.69186199, + "learning_rate": 1.088013301487126e-06, + "loss": 0.76870203, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10296631, + "step": 10997, + "time_per_iteration": 2.525808095932007 + }, + { + "auxiliary_loss_clip": 0.06421575, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06279309, + "balance_loss_mlp": 0.01254467, + "epoch": 0.6612355328423267, + "flos": 13996442632320.0, + "grad_norm": 1.959031062109239, + "language_loss": 0.68880165, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.76566797, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10601807, + "step": 10998, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.06313896, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.06257924, + "balance_loss_mlp": 0.01251496, + "epoch": 0.6612956560949947, + "flos": 61472051337600.0, + "grad_norm": 0.641819710963161, + "language_loss": 0.50997436, + "learning_rate": 1.087320141976297e-06, + "loss": 0.58564192, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01367188, + "step": 10999, + "time_per_iteration": 3.1182916164398193 + }, + { + "auxiliary_loss_clip": 0.06424031, + "auxiliary_loss_mlp": 0.01268354, + "balance_loss_clip": 0.06280085, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6613557793476627, + "flos": 21623114534400.0, + "grad_norm": 2.559990275838241, + "language_loss": 0.70366681, + "learning_rate": 1.086973614127679e-06, + "loss": 0.78059065, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10797119, + "step": 11000, + "time_per_iteration": 3.9581432342529297 + }, + { + "auxiliary_loss_clip": 0.06411293, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6614159026003307, + "flos": 34028379659520.0, + "grad_norm": 1.6165930596704574, + "language_loss": 0.65563923, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.73239553, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.0980835, + "step": 11001, + "time_per_iteration": 2.6200945377349854 + }, + { + "auxiliary_loss_clip": 0.06414855, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.01254207, + "epoch": 0.6614760258529987, + "flos": 24104100476160.0, + "grad_norm": 1.733561890110771, + "language_loss": 0.73266578, + "learning_rate": 1.086280662309739e-06, + "loss": 0.80945766, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10137939, + "step": 11002, + "time_per_iteration": 2.5620791912078857 + }, + { + "auxiliary_loss_clip": 0.06415205, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06279428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6615361491056666, + "flos": 14909227084800.0, + "grad_norm": 2.451590701969631, + "language_loss": 0.79098624, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.86779916, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10705566, + "step": 11003, + "time_per_iteration": 2.481431007385254 + }, + { + "auxiliary_loss_clip": 0.06419842, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06279956, + "balance_loss_mlp": 0.01256449, + "epoch": 0.6615962723583346, + "flos": 15310337379840.0, + "grad_norm": 2.101443479539304, + "language_loss": 0.69193184, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.76880944, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11468506, + "step": 11004, + "time_per_iteration": 4.006279945373535 + }, + { + "auxiliary_loss_clip": 0.06422809, + "auxiliary_loss_mlp": 0.0127206, + "balance_loss_clip": 0.06279877, + "balance_loss_mlp": 0.01260741, + "epoch": 0.6616563956110025, + "flos": 18738293040000.0, + "grad_norm": 2.056452219231189, + "language_loss": 0.70325673, + "learning_rate": 1.085241494478132e-06, + "loss": 0.78020537, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11328125, + "step": 11005, + "time_per_iteration": 2.4944448471069336 + }, + { + "auxiliary_loss_clip": 0.06413882, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277984, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6617165188636706, + "flos": 24501353483520.0, + "grad_norm": 1.5254702956902315, + "language_loss": 0.78776741, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.86457157, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10510254, + "step": 11006, + "time_per_iteration": 2.5451557636260986 + }, + { + "auxiliary_loss_clip": 0.06416766, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06280621, + "balance_loss_mlp": 0.01256649, + "epoch": 0.6617766421163385, + "flos": 22385741270400.0, + "grad_norm": 1.834529140929997, + "language_loss": 0.76486355, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.84170276, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1050415, + "step": 11007, + "time_per_iteration": 2.5298049449920654 + }, + { + "auxiliary_loss_clip": 0.0641939, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01255185, + "epoch": 0.6618367653690065, + "flos": 20856756291840.0, + "grad_norm": 1.4555215695175368, + "language_loss": 0.78606236, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.86291116, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10302734, + "step": 11008, + "time_per_iteration": 4.0146424770355225 + }, + { + "auxiliary_loss_clip": 0.06420049, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06275912, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6618968886216744, + "flos": 17718089253120.0, + "grad_norm": 1.6552311812920846, + "language_loss": 0.82077724, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.89762884, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11114502, + "step": 11009, + "time_per_iteration": 2.532111883163452 + }, + { + "auxiliary_loss_clip": 0.06314184, + "auxiliary_loss_mlp": 0.01255522, + "balance_loss_clip": 0.06257774, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6619570118743424, + "flos": 67054500305280.0, + "grad_norm": 0.9881156540659067, + "language_loss": 0.67673898, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.75243598, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01517487, + "step": 11010, + "time_per_iteration": 3.0648674964904785 + }, + { + "auxiliary_loss_clip": 0.06415196, + "auxiliary_loss_mlp": 0.01266404, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01254745, + "epoch": 0.6620171351270103, + "flos": 18666819907200.0, + "grad_norm": 1.5625294645604648, + "language_loss": 0.71682811, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.79364407, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11657715, + "step": 11011, + "time_per_iteration": 2.527869939804077 + }, + { + "auxiliary_loss_clip": 0.06418953, + "auxiliary_loss_mlp": 0.0126958, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01259548, + "epoch": 0.6620772583796783, + "flos": 24177376471680.0, + "grad_norm": 1.61722758281003, + "language_loss": 0.72627336, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.80315864, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10040283, + "step": 11012, + "time_per_iteration": 2.53691029548645 + }, + { + "auxiliary_loss_clip": 0.0640786, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01260903, + "epoch": 0.6621373816323463, + "flos": 23630385018240.0, + "grad_norm": 1.5542286383883441, + "language_loss": 0.79656094, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8733412, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09265137, + "step": 11013, + "time_per_iteration": 2.5782439708709717 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.062774, + "balance_loss_mlp": 0.01254973, + "epoch": 0.6621975048850143, + "flos": 18448123950720.0, + "grad_norm": 1.9713400088604554, + "language_loss": 0.70423663, + "learning_rate": 1.082125865538971e-06, + "loss": 0.78102177, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10266113, + "step": 11014, + "time_per_iteration": 2.474597454071045 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.01265368, + "balance_loss_clip": 0.06278192, + "balance_loss_mlp": 0.01256475, + "epoch": 0.6622576281376823, + "flos": 14069047795200.0, + "grad_norm": 1.5898800545059366, + "language_loss": 0.77497208, + "learning_rate": 1.081779858400137e-06, + "loss": 0.85174346, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08886719, + "step": 11015, + "time_per_iteration": 2.5123109817504883 + }, + { + "auxiliary_loss_clip": 0.06413803, + "auxiliary_loss_mlp": 0.01267289, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01256191, + "epoch": 0.6623177513903502, + "flos": 17024587735680.0, + "grad_norm": 1.7138462778054382, + "language_loss": 0.82368481, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.90049571, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11102295, + "step": 11016, + "time_per_iteration": 2.477137565612793 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01253175, + "epoch": 0.6623778746430182, + "flos": 17276127292800.0, + "grad_norm": 2.159067097867079, + "language_loss": 0.70195687, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.77878135, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10583496, + "step": 11017, + "time_per_iteration": 2.5194361209869385 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.0627765, + "balance_loss_mlp": 0.01257101, + "epoch": 0.6624379978956861, + "flos": 48802725198720.0, + "grad_norm": 1.7089146920832974, + "language_loss": 0.77715868, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.85397768, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1036377, + "step": 11018, + "time_per_iteration": 2.7684452533721924 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.0126262, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.01252714, + "epoch": 0.6624981211483542, + "flos": 18958330661760.0, + "grad_norm": 1.809730512167174, + "language_loss": 0.83465689, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.91142356, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09899902, + "step": 11019, + "time_per_iteration": 2.5207102298736572 + }, + { + "auxiliary_loss_clip": 0.06410275, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.0627672, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6625582444010221, + "flos": 23262998791680.0, + "grad_norm": 1.565039350749023, + "language_loss": 0.72290635, + "learning_rate": 1.080050345253328e-06, + "loss": 0.79964089, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09747314, + "step": 11020, + "time_per_iteration": 2.52868914604187 + }, + { + "auxiliary_loss_clip": 0.06419435, + "auxiliary_loss_mlp": 0.01268652, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01257601, + "epoch": 0.6626183676536901, + "flos": 21400770925440.0, + "grad_norm": 3.661943544447812, + "language_loss": 0.72194296, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.79882383, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11047363, + "step": 11021, + "time_per_iteration": 2.5214977264404297 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.06279403, + "balance_loss_mlp": 0.0125891, + "epoch": 0.662678490906358, + "flos": 14575984197120.0, + "grad_norm": 4.221661740882693, + "language_loss": 0.83307576, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.90993994, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10467529, + "step": 11022, + "time_per_iteration": 2.495877981185913 + }, + { + "auxiliary_loss_clip": 0.0642494, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06277288, + "balance_loss_mlp": 0.0125513, + "epoch": 0.662738614159026, + "flos": 15996962862720.0, + "grad_norm": 2.5511625457855116, + "language_loss": 0.73115802, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.80807984, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12115479, + "step": 11023, + "time_per_iteration": 2.475238800048828 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.01270086, + "balance_loss_clip": 0.06276564, + "balance_loss_mlp": 0.01259327, + "epoch": 0.6627987374116939, + "flos": 19542358419840.0, + "grad_norm": 1.582084315278466, + "language_loss": 0.75136846, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.82820219, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10748291, + "step": 11024, + "time_per_iteration": 2.5104072093963623 + }, + { + "auxiliary_loss_clip": 0.06414796, + "auxiliary_loss_mlp": 0.01267042, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01256635, + "epoch": 0.662858860664362, + "flos": 15707800022400.0, + "grad_norm": 3.5687971531497236, + "language_loss": 0.70028591, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.77710426, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10406494, + "step": 11025, + "time_per_iteration": 2.528007745742798 + }, + { + "auxiliary_loss_clip": 0.06416678, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01255513, + "epoch": 0.6629189839170299, + "flos": 20160026392320.0, + "grad_norm": 1.3776452398710215, + "language_loss": 0.78906387, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8658914, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10565186, + "step": 11026, + "time_per_iteration": 2.5116465091705322 + }, + { + "auxiliary_loss_clip": 0.06413042, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01256759, + "epoch": 0.6629791071696979, + "flos": 20920430995200.0, + "grad_norm": 1.672126176860425, + "language_loss": 0.76636124, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.84316075, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1015625, + "step": 11027, + "time_per_iteration": 2.496917486190796 + }, + { + "auxiliary_loss_clip": 0.06414916, + "auxiliary_loss_mlp": 0.01265895, + "balance_loss_clip": 0.0627641, + "balance_loss_mlp": 0.01254708, + "epoch": 0.6630392304223659, + "flos": 20852647441920.0, + "grad_norm": 2.0836235208298115, + "language_loss": 0.70842957, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.78523767, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11193848, + "step": 11028, + "time_per_iteration": 2.5055668354034424 + }, + { + "auxiliary_loss_clip": 0.06413043, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6630993536750338, + "flos": 21002092398720.0, + "grad_norm": 1.9464575885295123, + "language_loss": 0.79627401, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.87305164, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09979248, + "step": 11029, + "time_per_iteration": 4.029799461364746 + }, + { + "auxiliary_loss_clip": 0.06414881, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255755, + "epoch": 0.6631594769277018, + "flos": 18264787217280.0, + "grad_norm": 2.0842184585841994, + "language_loss": 0.76459014, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.84141254, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1159668, + "step": 11030, + "time_per_iteration": 2.499678611755371 + }, + { + "auxiliary_loss_clip": 0.06420542, + "auxiliary_loss_mlp": 0.01269601, + "balance_loss_clip": 0.06277149, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6632196001803697, + "flos": 17826053639040.0, + "grad_norm": 2.267864257363868, + "language_loss": 0.75185478, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.82875621, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11169434, + "step": 11031, + "time_per_iteration": 2.470355272293091 + }, + { + "auxiliary_loss_clip": 0.06414694, + "auxiliary_loss_mlp": 0.01264566, + "balance_loss_clip": 0.06273525, + "balance_loss_mlp": 0.0125342, + "epoch": 0.6632797234330378, + "flos": 12673910914560.0, + "grad_norm": 2.431299325405645, + "language_loss": 0.74500775, + "learning_rate": 1.075903075048228e-06, + "loss": 0.82180035, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11151123, + "step": 11032, + "time_per_iteration": 2.485921859741211 + }, + { + "auxiliary_loss_clip": 0.06407184, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01256296, + "epoch": 0.6633398466857057, + "flos": 23591168507520.0, + "grad_norm": 1.735276154326279, + "language_loss": 0.80570471, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.88244164, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10217285, + "step": 11033, + "time_per_iteration": 2.5526669025421143 + }, + { + "auxiliary_loss_clip": 0.0641445, + "auxiliary_loss_mlp": 0.01269108, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257497, + "epoch": 0.6633999699383737, + "flos": 20638018408320.0, + "grad_norm": 1.5867971062319928, + "language_loss": 0.80710161, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.88393718, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11608887, + "step": 11034, + "time_per_iteration": 2.5465288162231445 + }, + { + "auxiliary_loss_clip": 0.06408665, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01256569, + "epoch": 0.6634600931910416, + "flos": 21803264812800.0, + "grad_norm": 1.6372739814417405, + "language_loss": 0.76400816, + "learning_rate": 1.074867045054166e-06, + "loss": 0.84075904, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09851074, + "step": 11035, + "time_per_iteration": 2.5024783611297607 + }, + { + "auxiliary_loss_clip": 0.06416409, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.01254648, + "epoch": 0.6635202164437096, + "flos": 18738628456320.0, + "grad_norm": 1.632864185122063, + "language_loss": 0.8277241, + "learning_rate": 1.074521771867622e-06, + "loss": 0.90453947, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10491943, + "step": 11036, + "time_per_iteration": 2.5380334854125977 + }, + { + "auxiliary_loss_clip": 0.06308148, + "auxiliary_loss_mlp": 0.01254977, + "balance_loss_clip": 0.06252232, + "balance_loss_mlp": 0.0125369, + "epoch": 0.6635803396963775, + "flos": 60242501324160.0, + "grad_norm": 0.7586749678323187, + "language_loss": 0.5225606, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.59819186, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01287842, + "step": 11037, + "time_per_iteration": 3.1442580223083496 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06273833, + "balance_loss_mlp": 0.01255443, + "epoch": 0.6636404629490456, + "flos": 29174414088960.0, + "grad_norm": 1.6208815133420311, + "language_loss": 0.79116094, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.86795002, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11407471, + "step": 11038, + "time_per_iteration": 2.5753371715545654 + }, + { + "auxiliary_loss_clip": 0.06411879, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253542, + "epoch": 0.6637005862017135, + "flos": 38916530496000.0, + "grad_norm": 2.008253443704211, + "language_loss": 0.6435625, + "learning_rate": 1.073486162925716e-06, + "loss": 0.72032923, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11248779, + "step": 11039, + "time_per_iteration": 2.6589627265930176 + }, + { + "auxiliary_loss_clip": 0.06414853, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01254613, + "epoch": 0.6637607094543815, + "flos": 22789870312320.0, + "grad_norm": 2.5741405662525856, + "language_loss": 0.64139444, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.71819365, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10455322, + "step": 11040, + "time_per_iteration": 3.924652338027954 + }, + { + "auxiliary_loss_clip": 0.06410997, + "auxiliary_loss_mlp": 0.01267386, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01257909, + "epoch": 0.6638208327070495, + "flos": 18119996161920.0, + "grad_norm": 1.923413934429174, + "language_loss": 0.72439963, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.80118346, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 11041, + "time_per_iteration": 2.5356383323669434 + }, + { + "auxiliary_loss_clip": 0.06416036, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06278844, + "balance_loss_mlp": 0.01255415, + "epoch": 0.6638809559597174, + "flos": 29432703899520.0, + "grad_norm": 2.049859271676146, + "language_loss": 0.61855423, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.69537336, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10473633, + "step": 11042, + "time_per_iteration": 2.664304256439209 + }, + { + "auxiliary_loss_clip": 0.06417962, + "auxiliary_loss_mlp": 0.012679, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256444, + "epoch": 0.6639410792123854, + "flos": 28079928057600.0, + "grad_norm": 1.8233607330526647, + "language_loss": 0.69058919, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.76744783, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11462402, + "step": 11043, + "time_per_iteration": 4.0889365673065186 + }, + { + "auxiliary_loss_clip": 0.06404908, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06272525, + "balance_loss_mlp": 0.01259818, + "epoch": 0.6640012024650533, + "flos": 25563373257600.0, + "grad_norm": 1.464057970327077, + "language_loss": 0.83693618, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.91367632, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09283447, + "step": 11044, + "time_per_iteration": 2.5765178203582764 + }, + { + "auxiliary_loss_clip": 0.0640911, + "auxiliary_loss_mlp": 0.01263885, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01253234, + "epoch": 0.6640613257177214, + "flos": 14872316561280.0, + "grad_norm": 2.273920138408825, + "language_loss": 0.69855309, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.77528304, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10650635, + "step": 11045, + "time_per_iteration": 2.475839376449585 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01268834, + "balance_loss_clip": 0.06275514, + "balance_loss_mlp": 0.01258349, + "epoch": 0.6641214489703893, + "flos": 23227681495680.0, + "grad_norm": 1.3157905928087725, + "language_loss": 0.64253563, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.71937156, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10479736, + "step": 11046, + "time_per_iteration": 2.550718307495117 + }, + { + "auxiliary_loss_clip": 0.06412549, + "auxiliary_loss_mlp": 0.01265992, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255859, + "epoch": 0.6641815722230573, + "flos": 37751661434880.0, + "grad_norm": 1.3902156312209348, + "language_loss": 0.71747851, + "learning_rate": 1.070726085914088e-06, + "loss": 0.79426396, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10137939, + "step": 11047, + "time_per_iteration": 2.6542744636535645 + }, + { + "auxiliary_loss_clip": 0.06412829, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01255226, + "epoch": 0.6642416954757252, + "flos": 17936910990720.0, + "grad_norm": 1.7027644321315345, + "language_loss": 0.77464539, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.8514322, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10632324, + "step": 11048, + "time_per_iteration": 3.896479606628418 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01254354, + "balance_loss_clip": 0.06255244, + "balance_loss_mlp": 0.01253094, + "epoch": 0.6643018187283932, + "flos": 52010712362880.0, + "grad_norm": 0.7347657101869507, + "language_loss": 0.55013496, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.62579298, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01259613, + "step": 11049, + "time_per_iteration": 3.139099359512329 + }, + { + "auxiliary_loss_clip": 0.06414302, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06277852, + "balance_loss_mlp": 0.01258189, + "epoch": 0.6643619419810611, + "flos": 30234463292160.0, + "grad_norm": 1.5235184894534042, + "language_loss": 0.64387465, + "learning_rate": 1.069691638104648e-06, + "loss": 0.72070134, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10174561, + "step": 11050, + "time_per_iteration": 2.5815443992614746 + }, + { + "auxiliary_loss_clip": 0.06413838, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01254948, + "epoch": 0.6644220652337292, + "flos": 22972745848320.0, + "grad_norm": 1.9836199726179196, + "language_loss": 0.7914626, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.86825073, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 11051, + "time_per_iteration": 2.554255247116089 + }, + { + "auxiliary_loss_clip": 0.06413689, + "auxiliary_loss_mlp": 0.01267197, + "balance_loss_clip": 0.06275009, + "balance_loss_mlp": 0.01256778, + "epoch": 0.6644821884863971, + "flos": 21148602462720.0, + "grad_norm": 1.572752749022216, + "language_loss": 0.85833442, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.93514335, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10418701, + "step": 11052, + "time_per_iteration": 2.526331663131714 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01255108, + "epoch": 0.6645423117390651, + "flos": 20198907486720.0, + "grad_norm": 2.2521915942040134, + "language_loss": 0.75079048, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.82767153, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10864258, + "step": 11053, + "time_per_iteration": 2.495643377304077 + }, + { + "auxiliary_loss_clip": 0.06411796, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06275838, + "balance_loss_mlp": 0.01257659, + "epoch": 0.6646024349917331, + "flos": 24358700707200.0, + "grad_norm": 1.4285282050820745, + "language_loss": 0.79548883, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.87228477, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 11054, + "time_per_iteration": 2.533238649368286 + }, + { + "auxiliary_loss_clip": 0.06410603, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.01258848, + "epoch": 0.664662558244401, + "flos": 18812617211520.0, + "grad_norm": 1.7645551715374934, + "language_loss": 0.73951137, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.81630468, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09875488, + "step": 11055, + "time_per_iteration": 2.5263750553131104 + }, + { + "auxiliary_loss_clip": 0.0641698, + "auxiliary_loss_mlp": 0.01266606, + "balance_loss_clip": 0.06276543, + "balance_loss_mlp": 0.01255186, + "epoch": 0.664722681497069, + "flos": 18958749932160.0, + "grad_norm": 1.6799288466366076, + "language_loss": 0.72991651, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.80675244, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11431885, + "step": 11056, + "time_per_iteration": 2.4944491386413574 + }, + { + "auxiliary_loss_clip": 0.064121, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06275958, + "balance_loss_mlp": 0.01257508, + "epoch": 0.6647828047497369, + "flos": 19577046810240.0, + "grad_norm": 1.7319313014316244, + "language_loss": 0.69902766, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.77582735, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1036377, + "step": 11057, + "time_per_iteration": 2.5427403450012207 + }, + { + "auxiliary_loss_clip": 0.06416071, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01255298, + "epoch": 0.664842928002405, + "flos": 23156250289920.0, + "grad_norm": 1.6627595883052484, + "language_loss": 0.80624598, + "learning_rate": 1.066934663776291e-06, + "loss": 0.88306141, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10186768, + "step": 11058, + "time_per_iteration": 2.543358325958252 + }, + { + "auxiliary_loss_clip": 0.06310651, + "auxiliary_loss_mlp": 0.01251744, + "balance_loss_clip": 0.06254779, + "balance_loss_mlp": 0.01250295, + "epoch": 0.6649030512550729, + "flos": 65263326301440.0, + "grad_norm": 0.7825270857978761, + "language_loss": 0.6256783, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.70130229, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01447296, + "step": 11059, + "time_per_iteration": 3.081268548965454 + }, + { + "auxiliary_loss_clip": 0.0641288, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01255549, + "epoch": 0.6649631745077409, + "flos": 20201213473920.0, + "grad_norm": 1.6475331375538982, + "language_loss": 0.79008389, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.86687315, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1048584, + "step": 11060, + "time_per_iteration": 2.5021138191223145 + }, + { + "auxiliary_loss_clip": 0.06418125, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06280607, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6650232977604088, + "flos": 17244331868160.0, + "grad_norm": 2.2525334751718358, + "language_loss": 0.79225111, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.86911017, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10510254, + "step": 11061, + "time_per_iteration": 2.4997215270996094 + }, + { + "auxiliary_loss_clip": 0.06414805, + "auxiliary_loss_mlp": 0.01266652, + "balance_loss_clip": 0.06278637, + "balance_loss_mlp": 0.01256102, + "epoch": 0.6650834210130768, + "flos": 10010175217920.0, + "grad_norm": 1.965420807772364, + "language_loss": 0.57191408, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.64872867, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10552979, + "step": 11062, + "time_per_iteration": 2.457599401473999 + }, + { + "auxiliary_loss_clip": 0.06419773, + "auxiliary_loss_mlp": 0.01266686, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01254533, + "epoch": 0.6651435442657447, + "flos": 10456707225600.0, + "grad_norm": 2.498798138431811, + "language_loss": 0.76121116, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.83807576, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.121521, + "step": 11063, + "time_per_iteration": 2.5354268550872803 + }, + { + "auxiliary_loss_clip": 0.06417998, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.06278798, + "balance_loss_mlp": 0.0125724, + "epoch": 0.6652036675184128, + "flos": 22350465901440.0, + "grad_norm": 2.2315353157370836, + "language_loss": 0.708628, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.78547704, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09655762, + "step": 11064, + "time_per_iteration": 2.601271390914917 + }, + { + "auxiliary_loss_clip": 0.06307759, + "auxiliary_loss_mlp": 0.01252714, + "balance_loss_clip": 0.06251188, + "balance_loss_mlp": 0.01251267, + "epoch": 0.6652637907710807, + "flos": 52925467386240.0, + "grad_norm": 0.8269137521288277, + "language_loss": 0.62977844, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.70538318, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01445007, + "step": 11065, + "time_per_iteration": 3.088651180267334 + }, + { + "auxiliary_loss_clip": 0.06417314, + "auxiliary_loss_mlp": 0.01265582, + "balance_loss_clip": 0.06277956, + "balance_loss_mlp": 0.01255091, + "epoch": 0.6653239140237487, + "flos": 23110031963520.0, + "grad_norm": 1.7770048566161585, + "language_loss": 0.62216848, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.69899738, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10491943, + "step": 11066, + "time_per_iteration": 2.514662981033325 + }, + { + "auxiliary_loss_clip": 0.06416589, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06276087, + "balance_loss_mlp": 0.01256123, + "epoch": 0.6653840372764167, + "flos": 25966747612800.0, + "grad_norm": 1.500590710166923, + "language_loss": 0.70431817, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.78115141, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1060791, + "step": 11067, + "time_per_iteration": 2.629611015319824 + }, + { + "auxiliary_loss_clip": 0.06312129, + "auxiliary_loss_mlp": 0.01250999, + "balance_loss_clip": 0.06255849, + "balance_loss_mlp": 0.01249609, + "epoch": 0.6654441605290846, + "flos": 66059593251840.0, + "grad_norm": 0.8851345245048583, + "language_loss": 0.71944451, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.79507577, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01391602, + "step": 11068, + "time_per_iteration": 3.1097211837768555 + }, + { + "auxiliary_loss_clip": 0.06313328, + "auxiliary_loss_mlp": 0.01253328, + "balance_loss_clip": 0.0625675, + "balance_loss_mlp": 0.01251991, + "epoch": 0.6655042837817526, + "flos": 65218560693120.0, + "grad_norm": 0.7108385158391787, + "language_loss": 0.577793, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.65345955, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01338959, + "step": 11069, + "time_per_iteration": 4.7683820724487305 + }, + { + "auxiliary_loss_clip": 0.0631298, + "auxiliary_loss_mlp": 0.01252294, + "balance_loss_clip": 0.0625658, + "balance_loss_mlp": 0.01250911, + "epoch": 0.6655644070344205, + "flos": 69028759480320.0, + "grad_norm": 0.7328423376388431, + "language_loss": 0.63529485, + "learning_rate": 1.062803450204029e-06, + "loss": 0.71094757, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01386261, + "step": 11070, + "time_per_iteration": 3.218775749206543 + }, + { + "auxiliary_loss_clip": 0.06412843, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06274422, + "balance_loss_mlp": 0.01253668, + "epoch": 0.6656245302870886, + "flos": 36323680953600.0, + "grad_norm": 1.5647890242278204, + "language_loss": 0.58715665, + "learning_rate": 1.062459413096116e-06, + "loss": 0.66392684, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1050415, + "step": 11071, + "time_per_iteration": 2.6759583950042725 + }, + { + "auxiliary_loss_clip": 0.06415486, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.01256544, + "epoch": 0.6656846535397565, + "flos": 21800623409280.0, + "grad_norm": 1.6094882760656495, + "language_loss": 0.7278558, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.80467808, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10192871, + "step": 11072, + "time_per_iteration": 2.506439685821533 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.01266315, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01255729, + "epoch": 0.6657447767924245, + "flos": 37496683860480.0, + "grad_norm": 1.9931671493726393, + "language_loss": 0.70538545, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.78214943, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10583496, + "step": 11073, + "time_per_iteration": 2.687361240386963 + }, + { + "auxiliary_loss_clip": 0.06420862, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06278072, + "balance_loss_mlp": 0.01251353, + "epoch": 0.6658049000450924, + "flos": 16843473135360.0, + "grad_norm": 1.8042269767870909, + "language_loss": 0.5659616, + "learning_rate": 1.061427515134354e-06, + "loss": 0.64279079, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10699463, + "step": 11074, + "time_per_iteration": 2.476226568222046 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06278802, + "balance_loss_mlp": 0.01258417, + "epoch": 0.6658650232977604, + "flos": 33519430759680.0, + "grad_norm": 1.4700349170865334, + "language_loss": 0.72126347, + "learning_rate": 1.061083620311235e-06, + "loss": 0.79810607, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10321045, + "step": 11075, + "time_per_iteration": 2.655700922012329 + }, + { + "auxiliary_loss_clip": 0.06410009, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.06274687, + "balance_loss_mlp": 0.01254983, + "epoch": 0.6659251465504283, + "flos": 37715379816960.0, + "grad_norm": 1.432398272569416, + "language_loss": 0.66657937, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.7433266, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 11076, + "time_per_iteration": 2.66424822807312 + }, + { + "auxiliary_loss_clip": 0.06411892, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6659852698030964, + "flos": 24899277323520.0, + "grad_norm": 1.6226979142446254, + "language_loss": 0.75448096, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.83125257, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11077, + "time_per_iteration": 2.5727341175079346 + }, + { + "auxiliary_loss_clip": 0.06412426, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06273601, + "balance_loss_mlp": 0.01252631, + "epoch": 0.6660453930557643, + "flos": 24359706956160.0, + "grad_norm": 1.8442117034793826, + "language_loss": 0.66886055, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.74561661, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10546875, + "step": 11078, + "time_per_iteration": 2.543839931488037 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06279247, + "balance_loss_mlp": 0.01257533, + "epoch": 0.6661055163084323, + "flos": 10602420675840.0, + "grad_norm": 1.9694934778902873, + "language_loss": 0.69631219, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.77320874, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10961914, + "step": 11079, + "time_per_iteration": 2.541069269180298 + }, + { + "auxiliary_loss_clip": 0.06411281, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06274305, + "balance_loss_mlp": 0.01255067, + "epoch": 0.6661656395611003, + "flos": 24063751935360.0, + "grad_norm": 2.893983796141558, + "language_loss": 0.80461812, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.88138527, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10375977, + "step": 11080, + "time_per_iteration": 4.085668087005615 + }, + { + "auxiliary_loss_clip": 0.06407166, + "auxiliary_loss_mlp": 0.01263859, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01254114, + "epoch": 0.6662257628137682, + "flos": 23042332264320.0, + "grad_norm": 1.7166684069014877, + "language_loss": 0.78285092, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.85956115, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09729004, + "step": 11081, + "time_per_iteration": 2.5193705558776855 + }, + { + "auxiliary_loss_clip": 0.06415745, + "auxiliary_loss_mlp": 0.01265653, + "balance_loss_clip": 0.06274147, + "balance_loss_mlp": 0.01254364, + "epoch": 0.6662858860664362, + "flos": 24761446156800.0, + "grad_norm": 1.6242146726224216, + "language_loss": 0.80530953, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.88212347, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11291504, + "step": 11082, + "time_per_iteration": 2.569957971572876 + }, + { + "auxiliary_loss_clip": 0.0641424, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06276894, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6663460093191041, + "flos": 20014899920640.0, + "grad_norm": 1.3932549437891448, + "language_loss": 0.83467507, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.91147482, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09979248, + "step": 11083, + "time_per_iteration": 3.9742698669433594 + }, + { + "auxiliary_loss_clip": 0.06423122, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06280323, + "balance_loss_mlp": 0.01254995, + "epoch": 0.6664061325717722, + "flos": 17827101815040.0, + "grad_norm": 2.1194460311014023, + "language_loss": 0.85585803, + "learning_rate": 1.057990170638731e-06, + "loss": 0.93274969, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11053467, + "step": 11084, + "time_per_iteration": 2.4959633350372314 + }, + { + "auxiliary_loss_clip": 0.0642017, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06277794, + "balance_loss_mlp": 0.0125434, + "epoch": 0.6664662558244401, + "flos": 18082666368000.0, + "grad_norm": 2.6259945452160185, + "language_loss": 0.73187411, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.80872643, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1072998, + "step": 11085, + "time_per_iteration": 2.475743055343628 + }, + { + "auxiliary_loss_clip": 0.06412315, + "auxiliary_loss_mlp": 0.01264882, + "balance_loss_clip": 0.06275545, + "balance_loss_mlp": 0.01253718, + "epoch": 0.6665263790771081, + "flos": 21579663392640.0, + "grad_norm": 1.7551532896089992, + "language_loss": 0.80931759, + "learning_rate": 1.057303129975894e-06, + "loss": 0.88608956, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11157227, + "step": 11086, + "time_per_iteration": 2.537797689437866 + }, + { + "auxiliary_loss_clip": 0.06411488, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01257079, + "epoch": 0.666586502329776, + "flos": 24213448454400.0, + "grad_norm": 1.98835460832662, + "language_loss": 0.7529, + "learning_rate": 1.056959663258702e-06, + "loss": 0.82969105, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11087, + "time_per_iteration": 2.5238702297210693 + }, + { + "auxiliary_loss_clip": 0.06414294, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01253621, + "epoch": 0.666646625582444, + "flos": 22207100365440.0, + "grad_norm": 1.5295252788179032, + "language_loss": 0.65136206, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.72814775, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10656738, + "step": 11088, + "time_per_iteration": 3.9619038105010986 + }, + { + "auxiliary_loss_clip": 0.06416193, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277834, + "balance_loss_mlp": 0.01255753, + "epoch": 0.6667067488351119, + "flos": 18265835393280.0, + "grad_norm": 1.9855105228277763, + "language_loss": 0.64599085, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.72281867, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1083374, + "step": 11089, + "time_per_iteration": 2.5900728702545166 + }, + { + "auxiliary_loss_clip": 0.06409112, + "auxiliary_loss_mlp": 0.01265636, + "balance_loss_clip": 0.06274208, + "balance_loss_mlp": 0.01255313, + "epoch": 0.66676687208778, + "flos": 17241983953920.0, + "grad_norm": 2.1106067212474704, + "language_loss": 0.81439161, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.89113915, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 11090, + "time_per_iteration": 2.4597456455230713 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01266415, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01255877, + "epoch": 0.6668269953404479, + "flos": 19757742140160.0, + "grad_norm": 1.8443713907824004, + "language_loss": 0.7767818, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.85360217, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10540771, + "step": 11091, + "time_per_iteration": 2.5587215423583984 + }, + { + "auxiliary_loss_clip": 0.06415166, + "auxiliary_loss_mlp": 0.01267323, + "balance_loss_clip": 0.06277118, + "balance_loss_mlp": 0.01256487, + "epoch": 0.6668871185931159, + "flos": 20564700485760.0, + "grad_norm": 3.5971234891656265, + "language_loss": 0.79227078, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.86909568, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10827637, + "step": 11092, + "time_per_iteration": 2.4899661540985107 + }, + { + "auxiliary_loss_clip": 0.06313632, + "auxiliary_loss_mlp": 0.01258221, + "balance_loss_clip": 0.06257559, + "balance_loss_mlp": 0.0125709, + "epoch": 0.6669472418457839, + "flos": 58104458144640.0, + "grad_norm": 0.7522047627769642, + "language_loss": 0.57524383, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.65096241, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01133728, + "step": 11093, + "time_per_iteration": 3.147273540496826 + }, + { + "auxiliary_loss_clip": 0.06411624, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06275775, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6670073650984518, + "flos": 26071860960000.0, + "grad_norm": 1.491694696645918, + "language_loss": 0.76499665, + "learning_rate": 1.054556398252703e-06, + "loss": 0.84176457, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10125732, + "step": 11094, + "time_per_iteration": 2.654946804046631 + }, + { + "auxiliary_loss_clip": 0.06412062, + "auxiliary_loss_mlp": 0.01267472, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256349, + "epoch": 0.6670674883511198, + "flos": 32425196290560.0, + "grad_norm": 1.786455566216807, + "language_loss": 0.73555851, + "learning_rate": 1.05421321798155e-06, + "loss": 0.81235385, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11132812, + "step": 11095, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.06414741, + "auxiliary_loss_mlp": 0.01270593, + "balance_loss_clip": 0.06277339, + "balance_loss_mlp": 0.01260145, + "epoch": 0.6671276116037878, + "flos": 18043114440960.0, + "grad_norm": 1.9034949183118532, + "language_loss": 0.73389214, + "learning_rate": 1.053870073574727e-06, + "loss": 0.81074548, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 11096, + "time_per_iteration": 2.5232880115509033 + }, + { + "auxiliary_loss_clip": 0.06407115, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01257419, + "epoch": 0.6671877348564558, + "flos": 23773498992000.0, + "grad_norm": 1.8900040408751917, + "language_loss": 0.64173019, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.71847701, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10144043, + "step": 11097, + "time_per_iteration": 2.53245210647583 + }, + { + "auxiliary_loss_clip": 0.06414811, + "auxiliary_loss_mlp": 0.01270626, + "balance_loss_clip": 0.06272861, + "balance_loss_mlp": 0.01259939, + "epoch": 0.6672478581091237, + "flos": 20923869012480.0, + "grad_norm": 1.7889953519105342, + "language_loss": 0.76164997, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.83850437, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10681152, + "step": 11098, + "time_per_iteration": 2.5418834686279297 + }, + { + "auxiliary_loss_clip": 0.0641548, + "auxiliary_loss_mlp": 0.01271314, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01260675, + "epoch": 0.6673079813617917, + "flos": 27863328453120.0, + "grad_norm": 1.4249693183378689, + "language_loss": 0.74138522, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.81825316, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 11099, + "time_per_iteration": 2.6019399166107178 + }, + { + "auxiliary_loss_clip": 0.06409659, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6673681046144596, + "flos": 21623366096640.0, + "grad_norm": 1.7662195801139693, + "language_loss": 0.78545117, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.86219656, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11100, + "time_per_iteration": 2.681669235229492 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01264451, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01254247, + "epoch": 0.6674282278671276, + "flos": 20896727489280.0, + "grad_norm": 1.8459209339693166, + "language_loss": 0.60927689, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.68604755, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10211182, + "step": 11101, + "time_per_iteration": 2.53374981880188 + }, + { + "auxiliary_loss_clip": 0.06421657, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06276177, + "balance_loss_mlp": 0.01254276, + "epoch": 0.6674883511197955, + "flos": 23631139704960.0, + "grad_norm": 1.6188105594216948, + "language_loss": 0.7136634, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.79054427, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.121521, + "step": 11102, + "time_per_iteration": 2.572932481765747 + }, + { + "auxiliary_loss_clip": 0.06414107, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06274531, + "balance_loss_mlp": 0.01256041, + "epoch": 0.6675484743724636, + "flos": 19615760196480.0, + "grad_norm": 1.3319232732101594, + "language_loss": 0.84587741, + "learning_rate": 1.051469068021034e-06, + "loss": 0.92268157, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.1027832, + "step": 11103, + "time_per_iteration": 2.5075833797454834 + }, + { + "auxiliary_loss_clip": 0.06411143, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01254482, + "epoch": 0.6676085976251315, + "flos": 14324696202240.0, + "grad_norm": 1.9260757560792952, + "language_loss": 0.78627831, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.86302686, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09222412, + "step": 11104, + "time_per_iteration": 2.5494680404663086 + }, + { + "auxiliary_loss_clip": 0.06418018, + "auxiliary_loss_mlp": 0.01267231, + "balance_loss_clip": 0.0627483, + "balance_loss_mlp": 0.01256531, + "epoch": 0.6676687208777995, + "flos": 38113219802880.0, + "grad_norm": 1.3963666193820934, + "language_loss": 0.58238858, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.65924108, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10693359, + "step": 11105, + "time_per_iteration": 2.6544291973114014 + }, + { + "auxiliary_loss_clip": 0.06419846, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01255921, + "epoch": 0.6677288441304675, + "flos": 23987331411840.0, + "grad_norm": 1.4856417680447878, + "language_loss": 0.72987849, + "learning_rate": 1.0504406049066e-06, + "loss": 0.80675358, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11749268, + "step": 11106, + "time_per_iteration": 2.591508150100708 + }, + { + "auxiliary_loss_clip": 0.06410738, + "auxiliary_loss_mlp": 0.01269876, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01259392, + "epoch": 0.6677889673831354, + "flos": 24177586106880.0, + "grad_norm": 1.6277621549569181, + "language_loss": 0.76611882, + "learning_rate": 1.0500978558659e-06, + "loss": 0.84292495, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 11107, + "time_per_iteration": 2.5117390155792236 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01262364, + "epoch": 0.6678490906358034, + "flos": 22316196781440.0, + "grad_norm": 2.1688615595462033, + "language_loss": 0.90383065, + "learning_rate": 1.049755142845583e-06, + "loss": 0.98063028, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09765625, + "step": 11108, + "time_per_iteration": 3.940439224243164 + }, + { + "auxiliary_loss_clip": 0.06408696, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01254499, + "epoch": 0.6679092138884714, + "flos": 36906870170880.0, + "grad_norm": 1.379580541372803, + "language_loss": 0.82916903, + "learning_rate": 1.049412465858646e-06, + "loss": 0.90589124, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09020996, + "step": 11109, + "time_per_iteration": 2.6550536155700684 + }, + { + "auxiliary_loss_clip": 0.06415845, + "auxiliary_loss_mlp": 0.01269099, + "balance_loss_clip": 0.06276993, + "balance_loss_mlp": 0.01257869, + "epoch": 0.6679693371411394, + "flos": 18156151998720.0, + "grad_norm": 1.7439527968582467, + "language_loss": 0.69522661, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.77207607, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11236572, + "step": 11110, + "time_per_iteration": 2.505737543106079 + }, + { + "auxiliary_loss_clip": 0.06418422, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06277301, + "balance_loss_mlp": 0.01255886, + "epoch": 0.6680294603938073, + "flos": 27205437720960.0, + "grad_norm": 1.4770947447978742, + "language_loss": 0.73935318, + "learning_rate": 1.04872722003689e-06, + "loss": 0.81621397, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11767578, + "step": 11111, + "time_per_iteration": 2.6036081314086914 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06276079, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6680895836464753, + "flos": 21731665898880.0, + "grad_norm": 1.7721381481924603, + "language_loss": 0.65662813, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.73343134, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10375977, + "step": 11112, + "time_per_iteration": 2.5148162841796875 + }, + { + "auxiliary_loss_clip": 0.06408017, + "auxiliary_loss_mlp": 0.01264862, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6681497068991432, + "flos": 19652628792960.0, + "grad_norm": 2.188254018589407, + "language_loss": 0.63796169, + "learning_rate": 1.048042118504569e-06, + "loss": 0.71469045, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10357666, + "step": 11113, + "time_per_iteration": 2.5091605186462402 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.0126667, + "balance_loss_clip": 0.06274618, + "balance_loss_mlp": 0.01257008, + "epoch": 0.6682098301518112, + "flos": 17424649854720.0, + "grad_norm": 1.7204263321571711, + "language_loss": 0.65997386, + "learning_rate": 1.047699621879422e-06, + "loss": 0.73672217, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 11114, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.06406785, + "auxiliary_loss_mlp": 0.01265665, + "balance_loss_clip": 0.06270755, + "balance_loss_mlp": 0.01255378, + "epoch": 0.6682699534044791, + "flos": 22605191913600.0, + "grad_norm": 1.4259756578870375, + "language_loss": 0.78704619, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.86377072, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 11115, + "time_per_iteration": 2.544543504714966 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01266412, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01256703, + "epoch": 0.6683300766571472, + "flos": 24870668353920.0, + "grad_norm": 1.896886529208747, + "language_loss": 0.79640424, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.87317395, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.09716797, + "step": 11116, + "time_per_iteration": 2.5271427631378174 + }, + { + "auxiliary_loss_clip": 0.06415811, + "auxiliary_loss_mlp": 0.01274733, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01263891, + "epoch": 0.6683901999098151, + "flos": 27134132296320.0, + "grad_norm": 1.70831438842013, + "language_loss": 0.79465652, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.871562, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10852051, + "step": 11117, + "time_per_iteration": 2.5867950916290283 + }, + { + "auxiliary_loss_clip": 0.06413716, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.01258147, + "epoch": 0.6684503231624831, + "flos": 20745018472320.0, + "grad_norm": 1.68089949787921, + "language_loss": 0.65774792, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.73458278, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1161499, + "step": 11118, + "time_per_iteration": 2.5065219402313232 + }, + { + "auxiliary_loss_clip": 0.06409101, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06272688, + "balance_loss_mlp": 0.01256426, + "epoch": 0.668510446415151, + "flos": 21768618349440.0, + "grad_norm": 1.4670277033373609, + "language_loss": 0.69327927, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.77004153, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10705566, + "step": 11119, + "time_per_iteration": 3.9497127532958984 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01253155, + "epoch": 0.668570569667819, + "flos": 30199229850240.0, + "grad_norm": 1.557441143928688, + "language_loss": 0.67133182, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.74807668, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10015869, + "step": 11120, + "time_per_iteration": 2.583557605743408 + }, + { + "auxiliary_loss_clip": 0.06409501, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.0125551, + "epoch": 0.668630692920487, + "flos": 24177544179840.0, + "grad_norm": 1.6997365737566905, + "language_loss": 0.72227985, + "learning_rate": 1.045303157347638e-06, + "loss": 0.79904002, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10998535, + "step": 11121, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.06415744, + "auxiliary_loss_mlp": 0.01268909, + "balance_loss_clip": 0.06275598, + "balance_loss_mlp": 0.01258442, + "epoch": 0.668690816173155, + "flos": 17462902043520.0, + "grad_norm": 2.410576654010779, + "language_loss": 0.70488191, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.78172839, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10467529, + "step": 11122, + "time_per_iteration": 2.480928897857666 + }, + { + "auxiliary_loss_clip": 0.06412323, + "auxiliary_loss_mlp": 0.01265084, + "balance_loss_clip": 0.06273821, + "balance_loss_mlp": 0.01254683, + "epoch": 0.668750939425823, + "flos": 25011350559360.0, + "grad_norm": 1.579363869036545, + "language_loss": 0.71597642, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.79275048, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10406494, + "step": 11123, + "time_per_iteration": 3.993523597717285 + }, + { + "auxiliary_loss_clip": 0.06416023, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06275098, + "balance_loss_mlp": 0.01256713, + "epoch": 0.6688110626784909, + "flos": 24103513497600.0, + "grad_norm": 1.6918402194537734, + "language_loss": 0.79247653, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.86931467, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11090088, + "step": 11124, + "time_per_iteration": 2.5730183124542236 + }, + { + "auxiliary_loss_clip": 0.06414519, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01255496, + "epoch": 0.6688711859311589, + "flos": 21765515748480.0, + "grad_norm": 1.8258374996153537, + "language_loss": 0.74714315, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.8239423, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09899902, + "step": 11125, + "time_per_iteration": 2.586688995361328 + }, + { + "auxiliary_loss_clip": 0.06414272, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274511, + "balance_loss_mlp": 0.01254802, + "epoch": 0.6689313091838268, + "flos": 22936254595200.0, + "grad_norm": 1.821756692805589, + "language_loss": 0.66474277, + "learning_rate": 1.043592482774116e-06, + "loss": 0.74153662, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10314941, + "step": 11126, + "time_per_iteration": 2.5671706199645996 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256774, + "epoch": 0.6689914324364948, + "flos": 20892367077120.0, + "grad_norm": 1.6855233783346146, + "language_loss": 0.71609974, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.79290259, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10577393, + "step": 11127, + "time_per_iteration": 3.9430463314056396 + }, + { + "auxiliary_loss_clip": 0.06417182, + "auxiliary_loss_mlp": 0.01267327, + "balance_loss_clip": 0.06273168, + "balance_loss_mlp": 0.01255841, + "epoch": 0.6690515556891627, + "flos": 22754972286720.0, + "grad_norm": 1.8544786849615413, + "language_loss": 0.80330718, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.88015223, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11486816, + "step": 11128, + "time_per_iteration": 2.545502185821533 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255582, + "epoch": 0.6691116789418308, + "flos": 23338203431040.0, + "grad_norm": 1.7840790291668756, + "language_loss": 0.81335264, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.89014482, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10742188, + "step": 11129, + "time_per_iteration": 2.5280702114105225 + }, + { + "auxiliary_loss_clip": 0.06404583, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06271389, + "balance_loss_mlp": 0.01254972, + "epoch": 0.6691718021944987, + "flos": 32454308384640.0, + "grad_norm": 1.6197681941265856, + "language_loss": 0.70428884, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.7809816, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.097229, + "step": 11130, + "time_per_iteration": 2.578578233718872 + }, + { + "auxiliary_loss_clip": 0.06406342, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627137, + "balance_loss_mlp": 0.0125301, + "epoch": 0.6692319254471667, + "flos": 23738223623040.0, + "grad_norm": 1.529399392054523, + "language_loss": 0.70701146, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.78370404, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 11131, + "time_per_iteration": 2.537551164627075 + }, + { + "auxiliary_loss_clip": 0.06414618, + "auxiliary_loss_mlp": 0.01266754, + "balance_loss_clip": 0.06274183, + "balance_loss_mlp": 0.01255906, + "epoch": 0.6692920486998346, + "flos": 14432996004480.0, + "grad_norm": 2.3888765741874645, + "language_loss": 0.65664881, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.73346257, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10858154, + "step": 11132, + "time_per_iteration": 2.45595383644104 + }, + { + "auxiliary_loss_clip": 0.06414949, + "auxiliary_loss_mlp": 0.0126617, + "balance_loss_clip": 0.06275167, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6693521719525026, + "flos": 21513976191360.0, + "grad_norm": 1.5662057284927036, + "language_loss": 0.74730015, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.82411134, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11340332, + "step": 11133, + "time_per_iteration": 2.5248849391937256 + }, + { + "auxiliary_loss_clip": 0.06419569, + "auxiliary_loss_mlp": 0.01271511, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01259412, + "epoch": 0.6694122952051706, + "flos": 25413341322240.0, + "grad_norm": 3.5912228691538757, + "language_loss": 0.66650522, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.74341607, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.12097168, + "step": 11134, + "time_per_iteration": 2.556043863296509 + }, + { + "auxiliary_loss_clip": 0.06424067, + "auxiliary_loss_mlp": 0.01264606, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253294, + "epoch": 0.6694724184578386, + "flos": 25668067334400.0, + "grad_norm": 1.7597980858171118, + "language_loss": 0.77272904, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.84961575, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 11135, + "time_per_iteration": 2.572221279144287 + }, + { + "auxiliary_loss_clip": 0.06406624, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.01256079, + "epoch": 0.6695325417105066, + "flos": 17714567381760.0, + "grad_norm": 1.4860361528198607, + "language_loss": 0.74150556, + "learning_rate": 1.040173855277898e-06, + "loss": 0.81823969, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1071167, + "step": 11136, + "time_per_iteration": 2.482616662979126 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253814, + "epoch": 0.6695926649631745, + "flos": 24466581239040.0, + "grad_norm": 1.5006390680612098, + "language_loss": 0.622679, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.69954294, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1114502, + "step": 11137, + "time_per_iteration": 2.60404109954834 + }, + { + "auxiliary_loss_clip": 0.06413136, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258535, + "epoch": 0.6696527882158425, + "flos": 24287059866240.0, + "grad_norm": 1.73693802973788, + "language_loss": 0.66198957, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.73881459, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1083374, + "step": 11138, + "time_per_iteration": 2.5446555614471436 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.01255009, + "epoch": 0.6697129114685104, + "flos": 23009404809600.0, + "grad_norm": 1.563215252926209, + "language_loss": 0.73026919, + "learning_rate": 1.039148976175053e-06, + "loss": 0.80699301, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09741211, + "step": 11139, + "time_per_iteration": 2.5669844150543213 + }, + { + "auxiliary_loss_clip": 0.06403776, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01256326, + "epoch": 0.6697730347211784, + "flos": 22644743840640.0, + "grad_norm": 1.6502373859256334, + "language_loss": 0.70972526, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.78642654, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10015869, + "step": 11140, + "time_per_iteration": 2.524345874786377 + }, + { + "auxiliary_loss_clip": 0.06414337, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01254103, + "epoch": 0.6698331579738463, + "flos": 28884915832320.0, + "grad_norm": 1.9955464769525513, + "language_loss": 0.75788713, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.83467978, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1083374, + "step": 11141, + "time_per_iteration": 2.610853433609009 + }, + { + "auxiliary_loss_clip": 0.06411906, + "auxiliary_loss_mlp": 0.01271137, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01260337, + "epoch": 0.6698932812265144, + "flos": 24213993505920.0, + "grad_norm": 1.7317387192226181, + "language_loss": 0.82309425, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.8999247, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10803223, + "step": 11142, + "time_per_iteration": 2.5797901153564453 + }, + { + "auxiliary_loss_clip": 0.0640756, + "auxiliary_loss_mlp": 0.01269267, + "balance_loss_clip": 0.06271559, + "balance_loss_mlp": 0.01258556, + "epoch": 0.6699534044791823, + "flos": 22096704211200.0, + "grad_norm": 1.4627194343759278, + "language_loss": 0.70282012, + "learning_rate": 1.037782980862959e-06, + "loss": 0.77958834, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1071167, + "step": 11143, + "time_per_iteration": 2.543877601623535 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01262215, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01252577, + "epoch": 0.6700135277318503, + "flos": 25199466975360.0, + "grad_norm": 1.4915968751654103, + "language_loss": 0.70360661, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.78028065, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09637451, + "step": 11144, + "time_per_iteration": 2.5488550662994385 + }, + { + "auxiliary_loss_clip": 0.06411098, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06275296, + "balance_loss_mlp": 0.01253735, + "epoch": 0.6700736509845182, + "flos": 23446838649600.0, + "grad_norm": 1.6240872047460435, + "language_loss": 0.74927717, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.82603747, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11193848, + "step": 11145, + "time_per_iteration": 2.542711019515991 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6701337742371862, + "flos": 24396952896000.0, + "grad_norm": 1.5772021074008409, + "language_loss": 0.71292794, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.7897411, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11146, + "time_per_iteration": 2.5397775173187256 + }, + { + "auxiliary_loss_clip": 0.06402436, + "auxiliary_loss_mlp": 0.01264562, + "balance_loss_clip": 0.06271266, + "balance_loss_mlp": 0.0125459, + "epoch": 0.6701938974898543, + "flos": 14798956711680.0, + "grad_norm": 2.075971191875419, + "language_loss": 0.78937066, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.86604059, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09979248, + "step": 11147, + "time_per_iteration": 2.521651029586792 + }, + { + "auxiliary_loss_clip": 0.06408454, + "auxiliary_loss_mlp": 0.01268691, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.0125801, + "epoch": 0.6702540207425222, + "flos": 20159690976000.0, + "grad_norm": 1.9550194289938683, + "language_loss": 0.70223355, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.77900505, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10687256, + "step": 11148, + "time_per_iteration": 4.084912300109863 + }, + { + "auxiliary_loss_clip": 0.06407622, + "auxiliary_loss_mlp": 0.01263909, + "balance_loss_clip": 0.06271225, + "balance_loss_mlp": 0.01253991, + "epoch": 0.6703141439951902, + "flos": 21220369084800.0, + "grad_norm": 1.6593895437552093, + "language_loss": 0.70494747, + "learning_rate": 1.035735082774636e-06, + "loss": 0.78166282, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09918213, + "step": 11149, + "time_per_iteration": 2.532682418823242 + }, + { + "auxiliary_loss_clip": 0.06408584, + "auxiliary_loss_mlp": 0.0126327, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01253245, + "epoch": 0.6703742672478581, + "flos": 23119255912320.0, + "grad_norm": 2.1651783548168124, + "language_loss": 0.73744798, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.81416655, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10028076, + "step": 11150, + "time_per_iteration": 2.591546058654785 + }, + { + "auxiliary_loss_clip": 0.06414528, + "auxiliary_loss_mlp": 0.01264123, + "balance_loss_clip": 0.06276007, + "balance_loss_mlp": 0.01253829, + "epoch": 0.6704343905005261, + "flos": 22535563570560.0, + "grad_norm": 1.9523081475406603, + "language_loss": 0.78322434, + "learning_rate": 1.035052742460671e-06, + "loss": 0.86001086, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10296631, + "step": 11151, + "time_per_iteration": 2.536759853363037 + }, + { + "auxiliary_loss_clip": 0.06307358, + "auxiliary_loss_mlp": 0.01251405, + "balance_loss_clip": 0.06251603, + "balance_loss_mlp": 0.01250013, + "epoch": 0.670494513753194, + "flos": 64815270192000.0, + "grad_norm": 0.7758908798936945, + "language_loss": 0.55567682, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.63126445, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0139389, + "step": 11152, + "time_per_iteration": 3.201535224914551 + }, + { + "auxiliary_loss_clip": 0.06410956, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06271775, + "balance_loss_mlp": 0.01254815, + "epoch": 0.670554637005862, + "flos": 23517892512000.0, + "grad_norm": 1.915770962366586, + "language_loss": 0.81010997, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.88686949, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10186768, + "step": 11153, + "time_per_iteration": 2.537212371826172 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.01253822, + "epoch": 0.67061476025853, + "flos": 19469417840640.0, + "grad_norm": 1.508737872634347, + "language_loss": 0.76268411, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.83944541, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10534668, + "step": 11154, + "time_per_iteration": 2.558519124984741 + }, + { + "auxiliary_loss_clip": 0.06415759, + "auxiliary_loss_mlp": 0.01269836, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01259304, + "epoch": 0.670674883511198, + "flos": 20525903245440.0, + "grad_norm": 3.082678767747609, + "language_loss": 0.76461852, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.84147453, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10534668, + "step": 11155, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.06407665, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0627, + "balance_loss_mlp": 0.01256187, + "epoch": 0.6707350067638659, + "flos": 25491061584000.0, + "grad_norm": 2.1059181531121873, + "language_loss": 0.82157421, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.89830995, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09716797, + "step": 11156, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.06406271, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06271681, + "balance_loss_mlp": 0.01255706, + "epoch": 0.6707951300165339, + "flos": 22280040944640.0, + "grad_norm": 1.7628533784510112, + "language_loss": 0.74903405, + "learning_rate": 1.033006600114165e-06, + "loss": 0.82574838, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09460449, + "step": 11157, + "time_per_iteration": 2.5089879035949707 + }, + { + "auxiliary_loss_clip": 0.06412502, + "auxiliary_loss_mlp": 0.01267451, + "balance_loss_clip": 0.06273752, + "balance_loss_mlp": 0.01256919, + "epoch": 0.6708552532692018, + "flos": 23990853283200.0, + "grad_norm": 1.6697268751930758, + "language_loss": 0.74289936, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.81969893, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10528564, + "step": 11158, + "time_per_iteration": 2.5533461570739746 + }, + { + "auxiliary_loss_clip": 0.06413293, + "auxiliary_loss_mlp": 0.01263254, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6709153765218698, + "flos": 24944657109120.0, + "grad_norm": 1.5416620862644819, + "language_loss": 0.81707746, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.89384294, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.0980835, + "step": 11159, + "time_per_iteration": 4.040963649749756 + }, + { + "auxiliary_loss_clip": 0.06412386, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06273866, + "balance_loss_mlp": 0.01258986, + "epoch": 0.6709754997745379, + "flos": 17536010330880.0, + "grad_norm": 1.5609798446772174, + "language_loss": 0.7718569, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.84866846, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.09783936, + "step": 11160, + "time_per_iteration": 2.4715282917022705 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01263422, + "balance_loss_clip": 0.06271639, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6710356230272058, + "flos": 22097416970880.0, + "grad_norm": 1.6605543467204091, + "language_loss": 0.73893428, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.81563139, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09020996, + "step": 11161, + "time_per_iteration": 2.5761518478393555 + }, + { + "auxiliary_loss_clip": 0.0641313, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06271637, + "balance_loss_mlp": 0.01257874, + "epoch": 0.6710957462798738, + "flos": 24213238819200.0, + "grad_norm": 1.698475212339427, + "language_loss": 0.68223077, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.75904596, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10522461, + "step": 11162, + "time_per_iteration": 4.0347349643707275 + }, + { + "auxiliary_loss_clip": 0.06406809, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01255367, + "epoch": 0.6711558695325417, + "flos": 19099138648320.0, + "grad_norm": 1.6208038414483141, + "language_loss": 0.70270795, + "learning_rate": 1.030961777833032e-06, + "loss": 0.77943039, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10064697, + "step": 11163, + "time_per_iteration": 2.4880189895629883 + }, + { + "auxiliary_loss_clip": 0.06402589, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.0125383, + "epoch": 0.6712159927852097, + "flos": 25565134193280.0, + "grad_norm": 1.5352927814280746, + "language_loss": 0.75905788, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.8357113, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.08929443, + "step": 11164, + "time_per_iteration": 2.5312371253967285 + }, + { + "auxiliary_loss_clip": 0.06409736, + "auxiliary_loss_mlp": 0.01265492, + "balance_loss_clip": 0.06273673, + "balance_loss_mlp": 0.01254907, + "epoch": 0.6712761160378776, + "flos": 22234032253440.0, + "grad_norm": 2.0741329798372408, + "language_loss": 0.65590626, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10595703, + "step": 11165, + "time_per_iteration": 2.5017032623291016 + }, + { + "auxiliary_loss_clip": 0.06407681, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01254444, + "epoch": 0.6713362392905456, + "flos": 22462077939840.0, + "grad_norm": 1.8809222742523355, + "language_loss": 0.71774828, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.79446959, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 11166, + "time_per_iteration": 2.50738787651062 + }, + { + "auxiliary_loss_clip": 0.06404926, + "auxiliary_loss_mlp": 0.01262643, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.0125282, + "epoch": 0.6713963625432136, + "flos": 25637362012800.0, + "grad_norm": 1.8955119453047675, + "language_loss": 0.77147096, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.84814668, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09814453, + "step": 11167, + "time_per_iteration": 3.929837942123413 + }, + { + "auxiliary_loss_clip": 0.06410499, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06272188, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6714564857958816, + "flos": 35015110940160.0, + "grad_norm": 1.8086126039126507, + "language_loss": 0.68893099, + "learning_rate": 1.029258769662629e-06, + "loss": 0.76570106, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 11168, + "time_per_iteration": 2.6505095958709717 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.012578, + "epoch": 0.6715166090485495, + "flos": 26286028796160.0, + "grad_norm": 1.7287934282524213, + "language_loss": 0.73465478, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.81148595, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11358643, + "step": 11169, + "time_per_iteration": 2.5538253784179688 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01265262, + "balance_loss_clip": 0.0627141, + "balance_loss_mlp": 0.01254706, + "epoch": 0.6715767323012175, + "flos": 15929556652800.0, + "grad_norm": 1.9811109571628822, + "language_loss": 0.76329374, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.84005201, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10565186, + "step": 11170, + "time_per_iteration": 2.5357441902160645 + }, + { + "auxiliary_loss_clip": 0.06412025, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272931, + "balance_loss_mlp": 0.01254964, + "epoch": 0.6716368555538854, + "flos": 17496835747200.0, + "grad_norm": 1.8551997359651162, + "language_loss": 0.74972916, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.82650542, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10626221, + "step": 11171, + "time_per_iteration": 2.4740569591522217 + }, + { + "auxiliary_loss_clip": 0.06413123, + "auxiliary_loss_mlp": 0.01262691, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.01252344, + "epoch": 0.6716969788065534, + "flos": 16766759122560.0, + "grad_norm": 1.4543204322223777, + "language_loss": 0.86493564, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.94169378, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10345459, + "step": 11172, + "time_per_iteration": 2.5120010375976562 + }, + { + "auxiliary_loss_clip": 0.06408751, + "auxiliary_loss_mlp": 0.01266926, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256763, + "epoch": 0.6717571020592215, + "flos": 22716216973440.0, + "grad_norm": 2.0454540055069863, + "language_loss": 0.63633478, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.71309155, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10168457, + "step": 11173, + "time_per_iteration": 2.49975848197937 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01271665, + "balance_loss_clip": 0.06275772, + "balance_loss_mlp": 0.01260549, + "epoch": 0.6718172253118894, + "flos": 18740053975680.0, + "grad_norm": 4.441337622220845, + "language_loss": 0.71819955, + "learning_rate": 1.02721637475002e-06, + "loss": 0.79513621, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11108398, + "step": 11174, + "time_per_iteration": 2.483900547027588 + }, + { + "auxiliary_loss_clip": 0.06401111, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06269203, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6718773485645574, + "flos": 15637920117120.0, + "grad_norm": 1.9560679016643376, + "language_loss": 0.69026506, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.76692557, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09472656, + "step": 11175, + "time_per_iteration": 2.463592767715454 + }, + { + "auxiliary_loss_clip": 0.06406569, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255341, + "epoch": 0.6719374718172253, + "flos": 19360908403200.0, + "grad_norm": 1.7117830890697936, + "language_loss": 0.74226189, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.8189795, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09844971, + "step": 11176, + "time_per_iteration": 2.5074222087860107 + }, + { + "auxiliary_loss_clip": 0.06410944, + "auxiliary_loss_mlp": 0.0126684, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.0125654, + "epoch": 0.6719975950698933, + "flos": 21987817430400.0, + "grad_norm": 2.8444182697169014, + "language_loss": 0.73030323, + "learning_rate": 1.026195675108182e-06, + "loss": 0.80708104, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10296631, + "step": 11177, + "time_per_iteration": 2.4807181358337402 + }, + { + "auxiliary_loss_clip": 0.06411102, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6720577183225612, + "flos": 25235035833600.0, + "grad_norm": 2.1466059593233755, + "language_loss": 0.76338404, + "learning_rate": 1.025855515730551e-06, + "loss": 0.84018433, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10772705, + "step": 11178, + "time_per_iteration": 2.5277843475341797 + }, + { + "auxiliary_loss_clip": 0.06410985, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255007, + "epoch": 0.6721178415752292, + "flos": 16951479448320.0, + "grad_norm": 1.7634405951154783, + "language_loss": 0.70127761, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.77803409, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09643555, + "step": 11179, + "time_per_iteration": 2.4638893604278564 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06269027, + "balance_loss_mlp": 0.01256077, + "epoch": 0.6721779648278972, + "flos": 21547448697600.0, + "grad_norm": 1.4326115817211162, + "language_loss": 0.74262661, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.81931782, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09393311, + "step": 11180, + "time_per_iteration": 2.5094285011291504 + }, + { + "auxiliary_loss_clip": 0.0640661, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01252843, + "epoch": 0.6722380880805652, + "flos": 22612696853760.0, + "grad_norm": 1.3575184211837767, + "language_loss": 0.75178289, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.82848167, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10424805, + "step": 11181, + "time_per_iteration": 2.5373446941375732 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.0126461, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01254615, + "epoch": 0.6722982113332331, + "flos": 15930856391040.0, + "grad_norm": 2.2936660091873597, + "language_loss": 0.75133812, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.82810551, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09991455, + "step": 11182, + "time_per_iteration": 2.5146076679229736 + }, + { + "auxiliary_loss_clip": 0.06407333, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01256535, + "epoch": 0.6723583345859011, + "flos": 20602659185280.0, + "grad_norm": 1.7825231183024703, + "language_loss": 0.69884634, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.77558148, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09643555, + "step": 11183, + "time_per_iteration": 2.510972499847412 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06270228, + "balance_loss_mlp": 0.01258234, + "epoch": 0.672418457838569, + "flos": 21732294804480.0, + "grad_norm": 1.4388499153565433, + "language_loss": 0.78377849, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.8605392, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09985352, + "step": 11184, + "time_per_iteration": 2.5102083683013916 + }, + { + "auxiliary_loss_clip": 0.06418785, + "auxiliary_loss_mlp": 0.01263963, + "balance_loss_clip": 0.06274929, + "balance_loss_mlp": 0.01253305, + "epoch": 0.672478581091237, + "flos": 21476772178560.0, + "grad_norm": 2.087218631508525, + "language_loss": 0.66671652, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.74354398, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10668945, + "step": 11185, + "time_per_iteration": 2.4922776222229004 + }, + { + "auxiliary_loss_clip": 0.06405509, + "auxiliary_loss_mlp": 0.01264604, + "balance_loss_clip": 0.06269497, + "balance_loss_mlp": 0.01253905, + "epoch": 0.6725387043439051, + "flos": 30854646887040.0, + "grad_norm": 3.8783146360767518, + "language_loss": 0.80847633, + "learning_rate": 1.023135571620345e-06, + "loss": 0.88517749, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 11186, + "time_per_iteration": 2.650069236755371 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258753, + "epoch": 0.672598827596573, + "flos": 24061949072640.0, + "grad_norm": 1.3182024269377546, + "language_loss": 0.807257, + "learning_rate": 1.022795745163813e-06, + "loss": 0.88399297, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09320068, + "step": 11187, + "time_per_iteration": 2.5736026763916016 + }, + { + "auxiliary_loss_clip": 0.06414247, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06271032, + "balance_loss_mlp": 0.01255996, + "epoch": 0.672658950849241, + "flos": 21878343671040.0, + "grad_norm": 1.7328673404989177, + "language_loss": 0.71004307, + "learning_rate": 1.022455955762965e-06, + "loss": 0.78685355, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1081543, + "step": 11188, + "time_per_iteration": 3.9358599185943604 + }, + { + "auxiliary_loss_clip": 0.06400838, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06269124, + "balance_loss_mlp": 0.01255364, + "epoch": 0.6727190741019089, + "flos": 23228855452800.0, + "grad_norm": 1.7513555431786316, + "language_loss": 0.75587308, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.83253086, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09576416, + "step": 11189, + "time_per_iteration": 2.558595895767212 + }, + { + "auxiliary_loss_clip": 0.06412518, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06271306, + "balance_loss_mlp": 0.01252762, + "epoch": 0.6727791973545769, + "flos": 15784052837760.0, + "grad_norm": 2.0872354058578186, + "language_loss": 0.75281942, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.8295877, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11547852, + "step": 11190, + "time_per_iteration": 2.465223550796509 + }, + { + "auxiliary_loss_clip": 0.06406397, + "auxiliary_loss_mlp": 0.01267439, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256788, + "epoch": 0.6728393206072448, + "flos": 21255937943040.0, + "grad_norm": 1.3785573959073936, + "language_loss": 0.76754856, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.84428692, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10650635, + "step": 11191, + "time_per_iteration": 2.519883155822754 + }, + { + "auxiliary_loss_clip": 0.06406602, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01256509, + "epoch": 0.6728994438599128, + "flos": 32131295694720.0, + "grad_norm": 1.5727699537163, + "language_loss": 0.86438018, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.94110769, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09637451, + "step": 11192, + "time_per_iteration": 2.589451789855957 + }, + { + "auxiliary_loss_clip": 0.06414255, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6729595671125808, + "flos": 23119046277120.0, + "grad_norm": 2.0400596637632997, + "language_loss": 0.76247764, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.83930409, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11437988, + "step": 11193, + "time_per_iteration": 2.569079637527466 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.0125802, + "epoch": 0.6730196903652488, + "flos": 14616710081280.0, + "grad_norm": 1.7886354434370773, + "language_loss": 0.78477633, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.86155224, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11194, + "time_per_iteration": 2.501262664794922 + }, + { + "auxiliary_loss_clip": 0.06410375, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01253889, + "epoch": 0.6730798136179167, + "flos": 21112320844800.0, + "grad_norm": 1.7894428961307616, + "language_loss": 0.90123671, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.97798121, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10192871, + "step": 11195, + "time_per_iteration": 2.529911994934082 + }, + { + "auxiliary_loss_clip": 0.06404506, + "auxiliary_loss_mlp": 0.01267592, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257119, + "epoch": 0.6731399368705847, + "flos": 28993886467200.0, + "grad_norm": 1.9634861378348352, + "language_loss": 0.72801971, + "learning_rate": 1.019738976106662e-06, + "loss": 0.80474073, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10467529, + "step": 11196, + "time_per_iteration": 2.5403385162353516 + }, + { + "auxiliary_loss_clip": 0.06306562, + "auxiliary_loss_mlp": 0.01254217, + "balance_loss_clip": 0.06250267, + "balance_loss_mlp": 0.01253061, + "epoch": 0.6732000601232526, + "flos": 64763643277440.0, + "grad_norm": 0.755157348431284, + "language_loss": 0.56539071, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.64099848, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01152802, + "step": 11197, + "time_per_iteration": 3.103764295578003 + }, + { + "auxiliary_loss_clip": 0.06400825, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06270334, + "balance_loss_mlp": 0.01259316, + "epoch": 0.6732601833759206, + "flos": 17207337490560.0, + "grad_norm": 1.957045035118017, + "language_loss": 0.76133382, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.83803332, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09814453, + "step": 11198, + "time_per_iteration": 2.4750118255615234 + }, + { + "auxiliary_loss_clip": 0.06411158, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01253492, + "epoch": 0.6733203066285887, + "flos": 18664430065920.0, + "grad_norm": 2.5858701419359185, + "language_loss": 0.81900644, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.89576292, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11010742, + "step": 11199, + "time_per_iteration": 3.915224075317383 + }, + { + "auxiliary_loss_clip": 0.06408331, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06268819, + "balance_loss_mlp": 0.01257566, + "epoch": 0.6733804298812566, + "flos": 35818128144000.0, + "grad_norm": 1.7377353958720951, + "language_loss": 0.71924305, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.79600847, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10650635, + "step": 11200, + "time_per_iteration": 2.6547374725341797 + }, + { + "auxiliary_loss_clip": 0.06413474, + "auxiliary_loss_mlp": 0.01270012, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01259581, + "epoch": 0.6734405531339246, + "flos": 61651545511680.0, + "grad_norm": 1.525289564934158, + "language_loss": 0.64700097, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.72383583, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10437012, + "step": 11201, + "time_per_iteration": 2.884462356567383 + }, + { + "auxiliary_loss_clip": 0.06414636, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01255466, + "epoch": 0.6735006763865925, + "flos": 20528670430080.0, + "grad_norm": 1.5117322786205176, + "language_loss": 0.63124895, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.7080664, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11639404, + "step": 11202, + "time_per_iteration": 3.9962854385375977 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.06272809, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6735607996392605, + "flos": 13924172885760.0, + "grad_norm": 1.7265240314624624, + "language_loss": 0.75169051, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.82848436, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10473633, + "step": 11203, + "time_per_iteration": 2.4805357456207275 + }, + { + "auxiliary_loss_clip": 0.06417318, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.0627423, + "balance_loss_mlp": 0.01254979, + "epoch": 0.6736209228919284, + "flos": 18813246117120.0, + "grad_norm": 2.5086879815410996, + "language_loss": 0.6739623, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.75080359, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11834717, + "step": 11204, + "time_per_iteration": 2.5092830657958984 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06275398, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6736810461445965, + "flos": 20378890056960.0, + "grad_norm": 1.4739361265515354, + "language_loss": 0.74145937, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.81831586, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11151123, + "step": 11205, + "time_per_iteration": 2.5575578212738037 + }, + { + "auxiliary_loss_clip": 0.06405318, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06271175, + "balance_loss_mlp": 0.0126027, + "epoch": 0.6737411693972644, + "flos": 30015264211200.0, + "grad_norm": 1.4826905039931084, + "language_loss": 0.71781552, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.79457194, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10058594, + "step": 11206, + "time_per_iteration": 4.010627031326294 + }, + { + "auxiliary_loss_clip": 0.0641677, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.0125648, + "epoch": 0.6738012926499324, + "flos": 25454402622720.0, + "grad_norm": 2.885338634405065, + "language_loss": 0.67620468, + "learning_rate": 1.016007014855092e-06, + "loss": 0.75304735, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11016846, + "step": 11207, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06272342, + "balance_loss_mlp": 0.01258672, + "epoch": 0.6738614159026003, + "flos": 20783102952960.0, + "grad_norm": 2.0413352600750145, + "language_loss": 0.74134195, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.81807256, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10186768, + "step": 11208, + "time_per_iteration": 2.4913690090179443 + }, + { + "auxiliary_loss_clip": 0.06408808, + "auxiliary_loss_mlp": 0.01269437, + "balance_loss_clip": 0.06270136, + "balance_loss_mlp": 0.01257432, + "epoch": 0.6739215391552683, + "flos": 19571931711360.0, + "grad_norm": 1.741711609442522, + "language_loss": 0.75868964, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.83547217, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11999512, + "step": 11209, + "time_per_iteration": 2.494077444076538 + }, + { + "auxiliary_loss_clip": 0.06402588, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.062707, + "balance_loss_mlp": 0.01255042, + "epoch": 0.6739816624079362, + "flos": 24394898471040.0, + "grad_norm": 1.8799682247559513, + "language_loss": 0.66601419, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.74269128, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10064697, + "step": 11210, + "time_per_iteration": 2.531925916671753 + }, + { + "auxiliary_loss_clip": 0.06400777, + "auxiliary_loss_mlp": 0.0126575, + "balance_loss_clip": 0.06269025, + "balance_loss_mlp": 0.01256297, + "epoch": 0.6740417856606042, + "flos": 22534683102720.0, + "grad_norm": 3.725779709718602, + "language_loss": 0.8045913, + "learning_rate": 1.014651056529377e-06, + "loss": 0.88125658, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09454346, + "step": 11211, + "time_per_iteration": 2.546027898788452 + }, + { + "auxiliary_loss_clip": 0.06403598, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06271007, + "balance_loss_mlp": 0.01256208, + "epoch": 0.6741019089132723, + "flos": 25782530411520.0, + "grad_norm": 1.3057254169112946, + "language_loss": 0.76753151, + "learning_rate": 1.014312160327143e-06, + "loss": 0.84422737, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09777832, + "step": 11212, + "time_per_iteration": 2.542628049850464 + }, + { + "auxiliary_loss_clip": 0.06409732, + "auxiliary_loss_mlp": 0.01268637, + "balance_loss_clip": 0.06270209, + "balance_loss_mlp": 0.01257539, + "epoch": 0.6741620321659402, + "flos": 21112027355520.0, + "grad_norm": 1.7288185495326422, + "language_loss": 0.78622723, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.86301088, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11108398, + "step": 11213, + "time_per_iteration": 2.553414821624756 + }, + { + "auxiliary_loss_clip": 0.06413242, + "auxiliary_loss_mlp": 0.01267804, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01256789, + "epoch": 0.6742221554186082, + "flos": 20746653626880.0, + "grad_norm": 1.7499991393106977, + "language_loss": 0.6779902, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.75480068, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11214, + "time_per_iteration": 2.4924774169921875 + }, + { + "auxiliary_loss_clip": 0.06411138, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06271094, + "balance_loss_mlp": 0.01256907, + "epoch": 0.6742822786712761, + "flos": 37782366756480.0, + "grad_norm": 1.5348832786859372, + "language_loss": 0.73044717, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.8072269, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.0993042, + "step": 11215, + "time_per_iteration": 2.6919710636138916 + }, + { + "auxiliary_loss_clip": 0.06411563, + "auxiliary_loss_mlp": 0.01266913, + "balance_loss_clip": 0.06272543, + "balance_loss_mlp": 0.0125653, + "epoch": 0.6743424019239441, + "flos": 37272118118400.0, + "grad_norm": 1.6783781241391482, + "language_loss": 0.66716719, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.74395192, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1038208, + "step": 11216, + "time_per_iteration": 2.6457085609436035 + }, + { + "auxiliary_loss_clip": 0.06304459, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06248666, + "balance_loss_mlp": 0.012498, + "epoch": 0.674402525176612, + "flos": 66020152377600.0, + "grad_norm": 0.6583920548662452, + "language_loss": 0.56272531, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.63828307, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01516724, + "step": 11217, + "time_per_iteration": 3.2267727851867676 + }, + { + "auxiliary_loss_clip": 0.064037, + "auxiliary_loss_mlp": 0.01266203, + "balance_loss_clip": 0.06268451, + "balance_loss_mlp": 0.01255939, + "epoch": 0.67446264842928, + "flos": 26467143396480.0, + "grad_norm": 1.8797709757007424, + "language_loss": 0.74946856, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.82616764, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1026001, + "step": 11218, + "time_per_iteration": 2.5534565448760986 + }, + { + "auxiliary_loss_clip": 0.06412031, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01257189, + "epoch": 0.674522771681948, + "flos": 23739146017920.0, + "grad_norm": 1.571619211134611, + "language_loss": 0.6640991, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.74090284, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 11219, + "time_per_iteration": 2.5408942699432373 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_clip": 0.062702, + "balance_loss_mlp": 0.01257935, + "epoch": 0.674582894934616, + "flos": 24761320375680.0, + "grad_norm": 1.6133708722293332, + "language_loss": 0.75378865, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.83057231, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11090088, + "step": 11220, + "time_per_iteration": 2.556192398071289 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265502, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01254988, + "epoch": 0.6746430181872839, + "flos": 24833506268160.0, + "grad_norm": 1.5601512803843804, + "language_loss": 0.70583248, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.78254962, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10522461, + "step": 11221, + "time_per_iteration": 2.538742780685425 + }, + { + "auxiliary_loss_clip": 0.06408031, + "auxiliary_loss_mlp": 0.01264539, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01255032, + "epoch": 0.6747031414399519, + "flos": 16879167774720.0, + "grad_norm": 2.089456373953198, + "language_loss": 0.58824384, + "learning_rate": 1.010925256180498e-06, + "loss": 0.66496956, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09509277, + "step": 11222, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.06411393, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.0627331, + "balance_loss_mlp": 0.01255, + "epoch": 0.6747632646926198, + "flos": 22791715102080.0, + "grad_norm": 1.7403006489773343, + "language_loss": 0.76732111, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.84409571, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11065674, + "step": 11223, + "time_per_iteration": 2.499220132827759 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06270097, + "balance_loss_mlp": 0.01257854, + "epoch": 0.6748233879452878, + "flos": 20052020079360.0, + "grad_norm": 1.8418495567149014, + "language_loss": 0.75473273, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.83148926, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10064697, + "step": 11224, + "time_per_iteration": 2.5515925884246826 + }, + { + "auxiliary_loss_clip": 0.06404493, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06271124, + "balance_loss_mlp": 0.01254289, + "epoch": 0.6748835111979558, + "flos": 23009488663680.0, + "grad_norm": 1.6780430249692133, + "language_loss": 0.63333517, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.7100122, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0892334, + "step": 11225, + "time_per_iteration": 2.5058155059814453 + }, + { + "auxiliary_loss_clip": 0.0639993, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.012541, + "epoch": 0.6749436344506238, + "flos": 12201201705600.0, + "grad_norm": 1.7347966506914976, + "language_loss": 0.64211845, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.71875006, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09130859, + "step": 11226, + "time_per_iteration": 2.5148916244506836 + }, + { + "auxiliary_loss_clip": 0.06412213, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.06273121, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6750037577032918, + "flos": 11878356723840.0, + "grad_norm": 2.584638628864584, + "language_loss": 0.72339863, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.80018932, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10595703, + "step": 11227, + "time_per_iteration": 2.4601356983184814 + }, + { + "auxiliary_loss_clip": 0.06406709, + "auxiliary_loss_mlp": 0.01263943, + "balance_loss_clip": 0.06272034, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6750638809559597, + "flos": 17025342422400.0, + "grad_norm": 2.4759856374415077, + "language_loss": 0.7107985, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.78750503, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09851074, + "step": 11228, + "time_per_iteration": 3.974013566970825 + }, + { + "auxiliary_loss_clip": 0.0630679, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 0.06250891, + "balance_loss_mlp": 0.01260476, + "epoch": 0.6751240042086277, + "flos": 70972774531200.0, + "grad_norm": 0.7443387383646383, + "language_loss": 0.52992356, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.60561574, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01947021, + "step": 11229, + "time_per_iteration": 3.1949167251586914 + }, + { + "auxiliary_loss_clip": 0.06405008, + "auxiliary_loss_mlp": 0.01265887, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01256249, + "epoch": 0.6751841274612956, + "flos": 22681863999360.0, + "grad_norm": 2.9468842422151673, + "language_loss": 0.80432749, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.88103646, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09625244, + "step": 11230, + "time_per_iteration": 2.5213663578033447 + }, + { + "auxiliary_loss_clip": 0.06402741, + "auxiliary_loss_mlp": 0.01262658, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.0125333, + "epoch": 0.6752442507139637, + "flos": 21295112526720.0, + "grad_norm": 1.434197979050497, + "language_loss": 0.65974534, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.73639941, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09326172, + "step": 11231, + "time_per_iteration": 2.512449026107788 + }, + { + "auxiliary_loss_clip": 0.06417508, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01260396, + "epoch": 0.6753043739666316, + "flos": 28264480675200.0, + "grad_norm": 1.8511033060394846, + "language_loss": 0.66944438, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.7463361, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11260986, + "step": 11232, + "time_per_iteration": 2.5738155841827393 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01266971, + "balance_loss_clip": 0.0627114, + "balance_loss_mlp": 0.01257226, + "epoch": 0.6753644972192996, + "flos": 21366627586560.0, + "grad_norm": 1.674017645319507, + "language_loss": 0.72178799, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.79850119, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09735107, + "step": 11233, + "time_per_iteration": 2.5327250957489014 + }, + { + "auxiliary_loss_clip": 0.0640566, + "auxiliary_loss_mlp": 0.01263187, + "balance_loss_clip": 0.06269811, + "balance_loss_mlp": 0.01253579, + "epoch": 0.6754246204719675, + "flos": 26549224070400.0, + "grad_norm": 1.499022886883579, + "language_loss": 0.7716381, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.84832656, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09606934, + "step": 11234, + "time_per_iteration": 2.607923746109009 + }, + { + "auxiliary_loss_clip": 0.0640721, + "auxiliary_loss_mlp": 0.01266453, + "balance_loss_clip": 0.06272233, + "balance_loss_mlp": 0.01255224, + "epoch": 0.6754847437246355, + "flos": 25563750600960.0, + "grad_norm": 1.4543561341667586, + "language_loss": 0.75457549, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.83131212, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11230469, + "step": 11235, + "time_per_iteration": 2.614145278930664 + }, + { + "auxiliary_loss_clip": 0.06304054, + "auxiliary_loss_mlp": 0.01255487, + "balance_loss_clip": 0.06248432, + "balance_loss_mlp": 0.01253944, + "epoch": 0.6755448669773034, + "flos": 59530216492800.0, + "grad_norm": 0.7576799363115112, + "language_loss": 0.51220065, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.58779609, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.55712891, + "router_z_loss_mlp": 0.01538849, + "step": 11236, + "time_per_iteration": 3.079153060913086 + }, + { + "auxiliary_loss_clip": 0.06406215, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253931, + "epoch": 0.6756049902299714, + "flos": 23301209053440.0, + "grad_norm": 1.9064890293106858, + "language_loss": 0.75501907, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.83173215, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11181641, + "step": 11237, + "time_per_iteration": 2.591219186782837 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06271937, + "balance_loss_mlp": 0.01254441, + "epoch": 0.6756651134826394, + "flos": 31583256065280.0, + "grad_norm": 1.6435273747755843, + "language_loss": 0.77603805, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.85276806, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10876465, + "step": 11238, + "time_per_iteration": 4.004278659820557 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01255834, + "epoch": 0.6757252367353074, + "flos": 27279761892480.0, + "grad_norm": 1.8597789781280543, + "language_loss": 0.66815203, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.74492747, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10510254, + "step": 11239, + "time_per_iteration": 2.5872182846069336 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.01262458, + "balance_loss_clip": 0.06269912, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6757853599879754, + "flos": 16835548924800.0, + "grad_norm": 2.5961823999819074, + "language_loss": 0.8317802, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.90842378, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09533691, + "step": 11240, + "time_per_iteration": 2.4803500175476074 + }, + { + "auxiliary_loss_clip": 0.0641778, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06275319, + "balance_loss_mlp": 0.0125677, + "epoch": 0.6758454832406433, + "flos": 23226465611520.0, + "grad_norm": 1.9848396876019143, + "language_loss": 0.7422142, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.8190825, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12268066, + "step": 11241, + "time_per_iteration": 2.526111602783203 + }, + { + "auxiliary_loss_clip": 0.06407639, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01253833, + "epoch": 0.6759056064933113, + "flos": 16295098089600.0, + "grad_norm": 2.0527933437331343, + "language_loss": 0.80294073, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.87965673, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11242, + "time_per_iteration": 3.933396339416504 + }, + { + "auxiliary_loss_clip": 0.06405968, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06268989, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6759657297459792, + "flos": 25929543600000.0, + "grad_norm": 1.6744190932532899, + "language_loss": 0.72630656, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.80299854, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09490967, + "step": 11243, + "time_per_iteration": 2.514404535293579 + }, + { + "auxiliary_loss_clip": 0.06407295, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06272102, + "balance_loss_mlp": 0.01255306, + "epoch": 0.6760258529986473, + "flos": 23007140749440.0, + "grad_norm": 1.5647847453275578, + "language_loss": 0.72900802, + "learning_rate": 1.003487287162221e-06, + "loss": 0.80573308, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09906006, + "step": 11244, + "time_per_iteration": 2.5581138134002686 + }, + { + "auxiliary_loss_clip": 0.06405992, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06269385, + "balance_loss_mlp": 0.01255887, + "epoch": 0.6760859762513152, + "flos": 20965601145600.0, + "grad_norm": 4.977975302469332, + "language_loss": 0.85911322, + "learning_rate": 1.003149631190393e-06, + "loss": 0.93583632, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10437012, + "step": 11245, + "time_per_iteration": 2.485227584838867 + }, + { + "auxiliary_loss_clip": 0.06410875, + "auxiliary_loss_mlp": 0.01265401, + "balance_loss_clip": 0.06269195, + "balance_loss_mlp": 0.01254743, + "epoch": 0.6761460995039832, + "flos": 23629672258560.0, + "grad_norm": 1.7215460318487352, + "language_loss": 0.74000847, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.81677115, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10656738, + "step": 11246, + "time_per_iteration": 3.958766460418701 + }, + { + "auxiliary_loss_clip": 0.06405219, + "auxiliary_loss_mlp": 0.01262106, + "balance_loss_clip": 0.0626854, + "balance_loss_mlp": 0.01251896, + "epoch": 0.6762062227566511, + "flos": 20776101137280.0, + "grad_norm": 1.7168055925724897, + "language_loss": 0.87943971, + "learning_rate": 1.002474432661539e-06, + "loss": 0.95611298, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10205078, + "step": 11247, + "time_per_iteration": 2.586812973022461 + }, + { + "auxiliary_loss_clip": 0.06307312, + "auxiliary_loss_mlp": 0.01250807, + "balance_loss_clip": 0.06251501, + "balance_loss_mlp": 0.01249509, + "epoch": 0.6762663460093191, + "flos": 52836915219840.0, + "grad_norm": 0.8036403587512043, + "language_loss": 0.53957772, + "learning_rate": 1.002136890130115e-06, + "loss": 0.61515892, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01298523, + "step": 11248, + "time_per_iteration": 3.125509262084961 + }, + { + "auxiliary_loss_clip": 0.06402693, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06271251, + "balance_loss_mlp": 0.0125671, + "epoch": 0.676326469261987, + "flos": 23703115962240.0, + "grad_norm": 1.8151620805455404, + "language_loss": 0.73989308, + "learning_rate": 1.001799385437761e-06, + "loss": 0.81658345, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09625244, + "step": 11249, + "time_per_iteration": 2.6366310119628906 + }, + { + "auxiliary_loss_clip": 0.06411433, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270382, + "balance_loss_mlp": 0.01253372, + "epoch": 0.676386592514655, + "flos": 14068880087040.0, + "grad_norm": 2.152895610647936, + "language_loss": 0.74230921, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.81907284, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11566162, + "step": 11250, + "time_per_iteration": 2.458453416824341 + }, + { + "auxiliary_loss_clip": 0.06409556, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01256082, + "epoch": 0.676446715767323, + "flos": 20418441984000.0, + "grad_norm": 1.8697083640776453, + "language_loss": 0.74947959, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.82623816, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10217285, + "step": 11251, + "time_per_iteration": 2.568087100982666 + }, + { + "auxiliary_loss_clip": 0.06411379, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06275384, + "balance_loss_mlp": 0.012553, + "epoch": 0.676506839019991, + "flos": 21294651329280.0, + "grad_norm": 1.5310605534253319, + "language_loss": 0.69863832, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.77541435, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.109375, + "step": 11252, + "time_per_iteration": 2.541651725769043 + }, + { + "auxiliary_loss_clip": 0.06405863, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01254406, + "epoch": 0.676566962272659, + "flos": 29939849936640.0, + "grad_norm": 2.258609602750375, + "language_loss": 0.67108035, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.74778473, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10174561, + "step": 11253, + "time_per_iteration": 2.6143195629119873 + }, + { + "auxiliary_loss_clip": 0.06413913, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273795, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6766270855253269, + "flos": 17936994844800.0, + "grad_norm": 1.5309002898419535, + "language_loss": 0.77274752, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.84955955, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11560059, + "step": 11254, + "time_per_iteration": 2.4911346435546875 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01255439, + "epoch": 0.6766872087779949, + "flos": 23110283525760.0, + "grad_norm": 2.0449563599790874, + "language_loss": 0.71835911, + "learning_rate": 9.997751526206835e-07, + "loss": 0.79514015, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11340332, + "step": 11255, + "time_per_iteration": 2.5604913234710693 + }, + { + "auxiliary_loss_clip": 0.0641115, + "auxiliary_loss_mlp": 0.0126876, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01257376, + "epoch": 0.6767473320306628, + "flos": 26220257740800.0, + "grad_norm": 1.9457423412026578, + "language_loss": 0.75806832, + "learning_rate": 9.994379131600828e-07, + "loss": 0.83486742, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11383057, + "step": 11256, + "time_per_iteration": 2.5321764945983887 + }, + { + "auxiliary_loss_clip": 0.06411014, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06275011, + "balance_loss_mlp": 0.01255192, + "epoch": 0.6768074552833309, + "flos": 18374554465920.0, + "grad_norm": 2.012218384442974, + "language_loss": 0.65943599, + "learning_rate": 9.991007116408965e-07, + "loss": 0.73620474, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10662842, + "step": 11257, + "time_per_iteration": 2.502154588699341 + }, + { + "auxiliary_loss_clip": 0.06409346, + "auxiliary_loss_mlp": 0.01265352, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01255159, + "epoch": 0.6768675785359988, + "flos": 23046692676480.0, + "grad_norm": 1.399276257571999, + "language_loss": 0.75707698, + "learning_rate": 9.987635480759109e-07, + "loss": 0.83382392, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10186768, + "step": 11258, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01264608, + "balance_loss_clip": 0.06270992, + "balance_loss_mlp": 0.01254696, + "epoch": 0.6769277017886668, + "flos": 33044876760960.0, + "grad_norm": 1.5373580485699971, + "language_loss": 0.66955268, + "learning_rate": 9.984264224779127e-07, + "loss": 0.74622083, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 11259, + "time_per_iteration": 2.59914231300354 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6769878250413347, + "flos": 20854408377600.0, + "grad_norm": 2.0822099065238397, + "language_loss": 0.85664153, + "learning_rate": 9.980893348596839e-07, + "loss": 0.93340379, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10778809, + "step": 11260, + "time_per_iteration": 2.470489501953125 + }, + { + "auxiliary_loss_clip": 0.06415793, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06273471, + "balance_loss_mlp": 0.01253453, + "epoch": 0.6770479482940027, + "flos": 15601345009920.0, + "grad_norm": 2.2691636202149206, + "language_loss": 0.77703118, + "learning_rate": 9.977522852340081e-07, + "loss": 0.85384524, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12164307, + "step": 11261, + "time_per_iteration": 2.5071561336517334 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256013, + "epoch": 0.6771080715466706, + "flos": 18626345585280.0, + "grad_norm": 1.5719770677718063, + "language_loss": 0.87847519, + "learning_rate": 9.97415273613666e-07, + "loss": 0.95525038, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1126709, + "step": 11262, + "time_per_iteration": 2.4645345211029053 + }, + { + "auxiliary_loss_clip": 0.06413369, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06273858, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6771681947993387, + "flos": 12500427035520.0, + "grad_norm": 1.7525589115394145, + "language_loss": 0.74310911, + "learning_rate": 9.97078300011439e-07, + "loss": 0.81989402, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10754395, + "step": 11263, + "time_per_iteration": 2.6041438579559326 + }, + { + "auxiliary_loss_clip": 0.06415032, + "auxiliary_loss_mlp": 0.01264304, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01252406, + "epoch": 0.6772283180520066, + "flos": 22243549691520.0, + "grad_norm": 2.1938876589125544, + "language_loss": 0.68432045, + "learning_rate": 9.967413644401016e-07, + "loss": 0.76111376, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11901855, + "step": 11264, + "time_per_iteration": 2.5002152919769287 + }, + { + "auxiliary_loss_clip": 0.0641073, + "auxiliary_loss_mlp": 0.01264807, + "balance_loss_clip": 0.062745, + "balance_loss_mlp": 0.01254006, + "epoch": 0.6772884413046746, + "flos": 16148588025600.0, + "grad_norm": 1.8587455254700258, + "language_loss": 0.73335183, + "learning_rate": 9.964044669124324e-07, + "loss": 0.81010723, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10797119, + "step": 11265, + "time_per_iteration": 2.469163179397583 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01269883, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01258969, + "epoch": 0.6773485645573426, + "flos": 19141835103360.0, + "grad_norm": 1.6254501454395083, + "language_loss": 0.61922127, + "learning_rate": 9.96067607441207e-07, + "loss": 0.69599104, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10913086, + "step": 11266, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01258829, + "epoch": 0.6774086878100105, + "flos": 14142114155520.0, + "grad_norm": 1.8179552610473837, + "language_loss": 0.70953995, + "learning_rate": 9.957307860391976e-07, + "loss": 0.78632545, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.1038208, + "step": 11267, + "time_per_iteration": 2.517019033432007 + }, + { + "auxiliary_loss_clip": 0.06410597, + "auxiliary_loss_mlp": 0.01264315, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01254009, + "epoch": 0.6774688110626785, + "flos": 22203075369600.0, + "grad_norm": 4.7399438404850525, + "language_loss": 0.71134216, + "learning_rate": 9.953940027191785e-07, + "loss": 0.7880913, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10314941, + "step": 11268, + "time_per_iteration": 3.937225103378296 + }, + { + "auxiliary_loss_clip": 0.06412301, + "auxiliary_loss_mlp": 0.01268549, + "balance_loss_clip": 0.06274435, + "balance_loss_mlp": 0.0125726, + "epoch": 0.6775289343153464, + "flos": 23046734603520.0, + "grad_norm": 1.4295252958840357, + "language_loss": 0.76893616, + "learning_rate": 9.950572574939194e-07, + "loss": 0.84574473, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11291504, + "step": 11269, + "time_per_iteration": 2.5114824771881104 + }, + { + "auxiliary_loss_clip": 0.06414156, + "auxiliary_loss_mlp": 0.01271853, + "balance_loss_clip": 0.06274021, + "balance_loss_mlp": 0.01259879, + "epoch": 0.6775890575680145, + "flos": 18298930556160.0, + "grad_norm": 1.7033288836702745, + "language_loss": 0.74101746, + "learning_rate": 9.94720550376189e-07, + "loss": 0.81787759, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11968994, + "step": 11270, + "time_per_iteration": 2.4997193813323975 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06274433, + "balance_loss_mlp": 0.01254504, + "epoch": 0.6776491808206824, + "flos": 25343251781760.0, + "grad_norm": 1.5419173604084193, + "language_loss": 0.72974074, + "learning_rate": 9.94383881378756e-07, + "loss": 0.80650789, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10668945, + "step": 11271, + "time_per_iteration": 2.5310120582580566 + }, + { + "auxiliary_loss_clip": 0.06411068, + "auxiliary_loss_mlp": 0.01265404, + "balance_loss_clip": 0.06274058, + "balance_loss_mlp": 0.01254902, + "epoch": 0.6777093040733504, + "flos": 26034908509440.0, + "grad_norm": 1.6287619781350626, + "language_loss": 0.6787045, + "learning_rate": 9.94047250514387e-07, + "loss": 0.75546926, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10498047, + "step": 11272, + "time_per_iteration": 2.556326389312744 + }, + { + "auxiliary_loss_clip": 0.06416756, + "auxiliary_loss_mlp": 0.01268859, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01256723, + "epoch": 0.6777694273260183, + "flos": 18009306518400.0, + "grad_norm": 2.0957855047238865, + "language_loss": 0.73988581, + "learning_rate": 9.937106577958481e-07, + "loss": 0.81674194, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.121521, + "step": 11273, + "time_per_iteration": 2.4888038635253906 + }, + { + "auxiliary_loss_clip": 0.0640964, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06273794, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6778295505786863, + "flos": 23447886825600.0, + "grad_norm": 1.597740332843532, + "language_loss": 0.70512903, + "learning_rate": 9.933741032359015e-07, + "loss": 0.78189635, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11022949, + "step": 11274, + "time_per_iteration": 2.5328569412231445 + }, + { + "auxiliary_loss_clip": 0.06408958, + "auxiliary_loss_mlp": 0.01270481, + "balance_loss_clip": 0.06268886, + "balance_loss_mlp": 0.01259413, + "epoch": 0.6778896738313542, + "flos": 19104337601280.0, + "grad_norm": 1.549823334564571, + "language_loss": 0.65894532, + "learning_rate": 9.930375868473093e-07, + "loss": 0.73573971, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1105957, + "step": 11275, + "time_per_iteration": 2.511591672897339 + }, + { + "auxiliary_loss_clip": 0.06410493, + "auxiliary_loss_mlp": 0.01266749, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.01256801, + "epoch": 0.6779497970840223, + "flos": 26111077470720.0, + "grad_norm": 1.6541358125051857, + "language_loss": 0.72680271, + "learning_rate": 9.927011086428335e-07, + "loss": 0.80357516, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.0994873, + "step": 11276, + "time_per_iteration": 2.5891473293304443 + }, + { + "auxiliary_loss_clip": 0.06409149, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273319, + "balance_loss_mlp": 0.01255245, + "epoch": 0.6780099203366902, + "flos": 19725359736960.0, + "grad_norm": 1.5650058182326292, + "language_loss": 0.76883596, + "learning_rate": 9.923646686352317e-07, + "loss": 0.84558642, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10650635, + "step": 11277, + "time_per_iteration": 3.915508985519409 + }, + { + "auxiliary_loss_clip": 0.06416161, + "auxiliary_loss_mlp": 0.01266536, + "balance_loss_clip": 0.06275125, + "balance_loss_mlp": 0.01254633, + "epoch": 0.6780700435893582, + "flos": 18218946234240.0, + "grad_norm": 2.711703251949157, + "language_loss": 0.83725727, + "learning_rate": 9.920282668372627e-07, + "loss": 0.91408426, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11907959, + "step": 11278, + "time_per_iteration": 2.4728851318359375 + }, + { + "auxiliary_loss_clip": 0.06408397, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01259862, + "epoch": 0.6781301668420262, + "flos": 25383600322560.0, + "grad_norm": 1.4808013348463376, + "language_loss": 0.70247126, + "learning_rate": 9.916919032616844e-07, + "loss": 0.77925724, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10339355, + "step": 11279, + "time_per_iteration": 2.5876686573028564 + }, + { + "auxiliary_loss_clip": 0.06411046, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01254027, + "epoch": 0.6781902900946941, + "flos": 24026589849600.0, + "grad_norm": 1.7835400791989957, + "language_loss": 0.74185818, + "learning_rate": 9.913555779212485e-07, + "loss": 0.81862831, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1194458, + "step": 11280, + "time_per_iteration": 2.558945655822754 + }, + { + "auxiliary_loss_clip": 0.06412832, + "auxiliary_loss_mlp": 0.01263795, + "balance_loss_clip": 0.06270506, + "balance_loss_mlp": 0.01251844, + "epoch": 0.6782504133473621, + "flos": 19652964209280.0, + "grad_norm": 1.818075538813212, + "language_loss": 0.70597506, + "learning_rate": 9.910192908287104e-07, + "loss": 0.78274131, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11950684, + "step": 11281, + "time_per_iteration": 2.5192151069641113 + }, + { + "auxiliary_loss_clip": 0.06408101, + "auxiliary_loss_mlp": 0.01268091, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01257821, + "epoch": 0.67831053660003, + "flos": 24939080812800.0, + "grad_norm": 1.5294707212527767, + "language_loss": 0.63880533, + "learning_rate": 9.906830419968217e-07, + "loss": 0.71556723, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1026001, + "step": 11282, + "time_per_iteration": 4.0389556884765625 + }, + { + "auxiliary_loss_clip": 0.06416775, + "auxiliary_loss_mlp": 0.01269152, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.01257434, + "epoch": 0.6783706598526981, + "flos": 31215785984640.0, + "grad_norm": 1.5661846366283017, + "language_loss": 0.74472761, + "learning_rate": 9.90346831438334e-07, + "loss": 0.82158691, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11712646, + "step": 11283, + "time_per_iteration": 2.5889575481414795 + }, + { + "auxiliary_loss_clip": 0.06409109, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06271229, + "balance_loss_mlp": 0.01255179, + "epoch": 0.678430783105366, + "flos": 35449526033280.0, + "grad_norm": 1.6303319808688523, + "language_loss": 0.57121617, + "learning_rate": 9.900106591659948e-07, + "loss": 0.64796078, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10180664, + "step": 11284, + "time_per_iteration": 2.622241258621216 + }, + { + "auxiliary_loss_clip": 0.0640896, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01253719, + "epoch": 0.678490906358034, + "flos": 14434044180480.0, + "grad_norm": 1.7585312003136033, + "language_loss": 0.75540352, + "learning_rate": 9.896745251925535e-07, + "loss": 0.83213127, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10095215, + "step": 11285, + "time_per_iteration": 3.914513111114502 + }, + { + "auxiliary_loss_clip": 0.06408092, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06274541, + "balance_loss_mlp": 0.01254355, + "epoch": 0.6785510296107019, + "flos": 24317262063360.0, + "grad_norm": 1.6087593577428982, + "language_loss": 0.66518104, + "learning_rate": 9.893384295307557e-07, + "loss": 0.74190903, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10351562, + "step": 11286, + "time_per_iteration": 2.5443532466888428 + }, + { + "auxiliary_loss_clip": 0.06411726, + "auxiliary_loss_mlp": 0.01266212, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6786111528633699, + "flos": 26984142288000.0, + "grad_norm": 2.2563712255718453, + "language_loss": 0.52888298, + "learning_rate": 9.890023721933447e-07, + "loss": 0.60566235, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11663818, + "step": 11287, + "time_per_iteration": 2.5215566158294678 + }, + { + "auxiliary_loss_clip": 0.06408818, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01255265, + "epoch": 0.6786712761160378, + "flos": 24324641222400.0, + "grad_norm": 1.4827043233914352, + "language_loss": 0.7744714, + "learning_rate": 9.886663531930655e-07, + "loss": 0.85121405, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10180664, + "step": 11288, + "time_per_iteration": 2.5451719760894775 + }, + { + "auxiliary_loss_clip": 0.06414543, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.0125993, + "epoch": 0.6787313993687059, + "flos": 22937176990080.0, + "grad_norm": 1.9021636809125866, + "language_loss": 0.73458755, + "learning_rate": 9.883303725426593e-07, + "loss": 0.81144106, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10882568, + "step": 11289, + "time_per_iteration": 2.524062395095825 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.01257795, + "epoch": 0.6787915226213738, + "flos": 26875423215360.0, + "grad_norm": 1.3961935649800772, + "language_loss": 0.80240023, + "learning_rate": 9.879944302548682e-07, + "loss": 0.87918484, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10852051, + "step": 11290, + "time_per_iteration": 2.563781499862671 + }, + { + "auxiliary_loss_clip": 0.06406706, + "auxiliary_loss_mlp": 0.01270194, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01260395, + "epoch": 0.6788516458740418, + "flos": 20014648358400.0, + "grad_norm": 1.3943952846011585, + "language_loss": 0.75320244, + "learning_rate": 9.87658526342428e-07, + "loss": 0.82997143, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09802246, + "step": 11291, + "time_per_iteration": 2.4833710193634033 + }, + { + "auxiliary_loss_clip": 0.06409583, + "auxiliary_loss_mlp": 0.01265199, + "balance_loss_clip": 0.06270351, + "balance_loss_mlp": 0.01254709, + "epoch": 0.6789117691267098, + "flos": 28734045356160.0, + "grad_norm": 1.6032413484745063, + "language_loss": 0.75235522, + "learning_rate": 9.873226608180785e-07, + "loss": 0.82910305, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10491943, + "step": 11292, + "time_per_iteration": 2.5987610816955566 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.01271571, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01261235, + "epoch": 0.6789718923793777, + "flos": 23410053907200.0, + "grad_norm": 1.8128590339737811, + "language_loss": 0.84362906, + "learning_rate": 9.869868336945556e-07, + "loss": 0.92041528, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10345459, + "step": 11293, + "time_per_iteration": 2.6490092277526855 + }, + { + "auxiliary_loss_clip": 0.06418362, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06273804, + "balance_loss_mlp": 0.01255661, + "epoch": 0.6790320156320457, + "flos": 20455100945280.0, + "grad_norm": 2.3830710729233937, + "language_loss": 0.79575551, + "learning_rate": 9.866510449845929e-07, + "loss": 0.87260431, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10852051, + "step": 11294, + "time_per_iteration": 2.540187120437622 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6790921388847136, + "flos": 24173519184000.0, + "grad_norm": 1.663290513792591, + "language_loss": 0.79323423, + "learning_rate": 9.86315294700924e-07, + "loss": 0.87000465, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09985352, + "step": 11295, + "time_per_iteration": 2.539522171020508 + }, + { + "auxiliary_loss_clip": 0.06403016, + "auxiliary_loss_mlp": 0.01270622, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01261312, + "epoch": 0.6791522621373817, + "flos": 21914541434880.0, + "grad_norm": 1.9398184157871654, + "language_loss": 0.71742594, + "learning_rate": 9.859795828562823e-07, + "loss": 0.79416239, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09313965, + "step": 11296, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.06406362, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6792123853900496, + "flos": 24833380487040.0, + "grad_norm": 1.7008493408846614, + "language_loss": 0.70970011, + "learning_rate": 9.856439094633949e-07, + "loss": 0.78642553, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 11297, + "time_per_iteration": 2.5342774391174316 + }, + { + "auxiliary_loss_clip": 0.06413988, + "auxiliary_loss_mlp": 0.01268754, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01257691, + "epoch": 0.6792725086427176, + "flos": 17571998459520.0, + "grad_norm": 2.072165205112126, + "language_loss": 0.66610634, + "learning_rate": 9.853082745349918e-07, + "loss": 0.74293375, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11071777, + "step": 11298, + "time_per_iteration": 2.5330231189727783 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01265536, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6793326318953855, + "flos": 26948908846080.0, + "grad_norm": 1.6501656577542423, + "language_loss": 0.71810848, + "learning_rate": 9.84972678083801e-07, + "loss": 0.79485255, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09771729, + "step": 11299, + "time_per_iteration": 2.547666072845459 + }, + { + "auxiliary_loss_clip": 0.06407908, + "auxiliary_loss_mlp": 0.01269253, + "balance_loss_clip": 0.06269622, + "balance_loss_mlp": 0.01258196, + "epoch": 0.6793927551480535, + "flos": 24325479763200.0, + "grad_norm": 1.2577197776351332, + "language_loss": 0.77542967, + "learning_rate": 9.846371201225488e-07, + "loss": 0.85220122, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1105957, + "step": 11300, + "time_per_iteration": 2.568537473678589 + }, + { + "auxiliary_loss_clip": 0.06409447, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01256847, + "epoch": 0.6794528784007214, + "flos": 11441300227200.0, + "grad_norm": 1.9915071500414414, + "language_loss": 0.63348699, + "learning_rate": 9.843016006639577e-07, + "loss": 0.71025515, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10534668, + "step": 11301, + "time_per_iteration": 2.4696924686431885 + }, + { + "auxiliary_loss_clip": 0.06409229, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01256772, + "epoch": 0.6795130016533895, + "flos": 25236922550400.0, + "grad_norm": 1.7173390721705748, + "language_loss": 0.82948458, + "learning_rate": 9.839661197207525e-07, + "loss": 0.90624553, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10107422, + "step": 11302, + "time_per_iteration": 2.598444938659668 + }, + { + "auxiliary_loss_clip": 0.0641208, + "auxiliary_loss_mlp": 0.01264081, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01254121, + "epoch": 0.6795731249060574, + "flos": 18302326646400.0, + "grad_norm": 1.7779256028698032, + "language_loss": 0.69851995, + "learning_rate": 9.83630677305654e-07, + "loss": 0.77528167, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.09954834, + "step": 11303, + "time_per_iteration": 2.4852330684661865 + }, + { + "auxiliary_loss_clip": 0.06413473, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.0125336, + "epoch": 0.6796332481587254, + "flos": 20306159112960.0, + "grad_norm": 1.8204218049780263, + "language_loss": 0.70597726, + "learning_rate": 9.832952734313813e-07, + "loss": 0.7827546, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10900879, + "step": 11304, + "time_per_iteration": 2.5139074325561523 + }, + { + "auxiliary_loss_clip": 0.0641301, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257794, + "epoch": 0.6796933714113934, + "flos": 23593642202880.0, + "grad_norm": 2.4376362863510046, + "language_loss": 0.72319949, + "learning_rate": 9.829599081106536e-07, + "loss": 0.80001682, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.109375, + "step": 11305, + "time_per_iteration": 2.522174119949341 + }, + { + "auxiliary_loss_clip": 0.06407507, + "auxiliary_loss_mlp": 0.01264269, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01252986, + "epoch": 0.6797534946640613, + "flos": 27126291939840.0, + "grad_norm": 2.8826024363137535, + "language_loss": 0.66289663, + "learning_rate": 9.826245813561882e-07, + "loss": 0.73961437, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11297607, + "step": 11306, + "time_per_iteration": 2.5523674488067627 + }, + { + "auxiliary_loss_clip": 0.06408583, + "auxiliary_loss_mlp": 0.01265584, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01255547, + "epoch": 0.6798136179167293, + "flos": 22133992078080.0, + "grad_norm": 1.614397517334369, + "language_loss": 0.80464542, + "learning_rate": 9.822892931807021e-07, + "loss": 0.88138705, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 11307, + "time_per_iteration": 3.9510881900787354 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6798737411693972, + "flos": 17493565438080.0, + "grad_norm": 1.503954365849396, + "language_loss": 0.89141631, + "learning_rate": 9.819540435969066e-07, + "loss": 0.96809489, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10638428, + "step": 11308, + "time_per_iteration": 2.454899549484253 + }, + { + "auxiliary_loss_clip": 0.06406927, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06268145, + "balance_loss_mlp": 0.01253792, + "epoch": 0.6799338644220653, + "flos": 22898715166080.0, + "grad_norm": 1.9892982746856287, + "language_loss": 0.71669519, + "learning_rate": 9.816188326175154e-07, + "loss": 0.79341042, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1081543, + "step": 11309, + "time_per_iteration": 2.537949562072754 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01269522, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01259312, + "epoch": 0.6799939876747332, + "flos": 23186284778880.0, + "grad_norm": 2.168983976078807, + "language_loss": 0.84444106, + "learning_rate": 9.812836602552411e-07, + "loss": 0.92120677, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10217285, + "step": 11310, + "time_per_iteration": 2.5093727111816406 + }, + { + "auxiliary_loss_clip": 0.06401814, + "auxiliary_loss_mlp": 0.01262918, + "balance_loss_clip": 0.06269856, + "balance_loss_mlp": 0.0125331, + "epoch": 0.6800541109274012, + "flos": 19505951020800.0, + "grad_norm": 1.936116503903549, + "language_loss": 0.83367699, + "learning_rate": 9.80948526522792e-07, + "loss": 0.91032434, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09613037, + "step": 11311, + "time_per_iteration": 2.5046095848083496 + }, + { + "auxiliary_loss_clip": 0.064105, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06267536, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6801142341800691, + "flos": 22284946408320.0, + "grad_norm": 1.5408548920294685, + "language_loss": 0.7658841, + "learning_rate": 9.806134314328767e-07, + "loss": 0.84268516, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.12133789, + "step": 11312, + "time_per_iteration": 2.5174195766448975 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01252687, + "balance_loss_clip": 0.06255079, + "balance_loss_mlp": 0.01251411, + "epoch": 0.6801743574327371, + "flos": 68734439614080.0, + "grad_norm": 0.6438614608961274, + "language_loss": 0.57270527, + "learning_rate": 9.802783749982038e-07, + "loss": 0.64834106, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01276398, + "step": 11313, + "time_per_iteration": 3.2520179748535156 + }, + { + "auxiliary_loss_clip": 0.06408104, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06268254, + "balance_loss_mlp": 0.0125483, + "epoch": 0.680234480685405, + "flos": 29468146976640.0, + "grad_norm": 1.6190653949052565, + "language_loss": 0.69341791, + "learning_rate": 9.799433572314754e-07, + "loss": 0.77015042, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10327148, + "step": 11314, + "time_per_iteration": 2.5535359382629395 + }, + { + "auxiliary_loss_clip": 0.06404889, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06268796, + "balance_loss_mlp": 0.01257731, + "epoch": 0.6802946039380731, + "flos": 15921045463680.0, + "grad_norm": 1.9728888269672866, + "language_loss": 0.81508797, + "learning_rate": 9.796083781453972e-07, + "loss": 0.89181113, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.0970459, + "step": 11315, + "time_per_iteration": 2.5169835090637207 + }, + { + "auxiliary_loss_clip": 0.06405143, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06267972, + "balance_loss_mlp": 0.01253723, + "epoch": 0.680354727190741, + "flos": 22025314932480.0, + "grad_norm": 1.6675934827220065, + "language_loss": 0.70277983, + "learning_rate": 9.792734377526718e-07, + "loss": 0.77947348, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.1048584, + "step": 11316, + "time_per_iteration": 2.4984679222106934 + }, + { + "auxiliary_loss_clip": 0.06405444, + "auxiliary_loss_mlp": 0.01268676, + "balance_loss_clip": 0.06269848, + "balance_loss_mlp": 0.01258478, + "epoch": 0.680414850443409, + "flos": 18447285409920.0, + "grad_norm": 2.1628292849287267, + "language_loss": 0.67277592, + "learning_rate": 9.789385360660003e-07, + "loss": 0.74951708, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11317, + "time_per_iteration": 3.912996292114258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01266936, + "balance_loss_clip": 0.06273043, + "balance_loss_mlp": 0.01256666, + "epoch": 0.680474973696077, + "flos": 26365677701760.0, + "grad_norm": 1.4339432029892007, + "language_loss": 0.74834979, + "learning_rate": 9.78603673098082e-07, + "loss": 0.82514405, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10266113, + "step": 11318, + "time_per_iteration": 2.613416910171509 + }, + { + "auxiliary_loss_clip": 0.06405453, + "auxiliary_loss_mlp": 0.01261508, + "balance_loss_clip": 0.06270547, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6805350969487449, + "flos": 18339069461760.0, + "grad_norm": 1.741381394136802, + "language_loss": 0.6821155, + "learning_rate": 9.782688488616143e-07, + "loss": 0.75878513, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09008789, + "step": 11319, + "time_per_iteration": 2.4735772609710693 + }, + { + "auxiliary_loss_clip": 0.06402999, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06269106, + "balance_loss_mlp": 0.01257354, + "epoch": 0.6805952202014129, + "flos": 19943552568960.0, + "grad_norm": 1.589394100312008, + "language_loss": 0.77030569, + "learning_rate": 9.779340633692945e-07, + "loss": 0.84700847, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09924316, + "step": 11320, + "time_per_iteration": 2.5447402000427246 + }, + { + "auxiliary_loss_clip": 0.06406876, + "auxiliary_loss_mlp": 0.01264766, + "balance_loss_clip": 0.06270229, + "balance_loss_mlp": 0.01254341, + "epoch": 0.6806553434540809, + "flos": 25230633494400.0, + "grad_norm": 1.8063346564210203, + "language_loss": 0.75357598, + "learning_rate": 9.77599316633817e-07, + "loss": 0.8302924, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 11321, + "time_per_iteration": 3.959946393966675 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264729, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6807154667067489, + "flos": 17791407175680.0, + "grad_norm": 2.0443838016403495, + "language_loss": 0.73213184, + "learning_rate": 9.772646086678758e-07, + "loss": 0.80887616, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10461426, + "step": 11322, + "time_per_iteration": 2.508143663406372 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270386, + "balance_loss_mlp": 0.01253517, + "epoch": 0.6807755899594168, + "flos": 22206387605760.0, + "grad_norm": 1.7755779600619086, + "language_loss": 0.78547817, + "learning_rate": 9.769299394841638e-07, + "loss": 0.86222905, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11352539, + "step": 11323, + "time_per_iteration": 2.5345656871795654 + }, + { + "auxiliary_loss_clip": 0.06315179, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_clip": 0.06259721, + "balance_loss_mlp": 0.0125015, + "epoch": 0.6808357132120848, + "flos": 68648878995840.0, + "grad_norm": 0.7384546914137473, + "language_loss": 0.57113785, + "learning_rate": 9.765953090953714e-07, + "loss": 0.64680356, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0124054, + "step": 11324, + "time_per_iteration": 2.9890177249908447 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01254301, + "epoch": 0.6808958364647527, + "flos": 23850380712960.0, + "grad_norm": 1.8768737712077719, + "language_loss": 0.68368208, + "learning_rate": 9.76260717514186e-07, + "loss": 0.76043886, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11193848, + "step": 11325, + "time_per_iteration": 4.024105072021484 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.0126769, + "balance_loss_clip": 0.06269176, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6809559597174207, + "flos": 17717376493440.0, + "grad_norm": 2.1078464153023924, + "language_loss": 0.70419264, + "learning_rate": 9.759261647532974e-07, + "loss": 0.78097641, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10986328, + "step": 11326, + "time_per_iteration": 2.484449625015259 + }, + { + "auxiliary_loss_clip": 0.06407395, + "auxiliary_loss_mlp": 0.01261696, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01251551, + "epoch": 0.6810160829700886, + "flos": 22498443411840.0, + "grad_norm": 1.638017241748174, + "language_loss": 0.72914612, + "learning_rate": 9.75591650825392e-07, + "loss": 0.80583698, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10150146, + "step": 11327, + "time_per_iteration": 2.502293586730957 + }, + { + "auxiliary_loss_clip": 0.06405802, + "auxiliary_loss_mlp": 0.01266544, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01255839, + "epoch": 0.6810762062227567, + "flos": 16837854912000.0, + "grad_norm": 1.827919270381089, + "language_loss": 0.77294552, + "learning_rate": 9.752571757431526e-07, + "loss": 0.84966898, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10705566, + "step": 11328, + "time_per_iteration": 2.469923734664917 + }, + { + "auxiliary_loss_clip": 0.06412201, + "auxiliary_loss_mlp": 0.01264628, + "balance_loss_clip": 0.0627179, + "balance_loss_mlp": 0.01253941, + "epoch": 0.6811363294754246, + "flos": 12719751897600.0, + "grad_norm": 1.8250307958699987, + "language_loss": 0.64754045, + "learning_rate": 9.74922739519265e-07, + "loss": 0.72430873, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10681152, + "step": 11329, + "time_per_iteration": 2.5292539596557617 + }, + { + "auxiliary_loss_clip": 0.06409349, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01254182, + "epoch": 0.6811964527280926, + "flos": 17717669982720.0, + "grad_norm": 1.8641198647355242, + "language_loss": 0.79316872, + "learning_rate": 9.745883421664096e-07, + "loss": 0.86991036, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10638428, + "step": 11330, + "time_per_iteration": 2.4813790321350098 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01264709, + "balance_loss_clip": 0.06272174, + "balance_loss_mlp": 0.0125376, + "epoch": 0.6812565759807605, + "flos": 24870416791680.0, + "grad_norm": 2.109092836267495, + "language_loss": 0.64502859, + "learning_rate": 9.742539836972665e-07, + "loss": 0.72177964, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10943604, + "step": 11331, + "time_per_iteration": 2.6124520301818848 + }, + { + "auxiliary_loss_clip": 0.06407228, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06270872, + "balance_loss_mlp": 0.01254666, + "epoch": 0.6813166992334285, + "flos": 17171852486400.0, + "grad_norm": 1.5406157015161637, + "language_loss": 0.72821605, + "learning_rate": 9.739196641245148e-07, + "loss": 0.80493855, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.1036377, + "step": 11332, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.06412952, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6813768224860965, + "flos": 18849527735040.0, + "grad_norm": 2.149720533461842, + "language_loss": 0.74508882, + "learning_rate": 9.735853834608326e-07, + "loss": 0.82189173, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10638428, + "step": 11333, + "time_per_iteration": 2.5427186489105225 + }, + { + "auxiliary_loss_clip": 0.06414136, + "auxiliary_loss_mlp": 0.01267127, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01256786, + "epoch": 0.6814369457387645, + "flos": 24539228328960.0, + "grad_norm": 1.3823548887580743, + "language_loss": 0.72367668, + "learning_rate": 9.732511417188963e-07, + "loss": 0.80048931, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10345459, + "step": 11334, + "time_per_iteration": 2.537958860397339 + }, + { + "auxiliary_loss_clip": 0.06405447, + "auxiliary_loss_mlp": 0.01266429, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.0125607, + "epoch": 0.6814970689914325, + "flos": 18228799088640.0, + "grad_norm": 1.6460074116702026, + "language_loss": 0.86505604, + "learning_rate": 9.729169389113791e-07, + "loss": 0.94177485, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10357666, + "step": 11335, + "time_per_iteration": 2.5018861293792725 + }, + { + "auxiliary_loss_clip": 0.06401964, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06271435, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6815571922441004, + "flos": 25235874374400.0, + "grad_norm": 1.6438782420335836, + "language_loss": 0.81760287, + "learning_rate": 9.725827750509542e-07, + "loss": 0.89428031, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10125732, + "step": 11336, + "time_per_iteration": 2.5359947681427 + }, + { + "auxiliary_loss_clip": 0.06403621, + "auxiliary_loss_mlp": 0.01268492, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.0125818, + "epoch": 0.6816173154967684, + "flos": 19460864724480.0, + "grad_norm": 1.9165693219649298, + "language_loss": 0.82064402, + "learning_rate": 9.72248650150294e-07, + "loss": 0.89736515, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10321045, + "step": 11337, + "time_per_iteration": 2.511289119720459 + }, + { + "auxiliary_loss_clip": 0.06404516, + "auxiliary_loss_mlp": 0.01264446, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01254462, + "epoch": 0.6816774387494363, + "flos": 17937288334080.0, + "grad_norm": 1.560533910826156, + "language_loss": 0.73002589, + "learning_rate": 9.719145642220673e-07, + "loss": 0.80671549, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09979248, + "step": 11338, + "time_per_iteration": 2.511681318283081 + }, + { + "auxiliary_loss_clip": 0.06413732, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06275684, + "balance_loss_mlp": 0.01254337, + "epoch": 0.6817375620021043, + "flos": 22238937717120.0, + "grad_norm": 1.4240412111564371, + "language_loss": 0.77416432, + "learning_rate": 9.715805172789435e-07, + "loss": 0.8509506, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10559082, + "step": 11339, + "time_per_iteration": 2.5428354740142822 + }, + { + "auxiliary_loss_clip": 0.06410687, + "auxiliary_loss_mlp": 0.01264953, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.012542, + "epoch": 0.6817976852547722, + "flos": 25381462043520.0, + "grad_norm": 1.7944902461652392, + "language_loss": 0.71041632, + "learning_rate": 9.712465093335901e-07, + "loss": 0.78717273, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10748291, + "step": 11340, + "time_per_iteration": 2.550901412963867 + }, + { + "auxiliary_loss_clip": 0.06413396, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01256725, + "epoch": 0.6818578085074403, + "flos": 22271068558080.0, + "grad_norm": 2.180704981107058, + "language_loss": 0.84409666, + "learning_rate": 9.709125403986722e-07, + "loss": 0.92090219, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10437012, + "step": 11341, + "time_per_iteration": 2.5165159702301025 + }, + { + "auxiliary_loss_clip": 0.06414375, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06275092, + "balance_loss_mlp": 0.01255831, + "epoch": 0.6819179317601082, + "flos": 19324249441920.0, + "grad_norm": 1.5598647366733476, + "language_loss": 0.68810844, + "learning_rate": 9.705786104868531e-07, + "loss": 0.76491725, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10681152, + "step": 11342, + "time_per_iteration": 2.593763589859009 + }, + { + "auxiliary_loss_clip": 0.06407441, + "auxiliary_loss_mlp": 0.01261474, + "balance_loss_clip": 0.0627171, + "balance_loss_mlp": 0.01251342, + "epoch": 0.6819780550127762, + "flos": 21110224492800.0, + "grad_norm": 1.6656061272859015, + "language_loss": 0.74818993, + "learning_rate": 9.702447196107963e-07, + "loss": 0.82487905, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11343, + "time_per_iteration": 2.524341344833374 + }, + { + "auxiliary_loss_clip": 0.06415273, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277119, + "balance_loss_mlp": 0.01256055, + "epoch": 0.6820381782654441, + "flos": 29724214654080.0, + "grad_norm": 1.6102730777044594, + "language_loss": 0.80077457, + "learning_rate": 9.699108677831639e-07, + "loss": 0.87759268, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1048584, + "step": 11344, + "time_per_iteration": 2.559631586074829 + }, + { + "auxiliary_loss_clip": 0.06412022, + "auxiliary_loss_mlp": 0.01263183, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01252747, + "epoch": 0.6820983015181121, + "flos": 29249870290560.0, + "grad_norm": 1.8689488071291331, + "language_loss": 0.66530693, + "learning_rate": 9.695770550166136e-07, + "loss": 0.74205899, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10424805, + "step": 11345, + "time_per_iteration": 2.588878870010376 + }, + { + "auxiliary_loss_clip": 0.06416089, + "auxiliary_loss_mlp": 0.01264993, + "balance_loss_clip": 0.06275414, + "balance_loss_mlp": 0.01254538, + "epoch": 0.6821584247707801, + "flos": 18876375768960.0, + "grad_norm": 2.261790357681116, + "language_loss": 0.65540516, + "learning_rate": 9.692432813238054e-07, + "loss": 0.732216, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10461426, + "step": 11346, + "time_per_iteration": 2.4776885509490967 + }, + { + "auxiliary_loss_clip": 0.06415972, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6822185480234481, + "flos": 21330974874240.0, + "grad_norm": 1.434084459819624, + "language_loss": 0.7886349, + "learning_rate": 9.689095467173952e-07, + "loss": 0.86543655, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10632324, + "step": 11347, + "time_per_iteration": 3.919304132461548 + }, + { + "auxiliary_loss_clip": 0.06316185, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_clip": 0.06260848, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6822786712761161, + "flos": 63505540949760.0, + "grad_norm": 0.7177694724545725, + "language_loss": 0.52512419, + "learning_rate": 9.685758512100378e-07, + "loss": 0.60084116, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01203918, + "step": 11348, + "time_per_iteration": 3.14101505279541 + }, + { + "auxiliary_loss_clip": 0.06413009, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01255209, + "epoch": 0.682338794528784, + "flos": 21075242613120.0, + "grad_norm": 1.7094709865372797, + "language_loss": 0.79881036, + "learning_rate": 9.682421948143873e-07, + "loss": 0.87558699, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09448242, + "step": 11349, + "time_per_iteration": 2.497866630554199 + }, + { + "auxiliary_loss_clip": 0.06425133, + "auxiliary_loss_mlp": 0.01267838, + "balance_loss_clip": 0.06278804, + "balance_loss_mlp": 0.01255595, + "epoch": 0.682398917781452, + "flos": 36292053237120.0, + "grad_norm": 1.5698213232216975, + "language_loss": 0.7393533, + "learning_rate": 9.67908577543096e-07, + "loss": 0.81628305, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12243652, + "step": 11350, + "time_per_iteration": 2.62261700630188 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01258094, + "epoch": 0.6824590410341199, + "flos": 24865427473920.0, + "grad_norm": 1.5591585279724258, + "language_loss": 0.79965377, + "learning_rate": 9.675749994088161e-07, + "loss": 0.87644625, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09832764, + "step": 11351, + "time_per_iteration": 2.528369665145874 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.0125292, + "epoch": 0.6825191642867879, + "flos": 22458430287360.0, + "grad_norm": 1.5623570195172147, + "language_loss": 0.73523104, + "learning_rate": 9.672414604241954e-07, + "loss": 0.81194711, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09661865, + "step": 11352, + "time_per_iteration": 2.522172451019287 + }, + { + "auxiliary_loss_clip": 0.06413847, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6825792875394558, + "flos": 29432116920960.0, + "grad_norm": 1.626079801889606, + "language_loss": 0.804649, + "learning_rate": 9.669079606018814e-07, + "loss": 0.88144076, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11639404, + "step": 11353, + "time_per_iteration": 2.5686585903167725 + }, + { + "auxiliary_loss_clip": 0.06413363, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06276349, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6826394107921239, + "flos": 18777006426240.0, + "grad_norm": 1.604562568600035, + "language_loss": 0.78506744, + "learning_rate": 9.665744999545218e-07, + "loss": 0.86185712, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10864258, + "step": 11354, + "time_per_iteration": 2.5204999446868896 + }, + { + "auxiliary_loss_clip": 0.06408085, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06272091, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6826995340447918, + "flos": 16623142024320.0, + "grad_norm": 2.019321118646576, + "language_loss": 0.62111843, + "learning_rate": 9.662410784947599e-07, + "loss": 0.69783312, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09814453, + "step": 11355, + "time_per_iteration": 2.4766104221343994 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01263892, + "balance_loss_clip": 0.0627443, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6827596572974598, + "flos": 20854282596480.0, + "grad_norm": 1.7897850919384148, + "language_loss": 0.82221437, + "learning_rate": 9.659076962352398e-07, + "loss": 0.89897794, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09771729, + "step": 11356, + "time_per_iteration": 3.9204885959625244 + }, + { + "auxiliary_loss_clip": 0.06415853, + "auxiliary_loss_mlp": 0.01263188, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252561, + "epoch": 0.6828197805501277, + "flos": 22754804578560.0, + "grad_norm": 1.6532324250211312, + "language_loss": 0.78508228, + "learning_rate": 9.655743531886052e-07, + "loss": 0.86187267, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10626221, + "step": 11357, + "time_per_iteration": 2.5153608322143555 + }, + { + "auxiliary_loss_clip": 0.06314074, + "auxiliary_loss_mlp": 0.01254778, + "balance_loss_clip": 0.06258625, + "balance_loss_mlp": 0.01253596, + "epoch": 0.6828799038027957, + "flos": 71668833598080.0, + "grad_norm": 0.7966113468619515, + "language_loss": 0.59682757, + "learning_rate": 9.65241049367493e-07, + "loss": 0.67251611, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01180267, + "step": 11358, + "time_per_iteration": 3.1846532821655273 + }, + { + "auxiliary_loss_clip": 0.06419402, + "auxiliary_loss_mlp": 0.01269456, + "balance_loss_clip": 0.06276588, + "balance_loss_mlp": 0.01257648, + "epoch": 0.6829400270554637, + "flos": 19835378547840.0, + "grad_norm": 1.7044245093067194, + "language_loss": 0.78866333, + "learning_rate": 9.64907784784544e-07, + "loss": 0.86555189, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11816406, + "step": 11359, + "time_per_iteration": 2.5490803718566895 + }, + { + "auxiliary_loss_clip": 0.064127, + "auxiliary_loss_mlp": 0.01264331, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6830001503081317, + "flos": 21987020816640.0, + "grad_norm": 2.0193369174380664, + "language_loss": 0.82223153, + "learning_rate": 9.645745594523958e-07, + "loss": 0.89900184, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.105896, + "step": 11360, + "time_per_iteration": 3.9807236194610596 + }, + { + "auxiliary_loss_clip": 0.0641343, + "auxiliary_loss_mlp": 0.01265293, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01254677, + "epoch": 0.6830602735607997, + "flos": 24323718827520.0, + "grad_norm": 1.651921957497636, + "language_loss": 0.75011313, + "learning_rate": 9.642413733836844e-07, + "loss": 0.82690036, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1060791, + "step": 11361, + "time_per_iteration": 2.535749673843384 + }, + { + "auxiliary_loss_clip": 0.06309322, + "auxiliary_loss_mlp": 0.01254085, + "balance_loss_clip": 0.06253715, + "balance_loss_mlp": 0.01252928, + "epoch": 0.6831203968134676, + "flos": 57706827793920.0, + "grad_norm": 0.8409522652001101, + "language_loss": 0.595146, + "learning_rate": 9.639082265910437e-07, + "loss": 0.67078006, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01154327, + "step": 11362, + "time_per_iteration": 3.249852180480957 + }, + { + "auxiliary_loss_clip": 0.06412338, + "auxiliary_loss_mlp": 0.0126686, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01255792, + "epoch": 0.6831805200661356, + "flos": 14393024807040.0, + "grad_norm": 2.0585212828502004, + "language_loss": 0.76010299, + "learning_rate": 9.635751190871074e-07, + "loss": 0.83689499, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11077881, + "step": 11363, + "time_per_iteration": 2.5203006267547607 + }, + { + "auxiliary_loss_clip": 0.06410082, + "auxiliary_loss_mlp": 0.01264688, + "balance_loss_clip": 0.06273843, + "balance_loss_mlp": 0.01253828, + "epoch": 0.6832406433188035, + "flos": 22826906616960.0, + "grad_norm": 2.358731005347766, + "language_loss": 0.89481944, + "learning_rate": 9.632420508845063e-07, + "loss": 0.97156709, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10870361, + "step": 11364, + "time_per_iteration": 2.5663001537323 + }, + { + "auxiliary_loss_clip": 0.06405666, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 0.06269991, + "balance_loss_mlp": 0.01255721, + "epoch": 0.6833007665714715, + "flos": 17566673725440.0, + "grad_norm": 1.8217270673941708, + "language_loss": 0.88218802, + "learning_rate": 9.629090219958697e-07, + "loss": 0.95890021, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09838867, + "step": 11365, + "time_per_iteration": 3.9711902141571045 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01261222, + "epoch": 0.6833608898241395, + "flos": 22450883420160.0, + "grad_norm": 1.95679459658848, + "language_loss": 0.81100428, + "learning_rate": 9.625760324338272e-07, + "loss": 0.88795125, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11230469, + "step": 11366, + "time_per_iteration": 2.496051788330078 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.01263817, + "balance_loss_clip": 0.06271282, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6834210130768075, + "flos": 24541450462080.0, + "grad_norm": 1.3668234382616995, + "language_loss": 0.76664793, + "learning_rate": 9.622430822110062e-07, + "loss": 0.84339321, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.1038208, + "step": 11367, + "time_per_iteration": 2.597698450088501 + }, + { + "auxiliary_loss_clip": 0.06411598, + "auxiliary_loss_mlp": 0.01263902, + "balance_loss_clip": 0.06272662, + "balance_loss_mlp": 0.0125312, + "epoch": 0.6834811363294754, + "flos": 20053235963520.0, + "grad_norm": 1.5010742143698117, + "language_loss": 0.69233596, + "learning_rate": 9.619101713400312e-07, + "loss": 0.76909101, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10791016, + "step": 11368, + "time_per_iteration": 2.520679473876953 + }, + { + "auxiliary_loss_clip": 0.06409574, + "auxiliary_loss_mlp": 0.01266367, + "balance_loss_clip": 0.06272889, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6835412595821434, + "flos": 24797727774720.0, + "grad_norm": 1.604090291521746, + "language_loss": 0.73295021, + "learning_rate": 9.615772998335261e-07, + "loss": 0.80970967, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1083374, + "step": 11369, + "time_per_iteration": 2.5773866176605225 + }, + { + "auxiliary_loss_clip": 0.06409427, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06271335, + "balance_loss_mlp": 0.01254067, + "epoch": 0.6836013828348113, + "flos": 19506454145280.0, + "grad_norm": 1.9399454003386187, + "language_loss": 0.79163188, + "learning_rate": 9.612444677041138e-07, + "loss": 0.86836743, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 11370, + "time_per_iteration": 2.4922618865966797 + }, + { + "auxiliary_loss_clip": 0.06306867, + "auxiliary_loss_mlp": 0.01250813, + "balance_loss_clip": 0.06251401, + "balance_loss_mlp": 0.0124961, + "epoch": 0.6836615060874793, + "flos": 58383753402240.0, + "grad_norm": 0.8179842252969125, + "language_loss": 0.59746689, + "learning_rate": 9.609116749644162e-07, + "loss": 0.67304367, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0120163, + "step": 11371, + "time_per_iteration": 3.0478594303131104 + }, + { + "auxiliary_loss_clip": 0.06402698, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01254175, + "epoch": 0.6837216293401474, + "flos": 12171796122240.0, + "grad_norm": 1.5508500684767301, + "language_loss": 0.63639355, + "learning_rate": 9.605789216270511e-07, + "loss": 0.71305984, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09759521, + "step": 11372, + "time_per_iteration": 2.4811301231384277 + }, + { + "auxiliary_loss_clip": 0.06408484, + "auxiliary_loss_mlp": 0.01265592, + "balance_loss_clip": 0.06272547, + "balance_loss_mlp": 0.01255137, + "epoch": 0.6837817525928153, + "flos": 22134159786240.0, + "grad_norm": 1.4333850518313196, + "language_loss": 0.71846133, + "learning_rate": 9.602462077046375e-07, + "loss": 0.79520208, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10461426, + "step": 11373, + "time_per_iteration": 2.5287580490112305 + }, + { + "auxiliary_loss_clip": 0.06305692, + "auxiliary_loss_mlp": 0.01251081, + "balance_loss_clip": 0.06250165, + "balance_loss_mlp": 0.01249923, + "epoch": 0.6838418758454833, + "flos": 65027048186880.0, + "grad_norm": 1.1033743133145881, + "language_loss": 0.56752723, + "learning_rate": 9.599135332097935e-07, + "loss": 0.6430949, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01155853, + "step": 11374, + "time_per_iteration": 3.302116632461548 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257895, + "epoch": 0.6839019990981512, + "flos": 21036864643200.0, + "grad_norm": 1.4837774857580213, + "language_loss": 0.7423023, + "learning_rate": 9.595808981551312e-07, + "loss": 0.81910115, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11001587, + "step": 11375, + "time_per_iteration": 2.5274906158447266 + }, + { + "auxiliary_loss_clip": 0.06406655, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01257684, + "epoch": 0.6839621223508192, + "flos": 24942351121920.0, + "grad_norm": 1.6223536594822023, + "language_loss": 0.7043916, + "learning_rate": 9.592483025532651e-07, + "loss": 0.78113139, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09637451, + "step": 11376, + "time_per_iteration": 2.5494120121002197 + }, + { + "auxiliary_loss_clip": 0.06412984, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01253161, + "epoch": 0.6840222456034871, + "flos": 26365929264000.0, + "grad_norm": 1.7833627654713686, + "language_loss": 0.74259639, + "learning_rate": 9.58915746416808e-07, + "loss": 0.81936419, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10632324, + "step": 11377, + "time_per_iteration": 2.5434489250183105 + }, + { + "auxiliary_loss_clip": 0.06309253, + "auxiliary_loss_mlp": 0.01251187, + "balance_loss_clip": 0.06253564, + "balance_loss_mlp": 0.01249992, + "epoch": 0.6840823688561551, + "flos": 66009167493120.0, + "grad_norm": 0.7064811243320783, + "language_loss": 0.56814432, + "learning_rate": 9.585832297583707e-07, + "loss": 0.64374876, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01193237, + "step": 11378, + "time_per_iteration": 3.2616686820983887 + }, + { + "auxiliary_loss_clip": 0.06409612, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01254764, + "epoch": 0.684142492108823, + "flos": 21403999307520.0, + "grad_norm": 1.6132418851945567, + "language_loss": 0.78663373, + "learning_rate": 9.58250752590561e-07, + "loss": 0.86338598, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10858154, + "step": 11379, + "time_per_iteration": 2.53483247756958 + }, + { + "auxiliary_loss_clip": 0.06401949, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01254976, + "epoch": 0.6842026153614911, + "flos": 18806453936640.0, + "grad_norm": 2.5056443246249, + "language_loss": 0.68875623, + "learning_rate": 9.57918314925988e-07, + "loss": 0.76541233, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08685303, + "step": 11380, + "time_per_iteration": 2.5189809799194336 + }, + { + "auxiliary_loss_clip": 0.06407002, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01254678, + "epoch": 0.684262738614159, + "flos": 19652544938880.0, + "grad_norm": 1.774794382077768, + "language_loss": 0.78619421, + "learning_rate": 9.575859167772568e-07, + "loss": 0.8629148, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1038208, + "step": 11381, + "time_per_iteration": 2.5038013458251953 + }, + { + "auxiliary_loss_clip": 0.0631157, + "auxiliary_loss_mlp": 0.01250817, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01249629, + "epoch": 0.684322861866827, + "flos": 62371041793920.0, + "grad_norm": 0.8443750872588546, + "language_loss": 0.67272472, + "learning_rate": 9.572535581569713e-07, + "loss": 0.74834859, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01186371, + "step": 11382, + "time_per_iteration": 3.022620677947998 + }, + { + "auxiliary_loss_clip": 0.06309118, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06253339, + "balance_loss_mlp": 0.01251537, + "epoch": 0.6843829851194949, + "flos": 65825704978560.0, + "grad_norm": 0.8346748203160914, + "language_loss": 0.58115959, + "learning_rate": 9.569212390777356e-07, + "loss": 0.65677845, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01231384, + "step": 11383, + "time_per_iteration": 3.205733060836792 + }, + { + "auxiliary_loss_clip": 0.06403822, + "auxiliary_loss_mlp": 0.01263656, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01253697, + "epoch": 0.6844431083721629, + "flos": 27862573766400.0, + "grad_norm": 1.743965936300629, + "language_loss": 0.79892695, + "learning_rate": 9.565889595521517e-07, + "loss": 0.87560171, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09960938, + "step": 11384, + "time_per_iteration": 2.576397657394409 + }, + { + "auxiliary_loss_clip": 0.0641057, + "auxiliary_loss_mlp": 0.01264349, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01253459, + "epoch": 0.684503231624831, + "flos": 18260091388800.0, + "grad_norm": 1.8125132078887, + "language_loss": 0.77559322, + "learning_rate": 9.562567195928187e-07, + "loss": 0.85234237, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10894775, + "step": 11385, + "time_per_iteration": 2.5222182273864746 + }, + { + "auxiliary_loss_clip": 0.06418984, + "auxiliary_loss_mlp": 0.01266461, + "balance_loss_clip": 0.0627387, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6845633548774989, + "flos": 17645484090240.0, + "grad_norm": 2.2044599558463105, + "language_loss": 0.84624577, + "learning_rate": 9.55924519212335e-07, + "loss": 0.92310023, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12072754, + "step": 11386, + "time_per_iteration": 3.9474587440490723 + }, + { + "auxiliary_loss_clip": 0.06409421, + "auxiliary_loss_mlp": 0.01262563, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6846234781301669, + "flos": 20812843952640.0, + "grad_norm": 1.925558647056537, + "language_loss": 0.83398205, + "learning_rate": 9.555923584232984e-07, + "loss": 0.91070187, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09680176, + "step": 11387, + "time_per_iteration": 2.5117714405059814 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01254033, + "epoch": 0.6846836013828348, + "flos": 36110016241920.0, + "grad_norm": 1.588804983998274, + "language_loss": 0.72422922, + "learning_rate": 9.552602372383047e-07, + "loss": 0.80092275, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09692383, + "step": 11388, + "time_per_iteration": 2.669675588607788 + }, + { + "auxiliary_loss_clip": 0.0640699, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06272318, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6847437246355028, + "flos": 43152408823680.0, + "grad_norm": 2.116517308354933, + "language_loss": 0.63188899, + "learning_rate": 9.549281556699469e-07, + "loss": 0.70863551, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09460449, + "step": 11389, + "time_per_iteration": 2.775179862976074 + }, + { + "auxiliary_loss_clip": 0.06304318, + "auxiliary_loss_mlp": 0.01252682, + "balance_loss_clip": 0.06248381, + "balance_loss_mlp": 0.01251546, + "epoch": 0.6848038478881707, + "flos": 71682768103680.0, + "grad_norm": 0.7038129025924749, + "language_loss": 0.55774271, + "learning_rate": 9.54596113730818e-07, + "loss": 0.63331264, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01138306, + "step": 11390, + "time_per_iteration": 3.2121734619140625 + }, + { + "auxiliary_loss_clip": 0.06409647, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01255997, + "epoch": 0.6848639711408387, + "flos": 19943929912320.0, + "grad_norm": 1.8977282247890388, + "language_loss": 0.87613106, + "learning_rate": 9.542641114335109e-07, + "loss": 0.95289165, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10424805, + "step": 11391, + "time_per_iteration": 2.500140428543091 + }, + { + "auxiliary_loss_clip": 0.06412797, + "auxiliary_loss_mlp": 0.01263893, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253343, + "epoch": 0.6849240943935067, + "flos": 26874333112320.0, + "grad_norm": 1.48935328965904, + "language_loss": 0.79339015, + "learning_rate": 9.539321487906117e-07, + "loss": 0.870157, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10552979, + "step": 11392, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06403191, + "auxiliary_loss_mlp": 0.01264788, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6849842176461747, + "flos": 13740458808960.0, + "grad_norm": 2.0081405471627884, + "language_loss": 0.71175981, + "learning_rate": 9.536002258147104e-07, + "loss": 0.78843963, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10040283, + "step": 11393, + "time_per_iteration": 2.5271036624908447 + }, + { + "auxiliary_loss_clip": 0.06415832, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01255, + "epoch": 0.6850443408988426, + "flos": 24980058259200.0, + "grad_norm": 1.5317798757580128, + "language_loss": 0.64661515, + "learning_rate": 9.532683425183936e-07, + "loss": 0.72342944, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10595703, + "step": 11394, + "time_per_iteration": 2.53812313079834 + }, + { + "auxiliary_loss_clip": 0.06411145, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6851044641515106, + "flos": 27751380998400.0, + "grad_norm": 1.5645262580549901, + "language_loss": 0.80918968, + "learning_rate": 9.529364989142468e-07, + "loss": 0.88594604, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10101318, + "step": 11395, + "time_per_iteration": 2.550346851348877 + }, + { + "auxiliary_loss_clip": 0.06410371, + "auxiliary_loss_mlp": 0.01268735, + "balance_loss_clip": 0.06274814, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6851645874041785, + "flos": 24357652531200.0, + "grad_norm": 1.7469268170163024, + "language_loss": 0.72832096, + "learning_rate": 9.526046950148527e-07, + "loss": 0.80511206, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10595703, + "step": 11396, + "time_per_iteration": 3.9635422229766846 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01265588, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01255056, + "epoch": 0.6852247106568465, + "flos": 15081914350080.0, + "grad_norm": 2.3772034852800643, + "language_loss": 0.79818743, + "learning_rate": 9.522729308327931e-07, + "loss": 0.87494791, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10528564, + "step": 11397, + "time_per_iteration": 2.481863260269165 + }, + { + "auxiliary_loss_clip": 0.06411494, + "auxiliary_loss_mlp": 0.01267109, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01256828, + "epoch": 0.6852848339095146, + "flos": 18775874396160.0, + "grad_norm": 1.839103323810105, + "language_loss": 0.71941662, + "learning_rate": 9.519412063806493e-07, + "loss": 0.7962026, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10284424, + "step": 11398, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06403108, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06270117, + "balance_loss_mlp": 0.0125632, + "epoch": 0.6853449571621825, + "flos": 27861651371520.0, + "grad_norm": 1.5188649145265738, + "language_loss": 0.71170795, + "learning_rate": 9.516095216709996e-07, + "loss": 0.78839701, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0947876, + "step": 11399, + "time_per_iteration": 3.972925901412964 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06273123, + "balance_loss_mlp": 0.01259119, + "epoch": 0.6854050804148505, + "flos": 18156403560960.0, + "grad_norm": 1.6092651373600877, + "language_loss": 0.70567757, + "learning_rate": 9.512778767164217e-07, + "loss": 0.78248316, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10217285, + "step": 11400, + "time_per_iteration": 2.474824905395508 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01255163, + "epoch": 0.6854652036675184, + "flos": 16331798977920.0, + "grad_norm": 1.9177955333528751, + "language_loss": 0.77889669, + "learning_rate": 9.509462715294927e-07, + "loss": 0.85583317, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12463379, + "step": 11401, + "time_per_iteration": 2.5186407566070557 + }, + { + "auxiliary_loss_clip": 0.06405222, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.01256537, + "epoch": 0.6855253269201864, + "flos": 14946347243520.0, + "grad_norm": 2.060399475016654, + "language_loss": 0.75462782, + "learning_rate": 9.50614706122786e-07, + "loss": 0.83134115, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0958252, + "step": 11402, + "time_per_iteration": 2.461958885192871 + }, + { + "auxiliary_loss_clip": 0.06414859, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06273296, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6855854501728543, + "flos": 23044135127040.0, + "grad_norm": 1.4779944862214063, + "language_loss": 0.73165995, + "learning_rate": 9.502831805088742e-07, + "loss": 0.80847782, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11444092, + "step": 11403, + "time_per_iteration": 2.5588088035583496 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264829, + "balance_loss_clip": 0.06272316, + "balance_loss_mlp": 0.0125522, + "epoch": 0.6856455734255223, + "flos": 13257393621120.0, + "grad_norm": 3.459862281853561, + "language_loss": 0.81727648, + "learning_rate": 9.499516947003294e-07, + "loss": 0.89400232, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09613037, + "step": 11404, + "time_per_iteration": 3.899538993835449 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06274688, + "balance_loss_mlp": 0.01259381, + "epoch": 0.6857056966781903, + "flos": 23340551345280.0, + "grad_norm": 1.3350169784860642, + "language_loss": 0.7794162, + "learning_rate": 9.496202487097222e-07, + "loss": 0.8561843, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 11405, + "time_per_iteration": 2.618781089782715 + }, + { + "auxiliary_loss_clip": 0.06313835, + "auxiliary_loss_mlp": 0.01251022, + "balance_loss_clip": 0.06257869, + "balance_loss_mlp": 0.01250013, + "epoch": 0.6857658199308583, + "flos": 61870646010240.0, + "grad_norm": 0.7926132752302004, + "language_loss": 0.60793728, + "learning_rate": 9.492888425496199e-07, + "loss": 0.68358588, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01009369, + "step": 11406, + "time_per_iteration": 3.192826986312866 + }, + { + "auxiliary_loss_clip": 0.06409362, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253826, + "epoch": 0.6858259431835262, + "flos": 16660178328960.0, + "grad_norm": 1.6678552032285212, + "language_loss": 0.77383244, + "learning_rate": 9.489574762325907e-07, + "loss": 0.85056722, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10296631, + "step": 11407, + "time_per_iteration": 2.5133752822875977 + }, + { + "auxiliary_loss_clip": 0.06408191, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.0626992, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6858860664361942, + "flos": 21879643409280.0, + "grad_norm": 2.893760051958565, + "language_loss": 0.71341193, + "learning_rate": 9.486261497711991e-07, + "loss": 0.79014993, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11053467, + "step": 11408, + "time_per_iteration": 2.5356616973876953 + }, + { + "auxiliary_loss_clip": 0.06413727, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.0125514, + "epoch": 0.6859461896888621, + "flos": 15272965658880.0, + "grad_norm": 1.731957908279727, + "language_loss": 0.70413965, + "learning_rate": 9.482948631780087e-07, + "loss": 0.78093535, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 11409, + "time_per_iteration": 2.52020525932312 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01263971, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01254733, + "epoch": 0.6860063129415301, + "flos": 18625507044480.0, + "grad_norm": 1.590904402895803, + "language_loss": 0.78129441, + "learning_rate": 9.479636164655825e-07, + "loss": 0.85794687, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09240723, + "step": 11410, + "time_per_iteration": 2.546893358230591 + }, + { + "auxiliary_loss_clip": 0.06412078, + "auxiliary_loss_mlp": 0.01266884, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6860664361941982, + "flos": 23958177390720.0, + "grad_norm": 1.8721880718662787, + "language_loss": 0.7200377, + "learning_rate": 9.476324096464821e-07, + "loss": 0.79682732, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1081543, + "step": 11411, + "time_per_iteration": 2.532982349395752 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01268743, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01258551, + "epoch": 0.6861265594468661, + "flos": 20413243031040.0, + "grad_norm": 1.9740044070304406, + "language_loss": 0.70534211, + "learning_rate": 9.473012427332654e-07, + "loss": 0.78214926, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10192871, + "step": 11412, + "time_per_iteration": 2.5798745155334473 + }, + { + "auxiliary_loss_clip": 0.06410308, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01256324, + "epoch": 0.6861866826995341, + "flos": 11431908570240.0, + "grad_norm": 3.0856036818138692, + "language_loss": 0.71973193, + "learning_rate": 9.469701157384919e-07, + "loss": 0.79650223, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10394287, + "step": 11413, + "time_per_iteration": 2.4693074226379395 + }, + { + "auxiliary_loss_clip": 0.06411856, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06274316, + "balance_loss_mlp": 0.01257518, + "epoch": 0.686246805952202, + "flos": 16003084210560.0, + "grad_norm": 1.8173139685722925, + "language_loss": 0.73670095, + "learning_rate": 9.466390286747164e-07, + "loss": 0.81349689, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10217285, + "step": 11414, + "time_per_iteration": 2.510739803314209 + }, + { + "auxiliary_loss_clip": 0.06415157, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06276812, + "balance_loss_mlp": 0.01256425, + "epoch": 0.68630692920487, + "flos": 19832527509120.0, + "grad_norm": 2.474590574257684, + "language_loss": 0.87128049, + "learning_rate": 9.46307981554495e-07, + "loss": 0.94810498, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10852051, + "step": 11415, + "time_per_iteration": 2.4847946166992188 + }, + { + "auxiliary_loss_clip": 0.06415314, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01254705, + "epoch": 0.6863670524575379, + "flos": 26293366028160.0, + "grad_norm": 9.907368268016192, + "language_loss": 0.67353249, + "learning_rate": 9.459769743903801e-07, + "loss": 0.75034899, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11633301, + "step": 11416, + "time_per_iteration": 2.5904948711395264 + }, + { + "auxiliary_loss_clip": 0.06403923, + "auxiliary_loss_mlp": 0.0126434, + "balance_loss_clip": 0.06269173, + "balance_loss_mlp": 0.01254284, + "epoch": 0.686427175710206, + "flos": 19179374532480.0, + "grad_norm": 1.4750819254499818, + "language_loss": 0.76489693, + "learning_rate": 9.456460071949237e-07, + "loss": 0.84157956, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10058594, + "step": 11417, + "time_per_iteration": 2.487197160720825 + }, + { + "auxiliary_loss_clip": 0.06410322, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258863, + "epoch": 0.6864872989628739, + "flos": 18922636022400.0, + "grad_norm": 1.8452434101813986, + "language_loss": 0.77370739, + "learning_rate": 9.45315079980678e-07, + "loss": 0.85049683, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09759521, + "step": 11418, + "time_per_iteration": 2.510810375213623 + }, + { + "auxiliary_loss_clip": 0.06410821, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272699, + "balance_loss_mlp": 0.01255382, + "epoch": 0.6865474222155419, + "flos": 25963016106240.0, + "grad_norm": 1.6317928435070383, + "language_loss": 0.76463497, + "learning_rate": 9.449841927601887e-07, + "loss": 0.84139907, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10217285, + "step": 11419, + "time_per_iteration": 2.5700454711914062 + }, + { + "auxiliary_loss_clip": 0.06407338, + "auxiliary_loss_mlp": 0.01267938, + "balance_loss_clip": 0.06270772, + "balance_loss_mlp": 0.01258359, + "epoch": 0.6866075454682098, + "flos": 18483902444160.0, + "grad_norm": 1.6443171286333353, + "language_loss": 0.71588171, + "learning_rate": 9.446533455460044e-07, + "loss": 0.79263443, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.0958252, + "step": 11420, + "time_per_iteration": 2.5144495964050293 + }, + { + "auxiliary_loss_clip": 0.06407318, + "auxiliary_loss_mlp": 0.0126343, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01253506, + "epoch": 0.6866676687208778, + "flos": 34248459208320.0, + "grad_norm": 1.3410332761873145, + "language_loss": 0.75059515, + "learning_rate": 9.443225383506712e-07, + "loss": 0.82730258, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09924316, + "step": 11421, + "time_per_iteration": 2.61454176902771 + }, + { + "auxiliary_loss_clip": 0.0640727, + "auxiliary_loss_mlp": 0.01265626, + "balance_loss_clip": 0.06272772, + "balance_loss_mlp": 0.01255982, + "epoch": 0.6867277919735457, + "flos": 21727515121920.0, + "grad_norm": 1.6725729939473468, + "language_loss": 0.77230668, + "learning_rate": 9.439917711867338e-07, + "loss": 0.84903562, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09637451, + "step": 11422, + "time_per_iteration": 2.5174617767333984 + }, + { + "auxiliary_loss_clip": 0.0641562, + "auxiliary_loss_mlp": 0.01272736, + "balance_loss_clip": 0.06279219, + "balance_loss_mlp": 0.01261536, + "epoch": 0.6867879152262137, + "flos": 24104939016960.0, + "grad_norm": 1.647039828063758, + "language_loss": 0.77276117, + "learning_rate": 9.436610440667334e-07, + "loss": 0.84964472, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11206055, + "step": 11423, + "time_per_iteration": 2.5189144611358643 + }, + { + "auxiliary_loss_clip": 0.06414216, + "auxiliary_loss_mlp": 0.01267082, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.01256461, + "epoch": 0.6868480384788818, + "flos": 21622150212480.0, + "grad_norm": 1.4426214659548335, + "language_loss": 0.73124474, + "learning_rate": 9.433303570032129e-07, + "loss": 0.80805779, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10614014, + "step": 11424, + "time_per_iteration": 2.5789601802825928 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6869081617315497, + "flos": 26293282174080.0, + "grad_norm": 1.8417753723265369, + "language_loss": 0.65276968, + "learning_rate": 9.429997100087112e-07, + "loss": 0.72953665, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10437012, + "step": 11425, + "time_per_iteration": 2.547678232192993 + }, + { + "auxiliary_loss_clip": 0.06408506, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.06275355, + "balance_loss_mlp": 0.01257381, + "epoch": 0.6869682849842177, + "flos": 21111356522880.0, + "grad_norm": 1.3347714221988014, + "language_loss": 0.71902603, + "learning_rate": 9.426691030957657e-07, + "loss": 0.79578817, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10327148, + "step": 11426, + "time_per_iteration": 4.051712512969971 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.01266408, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.0125606, + "epoch": 0.6870284082368856, + "flos": 17098408782720.0, + "grad_norm": 2.192498277588843, + "language_loss": 0.85740101, + "learning_rate": 9.423385362769136e-07, + "loss": 0.93418634, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10351562, + "step": 11427, + "time_per_iteration": 2.533590316772461 + }, + { + "auxiliary_loss_clip": 0.06408241, + "auxiliary_loss_mlp": 0.01263719, + "balance_loss_clip": 0.06273334, + "balance_loss_mlp": 0.01253312, + "epoch": 0.6870885314895536, + "flos": 27315456531840.0, + "grad_norm": 1.4340637684485376, + "language_loss": 0.76548541, + "learning_rate": 9.420080095646909e-07, + "loss": 0.84220493, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10412598, + "step": 11428, + "time_per_iteration": 2.579432249069214 + }, + { + "auxiliary_loss_clip": 0.06414707, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01259002, + "epoch": 0.6871486547422215, + "flos": 20820977798400.0, + "grad_norm": 2.1898072552839087, + "language_loss": 0.73509127, + "learning_rate": 9.4167752297163e-07, + "loss": 0.81194276, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11437988, + "step": 11429, + "time_per_iteration": 2.508434772491455 + }, + { + "auxiliary_loss_clip": 0.0641626, + "auxiliary_loss_mlp": 0.01266327, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256474, + "epoch": 0.6872087779948896, + "flos": 30161983910400.0, + "grad_norm": 1.931452469341354, + "language_loss": 0.83630431, + "learning_rate": 9.413470765102643e-07, + "loss": 0.91313016, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09851074, + "step": 11430, + "time_per_iteration": 2.630755662918091 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.0126587, + "balance_loss_clip": 0.06274621, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6872689012475575, + "flos": 20710917060480.0, + "grad_norm": 2.0596974928309253, + "language_loss": 0.70543802, + "learning_rate": 9.410166701931225e-07, + "loss": 0.78221703, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10827637, + "step": 11431, + "time_per_iteration": 2.491147756576538 + }, + { + "auxiliary_loss_clip": 0.06409967, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254293, + "epoch": 0.6873290245002255, + "flos": 25528014034560.0, + "grad_norm": 1.7781814059522836, + "language_loss": 0.80397063, + "learning_rate": 9.406863040327355e-07, + "loss": 0.88071799, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1048584, + "step": 11432, + "time_per_iteration": 2.5659162998199463 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01259362, + "epoch": 0.6873891477528934, + "flos": 25198418799360.0, + "grad_norm": 2.2741442538336125, + "language_loss": 0.68286675, + "learning_rate": 9.403559780416295e-07, + "loss": 0.75959998, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09490967, + "step": 11433, + "time_per_iteration": 2.6121439933776855 + }, + { + "auxiliary_loss_clip": 0.064156, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06278776, + "balance_loss_mlp": 0.01258665, + "epoch": 0.6874492710055614, + "flos": 35161034025600.0, + "grad_norm": 2.030098002823672, + "language_loss": 0.72783715, + "learning_rate": 9.400256922323309e-07, + "loss": 0.8046838, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10400391, + "step": 11434, + "time_per_iteration": 2.6294844150543213 + }, + { + "auxiliary_loss_clip": 0.06410138, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06275442, + "balance_loss_mlp": 0.0125919, + "epoch": 0.6875093942582293, + "flos": 17828066136960.0, + "grad_norm": 1.5552043430175444, + "language_loss": 0.80520236, + "learning_rate": 9.396954466173657e-07, + "loss": 0.88199627, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 11435, + "time_per_iteration": 2.501239061355591 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01269183, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01258227, + "epoch": 0.6875695175108973, + "flos": 20710875133440.0, + "grad_norm": 9.52111477806384, + "language_loss": 0.8158865, + "learning_rate": 9.393652412092538e-07, + "loss": 0.89269829, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10943604, + "step": 11436, + "time_per_iteration": 3.8841755390167236 + }, + { + "auxiliary_loss_clip": 0.064064, + "auxiliary_loss_mlp": 0.01268806, + "balance_loss_clip": 0.0627645, + "balance_loss_mlp": 0.01259806, + "epoch": 0.6876296407635654, + "flos": 25381000846080.0, + "grad_norm": 1.6419248940044093, + "language_loss": 0.81966716, + "learning_rate": 9.390350760205183e-07, + "loss": 0.89641917, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08996582, + "step": 11437, + "time_per_iteration": 2.5980188846588135 + }, + { + "auxiliary_loss_clip": 0.06421375, + "auxiliary_loss_mlp": 0.01270532, + "balance_loss_clip": 0.06274987, + "balance_loss_mlp": 0.01257729, + "epoch": 0.6876897640162333, + "flos": 23229107015040.0, + "grad_norm": 2.1640181952928486, + "language_loss": 0.77725911, + "learning_rate": 9.387049510636793e-07, + "loss": 0.85417819, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12792969, + "step": 11438, + "time_per_iteration": 2.5095889568328857 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06273987, + "balance_loss_mlp": 0.01260838, + "epoch": 0.6877498872689013, + "flos": 27131448965760.0, + "grad_norm": 1.6644547524403899, + "language_loss": 0.72329235, + "learning_rate": 9.383748663512554e-07, + "loss": 0.80005264, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09448242, + "step": 11439, + "time_per_iteration": 3.9927306175231934 + }, + { + "auxiliary_loss_clip": 0.06406644, + "auxiliary_loss_mlp": 0.01268484, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01258554, + "epoch": 0.6878100105215692, + "flos": 11586217063680.0, + "grad_norm": 1.9676653989850965, + "language_loss": 0.75157619, + "learning_rate": 9.380448218957623e-07, + "loss": 0.82832754, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09936523, + "step": 11440, + "time_per_iteration": 2.4851269721984863 + }, + { + "auxiliary_loss_clip": 0.06404521, + "auxiliary_loss_mlp": 0.01267859, + "balance_loss_clip": 0.06272353, + "balance_loss_mlp": 0.012584, + "epoch": 0.6878701337742372, + "flos": 20309429422080.0, + "grad_norm": 1.4828372396976293, + "language_loss": 0.71795368, + "learning_rate": 9.377148177097167e-07, + "loss": 0.79467738, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09448242, + "step": 11441, + "time_per_iteration": 2.514653444290161 + }, + { + "auxiliary_loss_clip": 0.06418902, + "auxiliary_loss_mlp": 0.01272176, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01260893, + "epoch": 0.6879302570269051, + "flos": 13844398199040.0, + "grad_norm": 1.6175108384355714, + "language_loss": 0.66777945, + "learning_rate": 9.373848538056317e-07, + "loss": 0.74469018, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11291504, + "step": 11442, + "time_per_iteration": 2.5146420001983643 + }, + { + "auxiliary_loss_clip": 0.06411453, + "auxiliary_loss_mlp": 0.01266841, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01256547, + "epoch": 0.6879903802795732, + "flos": 21331058728320.0, + "grad_norm": 2.38232064736284, + "language_loss": 0.69958794, + "learning_rate": 9.370549301960189e-07, + "loss": 0.77637082, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10290527, + "step": 11443, + "time_per_iteration": 2.493436574935913 + }, + { + "auxiliary_loss_clip": 0.06419516, + "auxiliary_loss_mlp": 0.01266925, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01256524, + "epoch": 0.6880505035322411, + "flos": 25158489528960.0, + "grad_norm": 1.390720225309701, + "language_loss": 0.763533, + "learning_rate": 9.367250468933893e-07, + "loss": 0.84039736, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10394287, + "step": 11444, + "time_per_iteration": 3.9500269889831543 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01267311, + "balance_loss_clip": 0.06272952, + "balance_loss_mlp": 0.01257059, + "epoch": 0.6881106267849091, + "flos": 23221182804480.0, + "grad_norm": 1.8756092745031845, + "language_loss": 0.76660252, + "learning_rate": 9.363952039102536e-07, + "loss": 0.84334326, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10253906, + "step": 11445, + "time_per_iteration": 2.488555908203125 + }, + { + "auxiliary_loss_clip": 0.06317502, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06261797, + "balance_loss_mlp": 0.01251243, + "epoch": 0.688170750037577, + "flos": 48497741136000.0, + "grad_norm": 0.8087198242159813, + "language_loss": 0.58278191, + "learning_rate": 9.360654012591183e-07, + "loss": 0.65848243, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01306915, + "step": 11446, + "time_per_iteration": 3.1777503490448 + }, + { + "auxiliary_loss_clip": 0.06413881, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01254562, + "epoch": 0.688230873290245, + "flos": 22790205728640.0, + "grad_norm": 1.616943103064761, + "language_loss": 0.76008183, + "learning_rate": 9.357356389524886e-07, + "loss": 0.83687443, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10821533, + "step": 11447, + "time_per_iteration": 2.5756897926330566 + }, + { + "auxiliary_loss_clip": 0.06411539, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01256884, + "epoch": 0.6882909965429129, + "flos": 22462245648000.0, + "grad_norm": 1.9129765382773336, + "language_loss": 0.74044937, + "learning_rate": 9.354059170028705e-07, + "loss": 0.81723368, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10015869, + "step": 11448, + "time_per_iteration": 2.5083351135253906 + }, + { + "auxiliary_loss_clip": 0.06417549, + "auxiliary_loss_mlp": 0.01266481, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01255376, + "epoch": 0.688351119795581, + "flos": 26221431697920.0, + "grad_norm": 1.5605900643108004, + "language_loss": 0.74581099, + "learning_rate": 9.350762354227673e-07, + "loss": 0.82265133, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11102295, + "step": 11449, + "time_per_iteration": 2.585969924926758 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266876, + "balance_loss_clip": 0.06273638, + "balance_loss_mlp": 0.01256809, + "epoch": 0.6884112430482489, + "flos": 22571887115520.0, + "grad_norm": 1.6262008407242425, + "language_loss": 0.70027089, + "learning_rate": 9.34746594224679e-07, + "loss": 0.77702844, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1005249, + "step": 11450, + "time_per_iteration": 2.5182437896728516 + }, + { + "auxiliary_loss_clip": 0.06418543, + "auxiliary_loss_mlp": 0.0126869, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.01257187, + "epoch": 0.6884713663009169, + "flos": 17345671781760.0, + "grad_norm": 1.9477242871289788, + "language_loss": 0.76100504, + "learning_rate": 9.344169934211068e-07, + "loss": 0.83787739, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.1151123, + "step": 11451, + "time_per_iteration": 2.5395891666412354 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06276768, + "balance_loss_mlp": 0.01253926, + "epoch": 0.6885314895535849, + "flos": 26478379843200.0, + "grad_norm": 1.2780895399548546, + "language_loss": 0.69393182, + "learning_rate": 9.340874330245505e-07, + "loss": 0.77073896, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09899902, + "step": 11452, + "time_per_iteration": 2.584246873855591 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6885916128062528, + "flos": 20527748035200.0, + "grad_norm": 1.553726438653973, + "language_loss": 0.71749568, + "learning_rate": 9.337579130475042e-07, + "loss": 0.79426515, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11932373, + "step": 11453, + "time_per_iteration": 2.5244805812835693 + }, + { + "auxiliary_loss_clip": 0.06314202, + "auxiliary_loss_mlp": 0.01249184, + "balance_loss_clip": 0.06258714, + "balance_loss_mlp": 0.01248031, + "epoch": 0.6886517360589208, + "flos": 70734792136320.0, + "grad_norm": 0.77256871445285, + "language_loss": 0.50623441, + "learning_rate": 9.334284335024644e-07, + "loss": 0.58186829, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01150513, + "step": 11454, + "time_per_iteration": 2.982760190963745 + }, + { + "auxiliary_loss_clip": 0.06402037, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.01254998, + "epoch": 0.6887118593115887, + "flos": 17899119999360.0, + "grad_norm": 1.70106225646023, + "language_loss": 0.75493348, + "learning_rate": 9.330989944019263e-07, + "loss": 0.8316009, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09716797, + "step": 11455, + "time_per_iteration": 2.5417535305023193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06273204, + "balance_loss_mlp": 0.01255286, + "epoch": 0.6887719825642568, + "flos": 17458080433920.0, + "grad_norm": 2.3349527650336945, + "language_loss": 0.72984523, + "learning_rate": 9.327695957583803e-07, + "loss": 0.80663818, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11578369, + "step": 11456, + "time_per_iteration": 2.452291250228882 + }, + { + "auxiliary_loss_clip": 0.0640955, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255621, + "epoch": 0.6888321058169247, + "flos": 23075930551680.0, + "grad_norm": 1.6190505365782226, + "language_loss": 0.81124002, + "learning_rate": 9.32440237584319e-07, + "loss": 0.88799506, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10339355, + "step": 11457, + "time_per_iteration": 2.540853977203369 + }, + { + "auxiliary_loss_clip": 0.06415743, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01257152, + "epoch": 0.6888922290695927, + "flos": 23375742860160.0, + "grad_norm": 1.590427454304544, + "language_loss": 0.7679534, + "learning_rate": 9.321109198922301e-07, + "loss": 0.84478879, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10638428, + "step": 11458, + "time_per_iteration": 2.510422706604004 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01264265, + "balance_loss_clip": 0.0627234, + "balance_loss_mlp": 0.012539, + "epoch": 0.6889523523222606, + "flos": 17636092433280.0, + "grad_norm": 2.414805126891923, + "language_loss": 0.68316978, + "learning_rate": 9.31781642694603e-07, + "loss": 0.75990915, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1036377, + "step": 11459, + "time_per_iteration": 2.5042388439178467 + }, + { + "auxiliary_loss_clip": 0.06414565, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06275657, + "balance_loss_mlp": 0.01257976, + "epoch": 0.6890124755749286, + "flos": 25235119687680.0, + "grad_norm": 1.5145065442588617, + "language_loss": 0.68853188, + "learning_rate": 9.314524060039221e-07, + "loss": 0.76535368, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09637451, + "step": 11460, + "time_per_iteration": 2.548172950744629 + }, + { + "auxiliary_loss_clip": 0.06421833, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06274051, + "balance_loss_mlp": 0.01257727, + "epoch": 0.6890725988275965, + "flos": 20236488842880.0, + "grad_norm": 1.6636597256364867, + "language_loss": 0.77513885, + "learning_rate": 9.311232098326731e-07, + "loss": 0.85205209, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11761475, + "step": 11461, + "time_per_iteration": 2.524261474609375 + }, + { + "auxiliary_loss_clip": 0.06409161, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01256777, + "epoch": 0.6891327220802645, + "flos": 14540079922560.0, + "grad_norm": 2.0638516380212932, + "language_loss": 0.69867802, + "learning_rate": 9.307940541933401e-07, + "loss": 0.77544034, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10302734, + "step": 11462, + "time_per_iteration": 2.470341444015503 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01253864, + "epoch": 0.6891928453329325, + "flos": 21144996737280.0, + "grad_norm": 1.4840489217528152, + "language_loss": 0.87375474, + "learning_rate": 9.304649390984034e-07, + "loss": 0.95049822, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10064697, + "step": 11463, + "time_per_iteration": 2.550734043121338 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 0.06273332, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6892529685856005, + "flos": 17864347754880.0, + "grad_norm": 1.4959389236419984, + "language_loss": 0.68525398, + "learning_rate": 9.301358645603428e-07, + "loss": 0.76196021, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09771729, + "step": 11464, + "time_per_iteration": 3.9007256031036377 + }, + { + "auxiliary_loss_clip": 0.06409206, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01255571, + "epoch": 0.6893130918382685, + "flos": 29942575194240.0, + "grad_norm": 1.7446769813388354, + "language_loss": 0.65578705, + "learning_rate": 9.298068305916373e-07, + "loss": 0.73254144, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10662842, + "step": 11465, + "time_per_iteration": 2.554800271987915 + }, + { + "auxiliary_loss_clip": 0.06418021, + "auxiliary_loss_mlp": 0.01264957, + "balance_loss_clip": 0.06274985, + "balance_loss_mlp": 0.01253388, + "epoch": 0.6893732150909364, + "flos": 24395275814400.0, + "grad_norm": 1.468256683851191, + "language_loss": 0.72699749, + "learning_rate": 9.294778372047649e-07, + "loss": 0.80382729, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11578369, + "step": 11466, + "time_per_iteration": 2.5593020915985107 + }, + { + "auxiliary_loss_clip": 0.06412645, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6894333383436044, + "flos": 16988557680000.0, + "grad_norm": 1.6869523120590046, + "language_loss": 0.72136575, + "learning_rate": 9.291488844121995e-07, + "loss": 0.79815149, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10302734, + "step": 11467, + "time_per_iteration": 2.4603004455566406 + }, + { + "auxiliary_loss_clip": 0.06414096, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.0627349, + "balance_loss_mlp": 0.0125462, + "epoch": 0.6894934615962723, + "flos": 18990880773120.0, + "grad_norm": 1.8974823893079618, + "language_loss": 0.80639178, + "learning_rate": 9.288199722264156e-07, + "loss": 0.88319826, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11950684, + "step": 11468, + "time_per_iteration": 2.500204086303711 + }, + { + "auxiliary_loss_clip": 0.06415653, + "auxiliary_loss_mlp": 0.01266091, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01255941, + "epoch": 0.6895535848489404, + "flos": 34540137671040.0, + "grad_norm": 1.4230744907421156, + "language_loss": 0.66238683, + "learning_rate": 9.284911006598875e-07, + "loss": 0.73920429, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 11469, + "time_per_iteration": 2.6155412197113037 + }, + { + "auxiliary_loss_clip": 0.06315388, + "auxiliary_loss_mlp": 0.01251862, + "balance_loss_clip": 0.06259958, + "balance_loss_mlp": 0.01250618, + "epoch": 0.6896137081016083, + "flos": 50093237128320.0, + "grad_norm": 0.7794555860117556, + "language_loss": 0.54945397, + "learning_rate": 9.281622697250824e-07, + "loss": 0.62512648, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01243591, + "step": 11470, + "time_per_iteration": 3.0223581790924072 + }, + { + "auxiliary_loss_clip": 0.0640993, + "auxiliary_loss_mlp": 0.01264419, + "balance_loss_clip": 0.0627588, + "balance_loss_mlp": 0.01255133, + "epoch": 0.6896738313542763, + "flos": 19944391109760.0, + "grad_norm": 1.6677407290115414, + "language_loss": 0.78484243, + "learning_rate": 9.278334794344715e-07, + "loss": 0.86158597, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09283447, + "step": 11471, + "time_per_iteration": 2.486112594604492 + }, + { + "auxiliary_loss_clip": 0.0641201, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01255369, + "epoch": 0.6897339546069442, + "flos": 21731875534080.0, + "grad_norm": 1.810273606719927, + "language_loss": 0.78542721, + "learning_rate": 9.275047298005232e-07, + "loss": 0.86220813, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10723877, + "step": 11472, + "time_per_iteration": 2.5265328884124756 + }, + { + "auxiliary_loss_clip": 0.06408779, + "auxiliary_loss_mlp": 0.01266157, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256168, + "epoch": 0.6897940778596122, + "flos": 19832275946880.0, + "grad_norm": 1.5025655331144128, + "language_loss": 0.76723063, + "learning_rate": 9.271760208357024e-07, + "loss": 0.84398007, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09985352, + "step": 11473, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06274555, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6898542011122801, + "flos": 17315595365760.0, + "grad_norm": 1.762455288405268, + "language_loss": 0.75548446, + "learning_rate": 9.268473525524751e-07, + "loss": 0.83228695, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10839844, + "step": 11474, + "time_per_iteration": 2.527608871459961 + }, + { + "auxiliary_loss_clip": 0.06414007, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.06276175, + "balance_loss_mlp": 0.01259097, + "epoch": 0.6899143243649482, + "flos": 24760984959360.0, + "grad_norm": 1.5301145681679174, + "language_loss": 0.74686491, + "learning_rate": 9.26518724963303e-07, + "loss": 0.82370627, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11047363, + "step": 11475, + "time_per_iteration": 2.61885404586792 + }, + { + "auxiliary_loss_clip": 0.06408798, + "auxiliary_loss_mlp": 0.01264551, + "balance_loss_clip": 0.0627286, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6899744476176161, + "flos": 17239636039680.0, + "grad_norm": 1.9758347439707513, + "language_loss": 0.89060938, + "learning_rate": 9.261901380806491e-07, + "loss": 0.96734291, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1038208, + "step": 11476, + "time_per_iteration": 3.9992854595184326 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.01267337, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.01256864, + "epoch": 0.6900345708702841, + "flos": 25417701734400.0, + "grad_norm": 1.3283080082562368, + "language_loss": 0.70312291, + "learning_rate": 9.258615919169724e-07, + "loss": 0.77989161, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11477, + "time_per_iteration": 2.5792300701141357 + }, + { + "auxiliary_loss_clip": 0.06419337, + "auxiliary_loss_mlp": 0.01267418, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6900946941229521, + "flos": 23439836833920.0, + "grad_norm": 2.3323261899860386, + "language_loss": 0.68125427, + "learning_rate": 9.255330864847313e-07, + "loss": 0.75812185, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11737061, + "step": 11478, + "time_per_iteration": 4.033671855926514 + }, + { + "auxiliary_loss_clip": 0.06415287, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06275256, + "balance_loss_mlp": 0.01255469, + "epoch": 0.69015481737562, + "flos": 17825592441600.0, + "grad_norm": 2.187140386680911, + "language_loss": 0.76715493, + "learning_rate": 9.252046217963843e-07, + "loss": 0.84396803, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10565186, + "step": 11479, + "time_per_iteration": 2.507310390472412 + }, + { + "auxiliary_loss_clip": 0.06417705, + "auxiliary_loss_mlp": 0.0126466, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01253084, + "epoch": 0.690214940628288, + "flos": 17462147356800.0, + "grad_norm": 1.7422547235207548, + "language_loss": 0.78936756, + "learning_rate": 9.248761978643856e-07, + "loss": 0.86619121, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11584473, + "step": 11480, + "time_per_iteration": 2.4853224754333496 + }, + { + "auxiliary_loss_clip": 0.06408322, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271941, + "balance_loss_mlp": 0.01256685, + "epoch": 0.6902750638809559, + "flos": 29573847302400.0, + "grad_norm": 1.6397986809458904, + "language_loss": 0.75654733, + "learning_rate": 9.245478147011885e-07, + "loss": 0.83330619, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10870361, + "step": 11481, + "time_per_iteration": 2.557511806488037 + }, + { + "auxiliary_loss_clip": 0.06409919, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_clip": 0.06274407, + "balance_loss_mlp": 0.01257151, + "epoch": 0.690335187133624, + "flos": 25564253725440.0, + "grad_norm": 1.7034098487881468, + "language_loss": 0.69767886, + "learning_rate": 9.24219472319246e-07, + "loss": 0.77445447, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 11482, + "time_per_iteration": 2.52620267868042 + }, + { + "auxiliary_loss_clip": 0.06410135, + "auxiliary_loss_mlp": 0.01265009, + "balance_loss_clip": 0.06271818, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6903953103862919, + "flos": 22494418416000.0, + "grad_norm": 1.3936382068363662, + "language_loss": 0.82645047, + "learning_rate": 9.238911707310096e-07, + "loss": 0.90320188, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10980225, + "step": 11483, + "time_per_iteration": 3.9243674278259277 + }, + { + "auxiliary_loss_clip": 0.06413989, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06273346, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6904554336389599, + "flos": 26107094401920.0, + "grad_norm": 1.7789545949672325, + "language_loss": 0.65774268, + "learning_rate": 9.235629099489273e-07, + "loss": 0.73452371, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09918213, + "step": 11484, + "time_per_iteration": 2.570255994796753 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01267989, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01257838, + "epoch": 0.6905155568916278, + "flos": 31179127023360.0, + "grad_norm": 1.529832254030816, + "language_loss": 0.73510063, + "learning_rate": 9.232346899854479e-07, + "loss": 0.81185901, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11485, + "time_per_iteration": 2.6148314476013184 + }, + { + "auxiliary_loss_clip": 0.06415319, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01255863, + "epoch": 0.6905756801442958, + "flos": 17645484090240.0, + "grad_norm": 1.7447168149804075, + "language_loss": 0.85063231, + "learning_rate": 9.22906510853017e-07, + "loss": 0.92745095, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10687256, + "step": 11486, + "time_per_iteration": 2.5396366119384766 + }, + { + "auxiliary_loss_clip": 0.06414411, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01254071, + "epoch": 0.6906358033969637, + "flos": 22349836995840.0, + "grad_norm": 1.4442882109961312, + "language_loss": 0.73110938, + "learning_rate": 9.225783725640786e-07, + "loss": 0.8078993, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 11487, + "time_per_iteration": 2.5067358016967773 + }, + { + "auxiliary_loss_clip": 0.06322645, + "auxiliary_loss_mlp": 0.01254949, + "balance_loss_clip": 0.06266931, + "balance_loss_mlp": 0.01253606, + "epoch": 0.6906959266496318, + "flos": 69769485573120.0, + "grad_norm": 0.8802440439282012, + "language_loss": 0.66566062, + "learning_rate": 9.222502751310759e-07, + "loss": 0.74143648, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01345062, + "step": 11488, + "time_per_iteration": 3.1760408878326416 + }, + { + "auxiliary_loss_clip": 0.06420241, + "auxiliary_loss_mlp": 0.01268855, + "balance_loss_clip": 0.06275697, + "balance_loss_mlp": 0.01256773, + "epoch": 0.6907560499022997, + "flos": 21440700195840.0, + "grad_norm": 1.9049138044907, + "language_loss": 0.75416613, + "learning_rate": 9.219222185664519e-07, + "loss": 0.83105707, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12072754, + "step": 11489, + "time_per_iteration": 2.515700578689575 + }, + { + "auxiliary_loss_clip": 0.06413751, + "auxiliary_loss_mlp": 0.01269098, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.01257862, + "epoch": 0.6908161731549677, + "flos": 14397427146240.0, + "grad_norm": 2.0018253870073806, + "language_loss": 0.62274224, + "learning_rate": 9.215942028826445e-07, + "loss": 0.69957072, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11236572, + "step": 11490, + "time_per_iteration": 2.532935857772827 + }, + { + "auxiliary_loss_clip": 0.06417898, + "auxiliary_loss_mlp": 0.01266366, + "balance_loss_clip": 0.06278036, + "balance_loss_mlp": 0.01255911, + "epoch": 0.6908762964076357, + "flos": 20017122053760.0, + "grad_norm": 1.8130615922920168, + "language_loss": 0.73057532, + "learning_rate": 9.212662280920937e-07, + "loss": 0.80741799, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10455322, + "step": 11491, + "time_per_iteration": 2.521466016769409 + }, + { + "auxiliary_loss_clip": 0.0640818, + "auxiliary_loss_mlp": 0.0126409, + "balance_loss_clip": 0.06273587, + "balance_loss_mlp": 0.01253117, + "epoch": 0.6909364196603036, + "flos": 28776951446400.0, + "grad_norm": 1.7336299759284137, + "language_loss": 0.7042138, + "learning_rate": 9.20938294207235e-07, + "loss": 0.78093648, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10968018, + "step": 11492, + "time_per_iteration": 2.585730791091919 + }, + { + "auxiliary_loss_clip": 0.06420228, + "auxiliary_loss_mlp": 0.01266161, + "balance_loss_clip": 0.0627589, + "balance_loss_mlp": 0.01255545, + "epoch": 0.6909965429129716, + "flos": 22534641175680.0, + "grad_norm": 1.7712531915598577, + "language_loss": 0.7470516, + "learning_rate": 9.206104012405049e-07, + "loss": 0.82391548, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1060791, + "step": 11493, + "time_per_iteration": 2.5050244331359863 + }, + { + "auxiliary_loss_clip": 0.06412148, + "auxiliary_loss_mlp": 0.01265374, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6910566661656395, + "flos": 18411884259840.0, + "grad_norm": 1.6258065693735415, + "language_loss": 0.74673963, + "learning_rate": 9.20282549204336e-07, + "loss": 0.82351482, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.1116333, + "step": 11494, + "time_per_iteration": 2.5276567935943604 + }, + { + "auxiliary_loss_clip": 0.06411964, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06274857, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6911167894183076, + "flos": 30781874016000.0, + "grad_norm": 1.529019816420153, + "language_loss": 0.68227768, + "learning_rate": 9.19954738111161e-07, + "loss": 0.75907087, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10101318, + "step": 11495, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06411652, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06274678, + "balance_loss_mlp": 0.01256863, + "epoch": 0.6911769126709755, + "flos": 13740878079360.0, + "grad_norm": 1.6566133128888745, + "language_loss": 0.74368346, + "learning_rate": 9.196269679734119e-07, + "loss": 0.82048082, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11224365, + "step": 11496, + "time_per_iteration": 2.5154151916503906 + }, + { + "auxiliary_loss_clip": 0.06410149, + "auxiliary_loss_mlp": 0.01262738, + "balance_loss_clip": 0.06274073, + "balance_loss_mlp": 0.01252987, + "epoch": 0.6912370359236435, + "flos": 17572669292160.0, + "grad_norm": 1.7205825998793636, + "language_loss": 0.80305141, + "learning_rate": 9.19299238803515e-07, + "loss": 0.87978023, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09753418, + "step": 11497, + "time_per_iteration": 2.4925076961517334 + }, + { + "auxiliary_loss_clip": 0.06416431, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06275152, + "balance_loss_mlp": 0.01256061, + "epoch": 0.6912971591763114, + "flos": 22097291189760.0, + "grad_norm": 1.653826561150034, + "language_loss": 0.8077867, + "learning_rate": 9.189715506138993e-07, + "loss": 0.88462818, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11651611, + "step": 11498, + "time_per_iteration": 2.5465574264526367 + }, + { + "auxiliary_loss_clip": 0.06408113, + "auxiliary_loss_mlp": 0.01262525, + "balance_loss_clip": 0.06274167, + "balance_loss_mlp": 0.01251701, + "epoch": 0.6913572824289794, + "flos": 29979276082560.0, + "grad_norm": 2.039776107623003, + "language_loss": 0.85973012, + "learning_rate": 9.186439034169915e-07, + "loss": 0.93643653, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10827637, + "step": 11499, + "time_per_iteration": 2.5665283203125 + }, + { + "auxiliary_loss_clip": 0.06408866, + "auxiliary_loss_mlp": 0.01265419, + "balance_loss_clip": 0.06275891, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6914174056816473, + "flos": 20455184799360.0, + "grad_norm": 1.6118393659485355, + "language_loss": 0.7559222, + "learning_rate": 9.183162972252145e-07, + "loss": 0.83266509, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10021973, + "step": 11500, + "time_per_iteration": 2.503854751586914 + }, + { + "auxiliary_loss_clip": 0.06412221, + "auxiliary_loss_mlp": 0.01266959, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6914775289343154, + "flos": 21287984929920.0, + "grad_norm": 1.8512682937239455, + "language_loss": 0.77863973, + "learning_rate": 9.179887320509921e-07, + "loss": 0.85543144, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.112854, + "step": 11501, + "time_per_iteration": 2.4953453540802 + }, + { + "auxiliary_loss_clip": 0.06417021, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01256748, + "epoch": 0.6915376521869833, + "flos": 23884859468160.0, + "grad_norm": 1.8723825147208624, + "language_loss": 0.73532307, + "learning_rate": 9.176612079067458e-07, + "loss": 0.81216794, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 11502, + "time_per_iteration": 2.5416178703308105 + }, + { + "auxiliary_loss_clip": 0.06414314, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01253993, + "epoch": 0.6915977754396513, + "flos": 11515079347200.0, + "grad_norm": 1.8781803370630783, + "language_loss": 0.73954153, + "learning_rate": 9.173337248048953e-07, + "loss": 0.81633848, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11395264, + "step": 11503, + "time_per_iteration": 2.499391794204712 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01262533, + "balance_loss_clip": 0.06271478, + "balance_loss_mlp": 0.01252233, + "epoch": 0.6916578986923193, + "flos": 22607833317120.0, + "grad_norm": 1.5988526178616205, + "language_loss": 0.77127218, + "learning_rate": 9.170062827578575e-07, + "loss": 0.84797841, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10302734, + "step": 11504, + "time_per_iteration": 3.9501583576202393 + }, + { + "auxiliary_loss_clip": 0.06413034, + "auxiliary_loss_mlp": 0.01266076, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6917180219449872, + "flos": 23484126516480.0, + "grad_norm": 1.8617681816675509, + "language_loss": 0.73855585, + "learning_rate": 9.166788817780499e-07, + "loss": 0.81534696, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11505, + "time_per_iteration": 2.5829193592071533 + }, + { + "auxiliary_loss_clip": 0.06409241, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06273368, + "balance_loss_mlp": 0.0125723, + "epoch": 0.6917781451976552, + "flos": 23739313726080.0, + "grad_norm": 1.75743437760736, + "language_loss": 0.876764, + "learning_rate": 9.163515218778886e-07, + "loss": 0.95353591, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1072998, + "step": 11506, + "time_per_iteration": 2.5154294967651367 + }, + { + "auxiliary_loss_clip": 0.06412455, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.06276374, + "balance_loss_mlp": 0.01254783, + "epoch": 0.6918382684503231, + "flos": 31474704700800.0, + "grad_norm": 2.0688391280679648, + "language_loss": 0.7024008, + "learning_rate": 9.160242030697856e-07, + "loss": 0.7791791, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.105896, + "step": 11507, + "time_per_iteration": 2.5845768451690674 + }, + { + "auxiliary_loss_clip": 0.06413335, + "auxiliary_loss_mlp": 0.01264122, + "balance_loss_clip": 0.06273569, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6918983917029912, + "flos": 21656503186560.0, + "grad_norm": 1.743467082940077, + "language_loss": 0.77142328, + "learning_rate": 9.156969253661538e-07, + "loss": 0.84819788, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10491943, + "step": 11508, + "time_per_iteration": 2.4946086406707764 + }, + { + "auxiliary_loss_clip": 0.06406476, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273084, + "balance_loss_mlp": 0.01257501, + "epoch": 0.6919585149556591, + "flos": 25556036025600.0, + "grad_norm": 1.485663055998357, + "language_loss": 0.75072491, + "learning_rate": 9.153696887794027e-07, + "loss": 0.82746202, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09735107, + "step": 11509, + "time_per_iteration": 2.591611623764038 + }, + { + "auxiliary_loss_clip": 0.06409086, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01253344, + "epoch": 0.6920186382083271, + "flos": 23666582782080.0, + "grad_norm": 1.6709622746913153, + "language_loss": 0.64358246, + "learning_rate": 9.150424933219425e-07, + "loss": 0.7203086, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10192871, + "step": 11510, + "time_per_iteration": 2.522277593612671 + }, + { + "auxiliary_loss_clip": 0.06419423, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01257938, + "epoch": 0.692078761460995, + "flos": 19067888275200.0, + "grad_norm": 1.58502931536568, + "language_loss": 0.75757432, + "learning_rate": 9.147153390061788e-07, + "loss": 0.83446282, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1149292, + "step": 11511, + "time_per_iteration": 2.5163841247558594 + }, + { + "auxiliary_loss_clip": 0.06410709, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.06275946, + "balance_loss_mlp": 0.01254482, + "epoch": 0.692138884713663, + "flos": 29031006625920.0, + "grad_norm": 1.5915143740912923, + "language_loss": 0.62864697, + "learning_rate": 9.143882258445184e-07, + "loss": 0.70539832, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 11512, + "time_per_iteration": 2.5597567558288574 + }, + { + "auxiliary_loss_clip": 0.06413583, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06275637, + "balance_loss_mlp": 0.01257323, + "epoch": 0.6921990079663309, + "flos": 14763262072320.0, + "grad_norm": 2.1370127100150373, + "language_loss": 0.83359182, + "learning_rate": 9.140611538493666e-07, + "loss": 0.91040647, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10559082, + "step": 11513, + "time_per_iteration": 2.5295650959014893 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253614, + "epoch": 0.692259131218999, + "flos": 23848619777280.0, + "grad_norm": 1.3335195335102994, + "language_loss": 0.78370172, + "learning_rate": 9.137341230331233e-07, + "loss": 0.86040014, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09466553, + "step": 11514, + "time_per_iteration": 2.5325093269348145 + }, + { + "auxiliary_loss_clip": 0.06413436, + "auxiliary_loss_mlp": 0.01264156, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.0125323, + "epoch": 0.6923192544716669, + "flos": 19141038489600.0, + "grad_norm": 1.7641312985276416, + "language_loss": 0.7541517, + "learning_rate": 9.134071334081907e-07, + "loss": 0.83092761, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10919189, + "step": 11515, + "time_per_iteration": 2.4964303970336914 + }, + { + "auxiliary_loss_clip": 0.06405345, + "auxiliary_loss_mlp": 0.01265608, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6923793777243349, + "flos": 28082192117760.0, + "grad_norm": 1.899911587445346, + "language_loss": 0.53861475, + "learning_rate": 9.130801849869694e-07, + "loss": 0.61532426, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10003662, + "step": 11516, + "time_per_iteration": 3.975773811340332 + }, + { + "auxiliary_loss_clip": 0.06402789, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06273137, + "balance_loss_mlp": 0.01258812, + "epoch": 0.6924395009770029, + "flos": 16586818479360.0, + "grad_norm": 1.754197992941401, + "language_loss": 0.73113155, + "learning_rate": 9.127532777818557e-07, + "loss": 0.80785251, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.1048584, + "step": 11517, + "time_per_iteration": 2.5128793716430664 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01270737, + "balance_loss_clip": 0.06275631, + "balance_loss_mlp": 0.01260223, + "epoch": 0.6924996242296708, + "flos": 16661058796800.0, + "grad_norm": 1.5645702983922471, + "language_loss": 0.76377338, + "learning_rate": 9.124264118052465e-07, + "loss": 0.84061527, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10510254, + "step": 11518, + "time_per_iteration": 4.030726432800293 + }, + { + "auxiliary_loss_clip": 0.06418861, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.06276505, + "balance_loss_mlp": 0.01260065, + "epoch": 0.6925597474823388, + "flos": 34763277893760.0, + "grad_norm": 1.2922865476436283, + "language_loss": 0.64748263, + "learning_rate": 9.120995870695376e-07, + "loss": 0.72438884, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11712646, + "step": 11519, + "time_per_iteration": 2.6468279361724854 + }, + { + "auxiliary_loss_clip": 0.06410517, + "auxiliary_loss_mlp": 0.01266916, + "balance_loss_clip": 0.06272532, + "balance_loss_mlp": 0.01255746, + "epoch": 0.6926198707350067, + "flos": 21878175962880.0, + "grad_norm": 1.754829284599123, + "language_loss": 0.62671852, + "learning_rate": 9.117728035871212e-07, + "loss": 0.70349276, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1116333, + "step": 11520, + "time_per_iteration": 2.6443254947662354 + }, + { + "auxiliary_loss_clip": 0.06421007, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01259104, + "epoch": 0.6926799939876748, + "flos": 13011346506240.0, + "grad_norm": 1.8045037459633815, + "language_loss": 0.78247267, + "learning_rate": 9.114460613703887e-07, + "loss": 0.85938519, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11151123, + "step": 11521, + "time_per_iteration": 2.540693521499634 + }, + { + "auxiliary_loss_clip": 0.0641452, + "auxiliary_loss_mlp": 0.0126495, + "balance_loss_clip": 0.06273233, + "balance_loss_mlp": 0.0125356, + "epoch": 0.6927401172403427, + "flos": 16766423706240.0, + "grad_norm": 1.8333636519131566, + "language_loss": 0.82234508, + "learning_rate": 9.111193604317304e-07, + "loss": 0.89913976, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11383057, + "step": 11522, + "time_per_iteration": 3.9248740673065186 + }, + { + "auxiliary_loss_clip": 0.06410085, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01254013, + "epoch": 0.6928002404930107, + "flos": 25713237484800.0, + "grad_norm": 1.543280654363121, + "language_loss": 0.77247906, + "learning_rate": 9.107927007835361e-07, + "loss": 0.84922481, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10479736, + "step": 11523, + "time_per_iteration": 2.6300647258758545 + }, + { + "auxiliary_loss_clip": 0.0640799, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06273483, + "balance_loss_mlp": 0.01255227, + "epoch": 0.6928603637456786, + "flos": 18594214744320.0, + "grad_norm": 1.7989990955818747, + "language_loss": 0.68682468, + "learning_rate": 9.104660824381915e-07, + "loss": 0.76355332, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09637451, + "step": 11524, + "time_per_iteration": 2.4765005111694336 + }, + { + "auxiliary_loss_clip": 0.06415472, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.0125385, + "epoch": 0.6929204869983466, + "flos": 22207519635840.0, + "grad_norm": 1.775837201090113, + "language_loss": 0.64731717, + "learning_rate": 9.101395054080815e-07, + "loss": 0.72412294, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1126709, + "step": 11525, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06416623, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279063, + "balance_loss_mlp": 0.01258568, + "epoch": 0.6929806102510145, + "flos": 17900545518720.0, + "grad_norm": 2.0930840901881007, + "language_loss": 0.70522892, + "learning_rate": 9.098129697055907e-07, + "loss": 0.78208423, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10351562, + "step": 11526, + "time_per_iteration": 2.4600794315338135 + }, + { + "auxiliary_loss_clip": 0.06409934, + "auxiliary_loss_mlp": 0.01263712, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253186, + "epoch": 0.6930407335036826, + "flos": 19761222084480.0, + "grad_norm": 1.7010928543667516, + "language_loss": 0.76265514, + "learning_rate": 9.094864753431022e-07, + "loss": 0.83939159, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10516357, + "step": 11527, + "time_per_iteration": 2.5164694786071777 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01253149, + "epoch": 0.6931008567563505, + "flos": 21550802860800.0, + "grad_norm": 1.5438747158568011, + "language_loss": 0.79877269, + "learning_rate": 9.091600223329952e-07, + "loss": 0.87552267, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 11528, + "time_per_iteration": 2.501044988632202 + }, + { + "auxiliary_loss_clip": 0.06405636, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01256917, + "epoch": 0.6931609800090185, + "flos": 26257210191360.0, + "grad_norm": 1.3083455635421857, + "language_loss": 0.75950116, + "learning_rate": 9.088336106876491e-07, + "loss": 0.83622813, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10144043, + "step": 11529, + "time_per_iteration": 2.5608596801757812 + }, + { + "auxiliary_loss_clip": 0.06410852, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276192, + "balance_loss_mlp": 0.01254961, + "epoch": 0.6932211032616865, + "flos": 32351626805760.0, + "grad_norm": 2.07531682890069, + "language_loss": 0.73131585, + "learning_rate": 9.085072404194436e-07, + "loss": 0.80807638, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10241699, + "step": 11530, + "time_per_iteration": 2.5931029319763184 + }, + { + "auxiliary_loss_clip": 0.06423162, + "auxiliary_loss_mlp": 0.01267459, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.0125598, + "epoch": 0.6932812265143544, + "flos": 22054720515840.0, + "grad_norm": 1.8331163383956572, + "language_loss": 0.78110623, + "learning_rate": 9.081809115407513e-07, + "loss": 0.85801244, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11474609, + "step": 11531, + "time_per_iteration": 2.537781000137329 + }, + { + "auxiliary_loss_clip": 0.06406952, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01256092, + "epoch": 0.6933413497670224, + "flos": 26264924766720.0, + "grad_norm": 1.4723585148230005, + "language_loss": 0.69516993, + "learning_rate": 9.078546240639484e-07, + "loss": 0.77190006, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09973145, + "step": 11532, + "time_per_iteration": 2.6068294048309326 + }, + { + "auxiliary_loss_clip": 0.06414198, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6934014730196904, + "flos": 19579059308160.0, + "grad_norm": 1.68179431170249, + "language_loss": 0.66939062, + "learning_rate": 9.075283780014082e-07, + "loss": 0.74618644, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11358643, + "step": 11533, + "time_per_iteration": 2.5188937187194824 + }, + { + "auxiliary_loss_clip": 0.06414025, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06274263, + "balance_loss_mlp": 0.01254892, + "epoch": 0.6934615962723584, + "flos": 22124432712960.0, + "grad_norm": 2.2635878062852384, + "language_loss": 0.59154713, + "learning_rate": 9.072021733655007e-07, + "loss": 0.66835076, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11456299, + "step": 11534, + "time_per_iteration": 2.513169288635254 + }, + { + "auxiliary_loss_clip": 0.06412862, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06276149, + "balance_loss_mlp": 0.01255639, + "epoch": 0.6935217195250263, + "flos": 21367172638080.0, + "grad_norm": 2.468732709113743, + "language_loss": 0.71063632, + "learning_rate": 9.068760101685971e-07, + "loss": 0.78742403, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10266113, + "step": 11535, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.0632171, + "auxiliary_loss_mlp": 0.012535, + "balance_loss_clip": 0.06265885, + "balance_loss_mlp": 0.01252321, + "epoch": 0.6935818427776943, + "flos": 64085864400000.0, + "grad_norm": 0.6899850160451471, + "language_loss": 0.58968407, + "learning_rate": 9.065498884230638e-07, + "loss": 0.66543621, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11536, + "time_per_iteration": 3.2811362743377686 + }, + { + "auxiliary_loss_clip": 0.06415699, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01255628, + "epoch": 0.6936419660303622, + "flos": 20308716662400.0, + "grad_norm": 1.4806055752543272, + "language_loss": 0.72754341, + "learning_rate": 9.062238081412692e-07, + "loss": 0.80436242, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 11537, + "time_per_iteration": 2.521667242050171 + }, + { + "auxiliary_loss_clip": 0.06322287, + "auxiliary_loss_mlp": 0.01253211, + "balance_loss_clip": 0.06266545, + "balance_loss_mlp": 0.01252035, + "epoch": 0.6937020892830302, + "flos": 67201974691200.0, + "grad_norm": 0.7781896456354132, + "language_loss": 0.5562225, + "learning_rate": 9.058977693355767e-07, + "loss": 0.63197744, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01173401, + "step": 11538, + "time_per_iteration": 3.133890390396118 + }, + { + "auxiliary_loss_clip": 0.06402846, + "auxiliary_loss_mlp": 0.01263458, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253844, + "epoch": 0.6937622125356981, + "flos": 23884943322240.0, + "grad_norm": 1.4430233846230829, + "language_loss": 0.7770322, + "learning_rate": 9.055717720183505e-07, + "loss": 0.85369527, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09613037, + "step": 11539, + "time_per_iteration": 2.5152971744537354 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01262731, + "balance_loss_clip": 0.06274487, + "balance_loss_mlp": 0.01252664, + "epoch": 0.6938223357883662, + "flos": 28738154206080.0, + "grad_norm": 1.7708768043043424, + "language_loss": 0.64184511, + "learning_rate": 9.05245816201953e-07, + "loss": 0.71855795, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10070801, + "step": 11540, + "time_per_iteration": 2.5849952697753906 + }, + { + "auxiliary_loss_clip": 0.06409811, + "auxiliary_loss_mlp": 0.01263592, + "balance_loss_clip": 0.06274833, + "balance_loss_mlp": 0.01254288, + "epoch": 0.6938824590410341, + "flos": 28662111025920.0, + "grad_norm": 1.4340903998261632, + "language_loss": 0.87096, + "learning_rate": 9.049199018987437e-07, + "loss": 0.94769406, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09301758, + "step": 11541, + "time_per_iteration": 2.5415987968444824 + }, + { + "auxiliary_loss_clip": 0.06411604, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06272925, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6939425822937021, + "flos": 18987987807360.0, + "grad_norm": 1.6079825627082245, + "language_loss": 0.84464371, + "learning_rate": 9.04594029121081e-07, + "loss": 0.92141145, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10418701, + "step": 11542, + "time_per_iteration": 2.499424457550049 + }, + { + "auxiliary_loss_clip": 0.06415489, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275496, + "balance_loss_mlp": 0.01254136, + "epoch": 0.6940027055463701, + "flos": 23082513096960.0, + "grad_norm": 1.8518042954467828, + "language_loss": 0.75316143, + "learning_rate": 9.04268197881323e-07, + "loss": 0.82996696, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10931396, + "step": 11543, + "time_per_iteration": 3.9085495471954346 + }, + { + "auxiliary_loss_clip": 0.06410378, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06273862, + "balance_loss_mlp": 0.01255373, + "epoch": 0.694062828799038, + "flos": 18192391689600.0, + "grad_norm": 1.648222513312388, + "language_loss": 0.76331246, + "learning_rate": 9.039424081918241e-07, + "loss": 0.84007609, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10614014, + "step": 11544, + "time_per_iteration": 2.5347986221313477 + }, + { + "auxiliary_loss_clip": 0.06413911, + "auxiliary_loss_mlp": 0.0126496, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.012541, + "epoch": 0.694122952051706, + "flos": 17827269523200.0, + "grad_norm": 1.8058959765981615, + "language_loss": 0.71283519, + "learning_rate": 9.036166600649388e-07, + "loss": 0.78962398, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10864258, + "step": 11545, + "time_per_iteration": 2.4718210697174072 + }, + { + "auxiliary_loss_clip": 0.06407937, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01253039, + "epoch": 0.694183075304374, + "flos": 21221710750080.0, + "grad_norm": 1.516472070644587, + "language_loss": 0.79896855, + "learning_rate": 9.0329095351302e-07, + "loss": 0.87567645, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09814453, + "step": 11546, + "time_per_iteration": 2.5148062705993652 + }, + { + "auxiliary_loss_clip": 0.06411743, + "auxiliary_loss_mlp": 0.01267153, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01256281, + "epoch": 0.694243198557042, + "flos": 24067273806720.0, + "grad_norm": 1.4558199270771826, + "language_loss": 0.7883184, + "learning_rate": 9.029652885484194e-07, + "loss": 0.8651073, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10870361, + "step": 11547, + "time_per_iteration": 2.5461182594299316 + }, + { + "auxiliary_loss_clip": 0.06409074, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275046, + "balance_loss_mlp": 0.01256845, + "epoch": 0.6943033218097099, + "flos": 21148183192320.0, + "grad_norm": 2.180775706849967, + "language_loss": 0.80900609, + "learning_rate": 9.026396651834834e-07, + "loss": 0.88576972, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 11548, + "time_per_iteration": 2.499633312225342 + }, + { + "auxiliary_loss_clip": 0.06316315, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.06260554, + "balance_loss_mlp": 0.01250445, + "epoch": 0.6943634450623779, + "flos": 57830892163200.0, + "grad_norm": 0.8127275261655555, + "language_loss": 0.53539848, + "learning_rate": 9.023140834305613e-07, + "loss": 0.61107814, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01203918, + "step": 11549, + "time_per_iteration": 3.1340725421905518 + }, + { + "auxiliary_loss_clip": 0.06409207, + "auxiliary_loss_mlp": 0.01267856, + "balance_loss_clip": 0.0627339, + "balance_loss_mlp": 0.01256924, + "epoch": 0.6944235683150458, + "flos": 30598411501440.0, + "grad_norm": 1.3218169673539149, + "language_loss": 0.73849893, + "learning_rate": 9.01988543302e-07, + "loss": 0.81526959, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.109375, + "step": 11550, + "time_per_iteration": 2.5708651542663574 + }, + { + "auxiliary_loss_clip": 0.06414837, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01255836, + "epoch": 0.6944836915677138, + "flos": 19725611299200.0, + "grad_norm": 2.422306593837277, + "language_loss": 0.7436735, + "learning_rate": 9.016630448101425e-07, + "loss": 0.82049412, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11364746, + "step": 11551, + "time_per_iteration": 2.527280807495117 + }, + { + "auxiliary_loss_clip": 0.06412678, + "auxiliary_loss_mlp": 0.01266399, + "balance_loss_clip": 0.06274699, + "balance_loss_mlp": 0.01255592, + "epoch": 0.6945438148203817, + "flos": 24870542572800.0, + "grad_norm": 1.4976139060418592, + "language_loss": 0.84468353, + "learning_rate": 9.01337587967333e-07, + "loss": 0.92147428, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10797119, + "step": 11552, + "time_per_iteration": 2.5304994583129883 + }, + { + "auxiliary_loss_clip": 0.06412995, + "auxiliary_loss_mlp": 0.01266444, + "balance_loss_clip": 0.06275281, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6946039380730498, + "flos": 33334752360960.0, + "grad_norm": 1.8566044703469122, + "language_loss": 0.67553848, + "learning_rate": 9.010121727859117e-07, + "loss": 0.75233287, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10656738, + "step": 11553, + "time_per_iteration": 2.6192421913146973 + }, + { + "auxiliary_loss_clip": 0.064182, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6946640613257177, + "flos": 20857385197440.0, + "grad_norm": 1.702671495962781, + "language_loss": 0.79674661, + "learning_rate": 9.006867992782195e-07, + "loss": 0.87358326, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11138916, + "step": 11554, + "time_per_iteration": 2.486833095550537 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01256064, + "epoch": 0.6947241845783857, + "flos": 19360992257280.0, + "grad_norm": 2.4583328560659825, + "language_loss": 0.72664356, + "learning_rate": 9.003614674565934e-07, + "loss": 0.80342329, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10498047, + "step": 11555, + "time_per_iteration": 4.000531196594238 + }, + { + "auxiliary_loss_clip": 0.0640734, + "auxiliary_loss_mlp": 0.01264698, + "balance_loss_clip": 0.0627168, + "balance_loss_mlp": 0.01254404, + "epoch": 0.6947843078310536, + "flos": 27126669283200.0, + "grad_norm": 1.6806828217534537, + "language_loss": 0.78220618, + "learning_rate": 9.000361773333705e-07, + "loss": 0.85892653, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 11556, + "time_per_iteration": 2.5366411209106445 + }, + { + "auxiliary_loss_clip": 0.06412055, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01254198, + "epoch": 0.6948444310837216, + "flos": 28592692318080.0, + "grad_norm": 2.2663636290746205, + "language_loss": 0.60655725, + "learning_rate": 8.997109289208869e-07, + "loss": 0.68332362, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10394287, + "step": 11557, + "time_per_iteration": 2.5730667114257812 + }, + { + "auxiliary_loss_clip": 0.06406298, + "auxiliary_loss_mlp": 0.0126677, + "balance_loss_clip": 0.06273069, + "balance_loss_mlp": 0.01256923, + "epoch": 0.6949045543363896, + "flos": 15674704859520.0, + "grad_norm": 1.6481144158645147, + "language_loss": 0.85564643, + "learning_rate": 8.993857222314752e-07, + "loss": 0.9323771, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09851074, + "step": 11558, + "time_per_iteration": 3.9160499572753906 + }, + { + "auxiliary_loss_clip": 0.06415498, + "auxiliary_loss_mlp": 0.01268636, + "balance_loss_clip": 0.06274904, + "balance_loss_mlp": 0.01257764, + "epoch": 0.6949646775890576, + "flos": 23266311027840.0, + "grad_norm": 1.591782165805242, + "language_loss": 0.70581871, + "learning_rate": 8.990605572774664e-07, + "loss": 0.78266007, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10876465, + "step": 11559, + "time_per_iteration": 2.527818441390991 + }, + { + "auxiliary_loss_clip": 0.06411439, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06274717, + "balance_loss_mlp": 0.01256946, + "epoch": 0.6950248008417256, + "flos": 22389095433600.0, + "grad_norm": 1.4072009263276422, + "language_loss": 0.78738344, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8641715, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 11560, + "time_per_iteration": 2.5627846717834473 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01255614, + "epoch": 0.6950849240943935, + "flos": 23484126516480.0, + "grad_norm": 1.4947787442240967, + "language_loss": 0.76889873, + "learning_rate": 8.9841035262498e-07, + "loss": 0.84563088, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09307861, + "step": 11561, + "time_per_iteration": 2.4997048377990723 + }, + { + "auxiliary_loss_clip": 0.06411804, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.012589, + "epoch": 0.6951450473470615, + "flos": 17426285009280.0, + "grad_norm": 1.734417047783141, + "language_loss": 0.78360051, + "learning_rate": 8.980853129511577e-07, + "loss": 0.86041546, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10784912, + "step": 11562, + "time_per_iteration": 3.868687868118286 + }, + { + "auxiliary_loss_clip": 0.06413691, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274996, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6952051705997294, + "flos": 20492053395840.0, + "grad_norm": 2.791172268200526, + "language_loss": 0.69210434, + "learning_rate": 8.977603150620515e-07, + "loss": 0.76889294, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10656738, + "step": 11563, + "time_per_iteration": 2.521984338760376 + }, + { + "auxiliary_loss_clip": 0.0640626, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01255006, + "epoch": 0.6952652938523974, + "flos": 13994472061440.0, + "grad_norm": 2.2938813143699943, + "language_loss": 0.73795921, + "learning_rate": 8.974353589699846e-07, + "loss": 0.81467056, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09869385, + "step": 11564, + "time_per_iteration": 2.454090118408203 + }, + { + "auxiliary_loss_clip": 0.06431751, + "auxiliary_loss_mlp": 0.01272001, + "balance_loss_clip": 0.06280031, + "balance_loss_mlp": 0.01259174, + "epoch": 0.6953254171050653, + "flos": 30961479242880.0, + "grad_norm": 1.9156541387809913, + "language_loss": 0.71630907, + "learning_rate": 8.971104446872785e-07, + "loss": 0.79334664, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12823486, + "step": 11565, + "time_per_iteration": 2.6339352130889893 + }, + { + "auxiliary_loss_clip": 0.06312925, + "auxiliary_loss_mlp": 0.01254517, + "balance_loss_clip": 0.0625705, + "balance_loss_mlp": 0.01253326, + "epoch": 0.6953855403577334, + "flos": 61688231671680.0, + "grad_norm": 0.9056621867794188, + "language_loss": 0.58358586, + "learning_rate": 8.96785572226255e-07, + "loss": 0.65926027, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01189423, + "step": 11566, + "time_per_iteration": 2.9703423976898193 + }, + { + "auxiliary_loss_clip": 0.0641438, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273914, + "balance_loss_mlp": 0.01254237, + "epoch": 0.6954456636104013, + "flos": 23045644500480.0, + "grad_norm": 1.741502187715767, + "language_loss": 0.74213183, + "learning_rate": 8.964607415992338e-07, + "loss": 0.81893462, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11663818, + "step": 11567, + "time_per_iteration": 2.5282747745513916 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01264668, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.0125382, + "epoch": 0.6955057868630693, + "flos": 23925920768640.0, + "grad_norm": 1.2088897193849768, + "language_loss": 0.76795661, + "learning_rate": 8.961359528185313e-07, + "loss": 0.84470242, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10858154, + "step": 11568, + "time_per_iteration": 2.555664300918579 + }, + { + "auxiliary_loss_clip": 0.06409561, + "auxiliary_loss_mlp": 0.01267134, + "balance_loss_clip": 0.06274664, + "balance_loss_mlp": 0.01257567, + "epoch": 0.6955659101157372, + "flos": 22600076814720.0, + "grad_norm": 2.0811162561190444, + "language_loss": 0.72560644, + "learning_rate": 8.958112058964649e-07, + "loss": 0.80237341, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09570312, + "step": 11569, + "time_per_iteration": 2.550203323364258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6956260333684052, + "flos": 24579576869760.0, + "grad_norm": 1.4598042665233286, + "language_loss": 0.77169657, + "learning_rate": 8.954865008453471e-07, + "loss": 0.84849441, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10736084, + "step": 11570, + "time_per_iteration": 2.5227878093719482 + }, + { + "auxiliary_loss_clip": 0.06413926, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6956861566210732, + "flos": 25852745733120.0, + "grad_norm": 1.7591175950059927, + "language_loss": 0.7487582, + "learning_rate": 8.95161837677493e-07, + "loss": 0.82555479, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10284424, + "step": 11571, + "time_per_iteration": 2.597681999206543 + }, + { + "auxiliary_loss_clip": 0.06403409, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.062727, + "balance_loss_mlp": 0.01253241, + "epoch": 0.6957462798737412, + "flos": 15306270456960.0, + "grad_norm": 1.6743829197171876, + "language_loss": 0.74611163, + "learning_rate": 8.948372164052118e-07, + "loss": 0.8227759, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09771729, + "step": 11572, + "time_per_iteration": 2.479717254638672 + }, + { + "auxiliary_loss_clip": 0.06411865, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6958064031264092, + "flos": 36255645838080.0, + "grad_norm": 1.9177386659246018, + "language_loss": 0.70336205, + "learning_rate": 8.94512637040814e-07, + "loss": 0.7801463, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10522461, + "step": 11573, + "time_per_iteration": 2.646585702896118 + }, + { + "auxiliary_loss_clip": 0.064174, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254935, + "epoch": 0.6958665263790771, + "flos": 19214817609600.0, + "grad_norm": 1.6543405774844155, + "language_loss": 0.75180942, + "learning_rate": 8.941880995966095e-07, + "loss": 0.82864642, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11364746, + "step": 11574, + "time_per_iteration": 2.5017471313476562 + }, + { + "auxiliary_loss_clip": 0.06413898, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01254996, + "epoch": 0.6959266496317451, + "flos": 21801797366400.0, + "grad_norm": 1.6788443251259586, + "language_loss": 0.74745572, + "learning_rate": 8.938636040849014e-07, + "loss": 0.8242479, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10327148, + "step": 11575, + "time_per_iteration": 2.5528361797332764 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01258248, + "epoch": 0.695986772884413, + "flos": 20564490850560.0, + "grad_norm": 1.717283083984882, + "language_loss": 0.79060346, + "learning_rate": 8.935391505179966e-07, + "loss": 0.86738789, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10845947, + "step": 11576, + "time_per_iteration": 2.4801833629608154 + }, + { + "auxiliary_loss_clip": 0.06413432, + "auxiliary_loss_mlp": 0.01262741, + "balance_loss_clip": 0.06272326, + "balance_loss_mlp": 0.01252191, + "epoch": 0.696046896137081, + "flos": 14940980582400.0, + "grad_norm": 2.5670489052023404, + "language_loss": 0.57032454, + "learning_rate": 8.932147389081985e-07, + "loss": 0.64708626, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10559082, + "step": 11577, + "time_per_iteration": 2.502033233642578 + }, + { + "auxiliary_loss_clip": 0.06404924, + "auxiliary_loss_mlp": 0.01266503, + "balance_loss_clip": 0.06274053, + "balance_loss_mlp": 0.01257521, + "epoch": 0.696107019389749, + "flos": 30748569217920.0, + "grad_norm": 1.378295678041548, + "language_loss": 0.76719046, + "learning_rate": 8.928903692678081e-07, + "loss": 0.84390473, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08984375, + "step": 11578, + "time_per_iteration": 2.605837821960449 + }, + { + "auxiliary_loss_clip": 0.06414018, + "auxiliary_loss_mlp": 0.01262965, + "balance_loss_clip": 0.0627658, + "balance_loss_mlp": 0.01253249, + "epoch": 0.696167142642417, + "flos": 20782935244800.0, + "grad_norm": 3.119426120413718, + "language_loss": 0.79773849, + "learning_rate": 8.925660416091254e-07, + "loss": 0.87450832, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09716797, + "step": 11579, + "time_per_iteration": 2.5537924766540527 + }, + { + "auxiliary_loss_clip": 0.06405934, + "auxiliary_loss_mlp": 0.01263768, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01253558, + "epoch": 0.6962272658950849, + "flos": 22571761334400.0, + "grad_norm": 1.5861987374843416, + "language_loss": 0.72813702, + "learning_rate": 8.922417559444502e-07, + "loss": 0.80483407, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10205078, + "step": 11580, + "time_per_iteration": 2.5217056274414062 + }, + { + "auxiliary_loss_clip": 0.0641515, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06275546, + "balance_loss_mlp": 0.01255896, + "epoch": 0.6962873891477529, + "flos": 22206681095040.0, + "grad_norm": 2.1085212775747975, + "language_loss": 0.66371673, + "learning_rate": 8.919175122860787e-07, + "loss": 0.74054492, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11767578, + "step": 11581, + "time_per_iteration": 2.5470681190490723 + }, + { + "auxiliary_loss_clip": 0.06415606, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06278277, + "balance_loss_mlp": 0.01253726, + "epoch": 0.6963475124004208, + "flos": 12493718709120.0, + "grad_norm": 3.192459541289618, + "language_loss": 0.76738924, + "learning_rate": 8.915933106463056e-07, + "loss": 0.84417772, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09509277, + "step": 11582, + "time_per_iteration": 2.5975067615509033 + }, + { + "auxiliary_loss_clip": 0.06411912, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.01256355, + "epoch": 0.6964076356530888, + "flos": 17170762383360.0, + "grad_norm": 2.14882454800848, + "language_loss": 0.70161986, + "learning_rate": 8.91269151037425e-07, + "loss": 0.77840543, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10290527, + "step": 11583, + "time_per_iteration": 3.9500138759613037 + }, + { + "auxiliary_loss_clip": 0.06410628, + "auxiliary_loss_mlp": 0.01268947, + "balance_loss_clip": 0.06274879, + "balance_loss_mlp": 0.01258272, + "epoch": 0.6964677589057569, + "flos": 19943342933760.0, + "grad_norm": 1.7749969250449007, + "language_loss": 0.82683307, + "learning_rate": 8.909450334717301e-07, + "loss": 0.90362883, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10681152, + "step": 11584, + "time_per_iteration": 2.5435311794281006 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.06271736, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6965278821584248, + "flos": 22790708853120.0, + "grad_norm": 2.098465309846489, + "language_loss": 0.79802585, + "learning_rate": 8.906209579615107e-07, + "loss": 0.87481719, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1126709, + "step": 11585, + "time_per_iteration": 2.490299701690674 + }, + { + "auxiliary_loss_clip": 0.06406368, + "auxiliary_loss_mlp": 0.01265153, + "balance_loss_clip": 0.06273674, + "balance_loss_mlp": 0.01255735, + "epoch": 0.6965880054110928, + "flos": 20053739088000.0, + "grad_norm": 1.7604905238703683, + "language_loss": 0.77940738, + "learning_rate": 8.90296924519055e-07, + "loss": 0.85612255, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09411621, + "step": 11586, + "time_per_iteration": 2.5373406410217285 + }, + { + "auxiliary_loss_clip": 0.06404427, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06273477, + "balance_loss_mlp": 0.0125706, + "epoch": 0.6966481286637607, + "flos": 21914709143040.0, + "grad_norm": 1.8539557700987637, + "language_loss": 0.78935838, + "learning_rate": 8.899729331566519e-07, + "loss": 0.86607027, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09698486, + "step": 11587, + "time_per_iteration": 2.4801838397979736 + }, + { + "auxiliary_loss_clip": 0.06406583, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254915, + "epoch": 0.6967082519164287, + "flos": 15638674803840.0, + "grad_norm": 1.9230111566874013, + "language_loss": 0.73017895, + "learning_rate": 8.896489838865857e-07, + "loss": 0.80689335, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09936523, + "step": 11588, + "time_per_iteration": 2.488046646118164 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01262306, + "balance_loss_clip": 0.06274327, + "balance_loss_mlp": 0.01252507, + "epoch": 0.6967683751690966, + "flos": 24031453386240.0, + "grad_norm": 2.0364063263002885, + "language_loss": 0.74887639, + "learning_rate": 8.893250767211413e-07, + "loss": 0.82561255, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09802246, + "step": 11589, + "time_per_iteration": 2.548539400100708 + }, + { + "auxiliary_loss_clip": 0.06411868, + "auxiliary_loss_mlp": 0.01265329, + "balance_loss_clip": 0.06274883, + "balance_loss_mlp": 0.01254773, + "epoch": 0.6968284984217646, + "flos": 31031862272640.0, + "grad_norm": 4.3993143538672275, + "language_loss": 0.63862813, + "learning_rate": 8.890012116726012e-07, + "loss": 0.71539998, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10552979, + "step": 11590, + "time_per_iteration": 2.6050679683685303 + }, + { + "auxiliary_loss_clip": 0.06316171, + "auxiliary_loss_mlp": 0.01251394, + "balance_loss_clip": 0.06259812, + "balance_loss_mlp": 0.0125019, + "epoch": 0.6968886216744326, + "flos": 67642888475520.0, + "grad_norm": 0.7383814790063842, + "language_loss": 0.6120699, + "learning_rate": 8.88677388753248e-07, + "loss": 0.68774557, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01203156, + "step": 11591, + "time_per_iteration": 3.205728530883789 + }, + { + "auxiliary_loss_clip": 0.06413443, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256539, + "epoch": 0.6969487449271006, + "flos": 24870668353920.0, + "grad_norm": 1.4802717401382182, + "language_loss": 0.69663697, + "learning_rate": 8.883536079753582e-07, + "loss": 0.77344704, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 11592, + "time_per_iteration": 2.530959367752075 + }, + { + "auxiliary_loss_clip": 0.06411387, + "auxiliary_loss_mlp": 0.01269289, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01259132, + "epoch": 0.6970088681797685, + "flos": 28775525927040.0, + "grad_norm": 1.753602003372511, + "language_loss": 0.62838447, + "learning_rate": 8.880298693512109e-07, + "loss": 0.70519125, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10150146, + "step": 11593, + "time_per_iteration": 2.5508384704589844 + }, + { + "auxiliary_loss_clip": 0.06406593, + "auxiliary_loss_mlp": 0.01263771, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.012547, + "epoch": 0.6970689914324365, + "flos": 27316001583360.0, + "grad_norm": 1.3874621408455479, + "language_loss": 0.54750943, + "learning_rate": 8.877061728930832e-07, + "loss": 0.6242131, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09069824, + "step": 11594, + "time_per_iteration": 2.559556484222412 + }, + { + "auxiliary_loss_clip": 0.06411646, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274341, + "balance_loss_mlp": 0.01254106, + "epoch": 0.6971291146851044, + "flos": 19142422081920.0, + "grad_norm": 1.79939196206485, + "language_loss": 0.77473152, + "learning_rate": 8.87382518613248e-07, + "loss": 0.85149086, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11595, + "time_per_iteration": 3.9267494678497314 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126537, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01254611, + "epoch": 0.6971892379377724, + "flos": 14615661905280.0, + "grad_norm": 2.356908454706418, + "language_loss": 0.72375011, + "learning_rate": 8.870589065239793e-07, + "loss": 0.80052996, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10766602, + "step": 11596, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.0641246, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275618, + "balance_loss_mlp": 0.0125368, + "epoch": 0.6972493611904405, + "flos": 22313639232000.0, + "grad_norm": 1.9958593203679207, + "language_loss": 0.76570636, + "learning_rate": 8.867353366375492e-07, + "loss": 0.84247619, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10839844, + "step": 11597, + "time_per_iteration": 3.9746484756469727 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06272379, + "balance_loss_mlp": 0.01257232, + "epoch": 0.6973094844431084, + "flos": 17426075374080.0, + "grad_norm": 1.890364129189079, + "language_loss": 0.74871194, + "learning_rate": 8.864118089662267e-07, + "loss": 0.82547033, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 11598, + "time_per_iteration": 2.4967358112335205 + }, + { + "auxiliary_loss_clip": 0.06416015, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06276817, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6973696076957764, + "flos": 27242767514880.0, + "grad_norm": 1.672066699636808, + "language_loss": 0.89636326, + "learning_rate": 8.860883235222791e-07, + "loss": 0.97319448, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10998535, + "step": 11599, + "time_per_iteration": 2.5665690898895264 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01269073, + "balance_loss_clip": 0.06277397, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6974297309484443, + "flos": 22024644099840.0, + "grad_norm": 1.8416467781869745, + "language_loss": 0.70383334, + "learning_rate": 8.85764880317974e-07, + "loss": 0.78073853, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11090088, + "step": 11600, + "time_per_iteration": 2.491593360900879 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01254958, + "epoch": 0.6974898542011123, + "flos": 28374038288640.0, + "grad_norm": 1.5173038128226022, + "language_loss": 0.76574016, + "learning_rate": 8.854414793655771e-07, + "loss": 0.84249556, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10498047, + "step": 11601, + "time_per_iteration": 4.1049439907073975 + }, + { + "auxiliary_loss_clip": 0.06404468, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06272994, + "balance_loss_mlp": 0.01255615, + "epoch": 0.6975499774537802, + "flos": 15237522581760.0, + "grad_norm": 1.8655763623744426, + "language_loss": 0.72371268, + "learning_rate": 8.851181206773508e-07, + "loss": 0.80041194, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09851074, + "step": 11602, + "time_per_iteration": 2.5268797874450684 + }, + { + "auxiliary_loss_clip": 0.06410255, + "auxiliary_loss_mlp": 0.01265285, + "balance_loss_clip": 0.06275497, + "balance_loss_mlp": 0.01255343, + "epoch": 0.6976101007064482, + "flos": 22162894536960.0, + "grad_norm": 2.1937279130738365, + "language_loss": 0.77231717, + "learning_rate": 8.847948042655567e-07, + "loss": 0.84907258, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09942627, + "step": 11603, + "time_per_iteration": 2.4806923866271973 + }, + { + "auxiliary_loss_clip": 0.06408552, + "auxiliary_loss_mlp": 0.01263968, + "balance_loss_clip": 0.06273254, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6976702239591162, + "flos": 22280124798720.0, + "grad_norm": 1.4370854048834028, + "language_loss": 0.62313223, + "learning_rate": 8.844715301424557e-07, + "loss": 0.69985747, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09759521, + "step": 11604, + "time_per_iteration": 2.556675910949707 + }, + { + "auxiliary_loss_clip": 0.06411324, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.06273848, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6977303472117842, + "flos": 25855722552960.0, + "grad_norm": 2.158609093070266, + "language_loss": 0.8206296, + "learning_rate": 8.841482983203057e-07, + "loss": 0.89739883, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11120605, + "step": 11605, + "time_per_iteration": 2.5453009605407715 + }, + { + "auxiliary_loss_clip": 0.06408873, + "auxiliary_loss_mlp": 0.01266358, + "balance_loss_clip": 0.0627379, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6977904704644521, + "flos": 20965894634880.0, + "grad_norm": 1.4817287317876005, + "language_loss": 0.7024073, + "learning_rate": 8.838251088113638e-07, + "loss": 0.77915967, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 11606, + "time_per_iteration": 2.524181604385376 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01265998, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01255221, + "epoch": 0.6978505937171201, + "flos": 22061680404480.0, + "grad_norm": 2.145616317364061, + "language_loss": 0.82643318, + "learning_rate": 8.835019616278856e-07, + "loss": 0.90320545, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10772705, + "step": 11607, + "time_per_iteration": 2.4895663261413574 + }, + { + "auxiliary_loss_clip": 0.06416652, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.01254201, + "epoch": 0.697910716969788, + "flos": 20049252894720.0, + "grad_norm": 2.008483115639311, + "language_loss": 0.79149514, + "learning_rate": 8.831788567821265e-07, + "loss": 0.86831373, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11004639, + "step": 11608, + "time_per_iteration": 2.517848014831543 + }, + { + "auxiliary_loss_clip": 0.06411079, + "auxiliary_loss_mlp": 0.01264975, + "balance_loss_clip": 0.06272355, + "balance_loss_mlp": 0.0125461, + "epoch": 0.697970840222456, + "flos": 15893736232320.0, + "grad_norm": 1.856773515642951, + "language_loss": 0.9026711, + "learning_rate": 8.828557942863357e-07, + "loss": 0.97943169, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10357666, + "step": 11609, + "time_per_iteration": 2.464045763015747 + }, + { + "auxiliary_loss_clip": 0.06410901, + "auxiliary_loss_mlp": 0.01262705, + "balance_loss_clip": 0.06270923, + "balance_loss_mlp": 0.01252965, + "epoch": 0.698030963475124, + "flos": 21222088093440.0, + "grad_norm": 1.4134029282176452, + "language_loss": 0.64230514, + "learning_rate": 8.82532774152765e-07, + "loss": 0.71904123, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09747314, + "step": 11610, + "time_per_iteration": 2.5426440238952637 + }, + { + "auxiliary_loss_clip": 0.06407233, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.0125446, + "epoch": 0.698091086727792, + "flos": 33767113029120.0, + "grad_norm": 1.5536592755713354, + "language_loss": 0.84326196, + "learning_rate": 8.822097963936643e-07, + "loss": 0.91997612, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.097229, + "step": 11611, + "time_per_iteration": 2.6129181385040283 + }, + { + "auxiliary_loss_clip": 0.06411347, + "auxiliary_loss_mlp": 0.01264511, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01253752, + "epoch": 0.69815120998046, + "flos": 15893275034880.0, + "grad_norm": 1.864564945323593, + "language_loss": 0.70917654, + "learning_rate": 8.818868610212793e-07, + "loss": 0.78593516, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10754395, + "step": 11612, + "time_per_iteration": 2.4869654178619385 + }, + { + "auxiliary_loss_clip": 0.06406604, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254096, + "epoch": 0.6982113332331279, + "flos": 18952041605760.0, + "grad_norm": 1.4951443393996662, + "language_loss": 0.81150031, + "learning_rate": 8.815639680478573e-07, + "loss": 0.88821077, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10345459, + "step": 11613, + "time_per_iteration": 2.4747042655944824 + }, + { + "auxiliary_loss_clip": 0.06409472, + "auxiliary_loss_mlp": 0.01267068, + "balance_loss_clip": 0.06274355, + "balance_loss_mlp": 0.01257335, + "epoch": 0.6982714564857959, + "flos": 24396533625600.0, + "grad_norm": 1.8067810947897194, + "language_loss": 0.75539565, + "learning_rate": 8.812411174856411e-07, + "loss": 0.83216107, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 11614, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06272974, + "balance_loss_mlp": 0.0125817, + "epoch": 0.6983315797384638, + "flos": 20089852997760.0, + "grad_norm": 1.9161960736489865, + "language_loss": 0.77505577, + "learning_rate": 8.809183093468746e-07, + "loss": 0.85182202, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10162354, + "step": 11615, + "time_per_iteration": 2.4810245037078857 + }, + { + "auxiliary_loss_clip": 0.06403261, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 0.06272578, + "balance_loss_mlp": 0.01253048, + "epoch": 0.6983917029911318, + "flos": 13516815461760.0, + "grad_norm": 1.8844428750511293, + "language_loss": 0.73254174, + "learning_rate": 8.80595543643797e-07, + "loss": 0.80920184, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09692383, + "step": 11616, + "time_per_iteration": 2.4856157302856445 + }, + { + "auxiliary_loss_clip": 0.06408458, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254091, + "epoch": 0.6984518262437998, + "flos": 22025021443200.0, + "grad_norm": 1.4724184586515745, + "language_loss": 0.84294975, + "learning_rate": 8.802728203886487e-07, + "loss": 0.91967505, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09979248, + "step": 11617, + "time_per_iteration": 2.503758668899536 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.0627649, + "balance_loss_mlp": 0.01257035, + "epoch": 0.6985119494964678, + "flos": 18776587155840.0, + "grad_norm": 2.0634899151280623, + "language_loss": 0.59477413, + "learning_rate": 8.799501395936682e-07, + "loss": 0.67161554, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10668945, + "step": 11618, + "time_per_iteration": 2.502458333969116 + }, + { + "auxiliary_loss_clip": 0.06411035, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 0.06276886, + "balance_loss_mlp": 0.0125307, + "epoch": 0.6985720727491357, + "flos": 22389430849920.0, + "grad_norm": 2.158587147069475, + "language_loss": 0.83073372, + "learning_rate": 8.796275012710903e-07, + "loss": 0.9074744, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0994873, + "step": 11619, + "time_per_iteration": 2.4989545345306396 + }, + { + "auxiliary_loss_clip": 0.06409271, + "auxiliary_loss_mlp": 0.01266979, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01258152, + "epoch": 0.6986321960018037, + "flos": 39577398048000.0, + "grad_norm": 1.554266189454373, + "language_loss": 0.67337298, + "learning_rate": 8.793049054331494e-07, + "loss": 0.75013542, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08825684, + "step": 11620, + "time_per_iteration": 2.765410900115967 + }, + { + "auxiliary_loss_clip": 0.06411748, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273868, + "balance_loss_mlp": 0.01256621, + "epoch": 0.6986923192544716, + "flos": 17973528024960.0, + "grad_norm": 2.4474211013812432, + "language_loss": 0.73446906, + "learning_rate": 8.789823520920794e-07, + "loss": 0.81125557, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.1027832, + "step": 11621, + "time_per_iteration": 2.4840140342712402 + }, + { + "auxiliary_loss_clip": 0.06412227, + "auxiliary_loss_mlp": 0.01264203, + "balance_loss_clip": 0.06272949, + "balance_loss_mlp": 0.01253737, + "epoch": 0.6987524425071396, + "flos": 25601583519360.0, + "grad_norm": 1.724040192260788, + "language_loss": 0.68410677, + "learning_rate": 8.7865984126011e-07, + "loss": 0.76087105, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10461426, + "step": 11622, + "time_per_iteration": 3.950021743774414 + }, + { + "auxiliary_loss_clip": 0.06409498, + "auxiliary_loss_mlp": 0.01267194, + "balance_loss_clip": 0.0627782, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6988125657598077, + "flos": 17535842622720.0, + "grad_norm": 1.8022622371846757, + "language_loss": 0.62591398, + "learning_rate": 8.783373729494721e-07, + "loss": 0.70268083, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09814453, + "step": 11623, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.06415178, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01254817, + "epoch": 0.6988726890124756, + "flos": 39175029941760.0, + "grad_norm": 1.7670185249526673, + "language_loss": 0.60458779, + "learning_rate": 8.780149471723932e-07, + "loss": 0.68139207, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10430908, + "step": 11624, + "time_per_iteration": 2.6375675201416016 + }, + { + "auxiliary_loss_clip": 0.06411561, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06272775, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6989328122651436, + "flos": 20199662173440.0, + "grad_norm": 1.5069469972343055, + "language_loss": 0.78510606, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8619014, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11625, + "time_per_iteration": 2.534061908721924 + }, + { + "auxiliary_loss_clip": 0.06406638, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257256, + "epoch": 0.6989929355178115, + "flos": 21841265439360.0, + "grad_norm": 1.6759866105601053, + "language_loss": 0.66316259, + "learning_rate": 8.773702232678188e-07, + "loss": 0.73989534, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09381104, + "step": 11626, + "time_per_iteration": 2.4902937412261963 + }, + { + "auxiliary_loss_clip": 0.06411765, + "auxiliary_loss_mlp": 0.0126589, + "balance_loss_clip": 0.06275335, + "balance_loss_mlp": 0.01255733, + "epoch": 0.6990530587704795, + "flos": 26330066916480.0, + "grad_norm": 2.0325683536698205, + "language_loss": 0.70813847, + "learning_rate": 8.770479251647697e-07, + "loss": 0.78491497, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10144043, + "step": 11627, + "time_per_iteration": 2.5748379230499268 + }, + { + "auxiliary_loss_clip": 0.0640467, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6991131820231474, + "flos": 19835168912640.0, + "grad_norm": 1.7164277105253158, + "language_loss": 0.62609565, + "learning_rate": 8.767256696441768e-07, + "loss": 0.70278424, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 11628, + "time_per_iteration": 2.4829564094543457 + }, + { + "auxiliary_loss_clip": 0.06410889, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06272821, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6991733052758154, + "flos": 33993271998720.0, + "grad_norm": 1.816957818772296, + "language_loss": 0.68972111, + "learning_rate": 8.764034567182581e-07, + "loss": 0.76648998, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10412598, + "step": 11629, + "time_per_iteration": 2.6509320735931396 + }, + { + "auxiliary_loss_clip": 0.06409748, + "auxiliary_loss_mlp": 0.01265873, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.0125515, + "epoch": 0.6992334285284834, + "flos": 15638632876800.0, + "grad_norm": 1.5060784407018701, + "language_loss": 0.72445923, + "learning_rate": 8.760812863992337e-07, + "loss": 0.80121547, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1072998, + "step": 11630, + "time_per_iteration": 2.4783284664154053 + }, + { + "auxiliary_loss_clip": 0.0641311, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06278898, + "balance_loss_mlp": 0.01255943, + "epoch": 0.6992935517811514, + "flos": 21732797928960.0, + "grad_norm": 1.7108311606213942, + "language_loss": 0.74144894, + "learning_rate": 8.757591586993196e-07, + "loss": 0.81823862, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09912109, + "step": 11631, + "time_per_iteration": 2.5788233280181885 + }, + { + "auxiliary_loss_clip": 0.06419384, + "auxiliary_loss_mlp": 0.01269329, + "balance_loss_clip": 0.0628057, + "balance_loss_mlp": 0.01258022, + "epoch": 0.6993536750338193, + "flos": 20120558319360.0, + "grad_norm": 2.3602125436995105, + "language_loss": 0.89111435, + "learning_rate": 8.7543707363073e-07, + "loss": 0.96800154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11303711, + "step": 11632, + "time_per_iteration": 2.473422050476074 + }, + { + "auxiliary_loss_clip": 0.06414177, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06276321, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6994137982864873, + "flos": 22015839421440.0, + "grad_norm": 1.6028389301274413, + "language_loss": 0.79952157, + "learning_rate": 8.751150312056792e-07, + "loss": 0.87631214, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10009766, + "step": 11633, + "time_per_iteration": 2.513282060623169 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06276365, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6994739215391552, + "flos": 25525875755520.0, + "grad_norm": 1.8057869627886596, + "language_loss": 0.67083466, + "learning_rate": 8.747930314363794e-07, + "loss": 0.7476657, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11712646, + "step": 11634, + "time_per_iteration": 3.9409241676330566 + }, + { + "auxiliary_loss_clip": 0.06321115, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_clip": 0.06264269, + "balance_loss_mlp": 0.01250645, + "epoch": 0.6995340447918232, + "flos": 59147931438720.0, + "grad_norm": 0.6717939190194797, + "language_loss": 0.53298014, + "learning_rate": 8.744710743350412e-07, + "loss": 0.6087091, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.0113678, + "step": 11635, + "time_per_iteration": 3.2486236095428467 + }, + { + "auxiliary_loss_clip": 0.06412114, + "auxiliary_loss_mlp": 0.01264348, + "balance_loss_clip": 0.06275758, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6995941680444913, + "flos": 17973653806080.0, + "grad_norm": 1.479923932232007, + "language_loss": 0.8206256, + "learning_rate": 8.741491599138726e-07, + "loss": 0.89739013, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.1071167, + "step": 11636, + "time_per_iteration": 2.516813039779663 + }, + { + "auxiliary_loss_clip": 0.06416257, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6996542912971592, + "flos": 21986391911040.0, + "grad_norm": 3.1669516008633813, + "language_loss": 0.83141685, + "learning_rate": 8.738272881850801e-07, + "loss": 0.90824091, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10430908, + "step": 11637, + "time_per_iteration": 3.917647123336792 + }, + { + "auxiliary_loss_clip": 0.06409974, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.0125584, + "epoch": 0.6997144145498272, + "flos": 11689904891520.0, + "grad_norm": 1.7413253088603204, + "language_loss": 0.68017536, + "learning_rate": 8.735054591608704e-07, + "loss": 0.75693905, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10559082, + "step": 11638, + "time_per_iteration": 2.455333709716797 + }, + { + "auxiliary_loss_clip": 0.06417674, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01255244, + "epoch": 0.6997745378024951, + "flos": 29614992456960.0, + "grad_norm": 1.8583897053492529, + "language_loss": 0.77953184, + "learning_rate": 8.731836728534459e-07, + "loss": 0.85638303, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.12200928, + "step": 11639, + "time_per_iteration": 2.5732390880584717 + }, + { + "auxiliary_loss_clip": 0.06415096, + "auxiliary_loss_mlp": 0.01267452, + "balance_loss_clip": 0.06277713, + "balance_loss_mlp": 0.01256842, + "epoch": 0.6998346610551631, + "flos": 20892912128640.0, + "grad_norm": 1.9224229885402988, + "language_loss": 0.83357054, + "learning_rate": 8.728619292750093e-07, + "loss": 0.91039604, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11640, + "time_per_iteration": 2.518707275390625 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01265078, + "balance_loss_clip": 0.06273933, + "balance_loss_mlp": 0.01255422, + "epoch": 0.699894784307831, + "flos": 27170539695360.0, + "grad_norm": 1.6039437808829469, + "language_loss": 0.75522578, + "learning_rate": 8.725402284377619e-07, + "loss": 0.83196306, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09655762, + "step": 11641, + "time_per_iteration": 4.078887701034546 + }, + { + "auxiliary_loss_clip": 0.06412257, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275941, + "balance_loss_mlp": 0.01256361, + "epoch": 0.699954907560499, + "flos": 20930032287360.0, + "grad_norm": 1.8680055959443465, + "language_loss": 0.77721083, + "learning_rate": 8.722185703539022e-07, + "loss": 0.85399896, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10192871, + "step": 11642, + "time_per_iteration": 2.500046730041504 + }, + { + "auxiliary_loss_clip": 0.0641754, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01253592, + "epoch": 0.700015030813167, + "flos": 28665339408000.0, + "grad_norm": 2.533169755671386, + "language_loss": 0.74393576, + "learning_rate": 8.718969550356266e-07, + "loss": 0.82076585, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11883545, + "step": 11643, + "time_per_iteration": 2.5775840282440186 + }, + { + "auxiliary_loss_clip": 0.06414674, + "auxiliary_loss_mlp": 0.01264637, + "balance_loss_clip": 0.06276005, + "balance_loss_mlp": 0.01254362, + "epoch": 0.700075154065835, + "flos": 29212959767040.0, + "grad_norm": 1.5245425147272047, + "language_loss": 0.60040998, + "learning_rate": 8.715753824951315e-07, + "loss": 0.67720306, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.1027832, + "step": 11644, + "time_per_iteration": 2.552072286605835 + }, + { + "auxiliary_loss_clip": 0.06407935, + "auxiliary_loss_mlp": 0.01271385, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01260579, + "epoch": 0.7001352773185029, + "flos": 23119130131200.0, + "grad_norm": 1.5458952120749485, + "language_loss": 0.82132351, + "learning_rate": 8.712538527446119e-07, + "loss": 0.89811671, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10809326, + "step": 11645, + "time_per_iteration": 2.558337450027466 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06274226, + "balance_loss_mlp": 0.01256743, + "epoch": 0.7001954005711709, + "flos": 21328962376320.0, + "grad_norm": 2.5779246493483177, + "language_loss": 0.68295795, + "learning_rate": 8.709323657962584e-07, + "loss": 0.75970346, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09985352, + "step": 11646, + "time_per_iteration": 2.5126430988311768 + }, + { + "auxiliary_loss_clip": 0.06410798, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.0125371, + "epoch": 0.7002555238238388, + "flos": 24542834054400.0, + "grad_norm": 1.467898418777351, + "language_loss": 0.71547973, + "learning_rate": 8.706109216622635e-07, + "loss": 0.7922281, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10339355, + "step": 11647, + "time_per_iteration": 2.5304250717163086 + }, + { + "auxiliary_loss_clip": 0.06414019, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01258041, + "epoch": 0.7003156470765068, + "flos": 39065891598720.0, + "grad_norm": 1.749288264158044, + "language_loss": 0.72289455, + "learning_rate": 8.702895203548155e-07, + "loss": 0.79972911, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1138916, + "step": 11648, + "time_per_iteration": 2.678863525390625 + }, + { + "auxiliary_loss_clip": 0.06409213, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.01257377, + "epoch": 0.7003757703291749, + "flos": 28811723690880.0, + "grad_norm": 1.4492190580209505, + "language_loss": 0.77860492, + "learning_rate": 8.699681618861014e-07, + "loss": 0.85537332, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10247803, + "step": 11649, + "time_per_iteration": 2.558931589126587 + }, + { + "auxiliary_loss_clip": 0.06409431, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257421, + "epoch": 0.7004358935818428, + "flos": 15958123695360.0, + "grad_norm": 1.4433792721312992, + "language_loss": 0.78238451, + "learning_rate": 8.69646846268308e-07, + "loss": 0.85915029, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 11650, + "time_per_iteration": 2.461639642715454 + }, + { + "auxiliary_loss_clip": 0.06409653, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06273135, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7004960168345108, + "flos": 20418148494720.0, + "grad_norm": 2.0802744101319406, + "language_loss": 0.78669983, + "learning_rate": 8.693255735136194e-07, + "loss": 0.86345768, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09881592, + "step": 11651, + "time_per_iteration": 2.500000238418579 + }, + { + "auxiliary_loss_clip": 0.06420258, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06280224, + "balance_loss_mlp": 0.01258649, + "epoch": 0.7005561400871787, + "flos": 17353260576000.0, + "grad_norm": 1.5099151755448044, + "language_loss": 0.70310026, + "learning_rate": 8.690043436342198e-07, + "loss": 0.7799933, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10388184, + "step": 11652, + "time_per_iteration": 2.4739015102386475 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01263486, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7006162633398467, + "flos": 25309276151040.0, + "grad_norm": 1.323517960695476, + "language_loss": 0.74456298, + "learning_rate": 8.686831566422874e-07, + "loss": 0.82133621, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.0982666, + "step": 11653, + "time_per_iteration": 2.532655954360962 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01263141, + "balance_loss_clip": 0.06278478, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7006763865925146, + "flos": 20675473983360.0, + "grad_norm": 2.0288883835732228, + "language_loss": 0.70729959, + "learning_rate": 8.68362012550003e-07, + "loss": 0.78410637, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10430908, + "step": 11654, + "time_per_iteration": 2.519660711288452 + }, + { + "auxiliary_loss_clip": 0.06415437, + "auxiliary_loss_mlp": 0.0126811, + "balance_loss_clip": 0.06277716, + "balance_loss_mlp": 0.01256696, + "epoch": 0.7007365098451827, + "flos": 20052439349760.0, + "grad_norm": 2.2628281377067134, + "language_loss": 0.72993428, + "learning_rate": 8.680409113695453e-07, + "loss": 0.80676985, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11413574, + "step": 11655, + "time_per_iteration": 2.48612117767334 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.01271007, + "balance_loss_clip": 0.06280498, + "balance_loss_mlp": 0.01259062, + "epoch": 0.7007966330978506, + "flos": 20783689931520.0, + "grad_norm": 1.9221196897273614, + "language_loss": 0.70366073, + "learning_rate": 8.677198531130889e-07, + "loss": 0.78061986, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11950684, + "step": 11656, + "time_per_iteration": 2.4856395721435547 + }, + { + "auxiliary_loss_clip": 0.06408404, + "auxiliary_loss_mlp": 0.01266899, + "balance_loss_clip": 0.06273983, + "balance_loss_mlp": 0.01257123, + "epoch": 0.7008567563505186, + "flos": 29645110800000.0, + "grad_norm": 1.5392970097639627, + "language_loss": 0.78185248, + "learning_rate": 8.673988377928092e-07, + "loss": 0.8586055, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09783936, + "step": 11657, + "time_per_iteration": 2.5812113285064697 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268196, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01257229, + "epoch": 0.7009168796031865, + "flos": 17097654096000.0, + "grad_norm": 2.227553712273129, + "language_loss": 0.78159571, + "learning_rate": 8.670778654208797e-07, + "loss": 0.85847604, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10968018, + "step": 11658, + "time_per_iteration": 2.4778008460998535 + }, + { + "auxiliary_loss_clip": 0.0640991, + "auxiliary_loss_mlp": 0.0126385, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01254099, + "epoch": 0.7009770028558545, + "flos": 20455226726400.0, + "grad_norm": 1.6635136984807588, + "language_loss": 0.83274609, + "learning_rate": 8.667569360094713e-07, + "loss": 0.90948367, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09747314, + "step": 11659, + "time_per_iteration": 2.4965016841888428 + }, + { + "auxiliary_loss_clip": 0.06406507, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01256296, + "epoch": 0.7010371261085224, + "flos": 19251225008640.0, + "grad_norm": 2.205019124031737, + "language_loss": 0.69561887, + "learning_rate": 8.664360495707526e-07, + "loss": 0.77234095, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09411621, + "step": 11660, + "time_per_iteration": 2.4827144145965576 + }, + { + "auxiliary_loss_clip": 0.06414962, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275482, + "balance_loss_mlp": 0.01256134, + "epoch": 0.7010972493611904, + "flos": 22134159786240.0, + "grad_norm": 2.0869897578232295, + "language_loss": 0.81401628, + "learning_rate": 8.661152061168924e-07, + "loss": 0.89083141, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10412598, + "step": 11661, + "time_per_iteration": 3.9388158321380615 + }, + { + "auxiliary_loss_clip": 0.06407215, + "auxiliary_loss_mlp": 0.01264683, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7011573726138585, + "flos": 31398619593600.0, + "grad_norm": 1.8643289831680394, + "language_loss": 0.79429448, + "learning_rate": 8.657944056600579e-07, + "loss": 0.87101352, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10137939, + "step": 11662, + "time_per_iteration": 2.6265618801116943 + }, + { + "auxiliary_loss_clip": 0.06416287, + "auxiliary_loss_mlp": 0.01267119, + "balance_loss_clip": 0.06277344, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7012174958665264, + "flos": 18156487415040.0, + "grad_norm": 1.6800388441509395, + "language_loss": 0.83806753, + "learning_rate": 8.654736482124134e-07, + "loss": 0.91490161, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10821533, + "step": 11663, + "time_per_iteration": 2.488739252090454 + }, + { + "auxiliary_loss_clip": 0.06318727, + "auxiliary_loss_mlp": 0.01250759, + "balance_loss_clip": 0.06262303, + "balance_loss_mlp": 0.012494, + "epoch": 0.7012776191191944, + "flos": 60669495331200.0, + "grad_norm": 0.8224381055881935, + "language_loss": 0.5391866, + "learning_rate": 8.651529337861209e-07, + "loss": 0.6148814, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01361084, + "step": 11664, + "time_per_iteration": 3.160693645477295 + }, + { + "auxiliary_loss_clip": 0.06413987, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06275371, + "balance_loss_mlp": 0.01256731, + "epoch": 0.7013377423718623, + "flos": 27205940845440.0, + "grad_norm": 1.7370315255440756, + "language_loss": 0.79090619, + "learning_rate": 8.64832262393344e-07, + "loss": 0.86771721, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1038208, + "step": 11665, + "time_per_iteration": 2.5398123264312744 + }, + { + "auxiliary_loss_clip": 0.06412809, + "auxiliary_loss_mlp": 0.01262516, + "balance_loss_clip": 0.06277609, + "balance_loss_mlp": 0.01252563, + "epoch": 0.7013978656245303, + "flos": 16548901706880.0, + "grad_norm": 2.00554211734292, + "language_loss": 0.76867342, + "learning_rate": 8.645116340462404e-07, + "loss": 0.84542668, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09954834, + "step": 11666, + "time_per_iteration": 2.4652414321899414 + }, + { + "auxiliary_loss_clip": 0.0641577, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06279963, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7014579888771982, + "flos": 23149625817600.0, + "grad_norm": 1.7866180274258885, + "language_loss": 0.81048751, + "learning_rate": 8.641910487569695e-07, + "loss": 0.88731629, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10388184, + "step": 11667, + "time_per_iteration": 2.5062241554260254 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275474, + "balance_loss_mlp": 0.01255917, + "epoch": 0.7015181121298663, + "flos": 25089028894080.0, + "grad_norm": 2.0567499658134087, + "language_loss": 0.65901959, + "learning_rate": 8.638705065376879e-07, + "loss": 0.73578274, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10443115, + "step": 11668, + "time_per_iteration": 2.6001944541931152 + }, + { + "auxiliary_loss_clip": 0.06415643, + "auxiliary_loss_mlp": 0.01266119, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01255248, + "epoch": 0.7015782353825342, + "flos": 23334052654080.0, + "grad_norm": 1.636860913695636, + "language_loss": 0.76856339, + "learning_rate": 8.635500074005519e-07, + "loss": 0.84538102, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10870361, + "step": 11669, + "time_per_iteration": 2.580120801925659 + }, + { + "auxiliary_loss_clip": 0.06316374, + "auxiliary_loss_mlp": 0.01249475, + "balance_loss_clip": 0.06259722, + "balance_loss_mlp": 0.01248101, + "epoch": 0.7016383586352022, + "flos": 70417733086080.0, + "grad_norm": 0.683633883002792, + "language_loss": 0.54477966, + "learning_rate": 8.632295513577122e-07, + "loss": 0.62043816, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01376343, + "step": 11670, + "time_per_iteration": 3.239391565322876 + }, + { + "auxiliary_loss_clip": 0.06410887, + "auxiliary_loss_mlp": 0.01266693, + "balance_loss_clip": 0.06276417, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7016984818878701, + "flos": 19798426097280.0, + "grad_norm": 1.5820465602747873, + "language_loss": 0.81851846, + "learning_rate": 8.629091384213218e-07, + "loss": 0.89529431, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10247803, + "step": 11671, + "time_per_iteration": 2.5156307220458984 + }, + { + "auxiliary_loss_clip": 0.06415814, + "auxiliary_loss_mlp": 0.01265108, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254611, + "epoch": 0.7017586051405381, + "flos": 12901998528000.0, + "grad_norm": 1.7162410726978943, + "language_loss": 0.74825186, + "learning_rate": 8.625887686035313e-07, + "loss": 0.82506108, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10498047, + "step": 11672, + "time_per_iteration": 2.4657065868377686 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01267901, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01256922, + "epoch": 0.701818728393206, + "flos": 18338734045440.0, + "grad_norm": 1.6561114230567193, + "language_loss": 0.87079096, + "learning_rate": 8.622684419164883e-07, + "loss": 0.94763196, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10980225, + "step": 11673, + "time_per_iteration": 2.51084303855896 + }, + { + "auxiliary_loss_clip": 0.06411691, + "auxiliary_loss_mlp": 0.01268986, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01258502, + "epoch": 0.701878851645874, + "flos": 17389961464320.0, + "grad_norm": 1.7599431551764082, + "language_loss": 0.73397923, + "learning_rate": 8.619481583723399e-07, + "loss": 0.81078601, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10491943, + "step": 11674, + "time_per_iteration": 3.8845224380493164 + }, + { + "auxiliary_loss_clip": 0.06408197, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01255173, + "epoch": 0.701938974898542, + "flos": 23922398897280.0, + "grad_norm": 1.5893184098427633, + "language_loss": 0.72403145, + "learning_rate": 8.616279179832329e-07, + "loss": 0.80076146, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09631348, + "step": 11675, + "time_per_iteration": 2.535900115966797 + }, + { + "auxiliary_loss_clip": 0.06414977, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06276758, + "balance_loss_mlp": 0.01257047, + "epoch": 0.70199909815121, + "flos": 21801503877120.0, + "grad_norm": 2.0246464203601278, + "language_loss": 0.51067138, + "learning_rate": 8.613077207613078e-07, + "loss": 0.58750093, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.109375, + "step": 11676, + "time_per_iteration": 2.555906057357788 + }, + { + "auxiliary_loss_clip": 0.06319048, + "auxiliary_loss_mlp": 0.01249904, + "balance_loss_clip": 0.06262474, + "balance_loss_mlp": 0.01248563, + "epoch": 0.702059221403878, + "flos": 71736575224320.0, + "grad_norm": 0.7224738346499476, + "language_loss": 0.59202904, + "learning_rate": 8.609875667187079e-07, + "loss": 0.66771859, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01343536, + "step": 11677, + "time_per_iteration": 4.580153942108154 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06275491, + "balance_loss_mlp": 0.0125582, + "epoch": 0.7021193446565459, + "flos": 28118599516800.0, + "grad_norm": 1.944945343813431, + "language_loss": 0.6293093, + "learning_rate": 8.606674558675737e-07, + "loss": 0.70610511, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 11678, + "time_per_iteration": 2.652944803237915 + }, + { + "auxiliary_loss_clip": 0.06410077, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7021794679092139, + "flos": 22930720225920.0, + "grad_norm": 1.5864608475530155, + "language_loss": 0.7993412, + "learning_rate": 8.603473882200444e-07, + "loss": 0.87608963, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09991455, + "step": 11679, + "time_per_iteration": 2.517608404159546 + }, + { + "auxiliary_loss_clip": 0.06410368, + "auxiliary_loss_mlp": 0.01263633, + "balance_loss_clip": 0.06277052, + "balance_loss_mlp": 0.01254615, + "epoch": 0.7022395911618818, + "flos": 18083756471040.0, + "grad_norm": 2.1970830940848614, + "language_loss": 0.70462888, + "learning_rate": 8.600273637882567e-07, + "loss": 0.78136891, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09014893, + "step": 11680, + "time_per_iteration": 2.4937846660614014 + }, + { + "auxiliary_loss_clip": 0.06416643, + "auxiliary_loss_mlp": 0.01267202, + "balance_loss_clip": 0.06276958, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7022997144145499, + "flos": 16039827025920.0, + "grad_norm": 1.5993399056299638, + "language_loss": 0.74800062, + "learning_rate": 8.597073825843446e-07, + "loss": 0.82483912, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10791016, + "step": 11681, + "time_per_iteration": 3.912652015686035 + }, + { + "auxiliary_loss_clip": 0.06407465, + "auxiliary_loss_mlp": 0.01264961, + "balance_loss_clip": 0.06273095, + "balance_loss_mlp": 0.01254536, + "epoch": 0.7023598376672178, + "flos": 26475864220800.0, + "grad_norm": 1.529501150189484, + "language_loss": 0.77074146, + "learning_rate": 8.593874446204434e-07, + "loss": 0.84746575, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10424805, + "step": 11682, + "time_per_iteration": 2.5244510173797607 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06274539, + "balance_loss_mlp": 0.01255625, + "epoch": 0.7024199609198858, + "flos": 17061624040320.0, + "grad_norm": 2.0146711656624947, + "language_loss": 0.73610115, + "learning_rate": 8.590675499086841e-07, + "loss": 0.81292146, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11627197, + "step": 11683, + "time_per_iteration": 2.4807722568511963 + }, + { + "auxiliary_loss_clip": 0.06412771, + "auxiliary_loss_mlp": 0.01265673, + "balance_loss_clip": 0.06278127, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7024800841725537, + "flos": 25856225677440.0, + "grad_norm": 1.8616488886702496, + "language_loss": 0.7201761, + "learning_rate": 8.587476984611976e-07, + "loss": 0.79696059, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.11212158, + "step": 11684, + "time_per_iteration": 2.5248489379882812 + }, + { + "auxiliary_loss_clip": 0.06409675, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257741, + "epoch": 0.7025402074252217, + "flos": 23519653447680.0, + "grad_norm": 2.2560693638667386, + "language_loss": 0.72109079, + "learning_rate": 8.584278902901128e-07, + "loss": 0.79787153, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10668945, + "step": 11685, + "time_per_iteration": 2.5545883178710938 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01264141, + "balance_loss_clip": 0.06274469, + "balance_loss_mlp": 0.01254021, + "epoch": 0.7026003306778896, + "flos": 20156169104640.0, + "grad_norm": 1.6059462262520903, + "language_loss": 0.8497479, + "learning_rate": 8.581081254075582e-07, + "loss": 0.92650867, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10119629, + "step": 11686, + "time_per_iteration": 2.4869866371154785 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01250458, + "balance_loss_clip": 0.06255314, + "balance_loss_mlp": 0.01249239, + "epoch": 0.7026604539305576, + "flos": 64791036362880.0, + "grad_norm": 0.9748591985428325, + "language_loss": 0.6989513, + "learning_rate": 8.577884038256566e-07, + "loss": 0.77457231, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01217651, + "step": 11687, + "time_per_iteration": 3.2795140743255615 + }, + { + "auxiliary_loss_clip": 0.06411874, + "auxiliary_loss_mlp": 0.01269631, + "balance_loss_clip": 0.06276284, + "balance_loss_mlp": 0.01259421, + "epoch": 0.7027205771832256, + "flos": 21877882473600.0, + "grad_norm": 2.1687744057978575, + "language_loss": 0.7759158, + "learning_rate": 8.574687255565329e-07, + "loss": 0.85273087, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11688, + "time_per_iteration": 2.506697416305542 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01263217, + "balance_loss_clip": 0.06273778, + "balance_loss_mlp": 0.0125287, + "epoch": 0.7027807004358936, + "flos": 23374526976000.0, + "grad_norm": 2.0500924601059687, + "language_loss": 0.69007778, + "learning_rate": 8.571490906123107e-07, + "loss": 0.76680183, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10339355, + "step": 11689, + "time_per_iteration": 2.526963472366333 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01267707, + "balance_loss_clip": 0.0627338, + "balance_loss_mlp": 0.01255834, + "epoch": 0.7028408236885616, + "flos": 15309624620160.0, + "grad_norm": 2.4528764604041977, + "language_loss": 0.79761183, + "learning_rate": 8.568294990051086e-07, + "loss": 0.87441605, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11871338, + "step": 11690, + "time_per_iteration": 2.5314319133758545 + }, + { + "auxiliary_loss_clip": 0.06412818, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06277384, + "balance_loss_mlp": 0.01258677, + "epoch": 0.7029009469412295, + "flos": 22024769880960.0, + "grad_norm": 1.8333973382314617, + "language_loss": 0.75588238, + "learning_rate": 8.56509950747047e-07, + "loss": 0.83270478, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11691, + "time_per_iteration": 2.5446360111236572 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01264486, + "balance_loss_clip": 0.06278588, + "balance_loss_mlp": 0.0125449, + "epoch": 0.7029610701938975, + "flos": 21842020126080.0, + "grad_norm": 1.7290780486458988, + "language_loss": 0.81951666, + "learning_rate": 8.561904458502429e-07, + "loss": 0.89629078, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09997559, + "step": 11692, + "time_per_iteration": 2.475939989089966 + }, + { + "auxiliary_loss_clip": 0.06407632, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01253577, + "epoch": 0.7030211934465654, + "flos": 19141709322240.0, + "grad_norm": 1.4786815492141234, + "language_loss": 0.76637983, + "learning_rate": 8.558709843268111e-07, + "loss": 0.84310281, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11090088, + "step": 11693, + "time_per_iteration": 2.523207664489746 + }, + { + "auxiliary_loss_clip": 0.06409247, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274758, + "balance_loss_mlp": 0.01256959, + "epoch": 0.7030813166992335, + "flos": 38555307544320.0, + "grad_norm": 3.0680910714990945, + "language_loss": 0.685, + "learning_rate": 8.55551566188866e-07, + "loss": 0.76176739, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10522461, + "step": 11694, + "time_per_iteration": 2.6671559810638428 + }, + { + "auxiliary_loss_clip": 0.06413712, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06276645, + "balance_loss_mlp": 0.01255105, + "epoch": 0.7031414399519014, + "flos": 14726225767680.0, + "grad_norm": 2.01117706312431, + "language_loss": 0.75637174, + "learning_rate": 8.552321914485203e-07, + "loss": 0.83316225, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10235596, + "step": 11695, + "time_per_iteration": 2.508373975753784 + }, + { + "auxiliary_loss_clip": 0.0642024, + "auxiliary_loss_mlp": 0.01270249, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01258644, + "epoch": 0.7032015632045694, + "flos": 14032388833920.0, + "grad_norm": 1.954001814184471, + "language_loss": 0.74258196, + "learning_rate": 8.549128601178852e-07, + "loss": 0.81948686, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11602783, + "step": 11696, + "time_per_iteration": 2.4646289348602295 + }, + { + "auxiliary_loss_clip": 0.06413354, + "auxiliary_loss_mlp": 0.01266085, + "balance_loss_clip": 0.06275193, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7032616864572373, + "flos": 27644716350720.0, + "grad_norm": 7.188542829701478, + "language_loss": 0.75876927, + "learning_rate": 8.545935722090693e-07, + "loss": 0.83556366, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11108398, + "step": 11697, + "time_per_iteration": 2.564423084259033 + }, + { + "auxiliary_loss_clip": 0.06411704, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06273724, + "balance_loss_mlp": 0.01257508, + "epoch": 0.7033218097099053, + "flos": 17973024900480.0, + "grad_norm": 1.6931225387398507, + "language_loss": 0.80683148, + "learning_rate": 8.542743277341793e-07, + "loss": 0.88363487, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11126709, + "step": 11698, + "time_per_iteration": 2.4535627365112305 + }, + { + "auxiliary_loss_clip": 0.0641105, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.01255239, + "epoch": 0.7033819329625732, + "flos": 19508047372800.0, + "grad_norm": 1.3566537423348073, + "language_loss": 0.84644032, + "learning_rate": 8.539551267053222e-07, + "loss": 0.92321312, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10974121, + "step": 11699, + "time_per_iteration": 2.5543456077575684 + }, + { + "auxiliary_loss_clip": 0.06408502, + "auxiliary_loss_mlp": 0.01265387, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7034420562152413, + "flos": 23994417081600.0, + "grad_norm": 1.970773248623371, + "language_loss": 0.7962184, + "learning_rate": 8.53635969134601e-07, + "loss": 0.87295729, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10760498, + "step": 11700, + "time_per_iteration": 2.4985594749450684 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01264767, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01253507, + "epoch": 0.7035021794679092, + "flos": 35052147244800.0, + "grad_norm": 1.812061465534113, + "language_loss": 0.74477667, + "learning_rate": 8.533168550341186e-07, + "loss": 0.82155174, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11254883, + "step": 11701, + "time_per_iteration": 4.042437314987183 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01264422, + "balance_loss_clip": 0.06275072, + "balance_loss_mlp": 0.01253246, + "epoch": 0.7035623027205772, + "flos": 11001811962240.0, + "grad_norm": 2.072031067866928, + "language_loss": 0.83952713, + "learning_rate": 8.529977844159769e-07, + "loss": 0.91632634, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11175537, + "step": 11702, + "time_per_iteration": 2.5586178302764893 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01264208, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7036224259732452, + "flos": 23630594653440.0, + "grad_norm": 1.6523267572786273, + "language_loss": 0.61088848, + "learning_rate": 8.526787572922738e-07, + "loss": 0.68764472, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1038208, + "step": 11703, + "time_per_iteration": 2.521512985229492 + }, + { + "auxiliary_loss_clip": 0.06413552, + "auxiliary_loss_mlp": 0.01266937, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01255869, + "epoch": 0.7036825492259131, + "flos": 31694239198080.0, + "grad_norm": 1.8799008475861942, + "language_loss": 0.61646456, + "learning_rate": 8.523597736751067e-07, + "loss": 0.69326943, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11065674, + "step": 11704, + "time_per_iteration": 2.637000560760498 + }, + { + "auxiliary_loss_clip": 0.06406493, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.01258109, + "epoch": 0.7037426724785811, + "flos": 30201116567040.0, + "grad_norm": 1.5166852635712837, + "language_loss": 0.70736712, + "learning_rate": 8.520408335765719e-07, + "loss": 0.78410971, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09667969, + "step": 11705, + "time_per_iteration": 2.5815892219543457 + }, + { + "auxiliary_loss_clip": 0.06409339, + "auxiliary_loss_mlp": 0.01265192, + "balance_loss_clip": 0.06274589, + "balance_loss_mlp": 0.01254833, + "epoch": 0.703802795731249, + "flos": 24317597479680.0, + "grad_norm": 1.8692688199911445, + "language_loss": 0.61916155, + "learning_rate": 8.517219370087645e-07, + "loss": 0.69590688, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10351562, + "step": 11706, + "time_per_iteration": 2.537567615509033 + }, + { + "auxiliary_loss_clip": 0.06410844, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01254061, + "epoch": 0.7038629189839171, + "flos": 22535605497600.0, + "grad_norm": 2.4391424281987506, + "language_loss": 0.68479651, + "learning_rate": 8.514030839837756e-07, + "loss": 0.76156104, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11560059, + "step": 11707, + "time_per_iteration": 2.4984869956970215 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06272735, + "balance_loss_mlp": 0.01257101, + "epoch": 0.703923042236585, + "flos": 26257755242880.0, + "grad_norm": 1.9008341016793249, + "language_loss": 0.76335013, + "learning_rate": 8.510842745136974e-07, + "loss": 0.84009075, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10058594, + "step": 11708, + "time_per_iteration": 2.552219867706299 + }, + { + "auxiliary_loss_clip": 0.06407606, + "auxiliary_loss_mlp": 0.01261422, + "balance_loss_clip": 0.06274488, + "balance_loss_mlp": 0.01251313, + "epoch": 0.703983165489253, + "flos": 19396225699200.0, + "grad_norm": 1.582678176456311, + "language_loss": 0.7205376, + "learning_rate": 8.50765508610619e-07, + "loss": 0.79722786, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10107422, + "step": 11709, + "time_per_iteration": 2.479956865310669 + }, + { + "auxiliary_loss_clip": 0.06409952, + "auxiliary_loss_mlp": 0.01266177, + "balance_loss_clip": 0.06274274, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7040432887419209, + "flos": 16688032611840.0, + "grad_norm": 1.9337929130323093, + "language_loss": 0.79638529, + "learning_rate": 8.504467862866267e-07, + "loss": 0.87314653, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10076904, + "step": 11710, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.06415999, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06278241, + "balance_loss_mlp": 0.01255674, + "epoch": 0.7041034119945889, + "flos": 21147638140800.0, + "grad_norm": 1.663598845140954, + "language_loss": 0.77776545, + "learning_rate": 8.501281075538076e-07, + "loss": 0.85458803, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.105896, + "step": 11711, + "time_per_iteration": 2.500640392303467 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06276608, + "balance_loss_mlp": 0.01255237, + "epoch": 0.7041635352472568, + "flos": 16916036371200.0, + "grad_norm": 1.9928632293831094, + "language_loss": 0.7447651, + "learning_rate": 8.498094724242457e-07, + "loss": 0.82153022, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10583496, + "step": 11712, + "time_per_iteration": 2.501585006713867 + }, + { + "auxiliary_loss_clip": 0.06320854, + "auxiliary_loss_mlp": 0.01257118, + "balance_loss_clip": 0.06264362, + "balance_loss_mlp": 0.01255823, + "epoch": 0.7042236584999249, + "flos": 71703186572160.0, + "grad_norm": 0.8590002483868424, + "language_loss": 0.64672804, + "learning_rate": 8.494908809100247e-07, + "loss": 0.72250772, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01295471, + "step": 11713, + "time_per_iteration": 4.5734851360321045 + }, + { + "auxiliary_loss_clip": 0.06410141, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 0.06274079, + "balance_loss_mlp": 0.01252991, + "epoch": 0.7042837817525928, + "flos": 28665800605440.0, + "grad_norm": 1.9680516689018257, + "language_loss": 0.72915512, + "learning_rate": 8.49172333023225e-07, + "loss": 0.80588698, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1005249, + "step": 11714, + "time_per_iteration": 2.5535781383514404 + }, + { + "auxiliary_loss_clip": 0.06411086, + "auxiliary_loss_mlp": 0.01268594, + "balance_loss_clip": 0.06275805, + "balance_loss_mlp": 0.01256757, + "epoch": 0.7043439050052608, + "flos": 19759335367680.0, + "grad_norm": 2.3616586102145805, + "language_loss": 0.80244958, + "learning_rate": 8.488538287759248e-07, + "loss": 0.87924635, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11828613, + "step": 11715, + "time_per_iteration": 2.4991419315338135 + }, + { + "auxiliary_loss_clip": 0.06414278, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627607, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7044040282579288, + "flos": 11541969308160.0, + "grad_norm": 1.9765202948162532, + "language_loss": 0.71383488, + "learning_rate": 8.485353681802037e-07, + "loss": 0.79064858, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10931396, + "step": 11716, + "time_per_iteration": 3.9245705604553223 + }, + { + "auxiliary_loss_clip": 0.06418915, + "auxiliary_loss_mlp": 0.01264541, + "balance_loss_clip": 0.06277251, + "balance_loss_mlp": 0.01253783, + "epoch": 0.7044641515105967, + "flos": 33664473377280.0, + "grad_norm": 1.7730534730356675, + "language_loss": 0.66482782, + "learning_rate": 8.482169512481358e-07, + "loss": 0.74166238, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10760498, + "step": 11717, + "time_per_iteration": 2.6029398441314697 + }, + { + "auxiliary_loss_clip": 0.06415347, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01256011, + "epoch": 0.7045242747632647, + "flos": 26731051430400.0, + "grad_norm": 1.5043477958415044, + "language_loss": 0.74609149, + "learning_rate": 8.478985779917967e-07, + "loss": 0.82290918, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 11718, + "time_per_iteration": 2.574075937271118 + }, + { + "auxiliary_loss_clip": 0.06412348, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06277113, + "balance_loss_mlp": 0.01254224, + "epoch": 0.7045843980159326, + "flos": 26804998258560.0, + "grad_norm": 1.5984477962629227, + "language_loss": 0.80229437, + "learning_rate": 8.475802484232606e-07, + "loss": 0.8790642, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10412598, + "step": 11719, + "time_per_iteration": 2.557602643966675 + }, + { + "auxiliary_loss_clip": 0.0641358, + "auxiliary_loss_mlp": 0.01263485, + "balance_loss_clip": 0.06277666, + "balance_loss_mlp": 0.01252524, + "epoch": 0.7046445212686007, + "flos": 41584710458880.0, + "grad_norm": 1.6868566975802164, + "language_loss": 0.65635586, + "learning_rate": 8.472619625545951e-07, + "loss": 0.73312646, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10961914, + "step": 11720, + "time_per_iteration": 4.092779159545898 + }, + { + "auxiliary_loss_clip": 0.06422915, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7047046445212686, + "flos": 15565650370560.0, + "grad_norm": 2.147768548041585, + "language_loss": 0.8022362, + "learning_rate": 8.46943720397872e-07, + "loss": 0.87912714, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10986328, + "step": 11721, + "time_per_iteration": 2.4634041786193848 + }, + { + "auxiliary_loss_clip": 0.06318594, + "auxiliary_loss_mlp": 0.01253531, + "balance_loss_clip": 0.06262027, + "balance_loss_mlp": 0.01252384, + "epoch": 0.7047647677739366, + "flos": 70433036455680.0, + "grad_norm": 0.7472916144331851, + "language_loss": 0.64821076, + "learning_rate": 8.466255219651582e-07, + "loss": 0.72393203, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01146698, + "step": 11722, + "time_per_iteration": 3.2447893619537354 + }, + { + "auxiliary_loss_clip": 0.06410772, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06275559, + "balance_loss_mlp": 0.01253536, + "epoch": 0.7048248910266045, + "flos": 23666876271360.0, + "grad_norm": 2.268842508315268, + "language_loss": 0.66067719, + "learning_rate": 8.463073672685211e-07, + "loss": 0.73742604, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10571289, + "step": 11723, + "time_per_iteration": 2.556645154953003 + }, + { + "auxiliary_loss_clip": 0.06413794, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7048850142792725, + "flos": 21403496183040.0, + "grad_norm": 1.9667058211108481, + "language_loss": 0.80938751, + "learning_rate": 8.459892563200235e-07, + "loss": 0.88616407, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11151123, + "step": 11724, + "time_per_iteration": 2.521294116973877 + }, + { + "auxiliary_loss_clip": 0.06412652, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01252619, + "epoch": 0.7049451375319404, + "flos": 21653736001920.0, + "grad_norm": 1.878825511688235, + "language_loss": 0.73036087, + "learning_rate": 8.456711891317296e-07, + "loss": 0.80711973, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10620117, + "step": 11725, + "time_per_iteration": 2.491532325744629 + }, + { + "auxiliary_loss_clip": 0.06419054, + "auxiliary_loss_mlp": 0.01266944, + "balance_loss_clip": 0.06278444, + "balance_loss_mlp": 0.01256275, + "epoch": 0.7050052607846085, + "flos": 14872148853120.0, + "grad_norm": 1.93227359409925, + "language_loss": 0.78747177, + "learning_rate": 8.453531657156998e-07, + "loss": 0.86433172, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10668945, + "step": 11726, + "time_per_iteration": 2.625894069671631 + }, + { + "auxiliary_loss_clip": 0.06411958, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.06275987, + "balance_loss_mlp": 0.01258273, + "epoch": 0.7050653840372764, + "flos": 19247283866880.0, + "grad_norm": 2.1540780661141374, + "language_loss": 0.70452571, + "learning_rate": 8.450351860839931e-07, + "loss": 0.78133154, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10351562, + "step": 11727, + "time_per_iteration": 2.540519952774048 + }, + { + "auxiliary_loss_clip": 0.06403094, + "auxiliary_loss_mlp": 0.01263675, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01254752, + "epoch": 0.7051255072899444, + "flos": 27787536835200.0, + "grad_norm": 1.531115099301347, + "language_loss": 0.69006073, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7667284, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08917236, + "step": 11728, + "time_per_iteration": 2.5793302059173584 + }, + { + "auxiliary_loss_clip": 0.06412704, + "auxiliary_loss_mlp": 0.0126399, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.01253773, + "epoch": 0.7051856305426124, + "flos": 27899526216960.0, + "grad_norm": 1.8133071590962522, + "language_loss": 0.73397171, + "learning_rate": 8.443993582217803e-07, + "loss": 0.81073868, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10211182, + "step": 11729, + "time_per_iteration": 2.632077693939209 + }, + { + "auxiliary_loss_clip": 0.06421916, + "auxiliary_loss_mlp": 0.01265278, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01253775, + "epoch": 0.7052457537952803, + "flos": 25050147799680.0, + "grad_norm": 1.613038649768226, + "language_loss": 0.78167063, + "learning_rate": 8.440815100153862e-07, + "loss": 0.8585425, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1149292, + "step": 11730, + "time_per_iteration": 2.5648131370544434 + }, + { + "auxiliary_loss_clip": 0.06414882, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06275609, + "balance_loss_mlp": 0.0125698, + "epoch": 0.7053058770479483, + "flos": 21878175962880.0, + "grad_norm": 2.325298368428052, + "language_loss": 0.62874782, + "learning_rate": 8.437637056415359e-07, + "loss": 0.70558047, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11401367, + "step": 11731, + "time_per_iteration": 2.546156167984009 + }, + { + "auxiliary_loss_clip": 0.06416281, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.06275978, + "balance_loss_mlp": 0.01258679, + "epoch": 0.7053660003006162, + "flos": 16404236432640.0, + "grad_norm": 1.9339047251972874, + "language_loss": 0.74811733, + "learning_rate": 8.434459451122815e-07, + "loss": 0.82498294, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1159668, + "step": 11732, + "time_per_iteration": 2.4927430152893066 + }, + { + "auxiliary_loss_clip": 0.06408133, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06274602, + "balance_loss_mlp": 0.01256534, + "epoch": 0.7054261235532843, + "flos": 22718271398400.0, + "grad_norm": 1.4288707050417415, + "language_loss": 0.71580064, + "learning_rate": 8.431282284396735e-07, + "loss": 0.79254997, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1026001, + "step": 11733, + "time_per_iteration": 2.543832540512085 + }, + { + "auxiliary_loss_clip": 0.06411871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01258154, + "epoch": 0.7054862468059522, + "flos": 13594829212800.0, + "grad_norm": 1.9266065814345037, + "language_loss": 0.73917806, + "learning_rate": 8.428105556357583e-07, + "loss": 0.81597924, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10095215, + "step": 11734, + "time_per_iteration": 2.496680736541748 + }, + { + "auxiliary_loss_clip": 0.06421253, + "auxiliary_loss_mlp": 0.01273046, + "balance_loss_clip": 0.06277873, + "balance_loss_mlp": 0.0126184, + "epoch": 0.7055463700586202, + "flos": 15884931553920.0, + "grad_norm": 4.995085142451974, + "language_loss": 0.70442164, + "learning_rate": 8.424929267125829e-07, + "loss": 0.78136462, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11206055, + "step": 11735, + "time_per_iteration": 2.560451030731201 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.01257955, + "epoch": 0.7056064933112881, + "flos": 23082890440320.0, + "grad_norm": 1.6821797399985068, + "language_loss": 0.72724199, + "learning_rate": 8.421753416821933e-07, + "loss": 0.80408299, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.12182617, + "step": 11736, + "time_per_iteration": 2.5113935470581055 + }, + { + "auxiliary_loss_clip": 0.06410478, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06277382, + "balance_loss_mlp": 0.01254356, + "epoch": 0.7056666165639561, + "flos": 24063374592000.0, + "grad_norm": 1.617495345914111, + "language_loss": 0.69220245, + "learning_rate": 8.41857800556629e-07, + "loss": 0.7689532, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10235596, + "step": 11737, + "time_per_iteration": 2.5327107906341553 + }, + { + "auxiliary_loss_clip": 0.06416027, + "auxiliary_loss_mlp": 0.01265741, + "balance_loss_clip": 0.06277978, + "balance_loss_mlp": 0.01254279, + "epoch": 0.705726739816624, + "flos": 17498932099200.0, + "grad_norm": 1.8698204681752435, + "language_loss": 0.67921227, + "learning_rate": 8.415403033479332e-07, + "loss": 0.75602996, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11474609, + "step": 11738, + "time_per_iteration": 2.458019733428955 + }, + { + "auxiliary_loss_clip": 0.06411514, + "auxiliary_loss_mlp": 0.01264856, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7057868630692921, + "flos": 51361515256320.0, + "grad_norm": 7.975241590020644, + "language_loss": 0.74895537, + "learning_rate": 8.41222850068145e-07, + "loss": 0.82571906, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10784912, + "step": 11739, + "time_per_iteration": 2.7849392890930176 + }, + { + "auxiliary_loss_clip": 0.0641078, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01252663, + "epoch": 0.70584698632196, + "flos": 26109945440640.0, + "grad_norm": 1.5818256072351289, + "language_loss": 0.71794957, + "learning_rate": 8.409054407293032e-07, + "loss": 0.79468888, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10479736, + "step": 11740, + "time_per_iteration": 4.018102645874023 + }, + { + "auxiliary_loss_clip": 0.06408996, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.06274964, + "balance_loss_mlp": 0.01260939, + "epoch": 0.705907109574628, + "flos": 21549503122560.0, + "grad_norm": 1.4620628375932287, + "language_loss": 0.82029426, + "learning_rate": 8.405880753434434e-07, + "loss": 0.89708912, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09558105, + "step": 11741, + "time_per_iteration": 2.5226922035217285 + }, + { + "auxiliary_loss_clip": 0.06412125, + "auxiliary_loss_mlp": 0.0126669, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01255389, + "epoch": 0.705967232827296, + "flos": 22717432857600.0, + "grad_norm": 1.792685843416777, + "language_loss": 0.7848987, + "learning_rate": 8.402707539225993e-07, + "loss": 0.86168694, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11297607, + "step": 11742, + "time_per_iteration": 2.4881513118743896 + }, + { + "auxiliary_loss_clip": 0.06420448, + "auxiliary_loss_mlp": 0.01267345, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7060273560799639, + "flos": 28698266862720.0, + "grad_norm": 1.447375520003719, + "language_loss": 0.64323652, + "learning_rate": 8.39953476478805e-07, + "loss": 0.72011447, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10919189, + "step": 11743, + "time_per_iteration": 2.5737526416778564 + }, + { + "auxiliary_loss_clip": 0.06413458, + "auxiliary_loss_mlp": 0.01269024, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257693, + "epoch": 0.7060874793326319, + "flos": 15711699237120.0, + "grad_norm": 1.7211358867446458, + "language_loss": 0.65871137, + "learning_rate": 8.396362430240902e-07, + "loss": 0.73553622, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11322021, + "step": 11744, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.06408134, + "auxiliary_loss_mlp": 0.01271135, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01260728, + "epoch": 0.7061476025852998, + "flos": 21513137650560.0, + "grad_norm": 2.025199572577618, + "language_loss": 0.63794267, + "learning_rate": 8.393190535704857e-07, + "loss": 0.71473539, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10394287, + "step": 11745, + "time_per_iteration": 2.52616810798645 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273259, + "balance_loss_mlp": 0.01253311, + "epoch": 0.7062077258379679, + "flos": 28189024473600.0, + "grad_norm": 1.8444242196367828, + "language_loss": 0.71914798, + "learning_rate": 8.390019081300188e-07, + "loss": 0.79588681, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10369873, + "step": 11746, + "time_per_iteration": 2.5588066577911377 + }, + { + "auxiliary_loss_clip": 0.06411352, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257653, + "epoch": 0.7062678490906358, + "flos": 27860854757760.0, + "grad_norm": 1.5188195218955072, + "language_loss": 0.79773951, + "learning_rate": 8.386848067147175e-07, + "loss": 0.87453461, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10510254, + "step": 11747, + "time_per_iteration": 2.5661420822143555 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01264357, + "balance_loss_clip": 0.06277459, + "balance_loss_mlp": 0.01254307, + "epoch": 0.7063279723433038, + "flos": 23191483731840.0, + "grad_norm": 1.5251666611578065, + "language_loss": 0.65140951, + "learning_rate": 8.383677493366031e-07, + "loss": 0.72817194, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 11748, + "time_per_iteration": 2.5165350437164307 + }, + { + "auxiliary_loss_clip": 0.06412359, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.0125548, + "epoch": 0.7063880955959717, + "flos": 20194043950080.0, + "grad_norm": 1.8580174500745112, + "language_loss": 0.79421908, + "learning_rate": 8.380507360077003e-07, + "loss": 0.87100631, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 11749, + "time_per_iteration": 2.5304911136627197 + }, + { + "auxiliary_loss_clip": 0.06318866, + "auxiliary_loss_mlp": 0.01253368, + "balance_loss_clip": 0.06263049, + "balance_loss_mlp": 0.01252189, + "epoch": 0.7064482188486397, + "flos": 63685020395520.0, + "grad_norm": 0.7869711578789559, + "language_loss": 0.54065382, + "learning_rate": 8.377337667400304e-07, + "loss": 0.61637622, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11750, + "time_per_iteration": 3.118065118789673 + }, + { + "auxiliary_loss_clip": 0.06410946, + "auxiliary_loss_mlp": 0.01265459, + "balance_loss_clip": 0.06275111, + "balance_loss_mlp": 0.01254623, + "epoch": 0.7065083421013076, + "flos": 25198125310080.0, + "grad_norm": 1.6339849961789776, + "language_loss": 0.78829509, + "learning_rate": 8.37416841545612e-07, + "loss": 0.86505914, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 11751, + "time_per_iteration": 2.5452511310577393 + }, + { + "auxiliary_loss_clip": 0.0640781, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01256842, + "epoch": 0.7065684653539757, + "flos": 22900392247680.0, + "grad_norm": 1.6672445306420212, + "language_loss": 0.68168157, + "learning_rate": 8.370999604364634e-07, + "loss": 0.75842696, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09893799, + "step": 11752, + "time_per_iteration": 3.9393372535705566 + }, + { + "auxiliary_loss_clip": 0.06408882, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06275536, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7066285886066436, + "flos": 23557025168640.0, + "grad_norm": 1.8022680768003871, + "language_loss": 0.76729679, + "learning_rate": 8.367831234246025e-07, + "loss": 0.84405589, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10083008, + "step": 11753, + "time_per_iteration": 2.5189971923828125 + }, + { + "auxiliary_loss_clip": 0.06404173, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01255097, + "epoch": 0.7066887118593116, + "flos": 21075661883520.0, + "grad_norm": 1.4940357111697604, + "language_loss": 0.7128973, + "learning_rate": 8.364663305220405e-07, + "loss": 0.78959066, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10076904, + "step": 11754, + "time_per_iteration": 2.5660195350646973 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257491, + "epoch": 0.7067488351119796, + "flos": 21182284604160.0, + "grad_norm": 1.5428805294467156, + "language_loss": 0.89486808, + "learning_rate": 8.361495817407919e-07, + "loss": 0.97163951, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09960938, + "step": 11755, + "time_per_iteration": 2.507603883743286 + }, + { + "auxiliary_loss_clip": 0.06407668, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01257293, + "epoch": 0.7068089583646475, + "flos": 20455520215680.0, + "grad_norm": 1.4982614193498491, + "language_loss": 0.79735661, + "learning_rate": 8.358328770928678e-07, + "loss": 0.87410736, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10119629, + "step": 11756, + "time_per_iteration": 3.994943618774414 + }, + { + "auxiliary_loss_clip": 0.06321511, + "auxiliary_loss_mlp": 0.0125505, + "balance_loss_clip": 0.06265193, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7068690816173155, + "flos": 59125542399360.0, + "grad_norm": 0.8066454127458581, + "language_loss": 0.6018793, + "learning_rate": 8.355162165902785e-07, + "loss": 0.67764497, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01311493, + "step": 11757, + "time_per_iteration": 2.9342048168182373 + }, + { + "auxiliary_loss_clip": 0.06406799, + "auxiliary_loss_mlp": 0.01267209, + "balance_loss_clip": 0.06273741, + "balance_loss_mlp": 0.01256135, + "epoch": 0.7069292048699835, + "flos": 16256845900800.0, + "grad_norm": 2.1598051545702264, + "language_loss": 0.80614579, + "learning_rate": 8.351996002450307e-07, + "loss": 0.88288587, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11071777, + "step": 11758, + "time_per_iteration": 2.4969773292541504 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273986, + "balance_loss_mlp": 0.01256143, + "epoch": 0.7069893281226515, + "flos": 41182468133760.0, + "grad_norm": 1.7333024967156656, + "language_loss": 0.77613515, + "learning_rate": 8.348830280691304e-07, + "loss": 0.85287464, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10754395, + "step": 11759, + "time_per_iteration": 2.6857149600982666 + }, + { + "auxiliary_loss_clip": 0.06407617, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06274342, + "balance_loss_mlp": 0.01254746, + "epoch": 0.7070494513753194, + "flos": 24214203141120.0, + "grad_norm": 1.49498062494056, + "language_loss": 0.68238914, + "learning_rate": 8.34566500074583e-07, + "loss": 0.75912917, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11639404, + "step": 11760, + "time_per_iteration": 4.106550455093384 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06276926, + "balance_loss_mlp": 0.01254354, + "epoch": 0.7071095746279874, + "flos": 20190564005760.0, + "grad_norm": 1.927414071449925, + "language_loss": 0.79955995, + "learning_rate": 8.342500162733899e-07, + "loss": 0.8763513, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10479736, + "step": 11761, + "time_per_iteration": 2.4826464653015137 + }, + { + "auxiliary_loss_clip": 0.0640934, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273883, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7071696978806553, + "flos": 18188282839680.0, + "grad_norm": 2.2121961398440684, + "language_loss": 0.75218999, + "learning_rate": 8.33933576677553e-07, + "loss": 0.82895583, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11352539, + "step": 11762, + "time_per_iteration": 2.4954895973205566 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01264533, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01254579, + "epoch": 0.7072298211333233, + "flos": 24138201888000.0, + "grad_norm": 1.8799497376122591, + "language_loss": 0.77263492, + "learning_rate": 8.336171812990724e-07, + "loss": 0.84933138, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09954834, + "step": 11763, + "time_per_iteration": 2.53564453125 + }, + { + "auxiliary_loss_clip": 0.06407874, + "auxiliary_loss_mlp": 0.01264442, + "balance_loss_clip": 0.062722, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7072899443859912, + "flos": 27205731210240.0, + "grad_norm": 2.480752014730448, + "language_loss": 0.78787279, + "learning_rate": 8.333008301499453e-07, + "loss": 0.86459595, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10949707, + "step": 11764, + "time_per_iteration": 2.652902841567993 + }, + { + "auxiliary_loss_clip": 0.06416324, + "auxiliary_loss_mlp": 0.01267754, + "balance_loss_clip": 0.06276786, + "balance_loss_mlp": 0.01256852, + "epoch": 0.7073500676386593, + "flos": 16441188883200.0, + "grad_norm": 1.6649904523449048, + "language_loss": 0.79710478, + "learning_rate": 8.32984523242167e-07, + "loss": 0.87394559, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10900879, + "step": 11765, + "time_per_iteration": 2.478731632232666 + }, + { + "auxiliary_loss_clip": 0.0640541, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.0125638, + "epoch": 0.7074101908913272, + "flos": 27681291457920.0, + "grad_norm": 1.64401676901429, + "language_loss": 0.69017607, + "learning_rate": 8.326682605877324e-07, + "loss": 0.76688629, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09222412, + "step": 11766, + "time_per_iteration": 2.5636019706726074 + }, + { + "auxiliary_loss_clip": 0.06409839, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01256399, + "epoch": 0.7074703141439952, + "flos": 22244849429760.0, + "grad_norm": 1.7806465184891558, + "language_loss": 0.64121795, + "learning_rate": 8.323520421986352e-07, + "loss": 0.71799058, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11016846, + "step": 11767, + "time_per_iteration": 2.509098529815674 + }, + { + "auxiliary_loss_clip": 0.06408227, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.01253768, + "epoch": 0.7075304373966632, + "flos": 29650980585600.0, + "grad_norm": 1.5320251232109037, + "language_loss": 0.53099549, + "learning_rate": 8.320358680868646e-07, + "loss": 0.60772038, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10491943, + "step": 11768, + "time_per_iteration": 2.5991628170013428 + }, + { + "auxiliary_loss_clip": 0.06404776, + "auxiliary_loss_mlp": 0.01263382, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253565, + "epoch": 0.7075905606493311, + "flos": 19761264011520.0, + "grad_norm": 1.5482480325031622, + "language_loss": 0.75826794, + "learning_rate": 8.317197382644119e-07, + "loss": 0.83494949, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0980835, + "step": 11769, + "time_per_iteration": 2.553248167037964 + }, + { + "auxiliary_loss_clip": 0.063171, + "auxiliary_loss_mlp": 0.01250363, + "balance_loss_clip": 0.06260812, + "balance_loss_mlp": 0.01249205, + "epoch": 0.7076506839019991, + "flos": 65734106866560.0, + "grad_norm": 0.8156037445248981, + "language_loss": 0.6198988, + "learning_rate": 8.314036527432637e-07, + "loss": 0.69557339, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01155853, + "step": 11770, + "time_per_iteration": 3.0812795162200928 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06274459, + "balance_loss_mlp": 0.01254516, + "epoch": 0.707710807154667, + "flos": 23771444567040.0, + "grad_norm": 1.6411438931926623, + "language_loss": 0.76769519, + "learning_rate": 8.310876115354055e-07, + "loss": 0.84447432, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10552979, + "step": 11771, + "time_per_iteration": 2.5363407135009766 + }, + { + "auxiliary_loss_clip": 0.06403352, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06272224, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7077709304073351, + "flos": 21257698878720.0, + "grad_norm": 1.3979456660804543, + "language_loss": 0.71690625, + "learning_rate": 8.307716146528221e-07, + "loss": 0.79359543, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10083008, + "step": 11772, + "time_per_iteration": 2.517993688583374 + }, + { + "auxiliary_loss_clip": 0.06417513, + "auxiliary_loss_mlp": 0.01264872, + "balance_loss_clip": 0.06277703, + "balance_loss_mlp": 0.01253535, + "epoch": 0.707831053660003, + "flos": 20747030970240.0, + "grad_norm": 1.7220446646082324, + "language_loss": 0.69968081, + "learning_rate": 8.30455662107496e-07, + "loss": 0.77650464, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11334229, + "step": 11773, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.06409782, + "auxiliary_loss_mlp": 0.01269179, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01259016, + "epoch": 0.707891176912671, + "flos": 21987440087040.0, + "grad_norm": 1.361330798775882, + "language_loss": 0.70201778, + "learning_rate": 8.301397539114095e-07, + "loss": 0.77880728, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10150146, + "step": 11774, + "time_per_iteration": 2.519763231277466 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01256316, + "epoch": 0.7079513001653389, + "flos": 21075284540160.0, + "grad_norm": 1.498970377219278, + "language_loss": 0.7492069, + "learning_rate": 8.298238900765407e-07, + "loss": 0.82591969, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09802246, + "step": 11775, + "time_per_iteration": 2.5430877208709717 + }, + { + "auxiliary_loss_clip": 0.06415135, + "auxiliary_loss_mlp": 0.01264314, + "balance_loss_clip": 0.06278447, + "balance_loss_mlp": 0.01254014, + "epoch": 0.7080114234180069, + "flos": 18046468604160.0, + "grad_norm": 1.621138107650678, + "language_loss": 0.87510455, + "learning_rate": 8.295080706148665e-07, + "loss": 0.95189905, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10314941, + "step": 11776, + "time_per_iteration": 2.517082691192627 + }, + { + "auxiliary_loss_clip": 0.06408748, + "auxiliary_loss_mlp": 0.01265871, + "balance_loss_clip": 0.0627363, + "balance_loss_mlp": 0.01256096, + "epoch": 0.7080715466706748, + "flos": 15127671479040.0, + "grad_norm": 1.4637417425019663, + "language_loss": 0.75087041, + "learning_rate": 8.291922955383641e-07, + "loss": 0.82761657, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09777832, + "step": 11777, + "time_per_iteration": 2.5164589881896973 + }, + { + "auxiliary_loss_clip": 0.06418398, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01253928, + "epoch": 0.7081316699233429, + "flos": 14427042364800.0, + "grad_norm": 1.984175776722718, + "language_loss": 0.82697594, + "learning_rate": 8.288765648590066e-07, + "loss": 0.903808, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10888672, + "step": 11778, + "time_per_iteration": 2.5013656616210938 + }, + { + "auxiliary_loss_clip": 0.06404569, + "auxiliary_loss_mlp": 0.01264308, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01255213, + "epoch": 0.7081917931760108, + "flos": 23229190869120.0, + "grad_norm": 1.4143364906484888, + "language_loss": 0.84851789, + "learning_rate": 8.285608785887673e-07, + "loss": 0.9252066, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09100342, + "step": 11779, + "time_per_iteration": 2.5495359897613525 + }, + { + "auxiliary_loss_clip": 0.06410395, + "auxiliary_loss_mlp": 0.01264448, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01254321, + "epoch": 0.7082519164286788, + "flos": 39317221520640.0, + "grad_norm": 1.7515830912849983, + "language_loss": 0.7191208, + "learning_rate": 8.28245236739618e-07, + "loss": 0.79586923, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10125732, + "step": 11780, + "time_per_iteration": 4.163387775421143 + }, + { + "auxiliary_loss_clip": 0.06407901, + "auxiliary_loss_mlp": 0.01267276, + "balance_loss_clip": 0.06274346, + "balance_loss_mlp": 0.01257382, + "epoch": 0.7083120396813467, + "flos": 21657299800320.0, + "grad_norm": 1.349993887717698, + "language_loss": 0.73180461, + "learning_rate": 8.279296393235256e-07, + "loss": 0.80855638, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09887695, + "step": 11781, + "time_per_iteration": 2.523428440093994 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06273166, + "balance_loss_mlp": 0.01254625, + "epoch": 0.7083721629340147, + "flos": 17572878927360.0, + "grad_norm": 2.699338792660173, + "language_loss": 0.77578008, + "learning_rate": 8.276140863524585e-07, + "loss": 0.85250056, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10089111, + "step": 11782, + "time_per_iteration": 2.458449363708496 + }, + { + "auxiliary_loss_clip": 0.06406146, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06272672, + "balance_loss_mlp": 0.01254991, + "epoch": 0.7084322861866827, + "flos": 29358086238720.0, + "grad_norm": 1.4360937815095354, + "language_loss": 0.70182502, + "learning_rate": 8.272985778383828e-07, + "loss": 0.77852821, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09173584, + "step": 11783, + "time_per_iteration": 2.5887033939361572 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.0125768, + "epoch": 0.7084924094393507, + "flos": 20200626495360.0, + "grad_norm": 1.5971747704172947, + "language_loss": 0.79307884, + "learning_rate": 8.269831137932632e-07, + "loss": 0.86990702, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 11784, + "time_per_iteration": 2.490954637527466 + }, + { + "auxiliary_loss_clip": 0.0640732, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06272314, + "balance_loss_mlp": 0.01256737, + "epoch": 0.7085525326920187, + "flos": 23483958808320.0, + "grad_norm": 1.617674750849371, + "language_loss": 0.77606887, + "learning_rate": 8.266676942290609e-07, + "loss": 0.85281205, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1026001, + "step": 11785, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.06413119, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06278774, + "balance_loss_mlp": 0.01255934, + "epoch": 0.7086126559446866, + "flos": 25966076780160.0, + "grad_norm": 1.4386102379185288, + "language_loss": 0.78040558, + "learning_rate": 8.26352319157738e-07, + "loss": 0.85719407, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09796143, + "step": 11786, + "time_per_iteration": 2.522735834121704 + }, + { + "auxiliary_loss_clip": 0.06412391, + "auxiliary_loss_mlp": 0.01268502, + "balance_loss_clip": 0.06275783, + "balance_loss_mlp": 0.0125834, + "epoch": 0.7086727791973546, + "flos": 26732141533440.0, + "grad_norm": 1.8351634972642936, + "language_loss": 0.79121733, + "learning_rate": 8.260369885912526e-07, + "loss": 0.86802632, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10162354, + "step": 11787, + "time_per_iteration": 2.5581464767456055 + }, + { + "auxiliary_loss_clip": 0.06412619, + "auxiliary_loss_mlp": 0.01271025, + "balance_loss_clip": 0.06277216, + "balance_loss_mlp": 0.01260475, + "epoch": 0.7087329024500225, + "flos": 21688801735680.0, + "grad_norm": 1.8228289571149952, + "language_loss": 0.76948512, + "learning_rate": 8.257217025415615e-07, + "loss": 0.84632152, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10540771, + "step": 11788, + "time_per_iteration": 2.490006446838379 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01270333, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01259014, + "epoch": 0.7087930257026905, + "flos": 17936827136640.0, + "grad_norm": 2.296634586886211, + "language_loss": 0.67989695, + "learning_rate": 8.254064610206212e-07, + "loss": 0.75680566, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11322021, + "step": 11789, + "time_per_iteration": 2.5101919174194336 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01256111, + "epoch": 0.7088531489553584, + "flos": 18916682382720.0, + "grad_norm": 1.5602629922400044, + "language_loss": 0.77709448, + "learning_rate": 8.250912640403858e-07, + "loss": 0.85387033, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10241699, + "step": 11790, + "time_per_iteration": 2.484931468963623 + }, + { + "auxiliary_loss_clip": 0.06419586, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01255253, + "epoch": 0.7089132722080265, + "flos": 27388229402880.0, + "grad_norm": 1.5308750679240268, + "language_loss": 0.71250129, + "learning_rate": 8.247761116128085e-07, + "loss": 0.78936774, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11798096, + "step": 11791, + "time_per_iteration": 2.583948850631714 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01267551, + "balance_loss_clip": 0.06275617, + "balance_loss_mlp": 0.0125675, + "epoch": 0.7089733954606944, + "flos": 22169309374080.0, + "grad_norm": 1.511652721397476, + "language_loss": 0.82245874, + "learning_rate": 8.244610037498376e-07, + "loss": 0.89924157, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1081543, + "step": 11792, + "time_per_iteration": 3.987499475479126 + }, + { + "auxiliary_loss_clip": 0.06412215, + "auxiliary_loss_mlp": 0.01267904, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01256817, + "epoch": 0.7090335187133624, + "flos": 24432731389440.0, + "grad_norm": 1.9294753325302831, + "language_loss": 0.65135908, + "learning_rate": 8.241459404634232e-07, + "loss": 0.72816032, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11083984, + "step": 11793, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01268973, + "balance_loss_clip": 0.06271678, + "balance_loss_mlp": 0.01258834, + "epoch": 0.7090936419660303, + "flos": 21841684709760.0, + "grad_norm": 1.9925409901798494, + "language_loss": 0.70387089, + "learning_rate": 8.238309217655133e-07, + "loss": 0.78063184, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10144043, + "step": 11794, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06410742, + "auxiliary_loss_mlp": 0.01263848, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01253828, + "epoch": 0.7091537652186983, + "flos": 20088259770240.0, + "grad_norm": 1.8813846026416328, + "language_loss": 0.76058149, + "learning_rate": 8.23515947668052e-07, + "loss": 0.83732742, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10015869, + "step": 11795, + "time_per_iteration": 3.9482054710388184 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06275567, + "balance_loss_mlp": 0.01258812, + "epoch": 0.7092138884713663, + "flos": 13156556832000.0, + "grad_norm": 2.0194589674634242, + "language_loss": 0.75623167, + "learning_rate": 8.232010181829838e-07, + "loss": 0.83305377, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11077881, + "step": 11796, + "time_per_iteration": 2.49794340133667 + }, + { + "auxiliary_loss_clip": 0.06421532, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7092740117240343, + "flos": 21651262306560.0, + "grad_norm": 1.5362456233213855, + "language_loss": 0.74430573, + "learning_rate": 8.228861333222523e-07, + "loss": 0.8211748, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11352539, + "step": 11797, + "time_per_iteration": 2.5082199573516846 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266935, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01256326, + "epoch": 0.7093341349767023, + "flos": 21038835214080.0, + "grad_norm": 1.402262543828535, + "language_loss": 0.79553568, + "learning_rate": 8.225712930977953e-07, + "loss": 0.87231541, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10614014, + "step": 11798, + "time_per_iteration": 2.5451393127441406 + }, + { + "auxiliary_loss_clip": 0.06409004, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01255911, + "epoch": 0.7093942582293702, + "flos": 22024140975360.0, + "grad_norm": 2.0553615011101236, + "language_loss": 0.67001218, + "learning_rate": 8.222564975215529e-07, + "loss": 0.74676633, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10491943, + "step": 11799, + "time_per_iteration": 3.9047088623046875 + }, + { + "auxiliary_loss_clip": 0.06411745, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06276356, + "balance_loss_mlp": 0.01254548, + "epoch": 0.7094543814820382, + "flos": 27243019077120.0, + "grad_norm": 1.5384407371377906, + "language_loss": 0.82004559, + "learning_rate": 8.219417466054622e-07, + "loss": 0.89681768, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10925293, + "step": 11800, + "time_per_iteration": 2.54984188079834 + }, + { + "auxiliary_loss_clip": 0.06408048, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06274039, + "balance_loss_mlp": 0.01255218, + "epoch": 0.7095145047347061, + "flos": 12093237319680.0, + "grad_norm": 1.8049515172262331, + "language_loss": 0.86792338, + "learning_rate": 8.21627040361459e-07, + "loss": 0.94465083, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0947876, + "step": 11801, + "time_per_iteration": 2.472968339920044 + }, + { + "auxiliary_loss_clip": 0.06414308, + "auxiliary_loss_mlp": 0.01268303, + "balance_loss_clip": 0.06278587, + "balance_loss_mlp": 0.01257896, + "epoch": 0.7095746279873741, + "flos": 19388678832000.0, + "grad_norm": 1.9685683260033982, + "language_loss": 0.7659384, + "learning_rate": 8.213123788014758e-07, + "loss": 0.8427645, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10412598, + "step": 11802, + "time_per_iteration": 2.469217300415039 + }, + { + "auxiliary_loss_clip": 0.06413268, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06277166, + "balance_loss_mlp": 0.01259948, + "epoch": 0.709634751240042, + "flos": 21366921075840.0, + "grad_norm": 1.7164711115559128, + "language_loss": 0.81734449, + "learning_rate": 8.209977619374462e-07, + "loss": 0.89418513, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11803, + "time_per_iteration": 2.5675346851348877 + }, + { + "auxiliary_loss_clip": 0.06413771, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.0627571, + "balance_loss_mlp": 0.01256702, + "epoch": 0.7096948744927101, + "flos": 13922034606720.0, + "grad_norm": 2.2508010678544363, + "language_loss": 0.6771282, + "learning_rate": 8.206831897812995e-07, + "loss": 0.75394678, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1137085, + "step": 11804, + "time_per_iteration": 2.4850802421569824 + }, + { + "auxiliary_loss_clip": 0.06406445, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06276047, + "balance_loss_mlp": 0.01259694, + "epoch": 0.709754997745378, + "flos": 30305936424960.0, + "grad_norm": 1.836033307049916, + "language_loss": 0.78141153, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8581689, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0960083, + "step": 11805, + "time_per_iteration": 2.5807907581329346 + }, + { + "auxiliary_loss_clip": 0.06411435, + "auxiliary_loss_mlp": 0.01266806, + "balance_loss_clip": 0.06275858, + "balance_loss_mlp": 0.01256202, + "epoch": 0.709815120998046, + "flos": 18521064529920.0, + "grad_norm": 3.360423816262503, + "language_loss": 0.78911841, + "learning_rate": 8.200541796403667e-07, + "loss": 0.86590087, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1060791, + "step": 11806, + "time_per_iteration": 2.4750113487243652 + }, + { + "auxiliary_loss_clip": 0.06409614, + "auxiliary_loss_mlp": 0.01266594, + "balance_loss_clip": 0.06275766, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7098752442507139, + "flos": 22279034695680.0, + "grad_norm": 3.0880614568331883, + "language_loss": 0.56418979, + "learning_rate": 8.197397416794332e-07, + "loss": 0.64095187, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10095215, + "step": 11807, + "time_per_iteration": 2.5265543460845947 + }, + { + "auxiliary_loss_clip": 0.06416228, + "auxiliary_loss_mlp": 0.01269148, + "balance_loss_clip": 0.06274513, + "balance_loss_mlp": 0.01257686, + "epoch": 0.7099353675033819, + "flos": 19280504810880.0, + "grad_norm": 2.07369456244542, + "language_loss": 0.68290567, + "learning_rate": 8.194253484740882e-07, + "loss": 0.75975943, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11462402, + "step": 11808, + "time_per_iteration": 2.472132444381714 + }, + { + "auxiliary_loss_clip": 0.06414328, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01254512, + "epoch": 0.70999549075605, + "flos": 21915044559360.0, + "grad_norm": 1.9968242899147548, + "language_loss": 0.71669781, + "learning_rate": 8.191110000362513e-07, + "loss": 0.79348707, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10089111, + "step": 11809, + "time_per_iteration": 2.524571180343628 + }, + { + "auxiliary_loss_clip": 0.06322539, + "auxiliary_loss_mlp": 0.01256903, + "balance_loss_clip": 0.06266782, + "balance_loss_mlp": 0.01255681, + "epoch": 0.7100556140087179, + "flos": 70474280192640.0, + "grad_norm": 0.7372364518861584, + "language_loss": 0.59065175, + "learning_rate": 8.187966963778435e-07, + "loss": 0.66644615, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01220703, + "step": 11810, + "time_per_iteration": 3.2093372344970703 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01263776, + "balance_loss_clip": 0.06277919, + "balance_loss_mlp": 0.01253721, + "epoch": 0.7101157372613859, + "flos": 23046273406080.0, + "grad_norm": 1.545725512324635, + "language_loss": 0.74353242, + "learning_rate": 8.18482437510784e-07, + "loss": 0.82030082, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10064697, + "step": 11811, + "time_per_iteration": 2.5427846908569336 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06272991, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7101758605140538, + "flos": 23192028783360.0, + "grad_norm": 1.7044281012631433, + "language_loss": 0.83467686, + "learning_rate": 8.181682234469882e-07, + "loss": 0.91136628, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09442139, + "step": 11812, + "time_per_iteration": 2.5327343940734863 + }, + { + "auxiliary_loss_clip": 0.0641521, + "auxiliary_loss_mlp": 0.0126703, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256659, + "epoch": 0.7102359837667218, + "flos": 23702906327040.0, + "grad_norm": 1.4051092754707344, + "language_loss": 0.69960868, + "learning_rate": 8.178540541983716e-07, + "loss": 0.77643108, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10375977, + "step": 11813, + "time_per_iteration": 2.6402204036712646 + }, + { + "auxiliary_loss_clip": 0.06402316, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7102961070193897, + "flos": 19397231948160.0, + "grad_norm": 1.7011399194035903, + "language_loss": 0.82479846, + "learning_rate": 8.175399297768495e-07, + "loss": 0.90146458, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09832764, + "step": 11814, + "time_per_iteration": 2.4825360774993896 + }, + { + "auxiliary_loss_clip": 0.06407954, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255308, + "epoch": 0.7103562302720577, + "flos": 21514018118400.0, + "grad_norm": 1.9900571557306543, + "language_loss": 0.76711023, + "learning_rate": 8.172258501943301e-07, + "loss": 0.84385264, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10974121, + "step": 11815, + "time_per_iteration": 2.5411629676818848 + }, + { + "auxiliary_loss_clip": 0.06407356, + "auxiliary_loss_mlp": 0.01265787, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01256012, + "epoch": 0.7104163535247257, + "flos": 14539786433280.0, + "grad_norm": 2.148014854725882, + "language_loss": 0.78734261, + "learning_rate": 8.16911815462725e-07, + "loss": 0.86407399, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09777832, + "step": 11816, + "time_per_iteration": 2.4732110500335693 + }, + { + "auxiliary_loss_clip": 0.06409387, + "auxiliary_loss_mlp": 0.0126716, + "balance_loss_clip": 0.06273407, + "balance_loss_mlp": 0.01257415, + "epoch": 0.7104764767773937, + "flos": 11405018609280.0, + "grad_norm": 1.710233044928932, + "language_loss": 0.87136269, + "learning_rate": 8.165978255939426e-07, + "loss": 0.9481281, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09747314, + "step": 11817, + "time_per_iteration": 2.4930732250213623 + }, + { + "auxiliary_loss_clip": 0.06405669, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01254358, + "epoch": 0.7105366000300616, + "flos": 11694894209280.0, + "grad_norm": 2.3467290312942906, + "language_loss": 0.84727818, + "learning_rate": 8.162838805998897e-07, + "loss": 0.92397279, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09436035, + "step": 11818, + "time_per_iteration": 2.4601902961730957 + }, + { + "auxiliary_loss_clip": 0.06407452, + "auxiliary_loss_mlp": 0.01265048, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254808, + "epoch": 0.7105967232827296, + "flos": 19360027935360.0, + "grad_norm": 1.943101872130184, + "language_loss": 0.76065433, + "learning_rate": 8.159699804924709e-07, + "loss": 0.83737928, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 11819, + "time_per_iteration": 2.5082414150238037 + }, + { + "auxiliary_loss_clip": 0.06408325, + "auxiliary_loss_mlp": 0.01273169, + "balance_loss_clip": 0.06273748, + "balance_loss_mlp": 0.01262422, + "epoch": 0.7106568465353975, + "flos": 22937135063040.0, + "grad_norm": 1.5613953087486683, + "language_loss": 0.71238112, + "learning_rate": 8.156561252835883e-07, + "loss": 0.78919601, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10748291, + "step": 11820, + "time_per_iteration": 3.9562554359436035 + }, + { + "auxiliary_loss_clip": 0.06406607, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272983, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7107169697880655, + "flos": 19105805047680.0, + "grad_norm": 1.709009415960719, + "language_loss": 0.75201517, + "learning_rate": 8.153423149851449e-07, + "loss": 0.82874513, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 11821, + "time_per_iteration": 2.4773855209350586 + }, + { + "auxiliary_loss_clip": 0.0631486, + "auxiliary_loss_mlp": 0.0125056, + "balance_loss_clip": 0.06259306, + "balance_loss_mlp": 0.01249267, + "epoch": 0.7107770930407336, + "flos": 63655950228480.0, + "grad_norm": 0.8065746142119063, + "language_loss": 0.55105186, + "learning_rate": 8.150285496090388e-07, + "loss": 0.626706, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01293182, + "step": 11822, + "time_per_iteration": 3.1728925704956055 + }, + { + "auxiliary_loss_clip": 0.06399868, + "auxiliary_loss_mlp": 0.01265617, + "balance_loss_clip": 0.0627214, + "balance_loss_mlp": 0.01256313, + "epoch": 0.7108372162934015, + "flos": 22061009571840.0, + "grad_norm": 1.7664810996184872, + "language_loss": 0.61042011, + "learning_rate": 8.147148291671688e-07, + "loss": 0.68707502, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09301758, + "step": 11823, + "time_per_iteration": 2.685396194458008 + }, + { + "auxiliary_loss_clip": 0.06409906, + "auxiliary_loss_mlp": 0.01263571, + "balance_loss_clip": 0.0627628, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7108973395460695, + "flos": 19141122343680.0, + "grad_norm": 1.95026020169961, + "language_loss": 0.71794426, + "learning_rate": 8.144011536714322e-07, + "loss": 0.79467905, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09375, + "step": 11824, + "time_per_iteration": 2.5620133876800537 + }, + { + "auxiliary_loss_clip": 0.06401232, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06271533, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7109574627987374, + "flos": 17900168175360.0, + "grad_norm": 2.011245948242179, + "language_loss": 0.72948581, + "learning_rate": 8.140875231337223e-07, + "loss": 0.80615819, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09161377, + "step": 11825, + "time_per_iteration": 2.481990098953247 + }, + { + "auxiliary_loss_clip": 0.06409375, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06273198, + "balance_loss_mlp": 0.01254669, + "epoch": 0.7110175860514054, + "flos": 28986129964800.0, + "grad_norm": 1.8577779500908889, + "language_loss": 0.80001605, + "learning_rate": 8.137739375659321e-07, + "loss": 0.87676173, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11826, + "time_per_iteration": 2.5934202671051025 + }, + { + "auxiliary_loss_clip": 0.06401698, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.0125846, + "epoch": 0.7110777093040733, + "flos": 26179867272960.0, + "grad_norm": 1.3769409852595975, + "language_loss": 0.83070964, + "learning_rate": 8.134603969799527e-07, + "loss": 0.90740645, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09521484, + "step": 11827, + "time_per_iteration": 2.5412826538085938 + }, + { + "auxiliary_loss_clip": 0.0640677, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256507, + "epoch": 0.7111378325567413, + "flos": 26877184151040.0, + "grad_norm": 1.489155185626094, + "language_loss": 0.62609684, + "learning_rate": 8.131469013876748e-07, + "loss": 0.70283562, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10601807, + "step": 11828, + "time_per_iteration": 2.549358367919922 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01265747, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01255543, + "epoch": 0.7111979558094093, + "flos": 27279216840960.0, + "grad_norm": 1.3931875657884774, + "language_loss": 0.72552299, + "learning_rate": 8.128334508009846e-07, + "loss": 0.80225229, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10205078, + "step": 11829, + "time_per_iteration": 2.538902997970581 + }, + { + "auxiliary_loss_clip": 0.06404835, + "auxiliary_loss_mlp": 0.01268934, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01259343, + "epoch": 0.7112580790620773, + "flos": 25054088941440.0, + "grad_norm": 1.7068284012281256, + "language_loss": 0.80460179, + "learning_rate": 8.125200452317697e-07, + "loss": 0.88133949, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09594727, + "step": 11830, + "time_per_iteration": 2.527684450149536 + }, + { + "auxiliary_loss_clip": 0.064045, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01255338, + "epoch": 0.7113182023147452, + "flos": 21652016993280.0, + "grad_norm": 1.5791795722004685, + "language_loss": 0.84228051, + "learning_rate": 8.122066846919138e-07, + "loss": 0.91897511, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 11831, + "time_per_iteration": 3.8946433067321777 + }, + { + "auxiliary_loss_clip": 0.06405313, + "auxiliary_loss_mlp": 0.01264799, + "balance_loss_clip": 0.06270519, + "balance_loss_mlp": 0.01255453, + "epoch": 0.7113783255674132, + "flos": 21002637450240.0, + "grad_norm": 1.9181792200519638, + "language_loss": 0.77265865, + "learning_rate": 8.118933691932985e-07, + "loss": 0.84935975, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09344482, + "step": 11832, + "time_per_iteration": 2.517416477203369 + }, + { + "auxiliary_loss_clip": 0.06316236, + "auxiliary_loss_mlp": 0.01252897, + "balance_loss_clip": 0.06260582, + "balance_loss_mlp": 0.01251798, + "epoch": 0.7114384488200811, + "flos": 66788705554560.0, + "grad_norm": 0.7355523312106115, + "language_loss": 0.56510413, + "learning_rate": 8.115800987478059e-07, + "loss": 0.64079541, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01100922, + "step": 11833, + "time_per_iteration": 3.083800792694092 + }, + { + "auxiliary_loss_clip": 0.06404281, + "auxiliary_loss_mlp": 0.01264607, + "balance_loss_clip": 0.06270045, + "balance_loss_mlp": 0.01255255, + "epoch": 0.7114985720727491, + "flos": 25017136490880.0, + "grad_norm": 1.685224360571569, + "language_loss": 0.71167994, + "learning_rate": 8.11266873367315e-07, + "loss": 0.78836882, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09350586, + "step": 11834, + "time_per_iteration": 2.5492658615112305 + }, + { + "auxiliary_loss_clip": 0.06408249, + "auxiliary_loss_mlp": 0.01268558, + "balance_loss_clip": 0.06272918, + "balance_loss_mlp": 0.01257972, + "epoch": 0.7115586953254172, + "flos": 21476478689280.0, + "grad_norm": 1.811757150622914, + "language_loss": 0.79512018, + "learning_rate": 8.10953693063704e-07, + "loss": 0.87188828, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10583496, + "step": 11835, + "time_per_iteration": 3.936241865158081 + }, + { + "auxiliary_loss_clip": 0.06403308, + "auxiliary_loss_mlp": 0.01266062, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.0125646, + "epoch": 0.7116188185780851, + "flos": 28630357528320.0, + "grad_norm": 1.5711246954693516, + "language_loss": 0.76045537, + "learning_rate": 8.10640557848848e-07, + "loss": 0.83714908, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 11836, + "time_per_iteration": 2.5701663494110107 + }, + { + "auxiliary_loss_clip": 0.06406698, + "auxiliary_loss_mlp": 0.01265952, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01256653, + "epoch": 0.7116789418307531, + "flos": 25299339442560.0, + "grad_norm": 1.6743206701340672, + "language_loss": 0.69986928, + "learning_rate": 8.103274677346208e-07, + "loss": 0.77659577, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09301758, + "step": 11837, + "time_per_iteration": 2.575038194656372 + }, + { + "auxiliary_loss_clip": 0.0641223, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06274512, + "balance_loss_mlp": 0.01255494, + "epoch": 0.711739065083421, + "flos": 25564463360640.0, + "grad_norm": 1.8455270082673318, + "language_loss": 0.61858809, + "learning_rate": 8.100144227328958e-07, + "loss": 0.69537258, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 11838, + "time_per_iteration": 2.5805752277374268 + }, + { + "auxiliary_loss_clip": 0.06409779, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01257699, + "epoch": 0.711799188336089, + "flos": 26148239556480.0, + "grad_norm": 2.1939319933932424, + "language_loss": 0.68031204, + "learning_rate": 8.097014228555426e-07, + "loss": 0.75708568, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09875488, + "step": 11839, + "time_per_iteration": 3.951659679412842 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06272204, + "balance_loss_mlp": 0.01256349, + "epoch": 0.7118593115887569, + "flos": 21146757672960.0, + "grad_norm": 2.0203738416997226, + "language_loss": 0.8447386, + "learning_rate": 8.093884681144305e-07, + "loss": 0.92145276, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09375, + "step": 11840, + "time_per_iteration": 2.5161664485931396 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06274749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.711919434841425, + "flos": 14980951779840.0, + "grad_norm": 1.9072315995358804, + "language_loss": 0.77299631, + "learning_rate": 8.090755585214277e-07, + "loss": 0.84977901, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 11841, + "time_per_iteration": 2.5373709201812744 + }, + { + "auxiliary_loss_clip": 0.06406824, + "auxiliary_loss_mlp": 0.01265843, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01256348, + "epoch": 0.7119795580940929, + "flos": 16514674513920.0, + "grad_norm": 2.1386907373947186, + "language_loss": 0.75567174, + "learning_rate": 8.087626940883994e-07, + "loss": 0.83239841, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0949707, + "step": 11842, + "time_per_iteration": 2.5253396034240723 + }, + { + "auxiliary_loss_clip": 0.06309856, + "auxiliary_loss_mlp": 0.01250631, + "balance_loss_clip": 0.06254404, + "balance_loss_mlp": 0.01249538, + "epoch": 0.7120396813467609, + "flos": 66591434315520.0, + "grad_norm": 0.7631692514869006, + "language_loss": 0.61363775, + "learning_rate": 8.084498748272082e-07, + "loss": 0.6892426, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 11843, + "time_per_iteration": 3.097399950027466 + }, + { + "auxiliary_loss_clip": 0.06403574, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06270365, + "balance_loss_mlp": 0.01256432, + "epoch": 0.7120998045994288, + "flos": 26440001873280.0, + "grad_norm": 3.96385360450405, + "language_loss": 0.80268991, + "learning_rate": 8.081371007497171e-07, + "loss": 0.87938976, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09997559, + "step": 11844, + "time_per_iteration": 2.552259683609009 + }, + { + "auxiliary_loss_clip": 0.06406216, + "auxiliary_loss_mlp": 0.01262016, + "balance_loss_clip": 0.06270443, + "balance_loss_mlp": 0.01252759, + "epoch": 0.7121599278520968, + "flos": 16432300350720.0, + "grad_norm": 2.2064261749206784, + "language_loss": 0.79144967, + "learning_rate": 8.078243718677873e-07, + "loss": 0.868132, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09259033, + "step": 11845, + "time_per_iteration": 2.5421273708343506 + }, + { + "auxiliary_loss_clip": 0.06402468, + "auxiliary_loss_mlp": 0.01265331, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01255532, + "epoch": 0.7122200511047647, + "flos": 28957520995200.0, + "grad_norm": 2.3428288803792485, + "language_loss": 0.77299261, + "learning_rate": 8.075116881932762e-07, + "loss": 0.84967065, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09796143, + "step": 11846, + "time_per_iteration": 2.527745485305786 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.06274035, + "balance_loss_mlp": 0.01256334, + "epoch": 0.7122801743574327, + "flos": 16477428574080.0, + "grad_norm": 1.8749902395969622, + "language_loss": 0.58446372, + "learning_rate": 8.071990497380421e-07, + "loss": 0.66121721, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10314941, + "step": 11847, + "time_per_iteration": 2.4880757331848145 + }, + { + "auxiliary_loss_clip": 0.06397726, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01254081, + "epoch": 0.7123402976101008, + "flos": 20637263721600.0, + "grad_norm": 1.2877189780235179, + "language_loss": 0.71294212, + "learning_rate": 8.068864565139395e-07, + "loss": 0.78955925, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09918213, + "step": 11848, + "time_per_iteration": 2.5513198375701904 + }, + { + "auxiliary_loss_clip": 0.0630827, + "auxiliary_loss_mlp": 0.01254097, + "balance_loss_clip": 0.06252526, + "balance_loss_mlp": 0.01252904, + "epoch": 0.7124004208627687, + "flos": 62343606781440.0, + "grad_norm": 0.847952001487362, + "language_loss": 0.6271292, + "learning_rate": 8.065739085328211e-07, + "loss": 0.70275289, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01190948, + "step": 11849, + "time_per_iteration": 3.1112751960754395 + }, + { + "auxiliary_loss_clip": 0.06405951, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.01254699, + "epoch": 0.7124605441154367, + "flos": 39685278579840.0, + "grad_norm": 1.4089636975562345, + "language_loss": 0.64458466, + "learning_rate": 8.0626140580654e-07, + "loss": 0.72128963, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09844971, + "step": 11850, + "time_per_iteration": 2.632457733154297 + }, + { + "auxiliary_loss_clip": 0.06404182, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125254, + "epoch": 0.7125206673681046, + "flos": 28189066400640.0, + "grad_norm": 1.5452031150775634, + "language_loss": 0.70381355, + "learning_rate": 8.05948948346946e-07, + "loss": 0.78048086, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 11851, + "time_per_iteration": 2.563063144683838 + }, + { + "auxiliary_loss_clip": 0.06402514, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06271089, + "balance_loss_mlp": 0.0125275, + "epoch": 0.7125807906207726, + "flos": 26184101904000.0, + "grad_norm": 1.4548821396986709, + "language_loss": 0.83386576, + "learning_rate": 8.056365361658882e-07, + "loss": 0.9105072, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08892822, + "step": 11852, + "time_per_iteration": 2.5185182094573975 + }, + { + "auxiliary_loss_clip": 0.06408215, + "auxiliary_loss_mlp": 0.01266945, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.01256759, + "epoch": 0.7126409138734405, + "flos": 17161706142720.0, + "grad_norm": 2.03558575161385, + "language_loss": 0.72365862, + "learning_rate": 8.053241692752126e-07, + "loss": 0.80041021, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11853, + "time_per_iteration": 2.4712510108947754 + }, + { + "auxiliary_loss_clip": 0.06400356, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273182, + "balance_loss_mlp": 0.01257005, + "epoch": 0.7127010371261085, + "flos": 18775790542080.0, + "grad_norm": 1.725464250509213, + "language_loss": 0.92318237, + "learning_rate": 8.050118476867635e-07, + "loss": 0.999843, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08703613, + "step": 11854, + "time_per_iteration": 2.4725341796875 + }, + { + "auxiliary_loss_clip": 0.06403268, + "auxiliary_loss_mlp": 0.01268625, + "balance_loss_clip": 0.06272953, + "balance_loss_mlp": 0.01260018, + "epoch": 0.7127611603787765, + "flos": 20382747344640.0, + "grad_norm": 1.8133122260210155, + "language_loss": 0.79957211, + "learning_rate": 8.046995714123856e-07, + "loss": 0.8762911, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08612061, + "step": 11855, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7128212836314445, + "flos": 20455268653440.0, + "grad_norm": 1.8163189094799566, + "language_loss": 0.73227429, + "learning_rate": 8.043873404639192e-07, + "loss": 0.80895841, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.10119629, + "step": 11856, + "time_per_iteration": 2.489022731781006 + }, + { + "auxiliary_loss_clip": 0.06408788, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01258634, + "epoch": 0.7128814068841124, + "flos": 23447593336320.0, + "grad_norm": 1.4996097551327818, + "language_loss": 0.69965553, + "learning_rate": 8.040751548532046e-07, + "loss": 0.77642906, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0993042, + "step": 11857, + "time_per_iteration": 2.5889153480529785 + }, + { + "auxiliary_loss_clip": 0.06401453, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01253488, + "epoch": 0.7129415301367804, + "flos": 18228757161600.0, + "grad_norm": 1.9673696792632074, + "language_loss": 0.85894734, + "learning_rate": 8.03763014592081e-07, + "loss": 0.93559623, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09942627, + "step": 11858, + "time_per_iteration": 2.4554738998413086 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255697, + "epoch": 0.7130016533894483, + "flos": 15529410679680.0, + "grad_norm": 1.7544523597871677, + "language_loss": 0.80554175, + "learning_rate": 8.034509196923829e-07, + "loss": 0.88231397, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10168457, + "step": 11859, + "time_per_iteration": 3.9745945930480957 + }, + { + "auxiliary_loss_clip": 0.06400725, + "auxiliary_loss_mlp": 0.01264096, + "balance_loss_clip": 0.06269667, + "balance_loss_mlp": 0.0125472, + "epoch": 0.7130617766421163, + "flos": 57127804081920.0, + "grad_norm": 1.1922495989293056, + "language_loss": 0.69005597, + "learning_rate": 8.031388701659456e-07, + "loss": 0.76670408, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09381104, + "step": 11860, + "time_per_iteration": 2.891012668609619 + }, + { + "auxiliary_loss_clip": 0.06406054, + "auxiliary_loss_mlp": 0.01266268, + "balance_loss_clip": 0.06271956, + "balance_loss_mlp": 0.01255575, + "epoch": 0.7131218998947844, + "flos": 19793730268800.0, + "grad_norm": 2.1261081147363097, + "language_loss": 0.64239693, + "learning_rate": 8.028268660246023e-07, + "loss": 0.71912014, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10693359, + "step": 11861, + "time_per_iteration": 2.5796282291412354 + }, + { + "auxiliary_loss_clip": 0.06410623, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_clip": 0.06273146, + "balance_loss_mlp": 0.01254857, + "epoch": 0.7131820231474523, + "flos": 26659242881280.0, + "grad_norm": 3.187443939826819, + "language_loss": 0.67274332, + "learning_rate": 8.025149072801849e-07, + "loss": 0.74950445, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10620117, + "step": 11862, + "time_per_iteration": 2.576899528503418 + }, + { + "auxiliary_loss_clip": 0.064044, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06273039, + "balance_loss_mlp": 0.01255926, + "epoch": 0.7132421464001203, + "flos": 29213337110400.0, + "grad_norm": 2.2144093674445426, + "language_loss": 0.67745155, + "learning_rate": 8.022029939445214e-07, + "loss": 0.75414771, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09283447, + "step": 11863, + "time_per_iteration": 2.563467264175415 + }, + { + "auxiliary_loss_clip": 0.06412646, + "auxiliary_loss_mlp": 0.0126882, + "balance_loss_clip": 0.06272405, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7133022696527882, + "flos": 23079913620480.0, + "grad_norm": 1.7053563824160904, + "language_loss": 0.6612097, + "learning_rate": 8.018911260294414e-07, + "loss": 0.73802435, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10717773, + "step": 11864, + "time_per_iteration": 2.5226974487304688 + }, + { + "auxiliary_loss_clip": 0.06409131, + "auxiliary_loss_mlp": 0.01265229, + "balance_loss_clip": 0.06273311, + "balance_loss_mlp": 0.01255019, + "epoch": 0.7133623929054562, + "flos": 17462860116480.0, + "grad_norm": 3.439605466883789, + "language_loss": 0.86094218, + "learning_rate": 8.015793035467697e-07, + "loss": 0.93768573, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10217285, + "step": 11865, + "time_per_iteration": 2.441121816635132 + }, + { + "auxiliary_loss_clip": 0.06408411, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7134225161581241, + "flos": 19542609982080.0, + "grad_norm": 2.0189990892571807, + "language_loss": 0.75141108, + "learning_rate": 8.012675265083304e-07, + "loss": 0.82812905, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10443115, + "step": 11866, + "time_per_iteration": 2.4785237312316895 + }, + { + "auxiliary_loss_clip": 0.06411657, + "auxiliary_loss_mlp": 0.01267167, + "balance_loss_clip": 0.06275963, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7134826394107922, + "flos": 26257294045440.0, + "grad_norm": 3.679418691378197, + "language_loss": 0.70483118, + "learning_rate": 8.009557949259464e-07, + "loss": 0.78161943, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11867, + "time_per_iteration": 2.518202066421509 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06272841, + "balance_loss_mlp": 0.01256477, + "epoch": 0.7135427626634601, + "flos": 15820795653120.0, + "grad_norm": 4.975034900378342, + "language_loss": 0.71782935, + "learning_rate": 8.006441088114397e-07, + "loss": 0.79452157, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09552002, + "step": 11868, + "time_per_iteration": 2.4938719272613525 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.01257635, + "epoch": 0.7136028859161281, + "flos": 18229302213120.0, + "grad_norm": 1.9405833387691556, + "language_loss": 0.66333723, + "learning_rate": 8.003324681766286e-07, + "loss": 0.7401427, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11236572, + "step": 11869, + "time_per_iteration": 2.4637274742126465 + }, + { + "auxiliary_loss_clip": 0.06408057, + "auxiliary_loss_mlp": 0.01264796, + "balance_loss_clip": 0.06273142, + "balance_loss_mlp": 0.01255003, + "epoch": 0.713663009168796, + "flos": 24321454767360.0, + "grad_norm": 1.4404508285538464, + "language_loss": 0.77963442, + "learning_rate": 8.000208730333298e-07, + "loss": 0.856363, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09790039, + "step": 11870, + "time_per_iteration": 2.545146942138672 + }, + { + "auxiliary_loss_clip": 0.06407803, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06275113, + "balance_loss_mlp": 0.01255248, + "epoch": 0.713723132421464, + "flos": 26545157147520.0, + "grad_norm": 2.250105845614367, + "language_loss": 0.81401408, + "learning_rate": 7.997093233933597e-07, + "loss": 0.89075279, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10821533, + "step": 11871, + "time_per_iteration": 4.061939477920532 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06272148, + "balance_loss_mlp": 0.01261541, + "epoch": 0.7137832556741319, + "flos": 19871911728000.0, + "grad_norm": 1.5669444552919631, + "language_loss": 0.78963834, + "learning_rate": 7.993978192685331e-07, + "loss": 0.86645091, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10321045, + "step": 11872, + "time_per_iteration": 2.502652645111084 + }, + { + "auxiliary_loss_clip": 0.06413025, + "auxiliary_loss_mlp": 0.01263574, + "balance_loss_clip": 0.06273353, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7138433789267999, + "flos": 21695300426880.0, + "grad_norm": 2.078419347550335, + "language_loss": 0.83881956, + "learning_rate": 7.990863606706606e-07, + "loss": 0.91558552, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10473633, + "step": 11873, + "time_per_iteration": 2.49755859375 + }, + { + "auxiliary_loss_clip": 0.06404479, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.0125491, + "epoch": 0.713903502179468, + "flos": 17608447785600.0, + "grad_norm": 2.139862978747737, + "language_loss": 0.85866129, + "learning_rate": 7.987749476115539e-07, + "loss": 0.93534762, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09240723, + "step": 11874, + "time_per_iteration": 2.446295976638794 + }, + { + "auxiliary_loss_clip": 0.0641006, + "auxiliary_loss_mlp": 0.01266331, + "balance_loss_clip": 0.06275686, + "balance_loss_mlp": 0.01256043, + "epoch": 0.7139636254321359, + "flos": 18046091260800.0, + "grad_norm": 1.75973654551926, + "language_loss": 0.83120143, + "learning_rate": 7.984635801030228e-07, + "loss": 0.90796536, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10284424, + "step": 11875, + "time_per_iteration": 3.8960680961608887 + }, + { + "auxiliary_loss_clip": 0.06414599, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06272531, + "balance_loss_mlp": 0.01256136, + "epoch": 0.7140237486848039, + "flos": 23337826087680.0, + "grad_norm": 1.757783447264505, + "language_loss": 0.69900811, + "learning_rate": 7.981522581568721e-07, + "loss": 0.77583325, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11779785, + "step": 11876, + "time_per_iteration": 2.491225481033325 + }, + { + "auxiliary_loss_clip": 0.06411763, + "auxiliary_loss_mlp": 0.01262915, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01252663, + "epoch": 0.7140838719374718, + "flos": 16842760375680.0, + "grad_norm": 1.8106538192439035, + "language_loss": 0.78886259, + "learning_rate": 7.978409817849079e-07, + "loss": 0.86560941, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10253906, + "step": 11877, + "time_per_iteration": 2.493778705596924 + }, + { + "auxiliary_loss_clip": 0.0640865, + "auxiliary_loss_mlp": 0.01267195, + "balance_loss_clip": 0.06276323, + "balance_loss_mlp": 0.01257611, + "epoch": 0.7141439951901398, + "flos": 21148350900480.0, + "grad_norm": 1.8508532405281077, + "language_loss": 0.70390731, + "learning_rate": 7.97529750998934e-07, + "loss": 0.78066581, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0958252, + "step": 11878, + "time_per_iteration": 3.8979172706604004 + }, + { + "auxiliary_loss_clip": 0.06407811, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06277137, + "balance_loss_mlp": 0.01254153, + "epoch": 0.7142041184428077, + "flos": 24724661414400.0, + "grad_norm": 1.94673596086021, + "language_loss": 0.67341477, + "learning_rate": 7.972185658107535e-07, + "loss": 0.75013375, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09936523, + "step": 11879, + "time_per_iteration": 2.5100598335266113 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01262301, + "balance_loss_clip": 0.06275017, + "balance_loss_mlp": 0.01252037, + "epoch": 0.7142642416954758, + "flos": 21914667216000.0, + "grad_norm": 1.6535111085971643, + "language_loss": 0.69445574, + "learning_rate": 7.969074262321646e-07, + "loss": 0.77118039, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10266113, + "step": 11880, + "time_per_iteration": 2.507603406906128 + }, + { + "auxiliary_loss_clip": 0.0641037, + "auxiliary_loss_mlp": 0.01264833, + "balance_loss_clip": 0.06273447, + "balance_loss_mlp": 0.01254772, + "epoch": 0.7143243649481437, + "flos": 20810579892480.0, + "grad_norm": 2.0343383375931894, + "language_loss": 0.80753726, + "learning_rate": 7.965963322749674e-07, + "loss": 0.88428932, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10058594, + "step": 11881, + "time_per_iteration": 2.4606220722198486 + }, + { + "auxiliary_loss_clip": 0.06409037, + "auxiliary_loss_mlp": 0.01264183, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01254539, + "epoch": 0.7143844882008117, + "flos": 27242348244480.0, + "grad_norm": 1.58430278316452, + "language_loss": 0.64282894, + "learning_rate": 7.962852839509579e-07, + "loss": 0.71956116, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09643555, + "step": 11882, + "time_per_iteration": 2.56210994720459 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253473, + "epoch": 0.7144446114534796, + "flos": 17935150055040.0, + "grad_norm": 1.872999181445386, + "language_loss": 0.69193482, + "learning_rate": 7.959742812719304e-07, + "loss": 0.76868939, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10101318, + "step": 11883, + "time_per_iteration": 2.4767167568206787 + }, + { + "auxiliary_loss_clip": 0.06408374, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06277797, + "balance_loss_mlp": 0.01253761, + "epoch": 0.7145047347061476, + "flos": 20747282532480.0, + "grad_norm": 2.264759730138534, + "language_loss": 0.7842024, + "learning_rate": 7.956633242496788e-07, + "loss": 0.86092412, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10040283, + "step": 11884, + "time_per_iteration": 2.5488386154174805 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255517, + "epoch": 0.7145648579588155, + "flos": 21184967934720.0, + "grad_norm": 5.179157665604164, + "language_loss": 0.74281037, + "learning_rate": 7.953524128959954e-07, + "loss": 0.81962323, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1104126, + "step": 11885, + "time_per_iteration": 2.4918782711029053 + }, + { + "auxiliary_loss_clip": 0.06317447, + "auxiliary_loss_mlp": 0.01252483, + "balance_loss_clip": 0.06261733, + "balance_loss_mlp": 0.0125137, + "epoch": 0.7146249812114835, + "flos": 64805207702400.0, + "grad_norm": 0.9938747796430238, + "language_loss": 0.66419291, + "learning_rate": 7.95041547222669e-07, + "loss": 0.73989218, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01115417, + "step": 11886, + "time_per_iteration": 3.0856966972351074 + }, + { + "auxiliary_loss_clip": 0.06409487, + "auxiliary_loss_mlp": 0.01262772, + "balance_loss_clip": 0.06275956, + "balance_loss_mlp": 0.01253361, + "epoch": 0.7146851044641516, + "flos": 18119744599680.0, + "grad_norm": 1.9726076644282031, + "language_loss": 0.75334477, + "learning_rate": 7.947307272414874e-07, + "loss": 0.8300674, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09411621, + "step": 11887, + "time_per_iteration": 2.457226037979126 + }, + { + "auxiliary_loss_clip": 0.06411713, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06275448, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7147452277168195, + "flos": 19249715635200.0, + "grad_norm": 1.4837579130348453, + "language_loss": 0.71681702, + "learning_rate": 7.944199529642372e-07, + "loss": 0.79358119, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10003662, + "step": 11888, + "time_per_iteration": 2.5040013790130615 + }, + { + "auxiliary_loss_clip": 0.06412415, + "auxiliary_loss_mlp": 0.0126625, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01256266, + "epoch": 0.7148053509694875, + "flos": 23770773734400.0, + "grad_norm": 1.770417967060374, + "language_loss": 0.84754878, + "learning_rate": 7.941092244027041e-07, + "loss": 0.92433536, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09991455, + "step": 11889, + "time_per_iteration": 2.498847246170044 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263505, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125401, + "epoch": 0.7148654742221554, + "flos": 22490770763520.0, + "grad_norm": 1.697229185177074, + "language_loss": 0.75894499, + "learning_rate": 7.937985415686695e-07, + "loss": 0.8356626, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0949707, + "step": 11890, + "time_per_iteration": 2.5205180644989014 + }, + { + "auxiliary_loss_clip": 0.06404347, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01255073, + "epoch": 0.7149255974748234, + "flos": 24685822247040.0, + "grad_norm": 1.9172824039571863, + "language_loss": 0.74212694, + "learning_rate": 7.934879044739147e-07, + "loss": 0.81881773, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09667969, + "step": 11891, + "time_per_iteration": 2.515684127807617 + }, + { + "auxiliary_loss_clip": 0.06409282, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01256963, + "epoch": 0.7149857207274913, + "flos": 18411464989440.0, + "grad_norm": 1.8378637994341889, + "language_loss": 0.68246537, + "learning_rate": 7.931773131302211e-07, + "loss": 0.75922883, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10101318, + "step": 11892, + "time_per_iteration": 2.4761176109313965 + }, + { + "auxiliary_loss_clip": 0.06410619, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06271711, + "balance_loss_mlp": 0.01254813, + "epoch": 0.7150458439801594, + "flos": 24975907482240.0, + "grad_norm": 1.712623401245163, + "language_loss": 0.74044412, + "learning_rate": 7.928667675493632e-07, + "loss": 0.81721264, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11893, + "time_per_iteration": 2.5127475261688232 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01253873, + "epoch": 0.7151059672328273, + "flos": 16696376092800.0, + "grad_norm": 2.7158372012320315, + "language_loss": 0.66545182, + "learning_rate": 7.925562677431185e-07, + "loss": 0.74223733, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11743164, + "step": 11894, + "time_per_iteration": 2.5338070392608643 + }, + { + "auxiliary_loss_clip": 0.06413232, + "auxiliary_loss_mlp": 0.01263618, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.0125364, + "epoch": 0.7151660904854953, + "flos": 27279216840960.0, + "grad_norm": 7.327232790836601, + "language_loss": 0.77995753, + "learning_rate": 7.922458137232613e-07, + "loss": 0.85672593, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 11895, + "time_per_iteration": 2.545539379119873 + }, + { + "auxiliary_loss_clip": 0.06408492, + "auxiliary_loss_mlp": 0.01262254, + "balance_loss_clip": 0.06271514, + "balance_loss_mlp": 0.01251776, + "epoch": 0.7152262137381632, + "flos": 18338063212800.0, + "grad_norm": 2.1720944859755327, + "language_loss": 0.69649661, + "learning_rate": 7.919354055015643e-07, + "loss": 0.77320409, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1048584, + "step": 11896, + "time_per_iteration": 2.5020852088928223 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01270904, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01259203, + "epoch": 0.7152863369908312, + "flos": 21805822362240.0, + "grad_norm": 1.8979241109476415, + "language_loss": 0.8686198, + "learning_rate": 7.91625043089798e-07, + "loss": 0.94543064, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11694336, + "step": 11897, + "time_per_iteration": 2.4981558322906494 + }, + { + "auxiliary_loss_clip": 0.06406087, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06274753, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7153464602434991, + "flos": 22164068494080.0, + "grad_norm": 1.7720635566598981, + "language_loss": 0.78347677, + "learning_rate": 7.913147264997304e-07, + "loss": 0.86017919, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10070801, + "step": 11898, + "time_per_iteration": 2.568208694458008 + }, + { + "auxiliary_loss_clip": 0.06413846, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.0627441, + "balance_loss_mlp": 0.01252868, + "epoch": 0.7154065834961671, + "flos": 24722732770560.0, + "grad_norm": 1.7720575063877593, + "language_loss": 0.73240674, + "learning_rate": 7.910044557431302e-07, + "loss": 0.8091805, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10656738, + "step": 11899, + "time_per_iteration": 3.9873409271240234 + }, + { + "auxiliary_loss_clip": 0.06406702, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06271633, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7154667067488351, + "flos": 22608084879360.0, + "grad_norm": 2.7184837218905216, + "language_loss": 0.75906515, + "learning_rate": 7.906942308317614e-07, + "loss": 0.83579266, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10565186, + "step": 11900, + "time_per_iteration": 2.48612380027771 + }, + { + "auxiliary_loss_clip": 0.06410916, + "auxiliary_loss_mlp": 0.01263744, + "balance_loss_clip": 0.06274971, + "balance_loss_mlp": 0.01254064, + "epoch": 0.7155268300015031, + "flos": 18777216061440.0, + "grad_norm": 1.8830405388899822, + "language_loss": 0.80537415, + "learning_rate": 7.903840517773886e-07, + "loss": 0.88212073, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09680176, + "step": 11901, + "time_per_iteration": 2.538071632385254 + }, + { + "auxiliary_loss_clip": 0.06413621, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01254626, + "epoch": 0.7155869532541711, + "flos": 18302242792320.0, + "grad_norm": 1.8091761354011133, + "language_loss": 0.82077742, + "learning_rate": 7.900739185917744e-07, + "loss": 0.89756829, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10839844, + "step": 11902, + "time_per_iteration": 2.4796504974365234 + }, + { + "auxiliary_loss_clip": 0.06407838, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254306, + "epoch": 0.715647076506839, + "flos": 11985063298560.0, + "grad_norm": 1.8489548968848413, + "language_loss": 0.68603027, + "learning_rate": 7.897638312866785e-07, + "loss": 0.76274538, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09356689, + "step": 11903, + "time_per_iteration": 2.502664566040039 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06273056, + "balance_loss_mlp": 0.0125591, + "epoch": 0.715707199759507, + "flos": 18957408266880.0, + "grad_norm": 1.5823213300778882, + "language_loss": 0.75905824, + "learning_rate": 7.894537898738589e-07, + "loss": 0.83577633, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 11904, + "time_per_iteration": 2.4838523864746094 + }, + { + "auxiliary_loss_clip": 0.06408757, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01255838, + "epoch": 0.7157673230121749, + "flos": 15309792328320.0, + "grad_norm": 1.6671251370747393, + "language_loss": 0.7200684, + "learning_rate": 7.891437943650727e-07, + "loss": 0.79682887, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.11456299, + "step": 11905, + "time_per_iteration": 2.5194296836853027 + }, + { + "auxiliary_loss_clip": 0.06407201, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 0.06273023, + "balance_loss_mlp": 0.01254377, + "epoch": 0.715827446264843, + "flos": 23228561963520.0, + "grad_norm": 1.7268826203228764, + "language_loss": 0.7871933, + "learning_rate": 7.88833844772076e-07, + "loss": 0.86390674, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09765625, + "step": 11906, + "time_per_iteration": 2.505692720413208 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01249409, + "balance_loss_clip": 0.06255978, + "balance_loss_mlp": 0.01248228, + "epoch": 0.7158875695175109, + "flos": 60993011145600.0, + "grad_norm": 0.7186868091888179, + "language_loss": 0.55247056, + "learning_rate": 7.885239411066205e-07, + "loss": 0.62807906, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01179504, + "step": 11907, + "time_per_iteration": 3.077824354171753 + }, + { + "auxiliary_loss_clip": 0.06404838, + "auxiliary_loss_mlp": 0.01262889, + "balance_loss_clip": 0.06269851, + "balance_loss_mlp": 0.01252893, + "epoch": 0.7159476927701789, + "flos": 17134480765440.0, + "grad_norm": 1.7650418564568968, + "language_loss": 0.69603425, + "learning_rate": 7.882140833804593e-07, + "loss": 0.77271152, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09985352, + "step": 11908, + "time_per_iteration": 2.4865145683288574 + }, + { + "auxiliary_loss_clip": 0.06412758, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06276048, + "balance_loss_mlp": 0.01253625, + "epoch": 0.7160078160228468, + "flos": 22496934038400.0, + "grad_norm": 1.9817565541714355, + "language_loss": 0.71485305, + "learning_rate": 7.879042716053415e-07, + "loss": 0.79162526, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1083374, + "step": 11909, + "time_per_iteration": 2.5261456966400146 + }, + { + "auxiliary_loss_clip": 0.06411682, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06275836, + "balance_loss_mlp": 0.01253316, + "epoch": 0.7160679392755148, + "flos": 30598704990720.0, + "grad_norm": 1.38087645688004, + "language_loss": 0.75330472, + "learning_rate": 7.875945057930144e-07, + "loss": 0.83006227, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11910, + "time_per_iteration": 4.044188022613525 + }, + { + "auxiliary_loss_clip": 0.06406509, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7161280625281827, + "flos": 21329884771200.0, + "grad_norm": 1.597685322541952, + "language_loss": 0.76519787, + "learning_rate": 7.872847859552251e-07, + "loss": 0.84192502, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10107422, + "step": 11911, + "time_per_iteration": 2.665767192840576 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01265159, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01254376, + "epoch": 0.7161881857808508, + "flos": 61873218288000.0, + "grad_norm": 1.667698649027388, + "language_loss": 0.58612812, + "learning_rate": 7.869751121037192e-07, + "loss": 0.66287452, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10791016, + "step": 11912, + "time_per_iteration": 2.9163358211517334 + }, + { + "auxiliary_loss_clip": 0.06408441, + "auxiliary_loss_mlp": 0.01264274, + "balance_loss_clip": 0.06275295, + "balance_loss_mlp": 0.0125398, + "epoch": 0.7162483090335187, + "flos": 20818126759680.0, + "grad_norm": 1.9057750004055583, + "language_loss": 0.78541219, + "learning_rate": 7.866654842502376e-07, + "loss": 0.86213928, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10296631, + "step": 11913, + "time_per_iteration": 2.496882438659668 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06273191, + "balance_loss_mlp": 0.01257864, + "epoch": 0.7163084322861867, + "flos": 24104393965440.0, + "grad_norm": 1.590904649851159, + "language_loss": 0.7420674, + "learning_rate": 7.863559024065234e-07, + "loss": 0.81879842, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0958252, + "step": 11914, + "time_per_iteration": 3.96821665763855 + }, + { + "auxiliary_loss_clip": 0.06403452, + "auxiliary_loss_mlp": 0.01261289, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01251574, + "epoch": 0.7163685555388547, + "flos": 20086540761600.0, + "grad_norm": 1.6632734389842445, + "language_loss": 0.74058056, + "learning_rate": 7.860463665843143e-07, + "loss": 0.81722796, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.097229, + "step": 11915, + "time_per_iteration": 2.4962167739868164 + }, + { + "auxiliary_loss_clip": 0.06405881, + "auxiliary_loss_mlp": 0.01264509, + "balance_loss_clip": 0.06270003, + "balance_loss_mlp": 0.01254323, + "epoch": 0.7164286787915226, + "flos": 17462692408320.0, + "grad_norm": 1.6596246771079706, + "language_loss": 0.81293082, + "learning_rate": 7.85736876795349e-07, + "loss": 0.88963467, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10186768, + "step": 11916, + "time_per_iteration": 2.5293524265289307 + }, + { + "auxiliary_loss_clip": 0.06407885, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272584, + "balance_loss_mlp": 0.01257555, + "epoch": 0.7164888020441906, + "flos": 19724982393600.0, + "grad_norm": 1.9910779108762084, + "language_loss": 0.68661398, + "learning_rate": 7.854274330513626e-07, + "loss": 0.76336563, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09729004, + "step": 11917, + "time_per_iteration": 2.5082740783691406 + }, + { + "auxiliary_loss_clip": 0.0640521, + "auxiliary_loss_mlp": 0.01268808, + "balance_loss_clip": 0.06270327, + "balance_loss_mlp": 0.01258127, + "epoch": 0.7165489252968585, + "flos": 21476939886720.0, + "grad_norm": 1.5888688683522953, + "language_loss": 0.76160645, + "learning_rate": 7.851180353640896e-07, + "loss": 0.8383466, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10687256, + "step": 11918, + "time_per_iteration": 3.8991646766662598 + }, + { + "auxiliary_loss_clip": 0.06316125, + "auxiliary_loss_mlp": 0.01260952, + "balance_loss_clip": 0.06260598, + "balance_loss_mlp": 0.01259661, + "epoch": 0.7166090485495266, + "flos": 69949426216320.0, + "grad_norm": 0.6355552708819127, + "language_loss": 0.53723788, + "learning_rate": 7.848086837452639e-07, + "loss": 0.61300862, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01291656, + "step": 11919, + "time_per_iteration": 3.2083816528320312 + }, + { + "auxiliary_loss_clip": 0.06411423, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01255948, + "epoch": 0.7166691718021945, + "flos": 27351151171200.0, + "grad_norm": 2.064464674479712, + "language_loss": 0.69286996, + "learning_rate": 7.844993782066132e-07, + "loss": 0.76964575, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10211182, + "step": 11920, + "time_per_iteration": 2.6113531589508057 + }, + { + "auxiliary_loss_clip": 0.064086, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.01255106, + "epoch": 0.7167292950548625, + "flos": 30416667995520.0, + "grad_norm": 1.8345459175809258, + "language_loss": 0.75019145, + "learning_rate": 7.841901187598678e-07, + "loss": 0.82692945, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10101318, + "step": 11921, + "time_per_iteration": 2.5700902938842773 + }, + { + "auxiliary_loss_clip": 0.06416579, + "auxiliary_loss_mlp": 0.01267308, + "balance_loss_clip": 0.06275436, + "balance_loss_mlp": 0.01254177, + "epoch": 0.7167894183075304, + "flos": 14575942270080.0, + "grad_norm": 1.9367359294583022, + "language_loss": 0.75734651, + "learning_rate": 7.83880905416755e-07, + "loss": 0.83418536, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.13128662, + "step": 11922, + "time_per_iteration": 2.465078830718994 + }, + { + "auxiliary_loss_clip": 0.06313948, + "auxiliary_loss_mlp": 0.0125594, + "balance_loss_clip": 0.06258468, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7168495415601984, + "flos": 64128365948160.0, + "grad_norm": 0.7346387486828846, + "language_loss": 0.55178893, + "learning_rate": 7.83571738189001e-07, + "loss": 0.62748784, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01166534, + "step": 11923, + "time_per_iteration": 2.953462839126587 + }, + { + "auxiliary_loss_clip": 0.06408657, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06272471, + "balance_loss_mlp": 0.01257062, + "epoch": 0.7169096648128663, + "flos": 24688421723520.0, + "grad_norm": 1.4959305525203388, + "language_loss": 0.77240855, + "learning_rate": 7.832626170883279e-07, + "loss": 0.84916997, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10430908, + "step": 11924, + "time_per_iteration": 2.540371894836426 + }, + { + "auxiliary_loss_clip": 0.06404062, + "auxiliary_loss_mlp": 0.01264587, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7169697880655344, + "flos": 20673754974720.0, + "grad_norm": 1.6022064591556118, + "language_loss": 0.68295527, + "learning_rate": 7.829535421264588e-07, + "loss": 0.75964177, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0980835, + "step": 11925, + "time_per_iteration": 2.517883539199829 + }, + { + "auxiliary_loss_clip": 0.06401929, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7170299113182023, + "flos": 21039044849280.0, + "grad_norm": 1.4805989114047955, + "language_loss": 0.77453327, + "learning_rate": 7.826445133151133e-07, + "loss": 0.85119712, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09771729, + "step": 11926, + "time_per_iteration": 2.525294065475464 + }, + { + "auxiliary_loss_clip": 0.06412005, + "auxiliary_loss_mlp": 0.01265458, + "balance_loss_clip": 0.06270812, + "balance_loss_mlp": 0.01254652, + "epoch": 0.7170900345708703, + "flos": 22899931050240.0, + "grad_norm": 2.0777865418109798, + "language_loss": 0.77830517, + "learning_rate": 7.823355306660093e-07, + "loss": 0.85507977, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10809326, + "step": 11927, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06405352, + "auxiliary_loss_mlp": 0.012651, + "balance_loss_clip": 0.06273961, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7171501578235383, + "flos": 15523331258880.0, + "grad_norm": 1.5750787532555974, + "language_loss": 0.69694316, + "learning_rate": 7.820265941908642e-07, + "loss": 0.77364767, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09960938, + "step": 11928, + "time_per_iteration": 2.5053482055664062 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01263642, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7172102810762062, + "flos": 26111496741120.0, + "grad_norm": 1.7658790260288333, + "language_loss": 0.65507495, + "learning_rate": 7.817177039013931e-07, + "loss": 0.73175335, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10076904, + "step": 11929, + "time_per_iteration": 2.5298080444335938 + }, + { + "auxiliary_loss_clip": 0.06411615, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06275426, + "balance_loss_mlp": 0.01254455, + "epoch": 0.7172704043288742, + "flos": 21513011869440.0, + "grad_norm": 1.88648366975717, + "language_loss": 0.70105934, + "learning_rate": 7.81408859809308e-07, + "loss": 0.7778219, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10186768, + "step": 11930, + "time_per_iteration": 2.492851972579956 + }, + { + "auxiliary_loss_clip": 0.06407914, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 0.06271791, + "balance_loss_mlp": 0.01255675, + "epoch": 0.7173305275815421, + "flos": 18776964499200.0, + "grad_norm": 1.6767880793565944, + "language_loss": 0.80551809, + "learning_rate": 7.811000619263219e-07, + "loss": 0.88225758, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10351562, + "step": 11931, + "time_per_iteration": 2.5129940509796143 + }, + { + "auxiliary_loss_clip": 0.06405962, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.06272676, + "balance_loss_mlp": 0.01253398, + "epoch": 0.7173906508342102, + "flos": 16185372768000.0, + "grad_norm": 2.3164344242090245, + "language_loss": 0.78938711, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8660785, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 11932, + "time_per_iteration": 2.458064317703247 + }, + { + "auxiliary_loss_clip": 0.06406456, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01255163, + "epoch": 0.7174507740868781, + "flos": 26620948765440.0, + "grad_norm": 2.941669914403725, + "language_loss": 0.75155187, + "learning_rate": 7.804826048344803e-07, + "loss": 0.82826775, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09960938, + "step": 11933, + "time_per_iteration": 2.5739805698394775 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06277472, + "balance_loss_mlp": 0.01254858, + "epoch": 0.7175108973395461, + "flos": 18437264847360.0, + "grad_norm": 7.531680164120171, + "language_loss": 0.69827807, + "learning_rate": 7.801739456490388e-07, + "loss": 0.77513361, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11859131, + "step": 11934, + "time_per_iteration": 2.4455020427703857 + }, + { + "auxiliary_loss_clip": 0.06406108, + "auxiliary_loss_mlp": 0.01263916, + "balance_loss_clip": 0.06272999, + "balance_loss_mlp": 0.0125395, + "epoch": 0.717571020592214, + "flos": 23921769991680.0, + "grad_norm": 2.2343261949316013, + "language_loss": 0.86673319, + "learning_rate": 7.798653327195237e-07, + "loss": 0.9434334, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09967041, + "step": 11935, + "time_per_iteration": 2.528456211090088 + }, + { + "auxiliary_loss_clip": 0.06406541, + "auxiliary_loss_mlp": 0.0126352, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01253202, + "epoch": 0.717631143844882, + "flos": 38266647828480.0, + "grad_norm": 1.602642316585254, + "language_loss": 0.73995256, + "learning_rate": 7.795567660576388e-07, + "loss": 0.81665319, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10314941, + "step": 11936, + "time_per_iteration": 2.67246413230896 + }, + { + "auxiliary_loss_clip": 0.06313888, + "auxiliary_loss_mlp": 0.01249886, + "balance_loss_clip": 0.06258012, + "balance_loss_mlp": 0.01248772, + "epoch": 0.7176912670975499, + "flos": 65536961408640.0, + "grad_norm": 0.7536478557805156, + "language_loss": 0.55813849, + "learning_rate": 7.79248245675082e-07, + "loss": 0.63377625, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0111618, + "step": 11937, + "time_per_iteration": 3.14385724067688 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01254042, + "epoch": 0.717751390350218, + "flos": 31288433074560.0, + "grad_norm": 3.0696111718968555, + "language_loss": 0.54891688, + "learning_rate": 7.789397715835542e-07, + "loss": 0.62567306, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10900879, + "step": 11938, + "time_per_iteration": 2.612314462661743 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.01261396, + "balance_loss_clip": 0.06274119, + "balance_loss_mlp": 0.01251811, + "epoch": 0.7178115136028859, + "flos": 19864155225600.0, + "grad_norm": 1.5149026364788483, + "language_loss": 0.77031577, + "learning_rate": 7.786313437947527e-07, + "loss": 0.84697324, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09576416, + "step": 11939, + "time_per_iteration": 3.9376840591430664 + }, + { + "auxiliary_loss_clip": 0.06311642, + "auxiliary_loss_mlp": 0.01253055, + "balance_loss_clip": 0.06255894, + "balance_loss_mlp": 0.01251996, + "epoch": 0.7178716368555539, + "flos": 64369576725120.0, + "grad_norm": 0.7379302398056043, + "language_loss": 0.6123156, + "learning_rate": 7.783229623203738e-07, + "loss": 0.68796259, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01060486, + "step": 11940, + "time_per_iteration": 3.106687545776367 + }, + { + "auxiliary_loss_clip": 0.0640372, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.06272845, + "balance_loss_mlp": 0.01253209, + "epoch": 0.7179317601082219, + "flos": 26770184087040.0, + "grad_norm": 1.6027609306181398, + "language_loss": 0.59101206, + "learning_rate": 7.780146271721097e-07, + "loss": 0.66767597, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09466553, + "step": 11941, + "time_per_iteration": 2.6309211254119873 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06273725, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7179918833608898, + "flos": 23520575842560.0, + "grad_norm": 1.7346427869736905, + "language_loss": 0.79611468, + "learning_rate": 7.777063383616543e-07, + "loss": 0.87280202, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09429932, + "step": 11942, + "time_per_iteration": 2.5131733417510986 + }, + { + "auxiliary_loss_clip": 0.06404739, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01258345, + "epoch": 0.7180520066135578, + "flos": 17171349361920.0, + "grad_norm": 2.144705941723289, + "language_loss": 0.66274554, + "learning_rate": 7.773980959006968e-07, + "loss": 0.73948282, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10638428, + "step": 11943, + "time_per_iteration": 2.5236313343048096 + }, + { + "auxiliary_loss_clip": 0.06407227, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06273103, + "balance_loss_mlp": 0.01257798, + "epoch": 0.7181121298662257, + "flos": 17572417729920.0, + "grad_norm": 1.703985250404805, + "language_loss": 0.78651738, + "learning_rate": 7.770898998009254e-07, + "loss": 0.86327153, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 11944, + "time_per_iteration": 2.489701271057129 + }, + { + "auxiliary_loss_clip": 0.06407581, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7181722531188938, + "flos": 11952471260160.0, + "grad_norm": 2.3927781343480024, + "language_loss": 0.62825882, + "learning_rate": 7.767817500740277e-07, + "loss": 0.70501947, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.12243652, + "step": 11945, + "time_per_iteration": 2.523031711578369 + }, + { + "auxiliary_loss_clip": 0.0631476, + "auxiliary_loss_mlp": 0.0125155, + "balance_loss_clip": 0.06259042, + "balance_loss_mlp": 0.01250277, + "epoch": 0.7182323763715617, + "flos": 65522664288000.0, + "grad_norm": 0.6825637115139678, + "language_loss": 0.5092659, + "learning_rate": 7.76473646731689e-07, + "loss": 0.58492899, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01273346, + "step": 11946, + "time_per_iteration": 3.0530238151550293 + }, + { + "auxiliary_loss_clip": 0.06408353, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01254553, + "epoch": 0.7182924996242297, + "flos": 20637137940480.0, + "grad_norm": 1.6252151206202925, + "language_loss": 0.7525813, + "learning_rate": 7.761655897855925e-07, + "loss": 0.8293165, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 11947, + "time_per_iteration": 2.535158157348633 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01266117, + "balance_loss_clip": 0.06270691, + "balance_loss_mlp": 0.01256556, + "epoch": 0.7183526228768976, + "flos": 16221947875200.0, + "grad_norm": 1.376797817491515, + "language_loss": 0.7316047, + "learning_rate": 7.758575792474187e-07, + "loss": 0.80828649, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09564209, + "step": 11948, + "time_per_iteration": 2.465437173843384 + }, + { + "auxiliary_loss_clip": 0.06408493, + "auxiliary_loss_mlp": 0.01270033, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.0125959, + "epoch": 0.7184127461295656, + "flos": 22238518446720.0, + "grad_norm": 1.618352037269111, + "language_loss": 0.71604127, + "learning_rate": 7.755496151288483e-07, + "loss": 0.79282653, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10443115, + "step": 11949, + "time_per_iteration": 2.5727827548980713 + }, + { + "auxiliary_loss_clip": 0.06405893, + "auxiliary_loss_mlp": 0.01265064, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01255659, + "epoch": 0.7184728693822335, + "flos": 27351863930880.0, + "grad_norm": 2.584174612007466, + "language_loss": 0.76537007, + "learning_rate": 7.752416974415598e-07, + "loss": 0.84207964, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09411621, + "step": 11950, + "time_per_iteration": 4.074851751327515 + }, + { + "auxiliary_loss_clip": 0.0641187, + "auxiliary_loss_mlp": 0.01266048, + "balance_loss_clip": 0.06275279, + "balance_loss_mlp": 0.01254968, + "epoch": 0.7185329926349016, + "flos": 16514129462400.0, + "grad_norm": 2.1607831663839163, + "language_loss": 0.67883182, + "learning_rate": 7.749338261972282e-07, + "loss": 0.75561094, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11071777, + "step": 11951, + "time_per_iteration": 2.4646525382995605 + }, + { + "auxiliary_loss_clip": 0.06409188, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254549, + "epoch": 0.7185931158875695, + "flos": 23957800047360.0, + "grad_norm": 1.7824491955160577, + "language_loss": 0.78629339, + "learning_rate": 7.746260014075286e-07, + "loss": 0.86304945, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11865234, + "step": 11952, + "time_per_iteration": 2.516615390777588 + }, + { + "auxiliary_loss_clip": 0.06412063, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06272954, + "balance_loss_mlp": 0.0125725, + "epoch": 0.7186532391402375, + "flos": 26549265997440.0, + "grad_norm": 1.8155741690117748, + "language_loss": 0.74781901, + "learning_rate": 7.743182230841352e-07, + "loss": 0.82462001, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10803223, + "step": 11953, + "time_per_iteration": 2.527876853942871 + }, + { + "auxiliary_loss_clip": 0.06407471, + "auxiliary_loss_mlp": 0.01266403, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01256223, + "epoch": 0.7187133623929055, + "flos": 22389682412160.0, + "grad_norm": 1.6183356638137696, + "language_loss": 0.73045003, + "learning_rate": 7.740104912387164e-07, + "loss": 0.80718875, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10180664, + "step": 11954, + "time_per_iteration": 3.9654276371002197 + }, + { + "auxiliary_loss_clip": 0.06407467, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01258372, + "epoch": 0.7187734856455734, + "flos": 15785184867840.0, + "grad_norm": 1.5034974225164766, + "language_loss": 0.74558902, + "learning_rate": 7.737028058829425e-07, + "loss": 0.82235181, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10455322, + "step": 11955, + "time_per_iteration": 2.478512763977051 + }, + { + "auxiliary_loss_clip": 0.0640816, + "auxiliary_loss_mlp": 0.01262735, + "balance_loss_clip": 0.06272267, + "balance_loss_mlp": 0.01253032, + "epoch": 0.7188336088982414, + "flos": 31767766755840.0, + "grad_norm": 1.8388372007030418, + "language_loss": 0.73576057, + "learning_rate": 7.733951670284817e-07, + "loss": 0.81246948, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 11956, + "time_per_iteration": 2.5664751529693604 + }, + { + "auxiliary_loss_clip": 0.06408941, + "auxiliary_loss_mlp": 0.01266307, + "balance_loss_clip": 0.06270766, + "balance_loss_mlp": 0.01255793, + "epoch": 0.7188937321509093, + "flos": 21470734684800.0, + "grad_norm": 1.7841137783080476, + "language_loss": 0.70991242, + "learning_rate": 7.730875746869987e-07, + "loss": 0.7866649, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1050415, + "step": 11957, + "time_per_iteration": 2.5579633712768555 + }, + { + "auxiliary_loss_clip": 0.0641226, + "auxiliary_loss_mlp": 0.01268285, + "balance_loss_clip": 0.06273985, + "balance_loss_mlp": 0.01256966, + "epoch": 0.7189538554035774, + "flos": 27278839497600.0, + "grad_norm": 1.7957042197859685, + "language_loss": 0.74078369, + "learning_rate": 7.727800288701582e-07, + "loss": 0.81758916, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11322021, + "step": 11958, + "time_per_iteration": 3.9170804023742676 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.0125484, + "epoch": 0.7190139786562453, + "flos": 21587168332800.0, + "grad_norm": 1.5040650051227977, + "language_loss": 0.84225762, + "learning_rate": 7.724725295896215e-07, + "loss": 0.91893852, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09667969, + "step": 11959, + "time_per_iteration": 2.506953239440918 + }, + { + "auxiliary_loss_clip": 0.06412622, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06274716, + "balance_loss_mlp": 0.01253665, + "epoch": 0.7190741019089133, + "flos": 26727990756480.0, + "grad_norm": 1.629776742462507, + "language_loss": 0.82108045, + "learning_rate": 7.7216507685705e-07, + "loss": 0.89784372, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10046387, + "step": 11960, + "time_per_iteration": 2.5172626972198486 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01256051, + "epoch": 0.7191342251615812, + "flos": 26112041792640.0, + "grad_norm": 2.013110188990865, + "language_loss": 0.7794981, + "learning_rate": 7.718576706841013e-07, + "loss": 0.85624301, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10241699, + "step": 11961, + "time_per_iteration": 2.585214853286743 + }, + { + "auxiliary_loss_clip": 0.06404266, + "auxiliary_loss_mlp": 0.01266808, + "balance_loss_clip": 0.06274937, + "balance_loss_mlp": 0.01257164, + "epoch": 0.7191943484142492, + "flos": 22973794024320.0, + "grad_norm": 1.3445368370245, + "language_loss": 0.75350589, + "learning_rate": 7.715503110824326e-07, + "loss": 0.83021665, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09643555, + "step": 11962, + "time_per_iteration": 2.5126750469207764 + }, + { + "auxiliary_loss_clip": 0.06408066, + "auxiliary_loss_mlp": 0.01264043, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7192544716669171, + "flos": 22571970969600.0, + "grad_norm": 1.8990374225745255, + "language_loss": 0.7543835, + "learning_rate": 7.712429980637001e-07, + "loss": 0.83110464, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10614014, + "step": 11963, + "time_per_iteration": 2.531531572341919 + }, + { + "auxiliary_loss_clip": 0.0641598, + "auxiliary_loss_mlp": 0.01268254, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01256888, + "epoch": 0.7193145949195852, + "flos": 18986981558400.0, + "grad_norm": 2.117256305222674, + "language_loss": 0.81201178, + "learning_rate": 7.709357316395564e-07, + "loss": 0.88885415, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11364746, + "step": 11964, + "time_per_iteration": 2.455134630203247 + }, + { + "auxiliary_loss_clip": 0.06404482, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01258854, + "epoch": 0.7193747181722531, + "flos": 18010061205120.0, + "grad_norm": 1.7059884029893508, + "language_loss": 0.75202858, + "learning_rate": 7.70628511821652e-07, + "loss": 0.8287617, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09979248, + "step": 11965, + "time_per_iteration": 2.49127459526062 + }, + { + "auxiliary_loss_clip": 0.06410991, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7194348414249211, + "flos": 24396323990400.0, + "grad_norm": 1.448883188350496, + "language_loss": 0.77801377, + "learning_rate": 7.703213386216377e-07, + "loss": 0.85478151, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1060791, + "step": 11966, + "time_per_iteration": 2.5172245502471924 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01265324, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01254953, + "epoch": 0.7194949646775891, + "flos": 22169938279680.0, + "grad_norm": 1.704579112714729, + "language_loss": 0.73619503, + "learning_rate": 7.700142120511619e-07, + "loss": 0.81289935, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10375977, + "step": 11967, + "time_per_iteration": 2.5002834796905518 + }, + { + "auxiliary_loss_clip": 0.06399344, + "auxiliary_loss_mlp": 0.01265984, + "balance_loss_clip": 0.06271313, + "balance_loss_mlp": 0.01256679, + "epoch": 0.719555087930257, + "flos": 20272560825600.0, + "grad_norm": 1.5295572568049065, + "language_loss": 0.82314783, + "learning_rate": 7.6970713212187e-07, + "loss": 0.89980114, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09307861, + "step": 11968, + "time_per_iteration": 2.5851659774780273 + }, + { + "auxiliary_loss_clip": 0.06403178, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.0627176, + "balance_loss_mlp": 0.01255262, + "epoch": 0.719615211182925, + "flos": 24723026259840.0, + "grad_norm": 1.755748062324177, + "language_loss": 0.76839387, + "learning_rate": 7.69400098845407e-07, + "loss": 0.84507906, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10070801, + "step": 11969, + "time_per_iteration": 2.52701997756958 + }, + { + "auxiliary_loss_clip": 0.06404562, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06269367, + "balance_loss_mlp": 0.01255973, + "epoch": 0.719675334435593, + "flos": 20015570753280.0, + "grad_norm": 1.3860945342705195, + "language_loss": 0.71083385, + "learning_rate": 7.69093112233417e-07, + "loss": 0.78754288, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1036377, + "step": 11970, + "time_per_iteration": 2.4650230407714844 + }, + { + "auxiliary_loss_clip": 0.0631284, + "auxiliary_loss_mlp": 0.01254485, + "balance_loss_clip": 0.06257641, + "balance_loss_mlp": 0.0125341, + "epoch": 0.719735457688261, + "flos": 44215965169920.0, + "grad_norm": 0.888192753215213, + "language_loss": 0.60509741, + "learning_rate": 7.68786172297538e-07, + "loss": 0.68077064, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 11971, + "time_per_iteration": 3.049323558807373 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254541, + "epoch": 0.7197955809409289, + "flos": 16808952453120.0, + "grad_norm": 1.9914531833581635, + "language_loss": 0.79825729, + "learning_rate": 7.684792790494105e-07, + "loss": 0.87503314, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10107422, + "step": 11972, + "time_per_iteration": 2.4930012226104736 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01266584, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7198557041935969, + "flos": 24542330929920.0, + "grad_norm": 1.4491238198032386, + "language_loss": 0.76038206, + "learning_rate": 7.681724325006733e-07, + "loss": 0.83711761, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10095215, + "step": 11973, + "time_per_iteration": 2.548208475112915 + }, + { + "auxiliary_loss_clip": 0.06313819, + "auxiliary_loss_mlp": 0.01251276, + "balance_loss_clip": 0.06258664, + "balance_loss_mlp": 0.01250185, + "epoch": 0.7199158274462648, + "flos": 70729006204800.0, + "grad_norm": 0.8373324972209466, + "language_loss": 0.57018536, + "learning_rate": 7.6786563266296e-07, + "loss": 0.64583629, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01093292, + "step": 11974, + "time_per_iteration": 2.9727988243103027 + }, + { + "auxiliary_loss_clip": 0.06406881, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01256082, + "epoch": 0.7199759506989328, + "flos": 29355151345920.0, + "grad_norm": 2.3495582662204164, + "language_loss": 0.61703098, + "learning_rate": 7.675588795479062e-07, + "loss": 0.69376105, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10058594, + "step": 11975, + "time_per_iteration": 2.5667810440063477 + }, + { + "auxiliary_loss_clip": 0.06407548, + "auxiliary_loss_mlp": 0.01266502, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01256465, + "epoch": 0.7200360739516007, + "flos": 24646689590400.0, + "grad_norm": 1.7506172714592478, + "language_loss": 0.6773572, + "learning_rate": 7.672521731671425e-07, + "loss": 0.7540977, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10040283, + "step": 11976, + "time_per_iteration": 2.5304412841796875 + }, + { + "auxiliary_loss_clip": 0.06406543, + "auxiliary_loss_mlp": 0.01261585, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01252024, + "epoch": 0.7200961972042688, + "flos": 20819007227520.0, + "grad_norm": 1.8109272198274133, + "language_loss": 0.6749649, + "learning_rate": 7.669455135323004e-07, + "loss": 0.75164616, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09564209, + "step": 11977, + "time_per_iteration": 2.547656536102295 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.012691, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01258336, + "epoch": 0.7201563204569367, + "flos": 31253493121920.0, + "grad_norm": 1.5436676151403905, + "language_loss": 0.754664, + "learning_rate": 7.666389006550074e-07, + "loss": 0.83143568, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10766602, + "step": 11978, + "time_per_iteration": 4.067101240158081 + }, + { + "auxiliary_loss_clip": 0.06403241, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 0.06271459, + "balance_loss_mlp": 0.01254327, + "epoch": 0.7202164437096047, + "flos": 26658655902720.0, + "grad_norm": 1.78319056574555, + "language_loss": 0.78890365, + "learning_rate": 7.663323345468908e-07, + "loss": 0.86557764, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09832764, + "step": 11979, + "time_per_iteration": 2.5176994800567627 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255999, + "epoch": 0.7202765669622727, + "flos": 25966999175040.0, + "grad_norm": 1.5387882255892862, + "language_loss": 0.64881861, + "learning_rate": 7.660258152195767e-07, + "loss": 0.72552878, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10235596, + "step": 11980, + "time_per_iteration": 2.5968124866485596 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06272249, + "balance_loss_mlp": 0.01254618, + "epoch": 0.7203366902149406, + "flos": 28519961374080.0, + "grad_norm": 1.8098282466640043, + "language_loss": 0.67242014, + "learning_rate": 7.657193426846871e-07, + "loss": 0.74916333, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10717773, + "step": 11981, + "time_per_iteration": 2.5330793857574463 + }, + { + "auxiliary_loss_clip": 0.0640622, + "auxiliary_loss_mlp": 0.01265599, + "balance_loss_clip": 0.06270846, + "balance_loss_mlp": 0.01255555, + "epoch": 0.7203968134676086, + "flos": 21112446625920.0, + "grad_norm": 1.6958532399278234, + "language_loss": 0.74167675, + "learning_rate": 7.65412916953843e-07, + "loss": 0.81839496, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10040283, + "step": 11982, + "time_per_iteration": 2.510929584503174 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.01256802, + "epoch": 0.7204569367202766, + "flos": 18337937431680.0, + "grad_norm": 1.8860370503158916, + "language_loss": 0.65837574, + "learning_rate": 7.65106538038665e-07, + "loss": 0.73509502, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09387207, + "step": 11983, + "time_per_iteration": 2.4505462646484375 + }, + { + "auxiliary_loss_clip": 0.06406046, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01254445, + "epoch": 0.7205170599729446, + "flos": 23261279783040.0, + "grad_norm": 1.4437514392705604, + "language_loss": 0.66617727, + "learning_rate": 7.648002059507715e-07, + "loss": 0.74288666, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10449219, + "step": 11984, + "time_per_iteration": 2.547555446624756 + }, + { + "auxiliary_loss_clip": 0.06413494, + "auxiliary_loss_mlp": 0.01268675, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01257994, + "epoch": 0.7205771832256125, + "flos": 20127140864640.0, + "grad_norm": 1.765838717363193, + "language_loss": 0.74360126, + "learning_rate": 7.644939207017771e-07, + "loss": 0.82042295, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10687256, + "step": 11985, + "time_per_iteration": 2.4865455627441406 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06272492, + "balance_loss_mlp": 0.01255652, + "epoch": 0.7206373064782805, + "flos": 27709648865280.0, + "grad_norm": 1.7467712742919994, + "language_loss": 0.62577748, + "learning_rate": 7.641876823032977e-07, + "loss": 0.70249057, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0960083, + "step": 11986, + "time_per_iteration": 2.5774106979370117 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06274345, + "balance_loss_mlp": 0.01258951, + "epoch": 0.7206974297309484, + "flos": 17974031149440.0, + "grad_norm": 1.663451860117408, + "language_loss": 0.72484905, + "learning_rate": 7.638814907669455e-07, + "loss": 0.80165857, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11291504, + "step": 11987, + "time_per_iteration": 2.4724771976470947 + }, + { + "auxiliary_loss_clip": 0.06410339, + "auxiliary_loss_mlp": 0.01263822, + "balance_loss_clip": 0.06273559, + "balance_loss_mlp": 0.01253689, + "epoch": 0.7207575529836164, + "flos": 16988893096320.0, + "grad_norm": 2.5242604109279574, + "language_loss": 0.78976148, + "learning_rate": 7.635753461043301e-07, + "loss": 0.86650312, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10125732, + "step": 11988, + "time_per_iteration": 2.495361566543579 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01263556, + "balance_loss_clip": 0.06269506, + "balance_loss_mlp": 0.01253489, + "epoch": 0.7208176762362843, + "flos": 18732465181440.0, + "grad_norm": 1.7087764254113869, + "language_loss": 0.79046804, + "learning_rate": 7.632692483270618e-07, + "loss": 0.86714828, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10064697, + "step": 11989, + "time_per_iteration": 2.5043447017669678 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01267699, + "balance_loss_clip": 0.06270982, + "balance_loss_mlp": 0.01257364, + "epoch": 0.7208777994889524, + "flos": 18740515173120.0, + "grad_norm": 1.790178990562424, + "language_loss": 0.8290503, + "learning_rate": 7.629631974467481e-07, + "loss": 0.90573412, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10345459, + "step": 11990, + "time_per_iteration": 3.926800012588501 + }, + { + "auxiliary_loss_clip": 0.064039, + "auxiliary_loss_mlp": 0.01274305, + "balance_loss_clip": 0.06273188, + "balance_loss_mlp": 0.0126484, + "epoch": 0.7209379227416203, + "flos": 14798705149440.0, + "grad_norm": 2.036094389130557, + "language_loss": 0.7637105, + "learning_rate": 7.626571934749931e-07, + "loss": 0.84049255, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09472656, + "step": 11991, + "time_per_iteration": 2.504420042037964 + }, + { + "auxiliary_loss_clip": 0.06401916, + "auxiliary_loss_mlp": 0.01266823, + "balance_loss_clip": 0.06271645, + "balance_loss_mlp": 0.01256976, + "epoch": 0.7209980459942883, + "flos": 29643559499520.0, + "grad_norm": 1.4029888682461984, + "language_loss": 0.72727466, + "learning_rate": 7.623512364234022e-07, + "loss": 0.80396211, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09844971, + "step": 11992, + "time_per_iteration": 2.5568339824676514 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06273486, + "balance_loss_mlp": 0.01252695, + "epoch": 0.7210581692469563, + "flos": 23483916881280.0, + "grad_norm": 1.4497931031993367, + "language_loss": 0.66405648, + "learning_rate": 7.620453263035755e-07, + "loss": 0.74078965, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 11993, + "time_per_iteration": 2.6186561584472656 + }, + { + "auxiliary_loss_clip": 0.06405848, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06271709, + "balance_loss_mlp": 0.01259695, + "epoch": 0.7211182924996242, + "flos": 26106297788160.0, + "grad_norm": 1.8933872495895026, + "language_loss": 0.6622234, + "learning_rate": 7.61739463127115e-07, + "loss": 0.73897809, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.0993042, + "step": 11994, + "time_per_iteration": 3.895599126815796 + }, + { + "auxiliary_loss_clip": 0.06404895, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01255888, + "epoch": 0.7211784157522922, + "flos": 17717795763840.0, + "grad_norm": 1.9331486787733179, + "language_loss": 0.67162377, + "learning_rate": 7.614336469056172e-07, + "loss": 0.7483362, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11995, + "time_per_iteration": 2.4796035289764404 + }, + { + "auxiliary_loss_clip": 0.06403686, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01254721, + "epoch": 0.7212385390049602, + "flos": 24430173840000.0, + "grad_norm": 1.6348621026253527, + "language_loss": 0.7952925, + "learning_rate": 7.6112787765068e-07, + "loss": 0.87198234, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.10577393, + "step": 11996, + "time_per_iteration": 2.513824939727783 + }, + { + "auxiliary_loss_clip": 0.06409439, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01255056, + "epoch": 0.7212986622576282, + "flos": 28154755353600.0, + "grad_norm": 3.3591238798386285, + "language_loss": 0.81663775, + "learning_rate": 7.60822155373899e-07, + "loss": 0.89338481, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 11997, + "time_per_iteration": 3.9435391426086426 + }, + { + "auxiliary_loss_clip": 0.06409244, + "auxiliary_loss_mlp": 0.01266354, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255363, + "epoch": 0.7213587855102961, + "flos": 21842313615360.0, + "grad_norm": 1.9166262285811178, + "language_loss": 0.67322028, + "learning_rate": 7.605164800868646e-07, + "loss": 0.74997622, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10992432, + "step": 11998, + "time_per_iteration": 2.496742010116577 + }, + { + "auxiliary_loss_clip": 0.06405417, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06271802, + "balance_loss_mlp": 0.01256777, + "epoch": 0.7214189087629641, + "flos": 14616877789440.0, + "grad_norm": 1.7752534320688365, + "language_loss": 0.72513527, + "learning_rate": 7.602108518011696e-07, + "loss": 0.80184972, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 11999, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.0640653, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06272668, + "balance_loss_mlp": 0.01256158, + "epoch": 0.721479032015632, + "flos": 19396938458880.0, + "grad_norm": 2.0883117148535937, + "language_loss": 0.83569586, + "learning_rate": 7.599052705284039e-07, + "loss": 0.91242623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10351562, + "step": 12000, + "time_per_iteration": 2.4941916465759277 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01262001, + "balance_loss_clip": 0.06275104, + "balance_loss_mlp": 0.01251826, + "epoch": 0.7215391552683, + "flos": 18518423126400.0, + "grad_norm": 1.7464338798301249, + "language_loss": 0.77261817, + "learning_rate": 7.59599736280154e-07, + "loss": 0.8493349, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10174561, + "step": 12001, + "time_per_iteration": 2.4661076068878174 + }, + { + "auxiliary_loss_clip": 0.0640439, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01258323, + "epoch": 0.721599278520968, + "flos": 23265514414080.0, + "grad_norm": 2.52401774728115, + "language_loss": 0.81887865, + "learning_rate": 7.592942490680066e-07, + "loss": 0.89560032, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09454346, + "step": 12002, + "time_per_iteration": 2.5698509216308594 + }, + { + "auxiliary_loss_clip": 0.06409481, + "auxiliary_loss_mlp": 0.01264806, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01254363, + "epoch": 0.721659401773636, + "flos": 39207831615360.0, + "grad_norm": 2.1337554314771117, + "language_loss": 0.62387294, + "learning_rate": 7.589888089035462e-07, + "loss": 0.70061582, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 12003, + "time_per_iteration": 2.646667003631592 + }, + { + "auxiliary_loss_clip": 0.06408672, + "auxiliary_loss_mlp": 0.01269946, + "balance_loss_clip": 0.06271918, + "balance_loss_mlp": 0.01258639, + "epoch": 0.7217195250263039, + "flos": 14945299067520.0, + "grad_norm": 3.165928110898167, + "language_loss": 0.69158828, + "learning_rate": 7.586834157983544e-07, + "loss": 0.76837444, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11297607, + "step": 12004, + "time_per_iteration": 2.4904415607452393 + }, + { + "auxiliary_loss_clip": 0.06301466, + "auxiliary_loss_mlp": 0.0124999, + "balance_loss_clip": 0.06246269, + "balance_loss_mlp": 0.01249087, + "epoch": 0.7217796482789719, + "flos": 70889477973120.0, + "grad_norm": 0.8473059140767815, + "language_loss": 0.54124975, + "learning_rate": 7.583780697640112e-07, + "loss": 0.61676431, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00901794, + "step": 12005, + "time_per_iteration": 3.085909366607666 + }, + { + "auxiliary_loss_clip": 0.06406818, + "auxiliary_loss_mlp": 0.0126308, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7218397715316398, + "flos": 37460653804800.0, + "grad_norm": 1.5183383178903638, + "language_loss": 0.63201904, + "learning_rate": 7.580727708120962e-07, + "loss": 0.708718, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09588623, + "step": 12006, + "time_per_iteration": 2.7121994495391846 + }, + { + "auxiliary_loss_clip": 0.06407326, + "auxiliary_loss_mlp": 0.01263158, + "balance_loss_clip": 0.0627062, + "balance_loss_mlp": 0.0125352, + "epoch": 0.7218998947843078, + "flos": 22717223222400.0, + "grad_norm": 1.5926677831370504, + "language_loss": 0.92170072, + "learning_rate": 7.577675189541865e-07, + "loss": 0.99840552, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09643555, + "step": 12007, + "time_per_iteration": 2.534914016723633 + }, + { + "auxiliary_loss_clip": 0.06408784, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06272783, + "balance_loss_mlp": 0.01255191, + "epoch": 0.7219600180369758, + "flos": 12172131538560.0, + "grad_norm": 1.6024431968555108, + "language_loss": 0.63807905, + "learning_rate": 7.574623142018568e-07, + "loss": 0.71482843, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10961914, + "step": 12008, + "time_per_iteration": 2.5015389919281006 + }, + { + "auxiliary_loss_clip": 0.0641045, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256144, + "epoch": 0.7220201412896438, + "flos": 22602340874880.0, + "grad_norm": 1.927754748237573, + "language_loss": 0.79281247, + "learning_rate": 7.57157156566681e-07, + "loss": 0.86958218, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1038208, + "step": 12009, + "time_per_iteration": 2.5008604526519775 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269218, + "balance_loss_mlp": 0.01255533, + "epoch": 0.7220802645423118, + "flos": 26724972009600.0, + "grad_norm": 2.605024867459915, + "language_loss": 0.6418041, + "learning_rate": 7.568520460602297e-07, + "loss": 0.71854436, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11450195, + "step": 12010, + "time_per_iteration": 2.527949571609497 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01266927, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01256854, + "epoch": 0.7221403877949797, + "flos": 24426568114560.0, + "grad_norm": 1.594533265957021, + "language_loss": 0.77320325, + "learning_rate": 7.565469826940742e-07, + "loss": 0.84991425, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10070801, + "step": 12011, + "time_per_iteration": 2.5198636054992676 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263324, + "balance_loss_clip": 0.06273598, + "balance_loss_mlp": 0.0125368, + "epoch": 0.7222005110476477, + "flos": 23521246675200.0, + "grad_norm": 1.6737582547209497, + "language_loss": 0.79734701, + "learning_rate": 7.56241966479781e-07, + "loss": 0.87406272, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09637451, + "step": 12012, + "time_per_iteration": 2.5218822956085205 + }, + { + "auxiliary_loss_clip": 0.06409319, + "auxiliary_loss_mlp": 0.01264498, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254955, + "epoch": 0.7222606343003156, + "flos": 23119255912320.0, + "grad_norm": 2.6909809043391744, + "language_loss": 0.76237571, + "learning_rate": 7.559369974289171e-07, + "loss": 0.83911389, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09533691, + "step": 12013, + "time_per_iteration": 2.501549005508423 + }, + { + "auxiliary_loss_clip": 0.06401782, + "auxiliary_loss_mlp": 0.01266309, + "balance_loss_clip": 0.06270641, + "balance_loss_mlp": 0.01256456, + "epoch": 0.7223207575529836, + "flos": 24357778312320.0, + "grad_norm": 1.4242237370924462, + "language_loss": 0.76199239, + "learning_rate": 7.556320755530484e-07, + "loss": 0.83867329, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09851074, + "step": 12014, + "time_per_iteration": 2.6219167709350586 + }, + { + "auxiliary_loss_clip": 0.0640952, + "auxiliary_loss_mlp": 0.01262375, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01252445, + "epoch": 0.7223808808056515, + "flos": 28337798597760.0, + "grad_norm": 1.6715764427822655, + "language_loss": 0.86861187, + "learning_rate": 7.553272008637346e-07, + "loss": 0.9453308, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09924316, + "step": 12015, + "time_per_iteration": 2.5629379749298096 + }, + { + "auxiliary_loss_clip": 0.0640379, + "auxiliary_loss_mlp": 0.01267259, + "balance_loss_clip": 0.06271358, + "balance_loss_mlp": 0.01257365, + "epoch": 0.7224410040583196, + "flos": 21075829591680.0, + "grad_norm": 2.031854447065517, + "language_loss": 0.78420502, + "learning_rate": 7.55022373372538e-07, + "loss": 0.86091554, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09899902, + "step": 12016, + "time_per_iteration": 2.549696207046509 + }, + { + "auxiliary_loss_clip": 0.06403818, + "auxiliary_loss_mlp": 0.01265816, + "balance_loss_clip": 0.06270836, + "balance_loss_mlp": 0.01255839, + "epoch": 0.7225011273109875, + "flos": 26802398782080.0, + "grad_norm": 1.3727875388559247, + "language_loss": 0.77603066, + "learning_rate": 7.547175930910186e-07, + "loss": 0.85272694, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09979248, + "step": 12017, + "time_per_iteration": 2.5937881469726562 + }, + { + "auxiliary_loss_clip": 0.06402834, + "auxiliary_loss_mlp": 0.01265872, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7225612505636555, + "flos": 23589826842240.0, + "grad_norm": 1.6197156862149726, + "language_loss": 0.74198735, + "learning_rate": 7.54412860030732e-07, + "loss": 0.81867433, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09783936, + "step": 12018, + "time_per_iteration": 3.996819257736206 + }, + { + "auxiliary_loss_clip": 0.06402058, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01260812, + "epoch": 0.7226213738163234, + "flos": 20783983420800.0, + "grad_norm": 1.7233802894536456, + "language_loss": 0.77552009, + "learning_rate": 7.541081742032347e-07, + "loss": 0.85224223, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09344482, + "step": 12019, + "time_per_iteration": 2.52474308013916 + }, + { + "auxiliary_loss_clip": 0.0640556, + "auxiliary_loss_mlp": 0.01263394, + "balance_loss_clip": 0.06272571, + "balance_loss_mlp": 0.01253363, + "epoch": 0.7226814970689914, + "flos": 32644227663360.0, + "grad_norm": 1.6248881332172511, + "language_loss": 0.73835564, + "learning_rate": 7.53803535620081e-07, + "loss": 0.81504518, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10028076, + "step": 12020, + "time_per_iteration": 2.577397346496582 + }, + { + "auxiliary_loss_clip": 0.06409635, + "auxiliary_loss_mlp": 0.01262192, + "balance_loss_clip": 0.06272969, + "balance_loss_mlp": 0.01252054, + "epoch": 0.7227416203216595, + "flos": 22460736274560.0, + "grad_norm": 1.6075634360932833, + "language_loss": 0.77574962, + "learning_rate": 7.534989442928219e-07, + "loss": 0.85246789, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10137939, + "step": 12021, + "time_per_iteration": 2.530141592025757 + }, + { + "auxiliary_loss_clip": 0.06403421, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.01255267, + "epoch": 0.7228017435743274, + "flos": 21658641465600.0, + "grad_norm": 1.5420069016517286, + "language_loss": 0.68414694, + "learning_rate": 7.531944002330073e-07, + "loss": 0.76083142, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09765625, + "step": 12022, + "time_per_iteration": 2.504757881164551 + }, + { + "auxiliary_loss_clip": 0.06407183, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.0125613, + "epoch": 0.7228618668269954, + "flos": 29541171409920.0, + "grad_norm": 1.8382982507035688, + "language_loss": 0.69865435, + "learning_rate": 7.528899034521858e-07, + "loss": 0.77538919, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12023, + "time_per_iteration": 2.572157859802246 + }, + { + "auxiliary_loss_clip": 0.06405231, + "auxiliary_loss_mlp": 0.01262251, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01252356, + "epoch": 0.7229219900796633, + "flos": 27461169982080.0, + "grad_norm": 1.6264829845814306, + "language_loss": 0.71353316, + "learning_rate": 7.525854539619052e-07, + "loss": 0.79020798, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09887695, + "step": 12024, + "time_per_iteration": 2.548758029937744 + }, + { + "auxiliary_loss_clip": 0.06407243, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7229821133323313, + "flos": 16294888454400.0, + "grad_norm": 2.8784491415688427, + "language_loss": 0.75972795, + "learning_rate": 7.522810517737089e-07, + "loss": 0.83645153, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10168457, + "step": 12025, + "time_per_iteration": 2.4729340076446533 + }, + { + "auxiliary_loss_clip": 0.06403269, + "auxiliary_loss_mlp": 0.01264783, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01255049, + "epoch": 0.7230422365849992, + "flos": 20418567765120.0, + "grad_norm": 1.900331951753324, + "language_loss": 0.76300782, + "learning_rate": 7.519766968991395e-07, + "loss": 0.83968836, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09741211, + "step": 12026, + "time_per_iteration": 2.4887609481811523 + }, + { + "auxiliary_loss_clip": 0.06407255, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.01253114, + "epoch": 0.7231023598376672, + "flos": 25600619197440.0, + "grad_norm": 1.727853118389861, + "language_loss": 0.67822838, + "learning_rate": 7.516723893497388e-07, + "loss": 0.75493264, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 12027, + "time_per_iteration": 2.5328831672668457 + }, + { + "auxiliary_loss_clip": 0.06409849, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06273012, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7231624830903352, + "flos": 25155638490240.0, + "grad_norm": 20.233836516227683, + "language_loss": 0.79796958, + "learning_rate": 7.513681291370469e-07, + "loss": 0.87474453, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11260986, + "step": 12028, + "time_per_iteration": 2.5175299644470215 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01262813, + "balance_loss_clip": 0.06271036, + "balance_loss_mlp": 0.01252722, + "epoch": 0.7232226063430032, + "flos": 21732169023360.0, + "grad_norm": 1.6712799697819898, + "language_loss": 0.8266964, + "learning_rate": 7.510639162726e-07, + "loss": 0.90339005, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10089111, + "step": 12029, + "time_per_iteration": 3.9506967067718506 + }, + { + "auxiliary_loss_clip": 0.06311534, + "auxiliary_loss_mlp": 0.01251495, + "balance_loss_clip": 0.06256342, + "balance_loss_mlp": 0.01250514, + "epoch": 0.7232827295956711, + "flos": 68458693426560.0, + "grad_norm": 0.7790969864555375, + "language_loss": 0.6171549, + "learning_rate": 7.507597507679347e-07, + "loss": 0.6927852, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00980377, + "step": 12030, + "time_per_iteration": 3.187685489654541 + }, + { + "auxiliary_loss_clip": 0.06405394, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01255753, + "epoch": 0.7233428528483391, + "flos": 20198697851520.0, + "grad_norm": 1.6342080054038326, + "language_loss": 0.78514922, + "learning_rate": 7.504556326345859e-07, + "loss": 0.86186063, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09997559, + "step": 12031, + "time_per_iteration": 2.47151255607605 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254465, + "epoch": 0.723402976101007, + "flos": 23955955257600.0, + "grad_norm": 1.8287937473952962, + "language_loss": 0.81728959, + "learning_rate": 7.501515618840834e-07, + "loss": 0.894054, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10656738, + "step": 12032, + "time_per_iteration": 2.5481441020965576 + }, + { + "auxiliary_loss_clip": 0.06416769, + "auxiliary_loss_mlp": 0.01265155, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.01254485, + "epoch": 0.723463099353675, + "flos": 20819636133120.0, + "grad_norm": 1.8204115009796795, + "language_loss": 0.75397038, + "learning_rate": 7.498475385279592e-07, + "loss": 0.83078963, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10662842, + "step": 12033, + "time_per_iteration": 3.957021951675415 + }, + { + "auxiliary_loss_clip": 0.0640196, + "auxiliary_loss_mlp": 0.01261304, + "balance_loss_clip": 0.06271483, + "balance_loss_mlp": 0.01251874, + "epoch": 0.723523222606343, + "flos": 19103876403840.0, + "grad_norm": 1.563188843970664, + "language_loss": 0.75271815, + "learning_rate": 7.495435625777423e-07, + "loss": 0.82935083, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09423828, + "step": 12034, + "time_per_iteration": 2.479860782623291 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01252146, + "epoch": 0.723583345859011, + "flos": 26514493752960.0, + "grad_norm": 1.7350921748415202, + "language_loss": 0.80701005, + "learning_rate": 7.492396340449578e-07, + "loss": 0.88370025, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09680176, + "step": 12035, + "time_per_iteration": 2.559680700302124 + }, + { + "auxiliary_loss_clip": 0.06410785, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06273998, + "balance_loss_mlp": 0.01253361, + "epoch": 0.723643469111679, + "flos": 16039323901440.0, + "grad_norm": 3.114522084917199, + "language_loss": 0.61466223, + "learning_rate": 7.489357529411326e-07, + "loss": 0.69140834, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 12036, + "time_per_iteration": 2.4680371284484863 + }, + { + "auxiliary_loss_clip": 0.06403697, + "auxiliary_loss_mlp": 0.01264009, + "balance_loss_clip": 0.06272744, + "balance_loss_mlp": 0.01254914, + "epoch": 0.7237035923643469, + "flos": 21952164718080.0, + "grad_norm": 1.4930749372643133, + "language_loss": 0.67717707, + "learning_rate": 7.486319192777883e-07, + "loss": 0.75385416, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09094238, + "step": 12037, + "time_per_iteration": 3.957728862762451 + }, + { + "auxiliary_loss_clip": 0.06406017, + "auxiliary_loss_mlp": 0.01265379, + "balance_loss_clip": 0.06273565, + "balance_loss_mlp": 0.01255091, + "epoch": 0.7237637156170149, + "flos": 23589281790720.0, + "grad_norm": 1.7134802369768287, + "language_loss": 0.73071694, + "learning_rate": 7.483281330664479e-07, + "loss": 0.80743086, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10296631, + "step": 12038, + "time_per_iteration": 2.5239899158477783 + }, + { + "auxiliary_loss_clip": 0.06408326, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.0125625, + "epoch": 0.7238238388696828, + "flos": 20600940176640.0, + "grad_norm": 1.583420390669157, + "language_loss": 0.72335035, + "learning_rate": 7.480243943186293e-07, + "loss": 0.80011058, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.11437988, + "step": 12039, + "time_per_iteration": 2.5016210079193115 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01262586, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.0125346, + "epoch": 0.7238839621223508, + "flos": 24213909651840.0, + "grad_norm": 1.553952761498081, + "language_loss": 0.7617048, + "learning_rate": 7.477207030458513e-07, + "loss": 0.83841777, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09130859, + "step": 12040, + "time_per_iteration": 2.4979355335235596 + }, + { + "auxiliary_loss_clip": 0.0640977, + "auxiliary_loss_mlp": 0.01263735, + "balance_loss_clip": 0.06273755, + "balance_loss_mlp": 0.01252898, + "epoch": 0.7239440853750188, + "flos": 14214928953600.0, + "grad_norm": 1.6058378864892022, + "language_loss": 0.77005613, + "learning_rate": 7.474170592596301e-07, + "loss": 0.84679121, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10845947, + "step": 12041, + "time_per_iteration": 2.519228458404541 + }, + { + "auxiliary_loss_clip": 0.06408431, + "auxiliary_loss_mlp": 0.01263027, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01253365, + "epoch": 0.7240042086276868, + "flos": 21620976255360.0, + "grad_norm": 1.9889626365674344, + "language_loss": 0.63348103, + "learning_rate": 7.471134629714797e-07, + "loss": 0.7101956, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09667969, + "step": 12042, + "time_per_iteration": 2.475182294845581 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01268078, + "balance_loss_clip": 0.06275245, + "balance_loss_mlp": 0.012567, + "epoch": 0.7240643318803547, + "flos": 23338203431040.0, + "grad_norm": 1.8474585554645233, + "language_loss": 0.83173352, + "learning_rate": 7.468099141929116e-07, + "loss": 0.90852207, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11376953, + "step": 12043, + "time_per_iteration": 2.5139901638031006 + }, + { + "auxiliary_loss_clip": 0.06409861, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06273165, + "balance_loss_mlp": 0.01256354, + "epoch": 0.7241244551330227, + "flos": 24031746875520.0, + "grad_norm": 2.293056245042729, + "language_loss": 0.64671153, + "learning_rate": 7.465064129354379e-07, + "loss": 0.72348469, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11102295, + "step": 12044, + "time_per_iteration": 2.499971866607666 + }, + { + "auxiliary_loss_clip": 0.06411785, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01254781, + "epoch": 0.7241845783856906, + "flos": 18735651636480.0, + "grad_norm": 1.9189721390747507, + "language_loss": 0.81796312, + "learning_rate": 7.462029592105658e-07, + "loss": 0.89473093, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10211182, + "step": 12045, + "time_per_iteration": 2.4791791439056396 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06274088, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7242447016383586, + "flos": 19504483574400.0, + "grad_norm": 2.888520203836974, + "language_loss": 0.72249848, + "learning_rate": 7.458995530298034e-07, + "loss": 0.79920763, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.1050415, + "step": 12046, + "time_per_iteration": 2.4642648696899414 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06273885, + "balance_loss_mlp": 0.01254396, + "epoch": 0.7243048248910267, + "flos": 22169980206720.0, + "grad_norm": 1.724287594820583, + "language_loss": 0.71379775, + "learning_rate": 7.455961944046553e-07, + "loss": 0.79054451, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10571289, + "step": 12047, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.06410667, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01253673, + "epoch": 0.7243649481436946, + "flos": 27680159427840.0, + "grad_norm": 1.6409687158316038, + "language_loss": 0.70148283, + "learning_rate": 7.45292883346627e-07, + "loss": 0.77823687, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11065674, + "step": 12048, + "time_per_iteration": 2.537400007247925 + }, + { + "auxiliary_loss_clip": 0.06309511, + "auxiliary_loss_mlp": 0.01254196, + "balance_loss_clip": 0.06254156, + "balance_loss_mlp": 0.01253124, + "epoch": 0.7244250713963626, + "flos": 63263686538880.0, + "grad_norm": 0.8079275009265211, + "language_loss": 0.53702354, + "learning_rate": 7.449896198672168e-07, + "loss": 0.61266059, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01072693, + "step": 12049, + "time_per_iteration": 3.117490768432617 + }, + { + "auxiliary_loss_clip": 0.06415777, + "auxiliary_loss_mlp": 0.01264713, + "balance_loss_clip": 0.06273454, + "balance_loss_mlp": 0.01252971, + "epoch": 0.7244851946490305, + "flos": 17972815265280.0, + "grad_norm": 2.160877059772018, + "language_loss": 0.60396636, + "learning_rate": 7.446864039779258e-07, + "loss": 0.68077123, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11737061, + "step": 12050, + "time_per_iteration": 2.4579668045043945 + }, + { + "auxiliary_loss_clip": 0.06310994, + "auxiliary_loss_mlp": 0.01250921, + "balance_loss_clip": 0.06255537, + "balance_loss_mlp": 0.01249847, + "epoch": 0.7245453179016985, + "flos": 70964179488000.0, + "grad_norm": 0.6964887094333322, + "language_loss": 0.53128082, + "learning_rate": 7.443832356902528e-07, + "loss": 0.60689998, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01075745, + "step": 12051, + "time_per_iteration": 3.1524975299835205 + }, + { + "auxiliary_loss_clip": 0.06405707, + "auxiliary_loss_mlp": 0.01263012, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253594, + "epoch": 0.7246054411543664, + "flos": 24574839114240.0, + "grad_norm": 1.4328858557340107, + "language_loss": 0.71919692, + "learning_rate": 7.440801150156927e-07, + "loss": 0.79588413, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09417725, + "step": 12052, + "time_per_iteration": 2.599375009536743 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275947, + "balance_loss_mlp": 0.01255608, + "epoch": 0.7246655644070344, + "flos": 32345715093120.0, + "grad_norm": 1.7264545008228058, + "language_loss": 0.74337375, + "learning_rate": 7.437770419657415e-07, + "loss": 0.8201319, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10430908, + "step": 12053, + "time_per_iteration": 2.572556495666504 + }, + { + "auxiliary_loss_clip": 0.06411305, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06278073, + "balance_loss_mlp": 0.01254952, + "epoch": 0.7247256876597024, + "flos": 21879056430720.0, + "grad_norm": 2.130811806275834, + "language_loss": 0.78439468, + "learning_rate": 7.434740165518898e-07, + "loss": 0.86116385, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10650635, + "step": 12054, + "time_per_iteration": 2.594451427459717 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06276123, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7247858109123704, + "flos": 16218048660480.0, + "grad_norm": 2.4211075094396692, + "language_loss": 0.68897808, + "learning_rate": 7.431710387856301e-07, + "loss": 0.76571441, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10406494, + "step": 12055, + "time_per_iteration": 2.490989923477173 + }, + { + "auxiliary_loss_clip": 0.06406957, + "auxiliary_loss_mlp": 0.01264855, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7248459341650383, + "flos": 20857091708160.0, + "grad_norm": 1.6323335153205245, + "language_loss": 0.74211532, + "learning_rate": 7.428681086784496e-07, + "loss": 0.81883347, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09387207, + "step": 12056, + "time_per_iteration": 2.5162346363067627 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01261212, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.0125152, + "epoch": 0.7249060574177063, + "flos": 25928956621440.0, + "grad_norm": 1.8158169987002448, + "language_loss": 0.70777828, + "learning_rate": 7.425652262418368e-07, + "loss": 0.78444564, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09680176, + "step": 12057, + "time_per_iteration": 4.079265594482422 + }, + { + "auxiliary_loss_clip": 0.0641495, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258704, + "epoch": 0.7249661806703742, + "flos": 17350912661760.0, + "grad_norm": 1.9388728601507708, + "language_loss": 0.62604892, + "learning_rate": 7.42262391487277e-07, + "loss": 0.70289254, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1071167, + "step": 12058, + "time_per_iteration": 2.567502737045288 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01264257, + "balance_loss_clip": 0.06279195, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7250263039230422, + "flos": 19580400973440.0, + "grad_norm": 1.9516605705856642, + "language_loss": 0.75217509, + "learning_rate": 7.419596044262535e-07, + "loss": 0.82894444, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10290527, + "step": 12059, + "time_per_iteration": 2.4943277835845947 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01262509, + "balance_loss_clip": 0.06274289, + "balance_loss_mlp": 0.01253282, + "epoch": 0.7250864271757103, + "flos": 21982366915200.0, + "grad_norm": 1.7883051719653056, + "language_loss": 0.79778695, + "learning_rate": 7.416568650702472e-07, + "loss": 0.87446392, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09222412, + "step": 12060, + "time_per_iteration": 2.519117593765259 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266886, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01256449, + "epoch": 0.7251465504283782, + "flos": 25020113310720.0, + "grad_norm": 1.8093299142299697, + "language_loss": 0.76421869, + "learning_rate": 7.413541734307393e-07, + "loss": 0.84101641, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10443115, + "step": 12061, + "time_per_iteration": 2.5503969192504883 + }, + { + "auxiliary_loss_clip": 0.06405508, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06275885, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7252066736810462, + "flos": 16695621406080.0, + "grad_norm": 1.6247315463998022, + "language_loss": 0.81481957, + "learning_rate": 7.410515295192068e-07, + "loss": 0.89151287, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10083008, + "step": 12062, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.06418011, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.06279325, + "balance_loss_mlp": 0.01255066, + "epoch": 0.7252667969337141, + "flos": 25710176810880.0, + "grad_norm": 2.2019312286273705, + "language_loss": 0.69337016, + "learning_rate": 7.407489333471262e-07, + "loss": 0.77020884, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10778809, + "step": 12063, + "time_per_iteration": 2.5213000774383545 + }, + { + "auxiliary_loss_clip": 0.06404665, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06275742, + "balance_loss_mlp": 0.01255186, + "epoch": 0.7253269201863821, + "flos": 18265835393280.0, + "grad_norm": 1.3337230483147808, + "language_loss": 0.70080262, + "learning_rate": 7.40446384925973e-07, + "loss": 0.77749866, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09759521, + "step": 12064, + "time_per_iteration": 2.4883687496185303 + }, + { + "auxiliary_loss_clip": 0.06412718, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01253846, + "epoch": 0.72538704343905, + "flos": 20417938859520.0, + "grad_norm": 1.6031100014197759, + "language_loss": 0.90715456, + "learning_rate": 7.401438842672192e-07, + "loss": 0.98392093, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10076904, + "step": 12065, + "time_per_iteration": 2.6608688831329346 + }, + { + "auxiliary_loss_clip": 0.06315897, + "auxiliary_loss_mlp": 0.01252262, + "balance_loss_clip": 0.0626056, + "balance_loss_mlp": 0.01251238, + "epoch": 0.725447166691718, + "flos": 70173321125760.0, + "grad_norm": 0.6440962314349006, + "language_loss": 0.56150329, + "learning_rate": 7.398414313823349e-07, + "loss": 0.63718486, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12066, + "time_per_iteration": 3.253070592880249 + }, + { + "auxiliary_loss_clip": 0.064081, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01257029, + "epoch": 0.725507289944386, + "flos": 27059598489600.0, + "grad_norm": 1.6969511416209166, + "language_loss": 0.76925343, + "learning_rate": 7.395390262827897e-07, + "loss": 0.84600002, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09533691, + "step": 12067, + "time_per_iteration": 2.553955554962158 + }, + { + "auxiliary_loss_clip": 0.0632008, + "auxiliary_loss_mlp": 0.01251739, + "balance_loss_clip": 0.06264634, + "balance_loss_mlp": 0.01250711, + "epoch": 0.725567413197054, + "flos": 62941973587200.0, + "grad_norm": 0.7126407397816765, + "language_loss": 0.56957459, + "learning_rate": 7.392366689800515e-07, + "loss": 0.64529276, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01028442, + "step": 12068, + "time_per_iteration": 3.020040512084961 + }, + { + "auxiliary_loss_clip": 0.06320577, + "auxiliary_loss_mlp": 0.01251119, + "balance_loss_clip": 0.0626526, + "balance_loss_mlp": 0.01250047, + "epoch": 0.7256275364497219, + "flos": 60315735392640.0, + "grad_norm": 0.6491964300681237, + "language_loss": 0.55317146, + "learning_rate": 7.389343594855848e-07, + "loss": 0.62888843, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01074219, + "step": 12069, + "time_per_iteration": 4.627661228179932 + }, + { + "auxiliary_loss_clip": 0.0640723, + "auxiliary_loss_mlp": 0.01261481, + "balance_loss_clip": 0.06277817, + "balance_loss_mlp": 0.01252726, + "epoch": 0.7256876597023899, + "flos": 24505378479360.0, + "grad_norm": 2.803632714871867, + "language_loss": 0.80079329, + "learning_rate": 7.38632097810854e-07, + "loss": 0.87748045, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08770752, + "step": 12070, + "time_per_iteration": 2.5643179416656494 + }, + { + "auxiliary_loss_clip": 0.06405459, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06277329, + "balance_loss_mlp": 0.01252867, + "epoch": 0.7257477829550578, + "flos": 24359623102080.0, + "grad_norm": 1.9027271039299547, + "language_loss": 0.72591138, + "learning_rate": 7.383298839673197e-07, + "loss": 0.80259442, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09979248, + "step": 12071, + "time_per_iteration": 2.527245283126831 + }, + { + "auxiliary_loss_clip": 0.06408995, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01258379, + "epoch": 0.7258079062077258, + "flos": 17208008323200.0, + "grad_norm": 1.784714322475179, + "language_loss": 0.70686817, + "learning_rate": 7.380277179664436e-07, + "loss": 0.78364313, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10113525, + "step": 12072, + "time_per_iteration": 3.9422738552093506 + }, + { + "auxiliary_loss_clip": 0.06411255, + "auxiliary_loss_mlp": 0.01265945, + "balance_loss_clip": 0.06273982, + "balance_loss_mlp": 0.01255264, + "epoch": 0.7258680294603939, + "flos": 21586832916480.0, + "grad_norm": 1.7307594033578553, + "language_loss": 0.79001957, + "learning_rate": 7.377255998196821e-07, + "loss": 0.86679161, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10675049, + "step": 12073, + "time_per_iteration": 2.5204336643218994 + }, + { + "auxiliary_loss_clip": 0.06408107, + "auxiliary_loss_mlp": 0.01262862, + "balance_loss_clip": 0.06276815, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7259281527130618, + "flos": 34863150360960.0, + "grad_norm": 1.4580787781655038, + "language_loss": 0.7035231, + "learning_rate": 7.374235295384923e-07, + "loss": 0.78023279, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09942627, + "step": 12074, + "time_per_iteration": 2.6230850219726562 + }, + { + "auxiliary_loss_clip": 0.06411288, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06275726, + "balance_loss_mlp": 0.01255342, + "epoch": 0.7259882759657298, + "flos": 25410657991680.0, + "grad_norm": 2.2056247097324193, + "language_loss": 0.74623215, + "learning_rate": 7.371215071343302e-07, + "loss": 0.82299727, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09875488, + "step": 12075, + "time_per_iteration": 2.556225538253784 + }, + { + "auxiliary_loss_clip": 0.06410095, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7260483992183977, + "flos": 62966781924480.0, + "grad_norm": 1.5598815820341405, + "language_loss": 0.64038914, + "learning_rate": 7.368195326186458e-07, + "loss": 0.71713918, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10803223, + "step": 12076, + "time_per_iteration": 4.355054616928101 + }, + { + "auxiliary_loss_clip": 0.064101, + "auxiliary_loss_mlp": 0.01263502, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7261085224710657, + "flos": 26474522555520.0, + "grad_norm": 1.8575056289170144, + "language_loss": 0.7908951, + "learning_rate": 7.365176060028912e-07, + "loss": 0.86763114, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09796143, + "step": 12077, + "time_per_iteration": 2.5509204864501953 + }, + { + "auxiliary_loss_clip": 0.06314351, + "auxiliary_loss_mlp": 0.01251566, + "balance_loss_clip": 0.06259085, + "balance_loss_mlp": 0.01250447, + "epoch": 0.7261686457237336, + "flos": 66790634198400.0, + "grad_norm": 0.8642282673020346, + "language_loss": 0.64994717, + "learning_rate": 7.362157272985163e-07, + "loss": 0.72560632, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01121521, + "step": 12078, + "time_per_iteration": 3.138261556625366 + }, + { + "auxiliary_loss_clip": 0.06315269, + "auxiliary_loss_mlp": 0.0125259, + "balance_loss_clip": 0.06259946, + "balance_loss_mlp": 0.01251419, + "epoch": 0.7262287689764017, + "flos": 70020731640960.0, + "grad_norm": 0.7225013247461266, + "language_loss": 0.59434861, + "learning_rate": 7.359138965169671e-07, + "loss": 0.67002714, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0116806, + "step": 12079, + "time_per_iteration": 3.2418954372406006 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01266491, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256495, + "epoch": 0.7262888922290696, + "flos": 23812212378240.0, + "grad_norm": 1.9020587797469353, + "language_loss": 0.64648104, + "learning_rate": 7.356121136696895e-07, + "loss": 0.72320265, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09997559, + "step": 12080, + "time_per_iteration": 2.559204339981079 + }, + { + "auxiliary_loss_clip": 0.06412919, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7263490154817376, + "flos": 19506412218240.0, + "grad_norm": 2.774312810040863, + "language_loss": 0.70093364, + "learning_rate": 7.35310378768128e-07, + "loss": 0.77771568, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10644531, + "step": 12081, + "time_per_iteration": 2.4881443977355957 + }, + { + "auxiliary_loss_clip": 0.06414886, + "auxiliary_loss_mlp": 0.01264794, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7264091387344055, + "flos": 16291240801920.0, + "grad_norm": 1.7064307786891335, + "language_loss": 0.81121981, + "learning_rate": 7.350086918237237e-07, + "loss": 0.88801658, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09655762, + "step": 12082, + "time_per_iteration": 2.51804256439209 + }, + { + "auxiliary_loss_clip": 0.06418996, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06277427, + "balance_loss_mlp": 0.01252474, + "epoch": 0.7264692619870735, + "flos": 24358784561280.0, + "grad_norm": 2.224005114416304, + "language_loss": 0.77144599, + "learning_rate": 7.347070528479158e-07, + "loss": 0.84827775, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11706543, + "step": 12083, + "time_per_iteration": 2.5199551582336426 + }, + { + "auxiliary_loss_clip": 0.06416926, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01255441, + "epoch": 0.7265293852397414, + "flos": 25126568323200.0, + "grad_norm": 1.6593932119603014, + "language_loss": 0.72771877, + "learning_rate": 7.344054618521433e-07, + "loss": 0.80454749, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 12084, + "time_per_iteration": 2.5542185306549072 + }, + { + "auxiliary_loss_clip": 0.06412492, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06276167, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7265895084924094, + "flos": 22644869621760.0, + "grad_norm": 1.8149106211320094, + "language_loss": 0.78171599, + "learning_rate": 7.34103918847843e-07, + "loss": 0.85848927, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10656738, + "step": 12085, + "time_per_iteration": 2.5213918685913086 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274905, + "balance_loss_mlp": 0.0125473, + "epoch": 0.7266496317450775, + "flos": 23375030100480.0, + "grad_norm": 1.688683771457735, + "language_loss": 0.7278198, + "learning_rate": 7.338024238464493e-07, + "loss": 0.80456126, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09405518, + "step": 12086, + "time_per_iteration": 2.5169167518615723 + }, + { + "auxiliary_loss_clip": 0.06407881, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255123, + "epoch": 0.7267097549977454, + "flos": 28082150190720.0, + "grad_norm": 1.7618222753787933, + "language_loss": 0.69773293, + "learning_rate": 7.335009768593938e-07, + "loss": 0.77446526, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10247803, + "step": 12087, + "time_per_iteration": 2.552579641342163 + }, + { + "auxiliary_loss_clip": 0.06413816, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06276657, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7267698782504134, + "flos": 22201272506880.0, + "grad_norm": 1.8690535814436378, + "language_loss": 0.79212523, + "learning_rate": 7.331995778981088e-07, + "loss": 0.86891758, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11230469, + "step": 12088, + "time_per_iteration": 2.5224051475524902 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275503, + "balance_loss_mlp": 0.01257561, + "epoch": 0.7268300015030813, + "flos": 18520729113600.0, + "grad_norm": 2.081138271531092, + "language_loss": 0.74134862, + "learning_rate": 7.328982269740221e-07, + "loss": 0.81814551, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10168457, + "step": 12089, + "time_per_iteration": 2.4536690711975098 + }, + { + "auxiliary_loss_clip": 0.06410675, + "auxiliary_loss_mlp": 0.01266044, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125606, + "epoch": 0.7268901247557493, + "flos": 23992530364800.0, + "grad_norm": 1.672566959006191, + "language_loss": 0.71264297, + "learning_rate": 7.325969240985616e-07, + "loss": 0.78941011, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09979248, + "step": 12090, + "time_per_iteration": 2.518209457397461 + }, + { + "auxiliary_loss_clip": 0.06411642, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06275435, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7269502480084172, + "flos": 32096313815040.0, + "grad_norm": 1.7636278155243394, + "language_loss": 0.774212, + "learning_rate": 7.322956692831528e-07, + "loss": 0.85097921, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10864258, + "step": 12091, + "time_per_iteration": 2.5809051990509033 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01262324, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01251947, + "epoch": 0.7270103712610853, + "flos": 19068852597120.0, + "grad_norm": 1.7821213244340646, + "language_loss": 0.71747637, + "learning_rate": 7.319944625392205e-07, + "loss": 0.79417133, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10375977, + "step": 12092, + "time_per_iteration": 2.5037333965301514 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01252718, + "epoch": 0.7270704945137532, + "flos": 34541605117440.0, + "grad_norm": 1.8451884643439012, + "language_loss": 0.61625177, + "learning_rate": 7.31693303878184e-07, + "loss": 0.69297278, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10198975, + "step": 12093, + "time_per_iteration": 2.6145272254943848 + }, + { + "auxiliary_loss_clip": 0.06407997, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06275733, + "balance_loss_mlp": 0.01261461, + "epoch": 0.7271306177664212, + "flos": 21514101972480.0, + "grad_norm": 1.4518547441748084, + "language_loss": 0.7566582, + "learning_rate": 7.313921933114644e-07, + "loss": 0.83345854, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10583496, + "step": 12094, + "time_per_iteration": 2.5348317623138428 + }, + { + "auxiliary_loss_clip": 0.06402551, + "auxiliary_loss_mlp": 0.01268346, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01258976, + "epoch": 0.7271907410190891, + "flos": 22278866987520.0, + "grad_norm": 1.9666023712862966, + "language_loss": 0.84875292, + "learning_rate": 7.310911308504808e-07, + "loss": 0.92546189, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09375, + "step": 12095, + "time_per_iteration": 2.4921047687530518 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06273626, + "balance_loss_mlp": 0.01256319, + "epoch": 0.7272508642717571, + "flos": 22899721415040.0, + "grad_norm": 1.6073112969743308, + "language_loss": 0.77431858, + "learning_rate": 7.307901165066479e-07, + "loss": 0.85107493, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10437012, + "step": 12096, + "time_per_iteration": 2.5228958129882812 + }, + { + "auxiliary_loss_clip": 0.06409237, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01254852, + "epoch": 0.727310987524425, + "flos": 11660667016320.0, + "grad_norm": 1.766744410162751, + "language_loss": 0.72485346, + "learning_rate": 7.30489150291381e-07, + "loss": 0.80159533, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10095215, + "step": 12097, + "time_per_iteration": 3.9472336769104004 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 0.06275506, + "balance_loss_mlp": 0.01253111, + "epoch": 0.727371110777093, + "flos": 24542247075840.0, + "grad_norm": 1.6914945832849257, + "language_loss": 0.76620024, + "learning_rate": 7.301882322160935e-07, + "loss": 0.84293687, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10400391, + "step": 12098, + "time_per_iteration": 2.5401840209960938 + }, + { + "auxiliary_loss_clip": 0.06412796, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06274997, + "balance_loss_mlp": 0.01256982, + "epoch": 0.7274312340297611, + "flos": 74755175690880.0, + "grad_norm": 1.647144818498915, + "language_loss": 0.67571467, + "learning_rate": 7.298873622921952e-07, + "loss": 0.75252008, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10766602, + "step": 12099, + "time_per_iteration": 2.933919668197632 + }, + { + "auxiliary_loss_clip": 0.06414318, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.0125731, + "epoch": 0.727491357282429, + "flos": 22348872673920.0, + "grad_norm": 1.593136067800256, + "language_loss": 0.72549355, + "learning_rate": 7.29586540531095e-07, + "loss": 0.80232537, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11560059, + "step": 12100, + "time_per_iteration": 2.485959053039551 + }, + { + "auxiliary_loss_clip": 0.06406155, + "auxiliary_loss_mlp": 0.01265862, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01256778, + "epoch": 0.727551480535097, + "flos": 23304730924800.0, + "grad_norm": 1.4119889543918884, + "language_loss": 0.75127757, + "learning_rate": 7.292857669442005e-07, + "loss": 0.82799774, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09088135, + "step": 12101, + "time_per_iteration": 2.610421895980835 + }, + { + "auxiliary_loss_clip": 0.06405263, + "auxiliary_loss_mlp": 0.01263956, + "balance_loss_clip": 0.06274393, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7276116037877649, + "flos": 21476981813760.0, + "grad_norm": 1.6630445155880014, + "language_loss": 0.82583451, + "learning_rate": 7.289850415429177e-07, + "loss": 0.90252674, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09295654, + "step": 12102, + "time_per_iteration": 2.5227344036102295 + }, + { + "auxiliary_loss_clip": 0.06406877, + "auxiliary_loss_mlp": 0.01266073, + "balance_loss_clip": 0.06273448, + "balance_loss_mlp": 0.012565, + "epoch": 0.7276717270404329, + "flos": 21469393019520.0, + "grad_norm": 2.031204621507473, + "language_loss": 0.81889427, + "learning_rate": 7.286843643386495e-07, + "loss": 0.89562374, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09570312, + "step": 12103, + "time_per_iteration": 2.4974191188812256 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01253818, + "epoch": 0.7277318502931008, + "flos": 16842928083840.0, + "grad_norm": 1.574176499871837, + "language_loss": 0.66993153, + "learning_rate": 7.283837353427968e-07, + "loss": 0.74667573, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10351562, + "step": 12104, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.06406664, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01257815, + "epoch": 0.7277919735457689, + "flos": 33408824970240.0, + "grad_norm": 1.70221768283368, + "language_loss": 0.65823901, + "learning_rate": 7.280831545667611e-07, + "loss": 0.73498631, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1026001, + "step": 12105, + "time_per_iteration": 2.6353166103363037 + }, + { + "auxiliary_loss_clip": 0.06408508, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06276379, + "balance_loss_mlp": 0.01257599, + "epoch": 0.7278520967984368, + "flos": 19212218133120.0, + "grad_norm": 2.1199426403905197, + "language_loss": 0.75508106, + "learning_rate": 7.27782622021939e-07, + "loss": 0.83183956, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09741211, + "step": 12106, + "time_per_iteration": 2.46575665473938 + }, + { + "auxiliary_loss_clip": 0.06411369, + "auxiliary_loss_mlp": 0.01266618, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01255228, + "epoch": 0.7279122200511048, + "flos": 34103206955520.0, + "grad_norm": 1.806710660650235, + "language_loss": 0.70616901, + "learning_rate": 7.274821377197273e-07, + "loss": 0.78294891, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11395264, + "step": 12107, + "time_per_iteration": 2.6280477046966553 + }, + { + "auxiliary_loss_clip": 0.06407417, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06274886, + "balance_loss_mlp": 0.01253885, + "epoch": 0.7279723433037727, + "flos": 54610913865600.0, + "grad_norm": 1.4427675680101948, + "language_loss": 0.75342691, + "learning_rate": 7.271817016715205e-07, + "loss": 0.83013523, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09515381, + "step": 12108, + "time_per_iteration": 4.324532985687256 + }, + { + "auxiliary_loss_clip": 0.0640891, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01255809, + "epoch": 0.7280324665564407, + "flos": 36146297859840.0, + "grad_norm": 1.5700716356881925, + "language_loss": 0.67018294, + "learning_rate": 7.268813138887124e-07, + "loss": 0.74693048, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 12109, + "time_per_iteration": 2.615412473678589 + }, + { + "auxiliary_loss_clip": 0.06406409, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01256218, + "epoch": 0.7280925898091086, + "flos": 11623169514240.0, + "grad_norm": 7.186110502128194, + "language_loss": 0.63434047, + "learning_rate": 7.265809743826912e-07, + "loss": 0.71108198, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.11517334, + "step": 12110, + "time_per_iteration": 2.4591712951660156 + }, + { + "auxiliary_loss_clip": 0.06409231, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01256184, + "epoch": 0.7281527130617766, + "flos": 34285663221120.0, + "grad_norm": 1.770442169865723, + "language_loss": 0.5852263, + "learning_rate": 7.26280683164847e-07, + "loss": 0.66198647, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10595703, + "step": 12111, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.06411764, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01254744, + "epoch": 0.7282128363144446, + "flos": 13923208563840.0, + "grad_norm": 2.24560382762785, + "language_loss": 0.74143445, + "learning_rate": 7.259804402465677e-07, + "loss": 0.81820381, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10430908, + "step": 12112, + "time_per_iteration": 3.927354335784912 + }, + { + "auxiliary_loss_clip": 0.0640655, + "auxiliary_loss_mlp": 0.01266322, + "balance_loss_clip": 0.06273867, + "balance_loss_mlp": 0.01256767, + "epoch": 0.7282729595671126, + "flos": 20783983420800.0, + "grad_norm": 2.386616636448106, + "language_loss": 0.66917908, + "learning_rate": 7.25680245639237e-07, + "loss": 0.74590778, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09552002, + "step": 12113, + "time_per_iteration": 2.501143455505371 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01264241, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254311, + "epoch": 0.7283330828197806, + "flos": 16330876583040.0, + "grad_norm": 1.6899344961685594, + "language_loss": 0.73054916, + "learning_rate": 7.253800993542399e-07, + "loss": 0.80725861, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0993042, + "step": 12114, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.06404929, + "auxiliary_loss_mlp": 0.01265418, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01255429, + "epoch": 0.7283932060724485, + "flos": 27497535454080.0, + "grad_norm": 1.7662061899425427, + "language_loss": 0.68715543, + "learning_rate": 7.250800014029564e-07, + "loss": 0.76385891, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09985352, + "step": 12115, + "time_per_iteration": 2.557182788848877 + }, + { + "auxiliary_loss_clip": 0.06409318, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7284533293251165, + "flos": 18373548216960.0, + "grad_norm": 1.8492705823258373, + "language_loss": 0.60310125, + "learning_rate": 7.247799517967674e-07, + "loss": 0.67984653, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10406494, + "step": 12116, + "time_per_iteration": 3.906881093978882 + }, + { + "auxiliary_loss_clip": 0.06408231, + "auxiliary_loss_mlp": 0.01266827, + "balance_loss_clip": 0.06275375, + "balance_loss_mlp": 0.01256766, + "epoch": 0.7285134525777844, + "flos": 21731917461120.0, + "grad_norm": 1.7320251042844839, + "language_loss": 0.72842097, + "learning_rate": 7.2447995054705e-07, + "loss": 0.80517155, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10058594, + "step": 12117, + "time_per_iteration": 2.522825002670288 + }, + { + "auxiliary_loss_clip": 0.06408626, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 0.06274951, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7285735758304525, + "flos": 20747743729920.0, + "grad_norm": 1.8305634695552309, + "language_loss": 0.69773346, + "learning_rate": 7.241799976651807e-07, + "loss": 0.77447206, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10565186, + "step": 12118, + "time_per_iteration": 2.48207426071167 + }, + { + "auxiliary_loss_clip": 0.06402861, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06275323, + "balance_loss_mlp": 0.01257714, + "epoch": 0.7286336990831204, + "flos": 17316643541760.0, + "grad_norm": 1.7593601335155638, + "language_loss": 0.84603906, + "learning_rate": 7.238800931625346e-07, + "loss": 0.92274088, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0960083, + "step": 12119, + "time_per_iteration": 2.6029109954833984 + }, + { + "auxiliary_loss_clip": 0.0640807, + "auxiliary_loss_mlp": 0.01265759, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01255454, + "epoch": 0.7286938223357884, + "flos": 19792724019840.0, + "grad_norm": 1.9939013522780928, + "language_loss": 0.82186806, + "learning_rate": 7.235802370504831e-07, + "loss": 0.89860642, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10308838, + "step": 12120, + "time_per_iteration": 2.4777402877807617 + }, + { + "auxiliary_loss_clip": 0.06409417, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06275336, + "balance_loss_mlp": 0.0125496, + "epoch": 0.7287539455884563, + "flos": 15346241654400.0, + "grad_norm": 1.8086433157736466, + "language_loss": 0.7907117, + "learning_rate": 7.232804293403963e-07, + "loss": 0.86745799, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 12121, + "time_per_iteration": 2.493319511413574 + }, + { + "auxiliary_loss_clip": 0.06409892, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.06270927, + "balance_loss_mlp": 0.01255693, + "epoch": 0.7288140688411243, + "flos": 25199592756480.0, + "grad_norm": 1.5783623622806526, + "language_loss": 0.69521451, + "learning_rate": 7.229806700436441e-07, + "loss": 0.77197587, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10559082, + "step": 12122, + "time_per_iteration": 2.524064064025879 + }, + { + "auxiliary_loss_clip": 0.06402311, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06270998, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7288741920937922, + "flos": 23990350158720.0, + "grad_norm": 1.7454149846167522, + "language_loss": 0.87436593, + "learning_rate": 7.226809591715923e-07, + "loss": 0.95102781, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09283447, + "step": 12123, + "time_per_iteration": 2.542051315307617 + }, + { + "auxiliary_loss_clip": 0.06402463, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.01255094, + "epoch": 0.7289343153464602, + "flos": 22751114999040.0, + "grad_norm": 1.6465558507133775, + "language_loss": 0.8315962, + "learning_rate": 7.223812967356065e-07, + "loss": 0.90827358, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10186768, + "step": 12124, + "time_per_iteration": 2.493330955505371 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01256173, + "epoch": 0.7289944385991282, + "flos": 24906991898880.0, + "grad_norm": 1.5973594077423074, + "language_loss": 0.66998374, + "learning_rate": 7.220816827470499e-07, + "loss": 0.74670422, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10113525, + "step": 12125, + "time_per_iteration": 2.5571157932281494 + }, + { + "auxiliary_loss_clip": 0.06410982, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255521, + "epoch": 0.7290545618517962, + "flos": 22973835951360.0, + "grad_norm": 1.7735347741305036, + "language_loss": 0.75574493, + "learning_rate": 7.217821172172855e-07, + "loss": 0.83252764, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11773682, + "step": 12126, + "time_per_iteration": 2.4986443519592285 + }, + { + "auxiliary_loss_clip": 0.0631386, + "auxiliary_loss_mlp": 0.01254001, + "balance_loss_clip": 0.06258902, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7291146851044642, + "flos": 61921602092160.0, + "grad_norm": 0.8043212871024376, + "language_loss": 0.58652955, + "learning_rate": 7.2148260015767e-07, + "loss": 0.66220808, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01060486, + "step": 12127, + "time_per_iteration": 3.065887689590454 + }, + { + "auxiliary_loss_clip": 0.06406868, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01259032, + "epoch": 0.7291748083571321, + "flos": 23337616452480.0, + "grad_norm": 2.002154348717822, + "language_loss": 0.68532437, + "learning_rate": 7.21183131579562e-07, + "loss": 0.76207435, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12128, + "time_per_iteration": 2.5636982917785645 + }, + { + "auxiliary_loss_clip": 0.06407112, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06272791, + "balance_loss_mlp": 0.01255493, + "epoch": 0.7292349316098001, + "flos": 28337588962560.0, + "grad_norm": 1.9770234243530824, + "language_loss": 0.65893352, + "learning_rate": 7.20883711494319e-07, + "loss": 0.73566437, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10479736, + "step": 12129, + "time_per_iteration": 2.5952858924865723 + }, + { + "auxiliary_loss_clip": 0.06401228, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06271209, + "balance_loss_mlp": 0.01255878, + "epoch": 0.729295054862468, + "flos": 24138788866560.0, + "grad_norm": 2.8834397381641206, + "language_loss": 0.74323857, + "learning_rate": 7.205843399132927e-07, + "loss": 0.81991053, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10076904, + "step": 12130, + "time_per_iteration": 2.5151498317718506 + }, + { + "auxiliary_loss_clip": 0.06408465, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273751, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7293551781151361, + "flos": 22822168861440.0, + "grad_norm": 1.7601185133573507, + "language_loss": 0.69902027, + "learning_rate": 7.202850168478374e-07, + "loss": 0.77576661, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 12131, + "time_per_iteration": 2.5700907707214355 + }, + { + "auxiliary_loss_clip": 0.06405198, + "auxiliary_loss_mlp": 0.0126315, + "balance_loss_clip": 0.06273468, + "balance_loss_mlp": 0.01253238, + "epoch": 0.729415301367804, + "flos": 22133111610240.0, + "grad_norm": 1.4321727616978588, + "language_loss": 0.77646959, + "learning_rate": 7.199857423093025e-07, + "loss": 0.85315311, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09912109, + "step": 12132, + "time_per_iteration": 2.5047810077667236 + }, + { + "auxiliary_loss_clip": 0.06406032, + "auxiliary_loss_mlp": 0.01268163, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01258382, + "epoch": 0.729475424620472, + "flos": 12354587804160.0, + "grad_norm": 2.26553261567321, + "language_loss": 0.79865611, + "learning_rate": 7.196865163090358e-07, + "loss": 0.87539804, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09790039, + "step": 12133, + "time_per_iteration": 2.5156800746917725 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01262377, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01252555, + "epoch": 0.7295355478731399, + "flos": 22201020944640.0, + "grad_norm": 2.1172065702021228, + "language_loss": 0.72792143, + "learning_rate": 7.193873388583846e-07, + "loss": 0.80460143, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09832764, + "step": 12134, + "time_per_iteration": 2.493656873703003 + }, + { + "auxiliary_loss_clip": 0.06407951, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7295956711258079, + "flos": 23228771598720.0, + "grad_norm": 1.8016892870366705, + "language_loss": 0.7149846, + "learning_rate": 7.190882099686939e-07, + "loss": 0.79172647, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10424805, + "step": 12135, + "time_per_iteration": 2.5029256343841553 + }, + { + "auxiliary_loss_clip": 0.06412001, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 0.06275906, + "balance_loss_mlp": 0.01256362, + "epoch": 0.7296557943784758, + "flos": 31877282442240.0, + "grad_norm": 2.0055855777259683, + "language_loss": 0.62525374, + "learning_rate": 7.187891296513075e-07, + "loss": 0.70203543, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0980835, + "step": 12136, + "time_per_iteration": 2.6325221061706543 + }, + { + "auxiliary_loss_clip": 0.06405275, + "auxiliary_loss_mlp": 0.01264655, + "balance_loss_clip": 0.06272214, + "balance_loss_mlp": 0.01255184, + "epoch": 0.7297159176311439, + "flos": 26659033246080.0, + "grad_norm": 1.794436841721563, + "language_loss": 0.7470715, + "learning_rate": 7.184900979175654e-07, + "loss": 0.82377088, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09472656, + "step": 12137, + "time_per_iteration": 3.958789825439453 + }, + { + "auxiliary_loss_clip": 0.06406206, + "auxiliary_loss_mlp": 0.0126361, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253466, + "epoch": 0.7297760408838118, + "flos": 24755744079360.0, + "grad_norm": 1.5243930727188364, + "language_loss": 0.74341732, + "learning_rate": 7.181911147788069e-07, + "loss": 0.82011551, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10162354, + "step": 12138, + "time_per_iteration": 2.5344252586364746 + }, + { + "auxiliary_loss_clip": 0.06401816, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01256434, + "epoch": 0.7298361641364798, + "flos": 18079018715520.0, + "grad_norm": 2.292743835188078, + "language_loss": 0.72074485, + "learning_rate": 7.178921802463702e-07, + "loss": 0.79742092, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09350586, + "step": 12139, + "time_per_iteration": 2.4686436653137207 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01255015, + "epoch": 0.7298962873891478, + "flos": 29902897486080.0, + "grad_norm": 1.4427366017316514, + "language_loss": 0.73659438, + "learning_rate": 7.175932943315898e-07, + "loss": 0.81325477, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09246826, + "step": 12140, + "time_per_iteration": 2.5841948986053467 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7299564106418157, + "flos": 32273613054720.0, + "grad_norm": 1.4465948977154814, + "language_loss": 0.55615419, + "learning_rate": 7.172944570458003e-07, + "loss": 0.63290644, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 12141, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.06406234, + "auxiliary_loss_mlp": 0.01263105, + "balance_loss_clip": 0.06276207, + "balance_loss_mlp": 0.01254277, + "epoch": 0.7300165338944837, + "flos": 22937009281920.0, + "grad_norm": 1.432470794912082, + "language_loss": 0.73197258, + "learning_rate": 7.169956684003342e-07, + "loss": 0.80866599, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0881958, + "step": 12142, + "time_per_iteration": 2.5505692958831787 + }, + { + "auxiliary_loss_clip": 0.0640648, + "auxiliary_loss_mlp": 0.01261695, + "balance_loss_clip": 0.06273788, + "balance_loss_mlp": 0.01252629, + "epoch": 0.7300766571471516, + "flos": 19834959277440.0, + "grad_norm": 1.6768515180809767, + "language_loss": 0.74087632, + "learning_rate": 7.16696928406521e-07, + "loss": 0.81755805, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09063721, + "step": 12143, + "time_per_iteration": 2.490084648132324 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01263891, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7301367803998197, + "flos": 24353879097600.0, + "grad_norm": 2.204410002817552, + "language_loss": 0.66878092, + "learning_rate": 7.163982370756882e-07, + "loss": 0.74551642, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 12144, + "time_per_iteration": 2.54231858253479 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.0126374, + "balance_loss_clip": 0.06274417, + "balance_loss_mlp": 0.01253232, + "epoch": 0.7301969036524876, + "flos": 15309918109440.0, + "grad_norm": 1.5759955689849319, + "language_loss": 0.79171866, + "learning_rate": 7.160995944191627e-07, + "loss": 0.86844301, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10510254, + "step": 12145, + "time_per_iteration": 2.479991912841797 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01266826, + "balance_loss_clip": 0.06275664, + "balance_loss_mlp": 0.01256819, + "epoch": 0.7302570269051556, + "flos": 23512945121280.0, + "grad_norm": 1.601000858309641, + "language_loss": 0.92001355, + "learning_rate": 7.158010004482702e-07, + "loss": 0.99674433, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10003662, + "step": 12146, + "time_per_iteration": 2.536653757095337 + }, + { + "auxiliary_loss_clip": 0.06406654, + "auxiliary_loss_mlp": 0.01262625, + "balance_loss_clip": 0.06276748, + "balance_loss_mlp": 0.01252885, + "epoch": 0.7303171501578235, + "flos": 20529508970880.0, + "grad_norm": 1.778676340204468, + "language_loss": 0.62199593, + "learning_rate": 7.155024551743316e-07, + "loss": 0.69868875, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.097229, + "step": 12147, + "time_per_iteration": 3.9292736053466797 + }, + { + "auxiliary_loss_clip": 0.06418571, + "auxiliary_loss_mlp": 0.01266018, + "balance_loss_clip": 0.06282554, + "balance_loss_mlp": 0.0125579, + "epoch": 0.7303772734104915, + "flos": 18338482483200.0, + "grad_norm": 1.749812940389672, + "language_loss": 0.75328469, + "learning_rate": 7.152039586086693e-07, + "loss": 0.83013058, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10229492, + "step": 12148, + "time_per_iteration": 2.466489791870117 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01255211, + "balance_loss_clip": 0.06256207, + "balance_loss_mlp": 0.01254079, + "epoch": 0.7304373966631594, + "flos": 60673604181120.0, + "grad_norm": 3.1920126472148245, + "language_loss": 0.56622815, + "learning_rate": 7.149055107626017e-07, + "loss": 0.64189649, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01133728, + "step": 12149, + "time_per_iteration": 3.1208536624908447 + }, + { + "auxiliary_loss_clip": 0.06409251, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273203, + "balance_loss_mlp": 0.01256108, + "epoch": 0.7304975199158275, + "flos": 19834120736640.0, + "grad_norm": 2.2110460738796847, + "language_loss": 0.74197543, + "learning_rate": 7.146071116474451e-07, + "loss": 0.8187288, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09979248, + "step": 12150, + "time_per_iteration": 2.563061475753784 + }, + { + "auxiliary_loss_clip": 0.06411943, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01257804, + "epoch": 0.7305576431684954, + "flos": 13228910432640.0, + "grad_norm": 2.0644493545304012, + "language_loss": 0.845092, + "learning_rate": 7.143087612745158e-07, + "loss": 0.92189169, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10229492, + "step": 12151, + "time_per_iteration": 3.9333503246307373 + }, + { + "auxiliary_loss_clip": 0.0641029, + "auxiliary_loss_mlp": 0.01268677, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01258395, + "epoch": 0.7306177664211634, + "flos": 24067231879680.0, + "grad_norm": 1.709088154989502, + "language_loss": 0.77853483, + "learning_rate": 7.14010459655127e-07, + "loss": 0.85532451, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10272217, + "step": 12152, + "time_per_iteration": 2.549255132675171 + }, + { + "auxiliary_loss_clip": 0.06408677, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06275931, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7306778896738314, + "flos": 27096425159040.0, + "grad_norm": 1.4467429234304112, + "language_loss": 0.79911304, + "learning_rate": 7.137122068005919e-07, + "loss": 0.87585741, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09875488, + "step": 12153, + "time_per_iteration": 2.584221839904785 + }, + { + "auxiliary_loss_clip": 0.06409719, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06271548, + "balance_loss_mlp": 0.01258473, + "epoch": 0.7307380129264993, + "flos": 16696250311680.0, + "grad_norm": 1.5292836861635837, + "language_loss": 0.67226088, + "learning_rate": 7.134140027222173e-07, + "loss": 0.74904257, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09967041, + "step": 12154, + "time_per_iteration": 2.482377052307129 + }, + { + "auxiliary_loss_clip": 0.06408456, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01255584, + "epoch": 0.7307981361791673, + "flos": 21732169023360.0, + "grad_norm": 1.735892015555871, + "language_loss": 0.66179639, + "learning_rate": 7.131158474313128e-07, + "loss": 0.73853588, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09912109, + "step": 12155, + "time_per_iteration": 3.920834541320801 + }, + { + "auxiliary_loss_clip": 0.06405047, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7308582594318352, + "flos": 18046468604160.0, + "grad_norm": 1.7732442430270934, + "language_loss": 0.82409012, + "learning_rate": 7.128177409391851e-07, + "loss": 0.90078008, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09527588, + "step": 12156, + "time_per_iteration": 2.498297691345215 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06272586, + "balance_loss_mlp": 0.01259304, + "epoch": 0.7309183826845033, + "flos": 13850100276480.0, + "grad_norm": 2.231479695583903, + "language_loss": 0.75512803, + "learning_rate": 7.125196832571367e-07, + "loss": 0.83185542, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09118652, + "step": 12157, + "time_per_iteration": 2.469118595123291 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.0126719, + "balance_loss_clip": 0.06274454, + "balance_loss_mlp": 0.0125816, + "epoch": 0.7309785059371712, + "flos": 17024881224960.0, + "grad_norm": 1.9988755435472185, + "language_loss": 0.73910487, + "learning_rate": 7.122216743964713e-07, + "loss": 0.81581926, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.090271, + "step": 12158, + "time_per_iteration": 2.498945713043213 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01263808, + "balance_loss_clip": 0.06278427, + "balance_loss_mlp": 0.0125417, + "epoch": 0.7310386291898392, + "flos": 26509127091840.0, + "grad_norm": 1.5605455050098358, + "language_loss": 0.85817492, + "learning_rate": 7.119237143684896e-07, + "loss": 0.93495244, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09637451, + "step": 12159, + "time_per_iteration": 2.5414113998413086 + }, + { + "auxiliary_loss_clip": 0.06415824, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.0627675, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7310987524425071, + "flos": 16951521375360.0, + "grad_norm": 1.9612355888194155, + "language_loss": 0.74199778, + "learning_rate": 7.116258031844895e-07, + "loss": 0.81882906, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.1114502, + "step": 12160, + "time_per_iteration": 2.598435163497925 + }, + { + "auxiliary_loss_clip": 0.06413984, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01256304, + "epoch": 0.7311588756951751, + "flos": 13850477619840.0, + "grad_norm": 2.3687706371159023, + "language_loss": 0.72816062, + "learning_rate": 7.113279408557675e-07, + "loss": 0.80496389, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10040283, + "step": 12161, + "time_per_iteration": 2.487931728363037 + }, + { + "auxiliary_loss_clip": 0.06419692, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06277676, + "balance_loss_mlp": 0.01254413, + "epoch": 0.731218998947843, + "flos": 28775567854080.0, + "grad_norm": 1.7390428804054665, + "language_loss": 0.69832623, + "learning_rate": 7.110301273936192e-07, + "loss": 0.77517438, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10705566, + "step": 12162, + "time_per_iteration": 2.578719139099121 + }, + { + "auxiliary_loss_clip": 0.06409628, + "auxiliary_loss_mlp": 0.01266805, + "balance_loss_clip": 0.0627304, + "balance_loss_mlp": 0.01256785, + "epoch": 0.7312791222005111, + "flos": 27096047815680.0, + "grad_norm": 1.6401378277284773, + "language_loss": 0.67019415, + "learning_rate": 7.107323628093382e-07, + "loss": 0.74695843, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 12163, + "time_per_iteration": 2.5393404960632324 + }, + { + "auxiliary_loss_clip": 0.06406513, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.0627192, + "balance_loss_mlp": 0.012566, + "epoch": 0.731339245453179, + "flos": 20930493484800.0, + "grad_norm": 1.6144773935767842, + "language_loss": 0.68972957, + "learning_rate": 7.104346471142153e-07, + "loss": 0.76646197, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10119629, + "step": 12164, + "time_per_iteration": 2.5153493881225586 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01263865, + "balance_loss_clip": 0.06274466, + "balance_loss_mlp": 0.01254262, + "epoch": 0.731399368705847, + "flos": 23082345388800.0, + "grad_norm": 1.4748874559419136, + "language_loss": 0.73714507, + "learning_rate": 7.101369803195391e-07, + "loss": 0.81382716, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0960083, + "step": 12165, + "time_per_iteration": 2.5240328311920166 + }, + { + "auxiliary_loss_clip": 0.06409434, + "auxiliary_loss_mlp": 0.01264974, + "balance_loss_clip": 0.06273365, + "balance_loss_mlp": 0.01254782, + "epoch": 0.731459491958515, + "flos": 23588778666240.0, + "grad_norm": 1.7494932066214843, + "language_loss": 0.76978707, + "learning_rate": 7.098393624365988e-07, + "loss": 0.84653127, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10192871, + "step": 12166, + "time_per_iteration": 2.535602569580078 + }, + { + "auxiliary_loss_clip": 0.06405294, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273952, + "balance_loss_mlp": 0.01254574, + "epoch": 0.7315196152111829, + "flos": 22385280072960.0, + "grad_norm": 1.6529519301050002, + "language_loss": 0.79870826, + "learning_rate": 7.095417934766781e-07, + "loss": 0.87540716, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10015869, + "step": 12167, + "time_per_iteration": 2.5016744136810303 + }, + { + "auxiliary_loss_clip": 0.06406464, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274685, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7315797384638509, + "flos": 26184227685120.0, + "grad_norm": 1.5786791569795495, + "language_loss": 0.77113497, + "learning_rate": 7.092442734510622e-07, + "loss": 0.84785974, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09863281, + "step": 12168, + "time_per_iteration": 2.550841808319092 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01264978, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254011, + "epoch": 0.7316398617165188, + "flos": 21512634526080.0, + "grad_norm": 1.4637772541157787, + "language_loss": 0.82124925, + "learning_rate": 7.089468023710326e-07, + "loss": 0.89801592, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10955811, + "step": 12169, + "time_per_iteration": 2.4971840381622314 + }, + { + "auxiliary_loss_clip": 0.06413089, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01259031, + "epoch": 0.7316999849691869, + "flos": 30490489042560.0, + "grad_norm": 1.5962469016193046, + "language_loss": 0.70136017, + "learning_rate": 7.08649380247871e-07, + "loss": 0.77818549, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10418701, + "step": 12170, + "time_per_iteration": 2.580601692199707 + }, + { + "auxiliary_loss_clip": 0.06408713, + "auxiliary_loss_mlp": 0.01268064, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256655, + "epoch": 0.7317601082218548, + "flos": 21550257809280.0, + "grad_norm": 1.8557087884597323, + "language_loss": 0.69686925, + "learning_rate": 7.083520070928533e-07, + "loss": 0.773637, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11413574, + "step": 12171, + "time_per_iteration": 2.483708143234253 + }, + { + "auxiliary_loss_clip": 0.06406379, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01258406, + "epoch": 0.7318202314745228, + "flos": 33259338086400.0, + "grad_norm": 1.4958611702028526, + "language_loss": 0.65253127, + "learning_rate": 7.080546829172564e-07, + "loss": 0.72928506, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10595703, + "step": 12172, + "time_per_iteration": 2.6077332496643066 + }, + { + "auxiliary_loss_clip": 0.06410083, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06274219, + "balance_loss_mlp": 0.0125547, + "epoch": 0.7318803547271907, + "flos": 20163254774400.0, + "grad_norm": 2.043922732836794, + "language_loss": 0.61819667, + "learning_rate": 7.077574077323564e-07, + "loss": 0.69495922, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12173, + "time_per_iteration": 2.4937400817871094 + }, + { + "auxiliary_loss_clip": 0.06411927, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.0627674, + "balance_loss_mlp": 0.01256395, + "epoch": 0.7319404779798587, + "flos": 20564826266880.0, + "grad_norm": 1.776213405218001, + "language_loss": 0.74138248, + "learning_rate": 7.074601815494243e-07, + "loss": 0.81816107, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09533691, + "step": 12174, + "time_per_iteration": 2.5296590328216553 + }, + { + "auxiliary_loss_clip": 0.06402949, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06272517, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7320006012325266, + "flos": 28703130399360.0, + "grad_norm": 1.6525649397268998, + "language_loss": 0.81230605, + "learning_rate": 7.071630043797317e-07, + "loss": 0.88897324, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09069824, + "step": 12175, + "time_per_iteration": 2.5799436569213867 + }, + { + "auxiliary_loss_clip": 0.06408073, + "auxiliary_loss_mlp": 0.01263853, + "balance_loss_clip": 0.06274186, + "balance_loss_mlp": 0.01253846, + "epoch": 0.7320607244851947, + "flos": 16368290231040.0, + "grad_norm": 1.8780371649414138, + "language_loss": 0.76478672, + "learning_rate": 7.068658762345488e-07, + "loss": 0.841506, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12176, + "time_per_iteration": 2.48456072807312 + }, + { + "auxiliary_loss_clip": 0.06404638, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257526, + "epoch": 0.7321208477378626, + "flos": 20960653754880.0, + "grad_norm": 1.8116961288906432, + "language_loss": 0.76882672, + "learning_rate": 7.065687971251399e-07, + "loss": 0.84554708, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09881592, + "step": 12177, + "time_per_iteration": 3.9612483978271484 + }, + { + "auxiliary_loss_clip": 0.06404608, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06272198, + "balance_loss_mlp": 0.01257183, + "epoch": 0.7321809709905306, + "flos": 13850226057600.0, + "grad_norm": 2.0192997733839855, + "language_loss": 0.74703526, + "learning_rate": 7.06271767062772e-07, + "loss": 0.82374752, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09442139, + "step": 12178, + "time_per_iteration": 2.451946973800659 + }, + { + "auxiliary_loss_clip": 0.06407191, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.062708, + "balance_loss_mlp": 0.01253617, + "epoch": 0.7322410942431986, + "flos": 26987286816000.0, + "grad_norm": 1.9092278699703453, + "language_loss": 0.82810688, + "learning_rate": 7.059747860587084e-07, + "loss": 0.90481937, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10449219, + "step": 12179, + "time_per_iteration": 2.5572235584259033 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.0125573, + "epoch": 0.7323012174958665, + "flos": 17645526017280.0, + "grad_norm": 1.5024024158805138, + "language_loss": 0.7521069, + "learning_rate": 7.056778541242115e-07, + "loss": 0.82877266, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09960938, + "step": 12180, + "time_per_iteration": 2.455678701400757 + }, + { + "auxiliary_loss_clip": 0.06411432, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06272306, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7323613407485345, + "flos": 32350914046080.0, + "grad_norm": 1.8054283665304076, + "language_loss": 0.79850274, + "learning_rate": 7.053809712705396e-07, + "loss": 0.87528759, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10552979, + "step": 12181, + "time_per_iteration": 2.595571756362915 + }, + { + "auxiliary_loss_clip": 0.06413537, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06274928, + "balance_loss_mlp": 0.01261625, + "epoch": 0.7324214640012024, + "flos": 18367594577280.0, + "grad_norm": 1.7248361460474335, + "language_loss": 0.72176909, + "learning_rate": 7.050841375089506e-07, + "loss": 0.79862905, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10852051, + "step": 12182, + "time_per_iteration": 2.4603164196014404 + }, + { + "auxiliary_loss_clip": 0.06412099, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06276859, + "balance_loss_mlp": 0.01257268, + "epoch": 0.7324815872538705, + "flos": 30820503548160.0, + "grad_norm": 1.5618517746342058, + "language_loss": 0.71680033, + "learning_rate": 7.047873528507015e-07, + "loss": 0.79359412, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10021973, + "step": 12183, + "time_per_iteration": 2.6027462482452393 + }, + { + "auxiliary_loss_clip": 0.0641363, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01258441, + "epoch": 0.7325417105065384, + "flos": 21511167079680.0, + "grad_norm": 1.8564082179513295, + "language_loss": 0.72663099, + "learning_rate": 7.04490617307045e-07, + "loss": 0.80346817, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11639404, + "step": 12184, + "time_per_iteration": 2.481126070022583 + }, + { + "auxiliary_loss_clip": 0.06312383, + "auxiliary_loss_mlp": 0.01252618, + "balance_loss_clip": 0.06257074, + "balance_loss_mlp": 0.01251615, + "epoch": 0.7326018337592064, + "flos": 67277514746880.0, + "grad_norm": 0.738407632839968, + "language_loss": 0.65071452, + "learning_rate": 7.041939308892344e-07, + "loss": 0.72636449, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01002502, + "step": 12185, + "time_per_iteration": 3.106149196624756 + }, + { + "auxiliary_loss_clip": 0.06409767, + "auxiliary_loss_mlp": 0.01263715, + "balance_loss_clip": 0.06272019, + "balance_loss_mlp": 0.01253278, + "epoch": 0.7326619570118743, + "flos": 22863733286400.0, + "grad_norm": 1.8830306075887209, + "language_loss": 0.8029325, + "learning_rate": 7.038972936085197e-07, + "loss": 0.87966728, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10443115, + "step": 12186, + "time_per_iteration": 3.9164252281188965 + }, + { + "auxiliary_loss_clip": 0.06409957, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.0125656, + "epoch": 0.7327220802645423, + "flos": 23333591456640.0, + "grad_norm": 3.1049708773187685, + "language_loss": 0.73623288, + "learning_rate": 7.036007054761508e-07, + "loss": 0.81300521, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10717773, + "step": 12187, + "time_per_iteration": 2.534468412399292 + }, + { + "auxiliary_loss_clip": 0.06412861, + "auxiliary_loss_mlp": 0.01267726, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01257462, + "epoch": 0.7327822035172102, + "flos": 23186578268160.0, + "grad_norm": 1.736323244132865, + "language_loss": 0.89323306, + "learning_rate": 7.033041665033716e-07, + "loss": 0.97003901, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10272217, + "step": 12188, + "time_per_iteration": 2.6024370193481445 + }, + { + "auxiliary_loss_clip": 0.06405529, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06268479, + "balance_loss_mlp": 0.01257449, + "epoch": 0.7328423267698783, + "flos": 21072517355520.0, + "grad_norm": 1.8789204802001953, + "language_loss": 0.75451827, + "learning_rate": 7.030076767014284e-07, + "loss": 0.83125293, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10479736, + "step": 12189, + "time_per_iteration": 2.4941177368164062 + }, + { + "auxiliary_loss_clip": 0.06409896, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257865, + "epoch": 0.7329024500225462, + "flos": 21696055113600.0, + "grad_norm": 1.5072102792760083, + "language_loss": 0.82332706, + "learning_rate": 7.027112360815648e-07, + "loss": 0.90010929, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10473633, + "step": 12190, + "time_per_iteration": 2.526470184326172 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01269765, + "balance_loss_clip": 0.06270757, + "balance_loss_mlp": 0.01258995, + "epoch": 0.7329625732752142, + "flos": 24169829604480.0, + "grad_norm": 1.85565696251354, + "language_loss": 0.72012609, + "learning_rate": 7.024148446550204e-07, + "loss": 0.79688656, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10766602, + "step": 12191, + "time_per_iteration": 3.952462673187256 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01267854, + "balance_loss_clip": 0.06271866, + "balance_loss_mlp": 0.01257793, + "epoch": 0.7330226965278822, + "flos": 30085227970560.0, + "grad_norm": 1.8630604521541774, + "language_loss": 0.69281983, + "learning_rate": 7.021185024330361e-07, + "loss": 0.76955318, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10058594, + "step": 12192, + "time_per_iteration": 2.569606065750122 + }, + { + "auxiliary_loss_clip": 0.06404717, + "auxiliary_loss_mlp": 0.01264705, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01254859, + "epoch": 0.7330828197805501, + "flos": 23375113954560.0, + "grad_norm": 2.149879925519752, + "language_loss": 0.73025858, + "learning_rate": 7.01822209426848e-07, + "loss": 0.80695283, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09844971, + "step": 12193, + "time_per_iteration": 2.5172417163848877 + }, + { + "auxiliary_loss_clip": 0.06408362, + "auxiliary_loss_mlp": 0.01270537, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01260207, + "epoch": 0.7331429430332181, + "flos": 21039170630400.0, + "grad_norm": 1.6561607292660703, + "language_loss": 0.77499682, + "learning_rate": 7.015259656476911e-07, + "loss": 0.85178578, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12194, + "time_per_iteration": 2.479529857635498 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01263406, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.0125285, + "epoch": 0.733203066285886, + "flos": 14653201334400.0, + "grad_norm": 1.6173563987107382, + "language_loss": 0.70813656, + "learning_rate": 7.012297711067998e-07, + "loss": 0.78482801, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10552979, + "step": 12195, + "time_per_iteration": 3.877392292022705 + }, + { + "auxiliary_loss_clip": 0.06408596, + "auxiliary_loss_mlp": 0.01263504, + "balance_loss_clip": 0.06272919, + "balance_loss_mlp": 0.01253991, + "epoch": 0.7332631895385541, + "flos": 17171013945600.0, + "grad_norm": 1.8915458632347482, + "language_loss": 0.72392344, + "learning_rate": 7.009336258154057e-07, + "loss": 0.80064452, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09515381, + "step": 12196, + "time_per_iteration": 2.475527286529541 + }, + { + "auxiliary_loss_clip": 0.0640474, + "auxiliary_loss_mlp": 0.01267096, + "balance_loss_clip": 0.06272123, + "balance_loss_mlp": 0.01256808, + "epoch": 0.733323312791222, + "flos": 28665758678400.0, + "grad_norm": 1.6827859274042947, + "language_loss": 0.7184931, + "learning_rate": 7.006375297847394e-07, + "loss": 0.79521143, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10290527, + "step": 12197, + "time_per_iteration": 2.535411834716797 + }, + { + "auxiliary_loss_clip": 0.06414885, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06273107, + "balance_loss_mlp": 0.0125918, + "epoch": 0.73338343604389, + "flos": 16624106346240.0, + "grad_norm": 1.8099581096795507, + "language_loss": 0.7810899, + "learning_rate": 7.003414830260282e-07, + "loss": 0.85794812, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11767578, + "step": 12198, + "time_per_iteration": 2.5611343383789062 + }, + { + "auxiliary_loss_clip": 0.06406511, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.0125661, + "epoch": 0.7334435592965579, + "flos": 21148434754560.0, + "grad_norm": 1.7977488720869146, + "language_loss": 0.74877429, + "learning_rate": 7.000454855504974e-07, + "loss": 0.82550371, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0982666, + "step": 12199, + "time_per_iteration": 2.549605369567871 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255984, + "epoch": 0.7335036825492259, + "flos": 17130455769600.0, + "grad_norm": 2.1057189118558655, + "language_loss": 0.76952875, + "learning_rate": 6.997495373693729e-07, + "loss": 0.84632576, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11175537, + "step": 12200, + "time_per_iteration": 2.4664149284362793 + }, + { + "auxiliary_loss_clip": 0.06406954, + "auxiliary_loss_mlp": 0.01269537, + "balance_loss_clip": 0.06272939, + "balance_loss_mlp": 0.01258874, + "epoch": 0.7335638058018938, + "flos": 23738475185280.0, + "grad_norm": 1.6692295634407006, + "language_loss": 0.61729515, + "learning_rate": 6.994536384938754e-07, + "loss": 0.69406003, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10662842, + "step": 12201, + "time_per_iteration": 2.5405964851379395 + }, + { + "auxiliary_loss_clip": 0.0640207, + "auxiliary_loss_mlp": 0.01264063, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7336239290545619, + "flos": 34941876871680.0, + "grad_norm": 1.7828880391385733, + "language_loss": 0.52268887, + "learning_rate": 6.991577889352264e-07, + "loss": 0.59935021, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09521484, + "step": 12202, + "time_per_iteration": 2.610280990600586 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270868, + "balance_loss_mlp": 0.01255082, + "epoch": 0.7336840523072298, + "flos": 21108966681600.0, + "grad_norm": 3.0029682825255706, + "language_loss": 0.686993, + "learning_rate": 6.98861988704645e-07, + "loss": 0.76368117, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09844971, + "step": 12203, + "time_per_iteration": 2.507932424545288 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.01270628, + "balance_loss_clip": 0.06272701, + "balance_loss_mlp": 0.01259959, + "epoch": 0.7337441755598978, + "flos": 24031243751040.0, + "grad_norm": 2.856553755482537, + "language_loss": 0.66825521, + "learning_rate": 6.985662378133474e-07, + "loss": 0.74510193, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10668945, + "step": 12204, + "time_per_iteration": 2.514671802520752 + }, + { + "auxiliary_loss_clip": 0.06406862, + "auxiliary_loss_mlp": 0.01263286, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7338042988125658, + "flos": 22717977909120.0, + "grad_norm": 1.8458208661726296, + "language_loss": 0.77401447, + "learning_rate": 6.982705362725479e-07, + "loss": 0.85071599, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09509277, + "step": 12205, + "time_per_iteration": 2.5407674312591553 + }, + { + "auxiliary_loss_clip": 0.06401809, + "auxiliary_loss_mlp": 0.01264175, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01255288, + "epoch": 0.7338644220652337, + "flos": 21367382273280.0, + "grad_norm": 2.465584123041792, + "language_loss": 0.80136371, + "learning_rate": 6.979748840934601e-07, + "loss": 0.87802351, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08892822, + "step": 12206, + "time_per_iteration": 2.505405902862549 + }, + { + "auxiliary_loss_clip": 0.06407475, + "auxiliary_loss_mlp": 0.01266198, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7339245453179017, + "flos": 30928216371840.0, + "grad_norm": 1.8649817824814656, + "language_loss": 0.71671152, + "learning_rate": 6.976792812872958e-07, + "loss": 0.79344821, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09747314, + "step": 12207, + "time_per_iteration": 2.5743727684020996 + }, + { + "auxiliary_loss_clip": 0.06311717, + "auxiliary_loss_mlp": 0.01252748, + "balance_loss_clip": 0.06256534, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7339846685705697, + "flos": 67916789873280.0, + "grad_norm": 0.7657187342696471, + "language_loss": 0.54859233, + "learning_rate": 6.97383727865263e-07, + "loss": 0.62423694, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.00988007, + "step": 12208, + "time_per_iteration": 3.215527057647705 + }, + { + "auxiliary_loss_clip": 0.06409256, + "auxiliary_loss_mlp": 0.01263774, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7340447918232377, + "flos": 22243298129280.0, + "grad_norm": 1.295062015849254, + "language_loss": 0.80369568, + "learning_rate": 6.970882238385703e-07, + "loss": 0.88042593, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.0980835, + "step": 12209, + "time_per_iteration": 2.604940414428711 + }, + { + "auxiliary_loss_clip": 0.06402272, + "auxiliary_loss_mlp": 0.01265832, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01256164, + "epoch": 0.7341049150759056, + "flos": 23770857588480.0, + "grad_norm": 1.3756281752304946, + "language_loss": 0.7923339, + "learning_rate": 6.96792769218423e-07, + "loss": 0.86901498, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09661865, + "step": 12210, + "time_per_iteration": 2.586808919906616 + }, + { + "auxiliary_loss_clip": 0.06405463, + "auxiliary_loss_mlp": 0.01263055, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253142, + "epoch": 0.7341650383285736, + "flos": 17241983953920.0, + "grad_norm": 1.587399394910607, + "language_loss": 0.76868075, + "learning_rate": 6.964973640160236e-07, + "loss": 0.84536588, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09918213, + "step": 12211, + "time_per_iteration": 2.5032119750976562 + }, + { + "auxiliary_loss_clip": 0.06406663, + "auxiliary_loss_mlp": 0.01269483, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01259464, + "epoch": 0.7342251615812415, + "flos": 23410640885760.0, + "grad_norm": 1.8683107617310235, + "language_loss": 0.7257871, + "learning_rate": 6.962020082425748e-07, + "loss": 0.80254853, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12212, + "time_per_iteration": 2.529822826385498 + }, + { + "auxiliary_loss_clip": 0.06408443, + "auxiliary_loss_mlp": 0.01264026, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7342852848339095, + "flos": 22753756402560.0, + "grad_norm": 1.4731208484223037, + "language_loss": 0.69065344, + "learning_rate": 6.959067019092766e-07, + "loss": 0.76737809, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 12213, + "time_per_iteration": 2.5050880908966064 + }, + { + "auxiliary_loss_clip": 0.06311147, + "auxiliary_loss_mlp": 0.01250993, + "balance_loss_clip": 0.06256209, + "balance_loss_mlp": 0.01250006, + "epoch": 0.7343454080865774, + "flos": 53960219856000.0, + "grad_norm": 0.6961582505379801, + "language_loss": 0.54205143, + "learning_rate": 6.956114450273276e-07, + "loss": 0.61767286, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00987244, + "step": 12214, + "time_per_iteration": 3.01758074760437 + }, + { + "auxiliary_loss_clip": 0.06412373, + "auxiliary_loss_mlp": 0.0126565, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01255058, + "epoch": 0.7344055313392455, + "flos": 12171754195200.0, + "grad_norm": 1.9351269551691648, + "language_loss": 0.70493495, + "learning_rate": 6.953162376079233e-07, + "loss": 0.78171515, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.105896, + "step": 12215, + "time_per_iteration": 2.450974941253662 + }, + { + "auxiliary_loss_clip": 0.06400481, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01256347, + "epoch": 0.7344656545919134, + "flos": 18555710993280.0, + "grad_norm": 1.5126294577685706, + "language_loss": 0.7330094, + "learning_rate": 6.950210796622573e-07, + "loss": 0.80967498, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09741211, + "step": 12216, + "time_per_iteration": 3.8361501693725586 + }, + { + "auxiliary_loss_clip": 0.06417778, + "auxiliary_loss_mlp": 0.01265589, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7345257778445814, + "flos": 23668762988160.0, + "grad_norm": 1.664988120098628, + "language_loss": 0.78114659, + "learning_rate": 6.947259712015236e-07, + "loss": 0.85798025, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11505127, + "step": 12217, + "time_per_iteration": 2.5286312103271484 + }, + { + "auxiliary_loss_clip": 0.06405286, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06273887, + "balance_loss_mlp": 0.01256056, + "epoch": 0.7345859010972494, + "flos": 13813818658560.0, + "grad_norm": 2.564959401036019, + "language_loss": 0.78167617, + "learning_rate": 6.94430912236911e-07, + "loss": 0.85838252, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09289551, + "step": 12218, + "time_per_iteration": 2.4696590900421143 + }, + { + "auxiliary_loss_clip": 0.06401719, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7346460243499173, + "flos": 22279202403840.0, + "grad_norm": 1.5944736181083394, + "language_loss": 0.72325158, + "learning_rate": 6.941359027796092e-07, + "loss": 0.79992205, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09777832, + "step": 12219, + "time_per_iteration": 2.5853631496429443 + }, + { + "auxiliary_loss_clip": 0.06402183, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01255531, + "epoch": 0.7347061476025853, + "flos": 23261447491200.0, + "grad_norm": 1.646626241048598, + "language_loss": 0.74960732, + "learning_rate": 6.938409428408061e-07, + "loss": 0.82627851, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09405518, + "step": 12220, + "time_per_iteration": 2.5074381828308105 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06272232, + "balance_loss_mlp": 0.01255384, + "epoch": 0.7347662708552533, + "flos": 15272881804800.0, + "grad_norm": 1.5752596580091636, + "language_loss": 0.65676045, + "learning_rate": 6.93546032431684e-07, + "loss": 0.73353267, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10742188, + "step": 12221, + "time_per_iteration": 2.4807536602020264 + }, + { + "auxiliary_loss_clip": 0.06407331, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01256809, + "epoch": 0.7348263941079213, + "flos": 24866349868800.0, + "grad_norm": 1.700720501906822, + "language_loss": 0.6957171, + "learning_rate": 6.932511715634273e-07, + "loss": 0.77245772, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 12222, + "time_per_iteration": 2.550657272338867 + }, + { + "auxiliary_loss_clip": 0.06405503, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06273381, + "balance_loss_mlp": 0.01257054, + "epoch": 0.7348865173605892, + "flos": 24358868415360.0, + "grad_norm": 1.4474540063064079, + "language_loss": 0.66394234, + "learning_rate": 6.92956360247217e-07, + "loss": 0.74065632, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08843994, + "step": 12223, + "time_per_iteration": 2.5699193477630615 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01254404, + "epoch": 0.7349466406132572, + "flos": 20009700967680.0, + "grad_norm": 2.3059227794211834, + "language_loss": 0.72692394, + "learning_rate": 6.926615984942332e-07, + "loss": 0.80362213, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09716797, + "step": 12224, + "time_per_iteration": 2.470388412475586 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01265671, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7350067638659251, + "flos": 29832766018560.0, + "grad_norm": 1.7299293804881801, + "language_loss": 0.72725701, + "learning_rate": 6.92366886315652e-07, + "loss": 0.80401695, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10900879, + "step": 12225, + "time_per_iteration": 2.596513509750366 + }, + { + "auxiliary_loss_clip": 0.06415132, + "auxiliary_loss_mlp": 0.0126347, + "balance_loss_clip": 0.06274726, + "balance_loss_mlp": 0.01252825, + "epoch": 0.7350668871185931, + "flos": 21871677271680.0, + "grad_norm": 1.7624309121462833, + "language_loss": 0.76816809, + "learning_rate": 6.920722237226501e-07, + "loss": 0.84495413, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10644531, + "step": 12226, + "time_per_iteration": 3.9786300659179688 + }, + { + "auxiliary_loss_clip": 0.06405763, + "auxiliary_loss_mlp": 0.01263929, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01254041, + "epoch": 0.735127010371261, + "flos": 22572893364480.0, + "grad_norm": 1.4073989113743075, + "language_loss": 0.67142195, + "learning_rate": 6.917776107264008e-07, + "loss": 0.74811888, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09893799, + "step": 12227, + "time_per_iteration": 2.5849621295928955 + }, + { + "auxiliary_loss_clip": 0.06410711, + "auxiliary_loss_mlp": 0.012626, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7351871336239291, + "flos": 25891333338240.0, + "grad_norm": 1.4691171153634894, + "language_loss": 0.63763392, + "learning_rate": 6.914830473380749e-07, + "loss": 0.71436703, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09680176, + "step": 12228, + "time_per_iteration": 2.535334587097168 + }, + { + "auxiliary_loss_clip": 0.06409031, + "auxiliary_loss_mlp": 0.01263285, + "balance_loss_clip": 0.06274029, + "balance_loss_mlp": 0.0125404, + "epoch": 0.735247256876597, + "flos": 17938126874880.0, + "grad_norm": 1.6163859960159983, + "language_loss": 0.6387676, + "learning_rate": 6.911885335688427e-07, + "loss": 0.7154907, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09246826, + "step": 12229, + "time_per_iteration": 2.5226519107818604 + }, + { + "auxiliary_loss_clip": 0.06409419, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06271814, + "balance_loss_mlp": 0.01253352, + "epoch": 0.735307380129265, + "flos": 28882484064000.0, + "grad_norm": 1.5503109559277863, + "language_loss": 0.734267, + "learning_rate": 6.908940694298726e-07, + "loss": 0.81100154, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10693359, + "step": 12230, + "time_per_iteration": 3.9754912853240967 + }, + { + "auxiliary_loss_clip": 0.06410781, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06275117, + "balance_loss_mlp": 0.01257177, + "epoch": 0.7353675033819329, + "flos": 13630691560320.0, + "grad_norm": 2.023268936424561, + "language_loss": 0.72356808, + "learning_rate": 6.90599654932332e-07, + "loss": 0.8003521, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10455322, + "step": 12231, + "time_per_iteration": 2.4864163398742676 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06275348, + "balance_loss_mlp": 0.01262003, + "epoch": 0.7354276266346009, + "flos": 19469040497280.0, + "grad_norm": 2.0034739477169965, + "language_loss": 0.64325827, + "learning_rate": 6.903052900873823e-07, + "loss": 0.72010976, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10906982, + "step": 12232, + "time_per_iteration": 2.5125675201416016 + }, + { + "auxiliary_loss_clip": 0.06407313, + "auxiliary_loss_mlp": 0.01267406, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01256922, + "epoch": 0.735487749887269, + "flos": 15776170554240.0, + "grad_norm": 1.8738456436799267, + "language_loss": 0.75562924, + "learning_rate": 6.900109749061874e-07, + "loss": 0.83237642, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10491943, + "step": 12233, + "time_per_iteration": 2.496495246887207 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7355478731399369, + "flos": 18266673934080.0, + "grad_norm": 1.8052457003626037, + "language_loss": 0.73313487, + "learning_rate": 6.897167093999079e-07, + "loss": 0.80989963, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10534668, + "step": 12234, + "time_per_iteration": 3.9552576541900635 + }, + { + "auxiliary_loss_clip": 0.064089, + "auxiliary_loss_mlp": 0.01265135, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01255104, + "epoch": 0.7356079963926049, + "flos": 26549307924480.0, + "grad_norm": 1.8318735304656244, + "language_loss": 0.59923625, + "learning_rate": 6.894224935797017e-07, + "loss": 0.67597657, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10028076, + "step": 12235, + "time_per_iteration": 2.536958932876587 + }, + { + "auxiliary_loss_clip": 0.06406462, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01255611, + "epoch": 0.7356681196452728, + "flos": 10782990224640.0, + "grad_norm": 2.1420111841430445, + "language_loss": 0.86364961, + "learning_rate": 6.891283274567259e-07, + "loss": 0.94037515, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10479736, + "step": 12236, + "time_per_iteration": 2.4920454025268555 + }, + { + "auxiliary_loss_clip": 0.0641176, + "auxiliary_loss_mlp": 0.01264567, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7357282428979408, + "flos": 19724730831360.0, + "grad_norm": 1.819458830371115, + "language_loss": 0.69971436, + "learning_rate": 6.888342110421364e-07, + "loss": 0.77647763, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1038208, + "step": 12237, + "time_per_iteration": 2.5083632469177246 + }, + { + "auxiliary_loss_clip": 0.0640821, + "auxiliary_loss_mlp": 0.01262709, + "balance_loss_clip": 0.06271386, + "balance_loss_mlp": 0.01252647, + "epoch": 0.7357883661506087, + "flos": 19470130600320.0, + "grad_norm": 1.6051120472726816, + "language_loss": 0.72315025, + "learning_rate": 6.885401443470839e-07, + "loss": 0.79985946, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10070801, + "step": 12238, + "time_per_iteration": 2.5418028831481934 + }, + { + "auxiliary_loss_clip": 0.06415435, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_clip": 0.06272001, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7358484894032767, + "flos": 27129897665280.0, + "grad_norm": 1.6224977172165573, + "language_loss": 0.73030883, + "learning_rate": 6.882461273827205e-07, + "loss": 0.8071416, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 12239, + "time_per_iteration": 2.57132887840271 + }, + { + "auxiliary_loss_clip": 0.06405096, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06275095, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7359086126559446, + "flos": 24509780818560.0, + "grad_norm": 1.236291832045993, + "language_loss": 0.79114598, + "learning_rate": 6.879521601601954e-07, + "loss": 0.8678351, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09033203, + "step": 12240, + "time_per_iteration": 2.574645757675171 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01256942, + "epoch": 0.7359687359086127, + "flos": 23337993795840.0, + "grad_norm": 1.821182153740144, + "language_loss": 0.83331031, + "learning_rate": 6.876582426906565e-07, + "loss": 0.91007674, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09454346, + "step": 12241, + "time_per_iteration": 2.5325047969818115 + }, + { + "auxiliary_loss_clip": 0.06407616, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252909, + "epoch": 0.7360288591612806, + "flos": 20199578319360.0, + "grad_norm": 1.8489352198230395, + "language_loss": 0.78972995, + "learning_rate": 6.873643749852484e-07, + "loss": 0.86643136, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09606934, + "step": 12242, + "time_per_iteration": 2.4817190170288086 + }, + { + "auxiliary_loss_clip": 0.06405145, + "auxiliary_loss_mlp": 0.01268429, + "balance_loss_clip": 0.06273502, + "balance_loss_mlp": 0.01258981, + "epoch": 0.7360889824139486, + "flos": 24979722842880.0, + "grad_norm": 1.7750845941868088, + "language_loss": 0.79797709, + "learning_rate": 6.870705570551145e-07, + "loss": 0.87471282, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09436035, + "step": 12243, + "time_per_iteration": 2.5396323204040527 + }, + { + "auxiliary_loss_clip": 0.06411023, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01253423, + "epoch": 0.7361491056666165, + "flos": 15017610741120.0, + "grad_norm": 2.051473837828663, + "language_loss": 0.74682987, + "learning_rate": 6.867767889113969e-07, + "loss": 0.82358325, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10888672, + "step": 12244, + "time_per_iteration": 2.468791961669922 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271558, + "balance_loss_mlp": 0.01256033, + "epoch": 0.7362092289192845, + "flos": 22937135063040.0, + "grad_norm": 1.5646917897943269, + "language_loss": 0.69797492, + "learning_rate": 6.864830705652347e-07, + "loss": 0.77473283, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10253906, + "step": 12245, + "time_per_iteration": 2.6041831970214844 + }, + { + "auxiliary_loss_clip": 0.06401391, + "auxiliary_loss_mlp": 0.01266236, + "balance_loss_clip": 0.06273212, + "balance_loss_mlp": 0.01255787, + "epoch": 0.7362693521719526, + "flos": 20708694927360.0, + "grad_norm": 1.4104590909640493, + "language_loss": 0.73381358, + "learning_rate": 6.861894020277658e-07, + "loss": 0.81048983, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.10449219, + "step": 12246, + "time_per_iteration": 2.5084409713745117 + }, + { + "auxiliary_loss_clip": 0.06402211, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01259833, + "epoch": 0.7363294754246205, + "flos": 13115747093760.0, + "grad_norm": 1.8401513132222869, + "language_loss": 0.73210883, + "learning_rate": 6.858957833101266e-07, + "loss": 0.80881691, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08758545, + "step": 12247, + "time_per_iteration": 2.5997636318206787 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01262591, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7363895986772885, + "flos": 14032598469120.0, + "grad_norm": 1.520275800225871, + "language_loss": 0.74474341, + "learning_rate": 6.856022144234526e-07, + "loss": 0.8214305, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09899902, + "step": 12248, + "time_per_iteration": 2.4908292293548584 + }, + { + "auxiliary_loss_clip": 0.06410165, + "auxiliary_loss_mlp": 0.01271268, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01261022, + "epoch": 0.7364497219299564, + "flos": 19726240204800.0, + "grad_norm": 1.8587136102784652, + "language_loss": 0.73065788, + "learning_rate": 6.853086953788727e-07, + "loss": 0.80747223, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1026001, + "step": 12249, + "time_per_iteration": 2.5477547645568848 + }, + { + "auxiliary_loss_clip": 0.06408364, + "auxiliary_loss_mlp": 0.01269722, + "balance_loss_clip": 0.06275103, + "balance_loss_mlp": 0.0125922, + "epoch": 0.7365098451826244, + "flos": 21367843470720.0, + "grad_norm": 1.7459434910305351, + "language_loss": 0.7680105, + "learning_rate": 6.850152261875189e-07, + "loss": 0.84479141, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.1050415, + "step": 12250, + "time_per_iteration": 2.50736665725708 + }, + { + "auxiliary_loss_clip": 0.06411077, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 0.0627429, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7365699684352923, + "flos": 23375030100480.0, + "grad_norm": 1.6059448981622937, + "language_loss": 0.71334994, + "learning_rate": 6.8472180686052e-07, + "loss": 0.79010946, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10839844, + "step": 12251, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0640523, + "auxiliary_loss_mlp": 0.01263198, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.01253584, + "epoch": 0.7366300916879603, + "flos": 59537610380160.0, + "grad_norm": 1.4529727777201047, + "language_loss": 0.66069037, + "learning_rate": 6.844284374090015e-07, + "loss": 0.73737466, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09606934, + "step": 12252, + "time_per_iteration": 2.884873628616333 + }, + { + "auxiliary_loss_clip": 0.06412438, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06274702, + "balance_loss_mlp": 0.01258488, + "epoch": 0.7366902149406283, + "flos": 20929445308800.0, + "grad_norm": 1.6593281267940243, + "language_loss": 0.79292876, + "learning_rate": 6.841351178440884e-07, + "loss": 0.86974359, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10559082, + "step": 12253, + "time_per_iteration": 2.56786847114563 + }, + { + "auxiliary_loss_clip": 0.06405851, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06274677, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7367503381932963, + "flos": 17353973335680.0, + "grad_norm": 1.9323805517919423, + "language_loss": 0.76607239, + "learning_rate": 6.83841848176905e-07, + "loss": 0.84275639, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08813477, + "step": 12254, + "time_per_iteration": 2.465092182159424 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.0125361, + "epoch": 0.7368104614459642, + "flos": 17827017960960.0, + "grad_norm": 3.2694109886339366, + "language_loss": 0.69397593, + "learning_rate": 6.835486284185692e-07, + "loss": 0.77070212, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10534668, + "step": 12255, + "time_per_iteration": 2.5002591609954834 + }, + { + "auxiliary_loss_clip": 0.06412044, + "auxiliary_loss_mlp": 0.01265607, + "balance_loss_clip": 0.06276523, + "balance_loss_mlp": 0.01255117, + "epoch": 0.7368705846986322, + "flos": 24612672032640.0, + "grad_norm": 1.5801315841847023, + "language_loss": 0.75219184, + "learning_rate": 6.832554585802012e-07, + "loss": 0.82896841, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 12256, + "time_per_iteration": 4.017148494720459 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.0126377, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7369307079513001, + "flos": 34978829322240.0, + "grad_norm": 1.5326155216287436, + "language_loss": 0.74032342, + "learning_rate": 6.829623386729182e-07, + "loss": 0.81704414, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09729004, + "step": 12257, + "time_per_iteration": 2.647477388381958 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01263484, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7369908312039681, + "flos": 21220872209280.0, + "grad_norm": 1.4761434387135868, + "language_loss": 0.78534251, + "learning_rate": 6.826692687078362e-07, + "loss": 0.86204708, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09521484, + "step": 12258, + "time_per_iteration": 2.572261333465576 + }, + { + "auxiliary_loss_clip": 0.06412143, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06274798, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7370509544566362, + "flos": 23630510799360.0, + "grad_norm": 1.4160381635671, + "language_loss": 0.66616917, + "learning_rate": 6.823762486960674e-07, + "loss": 0.74294007, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09759521, + "step": 12259, + "time_per_iteration": 2.507096290588379 + }, + { + "auxiliary_loss_clip": 0.06408918, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.06274989, + "balance_loss_mlp": 0.01254406, + "epoch": 0.7371110777093041, + "flos": 24834764079360.0, + "grad_norm": 1.6356397611324185, + "language_loss": 0.73572636, + "learning_rate": 6.820832786487225e-07, + "loss": 0.81246388, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 12260, + "time_per_iteration": 2.55729341506958 + }, + { + "auxiliary_loss_clip": 0.06410116, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274181, + "balance_loss_mlp": 0.01257217, + "epoch": 0.7371712009619721, + "flos": 23156292216960.0, + "grad_norm": 1.5911507549060615, + "language_loss": 0.7366817, + "learning_rate": 6.817903585769125e-07, + "loss": 0.81346196, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12261, + "time_per_iteration": 2.4976613521575928 + }, + { + "auxiliary_loss_clip": 0.06411919, + "auxiliary_loss_mlp": 0.01266277, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01254845, + "epoch": 0.73723132421464, + "flos": 23119675182720.0, + "grad_norm": 1.9595701183137586, + "language_loss": 0.67333376, + "learning_rate": 6.814974884917438e-07, + "loss": 0.75011569, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11425781, + "step": 12262, + "time_per_iteration": 2.5359151363372803 + }, + { + "auxiliary_loss_clip": 0.06410287, + "auxiliary_loss_mlp": 0.01266365, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01255881, + "epoch": 0.737291447467308, + "flos": 19278031115520.0, + "grad_norm": 1.8055684860594015, + "language_loss": 0.8872509, + "learning_rate": 6.81204668404322e-07, + "loss": 0.96401745, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10479736, + "step": 12263, + "time_per_iteration": 2.4645025730133057 + }, + { + "auxiliary_loss_clip": 0.06401009, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01253717, + "epoch": 0.7373515707199759, + "flos": 25125142803840.0, + "grad_norm": 1.5128594481302715, + "language_loss": 0.67552602, + "learning_rate": 6.809118983257522e-07, + "loss": 0.75216436, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09100342, + "step": 12264, + "time_per_iteration": 2.569833517074585 + }, + { + "auxiliary_loss_clip": 0.06405195, + "auxiliary_loss_mlp": 0.012641, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.0125442, + "epoch": 0.737411693972644, + "flos": 32415427290240.0, + "grad_norm": 1.6707890497545697, + "language_loss": 0.80282211, + "learning_rate": 6.806191782671356e-07, + "loss": 0.87951505, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09674072, + "step": 12265, + "time_per_iteration": 3.997997283935547 + }, + { + "auxiliary_loss_clip": 0.06415318, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06273959, + "balance_loss_mlp": 0.01253758, + "epoch": 0.7374718172253119, + "flos": 24322586797440.0, + "grad_norm": 1.6052844739789887, + "language_loss": 0.75045347, + "learning_rate": 6.803265082395711e-07, + "loss": 0.82725346, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10919189, + "step": 12266, + "time_per_iteration": 2.5624334812164307 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01267186, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.0125697, + "epoch": 0.7375319404779799, + "flos": 27162447776640.0, + "grad_norm": 1.557791078804126, + "language_loss": 0.73471284, + "learning_rate": 6.800338882541576e-07, + "loss": 0.81146955, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10217285, + "step": 12267, + "time_per_iteration": 2.561325788497925 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126606, + "balance_loss_clip": 0.06273606, + "balance_loss_mlp": 0.01256654, + "epoch": 0.7375920637306478, + "flos": 18885977061120.0, + "grad_norm": 1.9471728084971924, + "language_loss": 0.83236742, + "learning_rate": 6.797413183219923e-07, + "loss": 0.90911472, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09411621, + "step": 12268, + "time_per_iteration": 2.515185832977295 + }, + { + "auxiliary_loss_clip": 0.06403858, + "auxiliary_loss_mlp": 0.01268762, + "balance_loss_clip": 0.06272093, + "balance_loss_mlp": 0.01258641, + "epoch": 0.7376521869833158, + "flos": 15675291838080.0, + "grad_norm": 1.7639029349548874, + "language_loss": 0.73450869, + "learning_rate": 6.794487984541677e-07, + "loss": 0.81123489, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10113525, + "step": 12269, + "time_per_iteration": 3.9070801734924316 + }, + { + "auxiliary_loss_clip": 0.06414587, + "auxiliary_loss_mlp": 0.01264636, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01253186, + "epoch": 0.7377123102359837, + "flos": 36980146166400.0, + "grad_norm": 1.919355815322485, + "language_loss": 0.70780635, + "learning_rate": 6.791563286617776e-07, + "loss": 0.78459859, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11450195, + "step": 12270, + "time_per_iteration": 2.6150050163269043 + }, + { + "auxiliary_loss_clip": 0.06405621, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.06273162, + "balance_loss_mlp": 0.01257514, + "epoch": 0.7377724334886517, + "flos": 24502779002880.0, + "grad_norm": 1.650003260672948, + "language_loss": 0.69519281, + "learning_rate": 6.788639089559119e-07, + "loss": 0.77191985, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09564209, + "step": 12271, + "time_per_iteration": 2.545802593231201 + }, + { + "auxiliary_loss_clip": 0.06407182, + "auxiliary_loss_mlp": 0.01265449, + "balance_loss_clip": 0.06271105, + "balance_loss_mlp": 0.01254565, + "epoch": 0.7378325567413198, + "flos": 24397036750080.0, + "grad_norm": 2.0373077116973577, + "language_loss": 0.67736673, + "learning_rate": 6.785715393476586e-07, + "loss": 0.75409299, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10882568, + "step": 12272, + "time_per_iteration": 2.5161080360412598 + }, + { + "auxiliary_loss_clip": 0.064047, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06272876, + "balance_loss_mlp": 0.01255812, + "epoch": 0.7378926799939877, + "flos": 17421421472640.0, + "grad_norm": 1.6693820905355277, + "language_loss": 0.78472829, + "learning_rate": 6.782792198481049e-07, + "loss": 0.86143827, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10479736, + "step": 12273, + "time_per_iteration": 2.527449369430542 + }, + { + "auxiliary_loss_clip": 0.0640404, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01255111, + "epoch": 0.7379528032466557, + "flos": 18479374323840.0, + "grad_norm": 1.7204820046502844, + "language_loss": 0.83983135, + "learning_rate": 6.779869504683355e-07, + "loss": 0.91652346, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1005249, + "step": 12274, + "time_per_iteration": 3.8728952407836914 + }, + { + "auxiliary_loss_clip": 0.06420162, + "auxiliary_loss_mlp": 0.0126937, + "balance_loss_clip": 0.06277606, + "balance_loss_mlp": 0.01258414, + "epoch": 0.7380129264993236, + "flos": 17827814574720.0, + "grad_norm": 1.7616073867402775, + "language_loss": 0.7422626, + "learning_rate": 6.776947312194341e-07, + "loss": 0.81915796, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10961914, + "step": 12275, + "time_per_iteration": 2.528137445449829 + }, + { + "auxiliary_loss_clip": 0.06413853, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01259352, + "epoch": 0.7380730497519916, + "flos": 23003115753600.0, + "grad_norm": 1.6499843647208283, + "language_loss": 0.73819113, + "learning_rate": 6.774025621124813e-07, + "loss": 0.81503022, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10699463, + "step": 12276, + "time_per_iteration": 2.49808931350708 + }, + { + "auxiliary_loss_clip": 0.06408275, + "auxiliary_loss_mlp": 0.01262969, + "balance_loss_clip": 0.062733, + "balance_loss_mlp": 0.0125329, + "epoch": 0.7381331730046595, + "flos": 20272435044480.0, + "grad_norm": 1.938538877021236, + "language_loss": 0.77922094, + "learning_rate": 6.771104431585551e-07, + "loss": 0.85593343, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09680176, + "step": 12277, + "time_per_iteration": 2.5433340072631836 + }, + { + "auxiliary_loss_clip": 0.06408259, + "auxiliary_loss_mlp": 0.01270849, + "balance_loss_clip": 0.06276105, + "balance_loss_mlp": 0.01260495, + "epoch": 0.7381932962573275, + "flos": 19760467397760.0, + "grad_norm": 1.5941630218798921, + "language_loss": 0.79001057, + "learning_rate": 6.768183743687338e-07, + "loss": 0.86680162, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10351562, + "step": 12278, + "time_per_iteration": 2.5074949264526367 + }, + { + "auxiliary_loss_clip": 0.06409795, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06271898, + "balance_loss_mlp": 0.01254248, + "epoch": 0.7382534195099955, + "flos": 17310060996480.0, + "grad_norm": 3.5373334504988474, + "language_loss": 0.71857256, + "learning_rate": 6.765263557540921e-07, + "loss": 0.79531866, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10577393, + "step": 12279, + "time_per_iteration": 2.516350269317627 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01266626, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7383135427626635, + "flos": 18703269233280.0, + "grad_norm": 2.101190205716009, + "language_loss": 0.85982198, + "learning_rate": 6.762343873257034e-07, + "loss": 0.93659103, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10388184, + "step": 12280, + "time_per_iteration": 2.4823272228240967 + }, + { + "auxiliary_loss_clip": 0.06411093, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7383736660153314, + "flos": 20886706926720.0, + "grad_norm": 1.8639643742325518, + "language_loss": 0.72394395, + "learning_rate": 6.759424690946408e-07, + "loss": 0.80069995, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11016846, + "step": 12281, + "time_per_iteration": 2.5224528312683105 + }, + { + "auxiliary_loss_clip": 0.06412193, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01255821, + "epoch": 0.7384337892679994, + "flos": 20668723729920.0, + "grad_norm": 1.7354362664323408, + "language_loss": 0.61005342, + "learning_rate": 6.756506010719711e-07, + "loss": 0.68683791, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10449219, + "step": 12282, + "time_per_iteration": 2.5047874450683594 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01256945, + "epoch": 0.7384939125206673, + "flos": 29177432835840.0, + "grad_norm": 1.7016014462601576, + "language_loss": 0.6800909, + "learning_rate": 6.753587832687632e-07, + "loss": 0.75690794, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10632324, + "step": 12283, + "time_per_iteration": 2.5679969787597656 + }, + { + "auxiliary_loss_clip": 0.06408164, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06274041, + "balance_loss_mlp": 0.01256636, + "epoch": 0.7385540357733353, + "flos": 36320494498560.0, + "grad_norm": 1.58111004650423, + "language_loss": 0.76160252, + "learning_rate": 6.750670156960832e-07, + "loss": 0.83835149, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10095215, + "step": 12284, + "time_per_iteration": 2.6471667289733887 + }, + { + "auxiliary_loss_clip": 0.06415117, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.01255028, + "epoch": 0.7386141590260034, + "flos": 20308758589440.0, + "grad_norm": 2.367235737464537, + "language_loss": 0.69446218, + "learning_rate": 6.747752983649954e-07, + "loss": 0.77127063, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10705566, + "step": 12285, + "time_per_iteration": 2.473684549331665 + }, + { + "auxiliary_loss_clip": 0.06417808, + "auxiliary_loss_mlp": 0.01266655, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.0125499, + "epoch": 0.7386742822786713, + "flos": 25490851948800.0, + "grad_norm": 1.8974918118522153, + "language_loss": 0.80231923, + "learning_rate": 6.744836312865602e-07, + "loss": 0.87916386, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11669922, + "step": 12286, + "time_per_iteration": 2.552478313446045 + }, + { + "auxiliary_loss_clip": 0.06409865, + "auxiliary_loss_mlp": 0.01264773, + "balance_loss_clip": 0.06276139, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7387344055313393, + "flos": 13777075843200.0, + "grad_norm": 2.0836319453796452, + "language_loss": 0.65815514, + "learning_rate": 6.741920144718396e-07, + "loss": 0.73490155, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09881592, + "step": 12287, + "time_per_iteration": 2.47298264503479 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01265177, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255557, + "epoch": 0.7387945287840072, + "flos": 27862615693440.0, + "grad_norm": 1.674403553414071, + "language_loss": 0.76529717, + "learning_rate": 6.739004479318903e-07, + "loss": 0.84198946, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09619141, + "step": 12288, + "time_per_iteration": 2.5699422359466553 + }, + { + "auxiliary_loss_clip": 0.06413888, + "auxiliary_loss_mlp": 0.0126915, + "balance_loss_clip": 0.06274378, + "balance_loss_mlp": 0.01257689, + "epoch": 0.7388546520366752, + "flos": 44242492515840.0, + "grad_norm": 1.8421640794180243, + "language_loss": 0.58466721, + "learning_rate": 6.736089316777684e-07, + "loss": 0.66149765, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11462402, + "step": 12289, + "time_per_iteration": 2.691962242126465 + }, + { + "auxiliary_loss_clip": 0.06318665, + "auxiliary_loss_mlp": 0.01255253, + "balance_loss_clip": 0.06263465, + "balance_loss_mlp": 0.01254091, + "epoch": 0.7389147752893431, + "flos": 70700145672960.0, + "grad_norm": 0.6181631309216685, + "language_loss": 0.49242556, + "learning_rate": 6.733174657205287e-07, + "loss": 0.56816471, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.01159668, + "step": 12290, + "time_per_iteration": 3.2382025718688965 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01269006, + "balance_loss_clip": 0.0627414, + "balance_loss_mlp": 0.01256811, + "epoch": 0.7389748985420111, + "flos": 26002190689920.0, + "grad_norm": 1.6462515447687802, + "language_loss": 0.67644894, + "learning_rate": 6.730260500712237e-07, + "loss": 0.75324321, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.12182617, + "step": 12291, + "time_per_iteration": 2.5330934524536133 + }, + { + "auxiliary_loss_clip": 0.06323051, + "auxiliary_loss_mlp": 0.01253715, + "balance_loss_clip": 0.06267922, + "balance_loss_mlp": 0.01252465, + "epoch": 0.7390350217946791, + "flos": 54419428558080.0, + "grad_norm": 0.9538265155410941, + "language_loss": 0.60977232, + "learning_rate": 6.727346847409052e-07, + "loss": 0.68553996, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01249695, + "step": 12292, + "time_per_iteration": 2.809068202972412 + }, + { + "auxiliary_loss_clip": 0.06409512, + "auxiliary_loss_mlp": 0.01265193, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255388, + "epoch": 0.7390951450473471, + "flos": 32205116741760.0, + "grad_norm": 2.042192821638958, + "language_loss": 0.67519832, + "learning_rate": 6.724433697406191e-07, + "loss": 0.75194532, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09814453, + "step": 12293, + "time_per_iteration": 2.633490800857544 + }, + { + "auxiliary_loss_clip": 0.06407283, + "auxiliary_loss_mlp": 0.01264321, + "balance_loss_clip": 0.06273873, + "balance_loss_mlp": 0.01253682, + "epoch": 0.739155268300015, + "flos": 16688745371520.0, + "grad_norm": 1.7465858872032636, + "language_loss": 0.84024155, + "learning_rate": 6.721521050814134e-07, + "loss": 0.91695762, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10644531, + "step": 12294, + "time_per_iteration": 2.4902942180633545 + }, + { + "auxiliary_loss_clip": 0.064035, + "auxiliary_loss_mlp": 0.01264966, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254976, + "epoch": 0.739215391552683, + "flos": 31657831799040.0, + "grad_norm": 1.4686013728036598, + "language_loss": 0.72988927, + "learning_rate": 6.718608907743337e-07, + "loss": 0.80657387, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09985352, + "step": 12295, + "time_per_iteration": 4.01623272895813 + }, + { + "auxiliary_loss_clip": 0.06404971, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01254906, + "epoch": 0.7392755148053509, + "flos": 29726688349440.0, + "grad_norm": 1.6462168088608014, + "language_loss": 0.78829199, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8649857, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09490967, + "step": 12296, + "time_per_iteration": 2.6365103721618652 + }, + { + "auxiliary_loss_clip": 0.06404981, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.0627135, + "balance_loss_mlp": 0.01256008, + "epoch": 0.7393356380580189, + "flos": 37059585436800.0, + "grad_norm": 1.8865876945980686, + "language_loss": 0.67489415, + "learning_rate": 6.712786132607182e-07, + "loss": 0.75161421, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.11010742, + "step": 12297, + "time_per_iteration": 2.6924734115600586 + }, + { + "auxiliary_loss_clip": 0.06407569, + "auxiliary_loss_mlp": 0.01264759, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01254447, + "epoch": 0.739395761310687, + "flos": 19725820934400.0, + "grad_norm": 1.5263040230444953, + "language_loss": 0.68836749, + "learning_rate": 6.709875500762645e-07, + "loss": 0.7650907, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10308838, + "step": 12298, + "time_per_iteration": 2.501797914505005 + }, + { + "auxiliary_loss_clip": 0.06407927, + "auxiliary_loss_mlp": 0.01267097, + "balance_loss_clip": 0.06273854, + "balance_loss_mlp": 0.01256559, + "epoch": 0.7394558845633549, + "flos": 11806254685440.0, + "grad_norm": 2.783354408484115, + "language_loss": 0.74698675, + "learning_rate": 6.706965372880946e-07, + "loss": 0.82373697, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10534668, + "step": 12299, + "time_per_iteration": 2.479194164276123 + }, + { + "auxiliary_loss_clip": 0.06317861, + "auxiliary_loss_mlp": 0.01251014, + "balance_loss_clip": 0.06262733, + "balance_loss_mlp": 0.01249821, + "epoch": 0.7395160078160229, + "flos": 66214782213120.0, + "grad_norm": 0.7124865082748734, + "language_loss": 0.60634726, + "learning_rate": 6.704055749072455e-07, + "loss": 0.68203598, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01190948, + "step": 12300, + "time_per_iteration": 3.154963493347168 + }, + { + "auxiliary_loss_clip": 0.06409278, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7395761310686908, + "flos": 21255770234880.0, + "grad_norm": 1.6643476346606387, + "language_loss": 0.80243456, + "learning_rate": 6.7011466294475e-07, + "loss": 0.87917793, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1060791, + "step": 12301, + "time_per_iteration": 2.529728889465332 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01264915, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254508, + "epoch": 0.7396362543213588, + "flos": 25961967930240.0, + "grad_norm": 1.3607409082618038, + "language_loss": 0.72955477, + "learning_rate": 6.698238014116406e-07, + "loss": 0.80628592, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10406494, + "step": 12302, + "time_per_iteration": 2.546940326690674 + }, + { + "auxiliary_loss_clip": 0.06409822, + "auxiliary_loss_mlp": 0.01265837, + "balance_loss_clip": 0.06272913, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7396963775740267, + "flos": 27384791385600.0, + "grad_norm": 1.8966052271775322, + "language_loss": 0.74529129, + "learning_rate": 6.695329903189451e-07, + "loss": 0.82204789, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1027832, + "step": 12303, + "time_per_iteration": 2.5615267753601074 + }, + { + "auxiliary_loss_clip": 0.06403703, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06271822, + "balance_loss_mlp": 0.01255546, + "epoch": 0.7397565008266948, + "flos": 25527175493760.0, + "grad_norm": 1.6634023085525402, + "language_loss": 0.54497898, + "learning_rate": 6.692422296776927e-07, + "loss": 0.62166452, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09307861, + "step": 12304, + "time_per_iteration": 2.5219099521636963 + }, + { + "auxiliary_loss_clip": 0.06408396, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06273419, + "balance_loss_mlp": 0.01253808, + "epoch": 0.7398166240793627, + "flos": 23733737429760.0, + "grad_norm": 6.743550792885306, + "language_loss": 0.84620976, + "learning_rate": 6.689515194989084e-07, + "loss": 0.92293161, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09979248, + "step": 12305, + "time_per_iteration": 3.947659969329834 + }, + { + "auxiliary_loss_clip": 0.06311572, + "auxiliary_loss_mlp": 0.01252487, + "balance_loss_clip": 0.06256508, + "balance_loss_mlp": 0.01251203, + "epoch": 0.7398767473320307, + "flos": 67289002755840.0, + "grad_norm": 0.8626934880407965, + "language_loss": 0.57769525, + "learning_rate": 6.68660859793615e-07, + "loss": 0.65333581, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.0128479, + "step": 12306, + "time_per_iteration": 3.1756792068481445 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01263791, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.0125327, + "epoch": 0.7399368705846986, + "flos": 22025356859520.0, + "grad_norm": 1.7963583951725388, + "language_loss": 0.81658536, + "learning_rate": 6.683702505728355e-07, + "loss": 0.89333415, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10516357, + "step": 12307, + "time_per_iteration": 2.506915330886841 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06274117, + "balance_loss_mlp": 0.01255696, + "epoch": 0.7399969938373666, + "flos": 14179150460160.0, + "grad_norm": 1.6050625884123768, + "language_loss": 0.70237017, + "learning_rate": 6.680796918475893e-07, + "loss": 0.77905583, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09051514, + "step": 12308, + "time_per_iteration": 3.91337513923645 + }, + { + "auxiliary_loss_clip": 0.06401709, + "auxiliary_loss_mlp": 0.01262204, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01252459, + "epoch": 0.7400571170900345, + "flos": 25308521464320.0, + "grad_norm": 1.6982405979686375, + "language_loss": 0.81117153, + "learning_rate": 6.67789183628896e-07, + "loss": 0.88781071, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09747314, + "step": 12309, + "time_per_iteration": 2.5796985626220703 + }, + { + "auxiliary_loss_clip": 0.06409381, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01258534, + "epoch": 0.7401172403427025, + "flos": 22718019836160.0, + "grad_norm": 5.238582270491251, + "language_loss": 0.73371196, + "learning_rate": 6.674987259277692e-07, + "loss": 0.81049991, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10882568, + "step": 12310, + "time_per_iteration": 2.5165646076202393 + }, + { + "auxiliary_loss_clip": 0.06409644, + "auxiliary_loss_mlp": 0.01269084, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01257669, + "epoch": 0.7401773635953706, + "flos": 18071639556480.0, + "grad_norm": 2.7222235322625417, + "language_loss": 0.89223385, + "learning_rate": 6.672083187552239e-07, + "loss": 0.96902108, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11425781, + "step": 12311, + "time_per_iteration": 2.467475652694702 + }, + { + "auxiliary_loss_clip": 0.0640601, + "auxiliary_loss_mlp": 0.01266757, + "balance_loss_clip": 0.06272036, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7402374868480385, + "flos": 22718942231040.0, + "grad_norm": 1.4999851664761075, + "language_loss": 0.8031621, + "learning_rate": 6.669179621222738e-07, + "loss": 0.87988985, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09832764, + "step": 12312, + "time_per_iteration": 2.5331287384033203 + }, + { + "auxiliary_loss_clip": 0.06405149, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272588, + "balance_loss_mlp": 0.01255072, + "epoch": 0.7402976101007065, + "flos": 22863272088960.0, + "grad_norm": 1.7972684240515402, + "language_loss": 0.78719336, + "learning_rate": 6.666276560399273e-07, + "loss": 0.86389416, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09857178, + "step": 12313, + "time_per_iteration": 2.5370211601257324 + }, + { + "auxiliary_loss_clip": 0.06407566, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.0626882, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7403577333533744, + "flos": 12350143537920.0, + "grad_norm": 1.8417739265455044, + "language_loss": 0.79031622, + "learning_rate": 6.663374005191937e-07, + "loss": 0.86704326, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10827637, + "step": 12314, + "time_per_iteration": 3.856675148010254 + }, + { + "auxiliary_loss_clip": 0.06317294, + "auxiliary_loss_mlp": 0.01250351, + "balance_loss_clip": 0.06261952, + "balance_loss_mlp": 0.01249078, + "epoch": 0.7404178566060424, + "flos": 60346189152000.0, + "grad_norm": 0.8038008604712399, + "language_loss": 0.55230701, + "learning_rate": 6.660471955710809e-07, + "loss": 0.62798345, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01273346, + "step": 12315, + "time_per_iteration": 3.094839334487915 + }, + { + "auxiliary_loss_clip": 0.06400545, + "auxiliary_loss_mlp": 0.01269055, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01259298, + "epoch": 0.7404779798587103, + "flos": 32022786257280.0, + "grad_norm": 1.42588959053577, + "language_loss": 0.79849303, + "learning_rate": 6.65757041206591e-07, + "loss": 0.87518907, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09759521, + "step": 12316, + "time_per_iteration": 2.6217541694641113 + }, + { + "auxiliary_loss_clip": 0.06405086, + "auxiliary_loss_mlp": 0.01263693, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7405381031113784, + "flos": 12893571192960.0, + "grad_norm": 1.9031027598783419, + "language_loss": 0.74949759, + "learning_rate": 6.654669374367275e-07, + "loss": 0.82618535, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09997559, + "step": 12317, + "time_per_iteration": 2.4909305572509766 + }, + { + "auxiliary_loss_clip": 0.06398293, + "auxiliary_loss_mlp": 0.01265661, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01256625, + "epoch": 0.7405982263640463, + "flos": 20235189104640.0, + "grad_norm": 1.7604511064610666, + "language_loss": 0.81780982, + "learning_rate": 6.651768842724917e-07, + "loss": 0.89444935, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09039307, + "step": 12318, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256317, + "epoch": 0.7406583496167143, + "flos": 17573088562560.0, + "grad_norm": 1.866306408499981, + "language_loss": 0.76751161, + "learning_rate": 6.648868817248827e-07, + "loss": 0.84425652, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09887695, + "step": 12319, + "time_per_iteration": 2.4622530937194824 + }, + { + "auxiliary_loss_clip": 0.0640564, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06272121, + "balance_loss_mlp": 0.01253645, + "epoch": 0.7407184728693822, + "flos": 18301530032640.0, + "grad_norm": 2.0432497673800563, + "language_loss": 0.63919193, + "learning_rate": 6.64596929804897e-07, + "loss": 0.71588171, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09698486, + "step": 12320, + "time_per_iteration": 2.491823196411133 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01263353, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01252761, + "epoch": 0.7407785961220502, + "flos": 16696124530560.0, + "grad_norm": 2.5007986584617767, + "language_loss": 0.82488716, + "learning_rate": 6.643070285235288e-07, + "loss": 0.90164608, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10583496, + "step": 12321, + "time_per_iteration": 2.472942352294922 + }, + { + "auxiliary_loss_clip": 0.06413056, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01263488, + "epoch": 0.7408387193747181, + "flos": 22094440151040.0, + "grad_norm": 1.687827757394498, + "language_loss": 0.72481614, + "learning_rate": 6.640171778917727e-07, + "loss": 0.80170149, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11993408, + "step": 12322, + "time_per_iteration": 2.5148372650146484 + }, + { + "auxiliary_loss_clip": 0.06410389, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275401, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7408988426273861, + "flos": 24242476694400.0, + "grad_norm": 1.7223397407589476, + "language_loss": 0.64227688, + "learning_rate": 6.637273779206183e-07, + "loss": 0.71903044, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09991455, + "step": 12323, + "time_per_iteration": 2.545907735824585 + }, + { + "auxiliary_loss_clip": 0.06410556, + "auxiliary_loss_mlp": 0.01267934, + "balance_loss_clip": 0.06273916, + "balance_loss_mlp": 0.01257348, + "epoch": 0.7409589658800542, + "flos": 29030671209600.0, + "grad_norm": 1.3447635409056256, + "language_loss": 0.76155257, + "learning_rate": 6.634376286210559e-07, + "loss": 0.83833748, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.105896, + "step": 12324, + "time_per_iteration": 2.6743714809417725 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01264226, + "balance_loss_clip": 0.06272118, + "balance_loss_mlp": 0.01254272, + "epoch": 0.7410190891327221, + "flos": 19356925334400.0, + "grad_norm": 13.963490844682125, + "language_loss": 0.74922419, + "learning_rate": 6.63147930004073e-07, + "loss": 0.82592261, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 12325, + "time_per_iteration": 2.471677780151367 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01267104, + "balance_loss_clip": 0.06275749, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7410792123853901, + "flos": 22754301454080.0, + "grad_norm": 1.6510689232341687, + "language_loss": 0.68920004, + "learning_rate": 6.628582820806545e-07, + "loss": 0.76603806, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10906982, + "step": 12326, + "time_per_iteration": 2.544271469116211 + }, + { + "auxiliary_loss_clip": 0.06406512, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01259943, + "epoch": 0.741139335638058, + "flos": 25379156056320.0, + "grad_norm": 2.684979070680883, + "language_loss": 0.89408934, + "learning_rate": 6.625686848617835e-07, + "loss": 0.97085506, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10113525, + "step": 12327, + "time_per_iteration": 2.514342784881592 + }, + { + "auxiliary_loss_clip": 0.06405853, + "auxiliary_loss_mlp": 0.01270995, + "balance_loss_clip": 0.0627297, + "balance_loss_mlp": 0.01260326, + "epoch": 0.741199458890726, + "flos": 18591154070400.0, + "grad_norm": 1.616289045038266, + "language_loss": 0.86022431, + "learning_rate": 6.62279138358442e-07, + "loss": 0.93699282, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10668945, + "step": 12328, + "time_per_iteration": 2.546849012374878 + }, + { + "auxiliary_loss_clip": 0.06404015, + "auxiliary_loss_mlp": 0.01266041, + "balance_loss_clip": 0.06273206, + "balance_loss_mlp": 0.01256373, + "epoch": 0.7412595821433939, + "flos": 22133572807680.0, + "grad_norm": 3.0862478099951476, + "language_loss": 0.66898477, + "learning_rate": 6.619896425816103e-07, + "loss": 0.74568534, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09655762, + "step": 12329, + "time_per_iteration": 2.4837799072265625 + }, + { + "auxiliary_loss_clip": 0.06415252, + "auxiliary_loss_mlp": 0.01271747, + "balance_loss_clip": 0.06274865, + "balance_loss_mlp": 0.01261262, + "epoch": 0.741319705396062, + "flos": 29177516689920.0, + "grad_norm": 1.6153996639831127, + "language_loss": 0.67172372, + "learning_rate": 6.617001975422647e-07, + "loss": 0.74859369, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10479736, + "step": 12330, + "time_per_iteration": 2.59244441986084 + }, + { + "auxiliary_loss_clip": 0.06414045, + "auxiliary_loss_mlp": 0.01265631, + "balance_loss_clip": 0.06274007, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7413798286487299, + "flos": 20673713047680.0, + "grad_norm": 1.8418070280678467, + "language_loss": 0.85594726, + "learning_rate": 6.614108032513823e-07, + "loss": 0.93274403, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11169434, + "step": 12331, + "time_per_iteration": 2.6050429344177246 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01264119, + "balance_loss_clip": 0.06275013, + "balance_loss_mlp": 0.01253837, + "epoch": 0.7414399519013979, + "flos": 16404446067840.0, + "grad_norm": 1.9259075760322277, + "language_loss": 0.69746608, + "learning_rate": 6.611214597199364e-07, + "loss": 0.77421594, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 12332, + "time_per_iteration": 2.519845485687256 + }, + { + "auxiliary_loss_clip": 0.06408165, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273398, + "balance_loss_mlp": 0.01255761, + "epoch": 0.7415000751540658, + "flos": 25637403939840.0, + "grad_norm": 1.899841467346803, + "language_loss": 0.63552696, + "learning_rate": 6.608321669588984e-07, + "loss": 0.71227038, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10418701, + "step": 12333, + "time_per_iteration": 2.5220582485198975 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126491, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255391, + "epoch": 0.7415601984067338, + "flos": 24506803998720.0, + "grad_norm": 1.7352435942597948, + "language_loss": 0.7115826, + "learning_rate": 6.605429249792387e-07, + "loss": 0.78826714, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09521484, + "step": 12334, + "time_per_iteration": 3.9428293704986572 + }, + { + "auxiliary_loss_clip": 0.0640265, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06269788, + "balance_loss_mlp": 0.01253628, + "epoch": 0.7416203216594017, + "flos": 20893541034240.0, + "grad_norm": 1.579239832257194, + "language_loss": 0.82769573, + "learning_rate": 6.602537337919257e-07, + "loss": 0.90436113, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10253906, + "step": 12335, + "time_per_iteration": 2.5163700580596924 + }, + { + "auxiliary_loss_clip": 0.06406333, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01257556, + "epoch": 0.7416804449120697, + "flos": 15628276897920.0, + "grad_norm": 2.378220107859676, + "language_loss": 0.75595701, + "learning_rate": 6.599645934079259e-07, + "loss": 0.832699, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10308838, + "step": 12336, + "time_per_iteration": 2.471386432647705 + }, + { + "auxiliary_loss_clip": 0.06412801, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01255582, + "epoch": 0.7417405681647377, + "flos": 17124795619200.0, + "grad_norm": 1.7670482081057908, + "language_loss": 0.73856127, + "learning_rate": 6.596755038382029e-07, + "loss": 0.8153441, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09899902, + "step": 12337, + "time_per_iteration": 2.466338872909546 + }, + { + "auxiliary_loss_clip": 0.06405115, + "auxiliary_loss_mlp": 0.01266953, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01257428, + "epoch": 0.7418006914174057, + "flos": 18886354404480.0, + "grad_norm": 1.7252215797420232, + "language_loss": 0.76747906, + "learning_rate": 6.593864650937186e-07, + "loss": 0.84419966, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09527588, + "step": 12338, + "time_per_iteration": 2.4993648529052734 + }, + { + "auxiliary_loss_clip": 0.06403196, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01256993, + "epoch": 0.7418608146700737, + "flos": 21587294113920.0, + "grad_norm": 1.629364816328998, + "language_loss": 0.72958922, + "learning_rate": 6.590974771854345e-07, + "loss": 0.80628407, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12339, + "time_per_iteration": 2.4901506900787354 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06271182, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7419209379227416, + "flos": 22346063562240.0, + "grad_norm": 3.4897351250421322, + "language_loss": 0.79916894, + "learning_rate": 6.588085401243077e-07, + "loss": 0.87583876, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 12340, + "time_per_iteration": 2.5338644981384277 + }, + { + "auxiliary_loss_clip": 0.0640725, + "auxiliary_loss_mlp": 0.0126408, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01254168, + "epoch": 0.7419810611754096, + "flos": 16767639590400.0, + "grad_norm": 1.374564761122075, + "language_loss": 0.76099288, + "learning_rate": 6.585196539212958e-07, + "loss": 0.83770621, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09912109, + "step": 12341, + "time_per_iteration": 2.495758056640625 + }, + { + "auxiliary_loss_clip": 0.06401518, + "auxiliary_loss_mlp": 0.01269793, + "balance_loss_clip": 0.06276906, + "balance_loss_mlp": 0.01260292, + "epoch": 0.7420411844280775, + "flos": 26220048105600.0, + "grad_norm": 1.417674408189636, + "language_loss": 0.80324268, + "learning_rate": 6.582308185873535e-07, + "loss": 0.87995577, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.09503174, + "step": 12342, + "time_per_iteration": 2.5588223934173584 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01256328, + "epoch": 0.7421013076807456, + "flos": 68542354857600.0, + "grad_norm": 1.7864358028362888, + "language_loss": 0.7745598, + "learning_rate": 6.57942034133433e-07, + "loss": 0.85127044, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09741211, + "step": 12343, + "time_per_iteration": 2.893523693084717 + }, + { + "auxiliary_loss_clip": 0.0640204, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06267961, + "balance_loss_mlp": 0.01257482, + "epoch": 0.7421614309334135, + "flos": 24432144410880.0, + "grad_norm": 1.492444453579108, + "language_loss": 0.68024582, + "learning_rate": 6.576533005704843e-07, + "loss": 0.75694287, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10186768, + "step": 12344, + "time_per_iteration": 4.0460686683654785 + }, + { + "auxiliary_loss_clip": 0.0640749, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7422215541860815, + "flos": 12315706709760.0, + "grad_norm": 2.0673948051612983, + "language_loss": 0.81438386, + "learning_rate": 6.573646179094572e-07, + "loss": 0.89110589, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10522461, + "step": 12345, + "time_per_iteration": 2.5168869495391846 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01263643, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01253975, + "epoch": 0.7422816774387494, + "flos": 19651580616960.0, + "grad_norm": 1.781451237104089, + "language_loss": 0.70713991, + "learning_rate": 6.570759861612988e-07, + "loss": 0.7838285, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09661865, + "step": 12346, + "time_per_iteration": 2.481515407562256 + }, + { + "auxiliary_loss_clip": 0.06407449, + "auxiliary_loss_mlp": 0.01266551, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.0125683, + "epoch": 0.7423418006914174, + "flos": 32024337557760.0, + "grad_norm": 1.4530238546108785, + "language_loss": 0.73483253, + "learning_rate": 6.56787405336953e-07, + "loss": 0.81157255, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.097229, + "step": 12347, + "time_per_iteration": 2.6118276119232178 + }, + { + "auxiliary_loss_clip": 0.06410117, + "auxiliary_loss_mlp": 0.01263875, + "balance_loss_clip": 0.06271449, + "balance_loss_mlp": 0.01253355, + "epoch": 0.7424019239440853, + "flos": 18923013365760.0, + "grad_norm": 2.221279445831195, + "language_loss": 0.81336832, + "learning_rate": 6.564988754473642e-07, + "loss": 0.89010823, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10522461, + "step": 12348, + "time_per_iteration": 3.9795804023742676 + }, + { + "auxiliary_loss_clip": 0.06404714, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_clip": 0.06274206, + "balance_loss_mlp": 0.01254827, + "epoch": 0.7424620471967533, + "flos": 35884360396800.0, + "grad_norm": 1.7176907745599117, + "language_loss": 0.72897398, + "learning_rate": 6.562103965034724e-07, + "loss": 0.8056671, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09765625, + "step": 12349, + "time_per_iteration": 2.5986247062683105 + }, + { + "auxiliary_loss_clip": 0.0641204, + "auxiliary_loss_mlp": 0.01266614, + "balance_loss_clip": 0.06272119, + "balance_loss_mlp": 0.01255629, + "epoch": 0.7425221704494213, + "flos": 27023987704320.0, + "grad_norm": 1.8752409058268018, + "language_loss": 0.79401171, + "learning_rate": 6.559219685162165e-07, + "loss": 0.87079823, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10986328, + "step": 12350, + "time_per_iteration": 2.5616562366485596 + }, + { + "auxiliary_loss_clip": 0.06404371, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253147, + "epoch": 0.7425822937020893, + "flos": 34175602483200.0, + "grad_norm": 3.363091942962461, + "language_loss": 0.75271994, + "learning_rate": 6.556335914965343e-07, + "loss": 0.82939601, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10101318, + "step": 12351, + "time_per_iteration": 2.5991873741149902 + }, + { + "auxiliary_loss_clip": 0.06407189, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01255033, + "epoch": 0.7426424169547573, + "flos": 21289200814080.0, + "grad_norm": 1.9305253620740155, + "language_loss": 0.81533462, + "learning_rate": 6.553452654553611e-07, + "loss": 0.89205474, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 12352, + "time_per_iteration": 2.531691551208496 + }, + { + "auxiliary_loss_clip": 0.06410765, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.06275038, + "balance_loss_mlp": 0.01253386, + "epoch": 0.7427025402074252, + "flos": 22453818312960.0, + "grad_norm": 1.6215241658944841, + "language_loss": 0.71717203, + "learning_rate": 6.550569904036307e-07, + "loss": 0.79391491, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 12353, + "time_per_iteration": 4.0272791385650635 + }, + { + "auxiliary_loss_clip": 0.06404988, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.0627149, + "balance_loss_mlp": 0.01255731, + "epoch": 0.7427626634600932, + "flos": 22530532325760.0, + "grad_norm": 2.41683810368099, + "language_loss": 0.72524661, + "learning_rate": 6.547687663522739e-07, + "loss": 0.80194831, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09454346, + "step": 12354, + "time_per_iteration": 2.5672101974487305 + }, + { + "auxiliary_loss_clip": 0.06316006, + "auxiliary_loss_mlp": 0.01252952, + "balance_loss_clip": 0.0626021, + "balance_loss_mlp": 0.01251813, + "epoch": 0.7428227867127611, + "flos": 67227271424640.0, + "grad_norm": 0.6879551946330541, + "language_loss": 0.59384382, + "learning_rate": 6.544805933122199e-07, + "loss": 0.66953337, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01139832, + "step": 12355, + "time_per_iteration": 3.244594097137451 + }, + { + "auxiliary_loss_clip": 0.06405793, + "auxiliary_loss_mlp": 0.01264507, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7428829099654292, + "flos": 14726603111040.0, + "grad_norm": 1.6011597337483758, + "language_loss": 0.67696226, + "learning_rate": 6.541924712943971e-07, + "loss": 0.75366527, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10150146, + "step": 12356, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7429430332180971, + "flos": 48656466696960.0, + "grad_norm": 1.5868291550448252, + "language_loss": 0.72533596, + "learning_rate": 6.539044003097301e-07, + "loss": 0.80203569, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10424805, + "step": 12357, + "time_per_iteration": 2.8397207260131836 + }, + { + "auxiliary_loss_clip": 0.06402919, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01254495, + "epoch": 0.7430031564707651, + "flos": 16769735942400.0, + "grad_norm": 1.978658121021226, + "language_loss": 0.65120018, + "learning_rate": 6.53616380369143e-07, + "loss": 0.72786361, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08917236, + "step": 12358, + "time_per_iteration": 2.4834437370300293 + }, + { + "auxiliary_loss_clip": 0.06409361, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06271667, + "balance_loss_mlp": 0.01256807, + "epoch": 0.743063279723433, + "flos": 23876054789760.0, + "grad_norm": 1.7508744864963774, + "language_loss": 0.81005955, + "learning_rate": 6.533284114835591e-07, + "loss": 0.88682991, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10864258, + "step": 12359, + "time_per_iteration": 2.5511791706085205 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01269499, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01259491, + "epoch": 0.743123402976101, + "flos": 14396840167680.0, + "grad_norm": 2.4409850901837924, + "language_loss": 0.688115, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7648586, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10009766, + "step": 12360, + "time_per_iteration": 2.454799175262451 + }, + { + "auxiliary_loss_clip": 0.06402747, + "auxiliary_loss_mlp": 0.01266625, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01256695, + "epoch": 0.7431835262287689, + "flos": 27461756960640.0, + "grad_norm": 1.612303136385371, + "language_loss": 0.73023605, + "learning_rate": 6.527526269210715e-07, + "loss": 0.80692977, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09936523, + "step": 12361, + "time_per_iteration": 2.563950538635254 + }, + { + "auxiliary_loss_clip": 0.06409371, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01253706, + "epoch": 0.743243649481437, + "flos": 20965810780800.0, + "grad_norm": 2.1605200841945345, + "language_loss": 0.56417334, + "learning_rate": 6.524648112660027e-07, + "loss": 0.64090431, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10028076, + "step": 12362, + "time_per_iteration": 2.5222644805908203 + }, + { + "auxiliary_loss_clip": 0.06406482, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.012541, + "epoch": 0.7433037727341049, + "flos": 22789660677120.0, + "grad_norm": 2.4729179704806796, + "language_loss": 0.77661127, + "learning_rate": 6.521770467096039e-07, + "loss": 0.85331571, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09869385, + "step": 12363, + "time_per_iteration": 2.5122897624969482 + }, + { + "auxiliary_loss_clip": 0.06408481, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255054, + "epoch": 0.7433638959867729, + "flos": 22202656099200.0, + "grad_norm": 1.616246538203827, + "language_loss": 0.78287363, + "learning_rate": 6.518893332627862e-07, + "loss": 0.85960114, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09222412, + "step": 12364, + "time_per_iteration": 2.492027521133423 + }, + { + "auxiliary_loss_clip": 0.06406204, + "auxiliary_loss_mlp": 0.01264726, + "balance_loss_clip": 0.06272129, + "balance_loss_mlp": 0.01254867, + "epoch": 0.7434240192394409, + "flos": 23303808529920.0, + "grad_norm": 1.801205271942991, + "language_loss": 0.78693449, + "learning_rate": 6.516016709364604e-07, + "loss": 0.86364377, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09851074, + "step": 12365, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.06409302, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06271569, + "balance_loss_mlp": 0.01254884, + "epoch": 0.7434841424921088, + "flos": 54020387416320.0, + "grad_norm": 1.5444951998265788, + "language_loss": 0.77106571, + "learning_rate": 6.513140597415346e-07, + "loss": 0.8478092, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10168457, + "step": 12366, + "time_per_iteration": 2.7708029747009277 + }, + { + "auxiliary_loss_clip": 0.06405418, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01254588, + "epoch": 0.7435442657447768, + "flos": 21440364779520.0, + "grad_norm": 1.560298463472275, + "language_loss": 0.71305168, + "learning_rate": 6.510264996889141e-07, + "loss": 0.78973687, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08514404, + "step": 12367, + "time_per_iteration": 2.5184154510498047 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01265202, + "balance_loss_clip": 0.06271939, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7436043889974447, + "flos": 24506426655360.0, + "grad_norm": 1.476887140959893, + "language_loss": 0.75017029, + "learning_rate": 6.507389907895038e-07, + "loss": 0.82692266, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09960938, + "step": 12368, + "time_per_iteration": 2.5212924480438232 + }, + { + "auxiliary_loss_clip": 0.0640331, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06271964, + "balance_loss_mlp": 0.01257248, + "epoch": 0.7436645122501128, + "flos": 40707997989120.0, + "grad_norm": 1.6519128138397359, + "language_loss": 0.69042623, + "learning_rate": 6.50451533054207e-07, + "loss": 0.76712668, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09490967, + "step": 12369, + "time_per_iteration": 2.7047884464263916 + }, + { + "auxiliary_loss_clip": 0.06408005, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06272747, + "balance_loss_mlp": 0.01256258, + "epoch": 0.7437246355027807, + "flos": 18913537854720.0, + "grad_norm": 1.595861424874944, + "language_loss": 0.75370234, + "learning_rate": 6.501641264939233e-07, + "loss": 0.83044672, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12370, + "time_per_iteration": 2.473238468170166 + }, + { + "auxiliary_loss_clip": 0.06403841, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06273004, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7437847587554487, + "flos": 21550299736320.0, + "grad_norm": 1.5233822709060378, + "language_loss": 0.78544998, + "learning_rate": 6.498767711195503e-07, + "loss": 0.86215037, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09710693, + "step": 12371, + "time_per_iteration": 2.5248806476593018 + }, + { + "auxiliary_loss_clip": 0.06407221, + "auxiliary_loss_mlp": 0.0126359, + "balance_loss_clip": 0.06274284, + "balance_loss_mlp": 0.01253415, + "epoch": 0.7438448820081166, + "flos": 27789926676480.0, + "grad_norm": 1.5517667722387558, + "language_loss": 0.69689578, + "learning_rate": 6.495894669419857e-07, + "loss": 0.77360392, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10168457, + "step": 12372, + "time_per_iteration": 2.552630662918091 + }, + { + "auxiliary_loss_clip": 0.06404461, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06271353, + "balance_loss_mlp": 0.01253519, + "epoch": 0.7439050052607846, + "flos": 17973653806080.0, + "grad_norm": 1.7715467949119694, + "language_loss": 0.75746936, + "learning_rate": 6.493022139721245e-07, + "loss": 0.83414626, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09716797, + "step": 12373, + "time_per_iteration": 2.546383857727051 + }, + { + "auxiliary_loss_clip": 0.06406415, + "auxiliary_loss_mlp": 0.01264372, + "balance_loss_clip": 0.06269443, + "balance_loss_mlp": 0.01253643, + "epoch": 0.7439651285134525, + "flos": 22964066951040.0, + "grad_norm": 1.646659393981313, + "language_loss": 0.77668065, + "learning_rate": 6.49015012220858e-07, + "loss": 0.85338849, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10723877, + "step": 12374, + "time_per_iteration": 3.92050838470459 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06273149, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7440252517661206, + "flos": 18812701065600.0, + "grad_norm": 2.0942511176343936, + "language_loss": 0.76647848, + "learning_rate": 6.487278616990774e-07, + "loss": 0.8431896, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10009766, + "step": 12375, + "time_per_iteration": 2.4693682193756104 + }, + { + "auxiliary_loss_clip": 0.06401422, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.0125509, + "epoch": 0.7440853750187885, + "flos": 20272476971520.0, + "grad_norm": 1.9421008713204126, + "language_loss": 0.77613479, + "learning_rate": 6.484407624176733e-07, + "loss": 0.85279274, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09289551, + "step": 12376, + "time_per_iteration": 2.5313687324523926 + }, + { + "auxiliary_loss_clip": 0.06411325, + "auxiliary_loss_mlp": 0.0126521, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7441454982714565, + "flos": 25344216103680.0, + "grad_norm": 1.6879518297233593, + "language_loss": 0.79368329, + "learning_rate": 6.481537143875296e-07, + "loss": 0.87044865, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11010742, + "step": 12377, + "time_per_iteration": 2.5384654998779297 + }, + { + "auxiliary_loss_clip": 0.0640887, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01254025, + "epoch": 0.7442056215241245, + "flos": 64493460915840.0, + "grad_norm": 1.858045271266799, + "language_loss": 0.67843312, + "learning_rate": 6.478667176195322e-07, + "loss": 0.75516731, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10528564, + "step": 12378, + "time_per_iteration": 2.898494005203247 + }, + { + "auxiliary_loss_clip": 0.06408532, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06271744, + "balance_loss_mlp": 0.01256784, + "epoch": 0.7442657447767924, + "flos": 31293464319360.0, + "grad_norm": 1.6105987456814335, + "language_loss": 0.71894264, + "learning_rate": 6.475797721245648e-07, + "loss": 0.79571033, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11462402, + "step": 12379, + "time_per_iteration": 2.5628533363342285 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255292, + "epoch": 0.7443258680294604, + "flos": 20813221296000.0, + "grad_norm": 1.9550409468219483, + "language_loss": 0.65543461, + "learning_rate": 6.472928779135085e-07, + "loss": 0.73216021, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10186768, + "step": 12380, + "time_per_iteration": 2.5494651794433594 + }, + { + "auxiliary_loss_clip": 0.06408666, + "auxiliary_loss_mlp": 0.01266245, + "balance_loss_clip": 0.0627347, + "balance_loss_mlp": 0.01256267, + "epoch": 0.7443859912821283, + "flos": 22206303751680.0, + "grad_norm": 1.8887848682533184, + "language_loss": 0.79213363, + "learning_rate": 6.470060349972411e-07, + "loss": 0.86888278, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09973145, + "step": 12381, + "time_per_iteration": 2.4954755306243896 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06274785, + "balance_loss_mlp": 0.0125446, + "epoch": 0.7444461145347964, + "flos": 22024350610560.0, + "grad_norm": 1.8902076761628224, + "language_loss": 0.73109865, + "learning_rate": 6.467192433866411e-07, + "loss": 0.80787647, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1114502, + "step": 12382, + "time_per_iteration": 2.534949779510498 + }, + { + "auxiliary_loss_clip": 0.06317867, + "auxiliary_loss_mlp": 0.01256388, + "balance_loss_clip": 0.06262469, + "balance_loss_mlp": 0.01255137, + "epoch": 0.7445062377874643, + "flos": 70582313704320.0, + "grad_norm": 0.6399574084951353, + "language_loss": 0.54684198, + "learning_rate": 6.464325030925831e-07, + "loss": 0.62258446, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01251221, + "step": 12383, + "time_per_iteration": 3.2762465476989746 + }, + { + "auxiliary_loss_clip": 0.06408082, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06273709, + "balance_loss_mlp": 0.01253168, + "epoch": 0.7445663610401323, + "flos": 22171070309760.0, + "grad_norm": 1.8693949570564194, + "language_loss": 0.76230967, + "learning_rate": 6.461458141259395e-07, + "loss": 0.83902138, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09924316, + "step": 12384, + "time_per_iteration": 3.9471797943115234 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01268527, + "balance_loss_clip": 0.06271986, + "balance_loss_mlp": 0.01258162, + "epoch": 0.7446264842928002, + "flos": 24177082982400.0, + "grad_norm": 2.0160606528555665, + "language_loss": 0.79418957, + "learning_rate": 6.458591764975823e-07, + "loss": 0.87092656, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1036377, + "step": 12385, + "time_per_iteration": 2.548703193664551 + }, + { + "auxiliary_loss_clip": 0.06411269, + "auxiliary_loss_mlp": 0.01267945, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7446866075454682, + "flos": 24141514124160.0, + "grad_norm": 1.683035804247251, + "language_loss": 0.81670487, + "learning_rate": 6.455725902183813e-07, + "loss": 0.89349711, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11859131, + "step": 12386, + "time_per_iteration": 2.5256152153015137 + }, + { + "auxiliary_loss_clip": 0.06404106, + "auxiliary_loss_mlp": 0.01267713, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257598, + "epoch": 0.7447467307981361, + "flos": 23554467619200.0, + "grad_norm": 1.6483993248680413, + "language_loss": 0.71268487, + "learning_rate": 6.452860552992037e-07, + "loss": 0.78940308, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10119629, + "step": 12387, + "time_per_iteration": 3.9517242908477783 + }, + { + "auxiliary_loss_clip": 0.0640709, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06274819, + "balance_loss_mlp": 0.01255464, + "epoch": 0.7448068540508042, + "flos": 19573021814400.0, + "grad_norm": 1.9204384374405874, + "language_loss": 0.70408261, + "learning_rate": 6.449995717509138e-07, + "loss": 0.78080571, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09771729, + "step": 12388, + "time_per_iteration": 2.5048129558563232 + }, + { + "auxiliary_loss_clip": 0.06406976, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01254727, + "epoch": 0.7448669773034721, + "flos": 21846925589760.0, + "grad_norm": 1.5688285062230494, + "language_loss": 0.85222888, + "learning_rate": 6.447131395843761e-07, + "loss": 0.92894751, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12389, + "time_per_iteration": 2.5551319122314453 + }, + { + "auxiliary_loss_clip": 0.06411929, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06275173, + "balance_loss_mlp": 0.01254388, + "epoch": 0.7449271005561401, + "flos": 25162388743680.0, + "grad_norm": 1.6015967900986, + "language_loss": 0.79076087, + "learning_rate": 6.444267588104526e-07, + "loss": 0.86752725, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12390, + "time_per_iteration": 2.5427069664001465 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.06271118, + "balance_loss_mlp": 0.01255414, + "epoch": 0.7449872238088081, + "flos": 22279915163520.0, + "grad_norm": 1.7310702404068883, + "language_loss": 0.84598923, + "learning_rate": 6.441404294400014e-07, + "loss": 0.92271626, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10577393, + "step": 12391, + "time_per_iteration": 2.563535451889038 + }, + { + "auxiliary_loss_clip": 0.0640666, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06273681, + "balance_loss_mlp": 0.01257481, + "epoch": 0.745047347061476, + "flos": 20601065957760.0, + "grad_norm": 1.6668133059608343, + "language_loss": 0.74029422, + "learning_rate": 6.438541514838811e-07, + "loss": 0.81703228, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09655762, + "step": 12392, + "time_per_iteration": 2.54951548576355 + }, + { + "auxiliary_loss_clip": 0.06402859, + "auxiliary_loss_mlp": 0.01260815, + "balance_loss_clip": 0.06272476, + "balance_loss_mlp": 0.01251344, + "epoch": 0.745107470314144, + "flos": 22134117859200.0, + "grad_norm": 1.5576525473269558, + "language_loss": 0.76858068, + "learning_rate": 6.435679249529487e-07, + "loss": 0.84521741, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09466553, + "step": 12393, + "time_per_iteration": 3.9006175994873047 + }, + { + "auxiliary_loss_clip": 0.06406154, + "auxiliary_loss_mlp": 0.01264743, + "balance_loss_clip": 0.06273723, + "balance_loss_mlp": 0.01253681, + "epoch": 0.745167593566812, + "flos": 22243004640000.0, + "grad_norm": 1.8129190571327771, + "language_loss": 0.72895974, + "learning_rate": 6.432817498580552e-07, + "loss": 0.80566871, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.11065674, + "step": 12394, + "time_per_iteration": 2.5072154998779297 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.0126662, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256386, + "epoch": 0.74522771681948, + "flos": 20672245601280.0, + "grad_norm": 1.907024512464057, + "language_loss": 0.81604195, + "learning_rate": 6.429956262100535e-07, + "loss": 0.89280254, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10229492, + "step": 12395, + "time_per_iteration": 2.558364152908325 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06272958, + "balance_loss_mlp": 0.0125276, + "epoch": 0.7452878400721479, + "flos": 21113578656000.0, + "grad_norm": 2.0296389774228696, + "language_loss": 0.71353412, + "learning_rate": 6.427095540197937e-07, + "loss": 0.7902751, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10479736, + "step": 12396, + "time_per_iteration": 2.5333800315856934 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.0627405, + "balance_loss_mlp": 0.01259817, + "epoch": 0.7453479633248159, + "flos": 26695356791040.0, + "grad_norm": 1.7653498862939656, + "language_loss": 0.68180245, + "learning_rate": 6.424235332981245e-07, + "loss": 0.75860852, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10455322, + "step": 12397, + "time_per_iteration": 2.578571081161499 + }, + { + "auxiliary_loss_clip": 0.06405051, + "auxiliary_loss_mlp": 0.0126851, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01258926, + "epoch": 0.7454080865774838, + "flos": 17021191645440.0, + "grad_norm": 1.6817792283863804, + "language_loss": 0.77217615, + "learning_rate": 6.421375640558908e-07, + "loss": 0.84891176, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12398, + "time_per_iteration": 2.512648344039917 + }, + { + "auxiliary_loss_clip": 0.06403591, + "auxiliary_loss_mlp": 0.01261876, + "balance_loss_clip": 0.06272794, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7454682098301518, + "flos": 21330178260480.0, + "grad_norm": 1.5838932633911913, + "language_loss": 0.78415573, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8608104, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09552002, + "step": 12399, + "time_per_iteration": 2.505819320678711 + }, + { + "auxiliary_loss_clip": 0.06400932, + "auxiliary_loss_mlp": 0.01264955, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01255728, + "epoch": 0.7455283330828197, + "flos": 17864138119680.0, + "grad_norm": 1.9696837581168143, + "language_loss": 0.7409634, + "learning_rate": 6.415657800531038e-07, + "loss": 0.81762224, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09222412, + "step": 12400, + "time_per_iteration": 2.5325090885162354 + }, + { + "auxiliary_loss_clip": 0.06404567, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01254829, + "epoch": 0.7455884563354878, + "flos": 30782209432320.0, + "grad_norm": 1.9542118355306637, + "language_loss": 0.82345331, + "learning_rate": 6.412799653142327e-07, + "loss": 0.90014458, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09735107, + "step": 12401, + "time_per_iteration": 2.577702283859253 + }, + { + "auxiliary_loss_clip": 0.06408406, + "auxiliary_loss_mlp": 0.01262184, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01252689, + "epoch": 0.7456485795881557, + "flos": 23192280345600.0, + "grad_norm": 1.6740517505744856, + "language_loss": 0.65013397, + "learning_rate": 6.409942020981611e-07, + "loss": 0.72683978, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.0949707, + "step": 12402, + "time_per_iteration": 2.6253459453582764 + }, + { + "auxiliary_loss_clip": 0.06401449, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01255472, + "epoch": 0.7457087028408237, + "flos": 38736254436480.0, + "grad_norm": 1.537912259359591, + "language_loss": 0.73276114, + "learning_rate": 6.407084904157265e-07, + "loss": 0.8094238, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09350586, + "step": 12403, + "time_per_iteration": 2.700143575668335 + }, + { + "auxiliary_loss_clip": 0.06316997, + "auxiliary_loss_mlp": 0.01251636, + "balance_loss_clip": 0.06261828, + "balance_loss_mlp": 0.0125041, + "epoch": 0.7457688260934917, + "flos": 56059480523520.0, + "grad_norm": 1.1139053392521483, + "language_loss": 0.58594716, + "learning_rate": 6.404228302777621e-07, + "loss": 0.66163349, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.55371094, + "router_z_loss_mlp": 0.01225281, + "step": 12404, + "time_per_iteration": 2.995051145553589 + }, + { + "auxiliary_loss_clip": 0.06405495, + "auxiliary_loss_mlp": 0.01263977, + "balance_loss_clip": 0.06272737, + "balance_loss_mlp": 0.01254256, + "epoch": 0.7458289493461596, + "flos": 20121606495360.0, + "grad_norm": 1.4914507939432748, + "language_loss": 0.77947497, + "learning_rate": 6.401372216950995e-07, + "loss": 0.85616976, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09710693, + "step": 12405, + "time_per_iteration": 2.5471739768981934 + }, + { + "auxiliary_loss_clip": 0.0640135, + "auxiliary_loss_mlp": 0.01269033, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01259067, + "epoch": 0.7458890725988276, + "flos": 20199200976000.0, + "grad_norm": 1.4963815731193124, + "language_loss": 0.69489747, + "learning_rate": 6.398516646785698e-07, + "loss": 0.77160132, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09960938, + "step": 12406, + "time_per_iteration": 2.5200746059417725 + }, + { + "auxiliary_loss_clip": 0.0641366, + "auxiliary_loss_mlp": 0.012669, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256344, + "epoch": 0.7459491958514956, + "flos": 17024336173440.0, + "grad_norm": 1.8403958635643813, + "language_loss": 0.65422976, + "learning_rate": 6.39566159239002e-07, + "loss": 0.73103529, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10559082, + "step": 12407, + "time_per_iteration": 2.508833408355713 + }, + { + "auxiliary_loss_clip": 0.06406917, + "auxiliary_loss_mlp": 0.01262212, + "balance_loss_clip": 0.06270534, + "balance_loss_mlp": 0.01251775, + "epoch": 0.7460093191041636, + "flos": 25085087752320.0, + "grad_norm": 1.7359295101063332, + "language_loss": 0.721986, + "learning_rate": 6.392807053872212e-07, + "loss": 0.79867733, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10443115, + "step": 12408, + "time_per_iteration": 2.5363566875457764 + }, + { + "auxiliary_loss_clip": 0.06410854, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06272398, + "balance_loss_mlp": 0.01258875, + "epoch": 0.7460694423568315, + "flos": 21915044559360.0, + "grad_norm": 1.699572837322079, + "language_loss": 0.72972172, + "learning_rate": 6.38995303134053e-07, + "loss": 0.80653358, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11444092, + "step": 12409, + "time_per_iteration": 2.546006441116333 + }, + { + "auxiliary_loss_clip": 0.06399277, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06271146, + "balance_loss_mlp": 0.0125671, + "epoch": 0.7461295656094995, + "flos": 21222213874560.0, + "grad_norm": 1.598232986197546, + "language_loss": 0.6626668, + "learning_rate": 6.38709952490319e-07, + "loss": 0.73931849, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09173584, + "step": 12410, + "time_per_iteration": 2.539109468460083 + }, + { + "auxiliary_loss_clip": 0.06399163, + "auxiliary_loss_mlp": 0.01263377, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253912, + "epoch": 0.7461896888621674, + "flos": 22353526575360.0, + "grad_norm": 1.945676042330692, + "language_loss": 0.84313834, + "learning_rate": 6.384246534668396e-07, + "loss": 0.9197638, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09466553, + "step": 12411, + "time_per_iteration": 2.5426361560821533 + }, + { + "auxiliary_loss_clip": 0.06406285, + "auxiliary_loss_mlp": 0.01265139, + "balance_loss_clip": 0.06272309, + "balance_loss_mlp": 0.01255412, + "epoch": 0.7462498121148354, + "flos": 25489845699840.0, + "grad_norm": 1.4027823600738436, + "language_loss": 0.78116751, + "learning_rate": 6.381394060744339e-07, + "loss": 0.85788167, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09729004, + "step": 12412, + "time_per_iteration": 2.533936023712158 + }, + { + "auxiliary_loss_clip": 0.06404398, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254599, + "epoch": 0.7463099353675033, + "flos": 33956319548160.0, + "grad_norm": 1.7620547753312321, + "language_loss": 0.62684309, + "learning_rate": 6.378542103239188e-07, + "loss": 0.70352924, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 12413, + "time_per_iteration": 2.6400840282440186 + }, + { + "auxiliary_loss_clip": 0.06308331, + "auxiliary_loss_mlp": 0.01251289, + "balance_loss_clip": 0.06253117, + "balance_loss_mlp": 0.01250132, + "epoch": 0.7463700586201714, + "flos": 62786365355520.0, + "grad_norm": 0.710053456092447, + "language_loss": 0.54915559, + "learning_rate": 6.375690662261082e-07, + "loss": 0.62475181, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0115509, + "step": 12414, + "time_per_iteration": 4.637887954711914 + }, + { + "auxiliary_loss_clip": 0.06405766, + "auxiliary_loss_mlp": 0.01265973, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01255924, + "epoch": 0.7464301818728393, + "flos": 33440201124480.0, + "grad_norm": 1.8480790856179932, + "language_loss": 0.54996049, + "learning_rate": 6.372839737918154e-07, + "loss": 0.62667787, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 12415, + "time_per_iteration": 2.615811347961426 + }, + { + "auxiliary_loss_clip": 0.06405137, + "auxiliary_loss_mlp": 0.01263099, + "balance_loss_clip": 0.06273064, + "balance_loss_mlp": 0.01252985, + "epoch": 0.7464903051255073, + "flos": 26877100296960.0, + "grad_norm": 1.5361542558007044, + "language_loss": 0.75346631, + "learning_rate": 6.369989330318506e-07, + "loss": 0.8301487, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10107422, + "step": 12416, + "time_per_iteration": 2.5900840759277344 + }, + { + "auxiliary_loss_clip": 0.06405427, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01254868, + "epoch": 0.7465504283781753, + "flos": 44096359795200.0, + "grad_norm": 1.4549877982075725, + "language_loss": 0.69495994, + "learning_rate": 6.367139439570233e-07, + "loss": 0.77166545, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 12417, + "time_per_iteration": 2.7127816677093506 + }, + { + "auxiliary_loss_clip": 0.06411283, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.0627514, + "balance_loss_mlp": 0.01252456, + "epoch": 0.7466105516308432, + "flos": 19681111981440.0, + "grad_norm": 1.698297081844245, + "language_loss": 0.74025893, + "learning_rate": 6.364290065781392e-07, + "loss": 0.81699783, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10162354, + "step": 12418, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.06406084, + "auxiliary_loss_mlp": 0.01266736, + "balance_loss_clip": 0.06273702, + "balance_loss_mlp": 0.01256526, + "epoch": 0.7466706748835112, + "flos": 20526783713280.0, + "grad_norm": 1.5246031666283997, + "language_loss": 0.68934214, + "learning_rate": 6.361441209060039e-07, + "loss": 0.76607031, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10211182, + "step": 12419, + "time_per_iteration": 2.555774211883545 + }, + { + "auxiliary_loss_clip": 0.06398122, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271016, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7467307981361792, + "flos": 21696851727360.0, + "grad_norm": 1.9457389695389966, + "language_loss": 0.7466985, + "learning_rate": 6.358592869514216e-07, + "loss": 0.82333469, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09197998, + "step": 12420, + "time_per_iteration": 2.570023536682129 + }, + { + "auxiliary_loss_clip": 0.06408262, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01253152, + "epoch": 0.7467909213888472, + "flos": 19579855921920.0, + "grad_norm": 2.0032714530696087, + "language_loss": 0.67321241, + "learning_rate": 6.355745047251904e-07, + "loss": 0.7499361, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10955811, + "step": 12421, + "time_per_iteration": 2.474916696548462 + }, + { + "auxiliary_loss_clip": 0.06408735, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06271867, + "balance_loss_mlp": 0.0125574, + "epoch": 0.7468510446415151, + "flos": 23701858151040.0, + "grad_norm": 1.5609377146869152, + "language_loss": 0.72308791, + "learning_rate": 6.352897742381107e-07, + "loss": 0.79983485, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10223389, + "step": 12422, + "time_per_iteration": 2.5997939109802246 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01265232, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.01255272, + "epoch": 0.7469111678941831, + "flos": 29323649410560.0, + "grad_norm": 1.8474742568559126, + "language_loss": 0.75012529, + "learning_rate": 6.350050955009796e-07, + "loss": 0.82679492, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09960938, + "step": 12423, + "time_per_iteration": 4.05024266242981 + }, + { + "auxiliary_loss_clip": 0.06402838, + "auxiliary_loss_mlp": 0.01263552, + "balance_loss_clip": 0.06272693, + "balance_loss_mlp": 0.01254534, + "epoch": 0.746971291146851, + "flos": 21805067675520.0, + "grad_norm": 1.325189199688027, + "language_loss": 0.67964166, + "learning_rate": 6.347204685245929e-07, + "loss": 0.75630558, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09020996, + "step": 12424, + "time_per_iteration": 2.531129837036133 + }, + { + "auxiliary_loss_clip": 0.06410465, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06274019, + "balance_loss_mlp": 0.01257491, + "epoch": 0.747031414399519, + "flos": 36253591413120.0, + "grad_norm": 1.7828664572749888, + "language_loss": 0.74532795, + "learning_rate": 6.344358933197418e-07, + "loss": 0.82210636, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12425, + "time_per_iteration": 2.7197470664978027 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01265684, + "balance_loss_clip": 0.06268051, + "balance_loss_mlp": 0.01254431, + "epoch": 0.7470915376521869, + "flos": 19981133925120.0, + "grad_norm": 2.1292666289385016, + "language_loss": 0.69784462, + "learning_rate": 6.341513698972194e-07, + "loss": 0.77452642, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.1126709, + "step": 12426, + "time_per_iteration": 3.9324328899383545 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01267662, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01258269, + "epoch": 0.747151660904855, + "flos": 20090523830400.0, + "grad_norm": 1.610031666552814, + "language_loss": 0.65698165, + "learning_rate": 6.338668982678139e-07, + "loss": 0.73369735, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09399414, + "step": 12427, + "time_per_iteration": 2.544971466064453 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01263755, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01253754, + "epoch": 0.7472117841575229, + "flos": 16296062411520.0, + "grad_norm": 1.5416820216719087, + "language_loss": 0.74925625, + "learning_rate": 6.335824784423118e-07, + "loss": 0.82598257, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09997559, + "step": 12428, + "time_per_iteration": 2.4757473468780518 + }, + { + "auxiliary_loss_clip": 0.06413485, + "auxiliary_loss_mlp": 0.01264592, + "balance_loss_clip": 0.06274045, + "balance_loss_mlp": 0.01253756, + "epoch": 0.7472719074101909, + "flos": 21395068848000.0, + "grad_norm": 2.468151584449191, + "language_loss": 0.58381009, + "learning_rate": 6.33298110431499e-07, + "loss": 0.66059089, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1083374, + "step": 12429, + "time_per_iteration": 2.5076515674591064 + }, + { + "auxiliary_loss_clip": 0.06411515, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7473320306628589, + "flos": 29651064439680.0, + "grad_norm": 1.7643839025540142, + "language_loss": 0.60671711, + "learning_rate": 6.330137942461595e-07, + "loss": 0.6834774, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10113525, + "step": 12430, + "time_per_iteration": 2.580826997756958 + }, + { + "auxiliary_loss_clip": 0.06397452, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06268569, + "balance_loss_mlp": 0.01255339, + "epoch": 0.7473921539155268, + "flos": 24143316986880.0, + "grad_norm": 1.3480044268517646, + "language_loss": 0.7548542, + "learning_rate": 6.327295298970734e-07, + "loss": 0.83147293, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09088135, + "step": 12431, + "time_per_iteration": 2.5767364501953125 + }, + { + "auxiliary_loss_clip": 0.06404008, + "auxiliary_loss_mlp": 0.01264023, + "balance_loss_clip": 0.06270575, + "balance_loss_mlp": 0.01253831, + "epoch": 0.7474522771681948, + "flos": 17492768824320.0, + "grad_norm": 2.003596145191226, + "language_loss": 0.75284076, + "learning_rate": 6.32445317395021e-07, + "loss": 0.82952106, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10198975, + "step": 12432, + "time_per_iteration": 3.9378252029418945 + }, + { + "auxiliary_loss_clip": 0.06408846, + "auxiliary_loss_mlp": 0.01264276, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01253833, + "epoch": 0.7475124004208628, + "flos": 16732909272960.0, + "grad_norm": 2.3826566050681652, + "language_loss": 0.70483506, + "learning_rate": 6.321611567507787e-07, + "loss": 0.78156626, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10437012, + "step": 12433, + "time_per_iteration": 2.4768426418304443 + }, + { + "auxiliary_loss_clip": 0.06408405, + "auxiliary_loss_mlp": 0.01266362, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01255782, + "epoch": 0.7475725236735308, + "flos": 19726533694080.0, + "grad_norm": 1.7388304285111835, + "language_loss": 0.67580962, + "learning_rate": 6.318770479751232e-07, + "loss": 0.75255728, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.105896, + "step": 12434, + "time_per_iteration": 2.547088384628296 + }, + { + "auxiliary_loss_clip": 0.06395668, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01256042, + "epoch": 0.7476326469261987, + "flos": 26293114465920.0, + "grad_norm": 1.4738346539678335, + "language_loss": 0.7966851, + "learning_rate": 6.315929910788263e-07, + "loss": 0.87329113, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08898926, + "step": 12435, + "time_per_iteration": 2.5363943576812744 + }, + { + "auxiliary_loss_clip": 0.06409591, + "auxiliary_loss_mlp": 0.01267417, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01257236, + "epoch": 0.7476927701788667, + "flos": 31839868794240.0, + "grad_norm": 2.1319276645513736, + "language_loss": 0.68030941, + "learning_rate": 6.313089860726604e-07, + "loss": 0.75707954, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10180664, + "step": 12436, + "time_per_iteration": 2.655866861343384 + }, + { + "auxiliary_loss_clip": 0.06408997, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06271006, + "balance_loss_mlp": 0.01252732, + "epoch": 0.7477528934315346, + "flos": 31803545249280.0, + "grad_norm": 1.4428842251570377, + "language_loss": 0.7086063, + "learning_rate": 6.31025032967396e-07, + "loss": 0.78532964, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10601807, + "step": 12437, + "time_per_iteration": 2.5668420791625977 + }, + { + "auxiliary_loss_clip": 0.06400211, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06271319, + "balance_loss_mlp": 0.01256929, + "epoch": 0.7478130166842026, + "flos": 20377548391680.0, + "grad_norm": 1.5941584942666511, + "language_loss": 0.6725921, + "learning_rate": 6.307411317737986e-07, + "loss": 0.74925524, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09179688, + "step": 12438, + "time_per_iteration": 2.5391809940338135 + }, + { + "auxiliary_loss_clip": 0.06402425, + "auxiliary_loss_mlp": 0.01269468, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01259878, + "epoch": 0.7478731399368705, + "flos": 18154558771200.0, + "grad_norm": 1.5910882903057735, + "language_loss": 0.81170976, + "learning_rate": 6.304572825026344e-07, + "loss": 0.88842869, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09588623, + "step": 12439, + "time_per_iteration": 2.530305862426758 + }, + { + "auxiliary_loss_clip": 0.06401659, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.0125502, + "epoch": 0.7479332631895386, + "flos": 15273259148160.0, + "grad_norm": 2.0986943273037335, + "language_loss": 0.71237975, + "learning_rate": 6.301734851646674e-07, + "loss": 0.78904307, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09655762, + "step": 12440, + "time_per_iteration": 2.5543224811553955 + }, + { + "auxiliary_loss_clip": 0.06400722, + "auxiliary_loss_mlp": 0.01265179, + "balance_loss_clip": 0.06271139, + "balance_loss_mlp": 0.01255606, + "epoch": 0.7479933864422065, + "flos": 21148937879040.0, + "grad_norm": 1.8969303435383589, + "language_loss": 0.74162072, + "learning_rate": 6.298897397706597e-07, + "loss": 0.81827968, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09570312, + "step": 12441, + "time_per_iteration": 2.4814085960388184 + }, + { + "auxiliary_loss_clip": 0.06407572, + "auxiliary_loss_mlp": 0.01269518, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01258664, + "epoch": 0.7480535096948745, + "flos": 14397217511040.0, + "grad_norm": 2.1766125237206384, + "language_loss": 0.82771671, + "learning_rate": 6.296060463313698e-07, + "loss": 0.90448761, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10858154, + "step": 12442, + "time_per_iteration": 2.474766969680786 + }, + { + "auxiliary_loss_clip": 0.06407404, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01253551, + "epoch": 0.7481136329475425, + "flos": 27352073566080.0, + "grad_norm": 2.1201863783826087, + "language_loss": 0.63084489, + "learning_rate": 6.293224048575565e-07, + "loss": 0.7075603, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.105896, + "step": 12443, + "time_per_iteration": 2.537418842315674 + }, + { + "auxiliary_loss_clip": 0.06402731, + "auxiliary_loss_mlp": 0.01263567, + "balance_loss_clip": 0.06270343, + "balance_loss_mlp": 0.01254, + "epoch": 0.7481737562002104, + "flos": 19536656342400.0, + "grad_norm": 1.7130617298160193, + "language_loss": 0.71587157, + "learning_rate": 6.29038815359975e-07, + "loss": 0.79253459, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12444, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.06404774, + "auxiliary_loss_mlp": 0.01263681, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01253483, + "epoch": 0.7482338794528784, + "flos": 21766102727040.0, + "grad_norm": 1.3467287331144688, + "language_loss": 0.68781805, + "learning_rate": 6.287552778493786e-07, + "loss": 0.76450258, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10198975, + "step": 12445, + "time_per_iteration": 2.498960018157959 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06269241, + "balance_loss_mlp": 0.01254319, + "epoch": 0.7482940027055464, + "flos": 18703269233280.0, + "grad_norm": 1.5654377266954753, + "language_loss": 0.74401557, + "learning_rate": 6.28471792336519e-07, + "loss": 0.82066035, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09747314, + "step": 12446, + "time_per_iteration": 2.489685535430908 + }, + { + "auxiliary_loss_clip": 0.06408426, + "auxiliary_loss_mlp": 0.01264963, + "balance_loss_clip": 0.06271491, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7483541259582144, + "flos": 16003587335040.0, + "grad_norm": 1.896183227268288, + "language_loss": 0.7341156, + "learning_rate": 6.281883588321475e-07, + "loss": 0.81084955, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10491943, + "step": 12447, + "time_per_iteration": 2.464768648147583 + }, + { + "auxiliary_loss_clip": 0.06403442, + "auxiliary_loss_mlp": 0.01263884, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7484142492108823, + "flos": 25563289403520.0, + "grad_norm": 2.623161293575912, + "language_loss": 0.72332132, + "learning_rate": 6.279049773470109e-07, + "loss": 0.79999459, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09777832, + "step": 12448, + "time_per_iteration": 2.601579427719116 + }, + { + "auxiliary_loss_clip": 0.06408041, + "auxiliary_loss_mlp": 0.01266135, + "balance_loss_clip": 0.06272004, + "balance_loss_mlp": 0.01256145, + "epoch": 0.7484743724635503, + "flos": 22893432359040.0, + "grad_norm": 1.636804246707767, + "language_loss": 0.73365426, + "learning_rate": 6.276216478918543e-07, + "loss": 0.81039608, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09991455, + "step": 12449, + "time_per_iteration": 2.54630184173584 + }, + { + "auxiliary_loss_clip": 0.06411887, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7485344957162182, + "flos": 25307137872000.0, + "grad_norm": 1.841554129413667, + "language_loss": 0.61420983, + "learning_rate": 6.273383704774225e-07, + "loss": 0.69100565, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11151123, + "step": 12450, + "time_per_iteration": 2.5542476177215576 + }, + { + "auxiliary_loss_clip": 0.06399691, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.0627162, + "balance_loss_mlp": 0.01254156, + "epoch": 0.7485946189688862, + "flos": 27060395103360.0, + "grad_norm": 1.84091608525743, + "language_loss": 0.70658576, + "learning_rate": 6.270551451144577e-07, + "loss": 0.78321427, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08990479, + "step": 12451, + "time_per_iteration": 2.552686929702759 + }, + { + "auxiliary_loss_clip": 0.06414381, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273015, + "balance_loss_mlp": 0.01255143, + "epoch": 0.7486547422215541, + "flos": 26914052747520.0, + "grad_norm": 1.8323009368960723, + "language_loss": 0.80237973, + "learning_rate": 6.267719718136988e-07, + "loss": 0.87918079, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.105896, + "step": 12452, + "time_per_iteration": 2.525906562805176 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.06274606, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7487148654742222, + "flos": 22352855742720.0, + "grad_norm": 2.4829537234299184, + "language_loss": 0.72200477, + "learning_rate": 6.264888505858843e-07, + "loss": 0.79879862, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10333252, + "step": 12453, + "time_per_iteration": 3.899683952331543 + }, + { + "auxiliary_loss_clip": 0.06408122, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01255544, + "epoch": 0.7487749887268901, + "flos": 23045392938240.0, + "grad_norm": 1.5935388766621728, + "language_loss": 0.74146187, + "learning_rate": 6.262057814417517e-07, + "loss": 0.81819469, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09619141, + "step": 12454, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06311407, + "auxiliary_loss_mlp": 0.012513, + "balance_loss_clip": 0.06256338, + "balance_loss_mlp": 0.01250216, + "epoch": 0.7488351119795581, + "flos": 71545565842560.0, + "grad_norm": 0.7199296433862132, + "language_loss": 0.59468263, + "learning_rate": 6.259227643920322e-07, + "loss": 0.67030972, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01085663, + "step": 12455, + "time_per_iteration": 3.2877697944641113 + }, + { + "auxiliary_loss_clip": 0.06402359, + "auxiliary_loss_mlp": 0.01260932, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.01251759, + "epoch": 0.748895235232226, + "flos": 17201048434560.0, + "grad_norm": 1.6203322015377568, + "language_loss": 0.79953825, + "learning_rate": 6.256397994474592e-07, + "loss": 0.87617117, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09179688, + "step": 12456, + "time_per_iteration": 2.4608328342437744 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01250455, + "balance_loss_clip": 0.06255627, + "balance_loss_mlp": 0.01249323, + "epoch": 0.748955358484894, + "flos": 58998276846720.0, + "grad_norm": 0.8208514355444383, + "language_loss": 0.61328387, + "learning_rate": 6.25356886618763e-07, + "loss": 0.68889475, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01134491, + "step": 12457, + "time_per_iteration": 3.048952102661133 + }, + { + "auxiliary_loss_clip": 0.06408623, + "auxiliary_loss_mlp": 0.01266166, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01255867, + "epoch": 0.749015481737562, + "flos": 11364544287360.0, + "grad_norm": 1.9496047447072924, + "language_loss": 0.67320937, + "learning_rate": 6.250740259166711e-07, + "loss": 0.7499572, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10308838, + "step": 12458, + "time_per_iteration": 2.4301834106445312 + }, + { + "auxiliary_loss_clip": 0.06403044, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06271131, + "balance_loss_mlp": 0.01256279, + "epoch": 0.74907560499023, + "flos": 21112991677440.0, + "grad_norm": 1.7212914648304267, + "language_loss": 0.80174047, + "learning_rate": 6.247912173519106e-07, + "loss": 0.87843275, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09912109, + "step": 12459, + "time_per_iteration": 2.518477439880371 + }, + { + "auxiliary_loss_clip": 0.06404047, + "auxiliary_loss_mlp": 0.01264599, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01254926, + "epoch": 0.749135728242898, + "flos": 22273709961600.0, + "grad_norm": 1.512865855807545, + "language_loss": 0.80564761, + "learning_rate": 6.245084609352043e-07, + "loss": 0.88233417, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09674072, + "step": 12460, + "time_per_iteration": 2.5079431533813477 + }, + { + "auxiliary_loss_clip": 0.06403753, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.0627199, + "balance_loss_mlp": 0.0125477, + "epoch": 0.7491958514955659, + "flos": 24063793862400.0, + "grad_norm": 1.6076689252740726, + "language_loss": 0.86212254, + "learning_rate": 6.242257566772755e-07, + "loss": 0.93881446, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10662842, + "step": 12461, + "time_per_iteration": 2.542217969894409 + }, + { + "auxiliary_loss_clip": 0.06400948, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7492559747482339, + "flos": 24497915466240.0, + "grad_norm": 1.880430722981425, + "language_loss": 0.69432622, + "learning_rate": 6.239431045888435e-07, + "loss": 0.77096915, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09088135, + "step": 12462, + "time_per_iteration": 2.5493383407592773 + }, + { + "auxiliary_loss_clip": 0.06405858, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06273175, + "balance_loss_mlp": 0.01255301, + "epoch": 0.7493160980009018, + "flos": 27752680736640.0, + "grad_norm": 1.8211376167609288, + "language_loss": 0.70671761, + "learning_rate": 6.236605046806267e-07, + "loss": 0.78343821, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10900879, + "step": 12463, + "time_per_iteration": 3.986877918243408 + }, + { + "auxiliary_loss_clip": 0.06407613, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01255664, + "epoch": 0.7493762212535698, + "flos": 30233918240640.0, + "grad_norm": 1.7635457747868553, + "language_loss": 0.77660054, + "learning_rate": 6.233779569633419e-07, + "loss": 0.85333592, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10247803, + "step": 12464, + "time_per_iteration": 2.613281726837158 + }, + { + "auxiliary_loss_clip": 0.06402797, + "auxiliary_loss_mlp": 0.01263814, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7494363445062378, + "flos": 21950906906880.0, + "grad_norm": 1.6126979618339465, + "language_loss": 0.78109074, + "learning_rate": 6.230954614477034e-07, + "loss": 0.85775691, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09521484, + "step": 12465, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.06420696, + "auxiliary_loss_mlp": 0.01267627, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256332, + "epoch": 0.7494964677589058, + "flos": 12494473395840.0, + "grad_norm": 2.5697202625678877, + "language_loss": 0.74354923, + "learning_rate": 6.22813018144422e-07, + "loss": 0.82043248, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11303711, + "step": 12466, + "time_per_iteration": 3.9045188426971436 + }, + { + "auxiliary_loss_clip": 0.06406893, + "auxiliary_loss_mlp": 0.01262068, + "balance_loss_clip": 0.06270187, + "balance_loss_mlp": 0.01252293, + "epoch": 0.7495565910115737, + "flos": 21659521933440.0, + "grad_norm": 1.9829684209764449, + "language_loss": 0.66688263, + "learning_rate": 6.22530627064209e-07, + "loss": 0.74357224, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09777832, + "step": 12467, + "time_per_iteration": 2.54917049407959 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01263538, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253501, + "epoch": 0.7496167142642417, + "flos": 15274013834880.0, + "grad_norm": 2.0991094746025416, + "language_loss": 0.76436639, + "learning_rate": 6.222482882177735e-07, + "loss": 0.84109402, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1003418, + "step": 12468, + "time_per_iteration": 2.4655251502990723 + }, + { + "auxiliary_loss_clip": 0.0640367, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06271279, + "balance_loss_mlp": 0.01256129, + "epoch": 0.7496768375169096, + "flos": 22061554623360.0, + "grad_norm": 1.9736124429451793, + "language_loss": 0.69775021, + "learning_rate": 6.219660016158201e-07, + "loss": 0.77445245, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10430908, + "step": 12469, + "time_per_iteration": 2.533859968185425 + }, + { + "auxiliary_loss_clip": 0.06409403, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06274202, + "balance_loss_mlp": 0.01254726, + "epoch": 0.7497369607695776, + "flos": 19062144270720.0, + "grad_norm": 2.2473454659812107, + "language_loss": 0.6920374, + "learning_rate": 6.216837672690543e-07, + "loss": 0.76877773, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09899902, + "step": 12470, + "time_per_iteration": 2.4770658016204834 + }, + { + "auxiliary_loss_clip": 0.06413378, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01256329, + "epoch": 0.7497970840222457, + "flos": 21624036929280.0, + "grad_norm": 1.7361312699239924, + "language_loss": 0.75303179, + "learning_rate": 6.214015851881793e-07, + "loss": 0.82984829, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11950684, + "step": 12471, + "time_per_iteration": 2.5342705249786377 + }, + { + "auxiliary_loss_clip": 0.06412168, + "auxiliary_loss_mlp": 0.01265091, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01255, + "epoch": 0.7498572072749136, + "flos": 13740710371200.0, + "grad_norm": 2.1773399303982663, + "language_loss": 0.77400845, + "learning_rate": 6.211194553838929e-07, + "loss": 0.85078096, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10089111, + "step": 12472, + "time_per_iteration": 3.870166540145874 + }, + { + "auxiliary_loss_clip": 0.06403755, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01255039, + "epoch": 0.7499173305275816, + "flos": 22973207045760.0, + "grad_norm": 1.4354078089227125, + "language_loss": 0.84353936, + "learning_rate": 6.208373778668951e-07, + "loss": 0.92022181, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09460449, + "step": 12473, + "time_per_iteration": 2.537057399749756 + }, + { + "auxiliary_loss_clip": 0.06410777, + "auxiliary_loss_mlp": 0.01268473, + "balance_loss_clip": 0.06273849, + "balance_loss_mlp": 0.01257261, + "epoch": 0.7499774537802495, + "flos": 22745916046080.0, + "grad_norm": 1.8524575994010102, + "language_loss": 0.73466665, + "learning_rate": 6.205553526478829e-07, + "loss": 0.81145918, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11212158, + "step": 12474, + "time_per_iteration": 2.4842028617858887 + }, + { + "auxiliary_loss_clip": 0.06415059, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06274258, + "balance_loss_mlp": 0.01255311, + "epoch": 0.7500375770329175, + "flos": 18302494354560.0, + "grad_norm": 1.6095037145271875, + "language_loss": 0.74770164, + "learning_rate": 6.202733797375492e-07, + "loss": 0.82452309, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11779785, + "step": 12475, + "time_per_iteration": 2.4979960918426514 + }, + { + "auxiliary_loss_clip": 0.06415677, + "auxiliary_loss_mlp": 0.01269527, + "balance_loss_clip": 0.06274221, + "balance_loss_mlp": 0.01257898, + "epoch": 0.7500977002855854, + "flos": 19175684952960.0, + "grad_norm": 2.1095772826483907, + "language_loss": 0.80763221, + "learning_rate": 6.199914591465878e-07, + "loss": 0.88448429, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11633301, + "step": 12476, + "time_per_iteration": 2.491819381713867 + }, + { + "auxiliary_loss_clip": 0.06407472, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01254999, + "epoch": 0.7501578235382534, + "flos": 22170441404160.0, + "grad_norm": 7.116833282628377, + "language_loss": 0.77544057, + "learning_rate": 6.19709590885688e-07, + "loss": 0.852162, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09674072, + "step": 12477, + "time_per_iteration": 2.5502593517303467 + }, + { + "auxiliary_loss_clip": 0.06310226, + "auxiliary_loss_mlp": 0.01250565, + "balance_loss_clip": 0.06254882, + "balance_loss_mlp": 0.01249338, + "epoch": 0.7502179467909214, + "flos": 64481035783680.0, + "grad_norm": 0.7848730842725032, + "language_loss": 0.54270738, + "learning_rate": 6.194277749655394e-07, + "loss": 0.61831528, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01226044, + "step": 12478, + "time_per_iteration": 3.0923471450805664 + }, + { + "auxiliary_loss_clip": 0.06402513, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06272154, + "balance_loss_mlp": 0.01255747, + "epoch": 0.7502780700435894, + "flos": 20483332571520.0, + "grad_norm": 1.5542360710976224, + "language_loss": 0.80265927, + "learning_rate": 6.191460113968272e-07, + "loss": 0.87934738, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.10559082, + "step": 12479, + "time_per_iteration": 2.503929615020752 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01265738, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7503381932962573, + "flos": 20450908241280.0, + "grad_norm": 4.66275961009968, + "language_loss": 0.62624717, + "learning_rate": 6.188643001902369e-07, + "loss": 0.70302922, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 12480, + "time_per_iteration": 2.488246202468872 + }, + { + "auxiliary_loss_clip": 0.06401666, + "auxiliary_loss_mlp": 0.0126556, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7503983165489253, + "flos": 22388382673920.0, + "grad_norm": 1.5669372883229389, + "language_loss": 0.784675, + "learning_rate": 6.185826413564512e-07, + "loss": 0.86134732, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09411621, + "step": 12481, + "time_per_iteration": 2.514516830444336 + }, + { + "auxiliary_loss_clip": 0.06406647, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06270355, + "balance_loss_mlp": 0.01260159, + "epoch": 0.7504584398015932, + "flos": 24906321066240.0, + "grad_norm": 1.6690563670496772, + "language_loss": 0.71560133, + "learning_rate": 6.183010349061501e-07, + "loss": 0.79238129, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.11193848, + "step": 12482, + "time_per_iteration": 2.570258140563965 + }, + { + "auxiliary_loss_clip": 0.06406072, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06272655, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7505185630542612, + "flos": 25892381514240.0, + "grad_norm": 1.622739148659245, + "language_loss": 0.70420146, + "learning_rate": 6.180194808500118e-07, + "loss": 0.78092062, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10784912, + "step": 12483, + "time_per_iteration": 2.545875072479248 + }, + { + "auxiliary_loss_clip": 0.06406315, + "auxiliary_loss_mlp": 0.01266459, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01257227, + "epoch": 0.7505786863069293, + "flos": 23149709671680.0, + "grad_norm": 1.6112204819340308, + "language_loss": 0.74173069, + "learning_rate": 6.177379791987131e-07, + "loss": 0.81845844, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09240723, + "step": 12484, + "time_per_iteration": 2.50899600982666 + }, + { + "auxiliary_loss_clip": 0.06404275, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06272139, + "balance_loss_mlp": 0.01256761, + "epoch": 0.7506388095595972, + "flos": 16989144658560.0, + "grad_norm": 1.988075921906434, + "language_loss": 0.84860504, + "learning_rate": 6.174565299629295e-07, + "loss": 0.92532003, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10473633, + "step": 12485, + "time_per_iteration": 2.5089685916900635 + }, + { + "auxiliary_loss_clip": 0.06403236, + "auxiliary_loss_mlp": 0.01262842, + "balance_loss_clip": 0.06270488, + "balance_loss_mlp": 0.01253121, + "epoch": 0.7506989328122652, + "flos": 22351346369280.0, + "grad_norm": 1.4931669119648077, + "language_loss": 0.78489572, + "learning_rate": 6.171751331533323e-07, + "loss": 0.86155653, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.097229, + "step": 12486, + "time_per_iteration": 2.5051820278167725 + }, + { + "auxiliary_loss_clip": 0.06408528, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253245, + "epoch": 0.7507590560649331, + "flos": 25783243171200.0, + "grad_norm": 1.7753955887486508, + "language_loss": 0.73021758, + "learning_rate": 6.168937887805932e-07, + "loss": 0.80693603, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10064697, + "step": 12487, + "time_per_iteration": 2.547999382019043 + }, + { + "auxiliary_loss_clip": 0.06404672, + "auxiliary_loss_mlp": 0.01263386, + "balance_loss_clip": 0.0626927, + "balance_loss_mlp": 0.01253528, + "epoch": 0.7508191793176011, + "flos": 24286221325440.0, + "grad_norm": 1.9310699455089921, + "language_loss": 0.67608893, + "learning_rate": 6.166124968553801e-07, + "loss": 0.75276947, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09857178, + "step": 12488, + "time_per_iteration": 2.5895445346832275 + }, + { + "auxiliary_loss_clip": 0.0640392, + "auxiliary_loss_mlp": 0.0126508, + "balance_loss_clip": 0.06270676, + "balance_loss_mlp": 0.01254822, + "epoch": 0.750879302570269, + "flos": 19905384234240.0, + "grad_norm": 1.5890652635946048, + "language_loss": 0.77430677, + "learning_rate": 6.163312573883592e-07, + "loss": 0.85099679, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10253906, + "step": 12489, + "time_per_iteration": 2.5337159633636475 + }, + { + "auxiliary_loss_clip": 0.0640057, + "auxiliary_loss_mlp": 0.01265302, + "balance_loss_clip": 0.06270728, + "balance_loss_mlp": 0.01255431, + "epoch": 0.750939425822937, + "flos": 29213420964480.0, + "grad_norm": 1.5668986388800445, + "language_loss": 0.75072443, + "learning_rate": 6.160500703901956e-07, + "loss": 0.8273831, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09875488, + "step": 12490, + "time_per_iteration": 2.5781826972961426 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01266052, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.0125592, + "epoch": 0.750999549075605, + "flos": 21148686316800.0, + "grad_norm": 1.487741862942094, + "language_loss": 0.7861315, + "learning_rate": 6.157689358715527e-07, + "loss": 0.86284935, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10144043, + "step": 12491, + "time_per_iteration": 2.5030393600463867 + }, + { + "auxiliary_loss_clip": 0.06398296, + "auxiliary_loss_mlp": 0.01269676, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.01260473, + "epoch": 0.751059672328273, + "flos": 23554090275840.0, + "grad_norm": 1.6435305052483133, + "language_loss": 0.76645952, + "learning_rate": 6.154878538430899e-07, + "loss": 0.84313929, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09210205, + "step": 12492, + "time_per_iteration": 2.5466179847717285 + }, + { + "auxiliary_loss_clip": 0.06403392, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01257446, + "epoch": 0.7511197955809409, + "flos": 18995786236800.0, + "grad_norm": 1.8268388211945472, + "language_loss": 0.71465898, + "learning_rate": 6.152068243154671e-07, + "loss": 0.79136372, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09637451, + "step": 12493, + "time_per_iteration": 3.923126697540283 + }, + { + "auxiliary_loss_clip": 0.06408728, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01258603, + "epoch": 0.7511799188336089, + "flos": 22052246820480.0, + "grad_norm": 1.6129417562793205, + "language_loss": 0.80984807, + "learning_rate": 6.149258472993395e-07, + "loss": 0.88662201, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10070801, + "step": 12494, + "time_per_iteration": 2.499166488647461 + }, + { + "auxiliary_loss_clip": 0.06403729, + "auxiliary_loss_mlp": 0.01266628, + "balance_loss_clip": 0.06270036, + "balance_loss_mlp": 0.01256418, + "epoch": 0.7512400420862768, + "flos": 16471894204800.0, + "grad_norm": 1.701536760083375, + "language_loss": 0.79124582, + "learning_rate": 6.146449228053634e-07, + "loss": 0.86794937, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12495, + "time_per_iteration": 2.482259511947632 + }, + { + "auxiliary_loss_clip": 0.06400186, + "auxiliary_loss_mlp": 0.01262526, + "balance_loss_clip": 0.06269289, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7513001653389448, + "flos": 20454472039680.0, + "grad_norm": 1.7104928099780732, + "language_loss": 0.71375751, + "learning_rate": 6.143640508441898e-07, + "loss": 0.79038465, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09417725, + "step": 12496, + "time_per_iteration": 2.513437032699585 + }, + { + "auxiliary_loss_clip": 0.06405301, + "auxiliary_loss_mlp": 0.01263444, + "balance_loss_clip": 0.06272015, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7513602885916129, + "flos": 23483497610880.0, + "grad_norm": 1.6654554654788911, + "language_loss": 0.78218853, + "learning_rate": 6.140832314264705e-07, + "loss": 0.85887605, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09533691, + "step": 12497, + "time_per_iteration": 2.513091564178467 + }, + { + "auxiliary_loss_clip": 0.06402559, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06268804, + "balance_loss_mlp": 0.01256867, + "epoch": 0.7514204118442808, + "flos": 26804495134080.0, + "grad_norm": 1.4375816508354362, + "language_loss": 0.77240133, + "learning_rate": 6.13802464562855e-07, + "loss": 0.8490963, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10070801, + "step": 12498, + "time_per_iteration": 2.5410008430480957 + }, + { + "auxiliary_loss_clip": 0.06400871, + "auxiliary_loss_mlp": 0.01263117, + "balance_loss_clip": 0.06272262, + "balance_loss_mlp": 0.01254462, + "epoch": 0.7514805350969488, + "flos": 19871869800960.0, + "grad_norm": 1.7337697309070021, + "language_loss": 0.74015534, + "learning_rate": 6.135217502639878e-07, + "loss": 0.81679523, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08654785, + "step": 12499, + "time_per_iteration": 2.557349443435669 + }, + { + "auxiliary_loss_clip": 0.06399096, + "auxiliary_loss_mlp": 0.01264017, + "balance_loss_clip": 0.06268655, + "balance_loss_mlp": 0.01254737, + "epoch": 0.7515406583496167, + "flos": 24578444839680.0, + "grad_norm": 2.167576832097364, + "language_loss": 0.79499745, + "learning_rate": 6.132410885405148e-07, + "loss": 0.87162852, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 12500, + "time_per_iteration": 2.5547473430633545 + }, + { + "auxiliary_loss_clip": 0.06415384, + "auxiliary_loss_mlp": 0.01265407, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7516007816022847, + "flos": 20126386177920.0, + "grad_norm": 1.9841359152283422, + "language_loss": 0.73215604, + "learning_rate": 6.129604794030794e-07, + "loss": 0.80896389, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11639404, + "step": 12501, + "time_per_iteration": 2.4737539291381836 + }, + { + "auxiliary_loss_clip": 0.06401603, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06269078, + "balance_loss_mlp": 0.01255764, + "epoch": 0.7516609048549526, + "flos": 22791379685760.0, + "grad_norm": 1.708165440784374, + "language_loss": 0.7856493, + "learning_rate": 6.126799228623207e-07, + "loss": 0.86231852, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12502, + "time_per_iteration": 4.065747499465942 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 0.06270734, + "balance_loss_mlp": 0.01251895, + "epoch": 0.7517210281076206, + "flos": 10638576512640.0, + "grad_norm": 2.198342230636315, + "language_loss": 0.70527124, + "learning_rate": 6.123994189288786e-07, + "loss": 0.78194559, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10406494, + "step": 12503, + "time_per_iteration": 2.4975264072418213 + }, + { + "auxiliary_loss_clip": 0.06308451, + "auxiliary_loss_mlp": 0.01250423, + "balance_loss_clip": 0.06253403, + "balance_loss_mlp": 0.01249304, + "epoch": 0.7517811513602886, + "flos": 66071542458240.0, + "grad_norm": 0.9653674550577583, + "language_loss": 0.63868368, + "learning_rate": 6.121189676133903e-07, + "loss": 0.71427244, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01122284, + "step": 12504, + "time_per_iteration": 3.0423572063446045 + }, + { + "auxiliary_loss_clip": 0.06398649, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06269499, + "balance_loss_mlp": 0.01258533, + "epoch": 0.7518412746129566, + "flos": 37277317071360.0, + "grad_norm": 1.461644685561848, + "language_loss": 0.68779212, + "learning_rate": 6.118385689264896e-07, + "loss": 0.7644546, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09069824, + "step": 12505, + "time_per_iteration": 4.1895623207092285 + }, + { + "auxiliary_loss_clip": 0.06309824, + "auxiliary_loss_mlp": 0.01250829, + "balance_loss_clip": 0.06254642, + "balance_loss_mlp": 0.01249779, + "epoch": 0.7519013978656245, + "flos": 60539001396480.0, + "grad_norm": 0.633292190388587, + "language_loss": 0.55014133, + "learning_rate": 6.11558222878809e-07, + "loss": 0.6257478, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.55126953, + "router_z_loss_mlp": 0.01050568, + "step": 12506, + "time_per_iteration": 3.249525785446167 + }, + { + "auxiliary_loss_clip": 0.06407043, + "auxiliary_loss_mlp": 0.01265184, + "balance_loss_clip": 0.0627189, + "balance_loss_mlp": 0.01254831, + "epoch": 0.7519615211182925, + "flos": 18812826846720.0, + "grad_norm": 1.7032377600653197, + "language_loss": 0.78890646, + "learning_rate": 6.112779294809796e-07, + "loss": 0.86562872, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10339355, + "step": 12507, + "time_per_iteration": 2.4874064922332764 + }, + { + "auxiliary_loss_clip": 0.06398805, + "auxiliary_loss_mlp": 0.01267855, + "balance_loss_clip": 0.06269046, + "balance_loss_mlp": 0.0125808, + "epoch": 0.7520216443709604, + "flos": 14580596171520.0, + "grad_norm": 1.7335317284626974, + "language_loss": 0.71662533, + "learning_rate": 6.10997688743631e-07, + "loss": 0.79329199, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09777832, + "step": 12508, + "time_per_iteration": 2.5105843544006348 + }, + { + "auxiliary_loss_clip": 0.06401521, + "auxiliary_loss_mlp": 0.0126325, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254262, + "epoch": 0.7520817676236284, + "flos": 17062420654080.0, + "grad_norm": 1.5570539032807615, + "language_loss": 0.72277093, + "learning_rate": 6.107175006773885e-07, + "loss": 0.79941863, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08984375, + "step": 12509, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06271298, + "balance_loss_mlp": 0.01257496, + "epoch": 0.7521418908762965, + "flos": 25673517849600.0, + "grad_norm": 1.5708944313915068, + "language_loss": 0.61849803, + "learning_rate": 6.104373652928785e-07, + "loss": 0.69528419, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10839844, + "step": 12510, + "time_per_iteration": 2.5873842239379883 + }, + { + "auxiliary_loss_clip": 0.0640108, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06272186, + "balance_loss_mlp": 0.01255613, + "epoch": 0.7522020141289644, + "flos": 20893079836800.0, + "grad_norm": 2.376424166314484, + "language_loss": 0.81816781, + "learning_rate": 6.10157282600722e-07, + "loss": 0.89483154, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09674072, + "step": 12511, + "time_per_iteration": 3.9771971702575684 + }, + { + "auxiliary_loss_clip": 0.06408679, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06270606, + "balance_loss_mlp": 0.01258571, + "epoch": 0.7522621373816324, + "flos": 12645134236800.0, + "grad_norm": 1.635821418460478, + "language_loss": 0.76383078, + "learning_rate": 6.098772526115412e-07, + "loss": 0.84061033, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1071167, + "step": 12512, + "time_per_iteration": 2.497439384460449 + }, + { + "auxiliary_loss_clip": 0.06396883, + "auxiliary_loss_mlp": 0.01265576, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01256557, + "epoch": 0.7523222606343003, + "flos": 25632624257280.0, + "grad_norm": 1.702992973321348, + "language_loss": 0.82472456, + "learning_rate": 6.095972753359537e-07, + "loss": 0.90134907, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09002686, + "step": 12513, + "time_per_iteration": 2.581941604614258 + }, + { + "auxiliary_loss_clip": 0.06405152, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06268971, + "balance_loss_mlp": 0.01252747, + "epoch": 0.7523823838869683, + "flos": 20455142872320.0, + "grad_norm": 1.6682256759648477, + "language_loss": 0.7510156, + "learning_rate": 6.093173507845771e-07, + "loss": 0.82769549, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10089111, + "step": 12514, + "time_per_iteration": 2.4942328929901123 + }, + { + "auxiliary_loss_clip": 0.06397319, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06269006, + "balance_loss_mlp": 0.01256955, + "epoch": 0.7524425071396362, + "flos": 14725890351360.0, + "grad_norm": 1.7883586477571864, + "language_loss": 0.689107, + "learning_rate": 6.090374789680271e-07, + "loss": 0.76573658, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08679199, + "step": 12515, + "time_per_iteration": 2.494940996170044 + }, + { + "auxiliary_loss_clip": 0.06405492, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01257225, + "epoch": 0.7525026303923043, + "flos": 30600004728960.0, + "grad_norm": 2.8396136921883905, + "language_loss": 0.70415783, + "learning_rate": 6.087576598969137e-07, + "loss": 0.78087991, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09490967, + "step": 12516, + "time_per_iteration": 2.584015130996704 + }, + { + "auxiliary_loss_clip": 0.06399474, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01258325, + "epoch": 0.7525627536449722, + "flos": 24798901731840.0, + "grad_norm": 1.5910108360276343, + "language_loss": 0.89611065, + "learning_rate": 6.084778935818495e-07, + "loss": 0.97278303, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09436035, + "step": 12517, + "time_per_iteration": 2.5272841453552246 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01254359, + "epoch": 0.7526228768976402, + "flos": 20786499043200.0, + "grad_norm": 1.4709684896857864, + "language_loss": 0.74636328, + "learning_rate": 6.081981800334437e-07, + "loss": 0.82311571, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10150146, + "step": 12518, + "time_per_iteration": 2.507249116897583 + }, + { + "auxiliary_loss_clip": 0.06313983, + "auxiliary_loss_mlp": 0.01251233, + "balance_loss_clip": 0.06258783, + "balance_loss_mlp": 0.01250141, + "epoch": 0.7526830001503081, + "flos": 66578017662720.0, + "grad_norm": 0.6920212642256274, + "language_loss": 0.55552846, + "learning_rate": 6.079185192623017e-07, + "loss": 0.63118064, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094055, + "step": 12519, + "time_per_iteration": 3.1638381481170654 + }, + { + "auxiliary_loss_clip": 0.06402172, + "auxiliary_loss_mlp": 0.0126505, + "balance_loss_clip": 0.06268954, + "balance_loss_mlp": 0.01255423, + "epoch": 0.7527431234029761, + "flos": 23484755422080.0, + "grad_norm": 1.392327642078427, + "language_loss": 0.77952313, + "learning_rate": 6.07638911279029e-07, + "loss": 0.85619533, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09625244, + "step": 12520, + "time_per_iteration": 2.5008206367492676 + }, + { + "auxiliary_loss_clip": 0.06405456, + "auxiliary_loss_mlp": 0.01265903, + "balance_loss_clip": 0.06273633, + "balance_loss_mlp": 0.01256158, + "epoch": 0.752803246655644, + "flos": 22055265567360.0, + "grad_norm": 8.971083878889642, + "language_loss": 0.74495649, + "learning_rate": 6.07359356094229e-07, + "loss": 0.82167011, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09747314, + "step": 12521, + "time_per_iteration": 2.5451552867889404 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06272561, + "balance_loss_mlp": 0.01257059, + "epoch": 0.752863369908312, + "flos": 30161606567040.0, + "grad_norm": 1.8189760564155686, + "language_loss": 0.67176616, + "learning_rate": 6.070798537185016e-07, + "loss": 0.74856877, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10925293, + "step": 12522, + "time_per_iteration": 2.556718349456787 + }, + { + "auxiliary_loss_clip": 0.06409014, + "auxiliary_loss_mlp": 0.01271964, + "balance_loss_clip": 0.06271487, + "balance_loss_mlp": 0.01261825, + "epoch": 0.7529234931609801, + "flos": 24573874792320.0, + "grad_norm": 1.5612093736475694, + "language_loss": 0.78733182, + "learning_rate": 6.068004041624453e-07, + "loss": 0.86414158, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10137939, + "step": 12523, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.0639995, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06269381, + "balance_loss_mlp": 0.0125683, + "epoch": 0.752983616413648, + "flos": 23119088204160.0, + "grad_norm": 1.791528721862032, + "language_loss": 0.80482811, + "learning_rate": 6.065210074366571e-07, + "loss": 0.88149387, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09796143, + "step": 12524, + "time_per_iteration": 2.500800132751465 + }, + { + "auxiliary_loss_clip": 0.06402539, + "auxiliary_loss_mlp": 0.01269955, + "balance_loss_clip": 0.06271717, + "balance_loss_mlp": 0.01260996, + "epoch": 0.753043739666316, + "flos": 24323928462720.0, + "grad_norm": 1.510186119620748, + "language_loss": 0.74149638, + "learning_rate": 6.062416635517326e-07, + "loss": 0.81822133, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08953857, + "step": 12525, + "time_per_iteration": 2.5363988876342773 + }, + { + "auxiliary_loss_clip": 0.0639966, + "auxiliary_loss_mlp": 0.01264528, + "balance_loss_clip": 0.06270238, + "balance_loss_mlp": 0.01254777, + "epoch": 0.7531038629189839, + "flos": 24250149342720.0, + "grad_norm": 1.8502310757699438, + "language_loss": 0.725272, + "learning_rate": 6.059623725182641e-07, + "loss": 0.80191386, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09753418, + "step": 12526, + "time_per_iteration": 2.5115420818328857 + }, + { + "auxiliary_loss_clip": 0.06402011, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06270412, + "balance_loss_mlp": 0.01254167, + "epoch": 0.7531639861716519, + "flos": 30196378811520.0, + "grad_norm": 1.617761308290089, + "language_loss": 0.72719419, + "learning_rate": 6.056831343468414e-07, + "loss": 0.80385113, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09509277, + "step": 12527, + "time_per_iteration": 2.620079517364502 + }, + { + "auxiliary_loss_clip": 0.06399914, + "auxiliary_loss_mlp": 0.01265035, + "balance_loss_clip": 0.06268723, + "balance_loss_mlp": 0.01255558, + "epoch": 0.7532241094243198, + "flos": 18229050650880.0, + "grad_norm": 1.8406342788129475, + "language_loss": 0.81231797, + "learning_rate": 6.054039490480539e-07, + "loss": 0.88896745, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.0947876, + "step": 12528, + "time_per_iteration": 2.4696736335754395 + }, + { + "auxiliary_loss_clip": 0.06403716, + "auxiliary_loss_mlp": 0.01265532, + "balance_loss_clip": 0.06269462, + "balance_loss_mlp": 0.0125525, + "epoch": 0.7532842326769879, + "flos": 20886413437440.0, + "grad_norm": 2.282089070313471, + "language_loss": 0.85098541, + "learning_rate": 6.051248166324892e-07, + "loss": 0.92767787, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1027832, + "step": 12529, + "time_per_iteration": 2.5071592330932617 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01254818, + "epoch": 0.7533443559296558, + "flos": 18084762720000.0, + "grad_norm": 1.902579288696582, + "language_loss": 0.74726146, + "learning_rate": 6.048457371107303e-07, + "loss": 0.82401049, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.09814453, + "step": 12530, + "time_per_iteration": 2.502178192138672 + }, + { + "auxiliary_loss_clip": 0.06308636, + "auxiliary_loss_mlp": 0.01252721, + "balance_loss_clip": 0.06253405, + "balance_loss_mlp": 0.01251678, + "epoch": 0.7534044791823238, + "flos": 50271668398080.0, + "grad_norm": 0.8173638776820421, + "language_loss": 0.63636577, + "learning_rate": 6.045667104933612e-07, + "loss": 0.71197939, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.55273438, + "router_z_loss_mlp": 0.01044464, + "step": 12531, + "time_per_iteration": 2.9869658946990967 + }, + { + "auxiliary_loss_clip": 0.06406563, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.01255437, + "epoch": 0.7534646024349917, + "flos": 20856588583680.0, + "grad_norm": 2.370705934223187, + "language_loss": 0.70650482, + "learning_rate": 6.042877367909633e-07, + "loss": 0.78322434, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 12532, + "time_per_iteration": 3.92488169670105 + }, + { + "auxiliary_loss_clip": 0.06397863, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01257814, + "epoch": 0.7535247256876597, + "flos": 23077775341440.0, + "grad_norm": 1.5088215588647627, + "language_loss": 0.77771306, + "learning_rate": 6.040088160141132e-07, + "loss": 0.85436428, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09442139, + "step": 12533, + "time_per_iteration": 2.489647626876831 + }, + { + "auxiliary_loss_clip": 0.06306736, + "auxiliary_loss_mlp": 0.01251137, + "balance_loss_clip": 0.06251442, + "balance_loss_mlp": 0.01250062, + "epoch": 0.7535848489403276, + "flos": 58643888002560.0, + "grad_norm": 0.7841580581676975, + "language_loss": 0.57404244, + "learning_rate": 6.037299481733886e-07, + "loss": 0.64962119, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 12534, + "time_per_iteration": 3.1910510063171387 + }, + { + "auxiliary_loss_clip": 0.06403376, + "auxiliary_loss_mlp": 0.01267552, + "balance_loss_clip": 0.06270553, + "balance_loss_mlp": 0.01257568, + "epoch": 0.7536449721929956, + "flos": 26585044490880.0, + "grad_norm": 1.3288810458432065, + "language_loss": 0.71601486, + "learning_rate": 6.03451133279365e-07, + "loss": 0.79272413, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09991455, + "step": 12535, + "time_per_iteration": 2.5521280765533447 + }, + { + "auxiliary_loss_clip": 0.06405595, + "auxiliary_loss_mlp": 0.0126787, + "balance_loss_clip": 0.06269699, + "balance_loss_mlp": 0.01258024, + "epoch": 0.7537050954456637, + "flos": 25742559214080.0, + "grad_norm": 1.4204428074088968, + "language_loss": 0.80683547, + "learning_rate": 6.031723713426135e-07, + "loss": 0.88357008, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09838867, + "step": 12536, + "time_per_iteration": 2.612800359725952 + }, + { + "auxiliary_loss_clip": 0.06397747, + "auxiliary_loss_mlp": 0.01263423, + "balance_loss_clip": 0.06268154, + "balance_loss_mlp": 0.01254006, + "epoch": 0.7537652186983316, + "flos": 30231863815680.0, + "grad_norm": 2.5926766320548333, + "language_loss": 0.7478568, + "learning_rate": 6.028936623737067e-07, + "loss": 0.82446849, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09423828, + "step": 12537, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06273423, + "balance_loss_mlp": 0.01258771, + "epoch": 0.7538253419509996, + "flos": 12646224339840.0, + "grad_norm": 1.6302297616085528, + "language_loss": 0.74427301, + "learning_rate": 6.026150063832111e-07, + "loss": 0.82103658, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09698486, + "step": 12538, + "time_per_iteration": 2.532360076904297 + }, + { + "auxiliary_loss_clip": 0.06404191, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01256676, + "epoch": 0.7538854652036675, + "flos": 23192783470080.0, + "grad_norm": 1.9550849129782661, + "language_loss": 0.67649639, + "learning_rate": 6.023364033816956e-07, + "loss": 0.75320947, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10455322, + "step": 12539, + "time_per_iteration": 2.5289549827575684 + }, + { + "auxiliary_loss_clip": 0.06399977, + "auxiliary_loss_mlp": 0.01264844, + "balance_loss_clip": 0.06269806, + "balance_loss_mlp": 0.01255296, + "epoch": 0.7539455884563355, + "flos": 23193076959360.0, + "grad_norm": 1.5765955359694397, + "language_loss": 0.74866569, + "learning_rate": 6.020578533797229e-07, + "loss": 0.82531393, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09552002, + "step": 12540, + "time_per_iteration": 2.519505023956299 + }, + { + "auxiliary_loss_clip": 0.06404985, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06269932, + "balance_loss_mlp": 0.01254816, + "epoch": 0.7540057117090034, + "flos": 13184998093440.0, + "grad_norm": 1.8443764292717588, + "language_loss": 0.73148596, + "learning_rate": 6.017793563878566e-07, + "loss": 0.80818391, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 12541, + "time_per_iteration": 2.4335999488830566 + }, + { + "auxiliary_loss_clip": 0.06404177, + "auxiliary_loss_mlp": 0.0126394, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7540658349616715, + "flos": 45488561783040.0, + "grad_norm": 1.5152984414319595, + "language_loss": 0.72388256, + "learning_rate": 6.015009124166576e-07, + "loss": 0.80056369, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09680176, + "step": 12542, + "time_per_iteration": 4.1390299797058105 + }, + { + "auxiliary_loss_clip": 0.06397901, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06268644, + "balance_loss_mlp": 0.01254344, + "epoch": 0.7541259582143394, + "flos": 19935754139520.0, + "grad_norm": 2.884156487358873, + "language_loss": 0.84689027, + "learning_rate": 6.012225214766844e-07, + "loss": 0.92351043, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09771729, + "step": 12543, + "time_per_iteration": 2.503478765487671 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.01253886, + "epoch": 0.7541860814670074, + "flos": 27205521575040.0, + "grad_norm": 2.0819371266250095, + "language_loss": 0.73893505, + "learning_rate": 6.009441835784927e-07, + "loss": 0.81558251, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09643555, + "step": 12544, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06402065, + "auxiliary_loss_mlp": 0.01263786, + "balance_loss_clip": 0.06270371, + "balance_loss_mlp": 0.01254505, + "epoch": 0.7542462047196753, + "flos": 21330471749760.0, + "grad_norm": 1.7394409636932977, + "language_loss": 0.68186235, + "learning_rate": 6.006658987326383e-07, + "loss": 0.7585209, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09277344, + "step": 12545, + "time_per_iteration": 3.9819624423980713 + }, + { + "auxiliary_loss_clip": 0.06407365, + "auxiliary_loss_mlp": 0.01263612, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01254326, + "epoch": 0.7543063279723433, + "flos": 11944630903680.0, + "grad_norm": 1.6656335194491443, + "language_loss": 0.69190776, + "learning_rate": 6.003876669496728e-07, + "loss": 0.76861751, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09283447, + "step": 12546, + "time_per_iteration": 2.5855300426483154 + }, + { + "auxiliary_loss_clip": 0.06408285, + "auxiliary_loss_mlp": 0.01269444, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01258423, + "epoch": 0.7543664512250112, + "flos": 22826529273600.0, + "grad_norm": 2.2583251382821268, + "language_loss": 0.73943269, + "learning_rate": 6.00109488240147e-07, + "loss": 0.81620997, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11022949, + "step": 12547, + "time_per_iteration": 2.5086138248443604 + }, + { + "auxiliary_loss_clip": 0.0640479, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01253943, + "epoch": 0.7544265744776792, + "flos": 20930283849600.0, + "grad_norm": 1.77678899313766, + "language_loss": 0.68066597, + "learning_rate": 5.998313626146099e-07, + "loss": 0.75735652, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10333252, + "step": 12548, + "time_per_iteration": 2.534188747406006 + }, + { + "auxiliary_loss_clip": 0.0640662, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271043, + "balance_loss_mlp": 0.01257811, + "epoch": 0.7544866977303473, + "flos": 15200947474560.0, + "grad_norm": 1.8925592973514778, + "language_loss": 0.87693512, + "learning_rate": 5.995532900836088e-07, + "loss": 0.95367694, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09747314, + "step": 12549, + "time_per_iteration": 2.508145332336426 + }, + { + "auxiliary_loss_clip": 0.06395473, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06269422, + "balance_loss_mlp": 0.01254213, + "epoch": 0.7545468209830152, + "flos": 27090094176000.0, + "grad_norm": 1.707615461244764, + "language_loss": 0.77432424, + "learning_rate": 5.992752706576865e-07, + "loss": 0.85091901, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.09790039, + "step": 12550, + "time_per_iteration": 3.9424808025360107 + }, + { + "auxiliary_loss_clip": 0.06406951, + "auxiliary_loss_mlp": 0.01264837, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01254967, + "epoch": 0.7546069442356832, + "flos": 26879238576000.0, + "grad_norm": 1.4048272187532633, + "language_loss": 0.6982311, + "learning_rate": 5.98997304347386e-07, + "loss": 0.77494895, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09869385, + "step": 12551, + "time_per_iteration": 2.577078342437744 + }, + { + "auxiliary_loss_clip": 0.06402165, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.0125766, + "epoch": 0.7546670674883511, + "flos": 15748735541760.0, + "grad_norm": 1.8643367564290814, + "language_loss": 0.86457175, + "learning_rate": 5.987193911632487e-07, + "loss": 0.94126844, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09832764, + "step": 12552, + "time_per_iteration": 2.5127792358398438 + }, + { + "auxiliary_loss_clip": 0.06407504, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.0627365, + "balance_loss_mlp": 0.01256393, + "epoch": 0.7547271907410191, + "flos": 23484545786880.0, + "grad_norm": 1.6196877851330536, + "language_loss": 0.78280461, + "learning_rate": 5.98441531115812e-07, + "loss": 0.85953569, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09210205, + "step": 12553, + "time_per_iteration": 2.5273962020874023 + }, + { + "auxiliary_loss_clip": 0.06404902, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01254027, + "epoch": 0.754787313993687, + "flos": 31730898159360.0, + "grad_norm": 2.42415612197757, + "language_loss": 0.63542819, + "learning_rate": 5.981637242156135e-07, + "loss": 0.71211898, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.1015625, + "step": 12554, + "time_per_iteration": 2.5882747173309326 + }, + { + "auxiliary_loss_clip": 0.06402658, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271334, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7548474372463551, + "flos": 27570392179200.0, + "grad_norm": 1.504037054855903, + "language_loss": 0.73400116, + "learning_rate": 5.978859704731864e-07, + "loss": 0.81066149, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09393311, + "step": 12555, + "time_per_iteration": 2.539822578430176 + }, + { + "auxiliary_loss_clip": 0.0640943, + "auxiliary_loss_mlp": 0.01263982, + "balance_loss_clip": 0.06275169, + "balance_loss_mlp": 0.01253599, + "epoch": 0.754907560499023, + "flos": 19324752566400.0, + "grad_norm": 1.737792546565587, + "language_loss": 0.78918052, + "learning_rate": 5.976082698990645e-07, + "loss": 0.86591458, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 12556, + "time_per_iteration": 2.520672082901001 + }, + { + "auxiliary_loss_clip": 0.06309493, + "auxiliary_loss_mlp": 0.01252888, + "balance_loss_clip": 0.06254127, + "balance_loss_mlp": 0.01251748, + "epoch": 0.754967683751691, + "flos": 69765795993600.0, + "grad_norm": 0.6939528334291757, + "language_loss": 0.50454944, + "learning_rate": 5.973306225037769e-07, + "loss": 0.58017325, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01139832, + "step": 12557, + "time_per_iteration": 3.1293344497680664 + }, + { + "auxiliary_loss_clip": 0.06408815, + "auxiliary_loss_mlp": 0.01264037, + "balance_loss_clip": 0.06273429, + "balance_loss_mlp": 0.01253857, + "epoch": 0.7550278070043589, + "flos": 24428161342080.0, + "grad_norm": 1.622493392306736, + "language_loss": 0.71709013, + "learning_rate": 5.970530282978525e-07, + "loss": 0.79381871, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10186768, + "step": 12558, + "time_per_iteration": 2.5321953296661377 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.0626944, + "balance_loss_mlp": 0.01257726, + "epoch": 0.7550879302570269, + "flos": 32642802144000.0, + "grad_norm": 1.8637892647127214, + "language_loss": 0.80580068, + "learning_rate": 5.967754872918187e-07, + "loss": 0.88249207, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09063721, + "step": 12559, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.06405831, + "auxiliary_loss_mlp": 0.01265308, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01255276, + "epoch": 0.7551480535096948, + "flos": 21801461950080.0, + "grad_norm": 1.6337605293226678, + "language_loss": 0.78857327, + "learning_rate": 5.96497999496199e-07, + "loss": 0.86528468, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12560, + "time_per_iteration": 2.5266849994659424 + }, + { + "auxiliary_loss_clip": 0.06401823, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7552081767623628, + "flos": 18521022602880.0, + "grad_norm": 1.579385743882106, + "language_loss": 0.70900261, + "learning_rate": 5.96220564921515e-07, + "loss": 0.78568202, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09362793, + "step": 12561, + "time_per_iteration": 2.4935779571533203 + }, + { + "auxiliary_loss_clip": 0.06401284, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01256594, + "epoch": 0.7552683000150308, + "flos": 27641949166080.0, + "grad_norm": 1.5637953071800728, + "language_loss": 0.7579698, + "learning_rate": 5.959431835782889e-07, + "loss": 0.83464587, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09735107, + "step": 12562, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06403111, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7553284232676988, + "flos": 20309135932800.0, + "grad_norm": 1.8403167486550738, + "language_loss": 0.75524759, + "learning_rate": 5.956658554770371e-07, + "loss": 0.83192855, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09838867, + "step": 12563, + "time_per_iteration": 2.513921022415161 + }, + { + "auxiliary_loss_clip": 0.06417328, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01253454, + "epoch": 0.7553885465203668, + "flos": 33263866206720.0, + "grad_norm": 2.816655574793258, + "language_loss": 0.67061448, + "learning_rate": 5.953885806282768e-07, + "loss": 0.7474376, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11529541, + "step": 12564, + "time_per_iteration": 2.5836448669433594 + }, + { + "auxiliary_loss_clip": 0.06408054, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06272587, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7554486697730347, + "flos": 21622653336960.0, + "grad_norm": 1.6673790511457676, + "language_loss": 0.68740308, + "learning_rate": 5.951113590425228e-07, + "loss": 0.76413709, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10412598, + "step": 12565, + "time_per_iteration": 2.547016143798828 + }, + { + "auxiliary_loss_clip": 0.06408931, + "auxiliary_loss_mlp": 0.01266223, + "balance_loss_clip": 0.06269513, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7555087930257027, + "flos": 27639810887040.0, + "grad_norm": 1.5709631477548602, + "language_loss": 0.74854088, + "learning_rate": 5.94834190730287e-07, + "loss": 0.82529235, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10357666, + "step": 12566, + "time_per_iteration": 2.5360589027404785 + }, + { + "auxiliary_loss_clip": 0.06412722, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01257399, + "epoch": 0.7555689162783706, + "flos": 23628162885120.0, + "grad_norm": 2.012452039611991, + "language_loss": 0.74581742, + "learning_rate": 5.945570757020789e-07, + "loss": 0.82262623, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10766602, + "step": 12567, + "time_per_iteration": 2.5815160274505615 + }, + { + "auxiliary_loss_clip": 0.06405583, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01254155, + "epoch": 0.7556290395310387, + "flos": 24869955594240.0, + "grad_norm": 2.2187055340404216, + "language_loss": 0.62846589, + "learning_rate": 5.942800139684073e-07, + "loss": 0.70515835, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09515381, + "step": 12568, + "time_per_iteration": 2.5301473140716553 + }, + { + "auxiliary_loss_clip": 0.06402, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01255471, + "epoch": 0.7556891627837066, + "flos": 43553770680960.0, + "grad_norm": 1.9192871198198145, + "language_loss": 0.66908652, + "learning_rate": 5.940030055397789e-07, + "loss": 0.7457543, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12569, + "time_per_iteration": 2.707559585571289 + }, + { + "auxiliary_loss_clip": 0.06408378, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7557492860363746, + "flos": 26658110851200.0, + "grad_norm": 2.041017717148161, + "language_loss": 0.67703956, + "learning_rate": 5.93726050426697e-07, + "loss": 0.75377285, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 12570, + "time_per_iteration": 2.5359280109405518 + }, + { + "auxiliary_loss_clip": 0.06407271, + "auxiliary_loss_mlp": 0.0126553, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01255868, + "epoch": 0.7558094092890425, + "flos": 55194857769600.0, + "grad_norm": 1.6855740351628876, + "language_loss": 0.71908271, + "learning_rate": 5.934491486396647e-07, + "loss": 0.7958107, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09667969, + "step": 12571, + "time_per_iteration": 2.8340237140655518 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01265226, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7558695325417105, + "flos": 23995171768320.0, + "grad_norm": 1.5360803868989372, + "language_loss": 0.74071586, + "learning_rate": 5.931723001891811e-07, + "loss": 0.81745565, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09985352, + "step": 12572, + "time_per_iteration": 4.078891754150391 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01254981, + "epoch": 0.7559296557943784, + "flos": 14616542373120.0, + "grad_norm": 2.087893523265595, + "language_loss": 0.77022463, + "learning_rate": 5.928955050857456e-07, + "loss": 0.84695649, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.097229, + "step": 12573, + "time_per_iteration": 2.4667983055114746 + }, + { + "auxiliary_loss_clip": 0.06406313, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01254032, + "epoch": 0.7559897790470465, + "flos": 18556214117760.0, + "grad_norm": 1.6481386316669568, + "language_loss": 0.69339514, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7701081, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10955811, + "step": 12574, + "time_per_iteration": 2.521108627319336 + }, + { + "auxiliary_loss_clip": 0.06401183, + "auxiliary_loss_mlp": 0.0126439, + "balance_loss_clip": 0.0626963, + "balance_loss_mlp": 0.01254532, + "epoch": 0.7560499022997144, + "flos": 17973695733120.0, + "grad_norm": 2.167691196758321, + "language_loss": 0.71799374, + "learning_rate": 5.923420749619974e-07, + "loss": 0.79464948, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09869385, + "step": 12575, + "time_per_iteration": 2.4676809310913086 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01261787, + "balance_loss_clip": 0.0626974, + "balance_loss_mlp": 0.0125222, + "epoch": 0.7561100255523824, + "flos": 15742530339840.0, + "grad_norm": 1.985003709379718, + "language_loss": 0.7146281, + "learning_rate": 5.92065439962673e-07, + "loss": 0.79127514, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09564209, + "step": 12576, + "time_per_iteration": 2.525620937347412 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01265228, + "balance_loss_clip": 0.06271128, + "balance_loss_mlp": 0.0125497, + "epoch": 0.7561701488050504, + "flos": 15893568524160.0, + "grad_norm": 1.7792307856828309, + "language_loss": 0.67103839, + "learning_rate": 5.917888583523669e-07, + "loss": 0.74771613, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10247803, + "step": 12577, + "time_per_iteration": 2.468843936920166 + }, + { + "auxiliary_loss_clip": 0.06400042, + "auxiliary_loss_mlp": 0.01263628, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253978, + "epoch": 0.7562302720577183, + "flos": 20345333696640.0, + "grad_norm": 1.5059365090765435, + "language_loss": 0.78157711, + "learning_rate": 5.915123301415685e-07, + "loss": 0.85821384, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09649658, + "step": 12578, + "time_per_iteration": 2.530263900756836 + }, + { + "auxiliary_loss_clip": 0.0640607, + "auxiliary_loss_mlp": 0.01262105, + "balance_loss_clip": 0.06271346, + "balance_loss_mlp": 0.01251871, + "epoch": 0.7562903953103863, + "flos": 20818252540800.0, + "grad_norm": 1.5853993549027412, + "language_loss": 0.76139581, + "learning_rate": 5.912358553407641e-07, + "loss": 0.83807755, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10229492, + "step": 12579, + "time_per_iteration": 2.507765054702759 + }, + { + "auxiliary_loss_clip": 0.06411377, + "auxiliary_loss_mlp": 0.01264596, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01253599, + "epoch": 0.7563505185630542, + "flos": 37606073765760.0, + "grad_norm": 1.7167109835920158, + "language_loss": 0.62744486, + "learning_rate": 5.90959433960437e-07, + "loss": 0.70420462, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 12580, + "time_per_iteration": 2.6855556964874268 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06272098, + "balance_loss_mlp": 0.01256355, + "epoch": 0.7564106418157223, + "flos": 20237369310720.0, + "grad_norm": 3.698052227516868, + "language_loss": 0.75504309, + "learning_rate": 5.906830660110691e-07, + "loss": 0.83175057, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10241699, + "step": 12581, + "time_per_iteration": 3.9208571910858154 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01264654, + "balance_loss_clip": 0.06274357, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7564707650683902, + "flos": 24761949281280.0, + "grad_norm": 1.712129660168012, + "language_loss": 0.63223112, + "learning_rate": 5.904067515031412e-07, + "loss": 0.70898986, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09967041, + "step": 12582, + "time_per_iteration": 2.5469281673431396 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01252832, + "balance_loss_clip": 0.06255485, + "balance_loss_mlp": 0.01251842, + "epoch": 0.7565308883210582, + "flos": 48544965711360.0, + "grad_norm": 0.9271563619933442, + "language_loss": 0.60731697, + "learning_rate": 5.901304904471307e-07, + "loss": 0.68295169, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00989532, + "step": 12583, + "time_per_iteration": 2.8734805583953857 + }, + { + "auxiliary_loss_clip": 0.06408859, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255792, + "epoch": 0.7565910115737261, + "flos": 12500007765120.0, + "grad_norm": 1.9446553716026287, + "language_loss": 0.7914691, + "learning_rate": 5.898542828535125e-07, + "loss": 0.8682155, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09985352, + "step": 12584, + "time_per_iteration": 2.5946009159088135 + }, + { + "auxiliary_loss_clip": 0.06402295, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06272683, + "balance_loss_mlp": 0.01254559, + "epoch": 0.7566511348263941, + "flos": 21178427316480.0, + "grad_norm": 5.075260482718231, + "language_loss": 0.7806747, + "learning_rate": 5.895781287327612e-07, + "loss": 0.85734189, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09857178, + "step": 12585, + "time_per_iteration": 4.006917953491211 + }, + { + "auxiliary_loss_clip": 0.06406915, + "auxiliary_loss_mlp": 0.01263646, + "balance_loss_clip": 0.06271342, + "balance_loss_mlp": 0.01253609, + "epoch": 0.756711258079062, + "flos": 21760023306240.0, + "grad_norm": 1.5685604080996611, + "language_loss": 0.83183873, + "learning_rate": 5.893020280953493e-07, + "loss": 0.9085443, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1003418, + "step": 12586, + "time_per_iteration": 2.4981296062469482 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255487, + "epoch": 0.75677138133173, + "flos": 22389514704000.0, + "grad_norm": 2.1588778105399116, + "language_loss": 0.83529806, + "learning_rate": 5.890259809517459e-07, + "loss": 0.91204941, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10223389, + "step": 12587, + "time_per_iteration": 2.5264017581939697 + }, + { + "auxiliary_loss_clip": 0.06405166, + "auxiliary_loss_mlp": 0.01262614, + "balance_loss_clip": 0.06272217, + "balance_loss_mlp": 0.01252356, + "epoch": 0.756831504584398, + "flos": 22715252651520.0, + "grad_norm": 1.5206694910339098, + "language_loss": 0.71336639, + "learning_rate": 5.88749987312418e-07, + "loss": 0.79004425, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1026001, + "step": 12588, + "time_per_iteration": 2.522880792617798 + }, + { + "auxiliary_loss_clip": 0.06410505, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01253777, + "epoch": 0.756891627837066, + "flos": 24105358287360.0, + "grad_norm": 1.8052754527396453, + "language_loss": 0.69118118, + "learning_rate": 5.884740471878327e-07, + "loss": 0.76792806, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 12589, + "time_per_iteration": 2.543221950531006 + }, + { + "auxiliary_loss_clip": 0.06404439, + "auxiliary_loss_mlp": 0.01269435, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01259499, + "epoch": 0.756951751089734, + "flos": 19754010633600.0, + "grad_norm": 1.742132882513342, + "language_loss": 0.92203468, + "learning_rate": 5.881981605884522e-07, + "loss": 0.99877346, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09942627, + "step": 12590, + "time_per_iteration": 3.913285732269287 + }, + { + "auxiliary_loss_clip": 0.06402917, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01253092, + "epoch": 0.7570118743424019, + "flos": 35087883811200.0, + "grad_norm": 1.7860803954634257, + "language_loss": 0.65924931, + "learning_rate": 5.879223275247391e-07, + "loss": 0.7359159, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10644531, + "step": 12591, + "time_per_iteration": 2.6003847122192383 + }, + { + "auxiliary_loss_clip": 0.06403872, + "auxiliary_loss_mlp": 0.01263019, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01253667, + "epoch": 0.7570719975950699, + "flos": 25601835081600.0, + "grad_norm": 1.452450221530786, + "language_loss": 0.73701084, + "learning_rate": 5.876465480071528e-07, + "loss": 0.81367981, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09356689, + "step": 12592, + "time_per_iteration": 2.5929007530212402 + }, + { + "auxiliary_loss_clip": 0.06405754, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06270353, + "balance_loss_mlp": 0.01257165, + "epoch": 0.7571321208477378, + "flos": 10820781216000.0, + "grad_norm": 2.164551759300356, + "language_loss": 0.71882141, + "learning_rate": 5.873708220461522e-07, + "loss": 0.79554784, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 12593, + "time_per_iteration": 2.4659135341644287 + }, + { + "auxiliary_loss_clip": 0.0640605, + "auxiliary_loss_mlp": 0.01263408, + "balance_loss_clip": 0.06271473, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7571922441004059, + "flos": 18266045028480.0, + "grad_norm": 1.7009854752836593, + "language_loss": 0.66789973, + "learning_rate": 5.870951496521903e-07, + "loss": 0.74459434, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 12594, + "time_per_iteration": 2.6039915084838867 + }, + { + "auxiliary_loss_clip": 0.06412069, + "auxiliary_loss_mlp": 0.01266946, + "balance_loss_clip": 0.06273807, + "balance_loss_mlp": 0.01256599, + "epoch": 0.7572523673530738, + "flos": 22896660741120.0, + "grad_norm": 1.6054592725551893, + "language_loss": 0.80899853, + "learning_rate": 5.86819530835722e-07, + "loss": 0.88578868, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10339355, + "step": 12595, + "time_per_iteration": 2.571235179901123 + }, + { + "auxiliary_loss_clip": 0.06404546, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06273299, + "balance_loss_mlp": 0.01259166, + "epoch": 0.7573124906057418, + "flos": 21002679377280.0, + "grad_norm": 1.9975391540186431, + "language_loss": 0.71918476, + "learning_rate": 5.865439656071993e-07, + "loss": 0.7959137, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09185791, + "step": 12596, + "time_per_iteration": 2.551135301589966 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.0125737, + "epoch": 0.7573726138584097, + "flos": 20892534785280.0, + "grad_norm": 1.4422973158795673, + "language_loss": 0.80943167, + "learning_rate": 5.862684539770706e-07, + "loss": 0.8861059, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08764648, + "step": 12597, + "time_per_iteration": 2.4924709796905518 + }, + { + "auxiliary_loss_clip": 0.06410646, + "auxiliary_loss_mlp": 0.01265912, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.01255076, + "epoch": 0.7574327371110777, + "flos": 24536628852480.0, + "grad_norm": 1.549330306362407, + "language_loss": 0.83572793, + "learning_rate": 5.859929959557835e-07, + "loss": 0.91249353, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 12598, + "time_per_iteration": 2.5620381832122803 + }, + { + "auxiliary_loss_clip": 0.0640049, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01256656, + "epoch": 0.7574928603637456, + "flos": 23370711615360.0, + "grad_norm": 1.5128329006829742, + "language_loss": 0.62814438, + "learning_rate": 5.857175915537845e-07, + "loss": 0.70480788, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 12599, + "time_per_iteration": 2.517794132232666 + }, + { + "auxiliary_loss_clip": 0.06412463, + "auxiliary_loss_mlp": 0.01264733, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.01253641, + "epoch": 0.7575529836164137, + "flos": 13521301655040.0, + "grad_norm": 2.5096070763269047, + "language_loss": 0.63904691, + "learning_rate": 5.854422407815161e-07, + "loss": 0.71581882, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11096191, + "step": 12600, + "time_per_iteration": 2.4784600734710693 + }, + { + "auxiliary_loss_clip": 0.06401792, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01255709, + "epoch": 0.7576131068690816, + "flos": 19652754574080.0, + "grad_norm": 1.7462695207740195, + "language_loss": 0.66372097, + "learning_rate": 5.851669436494191e-07, + "loss": 0.74039608, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10003662, + "step": 12601, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06400197, + "auxiliary_loss_mlp": 0.01265733, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01256429, + "epoch": 0.7576732301217496, + "flos": 20054535701760.0, + "grad_norm": 2.2130741302051904, + "language_loss": 0.68382788, + "learning_rate": 5.848917001679335e-07, + "loss": 0.7604872, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09301758, + "step": 12602, + "time_per_iteration": 2.49818754196167 + }, + { + "auxiliary_loss_clip": 0.0640595, + "auxiliary_loss_mlp": 0.0126578, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01255373, + "epoch": 0.7577333533744176, + "flos": 15382439418240.0, + "grad_norm": 1.7531421277811328, + "language_loss": 0.67018741, + "learning_rate": 5.846165103474967e-07, + "loss": 0.74690473, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10412598, + "step": 12603, + "time_per_iteration": 2.4679315090179443 + }, + { + "auxiliary_loss_clip": 0.06399174, + "auxiliary_loss_mlp": 0.0126693, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01257441, + "epoch": 0.7577934766270855, + "flos": 17900671299840.0, + "grad_norm": 2.0091560992358417, + "language_loss": 0.62072337, + "learning_rate": 5.843413741985439e-07, + "loss": 0.69738448, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09484863, + "step": 12604, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.01268866, + "balance_loss_clip": 0.0627261, + "balance_loss_mlp": 0.01258256, + "epoch": 0.7578535998797535, + "flos": 21619760371200.0, + "grad_norm": 1.8724094104834093, + "language_loss": 0.80161738, + "learning_rate": 5.840662917315076e-07, + "loss": 0.87835866, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10614014, + "step": 12605, + "time_per_iteration": 2.4841203689575195 + }, + { + "auxiliary_loss_clip": 0.06405874, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06269677, + "balance_loss_mlp": 0.01256863, + "epoch": 0.7579137231324214, + "flos": 18484237860480.0, + "grad_norm": 2.5250222349386866, + "language_loss": 0.80021864, + "learning_rate": 5.837912629568198e-07, + "loss": 0.87695181, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10577393, + "step": 12606, + "time_per_iteration": 2.4846410751342773 + }, + { + "auxiliary_loss_clip": 0.06398265, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01254048, + "epoch": 0.7579738463850895, + "flos": 23261195928960.0, + "grad_norm": 1.3978882073919028, + "language_loss": 0.73257685, + "learning_rate": 5.835162878849087e-07, + "loss": 0.8091861, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08612061, + "step": 12607, + "time_per_iteration": 2.5159242153167725 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273781, + "balance_loss_mlp": 0.01260798, + "epoch": 0.7580339696377574, + "flos": 14032137271680.0, + "grad_norm": 1.9743130927740786, + "language_loss": 0.74911094, + "learning_rate": 5.83241366526202e-07, + "loss": 0.82593894, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09759521, + "step": 12608, + "time_per_iteration": 2.497614622116089 + }, + { + "auxiliary_loss_clip": 0.06404, + "auxiliary_loss_mlp": 0.01265498, + "balance_loss_clip": 0.06272872, + "balance_loss_mlp": 0.01255335, + "epoch": 0.7580940928904254, + "flos": 25089825507840.0, + "grad_norm": 1.4850994343846526, + "language_loss": 0.71440935, + "learning_rate": 5.829664988911245e-07, + "loss": 0.79110432, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10162354, + "step": 12609, + "time_per_iteration": 2.5046613216400146 + }, + { + "auxiliary_loss_clip": 0.06403238, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7581542161430933, + "flos": 23842288794240.0, + "grad_norm": 1.5362768058581475, + "language_loss": 0.81678033, + "learning_rate": 5.826916849901007e-07, + "loss": 0.89347494, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.11071777, + "step": 12610, + "time_per_iteration": 2.517946243286133 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01262988, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01252921, + "epoch": 0.7582143393957613, + "flos": 22243591618560.0, + "grad_norm": 1.594141702958548, + "language_loss": 0.70561087, + "learning_rate": 5.824169248335488e-07, + "loss": 0.78232837, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10070801, + "step": 12611, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.06402324, + "auxiliary_loss_mlp": 0.01265013, + "balance_loss_clip": 0.0626975, + "balance_loss_mlp": 0.0125516, + "epoch": 0.7582744626484292, + "flos": 21112865896320.0, + "grad_norm": 1.5348173916293948, + "language_loss": 0.70921582, + "learning_rate": 5.821422184318893e-07, + "loss": 0.78588921, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09857178, + "step": 12612, + "time_per_iteration": 3.989048719406128 + }, + { + "auxiliary_loss_clip": 0.06410398, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273097, + "balance_loss_mlp": 0.01254641, + "epoch": 0.7583345859010973, + "flos": 24611120732160.0, + "grad_norm": 1.3541649077655429, + "language_loss": 0.60250545, + "learning_rate": 5.818675657955397e-07, + "loss": 0.6792562, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10028076, + "step": 12613, + "time_per_iteration": 2.5280654430389404 + }, + { + "auxiliary_loss_clip": 0.06406002, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06272647, + "balance_loss_mlp": 0.01255367, + "epoch": 0.7583947091537652, + "flos": 33555167326080.0, + "grad_norm": 1.434876816663814, + "language_loss": 0.60180938, + "learning_rate": 5.815929669349135e-07, + "loss": 0.67852372, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10064697, + "step": 12614, + "time_per_iteration": 2.6500730514526367 + }, + { + "auxiliary_loss_clip": 0.06408043, + "auxiliary_loss_mlp": 0.01264127, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01253976, + "epoch": 0.7584548324064332, + "flos": 20127266645760.0, + "grad_norm": 1.6646286333989884, + "language_loss": 0.73613036, + "learning_rate": 5.813184218604246e-07, + "loss": 0.81285203, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1015625, + "step": 12615, + "time_per_iteration": 2.5028393268585205 + }, + { + "auxiliary_loss_clip": 0.06306437, + "auxiliary_loss_mlp": 0.01253251, + "balance_loss_clip": 0.06250888, + "balance_loss_mlp": 0.01251755, + "epoch": 0.7585149556591012, + "flos": 70424064069120.0, + "grad_norm": 0.8421080448004001, + "language_loss": 0.67521149, + "learning_rate": 5.810439305824828e-07, + "loss": 0.75080836, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01496124, + "step": 12616, + "time_per_iteration": 3.1849849224090576 + }, + { + "auxiliary_loss_clip": 0.06408077, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06270512, + "balance_loss_mlp": 0.01252779, + "epoch": 0.7585750789117691, + "flos": 16149342712320.0, + "grad_norm": 1.7878130457508898, + "language_loss": 0.84241217, + "learning_rate": 5.807694931114979e-07, + "loss": 0.9191215, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10076904, + "step": 12617, + "time_per_iteration": 2.4973013401031494 + }, + { + "auxiliary_loss_clip": 0.06407297, + "auxiliary_loss_mlp": 0.01262597, + "balance_loss_clip": 0.06272709, + "balance_loss_mlp": 0.01253257, + "epoch": 0.7586352021644371, + "flos": 17498848245120.0, + "grad_norm": 2.3587408181523544, + "language_loss": 0.74931777, + "learning_rate": 5.804951094578757e-07, + "loss": 0.82601666, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09338379, + "step": 12618, + "time_per_iteration": 2.494654417037964 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01262457, + "balance_loss_clip": 0.06271516, + "balance_loss_mlp": 0.01251967, + "epoch": 0.758695325417105, + "flos": 17280990829440.0, + "grad_norm": 2.0665265442485485, + "language_loss": 0.77541107, + "learning_rate": 5.802207796320209e-07, + "loss": 0.852139, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1048584, + "step": 12619, + "time_per_iteration": 2.5350186824798584 + }, + { + "auxiliary_loss_clip": 0.06403962, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.01255751, + "epoch": 0.7587554486697731, + "flos": 29503128856320.0, + "grad_norm": 1.7154948098726508, + "language_loss": 0.82232845, + "learning_rate": 5.79946503644337e-07, + "loss": 0.89902753, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10180664, + "step": 12620, + "time_per_iteration": 2.5445215702056885 + }, + { + "auxiliary_loss_clip": 0.06409103, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06271064, + "balance_loss_mlp": 0.0125651, + "epoch": 0.758815571922441, + "flos": 16105262664960.0, + "grad_norm": 2.254667976985654, + "language_loss": 0.82809436, + "learning_rate": 5.796722815052242e-07, + "loss": 0.90486276, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11236572, + "step": 12621, + "time_per_iteration": 3.918266534805298 + }, + { + "auxiliary_loss_clip": 0.0640413, + "auxiliary_loss_mlp": 0.01267456, + "balance_loss_clip": 0.06271367, + "balance_loss_mlp": 0.01257717, + "epoch": 0.758875695175109, + "flos": 16149258858240.0, + "grad_norm": 1.986087185770293, + "language_loss": 0.73904622, + "learning_rate": 5.7939811322508e-07, + "loss": 0.81576204, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12622, + "time_per_iteration": 2.4622373580932617 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06255639, + "balance_loss_mlp": 0.01252096, + "epoch": 0.7589358184277769, + "flos": 68482019589120.0, + "grad_norm": 0.8176590581901009, + "language_loss": 0.60799408, + "learning_rate": 5.791239988143024e-07, + "loss": 0.68363619, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01221466, + "step": 12623, + "time_per_iteration": 3.143218755722046 + }, + { + "auxiliary_loss_clip": 0.06401753, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.01254349, + "epoch": 0.7589959416804449, + "flos": 20053445598720.0, + "grad_norm": 1.8387445657701582, + "language_loss": 0.67715496, + "learning_rate": 5.788499382832847e-07, + "loss": 0.75380242, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08636475, + "step": 12624, + "time_per_iteration": 3.9293882846832275 + }, + { + "auxiliary_loss_clip": 0.06401351, + "auxiliary_loss_mlp": 0.01266658, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7590560649331128, + "flos": 18777970748160.0, + "grad_norm": 1.6859497284261105, + "language_loss": 0.76178044, + "learning_rate": 5.785759316424196e-07, + "loss": 0.83846056, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09625244, + "step": 12625, + "time_per_iteration": 2.4780449867248535 + }, + { + "auxiliary_loss_clip": 0.06401481, + "auxiliary_loss_mlp": 0.01264022, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.0125383, + "epoch": 0.7591161881857809, + "flos": 29833017580800.0, + "grad_norm": 1.7327397977395311, + "language_loss": 0.63387203, + "learning_rate": 5.783019789020977e-07, + "loss": 0.71052712, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10198975, + "step": 12626, + "time_per_iteration": 2.5631775856018066 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255715, + "epoch": 0.7591763114384488, + "flos": 20308884370560.0, + "grad_norm": 1.7841706388815284, + "language_loss": 0.74468005, + "learning_rate": 5.780280800727084e-07, + "loss": 0.82140952, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09844971, + "step": 12627, + "time_per_iteration": 2.469609260559082 + }, + { + "auxiliary_loss_clip": 0.06408302, + "auxiliary_loss_mlp": 0.012668, + "balance_loss_clip": 0.06272177, + "balance_loss_mlp": 0.01257412, + "epoch": 0.7592364346911168, + "flos": 20819887695360.0, + "grad_norm": 2.5677146388224728, + "language_loss": 0.69222355, + "learning_rate": 5.777542351646356e-07, + "loss": 0.76897466, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09387207, + "step": 12628, + "time_per_iteration": 2.520756483078003 + }, + { + "auxiliary_loss_clip": 0.06418896, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257951, + "epoch": 0.7592965579437848, + "flos": 21257866586880.0, + "grad_norm": 2.617063400341695, + "language_loss": 0.62842494, + "learning_rate": 5.774804441882648e-07, + "loss": 0.70529878, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 12629, + "time_per_iteration": 3.9617972373962402 + }, + { + "auxiliary_loss_clip": 0.06400847, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06271888, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7593566811964527, + "flos": 26220802792320.0, + "grad_norm": 1.4187303097446593, + "language_loss": 0.7784214, + "learning_rate": 5.772067071539786e-07, + "loss": 0.85507464, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09844971, + "step": 12630, + "time_per_iteration": 2.5400242805480957 + }, + { + "auxiliary_loss_clip": 0.0631338, + "auxiliary_loss_mlp": 0.01256151, + "balance_loss_clip": 0.06257843, + "balance_loss_mlp": 0.01255109, + "epoch": 0.7594168044491207, + "flos": 71258122010880.0, + "grad_norm": 0.8178625518129599, + "language_loss": 0.61609149, + "learning_rate": 5.769330240721562e-07, + "loss": 0.69178677, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01042175, + "step": 12631, + "time_per_iteration": 3.2121753692626953 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01256188, + "epoch": 0.7594769277017887, + "flos": 26620319859840.0, + "grad_norm": 1.723696706430517, + "language_loss": 0.74189103, + "learning_rate": 5.766593949531767e-07, + "loss": 0.81869459, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11547852, + "step": 12632, + "time_per_iteration": 2.633206605911255 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 0.06272458, + "balance_loss_mlp": 0.01252743, + "epoch": 0.7595370509544567, + "flos": 17600523575040.0, + "grad_norm": 1.7631507541187388, + "language_loss": 0.75345957, + "learning_rate": 5.763858198074154e-07, + "loss": 0.83016121, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.1050415, + "step": 12633, + "time_per_iteration": 2.4908735752105713 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7595971742071246, + "flos": 18008551831680.0, + "grad_norm": 1.9259614725215357, + "language_loss": 0.73589694, + "learning_rate": 5.76112298645246e-07, + "loss": 0.81258494, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09240723, + "step": 12634, + "time_per_iteration": 2.463972330093384 + }, + { + "auxiliary_loss_clip": 0.06401845, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01256715, + "epoch": 0.7596572974597926, + "flos": 28847921454720.0, + "grad_norm": 1.6183361542433332, + "language_loss": 0.65202701, + "learning_rate": 5.758388314770408e-07, + "loss": 0.72870934, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09661865, + "step": 12635, + "time_per_iteration": 2.5608267784118652 + }, + { + "auxiliary_loss_clip": 0.06408376, + "auxiliary_loss_mlp": 0.01262438, + "balance_loss_clip": 0.06272096, + "balance_loss_mlp": 0.01252252, + "epoch": 0.7597174207124605, + "flos": 14288037240960.0, + "grad_norm": 1.6247637528825494, + "language_loss": 0.69144988, + "learning_rate": 5.7556541831317e-07, + "loss": 0.76815796, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10186768, + "step": 12636, + "time_per_iteration": 2.4801905155181885 + }, + { + "auxiliary_loss_clip": 0.0640962, + "auxiliary_loss_mlp": 0.01262748, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01252103, + "epoch": 0.7597775439651285, + "flos": 21695300426880.0, + "grad_norm": 1.9394255431745338, + "language_loss": 0.81419599, + "learning_rate": 5.752920591640018e-07, + "loss": 0.89091963, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10650635, + "step": 12637, + "time_per_iteration": 2.535862922668457 + }, + { + "auxiliary_loss_clip": 0.06405479, + "auxiliary_loss_mlp": 0.01261246, + "balance_loss_clip": 0.06269705, + "balance_loss_mlp": 0.01251781, + "epoch": 0.7598376672177964, + "flos": 36110100096000.0, + "grad_norm": 1.8287091414841325, + "language_loss": 0.66797674, + "learning_rate": 5.750187540399017e-07, + "loss": 0.74464405, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09460449, + "step": 12638, + "time_per_iteration": 2.620074987411499 + }, + { + "auxiliary_loss_clip": 0.06408533, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01254135, + "epoch": 0.7598977904704645, + "flos": 18338147066880.0, + "grad_norm": 2.2175642348047746, + "language_loss": 0.65482736, + "learning_rate": 5.747455029512323e-07, + "loss": 0.73156428, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.11022949, + "step": 12639, + "time_per_iteration": 2.495577096939087 + }, + { + "auxiliary_loss_clip": 0.06406114, + "auxiliary_loss_mlp": 0.01266924, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.0125706, + "epoch": 0.7599579137231324, + "flos": 20198697851520.0, + "grad_norm": 2.4320385733819814, + "language_loss": 0.69979274, + "learning_rate": 5.744723059083572e-07, + "loss": 0.77652305, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09863281, + "step": 12640, + "time_per_iteration": 2.5001392364501953 + }, + { + "auxiliary_loss_clip": 0.06408872, + "auxiliary_loss_mlp": 0.01266047, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254788, + "epoch": 0.7600180369758004, + "flos": 24031746875520.0, + "grad_norm": 1.6154408738671377, + "language_loss": 0.66895354, + "learning_rate": 5.741991629216343e-07, + "loss": 0.74570274, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11260986, + "step": 12641, + "time_per_iteration": 2.5159339904785156 + }, + { + "auxiliary_loss_clip": 0.064065, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06269056, + "balance_loss_mlp": 0.01254865, + "epoch": 0.7600781602284684, + "flos": 18995534674560.0, + "grad_norm": 2.038376474313416, + "language_loss": 0.6667732, + "learning_rate": 5.73926074001422e-07, + "loss": 0.74349207, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10522461, + "step": 12642, + "time_per_iteration": 2.4950852394104004 + }, + { + "auxiliary_loss_clip": 0.06405585, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.0627634, + "balance_loss_mlp": 0.01256571, + "epoch": 0.7601382834811363, + "flos": 26074670071680.0, + "grad_norm": 1.8779608812077913, + "language_loss": 0.75724566, + "learning_rate": 5.736530391580765e-07, + "loss": 0.83396, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0927124, + "step": 12643, + "time_per_iteration": 2.660304069519043 + }, + { + "auxiliary_loss_clip": 0.06411186, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06275575, + "balance_loss_mlp": 0.01254219, + "epoch": 0.7601984067338043, + "flos": 18850324348800.0, + "grad_norm": 1.8216194715113248, + "language_loss": 0.78901958, + "learning_rate": 5.733800584019508e-07, + "loss": 0.86578548, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11187744, + "step": 12644, + "time_per_iteration": 2.513680934906006 + }, + { + "auxiliary_loss_clip": 0.06404514, + "auxiliary_loss_mlp": 0.01261707, + "balance_loss_clip": 0.06268981, + "balance_loss_mlp": 0.01251801, + "epoch": 0.7602585299864723, + "flos": 24653607552000.0, + "grad_norm": 1.4015203810474768, + "language_loss": 0.807042, + "learning_rate": 5.731071317433957e-07, + "loss": 0.88370419, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09912109, + "step": 12645, + "time_per_iteration": 2.7170186042785645 + }, + { + "auxiliary_loss_clip": 0.06406523, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06271391, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7603186532391403, + "flos": 23848913266560.0, + "grad_norm": 1.4313892113151905, + "language_loss": 0.7345466, + "learning_rate": 5.728342591927611e-07, + "loss": 0.81128347, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1038208, + "step": 12646, + "time_per_iteration": 2.7041969299316406 + }, + { + "auxiliary_loss_clip": 0.06405969, + "auxiliary_loss_mlp": 0.01267521, + "balance_loss_clip": 0.06275387, + "balance_loss_mlp": 0.0125842, + "epoch": 0.7603787764918082, + "flos": 22206387605760.0, + "grad_norm": 1.8247890758149474, + "language_loss": 0.67541718, + "learning_rate": 5.725614407603949e-07, + "loss": 0.75215209, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09100342, + "step": 12647, + "time_per_iteration": 2.631646156311035 + }, + { + "auxiliary_loss_clip": 0.06309351, + "auxiliary_loss_mlp": 0.01254415, + "balance_loss_clip": 0.06253824, + "balance_loss_mlp": 0.01253126, + "epoch": 0.7604388997444762, + "flos": 54104549713920.0, + "grad_norm": 0.6718107108151633, + "language_loss": 0.48995575, + "learning_rate": 5.722886764566415e-07, + "loss": 0.56559336, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01289368, + "step": 12648, + "time_per_iteration": 3.0884687900543213 + }, + { + "auxiliary_loss_clip": 0.06397881, + "auxiliary_loss_mlp": 0.01264414, + "balance_loss_clip": 0.06268241, + "balance_loss_mlp": 0.01255801, + "epoch": 0.7604990229971441, + "flos": 19687904161920.0, + "grad_norm": 1.3891263247246097, + "language_loss": 0.76770478, + "learning_rate": 5.720159662918451e-07, + "loss": 0.84432769, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08612061, + "step": 12649, + "time_per_iteration": 2.4948225021362305 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01263835, + "balance_loss_clip": 0.06269015, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7605591462498121, + "flos": 25234993906560.0, + "grad_norm": 1.5285209228148775, + "language_loss": 0.6904434, + "learning_rate": 5.717433102763462e-07, + "loss": 0.76708949, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09332275, + "step": 12650, + "time_per_iteration": 2.5328054428100586 + }, + { + "auxiliary_loss_clip": 0.06313049, + "auxiliary_loss_mlp": 0.01254535, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01253279, + "epoch": 0.76061926950248, + "flos": 66803505799680.0, + "grad_norm": 0.7352332079053004, + "language_loss": 0.62801003, + "learning_rate": 5.714707084204838e-07, + "loss": 0.70368588, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01255798, + "step": 12651, + "time_per_iteration": 4.553870916366577 + }, + { + "auxiliary_loss_clip": 0.06400903, + "auxiliary_loss_mlp": 0.01266142, + "balance_loss_clip": 0.06269742, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7606793927551481, + "flos": 25345473914880.0, + "grad_norm": 1.3627527735409288, + "language_loss": 0.71875393, + "learning_rate": 5.711981607345951e-07, + "loss": 0.79542446, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09515381, + "step": 12652, + "time_per_iteration": 2.5254390239715576 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.0126807, + "balance_loss_clip": 0.06270062, + "balance_loss_mlp": 0.01258229, + "epoch": 0.760739516007816, + "flos": 18229553775360.0, + "grad_norm": 1.992377129366734, + "language_loss": 0.80116236, + "learning_rate": 5.709256672290152e-07, + "loss": 0.87787497, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09838867, + "step": 12653, + "time_per_iteration": 2.475878953933716 + }, + { + "auxiliary_loss_clip": 0.06406933, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01255248, + "epoch": 0.760799639260484, + "flos": 22564717591680.0, + "grad_norm": 1.5079651219958228, + "language_loss": 0.80019051, + "learning_rate": 5.706532279140785e-07, + "loss": 0.87691557, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10327148, + "step": 12654, + "time_per_iteration": 2.4968621730804443 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01268021, + "balance_loss_clip": 0.0627185, + "balance_loss_mlp": 0.01256953, + "epoch": 0.760859762513152, + "flos": 22315819438080.0, + "grad_norm": 2.0930481497067968, + "language_loss": 0.79525441, + "learning_rate": 5.703808428001136e-07, + "loss": 0.87201554, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12655, + "time_per_iteration": 2.5296621322631836 + }, + { + "auxiliary_loss_clip": 0.06400845, + "auxiliary_loss_mlp": 0.01263727, + "balance_loss_clip": 0.06271712, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7609198857658199, + "flos": 24870919916160.0, + "grad_norm": 1.5227214319467992, + "language_loss": 0.68902338, + "learning_rate": 5.701085118974505e-07, + "loss": 0.76566911, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08068848, + "step": 12656, + "time_per_iteration": 2.541064739227295 + }, + { + "auxiliary_loss_clip": 0.06410336, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06272005, + "balance_loss_mlp": 0.01256913, + "epoch": 0.760980009018488, + "flos": 16842424959360.0, + "grad_norm": 2.207190684629195, + "language_loss": 0.73558354, + "learning_rate": 5.698362352164164e-07, + "loss": 0.81236219, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10632324, + "step": 12657, + "time_per_iteration": 2.492959499359131 + }, + { + "auxiliary_loss_clip": 0.06312352, + "auxiliary_loss_mlp": 0.01255494, + "balance_loss_clip": 0.06256969, + "balance_loss_mlp": 0.01254303, + "epoch": 0.7610401322711559, + "flos": 61248198355200.0, + "grad_norm": 0.8387316949065597, + "language_loss": 0.65017879, + "learning_rate": 5.695640127673347e-07, + "loss": 0.7258572, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01189423, + "step": 12658, + "time_per_iteration": 3.0756664276123047 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01255691, + "epoch": 0.7611002555238239, + "flos": 19645920466560.0, + "grad_norm": 1.5440041293540654, + "language_loss": 0.7962606, + "learning_rate": 5.692918445605293e-07, + "loss": 0.87289846, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.1036377, + "step": 12659, + "time_per_iteration": 2.5428194999694824 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06270297, + "balance_loss_mlp": 0.01255138, + "epoch": 0.7611603787764918, + "flos": 26879825554560.0, + "grad_norm": 1.4756646122445365, + "language_loss": 0.69142807, + "learning_rate": 5.690197306063209e-07, + "loss": 0.76810235, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12660, + "time_per_iteration": 4.065267086029053 + }, + { + "auxiliary_loss_clip": 0.06405179, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01254759, + "epoch": 0.7612205020291598, + "flos": 27351570441600.0, + "grad_norm": 1.631280435549901, + "language_loss": 0.70831662, + "learning_rate": 5.687476709150281e-07, + "loss": 0.78501016, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09423828, + "step": 12661, + "time_per_iteration": 2.541351079940796 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01255447, + "epoch": 0.7612806252818277, + "flos": 29322265818240.0, + "grad_norm": 1.4447529833958312, + "language_loss": 0.84105158, + "learning_rate": 5.68475665496966e-07, + "loss": 0.91775477, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09637451, + "step": 12662, + "time_per_iteration": 2.654850721359253 + }, + { + "auxiliary_loss_clip": 0.06407061, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_clip": 0.06273231, + "balance_loss_mlp": 0.0125974, + "epoch": 0.7613407485344957, + "flos": 19032067854720.0, + "grad_norm": 1.6864772603594633, + "language_loss": 0.69368142, + "learning_rate": 5.682037143624505e-07, + "loss": 0.77044225, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09283447, + "step": 12663, + "time_per_iteration": 3.926262617111206 + }, + { + "auxiliary_loss_clip": 0.06401078, + "auxiliary_loss_mlp": 0.01261863, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01253119, + "epoch": 0.7614008717871636, + "flos": 23262369886080.0, + "grad_norm": 1.4557154718503251, + "language_loss": 0.70039129, + "learning_rate": 5.67931817521794e-07, + "loss": 0.77702069, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08746338, + "step": 12664, + "time_per_iteration": 2.5054047107696533 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.06272146, + "balance_loss_mlp": 0.01257724, + "epoch": 0.7614609950398317, + "flos": 21586329792000.0, + "grad_norm": 1.5992794514882698, + "language_loss": 0.79600513, + "learning_rate": 5.676599749853066e-07, + "loss": 0.87278712, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10949707, + "step": 12665, + "time_per_iteration": 2.599689483642578 + }, + { + "auxiliary_loss_clip": 0.06403616, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06274, + "balance_loss_mlp": 0.01258097, + "epoch": 0.7615211182924996, + "flos": 29285523002880.0, + "grad_norm": 1.8706140840131316, + "language_loss": 0.88243985, + "learning_rate": 5.673881867632959e-07, + "loss": 0.95915639, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09936523, + "step": 12666, + "time_per_iteration": 2.5415070056915283 + }, + { + "auxiliary_loss_clip": 0.06408084, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.01256472, + "epoch": 0.7615812415451676, + "flos": 13266156372480.0, + "grad_norm": 2.0248103449736963, + "language_loss": 0.83170617, + "learning_rate": 5.671164528660693e-07, + "loss": 0.90845418, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10253906, + "step": 12667, + "time_per_iteration": 2.4605929851531982 + }, + { + "auxiliary_loss_clip": 0.06401822, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06271848, + "balance_loss_mlp": 0.01255266, + "epoch": 0.7616413647978356, + "flos": 18590105894400.0, + "grad_norm": 1.5289232692663373, + "language_loss": 0.78628266, + "learning_rate": 5.668447733039296e-07, + "loss": 0.86294812, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09460449, + "step": 12668, + "time_per_iteration": 3.9720492362976074 + }, + { + "auxiliary_loss_clip": 0.06403045, + "auxiliary_loss_mlp": 0.01263851, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01254469, + "epoch": 0.7617014880505035, + "flos": 18522280414080.0, + "grad_norm": 1.6924413590277445, + "language_loss": 0.64424682, + "learning_rate": 5.6657314808718e-07, + "loss": 0.72091579, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09381104, + "step": 12669, + "time_per_iteration": 2.4817726612091064 + }, + { + "auxiliary_loss_clip": 0.0640804, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06272504, + "balance_loss_mlp": 0.01255403, + "epoch": 0.7617616113031715, + "flos": 24980184040320.0, + "grad_norm": 1.625894991767346, + "language_loss": 0.66114289, + "learning_rate": 5.663015772261202e-07, + "loss": 0.7378875, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11016846, + "step": 12670, + "time_per_iteration": 2.531942844390869 + }, + { + "auxiliary_loss_clip": 0.06408806, + "auxiliary_loss_mlp": 0.01267085, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.01256821, + "epoch": 0.7618217345558395, + "flos": 23301796032000.0, + "grad_norm": 1.6261426293442, + "language_loss": 0.72730261, + "learning_rate": 5.660300607310493e-07, + "loss": 0.80406153, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10266113, + "step": 12671, + "time_per_iteration": 2.555997133255005 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01263811, + "balance_loss_clip": 0.06269476, + "balance_loss_mlp": 0.01254686, + "epoch": 0.7618818578085075, + "flos": 25489803772800.0, + "grad_norm": 1.5891051355844041, + "language_loss": 0.73397064, + "learning_rate": 5.657585986122613e-07, + "loss": 0.81062061, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09124756, + "step": 12672, + "time_per_iteration": 2.5291435718536377 + }, + { + "auxiliary_loss_clip": 0.06309396, + "auxiliary_loss_mlp": 0.01251395, + "balance_loss_clip": 0.06254143, + "balance_loss_mlp": 0.01250371, + "epoch": 0.7619419810611754, + "flos": 61168633303680.0, + "grad_norm": 0.7432915400862121, + "language_loss": 0.56722248, + "learning_rate": 5.654871908800506e-07, + "loss": 0.64283037, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12673, + "time_per_iteration": 3.134204864501953 + }, + { + "auxiliary_loss_clip": 0.06401986, + "auxiliary_loss_mlp": 0.01266349, + "balance_loss_clip": 0.06268115, + "balance_loss_mlp": 0.01256371, + "epoch": 0.7620021043138434, + "flos": 23265430560000.0, + "grad_norm": 1.7103416042413309, + "language_loss": 0.74883175, + "learning_rate": 5.652158375447102e-07, + "loss": 0.82551509, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09985352, + "step": 12674, + "time_per_iteration": 2.507917642593384 + }, + { + "auxiliary_loss_clip": 0.06398366, + "auxiliary_loss_mlp": 0.01265734, + "balance_loss_clip": 0.06268415, + "balance_loss_mlp": 0.01257002, + "epoch": 0.7620622275665113, + "flos": 25089490091520.0, + "grad_norm": 2.2685266755673847, + "language_loss": 0.72315985, + "learning_rate": 5.649445386165286e-07, + "loss": 0.79980081, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08728027, + "step": 12675, + "time_per_iteration": 2.5618882179260254 + }, + { + "auxiliary_loss_clip": 0.0640251, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272566, + "balance_loss_mlp": 0.01254911, + "epoch": 0.7621223508191793, + "flos": 20160864933120.0, + "grad_norm": 1.9392842077457455, + "language_loss": 0.7294848, + "learning_rate": 5.646732941057936e-07, + "loss": 0.80615485, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09588623, + "step": 12676, + "time_per_iteration": 2.4889016151428223 + }, + { + "auxiliary_loss_clip": 0.06412819, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01255125, + "epoch": 0.7621824740718472, + "flos": 18005323449600.0, + "grad_norm": 3.350191420610347, + "language_loss": 0.54523033, + "learning_rate": 5.644021040227927e-07, + "loss": 0.62201345, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10357666, + "step": 12677, + "time_per_iteration": 2.479889392852783 + }, + { + "auxiliary_loss_clip": 0.06403828, + "auxiliary_loss_mlp": 0.01261111, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125102, + "epoch": 0.7622425973245153, + "flos": 21732085169280.0, + "grad_norm": 1.924626512292605, + "language_loss": 0.79229861, + "learning_rate": 5.641309683778064e-07, + "loss": 0.86894798, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10101318, + "step": 12678, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.0640271, + "auxiliary_loss_mlp": 0.0126229, + "balance_loss_clip": 0.06268604, + "balance_loss_mlp": 0.01252694, + "epoch": 0.7623027205771832, + "flos": 19724563123200.0, + "grad_norm": 2.0630846770322133, + "language_loss": 0.77460301, + "learning_rate": 5.638598871811175e-07, + "loss": 0.85125297, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09588623, + "step": 12679, + "time_per_iteration": 2.5036091804504395 + }, + { + "auxiliary_loss_clip": 0.06405875, + "auxiliary_loss_mlp": 0.01264484, + "balance_loss_clip": 0.06272455, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7623628438298512, + "flos": 23995800673920.0, + "grad_norm": 1.5339500294685882, + "language_loss": 0.79924572, + "learning_rate": 5.635888604430059e-07, + "loss": 0.87594938, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0921936, + "step": 12680, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06404954, + "auxiliary_loss_mlp": 0.01265568, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7624229670825191, + "flos": 22352184910080.0, + "grad_norm": 1.9657419278541466, + "language_loss": 0.62747079, + "learning_rate": 5.633178881737493e-07, + "loss": 0.70417601, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09918213, + "step": 12681, + "time_per_iteration": 2.5365428924560547 + }, + { + "auxiliary_loss_clip": 0.06399923, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06270124, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7624830903351871, + "flos": 22718522960640.0, + "grad_norm": 2.3247043396178335, + "language_loss": 0.76673269, + "learning_rate": 5.63046970383622e-07, + "loss": 0.84339643, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09649658, + "step": 12682, + "time_per_iteration": 2.5021934509277344 + }, + { + "auxiliary_loss_clip": 0.06400375, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7625432135878552, + "flos": 25600870759680.0, + "grad_norm": 1.6797876321314247, + "language_loss": 0.68138206, + "learning_rate": 5.627761070828974e-07, + "loss": 0.75803858, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08728027, + "step": 12683, + "time_per_iteration": 2.5445661544799805 + }, + { + "auxiliary_loss_clip": 0.06401844, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06269109, + "balance_loss_mlp": 0.01256078, + "epoch": 0.7626033368405231, + "flos": 23994417081600.0, + "grad_norm": 1.9075173015451221, + "language_loss": 0.83300132, + "learning_rate": 5.625052982818472e-07, + "loss": 0.90967631, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09564209, + "step": 12684, + "time_per_iteration": 2.545069932937622 + }, + { + "auxiliary_loss_clip": 0.06406077, + "auxiliary_loss_mlp": 0.0126848, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01258264, + "epoch": 0.7626634600931911, + "flos": 12603150541440.0, + "grad_norm": 1.7483092151310056, + "language_loss": 0.82848525, + "learning_rate": 5.622345439907396e-07, + "loss": 0.90523082, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10211182, + "step": 12685, + "time_per_iteration": 2.5331482887268066 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.0627293, + "balance_loss_mlp": 0.0125692, + "epoch": 0.762723583345859, + "flos": 26329731500160.0, + "grad_norm": 1.6739148989024917, + "language_loss": 0.77748114, + "learning_rate": 5.619638442198422e-07, + "loss": 0.85420561, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0970459, + "step": 12686, + "time_per_iteration": 2.529662609100342 + }, + { + "auxiliary_loss_clip": 0.06407499, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254204, + "epoch": 0.762783706598527, + "flos": 21913325550720.0, + "grad_norm": 1.6937601944819862, + "language_loss": 0.72154206, + "learning_rate": 5.616931989794198e-07, + "loss": 0.79826409, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1050415, + "step": 12687, + "time_per_iteration": 2.486391544342041 + }, + { + "auxiliary_loss_clip": 0.06404573, + "auxiliary_loss_mlp": 0.01266259, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01256263, + "epoch": 0.7628438298511949, + "flos": 15344983843200.0, + "grad_norm": 3.1096174425988656, + "language_loss": 0.65146047, + "learning_rate": 5.614226082797369e-07, + "loss": 0.72816885, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09991455, + "step": 12688, + "time_per_iteration": 2.486335515975952 + }, + { + "auxiliary_loss_clip": 0.06397952, + "auxiliary_loss_mlp": 0.01267437, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7629039531038629, + "flos": 13011388433280.0, + "grad_norm": 1.9926161434676632, + "language_loss": 0.70924902, + "learning_rate": 5.611520721310515e-07, + "loss": 0.78590292, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 12689, + "time_per_iteration": 2.5037851333618164 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.01264555, + "balance_loss_clip": 0.06273138, + "balance_loss_mlp": 0.01254493, + "epoch": 0.7629640763565309, + "flos": 26177938629120.0, + "grad_norm": 1.870564488725158, + "language_loss": 0.70028657, + "learning_rate": 5.608815905436238e-07, + "loss": 0.77705473, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10058594, + "step": 12690, + "time_per_iteration": 2.533437728881836 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01262782, + "balance_loss_clip": 0.06271788, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7630241996091989, + "flos": 36802553437440.0, + "grad_norm": 1.3861533863354163, + "language_loss": 0.69748205, + "learning_rate": 5.606111635277109e-07, + "loss": 0.77414727, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0960083, + "step": 12691, + "time_per_iteration": 4.015859127044678 + }, + { + "auxiliary_loss_clip": 0.06401307, + "auxiliary_loss_mlp": 0.01260884, + "balance_loss_clip": 0.06269828, + "balance_loss_mlp": 0.01252003, + "epoch": 0.7630843228618668, + "flos": 21841600855680.0, + "grad_norm": 1.5523680121734649, + "language_loss": 0.82087487, + "learning_rate": 5.603407910935662e-07, + "loss": 0.89749676, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08880615, + "step": 12692, + "time_per_iteration": 2.5389950275421143 + }, + { + "auxiliary_loss_clip": 0.06409267, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06275039, + "balance_loss_mlp": 0.01255993, + "epoch": 0.7631444461145348, + "flos": 12645385799040.0, + "grad_norm": 2.3344184890866564, + "language_loss": 0.77300888, + "learning_rate": 5.600704732514438e-07, + "loss": 0.84975493, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09344482, + "step": 12693, + "time_per_iteration": 2.445725917816162 + }, + { + "auxiliary_loss_clip": 0.064025, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06269249, + "balance_loss_mlp": 0.01257643, + "epoch": 0.7632045693672027, + "flos": 16842215324160.0, + "grad_norm": 1.879033723685166, + "language_loss": 0.7319355, + "learning_rate": 5.598002100115933e-07, + "loss": 0.80864131, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10437012, + "step": 12694, + "time_per_iteration": 2.480100154876709 + }, + { + "auxiliary_loss_clip": 0.06401706, + "auxiliary_loss_mlp": 0.01263272, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7632646926198707, + "flos": 22023763632000.0, + "grad_norm": 1.7362595054615078, + "language_loss": 0.70577729, + "learning_rate": 5.595300013842625e-07, + "loss": 0.78242707, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09619141, + "step": 12695, + "time_per_iteration": 2.484557867050171 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01265272, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7633248158725388, + "flos": 23120974920960.0, + "grad_norm": 1.5006607242564833, + "language_loss": 0.72539437, + "learning_rate": 5.592598473796985e-07, + "loss": 0.80209941, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10150146, + "step": 12696, + "time_per_iteration": 2.535898208618164 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06268622, + "balance_loss_mlp": 0.01255568, + "epoch": 0.7633849391252067, + "flos": 10894518408960.0, + "grad_norm": 2.5144564572490116, + "language_loss": 0.71505952, + "learning_rate": 5.589897480081453e-07, + "loss": 0.79172248, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09692383, + "step": 12697, + "time_per_iteration": 2.4591684341430664 + }, + { + "auxiliary_loss_clip": 0.06400824, + "auxiliary_loss_mlp": 0.01260764, + "balance_loss_clip": 0.06270981, + "balance_loss_mlp": 0.01251179, + "epoch": 0.7634450623778747, + "flos": 21000163754880.0, + "grad_norm": 1.880904163415611, + "language_loss": 0.67272222, + "learning_rate": 5.587197032798461e-07, + "loss": 0.74933803, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0958252, + "step": 12698, + "time_per_iteration": 2.5230917930603027 + }, + { + "auxiliary_loss_clip": 0.06403317, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01255529, + "epoch": 0.7635051856305426, + "flos": 18888366902400.0, + "grad_norm": 1.5780107163253119, + "language_loss": 0.72484887, + "learning_rate": 5.5844971320504e-07, + "loss": 0.8015368, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0994873, + "step": 12699, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0640247, + "auxiliary_loss_mlp": 0.0126796, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01258906, + "epoch": 0.7635653088832106, + "flos": 34795492588800.0, + "grad_norm": 1.9895424194721678, + "language_loss": 0.73307264, + "learning_rate": 5.581797777939648e-07, + "loss": 0.8097769, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09069824, + "step": 12700, + "time_per_iteration": 4.06644868850708 + }, + { + "auxiliary_loss_clip": 0.0640322, + "auxiliary_loss_mlp": 0.01269407, + "balance_loss_clip": 0.06270028, + "balance_loss_mlp": 0.01259608, + "epoch": 0.7636254321358785, + "flos": 23183978791680.0, + "grad_norm": 1.8289500414025046, + "language_loss": 0.69277215, + "learning_rate": 5.579098970568574e-07, + "loss": 0.76949847, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09796143, + "step": 12701, + "time_per_iteration": 2.4977099895477295 + }, + { + "auxiliary_loss_clip": 0.06401876, + "auxiliary_loss_mlp": 0.01262857, + "balance_loss_clip": 0.06269674, + "balance_loss_mlp": 0.01253243, + "epoch": 0.7636855553885465, + "flos": 21331729560960.0, + "grad_norm": 1.5301057508918974, + "language_loss": 0.64290726, + "learning_rate": 5.576400710039508e-07, + "loss": 0.7195546, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09606934, + "step": 12702, + "time_per_iteration": 2.4910881519317627 + }, + { + "auxiliary_loss_clip": 0.06402961, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01256234, + "epoch": 0.7637456786412145, + "flos": 28665674824320.0, + "grad_norm": 1.963609141873143, + "language_loss": 0.66137874, + "learning_rate": 5.57370299645477e-07, + "loss": 0.738065, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09429932, + "step": 12703, + "time_per_iteration": 3.9583401679992676 + }, + { + "auxiliary_loss_clip": 0.06406517, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273364, + "balance_loss_mlp": 0.01256721, + "epoch": 0.7638058018938825, + "flos": 21913577112960.0, + "grad_norm": 2.0195903258707757, + "language_loss": 0.83478069, + "learning_rate": 5.571005829916668e-07, + "loss": 0.91150421, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 12704, + "time_per_iteration": 2.5038557052612305 + }, + { + "auxiliary_loss_clip": 0.0640365, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06271724, + "balance_loss_mlp": 0.01258686, + "epoch": 0.7638659251465504, + "flos": 29651777199360.0, + "grad_norm": 1.4030805409759646, + "language_loss": 0.68150222, + "learning_rate": 5.568309210527469e-07, + "loss": 0.75822091, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09527588, + "step": 12705, + "time_per_iteration": 2.5900156497955322 + }, + { + "auxiliary_loss_clip": 0.06400676, + "auxiliary_loss_mlp": 0.01264845, + "balance_loss_clip": 0.06270821, + "balance_loss_mlp": 0.01255672, + "epoch": 0.7639260483992184, + "flos": 26148449191680.0, + "grad_norm": 1.5410038713701188, + "language_loss": 0.74538386, + "learning_rate": 5.565613138389427e-07, + "loss": 0.82203901, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09173584, + "step": 12706, + "time_per_iteration": 2.559558391571045 + }, + { + "auxiliary_loss_clip": 0.06403012, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.0627191, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7639861716518863, + "flos": 20162835504000.0, + "grad_norm": 1.755600712442579, + "language_loss": 0.78974855, + "learning_rate": 5.562917613604781e-07, + "loss": 0.86643398, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09191895, + "step": 12707, + "time_per_iteration": 3.932704210281372 + }, + { + "auxiliary_loss_clip": 0.06401724, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.01255283, + "epoch": 0.7640462949045543, + "flos": 18588219177600.0, + "grad_norm": 6.1940407959342885, + "language_loss": 0.80090815, + "learning_rate": 5.560222636275751e-07, + "loss": 0.87757736, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0993042, + "step": 12708, + "time_per_iteration": 2.4813318252563477 + }, + { + "auxiliary_loss_clip": 0.06315993, + "auxiliary_loss_mlp": 0.0125198, + "balance_loss_clip": 0.06260599, + "balance_loss_mlp": 0.01250996, + "epoch": 0.7641064181572224, + "flos": 68342972538240.0, + "grad_norm": 0.7968333839429529, + "language_loss": 0.5539844, + "learning_rate": 5.557528206504521e-07, + "loss": 0.62966412, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00983429, + "step": 12709, + "time_per_iteration": 3.1384057998657227 + }, + { + "auxiliary_loss_clip": 0.0640793, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.0125925, + "epoch": 0.7641665414098903, + "flos": 17974995471360.0, + "grad_norm": 1.6571298349962345, + "language_loss": 0.63628614, + "learning_rate": 5.554834324393271e-07, + "loss": 0.71306419, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10614014, + "step": 12710, + "time_per_iteration": 2.503221273422241 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270481, + "balance_loss_mlp": 0.01255705, + "epoch": 0.7642266646625583, + "flos": 21258537419520.0, + "grad_norm": 2.423165664894835, + "language_loss": 0.64622939, + "learning_rate": 5.552140990044154e-07, + "loss": 0.72294724, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10327148, + "step": 12711, + "time_per_iteration": 2.48382568359375 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06270531, + "balance_loss_mlp": 0.01257216, + "epoch": 0.7642867879152262, + "flos": 22754469162240.0, + "grad_norm": 1.499831368340144, + "language_loss": 0.73271233, + "learning_rate": 5.549448203559293e-07, + "loss": 0.80940747, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09454346, + "step": 12712, + "time_per_iteration": 2.518559455871582 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7643469111678942, + "flos": 23339000044800.0, + "grad_norm": 4.100229806424162, + "language_loss": 0.80473924, + "learning_rate": 5.546755965040804e-07, + "loss": 0.88138747, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08837891, + "step": 12713, + "time_per_iteration": 2.495666742324829 + }, + { + "auxiliary_loss_clip": 0.0640631, + "auxiliary_loss_mlp": 0.01266494, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256237, + "epoch": 0.7644070344205621, + "flos": 19861891165440.0, + "grad_norm": 2.1468665185465396, + "language_loss": 0.84159482, + "learning_rate": 5.544064274590776e-07, + "loss": 0.91832292, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10266113, + "step": 12714, + "time_per_iteration": 2.4871368408203125 + }, + { + "auxiliary_loss_clip": 0.06406413, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01257603, + "epoch": 0.7644671576732301, + "flos": 22097123481600.0, + "grad_norm": 1.4736408355385546, + "language_loss": 0.73087925, + "learning_rate": 5.541373132311287e-07, + "loss": 0.80761683, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09741211, + "step": 12715, + "time_per_iteration": 2.4971745014190674 + }, + { + "auxiliary_loss_clip": 0.06399769, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06267397, + "balance_loss_mlp": 0.01256252, + "epoch": 0.7645272809258981, + "flos": 25488084764160.0, + "grad_norm": 1.606219528134415, + "language_loss": 0.63579881, + "learning_rate": 5.538682538304376e-07, + "loss": 0.71244764, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08868408, + "step": 12716, + "time_per_iteration": 2.5588536262512207 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01264337, + "balance_loss_clip": 0.06273409, + "balance_loss_mlp": 0.01254353, + "epoch": 0.7645874041785661, + "flos": 21548035676160.0, + "grad_norm": 1.605402904200963, + "language_loss": 0.80340159, + "learning_rate": 5.535992492672068e-07, + "loss": 0.88015091, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09991455, + "step": 12717, + "time_per_iteration": 2.4905505180358887 + }, + { + "auxiliary_loss_clip": 0.06401056, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01255342, + "epoch": 0.764647527431234, + "flos": 20637096013440.0, + "grad_norm": 2.3928982518870474, + "language_loss": 0.669339, + "learning_rate": 5.53330299551638e-07, + "loss": 0.74599743, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09448242, + "step": 12718, + "time_per_iteration": 2.492809772491455 + }, + { + "auxiliary_loss_clip": 0.06399414, + "auxiliary_loss_mlp": 0.01266678, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01257368, + "epoch": 0.764707650683902, + "flos": 21440490560640.0, + "grad_norm": 1.7155178939343805, + "language_loss": 0.77496254, + "learning_rate": 5.530614046939286e-07, + "loss": 0.85162342, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09301758, + "step": 12719, + "time_per_iteration": 2.5259573459625244 + }, + { + "auxiliary_loss_clip": 0.06404945, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7647677739365699, + "flos": 22717852128000.0, + "grad_norm": 1.9590152643999037, + "language_loss": 0.69958895, + "learning_rate": 5.527925647042754e-07, + "loss": 0.77628434, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09564209, + "step": 12720, + "time_per_iteration": 2.539653778076172 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01262819, + "balance_loss_clip": 0.06272593, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7648278971892379, + "flos": 21330429822720.0, + "grad_norm": 1.6704748814369004, + "language_loss": 0.73973656, + "learning_rate": 5.52523779592875e-07, + "loss": 0.81640649, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10107422, + "step": 12721, + "time_per_iteration": 2.501253128051758 + }, + { + "auxiliary_loss_clip": 0.06403898, + "auxiliary_loss_mlp": 0.01264362, + "balance_loss_clip": 0.06270562, + "balance_loss_mlp": 0.01254771, + "epoch": 0.764888020441906, + "flos": 20673545339520.0, + "grad_norm": 1.706168153440744, + "language_loss": 0.73528266, + "learning_rate": 5.522550493699163e-07, + "loss": 0.81196523, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09594727, + "step": 12722, + "time_per_iteration": 2.509871244430542 + }, + { + "auxiliary_loss_clip": 0.06399025, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06269681, + "balance_loss_mlp": 0.01256015, + "epoch": 0.7649481436945739, + "flos": 25089532018560.0, + "grad_norm": 1.7286135730297545, + "language_loss": 0.74329245, + "learning_rate": 5.519863740455912e-07, + "loss": 0.81993717, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09423828, + "step": 12723, + "time_per_iteration": 2.510096549987793 + }, + { + "auxiliary_loss_clip": 0.06404193, + "auxiliary_loss_mlp": 0.01262404, + "balance_loss_clip": 0.06269242, + "balance_loss_mlp": 0.01252688, + "epoch": 0.7650082669472419, + "flos": 24907998147840.0, + "grad_norm": 2.2850113448580958, + "language_loss": 0.73361677, + "learning_rate": 5.517177536300881e-07, + "loss": 0.81028277, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09710693, + "step": 12724, + "time_per_iteration": 2.5588150024414062 + }, + { + "auxiliary_loss_clip": 0.06401032, + "auxiliary_loss_mlp": 0.01264201, + "balance_loss_clip": 0.06271203, + "balance_loss_mlp": 0.01254885, + "epoch": 0.7650683901999098, + "flos": 14652614355840.0, + "grad_norm": 1.6932286249415067, + "language_loss": 0.84691983, + "learning_rate": 5.514491881335935e-07, + "loss": 0.92357218, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09320068, + "step": 12725, + "time_per_iteration": 2.4555823802948 + }, + { + "auxiliary_loss_clip": 0.06405662, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01260433, + "epoch": 0.7651285134525778, + "flos": 26358466250880.0, + "grad_norm": 1.7988072143781486, + "language_loss": 0.77533686, + "learning_rate": 5.511806775662901e-07, + "loss": 0.85210061, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10284424, + "step": 12726, + "time_per_iteration": 2.56742000579834 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01263268, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7651886367052457, + "flos": 26653373095680.0, + "grad_norm": 1.6652210765488402, + "language_loss": 0.70600379, + "learning_rate": 5.509122219383615e-07, + "loss": 0.78267229, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09240723, + "step": 12727, + "time_per_iteration": 2.5245282649993896 + }, + { + "auxiliary_loss_clip": 0.06395786, + "auxiliary_loss_mlp": 0.01263203, + "balance_loss_clip": 0.06267853, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7652487599579137, + "flos": 25709967175680.0, + "grad_norm": 1.6422371786213563, + "language_loss": 0.80038959, + "learning_rate": 5.506438212599864e-07, + "loss": 0.87697947, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09179688, + "step": 12728, + "time_per_iteration": 2.553881883621216 + }, + { + "auxiliary_loss_clip": 0.064078, + "auxiliary_loss_mlp": 0.01267492, + "balance_loss_clip": 0.0627337, + "balance_loss_mlp": 0.01257395, + "epoch": 0.7653088832105817, + "flos": 28593237369600.0, + "grad_norm": 1.6909382906919501, + "language_loss": 0.55773109, + "learning_rate": 5.503754755413424e-07, + "loss": 0.63448405, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10089111, + "step": 12729, + "time_per_iteration": 2.561567783355713 + }, + { + "auxiliary_loss_clip": 0.06402748, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7653690064632497, + "flos": 23373311091840.0, + "grad_norm": 1.5255211318254533, + "language_loss": 0.77756214, + "learning_rate": 5.501071847926055e-07, + "loss": 0.85425532, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10131836, + "step": 12730, + "time_per_iteration": 3.951883316040039 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01263677, + "balance_loss_clip": 0.06275389, + "balance_loss_mlp": 0.01253496, + "epoch": 0.7654291297159176, + "flos": 15778560395520.0, + "grad_norm": 1.5538691638081712, + "language_loss": 0.68886495, + "learning_rate": 5.498389490239495e-07, + "loss": 0.7656011, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10180664, + "step": 12731, + "time_per_iteration": 2.496400833129883 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06273277, + "balance_loss_mlp": 0.01255997, + "epoch": 0.7654892529685856, + "flos": 18038460539520.0, + "grad_norm": 1.970235991711743, + "language_loss": 0.70561087, + "learning_rate": 5.495707682455471e-07, + "loss": 0.78233999, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10174561, + "step": 12732, + "time_per_iteration": 2.4463298320770264 + }, + { + "auxiliary_loss_clip": 0.06407348, + "auxiliary_loss_mlp": 0.01267052, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01257009, + "epoch": 0.7655493762212535, + "flos": 27243522201600.0, + "grad_norm": 1.6975746826212326, + "language_loss": 0.7867943, + "learning_rate": 5.493026424675653e-07, + "loss": 0.86353827, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10040283, + "step": 12733, + "time_per_iteration": 2.5465524196624756 + }, + { + "auxiliary_loss_clip": 0.06404738, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06275003, + "balance_loss_mlp": 0.01254843, + "epoch": 0.7656094994739215, + "flos": 20779706862720.0, + "grad_norm": 1.7438651719482663, + "language_loss": 0.78086102, + "learning_rate": 5.490345717001726e-07, + "loss": 0.85755318, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09637451, + "step": 12734, + "time_per_iteration": 2.491992235183716 + }, + { + "auxiliary_loss_clip": 0.06409705, + "auxiliary_loss_mlp": 0.01265243, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7656696227265896, + "flos": 23045896062720.0, + "grad_norm": 1.5457458237043498, + "language_loss": 0.73303032, + "learning_rate": 5.48766555953535e-07, + "loss": 0.80977982, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1060791, + "step": 12735, + "time_per_iteration": 2.549952507019043 + }, + { + "auxiliary_loss_clip": 0.06403875, + "auxiliary_loss_mlp": 0.01265362, + "balance_loss_clip": 0.0627028, + "balance_loss_mlp": 0.01255956, + "epoch": 0.7657297459792575, + "flos": 27532810823040.0, + "grad_norm": 1.38702410103644, + "language_loss": 0.72968668, + "learning_rate": 5.484985952378145e-07, + "loss": 0.80637902, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09399414, + "step": 12736, + "time_per_iteration": 2.5478687286376953 + }, + { + "auxiliary_loss_clip": 0.06409203, + "auxiliary_loss_mlp": 0.0126645, + "balance_loss_clip": 0.06272754, + "balance_loss_mlp": 0.01255399, + "epoch": 0.7657898692319255, + "flos": 17134103422080.0, + "grad_norm": 1.7853161990922843, + "language_loss": 0.77847868, + "learning_rate": 5.482306895631728e-07, + "loss": 0.85523522, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12737, + "time_per_iteration": 2.517828941345215 + }, + { + "auxiliary_loss_clip": 0.06403487, + "auxiliary_loss_mlp": 0.01264987, + "balance_loss_clip": 0.06271316, + "balance_loss_mlp": 0.01254795, + "epoch": 0.7658499924845934, + "flos": 21471363590400.0, + "grad_norm": 1.7993008956393386, + "language_loss": 0.7689963, + "learning_rate": 5.479628389397699e-07, + "loss": 0.84568107, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10186768, + "step": 12738, + "time_per_iteration": 2.4858741760253906 + }, + { + "auxiliary_loss_clip": 0.06409841, + "auxiliary_loss_mlp": 0.01265376, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01254748, + "epoch": 0.7659101157372614, + "flos": 29504302813440.0, + "grad_norm": 1.7653019874765563, + "language_loss": 0.6329987, + "learning_rate": 5.476950433777603e-07, + "loss": 0.70975083, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 12739, + "time_per_iteration": 3.9952597618103027 + }, + { + "auxiliary_loss_clip": 0.06407788, + "auxiliary_loss_mlp": 0.0126759, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01256718, + "epoch": 0.7659702389899293, + "flos": 18557765418240.0, + "grad_norm": 1.7669010799995182, + "language_loss": 0.7909317, + "learning_rate": 5.474273028873004e-07, + "loss": 0.8676855, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10870361, + "step": 12740, + "time_per_iteration": 2.5115749835968018 + }, + { + "auxiliary_loss_clip": 0.06403244, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06271347, + "balance_loss_mlp": 0.01253853, + "epoch": 0.7660303622425974, + "flos": 23555767357440.0, + "grad_norm": 1.6620793532611546, + "language_loss": 0.65799433, + "learning_rate": 5.471596174785429e-07, + "loss": 0.73466468, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09924316, + "step": 12741, + "time_per_iteration": 2.55269718170166 + }, + { + "auxiliary_loss_clip": 0.06404097, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272512, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7660904854952653, + "flos": 18922761803520.0, + "grad_norm": 1.4348808707369967, + "language_loss": 0.76128972, + "learning_rate": 5.468919871616386e-07, + "loss": 0.83799386, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09832764, + "step": 12742, + "time_per_iteration": 3.9655463695526123 + }, + { + "auxiliary_loss_clip": 0.06397024, + "auxiliary_loss_mlp": 0.01262102, + "balance_loss_clip": 0.06269021, + "balance_loss_mlp": 0.01253274, + "epoch": 0.7661506087479333, + "flos": 23153986229760.0, + "grad_norm": 1.3105418877806154, + "language_loss": 0.76677555, + "learning_rate": 5.46624411946736e-07, + "loss": 0.84336686, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08831787, + "step": 12743, + "time_per_iteration": 2.4942922592163086 + }, + { + "auxiliary_loss_clip": 0.064053, + "auxiliary_loss_mlp": 0.01263354, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253918, + "epoch": 0.7662107320006012, + "flos": 17571411480960.0, + "grad_norm": 1.8622912064646877, + "language_loss": 0.75256228, + "learning_rate": 5.463568918439805e-07, + "loss": 0.82924885, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09442139, + "step": 12744, + "time_per_iteration": 2.500877618789673 + }, + { + "auxiliary_loss_clip": 0.06405517, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06271944, + "balance_loss_mlp": 0.01255078, + "epoch": 0.7662708552532692, + "flos": 22308524133120.0, + "grad_norm": 3.023764218410669, + "language_loss": 0.70912051, + "learning_rate": 5.460894268635181e-07, + "loss": 0.78582633, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09979248, + "step": 12745, + "time_per_iteration": 2.4632673263549805 + }, + { + "auxiliary_loss_clip": 0.06404217, + "auxiliary_loss_mlp": 0.01263005, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01252938, + "epoch": 0.7663309785059371, + "flos": 15747477730560.0, + "grad_norm": 2.4148009048873975, + "language_loss": 0.77143252, + "learning_rate": 5.458220170154896e-07, + "loss": 0.84810472, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10058594, + "step": 12746, + "time_per_iteration": 2.470808506011963 + }, + { + "auxiliary_loss_clip": 0.06317573, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06262261, + "balance_loss_mlp": 0.0125142, + "epoch": 0.7663911017586051, + "flos": 62184503877120.0, + "grad_norm": 0.6541980070594193, + "language_loss": 0.56711543, + "learning_rate": 5.455546623100362e-07, + "loss": 0.6428166, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01132202, + "step": 12747, + "time_per_iteration": 4.652554273605347 + }, + { + "auxiliary_loss_clip": 0.06402487, + "auxiliary_loss_mlp": 0.01263124, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.01254393, + "epoch": 0.7664512250112732, + "flos": 26513361722880.0, + "grad_norm": 1.4294052686303238, + "language_loss": 0.72911537, + "learning_rate": 5.452873627572956e-07, + "loss": 0.80577153, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08728027, + "step": 12748, + "time_per_iteration": 2.532306432723999 + }, + { + "auxiliary_loss_clip": 0.06404538, + "auxiliary_loss_mlp": 0.01268933, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01259348, + "epoch": 0.7665113482639411, + "flos": 16254497986560.0, + "grad_norm": 1.791719003468204, + "language_loss": 0.70015478, + "learning_rate": 5.450201183674052e-07, + "loss": 0.77688944, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0958252, + "step": 12749, + "time_per_iteration": 2.492206573486328 + }, + { + "auxiliary_loss_clip": 0.06405895, + "auxiliary_loss_mlp": 0.01264322, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01254136, + "epoch": 0.7665714715166091, + "flos": 27205102304640.0, + "grad_norm": 1.5075173450833508, + "language_loss": 0.73696417, + "learning_rate": 5.447529291504967e-07, + "loss": 0.81366634, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10180664, + "step": 12750, + "time_per_iteration": 2.6194586753845215 + }, + { + "auxiliary_loss_clip": 0.06403321, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.01255637, + "epoch": 0.766631594769277, + "flos": 21073900947840.0, + "grad_norm": 2.338667432338341, + "language_loss": 0.75889468, + "learning_rate": 5.444857951167026e-07, + "loss": 0.83557701, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09265137, + "step": 12751, + "time_per_iteration": 2.535900354385376 + }, + { + "auxiliary_loss_clip": 0.06405959, + "auxiliary_loss_mlp": 0.01265211, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255442, + "epoch": 0.766691718021945, + "flos": 24104897089920.0, + "grad_norm": 1.8024081309521767, + "language_loss": 0.61214471, + "learning_rate": 5.442187162761537e-07, + "loss": 0.68885642, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09759521, + "step": 12752, + "time_per_iteration": 2.520057439804077 + }, + { + "auxiliary_loss_clip": 0.06407845, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255452, + "epoch": 0.7667518412746129, + "flos": 23447383701120.0, + "grad_norm": 2.502768793247081, + "language_loss": 0.68991947, + "learning_rate": 5.439516926389767e-07, + "loss": 0.76665711, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10467529, + "step": 12753, + "time_per_iteration": 2.5649516582489014 + }, + { + "auxiliary_loss_clip": 0.06405421, + "auxiliary_loss_mlp": 0.01267269, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01257339, + "epoch": 0.766811964527281, + "flos": 18154391063040.0, + "grad_norm": 2.2031278091751103, + "language_loss": 0.62667269, + "learning_rate": 5.436847242152971e-07, + "loss": 0.7033996, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09936523, + "step": 12754, + "time_per_iteration": 2.4367518424987793 + }, + { + "auxiliary_loss_clip": 0.06402913, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06272813, + "balance_loss_mlp": 0.01253426, + "epoch": 0.7668720877799489, + "flos": 19542023003520.0, + "grad_norm": 2.343791341299276, + "language_loss": 0.80305493, + "learning_rate": 5.434178110152401e-07, + "loss": 0.87971884, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10040283, + "step": 12755, + "time_per_iteration": 2.4789938926696777 + }, + { + "auxiliary_loss_clip": 0.06403362, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7669322110326169, + "flos": 22680899677440.0, + "grad_norm": 1.9246427907733588, + "language_loss": 0.70196575, + "learning_rate": 5.431509530489242e-07, + "loss": 0.77866018, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09820557, + "step": 12756, + "time_per_iteration": 2.4842453002929688 + }, + { + "auxiliary_loss_clip": 0.06408253, + "auxiliary_loss_mlp": 0.01265925, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01256621, + "epoch": 0.7669923342852848, + "flos": 26476702761600.0, + "grad_norm": 1.4236493885684283, + "language_loss": 0.70190722, + "learning_rate": 5.428841503264706e-07, + "loss": 0.77864897, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09307861, + "step": 12757, + "time_per_iteration": 2.5436339378356934 + }, + { + "auxiliary_loss_clip": 0.06405462, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7670524575379528, + "flos": 22862643183360.0, + "grad_norm": 1.8472558815325884, + "language_loss": 0.76448315, + "learning_rate": 5.426174028579955e-07, + "loss": 0.84118211, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10345459, + "step": 12758, + "time_per_iteration": 2.4789509773254395 + }, + { + "auxiliary_loss_clip": 0.06399853, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01255576, + "epoch": 0.7671125807906207, + "flos": 22458136798080.0, + "grad_norm": 1.6508827422801604, + "language_loss": 0.76464295, + "learning_rate": 5.423507106536156e-07, + "loss": 0.84129202, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0947876, + "step": 12759, + "time_per_iteration": 2.5259945392608643 + }, + { + "auxiliary_loss_clip": 0.0640488, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06270535, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7671727040432887, + "flos": 35380275033600.0, + "grad_norm": 1.982345292184502, + "language_loss": 0.68377602, + "learning_rate": 5.420840737234425e-07, + "loss": 0.7604605, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09368896, + "step": 12760, + "time_per_iteration": 2.5982978343963623 + }, + { + "auxiliary_loss_clip": 0.06406338, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.06272851, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7672328272959568, + "flos": 22502007210240.0, + "grad_norm": 1.3719850689198565, + "language_loss": 0.79309064, + "learning_rate": 5.418174920775871e-07, + "loss": 0.86981302, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12761, + "time_per_iteration": 2.5480268001556396 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01258289, + "epoch": 0.7672929505486247, + "flos": 22821372247680.0, + "grad_norm": 2.021114982719017, + "language_loss": 0.66376638, + "learning_rate": 5.415509657261589e-07, + "loss": 0.74048305, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09674072, + "step": 12762, + "time_per_iteration": 2.487494707107544 + }, + { + "auxiliary_loss_clip": 0.06406671, + "auxiliary_loss_mlp": 0.01262822, + "balance_loss_clip": 0.06272823, + "balance_loss_mlp": 0.01253148, + "epoch": 0.7673530738012927, + "flos": 20344956353280.0, + "grad_norm": 1.669517530242866, + "language_loss": 0.74410594, + "learning_rate": 5.412844946792639e-07, + "loss": 0.82080084, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09667969, + "step": 12763, + "time_per_iteration": 2.50715970993042 + }, + { + "auxiliary_loss_clip": 0.06406026, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06275215, + "balance_loss_mlp": 0.01254836, + "epoch": 0.7674131970539606, + "flos": 34942212288000.0, + "grad_norm": 1.4115021004744182, + "language_loss": 0.70948029, + "learning_rate": 5.410180789470067e-07, + "loss": 0.78618985, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10089111, + "step": 12764, + "time_per_iteration": 2.625321388244629 + }, + { + "auxiliary_loss_clip": 0.06405284, + "auxiliary_loss_mlp": 0.0126607, + "balance_loss_clip": 0.06274922, + "balance_loss_mlp": 0.01256241, + "epoch": 0.7674733203066286, + "flos": 28336247297280.0, + "grad_norm": 1.6715058951392505, + "language_loss": 0.69761688, + "learning_rate": 5.40751718539491e-07, + "loss": 0.77433044, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0982666, + "step": 12765, + "time_per_iteration": 2.6227502822875977 + }, + { + "auxiliary_loss_clip": 0.06399858, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 0.06270436, + "balance_loss_mlp": 0.01252769, + "epoch": 0.7675334435592965, + "flos": 16295307724800.0, + "grad_norm": 1.8004519699404298, + "language_loss": 0.6087966, + "learning_rate": 5.404854134668162e-07, + "loss": 0.6854142, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 12766, + "time_per_iteration": 2.4817140102386475 + }, + { + "auxiliary_loss_clip": 0.06319875, + "auxiliary_loss_mlp": 0.01254158, + "balance_loss_clip": 0.06264514, + "balance_loss_mlp": 0.01252872, + "epoch": 0.7675935668119646, + "flos": 64847778376320.0, + "grad_norm": 0.7247432278410384, + "language_loss": 0.6077764, + "learning_rate": 5.402191637390803e-07, + "loss": 0.68351674, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01286316, + "step": 12767, + "time_per_iteration": 3.2508630752563477 + }, + { + "auxiliary_loss_clip": 0.06402268, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.0125668, + "epoch": 0.7676536900646325, + "flos": 22682157488640.0, + "grad_norm": 1.91918463694606, + "language_loss": 0.69715631, + "learning_rate": 5.399529693663801e-07, + "loss": 0.77383935, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09356689, + "step": 12768, + "time_per_iteration": 2.502361297607422 + }, + { + "auxiliary_loss_clip": 0.06411647, + "auxiliary_loss_mlp": 0.01267577, + "balance_loss_clip": 0.06273838, + "balance_loss_mlp": 0.01256729, + "epoch": 0.7677138133173005, + "flos": 26946393223680.0, + "grad_norm": 1.5949336757988604, + "language_loss": 0.70845366, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7852459, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10864258, + "step": 12769, + "time_per_iteration": 2.554861068725586 + }, + { + "auxiliary_loss_clip": 0.0641087, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06275321, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7677739365699684, + "flos": 23805336343680.0, + "grad_norm": 1.7985045785763099, + "language_loss": 0.80694544, + "learning_rate": 5.394207467264611e-07, + "loss": 0.88372779, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10174561, + "step": 12770, + "time_per_iteration": 3.9488418102264404 + }, + { + "auxiliary_loss_clip": 0.06402189, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06272912, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7678340598226364, + "flos": 34463423658240.0, + "grad_norm": 1.5007452698192065, + "language_loss": 0.78956687, + "learning_rate": 5.391547184794245e-07, + "loss": 0.86622107, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08868408, + "step": 12771, + "time_per_iteration": 2.5934486389160156 + }, + { + "auxiliary_loss_clip": 0.06403628, + "auxiliary_loss_mlp": 0.01263065, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01253487, + "epoch": 0.7678941830753043, + "flos": 23848493996160.0, + "grad_norm": 1.2517341680866723, + "language_loss": 0.68444574, + "learning_rate": 5.388887456277876e-07, + "loss": 0.76111269, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12772, + "time_per_iteration": 2.5651042461395264 + }, + { + "auxiliary_loss_clip": 0.06401607, + "auxiliary_loss_mlp": 0.01265845, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01256893, + "epoch": 0.7679543063279723, + "flos": 25417995223680.0, + "grad_norm": 1.427251107853352, + "language_loss": 0.73993248, + "learning_rate": 5.386228281816349e-07, + "loss": 0.816607, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08953857, + "step": 12773, + "time_per_iteration": 2.5750787258148193 + }, + { + "auxiliary_loss_clip": 0.0639642, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7680144295806404, + "flos": 27969448049280.0, + "grad_norm": 1.5249418922144822, + "language_loss": 0.81278884, + "learning_rate": 5.383569661510512e-07, + "loss": 0.88940001, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0814209, + "step": 12774, + "time_per_iteration": 2.549635648727417 + }, + { + "auxiliary_loss_clip": 0.06401657, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06272675, + "balance_loss_mlp": 0.01254757, + "epoch": 0.7680745528333083, + "flos": 20419112816640.0, + "grad_norm": 2.7097792481139122, + "language_loss": 0.69999617, + "learning_rate": 5.380911595461177e-07, + "loss": 0.77665365, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09338379, + "step": 12775, + "time_per_iteration": 2.502872943878174 + }, + { + "auxiliary_loss_clip": 0.06317612, + "auxiliary_loss_mlp": 0.01254016, + "balance_loss_clip": 0.0626227, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7681346760859763, + "flos": 68423124568320.0, + "grad_norm": 0.6822831430052362, + "language_loss": 0.5694207, + "learning_rate": 5.378254083769147e-07, + "loss": 0.64513695, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01306915, + "step": 12776, + "time_per_iteration": 3.1927366256713867 + }, + { + "auxiliary_loss_clip": 0.0640178, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 0.06271428, + "balance_loss_mlp": 0.01255545, + "epoch": 0.7681947993386442, + "flos": 21257824659840.0, + "grad_norm": 1.8462760284119832, + "language_loss": 0.74373579, + "learning_rate": 5.375597126535188e-07, + "loss": 0.8204, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12777, + "time_per_iteration": 2.5175979137420654 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.0126398, + "balance_loss_clip": 0.06275662, + "balance_loss_mlp": 0.01254837, + "epoch": 0.7682549225913122, + "flos": 21404125088640.0, + "grad_norm": 1.9483232393983472, + "language_loss": 0.70101172, + "learning_rate": 5.372940723860043e-07, + "loss": 0.77773219, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09143066, + "step": 12778, + "time_per_iteration": 2.6068058013916016 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7683150458439801, + "flos": 23045518719360.0, + "grad_norm": 1.8309114800353317, + "language_loss": 0.70335215, + "learning_rate": 5.37028487584446e-07, + "loss": 0.7800526, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09619141, + "step": 12779, + "time_per_iteration": 4.003666639328003 + }, + { + "auxiliary_loss_clip": 0.0640587, + "auxiliary_loss_mlp": 0.01265519, + "balance_loss_clip": 0.062737, + "balance_loss_mlp": 0.01255898, + "epoch": 0.7683751690966482, + "flos": 67346361204480.0, + "grad_norm": 1.5118738364126798, + "language_loss": 0.58973181, + "learning_rate": 5.367629582589133e-07, + "loss": 0.66644573, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09619141, + "step": 12780, + "time_per_iteration": 2.915029525756836 + }, + { + "auxiliary_loss_clip": 0.06409752, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.06273384, + "balance_loss_mlp": 0.01258587, + "epoch": 0.7684352923493161, + "flos": 21805361164800.0, + "grad_norm": 2.2303773736896373, + "language_loss": 0.68361402, + "learning_rate": 5.364974844194759e-07, + "loss": 0.7603963, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12781, + "time_per_iteration": 4.043205976486206 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.01254428, + "epoch": 0.7684954156019841, + "flos": 25854548595840.0, + "grad_norm": 1.651939170673441, + "language_loss": 0.79629219, + "learning_rate": 5.362320660762016e-07, + "loss": 0.87297314, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0949707, + "step": 12782, + "time_per_iteration": 2.5380043983459473 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01263775, + "balance_loss_clip": 0.06272779, + "balance_loss_mlp": 0.01253719, + "epoch": 0.768555538854652, + "flos": 25454444549760.0, + "grad_norm": 1.9972993449433587, + "language_loss": 0.66687256, + "learning_rate": 5.35966703239153e-07, + "loss": 0.74357939, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10058594, + "step": 12783, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01262671, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01253069, + "epoch": 0.76861566210732, + "flos": 19652503011840.0, + "grad_norm": 1.5789937278772177, + "language_loss": 0.69208997, + "learning_rate": 5.357013959183938e-07, + "loss": 0.7687813, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 12784, + "time_per_iteration": 2.5100221633911133 + }, + { + "auxiliary_loss_clip": 0.06402996, + "auxiliary_loss_mlp": 0.01264042, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7686757853599879, + "flos": 22425586686720.0, + "grad_norm": 2.2747197635366074, + "language_loss": 0.80762935, + "learning_rate": 5.354361441239843e-07, + "loss": 0.88429976, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08551025, + "step": 12785, + "time_per_iteration": 2.4869916439056396 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.01255506, + "epoch": 0.768735908612656, + "flos": 47784659690880.0, + "grad_norm": 2.213863326437895, + "language_loss": 0.7748611, + "learning_rate": 5.351709478659836e-07, + "loss": 0.85155928, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10137939, + "step": 12786, + "time_per_iteration": 2.7327218055725098 + }, + { + "auxiliary_loss_clip": 0.06400453, + "auxiliary_loss_mlp": 0.01264363, + "balance_loss_clip": 0.06269495, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7687960318653239, + "flos": 30270996472320.0, + "grad_norm": 1.9359041928849132, + "language_loss": 0.58734947, + "learning_rate": 5.349058071544468e-07, + "loss": 0.66399765, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09515381, + "step": 12787, + "time_per_iteration": 4.117979288101196 + }, + { + "auxiliary_loss_clip": 0.06401558, + "auxiliary_loss_mlp": 0.01264466, + "balance_loss_clip": 0.06272475, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7688561551179919, + "flos": 19579562432640.0, + "grad_norm": 1.5619171139299415, + "language_loss": 0.76386726, + "learning_rate": 5.346407219994292e-07, + "loss": 0.84052753, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0949707, + "step": 12788, + "time_per_iteration": 2.5265915393829346 + }, + { + "auxiliary_loss_clip": 0.06405907, + "auxiliary_loss_mlp": 0.0126463, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7689162783706599, + "flos": 22790373436800.0, + "grad_norm": 1.5307962602577754, + "language_loss": 0.666574, + "learning_rate": 5.343756924109821e-07, + "loss": 0.74327934, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09844971, + "step": 12789, + "time_per_iteration": 2.5482897758483887 + }, + { + "auxiliary_loss_clip": 0.06407897, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01258842, + "epoch": 0.7689764016233278, + "flos": 34212764568960.0, + "grad_norm": 1.7716505240879148, + "language_loss": 0.68803114, + "learning_rate": 5.341107183991553e-07, + "loss": 0.76480138, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10290527, + "step": 12790, + "time_per_iteration": 2.6209323406219482 + }, + { + "auxiliary_loss_clip": 0.06403899, + "auxiliary_loss_mlp": 0.01263088, + "balance_loss_clip": 0.0627263, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7690365248759958, + "flos": 17280152288640.0, + "grad_norm": 1.3993850053379062, + "language_loss": 0.68957317, + "learning_rate": 5.338457999739969e-07, + "loss": 0.76624304, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09265137, + "step": 12791, + "time_per_iteration": 2.5464963912963867 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01255418, + "epoch": 0.7690966481286637, + "flos": 18229008723840.0, + "grad_norm": 1.5956237198168277, + "language_loss": 0.79798484, + "learning_rate": 5.335809371455526e-07, + "loss": 0.87464273, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09448242, + "step": 12792, + "time_per_iteration": 2.489346981048584 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01258999, + "epoch": 0.7691567713813318, + "flos": 21543004431360.0, + "grad_norm": 1.8308011822945844, + "language_loss": 0.73121727, + "learning_rate": 5.333161299238673e-07, + "loss": 0.80801225, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09814453, + "step": 12793, + "time_per_iteration": 2.558523416519165 + }, + { + "auxiliary_loss_clip": 0.06407025, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7692168946339997, + "flos": 39388568872320.0, + "grad_norm": 1.7835594774438226, + "language_loss": 0.63780582, + "learning_rate": 5.330513783189803e-07, + "loss": 0.7145232, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1003418, + "step": 12794, + "time_per_iteration": 2.6618335247039795 + }, + { + "auxiliary_loss_clip": 0.06408365, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06273225, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7692770178866677, + "flos": 25017010709760.0, + "grad_norm": 1.4664054108250584, + "language_loss": 0.76531231, + "learning_rate": 5.327866823409319e-07, + "loss": 0.84205556, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09619141, + "step": 12795, + "time_per_iteration": 2.5922963619232178 + }, + { + "auxiliary_loss_clip": 0.0640534, + "auxiliary_loss_mlp": 0.01263991, + "balance_loss_clip": 0.0627051, + "balance_loss_mlp": 0.01253453, + "epoch": 0.7693371411393356, + "flos": 24722984332800.0, + "grad_norm": 1.4884281283084904, + "language_loss": 0.72098613, + "learning_rate": 5.325220419997601e-07, + "loss": 0.79767948, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10540771, + "step": 12796, + "time_per_iteration": 2.5227742195129395 + }, + { + "auxiliary_loss_clip": 0.06403993, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01255994, + "epoch": 0.7693972643920036, + "flos": 15930311339520.0, + "grad_norm": 1.7278751632986438, + "language_loss": 0.64795017, + "learning_rate": 5.32257457305499e-07, + "loss": 0.72464675, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09667969, + "step": 12797, + "time_per_iteration": 2.503452777862549 + }, + { + "auxiliary_loss_clip": 0.06409369, + "auxiliary_loss_mlp": 0.0127561, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01264798, + "epoch": 0.7694573876446715, + "flos": 25412125438080.0, + "grad_norm": 1.8485649321852773, + "language_loss": 0.91645068, + "learning_rate": 5.319929282681823e-07, + "loss": 0.9933005, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10809326, + "step": 12798, + "time_per_iteration": 2.5266406536102295 + }, + { + "auxiliary_loss_clip": 0.06401522, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06268082, + "balance_loss_mlp": 0.01256489, + "epoch": 0.7695175108973396, + "flos": 16659800985600.0, + "grad_norm": 1.7639360291305515, + "language_loss": 0.82879943, + "learning_rate": 5.317284548978418e-07, + "loss": 0.90547353, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09387207, + "step": 12799, + "time_per_iteration": 2.4981637001037598 + }, + { + "auxiliary_loss_clip": 0.06404725, + "auxiliary_loss_mlp": 0.01268019, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01257862, + "epoch": 0.7695776341500075, + "flos": 13631697809280.0, + "grad_norm": 2.5788494866617513, + "language_loss": 0.78243637, + "learning_rate": 5.314640372045045e-07, + "loss": 0.85916382, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12800, + "time_per_iteration": 2.472907304763794 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270645, + "balance_loss_mlp": 0.01256182, + "epoch": 0.7696377574026755, + "flos": 24283034870400.0, + "grad_norm": 1.8264730167588297, + "language_loss": 0.84045184, + "learning_rate": 5.31199675198198e-07, + "loss": 0.9172219, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10620117, + "step": 12801, + "time_per_iteration": 2.53623366355896 + }, + { + "auxiliary_loss_clip": 0.06406119, + "auxiliary_loss_mlp": 0.01267538, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01257495, + "epoch": 0.7696978806553435, + "flos": 20929445308800.0, + "grad_norm": 1.8709548721646438, + "language_loss": 0.73054564, + "learning_rate": 5.30935368888947e-07, + "loss": 0.80728221, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1003418, + "step": 12802, + "time_per_iteration": 2.4759271144866943 + }, + { + "auxiliary_loss_clip": 0.06399865, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271532, + "balance_loss_mlp": 0.01255757, + "epoch": 0.7697580039080114, + "flos": 22936212668160.0, + "grad_norm": 1.8081953162086668, + "language_loss": 0.76470077, + "learning_rate": 5.306711182867747e-07, + "loss": 0.84135199, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.0949707, + "step": 12803, + "time_per_iteration": 2.5474445819854736 + }, + { + "auxiliary_loss_clip": 0.06313179, + "auxiliary_loss_mlp": 0.01253049, + "balance_loss_clip": 0.06258132, + "balance_loss_mlp": 0.01251863, + "epoch": 0.7698181271606794, + "flos": 68737751850240.0, + "grad_norm": 0.742546771949619, + "language_loss": 0.55879092, + "learning_rate": 5.304069234017001e-07, + "loss": 0.63445318, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01184082, + "step": 12804, + "time_per_iteration": 3.1489827632904053 + }, + { + "auxiliary_loss_clip": 0.06316254, + "auxiliary_loss_mlp": 0.0125264, + "balance_loss_clip": 0.0626114, + "balance_loss_mlp": 0.01251505, + "epoch": 0.7698782504133473, + "flos": 67430523502080.0, + "grad_norm": 0.7295540312789194, + "language_loss": 0.53939354, + "learning_rate": 5.301427842437429e-07, + "loss": 0.61508244, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0113678, + "step": 12805, + "time_per_iteration": 3.2659192085266113 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01270733, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01261047, + "epoch": 0.7699383736660154, + "flos": 22494879613440.0, + "grad_norm": 3.06352805467247, + "language_loss": 0.73035467, + "learning_rate": 5.298787008229187e-07, + "loss": 0.80711341, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09686279, + "step": 12806, + "time_per_iteration": 2.4905054569244385 + }, + { + "auxiliary_loss_clip": 0.06401073, + "auxiliary_loss_mlp": 0.01266133, + "balance_loss_clip": 0.06269582, + "balance_loss_mlp": 0.01256704, + "epoch": 0.7699984969186833, + "flos": 21545520053760.0, + "grad_norm": 1.6739965963260217, + "language_loss": 0.75159943, + "learning_rate": 5.296146731492408e-07, + "loss": 0.82827145, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09423828, + "step": 12807, + "time_per_iteration": 2.5074682235717773 + }, + { + "auxiliary_loss_clip": 0.06406098, + "auxiliary_loss_mlp": 0.01264768, + "balance_loss_clip": 0.0626993, + "balance_loss_mlp": 0.01254098, + "epoch": 0.7700586201713513, + "flos": 21724412520960.0, + "grad_norm": 2.037865665188592, + "language_loss": 0.8067742, + "learning_rate": 5.293507012327218e-07, + "loss": 0.88348287, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10681152, + "step": 12808, + "time_per_iteration": 3.8791632652282715 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01266704, + "balance_loss_clip": 0.06271963, + "balance_loss_mlp": 0.01256595, + "epoch": 0.7701187434240192, + "flos": 27863580015360.0, + "grad_norm": 1.7006184108687237, + "language_loss": 0.7921378, + "learning_rate": 5.290867850833718e-07, + "loss": 0.8688817, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10113525, + "step": 12809, + "time_per_iteration": 2.5961480140686035 + }, + { + "auxiliary_loss_clip": 0.06399591, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.06270431, + "balance_loss_mlp": 0.01254594, + "epoch": 0.7701788666766872, + "flos": 28628848154880.0, + "grad_norm": 1.4421816702879584, + "language_loss": 0.70197344, + "learning_rate": 5.288229247111993e-07, + "loss": 0.77861011, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.0947876, + "step": 12810, + "time_per_iteration": 2.6107945442199707 + }, + { + "auxiliary_loss_clip": 0.06406891, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06271058, + "balance_loss_mlp": 0.01254769, + "epoch": 0.7702389899293551, + "flos": 14251671768960.0, + "grad_norm": 2.2769003713635967, + "language_loss": 0.78979844, + "learning_rate": 5.285591201262079e-07, + "loss": 0.8665303, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11523438, + "step": 12811, + "time_per_iteration": 2.555101156234741 + }, + { + "auxiliary_loss_clip": 0.06317817, + "auxiliary_loss_mlp": 0.01251839, + "balance_loss_clip": 0.06262816, + "balance_loss_mlp": 0.01250771, + "epoch": 0.7702991131820232, + "flos": 70593816441600.0, + "grad_norm": 0.7969175673938892, + "language_loss": 0.56677693, + "learning_rate": 5.28295371338402e-07, + "loss": 0.64247346, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01069641, + "step": 12812, + "time_per_iteration": 3.1775879859924316 + }, + { + "auxiliary_loss_clip": 0.06404653, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.0627086, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7703592364346911, + "flos": 25486449609600.0, + "grad_norm": 1.6911953299431426, + "language_loss": 0.72016954, + "learning_rate": 5.280316783577836e-07, + "loss": 0.79686838, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10327148, + "step": 12813, + "time_per_iteration": 2.525716781616211 + }, + { + "auxiliary_loss_clip": 0.06403896, + "auxiliary_loss_mlp": 0.01265029, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01254962, + "epoch": 0.7704193596873591, + "flos": 19286877720960.0, + "grad_norm": 1.5106493285856717, + "language_loss": 0.66542912, + "learning_rate": 5.27768041194351e-07, + "loss": 0.74211836, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12814, + "time_per_iteration": 2.511730432510376 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01267694, + "balance_loss_clip": 0.06271755, + "balance_loss_mlp": 0.01258288, + "epoch": 0.7704794829400271, + "flos": 23665031481600.0, + "grad_norm": 1.765991608700586, + "language_loss": 0.65916228, + "learning_rate": 5.275044598581018e-07, + "loss": 0.73587441, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09399414, + "step": 12815, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.06402738, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06270017, + "balance_loss_mlp": 0.01253324, + "epoch": 0.770539606192695, + "flos": 18995283112320.0, + "grad_norm": 3.1094364137223325, + "language_loss": 0.65588892, + "learning_rate": 5.272409343590322e-07, + "loss": 0.73254538, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0958252, + "step": 12816, + "time_per_iteration": 2.5682597160339355 + }, + { + "auxiliary_loss_clip": 0.06410483, + "auxiliary_loss_mlp": 0.01271453, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01261321, + "epoch": 0.770599729445363, + "flos": 11833605843840.0, + "grad_norm": 2.2637093644731685, + "language_loss": 0.72246104, + "learning_rate": 5.26977464707133e-07, + "loss": 0.79928041, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10131836, + "step": 12817, + "time_per_iteration": 2.485805034637451 + }, + { + "auxiliary_loss_clip": 0.06404669, + "auxiliary_loss_mlp": 0.01264386, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.01254677, + "epoch": 0.770659852698031, + "flos": 17828527334400.0, + "grad_norm": 3.0609511184199523, + "language_loss": 0.61409748, + "learning_rate": 5.267140509123957e-07, + "loss": 0.69078803, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0970459, + "step": 12818, + "time_per_iteration": 2.487680673599243 + }, + { + "auxiliary_loss_clip": 0.06399722, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.062704, + "balance_loss_mlp": 0.01253603, + "epoch": 0.770719975950699, + "flos": 21878469452160.0, + "grad_norm": 1.7396688274909713, + "language_loss": 0.67373377, + "learning_rate": 5.264506929848093e-07, + "loss": 0.75035375, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08676147, + "step": 12819, + "time_per_iteration": 3.9379172325134277 + }, + { + "auxiliary_loss_clip": 0.06406172, + "auxiliary_loss_mlp": 0.01263778, + "balance_loss_clip": 0.06271698, + "balance_loss_mlp": 0.01253848, + "epoch": 0.7707800992033669, + "flos": 21331519925760.0, + "grad_norm": 1.7217491542401215, + "language_loss": 0.57604039, + "learning_rate": 5.261873909343608e-07, + "loss": 0.65273988, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09924316, + "step": 12820, + "time_per_iteration": 2.495925188064575 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01262498, + "balance_loss_clip": 0.06269978, + "balance_loss_mlp": 0.01252735, + "epoch": 0.7708402224560349, + "flos": 28186215361920.0, + "grad_norm": 1.643911762743471, + "language_loss": 0.81179225, + "learning_rate": 5.259241447710343e-07, + "loss": 0.88846403, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09771729, + "step": 12821, + "time_per_iteration": 3.986278772354126 + }, + { + "auxiliary_loss_clip": 0.06404622, + "auxiliary_loss_mlp": 0.012636, + "balance_loss_clip": 0.06271188, + "balance_loss_mlp": 0.01253521, + "epoch": 0.7709003457087028, + "flos": 15382397491200.0, + "grad_norm": 1.8555601189743978, + "language_loss": 0.68379205, + "learning_rate": 5.256609545048114e-07, + "loss": 0.76047421, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10076904, + "step": 12822, + "time_per_iteration": 2.4856462478637695 + }, + { + "auxiliary_loss_clip": 0.06400201, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01256786, + "epoch": 0.7709604689613708, + "flos": 30628697552640.0, + "grad_norm": 2.043450133419636, + "language_loss": 0.72353333, + "learning_rate": 5.253978201456733e-07, + "loss": 0.80019963, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09637451, + "step": 12823, + "time_per_iteration": 2.5663697719573975 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126507, + "balance_loss_clip": 0.06270947, + "balance_loss_mlp": 0.01254437, + "epoch": 0.7710205922140387, + "flos": 20307207288960.0, + "grad_norm": 1.6756825279286318, + "language_loss": 0.76604235, + "learning_rate": 5.251347417035969e-07, + "loss": 0.84277976, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10632324, + "step": 12824, + "time_per_iteration": 2.5135273933410645 + }, + { + "auxiliary_loss_clip": 0.0640358, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01255332, + "epoch": 0.7710807154667068, + "flos": 19649987389440.0, + "grad_norm": 2.8682033137355605, + "language_loss": 0.72291267, + "learning_rate": 5.248717191885592e-07, + "loss": 0.79959786, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0960083, + "step": 12825, + "time_per_iteration": 2.539870262145996 + }, + { + "auxiliary_loss_clip": 0.06397466, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01257602, + "epoch": 0.7711408387193747, + "flos": 20011713465600.0, + "grad_norm": 1.348856880561093, + "language_loss": 0.73990041, + "learning_rate": 5.246087526105343e-07, + "loss": 0.8165428, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0916748, + "step": 12826, + "time_per_iteration": 3.9455349445343018 + }, + { + "auxiliary_loss_clip": 0.06404951, + "auxiliary_loss_mlp": 0.012643, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253554, + "epoch": 0.7712009619720427, + "flos": 24977794199040.0, + "grad_norm": 1.495331253862981, + "language_loss": 0.81176156, + "learning_rate": 5.243458419794933e-07, + "loss": 0.88845408, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10748291, + "step": 12827, + "time_per_iteration": 2.5489249229431152 + }, + { + "auxiliary_loss_clip": 0.0631479, + "auxiliary_loss_mlp": 0.01256103, + "balance_loss_clip": 0.06259546, + "balance_loss_mlp": 0.01255053, + "epoch": 0.7712610852247107, + "flos": 63269682105600.0, + "grad_norm": 0.8475476558719117, + "language_loss": 0.55242074, + "learning_rate": 5.240829873054051e-07, + "loss": 0.6281296, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01051331, + "step": 12828, + "time_per_iteration": 3.2874319553375244 + }, + { + "auxiliary_loss_clip": 0.06395887, + "auxiliary_loss_mlp": 0.01264145, + "balance_loss_clip": 0.06267989, + "balance_loss_mlp": 0.01255317, + "epoch": 0.7713212084773786, + "flos": 18703856211840.0, + "grad_norm": 1.6628752588878346, + "language_loss": 0.69472146, + "learning_rate": 5.23820188598238e-07, + "loss": 0.77132177, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08825684, + "step": 12829, + "time_per_iteration": 2.5006113052368164 + }, + { + "auxiliary_loss_clip": 0.06407359, + "auxiliary_loss_mlp": 0.01263662, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7713813317300466, + "flos": 14178563481600.0, + "grad_norm": 2.5004318889819146, + "language_loss": 0.79485464, + "learning_rate": 5.235574458679579e-07, + "loss": 0.87156487, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10235596, + "step": 12830, + "time_per_iteration": 2.455521821975708 + }, + { + "auxiliary_loss_clip": 0.06408571, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 0.06271582, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7714414549827145, + "flos": 25711266913920.0, + "grad_norm": 1.5558349458942582, + "language_loss": 0.78193223, + "learning_rate": 5.232947591245269e-07, + "loss": 0.85867554, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10906982, + "step": 12831, + "time_per_iteration": 2.55888295173645 + }, + { + "auxiliary_loss_clip": 0.06400928, + "auxiliary_loss_mlp": 0.01266262, + "balance_loss_clip": 0.06268953, + "balance_loss_mlp": 0.01256547, + "epoch": 0.7715015782353826, + "flos": 30563219986560.0, + "grad_norm": 1.4404933685883998, + "language_loss": 0.61150742, + "learning_rate": 5.230321283779071e-07, + "loss": 0.68817931, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0970459, + "step": 12832, + "time_per_iteration": 2.5705411434173584 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01268082, + "balance_loss_clip": 0.06271287, + "balance_loss_mlp": 0.01258271, + "epoch": 0.7715617014880505, + "flos": 20235440666880.0, + "grad_norm": 1.4904530814793735, + "language_loss": 0.79785657, + "learning_rate": 5.227695536380572e-07, + "loss": 0.87462032, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09814453, + "step": 12833, + "time_per_iteration": 2.5475685596466064 + }, + { + "auxiliary_loss_clip": 0.06315958, + "auxiliary_loss_mlp": 0.01251107, + "balance_loss_clip": 0.06260836, + "balance_loss_mlp": 0.01250079, + "epoch": 0.7716218247407185, + "flos": 63681037326720.0, + "grad_norm": 0.8315874052432679, + "language_loss": 0.55088067, + "learning_rate": 5.22507034914933e-07, + "loss": 0.62655127, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01027679, + "step": 12834, + "time_per_iteration": 3.1191012859344482 + }, + { + "auxiliary_loss_clip": 0.0640831, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06273386, + "balance_loss_mlp": 0.01254294, + "epoch": 0.7716819479933864, + "flos": 19797881045760.0, + "grad_norm": 2.410723884633937, + "language_loss": 0.73350394, + "learning_rate": 5.222445722184903e-07, + "loss": 0.81023002, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09997559, + "step": 12835, + "time_per_iteration": 2.5506582260131836 + }, + { + "auxiliary_loss_clip": 0.06406028, + "auxiliary_loss_mlp": 0.01267171, + "balance_loss_clip": 0.06272173, + "balance_loss_mlp": 0.01257884, + "epoch": 0.7717420712460544, + "flos": 18448082023680.0, + "grad_norm": 2.0308771684786113, + "language_loss": 0.70508468, + "learning_rate": 5.219821655586814e-07, + "loss": 0.78181666, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09289551, + "step": 12836, + "time_per_iteration": 2.5232300758361816 + }, + { + "auxiliary_loss_clip": 0.06398998, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06270441, + "balance_loss_mlp": 0.01259222, + "epoch": 0.7718021944987223, + "flos": 35198238038400.0, + "grad_norm": 1.831037228573652, + "language_loss": 0.60367215, + "learning_rate": 5.217198149454575e-07, + "loss": 0.68034947, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09509277, + "step": 12837, + "time_per_iteration": 2.6591076850891113 + }, + { + "auxiliary_loss_clip": 0.06317183, + "auxiliary_loss_mlp": 0.01257562, + "balance_loss_clip": 0.0626177, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7718623177513904, + "flos": 67944503646720.0, + "grad_norm": 0.8462887217652507, + "language_loss": 0.55739456, + "learning_rate": 5.214575203887666e-07, + "loss": 0.63314199, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01081848, + "step": 12838, + "time_per_iteration": 3.0941390991210938 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01264762, + "balance_loss_clip": 0.06271369, + "balance_loss_mlp": 0.01255345, + "epoch": 0.7719224410040583, + "flos": 18586206679680.0, + "grad_norm": 2.2960724340178156, + "language_loss": 0.69924515, + "learning_rate": 5.211952818985538e-07, + "loss": 0.77591836, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09411621, + "step": 12839, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.06401128, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.0627085, + "balance_loss_mlp": 0.01253893, + "epoch": 0.7719825642567263, + "flos": 23082471169920.0, + "grad_norm": 1.724099382102015, + "language_loss": 0.79996341, + "learning_rate": 5.209330994847647e-07, + "loss": 0.87660646, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09277344, + "step": 12840, + "time_per_iteration": 2.494185447692871 + }, + { + "auxiliary_loss_clip": 0.0640455, + "auxiliary_loss_mlp": 0.01263769, + "balance_loss_clip": 0.06271051, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7720426875093943, + "flos": 20345249842560.0, + "grad_norm": 1.700648368789641, + "language_loss": 0.80246019, + "learning_rate": 5.206709731573402e-07, + "loss": 0.87914336, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09564209, + "step": 12841, + "time_per_iteration": 2.4959654808044434 + }, + { + "auxiliary_loss_clip": 0.06402302, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01254261, + "epoch": 0.7721028107620622, + "flos": 23887878215040.0, + "grad_norm": 1.6460484096163284, + "language_loss": 0.76556861, + "learning_rate": 5.204089029262208e-07, + "loss": 0.84222806, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 12842, + "time_per_iteration": 2.5414130687713623 + }, + { + "auxiliary_loss_clip": 0.06408067, + "auxiliary_loss_mlp": 0.0126426, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7721629340147302, + "flos": 26658865537920.0, + "grad_norm": 1.6198153669730124, + "language_loss": 0.68824613, + "learning_rate": 5.201468888013445e-07, + "loss": 0.76496947, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09735107, + "step": 12843, + "time_per_iteration": 2.555246353149414 + }, + { + "auxiliary_loss_clip": 0.06407151, + "auxiliary_loss_mlp": 0.01263842, + "balance_loss_clip": 0.06270268, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7722230572673981, + "flos": 21185261424000.0, + "grad_norm": 1.9549573678277232, + "language_loss": 0.73833585, + "learning_rate": 5.198849307926465e-07, + "loss": 0.81504577, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09472656, + "step": 12844, + "time_per_iteration": 2.475722312927246 + }, + { + "auxiliary_loss_clip": 0.06400653, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01253327, + "epoch": 0.7722831805200662, + "flos": 27972089452800.0, + "grad_norm": 1.4105737815374062, + "language_loss": 0.71880949, + "learning_rate": 5.196230289100596e-07, + "loss": 0.79544067, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09143066, + "step": 12845, + "time_per_iteration": 2.537477493286133 + }, + { + "auxiliary_loss_clip": 0.06397612, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06268354, + "balance_loss_mlp": 0.01257095, + "epoch": 0.7723433037727341, + "flos": 33884049801600.0, + "grad_norm": 1.693366944822723, + "language_loss": 0.64408147, + "learning_rate": 5.193611831635159e-07, + "loss": 0.72071993, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 12846, + "time_per_iteration": 2.5818498134613037 + }, + { + "auxiliary_loss_clip": 0.06312131, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06256564, + "balance_loss_mlp": 0.01252078, + "epoch": 0.7724034270254021, + "flos": 62868194467200.0, + "grad_norm": 0.7376748551210195, + "language_loss": 0.61336023, + "learning_rate": 5.19099393562945e-07, + "loss": 0.68901265, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01026917, + "step": 12847, + "time_per_iteration": 3.0541763305664062 + }, + { + "auxiliary_loss_clip": 0.06401889, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06268549, + "balance_loss_mlp": 0.01254983, + "epoch": 0.77246355027807, + "flos": 23302299156480.0, + "grad_norm": 1.5812634929817273, + "language_loss": 0.79369843, + "learning_rate": 5.188376601182732e-07, + "loss": 0.8703624, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09527588, + "step": 12848, + "time_per_iteration": 3.9165518283843994 + }, + { + "auxiliary_loss_clip": 0.06404726, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06268495, + "balance_loss_mlp": 0.01257086, + "epoch": 0.772523673530738, + "flos": 20127602062080.0, + "grad_norm": 1.566706530012109, + "language_loss": 0.73342961, + "learning_rate": 5.185759828394261e-07, + "loss": 0.81014597, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.0982666, + "step": 12849, + "time_per_iteration": 2.476515293121338 + }, + { + "auxiliary_loss_clip": 0.06402398, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06268849, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7725837967834059, + "flos": 17825592441600.0, + "grad_norm": 2.2364064713439156, + "language_loss": 0.78424966, + "learning_rate": 5.183143617363261e-07, + "loss": 0.86091167, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 12850, + "time_per_iteration": 2.4794983863830566 + }, + { + "auxiliary_loss_clip": 0.0640396, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06267555, + "balance_loss_mlp": 0.01256616, + "epoch": 0.772643920036074, + "flos": 27206318188800.0, + "grad_norm": 1.5059914394205691, + "language_loss": 0.80266678, + "learning_rate": 5.180527968188935e-07, + "loss": 0.87937486, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 12851, + "time_per_iteration": 2.5322558879852295 + }, + { + "auxiliary_loss_clip": 0.06400898, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270088, + "balance_loss_mlp": 0.01253193, + "epoch": 0.7727040432887419, + "flos": 21585868594560.0, + "grad_norm": 1.7096231270301345, + "language_loss": 0.73980415, + "learning_rate": 5.177912880970474e-07, + "loss": 0.81644481, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09973145, + "step": 12852, + "time_per_iteration": 2.5234642028808594 + }, + { + "auxiliary_loss_clip": 0.06399091, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01255685, + "epoch": 0.7727641665414099, + "flos": 22243172348160.0, + "grad_norm": 1.8458923236919589, + "language_loss": 0.82645077, + "learning_rate": 5.17529835580704e-07, + "loss": 0.90309083, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09222412, + "step": 12853, + "time_per_iteration": 2.4855525493621826 + }, + { + "auxiliary_loss_clip": 0.06312872, + "auxiliary_loss_mlp": 0.01252237, + "balance_loss_clip": 0.06257433, + "balance_loss_mlp": 0.01251258, + "epoch": 0.7728242897940779, + "flos": 54852613038720.0, + "grad_norm": 0.7809207037354382, + "language_loss": 0.54245615, + "learning_rate": 5.172684392797786e-07, + "loss": 0.6181072, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00978088, + "step": 12854, + "time_per_iteration": 3.1956636905670166 + }, + { + "auxiliary_loss_clip": 0.06408576, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06272317, + "balance_loss_mlp": 0.01255667, + "epoch": 0.7728844130467458, + "flos": 34470970525440.0, + "grad_norm": 1.470895080979425, + "language_loss": 0.7210083, + "learning_rate": 5.170070992041826e-07, + "loss": 0.7977525, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10168457, + "step": 12855, + "time_per_iteration": 2.6422533988952637 + }, + { + "auxiliary_loss_clip": 0.0640472, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271958, + "balance_loss_mlp": 0.01256059, + "epoch": 0.7729445362994138, + "flos": 18922300606080.0, + "grad_norm": 1.643707808983738, + "language_loss": 0.68152243, + "learning_rate": 5.167458153638254e-07, + "loss": 0.75822645, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09619141, + "step": 12856, + "time_per_iteration": 2.581195592880249 + }, + { + "auxiliary_loss_clip": 0.06403085, + "auxiliary_loss_mlp": 0.01263682, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01254241, + "epoch": 0.7730046595520818, + "flos": 22206555313920.0, + "grad_norm": 2.739925215135401, + "language_loss": 0.7896111, + "learning_rate": 5.164845877686162e-07, + "loss": 0.86627877, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09442139, + "step": 12857, + "time_per_iteration": 2.536677360534668 + }, + { + "auxiliary_loss_clip": 0.06400988, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06271593, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7730647828047498, + "flos": 13557289783680.0, + "grad_norm": 1.6864648119346977, + "language_loss": 0.7856096, + "learning_rate": 5.162234164284591e-07, + "loss": 0.86228359, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09954834, + "step": 12858, + "time_per_iteration": 3.9322428703308105 + }, + { + "auxiliary_loss_clip": 0.06406689, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256392, + "epoch": 0.7731249060574177, + "flos": 21981654155520.0, + "grad_norm": 1.7779455572777159, + "language_loss": 0.77746201, + "learning_rate": 5.159623013532591e-07, + "loss": 0.8541925, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09967041, + "step": 12859, + "time_per_iteration": 2.513849973678589 + }, + { + "auxiliary_loss_clip": 0.06403208, + "auxiliary_loss_mlp": 0.01261712, + "balance_loss_clip": 0.06273893, + "balance_loss_mlp": 0.01253284, + "epoch": 0.7731850293100857, + "flos": 22608462222720.0, + "grad_norm": 1.6555727720253302, + "language_loss": 0.67912078, + "learning_rate": 5.157012425529186e-07, + "loss": 0.75576997, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08428955, + "step": 12860, + "time_per_iteration": 4.005707740783691 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01255449, + "epoch": 0.7732451525627536, + "flos": 14103274988160.0, + "grad_norm": 2.651215964660107, + "language_loss": 0.75251514, + "learning_rate": 5.154402400373343e-07, + "loss": 0.82924837, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10198975, + "step": 12861, + "time_per_iteration": 2.444032907485962 + }, + { + "auxiliary_loss_clip": 0.06406768, + "auxiliary_loss_mlp": 0.01262473, + "balance_loss_clip": 0.06270678, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7733052758154216, + "flos": 21476352908160.0, + "grad_norm": 3.091257297697316, + "language_loss": 0.75125277, + "learning_rate": 5.15179293816405e-07, + "loss": 0.82794511, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10137939, + "step": 12862, + "time_per_iteration": 2.5575408935546875 + }, + { + "auxiliary_loss_clip": 0.06400394, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06270863, + "balance_loss_mlp": 0.01255552, + "epoch": 0.7733653990680895, + "flos": 21400142019840.0, + "grad_norm": 1.5224536718195483, + "language_loss": 0.83015412, + "learning_rate": 5.149184039000256e-07, + "loss": 0.90680391, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09039307, + "step": 12863, + "time_per_iteration": 2.500004529953003 + }, + { + "auxiliary_loss_clip": 0.06403436, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.06272671, + "balance_loss_mlp": 0.01257172, + "epoch": 0.7734255223207576, + "flos": 17681849562240.0, + "grad_norm": 1.666044209334627, + "language_loss": 0.73906845, + "learning_rate": 5.146575702980898e-07, + "loss": 0.81577015, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09558105, + "step": 12864, + "time_per_iteration": 2.502202272415161 + }, + { + "auxiliary_loss_clip": 0.06405224, + "auxiliary_loss_mlp": 0.01262028, + "balance_loss_clip": 0.06273071, + "balance_loss_mlp": 0.01253117, + "epoch": 0.7734856455734255, + "flos": 25238264215680.0, + "grad_norm": 1.8553120895059094, + "language_loss": 0.82274187, + "learning_rate": 5.143967930204871e-07, + "loss": 0.89941442, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08911133, + "step": 12865, + "time_per_iteration": 2.5821845531463623 + }, + { + "auxiliary_loss_clip": 0.0640586, + "auxiliary_loss_mlp": 0.0126401, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01253627, + "epoch": 0.7735457688260935, + "flos": 23438579022720.0, + "grad_norm": 2.0985789262446763, + "language_loss": 0.71729589, + "learning_rate": 5.141360720771077e-07, + "loss": 0.79399455, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10375977, + "step": 12866, + "time_per_iteration": 3.9061973094940186 + }, + { + "auxiliary_loss_clip": 0.06406082, + "auxiliary_loss_mlp": 0.01266662, + "balance_loss_clip": 0.06272133, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7736058920787615, + "flos": 18734393825280.0, + "grad_norm": 2.2008061294183046, + "language_loss": 0.64883512, + "learning_rate": 5.138754074778371e-07, + "loss": 0.72556257, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.1036377, + "step": 12867, + "time_per_iteration": 2.438513994216919 + }, + { + "auxiliary_loss_clip": 0.06398055, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7736660153314294, + "flos": 22899931050240.0, + "grad_norm": 1.3982915625107966, + "language_loss": 0.71222079, + "learning_rate": 5.136147992325595e-07, + "loss": 0.7888447, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09442139, + "step": 12868, + "time_per_iteration": 2.521263599395752 + }, + { + "auxiliary_loss_clip": 0.06407171, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01253252, + "epoch": 0.7737261385840974, + "flos": 13804762417920.0, + "grad_norm": 1.9680842128147285, + "language_loss": 0.78157473, + "learning_rate": 5.133542473511578e-07, + "loss": 0.85827935, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10046387, + "step": 12869, + "time_per_iteration": 2.4751439094543457 + }, + { + "auxiliary_loss_clip": 0.06399751, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06270332, + "balance_loss_mlp": 0.0125536, + "epoch": 0.7737862618367654, + "flos": 28738279987200.0, + "grad_norm": 1.45372997777974, + "language_loss": 0.73862869, + "learning_rate": 5.130937518435124e-07, + "loss": 0.81527412, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09429932, + "step": 12870, + "time_per_iteration": 2.568042278289795 + }, + { + "auxiliary_loss_clip": 0.06404359, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.01257102, + "epoch": 0.7738463850894334, + "flos": 17024126538240.0, + "grad_norm": 1.914928650569768, + "language_loss": 0.75650132, + "learning_rate": 5.12833312719501e-07, + "loss": 0.83321428, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09851074, + "step": 12871, + "time_per_iteration": 2.4711315631866455 + }, + { + "auxiliary_loss_clip": 0.06402566, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06271693, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7739065083421013, + "flos": 20710246227840.0, + "grad_norm": 1.4478463877402143, + "language_loss": 0.69638461, + "learning_rate": 5.12572929988999e-07, + "loss": 0.77304864, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09417725, + "step": 12872, + "time_per_iteration": 2.520254135131836 + }, + { + "auxiliary_loss_clip": 0.06404334, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.0627078, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7739666315947693, + "flos": 20702322017280.0, + "grad_norm": 2.162643360462714, + "language_loss": 0.8514446, + "learning_rate": 5.123126036618804e-07, + "loss": 0.92813456, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10479736, + "step": 12873, + "time_per_iteration": 2.5746922492980957 + }, + { + "auxiliary_loss_clip": 0.06405018, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.0125612, + "epoch": 0.7740267548474372, + "flos": 29578501203840.0, + "grad_norm": 2.074777829849384, + "language_loss": 0.66097724, + "learning_rate": 5.120523337480174e-07, + "loss": 0.73768181, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09313965, + "step": 12874, + "time_per_iteration": 2.5801379680633545 + }, + { + "auxiliary_loss_clip": 0.06399316, + "auxiliary_loss_mlp": 0.01262488, + "balance_loss_clip": 0.06268813, + "balance_loss_mlp": 0.01253166, + "epoch": 0.7740868781001052, + "flos": 23665786168320.0, + "grad_norm": 1.7962266070608972, + "language_loss": 0.62437928, + "learning_rate": 5.117921202572785e-07, + "loss": 0.70099723, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09313965, + "step": 12875, + "time_per_iteration": 2.5030999183654785 + }, + { + "auxiliary_loss_clip": 0.06404817, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.0125264, + "epoch": 0.7741470013527731, + "flos": 24724200216960.0, + "grad_norm": 1.663352661776614, + "language_loss": 0.65509927, + "learning_rate": 5.115319631995318e-07, + "loss": 0.73177719, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10345459, + "step": 12876, + "time_per_iteration": 2.5258145332336426 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01266731, + "balance_loss_clip": 0.06269747, + "balance_loss_mlp": 0.01258005, + "epoch": 0.7742071246054412, + "flos": 21878092108800.0, + "grad_norm": 1.7333890551620577, + "language_loss": 0.71176594, + "learning_rate": 5.112718625846433e-07, + "loss": 0.78843695, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08734131, + "step": 12877, + "time_per_iteration": 2.4929704666137695 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264403, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254371, + "epoch": 0.7742672478581091, + "flos": 22680815823360.0, + "grad_norm": 1.9764136329910882, + "language_loss": 0.82948673, + "learning_rate": 5.110118184224736e-07, + "loss": 0.90620828, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1003418, + "step": 12878, + "time_per_iteration": 2.502988338470459 + }, + { + "auxiliary_loss_clip": 0.06402762, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125531, + "epoch": 0.7743273711107771, + "flos": 18846425134080.0, + "grad_norm": 1.6763538175981627, + "language_loss": 0.73367083, + "learning_rate": 5.10751830722885e-07, + "loss": 0.81035012, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09857178, + "step": 12879, + "time_per_iteration": 2.4705021381378174 + }, + { + "auxiliary_loss_clip": 0.06397247, + "auxiliary_loss_mlp": 0.01265601, + "balance_loss_clip": 0.06268625, + "balance_loss_mlp": 0.01256219, + "epoch": 0.7743874943634451, + "flos": 28736644832640.0, + "grad_norm": 1.5623883440546136, + "language_loss": 0.79838526, + "learning_rate": 5.104918994957364e-07, + "loss": 0.87501371, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09381104, + "step": 12880, + "time_per_iteration": 2.556452989578247 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06267609, + "balance_loss_mlp": 0.01255899, + "epoch": 0.774447617616113, + "flos": 21916344297600.0, + "grad_norm": 1.366667718096845, + "language_loss": 0.70864272, + "learning_rate": 5.102320247508847e-07, + "loss": 0.78529441, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10375977, + "step": 12881, + "time_per_iteration": 2.521993637084961 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01270141, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01258512, + "epoch": 0.774507740868781, + "flos": 19506789561600.0, + "grad_norm": 2.127818654803154, + "language_loss": 0.84771377, + "learning_rate": 5.099722064981832e-07, + "loss": 0.92450231, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11627197, + "step": 12882, + "time_per_iteration": 2.5355141162872314 + }, + { + "auxiliary_loss_clip": 0.06311849, + "auxiliary_loss_mlp": 0.01254336, + "balance_loss_clip": 0.06256157, + "balance_loss_mlp": 0.01253313, + "epoch": 0.774567864121449, + "flos": 59447240622720.0, + "grad_norm": 0.7584667410578986, + "language_loss": 0.60187125, + "learning_rate": 5.097124447474858e-07, + "loss": 0.67753309, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01023102, + "step": 12883, + "time_per_iteration": 3.124359607696533 + }, + { + "auxiliary_loss_clip": 0.06403667, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01255073, + "epoch": 0.774627987374117, + "flos": 13230461733120.0, + "grad_norm": 1.8439274810077488, + "language_loss": 0.72904599, + "learning_rate": 5.094527395086416e-07, + "loss": 0.80574125, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10778809, + "step": 12884, + "time_per_iteration": 2.4965550899505615 + }, + { + "auxiliary_loss_clip": 0.06399918, + "auxiliary_loss_mlp": 0.01266004, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01257301, + "epoch": 0.7746881106267849, + "flos": 21399848530560.0, + "grad_norm": 1.5524278185982343, + "language_loss": 0.81275487, + "learning_rate": 5.091930907914986e-07, + "loss": 0.88941407, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08703613, + "step": 12885, + "time_per_iteration": 2.557429075241089 + }, + { + "auxiliary_loss_clip": 0.06401367, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7747482338794529, + "flos": 25636355763840.0, + "grad_norm": 1.6694918727870636, + "language_loss": 0.63739854, + "learning_rate": 5.089334986059029e-07, + "loss": 0.71404386, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09088135, + "step": 12886, + "time_per_iteration": 2.5352628231048584 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01262726, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01254221, + "epoch": 0.7748083571321208, + "flos": 11551780235520.0, + "grad_norm": 2.0761314412195335, + "language_loss": 0.69713193, + "learning_rate": 5.086739629616987e-07, + "loss": 0.77381551, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.08508301, + "step": 12887, + "time_per_iteration": 3.896411657333374 + }, + { + "auxiliary_loss_clip": 0.06400104, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.0626978, + "balance_loss_mlp": 0.01256036, + "epoch": 0.7748684803847888, + "flos": 19068433326720.0, + "grad_norm": 1.724718840710913, + "language_loss": 0.70770532, + "learning_rate": 5.084144838687275e-07, + "loss": 0.78436053, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09381104, + "step": 12888, + "time_per_iteration": 2.5054144859313965 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01266857, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7749286036374567, + "flos": 22279705528320.0, + "grad_norm": 1.6247326651931444, + "language_loss": 0.8212378, + "learning_rate": 5.081550613368279e-07, + "loss": 0.89797544, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1048584, + "step": 12889, + "time_per_iteration": 2.503159999847412 + }, + { + "auxiliary_loss_clip": 0.0640256, + "auxiliary_loss_mlp": 0.01267254, + "balance_loss_clip": 0.0627083, + "balance_loss_mlp": 0.01258122, + "epoch": 0.7749887268901248, + "flos": 20198488216320.0, + "grad_norm": 1.8373652721061162, + "language_loss": 0.79928273, + "learning_rate": 5.07895695375838e-07, + "loss": 0.87598085, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09130859, + "step": 12890, + "time_per_iteration": 2.4615426063537598 + }, + { + "auxiliary_loss_clip": 0.06406836, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01260206, + "epoch": 0.7750488501427927, + "flos": 20343446979840.0, + "grad_norm": 1.6840660181274105, + "language_loss": 0.66623914, + "learning_rate": 5.076363859955932e-07, + "loss": 0.74301237, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1027832, + "step": 12891, + "time_per_iteration": 2.4890570640563965 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01255241, + "epoch": 0.7751089733954607, + "flos": 28371229176960.0, + "grad_norm": 1.3810973475198156, + "language_loss": 0.79341507, + "learning_rate": 5.073771332059257e-07, + "loss": 0.87011403, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09881592, + "step": 12892, + "time_per_iteration": 2.5426137447357178 + }, + { + "auxiliary_loss_clip": 0.06410879, + "auxiliary_loss_mlp": 0.01265811, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01255273, + "epoch": 0.7751690966481286, + "flos": 16949047680000.0, + "grad_norm": 1.9398212373821864, + "language_loss": 0.67894936, + "learning_rate": 5.071179370166669e-07, + "loss": 0.75571626, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10522461, + "step": 12893, + "time_per_iteration": 2.469115734100342 + }, + { + "auxiliary_loss_clip": 0.06313038, + "auxiliary_loss_mlp": 0.0125019, + "balance_loss_clip": 0.06257471, + "balance_loss_mlp": 0.01248948, + "epoch": 0.7752292199007966, + "flos": 65690179799040.0, + "grad_norm": 0.7899277487406899, + "language_loss": 0.58551872, + "learning_rate": 5.068587974376468e-07, + "loss": 0.66115099, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01241302, + "step": 12894, + "time_per_iteration": 3.1802139282226562 + }, + { + "auxiliary_loss_clip": 0.06405281, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7752893431534646, + "flos": 20600898249600.0, + "grad_norm": 2.1408661734068697, + "language_loss": 0.78008652, + "learning_rate": 5.065997144786895e-07, + "loss": 0.85679233, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10662842, + "step": 12895, + "time_per_iteration": 2.517387866973877 + }, + { + "auxiliary_loss_clip": 0.06404513, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06271935, + "balance_loss_mlp": 0.01255124, + "epoch": 0.7753494664061326, + "flos": 20491592198400.0, + "grad_norm": 1.7101210231802921, + "language_loss": 0.67742205, + "learning_rate": 5.063406881496209e-07, + "loss": 0.75411844, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10003662, + "step": 12896, + "time_per_iteration": 2.508040428161621 + }, + { + "auxiliary_loss_clip": 0.06401385, + "auxiliary_loss_mlp": 0.01264283, + "balance_loss_clip": 0.06268774, + "balance_loss_mlp": 0.01254717, + "epoch": 0.7754095896588006, + "flos": 20272015774080.0, + "grad_norm": 1.718290101877412, + "language_loss": 0.68828535, + "learning_rate": 5.060817184602629e-07, + "loss": 0.76494199, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09570312, + "step": 12897, + "time_per_iteration": 3.958052158355713 + }, + { + "auxiliary_loss_clip": 0.06406542, + "auxiliary_loss_mlp": 0.01265206, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01255074, + "epoch": 0.7754697129114685, + "flos": 23337784160640.0, + "grad_norm": 1.8777545444749013, + "language_loss": 0.75346845, + "learning_rate": 5.058228054204364e-07, + "loss": 0.83018595, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10131836, + "step": 12898, + "time_per_iteration": 2.548725128173828 + }, + { + "auxiliary_loss_clip": 0.06405295, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253231, + "epoch": 0.7755298361641365, + "flos": 17353344430080.0, + "grad_norm": 2.11113178190308, + "language_loss": 0.70727742, + "learning_rate": 5.055639490399588e-07, + "loss": 0.78396714, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 12899, + "time_per_iteration": 2.4659245014190674 + }, + { + "auxiliary_loss_clip": 0.06405385, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7755899594168044, + "flos": 19651916033280.0, + "grad_norm": 2.07260093915493, + "language_loss": 0.74897844, + "learning_rate": 5.053051493286453e-07, + "loss": 0.82567799, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10302734, + "step": 12900, + "time_per_iteration": 4.011428117752075 + }, + { + "auxiliary_loss_clip": 0.06400472, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.01255525, + "epoch": 0.7756500826694724, + "flos": 27421324565760.0, + "grad_norm": 1.5623703239819655, + "language_loss": 0.77776372, + "learning_rate": 5.050464062963113e-07, + "loss": 0.85441595, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09234619, + "step": 12901, + "time_per_iteration": 2.551858425140381 + }, + { + "auxiliary_loss_clip": 0.0639973, + "auxiliary_loss_mlp": 0.0126504, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01255289, + "epoch": 0.7757102059221404, + "flos": 28738028424960.0, + "grad_norm": 1.3485417524175327, + "language_loss": 0.77421844, + "learning_rate": 5.047877199527666e-07, + "loss": 0.8508662, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09747314, + "step": 12902, + "time_per_iteration": 2.5616962909698486 + }, + { + "auxiliary_loss_clip": 0.06401799, + "auxiliary_loss_mlp": 0.01266411, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01256898, + "epoch": 0.7757703291748084, + "flos": 22492489772160.0, + "grad_norm": 1.8023361426905782, + "language_loss": 0.73515046, + "learning_rate": 5.045290903078215e-07, + "loss": 0.81183261, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09509277, + "step": 12903, + "time_per_iteration": 2.5368919372558594 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01263703, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01253851, + "epoch": 0.7758304524274763, + "flos": 21435920513280.0, + "grad_norm": 2.3012880989025946, + "language_loss": 0.75830078, + "learning_rate": 5.042705173712835e-07, + "loss": 0.83494151, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09863281, + "step": 12904, + "time_per_iteration": 2.476417064666748 + }, + { + "auxiliary_loss_clip": 0.06397906, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06269727, + "balance_loss_mlp": 0.01256093, + "epoch": 0.7758905756801443, + "flos": 23665953876480.0, + "grad_norm": 1.8947972098454593, + "language_loss": 0.68449861, + "learning_rate": 5.040120011529576e-07, + "loss": 0.76112515, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08654785, + "step": 12905, + "time_per_iteration": 3.922461748123169 + }, + { + "auxiliary_loss_clip": 0.06398395, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06270361, + "balance_loss_mlp": 0.0125736, + "epoch": 0.7759506989328122, + "flos": 28372906258560.0, + "grad_norm": 1.53682543204514, + "language_loss": 0.67685688, + "learning_rate": 5.037535416626459e-07, + "loss": 0.75350916, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0947876, + "step": 12906, + "time_per_iteration": 2.5313022136688232 + }, + { + "auxiliary_loss_clip": 0.06400718, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06268603, + "balance_loss_mlp": 0.01257124, + "epoch": 0.7760108221854802, + "flos": 14908053127680.0, + "grad_norm": 2.1235046530395167, + "language_loss": 0.81742978, + "learning_rate": 5.034951389101498e-07, + "loss": 0.8941071, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09887695, + "step": 12907, + "time_per_iteration": 2.4844870567321777 + }, + { + "auxiliary_loss_clip": 0.06399026, + "auxiliary_loss_mlp": 0.01267683, + "balance_loss_clip": 0.06271745, + "balance_loss_mlp": 0.01258584, + "epoch": 0.7760709454381483, + "flos": 14797615046400.0, + "grad_norm": 2.0283728968783006, + "language_loss": 0.67200708, + "learning_rate": 5.032367929052685e-07, + "loss": 0.74867415, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09103394, + "step": 12908, + "time_per_iteration": 2.489652633666992 + }, + { + "auxiliary_loss_clip": 0.06403653, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01258017, + "epoch": 0.7761310686908162, + "flos": 17384846365440.0, + "grad_norm": 1.5208070969667713, + "language_loss": 0.70563579, + "learning_rate": 5.029785036577976e-07, + "loss": 0.78235209, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09954834, + "step": 12909, + "time_per_iteration": 2.484180450439453 + }, + { + "auxiliary_loss_clip": 0.06401674, + "auxiliary_loss_mlp": 0.01271334, + "balance_loss_clip": 0.06272651, + "balance_loss_mlp": 0.01262208, + "epoch": 0.7761911919434842, + "flos": 25563582892800.0, + "grad_norm": 1.6528787080895593, + "language_loss": 0.68030262, + "learning_rate": 5.027202711775324e-07, + "loss": 0.75703275, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09130859, + "step": 12910, + "time_per_iteration": 2.5219783782958984 + }, + { + "auxiliary_loss_clip": 0.06401049, + "auxiliary_loss_mlp": 0.01265939, + "balance_loss_clip": 0.06268351, + "balance_loss_mlp": 0.01256193, + "epoch": 0.7762513151961521, + "flos": 23185530092160.0, + "grad_norm": 1.572866205055694, + "language_loss": 0.7175374, + "learning_rate": 5.024620954742646e-07, + "loss": 0.79420727, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12911, + "time_per_iteration": 2.533684730529785 + }, + { + "auxiliary_loss_clip": 0.06403443, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06270085, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7763114384488201, + "flos": 21696097040640.0, + "grad_norm": 3.1287600736894867, + "language_loss": 0.63521278, + "learning_rate": 5.022039765577836e-07, + "loss": 0.71189916, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10546875, + "step": 12912, + "time_per_iteration": 2.4713103771209717 + }, + { + "auxiliary_loss_clip": 0.06310222, + "auxiliary_loss_mlp": 0.01256155, + "balance_loss_clip": 0.06254428, + "balance_loss_mlp": 0.012551, + "epoch": 0.776371561701488, + "flos": 69048381335040.0, + "grad_norm": 0.7692138307274686, + "language_loss": 0.53290647, + "learning_rate": 5.019459144378779e-07, + "loss": 0.60857022, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01056671, + "step": 12913, + "time_per_iteration": 3.1764438152313232 + }, + { + "auxiliary_loss_clip": 0.06402822, + "auxiliary_loss_mlp": 0.01263376, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01254495, + "epoch": 0.776431684954156, + "flos": 22900643809920.0, + "grad_norm": 1.5625942669092794, + "language_loss": 0.6230467, + "learning_rate": 5.016879091243338e-07, + "loss": 0.6997087, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.08880615, + "step": 12914, + "time_per_iteration": 2.534447193145752 + }, + { + "auxiliary_loss_clip": 0.06399079, + "auxiliary_loss_mlp": 0.012627, + "balance_loss_clip": 0.06268825, + "balance_loss_mlp": 0.01253259, + "epoch": 0.776491808206824, + "flos": 20266942602240.0, + "grad_norm": 1.633160981645456, + "language_loss": 0.82489586, + "learning_rate": 5.014299606269339e-07, + "loss": 0.9015137, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09436035, + "step": 12915, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01265286, + "balance_loss_clip": 0.06268285, + "balance_loss_mlp": 0.01255266, + "epoch": 0.776551931459492, + "flos": 26766033310080.0, + "grad_norm": 1.7528109604711235, + "language_loss": 0.74837983, + "learning_rate": 5.011720689554603e-07, + "loss": 0.82507014, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10021973, + "step": 12916, + "time_per_iteration": 2.5818369388580322 + }, + { + "auxiliary_loss_clip": 0.06402493, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06269188, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7766120547121599, + "flos": 52676583960960.0, + "grad_norm": 1.4770261011777261, + "language_loss": 0.65460002, + "learning_rate": 5.009142341196919e-07, + "loss": 0.73126698, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09924316, + "step": 12917, + "time_per_iteration": 2.776418924331665 + }, + { + "auxiliary_loss_clip": 0.06402885, + "auxiliary_loss_mlp": 0.01264757, + "balance_loss_clip": 0.06269239, + "balance_loss_mlp": 0.0125522, + "epoch": 0.7766721779648279, + "flos": 25163353065600.0, + "grad_norm": 1.489121757644636, + "language_loss": 0.6467213, + "learning_rate": 5.006564561294065e-07, + "loss": 0.72339773, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09533691, + "step": 12918, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06400011, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01256792, + "epoch": 0.7767323012174958, + "flos": 23766161760000.0, + "grad_norm": 2.1752593632817425, + "language_loss": 0.73467445, + "learning_rate": 5.003987349943777e-07, + "loss": 0.81133133, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08886719, + "step": 12919, + "time_per_iteration": 2.498762369155884 + }, + { + "auxiliary_loss_clip": 0.06403969, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06270312, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7767924244701638, + "flos": 22092469580160.0, + "grad_norm": 1.6453382869225388, + "language_loss": 0.79804212, + "learning_rate": 5.001410707243792e-07, + "loss": 0.87474561, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10113525, + "step": 12920, + "time_per_iteration": 2.5327045917510986 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06271371, + "balance_loss_mlp": 0.012561, + "epoch": 0.7768525477228319, + "flos": 21988194773760.0, + "grad_norm": 1.540123297700945, + "language_loss": 0.71420145, + "learning_rate": 4.998834633291829e-07, + "loss": 0.79092473, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09490967, + "step": 12921, + "time_per_iteration": 2.493539333343506 + }, + { + "auxiliary_loss_clip": 0.06407829, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06272625, + "balance_loss_mlp": 0.01258643, + "epoch": 0.7769126709754998, + "flos": 21800329920000.0, + "grad_norm": 1.5870112514861305, + "language_loss": 0.764503, + "learning_rate": 4.996259128185547e-07, + "loss": 0.8412689, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10113525, + "step": 12922, + "time_per_iteration": 2.664897918701172 + }, + { + "auxiliary_loss_clip": 0.06402089, + "auxiliary_loss_mlp": 0.01264843, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7769727942281678, + "flos": 20054242212480.0, + "grad_norm": 2.0384511748654286, + "language_loss": 0.80950773, + "learning_rate": 4.993684192022625e-07, + "loss": 0.88617706, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09991455, + "step": 12923, + "time_per_iteration": 2.4884073734283447 + }, + { + "auxiliary_loss_clip": 0.06402602, + "auxiliary_loss_mlp": 0.01263266, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7770329174808357, + "flos": 21692784804480.0, + "grad_norm": 1.8529148039982746, + "language_loss": 0.92405283, + "learning_rate": 4.991109824900699e-07, + "loss": 1.00071156, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09356689, + "step": 12924, + "time_per_iteration": 2.52184796333313 + }, + { + "auxiliary_loss_clip": 0.06402275, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01254804, + "epoch": 0.7770930407335037, + "flos": 25856477239680.0, + "grad_norm": 1.997586908265186, + "language_loss": 0.66484189, + "learning_rate": 4.988536026917401e-07, + "loss": 0.74150878, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09606934, + "step": 12925, + "time_per_iteration": 2.528657913208008 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06273882, + "balance_loss_mlp": 0.01261019, + "epoch": 0.7771531639861716, + "flos": 24353921024640.0, + "grad_norm": 1.7055491864849242, + "language_loss": 0.72285664, + "learning_rate": 4.985962798170314e-07, + "loss": 0.7996558, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 12926, + "time_per_iteration": 2.529508352279663 + }, + { + "auxiliary_loss_clip": 0.06404512, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06270072, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7772132872388396, + "flos": 25637068523520.0, + "grad_norm": 1.8006607912850339, + "language_loss": 0.65851128, + "learning_rate": 4.983390138757027e-07, + "loss": 0.73519599, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10046387, + "step": 12927, + "time_per_iteration": 3.9577128887176514 + }, + { + "auxiliary_loss_clip": 0.06403954, + "auxiliary_loss_mlp": 0.01268877, + "balance_loss_clip": 0.06270983, + "balance_loss_mlp": 0.01258607, + "epoch": 0.7772734104915076, + "flos": 26074544290560.0, + "grad_norm": 2.5615945281545147, + "language_loss": 0.72538382, + "learning_rate": 4.980818048775093e-07, + "loss": 0.8021121, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1026001, + "step": 12928, + "time_per_iteration": 2.524092197418213 + }, + { + "auxiliary_loss_clip": 0.06398363, + "auxiliary_loss_mlp": 0.0126847, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.0125935, + "epoch": 0.7773335337441756, + "flos": 22930887934080.0, + "grad_norm": 1.7899805445519197, + "language_loss": 0.74762726, + "learning_rate": 4.978246528322036e-07, + "loss": 0.82429558, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09118652, + "step": 12929, + "time_per_iteration": 2.50419282913208 + }, + { + "auxiliary_loss_clip": 0.06401908, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06269601, + "balance_loss_mlp": 0.01258871, + "epoch": 0.7773936569968435, + "flos": 20782977171840.0, + "grad_norm": 1.7754986557966836, + "language_loss": 0.77492833, + "learning_rate": 4.975675577495377e-07, + "loss": 0.85163409, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09796143, + "step": 12930, + "time_per_iteration": 2.5014841556549072 + }, + { + "auxiliary_loss_clip": 0.06403639, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.06271214, + "balance_loss_mlp": 0.01255291, + "epoch": 0.7774537802495115, + "flos": 20377883808000.0, + "grad_norm": 1.923217497642762, + "language_loss": 0.80022055, + "learning_rate": 4.973105196392613e-07, + "loss": 0.87690878, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09893799, + "step": 12931, + "time_per_iteration": 2.479499340057373 + }, + { + "auxiliary_loss_clip": 0.06306946, + "auxiliary_loss_mlp": 0.0125312, + "balance_loss_clip": 0.06251584, + "balance_loss_mlp": 0.01252035, + "epoch": 0.7775139035021794, + "flos": 53930981980800.0, + "grad_norm": 0.7888811218125162, + "language_loss": 0.59670961, + "learning_rate": 4.970535385111199e-07, + "loss": 0.67231035, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01087189, + "step": 12932, + "time_per_iteration": 3.131812810897827 + }, + { + "auxiliary_loss_clip": 0.06405772, + "auxiliary_loss_mlp": 0.01263803, + "balance_loss_clip": 0.06271382, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7775740267548474, + "flos": 28850437077120.0, + "grad_norm": 1.493641616196245, + "language_loss": 0.76082242, + "learning_rate": 4.967966143748595e-07, + "loss": 0.83751822, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09436035, + "step": 12933, + "time_per_iteration": 2.657081127166748 + }, + { + "auxiliary_loss_clip": 0.06403433, + "auxiliary_loss_mlp": 0.01262442, + "balance_loss_clip": 0.06271302, + "balance_loss_mlp": 0.01252077, + "epoch": 0.7776341500075155, + "flos": 21879056430720.0, + "grad_norm": 1.8678224067901799, + "language_loss": 0.73828089, + "learning_rate": 4.965397472402215e-07, + "loss": 0.81493968, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10369873, + "step": 12934, + "time_per_iteration": 2.514028549194336 + }, + { + "auxiliary_loss_clip": 0.06404053, + "auxiliary_loss_mlp": 0.01265488, + "balance_loss_clip": 0.06270254, + "balance_loss_mlp": 0.01255468, + "epoch": 0.7776942732601834, + "flos": 20236027645440.0, + "grad_norm": 1.899249869710296, + "language_loss": 0.70498896, + "learning_rate": 4.962829371169475e-07, + "loss": 0.78168434, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12935, + "time_per_iteration": 2.5094125270843506 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01256333, + "epoch": 0.7777543965128514, + "flos": 22237554124800.0, + "grad_norm": 1.4942918595564652, + "language_loss": 0.83564198, + "learning_rate": 4.960261840147746e-07, + "loss": 0.91237354, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09661865, + "step": 12936, + "time_per_iteration": 2.4796142578125 + }, + { + "auxiliary_loss_clip": 0.0640949, + "auxiliary_loss_mlp": 0.0126322, + "balance_loss_clip": 0.06271779, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7778145197655193, + "flos": 14507236321920.0, + "grad_norm": 1.7034390365737724, + "language_loss": 0.67389679, + "learning_rate": 4.957694879434397e-07, + "loss": 0.75062388, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09020996, + "step": 12937, + "time_per_iteration": 3.914120674133301 + }, + { + "auxiliary_loss_clip": 0.06402509, + "auxiliary_loss_mlp": 0.01264387, + "balance_loss_clip": 0.06269647, + "balance_loss_mlp": 0.01254928, + "epoch": 0.7778746430181873, + "flos": 21146338402560.0, + "grad_norm": 1.4641946456132704, + "language_loss": 0.87061489, + "learning_rate": 4.955128489126777e-07, + "loss": 0.94728386, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09460449, + "step": 12938, + "time_per_iteration": 2.494309663772583 + }, + { + "auxiliary_loss_clip": 0.06401877, + "auxiliary_loss_mlp": 0.01265878, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01255972, + "epoch": 0.7779347662708552, + "flos": 20272560825600.0, + "grad_norm": 1.9237142576123536, + "language_loss": 0.8554709, + "learning_rate": 4.95256266932218e-07, + "loss": 0.93214846, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09906006, + "step": 12939, + "time_per_iteration": 2.4730064868927 + }, + { + "auxiliary_loss_clip": 0.06398107, + "auxiliary_loss_mlp": 0.01265311, + "balance_loss_clip": 0.0626917, + "balance_loss_mlp": 0.01256084, + "epoch": 0.7779948895235232, + "flos": 19215153025920.0, + "grad_norm": 1.7540702962563577, + "language_loss": 0.69412231, + "learning_rate": 4.949997420117915e-07, + "loss": 0.77075648, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09228516, + "step": 12940, + "time_per_iteration": 3.918668270111084 + }, + { + "auxiliary_loss_clip": 0.064026, + "auxiliary_loss_mlp": 0.01265044, + "balance_loss_clip": 0.06269296, + "balance_loss_mlp": 0.01255627, + "epoch": 0.7780550127761912, + "flos": 23921476502400.0, + "grad_norm": 4.631352047296881, + "language_loss": 0.77788246, + "learning_rate": 4.947432741611255e-07, + "loss": 0.85455894, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09423828, + "step": 12941, + "time_per_iteration": 2.5110888481140137 + }, + { + "auxiliary_loss_clip": 0.06410088, + "auxiliary_loss_mlp": 0.01268786, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01257813, + "epoch": 0.7781151360288592, + "flos": 32424148114560.0, + "grad_norm": 2.2460397891674697, + "language_loss": 0.73285127, + "learning_rate": 4.944868633899462e-07, + "loss": 0.80964005, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10974121, + "step": 12942, + "time_per_iteration": 2.5817012786865234 + }, + { + "auxiliary_loss_clip": 0.06399062, + "auxiliary_loss_mlp": 0.01266209, + "balance_loss_clip": 0.06270151, + "balance_loss_mlp": 0.01257239, + "epoch": 0.7781752592815271, + "flos": 22352981523840.0, + "grad_norm": 1.9559350984473978, + "language_loss": 0.68287194, + "learning_rate": 4.942305097079751e-07, + "loss": 0.75952458, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08972168, + "step": 12943, + "time_per_iteration": 2.4933464527130127 + }, + { + "auxiliary_loss_clip": 0.06304064, + "auxiliary_loss_mlp": 0.01250725, + "balance_loss_clip": 0.06248597, + "balance_loss_mlp": 0.01249737, + "epoch": 0.7782353825341951, + "flos": 70479101802240.0, + "grad_norm": 0.7622073777913676, + "language_loss": 0.58524758, + "learning_rate": 4.939742131249347e-07, + "loss": 0.66079545, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00987244, + "step": 12944, + "time_per_iteration": 3.2943570613861084 + }, + { + "auxiliary_loss_clip": 0.0640593, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01255495, + "epoch": 0.778295505786863, + "flos": 19068601034880.0, + "grad_norm": 1.9954002249316443, + "language_loss": 0.68333346, + "learning_rate": 4.937179736505428e-07, + "loss": 0.76005256, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10491943, + "step": 12945, + "time_per_iteration": 3.963608741760254 + }, + { + "auxiliary_loss_clip": 0.06401619, + "auxiliary_loss_mlp": 0.01263231, + "balance_loss_clip": 0.06268932, + "balance_loss_mlp": 0.01253837, + "epoch": 0.778355629039531, + "flos": 21006662446080.0, + "grad_norm": 2.4482608319638404, + "language_loss": 0.69179362, + "learning_rate": 4.93461791294516e-07, + "loss": 0.76844209, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09387207, + "step": 12946, + "time_per_iteration": 2.528555393218994 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06268816, + "balance_loss_mlp": 0.01254328, + "epoch": 0.7784157522921991, + "flos": 21404586286080.0, + "grad_norm": 1.63285369155658, + "language_loss": 0.65319461, + "learning_rate": 4.932056660665689e-07, + "loss": 0.72985911, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09783936, + "step": 12947, + "time_per_iteration": 2.533308744430542 + }, + { + "auxiliary_loss_clip": 0.06402348, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.06270808, + "balance_loss_mlp": 0.01253499, + "epoch": 0.778475875544867, + "flos": 20820181184640.0, + "grad_norm": 1.87438794738079, + "language_loss": 0.65581381, + "learning_rate": 4.929495979764147e-07, + "loss": 0.73246646, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 12948, + "time_per_iteration": 2.5082039833068848 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01261972, + "balance_loss_clip": 0.06271663, + "balance_loss_mlp": 0.01252078, + "epoch": 0.778535998797535, + "flos": 14360516622720.0, + "grad_norm": 1.7911059027184133, + "language_loss": 0.75669527, + "learning_rate": 4.926935870337625e-07, + "loss": 0.83333564, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09893799, + "step": 12949, + "time_per_iteration": 2.499680519104004 + }, + { + "auxiliary_loss_clip": 0.06407519, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271057, + "balance_loss_mlp": 0.01255871, + "epoch": 0.7785961220502029, + "flos": 19215781931520.0, + "grad_norm": 1.2917746110021882, + "language_loss": 0.69081604, + "learning_rate": 4.924376332483202e-07, + "loss": 0.7675575, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 12950, + "time_per_iteration": 2.4793641567230225 + }, + { + "auxiliary_loss_clip": 0.06404532, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06268837, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7786562453028709, + "flos": 25745787596160.0, + "grad_norm": 1.5705407772733666, + "language_loss": 0.72314119, + "learning_rate": 4.921817366297938e-07, + "loss": 0.79984468, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09509277, + "step": 12951, + "time_per_iteration": 2.533123731613159 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01262496, + "balance_loss_clip": 0.06272363, + "balance_loss_mlp": 0.01252238, + "epoch": 0.7787163685555388, + "flos": 25746584209920.0, + "grad_norm": 1.6880059510178558, + "language_loss": 0.65866429, + "learning_rate": 4.919258971878877e-07, + "loss": 0.73532021, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1026001, + "step": 12952, + "time_per_iteration": 2.5218706130981445 + }, + { + "auxiliary_loss_clip": 0.06394114, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06268984, + "balance_loss_mlp": 0.01256032, + "epoch": 0.7787764918082068, + "flos": 22754385308160.0, + "grad_norm": 2.055033459437186, + "language_loss": 0.81612301, + "learning_rate": 4.916701149323022e-07, + "loss": 0.89271152, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08709717, + "step": 12953, + "time_per_iteration": 2.5306200981140137 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7788366150608748, + "flos": 15195538886400.0, + "grad_norm": 1.8925370756412514, + "language_loss": 0.76971662, + "learning_rate": 4.91414389872737e-07, + "loss": 0.8464663, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09960938, + "step": 12954, + "time_per_iteration": 2.4636683464050293 + }, + { + "auxiliary_loss_clip": 0.0640775, + "auxiliary_loss_mlp": 0.01263138, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01253369, + "epoch": 0.7788967383135428, + "flos": 21215799037440.0, + "grad_norm": 1.4850490788267763, + "language_loss": 0.7292642, + "learning_rate": 4.911587220188905e-07, + "loss": 0.80597305, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09765625, + "step": 12955, + "time_per_iteration": 2.4956090450286865 + }, + { + "auxiliary_loss_clip": 0.06403288, + "auxiliary_loss_mlp": 0.01263998, + "balance_loss_clip": 0.06270338, + "balance_loss_mlp": 0.01253973, + "epoch": 0.7789568615662107, + "flos": 21688340538240.0, + "grad_norm": 1.3614080537003919, + "language_loss": 0.68852103, + "learning_rate": 4.909031113804551e-07, + "loss": 0.76519388, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10021973, + "step": 12956, + "time_per_iteration": 2.5246806144714355 + }, + { + "auxiliary_loss_clip": 0.06403255, + "auxiliary_loss_mlp": 0.01262407, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01252864, + "epoch": 0.7790169848188787, + "flos": 26367732126720.0, + "grad_norm": 1.5408189512052117, + "language_loss": 0.7640478, + "learning_rate": 4.906475579671252e-07, + "loss": 0.84070438, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09539795, + "step": 12957, + "time_per_iteration": 2.560433864593506 + }, + { + "auxiliary_loss_clip": 0.06402086, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06269553, + "balance_loss_mlp": 0.01255407, + "epoch": 0.7790771080715466, + "flos": 25522563519360.0, + "grad_norm": 1.6277364892308188, + "language_loss": 0.77872479, + "learning_rate": 4.903920617885917e-07, + "loss": 0.85539794, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0982666, + "step": 12958, + "time_per_iteration": 2.5132603645324707 + }, + { + "auxiliary_loss_clip": 0.06403212, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01257995, + "epoch": 0.7791372313242146, + "flos": 16039701244800.0, + "grad_norm": 2.1750549436439295, + "language_loss": 0.71726602, + "learning_rate": 4.901366228545418e-07, + "loss": 0.79397893, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10076904, + "step": 12959, + "time_per_iteration": 2.4766464233398438 + }, + { + "auxiliary_loss_clip": 0.06403412, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06269655, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7791973545768827, + "flos": 23849039047680.0, + "grad_norm": 1.6457903967738072, + "language_loss": 0.77779013, + "learning_rate": 4.898812411746632e-07, + "loss": 0.8544842, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09783936, + "step": 12960, + "time_per_iteration": 2.5057005882263184 + }, + { + "auxiliary_loss_clip": 0.06403294, + "auxiliary_loss_mlp": 0.01269347, + "balance_loss_clip": 0.06269927, + "balance_loss_mlp": 0.0125934, + "epoch": 0.7792574778295506, + "flos": 24174902776320.0, + "grad_norm": 1.862849792327091, + "language_loss": 0.75439703, + "learning_rate": 4.896259167586385e-07, + "loss": 0.83112347, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10003662, + "step": 12961, + "time_per_iteration": 2.523517608642578 + }, + { + "auxiliary_loss_clip": 0.06400951, + "auxiliary_loss_mlp": 0.01266276, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7793176010822186, + "flos": 21470399268480.0, + "grad_norm": 1.5483353660342332, + "language_loss": 0.73957908, + "learning_rate": 4.893706496161511e-07, + "loss": 0.81625128, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08679199, + "step": 12962, + "time_per_iteration": 2.498566150665283 + }, + { + "auxiliary_loss_clip": 0.06398464, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06269018, + "balance_loss_mlp": 0.01255012, + "epoch": 0.7793777243348865, + "flos": 20672790652800.0, + "grad_norm": 1.8192572691514057, + "language_loss": 0.70224059, + "learning_rate": 4.891154397568795e-07, + "loss": 0.77886856, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09326172, + "step": 12963, + "time_per_iteration": 2.507917881011963 + }, + { + "auxiliary_loss_clip": 0.06401575, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01254805, + "epoch": 0.7794378475875545, + "flos": 27133126047360.0, + "grad_norm": 1.5815995663676223, + "language_loss": 0.63879544, + "learning_rate": 4.888602871905019e-07, + "loss": 0.71545374, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09460449, + "step": 12964, + "time_per_iteration": 2.52024245262146 + }, + { + "auxiliary_loss_clip": 0.06404367, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254622, + "epoch": 0.7794979708402224, + "flos": 28081605139200.0, + "grad_norm": 1.6072168370659738, + "language_loss": 0.76559496, + "learning_rate": 4.88605191926694e-07, + "loss": 0.84228694, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12965, + "time_per_iteration": 2.5686237812042236 + }, + { + "auxiliary_loss_clip": 0.06394182, + "auxiliary_loss_mlp": 0.01263131, + "balance_loss_clip": 0.06269042, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7795580940928905, + "flos": 26876722953600.0, + "grad_norm": 1.5862680415926609, + "language_loss": 0.72998363, + "learning_rate": 4.883501539751289e-07, + "loss": 0.80655676, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08703613, + "step": 12966, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.06398065, + "auxiliary_loss_mlp": 0.01262043, + "balance_loss_clip": 0.06270934, + "balance_loss_mlp": 0.01253323, + "epoch": 0.7796182173455584, + "flos": 23841072910080.0, + "grad_norm": 1.47410798363511, + "language_loss": 0.74184883, + "learning_rate": 4.880951733454768e-07, + "loss": 0.81844991, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08721924, + "step": 12967, + "time_per_iteration": 3.9195239543914795 + }, + { + "auxiliary_loss_clip": 0.06406528, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253462, + "epoch": 0.7796783405982264, + "flos": 19798384170240.0, + "grad_norm": 2.482748311118984, + "language_loss": 0.72366989, + "learning_rate": 4.878402500474073e-07, + "loss": 0.80036128, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09155273, + "step": 12968, + "time_per_iteration": 2.5332348346710205 + }, + { + "auxiliary_loss_clip": 0.06398027, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.0626802, + "balance_loss_mlp": 0.01259249, + "epoch": 0.7797384638508943, + "flos": 15455589632640.0, + "grad_norm": 1.8161833543427846, + "language_loss": 0.61633801, + "learning_rate": 4.875853840905874e-07, + "loss": 0.69300812, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09735107, + "step": 12969, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.06398109, + "auxiliary_loss_mlp": 0.0126421, + "balance_loss_clip": 0.06271819, + "balance_loss_mlp": 0.01255651, + "epoch": 0.7797985871035623, + "flos": 20928984111360.0, + "grad_norm": 1.617507688823146, + "language_loss": 0.70254469, + "learning_rate": 4.873305754846811e-07, + "loss": 0.77916789, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08563232, + "step": 12970, + "time_per_iteration": 2.510071039199829 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01266712, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01256901, + "epoch": 0.7798587103562302, + "flos": 36945667411200.0, + "grad_norm": 1.5338115729729769, + "language_loss": 0.72291183, + "learning_rate": 4.870758242393507e-07, + "loss": 0.79961598, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09814453, + "step": 12971, + "time_per_iteration": 2.654513359069824 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01266468, + "balance_loss_clip": 0.06272395, + "balance_loss_mlp": 0.01256174, + "epoch": 0.7799188336088982, + "flos": 22425880176000.0, + "grad_norm": 1.7218916493252936, + "language_loss": 0.74606651, + "learning_rate": 4.868211303642578e-07, + "loss": 0.82283497, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10290527, + "step": 12972, + "time_per_iteration": 2.517273187637329 + }, + { + "auxiliary_loss_clip": 0.06402341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01254146, + "epoch": 0.7799789568615663, + "flos": 18886522112640.0, + "grad_norm": 2.215385328919691, + "language_loss": 0.71494085, + "learning_rate": 4.865664938690584e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12973, + "time_per_iteration": 2.472104549407959 + }, + { + "auxiliary_loss_clip": 0.06400935, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.0627044, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7800390801142342, + "flos": 20267781143040.0, + "grad_norm": 1.7807969698368138, + "language_loss": 0.78121793, + "learning_rate": 4.863119147634089e-07, + "loss": 0.85788202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09039307, + "step": 12974, + "time_per_iteration": 2.4978132247924805 + }, + { + "auxiliary_loss_clip": 0.06402993, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01255313, + "epoch": 0.7800992033669022, + "flos": 16695831041280.0, + "grad_norm": 1.52512308426482, + "language_loss": 0.6983875, + "learning_rate": 4.86057393056964e-07, + "loss": 0.77506667, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09619141, + "step": 12975, + "time_per_iteration": 2.4792943000793457 + }, + { + "auxiliary_loss_clip": 0.06404307, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273738, + "balance_loss_mlp": 0.01256703, + "epoch": 0.7801593266195701, + "flos": 18590650945920.0, + "grad_norm": 2.5885152450409654, + "language_loss": 0.82135439, + "learning_rate": 4.858029287593739e-07, + "loss": 0.89805579, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 12976, + "time_per_iteration": 3.9093782901763916 + }, + { + "auxiliary_loss_clip": 0.06403226, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06269425, + "balance_loss_mlp": 0.01256299, + "epoch": 0.7802194498722381, + "flos": 25492193614080.0, + "grad_norm": 1.298093609119966, + "language_loss": 0.66121942, + "learning_rate": 4.85548521880289e-07, + "loss": 0.73791331, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09857178, + "step": 12977, + "time_per_iteration": 2.5382373332977295 + }, + { + "auxiliary_loss_clip": 0.06398032, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06268156, + "balance_loss_mlp": 0.01256293, + "epoch": 0.780279573124906, + "flos": 31184451757440.0, + "grad_norm": 1.3843135589513191, + "language_loss": 0.74921417, + "learning_rate": 4.852941724293554e-07, + "loss": 0.82584947, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09204102, + "step": 12978, + "time_per_iteration": 2.5999321937561035 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263935, + "balance_loss_clip": 0.06272239, + "balance_loss_mlp": 0.01253529, + "epoch": 0.780339696377574, + "flos": 26951466395520.0, + "grad_norm": 1.7189824497298882, + "language_loss": 0.6233561, + "learning_rate": 4.85039880416219e-07, + "loss": 0.70007408, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10406494, + "step": 12979, + "time_per_iteration": 4.002735137939453 + }, + { + "auxiliary_loss_clip": 0.0640031, + "auxiliary_loss_mlp": 0.01264611, + "balance_loss_clip": 0.06269379, + "balance_loss_mlp": 0.01255163, + "epoch": 0.780399819630242, + "flos": 27963662117760.0, + "grad_norm": 1.7958108111348887, + "language_loss": 0.77048111, + "learning_rate": 4.847856458505217e-07, + "loss": 0.8471303, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09454346, + "step": 12980, + "time_per_iteration": 2.574740171432495 + }, + { + "auxiliary_loss_clip": 0.06404287, + "auxiliary_loss_mlp": 0.0126621, + "balance_loss_clip": 0.06269396, + "balance_loss_mlp": 0.01256941, + "epoch": 0.78045994288291, + "flos": 22492489772160.0, + "grad_norm": 7.38729106022631, + "language_loss": 0.77965951, + "learning_rate": 4.845314687419046e-07, + "loss": 0.85636449, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09259033, + "step": 12981, + "time_per_iteration": 2.6090612411499023 + }, + { + "auxiliary_loss_clip": 0.06406559, + "auxiliary_loss_mlp": 0.01273892, + "balance_loss_clip": 0.0627367, + "balance_loss_mlp": 0.01264642, + "epoch": 0.7805200661355779, + "flos": 20857259416320.0, + "grad_norm": 1.7019427662247137, + "language_loss": 0.72918165, + "learning_rate": 4.842773491000067e-07, + "loss": 0.80598617, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09246826, + "step": 12982, + "time_per_iteration": 2.538454294204712 + }, + { + "auxiliary_loss_clip": 0.06401584, + "auxiliary_loss_mlp": 0.01261641, + "balance_loss_clip": 0.06268401, + "balance_loss_mlp": 0.01251932, + "epoch": 0.7805801893882459, + "flos": 25673014725120.0, + "grad_norm": 1.3557046111100475, + "language_loss": 0.73713994, + "learning_rate": 4.840232869344636e-07, + "loss": 0.8137722, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0970459, + "step": 12983, + "time_per_iteration": 2.55915904045105 + }, + { + "auxiliary_loss_clip": 0.06403306, + "auxiliary_loss_mlp": 0.01265365, + "balance_loss_clip": 0.06270759, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7806403126409138, + "flos": 11332581154560.0, + "grad_norm": 1.8511733827062056, + "language_loss": 0.7564944, + "learning_rate": 4.837692822549086e-07, + "loss": 0.83318114, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09521484, + "step": 12984, + "time_per_iteration": 3.9226207733154297 + }, + { + "auxiliary_loss_clip": 0.06401315, + "auxiliary_loss_mlp": 0.01261166, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01252345, + "epoch": 0.7807004358935818, + "flos": 19579478578560.0, + "grad_norm": 1.6909183647734616, + "language_loss": 0.81444597, + "learning_rate": 4.835153350709746e-07, + "loss": 0.89107084, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08831787, + "step": 12985, + "time_per_iteration": 2.495833396911621 + }, + { + "auxiliary_loss_clip": 0.06404648, + "auxiliary_loss_mlp": 0.01270247, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.01260007, + "epoch": 0.7807605591462499, + "flos": 19141918957440.0, + "grad_norm": 1.5866346872788593, + "language_loss": 0.7735818, + "learning_rate": 4.832614453922915e-07, + "loss": 0.85033077, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10235596, + "step": 12986, + "time_per_iteration": 2.4942498207092285 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01262193, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01252829, + "epoch": 0.7808206823989178, + "flos": 32382038638080.0, + "grad_norm": 1.540132157025115, + "language_loss": 0.74469846, + "learning_rate": 4.830076132284859e-07, + "loss": 0.82136583, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12987, + "time_per_iteration": 2.6014459133148193 + }, + { + "auxiliary_loss_clip": 0.06307278, + "auxiliary_loss_mlp": 0.01248897, + "balance_loss_clip": 0.06251733, + "balance_loss_mlp": 0.01247845, + "epoch": 0.7808808056515858, + "flos": 55070512381440.0, + "grad_norm": 0.7358853994181496, + "language_loss": 0.55100733, + "learning_rate": 4.82753838589184e-07, + "loss": 0.62656909, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01052094, + "step": 12988, + "time_per_iteration": 3.1363513469696045 + }, + { + "auxiliary_loss_clip": 0.06395964, + "auxiliary_loss_mlp": 0.01273063, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01264235, + "epoch": 0.7809409289042537, + "flos": 12864375244800.0, + "grad_norm": 2.503136362743708, + "language_loss": 0.80932319, + "learning_rate": 4.82500121484009e-07, + "loss": 0.88601345, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08831787, + "step": 12989, + "time_per_iteration": 2.4550793170928955 + }, + { + "auxiliary_loss_clip": 0.06397895, + "auxiliary_loss_mlp": 0.0126169, + "balance_loss_clip": 0.06268378, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7810010521569217, + "flos": 21693329856000.0, + "grad_norm": 1.5548108351785217, + "language_loss": 0.70569479, + "learning_rate": 4.822464619225806e-07, + "loss": 0.78229064, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09002686, + "step": 12990, + "time_per_iteration": 2.534583330154419 + }, + { + "auxiliary_loss_clip": 0.064027, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01255666, + "epoch": 0.7810611754095896, + "flos": 16761560169600.0, + "grad_norm": 2.151540581159162, + "language_loss": 0.78160757, + "learning_rate": 4.819928599145184e-07, + "loss": 0.85829455, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10327148, + "step": 12991, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01257071, + "epoch": 0.7811212986622577, + "flos": 43517489063040.0, + "grad_norm": 1.4386933089332317, + "language_loss": 0.66202235, + "learning_rate": 4.817393154694398e-07, + "loss": 0.73872924, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10528564, + "step": 12992, + "time_per_iteration": 2.712284564971924 + }, + { + "auxiliary_loss_clip": 0.06407847, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7811814219149256, + "flos": 21763377469440.0, + "grad_norm": 1.666565007875902, + "language_loss": 0.61892599, + "learning_rate": 4.814858285969578e-07, + "loss": 0.69564325, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09503174, + "step": 12993, + "time_per_iteration": 2.4966509342193604 + }, + { + "auxiliary_loss_clip": 0.06400012, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 0.06270296, + "balance_loss_mlp": 0.01252532, + "epoch": 0.7812415451675936, + "flos": 24068447763840.0, + "grad_norm": 1.3952221037257373, + "language_loss": 0.68836015, + "learning_rate": 4.812323993066862e-07, + "loss": 0.76498109, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09545898, + "step": 12994, + "time_per_iteration": 2.536137819290161 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06273837, + "balance_loss_mlp": 0.01254703, + "epoch": 0.7813016684202615, + "flos": 18995744309760.0, + "grad_norm": 1.7501216946691078, + "language_loss": 0.69363022, + "learning_rate": 4.809790276082335e-07, + "loss": 0.77031708, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09039307, + "step": 12995, + "time_per_iteration": 2.470670700073242 + }, + { + "auxiliary_loss_clip": 0.06396692, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06268929, + "balance_loss_mlp": 0.0125644, + "epoch": 0.7813617916729295, + "flos": 25267124747520.0, + "grad_norm": 1.5705022516303782, + "language_loss": 0.75361514, + "learning_rate": 4.807257135112088e-07, + "loss": 0.83023554, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08905029, + "step": 12996, + "time_per_iteration": 2.548156261444092 + }, + { + "auxiliary_loss_clip": 0.06408437, + "auxiliary_loss_mlp": 0.01266772, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01256055, + "epoch": 0.7814219149255974, + "flos": 17971557454080.0, + "grad_norm": 2.5240024848484284, + "language_loss": 0.68320543, + "learning_rate": 4.804724570252167e-07, + "loss": 0.75995755, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10723877, + "step": 12997, + "time_per_iteration": 2.4495344161987305 + }, + { + "auxiliary_loss_clip": 0.06410494, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01256018, + "epoch": 0.7814820381782654, + "flos": 25783368952320.0, + "grad_norm": 1.6126365862237693, + "language_loss": 0.82193416, + "learning_rate": 4.802192581598614e-07, + "loss": 0.89870703, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10778809, + "step": 12998, + "time_per_iteration": 2.535696506500244 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266001, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01256166, + "epoch": 0.7815421614309335, + "flos": 20525442048000.0, + "grad_norm": 1.8946982526297624, + "language_loss": 0.7477777, + "learning_rate": 4.799661169247453e-07, + "loss": 0.82450831, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12999, + "time_per_iteration": 2.4902775287628174 + }, + { + "auxiliary_loss_clip": 0.06407912, + "auxiliary_loss_mlp": 0.01262829, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01252517, + "epoch": 0.7816022846836014, + "flos": 21293980496640.0, + "grad_norm": 1.4384947504961985, + "language_loss": 0.84615433, + "learning_rate": 4.797130333294652e-07, + "loss": 0.92286175, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10314941, + "step": 13000, + "time_per_iteration": 2.512596607208252 + }, + { + "auxiliary_loss_clip": 0.0640571, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06273641, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7816624079362694, + "flos": 19214440266240.0, + "grad_norm": 1.8073266601471953, + "language_loss": 0.66751462, + "learning_rate": 4.794600073836192e-07, + "loss": 0.74421835, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10540771, + "step": 13001, + "time_per_iteration": 2.4772894382476807 + }, + { + "auxiliary_loss_clip": 0.06405921, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06271157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.7817225311889373, + "flos": 26111957938560.0, + "grad_norm": 1.5273491192329303, + "language_loss": 0.66959155, + "learning_rate": 4.792070390968027e-07, + "loss": 0.74628222, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09625244, + "step": 13002, + "time_per_iteration": 2.5820791721343994 + }, + { + "auxiliary_loss_clip": 0.06409384, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01256176, + "epoch": 0.7817826544416053, + "flos": 21257195754240.0, + "grad_norm": 2.018800094451087, + "language_loss": 0.73878789, + "learning_rate": 4.78954128478607e-07, + "loss": 0.81554866, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10534668, + "step": 13003, + "time_per_iteration": 2.481661319732666 + }, + { + "auxiliary_loss_clip": 0.06404527, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7818427776942732, + "flos": 19937347367040.0, + "grad_norm": 1.9756660000355053, + "language_loss": 0.62827951, + "learning_rate": 4.787012755386233e-07, + "loss": 0.70498204, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09515381, + "step": 13004, + "time_per_iteration": 2.497821569442749 + }, + { + "auxiliary_loss_clip": 0.0639583, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 0.06268562, + "balance_loss_mlp": 0.01253669, + "epoch": 0.7819029009469413, + "flos": 11368443502080.0, + "grad_norm": 1.7802974888908354, + "language_loss": 0.83142269, + "learning_rate": 4.784484802864403e-07, + "loss": 0.90800571, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08807373, + "step": 13005, + "time_per_iteration": 2.455112934112549 + }, + { + "auxiliary_loss_clip": 0.06402773, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06270364, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7819630241996092, + "flos": 24286053617280.0, + "grad_norm": 1.9304449854635368, + "language_loss": 0.73000956, + "learning_rate": 4.781957427316432e-07, + "loss": 0.80668867, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09973145, + "step": 13006, + "time_per_iteration": 3.923842191696167 + }, + { + "auxiliary_loss_clip": 0.06406109, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06271446, + "balance_loss_mlp": 0.01252891, + "epoch": 0.7820231474522772, + "flos": 22715168797440.0, + "grad_norm": 1.5911839097464888, + "language_loss": 0.72339863, + "learning_rate": 4.779430628838157e-07, + "loss": 0.80009413, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10540771, + "step": 13007, + "time_per_iteration": 2.5166056156158447 + }, + { + "auxiliary_loss_clip": 0.06406694, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.0125782, + "epoch": 0.7820832707049451, + "flos": 20053571379840.0, + "grad_norm": 2.020015501308364, + "language_loss": 0.69036144, + "learning_rate": 4.776904407525397e-07, + "loss": 0.76710731, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10070801, + "step": 13008, + "time_per_iteration": 2.495736837387085 + }, + { + "auxiliary_loss_clip": 0.064032, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06269944, + "balance_loss_mlp": 0.01253457, + "epoch": 0.7821433939576131, + "flos": 27170246206080.0, + "grad_norm": 1.7298477969217696, + "language_loss": 0.69919395, + "learning_rate": 4.774378763473954e-07, + "loss": 0.77586997, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10949707, + "step": 13009, + "time_per_iteration": 2.5899367332458496 + }, + { + "auxiliary_loss_clip": 0.06399304, + "auxiliary_loss_mlp": 0.01262145, + "balance_loss_clip": 0.06269169, + "balance_loss_mlp": 0.01252781, + "epoch": 0.782203517210281, + "flos": 22608755712000.0, + "grad_norm": 1.790636522261297, + "language_loss": 0.81948966, + "learning_rate": 4.771853696779586e-07, + "loss": 0.89610416, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09362793, + "step": 13010, + "time_per_iteration": 2.5066049098968506 + }, + { + "auxiliary_loss_clip": 0.06400339, + "auxiliary_loss_mlp": 0.01262085, + "balance_loss_clip": 0.06270656, + "balance_loss_mlp": 0.01252692, + "epoch": 0.782263640462949, + "flos": 29067539806080.0, + "grad_norm": 1.385682436411659, + "language_loss": 0.62627685, + "learning_rate": 4.76932920753806e-07, + "loss": 0.70290112, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09399414, + "step": 13011, + "time_per_iteration": 2.6026289463043213 + }, + { + "auxiliary_loss_clip": 0.06399235, + "auxiliary_loss_mlp": 0.0126419, + "balance_loss_clip": 0.0626906, + "balance_loss_mlp": 0.01255306, + "epoch": 0.782323763715617, + "flos": 25306215477120.0, + "grad_norm": 1.6427811316724177, + "language_loss": 0.70159376, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.77822804, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08883667, + "step": 13012, + "time_per_iteration": 2.53303861618042 + }, + { + "auxiliary_loss_clip": 0.0630969, + "auxiliary_loss_mlp": 0.01250424, + "balance_loss_clip": 0.06253915, + "balance_loss_mlp": 0.01249417, + "epoch": 0.782383886968285, + "flos": 65216548195200.0, + "grad_norm": 0.6922289036219499, + "language_loss": 0.55011511, + "learning_rate": 4.764281961796395e-07, + "loss": 0.62571621, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100708, + "step": 13013, + "time_per_iteration": 3.228905439376831 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01264895, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01254708, + "epoch": 0.782444010220953, + "flos": 18411297281280.0, + "grad_norm": 1.7267010887219136, + "language_loss": 0.6554383, + "learning_rate": 4.76175920548765e-07, + "loss": 0.73218066, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10186768, + "step": 13014, + "time_per_iteration": 2.4842281341552734 + }, + { + "auxiliary_loss_clip": 0.06309456, + "auxiliary_loss_mlp": 0.01249284, + "balance_loss_clip": 0.06253707, + "balance_loss_mlp": 0.01248232, + "epoch": 0.7825041334736209, + "flos": 63977145327360.0, + "grad_norm": 0.6946375412557042, + "language_loss": 0.58183634, + "learning_rate": 4.759237027014524e-07, + "loss": 0.65742373, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01052094, + "step": 13015, + "time_per_iteration": 4.588924169540405 + }, + { + "auxiliary_loss_clip": 0.06401119, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06269481, + "balance_loss_mlp": 0.01258141, + "epoch": 0.7825642567262889, + "flos": 20345585258880.0, + "grad_norm": 1.703957116588016, + "language_loss": 0.75081736, + "learning_rate": 4.756715426472666e-07, + "loss": 0.8275032, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09326172, + "step": 13016, + "time_per_iteration": 2.5329108238220215 + }, + { + "auxiliary_loss_clip": 0.06404392, + "auxiliary_loss_mlp": 0.01262942, + "balance_loss_clip": 0.0627065, + "balance_loss_mlp": 0.01252303, + "epoch": 0.7826243799789568, + "flos": 20268577756800.0, + "grad_norm": 1.8073604316882006, + "language_loss": 0.75204456, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.82871789, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10644531, + "step": 13017, + "time_per_iteration": 2.475156307220459 + }, + { + "auxiliary_loss_clip": 0.06402843, + "auxiliary_loss_mlp": 0.01267244, + "balance_loss_clip": 0.06268843, + "balance_loss_mlp": 0.01256974, + "epoch": 0.7826845032316249, + "flos": 21137743359360.0, + "grad_norm": 2.040801926545799, + "language_loss": 0.76392686, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.84062773, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10272217, + "step": 13018, + "time_per_iteration": 2.487426280975342 + }, + { + "auxiliary_loss_clip": 0.06399854, + "auxiliary_loss_mlp": 0.01266755, + "balance_loss_clip": 0.06267899, + "balance_loss_mlp": 0.01256652, + "epoch": 0.7827446264842928, + "flos": 22498862682240.0, + "grad_norm": 1.372243474464688, + "language_loss": 0.77303207, + "learning_rate": 4.749154093390708e-07, + "loss": 0.84969819, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10101318, + "step": 13019, + "time_per_iteration": 3.9929661750793457 + }, + { + "auxiliary_loss_clip": 0.06402994, + "auxiliary_loss_mlp": 0.01262289, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7828047497369608, + "flos": 28848298798080.0, + "grad_norm": 1.5302046245116039, + "language_loss": 0.6745941, + "learning_rate": 4.746634805529852e-07, + "loss": 0.75124693, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09570312, + "step": 13020, + "time_per_iteration": 2.564709424972534 + }, + { + "auxiliary_loss_clip": 0.06400368, + "auxiliary_loss_mlp": 0.012665, + "balance_loss_clip": 0.0626877, + "balance_loss_mlp": 0.01256397, + "epoch": 0.7828648729896287, + "flos": 23264298529920.0, + "grad_norm": 2.6855687872649825, + "language_loss": 0.62745917, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.70412791, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10101318, + "step": 13021, + "time_per_iteration": 2.4964163303375244 + }, + { + "auxiliary_loss_clip": 0.06403099, + "auxiliary_loss_mlp": 0.01264616, + "balance_loss_clip": 0.06270363, + "balance_loss_mlp": 0.01255592, + "epoch": 0.7829249962422967, + "flos": 25272826824960.0, + "grad_norm": 1.5874593754725228, + "language_loss": 0.69790453, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.77458167, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.090271, + "step": 13022, + "time_per_iteration": 2.5415072441101074 + }, + { + "auxiliary_loss_clip": 0.06309162, + "auxiliary_loss_mlp": 0.01253506, + "balance_loss_clip": 0.06253611, + "balance_loss_mlp": 0.01252549, + "epoch": 0.7829851194949646, + "flos": 70742087441280.0, + "grad_norm": 0.6386935126948231, + "language_loss": 0.56138313, + "learning_rate": 4.739080412784131e-07, + "loss": 0.6370098, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.009552, + "step": 13023, + "time_per_iteration": 4.637472867965698 + }, + { + "auxiliary_loss_clip": 0.06393711, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06267409, + "balance_loss_mlp": 0.01256451, + "epoch": 0.7830452427476327, + "flos": 25666977231360.0, + "grad_norm": 1.576482021290812, + "language_loss": 0.67401826, + "learning_rate": 4.736563439132792e-07, + "loss": 0.75061107, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09118652, + "step": 13024, + "time_per_iteration": 2.538425922393799 + }, + { + "auxiliary_loss_clip": 0.06403638, + "auxiliary_loss_mlp": 0.01263953, + "balance_loss_clip": 0.06269067, + "balance_loss_mlp": 0.0125357, + "epoch": 0.7831053660003006, + "flos": 22791002342400.0, + "grad_norm": 1.5665497407988729, + "language_loss": 0.77940929, + "learning_rate": 4.734047044272498e-07, + "loss": 0.85608524, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10369873, + "step": 13025, + "time_per_iteration": 2.5431177616119385 + }, + { + "auxiliary_loss_clip": 0.0640173, + "auxiliary_loss_mlp": 0.01265493, + "balance_loss_clip": 0.06270472, + "balance_loss_mlp": 0.01256302, + "epoch": 0.7831654892529686, + "flos": 25819399008000.0, + "grad_norm": 1.644612426825064, + "language_loss": 0.7874493, + "learning_rate": 4.731531228298673e-07, + "loss": 0.86412156, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09197998, + "step": 13026, + "time_per_iteration": 2.556727647781372 + }, + { + "auxiliary_loss_clip": 0.06404313, + "auxiliary_loss_mlp": 0.01262471, + "balance_loss_clip": 0.06272115, + "balance_loss_mlp": 0.01253006, + "epoch": 0.7832256125056366, + "flos": 20776897751040.0, + "grad_norm": 2.5804756283092334, + "language_loss": 0.75804269, + "learning_rate": 4.729015991306715e-07, + "loss": 0.83471048, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09466553, + "step": 13027, + "time_per_iteration": 2.4878506660461426 + }, + { + "auxiliary_loss_clip": 0.0639909, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06269808, + "balance_loss_mlp": 0.01255978, + "epoch": 0.7832857357583045, + "flos": 21512886088320.0, + "grad_norm": 1.7061440421315746, + "language_loss": 0.70765603, + "learning_rate": 4.726501333391997e-07, + "loss": 0.78430474, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09802246, + "step": 13028, + "time_per_iteration": 2.498478651046753 + }, + { + "auxiliary_loss_clip": 0.06406339, + "auxiliary_loss_mlp": 0.01268084, + "balance_loss_clip": 0.06271327, + "balance_loss_mlp": 0.01257874, + "epoch": 0.7833458590109725, + "flos": 18083714544000.0, + "grad_norm": 1.9644194417750374, + "language_loss": 0.68658125, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.76332551, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 13029, + "time_per_iteration": 2.580122470855713 + }, + { + "auxiliary_loss_clip": 0.06403092, + "auxiliary_loss_mlp": 0.01267866, + "balance_loss_clip": 0.0626725, + "balance_loss_mlp": 0.01258001, + "epoch": 0.7834059822636404, + "flos": 28295521413120.0, + "grad_norm": 1.7391755665392523, + "language_loss": 0.81014347, + "learning_rate": 4.721473755175698e-07, + "loss": 0.88685304, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09869385, + "step": 13030, + "time_per_iteration": 2.5314316749572754 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.01261968, + "balance_loss_clip": 0.06269055, + "balance_loss_mlp": 0.01251949, + "epoch": 0.7834661055163085, + "flos": 31694281125120.0, + "grad_norm": 1.5048813517509494, + "language_loss": 0.70804811, + "learning_rate": 4.71896083506476e-07, + "loss": 0.78471744, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10021973, + "step": 13031, + "time_per_iteration": 2.5823378562927246 + }, + { + "auxiliary_loss_clip": 0.06405063, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06270566, + "balance_loss_mlp": 0.01257079, + "epoch": 0.7835262287689764, + "flos": 12938238218880.0, + "grad_norm": 2.7115393333323468, + "language_loss": 0.78693461, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.86365044, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09442139, + "step": 13032, + "time_per_iteration": 2.4609038829803467 + }, + { + "auxiliary_loss_clip": 0.06404404, + "auxiliary_loss_mlp": 0.01268456, + "balance_loss_clip": 0.06269069, + "balance_loss_mlp": 0.01258317, + "epoch": 0.7835863520216444, + "flos": 16148671879680.0, + "grad_norm": 1.9002530639505248, + "language_loss": 0.63003838, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.70676696, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10137939, + "step": 13033, + "time_per_iteration": 2.500108242034912 + }, + { + "auxiliary_loss_clip": 0.06404372, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.01253492, + "epoch": 0.7836464752743123, + "flos": 11514660076800.0, + "grad_norm": 1.5173952682400234, + "language_loss": 0.72150695, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.79818583, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10021973, + "step": 13034, + "time_per_iteration": 2.4920992851257324 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06272385, + "balance_loss_mlp": 0.01258685, + "epoch": 0.7837065985269803, + "flos": 18229637629440.0, + "grad_norm": 1.7491156010672833, + "language_loss": 0.7212472, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.79799139, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1005249, + "step": 13035, + "time_per_iteration": 2.482640027999878 + }, + { + "auxiliary_loss_clip": 0.06404319, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06270225, + "balance_loss_mlp": 0.01260541, + "epoch": 0.7837667217796482, + "flos": 24761404229760.0, + "grad_norm": 2.0189753157396373, + "language_loss": 0.66216964, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.73892099, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 13036, + "time_per_iteration": 2.5221505165100098 + }, + { + "auxiliary_loss_clip": 0.06407806, + "auxiliary_loss_mlp": 0.01272324, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01260981, + "epoch": 0.7838268450323163, + "flos": 22389766266240.0, + "grad_norm": 2.337708376501524, + "language_loss": 0.73523962, + "learning_rate": 4.703895486362031e-07, + "loss": 0.81204098, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11340332, + "step": 13037, + "time_per_iteration": 2.5027549266815186 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01265411, + "balance_loss_clip": 0.06268933, + "balance_loss_mlp": 0.01255099, + "epoch": 0.7838869682849842, + "flos": 19506370291200.0, + "grad_norm": 2.111880919052157, + "language_loss": 0.60144168, + "learning_rate": 4.701386624460717e-07, + "loss": 0.67811918, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10321045, + "step": 13038, + "time_per_iteration": 2.4813334941864014 + }, + { + "auxiliary_loss_clip": 0.06401114, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 0.06270541, + "balance_loss_mlp": 0.01255484, + "epoch": 0.7839470915376522, + "flos": 32901553152000.0, + "grad_norm": 1.5605584713979823, + "language_loss": 0.68332416, + "learning_rate": 4.698878342684349e-07, + "loss": 0.75998366, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09350586, + "step": 13039, + "time_per_iteration": 2.616943359375 + }, + { + "auxiliary_loss_clip": 0.06395827, + "auxiliary_loss_mlp": 0.01261469, + "balance_loss_clip": 0.06267862, + "balance_loss_mlp": 0.01253244, + "epoch": 0.7840072147903202, + "flos": 29683153353600.0, + "grad_norm": 1.67583580210183, + "language_loss": 0.69978261, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.77635556, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08227539, + "step": 13040, + "time_per_iteration": 2.575289726257324 + }, + { + "auxiliary_loss_clip": 0.06404934, + "auxiliary_loss_mlp": 0.01266779, + "balance_loss_clip": 0.06269483, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7840673380429881, + "flos": 18192601324800.0, + "grad_norm": 1.9496315301470044, + "language_loss": 0.67735672, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.75407386, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10272217, + "step": 13041, + "time_per_iteration": 2.5014941692352295 + }, + { + "auxiliary_loss_clip": 0.06304124, + "auxiliary_loss_mlp": 0.01252304, + "balance_loss_clip": 0.06248714, + "balance_loss_mlp": 0.01251298, + "epoch": 0.7841274612956561, + "flos": 66365694616320.0, + "grad_norm": 0.8059954256946308, + "language_loss": 0.57385874, + "learning_rate": 4.691356979055998e-07, + "loss": 0.649423, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01005554, + "step": 13042, + "time_per_iteration": 3.0931692123413086 + }, + { + "auxiliary_loss_clip": 0.06405251, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.0125564, + "epoch": 0.784187584548324, + "flos": 26655259812480.0, + "grad_norm": 2.4178981590312105, + "language_loss": 0.84631729, + "learning_rate": 4.688851018730369e-07, + "loss": 0.92301869, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09246826, + "step": 13043, + "time_per_iteration": 2.5591118335723877 + }, + { + "auxiliary_loss_clip": 0.0639644, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06267819, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7842477078009921, + "flos": 25747422750720.0, + "grad_norm": 1.364522654088724, + "language_loss": 0.88473415, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.96134579, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08917236, + "step": 13044, + "time_per_iteration": 2.5349628925323486 + }, + { + "auxiliary_loss_clip": 0.06410815, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06271672, + "balance_loss_mlp": 0.01259132, + "epoch": 0.78430783105366, + "flos": 21987398160000.0, + "grad_norm": 1.6046981571270753, + "language_loss": 0.79284698, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.86964685, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10040283, + "step": 13045, + "time_per_iteration": 3.9486923217773438 + }, + { + "auxiliary_loss_clip": 0.06400262, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626996, + "balance_loss_mlp": 0.01252862, + "epoch": 0.784367954306328, + "flos": 23849122901760.0, + "grad_norm": 1.3651332690132787, + "language_loss": 0.72812819, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.80475229, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09277344, + "step": 13046, + "time_per_iteration": 2.5449562072753906 + }, + { + "auxiliary_loss_clip": 0.06399076, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06269773, + "balance_loss_mlp": 0.01253036, + "epoch": 0.7844280775589959, + "flos": 24833548195200.0, + "grad_norm": 1.4113250051922885, + "language_loss": 0.63375705, + "learning_rate": 4.678832984380809e-07, + "loss": 0.71037436, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09619141, + "step": 13047, + "time_per_iteration": 2.555187940597534 + }, + { + "auxiliary_loss_clip": 0.06397624, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01253892, + "epoch": 0.7844882008116639, + "flos": 22462245648000.0, + "grad_norm": 1.5637844175125322, + "language_loss": 0.73288012, + "learning_rate": 4.676329928006515e-07, + "loss": 0.8094908, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09552002, + "step": 13048, + "time_per_iteration": 2.500697374343872 + }, + { + "auxiliary_loss_clip": 0.06406703, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06269943, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7845483240643318, + "flos": 26111203251840.0, + "grad_norm": 1.7122203145326895, + "language_loss": 0.74653435, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.8232491, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10198975, + "step": 13049, + "time_per_iteration": 2.525059700012207 + }, + { + "auxiliary_loss_clip": 0.06406355, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06269609, + "balance_loss_mlp": 0.0125279, + "epoch": 0.7846084473169999, + "flos": 19360363351680.0, + "grad_norm": 1.8695615724941215, + "language_loss": 0.72989309, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.80659556, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11114502, + "step": 13050, + "time_per_iteration": 2.502976655960083 + }, + { + "auxiliary_loss_clip": 0.0640547, + "auxiliary_loss_mlp": 0.01264968, + "balance_loss_clip": 0.06273313, + "balance_loss_mlp": 0.01255658, + "epoch": 0.7846685705696678, + "flos": 23331620885760.0, + "grad_norm": 1.8649850140502078, + "language_loss": 0.73895067, + "learning_rate": 4.668824245713825e-07, + "loss": 0.81565511, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09320068, + "step": 13051, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.06407961, + "auxiliary_loss_mlp": 0.01270446, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01259622, + "epoch": 0.7847286938223358, + "flos": 35818379706240.0, + "grad_norm": 2.0718578838618527, + "language_loss": 0.73053241, + "learning_rate": 4.666323514209227e-07, + "loss": 0.80731648, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10827637, + "step": 13052, + "time_per_iteration": 2.6086881160736084 + }, + { + "auxiliary_loss_clip": 0.06395121, + "auxiliary_loss_mlp": 0.01262593, + "balance_loss_clip": 0.06268048, + "balance_loss_mlp": 0.01253241, + "epoch": 0.7847888170750038, + "flos": 18483986298240.0, + "grad_norm": 1.9107364869927201, + "language_loss": 0.69673455, + "learning_rate": 4.663823364159183e-07, + "loss": 0.77331167, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09344482, + "step": 13053, + "time_per_iteration": 2.471815586090088 + }, + { + "auxiliary_loss_clip": 0.06401109, + "auxiliary_loss_mlp": 0.01260742, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01251807, + "epoch": 0.7848489403276717, + "flos": 25126190979840.0, + "grad_norm": 1.8867575378742971, + "language_loss": 0.70537353, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.78199208, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08929443, + "step": 13054, + "time_per_iteration": 2.5749151706695557 + }, + { + "auxiliary_loss_clip": 0.06405072, + "auxiliary_loss_mlp": 0.01264324, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253971, + "epoch": 0.7849090635803397, + "flos": 26509169018880.0, + "grad_norm": 1.610774832305801, + "language_loss": 0.76244235, + "learning_rate": 4.658824808801938e-07, + "loss": 0.8391363, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10357666, + "step": 13055, + "time_per_iteration": 3.9623241424560547 + }, + { + "auxiliary_loss_clip": 0.06407758, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01253922, + "epoch": 0.7849691868330076, + "flos": 20965978488960.0, + "grad_norm": 1.9205969834144307, + "language_loss": 0.75488204, + "learning_rate": 4.656326403684283e-07, + "loss": 0.83159614, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09729004, + "step": 13056, + "time_per_iteration": 2.4767720699310303 + }, + { + "auxiliary_loss_clip": 0.06400058, + "auxiliary_loss_mlp": 0.01266253, + "balance_loss_clip": 0.06269453, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7850293100856757, + "flos": 26074628144640.0, + "grad_norm": 1.52924099348992, + "language_loss": 0.70278704, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.77945018, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09423828, + "step": 13057, + "time_per_iteration": 2.5652661323547363 + }, + { + "auxiliary_loss_clip": 0.06407446, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7850894333383436, + "flos": 22498443411840.0, + "grad_norm": 2.33768341300027, + "language_loss": 0.76614606, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.84288156, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09326172, + "step": 13058, + "time_per_iteration": 2.479261875152588 + }, + { + "auxiliary_loss_clip": 0.06401752, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.06268829, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7851495565910116, + "flos": 20564952048000.0, + "grad_norm": 1.4951701283618941, + "language_loss": 0.71132874, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.78798681, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0982666, + "step": 13059, + "time_per_iteration": 3.9393692016601562 + }, + { + "auxiliary_loss_clip": 0.06412531, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.06272064, + "balance_loss_mlp": 0.01254842, + "epoch": 0.7852096798436795, + "flos": 15930353266560.0, + "grad_norm": 1.897902046144861, + "language_loss": 0.77542412, + "learning_rate": 4.646338602497144e-07, + "loss": 0.85220468, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10687256, + "step": 13060, + "time_per_iteration": 2.4718637466430664 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7852698030963475, + "flos": 19068265618560.0, + "grad_norm": 1.8441572725485498, + "language_loss": 0.76857173, + "learning_rate": 4.643843107494654e-07, + "loss": 0.84523541, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.1003418, + "step": 13061, + "time_per_iteration": 2.4667510986328125 + }, + { + "auxiliary_loss_clip": 0.06403807, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01257738, + "epoch": 0.7853299263490154, + "flos": 24651259637760.0, + "grad_norm": 1.784620382168378, + "language_loss": 0.74518055, + "learning_rate": 4.641348194799164e-07, + "loss": 0.82188833, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09234619, + "step": 13062, + "time_per_iteration": 2.5519487857818604 + }, + { + "auxiliary_loss_clip": 0.06401968, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7853900496016835, + "flos": 22024518318720.0, + "grad_norm": 1.444565661483555, + "language_loss": 0.6925329, + "learning_rate": 4.638853864505297e-07, + "loss": 0.76918697, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08935547, + "step": 13063, + "time_per_iteration": 3.896639585494995 + }, + { + "auxiliary_loss_clip": 0.064018, + "auxiliary_loss_mlp": 0.01262061, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01252858, + "epoch": 0.7854501728543514, + "flos": 30235343760000.0, + "grad_norm": 1.975335557654558, + "language_loss": 0.72825849, + "learning_rate": 4.636360116707625e-07, + "loss": 0.80489707, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09210205, + "step": 13064, + "time_per_iteration": 2.567704200744629 + }, + { + "auxiliary_loss_clip": 0.06403325, + "auxiliary_loss_mlp": 0.01265412, + "balance_loss_clip": 0.0626822, + "balance_loss_mlp": 0.01255583, + "epoch": 0.7855102961070194, + "flos": 18849695443200.0, + "grad_norm": 1.5878092382689184, + "language_loss": 0.67936897, + "learning_rate": 4.633866951500718e-07, + "loss": 0.75605631, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09838867, + "step": 13065, + "time_per_iteration": 2.470630168914795 + }, + { + "auxiliary_loss_clip": 0.06404464, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06273209, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7855704193596874, + "flos": 22316574124800.0, + "grad_norm": 3.292833578537852, + "language_loss": 0.75992739, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.83663952, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09576416, + "step": 13066, + "time_per_iteration": 2.5433592796325684 + }, + { + "auxiliary_loss_clip": 0.06310245, + "auxiliary_loss_mlp": 0.01255234, + "balance_loss_clip": 0.06254524, + "balance_loss_mlp": 0.01254291, + "epoch": 0.7856305426123553, + "flos": 60024224638080.0, + "grad_norm": 0.6974485320329921, + "language_loss": 0.53405064, + "learning_rate": 4.628882369237346e-07, + "loss": 0.60970545, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.55664062, + "router_z_loss_mlp": 0.00940704, + "step": 13067, + "time_per_iteration": 3.3080852031707764 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06269915, + "balance_loss_mlp": 0.012542, + "epoch": 0.7856906658650233, + "flos": 21874528310400.0, + "grad_norm": 1.4327852205336962, + "language_loss": 0.68056738, + "learning_rate": 4.62639095236989e-07, + "loss": 0.75725186, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.0993042, + "step": 13068, + "time_per_iteration": 2.5869228839874268 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7857507891176913, + "flos": 23629672258560.0, + "grad_norm": 1.764601675005712, + "language_loss": 0.68482268, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.76145768, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09112549, + "step": 13069, + "time_per_iteration": 2.5437350273132324 + }, + { + "auxiliary_loss_clip": 0.06404187, + "auxiliary_loss_mlp": 0.01263836, + "balance_loss_clip": 0.06271039, + "balance_loss_mlp": 0.01253984, + "epoch": 0.7858109123703593, + "flos": 25527091639680.0, + "grad_norm": 1.7842031457039946, + "language_loss": 0.76992953, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.84660977, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09857178, + "step": 13070, + "time_per_iteration": 2.5414490699768066 + }, + { + "auxiliary_loss_clip": 0.06396306, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06267333, + "balance_loss_mlp": 0.01255298, + "epoch": 0.7858710356230272, + "flos": 17463195532800.0, + "grad_norm": 1.5496724726178355, + "language_loss": 0.6583572, + "learning_rate": 4.618920199958083e-07, + "loss": 0.73496032, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08703613, + "step": 13071, + "time_per_iteration": 2.469886541366577 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01264805, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7859311588756952, + "flos": 24686367298560.0, + "grad_norm": 1.6110892083187893, + "language_loss": 0.73717749, + "learning_rate": 4.616431115532442e-07, + "loss": 0.81389678, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09857178, + "step": 13072, + "time_per_iteration": 2.519676923751831 + }, + { + "auxiliary_loss_clip": 0.06403338, + "auxiliary_loss_mlp": 0.01268392, + "balance_loss_clip": 0.06269255, + "balance_loss_mlp": 0.01257288, + "epoch": 0.7859912821283631, + "flos": 21805654654080.0, + "grad_norm": 1.8631403345440603, + "language_loss": 0.71523631, + "learning_rate": 4.613942614453268e-07, + "loss": 0.79195362, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.11108398, + "step": 13073, + "time_per_iteration": 2.5105767250061035 + }, + { + "auxiliary_loss_clip": 0.06404594, + "auxiliary_loss_mlp": 0.01265595, + "balance_loss_clip": 0.06270787, + "balance_loss_mlp": 0.01255295, + "epoch": 0.7860514053810311, + "flos": 20853108639360.0, + "grad_norm": 1.5490527180797131, + "language_loss": 0.76964885, + "learning_rate": 4.611454696814938e-07, + "loss": 0.84635073, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10302734, + "step": 13074, + "time_per_iteration": 2.4855496883392334 + }, + { + "auxiliary_loss_clip": 0.06398475, + "auxiliary_loss_mlp": 0.01266136, + "balance_loss_clip": 0.06269623, + "balance_loss_mlp": 0.01256504, + "epoch": 0.786111528633699, + "flos": 24322461016320.0, + "grad_norm": 1.8530422938464213, + "language_loss": 0.75361305, + "learning_rate": 4.608967362711782e-07, + "loss": 0.8302592, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09637451, + "step": 13075, + "time_per_iteration": 2.5396533012390137 + }, + { + "auxiliary_loss_clip": 0.06403027, + "auxiliary_loss_mlp": 0.01261838, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01252677, + "epoch": 0.7861716518863671, + "flos": 24360126226560.0, + "grad_norm": 1.639337001432503, + "language_loss": 0.68816268, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.7648114, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09161377, + "step": 13076, + "time_per_iteration": 2.507643461227417 + }, + { + "auxiliary_loss_clip": 0.06400099, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06270486, + "balance_loss_mlp": 0.01258461, + "epoch": 0.786231775139035, + "flos": 14026728683520.0, + "grad_norm": 2.3148125900767065, + "language_loss": 0.79768962, + "learning_rate": 4.603994445488282e-07, + "loss": 0.87436622, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09088135, + "step": 13077, + "time_per_iteration": 2.470398426055908 + }, + { + "auxiliary_loss_clip": 0.06401075, + "auxiliary_loss_mlp": 0.0126456, + "balance_loss_clip": 0.06269512, + "balance_loss_mlp": 0.01255, + "epoch": 0.786291898391703, + "flos": 33731795733120.0, + "grad_norm": 1.615733156524089, + "language_loss": 0.70986831, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.78652471, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09552002, + "step": 13078, + "time_per_iteration": 2.6685726642608643 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01265393, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01255875, + "epoch": 0.786352021644371, + "flos": 25818476613120.0, + "grad_norm": 1.4651879237887804, + "language_loss": 0.81708902, + "learning_rate": 4.599023863537039e-07, + "loss": 0.89375478, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09509277, + "step": 13079, + "time_per_iteration": 2.5660455226898193 + }, + { + "auxiliary_loss_clip": 0.0639349, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.0626843, + "balance_loss_mlp": 0.01260209, + "epoch": 0.7864121448970389, + "flos": 28918010995200.0, + "grad_norm": 1.4929435922037373, + "language_loss": 0.68745899, + "learning_rate": 4.596539448524146e-07, + "loss": 0.76408732, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09124756, + "step": 13080, + "time_per_iteration": 2.5500268936157227 + }, + { + "auxiliary_loss_clip": 0.06401475, + "auxiliary_loss_mlp": 0.0126541, + "balance_loss_clip": 0.06269769, + "balance_loss_mlp": 0.012552, + "epoch": 0.7864722681497069, + "flos": 19214943390720.0, + "grad_norm": 1.6425983942021263, + "language_loss": 0.70132333, + "learning_rate": 4.594055617612016e-07, + "loss": 0.77799213, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10211182, + "step": 13081, + "time_per_iteration": 2.508885622024536 + }, + { + "auxiliary_loss_clip": 0.06405645, + "auxiliary_loss_mlp": 0.01264379, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7865323914023749, + "flos": 21878008254720.0, + "grad_norm": 2.0927961593492737, + "language_loss": 0.68778342, + "learning_rate": 4.591572370894838e-07, + "loss": 0.76448363, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09362793, + "step": 13082, + "time_per_iteration": 2.5268876552581787 + }, + { + "auxiliary_loss_clip": 0.0639787, + "auxiliary_loss_mlp": 0.01264108, + "balance_loss_clip": 0.0626892, + "balance_loss_mlp": 0.01254584, + "epoch": 0.7865925146550429, + "flos": 25527385128960.0, + "grad_norm": 1.5194289662582627, + "language_loss": 0.66099608, + "learning_rate": 4.589089708466789e-07, + "loss": 0.73761588, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09527588, + "step": 13083, + "time_per_iteration": 2.5328421592712402 + }, + { + "auxiliary_loss_clip": 0.06405569, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.0627001, + "balance_loss_mlp": 0.01255424, + "epoch": 0.7866526379077108, + "flos": 19103121717120.0, + "grad_norm": 2.2309831052205387, + "language_loss": 0.74742764, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.82414663, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10906982, + "step": 13084, + "time_per_iteration": 3.8599534034729004 + }, + { + "auxiliary_loss_clip": 0.06398539, + "auxiliary_loss_mlp": 0.01265, + "balance_loss_clip": 0.0626938, + "balance_loss_mlp": 0.01255678, + "epoch": 0.7867127611603788, + "flos": 16178245171200.0, + "grad_norm": 1.7096991986275847, + "language_loss": 0.7048676, + "learning_rate": 4.584126136854591e-07, + "loss": 0.7815029, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09313965, + "step": 13085, + "time_per_iteration": 2.4548091888427734 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.0626765, + "balance_loss_mlp": 0.01256238, + "epoch": 0.7867728844130467, + "flos": 20779329519360.0, + "grad_norm": 1.9009229295966659, + "language_loss": 0.72873515, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.80544972, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10266113, + "step": 13086, + "time_per_iteration": 2.4679646492004395 + }, + { + "auxiliary_loss_clip": 0.06401749, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.01256132, + "epoch": 0.7868330076657147, + "flos": 21766186581120.0, + "grad_norm": 1.6915622771395795, + "language_loss": 0.75259304, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.82926041, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.08862305, + "step": 13087, + "time_per_iteration": 2.4868595600128174 + }, + { + "auxiliary_loss_clip": 0.06401436, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.01257431, + "epoch": 0.7868931309183826, + "flos": 25707451553280.0, + "grad_norm": 1.5159741083416707, + "language_loss": 0.71450847, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.79118818, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09088135, + "step": 13088, + "time_per_iteration": 2.5030412673950195 + }, + { + "auxiliary_loss_clip": 0.06311038, + "auxiliary_loss_mlp": 0.01250466, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.012495, + "epoch": 0.7869532541710507, + "flos": 64666579921920.0, + "grad_norm": 0.663330829427475, + "language_loss": 0.55047309, + "learning_rate": 4.574206009240431e-07, + "loss": 0.62608814, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00964355, + "step": 13089, + "time_per_iteration": 3.1940503120422363 + }, + { + "auxiliary_loss_clip": 0.06311715, + "auxiliary_loss_mlp": 0.01259019, + "balance_loss_clip": 0.0625612, + "balance_loss_mlp": 0.01257986, + "epoch": 0.7870133774237186, + "flos": 67475651725440.0, + "grad_norm": 0.7045101458235505, + "language_loss": 0.49567109, + "learning_rate": 4.571727439470976e-07, + "loss": 0.57137847, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01033783, + "step": 13090, + "time_per_iteration": 3.2323949337005615 + }, + { + "auxiliary_loss_clip": 0.06399588, + "auxiliary_loss_mlp": 0.01264155, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.01255006, + "epoch": 0.7870735006763866, + "flos": 26075592466560.0, + "grad_norm": 1.3918495812457483, + "language_loss": 0.84173477, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.91837221, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0914917, + "step": 13091, + "time_per_iteration": 2.5303354263305664 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01253337, + "balance_loss_clip": 0.0625616, + "balance_loss_mlp": 0.01252234, + "epoch": 0.7871336239290546, + "flos": 70310439532800.0, + "grad_norm": 0.6984253533928471, + "language_loss": 0.63944566, + "learning_rate": 4.566772055150947e-07, + "loss": 0.71509528, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01104736, + "step": 13092, + "time_per_iteration": 3.186598300933838 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01264101, + "balance_loss_clip": 0.06272719, + "balance_loss_mlp": 0.01254749, + "epoch": 0.7871937471817225, + "flos": 15784010910720.0, + "grad_norm": 2.677362510314703, + "language_loss": 0.79394525, + "learning_rate": 4.564295240788285e-07, + "loss": 0.87063861, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09350586, + "step": 13093, + "time_per_iteration": 2.4746809005737305 + }, + { + "auxiliary_loss_clip": 0.06399192, + "auxiliary_loss_mlp": 0.01262897, + "balance_loss_clip": 0.06268847, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7872538704343905, + "flos": 20491466417280.0, + "grad_norm": 1.6510022815590566, + "language_loss": 0.75735247, + "learning_rate": 4.561819011749106e-07, + "loss": 0.83397341, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0914917, + "step": 13094, + "time_per_iteration": 4.020095109939575 + }, + { + "auxiliary_loss_clip": 0.06407712, + "auxiliary_loss_mlp": 0.01266386, + "balance_loss_clip": 0.06273055, + "balance_loss_mlp": 0.01256719, + "epoch": 0.7873139936870585, + "flos": 25089699726720.0, + "grad_norm": 1.5509563724400146, + "language_loss": 0.79440391, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.87114489, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09674072, + "step": 13095, + "time_per_iteration": 2.609463930130005 + }, + { + "auxiliary_loss_clip": 0.06408177, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06271407, + "balance_loss_mlp": 0.01255425, + "epoch": 0.7873741169397265, + "flos": 30891054286080.0, + "grad_norm": 1.609249488827552, + "language_loss": 0.68118989, + "learning_rate": 4.556868310016715e-07, + "loss": 0.75792503, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09918213, + "step": 13096, + "time_per_iteration": 2.5687479972839355 + }, + { + "auxiliary_loss_clip": 0.0639504, + "auxiliary_loss_mlp": 0.01263751, + "balance_loss_clip": 0.06268235, + "balance_loss_mlp": 0.01255102, + "epoch": 0.7874342401923944, + "flos": 46802666165760.0, + "grad_norm": 1.4338734934522757, + "language_loss": 0.70958376, + "learning_rate": 4.55439383751125e-07, + "loss": 0.78617167, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08648682, + "step": 13097, + "time_per_iteration": 2.739225387573242 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01270015, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.0125987, + "epoch": 0.7874943634450624, + "flos": 23590958872320.0, + "grad_norm": 4.324515792208533, + "language_loss": 0.8066771, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.8834753, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10144043, + "step": 13098, + "time_per_iteration": 4.011147737503052 + }, + { + "auxiliary_loss_clip": 0.06403133, + "auxiliary_loss_mlp": 0.01264821, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.0125591, + "epoch": 0.7875544866977303, + "flos": 20196978842880.0, + "grad_norm": 1.6374038368604131, + "language_loss": 0.74357909, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.82025862, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08898926, + "step": 13099, + "time_per_iteration": 2.5371813774108887 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01264223, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7876146099503983, + "flos": 22609342690560.0, + "grad_norm": 1.4701340709539035, + "language_loss": 0.78340292, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.86008036, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 13100, + "time_per_iteration": 2.518275737762451 + }, + { + "auxiliary_loss_clip": 0.06411106, + "auxiliary_loss_mlp": 0.01262468, + "balance_loss_clip": 0.06271806, + "balance_loss_mlp": 0.01251334, + "epoch": 0.7876747332030662, + "flos": 10710217353600.0, + "grad_norm": 2.2988714589951122, + "language_loss": 0.66578412, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.74251986, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11138916, + "step": 13101, + "time_per_iteration": 2.478010416030884 + }, + { + "auxiliary_loss_clip": 0.06403912, + "auxiliary_loss_mlp": 0.01262729, + "balance_loss_clip": 0.06271445, + "balance_loss_mlp": 0.01253026, + "epoch": 0.7877348564557343, + "flos": 38408462064000.0, + "grad_norm": 1.3711840285849346, + "language_loss": 0.78050315, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.85716951, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09716797, + "step": 13102, + "time_per_iteration": 2.6512677669525146 + }, + { + "auxiliary_loss_clip": 0.06402024, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270896, + "balance_loss_mlp": 0.01253968, + "epoch": 0.7877949797084022, + "flos": 18334876757760.0, + "grad_norm": 3.387524543051336, + "language_loss": 0.82612967, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.90278161, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 13103, + "time_per_iteration": 3.8968992233276367 + }, + { + "auxiliary_loss_clip": 0.0640745, + "auxiliary_loss_mlp": 0.01262901, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7878551029610702, + "flos": 25812942243840.0, + "grad_norm": 2.089208992674617, + "language_loss": 0.80857301, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8852765, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09405518, + "step": 13104, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01264676, + "balance_loss_clip": 0.06272654, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7879152262137382, + "flos": 22348663038720.0, + "grad_norm": 1.6665656648061993, + "language_loss": 0.74192965, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.81862175, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09619141, + "step": 13105, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.06407781, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01255798, + "epoch": 0.7879753494664061, + "flos": 24791396791680.0, + "grad_norm": 1.540938509232933, + "language_loss": 0.75896162, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.83569837, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10101318, + "step": 13106, + "time_per_iteration": 2.5313045978546143 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06271406, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7880354727190741, + "flos": 16914610851840.0, + "grad_norm": 2.261490692087697, + "language_loss": 0.7317878, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.80850446, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09832764, + "step": 13107, + "time_per_iteration": 2.4657392501831055 + }, + { + "auxiliary_loss_clip": 0.0640149, + "auxiliary_loss_mlp": 0.01266906, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257083, + "epoch": 0.7880955959717421, + "flos": 22236002824320.0, + "grad_norm": 1.7249934129069375, + "language_loss": 0.73170471, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.80838865, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09820557, + "step": 13108, + "time_per_iteration": 2.522061347961426 + }, + { + "auxiliary_loss_clip": 0.06308442, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06252776, + "balance_loss_mlp": 0.01250208, + "epoch": 0.7881557192244101, + "flos": 69201907943040.0, + "grad_norm": 0.865010287169312, + "language_loss": 0.60254252, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.6781401, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01112366, + "step": 13109, + "time_per_iteration": 3.0764577388763428 + }, + { + "auxiliary_loss_clip": 0.06398489, + "auxiliary_loss_mlp": 0.0126099, + "balance_loss_clip": 0.06271066, + "balance_loss_mlp": 0.01252025, + "epoch": 0.788215842477078, + "flos": 24942225340800.0, + "grad_norm": 1.5302071478358445, + "language_loss": 0.72546446, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.80205929, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08959961, + "step": 13110, + "time_per_iteration": 2.5210487842559814 + }, + { + "auxiliary_loss_clip": 0.06396982, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01255453, + "epoch": 0.788275965729746, + "flos": 26114054290560.0, + "grad_norm": 1.2956006250382688, + "language_loss": 0.75373393, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.83034575, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08734131, + "step": 13111, + "time_per_iteration": 2.5650205612182617 + }, + { + "auxiliary_loss_clip": 0.06403745, + "auxiliary_loss_mlp": 0.01269317, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01259524, + "epoch": 0.7883360889824139, + "flos": 21221123771520.0, + "grad_norm": 1.7931682275164638, + "language_loss": 0.6193608, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.69609141, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09790039, + "step": 13112, + "time_per_iteration": 2.5178818702697754 + }, + { + "auxiliary_loss_clip": 0.06402722, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258025, + "epoch": 0.7883962122350819, + "flos": 21148979806080.0, + "grad_norm": 1.7329728491097858, + "language_loss": 0.67358041, + "learning_rate": 4.514881996216644e-07, + "loss": 0.75028789, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10003662, + "step": 13113, + "time_per_iteration": 2.4997618198394775 + }, + { + "auxiliary_loss_clip": 0.06400861, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7884563354877498, + "flos": 15308031392640.0, + "grad_norm": 2.191522970823139, + "language_loss": 0.58949661, + "learning_rate": 4.5124174933361e-07, + "loss": 0.66616333, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0916748, + "step": 13114, + "time_per_iteration": 2.499992609024048 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01254063, + "epoch": 0.7885164587404179, + "flos": 24395024252160.0, + "grad_norm": 2.5351098559279452, + "language_loss": 0.67195284, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.74864221, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09649658, + "step": 13115, + "time_per_iteration": 2.6665830612182617 + }, + { + "auxiliary_loss_clip": 0.06404252, + "auxiliary_loss_mlp": 0.0126713, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.0125732, + "epoch": 0.7885765819930858, + "flos": 14390047987200.0, + "grad_norm": 1.969107246296687, + "language_loss": 0.8892082, + "learning_rate": 4.50749024954048e-07, + "loss": 0.965922, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0980835, + "step": 13116, + "time_per_iteration": 2.488569498062134 + }, + { + "auxiliary_loss_clip": 0.06413092, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272166, + "balance_loss_mlp": 0.01257551, + "epoch": 0.7886367052457538, + "flos": 18265835393280.0, + "grad_norm": 2.2399693742143296, + "language_loss": 0.73226219, + "learning_rate": 4.505027508812245e-07, + "loss": 0.80907845, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 13117, + "time_per_iteration": 2.4811642169952393 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01262163, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.0125355, + "epoch": 0.7886968284984217, + "flos": 15310588942080.0, + "grad_norm": 1.3858230532181541, + "language_loss": 0.80464065, + "learning_rate": 4.502565355654926e-07, + "loss": 0.88125694, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08612061, + "step": 13118, + "time_per_iteration": 2.486297369003296 + }, + { + "auxiliary_loss_clip": 0.06400422, + "auxiliary_loss_mlp": 0.01266146, + "balance_loss_clip": 0.06270169, + "balance_loss_mlp": 0.01256538, + "epoch": 0.7887569517510897, + "flos": 21221878458240.0, + "grad_norm": 1.766770664669928, + "language_loss": 0.7323485, + "learning_rate": 4.500103790161878e-07, + "loss": 0.80901414, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09613037, + "step": 13119, + "time_per_iteration": 2.4904284477233887 + }, + { + "auxiliary_loss_clip": 0.06406539, + "auxiliary_loss_mlp": 0.01262086, + "balance_loss_clip": 0.06272633, + "balance_loss_mlp": 0.01253146, + "epoch": 0.7888170750037578, + "flos": 22717894055040.0, + "grad_norm": 1.2838410999725969, + "language_loss": 0.7203325, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.79701877, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.0894165, + "step": 13120, + "time_per_iteration": 2.531905174255371 + }, + { + "auxiliary_loss_clip": 0.06402384, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01259919, + "epoch": 0.7888771982564257, + "flos": 36437976322560.0, + "grad_norm": 1.5849995361084, + "language_loss": 0.79042959, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.86715084, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0980835, + "step": 13121, + "time_per_iteration": 2.6270458698272705 + }, + { + "auxiliary_loss_clip": 0.06399482, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06271152, + "balance_loss_mlp": 0.01255524, + "epoch": 0.7889373215090937, + "flos": 27317678664960.0, + "grad_norm": 1.3500924966016437, + "language_loss": 0.80276608, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.87941229, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09613037, + "step": 13122, + "time_per_iteration": 2.5672237873077393 + }, + { + "auxiliary_loss_clip": 0.06403008, + "auxiliary_loss_mlp": 0.01263927, + "balance_loss_clip": 0.06269404, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7889974447617616, + "flos": 19835210839680.0, + "grad_norm": 1.809945605348313, + "language_loss": 0.78323883, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.85990816, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08666992, + "step": 13123, + "time_per_iteration": 2.5139808654785156 + }, + { + "auxiliary_loss_clip": 0.06405288, + "auxiliary_loss_mlp": 0.01262619, + "balance_loss_clip": 0.06270181, + "balance_loss_mlp": 0.01253154, + "epoch": 0.7890575680144296, + "flos": 17276336928000.0, + "grad_norm": 3.407845901525998, + "language_loss": 0.67230475, + "learning_rate": 4.487804780926985e-07, + "loss": 0.7489838, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09466553, + "step": 13124, + "time_per_iteration": 3.877263069152832 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01255598, + "epoch": 0.7891176912670975, + "flos": 27607596192000.0, + "grad_norm": 2.1455737597716995, + "language_loss": 0.73154545, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.80830753, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09619141, + "step": 13125, + "time_per_iteration": 2.5944886207580566 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.01261205, + "balance_loss_clip": 0.06267411, + "balance_loss_mlp": 0.01251728, + "epoch": 0.7891778145197655, + "flos": 22718397179520.0, + "grad_norm": 1.8448957307034948, + "language_loss": 0.73224074, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.80887532, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09472656, + "step": 13126, + "time_per_iteration": 2.6197116374969482 + }, + { + "auxiliary_loss_clip": 0.06406458, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.0125604, + "epoch": 0.7892379377724335, + "flos": 17316433906560.0, + "grad_norm": 1.6718073300601826, + "language_loss": 0.77387738, + "learning_rate": 4.480432433327845e-07, + "loss": 0.85059547, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09301758, + "step": 13127, + "time_per_iteration": 2.475583553314209 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.01266293, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01256649, + "epoch": 0.7892980610251015, + "flos": 25782781973760.0, + "grad_norm": 1.6570002472061196, + "language_loss": 0.85693359, + "learning_rate": 4.47797616101103e-07, + "loss": 0.93357939, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09643555, + "step": 13128, + "time_per_iteration": 2.506098508834839 + }, + { + "auxiliary_loss_clip": 0.06401196, + "auxiliary_loss_mlp": 0.01265664, + "balance_loss_clip": 0.06271003, + "balance_loss_mlp": 0.01256634, + "epoch": 0.7893581842777694, + "flos": 21586371719040.0, + "grad_norm": 1.9505455740147257, + "language_loss": 0.69738185, + "learning_rate": 4.475520477290904e-07, + "loss": 0.77405041, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09033203, + "step": 13129, + "time_per_iteration": 2.492781400680542 + }, + { + "auxiliary_loss_clip": 0.06314191, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06258637, + "balance_loss_mlp": 0.01254005, + "epoch": 0.7894183075304374, + "flos": 69037773793920.0, + "grad_norm": 0.7003894761434999, + "language_loss": 0.61533356, + "learning_rate": 4.473065382260597e-07, + "loss": 0.69102591, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01041412, + "step": 13130, + "time_per_iteration": 3.109016180038452 + }, + { + "auxiliary_loss_clip": 0.06405208, + "auxiliary_loss_mlp": 0.01262252, + "balance_loss_clip": 0.06272055, + "balance_loss_mlp": 0.01252686, + "epoch": 0.7894784307831053, + "flos": 24250107415680.0, + "grad_norm": 1.475922878769178, + "language_loss": 0.74187315, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.81854773, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09564209, + "step": 13131, + "time_per_iteration": 2.526529312133789 + }, + { + "auxiliary_loss_clip": 0.06417171, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06273621, + "balance_loss_mlp": 0.01255297, + "epoch": 0.7895385540357733, + "flos": 20272770460800.0, + "grad_norm": 15.433314794516651, + "language_loss": 0.69895113, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.77578956, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11376953, + "step": 13132, + "time_per_iteration": 2.5669658184051514 + }, + { + "auxiliary_loss_clip": 0.06403211, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.01256573, + "epoch": 0.7895986772884414, + "flos": 21002972866560.0, + "grad_norm": 2.4066374074433186, + "language_loss": 0.61959308, + "learning_rate": 4.465703630239468e-07, + "loss": 0.69629192, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10113525, + "step": 13133, + "time_per_iteration": 2.4860470294952393 + }, + { + "auxiliary_loss_clip": 0.06406127, + "auxiliary_loss_mlp": 0.01270355, + "balance_loss_clip": 0.06272439, + "balance_loss_mlp": 0.01259644, + "epoch": 0.7896588005411093, + "flos": 18663423816960.0, + "grad_norm": 2.0571343653676326, + "language_loss": 0.8017205, + "learning_rate": 4.463250890899195e-07, + "loss": 0.87848526, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10717773, + "step": 13134, + "time_per_iteration": 3.9168148040771484 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254059, + "epoch": 0.7897189237937773, + "flos": 18411842332800.0, + "grad_norm": 2.033133539223884, + "language_loss": 0.80772352, + "learning_rate": 4.460798740713998e-07, + "loss": 0.88439691, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 13135, + "time_per_iteration": 2.4654078483581543 + }, + { + "auxiliary_loss_clip": 0.06399068, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06268865, + "balance_loss_mlp": 0.01253089, + "epoch": 0.7897790470464452, + "flos": 23738223623040.0, + "grad_norm": 1.6530850460824498, + "language_loss": 0.72782981, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.80445212, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10076904, + "step": 13136, + "time_per_iteration": 2.5253071784973145 + }, + { + "auxiliary_loss_clip": 0.06410457, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01252222, + "epoch": 0.7898391702991132, + "flos": 15923477232000.0, + "grad_norm": 2.3537390068214656, + "language_loss": 0.70506489, + "learning_rate": 4.455896208180778e-07, + "loss": 0.78180242, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11077881, + "step": 13137, + "time_per_iteration": 2.468620777130127 + }, + { + "auxiliary_loss_clip": 0.06401488, + "auxiliary_loss_mlp": 0.01264377, + "balance_loss_clip": 0.06271732, + "balance_loss_mlp": 0.01254506, + "epoch": 0.7898992935517811, + "flos": 19835252766720.0, + "grad_norm": 1.578942697411419, + "language_loss": 0.74176329, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.81842196, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09869385, + "step": 13138, + "time_per_iteration": 3.9565515518188477 + }, + { + "auxiliary_loss_clip": 0.0640148, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06271301, + "balance_loss_mlp": 0.01256271, + "epoch": 0.7899594168044491, + "flos": 16221738240000.0, + "grad_norm": 1.9480374334640547, + "language_loss": 0.686391, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.76306117, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 13139, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.06313749, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06258325, + "balance_loss_mlp": 0.01250762, + "epoch": 0.790019540057117, + "flos": 68353496225280.0, + "grad_norm": 0.8282799229852567, + "language_loss": 0.60166419, + "learning_rate": 4.448546830368003e-07, + "loss": 0.67732072, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01143646, + "step": 13140, + "time_per_iteration": 3.181234359741211 + }, + { + "auxiliary_loss_clip": 0.06408462, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06275274, + "balance_loss_mlp": 0.01257619, + "epoch": 0.7900796633097851, + "flos": 30340037836800.0, + "grad_norm": 1.5194345427413907, + "language_loss": 0.76587826, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.84263158, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09259033, + "step": 13141, + "time_per_iteration": 2.5935022830963135 + }, + { + "auxiliary_loss_clip": 0.06406665, + "auxiliary_loss_mlp": 0.01265708, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255421, + "epoch": 0.790139786562453, + "flos": 22133237391360.0, + "grad_norm": 1.706504607669126, + "language_loss": 0.68517488, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.76189852, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10290527, + "step": 13142, + "time_per_iteration": 3.9123146533966064 + }, + { + "auxiliary_loss_clip": 0.06313135, + "auxiliary_loss_mlp": 0.01253569, + "balance_loss_clip": 0.06257692, + "balance_loss_mlp": 0.01252476, + "epoch": 0.790199909815121, + "flos": 58225210277760.0, + "grad_norm": 0.7895590429355487, + "language_loss": 0.59896362, + "learning_rate": 4.441202759969049e-07, + "loss": 0.6746307, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 13143, + "time_per_iteration": 2.9545323848724365 + }, + { + "auxiliary_loss_clip": 0.06407971, + "auxiliary_loss_mlp": 0.01265938, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255495, + "epoch": 0.7902600330677889, + "flos": 34542066314880.0, + "grad_norm": 1.4595073006493966, + "language_loss": 0.74559182, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.82233089, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10443115, + "step": 13144, + "time_per_iteration": 2.6375374794006348 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01260537, + "epoch": 0.7903201563204569, + "flos": 22352981523840.0, + "grad_norm": 1.6890449908385896, + "language_loss": 0.83446616, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.91123205, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10424805, + "step": 13145, + "time_per_iteration": 2.499363660812378 + }, + { + "auxiliary_loss_clip": 0.06395718, + "auxiliary_loss_mlp": 0.01261823, + "balance_loss_clip": 0.06268772, + "balance_loss_mlp": 0.01252971, + "epoch": 0.790380279573125, + "flos": 22059919468800.0, + "grad_norm": 1.6613829846262294, + "language_loss": 0.7342999, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.81087536, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08862305, + "step": 13146, + "time_per_iteration": 2.515782356262207 + }, + { + "auxiliary_loss_clip": 0.06406832, + "auxiliary_loss_mlp": 0.01262426, + "balance_loss_clip": 0.0627181, + "balance_loss_mlp": 0.01252704, + "epoch": 0.7904404028257929, + "flos": 20308758589440.0, + "grad_norm": 1.836231171589266, + "language_loss": 0.76197815, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.83867073, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 13147, + "time_per_iteration": 2.4807651042938232 + }, + { + "auxiliary_loss_clip": 0.06400219, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01256921, + "epoch": 0.7905005260784609, + "flos": 20014732212480.0, + "grad_norm": 1.7419913226116706, + "language_loss": 0.72276485, + "learning_rate": 4.428974443697087e-07, + "loss": 0.79943514, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09881592, + "step": 13148, + "time_per_iteration": 2.506728410720825 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01264165, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01253782, + "epoch": 0.7905606493311288, + "flos": 26913088425600.0, + "grad_norm": 1.5866446208537701, + "language_loss": 0.71421397, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.79092121, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1038208, + "step": 13149, + "time_per_iteration": 2.5299153327941895 + }, + { + "auxiliary_loss_clip": 0.0640769, + "auxiliary_loss_mlp": 0.01263913, + "balance_loss_clip": 0.0627196, + "balance_loss_mlp": 0.01253417, + "epoch": 0.7906207725837968, + "flos": 23703032108160.0, + "grad_norm": 2.1166900358706138, + "language_loss": 0.65887839, + "learning_rate": 4.424087249723225e-07, + "loss": 0.73559439, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10498047, + "step": 13150, + "time_per_iteration": 2.5118424892425537 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01254171, + "epoch": 0.7906808958364647, + "flos": 20854911502080.0, + "grad_norm": 1.5600793718059285, + "language_loss": 0.70213783, + "learning_rate": 4.421644538650231e-07, + "loss": 0.77878249, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09790039, + "step": 13151, + "time_per_iteration": 2.479990243911743 + }, + { + "auxiliary_loss_clip": 0.06407944, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06272637, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7907410190891327, + "flos": 40744866585600.0, + "grad_norm": 1.3436721274508034, + "language_loss": 0.70374179, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.78046679, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09875488, + "step": 13152, + "time_per_iteration": 2.66023850440979 + }, + { + "auxiliary_loss_clip": 0.06400564, + "auxiliary_loss_mlp": 0.01268098, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01258919, + "epoch": 0.7908011423418007, + "flos": 13266198299520.0, + "grad_norm": 1.733827476588534, + "language_loss": 0.72901142, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.8056981, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09179688, + "step": 13153, + "time_per_iteration": 2.4535181522369385 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06268425, + "balance_loss_mlp": 0.01256502, + "epoch": 0.7908612655944687, + "flos": 19760718960000.0, + "grad_norm": 1.4410962438109587, + "language_loss": 0.78749764, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.86417866, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10009766, + "step": 13154, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.06410754, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06270463, + "balance_loss_mlp": 0.01252286, + "epoch": 0.7909213888471366, + "flos": 21294064350720.0, + "grad_norm": 1.8857519871038082, + "language_loss": 0.70335776, + "learning_rate": 4.411879602612185e-07, + "loss": 0.78009582, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10778809, + "step": 13155, + "time_per_iteration": 2.474088668823242 + }, + { + "auxiliary_loss_clip": 0.06405213, + "auxiliary_loss_mlp": 0.01266856, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01257069, + "epoch": 0.7909815120998046, + "flos": 22535521643520.0, + "grad_norm": 2.510036385951424, + "language_loss": 0.77293575, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.8496564, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09790039, + "step": 13156, + "time_per_iteration": 2.513814926147461 + }, + { + "auxiliary_loss_clip": 0.06403618, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271893, + "balance_loss_mlp": 0.01254185, + "epoch": 0.7910416353524725, + "flos": 26735537623680.0, + "grad_norm": 1.591424288088247, + "language_loss": 0.65432274, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.73099172, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09088135, + "step": 13157, + "time_per_iteration": 2.534609079360962 + }, + { + "auxiliary_loss_clip": 0.0640482, + "auxiliary_loss_mlp": 0.01266464, + "balance_loss_clip": 0.06269716, + "balance_loss_mlp": 0.01256272, + "epoch": 0.7911017586051405, + "flos": 24651804689280.0, + "grad_norm": 2.191693050285661, + "language_loss": 0.7477805, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.82449341, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10192871, + "step": 13158, + "time_per_iteration": 2.5379066467285156 + }, + { + "auxiliary_loss_clip": 0.06396219, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 0.06268845, + "balance_loss_mlp": 0.01255116, + "epoch": 0.7911618818578086, + "flos": 17571076064640.0, + "grad_norm": 1.9112834208400953, + "language_loss": 0.67451692, + "learning_rate": 4.40212412422309e-07, + "loss": 0.75111789, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08764648, + "step": 13159, + "time_per_iteration": 2.464768171310425 + }, + { + "auxiliary_loss_clip": 0.06400043, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06269793, + "balance_loss_mlp": 0.0125645, + "epoch": 0.7912220051104765, + "flos": 16726326727680.0, + "grad_norm": 1.6817860395466344, + "language_loss": 0.67496979, + "learning_rate": 4.399686733077206e-07, + "loss": 0.75163317, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09838867, + "step": 13160, + "time_per_iteration": 2.5563478469848633 + }, + { + "auxiliary_loss_clip": 0.0639656, + "auxiliary_loss_mlp": 0.01260248, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01252225, + "epoch": 0.7912821283631445, + "flos": 13703799847680.0, + "grad_norm": 1.7956028234892243, + "language_loss": 0.73223495, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.80880302, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08007812, + "step": 13161, + "time_per_iteration": 2.449843406677246 + }, + { + "auxiliary_loss_clip": 0.0639775, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 0.06270458, + "balance_loss_mlp": 0.01255142, + "epoch": 0.7913422516158124, + "flos": 23775804979200.0, + "grad_norm": 1.579946795431406, + "language_loss": 0.73348385, + "learning_rate": 4.39481372557418e-07, + "loss": 0.81010681, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09399414, + "step": 13162, + "time_per_iteration": 2.538973093032837 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.01265697, + "balance_loss_clip": 0.06272799, + "balance_loss_mlp": 0.01255326, + "epoch": 0.7914023748684804, + "flos": 19944433036800.0, + "grad_norm": 3.1550813809291127, + "language_loss": 0.72027671, + "learning_rate": 4.392378109401811e-07, + "loss": 0.79701531, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10369873, + "step": 13163, + "time_per_iteration": 2.481580972671509 + }, + { + "auxiliary_loss_clip": 0.06402975, + "auxiliary_loss_mlp": 0.01263483, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01253315, + "epoch": 0.7914624981211483, + "flos": 20601065957760.0, + "grad_norm": 1.7688129227744467, + "language_loss": 0.69559741, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.77226198, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10168457, + "step": 13164, + "time_per_iteration": 3.9441864490509033 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626981, + "balance_loss_mlp": 0.01253058, + "epoch": 0.7915226213738163, + "flos": 21806031997440.0, + "grad_norm": 1.639968913344359, + "language_loss": 0.66723585, + "learning_rate": 4.387508652677177e-07, + "loss": 0.74385864, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09088135, + "step": 13165, + "time_per_iteration": 2.480177164077759 + }, + { + "auxiliary_loss_clip": 0.06395824, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268749, + "balance_loss_mlp": 0.01254887, + "epoch": 0.7915827446264843, + "flos": 16293714497280.0, + "grad_norm": 1.7980788419504534, + "language_loss": 0.72814763, + "learning_rate": 4.385074812309557e-07, + "loss": 0.80474222, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08758545, + "step": 13166, + "time_per_iteration": 2.5405478477478027 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06271509, + "balance_loss_mlp": 0.01256602, + "epoch": 0.7916428678791523, + "flos": 25709673686400.0, + "grad_norm": 1.5950499739045652, + "language_loss": 0.77752012, + "learning_rate": 4.382641564061462e-07, + "loss": 0.85419798, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.10412598, + "step": 13167, + "time_per_iteration": 2.513096332550049 + }, + { + "auxiliary_loss_clip": 0.06400877, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.0627252, + "balance_loss_mlp": 0.01256553, + "epoch": 0.7917029911318202, + "flos": 23885320665600.0, + "grad_norm": 1.5971175695751862, + "language_loss": 0.84140885, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.9180733, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09020996, + "step": 13168, + "time_per_iteration": 2.5276131629943848 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254939, + "epoch": 0.7917631143844882, + "flos": 21651975066240.0, + "grad_norm": 1.4948037375095564, + "language_loss": 0.72659689, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.8032676, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09820557, + "step": 13169, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.06405612, + "auxiliary_loss_mlp": 0.01262617, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01252794, + "epoch": 0.7918232376371561, + "flos": 38883519187200.0, + "grad_norm": 1.931209408255316, + "language_loss": 0.674968, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.75165027, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09820557, + "step": 13170, + "time_per_iteration": 2.632267951965332 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01255108, + "epoch": 0.7918833608898241, + "flos": 20781551652480.0, + "grad_norm": 1.5871676794676228, + "language_loss": 0.70988441, + "learning_rate": 4.372914494109412e-07, + "loss": 0.7865442, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08544922, + "step": 13171, + "time_per_iteration": 2.510680675506592 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.0125855, + "epoch": 0.7919434841424922, + "flos": 33918276994560.0, + "grad_norm": 2.589962482835532, + "language_loss": 0.67366862, + "learning_rate": 4.370484207842553e-07, + "loss": 0.75036865, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09417725, + "step": 13172, + "time_per_iteration": 2.6106696128845215 + }, + { + "auxiliary_loss_clip": 0.06403903, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7920036073951601, + "flos": 21070253295360.0, + "grad_norm": 1.738065699124664, + "language_loss": 0.80093193, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.87761056, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09753418, + "step": 13173, + "time_per_iteration": 3.950551986694336 + }, + { + "auxiliary_loss_clip": 0.06400689, + "auxiliary_loss_mlp": 0.01261307, + "balance_loss_clip": 0.06269704, + "balance_loss_mlp": 0.01252307, + "epoch": 0.7920637306478281, + "flos": 23662138515840.0, + "grad_norm": 1.8426798849917176, + "language_loss": 0.77325201, + "learning_rate": 4.365625413419365e-07, + "loss": 0.84987199, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09002686, + "step": 13174, + "time_per_iteration": 2.591482639312744 + }, + { + "auxiliary_loss_clip": 0.06398596, + "auxiliary_loss_mlp": 0.01261992, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01253219, + "epoch": 0.792123853900496, + "flos": 27202251265920.0, + "grad_norm": 1.5031237737360255, + "language_loss": 0.71669394, + "learning_rate": 4.363196905447297e-07, + "loss": 0.79329979, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08770752, + "step": 13175, + "time_per_iteration": 2.587193489074707 + }, + { + "auxiliary_loss_clip": 0.06401914, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01254601, + "epoch": 0.792183977153164, + "flos": 19104631090560.0, + "grad_norm": 1.9608803410251472, + "language_loss": 0.59982938, + "learning_rate": 4.360768990424364e-07, + "loss": 0.67648464, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09014893, + "step": 13176, + "time_per_iteration": 2.4545774459838867 + }, + { + "auxiliary_loss_clip": 0.06398389, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06270067, + "balance_loss_mlp": 0.01258635, + "epoch": 0.7922441004058319, + "flos": 17134564619520.0, + "grad_norm": 1.8342420107617015, + "language_loss": 0.73352873, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.81019521, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09619141, + "step": 13177, + "time_per_iteration": 3.9278790950775146 + }, + { + "auxiliary_loss_clip": 0.06401221, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01254475, + "epoch": 0.7923042236585, + "flos": 17827395304320.0, + "grad_norm": 1.8523697538025845, + "language_loss": 0.64460981, + "learning_rate": 4.355914939594174e-07, + "loss": 0.72125828, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09155273, + "step": 13178, + "time_per_iteration": 2.464949131011963 + }, + { + "auxiliary_loss_clip": 0.06402718, + "auxiliary_loss_mlp": 0.01261465, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01252804, + "epoch": 0.7923643469111679, + "flos": 29943036391680.0, + "grad_norm": 1.8056668444425423, + "language_loss": 0.69007665, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.76671851, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08660889, + "step": 13179, + "time_per_iteration": 2.560208559036255 + }, + { + "auxiliary_loss_clip": 0.06402154, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06272629, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7924244701638359, + "flos": 22681360874880.0, + "grad_norm": 2.1905203910288105, + "language_loss": 0.74228048, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.81895697, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09332275, + "step": 13180, + "time_per_iteration": 2.5125856399536133 + }, + { + "auxiliary_loss_clip": 0.06402977, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01254729, + "epoch": 0.7924845934165038, + "flos": 17974031149440.0, + "grad_norm": 2.3420456225908524, + "language_loss": 0.81796247, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.89464545, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10595703, + "step": 13181, + "time_per_iteration": 2.4527087211608887 + }, + { + "auxiliary_loss_clip": 0.06399131, + "auxiliary_loss_mlp": 0.01263297, + "balance_loss_clip": 0.06270739, + "balance_loss_mlp": 0.01253791, + "epoch": 0.7925447166691718, + "flos": 23483665319040.0, + "grad_norm": 1.8219768185370055, + "language_loss": 0.7760042, + "learning_rate": 4.346213957372895e-07, + "loss": 0.85262847, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09509277, + "step": 13182, + "time_per_iteration": 4.028662919998169 + }, + { + "auxiliary_loss_clip": 0.06410173, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.01254866, + "epoch": 0.7926048399218397, + "flos": 20453591571840.0, + "grad_norm": 1.6188805399457735, + "language_loss": 0.74277139, + "learning_rate": 4.34379019557056e-07, + "loss": 0.8195321, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 13183, + "time_per_iteration": 2.4738929271698 + }, + { + "auxiliary_loss_clip": 0.06403777, + "auxiliary_loss_mlp": 0.01263216, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7926649631745077, + "flos": 37169184977280.0, + "grad_norm": 1.7084157774544453, + "language_loss": 0.68652374, + "learning_rate": 4.341367027453264e-07, + "loss": 0.76319367, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08978271, + "step": 13184, + "time_per_iteration": 2.6054959297180176 + }, + { + "auxiliary_loss_clip": 0.06404284, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06271448, + "balance_loss_mlp": 0.01254082, + "epoch": 0.7927250864271758, + "flos": 17024168465280.0, + "grad_norm": 1.8074716343378143, + "language_loss": 0.71104252, + "learning_rate": 4.338944453112907e-07, + "loss": 0.78772175, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09558105, + "step": 13185, + "time_per_iteration": 2.457500696182251 + }, + { + "auxiliary_loss_clip": 0.06404824, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.06271466, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7927852096798437, + "flos": 17755041703680.0, + "grad_norm": 2.0425556514381777, + "language_loss": 0.65721595, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.73390174, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 13186, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.06399564, + "auxiliary_loss_mlp": 0.01262578, + "balance_loss_clip": 0.06271927, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7928453329325117, + "flos": 23844636708480.0, + "grad_norm": 1.452369328079203, + "language_loss": 0.77105349, + "learning_rate": 4.334101086130408e-07, + "loss": 0.84767497, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08612061, + "step": 13187, + "time_per_iteration": 2.512676239013672 + }, + { + "auxiliary_loss_clip": 0.06400672, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06270963, + "balance_loss_mlp": 0.01255741, + "epoch": 0.7929054561851796, + "flos": 17460302567040.0, + "grad_norm": 1.9206985573704325, + "language_loss": 0.72777045, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.80442715, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09259033, + "step": 13188, + "time_per_iteration": 2.4961729049682617 + }, + { + "auxiliary_loss_clip": 0.06405029, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271419, + "balance_loss_mlp": 0.01254633, + "epoch": 0.7929655794378476, + "flos": 21987775503360.0, + "grad_norm": 2.0256790948802066, + "language_loss": 0.63584489, + "learning_rate": 4.329260095357725e-07, + "loss": 0.71255124, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10980225, + "step": 13189, + "time_per_iteration": 2.481018304824829 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01258539, + "epoch": 0.7930257026905155, + "flos": 17279523383040.0, + "grad_norm": 2.1940059966398557, + "language_loss": 0.72796714, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.80470747, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09240723, + "step": 13190, + "time_per_iteration": 2.489017963409424 + }, + { + "auxiliary_loss_clip": 0.06397982, + "auxiliary_loss_mlp": 0.01262706, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01254487, + "epoch": 0.7930858259431836, + "flos": 27306693780480.0, + "grad_norm": 2.0481734999626213, + "language_loss": 0.73499632, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.81160319, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08221436, + "step": 13191, + "time_per_iteration": 2.523073196411133 + }, + { + "auxiliary_loss_clip": 0.06402196, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01254915, + "epoch": 0.7931459491958515, + "flos": 19869647667840.0, + "grad_norm": 1.6892778710359044, + "language_loss": 0.69173294, + "learning_rate": 4.322003066198219e-07, + "loss": 0.76839757, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13192, + "time_per_iteration": 2.4932494163513184 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7932060724485195, + "flos": 23153525032320.0, + "grad_norm": 1.5309974551938075, + "language_loss": 0.75287253, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.82951844, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.0914917, + "step": 13193, + "time_per_iteration": 2.4988462924957275 + }, + { + "auxiliary_loss_clip": 0.0639962, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.06269534, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7932661957011874, + "flos": 29942617121280.0, + "grad_norm": 1.4608356167152348, + "language_loss": 0.72191167, + "learning_rate": 4.317168019161741e-07, + "loss": 0.7985428, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.1026001, + "step": 13194, + "time_per_iteration": 2.545863151550293 + }, + { + "auxiliary_loss_clip": 0.06407529, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06271923, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7933263189538554, + "flos": 22564717591680.0, + "grad_norm": 1.9164119447525156, + "language_loss": 0.70693266, + "learning_rate": 4.314751387639517e-07, + "loss": 0.78364033, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 13195, + "time_per_iteration": 2.478484869003296 + }, + { + "auxiliary_loss_clip": 0.06403863, + "auxiliary_loss_mlp": 0.0126619, + "balance_loss_clip": 0.06272461, + "balance_loss_mlp": 0.0125679, + "epoch": 0.7933864422065233, + "flos": 25485317579520.0, + "grad_norm": 1.4419483453830304, + "language_loss": 0.77285999, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.8495605, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09411621, + "step": 13196, + "time_per_iteration": 2.5209035873413086 + }, + { + "auxiliary_loss_clip": 0.06408395, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01261196, + "epoch": 0.7934465654591913, + "flos": 33591490871040.0, + "grad_norm": 1.6476530892648569, + "language_loss": 0.6925202, + "learning_rate": 4.309919909045268e-07, + "loss": 0.76931512, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09899902, + "step": 13197, + "time_per_iteration": 2.6008334159851074 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06270218, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7935066887118594, + "flos": 31440854851200.0, + "grad_norm": 1.7257166200150085, + "language_loss": 0.65332729, + "learning_rate": 4.30750506215646e-07, + "loss": 0.72997743, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09521484, + "step": 13198, + "time_per_iteration": 2.5760626792907715 + }, + { + "auxiliary_loss_clip": 0.06407583, + "auxiliary_loss_mlp": 0.01266914, + "balance_loss_clip": 0.06272698, + "balance_loss_mlp": 0.0125696, + "epoch": 0.7935668119645273, + "flos": 14687638162560.0, + "grad_norm": 1.9381240473938566, + "language_loss": 0.72217059, + "learning_rate": 4.30509081032864e-07, + "loss": 0.79891551, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 13199, + "time_per_iteration": 2.4537320137023926 + }, + { + "auxiliary_loss_clip": 0.06404065, + "auxiliary_loss_mlp": 0.01264064, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01254647, + "epoch": 0.7936269352171953, + "flos": 18010061205120.0, + "grad_norm": 1.8593669017855428, + "language_loss": 0.80699968, + "learning_rate": 4.302677153653349e-07, + "loss": 0.88368094, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09411621, + "step": 13200, + "time_per_iteration": 2.4965553283691406 + }, + { + "auxiliary_loss_clip": 0.06395376, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06269375, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7936870584698632, + "flos": 18886228623360.0, + "grad_norm": 1.593396762237453, + "language_loss": 0.77522814, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.85182142, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09423828, + "step": 13201, + "time_per_iteration": 2.497309446334839 + }, + { + "auxiliary_loss_clip": 0.06399371, + "auxiliary_loss_mlp": 0.01265865, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256149, + "epoch": 0.7937471817225312, + "flos": 23373604581120.0, + "grad_norm": 1.5839447213043625, + "language_loss": 0.67329711, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.74994946, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09716797, + "step": 13202, + "time_per_iteration": 2.5105254650115967 + }, + { + "auxiliary_loss_clip": 0.06399509, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06267376, + "balance_loss_mlp": 0.01257501, + "epoch": 0.7938073049751991, + "flos": 22681025458560.0, + "grad_norm": 1.8682622779044114, + "language_loss": 0.75083208, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.82750034, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09820557, + "step": 13203, + "time_per_iteration": 3.8750996589660645 + }, + { + "auxiliary_loss_clip": 0.06400256, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06268462, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7938674282278672, + "flos": 22857150741120.0, + "grad_norm": 1.6792002510464108, + "language_loss": 0.66683894, + "learning_rate": 4.293028480307643e-07, + "loss": 0.74347234, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09173584, + "step": 13204, + "time_per_iteration": 2.4866726398468018 + }, + { + "auxiliary_loss_clip": 0.0640104, + "auxiliary_loss_mlp": 0.01260862, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01252249, + "epoch": 0.7939275514805351, + "flos": 27019208021760.0, + "grad_norm": 1.3684183312797948, + "language_loss": 0.79726428, + "learning_rate": 4.290617800767438e-07, + "loss": 0.87388325, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08605957, + "step": 13205, + "time_per_iteration": 2.555922746658325 + }, + { + "auxiliary_loss_clip": 0.06398693, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01254596, + "epoch": 0.7939876747332031, + "flos": 21149315222400.0, + "grad_norm": 1.956372656118469, + "language_loss": 0.77988601, + "learning_rate": 4.28820771692858e-07, + "loss": 0.85651195, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09295654, + "step": 13206, + "time_per_iteration": 2.5223846435546875 + }, + { + "auxiliary_loss_clip": 0.06407081, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254638, + "epoch": 0.794047797985871, + "flos": 23294836143360.0, + "grad_norm": 2.5564565777737265, + "language_loss": 0.78640836, + "learning_rate": 4.285798228882456e-07, + "loss": 0.86312377, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0982666, + "step": 13207, + "time_per_iteration": 2.5289721488952637 + }, + { + "auxiliary_loss_clip": 0.06401804, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01256679, + "epoch": 0.794107921238539, + "flos": 24614978019840.0, + "grad_norm": 1.988476360796287, + "language_loss": 0.84176642, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.91844845, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.097229, + "step": 13208, + "time_per_iteration": 2.5182619094848633 + }, + { + "auxiliary_loss_clip": 0.06307561, + "auxiliary_loss_mlp": 0.01252747, + "balance_loss_clip": 0.06251705, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7941680444912069, + "flos": 64114641077760.0, + "grad_norm": 0.7251481470508581, + "language_loss": 0.58347547, + "learning_rate": 4.280981040533875e-07, + "loss": 0.65907854, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.56005859, + "router_z_loss_mlp": 0.00986481, + "step": 13209, + "time_per_iteration": 3.215669631958008 + }, + { + "auxiliary_loss_clip": 0.06411248, + "auxiliary_loss_mlp": 0.01263694, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01253753, + "epoch": 0.794228167743875, + "flos": 24395653157760.0, + "grad_norm": 2.3239436118534544, + "language_loss": 0.63244212, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.70919156, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09936523, + "step": 13210, + "time_per_iteration": 2.509675979614258 + }, + { + "auxiliary_loss_clip": 0.0639855, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06268808, + "balance_loss_mlp": 0.01255135, + "epoch": 0.794288290996543, + "flos": 28520129082240.0, + "grad_norm": 1.5283303816318292, + "language_loss": 0.69651222, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.77314341, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09423828, + "step": 13211, + "time_per_iteration": 2.5609560012817383 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06271889, + "balance_loss_mlp": 0.01253593, + "epoch": 0.7943484142492109, + "flos": 25929333964800.0, + "grad_norm": 1.5675650116890587, + "language_loss": 0.72487032, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.80157924, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10827637, + "step": 13212, + "time_per_iteration": 2.5255634784698486 + }, + { + "auxiliary_loss_clip": 0.06398303, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01254716, + "epoch": 0.7944085375018789, + "flos": 23922147335040.0, + "grad_norm": 1.6395336684596964, + "language_loss": 0.80590618, + "learning_rate": 4.271353817368246e-07, + "loss": 0.88252604, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08972168, + "step": 13213, + "time_per_iteration": 3.9452641010284424 + }, + { + "auxiliary_loss_clip": 0.06409128, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.0125316, + "epoch": 0.7944686607545468, + "flos": 20236153426560.0, + "grad_norm": 2.1556158344518463, + "language_loss": 0.67980099, + "learning_rate": 4.268948502428327e-07, + "loss": 0.75652432, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 13214, + "time_per_iteration": 2.5221662521362305 + }, + { + "auxiliary_loss_clip": 0.06399108, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01256215, + "epoch": 0.7945287840072148, + "flos": 21987440087040.0, + "grad_norm": 1.6557569175319402, + "language_loss": 0.72647429, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.80312216, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09454346, + "step": 13215, + "time_per_iteration": 2.482057809829712 + }, + { + "auxiliary_loss_clip": 0.06396606, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.0125512, + "epoch": 0.7945889072598827, + "flos": 26405229628800.0, + "grad_norm": 1.661805737915831, + "language_loss": 0.79503906, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.87165052, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09429932, + "step": 13216, + "time_per_iteration": 2.5464351177215576 + }, + { + "auxiliary_loss_clip": 0.06404807, + "auxiliary_loss_mlp": 0.01263362, + "balance_loss_clip": 0.06270844, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7946490305125508, + "flos": 25817051093760.0, + "grad_norm": 1.6049687625888907, + "language_loss": 0.73967838, + "learning_rate": 4.261736137111598e-07, + "loss": 0.81636012, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09588623, + "step": 13217, + "time_per_iteration": 3.931478977203369 + }, + { + "auxiliary_loss_clip": 0.06401365, + "auxiliary_loss_mlp": 0.01263665, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7947091537652187, + "flos": 15966425249280.0, + "grad_norm": 1.8482353685704531, + "language_loss": 0.74055278, + "learning_rate": 4.259333208810907e-07, + "loss": 0.81720304, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09591675, + "step": 13218, + "time_per_iteration": 2.4553987979888916 + }, + { + "auxiliary_loss_clip": 0.06410147, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7947692770178867, + "flos": 18593753546880.0, + "grad_norm": 1.8816401972337626, + "language_loss": 0.83479667, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.91153485, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09442139, + "step": 13219, + "time_per_iteration": 2.44667911529541 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01258064, + "epoch": 0.7948294002705546, + "flos": 20447344442880.0, + "grad_norm": 1.667648831846699, + "language_loss": 0.7587316, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.83550465, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10113525, + "step": 13220, + "time_per_iteration": 2.515125036239624 + }, + { + "auxiliary_loss_clip": 0.06413321, + "auxiliary_loss_mlp": 0.01262935, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01253041, + "epoch": 0.7948895235232226, + "flos": 38190436940160.0, + "grad_norm": 1.659539697860105, + "language_loss": 0.72439814, + "learning_rate": 4.252128005599176e-07, + "loss": 0.80116069, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09893799, + "step": 13221, + "time_per_iteration": 4.03423810005188 + }, + { + "auxiliary_loss_clip": 0.06401148, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.0627249, + "balance_loss_mlp": 0.01255052, + "epoch": 0.7949496467758905, + "flos": 15565231100160.0, + "grad_norm": 2.544368910491826, + "language_loss": 0.75068891, + "learning_rate": 4.249727465395634e-07, + "loss": 0.8273443, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 13222, + "time_per_iteration": 2.491516590118408 + }, + { + "auxiliary_loss_clip": 0.06308898, + "auxiliary_loss_mlp": 0.01254396, + "balance_loss_clip": 0.06253184, + "balance_loss_mlp": 0.01253385, + "epoch": 0.7950097700285585, + "flos": 70915864809600.0, + "grad_norm": 0.7838771916152429, + "language_loss": 0.66774839, + "learning_rate": 4.247327522443993e-07, + "loss": 0.74338138, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01010132, + "step": 13223, + "time_per_iteration": 3.031728744506836 + }, + { + "auxiliary_loss_clip": 0.06404258, + "auxiliary_loss_mlp": 0.01264069, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253829, + "epoch": 0.7950698932812266, + "flos": 23958470880000.0, + "grad_norm": 1.6379349696855243, + "language_loss": 0.71398437, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.79066753, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10241699, + "step": 13224, + "time_per_iteration": 2.5175724029541016 + }, + { + "auxiliary_loss_clip": 0.06312153, + "auxiliary_loss_mlp": 0.01251169, + "balance_loss_clip": 0.06256486, + "balance_loss_mlp": 0.01250191, + "epoch": 0.7951300165338945, + "flos": 60300096606720.0, + "grad_norm": 0.6591691135419323, + "language_loss": 0.55062973, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.62626302, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00976562, + "step": 13225, + "time_per_iteration": 3.178450345993042 + }, + { + "auxiliary_loss_clip": 0.06401074, + "auxiliary_loss_mlp": 0.01261342, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01252884, + "epoch": 0.7951901397865625, + "flos": 22825397243520.0, + "grad_norm": 2.154430910035814, + "language_loss": 0.65301824, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.72964251, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08459473, + "step": 13226, + "time_per_iteration": 2.5249226093292236 + }, + { + "auxiliary_loss_clip": 0.06407489, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01256062, + "epoch": 0.7952502630392304, + "flos": 35703748920960.0, + "grad_norm": 2.011551916679729, + "language_loss": 0.70672739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.78345954, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09680176, + "step": 13227, + "time_per_iteration": 2.6486446857452393 + }, + { + "auxiliary_loss_clip": 0.06398386, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06269887, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7953103862918984, + "flos": 25636942742400.0, + "grad_norm": 1.7944937078069616, + "language_loss": 0.69723666, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.77384907, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08691406, + "step": 13228, + "time_per_iteration": 2.6445536613464355 + }, + { + "auxiliary_loss_clip": 0.06402546, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01255564, + "epoch": 0.7953705095445663, + "flos": 40561487925120.0, + "grad_norm": 1.474530595441345, + "language_loss": 0.70921922, + "learning_rate": 4.232940412119095e-07, + "loss": 0.78589594, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09558105, + "step": 13229, + "time_per_iteration": 2.6637799739837646 + }, + { + "auxiliary_loss_clip": 0.0641102, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06274529, + "balance_loss_mlp": 0.01256063, + "epoch": 0.7954306327972344, + "flos": 27644129372160.0, + "grad_norm": 1.7873536766913725, + "language_loss": 0.71492708, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.79169858, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10076904, + "step": 13230, + "time_per_iteration": 2.574101209640503 + }, + { + "auxiliary_loss_clip": 0.06309671, + "auxiliary_loss_mlp": 0.01251481, + "balance_loss_clip": 0.06254265, + "balance_loss_mlp": 0.01250479, + "epoch": 0.7954907560499023, + "flos": 59525505936000.0, + "grad_norm": 0.8781067484442618, + "language_loss": 0.63612801, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.71173954, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100174, + "step": 13231, + "time_per_iteration": 3.143348217010498 + }, + { + "auxiliary_loss_clip": 0.06401561, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01253615, + "epoch": 0.7955508793025703, + "flos": 20126721594240.0, + "grad_norm": 1.6206459895498453, + "language_loss": 0.69870329, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.77535492, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09991455, + "step": 13232, + "time_per_iteration": 2.534808874130249 + }, + { + "auxiliary_loss_clip": 0.06401277, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01253272, + "epoch": 0.7956110025552382, + "flos": 26512607036160.0, + "grad_norm": 1.7341819887914223, + "language_loss": 0.78396481, + "learning_rate": 4.223360961792952e-07, + "loss": 0.860605, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09466553, + "step": 13233, + "time_per_iteration": 2.5741093158721924 + }, + { + "auxiliary_loss_clip": 0.06403272, + "auxiliary_loss_mlp": 0.01265137, + "balance_loss_clip": 0.06270528, + "balance_loss_mlp": 0.01255803, + "epoch": 0.7956711258079062, + "flos": 22572138677760.0, + "grad_norm": 1.88878875282178, + "language_loss": 0.78960502, + "learning_rate": 4.220967594613769e-07, + "loss": 0.86628914, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09332275, + "step": 13234, + "time_per_iteration": 2.5267715454101562 + }, + { + "auxiliary_loss_clip": 0.064052, + "auxiliary_loss_mlp": 0.01262721, + "balance_loss_clip": 0.06274294, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7957312490605741, + "flos": 17383882043520.0, + "grad_norm": 2.969852188387872, + "language_loss": 0.70354939, + "learning_rate": 4.218574825777077e-07, + "loss": 0.78022861, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08618164, + "step": 13235, + "time_per_iteration": 2.472926616668701 + }, + { + "auxiliary_loss_clip": 0.0640211, + "auxiliary_loss_mlp": 0.0126658, + "balance_loss_clip": 0.06269485, + "balance_loss_mlp": 0.012564, + "epoch": 0.7957913723132422, + "flos": 22497898360320.0, + "grad_norm": 3.326054048453629, + "language_loss": 0.68091619, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.75760305, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10174561, + "step": 13236, + "time_per_iteration": 2.5275604724884033 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06272059, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7958514955659101, + "flos": 22644701913600.0, + "grad_norm": 1.5838694899419836, + "language_loss": 0.75233686, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.82900631, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08874512, + "step": 13237, + "time_per_iteration": 2.5152275562286377 + }, + { + "auxiliary_loss_clip": 0.06404451, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01255788, + "epoch": 0.7959116188185781, + "flos": 20710497790080.0, + "grad_norm": 1.909101485463629, + "language_loss": 0.71454495, + "learning_rate": 4.211400110229175e-07, + "loss": 0.79124558, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0982666, + "step": 13238, + "time_per_iteration": 2.5149312019348145 + }, + { + "auxiliary_loss_clip": 0.0640163, + "auxiliary_loss_mlp": 0.01263785, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7959717420712461, + "flos": 19030474627200.0, + "grad_norm": 2.2119566924128584, + "language_loss": 0.74293685, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.81959099, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0894165, + "step": 13239, + "time_per_iteration": 2.4692234992980957 + }, + { + "auxiliary_loss_clip": 0.06405409, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.01254314, + "epoch": 0.796031865323914, + "flos": 26363371714560.0, + "grad_norm": 4.594953960637003, + "language_loss": 0.69371974, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.77041459, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09765625, + "step": 13240, + "time_per_iteration": 2.5826754570007324 + }, + { + "auxiliary_loss_clip": 0.06308684, + "auxiliary_loss_mlp": 0.01255726, + "balance_loss_clip": 0.06253344, + "balance_loss_mlp": 0.01254768, + "epoch": 0.796091988576582, + "flos": 62087119833600.0, + "grad_norm": 0.8806225517212096, + "language_loss": 0.5847106, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.66035473, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00956726, + "step": 13241, + "time_per_iteration": 2.9126768112182617 + }, + { + "auxiliary_loss_clip": 0.06403052, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 0.06270704, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7961521118292499, + "flos": 39029442272640.0, + "grad_norm": 2.127726994888291, + "language_loss": 0.64769882, + "learning_rate": 4.201842205128772e-07, + "loss": 0.72438073, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09265137, + "step": 13242, + "time_per_iteration": 2.635535717010498 + }, + { + "auxiliary_loss_clip": 0.06402293, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06268795, + "balance_loss_mlp": 0.01257795, + "epoch": 0.796212235081918, + "flos": 21769373036160.0, + "grad_norm": 2.0186777582920024, + "language_loss": 0.76239574, + "learning_rate": 4.199454226296526e-07, + "loss": 0.83909744, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10083008, + "step": 13243, + "time_per_iteration": 3.8618268966674805 + }, + { + "auxiliary_loss_clip": 0.06402823, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06270328, + "balance_loss_mlp": 0.01254605, + "epoch": 0.7962723583345859, + "flos": 21185261424000.0, + "grad_norm": 1.6364985939961718, + "language_loss": 0.79507935, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8717519, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09832764, + "step": 13244, + "time_per_iteration": 2.51326322555542 + }, + { + "auxiliary_loss_clip": 0.06406613, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06270078, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7963324815872539, + "flos": 17134313057280.0, + "grad_norm": 1.908775351263593, + "language_loss": 0.68666172, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.76337141, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10046387, + "step": 13245, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265244, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01255749, + "epoch": 0.7963926048399218, + "flos": 21403873526400.0, + "grad_norm": 1.7297162444203578, + "language_loss": 0.79002523, + "learning_rate": 4.192293885111549e-07, + "loss": 0.86671984, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0949707, + "step": 13246, + "time_per_iteration": 2.4906105995178223 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01254073, + "epoch": 0.7964527280925898, + "flos": 25189907610240.0, + "grad_norm": 1.8120227230539676, + "language_loss": 0.66180718, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.73849887, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10125732, + "step": 13247, + "time_per_iteration": 2.534837484359741 + }, + { + "auxiliary_loss_clip": 0.0639786, + "auxiliary_loss_mlp": 0.01263181, + "balance_loss_clip": 0.06269214, + "balance_loss_mlp": 0.0125458, + "epoch": 0.7965128513452577, + "flos": 27023149163520.0, + "grad_norm": 1.7943633437832778, + "language_loss": 0.71878839, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.79539883, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08599854, + "step": 13248, + "time_per_iteration": 2.5318338871002197 + }, + { + "auxiliary_loss_clip": 0.06404188, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06268889, + "balance_loss_mlp": 0.01255378, + "epoch": 0.7965729745979258, + "flos": 24425436084480.0, + "grad_norm": 2.290940910554294, + "language_loss": 0.76236963, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.83906335, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0980835, + "step": 13249, + "time_per_iteration": 2.5285370349884033 + }, + { + "auxiliary_loss_clip": 0.06399461, + "auxiliary_loss_mlp": 0.01262002, + "balance_loss_clip": 0.06270114, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7966330978505937, + "flos": 18845838155520.0, + "grad_norm": 1.9207763897520123, + "language_loss": 0.61375982, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.69037437, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09307861, + "step": 13250, + "time_per_iteration": 2.4775562286376953 + }, + { + "auxiliary_loss_clip": 0.06399567, + "auxiliary_loss_mlp": 0.01263631, + "balance_loss_clip": 0.06269053, + "balance_loss_mlp": 0.01253982, + "epoch": 0.7966932211032617, + "flos": 13157437299840.0, + "grad_norm": 2.289000304094375, + "language_loss": 0.72802746, + "learning_rate": 4.180371972938206e-07, + "loss": 0.80465943, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09655762, + "step": 13251, + "time_per_iteration": 2.5408740043640137 + }, + { + "auxiliary_loss_clip": 0.06409312, + "auxiliary_loss_mlp": 0.01265133, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01254654, + "epoch": 0.7967533443559297, + "flos": 23956290673920.0, + "grad_norm": 1.9875673178726758, + "language_loss": 0.73053861, + "learning_rate": 4.177989389787624e-07, + "loss": 0.80728304, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1048584, + "step": 13252, + "time_per_iteration": 3.9433846473693848 + }, + { + "auxiliary_loss_clip": 0.06396703, + "auxiliary_loss_mlp": 0.01266191, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01256886, + "epoch": 0.7968134676085976, + "flos": 30375984038400.0, + "grad_norm": 1.8369149171198353, + "language_loss": 0.66266763, + "learning_rate": 4.175607406609278e-07, + "loss": 0.73929667, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09307861, + "step": 13253, + "time_per_iteration": 2.5753839015960693 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01264505, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7968735908612656, + "flos": 23081590702080.0, + "grad_norm": 1.5642785207566534, + "language_loss": 0.67620826, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.75287944, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10223389, + "step": 13254, + "time_per_iteration": 2.587885856628418 + }, + { + "auxiliary_loss_clip": 0.0640402, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06271625, + "balance_loss_mlp": 0.01258467, + "epoch": 0.7969337141139335, + "flos": 23588275541760.0, + "grad_norm": 2.088422762405943, + "language_loss": 0.69607329, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.77279007, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09191895, + "step": 13255, + "time_per_iteration": 2.5366928577423096 + }, + { + "auxiliary_loss_clip": 0.06399679, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.0626971, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7969938373666016, + "flos": 19762018698240.0, + "grad_norm": 1.6762095197917861, + "language_loss": 0.79241788, + "learning_rate": 4.168465057810733e-07, + "loss": 0.86905241, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09143066, + "step": 13256, + "time_per_iteration": 3.9199607372283936 + }, + { + "auxiliary_loss_clip": 0.06405733, + "auxiliary_loss_mlp": 0.01263678, + "balance_loss_clip": 0.06272037, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7970539606192695, + "flos": 24140969072640.0, + "grad_norm": 1.817522476863435, + "language_loss": 0.66469562, + "learning_rate": 4.166085475424315e-07, + "loss": 0.74138975, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09484863, + "step": 13257, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.06411573, + "auxiliary_loss_mlp": 0.01262722, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01252977, + "epoch": 0.7971140838719375, + "flos": 17974576200960.0, + "grad_norm": 2.293552355321388, + "language_loss": 0.721138, + "learning_rate": 4.163706493461523e-07, + "loss": 0.79788101, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.09753418, + "step": 13258, + "time_per_iteration": 2.466635227203369 + }, + { + "auxiliary_loss_clip": 0.06404628, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06270341, + "balance_loss_mlp": 0.01257439, + "epoch": 0.7971742071246054, + "flos": 19175181828480.0, + "grad_norm": 1.7912391212808825, + "language_loss": 0.69168359, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.76841164, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.1072998, + "step": 13259, + "time_per_iteration": 2.5077145099639893 + }, + { + "auxiliary_loss_clip": 0.06399243, + "auxiliary_loss_mlp": 0.01264467, + "balance_loss_clip": 0.06270258, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7972343303772734, + "flos": 27133335682560.0, + "grad_norm": 1.8522631827723854, + "language_loss": 0.73832285, + "learning_rate": 4.158950331167641e-07, + "loss": 0.81495994, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09002686, + "step": 13260, + "time_per_iteration": 2.542802333831787 + }, + { + "auxiliary_loss_clip": 0.0640289, + "auxiliary_loss_mlp": 0.01260989, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01251559, + "epoch": 0.7972944536299413, + "flos": 21003056720640.0, + "grad_norm": 1.7849042953427723, + "language_loss": 0.78480017, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.86143899, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09423828, + "step": 13261, + "time_per_iteration": 3.9328079223632812 + }, + { + "auxiliary_loss_clip": 0.06398886, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7973545768826094, + "flos": 21586455573120.0, + "grad_norm": 1.5738375071778383, + "language_loss": 0.76378083, + "learning_rate": 4.154196571650501e-07, + "loss": 0.84039807, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.081604, + "step": 13262, + "time_per_iteration": 2.563962936401367 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7974147001352773, + "flos": 20564826266880.0, + "grad_norm": 2.3741111295907626, + "language_loss": 0.70724112, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7839784, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11126709, + "step": 13263, + "time_per_iteration": 2.4744935035705566 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7974748233879453, + "flos": 21003224428800.0, + "grad_norm": 1.8041636283725375, + "language_loss": 0.71434695, + "learning_rate": 4.149445215631153e-07, + "loss": 0.79112011, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11297607, + "step": 13264, + "time_per_iteration": 2.485276460647583 + }, + { + "auxiliary_loss_clip": 0.06398866, + "auxiliary_loss_mlp": 0.0126452, + "balance_loss_clip": 0.06270253, + "balance_loss_mlp": 0.01256187, + "epoch": 0.7975349466406133, + "flos": 22571803261440.0, + "grad_norm": 1.6689770527063423, + "language_loss": 0.77659208, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.85322595, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08331299, + "step": 13265, + "time_per_iteration": 2.50765061378479 + }, + { + "auxiliary_loss_clip": 0.06407373, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06273501, + "balance_loss_mlp": 0.01259609, + "epoch": 0.7975950698932812, + "flos": 21696013186560.0, + "grad_norm": 1.8504698542540234, + "language_loss": 0.76059192, + "learning_rate": 4.144696263830285e-07, + "loss": 0.83736098, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0993042, + "step": 13266, + "time_per_iteration": 2.5207157135009766 + }, + { + "auxiliary_loss_clip": 0.06402943, + "auxiliary_loss_mlp": 0.01264296, + "balance_loss_clip": 0.06272074, + "balance_loss_mlp": 0.01255183, + "epoch": 0.7976551931459492, + "flos": 19609806556800.0, + "grad_norm": 1.6112289211308914, + "language_loss": 0.83747739, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.91414976, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 13267, + "time_per_iteration": 2.523797035217285 + }, + { + "auxiliary_loss_clip": 0.06403189, + "auxiliary_loss_mlp": 0.01266238, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7977153163986171, + "flos": 21693749126400.0, + "grad_norm": 1.4537624263579578, + "language_loss": 0.76656401, + "learning_rate": 4.139949716968223e-07, + "loss": 0.84325826, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09503174, + "step": 13268, + "time_per_iteration": 2.50384783744812 + }, + { + "auxiliary_loss_clip": 0.06404118, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.06272426, + "balance_loss_mlp": 0.01256574, + "epoch": 0.7977754396512852, + "flos": 23483455683840.0, + "grad_norm": 1.5523298062662978, + "language_loss": 0.78092402, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.85762441, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09344482, + "step": 13269, + "time_per_iteration": 2.544590473175049 + }, + { + "auxiliary_loss_clip": 0.06399094, + "auxiliary_loss_mlp": 0.01261853, + "balance_loss_clip": 0.06270756, + "balance_loss_mlp": 0.01253043, + "epoch": 0.7978355629039531, + "flos": 22388718090240.0, + "grad_norm": 1.6478961708757416, + "language_loss": 0.82291299, + "learning_rate": 4.135205575764922e-07, + "loss": 0.89952242, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08813477, + "step": 13270, + "time_per_iteration": 2.4902870655059814 + }, + { + "auxiliary_loss_clip": 0.06401956, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.01255331, + "epoch": 0.7978956861566211, + "flos": 20272518898560.0, + "grad_norm": 2.1156464454549297, + "language_loss": 0.59938061, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.67604721, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09362793, + "step": 13271, + "time_per_iteration": 2.5591602325439453 + }, + { + "auxiliary_loss_clip": 0.06410769, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256037, + "epoch": 0.797955809409289, + "flos": 28120192744320.0, + "grad_norm": 1.4386088451054988, + "language_loss": 0.73758554, + "learning_rate": 4.130463840939975e-07, + "loss": 0.81434965, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09606934, + "step": 13272, + "time_per_iteration": 2.570200204849243 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06270777, + "balance_loss_mlp": 0.012558, + "epoch": 0.798015932661957, + "flos": 15564979537920.0, + "grad_norm": 2.1482391429317067, + "language_loss": 0.71803975, + "learning_rate": 4.128093876144161e-07, + "loss": 0.79471296, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09625244, + "step": 13273, + "time_per_iteration": 2.4748198986053467 + }, + { + "auxiliary_loss_clip": 0.0640889, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06274156, + "balance_loss_mlp": 0.012539, + "epoch": 0.7980760559146249, + "flos": 23957967755520.0, + "grad_norm": 1.5725586223842085, + "language_loss": 0.75832808, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.83505827, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10241699, + "step": 13274, + "time_per_iteration": 2.55397629737854 + }, + { + "auxiliary_loss_clip": 0.06394248, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06268619, + "balance_loss_mlp": 0.01253622, + "epoch": 0.798136179167293, + "flos": 28045617010560.0, + "grad_norm": 1.334626175327206, + "language_loss": 0.77871919, + "learning_rate": 4.12335575223518e-07, + "loss": 0.85528684, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08892822, + "step": 13275, + "time_per_iteration": 2.594181776046753 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01255189, + "epoch": 0.7981963024199609, + "flos": 35992157074560.0, + "grad_norm": 2.855483452086949, + "language_loss": 0.64085776, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.71757954, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10784912, + "step": 13276, + "time_per_iteration": 2.5930356979370117 + }, + { + "auxiliary_loss_clip": 0.06401938, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.06273316, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7982564256726289, + "flos": 25892004170880.0, + "grad_norm": 1.5904474642505515, + "language_loss": 0.61038435, + "learning_rate": 4.118620036501945e-07, + "loss": 0.68708122, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09692383, + "step": 13277, + "time_per_iteration": 2.5839786529541016 + }, + { + "auxiliary_loss_clip": 0.06411898, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.012561, + "epoch": 0.7983165489252969, + "flos": 25746248793600.0, + "grad_norm": 1.8327445572983765, + "language_loss": 0.79849744, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.87527025, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09283447, + "step": 13278, + "time_per_iteration": 2.5260982513427734 + }, + { + "auxiliary_loss_clip": 0.06405683, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01253518, + "epoch": 0.7983766721779648, + "flos": 21914667216000.0, + "grad_norm": 1.9889744564125917, + "language_loss": 0.63581717, + "learning_rate": 4.113886729662768e-07, + "loss": 0.71250772, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09844971, + "step": 13279, + "time_per_iteration": 2.5182244777679443 + }, + { + "auxiliary_loss_clip": 0.06394448, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01257925, + "epoch": 0.7984367954306328, + "flos": 29354480513280.0, + "grad_norm": 1.5743045282106698, + "language_loss": 0.71176022, + "learning_rate": 4.111520979802825e-07, + "loss": 0.78836685, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08282471, + "step": 13280, + "time_per_iteration": 2.575366258621216 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01266632, + "balance_loss_clip": 0.06273544, + "balance_loss_mlp": 0.01257149, + "epoch": 0.7984969186833007, + "flos": 31365775992960.0, + "grad_norm": 1.6558048262309357, + "language_loss": 0.62836027, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.70511883, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.0947876, + "step": 13281, + "time_per_iteration": 2.624361276626587 + }, + { + "auxiliary_loss_clip": 0.06407207, + "auxiliary_loss_mlp": 0.01265261, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01254807, + "epoch": 0.7985570419359688, + "flos": 24319232634240.0, + "grad_norm": 1.8833916192642874, + "language_loss": 0.79982495, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8765496, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10449219, + "step": 13282, + "time_per_iteration": 2.522733211517334 + }, + { + "auxiliary_loss_clip": 0.06405975, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01256339, + "epoch": 0.7986171651886367, + "flos": 15747687365760.0, + "grad_norm": 2.26715299858664, + "language_loss": 0.72620189, + "learning_rate": 4.10442734553802e-07, + "loss": 0.8029148, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.08972168, + "step": 13283, + "time_per_iteration": 3.8687400817871094 + }, + { + "auxiliary_loss_clip": 0.06398675, + "auxiliary_loss_mlp": 0.01262054, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01253072, + "epoch": 0.7986772884413047, + "flos": 11624175763200.0, + "grad_norm": 2.1421699909472474, + "language_loss": 0.73992294, + "learning_rate": 4.102064006186967e-07, + "loss": 0.81653023, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08984375, + "step": 13284, + "time_per_iteration": 2.464895486831665 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01263764, + "balance_loss_clip": 0.06270264, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7987374116939726, + "flos": 22097626606080.0, + "grad_norm": 1.6639585561146113, + "language_loss": 0.70836139, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.78501368, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08874512, + "step": 13285, + "time_per_iteration": 2.5129339694976807 + }, + { + "auxiliary_loss_clip": 0.06401065, + "auxiliary_loss_mlp": 0.01262275, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.01252982, + "epoch": 0.7987975349466406, + "flos": 17895807763200.0, + "grad_norm": 1.6553012923822499, + "language_loss": 0.73934168, + "learning_rate": 4.097339136128437e-07, + "loss": 0.81597507, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09295654, + "step": 13286, + "time_per_iteration": 2.4993607997894287 + }, + { + "auxiliary_loss_clip": 0.0640146, + "auxiliary_loss_mlp": 0.01262205, + "balance_loss_clip": 0.06270432, + "balance_loss_mlp": 0.01252859, + "epoch": 0.7988576581993085, + "flos": 19725359736960.0, + "grad_norm": 1.5989615606819938, + "language_loss": 0.75195587, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.82859248, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13287, + "time_per_iteration": 2.498539447784424 + }, + { + "auxiliary_loss_clip": 0.0640296, + "auxiliary_loss_mlp": 0.01263938, + "balance_loss_clip": 0.06271001, + "balance_loss_mlp": 0.01254598, + "epoch": 0.7989177814519766, + "flos": 28043604512640.0, + "grad_norm": 1.4032913596903045, + "language_loss": 0.62071377, + "learning_rate": 4.092616678191863e-07, + "loss": 0.69738275, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09338379, + "step": 13288, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.06401485, + "auxiliary_loss_mlp": 0.01264116, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01255122, + "epoch": 0.7989779047046445, + "flos": 28877662454400.0, + "grad_norm": 2.6038900989096705, + "language_loss": 0.70626175, + "learning_rate": 4.090256353993169e-07, + "loss": 0.78291774, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08996582, + "step": 13289, + "time_per_iteration": 2.5535638332366943 + }, + { + "auxiliary_loss_clip": 0.06396915, + "auxiliary_loss_mlp": 0.01263033, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01253771, + "epoch": 0.7990380279573125, + "flos": 18192769032960.0, + "grad_norm": 2.213156856555218, + "language_loss": 0.63382244, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.71042198, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09259033, + "step": 13290, + "time_per_iteration": 2.4844484329223633 + }, + { + "auxiliary_loss_clip": 0.06406233, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256458, + "epoch": 0.7990981512099805, + "flos": 20885113699200.0, + "grad_norm": 1.8461892272796565, + "language_loss": 0.71634483, + "learning_rate": 4.08553751558248e-07, + "loss": 0.79307342, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10168457, + "step": 13291, + "time_per_iteration": 2.526987314224243 + }, + { + "auxiliary_loss_clip": 0.06397383, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01255107, + "epoch": 0.7991582744626484, + "flos": 26106381642240.0, + "grad_norm": 1.5963617377533177, + "language_loss": 0.63653862, + "learning_rate": 4.083179001549422e-07, + "loss": 0.71315503, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09161377, + "step": 13292, + "time_per_iteration": 3.920006513595581 + }, + { + "auxiliary_loss_clip": 0.06398708, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06267934, + "balance_loss_mlp": 0.01254733, + "epoch": 0.7992183977153164, + "flos": 35304106072320.0, + "grad_norm": 1.797759826858067, + "language_loss": 0.56198502, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.63861531, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0958252, + "step": 13293, + "time_per_iteration": 2.625302314758301 + }, + { + "auxiliary_loss_clip": 0.06404014, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272873, + "balance_loss_mlp": 0.01255284, + "epoch": 0.7992785209679844, + "flos": 51863294632320.0, + "grad_norm": 2.2763572451506944, + "language_loss": 0.71341664, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.79010391, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09429932, + "step": 13294, + "time_per_iteration": 2.76823353767395 + }, + { + "auxiliary_loss_clip": 0.06401891, + "auxiliary_loss_mlp": 0.01262732, + "balance_loss_clip": 0.06269768, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7993386442206524, + "flos": 22571719407360.0, + "grad_norm": 1.8830431252935182, + "language_loss": 0.72672385, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.80337006, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10107422, + "step": 13295, + "time_per_iteration": 3.9486594200134277 + }, + { + "auxiliary_loss_clip": 0.06399785, + "auxiliary_loss_mlp": 0.01270961, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01262367, + "epoch": 0.7993987674733203, + "flos": 18805112271360.0, + "grad_norm": 1.8035732738246322, + "language_loss": 0.76883113, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.84553862, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.0859375, + "step": 13296, + "time_per_iteration": 2.5124893188476562 + }, + { + "auxiliary_loss_clip": 0.06317963, + "auxiliary_loss_mlp": 0.01251058, + "balance_loss_clip": 0.06262526, + "balance_loss_mlp": 0.0125003, + "epoch": 0.7994588907259883, + "flos": 69443747625600.0, + "grad_norm": 0.6778750345647286, + "language_loss": 0.60765332, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.68334353, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13297, + "time_per_iteration": 3.258441209793091 + }, + { + "auxiliary_loss_clip": 0.06401801, + "auxiliary_loss_mlp": 0.01265804, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01256398, + "epoch": 0.7995190139786562, + "flos": 13485439307520.0, + "grad_norm": 2.2443800001049645, + "language_loss": 0.70575351, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.78242958, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09399414, + "step": 13298, + "time_per_iteration": 2.4816195964813232 + }, + { + "auxiliary_loss_clip": 0.06406148, + "auxiliary_loss_mlp": 0.01264059, + "balance_loss_clip": 0.06270477, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7995791372313242, + "flos": 21659270371200.0, + "grad_norm": 1.914137701086928, + "language_loss": 0.76235688, + "learning_rate": 4.066686308212037e-07, + "loss": 0.839059, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10498047, + "step": 13299, + "time_per_iteration": 2.491387128829956 + }, + { + "auxiliary_loss_clip": 0.06396549, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06268974, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7996392604839921, + "flos": 26075382831360.0, + "grad_norm": 1.6376768390824803, + "language_loss": 0.77644742, + "learning_rate": 4.064332625220828e-07, + "loss": 0.85306615, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08544922, + "step": 13300, + "time_per_iteration": 3.941457986831665 + }, + { + "auxiliary_loss_clip": 0.06406416, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01255473, + "epoch": 0.7996993837366602, + "flos": 24613594427520.0, + "grad_norm": 1.7813390500304356, + "language_loss": 0.64086711, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.71757841, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09228516, + "step": 13301, + "time_per_iteration": 2.5052661895751953 + }, + { + "auxiliary_loss_clip": 0.06398593, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01255285, + "epoch": 0.7997595069893281, + "flos": 20997690059520.0, + "grad_norm": 1.5469395807720157, + "language_loss": 0.71982718, + "learning_rate": 4.059627072173928e-07, + "loss": 0.79645514, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08911133, + "step": 13302, + "time_per_iteration": 2.489457368850708 + }, + { + "auxiliary_loss_clip": 0.06408885, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.0627289, + "balance_loss_mlp": 0.01255967, + "epoch": 0.7998196302419961, + "flos": 24433528003200.0, + "grad_norm": 1.7910708704236549, + "language_loss": 0.83398485, + "learning_rate": 4.057275202296684e-07, + "loss": 0.91072816, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 13303, + "time_per_iteration": 2.5182011127471924 + }, + { + "auxiliary_loss_clip": 0.06399085, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271808, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7998797534946641, + "flos": 30272715480960.0, + "grad_norm": 1.579021550921295, + "language_loss": 0.58929861, + "learning_rate": 4.054923936969166e-07, + "loss": 0.66592586, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09210205, + "step": 13304, + "time_per_iteration": 2.584608316421509 + }, + { + "auxiliary_loss_clip": 0.06406042, + "auxiliary_loss_mlp": 0.01261222, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.0125202, + "epoch": 0.799939876747332, + "flos": 23520785477760.0, + "grad_norm": 1.5411018505136698, + "language_loss": 0.68989539, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.76656806, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09210205, + "step": 13305, + "time_per_iteration": 2.495842218399048 + }, + { + "auxiliary_loss_clip": 0.06398628, + "auxiliary_loss_mlp": 0.0126253, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01254028, + "epoch": 0.8, + "flos": 19324207514880.0, + "grad_norm": 1.5483879862096703, + "language_loss": 0.6919629, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.76857448, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08493042, + "step": 13306, + "time_per_iteration": 2.4815428256988525 + }, + { + "auxiliary_loss_clip": 0.06404909, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.01254813, + "epoch": 0.800060123252668, + "flos": 32420039264640.0, + "grad_norm": 1.3465720910639238, + "language_loss": 0.69548619, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.77217495, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09155273, + "step": 13307, + "time_per_iteration": 2.5902602672576904 + }, + { + "auxiliary_loss_clip": 0.06402986, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01255151, + "epoch": 0.800120246505336, + "flos": 20016702783360.0, + "grad_norm": 1.932839582685843, + "language_loss": 0.77209872, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.84877324, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09313965, + "step": 13308, + "time_per_iteration": 2.5227887630462646 + }, + { + "auxiliary_loss_clip": 0.06406727, + "auxiliary_loss_mlp": 0.01264112, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.0125395, + "epoch": 0.8001803697580039, + "flos": 31876318120320.0, + "grad_norm": 1.398024400765408, + "language_loss": 0.78861815, + "learning_rate": 4.0431766816972e-07, + "loss": 0.86532652, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10174561, + "step": 13309, + "time_per_iteration": 2.694766044616699 + }, + { + "auxiliary_loss_clip": 0.06317627, + "auxiliary_loss_mlp": 0.01253959, + "balance_loss_clip": 0.06261955, + "balance_loss_mlp": 0.01252847, + "epoch": 0.8002404930106719, + "flos": 63411496341120.0, + "grad_norm": 0.9515368521242993, + "language_loss": 0.64834917, + "learning_rate": 4.040829045539571e-07, + "loss": 0.72406501, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01114655, + "step": 13310, + "time_per_iteration": 3.0877020359039307 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06276361, + "balance_loss_mlp": 0.01258257, + "epoch": 0.8003006162633398, + "flos": 27862951109760.0, + "grad_norm": 1.8032558576679762, + "language_loss": 0.83180302, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.90857077, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0949707, + "step": 13311, + "time_per_iteration": 2.555682897567749 + }, + { + "auxiliary_loss_clip": 0.06402326, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01255944, + "epoch": 0.8003607395160078, + "flos": 18229218359040.0, + "grad_norm": 1.9156158973382509, + "language_loss": 0.6619851, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.73865891, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09118652, + "step": 13312, + "time_per_iteration": 2.4853975772857666 + }, + { + "auxiliary_loss_clip": 0.06408212, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 0.06272315, + "balance_loss_mlp": 0.01253805, + "epoch": 0.8004208627686757, + "flos": 20893331399040.0, + "grad_norm": 1.7788171673051, + "language_loss": 0.75784224, + "learning_rate": 4.033789768462843e-07, + "loss": 0.83456314, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10070801, + "step": 13313, + "time_per_iteration": 2.5811471939086914 + }, + { + "auxiliary_loss_clip": 0.0640287, + "auxiliary_loss_mlp": 0.01266155, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8004809860213438, + "flos": 26443984942080.0, + "grad_norm": 1.3059892404938946, + "language_loss": 0.75943661, + "learning_rate": 4.031444553532575e-07, + "loss": 0.83612692, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09490967, + "step": 13314, + "time_per_iteration": 2.5711114406585693 + }, + { + "auxiliary_loss_clip": 0.06314123, + "auxiliary_loss_mlp": 0.01251747, + "balance_loss_clip": 0.06258671, + "balance_loss_mlp": 0.0125083, + "epoch": 0.8005411092740117, + "flos": 63668276778240.0, + "grad_norm": 0.7688266609144837, + "language_loss": 0.53789216, + "learning_rate": 4.029099944131522e-07, + "loss": 0.61355084, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00914764, + "step": 13315, + "time_per_iteration": 3.0470640659332275 + }, + { + "auxiliary_loss_clip": 0.06399442, + "auxiliary_loss_mlp": 0.01266642, + "balance_loss_clip": 0.0626928, + "balance_loss_mlp": 0.0125707, + "epoch": 0.8006012325266797, + "flos": 36146968692480.0, + "grad_norm": 1.5921677145384265, + "language_loss": 0.71092463, + "learning_rate": 4.026755940348603e-07, + "loss": 0.78758544, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09576416, + "step": 13316, + "time_per_iteration": 2.688965320587158 + }, + { + "auxiliary_loss_clip": 0.06405424, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01256755, + "epoch": 0.8006613557793477, + "flos": 33847390840320.0, + "grad_norm": 1.7083449929688843, + "language_loss": 0.65030324, + "learning_rate": 4.024412542272706e-07, + "loss": 0.72701693, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09185791, + "step": 13317, + "time_per_iteration": 2.6344261169433594 + }, + { + "auxiliary_loss_clip": 0.06308497, + "auxiliary_loss_mlp": 0.01250396, + "balance_loss_clip": 0.06252623, + "balance_loss_mlp": 0.01249407, + "epoch": 0.8007214790320156, + "flos": 67371041502720.0, + "grad_norm": 0.7463075809766724, + "language_loss": 0.58964193, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.66523087, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.00988007, + "step": 13318, + "time_per_iteration": 3.211217164993286 + }, + { + "auxiliary_loss_clip": 0.06398984, + "auxiliary_loss_mlp": 0.01261211, + "balance_loss_clip": 0.06267591, + "balance_loss_mlp": 0.01252532, + "epoch": 0.8007816022846836, + "flos": 23192406126720.0, + "grad_norm": 3.1434956654413484, + "language_loss": 0.66706848, + "learning_rate": 4.019727563597366e-07, + "loss": 0.74367046, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08673096, + "step": 13319, + "time_per_iteration": 2.5540733337402344 + }, + { + "auxiliary_loss_clip": 0.06403084, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06268618, + "balance_loss_mlp": 0.01255699, + "epoch": 0.8008417255373516, + "flos": 21987901284480.0, + "grad_norm": 1.728669041883902, + "language_loss": 0.73937488, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.81606293, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10028076, + "step": 13320, + "time_per_iteration": 2.582298994064331 + }, + { + "auxiliary_loss_clip": 0.06404214, + "auxiliary_loss_mlp": 0.01263523, + "balance_loss_clip": 0.06271582, + "balance_loss_mlp": 0.01253575, + "epoch": 0.8009018487900196, + "flos": 16732951200000.0, + "grad_norm": 2.01191871556705, + "language_loss": 0.8012563, + "learning_rate": 4.015045008816138e-07, + "loss": 0.87793362, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09954834, + "step": 13321, + "time_per_iteration": 2.4715728759765625 + }, + { + "auxiliary_loss_clip": 0.06396499, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.06268975, + "balance_loss_mlp": 0.01253536, + "epoch": 0.8009619720426875, + "flos": 20819887695360.0, + "grad_norm": 1.7373613026127328, + "language_loss": 0.65706664, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.73365676, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08978271, + "step": 13322, + "time_per_iteration": 3.9246838092803955 + }, + { + "auxiliary_loss_clip": 0.0639898, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06267587, + "balance_loss_mlp": 0.01254206, + "epoch": 0.8010220952953555, + "flos": 17936869063680.0, + "grad_norm": 1.6818709041886202, + "language_loss": 0.78149015, + "learning_rate": 4.010364878639265e-07, + "loss": 0.85811198, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09002686, + "step": 13323, + "time_per_iteration": 2.4993720054626465 + }, + { + "auxiliary_loss_clip": 0.06405957, + "auxiliary_loss_mlp": 0.01261855, + "balance_loss_clip": 0.06270777, + "balance_loss_mlp": 0.01251872, + "epoch": 0.8010822185480234, + "flos": 24579241453440.0, + "grad_norm": 2.3981073460441187, + "language_loss": 0.71711612, + "learning_rate": 4.00802572299932e-07, + "loss": 0.79379427, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09979248, + "step": 13324, + "time_per_iteration": 2.6039645671844482 + }, + { + "auxiliary_loss_clip": 0.06404987, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01252456, + "epoch": 0.8011423418006914, + "flos": 21835563361920.0, + "grad_norm": 1.6339854847519542, + "language_loss": 0.76400465, + "learning_rate": 4.005687173776635e-07, + "loss": 0.84067976, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10070801, + "step": 13325, + "time_per_iteration": 2.5225205421447754 + }, + { + "auxiliary_loss_clip": 0.06393359, + "auxiliary_loss_mlp": 0.01264051, + "balance_loss_clip": 0.06268814, + "balance_loss_mlp": 0.01256022, + "epoch": 0.8012024650533593, + "flos": 23922021553920.0, + "grad_norm": 1.571695790316147, + "language_loss": 0.80098516, + "learning_rate": 4.003349231059898e-07, + "loss": 0.87755924, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.08026123, + "step": 13326, + "time_per_iteration": 2.5184433460235596 + }, + { + "auxiliary_loss_clip": 0.06396009, + "auxiliary_loss_mlp": 0.01263378, + "balance_loss_clip": 0.06269439, + "balance_loss_mlp": 0.01254921, + "epoch": 0.8012625883060274, + "flos": 23593893765120.0, + "grad_norm": 2.1709213640524156, + "language_loss": 0.66244531, + "learning_rate": 4.001011894937765e-07, + "loss": 0.73903918, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08453369, + "step": 13327, + "time_per_iteration": 2.5192511081695557 + }, + { + "auxiliary_loss_clip": 0.06397668, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06270249, + "balance_loss_mlp": 0.0125497, + "epoch": 0.8013227115586953, + "flos": 20820265038720.0, + "grad_norm": 1.5237011846909325, + "language_loss": 0.73911273, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.81572825, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08911133, + "step": 13328, + "time_per_iteration": 2.490879535675049 + }, + { + "auxiliary_loss_clip": 0.0640716, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.0125493, + "epoch": 0.8013828348113633, + "flos": 15893820086400.0, + "grad_norm": 2.1070162273043938, + "language_loss": 0.74215919, + "learning_rate": 3.996339042831798e-07, + "loss": 0.81888378, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10369873, + "step": 13329, + "time_per_iteration": 2.478027105331421 + }, + { + "auxiliary_loss_clip": 0.06312898, + "auxiliary_loss_mlp": 0.01251725, + "balance_loss_clip": 0.06257395, + "balance_loss_mlp": 0.01250756, + "epoch": 0.8014429580640313, + "flos": 71085183183360.0, + "grad_norm": 0.6797565507978373, + "language_loss": 0.52515209, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.60079831, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00967407, + "step": 13330, + "time_per_iteration": 3.21134614944458 + }, + { + "auxiliary_loss_clip": 0.0640648, + "auxiliary_loss_mlp": 0.01263996, + "balance_loss_clip": 0.06270502, + "balance_loss_mlp": 0.01253518, + "epoch": 0.8015030813166992, + "flos": 23083100075520.0, + "grad_norm": 1.654890173556639, + "language_loss": 0.7351566, + "learning_rate": 3.991668618167519e-07, + "loss": 0.8118614, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10479736, + "step": 13331, + "time_per_iteration": 3.970208168029785 + }, + { + "auxiliary_loss_clip": 0.06399897, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01254037, + "epoch": 0.8015632045693672, + "flos": 21878888722560.0, + "grad_norm": 1.8984062723918875, + "language_loss": 0.77560246, + "learning_rate": 3.989334316347401e-07, + "loss": 0.8522284, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08666992, + "step": 13332, + "time_per_iteration": 2.5455820560455322 + }, + { + "auxiliary_loss_clip": 0.0640306, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06269315, + "balance_loss_mlp": 0.01256402, + "epoch": 0.8016233278220352, + "flos": 23663018983680.0, + "grad_norm": 1.6654900113929851, + "language_loss": 0.83571923, + "learning_rate": 3.987000621653338e-07, + "loss": 0.91241622, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10217285, + "step": 13333, + "time_per_iteration": 2.510481595993042 + }, + { + "auxiliary_loss_clip": 0.06403299, + "auxiliary_loss_mlp": 0.01262076, + "balance_loss_clip": 0.06270902, + "balance_loss_mlp": 0.01252724, + "epoch": 0.8016834510747032, + "flos": 16258732617600.0, + "grad_norm": 1.578647328304289, + "language_loss": 0.73791355, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.81456727, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09350586, + "step": 13334, + "time_per_iteration": 2.5875518321990967 + }, + { + "auxiliary_loss_clip": 0.06397326, + "auxiliary_loss_mlp": 0.01262334, + "balance_loss_clip": 0.06269726, + "balance_loss_mlp": 0.01253292, + "epoch": 0.8017435743273711, + "flos": 12280892538240.0, + "grad_norm": 1.8344549459968347, + "language_loss": 0.74896538, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.82556194, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09042358, + "step": 13335, + "time_per_iteration": 4.024559259414673 + }, + { + "auxiliary_loss_clip": 0.06397076, + "auxiliary_loss_mlp": 0.01263938, + "balance_loss_clip": 0.06266247, + "balance_loss_mlp": 0.01254044, + "epoch": 0.8018036975800391, + "flos": 17200880726400.0, + "grad_norm": 1.7648515567643608, + "language_loss": 0.75561655, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.83222669, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09881592, + "step": 13336, + "time_per_iteration": 2.4966955184936523 + }, + { + "auxiliary_loss_clip": 0.0640955, + "auxiliary_loss_mlp": 0.01264608, + "balance_loss_clip": 0.06270093, + "balance_loss_mlp": 0.01253736, + "epoch": 0.801863820832707, + "flos": 20638228043520.0, + "grad_norm": 1.8494004813437324, + "language_loss": 0.74727678, + "learning_rate": 3.977671915907068e-07, + "loss": 0.82401836, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10870361, + "step": 13337, + "time_per_iteration": 2.493006944656372 + }, + { + "auxiliary_loss_clip": 0.06406038, + "auxiliary_loss_mlp": 0.01263988, + "balance_loss_clip": 0.06269336, + "balance_loss_mlp": 0.01253962, + "epoch": 0.801923944085375, + "flos": 30453410810880.0, + "grad_norm": 1.5897406325584222, + "language_loss": 0.8002277, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.87692797, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1003418, + "step": 13338, + "time_per_iteration": 2.5765812397003174 + }, + { + "auxiliary_loss_clip": 0.06405494, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.06270125, + "balance_loss_mlp": 0.01254317, + "epoch": 0.801984067338043, + "flos": 20016660856320.0, + "grad_norm": 1.9676799431141796, + "language_loss": 0.74850368, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.82521391, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11218262, + "step": 13339, + "time_per_iteration": 3.9401278495788574 + }, + { + "auxiliary_loss_clip": 0.06401505, + "auxiliary_loss_mlp": 0.01264432, + "balance_loss_clip": 0.06271123, + "balance_loss_mlp": 0.01255253, + "epoch": 0.802044190590711, + "flos": 22790666926080.0, + "grad_norm": 1.5626805992517288, + "language_loss": 0.7945329, + "learning_rate": 3.970681765754775e-07, + "loss": 0.87119228, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09179688, + "step": 13340, + "time_per_iteration": 2.5232396125793457 + }, + { + "auxiliary_loss_clip": 0.06404866, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254831, + "epoch": 0.8021043138433789, + "flos": 27607554264960.0, + "grad_norm": 1.7600307740007948, + "language_loss": 0.68075955, + "learning_rate": 3.968352931252936e-07, + "loss": 0.75744605, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08953857, + "step": 13341, + "time_per_iteration": 2.5519580841064453 + }, + { + "auxiliary_loss_clip": 0.06309702, + "auxiliary_loss_mlp": 0.01251381, + "balance_loss_clip": 0.06254174, + "balance_loss_mlp": 0.01250354, + "epoch": 0.8021644370960469, + "flos": 62080453434240.0, + "grad_norm": 0.7935303767570981, + "language_loss": 0.61211252, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.68772334, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13342, + "time_per_iteration": 3.0668532848358154 + }, + { + "auxiliary_loss_clip": 0.06403046, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.06271387, + "balance_loss_mlp": 0.01253788, + "epoch": 0.8022245603487148, + "flos": 23367525160320.0, + "grad_norm": 1.685983088220024, + "language_loss": 0.63982582, + "learning_rate": 3.963697086102522e-07, + "loss": 0.71649212, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09802246, + "step": 13343, + "time_per_iteration": 2.52908992767334 + }, + { + "auxiliary_loss_clip": 0.06393635, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06267881, + "balance_loss_mlp": 0.01254142, + "epoch": 0.8022846836013828, + "flos": 10858027155840.0, + "grad_norm": 1.7400180605672049, + "language_loss": 0.6898669, + "learning_rate": 3.96137007563051e-07, + "loss": 0.76643062, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.0859375, + "step": 13344, + "time_per_iteration": 2.467531204223633 + }, + { + "auxiliary_loss_clip": 0.06399775, + "auxiliary_loss_mlp": 0.0126374, + "balance_loss_clip": 0.06268416, + "balance_loss_mlp": 0.01254538, + "epoch": 0.8023448068540509, + "flos": 29247899719680.0, + "grad_norm": 1.4831700839828168, + "language_loss": 0.70263791, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.77927303, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09191895, + "step": 13345, + "time_per_iteration": 2.5930464267730713 + }, + { + "auxiliary_loss_clip": 0.06311318, + "auxiliary_loss_mlp": 0.01250528, + "balance_loss_clip": 0.06255944, + "balance_loss_mlp": 0.01249584, + "epoch": 0.8024049301067188, + "flos": 64172362141440.0, + "grad_norm": 0.847535442910353, + "language_loss": 0.62905973, + "learning_rate": 3.956717879334059e-07, + "loss": 0.70467818, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0094223, + "step": 13346, + "time_per_iteration": 3.2076127529144287 + }, + { + "auxiliary_loss_clip": 0.06396353, + "auxiliary_loss_mlp": 0.01263037, + "balance_loss_clip": 0.06268937, + "balance_loss_mlp": 0.01253715, + "epoch": 0.8024650533593868, + "flos": 28592985807360.0, + "grad_norm": 3.633465076952704, + "language_loss": 0.72895849, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.80555242, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09326172, + "step": 13347, + "time_per_iteration": 2.5710387229919434 + }, + { + "auxiliary_loss_clip": 0.06403917, + "auxiliary_loss_mlp": 0.01266411, + "balance_loss_clip": 0.06269814, + "balance_loss_mlp": 0.01256577, + "epoch": 0.8025251766120547, + "flos": 16987844920320.0, + "grad_norm": 2.5900803344062115, + "language_loss": 0.73302913, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.80973244, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09838867, + "step": 13348, + "time_per_iteration": 2.4676120281219482 + }, + { + "auxiliary_loss_clip": 0.06403141, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 0.06271264, + "balance_loss_mlp": 0.01253336, + "epoch": 0.8025852998647227, + "flos": 22170189841920.0, + "grad_norm": 1.6273039125060904, + "language_loss": 0.7625345, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.83919346, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09423828, + "step": 13349, + "time_per_iteration": 2.5136961936950684 + }, + { + "auxiliary_loss_clip": 0.06400917, + "auxiliary_loss_mlp": 0.01264363, + "balance_loss_clip": 0.06269996, + "balance_loss_mlp": 0.01255076, + "epoch": 0.8026454231173906, + "flos": 22023386288640.0, + "grad_norm": 2.7562634008625846, + "language_loss": 0.83666581, + "learning_rate": 3.947420787800755e-07, + "loss": 0.91331869, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09283447, + "step": 13350, + "time_per_iteration": 2.519904851913452 + }, + { + "auxiliary_loss_clip": 0.06399673, + "auxiliary_loss_mlp": 0.01265698, + "balance_loss_clip": 0.0626874, + "balance_loss_mlp": 0.0125665, + "epoch": 0.8027055463700586, + "flos": 22497772579200.0, + "grad_norm": 1.5771958395635441, + "language_loss": 0.71500349, + "learning_rate": 3.945098036485679e-07, + "loss": 0.79165721, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0904541, + "step": 13351, + "time_per_iteration": 2.536276340484619 + }, + { + "auxiliary_loss_clip": 0.06399149, + "auxiliary_loss_mlp": 0.01267076, + "balance_loss_clip": 0.0626966, + "balance_loss_mlp": 0.01257921, + "epoch": 0.8027656696227266, + "flos": 28920442763520.0, + "grad_norm": 1.6393100884614646, + "language_loss": 0.62040806, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.6970703, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09161377, + "step": 13352, + "time_per_iteration": 2.572496175765991 + }, + { + "auxiliary_loss_clip": 0.06404066, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06273441, + "balance_loss_mlp": 0.01255078, + "epoch": 0.8028257928753946, + "flos": 18595514482560.0, + "grad_norm": 1.84085315360638, + "language_loss": 0.77318871, + "learning_rate": 3.940454360354046e-07, + "loss": 0.84987807, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09796143, + "step": 13353, + "time_per_iteration": 2.591125726699829 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01271346, + "balance_loss_clip": 0.06270623, + "balance_loss_mlp": 0.01260713, + "epoch": 0.8028859161280625, + "flos": 19135126776960.0, + "grad_norm": 2.1440519982160726, + "language_loss": 0.73642856, + "learning_rate": 3.938133435713582e-07, + "loss": 0.81325769, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10632324, + "step": 13354, + "time_per_iteration": 2.4713294506073 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01261139, + "balance_loss_clip": 0.06271609, + "balance_loss_mlp": 0.01251835, + "epoch": 0.8029460393807305, + "flos": 20236069572480.0, + "grad_norm": 2.691632863229345, + "language_loss": 0.65962112, + "learning_rate": 3.935813120140714e-07, + "loss": 0.73628998, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09295654, + "step": 13355, + "time_per_iteration": 2.487391710281372 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01265286, + "balance_loss_clip": 0.06268579, + "balance_loss_mlp": 0.01254724, + "epoch": 0.8030061626333984, + "flos": 49794445797120.0, + "grad_norm": 2.169594763741831, + "language_loss": 0.69115853, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.7678569, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10565186, + "step": 13356, + "time_per_iteration": 2.771540403366089 + }, + { + "auxiliary_loss_clip": 0.06398716, + "auxiliary_loss_mlp": 0.01262043, + "balance_loss_clip": 0.06268562, + "balance_loss_mlp": 0.01253555, + "epoch": 0.8030662858860664, + "flos": 21621479379840.0, + "grad_norm": 1.8816626292041285, + "language_loss": 0.7745564, + "learning_rate": 3.931174316549666e-07, + "loss": 0.85116398, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08483887, + "step": 13357, + "time_per_iteration": 2.4969570636749268 + }, + { + "auxiliary_loss_clip": 0.0640809, + "auxiliary_loss_mlp": 0.01263369, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01253219, + "epoch": 0.8031264091387345, + "flos": 25637194304640.0, + "grad_norm": 1.5133182895220076, + "language_loss": 0.77548575, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.85220027, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10150146, + "step": 13358, + "time_per_iteration": 2.623896837234497 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01261602, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01252408, + "epoch": 0.8031865323914024, + "flos": 19652335303680.0, + "grad_norm": 1.5054224659704207, + "language_loss": 0.84991813, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.92652988, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09194946, + "step": 13359, + "time_per_iteration": 2.5500707626342773 + }, + { + "auxiliary_loss_clip": 0.06401375, + "auxiliary_loss_mlp": 0.01262567, + "balance_loss_clip": 0.0627083, + "balance_loss_mlp": 0.01253435, + "epoch": 0.8032466556440704, + "flos": 26174961809280.0, + "grad_norm": 1.8378585000154632, + "language_loss": 0.7306003, + "learning_rate": 3.924220681368928e-07, + "loss": 0.80723965, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09136963, + "step": 13360, + "time_per_iteration": 2.548150062561035 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01264804, + "balance_loss_clip": 0.06269519, + "balance_loss_mlp": 0.01255423, + "epoch": 0.8033067788967383, + "flos": 25527049712640.0, + "grad_norm": 2.141449143899577, + "language_loss": 0.69812787, + "learning_rate": 3.921904022048512e-07, + "loss": 0.7748009, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09387207, + "step": 13361, + "time_per_iteration": 2.5122880935668945 + }, + { + "auxiliary_loss_clip": 0.06408579, + "auxiliary_loss_mlp": 0.01263892, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253861, + "epoch": 0.8033669021494063, + "flos": 24031076042880.0, + "grad_norm": 1.5411892792753266, + "language_loss": 0.70487249, + "learning_rate": 3.919587972411098e-07, + "loss": 0.7815972, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1003418, + "step": 13362, + "time_per_iteration": 3.9490444660186768 + }, + { + "auxiliary_loss_clip": 0.06412524, + "auxiliary_loss_mlp": 0.01268791, + "balance_loss_clip": 0.06271197, + "balance_loss_mlp": 0.01257299, + "epoch": 0.8034270254020742, + "flos": 13592900568960.0, + "grad_norm": 2.526180707519333, + "language_loss": 0.78481448, + "learning_rate": 3.91727253254452e-07, + "loss": 0.8616277, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11505127, + "step": 13363, + "time_per_iteration": 2.4621450901031494 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06268764, + "balance_loss_mlp": 0.01256619, + "epoch": 0.8034871486547422, + "flos": 27419228213760.0, + "grad_norm": 2.002665668472871, + "language_loss": 0.7498951, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.82659847, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09893799, + "step": 13364, + "time_per_iteration": 2.5504682064056396 + }, + { + "auxiliary_loss_clip": 0.0640076, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06270374, + "balance_loss_mlp": 0.01255673, + "epoch": 0.8035472719074102, + "flos": 32606855942400.0, + "grad_norm": 1.9519754952718025, + "language_loss": 0.61201763, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.68867314, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09112549, + "step": 13365, + "time_per_iteration": 2.637441396713257 + }, + { + "auxiliary_loss_clip": 0.06407268, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06271231, + "balance_loss_mlp": 0.01255304, + "epoch": 0.8036073951600782, + "flos": 21294357840000.0, + "grad_norm": 1.6745258568385837, + "language_loss": 0.6602062, + "learning_rate": 3.910329872447706e-07, + "loss": 0.73693228, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10040283, + "step": 13366, + "time_per_iteration": 2.5265872478485107 + }, + { + "auxiliary_loss_clip": 0.06398745, + "auxiliary_loss_mlp": 0.01261552, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01252308, + "epoch": 0.8036675184127461, + "flos": 18119702672640.0, + "grad_norm": 2.0189500018467146, + "language_loss": 0.75098139, + "learning_rate": 3.908016872542259e-07, + "loss": 0.82758439, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09234619, + "step": 13367, + "time_per_iteration": 2.507988214492798 + }, + { + "auxiliary_loss_clip": 0.06403186, + "auxiliary_loss_mlp": 0.01263311, + "balance_loss_clip": 0.06272097, + "balance_loss_mlp": 0.01254024, + "epoch": 0.8037276416654141, + "flos": 26037298350720.0, + "grad_norm": 1.466952171960805, + "language_loss": 0.74368006, + "learning_rate": 3.905704482846428e-07, + "loss": 0.82034504, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09283447, + "step": 13368, + "time_per_iteration": 2.5691888332366943 + }, + { + "auxiliary_loss_clip": 0.0640569, + "auxiliary_loss_mlp": 0.0126344, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01253671, + "epoch": 0.803787764918082, + "flos": 18807334404480.0, + "grad_norm": 1.851125330609221, + "language_loss": 0.69820118, + "learning_rate": 3.90339270344789e-07, + "loss": 0.77489251, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09771729, + "step": 13369, + "time_per_iteration": 2.5154571533203125 + }, + { + "auxiliary_loss_clip": 0.06399469, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06269622, + "balance_loss_mlp": 0.01253808, + "epoch": 0.80384788817075, + "flos": 20231289889920.0, + "grad_norm": 1.5121727430472034, + "language_loss": 0.73977184, + "learning_rate": 3.901081534434312e-07, + "loss": 0.81639266, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08807373, + "step": 13370, + "time_per_iteration": 2.501655101776123 + }, + { + "auxiliary_loss_clip": 0.06407988, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.012551, + "epoch": 0.8039080114234181, + "flos": 18521232238080.0, + "grad_norm": 2.479350396293282, + "language_loss": 0.87167275, + "learning_rate": 3.898770975893342e-07, + "loss": 0.94839901, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09539795, + "step": 13371, + "time_per_iteration": 3.886564016342163 + }, + { + "auxiliary_loss_clip": 0.06406743, + "auxiliary_loss_mlp": 0.01265329, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01254815, + "epoch": 0.803968134676086, + "flos": 22389053506560.0, + "grad_norm": 1.8483310810057103, + "language_loss": 0.74931836, + "learning_rate": 3.89646102791259e-07, + "loss": 0.82603908, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10522461, + "step": 13372, + "time_per_iteration": 2.505094289779663 + }, + { + "auxiliary_loss_clip": 0.06399661, + "auxiliary_loss_mlp": 0.01264572, + "balance_loss_clip": 0.06268448, + "balance_loss_mlp": 0.01254707, + "epoch": 0.804028257928754, + "flos": 23849458318080.0, + "grad_norm": 2.2445203393539965, + "language_loss": 0.79285675, + "learning_rate": 3.894151690579646e-07, + "loss": 0.86949909, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09863281, + "step": 13373, + "time_per_iteration": 2.537801742553711 + }, + { + "auxiliary_loss_clip": 0.06399599, + "auxiliary_loss_mlp": 0.01263438, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254897, + "epoch": 0.8040883811814219, + "flos": 23557570220160.0, + "grad_norm": 1.4911107147206584, + "language_loss": 0.74763751, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.82426786, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08532715, + "step": 13374, + "time_per_iteration": 3.975172281265259 + }, + { + "auxiliary_loss_clip": 0.06405147, + "auxiliary_loss_mlp": 0.01264438, + "balance_loss_clip": 0.06269235, + "balance_loss_mlp": 0.01254198, + "epoch": 0.8041485044340899, + "flos": 19032319416960.0, + "grad_norm": 2.1627910258731546, + "language_loss": 0.69120371, + "learning_rate": 3.889534848207452e-07, + "loss": 0.76789951, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10247803, + "step": 13375, + "time_per_iteration": 2.5215139389038086 + }, + { + "auxiliary_loss_clip": 0.06307931, + "auxiliary_loss_mlp": 0.0125401, + "balance_loss_clip": 0.06252438, + "balance_loss_mlp": 0.01252982, + "epoch": 0.8042086276867578, + "flos": 70027817310720.0, + "grad_norm": 0.7167965805045454, + "language_loss": 0.55595809, + "learning_rate": 3.887227343343271e-07, + "loss": 0.63157749, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13376, + "time_per_iteration": 3.172804355621338 + }, + { + "auxiliary_loss_clip": 0.06404025, + "auxiliary_loss_mlp": 0.01267218, + "balance_loss_clip": 0.06268938, + "balance_loss_mlp": 0.01257681, + "epoch": 0.8042687509394258, + "flos": 21879014503680.0, + "grad_norm": 1.674981149404826, + "language_loss": 0.73782766, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.81454003, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09539795, + "step": 13377, + "time_per_iteration": 2.503901243209839 + }, + { + "auxiliary_loss_clip": 0.0640146, + "auxiliary_loss_mlp": 0.01263857, + "balance_loss_clip": 0.0626822, + "balance_loss_mlp": 0.01254237, + "epoch": 0.8043288741920938, + "flos": 26622122722560.0, + "grad_norm": 1.6914077439182815, + "language_loss": 0.70630229, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.78295547, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09619141, + "step": 13378, + "time_per_iteration": 2.5762038230895996 + }, + { + "auxiliary_loss_clip": 0.06403045, + "auxiliary_loss_mlp": 0.01262509, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.012533, + "epoch": 0.8043889974447618, + "flos": 33412137206400.0, + "grad_norm": 1.3386362745905136, + "language_loss": 0.69531369, + "learning_rate": 3.880308495088347e-07, + "loss": 0.7719692, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09204102, + "step": 13379, + "time_per_iteration": 4.13545298576355 + }, + { + "auxiliary_loss_clip": 0.06408659, + "auxiliary_loss_mlp": 0.01264563, + "balance_loss_clip": 0.06269853, + "balance_loss_mlp": 0.01253697, + "epoch": 0.8044491206974297, + "flos": 20382202293120.0, + "grad_norm": 1.6780556856140154, + "language_loss": 0.76333177, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.84006405, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 13380, + "time_per_iteration": 2.5246059894561768 + }, + { + "auxiliary_loss_clip": 0.06399637, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06269045, + "balance_loss_mlp": 0.01254035, + "epoch": 0.8045092439500977, + "flos": 23410473177600.0, + "grad_norm": 5.962253365542073, + "language_loss": 0.69472402, + "learning_rate": 3.875698985740887e-07, + "loss": 0.77135271, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09191895, + "step": 13381, + "time_per_iteration": 2.513369083404541 + }, + { + "auxiliary_loss_clip": 0.06405897, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01257805, + "epoch": 0.8045693672027656, + "flos": 24104058549120.0, + "grad_norm": 1.8201650419638222, + "language_loss": 0.64036882, + "learning_rate": 3.873395148176135e-07, + "loss": 0.71710145, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09564209, + "step": 13382, + "time_per_iteration": 2.522407054901123 + }, + { + "auxiliary_loss_clip": 0.06400527, + "auxiliary_loss_mlp": 0.01265284, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01256176, + "epoch": 0.8046294904554336, + "flos": 27714218912640.0, + "grad_norm": 2.245463185943566, + "language_loss": 0.76378274, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.84044087, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09112549, + "step": 13383, + "time_per_iteration": 2.5720760822296143 + }, + { + "auxiliary_loss_clip": 0.06401812, + "auxiliary_loss_mlp": 0.01262594, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01253188, + "epoch": 0.8046896137081017, + "flos": 24979974405120.0, + "grad_norm": 2.429847725728327, + "language_loss": 0.69923508, + "learning_rate": 3.868789307701381e-07, + "loss": 0.77587903, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09405518, + "step": 13384, + "time_per_iteration": 2.5203967094421387 + }, + { + "auxiliary_loss_clip": 0.06404511, + "auxiliary_loss_mlp": 0.01262325, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01252258, + "epoch": 0.8047497369607696, + "flos": 17681178729600.0, + "grad_norm": 2.046096721285892, + "language_loss": 0.79958355, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.87625194, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10070801, + "step": 13385, + "time_per_iteration": 2.4725265502929688 + }, + { + "auxiliary_loss_clip": 0.06403039, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06269456, + "balance_loss_mlp": 0.01257193, + "epoch": 0.8048098602134376, + "flos": 22388550382080.0, + "grad_norm": 1.837937550839016, + "language_loss": 0.72530949, + "learning_rate": 3.864185914015108e-07, + "loss": 0.80201405, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10223389, + "step": 13386, + "time_per_iteration": 2.486330270767212 + }, + { + "auxiliary_loss_clip": 0.06309167, + "auxiliary_loss_mlp": 0.01254218, + "balance_loss_clip": 0.06253965, + "balance_loss_mlp": 0.01253243, + "epoch": 0.8048699834661055, + "flos": 71221840392960.0, + "grad_norm": 0.6523037243567322, + "language_loss": 0.51220822, + "learning_rate": 3.861885134935865e-07, + "loss": 0.58784211, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00974274, + "step": 13387, + "time_per_iteration": 3.1729602813720703 + }, + { + "auxiliary_loss_clip": 0.06402306, + "auxiliary_loss_mlp": 0.01263122, + "balance_loss_clip": 0.06268468, + "balance_loss_mlp": 0.01253186, + "epoch": 0.8049301067187735, + "flos": 23667211687680.0, + "grad_norm": 1.5827606972372845, + "language_loss": 0.74150264, + "learning_rate": 3.859584967815559e-07, + "loss": 0.8181569, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0993042, + "step": 13388, + "time_per_iteration": 2.521761894226074 + }, + { + "auxiliary_loss_clip": 0.0640045, + "auxiliary_loss_mlp": 0.01264289, + "balance_loss_clip": 0.06270331, + "balance_loss_mlp": 0.012544, + "epoch": 0.8049902299714414, + "flos": 24433318368000.0, + "grad_norm": 1.503353867290701, + "language_loss": 0.71913797, + "learning_rate": 3.857285412741411e-07, + "loss": 0.79578537, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09887695, + "step": 13389, + "time_per_iteration": 2.5576906204223633 + }, + { + "auxiliary_loss_clip": 0.06400909, + "auxiliary_loss_mlp": 0.01263971, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01254273, + "epoch": 0.8050503532241094, + "flos": 17498219339520.0, + "grad_norm": 1.9489558948159147, + "language_loss": 0.83189499, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.90854383, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0970459, + "step": 13390, + "time_per_iteration": 2.4616317749023438 + }, + { + "auxiliary_loss_clip": 0.06308493, + "auxiliary_loss_mlp": 0.01248902, + "balance_loss_clip": 0.06253241, + "balance_loss_mlp": 0.01247792, + "epoch": 0.8051104764767774, + "flos": 57675535493760.0, + "grad_norm": 0.764906547770961, + "language_loss": 0.55567837, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.63125229, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01112366, + "step": 13391, + "time_per_iteration": 3.141718626022339 + }, + { + "auxiliary_loss_clip": 0.06397294, + "auxiliary_loss_mlp": 0.01265249, + "balance_loss_clip": 0.06269481, + "balance_loss_mlp": 0.01256302, + "epoch": 0.8051705997294454, + "flos": 18009138810240.0, + "grad_norm": 1.5129842521720784, + "language_loss": 0.84422779, + "learning_rate": 3.850390420667762e-07, + "loss": 0.92085326, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0894165, + "step": 13392, + "time_per_iteration": 2.507310390472412 + }, + { + "auxiliary_loss_clip": 0.06402355, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06268811, + "balance_loss_mlp": 0.01257063, + "epoch": 0.8052307229821133, + "flos": 26405271555840.0, + "grad_norm": 1.5077686390868956, + "language_loss": 0.705845, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.78253293, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09381104, + "step": 13393, + "time_per_iteration": 2.60556960105896 + }, + { + "auxiliary_loss_clip": 0.06403892, + "auxiliary_loss_mlp": 0.0126422, + "balance_loss_clip": 0.06269234, + "balance_loss_mlp": 0.01254701, + "epoch": 0.8052908462347813, + "flos": 21762580855680.0, + "grad_norm": 1.8325597430410605, + "language_loss": 0.77066338, + "learning_rate": 3.84579682111414e-07, + "loss": 0.84734452, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09521484, + "step": 13394, + "time_per_iteration": 2.4934189319610596 + }, + { + "auxiliary_loss_clip": 0.06404327, + "auxiliary_loss_mlp": 0.01264444, + "balance_loss_clip": 0.06272115, + "balance_loss_mlp": 0.0125564, + "epoch": 0.8053509694874492, + "flos": 25448490910080.0, + "grad_norm": 1.6042981916986414, + "language_loss": 0.64741898, + "learning_rate": 3.843500940147304e-07, + "loss": 0.72410667, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08807373, + "step": 13395, + "time_per_iteration": 2.533311128616333 + }, + { + "auxiliary_loss_clip": 0.06312156, + "auxiliary_loss_mlp": 0.01248555, + "balance_loss_clip": 0.06256828, + "balance_loss_mlp": 0.01247604, + "epoch": 0.8054110927401172, + "flos": 57687316992000.0, + "grad_norm": 0.7425366741213568, + "language_loss": 0.57110387, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.64671093, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00949097, + "step": 13396, + "time_per_iteration": 3.259113073348999 + }, + { + "auxiliary_loss_clip": 0.06404525, + "auxiliary_loss_mlp": 0.01265419, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01255137, + "epoch": 0.8054712159927853, + "flos": 19281385278720.0, + "grad_norm": 1.6270130332272381, + "language_loss": 0.77506781, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.85176718, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10284424, + "step": 13397, + "time_per_iteration": 2.559624671936035 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.01266829, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01257025, + "epoch": 0.8055313392454532, + "flos": 17973402243840.0, + "grad_norm": 1.701332340336638, + "language_loss": 0.70611137, + "learning_rate": 3.836616973531266e-07, + "loss": 0.78281415, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09796143, + "step": 13398, + "time_per_iteration": 2.497774600982666 + }, + { + "auxiliary_loss_clip": 0.06399795, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.06268992, + "balance_loss_mlp": 0.01256565, + "epoch": 0.8055914624981212, + "flos": 13483133320320.0, + "grad_norm": 2.1436610227849693, + "language_loss": 0.69285464, + "learning_rate": 3.834323543710805e-07, + "loss": 0.76950341, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08532715, + "step": 13399, + "time_per_iteration": 2.4626171588897705 + }, + { + "auxiliary_loss_clip": 0.06404109, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06272507, + "balance_loss_mlp": 0.01258208, + "epoch": 0.8056515857507891, + "flos": 13229832827520.0, + "grad_norm": 2.1990447378092566, + "language_loss": 0.72496057, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.80167186, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08807373, + "step": 13400, + "time_per_iteration": 2.4746367931365967 + }, + { + "auxiliary_loss_clip": 0.06401473, + "auxiliary_loss_mlp": 0.0126175, + "balance_loss_clip": 0.06270804, + "balance_loss_mlp": 0.012521, + "epoch": 0.8057117090034571, + "flos": 23884943322240.0, + "grad_norm": 1.7063053615868358, + "language_loss": 0.64111948, + "learning_rate": 3.829738523169037e-07, + "loss": 0.71775174, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09655762, + "step": 13401, + "time_per_iteration": 4.023234128952026 + }, + { + "auxiliary_loss_clip": 0.06402341, + "auxiliary_loss_mlp": 0.01263217, + "balance_loss_clip": 0.06269568, + "balance_loss_mlp": 0.01254301, + "epoch": 0.805771832256125, + "flos": 21220536792960.0, + "grad_norm": 2.264659490025675, + "language_loss": 0.84643924, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.92309481, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.08917236, + "step": 13402, + "time_per_iteration": 2.5050251483917236 + }, + { + "auxiliary_loss_clip": 0.06405149, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06272706, + "balance_loss_mlp": 0.01253799, + "epoch": 0.805831955508793, + "flos": 17572627365120.0, + "grad_norm": 2.3703538824260035, + "language_loss": 0.68481362, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.76149571, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09246826, + "step": 13403, + "time_per_iteration": 2.4735195636749268 + }, + { + "auxiliary_loss_clip": 0.06400239, + "auxiliary_loss_mlp": 0.01261852, + "balance_loss_clip": 0.06271947, + "balance_loss_mlp": 0.0125287, + "epoch": 0.805892078761461, + "flos": 26914975142400.0, + "grad_norm": 1.5925529869996475, + "language_loss": 0.8470757, + "learning_rate": 3.822865591408084e-07, + "loss": 0.92369658, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08984375, + "step": 13404, + "time_per_iteration": 2.5682694911956787 + }, + { + "auxiliary_loss_clip": 0.06395989, + "auxiliary_loss_mlp": 0.01263837, + "balance_loss_clip": 0.06269123, + "balance_loss_mlp": 0.01255004, + "epoch": 0.805952202014129, + "flos": 31514927460480.0, + "grad_norm": 1.526531849234785, + "language_loss": 0.70693904, + "learning_rate": 3.820575840915743e-07, + "loss": 0.78353727, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08837891, + "step": 13405, + "time_per_iteration": 2.5887579917907715 + }, + { + "auxiliary_loss_clip": 0.06400827, + "auxiliary_loss_mlp": 0.01262326, + "balance_loss_clip": 0.06271822, + "balance_loss_mlp": 0.01253439, + "epoch": 0.8060123252667969, + "flos": 24396952896000.0, + "grad_norm": 2.4387244414721247, + "language_loss": 0.75653315, + "learning_rate": 3.818286703948788e-07, + "loss": 0.83316469, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08874512, + "step": 13406, + "time_per_iteration": 2.5906982421875 + }, + { + "auxiliary_loss_clip": 0.0640468, + "auxiliary_loss_mlp": 0.0126352, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254139, + "epoch": 0.8060724485194649, + "flos": 23487145263360.0, + "grad_norm": 1.4318493035492519, + "language_loss": 0.76315004, + "learning_rate": 3.815998180594018e-07, + "loss": 0.83983201, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09387207, + "step": 13407, + "time_per_iteration": 2.550020456314087 + }, + { + "auxiliary_loss_clip": 0.06398082, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06267268, + "balance_loss_mlp": 0.01257849, + "epoch": 0.8061325717721328, + "flos": 18630412508160.0, + "grad_norm": 1.6703188276302636, + "language_loss": 0.74090451, + "learning_rate": 3.81371027093822e-07, + "loss": 0.81755936, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09558105, + "step": 13408, + "time_per_iteration": 2.470579147338867 + }, + { + "auxiliary_loss_clip": 0.0640013, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01255363, + "epoch": 0.8061926950248008, + "flos": 23588862520320.0, + "grad_norm": 2.2758390778618227, + "language_loss": 0.70484757, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.78150177, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09924316, + "step": 13409, + "time_per_iteration": 2.5231001377105713 + }, + { + "auxiliary_loss_clip": 0.06405453, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.06271958, + "balance_loss_mlp": 0.01254809, + "epoch": 0.8062528182774689, + "flos": 11147735047680.0, + "grad_norm": 2.081436146875831, + "language_loss": 0.77509671, + "learning_rate": 3.809136293070545e-07, + "loss": 0.85179555, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09625244, + "step": 13410, + "time_per_iteration": 3.973681926727295 + }, + { + "auxiliary_loss_clip": 0.064013, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254708, + "epoch": 0.8063129415301368, + "flos": 22353484648320.0, + "grad_norm": 1.8160554729971454, + "language_loss": 0.69222361, + "learning_rate": 3.806850225032117e-07, + "loss": 0.76888156, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09796143, + "step": 13411, + "time_per_iteration": 2.5478432178497314 + }, + { + "auxiliary_loss_clip": 0.0640078, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06270846, + "balance_loss_mlp": 0.01256496, + "epoch": 0.8063730647828048, + "flos": 23995297549440.0, + "grad_norm": 1.6928705363709327, + "language_loss": 0.68386424, + "learning_rate": 3.804564771039551e-07, + "loss": 0.76053417, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0970459, + "step": 13412, + "time_per_iteration": 2.5194411277770996 + }, + { + "auxiliary_loss_clip": 0.06407973, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06271837, + "balance_loss_mlp": 0.0125931, + "epoch": 0.8064331880354727, + "flos": 21327369148800.0, + "grad_norm": 2.7853306409882075, + "language_loss": 0.81920803, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.89598131, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.1005249, + "step": 13413, + "time_per_iteration": 3.956393003463745 + }, + { + "auxiliary_loss_clip": 0.06400369, + "auxiliary_loss_mlp": 0.01263377, + "balance_loss_clip": 0.06269833, + "balance_loss_mlp": 0.01254144, + "epoch": 0.8064933112881407, + "flos": 19689036192000.0, + "grad_norm": 1.9565362890159896, + "language_loss": 0.855667, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.9323045, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09222412, + "step": 13414, + "time_per_iteration": 2.478431463241577 + }, + { + "auxiliary_loss_clip": 0.06395735, + "auxiliary_loss_mlp": 0.01261118, + "balance_loss_clip": 0.06267722, + "balance_loss_mlp": 0.0125241, + "epoch": 0.8065534345408086, + "flos": 19285494128640.0, + "grad_norm": 1.77092386295028, + "language_loss": 0.67096937, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.74753791, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08703613, + "step": 13415, + "time_per_iteration": 2.563744306564331 + }, + { + "auxiliary_loss_clip": 0.06397079, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254336, + "epoch": 0.8066135577934767, + "flos": 19682998698240.0, + "grad_norm": 1.462252167408637, + "language_loss": 0.76685238, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.84345299, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08642578, + "step": 13416, + "time_per_iteration": 2.4804248809814453 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.0627007, + "balance_loss_mlp": 0.0125572, + "epoch": 0.8066736810461446, + "flos": 21150195690240.0, + "grad_norm": 1.5328758960444588, + "language_loss": 0.65077549, + "learning_rate": 3.793146714797086e-07, + "loss": 0.7274878, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.08886719, + "step": 13417, + "time_per_iteration": 2.5191526412963867 + }, + { + "auxiliary_loss_clip": 0.06405359, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06270691, + "balance_loss_mlp": 0.01255316, + "epoch": 0.8067338042988126, + "flos": 22604311445760.0, + "grad_norm": 1.8039686506560615, + "language_loss": 0.80821931, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.88492072, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.0947876, + "step": 13418, + "time_per_iteration": 2.5160207748413086 + }, + { + "auxiliary_loss_clip": 0.06405315, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01256197, + "epoch": 0.8067939275514805, + "flos": 16514003681280.0, + "grad_norm": 1.5721182795151136, + "language_loss": 0.8479256, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.92463952, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09869385, + "step": 13419, + "time_per_iteration": 3.9109416007995605 + }, + { + "auxiliary_loss_clip": 0.06404698, + "auxiliary_loss_mlp": 0.01263141, + "balance_loss_clip": 0.06270822, + "balance_loss_mlp": 0.01253247, + "epoch": 0.8068540508041485, + "flos": 28548276854400.0, + "grad_norm": 3.4687459017553457, + "language_loss": 0.76469827, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.84137666, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09899902, + "step": 13420, + "time_per_iteration": 2.5552561283111572 + }, + { + "auxiliary_loss_clip": 0.06399071, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06269781, + "balance_loss_mlp": 0.01254399, + "epoch": 0.8069141740568164, + "flos": 21658851100800.0, + "grad_norm": 1.688287839835823, + "language_loss": 0.78943896, + "learning_rate": 3.784023331462207e-07, + "loss": 0.86605579, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08215332, + "step": 13421, + "time_per_iteration": 2.4860880374908447 + }, + { + "auxiliary_loss_clip": 0.0640534, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01255109, + "epoch": 0.8069742973094844, + "flos": 17534962154880.0, + "grad_norm": 1.6579871645529392, + "language_loss": 0.79629791, + "learning_rate": 3.78174402269098e-07, + "loss": 0.8729949, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09246826, + "step": 13422, + "time_per_iteration": 2.4994351863861084 + }, + { + "auxiliary_loss_clip": 0.0640166, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06269953, + "balance_loss_mlp": 0.01255406, + "epoch": 0.8070344205621525, + "flos": 23373646508160.0, + "grad_norm": 1.5141862299887854, + "language_loss": 0.68537223, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.76203805, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09509277, + "step": 13423, + "time_per_iteration": 2.5125439167022705 + }, + { + "auxiliary_loss_clip": 0.06405628, + "auxiliary_loss_mlp": 0.01264277, + "balance_loss_clip": 0.06268807, + "balance_loss_mlp": 0.0125393, + "epoch": 0.8070945438148204, + "flos": 22936883500800.0, + "grad_norm": 1.7490687501288111, + "language_loss": 0.80183315, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.87853223, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10351562, + "step": 13424, + "time_per_iteration": 2.525763511657715 + }, + { + "auxiliary_loss_clip": 0.06411269, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06275322, + "balance_loss_mlp": 0.01256807, + "epoch": 0.8071546670674884, + "flos": 25307599069440.0, + "grad_norm": 1.3989158711688392, + "language_loss": 0.79125178, + "learning_rate": 3.774909786710232e-07, + "loss": 0.86802822, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09570312, + "step": 13425, + "time_per_iteration": 2.56131649017334 + }, + { + "auxiliary_loss_clip": 0.06402414, + "auxiliary_loss_mlp": 0.01263563, + "balance_loss_clip": 0.06271134, + "balance_loss_mlp": 0.01255176, + "epoch": 0.8072147903201563, + "flos": 18119534964480.0, + "grad_norm": 3.747532904590834, + "language_loss": 0.75868148, + "learning_rate": 3.772632938448923e-07, + "loss": 0.83534127, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08392334, + "step": 13426, + "time_per_iteration": 2.5067336559295654 + }, + { + "auxiliary_loss_clip": 0.06402829, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06269912, + "balance_loss_mlp": 0.01255823, + "epoch": 0.8072749135728243, + "flos": 26695482572160.0, + "grad_norm": 1.699020195158221, + "language_loss": 0.7311064, + "learning_rate": 3.770356705530997e-07, + "loss": 0.80778825, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09533691, + "step": 13427, + "time_per_iteration": 2.5475499629974365 + }, + { + "auxiliary_loss_clip": 0.06399049, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.06268165, + "balance_loss_mlp": 0.01255678, + "epoch": 0.8073350368254922, + "flos": 19245564858240.0, + "grad_norm": 1.5262575334072062, + "language_loss": 0.70244026, + "learning_rate": 3.768081088042774e-07, + "loss": 0.77907926, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09179688, + "step": 13428, + "time_per_iteration": 2.4958949089050293 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01261298, + "balance_loss_clip": 0.06270581, + "balance_loss_mlp": 0.01252501, + "epoch": 0.8073951600781603, + "flos": 13339642003200.0, + "grad_norm": 1.7655256411115205, + "language_loss": 0.74963367, + "learning_rate": 3.765806086070544e-07, + "loss": 0.82628596, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0880127, + "step": 13429, + "time_per_iteration": 2.4495036602020264 + }, + { + "auxiliary_loss_clip": 0.06396128, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06269226, + "balance_loss_mlp": 0.01258655, + "epoch": 0.8074552833308282, + "flos": 22859205166080.0, + "grad_norm": 1.6937365017718335, + "language_loss": 0.67073148, + "learning_rate": 3.763531699700568e-07, + "loss": 0.74736816, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08886719, + "step": 13430, + "time_per_iteration": 2.5136795043945312 + }, + { + "auxiliary_loss_clip": 0.063988, + "auxiliary_loss_mlp": 0.01265178, + "balance_loss_clip": 0.06269097, + "balance_loss_mlp": 0.01255689, + "epoch": 0.8075154065834962, + "flos": 20345627185920.0, + "grad_norm": 1.9845601369160015, + "language_loss": 0.80206978, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.87870961, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09490967, + "step": 13431, + "time_per_iteration": 2.4789979457855225 + }, + { + "auxiliary_loss_clip": 0.06399001, + "auxiliary_loss_mlp": 0.01262918, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01253208, + "epoch": 0.8075755298361641, + "flos": 21914499507840.0, + "grad_norm": 1.684620767458615, + "language_loss": 0.803487, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.88010621, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09716797, + "step": 13432, + "time_per_iteration": 2.5136168003082275 + }, + { + "auxiliary_loss_clip": 0.06406735, + "auxiliary_loss_mlp": 0.01265738, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01255748, + "epoch": 0.8076356530888321, + "flos": 15674746786560.0, + "grad_norm": 1.7687436770793032, + "language_loss": 0.70454299, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.78126764, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09985352, + "step": 13433, + "time_per_iteration": 2.4561402797698975 + }, + { + "auxiliary_loss_clip": 0.06400432, + "auxiliary_loss_mlp": 0.01263944, + "balance_loss_clip": 0.0626875, + "balance_loss_mlp": 0.01254503, + "epoch": 0.8076957763415, + "flos": 37786182117120.0, + "grad_norm": 1.418853459910882, + "language_loss": 0.72760022, + "learning_rate": 3.754440311967828e-07, + "loss": 0.80424392, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09448242, + "step": 13434, + "time_per_iteration": 2.6385674476623535 + }, + { + "auxiliary_loss_clip": 0.0640171, + "auxiliary_loss_mlp": 0.01262368, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01253088, + "epoch": 0.807755899594168, + "flos": 19617059934720.0, + "grad_norm": 1.6864587297815326, + "language_loss": 0.6805675, + "learning_rate": 3.752169004902361e-07, + "loss": 0.75720823, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09277344, + "step": 13435, + "time_per_iteration": 2.4785990715026855 + }, + { + "auxiliary_loss_clip": 0.06405824, + "auxiliary_loss_mlp": 0.01265903, + "balance_loss_clip": 0.06270979, + "balance_loss_mlp": 0.01255419, + "epoch": 0.8078160228468361, + "flos": 23301628323840.0, + "grad_norm": 1.5075228238156948, + "language_loss": 0.75472784, + "learning_rate": 3.749898313956279e-07, + "loss": 0.83144516, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1048584, + "step": 13436, + "time_per_iteration": 2.5161588191986084 + }, + { + "auxiliary_loss_clip": 0.0639594, + "auxiliary_loss_mlp": 0.01264176, + "balance_loss_clip": 0.06268739, + "balance_loss_mlp": 0.01255015, + "epoch": 0.807876146099504, + "flos": 27170078497920.0, + "grad_norm": 2.2394405611791233, + "language_loss": 0.70518959, + "learning_rate": 3.747628239215674e-07, + "loss": 0.78179073, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09161377, + "step": 13437, + "time_per_iteration": 2.544955253601074 + }, + { + "auxiliary_loss_clip": 0.06399636, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01259547, + "epoch": 0.807936269352172, + "flos": 27167017824000.0, + "grad_norm": 1.6660512068527857, + "language_loss": 0.72636318, + "learning_rate": 3.745358780766636e-07, + "loss": 0.80304617, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09118652, + "step": 13438, + "time_per_iteration": 2.557361602783203 + }, + { + "auxiliary_loss_clip": 0.06401914, + "auxiliary_loss_mlp": 0.01263852, + "balance_loss_clip": 0.06271158, + "balance_loss_mlp": 0.01254596, + "epoch": 0.8079963926048399, + "flos": 20746653626880.0, + "grad_norm": 1.7758378703265403, + "language_loss": 0.77106637, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.84772402, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09259033, + "step": 13439, + "time_per_iteration": 2.4744443893432617 + }, + { + "auxiliary_loss_clip": 0.0639698, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.0626818, + "balance_loss_mlp": 0.0125867, + "epoch": 0.8080565158575079, + "flos": 25016675293440.0, + "grad_norm": 1.4635512483706237, + "language_loss": 0.78747815, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.86412537, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09075928, + "step": 13440, + "time_per_iteration": 3.9725441932678223 + }, + { + "auxiliary_loss_clip": 0.06403578, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06269738, + "balance_loss_mlp": 0.01254824, + "epoch": 0.8081166391101758, + "flos": 18704107774080.0, + "grad_norm": 1.8241112266239554, + "language_loss": 0.59381831, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.67049956, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 13441, + "time_per_iteration": 2.4906275272369385 + }, + { + "auxiliary_loss_clip": 0.06400108, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06268845, + "balance_loss_mlp": 0.01254959, + "epoch": 0.8081767623628439, + "flos": 19834791569280.0, + "grad_norm": 1.7995495906095618, + "language_loss": 0.76109755, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.83775043, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10211182, + "step": 13442, + "time_per_iteration": 2.4604549407958984 + }, + { + "auxiliary_loss_clip": 0.06400936, + "auxiliary_loss_mlp": 0.01265886, + "balance_loss_clip": 0.06269497, + "balance_loss_mlp": 0.01256963, + "epoch": 0.8082368856155118, + "flos": 35781762672000.0, + "grad_norm": 1.6604750720544754, + "language_loss": 0.70819938, + "learning_rate": 3.734020735906169e-07, + "loss": 0.78486764, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0892334, + "step": 13443, + "time_per_iteration": 2.649662733078003 + }, + { + "auxiliary_loss_clip": 0.06397702, + "auxiliary_loss_mlp": 0.01263095, + "balance_loss_clip": 0.06270011, + "balance_loss_mlp": 0.01254083, + "epoch": 0.8082970088681798, + "flos": 17203102859520.0, + "grad_norm": 1.69624931733301, + "language_loss": 0.82922244, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.90583038, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09002686, + "step": 13444, + "time_per_iteration": 2.4664461612701416 + }, + { + "auxiliary_loss_clip": 0.06307255, + "auxiliary_loss_mlp": 0.01255825, + "balance_loss_clip": 0.06252229, + "balance_loss_mlp": 0.0125482, + "epoch": 0.8083571321208477, + "flos": 63571437786240.0, + "grad_norm": 0.8022589405220855, + "language_loss": 0.53542054, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.61105132, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01004791, + "step": 13445, + "time_per_iteration": 2.97573184967041 + }, + { + "auxiliary_loss_clip": 0.0640046, + "auxiliary_loss_mlp": 0.01265553, + "balance_loss_clip": 0.06270578, + "balance_loss_mlp": 0.01255795, + "epoch": 0.8084172553735157, + "flos": 17936407866240.0, + "grad_norm": 1.9107072136167604, + "language_loss": 0.71992731, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.79658741, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09759521, + "step": 13446, + "time_per_iteration": 2.470470428466797 + }, + { + "auxiliary_loss_clip": 0.06404857, + "auxiliary_loss_mlp": 0.01268125, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.0125816, + "epoch": 0.8084773786261836, + "flos": 24104939016960.0, + "grad_norm": 1.745974209686923, + "language_loss": 0.71612984, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.79285973, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09973145, + "step": 13447, + "time_per_iteration": 2.51505184173584 + }, + { + "auxiliary_loss_clip": 0.06409042, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06273446, + "balance_loss_mlp": 0.01252363, + "epoch": 0.8085375018788516, + "flos": 15592288769280.0, + "grad_norm": 2.3228732633180544, + "language_loss": 0.7492891, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.82600772, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10467529, + "step": 13448, + "time_per_iteration": 2.442843437194824 + }, + { + "auxiliary_loss_clip": 0.06307342, + "auxiliary_loss_mlp": 0.01252569, + "balance_loss_clip": 0.06252244, + "balance_loss_mlp": 0.01251538, + "epoch": 0.8085976251315197, + "flos": 67583071059840.0, + "grad_norm": 0.7146391235313417, + "language_loss": 0.6385448, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.71414399, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01032257, + "step": 13449, + "time_per_iteration": 3.196397066116333 + }, + { + "auxiliary_loss_clip": 0.06401651, + "auxiliary_loss_mlp": 0.01263751, + "balance_loss_clip": 0.06271486, + "balance_loss_mlp": 0.01253922, + "epoch": 0.8086577483841876, + "flos": 22567442849280.0, + "grad_norm": 1.604658676228095, + "language_loss": 0.74288607, + "learning_rate": 3.718173381422105e-07, + "loss": 0.81954008, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09838867, + "step": 13450, + "time_per_iteration": 4.000797510147095 + }, + { + "auxiliary_loss_clip": 0.06401928, + "auxiliary_loss_mlp": 0.01263996, + "balance_loss_clip": 0.06270078, + "balance_loss_mlp": 0.01254304, + "epoch": 0.8087178716368556, + "flos": 17973947295360.0, + "grad_norm": 1.6133158920878963, + "language_loss": 0.74275053, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.81940979, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09686279, + "step": 13451, + "time_per_iteration": 2.4525790214538574 + }, + { + "auxiliary_loss_clip": 0.06405082, + "auxiliary_loss_mlp": 0.012649, + "balance_loss_clip": 0.06269129, + "balance_loss_mlp": 0.01254339, + "epoch": 0.8087779948895235, + "flos": 21724915645440.0, + "grad_norm": 1.6921247392748657, + "language_loss": 0.8051089, + "learning_rate": 3.713651121244543e-07, + "loss": 0.88180876, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10552979, + "step": 13452, + "time_per_iteration": 2.516119956970215 + }, + { + "auxiliary_loss_clip": 0.06403025, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06269191, + "balance_loss_mlp": 0.0125255, + "epoch": 0.8088381181421915, + "flos": 29100047990400.0, + "grad_norm": 1.6952548496868898, + "language_loss": 0.78266019, + "learning_rate": 3.711390917482875e-07, + "loss": 0.8593123, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09637451, + "step": 13453, + "time_per_iteration": 4.042112827301025 + }, + { + "auxiliary_loss_clip": 0.06403942, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01255642, + "epoch": 0.8088982413948594, + "flos": 22204668597120.0, + "grad_norm": 2.3407226705929514, + "language_loss": 0.77383858, + "learning_rate": 3.709131331386892e-07, + "loss": 0.85053325, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09875488, + "step": 13454, + "time_per_iteration": 2.554422378540039 + }, + { + "auxiliary_loss_clip": 0.06400093, + "auxiliary_loss_mlp": 0.01268227, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01257391, + "epoch": 0.8089583646475275, + "flos": 28044023783040.0, + "grad_norm": 1.8288081098987639, + "language_loss": 0.76939356, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.84607673, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.1083374, + "step": 13455, + "time_per_iteration": 2.5715341567993164 + }, + { + "auxiliary_loss_clip": 0.06401575, + "auxiliary_loss_mlp": 0.01262608, + "balance_loss_clip": 0.06268826, + "balance_loss_mlp": 0.01253679, + "epoch": 0.8090184879001954, + "flos": 16623309732480.0, + "grad_norm": 1.6907159449842466, + "language_loss": 0.78554362, + "learning_rate": 3.70461401253471e-07, + "loss": 0.86218548, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0892334, + "step": 13456, + "time_per_iteration": 2.508582830429077 + }, + { + "auxiliary_loss_clip": 0.0640007, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06270983, + "balance_loss_mlp": 0.01255498, + "epoch": 0.8090786111528634, + "flos": 27347545445760.0, + "grad_norm": 1.776897039919432, + "language_loss": 0.71710402, + "learning_rate": 3.702356279949801e-07, + "loss": 0.79374933, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08966064, + "step": 13457, + "time_per_iteration": 2.5812559127807617 + }, + { + "auxiliary_loss_clip": 0.06398778, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.0626803, + "balance_loss_mlp": 0.01253777, + "epoch": 0.8091387344055313, + "flos": 21112111209600.0, + "grad_norm": 1.6184921643640915, + "language_loss": 0.73064125, + "learning_rate": 3.700099165373176e-07, + "loss": 0.80725813, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09143066, + "step": 13458, + "time_per_iteration": 3.9770147800445557 + }, + { + "auxiliary_loss_clip": 0.06401807, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.01255022, + "epoch": 0.8091988576581993, + "flos": 11659702694400.0, + "grad_norm": 2.4320264643935348, + "language_loss": 0.78925645, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.86591995, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09509277, + "step": 13459, + "time_per_iteration": 2.4999613761901855 + }, + { + "auxiliary_loss_clip": 0.06403743, + "auxiliary_loss_mlp": 0.01264391, + "balance_loss_clip": 0.06267793, + "balance_loss_mlp": 0.01254938, + "epoch": 0.8092589809108672, + "flos": 22969475539200.0, + "grad_norm": 2.9044403495473494, + "language_loss": 0.80189556, + "learning_rate": 3.695586790587113e-07, + "loss": 0.87857693, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09442139, + "step": 13460, + "time_per_iteration": 2.4736809730529785 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06270553, + "balance_loss_mlp": 0.01255463, + "epoch": 0.8093191041635353, + "flos": 13265988664320.0, + "grad_norm": 1.703012580351455, + "language_loss": 0.8516379, + "learning_rate": 3.693331530548789e-07, + "loss": 0.92833048, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09973145, + "step": 13461, + "time_per_iteration": 2.472332000732422 + }, + { + "auxiliary_loss_clip": 0.06405523, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01258995, + "epoch": 0.8093792274162032, + "flos": 25522353884160.0, + "grad_norm": 1.7015064491080825, + "language_loss": 0.76382649, + "learning_rate": 3.69107688886096e-07, + "loss": 0.84057218, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.1005249, + "step": 13462, + "time_per_iteration": 2.5191242694854736 + }, + { + "auxiliary_loss_clip": 0.0640429, + "auxiliary_loss_mlp": 0.01263724, + "balance_loss_clip": 0.06271065, + "balance_loss_mlp": 0.01253812, + "epoch": 0.8094393506688712, + "flos": 23552622829440.0, + "grad_norm": 1.6157350617712933, + "language_loss": 0.82945341, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.90613359, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09906006, + "step": 13463, + "time_per_iteration": 2.5403740406036377 + }, + { + "auxiliary_loss_clip": 0.06398586, + "auxiliary_loss_mlp": 0.01262495, + "balance_loss_clip": 0.06268895, + "balance_loss_mlp": 0.0125371, + "epoch": 0.8094994739215392, + "flos": 17061624040320.0, + "grad_norm": 3.129891781410948, + "language_loss": 0.6203239, + "learning_rate": 3.686569460878779e-07, + "loss": 0.69693464, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08789062, + "step": 13464, + "time_per_iteration": 2.5035338401794434 + }, + { + "auxiliary_loss_clip": 0.06398399, + "auxiliary_loss_mlp": 0.01268957, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01260386, + "epoch": 0.8095595971742071, + "flos": 23558157198720.0, + "grad_norm": 1.527244163455927, + "language_loss": 0.61969072, + "learning_rate": 3.684316674755341e-07, + "loss": 0.69636428, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 13465, + "time_per_iteration": 2.511592388153076 + }, + { + "auxiliary_loss_clip": 0.06402411, + "auxiliary_loss_mlp": 0.01268671, + "balance_loss_clip": 0.06272465, + "balance_loss_mlp": 0.01259319, + "epoch": 0.8096197204268751, + "flos": 20378973911040.0, + "grad_norm": 1.9869568826877384, + "language_loss": 0.8212142, + "learning_rate": 3.682064507324256e-07, + "loss": 0.89792502, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09350586, + "step": 13466, + "time_per_iteration": 2.4735896587371826 + }, + { + "auxiliary_loss_clip": 0.06405444, + "auxiliary_loss_mlp": 0.0126549, + "balance_loss_clip": 0.06271167, + "balance_loss_mlp": 0.0125578, + "epoch": 0.809679843679543, + "flos": 27826208294400.0, + "grad_norm": 2.9775086459835225, + "language_loss": 0.76277745, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.83948678, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09710693, + "step": 13467, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06402574, + "auxiliary_loss_mlp": 0.0126548, + "balance_loss_clip": 0.06270781, + "balance_loss_mlp": 0.01256462, + "epoch": 0.8097399669322111, + "flos": 22019990198400.0, + "grad_norm": 1.5745990150023057, + "language_loss": 0.791363, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.86804354, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09020996, + "step": 13468, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.06396127, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01255879, + "epoch": 0.809800090184879, + "flos": 18994905768960.0, + "grad_norm": 1.625398825677948, + "language_loss": 0.68054199, + "learning_rate": 3.675311718038978e-07, + "loss": 0.75715309, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09100342, + "step": 13469, + "time_per_iteration": 2.484276533126831 + }, + { + "auxiliary_loss_clip": 0.0630585, + "auxiliary_loss_mlp": 0.01249591, + "balance_loss_clip": 0.06250963, + "balance_loss_mlp": 0.01248598, + "epoch": 0.809860213437547, + "flos": 66120653750400.0, + "grad_norm": 0.6770585331201862, + "language_loss": 0.54451334, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.62006778, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00993347, + "step": 13470, + "time_per_iteration": 3.1943366527557373 + }, + { + "auxiliary_loss_clip": 0.06399186, + "auxiliary_loss_mlp": 0.01263139, + "balance_loss_clip": 0.0626805, + "balance_loss_mlp": 0.01253805, + "epoch": 0.8099203366902149, + "flos": 20888090519040.0, + "grad_norm": 1.850656923683804, + "language_loss": 0.69889498, + "learning_rate": 3.670812953542279e-07, + "loss": 0.77551824, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09332275, + "step": 13471, + "time_per_iteration": 2.521888494491577 + }, + { + "auxiliary_loss_clip": 0.06400762, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01254899, + "epoch": 0.8099804599428829, + "flos": 26038053037440.0, + "grad_norm": 2.7576436132891584, + "language_loss": 0.80252707, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.87917507, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09143066, + "step": 13472, + "time_per_iteration": 2.5895776748657227 + }, + { + "auxiliary_loss_clip": 0.06301145, + "auxiliary_loss_mlp": 0.01257277, + "balance_loss_clip": 0.06245954, + "balance_loss_mlp": 0.01256171, + "epoch": 0.8100405831955508, + "flos": 69324127522560.0, + "grad_norm": 0.7337883216097973, + "language_loss": 0.57360721, + "learning_rate": 3.666316665863201e-07, + "loss": 0.64919138, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01108551, + "step": 13473, + "time_per_iteration": 3.0853075981140137 + }, + { + "auxiliary_loss_clip": 0.06402718, + "auxiliary_loss_mlp": 0.01263044, + "balance_loss_clip": 0.06268923, + "balance_loss_mlp": 0.01253585, + "epoch": 0.8101007064482189, + "flos": 15017820376320.0, + "grad_norm": 1.8256752375562084, + "language_loss": 0.74556285, + "learning_rate": 3.664069451043399e-07, + "loss": 0.82222044, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09454346, + "step": 13474, + "time_per_iteration": 2.4723920822143555 + }, + { + "auxiliary_loss_clip": 0.06406249, + "auxiliary_loss_mlp": 0.01269145, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01259698, + "epoch": 0.8101608297008868, + "flos": 21073230115200.0, + "grad_norm": 1.4992308701275703, + "language_loss": 0.78592277, + "learning_rate": 3.661822855683723e-07, + "loss": 0.86267674, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09442139, + "step": 13475, + "time_per_iteration": 2.49446964263916 + }, + { + "auxiliary_loss_clip": 0.06399214, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01255545, + "epoch": 0.8102209529535548, + "flos": 23737846279680.0, + "grad_norm": 2.1011404448378674, + "language_loss": 0.76127887, + "learning_rate": 3.659576879869364e-07, + "loss": 0.83792061, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09405518, + "step": 13476, + "time_per_iteration": 2.623260259628296 + }, + { + "auxiliary_loss_clip": 0.06409746, + "auxiliary_loss_mlp": 0.0126529, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01255199, + "epoch": 0.8102810762062228, + "flos": 10959408996480.0, + "grad_norm": 1.9990272490296594, + "language_loss": 0.73678935, + "learning_rate": 3.657331523685485e-07, + "loss": 0.81353962, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10083008, + "step": 13477, + "time_per_iteration": 2.460721731185913 + }, + { + "auxiliary_loss_clip": 0.06398121, + "auxiliary_loss_mlp": 0.01261498, + "balance_loss_clip": 0.06267326, + "balance_loss_mlp": 0.01252123, + "epoch": 0.8103411994588907, + "flos": 14654291437440.0, + "grad_norm": 1.923341621184723, + "language_loss": 0.6978184, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.7744146, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09375, + "step": 13478, + "time_per_iteration": 2.4879016876220703 + }, + { + "auxiliary_loss_clip": 0.06305265, + "auxiliary_loss_mlp": 0.01250833, + "balance_loss_clip": 0.06250156, + "balance_loss_mlp": 0.01249791, + "epoch": 0.8104013227115587, + "flos": 59170964112000.0, + "grad_norm": 0.6706213336405785, + "language_loss": 0.52182806, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.5973891, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01042175, + "step": 13479, + "time_per_iteration": 3.042278289794922 + }, + { + "auxiliary_loss_clip": 0.06402652, + "auxiliary_loss_mlp": 0.01265309, + "balance_loss_clip": 0.06272212, + "balance_loss_mlp": 0.01256183, + "epoch": 0.8104614459642266, + "flos": 19834833496320.0, + "grad_norm": 1.5781047108750677, + "language_loss": 0.71602416, + "learning_rate": 3.650599173768072e-07, + "loss": 0.79270375, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09124756, + "step": 13480, + "time_per_iteration": 3.9115874767303467 + }, + { + "auxiliary_loss_clip": 0.06400708, + "auxiliary_loss_mlp": 0.0126312, + "balance_loss_clip": 0.06268963, + "balance_loss_mlp": 0.01253983, + "epoch": 0.8105215692168947, + "flos": 25381294335360.0, + "grad_norm": 1.960101511676754, + "language_loss": 0.79864734, + "learning_rate": 3.648356296957327e-07, + "loss": 0.87528563, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09136963, + "step": 13481, + "time_per_iteration": 2.5275304317474365 + }, + { + "auxiliary_loss_clip": 0.06402725, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06271543, + "balance_loss_mlp": 0.01258047, + "epoch": 0.8105816924695626, + "flos": 20487357567360.0, + "grad_norm": 1.7047460645728882, + "language_loss": 0.72716773, + "learning_rate": 3.646114040202548e-07, + "loss": 0.80386472, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08929443, + "step": 13482, + "time_per_iteration": 2.5445470809936523 + }, + { + "auxiliary_loss_clip": 0.06404884, + "auxiliary_loss_mlp": 0.01266536, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01256773, + "epoch": 0.8106418157222306, + "flos": 14544021064320.0, + "grad_norm": 1.9920968678364395, + "language_loss": 0.65563977, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.73235393, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09771729, + "step": 13483, + "time_per_iteration": 2.5054430961608887 + }, + { + "auxiliary_loss_clip": 0.06400222, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.01257514, + "epoch": 0.8107019389748985, + "flos": 22570964720640.0, + "grad_norm": 1.8159029910366271, + "language_loss": 0.76454484, + "learning_rate": 3.641631387200992e-07, + "loss": 0.84121364, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09155273, + "step": 13484, + "time_per_iteration": 2.5171079635620117 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01267955, + "balance_loss_clip": 0.06272984, + "balance_loss_mlp": 0.01257274, + "epoch": 0.8107620622275665, + "flos": 19615634415360.0, + "grad_norm": 1.4402469557627227, + "language_loss": 0.72541213, + "learning_rate": 3.639390991124183e-07, + "loss": 0.80219758, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10693359, + "step": 13485, + "time_per_iteration": 2.5724942684173584 + }, + { + "auxiliary_loss_clip": 0.06396358, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01256636, + "epoch": 0.8108221854802344, + "flos": 16149007296000.0, + "grad_norm": 1.8147105780341508, + "language_loss": 0.76297033, + "learning_rate": 3.637151215443308e-07, + "loss": 0.83958554, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08520508, + "step": 13486, + "time_per_iteration": 2.4431118965148926 + }, + { + "auxiliary_loss_clip": 0.06407452, + "auxiliary_loss_mlp": 0.01265864, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01256226, + "epoch": 0.8108823087329025, + "flos": 21112656261120.0, + "grad_norm": 1.8644106456764877, + "language_loss": 0.72075516, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.79748833, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09643555, + "step": 13487, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06400521, + "auxiliary_loss_mlp": 0.01268012, + "balance_loss_clip": 0.06272428, + "balance_loss_mlp": 0.01259018, + "epoch": 0.8109424319855704, + "flos": 29206377221760.0, + "grad_norm": 2.193678189628865, + "language_loss": 0.84388292, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.92056829, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08984375, + "step": 13488, + "time_per_iteration": 2.543301582336426 + }, + { + "auxiliary_loss_clip": 0.06405022, + "auxiliary_loss_mlp": 0.01264443, + "balance_loss_clip": 0.06271579, + "balance_loss_mlp": 0.01255198, + "epoch": 0.8110025552382384, + "flos": 23118459298560.0, + "grad_norm": 2.075195554418006, + "language_loss": 0.74304891, + "learning_rate": 3.630435611625502e-07, + "loss": 0.81974351, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09234619, + "step": 13489, + "time_per_iteration": 3.9371306896209717 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01266118, + "balance_loss_clip": 0.06272238, + "balance_loss_mlp": 0.01257523, + "epoch": 0.8110626784909064, + "flos": 22386076686720.0, + "grad_norm": 1.8053041582092544, + "language_loss": 0.71944815, + "learning_rate": 3.628198318377453e-07, + "loss": 0.79611677, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.0859375, + "step": 13490, + "time_per_iteration": 2.5005099773406982 + }, + { + "auxiliary_loss_clip": 0.0640538, + "auxiliary_loss_mlp": 0.01266534, + "balance_loss_clip": 0.06270941, + "balance_loss_mlp": 0.01256067, + "epoch": 0.8111228017435743, + "flos": 23374820465280.0, + "grad_norm": 2.2367527372378166, + "language_loss": 0.72137296, + "learning_rate": 3.625961645949762e-07, + "loss": 0.79809213, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10461426, + "step": 13491, + "time_per_iteration": 2.5076067447662354 + }, + { + "auxiliary_loss_clip": 0.06401882, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01256115, + "epoch": 0.8111829249962423, + "flos": 21292680758400.0, + "grad_norm": 1.729765137359799, + "language_loss": 0.67871809, + "learning_rate": 3.623725594427245e-07, + "loss": 0.7553919, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09381104, + "step": 13492, + "time_per_iteration": 3.959716320037842 + }, + { + "auxiliary_loss_clip": 0.06405997, + "auxiliary_loss_mlp": 0.01263308, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01253605, + "epoch": 0.8112430482489102, + "flos": 22352017201920.0, + "grad_norm": 1.7889439150881994, + "language_loss": 0.72219712, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.79889023, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09698486, + "step": 13493, + "time_per_iteration": 2.498922348022461 + }, + { + "auxiliary_loss_clip": 0.06403151, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_clip": 0.06270409, + "balance_loss_mlp": 0.01256396, + "epoch": 0.8113031715015783, + "flos": 31146199568640.0, + "grad_norm": 1.5274300154238956, + "language_loss": 0.70765322, + "learning_rate": 3.619255354436885e-07, + "loss": 0.78434944, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10070801, + "step": 13494, + "time_per_iteration": 2.582156181335449 + }, + { + "auxiliary_loss_clip": 0.06407354, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.0126038, + "epoch": 0.8113632947542462, + "flos": 25342077824640.0, + "grad_norm": 1.9696659846261377, + "language_loss": 0.76812732, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.84491146, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10687256, + "step": 13495, + "time_per_iteration": 2.4944467544555664 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.06269392, + "balance_loss_mlp": 0.0125619, + "epoch": 0.8114234180069142, + "flos": 28446727305600.0, + "grad_norm": 1.6848017039498533, + "language_loss": 0.80030304, + "learning_rate": 3.614787599084417e-07, + "loss": 0.87699676, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09375, + "step": 13496, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06403383, + "auxiliary_loss_mlp": 0.01264908, + "balance_loss_clip": 0.06270055, + "balance_loss_mlp": 0.01254829, + "epoch": 0.8114835412595821, + "flos": 20344998280320.0, + "grad_norm": 1.6257058100958846, + "language_loss": 0.71732903, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.79401189, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10083008, + "step": 13497, + "time_per_iteration": 3.9020187854766846 + }, + { + "auxiliary_loss_clip": 0.06405488, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.06270734, + "balance_loss_mlp": 0.01255859, + "epoch": 0.8115436645122501, + "flos": 22497269454720.0, + "grad_norm": 1.6450222664154983, + "language_loss": 0.76774496, + "learning_rate": 3.610322329047508e-07, + "loss": 0.84444666, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0881958, + "step": 13498, + "time_per_iteration": 2.53695011138916 + }, + { + "auxiliary_loss_clip": 0.06400445, + "auxiliary_loss_mlp": 0.01265682, + "balance_loss_clip": 0.06268942, + "balance_loss_mlp": 0.01256359, + "epoch": 0.811603787764918, + "flos": 13850477619840.0, + "grad_norm": 1.8314590117714953, + "language_loss": 0.84328604, + "learning_rate": 3.608090626234055e-07, + "loss": 0.91994727, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09313965, + "step": 13499, + "time_per_iteration": 2.4478304386138916 + }, + { + "auxiliary_loss_clip": 0.06405481, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01254311, + "epoch": 0.8116639110175861, + "flos": 21620766620160.0, + "grad_norm": 1.4739026591670814, + "language_loss": 0.76078045, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.83749443, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1161499, + "step": 13500, + "time_per_iteration": 2.5095434188842773 + }, + { + "auxiliary_loss_clip": 0.06305633, + "auxiliary_loss_mlp": 0.01251852, + "balance_loss_clip": 0.06250529, + "balance_loss_mlp": 0.01250827, + "epoch": 0.811724034270254, + "flos": 64481021055360.0, + "grad_norm": 0.7829192652401806, + "language_loss": 0.59720683, + "learning_rate": 3.603629085440303e-07, + "loss": 0.67278171, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01024628, + "step": 13501, + "time_per_iteration": 3.165794610977173 + }, + { + "auxiliary_loss_clip": 0.06395126, + "auxiliary_loss_mlp": 0.01264174, + "balance_loss_clip": 0.06268613, + "balance_loss_mlp": 0.01255257, + "epoch": 0.811784157522922, + "flos": 24761068813440.0, + "grad_norm": 1.4866763661196265, + "language_loss": 0.793163, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.86975598, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08905029, + "step": 13502, + "time_per_iteration": 2.5414836406707764 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.0126301, + "balance_loss_clip": 0.0626989, + "balance_loss_mlp": 0.01254188, + "epoch": 0.81184428077559, + "flos": 12172089611520.0, + "grad_norm": 2.6111507442822086, + "language_loss": 0.71246618, + "learning_rate": 3.599170031654635e-07, + "loss": 0.78908736, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08825684, + "step": 13503, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06402574, + "auxiliary_loss_mlp": 0.0126551, + "balance_loss_clip": 0.06270054, + "balance_loss_mlp": 0.01255044, + "epoch": 0.8119044040282579, + "flos": 44432621429760.0, + "grad_norm": 1.4625675219914986, + "language_loss": 0.68073899, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.75741982, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10473633, + "step": 13504, + "time_per_iteration": 2.777693271636963 + }, + { + "auxiliary_loss_clip": 0.06402649, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06269131, + "balance_loss_mlp": 0.0125593, + "epoch": 0.8119645272809259, + "flos": 52167131936640.0, + "grad_norm": 1.8871049986927122, + "language_loss": 0.75274026, + "learning_rate": 3.594713465553403e-07, + "loss": 0.82942665, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10070801, + "step": 13505, + "time_per_iteration": 2.7910561561584473 + }, + { + "auxiliary_loss_clip": 0.06404154, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06272307, + "balance_loss_mlp": 0.01257295, + "epoch": 0.8120246505335939, + "flos": 30241842451200.0, + "grad_norm": 4.0148732645076475, + "language_loss": 0.72911733, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.80583107, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09924316, + "step": 13506, + "time_per_iteration": 2.5993027687072754 + }, + { + "auxiliary_loss_clip": 0.06410645, + "auxiliary_loss_mlp": 0.01265349, + "balance_loss_clip": 0.06271695, + "balance_loss_mlp": 0.01255496, + "epoch": 0.8120847737862619, + "flos": 22134243640320.0, + "grad_norm": 2.0108057093252754, + "language_loss": 0.76670831, + "learning_rate": 3.590259387812593e-07, + "loss": 0.84346819, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09863281, + "step": 13507, + "time_per_iteration": 2.5172982215881348 + }, + { + "auxiliary_loss_clip": 0.06410617, + "auxiliary_loss_mlp": 0.01264037, + "balance_loss_clip": 0.06271885, + "balance_loss_mlp": 0.01253999, + "epoch": 0.8121448970389298, + "flos": 23301963740160.0, + "grad_norm": 1.6354212384469264, + "language_loss": 0.70526397, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.7820105, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10040283, + "step": 13508, + "time_per_iteration": 2.4784016609191895 + }, + { + "auxiliary_loss_clip": 0.06400366, + "auxiliary_loss_mlp": 0.01263654, + "balance_loss_clip": 0.06270534, + "balance_loss_mlp": 0.01254445, + "epoch": 0.8122050202915978, + "flos": 22170734893440.0, + "grad_norm": 1.5714430393800305, + "language_loss": 0.7640515, + "learning_rate": 3.585807799107785e-07, + "loss": 0.84069169, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09204102, + "step": 13509, + "time_per_iteration": 2.489997625350952 + }, + { + "auxiliary_loss_clip": 0.06405313, + "auxiliary_loss_mlp": 0.01263273, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01253366, + "epoch": 0.8122651435442657, + "flos": 23265765976320.0, + "grad_norm": 1.7111560106150059, + "language_loss": 0.76858175, + "learning_rate": 3.58358293835491e-07, + "loss": 0.84526753, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09899902, + "step": 13510, + "time_per_iteration": 2.5373711585998535 + }, + { + "auxiliary_loss_clip": 0.06409149, + "auxiliary_loss_mlp": 0.01263873, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01253806, + "epoch": 0.8123252667969337, + "flos": 16144940373120.0, + "grad_norm": 1.6338009615149598, + "language_loss": 0.70005399, + "learning_rate": 3.581358700114212e-07, + "loss": 0.77678418, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10058594, + "step": 13511, + "time_per_iteration": 2.4621431827545166 + }, + { + "auxiliary_loss_clip": 0.06406134, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 0.06270903, + "balance_loss_mlp": 0.01254988, + "epoch": 0.8123853900496016, + "flos": 21250738990080.0, + "grad_norm": 3.4887790010923023, + "language_loss": 0.79486072, + "learning_rate": 3.57913508447004e-07, + "loss": 0.87156576, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09387207, + "step": 13512, + "time_per_iteration": 2.5077874660491943 + }, + { + "auxiliary_loss_clip": 0.06401815, + "auxiliary_loss_mlp": 0.01262813, + "balance_loss_clip": 0.06269997, + "balance_loss_mlp": 0.01253723, + "epoch": 0.8124455133022697, + "flos": 64391156680320.0, + "grad_norm": 1.5302890319846227, + "language_loss": 0.64037752, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.71702385, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09094238, + "step": 13513, + "time_per_iteration": 2.8918113708496094 + }, + { + "auxiliary_loss_clip": 0.06406252, + "auxiliary_loss_mlp": 0.01266377, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01256131, + "epoch": 0.8125056365549376, + "flos": 23849039047680.0, + "grad_norm": 1.8518380601721225, + "language_loss": 0.71717697, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.79390329, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10241699, + "step": 13514, + "time_per_iteration": 2.502861499786377 + }, + { + "auxiliary_loss_clip": 0.06401537, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06270736, + "balance_loss_mlp": 0.01257434, + "epoch": 0.8125657598076056, + "flos": 23557109022720.0, + "grad_norm": 1.401731769675591, + "language_loss": 0.63314271, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.70982432, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09197998, + "step": 13515, + "time_per_iteration": 2.5460987091064453 + }, + { + "auxiliary_loss_clip": 0.06395491, + "auxiliary_loss_mlp": 0.01268356, + "balance_loss_clip": 0.06270037, + "balance_loss_mlp": 0.0125932, + "epoch": 0.8126258830602736, + "flos": 20710497790080.0, + "grad_norm": 1.4629712579476926, + "language_loss": 0.75324374, + "learning_rate": 3.570246849544616e-07, + "loss": 0.8298822, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.09033203, + "step": 13516, + "time_per_iteration": 2.4880564212799072 + }, + { + "auxiliary_loss_clip": 0.06403796, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.06268365, + "balance_loss_mlp": 0.01254619, + "epoch": 0.8126860063129415, + "flos": 23624095962240.0, + "grad_norm": 1.3855330172277736, + "language_loss": 0.91489208, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.99157685, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10064697, + "step": 13517, + "time_per_iteration": 2.523481607437134 + }, + { + "auxiliary_loss_clip": 0.06405374, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06272045, + "balance_loss_mlp": 0.01253711, + "epoch": 0.8127461295656095, + "flos": 25013740400640.0, + "grad_norm": 1.3744470429477684, + "language_loss": 0.78856122, + "learning_rate": 3.565806469852244e-07, + "loss": 0.86524576, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09368896, + "step": 13518, + "time_per_iteration": 2.513049602508545 + }, + { + "auxiliary_loss_clip": 0.06401889, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271425, + "balance_loss_mlp": 0.01255799, + "epoch": 0.8128062528182775, + "flos": 27349138673280.0, + "grad_norm": 1.7240881927600378, + "language_loss": 0.79624963, + "learning_rate": 3.56358721474336e-07, + "loss": 0.87291259, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08612061, + "step": 13519, + "time_per_iteration": 3.9774365425109863 + }, + { + "auxiliary_loss_clip": 0.06407484, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06272454, + "balance_loss_mlp": 0.01253022, + "epoch": 0.8128663760709455, + "flos": 26513697139200.0, + "grad_norm": 1.5686471804974786, + "language_loss": 0.70565975, + "learning_rate": 3.561368582904905e-07, + "loss": 0.78235519, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09033203, + "step": 13520, + "time_per_iteration": 2.5642969608306885 + }, + { + "auxiliary_loss_clip": 0.06403634, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01255746, + "epoch": 0.8129264993236134, + "flos": 17937036771840.0, + "grad_norm": 1.3447484311146394, + "language_loss": 0.72752047, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.80420852, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09417725, + "step": 13521, + "time_per_iteration": 2.483443260192871 + }, + { + "auxiliary_loss_clip": 0.0640461, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01257527, + "epoch": 0.8129866225762814, + "flos": 26184982371840.0, + "grad_norm": 1.5624785217553507, + "language_loss": 0.70352554, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.78024787, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10089111, + "step": 13522, + "time_per_iteration": 2.5880520343780518 + }, + { + "auxiliary_loss_clip": 0.06397097, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06269517, + "balance_loss_mlp": 0.01255464, + "epoch": 0.8130467458289493, + "flos": 21038457870720.0, + "grad_norm": 1.4005848592108407, + "language_loss": 0.70769501, + "learning_rate": 3.554716427853233e-07, + "loss": 0.78431445, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09381104, + "step": 13523, + "time_per_iteration": 2.5072546005249023 + }, + { + "auxiliary_loss_clip": 0.06398432, + "auxiliary_loss_mlp": 0.01262757, + "balance_loss_clip": 0.06268294, + "balance_loss_mlp": 0.01252965, + "epoch": 0.8131068690816173, + "flos": 15492500156160.0, + "grad_norm": 2.331426517879502, + "language_loss": 0.70879388, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.78540576, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09777832, + "step": 13524, + "time_per_iteration": 2.4804911613464355 + }, + { + "auxiliary_loss_clip": 0.06399479, + "auxiliary_loss_mlp": 0.01264007, + "balance_loss_clip": 0.06268516, + "balance_loss_mlp": 0.01254756, + "epoch": 0.8131669923342852, + "flos": 29358924779520.0, + "grad_norm": 1.7261650681481027, + "language_loss": 0.63128257, + "learning_rate": 3.550284775712653e-07, + "loss": 0.70791739, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 13525, + "time_per_iteration": 2.5361483097076416 + }, + { + "auxiliary_loss_clip": 0.06397866, + "auxiliary_loss_mlp": 0.01261329, + "balance_loss_clip": 0.06266545, + "balance_loss_mlp": 0.01251947, + "epoch": 0.8132271155869533, + "flos": 35263883312640.0, + "grad_norm": 1.6825597330397746, + "language_loss": 0.65842247, + "learning_rate": 3.548069885262628e-07, + "loss": 0.73501444, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09381104, + "step": 13526, + "time_per_iteration": 2.6087794303894043 + }, + { + "auxiliary_loss_clip": 0.06400132, + "auxiliary_loss_mlp": 0.01263098, + "balance_loss_clip": 0.06268608, + "balance_loss_mlp": 0.0125408, + "epoch": 0.8132872388396212, + "flos": 27789255843840.0, + "grad_norm": 1.4880547068923822, + "language_loss": 0.75493729, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.83156955, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09020996, + "step": 13527, + "time_per_iteration": 2.539010763168335 + }, + { + "auxiliary_loss_clip": 0.06402984, + "auxiliary_loss_mlp": 0.01266217, + "balance_loss_clip": 0.06270985, + "balance_loss_mlp": 0.01257098, + "epoch": 0.8133473620922892, + "flos": 27827172616320.0, + "grad_norm": 1.9409057063309785, + "language_loss": 0.70657897, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.78327101, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09124756, + "step": 13528, + "time_per_iteration": 2.5862042903900146 + }, + { + "auxiliary_loss_clip": 0.06401546, + "auxiliary_loss_mlp": 0.01260608, + "balance_loss_clip": 0.06268697, + "balance_loss_mlp": 0.01251185, + "epoch": 0.8134074853449572, + "flos": 18995534674560.0, + "grad_norm": 1.667225217482648, + "language_loss": 0.68823183, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.76485336, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09423828, + "step": 13529, + "time_per_iteration": 3.9062068462371826 + }, + { + "auxiliary_loss_clip": 0.06397647, + "auxiliary_loss_mlp": 0.01261144, + "balance_loss_clip": 0.06269309, + "balance_loss_mlp": 0.0125231, + "epoch": 0.8134676085976251, + "flos": 24249646218240.0, + "grad_norm": 1.3410194216884235, + "language_loss": 0.77744162, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.85402954, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08837891, + "step": 13530, + "time_per_iteration": 2.508969306945801 + }, + { + "auxiliary_loss_clip": 0.06397682, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06267507, + "balance_loss_mlp": 0.01255821, + "epoch": 0.8135277318502931, + "flos": 19068391399680.0, + "grad_norm": 1.6036142935304527, + "language_loss": 0.81703323, + "learning_rate": 3.537004792574052e-07, + "loss": 0.89366037, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09216309, + "step": 13531, + "time_per_iteration": 2.465648889541626 + }, + { + "auxiliary_loss_clip": 0.06403959, + "auxiliary_loss_mlp": 0.0126883, + "balance_loss_clip": 0.06269965, + "balance_loss_mlp": 0.01258853, + "epoch": 0.813587855102961, + "flos": 17274617919360.0, + "grad_norm": 2.024023030441739, + "language_loss": 0.72077084, + "learning_rate": 3.534793646536065e-07, + "loss": 0.7974987, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09985352, + "step": 13532, + "time_per_iteration": 3.8888938426971436 + }, + { + "auxiliary_loss_clip": 0.06401416, + "auxiliary_loss_mlp": 0.0126398, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01254896, + "epoch": 0.8136479783556291, + "flos": 20163883680000.0, + "grad_norm": 1.8388062199181954, + "language_loss": 0.77024227, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.84689629, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09082031, + "step": 13533, + "time_per_iteration": 2.495201826095581 + }, + { + "auxiliary_loss_clip": 0.0640831, + "auxiliary_loss_mlp": 0.01262799, + "balance_loss_clip": 0.06271634, + "balance_loss_mlp": 0.0125247, + "epoch": 0.813708101608297, + "flos": 22058535876480.0, + "grad_norm": 1.4349700882895242, + "language_loss": 0.76950037, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.84621155, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10314941, + "step": 13534, + "time_per_iteration": 2.508604049682617 + }, + { + "auxiliary_loss_clip": 0.06400474, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06269156, + "balance_loss_mlp": 0.0125685, + "epoch": 0.813768224860965, + "flos": 16177825900800.0, + "grad_norm": 2.1221620950684676, + "language_loss": 0.93678272, + "learning_rate": 3.5281639549310336e-07, + "loss": 1.0134424, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08630371, + "step": 13535, + "time_per_iteration": 2.4925365447998047 + }, + { + "auxiliary_loss_clip": 0.06397314, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06270063, + "balance_loss_mlp": 0.01256451, + "epoch": 0.8138283481136329, + "flos": 24359119977600.0, + "grad_norm": 1.584672003718744, + "language_loss": 0.70635736, + "learning_rate": 3.52595530684499e-07, + "loss": 0.78298628, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09130859, + "step": 13536, + "time_per_iteration": 2.5193591117858887 + }, + { + "auxiliary_loss_clip": 0.06398758, + "auxiliary_loss_mlp": 0.01267555, + "balance_loss_clip": 0.06267327, + "balance_loss_mlp": 0.01257744, + "epoch": 0.8138884713663009, + "flos": 25522773154560.0, + "grad_norm": 1.4221719644735906, + "language_loss": 0.75364375, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.83030683, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09814453, + "step": 13537, + "time_per_iteration": 3.939565420150757 + }, + { + "auxiliary_loss_clip": 0.06399003, + "auxiliary_loss_mlp": 0.01263044, + "balance_loss_clip": 0.06270146, + "balance_loss_mlp": 0.01254307, + "epoch": 0.8139485946189688, + "flos": 22460736274560.0, + "grad_norm": 1.471007913892401, + "language_loss": 0.76099801, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.83761841, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08734131, + "step": 13538, + "time_per_iteration": 2.5426995754241943 + }, + { + "auxiliary_loss_clip": 0.06402089, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06269269, + "balance_loss_mlp": 0.01255317, + "epoch": 0.8140087178716369, + "flos": 21256566848640.0, + "grad_norm": 1.5246310927324862, + "language_loss": 0.78052437, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.85719126, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09283447, + "step": 13539, + "time_per_iteration": 2.496209144592285 + }, + { + "auxiliary_loss_clip": 0.06397711, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06270097, + "balance_loss_mlp": 0.01261834, + "epoch": 0.8140688411243048, + "flos": 39424179657600.0, + "grad_norm": 2.587253276724192, + "language_loss": 0.66418785, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.74087191, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08856201, + "step": 13540, + "time_per_iteration": 2.634174108505249 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01265102, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01256024, + "epoch": 0.8141289643769728, + "flos": 25423781155200.0, + "grad_norm": 1.4733031204112998, + "language_loss": 0.67490125, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.7515741, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09063721, + "step": 13541, + "time_per_iteration": 2.57694149017334 + }, + { + "auxiliary_loss_clip": 0.06399746, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06268999, + "balance_loss_mlp": 0.01255527, + "epoch": 0.8141890876296408, + "flos": 12572990271360.0, + "grad_norm": 2.4381124883520404, + "language_loss": 0.69473195, + "learning_rate": 3.512716539904355e-07, + "loss": 0.7713846, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09991455, + "step": 13542, + "time_per_iteration": 2.4687132835388184 + }, + { + "auxiliary_loss_clip": 0.06406985, + "auxiliary_loss_mlp": 0.01266697, + "balance_loss_clip": 0.06269906, + "balance_loss_mlp": 0.0125623, + "epoch": 0.8142492108823087, + "flos": 14971015071360.0, + "grad_norm": 3.9353973875515895, + "language_loss": 0.79934382, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.87608063, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10467529, + "step": 13543, + "time_per_iteration": 2.4158408641815186 + }, + { + "auxiliary_loss_clip": 0.06409101, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01253879, + "epoch": 0.8143093341349767, + "flos": 12426899477760.0, + "grad_norm": 2.3767283525757943, + "language_loss": 0.78172165, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.85845613, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10467529, + "step": 13544, + "time_per_iteration": 2.4718081951141357 + }, + { + "auxiliary_loss_clip": 0.06415416, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01253859, + "epoch": 0.8143694573876447, + "flos": 11915267247360.0, + "grad_norm": 2.5713851454912557, + "language_loss": 0.74007636, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.81687939, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11022949, + "step": 13545, + "time_per_iteration": 2.464211940765381 + }, + { + "auxiliary_loss_clip": 0.06398509, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.0627104, + "balance_loss_mlp": 0.01256928, + "epoch": 0.8144295806403127, + "flos": 21218901638400.0, + "grad_norm": 1.5996287062852548, + "language_loss": 0.77183664, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.84848052, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08953857, + "step": 13546, + "time_per_iteration": 2.5111136436462402 + }, + { + "auxiliary_loss_clip": 0.06407703, + "auxiliary_loss_mlp": 0.01264502, + "balance_loss_clip": 0.06273138, + "balance_loss_mlp": 0.01255311, + "epoch": 0.8144897038929806, + "flos": 19871450530560.0, + "grad_norm": 2.7448316541236144, + "language_loss": 0.71193993, + "learning_rate": 3.501701426337178e-07, + "loss": 0.78866196, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09179688, + "step": 13547, + "time_per_iteration": 2.495678186416626 + }, + { + "auxiliary_loss_clip": 0.06408043, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06272228, + "balance_loss_mlp": 0.01257775, + "epoch": 0.8145498271456486, + "flos": 24578654474880.0, + "grad_norm": 1.7869845648084397, + "language_loss": 0.71165389, + "learning_rate": 3.49950028014111e-07, + "loss": 0.78841269, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10070801, + "step": 13548, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06407051, + "auxiliary_loss_mlp": 0.01261806, + "balance_loss_clip": 0.06273579, + "balance_loss_mlp": 0.01251733, + "epoch": 0.8146099503983165, + "flos": 20199159048960.0, + "grad_norm": 1.9522520316462837, + "language_loss": 0.77203232, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.84872091, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 13549, + "time_per_iteration": 2.502742290496826 + }, + { + "auxiliary_loss_clip": 0.06405576, + "auxiliary_loss_mlp": 0.0126447, + "balance_loss_clip": 0.06270814, + "balance_loss_mlp": 0.01254707, + "epoch": 0.8146700736509845, + "flos": 19543071179520.0, + "grad_norm": 1.8670916613162452, + "language_loss": 0.71610808, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.79280859, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09753418, + "step": 13550, + "time_per_iteration": 2.6039505004882812 + }, + { + "auxiliary_loss_clip": 0.06397806, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06269183, + "balance_loss_mlp": 0.01255654, + "epoch": 0.8147301969036524, + "flos": 18047265217920.0, + "grad_norm": 1.6838631897121676, + "language_loss": 0.71859229, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.79520994, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08300781, + "step": 13551, + "time_per_iteration": 2.551734447479248 + }, + { + "auxiliary_loss_clip": 0.06410958, + "auxiliary_loss_mlp": 0.01264146, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.0125393, + "epoch": 0.8147903201563205, + "flos": 18010606256640.0, + "grad_norm": 1.848963719334946, + "language_loss": 0.69100463, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.76775569, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10217285, + "step": 13552, + "time_per_iteration": 2.5665345191955566 + }, + { + "auxiliary_loss_clip": 0.06406602, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254492, + "epoch": 0.8148504434089884, + "flos": 20264343125760.0, + "grad_norm": 1.7694730597096269, + "language_loss": 0.82387245, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.90058064, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09729004, + "step": 13553, + "time_per_iteration": 2.450315475463867 + }, + { + "auxiliary_loss_clip": 0.06406596, + "auxiliary_loss_mlp": 0.01265423, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01255916, + "epoch": 0.8149105666616564, + "flos": 12499588494720.0, + "grad_norm": 1.7106764714025673, + "language_loss": 0.68241465, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.75913489, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09509277, + "step": 13554, + "time_per_iteration": 2.462124824523926 + }, + { + "auxiliary_loss_clip": 0.06406562, + "auxiliary_loss_mlp": 0.01265488, + "balance_loss_clip": 0.06274106, + "balance_loss_mlp": 0.01255624, + "epoch": 0.8149706899143244, + "flos": 32531609376000.0, + "grad_norm": 1.6041901948473798, + "language_loss": 0.6636458, + "learning_rate": 3.484109781056723e-07, + "loss": 0.74036634, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09857178, + "step": 13555, + "time_per_iteration": 2.553244113922119 + }, + { + "auxiliary_loss_clip": 0.0640707, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06269799, + "balance_loss_mlp": 0.01254352, + "epoch": 0.8150308131669923, + "flos": 19391362162560.0, + "grad_norm": 1.6698699385134061, + "language_loss": 0.74007624, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.81678879, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09838867, + "step": 13556, + "time_per_iteration": 2.457014322280884 + }, + { + "auxiliary_loss_clip": 0.064043, + "auxiliary_loss_mlp": 0.01263626, + "balance_loss_clip": 0.06272946, + "balance_loss_mlp": 0.0125465, + "epoch": 0.8150909364196604, + "flos": 17427249331200.0, + "grad_norm": 1.7345154652881483, + "language_loss": 0.8086679, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.88534719, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08978271, + "step": 13557, + "time_per_iteration": 2.460977077484131 + }, + { + "auxiliary_loss_clip": 0.06408045, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06272027, + "balance_loss_mlp": 0.01256923, + "epoch": 0.8151510596723283, + "flos": 27170246206080.0, + "grad_norm": 1.581815205811392, + "language_loss": 0.65745318, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.73420399, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10119629, + "step": 13558, + "time_per_iteration": 2.5265209674835205 + }, + { + "auxiliary_loss_clip": 0.06310294, + "auxiliary_loss_mlp": 0.01251766, + "balance_loss_clip": 0.06255711, + "balance_loss_mlp": 0.0125069, + "epoch": 0.8152111829249963, + "flos": 64236581896320.0, + "grad_norm": 1.1628242103674278, + "language_loss": 0.56932402, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.64494467, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01077271, + "step": 13559, + "time_per_iteration": 4.45433497428894 + }, + { + "auxiliary_loss_clip": 0.06308051, + "auxiliary_loss_mlp": 0.01250118, + "balance_loss_clip": 0.06253337, + "balance_loss_mlp": 0.01249046, + "epoch": 0.8152713061776642, + "flos": 67091201193600.0, + "grad_norm": 0.6673892366494375, + "language_loss": 0.55275512, + "learning_rate": 3.473135354283334e-07, + "loss": 0.62833685, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01073456, + "step": 13560, + "time_per_iteration": 2.997331380844116 + }, + { + "auxiliary_loss_clip": 0.06405302, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01257703, + "epoch": 0.8153314294303322, + "flos": 14396169335040.0, + "grad_norm": 1.5961356559953426, + "language_loss": 0.67774737, + "learning_rate": 3.470942348696948e-07, + "loss": 0.7544682, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09082031, + "step": 13561, + "time_per_iteration": 2.578291416168213 + }, + { + "auxiliary_loss_clip": 0.06410162, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.0125423, + "epoch": 0.8153915526830001, + "flos": 25629563583360.0, + "grad_norm": 1.4593268747943478, + "language_loss": 0.81970775, + "learning_rate": 3.468749969894085e-07, + "loss": 0.89645207, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10040283, + "step": 13562, + "time_per_iteration": 2.5358498096466064 + }, + { + "auxiliary_loss_clip": 0.06404186, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01255689, + "epoch": 0.8154516759356681, + "flos": 23376120203520.0, + "grad_norm": 1.50215259842858, + "language_loss": 0.71958882, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.79628438, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09680176, + "step": 13563, + "time_per_iteration": 2.5700597763061523 + }, + { + "auxiliary_loss_clip": 0.06403195, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06270723, + "balance_loss_mlp": 0.0125963, + "epoch": 0.815511799188336, + "flos": 28157019413760.0, + "grad_norm": 1.7257040784897213, + "language_loss": 0.70323086, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.77996063, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10144043, + "step": 13564, + "time_per_iteration": 2.579045295715332 + }, + { + "auxiliary_loss_clip": 0.06402478, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06269638, + "balance_loss_mlp": 0.01252862, + "epoch": 0.8155719224410041, + "flos": 16989186585600.0, + "grad_norm": 1.70957475209218, + "language_loss": 0.70465791, + "learning_rate": 3.462176595017854e-07, + "loss": 0.78130251, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09130859, + "step": 13565, + "time_per_iteration": 2.482426166534424 + }, + { + "auxiliary_loss_clip": 0.0640111, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06269612, + "balance_loss_mlp": 0.01255346, + "epoch": 0.815632045693672, + "flos": 24688757139840.0, + "grad_norm": 2.037805159050188, + "language_loss": 0.79566395, + "learning_rate": 3.459986724180188e-07, + "loss": 0.87232494, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09655762, + "step": 13566, + "time_per_iteration": 2.51450514793396 + }, + { + "auxiliary_loss_clip": 0.06398387, + "auxiliary_loss_mlp": 0.01263188, + "balance_loss_clip": 0.06270022, + "balance_loss_mlp": 0.01253991, + "epoch": 0.81569216894634, + "flos": 19944516890880.0, + "grad_norm": 1.680610729726936, + "language_loss": 0.8259697, + "learning_rate": 3.457797480541491e-07, + "loss": 0.90258545, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09197998, + "step": 13567, + "time_per_iteration": 2.5065062046051025 + }, + { + "auxiliary_loss_clip": 0.063999, + "auxiliary_loss_mlp": 0.01263286, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01254661, + "epoch": 0.8157522921990079, + "flos": 21805948143360.0, + "grad_norm": 1.901722812011985, + "language_loss": 0.79928589, + "learning_rate": 3.455608864184771e-07, + "loss": 0.87591779, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08624268, + "step": 13568, + "time_per_iteration": 2.482262372970581 + }, + { + "auxiliary_loss_clip": 0.06399144, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270231, + "balance_loss_mlp": 0.01257118, + "epoch": 0.8158124154516759, + "flos": 18513098392320.0, + "grad_norm": 1.6787478080624303, + "language_loss": 0.77251327, + "learning_rate": 3.453420875193016e-07, + "loss": 0.84916508, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.0892334, + "step": 13569, + "time_per_iteration": 3.9400181770324707 + }, + { + "auxiliary_loss_clip": 0.06403175, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06272935, + "balance_loss_mlp": 0.01254067, + "epoch": 0.815872538704344, + "flos": 26837590296960.0, + "grad_norm": 2.286730013168615, + "language_loss": 0.58822525, + "learning_rate": 3.451233513649199e-07, + "loss": 0.66488886, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09118652, + "step": 13570, + "time_per_iteration": 2.524815082550049 + }, + { + "auxiliary_loss_clip": 0.0640761, + "auxiliary_loss_mlp": 0.01263763, + "balance_loss_clip": 0.06271629, + "balance_loss_mlp": 0.01253577, + "epoch": 0.8159326619570119, + "flos": 21732127096320.0, + "grad_norm": 1.6002303397111248, + "language_loss": 0.82693851, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.90365231, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10192871, + "step": 13571, + "time_per_iteration": 2.497116804122925 + }, + { + "auxiliary_loss_clip": 0.06401446, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.0125621, + "epoch": 0.8159927852096799, + "flos": 13845152885760.0, + "grad_norm": 2.3316068768824905, + "language_loss": 0.79288316, + "learning_rate": 3.446860673237142e-07, + "loss": 0.86956525, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10546875, + "step": 13572, + "time_per_iteration": 3.9277310371398926 + }, + { + "auxiliary_loss_clip": 0.06405439, + "auxiliary_loss_mlp": 0.01264472, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01254965, + "epoch": 0.8160529084623478, + "flos": 24506552436480.0, + "grad_norm": 1.8410369456410705, + "language_loss": 0.65139991, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.72809899, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09509277, + "step": 13573, + "time_per_iteration": 2.5813302993774414 + }, + { + "auxiliary_loss_clip": 0.06397152, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_clip": 0.06268078, + "balance_loss_mlp": 0.01257193, + "epoch": 0.8161130317150158, + "flos": 24833170851840.0, + "grad_norm": 3.2728754081568443, + "language_loss": 0.75079989, + "learning_rate": 3.442490343611868e-07, + "loss": 0.8274284, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08508301, + "step": 13574, + "time_per_iteration": 2.520437002182007 + }, + { + "auxiliary_loss_clip": 0.06406549, + "auxiliary_loss_mlp": 0.01263703, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01253737, + "epoch": 0.8161731549676837, + "flos": 30964497989760.0, + "grad_norm": 1.5623209445924822, + "language_loss": 0.5998435, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.67654604, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09973145, + "step": 13575, + "time_per_iteration": 2.5688116550445557 + }, + { + "auxiliary_loss_clip": 0.06401668, + "auxiliary_loss_mlp": 0.01262946, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.01253445, + "epoch": 0.8162332782203517, + "flos": 18557975053440.0, + "grad_norm": 2.3600977728532846, + "language_loss": 0.7450968, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.82174295, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09503174, + "step": 13576, + "time_per_iteration": 2.4884495735168457 + }, + { + "auxiliary_loss_clip": 0.06308542, + "auxiliary_loss_mlp": 0.01250123, + "balance_loss_clip": 0.06253725, + "balance_loss_mlp": 0.01249126, + "epoch": 0.8162934014730197, + "flos": 70405700025600.0, + "grad_norm": 0.8084788466791542, + "language_loss": 0.58613569, + "learning_rate": 3.435939558349155e-07, + "loss": 0.6617223, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00996399, + "step": 13577, + "time_per_iteration": 4.5383522510528564 + }, + { + "auxiliary_loss_clip": 0.06398452, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01255912, + "epoch": 0.8163535247256877, + "flos": 21221165698560.0, + "grad_norm": 1.6710813942162877, + "language_loss": 0.70834422, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.78497839, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09039307, + "step": 13578, + "time_per_iteration": 2.4779903888702393 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01264719, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01255433, + "epoch": 0.8164136479783556, + "flos": 21104061217920.0, + "grad_norm": 1.6470970354914776, + "language_loss": 0.73678112, + "learning_rate": 3.431575508590172e-07, + "loss": 0.81347507, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09283447, + "step": 13579, + "time_per_iteration": 2.509214162826538 + }, + { + "auxiliary_loss_clip": 0.06405793, + "auxiliary_loss_mlp": 0.01262409, + "balance_loss_clip": 0.06271651, + "balance_loss_mlp": 0.01253433, + "epoch": 0.8164737712310236, + "flos": 21726215383680.0, + "grad_norm": 1.6525660309020993, + "language_loss": 0.79023516, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.86691713, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08978271, + "step": 13580, + "time_per_iteration": 2.481717109680176 + }, + { + "auxiliary_loss_clip": 0.06397673, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.0626862, + "balance_loss_mlp": 0.01258103, + "epoch": 0.8165338944836915, + "flos": 19542903471360.0, + "grad_norm": 1.6359776593640634, + "language_loss": 0.68975896, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.76640779, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0909729, + "step": 13581, + "time_per_iteration": 2.4774811267852783 + }, + { + "auxiliary_loss_clip": 0.06403127, + "auxiliary_loss_mlp": 0.01263284, + "balance_loss_clip": 0.06270953, + "balance_loss_mlp": 0.01254487, + "epoch": 0.8165940177363595, + "flos": 22934996784000.0, + "grad_norm": 1.6436764796534944, + "language_loss": 0.60097897, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.67764312, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08795166, + "step": 13582, + "time_per_iteration": 2.5118255615234375 + }, + { + "auxiliary_loss_clip": 0.06393835, + "auxiliary_loss_mlp": 0.0126456, + "balance_loss_clip": 0.06269538, + "balance_loss_mlp": 0.01256323, + "epoch": 0.8166541409890276, + "flos": 23377545722880.0, + "grad_norm": 1.3287136998810383, + "language_loss": 0.82430953, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.90089345, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.0824585, + "step": 13583, + "time_per_iteration": 2.5597774982452393 + }, + { + "auxiliary_loss_clip": 0.06401314, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01252942, + "epoch": 0.8167142642416955, + "flos": 18447872388480.0, + "grad_norm": 1.5767524844469751, + "language_loss": 0.74625087, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.82288373, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09020996, + "step": 13584, + "time_per_iteration": 2.4725546836853027 + }, + { + "auxiliary_loss_clip": 0.06409091, + "auxiliary_loss_mlp": 0.01262966, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01253178, + "epoch": 0.8167743874943635, + "flos": 21221333406720.0, + "grad_norm": 1.5450053783632358, + "language_loss": 0.74571323, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.82243377, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09790039, + "step": 13585, + "time_per_iteration": 2.5259850025177 + }, + { + "auxiliary_loss_clip": 0.06405304, + "auxiliary_loss_mlp": 0.01265456, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01255598, + "epoch": 0.8168345107470314, + "flos": 18703646576640.0, + "grad_norm": 1.6400360392779838, + "language_loss": 0.70096934, + "learning_rate": 3.416321129478068e-07, + "loss": 0.77767694, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09857178, + "step": 13586, + "time_per_iteration": 2.4900436401367188 + }, + { + "auxiliary_loss_clip": 0.06405935, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06273375, + "balance_loss_mlp": 0.01258442, + "epoch": 0.8168946339996994, + "flos": 16258648763520.0, + "grad_norm": 1.5247146211880829, + "language_loss": 0.61139441, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.68813008, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09191895, + "step": 13587, + "time_per_iteration": 2.514977216720581 + }, + { + "auxiliary_loss_clip": 0.0641003, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01255757, + "epoch": 0.8169547572523673, + "flos": 26948615356800.0, + "grad_norm": 2.301034308927258, + "language_loss": 0.69020987, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.7669667, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09887695, + "step": 13588, + "time_per_iteration": 2.5375754833221436 + }, + { + "auxiliary_loss_clip": 0.06406662, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.0126242, + "epoch": 0.8170148805050353, + "flos": 18958204880640.0, + "grad_norm": 1.5284621283458033, + "language_loss": 0.73197293, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.80876452, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10070801, + "step": 13589, + "time_per_iteration": 2.517772674560547 + }, + { + "auxiliary_loss_clip": 0.06399844, + "auxiliary_loss_mlp": 0.01267676, + "balance_loss_clip": 0.06269893, + "balance_loss_mlp": 0.0125786, + "epoch": 0.8170750037577033, + "flos": 21841307366400.0, + "grad_norm": 1.7674157156585606, + "language_loss": 0.73466247, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.81133771, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09820557, + "step": 13590, + "time_per_iteration": 2.4888107776641846 + }, + { + "auxiliary_loss_clip": 0.06408446, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01256833, + "epoch": 0.8171351270103713, + "flos": 33514986493440.0, + "grad_norm": 1.8858247117206646, + "language_loss": 0.65332603, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.7300809, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10198975, + "step": 13591, + "time_per_iteration": 2.658235788345337 + }, + { + "auxiliary_loss_clip": 0.06408292, + "auxiliary_loss_mlp": 0.01266694, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01256693, + "epoch": 0.8171952502630392, + "flos": 22714330256640.0, + "grad_norm": 2.6750207052174817, + "language_loss": 0.68109965, + "learning_rate": 3.403270471641373e-07, + "loss": 0.75784951, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10003662, + "step": 13592, + "time_per_iteration": 2.569607973098755 + }, + { + "auxiliary_loss_clip": 0.06402638, + "auxiliary_loss_mlp": 0.01263684, + "balance_loss_clip": 0.06269838, + "balance_loss_mlp": 0.01253897, + "epoch": 0.8172553735157072, + "flos": 26730883722240.0, + "grad_norm": 1.8292699977541562, + "language_loss": 0.66788435, + "learning_rate": 3.401097564244759e-07, + "loss": 0.74454749, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09790039, + "step": 13593, + "time_per_iteration": 2.6358397006988525 + }, + { + "auxiliary_loss_clip": 0.06402188, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.01254118, + "epoch": 0.8173154967683751, + "flos": 15966551030400.0, + "grad_norm": 1.8879994801878386, + "language_loss": 0.69759774, + "learning_rate": 3.398925286280188e-07, + "loss": 0.77424812, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08740234, + "step": 13594, + "time_per_iteration": 2.4728150367736816 + }, + { + "auxiliary_loss_clip": 0.06406444, + "auxiliary_loss_mlp": 0.01262646, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253229, + "epoch": 0.8173756200210431, + "flos": 25992547470720.0, + "grad_norm": 1.7768009841467751, + "language_loss": 0.66399467, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.74068558, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09417725, + "step": 13595, + "time_per_iteration": 2.581303834915161 + }, + { + "auxiliary_loss_clip": 0.0641185, + "auxiliary_loss_mlp": 0.01271254, + "balance_loss_clip": 0.06272434, + "balance_loss_mlp": 0.01261771, + "epoch": 0.8174357432737112, + "flos": 25671211862400.0, + "grad_norm": 1.436739203563198, + "language_loss": 0.78803599, + "learning_rate": 3.394582618976658e-07, + "loss": 0.86486703, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09490967, + "step": 13596, + "time_per_iteration": 2.5674192905426025 + }, + { + "auxiliary_loss_clip": 0.06401101, + "auxiliary_loss_mlp": 0.01264044, + "balance_loss_clip": 0.06269896, + "balance_loss_mlp": 0.01254245, + "epoch": 0.8174958665263791, + "flos": 21841517001600.0, + "grad_norm": 4.608400276683875, + "language_loss": 0.58776182, + "learning_rate": 3.392412229802362e-07, + "loss": 0.66441321, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09802246, + "step": 13597, + "time_per_iteration": 2.5309157371520996 + }, + { + "auxiliary_loss_clip": 0.06398574, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06269415, + "balance_loss_mlp": 0.01257443, + "epoch": 0.8175559897790471, + "flos": 22462077939840.0, + "grad_norm": 1.411078794675908, + "language_loss": 0.82824457, + "learning_rate": 3.390242470389462e-07, + "loss": 0.90489155, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.0869751, + "step": 13598, + "time_per_iteration": 2.494666337966919 + }, + { + "auxiliary_loss_clip": 0.06402759, + "auxiliary_loss_mlp": 0.01265938, + "balance_loss_clip": 0.06267741, + "balance_loss_mlp": 0.01256384, + "epoch": 0.817616113031715, + "flos": 23621328777600.0, + "grad_norm": 1.988288541952237, + "language_loss": 0.82828057, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.90496761, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09552002, + "step": 13599, + "time_per_iteration": 3.9362494945526123 + }, + { + "auxiliary_loss_clip": 0.0639835, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06268303, + "balance_loss_mlp": 0.0125532, + "epoch": 0.817676236284383, + "flos": 27679572449280.0, + "grad_norm": 1.9427559574144415, + "language_loss": 0.84026325, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.91688854, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08850098, + "step": 13600, + "time_per_iteration": 2.575437068939209 + }, + { + "auxiliary_loss_clip": 0.06406076, + "auxiliary_loss_mlp": 0.01265545, + "balance_loss_clip": 0.06270483, + "balance_loss_mlp": 0.01255495, + "epoch": 0.8177363595370509, + "flos": 24687918599040.0, + "grad_norm": 1.5668233698326273, + "language_loss": 0.73828596, + "learning_rate": 3.383736971541766e-07, + "loss": 0.8150022, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10046387, + "step": 13601, + "time_per_iteration": 2.583362579345703 + }, + { + "auxiliary_loss_clip": 0.06410781, + "auxiliary_loss_mlp": 0.01263621, + "balance_loss_clip": 0.06272674, + "balance_loss_mlp": 0.01253757, + "epoch": 0.817796482789719, + "flos": 17351835056640.0, + "grad_norm": 2.6342445376151042, + "language_loss": 0.68994367, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.76668769, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09863281, + "step": 13602, + "time_per_iteration": 2.4818198680877686 + }, + { + "auxiliary_loss_clip": 0.06398925, + "auxiliary_loss_mlp": 0.01264753, + "balance_loss_clip": 0.06269614, + "balance_loss_mlp": 0.01255664, + "epoch": 0.8178566060423869, + "flos": 17783105621760.0, + "grad_norm": 2.059095926222651, + "language_loss": 0.84576654, + "learning_rate": 3.379403122624718e-07, + "loss": 0.92240334, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09088135, + "step": 13603, + "time_per_iteration": 2.4598805904388428 + }, + { + "auxiliary_loss_clip": 0.06402913, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06270468, + "balance_loss_mlp": 0.01255656, + "epoch": 0.8179167292950549, + "flos": 24980267894400.0, + "grad_norm": 1.5537552775340278, + "language_loss": 0.6937784, + "learning_rate": 3.377237143507159e-07, + "loss": 0.77045631, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09216309, + "step": 13604, + "time_per_iteration": 2.5589122772216797 + }, + { + "auxiliary_loss_clip": 0.06399256, + "auxiliary_loss_mlp": 0.01266705, + "balance_loss_clip": 0.06269272, + "balance_loss_mlp": 0.01257561, + "epoch": 0.8179768525477228, + "flos": 22863397870080.0, + "grad_norm": 1.66498006246138, + "language_loss": 0.74241424, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.8190738, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09143066, + "step": 13605, + "time_per_iteration": 2.490889549255371 + }, + { + "auxiliary_loss_clip": 0.06400125, + "auxiliary_loss_mlp": 0.0126796, + "balance_loss_clip": 0.06271368, + "balance_loss_mlp": 0.01257791, + "epoch": 0.8180369758003908, + "flos": 18521064529920.0, + "grad_norm": 1.9159252087251424, + "language_loss": 0.74754506, + "learning_rate": 3.372907076364666e-07, + "loss": 0.8242259, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.10168457, + "step": 13606, + "time_per_iteration": 2.492121696472168 + }, + { + "auxiliary_loss_clip": 0.0640065, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01261566, + "epoch": 0.8180970990530587, + "flos": 33190422503040.0, + "grad_norm": 2.3548971551907916, + "language_loss": 0.65977269, + "learning_rate": 3.370742988503916e-07, + "loss": 0.73648506, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09020996, + "step": 13607, + "time_per_iteration": 2.5886800289154053 + }, + { + "auxiliary_loss_clip": 0.06403854, + "auxiliary_loss_mlp": 0.01264189, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01254449, + "epoch": 0.8181572223057267, + "flos": 25017094563840.0, + "grad_norm": 1.7602024891247647, + "language_loss": 0.70355219, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.78023267, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09741211, + "step": 13608, + "time_per_iteration": 3.953319549560547 + }, + { + "auxiliary_loss_clip": 0.06399265, + "auxiliary_loss_mlp": 0.01266613, + "balance_loss_clip": 0.06268296, + "balance_loss_mlp": 0.01257154, + "epoch": 0.8182173455583948, + "flos": 28556326846080.0, + "grad_norm": 1.587446090270585, + "language_loss": 0.79743207, + "learning_rate": 3.366416704613735e-07, + "loss": 0.87409091, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09454346, + "step": 13609, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06308096, + "auxiliary_loss_mlp": 0.01250941, + "balance_loss_clip": 0.0625338, + "balance_loss_mlp": 0.01249896, + "epoch": 0.8182774688110627, + "flos": 72047051729280.0, + "grad_norm": 0.7345769255501511, + "language_loss": 0.55927861, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.63486898, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0104599, + "step": 13610, + "time_per_iteration": 3.2368791103363037 + }, + { + "auxiliary_loss_clip": 0.06394055, + "auxiliary_loss_mlp": 0.01265977, + "balance_loss_clip": 0.06268248, + "balance_loss_mlp": 0.01257078, + "epoch": 0.8183375920637307, + "flos": 19761431719680.0, + "grad_norm": 1.6752147679341796, + "language_loss": 0.78055751, + "learning_rate": 3.362092943712107e-07, + "loss": 0.85715789, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08892822, + "step": 13611, + "time_per_iteration": 2.5044984817504883 + }, + { + "auxiliary_loss_clip": 0.06411519, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 0.06271686, + "balance_loss_mlp": 0.01255467, + "epoch": 0.8183977153163986, + "flos": 22347740643840.0, + "grad_norm": 1.936289550368914, + "language_loss": 0.77789629, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.85467196, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10577393, + "step": 13612, + "time_per_iteration": 3.9493825435638428 + }, + { + "auxiliary_loss_clip": 0.06397919, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.0626799, + "balance_loss_mlp": 0.01254733, + "epoch": 0.8184578385690666, + "flos": 17718256961280.0, + "grad_norm": 1.9954357370848774, + "language_loss": 0.86433131, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.94094843, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09063721, + "step": 13613, + "time_per_iteration": 2.493557929992676 + }, + { + "auxiliary_loss_clip": 0.0640028, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06271352, + "balance_loss_mlp": 0.01254687, + "epoch": 0.8185179618217345, + "flos": 25707996604800.0, + "grad_norm": 1.7004353778600403, + "language_loss": 0.73161137, + "learning_rate": 3.355612034397746e-07, + "loss": 0.8082509, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08990479, + "step": 13614, + "time_per_iteration": 2.5492005348205566 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01266903, + "balance_loss_clip": 0.06267008, + "balance_loss_mlp": 0.01257551, + "epoch": 0.8185780850744026, + "flos": 25967837715840.0, + "grad_norm": 2.1633186140321583, + "language_loss": 0.81232059, + "learning_rate": 3.353452993497479e-07, + "loss": 0.88900155, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09344482, + "step": 13615, + "time_per_iteration": 2.569638967514038 + }, + { + "auxiliary_loss_clip": 0.0640194, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06269952, + "balance_loss_mlp": 0.01256049, + "epoch": 0.8186382083270705, + "flos": 25235455104000.0, + "grad_norm": 2.0233105033334158, + "language_loss": 0.75650156, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.83317852, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09698486, + "step": 13616, + "time_per_iteration": 3.942837715148926 + }, + { + "auxiliary_loss_clip": 0.06401451, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06271508, + "balance_loss_mlp": 0.01256361, + "epoch": 0.8186983315797385, + "flos": 22420890858240.0, + "grad_norm": 1.6571547627109076, + "language_loss": 0.75235343, + "learning_rate": 3.349136805494979e-07, + "loss": 0.82902998, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09838867, + "step": 13617, + "time_per_iteration": 2.4979913234710693 + }, + { + "auxiliary_loss_clip": 0.06399617, + "auxiliary_loss_mlp": 0.01267076, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01257968, + "epoch": 0.8187584548324064, + "flos": 22024560245760.0, + "grad_norm": 1.7428000144990041, + "language_loss": 0.68450582, + "learning_rate": 3.346979658556415e-07, + "loss": 0.76117277, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09112549, + "step": 13618, + "time_per_iteration": 2.58243465423584 + }, + { + "auxiliary_loss_clip": 0.06411767, + "auxiliary_loss_mlp": 0.01263534, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01253604, + "epoch": 0.8188185780850744, + "flos": 29249325239040.0, + "grad_norm": 1.8955704094009027, + "language_loss": 0.69656849, + "learning_rate": 3.344823143102058e-07, + "loss": 0.77332145, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09936523, + "step": 13619, + "time_per_iteration": 2.552861452102661 + }, + { + "auxiliary_loss_clip": 0.06405166, + "auxiliary_loss_mlp": 0.01267919, + "balance_loss_clip": 0.06271726, + "balance_loss_mlp": 0.01258257, + "epoch": 0.8188787013377423, + "flos": 20701483476480.0, + "grad_norm": 2.032902910475726, + "language_loss": 0.74368906, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.82041991, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09661865, + "step": 13620, + "time_per_iteration": 2.4927451610565186 + }, + { + "auxiliary_loss_clip": 0.06397671, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.06268847, + "balance_loss_mlp": 0.01256515, + "epoch": 0.8189388245904103, + "flos": 23739816850560.0, + "grad_norm": 1.5173921020881993, + "language_loss": 0.76409143, + "learning_rate": 3.340512006973011e-07, + "loss": 0.84072381, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09051514, + "step": 13621, + "time_per_iteration": 2.4968793392181396 + }, + { + "auxiliary_loss_clip": 0.06401004, + "auxiliary_loss_mlp": 0.01262724, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01252425, + "epoch": 0.8189989478430784, + "flos": 28262342396160.0, + "grad_norm": 2.0156762185325934, + "language_loss": 0.66266668, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.73930395, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10302734, + "step": 13622, + "time_per_iteration": 2.556950807571411 + }, + { + "auxiliary_loss_clip": 0.06408148, + "auxiliary_loss_mlp": 0.01264921, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.01255563, + "epoch": 0.8190590710957463, + "flos": 21404125088640.0, + "grad_norm": 1.7883534032676356, + "language_loss": 0.75312483, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.8298555, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09356689, + "step": 13623, + "time_per_iteration": 2.499490261077881 + }, + { + "auxiliary_loss_clip": 0.06404785, + "auxiliary_loss_mlp": 0.01265588, + "balance_loss_clip": 0.06270933, + "balance_loss_mlp": 0.01256075, + "epoch": 0.8191191943484143, + "flos": 38804960384640.0, + "grad_norm": 1.8675492206945747, + "language_loss": 0.63666874, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.71337247, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09515381, + "step": 13624, + "time_per_iteration": 2.6544313430786133 + }, + { + "auxiliary_loss_clip": 0.06400229, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01256104, + "epoch": 0.8191793176010822, + "flos": 25453438300800.0, + "grad_norm": 1.460649877308724, + "language_loss": 0.78395194, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.86060411, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08880615, + "step": 13625, + "time_per_iteration": 2.5359597206115723 + }, + { + "auxiliary_loss_clip": 0.06410608, + "auxiliary_loss_mlp": 0.01263881, + "balance_loss_clip": 0.06269354, + "balance_loss_mlp": 0.01254035, + "epoch": 0.8192394408537502, + "flos": 25090118997120.0, + "grad_norm": 1.884478371292304, + "language_loss": 0.75783712, + "learning_rate": 3.329745223345244e-07, + "loss": 0.83458203, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.09851074, + "step": 13626, + "time_per_iteration": 2.532646656036377 + }, + { + "auxiliary_loss_clip": 0.06401683, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06270789, + "balance_loss_mlp": 0.01256079, + "epoch": 0.8192995641064181, + "flos": 27681291457920.0, + "grad_norm": 1.4150920843677999, + "language_loss": 0.7395972, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.81626576, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09100342, + "step": 13627, + "time_per_iteration": 2.5652401447296143 + }, + { + "auxiliary_loss_clip": 0.06406218, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01255522, + "epoch": 0.8193596873590862, + "flos": 21294944818560.0, + "grad_norm": 1.5860896739474306, + "language_loss": 0.68839937, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.76511979, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10308838, + "step": 13628, + "time_per_iteration": 2.5479671955108643 + }, + { + "auxiliary_loss_clip": 0.0641032, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.0627242, + "balance_loss_mlp": 0.01254702, + "epoch": 0.8194198106117541, + "flos": 17498219339520.0, + "grad_norm": 1.489340257893301, + "language_loss": 0.85434711, + "learning_rate": 3.323292738168171e-07, + "loss": 0.93110228, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10516357, + "step": 13629, + "time_per_iteration": 2.483988046646118 + }, + { + "auxiliary_loss_clip": 0.06403497, + "auxiliary_loss_mlp": 0.01264453, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01255209, + "epoch": 0.8194799338644221, + "flos": 15273301075200.0, + "grad_norm": 2.0412434679276203, + "language_loss": 0.74637675, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.82305628, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09246826, + "step": 13630, + "time_per_iteration": 2.535431146621704 + }, + { + "auxiliary_loss_clip": 0.06406824, + "auxiliary_loss_mlp": 0.0126407, + "balance_loss_clip": 0.06272252, + "balance_loss_mlp": 0.01254468, + "epoch": 0.81954005711709, + "flos": 14723793999360.0, + "grad_norm": 1.6899565751326817, + "language_loss": 0.72566128, + "learning_rate": 3.31899424315957e-07, + "loss": 0.80237019, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09606934, + "step": 13631, + "time_per_iteration": 2.4677011966705322 + }, + { + "auxiliary_loss_clip": 0.06404364, + "auxiliary_loss_mlp": 0.01262964, + "balance_loss_clip": 0.06271352, + "balance_loss_mlp": 0.01253838, + "epoch": 0.819600180369758, + "flos": 23080416744960.0, + "grad_norm": 1.6434442490728178, + "language_loss": 0.76678276, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.84345603, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09124756, + "step": 13632, + "time_per_iteration": 2.567342519760132 + }, + { + "auxiliary_loss_clip": 0.06402865, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06271514, + "balance_loss_mlp": 0.0125597, + "epoch": 0.8196603036224259, + "flos": 27607176921600.0, + "grad_norm": 1.7777195570066433, + "language_loss": 0.66198611, + "learning_rate": 3.314698278332588e-07, + "loss": 0.73866111, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08660889, + "step": 13633, + "time_per_iteration": 2.5596518516540527 + }, + { + "auxiliary_loss_clip": 0.06396772, + "auxiliary_loss_mlp": 0.01261231, + "balance_loss_clip": 0.06268521, + "balance_loss_mlp": 0.0125235, + "epoch": 0.8197204268750939, + "flos": 28589086592640.0, + "grad_norm": 1.7854505067066941, + "language_loss": 0.75938737, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.83596742, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08880615, + "step": 13634, + "time_per_iteration": 2.5733511447906494 + }, + { + "auxiliary_loss_clip": 0.06397436, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06269581, + "balance_loss_mlp": 0.01259892, + "epoch": 0.819780550127762, + "flos": 23265011289600.0, + "grad_norm": 2.294761376034913, + "language_loss": 0.81912637, + "learning_rate": 3.310404844338841e-07, + "loss": 0.8957907, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09094238, + "step": 13635, + "time_per_iteration": 2.521009922027588 + }, + { + "auxiliary_loss_clip": 0.06407675, + "auxiliary_loss_mlp": 0.01266496, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01256506, + "epoch": 0.8198406733804299, + "flos": 26692086481920.0, + "grad_norm": 1.490251576995218, + "language_loss": 0.75829619, + "learning_rate": 3.308259076607949e-07, + "loss": 0.83503789, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09997559, + "step": 13636, + "time_per_iteration": 2.566101551055908 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01262174, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01252727, + "epoch": 0.8199007966330979, + "flos": 20090272268160.0, + "grad_norm": 1.9556414121680055, + "language_loss": 0.81463081, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.89126313, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09454346, + "step": 13637, + "time_per_iteration": 2.4868228435516357 + }, + { + "auxiliary_loss_clip": 0.06399955, + "auxiliary_loss_mlp": 0.01262595, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01253452, + "epoch": 0.8199609198857658, + "flos": 31910503386240.0, + "grad_norm": 1.913324988944965, + "language_loss": 0.70908749, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.78571296, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09143066, + "step": 13638, + "time_per_iteration": 3.970994472503662 + }, + { + "auxiliary_loss_clip": 0.06407509, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06270528, + "balance_loss_mlp": 0.01255232, + "epoch": 0.8200210431384338, + "flos": 26477583229440.0, + "grad_norm": 3.8164285850122854, + "language_loss": 0.80088663, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.87762421, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11016846, + "step": 13639, + "time_per_iteration": 2.5780816078186035 + }, + { + "auxiliary_loss_clip": 0.06400024, + "auxiliary_loss_mlp": 0.0126402, + "balance_loss_clip": 0.06269088, + "balance_loss_mlp": 0.01255121, + "epoch": 0.8200811663911017, + "flos": 22098087803520.0, + "grad_norm": 2.39997075184638, + "language_loss": 0.79083264, + "learning_rate": 3.299682336022589e-07, + "loss": 0.86747313, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08898926, + "step": 13640, + "time_per_iteration": 2.482212781906128 + }, + { + "auxiliary_loss_clip": 0.06413399, + "auxiliary_loss_mlp": 0.01270919, + "balance_loss_clip": 0.06273437, + "balance_loss_mlp": 0.01261103, + "epoch": 0.8201412896437698, + "flos": 37602174551040.0, + "grad_norm": 1.7066462026776184, + "language_loss": 0.63058311, + "learning_rate": 3.297539733867336e-07, + "loss": 0.70742631, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.09820557, + "step": 13641, + "time_per_iteration": 2.698233127593994 + }, + { + "auxiliary_loss_clip": 0.06402028, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06270909, + "balance_loss_mlp": 0.0125638, + "epoch": 0.8202014128964377, + "flos": 19652461084800.0, + "grad_norm": 1.9366215144343786, + "language_loss": 0.73740256, + "learning_rate": 3.295397765071055e-07, + "loss": 0.81408608, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09942627, + "step": 13642, + "time_per_iteration": 2.511960744857788 + }, + { + "auxiliary_loss_clip": 0.06402153, + "auxiliary_loss_mlp": 0.01267253, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.0125796, + "epoch": 0.8202615361491057, + "flos": 31475375533440.0, + "grad_norm": 1.5751213862396989, + "language_loss": 0.70581281, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.78250694, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09295654, + "step": 13643, + "time_per_iteration": 2.6206700801849365 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01268107, + "balance_loss_clip": 0.06272536, + "balance_loss_mlp": 0.01259215, + "epoch": 0.8203216594017736, + "flos": 24722145792000.0, + "grad_norm": 1.7907058552656372, + "language_loss": 0.66027546, + "learning_rate": 3.291115727880256e-07, + "loss": 0.73697996, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08898926, + "step": 13644, + "time_per_iteration": 2.5345609188079834 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06271123, + "balance_loss_mlp": 0.01257149, + "epoch": 0.8203817826544416, + "flos": 26039101213440.0, + "grad_norm": 1.3794487731864136, + "language_loss": 0.70734018, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.78406239, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09863281, + "step": 13645, + "time_per_iteration": 2.554086446762085 + }, + { + "auxiliary_loss_clip": 0.0639934, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06269216, + "balance_loss_mlp": 0.01256069, + "epoch": 0.8204419059071095, + "flos": 25961087462400.0, + "grad_norm": 1.8361710653661691, + "language_loss": 0.7172327, + "learning_rate": 3.286836225099707e-07, + "loss": 0.79387873, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09185791, + "step": 13646, + "time_per_iteration": 2.5273547172546387 + }, + { + "auxiliary_loss_clip": 0.06407963, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062723, + "balance_loss_mlp": 0.01258642, + "epoch": 0.8205020291597775, + "flos": 23585717992320.0, + "grad_norm": 2.0092863306251676, + "language_loss": 0.79515278, + "learning_rate": 3.284697424316132e-07, + "loss": 0.87191677, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09796143, + "step": 13647, + "time_per_iteration": 2.521698474884033 + }, + { + "auxiliary_loss_clip": 0.06397481, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8205621524124456, + "flos": 26806759194240.0, + "grad_norm": 1.3474560258501684, + "language_loss": 0.68241918, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.75903839, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09136963, + "step": 13648, + "time_per_iteration": 3.9541409015655518 + }, + { + "auxiliary_loss_clip": 0.06400238, + "auxiliary_loss_mlp": 0.01268028, + "balance_loss_clip": 0.06267244, + "balance_loss_mlp": 0.0125799, + "epoch": 0.8206222756651135, + "flos": 27535410299520.0, + "grad_norm": 1.764747006246769, + "language_loss": 0.80002069, + "learning_rate": 3.28042172436791e-07, + "loss": 0.87670338, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10046387, + "step": 13649, + "time_per_iteration": 2.6316652297973633 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01266937, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256917, + "epoch": 0.8206823989177815, + "flos": 21184967934720.0, + "grad_norm": 1.546894359217093, + "language_loss": 0.69079524, + "learning_rate": 3.278284825365396e-07, + "loss": 0.76753092, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10021973, + "step": 13650, + "time_per_iteration": 2.5335919857025146 + }, + { + "auxiliary_loss_clip": 0.06402709, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01257176, + "epoch": 0.8207425221704494, + "flos": 11514324660480.0, + "grad_norm": 2.3595864556173614, + "language_loss": 0.61227095, + "learning_rate": 3.276148560452001e-07, + "loss": 0.68897045, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10064697, + "step": 13651, + "time_per_iteration": 2.4735312461853027 + }, + { + "auxiliary_loss_clip": 0.06405269, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.06270625, + "balance_loss_mlp": 0.01254968, + "epoch": 0.8208026454231174, + "flos": 19798090680960.0, + "grad_norm": 2.1521682694916313, + "language_loss": 0.72795534, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.80466217, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10449219, + "step": 13652, + "time_per_iteration": 3.9963738918304443 + }, + { + "auxiliary_loss_clip": 0.06397925, + "auxiliary_loss_mlp": 0.01264227, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01255525, + "epoch": 0.8208627686757853, + "flos": 15672692361600.0, + "grad_norm": 2.2964720489620976, + "language_loss": 0.72892058, + "learning_rate": 3.271877933216558e-07, + "loss": 0.80554199, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08709717, + "step": 13653, + "time_per_iteration": 2.4748480319976807 + }, + { + "auxiliary_loss_clip": 0.06416966, + "auxiliary_loss_mlp": 0.01270598, + "balance_loss_clip": 0.06278365, + "balance_loss_mlp": 0.01260132, + "epoch": 0.8209228919284534, + "flos": 37490897928960.0, + "grad_norm": 1.7768200929387925, + "language_loss": 0.6321249, + "learning_rate": 3.269743571056451e-07, + "loss": 0.70900059, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10473633, + "step": 13654, + "time_per_iteration": 2.6520609855651855 + }, + { + "auxiliary_loss_clip": 0.06403168, + "auxiliary_loss_mlp": 0.01264491, + "balance_loss_clip": 0.06268303, + "balance_loss_mlp": 0.01254651, + "epoch": 0.8209830151811213, + "flos": 23119759036800.0, + "grad_norm": 1.6261113247907222, + "language_loss": 0.7042315, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.78090811, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09832764, + "step": 13655, + "time_per_iteration": 4.040972948074341 + }, + { + "auxiliary_loss_clip": 0.06399737, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06270001, + "balance_loss_mlp": 0.01255528, + "epoch": 0.8210431384337893, + "flos": 21294567475200.0, + "grad_norm": 1.966782681323648, + "language_loss": 0.8200593, + "learning_rate": 3.265476750056162e-07, + "loss": 0.89670628, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09429932, + "step": 13656, + "time_per_iteration": 2.5089569091796875 + }, + { + "auxiliary_loss_clip": 0.06398742, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01256898, + "epoch": 0.8211032616864572, + "flos": 11505897325440.0, + "grad_norm": 2.0352847360821196, + "language_loss": 0.73977625, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.81643093, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0982666, + "step": 13657, + "time_per_iteration": 2.521794080734253 + }, + { + "auxiliary_loss_clip": 0.0640122, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.06269388, + "balance_loss_mlp": 0.01254966, + "epoch": 0.8211633849391252, + "flos": 29828573314560.0, + "grad_norm": 1.5761103965210477, + "language_loss": 0.55795848, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.63460934, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08898926, + "step": 13658, + "time_per_iteration": 2.591646432876587 + }, + { + "auxiliary_loss_clip": 0.06403513, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8212235081917931, + "flos": 13120484849280.0, + "grad_norm": 8.99922619161595, + "language_loss": 0.794406, + "learning_rate": 3.259081278068805e-07, + "loss": 0.87109065, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09521484, + "step": 13659, + "time_per_iteration": 2.4667892456054688 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06268722, + "balance_loss_mlp": 0.01255424, + "epoch": 0.8212836314444611, + "flos": 40524828963840.0, + "grad_norm": 1.49148705733067, + "language_loss": 0.59613037, + "learning_rate": 3.256950723599887e-07, + "loss": 0.67274177, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08691406, + "step": 13660, + "time_per_iteration": 2.6636433601379395 + }, + { + "auxiliary_loss_clip": 0.06408002, + "auxiliary_loss_mlp": 0.0126705, + "balance_loss_clip": 0.06273358, + "balance_loss_mlp": 0.01256811, + "epoch": 0.8213437546971292, + "flos": 18776503301760.0, + "grad_norm": 1.9851690167899483, + "language_loss": 0.73083544, + "learning_rate": 3.254820804029075e-07, + "loss": 0.80758601, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10241699, + "step": 13661, + "time_per_iteration": 2.4820919036865234 + }, + { + "auxiliary_loss_clip": 0.06408828, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06272434, + "balance_loss_mlp": 0.01255904, + "epoch": 0.8214038779497971, + "flos": 19688323432320.0, + "grad_norm": 1.9325667410517933, + "language_loss": 0.75407529, + "learning_rate": 3.252691519437143e-07, + "loss": 0.8308208, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09814453, + "step": 13662, + "time_per_iteration": 2.473001718521118 + }, + { + "auxiliary_loss_clip": 0.06316656, + "auxiliary_loss_mlp": 0.01256268, + "balance_loss_clip": 0.06261721, + "balance_loss_mlp": 0.01255036, + "epoch": 0.8214640012024651, + "flos": 71624040791040.0, + "grad_norm": 0.7272151584082011, + "language_loss": 0.54061127, + "learning_rate": 3.250562869904825e-07, + "loss": 0.61634052, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01231384, + "step": 13663, + "time_per_iteration": 3.272303342819214 + }, + { + "auxiliary_loss_clip": 0.06399679, + "auxiliary_loss_mlp": 0.01266039, + "balance_loss_clip": 0.06268212, + "balance_loss_mlp": 0.0125643, + "epoch": 0.821524124455133, + "flos": 14762507385600.0, + "grad_norm": 2.215887467335205, + "language_loss": 0.65920115, + "learning_rate": 3.248434855512838e-07, + "loss": 0.73585832, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09613037, + "step": 13664, + "time_per_iteration": 2.477029323577881 + }, + { + "auxiliary_loss_clip": 0.06399576, + "auxiliary_loss_mlp": 0.01261557, + "balance_loss_clip": 0.06270959, + "balance_loss_mlp": 0.0125267, + "epoch": 0.821584247707801, + "flos": 25089238529280.0, + "grad_norm": 1.4192636174003572, + "language_loss": 0.75023228, + "learning_rate": 3.246307476341881e-07, + "loss": 0.82684362, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08892822, + "step": 13665, + "time_per_iteration": 2.5525918006896973 + }, + { + "auxiliary_loss_clip": 0.06401828, + "auxiliary_loss_mlp": 0.01264308, + "balance_loss_clip": 0.06269041, + "balance_loss_mlp": 0.01254962, + "epoch": 0.8216443709604689, + "flos": 36839631669120.0, + "grad_norm": 1.9379151169740247, + "language_loss": 0.6576277, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.73428911, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09350586, + "step": 13666, + "time_per_iteration": 2.607255697250366 + }, + { + "auxiliary_loss_clip": 0.06399558, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06267319, + "balance_loss_mlp": 0.01257132, + "epoch": 0.821704494213137, + "flos": 25088693477760.0, + "grad_norm": 1.6153303259870018, + "language_loss": 0.76945007, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.84610897, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09185791, + "step": 13667, + "time_per_iteration": 2.5342323780059814 + }, + { + "auxiliary_loss_clip": 0.0640602, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06270644, + "balance_loss_mlp": 0.01255948, + "epoch": 0.8217646174658049, + "flos": 14361397090560.0, + "grad_norm": 2.524024827589192, + "language_loss": 0.77698529, + "learning_rate": 3.239929150961773e-07, + "loss": 0.85370529, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10040283, + "step": 13668, + "time_per_iteration": 2.466806411743164 + }, + { + "auxiliary_loss_clip": 0.06399126, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06269765, + "balance_loss_mlp": 0.01254933, + "epoch": 0.8218247407184729, + "flos": 22097039627520.0, + "grad_norm": 1.8128637689922475, + "language_loss": 0.73614395, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.81277692, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09240723, + "step": 13669, + "time_per_iteration": 2.5289034843444824 + }, + { + "auxiliary_loss_clip": 0.06400953, + "auxiliary_loss_mlp": 0.01266356, + "balance_loss_clip": 0.06269199, + "balance_loss_mlp": 0.01256819, + "epoch": 0.8218848639711408, + "flos": 16769694015360.0, + "grad_norm": 1.5575474443223831, + "language_loss": 0.79151839, + "learning_rate": 3.235680111625161e-07, + "loss": 0.86819142, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09533691, + "step": 13670, + "time_per_iteration": 2.4716804027557373 + }, + { + "auxiliary_loss_clip": 0.06409052, + "auxiliary_loss_mlp": 0.01266161, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01256415, + "epoch": 0.8219449872238088, + "flos": 26001981054720.0, + "grad_norm": 1.6601212313444933, + "language_loss": 0.7576502, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.83440232, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09741211, + "step": 13671, + "time_per_iteration": 2.550118923187256 + }, + { + "auxiliary_loss_clip": 0.06410009, + "auxiliary_loss_mlp": 0.01266966, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01255969, + "epoch": 0.8220051104764767, + "flos": 20784528472320.0, + "grad_norm": 1.5724018090314842, + "language_loss": 0.76455218, + "learning_rate": 3.23143361510728e-07, + "loss": 0.84132195, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11004639, + "step": 13672, + "time_per_iteration": 2.5448882579803467 + }, + { + "auxiliary_loss_clip": 0.06402946, + "auxiliary_loss_mlp": 0.01263319, + "balance_loss_clip": 0.06269625, + "balance_loss_mlp": 0.01253175, + "epoch": 0.8220652337291448, + "flos": 14580134974080.0, + "grad_norm": 2.095121195508436, + "language_loss": 0.74924457, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.82590723, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10150146, + "step": 13673, + "time_per_iteration": 2.5026438236236572 + }, + { + "auxiliary_loss_clip": 0.06410329, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06274365, + "balance_loss_mlp": 0.01254941, + "epoch": 0.8221253569818128, + "flos": 23812715502720.0, + "grad_norm": 1.4999475516036749, + "language_loss": 0.79556978, + "learning_rate": 3.227189662052254e-07, + "loss": 0.87232178, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0993042, + "step": 13674, + "time_per_iteration": 2.5405590534210205 + }, + { + "auxiliary_loss_clip": 0.06404756, + "auxiliary_loss_mlp": 0.01265536, + "balance_loss_clip": 0.06272387, + "balance_loss_mlp": 0.01255398, + "epoch": 0.8221854802344807, + "flos": 21294651329280.0, + "grad_norm": 1.7231814451382148, + "language_loss": 0.70641446, + "learning_rate": 3.225068639524484e-07, + "loss": 0.78311741, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10144043, + "step": 13675, + "time_per_iteration": 2.514972448348999 + }, + { + "auxiliary_loss_clip": 0.06394869, + "auxiliary_loss_mlp": 0.01267052, + "balance_loss_clip": 0.06267343, + "balance_loss_mlp": 0.01257885, + "epoch": 0.8222456034871487, + "flos": 20962624325760.0, + "grad_norm": 1.5221695463620175, + "language_loss": 0.74239552, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.81901473, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0916748, + "step": 13676, + "time_per_iteration": 2.6151413917541504 + }, + { + "auxiliary_loss_clip": 0.06403306, + "auxiliary_loss_mlp": 0.01266386, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01257195, + "epoch": 0.8223057267398166, + "flos": 21403915453440.0, + "grad_norm": 1.5912659161296756, + "language_loss": 0.80806673, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.88476366, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09191895, + "step": 13677, + "time_per_iteration": 2.524010419845581 + }, + { + "auxiliary_loss_clip": 0.06403331, + "auxiliary_loss_mlp": 0.01265658, + "balance_loss_clip": 0.06269956, + "balance_loss_mlp": 0.01256258, + "epoch": 0.8223658499924846, + "flos": 15273636491520.0, + "grad_norm": 1.9046398747416602, + "language_loss": 0.70346785, + "learning_rate": 3.218709388905245e-07, + "loss": 0.78015774, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09405518, + "step": 13678, + "time_per_iteration": 3.918046236038208 + }, + { + "auxiliary_loss_clip": 0.06398967, + "auxiliary_loss_mlp": 0.01266892, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.0125785, + "epoch": 0.8224259732451525, + "flos": 31257727752960.0, + "grad_norm": 1.3904742391636824, + "language_loss": 0.71421492, + "learning_rate": 3.216590911288133e-07, + "loss": 0.79087353, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09039307, + "step": 13679, + "time_per_iteration": 2.5868563652038574 + }, + { + "auxiliary_loss_clip": 0.06397314, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06268158, + "balance_loss_mlp": 0.01253748, + "epoch": 0.8224860964978206, + "flos": 21580166517120.0, + "grad_norm": 2.1427210155629797, + "language_loss": 0.70038605, + "learning_rate": 3.214473070099564e-07, + "loss": 0.77698797, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09130859, + "step": 13680, + "time_per_iteration": 2.5864291191101074 + }, + { + "auxiliary_loss_clip": 0.06400996, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.06270762, + "balance_loss_mlp": 0.01253618, + "epoch": 0.8225462197504885, + "flos": 25490181116160.0, + "grad_norm": 1.609067591062343, + "language_loss": 0.60291123, + "learning_rate": 3.21235586541986e-07, + "loss": 0.67954582, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08837891, + "step": 13681, + "time_per_iteration": 2.5397136211395264 + }, + { + "auxiliary_loss_clip": 0.06406465, + "auxiliary_loss_mlp": 0.01264863, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01255148, + "epoch": 0.8226063430031565, + "flos": 39394941782400.0, + "grad_norm": 1.559829133589283, + "language_loss": 0.70002699, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.77674025, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09716797, + "step": 13682, + "time_per_iteration": 2.685002565383911 + }, + { + "auxiliary_loss_clip": 0.06403206, + "auxiliary_loss_mlp": 0.01263586, + "balance_loss_clip": 0.06270599, + "balance_loss_mlp": 0.01253334, + "epoch": 0.8226664662558244, + "flos": 22821036831360.0, + "grad_norm": 1.8759178686827869, + "language_loss": 0.79682559, + "learning_rate": 3.20812336590816e-07, + "loss": 0.87349349, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10253906, + "step": 13683, + "time_per_iteration": 2.519693613052368 + }, + { + "auxiliary_loss_clip": 0.06397998, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06270218, + "balance_loss_mlp": 0.01254293, + "epoch": 0.8227265895084924, + "flos": 25672595454720.0, + "grad_norm": 1.9461522710413164, + "language_loss": 0.87060094, + "learning_rate": 3.206008071236661e-07, + "loss": 0.94721103, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08712769, + "step": 13684, + "time_per_iteration": 2.520162343978882 + }, + { + "auxiliary_loss_clip": 0.06394877, + "auxiliary_loss_mlp": 0.01264494, + "balance_loss_clip": 0.06267917, + "balance_loss_mlp": 0.0125556, + "epoch": 0.8227867127611603, + "flos": 26186827161600.0, + "grad_norm": 1.6760308925685343, + "language_loss": 0.80106431, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.87765801, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08935547, + "step": 13685, + "time_per_iteration": 2.571464776992798 + }, + { + "auxiliary_loss_clip": 0.06403354, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06270622, + "balance_loss_mlp": 0.01256848, + "epoch": 0.8228468360138284, + "flos": 22024602172800.0, + "grad_norm": 1.5711922940184833, + "language_loss": 0.68850559, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.76520115, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09356689, + "step": 13686, + "time_per_iteration": 2.516918182373047 + }, + { + "auxiliary_loss_clip": 0.06405336, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06271816, + "balance_loss_mlp": 0.01256047, + "epoch": 0.8229069592664963, + "flos": 14908723960320.0, + "grad_norm": 2.294675899071434, + "language_loss": 0.78351545, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.86022234, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09307861, + "step": 13687, + "time_per_iteration": 3.9513440132141113 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01262779, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01253022, + "epoch": 0.8229670825191643, + "flos": 15674956421760.0, + "grad_norm": 1.643594619200351, + "language_loss": 0.72294796, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.79960072, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09759521, + "step": 13688, + "time_per_iteration": 2.4735567569732666 + }, + { + "auxiliary_loss_clip": 0.0640309, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.0627107, + "balance_loss_mlp": 0.01257375, + "epoch": 0.8230272057718323, + "flos": 23189890504320.0, + "grad_norm": 1.5346344233597629, + "language_loss": 0.73226428, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.80896151, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09265137, + "step": 13689, + "time_per_iteration": 2.5417935848236084 + }, + { + "auxiliary_loss_clip": 0.06407392, + "auxiliary_loss_mlp": 0.01262871, + "balance_loss_clip": 0.06272584, + "balance_loss_mlp": 0.01253758, + "epoch": 0.8230873290245002, + "flos": 21038709432960.0, + "grad_norm": 2.1431822438071744, + "language_loss": 0.69692594, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.77362859, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09106445, + "step": 13690, + "time_per_iteration": 2.5107438564300537 + }, + { + "auxiliary_loss_clip": 0.0640377, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06269638, + "balance_loss_mlp": 0.01255389, + "epoch": 0.8231474522771682, + "flos": 21256273359360.0, + "grad_norm": 1.6874962726355067, + "language_loss": 0.85794926, + "learning_rate": 3.191218844260988e-07, + "loss": 0.93463445, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09350586, + "step": 13691, + "time_per_iteration": 4.0233988761901855 + }, + { + "auxiliary_loss_clip": 0.06406488, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.0125637, + "epoch": 0.8232075755298361, + "flos": 23848829412480.0, + "grad_norm": 1.7540371277413798, + "language_loss": 0.76951766, + "learning_rate": 3.189108646472252e-07, + "loss": 0.8462404, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09423828, + "step": 13692, + "time_per_iteration": 2.5032553672790527 + }, + { + "auxiliary_loss_clip": 0.06399276, + "auxiliary_loss_mlp": 0.01263194, + "balance_loss_clip": 0.06268877, + "balance_loss_mlp": 0.01254254, + "epoch": 0.8232676987825042, + "flos": 21660570109440.0, + "grad_norm": 1.5658390187310423, + "language_loss": 0.71956593, + "learning_rate": 3.186999086154205e-07, + "loss": 0.79619062, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08935547, + "step": 13693, + "time_per_iteration": 2.5067594051361084 + }, + { + "auxiliary_loss_clip": 0.06396094, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06269097, + "balance_loss_mlp": 0.01254367, + "epoch": 0.8233278220351721, + "flos": 26329857281280.0, + "grad_norm": 1.2936928608658458, + "language_loss": 0.8396762, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.91627085, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09014893, + "step": 13694, + "time_per_iteration": 2.679731845855713 + }, + { + "auxiliary_loss_clip": 0.06406334, + "auxiliary_loss_mlp": 0.01265409, + "balance_loss_clip": 0.06271011, + "balance_loss_mlp": 0.01255771, + "epoch": 0.8233879452878401, + "flos": 21732252877440.0, + "grad_norm": 1.6355767467742353, + "language_loss": 0.77244568, + "learning_rate": 3.182781878250118e-07, + "loss": 0.84916306, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09637451, + "step": 13695, + "time_per_iteration": 3.963965892791748 + }, + { + "auxiliary_loss_clip": 0.06402577, + "auxiliary_loss_mlp": 0.0126261, + "balance_loss_clip": 0.06271192, + "balance_loss_mlp": 0.0125389, + "epoch": 0.823448068540508, + "flos": 20563903872000.0, + "grad_norm": 1.8210752561146564, + "language_loss": 0.81778234, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.89443427, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.0871582, + "step": 13696, + "time_per_iteration": 2.4970433712005615 + }, + { + "auxiliary_loss_clip": 0.06309157, + "auxiliary_loss_mlp": 0.01254773, + "balance_loss_clip": 0.06254191, + "balance_loss_mlp": 0.01253599, + "epoch": 0.823508191793176, + "flos": 67296130352640.0, + "grad_norm": 0.7182469320351987, + "language_loss": 0.63648844, + "learning_rate": 3.178567221188393e-07, + "loss": 0.7121278, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01171875, + "step": 13697, + "time_per_iteration": 3.223705291748047 + }, + { + "auxiliary_loss_clip": 0.06395958, + "auxiliary_loss_mlp": 0.0126361, + "balance_loss_clip": 0.06268628, + "balance_loss_mlp": 0.01255003, + "epoch": 0.8235683150458439, + "flos": 17933724535680.0, + "grad_norm": 1.4706232042527567, + "language_loss": 0.72879517, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.80539095, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08605957, + "step": 13698, + "time_per_iteration": 2.518505334854126 + }, + { + "auxiliary_loss_clip": 0.06407038, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06273619, + "balance_loss_mlp": 0.01259203, + "epoch": 0.823628438298512, + "flos": 18922007116800.0, + "grad_norm": 1.754695390070976, + "language_loss": 0.71798617, + "learning_rate": 3.174355115608305e-07, + "loss": 0.79474986, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10113525, + "step": 13699, + "time_per_iteration": 2.4939382076263428 + }, + { + "auxiliary_loss_clip": 0.06397603, + "auxiliary_loss_mlp": 0.01263248, + "balance_loss_clip": 0.06267754, + "balance_loss_mlp": 0.01253824, + "epoch": 0.8236885615511799, + "flos": 18702221057280.0, + "grad_norm": 1.8849458807724966, + "language_loss": 0.82397747, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.90058601, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09423828, + "step": 13700, + "time_per_iteration": 2.4839980602264404 + }, + { + "auxiliary_loss_clip": 0.06404804, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01254292, + "epoch": 0.8237486848038479, + "flos": 23701606588800.0, + "grad_norm": 1.5465027348479181, + "language_loss": 0.73049653, + "learning_rate": 3.170145562148763e-07, + "loss": 0.80718207, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09466553, + "step": 13701, + "time_per_iteration": 2.5388693809509277 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01265145, + "balance_loss_clip": 0.06270056, + "balance_loss_mlp": 0.01254625, + "epoch": 0.8238088080565159, + "flos": 23448138387840.0, + "grad_norm": 1.7645589694369792, + "language_loss": 0.69761407, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.77430546, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10522461, + "step": 13702, + "time_per_iteration": 2.5435500144958496 + }, + { + "auxiliary_loss_clip": 0.06406841, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 0.06274645, + "balance_loss_mlp": 0.01254882, + "epoch": 0.8238689313091838, + "flos": 22753001715840.0, + "grad_norm": 1.7292259180096456, + "language_loss": 0.74427319, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.82098991, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0994873, + "step": 13703, + "time_per_iteration": 2.5351295471191406 + }, + { + "auxiliary_loss_clip": 0.06409708, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01255868, + "epoch": 0.8239290545618518, + "flos": 25637236231680.0, + "grad_norm": 1.7672638463517, + "language_loss": 0.70240831, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.77916628, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10223389, + "step": 13704, + "time_per_iteration": 2.551124095916748 + }, + { + "auxiliary_loss_clip": 0.06402259, + "auxiliary_loss_mlp": 0.01263375, + "balance_loss_clip": 0.06270658, + "balance_loss_mlp": 0.01254392, + "epoch": 0.8239891778145197, + "flos": 26032854084480.0, + "grad_norm": 2.8793334355033076, + "language_loss": 0.64149827, + "learning_rate": 3.161734114144916e-07, + "loss": 0.71815455, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08978271, + "step": 13705, + "time_per_iteration": 2.5648598670959473 + }, + { + "auxiliary_loss_clip": 0.06407434, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06272142, + "balance_loss_mlp": 0.0125933, + "epoch": 0.8240493010671878, + "flos": 21839378722560.0, + "grad_norm": 2.201240453400887, + "language_loss": 0.69536072, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.77212507, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09680176, + "step": 13706, + "time_per_iteration": 2.5266029834747314 + }, + { + "auxiliary_loss_clip": 0.06408302, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01254483, + "epoch": 0.8241094243198557, + "flos": 18557891199360.0, + "grad_norm": 1.7625023749977664, + "language_loss": 0.69611287, + "learning_rate": 3.157532220876475e-07, + "loss": 0.77284169, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10083008, + "step": 13707, + "time_per_iteration": 2.5589535236358643 + }, + { + "auxiliary_loss_clip": 0.06404749, + "auxiliary_loss_mlp": 0.01262733, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01252881, + "epoch": 0.8241695475725237, + "flos": 25454192987520.0, + "grad_norm": 1.5789270946690015, + "language_loss": 0.79172903, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.86840385, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09851074, + "step": 13708, + "time_per_iteration": 2.519388437271118 + }, + { + "auxiliary_loss_clip": 0.06402726, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06268608, + "balance_loss_mlp": 0.01253418, + "epoch": 0.8242296708251916, + "flos": 18995702382720.0, + "grad_norm": 3.0867439551253195, + "language_loss": 0.69106972, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.76773179, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10064697, + "step": 13709, + "time_per_iteration": 2.505873441696167 + }, + { + "auxiliary_loss_clip": 0.06406131, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.012558, + "epoch": 0.8242897940778596, + "flos": 22607372119680.0, + "grad_norm": 1.8896201135226782, + "language_loss": 0.83090842, + "learning_rate": 3.151234171183319e-07, + "loss": 0.9076246, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09692383, + "step": 13710, + "time_per_iteration": 2.5083086490631104 + }, + { + "auxiliary_loss_clip": 0.06402289, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01254883, + "epoch": 0.8243499173305275, + "flos": 21474172702080.0, + "grad_norm": 2.010119969171323, + "language_loss": 0.78586245, + "learning_rate": 3.149136098993257e-07, + "loss": 0.8625319, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09780884, + "step": 13711, + "time_per_iteration": 2.504279136657715 + }, + { + "auxiliary_loss_clip": 0.06402823, + "auxiliary_loss_mlp": 0.01266322, + "balance_loss_clip": 0.06270982, + "balance_loss_mlp": 0.01256189, + "epoch": 0.8244100405831956, + "flos": 20016409294080.0, + "grad_norm": 1.7618946203552466, + "language_loss": 0.65925729, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.73594874, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10131836, + "step": 13712, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06404002, + "auxiliary_loss_mlp": 0.01265304, + "balance_loss_clip": 0.06271501, + "balance_loss_mlp": 0.0125619, + "epoch": 0.8244701638358635, + "flos": 26437612032000.0, + "grad_norm": 1.6609701051981949, + "language_loss": 0.74622256, + "learning_rate": 3.14494187165202e-07, + "loss": 0.82291561, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09106445, + "step": 13713, + "time_per_iteration": 2.551905393600464 + }, + { + "auxiliary_loss_clip": 0.06404902, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06270953, + "balance_loss_mlp": 0.012595, + "epoch": 0.8245302870885315, + "flos": 17645861433600.0, + "grad_norm": 1.6587982213804435, + "language_loss": 0.81258547, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.88932049, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09106445, + "step": 13714, + "time_per_iteration": 2.452026844024658 + }, + { + "auxiliary_loss_clip": 0.06404838, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 0.062736, + "balance_loss_mlp": 0.0125666, + "epoch": 0.8245904103411995, + "flos": 26216023109760.0, + "grad_norm": 2.7428711337446736, + "language_loss": 0.66907775, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.74578679, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09405518, + "step": 13715, + "time_per_iteration": 2.5828397274017334 + }, + { + "auxiliary_loss_clip": 0.06407429, + "auxiliary_loss_mlp": 0.01263847, + "balance_loss_clip": 0.06271131, + "balance_loss_mlp": 0.01254358, + "epoch": 0.8246505335938674, + "flos": 24211645591680.0, + "grad_norm": 1.6923917814594924, + "language_loss": 0.75099182, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.82770455, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09484863, + "step": 13716, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.06310038, + "auxiliary_loss_mlp": 0.01249676, + "balance_loss_clip": 0.06255137, + "balance_loss_mlp": 0.01248568, + "epoch": 0.8247106568465354, + "flos": 67114764190080.0, + "grad_norm": 0.7022312920639184, + "language_loss": 0.58953023, + "learning_rate": 3.136561087351175e-07, + "loss": 0.66512734, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.54785156, + "router_z_loss_mlp": 0.01110077, + "step": 13717, + "time_per_iteration": 3.246941328048706 + }, + { + "auxiliary_loss_clip": 0.06403467, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.01255149, + "epoch": 0.8247707800992033, + "flos": 12573199906560.0, + "grad_norm": 1.9324122684588263, + "language_loss": 0.79839373, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.87507385, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09387207, + "step": 13718, + "time_per_iteration": 3.9733448028564453 + }, + { + "auxiliary_loss_clip": 0.06403176, + "auxiliary_loss_mlp": 0.01265559, + "balance_loss_clip": 0.06274208, + "balance_loss_mlp": 0.0125573, + "epoch": 0.8248309033518714, + "flos": 15928927747200.0, + "grad_norm": 1.6030825184413535, + "language_loss": 0.69140959, + "learning_rate": 3.132374531662778e-07, + "loss": 0.76809692, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09832764, + "step": 13719, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.06406642, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.01256348, + "epoch": 0.8248910266045393, + "flos": 17570195596800.0, + "grad_norm": 2.330025020870477, + "language_loss": 0.6986599, + "learning_rate": 3.13028221321197e-07, + "loss": 0.77538854, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09869385, + "step": 13720, + "time_per_iteration": 2.4678380489349365 + }, + { + "auxiliary_loss_clip": 0.06404991, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06269173, + "balance_loss_mlp": 0.01254954, + "epoch": 0.8249511498572073, + "flos": 28626919511040.0, + "grad_norm": 1.5185794987899917, + "language_loss": 0.75965858, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.83635378, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.0958252, + "step": 13721, + "time_per_iteration": 2.6373937129974365 + }, + { + "auxiliary_loss_clip": 0.06402366, + "auxiliary_loss_mlp": 0.01262873, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01253718, + "epoch": 0.8250112731098752, + "flos": 25563624819840.0, + "grad_norm": 1.7041844507677804, + "language_loss": 0.77799296, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.85464543, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0914917, + "step": 13722, + "time_per_iteration": 2.5994813442230225 + }, + { + "auxiliary_loss_clip": 0.06398278, + "auxiliary_loss_mlp": 0.01264674, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01255596, + "epoch": 0.8250713963625432, + "flos": 27753645058560.0, + "grad_norm": 1.6949642691113342, + "language_loss": 0.63508642, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.71171594, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09082031, + "step": 13723, + "time_per_iteration": 2.5635523796081543 + }, + { + "auxiliary_loss_clip": 0.06404909, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06271261, + "balance_loss_mlp": 0.01257328, + "epoch": 0.8251315196152111, + "flos": 21616070791680.0, + "grad_norm": 1.4018010369843736, + "language_loss": 0.74626708, + "learning_rate": 3.121919337215666e-07, + "loss": 0.82298779, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09844971, + "step": 13724, + "time_per_iteration": 2.513502836227417 + }, + { + "auxiliary_loss_clip": 0.06404832, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01254508, + "epoch": 0.8251916428678792, + "flos": 28585983991680.0, + "grad_norm": 1.793661817459537, + "language_loss": 0.64819729, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.72489381, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10302734, + "step": 13725, + "time_per_iteration": 2.529151678085327 + }, + { + "auxiliary_loss_clip": 0.06405316, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06274511, + "balance_loss_mlp": 0.01253227, + "epoch": 0.8252517661205471, + "flos": 23081758410240.0, + "grad_norm": 1.672809814905788, + "language_loss": 0.81857646, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.89525616, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09436035, + "step": 13726, + "time_per_iteration": 2.5228326320648193 + }, + { + "auxiliary_loss_clip": 0.06397386, + "auxiliary_loss_mlp": 0.01264386, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01255475, + "epoch": 0.8253118893732151, + "flos": 31767724828800.0, + "grad_norm": 1.6706774467929177, + "language_loss": 0.70475507, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.78137279, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08911133, + "step": 13727, + "time_per_iteration": 3.978147506713867 + }, + { + "auxiliary_loss_clip": 0.0640934, + "auxiliary_loss_mlp": 0.01267735, + "balance_loss_clip": 0.0627425, + "balance_loss_mlp": 0.01256881, + "epoch": 0.8253720126258831, + "flos": 18302326646400.0, + "grad_norm": 1.91309895747183, + "language_loss": 0.63201261, + "learning_rate": 3.113566701515036e-07, + "loss": 0.70878333, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10858154, + "step": 13728, + "time_per_iteration": 2.5155835151672363 + }, + { + "auxiliary_loss_clip": 0.06411063, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.0627272, + "balance_loss_mlp": 0.01255486, + "epoch": 0.825432135878551, + "flos": 26804620915200.0, + "grad_norm": 1.603278449226732, + "language_loss": 0.71536702, + "learning_rate": 3.111480143230092e-07, + "loss": 0.7921375, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10498047, + "step": 13729, + "time_per_iteration": 2.5187203884124756 + }, + { + "auxiliary_loss_clip": 0.06315145, + "auxiliary_loss_mlp": 0.01248813, + "balance_loss_clip": 0.06260362, + "balance_loss_mlp": 0.01247758, + "epoch": 0.825492259131219, + "flos": 54234498597120.0, + "grad_norm": 0.8544615284034055, + "language_loss": 0.62620342, + "learning_rate": 3.109394225359514e-07, + "loss": 0.70184296, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01055908, + "step": 13730, + "time_per_iteration": 2.9303290843963623 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01254744, + "epoch": 0.825552382383887, + "flos": 43765087478400.0, + "grad_norm": 1.7912471248364803, + "language_loss": 0.63930857, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.71599495, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09893799, + "step": 13731, + "time_per_iteration": 4.08091139793396 + }, + { + "auxiliary_loss_clip": 0.06411815, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.0125645, + "epoch": 0.825612505636555, + "flos": 12607469026560.0, + "grad_norm": 2.0738047653444855, + "language_loss": 0.70323932, + "learning_rate": 3.105224311177812e-07, + "loss": 0.78002459, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10266113, + "step": 13732, + "time_per_iteration": 2.4617788791656494 + }, + { + "auxiliary_loss_clip": 0.06410882, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06272789, + "balance_loss_mlp": 0.01254304, + "epoch": 0.8256726288892229, + "flos": 17600146231680.0, + "grad_norm": 2.908441012815726, + "language_loss": 0.71335745, + "learning_rate": 3.103140315024817e-07, + "loss": 0.79011369, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10437012, + "step": 13733, + "time_per_iteration": 2.4824366569519043 + }, + { + "auxiliary_loss_clip": 0.0639869, + "auxiliary_loss_mlp": 0.01262669, + "balance_loss_clip": 0.0626872, + "balance_loss_mlp": 0.01253597, + "epoch": 0.8257327521418909, + "flos": 23812631648640.0, + "grad_norm": 1.388790191971181, + "language_loss": 0.82709062, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.90370417, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09069824, + "step": 13734, + "time_per_iteration": 4.009546995162964 + }, + { + "auxiliary_loss_clip": 0.06404021, + "auxiliary_loss_mlp": 0.01264839, + "balance_loss_clip": 0.06273267, + "balance_loss_mlp": 0.01255141, + "epoch": 0.8257928753945588, + "flos": 19287129283200.0, + "grad_norm": 1.9103831477956985, + "language_loss": 0.83209223, + "learning_rate": 3.098974244989676e-07, + "loss": 0.90878081, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09698486, + "step": 13735, + "time_per_iteration": 2.5026960372924805 + }, + { + "auxiliary_loss_clip": 0.06407285, + "auxiliary_loss_mlp": 0.01266501, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01256988, + "epoch": 0.8258529986472268, + "flos": 18484782912000.0, + "grad_norm": 1.810689318637808, + "language_loss": 0.70870662, + "learning_rate": 3.096892171265497e-07, + "loss": 0.7854445, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09515381, + "step": 13736, + "time_per_iteration": 2.473515748977661 + }, + { + "auxiliary_loss_clip": 0.06316115, + "auxiliary_loss_mlp": 0.01253987, + "balance_loss_clip": 0.06261094, + "balance_loss_mlp": 0.01252863, + "epoch": 0.8259131218998947, + "flos": 62154903386880.0, + "grad_norm": 1.3034739276824252, + "language_loss": 0.67937154, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.75507253, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01126862, + "step": 13737, + "time_per_iteration": 3.0982251167297363 + }, + { + "auxiliary_loss_clip": 0.06403725, + "auxiliary_loss_mlp": 0.01264242, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.01254538, + "epoch": 0.8259732451525628, + "flos": 22164781253760.0, + "grad_norm": 2.085431266289398, + "language_loss": 0.69943869, + "learning_rate": 3.0927299467987e-07, + "loss": 0.7761184, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09698486, + "step": 13738, + "time_per_iteration": 2.5181643962860107 + }, + { + "auxiliary_loss_clip": 0.06404846, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06271113, + "balance_loss_mlp": 0.0125626, + "epoch": 0.8260333684052307, + "flos": 38370587218560.0, + "grad_norm": 1.709303321450842, + "language_loss": 0.6325919, + "learning_rate": 3.090649796213911e-07, + "loss": 0.70931315, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.11016846, + "step": 13739, + "time_per_iteration": 2.622809886932373 + }, + { + "auxiliary_loss_clip": 0.06316274, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_clip": 0.06261257, + "balance_loss_mlp": 0.01249742, + "epoch": 0.8260934916578987, + "flos": 62204433949440.0, + "grad_norm": 0.8068403235468483, + "language_loss": 0.59232754, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.66799867, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01098633, + "step": 13740, + "time_per_iteration": 3.185506582260132 + }, + { + "auxiliary_loss_clip": 0.06413467, + "auxiliary_loss_mlp": 0.01267061, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01256571, + "epoch": 0.8261536149105667, + "flos": 22572138677760.0, + "grad_norm": 1.9838230010912559, + "language_loss": 0.75877976, + "learning_rate": 3.086491418735959e-07, + "loss": 0.83558506, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.1048584, + "step": 13741, + "time_per_iteration": 2.5053927898406982 + }, + { + "auxiliary_loss_clip": 0.06405714, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06272768, + "balance_loss_mlp": 0.01255124, + "epoch": 0.8262137381632346, + "flos": 32533705728000.0, + "grad_norm": 1.822033080058508, + "language_loss": 0.62812448, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.70482796, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09515381, + "step": 13742, + "time_per_iteration": 2.5799756050109863 + }, + { + "auxiliary_loss_clip": 0.06413151, + "auxiliary_loss_mlp": 0.01267602, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01256343, + "epoch": 0.8262738614159026, + "flos": 14141569104000.0, + "grad_norm": 3.472691543240307, + "language_loss": 0.67042887, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.74723649, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11254883, + "step": 13743, + "time_per_iteration": 2.4885993003845215 + }, + { + "auxiliary_loss_clip": 0.06408446, + "auxiliary_loss_mlp": 0.01266141, + "balance_loss_clip": 0.06274473, + "balance_loss_mlp": 0.01256133, + "epoch": 0.8263339846685706, + "flos": 19830934281600.0, + "grad_norm": 1.9106016851298016, + "language_loss": 0.67223948, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.74898529, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10009766, + "step": 13744, + "time_per_iteration": 2.50083589553833 + }, + { + "auxiliary_loss_clip": 0.06407204, + "auxiliary_loss_mlp": 0.0126747, + "balance_loss_clip": 0.06273009, + "balance_loss_mlp": 0.01257391, + "epoch": 0.8263941079212386, + "flos": 22752330883200.0, + "grad_norm": 1.6353552178667967, + "language_loss": 0.75895423, + "learning_rate": 3.078182360753612e-07, + "loss": 0.83570099, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10083008, + "step": 13745, + "time_per_iteration": 2.5865373611450195 + }, + { + "auxiliary_loss_clip": 0.06400856, + "auxiliary_loss_mlp": 0.01263189, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01254374, + "epoch": 0.8264542311739065, + "flos": 20126847375360.0, + "grad_norm": 1.8085857006682091, + "language_loss": 0.79174644, + "learning_rate": 3.076106700253709e-07, + "loss": 0.86838686, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0881958, + "step": 13746, + "time_per_iteration": 2.5261435508728027 + }, + { + "auxiliary_loss_clip": 0.06416452, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06277022, + "balance_loss_mlp": 0.0125544, + "epoch": 0.8265143544265745, + "flos": 16842844229760.0, + "grad_norm": 2.5785036479328354, + "language_loss": 0.68477845, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.76160187, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10461426, + "step": 13747, + "time_per_iteration": 2.5118043422698975 + }, + { + "auxiliary_loss_clip": 0.06406212, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.06271359, + "balance_loss_mlp": 0.01254013, + "epoch": 0.8265744776792424, + "flos": 22025231078400.0, + "grad_norm": 1.914079416513022, + "language_loss": 0.75505137, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.83175695, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10327148, + "step": 13748, + "time_per_iteration": 2.5839946269989014 + }, + { + "auxiliary_loss_clip": 0.06398661, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06269635, + "balance_loss_mlp": 0.01257442, + "epoch": 0.8266346009319104, + "flos": 19250889592320.0, + "grad_norm": 1.8963276954120185, + "language_loss": 0.63934255, + "learning_rate": 3.069883569603102e-07, + "loss": 0.71599388, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09033203, + "step": 13749, + "time_per_iteration": 2.465831995010376 + }, + { + "auxiliary_loss_clip": 0.06401607, + "auxiliary_loss_mlp": 0.01267069, + "balance_loss_clip": 0.06269521, + "balance_loss_mlp": 0.01257806, + "epoch": 0.8266947241845783, + "flos": 24173016059520.0, + "grad_norm": 1.605270256625375, + "language_loss": 0.74094856, + "learning_rate": 3.067810476598132e-07, + "loss": 0.81763524, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09259033, + "step": 13750, + "time_per_iteration": 2.516474723815918 + }, + { + "auxiliary_loss_clip": 0.06407044, + "auxiliary_loss_mlp": 0.01265047, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.0125489, + "epoch": 0.8267548474372464, + "flos": 21112195063680.0, + "grad_norm": 1.905483524829514, + "language_loss": 0.65982723, + "learning_rate": 3.065738025663496e-07, + "loss": 0.73654807, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1015625, + "step": 13751, + "time_per_iteration": 2.5073559284210205 + }, + { + "auxiliary_loss_clip": 0.064018, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8268149706899143, + "flos": 39977711729280.0, + "grad_norm": 1.3811895515091794, + "language_loss": 0.60690141, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.68355262, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0894165, + "step": 13752, + "time_per_iteration": 2.6502721309661865 + }, + { + "auxiliary_loss_clip": 0.06312872, + "auxiliary_loss_mlp": 0.01249988, + "balance_loss_clip": 0.06258056, + "balance_loss_mlp": 0.01248881, + "epoch": 0.8268750939425823, + "flos": 65799290943360.0, + "grad_norm": 1.574540710975994, + "language_loss": 0.57428581, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.64991438, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01109314, + "step": 13753, + "time_per_iteration": 3.17626953125 + }, + { + "auxiliary_loss_clip": 0.06316203, + "auxiliary_loss_mlp": 0.01249962, + "balance_loss_clip": 0.06261422, + "balance_loss_mlp": 0.01248899, + "epoch": 0.8269352171952503, + "flos": 52997108227200.0, + "grad_norm": 0.6861116904276556, + "language_loss": 0.54860449, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.62426615, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01063538, + "step": 13754, + "time_per_iteration": 3.251030921936035 + }, + { + "auxiliary_loss_clip": 0.06401195, + "auxiliary_loss_mlp": 0.01262943, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.0125386, + "epoch": 0.8269953404479182, + "flos": 23082848513280.0, + "grad_norm": 1.821155505252388, + "language_loss": 0.69514215, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.77178347, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09082031, + "step": 13755, + "time_per_iteration": 2.5638794898986816 + }, + { + "auxiliary_loss_clip": 0.06400982, + "auxiliary_loss_mlp": 0.01263943, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01255408, + "epoch": 0.8270554637005862, + "flos": 14215222442880.0, + "grad_norm": 1.9620156908641344, + "language_loss": 0.70154935, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7781986, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08538818, + "step": 13756, + "time_per_iteration": 2.4718971252441406 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06272529, + "balance_loss_mlp": 0.01254997, + "epoch": 0.8271155869532542, + "flos": 21768450641280.0, + "grad_norm": 2.9125961441146204, + "language_loss": 0.72791404, + "learning_rate": 3.053316807931623e-07, + "loss": 0.80462241, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10217285, + "step": 13757, + "time_per_iteration": 3.9486069679260254 + }, + { + "auxiliary_loss_clip": 0.06411837, + "auxiliary_loss_mlp": 0.01268236, + "balance_loss_clip": 0.06274478, + "balance_loss_mlp": 0.01256374, + "epoch": 0.8271757102059222, + "flos": 15125575127040.0, + "grad_norm": 2.5593838529176467, + "language_loss": 0.69374532, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.77054602, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11871338, + "step": 13758, + "time_per_iteration": 2.4854576587677 + }, + { + "auxiliary_loss_clip": 0.06399594, + "auxiliary_loss_mlp": 0.01266198, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01256984, + "epoch": 0.8272358334585901, + "flos": 24140549802240.0, + "grad_norm": 1.7114391651617498, + "language_loss": 0.70266873, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.77932668, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09210205, + "step": 13759, + "time_per_iteration": 2.517610788345337 + }, + { + "auxiliary_loss_clip": 0.06402884, + "auxiliary_loss_mlp": 0.01263273, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01253415, + "epoch": 0.8272959567112581, + "flos": 18996918266880.0, + "grad_norm": 1.6139248234121746, + "language_loss": 0.71018773, + "learning_rate": 3.047114873375161e-07, + "loss": 0.78684926, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09863281, + "step": 13760, + "time_per_iteration": 2.5143585205078125 + }, + { + "auxiliary_loss_clip": 0.06399237, + "auxiliary_loss_mlp": 0.01265407, + "balance_loss_clip": 0.06269812, + "balance_loss_mlp": 0.01256162, + "epoch": 0.827356079963926, + "flos": 20637934554240.0, + "grad_norm": 1.8803974399165198, + "language_loss": 0.78203416, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.85868061, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09240723, + "step": 13761, + "time_per_iteration": 2.4832279682159424 + }, + { + "auxiliary_loss_clip": 0.06399886, + "auxiliary_loss_mlp": 0.01266752, + "balance_loss_clip": 0.06270774, + "balance_loss_mlp": 0.01257627, + "epoch": 0.827416203216594, + "flos": 22422777575040.0, + "grad_norm": 2.196661188611125, + "language_loss": 0.69947863, + "learning_rate": 3.042983464482387e-07, + "loss": 0.77614498, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09124756, + "step": 13762, + "time_per_iteration": 2.522721290588379 + }, + { + "auxiliary_loss_clip": 0.06399816, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06268964, + "balance_loss_mlp": 0.01255082, + "epoch": 0.827476326469262, + "flos": 19032235562880.0, + "grad_norm": 1.792228037314928, + "language_loss": 0.7011888, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.77783084, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09313965, + "step": 13763, + "time_per_iteration": 2.486668825149536 + }, + { + "auxiliary_loss_clip": 0.0631156, + "auxiliary_loss_mlp": 0.01249503, + "balance_loss_clip": 0.06256869, + "balance_loss_mlp": 0.01248406, + "epoch": 0.82753644972193, + "flos": 68520942610560.0, + "grad_norm": 0.817208911394718, + "language_loss": 0.65143663, + "learning_rate": 3.038854627636651e-07, + "loss": 0.7270472, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01098633, + "step": 13764, + "time_per_iteration": 3.1860270500183105 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06270835, + "balance_loss_mlp": 0.01255001, + "epoch": 0.8275965729745979, + "flos": 18411255354240.0, + "grad_norm": 2.1408558147856427, + "language_loss": 0.7802, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.8568753, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10174561, + "step": 13765, + "time_per_iteration": 2.4712343215942383 + }, + { + "auxiliary_loss_clip": 0.06409816, + "auxiliary_loss_mlp": 0.01264455, + "balance_loss_clip": 0.06273708, + "balance_loss_mlp": 0.01254626, + "epoch": 0.8276566962272659, + "flos": 28519625957760.0, + "grad_norm": 1.515558220425856, + "language_loss": 0.62899083, + "learning_rate": 3.034728363464214e-07, + "loss": 0.70573354, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0982666, + "step": 13766, + "time_per_iteration": 2.5880696773529053 + }, + { + "auxiliary_loss_clip": 0.06403887, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06270833, + "balance_loss_mlp": 0.01253523, + "epoch": 0.8277168194799339, + "flos": 20236488842880.0, + "grad_norm": 1.5277982558115004, + "language_loss": 0.82747239, + "learning_rate": 3.03266619632609e-07, + "loss": 0.90414429, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09777832, + "step": 13767, + "time_per_iteration": 3.9617438316345215 + }, + { + "auxiliary_loss_clip": 0.06405637, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06271689, + "balance_loss_mlp": 0.01258717, + "epoch": 0.8277769427326018, + "flos": 28484350588800.0, + "grad_norm": 1.4875953854555823, + "language_loss": 0.69132233, + "learning_rate": 3.030604672590964e-07, + "loss": 0.76806098, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09509277, + "step": 13768, + "time_per_iteration": 2.60477876663208 + }, + { + "auxiliary_loss_clip": 0.06398913, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.06269988, + "balance_loss_mlp": 0.01255649, + "epoch": 0.8278370659852698, + "flos": 27204808815360.0, + "grad_norm": 1.7806138521409314, + "language_loss": 0.74606562, + "learning_rate": 3.028543792337006e-07, + "loss": 0.82270265, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 13769, + "time_per_iteration": 2.6588950157165527 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01257, + "epoch": 0.8278971892379378, + "flos": 37825272846720.0, + "grad_norm": 1.8746055345971568, + "language_loss": 0.74295783, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.81968796, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10290527, + "step": 13770, + "time_per_iteration": 4.066660165786743 + }, + { + "auxiliary_loss_clip": 0.06405378, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06270339, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8279573124906058, + "flos": 22565933475840.0, + "grad_norm": 1.7096340379903676, + "language_loss": 0.75903618, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.83574641, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10174561, + "step": 13771, + "time_per_iteration": 2.5009427070617676 + }, + { + "auxiliary_loss_clip": 0.06401806, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06269084, + "balance_loss_mlp": 0.01258582, + "epoch": 0.8280174357432737, + "flos": 36073441134720.0, + "grad_norm": 1.4307953664451067, + "language_loss": 0.72807586, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.80477321, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09350586, + "step": 13772, + "time_per_iteration": 2.598695993423462 + }, + { + "auxiliary_loss_clip": 0.06398449, + "auxiliary_loss_mlp": 0.01266732, + "balance_loss_clip": 0.06268763, + "balance_loss_mlp": 0.01256885, + "epoch": 0.8280775589959417, + "flos": 22966834135680.0, + "grad_norm": 2.013252985793075, + "language_loss": 0.74714899, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.8238008, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09838867, + "step": 13773, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06272508, + "balance_loss_mlp": 0.01255385, + "epoch": 0.8281376822486096, + "flos": 26069722680960.0, + "grad_norm": 1.7704579459247693, + "language_loss": 0.7591548, + "learning_rate": 3.01824904601915e-07, + "loss": 0.8358264, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09429932, + "step": 13774, + "time_per_iteration": 3.989100694656372 + }, + { + "auxiliary_loss_clip": 0.064128, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.0125432, + "epoch": 0.8281978055012776, + "flos": 20674048464000.0, + "grad_norm": 1.628782431293184, + "language_loss": 0.74902624, + "learning_rate": 3.01619202829249e-07, + "loss": 0.82579559, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09820557, + "step": 13775, + "time_per_iteration": 2.4677510261535645 + }, + { + "auxiliary_loss_clip": 0.0640965, + "auxiliary_loss_mlp": 0.01264724, + "balance_loss_clip": 0.06271163, + "balance_loss_mlp": 0.01253882, + "epoch": 0.8282579287539455, + "flos": 29323062432000.0, + "grad_norm": 2.180106071080934, + "language_loss": 0.74249536, + "learning_rate": 3.01413565459353e-07, + "loss": 0.81923908, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10845947, + "step": 13776, + "time_per_iteration": 2.6236319541931152 + }, + { + "auxiliary_loss_clip": 0.0640358, + "auxiliary_loss_mlp": 0.01264371, + "balance_loss_clip": 0.06269941, + "balance_loss_mlp": 0.01254655, + "epoch": 0.8283180520066136, + "flos": 15711699237120.0, + "grad_norm": 1.9384324289396857, + "language_loss": 0.77343374, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.85011321, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.097229, + "step": 13777, + "time_per_iteration": 2.456892490386963 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 0.06271677, + "balance_loss_mlp": 0.01256558, + "epoch": 0.8283781752592815, + "flos": 24798566315520.0, + "grad_norm": 1.5185722645753612, + "language_loss": 0.82944041, + "learning_rate": 3.010024839590604e-07, + "loss": 0.90611577, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09191895, + "step": 13778, + "time_per_iteration": 2.5368337631225586 + }, + { + "auxiliary_loss_clip": 0.06397066, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06269608, + "balance_loss_mlp": 0.012553, + "epoch": 0.8284382985119495, + "flos": 18987694318080.0, + "grad_norm": 1.7308701020376125, + "language_loss": 0.74615109, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.82276577, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09100342, + "step": 13779, + "time_per_iteration": 2.4684152603149414 + }, + { + "auxiliary_loss_clip": 0.06314863, + "auxiliary_loss_mlp": 0.01250131, + "balance_loss_clip": 0.06260095, + "balance_loss_mlp": 0.01249052, + "epoch": 0.8284984217646175, + "flos": 61055832579840.0, + "grad_norm": 0.7787786070050955, + "language_loss": 0.56615424, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.64180422, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01081085, + "step": 13780, + "time_per_iteration": 3.151190757751465 + }, + { + "auxiliary_loss_clip": 0.06399573, + "auxiliary_loss_mlp": 0.01265439, + "balance_loss_clip": 0.06268763, + "balance_loss_mlp": 0.01256081, + "epoch": 0.8285585450172854, + "flos": 19719993075840.0, + "grad_norm": 1.6749294614493886, + "language_loss": 0.80124277, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.87789285, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09350586, + "step": 13781, + "time_per_iteration": 2.486316442489624 + }, + { + "auxiliary_loss_clip": 0.06404715, + "auxiliary_loss_mlp": 0.01265372, + "balance_loss_clip": 0.06271574, + "balance_loss_mlp": 0.01254632, + "epoch": 0.8286186682699535, + "flos": 21695258499840.0, + "grad_norm": 1.909161291798896, + "language_loss": 0.76221263, + "learning_rate": 3.001810941346543e-07, + "loss": 0.83891356, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10748291, + "step": 13782, + "time_per_iteration": 2.517943859100342 + }, + { + "auxiliary_loss_clip": 0.06404275, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254212, + "epoch": 0.8286787915226214, + "flos": 25782656192640.0, + "grad_norm": 1.4991404242218924, + "language_loss": 0.76445484, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.84113491, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09527588, + "step": 13783, + "time_per_iteration": 2.521440267562866 + }, + { + "auxiliary_loss_clip": 0.06402531, + "auxiliary_loss_mlp": 0.01266148, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01256873, + "epoch": 0.8287389147752894, + "flos": 21294777110400.0, + "grad_norm": 1.7532816495627446, + "language_loss": 0.74151248, + "learning_rate": 2.997707859351304e-07, + "loss": 0.81819928, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09277344, + "step": 13784, + "time_per_iteration": 2.5014326572418213 + }, + { + "auxiliary_loss_clip": 0.06404807, + "auxiliary_loss_mlp": 0.01266618, + "balance_loss_clip": 0.06268123, + "balance_loss_mlp": 0.01255847, + "epoch": 0.8287990380279573, + "flos": 33552903265920.0, + "grad_norm": 3.27470400867833, + "language_loss": 0.69467115, + "learning_rate": 2.99565728540772e-07, + "loss": 0.77138543, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10778809, + "step": 13785, + "time_per_iteration": 2.601536989212036 + }, + { + "auxiliary_loss_clip": 0.0640759, + "auxiliary_loss_mlp": 0.01266942, + "balance_loss_clip": 0.06274858, + "balance_loss_mlp": 0.0125722, + "epoch": 0.8288591612806253, + "flos": 22972997410560.0, + "grad_norm": 1.427433422724433, + "language_loss": 0.68698609, + "learning_rate": 2.993607356270516e-07, + "loss": 0.76373136, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.097229, + "step": 13786, + "time_per_iteration": 2.547952175140381 + }, + { + "auxiliary_loss_clip": 0.06411159, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06272699, + "balance_loss_mlp": 0.01253648, + "epoch": 0.8289192845332932, + "flos": 18595053285120.0, + "grad_norm": 2.0138458745515635, + "language_loss": 0.77133876, + "learning_rate": 2.991558072017426e-07, + "loss": 0.84808755, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10070801, + "step": 13787, + "time_per_iteration": 2.48760986328125 + }, + { + "auxiliary_loss_clip": 0.06400535, + "auxiliary_loss_mlp": 0.01266768, + "balance_loss_clip": 0.06270656, + "balance_loss_mlp": 0.01257053, + "epoch": 0.8289794077859612, + "flos": 15455841194880.0, + "grad_norm": 1.5818802638105176, + "language_loss": 0.80619884, + "learning_rate": 2.989509432726163e-07, + "loss": 0.88287187, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09710693, + "step": 13788, + "time_per_iteration": 2.506680488586426 + }, + { + "auxiliary_loss_clip": 0.0640239, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.062705, + "balance_loss_mlp": 0.01252935, + "epoch": 0.8290395310386292, + "flos": 28885628592000.0, + "grad_norm": 1.4921693552910416, + "language_loss": 0.71268535, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.78933835, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09973145, + "step": 13789, + "time_per_iteration": 2.559659719467163 + }, + { + "auxiliary_loss_clip": 0.06403467, + "auxiliary_loss_mlp": 0.01265989, + "balance_loss_clip": 0.06268575, + "balance_loss_mlp": 0.01255796, + "epoch": 0.8290996542912972, + "flos": 36585324927360.0, + "grad_norm": 1.757152625782574, + "language_loss": 0.68272877, + "learning_rate": 2.985414089339813e-07, + "loss": 0.75942338, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10198975, + "step": 13790, + "time_per_iteration": 2.6251883506774902 + }, + { + "auxiliary_loss_clip": 0.06406529, + "auxiliary_loss_mlp": 0.01270326, + "balance_loss_clip": 0.06272461, + "balance_loss_mlp": 0.01259448, + "epoch": 0.8291597775439651, + "flos": 23629756112640.0, + "grad_norm": 1.6234366506097078, + "language_loss": 0.77228737, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.84905589, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10876465, + "step": 13791, + "time_per_iteration": 2.501948356628418 + }, + { + "auxiliary_loss_clip": 0.0639832, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.06269881, + "balance_loss_mlp": 0.0125507, + "epoch": 0.8292199007966331, + "flos": 21403873526400.0, + "grad_norm": 1.4641764539166646, + "language_loss": 0.7021268, + "learning_rate": 2.981321326732651e-07, + "loss": 0.77875602, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09533691, + "step": 13792, + "time_per_iteration": 2.4955878257751465 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06269513, + "balance_loss_mlp": 0.01253051, + "epoch": 0.829280024049301, + "flos": 28775232437760.0, + "grad_norm": 1.4298994778553897, + "language_loss": 0.65538836, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.73205209, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09576416, + "step": 13793, + "time_per_iteration": 2.6276164054870605 + }, + { + "auxiliary_loss_clip": 0.06406765, + "auxiliary_loss_mlp": 0.01265372, + "balance_loss_clip": 0.06271418, + "balance_loss_mlp": 0.01254375, + "epoch": 0.829340147301969, + "flos": 19944223401600.0, + "grad_norm": 1.8265320285164077, + "language_loss": 0.66246361, + "learning_rate": 2.977231145525461e-07, + "loss": 0.73918492, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10992432, + "step": 13794, + "time_per_iteration": 2.5835254192352295 + }, + { + "auxiliary_loss_clip": 0.06403525, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06269032, + "balance_loss_mlp": 0.01253224, + "epoch": 0.829400270554637, + "flos": 25235622812160.0, + "grad_norm": 1.749339694321301, + "language_loss": 0.6647079, + "learning_rate": 2.975187023140757e-07, + "loss": 0.74137801, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10253906, + "step": 13795, + "time_per_iteration": 2.550981044769287 + }, + { + "auxiliary_loss_clip": 0.06396833, + "auxiliary_loss_mlp": 0.01263528, + "balance_loss_clip": 0.06271346, + "balance_loss_mlp": 0.01254325, + "epoch": 0.829460393807305, + "flos": 24470690088960.0, + "grad_norm": 1.6723308404898531, + "language_loss": 0.66547108, + "learning_rate": 2.973143546338661e-07, + "loss": 0.74207467, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.09197998, + "step": 13796, + "time_per_iteration": 3.9565439224243164 + }, + { + "auxiliary_loss_clip": 0.06399691, + "auxiliary_loss_mlp": 0.01264289, + "balance_loss_clip": 0.06269552, + "balance_loss_mlp": 0.01254955, + "epoch": 0.829520517059973, + "flos": 15127923041280.0, + "grad_norm": 1.5185455706473978, + "language_loss": 0.7187897, + "learning_rate": 2.971100715196666e-07, + "loss": 0.79542947, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09338379, + "step": 13797, + "time_per_iteration": 2.4948043823242188 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06269293, + "balance_loss_mlp": 0.01255086, + "epoch": 0.8295806403126409, + "flos": 21586413646080.0, + "grad_norm": 2.404757591111986, + "language_loss": 0.7246393, + "learning_rate": 2.969058529792243e-07, + "loss": 0.80130923, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09576416, + "step": 13798, + "time_per_iteration": 2.4797022342681885 + }, + { + "auxiliary_loss_clip": 0.06397392, + "auxiliary_loss_mlp": 0.01265773, + "balance_loss_clip": 0.06269975, + "balance_loss_mlp": 0.01256153, + "epoch": 0.8296407635653089, + "flos": 21733133345280.0, + "grad_norm": 1.6550926081962973, + "language_loss": 0.76771939, + "learning_rate": 2.967016990202822e-07, + "loss": 0.84435105, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09613037, + "step": 13799, + "time_per_iteration": 2.555518865585327 + }, + { + "auxiliary_loss_clip": 0.06404122, + "auxiliary_loss_mlp": 0.01265719, + "balance_loss_clip": 0.0627386, + "balance_loss_mlp": 0.01255861, + "epoch": 0.8297008868179768, + "flos": 11185777601280.0, + "grad_norm": 2.1813399594174707, + "language_loss": 0.67236793, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.74906635, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09844971, + "step": 13800, + "time_per_iteration": 2.4783506393432617 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01267608, + "balance_loss_clip": 0.06274987, + "balance_loss_mlp": 0.01257475, + "epoch": 0.8297610100706448, + "flos": 20669688051840.0, + "grad_norm": 1.7037177836560289, + "language_loss": 0.74784625, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.82462925, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10125732, + "step": 13801, + "time_per_iteration": 2.5596258640289307 + }, + { + "auxiliary_loss_clip": 0.0640378, + "auxiliary_loss_mlp": 0.01262459, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01253578, + "epoch": 0.8298211333233128, + "flos": 20382621563520.0, + "grad_norm": 1.588003382045365, + "language_loss": 0.73570353, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.81236589, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08880615, + "step": 13802, + "time_per_iteration": 2.496119260787964 + }, + { + "auxiliary_loss_clip": 0.06403106, + "auxiliary_loss_mlp": 0.01264947, + "balance_loss_clip": 0.06270137, + "balance_loss_mlp": 0.01255946, + "epoch": 0.8298812565759808, + "flos": 21515401710720.0, + "grad_norm": 2.0519420047620183, + "language_loss": 0.7494061, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.82608664, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09002686, + "step": 13803, + "time_per_iteration": 2.6064913272857666 + }, + { + "auxiliary_loss_clip": 0.06402676, + "auxiliary_loss_mlp": 0.01265284, + "balance_loss_clip": 0.0627119, + "balance_loss_mlp": 0.01255806, + "epoch": 0.8299413798286487, + "flos": 22825019900160.0, + "grad_norm": 1.5242051417957505, + "language_loss": 0.79350966, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.87018931, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09484863, + "step": 13804, + "time_per_iteration": 2.505054235458374 + }, + { + "auxiliary_loss_clip": 0.06401961, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06270176, + "balance_loss_mlp": 0.01258748, + "epoch": 0.8300015030813167, + "flos": 29686884860160.0, + "grad_norm": 1.6213005522916255, + "language_loss": 0.73804402, + "learning_rate": 2.954781319115016e-07, + "loss": 0.81473756, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0864563, + "step": 13805, + "time_per_iteration": 2.5898725986480713 + }, + { + "auxiliary_loss_clip": 0.06408954, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06273445, + "balance_loss_mlp": 0.01256729, + "epoch": 0.8300616263339846, + "flos": 19725653226240.0, + "grad_norm": 2.0487162307072637, + "language_loss": 0.7747584, + "learning_rate": 2.952744302396906e-07, + "loss": 0.85151076, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09558105, + "step": 13806, + "time_per_iteration": 3.945065975189209 + }, + { + "auxiliary_loss_clip": 0.06408199, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06272151, + "balance_loss_mlp": 0.01258612, + "epoch": 0.8301217495866526, + "flos": 19908151418880.0, + "grad_norm": 1.6678953757169233, + "language_loss": 0.6362474, + "learning_rate": 2.950707932112444e-07, + "loss": 0.71301973, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10424805, + "step": 13807, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.06403744, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06271553, + "balance_loss_mlp": 0.01254712, + "epoch": 0.8301818728393207, + "flos": 19721334741120.0, + "grad_norm": 1.7549844688218141, + "language_loss": 0.73209536, + "learning_rate": 2.948672208338847e-07, + "loss": 0.80878258, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.1026001, + "step": 13808, + "time_per_iteration": 2.5253429412841797 + }, + { + "auxiliary_loss_clip": 0.06410588, + "auxiliary_loss_mlp": 0.01271132, + "balance_loss_clip": 0.06272304, + "balance_loss_mlp": 0.01259962, + "epoch": 0.8302419960919886, + "flos": 28301265417600.0, + "grad_norm": 1.9399976077342271, + "language_loss": 0.66693079, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.74374801, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11169434, + "step": 13809, + "time_per_iteration": 2.5805299282073975 + }, + { + "auxiliary_loss_clip": 0.06404272, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.0125543, + "epoch": 0.8303021193446566, + "flos": 18229344140160.0, + "grad_norm": 2.150755697017939, + "language_loss": 0.74353659, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.82022887, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09515381, + "step": 13810, + "time_per_iteration": 4.044435739517212 + }, + { + "auxiliary_loss_clip": 0.06400876, + "auxiliary_loss_mlp": 0.01262766, + "balance_loss_clip": 0.06271921, + "balance_loss_mlp": 0.01253873, + "epoch": 0.8303622425973245, + "flos": 23117956174080.0, + "grad_norm": 1.448926431854177, + "language_loss": 0.80966514, + "learning_rate": 2.94256891685505e-07, + "loss": 0.88630158, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08886719, + "step": 13811, + "time_per_iteration": 2.5290420055389404 + }, + { + "auxiliary_loss_clip": 0.06407966, + "auxiliary_loss_mlp": 0.01264465, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.0125503, + "epoch": 0.8304223658499925, + "flos": 19578891600000.0, + "grad_norm": 1.6908085329827338, + "language_loss": 0.73443186, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.81115615, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09442139, + "step": 13812, + "time_per_iteration": 2.568941593170166 + }, + { + "auxiliary_loss_clip": 0.06397095, + "auxiliary_loss_mlp": 0.01262647, + "balance_loss_clip": 0.06270333, + "balance_loss_mlp": 0.0125311, + "epoch": 0.8304824891026604, + "flos": 24433066805760.0, + "grad_norm": 1.5937291888664733, + "language_loss": 0.78513122, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.86172867, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09539795, + "step": 13813, + "time_per_iteration": 3.97314715385437 + }, + { + "auxiliary_loss_clip": 0.06403156, + "auxiliary_loss_mlp": 0.01268699, + "balance_loss_clip": 0.06268767, + "balance_loss_mlp": 0.01259037, + "epoch": 0.8305426123553284, + "flos": 22388214965760.0, + "grad_norm": 2.2493046221779154, + "language_loss": 0.70725965, + "learning_rate": 2.93647144674658e-07, + "loss": 0.78397816, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09661865, + "step": 13814, + "time_per_iteration": 2.4843966960906982 + }, + { + "auxiliary_loss_clip": 0.06417993, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06274544, + "balance_loss_mlp": 0.0125395, + "epoch": 0.8306027356079964, + "flos": 14908975522560.0, + "grad_norm": 1.9454896448298435, + "language_loss": 0.68298322, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.75981534, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.112854, + "step": 13815, + "time_per_iteration": 2.46174955368042 + }, + { + "auxiliary_loss_clip": 0.06407799, + "auxiliary_loss_mlp": 0.01266189, + "balance_loss_clip": 0.06275922, + "balance_loss_mlp": 0.01256813, + "epoch": 0.8306628588606644, + "flos": 19650406659840.0, + "grad_norm": 1.8213318920984873, + "language_loss": 0.75822055, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.83496046, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09375, + "step": 13816, + "time_per_iteration": 2.4854626655578613 + }, + { + "auxiliary_loss_clip": 0.0640255, + "auxiliary_loss_mlp": 0.01267592, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01257877, + "epoch": 0.8307229821133323, + "flos": 24396701333760.0, + "grad_norm": 1.6852177652556903, + "language_loss": 0.81272721, + "learning_rate": 2.930379800094371e-07, + "loss": 0.88942862, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09716797, + "step": 13817, + "time_per_iteration": 2.5231449604034424 + }, + { + "auxiliary_loss_clip": 0.06404524, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.0125544, + "epoch": 0.8307831053660003, + "flos": 21003392136960.0, + "grad_norm": 8.152901765268279, + "language_loss": 0.78097743, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.85767841, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10137939, + "step": 13818, + "time_per_iteration": 2.485922336578369 + }, + { + "auxiliary_loss_clip": 0.06407157, + "auxiliary_loss_mlp": 0.01267645, + "balance_loss_clip": 0.06273211, + "balance_loss_mlp": 0.01258055, + "epoch": 0.8308432286186682, + "flos": 21403663891200.0, + "grad_norm": 1.7425405604946866, + "language_loss": 0.81941187, + "learning_rate": 2.926321938606453e-07, + "loss": 0.89615989, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.0958252, + "step": 13819, + "time_per_iteration": 2.502380609512329 + }, + { + "auxiliary_loss_clip": 0.06311036, + "auxiliary_loss_mlp": 0.01249, + "balance_loss_clip": 0.06256118, + "balance_loss_mlp": 0.01247877, + "epoch": 0.8309033518713362, + "flos": 62549724625920.0, + "grad_norm": 0.7595557497085774, + "language_loss": 0.56252456, + "learning_rate": 2.924293978977399e-07, + "loss": 0.63812494, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01125336, + "step": 13820, + "time_per_iteration": 3.130770206451416 + }, + { + "auxiliary_loss_clip": 0.06398563, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06269278, + "balance_loss_mlp": 0.0125549, + "epoch": 0.8309634751240043, + "flos": 16984155340800.0, + "grad_norm": 2.741466528675375, + "language_loss": 0.68642658, + "learning_rate": 2.922266666860831e-07, + "loss": 0.76306164, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09460449, + "step": 13821, + "time_per_iteration": 2.4699923992156982 + }, + { + "auxiliary_loss_clip": 0.06413125, + "auxiliary_loss_mlp": 0.01271837, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01261227, + "epoch": 0.8310235983766722, + "flos": 22681067385600.0, + "grad_norm": 2.0343002066143656, + "language_loss": 0.69761801, + "learning_rate": 2.920240002333625e-07, + "loss": 0.77446771, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10614014, + "step": 13822, + "time_per_iteration": 2.5079588890075684 + }, + { + "auxiliary_loss_clip": 0.06400213, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06271067, + "balance_loss_mlp": 0.01256657, + "epoch": 0.8310837216293402, + "flos": 30819539226240.0, + "grad_norm": 1.7328336243228404, + "language_loss": 0.62461919, + "learning_rate": 2.918213985472631e-07, + "loss": 0.70128155, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09362793, + "step": 13823, + "time_per_iteration": 2.5789008140563965 + }, + { + "auxiliary_loss_clip": 0.06309561, + "auxiliary_loss_mlp": 0.01248333, + "balance_loss_clip": 0.06254762, + "balance_loss_mlp": 0.01247223, + "epoch": 0.8311438448820081, + "flos": 71297338521600.0, + "grad_norm": 1.1093680468899019, + "language_loss": 0.61912626, + "learning_rate": 2.916188616354669e-07, + "loss": 0.69470519, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01111603, + "step": 13824, + "time_per_iteration": 3.2229104042053223 + }, + { + "auxiliary_loss_clip": 0.06403864, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06271043, + "balance_loss_mlp": 0.01255312, + "epoch": 0.8312039681346761, + "flos": 20893457180160.0, + "grad_norm": 1.4744362315601292, + "language_loss": 0.74351555, + "learning_rate": 2.914163895056552e-07, + "loss": 0.82020307, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09576416, + "step": 13825, + "time_per_iteration": 2.517179250717163 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01255022, + "epoch": 0.831264091387344, + "flos": 17022910654080.0, + "grad_norm": 2.2747419309497454, + "language_loss": 0.80132711, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.87806225, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10333252, + "step": 13826, + "time_per_iteration": 2.525205612182617 + }, + { + "auxiliary_loss_clip": 0.06405398, + "auxiliary_loss_mlp": 0.01267526, + "balance_loss_clip": 0.06271683, + "balance_loss_mlp": 0.01257417, + "epoch": 0.831324214640012, + "flos": 24425436084480.0, + "grad_norm": 1.5111655704985965, + "language_loss": 0.68116403, + "learning_rate": 2.910116396226914e-07, + "loss": 0.75789326, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10101318, + "step": 13827, + "time_per_iteration": 2.5607199668884277 + }, + { + "auxiliary_loss_clip": 0.06401044, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06268896, + "balance_loss_mlp": 0.01257164, + "epoch": 0.83138433789268, + "flos": 13549407500160.0, + "grad_norm": 1.7373805058539677, + "language_loss": 0.74242985, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.81909966, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08770752, + "step": 13828, + "time_per_iteration": 2.5458273887634277 + }, + { + "auxiliary_loss_clip": 0.06403871, + "auxiliary_loss_mlp": 0.01267016, + "balance_loss_clip": 0.06269043, + "balance_loss_mlp": 0.01257461, + "epoch": 0.831444461145348, + "flos": 44502543262080.0, + "grad_norm": 1.528950624080937, + "language_loss": 0.67366755, + "learning_rate": 2.906071489597657e-07, + "loss": 0.75037646, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09552002, + "step": 13829, + "time_per_iteration": 2.7112882137298584 + }, + { + "auxiliary_loss_clip": 0.06407791, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01255685, + "epoch": 0.8315045843980159, + "flos": 22710640677120.0, + "grad_norm": 1.4737259193269003, + "language_loss": 0.83000511, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.90673393, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09411621, + "step": 13830, + "time_per_iteration": 2.552797794342041 + }, + { + "auxiliary_loss_clip": 0.06401931, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06269692, + "balance_loss_mlp": 0.01257621, + "epoch": 0.8315647076506839, + "flos": 16879167774720.0, + "grad_norm": 2.538750938791545, + "language_loss": 0.74429476, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.8209852, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0949707, + "step": 13831, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.06402907, + "auxiliary_loss_mlp": 0.01264485, + "balance_loss_clip": 0.06268609, + "balance_loss_mlp": 0.01254209, + "epoch": 0.8316248309033518, + "flos": 13813902512640.0, + "grad_norm": 1.6232172408700758, + "language_loss": 0.71379286, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.79046679, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10272217, + "step": 13832, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01266599, + "balance_loss_clip": 0.06269842, + "balance_loss_mlp": 0.01256532, + "epoch": 0.8316849541560198, + "flos": 23519066469120.0, + "grad_norm": 1.7239960485103385, + "language_loss": 0.84317935, + "learning_rate": 2.897989455393979e-07, + "loss": 0.91986877, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10076904, + "step": 13833, + "time_per_iteration": 2.5225701332092285 + }, + { + "auxiliary_loss_clip": 0.06408376, + "auxiliary_loss_mlp": 0.01269207, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.01258955, + "epoch": 0.8317450774086879, + "flos": 23778530236800.0, + "grad_norm": 1.4639374420943632, + "language_loss": 0.76301664, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.83979249, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10253906, + "step": 13834, + "time_per_iteration": 2.6075844764709473 + }, + { + "auxiliary_loss_clip": 0.06396806, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06267913, + "balance_loss_mlp": 0.01256846, + "epoch": 0.8318052006613558, + "flos": 16220899699200.0, + "grad_norm": 2.069589955376862, + "language_loss": 0.79849654, + "learning_rate": 2.893952329045459e-07, + "loss": 0.87512666, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09356689, + "step": 13835, + "time_per_iteration": 3.9197564125061035 + }, + { + "auxiliary_loss_clip": 0.06407574, + "auxiliary_loss_mlp": 0.01269404, + "balance_loss_clip": 0.06272114, + "balance_loss_mlp": 0.01258651, + "epoch": 0.8318653239140238, + "flos": 19980714654720.0, + "grad_norm": 1.9805915742571252, + "language_loss": 0.81482506, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.89159477, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10760498, + "step": 13836, + "time_per_iteration": 2.506603240966797 + }, + { + "auxiliary_loss_clip": 0.06401465, + "auxiliary_loss_mlp": 0.01266316, + "balance_loss_clip": 0.06271641, + "balance_loss_mlp": 0.01257041, + "epoch": 0.8319254471666917, + "flos": 17709200720640.0, + "grad_norm": 1.8870445084181289, + "language_loss": 0.77578962, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.85246742, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09277344, + "step": 13837, + "time_per_iteration": 2.454270839691162 + }, + { + "auxiliary_loss_clip": 0.06408006, + "auxiliary_loss_mlp": 0.01264544, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01253749, + "epoch": 0.8319855704193597, + "flos": 19542609982080.0, + "grad_norm": 2.110524167983125, + "language_loss": 0.8394767, + "learning_rate": 2.887901504686685e-07, + "loss": 0.91620213, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10791016, + "step": 13838, + "time_per_iteration": 2.5159199237823486 + }, + { + "auxiliary_loss_clip": 0.06400914, + "auxiliary_loss_mlp": 0.0126734, + "balance_loss_clip": 0.06270094, + "balance_loss_mlp": 0.01257595, + "epoch": 0.8320456936720276, + "flos": 21184339029120.0, + "grad_norm": 1.9156833366254606, + "language_loss": 0.74626046, + "learning_rate": 2.885885860916795e-07, + "loss": 0.82294297, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09753418, + "step": 13839, + "time_per_iteration": 2.491990327835083 + }, + { + "auxiliary_loss_clip": 0.06401457, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01256871, + "epoch": 0.8321058169246957, + "flos": 33258499545600.0, + "grad_norm": 1.3285467240980675, + "language_loss": 0.6792466, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.7559312, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10125732, + "step": 13840, + "time_per_iteration": 2.6044259071350098 + }, + { + "auxiliary_loss_clip": 0.06402262, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01254864, + "epoch": 0.8321659401773636, + "flos": 14213042236800.0, + "grad_norm": 4.021967682846655, + "language_loss": 0.79046482, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.86713463, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09857178, + "step": 13841, + "time_per_iteration": 2.504321813583374 + }, + { + "auxiliary_loss_clip": 0.06404769, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.06273419, + "balance_loss_mlp": 0.01255116, + "epoch": 0.8322260634300316, + "flos": 15163575753600.0, + "grad_norm": 1.790117375766772, + "language_loss": 0.6903125, + "learning_rate": 2.879842823726262e-07, + "loss": 0.7670114, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10003662, + "step": 13842, + "time_per_iteration": 2.604609727859497 + }, + { + "auxiliary_loss_clip": 0.06401818, + "auxiliary_loss_mlp": 0.0126278, + "balance_loss_clip": 0.06271365, + "balance_loss_mlp": 0.0125335, + "epoch": 0.8322861866826995, + "flos": 25307766777600.0, + "grad_norm": 1.5988272572181073, + "language_loss": 0.7293849, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.80603087, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09429932, + "step": 13843, + "time_per_iteration": 2.5483405590057373 + }, + { + "auxiliary_loss_clip": 0.06402604, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.012557, + "epoch": 0.8323463099353675, + "flos": 17025048933120.0, + "grad_norm": 1.8400123235458858, + "language_loss": 0.77913845, + "learning_rate": 2.875817378128975e-07, + "loss": 0.85582101, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09960938, + "step": 13844, + "time_per_iteration": 2.4670820236206055 + }, + { + "auxiliary_loss_clip": 0.06305504, + "auxiliary_loss_mlp": 0.01249937, + "balance_loss_clip": 0.06250882, + "balance_loss_mlp": 0.01248978, + "epoch": 0.8324064331880354, + "flos": 55623891473280.0, + "grad_norm": 0.7656518325639754, + "language_loss": 0.55256236, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.62811679, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00958252, + "step": 13845, + "time_per_iteration": 3.0772175788879395 + }, + { + "auxiliary_loss_clip": 0.06407619, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06272799, + "balance_loss_mlp": 0.01259436, + "epoch": 0.8324665564407034, + "flos": 26145472371840.0, + "grad_norm": 1.647375417376456, + "language_loss": 0.75653505, + "learning_rate": 2.871794529934555e-07, + "loss": 0.83331025, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10467529, + "step": 13846, + "time_per_iteration": 3.9581072330474854 + }, + { + "auxiliary_loss_clip": 0.06408981, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.0627064, + "balance_loss_mlp": 0.01253064, + "epoch": 0.8325266796933715, + "flos": 22054846296960.0, + "grad_norm": 1.6287665885130769, + "language_loss": 0.79051238, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.86723363, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10083008, + "step": 13847, + "time_per_iteration": 2.5202043056488037 + }, + { + "auxiliary_loss_clip": 0.06398055, + "auxiliary_loss_mlp": 0.01262414, + "balance_loss_clip": 0.06268965, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8325868029460394, + "flos": 22822630058880.0, + "grad_norm": 1.530549975631268, + "language_loss": 0.74613917, + "learning_rate": 2.867774279753175e-07, + "loss": 0.82274389, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08660889, + "step": 13848, + "time_per_iteration": 2.4909098148345947 + }, + { + "auxiliary_loss_clip": 0.06400839, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06268533, + "balance_loss_mlp": 0.01254698, + "epoch": 0.8326469261987074, + "flos": 14762800874880.0, + "grad_norm": 1.7394702497172616, + "language_loss": 0.63918781, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.71583879, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09552002, + "step": 13849, + "time_per_iteration": 3.910769462585449 + }, + { + "auxiliary_loss_clip": 0.06406453, + "auxiliary_loss_mlp": 0.01263072, + "balance_loss_clip": 0.06271137, + "balance_loss_mlp": 0.01253517, + "epoch": 0.8327070494513753, + "flos": 22932145745280.0, + "grad_norm": 2.1227901634386503, + "language_loss": 0.80123168, + "learning_rate": 2.863756628194638e-07, + "loss": 0.87792695, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09558105, + "step": 13850, + "time_per_iteration": 2.566984176635742 + }, + { + "auxiliary_loss_clip": 0.06396942, + "auxiliary_loss_mlp": 0.01264144, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01255215, + "epoch": 0.8327671727040433, + "flos": 20671197425280.0, + "grad_norm": 1.4808337562018643, + "language_loss": 0.7818718, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.85848272, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08929443, + "step": 13851, + "time_per_iteration": 2.5099880695343018 + }, + { + "auxiliary_loss_clip": 0.06312843, + "auxiliary_loss_mlp": 0.01249612, + "balance_loss_clip": 0.0625807, + "balance_loss_mlp": 0.01248607, + "epoch": 0.8328272959567112, + "flos": 56079353940480.0, + "grad_norm": 0.7536621450911318, + "language_loss": 0.55871034, + "learning_rate": 2.859741575868344e-07, + "loss": 0.63433486, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01004028, + "step": 13852, + "time_per_iteration": 4.512012481689453 + }, + { + "auxiliary_loss_clip": 0.06398302, + "auxiliary_loss_mlp": 0.01263734, + "balance_loss_clip": 0.06268968, + "balance_loss_mlp": 0.01254489, + "epoch": 0.8328874192093793, + "flos": 32310691286400.0, + "grad_norm": 1.6000652878279704, + "language_loss": 0.67475963, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.75137997, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09234619, + "step": 13853, + "time_per_iteration": 2.668245792388916 + }, + { + "auxiliary_loss_clip": 0.06401832, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06269578, + "balance_loss_mlp": 0.01256078, + "epoch": 0.8329475424620472, + "flos": 23519276104320.0, + "grad_norm": 1.4809556144890181, + "language_loss": 0.78642273, + "learning_rate": 2.855729123383286e-07, + "loss": 0.86309314, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09124756, + "step": 13854, + "time_per_iteration": 2.5354175567626953 + }, + { + "auxiliary_loss_clip": 0.06309453, + "auxiliary_loss_mlp": 0.0124937, + "balance_loss_clip": 0.06254782, + "balance_loss_mlp": 0.01248336, + "epoch": 0.8330076657147152, + "flos": 67860410474880.0, + "grad_norm": 4.455324750963288, + "language_loss": 0.58546513, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.66105336, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01035309, + "step": 13855, + "time_per_iteration": 3.0676519870758057 + }, + { + "auxiliary_loss_clip": 0.06402063, + "auxiliary_loss_mlp": 0.01267065, + "balance_loss_clip": 0.06270753, + "balance_loss_mlp": 0.01257623, + "epoch": 0.8330677889673831, + "flos": 22899344071680.0, + "grad_norm": 1.6055070221032506, + "language_loss": 0.72260499, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.79929626, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09442139, + "step": 13856, + "time_per_iteration": 2.50966477394104 + }, + { + "auxiliary_loss_clip": 0.06400804, + "auxiliary_loss_mlp": 0.01263391, + "balance_loss_clip": 0.06269211, + "balance_loss_mlp": 0.01254302, + "epoch": 0.8331279122200511, + "flos": 27352492836480.0, + "grad_norm": 1.6328456842097132, + "language_loss": 0.75703955, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.83368158, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09088135, + "step": 13857, + "time_per_iteration": 2.582209348678589 + }, + { + "auxiliary_loss_clip": 0.06396064, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.06270817, + "balance_loss_mlp": 0.01256412, + "epoch": 0.833188035472719, + "flos": 19944349182720.0, + "grad_norm": 1.4115605365578703, + "language_loss": 0.73776948, + "learning_rate": 2.847712020370958e-07, + "loss": 0.81437761, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08343506, + "step": 13858, + "time_per_iteration": 2.4927241802215576 + }, + { + "auxiliary_loss_clip": 0.06408291, + "auxiliary_loss_mlp": 0.01263657, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01253077, + "epoch": 0.833248158725387, + "flos": 15238193414400.0, + "grad_norm": 1.8777327656699931, + "language_loss": 0.73586631, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.81258577, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10577393, + "step": 13859, + "time_per_iteration": 2.486278772354126 + }, + { + "auxiliary_loss_clip": 0.06396432, + "auxiliary_loss_mlp": 0.01261836, + "balance_loss_clip": 0.06268351, + "balance_loss_mlp": 0.01253068, + "epoch": 0.8333082819780551, + "flos": 24542498638080.0, + "grad_norm": 1.8006360517161475, + "language_loss": 0.79587913, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.8724618, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08758545, + "step": 13860, + "time_per_iteration": 2.5347378253936768 + }, + { + "auxiliary_loss_clip": 0.06397815, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06269313, + "balance_loss_mlp": 0.01253587, + "epoch": 0.833368405230723, + "flos": 31475459387520.0, + "grad_norm": 1.331184598111947, + "language_loss": 0.82059163, + "learning_rate": 2.841706022218644e-07, + "loss": 0.8972038, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09814453, + "step": 13861, + "time_per_iteration": 2.596620798110962 + }, + { + "auxiliary_loss_clip": 0.06403266, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06269847, + "balance_loss_mlp": 0.01253612, + "epoch": 0.833428528483391, + "flos": 14907969273600.0, + "grad_norm": 2.0930392556912447, + "language_loss": 0.79152417, + "learning_rate": 2.839705324021806e-07, + "loss": 0.86818981, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09686279, + "step": 13862, + "time_per_iteration": 2.4472010135650635 + }, + { + "auxiliary_loss_clip": 0.06405707, + "auxiliary_loss_mlp": 0.01262183, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.0125279, + "epoch": 0.8334886517360589, + "flos": 22206303751680.0, + "grad_norm": 1.6507722224166845, + "language_loss": 0.74980801, + "learning_rate": 2.83770527654505e-07, + "loss": 0.82648689, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09399414, + "step": 13863, + "time_per_iteration": 2.494450569152832 + }, + { + "auxiliary_loss_clip": 0.06399452, + "auxiliary_loss_mlp": 0.01266138, + "balance_loss_clip": 0.06272257, + "balance_loss_mlp": 0.01256995, + "epoch": 0.8335487749887269, + "flos": 30380386377600.0, + "grad_norm": 5.135787436980748, + "language_loss": 0.74829161, + "learning_rate": 2.835705879864232e-07, + "loss": 0.82494748, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09136963, + "step": 13864, + "time_per_iteration": 2.5583794116973877 + }, + { + "auxiliary_loss_clip": 0.06403541, + "auxiliary_loss_mlp": 0.01261902, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01252132, + "epoch": 0.8336088982413948, + "flos": 24688086307200.0, + "grad_norm": 1.7939101265667057, + "language_loss": 0.69765973, + "learning_rate": 2.833707134055168e-07, + "loss": 0.77431417, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 13865, + "time_per_iteration": 2.534938097000122 + }, + { + "auxiliary_loss_clip": 0.06400782, + "auxiliary_loss_mlp": 0.01264858, + "balance_loss_clip": 0.06268555, + "balance_loss_mlp": 0.01254814, + "epoch": 0.8336690214940629, + "flos": 38185783038720.0, + "grad_norm": 1.4964179575406336, + "language_loss": 0.75587916, + "learning_rate": 2.831709039193653e-07, + "loss": 0.83253551, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10046387, + "step": 13866, + "time_per_iteration": 2.6298201084136963 + }, + { + "auxiliary_loss_clip": 0.06310411, + "auxiliary_loss_mlp": 0.01251665, + "balance_loss_clip": 0.06255429, + "balance_loss_mlp": 0.01250576, + "epoch": 0.8337291447467308, + "flos": 55580062988160.0, + "grad_norm": 0.8509039314990504, + "language_loss": 0.6284281, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.70404887, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01091003, + "step": 13867, + "time_per_iteration": 3.0660109519958496 + }, + { + "auxiliary_loss_clip": 0.06398972, + "auxiliary_loss_mlp": 0.01264557, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01255515, + "epoch": 0.8337892679993988, + "flos": 24140340167040.0, + "grad_norm": 1.8547798231476953, + "language_loss": 0.72195852, + "learning_rate": 2.827714802616301e-07, + "loss": 0.79859376, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09039307, + "step": 13868, + "time_per_iteration": 2.5227153301239014 + }, + { + "auxiliary_loss_clip": 0.06403849, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01255406, + "epoch": 0.8338493912520667, + "flos": 28191456241920.0, + "grad_norm": 1.3524554239509516, + "language_loss": 0.8040902, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.88077855, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0958252, + "step": 13869, + "time_per_iteration": 2.564680814743042 + }, + { + "auxiliary_loss_clip": 0.06403009, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01257818, + "epoch": 0.8339095145047347, + "flos": 22163984640000.0, + "grad_norm": 1.4875129545200938, + "language_loss": 0.82728314, + "learning_rate": 2.823723170738028e-07, + "loss": 0.90398633, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09515381, + "step": 13870, + "time_per_iteration": 2.5508410930633545 + }, + { + "auxiliary_loss_clip": 0.06403069, + "auxiliary_loss_mlp": 0.01263716, + "balance_loss_clip": 0.0626779, + "balance_loss_mlp": 0.01253732, + "epoch": 0.8339696377574026, + "flos": 17312157348480.0, + "grad_norm": 2.4426569314724897, + "language_loss": 0.70744705, + "learning_rate": 2.821728331750264e-07, + "loss": 0.78411496, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09985352, + "step": 13871, + "time_per_iteration": 2.4675514698028564 + }, + { + "auxiliary_loss_clip": 0.06398696, + "auxiliary_loss_mlp": 0.01268514, + "balance_loss_clip": 0.06271016, + "balance_loss_mlp": 0.01259239, + "epoch": 0.8340297610100706, + "flos": 20674719296640.0, + "grad_norm": 1.8163865761424458, + "language_loss": 0.69741249, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.77408463, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0927124, + "step": 13872, + "time_per_iteration": 2.491567850112915 + }, + { + "auxiliary_loss_clip": 0.06401074, + "auxiliary_loss_mlp": 0.01264411, + "balance_loss_clip": 0.06269651, + "balance_loss_mlp": 0.01255393, + "epoch": 0.8340898842627387, + "flos": 20520620438400.0, + "grad_norm": 1.9144712990345532, + "language_loss": 0.73314548, + "learning_rate": 2.817740608055712e-07, + "loss": 0.80980027, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09014893, + "step": 13873, + "time_per_iteration": 2.5135762691497803 + }, + { + "auxiliary_loss_clip": 0.0640478, + "auxiliary_loss_mlp": 0.01264886, + "balance_loss_clip": 0.06268793, + "balance_loss_mlp": 0.01253406, + "epoch": 0.8341500075154066, + "flos": 21430889268480.0, + "grad_norm": 1.9289693759151987, + "language_loss": 0.75107884, + "learning_rate": 2.81574772350013e-07, + "loss": 0.82777548, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1149292, + "step": 13874, + "time_per_iteration": 3.931234836578369 + }, + { + "auxiliary_loss_clip": 0.06398948, + "auxiliary_loss_mlp": 0.01263903, + "balance_loss_clip": 0.06270257, + "balance_loss_mlp": 0.0125433, + "epoch": 0.8342101307680746, + "flos": 22097542752000.0, + "grad_norm": 1.82369189329911, + "language_loss": 0.66693133, + "learning_rate": 2.813755490573118e-07, + "loss": 0.74355984, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09570312, + "step": 13875, + "time_per_iteration": 2.5164341926574707 + }, + { + "auxiliary_loss_clip": 0.06399906, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.06269918, + "balance_loss_mlp": 0.01256301, + "epoch": 0.8342702540207425, + "flos": 21877882473600.0, + "grad_norm": 1.7413436247771745, + "language_loss": 0.80487454, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.88152719, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09051514, + "step": 13876, + "time_per_iteration": 2.5353636741638184 + }, + { + "auxiliary_loss_clip": 0.06400505, + "auxiliary_loss_mlp": 0.01261691, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01251999, + "epoch": 0.8343303772734105, + "flos": 22535060446080.0, + "grad_norm": 1.96733671294141, + "language_loss": 0.87216544, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.94878739, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09686279, + "step": 13877, + "time_per_iteration": 2.505556583404541 + }, + { + "auxiliary_loss_clip": 0.0640115, + "auxiliary_loss_mlp": 0.01262747, + "balance_loss_clip": 0.06269793, + "balance_loss_mlp": 0.01253902, + "epoch": 0.8343905005260784, + "flos": 14945131359360.0, + "grad_norm": 1.8494974533553123, + "language_loss": 0.69351619, + "learning_rate": 2.807782702318828e-07, + "loss": 0.77015519, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08843994, + "step": 13878, + "time_per_iteration": 2.5137927532196045 + }, + { + "auxiliary_loss_clip": 0.0640167, + "auxiliary_loss_mlp": 0.01265368, + "balance_loss_clip": 0.06269883, + "balance_loss_mlp": 0.01255897, + "epoch": 0.8344506237787465, + "flos": 15017778449280.0, + "grad_norm": 1.9676517124492925, + "language_loss": 0.79576242, + "learning_rate": 2.805793076661309e-07, + "loss": 0.87243277, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0947876, + "step": 13879, + "time_per_iteration": 2.474787950515747 + }, + { + "auxiliary_loss_clip": 0.06397855, + "auxiliary_loss_mlp": 0.01264416, + "balance_loss_clip": 0.06268258, + "balance_loss_mlp": 0.0125609, + "epoch": 0.8345107470314144, + "flos": 17565122424960.0, + "grad_norm": 1.9102558295245906, + "language_loss": 0.83550584, + "learning_rate": 2.803804103009828e-07, + "loss": 0.91212851, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08331299, + "step": 13880, + "time_per_iteration": 2.5329551696777344 + }, + { + "auxiliary_loss_clip": 0.06401896, + "auxiliary_loss_mlp": 0.01263382, + "balance_loss_clip": 0.0626949, + "balance_loss_mlp": 0.0125366, + "epoch": 0.8345708702840824, + "flos": 25193513335680.0, + "grad_norm": 1.6329117748195123, + "language_loss": 0.78477925, + "learning_rate": 2.80181578143982e-07, + "loss": 0.86143202, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09716797, + "step": 13881, + "time_per_iteration": 2.5319807529449463 + }, + { + "auxiliary_loss_clip": 0.06393664, + "auxiliary_loss_mlp": 0.01268201, + "balance_loss_clip": 0.06268856, + "balance_loss_mlp": 0.01260118, + "epoch": 0.8346309935367503, + "flos": 15088580749440.0, + "grad_norm": 2.3152636189856306, + "language_loss": 0.79627961, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.87289822, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08078003, + "step": 13882, + "time_per_iteration": 2.4848222732543945 + }, + { + "auxiliary_loss_clip": 0.06398013, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06268071, + "balance_loss_mlp": 0.01257041, + "epoch": 0.8346911167894183, + "flos": 22937386625280.0, + "grad_norm": 1.9057723326308558, + "language_loss": 0.81047702, + "learning_rate": 2.79784109484579e-07, + "loss": 0.88712454, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09692383, + "step": 13883, + "time_per_iteration": 2.50827956199646 + }, + { + "auxiliary_loss_clip": 0.06402916, + "auxiliary_loss_mlp": 0.01265732, + "balance_loss_clip": 0.0626992, + "balance_loss_mlp": 0.0125632, + "epoch": 0.8347512400420862, + "flos": 20199159048960.0, + "grad_norm": 2.2082056544036637, + "language_loss": 0.74074388, + "learning_rate": 2.795854729972482e-07, + "loss": 0.81743038, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09417725, + "step": 13884, + "time_per_iteration": 2.507692813873291 + }, + { + "auxiliary_loss_clip": 0.06410012, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253382, + "epoch": 0.8348113632947542, + "flos": 25961422878720.0, + "grad_norm": 2.212491110586892, + "language_loss": 0.70608038, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.78282022, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10595703, + "step": 13885, + "time_per_iteration": 3.943305492401123 + }, + { + "auxiliary_loss_clip": 0.06403215, + "auxiliary_loss_mlp": 0.01261876, + "balance_loss_clip": 0.06270201, + "balance_loss_mlp": 0.01252345, + "epoch": 0.8348714865474223, + "flos": 34213183839360.0, + "grad_norm": 1.4992796639632324, + "language_loss": 0.69971478, + "learning_rate": 2.791883957449912e-07, + "loss": 0.7763657, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09533691, + "step": 13886, + "time_per_iteration": 2.606997013092041 + }, + { + "auxiliary_loss_clip": 0.06399034, + "auxiliary_loss_mlp": 0.01263136, + "balance_loss_clip": 0.06269737, + "balance_loss_mlp": 0.01253892, + "epoch": 0.8349316098000902, + "flos": 24397162531200.0, + "grad_norm": 3.511326627037885, + "language_loss": 0.79448175, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.87110341, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0925293, + "step": 13887, + "time_per_iteration": 2.534818172454834 + }, + { + "auxiliary_loss_clip": 0.06410402, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8349917330527582, + "flos": 23038307268480.0, + "grad_norm": 2.205099646078452, + "language_loss": 0.63997847, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.71674013, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10467529, + "step": 13888, + "time_per_iteration": 2.494823455810547 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01264348, + "balance_loss_clip": 0.06267237, + "balance_loss_mlp": 0.01254453, + "epoch": 0.8350518563054261, + "flos": 13630943122560.0, + "grad_norm": 2.167942528379587, + "language_loss": 0.66939718, + "learning_rate": 2.785932692855244e-07, + "loss": 0.74607974, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09893799, + "step": 13889, + "time_per_iteration": 4.012948513031006 + }, + { + "auxiliary_loss_clip": 0.06399906, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06270322, + "balance_loss_mlp": 0.01255284, + "epoch": 0.8351119795580941, + "flos": 21586204010880.0, + "grad_norm": 1.8598402777124405, + "language_loss": 0.69043732, + "learning_rate": 2.783950243408399e-07, + "loss": 0.76707888, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08978271, + "step": 13890, + "time_per_iteration": 2.538703203201294 + }, + { + "auxiliary_loss_clip": 0.06405049, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01257546, + "epoch": 0.835172102810762, + "flos": 20042921911680.0, + "grad_norm": 2.381877208795948, + "language_loss": 0.59337091, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.67009991, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10284424, + "step": 13891, + "time_per_iteration": 2.476916551589966 + }, + { + "auxiliary_loss_clip": 0.06403613, + "auxiliary_loss_mlp": 0.01261397, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01252373, + "epoch": 0.8352322260634301, + "flos": 25117344374400.0, + "grad_norm": 1.5227027869920424, + "language_loss": 0.72106713, + "learning_rate": 2.779987303092846e-07, + "loss": 0.79771721, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09020996, + "step": 13892, + "time_per_iteration": 3.963770627975464 + }, + { + "auxiliary_loss_clip": 0.06396841, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.01256158, + "epoch": 0.835292349316098, + "flos": 24870752208000.0, + "grad_norm": 1.5147233284160702, + "language_loss": 0.65907598, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.73570037, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09436035, + "step": 13893, + "time_per_iteration": 2.5296645164489746 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01262319, + "balance_loss_clip": 0.06268641, + "balance_loss_mlp": 0.01253385, + "epoch": 0.835352472568766, + "flos": 19871785946880.0, + "grad_norm": 2.485102746656012, + "language_loss": 0.78644305, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.86307693, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0892334, + "step": 13894, + "time_per_iteration": 2.5228044986724854 + }, + { + "auxiliary_loss_clip": 0.06396501, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06271871, + "balance_loss_mlp": 0.01255949, + "epoch": 0.8354125958214339, + "flos": 22061344988160.0, + "grad_norm": 1.604151565001046, + "language_loss": 0.72635913, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.80297017, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08654785, + "step": 13895, + "time_per_iteration": 2.5038208961486816 + }, + { + "auxiliary_loss_clip": 0.06404788, + "auxiliary_loss_mlp": 0.01268276, + "balance_loss_clip": 0.06268989, + "balance_loss_mlp": 0.01257661, + "epoch": 0.8354727190741019, + "flos": 21404250869760.0, + "grad_norm": 1.9382861122392194, + "language_loss": 0.7216146, + "learning_rate": 2.772069258877667e-07, + "loss": 0.79834521, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10601807, + "step": 13896, + "time_per_iteration": 2.5257046222686768 + }, + { + "auxiliary_loss_clip": 0.06398962, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06270335, + "balance_loss_mlp": 0.01259211, + "epoch": 0.8355328423267698, + "flos": 50852230940160.0, + "grad_norm": 2.03682748324138, + "language_loss": 0.58762497, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6642977, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09106445, + "step": 13897, + "time_per_iteration": 2.7454147338867188 + }, + { + "auxiliary_loss_clip": 0.06307182, + "auxiliary_loss_mlp": 0.01250088, + "balance_loss_clip": 0.06252273, + "balance_loss_mlp": 0.0124903, + "epoch": 0.8355929655794379, + "flos": 65571901361280.0, + "grad_norm": 0.7124810299660945, + "language_loss": 0.57679689, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.65236962, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.0105896, + "step": 13898, + "time_per_iteration": 3.2076830863952637 + }, + { + "auxiliary_loss_clip": 0.06407744, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01258861, + "epoch": 0.8356530888321058, + "flos": 19176313858560.0, + "grad_norm": 1.7011409569690659, + "language_loss": 0.80137974, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.87815416, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10839844, + "step": 13899, + "time_per_iteration": 2.5566153526306152 + }, + { + "auxiliary_loss_clip": 0.06400511, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06267878, + "balance_loss_mlp": 0.01256898, + "epoch": 0.8357132120847738, + "flos": 44136624481920.0, + "grad_norm": 1.5711758150102046, + "language_loss": 0.69132239, + "learning_rate": 2.764161667219749e-07, + "loss": 0.76798737, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09088135, + "step": 13900, + "time_per_iteration": 2.7178146839141846 + }, + { + "auxiliary_loss_clip": 0.06403154, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06271988, + "balance_loss_mlp": 0.01258658, + "epoch": 0.8357733353374418, + "flos": 24396659406720.0, + "grad_norm": 1.5477695677500147, + "language_loss": 0.71333092, + "learning_rate": 2.762186403079716e-07, + "loss": 0.79004037, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09136963, + "step": 13901, + "time_per_iteration": 2.5601279735565186 + }, + { + "auxiliary_loss_clip": 0.06405643, + "auxiliary_loss_mlp": 0.01266448, + "balance_loss_clip": 0.06268835, + "balance_loss_mlp": 0.01256607, + "epoch": 0.8358334585901097, + "flos": 20921479171200.0, + "grad_norm": 2.4248634759308247, + "language_loss": 0.803698, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.8804189, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09844971, + "step": 13902, + "time_per_iteration": 2.4966886043548584 + }, + { + "auxiliary_loss_clip": 0.06397945, + "auxiliary_loss_mlp": 0.01264374, + "balance_loss_clip": 0.06269498, + "balance_loss_mlp": 0.01254849, + "epoch": 0.8358935818427777, + "flos": 19250092978560.0, + "grad_norm": 1.5337059639078017, + "language_loss": 0.62622327, + "learning_rate": 2.758237835853379e-07, + "loss": 0.70284647, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09527588, + "step": 13903, + "time_per_iteration": 2.487577438354492 + }, + { + "auxiliary_loss_clip": 0.06401621, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06271307, + "balance_loss_mlp": 0.01259838, + "epoch": 0.8359537050954456, + "flos": 24140927145600.0, + "grad_norm": 1.6577519293367657, + "language_loss": 0.74130571, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.81800985, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08953857, + "step": 13904, + "time_per_iteration": 2.523071050643921 + }, + { + "auxiliary_loss_clip": 0.06397306, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.0626936, + "balance_loss_mlp": 0.01253226, + "epoch": 0.8360138283481137, + "flos": 16186001673600.0, + "grad_norm": 1.6562816533457836, + "language_loss": 0.72656274, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.80315626, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 13905, + "time_per_iteration": 2.4608371257781982 + }, + { + "auxiliary_loss_clip": 0.06397828, + "auxiliary_loss_mlp": 0.01261404, + "balance_loss_clip": 0.06270939, + "balance_loss_mlp": 0.01253095, + "epoch": 0.8360739516007816, + "flos": 22205213648640.0, + "grad_norm": 1.4212033615941317, + "language_loss": 0.66774136, + "learning_rate": 2.752319888771e-07, + "loss": 0.74433374, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08300781, + "step": 13906, + "time_per_iteration": 2.521660089492798 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.01264056, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01254627, + "epoch": 0.8361340748534496, + "flos": 20929445308800.0, + "grad_norm": 1.3169375476629854, + "language_loss": 0.74235398, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.81900668, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09429932, + "step": 13907, + "time_per_iteration": 2.5083837509155273 + }, + { + "auxiliary_loss_clip": 0.06404978, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06269656, + "balance_loss_mlp": 0.01254698, + "epoch": 0.8361941981061175, + "flos": 26180202689280.0, + "grad_norm": 1.7253020139754567, + "language_loss": 0.75386989, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.83056903, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10241699, + "step": 13908, + "time_per_iteration": 2.5349066257476807 + }, + { + "auxiliary_loss_clip": 0.06400359, + "auxiliary_loss_mlp": 0.012632, + "balance_loss_clip": 0.0626875, + "balance_loss_mlp": 0.01253348, + "epoch": 0.8362543213587855, + "flos": 24425184522240.0, + "grad_norm": 2.1793256705024415, + "language_loss": 0.71560019, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.79223579, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09851074, + "step": 13909, + "time_per_iteration": 2.528615951538086 + }, + { + "auxiliary_loss_clip": 0.06405953, + "auxiliary_loss_mlp": 0.01261262, + "balance_loss_clip": 0.06269771, + "balance_loss_mlp": 0.01251177, + "epoch": 0.8363144446114534, + "flos": 17208217958400.0, + "grad_norm": 1.756161355340015, + "language_loss": 0.7331903, + "learning_rate": 2.744438449482338e-07, + "loss": 0.80986243, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10083008, + "step": 13910, + "time_per_iteration": 2.47664213180542 + }, + { + "auxiliary_loss_clip": 0.06405869, + "auxiliary_loss_mlp": 0.01264589, + "balance_loss_clip": 0.062729, + "balance_loss_mlp": 0.01255601, + "epoch": 0.8363745678641215, + "flos": 19285116785280.0, + "grad_norm": 1.9545322554977718, + "language_loss": 0.7355817, + "learning_rate": 2.742469725305001e-07, + "loss": 0.81228626, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.08978271, + "step": 13911, + "time_per_iteration": 2.48702073097229 + }, + { + "auxiliary_loss_clip": 0.06402719, + "auxiliary_loss_mlp": 0.01265291, + "balance_loss_clip": 0.06269197, + "balance_loss_mlp": 0.01255719, + "epoch": 0.8364346911167894, + "flos": 11879698389120.0, + "grad_norm": 1.8881216376034646, + "language_loss": 0.78823018, + "learning_rate": 2.740501655534946e-07, + "loss": 0.86491024, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09564209, + "step": 13912, + "time_per_iteration": 2.4519803524017334 + }, + { + "auxiliary_loss_clip": 0.06396623, + "auxiliary_loss_mlp": 0.01263862, + "balance_loss_clip": 0.06267013, + "balance_loss_mlp": 0.01254766, + "epoch": 0.8364948143694574, + "flos": 20230619057280.0, + "grad_norm": 2.102603712064964, + "language_loss": 0.78802848, + "learning_rate": 2.738534240246797e-07, + "loss": 0.86463332, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09106445, + "step": 13913, + "time_per_iteration": 2.5273799896240234 + }, + { + "auxiliary_loss_clip": 0.06401996, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06268221, + "balance_loss_mlp": 0.01258946, + "epoch": 0.8365549376221254, + "flos": 21618754122240.0, + "grad_norm": 2.0629823157258955, + "language_loss": 0.73688346, + "learning_rate": 2.736567479515153e-07, + "loss": 0.81359446, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10162354, + "step": 13914, + "time_per_iteration": 3.949573278427124 + }, + { + "auxiliary_loss_clip": 0.06403138, + "auxiliary_loss_mlp": 0.01263701, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8366150608747933, + "flos": 23300831710080.0, + "grad_norm": 1.6012769275209868, + "language_loss": 0.71500385, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.79167223, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0994873, + "step": 13915, + "time_per_iteration": 2.5438222885131836 + }, + { + "auxiliary_loss_clip": 0.06404576, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.06271189, + "balance_loss_mlp": 0.01256847, + "epoch": 0.8366751841274613, + "flos": 15273007585920.0, + "grad_norm": 1.8381191598065743, + "language_loss": 0.72815526, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.80486196, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09249878, + "step": 13916, + "time_per_iteration": 2.4785351753234863 + }, + { + "auxiliary_loss_clip": 0.06402785, + "auxiliary_loss_mlp": 0.01263006, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01253523, + "epoch": 0.8367353073801292, + "flos": 13230000535680.0, + "grad_norm": 1.8555718594178066, + "language_loss": 0.74952316, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.82618105, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09490967, + "step": 13917, + "time_per_iteration": 2.493027687072754 + }, + { + "auxiliary_loss_clip": 0.0639746, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.01258747, + "epoch": 0.8367954306327973, + "flos": 24211645591680.0, + "grad_norm": 1.4849909859984562, + "language_loss": 0.79520977, + "learning_rate": 2.728706983644933e-07, + "loss": 0.87186092, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08911133, + "step": 13918, + "time_per_iteration": 2.52976131439209 + }, + { + "auxiliary_loss_clip": 0.06398945, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 0.06267535, + "balance_loss_mlp": 0.01253478, + "epoch": 0.8368555538854652, + "flos": 24541576243200.0, + "grad_norm": 1.6786160238572738, + "language_loss": 0.68168354, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.75830042, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0927124, + "step": 13919, + "time_per_iteration": 2.599942684173584 + }, + { + "auxiliary_loss_clip": 0.06399108, + "auxiliary_loss_mlp": 0.01264149, + "balance_loss_clip": 0.06269257, + "balance_loss_mlp": 0.01255375, + "epoch": 0.8369156771381332, + "flos": 20264385052800.0, + "grad_norm": 1.5840065764282198, + "language_loss": 0.74044919, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.81708181, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08764648, + "step": 13920, + "time_per_iteration": 2.4778757095336914 + }, + { + "auxiliary_loss_clip": 0.06400211, + "auxiliary_loss_mlp": 0.01265161, + "balance_loss_clip": 0.0626861, + "balance_loss_mlp": 0.01255589, + "epoch": 0.8369758003908011, + "flos": 21842062053120.0, + "grad_norm": 1.6917442964392928, + "language_loss": 0.68960786, + "learning_rate": 2.722818488237566e-07, + "loss": 0.76626152, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09576416, + "step": 13921, + "time_per_iteration": 2.536294460296631 + }, + { + "auxiliary_loss_clip": 0.06407334, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01260887, + "epoch": 0.8370359236434691, + "flos": 21724664083200.0, + "grad_norm": 1.9282887707714638, + "language_loss": 0.85106057, + "learning_rate": 2.720856966640801e-07, + "loss": 0.92783767, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09484863, + "step": 13922, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06399621, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.0627037, + "balance_loss_mlp": 0.01256077, + "epoch": 0.837096046896137, + "flos": 23155579457280.0, + "grad_norm": 1.562676302335632, + "language_loss": 0.71699303, + "learning_rate": 2.71889610027088e-07, + "loss": 0.79365033, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.10028076, + "step": 13923, + "time_per_iteration": 2.524562358856201 + }, + { + "auxiliary_loss_clip": 0.06398217, + "auxiliary_loss_mlp": 0.01267054, + "balance_loss_clip": 0.06270584, + "balance_loss_mlp": 0.01257535, + "epoch": 0.8371561701488051, + "flos": 24498795934080.0, + "grad_norm": 1.7133401299934226, + "language_loss": 0.76249665, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.83914936, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09515381, + "step": 13924, + "time_per_iteration": 2.5283994674682617 + }, + { + "auxiliary_loss_clip": 0.06401788, + "auxiliary_loss_mlp": 0.01267733, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.01258297, + "epoch": 0.837216293401473, + "flos": 29214636848640.0, + "grad_norm": 1.4446644492995726, + "language_loss": 0.64699805, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.72369325, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09436035, + "step": 13925, + "time_per_iteration": 4.010638236999512 + }, + { + "auxiliary_loss_clip": 0.06404626, + "auxiliary_loss_mlp": 0.0126583, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.0125534, + "epoch": 0.837276416654141, + "flos": 25272365627520.0, + "grad_norm": 1.5030494095367346, + "language_loss": 0.74794483, + "learning_rate": 2.713017433265543e-07, + "loss": 0.82464945, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1048584, + "step": 13926, + "time_per_iteration": 2.5488619804382324 + }, + { + "auxiliary_loss_clip": 0.0640581, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.0627261, + "balance_loss_mlp": 0.01255258, + "epoch": 0.837336539906809, + "flos": 13887262362240.0, + "grad_norm": 1.6939060787098483, + "language_loss": 0.71716177, + "learning_rate": 2.711059188546274e-07, + "loss": 0.79386938, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09692383, + "step": 13927, + "time_per_iteration": 2.4630274772644043 + }, + { + "auxiliary_loss_clip": 0.06308782, + "auxiliary_loss_mlp": 0.01252714, + "balance_loss_clip": 0.06254104, + "balance_loss_mlp": 0.01251694, + "epoch": 0.8373966631594769, + "flos": 68891892635520.0, + "grad_norm": 0.6934409858082019, + "language_loss": 0.58671498, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.66233003, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0102005, + "step": 13928, + "time_per_iteration": 4.668534994125366 + }, + { + "auxiliary_loss_clip": 0.06404306, + "auxiliary_loss_mlp": 0.01266431, + "balance_loss_clip": 0.06273213, + "balance_loss_mlp": 0.01256292, + "epoch": 0.8374567864121449, + "flos": 20455226726400.0, + "grad_norm": 1.60382683016984, + "language_loss": 0.70053691, + "learning_rate": 2.707144665977068e-07, + "loss": 0.77724433, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10144043, + "step": 13929, + "time_per_iteration": 2.522420644760132 + }, + { + "auxiliary_loss_clip": 0.06407779, + "auxiliary_loss_mlp": 0.01267395, + "balance_loss_clip": 0.06272074, + "balance_loss_mlp": 0.01256827, + "epoch": 0.8375169096648128, + "flos": 41914305694080.0, + "grad_norm": 1.4727423205493513, + "language_loss": 0.67365968, + "learning_rate": 2.705188388275574e-07, + "loss": 0.75041145, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10571289, + "step": 13930, + "time_per_iteration": 2.692265748977661 + }, + { + "auxiliary_loss_clip": 0.06397972, + "auxiliary_loss_mlp": 0.01263394, + "balance_loss_clip": 0.06269804, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8375770329174809, + "flos": 20015235336960.0, + "grad_norm": 1.5773395160364472, + "language_loss": 0.71745491, + "learning_rate": 2.703232766395067e-07, + "loss": 0.79406852, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09008789, + "step": 13931, + "time_per_iteration": 3.92305064201355 + }, + { + "auxiliary_loss_clip": 0.06398615, + "auxiliary_loss_mlp": 0.01261005, + "balance_loss_clip": 0.06268796, + "balance_loss_mlp": 0.01251838, + "epoch": 0.8376371561701488, + "flos": 22790163801600.0, + "grad_norm": 1.728417843969976, + "language_loss": 0.71899205, + "learning_rate": 2.701277800409705e-07, + "loss": 0.79558825, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09173584, + "step": 13932, + "time_per_iteration": 2.5266075134277344 + }, + { + "auxiliary_loss_clip": 0.06401943, + "auxiliary_loss_mlp": 0.01265576, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8376972794228168, + "flos": 23921183013120.0, + "grad_norm": 1.9674813085768024, + "language_loss": 0.67152762, + "learning_rate": 2.699323490393628e-07, + "loss": 0.7482028, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09387207, + "step": 13933, + "time_per_iteration": 2.523416042327881 + }, + { + "auxiliary_loss_clip": 0.06398617, + "auxiliary_loss_mlp": 0.01266669, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01257067, + "epoch": 0.8377574026754847, + "flos": 13739704122240.0, + "grad_norm": 1.8869122869605348, + "language_loss": 0.7637918, + "learning_rate": 2.697369836420933e-07, + "loss": 0.84044468, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.0960083, + "step": 13934, + "time_per_iteration": 2.467869997024536 + }, + { + "auxiliary_loss_clip": 0.06402792, + "auxiliary_loss_mlp": 0.01265545, + "balance_loss_clip": 0.06273244, + "balance_loss_mlp": 0.0125645, + "epoch": 0.8378175259281527, + "flos": 21657509435520.0, + "grad_norm": 2.239259212098959, + "language_loss": 0.77590597, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.85258937, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09100342, + "step": 13935, + "time_per_iteration": 2.5480756759643555 + }, + { + "auxiliary_loss_clip": 0.06400282, + "auxiliary_loss_mlp": 0.01261735, + "balance_loss_clip": 0.06268634, + "balance_loss_mlp": 0.01252419, + "epoch": 0.8378776491808206, + "flos": 15453954478080.0, + "grad_norm": 3.5319600801449886, + "language_loss": 0.57043457, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.64705479, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09313965, + "step": 13936, + "time_per_iteration": 2.4494564533233643 + }, + { + "auxiliary_loss_clip": 0.06400599, + "auxiliary_loss_mlp": 0.01263383, + "balance_loss_clip": 0.06270145, + "balance_loss_mlp": 0.01254562, + "epoch": 0.8379377724334887, + "flos": 14725638789120.0, + "grad_norm": 1.878391680874537, + "language_loss": 0.89756596, + "learning_rate": 2.691512811503882e-07, + "loss": 0.97420573, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0881958, + "step": 13937, + "time_per_iteration": 2.4906821250915527 + }, + { + "auxiliary_loss_clip": 0.06402005, + "auxiliary_loss_mlp": 0.01262073, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01252715, + "epoch": 0.8379978956861566, + "flos": 24542163221760.0, + "grad_norm": 1.6373242147454181, + "language_loss": 0.81663287, + "learning_rate": 2.689561782445313e-07, + "loss": 0.89327371, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09362793, + "step": 13938, + "time_per_iteration": 2.6027586460113525 + }, + { + "auxiliary_loss_clip": 0.06405147, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01254711, + "epoch": 0.8380580189388246, + "flos": 18958540296960.0, + "grad_norm": 1.6258157555571138, + "language_loss": 0.70874858, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.78544629, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09918213, + "step": 13939, + "time_per_iteration": 2.529512882232666 + }, + { + "auxiliary_loss_clip": 0.06405655, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06271434, + "balance_loss_mlp": 0.01260071, + "epoch": 0.8381181421914926, + "flos": 26547253499520.0, + "grad_norm": 1.6173394319792127, + "language_loss": 0.76280761, + "learning_rate": 2.6856616936428e-07, + "loss": 0.83956242, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09759521, + "step": 13940, + "time_per_iteration": 2.539008378982544 + }, + { + "auxiliary_loss_clip": 0.06398639, + "auxiliary_loss_mlp": 0.01263497, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01253984, + "epoch": 0.8381782654441605, + "flos": 23297645255040.0, + "grad_norm": 1.571823062249585, + "language_loss": 0.76635635, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.84297776, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0949707, + "step": 13941, + "time_per_iteration": 2.498035192489624 + }, + { + "auxiliary_loss_clip": 0.06407124, + "auxiliary_loss_mlp": 0.01264368, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01254593, + "epoch": 0.8382383886968285, + "flos": 26765739820800.0, + "grad_norm": 1.8960561722214873, + "language_loss": 0.73615742, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.81287235, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09777832, + "step": 13942, + "time_per_iteration": 2.534268856048584 + }, + { + "auxiliary_loss_clip": 0.06414034, + "auxiliary_loss_mlp": 0.01264269, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.0125385, + "epoch": 0.8382985119494964, + "flos": 26111790230400.0, + "grad_norm": 1.4953994641731532, + "language_loss": 0.79530114, + "learning_rate": 2.679816484834554e-07, + "loss": 0.87208414, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10424805, + "step": 13943, + "time_per_iteration": 2.548069715499878 + }, + { + "auxiliary_loss_clip": 0.06402889, + "auxiliary_loss_mlp": 0.01262959, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01253482, + "epoch": 0.8383586352021645, + "flos": 16440643831680.0, + "grad_norm": 1.9362990480164113, + "language_loss": 0.85566223, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.93232077, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0947876, + "step": 13944, + "time_per_iteration": 2.4838459491729736 + }, + { + "auxiliary_loss_clip": 0.0631334, + "auxiliary_loss_mlp": 0.01250973, + "balance_loss_clip": 0.06258479, + "balance_loss_mlp": 0.01249939, + "epoch": 0.8384187584548324, + "flos": 64215226304640.0, + "grad_norm": 0.6035369639047439, + "language_loss": 0.50281239, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.57845557, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01034546, + "step": 13945, + "time_per_iteration": 3.2410154342651367 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01263596, + "balance_loss_clip": 0.06270773, + "balance_loss_mlp": 0.01254, + "epoch": 0.8384788817075004, + "flos": 22389514704000.0, + "grad_norm": 1.6641583948208805, + "language_loss": 0.65164709, + "learning_rate": 2.673977187074017e-07, + "loss": 0.72827202, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0960083, + "step": 13946, + "time_per_iteration": 2.6071982383728027 + }, + { + "auxiliary_loss_clip": 0.06405137, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06271759, + "balance_loss_mlp": 0.01254483, + "epoch": 0.8385390049601683, + "flos": 29504512448640.0, + "grad_norm": 1.5353623663640485, + "language_loss": 0.67792797, + "learning_rate": 2.672032068397829e-07, + "loss": 0.75462139, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09729004, + "step": 13947, + "time_per_iteration": 2.63541579246521 + }, + { + "auxiliary_loss_clip": 0.06404837, + "auxiliary_loss_mlp": 0.0126567, + "balance_loss_clip": 0.06270772, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8385991282128363, + "flos": 32716036212480.0, + "grad_norm": 1.4309661771954254, + "language_loss": 0.6985665, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.77527153, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10205078, + "step": 13948, + "time_per_iteration": 2.6386852264404297 + }, + { + "auxiliary_loss_clip": 0.06396742, + "auxiliary_loss_mlp": 0.01262841, + "balance_loss_clip": 0.06268944, + "balance_loss_mlp": 0.01254753, + "epoch": 0.8386592514655042, + "flos": 25447023463680.0, + "grad_norm": 1.6745888315245265, + "language_loss": 0.84810793, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.92470378, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08087158, + "step": 13949, + "time_per_iteration": 2.5083541870117188 + }, + { + "auxiliary_loss_clip": 0.0639628, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06268419, + "balance_loss_mlp": 0.01257713, + "epoch": 0.8387193747181723, + "flos": 22022086550400.0, + "grad_norm": 1.712891634847403, + "language_loss": 0.71039176, + "learning_rate": 2.66620065513385e-07, + "loss": 0.78702009, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08850098, + "step": 13950, + "time_per_iteration": 2.51889967918396 + }, + { + "auxiliary_loss_clip": 0.06399944, + "auxiliary_loss_mlp": 0.01267019, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01257375, + "epoch": 0.8387794979708402, + "flos": 18156068144640.0, + "grad_norm": 1.579038787139869, + "language_loss": 0.64784032, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.72450995, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09643555, + "step": 13951, + "time_per_iteration": 2.4674899578094482 + }, + { + "auxiliary_loss_clip": 0.06404419, + "auxiliary_loss_mlp": 0.0126323, + "balance_loss_clip": 0.06272285, + "balance_loss_mlp": 0.01253396, + "epoch": 0.8388396212235082, + "flos": 25418330640000.0, + "grad_norm": 1.4418962880777595, + "language_loss": 0.70313966, + "learning_rate": 2.662316332665393e-07, + "loss": 0.77981615, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09832764, + "step": 13952, + "time_per_iteration": 2.554793119430542 + }, + { + "auxiliary_loss_clip": 0.06395441, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.06266855, + "balance_loss_mlp": 0.01255159, + "epoch": 0.8388997444761762, + "flos": 22279579747200.0, + "grad_norm": 1.8744107681123892, + "language_loss": 0.73154211, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.80813444, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08624268, + "step": 13953, + "time_per_iteration": 3.9177489280700684 + }, + { + "auxiliary_loss_clip": 0.06400088, + "auxiliary_loss_mlp": 0.01263583, + "balance_loss_clip": 0.06270742, + "balance_loss_mlp": 0.01254296, + "epoch": 0.8389598677288441, + "flos": 19579310870400.0, + "grad_norm": 2.072771641574644, + "language_loss": 0.67898321, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.75561988, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09289551, + "step": 13954, + "time_per_iteration": 2.489935874938965 + }, + { + "auxiliary_loss_clip": 0.06399843, + "auxiliary_loss_mlp": 0.01265295, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01255996, + "epoch": 0.8390199909815121, + "flos": 17390548442880.0, + "grad_norm": 2.000257217636036, + "language_loss": 0.74052519, + "learning_rate": 2.656494779996932e-07, + "loss": 0.81717652, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09301758, + "step": 13955, + "time_per_iteration": 2.5330069065093994 + }, + { + "auxiliary_loss_clip": 0.06402773, + "auxiliary_loss_mlp": 0.01265265, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01256152, + "epoch": 0.83908011423418, + "flos": 24645725268480.0, + "grad_norm": 2.260030049088017, + "language_loss": 0.65815377, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.73483419, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09100342, + "step": 13956, + "time_per_iteration": 2.5096991062164307 + }, + { + "auxiliary_loss_clip": 0.06408071, + "auxiliary_loss_mlp": 0.01264206, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.0125458, + "epoch": 0.8391402374868481, + "flos": 24725416101120.0, + "grad_norm": 1.7667751667109255, + "language_loss": 0.80019176, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.87691456, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 13957, + "time_per_iteration": 2.5260238647460938 + }, + { + "auxiliary_loss_clip": 0.06310308, + "auxiliary_loss_mlp": 0.01251038, + "balance_loss_clip": 0.06255397, + "balance_loss_mlp": 0.01249962, + "epoch": 0.839200360739516, + "flos": 56891804728320.0, + "grad_norm": 0.7372887676076516, + "language_loss": 0.53274184, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.60835534, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01077271, + "step": 13958, + "time_per_iteration": 3.2426984310150146 + }, + { + "auxiliary_loss_clip": 0.0640053, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06271528, + "balance_loss_mlp": 0.01256756, + "epoch": 0.839260483992184, + "flos": 18338692118400.0, + "grad_norm": 1.7639772408046024, + "language_loss": 0.73410964, + "learning_rate": 2.648741917459574e-07, + "loss": 0.81077951, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09698486, + "step": 13959, + "time_per_iteration": 2.5194149017333984 + }, + { + "auxiliary_loss_clip": 0.06397633, + "auxiliary_loss_mlp": 0.01265334, + "balance_loss_clip": 0.06271541, + "balance_loss_mlp": 0.01256209, + "epoch": 0.8393206072448519, + "flos": 27095041566720.0, + "grad_norm": 1.5424729354791942, + "language_loss": 0.56095922, + "learning_rate": 2.646805346545169e-07, + "loss": 0.63758886, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.09130859, + "step": 13960, + "time_per_iteration": 2.537529706954956 + }, + { + "auxiliary_loss_clip": 0.06315308, + "auxiliary_loss_mlp": 0.01251161, + "balance_loss_clip": 0.06260296, + "balance_loss_mlp": 0.01249989, + "epoch": 0.8393807304975199, + "flos": 61538619006720.0, + "grad_norm": 0.753849352995641, + "language_loss": 0.60770983, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.68337452, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01169586, + "step": 13961, + "time_per_iteration": 3.2075889110565186 + }, + { + "auxiliary_loss_clip": 0.06403187, + "auxiliary_loss_mlp": 0.01262351, + "balance_loss_clip": 0.06271735, + "balance_loss_mlp": 0.01252922, + "epoch": 0.8394408537501878, + "flos": 14898787251840.0, + "grad_norm": 2.1675856051835987, + "language_loss": 0.68842262, + "learning_rate": 2.642934178894405e-07, + "loss": 0.76507801, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09429932, + "step": 13962, + "time_per_iteration": 2.4669265747070312 + }, + { + "auxiliary_loss_clip": 0.0640391, + "auxiliary_loss_mlp": 0.01265749, + "balance_loss_clip": 0.06269991, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8395009770028559, + "flos": 17416516008960.0, + "grad_norm": 1.8287277787854637, + "language_loss": 0.73421824, + "learning_rate": 2.640999582304841e-07, + "loss": 0.81091487, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09552002, + "step": 13963, + "time_per_iteration": 2.506747245788574 + }, + { + "auxiliary_loss_clip": 0.0640099, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01257585, + "epoch": 0.8395611002555238, + "flos": 27931615130880.0, + "grad_norm": 2.0600599297786646, + "language_loss": 0.7623294, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.839001, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08587646, + "step": 13964, + "time_per_iteration": 3.9650909900665283 + }, + { + "auxiliary_loss_clip": 0.06405744, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06269985, + "balance_loss_mlp": 0.01255589, + "epoch": 0.8396212235081918, + "flos": 11104325832960.0, + "grad_norm": 1.8673180285408302, + "language_loss": 0.78343093, + "learning_rate": 2.637132363964161e-07, + "loss": 0.86015296, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10864258, + "step": 13965, + "time_per_iteration": 2.4702308177948 + }, + { + "auxiliary_loss_clip": 0.06399923, + "auxiliary_loss_mlp": 0.01263836, + "balance_loss_clip": 0.0627108, + "balance_loss_mlp": 0.0125483, + "epoch": 0.8396813467608598, + "flos": 35744307096960.0, + "grad_norm": 1.4537191303723818, + "language_loss": 0.65587616, + "learning_rate": 2.635199742359684e-07, + "loss": 0.73251367, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09002686, + "step": 13966, + "time_per_iteration": 2.646740436553955 + }, + { + "auxiliary_loss_clip": 0.06399661, + "auxiliary_loss_mlp": 0.01262484, + "balance_loss_clip": 0.06269723, + "balance_loss_mlp": 0.01253705, + "epoch": 0.8397414700135277, + "flos": 26183850341760.0, + "grad_norm": 1.5561591084545034, + "language_loss": 0.74881774, + "learning_rate": 2.633267779230177e-07, + "loss": 0.82543921, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08782959, + "step": 13967, + "time_per_iteration": 2.5391502380371094 + }, + { + "auxiliary_loss_clip": 0.06401393, + "auxiliary_loss_mlp": 0.01262984, + "balance_loss_clip": 0.06270708, + "balance_loss_mlp": 0.01254157, + "epoch": 0.8398015932661957, + "flos": 18339069461760.0, + "grad_norm": 1.695171973316043, + "language_loss": 0.82986927, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.90651309, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08837891, + "step": 13968, + "time_per_iteration": 3.9714221954345703 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01255877, + "epoch": 0.8398617165188637, + "flos": 17384469022080.0, + "grad_norm": 2.314933377391938, + "language_loss": 0.77319568, + "learning_rate": 2.629405828689075e-07, + "loss": 0.84989589, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09472656, + "step": 13969, + "time_per_iteration": 2.4677093029022217 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01262152, + "balance_loss_clip": 0.06268304, + "balance_loss_mlp": 0.01252192, + "epoch": 0.8399218397715317, + "flos": 22936296522240.0, + "grad_norm": 1.9974672469929322, + "language_loss": 0.77134824, + "learning_rate": 2.627475841423923e-07, + "loss": 0.8480016, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09954834, + "step": 13970, + "time_per_iteration": 3.9030561447143555 + }, + { + "auxiliary_loss_clip": 0.06401689, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06269555, + "balance_loss_mlp": 0.01257042, + "epoch": 0.8399819630241996, + "flos": 23156376071040.0, + "grad_norm": 2.053890179437049, + "language_loss": 0.72737366, + "learning_rate": 2.625546512926633e-07, + "loss": 0.80405873, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09771729, + "step": 13971, + "time_per_iteration": 2.514538049697876 + }, + { + "auxiliary_loss_clip": 0.06401571, + "auxiliary_loss_mlp": 0.01263608, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01254059, + "epoch": 0.8400420862768676, + "flos": 16402727059200.0, + "grad_norm": 2.070954045877117, + "language_loss": 0.77785814, + "learning_rate": 2.623617843270358e-07, + "loss": 0.85450995, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09545898, + "step": 13972, + "time_per_iteration": 2.4673666954040527 + }, + { + "auxiliary_loss_clip": 0.06399271, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.01254484, + "epoch": 0.8401022095295355, + "flos": 21293770861440.0, + "grad_norm": 1.3621569173910255, + "language_loss": 0.68392384, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.76055562, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09429932, + "step": 13973, + "time_per_iteration": 2.530261516571045 + }, + { + "auxiliary_loss_clip": 0.06399777, + "auxiliary_loss_mlp": 0.01266286, + "balance_loss_clip": 0.06268927, + "balance_loss_mlp": 0.01256875, + "epoch": 0.8401623327822035, + "flos": 17317062812160.0, + "grad_norm": 1.787125184070989, + "language_loss": 0.78559691, + "learning_rate": 2.619762480773382e-07, + "loss": 0.86225754, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09417725, + "step": 13974, + "time_per_iteration": 2.462040424346924 + }, + { + "auxiliary_loss_clip": 0.0640364, + "auxiliary_loss_mlp": 0.01264498, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01255211, + "epoch": 0.8402224560348714, + "flos": 22243214275200.0, + "grad_norm": 1.4562103354507534, + "language_loss": 0.72743988, + "learning_rate": 2.617835788078868e-07, + "loss": 0.80412126, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09289551, + "step": 13975, + "time_per_iteration": 2.555020332336426 + }, + { + "auxiliary_loss_clip": 0.06401096, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_clip": 0.0627125, + "balance_loss_mlp": 0.01256623, + "epoch": 0.8402825792875395, + "flos": 20236153426560.0, + "grad_norm": 1.6874682167845347, + "language_loss": 0.72985578, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.80653155, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09857178, + "step": 13976, + "time_per_iteration": 2.4919087886810303 + }, + { + "auxiliary_loss_clip": 0.06400332, + "auxiliary_loss_mlp": 0.01260889, + "balance_loss_clip": 0.06269455, + "balance_loss_mlp": 0.01252359, + "epoch": 0.8403427025402074, + "flos": 23295884319360.0, + "grad_norm": 1.6877264487051344, + "language_loss": 0.72409099, + "learning_rate": 2.61398438016311e-07, + "loss": 0.80070317, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08526611, + "step": 13977, + "time_per_iteration": 2.5217444896698 + }, + { + "auxiliary_loss_clip": 0.06405861, + "auxiliary_loss_mlp": 0.01264239, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01254548, + "epoch": 0.8404028257928754, + "flos": 32684534277120.0, + "grad_norm": 1.3954911875741427, + "language_loss": 0.68609047, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.76279151, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09686279, + "step": 13978, + "time_per_iteration": 2.6191842555999756 + }, + { + "auxiliary_loss_clip": 0.06397029, + "auxiliary_loss_mlp": 0.01262166, + "balance_loss_clip": 0.06270245, + "balance_loss_mlp": 0.01252808, + "epoch": 0.8404629490455434, + "flos": 16186127454720.0, + "grad_norm": 1.5374499175737208, + "language_loss": 0.78201067, + "learning_rate": 2.610135609365145e-07, + "loss": 0.85860264, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09350586, + "step": 13979, + "time_per_iteration": 2.4852335453033447 + }, + { + "auxiliary_loss_clip": 0.06403331, + "auxiliary_loss_mlp": 0.01265246, + "balance_loss_clip": 0.06270297, + "balance_loss_mlp": 0.01255214, + "epoch": 0.8405230722982113, + "flos": 15199731590400.0, + "grad_norm": 1.8725202434622394, + "language_loss": 0.78169626, + "learning_rate": 2.60821221306778e-07, + "loss": 0.85838211, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1003418, + "step": 13980, + "time_per_iteration": 2.4990322589874268 + }, + { + "auxiliary_loss_clip": 0.06397291, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.01256941, + "epoch": 0.8405831955508793, + "flos": 27818787208320.0, + "grad_norm": 1.5682421159240296, + "language_loss": 0.86943978, + "learning_rate": 2.606289476268757e-07, + "loss": 0.94607365, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09155273, + "step": 13981, + "time_per_iteration": 2.568634271621704 + }, + { + "auxiliary_loss_clip": 0.06401773, + "auxiliary_loss_mlp": 0.01267361, + "balance_loss_clip": 0.0627171, + "balance_loss_mlp": 0.01258027, + "epoch": 0.8406433188035473, + "flos": 23776308103680.0, + "grad_norm": 1.7497238195302791, + "language_loss": 0.67594308, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.75263447, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09332275, + "step": 13982, + "time_per_iteration": 2.507876396179199 + }, + { + "auxiliary_loss_clip": 0.06406415, + "auxiliary_loss_mlp": 0.01263144, + "balance_loss_clip": 0.0627246, + "balance_loss_mlp": 0.01252618, + "epoch": 0.8407034420562153, + "flos": 29213420964480.0, + "grad_norm": 1.5190356335780981, + "language_loss": 0.68256176, + "learning_rate": 2.602445981457324e-07, + "loss": 0.75925732, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10522461, + "step": 13983, + "time_per_iteration": 2.575272560119629 + }, + { + "auxiliary_loss_clip": 0.06401223, + "auxiliary_loss_mlp": 0.01262569, + "balance_loss_clip": 0.06268837, + "balance_loss_mlp": 0.01253116, + "epoch": 0.8407635653088832, + "flos": 26367396710400.0, + "grad_norm": 1.902440913607337, + "language_loss": 0.79216588, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.86880374, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09454346, + "step": 13984, + "time_per_iteration": 2.5355708599090576 + }, + { + "auxiliary_loss_clip": 0.06399589, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06269123, + "balance_loss_mlp": 0.01254478, + "epoch": 0.8408236885615512, + "flos": 21474927388800.0, + "grad_norm": 1.837857036965972, + "language_loss": 0.61041355, + "learning_rate": 2.598605125513842e-07, + "loss": 0.6870482, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09387207, + "step": 13985, + "time_per_iteration": 2.5293657779693604 + }, + { + "auxiliary_loss_clip": 0.06404386, + "auxiliary_loss_mlp": 0.0126397, + "balance_loss_clip": 0.06271429, + "balance_loss_mlp": 0.01254373, + "epoch": 0.8408838118142191, + "flos": 22969936736640.0, + "grad_norm": 1.803818187093242, + "language_loss": 0.82403111, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.90071464, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09588623, + "step": 13986, + "time_per_iteration": 2.5350451469421387 + }, + { + "auxiliary_loss_clip": 0.06401613, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06271525, + "balance_loss_mlp": 0.01254892, + "epoch": 0.8409439350668871, + "flos": 26807765443200.0, + "grad_norm": 1.3955353905275312, + "language_loss": 0.66139162, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.73804653, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08984375, + "step": 13987, + "time_per_iteration": 2.6994168758392334 + }, + { + "auxiliary_loss_clip": 0.06401115, + "auxiliary_loss_mlp": 0.01265067, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01256043, + "epoch": 0.841004058319555, + "flos": 26585966885760.0, + "grad_norm": 1.7419411611465583, + "language_loss": 0.67379653, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.75045836, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09020996, + "step": 13988, + "time_per_iteration": 2.535848617553711 + }, + { + "auxiliary_loss_clip": 0.06402878, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06269789, + "balance_loss_mlp": 0.01255912, + "epoch": 0.8410641815722231, + "flos": 14507152467840.0, + "grad_norm": 2.335548469753872, + "language_loss": 0.81167138, + "learning_rate": 2.590931332560622e-07, + "loss": 0.88836145, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10217285, + "step": 13989, + "time_per_iteration": 2.4869043827056885 + }, + { + "auxiliary_loss_clip": 0.06406767, + "auxiliary_loss_mlp": 0.01262411, + "balance_loss_clip": 0.06272566, + "balance_loss_mlp": 0.01253161, + "epoch": 0.841124304824891, + "flos": 29173994818560.0, + "grad_norm": 1.7072106379508765, + "language_loss": 0.75771666, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.8344084, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0925293, + "step": 13990, + "time_per_iteration": 2.5450754165649414 + }, + { + "auxiliary_loss_clip": 0.06394493, + "auxiliary_loss_mlp": 0.01266409, + "balance_loss_clip": 0.06268186, + "balance_loss_mlp": 0.01257552, + "epoch": 0.841184428077559, + "flos": 22417410913920.0, + "grad_norm": 1.5743700344824108, + "language_loss": 0.80771601, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.88432503, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08862305, + "step": 13991, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06398806, + "auxiliary_loss_mlp": 0.0126106, + "balance_loss_clip": 0.06268385, + "balance_loss_mlp": 0.01252244, + "epoch": 0.841244551330227, + "flos": 22968846633600.0, + "grad_norm": 2.2103689173127767, + "language_loss": 0.70700645, + "learning_rate": 2.585182919204105e-07, + "loss": 0.78360516, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0881958, + "step": 13992, + "time_per_iteration": 4.00426983833313 + }, + { + "auxiliary_loss_clip": 0.06402652, + "auxiliary_loss_mlp": 0.01262158, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.012528, + "epoch": 0.8413046745828949, + "flos": 21039086776320.0, + "grad_norm": 1.5410913015371062, + "language_loss": 0.76244783, + "learning_rate": 2.583268102064959e-07, + "loss": 0.83909595, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09362793, + "step": 13993, + "time_per_iteration": 2.491050958633423 + }, + { + "auxiliary_loss_clip": 0.06408523, + "auxiliary_loss_mlp": 0.01266993, + "balance_loss_clip": 0.06269802, + "balance_loss_mlp": 0.01256377, + "epoch": 0.841364797835563, + "flos": 27059305000320.0, + "grad_norm": 2.1350785829086214, + "language_loss": 0.74388689, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.82064199, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1060791, + "step": 13994, + "time_per_iteration": 2.552985906600952 + }, + { + "auxiliary_loss_clip": 0.06396306, + "auxiliary_loss_mlp": 0.01264636, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.01256059, + "epoch": 0.8414249210882309, + "flos": 17901635621760.0, + "grad_norm": 1.413146140624494, + "language_loss": 0.5934546, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.67006397, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08569336, + "step": 13995, + "time_per_iteration": 2.4642326831817627 + }, + { + "auxiliary_loss_clip": 0.06402554, + "auxiliary_loss_mlp": 0.01262234, + "balance_loss_clip": 0.06271402, + "balance_loss_mlp": 0.0125262, + "epoch": 0.8414850443408989, + "flos": 25447233098880.0, + "grad_norm": 2.2726761612856206, + "language_loss": 0.72156918, + "learning_rate": 2.577527613603163e-07, + "loss": 0.79821706, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09619141, + "step": 13996, + "time_per_iteration": 2.5874221324920654 + }, + { + "auxiliary_loss_clip": 0.0640333, + "auxiliary_loss_mlp": 0.01267475, + "balance_loss_clip": 0.06272834, + "balance_loss_mlp": 0.01258988, + "epoch": 0.8415451675935668, + "flos": 23226465611520.0, + "grad_norm": 1.5819410580498, + "language_loss": 0.64570701, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.72241509, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0848999, + "step": 13997, + "time_per_iteration": 2.4954543113708496 + }, + { + "auxiliary_loss_clip": 0.06407194, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01256992, + "epoch": 0.8416052908462348, + "flos": 18551560216320.0, + "grad_norm": 1.9485298310301038, + "language_loss": 0.82216007, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.89890432, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10241699, + "step": 13998, + "time_per_iteration": 2.496969699859619 + }, + { + "auxiliary_loss_clip": 0.06404015, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06272015, + "balance_loss_mlp": 0.01258961, + "epoch": 0.8416654140989027, + "flos": 26112544917120.0, + "grad_norm": 1.4808581499635296, + "language_loss": 0.80342889, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.8801626, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10400391, + "step": 13999, + "time_per_iteration": 2.52217173576355 + }, + { + "auxiliary_loss_clip": 0.06409043, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06274166, + "balance_loss_mlp": 0.01255304, + "epoch": 0.8417255373515707, + "flos": 26440630778880.0, + "grad_norm": 4.858767918566699, + "language_loss": 0.66816556, + "learning_rate": 2.569882878592096e-07, + "loss": 0.74490786, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09881592, + "step": 14000, + "time_per_iteration": 2.5723514556884766 + }, + { + "auxiliary_loss_clip": 0.06403996, + "auxiliary_loss_mlp": 0.01267083, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01257093, + "epoch": 0.8417856606042387, + "flos": 24724703341440.0, + "grad_norm": 1.500004932940948, + "language_loss": 0.7974422, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.87415302, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09985352, + "step": 14001, + "time_per_iteration": 2.5384724140167236 + }, + { + "auxiliary_loss_clip": 0.06400739, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06269417, + "balance_loss_mlp": 0.01256853, + "epoch": 0.8418457838569067, + "flos": 20857259416320.0, + "grad_norm": 1.7632333528169615, + "language_loss": 0.78508544, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.86175615, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0949707, + "step": 14002, + "time_per_iteration": 2.5468106269836426 + }, + { + "auxiliary_loss_clip": 0.06400124, + "auxiliary_loss_mlp": 0.01266, + "balance_loss_clip": 0.06271224, + "balance_loss_mlp": 0.01256398, + "epoch": 0.8419059071095746, + "flos": 28668651644160.0, + "grad_norm": 1.3302333296141904, + "language_loss": 0.78383386, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.86049509, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0960083, + "step": 14003, + "time_per_iteration": 2.572388172149658 + }, + { + "auxiliary_loss_clip": 0.06398443, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06267287, + "balance_loss_mlp": 0.01258757, + "epoch": 0.8419660303622426, + "flos": 21660150839040.0, + "grad_norm": 1.98720953266761, + "language_loss": 0.65639722, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.73306143, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09222412, + "step": 14004, + "time_per_iteration": 3.96457576751709 + }, + { + "auxiliary_loss_clip": 0.06407335, + "auxiliary_loss_mlp": 0.01266305, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01255814, + "epoch": 0.8420261536149106, + "flos": 25308102193920.0, + "grad_norm": 2.270922911539394, + "language_loss": 0.76293629, + "learning_rate": 2.560341831785724e-07, + "loss": 0.83967268, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10498047, + "step": 14005, + "time_per_iteration": 2.5258288383483887 + }, + { + "auxiliary_loss_clip": 0.06406075, + "auxiliary_loss_mlp": 0.01265149, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01255535, + "epoch": 0.8420862768675785, + "flos": 18768159820800.0, + "grad_norm": 1.6456178296251338, + "language_loss": 0.78003979, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.8567521, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09606934, + "step": 14006, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.06400469, + "auxiliary_loss_mlp": 0.01262872, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01253353, + "epoch": 0.8421464001202466, + "flos": 18333157749120.0, + "grad_norm": 1.7948996432963087, + "language_loss": 0.77462882, + "learning_rate": 2.556530041751932e-07, + "loss": 0.85126221, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09515381, + "step": 14007, + "time_per_iteration": 3.9048590660095215 + }, + { + "auxiliary_loss_clip": 0.06404168, + "auxiliary_loss_mlp": 0.01261821, + "balance_loss_clip": 0.06270444, + "balance_loss_mlp": 0.01252267, + "epoch": 0.8422065233729145, + "flos": 31544710387200.0, + "grad_norm": 1.6673756075616437, + "language_loss": 0.66031694, + "learning_rate": 2.554625138886102e-07, + "loss": 0.7369768, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09552002, + "step": 14008, + "time_per_iteration": 2.5647101402282715 + }, + { + "auxiliary_loss_clip": 0.0630706, + "auxiliary_loss_mlp": 0.01249886, + "balance_loss_clip": 0.06252214, + "balance_loss_mlp": 0.01248812, + "epoch": 0.8422666466255825, + "flos": 64316691999360.0, + "grad_norm": 0.7086447716783576, + "language_loss": 0.56921613, + "learning_rate": 2.552720897550631e-07, + "loss": 0.64478564, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01074982, + "step": 14009, + "time_per_iteration": 3.2140049934387207 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01265049, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01256531, + "epoch": 0.8423267698782504, + "flos": 24323676900480.0, + "grad_norm": 1.225341490624907, + "language_loss": 0.7808187, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.85746264, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08520508, + "step": 14010, + "time_per_iteration": 3.96768856048584 + }, + { + "auxiliary_loss_clip": 0.06407313, + "auxiliary_loss_mlp": 0.01265279, + "balance_loss_clip": 0.06273588, + "balance_loss_mlp": 0.01254545, + "epoch": 0.8423868931309184, + "flos": 18301949303040.0, + "grad_norm": 1.7313909892121646, + "language_loss": 0.7269572, + "learning_rate": 2.548914399759592e-07, + "loss": 0.80368304, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10742188, + "step": 14011, + "time_per_iteration": 2.4659523963928223 + }, + { + "auxiliary_loss_clip": 0.06401571, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06270736, + "balance_loss_mlp": 0.01256114, + "epoch": 0.8424470163835863, + "flos": 23556983241600.0, + "grad_norm": 1.7946548405120046, + "language_loss": 0.84176588, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.91843653, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09381104, + "step": 14012, + "time_per_iteration": 2.5705301761627197 + }, + { + "auxiliary_loss_clip": 0.06391717, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01257529, + "epoch": 0.8425071396362543, + "flos": 23776350030720.0, + "grad_norm": 1.5491953146751778, + "language_loss": 0.67853385, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.75511181, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.08551025, + "step": 14013, + "time_per_iteration": 2.5138120651245117 + }, + { + "auxiliary_loss_clip": 0.0640588, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01254199, + "epoch": 0.8425672628889223, + "flos": 16184240737920.0, + "grad_norm": 2.40464734961883, + "language_loss": 0.78383315, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.86053419, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10028076, + "step": 14014, + "time_per_iteration": 2.499150276184082 + }, + { + "auxiliary_loss_clip": 0.06404585, + "auxiliary_loss_mlp": 0.01263908, + "balance_loss_clip": 0.06272553, + "balance_loss_mlp": 0.01254466, + "epoch": 0.8426273861415903, + "flos": 23155872946560.0, + "grad_norm": 1.6895801007055753, + "language_loss": 0.67373145, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.75041628, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09436035, + "step": 14015, + "time_per_iteration": 2.5022330284118652 + }, + { + "auxiliary_loss_clip": 0.06404251, + "auxiliary_loss_mlp": 0.01263685, + "balance_loss_clip": 0.06272057, + "balance_loss_mlp": 0.01254083, + "epoch": 0.8426875093942582, + "flos": 17463614803200.0, + "grad_norm": 3.5337410606590556, + "language_loss": 0.76054883, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.83722818, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0960083, + "step": 14016, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.06403068, + "auxiliary_loss_mlp": 0.01266667, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.0125626, + "epoch": 0.8427476326469262, + "flos": 19645710831360.0, + "grad_norm": 1.789200385527246, + "language_loss": 0.7966969, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.87339425, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10394287, + "step": 14017, + "time_per_iteration": 2.473740577697754 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.06271141, + "balance_loss_mlp": 0.0125438, + "epoch": 0.8428077558995941, + "flos": 11944882465920.0, + "grad_norm": 2.0996679127374276, + "language_loss": 0.63158822, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.70826501, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09136963, + "step": 14018, + "time_per_iteration": 2.4304590225219727 + }, + { + "auxiliary_loss_clip": 0.06403518, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.0125789, + "epoch": 0.8428678791522621, + "flos": 10456287955200.0, + "grad_norm": 1.7281845201580097, + "language_loss": 0.79151654, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.86821949, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08886719, + "step": 14019, + "time_per_iteration": 2.447352647781372 + }, + { + "auxiliary_loss_clip": 0.06403257, + "auxiliary_loss_mlp": 0.01265283, + "balance_loss_clip": 0.06270546, + "balance_loss_mlp": 0.01255717, + "epoch": 0.8429280024049302, + "flos": 28774813167360.0, + "grad_norm": 1.7232638375614535, + "language_loss": 0.78435445, + "learning_rate": 2.531817924498265e-07, + "loss": 0.86103988, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09564209, + "step": 14020, + "time_per_iteration": 2.551368474960327 + }, + { + "auxiliary_loss_clip": 0.06403369, + "auxiliary_loss_mlp": 0.01264948, + "balance_loss_clip": 0.06271713, + "balance_loss_mlp": 0.01255417, + "epoch": 0.8429881256575981, + "flos": 19543238887680.0, + "grad_norm": 1.7602731882199467, + "language_loss": 0.71348774, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.79017103, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09527588, + "step": 14021, + "time_per_iteration": 2.4735569953918457 + }, + { + "auxiliary_loss_clip": 0.06406254, + "auxiliary_loss_mlp": 0.01266699, + "balance_loss_clip": 0.06271423, + "balance_loss_mlp": 0.01256441, + "epoch": 0.8430482489102661, + "flos": 24797937409920.0, + "grad_norm": 1.5820497244167908, + "language_loss": 0.69932485, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.77605438, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10253906, + "step": 14022, + "time_per_iteration": 2.5423529148101807 + }, + { + "auxiliary_loss_clip": 0.06404831, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06270556, + "balance_loss_mlp": 0.0125552, + "epoch": 0.843108372162934, + "flos": 21550802860800.0, + "grad_norm": 1.7046614195484213, + "language_loss": 0.72680509, + "learning_rate": 2.526131019933553e-07, + "loss": 0.80350661, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09796143, + "step": 14023, + "time_per_iteration": 2.484471559524536 + }, + { + "auxiliary_loss_clip": 0.06401069, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.06270259, + "balance_loss_mlp": 0.01255138, + "epoch": 0.843168495415602, + "flos": 24615816560640.0, + "grad_norm": 1.4810889251875472, + "language_loss": 0.67264724, + "learning_rate": 2.524236710204559e-07, + "loss": 0.74931145, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.10211182, + "step": 14024, + "time_per_iteration": 2.5865228176116943 + }, + { + "auxiliary_loss_clip": 0.06397875, + "auxiliary_loss_mlp": 0.01265022, + "balance_loss_clip": 0.06269605, + "balance_loss_mlp": 0.0125511, + "epoch": 0.8432286186682699, + "flos": 15128216530560.0, + "grad_norm": 2.032466655248574, + "language_loss": 0.81405187, + "learning_rate": 2.522343063158261e-07, + "loss": 0.89068085, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09912109, + "step": 14025, + "time_per_iteration": 2.465604782104492 + }, + { + "auxiliary_loss_clip": 0.0639737, + "auxiliary_loss_mlp": 0.01261603, + "balance_loss_clip": 0.06269414, + "balance_loss_mlp": 0.01253104, + "epoch": 0.843288741920938, + "flos": 20307920048640.0, + "grad_norm": 1.4533964551508662, + "language_loss": 0.77700567, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.85359538, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08508301, + "step": 14026, + "time_per_iteration": 2.53076171875 + }, + { + "auxiliary_loss_clip": 0.06398062, + "auxiliary_loss_mlp": 0.01262456, + "balance_loss_clip": 0.06269979, + "balance_loss_mlp": 0.01253087, + "epoch": 0.8433488651736059, + "flos": 23338958117760.0, + "grad_norm": 1.4725617079093607, + "language_loss": 0.82412767, + "learning_rate": 2.518557757400945e-07, + "loss": 0.90073287, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09375, + "step": 14027, + "time_per_iteration": 2.5195744037628174 + }, + { + "auxiliary_loss_clip": 0.06401826, + "auxiliary_loss_mlp": 0.01262756, + "balance_loss_clip": 0.06271818, + "balance_loss_mlp": 0.01253797, + "epoch": 0.8434089884262739, + "flos": 39467546945280.0, + "grad_norm": 1.6367557813703113, + "language_loss": 0.56320584, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.63985163, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08947754, + "step": 14028, + "time_per_iteration": 2.6699862480163574 + }, + { + "auxiliary_loss_clip": 0.06402962, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01253933, + "epoch": 0.8434691116789418, + "flos": 23775595344000.0, + "grad_norm": 1.7800167865381953, + "language_loss": 0.64169657, + "learning_rate": 2.51477510323578e-07, + "loss": 0.71835524, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08966064, + "step": 14029, + "time_per_iteration": 2.5012686252593994 + }, + { + "auxiliary_loss_clip": 0.06397776, + "auxiliary_loss_mlp": 0.01263425, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01254949, + "epoch": 0.8435292349316098, + "flos": 22677503587200.0, + "grad_norm": 1.6433020027379726, + "language_loss": 0.75232613, + "learning_rate": 2.51288477067956e-07, + "loss": 0.82893813, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.0847168, + "step": 14030, + "time_per_iteration": 2.5419058799743652 + }, + { + "auxiliary_loss_clip": 0.06398299, + "auxiliary_loss_mlp": 0.01267606, + "balance_loss_clip": 0.06269399, + "balance_loss_mlp": 0.01258075, + "epoch": 0.8435893581842777, + "flos": 18849611589120.0, + "grad_norm": 2.1565835327609406, + "language_loss": 0.83877122, + "learning_rate": 2.510995101236502e-07, + "loss": 0.91543025, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09533691, + "step": 14031, + "time_per_iteration": 2.468385696411133 + }, + { + "auxiliary_loss_clip": 0.06401075, + "auxiliary_loss_mlp": 0.01263467, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.01254586, + "epoch": 0.8436494814369457, + "flos": 20710497790080.0, + "grad_norm": 2.151005653825973, + "language_loss": 0.80558878, + "learning_rate": 2.509106094978266e-07, + "loss": 0.88223422, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08886719, + "step": 14032, + "time_per_iteration": 3.9253792762756348 + }, + { + "auxiliary_loss_clip": 0.06401269, + "auxiliary_loss_mlp": 0.01266295, + "balance_loss_clip": 0.06269183, + "balance_loss_mlp": 0.0125593, + "epoch": 0.8437096046896138, + "flos": 22680731969280.0, + "grad_norm": 1.43708237310059, + "language_loss": 0.75761014, + "learning_rate": 2.507217751976478e-07, + "loss": 0.8342858, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10369873, + "step": 14033, + "time_per_iteration": 2.545506238937378 + }, + { + "auxiliary_loss_clip": 0.06403454, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06270887, + "balance_loss_mlp": 0.01258021, + "epoch": 0.8437697279422817, + "flos": 16185666257280.0, + "grad_norm": 1.695610228137136, + "language_loss": 0.83268261, + "learning_rate": 2.505330072302743e-07, + "loss": 0.9093852, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08782959, + "step": 14034, + "time_per_iteration": 2.5694990158081055 + }, + { + "auxiliary_loss_clip": 0.06401746, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06269741, + "balance_loss_mlp": 0.01254061, + "epoch": 0.8438298511949497, + "flos": 28773178012800.0, + "grad_norm": 1.4341877550440127, + "language_loss": 0.78500712, + "learning_rate": 2.503443056028656e-07, + "loss": 0.86166364, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09851074, + "step": 14035, + "time_per_iteration": 2.603475332260132 + }, + { + "auxiliary_loss_clip": 0.06401128, + "auxiliary_loss_mlp": 0.01261261, + "balance_loss_clip": 0.06270442, + "balance_loss_mlp": 0.01252035, + "epoch": 0.8438899744476176, + "flos": 33731837660160.0, + "grad_norm": 1.4118924926688545, + "language_loss": 0.72302711, + "learning_rate": 2.501556703225751e-07, + "loss": 0.79965097, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09234619, + "step": 14036, + "time_per_iteration": 2.618654727935791 + }, + { + "auxiliary_loss_clip": 0.06396312, + "auxiliary_loss_mlp": 0.01261207, + "balance_loss_clip": 0.06269594, + "balance_loss_mlp": 0.01252868, + "epoch": 0.8439500977002856, + "flos": 25116421979520.0, + "grad_norm": 1.6362343480396115, + "language_loss": 0.70156783, + "learning_rate": 2.49967101396557e-07, + "loss": 0.77814305, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08331299, + "step": 14037, + "time_per_iteration": 2.5106723308563232 + }, + { + "auxiliary_loss_clip": 0.06399255, + "auxiliary_loss_mlp": 0.01264455, + "balance_loss_clip": 0.0627047, + "balance_loss_mlp": 0.01256098, + "epoch": 0.8440102209529535, + "flos": 32858060083200.0, + "grad_norm": 1.571189244416603, + "language_loss": 0.69434804, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.77098513, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08355713, + "step": 14038, + "time_per_iteration": 2.588937282562256 + }, + { + "auxiliary_loss_clip": 0.0640436, + "auxiliary_loss_mlp": 0.01263875, + "balance_loss_clip": 0.06271221, + "balance_loss_mlp": 0.0125463, + "epoch": 0.8440703442056215, + "flos": 23736588468480.0, + "grad_norm": 1.525634873049396, + "language_loss": 0.76716536, + "learning_rate": 2.49590162635938e-07, + "loss": 0.84384775, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09240723, + "step": 14039, + "time_per_iteration": 2.5490803718566895 + }, + { + "auxiliary_loss_clip": 0.06412183, + "auxiliary_loss_mlp": 0.01262445, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01252646, + "epoch": 0.8441304674582895, + "flos": 20199955662720.0, + "grad_norm": 1.8775468369698345, + "language_loss": 0.79449338, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.87123966, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09796143, + "step": 14040, + "time_per_iteration": 2.4884471893310547 + }, + { + "auxiliary_loss_clip": 0.0640001, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.06269734, + "balance_loss_mlp": 0.01256513, + "epoch": 0.8441905907109575, + "flos": 20224413855360.0, + "grad_norm": 1.8433585006655098, + "language_loss": 0.69202292, + "learning_rate": 2.492134893781821e-07, + "loss": 0.76868689, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09866333, + "step": 14041, + "time_per_iteration": 2.4893062114715576 + }, + { + "auxiliary_loss_clip": 0.06408129, + "auxiliary_loss_mlp": 0.01265821, + "balance_loss_clip": 0.06273414, + "balance_loss_mlp": 0.01255491, + "epoch": 0.8442507139636254, + "flos": 13521511290240.0, + "grad_norm": 1.7054295527425734, + "language_loss": 0.68817204, + "learning_rate": 2.490252523307341e-07, + "loss": 0.76491153, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10327148, + "step": 14042, + "time_per_iteration": 2.4641237258911133 + }, + { + "auxiliary_loss_clip": 0.0639908, + "auxiliary_loss_mlp": 0.01266235, + "balance_loss_clip": 0.06270715, + "balance_loss_mlp": 0.01256871, + "epoch": 0.8443108372162934, + "flos": 18225570706560.0, + "grad_norm": 1.5510354554393648, + "language_loss": 0.75078881, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.82744193, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09356689, + "step": 14043, + "time_per_iteration": 3.892390489578247 + }, + { + "auxiliary_loss_clip": 0.06400645, + "auxiliary_loss_mlp": 0.01261977, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8443709604689613, + "flos": 16110293909760.0, + "grad_norm": 2.1613590719043003, + "language_loss": 0.72651005, + "learning_rate": 2.486489774343865e-07, + "loss": 0.80313635, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08831787, + "step": 14044, + "time_per_iteration": 2.4726979732513428 + }, + { + "auxiliary_loss_clip": 0.06397988, + "auxiliary_loss_mlp": 0.01263562, + "balance_loss_clip": 0.06269136, + "balance_loss_mlp": 0.01254932, + "epoch": 0.8444310837216293, + "flos": 18517542658560.0, + "grad_norm": 1.454592931872587, + "language_loss": 0.74902761, + "learning_rate": 2.484609395997559e-07, + "loss": 0.82564312, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08630371, + "step": 14045, + "time_per_iteration": 2.5141093730926514 + }, + { + "auxiliary_loss_clip": 0.06400928, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06270893, + "balance_loss_mlp": 0.01257339, + "epoch": 0.8444912069742974, + "flos": 14945215213440.0, + "grad_norm": 1.9915649249395384, + "language_loss": 0.78878438, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.86545849, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09155273, + "step": 14046, + "time_per_iteration": 2.4441287517547607 + }, + { + "auxiliary_loss_clip": 0.06403919, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01255948, + "epoch": 0.8445513302269653, + "flos": 20126470032000.0, + "grad_norm": 1.93814940449734, + "language_loss": 0.78215307, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.85885251, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10076904, + "step": 14047, + "time_per_iteration": 3.947803258895874 + }, + { + "auxiliary_loss_clip": 0.06400177, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.0125786, + "epoch": 0.8446114534796333, + "flos": 31178162701440.0, + "grad_norm": 1.7964123097724451, + "language_loss": 0.72113055, + "learning_rate": 2.478972246355935e-07, + "loss": 0.7978034, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.0925293, + "step": 14048, + "time_per_iteration": 2.5795657634735107 + }, + { + "auxiliary_loss_clip": 0.06403403, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06272613, + "balance_loss_mlp": 0.01255697, + "epoch": 0.8446715767323012, + "flos": 23954613592320.0, + "grad_norm": 1.3616000745091086, + "language_loss": 0.73144412, + "learning_rate": 2.477094525178667e-07, + "loss": 0.80813169, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09661865, + "step": 14049, + "time_per_iteration": 3.9288156032562256 + }, + { + "auxiliary_loss_clip": 0.0630594, + "auxiliary_loss_mlp": 0.0125014, + "balance_loss_clip": 0.06251055, + "balance_loss_mlp": 0.01249117, + "epoch": 0.8447316999849692, + "flos": 68004362989440.0, + "grad_norm": 0.7905781903446938, + "language_loss": 0.60587054, + "learning_rate": 2.475217468471729e-07, + "loss": 0.68143135, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01023102, + "step": 14050, + "time_per_iteration": 3.077780246734619 + }, + { + "auxiliary_loss_clip": 0.06402567, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06271556, + "balance_loss_mlp": 0.01253938, + "epoch": 0.8447918232376371, + "flos": 22425460905600.0, + "grad_norm": 2.519523289840615, + "language_loss": 0.72404873, + "learning_rate": 2.473341076306303e-07, + "loss": 0.80071664, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10284424, + "step": 14051, + "time_per_iteration": 2.556217670440674 + }, + { + "auxiliary_loss_clip": 0.06396311, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06267892, + "balance_loss_mlp": 0.01257243, + "epoch": 0.8448519464903052, + "flos": 23700600339840.0, + "grad_norm": 1.9626022777584542, + "language_loss": 0.74592292, + "learning_rate": 2.471465348753547e-07, + "loss": 0.82254827, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08984375, + "step": 14052, + "time_per_iteration": 2.5140316486358643 + }, + { + "auxiliary_loss_clip": 0.06395899, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06272222, + "balance_loss_mlp": 0.01257941, + "epoch": 0.8449120697429731, + "flos": 13741087714560.0, + "grad_norm": 1.5692386664403404, + "language_loss": 0.73870707, + "learning_rate": 2.469590285884575e-07, + "loss": 0.81532955, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.08404541, + "step": 14053, + "time_per_iteration": 2.5562212467193604 + }, + { + "auxiliary_loss_clip": 0.06402231, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01256235, + "epoch": 0.8449721929956411, + "flos": 20893121763840.0, + "grad_norm": 1.5720536659104367, + "language_loss": 0.74138618, + "learning_rate": 2.467715887770494e-07, + "loss": 0.81806374, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09283447, + "step": 14054, + "time_per_iteration": 2.499558687210083 + }, + { + "auxiliary_loss_clip": 0.06406872, + "auxiliary_loss_mlp": 0.01263984, + "balance_loss_clip": 0.06270154, + "balance_loss_mlp": 0.01253863, + "epoch": 0.845032316248309, + "flos": 33224985112320.0, + "grad_norm": 1.3467293957479496, + "language_loss": 0.78394425, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.86065292, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10119629, + "step": 14055, + "time_per_iteration": 2.6246414184570312 + }, + { + "auxiliary_loss_clip": 0.06395009, + "auxiliary_loss_mlp": 0.01266193, + "balance_loss_clip": 0.06266758, + "balance_loss_mlp": 0.01257604, + "epoch": 0.845092439500977, + "flos": 23591755486080.0, + "grad_norm": 1.544566635839548, + "language_loss": 0.73342294, + "learning_rate": 2.463969086091302e-07, + "loss": 0.81003493, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08587646, + "step": 14056, + "time_per_iteration": 2.508028030395508 + }, + { + "auxiliary_loss_clip": 0.06407695, + "auxiliary_loss_mlp": 0.01264647, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01254777, + "epoch": 0.8451525627536449, + "flos": 13338929243520.0, + "grad_norm": 2.1863869456022647, + "language_loss": 0.68351102, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.76023448, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09881592, + "step": 14057, + "time_per_iteration": 2.548752546310425 + }, + { + "auxiliary_loss_clip": 0.06399477, + "auxiliary_loss_mlp": 0.0126254, + "balance_loss_clip": 0.06268546, + "balance_loss_mlp": 0.0125292, + "epoch": 0.8452126860063129, + "flos": 27825285899520.0, + "grad_norm": 1.5760714164083998, + "language_loss": 0.77413702, + "learning_rate": 2.460224944284284e-07, + "loss": 0.85075724, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09625244, + "step": 14058, + "time_per_iteration": 2.5370140075683594 + }, + { + "auxiliary_loss_clip": 0.06404024, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01254802, + "epoch": 0.845272809258981, + "flos": 27131868236160.0, + "grad_norm": 1.5270727793259906, + "language_loss": 0.69999516, + "learning_rate": 2.45835387101033e-07, + "loss": 0.77667671, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09320068, + "step": 14059, + "time_per_iteration": 2.5480189323425293 + }, + { + "auxiliary_loss_clip": 0.06407374, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 0.0627005, + "balance_loss_mlp": 0.01251961, + "epoch": 0.8453329325116489, + "flos": 18338440556160.0, + "grad_norm": 2.540996267685051, + "language_loss": 0.57944226, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.65613896, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10339355, + "step": 14060, + "time_per_iteration": 2.481928586959839 + }, + { + "auxiliary_loss_clip": 0.0640597, + "auxiliary_loss_mlp": 0.01263749, + "balance_loss_clip": 0.06269474, + "balance_loss_mlp": 0.01252841, + "epoch": 0.8453930557643169, + "flos": 22681989780480.0, + "grad_norm": 1.4782194608801338, + "language_loss": 0.75907153, + "learning_rate": 2.454613720076277e-07, + "loss": 0.8357687, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10900879, + "step": 14061, + "time_per_iteration": 2.488678455352783 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.0126313, + "balance_loss_clip": 0.06268848, + "balance_loss_mlp": 0.01253194, + "epoch": 0.8454531790169848, + "flos": 22493034823680.0, + "grad_norm": 3.6244102921260004, + "language_loss": 0.71058381, + "learning_rate": 2.452744642558013e-07, + "loss": 0.78725052, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09942627, + "step": 14062, + "time_per_iteration": 2.4787416458129883 + }, + { + "auxiliary_loss_clip": 0.06312045, + "auxiliary_loss_mlp": 0.01252111, + "balance_loss_clip": 0.06256789, + "balance_loss_mlp": 0.01251069, + "epoch": 0.8455133022696528, + "flos": 58295383672320.0, + "grad_norm": 0.6264898637302231, + "language_loss": 0.52687728, + "learning_rate": 2.450876230433432e-07, + "loss": 0.60251892, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01042175, + "step": 14063, + "time_per_iteration": 3.193988800048828 + }, + { + "auxiliary_loss_clip": 0.06397031, + "auxiliary_loss_mlp": 0.01265361, + "balance_loss_clip": 0.06271092, + "balance_loss_mlp": 0.01257398, + "epoch": 0.8455734255223207, + "flos": 21367717689600.0, + "grad_norm": 1.6737838739239328, + "language_loss": 0.82301968, + "learning_rate": 2.449008483773378e-07, + "loss": 0.8996436, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.07965088, + "step": 14064, + "time_per_iteration": 2.4716007709503174 + }, + { + "auxiliary_loss_clip": 0.06409873, + "auxiliary_loss_mlp": 0.0126423, + "balance_loss_clip": 0.06275783, + "balance_loss_mlp": 0.01254872, + "epoch": 0.8456335487749888, + "flos": 20455562142720.0, + "grad_norm": 2.438952619320042, + "language_loss": 0.72705638, + "learning_rate": 2.447141402648685e-07, + "loss": 0.80379742, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09356689, + "step": 14065, + "time_per_iteration": 2.486729383468628 + }, + { + "auxiliary_loss_clip": 0.06397683, + "auxiliary_loss_mlp": 0.0126263, + "balance_loss_clip": 0.06270014, + "balance_loss_mlp": 0.01254196, + "epoch": 0.8456936720276567, + "flos": 28848592287360.0, + "grad_norm": 1.4053294681947734, + "language_loss": 0.77431583, + "learning_rate": 2.445274987130146e-07, + "loss": 0.85091895, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08435059, + "step": 14066, + "time_per_iteration": 2.5918047428131104 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01262305, + "balance_loss_clip": 0.06271371, + "balance_loss_mlp": 0.01252649, + "epoch": 0.8457537952803247, + "flos": 22679222595840.0, + "grad_norm": 1.4389859181784144, + "language_loss": 0.70042717, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.77707636, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09661865, + "step": 14067, + "time_per_iteration": 2.525247097015381 + }, + { + "auxiliary_loss_clip": 0.06396677, + "auxiliary_loss_mlp": 0.01263949, + "balance_loss_clip": 0.06268427, + "balance_loss_mlp": 0.01254717, + "epoch": 0.8458139185329926, + "flos": 33811444638720.0, + "grad_norm": 1.616550126073105, + "language_loss": 0.71155679, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.78816307, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09234619, + "step": 14068, + "time_per_iteration": 2.6183526515960693 + }, + { + "auxiliary_loss_clip": 0.06309339, + "auxiliary_loss_mlp": 0.01250851, + "balance_loss_clip": 0.06254174, + "balance_loss_mlp": 0.01249894, + "epoch": 0.8458740417856606, + "flos": 70317860618880.0, + "grad_norm": 0.9869339045259047, + "language_loss": 0.60466254, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.68026447, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.009552, + "step": 14069, + "time_per_iteration": 3.223912000656128 + }, + { + "auxiliary_loss_clip": 0.06405959, + "auxiliary_loss_mlp": 0.01263164, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254276, + "epoch": 0.8459341650383285, + "flos": 24177795742080.0, + "grad_norm": 1.5100814720997062, + "language_loss": 0.7470544, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.82374561, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08886719, + "step": 14070, + "time_per_iteration": 2.5450565814971924 + }, + { + "auxiliary_loss_clip": 0.064023, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01256013, + "epoch": 0.8459942882909965, + "flos": 38190395013120.0, + "grad_norm": 1.6691276484821116, + "language_loss": 0.67298388, + "learning_rate": 2.435952896106039e-07, + "loss": 0.74966717, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.10009766, + "step": 14071, + "time_per_iteration": 4.11489462852478 + }, + { + "auxiliary_loss_clip": 0.06311657, + "auxiliary_loss_mlp": 0.01250821, + "balance_loss_clip": 0.06256663, + "balance_loss_mlp": 0.01249876, + "epoch": 0.8460544115436646, + "flos": 64137212553600.0, + "grad_norm": 0.7266466242386742, + "language_loss": 0.61095023, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.686575, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00943756, + "step": 14072, + "time_per_iteration": 2.9876792430877686 + }, + { + "auxiliary_loss_clip": 0.06405033, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06271338, + "balance_loss_mlp": 0.01254531, + "epoch": 0.8461145347963325, + "flos": 24177753815040.0, + "grad_norm": 2.184634062710798, + "language_loss": 0.72637683, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.80307543, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10296631, + "step": 14073, + "time_per_iteration": 2.5138275623321533 + }, + { + "auxiliary_loss_clip": 0.06410398, + "auxiliary_loss_mlp": 0.01263688, + "balance_loss_clip": 0.06270458, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8461746580490005, + "flos": 34901863747200.0, + "grad_norm": 1.7949530900019746, + "language_loss": 0.78191227, + "learning_rate": 2.430367633291155e-07, + "loss": 0.85865319, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10546875, + "step": 14074, + "time_per_iteration": 2.619873046875 + }, + { + "auxiliary_loss_clip": 0.064037, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06272943, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8462347813016684, + "flos": 25564127944320.0, + "grad_norm": 3.010228780430648, + "language_loss": 0.75585461, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.8325218, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.0925293, + "step": 14075, + "time_per_iteration": 2.5305089950561523 + }, + { + "auxiliary_loss_clip": 0.06402498, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06272259, + "balance_loss_mlp": 0.01255554, + "epoch": 0.8462949045543364, + "flos": 21331855342080.0, + "grad_norm": 2.8956830830227607, + "language_loss": 0.72880858, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.80548477, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09564209, + "step": 14076, + "time_per_iteration": 2.5211126804351807 + }, + { + "auxiliary_loss_clip": 0.06409035, + "auxiliary_loss_mlp": 0.01266766, + "balance_loss_clip": 0.06273739, + "balance_loss_mlp": 0.01256705, + "epoch": 0.8463550278070043, + "flos": 22643947226880.0, + "grad_norm": 1.8142263370296956, + "language_loss": 0.77469641, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.85145444, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10058594, + "step": 14077, + "time_per_iteration": 2.4927358627319336 + }, + { + "auxiliary_loss_clip": 0.0640869, + "auxiliary_loss_mlp": 0.01265288, + "balance_loss_clip": 0.06273301, + "balance_loss_mlp": 0.012554, + "epoch": 0.8464151510596724, + "flos": 13010549892480.0, + "grad_norm": 1.9163242247942687, + "language_loss": 0.75092995, + "learning_rate": 2.422929943924643e-07, + "loss": 0.82766974, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09887695, + "step": 14078, + "time_per_iteration": 2.4891517162323 + }, + { + "auxiliary_loss_clip": 0.06398796, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01255316, + "epoch": 0.8464752743123403, + "flos": 15710231790720.0, + "grad_norm": 2.9876674327438026, + "language_loss": 0.85306883, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.92970717, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.097229, + "step": 14079, + "time_per_iteration": 2.46020770072937 + }, + { + "auxiliary_loss_clip": 0.06414415, + "auxiliary_loss_mlp": 0.01265782, + "balance_loss_clip": 0.06273301, + "balance_loss_mlp": 0.01254928, + "epoch": 0.8465353975650083, + "flos": 21660570109440.0, + "grad_norm": 2.4202133336595826, + "language_loss": 0.58745563, + "learning_rate": 2.419215098104965e-07, + "loss": 0.66425759, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10870361, + "step": 14080, + "time_per_iteration": 2.520763635635376 + }, + { + "auxiliary_loss_clip": 0.06408149, + "auxiliary_loss_mlp": 0.01263994, + "balance_loss_clip": 0.06270742, + "balance_loss_mlp": 0.01253796, + "epoch": 0.8465955208176762, + "flos": 18521651508480.0, + "grad_norm": 1.9002618050268867, + "language_loss": 0.6564846, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.73320603, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10198975, + "step": 14081, + "time_per_iteration": 2.4694995880126953 + }, + { + "auxiliary_loss_clip": 0.06404518, + "auxiliary_loss_mlp": 0.01262511, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01253815, + "epoch": 0.8466556440703442, + "flos": 24206362784640.0, + "grad_norm": 1.741929690841942, + "language_loss": 0.73086697, + "learning_rate": 2.41550291894576e-07, + "loss": 0.80753726, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.08691406, + "step": 14082, + "time_per_iteration": 2.5245912075042725 + }, + { + "auxiliary_loss_clip": 0.0640555, + "auxiliary_loss_mlp": 0.01262022, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01252503, + "epoch": 0.8467157673230121, + "flos": 20382118439040.0, + "grad_norm": 5.9029687604683945, + "language_loss": 0.76243949, + "learning_rate": 2.413647829539809e-07, + "loss": 0.8391152, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09515381, + "step": 14083, + "time_per_iteration": 3.958021879196167 + }, + { + "auxiliary_loss_clip": 0.06404339, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0626808, + "balance_loss_mlp": 0.01259113, + "epoch": 0.8467758905756801, + "flos": 28480870644480.0, + "grad_norm": 1.8273205637866814, + "language_loss": 0.66057962, + "learning_rate": 2.411793407010092e-07, + "loss": 0.73731661, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10247803, + "step": 14084, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.06403982, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06272845, + "balance_loss_mlp": 0.01256835, + "epoch": 0.8468360138283482, + "flos": 11697367904640.0, + "grad_norm": 1.9024447155732727, + "language_loss": 0.70089591, + "learning_rate": 2.409939651426938e-07, + "loss": 0.77760088, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09680176, + "step": 14085, + "time_per_iteration": 2.5116045475006104 + }, + { + "auxiliary_loss_clip": 0.06401011, + "auxiliary_loss_mlp": 0.01263688, + "balance_loss_clip": 0.06269903, + "balance_loss_mlp": 0.01254396, + "epoch": 0.8468961370810161, + "flos": 24614726457600.0, + "grad_norm": 1.582597620873215, + "language_loss": 0.7123071, + "learning_rate": 2.408086562860634e-07, + "loss": 0.78895414, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09295654, + "step": 14086, + "time_per_iteration": 2.5062472820281982 + }, + { + "auxiliary_loss_clip": 0.06402152, + "auxiliary_loss_mlp": 0.01265202, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01255951, + "epoch": 0.8469562603336841, + "flos": 19615927904640.0, + "grad_norm": 2.0212942405255347, + "language_loss": 0.75401855, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.83069211, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09240723, + "step": 14087, + "time_per_iteration": 3.9551570415496826 + }, + { + "auxiliary_loss_clip": 0.06400134, + "auxiliary_loss_mlp": 0.01265984, + "balance_loss_clip": 0.06269534, + "balance_loss_mlp": 0.01256445, + "epoch": 0.847016383586352, + "flos": 22645708162560.0, + "grad_norm": 1.342825997114302, + "language_loss": 0.73916817, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.8158294, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0954895, + "step": 14088, + "time_per_iteration": 2.540492296218872 + }, + { + "auxiliary_loss_clip": 0.0640047, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01255275, + "epoch": 0.84707650683902, + "flos": 20966565467520.0, + "grad_norm": 2.1758547876889405, + "language_loss": 0.72225606, + "learning_rate": 2.402531299965387e-07, + "loss": 0.79891354, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10003662, + "step": 14089, + "time_per_iteration": 3.8723671436309814 + }, + { + "auxiliary_loss_clip": 0.06396633, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.01255677, + "epoch": 0.8471366300916879, + "flos": 24099111158400.0, + "grad_norm": 1.5614948588231485, + "language_loss": 0.79447126, + "learning_rate": 2.400680880168928e-07, + "loss": 0.87108254, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.0881958, + "step": 14090, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.064051, + "auxiliary_loss_mlp": 0.01271247, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01260507, + "epoch": 0.847196753344356, + "flos": 18338817899520.0, + "grad_norm": 2.1681555308129163, + "language_loss": 0.77695274, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.85371625, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10742188, + "step": 14091, + "time_per_iteration": 2.4776766300201416 + }, + { + "auxiliary_loss_clip": 0.06313801, + "auxiliary_loss_mlp": 0.01249423, + "balance_loss_clip": 0.06258924, + "balance_loss_mlp": 0.01248393, + "epoch": 0.8472568765970239, + "flos": 49585252550400.0, + "grad_norm": 0.8022713224368199, + "language_loss": 0.59404254, + "learning_rate": 2.396982042749982e-07, + "loss": 0.66967475, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01030731, + "step": 14092, + "time_per_iteration": 3.243363380432129 + }, + { + "auxiliary_loss_clip": 0.06401625, + "auxiliary_loss_mlp": 0.01266586, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01256471, + "epoch": 0.8473169998496919, + "flos": 19284739441920.0, + "grad_norm": 1.7230869725009348, + "language_loss": 0.70479727, + "learning_rate": 2.395133625267756e-07, + "loss": 0.78147936, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10119629, + "step": 14093, + "time_per_iteration": 2.554523229598999 + }, + { + "auxiliary_loss_clip": 0.0640064, + "auxiliary_loss_mlp": 0.01262162, + "balance_loss_clip": 0.0627271, + "balance_loss_mlp": 0.01253358, + "epoch": 0.8473771231023598, + "flos": 17681262583680.0, + "grad_norm": 2.1470167593348, + "language_loss": 0.83683729, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.91346526, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0880127, + "step": 14094, + "time_per_iteration": 2.474327564239502 + }, + { + "auxiliary_loss_clip": 0.06397246, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06270628, + "balance_loss_mlp": 0.01257237, + "epoch": 0.8474372463550278, + "flos": 26367019367040.0, + "grad_norm": 1.5654273666716596, + "language_loss": 0.7183401, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.79497892, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09399414, + "step": 14095, + "time_per_iteration": 2.559675693511963 + }, + { + "auxiliary_loss_clip": 0.06399059, + "auxiliary_loss_mlp": 0.01265629, + "balance_loss_clip": 0.06270283, + "balance_loss_mlp": 0.0125589, + "epoch": 0.8474973696076957, + "flos": 23408418752640.0, + "grad_norm": 1.6128422152605608, + "language_loss": 0.80883193, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.88547873, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09735107, + "step": 14096, + "time_per_iteration": 2.4910190105438232 + }, + { + "auxiliary_loss_clip": 0.06410886, + "auxiliary_loss_mlp": 0.01267989, + "balance_loss_clip": 0.06274761, + "balance_loss_mlp": 0.01257279, + "epoch": 0.8475574928603637, + "flos": 25081523953920.0, + "grad_norm": 1.7270068866988848, + "language_loss": 0.77507085, + "learning_rate": 2.387746631822374e-07, + "loss": 0.85185957, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10699463, + "step": 14097, + "time_per_iteration": 2.5406811237335205 + }, + { + "auxiliary_loss_clip": 0.06399789, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01258042, + "epoch": 0.8476176161130318, + "flos": 19971532632960.0, + "grad_norm": 1.8020847692391104, + "language_loss": 0.80530119, + "learning_rate": 2.385901552932048e-07, + "loss": 0.88196719, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08770752, + "step": 14098, + "time_per_iteration": 2.486926794052124 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01268018, + "balance_loss_clip": 0.06272727, + "balance_loss_mlp": 0.01258267, + "epoch": 0.8476777393656997, + "flos": 21291842217600.0, + "grad_norm": 1.9132060530933808, + "language_loss": 0.72118181, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.79788542, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09753418, + "step": 14099, + "time_per_iteration": 2.5139384269714355 + }, + { + "auxiliary_loss_clip": 0.06402131, + "auxiliary_loss_mlp": 0.01262911, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01252558, + "epoch": 0.8477378626183677, + "flos": 29979276082560.0, + "grad_norm": 1.8485239738364325, + "language_loss": 0.63567179, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.71232224, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10351562, + "step": 14100, + "time_per_iteration": 2.56445574760437 + }, + { + "auxiliary_loss_clip": 0.06405117, + "auxiliary_loss_mlp": 0.01263495, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01252962, + "epoch": 0.8477979858710356, + "flos": 24243650651520.0, + "grad_norm": 2.126244455885968, + "language_loss": 0.73909217, + "learning_rate": 2.380370324111085e-07, + "loss": 0.81577832, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10534668, + "step": 14101, + "time_per_iteration": 2.529759645462036 + }, + { + "auxiliary_loss_clip": 0.0640009, + "auxiliary_loss_mlp": 0.01263117, + "balance_loss_clip": 0.06269788, + "balance_loss_mlp": 0.01253777, + "epoch": 0.8478581091237036, + "flos": 25600828832640.0, + "grad_norm": 1.4947198559415165, + "language_loss": 0.71708381, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.79371595, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09344482, + "step": 14102, + "time_per_iteration": 2.539574384689331 + }, + { + "auxiliary_loss_clip": 0.06408991, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06274236, + "balance_loss_mlp": 0.01256101, + "epoch": 0.8479182323763715, + "flos": 12061945019520.0, + "grad_norm": 2.418221007739104, + "language_loss": 0.82366699, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.90042239, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10455322, + "step": 14103, + "time_per_iteration": 2.481079339981079 + }, + { + "auxiliary_loss_clip": 0.06399621, + "auxiliary_loss_mlp": 0.01261485, + "balance_loss_clip": 0.06270504, + "balance_loss_mlp": 0.01252049, + "epoch": 0.8479783556290396, + "flos": 21439693946880.0, + "grad_norm": 2.033398222504212, + "language_loss": 0.78817004, + "learning_rate": 2.374845108533079e-07, + "loss": 0.86478114, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09429932, + "step": 14104, + "time_per_iteration": 2.490394353866577 + }, + { + "auxiliary_loss_clip": 0.06407318, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255035, + "epoch": 0.8480384788817075, + "flos": 19648142599680.0, + "grad_norm": 2.0490312403076114, + "language_loss": 0.79098284, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.86770785, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10144043, + "step": 14105, + "time_per_iteration": 2.485868215560913 + }, + { + "auxiliary_loss_clip": 0.06410661, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01256569, + "epoch": 0.8480986021343755, + "flos": 22495298883840.0, + "grad_norm": 1.5957177290166866, + "language_loss": 0.50232506, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.57910585, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10858154, + "step": 14106, + "time_per_iteration": 2.5024311542510986 + }, + { + "auxiliary_loss_clip": 0.06401025, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06269896, + "balance_loss_mlp": 0.0125577, + "epoch": 0.8481587253870434, + "flos": 22097039627520.0, + "grad_norm": 1.768185108702469, + "language_loss": 0.7552582, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.83192235, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09619141, + "step": 14107, + "time_per_iteration": 2.514000177383423 + }, + { + "auxiliary_loss_clip": 0.06406081, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01257152, + "epoch": 0.8482188486397114, + "flos": 33590945819520.0, + "grad_norm": 2.8502892293190585, + "language_loss": 0.73806465, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.81478727, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09020996, + "step": 14108, + "time_per_iteration": 2.6308159828186035 + }, + { + "auxiliary_loss_clip": 0.06397291, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06270851, + "balance_loss_mlp": 0.01256577, + "epoch": 0.8482789718923793, + "flos": 20925084896640.0, + "grad_norm": 4.392299515879854, + "language_loss": 0.72917706, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.80581594, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.10028076, + "step": 14109, + "time_per_iteration": 2.492094039916992 + }, + { + "auxiliary_loss_clip": 0.063987, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.06269309, + "balance_loss_mlp": 0.01255335, + "epoch": 0.8483390951450474, + "flos": 12901159987200.0, + "grad_norm": 2.2274280206799904, + "language_loss": 0.74444723, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.82108402, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09643555, + "step": 14110, + "time_per_iteration": 2.4547863006591797 + }, + { + "auxiliary_loss_clip": 0.06399868, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06268494, + "balance_loss_mlp": 0.01259289, + "epoch": 0.8483992183977154, + "flos": 25088483842560.0, + "grad_norm": 1.5964367231590322, + "language_loss": 0.76417547, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.84086204, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0949707, + "step": 14111, + "time_per_iteration": 3.9820806980133057 + }, + { + "auxiliary_loss_clip": 0.06399922, + "auxiliary_loss_mlp": 0.01266512, + "balance_loss_clip": 0.06271142, + "balance_loss_mlp": 0.01257541, + "epoch": 0.8484593416503833, + "flos": 25564631068800.0, + "grad_norm": 1.5630772359474336, + "language_loss": 0.67188197, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.7485463, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08966064, + "step": 14112, + "time_per_iteration": 2.5049498081207275 + }, + { + "auxiliary_loss_clip": 0.06402437, + "auxiliary_loss_mlp": 0.0126693, + "balance_loss_clip": 0.06270389, + "balance_loss_mlp": 0.01257644, + "epoch": 0.8485194649030513, + "flos": 27205773137280.0, + "grad_norm": 1.5196653604706423, + "language_loss": 0.7372402, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.81393391, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09289551, + "step": 14113, + "time_per_iteration": 2.5452187061309814 + }, + { + "auxiliary_loss_clip": 0.0639898, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06269895, + "balance_loss_mlp": 0.01259228, + "epoch": 0.8485795881557192, + "flos": 24212609913600.0, + "grad_norm": 1.8417781889365228, + "language_loss": 0.66789317, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.74456829, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09289551, + "step": 14114, + "time_per_iteration": 2.533439874649048 + }, + { + "auxiliary_loss_clip": 0.06404068, + "auxiliary_loss_mlp": 0.0126426, + "balance_loss_clip": 0.06271428, + "balance_loss_mlp": 0.01254616, + "epoch": 0.8486397114083872, + "flos": 21147931630080.0, + "grad_norm": 1.5901930956565895, + "language_loss": 0.7938953, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.87057859, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09643555, + "step": 14115, + "time_per_iteration": 2.4947285652160645 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01264775, + "balance_loss_clip": 0.06271269, + "balance_loss_mlp": 0.0125503, + "epoch": 0.8486998346610551, + "flos": 19980966216960.0, + "grad_norm": 1.8982053522036084, + "language_loss": 0.79270887, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.86939907, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09747314, + "step": 14116, + "time_per_iteration": 2.4848196506500244 + }, + { + "auxiliary_loss_clip": 0.06406476, + "auxiliary_loss_mlp": 0.01264395, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01254394, + "epoch": 0.8487599579137232, + "flos": 19798468024320.0, + "grad_norm": 2.1065592476865045, + "language_loss": 0.68344235, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.76015103, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10003662, + "step": 14117, + "time_per_iteration": 2.5438575744628906 + }, + { + "auxiliary_loss_clip": 0.06404263, + "auxiliary_loss_mlp": 0.01263508, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01254144, + "epoch": 0.8488200811663911, + "flos": 26403259057920.0, + "grad_norm": 2.1344254269522653, + "language_loss": 0.649701, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.72637874, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09362793, + "step": 14118, + "time_per_iteration": 2.5292701721191406 + }, + { + "auxiliary_loss_clip": 0.06402715, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06272824, + "balance_loss_mlp": 0.012551, + "epoch": 0.8488802044190591, + "flos": 16364307162240.0, + "grad_norm": 1.5920337114960288, + "language_loss": 0.73305792, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.80972242, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08636475, + "step": 14119, + "time_per_iteration": 2.5488085746765137 + }, + { + "auxiliary_loss_clip": 0.06405111, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06273293, + "balance_loss_mlp": 0.01255017, + "epoch": 0.848940327671727, + "flos": 19214985317760.0, + "grad_norm": 1.735285321727865, + "language_loss": 0.78245997, + "learning_rate": 2.345478926864446e-07, + "loss": 0.85916287, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10168457, + "step": 14120, + "time_per_iteration": 2.4802494049072266 + }, + { + "auxiliary_loss_clip": 0.0640521, + "auxiliary_loss_mlp": 0.01261862, + "balance_loss_clip": 0.06270778, + "balance_loss_mlp": 0.01251956, + "epoch": 0.849000450924395, + "flos": 21877547057280.0, + "grad_norm": 1.653660849157392, + "language_loss": 0.75841606, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.83508676, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09906006, + "step": 14121, + "time_per_iteration": 2.5294899940490723 + }, + { + "auxiliary_loss_clip": 0.06311592, + "auxiliary_loss_mlp": 0.01250316, + "balance_loss_clip": 0.06256946, + "balance_loss_mlp": 0.01249346, + "epoch": 0.8490605741770629, + "flos": 71187697054080.0, + "grad_norm": 0.8089399370239767, + "language_loss": 0.60124117, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.67686021, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0096817, + "step": 14122, + "time_per_iteration": 4.59176778793335 + }, + { + "auxiliary_loss_clip": 0.06405739, + "auxiliary_loss_mlp": 0.01266882, + "balance_loss_clip": 0.06273272, + "balance_loss_mlp": 0.01257697, + "epoch": 0.849120697429731, + "flos": 24980393675520.0, + "grad_norm": 1.735217190538918, + "language_loss": 0.79777497, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.87450123, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09191895, + "step": 14123, + "time_per_iteration": 2.5571157932281494 + }, + { + "auxiliary_loss_clip": 0.0639874, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06271547, + "balance_loss_mlp": 0.01255899, + "epoch": 0.8491808206823989, + "flos": 23037762216960.0, + "grad_norm": 2.314794878951381, + "language_loss": 0.83767265, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.91431308, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09411621, + "step": 14124, + "time_per_iteration": 2.4972498416900635 + }, + { + "auxiliary_loss_clip": 0.06402995, + "auxiliary_loss_mlp": 0.01266211, + "balance_loss_clip": 0.0627236, + "balance_loss_mlp": 0.01256448, + "epoch": 0.8492409439350669, + "flos": 23885362592640.0, + "grad_norm": 1.83156410249787, + "language_loss": 0.71961606, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.7963081, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09753418, + "step": 14125, + "time_per_iteration": 2.489391326904297 + }, + { + "auxiliary_loss_clip": 0.06413139, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.06274882, + "balance_loss_mlp": 0.01254987, + "epoch": 0.8493010671877349, + "flos": 22426592935680.0, + "grad_norm": 1.586241425813396, + "language_loss": 0.73891562, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.81570363, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10687256, + "step": 14126, + "time_per_iteration": 3.983708381652832 + }, + { + "auxiliary_loss_clip": 0.06403054, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06273109, + "balance_loss_mlp": 0.01254707, + "epoch": 0.8493611904404028, + "flos": 17535087936000.0, + "grad_norm": 1.8650592737197151, + "language_loss": 0.67556584, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.75224024, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09686279, + "step": 14127, + "time_per_iteration": 2.487192153930664 + }, + { + "auxiliary_loss_clip": 0.06408098, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01254624, + "epoch": 0.8494213136930708, + "flos": 19468872789120.0, + "grad_norm": 2.079377486336631, + "language_loss": 0.6908232, + "learning_rate": 2.330860086502211e-07, + "loss": 0.76755047, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10003662, + "step": 14128, + "time_per_iteration": 3.9321682453155518 + }, + { + "auxiliary_loss_clip": 0.0640503, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06273429, + "balance_loss_mlp": 0.01256203, + "epoch": 0.8494814369457387, + "flos": 18776209812480.0, + "grad_norm": 1.8334204932365141, + "language_loss": 0.77824986, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.85496199, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09973145, + "step": 14129, + "time_per_iteration": 2.477147102355957 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01262796, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01253718, + "epoch": 0.8495415601984068, + "flos": 23338245358080.0, + "grad_norm": 1.6462886650116846, + "language_loss": 0.68294001, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.75959694, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09075928, + "step": 14130, + "time_per_iteration": 2.4999375343322754 + }, + { + "auxiliary_loss_clip": 0.0640253, + "auxiliary_loss_mlp": 0.01261921, + "balance_loss_clip": 0.06270012, + "balance_loss_mlp": 0.01252367, + "epoch": 0.8496016834510747, + "flos": 26619774808320.0, + "grad_norm": 1.5802166891621863, + "language_loss": 0.71646059, + "learning_rate": 2.3253890747186e-07, + "loss": 0.79310513, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09545898, + "step": 14131, + "time_per_iteration": 2.5575854778289795 + }, + { + "auxiliary_loss_clip": 0.06405224, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01255729, + "epoch": 0.8496618067037427, + "flos": 25486868880000.0, + "grad_norm": 1.7695763181681814, + "language_loss": 0.68790936, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.7646122, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09332275, + "step": 14132, + "time_per_iteration": 2.5535638332366943 + }, + { + "auxiliary_loss_clip": 0.06400724, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.012563, + "epoch": 0.8497219299564106, + "flos": 25381671678720.0, + "grad_norm": 1.5183602424718283, + "language_loss": 0.70325232, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.77990711, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08459473, + "step": 14133, + "time_per_iteration": 2.5285003185272217 + }, + { + "auxiliary_loss_clip": 0.06314642, + "auxiliary_loss_mlp": 0.0125198, + "balance_loss_clip": 0.06259762, + "balance_loss_mlp": 0.01250997, + "epoch": 0.8497820532090786, + "flos": 67802102432640.0, + "grad_norm": 0.719733671392506, + "language_loss": 0.57708496, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.65275121, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00982666, + "step": 14134, + "time_per_iteration": 3.2259228229522705 + }, + { + "auxiliary_loss_clip": 0.06407531, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06273041, + "balance_loss_mlp": 0.01254947, + "epoch": 0.8498421764617465, + "flos": 23447257920000.0, + "grad_norm": 1.8337709107177125, + "language_loss": 0.78980213, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.86652142, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09460449, + "step": 14135, + "time_per_iteration": 2.5097665786743164 + }, + { + "auxiliary_loss_clip": 0.06408061, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254521, + "epoch": 0.8499022997144146, + "flos": 17718424669440.0, + "grad_norm": 1.803510122803531, + "language_loss": 0.63663286, + "learning_rate": 2.316284127127044e-07, + "loss": 0.71336436, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10559082, + "step": 14136, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06406897, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01255686, + "epoch": 0.8499624229670825, + "flos": 18594508233600.0, + "grad_norm": 1.9108052639568265, + "language_loss": 0.8452841, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.92201281, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10290527, + "step": 14137, + "time_per_iteration": 2.466539144515991 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01257281, + "epoch": 0.8500225462197505, + "flos": 24351573110400.0, + "grad_norm": 2.739928375946937, + "language_loss": 0.78818476, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.86484385, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09051514, + "step": 14138, + "time_per_iteration": 2.5530903339385986 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06272259, + "balance_loss_mlp": 0.01255271, + "epoch": 0.8500826694724185, + "flos": 16551207694080.0, + "grad_norm": 1.5096380838746266, + "language_loss": 0.64687216, + "learning_rate": 2.310829204839073e-07, + "loss": 0.72354537, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09521484, + "step": 14139, + "time_per_iteration": 2.4765748977661133 + }, + { + "auxiliary_loss_clip": 0.06402735, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01253451, + "epoch": 0.8501427927250864, + "flos": 16294930381440.0, + "grad_norm": 1.421511629945392, + "language_loss": 0.70614517, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.78279966, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0925293, + "step": 14140, + "time_per_iteration": 2.498777151107788 + }, + { + "auxiliary_loss_clip": 0.06406597, + "auxiliary_loss_mlp": 0.01266518, + "balance_loss_clip": 0.06272027, + "balance_loss_mlp": 0.01256534, + "epoch": 0.8502029159777544, + "flos": 26695189082880.0, + "grad_norm": 1.9493724688595604, + "language_loss": 0.64299488, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.71972603, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09979248, + "step": 14141, + "time_per_iteration": 2.5951173305511475 + }, + { + "auxiliary_loss_clip": 0.06401542, + "auxiliary_loss_mlp": 0.01267222, + "balance_loss_clip": 0.0627162, + "balance_loss_mlp": 0.01257614, + "epoch": 0.8502630392304223, + "flos": 35599599895680.0, + "grad_norm": 1.6642597175452942, + "language_loss": 0.71313, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.78981769, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0960083, + "step": 14142, + "time_per_iteration": 2.644679307937622 + }, + { + "auxiliary_loss_clip": 0.06406039, + "auxiliary_loss_mlp": 0.01263489, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01254704, + "epoch": 0.8503231624830904, + "flos": 21655329229440.0, + "grad_norm": 1.5291787912539954, + "language_loss": 0.6560241, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.73271942, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08782959, + "step": 14143, + "time_per_iteration": 2.550386905670166 + }, + { + "auxiliary_loss_clip": 0.06409223, + "auxiliary_loss_mlp": 0.01267388, + "balance_loss_clip": 0.06273058, + "balance_loss_mlp": 0.01257029, + "epoch": 0.8503832857357583, + "flos": 22423741896960.0, + "grad_norm": 1.9945347024432363, + "language_loss": 0.68129444, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.75806051, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10357666, + "step": 14144, + "time_per_iteration": 2.535437822341919 + }, + { + "auxiliary_loss_clip": 0.06397337, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01255981, + "epoch": 0.8504434089884263, + "flos": 18703981992960.0, + "grad_norm": 2.059497972093478, + "language_loss": 0.6487931, + "learning_rate": 2.299937473050777e-07, + "loss": 0.72541577, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08953857, + "step": 14145, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.06402655, + "auxiliary_loss_mlp": 0.01262868, + "balance_loss_clip": 0.06271585, + "balance_loss_mlp": 0.01253784, + "epoch": 0.8505035322410942, + "flos": 20013642109440.0, + "grad_norm": 1.595818409331469, + "language_loss": 0.85513884, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.93179405, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09075928, + "step": 14146, + "time_per_iteration": 2.5118772983551025 + }, + { + "auxiliary_loss_clip": 0.06399256, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.06268792, + "balance_loss_mlp": 0.01254543, + "epoch": 0.8505636554937622, + "flos": 20818210613760.0, + "grad_norm": 1.4979672504038752, + "language_loss": 0.84137052, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.91799468, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08612061, + "step": 14147, + "time_per_iteration": 2.491823673248291 + }, + { + "auxiliary_loss_clip": 0.06407596, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06270961, + "balance_loss_mlp": 0.01254549, + "epoch": 0.8506237787464301, + "flos": 14179821292800.0, + "grad_norm": 2.3326412330221284, + "language_loss": 0.86542302, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.94214487, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10040283, + "step": 14148, + "time_per_iteration": 2.4511468410491943 + }, + { + "auxiliary_loss_clip": 0.06404074, + "auxiliary_loss_mlp": 0.01267682, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01257991, + "epoch": 0.8506839019990982, + "flos": 23265095143680.0, + "grad_norm": 2.691845002956324, + "language_loss": 0.72521651, + "learning_rate": 2.292689741370204e-07, + "loss": 0.801934, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09698486, + "step": 14149, + "time_per_iteration": 2.4899957180023193 + }, + { + "auxiliary_loss_clip": 0.06403546, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01252504, + "epoch": 0.8507440252517661, + "flos": 23665911949440.0, + "grad_norm": 1.5144720298422676, + "language_loss": 0.76150334, + "learning_rate": 2.290879486935804e-07, + "loss": 0.83816022, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09631348, + "step": 14150, + "time_per_iteration": 3.8928089141845703 + }, + { + "auxiliary_loss_clip": 0.06398553, + "auxiliary_loss_mlp": 0.01263858, + "balance_loss_clip": 0.06269762, + "balance_loss_mlp": 0.01255025, + "epoch": 0.8508041485044341, + "flos": 18667323031680.0, + "grad_norm": 1.6618873770107652, + "language_loss": 0.72802079, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.80464488, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08837891, + "step": 14151, + "time_per_iteration": 2.4714863300323486 + }, + { + "auxiliary_loss_clip": 0.0630898, + "auxiliary_loss_mlp": 0.01253738, + "balance_loss_clip": 0.06254144, + "balance_loss_mlp": 0.01252743, + "epoch": 0.8508642717571021, + "flos": 52527124275840.0, + "grad_norm": 0.8927928049322662, + "language_loss": 0.59571874, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.67134595, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0099411, + "step": 14152, + "time_per_iteration": 2.8655712604522705 + }, + { + "auxiliary_loss_clip": 0.06316353, + "auxiliary_loss_mlp": 0.01254234, + "balance_loss_clip": 0.06261283, + "balance_loss_mlp": 0.0125321, + "epoch": 0.85092439500977, + "flos": 69316622582400.0, + "grad_norm": 0.6838202798086767, + "language_loss": 0.60732996, + "learning_rate": 2.285452753096797e-07, + "loss": 0.68303585, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01024628, + "step": 14153, + "time_per_iteration": 3.1540443897247314 + }, + { + "auxiliary_loss_clip": 0.06401594, + "auxiliary_loss_mlp": 0.01264001, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01254584, + "epoch": 0.850984518262438, + "flos": 24396701333760.0, + "grad_norm": 1.5261009228174292, + "language_loss": 0.80733705, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.88399303, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09411621, + "step": 14154, + "time_per_iteration": 2.509315013885498 + }, + { + "auxiliary_loss_clip": 0.0639661, + "auxiliary_loss_mlp": 0.01264654, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01256023, + "epoch": 0.851044641515106, + "flos": 23301544469760.0, + "grad_norm": 1.6872874413166468, + "language_loss": 0.80040228, + "learning_rate": 2.281838289110165e-07, + "loss": 0.87701488, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08630371, + "step": 14155, + "time_per_iteration": 2.5027365684509277 + }, + { + "auxiliary_loss_clip": 0.06406571, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.0125556, + "epoch": 0.851104764767774, + "flos": 22055894472960.0, + "grad_norm": 1.8573710226657936, + "language_loss": 0.70853728, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.78525728, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09863281, + "step": 14156, + "time_per_iteration": 2.479628324508667 + }, + { + "auxiliary_loss_clip": 0.06399591, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.0125753, + "epoch": 0.8511648880204419, + "flos": 20711084768640.0, + "grad_norm": 1.7440507839185868, + "language_loss": 0.73986185, + "learning_rate": 2.278226512621386e-07, + "loss": 0.81652504, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09204102, + "step": 14157, + "time_per_iteration": 2.517547845840454 + }, + { + "auxiliary_loss_clip": 0.06396286, + "auxiliary_loss_mlp": 0.01264892, + "balance_loss_clip": 0.06269678, + "balance_loss_mlp": 0.01256214, + "epoch": 0.8512250112731099, + "flos": 24031537240320.0, + "grad_norm": 1.8245812327511397, + "language_loss": 0.79734576, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.87395757, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08673096, + "step": 14158, + "time_per_iteration": 2.4979214668273926 + }, + { + "auxiliary_loss_clip": 0.0640398, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06272298, + "balance_loss_mlp": 0.01258855, + "epoch": 0.8512851345257778, + "flos": 22021583425920.0, + "grad_norm": 1.9713413245067732, + "language_loss": 0.79106247, + "learning_rate": 2.27461742417828e-07, + "loss": 0.86778879, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09790039, + "step": 14159, + "time_per_iteration": 2.528264284133911 + }, + { + "auxiliary_loss_clip": 0.06402959, + "auxiliary_loss_mlp": 0.01262793, + "balance_loss_clip": 0.06271561, + "balance_loss_mlp": 0.01252976, + "epoch": 0.8513452577784458, + "flos": 14835531818880.0, + "grad_norm": 1.6436898451229665, + "language_loss": 0.71580386, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.7924614, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09814453, + "step": 14160, + "time_per_iteration": 2.479752779006958 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01266043, + "balance_loss_clip": 0.06273068, + "balance_loss_mlp": 0.012556, + "epoch": 0.8514053810311137, + "flos": 33043870512000.0, + "grad_norm": 2.103891046025698, + "language_loss": 0.71018016, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.78694654, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10443115, + "step": 14161, + "time_per_iteration": 2.6188247203826904 + }, + { + "auxiliary_loss_clip": 0.06404144, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.062693, + "balance_loss_mlp": 0.01254248, + "epoch": 0.8514655042837818, + "flos": 27572027333760.0, + "grad_norm": 2.0966778505863997, + "language_loss": 0.78282481, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.85950494, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 14162, + "time_per_iteration": 4.036656856536865 + }, + { + "auxiliary_loss_clip": 0.06401855, + "auxiliary_loss_mlp": 0.01263883, + "balance_loss_clip": 0.06271641, + "balance_loss_mlp": 0.01254036, + "epoch": 0.8515256275364497, + "flos": 35565163067520.0, + "grad_norm": 1.9877443818476273, + "language_loss": 0.77228487, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.84894228, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09844971, + "step": 14163, + "time_per_iteration": 2.63171648979187 + }, + { + "auxiliary_loss_clip": 0.06310776, + "auxiliary_loss_mlp": 0.01252981, + "balance_loss_clip": 0.06255888, + "balance_loss_mlp": 0.01251983, + "epoch": 0.8515857507891177, + "flos": 70226681777280.0, + "grad_norm": 0.6817221132059864, + "language_loss": 0.54955924, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.62519681, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00997925, + "step": 14164, + "time_per_iteration": 3.1551241874694824 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01270382, + "balance_loss_clip": 0.06274273, + "balance_loss_mlp": 0.01260524, + "epoch": 0.8516458740417857, + "flos": 22682031707520.0, + "grad_norm": 3.0329072828581816, + "language_loss": 0.73003203, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.80680323, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09857178, + "step": 14165, + "time_per_iteration": 2.5035831928253174 + }, + { + "auxiliary_loss_clip": 0.06397499, + "auxiliary_loss_mlp": 0.01262475, + "balance_loss_clip": 0.06270273, + "balance_loss_mlp": 0.01253231, + "epoch": 0.8517059972944536, + "flos": 22754049891840.0, + "grad_norm": 1.484328472533111, + "language_loss": 0.67534792, + "learning_rate": 2.26200679088697e-07, + "loss": 0.7519477, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09240723, + "step": 14166, + "time_per_iteration": 3.980225086212158 + }, + { + "auxiliary_loss_clip": 0.06407012, + "auxiliary_loss_mlp": 0.01265516, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01256164, + "epoch": 0.8517661205471216, + "flos": 21695551989120.0, + "grad_norm": 1.6606333090542271, + "language_loss": 0.73706573, + "learning_rate": 2.260207961805125e-07, + "loss": 0.81379104, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09350586, + "step": 14167, + "time_per_iteration": 2.5159831047058105 + }, + { + "auxiliary_loss_clip": 0.06402537, + "auxiliary_loss_mlp": 0.0126222, + "balance_loss_clip": 0.06271734, + "balance_loss_mlp": 0.0125341, + "epoch": 0.8518262437997896, + "flos": 25381965168000.0, + "grad_norm": 1.6418130813226552, + "language_loss": 0.80574334, + "learning_rate": 2.258409805417969e-07, + "loss": 0.88239098, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08807373, + "step": 14168, + "time_per_iteration": 3.9127509593963623 + }, + { + "auxiliary_loss_clip": 0.06400729, + "auxiliary_loss_mlp": 0.01263799, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01254233, + "epoch": 0.8518863670524576, + "flos": 27242809441920.0, + "grad_norm": 1.6366824582665955, + "language_loss": 0.76805246, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.84469771, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09570312, + "step": 14169, + "time_per_iteration": 2.564000129699707 + }, + { + "auxiliary_loss_clip": 0.06407769, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06273901, + "balance_loss_mlp": 0.0125574, + "epoch": 0.8519464903051255, + "flos": 20965810780800.0, + "grad_norm": 1.524606449707151, + "language_loss": 0.64094317, + "learning_rate": 2.254815511000452e-07, + "loss": 0.71767604, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09777832, + "step": 14170, + "time_per_iteration": 2.4731311798095703 + }, + { + "auxiliary_loss_clip": 0.06401997, + "auxiliary_loss_mlp": 0.01263402, + "balance_loss_clip": 0.06271668, + "balance_loss_mlp": 0.01254348, + "epoch": 0.8520066135577935, + "flos": 18447578899200.0, + "grad_norm": 3.4073612372840003, + "language_loss": 0.86964762, + "learning_rate": 2.253019373106384e-07, + "loss": 0.94630164, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09057617, + "step": 14171, + "time_per_iteration": 2.4719200134277344 + }, + { + "auxiliary_loss_clip": 0.0640336, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_clip": 0.0627137, + "balance_loss_mlp": 0.01254368, + "epoch": 0.8520667368104614, + "flos": 29137545492480.0, + "grad_norm": 1.7662348242463337, + "language_loss": 0.55010748, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.62678176, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0970459, + "step": 14172, + "time_per_iteration": 2.545728921890259 + }, + { + "auxiliary_loss_clip": 0.06399302, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01255232, + "epoch": 0.8521268600631294, + "flos": 16039910880000.0, + "grad_norm": 2.3131255138599287, + "language_loss": 0.69956374, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.77619392, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08477783, + "step": 14173, + "time_per_iteration": 2.505682945251465 + }, + { + "auxiliary_loss_clip": 0.06406021, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06273559, + "balance_loss_mlp": 0.0125667, + "epoch": 0.8521869833157973, + "flos": 22461323253120.0, + "grad_norm": 2.130719000445001, + "language_loss": 0.77812624, + "learning_rate": 2.247634997500205e-07, + "loss": 0.85485542, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10241699, + "step": 14174, + "time_per_iteration": 2.487783908843994 + }, + { + "auxiliary_loss_clip": 0.06406736, + "auxiliary_loss_mlp": 0.01263896, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254199, + "epoch": 0.8522471065684654, + "flos": 24978842375040.0, + "grad_norm": 1.55391099663027, + "language_loss": 0.81712008, + "learning_rate": 2.245841551883676e-07, + "loss": 0.89382648, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09698486, + "step": 14175, + "time_per_iteration": 2.524867534637451 + }, + { + "auxiliary_loss_clip": 0.06412444, + "auxiliary_loss_mlp": 0.01264709, + "balance_loss_clip": 0.06276155, + "balance_loss_mlp": 0.01254832, + "epoch": 0.8523072298211333, + "flos": 17716076755200.0, + "grad_norm": 2.276000629543861, + "language_loss": 0.65874249, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.73551399, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09881592, + "step": 14176, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.06401838, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.06274813, + "balance_loss_mlp": 0.01255282, + "epoch": 0.8523673530738013, + "flos": 25453060957440.0, + "grad_norm": 1.6627416158004444, + "language_loss": 0.78781587, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.86448085, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09381104, + "step": 14177, + "time_per_iteration": 2.5438950061798096 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06269382, + "balance_loss_mlp": 0.01255556, + "epoch": 0.8524274763264693, + "flos": 31437416833920.0, + "grad_norm": 1.5992726547756348, + "language_loss": 0.73792171, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.81458819, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09881592, + "step": 14178, + "time_per_iteration": 2.553457736968994 + }, + { + "auxiliary_loss_clip": 0.06405145, + "auxiliary_loss_mlp": 0.0126473, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01254419, + "epoch": 0.8524875995791372, + "flos": 17718466596480.0, + "grad_norm": 1.6207840647423646, + "language_loss": 0.74986088, + "learning_rate": 2.238674502491935e-07, + "loss": 0.82655966, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10302734, + "step": 14179, + "time_per_iteration": 2.4778192043304443 + }, + { + "auxiliary_loss_clip": 0.064025, + "auxiliary_loss_mlp": 0.01264849, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01256039, + "epoch": 0.8525477228318052, + "flos": 21693413710080.0, + "grad_norm": 2.2580601470919177, + "language_loss": 0.81900585, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.89567935, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08813477, + "step": 14180, + "time_per_iteration": 2.5831997394561768 + }, + { + "auxiliary_loss_clip": 0.06404898, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.0125462, + "epoch": 0.8526078460844732, + "flos": 24834009392640.0, + "grad_norm": 2.3038873670157045, + "language_loss": 0.61954057, + "learning_rate": 2.235095018591815e-07, + "loss": 0.69623345, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09777832, + "step": 14181, + "time_per_iteration": 2.615877628326416 + }, + { + "auxiliary_loss_clip": 0.06400971, + "auxiliary_loss_mlp": 0.01263288, + "balance_loss_clip": 0.06272621, + "balance_loss_mlp": 0.01254091, + "epoch": 0.8526679693371412, + "flos": 13521469363200.0, + "grad_norm": 2.0632362183656046, + "language_loss": 0.7309761, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.80761874, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09197998, + "step": 14182, + "time_per_iteration": 2.454415798187256 + }, + { + "auxiliary_loss_clip": 0.0640147, + "auxiliary_loss_mlp": 0.01264054, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254709, + "epoch": 0.8527280925898091, + "flos": 23520911258880.0, + "grad_norm": 1.6302774737251082, + "language_loss": 0.71115839, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.78781366, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09338379, + "step": 14183, + "time_per_iteration": 2.5552773475646973 + }, + { + "auxiliary_loss_clip": 0.06401762, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 0.06273358, + "balance_loss_mlp": 0.01253906, + "epoch": 0.8527882158424771, + "flos": 20309261713920.0, + "grad_norm": 1.7421644295315515, + "language_loss": 0.7277168, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.80436695, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09344482, + "step": 14184, + "time_per_iteration": 2.4897632598876953 + }, + { + "auxiliary_loss_clip": 0.06405064, + "auxiliary_loss_mlp": 0.01261188, + "balance_loss_clip": 0.06273878, + "balance_loss_mlp": 0.01251747, + "epoch": 0.852848339095145, + "flos": 17208343739520.0, + "grad_norm": 1.7709255697532287, + "language_loss": 0.77010369, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.84676623, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09436035, + "step": 14185, + "time_per_iteration": 2.5324416160583496 + }, + { + "auxiliary_loss_clip": 0.06408064, + "auxiliary_loss_mlp": 0.01262591, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01253096, + "epoch": 0.852908462347813, + "flos": 18374847955200.0, + "grad_norm": 1.8096042183588577, + "language_loss": 0.7986542, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.87536073, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0949707, + "step": 14186, + "time_per_iteration": 2.4652650356292725 + }, + { + "auxiliary_loss_clip": 0.06405443, + "auxiliary_loss_mlp": 0.0126256, + "balance_loss_clip": 0.0627183, + "balance_loss_mlp": 0.01252958, + "epoch": 0.8529685856004809, + "flos": 18630873705600.0, + "grad_norm": 1.5490242087187152, + "language_loss": 0.62591934, + "learning_rate": 2.224372736588449e-07, + "loss": 0.70259941, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0960083, + "step": 14187, + "time_per_iteration": 2.5199503898620605 + }, + { + "auxiliary_loss_clip": 0.06409691, + "auxiliary_loss_mlp": 0.01263092, + "balance_loss_clip": 0.06272909, + "balance_loss_mlp": 0.012529, + "epoch": 0.853028708853149, + "flos": 29615579435520.0, + "grad_norm": 1.578408505037398, + "language_loss": 0.76792014, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.844648, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10192871, + "step": 14188, + "time_per_iteration": 2.559159517288208 + }, + { + "auxiliary_loss_clip": 0.06407709, + "auxiliary_loss_mlp": 0.01261931, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01251941, + "epoch": 0.8530888321058169, + "flos": 26359304791680.0, + "grad_norm": 1.6865481411500645, + "language_loss": 0.78473645, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.86143285, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09985352, + "step": 14189, + "time_per_iteration": 2.5550105571746826 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.01264032, + "balance_loss_clip": 0.06272979, + "balance_loss_mlp": 0.01253619, + "epoch": 0.8531489553584849, + "flos": 20528251159680.0, + "grad_norm": 1.946155460997632, + "language_loss": 0.79894865, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.87564158, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10412598, + "step": 14190, + "time_per_iteration": 3.9609453678131104 + }, + { + "auxiliary_loss_clip": 0.06404427, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01258283, + "epoch": 0.8532090786111529, + "flos": 20710581644160.0, + "grad_norm": 1.8315307088661303, + "language_loss": 0.76509988, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.84182423, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.097229, + "step": 14191, + "time_per_iteration": 2.497880458831787 + }, + { + "auxiliary_loss_clip": 0.06399481, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06269594, + "balance_loss_mlp": 0.01255042, + "epoch": 0.8532692018638208, + "flos": 19835085058560.0, + "grad_norm": 1.8333627441476026, + "language_loss": 0.69020867, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.76685452, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10058594, + "step": 14192, + "time_per_iteration": 2.4836080074310303 + }, + { + "auxiliary_loss_clip": 0.06410177, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06271335, + "balance_loss_mlp": 0.01255565, + "epoch": 0.8533293251164888, + "flos": 21003224428800.0, + "grad_norm": 4.054614200028427, + "language_loss": 0.62898421, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.70575833, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11669922, + "step": 14193, + "time_per_iteration": 2.4602465629577637 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01261393, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01252065, + "epoch": 0.8533894483691568, + "flos": 22426257519360.0, + "grad_norm": 1.772584246462062, + "language_loss": 0.76703686, + "learning_rate": 2.211894078044365e-07, + "loss": 0.8436687, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09326172, + "step": 14194, + "time_per_iteration": 2.486522674560547 + }, + { + "auxiliary_loss_clip": 0.06402128, + "auxiliary_loss_mlp": 0.01261977, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01253096, + "epoch": 0.8534495716218248, + "flos": 21622988753280.0, + "grad_norm": 1.8711254841944578, + "language_loss": 0.6979003, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.77454138, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08880615, + "step": 14195, + "time_per_iteration": 2.4857912063598633 + }, + { + "auxiliary_loss_clip": 0.06402412, + "auxiliary_loss_mlp": 0.01267409, + "balance_loss_clip": 0.06270134, + "balance_loss_mlp": 0.01257729, + "epoch": 0.8535096948744927, + "flos": 22352855742720.0, + "grad_norm": 1.8252311406941406, + "language_loss": 0.85771298, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.93441117, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09680176, + "step": 14196, + "time_per_iteration": 2.495814800262451 + }, + { + "auxiliary_loss_clip": 0.06310438, + "auxiliary_loss_mlp": 0.01249691, + "balance_loss_clip": 0.06255472, + "balance_loss_mlp": 0.01248657, + "epoch": 0.8535698181271607, + "flos": 52778118781440.0, + "grad_norm": 0.7492715698474276, + "language_loss": 0.55104071, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.62664199, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.54833984, + "router_z_loss_mlp": 0.01035309, + "step": 14197, + "time_per_iteration": 3.0517899990081787 + }, + { + "auxiliary_loss_clip": 0.06400962, + "auxiliary_loss_mlp": 0.01262147, + "balance_loss_clip": 0.06272976, + "balance_loss_mlp": 0.01252449, + "epoch": 0.8536299413798286, + "flos": 19068978378240.0, + "grad_norm": 1.6048685300978085, + "language_loss": 0.81422484, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.89085591, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09698486, + "step": 14198, + "time_per_iteration": 2.5209779739379883 + }, + { + "auxiliary_loss_clip": 0.06403227, + "auxiliary_loss_mlp": 0.01263611, + "balance_loss_clip": 0.06271878, + "balance_loss_mlp": 0.01254825, + "epoch": 0.8536900646324966, + "flos": 49355670291840.0, + "grad_norm": 1.3991146351834236, + "language_loss": 0.68443, + "learning_rate": 2.203000984963035e-07, + "loss": 0.76109838, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08789062, + "step": 14199, + "time_per_iteration": 2.732821464538574 + }, + { + "auxiliary_loss_clip": 0.06397039, + "auxiliary_loss_mlp": 0.01264117, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01255212, + "epoch": 0.8537501878851645, + "flos": 21768786057600.0, + "grad_norm": 1.5481845643108143, + "language_loss": 0.86597717, + "learning_rate": 2.201224390669072e-07, + "loss": 0.94258881, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08905029, + "step": 14200, + "time_per_iteration": 2.51717209815979 + }, + { + "auxiliary_loss_clip": 0.06402627, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01254101, + "epoch": 0.8538103111378326, + "flos": 22275051626880.0, + "grad_norm": 1.664748237948193, + "language_loss": 0.78232074, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.85897732, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08929443, + "step": 14201, + "time_per_iteration": 3.9599037170410156 + }, + { + "auxiliary_loss_clip": 0.06402917, + "auxiliary_loss_mlp": 0.01267065, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01258846, + "epoch": 0.8538704343905005, + "flos": 20310309889920.0, + "grad_norm": 1.8137924392854496, + "language_loss": 0.68695676, + "learning_rate": 2.19767322694256e-07, + "loss": 0.76365662, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08221436, + "step": 14202, + "time_per_iteration": 2.5016098022460938 + }, + { + "auxiliary_loss_clip": 0.0640841, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06275605, + "balance_loss_mlp": 0.01256167, + "epoch": 0.8539305576431685, + "flos": 24762284697600.0, + "grad_norm": 1.435109126468579, + "language_loss": 0.80630964, + "learning_rate": 2.195898657644666e-07, + "loss": 0.88305151, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09613037, + "step": 14203, + "time_per_iteration": 2.5469577312469482 + }, + { + "auxiliary_loss_clip": 0.06407243, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01259566, + "epoch": 0.8539906808958365, + "flos": 26694853666560.0, + "grad_norm": 1.7668265789233564, + "language_loss": 0.6594435, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.73622131, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10974121, + "step": 14204, + "time_per_iteration": 2.5221924781799316 + }, + { + "auxiliary_loss_clip": 0.06407247, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.06272349, + "balance_loss_mlp": 0.01254425, + "epoch": 0.8540508041485044, + "flos": 13369718419200.0, + "grad_norm": 2.1751805975593728, + "language_loss": 0.60867941, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.68539816, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10198975, + "step": 14205, + "time_per_iteration": 2.4865877628326416 + }, + { + "auxiliary_loss_clip": 0.06401113, + "auxiliary_loss_mlp": 0.0126197, + "balance_loss_clip": 0.06271503, + "balance_loss_mlp": 0.01252868, + "epoch": 0.8541109274011724, + "flos": 32789144499840.0, + "grad_norm": 1.841264040666231, + "language_loss": 0.72367227, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.8003031, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09100342, + "step": 14206, + "time_per_iteration": 4.089895963668823 + }, + { + "auxiliary_loss_clip": 0.06406163, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06272157, + "balance_loss_mlp": 0.01259789, + "epoch": 0.8541710506538404, + "flos": 17645022892800.0, + "grad_norm": 2.6328069765844226, + "language_loss": 0.76719952, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.8439554, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09631348, + "step": 14207, + "time_per_iteration": 3.9147050380706787 + }, + { + "auxiliary_loss_clip": 0.06406431, + "auxiliary_loss_mlp": 0.01265601, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01255534, + "epoch": 0.8542311739065084, + "flos": 20268703537920.0, + "grad_norm": 4.198730612623469, + "language_loss": 0.85354292, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.93026328, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10064697, + "step": 14208, + "time_per_iteration": 2.4986929893493652 + }, + { + "auxiliary_loss_clip": 0.06401771, + "auxiliary_loss_mlp": 0.01262828, + "balance_loss_clip": 0.06270763, + "balance_loss_mlp": 0.01253667, + "epoch": 0.8542912971591763, + "flos": 17791491029760.0, + "grad_norm": 1.5673944060040763, + "language_loss": 0.66329616, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.73994213, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0916748, + "step": 14209, + "time_per_iteration": 2.463555097579956 + }, + { + "auxiliary_loss_clip": 0.06400887, + "auxiliary_loss_mlp": 0.01264146, + "balance_loss_clip": 0.06271108, + "balance_loss_mlp": 0.01254747, + "epoch": 0.8543514204118443, + "flos": 26986783691520.0, + "grad_norm": 1.7929675763472626, + "language_loss": 0.70580226, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.78245258, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09399414, + "step": 14210, + "time_per_iteration": 2.550405979156494 + }, + { + "auxiliary_loss_clip": 0.06406937, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.06274385, + "balance_loss_mlp": 0.01258084, + "epoch": 0.8544115436645122, + "flos": 24031453386240.0, + "grad_norm": 1.3681653014571087, + "language_loss": 0.70620722, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.78295344, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09594727, + "step": 14211, + "time_per_iteration": 2.516709804534912 + }, + { + "auxiliary_loss_clip": 0.06405395, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01255469, + "epoch": 0.8544716669171802, + "flos": 16623603221760.0, + "grad_norm": 2.1078451145204156, + "language_loss": 0.81721437, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.89392173, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09863281, + "step": 14212, + "time_per_iteration": 2.479947328567505 + }, + { + "auxiliary_loss_clip": 0.06403465, + "auxiliary_loss_mlp": 0.01263033, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01253127, + "epoch": 0.8545317901698481, + "flos": 40015376939520.0, + "grad_norm": 2.0862751950857135, + "language_loss": 0.66500002, + "learning_rate": 2.178190108088105e-07, + "loss": 0.74166501, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09906006, + "step": 14213, + "time_per_iteration": 2.641176462173462 + }, + { + "auxiliary_loss_clip": 0.06403671, + "auxiliary_loss_mlp": 0.01263607, + "balance_loss_clip": 0.06272917, + "balance_loss_mlp": 0.01253862, + "epoch": 0.8545919134225162, + "flos": 19908822251520.0, + "grad_norm": 1.6101825554065545, + "language_loss": 0.78410029, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.86077309, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09747314, + "step": 14214, + "time_per_iteration": 2.5185306072235107 + }, + { + "auxiliary_loss_clip": 0.06409415, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.06273215, + "balance_loss_mlp": 0.01254825, + "epoch": 0.8546520366751841, + "flos": 18958959567360.0, + "grad_norm": 2.1746086147260097, + "language_loss": 0.67291975, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.74966758, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10546875, + "step": 14215, + "time_per_iteration": 2.559387445449829 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06272992, + "balance_loss_mlp": 0.01255773, + "epoch": 0.8547121599278521, + "flos": 35629298968320.0, + "grad_norm": 1.7345016463439749, + "language_loss": 0.62729144, + "learning_rate": 2.172890718362279e-07, + "loss": 0.703973, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09197998, + "step": 14216, + "time_per_iteration": 2.631380319595337 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01263952, + "balance_loss_clip": 0.06269723, + "balance_loss_mlp": 0.01254046, + "epoch": 0.8547722831805201, + "flos": 16915742881920.0, + "grad_norm": 1.5812149458388964, + "language_loss": 0.65813535, + "learning_rate": 2.17112560704259e-07, + "loss": 0.73481297, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09912109, + "step": 14217, + "time_per_iteration": 2.457961320877075 + }, + { + "auxiliary_loss_clip": 0.06400003, + "auxiliary_loss_mlp": 0.01265845, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01256827, + "epoch": 0.854832406433188, + "flos": 23009237101440.0, + "grad_norm": 1.6861315946256161, + "language_loss": 0.65233666, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.72899508, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09008789, + "step": 14218, + "time_per_iteration": 2.5305798053741455 + }, + { + "auxiliary_loss_clip": 0.06403703, + "auxiliary_loss_mlp": 0.01266926, + "balance_loss_clip": 0.06268973, + "balance_loss_mlp": 0.01257622, + "epoch": 0.854892529685856, + "flos": 20418861254400.0, + "grad_norm": 1.6553984291407586, + "language_loss": 0.70452309, + "learning_rate": 2.167597412688238e-07, + "loss": 0.78122938, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09301758, + "step": 14219, + "time_per_iteration": 2.5228383541107178 + }, + { + "auxiliary_loss_clip": 0.06408383, + "auxiliary_loss_mlp": 0.01266081, + "balance_loss_clip": 0.06272451, + "balance_loss_mlp": 0.01255507, + "epoch": 0.854952652938524, + "flos": 16404236432640.0, + "grad_norm": 2.1871061782173524, + "language_loss": 0.68056822, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.75731283, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10577393, + "step": 14220, + "time_per_iteration": 2.4769935607910156 + }, + { + "auxiliary_loss_clip": 0.06399038, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.06271215, + "balance_loss_mlp": 0.01252861, + "epoch": 0.855012776191192, + "flos": 21185051788800.0, + "grad_norm": 2.8581673001858015, + "language_loss": 0.72015893, + "learning_rate": 2.164071923159827e-07, + "loss": 0.79676819, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09039307, + "step": 14221, + "time_per_iteration": 2.483891725540161 + }, + { + "auxiliary_loss_clip": 0.06402694, + "auxiliary_loss_mlp": 0.01263341, + "balance_loss_clip": 0.06269461, + "balance_loss_mlp": 0.01253798, + "epoch": 0.8550728994438599, + "flos": 26148239556480.0, + "grad_norm": 2.145984380511565, + "language_loss": 0.60342848, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.68008888, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09539795, + "step": 14222, + "time_per_iteration": 2.549551486968994 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.0125362, + "epoch": 0.8551330226965279, + "flos": 22793895308160.0, + "grad_norm": 1.4434546769022616, + "language_loss": 0.84376544, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.92040431, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09051514, + "step": 14223, + "time_per_iteration": 2.5119271278381348 + }, + { + "auxiliary_loss_clip": 0.0640087, + "auxiliary_loss_mlp": 0.01265091, + "balance_loss_clip": 0.06270584, + "balance_loss_mlp": 0.01255972, + "epoch": 0.8551931459491958, + "flos": 22425586686720.0, + "grad_norm": 1.8195239921480866, + "language_loss": 0.74431682, + "learning_rate": 2.158788761585515e-07, + "loss": 0.82097644, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09118652, + "step": 14224, + "time_per_iteration": 2.520721912384033 + }, + { + "auxiliary_loss_clip": 0.06403351, + "auxiliary_loss_mlp": 0.01264932, + "balance_loss_clip": 0.0627145, + "balance_loss_mlp": 0.01255818, + "epoch": 0.8552532692018638, + "flos": 19579268943360.0, + "grad_norm": 3.311933017994998, + "language_loss": 0.75833428, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.83501709, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09118652, + "step": 14225, + "time_per_iteration": 2.4928267002105713 + }, + { + "auxiliary_loss_clip": 0.06402107, + "auxiliary_loss_mlp": 0.01263352, + "balance_loss_clip": 0.0627172, + "balance_loss_mlp": 0.0125434, + "epoch": 0.8553133924545318, + "flos": 26440043800320.0, + "grad_norm": 1.618794757802268, + "language_loss": 0.7746619, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.85131651, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09008789, + "step": 14226, + "time_per_iteration": 2.5908937454223633 + }, + { + "auxiliary_loss_clip": 0.06408493, + "auxiliary_loss_mlp": 0.01262631, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01252642, + "epoch": 0.8553735157071998, + "flos": 16367996741760.0, + "grad_norm": 2.0827352676299817, + "language_loss": 0.54691792, + "learning_rate": 2.153511688875702e-07, + "loss": 0.62362921, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09991455, + "step": 14227, + "time_per_iteration": 2.4728844165802 + }, + { + "auxiliary_loss_clip": 0.0640135, + "auxiliary_loss_mlp": 0.01265196, + "balance_loss_clip": 0.06272006, + "balance_loss_mlp": 0.01255909, + "epoch": 0.8554336389598677, + "flos": 20893750669440.0, + "grad_norm": 1.8255877057500567, + "language_loss": 0.66183186, + "learning_rate": 2.151754018031442e-07, + "loss": 0.73849732, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09289551, + "step": 14228, + "time_per_iteration": 2.578582525253296 + }, + { + "auxiliary_loss_clip": 0.06404306, + "auxiliary_loss_mlp": 0.01261575, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01251704, + "epoch": 0.8554937622125357, + "flos": 21290542479360.0, + "grad_norm": 1.7630288706046695, + "language_loss": 0.73876858, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.8154273, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09875488, + "step": 14229, + "time_per_iteration": 4.011183023452759 + }, + { + "auxiliary_loss_clip": 0.06399019, + "auxiliary_loss_mlp": 0.01263221, + "balance_loss_clip": 0.06270005, + "balance_loss_mlp": 0.01254495, + "epoch": 0.8555538854652037, + "flos": 22418752579200.0, + "grad_norm": 1.642260219354586, + "language_loss": 0.7294243, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.80604661, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08728027, + "step": 14230, + "time_per_iteration": 2.486304521560669 + }, + { + "auxiliary_loss_clip": 0.06403501, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06272286, + "balance_loss_mlp": 0.01255639, + "epoch": 0.8556140087178716, + "flos": 20199955662720.0, + "grad_norm": 2.082778168166704, + "language_loss": 0.82605416, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.9027406, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0949707, + "step": 14231, + "time_per_iteration": 2.5565478801727295 + }, + { + "auxiliary_loss_clip": 0.06408692, + "auxiliary_loss_mlp": 0.01265448, + "balance_loss_clip": 0.0627467, + "balance_loss_mlp": 0.01255083, + "epoch": 0.8556741319705397, + "flos": 22644743840640.0, + "grad_norm": 1.7449765739897811, + "language_loss": 0.6803897, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.7571311, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1036377, + "step": 14232, + "time_per_iteration": 2.599693536758423 + }, + { + "auxiliary_loss_clip": 0.06405558, + "auxiliary_loss_mlp": 0.01264791, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01254545, + "epoch": 0.8557342552232076, + "flos": 23555935065600.0, + "grad_norm": 1.433905025036311, + "language_loss": 0.67211032, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.74881387, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10253906, + "step": 14233, + "time_per_iteration": 2.5528273582458496 + }, + { + "auxiliary_loss_clip": 0.06397888, + "auxiliary_loss_mlp": 0.01264971, + "balance_loss_clip": 0.06268627, + "balance_loss_mlp": 0.01255953, + "epoch": 0.8557943784758756, + "flos": 19616011758720.0, + "grad_norm": 1.6206343328834838, + "language_loss": 0.77135193, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.84798056, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09014893, + "step": 14234, + "time_per_iteration": 2.466433525085449 + }, + { + "auxiliary_loss_clip": 0.06311054, + "auxiliary_loss_mlp": 0.01249873, + "balance_loss_clip": 0.06256243, + "balance_loss_mlp": 0.0124884, + "epoch": 0.8558545017285435, + "flos": 70660719527040.0, + "grad_norm": 0.7448880666757703, + "language_loss": 0.58154905, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.65715837, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0103302, + "step": 14235, + "time_per_iteration": 3.1063690185546875 + }, + { + "auxiliary_loss_clip": 0.06307988, + "auxiliary_loss_mlp": 0.0125195, + "balance_loss_clip": 0.06253141, + "balance_loss_mlp": 0.01250894, + "epoch": 0.8559146249812115, + "flos": 56669586900480.0, + "grad_norm": 0.7679206472060363, + "language_loss": 0.56618702, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.64178634, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01057434, + "step": 14236, + "time_per_iteration": 3.0186736583709717 + }, + { + "auxiliary_loss_clip": 0.0640348, + "auxiliary_loss_mlp": 0.01264038, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01254371, + "epoch": 0.8559747482338794, + "flos": 22894019337600.0, + "grad_norm": 1.5957292123473101, + "language_loss": 0.70495546, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.78163064, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09661865, + "step": 14237, + "time_per_iteration": 2.509390115737915 + }, + { + "auxiliary_loss_clip": 0.06402485, + "auxiliary_loss_mlp": 0.01262428, + "balance_loss_clip": 0.06271048, + "balance_loss_mlp": 0.0125341, + "epoch": 0.8560348714865474, + "flos": 22608588003840.0, + "grad_norm": 2.298866202248753, + "language_loss": 0.64055443, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.71720362, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09014893, + "step": 14238, + "time_per_iteration": 2.5472559928894043 + }, + { + "auxiliary_loss_clip": 0.06398335, + "auxiliary_loss_mlp": 0.01261025, + "balance_loss_clip": 0.06270797, + "balance_loss_mlp": 0.01252663, + "epoch": 0.8560949947392154, + "flos": 17937288334080.0, + "grad_norm": 1.3930808832059673, + "language_loss": 0.6932922, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.76988578, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08355713, + "step": 14239, + "time_per_iteration": 2.5263397693634033 + }, + { + "auxiliary_loss_clip": 0.06407407, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01256021, + "epoch": 0.8561551179918834, + "flos": 31033623208320.0, + "grad_norm": 1.8670368079308202, + "language_loss": 0.66960537, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.74633867, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09912109, + "step": 14240, + "time_per_iteration": 2.556577205657959 + }, + { + "auxiliary_loss_clip": 0.06403075, + "auxiliary_loss_mlp": 0.01264958, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01254581, + "epoch": 0.8562152412445513, + "flos": 30673196870400.0, + "grad_norm": 1.7026908336354338, + "language_loss": 0.6247797, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.70146, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.1038208, + "step": 14241, + "time_per_iteration": 3.953922986984253 + }, + { + "auxiliary_loss_clip": 0.06409171, + "auxiliary_loss_mlp": 0.01264684, + "balance_loss_clip": 0.06270305, + "balance_loss_mlp": 0.01253914, + "epoch": 0.8562753644972193, + "flos": 31584094606080.0, + "grad_norm": 1.7280214562641805, + "language_loss": 0.74751389, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.82425249, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10766602, + "step": 14242, + "time_per_iteration": 2.6225974559783936 + }, + { + "auxiliary_loss_clip": 0.06404752, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06271575, + "balance_loss_mlp": 0.01255788, + "epoch": 0.8563354877498872, + "flos": 26220844719360.0, + "grad_norm": 2.0910743848690756, + "language_loss": 0.76865256, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.84536231, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10418701, + "step": 14243, + "time_per_iteration": 2.6213650703430176 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01263899, + "balance_loss_clip": 0.06271794, + "balance_loss_mlp": 0.0125472, + "epoch": 0.8563956110025552, + "flos": 24141262561920.0, + "grad_norm": 1.716514705669694, + "language_loss": 0.68232524, + "learning_rate": 2.123723375556974e-07, + "loss": 0.75900519, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09179688, + "step": 14244, + "time_per_iteration": 2.5382473468780518 + }, + { + "auxiliary_loss_clip": 0.06309429, + "auxiliary_loss_mlp": 0.01252704, + "balance_loss_clip": 0.06254511, + "balance_loss_mlp": 0.01251608, + "epoch": 0.8564557342552233, + "flos": 56289329072640.0, + "grad_norm": 0.7489817973332332, + "language_loss": 0.58483648, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.66045779, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.01098633, + "step": 14245, + "time_per_iteration": 4.431305170059204 + }, + { + "auxiliary_loss_clip": 0.06408551, + "auxiliary_loss_mlp": 0.01268725, + "balance_loss_clip": 0.06271117, + "balance_loss_mlp": 0.01258425, + "epoch": 0.8565158575078912, + "flos": 23447341774080.0, + "grad_norm": 1.5238034305670078, + "language_loss": 0.78042555, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.85719824, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10290527, + "step": 14246, + "time_per_iteration": 2.5076048374176025 + }, + { + "auxiliary_loss_clip": 0.06399557, + "auxiliary_loss_mlp": 0.01263061, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01253954, + "epoch": 0.8565759807605592, + "flos": 20382160366080.0, + "grad_norm": 1.895687760539362, + "language_loss": 0.81607592, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.8927021, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09100342, + "step": 14247, + "time_per_iteration": 3.8989782333374023 + }, + { + "auxiliary_loss_clip": 0.06403957, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06271452, + "balance_loss_mlp": 0.01256246, + "epoch": 0.8566361040132271, + "flos": 18813078408960.0, + "grad_norm": 1.6009384046261905, + "language_loss": 0.77626634, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.85297275, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10437012, + "step": 14248, + "time_per_iteration": 2.458406686782837 + }, + { + "auxiliary_loss_clip": 0.06403801, + "auxiliary_loss_mlp": 0.01262882, + "balance_loss_clip": 0.06269226, + "balance_loss_mlp": 0.01253131, + "epoch": 0.8566962272658951, + "flos": 24542289002880.0, + "grad_norm": 1.7603443054940122, + "language_loss": 0.78292143, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.85958827, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09747314, + "step": 14249, + "time_per_iteration": 2.5413098335266113 + }, + { + "auxiliary_loss_clip": 0.06403436, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06273547, + "balance_loss_mlp": 0.01254739, + "epoch": 0.856756350518563, + "flos": 23184062645760.0, + "grad_norm": 1.5958025284963269, + "language_loss": 0.78781301, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.86449027, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09552002, + "step": 14250, + "time_per_iteration": 2.5379374027252197 + }, + { + "auxiliary_loss_clip": 0.06401314, + "auxiliary_loss_mlp": 0.0126559, + "balance_loss_clip": 0.06273337, + "balance_loss_mlp": 0.01256906, + "epoch": 0.856816473771231, + "flos": 20814017909760.0, + "grad_norm": 1.6478543991539485, + "language_loss": 0.80071545, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.87738448, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08685303, + "step": 14251, + "time_per_iteration": 2.5280861854553223 + }, + { + "auxiliary_loss_clip": 0.06403105, + "auxiliary_loss_mlp": 0.01263534, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01254206, + "epoch": 0.856876597023899, + "flos": 20234057074560.0, + "grad_norm": 1.9560781121028739, + "language_loss": 0.61853564, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.69520199, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09320068, + "step": 14252, + "time_per_iteration": 2.5199599266052246 + }, + { + "auxiliary_loss_clip": 0.06406347, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06272375, + "balance_loss_mlp": 0.01256324, + "epoch": 0.856936720276567, + "flos": 18301991230080.0, + "grad_norm": 1.7507738475608288, + "language_loss": 0.6978209, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.77454877, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10113525, + "step": 14253, + "time_per_iteration": 2.5490400791168213 + }, + { + "auxiliary_loss_clip": 0.06309576, + "auxiliary_loss_mlp": 0.01251585, + "balance_loss_clip": 0.06254718, + "balance_loss_mlp": 0.01250532, + "epoch": 0.8569968435292349, + "flos": 69897547739520.0, + "grad_norm": 0.7701050589451736, + "language_loss": 0.59286332, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.66847491, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01053619, + "step": 14254, + "time_per_iteration": 3.209148645401001 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01254228, + "epoch": 0.8570569667819029, + "flos": 25855680625920.0, + "grad_norm": 2.258350207103323, + "language_loss": 0.81105256, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.88768154, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.109375, + "step": 14255, + "time_per_iteration": 2.5137405395507812 + }, + { + "auxiliary_loss_clip": 0.06400292, + "auxiliary_loss_mlp": 0.01262539, + "balance_loss_clip": 0.06272858, + "balance_loss_mlp": 0.01253539, + "epoch": 0.8571170900345708, + "flos": 23263627697280.0, + "grad_norm": 1.9053302406900494, + "language_loss": 0.67952186, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.75615019, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09008789, + "step": 14256, + "time_per_iteration": 2.532684326171875 + }, + { + "auxiliary_loss_clip": 0.06404783, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06271411, + "balance_loss_mlp": 0.01256543, + "epoch": 0.8571772132872388, + "flos": 18923851906560.0, + "grad_norm": 1.4788145502824088, + "language_loss": 0.70254731, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.77925813, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09759521, + "step": 14257, + "time_per_iteration": 2.4685792922973633 + }, + { + "auxiliary_loss_clip": 0.06404016, + "auxiliary_loss_mlp": 0.01260827, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.0125138, + "epoch": 0.8572373365399069, + "flos": 33257619077760.0, + "grad_norm": 5.167351592300506, + "language_loss": 0.77215445, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.84880286, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09442139, + "step": 14258, + "time_per_iteration": 2.5947256088256836 + }, + { + "auxiliary_loss_clip": 0.06404524, + "auxiliary_loss_mlp": 0.01264942, + "balance_loss_clip": 0.06275545, + "balance_loss_mlp": 0.01255441, + "epoch": 0.8572974597925748, + "flos": 23333633383680.0, + "grad_norm": 1.616211280257574, + "language_loss": 0.68083584, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.75753057, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0949707, + "step": 14259, + "time_per_iteration": 2.503953456878662 + }, + { + "auxiliary_loss_clip": 0.06403054, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01256961, + "epoch": 0.8573575830452428, + "flos": 24542875981440.0, + "grad_norm": 1.7496586618740582, + "language_loss": 0.77195299, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.8486551, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10192871, + "step": 14260, + "time_per_iteration": 2.5407462120056152 + }, + { + "auxiliary_loss_clip": 0.06405485, + "auxiliary_loss_mlp": 0.01263632, + "balance_loss_clip": 0.06271508, + "balance_loss_mlp": 0.01253619, + "epoch": 0.8574177062979107, + "flos": 24171422832000.0, + "grad_norm": 1.7543477262218912, + "language_loss": 0.74165386, + "learning_rate": 2.09413096654806e-07, + "loss": 0.81834501, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10015869, + "step": 14261, + "time_per_iteration": 2.4984147548675537 + }, + { + "auxiliary_loss_clip": 0.06407628, + "auxiliary_loss_mlp": 0.01265927, + "balance_loss_clip": 0.06272539, + "balance_loss_mlp": 0.01255139, + "epoch": 0.8574778295505787, + "flos": 17936449793280.0, + "grad_norm": 2.9359486176790686, + "language_loss": 0.79358846, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.87032402, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10784912, + "step": 14262, + "time_per_iteration": 2.4626708030700684 + }, + { + "auxiliary_loss_clip": 0.06400175, + "auxiliary_loss_mlp": 0.01267289, + "balance_loss_clip": 0.0627176, + "balance_loss_mlp": 0.01258098, + "epoch": 0.8575379528032466, + "flos": 21587252186880.0, + "grad_norm": 1.616838611011757, + "language_loss": 0.6784209, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.75509548, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09185791, + "step": 14263, + "time_per_iteration": 2.4902124404907227 + }, + { + "auxiliary_loss_clip": 0.06405489, + "auxiliary_loss_mlp": 0.01262847, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01253471, + "epoch": 0.8575980760559146, + "flos": 21767905589760.0, + "grad_norm": 1.9571137270825887, + "language_loss": 0.79872948, + "learning_rate": 2.088929137266986e-07, + "loss": 0.87541282, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09375, + "step": 14264, + "time_per_iteration": 2.5202577114105225 + }, + { + "auxiliary_loss_clip": 0.06404608, + "auxiliary_loss_mlp": 0.01269305, + "balance_loss_clip": 0.0627332, + "balance_loss_mlp": 0.01259978, + "epoch": 0.8576581993085826, + "flos": 34395011199360.0, + "grad_norm": 2.2143904362028644, + "language_loss": 0.69639301, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.77313221, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09326172, + "step": 14265, + "time_per_iteration": 2.612647771835327 + }, + { + "auxiliary_loss_clip": 0.06399523, + "auxiliary_loss_mlp": 0.012609, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01251942, + "epoch": 0.8577183225612506, + "flos": 23229316650240.0, + "grad_norm": 1.6733169528814695, + "language_loss": 0.65993267, + "learning_rate": 2.085464646918027e-07, + "loss": 0.73653686, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08966064, + "step": 14266, + "time_per_iteration": 2.5544586181640625 + }, + { + "auxiliary_loss_clip": 0.06401126, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06271696, + "balance_loss_mlp": 0.01255563, + "epoch": 0.8577784458139185, + "flos": 28811807544960.0, + "grad_norm": 1.5935040876679754, + "language_loss": 0.75452656, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.83118939, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09576416, + "step": 14267, + "time_per_iteration": 2.5590057373046875 + }, + { + "auxiliary_loss_clip": 0.06401159, + "auxiliary_loss_mlp": 0.01264336, + "balance_loss_clip": 0.06272185, + "balance_loss_mlp": 0.01255527, + "epoch": 0.8578385690665865, + "flos": 19761683281920.0, + "grad_norm": 1.584742251328993, + "language_loss": 0.87780321, + "learning_rate": 2.082002873852946e-07, + "loss": 0.95445812, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0880127, + "step": 14268, + "time_per_iteration": 2.525526523590088 + }, + { + "auxiliary_loss_clip": 0.06411083, + "auxiliary_loss_mlp": 0.01266639, + "balance_loss_clip": 0.06275931, + "balance_loss_mlp": 0.012569, + "epoch": 0.8578986923192544, + "flos": 20710330081920.0, + "grad_norm": 2.0171508570409173, + "language_loss": 0.7276274, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.80440462, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09747314, + "step": 14269, + "time_per_iteration": 3.9116053581237793 + }, + { + "auxiliary_loss_clip": 0.06407435, + "auxiliary_loss_mlp": 0.01264024, + "balance_loss_clip": 0.06273964, + "balance_loss_mlp": 0.0125438, + "epoch": 0.8579588155719224, + "flos": 36110645147520.0, + "grad_norm": 1.865981060297471, + "language_loss": 0.66775644, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.74447107, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09637451, + "step": 14270, + "time_per_iteration": 2.618803024291992 + }, + { + "auxiliary_loss_clip": 0.06402225, + "auxiliary_loss_mlp": 0.01263727, + "balance_loss_clip": 0.06272581, + "balance_loss_mlp": 0.01254542, + "epoch": 0.8580189388245905, + "flos": 22859540582400.0, + "grad_norm": 2.2948861169859525, + "language_loss": 0.73892224, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.8155818, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09179688, + "step": 14271, + "time_per_iteration": 2.497725486755371 + }, + { + "auxiliary_loss_clip": 0.06309859, + "auxiliary_loss_mlp": 0.01251844, + "balance_loss_clip": 0.06254922, + "balance_loss_mlp": 0.01250786, + "epoch": 0.8580790620772584, + "flos": 69664414152960.0, + "grad_norm": 0.7639484057926735, + "language_loss": 0.58678043, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.66239738, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01059723, + "step": 14272, + "time_per_iteration": 3.169260263442993 + }, + { + "auxiliary_loss_clip": 0.06409359, + "auxiliary_loss_mlp": 0.01262454, + "balance_loss_clip": 0.06272221, + "balance_loss_mlp": 0.01252566, + "epoch": 0.8581391853299264, + "flos": 13339306586880.0, + "grad_norm": 1.7586191821345811, + "language_loss": 0.75792611, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.8346442, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09887695, + "step": 14273, + "time_per_iteration": 2.478921890258789 + }, + { + "auxiliary_loss_clip": 0.06403127, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06271982, + "balance_loss_mlp": 0.01256489, + "epoch": 0.8581993085825943, + "flos": 19651664471040.0, + "grad_norm": 1.8547741547168304, + "language_loss": 0.82333291, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.9000203, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09118652, + "step": 14274, + "time_per_iteration": 2.478856086730957 + }, + { + "auxiliary_loss_clip": 0.06313574, + "auxiliary_loss_mlp": 0.01252106, + "balance_loss_clip": 0.06258807, + "balance_loss_mlp": 0.01251031, + "epoch": 0.8582594318352623, + "flos": 55840826494080.0, + "grad_norm": 3.2665895099659745, + "language_loss": 0.60961515, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.68527198, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01076508, + "step": 14275, + "time_per_iteration": 3.197674036026001 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01264154, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01254314, + "epoch": 0.8583195550879302, + "flos": 24286389033600.0, + "grad_norm": 2.04706011240556, + "language_loss": 0.59755701, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.67425668, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09844971, + "step": 14276, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.06402551, + "auxiliary_loss_mlp": 0.01264566, + "balance_loss_clip": 0.06271014, + "balance_loss_mlp": 0.01254559, + "epoch": 0.8583796783405983, + "flos": 13449283470720.0, + "grad_norm": 1.6940890444447256, + "language_loss": 0.76255608, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.8392272, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10009766, + "step": 14277, + "time_per_iteration": 2.541459083557129 + }, + { + "auxiliary_loss_clip": 0.06400612, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.06269176, + "balance_loss_mlp": 0.01256525, + "epoch": 0.8584398015932662, + "flos": 16185833965440.0, + "grad_norm": 1.51585453174595, + "language_loss": 0.84088707, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.91756225, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10369873, + "step": 14278, + "time_per_iteration": 2.4656083583831787 + }, + { + "auxiliary_loss_clip": 0.06406611, + "auxiliary_loss_mlp": 0.01267273, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01256717, + "epoch": 0.8584999248459342, + "flos": 17455061687040.0, + "grad_norm": 1.7809196500006463, + "language_loss": 0.74783373, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.82457256, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10552979, + "step": 14279, + "time_per_iteration": 2.515935182571411 + }, + { + "auxiliary_loss_clip": 0.06400705, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01256018, + "epoch": 0.8585600480986021, + "flos": 23447802971520.0, + "grad_norm": 2.7435347339803826, + "language_loss": 0.66660666, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.74326128, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08734131, + "step": 14280, + "time_per_iteration": 3.916933059692383 + }, + { + "auxiliary_loss_clip": 0.06399409, + "auxiliary_loss_mlp": 0.01264296, + "balance_loss_clip": 0.06269073, + "balance_loss_mlp": 0.01255195, + "epoch": 0.8586201713512701, + "flos": 19944055693440.0, + "grad_norm": 2.5855570213577588, + "language_loss": 0.62396699, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.70060408, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09106445, + "step": 14281, + "time_per_iteration": 2.505758762359619 + }, + { + "auxiliary_loss_clip": 0.06404914, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06272723, + "balance_loss_mlp": 0.0125488, + "epoch": 0.858680294603938, + "flos": 15310211598720.0, + "grad_norm": 1.5539909401541185, + "language_loss": 0.73079032, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.80748564, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09747314, + "step": 14282, + "time_per_iteration": 2.493986129760742 + }, + { + "auxiliary_loss_clip": 0.06398949, + "auxiliary_loss_mlp": 0.01260814, + "balance_loss_clip": 0.06269239, + "balance_loss_mlp": 0.01252117, + "epoch": 0.858740417856606, + "flos": 22717894055040.0, + "grad_norm": 1.8222049767211217, + "language_loss": 0.75866199, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.83525962, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08691406, + "step": 14283, + "time_per_iteration": 2.5006518363952637 + }, + { + "auxiliary_loss_clip": 0.06400195, + "auxiliary_loss_mlp": 0.01261844, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01252963, + "epoch": 0.8588005411092741, + "flos": 34062187582080.0, + "grad_norm": 1.6694102205368824, + "language_loss": 0.60190046, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.67852092, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08874512, + "step": 14284, + "time_per_iteration": 2.5979769229888916 + }, + { + "auxiliary_loss_clip": 0.06398802, + "auxiliary_loss_mlp": 0.01262388, + "balance_loss_clip": 0.06269779, + "balance_loss_mlp": 0.01253531, + "epoch": 0.858860664361942, + "flos": 28921239377280.0, + "grad_norm": 1.896816667575115, + "language_loss": 0.7606923, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.83730417, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08856201, + "step": 14285, + "time_per_iteration": 3.9742085933685303 + }, + { + "auxiliary_loss_clip": 0.06406308, + "auxiliary_loss_mlp": 0.01264594, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01254682, + "epoch": 0.85892078761461, + "flos": 19798719586560.0, + "grad_norm": 1.6271590224898915, + "language_loss": 0.74210882, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.81881779, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09906006, + "step": 14286, + "time_per_iteration": 3.961693048477173 + }, + { + "auxiliary_loss_clip": 0.06312392, + "auxiliary_loss_mlp": 0.01251687, + "balance_loss_clip": 0.06257644, + "balance_loss_mlp": 0.01250683, + "epoch": 0.8589809108672779, + "flos": 67125512240640.0, + "grad_norm": 0.7443163222732918, + "language_loss": 0.49355024, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.56919104, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01003265, + "step": 14287, + "time_per_iteration": 3.081287145614624 + }, + { + "auxiliary_loss_clip": 0.06402116, + "auxiliary_loss_mlp": 0.01263241, + "balance_loss_clip": 0.06269466, + "balance_loss_mlp": 0.01253323, + "epoch": 0.8590410341199459, + "flos": 29724046945920.0, + "grad_norm": 1.7960694275427957, + "language_loss": 0.79450381, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.87115741, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09912109, + "step": 14288, + "time_per_iteration": 2.549579381942749 + }, + { + "auxiliary_loss_clip": 0.06406873, + "auxiliary_loss_mlp": 0.0126658, + "balance_loss_clip": 0.06272471, + "balance_loss_mlp": 0.01256197, + "epoch": 0.8591011573726138, + "flos": 23994165519360.0, + "grad_norm": 1.8099062195023483, + "language_loss": 0.81317496, + "learning_rate": 2.045818444528553e-07, + "loss": 0.88990951, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10375977, + "step": 14289, + "time_per_iteration": 2.532503366470337 + }, + { + "auxiliary_loss_clip": 0.06402125, + "auxiliary_loss_mlp": 0.01263769, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01254054, + "epoch": 0.8591612806252819, + "flos": 14433876472320.0, + "grad_norm": 1.6143264802543886, + "language_loss": 0.6542815, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.7309404, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09710693, + "step": 14290, + "time_per_iteration": 2.4673476219177246 + }, + { + "auxiliary_loss_clip": 0.06405544, + "auxiliary_loss_mlp": 0.01262804, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.012526, + "epoch": 0.8592214038779498, + "flos": 31585268563200.0, + "grad_norm": 2.147386540857062, + "language_loss": 0.5574224, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.63410592, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10205078, + "step": 14291, + "time_per_iteration": 2.634934425354004 + }, + { + "auxiliary_loss_clip": 0.06403403, + "auxiliary_loss_mlp": 0.01264218, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.0125427, + "epoch": 0.8592815271306178, + "flos": 17463069751680.0, + "grad_norm": 2.0257150352321256, + "language_loss": 0.71959877, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.79627502, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09954834, + "step": 14292, + "time_per_iteration": 2.4553961753845215 + }, + { + "auxiliary_loss_clip": 0.06403185, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06270117, + "balance_loss_mlp": 0.01253929, + "epoch": 0.8593416503832857, + "flos": 25418498348160.0, + "grad_norm": 1.3381246650209893, + "language_loss": 0.71274585, + "learning_rate": 2.038960195018542e-07, + "loss": 0.78940934, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09228516, + "step": 14293, + "time_per_iteration": 2.56117844581604 + }, + { + "auxiliary_loss_clip": 0.06400074, + "auxiliary_loss_mlp": 0.01261361, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01252629, + "epoch": 0.8594017736359537, + "flos": 21003056720640.0, + "grad_norm": 3.825132104527405, + "language_loss": 0.68924177, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.76585615, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08721924, + "step": 14294, + "time_per_iteration": 2.4963736534118652 + }, + { + "auxiliary_loss_clip": 0.06396788, + "auxiliary_loss_mlp": 0.01263426, + "balance_loss_clip": 0.06268485, + "balance_loss_mlp": 0.01254218, + "epoch": 0.8594618968886216, + "flos": 22097626606080.0, + "grad_norm": 1.805212015136028, + "language_loss": 0.78444296, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.86104512, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09204102, + "step": 14295, + "time_per_iteration": 2.5134646892547607 + }, + { + "auxiliary_loss_clip": 0.06408249, + "auxiliary_loss_mlp": 0.01265112, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01253948, + "epoch": 0.8595220201412896, + "flos": 11661086286720.0, + "grad_norm": 2.7942491090682213, + "language_loss": 0.69070399, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.76743758, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11151123, + "step": 14296, + "time_per_iteration": 2.475787878036499 + }, + { + "auxiliary_loss_clip": 0.0640314, + "auxiliary_loss_mlp": 0.01263171, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01253497, + "epoch": 0.8595821433939577, + "flos": 25046416293120.0, + "grad_norm": 1.9233061484509495, + "language_loss": 0.79669362, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.87335676, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09674072, + "step": 14297, + "time_per_iteration": 2.5401291847229004 + }, + { + "auxiliary_loss_clip": 0.06396289, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06267644, + "balance_loss_mlp": 0.01259951, + "epoch": 0.8596422666466256, + "flos": 28518997052160.0, + "grad_norm": 2.2682977179383372, + "language_loss": 0.68144363, + "learning_rate": 2.030402708016954e-07, + "loss": 0.75809014, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08410645, + "step": 14298, + "time_per_iteration": 2.5733871459960938 + }, + { + "auxiliary_loss_clip": 0.06398705, + "auxiliary_loss_mlp": 0.01260865, + "balance_loss_clip": 0.06270595, + "balance_loss_mlp": 0.01251913, + "epoch": 0.8597023898992936, + "flos": 13594158380160.0, + "grad_norm": 1.9854858480921735, + "language_loss": 0.68880069, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.76539636, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08947754, + "step": 14299, + "time_per_iteration": 2.488328456878662 + }, + { + "auxiliary_loss_clip": 0.06404358, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01254454, + "epoch": 0.8597625131519615, + "flos": 32308049882880.0, + "grad_norm": 2.1252767779815374, + "language_loss": 0.71345496, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.79014063, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09753418, + "step": 14300, + "time_per_iteration": 2.5601115226745605 + }, + { + "auxiliary_loss_clip": 0.06400272, + "auxiliary_loss_mlp": 0.01267131, + "balance_loss_clip": 0.06269163, + "balance_loss_mlp": 0.01258274, + "epoch": 0.8598226364046295, + "flos": 28737860716800.0, + "grad_norm": 1.7436356561716806, + "language_loss": 0.6957137, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.77238768, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08856201, + "step": 14301, + "time_per_iteration": 2.6039092540740967 + }, + { + "auxiliary_loss_clip": 0.06402557, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06270393, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8598827596572974, + "flos": 21878301744000.0, + "grad_norm": 1.5832191765924557, + "language_loss": 0.74322796, + "learning_rate": 2.023568983386641e-07, + "loss": 0.81991243, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09228516, + "step": 14302, + "time_per_iteration": 2.4957993030548096 + }, + { + "auxiliary_loss_clip": 0.06400271, + "auxiliary_loss_mlp": 0.01260712, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01251855, + "epoch": 0.8599428829099655, + "flos": 23773792481280.0, + "grad_norm": 1.75128895706435, + "language_loss": 0.83832628, + "learning_rate": 2.02186225623733e-07, + "loss": 0.91493607, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08856201, + "step": 14303, + "time_per_iteration": 2.522888660430908 + }, + { + "auxiliary_loss_clip": 0.06405427, + "auxiliary_loss_mlp": 0.01264688, + "balance_loss_clip": 0.06271775, + "balance_loss_mlp": 0.01254543, + "epoch": 0.8600030061626334, + "flos": 16217671317120.0, + "grad_norm": 2.5248591398182327, + "language_loss": 0.7718581, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.84855914, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10137939, + "step": 14304, + "time_per_iteration": 2.4513118267059326 + }, + { + "auxiliary_loss_clip": 0.06403493, + "auxiliary_loss_mlp": 0.01262423, + "balance_loss_clip": 0.06271586, + "balance_loss_mlp": 0.0125201, + "epoch": 0.8600631294153014, + "flos": 15674830640640.0, + "grad_norm": 2.4458831318070815, + "language_loss": 0.54347569, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.62013483, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10424805, + "step": 14305, + "time_per_iteration": 2.4636104106903076 + }, + { + "auxiliary_loss_clip": 0.06401916, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 0.06271758, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8601232526679693, + "flos": 17498764391040.0, + "grad_norm": 1.7675730532667615, + "language_loss": 0.83626974, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.9129535, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.1027832, + "step": 14306, + "time_per_iteration": 2.438267230987549 + }, + { + "auxiliary_loss_clip": 0.0639829, + "auxiliary_loss_mlp": 0.01261955, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01252764, + "epoch": 0.8601833759206373, + "flos": 26994288631680.0, + "grad_norm": 1.2962192910177055, + "language_loss": 0.71717322, + "learning_rate": 2.01504216561474e-07, + "loss": 0.79377568, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09191895, + "step": 14307, + "time_per_iteration": 2.592008590698242 + }, + { + "auxiliary_loss_clip": 0.06409558, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06273729, + "balance_loss_mlp": 0.01258418, + "epoch": 0.8602434991733052, + "flos": 25237006404480.0, + "grad_norm": 1.5952354561078483, + "language_loss": 0.64001, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.71679354, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10369873, + "step": 14308, + "time_per_iteration": 3.9432108402252197 + }, + { + "auxiliary_loss_clip": 0.06312782, + "auxiliary_loss_mlp": 0.01249453, + "balance_loss_clip": 0.06257753, + "balance_loss_mlp": 0.01248612, + "epoch": 0.8603036224259732, + "flos": 71035694547840.0, + "grad_norm": 0.693627555027915, + "language_loss": 0.48403317, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.55965549, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00842285, + "step": 14309, + "time_per_iteration": 3.236663579940796 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06271836, + "balance_loss_mlp": 0.01253011, + "epoch": 0.8603637456786413, + "flos": 20306452602240.0, + "grad_norm": 5.430428245021858, + "language_loss": 0.6706562, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.74734735, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10888672, + "step": 14310, + "time_per_iteration": 2.5191948413848877 + }, + { + "auxiliary_loss_clip": 0.06399269, + "auxiliary_loss_mlp": 0.01264383, + "balance_loss_clip": 0.06266133, + "balance_loss_mlp": 0.01255657, + "epoch": 0.8604238689313092, + "flos": 21842397469440.0, + "grad_norm": 1.7447011135153685, + "language_loss": 0.78432125, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.8609578, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0871582, + "step": 14311, + "time_per_iteration": 2.5042197704315186 + }, + { + "auxiliary_loss_clip": 0.06401919, + "auxiliary_loss_mlp": 0.01263334, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01253904, + "epoch": 0.8604839921839772, + "flos": 18010019278080.0, + "grad_norm": 1.955815230439552, + "language_loss": 0.71597105, + "learning_rate": 2.006532397626639e-07, + "loss": 0.79262364, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09429932, + "step": 14312, + "time_per_iteration": 2.5219128131866455 + }, + { + "auxiliary_loss_clip": 0.06400298, + "auxiliary_loss_mlp": 0.01265117, + "balance_loss_clip": 0.06270005, + "balance_loss_mlp": 0.01255586, + "epoch": 0.8605441154366451, + "flos": 16257558660480.0, + "grad_norm": 1.7707114111635922, + "language_loss": 0.78253788, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.85919207, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09527588, + "step": 14313, + "time_per_iteration": 2.493755340576172 + }, + { + "auxiliary_loss_clip": 0.06400809, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01255837, + "epoch": 0.8606042386893131, + "flos": 32274745084800.0, + "grad_norm": 1.4922872578644866, + "language_loss": 0.72934496, + "learning_rate": 2.003133266178474e-07, + "loss": 0.80601239, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.10101318, + "step": 14314, + "time_per_iteration": 2.621281862258911 + }, + { + "auxiliary_loss_clip": 0.06400359, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06269641, + "balance_loss_mlp": 0.01253687, + "epoch": 0.860664361941981, + "flos": 20235943791360.0, + "grad_norm": 1.7275534208829755, + "language_loss": 0.69404042, + "learning_rate": 2.001434724086657e-07, + "loss": 0.77067709, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09619141, + "step": 14315, + "time_per_iteration": 2.4812421798706055 + }, + { + "auxiliary_loss_clip": 0.06402497, + "auxiliary_loss_mlp": 0.01266885, + "balance_loss_clip": 0.06271563, + "balance_loss_mlp": 0.01257789, + "epoch": 0.8607244851946491, + "flos": 25198586507520.0, + "grad_norm": 1.8449394172666267, + "language_loss": 0.71876442, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.79545832, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09088135, + "step": 14316, + "time_per_iteration": 2.5461459159851074 + }, + { + "auxiliary_loss_clip": 0.06405434, + "auxiliary_loss_mlp": 0.01266236, + "balance_loss_clip": 0.06272785, + "balance_loss_mlp": 0.0125664, + "epoch": 0.860784608447317, + "flos": 20487776837760.0, + "grad_norm": 1.8785897277558985, + "language_loss": 0.8305161, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.90723282, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09588623, + "step": 14317, + "time_per_iteration": 2.4942269325256348 + }, + { + "auxiliary_loss_clip": 0.06398265, + "auxiliary_loss_mlp": 0.01264212, + "balance_loss_clip": 0.06269276, + "balance_loss_mlp": 0.01255129, + "epoch": 0.860844731699985, + "flos": 50487192627840.0, + "grad_norm": 1.57272546994991, + "language_loss": 0.67247951, + "learning_rate": 1.996343193113108e-07, + "loss": 0.74910432, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09082031, + "step": 14318, + "time_per_iteration": 2.753952980041504 + }, + { + "auxiliary_loss_clip": 0.06399272, + "auxiliary_loss_mlp": 0.01259503, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01250891, + "epoch": 0.8609048549526529, + "flos": 41182468133760.0, + "grad_norm": 1.5269464521671718, + "language_loss": 0.71332115, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.78990889, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08612061, + "step": 14319, + "time_per_iteration": 2.6694722175598145 + }, + { + "auxiliary_loss_clip": 0.06406449, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06272025, + "balance_loss_mlp": 0.01253565, + "epoch": 0.8609649782053209, + "flos": 23957967755520.0, + "grad_norm": 1.5943400470171931, + "language_loss": 0.67575115, + "learning_rate": 1.992952252525839e-07, + "loss": 0.75244617, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09484863, + "step": 14320, + "time_per_iteration": 3.9435226917266846 + }, + { + "auxiliary_loss_clip": 0.06404917, + "auxiliary_loss_mlp": 0.01263639, + "balance_loss_clip": 0.06270186, + "balance_loss_mlp": 0.01254036, + "epoch": 0.8610251014579888, + "flos": 23119297839360.0, + "grad_norm": 2.410508268349302, + "language_loss": 0.80603713, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.88272274, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09606934, + "step": 14321, + "time_per_iteration": 2.55540132522583 + }, + { + "auxiliary_loss_clip": 0.06397673, + "auxiliary_loss_mlp": 0.01266501, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01257114, + "epoch": 0.8610852247106568, + "flos": 19432800806400.0, + "grad_norm": 1.7626385906380733, + "language_loss": 0.71308374, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.78972548, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.09387207, + "step": 14322, + "time_per_iteration": 2.454256772994995 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01256369, + "epoch": 0.8611453479633249, + "flos": 19317163772160.0, + "grad_norm": 1.7944348088812987, + "language_loss": 0.56349087, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.64024693, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10516357, + "step": 14323, + "time_per_iteration": 2.502837896347046 + }, + { + "auxiliary_loss_clip": 0.06400344, + "auxiliary_loss_mlp": 0.01266337, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.0125736, + "epoch": 0.8612054712159928, + "flos": 23259602701440.0, + "grad_norm": 1.6798995165774648, + "language_loss": 0.7580722, + "learning_rate": 1.986178565813801e-07, + "loss": 0.83473897, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08972168, + "step": 14324, + "time_per_iteration": 3.954850912094116 + }, + { + "auxiliary_loss_clip": 0.06402896, + "auxiliary_loss_mlp": 0.01263656, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01253416, + "epoch": 0.8612655944686608, + "flos": 16032992918400.0, + "grad_norm": 2.3205040233098866, + "language_loss": 0.66479814, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.74146366, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10235596, + "step": 14325, + "time_per_iteration": 2.5524306297302246 + }, + { + "auxiliary_loss_clip": 0.06404022, + "auxiliary_loss_mlp": 0.01264163, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8613257177213287, + "flos": 22499407733760.0, + "grad_norm": 2.0036509537419964, + "language_loss": 0.65199041, + "learning_rate": 1.982795820716472e-07, + "loss": 0.72867227, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09765625, + "step": 14326, + "time_per_iteration": 3.9491071701049805 + }, + { + "auxiliary_loss_clip": 0.06404285, + "auxiliary_loss_mlp": 0.01265148, + "balance_loss_clip": 0.06272285, + "balance_loss_mlp": 0.01255719, + "epoch": 0.8613858409739967, + "flos": 17243744889600.0, + "grad_norm": 2.07355797106235, + "language_loss": 0.84871465, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.92540902, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09436035, + "step": 14327, + "time_per_iteration": 2.4626643657684326 + }, + { + "auxiliary_loss_clip": 0.06403395, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01255793, + "epoch": 0.8614459642266646, + "flos": 22827870938880.0, + "grad_norm": 1.9454555931500424, + "language_loss": 0.75197828, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.82866651, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09631348, + "step": 14328, + "time_per_iteration": 2.5205399990081787 + }, + { + "auxiliary_loss_clip": 0.06400015, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06271035, + "balance_loss_mlp": 0.01255574, + "epoch": 0.8615060874793327, + "flos": 26511349224960.0, + "grad_norm": 1.7079556862754726, + "language_loss": 0.80290902, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.87955445, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08959961, + "step": 14329, + "time_per_iteration": 2.5383529663085938 + }, + { + "auxiliary_loss_clip": 0.06400427, + "auxiliary_loss_mlp": 0.01268007, + "balance_loss_clip": 0.06269085, + "balance_loss_mlp": 0.01258679, + "epoch": 0.8615662107320006, + "flos": 24067860785280.0, + "grad_norm": 2.856996046278892, + "language_loss": 0.76966333, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.84634769, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09326172, + "step": 14330, + "time_per_iteration": 2.523453950881958 + }, + { + "auxiliary_loss_clip": 0.06401514, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01254405, + "epoch": 0.8616263339846686, + "flos": 24171003561600.0, + "grad_norm": 1.7944132766223935, + "language_loss": 0.65172309, + "learning_rate": 1.974350915342702e-07, + "loss": 0.72837293, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09069824, + "step": 14331, + "time_per_iteration": 2.494178533554077 + }, + { + "auxiliary_loss_clip": 0.06397793, + "auxiliary_loss_mlp": 0.01264862, + "balance_loss_clip": 0.06269865, + "balance_loss_mlp": 0.01256314, + "epoch": 0.8616864572373365, + "flos": 21730533868800.0, + "grad_norm": 2.0612116619003933, + "language_loss": 0.76773548, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.84436202, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08544922, + "step": 14332, + "time_per_iteration": 2.5452775955200195 + }, + { + "auxiliary_loss_clip": 0.06406568, + "auxiliary_loss_mlp": 0.01265905, + "balance_loss_clip": 0.0627177, + "balance_loss_mlp": 0.012556, + "epoch": 0.8617465804900045, + "flos": 23773037794560.0, + "grad_norm": 2.589644465810543, + "language_loss": 0.66962624, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.74635088, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10321045, + "step": 14333, + "time_per_iteration": 2.507899761199951 + }, + { + "auxiliary_loss_clip": 0.06408904, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06271466, + "balance_loss_mlp": 0.01255365, + "epoch": 0.8618067037426724, + "flos": 37712612632320.0, + "grad_norm": 2.0727942750697244, + "language_loss": 0.62088275, + "learning_rate": 1.969292174019157e-07, + "loss": 0.69763291, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10736084, + "step": 14334, + "time_per_iteration": 2.727379322052002 + }, + { + "auxiliary_loss_clip": 0.06409249, + "auxiliary_loss_mlp": 0.01266887, + "balance_loss_clip": 0.06273654, + "balance_loss_mlp": 0.01256861, + "epoch": 0.8618668269953405, + "flos": 21477526865280.0, + "grad_norm": 2.2092003237507627, + "language_loss": 0.69843465, + "learning_rate": 1.967607294278577e-07, + "loss": 0.77519608, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10028076, + "step": 14335, + "time_per_iteration": 2.5096664428710938 + }, + { + "auxiliary_loss_clip": 0.0640562, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01257927, + "epoch": 0.8619269502480085, + "flos": 22238560373760.0, + "grad_norm": 1.539528781413438, + "language_loss": 0.83133399, + "learning_rate": 1.965923098328135e-07, + "loss": 0.90806651, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09710693, + "step": 14336, + "time_per_iteration": 2.534871816635132 + }, + { + "auxiliary_loss_clip": 0.06407534, + "auxiliary_loss_mlp": 0.01266904, + "balance_loss_clip": 0.06270752, + "balance_loss_mlp": 0.01257725, + "epoch": 0.8619870735006764, + "flos": 22717181295360.0, + "grad_norm": 1.7880701547963709, + "language_loss": 0.67198873, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.74873316, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09179688, + "step": 14337, + "time_per_iteration": 2.4912726879119873 + }, + { + "auxiliary_loss_clip": 0.06400966, + "auxiliary_loss_mlp": 0.0126898, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.01259574, + "epoch": 0.8620471967533444, + "flos": 37528730847360.0, + "grad_norm": 1.850620303251151, + "language_loss": 0.67287397, + "learning_rate": 1.962556758053089e-07, + "loss": 0.74957347, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09411621, + "step": 14338, + "time_per_iteration": 2.630582571029663 + }, + { + "auxiliary_loss_clip": 0.06403364, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06270847, + "balance_loss_mlp": 0.01255693, + "epoch": 0.8621073200060123, + "flos": 19688533067520.0, + "grad_norm": 1.6865658257552463, + "language_loss": 0.62323976, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.69991934, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08905029, + "step": 14339, + "time_per_iteration": 2.4896788597106934 + }, + { + "auxiliary_loss_clip": 0.06398258, + "auxiliary_loss_mlp": 0.01261212, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01252009, + "epoch": 0.8621674432586803, + "flos": 14541882785280.0, + "grad_norm": 1.789087653765178, + "language_loss": 0.62707412, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.70366883, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09197998, + "step": 14340, + "time_per_iteration": 2.514129400253296 + }, + { + "auxiliary_loss_clip": 0.06398233, + "auxiliary_loss_mlp": 0.01270527, + "balance_loss_clip": 0.06275177, + "balance_loss_mlp": 0.01261962, + "epoch": 0.8622275665113482, + "flos": 20746276283520.0, + "grad_norm": 1.55822601807664, + "language_loss": 0.79994321, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.87663078, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.08569336, + "step": 14341, + "time_per_iteration": 2.4803621768951416 + }, + { + "auxiliary_loss_clip": 0.06398244, + "auxiliary_loss_mlp": 0.0126392, + "balance_loss_clip": 0.0626985, + "balance_loss_mlp": 0.01254693, + "epoch": 0.8622876897640163, + "flos": 24722565062400.0, + "grad_norm": 1.537556068716055, + "language_loss": 0.75025547, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.82687712, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09234619, + "step": 14342, + "time_per_iteration": 2.530914545059204 + }, + { + "auxiliary_loss_clip": 0.06404229, + "auxiliary_loss_mlp": 0.0126642, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01255841, + "epoch": 0.8623478130166842, + "flos": 17463153605760.0, + "grad_norm": 6.2322370911509815, + "language_loss": 0.68703187, + "learning_rate": 1.95415287816028e-07, + "loss": 0.76373827, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10577393, + "step": 14343, + "time_per_iteration": 2.5865073204040527 + }, + { + "auxiliary_loss_clip": 0.06402855, + "auxiliary_loss_mlp": 0.01268855, + "balance_loss_clip": 0.06271795, + "balance_loss_mlp": 0.0125914, + "epoch": 0.8624079362693522, + "flos": 18114252157440.0, + "grad_norm": 1.6558360016746088, + "language_loss": 0.68030214, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.75701928, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09716797, + "step": 14344, + "time_per_iteration": 2.537827730178833 + }, + { + "auxiliary_loss_clip": 0.06405529, + "auxiliary_loss_mlp": 0.01265965, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01256232, + "epoch": 0.8624680595220201, + "flos": 30674664316800.0, + "grad_norm": 1.3739709083227454, + "language_loss": 0.81833351, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.8950485, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09735107, + "step": 14345, + "time_per_iteration": 2.5870211124420166 + }, + { + "auxiliary_loss_clip": 0.06403511, + "auxiliary_loss_mlp": 0.0126453, + "balance_loss_clip": 0.06269494, + "balance_loss_mlp": 0.01254606, + "epoch": 0.8625281827746881, + "flos": 38007771039360.0, + "grad_norm": 1.9821482587026948, + "language_loss": 0.51161534, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.58829576, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09918213, + "step": 14346, + "time_per_iteration": 2.6315839290618896 + }, + { + "auxiliary_loss_clip": 0.06401588, + "auxiliary_loss_mlp": 0.01266829, + "balance_loss_clip": 0.06270475, + "balance_loss_mlp": 0.01257567, + "epoch": 0.862588306027356, + "flos": 26256874775040.0, + "grad_norm": 1.4328060153446618, + "language_loss": 0.75362718, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.8303113, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 14347, + "time_per_iteration": 2.541715383529663 + }, + { + "auxiliary_loss_clip": 0.06404621, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.0125667, + "epoch": 0.862648429280024, + "flos": 25884876574080.0, + "grad_norm": 1.79527283779547, + "language_loss": 0.80723387, + "learning_rate": 1.945766105774449e-07, + "loss": 0.88394725, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10046387, + "step": 14348, + "time_per_iteration": 3.9310317039489746 + }, + { + "auxiliary_loss_clip": 0.06397234, + "auxiliary_loss_mlp": 0.01262407, + "balance_loss_clip": 0.06269418, + "balance_loss_mlp": 0.01253526, + "epoch": 0.862708552532692, + "flos": 37825608263040.0, + "grad_norm": 1.5979162095683273, + "language_loss": 0.664671, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.74126744, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08886719, + "step": 14349, + "time_per_iteration": 2.667307138442993 + }, + { + "auxiliary_loss_clip": 0.06402015, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01257206, + "epoch": 0.86276867578536, + "flos": 19096623025920.0, + "grad_norm": 2.3630117925707474, + "language_loss": 0.70285714, + "learning_rate": 1.942416188703573e-07, + "loss": 0.77954423, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09484863, + "step": 14350, + "time_per_iteration": 2.4467716217041016 + }, + { + "auxiliary_loss_clip": 0.06401723, + "auxiliary_loss_mlp": 0.01264798, + "balance_loss_clip": 0.06270139, + "balance_loss_mlp": 0.0125482, + "epoch": 0.862828799038028, + "flos": 22170902601600.0, + "grad_norm": 1.8189959040488348, + "language_loss": 0.77373683, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.85040212, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09979248, + "step": 14351, + "time_per_iteration": 2.5256404876708984 + }, + { + "auxiliary_loss_clip": 0.06401232, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.0125622, + "epoch": 0.8628889222906959, + "flos": 23151722169600.0, + "grad_norm": 2.066153736176063, + "language_loss": 0.84886032, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.92552912, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09436035, + "step": 14352, + "time_per_iteration": 2.4913690090179443 + }, + { + "auxiliary_loss_clip": 0.06313725, + "auxiliary_loss_mlp": 0.01255388, + "balance_loss_clip": 0.06258518, + "balance_loss_mlp": 0.01254317, + "epoch": 0.8629490455433639, + "flos": 57837600489600.0, + "grad_norm": 0.7926925126054749, + "language_loss": 0.61875582, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.69444704, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0107193, + "step": 14353, + "time_per_iteration": 3.177020788192749 + }, + { + "auxiliary_loss_clip": 0.06400892, + "auxiliary_loss_mlp": 0.01265779, + "balance_loss_clip": 0.06272262, + "balance_loss_mlp": 0.01257202, + "epoch": 0.8630091687960318, + "flos": 15924315772800.0, + "grad_norm": 1.6311074355718687, + "language_loss": 0.81987357, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.89654028, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 14354, + "time_per_iteration": 2.4930381774902344 + }, + { + "auxiliary_loss_clip": 0.06401116, + "auxiliary_loss_mlp": 0.0126246, + "balance_loss_clip": 0.06269745, + "balance_loss_mlp": 0.01252792, + "epoch": 0.8630692920486999, + "flos": 17966484282240.0, + "grad_norm": 1.7753060969616925, + "language_loss": 0.85697293, + "learning_rate": 1.934053380181031e-07, + "loss": 0.93360865, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09667969, + "step": 14355, + "time_per_iteration": 2.53157901763916 + }, + { + "auxiliary_loss_clip": 0.06404698, + "auxiliary_loss_mlp": 0.01264579, + "balance_loss_clip": 0.0627116, + "balance_loss_mlp": 0.01254684, + "epoch": 0.8631294153013678, + "flos": 22461658669440.0, + "grad_norm": 2.081321104089011, + "language_loss": 0.58636671, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.66305947, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09893799, + "step": 14356, + "time_per_iteration": 2.4832444190979004 + }, + { + "auxiliary_loss_clip": 0.06409314, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06271717, + "balance_loss_mlp": 0.01254502, + "epoch": 0.8631895385540358, + "flos": 16842676521600.0, + "grad_norm": 2.3858514635605945, + "language_loss": 0.7736609, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.85040665, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10754395, + "step": 14357, + "time_per_iteration": 2.5092248916625977 + }, + { + "auxiliary_loss_clip": 0.06403995, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01256227, + "epoch": 0.8632496618067037, + "flos": 18703101525120.0, + "grad_norm": 2.0189776853182906, + "language_loss": 0.7785663, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.85526627, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09765625, + "step": 14358, + "time_per_iteration": 2.4995810985565186 + }, + { + "auxiliary_loss_clip": 0.06403126, + "auxiliary_loss_mlp": 0.0126148, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01252575, + "epoch": 0.8633097850593717, + "flos": 24286808304000.0, + "grad_norm": 1.2976012595497113, + "language_loss": 0.75020969, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.82685572, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08905029, + "step": 14359, + "time_per_iteration": 3.9602229595184326 + }, + { + "auxiliary_loss_clip": 0.06394325, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.06267578, + "balance_loss_mlp": 0.01256204, + "epoch": 0.8633699083120396, + "flos": 21184926007680.0, + "grad_norm": 1.9803616638517643, + "language_loss": 0.70742667, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.78401971, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08776855, + "step": 14360, + "time_per_iteration": 2.5267932415008545 + }, + { + "auxiliary_loss_clip": 0.06407928, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06273921, + "balance_loss_mlp": 0.01254055, + "epoch": 0.8634300315647077, + "flos": 19250931519360.0, + "grad_norm": 1.9077703243953956, + "language_loss": 0.76441604, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.84114009, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10424805, + "step": 14361, + "time_per_iteration": 2.490943193435669 + }, + { + "auxiliary_loss_clip": 0.06311232, + "auxiliary_loss_mlp": 0.0125435, + "balance_loss_clip": 0.062562, + "balance_loss_mlp": 0.01253265, + "epoch": 0.8634901548173756, + "flos": 66214572577920.0, + "grad_norm": 0.8650846774823281, + "language_loss": 0.586797, + "learning_rate": 1.922374222645329e-07, + "loss": 0.66245276, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01087189, + "step": 14362, + "time_per_iteration": 3.1930222511291504 + }, + { + "auxiliary_loss_clip": 0.06408567, + "auxiliary_loss_mlp": 0.01271559, + "balance_loss_clip": 0.06273866, + "balance_loss_mlp": 0.01261497, + "epoch": 0.8635502780700436, + "flos": 24796302255360.0, + "grad_norm": 1.6117142175408212, + "language_loss": 0.80565244, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.88245368, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10064697, + "step": 14363, + "time_per_iteration": 2.5488052368164062 + }, + { + "auxiliary_loss_clip": 0.06405362, + "auxiliary_loss_mlp": 0.01264899, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01254325, + "epoch": 0.8636104013227116, + "flos": 25196657863680.0, + "grad_norm": 3.2911093464095376, + "language_loss": 0.7295658, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.80626839, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10577393, + "step": 14364, + "time_per_iteration": 3.9698781967163086 + }, + { + "auxiliary_loss_clip": 0.0640732, + "auxiliary_loss_mlp": 0.01263265, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01253978, + "epoch": 0.8636705245753795, + "flos": 23885236811520.0, + "grad_norm": 1.455571022027207, + "language_loss": 0.7167381, + "learning_rate": 1.917379150731755e-07, + "loss": 0.79344392, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09295654, + "step": 14365, + "time_per_iteration": 3.9607086181640625 + }, + { + "auxiliary_loss_clip": 0.06408954, + "auxiliary_loss_mlp": 0.01268552, + "balance_loss_clip": 0.06272392, + "balance_loss_mlp": 0.01257472, + "epoch": 0.8637306478280475, + "flos": 23116824144000.0, + "grad_norm": 1.9610886432207495, + "language_loss": 0.71209329, + "learning_rate": 1.915715498065993e-07, + "loss": 0.78886831, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11090088, + "step": 14366, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.0639744, + "auxiliary_loss_mlp": 0.01266839, + "balance_loss_clip": 0.06268862, + "balance_loss_mlp": 0.01258071, + "epoch": 0.8637907710807154, + "flos": 21913032061440.0, + "grad_norm": 3.9077232982556196, + "language_loss": 0.81972671, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.89636946, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08764648, + "step": 14367, + "time_per_iteration": 2.5068411827087402 + }, + { + "auxiliary_loss_clip": 0.0640227, + "auxiliary_loss_mlp": 0.0126336, + "balance_loss_clip": 0.06269377, + "balance_loss_mlp": 0.01253269, + "epoch": 0.8638508943333835, + "flos": 23586263043840.0, + "grad_norm": 2.019581069105167, + "language_loss": 0.6210227, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.69767898, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10095215, + "step": 14368, + "time_per_iteration": 2.502528429031372 + }, + { + "auxiliary_loss_clip": 0.06402116, + "auxiliary_loss_mlp": 0.01265082, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8639110175860514, + "flos": 25782991608960.0, + "grad_norm": 1.8655713260907347, + "language_loss": 0.76021969, + "learning_rate": 1.91072865486821e-07, + "loss": 0.83689165, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09661865, + "step": 14369, + "time_per_iteration": 2.5583889484405518 + }, + { + "auxiliary_loss_clip": 0.06405649, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06269823, + "balance_loss_mlp": 0.01257409, + "epoch": 0.8639711408387194, + "flos": 23376455619840.0, + "grad_norm": 1.8041581348752054, + "language_loss": 0.64473277, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.72147083, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10754395, + "step": 14370, + "time_per_iteration": 2.523294687271118 + }, + { + "auxiliary_loss_clip": 0.06406188, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01256085, + "epoch": 0.8640312640913873, + "flos": 22133740515840.0, + "grad_norm": 1.5680829975837718, + "language_loss": 0.66822445, + "learning_rate": 1.907407522366209e-07, + "loss": 0.744941, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09381104, + "step": 14371, + "time_per_iteration": 2.529430389404297 + }, + { + "auxiliary_loss_clip": 0.06313685, + "auxiliary_loss_mlp": 0.01251782, + "balance_loss_clip": 0.06259193, + "balance_loss_mlp": 0.012508, + "epoch": 0.8640913873440553, + "flos": 57586998055680.0, + "grad_norm": 0.8486423176680128, + "language_loss": 0.57041156, + "learning_rate": 1.905747985193107e-07, + "loss": 0.64606631, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0098114, + "step": 14372, + "time_per_iteration": 3.0363752841949463 + }, + { + "auxiliary_loss_clip": 0.06399583, + "auxiliary_loss_mlp": 0.01263288, + "balance_loss_clip": 0.06271808, + "balance_loss_mlp": 0.01253811, + "epoch": 0.8641515105967232, + "flos": 23994165519360.0, + "grad_norm": 1.5906200485227884, + "language_loss": 0.79251468, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.86914343, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0947876, + "step": 14373, + "time_per_iteration": 2.5637240409851074 + }, + { + "auxiliary_loss_clip": 0.06403147, + "auxiliary_loss_mlp": 0.01263805, + "balance_loss_clip": 0.06269763, + "balance_loss_mlp": 0.01254328, + "epoch": 0.8642116338493913, + "flos": 19068810670080.0, + "grad_norm": 1.7439997489953305, + "language_loss": 0.63977039, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.7164399, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.0947876, + "step": 14374, + "time_per_iteration": 2.5307912826538086 + }, + { + "auxiliary_loss_clip": 0.06398176, + "auxiliary_loss_mlp": 0.01263527, + "balance_loss_clip": 0.06268865, + "balance_loss_mlp": 0.01254652, + "epoch": 0.8642717571020592, + "flos": 18259085139840.0, + "grad_norm": 2.2572077948028433, + "language_loss": 0.77652001, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.85313702, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08880615, + "step": 14375, + "time_per_iteration": 2.483269453048706 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01263893, + "balance_loss_clip": 0.06269706, + "balance_loss_mlp": 0.01253892, + "epoch": 0.8643318803547272, + "flos": 57675550222080.0, + "grad_norm": 1.541843891786326, + "language_loss": 0.61128557, + "learning_rate": 1.899116698488117e-07, + "loss": 0.68794239, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10003662, + "step": 14376, + "time_per_iteration": 2.8843209743499756 + }, + { + "auxiliary_loss_clip": 0.06403586, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06272595, + "balance_loss_mlp": 0.01254876, + "epoch": 0.8643920036073952, + "flos": 19615592488320.0, + "grad_norm": 1.5018425014580143, + "language_loss": 0.66786122, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.74454701, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10107422, + "step": 14377, + "time_per_iteration": 2.5229828357696533 + }, + { + "auxiliary_loss_clip": 0.06406192, + "auxiliary_loss_mlp": 0.01265361, + "balance_loss_clip": 0.06271313, + "balance_loss_mlp": 0.01255604, + "epoch": 0.8644521268600631, + "flos": 20856672437760.0, + "grad_norm": 1.4771903457051754, + "language_loss": 0.70475584, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.78147137, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09747314, + "step": 14378, + "time_per_iteration": 2.509063720703125 + }, + { + "auxiliary_loss_clip": 0.06313916, + "auxiliary_loss_mlp": 0.01254059, + "balance_loss_clip": 0.06259046, + "balance_loss_mlp": 0.01252975, + "epoch": 0.8645122501127311, + "flos": 66740753491200.0, + "grad_norm": 0.7838907158972782, + "language_loss": 0.60319781, + "learning_rate": 1.894150440305995e-07, + "loss": 0.67887759, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01085663, + "step": 14379, + "time_per_iteration": 3.1320457458496094 + }, + { + "auxiliary_loss_clip": 0.06399889, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01254605, + "epoch": 0.864572373365399, + "flos": 21696558238080.0, + "grad_norm": 1.5435489146258106, + "language_loss": 0.74544406, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.82208085, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09179688, + "step": 14380, + "time_per_iteration": 2.533979654312134 + }, + { + "auxiliary_loss_clip": 0.0641063, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256018, + "epoch": 0.8646324966180671, + "flos": 20272602752640.0, + "grad_norm": 1.8170227609010927, + "language_loss": 0.75806165, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.83483034, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10235596, + "step": 14381, + "time_per_iteration": 2.497065305709839 + }, + { + "auxiliary_loss_clip": 0.06403077, + "auxiliary_loss_mlp": 0.01262559, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253457, + "epoch": 0.864692619870735, + "flos": 11950752251520.0, + "grad_norm": 2.2051437875425757, + "language_loss": 0.84932131, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.92597765, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09106445, + "step": 14382, + "time_per_iteration": 2.535344362258911 + }, + { + "auxiliary_loss_clip": 0.06403528, + "auxiliary_loss_mlp": 0.0126157, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01252058, + "epoch": 0.864752743123403, + "flos": 21477149521920.0, + "grad_norm": 1.6567318612766335, + "language_loss": 0.75987065, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.83652163, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09509277, + "step": 14383, + "time_per_iteration": 2.5716378688812256 + }, + { + "auxiliary_loss_clip": 0.06400105, + "auxiliary_loss_mlp": 0.01265637, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256738, + "epoch": 0.8648128663760709, + "flos": 19534979260800.0, + "grad_norm": 1.6589847314556463, + "language_loss": 0.84984505, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.92650247, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08892822, + "step": 14384, + "time_per_iteration": 2.5308241844177246 + }, + { + "auxiliary_loss_clip": 0.06400002, + "auxiliary_loss_mlp": 0.01265, + "balance_loss_clip": 0.06269626, + "balance_loss_mlp": 0.01255344, + "epoch": 0.8648729896287389, + "flos": 21294315912960.0, + "grad_norm": 1.7401611102824495, + "language_loss": 0.81164479, + "learning_rate": 1.884236463176072e-07, + "loss": 0.88829482, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09655762, + "step": 14385, + "time_per_iteration": 2.4921391010284424 + }, + { + "auxiliary_loss_clip": 0.06406556, + "auxiliary_loss_mlp": 0.01267811, + "balance_loss_clip": 0.06271443, + "balance_loss_mlp": 0.01257428, + "epoch": 0.8649331128814068, + "flos": 24610785315840.0, + "grad_norm": 2.091649700881737, + "language_loss": 0.72774786, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.80449152, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1038208, + "step": 14386, + "time_per_iteration": 2.545750379562378 + }, + { + "auxiliary_loss_clip": 0.06401771, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01253138, + "epoch": 0.8649932361340749, + "flos": 15383277959040.0, + "grad_norm": 2.209665569654056, + "language_loss": 0.82382894, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.90047371, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09570312, + "step": 14387, + "time_per_iteration": 3.9013686180114746 + }, + { + "auxiliary_loss_clip": 0.0640083, + "auxiliary_loss_mlp": 0.01263962, + "balance_loss_clip": 0.06272831, + "balance_loss_mlp": 0.01255004, + "epoch": 0.8650533593867428, + "flos": 19907312878080.0, + "grad_norm": 2.010329116526224, + "language_loss": 0.68742537, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.76407325, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08966064, + "step": 14388, + "time_per_iteration": 2.504244565963745 + }, + { + "auxiliary_loss_clip": 0.06398115, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06271598, + "balance_loss_mlp": 0.01255323, + "epoch": 0.8651134826394108, + "flos": 25633546652160.0, + "grad_norm": 1.6117795719153174, + "language_loss": 0.90809613, + "learning_rate": 1.877640883285283e-07, + "loss": 0.98471928, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08874512, + "step": 14389, + "time_per_iteration": 2.5962395668029785 + }, + { + "auxiliary_loss_clip": 0.0639938, + "auxiliary_loss_mlp": 0.01263329, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01253947, + "epoch": 0.8651736058920788, + "flos": 18740557100160.0, + "grad_norm": 1.8613703066049654, + "language_loss": 0.71011788, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.78674495, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09375, + "step": 14390, + "time_per_iteration": 2.495643138885498 + }, + { + "auxiliary_loss_clip": 0.06405844, + "auxiliary_loss_mlp": 0.01268761, + "balance_loss_clip": 0.06271544, + "balance_loss_mlp": 0.01259392, + "epoch": 0.8652337291447467, + "flos": 20782977171840.0, + "grad_norm": 1.5876271483812678, + "language_loss": 0.8251009, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.901847, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09387207, + "step": 14391, + "time_per_iteration": 2.5008716583251953 + }, + { + "auxiliary_loss_clip": 0.06307146, + "auxiliary_loss_mlp": 0.01252466, + "balance_loss_clip": 0.06252509, + "balance_loss_mlp": 0.01251483, + "epoch": 0.8652938523974147, + "flos": 64246895948160.0, + "grad_norm": 0.8370368549242478, + "language_loss": 0.67857373, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.75416982, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00981903, + "step": 14392, + "time_per_iteration": 3.0305261611938477 + }, + { + "auxiliary_loss_clip": 0.06409582, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01253572, + "epoch": 0.8653539756500827, + "flos": 18046384750080.0, + "grad_norm": 1.905098269493672, + "language_loss": 0.75714177, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.83387649, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10314941, + "step": 14393, + "time_per_iteration": 2.505152940750122 + }, + { + "auxiliary_loss_clip": 0.06406023, + "auxiliary_loss_mlp": 0.0126336, + "balance_loss_clip": 0.06270998, + "balance_loss_mlp": 0.01253173, + "epoch": 0.8654140989027507, + "flos": 17387865112320.0, + "grad_norm": 1.8277719663675551, + "language_loss": 0.74051988, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.81721365, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10192871, + "step": 14394, + "time_per_iteration": 2.482966661453247 + }, + { + "auxiliary_loss_clip": 0.0640289, + "auxiliary_loss_mlp": 0.01264117, + "balance_loss_clip": 0.06269561, + "balance_loss_mlp": 0.01254634, + "epoch": 0.8654742221554186, + "flos": 53296390212480.0, + "grad_norm": 1.8585676526788644, + "language_loss": 0.65939879, + "learning_rate": 1.867768130747036e-07, + "loss": 0.7360689, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09484863, + "step": 14395, + "time_per_iteration": 2.800736904144287 + }, + { + "auxiliary_loss_clip": 0.06404395, + "auxiliary_loss_mlp": 0.01264073, + "balance_loss_clip": 0.06273991, + "balance_loss_mlp": 0.01254239, + "epoch": 0.8655343454080866, + "flos": 23921476502400.0, + "grad_norm": 1.4835131789742315, + "language_loss": 0.68352878, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.76021349, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09832764, + "step": 14396, + "time_per_iteration": 2.5393667221069336 + }, + { + "auxiliary_loss_clip": 0.06409425, + "auxiliary_loss_mlp": 0.01266633, + "balance_loss_clip": 0.06273856, + "balance_loss_mlp": 0.01255994, + "epoch": 0.8655944686607545, + "flos": 24104016622080.0, + "grad_norm": 2.042547019801872, + "language_loss": 0.69834018, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.77510077, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10638428, + "step": 14397, + "time_per_iteration": 2.539818286895752 + }, + { + "auxiliary_loss_clip": 0.06403225, + "auxiliary_loss_mlp": 0.01263446, + "balance_loss_clip": 0.0627091, + "balance_loss_mlp": 0.01253373, + "epoch": 0.8656545919134225, + "flos": 23119465547520.0, + "grad_norm": 1.8495016232222878, + "language_loss": 0.63756424, + "learning_rate": 1.86284103591253e-07, + "loss": 0.71423095, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10070801, + "step": 14398, + "time_per_iteration": 2.538398265838623 + }, + { + "auxiliary_loss_clip": 0.06404422, + "auxiliary_loss_mlp": 0.01268592, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01259454, + "epoch": 0.8657147151660904, + "flos": 21148057411200.0, + "grad_norm": 2.1437443287779594, + "language_loss": 0.76056588, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.83729601, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09136963, + "step": 14399, + "time_per_iteration": 3.908792734146118 + }, + { + "auxiliary_loss_clip": 0.06397003, + "auxiliary_loss_mlp": 0.01262133, + "balance_loss_clip": 0.06268921, + "balance_loss_mlp": 0.01253961, + "epoch": 0.8657748384187585, + "flos": 16294972308480.0, + "grad_norm": 1.9617345996315059, + "language_loss": 0.93996477, + "learning_rate": 1.8595597447334855e-07, + "loss": 1.01655602, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.081604, + "step": 14400, + "time_per_iteration": 2.587644338607788 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06271766, + "balance_loss_mlp": 0.01254314, + "epoch": 0.8658349616714264, + "flos": 30851292723840.0, + "grad_norm": 1.6768484881367147, + "language_loss": 0.67610824, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.75278574, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09094238, + "step": 14401, + "time_per_iteration": 2.5835890769958496 + }, + { + "auxiliary_loss_clip": 0.06406137, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01256526, + "epoch": 0.8658950849240944, + "flos": 18958833786240.0, + "grad_norm": 2.2258596653957508, + "language_loss": 0.7464267, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.8231473, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09387207, + "step": 14402, + "time_per_iteration": 2.527329206466675 + }, + { + "auxiliary_loss_clip": 0.06397735, + "auxiliary_loss_mlp": 0.01263573, + "balance_loss_clip": 0.06269282, + "balance_loss_mlp": 0.01254907, + "epoch": 0.8659552081767624, + "flos": 23370501980160.0, + "grad_norm": 1.6213852785308416, + "language_loss": 0.752159, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.82877213, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08660889, + "step": 14403, + "time_per_iteration": 2.531348705291748 + }, + { + "auxiliary_loss_clip": 0.06404096, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.01255234, + "epoch": 0.8660153314294303, + "flos": 23848787485440.0, + "grad_norm": 1.6613689377775722, + "language_loss": 0.73390162, + "learning_rate": 1.853005417520368e-07, + "loss": 0.81059217, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09716797, + "step": 14404, + "time_per_iteration": 4.003480911254883 + }, + { + "auxiliary_loss_clip": 0.0639967, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01255801, + "epoch": 0.8660754546820983, + "flos": 23119172058240.0, + "grad_norm": 1.6322756861517351, + "language_loss": 0.71098399, + "learning_rate": 1.851368555901447e-07, + "loss": 0.78764105, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.10241699, + "step": 14405, + "time_per_iteration": 4.005644798278809 + }, + { + "auxiliary_loss_clip": 0.06404774, + "auxiliary_loss_mlp": 0.01262757, + "balance_loss_clip": 0.06269382, + "balance_loss_mlp": 0.01252666, + "epoch": 0.8661355779347663, + "flos": 14397175584000.0, + "grad_norm": 1.6421655620173083, + "language_loss": 0.66277993, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.73945522, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10089111, + "step": 14406, + "time_per_iteration": 2.50840425491333 + }, + { + "auxiliary_loss_clip": 0.06401468, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06270108, + "balance_loss_mlp": 0.01260015, + "epoch": 0.8661957011874343, + "flos": 21876331173120.0, + "grad_norm": 1.8022015910030553, + "language_loss": 0.83140111, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.90810353, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08752441, + "step": 14407, + "time_per_iteration": 2.523522138595581 + }, + { + "auxiliary_loss_clip": 0.06403568, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06273694, + "balance_loss_mlp": 0.01256366, + "epoch": 0.8662558244401022, + "flos": 21841600855680.0, + "grad_norm": 1.6449965568009912, + "language_loss": 0.70152688, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.77822208, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09588623, + "step": 14408, + "time_per_iteration": 2.5317270755767822 + }, + { + "auxiliary_loss_clip": 0.06397519, + "auxiliary_loss_mlp": 0.01264222, + "balance_loss_clip": 0.06270346, + "balance_loss_mlp": 0.01254959, + "epoch": 0.8663159476927702, + "flos": 17389835683200.0, + "grad_norm": 1.7633081999688927, + "language_loss": 0.77345204, + "learning_rate": 1.844827992025304e-07, + "loss": 0.85006946, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09265137, + "step": 14409, + "time_per_iteration": 2.526059865951538 + }, + { + "auxiliary_loss_clip": 0.06406955, + "auxiliary_loss_mlp": 0.01265018, + "balance_loss_clip": 0.06271859, + "balance_loss_mlp": 0.01254951, + "epoch": 0.8663760709454381, + "flos": 22754385308160.0, + "grad_norm": 1.8171416455536564, + "language_loss": 0.76934552, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.84606528, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10064697, + "step": 14410, + "time_per_iteration": 2.5280380249023438 + }, + { + "auxiliary_loss_clip": 0.06405914, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06273735, + "balance_loss_mlp": 0.01256991, + "epoch": 0.8664361941981061, + "flos": 17381366421120.0, + "grad_norm": 1.9715328032802535, + "language_loss": 0.78025001, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.85697675, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09765625, + "step": 14411, + "time_per_iteration": 2.515397071838379 + }, + { + "auxiliary_loss_clip": 0.06397986, + "auxiliary_loss_mlp": 0.01263072, + "balance_loss_clip": 0.06267551, + "balance_loss_mlp": 0.01253774, + "epoch": 0.866496317450774, + "flos": 16039994734080.0, + "grad_norm": 1.7277643330108303, + "language_loss": 0.73680794, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.81341851, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09295654, + "step": 14412, + "time_per_iteration": 2.4690604209899902 + }, + { + "auxiliary_loss_clip": 0.06400064, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.0125868, + "epoch": 0.8665564407034421, + "flos": 20821313214720.0, + "grad_norm": 1.8053932243271904, + "language_loss": 0.69647372, + "learning_rate": 1.83829844328371e-07, + "loss": 0.77314717, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08605957, + "step": 14413, + "time_per_iteration": 2.514761209487915 + }, + { + "auxiliary_loss_clip": 0.06403694, + "auxiliary_loss_mlp": 0.01265064, + "balance_loss_clip": 0.06270799, + "balance_loss_mlp": 0.01255254, + "epoch": 0.86661656395611, + "flos": 15820627944960.0, + "grad_norm": 2.2244360215137684, + "language_loss": 0.63284969, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.70953727, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0980835, + "step": 14414, + "time_per_iteration": 2.475782871246338 + }, + { + "auxiliary_loss_clip": 0.06403673, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06273353, + "balance_loss_mlp": 0.01254652, + "epoch": 0.866676687208778, + "flos": 23043170805120.0, + "grad_norm": 1.623963807084388, + "language_loss": 0.6375469, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.71422398, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09387207, + "step": 14415, + "time_per_iteration": 2.5194180011749268 + }, + { + "auxiliary_loss_clip": 0.06311454, + "auxiliary_loss_mlp": 0.01254301, + "balance_loss_clip": 0.06256884, + "balance_loss_mlp": 0.01253252, + "epoch": 0.866736810461446, + "flos": 63823256104320.0, + "grad_norm": 0.7752064714418949, + "language_loss": 0.60367054, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.67932814, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01049805, + "step": 14416, + "time_per_iteration": 3.1916332244873047 + }, + { + "auxiliary_loss_clip": 0.06406388, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06270231, + "balance_loss_mlp": 0.01253596, + "epoch": 0.8667969337141139, + "flos": 20455687923840.0, + "grad_norm": 1.758371928696436, + "language_loss": 0.75081879, + "learning_rate": 1.831779913638285e-07, + "loss": 0.82752162, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10296631, + "step": 14417, + "time_per_iteration": 2.519272565841675 + }, + { + "auxiliary_loss_clip": 0.06401929, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06270267, + "balance_loss_mlp": 0.01255493, + "epoch": 0.866857056966782, + "flos": 21660276620160.0, + "grad_norm": 1.4417823685180284, + "language_loss": 0.75500447, + "learning_rate": 1.830152003424319e-07, + "loss": 0.83167231, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09350586, + "step": 14418, + "time_per_iteration": 2.5243372917175293 + }, + { + "auxiliary_loss_clip": 0.06397848, + "auxiliary_loss_mlp": 0.0126541, + "balance_loss_clip": 0.06267963, + "balance_loss_mlp": 0.01255963, + "epoch": 0.8669171802194499, + "flos": 22858785895680.0, + "grad_norm": 1.4538626454884047, + "language_loss": 0.68544036, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.76207292, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09448242, + "step": 14419, + "time_per_iteration": 2.598567247390747 + }, + { + "auxiliary_loss_clip": 0.06402744, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06270118, + "balance_loss_mlp": 0.01254794, + "epoch": 0.8669773034721179, + "flos": 18740137829760.0, + "grad_norm": 1.6269776672151877, + "language_loss": 0.78971106, + "learning_rate": 1.826898250065465e-07, + "loss": 0.86637974, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09326172, + "step": 14420, + "time_per_iteration": 2.5527749061584473 + }, + { + "auxiliary_loss_clip": 0.06402794, + "auxiliary_loss_mlp": 0.01264773, + "balance_loss_clip": 0.0627051, + "balance_loss_mlp": 0.01255153, + "epoch": 0.8670374267247858, + "flos": 18921923262720.0, + "grad_norm": 1.8416843684547823, + "language_loss": 0.83623648, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.91291213, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09625244, + "step": 14421, + "time_per_iteration": 2.53287935256958 + }, + { + "auxiliary_loss_clip": 0.06307293, + "auxiliary_loss_mlp": 0.01252132, + "balance_loss_clip": 0.06252414, + "balance_loss_mlp": 0.01251069, + "epoch": 0.8670975499774538, + "flos": 48834323458560.0, + "grad_norm": 0.6970048505263723, + "language_loss": 0.4877342, + "learning_rate": 1.823647253209941e-07, + "loss": 0.56332839, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.01064301, + "step": 14422, + "time_per_iteration": 3.2060294151306152 + }, + { + "auxiliary_loss_clip": 0.06402378, + "auxiliary_loss_mlp": 0.0126638, + "balance_loss_clip": 0.06270766, + "balance_loss_mlp": 0.01257374, + "epoch": 0.8671576732301217, + "flos": 26142579406080.0, + "grad_norm": 1.482284163911286, + "language_loss": 0.73646462, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.81315225, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09008789, + "step": 14423, + "time_per_iteration": 2.5513858795166016 + }, + { + "auxiliary_loss_clip": 0.06397344, + "auxiliary_loss_mlp": 0.01261454, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.0125339, + "epoch": 0.8672177964827897, + "flos": 18373045092480.0, + "grad_norm": 1.5159393869667968, + "language_loss": 0.7694416, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.84602958, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08056641, + "step": 14424, + "time_per_iteration": 2.5115561485290527 + }, + { + "auxiliary_loss_clip": 0.06394623, + "auxiliary_loss_mlp": 0.01261736, + "balance_loss_clip": 0.06269346, + "balance_loss_mlp": 0.01253331, + "epoch": 0.8672779197354576, + "flos": 28552385704320.0, + "grad_norm": 1.5295537919973716, + "language_loss": 0.71849072, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.79505438, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08410645, + "step": 14425, + "time_per_iteration": 2.5568857192993164 + }, + { + "auxiliary_loss_clip": 0.06405246, + "auxiliary_loss_mlp": 0.01264965, + "balance_loss_clip": 0.06270114, + "balance_loss_mlp": 0.01255297, + "epoch": 0.8673380429881257, + "flos": 22389011579520.0, + "grad_norm": 1.4758185818369447, + "language_loss": 0.6852206, + "learning_rate": 1.817153530980926e-07, + "loss": 0.76192278, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09667969, + "step": 14426, + "time_per_iteration": 2.5231831073760986 + }, + { + "auxiliary_loss_clip": 0.06402829, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.06270183, + "balance_loss_mlp": 0.01253419, + "epoch": 0.8673981662407936, + "flos": 21002805158400.0, + "grad_norm": 1.7832105670695808, + "language_loss": 0.70722842, + "learning_rate": 1.815531824008234e-07, + "loss": 0.78389543, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10455322, + "step": 14427, + "time_per_iteration": 4.035536289215088 + }, + { + "auxiliary_loss_clip": 0.0640244, + "auxiliary_loss_mlp": 0.0126232, + "balance_loss_clip": 0.06271227, + "balance_loss_mlp": 0.01252676, + "epoch": 0.8674582894934616, + "flos": 24433863419520.0, + "grad_norm": 1.5804797427940684, + "language_loss": 0.6822958, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.75894332, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09655762, + "step": 14428, + "time_per_iteration": 2.553795099258423 + }, + { + "auxiliary_loss_clip": 0.06399473, + "auxiliary_loss_mlp": 0.01266114, + "balance_loss_clip": 0.0626923, + "balance_loss_mlp": 0.01257334, + "epoch": 0.8675184127461296, + "flos": 20743257536640.0, + "grad_norm": 1.8013326629765731, + "language_loss": 0.71193767, + "learning_rate": 1.812290478794889e-07, + "loss": 0.78859359, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08776855, + "step": 14429, + "time_per_iteration": 2.493234157562256 + }, + { + "auxiliary_loss_clip": 0.06401441, + "auxiliary_loss_mlp": 0.01264101, + "balance_loss_clip": 0.0627252, + "balance_loss_mlp": 0.01254898, + "epoch": 0.8675785359987975, + "flos": 19141709322240.0, + "grad_norm": 1.8609763049402845, + "language_loss": 0.66596407, + "learning_rate": 1.810670840677151e-07, + "loss": 0.74261945, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09204102, + "step": 14430, + "time_per_iteration": 2.4854321479797363 + }, + { + "auxiliary_loss_clip": 0.06403784, + "auxiliary_loss_mlp": 0.012671, + "balance_loss_clip": 0.06269564, + "balance_loss_mlp": 0.01256902, + "epoch": 0.8676386592514655, + "flos": 22717223222400.0, + "grad_norm": 2.523579211603687, + "language_loss": 0.69258201, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.76929086, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10198975, + "step": 14431, + "time_per_iteration": 2.5594279766082764 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.0126775, + "balance_loss_clip": 0.06272927, + "balance_loss_mlp": 0.01258016, + "epoch": 0.8676987825041335, + "flos": 14215054734720.0, + "grad_norm": 2.3061623073742545, + "language_loss": 0.6399014, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.71663648, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09735107, + "step": 14432, + "time_per_iteration": 2.499904155731201 + }, + { + "auxiliary_loss_clip": 0.06403828, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06272545, + "balance_loss_mlp": 0.01256111, + "epoch": 0.8677589057568015, + "flos": 13595080775040.0, + "grad_norm": 1.7789604432407644, + "language_loss": 0.78301966, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.85971117, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09222412, + "step": 14433, + "time_per_iteration": 2.4722964763641357 + }, + { + "auxiliary_loss_clip": 0.06308552, + "auxiliary_loss_mlp": 0.01250803, + "balance_loss_clip": 0.06253849, + "balance_loss_mlp": 0.01249807, + "epoch": 0.8678190290094694, + "flos": 68953303278720.0, + "grad_norm": 0.6938824705198252, + "language_loss": 0.58372235, + "learning_rate": 1.804199186231805e-07, + "loss": 0.65931588, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00994873, + "step": 14434, + "time_per_iteration": 3.22125506401062 + }, + { + "auxiliary_loss_clip": 0.06397156, + "auxiliary_loss_mlp": 0.01264803, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01256273, + "epoch": 0.8678791522621374, + "flos": 32565249590400.0, + "grad_norm": 1.644245978236505, + "language_loss": 0.80153918, + "learning_rate": 1.802582997433628e-07, + "loss": 0.87815875, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08526611, + "step": 14435, + "time_per_iteration": 2.623704195022583 + }, + { + "auxiliary_loss_clip": 0.06403121, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06269317, + "balance_loss_mlp": 0.01254756, + "epoch": 0.8679392755148053, + "flos": 35051224849920.0, + "grad_norm": 1.897215126056039, + "language_loss": 0.62167633, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.69836056, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10546875, + "step": 14436, + "time_per_iteration": 2.632450819015503 + }, + { + "auxiliary_loss_clip": 0.06402992, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.0125278, + "epoch": 0.8679993987674733, + "flos": 18558562032000.0, + "grad_norm": 1.9896848572147598, + "language_loss": 0.70140958, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.77806765, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10040283, + "step": 14437, + "time_per_iteration": 2.541003704071045 + }, + { + "auxiliary_loss_clip": 0.06404081, + "auxiliary_loss_mlp": 0.0126507, + "balance_loss_clip": 0.06273189, + "balance_loss_mlp": 0.01255152, + "epoch": 0.8680595220201412, + "flos": 27461840814720.0, + "grad_norm": 1.956729698736987, + "language_loss": 0.8056224, + "learning_rate": 1.797738571571381e-07, + "loss": 0.88231391, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09924316, + "step": 14438, + "time_per_iteration": 4.026323556900024 + }, + { + "auxiliary_loss_clip": 0.06396785, + "auxiliary_loss_mlp": 0.012629, + "balance_loss_clip": 0.0627017, + "balance_loss_mlp": 0.01254066, + "epoch": 0.8681196452728093, + "flos": 19214901463680.0, + "grad_norm": 1.7667026459424926, + "language_loss": 0.67657971, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.75317651, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08837891, + "step": 14439, + "time_per_iteration": 2.510300874710083 + }, + { + "auxiliary_loss_clip": 0.06404371, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01255007, + "epoch": 0.8681797685254772, + "flos": 37569498658560.0, + "grad_norm": 2.0023877249640094, + "language_loss": 0.64283299, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.7195161, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0894165, + "step": 14440, + "time_per_iteration": 2.66216778755188 + }, + { + "auxiliary_loss_clip": 0.06398277, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271653, + "balance_loss_mlp": 0.0125609, + "epoch": 0.8682398917781452, + "flos": 23295842392320.0, + "grad_norm": 1.5322174401444875, + "language_loss": 0.65759438, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.73423183, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09381104, + "step": 14441, + "time_per_iteration": 2.504725456237793 + }, + { + "auxiliary_loss_clip": 0.06396982, + "auxiliary_loss_mlp": 0.01262947, + "balance_loss_clip": 0.06271137, + "balance_loss_mlp": 0.01254125, + "epoch": 0.8683000150308132, + "flos": 21879433774080.0, + "grad_norm": 1.5819575820693645, + "language_loss": 0.66384351, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.74044275, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08813477, + "step": 14442, + "time_per_iteration": 2.515378713607788 + }, + { + "auxiliary_loss_clip": 0.06408555, + "auxiliary_loss_mlp": 0.0126728, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01256921, + "epoch": 0.8683601382834811, + "flos": 14652404720640.0, + "grad_norm": 1.734423376729254, + "language_loss": 0.72608215, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.80284047, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10345459, + "step": 14443, + "time_per_iteration": 4.0072619915008545 + }, + { + "auxiliary_loss_clip": 0.06403544, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 0.06272006, + "balance_loss_mlp": 0.01256617, + "epoch": 0.8684202615361492, + "flos": 26367187075200.0, + "grad_norm": 1.686322881132401, + "language_loss": 0.83196855, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.90866739, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.097229, + "step": 14444, + "time_per_iteration": 4.008268594741821 + }, + { + "auxiliary_loss_clip": 0.06403743, + "auxiliary_loss_mlp": 0.01261873, + "balance_loss_clip": 0.06272523, + "balance_loss_mlp": 0.01252628, + "epoch": 0.8684803847888171, + "flos": 20710246227840.0, + "grad_norm": 1.9141617998597704, + "language_loss": 0.76965976, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.84631586, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09246826, + "step": 14445, + "time_per_iteration": 2.5997262001037598 + }, + { + "auxiliary_loss_clip": 0.06402852, + "auxiliary_loss_mlp": 0.01262345, + "balance_loss_clip": 0.06273001, + "balance_loss_mlp": 0.01252194, + "epoch": 0.8685405080414851, + "flos": 22644743840640.0, + "grad_norm": 1.69315828341739, + "language_loss": 0.68069935, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.75735128, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10144043, + "step": 14446, + "time_per_iteration": 2.5424163341522217 + }, + { + "auxiliary_loss_clip": 0.06401268, + "auxiliary_loss_mlp": 0.01264762, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01255422, + "epoch": 0.868600631294153, + "flos": 24828181534080.0, + "grad_norm": 1.616488905601248, + "language_loss": 0.82849407, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.90515447, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09344482, + "step": 14447, + "time_per_iteration": 2.6071512699127197 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06269099, + "balance_loss_mlp": 0.01255937, + "epoch": 0.868660754546821, + "flos": 25120153486080.0, + "grad_norm": 1.624335416002347, + "language_loss": 0.74320281, + "learning_rate": 1.781635359686515e-07, + "loss": 0.81985313, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08959961, + "step": 14448, + "time_per_iteration": 2.547412633895874 + }, + { + "auxiliary_loss_clip": 0.06402777, + "auxiliary_loss_mlp": 0.01263991, + "balance_loss_clip": 0.06270443, + "balance_loss_mlp": 0.01254299, + "epoch": 0.8687208777994889, + "flos": 12682841374080.0, + "grad_norm": 1.8412426032708813, + "language_loss": 0.80489451, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.88156223, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09686279, + "step": 14449, + "time_per_iteration": 2.4914026260375977 + }, + { + "auxiliary_loss_clip": 0.0631351, + "auxiliary_loss_mlp": 0.01253647, + "balance_loss_clip": 0.06259002, + "balance_loss_mlp": 0.01252613, + "epoch": 0.8687810010521569, + "flos": 65636959656960.0, + "grad_norm": 0.7923178433705474, + "language_loss": 0.60340738, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.67907894, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01034546, + "step": 14450, + "time_per_iteration": 3.0573930740356445 + }, + { + "auxiliary_loss_clip": 0.06410334, + "auxiliary_loss_mlp": 0.01264555, + "balance_loss_clip": 0.06276858, + "balance_loss_mlp": 0.0125512, + "epoch": 0.8688411243048249, + "flos": 24250987883520.0, + "grad_norm": 1.5446429349016553, + "language_loss": 0.76378512, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.84053403, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09436035, + "step": 14451, + "time_per_iteration": 2.5443615913391113 + }, + { + "auxiliary_loss_clip": 0.06400914, + "auxiliary_loss_mlp": 0.01264515, + "balance_loss_clip": 0.0627023, + "balance_loss_mlp": 0.01254943, + "epoch": 0.8689012475574929, + "flos": 18227457423360.0, + "grad_norm": 2.4344123800734487, + "language_loss": 0.72107518, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.79772949, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09564209, + "step": 14452, + "time_per_iteration": 2.4614477157592773 + }, + { + "auxiliary_loss_clip": 0.06404183, + "auxiliary_loss_mlp": 0.0126295, + "balance_loss_clip": 0.06270374, + "balance_loss_mlp": 0.01253014, + "epoch": 0.8689613708101608, + "flos": 19652922282240.0, + "grad_norm": 1.772178254376601, + "language_loss": 0.72880638, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.80547774, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09942627, + "step": 14453, + "time_per_iteration": 2.508371591567993 + }, + { + "auxiliary_loss_clip": 0.06399187, + "auxiliary_loss_mlp": 0.01264806, + "balance_loss_clip": 0.06269897, + "balance_loss_mlp": 0.01255669, + "epoch": 0.8690214940628288, + "flos": 11733523741440.0, + "grad_norm": 1.9522335345310335, + "language_loss": 0.73650515, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.8131451, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 14454, + "time_per_iteration": 2.6009294986724854 + }, + { + "auxiliary_loss_clip": 0.06401433, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06271113, + "balance_loss_mlp": 0.01256516, + "epoch": 0.8690816173154968, + "flos": 34945566451200.0, + "grad_norm": 1.7631305246108158, + "language_loss": 0.60118473, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.67786264, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09844971, + "step": 14455, + "time_per_iteration": 2.753415107727051 + }, + { + "auxiliary_loss_clip": 0.06404486, + "auxiliary_loss_mlp": 0.01264704, + "balance_loss_clip": 0.06271438, + "balance_loss_mlp": 0.01255233, + "epoch": 0.8691417405681647, + "flos": 11618809102080.0, + "grad_norm": 2.01471686144797, + "language_loss": 0.80115831, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.87785017, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09472656, + "step": 14456, + "time_per_iteration": 2.5271530151367188 + }, + { + "auxiliary_loss_clip": 0.06409412, + "auxiliary_loss_mlp": 0.01267391, + "balance_loss_clip": 0.06274113, + "balance_loss_mlp": 0.01257247, + "epoch": 0.8692018638208328, + "flos": 24614936092800.0, + "grad_norm": 2.326789924300959, + "language_loss": 0.74536252, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.82213056, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10137939, + "step": 14457, + "time_per_iteration": 2.526219129562378 + }, + { + "auxiliary_loss_clip": 0.06399509, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06271378, + "balance_loss_mlp": 0.01255784, + "epoch": 0.8692619870735007, + "flos": 26002358398080.0, + "grad_norm": 1.4211804467950002, + "language_loss": 0.7873075, + "learning_rate": 1.765601232001328e-07, + "loss": 0.86394978, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.0894165, + "step": 14458, + "time_per_iteration": 2.5216262340545654 + }, + { + "auxiliary_loss_clip": 0.06402966, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 0.06273033, + "balance_loss_mlp": 0.0125663, + "epoch": 0.8693221103261687, + "flos": 18047810269440.0, + "grad_norm": 1.5087935238946328, + "language_loss": 0.71331191, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.79000497, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.097229, + "step": 14459, + "time_per_iteration": 2.4944982528686523 + }, + { + "auxiliary_loss_clip": 0.06394096, + "auxiliary_loss_mlp": 0.01263427, + "balance_loss_clip": 0.06268888, + "balance_loss_mlp": 0.0125485, + "epoch": 0.8693822335788366, + "flos": 27500051076480.0, + "grad_norm": 1.2788295067454918, + "language_loss": 0.74028695, + "learning_rate": 1.762402701923398e-07, + "loss": 0.81686223, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08575439, + "step": 14460, + "time_per_iteration": 2.56471848487854 + }, + { + "auxiliary_loss_clip": 0.06408993, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01257002, + "epoch": 0.8694423568315046, + "flos": 24104603600640.0, + "grad_norm": 2.393839276229543, + "language_loss": 0.65351462, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.73027325, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09857178, + "step": 14461, + "time_per_iteration": 2.5354537963867188 + }, + { + "auxiliary_loss_clip": 0.06403669, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.0627113, + "balance_loss_mlp": 0.0125793, + "epoch": 0.8695024800841725, + "flos": 18366839890560.0, + "grad_norm": 2.377735407196708, + "language_loss": 0.82366443, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.900379, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09844971, + "step": 14462, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.06403664, + "auxiliary_loss_mlp": 0.01265298, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.0125529, + "epoch": 0.8695626033368405, + "flos": 14032975812480.0, + "grad_norm": 1.782940361632394, + "language_loss": 0.65303802, + "learning_rate": 1.757610093744335e-07, + "loss": 0.72972763, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10009766, + "step": 14463, + "time_per_iteration": 2.519047975540161 + }, + { + "auxiliary_loss_clip": 0.06408842, + "auxiliary_loss_mlp": 0.01268237, + "balance_loss_clip": 0.06271829, + "balance_loss_mlp": 0.01257729, + "epoch": 0.8696227265895085, + "flos": 16842508813440.0, + "grad_norm": 1.8832383618141357, + "language_loss": 0.66826367, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.74503446, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10516357, + "step": 14464, + "time_per_iteration": 2.4889910221099854 + }, + { + "auxiliary_loss_clip": 0.06410474, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06273378, + "balance_loss_mlp": 0.01253166, + "epoch": 0.8696828498421765, + "flos": 21805570800000.0, + "grad_norm": 2.242038874190131, + "language_loss": 0.63238472, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.70912772, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10656738, + "step": 14465, + "time_per_iteration": 2.5462048053741455 + }, + { + "auxiliary_loss_clip": 0.06396791, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.012567, + "epoch": 0.8697429730948444, + "flos": 22901691985920.0, + "grad_norm": 1.4710912733423256, + "language_loss": 0.84975493, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.92637551, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08569336, + "step": 14466, + "time_per_iteration": 3.9527673721313477 + }, + { + "auxiliary_loss_clip": 0.06408149, + "auxiliary_loss_mlp": 0.01267066, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01256146, + "epoch": 0.8698030963475124, + "flos": 24724367925120.0, + "grad_norm": 2.1885311234894607, + "language_loss": 0.61972004, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.69647217, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10913086, + "step": 14467, + "time_per_iteration": 2.531226634979248 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.0126206, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01253441, + "epoch": 0.8698632196001803, + "flos": 28450291104000.0, + "grad_norm": 1.3163681767260083, + "language_loss": 0.69067562, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.76726645, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08624268, + "step": 14468, + "time_per_iteration": 2.617129325866699 + }, + { + "auxiliary_loss_clip": 0.06402217, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.0125501, + "epoch": 0.8699233428528483, + "flos": 27643877809920.0, + "grad_norm": 1.469874122619276, + "language_loss": 0.71179217, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.78845036, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08587646, + "step": 14469, + "time_per_iteration": 2.5837879180908203 + }, + { + "auxiliary_loss_clip": 0.06397484, + "auxiliary_loss_mlp": 0.01262825, + "balance_loss_clip": 0.06272286, + "balance_loss_mlp": 0.01254326, + "epoch": 0.8699834661055164, + "flos": 20051516954880.0, + "grad_norm": 2.250456431690659, + "language_loss": 0.84240717, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.91901028, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08502197, + "step": 14470, + "time_per_iteration": 2.555173635482788 + }, + { + "auxiliary_loss_clip": 0.06400733, + "auxiliary_loss_mlp": 0.01262712, + "balance_loss_clip": 0.06270544, + "balance_loss_mlp": 0.01253384, + "epoch": 0.8700435893581843, + "flos": 23739607215360.0, + "grad_norm": 1.6759251274970535, + "language_loss": 0.73015386, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.80678833, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09332275, + "step": 14471, + "time_per_iteration": 2.49556827545166 + }, + { + "auxiliary_loss_clip": 0.06401968, + "auxiliary_loss_mlp": 0.0126843, + "balance_loss_clip": 0.06272097, + "balance_loss_mlp": 0.01259001, + "epoch": 0.8701037126108523, + "flos": 23554886889600.0, + "grad_norm": 1.414338662469805, + "language_loss": 0.79126775, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.86797178, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09429932, + "step": 14472, + "time_per_iteration": 2.546039581298828 + }, + { + "auxiliary_loss_clip": 0.06400506, + "auxiliary_loss_mlp": 0.01261454, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01252371, + "epoch": 0.8701638358635202, + "flos": 18849401953920.0, + "grad_norm": 1.7511234862282108, + "language_loss": 0.72525012, + "learning_rate": 1.741679706279644e-07, + "loss": 0.80186975, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09075928, + "step": 14473, + "time_per_iteration": 2.4787282943725586 + }, + { + "auxiliary_loss_clip": 0.06408264, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_clip": 0.06274155, + "balance_loss_mlp": 0.01253232, + "epoch": 0.8702239591161882, + "flos": 27935807834880.0, + "grad_norm": 1.4568573772519522, + "language_loss": 0.72361302, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.80032313, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09521484, + "step": 14474, + "time_per_iteration": 2.580152750015259 + }, + { + "auxiliary_loss_clip": 0.0640256, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.012588, + "epoch": 0.8702840823688561, + "flos": 17239007134080.0, + "grad_norm": 2.0568894505970836, + "language_loss": 0.67749852, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.75421154, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0994873, + "step": 14475, + "time_per_iteration": 2.4745309352874756 + }, + { + "auxiliary_loss_clip": 0.06405099, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01257334, + "epoch": 0.8703442056215241, + "flos": 19433681274240.0, + "grad_norm": 1.4998627111504736, + "language_loss": 0.78266013, + "learning_rate": 1.736914088262349e-07, + "loss": 0.85938084, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09637451, + "step": 14476, + "time_per_iteration": 2.5792596340179443 + }, + { + "auxiliary_loss_clip": 0.06402189, + "auxiliary_loss_mlp": 0.01263388, + "balance_loss_clip": 0.06273142, + "balance_loss_mlp": 0.01254185, + "epoch": 0.8704043288741921, + "flos": 22280502142080.0, + "grad_norm": 1.4832205414105002, + "language_loss": 0.72368699, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.8003428, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09191895, + "step": 14477, + "time_per_iteration": 2.523857593536377 + }, + { + "auxiliary_loss_clip": 0.06404123, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01255906, + "epoch": 0.8704644521268601, + "flos": 16653386148480.0, + "grad_norm": 3.7210066512939064, + "language_loss": 0.59888941, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.67558169, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09191895, + "step": 14478, + "time_per_iteration": 3.9272985458374023 + }, + { + "auxiliary_loss_clip": 0.06400814, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06273096, + "balance_loss_mlp": 0.01257178, + "epoch": 0.870524575379528, + "flos": 24287143720320.0, + "grad_norm": 1.561156822868459, + "language_loss": 0.71748471, + "learning_rate": 1.732154703087323e-07, + "loss": 0.79415083, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08624268, + "step": 14479, + "time_per_iteration": 2.690037727355957 + }, + { + "auxiliary_loss_clip": 0.06402399, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06271303, + "balance_loss_mlp": 0.01257693, + "epoch": 0.870584698632196, + "flos": 28776490248960.0, + "grad_norm": 1.313083691844494, + "language_loss": 0.7115078, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.78821218, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10351562, + "step": 14480, + "time_per_iteration": 2.5635881423950195 + }, + { + "auxiliary_loss_clip": 0.06405388, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06272168, + "balance_loss_mlp": 0.01256039, + "epoch": 0.8706448218848639, + "flos": 32457369058560.0, + "grad_norm": 1.5315464053111656, + "language_loss": 0.69993174, + "learning_rate": 1.728985243129666e-07, + "loss": 0.77664775, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10174561, + "step": 14481, + "time_per_iteration": 2.6091196537017822 + }, + { + "auxiliary_loss_clip": 0.06400968, + "auxiliary_loss_mlp": 0.01264909, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01256403, + "epoch": 0.8707049451375319, + "flos": 22754720724480.0, + "grad_norm": 1.6803042036172529, + "language_loss": 0.77415997, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.85081875, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08496094, + "step": 14482, + "time_per_iteration": 2.505281448364258 + }, + { + "auxiliary_loss_clip": 0.06400886, + "auxiliary_loss_mlp": 0.01271627, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01262359, + "epoch": 0.8707650683902, + "flos": 15857496541440.0, + "grad_norm": 1.7059576346478473, + "language_loss": 0.76927876, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.84600389, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0927124, + "step": 14483, + "time_per_iteration": 3.9307732582092285 + }, + { + "auxiliary_loss_clip": 0.06408566, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06271568, + "balance_loss_mlp": 0.01257127, + "epoch": 0.8708251916428679, + "flos": 16473068161920.0, + "grad_norm": 1.8670315835414784, + "language_loss": 0.61994016, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.69670069, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10351562, + "step": 14484, + "time_per_iteration": 3.927198886871338 + }, + { + "auxiliary_loss_clip": 0.06401549, + "auxiliary_loss_mlp": 0.01264874, + "balance_loss_clip": 0.06271225, + "balance_loss_mlp": 0.01255402, + "epoch": 0.8708853148955359, + "flos": 15383319886080.0, + "grad_norm": 1.6982742251902394, + "language_loss": 0.68225974, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.75892395, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09472656, + "step": 14485, + "time_per_iteration": 2.4758594036102295 + }, + { + "auxiliary_loss_clip": 0.06400119, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01259224, + "epoch": 0.8709454381482038, + "flos": 30558566085120.0, + "grad_norm": 1.707582248918332, + "language_loss": 0.63406742, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.71076155, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10076904, + "step": 14486, + "time_per_iteration": 2.600389003753662 + }, + { + "auxiliary_loss_clip": 0.06409895, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.06275006, + "balance_loss_mlp": 0.01259825, + "epoch": 0.8710055614008718, + "flos": 22608001025280.0, + "grad_norm": 1.9272108546392486, + "language_loss": 0.61984718, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.69664824, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1036377, + "step": 14487, + "time_per_iteration": 2.5196049213409424 + }, + { + "auxiliary_loss_clip": 0.06400737, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.0627054, + "balance_loss_mlp": 0.01258652, + "epoch": 0.8710656846535397, + "flos": 18449214053760.0, + "grad_norm": 1.8411007600329907, + "language_loss": 0.68481451, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.76149088, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08251953, + "step": 14488, + "time_per_iteration": 2.506927967071533 + }, + { + "auxiliary_loss_clip": 0.06402954, + "auxiliary_loss_mlp": 0.01268264, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01258317, + "epoch": 0.8711258079062077, + "flos": 16508678947200.0, + "grad_norm": 1.8335601369523609, + "language_loss": 0.85487485, + "learning_rate": 1.716335121648338e-07, + "loss": 0.93158698, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09942627, + "step": 14489, + "time_per_iteration": 2.484161376953125 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.06272433, + "balance_loss_mlp": 0.01255119, + "epoch": 0.8711859311588757, + "flos": 15667786897920.0, + "grad_norm": 6.139143930949815, + "language_loss": 0.76203996, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.83880675, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11004639, + "step": 14490, + "time_per_iteration": 2.5018839836120605 + }, + { + "auxiliary_loss_clip": 0.06407736, + "auxiliary_loss_mlp": 0.01268396, + "balance_loss_clip": 0.06273264, + "balance_loss_mlp": 0.01257363, + "epoch": 0.8712460544115437, + "flos": 15562589696640.0, + "grad_norm": 1.9796792508389878, + "language_loss": 0.76653862, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.84329993, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.11022949, + "step": 14491, + "time_per_iteration": 2.4751522541046143 + }, + { + "auxiliary_loss_clip": 0.0640479, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06274243, + "balance_loss_mlp": 0.01256922, + "epoch": 0.8713061776642116, + "flos": 16769148963840.0, + "grad_norm": 1.5868092330088945, + "language_loss": 0.6700983, + "learning_rate": 1.711602764198723e-07, + "loss": 0.74680555, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09020996, + "step": 14492, + "time_per_iteration": 2.5103981494903564 + }, + { + "auxiliary_loss_clip": 0.06399809, + "auxiliary_loss_mlp": 0.01261278, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01252665, + "epoch": 0.8713663009168796, + "flos": 24286766376960.0, + "grad_norm": 1.7963898814832777, + "language_loss": 0.69969654, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.77630746, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08618164, + "step": 14493, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.06402645, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06271104, + "balance_loss_mlp": 0.01258706, + "epoch": 0.8714264241695475, + "flos": 23800724369280.0, + "grad_norm": 2.714150442096016, + "language_loss": 0.89298224, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.96969593, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10015869, + "step": 14494, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.06397564, + "auxiliary_loss_mlp": 0.01262665, + "balance_loss_clip": 0.06270292, + "balance_loss_mlp": 0.01253927, + "epoch": 0.8714865474222155, + "flos": 38007016352640.0, + "grad_norm": 1.585930512402851, + "language_loss": 0.59490967, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.67151189, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08737183, + "step": 14495, + "time_per_iteration": 2.6341331005096436 + }, + { + "auxiliary_loss_clip": 0.0640444, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06272034, + "balance_loss_mlp": 0.01255495, + "epoch": 0.8715466706748836, + "flos": 22462287575040.0, + "grad_norm": 1.899333408458114, + "language_loss": 0.8036266, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.88032138, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09539795, + "step": 14496, + "time_per_iteration": 2.512383460998535 + }, + { + "auxiliary_loss_clip": 0.06404877, + "auxiliary_loss_mlp": 0.01264441, + "balance_loss_clip": 0.06272918, + "balance_loss_mlp": 0.01254278, + "epoch": 0.8716067939275515, + "flos": 21221501114880.0, + "grad_norm": 1.979531289163737, + "language_loss": 0.79082352, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.86751664, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10168457, + "step": 14497, + "time_per_iteration": 2.5025105476379395 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01265291, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01255128, + "epoch": 0.8716669171802195, + "flos": 23003535024000.0, + "grad_norm": 2.3896985728798827, + "language_loss": 0.67118752, + "learning_rate": 1.70215677535406e-07, + "loss": 0.74789858, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10168457, + "step": 14498, + "time_per_iteration": 2.5077733993530273 + }, + { + "auxiliary_loss_clip": 0.06402379, + "auxiliary_loss_mlp": 0.012634, + "balance_loss_clip": 0.06270681, + "balance_loss_mlp": 0.01254066, + "epoch": 0.8717270404328874, + "flos": 29790991958400.0, + "grad_norm": 2.011348568561811, + "language_loss": 0.5741989, + "learning_rate": 1.700584872028108e-07, + "loss": 0.65085673, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09326172, + "step": 14499, + "time_per_iteration": 2.551210880279541 + }, + { + "auxiliary_loss_clip": 0.06407043, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06273316, + "balance_loss_mlp": 0.01254664, + "epoch": 0.8717871636855554, + "flos": 22024686026880.0, + "grad_norm": 1.7042733854363687, + "language_loss": 0.8017959, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.8785122, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09918213, + "step": 14500, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.06403673, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.06273565, + "balance_loss_mlp": 0.01259856, + "epoch": 0.8718472869382233, + "flos": 16659842912640.0, + "grad_norm": 1.7725346587418325, + "language_loss": 0.73199558, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.8087225, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0916748, + "step": 14501, + "time_per_iteration": 2.4719321727752686 + }, + { + "auxiliary_loss_clip": 0.06410412, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06273587, + "balance_loss_mlp": 0.01253314, + "epoch": 0.8719074101908914, + "flos": 19500584359680.0, + "grad_norm": 1.6060046992779708, + "language_loss": 0.65037239, + "learning_rate": 1.695873325782482e-07, + "loss": 0.7271167, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10699463, + "step": 14502, + "time_per_iteration": 2.5199615955352783 + }, + { + "auxiliary_loss_clip": 0.06404664, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06272453, + "balance_loss_mlp": 0.01255925, + "epoch": 0.8719675334435593, + "flos": 33078894318720.0, + "grad_norm": 1.9549594610014964, + "language_loss": 0.69178712, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.76849008, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.097229, + "step": 14503, + "time_per_iteration": 2.585371494293213 + }, + { + "auxiliary_loss_clip": 0.06405653, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.0627344, + "balance_loss_mlp": 0.01254448, + "epoch": 0.8720276566962273, + "flos": 13631404320000.0, + "grad_norm": 2.015312910125128, + "language_loss": 0.69743592, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.7741341, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09729004, + "step": 14504, + "time_per_iteration": 2.4844253063201904 + }, + { + "auxiliary_loss_clip": 0.06401467, + "auxiliary_loss_mlp": 0.01262384, + "balance_loss_clip": 0.06269079, + "balance_loss_mlp": 0.01252734, + "epoch": 0.8720877799488952, + "flos": 23520995112960.0, + "grad_norm": 1.7542452009567429, + "language_loss": 0.70339608, + "learning_rate": 1.691168026385552e-07, + "loss": 0.78003466, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09661865, + "step": 14505, + "time_per_iteration": 2.501800537109375 + }, + { + "auxiliary_loss_clip": 0.06400619, + "auxiliary_loss_mlp": 0.0126351, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01255177, + "epoch": 0.8721479032015632, + "flos": 20820516600960.0, + "grad_norm": 1.4504260712656618, + "language_loss": 0.78312892, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.85977018, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08325195, + "step": 14506, + "time_per_iteration": 3.896496534347534 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.0126421, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.0125459, + "epoch": 0.8722080264542311, + "flos": 19469711329920.0, + "grad_norm": 2.2593739015214895, + "language_loss": 0.74364638, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.82033199, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09619141, + "step": 14507, + "time_per_iteration": 2.5149693489074707 + }, + { + "auxiliary_loss_clip": 0.06409867, + "auxiliary_loss_mlp": 0.01267946, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01258075, + "epoch": 0.8722681497068991, + "flos": 21768241006080.0, + "grad_norm": 2.684746862543845, + "language_loss": 0.72729445, + "learning_rate": 1.686468975443156e-07, + "loss": 0.80407256, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09869385, + "step": 14508, + "time_per_iteration": 2.480463743209839 + }, + { + "auxiliary_loss_clip": 0.06408631, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06272415, + "balance_loss_mlp": 0.0125642, + "epoch": 0.8723282729595672, + "flos": 28884790051200.0, + "grad_norm": 2.2883900025545953, + "language_loss": 0.69032156, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.76707762, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10546875, + "step": 14509, + "time_per_iteration": 2.5842347145080566 + }, + { + "auxiliary_loss_clip": 0.06403151, + "auxiliary_loss_mlp": 0.01266131, + "balance_loss_clip": 0.06271935, + "balance_loss_mlp": 0.01256589, + "epoch": 0.8723883962122351, + "flos": 26476409272320.0, + "grad_norm": 1.5825052329417453, + "language_loss": 0.58807904, + "learning_rate": 1.683339746970558e-07, + "loss": 0.66477191, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09539795, + "step": 14510, + "time_per_iteration": 2.548917293548584 + }, + { + "auxiliary_loss_clip": 0.06413917, + "auxiliary_loss_mlp": 0.01269969, + "balance_loss_clip": 0.06273636, + "balance_loss_mlp": 0.01258794, + "epoch": 0.8724485194649031, + "flos": 20527664181120.0, + "grad_norm": 2.1184884114038556, + "language_loss": 0.67942345, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.75626224, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11187744, + "step": 14511, + "time_per_iteration": 2.5175976753234863 + }, + { + "auxiliary_loss_clip": 0.0640533, + "auxiliary_loss_mlp": 0.01264234, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01254047, + "epoch": 0.872508642717571, + "flos": 24360335861760.0, + "grad_norm": 1.596141317024249, + "language_loss": 0.81785661, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.89455223, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10180664, + "step": 14512, + "time_per_iteration": 2.542559862136841 + }, + { + "auxiliary_loss_clip": 0.06310365, + "auxiliary_loss_mlp": 0.01250481, + "balance_loss_clip": 0.06255949, + "balance_loss_mlp": 0.01249467, + "epoch": 0.872568765970239, + "flos": 61427132749440.0, + "grad_norm": 0.7791722432142947, + "language_loss": 0.5879969, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.66360533, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01014709, + "step": 14513, + "time_per_iteration": 3.0595717430114746 + }, + { + "auxiliary_loss_clip": 0.06401786, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.0626969, + "balance_loss_mlp": 0.01255917, + "epoch": 0.8726288892229069, + "flos": 22604059883520.0, + "grad_norm": 1.6369159357122527, + "language_loss": 0.76856357, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.84523714, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09661865, + "step": 14514, + "time_per_iteration": 2.505091905593872 + }, + { + "auxiliary_loss_clip": 0.06408387, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06272617, + "balance_loss_mlp": 0.0125461, + "epoch": 0.872689012475575, + "flos": 25892339587200.0, + "grad_norm": 1.7178923167711113, + "language_loss": 0.65753925, + "learning_rate": 1.675528831794055e-07, + "loss": 0.73426867, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0993042, + "step": 14515, + "time_per_iteration": 2.5665414333343506 + }, + { + "auxiliary_loss_clip": 0.06405771, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01254934, + "epoch": 0.8727491357282429, + "flos": 21513095723520.0, + "grad_norm": 1.926028752131716, + "language_loss": 0.78788495, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.86458981, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09777832, + "step": 14516, + "time_per_iteration": 2.480694055557251 + }, + { + "auxiliary_loss_clip": 0.0640446, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.06269546, + "balance_loss_mlp": 0.01254585, + "epoch": 0.8728092589809109, + "flos": 19213392090240.0, + "grad_norm": 2.236925792083213, + "language_loss": 0.72447747, + "learning_rate": 1.672409329369453e-07, + "loss": 0.80117333, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10534668, + "step": 14517, + "time_per_iteration": 2.4733726978302 + }, + { + "auxiliary_loss_clip": 0.06400529, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.01258599, + "epoch": 0.8728693822335788, + "flos": 20601652936320.0, + "grad_norm": 1.738008639362388, + "language_loss": 0.72772276, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.80440235, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08825684, + "step": 14518, + "time_per_iteration": 3.923923969268799 + }, + { + "auxiliary_loss_clip": 0.06398532, + "auxiliary_loss_mlp": 0.01264388, + "balance_loss_clip": 0.06269579, + "balance_loss_mlp": 0.01255269, + "epoch": 0.8729295054862468, + "flos": 21735523186560.0, + "grad_norm": 1.4853642793865207, + "language_loss": 0.74297607, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.81960523, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09124756, + "step": 14519, + "time_per_iteration": 2.583524227142334 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06272946, + "balance_loss_mlp": 0.01256583, + "epoch": 0.8729896287389147, + "flos": 17678788888320.0, + "grad_norm": 2.5521451847443437, + "language_loss": 0.77261472, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.84937429, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11126709, + "step": 14520, + "time_per_iteration": 2.4702889919281006 + }, + { + "auxiliary_loss_clip": 0.06407069, + "auxiliary_loss_mlp": 0.01265858, + "balance_loss_clip": 0.06271906, + "balance_loss_mlp": 0.01255934, + "epoch": 0.8730497519915827, + "flos": 24578738328960.0, + "grad_norm": 1.679080927037556, + "language_loss": 0.81987226, + "learning_rate": 1.666178664801816e-07, + "loss": 0.89660144, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09924316, + "step": 14521, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06406459, + "auxiliary_loss_mlp": 0.01267903, + "balance_loss_clip": 0.06272659, + "balance_loss_mlp": 0.01257777, + "epoch": 0.8731098752442508, + "flos": 13448822273280.0, + "grad_norm": 2.292757707836215, + "language_loss": 0.7680378, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.8447814, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10125732, + "step": 14522, + "time_per_iteration": 4.001532316207886 + }, + { + "auxiliary_loss_clip": 0.06400695, + "auxiliary_loss_mlp": 0.01266384, + "balance_loss_clip": 0.06270634, + "balance_loss_mlp": 0.01257229, + "epoch": 0.8731699984969187, + "flos": 23480730426240.0, + "grad_norm": 1.896353046813896, + "language_loss": 0.75725633, + "learning_rate": 1.66306750360385e-07, + "loss": 0.83392715, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0914917, + "step": 14523, + "time_per_iteration": 2.529074192047119 + }, + { + "auxiliary_loss_clip": 0.06400236, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06271105, + "balance_loss_mlp": 0.01254784, + "epoch": 0.8732301217495867, + "flos": 17718466596480.0, + "grad_norm": 2.1427135823795354, + "language_loss": 0.78751552, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.86415774, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09204102, + "step": 14524, + "time_per_iteration": 3.9017279148101807 + }, + { + "auxiliary_loss_clip": 0.06395754, + "auxiliary_loss_mlp": 0.0126382, + "balance_loss_clip": 0.06269418, + "balance_loss_mlp": 0.01254999, + "epoch": 0.8732902450022546, + "flos": 22060883790720.0, + "grad_norm": 1.8911749247959948, + "language_loss": 0.78280824, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.85940397, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.0881958, + "step": 14525, + "time_per_iteration": 2.5112502574920654 + }, + { + "auxiliary_loss_clip": 0.06405047, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06270174, + "balance_loss_mlp": 0.01257069, + "epoch": 0.8733503682549226, + "flos": 22279495893120.0, + "grad_norm": 1.5433520001458627, + "language_loss": 0.69392395, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.77064478, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09967041, + "step": 14526, + "time_per_iteration": 2.5193099975585938 + }, + { + "auxiliary_loss_clip": 0.06409685, + "auxiliary_loss_mlp": 0.01267069, + "balance_loss_clip": 0.06273328, + "balance_loss_mlp": 0.01256382, + "epoch": 0.8734104915075905, + "flos": 23370501980160.0, + "grad_norm": 1.732651268082275, + "language_loss": 0.61444616, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.69121373, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10687256, + "step": 14527, + "time_per_iteration": 2.6036882400512695 + }, + { + "auxiliary_loss_clip": 0.06414483, + "auxiliary_loss_mlp": 0.01268907, + "balance_loss_clip": 0.0627443, + "balance_loss_mlp": 0.01257862, + "epoch": 0.8734706147602586, + "flos": 17718047326080.0, + "grad_norm": 2.000916766827133, + "language_loss": 0.65944868, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.73628259, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1104126, + "step": 14528, + "time_per_iteration": 2.4655213356018066 + }, + { + "auxiliary_loss_clip": 0.06403276, + "auxiliary_loss_mlp": 0.01264092, + "balance_loss_clip": 0.06274714, + "balance_loss_mlp": 0.01254794, + "epoch": 0.8735307380129265, + "flos": 22055055932160.0, + "grad_norm": 2.336985436344426, + "language_loss": 0.90133297, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.9780066, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09307861, + "step": 14529, + "time_per_iteration": 2.514073371887207 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.01266507, + "balance_loss_clip": 0.06271863, + "balance_loss_mlp": 0.01256881, + "epoch": 0.8735908612655945, + "flos": 25345557768960.0, + "grad_norm": 1.7800121585868869, + "language_loss": 0.85022855, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.92688859, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09625244, + "step": 14530, + "time_per_iteration": 2.524286985397339 + }, + { + "auxiliary_loss_clip": 0.06402133, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06269572, + "balance_loss_mlp": 0.01259702, + "epoch": 0.8736509845182624, + "flos": 21546903646080.0, + "grad_norm": 2.029519430173588, + "language_loss": 0.74400681, + "learning_rate": 1.650650677057128e-07, + "loss": 0.82071632, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09112549, + "step": 14531, + "time_per_iteration": 2.537536144256592 + }, + { + "auxiliary_loss_clip": 0.06398211, + "auxiliary_loss_mlp": 0.01266853, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01257811, + "epoch": 0.8737111077709304, + "flos": 22023637850880.0, + "grad_norm": 1.7208212669688667, + "language_loss": 0.6192863, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.69593698, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09039307, + "step": 14532, + "time_per_iteration": 2.5035369396209717 + }, + { + "auxiliary_loss_clip": 0.06313117, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06258602, + "balance_loss_mlp": 0.012528, + "epoch": 0.8737712310235983, + "flos": 70086418842240.0, + "grad_norm": 0.7989490293536622, + "language_loss": 0.58785164, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.66352129, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0104599, + "step": 14533, + "time_per_iteration": 3.2517998218536377 + }, + { + "auxiliary_loss_clip": 0.06401654, + "auxiliary_loss_mlp": 0.01264271, + "balance_loss_clip": 0.06272509, + "balance_loss_mlp": 0.01254949, + "epoch": 0.8738313542762663, + "flos": 28665968313600.0, + "grad_norm": 2.0402838251566644, + "language_loss": 0.76672494, + "learning_rate": 1.646005846335954e-07, + "loss": 0.84338421, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09326172, + "step": 14534, + "time_per_iteration": 2.546053409576416 + }, + { + "auxiliary_loss_clip": 0.06403311, + "auxiliary_loss_mlp": 0.01264005, + "balance_loss_clip": 0.06271609, + "balance_loss_mlp": 0.01254874, + "epoch": 0.8738914775289344, + "flos": 22352981523840.0, + "grad_norm": 1.5823807033231816, + "language_loss": 0.75660425, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.8332774, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09118652, + "step": 14535, + "time_per_iteration": 2.539175510406494 + }, + { + "auxiliary_loss_clip": 0.06402861, + "auxiliary_loss_mlp": 0.01262561, + "balance_loss_clip": 0.0627098, + "balance_loss_mlp": 0.0125303, + "epoch": 0.8739516007816023, + "flos": 31767808682880.0, + "grad_norm": 1.9833489778511422, + "language_loss": 0.74645185, + "learning_rate": 1.64291277235048e-07, + "loss": 0.82310605, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09533691, + "step": 14536, + "time_per_iteration": 2.588463068008423 + }, + { + "auxiliary_loss_clip": 0.06404154, + "auxiliary_loss_mlp": 0.01261289, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.01251794, + "epoch": 0.8740117240342703, + "flos": 21217518046080.0, + "grad_norm": 1.6487681333797766, + "language_loss": 0.64354205, + "learning_rate": 1.641367279482304e-07, + "loss": 0.72019655, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09503174, + "step": 14537, + "time_per_iteration": 2.475311517715454 + }, + { + "auxiliary_loss_clip": 0.06402414, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 0.06272729, + "balance_loss_mlp": 0.01257392, + "epoch": 0.8740718472869382, + "flos": 25192800576000.0, + "grad_norm": 1.6981530320484999, + "language_loss": 0.57779753, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.65449232, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09674072, + "step": 14538, + "time_per_iteration": 2.5233047008514404 + }, + { + "auxiliary_loss_clip": 0.0639964, + "auxiliary_loss_mlp": 0.01263306, + "balance_loss_clip": 0.06272976, + "balance_loss_mlp": 0.01254097, + "epoch": 0.8741319705396062, + "flos": 19507124977920.0, + "grad_norm": 1.743989836533952, + "language_loss": 0.68863463, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.76526415, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.09210205, + "step": 14539, + "time_per_iteration": 2.4944701194763184 + }, + { + "auxiliary_loss_clip": 0.06409974, + "auxiliary_loss_mlp": 0.01265214, + "balance_loss_clip": 0.06271386, + "balance_loss_mlp": 0.01255112, + "epoch": 0.8741920937922741, + "flos": 14106167953920.0, + "grad_norm": 1.8528727857189147, + "language_loss": 0.74751997, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.82427186, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10101318, + "step": 14540, + "time_per_iteration": 2.4811830520629883 + }, + { + "auxiliary_loss_clip": 0.0640149, + "auxiliary_loss_mlp": 0.01261579, + "balance_loss_clip": 0.06271747, + "balance_loss_mlp": 0.01251792, + "epoch": 0.8742522170449422, + "flos": 27717363440640.0, + "grad_norm": 1.6180222602989935, + "language_loss": 0.79222339, + "learning_rate": 1.635192270207193e-07, + "loss": 0.86885411, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09790039, + "step": 14541, + "time_per_iteration": 2.5740039348602295 + }, + { + "auxiliary_loss_clip": 0.06413158, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06276666, + "balance_loss_mlp": 0.01256864, + "epoch": 0.8743123402976101, + "flos": 21149021733120.0, + "grad_norm": 1.8380973773337208, + "language_loss": 0.66893399, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.74574167, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10748291, + "step": 14542, + "time_per_iteration": 2.5132861137390137 + }, + { + "auxiliary_loss_clip": 0.06308813, + "auxiliary_loss_mlp": 0.01251732, + "balance_loss_clip": 0.06254316, + "balance_loss_mlp": 0.01250717, + "epoch": 0.8743724635502781, + "flos": 60888275141760.0, + "grad_norm": 0.7602513032785679, + "language_loss": 0.54570305, + "learning_rate": 1.632108943707642e-07, + "loss": 0.62130845, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01014709, + "step": 14543, + "time_per_iteration": 2.9452686309814453 + }, + { + "auxiliary_loss_clip": 0.06406276, + "auxiliary_loss_mlp": 0.01263911, + "balance_loss_clip": 0.06272275, + "balance_loss_mlp": 0.01254398, + "epoch": 0.874432586802946, + "flos": 28116545091840.0, + "grad_norm": 1.7912544552975234, + "language_loss": 0.69910216, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.77580404, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09515381, + "step": 14544, + "time_per_iteration": 2.5625085830688477 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.01259982, + "balance_loss_clip": 0.06271628, + "balance_loss_mlp": 0.01251685, + "epoch": 0.874492710055614, + "flos": 23557067095680.0, + "grad_norm": 1.4418848759585507, + "language_loss": 0.75803328, + "learning_rate": 1.62902840325714e-07, + "loss": 0.83461046, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08306885, + "step": 14545, + "time_per_iteration": 3.978076696395874 + }, + { + "auxiliary_loss_clip": 0.06402361, + "auxiliary_loss_mlp": 0.01264774, + "balance_loss_clip": 0.062708, + "balance_loss_mlp": 0.01254129, + "epoch": 0.8745528333082819, + "flos": 40925016864000.0, + "grad_norm": 1.6096623490639794, + "language_loss": 0.66167152, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.73834288, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10644531, + "step": 14546, + "time_per_iteration": 2.6540935039520264 + }, + { + "auxiliary_loss_clip": 0.06403122, + "auxiliary_loss_mlp": 0.0126332, + "balance_loss_clip": 0.06272014, + "balance_loss_mlp": 0.01253467, + "epoch": 0.87461295656095, + "flos": 23629630331520.0, + "grad_norm": 1.581391249306466, + "language_loss": 0.72981352, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.8064779, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09844971, + "step": 14547, + "time_per_iteration": 2.5465586185455322 + }, + { + "auxiliary_loss_clip": 0.06413304, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06273919, + "balance_loss_mlp": 0.01256347, + "epoch": 0.874673079813618, + "flos": 38802235127040.0, + "grad_norm": 2.0398162521863608, + "language_loss": 0.69331336, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.77011502, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10516357, + "step": 14548, + "time_per_iteration": 2.6202781200408936 + }, + { + "auxiliary_loss_clip": 0.06407377, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06269997, + "balance_loss_mlp": 0.01254174, + "epoch": 0.8747332030662859, + "flos": 23702948254080.0, + "grad_norm": 2.380289597874903, + "language_loss": 0.70875394, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.78546774, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09832764, + "step": 14549, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.06409204, + "auxiliary_loss_mlp": 0.0126558, + "balance_loss_clip": 0.06271277, + "balance_loss_mlp": 0.01255191, + "epoch": 0.8747933263189539, + "flos": 24469390350720.0, + "grad_norm": 2.097604364393195, + "language_loss": 0.83978105, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.91652894, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1038208, + "step": 14550, + "time_per_iteration": 2.5499937534332275 + }, + { + "auxiliary_loss_clip": 0.06409267, + "auxiliary_loss_mlp": 0.01262247, + "balance_loss_clip": 0.06273516, + "balance_loss_mlp": 0.01253003, + "epoch": 0.8748534495716218, + "flos": 13814405637120.0, + "grad_norm": 1.5904524065006318, + "language_loss": 0.72164232, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.79835749, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09246826, + "step": 14551, + "time_per_iteration": 2.4693989753723145 + }, + { + "auxiliary_loss_clip": 0.06401157, + "auxiliary_loss_mlp": 0.01261725, + "balance_loss_clip": 0.06272075, + "balance_loss_mlp": 0.01252582, + "epoch": 0.8749135728242898, + "flos": 29869886177280.0, + "grad_norm": 1.9835642625635446, + "language_loss": 0.64623117, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.72286004, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 14552, + "time_per_iteration": 2.558300733566284 + }, + { + "auxiliary_loss_clip": 0.06409608, + "auxiliary_loss_mlp": 0.01267334, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01256581, + "epoch": 0.8749736960769577, + "flos": 24140256312960.0, + "grad_norm": 1.5918713414815686, + "language_loss": 0.79966319, + "learning_rate": 1.616734111284479e-07, + "loss": 0.87643266, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10754395, + "step": 14553, + "time_per_iteration": 2.5289547443389893 + }, + { + "auxiliary_loss_clip": 0.06405284, + "auxiliary_loss_mlp": 0.01264107, + "balance_loss_clip": 0.0627055, + "balance_loss_mlp": 0.01254594, + "epoch": 0.8750338193296258, + "flos": 17208385666560.0, + "grad_norm": 1.7861330816455667, + "language_loss": 0.70206106, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.77875495, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09509277, + "step": 14554, + "time_per_iteration": 2.5121958255767822 + }, + { + "auxiliary_loss_clip": 0.06400765, + "auxiliary_loss_mlp": 0.01263457, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01254272, + "epoch": 0.8750939425822937, + "flos": 23740110339840.0, + "grad_norm": 1.6171556811070096, + "language_loss": 0.83951151, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.91615379, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09179688, + "step": 14555, + "time_per_iteration": 2.533935546875 + }, + { + "auxiliary_loss_clip": 0.06403114, + "auxiliary_loss_mlp": 0.01267593, + "balance_loss_clip": 0.06271933, + "balance_loss_mlp": 0.01257753, + "epoch": 0.8751540658349617, + "flos": 26548888654080.0, + "grad_norm": 1.6023816965835223, + "language_loss": 0.71021914, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.78692615, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09844971, + "step": 14556, + "time_per_iteration": 2.5914430618286133 + }, + { + "auxiliary_loss_clip": 0.06408825, + "auxiliary_loss_mlp": 0.0126549, + "balance_loss_clip": 0.0627299, + "balance_loss_mlp": 0.01255179, + "epoch": 0.8752141890876296, + "flos": 19392200703360.0, + "grad_norm": 2.2735534947570115, + "language_loss": 0.7708326, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.84757572, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10314941, + "step": 14557, + "time_per_iteration": 3.9466445446014404 + }, + { + "auxiliary_loss_clip": 0.06408848, + "auxiliary_loss_mlp": 0.01267158, + "balance_loss_clip": 0.06275624, + "balance_loss_mlp": 0.01256769, + "epoch": 0.8752743123402976, + "flos": 25381462043520.0, + "grad_norm": 1.69314146192959, + "language_loss": 0.83270669, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.90946674, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1038208, + "step": 14558, + "time_per_iteration": 2.5691773891448975 + }, + { + "auxiliary_loss_clip": 0.06311321, + "auxiliary_loss_mlp": 0.01250089, + "balance_loss_clip": 0.06256986, + "balance_loss_mlp": 0.01249142, + "epoch": 0.8753344355929655, + "flos": 59969578976640.0, + "grad_norm": 0.7810475083105511, + "language_loss": 0.56042981, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.63604391, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.00945282, + "step": 14559, + "time_per_iteration": 3.157846450805664 + }, + { + "auxiliary_loss_clip": 0.06402047, + "auxiliary_loss_mlp": 0.01266495, + "balance_loss_clip": 0.06271435, + "balance_loss_mlp": 0.01257125, + "epoch": 0.8753945588456336, + "flos": 17900419737600.0, + "grad_norm": 1.6963554419042506, + "language_loss": 0.66404682, + "learning_rate": 1.606013202286407e-07, + "loss": 0.74073219, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09375, + "step": 14560, + "time_per_iteration": 2.470168352127075 + }, + { + "auxiliary_loss_clip": 0.06398799, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.06269611, + "balance_loss_mlp": 0.01253471, + "epoch": 0.8754546820983016, + "flos": 30921969242880.0, + "grad_norm": 1.8348910812668497, + "language_loss": 0.78910828, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.8657254, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09442139, + "step": 14561, + "time_per_iteration": 2.5636520385742188 + }, + { + "auxiliary_loss_clip": 0.06408288, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01253952, + "epoch": 0.8755148053509695, + "flos": 20637305648640.0, + "grad_norm": 1.9358118623790102, + "language_loss": 0.78181839, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.85854423, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10339355, + "step": 14562, + "time_per_iteration": 3.9300997257232666 + }, + { + "auxiliary_loss_clip": 0.06399447, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01254681, + "epoch": 0.8755749286036375, + "flos": 34978326197760.0, + "grad_norm": 1.6279482889503327, + "language_loss": 0.72014946, + "learning_rate": 1.601428988367981e-07, + "loss": 0.79677868, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08789062, + "step": 14563, + "time_per_iteration": 4.056689977645874 + }, + { + "auxiliary_loss_clip": 0.06408808, + "auxiliary_loss_mlp": 0.01265016, + "balance_loss_clip": 0.06271923, + "balance_loss_mlp": 0.01255283, + "epoch": 0.8756350518563054, + "flos": 18192265908480.0, + "grad_norm": 2.023004884264385, + "language_loss": 0.65937054, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.73610878, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09735107, + "step": 14564, + "time_per_iteration": 2.4699697494506836 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06271675, + "balance_loss_mlp": 0.01257623, + "epoch": 0.8756951751089734, + "flos": 20090188414080.0, + "grad_norm": 1.696910224626912, + "language_loss": 0.70870125, + "learning_rate": 1.598376334037408e-07, + "loss": 0.78540564, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09008789, + "step": 14565, + "time_per_iteration": 2.49548077583313 + }, + { + "auxiliary_loss_clip": 0.0641157, + "auxiliary_loss_mlp": 0.01264443, + "balance_loss_clip": 0.0627208, + "balance_loss_mlp": 0.01253553, + "epoch": 0.8757552983616413, + "flos": 27532349625600.0, + "grad_norm": 1.4285199436173486, + "language_loss": 0.77859598, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.8553561, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10882568, + "step": 14566, + "time_per_iteration": 2.749091863632202 + }, + { + "auxiliary_loss_clip": 0.06403997, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06272083, + "balance_loss_mlp": 0.01260703, + "epoch": 0.8758154216143094, + "flos": 18078138247680.0, + "grad_norm": 1.529339605078132, + "language_loss": 0.71489322, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.79163313, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09283447, + "step": 14567, + "time_per_iteration": 2.5004701614379883 + }, + { + "auxiliary_loss_clip": 0.06402886, + "auxiliary_loss_mlp": 0.01267484, + "balance_loss_clip": 0.0627336, + "balance_loss_mlp": 0.01258305, + "epoch": 0.8758755448669773, + "flos": 25052621495040.0, + "grad_norm": 1.6619530150648376, + "language_loss": 0.74655724, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.8232609, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09179688, + "step": 14568, + "time_per_iteration": 2.5281195640563965 + }, + { + "auxiliary_loss_clip": 0.06398214, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06268168, + "balance_loss_mlp": 0.01256416, + "epoch": 0.8759356681196453, + "flos": 22863439797120.0, + "grad_norm": 1.9978030218595135, + "language_loss": 0.87101042, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.9476462, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0894165, + "step": 14569, + "time_per_iteration": 2.5461788177490234 + }, + { + "auxiliary_loss_clip": 0.06404515, + "auxiliary_loss_mlp": 0.01262364, + "balance_loss_clip": 0.06270414, + "balance_loss_mlp": 0.01253435, + "epoch": 0.8759957913723132, + "flos": 21038835214080.0, + "grad_norm": 1.6138151637367601, + "language_loss": 0.7468214, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.82349014, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0892334, + "step": 14570, + "time_per_iteration": 2.498565196990967 + }, + { + "auxiliary_loss_clip": 0.06409349, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01254192, + "epoch": 0.8760559146249812, + "flos": 20016535075200.0, + "grad_norm": 1.5814035636458428, + "language_loss": 0.68048859, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.75722075, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09680176, + "step": 14571, + "time_per_iteration": 2.5363006591796875 + }, + { + "auxiliary_loss_clip": 0.06400727, + "auxiliary_loss_mlp": 0.01262869, + "balance_loss_clip": 0.06271683, + "balance_loss_mlp": 0.01254131, + "epoch": 0.8761160378776491, + "flos": 19980253457280.0, + "grad_norm": 1.8860279623082572, + "language_loss": 0.62593281, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.70256877, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08734131, + "step": 14572, + "time_per_iteration": 2.509617328643799 + }, + { + "auxiliary_loss_clip": 0.06398857, + "auxiliary_loss_mlp": 0.01263429, + "balance_loss_clip": 0.06271888, + "balance_loss_mlp": 0.01254542, + "epoch": 0.8761761611303172, + "flos": 28812101034240.0, + "grad_norm": 1.87554988756501, + "language_loss": 0.74363232, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.82025516, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08886719, + "step": 14573, + "time_per_iteration": 2.5745317935943604 + }, + { + "auxiliary_loss_clip": 0.0639876, + "auxiliary_loss_mlp": 0.0126231, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01253763, + "epoch": 0.8762362843829851, + "flos": 18338356702080.0, + "grad_norm": 1.9590289923808466, + "language_loss": 0.73202926, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.80863994, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08544922, + "step": 14574, + "time_per_iteration": 2.465827465057373 + }, + { + "auxiliary_loss_clip": 0.06403725, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06272864, + "balance_loss_mlp": 0.01253361, + "epoch": 0.8762964076356531, + "flos": 15784681743360.0, + "grad_norm": 1.6549061624891563, + "language_loss": 0.76195455, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.83862293, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09747314, + "step": 14575, + "time_per_iteration": 2.5050904750823975 + }, + { + "auxiliary_loss_clip": 0.06398784, + "auxiliary_loss_mlp": 0.01266013, + "balance_loss_clip": 0.06271212, + "balance_loss_mlp": 0.01256667, + "epoch": 0.8763565308883211, + "flos": 33184175374080.0, + "grad_norm": 1.6971430511045047, + "language_loss": 0.66751701, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.74416494, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09344482, + "step": 14576, + "time_per_iteration": 2.707777500152588 + }, + { + "auxiliary_loss_clip": 0.06400728, + "auxiliary_loss_mlp": 0.01264456, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255456, + "epoch": 0.876416654140989, + "flos": 15893568524160.0, + "grad_norm": 5.287288925068646, + "language_loss": 0.67297328, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.74962509, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08996582, + "step": 14577, + "time_per_iteration": 2.516228675842285 + }, + { + "auxiliary_loss_clip": 0.06408198, + "auxiliary_loss_mlp": 0.01264689, + "balance_loss_clip": 0.06274717, + "balance_loss_mlp": 0.01254753, + "epoch": 0.876476777393657, + "flos": 25892381514240.0, + "grad_norm": 2.047552880616012, + "language_loss": 0.71286416, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.78959298, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09936523, + "step": 14578, + "time_per_iteration": 2.5514087677001953 + }, + { + "auxiliary_loss_clip": 0.06409043, + "auxiliary_loss_mlp": 0.01268646, + "balance_loss_clip": 0.06273985, + "balance_loss_mlp": 0.01258501, + "epoch": 0.876536900646325, + "flos": 13594787285760.0, + "grad_norm": 1.8887093995761175, + "language_loss": 0.7153939, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.79217076, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10150146, + "step": 14579, + "time_per_iteration": 2.5152196884155273 + }, + { + "auxiliary_loss_clip": 0.06400099, + "auxiliary_loss_mlp": 0.01261571, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01252624, + "epoch": 0.876597023898993, + "flos": 12208245448320.0, + "grad_norm": 3.2232555084556265, + "language_loss": 0.69840139, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.7750181, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0894165, + "step": 14580, + "time_per_iteration": 2.5027308464050293 + }, + { + "auxiliary_loss_clip": 0.06397118, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.01254893, + "epoch": 0.8766571471516609, + "flos": 25343629125120.0, + "grad_norm": 1.6080390513913188, + "language_loss": 0.65369827, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.73031157, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09313965, + "step": 14581, + "time_per_iteration": 2.5610644817352295 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01254884, + "epoch": 0.8767172704043289, + "flos": 30120419485440.0, + "grad_norm": 2.0311405699132368, + "language_loss": 0.73738873, + "learning_rate": 1.572541512164416e-07, + "loss": 0.81402385, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08459473, + "step": 14582, + "time_per_iteration": 2.5676662921905518 + }, + { + "auxiliary_loss_clip": 0.06401975, + "auxiliary_loss_mlp": 0.01266739, + "balance_loss_clip": 0.06271679, + "balance_loss_mlp": 0.01257095, + "epoch": 0.8767773936569968, + "flos": 19287171210240.0, + "grad_norm": 2.1739067295595884, + "language_loss": 0.67125332, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.74794054, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09649658, + "step": 14583, + "time_per_iteration": 2.5512192249298096 + }, + { + "auxiliary_loss_clip": 0.0640585, + "auxiliary_loss_mlp": 0.01261674, + "balance_loss_clip": 0.06272185, + "balance_loss_mlp": 0.01252614, + "epoch": 0.8768375169096648, + "flos": 21252877269120.0, + "grad_norm": 1.532856465266313, + "language_loss": 0.79368246, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.87035769, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09069824, + "step": 14584, + "time_per_iteration": 2.484201192855835 + }, + { + "auxiliary_loss_clip": 0.06405112, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 0.06270323, + "balance_loss_mlp": 0.01256837, + "epoch": 0.8768976401623327, + "flos": 23302383010560.0, + "grad_norm": 1.4894739815416904, + "language_loss": 0.72938401, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.80609715, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09368896, + "step": 14585, + "time_per_iteration": 3.944657802581787 + }, + { + "auxiliary_loss_clip": 0.06401481, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01255263, + "epoch": 0.8769577634150008, + "flos": 21367675762560.0, + "grad_norm": 1.8689895153618223, + "language_loss": 0.74672264, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.82338715, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0970459, + "step": 14586, + "time_per_iteration": 2.48671555519104 + }, + { + "auxiliary_loss_clip": 0.06402427, + "auxiliary_loss_mlp": 0.0126322, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01253922, + "epoch": 0.8770178866676687, + "flos": 23520869331840.0, + "grad_norm": 1.901621628510341, + "language_loss": 0.78764355, + "learning_rate": 1.564981454895844e-07, + "loss": 0.86430001, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09295654, + "step": 14587, + "time_per_iteration": 2.5289950370788574 + }, + { + "auxiliary_loss_clip": 0.06404516, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06273125, + "balance_loss_mlp": 0.01258787, + "epoch": 0.8770780099203367, + "flos": 19725150101760.0, + "grad_norm": 1.5376144495313915, + "language_loss": 0.74347901, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.82020915, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0970459, + "step": 14588, + "time_per_iteration": 2.504408597946167 + }, + { + "auxiliary_loss_clip": 0.06400863, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01254402, + "epoch": 0.8771381331730047, + "flos": 21402028736640.0, + "grad_norm": 2.5853533604834387, + "language_loss": 0.67017472, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.74681687, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08947754, + "step": 14589, + "time_per_iteration": 2.530637264251709 + }, + { + "auxiliary_loss_clip": 0.06401638, + "auxiliary_loss_mlp": 0.01267687, + "balance_loss_clip": 0.06272372, + "balance_loss_mlp": 0.01258383, + "epoch": 0.8771982564256726, + "flos": 20267194164480.0, + "grad_norm": 2.192494295915613, + "language_loss": 0.71027219, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.78696543, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09295654, + "step": 14590, + "time_per_iteration": 2.519289970397949 + }, + { + "auxiliary_loss_clip": 0.0641445, + "auxiliary_loss_mlp": 0.01265546, + "balance_loss_clip": 0.06275117, + "balance_loss_mlp": 0.0125474, + "epoch": 0.8772583796783406, + "flos": 12493341365760.0, + "grad_norm": 2.278892739613534, + "language_loss": 0.75203848, + "learning_rate": 1.558945991776086e-07, + "loss": 0.82883847, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10803223, + "step": 14591, + "time_per_iteration": 2.480944871902466 + }, + { + "auxiliary_loss_clip": 0.06396542, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.06272044, + "balance_loss_mlp": 0.01255751, + "epoch": 0.8773185029310085, + "flos": 15925992854400.0, + "grad_norm": 1.7438987564474657, + "language_loss": 0.80089593, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.87751126, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.09240723, + "step": 14592, + "time_per_iteration": 2.4851748943328857 + }, + { + "auxiliary_loss_clip": 0.06397837, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01257858, + "epoch": 0.8773786261836766, + "flos": 21510538174080.0, + "grad_norm": 1.550623060936972, + "language_loss": 0.82925177, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.90589213, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08343506, + "step": 14593, + "time_per_iteration": 2.567701578140259 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06272095, + "balance_loss_mlp": 0.01255256, + "epoch": 0.8774387494363445, + "flos": 26768884348800.0, + "grad_norm": 1.2807416584393148, + "language_loss": 0.75873339, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.83538544, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08862305, + "step": 14594, + "time_per_iteration": 2.566321849822998 + }, + { + "auxiliary_loss_clip": 0.0640325, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01255112, + "epoch": 0.8774988726890125, + "flos": 18484782912000.0, + "grad_norm": 1.9693354280798063, + "language_loss": 0.77621579, + "learning_rate": 1.552921717241651e-07, + "loss": 0.8528896, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.090271, + "step": 14595, + "time_per_iteration": 2.4619386196136475 + }, + { + "auxiliary_loss_clip": 0.06402054, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06271306, + "balance_loss_mlp": 0.01254921, + "epoch": 0.8775589959416804, + "flos": 24433360295040.0, + "grad_norm": 1.3207424076931227, + "language_loss": 0.70732266, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.78398716, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09472656, + "step": 14596, + "time_per_iteration": 2.526388645172119 + }, + { + "auxiliary_loss_clip": 0.06398661, + "auxiliary_loss_mlp": 0.01265503, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01256628, + "epoch": 0.8776191191943484, + "flos": 23446796722560.0, + "grad_norm": 1.635020983674664, + "language_loss": 0.85904115, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.93568277, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08874512, + "step": 14597, + "time_per_iteration": 3.9261152744293213 + }, + { + "auxiliary_loss_clip": 0.06402812, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06272464, + "balance_loss_mlp": 0.01256185, + "epoch": 0.8776792424470163, + "flos": 26837674151040.0, + "grad_norm": 1.9849273814310462, + "language_loss": 0.72925198, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.80593288, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09094238, + "step": 14598, + "time_per_iteration": 2.5652682781219482 + }, + { + "auxiliary_loss_clip": 0.06404451, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01253788, + "epoch": 0.8777393656996844, + "flos": 15630499031040.0, + "grad_norm": 2.1509248383698782, + "language_loss": 0.77800953, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.85468638, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09442139, + "step": 14599, + "time_per_iteration": 2.4392573833465576 + }, + { + "auxiliary_loss_clip": 0.0640744, + "auxiliary_loss_mlp": 0.01264831, + "balance_loss_clip": 0.06275728, + "balance_loss_mlp": 0.01255491, + "epoch": 0.8777994889523523, + "flos": 18885977061120.0, + "grad_norm": 1.9773713526565397, + "language_loss": 0.6848346, + "learning_rate": 1.545407113589332e-07, + "loss": 0.76155728, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09344482, + "step": 14600, + "time_per_iteration": 2.5783047676086426 + }, + { + "auxiliary_loss_clip": 0.0640178, + "auxiliary_loss_mlp": 0.01263195, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01253658, + "epoch": 0.8778596122050203, + "flos": 48836113850880.0, + "grad_norm": 1.7580584830878268, + "language_loss": 0.69559765, + "learning_rate": 1.543906292031072e-07, + "loss": 0.77224743, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09527588, + "step": 14601, + "time_per_iteration": 4.200392484664917 + }, + { + "auxiliary_loss_clip": 0.06411396, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06274483, + "balance_loss_mlp": 0.0125779, + "epoch": 0.8779197354576883, + "flos": 25666264471680.0, + "grad_norm": 1.7776243951443933, + "language_loss": 0.73434043, + "learning_rate": 1.542406170329733e-07, + "loss": 0.81112897, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09661865, + "step": 14602, + "time_per_iteration": 2.5296902656555176 + }, + { + "auxiliary_loss_clip": 0.06397757, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01255807, + "epoch": 0.8779798587103562, + "flos": 18849150391680.0, + "grad_norm": 1.6545957796620159, + "language_loss": 0.70951098, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.78613484, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08813477, + "step": 14603, + "time_per_iteration": 3.900700807571411 + }, + { + "auxiliary_loss_clip": 0.06315686, + "auxiliary_loss_mlp": 0.0125067, + "balance_loss_clip": 0.06261384, + "balance_loss_mlp": 0.0124961, + "epoch": 0.8780399819630242, + "flos": 68634022095360.0, + "grad_norm": 0.7420580476925245, + "language_loss": 0.54075485, + "learning_rate": 1.539408026725344e-07, + "loss": 0.61641842, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.54589844, + "router_z_loss_mlp": 0.01062012, + "step": 14604, + "time_per_iteration": 3.145667314529419 + }, + { + "auxiliary_loss_clip": 0.06312891, + "auxiliary_loss_mlp": 0.01249667, + "balance_loss_clip": 0.06258688, + "balance_loss_mlp": 0.01248654, + "epoch": 0.8781001052156922, + "flos": 65755908927360.0, + "grad_norm": 0.6879925918981881, + "language_loss": 0.59306002, + "learning_rate": 1.537910004935976e-07, + "loss": 0.66868562, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01013184, + "step": 14605, + "time_per_iteration": 3.1238157749176025 + }, + { + "auxiliary_loss_clip": 0.06404503, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06271055, + "balance_loss_mlp": 0.01254848, + "epoch": 0.8781602284683602, + "flos": 22055391348480.0, + "grad_norm": 1.7310041158158627, + "language_loss": 0.85172927, + "learning_rate": 1.536412683230912e-07, + "loss": 0.92841685, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09399414, + "step": 14606, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.0640693, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06271999, + "balance_loss_mlp": 0.01253997, + "epoch": 0.8782203517210281, + "flos": 17568099244800.0, + "grad_norm": 2.0459573713019266, + "language_loss": 0.71192271, + "learning_rate": 1.534916061666931e-07, + "loss": 0.78863305, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10113525, + "step": 14607, + "time_per_iteration": 2.476141929626465 + }, + { + "auxiliary_loss_clip": 0.06399085, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01257238, + "epoch": 0.8782804749736961, + "flos": 25527510910080.0, + "grad_norm": 1.6865812212317128, + "language_loss": 0.72198415, + "learning_rate": 1.533420140300785e-07, + "loss": 0.79863501, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08758545, + "step": 14608, + "time_per_iteration": 2.543273687362671 + }, + { + "auxiliary_loss_clip": 0.06411412, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06274945, + "balance_loss_mlp": 0.01255257, + "epoch": 0.878340598226364, + "flos": 21805193456640.0, + "grad_norm": 1.928532327012367, + "language_loss": 0.8771438, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.95390904, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09863281, + "step": 14609, + "time_per_iteration": 2.486294746398926 + }, + { + "auxiliary_loss_clip": 0.0640282, + "auxiliary_loss_mlp": 0.01268196, + "balance_loss_clip": 0.06272058, + "balance_loss_mlp": 0.01258832, + "epoch": 0.878400721479032, + "flos": 21108211994880.0, + "grad_norm": 1.4945868352839566, + "language_loss": 0.7052213, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.78193146, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09368896, + "step": 14610, + "time_per_iteration": 2.513068437576294 + }, + { + "auxiliary_loss_clip": 0.06398328, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.0125533, + "epoch": 0.8784608447316999, + "flos": 20929906506240.0, + "grad_norm": 1.880824719735257, + "language_loss": 0.81051499, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.88714468, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09307861, + "step": 14611, + "time_per_iteration": 2.4752485752105713 + }, + { + "auxiliary_loss_clip": 0.06400166, + "auxiliary_loss_mlp": 0.01262109, + "balance_loss_clip": 0.06268719, + "balance_loss_mlp": 0.01252476, + "epoch": 0.878520967984368, + "flos": 23337281036160.0, + "grad_norm": 1.4827937857578044, + "language_loss": 0.76664627, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.84326899, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09625244, + "step": 14612, + "time_per_iteration": 2.5153868198394775 + }, + { + "auxiliary_loss_clip": 0.06400725, + "auxiliary_loss_mlp": 0.01263329, + "balance_loss_clip": 0.06272018, + "balance_loss_mlp": 0.01254192, + "epoch": 0.8785810912370359, + "flos": 25525833828480.0, + "grad_norm": 1.4386207413508079, + "language_loss": 0.72404128, + "learning_rate": 1.525951038422002e-07, + "loss": 0.80068183, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09143066, + "step": 14613, + "time_per_iteration": 2.5526235103607178 + }, + { + "auxiliary_loss_clip": 0.06313758, + "auxiliary_loss_mlp": 0.01250159, + "balance_loss_clip": 0.06259576, + "balance_loss_mlp": 0.01249207, + "epoch": 0.8786412144897039, + "flos": 61857103576320.0, + "grad_norm": 1.1387954879683988, + "language_loss": 0.64722979, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.72286892, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00950623, + "step": 14614, + "time_per_iteration": 2.897026538848877 + }, + { + "auxiliary_loss_clip": 0.06311168, + "auxiliary_loss_mlp": 0.01251335, + "balance_loss_clip": 0.06256739, + "balance_loss_mlp": 0.01250316, + "epoch": 0.8787013377423719, + "flos": 71011445990400.0, + "grad_norm": 0.70779446038068, + "language_loss": 0.58095002, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.65657508, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01018524, + "step": 14615, + "time_per_iteration": 3.2636308670043945 + }, + { + "auxiliary_loss_clip": 0.06402515, + "auxiliary_loss_mlp": 0.01265364, + "balance_loss_clip": 0.06271381, + "balance_loss_mlp": 0.01256286, + "epoch": 0.8787614609950398, + "flos": 17353092867840.0, + "grad_norm": 1.8779699458458277, + "language_loss": 0.73255086, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.80922961, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09082031, + "step": 14616, + "time_per_iteration": 2.5093941688537598 + }, + { + "auxiliary_loss_clip": 0.06311196, + "auxiliary_loss_mlp": 0.01252507, + "balance_loss_clip": 0.06256916, + "balance_loss_mlp": 0.01251385, + "epoch": 0.8788215842477078, + "flos": 72532003633920.0, + "grad_norm": 0.7819923375628035, + "language_loss": 0.5785529, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.65418988, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.01124573, + "step": 14617, + "time_per_iteration": 3.25418758392334 + }, + { + "auxiliary_loss_clip": 0.06399922, + "auxiliary_loss_mlp": 0.01266444, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01257146, + "epoch": 0.8788817075003758, + "flos": 24834470590080.0, + "grad_norm": 1.7451091411227035, + "language_loss": 0.84037435, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.91703808, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09301758, + "step": 14618, + "time_per_iteration": 2.5523579120635986 + }, + { + "auxiliary_loss_clip": 0.0639818, + "auxiliary_loss_mlp": 0.01263411, + "balance_loss_clip": 0.06273776, + "balance_loss_mlp": 0.0125434, + "epoch": 0.8789418307530438, + "flos": 22645498527360.0, + "grad_norm": 1.6061000948299264, + "language_loss": 0.69441819, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.77103406, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.09069824, + "step": 14619, + "time_per_iteration": 2.54170823097229 + }, + { + "auxiliary_loss_clip": 0.06405766, + "auxiliary_loss_mlp": 0.01264393, + "balance_loss_clip": 0.06271112, + "balance_loss_mlp": 0.0125497, + "epoch": 0.8790019540057117, + "flos": 19790795376000.0, + "grad_norm": 1.783720752563742, + "language_loss": 0.77634114, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.85304272, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09417725, + "step": 14620, + "time_per_iteration": 2.510427474975586 + }, + { + "auxiliary_loss_clip": 0.06402472, + "auxiliary_loss_mlp": 0.01265134, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01255687, + "epoch": 0.8790620772583797, + "flos": 20235943791360.0, + "grad_norm": 1.820776592101537, + "language_loss": 0.79876006, + "learning_rate": 1.514036906317542e-07, + "loss": 0.87543613, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09448242, + "step": 14621, + "time_per_iteration": 2.523426055908203 + }, + { + "auxiliary_loss_clip": 0.06407711, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 0.06271552, + "balance_loss_mlp": 0.01255098, + "epoch": 0.8791222005110476, + "flos": 24137111784960.0, + "grad_norm": 1.602537149946791, + "language_loss": 0.67313725, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.74986005, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09472656, + "step": 14622, + "time_per_iteration": 2.5274059772491455 + }, + { + "auxiliary_loss_clip": 0.06402093, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.01254481, + "epoch": 0.8791823237637156, + "flos": 21620263495680.0, + "grad_norm": 1.855612811571573, + "language_loss": 0.72613978, + "learning_rate": 1.511065382058687e-07, + "loss": 0.80280036, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09490967, + "step": 14623, + "time_per_iteration": 2.510666847229004 + }, + { + "auxiliary_loss_clip": 0.06397058, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06268196, + "balance_loss_mlp": 0.01254821, + "epoch": 0.8792424470163835, + "flos": 24250275123840.0, + "grad_norm": 1.5326349370658456, + "language_loss": 0.79326856, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.86987877, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.0914917, + "step": 14624, + "time_per_iteration": 2.566740036010742 + }, + { + "auxiliary_loss_clip": 0.06401555, + "auxiliary_loss_mlp": 0.01267628, + "balance_loss_clip": 0.06269389, + "balance_loss_mlp": 0.01257429, + "epoch": 0.8793025702690516, + "flos": 24899025761280.0, + "grad_norm": 1.7167241879200805, + "language_loss": 0.80230272, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.87899458, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10205078, + "step": 14625, + "time_per_iteration": 3.918522834777832 + }, + { + "auxiliary_loss_clip": 0.06401938, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.01256707, + "epoch": 0.8793626935217195, + "flos": 25379952670080.0, + "grad_norm": 1.5019930803038062, + "language_loss": 0.73864943, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.81532383, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08795166, + "step": 14626, + "time_per_iteration": 2.562892198562622 + }, + { + "auxiliary_loss_clip": 0.06406923, + "auxiliary_loss_mlp": 0.01265377, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01255787, + "epoch": 0.8794228167743875, + "flos": 34686563880960.0, + "grad_norm": 1.3945734521090933, + "language_loss": 0.71120954, + "learning_rate": 1.505130747218246e-07, + "loss": 0.78793246, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09588623, + "step": 14627, + "time_per_iteration": 2.6167502403259277 + }, + { + "auxiliary_loss_clip": 0.06399681, + "auxiliary_loss_mlp": 0.01263302, + "balance_loss_clip": 0.06269456, + "balance_loss_mlp": 0.01254064, + "epoch": 0.8794829400270555, + "flos": 19470130600320.0, + "grad_norm": 1.7440522993673278, + "language_loss": 0.72579825, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.80242813, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09246826, + "step": 14628, + "time_per_iteration": 2.4789912700653076 + }, + { + "auxiliary_loss_clip": 0.06404158, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06273529, + "balance_loss_mlp": 0.01255114, + "epoch": 0.8795430632797234, + "flos": 15236767895040.0, + "grad_norm": 2.773153659158058, + "language_loss": 0.69403476, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.77072817, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10064697, + "step": 14629, + "time_per_iteration": 2.4813661575317383 + }, + { + "auxiliary_loss_clip": 0.06396346, + "auxiliary_loss_mlp": 0.01262621, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01253895, + "epoch": 0.8796031865323914, + "flos": 27751967976960.0, + "grad_norm": 1.4293653202616396, + "language_loss": 0.68995941, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.76654905, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08728027, + "step": 14630, + "time_per_iteration": 2.5377347469329834 + }, + { + "auxiliary_loss_clip": 0.06396469, + "auxiliary_loss_mlp": 0.01263738, + "balance_loss_clip": 0.06271411, + "balance_loss_mlp": 0.01254208, + "epoch": 0.8796633097850594, + "flos": 31293506246400.0, + "grad_norm": 1.4070035021312453, + "language_loss": 0.7483651, + "learning_rate": 1.499207333613999e-07, + "loss": 0.82496721, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.09533691, + "step": 14631, + "time_per_iteration": 2.5822885036468506 + }, + { + "auxiliary_loss_clip": 0.06393504, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06268861, + "balance_loss_mlp": 0.01257067, + "epoch": 0.8797234330377274, + "flos": 24249981634560.0, + "grad_norm": 1.9319771057822412, + "language_loss": 0.68856537, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.76516581, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.0947876, + "step": 14632, + "time_per_iteration": 2.5268332958221436 + }, + { + "auxiliary_loss_clip": 0.06400291, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01253742, + "epoch": 0.8797835562903953, + "flos": 24173770746240.0, + "grad_norm": 1.6895810277497014, + "language_loss": 0.64861834, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.72524273, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08404541, + "step": 14633, + "time_per_iteration": 2.5247573852539062 + }, + { + "auxiliary_loss_clip": 0.06401753, + "auxiliary_loss_mlp": 0.01266986, + "balance_loss_clip": 0.0627309, + "balance_loss_mlp": 0.01258165, + "epoch": 0.8798436795430633, + "flos": 19291280060160.0, + "grad_norm": 1.3977423779566516, + "language_loss": 0.84072506, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.91741252, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08822632, + "step": 14634, + "time_per_iteration": 2.5381462574005127 + }, + { + "auxiliary_loss_clip": 0.06400451, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01257173, + "epoch": 0.8799038027957312, + "flos": 28186173434880.0, + "grad_norm": 1.4907767475913263, + "language_loss": 0.79870266, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.87537694, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09790039, + "step": 14635, + "time_per_iteration": 2.5396430492401123 + }, + { + "auxiliary_loss_clip": 0.06404407, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.0125628, + "epoch": 0.8799639260483992, + "flos": 24651636981120.0, + "grad_norm": 1.7695455435420768, + "language_loss": 0.65644789, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.73314989, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09515381, + "step": 14636, + "time_per_iteration": 3.964998960494995 + }, + { + "auxiliary_loss_clip": 0.06402347, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06271206, + "balance_loss_mlp": 0.01257902, + "epoch": 0.8800240493010671, + "flos": 22207058438400.0, + "grad_norm": 1.4677484913942043, + "language_loss": 0.70408964, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.78078711, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0949707, + "step": 14637, + "time_per_iteration": 2.5140292644500732 + }, + { + "auxiliary_loss_clip": 0.06401545, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06271181, + "balance_loss_mlp": 0.01255353, + "epoch": 0.8800841725537352, + "flos": 14251252498560.0, + "grad_norm": 1.8480361398751275, + "language_loss": 0.66556799, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.74222744, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0904541, + "step": 14638, + "time_per_iteration": 2.519340991973877 + }, + { + "auxiliary_loss_clip": 0.06404281, + "auxiliary_loss_mlp": 0.01263496, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01253977, + "epoch": 0.8801442958064031, + "flos": 37425043019520.0, + "grad_norm": 2.054991343187147, + "language_loss": 0.58460569, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.66128349, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09521484, + "step": 14639, + "time_per_iteration": 2.622095823287964 + }, + { + "auxiliary_loss_clip": 0.0640137, + "auxiliary_loss_mlp": 0.01266992, + "balance_loss_clip": 0.06269941, + "balance_loss_mlp": 0.012573, + "epoch": 0.8802044190590711, + "flos": 25054550138880.0, + "grad_norm": 2.0541054396884677, + "language_loss": 0.74650657, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.82319009, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09692383, + "step": 14640, + "time_per_iteration": 3.9488940238952637 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06270456, + "balance_loss_mlp": 0.01253052, + "epoch": 0.8802645423117391, + "flos": 24140717510400.0, + "grad_norm": 1.9319844379203082, + "language_loss": 0.70021105, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.77684665, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09564209, + "step": 14641, + "time_per_iteration": 2.5713586807250977 + }, + { + "auxiliary_loss_clip": 0.06405936, + "auxiliary_loss_mlp": 0.01262892, + "balance_loss_clip": 0.06272167, + "balance_loss_mlp": 0.01252789, + "epoch": 0.880324665564407, + "flos": 17936994844800.0, + "grad_norm": 1.944450035656478, + "language_loss": 0.85435617, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.9310444, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10107422, + "step": 14642, + "time_per_iteration": 3.906127691268921 + }, + { + "auxiliary_loss_clip": 0.06403466, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01253691, + "epoch": 0.880384788817075, + "flos": 21293938569600.0, + "grad_norm": 1.7769951500601024, + "language_loss": 0.78894514, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.86561227, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09558105, + "step": 14643, + "time_per_iteration": 2.519885778427124 + }, + { + "auxiliary_loss_clip": 0.06397131, + "auxiliary_loss_mlp": 0.01262242, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01253867, + "epoch": 0.880444912069743, + "flos": 12463390730880.0, + "grad_norm": 1.5041267161215206, + "language_loss": 0.73285198, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.80944562, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.0836792, + "step": 14644, + "time_per_iteration": 2.470648765563965 + }, + { + "auxiliary_loss_clip": 0.06408016, + "auxiliary_loss_mlp": 0.01267274, + "balance_loss_clip": 0.06272088, + "balance_loss_mlp": 0.01257129, + "epoch": 0.880505035322411, + "flos": 13631026976640.0, + "grad_norm": 2.3799093865223213, + "language_loss": 0.7972905, + "learning_rate": 1.47856380505911e-07, + "loss": 0.87404341, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10144043, + "step": 14645, + "time_per_iteration": 2.518871545791626 + }, + { + "auxiliary_loss_clip": 0.06397209, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01255334, + "epoch": 0.8805651585750789, + "flos": 23189387379840.0, + "grad_norm": 1.4852789962824886, + "language_loss": 0.64198017, + "learning_rate": 1.477094533001364e-07, + "loss": 0.7185964, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09082031, + "step": 14646, + "time_per_iteration": 2.5021417140960693 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01263315, + "balance_loss_clip": 0.06275496, + "balance_loss_mlp": 0.01253045, + "epoch": 0.8806252818277469, + "flos": 14908304689920.0, + "grad_norm": 2.619123359403294, + "language_loss": 0.77789688, + "learning_rate": 1.475625963334055e-07, + "loss": 0.85465503, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10272217, + "step": 14647, + "time_per_iteration": 2.539391040802002 + }, + { + "auxiliary_loss_clip": 0.06398942, + "auxiliary_loss_mlp": 0.01263452, + "balance_loss_clip": 0.06270331, + "balance_loss_mlp": 0.01255, + "epoch": 0.8806854050804148, + "flos": 17644897111680.0, + "grad_norm": 2.192652669524439, + "language_loss": 0.75220722, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.8288312, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08453369, + "step": 14648, + "time_per_iteration": 2.4884188175201416 + }, + { + "auxiliary_loss_clip": 0.06403202, + "auxiliary_loss_mlp": 0.01265143, + "balance_loss_clip": 0.06270049, + "balance_loss_mlp": 0.01255952, + "epoch": 0.8807455283330828, + "flos": 25338514026240.0, + "grad_norm": 2.5305554735964573, + "language_loss": 0.65665662, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.73334002, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09191895, + "step": 14649, + "time_per_iteration": 2.5534260272979736 + }, + { + "auxiliary_loss_clip": 0.06403228, + "auxiliary_loss_mlp": 0.01263972, + "balance_loss_clip": 0.06272388, + "balance_loss_mlp": 0.01254495, + "epoch": 0.8808056515857507, + "flos": 25272239846400.0, + "grad_norm": 1.2725171028063786, + "language_loss": 0.62303275, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.69970477, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0947876, + "step": 14650, + "time_per_iteration": 2.5216543674468994 + }, + { + "auxiliary_loss_clip": 0.06398011, + "auxiliary_loss_mlp": 0.01261953, + "balance_loss_clip": 0.06269711, + "balance_loss_mlp": 0.01253018, + "epoch": 0.8808657748384188, + "flos": 26586176520960.0, + "grad_norm": 1.5795337054633014, + "language_loss": 0.72957003, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.80616963, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08929443, + "step": 14651, + "time_per_iteration": 2.5674073696136475 + }, + { + "auxiliary_loss_clip": 0.06404445, + "auxiliary_loss_mlp": 0.01262501, + "balance_loss_clip": 0.0627149, + "balance_loss_mlp": 0.01252935, + "epoch": 0.8809258980910867, + "flos": 18667197250560.0, + "grad_norm": 1.6881514833270383, + "language_loss": 0.72177875, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.7984482, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09576416, + "step": 14652, + "time_per_iteration": 2.4565625190734863 + }, + { + "auxiliary_loss_clip": 0.06400369, + "auxiliary_loss_mlp": 0.01262522, + "balance_loss_clip": 0.062704, + "balance_loss_mlp": 0.01253509, + "epoch": 0.8809860213437547, + "flos": 19798426097280.0, + "grad_norm": 6.259659475652455, + "language_loss": 0.74713862, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.82376754, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09008789, + "step": 14653, + "time_per_iteration": 2.5095698833465576 + }, + { + "auxiliary_loss_clip": 0.06404018, + "auxiliary_loss_mlp": 0.01267393, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01257588, + "epoch": 0.8810461445964227, + "flos": 17900210102400.0, + "grad_norm": 1.7754653756175585, + "language_loss": 0.71624255, + "learning_rate": 1.465365647269421e-07, + "loss": 0.79295671, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0980835, + "step": 14654, + "time_per_iteration": 2.458045244216919 + }, + { + "auxiliary_loss_clip": 0.06403499, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01257529, + "epoch": 0.8811062678490906, + "flos": 29170766436480.0, + "grad_norm": 1.4291557550809124, + "language_loss": 0.71611077, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.79281753, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09655762, + "step": 14655, + "time_per_iteration": 2.5877456665039062 + }, + { + "auxiliary_loss_clip": 0.06398024, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06269453, + "balance_loss_mlp": 0.01256025, + "epoch": 0.8811663911017587, + "flos": 20344956353280.0, + "grad_norm": 1.56260789406541, + "language_loss": 0.81561428, + "learning_rate": 1.462440453077449e-07, + "loss": 0.89224374, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08911133, + "step": 14656, + "time_per_iteration": 2.4939017295837402 + }, + { + "auxiliary_loss_clip": 0.06403321, + "auxiliary_loss_mlp": 0.01265996, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01257258, + "epoch": 0.8812265143544266, + "flos": 25892926565760.0, + "grad_norm": 1.6558958362539187, + "language_loss": 0.68877184, + "learning_rate": 1.460978910372914e-07, + "loss": 0.76546496, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08740234, + "step": 14657, + "time_per_iteration": 2.5605247020721436 + }, + { + "auxiliary_loss_clip": 0.0640131, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06269861, + "balance_loss_mlp": 0.01255804, + "epoch": 0.8812866376070946, + "flos": 27202335120000.0, + "grad_norm": 1.9275241644467438, + "language_loss": 0.83792698, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.91458726, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08911133, + "step": 14658, + "time_per_iteration": 2.539914846420288 + }, + { + "auxiliary_loss_clip": 0.06408009, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.0125729, + "epoch": 0.8813467608597625, + "flos": 23814266803200.0, + "grad_norm": 1.768545286165811, + "language_loss": 0.77509159, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.85184681, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10235596, + "step": 14659, + "time_per_iteration": 2.574265480041504 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01261562, + "balance_loss_clip": 0.06269409, + "balance_loss_mlp": 0.01252377, + "epoch": 0.8814068841124305, + "flos": 21111775793280.0, + "grad_norm": 1.7845469935654699, + "language_loss": 0.60817045, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.68477958, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09185791, + "step": 14660, + "time_per_iteration": 2.5120184421539307 + }, + { + "auxiliary_loss_clip": 0.06399903, + "auxiliary_loss_mlp": 0.01262177, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.01252509, + "epoch": 0.8814670073650984, + "flos": 24723822873600.0, + "grad_norm": 1.6340648502892121, + "language_loss": 0.78212428, + "learning_rate": 1.455139770123972e-07, + "loss": 0.8587451, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09674072, + "step": 14661, + "time_per_iteration": 2.5731544494628906 + }, + { + "auxiliary_loss_clip": 0.06405543, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8815271306177664, + "flos": 22972913556480.0, + "grad_norm": 1.7150336378950353, + "language_loss": 0.76684302, + "learning_rate": 1.45368174298081e-07, + "loss": 0.84355104, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09967041, + "step": 14662, + "time_per_iteration": 2.518737554550171 + }, + { + "auxiliary_loss_clip": 0.06397216, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01257356, + "epoch": 0.8815872538704344, + "flos": 19465518625920.0, + "grad_norm": 1.8360238755805145, + "language_loss": 0.73649955, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.81313121, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08599854, + "step": 14663, + "time_per_iteration": 2.4928483963012695 + }, + { + "auxiliary_loss_clip": 0.06398933, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.0125987, + "epoch": 0.8816473771231024, + "flos": 32164097368320.0, + "grad_norm": 1.4224599659696884, + "language_loss": 0.70133549, + "learning_rate": 1.450767798584489e-07, + "loss": 0.77801311, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08953857, + "step": 14664, + "time_per_iteration": 4.078710079193115 + }, + { + "auxiliary_loss_clip": 0.06400171, + "auxiliary_loss_mlp": 0.01263779, + "balance_loss_clip": 0.06271797, + "balance_loss_mlp": 0.01254916, + "epoch": 0.8817075003757703, + "flos": 19688323432320.0, + "grad_norm": 1.386701890018287, + "language_loss": 0.81031573, + "learning_rate": 1.449311881441828e-07, + "loss": 0.88695526, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08868408, + "step": 14665, + "time_per_iteration": 2.5095698833465576 + }, + { + "auxiliary_loss_clip": 0.06401434, + "auxiliary_loss_mlp": 0.01260949, + "balance_loss_clip": 0.06272306, + "balance_loss_mlp": 0.01251817, + "epoch": 0.8817676236284383, + "flos": 15673950172800.0, + "grad_norm": 2.3358439244424862, + "language_loss": 0.58787858, + "learning_rate": 1.447856667743117e-07, + "loss": 0.66450244, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09136963, + "step": 14666, + "time_per_iteration": 2.540194034576416 + }, + { + "auxiliary_loss_clip": 0.06400174, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01255102, + "epoch": 0.8818277468811063, + "flos": 17901048643200.0, + "grad_norm": 1.6530225652639872, + "language_loss": 0.83922029, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.91587806, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10498047, + "step": 14667, + "time_per_iteration": 2.495633125305176 + }, + { + "auxiliary_loss_clip": 0.06404059, + "auxiliary_loss_mlp": 0.01265655, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.01255999, + "epoch": 0.8818878701337742, + "flos": 18776461374720.0, + "grad_norm": 1.7309788421424104, + "language_loss": 0.62558234, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.70227951, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09649658, + "step": 14668, + "time_per_iteration": 2.4942386150360107 + }, + { + "auxiliary_loss_clip": 0.06397483, + "auxiliary_loss_mlp": 0.01261702, + "balance_loss_clip": 0.06268862, + "balance_loss_mlp": 0.01252898, + "epoch": 0.8819479933864423, + "flos": 17718047326080.0, + "grad_norm": 2.2322444364782577, + "language_loss": 0.5726642, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.64925605, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.0880127, + "step": 14669, + "time_per_iteration": 2.5518670082092285 + }, + { + "auxiliary_loss_clip": 0.0640443, + "auxiliary_loss_mlp": 0.01262805, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01253489, + "epoch": 0.8820081166391102, + "flos": 11733523741440.0, + "grad_norm": 1.7260866904493628, + "language_loss": 0.71694434, + "learning_rate": 1.442042848491043e-07, + "loss": 0.79361665, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09320068, + "step": 14670, + "time_per_iteration": 2.469038486480713 + }, + { + "auxiliary_loss_clip": 0.06399909, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06269167, + "balance_loss_mlp": 0.01258067, + "epoch": 0.8820682398917782, + "flos": 27497745089280.0, + "grad_norm": 2.206437045380329, + "language_loss": 0.7456339, + "learning_rate": 1.44059115283929e-07, + "loss": 0.82231283, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09924316, + "step": 14671, + "time_per_iteration": 2.5506999492645264 + }, + { + "auxiliary_loss_clip": 0.06403503, + "auxiliary_loss_mlp": 0.01269024, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.0125882, + "epoch": 0.8821283631444461, + "flos": 16879587045120.0, + "grad_norm": 2.5171122435451245, + "language_loss": 0.85031545, + "learning_rate": 1.43914016096218e-07, + "loss": 0.9270407, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10205078, + "step": 14672, + "time_per_iteration": 2.5202066898345947 + }, + { + "auxiliary_loss_clip": 0.06396373, + "auxiliary_loss_mlp": 0.01268498, + "balance_loss_clip": 0.0626964, + "balance_loss_mlp": 0.01259724, + "epoch": 0.8821884863971141, + "flos": 24288024188160.0, + "grad_norm": 1.6225814735684048, + "language_loss": 0.72806644, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.8047151, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08770752, + "step": 14673, + "time_per_iteration": 2.6579220294952393 + }, + { + "auxiliary_loss_clip": 0.06309947, + "auxiliary_loss_mlp": 0.01255376, + "balance_loss_clip": 0.06255542, + "balance_loss_mlp": 0.01254378, + "epoch": 0.882248609649782, + "flos": 59453990876160.0, + "grad_norm": 0.7872167317420794, + "language_loss": 0.49268723, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.56834042, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00997162, + "step": 14674, + "time_per_iteration": 3.236130475997925 + }, + { + "auxiliary_loss_clip": 0.06401759, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.01255608, + "epoch": 0.88230873290245, + "flos": 19943887985280.0, + "grad_norm": 1.8856716394845916, + "language_loss": 0.76288593, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.83956003, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10046387, + "step": 14675, + "time_per_iteration": 2.5227322578430176 + }, + { + "auxiliary_loss_clip": 0.06397566, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255306, + "epoch": 0.882368856155118, + "flos": 16368374085120.0, + "grad_norm": 1.6123928744840947, + "language_loss": 0.79259509, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.86921227, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08843994, + "step": 14676, + "time_per_iteration": 3.8567166328430176 + }, + { + "auxiliary_loss_clip": 0.06307142, + "auxiliary_loss_mlp": 0.01253674, + "balance_loss_clip": 0.06252797, + "balance_loss_mlp": 0.01252705, + "epoch": 0.882428979407786, + "flos": 70617672927360.0, + "grad_norm": 0.6822788139152429, + "language_loss": 0.54586005, + "learning_rate": 1.431895760121109e-07, + "loss": 0.62146819, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00967407, + "step": 14677, + "time_per_iteration": 3.2512588500976562 + }, + { + "auxiliary_loss_clip": 0.06399799, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06268829, + "balance_loss_mlp": 0.01253545, + "epoch": 0.8824891026604539, + "flos": 18156151998720.0, + "grad_norm": 2.014632299610882, + "language_loss": 0.65062732, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.72725636, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09558105, + "step": 14678, + "time_per_iteration": 2.472111225128174 + }, + { + "auxiliary_loss_clip": 0.06405297, + "auxiliary_loss_mlp": 0.01268562, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.01259496, + "epoch": 0.8825492259131219, + "flos": 27239664913920.0, + "grad_norm": 1.7861369915928562, + "language_loss": 0.71231997, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.78905857, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09063721, + "step": 14679, + "time_per_iteration": 2.6039962768554688 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01265754, + "balance_loss_clip": 0.06271735, + "balance_loss_mlp": 0.01257768, + "epoch": 0.8826093491657898, + "flos": 22281172974720.0, + "grad_norm": 1.5959410569258197, + "language_loss": 0.63950992, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.71617675, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.07983398, + "step": 14680, + "time_per_iteration": 3.9252450466156006 + }, + { + "auxiliary_loss_clip": 0.06397928, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06270419, + "balance_loss_mlp": 0.0125605, + "epoch": 0.8826694724184578, + "flos": 14209101095040.0, + "grad_norm": 2.809563443192349, + "language_loss": 0.77776754, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.85439312, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08575439, + "step": 14681, + "time_per_iteration": 2.4502193927764893 + }, + { + "auxiliary_loss_clip": 0.06403942, + "auxiliary_loss_mlp": 0.01262466, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01252685, + "epoch": 0.8827295956711259, + "flos": 20638018408320.0, + "grad_norm": 1.5469151752981896, + "language_loss": 0.72931725, + "learning_rate": 1.424668961888047e-07, + "loss": 0.80598128, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09783936, + "step": 14682, + "time_per_iteration": 3.962366819381714 + }, + { + "auxiliary_loss_clip": 0.06409137, + "auxiliary_loss_mlp": 0.01270395, + "balance_loss_clip": 0.06273471, + "balance_loss_mlp": 0.01259595, + "epoch": 0.8827897189237938, + "flos": 18518632761600.0, + "grad_norm": 1.6628923088438647, + "language_loss": 0.75193185, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.82872719, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10791016, + "step": 14683, + "time_per_iteration": 2.5152933597564697 + }, + { + "auxiliary_loss_clip": 0.06403377, + "auxiliary_loss_mlp": 0.01262559, + "balance_loss_clip": 0.06272641, + "balance_loss_mlp": 0.01252677, + "epoch": 0.8828498421764618, + "flos": 22754007964800.0, + "grad_norm": 2.015952811438403, + "language_loss": 0.66169786, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.73835725, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09875488, + "step": 14684, + "time_per_iteration": 2.557262420654297 + }, + { + "auxiliary_loss_clip": 0.06398778, + "auxiliary_loss_mlp": 0.01263689, + "balance_loss_clip": 0.06268162, + "balance_loss_mlp": 0.01254563, + "epoch": 0.8829099654291297, + "flos": 15017694595200.0, + "grad_norm": 1.8477413865365486, + "language_loss": 0.69428438, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.77090901, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 14685, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.0640468, + "auxiliary_loss_mlp": 0.01262589, + "balance_loss_clip": 0.06270929, + "balance_loss_mlp": 0.01252623, + "epoch": 0.8829700886817977, + "flos": 16725026989440.0, + "grad_norm": 1.8057502590812853, + "language_loss": 0.7455259, + "learning_rate": 1.418900201783806e-07, + "loss": 0.82219857, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09954834, + "step": 14686, + "time_per_iteration": 2.4790773391723633 + }, + { + "auxiliary_loss_clip": 0.06394429, + "auxiliary_loss_mlp": 0.01265012, + "balance_loss_clip": 0.06266899, + "balance_loss_mlp": 0.01255941, + "epoch": 0.8830302119344656, + "flos": 15267850560000.0, + "grad_norm": 1.7158951019726476, + "language_loss": 0.63215464, + "learning_rate": 1.417459773114007e-07, + "loss": 0.70874906, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09069824, + "step": 14687, + "time_per_iteration": 2.5241615772247314 + }, + { + "auxiliary_loss_clip": 0.06404291, + "auxiliary_loss_mlp": 0.01262922, + "balance_loss_clip": 0.06270834, + "balance_loss_mlp": 0.01252903, + "epoch": 0.8830903351871336, + "flos": 28624697377920.0, + "grad_norm": 2.595517619251839, + "language_loss": 0.69500947, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.77168155, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10028076, + "step": 14688, + "time_per_iteration": 2.540933609008789 + }, + { + "auxiliary_loss_clip": 0.06396133, + "auxiliary_loss_mlp": 0.01267955, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01259652, + "epoch": 0.8831504584398016, + "flos": 28009167684480.0, + "grad_norm": 1.5638574685604314, + "language_loss": 0.66877151, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.74541235, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.08294678, + "step": 14689, + "time_per_iteration": 2.5956904888153076 + }, + { + "auxiliary_loss_clip": 0.06402047, + "auxiliary_loss_mlp": 0.01265309, + "balance_loss_clip": 0.06273194, + "balance_loss_mlp": 0.01256839, + "epoch": 0.8832105816924696, + "flos": 26587098915840.0, + "grad_norm": 1.2720232823843813, + "language_loss": 0.74491525, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.82158875, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08465576, + "step": 14690, + "time_per_iteration": 2.550379753112793 + }, + { + "auxiliary_loss_clip": 0.0640257, + "auxiliary_loss_mlp": 0.01265347, + "balance_loss_clip": 0.06271975, + "balance_loss_mlp": 0.01255065, + "epoch": 0.8832707049451375, + "flos": 24905524452480.0, + "grad_norm": 1.3286070309663014, + "language_loss": 0.7308588, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.80753797, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10272217, + "step": 14691, + "time_per_iteration": 2.618356466293335 + }, + { + "auxiliary_loss_clip": 0.06406618, + "auxiliary_loss_mlp": 0.01263553, + "balance_loss_clip": 0.06271677, + "balance_loss_mlp": 0.01254058, + "epoch": 0.8833308281978055, + "flos": 15456679735680.0, + "grad_norm": 1.9431819438637523, + "language_loss": 0.52190626, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.5986079, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09490967, + "step": 14692, + "time_per_iteration": 2.4854116439819336 + }, + { + "auxiliary_loss_clip": 0.06404817, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.0125397, + "epoch": 0.8833909514504734, + "flos": 20307500778240.0, + "grad_norm": 2.1854307452735884, + "language_loss": 0.61036348, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.6870504, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09906006, + "step": 14693, + "time_per_iteration": 2.5374739170074463 + }, + { + "auxiliary_loss_clip": 0.06395325, + "auxiliary_loss_mlp": 0.01263199, + "balance_loss_clip": 0.06270225, + "balance_loss_mlp": 0.01254532, + "epoch": 0.8834510747031414, + "flos": 20379938232960.0, + "grad_norm": 1.4784746764410908, + "language_loss": 0.75460541, + "learning_rate": 1.407396505730898e-07, + "loss": 0.83119071, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08666992, + "step": 14694, + "time_per_iteration": 2.543729066848755 + }, + { + "auxiliary_loss_clip": 0.06403571, + "auxiliary_loss_mlp": 0.01265299, + "balance_loss_clip": 0.06269252, + "balance_loss_mlp": 0.01256531, + "epoch": 0.8835111979558095, + "flos": 29759699658240.0, + "grad_norm": 1.9605899347359843, + "language_loss": 0.72491586, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.80160457, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.08770752, + "step": 14695, + "time_per_iteration": 2.5731723308563232 + }, + { + "auxiliary_loss_clip": 0.06393148, + "auxiliary_loss_mlp": 0.0126203, + "balance_loss_clip": 0.06268685, + "balance_loss_mlp": 0.01253942, + "epoch": 0.8835713212084774, + "flos": 24141514124160.0, + "grad_norm": 1.602709205439156, + "language_loss": 0.8027606, + "learning_rate": 1.404527630961998e-07, + "loss": 0.8793124, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08093262, + "step": 14696, + "time_per_iteration": 2.534120798110962 + }, + { + "auxiliary_loss_clip": 0.06403233, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271463, + "balance_loss_mlp": 0.01256665, + "epoch": 0.8836314444611454, + "flos": 27679656303360.0, + "grad_norm": 2.0173732379548905, + "language_loss": 0.74990559, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.82659405, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08953857, + "step": 14697, + "time_per_iteration": 2.592552900314331 + }, + { + "auxiliary_loss_clip": 0.06399925, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01257844, + "epoch": 0.8836915677138133, + "flos": 16842634594560.0, + "grad_norm": 1.9895118296401026, + "language_loss": 0.72394419, + "learning_rate": 1.401661576761779e-07, + "loss": 0.80061269, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09075928, + "step": 14698, + "time_per_iteration": 2.4627113342285156 + }, + { + "auxiliary_loss_clip": 0.06305031, + "auxiliary_loss_mlp": 0.0125323, + "balance_loss_clip": 0.06250586, + "balance_loss_mlp": 0.01252178, + "epoch": 0.8837516909664813, + "flos": 69332261368320.0, + "grad_norm": 0.7740855543002164, + "language_loss": 0.5369336, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.61251622, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01052856, + "step": 14699, + "time_per_iteration": 3.17140793800354 + }, + { + "auxiliary_loss_clip": 0.06403652, + "auxiliary_loss_mlp": 0.01264634, + "balance_loss_clip": 0.06268007, + "balance_loss_mlp": 0.01254931, + "epoch": 0.8838118142191492, + "flos": 21331142582400.0, + "grad_norm": 1.5418918526110506, + "language_loss": 0.76658535, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.84326828, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09698486, + "step": 14700, + "time_per_iteration": 2.5061445236206055 + }, + { + "auxiliary_loss_clip": 0.06398652, + "auxiliary_loss_mlp": 0.01261483, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01252536, + "epoch": 0.8838719374718172, + "flos": 21476981813760.0, + "grad_norm": 1.816100763964491, + "language_loss": 0.73857808, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.81517947, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08947754, + "step": 14701, + "time_per_iteration": 2.5440568923950195 + }, + { + "auxiliary_loss_clip": 0.06405409, + "auxiliary_loss_mlp": 0.0126287, + "balance_loss_clip": 0.0626961, + "balance_loss_mlp": 0.01253297, + "epoch": 0.8839320607244852, + "flos": 26476157710080.0, + "grad_norm": 1.7347205509220878, + "language_loss": 0.71765238, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.79433513, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09570312, + "step": 14702, + "time_per_iteration": 2.5365030765533447 + }, + { + "auxiliary_loss_clip": 0.0640773, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.012583, + "epoch": 0.8839921839771532, + "flos": 45232577959680.0, + "grad_norm": 1.4693799837877743, + "language_loss": 0.72042251, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.79717582, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09307861, + "step": 14703, + "time_per_iteration": 2.7185418605804443 + }, + { + "auxiliary_loss_clip": 0.06394663, + "auxiliary_loss_mlp": 0.01263802, + "balance_loss_clip": 0.0626796, + "balance_loss_mlp": 0.01254761, + "epoch": 0.8840523072298211, + "flos": 20012342371200.0, + "grad_norm": 1.7405173343909983, + "language_loss": 0.6674304, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.74401504, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.0904541, + "step": 14704, + "time_per_iteration": 3.921534776687622 + }, + { + "auxiliary_loss_clip": 0.06395476, + "auxiliary_loss_mlp": 0.01264102, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01255644, + "epoch": 0.8841124304824891, + "flos": 24432941024640.0, + "grad_norm": 1.519427157327818, + "language_loss": 0.70908153, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.78567731, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08459473, + "step": 14705, + "time_per_iteration": 2.6113686561584473 + }, + { + "auxiliary_loss_clip": 0.06397911, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06269821, + "balance_loss_mlp": 0.0125566, + "epoch": 0.884172553735157, + "flos": 31292583851520.0, + "grad_norm": 1.3762163602676374, + "language_loss": 0.70915127, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.78577089, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08392334, + "step": 14706, + "time_per_iteration": 2.665069580078125 + }, + { + "auxiliary_loss_clip": 0.06399087, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.06269109, + "balance_loss_mlp": 0.01256843, + "epoch": 0.884232676987825, + "flos": 21396494367360.0, + "grad_norm": 1.5565027115335555, + "language_loss": 0.74541593, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.822065, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08978271, + "step": 14707, + "time_per_iteration": 2.669102430343628 + }, + { + "auxiliary_loss_clip": 0.06309316, + "auxiliary_loss_mlp": 0.01249357, + "balance_loss_clip": 0.06254923, + "balance_loss_mlp": 0.01248359, + "epoch": 0.8842928002404931, + "flos": 57928668278400.0, + "grad_norm": 0.8267722296709221, + "language_loss": 0.60377383, + "learning_rate": 1.387373629491173e-07, + "loss": 0.67936051, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00997162, + "step": 14708, + "time_per_iteration": 2.9923834800720215 + }, + { + "auxiliary_loss_clip": 0.06393933, + "auxiliary_loss_mlp": 0.01265055, + "balance_loss_clip": 0.06269866, + "balance_loss_mlp": 0.01257062, + "epoch": 0.884352923493161, + "flos": 41473517690880.0, + "grad_norm": 1.6630393907624046, + "language_loss": 0.67774945, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.75433934, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.07989502, + "step": 14709, + "time_per_iteration": 2.713012933731079 + }, + { + "auxiliary_loss_clip": 0.06405933, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06270263, + "balance_loss_mlp": 0.0125667, + "epoch": 0.884413046745829, + "flos": 46552677909120.0, + "grad_norm": 1.5766892978129978, + "language_loss": 0.62479722, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.70152819, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10479736, + "step": 14710, + "time_per_iteration": 2.767439603805542 + }, + { + "auxiliary_loss_clip": 0.06396196, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06270634, + "balance_loss_mlp": 0.01255924, + "epoch": 0.8844731699984969, + "flos": 19141331978880.0, + "grad_norm": 3.098385376741182, + "language_loss": 0.63903069, + "learning_rate": 1.38310100580431e-07, + "loss": 0.7156347, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08282471, + "step": 14711, + "time_per_iteration": 2.5306129455566406 + }, + { + "auxiliary_loss_clip": 0.06406876, + "auxiliary_loss_mlp": 0.01265093, + "balance_loss_clip": 0.06271248, + "balance_loss_mlp": 0.01255872, + "epoch": 0.8845332932511649, + "flos": 23267736547200.0, + "grad_norm": 1.7593747867980092, + "language_loss": 0.76430249, + "learning_rate": 1.38167820974606e-07, + "loss": 0.84102213, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09222412, + "step": 14712, + "time_per_iteration": 2.5903677940368652 + }, + { + "auxiliary_loss_clip": 0.06404536, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06273165, + "balance_loss_mlp": 0.01258246, + "epoch": 0.8845934165038328, + "flos": 17570027888640.0, + "grad_norm": 2.1477538781818777, + "language_loss": 0.81665063, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.89336956, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09112549, + "step": 14713, + "time_per_iteration": 2.4900383949279785 + }, + { + "auxiliary_loss_clip": 0.06397398, + "auxiliary_loss_mlp": 0.01261797, + "balance_loss_clip": 0.06267774, + "balance_loss_mlp": 0.01252535, + "epoch": 0.8846535397565009, + "flos": 27492336501120.0, + "grad_norm": 1.69166035128251, + "language_loss": 0.55999333, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.63658524, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0927124, + "step": 14714, + "time_per_iteration": 2.534978151321411 + }, + { + "auxiliary_loss_clip": 0.06400881, + "auxiliary_loss_mlp": 0.01262206, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01253587, + "epoch": 0.8847136630091688, + "flos": 28768020986880.0, + "grad_norm": 1.6242716538465463, + "language_loss": 0.73918736, + "learning_rate": 1.377414057838755e-07, + "loss": 0.81581825, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08612061, + "step": 14715, + "time_per_iteration": 3.9610276222229004 + }, + { + "auxiliary_loss_clip": 0.06403157, + "auxiliary_loss_mlp": 0.0126659, + "balance_loss_clip": 0.06271261, + "balance_loss_mlp": 0.01257387, + "epoch": 0.8847737862618368, + "flos": 23483623392000.0, + "grad_norm": 1.4848157988551902, + "language_loss": 0.75333452, + "learning_rate": 1.375994086138461e-07, + "loss": 0.83003205, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09204102, + "step": 14716, + "time_per_iteration": 2.5149214267730713 + }, + { + "auxiliary_loss_clip": 0.06399931, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06271353, + "balance_loss_mlp": 0.01252676, + "epoch": 0.8848339095145047, + "flos": 18666777980160.0, + "grad_norm": 1.9564063786190344, + "language_loss": 0.7096256, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.78624487, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09320068, + "step": 14717, + "time_per_iteration": 2.5454225540161133 + }, + { + "auxiliary_loss_clip": 0.06393513, + "auxiliary_loss_mlp": 0.01261753, + "balance_loss_clip": 0.06269147, + "balance_loss_mlp": 0.01253003, + "epoch": 0.8848940327671727, + "flos": 32278518518400.0, + "grad_norm": 3.4354664808670607, + "language_loss": 0.74253142, + "learning_rate": 1.373156261464208e-07, + "loss": 0.81908405, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.08740234, + "step": 14718, + "time_per_iteration": 2.567211627960205 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01261671, + "balance_loss_clip": 0.06267846, + "balance_loss_mlp": 0.01252033, + "epoch": 0.8849541560198406, + "flos": 24028225004160.0, + "grad_norm": 1.4551817490086836, + "language_loss": 0.78617239, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.86279714, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09643555, + "step": 14719, + "time_per_iteration": 4.014564514160156 + }, + { + "auxiliary_loss_clip": 0.06404986, + "auxiliary_loss_mlp": 0.01265664, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01257254, + "epoch": 0.8850142792725086, + "flos": 16878664650240.0, + "grad_norm": 2.2822989614167515, + "language_loss": 0.72013068, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.79683721, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08410645, + "step": 14720, + "time_per_iteration": 2.5508828163146973 + }, + { + "auxiliary_loss_clip": 0.06405028, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.0125441, + "epoch": 0.8850744025251767, + "flos": 24030824480640.0, + "grad_norm": 1.7235256005815422, + "language_loss": 0.8247689, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.90145624, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09295654, + "step": 14721, + "time_per_iteration": 3.969202995300293 + }, + { + "auxiliary_loss_clip": 0.06402031, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06270001, + "balance_loss_mlp": 0.01255521, + "epoch": 0.8851345257778446, + "flos": 47965816218240.0, + "grad_norm": 1.964786564262649, + "language_loss": 0.62954146, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.70621228, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09545898, + "step": 14722, + "time_per_iteration": 2.7305383682250977 + }, + { + "auxiliary_loss_clip": 0.06398532, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06266725, + "balance_loss_mlp": 0.01255242, + "epoch": 0.8851946490305126, + "flos": 36619761755520.0, + "grad_norm": 1.7414583880111092, + "language_loss": 0.68572694, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.76236361, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09881592, + "step": 14723, + "time_per_iteration": 2.6492748260498047 + }, + { + "auxiliary_loss_clip": 0.06399927, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01254834, + "epoch": 0.8852547722831805, + "flos": 21550802860800.0, + "grad_norm": 1.6351451905657401, + "language_loss": 0.77568376, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.85232049, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08917236, + "step": 14724, + "time_per_iteration": 2.5171244144439697 + }, + { + "auxiliary_loss_clip": 0.06308331, + "auxiliary_loss_mlp": 0.01249732, + "balance_loss_clip": 0.06254104, + "balance_loss_mlp": 0.01248703, + "epoch": 0.8853148955358485, + "flos": 63077876110080.0, + "grad_norm": 0.783597517732296, + "language_loss": 0.58947587, + "learning_rate": 1.363246127376143e-07, + "loss": 0.66505647, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01029205, + "step": 14725, + "time_per_iteration": 3.0300509929656982 + }, + { + "auxiliary_loss_clip": 0.06410657, + "auxiliary_loss_mlp": 0.01267993, + "balance_loss_clip": 0.06271988, + "balance_loss_mlp": 0.01257962, + "epoch": 0.8853750187885164, + "flos": 18155606947200.0, + "grad_norm": 1.866018411089085, + "language_loss": 0.68803233, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.76481885, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1003418, + "step": 14726, + "time_per_iteration": 2.4636669158935547 + }, + { + "auxiliary_loss_clip": 0.06399886, + "auxiliary_loss_mlp": 0.01265553, + "balance_loss_clip": 0.06270148, + "balance_loss_mlp": 0.01257036, + "epoch": 0.8854351420411845, + "flos": 39580500648960.0, + "grad_norm": 1.2347060660537659, + "language_loss": 0.6949172, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.77157164, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08508301, + "step": 14727, + "time_per_iteration": 2.736482858657837 + }, + { + "auxiliary_loss_clip": 0.06401646, + "auxiliary_loss_mlp": 0.01263244, + "balance_loss_clip": 0.06271316, + "balance_loss_mlp": 0.01254184, + "epoch": 0.8854952652938524, + "flos": 23776140395520.0, + "grad_norm": 1.6133668439229503, + "language_loss": 0.70217514, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.77882403, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09063721, + "step": 14728, + "time_per_iteration": 2.5058109760284424 + }, + { + "auxiliary_loss_clip": 0.0640386, + "auxiliary_loss_mlp": 0.01261995, + "balance_loss_clip": 0.06270647, + "balance_loss_mlp": 0.01252434, + "epoch": 0.8855553885465204, + "flos": 18295199049600.0, + "grad_norm": 2.1275999023059673, + "language_loss": 0.66818655, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.74484515, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09558105, + "step": 14729, + "time_per_iteration": 2.521054267883301 + }, + { + "auxiliary_loss_clip": 0.06401055, + "auxiliary_loss_mlp": 0.0126072, + "balance_loss_clip": 0.06272933, + "balance_loss_mlp": 0.01252513, + "epoch": 0.8856155117991883, + "flos": 36876374484480.0, + "grad_norm": 1.8562662991246879, + "language_loss": 0.6310026, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.70762038, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08209229, + "step": 14730, + "time_per_iteration": 2.6651859283447266 + }, + { + "auxiliary_loss_clip": 0.06397253, + "auxiliary_loss_mlp": 0.01262249, + "balance_loss_clip": 0.0627026, + "balance_loss_mlp": 0.01253553, + "epoch": 0.8856756350518563, + "flos": 22170441404160.0, + "grad_norm": 1.6656970883539435, + "language_loss": 0.79226112, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.86885613, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.0869751, + "step": 14731, + "time_per_iteration": 2.5325546264648438 + }, + { + "auxiliary_loss_clip": 0.06402338, + "auxiliary_loss_mlp": 0.0126849, + "balance_loss_clip": 0.06271227, + "balance_loss_mlp": 0.01259746, + "epoch": 0.8857357583045242, + "flos": 20747282532480.0, + "grad_norm": 1.5228493349215588, + "language_loss": 0.83495152, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.91165972, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08752441, + "step": 14732, + "time_per_iteration": 2.4797542095184326 + }, + { + "auxiliary_loss_clip": 0.06308968, + "auxiliary_loss_mlp": 0.01249256, + "balance_loss_clip": 0.06254347, + "balance_loss_mlp": 0.01248295, + "epoch": 0.8857958815571922, + "flos": 69913815431040.0, + "grad_norm": 0.8972181039902289, + "language_loss": 0.59697849, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.67256069, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00959015, + "step": 14733, + "time_per_iteration": 3.1617019176483154 + }, + { + "auxiliary_loss_clip": 0.06401418, + "auxiliary_loss_mlp": 0.01263495, + "balance_loss_clip": 0.06271139, + "balance_loss_mlp": 0.01253863, + "epoch": 0.8858560048098603, + "flos": 15127294135680.0, + "grad_norm": 1.8039314213733861, + "language_loss": 0.6699304, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.74657953, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09625244, + "step": 14734, + "time_per_iteration": 2.464336395263672 + }, + { + "auxiliary_loss_clip": 0.06398517, + "auxiliary_loss_mlp": 0.0126497, + "balance_loss_clip": 0.06271675, + "balance_loss_mlp": 0.01255905, + "epoch": 0.8859161280625282, + "flos": 16615469376000.0, + "grad_norm": 1.96139376058703, + "language_loss": 0.75832766, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.83496255, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09057617, + "step": 14735, + "time_per_iteration": 2.499420166015625 + }, + { + "auxiliary_loss_clip": 0.06406797, + "auxiliary_loss_mlp": 0.01263237, + "balance_loss_clip": 0.06273414, + "balance_loss_mlp": 0.0125404, + "epoch": 0.8859762513151962, + "flos": 18699915070080.0, + "grad_norm": 2.4052129022673507, + "language_loss": 0.70763892, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.78433919, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09197998, + "step": 14736, + "time_per_iteration": 2.4729537963867188 + }, + { + "auxiliary_loss_clip": 0.06403352, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06272499, + "balance_loss_mlp": 0.0125423, + "epoch": 0.8860363745678641, + "flos": 19542987325440.0, + "grad_norm": 1.711220105447237, + "language_loss": 0.8489334, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.92560041, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09124756, + "step": 14737, + "time_per_iteration": 2.5370328426361084 + }, + { + "auxiliary_loss_clip": 0.06409991, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01256623, + "epoch": 0.8860964978205321, + "flos": 35963673886080.0, + "grad_norm": 1.905285473109681, + "language_loss": 0.67920482, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.75597692, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10601807, + "step": 14738, + "time_per_iteration": 2.6281023025512695 + }, + { + "auxiliary_loss_clip": 0.06409208, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01253588, + "epoch": 0.8861566210732, + "flos": 21218524295040.0, + "grad_norm": 1.6152938283716511, + "language_loss": 0.75368971, + "learning_rate": 1.343529763547222e-07, + "loss": 0.83041853, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10083008, + "step": 14739, + "time_per_iteration": 2.5536062717437744 + }, + { + "auxiliary_loss_clip": 0.06398404, + "auxiliary_loss_mlp": 0.01263694, + "balance_loss_clip": 0.06269807, + "balance_loss_mlp": 0.01255462, + "epoch": 0.886216744325868, + "flos": 14613984823680.0, + "grad_norm": 1.863446316101088, + "language_loss": 0.87359273, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.95021367, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08227539, + "step": 14740, + "time_per_iteration": 2.4583516120910645 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06271682, + "balance_loss_mlp": 0.01254206, + "epoch": 0.886276867578536, + "flos": 26658949392000.0, + "grad_norm": 1.8967743887192066, + "language_loss": 0.63574475, + "learning_rate": 1.34072445601471e-07, + "loss": 0.71238875, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09301758, + "step": 14741, + "time_per_iteration": 2.5750632286071777 + }, + { + "auxiliary_loss_clip": 0.06400025, + "auxiliary_loss_mlp": 0.01268656, + "balance_loss_clip": 0.06270176, + "balance_loss_mlp": 0.01259149, + "epoch": 0.886336990831204, + "flos": 16769735942400.0, + "grad_norm": 1.8023239022858395, + "language_loss": 0.7326563, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.8093431, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09509277, + "step": 14742, + "time_per_iteration": 2.4618430137634277 + }, + { + "auxiliary_loss_clip": 0.06399601, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01254147, + "epoch": 0.8863971140838719, + "flos": 25272365627520.0, + "grad_norm": 1.947275844906342, + "language_loss": 0.59373927, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.67037159, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09484863, + "step": 14743, + "time_per_iteration": 2.5227532386779785 + }, + { + "auxiliary_loss_clip": 0.0640617, + "auxiliary_loss_mlp": 0.01269532, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01258392, + "epoch": 0.8864572373365399, + "flos": 23411060156160.0, + "grad_norm": 1.6050209562169269, + "language_loss": 0.60046476, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.67722178, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.11138916, + "step": 14744, + "time_per_iteration": 3.923879861831665 + }, + { + "auxiliary_loss_clip": 0.06401066, + "auxiliary_loss_mlp": 0.01263442, + "balance_loss_clip": 0.06270179, + "balance_loss_mlp": 0.01254001, + "epoch": 0.8865173605892078, + "flos": 18554201619840.0, + "grad_norm": 1.5756497333321051, + "language_loss": 0.76668805, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.84333313, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09436035, + "step": 14745, + "time_per_iteration": 2.4856021404266357 + }, + { + "auxiliary_loss_clip": 0.06399768, + "auxiliary_loss_mlp": 0.01264389, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01255251, + "epoch": 0.8865774838418758, + "flos": 19031858219520.0, + "grad_norm": 1.8923480144182985, + "language_loss": 0.77594328, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.85258484, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 14746, + "time_per_iteration": 2.5016493797302246 + }, + { + "auxiliary_loss_clip": 0.06402637, + "auxiliary_loss_mlp": 0.0126552, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.0125618, + "epoch": 0.8866376070945439, + "flos": 22169602863360.0, + "grad_norm": 1.8379446681951996, + "language_loss": 0.77303553, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.84971702, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09338379, + "step": 14747, + "time_per_iteration": 2.4861974716186523 + }, + { + "auxiliary_loss_clip": 0.06395779, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06268896, + "balance_loss_mlp": 0.01254748, + "epoch": 0.8866977303472118, + "flos": 20710539717120.0, + "grad_norm": 1.530148448203103, + "language_loss": 0.82762802, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.90421903, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08551025, + "step": 14748, + "time_per_iteration": 2.502021551132202 + }, + { + "auxiliary_loss_clip": 0.06401731, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01257162, + "epoch": 0.8867578535998798, + "flos": 48804779623680.0, + "grad_norm": 4.373040844685136, + "language_loss": 0.77577972, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.8524648, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09613037, + "step": 14749, + "time_per_iteration": 2.727158546447754 + }, + { + "auxiliary_loss_clip": 0.06406604, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01255617, + "epoch": 0.8868179768525477, + "flos": 21111608085120.0, + "grad_norm": 2.390428852813455, + "language_loss": 0.7003032, + "learning_rate": 1.328135602550451e-07, + "loss": 0.77702081, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09539795, + "step": 14750, + "time_per_iteration": 2.5537924766540527 + }, + { + "auxiliary_loss_clip": 0.06399231, + "auxiliary_loss_mlp": 0.01264164, + "balance_loss_clip": 0.06269191, + "balance_loss_mlp": 0.01255325, + "epoch": 0.8868781001052157, + "flos": 21836653464960.0, + "grad_norm": 1.669612343662207, + "language_loss": 0.59316975, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.66980374, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08837891, + "step": 14751, + "time_per_iteration": 2.4961390495300293 + }, + { + "auxiliary_loss_clip": 0.06401397, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06271613, + "balance_loss_mlp": 0.01252966, + "epoch": 0.8869382233578836, + "flos": 13521469363200.0, + "grad_norm": 2.201193429076569, + "language_loss": 0.81327409, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.88991326, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09545898, + "step": 14752, + "time_per_iteration": 2.527376651763916 + }, + { + "auxiliary_loss_clip": 0.06405862, + "auxiliary_loss_mlp": 0.01265552, + "balance_loss_clip": 0.0626955, + "balance_loss_mlp": 0.01255365, + "epoch": 0.8869983466105517, + "flos": 22710598750080.0, + "grad_norm": 1.7397771398756352, + "language_loss": 0.80421031, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.8809244, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10186768, + "step": 14753, + "time_per_iteration": 2.496861457824707 + }, + { + "auxiliary_loss_clip": 0.0639924, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06270298, + "balance_loss_mlp": 0.01256569, + "epoch": 0.8870584698632196, + "flos": 15346115873280.0, + "grad_norm": 1.9358713626182904, + "language_loss": 0.65389812, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.73054528, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08911133, + "step": 14754, + "time_per_iteration": 2.481266736984253 + }, + { + "auxiliary_loss_clip": 0.06402417, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06271257, + "balance_loss_mlp": 0.0125654, + "epoch": 0.8871185931158876, + "flos": 26623003190400.0, + "grad_norm": 1.899322495177458, + "language_loss": 0.7491895, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.82586813, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08911133, + "step": 14755, + "time_per_iteration": 3.9851739406585693 + }, + { + "auxiliary_loss_clip": 0.06400773, + "auxiliary_loss_mlp": 0.01269007, + "balance_loss_clip": 0.06269758, + "balance_loss_mlp": 0.01259745, + "epoch": 0.8871787163685555, + "flos": 21805528872960.0, + "grad_norm": 1.4050001258190605, + "language_loss": 0.78016531, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.85686314, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 14756, + "time_per_iteration": 2.4884493350982666 + }, + { + "auxiliary_loss_clip": 0.06399755, + "auxiliary_loss_mlp": 0.01263375, + "balance_loss_clip": 0.06268262, + "balance_loss_mlp": 0.01253719, + "epoch": 0.8872388396212235, + "flos": 14908262762880.0, + "grad_norm": 2.804203292047771, + "language_loss": 0.77138597, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.84801722, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09655762, + "step": 14757, + "time_per_iteration": 2.458031177520752 + }, + { + "auxiliary_loss_clip": 0.06396057, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06269957, + "balance_loss_mlp": 0.0125711, + "epoch": 0.8872989628738914, + "flos": 26439331040640.0, + "grad_norm": 1.7403499564680318, + "language_loss": 0.68120039, + "learning_rate": 1.316993656021632e-07, + "loss": 0.75782031, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.0881958, + "step": 14758, + "time_per_iteration": 2.5202882289886475 + }, + { + "auxiliary_loss_clip": 0.0639921, + "auxiliary_loss_mlp": 0.01265437, + "balance_loss_clip": 0.06269047, + "balance_loss_mlp": 0.01256473, + "epoch": 0.8873590861265594, + "flos": 48153597217920.0, + "grad_norm": 1.6386846273703985, + "language_loss": 0.68983102, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.76647747, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08972168, + "step": 14759, + "time_per_iteration": 4.159254550933838 + }, + { + "auxiliary_loss_clip": 0.06398255, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.0626884, + "balance_loss_mlp": 0.01254418, + "epoch": 0.8874192093792275, + "flos": 18338901753600.0, + "grad_norm": 2.3604242969885707, + "language_loss": 0.74442339, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.82104361, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09350586, + "step": 14760, + "time_per_iteration": 2.4698567390441895 + }, + { + "auxiliary_loss_clip": 0.06404065, + "auxiliary_loss_mlp": 0.01273255, + "balance_loss_clip": 0.06270099, + "balance_loss_mlp": 0.01263736, + "epoch": 0.8874793326318954, + "flos": 17899916613120.0, + "grad_norm": 2.2735692439153237, + "language_loss": 0.7632544, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.84002757, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09521484, + "step": 14761, + "time_per_iteration": 3.89682936668396 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01263207, + "balance_loss_clip": 0.06268443, + "balance_loss_mlp": 0.01254052, + "epoch": 0.8875394558845634, + "flos": 31110169512960.0, + "grad_norm": 1.636429643501416, + "language_loss": 0.61458367, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.69122505, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09155273, + "step": 14762, + "time_per_iteration": 2.632906436920166 + }, + { + "auxiliary_loss_clip": 0.06397983, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06268691, + "balance_loss_mlp": 0.01254406, + "epoch": 0.8875995791372313, + "flos": 21148392827520.0, + "grad_norm": 1.751322601119736, + "language_loss": 0.64450324, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.72112966, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.10253906, + "step": 14763, + "time_per_iteration": 2.6457977294921875 + }, + { + "auxiliary_loss_clip": 0.06402642, + "auxiliary_loss_mlp": 0.01261912, + "balance_loss_clip": 0.0627153, + "balance_loss_mlp": 0.01252452, + "epoch": 0.8876597023898993, + "flos": 17460554129280.0, + "grad_norm": 2.4705520367844924, + "language_loss": 0.70655769, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.78320324, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09460449, + "step": 14764, + "time_per_iteration": 2.5632097721099854 + }, + { + "auxiliary_loss_clip": 0.06406358, + "auxiliary_loss_mlp": 0.01262549, + "balance_loss_clip": 0.06270573, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8877198256425672, + "flos": 22714036767360.0, + "grad_norm": 2.109687309094666, + "language_loss": 0.65986574, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.73655486, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09399414, + "step": 14765, + "time_per_iteration": 2.5792641639709473 + }, + { + "auxiliary_loss_clip": 0.06397182, + "auxiliary_loss_mlp": 0.01265927, + "balance_loss_clip": 0.06269948, + "balance_loss_mlp": 0.01257532, + "epoch": 0.8877799488952353, + "flos": 24541995513600.0, + "grad_norm": 1.5120500891311812, + "language_loss": 0.76344001, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.84007108, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08395386, + "step": 14766, + "time_per_iteration": 2.6661949157714844 + }, + { + "auxiliary_loss_clip": 0.06396287, + "auxiliary_loss_mlp": 0.01267323, + "balance_loss_clip": 0.06269039, + "balance_loss_mlp": 0.01258448, + "epoch": 0.8878400721479032, + "flos": 20965433437440.0, + "grad_norm": 1.7405820386467394, + "language_loss": 0.73762059, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.81425673, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08880615, + "step": 14767, + "time_per_iteration": 2.5033586025238037 + }, + { + "auxiliary_loss_clip": 0.06396404, + "auxiliary_loss_mlp": 0.01261133, + "balance_loss_clip": 0.06271426, + "balance_loss_mlp": 0.01252556, + "epoch": 0.8879001954005712, + "flos": 25301268086400.0, + "grad_norm": 3.2803844975125003, + "language_loss": 0.71396875, + "learning_rate": 1.303129987538778e-07, + "loss": 0.79054409, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08569336, + "step": 14768, + "time_per_iteration": 2.6661486625671387 + }, + { + "auxiliary_loss_clip": 0.06398378, + "auxiliary_loss_mlp": 0.01263834, + "balance_loss_clip": 0.06268355, + "balance_loss_mlp": 0.01255001, + "epoch": 0.8879603186532391, + "flos": 23192028783360.0, + "grad_norm": 1.6618639759125788, + "language_loss": 0.70540762, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.78202975, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08831787, + "step": 14769, + "time_per_iteration": 2.512924909591675 + }, + { + "auxiliary_loss_clip": 0.06403679, + "auxiliary_loss_mlp": 0.01261408, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01252354, + "epoch": 0.8880204419059071, + "flos": 13659342456960.0, + "grad_norm": 2.4814123968549127, + "language_loss": 0.67167079, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.74832165, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09051514, + "step": 14770, + "time_per_iteration": 2.490354061126709 + }, + { + "auxiliary_loss_clip": 0.06397928, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.0627326, + "balance_loss_mlp": 0.01257055, + "epoch": 0.888080565158575, + "flos": 20638228043520.0, + "grad_norm": 2.5502983496635467, + "language_loss": 0.65957916, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.73621869, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08972168, + "step": 14771, + "time_per_iteration": 2.528031349182129 + }, + { + "auxiliary_loss_clip": 0.06400346, + "auxiliary_loss_mlp": 0.01261846, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01253269, + "epoch": 0.888140688411243, + "flos": 28627338781440.0, + "grad_norm": 1.5852554919043456, + "language_loss": 0.82730216, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.90392411, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08569336, + "step": 14772, + "time_per_iteration": 2.571366786956787 + }, + { + "auxiliary_loss_clip": 0.06395004, + "auxiliary_loss_mlp": 0.01263606, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01255226, + "epoch": 0.8882008116639111, + "flos": 25527301274880.0, + "grad_norm": 1.5194647492720985, + "language_loss": 0.76408058, + "learning_rate": 1.296224737033258e-07, + "loss": 0.84066665, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08374023, + "step": 14773, + "time_per_iteration": 2.5512452125549316 + }, + { + "auxiliary_loss_clip": 0.06396265, + "auxiliary_loss_mlp": 0.01264026, + "balance_loss_clip": 0.0626926, + "balance_loss_mlp": 0.01255253, + "epoch": 0.888260934916579, + "flos": 27681249530880.0, + "grad_norm": 1.7554405652029053, + "language_loss": 0.75057411, + "learning_rate": 1.294845814469907e-07, + "loss": 0.82717705, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08782959, + "step": 14774, + "time_per_iteration": 2.580103635787964 + }, + { + "auxiliary_loss_clip": 0.0640349, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.06272057, + "balance_loss_mlp": 0.01256089, + "epoch": 0.888321058169247, + "flos": 21616615843200.0, + "grad_norm": 2.5677131374215945, + "language_loss": 0.72789186, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.80458438, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09686279, + "step": 14775, + "time_per_iteration": 2.4722659587860107 + }, + { + "auxiliary_loss_clip": 0.06401627, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01256311, + "epoch": 0.8883811814219149, + "flos": 18154768406400.0, + "grad_norm": 1.7615915737374597, + "language_loss": 0.80541307, + "learning_rate": 1.292090097299432e-07, + "loss": 0.88208115, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08862305, + "step": 14776, + "time_per_iteration": 2.488631010055542 + }, + { + "auxiliary_loss_clip": 0.06408714, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.0627206, + "balance_loss_mlp": 0.01252826, + "epoch": 0.8884413046745829, + "flos": 28331341833600.0, + "grad_norm": 1.8936331280996206, + "language_loss": 0.6894474, + "learning_rate": 1.290713302796802e-07, + "loss": 0.76615912, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09637451, + "step": 14777, + "time_per_iteration": 2.5410220623016357 + }, + { + "auxiliary_loss_clip": 0.06399784, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01255213, + "epoch": 0.8885014279272508, + "flos": 15164162732160.0, + "grad_norm": 1.7667313656152588, + "language_loss": 0.71248996, + "learning_rate": 1.2893372177522e-07, + "loss": 0.78913081, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09094238, + "step": 14778, + "time_per_iteration": 2.4593677520751953 + }, + { + "auxiliary_loss_clip": 0.06401107, + "auxiliary_loss_mlp": 0.01262965, + "balance_loss_clip": 0.0627052, + "balance_loss_mlp": 0.01254, + "epoch": 0.8885615511799189, + "flos": 19105721193600.0, + "grad_norm": 1.5500678278821722, + "language_loss": 0.77619112, + "learning_rate": 1.287961842217804e-07, + "loss": 0.85283184, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08966064, + "step": 14779, + "time_per_iteration": 2.467658519744873 + }, + { + "auxiliary_loss_clip": 0.06312528, + "auxiliary_loss_mlp": 0.01252679, + "balance_loss_clip": 0.06258145, + "balance_loss_mlp": 0.01251605, + "epoch": 0.8886216744325868, + "flos": 51200735270400.0, + "grad_norm": 0.84904602104289, + "language_loss": 0.56864655, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.64429867, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01075745, + "step": 14780, + "time_per_iteration": 2.908271074295044 + }, + { + "auxiliary_loss_clip": 0.06315291, + "auxiliary_loss_mlp": 0.01249856, + "balance_loss_clip": 0.0626081, + "balance_loss_mlp": 0.01249003, + "epoch": 0.8886817976852548, + "flos": 61633571281920.0, + "grad_norm": 0.7676462046556519, + "language_loss": 0.62468183, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.7003333, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00855255, + "step": 14781, + "time_per_iteration": 3.2015137672424316 + }, + { + "auxiliary_loss_clip": 0.06310038, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_clip": 0.06255679, + "balance_loss_mlp": 0.01248576, + "epoch": 0.8887419209379227, + "flos": 60664464086400.0, + "grad_norm": 0.7663905748294921, + "language_loss": 0.58062631, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.65622199, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00952911, + "step": 14782, + "time_per_iteration": 2.9721531867980957 + }, + { + "auxiliary_loss_clip": 0.06399249, + "auxiliary_loss_mlp": 0.0126328, + "balance_loss_clip": 0.06271558, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8888020441905907, + "flos": 29213630599680.0, + "grad_norm": 1.6461458074975241, + "language_loss": 0.65778244, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.73440778, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.07806396, + "step": 14783, + "time_per_iteration": 3.9794864654541016 + }, + { + "auxiliary_loss_clip": 0.0640447, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01252867, + "epoch": 0.8888621674432586, + "flos": 22169057811840.0, + "grad_norm": 1.463778407652058, + "language_loss": 0.77528048, + "learning_rate": 1.281095609023415e-07, + "loss": 0.8519516, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09765625, + "step": 14784, + "time_per_iteration": 2.5277795791625977 + }, + { + "auxiliary_loss_clip": 0.06403342, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06270555, + "balance_loss_mlp": 0.01262554, + "epoch": 0.8889222906959267, + "flos": 27680243281920.0, + "grad_norm": 3.057965191651345, + "language_loss": 0.61165977, + "learning_rate": 1.279724491644565e-07, + "loss": 0.68841451, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09576416, + "step": 14785, + "time_per_iteration": 2.580399990081787 + }, + { + "auxiliary_loss_clip": 0.06400205, + "auxiliary_loss_mlp": 0.01265322, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01256251, + "epoch": 0.8889824139485947, + "flos": 14173029112320.0, + "grad_norm": 1.975478801188687, + "language_loss": 0.65172708, + "learning_rate": 1.278354084140445e-07, + "loss": 0.72838235, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09069824, + "step": 14786, + "time_per_iteration": 2.4636151790618896 + }, + { + "auxiliary_loss_clip": 0.06406666, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 0.06271188, + "balance_loss_mlp": 0.01256082, + "epoch": 0.8890425372012626, + "flos": 12856828377600.0, + "grad_norm": 7.700688456498016, + "language_loss": 0.85678732, + "learning_rate": 1.276984386563009e-07, + "loss": 0.93352735, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11260986, + "step": 14787, + "time_per_iteration": 2.4787025451660156 + }, + { + "auxiliary_loss_clip": 0.06403594, + "auxiliary_loss_mlp": 0.012634, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01254645, + "epoch": 0.8891026604539306, + "flos": 21695719697280.0, + "grad_norm": 2.351201834821054, + "language_loss": 0.70638961, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.7830596, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08758545, + "step": 14788, + "time_per_iteration": 2.504624128341675 + }, + { + "auxiliary_loss_clip": 0.06397562, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 0.06271622, + "balance_loss_mlp": 0.01254226, + "epoch": 0.8891627837065985, + "flos": 21877840546560.0, + "grad_norm": 2.261908173801477, + "language_loss": 0.70175314, + "learning_rate": 1.274247121395935e-07, + "loss": 0.77835929, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08825684, + "step": 14789, + "time_per_iteration": 2.513617992401123 + }, + { + "auxiliary_loss_clip": 0.06400102, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01257505, + "epoch": 0.8892229069592665, + "flos": 21586707135360.0, + "grad_norm": 1.4895103847506954, + "language_loss": 0.70829117, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.78495526, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08807373, + "step": 14790, + "time_per_iteration": 2.5025522708892822 + }, + { + "auxiliary_loss_clip": 0.06399814, + "auxiliary_loss_mlp": 0.01263203, + "balance_loss_clip": 0.0627079, + "balance_loss_mlp": 0.01254333, + "epoch": 0.8892830302119344, + "flos": 23082680805120.0, + "grad_norm": 1.5934920580532534, + "language_loss": 0.7301842, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.80681431, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08874512, + "step": 14791, + "time_per_iteration": 2.5074832439422607 + }, + { + "auxiliary_loss_clip": 0.06399459, + "auxiliary_loss_mlp": 0.01265691, + "balance_loss_clip": 0.06273172, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8893431534646025, + "flos": 23078194611840.0, + "grad_norm": 1.4776865540614907, + "language_loss": 0.74067426, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.81732577, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09033203, + "step": 14792, + "time_per_iteration": 2.5036797523498535 + }, + { + "auxiliary_loss_clip": 0.06406777, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06272233, + "balance_loss_mlp": 0.01255612, + "epoch": 0.8894032767172704, + "flos": 22461449034240.0, + "grad_norm": 1.9207360943675909, + "language_loss": 0.66585648, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.74257779, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09729004, + "step": 14793, + "time_per_iteration": 2.4976742267608643 + }, + { + "auxiliary_loss_clip": 0.06402475, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06269317, + "balance_loss_mlp": 0.01254653, + "epoch": 0.8894633999699384, + "flos": 25345348133760.0, + "grad_norm": 1.568161072745724, + "language_loss": 0.72041291, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.79707754, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09332275, + "step": 14794, + "time_per_iteration": 2.522052764892578 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01265763, + "balance_loss_clip": 0.06268515, + "balance_loss_mlp": 0.01255856, + "epoch": 0.8895235232226063, + "flos": 21000079900800.0, + "grad_norm": 1.4570169942784024, + "language_loss": 0.75557005, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.83227766, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09899902, + "step": 14795, + "time_per_iteration": 3.9239513874053955 + }, + { + "auxiliary_loss_clip": 0.06307139, + "auxiliary_loss_mlp": 0.01249152, + "balance_loss_clip": 0.06252786, + "balance_loss_mlp": 0.01248141, + "epoch": 0.8895836464752743, + "flos": 69752169705600.0, + "grad_norm": 0.7593022455176621, + "language_loss": 0.56138074, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.6369437, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01010132, + "step": 14796, + "time_per_iteration": 3.0498147010803223 + }, + { + "auxiliary_loss_clip": 0.06403103, + "auxiliary_loss_mlp": 0.01262207, + "balance_loss_clip": 0.0627062, + "balance_loss_mlp": 0.01251717, + "epoch": 0.8896437697279422, + "flos": 23228520036480.0, + "grad_norm": 1.6659870416154836, + "language_loss": 0.70651698, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7831701, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.1048584, + "step": 14797, + "time_per_iteration": 2.498295783996582 + }, + { + "auxiliary_loss_clip": 0.06308188, + "auxiliary_loss_mlp": 0.01248559, + "balance_loss_clip": 0.06253885, + "balance_loss_mlp": 0.01247547, + "epoch": 0.8897038929806103, + "flos": 70771786513920.0, + "grad_norm": 0.7861850314361323, + "language_loss": 0.5798108, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.65537828, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.54394531, + "router_z_loss_mlp": 0.01012421, + "step": 14798, + "time_per_iteration": 4.576344728469849 + }, + { + "auxiliary_loss_clip": 0.06401603, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271328, + "balance_loss_mlp": 0.01254802, + "epoch": 0.8897640162332782, + "flos": 19251183081600.0, + "grad_norm": 1.5301138927285134, + "language_loss": 0.79772937, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.8743825, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14799, + "time_per_iteration": 2.51359224319458 + }, + { + "auxiliary_loss_clip": 0.06312159, + "auxiliary_loss_mlp": 0.01250026, + "balance_loss_clip": 0.06257726, + "balance_loss_mlp": 0.01249046, + "epoch": 0.8898241394859462, + "flos": 41372288830080.0, + "grad_norm": 0.862554760801988, + "language_loss": 0.58133441, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.65695632, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00979614, + "step": 14800, + "time_per_iteration": 4.55234169960022 + }, + { + "auxiliary_loss_clip": 0.06398645, + "auxiliary_loss_mlp": 0.01264119, + "balance_loss_clip": 0.06269025, + "balance_loss_mlp": 0.01255149, + "epoch": 0.8898842627386142, + "flos": 18991761240960.0, + "grad_norm": 1.4041707387256148, + "language_loss": 0.66151714, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.73814476, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08978271, + "step": 14801, + "time_per_iteration": 2.4694650173187256 + }, + { + "auxiliary_loss_clip": 0.06408633, + "auxiliary_loss_mlp": 0.01265457, + "balance_loss_clip": 0.06275365, + "balance_loss_mlp": 0.01255354, + "epoch": 0.8899443859912821, + "flos": 13220944295040.0, + "grad_norm": 2.4705918248485266, + "language_loss": 0.75189161, + "learning_rate": 1.256524149358682e-07, + "loss": 0.82863259, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10101318, + "step": 14802, + "time_per_iteration": 2.5068447589874268 + }, + { + "auxiliary_loss_clip": 0.06400315, + "auxiliary_loss_mlp": 0.01262993, + "balance_loss_clip": 0.06273411, + "balance_loss_mlp": 0.01253856, + "epoch": 0.8900045092439501, + "flos": 22681318947840.0, + "grad_norm": 1.6381133195062223, + "language_loss": 0.73893923, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.81557232, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09136963, + "step": 14803, + "time_per_iteration": 2.501056432723999 + }, + { + "auxiliary_loss_clip": 0.06399588, + "auxiliary_loss_mlp": 0.01262871, + "balance_loss_clip": 0.06269965, + "balance_loss_mlp": 0.01253299, + "epoch": 0.890064632496618, + "flos": 21147889703040.0, + "grad_norm": 1.7012691749350357, + "language_loss": 0.71806979, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.79469442, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09576416, + "step": 14804, + "time_per_iteration": 2.4941203594207764 + }, + { + "auxiliary_loss_clip": 0.06400431, + "auxiliary_loss_mlp": 0.01263674, + "balance_loss_clip": 0.0626931, + "balance_loss_mlp": 0.0125425, + "epoch": 0.8901247557492861, + "flos": 23402590894080.0, + "grad_norm": 1.58173973410221, + "language_loss": 0.81494653, + "learning_rate": 1.252451286713123e-07, + "loss": 0.89158762, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09417725, + "step": 14805, + "time_per_iteration": 2.4995498657226562 + }, + { + "auxiliary_loss_clip": 0.06405678, + "auxiliary_loss_mlp": 0.01263308, + "balance_loss_clip": 0.0627286, + "balance_loss_mlp": 0.012537, + "epoch": 0.890184879001954, + "flos": 29177390908800.0, + "grad_norm": 1.7463753983517807, + "language_loss": 0.67048252, + "learning_rate": 1.251095087580505e-07, + "loss": 0.74717236, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09606934, + "step": 14806, + "time_per_iteration": 2.5823683738708496 + }, + { + "auxiliary_loss_clip": 0.06400896, + "auxiliary_loss_mlp": 0.01263841, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01254429, + "epoch": 0.890245002254622, + "flos": 14432912150400.0, + "grad_norm": 1.7922455060213383, + "language_loss": 0.67830801, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.75495535, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09417725, + "step": 14807, + "time_per_iteration": 2.5916707515716553 + }, + { + "auxiliary_loss_clip": 0.06399317, + "auxiliary_loss_mlp": 0.01263711, + "balance_loss_clip": 0.06270466, + "balance_loss_mlp": 0.01254734, + "epoch": 0.8903051255072899, + "flos": 22388676163200.0, + "grad_norm": 1.744680374078912, + "language_loss": 0.75182492, + "learning_rate": 1.248384822247732e-07, + "loss": 0.82845521, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08972168, + "step": 14808, + "time_per_iteration": 2.5085625648498535 + }, + { + "auxiliary_loss_clip": 0.06401837, + "auxiliary_loss_mlp": 0.0126289, + "balance_loss_clip": 0.06269499, + "balance_loss_mlp": 0.01254408, + "epoch": 0.8903652487599579, + "flos": 20783689931520.0, + "grad_norm": 2.005761137516875, + "language_loss": 0.81256378, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.88921106, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08477783, + "step": 14809, + "time_per_iteration": 2.476633310317993 + }, + { + "auxiliary_loss_clip": 0.06402358, + "auxiliary_loss_mlp": 0.01265966, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01256847, + "epoch": 0.8904253720126258, + "flos": 24431180088960.0, + "grad_norm": 1.7755328357455793, + "language_loss": 0.68591714, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.7626003, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09112549, + "step": 14810, + "time_per_iteration": 2.529508590698242 + }, + { + "auxiliary_loss_clip": 0.06403522, + "auxiliary_loss_mlp": 0.01264868, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01255879, + "epoch": 0.8904854952652939, + "flos": 19469962892160.0, + "grad_norm": 1.8706593590776184, + "language_loss": 0.7023586, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.77904254, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08990479, + "step": 14811, + "time_per_iteration": 2.465928792953491 + }, + { + "auxiliary_loss_clip": 0.06403497, + "auxiliary_loss_mlp": 0.01262283, + "balance_loss_clip": 0.06270523, + "balance_loss_mlp": 0.01253414, + "epoch": 0.8905456185179618, + "flos": 50811337347840.0, + "grad_norm": 2.2469275425064743, + "language_loss": 0.65642589, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.73308372, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.08874512, + "step": 14812, + "time_per_iteration": 2.7694013118743896 + }, + { + "auxiliary_loss_clip": 0.06398641, + "auxiliary_loss_mlp": 0.01263031, + "balance_loss_clip": 0.06269665, + "balance_loss_mlp": 0.0125458, + "epoch": 0.8906057417706298, + "flos": 17790568634880.0, + "grad_norm": 1.8555379059256571, + "language_loss": 0.68591535, + "learning_rate": 1.24162160341861e-07, + "loss": 0.76253206, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08453369, + "step": 14813, + "time_per_iteration": 2.463127851486206 + }, + { + "auxiliary_loss_clip": 0.06410507, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06271763, + "balance_loss_mlp": 0.01254455, + "epoch": 0.8906658650232978, + "flos": 21951368104320.0, + "grad_norm": 2.3980423530949944, + "language_loss": 0.76035082, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.83710635, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10583496, + "step": 14814, + "time_per_iteration": 2.528144121170044 + }, + { + "auxiliary_loss_clip": 0.0640672, + "auxiliary_loss_mlp": 0.01263567, + "balance_loss_clip": 0.06271608, + "balance_loss_mlp": 0.01253721, + "epoch": 0.8907259882759657, + "flos": 21294148204800.0, + "grad_norm": 1.8020653483786722, + "language_loss": 0.74471802, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.82142091, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09838867, + "step": 14815, + "time_per_iteration": 2.489715814590454 + }, + { + "auxiliary_loss_clip": 0.06397778, + "auxiliary_loss_mlp": 0.01265289, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256545, + "epoch": 0.8907861115286337, + "flos": 20126595813120.0, + "grad_norm": 1.8276250566401673, + "language_loss": 0.75265664, + "learning_rate": 1.237572207545914e-07, + "loss": 0.82928729, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08740234, + "step": 14816, + "time_per_iteration": 2.5541696548461914 + }, + { + "auxiliary_loss_clip": 0.06403603, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01255386, + "epoch": 0.8908462347813016, + "flos": 20090356122240.0, + "grad_norm": 1.6893324557452318, + "language_loss": 0.77627748, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.85295802, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09063721, + "step": 14817, + "time_per_iteration": 2.551323652267456 + }, + { + "auxiliary_loss_clip": 0.06309056, + "auxiliary_loss_mlp": 0.01250369, + "balance_loss_clip": 0.06254645, + "balance_loss_mlp": 0.01249346, + "epoch": 0.8909063580339697, + "flos": 65522664288000.0, + "grad_norm": 0.7605080836630386, + "language_loss": 0.56617504, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.64176929, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.01023865, + "step": 14818, + "time_per_iteration": 3.1869611740112305 + }, + { + "auxiliary_loss_clip": 0.06404532, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06274045, + "balance_loss_mlp": 0.01258939, + "epoch": 0.8909664812866376, + "flos": 29871018207360.0, + "grad_norm": 1.912148882510469, + "language_loss": 0.64619452, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.72292328, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09405518, + "step": 14819, + "time_per_iteration": 2.5426406860351562 + }, + { + "auxiliary_loss_clip": 0.06405222, + "auxiliary_loss_mlp": 0.01263411, + "balance_loss_clip": 0.06273527, + "balance_loss_mlp": 0.01253749, + "epoch": 0.8910266045393056, + "flos": 25454151060480.0, + "grad_norm": 2.0396984257073743, + "language_loss": 0.78438711, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.86107349, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09661865, + "step": 14820, + "time_per_iteration": 2.5258073806762695 + }, + { + "auxiliary_loss_clip": 0.06400749, + "auxiliary_loss_mlp": 0.01264328, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01254493, + "epoch": 0.8910867277919735, + "flos": 24506091239040.0, + "grad_norm": 1.6484795223719642, + "language_loss": 0.76428401, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.84093475, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0982666, + "step": 14821, + "time_per_iteration": 2.5047779083251953 + }, + { + "auxiliary_loss_clip": 0.06309538, + "auxiliary_loss_mlp": 0.01252341, + "balance_loss_clip": 0.06255338, + "balance_loss_mlp": 0.01251348, + "epoch": 0.8911468510446415, + "flos": 60706447781760.0, + "grad_norm": 0.7814778898249498, + "language_loss": 0.59336329, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.66898209, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00993347, + "step": 14822, + "time_per_iteration": 3.009101390838623 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.0126257, + "balance_loss_clip": 0.06270057, + "balance_loss_mlp": 0.01252986, + "epoch": 0.8912069742973094, + "flos": 25344467665920.0, + "grad_norm": 1.812720827369598, + "language_loss": 0.69541264, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.77205515, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09588623, + "step": 14823, + "time_per_iteration": 3.9813392162323 + }, + { + "auxiliary_loss_clip": 0.06397749, + "auxiliary_loss_mlp": 0.0126276, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01254034, + "epoch": 0.8912670975499775, + "flos": 18229427994240.0, + "grad_norm": 1.5767380343948896, + "language_loss": 0.69303524, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.76964033, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08721924, + "step": 14824, + "time_per_iteration": 2.4433348178863525 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01263499, + "balance_loss_clip": 0.06270469, + "balance_loss_mlp": 0.01253694, + "epoch": 0.8913272208026454, + "flos": 26511558860160.0, + "grad_norm": 1.8101659396972392, + "language_loss": 0.70682526, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.78351235, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0980835, + "step": 14825, + "time_per_iteration": 2.5183238983154297 + }, + { + "auxiliary_loss_clip": 0.06400351, + "auxiliary_loss_mlp": 0.01266596, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01257065, + "epoch": 0.8913873440553134, + "flos": 18807502112640.0, + "grad_norm": 2.0913429177611467, + "language_loss": 0.70963371, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.78630316, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09527588, + "step": 14826, + "time_per_iteration": 2.4554646015167236 + }, + { + "auxiliary_loss_clip": 0.06399363, + "auxiliary_loss_mlp": 0.01263058, + "balance_loss_clip": 0.06270689, + "balance_loss_mlp": 0.01254099, + "epoch": 0.8914474673079814, + "flos": 20890899630720.0, + "grad_norm": 1.9654155681394898, + "language_loss": 0.75443125, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.83105552, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08959961, + "step": 14827, + "time_per_iteration": 2.562636137008667 + }, + { + "auxiliary_loss_clip": 0.06403911, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 0.06272019, + "balance_loss_mlp": 0.01253513, + "epoch": 0.8915075905606493, + "flos": 20957551153920.0, + "grad_norm": 1.5895570130516543, + "language_loss": 0.78462636, + "learning_rate": 1.221438670423336e-07, + "loss": 0.86129922, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09851074, + "step": 14828, + "time_per_iteration": 2.4832942485809326 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01255367, + "epoch": 0.8915677138133173, + "flos": 23083058148480.0, + "grad_norm": 1.576500276860786, + "language_loss": 0.75334942, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.83000845, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09075928, + "step": 14829, + "time_per_iteration": 2.5500776767730713 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01266587, + "balance_loss_clip": 0.06269491, + "balance_loss_mlp": 0.01257479, + "epoch": 0.8916278370659853, + "flos": 23446922503680.0, + "grad_norm": 1.4673976438274965, + "language_loss": 0.84542692, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.92210025, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09112549, + "step": 14830, + "time_per_iteration": 2.5105338096618652 + }, + { + "auxiliary_loss_clip": 0.06398022, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01255996, + "epoch": 0.8916879603186533, + "flos": 25168342383360.0, + "grad_norm": 1.3751500735649531, + "language_loss": 0.75201428, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.82863945, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08496094, + "step": 14831, + "time_per_iteration": 2.5866332054138184 + }, + { + "auxiliary_loss_clip": 0.06403229, + "auxiliary_loss_mlp": 0.01264299, + "balance_loss_clip": 0.06268588, + "balance_loss_mlp": 0.01254548, + "epoch": 0.8917480835713212, + "flos": 20236027645440.0, + "grad_norm": 1.8114871234332395, + "language_loss": 0.73160887, + "learning_rate": 1.216083607088847e-07, + "loss": 0.8082841, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09759521, + "step": 14832, + "time_per_iteration": 2.5055291652679443 + }, + { + "auxiliary_loss_clip": 0.06403124, + "auxiliary_loss_mlp": 0.01264791, + "balance_loss_clip": 0.06270224, + "balance_loss_mlp": 0.01255153, + "epoch": 0.8918082068239892, + "flos": 26108729556480.0, + "grad_norm": 1.7973281023337047, + "language_loss": 0.67450631, + "learning_rate": 1.214746621848355e-07, + "loss": 0.75118548, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09631348, + "step": 14833, + "time_per_iteration": 2.5191965103149414 + }, + { + "auxiliary_loss_clip": 0.06404808, + "auxiliary_loss_mlp": 0.01264902, + "balance_loss_clip": 0.06271433, + "balance_loss_mlp": 0.01254257, + "epoch": 0.8918683300766571, + "flos": 24839124491520.0, + "grad_norm": 1.6520503263058561, + "language_loss": 0.74187469, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.81857181, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10638428, + "step": 14834, + "time_per_iteration": 4.013251781463623 + }, + { + "auxiliary_loss_clip": 0.06404478, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06273828, + "balance_loss_mlp": 0.01254288, + "epoch": 0.8919284533293251, + "flos": 22310955901440.0, + "grad_norm": 1.863798974093549, + "language_loss": 0.79164231, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.868325, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0949707, + "step": 14835, + "time_per_iteration": 2.4751789569854736 + }, + { + "auxiliary_loss_clip": 0.063965, + "auxiliary_loss_mlp": 0.01263728, + "balance_loss_clip": 0.06268743, + "balance_loss_mlp": 0.01254698, + "epoch": 0.891988576581993, + "flos": 30381518407680.0, + "grad_norm": 1.4245369026634545, + "language_loss": 0.73941118, + "learning_rate": 1.210739940361689e-07, + "loss": 0.81601346, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09033203, + "step": 14836, + "time_per_iteration": 2.561229944229126 + }, + { + "auxiliary_loss_clip": 0.06401372, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.06270787, + "balance_loss_mlp": 0.01257524, + "epoch": 0.8920486998346611, + "flos": 15557223035520.0, + "grad_norm": 2.352945147165247, + "language_loss": 0.689592, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.76627362, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09259033, + "step": 14837, + "time_per_iteration": 2.4373927116394043 + }, + { + "auxiliary_loss_clip": 0.0640661, + "auxiliary_loss_mlp": 0.01265317, + "balance_loss_clip": 0.06270414, + "balance_loss_mlp": 0.01255369, + "epoch": 0.892108823087329, + "flos": 21221333406720.0, + "grad_norm": 1.7967074516272619, + "language_loss": 0.67696273, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.75368202, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09942627, + "step": 14838, + "time_per_iteration": 3.965111255645752 + }, + { + "auxiliary_loss_clip": 0.06404169, + "auxiliary_loss_mlp": 0.01266814, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01257391, + "epoch": 0.892168946339997, + "flos": 21985092172800.0, + "grad_norm": 1.9689260594947426, + "language_loss": 0.76717424, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.84388411, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09423828, + "step": 14839, + "time_per_iteration": 3.895935297012329 + }, + { + "auxiliary_loss_clip": 0.06311233, + "auxiliary_loss_mlp": 0.01249533, + "balance_loss_clip": 0.06256986, + "balance_loss_mlp": 0.01248568, + "epoch": 0.892229069592665, + "flos": 67494869038080.0, + "grad_norm": 0.6601341927430973, + "language_loss": 0.49224526, + "learning_rate": 1.205407673483978e-07, + "loss": 0.56785291, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00964355, + "step": 14840, + "time_per_iteration": 3.0776662826538086 + }, + { + "auxiliary_loss_clip": 0.06408979, + "auxiliary_loss_mlp": 0.01264539, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01253709, + "epoch": 0.8922891928453329, + "flos": 19464931647360.0, + "grad_norm": 2.036775192434288, + "language_loss": 0.64259487, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.71933007, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10827637, + "step": 14841, + "time_per_iteration": 2.5317835807800293 + }, + { + "auxiliary_loss_clip": 0.06397078, + "auxiliary_loss_mlp": 0.01266801, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01258367, + "epoch": 0.8923493160980009, + "flos": 23374065778560.0, + "grad_norm": 1.5067524723122596, + "language_loss": 0.68637419, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.76301301, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08428955, + "step": 14842, + "time_per_iteration": 2.5464539527893066 + }, + { + "auxiliary_loss_clip": 0.06398538, + "auxiliary_loss_mlp": 0.01266525, + "balance_loss_clip": 0.06271973, + "balance_loss_mlp": 0.01257984, + "epoch": 0.8924094393506689, + "flos": 26184227685120.0, + "grad_norm": 1.9083387280935236, + "language_loss": 0.80568957, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.88234019, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08538818, + "step": 14843, + "time_per_iteration": 2.6366734504699707 + }, + { + "auxiliary_loss_clip": 0.06403741, + "auxiliary_loss_mlp": 0.01261264, + "balance_loss_clip": 0.06268854, + "balance_loss_mlp": 0.01251781, + "epoch": 0.8924695626033369, + "flos": 22025147224320.0, + "grad_norm": 2.3820566119919597, + "language_loss": 0.68648458, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.76313466, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09484863, + "step": 14844, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.06401572, + "auxiliary_loss_mlp": 0.01266646, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257646, + "epoch": 0.8925296858560048, + "flos": 14799292128000.0, + "grad_norm": 2.2923996449190236, + "language_loss": 0.91698718, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.99366921, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08996582, + "step": 14845, + "time_per_iteration": 2.4725682735443115 + }, + { + "auxiliary_loss_clip": 0.06397889, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06271143, + "balance_loss_mlp": 0.012575, + "epoch": 0.8925898091086728, + "flos": 22353275013120.0, + "grad_norm": 1.8851582934669056, + "language_loss": 0.72789091, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.8045361, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09130859, + "step": 14846, + "time_per_iteration": 2.466618299484253 + }, + { + "auxiliary_loss_clip": 0.06400344, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06268599, + "balance_loss_mlp": 0.01254645, + "epoch": 0.8926499323613407, + "flos": 45816773425920.0, + "grad_norm": 2.2290508938220657, + "language_loss": 0.57516384, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.65180945, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0958252, + "step": 14847, + "time_per_iteration": 2.6878631114959717 + }, + { + "auxiliary_loss_clip": 0.06405343, + "auxiliary_loss_mlp": 0.0126293, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01254026, + "epoch": 0.8927100556140087, + "flos": 22133530880640.0, + "grad_norm": 1.7052460383606831, + "language_loss": 0.76622617, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.84290886, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08911133, + "step": 14848, + "time_per_iteration": 2.4818036556243896 + }, + { + "auxiliary_loss_clip": 0.06397684, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06270074, + "balance_loss_mlp": 0.01255648, + "epoch": 0.8927701788666766, + "flos": 28337756670720.0, + "grad_norm": 1.7160281168375413, + "language_loss": 0.69265717, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.7692802, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08978271, + "step": 14849, + "time_per_iteration": 2.5345237255096436 + }, + { + "auxiliary_loss_clip": 0.0640296, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06271099, + "balance_loss_mlp": 0.01257139, + "epoch": 0.8928303021193447, + "flos": 25300932670080.0, + "grad_norm": 9.377316945949495, + "language_loss": 0.80831003, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.88500196, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09094238, + "step": 14850, + "time_per_iteration": 2.5199098587036133 + }, + { + "auxiliary_loss_clip": 0.06401064, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06273234, + "balance_loss_mlp": 0.01258905, + "epoch": 0.8928904253720126, + "flos": 22243256202240.0, + "grad_norm": 1.5485045372929462, + "language_loss": 0.75078595, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.82747841, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09259033, + "step": 14851, + "time_per_iteration": 2.520653247833252 + }, + { + "auxiliary_loss_clip": 0.06399436, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06269779, + "balance_loss_mlp": 0.01253873, + "epoch": 0.8929505486246806, + "flos": 27100240519680.0, + "grad_norm": 1.5234739913675641, + "language_loss": 0.78729236, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.86392307, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09759521, + "step": 14852, + "time_per_iteration": 2.5750808715820312 + }, + { + "auxiliary_loss_clip": 0.06400271, + "auxiliary_loss_mlp": 0.01263228, + "balance_loss_clip": 0.06272772, + "balance_loss_mlp": 0.01254544, + "epoch": 0.8930106718773486, + "flos": 23046021843840.0, + "grad_norm": 1.3447156606133301, + "language_loss": 0.69361079, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.77024567, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08685303, + "step": 14853, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.0639962, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06269603, + "balance_loss_mlp": 0.01254469, + "epoch": 0.8930707951300165, + "flos": 35635378389120.0, + "grad_norm": 1.7867498059610383, + "language_loss": 0.67108899, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.74771899, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14854, + "time_per_iteration": 2.6050684452056885 + }, + { + "auxiliary_loss_clip": 0.06395994, + "auxiliary_loss_mlp": 0.01262577, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01253732, + "epoch": 0.8931309183826845, + "flos": 23046650749440.0, + "grad_norm": 1.3515219492538217, + "language_loss": 0.74918699, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.8257727, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08850098, + "step": 14855, + "time_per_iteration": 2.530815362930298 + }, + { + "auxiliary_loss_clip": 0.06400646, + "auxiliary_loss_mlp": 0.01264231, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8931910416353525, + "flos": 26511726568320.0, + "grad_norm": 1.9239485722311656, + "language_loss": 0.64665866, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.72330737, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0880127, + "step": 14856, + "time_per_iteration": 2.61660099029541 + }, + { + "auxiliary_loss_clip": 0.06401564, + "auxiliary_loss_mlp": 0.01267596, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.01259007, + "epoch": 0.8932511648880205, + "flos": 24980687164800.0, + "grad_norm": 1.6913640508608676, + "language_loss": 0.66606605, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.74275768, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08587646, + "step": 14857, + "time_per_iteration": 2.5284576416015625 + }, + { + "auxiliary_loss_clip": 0.06404722, + "auxiliary_loss_mlp": 0.01265153, + "balance_loss_clip": 0.0627214, + "balance_loss_mlp": 0.01255331, + "epoch": 0.8933112881406884, + "flos": 24467377852800.0, + "grad_norm": 2.3222836752374305, + "language_loss": 0.75424057, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.83093929, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09814453, + "step": 14858, + "time_per_iteration": 2.5318548679351807 + }, + { + "auxiliary_loss_clip": 0.06400517, + "auxiliary_loss_mlp": 0.01269115, + "balance_loss_clip": 0.06269918, + "balance_loss_mlp": 0.01259859, + "epoch": 0.8933714113933564, + "flos": 28300594584960.0, + "grad_norm": 1.5598958760491506, + "language_loss": 0.69930089, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.77599716, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09246826, + "step": 14859, + "time_per_iteration": 2.5497772693634033 + }, + { + "auxiliary_loss_clip": 0.06392509, + "auxiliary_loss_mlp": 0.01262646, + "balance_loss_clip": 0.06268515, + "balance_loss_mlp": 0.01254284, + "epoch": 0.8934315346460243, + "flos": 21441412955520.0, + "grad_norm": 1.8624217934039429, + "language_loss": 0.75625086, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.83280241, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08355713, + "step": 14860, + "time_per_iteration": 2.5246856212615967 + }, + { + "auxiliary_loss_clip": 0.06401479, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 0.06269905, + "balance_loss_mlp": 0.01255092, + "epoch": 0.8934916578986923, + "flos": 23776475811840.0, + "grad_norm": 1.7913164258614247, + "language_loss": 0.5788613, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.65552485, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09777832, + "step": 14861, + "time_per_iteration": 2.513174533843994 + }, + { + "auxiliary_loss_clip": 0.06395803, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06268242, + "balance_loss_mlp": 0.01257407, + "epoch": 0.8935517811513602, + "flos": 18922090970880.0, + "grad_norm": 2.8229402142894924, + "language_loss": 0.63289392, + "learning_rate": 1.176284122190685e-07, + "loss": 0.70951402, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0880127, + "step": 14862, + "time_per_iteration": 2.4601802825927734 + }, + { + "auxiliary_loss_clip": 0.06398255, + "auxiliary_loss_mlp": 0.01264936, + "balance_loss_clip": 0.06269197, + "balance_loss_mlp": 0.01255883, + "epoch": 0.8936119044040283, + "flos": 24068280055680.0, + "grad_norm": 1.5659074836236766, + "language_loss": 0.78562599, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.8622579, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09057617, + "step": 14863, + "time_per_iteration": 3.9400691986083984 + }, + { + "auxiliary_loss_clip": 0.06396215, + "auxiliary_loss_mlp": 0.01263795, + "balance_loss_clip": 0.06269459, + "balance_loss_mlp": 0.01255361, + "epoch": 0.8936720276566962, + "flos": 21330387895680.0, + "grad_norm": 1.807940322380626, + "language_loss": 0.70814526, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.78474534, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08435059, + "step": 14864, + "time_per_iteration": 2.477184295654297 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01255158, + "epoch": 0.8937321509093642, + "flos": 18412093895040.0, + "grad_norm": 1.8448979724824994, + "language_loss": 0.76666725, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.84341156, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09777832, + "step": 14865, + "time_per_iteration": 2.50520658493042 + }, + { + "auxiliary_loss_clip": 0.06396964, + "auxiliary_loss_mlp": 0.01263849, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01254801, + "epoch": 0.8937922741620322, + "flos": 22061344988160.0, + "grad_norm": 1.686573948545257, + "language_loss": 0.71847916, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.79508728, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09051514, + "step": 14866, + "time_per_iteration": 2.5232789516448975 + }, + { + "auxiliary_loss_clip": 0.06405518, + "auxiliary_loss_mlp": 0.01261975, + "balance_loss_clip": 0.06270327, + "balance_loss_mlp": 0.01251765, + "epoch": 0.8938523974147001, + "flos": 25671169935360.0, + "grad_norm": 1.5088143817745128, + "language_loss": 0.84316403, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.91983891, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10205078, + "step": 14867, + "time_per_iteration": 2.504709243774414 + }, + { + "auxiliary_loss_clip": 0.06400672, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06268955, + "balance_loss_mlp": 0.01255223, + "epoch": 0.8939125206673681, + "flos": 25750567278720.0, + "grad_norm": 1.4933944812080338, + "language_loss": 0.80616713, + "learning_rate": 1.168401272009567e-07, + "loss": 0.88281178, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08575439, + "step": 14868, + "time_per_iteration": 2.5456981658935547 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06269291, + "balance_loss_mlp": 0.01254264, + "epoch": 0.8939726439200361, + "flos": 27351863930880.0, + "grad_norm": 1.6782026554135205, + "language_loss": 0.77551532, + "learning_rate": 1.167089962692056e-07, + "loss": 0.85216701, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.0994873, + "step": 14869, + "time_per_iteration": 2.5171701908111572 + }, + { + "auxiliary_loss_clip": 0.06400751, + "auxiliary_loss_mlp": 0.01262574, + "balance_loss_clip": 0.06272287, + "balance_loss_mlp": 0.01253323, + "epoch": 0.8940327671727041, + "flos": 20344956353280.0, + "grad_norm": 1.3418834615415587, + "language_loss": 0.65861583, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.73524916, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09240723, + "step": 14870, + "time_per_iteration": 2.4959447383880615 + }, + { + "auxiliary_loss_clip": 0.06304982, + "auxiliary_loss_mlp": 0.01250431, + "balance_loss_clip": 0.06250464, + "balance_loss_mlp": 0.01249474, + "epoch": 0.894092890425372, + "flos": 58425919534080.0, + "grad_norm": 0.7802103203986496, + "language_loss": 0.55975109, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.63530517, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.009552, + "step": 14871, + "time_per_iteration": 3.156993865966797 + }, + { + "auxiliary_loss_clip": 0.06400608, + "auxiliary_loss_mlp": 0.0126467, + "balance_loss_clip": 0.06272507, + "balance_loss_mlp": 0.01255956, + "epoch": 0.89415301367804, + "flos": 19835965526400.0, + "grad_norm": 2.0336418069128515, + "language_loss": 0.76816511, + "learning_rate": 1.16316031981331e-07, + "loss": 0.84481788, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0871582, + "step": 14872, + "time_per_iteration": 2.485140323638916 + }, + { + "auxiliary_loss_clip": 0.0639937, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06272227, + "balance_loss_mlp": 0.01256624, + "epoch": 0.8942131369307079, + "flos": 25782907754880.0, + "grad_norm": 1.6493247020685964, + "language_loss": 0.67278552, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.7494272, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08166504, + "step": 14873, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06396008, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256799, + "epoch": 0.8942732601833759, + "flos": 23155747165440.0, + "grad_norm": 1.7348612988581609, + "language_loss": 0.59519863, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.6718176, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09094238, + "step": 14874, + "time_per_iteration": 4.009182691574097 + }, + { + "auxiliary_loss_clip": 0.0640433, + "auxiliary_loss_mlp": 0.01262347, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01252548, + "epoch": 0.8943333834360438, + "flos": 27863034963840.0, + "grad_norm": 3.262059606823023, + "language_loss": 0.75661355, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.83328027, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09796143, + "step": 14875, + "time_per_iteration": 2.582097291946411 + }, + { + "auxiliary_loss_clip": 0.06409485, + "auxiliary_loss_mlp": 0.0126629, + "balance_loss_clip": 0.06272477, + "balance_loss_mlp": 0.01255901, + "epoch": 0.8943935066887119, + "flos": 22170525258240.0, + "grad_norm": 2.233676801641688, + "language_loss": 0.7754097, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.85216737, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.1038208, + "step": 14876, + "time_per_iteration": 2.49131178855896 + }, + { + "auxiliary_loss_clip": 0.06400561, + "auxiliary_loss_mlp": 0.01262912, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.01253781, + "epoch": 0.8944536299413798, + "flos": 21476394835200.0, + "grad_norm": 1.6532963820803077, + "language_loss": 0.78540194, + "learning_rate": 1.156625201573287e-07, + "loss": 0.86203676, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09124756, + "step": 14877, + "time_per_iteration": 3.9302589893341064 + }, + { + "auxiliary_loss_clip": 0.06400222, + "auxiliary_loss_mlp": 0.01262535, + "balance_loss_clip": 0.06270761, + "balance_loss_mlp": 0.01253051, + "epoch": 0.8945137531940478, + "flos": 17754538579200.0, + "grad_norm": 2.0502806010232453, + "language_loss": 0.75457507, + "learning_rate": 1.155320321355151e-07, + "loss": 0.83120263, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09484863, + "step": 14878, + "time_per_iteration": 2.479912519454956 + }, + { + "auxiliary_loss_clip": 0.06404997, + "auxiliary_loss_mlp": 0.01266971, + "balance_loss_clip": 0.06271661, + "balance_loss_mlp": 0.0125744, + "epoch": 0.8945738764467158, + "flos": 21148644389760.0, + "grad_norm": 1.5435004393122365, + "language_loss": 0.75714976, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.83386946, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09527588, + "step": 14879, + "time_per_iteration": 3.89373779296875 + }, + { + "auxiliary_loss_clip": 0.06402966, + "auxiliary_loss_mlp": 0.01268024, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01258737, + "epoch": 0.8946339996993837, + "flos": 14908304689920.0, + "grad_norm": 1.7286319960162253, + "language_loss": 0.74827355, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.82498348, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09283447, + "step": 14880, + "time_per_iteration": 2.485443592071533 + }, + { + "auxiliary_loss_clip": 0.06400868, + "auxiliary_loss_mlp": 0.01262655, + "balance_loss_clip": 0.0626995, + "balance_loss_mlp": 0.01253786, + "epoch": 0.8946941229520518, + "flos": 27389738776320.0, + "grad_norm": 1.6052503239792235, + "language_loss": 0.83234966, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.9089849, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08862305, + "step": 14881, + "time_per_iteration": 2.6131069660186768 + }, + { + "auxiliary_loss_clip": 0.064004, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01253902, + "epoch": 0.8947542462047197, + "flos": 31804467644160.0, + "grad_norm": 1.6227908564694626, + "language_loss": 0.67742473, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.75405467, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08685303, + "step": 14882, + "time_per_iteration": 2.624990701675415 + }, + { + "auxiliary_loss_clip": 0.06410404, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01255286, + "epoch": 0.8948143694573877, + "flos": 20889390257280.0, + "grad_norm": 2.2592268261234794, + "language_loss": 0.76093864, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.83769971, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10412598, + "step": 14883, + "time_per_iteration": 2.615189552307129 + }, + { + "auxiliary_loss_clip": 0.06395276, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06268727, + "balance_loss_mlp": 0.01256176, + "epoch": 0.8948744927100556, + "flos": 28222287344640.0, + "grad_norm": 1.4620287749625491, + "language_loss": 0.72531396, + "learning_rate": 1.147506048211253e-07, + "loss": 0.80192173, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09320068, + "step": 14884, + "time_per_iteration": 2.686645269393921 + }, + { + "auxiliary_loss_clip": 0.06399888, + "auxiliary_loss_mlp": 0.01266732, + "balance_loss_clip": 0.06271544, + "balance_loss_mlp": 0.01258155, + "epoch": 0.8949346159627236, + "flos": 21908210451840.0, + "grad_norm": 1.538214913987674, + "language_loss": 0.75908208, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.8357482, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 14885, + "time_per_iteration": 2.5954906940460205 + }, + { + "auxiliary_loss_clip": 0.06404841, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06268379, + "balance_loss_mlp": 0.01256138, + "epoch": 0.8949947392153915, + "flos": 21365202067200.0, + "grad_norm": 1.822897035526379, + "language_loss": 0.82082385, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.89753222, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09857178, + "step": 14886, + "time_per_iteration": 2.547220468521118 + }, + { + "auxiliary_loss_clip": 0.06404007, + "auxiliary_loss_mlp": 0.01264491, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01255241, + "epoch": 0.8950548624680595, + "flos": 52456672120320.0, + "grad_norm": 1.403766118863264, + "language_loss": 0.63836366, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.71504867, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09246826, + "step": 14887, + "time_per_iteration": 2.840047836303711 + }, + { + "auxiliary_loss_clip": 0.06404397, + "auxiliary_loss_mlp": 0.01264814, + "balance_loss_clip": 0.06270733, + "balance_loss_mlp": 0.01255379, + "epoch": 0.8951149857207275, + "flos": 20127643989120.0, + "grad_norm": 2.05922037970012, + "language_loss": 0.61333579, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.69002795, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09423828, + "step": 14888, + "time_per_iteration": 2.4822325706481934 + }, + { + "auxiliary_loss_clip": 0.0640295, + "auxiliary_loss_mlp": 0.01263259, + "balance_loss_clip": 0.06270017, + "balance_loss_mlp": 0.01253997, + "epoch": 0.8951751089733955, + "flos": 29870515082880.0, + "grad_norm": 1.7326619011020001, + "language_loss": 0.70190442, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.77856648, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09259033, + "step": 14889, + "time_per_iteration": 2.5692856311798096 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01267234, + "balance_loss_clip": 0.0627138, + "balance_loss_mlp": 0.01257364, + "epoch": 0.8952352322260634, + "flos": 15267305508480.0, + "grad_norm": 2.187718839417261, + "language_loss": 0.70865494, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.78536266, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09863281, + "step": 14890, + "time_per_iteration": 2.4263176918029785 + }, + { + "auxiliary_loss_clip": 0.06400955, + "auxiliary_loss_mlp": 0.0126212, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01253036, + "epoch": 0.8952953554787314, + "flos": 26805794872320.0, + "grad_norm": 1.6509945503945358, + "language_loss": 0.75869304, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.83532381, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09069824, + "step": 14891, + "time_per_iteration": 2.5538480281829834 + }, + { + "auxiliary_loss_clip": 0.06407404, + "auxiliary_loss_mlp": 0.01265746, + "balance_loss_clip": 0.06271844, + "balance_loss_mlp": 0.01256031, + "epoch": 0.8953554787313994, + "flos": 14142449571840.0, + "grad_norm": 1.8156588804398968, + "language_loss": 0.77074498, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.84747648, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.097229, + "step": 14892, + "time_per_iteration": 2.4513299465179443 + }, + { + "auxiliary_loss_clip": 0.06402381, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271234, + "balance_loss_mlp": 0.01256132, + "epoch": 0.8954156019840673, + "flos": 25710512227200.0, + "grad_norm": 1.8223353261207547, + "language_loss": 0.81747323, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.89415169, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09338379, + "step": 14893, + "time_per_iteration": 2.5349771976470947 + }, + { + "auxiliary_loss_clip": 0.06401483, + "auxiliary_loss_mlp": 0.01263898, + "balance_loss_clip": 0.06273355, + "balance_loss_mlp": 0.01255554, + "epoch": 0.8954757252367354, + "flos": 21914415653760.0, + "grad_norm": 2.626592017949994, + "language_loss": 0.75162917, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.82828295, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08343506, + "step": 14894, + "time_per_iteration": 2.4701755046844482 + }, + { + "auxiliary_loss_clip": 0.06405791, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06272551, + "balance_loss_mlp": 0.01254006, + "epoch": 0.8955358484894033, + "flos": 12975568012800.0, + "grad_norm": 1.5890644812826222, + "language_loss": 0.66464567, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.74134463, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10101318, + "step": 14895, + "time_per_iteration": 2.5035977363586426 + }, + { + "auxiliary_loss_clip": 0.06404694, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01255444, + "epoch": 0.8955959717420713, + "flos": 17279565310080.0, + "grad_norm": 1.87791036453397, + "language_loss": 0.67284429, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.74954802, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10229492, + "step": 14896, + "time_per_iteration": 2.4724619388580322 + }, + { + "auxiliary_loss_clip": 0.06403284, + "auxiliary_loss_mlp": 0.0126487, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01255667, + "epoch": 0.8956560949947392, + "flos": 14799208273920.0, + "grad_norm": 1.6470443719838597, + "language_loss": 0.76069427, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.83737576, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09210205, + "step": 14897, + "time_per_iteration": 2.4986391067504883 + }, + { + "auxiliary_loss_clip": 0.06308123, + "auxiliary_loss_mlp": 0.01251594, + "balance_loss_clip": 0.06253865, + "balance_loss_mlp": 0.01250616, + "epoch": 0.8957162182474072, + "flos": 63626754280320.0, + "grad_norm": 0.7334774931329842, + "language_loss": 0.55192471, + "learning_rate": 1.129372846953931e-07, + "loss": 0.62752187, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00977325, + "step": 14898, + "time_per_iteration": 3.1359360218048096 + }, + { + "auxiliary_loss_clip": 0.0640052, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06268872, + "balance_loss_mlp": 0.01255681, + "epoch": 0.8957763415000751, + "flos": 25016884928640.0, + "grad_norm": 1.570472066859937, + "language_loss": 0.70246518, + "learning_rate": 1.12808298352008e-07, + "loss": 0.77912164, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09442139, + "step": 14899, + "time_per_iteration": 2.5486810207366943 + }, + { + "auxiliary_loss_clip": 0.06403163, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.01253061, + "epoch": 0.8958364647527431, + "flos": 19834749642240.0, + "grad_norm": 1.8245090514725772, + "language_loss": 0.73847759, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.81514513, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10534668, + "step": 14900, + "time_per_iteration": 2.4651243686676025 + }, + { + "auxiliary_loss_clip": 0.06306873, + "auxiliary_loss_mlp": 0.01248856, + "balance_loss_clip": 0.0625267, + "balance_loss_mlp": 0.0124786, + "epoch": 0.895896588005411, + "flos": 65555717523840.0, + "grad_norm": 0.7552312872825258, + "language_loss": 0.6180774, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.69363469, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 14901, + "time_per_iteration": 3.1450839042663574 + }, + { + "auxiliary_loss_clip": 0.06405282, + "auxiliary_loss_mlp": 0.01269268, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.01259702, + "epoch": 0.8959567112580791, + "flos": 25597726231680.0, + "grad_norm": 1.4924543934723433, + "language_loss": 0.71050578, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.78725129, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09570312, + "step": 14902, + "time_per_iteration": 4.060534477233887 + }, + { + "auxiliary_loss_clip": 0.0639545, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.0626891, + "balance_loss_mlp": 0.01256037, + "epoch": 0.896016834510747, + "flos": 24207788304000.0, + "grad_norm": 1.5516479623413435, + "language_loss": 0.78019071, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.85679603, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.09033203, + "step": 14903, + "time_per_iteration": 2.530996084213257 + }, + { + "auxiliary_loss_clip": 0.06406369, + "auxiliary_loss_mlp": 0.0126234, + "balance_loss_clip": 0.06271996, + "balance_loss_mlp": 0.01252774, + "epoch": 0.896076957763415, + "flos": 23082638878080.0, + "grad_norm": 1.8242518649454527, + "language_loss": 0.73055351, + "learning_rate": 1.121644401702877e-07, + "loss": 0.80724061, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09570312, + "step": 14904, + "time_per_iteration": 2.6296870708465576 + }, + { + "auxiliary_loss_clip": 0.06401238, + "auxiliary_loss_mlp": 0.01262525, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01252637, + "epoch": 0.8961370810160829, + "flos": 22243130421120.0, + "grad_norm": 1.862824182986126, + "language_loss": 0.75347674, + "learning_rate": 1.12035883275166e-07, + "loss": 0.83011442, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09887695, + "step": 14905, + "time_per_iteration": 2.5133965015411377 + }, + { + "auxiliary_loss_clip": 0.06398168, + "auxiliary_loss_mlp": 0.01264344, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01255487, + "epoch": 0.8961972042687509, + "flos": 23078404247040.0, + "grad_norm": 1.5622217047945155, + "language_loss": 0.76437497, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.84100008, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08862305, + "step": 14906, + "time_per_iteration": 2.5375421047210693 + }, + { + "auxiliary_loss_clip": 0.06407402, + "auxiliary_loss_mlp": 0.01264174, + "balance_loss_clip": 0.06274619, + "balance_loss_mlp": 0.01254822, + "epoch": 0.896257327521419, + "flos": 18191595075840.0, + "grad_norm": 1.6265989394728257, + "language_loss": 0.745776, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.82249177, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09356689, + "step": 14907, + "time_per_iteration": 2.4702959060668945 + }, + { + "auxiliary_loss_clip": 0.06397235, + "auxiliary_loss_mlp": 0.01269593, + "balance_loss_clip": 0.06267928, + "balance_loss_mlp": 0.01260152, + "epoch": 0.8963174507740869, + "flos": 17901384059520.0, + "grad_norm": 1.964029322424203, + "language_loss": 0.8312695, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.90793782, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09436035, + "step": 14908, + "time_per_iteration": 2.5215442180633545 + }, + { + "auxiliary_loss_clip": 0.06403542, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06269868, + "balance_loss_mlp": 0.01256129, + "epoch": 0.8963775740267549, + "flos": 21038541724800.0, + "grad_norm": 1.7328216295609387, + "language_loss": 0.70987892, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.78657633, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10070801, + "step": 14909, + "time_per_iteration": 2.50961971282959 + }, + { + "auxiliary_loss_clip": 0.06402007, + "auxiliary_loss_mlp": 0.01265795, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01256103, + "epoch": 0.8964376972794228, + "flos": 23185362384000.0, + "grad_norm": 1.6724963003182998, + "language_loss": 0.72410321, + "learning_rate": 1.113941727737877e-07, + "loss": 0.80078113, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09698486, + "step": 14910, + "time_per_iteration": 2.5077359676361084 + }, + { + "auxiliary_loss_clip": 0.06399799, + "auxiliary_loss_mlp": 0.01265379, + "balance_loss_clip": 0.06270814, + "balance_loss_mlp": 0.0125682, + "epoch": 0.8964978205320908, + "flos": 24979974405120.0, + "grad_norm": 1.770153875298599, + "language_loss": 0.63518411, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.71183586, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08563232, + "step": 14911, + "time_per_iteration": 2.4959042072296143 + }, + { + "auxiliary_loss_clip": 0.06406086, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06273033, + "balance_loss_mlp": 0.01257596, + "epoch": 0.8965579437847587, + "flos": 19178074794240.0, + "grad_norm": 1.6726693619697703, + "language_loss": 0.75323474, + "learning_rate": 1.111379898520437e-07, + "loss": 0.82996809, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09649658, + "step": 14912, + "time_per_iteration": 2.511392593383789 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06268585, + "balance_loss_mlp": 0.01255028, + "epoch": 0.8966180670374267, + "flos": 24283034870400.0, + "grad_norm": 1.7988610159945775, + "language_loss": 0.82114106, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.89777815, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09570312, + "step": 14913, + "time_per_iteration": 4.064217805862427 + }, + { + "auxiliary_loss_clip": 0.06404127, + "auxiliary_loss_mlp": 0.01271416, + "balance_loss_clip": 0.06272069, + "balance_loss_mlp": 0.01261235, + "epoch": 0.8966781902900947, + "flos": 13558296032640.0, + "grad_norm": 1.9987077999566127, + "language_loss": 0.61253613, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.6892916, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10186768, + "step": 14914, + "time_per_iteration": 2.503157377243042 + }, + { + "auxiliary_loss_clip": 0.06308897, + "auxiliary_loss_mlp": 0.01250037, + "balance_loss_clip": 0.06254553, + "balance_loss_mlp": 0.01249038, + "epoch": 0.8967383135427627, + "flos": 65085104666880.0, + "grad_norm": 0.7199686075509744, + "language_loss": 0.54956484, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.6251542, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00998688, + "step": 14915, + "time_per_iteration": 3.121408462524414 + }, + { + "auxiliary_loss_clip": 0.0639778, + "auxiliary_loss_mlp": 0.01262669, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254021, + "epoch": 0.8967984367954306, + "flos": 29720273512320.0, + "grad_norm": 1.453709134846792, + "language_loss": 0.71710205, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.79370654, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08648682, + "step": 14916, + "time_per_iteration": 2.5399439334869385 + }, + { + "auxiliary_loss_clip": 0.06399646, + "auxiliary_loss_mlp": 0.01262869, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253386, + "epoch": 0.8968585600480986, + "flos": 25709547905280.0, + "grad_norm": 1.6511135445596639, + "language_loss": 0.77996731, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.85659248, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0947876, + "step": 14917, + "time_per_iteration": 3.9514448642730713 + }, + { + "auxiliary_loss_clip": 0.06407967, + "auxiliary_loss_mlp": 0.01265951, + "balance_loss_clip": 0.06272604, + "balance_loss_mlp": 0.01255419, + "epoch": 0.8969186833007665, + "flos": 30052552078080.0, + "grad_norm": 2.288875312823381, + "language_loss": 0.6860131, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.76275229, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10522461, + "step": 14918, + "time_per_iteration": 3.9689831733703613 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01262687, + "balance_loss_clip": 0.06271897, + "balance_loss_mlp": 0.0125324, + "epoch": 0.8969788065534345, + "flos": 22824390994560.0, + "grad_norm": 2.299615610412693, + "language_loss": 0.83668256, + "learning_rate": 1.102436060943881e-07, + "loss": 0.91335803, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09442139, + "step": 14919, + "time_per_iteration": 2.5401570796966553 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.0126842, + "balance_loss_clip": 0.06270535, + "balance_loss_mlp": 0.01258698, + "epoch": 0.8970389298061026, + "flos": 13266575642880.0, + "grad_norm": 2.5633891144705134, + "language_loss": 0.73092914, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.80765748, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 14920, + "time_per_iteration": 2.546627998352051 + }, + { + "auxiliary_loss_clip": 0.06404185, + "auxiliary_loss_mlp": 0.01262662, + "balance_loss_clip": 0.06273196, + "balance_loss_mlp": 0.0125266, + "epoch": 0.8970990530587705, + "flos": 10270058256000.0, + "grad_norm": 2.486381848845646, + "language_loss": 0.90980357, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.98647201, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10009766, + "step": 14921, + "time_per_iteration": 2.452223777770996 + }, + { + "auxiliary_loss_clip": 0.06404401, + "auxiliary_loss_mlp": 0.01265926, + "balance_loss_clip": 0.06269224, + "balance_loss_mlp": 0.01255906, + "epoch": 0.8971591763114385, + "flos": 20308884370560.0, + "grad_norm": 1.739666810440783, + "language_loss": 0.74017936, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.81688261, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10028076, + "step": 14922, + "time_per_iteration": 2.546560525894165 + }, + { + "auxiliary_loss_clip": 0.06400974, + "auxiliary_loss_mlp": 0.01261981, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.0125257, + "epoch": 0.8972192995641064, + "flos": 23263543843200.0, + "grad_norm": 1.7043702833178804, + "language_loss": 0.7044152, + "learning_rate": 1.097341060694219e-07, + "loss": 0.78104472, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09411621, + "step": 14923, + "time_per_iteration": 2.4887611865997314 + }, + { + "auxiliary_loss_clip": 0.06407218, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.0125518, + "epoch": 0.8972794228167744, + "flos": 18375560714880.0, + "grad_norm": 1.9781381885926022, + "language_loss": 0.71156216, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.78828621, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10003662, + "step": 14924, + "time_per_iteration": 2.507704257965088 + }, + { + "auxiliary_loss_clip": 0.06404379, + "auxiliary_loss_mlp": 0.01261706, + "balance_loss_clip": 0.06271243, + "balance_loss_mlp": 0.01253141, + "epoch": 0.8973395460694423, + "flos": 23958974004480.0, + "grad_norm": 1.3820942229672155, + "language_loss": 0.72463107, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.80129194, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.08566284, + "step": 14925, + "time_per_iteration": 2.5084264278411865 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01263424, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01254007, + "epoch": 0.8973996693221103, + "flos": 24977458782720.0, + "grad_norm": 1.5685683957200127, + "language_loss": 0.82635689, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.90303278, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09417725, + "step": 14926, + "time_per_iteration": 2.557849884033203 + }, + { + "auxiliary_loss_clip": 0.06403212, + "auxiliary_loss_mlp": 0.01263645, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254275, + "epoch": 0.8974597925747783, + "flos": 25745997231360.0, + "grad_norm": 1.4572864051065582, + "language_loss": 0.79279351, + "learning_rate": 1.092257529095555e-07, + "loss": 0.86946213, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09375, + "step": 14927, + "time_per_iteration": 2.5682642459869385 + }, + { + "auxiliary_loss_clip": 0.06400602, + "auxiliary_loss_mlp": 0.01264213, + "balance_loss_clip": 0.06270526, + "balance_loss_mlp": 0.01255308, + "epoch": 0.8975199158274463, + "flos": 38081172816000.0, + "grad_norm": 1.7102877126425073, + "language_loss": 0.66823071, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.74487889, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14928, + "time_per_iteration": 2.6806201934814453 + }, + { + "auxiliary_loss_clip": 0.06407198, + "auxiliary_loss_mlp": 0.01262321, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01251282, + "epoch": 0.8975800390801142, + "flos": 25418875691520.0, + "grad_norm": 2.175076083160526, + "language_loss": 0.71158016, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.78827536, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11029053, + "step": 14929, + "time_per_iteration": 2.518965005874634 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.0125683, + "epoch": 0.8976401623327822, + "flos": 21765599602560.0, + "grad_norm": 1.5816996603880829, + "language_loss": 0.68028259, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.75698406, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09106445, + "step": 14930, + "time_per_iteration": 2.5001468658447266 + }, + { + "auxiliary_loss_clip": 0.06397609, + "auxiliary_loss_mlp": 0.01265308, + "balance_loss_clip": 0.06267622, + "balance_loss_mlp": 0.01256058, + "epoch": 0.8977002855854501, + "flos": 13850519546880.0, + "grad_norm": 3.507650532962027, + "language_loss": 0.74712485, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.82375401, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0925293, + "step": 14931, + "time_per_iteration": 2.468661069869995 + }, + { + "auxiliary_loss_clip": 0.06398958, + "auxiliary_loss_mlp": 0.01267981, + "balance_loss_clip": 0.06269293, + "balance_loss_mlp": 0.01258897, + "epoch": 0.8977604088381181, + "flos": 19433639347200.0, + "grad_norm": 1.6811603420532344, + "language_loss": 0.63567096, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.71234035, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09082031, + "step": 14932, + "time_per_iteration": 2.500734567642212 + }, + { + "auxiliary_loss_clip": 0.06395967, + "auxiliary_loss_mlp": 0.01263865, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01255425, + "epoch": 0.8978205320907862, + "flos": 22747802762880.0, + "grad_norm": 1.5617576374717, + "language_loss": 0.71711791, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.79371631, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08441162, + "step": 14933, + "time_per_iteration": 2.4802329540252686 + }, + { + "auxiliary_loss_clip": 0.06406559, + "auxiliary_loss_mlp": 0.01268041, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01257753, + "epoch": 0.8978806553434541, + "flos": 21366837221760.0, + "grad_norm": 1.528884069249085, + "language_loss": 0.74636477, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.82311076, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10284424, + "step": 14934, + "time_per_iteration": 2.5407958030700684 + }, + { + "auxiliary_loss_clip": 0.06400236, + "auxiliary_loss_mlp": 0.01266178, + "balance_loss_clip": 0.06271216, + "balance_loss_mlp": 0.01257029, + "epoch": 0.8979407785961221, + "flos": 20930661192960.0, + "grad_norm": 1.684910765856414, + "language_loss": 0.60720909, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.68387318, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09155273, + "step": 14935, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06396089, + "auxiliary_loss_mlp": 0.01262066, + "balance_loss_clip": 0.06269929, + "balance_loss_mlp": 0.0125303, + "epoch": 0.89800090184879, + "flos": 25236042082560.0, + "grad_norm": 1.7255902732774182, + "language_loss": 0.76495326, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.84153479, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09033203, + "step": 14936, + "time_per_iteration": 2.548093318939209 + }, + { + "auxiliary_loss_clip": 0.06402925, + "auxiliary_loss_mlp": 0.01262388, + "balance_loss_clip": 0.0627269, + "balance_loss_mlp": 0.01252905, + "epoch": 0.898061025101458, + "flos": 22568868368640.0, + "grad_norm": 1.5780818295841181, + "language_loss": 0.74487138, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.82152456, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09484863, + "step": 14937, + "time_per_iteration": 2.4871113300323486 + }, + { + "auxiliary_loss_clip": 0.06305996, + "auxiliary_loss_mlp": 0.01251202, + "balance_loss_clip": 0.06251696, + "balance_loss_mlp": 0.01250233, + "epoch": 0.8981211483541259, + "flos": 56208799699200.0, + "grad_norm": 0.829573035126938, + "language_loss": 0.63498247, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.71055448, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.54492188, + "router_z_loss_mlp": 0.00967407, + "step": 14938, + "time_per_iteration": 2.991299629211426 + }, + { + "auxiliary_loss_clip": 0.06397615, + "auxiliary_loss_mlp": 0.01263328, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01254179, + "epoch": 0.898181271606794, + "flos": 16397234616960.0, + "grad_norm": 3.7900138603468894, + "language_loss": 0.80554181, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.88215125, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09143066, + "step": 14939, + "time_per_iteration": 2.461031675338745 + }, + { + "auxiliary_loss_clip": 0.06305988, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06251763, + "balance_loss_mlp": 0.01252118, + "epoch": 0.8982413948594619, + "flos": 63461655809280.0, + "grad_norm": 0.7334559404863827, + "language_loss": 0.52954245, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.60513341, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00986481, + "step": 14940, + "time_per_iteration": 3.248729705810547 + }, + { + "auxiliary_loss_clip": 0.06402014, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01257294, + "epoch": 0.8983015181121299, + "flos": 21841810490880.0, + "grad_norm": 1.7036314435960453, + "language_loss": 0.77842438, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.85511851, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10107422, + "step": 14941, + "time_per_iteration": 4.003239870071411 + }, + { + "auxiliary_loss_clip": 0.06402576, + "auxiliary_loss_mlp": 0.01265073, + "balance_loss_clip": 0.06269994, + "balance_loss_mlp": 0.01255167, + "epoch": 0.8983616413647978, + "flos": 28957604849280.0, + "grad_norm": 1.9606451344783369, + "language_loss": 0.73512655, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.81180304, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09899902, + "step": 14942, + "time_per_iteration": 2.569955587387085 + }, + { + "auxiliary_loss_clip": 0.06404367, + "auxiliary_loss_mlp": 0.01264132, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01254571, + "epoch": 0.8984217646174658, + "flos": 17790820197120.0, + "grad_norm": 2.002654681143642, + "language_loss": 0.80248809, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.87917316, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09558105, + "step": 14943, + "time_per_iteration": 2.488431930541992 + }, + { + "auxiliary_loss_clip": 0.06405204, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06271496, + "balance_loss_mlp": 0.01258945, + "epoch": 0.8984818878701337, + "flos": 23411311718400.0, + "grad_norm": 1.5597743070922876, + "language_loss": 0.71681154, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.7935555, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10247803, + "step": 14944, + "time_per_iteration": 2.5067203044891357 + }, + { + "auxiliary_loss_clip": 0.06407298, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01254809, + "epoch": 0.8985420111228017, + "flos": 22352604180480.0, + "grad_norm": 2.1061094543474184, + "language_loss": 0.76275969, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.83948195, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10125732, + "step": 14945, + "time_per_iteration": 2.500641107559204 + }, + { + "auxiliary_loss_clip": 0.06413375, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06273663, + "balance_loss_mlp": 0.01258917, + "epoch": 0.8986021343754698, + "flos": 21398381084160.0, + "grad_norm": 2.9283306664334128, + "language_loss": 0.73861766, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.81544363, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10314941, + "step": 14946, + "time_per_iteration": 2.491835832595825 + }, + { + "auxiliary_loss_clip": 0.06401925, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.0125358, + "epoch": 0.8986622576281377, + "flos": 21331897269120.0, + "grad_norm": 2.0231368146788813, + "language_loss": 0.64790112, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.72455472, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09851074, + "step": 14947, + "time_per_iteration": 2.4926280975341797 + }, + { + "auxiliary_loss_clip": 0.06400159, + "auxiliary_loss_mlp": 0.01262141, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01253343, + "epoch": 0.8987223808808057, + "flos": 23995171768320.0, + "grad_norm": 1.824984607909439, + "language_loss": 0.70089561, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.77751863, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08795166, + "step": 14948, + "time_per_iteration": 2.546989679336548 + }, + { + "auxiliary_loss_clip": 0.06400745, + "auxiliary_loss_mlp": 0.01263069, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01253985, + "epoch": 0.8987825041334736, + "flos": 41510679776640.0, + "grad_norm": 1.7578036541733197, + "language_loss": 0.74855787, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.82519603, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09082031, + "step": 14949, + "time_per_iteration": 2.698312997817993 + }, + { + "auxiliary_loss_clip": 0.06405021, + "auxiliary_loss_mlp": 0.01265803, + "balance_loss_clip": 0.06272099, + "balance_loss_mlp": 0.01254871, + "epoch": 0.8988426273861416, + "flos": 27571817698560.0, + "grad_norm": 1.6083544850300273, + "language_loss": 0.75579, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.83249831, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10925293, + "step": 14950, + "time_per_iteration": 2.553276777267456 + }, + { + "auxiliary_loss_clip": 0.06403638, + "auxiliary_loss_mlp": 0.01263025, + "balance_loss_clip": 0.06273642, + "balance_loss_mlp": 0.01254541, + "epoch": 0.8989027506388095, + "flos": 17098408782720.0, + "grad_norm": 1.6063948284230318, + "language_loss": 0.66535282, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.74201941, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08486938, + "step": 14951, + "time_per_iteration": 2.487602472305298 + }, + { + "auxiliary_loss_clip": 0.06404173, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06266937, + "balance_loss_mlp": 0.01253956, + "epoch": 0.8989628738914776, + "flos": 20560843198080.0, + "grad_norm": 1.8566559318875047, + "language_loss": 0.74081647, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.81749177, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09393311, + "step": 14952, + "time_per_iteration": 4.035311937332153 + }, + { + "auxiliary_loss_clip": 0.06401406, + "auxiliary_loss_mlp": 0.01264061, + "balance_loss_clip": 0.06271611, + "balance_loss_mlp": 0.0125493, + "epoch": 0.8990229971441455, + "flos": 16256300849280.0, + "grad_norm": 2.420734028106449, + "language_loss": 0.56859446, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.64524913, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09136963, + "step": 14953, + "time_per_iteration": 2.4741392135620117 + }, + { + "auxiliary_loss_clip": 0.06400678, + "auxiliary_loss_mlp": 0.01262102, + "balance_loss_clip": 0.06271634, + "balance_loss_mlp": 0.0125247, + "epoch": 0.8990831203968135, + "flos": 21987817430400.0, + "grad_norm": 1.7963505164231723, + "language_loss": 0.82287514, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.89950299, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09637451, + "step": 14954, + "time_per_iteration": 2.5389609336853027 + }, + { + "auxiliary_loss_clip": 0.06401017, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06273876, + "balance_loss_mlp": 0.0125748, + "epoch": 0.8991432436494814, + "flos": 27453413479680.0, + "grad_norm": 1.9061442567744085, + "language_loss": 0.60138369, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.67806023, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09155273, + "step": 14955, + "time_per_iteration": 4.0195207595825195 + }, + { + "auxiliary_loss_clip": 0.06399333, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.0627117, + "balance_loss_mlp": 0.01253636, + "epoch": 0.8992033669021494, + "flos": 21586245937920.0, + "grad_norm": 1.9656216250623941, + "language_loss": 0.55445802, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.63107967, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09191895, + "step": 14956, + "time_per_iteration": 2.485630989074707 + }, + { + "auxiliary_loss_clip": 0.06401742, + "auxiliary_loss_mlp": 0.01259934, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01250648, + "epoch": 0.8992634901548173, + "flos": 28591644142080.0, + "grad_norm": 1.9186757999102584, + "language_loss": 0.80292857, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.87954533, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09283447, + "step": 14957, + "time_per_iteration": 2.549023151397705 + }, + { + "auxiliary_loss_clip": 0.06405012, + "auxiliary_loss_mlp": 0.01264553, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.01254486, + "epoch": 0.8993236134074853, + "flos": 19873966152960.0, + "grad_norm": 1.606617914343127, + "language_loss": 0.79137737, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.86807305, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10070801, + "step": 14958, + "time_per_iteration": 3.8852593898773193 + }, + { + "auxiliary_loss_clip": 0.06400818, + "auxiliary_loss_mlp": 0.0126224, + "balance_loss_clip": 0.06271718, + "balance_loss_mlp": 0.01253407, + "epoch": 0.8993837366601534, + "flos": 19396686896640.0, + "grad_norm": 2.106043903727993, + "language_loss": 0.74878645, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.82541704, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08837891, + "step": 14959, + "time_per_iteration": 2.491607427597046 + }, + { + "auxiliary_loss_clip": 0.06397241, + "auxiliary_loss_mlp": 0.01264655, + "balance_loss_clip": 0.06269586, + "balance_loss_mlp": 0.01255464, + "epoch": 0.8994438599128213, + "flos": 18557681564160.0, + "grad_norm": 1.7970677871166365, + "language_loss": 0.68824446, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.76486343, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09185791, + "step": 14960, + "time_per_iteration": 2.473184823989868 + }, + { + "auxiliary_loss_clip": 0.06401291, + "auxiliary_loss_mlp": 0.01261175, + "balance_loss_clip": 0.06271642, + "balance_loss_mlp": 0.01252157, + "epoch": 0.8995039831654893, + "flos": 24434785814400.0, + "grad_norm": 1.656362738673528, + "language_loss": 0.66098744, + "learning_rate": 1.049510991294591e-07, + "loss": 0.73761213, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09014893, + "step": 14961, + "time_per_iteration": 2.66253662109375 + }, + { + "auxiliary_loss_clip": 0.06398708, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01257105, + "epoch": 0.8995641064181572, + "flos": 21257656951680.0, + "grad_norm": 1.4284268544780132, + "language_loss": 0.83220261, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.90884578, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08505249, + "step": 14962, + "time_per_iteration": 2.490577220916748 + }, + { + "auxiliary_loss_clip": 0.06411661, + "auxiliary_loss_mlp": 0.0126439, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01253924, + "epoch": 0.8996242296708252, + "flos": 23520408134400.0, + "grad_norm": 1.7909885664561782, + "language_loss": 0.76536137, + "learning_rate": 1.047022340612298e-07, + "loss": 0.84212184, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10473633, + "step": 14963, + "time_per_iteration": 2.548292398452759 + }, + { + "auxiliary_loss_clip": 0.06311448, + "auxiliary_loss_mlp": 0.01255845, + "balance_loss_clip": 0.06257099, + "balance_loss_mlp": 0.01254884, + "epoch": 0.8996843529234931, + "flos": 62421872094720.0, + "grad_norm": 0.7636131914060387, + "language_loss": 0.57454842, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.65022135, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00959778, + "step": 14964, + "time_per_iteration": 2.9614195823669434 + }, + { + "auxiliary_loss_clip": 0.06410883, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01254307, + "epoch": 0.8997444761761612, + "flos": 24242602475520.0, + "grad_norm": 2.33036033552358, + "language_loss": 0.68011808, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.75687134, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10140991, + "step": 14965, + "time_per_iteration": 2.4943199157714844 + }, + { + "auxiliary_loss_clip": 0.06403812, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06271215, + "balance_loss_mlp": 0.01256216, + "epoch": 0.8998045994288291, + "flos": 21367508054400.0, + "grad_norm": 2.8566612226019354, + "language_loss": 0.72390759, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.8006044, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09649658, + "step": 14966, + "time_per_iteration": 2.480290412902832 + }, + { + "auxiliary_loss_clip": 0.06401855, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.01261618, + "epoch": 0.8998647226814971, + "flos": 28993760686080.0, + "grad_norm": 1.6461811578416619, + "language_loss": 0.7351234, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.81185579, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09765625, + "step": 14967, + "time_per_iteration": 2.5578274726867676 + }, + { + "auxiliary_loss_clip": 0.06400469, + "auxiliary_loss_mlp": 0.01260803, + "balance_loss_clip": 0.06269619, + "balance_loss_mlp": 0.01252011, + "epoch": 0.899924845934165, + "flos": 13630985049600.0, + "grad_norm": 1.7161192874601998, + "language_loss": 0.72534561, + "learning_rate": 1.040813291960323e-07, + "loss": 0.80195838, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08789062, + "step": 14968, + "time_per_iteration": 2.4440808296203613 + }, + { + "auxiliary_loss_clip": 0.06403413, + "auxiliary_loss_mlp": 0.01266071, + "balance_loss_clip": 0.06271084, + "balance_loss_mlp": 0.01256904, + "epoch": 0.899984969186833, + "flos": 20888258227200.0, + "grad_norm": 1.7973658286855019, + "language_loss": 0.71199846, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.78869331, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09173584, + "step": 14969, + "time_per_iteration": 2.4951353073120117 + }, + { + "auxiliary_loss_clip": 0.06404494, + "auxiliary_loss_mlp": 0.01261784, + "balance_loss_clip": 0.06270813, + "balance_loss_mlp": 0.01252337, + "epoch": 0.9000450924395009, + "flos": 20927894008320.0, + "grad_norm": 1.857601731037714, + "language_loss": 0.76268947, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.83935225, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09448242, + "step": 14970, + "time_per_iteration": 2.480330467224121 + }, + { + "auxiliary_loss_clip": 0.06401761, + "auxiliary_loss_mlp": 0.01264551, + "balance_loss_clip": 0.06269549, + "balance_loss_mlp": 0.01255348, + "epoch": 0.900105215692169, + "flos": 17170720456320.0, + "grad_norm": 1.5818903114690037, + "language_loss": 0.73086268, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.80752581, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09204102, + "step": 14971, + "time_per_iteration": 2.512716293334961 + }, + { + "auxiliary_loss_clip": 0.06404724, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06273608, + "balance_loss_mlp": 0.01254175, + "epoch": 0.900165338944837, + "flos": 19937053877760.0, + "grad_norm": 1.9745289708509002, + "language_loss": 0.82069004, + "learning_rate": 1.035858993572476e-07, + "loss": 0.89738262, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10357666, + "step": 14972, + "time_per_iteration": 2.4601757526397705 + }, + { + "auxiliary_loss_clip": 0.06408463, + "auxiliary_loss_mlp": 0.01264926, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01255592, + "epoch": 0.9002254621975049, + "flos": 16112599896960.0, + "grad_norm": 1.8818540963205237, + "language_loss": 0.81552333, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.89225721, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09332275, + "step": 14973, + "time_per_iteration": 2.475221633911133 + }, + { + "auxiliary_loss_clip": 0.06398419, + "auxiliary_loss_mlp": 0.01266711, + "balance_loss_clip": 0.06268209, + "balance_loss_mlp": 0.0125693, + "epoch": 0.9002855854501729, + "flos": 28483763610240.0, + "grad_norm": 1.7760523165463304, + "language_loss": 0.58510089, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.66175216, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09783936, + "step": 14974, + "time_per_iteration": 2.5352773666381836 + }, + { + "auxiliary_loss_clip": 0.06406291, + "auxiliary_loss_mlp": 0.01265924, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01256643, + "epoch": 0.9003457087028408, + "flos": 25637487793920.0, + "grad_norm": 1.8033115500772146, + "language_loss": 0.63577545, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.71249753, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09283447, + "step": 14975, + "time_per_iteration": 2.5415873527526855 + }, + { + "auxiliary_loss_clip": 0.06403071, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06269182, + "balance_loss_mlp": 0.01255445, + "epoch": 0.9004058319555088, + "flos": 24396323990400.0, + "grad_norm": 1.51522570202554, + "language_loss": 0.72969091, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.80637288, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09686279, + "step": 14976, + "time_per_iteration": 2.5184712409973145 + }, + { + "auxiliary_loss_clip": 0.06402969, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01255719, + "epoch": 0.9004659552081767, + "flos": 29066994754560.0, + "grad_norm": 1.8113879200430405, + "language_loss": 0.69898343, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.77565849, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08813477, + "step": 14977, + "time_per_iteration": 2.5655102729797363 + }, + { + "auxiliary_loss_clip": 0.06403376, + "auxiliary_loss_mlp": 0.0126245, + "balance_loss_clip": 0.06269954, + "balance_loss_mlp": 0.0125227, + "epoch": 0.9005260784608448, + "flos": 16769484380160.0, + "grad_norm": 4.2700223305485485, + "language_loss": 0.65910697, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.73576528, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10174561, + "step": 14978, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.06405294, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01256216, + "epoch": 0.9005862017135127, + "flos": 20382244220160.0, + "grad_norm": 1.7055654083923508, + "language_loss": 0.79123801, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.86795604, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10284424, + "step": 14979, + "time_per_iteration": 2.4874277114868164 + }, + { + "auxiliary_loss_clip": 0.06307672, + "auxiliary_loss_mlp": 0.012518, + "balance_loss_clip": 0.06253401, + "balance_loss_mlp": 0.0125079, + "epoch": 0.9006463249661807, + "flos": 67599101917440.0, + "grad_norm": 0.7137395392285222, + "language_loss": 0.52951163, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.60510641, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01009369, + "step": 14980, + "time_per_iteration": 3.1869754791259766 + }, + { + "auxiliary_loss_clip": 0.06407195, + "auxiliary_loss_mlp": 0.0126926, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01259426, + "epoch": 0.9007064482188486, + "flos": 28300888074240.0, + "grad_norm": 1.5679808464329743, + "language_loss": 0.82694447, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.90370905, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.0982666, + "step": 14981, + "time_per_iteration": 3.9160542488098145 + }, + { + "auxiliary_loss_clip": 0.06398074, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.06271156, + "balance_loss_mlp": 0.01255639, + "epoch": 0.9007665714715166, + "flos": 21622737191040.0, + "grad_norm": 1.400293048529382, + "language_loss": 0.81589913, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.8925302, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09387207, + "step": 14982, + "time_per_iteration": 2.5058610439300537 + }, + { + "auxiliary_loss_clip": 0.06395832, + "auxiliary_loss_mlp": 0.01263704, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.01255037, + "epoch": 0.9008266947241845, + "flos": 26549098289280.0, + "grad_norm": 1.7768075203157598, + "language_loss": 0.7178492, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.79444456, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.08660889, + "step": 14983, + "time_per_iteration": 2.5296106338500977 + }, + { + "auxiliary_loss_clip": 0.06400231, + "auxiliary_loss_mlp": 0.01265711, + "balance_loss_clip": 0.06272098, + "balance_loss_mlp": 0.01257039, + "epoch": 0.9008868179768525, + "flos": 23116907998080.0, + "grad_norm": 1.269960431360642, + "language_loss": 0.75048274, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.82714218, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08660889, + "step": 14984, + "time_per_iteration": 2.544950246810913 + }, + { + "auxiliary_loss_clip": 0.06398641, + "auxiliary_loss_mlp": 0.01266345, + "balance_loss_clip": 0.06270674, + "balance_loss_mlp": 0.01256802, + "epoch": 0.9009469412295206, + "flos": 19066546609920.0, + "grad_norm": 1.3816348199344486, + "language_loss": 0.70344037, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.78009021, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09552002, + "step": 14985, + "time_per_iteration": 2.4892797470092773 + }, + { + "auxiliary_loss_clip": 0.06402488, + "auxiliary_loss_mlp": 0.01263035, + "balance_loss_clip": 0.06268957, + "balance_loss_mlp": 0.01253617, + "epoch": 0.9010070644821885, + "flos": 23229065088000.0, + "grad_norm": 1.882791144388424, + "language_loss": 0.70384359, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7804988, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09411621, + "step": 14986, + "time_per_iteration": 2.504513740539551 + }, + { + "auxiliary_loss_clip": 0.06405906, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06271657, + "balance_loss_mlp": 0.01254661, + "epoch": 0.9010671877348565, + "flos": 17390674224000.0, + "grad_norm": 1.5819824224389398, + "language_loss": 0.76687872, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.84357452, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09020996, + "step": 14987, + "time_per_iteration": 2.469608783721924 + }, + { + "auxiliary_loss_clip": 0.06404476, + "auxiliary_loss_mlp": 0.01265663, + "balance_loss_clip": 0.06274141, + "balance_loss_mlp": 0.01256537, + "epoch": 0.9011273109875244, + "flos": 21914625288960.0, + "grad_norm": 1.7711059610657074, + "language_loss": 0.74044967, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.81715107, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09118652, + "step": 14988, + "time_per_iteration": 2.5563955307006836 + }, + { + "auxiliary_loss_clip": 0.06406365, + "auxiliary_loss_mlp": 0.01266135, + "balance_loss_clip": 0.06271102, + "balance_loss_mlp": 0.01255574, + "epoch": 0.9011874342401924, + "flos": 24067651150080.0, + "grad_norm": 1.7919012597313317, + "language_loss": 0.6937961, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.7705211, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10565186, + "step": 14989, + "time_per_iteration": 2.521286725997925 + }, + { + "auxiliary_loss_clip": 0.06404412, + "auxiliary_loss_mlp": 0.01264705, + "balance_loss_clip": 0.06270802, + "balance_loss_mlp": 0.01255889, + "epoch": 0.9012475574928603, + "flos": 16763572667520.0, + "grad_norm": 1.9829784311923562, + "language_loss": 0.80470562, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.88139677, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.08825684, + "step": 14990, + "time_per_iteration": 2.47330379486084 + }, + { + "auxiliary_loss_clip": 0.06409752, + "auxiliary_loss_mlp": 0.01264204, + "balance_loss_clip": 0.06274055, + "balance_loss_mlp": 0.01254489, + "epoch": 0.9013076807455284, + "flos": 19976689658880.0, + "grad_norm": 1.6858389926968038, + "language_loss": 0.78232729, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.85906684, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09710693, + "step": 14991, + "time_per_iteration": 2.4976749420166016 + }, + { + "auxiliary_loss_clip": 0.06310493, + "auxiliary_loss_mlp": 0.01250757, + "balance_loss_clip": 0.06256165, + "balance_loss_mlp": 0.01249734, + "epoch": 0.9013678039981963, + "flos": 65200070868480.0, + "grad_norm": 0.771418761968222, + "language_loss": 0.59844536, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.67405784, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01023102, + "step": 14992, + "time_per_iteration": 4.404303073883057 + }, + { + "auxiliary_loss_clip": 0.06398614, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06269006, + "balance_loss_mlp": 0.01256423, + "epoch": 0.9014279272508643, + "flos": 20527370691840.0, + "grad_norm": 2.0885867633446833, + "language_loss": 0.83284277, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.90949053, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09741211, + "step": 14993, + "time_per_iteration": 2.501352071762085 + }, + { + "auxiliary_loss_clip": 0.06403168, + "auxiliary_loss_mlp": 0.01264173, + "balance_loss_clip": 0.06271326, + "balance_loss_mlp": 0.0125457, + "epoch": 0.9014880505035322, + "flos": 17314421408640.0, + "grad_norm": 1.9120593810256001, + "language_loss": 0.73393512, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.81060851, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09606934, + "step": 14994, + "time_per_iteration": 2.463254928588867 + }, + { + "auxiliary_loss_clip": 0.06396592, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06269167, + "balance_loss_mlp": 0.01255076, + "epoch": 0.9015481737562002, + "flos": 28410445687680.0, + "grad_norm": 15.899977864830745, + "language_loss": 0.64903772, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.72564423, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08978271, + "step": 14995, + "time_per_iteration": 3.9784598350524902 + }, + { + "auxiliary_loss_clip": 0.06406161, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01256371, + "epoch": 0.9016082970088681, + "flos": 29760454344960.0, + "grad_norm": 1.9295019510354385, + "language_loss": 0.67002177, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.7467407, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09368896, + "step": 14996, + "time_per_iteration": 2.565322160720825 + }, + { + "auxiliary_loss_clip": 0.06399288, + "auxiliary_loss_mlp": 0.01261496, + "balance_loss_clip": 0.06269487, + "balance_loss_mlp": 0.01252764, + "epoch": 0.9016684202615362, + "flos": 23519905009920.0, + "grad_norm": 1.9107555524376416, + "language_loss": 0.66491365, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.74152148, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08743286, + "step": 14997, + "time_per_iteration": 3.9145309925079346 + }, + { + "auxiliary_loss_clip": 0.06399675, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06270206, + "balance_loss_mlp": 0.01255658, + "epoch": 0.9017285435142042, + "flos": 16984323048960.0, + "grad_norm": 1.6649709431983433, + "language_loss": 0.77622521, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.85287696, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09851074, + "step": 14998, + "time_per_iteration": 2.529517412185669 + }, + { + "auxiliary_loss_clip": 0.06404671, + "auxiliary_loss_mlp": 0.01262218, + "balance_loss_clip": 0.06271236, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9017886667668721, + "flos": 21399051916800.0, + "grad_norm": 1.59161018782867, + "language_loss": 0.75096691, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.82763588, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09387207, + "step": 14999, + "time_per_iteration": 2.490741014480591 + }, + { + "auxiliary_loss_clip": 0.06399871, + "auxiliary_loss_mlp": 0.01266503, + "balance_loss_clip": 0.06271258, + "balance_loss_mlp": 0.01256942, + "epoch": 0.9018487900195401, + "flos": 21002972866560.0, + "grad_norm": 2.103504102903878, + "language_loss": 0.75620949, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.83287323, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09564209, + "step": 15000, + "time_per_iteration": 2.588860511779785 + }, + { + "auxiliary_loss_clip": 0.0639964, + "auxiliary_loss_mlp": 0.01266266, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01257296, + "epoch": 0.901908913272208, + "flos": 53370085478400.0, + "grad_norm": 2.339615199248997, + "language_loss": 0.81363082, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.8902899, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08966064, + "step": 15001, + "time_per_iteration": 2.7768962383270264 + }, + { + "auxiliary_loss_clip": 0.06399134, + "auxiliary_loss_mlp": 0.01263715, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01254906, + "epoch": 0.901969036524876, + "flos": 22096201086720.0, + "grad_norm": 1.7575723088457134, + "language_loss": 0.78756481, + "learning_rate": 9.990687143794407e-08, + "loss": 0.86419332, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0880127, + "step": 15002, + "time_per_iteration": 2.5058481693267822 + }, + { + "auxiliary_loss_clip": 0.0640651, + "auxiliary_loss_mlp": 0.01263017, + "balance_loss_clip": 0.06274793, + "balance_loss_mlp": 0.0125295, + "epoch": 0.9020291597775439, + "flos": 23840653639680.0, + "grad_norm": 1.9143661946542763, + "language_loss": 0.68313885, + "learning_rate": 9.978535328195347e-08, + "loss": 0.75983411, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10076904, + "step": 15003, + "time_per_iteration": 2.474975824356079 + }, + { + "auxiliary_loss_clip": 0.064068, + "auxiliary_loss_mlp": 0.01263994, + "balance_loss_clip": 0.06272157, + "balance_loss_mlp": 0.01254171, + "epoch": 0.902089283030212, + "flos": 18330767907840.0, + "grad_norm": 1.6505314719382027, + "language_loss": 0.86296797, + "learning_rate": 9.9663907182292e-08, + "loss": 0.93967593, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09820557, + "step": 15004, + "time_per_iteration": 2.4478914737701416 + }, + { + "auxiliary_loss_clip": 0.06403968, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 0.06270397, + "balance_loss_mlp": 0.01255612, + "epoch": 0.9021494062828799, + "flos": 24177208763520.0, + "grad_norm": 2.1879472494001546, + "language_loss": 0.72795928, + "learning_rate": 9.954253314356575e-08, + "loss": 0.80465186, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09686279, + "step": 15005, + "time_per_iteration": 2.4971089363098145 + }, + { + "auxiliary_loss_clip": 0.06404206, + "auxiliary_loss_mlp": 0.01265568, + "balance_loss_clip": 0.06268016, + "balance_loss_mlp": 0.01255602, + "epoch": 0.9022095295355479, + "flos": 21623366096640.0, + "grad_norm": 1.793458776106301, + "language_loss": 0.71351212, + "learning_rate": 9.942123117037748e-08, + "loss": 0.79020989, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09967041, + "step": 15006, + "time_per_iteration": 2.4973998069763184 + }, + { + "auxiliary_loss_clip": 0.06405459, + "auxiliary_loss_mlp": 0.01263428, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01254947, + "epoch": 0.9022696527882158, + "flos": 18730871953920.0, + "grad_norm": 1.8715422678325178, + "language_loss": 0.84960949, + "learning_rate": 9.930000126732618e-08, + "loss": 0.92629838, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08477783, + "step": 15007, + "time_per_iteration": 2.4507057666778564 + }, + { + "auxiliary_loss_clip": 0.06399123, + "auxiliary_loss_mlp": 0.01264283, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9023297760408838, + "flos": 26768548932480.0, + "grad_norm": 1.4952724913749835, + "language_loss": 0.78544199, + "learning_rate": 9.917884343900928e-08, + "loss": 0.8620761, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08966064, + "step": 15008, + "time_per_iteration": 2.5391016006469727 + }, + { + "auxiliary_loss_clip": 0.06395697, + "auxiliary_loss_mlp": 0.01263912, + "balance_loss_clip": 0.06271064, + "balance_loss_mlp": 0.01255159, + "epoch": 0.9023898992935517, + "flos": 20528921992320.0, + "grad_norm": 2.089305963207464, + "language_loss": 0.73686892, + "learning_rate": 9.905775769002156e-08, + "loss": 0.813465, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08743286, + "step": 15009, + "time_per_iteration": 2.4703476428985596 + }, + { + "auxiliary_loss_clip": 0.06399488, + "auxiliary_loss_mlp": 0.01262587, + "balance_loss_clip": 0.06270318, + "balance_loss_mlp": 0.01252937, + "epoch": 0.9024500225462198, + "flos": 17462315064960.0, + "grad_norm": 1.6513544611324535, + "language_loss": 0.73667175, + "learning_rate": 9.893674402495399e-08, + "loss": 0.8132925, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09649658, + "step": 15010, + "time_per_iteration": 2.5722885131835938 + }, + { + "auxiliary_loss_clip": 0.06401055, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06269281, + "balance_loss_mlp": 0.01253685, + "epoch": 0.9025101457988878, + "flos": 20819887695360.0, + "grad_norm": 1.8237598528390848, + "language_loss": 0.74242365, + "learning_rate": 9.881580244839538e-08, + "loss": 0.81906629, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09521484, + "step": 15011, + "time_per_iteration": 2.4827427864074707 + }, + { + "auxiliary_loss_clip": 0.06407499, + "auxiliary_loss_mlp": 0.01263501, + "balance_loss_clip": 0.0627194, + "balance_loss_mlp": 0.01253529, + "epoch": 0.9025702690515557, + "flos": 19032445198080.0, + "grad_norm": 1.7995959341286187, + "language_loss": 0.73437095, + "learning_rate": 9.869493296493204e-08, + "loss": 0.81108093, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09979248, + "step": 15012, + "time_per_iteration": 2.4940521717071533 + }, + { + "auxiliary_loss_clip": 0.06397925, + "auxiliary_loss_mlp": 0.01264675, + "balance_loss_clip": 0.06269205, + "balance_loss_mlp": 0.01255406, + "epoch": 0.9026303923042237, + "flos": 19688952337920.0, + "grad_norm": 1.48602837314537, + "language_loss": 0.69452763, + "learning_rate": 9.857413557914763e-08, + "loss": 0.77115357, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09259033, + "step": 15013, + "time_per_iteration": 2.4835736751556396 + }, + { + "auxiliary_loss_clip": 0.06398869, + "auxiliary_loss_mlp": 0.01260522, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01251594, + "epoch": 0.9026905155568916, + "flos": 24615019946880.0, + "grad_norm": 1.7650439718162378, + "language_loss": 0.73028564, + "learning_rate": 9.845341029562249e-08, + "loss": 0.80687964, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0892334, + "step": 15014, + "time_per_iteration": 2.545559883117676 + }, + { + "auxiliary_loss_clip": 0.06403096, + "auxiliary_loss_mlp": 0.01264563, + "balance_loss_clip": 0.06270044, + "balance_loss_mlp": 0.01254896, + "epoch": 0.9027506388095596, + "flos": 20528041524480.0, + "grad_norm": 1.7507431286300652, + "language_loss": 0.72524196, + "learning_rate": 9.833275711893474e-08, + "loss": 0.80191857, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09661865, + "step": 15015, + "time_per_iteration": 2.4903807640075684 + }, + { + "auxiliary_loss_clip": 0.06400931, + "auxiliary_loss_mlp": 0.01265325, + "balance_loss_clip": 0.06269611, + "balance_loss_mlp": 0.01256021, + "epoch": 0.9028107620622275, + "flos": 22791211977600.0, + "grad_norm": 2.296107301723219, + "language_loss": 0.69238591, + "learning_rate": 9.821217605365895e-08, + "loss": 0.76904845, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09295654, + "step": 15016, + "time_per_iteration": 2.504646062850952 + }, + { + "auxiliary_loss_clip": 0.06400882, + "auxiliary_loss_mlp": 0.01265162, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01256323, + "epoch": 0.9028708853148956, + "flos": 25417534026240.0, + "grad_norm": 1.7870514242976832, + "language_loss": 0.70508265, + "learning_rate": 9.809166710436855e-08, + "loss": 0.78174311, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08837891, + "step": 15017, + "time_per_iteration": 2.5365939140319824 + }, + { + "auxiliary_loss_clip": 0.06402348, + "auxiliary_loss_mlp": 0.01266381, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01256856, + "epoch": 0.9029310085675635, + "flos": 21877714765440.0, + "grad_norm": 1.5325047994601255, + "language_loss": 0.69792432, + "learning_rate": 9.797123027563237e-08, + "loss": 0.77461159, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09527588, + "step": 15018, + "time_per_iteration": 2.566941738128662 + }, + { + "auxiliary_loss_clip": 0.06402241, + "auxiliary_loss_mlp": 0.01263584, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01254047, + "epoch": 0.9029911318202315, + "flos": 26221725187200.0, + "grad_norm": 1.7617066238132792, + "language_loss": 0.69269657, + "learning_rate": 9.785086557201782e-08, + "loss": 0.76935482, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09545898, + "step": 15019, + "time_per_iteration": 2.5253076553344727 + }, + { + "auxiliary_loss_clip": 0.06397457, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01256363, + "epoch": 0.9030512550728994, + "flos": 15966886446720.0, + "grad_norm": 1.889114929079113, + "language_loss": 0.7230109, + "learning_rate": 9.773057299808951e-08, + "loss": 0.79963273, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08374023, + "step": 15020, + "time_per_iteration": 2.468628406524658 + }, + { + "auxiliary_loss_clip": 0.06404897, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06270586, + "balance_loss_mlp": 0.01258916, + "epoch": 0.9031113783255674, + "flos": 23994375154560.0, + "grad_norm": 1.4194454202400997, + "language_loss": 0.74583924, + "learning_rate": 9.7610352558408e-08, + "loss": 0.82257438, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09698486, + "step": 15021, + "time_per_iteration": 3.985873222351074 + }, + { + "auxiliary_loss_clip": 0.06407882, + "auxiliary_loss_mlp": 0.01264222, + "balance_loss_clip": 0.06272886, + "balance_loss_mlp": 0.01254, + "epoch": 0.9031715015782353, + "flos": 22243843180800.0, + "grad_norm": 2.5045903448395137, + "language_loss": 0.73161501, + "learning_rate": 9.749020425753251e-08, + "loss": 0.80833614, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10217285, + "step": 15022, + "time_per_iteration": 2.5113275051116943 + }, + { + "auxiliary_loss_clip": 0.06393677, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01257975, + "epoch": 0.9032316248309034, + "flos": 26330402332800.0, + "grad_norm": 3.967318803725848, + "language_loss": 0.72854298, + "learning_rate": 9.737012810001943e-08, + "loss": 0.80515379, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.09423828, + "step": 15023, + "time_per_iteration": 2.5420546531677246 + }, + { + "auxiliary_loss_clip": 0.06403374, + "auxiliary_loss_mlp": 0.01262483, + "balance_loss_clip": 0.06272282, + "balance_loss_mlp": 0.01253543, + "epoch": 0.9032917480835713, + "flos": 22643066759040.0, + "grad_norm": 1.6550162923878977, + "language_loss": 0.83047354, + "learning_rate": 9.725012409042155e-08, + "loss": 0.90713215, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08929443, + "step": 15024, + "time_per_iteration": 2.4915647506713867 + }, + { + "auxiliary_loss_clip": 0.06401648, + "auxiliary_loss_mlp": 0.0126249, + "balance_loss_clip": 0.06268153, + "balance_loss_mlp": 0.01253245, + "epoch": 0.9033518713362393, + "flos": 23885614154880.0, + "grad_norm": 1.4118760042972751, + "language_loss": 0.69764483, + "learning_rate": 9.713019223328966e-08, + "loss": 0.77428621, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09246826, + "step": 15025, + "time_per_iteration": 2.5418436527252197 + }, + { + "auxiliary_loss_clip": 0.0639978, + "auxiliary_loss_mlp": 0.01265465, + "balance_loss_clip": 0.06270677, + "balance_loss_mlp": 0.01256614, + "epoch": 0.9034119945889073, + "flos": 26912333738880.0, + "grad_norm": 1.6472456604256864, + "language_loss": 0.77497172, + "learning_rate": 9.70103325331717e-08, + "loss": 0.85162413, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08856201, + "step": 15026, + "time_per_iteration": 2.542853355407715 + }, + { + "auxiliary_loss_clip": 0.0640185, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06272145, + "balance_loss_mlp": 0.01255636, + "epoch": 0.9034721178415752, + "flos": 20856462802560.0, + "grad_norm": 1.7153056741233828, + "language_loss": 0.69028974, + "learning_rate": 9.68905449946129e-08, + "loss": 0.76695728, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09265137, + "step": 15027, + "time_per_iteration": 2.541903018951416 + }, + { + "auxiliary_loss_clip": 0.06398702, + "auxiliary_loss_mlp": 0.01262434, + "balance_loss_clip": 0.06273375, + "balance_loss_mlp": 0.01253147, + "epoch": 0.9035322410942432, + "flos": 22240447090560.0, + "grad_norm": 1.5068481483988292, + "language_loss": 0.75781077, + "learning_rate": 9.677082962215477e-08, + "loss": 0.83442211, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.09283447, + "step": 15028, + "time_per_iteration": 2.5198581218719482 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01264092, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.01254365, + "epoch": 0.9035923643469111, + "flos": 25930843338240.0, + "grad_norm": 1.6223052048522015, + "language_loss": 0.69506884, + "learning_rate": 9.665118642033765e-08, + "loss": 0.77172744, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09735107, + "step": 15029, + "time_per_iteration": 2.587470531463623 + }, + { + "auxiliary_loss_clip": 0.06409352, + "auxiliary_loss_mlp": 0.01263235, + "balance_loss_clip": 0.06274136, + "balance_loss_mlp": 0.01253246, + "epoch": 0.9036524875995792, + "flos": 20346088383360.0, + "grad_norm": 1.9631111792955274, + "language_loss": 0.74286699, + "learning_rate": 9.653161539369858e-08, + "loss": 0.81959289, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09991455, + "step": 15030, + "time_per_iteration": 2.503896951675415 + }, + { + "auxiliary_loss_clip": 0.06404515, + "auxiliary_loss_mlp": 0.01261624, + "balance_loss_clip": 0.0626976, + "balance_loss_mlp": 0.01251652, + "epoch": 0.9037126108522471, + "flos": 40124137939200.0, + "grad_norm": 1.6436403874655139, + "language_loss": 0.6833986, + "learning_rate": 9.641211654677151e-08, + "loss": 0.76006001, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09973145, + "step": 15031, + "time_per_iteration": 4.073733329772949 + }, + { + "auxiliary_loss_clip": 0.06398544, + "auxiliary_loss_mlp": 0.01262429, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01253322, + "epoch": 0.9037727341049151, + "flos": 23338874263680.0, + "grad_norm": 1.465363790750211, + "language_loss": 0.7664578, + "learning_rate": 9.629268988408723e-08, + "loss": 0.84306753, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09112549, + "step": 15032, + "time_per_iteration": 2.532316207885742 + }, + { + "auxiliary_loss_clip": 0.06404598, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06271706, + "balance_loss_mlp": 0.01256142, + "epoch": 0.903832857357583, + "flos": 12827506648320.0, + "grad_norm": 1.7777263252161932, + "language_loss": 0.75482416, + "learning_rate": 9.617333541017502e-08, + "loss": 0.83152729, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09564209, + "step": 15033, + "time_per_iteration": 2.4739763736724854 + }, + { + "auxiliary_loss_clip": 0.0640469, + "auxiliary_loss_mlp": 0.01261941, + "balance_loss_clip": 0.06270737, + "balance_loss_mlp": 0.01252571, + "epoch": 0.903892980610251, + "flos": 25710176810880.0, + "grad_norm": 1.6001227374225993, + "language_loss": 0.73648345, + "learning_rate": 9.605405312956105e-08, + "loss": 0.81314975, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09375, + "step": 15034, + "time_per_iteration": 2.6218338012695312 + }, + { + "auxiliary_loss_clip": 0.06400965, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06270188, + "balance_loss_mlp": 0.01258414, + "epoch": 0.9039531038629189, + "flos": 14689357171200.0, + "grad_norm": 1.6406698929246424, + "language_loss": 0.63630551, + "learning_rate": 9.593484304676791e-08, + "loss": 0.71299291, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09356689, + "step": 15035, + "time_per_iteration": 3.9817230701446533 + }, + { + "auxiliary_loss_clip": 0.06408051, + "auxiliary_loss_mlp": 0.01264822, + "balance_loss_clip": 0.0627642, + "balance_loss_mlp": 0.01254773, + "epoch": 0.904013227115587, + "flos": 24031830729600.0, + "grad_norm": 2.2548052275485717, + "language_loss": 0.61979508, + "learning_rate": 9.581570516631643e-08, + "loss": 0.69652379, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.1005249, + "step": 15036, + "time_per_iteration": 2.5301129817962646 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01266003, + "balance_loss_clip": 0.06272119, + "balance_loss_mlp": 0.01257683, + "epoch": 0.9040733503682549, + "flos": 22863020526720.0, + "grad_norm": 1.5445550025492283, + "language_loss": 0.8279326, + "learning_rate": 9.569663949272455e-08, + "loss": 0.90457696, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08312988, + "step": 15037, + "time_per_iteration": 3.9757161140441895 + }, + { + "auxiliary_loss_clip": 0.0640467, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.0627031, + "balance_loss_mlp": 0.01252652, + "epoch": 0.9041334736209229, + "flos": 19981175852160.0, + "grad_norm": 3.8362695019003703, + "language_loss": 0.6746912, + "learning_rate": 9.557764603050667e-08, + "loss": 0.75135684, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09240723, + "step": 15038, + "time_per_iteration": 2.483499765396118 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06270482, + "balance_loss_mlp": 0.01255128, + "epoch": 0.9041935968735909, + "flos": 17536387674240.0, + "grad_norm": 1.9515146557246112, + "language_loss": 0.75760317, + "learning_rate": 9.545872478417494e-08, + "loss": 0.83424991, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09356689, + "step": 15039, + "time_per_iteration": 2.4685962200164795 + }, + { + "auxiliary_loss_clip": 0.06397585, + "auxiliary_loss_mlp": 0.01264821, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.0125575, + "epoch": 0.9042537201262588, + "flos": 22786138805760.0, + "grad_norm": 1.4938055012181715, + "language_loss": 0.70288754, + "learning_rate": 9.533987575823977e-08, + "loss": 0.77951157, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09069824, + "step": 15040, + "time_per_iteration": 2.491750717163086 + }, + { + "auxiliary_loss_clip": 0.0639802, + "auxiliary_loss_mlp": 0.01262156, + "balance_loss_clip": 0.06270084, + "balance_loss_mlp": 0.01252995, + "epoch": 0.9043138433789268, + "flos": 20601778717440.0, + "grad_norm": 1.6249589859719578, + "language_loss": 0.67891502, + "learning_rate": 9.522109895720709e-08, + "loss": 0.75551683, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09155273, + "step": 15041, + "time_per_iteration": 2.4903454780578613 + }, + { + "auxiliary_loss_clip": 0.06401966, + "auxiliary_loss_mlp": 0.01265404, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01255808, + "epoch": 0.9043739666315948, + "flos": 32971223422080.0, + "grad_norm": 1.8083812356166467, + "language_loss": 0.5776667, + "learning_rate": 9.510239438558155e-08, + "loss": 0.65434039, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09594727, + "step": 15042, + "time_per_iteration": 2.6052052974700928 + }, + { + "auxiliary_loss_clip": 0.06309783, + "auxiliary_loss_mlp": 0.01252944, + "balance_loss_clip": 0.0625516, + "balance_loss_mlp": 0.01251936, + "epoch": 0.9044340898842628, + "flos": 67316563549440.0, + "grad_norm": 0.7739673625252199, + "language_loss": 0.56937176, + "learning_rate": 9.498376204786351e-08, + "loss": 0.64499903, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01007843, + "step": 15043, + "time_per_iteration": 3.1082680225372314 + }, + { + "auxiliary_loss_clip": 0.06401354, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06270433, + "balance_loss_mlp": 0.01252353, + "epoch": 0.9044942131369307, + "flos": 17719053575040.0, + "grad_norm": 1.5454963743123358, + "language_loss": 0.70180726, + "learning_rate": 9.486520194855274e-08, + "loss": 0.77844226, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09802246, + "step": 15044, + "time_per_iteration": 2.512294054031372 + }, + { + "auxiliary_loss_clip": 0.06407118, + "auxiliary_loss_mlp": 0.01268666, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01258509, + "epoch": 0.9045543363895987, + "flos": 17826137493120.0, + "grad_norm": 2.078656560936693, + "language_loss": 0.6995939, + "learning_rate": 9.474671409214407e-08, + "loss": 0.77635169, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10162354, + "step": 15045, + "time_per_iteration": 2.4667201042175293 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01266009, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01255948, + "epoch": 0.9046144596422666, + "flos": 21879349920000.0, + "grad_norm": 6.184482867221641, + "language_loss": 0.66192079, + "learning_rate": 9.462829848313081e-08, + "loss": 0.73862171, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10064697, + "step": 15046, + "time_per_iteration": 2.486665964126587 + }, + { + "auxiliary_loss_clip": 0.06403056, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.0626939, + "balance_loss_mlp": 0.0125382, + "epoch": 0.9046745828949346, + "flos": 17677866493440.0, + "grad_norm": 1.9702778577435238, + "language_loss": 0.6221115, + "learning_rate": 9.450995512600379e-08, + "loss": 0.69878078, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10058594, + "step": 15047, + "time_per_iteration": 2.4436275959014893 + }, + { + "auxiliary_loss_clip": 0.06400335, + "auxiliary_loss_mlp": 0.01266598, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01257502, + "epoch": 0.9047347061476025, + "flos": 25709631759360.0, + "grad_norm": 1.433089504689409, + "language_loss": 0.71434736, + "learning_rate": 9.439168402525032e-08, + "loss": 0.7910167, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09094238, + "step": 15048, + "time_per_iteration": 2.529222249984741 + }, + { + "auxiliary_loss_clip": 0.06401604, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01255853, + "epoch": 0.9047948294002706, + "flos": 15163449972480.0, + "grad_norm": 1.9513151131510529, + "language_loss": 0.75001335, + "learning_rate": 9.427348518535483e-08, + "loss": 0.82668418, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09631348, + "step": 15049, + "time_per_iteration": 2.481271743774414 + }, + { + "auxiliary_loss_clip": 0.06397744, + "auxiliary_loss_mlp": 0.01262639, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01253204, + "epoch": 0.9048549526529385, + "flos": 21878846795520.0, + "grad_norm": 2.2351800902186243, + "language_loss": 0.75558716, + "learning_rate": 9.415535861079993e-08, + "loss": 0.83219099, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09436035, + "step": 15050, + "time_per_iteration": 2.6334476470947266 + }, + { + "auxiliary_loss_clip": 0.06403841, + "auxiliary_loss_mlp": 0.01262044, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.01252353, + "epoch": 0.9049150759056065, + "flos": 23552790537600.0, + "grad_norm": 1.7362546421895346, + "language_loss": 0.82079089, + "learning_rate": 9.403730430606472e-08, + "loss": 0.89744979, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09692383, + "step": 15051, + "time_per_iteration": 2.523456573486328 + }, + { + "auxiliary_loss_clip": 0.06402219, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06270407, + "balance_loss_mlp": 0.01254336, + "epoch": 0.9049751991582745, + "flos": 19651957960320.0, + "grad_norm": 1.966519944539865, + "language_loss": 0.89343834, + "learning_rate": 9.391932227562582e-08, + "loss": 0.97009277, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08898926, + "step": 15052, + "time_per_iteration": 2.478151798248291 + }, + { + "auxiliary_loss_clip": 0.06406327, + "auxiliary_loss_mlp": 0.012654, + "balance_loss_clip": 0.06270624, + "balance_loss_mlp": 0.01255613, + "epoch": 0.9050353224109424, + "flos": 15601638499200.0, + "grad_norm": 2.0979073593011495, + "language_loss": 0.77037603, + "learning_rate": 9.380141252395724e-08, + "loss": 0.84709334, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09777832, + "step": 15053, + "time_per_iteration": 2.4709739685058594 + }, + { + "auxiliary_loss_clip": 0.06399354, + "auxiliary_loss_mlp": 0.0126397, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.0125497, + "epoch": 0.9050954456636104, + "flos": 28191078898560.0, + "grad_norm": 1.875148681506397, + "language_loss": 0.73177737, + "learning_rate": 9.368357505553049e-08, + "loss": 0.80841064, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09002686, + "step": 15054, + "time_per_iteration": 2.5475215911865234 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06272063, + "balance_loss_mlp": 0.01255804, + "epoch": 0.9051555689162784, + "flos": 25737444115200.0, + "grad_norm": 1.5847730284358719, + "language_loss": 0.83485198, + "learning_rate": 9.356580987481333e-08, + "loss": 0.91151857, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0881958, + "step": 15055, + "time_per_iteration": 2.538119077682495 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01253405, + "epoch": 0.9052156921689464, + "flos": 23263795405440.0, + "grad_norm": 1.5354699500322193, + "language_loss": 0.85279965, + "learning_rate": 9.344811698627176e-08, + "loss": 0.92942894, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09387207, + "step": 15056, + "time_per_iteration": 2.523686408996582 + }, + { + "auxiliary_loss_clip": 0.06402357, + "auxiliary_loss_mlp": 0.01267292, + "balance_loss_clip": 0.06270941, + "balance_loss_mlp": 0.01258047, + "epoch": 0.9052758154216143, + "flos": 29571038190720.0, + "grad_norm": 1.8112643765194574, + "language_loss": 0.72546428, + "learning_rate": 9.333049639436863e-08, + "loss": 0.80216074, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09246826, + "step": 15057, + "time_per_iteration": 2.587482213973999 + }, + { + "auxiliary_loss_clip": 0.06398334, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.0125434, + "epoch": 0.9053359386742823, + "flos": 22134285567360.0, + "grad_norm": 4.2714331701731885, + "language_loss": 0.81114525, + "learning_rate": 9.321294810356418e-08, + "loss": 0.88775909, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0871582, + "step": 15058, + "time_per_iteration": 2.5192415714263916 + }, + { + "auxiliary_loss_clip": 0.06307732, + "auxiliary_loss_mlp": 0.01250617, + "balance_loss_clip": 0.06253529, + "balance_loss_mlp": 0.01249746, + "epoch": 0.9053960619269502, + "flos": 67112332421760.0, + "grad_norm": 0.6586954372577108, + "language_loss": 0.51446468, + "learning_rate": 9.309547211831592e-08, + "loss": 0.59004819, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00873566, + "step": 15059, + "time_per_iteration": 3.2848002910614014 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01265364, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01256018, + "epoch": 0.9054561851796182, + "flos": 15820921434240.0, + "grad_norm": 3.296870649078698, + "language_loss": 0.67341602, + "learning_rate": 9.297806844307831e-08, + "loss": 0.75010878, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09344482, + "step": 15060, + "time_per_iteration": 2.514012098312378 + }, + { + "auxiliary_loss_clip": 0.06402002, + "auxiliary_loss_mlp": 0.01265399, + "balance_loss_clip": 0.06269133, + "balance_loss_mlp": 0.01255397, + "epoch": 0.9055163084322861, + "flos": 17572837000320.0, + "grad_norm": 1.9490761162977135, + "language_loss": 0.64140469, + "learning_rate": 9.286073708230357e-08, + "loss": 0.71807867, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10003662, + "step": 15061, + "time_per_iteration": 4.011102676391602 + }, + { + "auxiliary_loss_clip": 0.06401615, + "auxiliary_loss_mlp": 0.01264256, + "balance_loss_clip": 0.0627028, + "balance_loss_mlp": 0.01254558, + "epoch": 0.9055764316849542, + "flos": 17645358309120.0, + "grad_norm": 1.760466857694858, + "language_loss": 0.71594036, + "learning_rate": 9.274347804044058e-08, + "loss": 0.79259902, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09698486, + "step": 15062, + "time_per_iteration": 2.4741172790527344 + }, + { + "auxiliary_loss_clip": 0.06401698, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.0627198, + "balance_loss_mlp": 0.01257488, + "epoch": 0.9056365549376221, + "flos": 20127098937600.0, + "grad_norm": 1.6172347718122244, + "language_loss": 0.70928562, + "learning_rate": 9.2626291321936e-08, + "loss": 0.78597057, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09307861, + "step": 15063, + "time_per_iteration": 2.4766180515289307 + }, + { + "auxiliary_loss_clip": 0.06397741, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.0627069, + "balance_loss_mlp": 0.01255396, + "epoch": 0.9056966781902901, + "flos": 27606002964480.0, + "grad_norm": 1.5248390937922436, + "language_loss": 0.72296852, + "learning_rate": 9.250917693123406e-08, + "loss": 0.79958934, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08947754, + "step": 15064, + "time_per_iteration": 2.5452868938446045 + }, + { + "auxiliary_loss_clip": 0.06402265, + "auxiliary_loss_mlp": 0.01263796, + "balance_loss_clip": 0.06268708, + "balance_loss_mlp": 0.01255106, + "epoch": 0.9057568014429581, + "flos": 25926986050560.0, + "grad_norm": 1.7435921110411652, + "language_loss": 0.70200551, + "learning_rate": 9.23921348727752e-08, + "loss": 0.77866608, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08691406, + "step": 15065, + "time_per_iteration": 2.5181055068969727 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.01264477, + "balance_loss_clip": 0.06270632, + "balance_loss_mlp": 0.01254851, + "epoch": 0.905816924695626, + "flos": 22937093136000.0, + "grad_norm": 1.533976766894516, + "language_loss": 0.63432038, + "learning_rate": 9.227516515099743e-08, + "loss": 0.71097726, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09625244, + "step": 15066, + "time_per_iteration": 2.5122158527374268 + }, + { + "auxiliary_loss_clip": 0.06410006, + "auxiliary_loss_mlp": 0.01263218, + "balance_loss_clip": 0.06271099, + "balance_loss_mlp": 0.01252441, + "epoch": 0.905877047948294, + "flos": 22162894536960.0, + "grad_norm": 1.777219964068019, + "language_loss": 0.80306625, + "learning_rate": 9.215826777033675e-08, + "loss": 0.87979841, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10766602, + "step": 15067, + "time_per_iteration": 2.4910852909088135 + }, + { + "auxiliary_loss_clip": 0.06400168, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06269554, + "balance_loss_mlp": 0.01256393, + "epoch": 0.905937171200962, + "flos": 15310253525760.0, + "grad_norm": 1.552097033204445, + "language_loss": 0.69955444, + "learning_rate": 9.204144273522563e-08, + "loss": 0.77621996, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09985352, + "step": 15068, + "time_per_iteration": 2.5215139389038086 + }, + { + "auxiliary_loss_clip": 0.06396197, + "auxiliary_loss_mlp": 0.01265147, + "balance_loss_clip": 0.06269008, + "balance_loss_mlp": 0.0125663, + "epoch": 0.90599729445363, + "flos": 19468914716160.0, + "grad_norm": 1.805239207493818, + "language_loss": 0.85927349, + "learning_rate": 9.19246900500943e-08, + "loss": 0.93588692, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08514404, + "step": 15069, + "time_per_iteration": 2.4659931659698486 + }, + { + "auxiliary_loss_clip": 0.06407644, + "auxiliary_loss_mlp": 0.01265898, + "balance_loss_clip": 0.0627166, + "balance_loss_mlp": 0.01255926, + "epoch": 0.9060574177062979, + "flos": 23739816850560.0, + "grad_norm": 1.8280166423907744, + "language_loss": 0.5974074, + "learning_rate": 9.180800971936987e-08, + "loss": 0.67414284, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09967041, + "step": 15070, + "time_per_iteration": 3.9546656608581543 + }, + { + "auxiliary_loss_clip": 0.0640643, + "auxiliary_loss_mlp": 0.01265113, + "balance_loss_clip": 0.06271288, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9061175409589659, + "flos": 17316853176960.0, + "grad_norm": 1.9844350397935704, + "language_loss": 0.81391585, + "learning_rate": 9.169140174747724e-08, + "loss": 0.89063132, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09356689, + "step": 15071, + "time_per_iteration": 2.4680888652801514 + }, + { + "auxiliary_loss_clip": 0.06404223, + "auxiliary_loss_mlp": 0.01267825, + "balance_loss_clip": 0.06269695, + "balance_loss_mlp": 0.01257705, + "epoch": 0.9061776642116338, + "flos": 17783063694720.0, + "grad_norm": 1.8768433932169004, + "language_loss": 0.61904967, + "learning_rate": 9.157486613883758e-08, + "loss": 0.6957702, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10125732, + "step": 15072, + "time_per_iteration": 2.4591763019561768 + }, + { + "auxiliary_loss_clip": 0.06402346, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01253814, + "epoch": 0.9062377874643018, + "flos": 42787580146560.0, + "grad_norm": 1.9902101584979952, + "language_loss": 0.72696972, + "learning_rate": 9.145840289787021e-08, + "loss": 0.80363911, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10778809, + "step": 15073, + "time_per_iteration": 2.7119879722595215 + }, + { + "auxiliary_loss_clip": 0.06397014, + "auxiliary_loss_mlp": 0.01263309, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01254624, + "epoch": 0.9062979107169697, + "flos": 16367032419840.0, + "grad_norm": 1.764665765355135, + "language_loss": 0.81274933, + "learning_rate": 9.134201202899161e-08, + "loss": 0.88935256, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08685303, + "step": 15074, + "time_per_iteration": 2.4704678058624268 + }, + { + "auxiliary_loss_clip": 0.06309762, + "auxiliary_loss_mlp": 0.01249224, + "balance_loss_clip": 0.06255601, + "balance_loss_mlp": 0.01248231, + "epoch": 0.9063580339696378, + "flos": 69336286364160.0, + "grad_norm": 0.7296001006592828, + "language_loss": 0.52386355, + "learning_rate": 9.122569353661513e-08, + "loss": 0.59945345, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00992584, + "step": 15075, + "time_per_iteration": 4.617650508880615 + }, + { + "auxiliary_loss_clip": 0.06307253, + "auxiliary_loss_mlp": 0.01248452, + "balance_loss_clip": 0.06252947, + "balance_loss_mlp": 0.01247452, + "epoch": 0.9064181572223057, + "flos": 58813388812800.0, + "grad_norm": 0.7084404872191936, + "language_loss": 0.62037706, + "learning_rate": 9.11094474251517e-08, + "loss": 0.69593406, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00999451, + "step": 15076, + "time_per_iteration": 3.049726724624634 + }, + { + "auxiliary_loss_clip": 0.06398588, + "auxiliary_loss_mlp": 0.01263843, + "balance_loss_clip": 0.06269225, + "balance_loss_mlp": 0.01254611, + "epoch": 0.9064782804749737, + "flos": 21769205328000.0, + "grad_norm": 1.6263272411743717, + "language_loss": 0.82236755, + "learning_rate": 9.09932736990091e-08, + "loss": 0.89899194, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09234619, + "step": 15077, + "time_per_iteration": 3.9159936904907227 + }, + { + "auxiliary_loss_clip": 0.06396757, + "auxiliary_loss_mlp": 0.01267967, + "balance_loss_clip": 0.06269564, + "balance_loss_mlp": 0.01259337, + "epoch": 0.9065384037276417, + "flos": 21403747745280.0, + "grad_norm": 1.4172221106724106, + "language_loss": 0.84297204, + "learning_rate": 9.08771723625934e-08, + "loss": 0.91961926, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08630371, + "step": 15078, + "time_per_iteration": 2.5148606300354004 + }, + { + "auxiliary_loss_clip": 0.06395961, + "auxiliary_loss_mlp": 0.01261788, + "balance_loss_clip": 0.06270163, + "balance_loss_mlp": 0.01253188, + "epoch": 0.9065985269803096, + "flos": 38291734926720.0, + "grad_norm": 1.4055545219540846, + "language_loss": 0.6550107, + "learning_rate": 9.076114342030617e-08, + "loss": 0.73158824, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08599854, + "step": 15079, + "time_per_iteration": 2.6431503295898438 + }, + { + "auxiliary_loss_clip": 0.06400599, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.06269769, + "balance_loss_mlp": 0.01254889, + "epoch": 0.9066586502329776, + "flos": 44828406990720.0, + "grad_norm": 1.5673413930371245, + "language_loss": 0.70924938, + "learning_rate": 9.064518687654765e-08, + "loss": 0.78589708, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.0927124, + "step": 15080, + "time_per_iteration": 2.7151243686676025 + }, + { + "auxiliary_loss_clip": 0.06409639, + "auxiliary_loss_mlp": 0.01261513, + "balance_loss_clip": 0.0627256, + "balance_loss_mlp": 0.01251368, + "epoch": 0.9067187734856456, + "flos": 18629825529600.0, + "grad_norm": 2.407695406101915, + "language_loss": 0.7148692, + "learning_rate": 9.052930273571547e-08, + "loss": 0.79158074, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10144043, + "step": 15081, + "time_per_iteration": 2.5449743270874023 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.01261877, + "balance_loss_clip": 0.06271397, + "balance_loss_mlp": 0.01251965, + "epoch": 0.9067788967383136, + "flos": 22754217600000.0, + "grad_norm": 2.1469946393929935, + "language_loss": 0.74491692, + "learning_rate": 9.04134910022032e-08, + "loss": 0.82154077, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09912109, + "step": 15082, + "time_per_iteration": 2.513711929321289 + }, + { + "auxiliary_loss_clip": 0.06398562, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06270002, + "balance_loss_mlp": 0.01256463, + "epoch": 0.9068390199909815, + "flos": 27677853440640.0, + "grad_norm": 1.8132990718715725, + "language_loss": 0.78194749, + "learning_rate": 9.029775168040266e-08, + "loss": 0.85858572, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08782959, + "step": 15083, + "time_per_iteration": 2.5405113697052 + }, + { + "auxiliary_loss_clip": 0.06396039, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.06269726, + "balance_loss_mlp": 0.01253183, + "epoch": 0.9068991432436495, + "flos": 24250987883520.0, + "grad_norm": 1.5606180532346916, + "language_loss": 0.69092917, + "learning_rate": 9.01820847747028e-08, + "loss": 0.76750851, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08703613, + "step": 15084, + "time_per_iteration": 2.514923095703125 + }, + { + "auxiliary_loss_clip": 0.06400265, + "auxiliary_loss_mlp": 0.01266118, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01256784, + "epoch": 0.9069592664963174, + "flos": 28040040714240.0, + "grad_norm": 2.1153010193521946, + "language_loss": 0.67261243, + "learning_rate": 9.006649028948965e-08, + "loss": 0.74927622, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09332275, + "step": 15085, + "time_per_iteration": 2.54697322845459 + }, + { + "auxiliary_loss_clip": 0.06311613, + "auxiliary_loss_mlp": 0.01250731, + "balance_loss_clip": 0.06257414, + "balance_loss_mlp": 0.01249732, + "epoch": 0.9070193897489854, + "flos": 68796479162880.0, + "grad_norm": 0.7613186514195954, + "language_loss": 0.61280566, + "learning_rate": 8.995096822914638e-08, + "loss": 0.68842912, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00998688, + "step": 15086, + "time_per_iteration": 3.126314163208008 + }, + { + "auxiliary_loss_clip": 0.06399283, + "auxiliary_loss_mlp": 0.01268957, + "balance_loss_clip": 0.06270003, + "balance_loss_mlp": 0.01259372, + "epoch": 0.9070795130016533, + "flos": 23448515731200.0, + "grad_norm": 1.464283060306305, + "language_loss": 0.72384381, + "learning_rate": 8.983551859805416e-08, + "loss": 0.8005262, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.0958252, + "step": 15087, + "time_per_iteration": 2.5283164978027344 + }, + { + "auxiliary_loss_clip": 0.06401356, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01253422, + "epoch": 0.9071396362543214, + "flos": 18922384460160.0, + "grad_norm": 2.001227665639937, + "language_loss": 0.76600784, + "learning_rate": 8.972014140059058e-08, + "loss": 0.84265184, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09625244, + "step": 15088, + "time_per_iteration": 2.4616496562957764 + }, + { + "auxiliary_loss_clip": 0.06397097, + "auxiliary_loss_mlp": 0.01263128, + "balance_loss_clip": 0.06272545, + "balance_loss_mlp": 0.01254426, + "epoch": 0.9071997595069893, + "flos": 25235706666240.0, + "grad_norm": 1.9506446362411543, + "language_loss": 0.73176634, + "learning_rate": 8.960483664113038e-08, + "loss": 0.80836862, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08703613, + "step": 15089, + "time_per_iteration": 2.5427422523498535 + }, + { + "auxiliary_loss_clip": 0.06397973, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06272795, + "balance_loss_mlp": 0.01256298, + "epoch": 0.9072598827596573, + "flos": 24352453578240.0, + "grad_norm": 1.8246434429888692, + "language_loss": 0.75705659, + "learning_rate": 8.948960432404628e-08, + "loss": 0.83367848, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.07922363, + "step": 15090, + "time_per_iteration": 2.5452728271484375 + }, + { + "auxiliary_loss_clip": 0.06400724, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.0626859, + "balance_loss_mlp": 0.01257468, + "epoch": 0.9073200060123253, + "flos": 22681654364160.0, + "grad_norm": 2.143089382853149, + "language_loss": 0.77423573, + "learning_rate": 8.93744444537079e-08, + "loss": 0.85091865, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10107422, + "step": 15091, + "time_per_iteration": 2.4868202209472656 + }, + { + "auxiliary_loss_clip": 0.06397654, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01256113, + "epoch": 0.9073801292649932, + "flos": 23702151640320.0, + "grad_norm": 1.4693758835684605, + "language_loss": 0.86293435, + "learning_rate": 8.925935703448217e-08, + "loss": 0.93955046, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.07843018, + "step": 15092, + "time_per_iteration": 2.5014595985412598 + }, + { + "auxiliary_loss_clip": 0.06402805, + "auxiliary_loss_mlp": 0.01262805, + "balance_loss_clip": 0.0627242, + "balance_loss_mlp": 0.0125365, + "epoch": 0.9074402525176612, + "flos": 25382636000640.0, + "grad_norm": 1.507029531138036, + "language_loss": 0.78888041, + "learning_rate": 8.914434207073296e-08, + "loss": 0.86553651, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09155273, + "step": 15093, + "time_per_iteration": 2.583144426345825 + }, + { + "auxiliary_loss_clip": 0.06309871, + "auxiliary_loss_mlp": 0.01252503, + "balance_loss_clip": 0.06255481, + "balance_loss_mlp": 0.01251333, + "epoch": 0.9075003757703292, + "flos": 67667178960000.0, + "grad_norm": 0.7248238804514167, + "language_loss": 0.5692569, + "learning_rate": 8.902939956682188e-08, + "loss": 0.64488065, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01167297, + "step": 15094, + "time_per_iteration": 3.065505266189575 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01262814, + "balance_loss_clip": 0.06270079, + "balance_loss_mlp": 0.01253093, + "epoch": 0.9075604990229972, + "flos": 22459897733760.0, + "grad_norm": 2.3026997740502297, + "language_loss": 0.71735692, + "learning_rate": 8.891452952710742e-08, + "loss": 0.79402506, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.097229, + "step": 15095, + "time_per_iteration": 2.5325124263763428 + }, + { + "auxiliary_loss_clip": 0.06400533, + "auxiliary_loss_mlp": 0.01265643, + "balance_loss_clip": 0.06269962, + "balance_loss_mlp": 0.01256262, + "epoch": 0.9076206222756651, + "flos": 19542735763200.0, + "grad_norm": 1.6933352125689685, + "language_loss": 0.74221349, + "learning_rate": 8.879973195594526e-08, + "loss": 0.81887525, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09387207, + "step": 15096, + "time_per_iteration": 2.4719395637512207 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01263226, + "balance_loss_clip": 0.06269987, + "balance_loss_mlp": 0.01252587, + "epoch": 0.9076807455283331, + "flos": 30124654116480.0, + "grad_norm": 1.8580529883394223, + "language_loss": 0.58028173, + "learning_rate": 8.868500685768898e-08, + "loss": 0.65695339, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10644531, + "step": 15097, + "time_per_iteration": 2.554093837738037 + }, + { + "auxiliary_loss_clip": 0.06394961, + "auxiliary_loss_mlp": 0.01262336, + "balance_loss_clip": 0.06267217, + "balance_loss_mlp": 0.01253639, + "epoch": 0.907740868781001, + "flos": 18703478868480.0, + "grad_norm": 1.5527007642230701, + "language_loss": 0.79784089, + "learning_rate": 8.857035423668935e-08, + "loss": 0.87441391, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0869751, + "step": 15098, + "time_per_iteration": 2.5422494411468506 + }, + { + "auxiliary_loss_clip": 0.06405206, + "auxiliary_loss_mlp": 0.01263684, + "balance_loss_clip": 0.06270834, + "balance_loss_mlp": 0.01254458, + "epoch": 0.907800992033669, + "flos": 22645540454400.0, + "grad_norm": 1.6203953780141742, + "language_loss": 0.66362941, + "learning_rate": 8.845577409729266e-08, + "loss": 0.74031836, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09216309, + "step": 15099, + "time_per_iteration": 2.53924822807312 + }, + { + "auxiliary_loss_clip": 0.06402986, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 0.06270178, + "balance_loss_mlp": 0.01253925, + "epoch": 0.907861115286337, + "flos": 21293980496640.0, + "grad_norm": 2.113947678970701, + "language_loss": 0.70936823, + "learning_rate": 8.834126644384477e-08, + "loss": 0.78604084, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10357666, + "step": 15100, + "time_per_iteration": 2.500608444213867 + }, + { + "auxiliary_loss_clip": 0.06306085, + "auxiliary_loss_mlp": 0.0124919, + "balance_loss_clip": 0.06251926, + "balance_loss_mlp": 0.01248136, + "epoch": 0.907921238539005, + "flos": 69759800426880.0, + "grad_norm": 0.6247804404635554, + "language_loss": 0.5343653, + "learning_rate": 8.822683128068775e-08, + "loss": 0.609918, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01055145, + "step": 15101, + "time_per_iteration": 4.569448232650757 + }, + { + "auxiliary_loss_clip": 0.06403472, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.06273133, + "balance_loss_mlp": 0.01253654, + "epoch": 0.9079813617916729, + "flos": 23484168443520.0, + "grad_norm": 2.4551114582819764, + "language_loss": 0.68570346, + "learning_rate": 8.811246861216081e-08, + "loss": 0.76236832, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09362793, + "step": 15102, + "time_per_iteration": 2.543745517730713 + }, + { + "auxiliary_loss_clip": 0.06400967, + "auxiliary_loss_mlp": 0.01264699, + "balance_loss_clip": 0.06271027, + "balance_loss_mlp": 0.01255114, + "epoch": 0.9080414850443409, + "flos": 22936590011520.0, + "grad_norm": 1.8212057779957778, + "language_loss": 0.7951529, + "learning_rate": 8.799817844260049e-08, + "loss": 0.8718096, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09588623, + "step": 15103, + "time_per_iteration": 2.4846246242523193 + }, + { + "auxiliary_loss_clip": 0.06402376, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01258401, + "epoch": 0.9081016082970089, + "flos": 26184269612160.0, + "grad_norm": 1.995512307901863, + "language_loss": 0.71880859, + "learning_rate": 8.78839607763413e-08, + "loss": 0.79550713, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09069824, + "step": 15104, + "time_per_iteration": 2.5300004482269287 + }, + { + "auxiliary_loss_clip": 0.06399778, + "auxiliary_loss_mlp": 0.01263283, + "balance_loss_clip": 0.06271459, + "balance_loss_mlp": 0.01254467, + "epoch": 0.9081617315496768, + "flos": 24469054934400.0, + "grad_norm": 1.6559231689282168, + "language_loss": 0.78008848, + "learning_rate": 8.77698156177138e-08, + "loss": 0.85671914, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08813477, + "step": 15105, + "time_per_iteration": 2.520692825317383 + }, + { + "auxiliary_loss_clip": 0.06401225, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06269834, + "balance_loss_mlp": 0.01256401, + "epoch": 0.9082218548023449, + "flos": 24752599551360.0, + "grad_norm": 1.7549028809217568, + "language_loss": 0.73971152, + "learning_rate": 8.765574297104628e-08, + "loss": 0.81637871, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09088135, + "step": 15106, + "time_per_iteration": 2.5180251598358154 + }, + { + "auxiliary_loss_clip": 0.06404307, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01254249, + "epoch": 0.9082819780550128, + "flos": 24427448582400.0, + "grad_norm": 1.5903230958882113, + "language_loss": 0.80446184, + "learning_rate": 8.754174284066462e-08, + "loss": 0.8811484, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10101318, + "step": 15107, + "time_per_iteration": 2.560788154602051 + }, + { + "auxiliary_loss_clip": 0.06312685, + "auxiliary_loss_mlp": 0.01250294, + "balance_loss_clip": 0.0625825, + "balance_loss_mlp": 0.01249236, + "epoch": 0.9083421013076808, + "flos": 59630535429120.0, + "grad_norm": 0.8314070940246863, + "language_loss": 0.59992969, + "learning_rate": 8.742781523089205e-08, + "loss": 0.67555946, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0105896, + "step": 15108, + "time_per_iteration": 3.0896317958831787 + }, + { + "auxiliary_loss_clip": 0.06400774, + "auxiliary_loss_mlp": 0.01261142, + "balance_loss_clip": 0.06267995, + "balance_loss_mlp": 0.01252034, + "epoch": 0.9084022245603487, + "flos": 33628652956800.0, + "grad_norm": 2.03070687094374, + "language_loss": 0.74325216, + "learning_rate": 8.73139601460482e-08, + "loss": 0.81987131, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09112549, + "step": 15109, + "time_per_iteration": 2.618248462677002 + }, + { + "auxiliary_loss_clip": 0.06398752, + "auxiliary_loss_mlp": 0.01262631, + "balance_loss_clip": 0.06270365, + "balance_loss_mlp": 0.01253815, + "epoch": 0.9084623478130167, + "flos": 24978465031680.0, + "grad_norm": 2.0096064178066273, + "language_loss": 0.71743369, + "learning_rate": 8.720017759045073e-08, + "loss": 0.79404759, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08807373, + "step": 15110, + "time_per_iteration": 3.9737777709960938 + }, + { + "auxiliary_loss_clip": 0.06398316, + "auxiliary_loss_mlp": 0.01263963, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01254802, + "epoch": 0.9085224710656846, + "flos": 31468918769280.0, + "grad_norm": 2.1410515920625364, + "language_loss": 0.68859386, + "learning_rate": 8.708646756841421e-08, + "loss": 0.76521665, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09161377, + "step": 15111, + "time_per_iteration": 2.568233013153076 + }, + { + "auxiliary_loss_clip": 0.06308745, + "auxiliary_loss_mlp": 0.01249082, + "balance_loss_clip": 0.06254536, + "balance_loss_mlp": 0.01248148, + "epoch": 0.9085825943183526, + "flos": 64935450074880.0, + "grad_norm": 0.6818975607395432, + "language_loss": 0.51562428, + "learning_rate": 8.697283008425026e-08, + "loss": 0.5912025, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00931549, + "step": 15112, + "time_per_iteration": 3.218057632446289 + }, + { + "auxiliary_loss_clip": 0.06401073, + "auxiliary_loss_mlp": 0.01265191, + "balance_loss_clip": 0.06268831, + "balance_loss_mlp": 0.01256173, + "epoch": 0.9086427175710206, + "flos": 18959253056640.0, + "grad_norm": 1.927505414115429, + "language_loss": 0.70069271, + "learning_rate": 8.685926514226837e-08, + "loss": 0.77735531, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09014893, + "step": 15113, + "time_per_iteration": 2.580868721008301 + }, + { + "auxiliary_loss_clip": 0.06401566, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06270175, + "balance_loss_mlp": 0.01257699, + "epoch": 0.9087028408236886, + "flos": 34022258311680.0, + "grad_norm": 2.107615186119017, + "language_loss": 0.79321289, + "learning_rate": 8.674577274677508e-08, + "loss": 0.86989981, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09429932, + "step": 15114, + "time_per_iteration": 4.032289028167725 + }, + { + "auxiliary_loss_clip": 0.06410873, + "auxiliary_loss_mlp": 0.01266597, + "balance_loss_clip": 0.06274423, + "balance_loss_mlp": 0.01256035, + "epoch": 0.9087629640763565, + "flos": 21951032688000.0, + "grad_norm": 1.9480884837439871, + "language_loss": 0.70168352, + "learning_rate": 8.663235290207405e-08, + "loss": 0.77845824, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10552979, + "step": 15115, + "time_per_iteration": 2.5174953937530518 + }, + { + "auxiliary_loss_clip": 0.06407836, + "auxiliary_loss_mlp": 0.01262941, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.01252754, + "epoch": 0.9088230873290245, + "flos": 21769456890240.0, + "grad_norm": 1.407962111970601, + "language_loss": 0.65673447, + "learning_rate": 8.651900561246561e-08, + "loss": 0.73344225, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10180664, + "step": 15116, + "time_per_iteration": 3.929631471633911 + }, + { + "auxiliary_loss_clip": 0.06397676, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06271522, + "balance_loss_mlp": 0.01257566, + "epoch": 0.9088832105816925, + "flos": 21547322916480.0, + "grad_norm": 1.4695312079859524, + "language_loss": 0.69494951, + "learning_rate": 8.640573088224812e-08, + "loss": 0.77159774, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09588623, + "step": 15117, + "time_per_iteration": 2.5169076919555664 + }, + { + "auxiliary_loss_clip": 0.06400181, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01258715, + "epoch": 0.9089433338343604, + "flos": 26004203187840.0, + "grad_norm": 1.358588776880828, + "language_loss": 0.74719739, + "learning_rate": 8.629252871571745e-08, + "loss": 0.82387769, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09143066, + "step": 15118, + "time_per_iteration": 2.5945725440979004 + }, + { + "auxiliary_loss_clip": 0.06408937, + "auxiliary_loss_mlp": 0.0126524, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01254183, + "epoch": 0.9090034570870285, + "flos": 21184758299520.0, + "grad_norm": 2.0413531147204345, + "language_loss": 0.72784328, + "learning_rate": 8.617939911716554e-08, + "loss": 0.8045851, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1104126, + "step": 15119, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06409705, + "auxiliary_loss_mlp": 0.01263579, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.0125263, + "epoch": 0.9090635803396964, + "flos": 16147036725120.0, + "grad_norm": 2.3146876326826233, + "language_loss": 0.71590072, + "learning_rate": 8.60663420908827e-08, + "loss": 0.79263353, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10943604, + "step": 15120, + "time_per_iteration": 2.4872450828552246 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01262323, + "balance_loss_clip": 0.06268568, + "balance_loss_mlp": 0.01252894, + "epoch": 0.9091237035923644, + "flos": 20597250597120.0, + "grad_norm": 1.9625105264787481, + "language_loss": 0.66382855, + "learning_rate": 8.595335764115596e-08, + "loss": 0.74046856, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09429932, + "step": 15121, + "time_per_iteration": 2.5439295768737793 + }, + { + "auxiliary_loss_clip": 0.06402369, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01259179, + "epoch": 0.9091838268450323, + "flos": 52239275902080.0, + "grad_norm": 1.9522631564696673, + "language_loss": 0.70143443, + "learning_rate": 8.58404457722699e-08, + "loss": 0.77815294, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10302734, + "step": 15122, + "time_per_iteration": 2.7782716751098633 + }, + { + "auxiliary_loss_clip": 0.06399389, + "auxiliary_loss_mlp": 0.01262307, + "balance_loss_clip": 0.06270258, + "balance_loss_mlp": 0.01253009, + "epoch": 0.9092439500977003, + "flos": 20566084078080.0, + "grad_norm": 1.2228012273882412, + "language_loss": 0.74737382, + "learning_rate": 8.572760648850575e-08, + "loss": 0.8239907, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09295654, + "step": 15123, + "time_per_iteration": 2.548868417739868 + }, + { + "auxiliary_loss_clip": 0.06397559, + "auxiliary_loss_mlp": 0.01264083, + "balance_loss_clip": 0.06270659, + "balance_loss_mlp": 0.01255303, + "epoch": 0.9093040733503682, + "flos": 28624823159040.0, + "grad_norm": 1.786331644949096, + "language_loss": 0.75845641, + "learning_rate": 8.561483979414253e-08, + "loss": 0.83507288, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08782959, + "step": 15124, + "time_per_iteration": 2.561037302017212 + }, + { + "auxiliary_loss_clip": 0.06398606, + "auxiliary_loss_mlp": 0.01266988, + "balance_loss_clip": 0.06268884, + "balance_loss_mlp": 0.0125766, + "epoch": 0.9093641966030362, + "flos": 23446838649600.0, + "grad_norm": 1.8436669176844096, + "language_loss": 0.72484279, + "learning_rate": 8.55021456934566e-08, + "loss": 0.80149877, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09326172, + "step": 15125, + "time_per_iteration": 2.519473075866699 + }, + { + "auxiliary_loss_clip": 0.06397496, + "auxiliary_loss_mlp": 0.01263813, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.0125501, + "epoch": 0.9094243198557042, + "flos": 16805807925120.0, + "grad_norm": 1.5501227828920265, + "language_loss": 0.79221696, + "learning_rate": 8.538952419072143e-08, + "loss": 0.86883008, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08795166, + "step": 15126, + "time_per_iteration": 2.4721574783325195 + }, + { + "auxiliary_loss_clip": 0.0640032, + "auxiliary_loss_mlp": 0.01267544, + "balance_loss_clip": 0.06272551, + "balance_loss_mlp": 0.012588, + "epoch": 0.9094844431083722, + "flos": 24279051801600.0, + "grad_norm": 1.446842251564929, + "language_loss": 0.75611615, + "learning_rate": 8.527697529020694e-08, + "loss": 0.83279485, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08743286, + "step": 15127, + "time_per_iteration": 2.519174337387085 + }, + { + "auxiliary_loss_clip": 0.06402364, + "auxiliary_loss_mlp": 0.01263756, + "balance_loss_clip": 0.06269338, + "balance_loss_mlp": 0.01254607, + "epoch": 0.9095445663610401, + "flos": 21951116542080.0, + "grad_norm": 2.994024762493421, + "language_loss": 0.62593842, + "learning_rate": 8.516449899618173e-08, + "loss": 0.70259964, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.0914917, + "step": 15128, + "time_per_iteration": 2.492807388305664 + }, + { + "auxiliary_loss_clip": 0.06399337, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01253616, + "epoch": 0.9096046896137081, + "flos": 19799096929920.0, + "grad_norm": 2.004480478019155, + "language_loss": 0.76882553, + "learning_rate": 8.505209531291013e-08, + "loss": 0.84544969, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09466553, + "step": 15129, + "time_per_iteration": 2.5134694576263428 + }, + { + "auxiliary_loss_clip": 0.06405028, + "auxiliary_loss_mlp": 0.01262605, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.0125302, + "epoch": 0.909664812866376, + "flos": 22644701913600.0, + "grad_norm": 1.922995524134768, + "language_loss": 0.84000599, + "learning_rate": 8.49397642446552e-08, + "loss": 0.91668236, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09576416, + "step": 15130, + "time_per_iteration": 2.49751353263855 + }, + { + "auxiliary_loss_clip": 0.06402621, + "auxiliary_loss_mlp": 0.01262359, + "balance_loss_clip": 0.06272228, + "balance_loss_mlp": 0.01252691, + "epoch": 0.909724936119044, + "flos": 39860439540480.0, + "grad_norm": 1.684451923385225, + "language_loss": 0.75303972, + "learning_rate": 8.482750579567644e-08, + "loss": 0.8296895, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09661865, + "step": 15131, + "time_per_iteration": 2.6618237495422363 + }, + { + "auxiliary_loss_clip": 0.06401692, + "auxiliary_loss_mlp": 0.0126297, + "balance_loss_clip": 0.06270601, + "balance_loss_mlp": 0.01253773, + "epoch": 0.9097850593717121, + "flos": 35078953351680.0, + "grad_norm": 1.8018216027233815, + "language_loss": 0.59644824, + "learning_rate": 8.471531997023085e-08, + "loss": 0.67309487, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09204102, + "step": 15132, + "time_per_iteration": 2.5843985080718994 + }, + { + "auxiliary_loss_clip": 0.06398638, + "auxiliary_loss_mlp": 0.0126275, + "balance_loss_clip": 0.06269633, + "balance_loss_mlp": 0.01254317, + "epoch": 0.90984518262438, + "flos": 23374149632640.0, + "grad_norm": 1.413260935585949, + "language_loss": 0.83113134, + "learning_rate": 8.460320677257193e-08, + "loss": 0.90774524, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08435059, + "step": 15133, + "time_per_iteration": 2.537156581878662 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01262679, + "balance_loss_clip": 0.0626839, + "balance_loss_mlp": 0.01253434, + "epoch": 0.909905305877048, + "flos": 27530085565440.0, + "grad_norm": 1.6843084731476905, + "language_loss": 0.73938394, + "learning_rate": 8.449116620695118e-08, + "loss": 0.81600529, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09240723, + "step": 15134, + "time_per_iteration": 2.5576279163360596 + }, + { + "auxiliary_loss_clip": 0.06413636, + "auxiliary_loss_mlp": 0.01264703, + "balance_loss_clip": 0.062745, + "balance_loss_mlp": 0.01255179, + "epoch": 0.9099654291297159, + "flos": 24353921024640.0, + "grad_norm": 1.4339167033731788, + "language_loss": 0.73107815, + "learning_rate": 8.437919827761786e-08, + "loss": 0.80786151, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09521484, + "step": 15135, + "time_per_iteration": 2.600571870803833 + }, + { + "auxiliary_loss_clip": 0.06398353, + "auxiliary_loss_mlp": 0.01262496, + "balance_loss_clip": 0.06270214, + "balance_loss_mlp": 0.01253162, + "epoch": 0.9100255523823839, + "flos": 21221626896000.0, + "grad_norm": 1.7085160018816423, + "language_loss": 0.70284522, + "learning_rate": 8.426730298881702e-08, + "loss": 0.77945369, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09332275, + "step": 15136, + "time_per_iteration": 2.4926037788391113 + }, + { + "auxiliary_loss_clip": 0.06304874, + "auxiliary_loss_mlp": 0.01251653, + "balance_loss_clip": 0.06250328, + "balance_loss_mlp": 0.01250625, + "epoch": 0.9100856756350518, + "flos": 46067292005760.0, + "grad_norm": 0.80453023989808, + "language_loss": 0.59098959, + "learning_rate": 8.415548034479214e-08, + "loss": 0.66655481, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01027679, + "step": 15137, + "time_per_iteration": 2.8737428188323975 + }, + { + "auxiliary_loss_clip": 0.06404972, + "auxiliary_loss_mlp": 0.01264173, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01255208, + "epoch": 0.9101457988877198, + "flos": 20236111499520.0, + "grad_norm": 1.4827649946447703, + "language_loss": 0.82628894, + "learning_rate": 8.40437303497834e-08, + "loss": 0.90298033, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08966064, + "step": 15138, + "time_per_iteration": 2.4917473793029785 + }, + { + "auxiliary_loss_clip": 0.06394204, + "auxiliary_loss_mlp": 0.01261553, + "balance_loss_clip": 0.06268851, + "balance_loss_mlp": 0.01252928, + "epoch": 0.9102059221403878, + "flos": 26622458138880.0, + "grad_norm": 2.0023017385136392, + "language_loss": 0.81339759, + "learning_rate": 8.39320530080283e-08, + "loss": 0.8899551, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08636475, + "step": 15139, + "time_per_iteration": 2.5509281158447266 + }, + { + "auxiliary_loss_clip": 0.0640308, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01254026, + "epoch": 0.9102660453930558, + "flos": 21915086486400.0, + "grad_norm": 1.5474154648257277, + "language_loss": 0.77706277, + "learning_rate": 8.382044832376167e-08, + "loss": 0.85372829, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09454346, + "step": 15140, + "time_per_iteration": 3.9355709552764893 + }, + { + "auxiliary_loss_clip": 0.06400235, + "auxiliary_loss_mlp": 0.01260713, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01252071, + "epoch": 0.9103261686457237, + "flos": 36185933640960.0, + "grad_norm": 1.8719337735504868, + "language_loss": 0.66449845, + "learning_rate": 8.370891630121569e-08, + "loss": 0.74110788, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08648682, + "step": 15141, + "time_per_iteration": 2.647343873977661 + }, + { + "auxiliary_loss_clip": 0.06405683, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.06270161, + "balance_loss_mlp": 0.01256976, + "epoch": 0.9103862918983917, + "flos": 23885362592640.0, + "grad_norm": 6.054142486418284, + "language_loss": 0.75214803, + "learning_rate": 8.359745694462005e-08, + "loss": 0.8288691, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09448242, + "step": 15142, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.06397744, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06268731, + "balance_loss_mlp": 0.01254508, + "epoch": 0.9104464151510596, + "flos": 14944837870080.0, + "grad_norm": 1.6281016166898625, + "language_loss": 0.64735144, + "learning_rate": 8.348607025820076e-08, + "loss": 0.7239635, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08959961, + "step": 15143, + "time_per_iteration": 2.478365182876587 + }, + { + "auxiliary_loss_clip": 0.06402953, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.0125302, + "epoch": 0.9105065384037276, + "flos": 33664096033920.0, + "grad_norm": 1.8192012493861849, + "language_loss": 0.61270368, + "learning_rate": 8.337475624618152e-08, + "loss": 0.68936229, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09875488, + "step": 15144, + "time_per_iteration": 2.612241506576538 + }, + { + "auxiliary_loss_clip": 0.06393068, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06268917, + "balance_loss_mlp": 0.0125663, + "epoch": 0.9105666616563957, + "flos": 24323634973440.0, + "grad_norm": 1.7059892216742707, + "language_loss": 0.71336597, + "learning_rate": 8.326351491278382e-08, + "loss": 0.78995132, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08837891, + "step": 15145, + "time_per_iteration": 2.5258352756500244 + }, + { + "auxiliary_loss_clip": 0.06395367, + "auxiliary_loss_mlp": 0.01263535, + "balance_loss_clip": 0.06269669, + "balance_loss_mlp": 0.01254458, + "epoch": 0.9106267849090636, + "flos": 29979527644800.0, + "grad_norm": 1.5087408228781412, + "language_loss": 0.7090916, + "learning_rate": 8.315234626222545e-08, + "loss": 0.78568059, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.09069824, + "step": 15146, + "time_per_iteration": 2.7402799129486084 + }, + { + "auxiliary_loss_clip": 0.06400052, + "auxiliary_loss_mlp": 0.01262786, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01254155, + "epoch": 0.9106869081617316, + "flos": 25344761155200.0, + "grad_norm": 1.7237443516781754, + "language_loss": 0.73024035, + "learning_rate": 8.304125029872233e-08, + "loss": 0.80686873, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08624268, + "step": 15147, + "time_per_iteration": 2.5613772869110107 + }, + { + "auxiliary_loss_clip": 0.06405227, + "auxiliary_loss_mlp": 0.01263577, + "balance_loss_clip": 0.06269574, + "balance_loss_mlp": 0.01254267, + "epoch": 0.9107470314143995, + "flos": 18192936741120.0, + "grad_norm": 1.8228865120504234, + "language_loss": 0.80208182, + "learning_rate": 8.293022702648711e-08, + "loss": 0.87876976, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09307861, + "step": 15148, + "time_per_iteration": 2.4916961193084717 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06271308, + "balance_loss_mlp": 0.01254412, + "epoch": 0.9108071546670675, + "flos": 23557696001280.0, + "grad_norm": 1.6542822970415358, + "language_loss": 0.68148386, + "learning_rate": 8.281927644972996e-08, + "loss": 0.75817162, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09692383, + "step": 15149, + "time_per_iteration": 3.9529452323913574 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01265217, + "balance_loss_clip": 0.06273574, + "balance_loss_mlp": 0.01256035, + "epoch": 0.9108672779197354, + "flos": 25637487793920.0, + "grad_norm": 1.477688710921721, + "language_loss": 0.63625479, + "learning_rate": 8.270839857265776e-08, + "loss": 0.71297264, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09179688, + "step": 15150, + "time_per_iteration": 2.53456449508667 + }, + { + "auxiliary_loss_clip": 0.06401673, + "auxiliary_loss_mlp": 0.0126291, + "balance_loss_clip": 0.0627019, + "balance_loss_mlp": 0.01253874, + "epoch": 0.9109274011724035, + "flos": 22344470334720.0, + "grad_norm": 1.7663276861657815, + "language_loss": 0.73236012, + "learning_rate": 8.259759339947514e-08, + "loss": 0.80900592, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09039307, + "step": 15151, + "time_per_iteration": 2.515439510345459 + }, + { + "auxiliary_loss_clip": 0.06399186, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06269082, + "balance_loss_mlp": 0.01256437, + "epoch": 0.9109875244250714, + "flos": 26695524499200.0, + "grad_norm": 1.4955695387299417, + "language_loss": 0.64540172, + "learning_rate": 8.248686093438429e-08, + "loss": 0.72205222, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09417725, + "step": 15152, + "time_per_iteration": 2.547096014022827 + }, + { + "auxiliary_loss_clip": 0.06403639, + "auxiliary_loss_mlp": 0.01266075, + "balance_loss_clip": 0.06273131, + "balance_loss_mlp": 0.01256735, + "epoch": 0.9110476476777394, + "flos": 22936799646720.0, + "grad_norm": 1.8229658483887674, + "language_loss": 0.73700202, + "learning_rate": 8.23762011815834e-08, + "loss": 0.81369913, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09338379, + "step": 15153, + "time_per_iteration": 2.515530824661255 + }, + { + "auxiliary_loss_clip": 0.06403325, + "auxiliary_loss_mlp": 0.01264561, + "balance_loss_clip": 0.06271794, + "balance_loss_mlp": 0.01254756, + "epoch": 0.9111077709304073, + "flos": 13476718483200.0, + "grad_norm": 1.9576939804869533, + "language_loss": 0.7254191, + "learning_rate": 8.226561414526956e-08, + "loss": 0.80209798, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09802246, + "step": 15154, + "time_per_iteration": 4.011706590652466 + }, + { + "auxiliary_loss_clip": 0.06400883, + "auxiliary_loss_mlp": 0.01264225, + "balance_loss_clip": 0.06272145, + "balance_loss_mlp": 0.01254599, + "epoch": 0.9111678941830753, + "flos": 20856924000000.0, + "grad_norm": 1.6070045592329703, + "language_loss": 0.82313609, + "learning_rate": 8.215509982963564e-08, + "loss": 0.89978719, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09631348, + "step": 15155, + "time_per_iteration": 2.5135505199432373 + }, + { + "auxiliary_loss_clip": 0.06403641, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.06273505, + "balance_loss_mlp": 0.01258659, + "epoch": 0.9112280174357432, + "flos": 19688281505280.0, + "grad_norm": 1.4380707223539104, + "language_loss": 0.59939194, + "learning_rate": 8.204465823887252e-08, + "loss": 0.67610437, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0894165, + "step": 15156, + "time_per_iteration": 3.994004487991333 + }, + { + "auxiliary_loss_clip": 0.06406192, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01254643, + "epoch": 0.9112881406884112, + "flos": 25454192987520.0, + "grad_norm": 1.7593571365414977, + "language_loss": 0.74333876, + "learning_rate": 8.193428937716796e-08, + "loss": 0.82005632, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10919189, + "step": 15157, + "time_per_iteration": 2.54280686378479 + }, + { + "auxiliary_loss_clip": 0.0640168, + "auxiliary_loss_mlp": 0.01261948, + "balance_loss_clip": 0.06268957, + "balance_loss_mlp": 0.01253401, + "epoch": 0.9113482639410793, + "flos": 33074324271360.0, + "grad_norm": 1.6469178321530784, + "language_loss": 0.59426653, + "learning_rate": 8.182399324870747e-08, + "loss": 0.67090285, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08551025, + "step": 15158, + "time_per_iteration": 2.6101877689361572 + }, + { + "auxiliary_loss_clip": 0.0639876, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01254717, + "epoch": 0.9114083871937472, + "flos": 21842103980160.0, + "grad_norm": 1.7579172043530233, + "language_loss": 0.6775853, + "learning_rate": 8.171376985767375e-08, + "loss": 0.75420892, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08886719, + "step": 15159, + "time_per_iteration": 2.4980640411376953 + }, + { + "auxiliary_loss_clip": 0.06402466, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01253777, + "epoch": 0.9114685104464152, + "flos": 27096299377920.0, + "grad_norm": 1.9611572487780382, + "language_loss": 0.78373706, + "learning_rate": 8.160361920824588e-08, + "loss": 0.86038601, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08654785, + "step": 15160, + "time_per_iteration": 2.5919408798217773 + }, + { + "auxiliary_loss_clip": 0.06406088, + "auxiliary_loss_mlp": 0.01266258, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01256048, + "epoch": 0.9115286336990831, + "flos": 17972731411200.0, + "grad_norm": 1.807136826815418, + "language_loss": 0.69505328, + "learning_rate": 8.149354130460073e-08, + "loss": 0.77177674, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10205078, + "step": 15161, + "time_per_iteration": 2.484355926513672 + }, + { + "auxiliary_loss_clip": 0.06401908, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06269228, + "balance_loss_mlp": 0.01255099, + "epoch": 0.9115887569517511, + "flos": 22936506157440.0, + "grad_norm": 1.654027416988286, + "language_loss": 0.75972486, + "learning_rate": 8.138353615091321e-08, + "loss": 0.8363955, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10058594, + "step": 15162, + "time_per_iteration": 2.5151309967041016 + }, + { + "auxiliary_loss_clip": 0.06398072, + "auxiliary_loss_mlp": 0.01262761, + "balance_loss_clip": 0.06267852, + "balance_loss_mlp": 0.01253481, + "epoch": 0.911648880204419, + "flos": 23995339476480.0, + "grad_norm": 1.734863559014141, + "language_loss": 0.66808069, + "learning_rate": 8.127360375135395e-08, + "loss": 0.74468899, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09283447, + "step": 15163, + "time_per_iteration": 2.5094223022460938 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01262997, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01253347, + "epoch": 0.911709003457087, + "flos": 17060911280640.0, + "grad_norm": 2.5549807341049893, + "language_loss": 0.7104494, + "learning_rate": 8.116374411009186e-08, + "loss": 0.78718263, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09661865, + "step": 15164, + "time_per_iteration": 2.524186849594116 + }, + { + "auxiliary_loss_clip": 0.06397158, + "auxiliary_loss_mlp": 0.01264303, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01255928, + "epoch": 0.911769126709755, + "flos": 21659857349760.0, + "grad_norm": 1.5173262591042511, + "language_loss": 0.76362646, + "learning_rate": 8.105395723129315e-08, + "loss": 0.84024107, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08374023, + "step": 15165, + "time_per_iteration": 2.5094478130340576 + }, + { + "auxiliary_loss_clip": 0.06401199, + "auxiliary_loss_mlp": 0.01263972, + "balance_loss_clip": 0.06269228, + "balance_loss_mlp": 0.01254036, + "epoch": 0.911829249962423, + "flos": 24797224650240.0, + "grad_norm": 2.5732167401800026, + "language_loss": 0.72387528, + "learning_rate": 8.094424311912074e-08, + "loss": 0.80052704, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09942627, + "step": 15166, + "time_per_iteration": 2.552232265472412 + }, + { + "auxiliary_loss_clip": 0.06402378, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06268582, + "balance_loss_mlp": 0.01254472, + "epoch": 0.9118893732150909, + "flos": 20965684999680.0, + "grad_norm": 1.9072835391866958, + "language_loss": 0.7338112, + "learning_rate": 8.083460177773482e-08, + "loss": 0.81048274, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10314941, + "step": 15167, + "time_per_iteration": 2.5074968338012695 + }, + { + "auxiliary_loss_clip": 0.06309468, + "auxiliary_loss_mlp": 0.01249426, + "balance_loss_clip": 0.06255375, + "balance_loss_mlp": 0.01248414, + "epoch": 0.9119494964677589, + "flos": 67937753393280.0, + "grad_norm": 0.7591368082582344, + "language_loss": 0.65499896, + "learning_rate": 8.072503321129298e-08, + "loss": 0.73058796, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.01011658, + "step": 15168, + "time_per_iteration": 3.1166579723358154 + }, + { + "auxiliary_loss_clip": 0.06396022, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.06267242, + "balance_loss_mlp": 0.01254395, + "epoch": 0.9120096197204268, + "flos": 18557430001920.0, + "grad_norm": 2.4249937166543587, + "language_loss": 0.78455007, + "learning_rate": 8.061553742395033e-08, + "loss": 0.8611393, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08499146, + "step": 15169, + "time_per_iteration": 2.4771196842193604 + }, + { + "auxiliary_loss_clip": 0.06401431, + "auxiliary_loss_mlp": 0.0126634, + "balance_loss_clip": 0.06269872, + "balance_loss_mlp": 0.01256595, + "epoch": 0.9120697429730948, + "flos": 19031690511360.0, + "grad_norm": 1.9684543700960608, + "language_loss": 0.82421303, + "learning_rate": 8.05061144198591e-08, + "loss": 0.90089071, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09741211, + "step": 15170, + "time_per_iteration": 2.4824554920196533 + }, + { + "auxiliary_loss_clip": 0.06403299, + "auxiliary_loss_mlp": 0.01265146, + "balance_loss_clip": 0.06272299, + "balance_loss_mlp": 0.01255424, + "epoch": 0.9121298662257629, + "flos": 17169127228800.0, + "grad_norm": 1.9931452501477718, + "language_loss": 0.77126348, + "learning_rate": 8.039676420316799e-08, + "loss": 0.84794796, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09729004, + "step": 15171, + "time_per_iteration": 2.4650163650512695 + }, + { + "auxiliary_loss_clip": 0.06395893, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06268039, + "balance_loss_mlp": 0.01255865, + "epoch": 0.9121899894784308, + "flos": 19688826556800.0, + "grad_norm": 1.334235978901617, + "language_loss": 0.6716094, + "learning_rate": 8.02874867780241e-08, + "loss": 0.74822164, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09466553, + "step": 15172, + "time_per_iteration": 2.513577461242676 + }, + { + "auxiliary_loss_clip": 0.06402537, + "auxiliary_loss_mlp": 0.01266519, + "balance_loss_clip": 0.0627134, + "balance_loss_mlp": 0.01256833, + "epoch": 0.9122501127310988, + "flos": 22242124172160.0, + "grad_norm": 1.593741100302707, + "language_loss": 0.75324094, + "learning_rate": 8.017828214857103e-08, + "loss": 0.82993144, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09686279, + "step": 15173, + "time_per_iteration": 2.5007779598236084 + }, + { + "auxiliary_loss_clip": 0.06409647, + "auxiliary_loss_mlp": 0.01263462, + "balance_loss_clip": 0.06272635, + "balance_loss_mlp": 0.01253574, + "epoch": 0.9123102359837667, + "flos": 15961939056000.0, + "grad_norm": 2.6696952213402607, + "language_loss": 0.65961421, + "learning_rate": 8.00691503189499e-08, + "loss": 0.73634529, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09893799, + "step": 15174, + "time_per_iteration": 2.4853627681732178 + }, + { + "auxiliary_loss_clip": 0.06404449, + "auxiliary_loss_mlp": 0.01266595, + "balance_loss_clip": 0.06270468, + "balance_loss_mlp": 0.01256862, + "epoch": 0.9123703592364347, + "flos": 25162849941120.0, + "grad_norm": 1.5703785543649638, + "language_loss": 0.75523746, + "learning_rate": 7.996009129329894e-08, + "loss": 0.83194792, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09735107, + "step": 15175, + "time_per_iteration": 2.555255174636841 + }, + { + "auxiliary_loss_clip": 0.06308284, + "auxiliary_loss_mlp": 0.01250077, + "balance_loss_clip": 0.0625402, + "balance_loss_mlp": 0.01249143, + "epoch": 0.9124304824891026, + "flos": 60820659296640.0, + "grad_norm": 0.9596461602053525, + "language_loss": 0.58555514, + "learning_rate": 7.985110507575421e-08, + "loss": 0.66113877, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00931549, + "step": 15176, + "time_per_iteration": 3.1778509616851807 + }, + { + "auxiliary_loss_clip": 0.06401191, + "auxiliary_loss_mlp": 0.01265992, + "balance_loss_clip": 0.06269446, + "balance_loss_mlp": 0.01256944, + "epoch": 0.9124906057417707, + "flos": 18156906685440.0, + "grad_norm": 1.7664992295670066, + "language_loss": 0.65369797, + "learning_rate": 7.97421916704475e-08, + "loss": 0.73036981, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09039307, + "step": 15177, + "time_per_iteration": 2.4894156455993652 + }, + { + "auxiliary_loss_clip": 0.06400608, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06271262, + "balance_loss_mlp": 0.01255127, + "epoch": 0.9125507289944386, + "flos": 11690617651200.0, + "grad_norm": 2.0964544968020746, + "language_loss": 0.81507087, + "learning_rate": 7.963335108150926e-08, + "loss": 0.89172012, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09191895, + "step": 15178, + "time_per_iteration": 2.4541144371032715 + }, + { + "auxiliary_loss_clip": 0.06400141, + "auxiliary_loss_mlp": 0.01263487, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01254373, + "epoch": 0.9126108522471066, + "flos": 17754580506240.0, + "grad_norm": 1.9718139410424265, + "language_loss": 0.7923696, + "learning_rate": 7.952458331306711e-08, + "loss": 0.86900592, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09112549, + "step": 15179, + "time_per_iteration": 2.4755301475524902 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01254519, + "epoch": 0.9126709754997745, + "flos": 27643039269120.0, + "grad_norm": 1.5444069929332227, + "language_loss": 0.68083477, + "learning_rate": 7.941588836924507e-08, + "loss": 0.75745833, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08935547, + "step": 15180, + "time_per_iteration": 3.9795782566070557 + }, + { + "auxiliary_loss_clip": 0.06395189, + "auxiliary_loss_mlp": 0.01265637, + "balance_loss_clip": 0.0626757, + "balance_loss_mlp": 0.01257203, + "epoch": 0.9127310987524425, + "flos": 15930520974720.0, + "grad_norm": 1.7977625815153482, + "language_loss": 0.75159156, + "learning_rate": 7.930726625416495e-08, + "loss": 0.82819986, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08428955, + "step": 15181, + "time_per_iteration": 2.493853807449341 + }, + { + "auxiliary_loss_clip": 0.06406903, + "auxiliary_loss_mlp": 0.0126666, + "balance_loss_clip": 0.06270269, + "balance_loss_mlp": 0.01257296, + "epoch": 0.9127912220051104, + "flos": 21542207817600.0, + "grad_norm": 1.6739957519158306, + "language_loss": 0.7473678, + "learning_rate": 7.919871697194614e-08, + "loss": 0.82410347, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09375, + "step": 15182, + "time_per_iteration": 2.5281310081481934 + }, + { + "auxiliary_loss_clip": 0.06404476, + "auxiliary_loss_mlp": 0.01262375, + "balance_loss_clip": 0.06270052, + "balance_loss_mlp": 0.01252439, + "epoch": 0.9128513452577784, + "flos": 24070837605120.0, + "grad_norm": 1.3928021431516506, + "language_loss": 0.76586825, + "learning_rate": 7.909024052670421e-08, + "loss": 0.84253675, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09936523, + "step": 15183, + "time_per_iteration": 2.549593448638916 + }, + { + "auxiliary_loss_clip": 0.06403659, + "auxiliary_loss_mlp": 0.01266055, + "balance_loss_clip": 0.06268917, + "balance_loss_mlp": 0.0125628, + "epoch": 0.9129114685104465, + "flos": 16221989802240.0, + "grad_norm": 2.1106683874916925, + "language_loss": 0.76460809, + "learning_rate": 7.898183692255256e-08, + "loss": 0.8413052, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09765625, + "step": 15184, + "time_per_iteration": 2.4702370166778564 + }, + { + "auxiliary_loss_clip": 0.06401117, + "auxiliary_loss_mlp": 0.01265712, + "balance_loss_clip": 0.06270198, + "balance_loss_mlp": 0.01256283, + "epoch": 0.9129715917631144, + "flos": 19389349664640.0, + "grad_norm": 1.6484733671686076, + "language_loss": 0.74769634, + "learning_rate": 7.887350616360233e-08, + "loss": 0.8243646, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09423828, + "step": 15185, + "time_per_iteration": 2.492485761642456 + }, + { + "auxiliary_loss_clip": 0.06400957, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.0627048, + "balance_loss_mlp": 0.01256166, + "epoch": 0.9130317150157824, + "flos": 20595992785920.0, + "grad_norm": 2.006751528269808, + "language_loss": 0.68653584, + "learning_rate": 7.876524825396158e-08, + "loss": 0.76320457, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09741211, + "step": 15186, + "time_per_iteration": 2.485649347305298 + }, + { + "auxiliary_loss_clip": 0.06410342, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.06271516, + "balance_loss_mlp": 0.01253096, + "epoch": 0.9130918382684503, + "flos": 20194714782720.0, + "grad_norm": 1.795742988224212, + "language_loss": 0.77302891, + "learning_rate": 7.865706319773502e-08, + "loss": 0.84976149, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09820557, + "step": 15187, + "time_per_iteration": 2.4841318130493164 + }, + { + "auxiliary_loss_clip": 0.06398897, + "auxiliary_loss_mlp": 0.01263601, + "balance_loss_clip": 0.06267929, + "balance_loss_mlp": 0.01254571, + "epoch": 0.9131519615211183, + "flos": 25563960236160.0, + "grad_norm": 2.105861883241293, + "language_loss": 0.66391146, + "learning_rate": 7.854895099902515e-08, + "loss": 0.74053645, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.090271, + "step": 15188, + "time_per_iteration": 4.009814023971558 + }, + { + "auxiliary_loss_clip": 0.06398279, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06269646, + "balance_loss_mlp": 0.01256492, + "epoch": 0.9132120847737862, + "flos": 17937414115200.0, + "grad_norm": 1.9445407212493928, + "language_loss": 0.76366603, + "learning_rate": 7.844091166193157e-08, + "loss": 0.84031004, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09631348, + "step": 15189, + "time_per_iteration": 2.485355854034424 + }, + { + "auxiliary_loss_clip": 0.06399502, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.0125573, + "epoch": 0.9132722080264543, + "flos": 20053822942080.0, + "grad_norm": 1.763084249703843, + "language_loss": 0.76183271, + "learning_rate": 7.8332945190551e-08, + "loss": 0.83847034, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08532715, + "step": 15190, + "time_per_iteration": 2.51477313041687 + }, + { + "auxiliary_loss_clip": 0.06304602, + "auxiliary_loss_mlp": 0.01248492, + "balance_loss_clip": 0.06250489, + "balance_loss_mlp": 0.01247529, + "epoch": 0.9133323312791222, + "flos": 70461603498240.0, + "grad_norm": 0.69994498946902, + "language_loss": 0.57092154, + "learning_rate": 7.822505158897797e-08, + "loss": 0.64645249, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00962067, + "step": 15191, + "time_per_iteration": 3.1387722492218018 + }, + { + "auxiliary_loss_clip": 0.06404773, + "auxiliary_loss_mlp": 0.01266029, + "balance_loss_clip": 0.06270269, + "balance_loss_mlp": 0.01256611, + "epoch": 0.9133924545317902, + "flos": 25490851948800.0, + "grad_norm": 1.8893008015714516, + "language_loss": 0.74291134, + "learning_rate": 7.81172308613034e-08, + "loss": 0.8196193, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09417725, + "step": 15192, + "time_per_iteration": 2.5557541847229004 + }, + { + "auxiliary_loss_clip": 0.06398205, + "auxiliary_loss_mlp": 0.01265254, + "balance_loss_clip": 0.06269048, + "balance_loss_mlp": 0.01255693, + "epoch": 0.9134525777844581, + "flos": 39939920737920.0, + "grad_norm": 1.5014180075629815, + "language_loss": 0.6911993, + "learning_rate": 7.800948301161647e-08, + "loss": 0.76783395, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09558105, + "step": 15193, + "time_per_iteration": 4.1078901290893555 + }, + { + "auxiliary_loss_clip": 0.0639585, + "auxiliary_loss_mlp": 0.01260777, + "balance_loss_clip": 0.06267818, + "balance_loss_mlp": 0.01251891, + "epoch": 0.9135127010371261, + "flos": 20893037909760.0, + "grad_norm": 1.7245818478003463, + "language_loss": 0.73219973, + "learning_rate": 7.790180804400215e-08, + "loss": 0.80876601, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08880615, + "step": 15194, + "time_per_iteration": 2.547111988067627 + }, + { + "auxiliary_loss_clip": 0.06405854, + "auxiliary_loss_mlp": 0.01268882, + "balance_loss_clip": 0.06268877, + "balance_loss_mlp": 0.01257956, + "epoch": 0.913572824289794, + "flos": 20819468424960.0, + "grad_norm": 1.8031874353131485, + "language_loss": 0.62096351, + "learning_rate": 7.779420596254383e-08, + "loss": 0.69771087, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10931396, + "step": 15195, + "time_per_iteration": 3.9417948722839355 + }, + { + "auxiliary_loss_clip": 0.06398496, + "auxiliary_loss_mlp": 0.01264512, + "balance_loss_clip": 0.06267463, + "balance_loss_mlp": 0.01255285, + "epoch": 0.913632947542462, + "flos": 25710470300160.0, + "grad_norm": 1.4279035452599953, + "language_loss": 0.7193073, + "learning_rate": 7.768667677132201e-08, + "loss": 0.79593736, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09222412, + "step": 15196, + "time_per_iteration": 2.551023483276367 + }, + { + "auxiliary_loss_clip": 0.06397398, + "auxiliary_loss_mlp": 0.01267249, + "balance_loss_clip": 0.06269406, + "balance_loss_mlp": 0.01258421, + "epoch": 0.9136930707951301, + "flos": 26293366028160.0, + "grad_norm": 1.6867538606308004, + "language_loss": 0.71241689, + "learning_rate": 7.757922047441411e-08, + "loss": 0.78906339, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 15197, + "time_per_iteration": 2.5550129413604736 + }, + { + "auxiliary_loss_clip": 0.06408559, + "auxiliary_loss_mlp": 0.01262566, + "balance_loss_clip": 0.06272875, + "balance_loss_mlp": 0.01252784, + "epoch": 0.913753194047798, + "flos": 22098590928000.0, + "grad_norm": 1.8710706746015826, + "language_loss": 0.78052139, + "learning_rate": 7.747183707589489e-08, + "loss": 0.85723269, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09783936, + "step": 15198, + "time_per_iteration": 2.5072240829467773 + }, + { + "auxiliary_loss_clip": 0.06394757, + "auxiliary_loss_mlp": 0.01263949, + "balance_loss_clip": 0.06267546, + "balance_loss_mlp": 0.01255193, + "epoch": 0.913813317300466, + "flos": 23594061473280.0, + "grad_norm": 1.2968049238366115, + "language_loss": 0.67974442, + "learning_rate": 7.736452657983616e-08, + "loss": 0.75633144, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08758545, + "step": 15199, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0640765, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 0.06274316, + "balance_loss_mlp": 0.01255145, + "epoch": 0.9138734405531339, + "flos": 28883993437440.0, + "grad_norm": 1.4924819881457518, + "language_loss": 0.676305, + "learning_rate": 7.725728899030714e-08, + "loss": 0.75302815, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09515381, + "step": 15200, + "time_per_iteration": 2.5669631958007812 + }, + { + "auxiliary_loss_clip": 0.06398766, + "auxiliary_loss_mlp": 0.01266384, + "balance_loss_clip": 0.0627182, + "balance_loss_mlp": 0.01257456, + "epoch": 0.9139335638058019, + "flos": 22827829011840.0, + "grad_norm": 1.5812565319228622, + "language_loss": 0.7186532, + "learning_rate": 7.715012431137435e-08, + "loss": 0.79530466, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.0892334, + "step": 15201, + "time_per_iteration": 2.5404951572418213 + }, + { + "auxiliary_loss_clip": 0.06400613, + "auxiliary_loss_mlp": 0.01260801, + "balance_loss_clip": 0.06268527, + "balance_loss_mlp": 0.01251843, + "epoch": 0.9139936870584698, + "flos": 18009977351040.0, + "grad_norm": 1.789758567160699, + "language_loss": 0.70331693, + "learning_rate": 7.704303254710165e-08, + "loss": 0.77993107, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08959961, + "step": 15202, + "time_per_iteration": 2.6106953620910645 + }, + { + "auxiliary_loss_clip": 0.06399814, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01255016, + "epoch": 0.9140538103111379, + "flos": 15818992790400.0, + "grad_norm": 1.8688438464961967, + "language_loss": 0.6666283, + "learning_rate": 7.693601370155001e-08, + "loss": 0.74327433, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09777832, + "step": 15203, + "time_per_iteration": 2.5028200149536133 + }, + { + "auxiliary_loss_clip": 0.06404755, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.01258315, + "epoch": 0.9141139335638058, + "flos": 23993704321920.0, + "grad_norm": 1.615125656411442, + "language_loss": 0.69094318, + "learning_rate": 7.682906777877751e-08, + "loss": 0.76766646, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0927124, + "step": 15204, + "time_per_iteration": 2.5456814765930176 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01256174, + "epoch": 0.9141740568164738, + "flos": 24031243751040.0, + "grad_norm": 1.933761420354856, + "language_loss": 0.60122651, + "learning_rate": 7.672219478283915e-08, + "loss": 0.67792773, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09729004, + "step": 15205, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.06395389, + "auxiliary_loss_mlp": 0.01264629, + "balance_loss_clip": 0.06268188, + "balance_loss_mlp": 0.01255611, + "epoch": 0.9142341800691417, + "flos": 27025958275200.0, + "grad_norm": 1.7761946490024947, + "language_loss": 0.81234074, + "learning_rate": 7.661539471778811e-08, + "loss": 0.88894093, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09014893, + "step": 15206, + "time_per_iteration": 2.5894620418548584 + }, + { + "auxiliary_loss_clip": 0.06404903, + "auxiliary_loss_mlp": 0.0126205, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01253056, + "epoch": 0.9142943033218097, + "flos": 20418735473280.0, + "grad_norm": 2.8299467191418333, + "language_loss": 0.74824673, + "learning_rate": 7.650866758767382e-08, + "loss": 0.8249163, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.08996582, + "step": 15207, + "time_per_iteration": 2.5086050033569336 + }, + { + "auxiliary_loss_clip": 0.06402467, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.06271173, + "balance_loss_mlp": 0.01254892, + "epoch": 0.9143544265744776, + "flos": 19761389792640.0, + "grad_norm": 1.4655535636017647, + "language_loss": 0.72923332, + "learning_rate": 7.640201339654373e-08, + "loss": 0.80590397, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09710693, + "step": 15208, + "time_per_iteration": 2.5494110584259033 + }, + { + "auxiliary_loss_clip": 0.06401486, + "auxiliary_loss_mlp": 0.01262111, + "balance_loss_clip": 0.06272633, + "balance_loss_mlp": 0.01253522, + "epoch": 0.9144145498271457, + "flos": 17171181653760.0, + "grad_norm": 2.2763772203960086, + "language_loss": 0.86370367, + "learning_rate": 7.629543214844237e-08, + "loss": 0.94033957, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0859375, + "step": 15209, + "time_per_iteration": 2.4788320064544678 + }, + { + "auxiliary_loss_clip": 0.06401129, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06271241, + "balance_loss_mlp": 0.01257269, + "epoch": 0.9144746730798137, + "flos": 23731766858880.0, + "grad_norm": 1.579155450029156, + "language_loss": 0.75406897, + "learning_rate": 7.618892384741093e-08, + "loss": 0.83074194, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08886719, + "step": 15210, + "time_per_iteration": 2.5567657947540283 + }, + { + "auxiliary_loss_clip": 0.06400596, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06268501, + "balance_loss_mlp": 0.01255122, + "epoch": 0.9145347963324816, + "flos": 25854842085120.0, + "grad_norm": 1.979200231812929, + "language_loss": 0.77927828, + "learning_rate": 7.6082488497488e-08, + "loss": 0.85592532, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08984375, + "step": 15211, + "time_per_iteration": 2.552198648452759 + }, + { + "auxiliary_loss_clip": 0.0640268, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01252629, + "epoch": 0.9145949195851496, + "flos": 19248457824000.0, + "grad_norm": 2.10166098094478, + "language_loss": 0.82732511, + "learning_rate": 7.597612610270986e-08, + "loss": 0.90397024, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09204102, + "step": 15212, + "time_per_iteration": 2.513986110687256 + }, + { + "auxiliary_loss_clip": 0.06398089, + "auxiliary_loss_mlp": 0.01264444, + "balance_loss_clip": 0.06269515, + "balance_loss_mlp": 0.01255665, + "epoch": 0.9146550428378175, + "flos": 18302284719360.0, + "grad_norm": 1.652995444238016, + "language_loss": 0.84054744, + "learning_rate": 7.586983666711022e-08, + "loss": 0.91717279, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08776855, + "step": 15213, + "time_per_iteration": 2.4883370399475098 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06270234, + "balance_loss_mlp": 0.01255261, + "epoch": 0.9147151660904855, + "flos": 20090481903360.0, + "grad_norm": 1.824328091244105, + "language_loss": 0.71026123, + "learning_rate": 7.576362019471894e-08, + "loss": 0.78692377, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09259033, + "step": 15214, + "time_per_iteration": 2.646428346633911 + }, + { + "auxiliary_loss_clip": 0.06405354, + "auxiliary_loss_mlp": 0.0126419, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.01254623, + "epoch": 0.9147752893431534, + "flos": 24395988574080.0, + "grad_norm": 1.704762447240634, + "language_loss": 0.63240612, + "learning_rate": 7.565747668956413e-08, + "loss": 0.70910156, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09564209, + "step": 15215, + "time_per_iteration": 2.53265643119812 + }, + { + "auxiliary_loss_clip": 0.06403671, + "auxiliary_loss_mlp": 0.01263526, + "balance_loss_clip": 0.0626839, + "balance_loss_mlp": 0.01253512, + "epoch": 0.9148354125958215, + "flos": 18156277779840.0, + "grad_norm": 2.2416131553032983, + "language_loss": 0.76165468, + "learning_rate": 7.555140615567058e-08, + "loss": 0.83832663, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10009766, + "step": 15216, + "time_per_iteration": 2.4794795513153076 + }, + { + "auxiliary_loss_clip": 0.0640347, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06272964, + "balance_loss_mlp": 0.0125951, + "epoch": 0.9148955358484894, + "flos": 23374233486720.0, + "grad_norm": 2.196642746611264, + "language_loss": 0.68317431, + "learning_rate": 7.544540859706062e-08, + "loss": 0.75990212, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0980835, + "step": 15217, + "time_per_iteration": 2.5035665035247803 + }, + { + "auxiliary_loss_clip": 0.06397339, + "auxiliary_loss_mlp": 0.01263699, + "balance_loss_clip": 0.06268431, + "balance_loss_mlp": 0.01254029, + "epoch": 0.9149556591011574, + "flos": 18082205170560.0, + "grad_norm": 1.8248039597500896, + "language_loss": 0.80576724, + "learning_rate": 7.533948401775347e-08, + "loss": 0.88237762, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09667969, + "step": 15218, + "time_per_iteration": 2.4810121059417725 + }, + { + "auxiliary_loss_clip": 0.06306933, + "auxiliary_loss_mlp": 0.01255386, + "balance_loss_clip": 0.06252693, + "balance_loss_mlp": 0.01254361, + "epoch": 0.9150157823538253, + "flos": 54602220240000.0, + "grad_norm": 0.8181156143430024, + "language_loss": 0.58716023, + "learning_rate": 7.523363242176595e-08, + "loss": 0.6627835, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01025391, + "step": 15219, + "time_per_iteration": 4.510970592498779 + }, + { + "auxiliary_loss_clip": 0.0639798, + "auxiliary_loss_mlp": 0.01263707, + "balance_loss_clip": 0.06269677, + "balance_loss_mlp": 0.01254683, + "epoch": 0.9150759056064933, + "flos": 17898616874880.0, + "grad_norm": 2.651595808916399, + "language_loss": 0.78293604, + "learning_rate": 7.512785381311216e-08, + "loss": 0.85955286, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.090271, + "step": 15220, + "time_per_iteration": 2.4863898754119873 + }, + { + "auxiliary_loss_clip": 0.06403407, + "auxiliary_loss_mlp": 0.01264138, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01254214, + "epoch": 0.9151360288591612, + "flos": 18078725226240.0, + "grad_norm": 1.7108553042471706, + "language_loss": 0.65879726, + "learning_rate": 7.50221481958031e-08, + "loss": 0.73547268, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09924316, + "step": 15221, + "time_per_iteration": 2.4642598628997803 + }, + { + "auxiliary_loss_clip": 0.06398383, + "auxiliary_loss_mlp": 0.0126148, + "balance_loss_clip": 0.0626786, + "balance_loss_mlp": 0.01252003, + "epoch": 0.9151961521118293, + "flos": 19360614913920.0, + "grad_norm": 1.718973391494924, + "language_loss": 0.84501803, + "learning_rate": 7.491651557384692e-08, + "loss": 0.92161667, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0947876, + "step": 15222, + "time_per_iteration": 2.471740245819092 + }, + { + "auxiliary_loss_clip": 0.06308072, + "auxiliary_loss_mlp": 0.0125194, + "balance_loss_clip": 0.06253721, + "balance_loss_mlp": 0.01250893, + "epoch": 0.9152562753644973, + "flos": 72167174956800.0, + "grad_norm": 0.7092684015563987, + "language_loss": 0.49536896, + "learning_rate": 7.481095595124953e-08, + "loss": 0.5709691, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01048279, + "step": 15223, + "time_per_iteration": 3.159543752670288 + }, + { + "auxiliary_loss_clip": 0.0640175, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.0627071, + "balance_loss_mlp": 0.01254488, + "epoch": 0.9153163986171652, + "flos": 20783270661120.0, + "grad_norm": 1.6312984984407164, + "language_loss": 0.72100401, + "learning_rate": 7.470546933201349e-08, + "loss": 0.79766577, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09942627, + "step": 15224, + "time_per_iteration": 2.497352361679077 + }, + { + "auxiliary_loss_clip": 0.06398828, + "auxiliary_loss_mlp": 0.01261128, + "balance_loss_clip": 0.06269911, + "balance_loss_mlp": 0.01252211, + "epoch": 0.9153765218698332, + "flos": 23046902311680.0, + "grad_norm": 1.8848265932846708, + "language_loss": 0.81092465, + "learning_rate": 7.460005572013895e-08, + "loss": 0.88752425, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0892334, + "step": 15225, + "time_per_iteration": 2.5618300437927246 + }, + { + "auxiliary_loss_clip": 0.06398889, + "auxiliary_loss_mlp": 0.0126229, + "balance_loss_clip": 0.06268218, + "balance_loss_mlp": 0.0125317, + "epoch": 0.9154366451225011, + "flos": 28999295055360.0, + "grad_norm": 1.3043395747962432, + "language_loss": 0.71588331, + "learning_rate": 7.44947151196238e-08, + "loss": 0.79249507, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 15226, + "time_per_iteration": 2.6222610473632812 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06268212, + "balance_loss_mlp": 0.01254456, + "epoch": 0.9154967683751691, + "flos": 22316029073280.0, + "grad_norm": 1.870267091222323, + "language_loss": 0.7535274, + "learning_rate": 7.43894475344613e-08, + "loss": 0.83017313, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09365845, + "step": 15227, + "time_per_iteration": 3.9587011337280273 + }, + { + "auxiliary_loss_clip": 0.06399345, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.0627011, + "balance_loss_mlp": 0.01255276, + "epoch": 0.915556891627837, + "flos": 24578360985600.0, + "grad_norm": 1.5200774095907186, + "language_loss": 0.74375439, + "learning_rate": 7.428425296864404e-08, + "loss": 0.82038689, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08630371, + "step": 15228, + "time_per_iteration": 2.5360701084136963 + }, + { + "auxiliary_loss_clip": 0.06402157, + "auxiliary_loss_mlp": 0.01265448, + "balance_loss_clip": 0.06272555, + "balance_loss_mlp": 0.01256287, + "epoch": 0.9156170148805051, + "flos": 22171363799040.0, + "grad_norm": 1.5117785921082858, + "language_loss": 0.72036177, + "learning_rate": 7.417913142616106e-08, + "loss": 0.79703784, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0916748, + "step": 15229, + "time_per_iteration": 2.5301578044891357 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06270942, + "balance_loss_mlp": 0.01256397, + "epoch": 0.915677138133173, + "flos": 20926552343040.0, + "grad_norm": 1.5178465460863502, + "language_loss": 0.83324695, + "learning_rate": 7.407408291099848e-08, + "loss": 0.90991908, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09936523, + "step": 15230, + "time_per_iteration": 2.4890830516815186 + }, + { + "auxiliary_loss_clip": 0.06398893, + "auxiliary_loss_mlp": 0.01261014, + "balance_loss_clip": 0.06271241, + "balance_loss_mlp": 0.01251907, + "epoch": 0.915737261385841, + "flos": 24350734569600.0, + "grad_norm": 2.140708224764665, + "language_loss": 0.83798474, + "learning_rate": 7.396910742713957e-08, + "loss": 0.9145838, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09112549, + "step": 15231, + "time_per_iteration": 2.5503671169281006 + }, + { + "auxiliary_loss_clip": 0.06395644, + "auxiliary_loss_mlp": 0.01262039, + "balance_loss_clip": 0.06266124, + "balance_loss_mlp": 0.01253051, + "epoch": 0.9157973846385089, + "flos": 26768758567680.0, + "grad_norm": 1.412460383804666, + "language_loss": 0.72348028, + "learning_rate": 7.386420497856516e-08, + "loss": 0.80005717, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08978271, + "step": 15232, + "time_per_iteration": 2.536257266998291 + }, + { + "auxiliary_loss_clip": 0.06403804, + "auxiliary_loss_mlp": 0.01263561, + "balance_loss_clip": 0.06271422, + "balance_loss_mlp": 0.01254436, + "epoch": 0.9158575078911769, + "flos": 18484657130880.0, + "grad_norm": 2.3550676100990775, + "language_loss": 0.6826663, + "learning_rate": 7.375937556925338e-08, + "loss": 0.75933993, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09124756, + "step": 15233, + "time_per_iteration": 4.011778831481934 + }, + { + "auxiliary_loss_clip": 0.064054, + "auxiliary_loss_mlp": 0.01265597, + "balance_loss_clip": 0.06272289, + "balance_loss_mlp": 0.01255769, + "epoch": 0.9159176311438448, + "flos": 21805403091840.0, + "grad_norm": 1.916334328828353, + "language_loss": 0.69990098, + "learning_rate": 7.365461920317861e-08, + "loss": 0.77661097, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09832764, + "step": 15234, + "time_per_iteration": 2.5241239070892334 + }, + { + "auxiliary_loss_clip": 0.06404121, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.06271881, + "balance_loss_mlp": 0.01253964, + "epoch": 0.9159777543965129, + "flos": 24789552001920.0, + "grad_norm": 1.6575192392751135, + "language_loss": 0.8802951, + "learning_rate": 7.354993588431391e-08, + "loss": 0.95697153, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09558105, + "step": 15235, + "time_per_iteration": 3.9579367637634277 + }, + { + "auxiliary_loss_clip": 0.06400644, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06269062, + "balance_loss_mlp": 0.01256987, + "epoch": 0.9160378776491809, + "flos": 26875800558720.0, + "grad_norm": 1.690257425906499, + "language_loss": 0.77583575, + "learning_rate": 7.344532561662853e-08, + "loss": 0.85250783, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09576416, + "step": 15236, + "time_per_iteration": 2.5500221252441406 + }, + { + "auxiliary_loss_clip": 0.06309772, + "auxiliary_loss_mlp": 0.01251276, + "balance_loss_clip": 0.06255564, + "balance_loss_mlp": 0.01250298, + "epoch": 0.9160980009018488, + "flos": 70598596124160.0, + "grad_norm": 0.6553616821648679, + "language_loss": 0.6221928, + "learning_rate": 7.334078840409019e-08, + "loss": 0.69780326, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00977325, + "step": 15237, + "time_per_iteration": 3.084401845932007 + }, + { + "auxiliary_loss_clip": 0.0640253, + "auxiliary_loss_mlp": 0.0126268, + "balance_loss_clip": 0.06270298, + "balance_loss_mlp": 0.01252827, + "epoch": 0.9161581241545168, + "flos": 16294846527360.0, + "grad_norm": 1.9192593491707206, + "language_loss": 0.75049806, + "learning_rate": 7.323632425066151e-08, + "loss": 0.82715011, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09863281, + "step": 15238, + "time_per_iteration": 2.4591023921966553 + }, + { + "auxiliary_loss_clip": 0.06403898, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.01253672, + "epoch": 0.9162182474071847, + "flos": 18443386195200.0, + "grad_norm": 1.5712034366167735, + "language_loss": 0.74555534, + "learning_rate": 7.313193316030464e-08, + "loss": 0.82222801, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0970459, + "step": 15239, + "time_per_iteration": 2.5155394077301025 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_clip": 0.06271224, + "balance_loss_mlp": 0.01259258, + "epoch": 0.9162783706598527, + "flos": 19172498497920.0, + "grad_norm": 1.883459603997045, + "language_loss": 0.63822246, + "learning_rate": 7.302761513697819e-08, + "loss": 0.71495509, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 15240, + "time_per_iteration": 2.5100462436676025 + }, + { + "auxiliary_loss_clip": 0.06401497, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06272344, + "balance_loss_mlp": 0.0125438, + "epoch": 0.9163384939125206, + "flos": 20419322451840.0, + "grad_norm": 1.7171261992686273, + "language_loss": 0.76934052, + "learning_rate": 7.292337018463746e-08, + "loss": 0.84598958, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.090271, + "step": 15241, + "time_per_iteration": 2.515197992324829 + }, + { + "auxiliary_loss_clip": 0.06415688, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.06273037, + "balance_loss_mlp": 0.01256236, + "epoch": 0.9163986171651887, + "flos": 19651957960320.0, + "grad_norm": 2.7440161948074984, + "language_loss": 0.68086845, + "learning_rate": 7.281919830723549e-08, + "loss": 0.75769967, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11193848, + "step": 15242, + "time_per_iteration": 2.5829575061798096 + }, + { + "auxiliary_loss_clip": 0.06399854, + "auxiliary_loss_mlp": 0.01263264, + "balance_loss_clip": 0.06268453, + "balance_loss_mlp": 0.01254204, + "epoch": 0.9164587404178566, + "flos": 12827967845760.0, + "grad_norm": 1.757331084176624, + "language_loss": 0.81106311, + "learning_rate": 7.271509950872334e-08, + "loss": 0.88769436, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09069824, + "step": 15243, + "time_per_iteration": 2.5732226371765137 + }, + { + "auxiliary_loss_clip": 0.06405694, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.0627118, + "balance_loss_mlp": 0.01256996, + "epoch": 0.9165188636705246, + "flos": 22315903292160.0, + "grad_norm": 3.9147017718887205, + "language_loss": 0.82610697, + "learning_rate": 7.261107379304721e-08, + "loss": 0.90283132, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09735107, + "step": 15244, + "time_per_iteration": 2.501309871673584 + }, + { + "auxiliary_loss_clip": 0.06406015, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06269778, + "balance_loss_mlp": 0.01255153, + "epoch": 0.9165789869231925, + "flos": 18229218359040.0, + "grad_norm": 2.4095610629063176, + "language_loss": 0.72487861, + "learning_rate": 7.250712116415214e-08, + "loss": 0.80158961, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0993042, + "step": 15245, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.064008, + "auxiliary_loss_mlp": 0.01263885, + "balance_loss_clip": 0.06269535, + "balance_loss_mlp": 0.01254885, + "epoch": 0.9166391101758605, + "flos": 13695414439680.0, + "grad_norm": 1.5418326168026033, + "language_loss": 0.74834359, + "learning_rate": 7.240324162598033e-08, + "loss": 0.82499039, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08990479, + "step": 15246, + "time_per_iteration": 2.4759280681610107 + }, + { + "auxiliary_loss_clip": 0.06401987, + "auxiliary_loss_mlp": 0.01264745, + "balance_loss_clip": 0.06271073, + "balance_loss_mlp": 0.01255065, + "epoch": 0.9166992334285284, + "flos": 17352380108160.0, + "grad_norm": 2.6245151033033802, + "language_loss": 0.75630188, + "learning_rate": 7.229943518247106e-08, + "loss": 0.83296925, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09680176, + "step": 15247, + "time_per_iteration": 2.50736927986145 + }, + { + "auxiliary_loss_clip": 0.06403103, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 0.06269411, + "balance_loss_mlp": 0.01254096, + "epoch": 0.9167593566811965, + "flos": 23737678571520.0, + "grad_norm": 1.734119816640847, + "language_loss": 0.76551712, + "learning_rate": 7.219570183756052e-08, + "loss": 0.84218323, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09405518, + "step": 15248, + "time_per_iteration": 2.5225977897644043 + }, + { + "auxiliary_loss_clip": 0.06402726, + "auxiliary_loss_mlp": 0.01267049, + "balance_loss_clip": 0.06269974, + "balance_loss_mlp": 0.01256589, + "epoch": 0.9168194799338644, + "flos": 27825537461760.0, + "grad_norm": 2.0530525588042634, + "language_loss": 0.739088, + "learning_rate": 7.209204159518178e-08, + "loss": 0.81578577, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10467529, + "step": 15249, + "time_per_iteration": 2.5675055980682373 + }, + { + "auxiliary_loss_clip": 0.06401356, + "auxiliary_loss_mlp": 0.01265318, + "balance_loss_clip": 0.06270999, + "balance_loss_mlp": 0.01256509, + "epoch": 0.9168796031865324, + "flos": 21722609658240.0, + "grad_norm": 1.9372290328284216, + "language_loss": 0.76030535, + "learning_rate": 7.198845445926616e-08, + "loss": 0.83697212, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08813477, + "step": 15250, + "time_per_iteration": 2.4735028743743896 + }, + { + "auxiliary_loss_clip": 0.06397949, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06268395, + "balance_loss_mlp": 0.01256193, + "epoch": 0.9169397264392004, + "flos": 23411185937280.0, + "grad_norm": 1.8107623073184385, + "language_loss": 0.76076829, + "learning_rate": 7.188494043374138e-08, + "loss": 0.83740526, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09545898, + "step": 15251, + "time_per_iteration": 2.6144092082977295 + }, + { + "auxiliary_loss_clip": 0.0640536, + "auxiliary_loss_mlp": 0.01263626, + "balance_loss_clip": 0.06273532, + "balance_loss_mlp": 0.01253958, + "epoch": 0.9169998496918683, + "flos": 23957716193280.0, + "grad_norm": 2.127162243234926, + "language_loss": 0.80199194, + "learning_rate": 7.178149952253298e-08, + "loss": 0.87868178, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09667969, + "step": 15252, + "time_per_iteration": 2.5656697750091553 + }, + { + "auxiliary_loss_clip": 0.0640313, + "auxiliary_loss_mlp": 0.0126583, + "balance_loss_clip": 0.06271911, + "balance_loss_mlp": 0.01256287, + "epoch": 0.9170599729445363, + "flos": 18338314775040.0, + "grad_norm": 1.430147384395712, + "language_loss": 0.77667707, + "learning_rate": 7.167813172956316e-08, + "loss": 0.85336667, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09539795, + "step": 15253, + "time_per_iteration": 2.5039689540863037 + }, + { + "auxiliary_loss_clip": 0.06402656, + "auxiliary_loss_mlp": 0.01263281, + "balance_loss_clip": 0.0627113, + "balance_loss_mlp": 0.01254513, + "epoch": 0.9171200961972042, + "flos": 22681528583040.0, + "grad_norm": 1.727297082986554, + "language_loss": 0.72871399, + "learning_rate": 7.157483705875256e-08, + "loss": 0.80537337, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08776855, + "step": 15254, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.06395872, + "auxiliary_loss_mlp": 0.01264189, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01254825, + "epoch": 0.9171802194498723, + "flos": 26725726696320.0, + "grad_norm": 1.4812567402844228, + "language_loss": 0.79206324, + "learning_rate": 7.14716155140167e-08, + "loss": 0.86866391, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09356689, + "step": 15255, + "time_per_iteration": 2.5765507221221924 + }, + { + "auxiliary_loss_clip": 0.064024, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06268662, + "balance_loss_mlp": 0.01260224, + "epoch": 0.9172403427025402, + "flos": 37898423061120.0, + "grad_norm": 1.872101049589666, + "language_loss": 0.68329966, + "learning_rate": 7.136846709927047e-08, + "loss": 0.76001668, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09082031, + "step": 15256, + "time_per_iteration": 2.6418230533599854 + }, + { + "auxiliary_loss_clip": 0.06400028, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01254976, + "epoch": 0.9173004659552082, + "flos": 17060743572480.0, + "grad_norm": 1.8283973623759848, + "language_loss": 0.84006357, + "learning_rate": 7.126539181842561e-08, + "loss": 0.91670096, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08734131, + "step": 15257, + "time_per_iteration": 2.5305137634277344 + }, + { + "auxiliary_loss_clip": 0.0639857, + "auxiliary_loss_mlp": 0.01263291, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01254678, + "epoch": 0.9173605892078761, + "flos": 22208358176640.0, + "grad_norm": 1.5204666136912515, + "language_loss": 0.77536505, + "learning_rate": 7.116238967539012e-08, + "loss": 0.85198367, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08618164, + "step": 15258, + "time_per_iteration": 2.5125315189361572 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01265531, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.01256227, + "epoch": 0.9174207124605441, + "flos": 16513248994560.0, + "grad_norm": 1.9960678800991773, + "language_loss": 0.78876376, + "learning_rate": 7.105946067406999e-08, + "loss": 0.86544091, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09307861, + "step": 15259, + "time_per_iteration": 3.941746950149536 + }, + { + "auxiliary_loss_clip": 0.06399495, + "auxiliary_loss_mlp": 0.01264365, + "balance_loss_clip": 0.06270274, + "balance_loss_mlp": 0.01255651, + "epoch": 0.917480835713212, + "flos": 24542582492160.0, + "grad_norm": 1.4851816549824022, + "language_loss": 0.76305032, + "learning_rate": 7.095660481836895e-08, + "loss": 0.8396889, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.0871582, + "step": 15260, + "time_per_iteration": 2.54323148727417 + }, + { + "auxiliary_loss_clip": 0.06400856, + "auxiliary_loss_mlp": 0.01262484, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01253311, + "epoch": 0.9175409589658801, + "flos": 20886036094080.0, + "grad_norm": 1.4569612276520922, + "language_loss": 0.61439729, + "learning_rate": 7.085382211218637e-08, + "loss": 0.69103068, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09179688, + "step": 15261, + "time_per_iteration": 2.519350290298462 + }, + { + "auxiliary_loss_clip": 0.063967, + "auxiliary_loss_mlp": 0.01261936, + "balance_loss_clip": 0.06268273, + "balance_loss_mlp": 0.01252865, + "epoch": 0.917601082218548, + "flos": 14280113030400.0, + "grad_norm": 1.8017934646848675, + "language_loss": 0.74208277, + "learning_rate": 7.075111255942002e-08, + "loss": 0.81866914, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09063721, + "step": 15262, + "time_per_iteration": 2.4770686626434326 + }, + { + "auxiliary_loss_clip": 0.06404866, + "auxiliary_loss_mlp": 0.01263429, + "balance_loss_clip": 0.06268941, + "balance_loss_mlp": 0.0125425, + "epoch": 0.917661205471216, + "flos": 19105301923200.0, + "grad_norm": 1.713441901458641, + "language_loss": 0.77938473, + "learning_rate": 7.064847616396496e-08, + "loss": 0.85606766, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09179688, + "step": 15263, + "time_per_iteration": 2.4721927642822266 + }, + { + "auxiliary_loss_clip": 0.06405415, + "auxiliary_loss_mlp": 0.01265853, + "balance_loss_clip": 0.06269035, + "balance_loss_mlp": 0.01256025, + "epoch": 0.917721328723884, + "flos": 21113075531520.0, + "grad_norm": 2.2981718419830894, + "language_loss": 0.75979543, + "learning_rate": 7.054591292971324e-08, + "loss": 0.83650815, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09832764, + "step": 15264, + "time_per_iteration": 2.5106306076049805 + }, + { + "auxiliary_loss_clip": 0.06397746, + "auxiliary_loss_mlp": 0.01263995, + "balance_loss_clip": 0.06265679, + "balance_loss_mlp": 0.01254398, + "epoch": 0.9177814519765519, + "flos": 21949439460480.0, + "grad_norm": 1.607338475004671, + "language_loss": 0.83605742, + "learning_rate": 7.044342286055394e-08, + "loss": 0.91267478, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09594727, + "step": 15265, + "time_per_iteration": 2.500438928604126 + }, + { + "auxiliary_loss_clip": 0.06404482, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06270517, + "balance_loss_mlp": 0.01256759, + "epoch": 0.9178415752292199, + "flos": 24212693767680.0, + "grad_norm": 1.4811768769102642, + "language_loss": 0.73341453, + "learning_rate": 7.034100596037306e-08, + "loss": 0.81013238, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10552979, + "step": 15266, + "time_per_iteration": 3.9415042400360107 + }, + { + "auxiliary_loss_clip": 0.06403729, + "auxiliary_loss_mlp": 0.01265804, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01256506, + "epoch": 0.9179016984818879, + "flos": 20047324250880.0, + "grad_norm": 1.5268706819082398, + "language_loss": 0.77726352, + "learning_rate": 7.023866223305486e-08, + "loss": 0.85395879, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09289551, + "step": 15267, + "time_per_iteration": 2.5025975704193115 + }, + { + "auxiliary_loss_clip": 0.06306774, + "auxiliary_loss_mlp": 0.0124874, + "balance_loss_clip": 0.06252508, + "balance_loss_mlp": 0.01247798, + "epoch": 0.9179618217345559, + "flos": 65577561511680.0, + "grad_norm": 0.7361853308076762, + "language_loss": 0.55530179, + "learning_rate": 7.013639168247975e-08, + "loss": 0.63085693, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00940704, + "step": 15268, + "time_per_iteration": 3.1551411151885986 + }, + { + "auxiliary_loss_clip": 0.0640334, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06272043, + "balance_loss_mlp": 0.01255224, + "epoch": 0.9180219449872238, + "flos": 21331016801280.0, + "grad_norm": 1.7178999838576712, + "language_loss": 0.76744187, + "learning_rate": 7.0034194312526e-08, + "loss": 0.84412068, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09313965, + "step": 15269, + "time_per_iteration": 2.5461537837982178 + }, + { + "auxiliary_loss_clip": 0.06400153, + "auxiliary_loss_mlp": 0.01268007, + "balance_loss_clip": 0.06269392, + "balance_loss_mlp": 0.01257689, + "epoch": 0.9180820682398918, + "flos": 41069137086720.0, + "grad_norm": 1.706172460230681, + "language_loss": 0.72979438, + "learning_rate": 6.993207012706936e-08, + "loss": 0.806476, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10321045, + "step": 15270, + "time_per_iteration": 2.6807196140289307 + }, + { + "auxiliary_loss_clip": 0.06395302, + "auxiliary_loss_mlp": 0.01268583, + "balance_loss_clip": 0.06266629, + "balance_loss_mlp": 0.01259571, + "epoch": 0.9181421914925597, + "flos": 28080179619840.0, + "grad_norm": 1.4631420859140687, + "language_loss": 0.79966378, + "learning_rate": 6.98300191299821e-08, + "loss": 0.8763026, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09008789, + "step": 15271, + "time_per_iteration": 2.6022467613220215 + }, + { + "auxiliary_loss_clip": 0.0640102, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06268465, + "balance_loss_mlp": 0.01254706, + "epoch": 0.9182023147452277, + "flos": 29177181273600.0, + "grad_norm": 1.8997922177263993, + "language_loss": 0.72772801, + "learning_rate": 6.972804132513355e-08, + "loss": 0.80437815, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09277344, + "step": 15272, + "time_per_iteration": 2.5741183757781982 + }, + { + "auxiliary_loss_clip": 0.06399629, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01257784, + "epoch": 0.9182624379978956, + "flos": 24067651150080.0, + "grad_norm": 1.823337092754064, + "language_loss": 0.72748905, + "learning_rate": 6.962613671639105e-08, + "loss": 0.80415225, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08911133, + "step": 15273, + "time_per_iteration": 3.989461898803711 + }, + { + "auxiliary_loss_clip": 0.06397839, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06272411, + "balance_loss_mlp": 0.01258454, + "epoch": 0.9183225612505637, + "flos": 23300035096320.0, + "grad_norm": 1.4793794409400558, + "language_loss": 0.74706221, + "learning_rate": 6.952430530761933e-08, + "loss": 0.82370985, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08477783, + "step": 15274, + "time_per_iteration": 2.520556688308716 + }, + { + "auxiliary_loss_clip": 0.06403947, + "auxiliary_loss_mlp": 0.0126299, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.012539, + "epoch": 0.9183826845032316, + "flos": 19615257072000.0, + "grad_norm": 1.5221375197874305, + "language_loss": 0.69075209, + "learning_rate": 6.942254710267902e-08, + "loss": 0.76742148, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09088135, + "step": 15275, + "time_per_iteration": 3.905719041824341 + }, + { + "auxiliary_loss_clip": 0.06398068, + "auxiliary_loss_mlp": 0.01264874, + "balance_loss_clip": 0.06269246, + "balance_loss_mlp": 0.01255296, + "epoch": 0.9184428077558996, + "flos": 18485034474240.0, + "grad_norm": 1.8827436840113005, + "language_loss": 0.72488761, + "learning_rate": 6.932086210542953e-08, + "loss": 0.80151707, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09576416, + "step": 15276, + "time_per_iteration": 2.485471248626709 + }, + { + "auxiliary_loss_clip": 0.06402228, + "auxiliary_loss_mlp": 0.01261956, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01253277, + "epoch": 0.9185029310085676, + "flos": 20747366386560.0, + "grad_norm": 1.7691227354314663, + "language_loss": 0.73457116, + "learning_rate": 6.921925031972642e-08, + "loss": 0.81121302, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08679199, + "step": 15277, + "time_per_iteration": 2.512688159942627 + }, + { + "auxiliary_loss_clip": 0.06307656, + "auxiliary_loss_mlp": 0.01251054, + "balance_loss_clip": 0.06253561, + "balance_loss_mlp": 0.01250129, + "epoch": 0.9185630542612355, + "flos": 68229641491200.0, + "grad_norm": 0.706284622540633, + "language_loss": 0.59206891, + "learning_rate": 6.91177117494226e-08, + "loss": 0.66765606, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00922394, + "step": 15278, + "time_per_iteration": 3.2377090454101562 + }, + { + "auxiliary_loss_clip": 0.06395192, + "auxiliary_loss_mlp": 0.01259779, + "balance_loss_clip": 0.06267422, + "balance_loss_mlp": 0.01251649, + "epoch": 0.9186231775139035, + "flos": 12244317431040.0, + "grad_norm": 1.7835726733368307, + "language_loss": 0.64503765, + "learning_rate": 6.901624639836879e-08, + "loss": 0.7215873, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08123779, + "step": 15279, + "time_per_iteration": 2.475576877593994 + }, + { + "auxiliary_loss_clip": 0.0631056, + "auxiliary_loss_mlp": 0.01249529, + "balance_loss_clip": 0.0625634, + "balance_loss_mlp": 0.01248621, + "epoch": 0.9186833007665715, + "flos": 63958739356800.0, + "grad_norm": 0.8219128410312971, + "language_loss": 0.60080945, + "learning_rate": 6.891485427041211e-08, + "loss": 0.67641032, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00904846, + "step": 15280, + "time_per_iteration": 3.119189977645874 + }, + { + "auxiliary_loss_clip": 0.06403612, + "auxiliary_loss_mlp": 0.01263581, + "balance_loss_clip": 0.06269744, + "balance_loss_mlp": 0.01253639, + "epoch": 0.9187434240192395, + "flos": 19980882362880.0, + "grad_norm": 1.890303690282995, + "language_loss": 0.70436323, + "learning_rate": 6.881353536939815e-08, + "loss": 0.78103518, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09942627, + "step": 15281, + "time_per_iteration": 2.531141996383667 + }, + { + "auxiliary_loss_clip": 0.06401566, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 0.06269042, + "balance_loss_mlp": 0.01253209, + "epoch": 0.9188035472719074, + "flos": 25234742344320.0, + "grad_norm": 1.9786800170515064, + "language_loss": 0.84562802, + "learning_rate": 6.871228969916831e-08, + "loss": 0.92227417, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09838867, + "step": 15282, + "time_per_iteration": 2.5332024097442627 + }, + { + "auxiliary_loss_clip": 0.06399123, + "auxiliary_loss_mlp": 0.01271317, + "balance_loss_clip": 0.06269504, + "balance_loss_mlp": 0.01261411, + "epoch": 0.9188636705245754, + "flos": 18411423062400.0, + "grad_norm": 2.0072759179217052, + "language_loss": 0.60563141, + "learning_rate": 6.861111726356194e-08, + "loss": 0.68233585, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09906006, + "step": 15283, + "time_per_iteration": 2.5034496784210205 + }, + { + "auxiliary_loss_clip": 0.06406576, + "auxiliary_loss_mlp": 0.0126769, + "balance_loss_clip": 0.0626885, + "balance_loss_mlp": 0.01257879, + "epoch": 0.9189237937772433, + "flos": 23775930760320.0, + "grad_norm": 1.7836030599883965, + "language_loss": 0.65816599, + "learning_rate": 6.851001806641554e-08, + "loss": 0.73490864, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09820557, + "step": 15284, + "time_per_iteration": 2.5270888805389404 + }, + { + "auxiliary_loss_clip": 0.06401928, + "auxiliary_loss_mlp": 0.01261516, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01252217, + "epoch": 0.9189839170299113, + "flos": 21220914136320.0, + "grad_norm": 1.9502901912071402, + "language_loss": 0.73604786, + "learning_rate": 6.840899211156292e-08, + "loss": 0.81268227, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09295654, + "step": 15285, + "time_per_iteration": 2.5270345211029053 + }, + { + "auxiliary_loss_clip": 0.063976, + "auxiliary_loss_mlp": 0.01263018, + "balance_loss_clip": 0.06268349, + "balance_loss_mlp": 0.01253726, + "epoch": 0.9190440402825792, + "flos": 16732993127040.0, + "grad_norm": 1.9982888502982128, + "language_loss": 0.72159714, + "learning_rate": 6.830803940283458e-08, + "loss": 0.79820335, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09295654, + "step": 15286, + "time_per_iteration": 2.4716579914093018 + }, + { + "auxiliary_loss_clip": 0.06399448, + "auxiliary_loss_mlp": 0.01263944, + "balance_loss_clip": 0.06268711, + "balance_loss_mlp": 0.01254229, + "epoch": 0.9191041635352473, + "flos": 23448012606720.0, + "grad_norm": 1.604320036693306, + "language_loss": 0.7369895, + "learning_rate": 6.820715994405945e-08, + "loss": 0.81362337, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0970459, + "step": 15287, + "time_per_iteration": 2.541874885559082 + }, + { + "auxiliary_loss_clip": 0.06403069, + "auxiliary_loss_mlp": 0.01265047, + "balance_loss_clip": 0.06271331, + "balance_loss_mlp": 0.01254968, + "epoch": 0.9191642867879152, + "flos": 18813581533440.0, + "grad_norm": 1.9153203073753247, + "language_loss": 0.65538442, + "learning_rate": 6.810635373906226e-08, + "loss": 0.73206556, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10070801, + "step": 15288, + "time_per_iteration": 2.48822021484375 + }, + { + "auxiliary_loss_clip": 0.06402881, + "auxiliary_loss_mlp": 0.01264206, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01254985, + "epoch": 0.9192244100405832, + "flos": 32169170540160.0, + "grad_norm": 2.0269640241218303, + "language_loss": 0.71110076, + "learning_rate": 6.800562079166549e-08, + "loss": 0.78777158, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09228516, + "step": 15289, + "time_per_iteration": 2.617255926132202 + }, + { + "auxiliary_loss_clip": 0.06402991, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06271317, + "balance_loss_mlp": 0.01255768, + "epoch": 0.9192845332932512, + "flos": 16362420445440.0, + "grad_norm": 1.8310281360833698, + "language_loss": 0.74637043, + "learning_rate": 6.790496110568921e-08, + "loss": 0.82305604, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09802246, + "step": 15290, + "time_per_iteration": 2.478506088256836 + }, + { + "auxiliary_loss_clip": 0.06398199, + "auxiliary_loss_mlp": 0.01262641, + "balance_loss_clip": 0.06270142, + "balance_loss_mlp": 0.01253968, + "epoch": 0.9193446565459191, + "flos": 26621661525120.0, + "grad_norm": 1.9398963623899734, + "language_loss": 0.719679, + "learning_rate": 6.78043746849506e-08, + "loss": 0.79628742, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08666992, + "step": 15291, + "time_per_iteration": 2.5524001121520996 + }, + { + "auxiliary_loss_clip": 0.06399632, + "auxiliary_loss_mlp": 0.01267484, + "balance_loss_clip": 0.06270288, + "balance_loss_mlp": 0.01258168, + "epoch": 0.9194047797985871, + "flos": 22498778828160.0, + "grad_norm": 1.543404805290079, + "language_loss": 0.71005565, + "learning_rate": 6.770386153326346e-08, + "loss": 0.78672683, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09313965, + "step": 15292, + "time_per_iteration": 2.6065571308135986 + }, + { + "auxiliary_loss_clip": 0.06402849, + "auxiliary_loss_mlp": 0.01263278, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01253068, + "epoch": 0.9194649030512551, + "flos": 25085171606400.0, + "grad_norm": 1.8067565930105831, + "language_loss": 0.73275411, + "learning_rate": 6.760342165443988e-08, + "loss": 0.80941534, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10205078, + "step": 15293, + "time_per_iteration": 2.61039662361145 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01256453, + "epoch": 0.9195250263039231, + "flos": 11915938080000.0, + "grad_norm": 1.8020463710370824, + "language_loss": 0.78330243, + "learning_rate": 6.750305505228837e-08, + "loss": 0.85995948, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09100342, + "step": 15294, + "time_per_iteration": 2.493028163909912 + }, + { + "auxiliary_loss_clip": 0.0640836, + "auxiliary_loss_mlp": 0.01268598, + "balance_loss_clip": 0.06273803, + "balance_loss_mlp": 0.01257929, + "epoch": 0.919585149556591, + "flos": 21840426898560.0, + "grad_norm": 1.44776982902165, + "language_loss": 0.77154565, + "learning_rate": 6.74027617306141e-08, + "loss": 0.84831524, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10662842, + "step": 15295, + "time_per_iteration": 2.553980588912964 + }, + { + "auxiliary_loss_clip": 0.06398003, + "auxiliary_loss_mlp": 0.01267619, + "balance_loss_clip": 0.062723, + "balance_loss_mlp": 0.01259066, + "epoch": 0.919645272809259, + "flos": 28191623950080.0, + "grad_norm": 3.7930778156513245, + "language_loss": 0.71295464, + "learning_rate": 6.730254169322114e-08, + "loss": 0.78961086, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08563232, + "step": 15296, + "time_per_iteration": 2.5601587295532227 + }, + { + "auxiliary_loss_clip": 0.06399599, + "auxiliary_loss_mlp": 0.0126506, + "balance_loss_clip": 0.06269962, + "balance_loss_mlp": 0.01255178, + "epoch": 0.9197053960619269, + "flos": 18338734045440.0, + "grad_norm": 1.87886497767534, + "language_loss": 0.75809079, + "learning_rate": 6.720239494390912e-08, + "loss": 0.83473742, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09881592, + "step": 15297, + "time_per_iteration": 2.5021512508392334 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01268368, + "balance_loss_clip": 0.06269927, + "balance_loss_mlp": 0.01259064, + "epoch": 0.9197655193145949, + "flos": 28190911190400.0, + "grad_norm": 1.8177051823695647, + "language_loss": 0.73887002, + "learning_rate": 6.710232148647676e-08, + "loss": 0.81556177, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09307861, + "step": 15298, + "time_per_iteration": 3.9610788822174072 + }, + { + "auxiliary_loss_clip": 0.06405745, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.0125527, + "epoch": 0.9198256425672628, + "flos": 17311234953600.0, + "grad_norm": 1.9682637509338687, + "language_loss": 0.79818356, + "learning_rate": 6.70023213247175e-08, + "loss": 0.87489468, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10089111, + "step": 15299, + "time_per_iteration": 2.5930144786834717 + }, + { + "auxiliary_loss_clip": 0.06398566, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.06269201, + "balance_loss_mlp": 0.01253994, + "epoch": 0.9198857658199309, + "flos": 17864347754880.0, + "grad_norm": 2.0170678317240185, + "language_loss": 0.63947648, + "learning_rate": 6.690239446242385e-08, + "loss": 0.71609354, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0914917, + "step": 15300, + "time_per_iteration": 2.491405487060547 + }, + { + "auxiliary_loss_clip": 0.06394336, + "auxiliary_loss_mlp": 0.01263452, + "balance_loss_clip": 0.06269766, + "balance_loss_mlp": 0.01255459, + "epoch": 0.9199458890725988, + "flos": 22134117859200.0, + "grad_norm": 1.6376619653433249, + "language_loss": 0.69386828, + "learning_rate": 6.680254090338545e-08, + "loss": 0.77044618, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.07989502, + "step": 15301, + "time_per_iteration": 2.517106056213379 + }, + { + "auxiliary_loss_clip": 0.06403923, + "auxiliary_loss_mlp": 0.01263436, + "balance_loss_clip": 0.06270855, + "balance_loss_mlp": 0.01253088, + "epoch": 0.9200060123252668, + "flos": 16039533536640.0, + "grad_norm": 1.8442828866072565, + "language_loss": 0.71317738, + "learning_rate": 6.670276065138814e-08, + "loss": 0.78985095, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10351562, + "step": 15302, + "time_per_iteration": 2.4811885356903076 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.0125375, + "epoch": 0.9200661355779348, + "flos": 26870853168000.0, + "grad_norm": 1.597458857738985, + "language_loss": 0.76678693, + "learning_rate": 6.660305371021579e-08, + "loss": 0.84345514, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09613037, + "step": 15303, + "time_per_iteration": 2.548341989517212 + }, + { + "auxiliary_loss_clip": 0.06402119, + "auxiliary_loss_mlp": 0.01266402, + "balance_loss_clip": 0.06271823, + "balance_loss_mlp": 0.01257068, + "epoch": 0.9201262588306027, + "flos": 12791686227840.0, + "grad_norm": 2.0394625643099435, + "language_loss": 0.87783742, + "learning_rate": 6.650342008365006e-08, + "loss": 0.95452261, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09332275, + "step": 15304, + "time_per_iteration": 2.4814488887786865 + }, + { + "auxiliary_loss_clip": 0.0641056, + "auxiliary_loss_mlp": 0.01268156, + "balance_loss_clip": 0.06273954, + "balance_loss_mlp": 0.01256724, + "epoch": 0.9201863820832707, + "flos": 20637934554240.0, + "grad_norm": 1.7672455563097456, + "language_loss": 0.77882159, + "learning_rate": 6.64038597754677e-08, + "loss": 0.85560876, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11413574, + "step": 15305, + "time_per_iteration": 2.5235755443573 + }, + { + "auxiliary_loss_clip": 0.06401099, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06268904, + "balance_loss_mlp": 0.01255975, + "epoch": 0.9202465053359387, + "flos": 26403007495680.0, + "grad_norm": 2.2842473577556497, + "language_loss": 0.81661773, + "learning_rate": 6.630437278944501e-08, + "loss": 0.89328027, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09179688, + "step": 15306, + "time_per_iteration": 3.9354968070983887 + }, + { + "auxiliary_loss_clip": 0.06398699, + "auxiliary_loss_mlp": 0.01265097, + "balance_loss_clip": 0.06270522, + "balance_loss_mlp": 0.01256305, + "epoch": 0.9203066285886067, + "flos": 10492737281280.0, + "grad_norm": 1.8746939053209624, + "language_loss": 0.72304678, + "learning_rate": 6.62049591293541e-08, + "loss": 0.79968476, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08789062, + "step": 15307, + "time_per_iteration": 2.4903953075408936 + }, + { + "auxiliary_loss_clip": 0.06403868, + "auxiliary_loss_mlp": 0.01262191, + "balance_loss_clip": 0.06270027, + "balance_loss_mlp": 0.01252726, + "epoch": 0.9203667518412746, + "flos": 19396770750720.0, + "grad_norm": 1.8214262025870762, + "language_loss": 0.786762, + "learning_rate": 6.610561879896526e-08, + "loss": 0.86342263, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09460449, + "step": 15308, + "time_per_iteration": 2.4916763305664062 + }, + { + "auxiliary_loss_clip": 0.06398311, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_clip": 0.06267833, + "balance_loss_mlp": 0.01252425, + "epoch": 0.9204268750939426, + "flos": 15930520974720.0, + "grad_norm": 1.810335481306463, + "language_loss": 0.77935588, + "learning_rate": 6.600635180204484e-08, + "loss": 0.85595536, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09210205, + "step": 15309, + "time_per_iteration": 2.4542508125305176 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01261859, + "balance_loss_clip": 0.0626944, + "balance_loss_mlp": 0.01252847, + "epoch": 0.9204869983466105, + "flos": 16477302792960.0, + "grad_norm": 1.7421035243048335, + "language_loss": 0.66452754, + "learning_rate": 6.590715814235781e-08, + "loss": 0.74115556, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09011841, + "step": 15310, + "time_per_iteration": 2.4991562366485596 + }, + { + "auxiliary_loss_clip": 0.06399545, + "auxiliary_loss_mlp": 0.01263459, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01253803, + "epoch": 0.9205471215992785, + "flos": 21544933075200.0, + "grad_norm": 1.6637113509144883, + "language_loss": 0.66279554, + "learning_rate": 6.580803782366495e-08, + "loss": 0.73942566, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09649658, + "step": 15311, + "time_per_iteration": 2.4965457916259766 + }, + { + "auxiliary_loss_clip": 0.0639765, + "auxiliary_loss_mlp": 0.01265166, + "balance_loss_clip": 0.06265511, + "balance_loss_mlp": 0.01255432, + "epoch": 0.9206072448519464, + "flos": 25012272954240.0, + "grad_norm": 1.8269618240158574, + "language_loss": 0.76250952, + "learning_rate": 6.570899084972503e-08, + "loss": 0.83913767, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09735107, + "step": 15312, + "time_per_iteration": 3.9788658618927 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.01268151, + "balance_loss_clip": 0.06270073, + "balance_loss_mlp": 0.01259199, + "epoch": 0.9206673681046145, + "flos": 20529047773440.0, + "grad_norm": 1.6388491370190603, + "language_loss": 0.79423517, + "learning_rate": 6.561001722429394e-08, + "loss": 0.87088692, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08959961, + "step": 15313, + "time_per_iteration": 2.4897162914276123 + }, + { + "auxiliary_loss_clip": 0.06402104, + "auxiliary_loss_mlp": 0.01262155, + "balance_loss_clip": 0.06269892, + "balance_loss_mlp": 0.01252368, + "epoch": 0.9207274913572824, + "flos": 20889222549120.0, + "grad_norm": 1.670329128161987, + "language_loss": 0.78675765, + "learning_rate": 6.55111169511251e-08, + "loss": 0.86340022, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09790039, + "step": 15314, + "time_per_iteration": 3.9323294162750244 + }, + { + "auxiliary_loss_clip": 0.06409357, + "auxiliary_loss_mlp": 0.01266101, + "balance_loss_clip": 0.06271656, + "balance_loss_mlp": 0.01255509, + "epoch": 0.9207876146099504, + "flos": 22714414110720.0, + "grad_norm": 1.7791309268152706, + "language_loss": 0.79277146, + "learning_rate": 6.541229003396864e-08, + "loss": 0.86952603, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10601807, + "step": 15315, + "time_per_iteration": 2.5845134258270264 + }, + { + "auxiliary_loss_clip": 0.06407665, + "auxiliary_loss_mlp": 0.01267885, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01257508, + "epoch": 0.9208477378626184, + "flos": 18511966362240.0, + "grad_norm": 1.9500495947335497, + "language_loss": 0.76453424, + "learning_rate": 6.531353647657156e-08, + "loss": 0.84128976, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1038208, + "step": 15316, + "time_per_iteration": 2.47459077835083 + }, + { + "auxiliary_loss_clip": 0.0640117, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06267554, + "balance_loss_mlp": 0.01254216, + "epoch": 0.9209078611152863, + "flos": 23005757157120.0, + "grad_norm": 1.5768988455786053, + "language_loss": 0.69479769, + "learning_rate": 6.521485628267931e-08, + "loss": 0.77144837, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09661865, + "step": 15317, + "time_per_iteration": 2.527420997619629 + }, + { + "auxiliary_loss_clip": 0.06401445, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.062697, + "balance_loss_mlp": 0.01255824, + "epoch": 0.9209679843679544, + "flos": 24068447763840.0, + "grad_norm": 1.5969618693252037, + "language_loss": 0.8386265, + "learning_rate": 6.511624945603378e-08, + "loss": 0.91529447, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09533691, + "step": 15318, + "time_per_iteration": 2.5386664867401123 + }, + { + "auxiliary_loss_clip": 0.06403956, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06273109, + "balance_loss_mlp": 0.01254422, + "epoch": 0.9210281076206223, + "flos": 13558505667840.0, + "grad_norm": 1.7973020316666544, + "language_loss": 0.85918063, + "learning_rate": 6.501771600037354e-08, + "loss": 0.93585461, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09020996, + "step": 15319, + "time_per_iteration": 2.5289907455444336 + }, + { + "auxiliary_loss_clip": 0.06306411, + "auxiliary_loss_mlp": 0.01248044, + "balance_loss_clip": 0.06252417, + "balance_loss_mlp": 0.01247074, + "epoch": 0.9210882308732903, + "flos": 71448292851840.0, + "grad_norm": 0.7592752330183857, + "language_loss": 0.56235629, + "learning_rate": 6.491925591943559e-08, + "loss": 0.63790083, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00968933, + "step": 15320, + "time_per_iteration": 3.1707842350006104 + }, + { + "auxiliary_loss_clip": 0.06406188, + "auxiliary_loss_mlp": 0.0126704, + "balance_loss_clip": 0.06270667, + "balance_loss_mlp": 0.0125655, + "epoch": 0.9211483541259582, + "flos": 18514020787200.0, + "grad_norm": 2.407910490278205, + "language_loss": 0.6486662, + "learning_rate": 6.482086921695384e-08, + "loss": 0.72539854, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10491943, + "step": 15321, + "time_per_iteration": 2.503638505935669 + }, + { + "auxiliary_loss_clip": 0.06396494, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 0.06272007, + "balance_loss_mlp": 0.01255263, + "epoch": 0.9212084773786262, + "flos": 23264927435520.0, + "grad_norm": 1.5551004297855493, + "language_loss": 0.71829319, + "learning_rate": 6.47225558966582e-08, + "loss": 0.79489976, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08901978, + "step": 15322, + "time_per_iteration": 2.5333313941955566 + }, + { + "auxiliary_loss_clip": 0.06396886, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06267932, + "balance_loss_mlp": 0.01257655, + "epoch": 0.9212686006312941, + "flos": 16295056162560.0, + "grad_norm": 1.6480851550140987, + "language_loss": 0.69842833, + "learning_rate": 6.462431596227725e-08, + "loss": 0.77506459, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09088135, + "step": 15323, + "time_per_iteration": 2.4778027534484863 + }, + { + "auxiliary_loss_clip": 0.06403235, + "auxiliary_loss_mlp": 0.01267314, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01256948, + "epoch": 0.9213287238839621, + "flos": 19790837303040.0, + "grad_norm": 1.9637834340414146, + "language_loss": 0.74995911, + "learning_rate": 6.452614941753597e-08, + "loss": 0.82666463, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.1036377, + "step": 15324, + "time_per_iteration": 2.488264322280884 + }, + { + "auxiliary_loss_clip": 0.06405512, + "auxiliary_loss_mlp": 0.01267457, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01257915, + "epoch": 0.92138884713663, + "flos": 21036361518720.0, + "grad_norm": 2.1445778052802327, + "language_loss": 0.71659297, + "learning_rate": 6.442805626615744e-08, + "loss": 0.79332268, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09539795, + "step": 15325, + "time_per_iteration": 2.496718406677246 + }, + { + "auxiliary_loss_clip": 0.06398599, + "auxiliary_loss_mlp": 0.01262694, + "balance_loss_clip": 0.06267601, + "balance_loss_mlp": 0.0125333, + "epoch": 0.9214489703892981, + "flos": 28595207940480.0, + "grad_norm": 1.4431088490493214, + "language_loss": 0.78559232, + "learning_rate": 6.433003651186109e-08, + "loss": 0.86220527, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09356689, + "step": 15326, + "time_per_iteration": 2.569300889968872 + }, + { + "auxiliary_loss_clip": 0.06409511, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275136, + "balance_loss_mlp": 0.01257391, + "epoch": 0.921509093641966, + "flos": 16366864711680.0, + "grad_norm": 3.0252741922568465, + "language_loss": 0.71586484, + "learning_rate": 6.42320901583635e-08, + "loss": 0.79263741, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10345459, + "step": 15327, + "time_per_iteration": 2.4783525466918945 + }, + { + "auxiliary_loss_clip": 0.0640553, + "auxiliary_loss_mlp": 0.01265754, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01255627, + "epoch": 0.921569216894634, + "flos": 26837632224000.0, + "grad_norm": 1.6779125016260046, + "language_loss": 0.78150362, + "learning_rate": 6.413421720937906e-08, + "loss": 0.85821646, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 15328, + "time_per_iteration": 2.540372371673584 + }, + { + "auxiliary_loss_clip": 0.06400491, + "auxiliary_loss_mlp": 0.01261732, + "balance_loss_clip": 0.06271934, + "balance_loss_mlp": 0.01253054, + "epoch": 0.921629340147302, + "flos": 24652140105600.0, + "grad_norm": 2.2635066688956957, + "language_loss": 0.71408528, + "learning_rate": 6.4036417668619e-08, + "loss": 0.79070753, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08679199, + "step": 15329, + "time_per_iteration": 2.533205986022949 + }, + { + "auxiliary_loss_clip": 0.06399167, + "auxiliary_loss_mlp": 0.01261949, + "balance_loss_clip": 0.06268907, + "balance_loss_mlp": 0.01253318, + "epoch": 0.9216894633999699, + "flos": 15092018766720.0, + "grad_norm": 1.8806450993945985, + "language_loss": 0.86950338, + "learning_rate": 6.393869153979192e-08, + "loss": 0.94611454, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08630371, + "step": 15330, + "time_per_iteration": 2.4801652431488037 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.01264793, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.0125512, + "epoch": 0.921749586652638, + "flos": 19209912145920.0, + "grad_norm": 2.21823378133338, + "language_loss": 0.76192427, + "learning_rate": 6.384103882660397e-08, + "loss": 0.83861536, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09674072, + "step": 15331, + "time_per_iteration": 2.484335422515869 + }, + { + "auxiliary_loss_clip": 0.0640348, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01254621, + "epoch": 0.9218097099053059, + "flos": 20528796211200.0, + "grad_norm": 1.4680320475819244, + "language_loss": 0.75768459, + "learning_rate": 6.374345953275794e-08, + "loss": 0.83436108, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09552002, + "step": 15332, + "time_per_iteration": 2.5472254753112793 + }, + { + "auxiliary_loss_clip": 0.06399745, + "auxiliary_loss_mlp": 0.012679, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01259242, + "epoch": 0.9218698331579739, + "flos": 17354518387200.0, + "grad_norm": 1.6404932332375755, + "language_loss": 0.7481606, + "learning_rate": 6.364595366195358e-08, + "loss": 0.82483709, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08657837, + "step": 15333, + "time_per_iteration": 2.5107102394104004 + }, + { + "auxiliary_loss_clip": 0.06310606, + "auxiliary_loss_mlp": 0.01248711, + "balance_loss_clip": 0.0625622, + "balance_loss_mlp": 0.01247726, + "epoch": 0.9219299564106418, + "flos": 61975717430400.0, + "grad_norm": 0.7759353424239996, + "language_loss": 0.52887559, + "learning_rate": 6.354852121788879e-08, + "loss": 0.60446876, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00983429, + "step": 15334, + "time_per_iteration": 3.109227180480957 + }, + { + "auxiliary_loss_clip": 0.06396239, + "auxiliary_loss_mlp": 0.01262699, + "balance_loss_clip": 0.06269791, + "balance_loss_mlp": 0.01254223, + "epoch": 0.9219900796633098, + "flos": 15706542211200.0, + "grad_norm": 1.7785905559381385, + "language_loss": 0.62691534, + "learning_rate": 6.345116220425839e-08, + "loss": 0.70350474, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.0847168, + "step": 15335, + "time_per_iteration": 2.5022456645965576 + }, + { + "auxiliary_loss_clip": 0.06401903, + "auxiliary_loss_mlp": 0.01266885, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01257539, + "epoch": 0.9220502029159777, + "flos": 24938996958720.0, + "grad_norm": 1.5764942536870223, + "language_loss": 0.71558487, + "learning_rate": 6.335387662475366e-08, + "loss": 0.79227275, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09338379, + "step": 15336, + "time_per_iteration": 2.5597825050354004 + }, + { + "auxiliary_loss_clip": 0.06400605, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06271902, + "balance_loss_mlp": 0.01254894, + "epoch": 0.9221103261686457, + "flos": 15672315018240.0, + "grad_norm": 1.803852700991986, + "language_loss": 0.72009486, + "learning_rate": 6.325666448306433e-08, + "loss": 0.79673529, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08544922, + "step": 15337, + "time_per_iteration": 3.9219424724578857 + }, + { + "auxiliary_loss_clip": 0.06308219, + "auxiliary_loss_mlp": 0.01248795, + "balance_loss_clip": 0.06254087, + "balance_loss_mlp": 0.0124781, + "epoch": 0.9221704494213137, + "flos": 67536643098240.0, + "grad_norm": 0.8647733027794, + "language_loss": 0.65245771, + "learning_rate": 6.31595257828763e-08, + "loss": 0.72802794, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00984955, + "step": 15338, + "time_per_iteration": 3.142150640487671 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01264427, + "balance_loss_clip": 0.06273044, + "balance_loss_mlp": 0.01255236, + "epoch": 0.9222305726739817, + "flos": 30234798708480.0, + "grad_norm": 1.6484364978205361, + "language_loss": 0.67409325, + "learning_rate": 6.306246052787289e-08, + "loss": 0.75077999, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09191895, + "step": 15339, + "time_per_iteration": 2.593411684036255 + }, + { + "auxiliary_loss_clip": 0.06400622, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06269693, + "balance_loss_mlp": 0.01254399, + "epoch": 0.9222906959266496, + "flos": 25344132249600.0, + "grad_norm": 1.7385628862396276, + "language_loss": 0.71863818, + "learning_rate": 6.296546872173513e-08, + "loss": 0.79527897, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09051514, + "step": 15340, + "time_per_iteration": 2.5827271938323975 + }, + { + "auxiliary_loss_clip": 0.064013, + "auxiliary_loss_mlp": 0.01266685, + "balance_loss_clip": 0.06271731, + "balance_loss_mlp": 0.01257506, + "epoch": 0.9223508191793176, + "flos": 27607260775680.0, + "grad_norm": 1.4559470665197816, + "language_loss": 0.70787621, + "learning_rate": 6.286855036814098e-08, + "loss": 0.78455609, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09179688, + "step": 15341, + "time_per_iteration": 2.69647479057312 + }, + { + "auxiliary_loss_clip": 0.06392725, + "auxiliary_loss_mlp": 0.01263032, + "balance_loss_clip": 0.06267273, + "balance_loss_mlp": 0.01254956, + "epoch": 0.9224109424319856, + "flos": 27314869553280.0, + "grad_norm": 1.5381458649062534, + "language_loss": 0.67303658, + "learning_rate": 6.277170547076571e-08, + "loss": 0.74959409, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08068848, + "step": 15342, + "time_per_iteration": 2.588177442550659 + }, + { + "auxiliary_loss_clip": 0.06401019, + "auxiliary_loss_mlp": 0.01262683, + "balance_loss_clip": 0.06269694, + "balance_loss_mlp": 0.01253474, + "epoch": 0.9224710656846535, + "flos": 48218152389120.0, + "grad_norm": 2.052024165680001, + "language_loss": 0.69629633, + "learning_rate": 6.26749340332815e-08, + "loss": 0.7729333, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09210205, + "step": 15343, + "time_per_iteration": 2.7706665992736816 + }, + { + "auxiliary_loss_clip": 0.063094, + "auxiliary_loss_mlp": 0.01249689, + "balance_loss_clip": 0.06255051, + "balance_loss_mlp": 0.01248708, + "epoch": 0.9225311889373216, + "flos": 66743814165120.0, + "grad_norm": 0.8019643704800373, + "language_loss": 0.51885521, + "learning_rate": 6.257823605935786e-08, + "loss": 0.59444606, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00980377, + "step": 15344, + "time_per_iteration": 3.30328631401062 + }, + { + "auxiliary_loss_clip": 0.06392275, + "auxiliary_loss_mlp": 0.01264218, + "balance_loss_clip": 0.06267268, + "balance_loss_mlp": 0.01255981, + "epoch": 0.9225913121899895, + "flos": 22277525322240.0, + "grad_norm": 1.572478644220583, + "language_loss": 0.70385808, + "learning_rate": 6.248161155266162e-08, + "loss": 0.78042299, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.0824585, + "step": 15345, + "time_per_iteration": 3.937687397003174 + }, + { + "auxiliary_loss_clip": 0.06402814, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06271317, + "balance_loss_mlp": 0.01257075, + "epoch": 0.9226514354426575, + "flos": 20088679040640.0, + "grad_norm": 2.342779825818367, + "language_loss": 0.77456373, + "learning_rate": 6.238506051685677e-08, + "loss": 0.85126495, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10223389, + "step": 15346, + "time_per_iteration": 2.569237232208252 + }, + { + "auxiliary_loss_clip": 0.0640711, + "auxiliary_loss_mlp": 0.01270235, + "balance_loss_clip": 0.06270187, + "balance_loss_mlp": 0.01259381, + "epoch": 0.9227115586953254, + "flos": 16076402133120.0, + "grad_norm": 6.129129283291578, + "language_loss": 0.76381576, + "learning_rate": 6.228858295560457e-08, + "loss": 0.84058923, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10845947, + "step": 15347, + "time_per_iteration": 2.54581618309021 + }, + { + "auxiliary_loss_clip": 0.06394706, + "auxiliary_loss_mlp": 0.01264003, + "balance_loss_clip": 0.06268799, + "balance_loss_mlp": 0.01255444, + "epoch": 0.9227716819479934, + "flos": 20451788709120.0, + "grad_norm": 1.6400744592090153, + "language_loss": 0.76745045, + "learning_rate": 6.219217887256367e-08, + "loss": 0.84403753, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08563232, + "step": 15348, + "time_per_iteration": 2.590552806854248 + }, + { + "auxiliary_loss_clip": 0.0640482, + "auxiliary_loss_mlp": 0.01263248, + "balance_loss_clip": 0.06270683, + "balance_loss_mlp": 0.01253097, + "epoch": 0.9228318052006613, + "flos": 25014033889920.0, + "grad_norm": 1.7903050543327186, + "language_loss": 0.68388069, + "learning_rate": 6.209584827138959e-08, + "loss": 0.76056135, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10150146, + "step": 15349, + "time_per_iteration": 2.5478007793426514 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01262275, + "balance_loss_clip": 0.06269504, + "balance_loss_mlp": 0.01253227, + "epoch": 0.9228919284533293, + "flos": 12682170541440.0, + "grad_norm": 2.1989132821719948, + "language_loss": 0.87228858, + "learning_rate": 6.199959115573495e-08, + "loss": 0.94892418, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09051514, + "step": 15350, + "time_per_iteration": 2.5094597339630127 + }, + { + "auxiliary_loss_clip": 0.06312097, + "auxiliary_loss_mlp": 0.01249183, + "balance_loss_clip": 0.06257882, + "balance_loss_mlp": 0.01248158, + "epoch": 0.9229520517059973, + "flos": 70005050928000.0, + "grad_norm": 0.7490449092962135, + "language_loss": 0.60287833, + "learning_rate": 6.190340752924994e-08, + "loss": 0.67849118, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01025391, + "step": 15351, + "time_per_iteration": 3.092261791229248 + }, + { + "auxiliary_loss_clip": 0.06403425, + "auxiliary_loss_mlp": 0.01263289, + "balance_loss_clip": 0.06269398, + "balance_loss_mlp": 0.01253901, + "epoch": 0.9230121749586653, + "flos": 14799166346880.0, + "grad_norm": 2.2204736454747493, + "language_loss": 0.77420902, + "learning_rate": 6.180729739558233e-08, + "loss": 0.85087621, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09393311, + "step": 15352, + "time_per_iteration": 3.931007146835327 + }, + { + "auxiliary_loss_clip": 0.064078, + "auxiliary_loss_mlp": 0.01262825, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01252758, + "epoch": 0.9230722982113332, + "flos": 22974003659520.0, + "grad_norm": 2.303163162043219, + "language_loss": 0.5970825, + "learning_rate": 6.171126075837585e-08, + "loss": 0.67378873, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10070801, + "step": 15353, + "time_per_iteration": 2.5389790534973145 + }, + { + "auxiliary_loss_clip": 0.06398928, + "auxiliary_loss_mlp": 0.01262823, + "balance_loss_clip": 0.06270197, + "balance_loss_mlp": 0.01253939, + "epoch": 0.9231324214640012, + "flos": 18557346147840.0, + "grad_norm": 1.5949625436453003, + "language_loss": 0.74683791, + "learning_rate": 6.161529762127293e-08, + "loss": 0.82345545, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08889771, + "step": 15354, + "time_per_iteration": 4.007373571395874 + }, + { + "auxiliary_loss_clip": 0.06408745, + "auxiliary_loss_mlp": 0.01265787, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.0125532, + "epoch": 0.9231925447166691, + "flos": 22087899532800.0, + "grad_norm": 1.884882701150637, + "language_loss": 0.65271533, + "learning_rate": 6.1519407987912e-08, + "loss": 0.72946066, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10473633, + "step": 15355, + "time_per_iteration": 2.5409066677093506 + }, + { + "auxiliary_loss_clip": 0.06397177, + "auxiliary_loss_mlp": 0.01263201, + "balance_loss_clip": 0.06271057, + "balance_loss_mlp": 0.01254028, + "epoch": 0.9232526679693371, + "flos": 26548259748480.0, + "grad_norm": 1.546790587862242, + "language_loss": 0.74723232, + "learning_rate": 6.142359186192947e-08, + "loss": 0.82383615, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09179688, + "step": 15356, + "time_per_iteration": 2.5545573234558105 + }, + { + "auxiliary_loss_clip": 0.06402813, + "auxiliary_loss_mlp": 0.01264241, + "balance_loss_clip": 0.0627003, + "balance_loss_mlp": 0.01254603, + "epoch": 0.9233127912220052, + "flos": 14761878480000.0, + "grad_norm": 1.6173539213907528, + "language_loss": 0.60903341, + "learning_rate": 6.132784924695844e-08, + "loss": 0.68570393, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09637451, + "step": 15357, + "time_per_iteration": 2.479755163192749 + }, + { + "auxiliary_loss_clip": 0.06403501, + "auxiliary_loss_mlp": 0.01264325, + "balance_loss_clip": 0.0626936, + "balance_loss_mlp": 0.01254848, + "epoch": 0.9233729144746731, + "flos": 25268298704640.0, + "grad_norm": 1.3619838972501352, + "language_loss": 0.70080173, + "learning_rate": 6.123218014662956e-08, + "loss": 0.77747995, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0947876, + "step": 15358, + "time_per_iteration": 2.5597140789031982 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.01262902, + "balance_loss_clip": 0.06269094, + "balance_loss_mlp": 0.01254063, + "epoch": 0.9234330377273411, + "flos": 27856368564480.0, + "grad_norm": 1.796399091870678, + "language_loss": 0.73676997, + "learning_rate": 6.113658456457104e-08, + "loss": 0.81339008, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08837891, + "step": 15359, + "time_per_iteration": 2.582848072052002 + }, + { + "auxiliary_loss_clip": 0.06400593, + "auxiliary_loss_mlp": 0.01263199, + "balance_loss_clip": 0.06269514, + "balance_loss_mlp": 0.01253847, + "epoch": 0.923493160980009, + "flos": 24615313436160.0, + "grad_norm": 1.8173722037046873, + "language_loss": 0.65140021, + "learning_rate": 6.104106250440732e-08, + "loss": 0.72803813, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09356689, + "step": 15360, + "time_per_iteration": 2.5897343158721924 + }, + { + "auxiliary_loss_clip": 0.06310426, + "auxiliary_loss_mlp": 0.01250329, + "balance_loss_clip": 0.06256235, + "balance_loss_mlp": 0.0124932, + "epoch": 0.923553284232677, + "flos": 67721656913280.0, + "grad_norm": 0.7579229937332289, + "language_loss": 0.5489769, + "learning_rate": 6.094561396976083e-08, + "loss": 0.62458444, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01008606, + "step": 15361, + "time_per_iteration": 3.076972723007202 + }, + { + "auxiliary_loss_clip": 0.0640441, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06269506, + "balance_loss_mlp": 0.01252404, + "epoch": 0.9236134074853449, + "flos": 18813246117120.0, + "grad_norm": 1.9671802371462084, + "language_loss": 0.70403993, + "learning_rate": 6.085023896425112e-08, + "loss": 0.78071576, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10760498, + "step": 15362, + "time_per_iteration": 2.5362637042999268 + }, + { + "auxiliary_loss_clip": 0.06406496, + "auxiliary_loss_mlp": 0.0126344, + "balance_loss_clip": 0.06270804, + "balance_loss_mlp": 0.01253278, + "epoch": 0.923673530738013, + "flos": 27789800895360.0, + "grad_norm": 1.3407454971691222, + "language_loss": 0.75910234, + "learning_rate": 6.075493749149463e-08, + "loss": 0.83580172, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10162354, + "step": 15363, + "time_per_iteration": 2.552292585372925 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06272069, + "balance_loss_mlp": 0.01256027, + "epoch": 0.9237336539906809, + "flos": 26804369352960.0, + "grad_norm": 1.950831388344252, + "language_loss": 0.83409828, + "learning_rate": 6.065970955510514e-08, + "loss": 0.91079104, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09423828, + "step": 15364, + "time_per_iteration": 2.556971549987793 + }, + { + "auxiliary_loss_clip": 0.06398296, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06268522, + "balance_loss_mlp": 0.01256631, + "epoch": 0.9237937772433489, + "flos": 23594648451840.0, + "grad_norm": 1.5023507773294924, + "language_loss": 0.68472719, + "learning_rate": 6.056455515869419e-08, + "loss": 0.7613607, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08422852, + "step": 15365, + "time_per_iteration": 2.525970935821533 + }, + { + "auxiliary_loss_clip": 0.06400183, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06269205, + "balance_loss_mlp": 0.0125546, + "epoch": 0.9238539004960168, + "flos": 26147736432000.0, + "grad_norm": 2.741191058954429, + "language_loss": 0.62701088, + "learning_rate": 6.046947430586913e-08, + "loss": 0.70366389, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09661865, + "step": 15366, + "time_per_iteration": 2.571578025817871 + }, + { + "auxiliary_loss_clip": 0.06403293, + "auxiliary_loss_mlp": 0.01261168, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01251261, + "epoch": 0.9239140237486848, + "flos": 21074152510080.0, + "grad_norm": 1.7815327579173699, + "language_loss": 0.74507236, + "learning_rate": 6.037446700023619e-08, + "loss": 0.82171696, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09899902, + "step": 15367, + "time_per_iteration": 2.5045971870422363 + }, + { + "auxiliary_loss_clip": 0.06390847, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06267439, + "balance_loss_mlp": 0.01255922, + "epoch": 0.9239741470013527, + "flos": 24614810311680.0, + "grad_norm": 1.8519512729741396, + "language_loss": 0.64742005, + "learning_rate": 6.027953324539759e-08, + "loss": 0.72397792, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.08996582, + "step": 15368, + "time_per_iteration": 2.544147491455078 + }, + { + "auxiliary_loss_clip": 0.06404577, + "auxiliary_loss_mlp": 0.01267709, + "balance_loss_clip": 0.06269414, + "balance_loss_mlp": 0.0125754, + "epoch": 0.9240342702540207, + "flos": 24725290320000.0, + "grad_norm": 1.790282394600615, + "language_loss": 0.74812615, + "learning_rate": 6.018467304495401e-08, + "loss": 0.82484901, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10168457, + "step": 15369, + "time_per_iteration": 2.524303913116455 + }, + { + "auxiliary_loss_clip": 0.06408086, + "auxiliary_loss_mlp": 0.01264607, + "balance_loss_clip": 0.06271143, + "balance_loss_mlp": 0.01253383, + "epoch": 0.9240943935066888, + "flos": 20856253167360.0, + "grad_norm": 1.8071530163307696, + "language_loss": 0.77047461, + "learning_rate": 6.008988640250145e-08, + "loss": 0.84720153, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.11230469, + "step": 15370, + "time_per_iteration": 2.513298988342285 + }, + { + "auxiliary_loss_clip": 0.06402336, + "auxiliary_loss_mlp": 0.01261917, + "balance_loss_clip": 0.0627064, + "balance_loss_mlp": 0.01252923, + "epoch": 0.9241545167593567, + "flos": 24469222642560.0, + "grad_norm": 2.0099399345355575, + "language_loss": 0.67316246, + "learning_rate": 5.999517332163528e-08, + "loss": 0.74980497, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08996582, + "step": 15371, + "time_per_iteration": 2.520193576812744 + }, + { + "auxiliary_loss_clip": 0.06306948, + "auxiliary_loss_mlp": 0.01251246, + "balance_loss_clip": 0.062529, + "balance_loss_mlp": 0.01250195, + "epoch": 0.9242146400120247, + "flos": 61847110212480.0, + "grad_norm": 0.7120628094396801, + "language_loss": 0.5773133, + "learning_rate": 5.99005338059464e-08, + "loss": 0.65289533, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01052094, + "step": 15372, + "time_per_iteration": 3.0978200435638428 + }, + { + "auxiliary_loss_clip": 0.06395283, + "auxiliary_loss_mlp": 0.01266989, + "balance_loss_clip": 0.06267901, + "balance_loss_mlp": 0.01258782, + "epoch": 0.9242747632646926, + "flos": 22053923902080.0, + "grad_norm": 1.7652087955090183, + "language_loss": 0.70249438, + "learning_rate": 5.98059678590237e-08, + "loss": 0.77911711, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08209229, + "step": 15373, + "time_per_iteration": 2.4996917247772217 + }, + { + "auxiliary_loss_clip": 0.06402817, + "auxiliary_loss_mlp": 0.01269313, + "balance_loss_clip": 0.06271312, + "balance_loss_mlp": 0.01259436, + "epoch": 0.9243348865173606, + "flos": 18484195933440.0, + "grad_norm": 2.6606321172292424, + "language_loss": 0.75800008, + "learning_rate": 5.971147548445299e-08, + "loss": 0.83472145, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09881592, + "step": 15374, + "time_per_iteration": 2.4819071292877197 + }, + { + "auxiliary_loss_clip": 0.06398623, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06267889, + "balance_loss_mlp": 0.01253738, + "epoch": 0.9243950097700285, + "flos": 23265556341120.0, + "grad_norm": 1.5989491973910335, + "language_loss": 0.6470179, + "learning_rate": 5.961705668581784e-08, + "loss": 0.72363025, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08874512, + "step": 15375, + "time_per_iteration": 2.511228561401367 + }, + { + "auxiliary_loss_clip": 0.06398005, + "auxiliary_loss_mlp": 0.01260851, + "balance_loss_clip": 0.06269285, + "balance_loss_mlp": 0.01251189, + "epoch": 0.9244551330226966, + "flos": 29756261640960.0, + "grad_norm": 2.2942145440392028, + "language_loss": 0.66584778, + "learning_rate": 5.952271146669829e-08, + "loss": 0.74243629, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09655762, + "step": 15376, + "time_per_iteration": 2.5877747535705566 + }, + { + "auxiliary_loss_clip": 0.06310389, + "auxiliary_loss_mlp": 0.01248316, + "balance_loss_clip": 0.06256086, + "balance_loss_mlp": 0.01247218, + "epoch": 0.9245152562753645, + "flos": 68885310090240.0, + "grad_norm": 0.6448904976403038, + "language_loss": 0.61183542, + "learning_rate": 5.94284398306717e-08, + "loss": 0.68742251, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01100159, + "step": 15377, + "time_per_iteration": 4.591358184814453 + }, + { + "auxiliary_loss_clip": 0.06397624, + "auxiliary_loss_mlp": 0.01264112, + "balance_loss_clip": 0.06267756, + "balance_loss_mlp": 0.01254254, + "epoch": 0.9245753795280325, + "flos": 21585575105280.0, + "grad_norm": 1.6098683920154133, + "language_loss": 0.74425101, + "learning_rate": 5.933424178131341e-08, + "loss": 0.82086837, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09851074, + "step": 15378, + "time_per_iteration": 2.498936653137207 + }, + { + "auxiliary_loss_clip": 0.06402528, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01254314, + "epoch": 0.9246355027807004, + "flos": 34504694593920.0, + "grad_norm": 1.8895065800436894, + "language_loss": 0.62142766, + "learning_rate": 5.924011732219503e-08, + "loss": 0.69809258, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09655762, + "step": 15379, + "time_per_iteration": 2.6365721225738525 + }, + { + "auxiliary_loss_clip": 0.06397697, + "auxiliary_loss_mlp": 0.01264574, + "balance_loss_clip": 0.06270209, + "balance_loss_mlp": 0.01255764, + "epoch": 0.9246956260333684, + "flos": 15958123695360.0, + "grad_norm": 1.9053224282223191, + "language_loss": 0.83903706, + "learning_rate": 5.914606645688591e-08, + "loss": 0.91565973, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08813477, + "step": 15380, + "time_per_iteration": 2.4695920944213867 + }, + { + "auxiliary_loss_clip": 0.06402585, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.0626857, + "balance_loss_mlp": 0.01254715, + "epoch": 0.9247557492860363, + "flos": 23375197808640.0, + "grad_norm": 1.384509137636546, + "language_loss": 0.7339139, + "learning_rate": 5.905208918895233e-08, + "loss": 0.81059313, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10614014, + "step": 15381, + "time_per_iteration": 2.534614086151123 + }, + { + "auxiliary_loss_clip": 0.0640035, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01253595, + "epoch": 0.9248158725387043, + "flos": 23046608822400.0, + "grad_norm": 1.680142462272489, + "language_loss": 0.78818119, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.86481124, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09057617, + "step": 15382, + "time_per_iteration": 2.5061895847320557 + }, + { + "auxiliary_loss_clip": 0.06401303, + "auxiliary_loss_mlp": 0.01264469, + "balance_loss_clip": 0.06268425, + "balance_loss_mlp": 0.01254354, + "epoch": 0.9248759957913724, + "flos": 22527974776320.0, + "grad_norm": 1.7961295169638432, + "language_loss": 0.74988508, + "learning_rate": 5.886435545946455e-08, + "loss": 0.82654279, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10107422, + "step": 15383, + "time_per_iteration": 2.581434488296509 + }, + { + "auxiliary_loss_clip": 0.06396997, + "auxiliary_loss_mlp": 0.01261141, + "balance_loss_clip": 0.06268598, + "balance_loss_mlp": 0.01252499, + "epoch": 0.9249361190440403, + "flos": 25454318768640.0, + "grad_norm": 1.566333672745091, + "language_loss": 0.75798136, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.83456272, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08642578, + "step": 15384, + "time_per_iteration": 2.55129075050354 + }, + { + "auxiliary_loss_clip": 0.0639509, + "auxiliary_loss_mlp": 0.01261598, + "balance_loss_clip": 0.06269214, + "balance_loss_mlp": 0.01252306, + "epoch": 0.9249962422967083, + "flos": 12382358232960.0, + "grad_norm": 2.143877935574221, + "language_loss": 0.66191006, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.73847699, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.09289551, + "step": 15385, + "time_per_iteration": 3.918323516845703 + }, + { + "auxiliary_loss_clip": 0.0639942, + "auxiliary_loss_mlp": 0.01268229, + "balance_loss_clip": 0.06270313, + "balance_loss_mlp": 0.01259204, + "epoch": 0.9250563655493762, + "flos": 22936003032960.0, + "grad_norm": 1.8903454338190138, + "language_loss": 0.80601746, + "learning_rate": 5.85833069345496e-08, + "loss": 0.88269401, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.090271, + "step": 15386, + "time_per_iteration": 2.52738094329834 + }, + { + "auxiliary_loss_clip": 0.06399529, + "auxiliary_loss_mlp": 0.01263013, + "balance_loss_clip": 0.0627138, + "balance_loss_mlp": 0.01253727, + "epoch": 0.9251164888020442, + "flos": 18484573276800.0, + "grad_norm": 1.9057906513931537, + "language_loss": 0.75911927, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.83574468, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09283447, + "step": 15387, + "time_per_iteration": 2.479053020477295 + }, + { + "auxiliary_loss_clip": 0.06396089, + "auxiliary_loss_mlp": 0.01263451, + "balance_loss_clip": 0.06269023, + "balance_loss_mlp": 0.01254779, + "epoch": 0.9251766120547121, + "flos": 33045505666560.0, + "grad_norm": 1.2958399719477445, + "language_loss": 0.70158648, + "learning_rate": 5.839630933893014e-08, + "loss": 0.77818191, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08660889, + "step": 15388, + "time_per_iteration": 2.7240984439849854 + }, + { + "auxiliary_loss_clip": 0.06403159, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06270151, + "balance_loss_mlp": 0.01256702, + "epoch": 0.9252367353073802, + "flos": 24394563054720.0, + "grad_norm": 1.6728291040294425, + "language_loss": 0.81795633, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.89464867, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09368896, + "step": 15389, + "time_per_iteration": 2.5441529750823975 + }, + { + "auxiliary_loss_clip": 0.06410511, + "auxiliary_loss_mlp": 0.01266804, + "balance_loss_clip": 0.06269868, + "balance_loss_mlp": 0.01256213, + "epoch": 0.9252968585600481, + "flos": 18922887584640.0, + "grad_norm": 1.643054722636028, + "language_loss": 0.79540706, + "learning_rate": 5.820960624653381e-08, + "loss": 0.87218022, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10577393, + "step": 15390, + "time_per_iteration": 3.9439857006073 + }, + { + "auxiliary_loss_clip": 0.06405532, + "auxiliary_loss_mlp": 0.01265289, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01255448, + "epoch": 0.9253569818127161, + "flos": 21731707825920.0, + "grad_norm": 1.8343388341488236, + "language_loss": 0.75466919, + "learning_rate": 5.811636514789597e-08, + "loss": 0.83137739, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09838867, + "step": 15391, + "time_per_iteration": 2.5190751552581787 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06268418, + "balance_loss_mlp": 0.01255937, + "epoch": 0.925417105065384, + "flos": 34248878478720.0, + "grad_norm": 2.6134750174735615, + "language_loss": 0.52719831, + "learning_rate": 5.80231976856802e-08, + "loss": 0.60386336, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09490967, + "step": 15392, + "time_per_iteration": 2.618853807449341 + }, + { + "auxiliary_loss_clip": 0.06401975, + "auxiliary_loss_mlp": 0.01263312, + "balance_loss_clip": 0.06268699, + "balance_loss_mlp": 0.01254097, + "epoch": 0.925477228318052, + "flos": 25966915320960.0, + "grad_norm": 1.5816032710587289, + "language_loss": 0.7732839, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.84993678, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09210205, + "step": 15393, + "time_per_iteration": 2.537705659866333 + }, + { + "auxiliary_loss_clip": 0.06396216, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06267455, + "balance_loss_mlp": 0.01256475, + "epoch": 0.9255373515707199, + "flos": 11843039427840.0, + "grad_norm": 1.996154441217668, + "language_loss": 0.69555247, + "learning_rate": 5.783708368464357e-08, + "loss": 0.77217472, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09539795, + "step": 15394, + "time_per_iteration": 3.9489758014678955 + }, + { + "auxiliary_loss_clip": 0.06405875, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01254965, + "epoch": 0.925597474823388, + "flos": 21440784049920.0, + "grad_norm": 1.656460677506419, + "language_loss": 0.73046553, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.80717206, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0980835, + "step": 15395, + "time_per_iteration": 2.6102516651153564 + }, + { + "auxiliary_loss_clip": 0.06395631, + "auxiliary_loss_mlp": 0.01263537, + "balance_loss_clip": 0.06268463, + "balance_loss_mlp": 0.01254513, + "epoch": 0.925657598076056, + "flos": 22864320264960.0, + "grad_norm": 1.8614827496346085, + "language_loss": 0.71563172, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.79222345, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09020996, + "step": 15396, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.06398199, + "auxiliary_loss_mlp": 0.01264777, + "balance_loss_clip": 0.06268467, + "balance_loss_mlp": 0.01254763, + "epoch": 0.9257177213287239, + "flos": 25711350768000.0, + "grad_norm": 1.848315648403689, + "language_loss": 0.87198037, + "learning_rate": 5.755846504448603e-08, + "loss": 0.94861013, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.10009766, + "step": 15397, + "time_per_iteration": 2.54464054107666 + }, + { + "auxiliary_loss_clip": 0.06308962, + "auxiliary_loss_mlp": 0.01250606, + "balance_loss_clip": 0.06255, + "balance_loss_mlp": 0.01249661, + "epoch": 0.9257778445813919, + "flos": 59610955501440.0, + "grad_norm": 0.7882354200342199, + "language_loss": 0.55162835, + "learning_rate": 5.746573947489586e-08, + "loss": 0.62722397, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00942993, + "step": 15398, + "time_per_iteration": 2.9914557933807373 + }, + { + "auxiliary_loss_clip": 0.06410329, + "auxiliary_loss_mlp": 0.01264914, + "balance_loss_clip": 0.06272009, + "balance_loss_mlp": 0.01254346, + "epoch": 0.9258379678340598, + "flos": 27716860316160.0, + "grad_norm": 1.6589961349835687, + "language_loss": 0.76505327, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.8418057, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10571289, + "step": 15399, + "time_per_iteration": 2.5598769187927246 + }, + { + "auxiliary_loss_clip": 0.06392607, + "auxiliary_loss_mlp": 0.01261184, + "balance_loss_clip": 0.06267655, + "balance_loss_mlp": 0.01252952, + "epoch": 0.9258980910867278, + "flos": 24870500645760.0, + "grad_norm": 1.4116581037404592, + "language_loss": 0.78297949, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.85951746, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08227539, + "step": 15400, + "time_per_iteration": 2.5266971588134766 + }, + { + "auxiliary_loss_clip": 0.06312054, + "auxiliary_loss_mlp": 0.01255899, + "balance_loss_clip": 0.06257905, + "balance_loss_mlp": 0.01254794, + "epoch": 0.9259582143393957, + "flos": 63153625800960.0, + "grad_norm": 0.7063959708054426, + "language_loss": 0.51333666, + "learning_rate": 5.718800474673946e-08, + "loss": 0.5890162, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01106262, + "step": 15401, + "time_per_iteration": 3.093920946121216 + }, + { + "auxiliary_loss_clip": 0.0639642, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01255997, + "epoch": 0.9260183375920638, + "flos": 24132835226880.0, + "grad_norm": 1.5775889664181235, + "language_loss": 0.82458878, + "learning_rate": 5.709557384259378e-08, + "loss": 0.90119737, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08447266, + "step": 15402, + "time_per_iteration": 2.5282785892486572 + }, + { + "auxiliary_loss_clip": 0.06307814, + "auxiliary_loss_mlp": 0.01254092, + "balance_loss_clip": 0.06253652, + "balance_loss_mlp": 0.01252993, + "epoch": 0.9260784608447317, + "flos": 63064863999360.0, + "grad_norm": 0.7161646458588573, + "language_loss": 0.51258361, + "learning_rate": 5.700321661357876e-08, + "loss": 0.58820271, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01101685, + "step": 15403, + "time_per_iteration": 3.221836566925049 + }, + { + "auxiliary_loss_clip": 0.0631336, + "auxiliary_loss_mlp": 0.01254044, + "balance_loss_clip": 0.06259177, + "balance_loss_mlp": 0.01253067, + "epoch": 0.9261385840973997, + "flos": 70607652364800.0, + "grad_norm": 0.6694714734059207, + "language_loss": 0.58772385, + "learning_rate": 5.69109330631965e-08, + "loss": 0.66339797, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00976562, + "step": 15404, + "time_per_iteration": 3.1927330493927 + }, + { + "auxiliary_loss_clip": 0.06401071, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01254673, + "epoch": 0.9261987073500676, + "flos": 20236111499520.0, + "grad_norm": 1.9818455249680897, + "language_loss": 0.71835959, + "learning_rate": 5.681872319494596e-08, + "loss": 0.79501259, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09558105, + "step": 15405, + "time_per_iteration": 2.499476432800293 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01268691, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01259065, + "epoch": 0.9262588306027356, + "flos": 20959563651840.0, + "grad_norm": 1.7250744191621226, + "language_loss": 0.69170922, + "learning_rate": 5.672658701232458e-08, + "loss": 0.76843774, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09625244, + "step": 15406, + "time_per_iteration": 2.5540614128112793 + }, + { + "auxiliary_loss_clip": 0.0640143, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06268954, + "balance_loss_mlp": 0.01253126, + "epoch": 0.9263189538554035, + "flos": 22164361983360.0, + "grad_norm": 2.1174818175534242, + "language_loss": 0.76692176, + "learning_rate": 5.663452451882555e-08, + "loss": 0.84356833, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10101318, + "step": 15407, + "time_per_iteration": 2.5082249641418457 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06271269, + "balance_loss_mlp": 0.0125613, + "epoch": 0.9263790771080715, + "flos": 18193146376320.0, + "grad_norm": 1.7688340349597225, + "language_loss": 0.72253478, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.79930449, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10516357, + "step": 15408, + "time_per_iteration": 2.5314793586730957 + }, + { + "auxiliary_loss_clip": 0.06398048, + "auxiliary_loss_mlp": 0.0126162, + "balance_loss_clip": 0.06270379, + "balance_loss_mlp": 0.01253442, + "epoch": 0.9264392003607396, + "flos": 48189501492480.0, + "grad_norm": 5.21505973276934, + "language_loss": 0.68691289, + "learning_rate": 5.645062061315675e-08, + "loss": 0.76350951, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08184814, + "step": 15409, + "time_per_iteration": 2.755697011947632 + }, + { + "auxiliary_loss_clip": 0.06404391, + "auxiliary_loss_mlp": 0.01267031, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.0125663, + "epoch": 0.9264993236134075, + "flos": 26395586409600.0, + "grad_norm": 1.7559130928965878, + "language_loss": 0.75985503, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.83656931, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10394287, + "step": 15410, + "time_per_iteration": 2.5520312786102295 + }, + { + "auxiliary_loss_clip": 0.06401296, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.06268159, + "balance_loss_mlp": 0.01253797, + "epoch": 0.9265594468660755, + "flos": 20925881510400.0, + "grad_norm": 1.552254697633523, + "language_loss": 0.82113504, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.89777941, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09344482, + "step": 15411, + "time_per_iteration": 2.5069782733917236 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01262961, + "balance_loss_clip": 0.06274098, + "balance_loss_mlp": 0.01253573, + "epoch": 0.9266195701187434, + "flos": 17529930910080.0, + "grad_norm": 1.7428936214869757, + "language_loss": 0.75701684, + "learning_rate": 5.617531751025728e-08, + "loss": 0.83371818, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09387207, + "step": 15412, + "time_per_iteration": 2.5123889446258545 + }, + { + "auxiliary_loss_clip": 0.06398541, + "auxiliary_loss_mlp": 0.0126566, + "balance_loss_clip": 0.06267709, + "balance_loss_mlp": 0.01256439, + "epoch": 0.9266796933714114, + "flos": 33696436510080.0, + "grad_norm": 1.5679043837553974, + "language_loss": 0.67275411, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.74939615, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09222412, + "step": 15413, + "time_per_iteration": 2.605947732925415 + }, + { + "auxiliary_loss_clip": 0.06403206, + "auxiliary_loss_mlp": 0.01262992, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.01253169, + "epoch": 0.9267398166240793, + "flos": 18922510241280.0, + "grad_norm": 1.6198376571408515, + "language_loss": 0.7588625, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.83552444, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09820557, + "step": 15414, + "time_per_iteration": 2.4856977462768555 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01264925, + "balance_loss_clip": 0.06270388, + "balance_loss_mlp": 0.01256246, + "epoch": 0.9267999398767474, + "flos": 20484129185280.0, + "grad_norm": 2.030820880788606, + "language_loss": 0.81923372, + "learning_rate": 5.59006777975819e-08, + "loss": 0.89587802, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08679199, + "step": 15415, + "time_per_iteration": 2.4929685592651367 + }, + { + "auxiliary_loss_clip": 0.06406386, + "auxiliary_loss_mlp": 0.01265515, + "balance_loss_clip": 0.06271857, + "balance_loss_mlp": 0.01255394, + "epoch": 0.9268600631294153, + "flos": 24796092620160.0, + "grad_norm": 1.5707213378789486, + "language_loss": 0.5453577, + "learning_rate": 5.580927866294671e-08, + "loss": 0.62207669, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10131836, + "step": 15416, + "time_per_iteration": 4.090368747711182 + }, + { + "auxiliary_loss_clip": 0.06395909, + "auxiliary_loss_mlp": 0.01263225, + "balance_loss_clip": 0.06268269, + "balance_loss_mlp": 0.01254302, + "epoch": 0.9269201863820833, + "flos": 18703059598080.0, + "grad_norm": 1.4326729115430334, + "language_loss": 0.72303391, + "learning_rate": 5.571795325221807e-08, + "loss": 0.79962528, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08917236, + "step": 15417, + "time_per_iteration": 2.492025136947632 + }, + { + "auxiliary_loss_clip": 0.06399834, + "auxiliary_loss_mlp": 0.012626, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.01253331, + "epoch": 0.9269803096347512, + "flos": 20930451557760.0, + "grad_norm": 1.915992557586703, + "language_loss": 0.75794625, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.83457053, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09265137, + "step": 15418, + "time_per_iteration": 2.500960111618042 + }, + { + "auxiliary_loss_clip": 0.06400837, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01256485, + "epoch": 0.9270404328874192, + "flos": 28010425495680.0, + "grad_norm": 1.4576581953985273, + "language_loss": 0.76502192, + "learning_rate": 5.553552361633174e-08, + "loss": 0.84168857, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09344482, + "step": 15419, + "time_per_iteration": 2.5978782176971436 + }, + { + "auxiliary_loss_clip": 0.06393886, + "auxiliary_loss_mlp": 0.01261694, + "balance_loss_clip": 0.0626778, + "balance_loss_mlp": 0.01253612, + "epoch": 0.9271005561400871, + "flos": 25897790102400.0, + "grad_norm": 1.5679935415739816, + "language_loss": 0.7624113, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.83896708, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08081055, + "step": 15420, + "time_per_iteration": 2.547791004180908 + }, + { + "auxiliary_loss_clip": 0.06403813, + "auxiliary_loss_mlp": 0.01264966, + "balance_loss_clip": 0.06269305, + "balance_loss_mlp": 0.01254708, + "epoch": 0.9271606793927551, + "flos": 27061443279360.0, + "grad_norm": 1.3671087136068567, + "language_loss": 0.76732445, + "learning_rate": 5.535338891759389e-08, + "loss": 0.84401226, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10266113, + "step": 15421, + "time_per_iteration": 2.579566717147827 + }, + { + "auxiliary_loss_clip": 0.06401263, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254958, + "epoch": 0.9272208026454232, + "flos": 26216442380160.0, + "grad_norm": 2.015466462348958, + "language_loss": 0.72872943, + "learning_rate": 5.526243217829041e-08, + "loss": 0.80538422, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0925293, + "step": 15422, + "time_per_iteration": 2.556781530380249 + }, + { + "auxiliary_loss_clip": 0.06401004, + "auxiliary_loss_mlp": 0.01263615, + "balance_loss_clip": 0.06268564, + "balance_loss_mlp": 0.01254305, + "epoch": 0.9272809258980911, + "flos": 12463348803840.0, + "grad_norm": 1.9568135682925627, + "language_loss": 0.77870274, + "learning_rate": 5.517154918363065e-08, + "loss": 0.85534894, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09307861, + "step": 15423, + "time_per_iteration": 2.54386568069458 + }, + { + "auxiliary_loss_clip": 0.06402774, + "auxiliary_loss_mlp": 0.01262642, + "balance_loss_clip": 0.06267941, + "balance_loss_mlp": 0.01252688, + "epoch": 0.9273410491507591, + "flos": 22863523651200.0, + "grad_norm": 1.9393896166418776, + "language_loss": 0.75592458, + "learning_rate": 5.508073993706053e-08, + "loss": 0.83257878, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09954834, + "step": 15424, + "time_per_iteration": 3.950807571411133 + }, + { + "auxiliary_loss_clip": 0.06308335, + "auxiliary_loss_mlp": 0.01251168, + "balance_loss_clip": 0.06254116, + "balance_loss_mlp": 0.01250244, + "epoch": 0.927401172403427, + "flos": 47681963383680.0, + "grad_norm": 0.7629522595192675, + "language_loss": 0.60162652, + "learning_rate": 5.499000444202351e-08, + "loss": 0.67722148, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00921631, + "step": 15425, + "time_per_iteration": 2.9016902446746826 + }, + { + "auxiliary_loss_clip": 0.06402518, + "auxiliary_loss_mlp": 0.01261675, + "balance_loss_clip": 0.06271154, + "balance_loss_mlp": 0.01252585, + "epoch": 0.927461295656095, + "flos": 29980324258560.0, + "grad_norm": 1.366559565689854, + "language_loss": 0.71148986, + "learning_rate": 5.489934270196106e-08, + "loss": 0.78813183, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09088135, + "step": 15426, + "time_per_iteration": 2.61396861076355 + }, + { + "auxiliary_loss_clip": 0.06402343, + "auxiliary_loss_mlp": 0.01262233, + "balance_loss_clip": 0.0627178, + "balance_loss_mlp": 0.01253388, + "epoch": 0.9275214189087629, + "flos": 20381573387520.0, + "grad_norm": 1.8238747923679495, + "language_loss": 0.83321905, + "learning_rate": 5.480875472030977e-08, + "loss": 0.9098649, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08843994, + "step": 15427, + "time_per_iteration": 2.533583641052246 + }, + { + "auxiliary_loss_clip": 0.06399953, + "auxiliary_loss_mlp": 0.01264957, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01255158, + "epoch": 0.927581542161431, + "flos": 22389850120320.0, + "grad_norm": 1.562228354740854, + "language_loss": 0.77034312, + "learning_rate": 5.471824050050555e-08, + "loss": 0.84699225, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09802246, + "step": 15428, + "time_per_iteration": 2.5238113403320312 + }, + { + "auxiliary_loss_clip": 0.0640026, + "auxiliary_loss_mlp": 0.01264868, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.01255528, + "epoch": 0.9276416654140989, + "flos": 23959435201920.0, + "grad_norm": 1.7264807975252925, + "language_loss": 0.7457782, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.82242942, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09338379, + "step": 15429, + "time_per_iteration": 2.5195233821868896 + }, + { + "auxiliary_loss_clip": 0.06396069, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.06268522, + "balance_loss_mlp": 0.01255723, + "epoch": 0.9277017886667669, + "flos": 13922831220480.0, + "grad_norm": 1.7362302718251208, + "language_loss": 0.75345081, + "learning_rate": 5.45374333601647e-08, + "loss": 0.83005834, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08959961, + "step": 15430, + "time_per_iteration": 3.9254066944122314 + }, + { + "auxiliary_loss_clip": 0.06402864, + "auxiliary_loss_mlp": 0.01262331, + "balance_loss_clip": 0.06269671, + "balance_loss_mlp": 0.01252478, + "epoch": 0.9277619119194348, + "flos": 35675768856960.0, + "grad_norm": 1.3448855002348141, + "language_loss": 0.76524234, + "learning_rate": 5.444714044648391e-08, + "loss": 0.84189427, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09851074, + "step": 15431, + "time_per_iteration": 2.6647591590881348 + }, + { + "auxiliary_loss_clip": 0.06399286, + "auxiliary_loss_mlp": 0.01265502, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.0125649, + "epoch": 0.9278220351721028, + "flos": 23847907017600.0, + "grad_norm": 1.598032669675074, + "language_loss": 0.70804644, + "learning_rate": 5.4356921308363e-08, + "loss": 0.78469431, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09014893, + "step": 15432, + "time_per_iteration": 2.521979808807373 + }, + { + "auxiliary_loss_clip": 0.06401653, + "auxiliary_loss_mlp": 0.01268277, + "balance_loss_clip": 0.06268461, + "balance_loss_mlp": 0.01258746, + "epoch": 0.9278821584247707, + "flos": 15232952534400.0, + "grad_norm": 3.130753679955256, + "language_loss": 0.83228093, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.90898025, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09533691, + "step": 15433, + "time_per_iteration": 3.868227481842041 + }, + { + "auxiliary_loss_clip": 0.06392471, + "auxiliary_loss_mlp": 0.01264408, + "balance_loss_clip": 0.06267262, + "balance_loss_mlp": 0.01256147, + "epoch": 0.9279422816774388, + "flos": 24688379796480.0, + "grad_norm": 1.704558942323815, + "language_loss": 0.67013133, + "learning_rate": 5.417670437248056e-08, + "loss": 0.74670017, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08258057, + "step": 15434, + "time_per_iteration": 2.5150067806243896 + }, + { + "auxiliary_loss_clip": 0.06390243, + "auxiliary_loss_mlp": 0.01261235, + "balance_loss_clip": 0.06267539, + "balance_loss_mlp": 0.01252938, + "epoch": 0.9280024049301068, + "flos": 19174762558080.0, + "grad_norm": 1.6939832412088915, + "language_loss": 0.68807113, + "learning_rate": 5.40867065815529e-08, + "loss": 0.76458597, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.08300781, + "step": 15435, + "time_per_iteration": 2.5746238231658936 + }, + { + "auxiliary_loss_clip": 0.06400537, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06268658, + "balance_loss_mlp": 0.01254757, + "epoch": 0.9280625281827747, + "flos": 11397304033920.0, + "grad_norm": 1.8675874882503214, + "language_loss": 0.72116661, + "learning_rate": 5.399678257985263e-08, + "loss": 0.79781473, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09521484, + "step": 15436, + "time_per_iteration": 2.4609224796295166 + }, + { + "auxiliary_loss_clip": 0.06404845, + "auxiliary_loss_mlp": 0.01266496, + "balance_loss_clip": 0.0627347, + "balance_loss_mlp": 0.01257287, + "epoch": 0.9281226514354427, + "flos": 24791732208000.0, + "grad_norm": 1.898604382401611, + "language_loss": 0.67076588, + "learning_rate": 5.390693237078925e-08, + "loss": 0.74747938, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09204102, + "step": 15437, + "time_per_iteration": 2.574120044708252 + }, + { + "auxiliary_loss_clip": 0.06404506, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254728, + "epoch": 0.9281827746881106, + "flos": 15088077624960.0, + "grad_norm": 1.8931845608351296, + "language_loss": 0.71641231, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.7931124, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10778809, + "step": 15438, + "time_per_iteration": 2.5342071056365967 + }, + { + "auxiliary_loss_clip": 0.06401535, + "auxiliary_loss_mlp": 0.01263761, + "balance_loss_clip": 0.06268774, + "balance_loss_mlp": 0.01255089, + "epoch": 0.9282428979407786, + "flos": 24142101102720.0, + "grad_norm": 1.7631471978480706, + "language_loss": 0.64994079, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.72659373, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08679199, + "step": 15439, + "time_per_iteration": 2.5256354808807373 + }, + { + "auxiliary_loss_clip": 0.0639973, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01255349, + "epoch": 0.9283030211934465, + "flos": 24829523199360.0, + "grad_norm": 1.6821997919344165, + "language_loss": 0.70312607, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7797637, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0869751, + "step": 15440, + "time_per_iteration": 2.5232927799224854 + }, + { + "auxiliary_loss_clip": 0.06404891, + "auxiliary_loss_mlp": 0.01265965, + "balance_loss_clip": 0.06268373, + "balance_loss_mlp": 0.01255523, + "epoch": 0.9283631444461146, + "flos": 23986702506240.0, + "grad_norm": 1.5413519977968317, + "language_loss": 0.77124566, + "learning_rate": 5.354826952900682e-08, + "loss": 0.84795421, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10430908, + "step": 15441, + "time_per_iteration": 2.516756772994995 + }, + { + "auxiliary_loss_clip": 0.06398309, + "auxiliary_loss_mlp": 0.01262603, + "balance_loss_clip": 0.06272468, + "balance_loss_mlp": 0.01254735, + "epoch": 0.9284232676987825, + "flos": 22791253904640.0, + "grad_norm": 1.5445819988173333, + "language_loss": 0.64162666, + "learning_rate": 5.345878833417949e-08, + "loss": 0.71823585, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.07873535, + "step": 15442, + "time_per_iteration": 2.505448341369629 + }, + { + "auxiliary_loss_clip": 0.06404665, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01258314, + "epoch": 0.9284833909514505, + "flos": 19506621853440.0, + "grad_norm": 1.7431674890191913, + "language_loss": 0.80909652, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8858242, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09790039, + "step": 15443, + "time_per_iteration": 2.562551498413086 + }, + { + "auxiliary_loss_clip": 0.06403337, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01255629, + "epoch": 0.9285435142041184, + "flos": 23192783470080.0, + "grad_norm": 1.9512114579199797, + "language_loss": 0.65079677, + "learning_rate": 5.328004738702896e-08, + "loss": 0.72747898, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0925293, + "step": 15444, + "time_per_iteration": 2.5125370025634766 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01263334, + "balance_loss_clip": 0.06270593, + "balance_loss_mlp": 0.0125425, + "epoch": 0.9286036374567864, + "flos": 17681220656640.0, + "grad_norm": 2.010684849546823, + "language_loss": 0.73854786, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.81520319, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09082031, + "step": 15445, + "time_per_iteration": 2.5049667358398438 + }, + { + "auxiliary_loss_clip": 0.064026, + "auxiliary_loss_mlp": 0.01262565, + "balance_loss_clip": 0.06271416, + "balance_loss_mlp": 0.01253165, + "epoch": 0.9286637607094543, + "flos": 20892995982720.0, + "grad_norm": 1.5998111247681204, + "language_loss": 0.71395653, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.79060817, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09399414, + "step": 15446, + "time_per_iteration": 2.502922296524048 + }, + { + "auxiliary_loss_clip": 0.06408063, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06270014, + "balance_loss_mlp": 0.01252642, + "epoch": 0.9287238839621224, + "flos": 19032025927680.0, + "grad_norm": 1.8680884802805782, + "language_loss": 0.69709033, + "learning_rate": 5.301248962337523e-08, + "loss": 0.77379727, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09979248, + "step": 15447, + "time_per_iteration": 2.498037815093994 + }, + { + "auxiliary_loss_clip": 0.06395551, + "auxiliary_loss_mlp": 0.0126141, + "balance_loss_clip": 0.06271149, + "balance_loss_mlp": 0.01252809, + "epoch": 0.9287840072147904, + "flos": 20563065331200.0, + "grad_norm": 1.463542829558656, + "language_loss": 0.72163129, + "learning_rate": 5.292345135757403e-08, + "loss": 0.79820085, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08605957, + "step": 15448, + "time_per_iteration": 2.5169200897216797 + }, + { + "auxiliary_loss_clip": 0.06399667, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06270666, + "balance_loss_mlp": 0.01253128, + "epoch": 0.9288441304674583, + "flos": 21257069973120.0, + "grad_norm": 1.631031069367745, + "language_loss": 0.74867898, + "learning_rate": 5.283448692511072e-08, + "loss": 0.82530475, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09790039, + "step": 15449, + "time_per_iteration": 2.5181782245635986 + }, + { + "auxiliary_loss_clip": 0.06401692, + "auxiliary_loss_mlp": 0.01260945, + "balance_loss_clip": 0.06271457, + "balance_loss_mlp": 0.01251426, + "epoch": 0.9289042537201263, + "flos": 27676763337600.0, + "grad_norm": 2.2115875222336716, + "language_loss": 0.67882347, + "learning_rate": 5.27455963293586e-08, + "loss": 0.75544983, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09503174, + "step": 15450, + "time_per_iteration": 2.588937759399414 + }, + { + "auxiliary_loss_clip": 0.06401034, + "auxiliary_loss_mlp": 0.01262114, + "balance_loss_clip": 0.06269682, + "balance_loss_mlp": 0.01253311, + "epoch": 0.9289643769727942, + "flos": 19323788244480.0, + "grad_norm": 1.901357650419004, + "language_loss": 0.71771216, + "learning_rate": 5.265677957368875e-08, + "loss": 0.79434371, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08795166, + "step": 15451, + "time_per_iteration": 2.5311567783355713 + }, + { + "auxiliary_loss_clip": 0.06402208, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.01253527, + "epoch": 0.9290245002254622, + "flos": 14062255614720.0, + "grad_norm": 2.063265286417505, + "language_loss": 0.73937112, + "learning_rate": 5.25680366614687e-08, + "loss": 0.8160193, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09075928, + "step": 15452, + "time_per_iteration": 2.533107042312622 + }, + { + "auxiliary_loss_clip": 0.06399271, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06270489, + "balance_loss_mlp": 0.01255851, + "epoch": 0.9290846234781301, + "flos": 20053235963520.0, + "grad_norm": 2.3098184994717785, + "language_loss": 0.74543643, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.82207942, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09173584, + "step": 15453, + "time_per_iteration": 2.505582094192505 + }, + { + "auxiliary_loss_clip": 0.06307368, + "auxiliary_loss_mlp": 0.01250217, + "balance_loss_clip": 0.062534, + "balance_loss_mlp": 0.0124918, + "epoch": 0.9291447467307982, + "flos": 61244592629760.0, + "grad_norm": 0.8123240258072839, + "language_loss": 0.60719591, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.6827718, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01038361, + "step": 15454, + "time_per_iteration": 3.0330328941345215 + }, + { + "auxiliary_loss_clip": 0.06403492, + "auxiliary_loss_mlp": 0.01267869, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.0125835, + "epoch": 0.9292048699834661, + "flos": 20558746846080.0, + "grad_norm": 1.4616904844748926, + "language_loss": 0.69075823, + "learning_rate": 5.230225101914709e-08, + "loss": 0.76747185, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09521484, + "step": 15455, + "time_per_iteration": 3.9310483932495117 + }, + { + "auxiliary_loss_clip": 0.06399804, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 0.06269494, + "balance_loss_mlp": 0.01253366, + "epoch": 0.9292649932361341, + "flos": 23630510799360.0, + "grad_norm": 1.5254212820753648, + "language_loss": 0.65071934, + "learning_rate": 5.22138035143509e-08, + "loss": 0.72733819, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08709717, + "step": 15456, + "time_per_iteration": 2.5281927585601807 + }, + { + "auxiliary_loss_clip": 0.06399552, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06271125, + "balance_loss_mlp": 0.01257141, + "epoch": 0.929325116488802, + "flos": 15014843556480.0, + "grad_norm": 1.6452448643687836, + "language_loss": 0.68623769, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.76290047, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.0958252, + "step": 15457, + "time_per_iteration": 2.4656875133514404 + }, + { + "auxiliary_loss_clip": 0.0640226, + "auxiliary_loss_mlp": 0.01262411, + "balance_loss_clip": 0.06269163, + "balance_loss_mlp": 0.01252749, + "epoch": 0.92938523974147, + "flos": 17973108754560.0, + "grad_norm": 2.0948470161883717, + "language_loss": 0.81135142, + "learning_rate": 5.203713008885291e-08, + "loss": 0.8879981, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09655762, + "step": 15458, + "time_per_iteration": 2.483344316482544 + }, + { + "auxiliary_loss_clip": 0.06399539, + "auxiliary_loss_mlp": 0.01264005, + "balance_loss_clip": 0.06268502, + "balance_loss_mlp": 0.01254754, + "epoch": 0.9294453629941379, + "flos": 23009740225920.0, + "grad_norm": 1.5981022484787952, + "language_loss": 0.72647446, + "learning_rate": 5.194890417485065e-08, + "loss": 0.80310988, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0925293, + "step": 15459, + "time_per_iteration": 2.5095856189727783 + }, + { + "auxiliary_loss_clip": 0.06403077, + "auxiliary_loss_mlp": 0.01264372, + "balance_loss_clip": 0.0627103, + "balance_loss_mlp": 0.01255223, + "epoch": 0.929505486246806, + "flos": 17060827426560.0, + "grad_norm": 2.205290237596035, + "language_loss": 0.59509528, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.67176986, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09143066, + "step": 15460, + "time_per_iteration": 2.4897260665893555 + }, + { + "auxiliary_loss_clip": 0.06407061, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06273078, + "balance_loss_mlp": 0.01257909, + "epoch": 0.9295656094994739, + "flos": 27347084248320.0, + "grad_norm": 1.858696453479836, + "language_loss": 0.81050324, + "learning_rate": 5.177267396106733e-08, + "loss": 0.88724756, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09466553, + "step": 15461, + "time_per_iteration": 2.5442938804626465 + }, + { + "auxiliary_loss_clip": 0.06401002, + "auxiliary_loss_mlp": 0.01264519, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01255275, + "epoch": 0.9296257327521419, + "flos": 21477443011200.0, + "grad_norm": 1.6125510363493594, + "language_loss": 0.78114223, + "learning_rate": 5.168466966796869e-08, + "loss": 0.85779738, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09246826, + "step": 15462, + "time_per_iteration": 2.5683822631835938 + }, + { + "auxiliary_loss_clip": 0.06399049, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06268325, + "balance_loss_mlp": 0.01254207, + "epoch": 0.9296858560048099, + "flos": 16368248304000.0, + "grad_norm": 1.8573692546143064, + "language_loss": 0.63046449, + "learning_rate": 5.159673925518282e-08, + "loss": 0.70708477, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08764648, + "step": 15463, + "time_per_iteration": 2.4773969650268555 + }, + { + "auxiliary_loss_clip": 0.06398252, + "auxiliary_loss_mlp": 0.01262228, + "balance_loss_clip": 0.06268728, + "balance_loss_mlp": 0.01253466, + "epoch": 0.9297459792574778, + "flos": 29865819254400.0, + "grad_norm": 1.4275812835029746, + "language_loss": 0.71507215, + "learning_rate": 5.15088827260437e-08, + "loss": 0.79167688, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08758545, + "step": 15464, + "time_per_iteration": 3.9611384868621826 + }, + { + "auxiliary_loss_clip": 0.06404192, + "auxiliary_loss_mlp": 0.01259513, + "balance_loss_clip": 0.06270679, + "balance_loss_mlp": 0.01250835, + "epoch": 0.9298061025101458, + "flos": 15930353266560.0, + "grad_norm": 1.866301443113407, + "language_loss": 0.78163409, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.85827112, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08679199, + "step": 15465, + "time_per_iteration": 2.5284931659698486 + }, + { + "auxiliary_loss_clip": 0.0631011, + "auxiliary_loss_mlp": 0.01253376, + "balance_loss_clip": 0.06255974, + "balance_loss_mlp": 0.0125237, + "epoch": 0.9298662257628137, + "flos": 64118498365440.0, + "grad_norm": 0.6980012483793121, + "language_loss": 0.56405276, + "learning_rate": 5.133339133202952e-08, + "loss": 0.6396876, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01006317, + "step": 15466, + "time_per_iteration": 3.244619846343994 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.0126834, + "balance_loss_clip": 0.06270371, + "balance_loss_mlp": 0.01258588, + "epoch": 0.9299263490154818, + "flos": 24287143720320.0, + "grad_norm": 1.3940934660028805, + "language_loss": 0.73205161, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.80877304, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09747314, + "step": 15467, + "time_per_iteration": 2.5676679611206055 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06271035, + "balance_loss_mlp": 0.01256458, + "epoch": 0.9299864722681497, + "flos": 23300999418240.0, + "grad_norm": 1.6752251187046447, + "language_loss": 0.72396517, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.80065751, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09240723, + "step": 15468, + "time_per_iteration": 2.59311580657959 + }, + { + "auxiliary_loss_clip": 0.06405564, + "auxiliary_loss_mlp": 0.01262883, + "balance_loss_clip": 0.0627134, + "balance_loss_mlp": 0.01253424, + "epoch": 0.9300465955208177, + "flos": 21402112590720.0, + "grad_norm": 1.6073000412647687, + "language_loss": 0.75552547, + "learning_rate": 5.107070845155737e-08, + "loss": 0.83220994, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09466553, + "step": 15469, + "time_per_iteration": 2.5530714988708496 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.01257629, + "epoch": 0.9301067187734856, + "flos": 24578319058560.0, + "grad_norm": 1.7588900587413723, + "language_loss": 0.76161134, + "learning_rate": 5.098329529416379e-08, + "loss": 0.838305, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09545898, + "step": 15470, + "time_per_iteration": 4.046792984008789 + }, + { + "auxiliary_loss_clip": 0.063991, + "auxiliary_loss_mlp": 0.01265604, + "balance_loss_clip": 0.06269463, + "balance_loss_mlp": 0.01256431, + "epoch": 0.9301668420261536, + "flos": 22202949588480.0, + "grad_norm": 1.50853778846898, + "language_loss": 0.74989831, + "learning_rate": 5.089595604367902e-08, + "loss": 0.82654536, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09179688, + "step": 15471, + "time_per_iteration": 2.523951530456543 + }, + { + "auxiliary_loss_clip": 0.06401512, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.06271497, + "balance_loss_mlp": 0.01255661, + "epoch": 0.9302269652788215, + "flos": 17753196913920.0, + "grad_norm": 2.3784631998670203, + "language_loss": 0.69654554, + "learning_rate": 5.080869070341487e-08, + "loss": 0.77320957, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09234619, + "step": 15472, + "time_per_iteration": 2.523432493209839 + }, + { + "auxiliary_loss_clip": 0.06395452, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.06270222, + "balance_loss_mlp": 0.01254281, + "epoch": 0.9302870885314896, + "flos": 19396854604800.0, + "grad_norm": 1.6143670274863005, + "language_loss": 0.88837874, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.96496475, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08880615, + "step": 15473, + "time_per_iteration": 4.012357473373413 + }, + { + "auxiliary_loss_clip": 0.06405994, + "auxiliary_loss_mlp": 0.01265627, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01255274, + "epoch": 0.9303472117841575, + "flos": 21766396216320.0, + "grad_norm": 2.419925900963914, + "language_loss": 0.64569032, + "learning_rate": 5.063438176678203e-08, + "loss": 0.72240651, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10357666, + "step": 15474, + "time_per_iteration": 2.5024755001068115 + }, + { + "auxiliary_loss_clip": 0.06400555, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01254409, + "epoch": 0.9304073350368255, + "flos": 19615844050560.0, + "grad_norm": 1.7539760136561613, + "language_loss": 0.74913669, + "learning_rate": 5.054733817702339e-08, + "loss": 0.82578397, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09765625, + "step": 15475, + "time_per_iteration": 2.4802138805389404 + }, + { + "auxiliary_loss_clip": 0.06402318, + "auxiliary_loss_mlp": 0.01267821, + "balance_loss_clip": 0.06271594, + "balance_loss_mlp": 0.01258761, + "epoch": 0.9304674582894935, + "flos": 30448756909440.0, + "grad_norm": 2.5253856676415296, + "language_loss": 0.67179549, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.74849689, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09057617, + "step": 15476, + "time_per_iteration": 2.5887856483459473 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06271078, + "balance_loss_mlp": 0.01256782, + "epoch": 0.9305275815421614, + "flos": 17791532956800.0, + "grad_norm": 2.2931059467330814, + "language_loss": 0.69080395, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.76749009, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09204102, + "step": 15477, + "time_per_iteration": 2.46964430809021 + }, + { + "auxiliary_loss_clip": 0.06398468, + "auxiliary_loss_mlp": 0.01266148, + "balance_loss_clip": 0.06270145, + "balance_loss_mlp": 0.01257142, + "epoch": 0.9305877047948294, + "flos": 25304999592960.0, + "grad_norm": 2.133764472350911, + "language_loss": 0.58989286, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.66653895, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09008789, + "step": 15478, + "time_per_iteration": 2.5351498126983643 + }, + { + "auxiliary_loss_clip": 0.06409034, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06270212, + "balance_loss_mlp": 0.01254585, + "epoch": 0.9306478280474973, + "flos": 16981975134720.0, + "grad_norm": 1.7786919360630835, + "language_loss": 0.79033351, + "learning_rate": 5.01999030853566e-08, + "loss": 0.86707628, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10662842, + "step": 15479, + "time_per_iteration": 2.4584336280822754 + }, + { + "auxiliary_loss_clip": 0.06400747, + "auxiliary_loss_mlp": 0.0126376, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254379, + "epoch": 0.9307079513001654, + "flos": 35672121204480.0, + "grad_norm": 1.6572796741868023, + "language_loss": 0.68828124, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.76492631, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09393311, + "step": 15480, + "time_per_iteration": 2.6261415481567383 + }, + { + "auxiliary_loss_clip": 0.06401486, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01254984, + "epoch": 0.9307680745528333, + "flos": 19214146776960.0, + "grad_norm": 1.7175902100711526, + "language_loss": 0.68017375, + "learning_rate": 5.002662914604583e-08, + "loss": 0.75683153, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09307861, + "step": 15481, + "time_per_iteration": 2.481839179992676 + }, + { + "auxiliary_loss_clip": 0.06399475, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.06270431, + "balance_loss_mlp": 0.01255145, + "epoch": 0.9308281978055013, + "flos": 19068684888960.0, + "grad_norm": 1.7362782888725026, + "language_loss": 0.74914646, + "learning_rate": 4.994010308952701e-08, + "loss": 0.82578027, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08746338, + "step": 15482, + "time_per_iteration": 2.521629810333252 + }, + { + "auxiliary_loss_clip": 0.06396139, + "auxiliary_loss_mlp": 0.01263596, + "balance_loss_clip": 0.06269595, + "balance_loss_mlp": 0.01254542, + "epoch": 0.9308883210581692, + "flos": 20527748035200.0, + "grad_norm": 1.865123226027677, + "language_loss": 0.80490708, + "learning_rate": 4.985365097947469e-08, + "loss": 0.88150442, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.0904541, + "step": 15483, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06400363, + "auxiliary_loss_mlp": 0.01264643, + "balance_loss_clip": 0.06269716, + "balance_loss_mlp": 0.01255118, + "epoch": 0.9309484443108372, + "flos": 13005686355840.0, + "grad_norm": 1.8891510591308605, + "language_loss": 0.74612212, + "learning_rate": 4.976727281916782e-08, + "loss": 0.82277215, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09527588, + "step": 15484, + "time_per_iteration": 2.5859484672546387 + }, + { + "auxiliary_loss_clip": 0.06404746, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.01255776, + "epoch": 0.9310085675635051, + "flos": 12572654855040.0, + "grad_norm": 2.023027681276139, + "language_loss": 0.76634532, + "learning_rate": 4.968096861188087e-08, + "loss": 0.84304231, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09173584, + "step": 15485, + "time_per_iteration": 2.5508246421813965 + }, + { + "auxiliary_loss_clip": 0.0640571, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06270456, + "balance_loss_mlp": 0.01256862, + "epoch": 0.9310686908161732, + "flos": 23484378078720.0, + "grad_norm": 1.7812037755211436, + "language_loss": 0.78332233, + "learning_rate": 4.959473836088723e-08, + "loss": 0.86004531, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09729004, + "step": 15486, + "time_per_iteration": 2.535637140274048 + }, + { + "auxiliary_loss_clip": 0.06408517, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06274606, + "balance_loss_mlp": 0.01256144, + "epoch": 0.9311288140688411, + "flos": 24177124909440.0, + "grad_norm": 2.198527808951168, + "language_loss": 0.77455759, + "learning_rate": 4.950858206945674e-08, + "loss": 0.85130453, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10021973, + "step": 15487, + "time_per_iteration": 2.5223898887634277 + }, + { + "auxiliary_loss_clip": 0.06398556, + "auxiliary_loss_mlp": 0.01260459, + "balance_loss_clip": 0.06268291, + "balance_loss_mlp": 0.01251006, + "epoch": 0.9311889373215091, + "flos": 35598929063040.0, + "grad_norm": 1.8567185005188602, + "language_loss": 0.67377645, + "learning_rate": 4.942249974085633e-08, + "loss": 0.75036657, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09460449, + "step": 15488, + "time_per_iteration": 2.619208335876465 + }, + { + "auxiliary_loss_clip": 0.06397253, + "auxiliary_loss_mlp": 0.01265084, + "balance_loss_clip": 0.06270263, + "balance_loss_mlp": 0.01256089, + "epoch": 0.9312490605741771, + "flos": 20236824259200.0, + "grad_norm": 1.7224807859602875, + "language_loss": 0.75432515, + "learning_rate": 4.933649137834983e-08, + "loss": 0.83094847, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08996582, + "step": 15489, + "time_per_iteration": 2.5090341567993164 + }, + { + "auxiliary_loss_clip": 0.06405045, + "auxiliary_loss_mlp": 0.01263019, + "balance_loss_clip": 0.0627015, + "balance_loss_mlp": 0.01253292, + "epoch": 0.931309183826845, + "flos": 13955087842560.0, + "grad_norm": 2.0628027282737396, + "language_loss": 0.80944282, + "learning_rate": 4.925055698519931e-08, + "loss": 0.88612348, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.097229, + "step": 15490, + "time_per_iteration": 2.4866514205932617 + }, + { + "auxiliary_loss_clip": 0.06403734, + "auxiliary_loss_mlp": 0.01266039, + "balance_loss_clip": 0.06270062, + "balance_loss_mlp": 0.01255554, + "epoch": 0.931369307079513, + "flos": 20162877431040.0, + "grad_norm": 1.8170541366291355, + "language_loss": 0.72400761, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.80070531, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10479736, + "step": 15491, + "time_per_iteration": 2.538468837738037 + }, + { + "auxiliary_loss_clip": 0.06393816, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 0.06267494, + "balance_loss_mlp": 0.0125638, + "epoch": 0.931429430332181, + "flos": 25345725477120.0, + "grad_norm": 1.8003153236272884, + "language_loss": 0.74667656, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.82326305, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08459473, + "step": 15492, + "time_per_iteration": 2.6507134437561035 + }, + { + "auxiliary_loss_clip": 0.06310092, + "auxiliary_loss_mlp": 0.01250657, + "balance_loss_clip": 0.06255943, + "balance_loss_mlp": 0.01249686, + "epoch": 0.931489553584849, + "flos": 71245208482560.0, + "grad_norm": 0.6897832124619488, + "language_loss": 0.53372693, + "learning_rate": 4.899319765445442e-08, + "loss": 0.60933441, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00970459, + "step": 15493, + "time_per_iteration": 3.021958112716675 + }, + { + "auxiliary_loss_clip": 0.06401284, + "auxiliary_loss_mlp": 0.01264813, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01256242, + "epoch": 0.9315496768375169, + "flos": 14648253943680.0, + "grad_norm": 1.768280806379928, + "language_loss": 0.70375299, + "learning_rate": 4.890755917128531e-08, + "loss": 0.78041399, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08575439, + "step": 15494, + "time_per_iteration": 2.4740707874298096 + }, + { + "auxiliary_loss_clip": 0.06405485, + "auxiliary_loss_mlp": 0.01265527, + "balance_loss_clip": 0.06271463, + "balance_loss_mlp": 0.01255812, + "epoch": 0.9316098000901849, + "flos": 28337505108480.0, + "grad_norm": 1.5961909410807655, + "language_loss": 0.68592763, + "learning_rate": 4.882199467373671e-08, + "loss": 0.76263779, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09716797, + "step": 15495, + "time_per_iteration": 4.0202531814575195 + }, + { + "auxiliary_loss_clip": 0.06397967, + "auxiliary_loss_mlp": 0.01263218, + "balance_loss_clip": 0.06270482, + "balance_loss_mlp": 0.01254111, + "epoch": 0.9316699233428528, + "flos": 28520338717440.0, + "grad_norm": 1.810348188530725, + "language_loss": 0.62453389, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.70114577, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09106445, + "step": 15496, + "time_per_iteration": 2.5821802616119385 + }, + { + "auxiliary_loss_clip": 0.06402013, + "auxiliary_loss_mlp": 0.01265862, + "balance_loss_clip": 0.06270453, + "balance_loss_mlp": 0.01256599, + "epoch": 0.9317300465955208, + "flos": 33701887025280.0, + "grad_norm": 1.570853840724038, + "language_loss": 0.76926303, + "learning_rate": 4.865108764847825e-08, + "loss": 0.84594178, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0925293, + "step": 15497, + "time_per_iteration": 2.6000030040740967 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01266459, + "balance_loss_clip": 0.06270823, + "balance_loss_mlp": 0.0125576, + "epoch": 0.9317901698481887, + "flos": 23664779919360.0, + "grad_norm": 1.6175776581744283, + "language_loss": 0.662678, + "learning_rate": 4.856574512724898e-08, + "loss": 0.73940897, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10693359, + "step": 15498, + "time_per_iteration": 2.5351293087005615 + }, + { + "auxiliary_loss_clip": 0.06401354, + "auxiliary_loss_mlp": 0.01266939, + "balance_loss_clip": 0.06269923, + "balance_loss_mlp": 0.01256401, + "epoch": 0.9318502931008568, + "flos": 20966397759360.0, + "grad_norm": 1.5626366594075778, + "language_loss": 0.79703665, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.87371957, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10534668, + "step": 15499, + "time_per_iteration": 2.5085513591766357 + }, + { + "auxiliary_loss_clip": 0.0639477, + "auxiliary_loss_mlp": 0.01268461, + "balance_loss_clip": 0.06268457, + "balance_loss_mlp": 0.01258978, + "epoch": 0.9319104163535247, + "flos": 23447844898560.0, + "grad_norm": 1.6015703430685497, + "language_loss": 0.76808083, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.84471321, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09484863, + "step": 15500, + "time_per_iteration": 2.538321018218994 + }, + { + "auxiliary_loss_clip": 0.06396381, + "auxiliary_loss_mlp": 0.01262529, + "balance_loss_clip": 0.06267996, + "balance_loss_mlp": 0.01253427, + "epoch": 0.9319705396061927, + "flos": 22354197408000.0, + "grad_norm": 1.7160437702231266, + "language_loss": 0.72390819, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.80049717, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09106445, + "step": 15501, + "time_per_iteration": 2.506465196609497 + }, + { + "auxiliary_loss_clip": 0.06406382, + "auxiliary_loss_mlp": 0.01262539, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.0125327, + "epoch": 0.9320306628588607, + "flos": 20999450995200.0, + "grad_norm": 1.6621036286836153, + "language_loss": 0.6654309, + "learning_rate": 4.822511506047666e-08, + "loss": 0.74212009, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.0927124, + "step": 15502, + "time_per_iteration": 2.4847748279571533 + }, + { + "auxiliary_loss_clip": 0.06403543, + "auxiliary_loss_mlp": 0.01263989, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01255096, + "epoch": 0.9320907861115286, + "flos": 24545727020160.0, + "grad_norm": 1.48735457149782, + "language_loss": 0.65586728, + "learning_rate": 4.814014256446586e-08, + "loss": 0.73254263, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08892822, + "step": 15503, + "time_per_iteration": 4.052160024642944 + }, + { + "auxiliary_loss_clip": 0.06403034, + "auxiliary_loss_mlp": 0.01265098, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01254745, + "epoch": 0.9321509093641966, + "flos": 19790418032640.0, + "grad_norm": 1.480948638802982, + "language_loss": 0.75340253, + "learning_rate": 4.805524408317652e-08, + "loss": 0.83008385, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10345459, + "step": 15504, + "time_per_iteration": 2.5183193683624268 + }, + { + "auxiliary_loss_clip": 0.06404573, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.01255645, + "epoch": 0.9322110326168646, + "flos": 24979597061760.0, + "grad_norm": 2.1014126245091735, + "language_loss": 0.71645415, + "learning_rate": 4.797041961982762e-08, + "loss": 0.79315424, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09790039, + "step": 15505, + "time_per_iteration": 2.5211434364318848 + }, + { + "auxiliary_loss_clip": 0.06400719, + "auxiliary_loss_mlp": 0.01261551, + "balance_loss_clip": 0.06268628, + "balance_loss_mlp": 0.01252402, + "epoch": 0.9322711558695326, + "flos": 16149175004160.0, + "grad_norm": 1.7427121022281884, + "language_loss": 0.75388575, + "learning_rate": 4.788566917763614e-08, + "loss": 0.83050847, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09143066, + "step": 15506, + "time_per_iteration": 2.4648678302764893 + }, + { + "auxiliary_loss_clip": 0.06394555, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06267924, + "balance_loss_mlp": 0.0125484, + "epoch": 0.9323312791222005, + "flos": 23739187944960.0, + "grad_norm": 1.7165726591251698, + "language_loss": 0.83231521, + "learning_rate": 4.780099275981597e-08, + "loss": 0.90890092, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09173584, + "step": 15507, + "time_per_iteration": 2.5396206378936768 + }, + { + "auxiliary_loss_clip": 0.06407491, + "auxiliary_loss_mlp": 0.01263332, + "balance_loss_clip": 0.06273882, + "balance_loss_mlp": 0.01253896, + "epoch": 0.9323914023748685, + "flos": 20784318837120.0, + "grad_norm": 1.6429448873571484, + "language_loss": 0.67592001, + "learning_rate": 4.771639036957742e-08, + "loss": 0.75262833, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09436035, + "step": 15508, + "time_per_iteration": 2.501565933227539 + }, + { + "auxiliary_loss_clip": 0.06400056, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06271896, + "balance_loss_mlp": 0.0125709, + "epoch": 0.9324515256275364, + "flos": 23922021553920.0, + "grad_norm": 1.6443982436727373, + "language_loss": 0.72509021, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.80175108, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0894165, + "step": 15509, + "time_per_iteration": 3.96291184425354 + }, + { + "auxiliary_loss_clip": 0.06400399, + "auxiliary_loss_mlp": 0.01263532, + "balance_loss_clip": 0.06270161, + "balance_loss_mlp": 0.01254091, + "epoch": 0.9325116488802044, + "flos": 18011193235200.0, + "grad_norm": 1.8032935257192066, + "language_loss": 0.74504322, + "learning_rate": 4.754740768467624e-08, + "loss": 0.82168245, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09442139, + "step": 15510, + "time_per_iteration": 2.4776346683502197 + }, + { + "auxiliary_loss_clip": 0.06406374, + "auxiliary_loss_mlp": 0.01261789, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01252175, + "epoch": 0.9325717721328723, + "flos": 29029036055040.0, + "grad_norm": 1.6506300537711536, + "language_loss": 0.70206726, + "learning_rate": 4.746302739642161e-08, + "loss": 0.77874887, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09619141, + "step": 15511, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.06401817, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01257262, + "epoch": 0.9326318953855404, + "flos": 21651681576960.0, + "grad_norm": 1.9988751237601965, + "language_loss": 0.78188848, + "learning_rate": 4.737872114856412e-08, + "loss": 0.85857439, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09503174, + "step": 15512, + "time_per_iteration": 2.494394540786743 + }, + { + "auxiliary_loss_clip": 0.06400086, + "auxiliary_loss_mlp": 0.01261219, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01251474, + "epoch": 0.9326920186382083, + "flos": 26072573719680.0, + "grad_norm": 1.5043776839825136, + "language_loss": 0.80977184, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.8863849, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09741211, + "step": 15513, + "time_per_iteration": 4.013933181762695 + }, + { + "auxiliary_loss_clip": 0.06411409, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.0627336, + "balance_loss_mlp": 0.01256966, + "epoch": 0.9327521418908763, + "flos": 12061945019520.0, + "grad_norm": 1.9976650496804842, + "language_loss": 0.80668688, + "learning_rate": 4.721033078682768e-08, + "loss": 0.88347912, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10858154, + "step": 15514, + "time_per_iteration": 2.4943747520446777 + }, + { + "auxiliary_loss_clip": 0.06396277, + "auxiliary_loss_mlp": 0.01265518, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01256565, + "epoch": 0.9328122651435443, + "flos": 43844233259520.0, + "grad_norm": 1.850634533570311, + "language_loss": 0.71329403, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.78991199, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08953857, + "step": 15515, + "time_per_iteration": 2.72308611869812 + }, + { + "auxiliary_loss_clip": 0.0640793, + "auxiliary_loss_mlp": 0.01262767, + "balance_loss_clip": 0.06270353, + "balance_loss_mlp": 0.01252647, + "epoch": 0.9328723883962122, + "flos": 15200318568960.0, + "grad_norm": 2.4044334079280882, + "language_loss": 0.81314027, + "learning_rate": 4.704223662500806e-08, + "loss": 0.88984722, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10125732, + "step": 15516, + "time_per_iteration": 2.5302047729492188 + }, + { + "auxiliary_loss_clip": 0.06406114, + "auxiliary_loss_mlp": 0.01265832, + "balance_loss_clip": 0.06271726, + "balance_loss_mlp": 0.01255943, + "epoch": 0.9329325116488802, + "flos": 20267194164480.0, + "grad_norm": 1.6559287001330782, + "language_loss": 0.80651397, + "learning_rate": 4.695830062703643e-08, + "loss": 0.88323343, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09893799, + "step": 15517, + "time_per_iteration": 2.5221047401428223 + }, + { + "auxiliary_loss_clip": 0.06403969, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06271014, + "balance_loss_mlp": 0.01256051, + "epoch": 0.9329926349015482, + "flos": 13119981724800.0, + "grad_norm": 2.3377447085938563, + "language_loss": 0.74920237, + "learning_rate": 4.687443868860219e-08, + "loss": 0.82590151, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09899902, + "step": 15518, + "time_per_iteration": 2.527200222015381 + }, + { + "auxiliary_loss_clip": 0.06399631, + "auxiliary_loss_mlp": 0.0126335, + "balance_loss_clip": 0.06269, + "balance_loss_mlp": 0.01254559, + "epoch": 0.9330527581542162, + "flos": 23047070019840.0, + "grad_norm": 1.9434422747125724, + "language_loss": 0.75886834, + "learning_rate": 4.679065081288458e-08, + "loss": 0.83549809, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08789062, + "step": 15519, + "time_per_iteration": 2.49600887298584 + }, + { + "auxiliary_loss_clip": 0.06401511, + "auxiliary_loss_mlp": 0.01266494, + "balance_loss_clip": 0.06272543, + "balance_loss_mlp": 0.01256451, + "epoch": 0.9331128814068841, + "flos": 15565021464960.0, + "grad_norm": 1.9352369672878387, + "language_loss": 0.83285367, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.90953374, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.10040283, + "step": 15520, + "time_per_iteration": 2.496610641479492 + }, + { + "auxiliary_loss_clip": 0.06397337, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06269252, + "balance_loss_mlp": 0.01258064, + "epoch": 0.9331730046595521, + "flos": 22278070373760.0, + "grad_norm": 1.5332547398860534, + "language_loss": 0.76337314, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.84001684, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08959961, + "step": 15521, + "time_per_iteration": 2.493025779724121 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.0126359, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01254667, + "epoch": 0.93323312791222, + "flos": 15782920807680.0, + "grad_norm": 4.746978619733777, + "language_loss": 0.77783549, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.85446644, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08935547, + "step": 15522, + "time_per_iteration": 2.483830213546753 + }, + { + "auxiliary_loss_clip": 0.06401372, + "auxiliary_loss_mlp": 0.01263805, + "balance_loss_clip": 0.06269466, + "balance_loss_mlp": 0.01253565, + "epoch": 0.933293251164888, + "flos": 22016342545920.0, + "grad_norm": 1.8324231152169705, + "language_loss": 0.6271559, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.70380771, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10247803, + "step": 15523, + "time_per_iteration": 2.464359760284424 + }, + { + "auxiliary_loss_clip": 0.06401858, + "auxiliary_loss_mlp": 0.01268604, + "balance_loss_clip": 0.06272347, + "balance_loss_mlp": 0.01259008, + "epoch": 0.933353374417556, + "flos": 26038556161920.0, + "grad_norm": 1.6351277834664266, + "language_loss": 0.68286568, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7595703, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0960083, + "step": 15524, + "time_per_iteration": 2.5472025871276855 + }, + { + "auxiliary_loss_clip": 0.06402338, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01254788, + "epoch": 0.933413497670224, + "flos": 24907075752960.0, + "grad_norm": 1.6411454444510272, + "language_loss": 0.73814523, + "learning_rate": 4.628947905336589e-08, + "loss": 0.81481624, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09973145, + "step": 15525, + "time_per_iteration": 2.5322306156158447 + }, + { + "auxiliary_loss_clip": 0.06398012, + "auxiliary_loss_mlp": 0.01262306, + "balance_loss_clip": 0.06270038, + "balance_loss_mlp": 0.01253449, + "epoch": 0.9334736209228919, + "flos": 23694227429760.0, + "grad_norm": 2.041587925291887, + "language_loss": 0.84483254, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.92143565, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08862305, + "step": 15526, + "time_per_iteration": 2.519195556640625 + }, + { + "auxiliary_loss_clip": 0.06403422, + "auxiliary_loss_mlp": 0.01265587, + "balance_loss_clip": 0.06271212, + "balance_loss_mlp": 0.0125633, + "epoch": 0.9335337441755599, + "flos": 15382732907520.0, + "grad_norm": 1.8383298430053767, + "language_loss": 0.6973694, + "learning_rate": 4.61230144456366e-08, + "loss": 0.77405953, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0927124, + "step": 15527, + "time_per_iteration": 2.456176996231079 + }, + { + "auxiliary_loss_clip": 0.06408224, + "auxiliary_loss_mlp": 0.01262635, + "balance_loss_clip": 0.06273658, + "balance_loss_mlp": 0.01252043, + "epoch": 0.9335938674282279, + "flos": 16112180626560.0, + "grad_norm": 2.2241549741395574, + "language_loss": 0.65134645, + "learning_rate": 4.603989327701141e-08, + "loss": 0.728055, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10595703, + "step": 15528, + "time_per_iteration": 2.4924302101135254 + }, + { + "auxiliary_loss_clip": 0.06401557, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06268039, + "balance_loss_mlp": 0.01254357, + "epoch": 0.9336539906808958, + "flos": 18958875713280.0, + "grad_norm": 1.7399334221654377, + "language_loss": 0.74828267, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.82493973, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09790039, + "step": 15529, + "time_per_iteration": 2.482252597808838 + }, + { + "auxiliary_loss_clip": 0.06400265, + "auxiliary_loss_mlp": 0.01263909, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01255407, + "epoch": 0.9337141139335638, + "flos": 18114168303360.0, + "grad_norm": 1.7494873639650692, + "language_loss": 0.63001961, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.70666134, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08505249, + "step": 15530, + "time_per_iteration": 2.564744234085083 + }, + { + "auxiliary_loss_clip": 0.06398335, + "auxiliary_loss_mlp": 0.01263036, + "balance_loss_clip": 0.06270778, + "balance_loss_mlp": 0.01253749, + "epoch": 0.9337742371862318, + "flos": 17351122296960.0, + "grad_norm": 1.7194228505060978, + "language_loss": 0.73030329, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.80691695, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09283447, + "step": 15531, + "time_per_iteration": 2.522684097290039 + }, + { + "auxiliary_loss_clip": 0.06400237, + "auxiliary_loss_mlp": 0.01264634, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01254772, + "epoch": 0.9338343604388998, + "flos": 29066575484160.0, + "grad_norm": 1.6454676066397984, + "language_loss": 0.71094078, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.78758943, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09866333, + "step": 15532, + "time_per_iteration": 2.5740439891815186 + }, + { + "auxiliary_loss_clip": 0.0640241, + "auxiliary_loss_mlp": 0.01263663, + "balance_loss_clip": 0.06269048, + "balance_loss_mlp": 0.01254288, + "epoch": 0.9338944836915677, + "flos": 18666819907200.0, + "grad_norm": 1.5232167653668405, + "language_loss": 0.73042238, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.80708313, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 15533, + "time_per_iteration": 2.587557554244995 + }, + { + "auxiliary_loss_clip": 0.06396709, + "auxiliary_loss_mlp": 0.01263683, + "balance_loss_clip": 0.06268157, + "balance_loss_mlp": 0.01254963, + "epoch": 0.9339546069442357, + "flos": 16623309732480.0, + "grad_norm": 1.6925618891662986, + "language_loss": 0.79914582, + "learning_rate": 4.554272235700507e-08, + "loss": 0.87574971, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.0871582, + "step": 15534, + "time_per_iteration": 2.499203681945801 + }, + { + "auxiliary_loss_clip": 0.06394495, + "auxiliary_loss_mlp": 0.01265151, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.01256836, + "epoch": 0.9340147301969036, + "flos": 23699384455680.0, + "grad_norm": 1.5632032653776713, + "language_loss": 0.74868226, + "learning_rate": 4.546011991495513e-08, + "loss": 0.8252787, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.08319092, + "step": 15535, + "time_per_iteration": 3.9140188694000244 + }, + { + "auxiliary_loss_clip": 0.06405044, + "auxiliary_loss_mlp": 0.0126181, + "balance_loss_clip": 0.06272762, + "balance_loss_mlp": 0.01253042, + "epoch": 0.9340748534495716, + "flos": 28661440193280.0, + "grad_norm": 1.9268953260365462, + "language_loss": 0.78152293, + "learning_rate": 4.537759158925292e-08, + "loss": 0.85819149, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08770752, + "step": 15536, + "time_per_iteration": 2.5641329288482666 + }, + { + "auxiliary_loss_clip": 0.06401525, + "auxiliary_loss_mlp": 0.01264478, + "balance_loss_clip": 0.06270687, + "balance_loss_mlp": 0.01255269, + "epoch": 0.9341349767022396, + "flos": 24906530701440.0, + "grad_norm": 1.4301567901014753, + "language_loss": 0.80895746, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.88561743, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09210205, + "step": 15537, + "time_per_iteration": 2.5457959175109863 + }, + { + "auxiliary_loss_clip": 0.06404231, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06270302, + "balance_loss_mlp": 0.01254697, + "epoch": 0.9341950999549076, + "flos": 29067204389760.0, + "grad_norm": 1.8933325679633086, + "language_loss": 0.77954888, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.85623199, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09387207, + "step": 15538, + "time_per_iteration": 2.5645008087158203 + }, + { + "auxiliary_loss_clip": 0.06401729, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06271592, + "balance_loss_mlp": 0.01255911, + "epoch": 0.9342552232075755, + "flos": 23593893765120.0, + "grad_norm": 1.685361007162288, + "language_loss": 0.7330637, + "learning_rate": 4.513045134151672e-08, + "loss": 0.80973315, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09301758, + "step": 15539, + "time_per_iteration": 2.5273890495300293 + }, + { + "auxiliary_loss_clip": 0.06399798, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01256314, + "epoch": 0.9343153464602435, + "flos": 36730325617920.0, + "grad_norm": 1.4282033939406924, + "language_loss": 0.65054214, + "learning_rate": 4.504821951247373e-08, + "loss": 0.72718728, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08392334, + "step": 15540, + "time_per_iteration": 2.713907241821289 + }, + { + "auxiliary_loss_clip": 0.06400084, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125449, + "epoch": 0.9343754697129115, + "flos": 22243004640000.0, + "grad_norm": 1.9745672183993257, + "language_loss": 0.76623344, + "learning_rate": 4.496606181539864e-08, + "loss": 0.8428753, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09625244, + "step": 15541, + "time_per_iteration": 2.537337064743042 + }, + { + "auxiliary_loss_clip": 0.06400786, + "auxiliary_loss_mlp": 0.0126562, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01255827, + "epoch": 0.9344355929655794, + "flos": 29717128984320.0, + "grad_norm": 1.7902468110763983, + "language_loss": 0.675026, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.75169003, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09790039, + "step": 15542, + "time_per_iteration": 2.589301347732544 + }, + { + "auxiliary_loss_clip": 0.06398873, + "auxiliary_loss_mlp": 0.01264541, + "balance_loss_clip": 0.06269249, + "balance_loss_mlp": 0.01254998, + "epoch": 0.9344957162182475, + "flos": 18886438258560.0, + "grad_norm": 2.562374344000717, + "language_loss": 0.69583577, + "learning_rate": 4.480196882960907e-08, + "loss": 0.77246988, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09545898, + "step": 15543, + "time_per_iteration": 3.915339708328247 + }, + { + "auxiliary_loss_clip": 0.06405383, + "auxiliary_loss_mlp": 0.01263667, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01253564, + "epoch": 0.9345558394709154, + "flos": 27425181853440.0, + "grad_norm": 2.8974325946656303, + "language_loss": 0.70212889, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.77881944, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10101318, + "step": 15544, + "time_per_iteration": 2.552852153778076 + }, + { + "auxiliary_loss_clip": 0.06405076, + "auxiliary_loss_mlp": 0.01266145, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01256483, + "epoch": 0.9346159627235834, + "flos": 20747659875840.0, + "grad_norm": 1.548835129494503, + "language_loss": 0.77488774, + "learning_rate": 4.463817240903789e-08, + "loss": 0.85159993, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09661865, + "step": 15545, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.0640424, + "auxiliary_loss_mlp": 0.01264324, + "balance_loss_clip": 0.0626965, + "balance_loss_mlp": 0.01254752, + "epoch": 0.9346760859762513, + "flos": 21075578029440.0, + "grad_norm": 1.423420905987788, + "language_loss": 0.69126034, + "learning_rate": 4.455638541847495e-08, + "loss": 0.767946, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09576416, + "step": 15546, + "time_per_iteration": 2.504326581954956 + }, + { + "auxiliary_loss_clip": 0.06395329, + "auxiliary_loss_mlp": 0.0126154, + "balance_loss_clip": 0.0626837, + "balance_loss_mlp": 0.01253082, + "epoch": 0.9347362092289193, + "flos": 29212540496640.0, + "grad_norm": 1.728698051619845, + "language_loss": 0.82426834, + "learning_rate": 4.447467257852966e-08, + "loss": 0.90083706, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08459473, + "step": 15547, + "time_per_iteration": 2.564218044281006 + }, + { + "auxiliary_loss_clip": 0.06397106, + "auxiliary_loss_mlp": 0.01264609, + "balance_loss_clip": 0.06268612, + "balance_loss_mlp": 0.01256104, + "epoch": 0.9347963324815872, + "flos": 19433429712000.0, + "grad_norm": 1.7812542299870269, + "language_loss": 0.83993661, + "learning_rate": 4.439303389230087e-08, + "loss": 0.91655374, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08508301, + "step": 15548, + "time_per_iteration": 2.4733710289001465 + }, + { + "auxiliary_loss_clip": 0.06411811, + "auxiliary_loss_mlp": 0.01266367, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01255292, + "epoch": 0.9348564557342552, + "flos": 36910475896320.0, + "grad_norm": 1.6747966040501179, + "language_loss": 0.65960097, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.73638272, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11065674, + "step": 15549, + "time_per_iteration": 4.084869623184204 + }, + { + "auxiliary_loss_clip": 0.06403033, + "auxiliary_loss_mlp": 0.01262193, + "balance_loss_clip": 0.06271501, + "balance_loss_mlp": 0.01252752, + "epoch": 0.9349165789869232, + "flos": 21696684019200.0, + "grad_norm": 1.700964891054384, + "language_loss": 0.8061015, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.88275379, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09436035, + "step": 15550, + "time_per_iteration": 2.499324083328247 + }, + { + "auxiliary_loss_clip": 0.06405445, + "auxiliary_loss_mlp": 0.01265455, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255978, + "epoch": 0.9349767022395912, + "flos": 18850114713600.0, + "grad_norm": 1.7308471893198725, + "language_loss": 0.7611016, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.83781064, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09472656, + "step": 15551, + "time_per_iteration": 2.489948272705078 + }, + { + "auxiliary_loss_clip": 0.06396884, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06270495, + "balance_loss_mlp": 0.01255111, + "epoch": 0.9350368254922591, + "flos": 24980477529600.0, + "grad_norm": 1.5471348014278214, + "language_loss": 0.73827606, + "learning_rate": 4.406722074642255e-08, + "loss": 0.81487489, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.07891846, + "step": 15552, + "time_per_iteration": 2.5838027000427246 + }, + { + "auxiliary_loss_clip": 0.06398878, + "auxiliary_loss_mlp": 0.01266903, + "balance_loss_clip": 0.06268165, + "balance_loss_mlp": 0.01257813, + "epoch": 0.9350969487449271, + "flos": 23076391749120.0, + "grad_norm": 1.569356822541186, + "language_loss": 0.77291447, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.8495723, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09088135, + "step": 15553, + "time_per_iteration": 3.920443534851074 + }, + { + "auxiliary_loss_clip": 0.06403033, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01259314, + "epoch": 0.9351570719975951, + "flos": 18631209121920.0, + "grad_norm": 1.5901890244896573, + "language_loss": 0.78230381, + "learning_rate": 4.390475917613723e-08, + "loss": 0.85903263, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10522461, + "step": 15554, + "time_per_iteration": 2.5145413875579834 + }, + { + "auxiliary_loss_clip": 0.06394763, + "auxiliary_loss_mlp": 0.01263815, + "balance_loss_clip": 0.06269139, + "balance_loss_mlp": 0.01255966, + "epoch": 0.935217195250263, + "flos": 15893862013440.0, + "grad_norm": 1.5004203898596764, + "language_loss": 0.68972766, + "learning_rate": 4.382363965244695e-08, + "loss": 0.76631343, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.07843018, + "step": 15555, + "time_per_iteration": 2.478994369506836 + }, + { + "auxiliary_loss_clip": 0.0639784, + "auxiliary_loss_mlp": 0.01264208, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01254689, + "epoch": 0.935277318502931, + "flos": 24397372166400.0, + "grad_norm": 1.6233160508843345, + "language_loss": 0.75448465, + "learning_rate": 4.374259430715965e-08, + "loss": 0.83110511, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09515381, + "step": 15556, + "time_per_iteration": 2.5654189586639404 + }, + { + "auxiliary_loss_clip": 0.06400485, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06270866, + "balance_loss_mlp": 0.01256327, + "epoch": 0.935337441755599, + "flos": 27607721973120.0, + "grad_norm": 1.4937701005093391, + "language_loss": 0.72718519, + "learning_rate": 4.366162314334953e-08, + "loss": 0.80384088, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08746338, + "step": 15557, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.06403461, + "auxiliary_loss_mlp": 0.01266447, + "balance_loss_clip": 0.0627244, + "balance_loss_mlp": 0.01256844, + "epoch": 0.935397565008267, + "flos": 20488699232640.0, + "grad_norm": 1.479053055288317, + "language_loss": 0.63463771, + "learning_rate": 4.358072616408681e-08, + "loss": 0.71133679, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09594727, + "step": 15558, + "time_per_iteration": 2.4923977851867676 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.01257456, + "epoch": 0.9354576882609349, + "flos": 23660293726080.0, + "grad_norm": 1.7353882784834274, + "language_loss": 0.73151875, + "learning_rate": 4.34999033724388e-08, + "loss": 0.80822068, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10198975, + "step": 15559, + "time_per_iteration": 2.5124833583831787 + }, + { + "auxiliary_loss_clip": 0.06400333, + "auxiliary_loss_mlp": 0.01260437, + "balance_loss_clip": 0.0627114, + "balance_loss_mlp": 0.01252236, + "epoch": 0.9355178115136029, + "flos": 36693834364800.0, + "grad_norm": 1.5332066334129346, + "language_loss": 0.64076531, + "learning_rate": 4.341915477147062e-08, + "loss": 0.71737301, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08197021, + "step": 15560, + "time_per_iteration": 2.6758434772491455 + }, + { + "auxiliary_loss_clip": 0.06415723, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.06274995, + "balance_loss_mlp": 0.01255785, + "epoch": 0.9355779347662708, + "flos": 14464833356160.0, + "grad_norm": 2.091115456103633, + "language_loss": 0.64280677, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.71963406, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11218262, + "step": 15561, + "time_per_iteration": 2.5375335216522217 + }, + { + "auxiliary_loss_clip": 0.06398933, + "auxiliary_loss_mlp": 0.01266271, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.0125668, + "epoch": 0.9356380580189388, + "flos": 23192783470080.0, + "grad_norm": 2.046301744114267, + "language_loss": 0.7559768, + "learning_rate": 4.325788015381859e-08, + "loss": 0.83262885, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09594727, + "step": 15562, + "time_per_iteration": 2.5097131729125977 + }, + { + "auxiliary_loss_clip": 0.06310297, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_clip": 0.06256372, + "balance_loss_mlp": 0.01248288, + "epoch": 0.9356981812716068, + "flos": 67490592480000.0, + "grad_norm": 0.9299181656084027, + "language_loss": 0.62328547, + "learning_rate": 4.31773541432503e-08, + "loss": 0.69888246, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01116943, + "step": 15563, + "time_per_iteration": 2.9744601249694824 + }, + { + "auxiliary_loss_clip": 0.06396849, + "auxiliary_loss_mlp": 0.0126558, + "balance_loss_clip": 0.06269947, + "balance_loss_mlp": 0.01256756, + "epoch": 0.9357583045242748, + "flos": 24688631358720.0, + "grad_norm": 1.421638923084558, + "language_loss": 0.78548312, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.8621074, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08816528, + "step": 15564, + "time_per_iteration": 2.528512954711914 + }, + { + "auxiliary_loss_clip": 0.06406452, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06271413, + "balance_loss_mlp": 0.01255346, + "epoch": 0.9358184277769427, + "flos": 19469795184000.0, + "grad_norm": 2.174716619334903, + "language_loss": 0.78390223, + "learning_rate": 4.301652473389694e-08, + "loss": 0.86061513, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09490967, + "step": 15565, + "time_per_iteration": 2.4927587509155273 + }, + { + "auxiliary_loss_clip": 0.06398039, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.01254275, + "epoch": 0.9358785510296107, + "flos": 18923055292800.0, + "grad_norm": 2.3604474699248086, + "language_loss": 0.72209811, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.7987051, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08380127, + "step": 15566, + "time_per_iteration": 2.469862222671509 + }, + { + "auxiliary_loss_clip": 0.06401025, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01255063, + "epoch": 0.9359386742822787, + "flos": 23448096460800.0, + "grad_norm": 1.7674222051319097, + "language_loss": 0.68101299, + "learning_rate": 4.285599216057889e-08, + "loss": 0.75766838, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09448242, + "step": 15567, + "time_per_iteration": 2.5092694759368896 + }, + { + "auxiliary_loss_clip": 0.06399126, + "auxiliary_loss_mlp": 0.01265065, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01255159, + "epoch": 0.9359987975349466, + "flos": 32752275903360.0, + "grad_norm": 1.9133350433830412, + "language_loss": 0.62613881, + "learning_rate": 4.277583719504418e-08, + "loss": 0.70278078, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09893799, + "step": 15568, + "time_per_iteration": 2.590184211730957 + }, + { + "auxiliary_loss_clip": 0.06399098, + "auxiliary_loss_mlp": 0.01262364, + "balance_loss_clip": 0.06269547, + "balance_loss_mlp": 0.01253251, + "epoch": 0.9360589207876147, + "flos": 22826151930240.0, + "grad_norm": 1.5331346258977052, + "language_loss": 0.79038656, + "learning_rate": 4.269575644764556e-08, + "loss": 0.86700118, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09118652, + "step": 15569, + "time_per_iteration": 2.547078847885132 + }, + { + "auxiliary_loss_clip": 0.06405905, + "auxiliary_loss_mlp": 0.01266784, + "balance_loss_clip": 0.06274277, + "balance_loss_mlp": 0.01257414, + "epoch": 0.9361190440402826, + "flos": 20891318901120.0, + "grad_norm": 2.1386136697606517, + "language_loss": 0.70064366, + "learning_rate": 4.261574992142014e-08, + "loss": 0.77737057, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09368896, + "step": 15570, + "time_per_iteration": 2.511286973953247 + }, + { + "auxiliary_loss_clip": 0.06400268, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06268404, + "balance_loss_mlp": 0.01256568, + "epoch": 0.9361791672929506, + "flos": 19323872098560.0, + "grad_norm": 1.9820727131819575, + "language_loss": 0.79175496, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.86842304, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09973145, + "step": 15571, + "time_per_iteration": 2.5219452381134033 + }, + { + "auxiliary_loss_clip": 0.06402998, + "auxiliary_loss_mlp": 0.0126496, + "balance_loss_clip": 0.06271084, + "balance_loss_mlp": 0.01255965, + "epoch": 0.9362392905456185, + "flos": 15163491899520.0, + "grad_norm": 1.9713117286932247, + "language_loss": 0.77583826, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.85251784, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08996582, + "step": 15572, + "time_per_iteration": 2.4837546348571777 + }, + { + "auxiliary_loss_clip": 0.06396253, + "auxiliary_loss_mlp": 0.01264793, + "balance_loss_clip": 0.0626861, + "balance_loss_mlp": 0.01255775, + "epoch": 0.9362994137982865, + "flos": 22091589112320.0, + "grad_norm": 1.592764345612902, + "language_loss": 0.78254807, + "learning_rate": 4.237617570010688e-08, + "loss": 0.85915852, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09014893, + "step": 15573, + "time_per_iteration": 2.538482427597046 + }, + { + "auxiliary_loss_clip": 0.06395616, + "auxiliary_loss_mlp": 0.01265839, + "balance_loss_clip": 0.06269381, + "balance_loss_mlp": 0.01257023, + "epoch": 0.9363595370509544, + "flos": 23518772979840.0, + "grad_norm": 1.4505342920053566, + "language_loss": 0.74485445, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.82146895, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08813477, + "step": 15574, + "time_per_iteration": 3.9295005798339844 + }, + { + "auxiliary_loss_clip": 0.06395365, + "auxiliary_loss_mlp": 0.01261285, + "balance_loss_clip": 0.06269053, + "balance_loss_mlp": 0.01252112, + "epoch": 0.9364196603036224, + "flos": 27130442716800.0, + "grad_norm": 2.361043736430351, + "language_loss": 0.68279696, + "learning_rate": 4.221683071397564e-08, + "loss": 0.75936341, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09173584, + "step": 15575, + "time_per_iteration": 2.606562852859497 + }, + { + "auxiliary_loss_clip": 0.06395829, + "auxiliary_loss_mlp": 0.01265677, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01256021, + "epoch": 0.9364797835562904, + "flos": 18485034474240.0, + "grad_norm": 1.6188828089297882, + "language_loss": 0.65445733, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.73107237, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09655762, + "step": 15576, + "time_per_iteration": 2.5505363941192627 + }, + { + "auxiliary_loss_clip": 0.06399767, + "auxiliary_loss_mlp": 0.01265648, + "balance_loss_clip": 0.06267945, + "balance_loss_mlp": 0.01255402, + "epoch": 0.9365399068089584, + "flos": 13010507965440.0, + "grad_norm": 2.0026006343601725, + "language_loss": 0.76252437, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.8391785, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10247803, + "step": 15577, + "time_per_iteration": 2.5032527446746826 + }, + { + "auxiliary_loss_clip": 0.06399457, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06267272, + "balance_loss_mlp": 0.01256828, + "epoch": 0.9366000300616263, + "flos": 25673559776640.0, + "grad_norm": 2.037972462404189, + "language_loss": 0.52709925, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.60375792, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0958252, + "step": 15578, + "time_per_iteration": 2.575258731842041 + }, + { + "auxiliary_loss_clip": 0.0639855, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06271734, + "balance_loss_mlp": 0.01256956, + "epoch": 0.9366601533142943, + "flos": 21439652019840.0, + "grad_norm": 1.6897117136078763, + "language_loss": 0.70794189, + "learning_rate": 4.189903163783692e-08, + "loss": 0.78458452, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08746338, + "step": 15579, + "time_per_iteration": 2.5197277069091797 + }, + { + "auxiliary_loss_clip": 0.06398612, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06269911, + "balance_loss_mlp": 0.01252639, + "epoch": 0.9367202765669622, + "flos": 24099362720640.0, + "grad_norm": 1.789359287334025, + "language_loss": 0.76805091, + "learning_rate": 4.181976748973959e-08, + "loss": 0.84465355, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09014893, + "step": 15580, + "time_per_iteration": 2.5272631645202637 + }, + { + "auxiliary_loss_clip": 0.0640737, + "auxiliary_loss_mlp": 0.01263031, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.01252285, + "epoch": 0.9367803998196302, + "flos": 20895511605120.0, + "grad_norm": 4.066229369441099, + "language_loss": 0.66627061, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.74297458, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10748291, + "step": 15581, + "time_per_iteration": 2.5539963245391846 + }, + { + "auxiliary_loss_clip": 0.06398203, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.0626883, + "balance_loss_mlp": 0.01256978, + "epoch": 0.9368405230722983, + "flos": 22570838939520.0, + "grad_norm": 1.4665280133275418, + "language_loss": 0.76610607, + "learning_rate": 4.166146195972042e-08, + "loss": 0.8427515, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09362793, + "step": 15582, + "time_per_iteration": 4.0959906578063965 + }, + { + "auxiliary_loss_clip": 0.06399594, + "auxiliary_loss_mlp": 0.01263756, + "balance_loss_clip": 0.06270076, + "balance_loss_mlp": 0.01254195, + "epoch": 0.9369006463249662, + "flos": 18886228623360.0, + "grad_norm": 1.845169870254204, + "language_loss": 0.74022168, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.81685519, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09558105, + "step": 15583, + "time_per_iteration": 2.6164638996124268 + }, + { + "auxiliary_loss_clip": 0.06408083, + "auxiliary_loss_mlp": 0.01269119, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01259439, + "epoch": 0.9369607695776342, + "flos": 26439750311040.0, + "grad_norm": 2.094527372320434, + "language_loss": 0.84255081, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.91932285, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09680176, + "step": 15584, + "time_per_iteration": 2.5555663108825684 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01267981, + "balance_loss_clip": 0.06274232, + "balance_loss_mlp": 0.01257527, + "epoch": 0.9370208928303021, + "flos": 39576769142400.0, + "grad_norm": 1.4458449658506118, + "language_loss": 0.72188222, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.79867625, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10455322, + "step": 15585, + "time_per_iteration": 2.6535887718200684 + }, + { + "auxiliary_loss_clip": 0.06396037, + "auxiliary_loss_mlp": 0.01263316, + "balance_loss_clip": 0.06268879, + "balance_loss_mlp": 0.01254334, + "epoch": 0.9370810160829701, + "flos": 22969223976960.0, + "grad_norm": 2.3040990220175535, + "language_loss": 0.80541742, + "learning_rate": 4.134574204836316e-08, + "loss": 0.88201094, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08978271, + "step": 15586, + "time_per_iteration": 2.5093491077423096 + }, + { + "auxiliary_loss_clip": 0.06403472, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01260226, + "epoch": 0.937141139335638, + "flos": 23081590702080.0, + "grad_norm": 1.4938915537331265, + "language_loss": 0.76188564, + "learning_rate": 4.126699774396258e-08, + "loss": 0.83861721, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09460449, + "step": 15587, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06406046, + "auxiliary_loss_mlp": 0.01262902, + "balance_loss_clip": 0.06271333, + "balance_loss_mlp": 0.01252721, + "epoch": 0.937201262588306, + "flos": 16361246488320.0, + "grad_norm": 2.7151633052231774, + "language_loss": 0.87725753, + "learning_rate": 4.118832771491387e-08, + "loss": 0.95394701, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10186768, + "step": 15588, + "time_per_iteration": 2.479767322540283 + }, + { + "auxiliary_loss_clip": 0.06396212, + "auxiliary_loss_mlp": 0.01263659, + "balance_loss_clip": 0.06270182, + "balance_loss_mlp": 0.01255195, + "epoch": 0.937261385840974, + "flos": 20200374933120.0, + "grad_norm": 1.6537690665751095, + "language_loss": 0.78271496, + "learning_rate": 4.11097319642002e-08, + "loss": 0.85931367, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08465576, + "step": 15589, + "time_per_iteration": 4.0159831047058105 + }, + { + "auxiliary_loss_clip": 0.06398676, + "auxiliary_loss_mlp": 0.01262823, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01253787, + "epoch": 0.937321509093642, + "flos": 18301781594880.0, + "grad_norm": 1.6602653892740842, + "language_loss": 0.778988, + "learning_rate": 4.103121049480163e-08, + "loss": 0.85560298, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09033203, + "step": 15590, + "time_per_iteration": 2.473738193511963 + }, + { + "auxiliary_loss_clip": 0.06412096, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01257591, + "epoch": 0.9373816323463099, + "flos": 25891710681600.0, + "grad_norm": 1.9786385015228094, + "language_loss": 0.71866137, + "learning_rate": 4.095276330969577e-08, + "loss": 0.79545587, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09765625, + "step": 15591, + "time_per_iteration": 2.570101022720337 + }, + { + "auxiliary_loss_clip": 0.06408433, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06272327, + "balance_loss_mlp": 0.01257058, + "epoch": 0.9374417555989779, + "flos": 27206234334720.0, + "grad_norm": 1.7598443823568033, + "language_loss": 0.53974843, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.61651254, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10906982, + "step": 15592, + "time_per_iteration": 4.081035137176514 + }, + { + "auxiliary_loss_clip": 0.06399275, + "auxiliary_loss_mlp": 0.01262346, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01253209, + "epoch": 0.9375018788516458, + "flos": 23627701687680.0, + "grad_norm": 1.5745760731175873, + "language_loss": 0.67514831, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.75176454, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09130859, + "step": 15593, + "time_per_iteration": 2.508760452270508 + }, + { + "auxiliary_loss_clip": 0.06400297, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06268944, + "balance_loss_mlp": 0.01254385, + "epoch": 0.9375620021043138, + "flos": 22686098630400.0, + "grad_norm": 1.4850027564581405, + "language_loss": 0.74354887, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.82018816, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09259033, + "step": 15594, + "time_per_iteration": 2.5463995933532715 + }, + { + "auxiliary_loss_clip": 0.06398121, + "auxiliary_loss_mlp": 0.01263132, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01254197, + "epoch": 0.9376221253569819, + "flos": 27567121870080.0, + "grad_norm": 1.5092053336620472, + "language_loss": 0.73907506, + "learning_rate": 4.063971747165351e-08, + "loss": 0.8156876, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.0892334, + "step": 15595, + "time_per_iteration": 2.5729222297668457 + }, + { + "auxiliary_loss_clip": 0.06404946, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06270959, + "balance_loss_mlp": 0.01256352, + "epoch": 0.9376822486096498, + "flos": 24136063608960.0, + "grad_norm": 1.892076191551823, + "language_loss": 0.7632336, + "learning_rate": 4.056164175257626e-08, + "loss": 0.83994108, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09454346, + "step": 15596, + "time_per_iteration": 2.5364673137664795 + }, + { + "auxiliary_loss_clip": 0.06401411, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01258492, + "epoch": 0.9377423718623178, + "flos": 22790666926080.0, + "grad_norm": 1.6042943416913158, + "language_loss": 0.78836501, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.86506015, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09606934, + "step": 15597, + "time_per_iteration": 2.5183331966400146 + }, + { + "auxiliary_loss_clip": 0.06409448, + "auxiliary_loss_mlp": 0.01263004, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.0125299, + "epoch": 0.9378024951149857, + "flos": 19174427141760.0, + "grad_norm": 1.5171680951862323, + "language_loss": 0.812361, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.88908553, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10009766, + "step": 15598, + "time_per_iteration": 2.547635078430176 + }, + { + "auxiliary_loss_clip": 0.06408492, + "auxiliary_loss_mlp": 0.01265418, + "balance_loss_clip": 0.06269473, + "balance_loss_mlp": 0.01255118, + "epoch": 0.9378626183676537, + "flos": 23510890696320.0, + "grad_norm": 3.988859299196599, + "language_loss": 0.62941587, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.70615494, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10302734, + "step": 15599, + "time_per_iteration": 2.5061824321746826 + }, + { + "auxiliary_loss_clip": 0.06402044, + "auxiliary_loss_mlp": 0.01264248, + "balance_loss_clip": 0.06269282, + "balance_loss_mlp": 0.01255039, + "epoch": 0.9379227416203216, + "flos": 18411548843520.0, + "grad_norm": 1.7602701335437743, + "language_loss": 0.73915505, + "learning_rate": 4.0250081926821e-08, + "loss": 0.81581795, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09216309, + "step": 15600, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.06400068, + "auxiliary_loss_mlp": 0.01264599, + "balance_loss_clip": 0.06271948, + "balance_loss_mlp": 0.01255873, + "epoch": 0.9379828648729897, + "flos": 17827646866560.0, + "grad_norm": 1.946057242530572, + "language_loss": 0.70149601, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.77814269, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0871582, + "step": 15601, + "time_per_iteration": 2.4916884899139404 + }, + { + "auxiliary_loss_clip": 0.06305277, + "auxiliary_loss_mlp": 0.01251346, + "balance_loss_clip": 0.06251266, + "balance_loss_mlp": 0.01250292, + "epoch": 0.9380429881256576, + "flos": 68044376113920.0, + "grad_norm": 0.7389611059273472, + "language_loss": 0.5819695, + "learning_rate": 4.009474788561573e-08, + "loss": 0.65753579, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01054382, + "step": 15602, + "time_per_iteration": 3.2857046127319336 + }, + { + "auxiliary_loss_clip": 0.0640846, + "auxiliary_loss_mlp": 0.01264932, + "balance_loss_clip": 0.06275323, + "balance_loss_mlp": 0.01255675, + "epoch": 0.9381031113783256, + "flos": 20783228734080.0, + "grad_norm": 1.8320878544992856, + "language_loss": 0.71827531, + "learning_rate": 4.001719234324663e-08, + "loss": 0.79500926, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0925293, + "step": 15603, + "time_per_iteration": 2.523958921432495 + }, + { + "auxiliary_loss_clip": 0.0639255, + "auxiliary_loss_mlp": 0.01269196, + "balance_loss_clip": 0.0627018, + "balance_loss_mlp": 0.01260988, + "epoch": 0.9381632346309935, + "flos": 19030935824640.0, + "grad_norm": 1.5522803660196332, + "language_loss": 0.76325035, + "learning_rate": 3.993971112362171e-08, + "loss": 0.83986783, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.08203125, + "step": 15604, + "time_per_iteration": 2.572173595428467 + }, + { + "auxiliary_loss_clip": 0.06403452, + "auxiliary_loss_mlp": 0.01265098, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01255019, + "epoch": 0.9382233578836615, + "flos": 23520617769600.0, + "grad_norm": 1.891479976745369, + "language_loss": 0.65401727, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.73070276, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10070801, + "step": 15605, + "time_per_iteration": 2.520111083984375 + }, + { + "auxiliary_loss_clip": 0.06409709, + "auxiliary_loss_mlp": 0.01267027, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01256972, + "epoch": 0.9382834811363294, + "flos": 43077539600640.0, + "grad_norm": 1.699505727802155, + "language_loss": 0.67158365, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.7483511, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.1005249, + "step": 15606, + "time_per_iteration": 2.7113072872161865 + }, + { + "auxiliary_loss_clip": 0.06395191, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06269544, + "balance_loss_mlp": 0.01258374, + "epoch": 0.9383436043889974, + "flos": 16441943569920.0, + "grad_norm": 1.6594414641865307, + "language_loss": 0.77971619, + "learning_rate": 3.970771343058166e-08, + "loss": 0.85633445, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.0826416, + "step": 15607, + "time_per_iteration": 2.479999303817749 + }, + { + "auxiliary_loss_clip": 0.06402883, + "auxiliary_loss_mlp": 0.01262038, + "balance_loss_clip": 0.06271037, + "balance_loss_mlp": 0.01252769, + "epoch": 0.9384037276416655, + "flos": 20746863262080.0, + "grad_norm": 1.7801054619230159, + "language_loss": 0.83052444, + "learning_rate": 3.963052953128776e-08, + "loss": 0.90717363, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0927124, + "step": 15608, + "time_per_iteration": 2.5650830268859863 + }, + { + "auxiliary_loss_clip": 0.06400616, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01257401, + "epoch": 0.9384638508943334, + "flos": 19068726816000.0, + "grad_norm": 1.6719463976708178, + "language_loss": 0.69115657, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.76783478, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09802246, + "step": 15609, + "time_per_iteration": 2.498767614364624 + }, + { + "auxiliary_loss_clip": 0.06405382, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06270407, + "balance_loss_mlp": 0.01252899, + "epoch": 0.9385239741470014, + "flos": 23411730988800.0, + "grad_norm": 1.946583052250983, + "language_loss": 0.75374961, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.83043504, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1026001, + "step": 15610, + "time_per_iteration": 2.5711920261383057 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01253609, + "epoch": 0.9385840973996693, + "flos": 12829938416640.0, + "grad_norm": 1.6994334075613569, + "language_loss": 0.75466156, + "learning_rate": 3.939942386953987e-08, + "loss": 0.83132434, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08862305, + "step": 15611, + "time_per_iteration": 2.483549118041992 + }, + { + "auxiliary_loss_clip": 0.06401208, + "auxiliary_loss_mlp": 0.01265334, + "balance_loss_clip": 0.06270809, + "balance_loss_mlp": 0.01256107, + "epoch": 0.9386442206523373, + "flos": 15492416302080.0, + "grad_norm": 1.9065211631243921, + "language_loss": 0.66030884, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.73697424, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09222412, + "step": 15612, + "time_per_iteration": 2.4864282608032227 + }, + { + "auxiliary_loss_clip": 0.06397071, + "auxiliary_loss_mlp": 0.01262431, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01253508, + "epoch": 0.9387043439050052, + "flos": 21185219496960.0, + "grad_norm": 1.9313570682062124, + "language_loss": 0.5721032, + "learning_rate": 3.924572515435742e-08, + "loss": 0.64869821, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0892334, + "step": 15613, + "time_per_iteration": 2.4872400760650635 + }, + { + "auxiliary_loss_clip": 0.06404439, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0627088, + "balance_loss_mlp": 0.01259853, + "epoch": 0.9387644671576733, + "flos": 27674918547840.0, + "grad_norm": 1.9388248320141126, + "language_loss": 0.70511746, + "learning_rate": 3.916898732330764e-08, + "loss": 0.78185594, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09545898, + "step": 15614, + "time_per_iteration": 3.943666934967041 + }, + { + "auxiliary_loss_clip": 0.06404942, + "auxiliary_loss_mlp": 0.01266663, + "balance_loss_clip": 0.06271072, + "balance_loss_mlp": 0.01256745, + "epoch": 0.9388245904103412, + "flos": 18841100400000.0, + "grad_norm": 1.8251928384631169, + "language_loss": 0.81327057, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.88998669, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09918213, + "step": 15615, + "time_per_iteration": 2.487769603729248 + }, + { + "auxiliary_loss_clip": 0.06396815, + "auxiliary_loss_mlp": 0.01261659, + "balance_loss_clip": 0.06269054, + "balance_loss_mlp": 0.01252486, + "epoch": 0.9388847136630092, + "flos": 25490893875840.0, + "grad_norm": 1.5481190981940673, + "language_loss": 0.71929944, + "learning_rate": 3.901573472884134e-08, + "loss": 0.79588419, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09173584, + "step": 15616, + "time_per_iteration": 2.5378410816192627 + }, + { + "auxiliary_loss_clip": 0.06402715, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06272252, + "balance_loss_mlp": 0.01254909, + "epoch": 0.9389448369156771, + "flos": 18741102151680.0, + "grad_norm": 1.7803352890368735, + "language_loss": 0.66485155, + "learning_rate": 3.89392199712355e-08, + "loss": 0.74152172, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09387207, + "step": 15617, + "time_per_iteration": 2.5118300914764404 + }, + { + "auxiliary_loss_clip": 0.06406648, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06271216, + "balance_loss_mlp": 0.01254273, + "epoch": 0.9390049601683451, + "flos": 21722945074560.0, + "grad_norm": 1.945035864880724, + "language_loss": 0.73652196, + "learning_rate": 3.886277957725092e-08, + "loss": 0.81323552, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10443115, + "step": 15618, + "time_per_iteration": 2.489013671875 + }, + { + "auxiliary_loss_clip": 0.06410211, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06271806, + "balance_loss_mlp": 0.01255518, + "epoch": 0.939065083421013, + "flos": 19397357729280.0, + "grad_norm": 1.8849612887114653, + "language_loss": 0.70230412, + "learning_rate": 3.878641354978662e-08, + "loss": 0.77906239, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10089111, + "step": 15619, + "time_per_iteration": 2.50981068611145 + }, + { + "auxiliary_loss_clip": 0.06404421, + "auxiliary_loss_mlp": 0.01265416, + "balance_loss_clip": 0.06272148, + "balance_loss_mlp": 0.01255199, + "epoch": 0.939125206673681, + "flos": 24688505577600.0, + "grad_norm": 1.5841389932494754, + "language_loss": 0.77946162, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.85615999, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10217285, + "step": 15620, + "time_per_iteration": 2.541767120361328 + }, + { + "auxiliary_loss_clip": 0.06399068, + "auxiliary_loss_mlp": 0.01262232, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01253005, + "epoch": 0.9391853299263491, + "flos": 16331505488640.0, + "grad_norm": 2.1364779923575026, + "language_loss": 0.73495758, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.81157064, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09222412, + "step": 15621, + "time_per_iteration": 2.5422234535217285 + }, + { + "auxiliary_loss_clip": 0.06412639, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06276237, + "balance_loss_mlp": 0.01258935, + "epoch": 0.939245453179017, + "flos": 11660541235200.0, + "grad_norm": 2.3036117116482524, + "language_loss": 0.67062247, + "learning_rate": 3.855776169545688e-08, + "loss": 0.74744046, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 15622, + "time_per_iteration": 3.9106016159057617 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01266555, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01257781, + "epoch": 0.939305576431685, + "flos": 23155369822080.0, + "grad_norm": 1.6184026237616547, + "language_loss": 0.71614575, + "learning_rate": 3.848169316300209e-08, + "loss": 0.79282188, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08770752, + "step": 15623, + "time_per_iteration": 2.535764694213867 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06273766, + "balance_loss_mlp": 0.01258458, + "epoch": 0.9393656996843529, + "flos": 33295493923200.0, + "grad_norm": 1.8929766893988849, + "language_loss": 0.72837877, + "learning_rate": 3.84056990115178e-08, + "loss": 0.80510753, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09515381, + "step": 15624, + "time_per_iteration": 2.6262624263763428 + }, + { + "auxiliary_loss_clip": 0.06399508, + "auxiliary_loss_mlp": 0.01269514, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01260263, + "epoch": 0.9394258229370209, + "flos": 21695887405440.0, + "grad_norm": 1.7875404465361746, + "language_loss": 0.89779687, + "learning_rate": 3.832977924388614e-08, + "loss": 0.97448707, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09240723, + "step": 15625, + "time_per_iteration": 2.531123161315918 + }, + { + "auxiliary_loss_clip": 0.06399558, + "auxiliary_loss_mlp": 0.01262032, + "balance_loss_clip": 0.06269208, + "balance_loss_mlp": 0.01252478, + "epoch": 0.9394859461896888, + "flos": 23880289420800.0, + "grad_norm": 4.111605423444732, + "language_loss": 0.83748984, + "learning_rate": 3.825393386298592e-08, + "loss": 0.91410571, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09545898, + "step": 15626, + "time_per_iteration": 2.5196032524108887 + }, + { + "auxiliary_loss_clip": 0.06308495, + "auxiliary_loss_mlp": 0.01251926, + "balance_loss_clip": 0.06254559, + "balance_loss_mlp": 0.0125083, + "epoch": 0.9395460694423569, + "flos": 61584963114240.0, + "grad_norm": 0.7637423536356234, + "language_loss": 0.56075698, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.63636124, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0109787, + "step": 15627, + "time_per_iteration": 3.1151974201202393 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06269808, + "balance_loss_mlp": 0.01254522, + "epoch": 0.9396061926950248, + "flos": 21001966617600.0, + "grad_norm": 2.0549661543440725, + "language_loss": 0.70356309, + "learning_rate": 3.810246627288105e-08, + "loss": 0.78018951, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08380127, + "step": 15628, + "time_per_iteration": 3.9435250759124756 + }, + { + "auxiliary_loss_clip": 0.06402381, + "auxiliary_loss_mlp": 0.01264022, + "balance_loss_clip": 0.06272991, + "balance_loss_mlp": 0.01255188, + "epoch": 0.9396663159476928, + "flos": 27494726342400.0, + "grad_norm": 1.608508127182665, + "language_loss": 0.7580415, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.83470553, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08837891, + "step": 15629, + "time_per_iteration": 2.6277477741241455 + }, + { + "auxiliary_loss_clip": 0.06394442, + "auxiliary_loss_mlp": 0.01261005, + "balance_loss_clip": 0.06268346, + "balance_loss_mlp": 0.01252273, + "epoch": 0.9397264392003607, + "flos": 19433555493120.0, + "grad_norm": 1.7353515662757615, + "language_loss": 0.74587202, + "learning_rate": 3.795129626417748e-08, + "loss": 0.8224265, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08734131, + "step": 15630, + "time_per_iteration": 2.5997049808502197 + }, + { + "auxiliary_loss_clip": 0.06399633, + "auxiliary_loss_mlp": 0.01262857, + "balance_loss_clip": 0.06272737, + "balance_loss_mlp": 0.01254709, + "epoch": 0.9397865624530287, + "flos": 18010732037760.0, + "grad_norm": 1.9830238944989997, + "language_loss": 0.69652402, + "learning_rate": 3.787582286001845e-08, + "loss": 0.77314889, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08154297, + "step": 15631, + "time_per_iteration": 2.516963481903076 + }, + { + "auxiliary_loss_clip": 0.06397713, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06269372, + "balance_loss_mlp": 0.01255822, + "epoch": 0.9398466857056966, + "flos": 22571132428800.0, + "grad_norm": 1.4686823843430021, + "language_loss": 0.75433683, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.83096522, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09307861, + "step": 15632, + "time_per_iteration": 3.9681499004364014 + }, + { + "auxiliary_loss_clip": 0.06407969, + "auxiliary_loss_mlp": 0.01263576, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01253222, + "epoch": 0.9399068089583646, + "flos": 24542666346240.0, + "grad_norm": 2.093894657159583, + "language_loss": 0.7490074, + "learning_rate": 3.772509926639622e-08, + "loss": 0.82572281, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10351562, + "step": 15633, + "time_per_iteration": 2.5607848167419434 + }, + { + "auxiliary_loss_clip": 0.06401691, + "auxiliary_loss_mlp": 0.01266106, + "balance_loss_clip": 0.06268854, + "balance_loss_mlp": 0.01255801, + "epoch": 0.9399669322110327, + "flos": 25637529720960.0, + "grad_norm": 1.842729170438083, + "language_loss": 0.72873878, + "learning_rate": 3.764984908264823e-08, + "loss": 0.8054167, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10296631, + "step": 15634, + "time_per_iteration": 2.5304877758026123 + }, + { + "auxiliary_loss_clip": 0.06408176, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06273288, + "balance_loss_mlp": 0.01254405, + "epoch": 0.9400270554637006, + "flos": 17094593422080.0, + "grad_norm": 1.5847517594895608, + "language_loss": 0.69334674, + "learning_rate": 3.75746733114144e-08, + "loss": 0.77007252, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09997559, + "step": 15635, + "time_per_iteration": 2.5305612087249756 + }, + { + "auxiliary_loss_clip": 0.06394704, + "auxiliary_loss_mlp": 0.01261499, + "balance_loss_clip": 0.06268582, + "balance_loss_mlp": 0.01252845, + "epoch": 0.9400871787163686, + "flos": 22061764258560.0, + "grad_norm": 1.5394095539238604, + "language_loss": 0.74523485, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.82179689, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08654785, + "step": 15636, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06401463, + "auxiliary_loss_mlp": 0.01262977, + "balance_loss_clip": 0.06270332, + "balance_loss_mlp": 0.01253553, + "epoch": 0.9401473019690365, + "flos": 16988431898880.0, + "grad_norm": 1.907903865743405, + "language_loss": 0.83414614, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.91079056, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09423828, + "step": 15637, + "time_per_iteration": 2.5974667072296143 + }, + { + "auxiliary_loss_clip": 0.06402609, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269225, + "balance_loss_mlp": 0.01256391, + "epoch": 0.9402074252217045, + "flos": 19687946088960.0, + "grad_norm": 2.20612132803902, + "language_loss": 0.69127619, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.76796037, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09417725, + "step": 15638, + "time_per_iteration": 2.5368309020996094 + }, + { + "auxiliary_loss_clip": 0.06396491, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01256448, + "epoch": 0.9402675484743724, + "flos": 24761278448640.0, + "grad_norm": 1.57830953149631, + "language_loss": 0.848472, + "learning_rate": 3.727471440859498e-08, + "loss": 0.92508614, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08477783, + "step": 15639, + "time_per_iteration": 2.528841257095337 + }, + { + "auxiliary_loss_clip": 0.06401523, + "auxiliary_loss_mlp": 0.01262071, + "balance_loss_clip": 0.0626966, + "balance_loss_mlp": 0.01253255, + "epoch": 0.9403276717270405, + "flos": 25566014661120.0, + "grad_norm": 1.4451560995387316, + "language_loss": 0.784464, + "learning_rate": 3.719991074263662e-08, + "loss": 0.86109996, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08813477, + "step": 15640, + "time_per_iteration": 2.539466619491577 + }, + { + "auxiliary_loss_clip": 0.06403446, + "auxiliary_loss_mlp": 0.01264478, + "balance_loss_clip": 0.06269012, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9403877949797084, + "flos": 26697453143040.0, + "grad_norm": 1.431088063022994, + "language_loss": 0.74448258, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.82116181, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09063721, + "step": 15641, + "time_per_iteration": 2.5854341983795166 + }, + { + "auxiliary_loss_clip": 0.06407844, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.01257466, + "epoch": 0.9404479182323764, + "flos": 15016856054400.0, + "grad_norm": 1.9684805464288027, + "language_loss": 0.82889009, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.90565127, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10803223, + "step": 15642, + "time_per_iteration": 2.4660263061523438 + }, + { + "auxiliary_loss_clip": 0.06396569, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.01258441, + "epoch": 0.9405080414850443, + "flos": 24980645237760.0, + "grad_norm": 1.7977314668470241, + "language_loss": 0.68295997, + "learning_rate": 3.697594633355084e-08, + "loss": 0.75959563, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08557129, + "step": 15643, + "time_per_iteration": 2.5947160720825195 + }, + { + "auxiliary_loss_clip": 0.06406666, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 0.06273131, + "balance_loss_mlp": 0.0125535, + "epoch": 0.9405681647377123, + "flos": 20850131819520.0, + "grad_norm": 1.9653990343363072, + "language_loss": 0.76734209, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.84405702, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0947876, + "step": 15644, + "time_per_iteration": 2.5140726566314697 + }, + { + "auxiliary_loss_clip": 0.06398097, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 0.06269826, + "balance_loss_mlp": 0.0125598, + "epoch": 0.9406282879903802, + "flos": 23812380086400.0, + "grad_norm": 1.5018434731522488, + "language_loss": 0.6776011, + "learning_rate": 3.682700891311974e-08, + "loss": 0.75422859, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08685303, + "step": 15645, + "time_per_iteration": 2.5149364471435547 + }, + { + "auxiliary_loss_clip": 0.06395334, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06269147, + "balance_loss_mlp": 0.01257261, + "epoch": 0.9406884112430483, + "flos": 27682716977280.0, + "grad_norm": 1.3496847114989412, + "language_loss": 0.70362568, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.78024125, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08953857, + "step": 15646, + "time_per_iteration": 2.565032958984375 + }, + { + "auxiliary_loss_clip": 0.06398815, + "auxiliary_loss_mlp": 0.01261727, + "balance_loss_clip": 0.06269072, + "balance_loss_mlp": 0.01253018, + "epoch": 0.9407485344957163, + "flos": 23081590702080.0, + "grad_norm": 1.5299768389325743, + "language_loss": 0.74550891, + "learning_rate": 3.667836926755208e-08, + "loss": 0.82211429, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08709717, + "step": 15647, + "time_per_iteration": 2.52329158782959 + }, + { + "auxiliary_loss_clip": 0.06308979, + "auxiliary_loss_mlp": 0.01247889, + "balance_loss_clip": 0.06254758, + "balance_loss_mlp": 0.01246815, + "epoch": 0.9408086577483842, + "flos": 71034143247360.0, + "grad_norm": 0.8645069850890814, + "language_loss": 0.63526928, + "learning_rate": 3.660416111738907e-08, + "loss": 0.71083796, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01075745, + "step": 15648, + "time_per_iteration": 3.233332872390747 + }, + { + "auxiliary_loss_clip": 0.06401809, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06273667, + "balance_loss_mlp": 0.01253027, + "epoch": 0.9408687810010522, + "flos": 23737468936320.0, + "grad_norm": 1.3199036053586422, + "language_loss": 0.66599685, + "learning_rate": 3.653002741939337e-08, + "loss": 0.74263257, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08740234, + "step": 15649, + "time_per_iteration": 2.5568881034851074 + }, + { + "auxiliary_loss_clip": 0.06399603, + "auxiliary_loss_mlp": 0.01263735, + "balance_loss_clip": 0.06268597, + "balance_loss_mlp": 0.01254967, + "epoch": 0.9409289042537201, + "flos": 18375225298560.0, + "grad_norm": 2.070554549702626, + "language_loss": 0.77568704, + "learning_rate": 3.645596817637586e-08, + "loss": 0.85232043, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08770752, + "step": 15650, + "time_per_iteration": 2.4954206943511963 + }, + { + "auxiliary_loss_clip": 0.06402092, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06272111, + "balance_loss_mlp": 0.01254596, + "epoch": 0.9409890275063881, + "flos": 23885111030400.0, + "grad_norm": 1.6619608167936917, + "language_loss": 0.74290323, + "learning_rate": 3.638198339114451e-08, + "loss": 0.81955653, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08630371, + "step": 15651, + "time_per_iteration": 2.5114126205444336 + }, + { + "auxiliary_loss_clip": 0.06400727, + "auxiliary_loss_mlp": 0.01262851, + "balance_loss_clip": 0.06271733, + "balance_loss_mlp": 0.01253607, + "epoch": 0.941049150759056, + "flos": 16550704569600.0, + "grad_norm": 1.9371023578664908, + "language_loss": 0.72369295, + "learning_rate": 3.630807306650507e-08, + "loss": 0.80032873, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0925293, + "step": 15652, + "time_per_iteration": 2.490548849105835 + }, + { + "auxiliary_loss_clip": 0.06408902, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06270728, + "balance_loss_mlp": 0.01254592, + "epoch": 0.9411092740117241, + "flos": 25125310512000.0, + "grad_norm": 1.5890222954041313, + "language_loss": 0.66336501, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.7401008, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10076904, + "step": 15653, + "time_per_iteration": 3.934385061264038 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06269339, + "balance_loss_mlp": 0.01253767, + "epoch": 0.941169397264392, + "flos": 21148644389760.0, + "grad_norm": 1.8935835038310136, + "language_loss": 0.78094435, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.85761338, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10247803, + "step": 15654, + "time_per_iteration": 2.5380873680114746 + }, + { + "auxiliary_loss_clip": 0.06412641, + "auxiliary_loss_mlp": 0.01263841, + "balance_loss_clip": 0.06273723, + "balance_loss_mlp": 0.01254489, + "epoch": 0.94122952051706, + "flos": 38518103531520.0, + "grad_norm": 2.6656013558269, + "language_loss": 0.70125389, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.77801865, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09350586, + "step": 15655, + "time_per_iteration": 2.662172317504883 + }, + { + "auxiliary_loss_clip": 0.06398033, + "auxiliary_loss_mlp": 0.01262956, + "balance_loss_clip": 0.0626789, + "balance_loss_mlp": 0.01253169, + "epoch": 0.9412896437697279, + "flos": 18375099517440.0, + "grad_norm": 1.6963116521742299, + "language_loss": 0.7260558, + "learning_rate": 3.601317642987944e-08, + "loss": 0.80266565, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09783936, + "step": 15656, + "time_per_iteration": 2.478374481201172 + }, + { + "auxiliary_loss_clip": 0.06401219, + "auxiliary_loss_mlp": 0.01263672, + "balance_loss_clip": 0.06271031, + "balance_loss_mlp": 0.01254182, + "epoch": 0.9413497670223959, + "flos": 25892046097920.0, + "grad_norm": 1.89156015011812, + "language_loss": 0.78345996, + "learning_rate": 3.593963845018377e-08, + "loss": 0.86010885, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09490967, + "step": 15657, + "time_per_iteration": 2.5166099071502686 + }, + { + "auxiliary_loss_clip": 0.06401125, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06268708, + "balance_loss_mlp": 0.01255758, + "epoch": 0.9414098902750638, + "flos": 16623980565120.0, + "grad_norm": 3.3736293450967505, + "language_loss": 0.84897089, + "learning_rate": 3.586617494785371e-08, + "loss": 0.92563248, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09289551, + "step": 15658, + "time_per_iteration": 2.4719231128692627 + }, + { + "auxiliary_loss_clip": 0.06407331, + "auxiliary_loss_mlp": 0.01266897, + "balance_loss_clip": 0.06271299, + "balance_loss_mlp": 0.01256049, + "epoch": 0.9414700135277319, + "flos": 18631041413760.0, + "grad_norm": 2.0197764771126936, + "language_loss": 0.71193194, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.78867424, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10848999, + "step": 15659, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0640014, + "auxiliary_loss_mlp": 0.01267204, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01258019, + "epoch": 0.9415301367803999, + "flos": 26286280358400.0, + "grad_norm": 1.643546636264258, + "language_loss": 0.79811406, + "learning_rate": 3.571947138643172e-08, + "loss": 0.87478751, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09191895, + "step": 15660, + "time_per_iteration": 2.5389978885650635 + }, + { + "auxiliary_loss_clip": 0.06393769, + "auxiliary_loss_mlp": 0.01262754, + "balance_loss_clip": 0.06267805, + "balance_loss_mlp": 0.01253766, + "epoch": 0.9415902600330678, + "flos": 23268617015040.0, + "grad_norm": 1.3569546875428349, + "language_loss": 0.68124604, + "learning_rate": 3.564623133290201e-08, + "loss": 0.75781125, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08984375, + "step": 15661, + "time_per_iteration": 3.984410285949707 + }, + { + "auxiliary_loss_clip": 0.06403223, + "auxiliary_loss_mlp": 0.01268302, + "balance_loss_clip": 0.06272446, + "balance_loss_mlp": 0.01258783, + "epoch": 0.9416503832857358, + "flos": 14724171342720.0, + "grad_norm": 1.8492726006521825, + "language_loss": 0.6645698, + "learning_rate": 3.557306576786434e-08, + "loss": 0.74128503, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09521484, + "step": 15662, + "time_per_iteration": 2.4829232692718506 + }, + { + "auxiliary_loss_clip": 0.06309918, + "auxiliary_loss_mlp": 0.01248909, + "balance_loss_clip": 0.06255955, + "balance_loss_mlp": 0.0124794, + "epoch": 0.9417105065384037, + "flos": 70331333927040.0, + "grad_norm": 0.7645309383813702, + "language_loss": 0.59303248, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.66862071, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00967407, + "step": 15663, + "time_per_iteration": 3.204615354537964 + }, + { + "auxiliary_loss_clip": 0.06405449, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269373, + "balance_loss_mlp": 0.01255437, + "epoch": 0.9417706297910717, + "flos": 34066380286080.0, + "grad_norm": 1.7732726183519205, + "language_loss": 0.66930187, + "learning_rate": 3.542695811435914e-08, + "loss": 0.74601436, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1036377, + "step": 15664, + "time_per_iteration": 2.6080029010772705 + }, + { + "auxiliary_loss_clip": 0.06399654, + "auxiliary_loss_mlp": 0.0126386, + "balance_loss_clip": 0.06270635, + "balance_loss_mlp": 0.01254973, + "epoch": 0.9418307530437396, + "flos": 16477135084800.0, + "grad_norm": 2.310935997550932, + "language_loss": 0.74091578, + "learning_rate": 3.535401603143207e-08, + "loss": 0.81755096, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08880615, + "step": 15665, + "time_per_iteration": 2.483211040496826 + }, + { + "auxiliary_loss_clip": 0.06396838, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06268667, + "balance_loss_mlp": 0.01256026, + "epoch": 0.9418908762964077, + "flos": 11258089274880.0, + "grad_norm": 2.6110981514445366, + "language_loss": 0.6352722, + "learning_rate": 3.528114844807773e-08, + "loss": 0.7118901, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08929443, + "step": 15666, + "time_per_iteration": 2.5411856174468994 + }, + { + "auxiliary_loss_clip": 0.06402782, + "auxiliary_loss_mlp": 0.01263561, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01254063, + "epoch": 0.9419509995490756, + "flos": 18444182808960.0, + "grad_norm": 1.991838709857188, + "language_loss": 0.78680706, + "learning_rate": 3.520835536705902e-08, + "loss": 0.86347044, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09500122, + "step": 15667, + "time_per_iteration": 3.9452993869781494 + }, + { + "auxiliary_loss_clip": 0.06400198, + "auxiliary_loss_mlp": 0.01262756, + "balance_loss_clip": 0.06271772, + "balance_loss_mlp": 0.01254638, + "epoch": 0.9420111228017436, + "flos": 20743760661120.0, + "grad_norm": 1.6944923844867426, + "language_loss": 0.75551254, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.83214211, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08123779, + "step": 15668, + "time_per_iteration": 2.524935007095337 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06270599, + "balance_loss_mlp": 0.01260168, + "epoch": 0.9420712460544115, + "flos": 21148267046400.0, + "grad_norm": 2.4528189170116774, + "language_loss": 0.59678006, + "learning_rate": 3.506299272306723e-08, + "loss": 0.67351627, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09564209, + "step": 15669, + "time_per_iteration": 2.4999589920043945 + }, + { + "auxiliary_loss_clip": 0.06396198, + "auxiliary_loss_mlp": 0.0126024, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01251812, + "epoch": 0.9421313693070795, + "flos": 15857244979200.0, + "grad_norm": 1.4484921317506239, + "language_loss": 0.77208281, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.84864712, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08422852, + "step": 15670, + "time_per_iteration": 2.4799532890319824 + }, + { + "auxiliary_loss_clip": 0.06402656, + "auxiliary_loss_mlp": 0.01264404, + "balance_loss_clip": 0.06273, + "balance_loss_mlp": 0.01254748, + "epoch": 0.9421914925597474, + "flos": 32424106187520.0, + "grad_norm": 2.128403859031794, + "language_loss": 0.65426135, + "learning_rate": 3.491792812150574e-08, + "loss": 0.73093194, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09667969, + "step": 15671, + "time_per_iteration": 3.975170850753784 + }, + { + "auxiliary_loss_clip": 0.06401955, + "auxiliary_loss_mlp": 0.01267564, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01257986, + "epoch": 0.9422516158124155, + "flos": 19724521196160.0, + "grad_norm": 1.5351118428964867, + "language_loss": 0.79441094, + "learning_rate": 3.48455075935139e-08, + "loss": 0.87110615, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09576416, + "step": 15672, + "time_per_iteration": 2.4977033138275146 + }, + { + "auxiliary_loss_clip": 0.06408137, + "auxiliary_loss_mlp": 0.01264621, + "balance_loss_clip": 0.06270933, + "balance_loss_mlp": 0.01254214, + "epoch": 0.9423117390650835, + "flos": 16258858398720.0, + "grad_norm": 1.991030547608086, + "language_loss": 0.74059123, + "learning_rate": 3.47731615843776e-08, + "loss": 0.8173188, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10412598, + "step": 15673, + "time_per_iteration": 2.448622226715088 + }, + { + "auxiliary_loss_clip": 0.0639824, + "auxiliary_loss_mlp": 0.01263085, + "balance_loss_clip": 0.06268054, + "balance_loss_mlp": 0.01253715, + "epoch": 0.9423718623177514, + "flos": 31804803060480.0, + "grad_norm": 1.4075068342748132, + "language_loss": 0.70376456, + "learning_rate": 3.470089009683974e-08, + "loss": 0.78037775, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09368896, + "step": 15674, + "time_per_iteration": 2.5917158126831055 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01264344, + "balance_loss_clip": 0.06269686, + "balance_loss_mlp": 0.01255684, + "epoch": 0.9424319855704194, + "flos": 23338622701440.0, + "grad_norm": 1.6840645348051175, + "language_loss": 0.81582546, + "learning_rate": 3.462869313364125e-08, + "loss": 0.89248359, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08660889, + "step": 15675, + "time_per_iteration": 2.5051825046539307 + }, + { + "auxiliary_loss_clip": 0.06400142, + "auxiliary_loss_mlp": 0.01265582, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257076, + "epoch": 0.9424921088230873, + "flos": 20783983420800.0, + "grad_norm": 1.5494780490790538, + "language_loss": 0.63124716, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.7079044, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08508301, + "step": 15676, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.06400351, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.06268977, + "balance_loss_mlp": 0.01254709, + "epoch": 0.9425522320757553, + "flos": 19032780614400.0, + "grad_norm": 1.7622357826868196, + "language_loss": 0.67433226, + "learning_rate": 3.448452279120984e-08, + "loss": 0.7509855, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1026001, + "step": 15677, + "time_per_iteration": 2.5142791271209717 + }, + { + "auxiliary_loss_clip": 0.06405545, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01253458, + "epoch": 0.9426123553284232, + "flos": 25162346816640.0, + "grad_norm": 1.7717990036864524, + "language_loss": 0.64982033, + "learning_rate": 3.441254941744387e-08, + "loss": 0.72651047, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10003662, + "step": 15678, + "time_per_iteration": 2.5930380821228027 + }, + { + "auxiliary_loss_clip": 0.06398059, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06267848, + "balance_loss_mlp": 0.01258092, + "epoch": 0.9426724785810913, + "flos": 21185848402560.0, + "grad_norm": 1.4818609891623467, + "language_loss": 0.74543768, + "learning_rate": 3.434065057895097e-08, + "loss": 0.82209063, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09155273, + "step": 15679, + "time_per_iteration": 2.4969890117645264 + }, + { + "auxiliary_loss_clip": 0.06406982, + "auxiliary_loss_mlp": 0.01267063, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01257223, + "epoch": 0.9427326018337592, + "flos": 14762171969280.0, + "grad_norm": 2.028620141533925, + "language_loss": 0.77248597, + "learning_rate": 3.426882627845762e-08, + "loss": 0.84922642, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09832764, + "step": 15680, + "time_per_iteration": 2.4729225635528564 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01259439, + "epoch": 0.9427927250864272, + "flos": 20930032287360.0, + "grad_norm": 1.7948180035587007, + "language_loss": 0.75664496, + "learning_rate": 3.419707651868742e-08, + "loss": 0.833354, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09735107, + "step": 15681, + "time_per_iteration": 2.5000479221343994 + }, + { + "auxiliary_loss_clip": 0.06404436, + "auxiliary_loss_mlp": 0.01266864, + "balance_loss_clip": 0.06271823, + "balance_loss_mlp": 0.0125725, + "epoch": 0.9428528483390951, + "flos": 19758119483520.0, + "grad_norm": 1.682204296334067, + "language_loss": 0.65451252, + "learning_rate": 3.412540130236086e-08, + "loss": 0.73122549, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09613037, + "step": 15682, + "time_per_iteration": 2.5290274620056152 + }, + { + "auxiliary_loss_clip": 0.06400858, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06269148, + "balance_loss_mlp": 0.01253365, + "epoch": 0.9429129715917631, + "flos": 24541869732480.0, + "grad_norm": 1.6078440758053596, + "language_loss": 0.76264083, + "learning_rate": 3.405380063219665e-08, + "loss": 0.83928025, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09716797, + "step": 15683, + "time_per_iteration": 2.5387845039367676 + }, + { + "auxiliary_loss_clip": 0.06404649, + "auxiliary_loss_mlp": 0.01266852, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01256398, + "epoch": 0.942973094844431, + "flos": 17964304076160.0, + "grad_norm": 2.5267719992452076, + "language_loss": 0.75809973, + "learning_rate": 3.398227451090885e-08, + "loss": 0.83481473, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10461426, + "step": 15684, + "time_per_iteration": 2.483170747756958 + }, + { + "auxiliary_loss_clip": 0.06397957, + "auxiliary_loss_mlp": 0.01264368, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01255523, + "epoch": 0.9430332180970991, + "flos": 26144382268800.0, + "grad_norm": 1.5399234901397196, + "language_loss": 0.77343988, + "learning_rate": 3.391082294121017e-08, + "loss": 0.85006315, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08843994, + "step": 15685, + "time_per_iteration": 2.5491085052490234 + }, + { + "auxiliary_loss_clip": 0.06397514, + "auxiliary_loss_mlp": 0.01261396, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01252807, + "epoch": 0.943093341349767, + "flos": 23958177390720.0, + "grad_norm": 1.7162540789171723, + "language_loss": 0.76184905, + "learning_rate": 3.383944592581023e-08, + "loss": 0.83843815, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0859375, + "step": 15686, + "time_per_iteration": 2.588693857192993 + }, + { + "auxiliary_loss_clip": 0.06403645, + "auxiliary_loss_mlp": 0.01264923, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01255572, + "epoch": 0.943153464602435, + "flos": 17974324638720.0, + "grad_norm": 1.6255235883785641, + "language_loss": 0.80987608, + "learning_rate": 3.376814346741575e-08, + "loss": 0.88656175, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09350586, + "step": 15687, + "time_per_iteration": 2.4934589862823486 + }, + { + "auxiliary_loss_clip": 0.06407475, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271624, + "balance_loss_mlp": 0.0125503, + "epoch": 0.943213587855103, + "flos": 14506733197440.0, + "grad_norm": 2.2198187889767516, + "language_loss": 0.7578727, + "learning_rate": 3.369691556873011e-08, + "loss": 0.83460218, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10443115, + "step": 15688, + "time_per_iteration": 2.497774600982666 + }, + { + "auxiliary_loss_clip": 0.0639424, + "auxiliary_loss_mlp": 0.01264677, + "balance_loss_clip": 0.06269427, + "balance_loss_mlp": 0.01255188, + "epoch": 0.9432737111077709, + "flos": 28994054175360.0, + "grad_norm": 1.6545855096259856, + "language_loss": 0.68633425, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.76292336, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.0947876, + "step": 15689, + "time_per_iteration": 2.6034674644470215 + }, + { + "auxiliary_loss_clip": 0.06400025, + "auxiliary_loss_mlp": 0.01265711, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01257444, + "epoch": 0.9433338343604389, + "flos": 21614267928960.0, + "grad_norm": 1.6339942455994367, + "language_loss": 0.80775511, + "learning_rate": 3.35546834612872e-08, + "loss": 0.88441241, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0826416, + "step": 15690, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.06400111, + "auxiliary_loss_mlp": 0.01261797, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.01252052, + "epoch": 0.9433939576131068, + "flos": 33190632138240.0, + "grad_norm": 1.7354077420100367, + "language_loss": 0.60600984, + "learning_rate": 3.348367925792317e-08, + "loss": 0.68262887, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09735107, + "step": 15691, + "time_per_iteration": 2.606536626815796 + }, + { + "auxiliary_loss_clip": 0.06404334, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06272846, + "balance_loss_mlp": 0.01256769, + "epoch": 0.9434540808657749, + "flos": 20492808082560.0, + "grad_norm": 1.4492750689861678, + "language_loss": 0.6661129, + "learning_rate": 3.341274962505514e-08, + "loss": 0.74281824, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09436035, + "step": 15692, + "time_per_iteration": 2.498673439025879 + }, + { + "auxiliary_loss_clip": 0.06399876, + "auxiliary_loss_mlp": 0.01265516, + "balance_loss_clip": 0.06269374, + "balance_loss_mlp": 0.01255997, + "epoch": 0.9435142041184428, + "flos": 21549293487360.0, + "grad_norm": 2.3030634231510545, + "language_loss": 0.74972957, + "learning_rate": 3.334189456537251e-08, + "loss": 0.82638347, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09515381, + "step": 15693, + "time_per_iteration": 3.9923908710479736 + }, + { + "auxiliary_loss_clip": 0.06400185, + "auxiliary_loss_mlp": 0.01262209, + "balance_loss_clip": 0.06271058, + "balance_loss_mlp": 0.01252881, + "epoch": 0.9435743273711108, + "flos": 25016004460800.0, + "grad_norm": 1.5946007545759409, + "language_loss": 0.73723388, + "learning_rate": 3.327111408156291e-08, + "loss": 0.81385785, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09338379, + "step": 15694, + "time_per_iteration": 2.516932487487793 + }, + { + "auxiliary_loss_clip": 0.06313274, + "auxiliary_loss_mlp": 0.01251927, + "balance_loss_clip": 0.06259228, + "balance_loss_mlp": 0.01250888, + "epoch": 0.9436344506237787, + "flos": 60179916723840.0, + "grad_norm": 0.6942834206013441, + "language_loss": 0.50500864, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.5806607, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039886, + "step": 15695, + "time_per_iteration": 3.178891181945801 + }, + { + "auxiliary_loss_clip": 0.06396429, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.01259691, + "epoch": 0.9436945738764467, + "flos": 22243885107840.0, + "grad_norm": 1.5773322030260613, + "language_loss": 0.65293247, + "learning_rate": 3.312977685229335e-08, + "loss": 0.72957647, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08276367, + "step": 15696, + "time_per_iteration": 2.501094341278076 + }, + { + "auxiliary_loss_clip": 0.06403381, + "auxiliary_loss_mlp": 0.01261862, + "balance_loss_clip": 0.0627207, + "balance_loss_mlp": 0.01252683, + "epoch": 0.9437546971291146, + "flos": 25052034516480.0, + "grad_norm": 1.6284029505922766, + "language_loss": 0.66615683, + "learning_rate": 3.305922011219353e-08, + "loss": 0.7428093, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09179688, + "step": 15697, + "time_per_iteration": 2.541961431503296 + }, + { + "auxiliary_loss_clip": 0.06310762, + "auxiliary_loss_mlp": 0.01253679, + "balance_loss_clip": 0.06256643, + "balance_loss_mlp": 0.01252642, + "epoch": 0.9438148203817827, + "flos": 56809556346240.0, + "grad_norm": 0.844263571757514, + "language_loss": 0.63148797, + "learning_rate": 3.298873795868506e-08, + "loss": 0.70713234, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01036835, + "step": 15698, + "time_per_iteration": 3.051950216293335 + }, + { + "auxiliary_loss_clip": 0.06405546, + "auxiliary_loss_mlp": 0.0126485, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01255081, + "epoch": 0.9438749436344506, + "flos": 22352981523840.0, + "grad_norm": 1.8322973887510348, + "language_loss": 0.69760531, + "learning_rate": 3.291833039444092e-08, + "loss": 0.77430928, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09759521, + "step": 15699, + "time_per_iteration": 2.504598379135132 + }, + { + "auxiliary_loss_clip": 0.06397957, + "auxiliary_loss_mlp": 0.01264465, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01255441, + "epoch": 0.9439350668871186, + "flos": 13375881694080.0, + "grad_norm": 2.165048866443223, + "language_loss": 0.74769372, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.82431793, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.090271, + "step": 15700, + "time_per_iteration": 2.4962573051452637 + }, + { + "auxiliary_loss_clip": 0.06398397, + "auxiliary_loss_mlp": 0.01263164, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254033, + "epoch": 0.9439951901397866, + "flos": 17791113686400.0, + "grad_norm": 1.531110206414724, + "language_loss": 0.7072165, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.78383207, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09136963, + "step": 15701, + "time_per_iteration": 3.9256973266601562 + }, + { + "auxiliary_loss_clip": 0.06410138, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.01254095, + "epoch": 0.9440553133924545, + "flos": 18885473936640.0, + "grad_norm": 1.6976214240404868, + "language_loss": 0.78259611, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.85933489, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.09649658, + "step": 15702, + "time_per_iteration": 2.5262832641601562 + }, + { + "auxiliary_loss_clip": 0.06404, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06271169, + "balance_loss_mlp": 0.01254548, + "epoch": 0.9441154366451225, + "flos": 19579017381120.0, + "grad_norm": 2.6087828966167326, + "language_loss": 0.66408789, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.74076939, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0960083, + "step": 15703, + "time_per_iteration": 2.4908831119537354 + }, + { + "auxiliary_loss_clip": 0.06404126, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06271374, + "balance_loss_mlp": 0.01255685, + "epoch": 0.9441755598977905, + "flos": 30302037210240.0, + "grad_norm": 1.5526862694072474, + "language_loss": 0.73514414, + "learning_rate": 3.256741150552833e-08, + "loss": 0.81183422, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09191895, + "step": 15704, + "time_per_iteration": 2.578453302383423 + }, + { + "auxiliary_loss_clip": 0.06397037, + "auxiliary_loss_mlp": 0.01265014, + "balance_loss_clip": 0.06270902, + "balance_loss_mlp": 0.01255686, + "epoch": 0.9442356831504585, + "flos": 20674174245120.0, + "grad_norm": 1.9988169073450903, + "language_loss": 0.74711281, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.82373333, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09332275, + "step": 15705, + "time_per_iteration": 2.50264835357666 + }, + { + "auxiliary_loss_clip": 0.06400542, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01255139, + "epoch": 0.9442958064031264, + "flos": 16112809532160.0, + "grad_norm": 1.6809193926837838, + "language_loss": 0.77485085, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.8514998, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09222412, + "step": 15706, + "time_per_iteration": 2.4911396503448486 + }, + { + "auxiliary_loss_clip": 0.06395966, + "auxiliary_loss_mlp": 0.01261484, + "balance_loss_clip": 0.06269921, + "balance_loss_mlp": 0.01252537, + "epoch": 0.9443559296557944, + "flos": 20453381936640.0, + "grad_norm": 2.6863035412051612, + "language_loss": 0.69485629, + "learning_rate": 3.23577554137866e-08, + "loss": 0.77143085, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08953857, + "step": 15707, + "time_per_iteration": 3.932788133621216 + }, + { + "auxiliary_loss_clip": 0.06392172, + "auxiliary_loss_mlp": 0.01261239, + "balance_loss_clip": 0.06267689, + "balance_loss_mlp": 0.0125284, + "epoch": 0.9444160529084623, + "flos": 21616406208000.0, + "grad_norm": 1.7233425168990235, + "language_loss": 0.69313765, + "learning_rate": 3.22880192727244e-08, + "loss": 0.76967174, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08398438, + "step": 15708, + "time_per_iteration": 2.488739490509033 + }, + { + "auxiliary_loss_clip": 0.06398219, + "auxiliary_loss_mlp": 0.01263278, + "balance_loss_clip": 0.06269833, + "balance_loss_mlp": 0.01254599, + "epoch": 0.9444761761611303, + "flos": 18447620826240.0, + "grad_norm": 2.4449285040700905, + "language_loss": 0.7077049, + "learning_rate": 3.221835774749748e-08, + "loss": 0.78431988, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08679199, + "step": 15709, + "time_per_iteration": 2.486844539642334 + }, + { + "auxiliary_loss_clip": 0.06396931, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06268953, + "balance_loss_mlp": 0.01255675, + "epoch": 0.9445362994137982, + "flos": 20963043596160.0, + "grad_norm": 1.9344210100070667, + "language_loss": 0.85356987, + "learning_rate": 3.214877084074774e-08, + "loss": 0.93018436, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08837891, + "step": 15710, + "time_per_iteration": 2.477931261062622 + }, + { + "auxiliary_loss_clip": 0.06406383, + "auxiliary_loss_mlp": 0.0126325, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253284, + "epoch": 0.9445964226664663, + "flos": 20309555203200.0, + "grad_norm": 1.6267551376340164, + "language_loss": 0.71685177, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.79354811, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09967041, + "step": 15711, + "time_per_iteration": 3.9364025592803955 + }, + { + "auxiliary_loss_clip": 0.06402559, + "auxiliary_loss_mlp": 0.01262817, + "balance_loss_clip": 0.06272049, + "balance_loss_mlp": 0.01254096, + "epoch": 0.9446565459191342, + "flos": 26403259057920.0, + "grad_norm": 2.3323613984996707, + "language_loss": 0.69751537, + "learning_rate": 3.200982089323179e-08, + "loss": 0.77416909, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.0871582, + "step": 15712, + "time_per_iteration": 2.556997060775757 + }, + { + "auxiliary_loss_clip": 0.0640899, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01255212, + "epoch": 0.9447166691718022, + "flos": 16550327226240.0, + "grad_norm": 2.2946300657355976, + "language_loss": 0.70720011, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.78394854, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10650635, + "step": 15713, + "time_per_iteration": 2.5120773315429688 + }, + { + "auxiliary_loss_clip": 0.0639579, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06270416, + "balance_loss_mlp": 0.01256828, + "epoch": 0.9447767924244702, + "flos": 29171604977280.0, + "grad_norm": 1.4532838118975553, + "language_loss": 0.76606899, + "learning_rate": 3.187116945125212e-08, + "loss": 0.84268916, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09399414, + "step": 15714, + "time_per_iteration": 2.5846641063690186 + }, + { + "auxiliary_loss_clip": 0.06404714, + "auxiliary_loss_mlp": 0.01265239, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01255577, + "epoch": 0.9448369156771381, + "flos": 19279875905280.0, + "grad_norm": 1.7877405259726427, + "language_loss": 0.68124247, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.75794196, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09661865, + "step": 15715, + "time_per_iteration": 2.517007350921631 + }, + { + "auxiliary_loss_clip": 0.06405981, + "auxiliary_loss_mlp": 0.01265021, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.01254763, + "epoch": 0.9448970389298061, + "flos": 23847823163520.0, + "grad_norm": 1.7071461081986556, + "language_loss": 0.74850857, + "learning_rate": 3.173281653583948e-08, + "loss": 0.82521862, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1026001, + "step": 15716, + "time_per_iteration": 2.5198490619659424 + }, + { + "auxiliary_loss_clip": 0.06407739, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06275283, + "balance_loss_mlp": 0.01255078, + "epoch": 0.944957162182474, + "flos": 22388760017280.0, + "grad_norm": 1.6811142354543167, + "language_loss": 0.62509549, + "learning_rate": 3.166375203215565e-08, + "loss": 0.70181704, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09338379, + "step": 15717, + "time_per_iteration": 2.5217764377593994 + }, + { + "auxiliary_loss_clip": 0.06400305, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01255444, + "epoch": 0.9450172854351421, + "flos": 17389584120960.0, + "grad_norm": 1.5696006706759635, + "language_loss": 0.7965737, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.87322432, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09313965, + "step": 15718, + "time_per_iteration": 2.4564990997314453 + }, + { + "auxiliary_loss_clip": 0.06306401, + "auxiliary_loss_mlp": 0.01249456, + "balance_loss_clip": 0.06252193, + "balance_loss_mlp": 0.01248478, + "epoch": 0.94507740868781, + "flos": 68487092760960.0, + "grad_norm": 0.6920512223947758, + "language_loss": 0.57755935, + "learning_rate": 3.152584694592719e-08, + "loss": 0.65311795, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00977325, + "step": 15719, + "time_per_iteration": 3.150592565536499 + }, + { + "auxiliary_loss_clip": 0.06405877, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06272814, + "balance_loss_mlp": 0.0125797, + "epoch": 0.945137531940478, + "flos": 21148895952000.0, + "grad_norm": 1.5595416281624737, + "language_loss": 0.75960934, + "learning_rate": 3.145700636861193e-08, + "loss": 0.83634359, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09570312, + "step": 15720, + "time_per_iteration": 2.521066427230835 + }, + { + "auxiliary_loss_clip": 0.06395644, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.0626734, + "balance_loss_mlp": 0.01256763, + "epoch": 0.9451976551931459, + "flos": 24540611921280.0, + "grad_norm": 1.9699653920542373, + "language_loss": 0.73071945, + "learning_rate": 3.138824043864452e-08, + "loss": 0.80732661, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08300781, + "step": 15721, + "time_per_iteration": 2.525794267654419 + }, + { + "auxiliary_loss_clip": 0.06402142, + "auxiliary_loss_mlp": 0.01262673, + "balance_loss_clip": 0.06270024, + "balance_loss_mlp": 0.01253369, + "epoch": 0.9452577784458139, + "flos": 23447299847040.0, + "grad_norm": 1.718614090375189, + "language_loss": 0.85034347, + "learning_rate": 3.131954915863244e-08, + "loss": 0.92699158, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09301758, + "step": 15722, + "time_per_iteration": 2.536926746368408 + }, + { + "auxiliary_loss_clip": 0.06309976, + "auxiliary_loss_mlp": 0.01254115, + "balance_loss_clip": 0.06255897, + "balance_loss_mlp": 0.01253094, + "epoch": 0.9453179016984818, + "flos": 52036749054720.0, + "grad_norm": 0.884744124121599, + "language_loss": 0.64469177, + "learning_rate": 3.125093253118005e-08, + "loss": 0.72033274, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01020813, + "step": 15723, + "time_per_iteration": 3.1003150939941406 + }, + { + "auxiliary_loss_clip": 0.06405857, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.0627241, + "balance_loss_mlp": 0.01255646, + "epoch": 0.9453780249511499, + "flos": 13476886191360.0, + "grad_norm": 1.970769174235418, + "language_loss": 0.7331022, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.80982006, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10266113, + "step": 15724, + "time_per_iteration": 2.4845023155212402 + }, + { + "auxiliary_loss_clip": 0.06401257, + "auxiliary_loss_mlp": 0.01266566, + "balance_loss_clip": 0.0627144, + "balance_loss_mlp": 0.01257625, + "epoch": 0.9454381482038178, + "flos": 23265262851840.0, + "grad_norm": 1.854039175790055, + "language_loss": 0.84987056, + "learning_rate": 3.111392324436024e-08, + "loss": 0.92654884, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08947754, + "step": 15725, + "time_per_iteration": 2.5003042221069336 + }, + { + "auxiliary_loss_clip": 0.06403221, + "auxiliary_loss_mlp": 0.01262907, + "balance_loss_clip": 0.06270561, + "balance_loss_mlp": 0.01253627, + "epoch": 0.9454982714564858, + "flos": 19502093733120.0, + "grad_norm": 1.8779217955872736, + "language_loss": 0.71166205, + "learning_rate": 3.104553059018822e-08, + "loss": 0.78832328, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09277344, + "step": 15726, + "time_per_iteration": 2.5910589694976807 + }, + { + "auxiliary_loss_clip": 0.06402659, + "auxiliary_loss_mlp": 0.0126494, + "balance_loss_clip": 0.062715, + "balance_loss_mlp": 0.0125532, + "epoch": 0.9455583947091538, + "flos": 23264801654400.0, + "grad_norm": 1.8879911426467153, + "language_loss": 0.61094165, + "learning_rate": 3.097721259896735e-08, + "loss": 0.68761766, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09619141, + "step": 15727, + "time_per_iteration": 2.636110782623291 + }, + { + "auxiliary_loss_clip": 0.06398436, + "auxiliary_loss_mlp": 0.01268185, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01259614, + "epoch": 0.9456185179618217, + "flos": 17678327690880.0, + "grad_norm": 1.7197111625111396, + "language_loss": 0.82013702, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.8968032, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08569336, + "step": 15728, + "time_per_iteration": 2.5550687313079834 + }, + { + "auxiliary_loss_clip": 0.06308329, + "auxiliary_loss_mlp": 0.01249812, + "balance_loss_clip": 0.0625433, + "balance_loss_mlp": 0.01248773, + "epoch": 0.9456786412144897, + "flos": 61433002535040.0, + "grad_norm": 0.7391636345974608, + "language_loss": 0.58712065, + "learning_rate": 3.08408006157368e-08, + "loss": 0.66270202, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039124, + "step": 15729, + "time_per_iteration": 3.104180335998535 + }, + { + "auxiliary_loss_clip": 0.06399846, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06271389, + "balance_loss_mlp": 0.01255465, + "epoch": 0.9457387644671577, + "flos": 18594340525440.0, + "grad_norm": 2.1443897362387814, + "language_loss": 0.77353084, + "learning_rate": 3.077270662890052e-08, + "loss": 0.85017467, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09082031, + "step": 15730, + "time_per_iteration": 2.5131759643554688 + }, + { + "auxiliary_loss_clip": 0.06399836, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06267837, + "balance_loss_mlp": 0.01257688, + "epoch": 0.9457988877198257, + "flos": 21115381518720.0, + "grad_norm": 1.6416517192605633, + "language_loss": 0.63005936, + "learning_rate": 3.070468731536047e-08, + "loss": 0.70673198, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09747314, + "step": 15731, + "time_per_iteration": 2.530729293823242 + }, + { + "auxiliary_loss_clip": 0.06402969, + "auxiliary_loss_mlp": 0.01262855, + "balance_loss_clip": 0.06271915, + "balance_loss_mlp": 0.01252955, + "epoch": 0.9458590109724936, + "flos": 26695734134400.0, + "grad_norm": 1.9292294773012948, + "language_loss": 0.6470663, + "learning_rate": 3.063674267769589e-08, + "loss": 0.7237246, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09899902, + "step": 15732, + "time_per_iteration": 3.9439215660095215 + }, + { + "auxiliary_loss_clip": 0.06409542, + "auxiliary_loss_mlp": 0.01262122, + "balance_loss_clip": 0.06273539, + "balance_loss_mlp": 0.01252383, + "epoch": 0.9459191342251616, + "flos": 18667616520960.0, + "grad_norm": 1.677687050760564, + "language_loss": 0.84323162, + "learning_rate": 3.056887271848363e-08, + "loss": 0.91994834, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09741211, + "step": 15733, + "time_per_iteration": 2.488312005996704 + }, + { + "auxiliary_loss_clip": 0.06393486, + "auxiliary_loss_mlp": 0.01264252, + "balance_loss_clip": 0.06267101, + "balance_loss_mlp": 0.01255633, + "epoch": 0.9459792574778295, + "flos": 23404226048640.0, + "grad_norm": 1.452807558700151, + "language_loss": 0.72373539, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.80031276, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08612061, + "step": 15734, + "time_per_iteration": 2.6829605102539062 + }, + { + "auxiliary_loss_clip": 0.06394021, + "auxiliary_loss_mlp": 0.01264276, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01256527, + "epoch": 0.9460393807304975, + "flos": 24400474767360.0, + "grad_norm": 1.566131852204227, + "language_loss": 0.86707246, + "learning_rate": 3.043335684570692e-08, + "loss": 0.94365543, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07739258, + "step": 15735, + "time_per_iteration": 2.549342632293701 + }, + { + "auxiliary_loss_clip": 0.06399663, + "auxiliary_loss_mlp": 0.01263854, + "balance_loss_clip": 0.06269069, + "balance_loss_mlp": 0.01254887, + "epoch": 0.9460995039831654, + "flos": 21944995194240.0, + "grad_norm": 1.6903865141289935, + "language_loss": 0.67260051, + "learning_rate": 3.036571093728102e-08, + "loss": 0.74923569, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08969116, + "step": 15736, + "time_per_iteration": 2.4905238151550293 + }, + { + "auxiliary_loss_clip": 0.06303936, + "auxiliary_loss_mlp": 0.01249824, + "balance_loss_clip": 0.06249891, + "balance_loss_mlp": 0.01248861, + "epoch": 0.9461596272358335, + "flos": 70342738081920.0, + "grad_norm": 0.8456385965936714, + "language_loss": 0.65439987, + "learning_rate": 3.029813971758499e-08, + "loss": 0.72993743, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00961304, + "step": 15737, + "time_per_iteration": 3.1456351280212402 + }, + { + "auxiliary_loss_clip": 0.06310228, + "auxiliary_loss_mlp": 0.01250707, + "balance_loss_clip": 0.06256226, + "balance_loss_mlp": 0.01249746, + "epoch": 0.9462197504885014, + "flos": 58612427994240.0, + "grad_norm": 0.7768588148943026, + "language_loss": 0.58685583, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.66246521, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00959778, + "step": 15738, + "time_per_iteration": 3.1362509727478027 + }, + { + "auxiliary_loss_clip": 0.06394856, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06267979, + "balance_loss_mlp": 0.01256121, + "epoch": 0.9462798737411694, + "flos": 23439333709440.0, + "grad_norm": 1.8516554697337375, + "language_loss": 0.71715391, + "learning_rate": 3.016322135462834e-08, + "loss": 0.79374659, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08294678, + "step": 15739, + "time_per_iteration": 2.5040197372436523 + }, + { + "auxiliary_loss_clip": 0.06402469, + "auxiliary_loss_mlp": 0.01265002, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01255227, + "epoch": 0.9463399969938374, + "flos": 25053082692480.0, + "grad_norm": 2.1300906946077953, + "language_loss": 0.6520685, + "learning_rate": 3.009587421648363e-08, + "loss": 0.7287432, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09765625, + "step": 15740, + "time_per_iteration": 3.9453022480010986 + }, + { + "auxiliary_loss_clip": 0.06396136, + "auxiliary_loss_mlp": 0.01269325, + "balance_loss_clip": 0.06268455, + "balance_loss_mlp": 0.01260164, + "epoch": 0.9464001202465053, + "flos": 24359455393920.0, + "grad_norm": 1.573667052728098, + "language_loss": 0.66363811, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.74029279, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09155273, + "step": 15741, + "time_per_iteration": 2.5351650714874268 + }, + { + "auxiliary_loss_clip": 0.06402055, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01256018, + "epoch": 0.9464602434991733, + "flos": 17171181653760.0, + "grad_norm": 1.8202599223323925, + "language_loss": 0.76282263, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.83949423, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09094238, + "step": 15742, + "time_per_iteration": 2.4993362426757812 + }, + { + "auxiliary_loss_clip": 0.06400387, + "auxiliary_loss_mlp": 0.01264176, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01255152, + "epoch": 0.9465203667518413, + "flos": 19944265328640.0, + "grad_norm": 1.8835810090915717, + "language_loss": 0.72201908, + "learning_rate": 2.989428100602187e-08, + "loss": 0.79866475, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.090271, + "step": 15743, + "time_per_iteration": 2.5028302669525146 + }, + { + "auxiliary_loss_clip": 0.06402981, + "auxiliary_loss_mlp": 0.01265574, + "balance_loss_clip": 0.06269473, + "balance_loss_mlp": 0.01255585, + "epoch": 0.9465804900045093, + "flos": 20126470032000.0, + "grad_norm": 1.615168658581885, + "language_loss": 0.80039352, + "learning_rate": 2.982723267901943e-08, + "loss": 0.87707901, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09991455, + "step": 15744, + "time_per_iteration": 2.5396833419799805 + }, + { + "auxiliary_loss_clip": 0.06402554, + "auxiliary_loss_mlp": 0.01267498, + "balance_loss_clip": 0.06269826, + "balance_loss_mlp": 0.01257502, + "epoch": 0.9466406132571772, + "flos": 23917870776960.0, + "grad_norm": 1.6501908259993738, + "language_loss": 0.78493166, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.86163217, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09991455, + "step": 15745, + "time_per_iteration": 2.5577425956726074 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.01268431, + "balance_loss_clip": 0.06269467, + "balance_loss_mlp": 0.01258113, + "epoch": 0.9467007365098452, + "flos": 19938563251200.0, + "grad_norm": 1.513557471901544, + "language_loss": 0.70127267, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.77799511, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10308838, + "step": 15746, + "time_per_iteration": 2.5116147994995117 + }, + { + "auxiliary_loss_clip": 0.06400457, + "auxiliary_loss_mlp": 0.01264802, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01255092, + "epoch": 0.9467608597625131, + "flos": 19315318982400.0, + "grad_norm": 2.038079128612824, + "language_loss": 0.56620514, + "learning_rate": 2.962653596305964e-08, + "loss": 0.64285767, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09710693, + "step": 15747, + "time_per_iteration": 4.008328914642334 + }, + { + "auxiliary_loss_clip": 0.06305839, + "auxiliary_loss_mlp": 0.01248436, + "balance_loss_clip": 0.06251822, + "balance_loss_mlp": 0.01247403, + "epoch": 0.9468209830151811, + "flos": 69650578229760.0, + "grad_norm": 0.6388680889443452, + "language_loss": 0.53260732, + "learning_rate": 2.955978648787871e-08, + "loss": 0.60815012, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01033783, + "step": 15748, + "time_per_iteration": 3.302865743637085 + }, + { + "auxiliary_loss_clip": 0.06403889, + "auxiliary_loss_mlp": 0.01263785, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.0125432, + "epoch": 0.946881106267849, + "flos": 27024029631360.0, + "grad_norm": 1.6131180095460511, + "language_loss": 0.66900456, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.74568129, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09460449, + "step": 15749, + "time_per_iteration": 2.5552892684936523 + }, + { + "auxiliary_loss_clip": 0.06402941, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06269799, + "balance_loss_mlp": 0.01253371, + "epoch": 0.9469412295205171, + "flos": 20195721031680.0, + "grad_norm": 1.9171819700733619, + "language_loss": 0.76360601, + "learning_rate": 2.942651169791621e-08, + "loss": 0.84027529, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10620117, + "step": 15750, + "time_per_iteration": 3.9931576251983643 + }, + { + "auxiliary_loss_clip": 0.06399237, + "auxiliary_loss_mlp": 0.01263463, + "balance_loss_clip": 0.06271112, + "balance_loss_mlp": 0.01254403, + "epoch": 0.947001352773185, + "flos": 21331352217600.0, + "grad_norm": 6.300306404866139, + "language_loss": 0.6824044, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.75903136, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09057617, + "step": 15751, + "time_per_iteration": 2.5015761852264404 + }, + { + "auxiliary_loss_clip": 0.06403518, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.062707, + "balance_loss_mlp": 0.01255296, + "epoch": 0.947061476025853, + "flos": 21950403782400.0, + "grad_norm": 1.5258403559147693, + "language_loss": 0.65762782, + "learning_rate": 2.929353580532723e-08, + "loss": 0.734312, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0960083, + "step": 15752, + "time_per_iteration": 2.5320088863372803 + }, + { + "auxiliary_loss_clip": 0.0640187, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06272066, + "balance_loss_mlp": 0.01256121, + "epoch": 0.947121599278521, + "flos": 21400645144320.0, + "grad_norm": 1.5250116712794441, + "language_loss": 0.71658498, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.79325652, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09161377, + "step": 15753, + "time_per_iteration": 2.5358986854553223 + }, + { + "auxiliary_loss_clip": 0.06404962, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06269598, + "balance_loss_mlp": 0.0125484, + "epoch": 0.9471817225311889, + "flos": 23082387315840.0, + "grad_norm": 2.318871000803308, + "language_loss": 0.70373905, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.78043866, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10168457, + "step": 15754, + "time_per_iteration": 2.5675079822540283 + }, + { + "auxiliary_loss_clip": 0.06402844, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.06269033, + "balance_loss_mlp": 0.01253113, + "epoch": 0.947241845783857, + "flos": 11915476882560.0, + "grad_norm": 2.1288030858444107, + "language_loss": 0.79356575, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.8702209, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09552002, + "step": 15755, + "time_per_iteration": 2.4721009731292725 + }, + { + "auxiliary_loss_clip": 0.06409688, + "auxiliary_loss_mlp": 0.01266846, + "balance_loss_clip": 0.06272167, + "balance_loss_mlp": 0.01255771, + "epoch": 0.9473019690365249, + "flos": 20746947116160.0, + "grad_norm": 2.0378371913661333, + "language_loss": 0.75405908, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.83082443, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11071777, + "step": 15756, + "time_per_iteration": 2.5512049198150635 + }, + { + "auxiliary_loss_clip": 0.06406745, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06272939, + "balance_loss_mlp": 0.01253458, + "epoch": 0.9473620922891929, + "flos": 17645735652480.0, + "grad_norm": 2.0156510018018317, + "language_loss": 0.74623597, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.82293516, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09710693, + "step": 15757, + "time_per_iteration": 2.500520706176758 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01263311, + "balance_loss_clip": 0.06270847, + "balance_loss_mlp": 0.01253744, + "epoch": 0.9474222155418608, + "flos": 23556731679360.0, + "grad_norm": 2.0076349731074843, + "language_loss": 0.79710162, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8737815, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09558105, + "step": 15758, + "time_per_iteration": 2.536018133163452 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.0126548, + "balance_loss_clip": 0.06272912, + "balance_loss_mlp": 0.01256027, + "epoch": 0.9474823387945288, + "flos": 27097179845760.0, + "grad_norm": 1.7167664956687578, + "language_loss": 0.7194469, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.79611474, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09454346, + "step": 15759, + "time_per_iteration": 2.537297487258911 + }, + { + "auxiliary_loss_clip": 0.06395267, + "auxiliary_loss_mlp": 0.01263013, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01255175, + "epoch": 0.9475424620471967, + "flos": 22973207045760.0, + "grad_norm": 1.518890611164647, + "language_loss": 0.75593793, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.83252072, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.07830811, + "step": 15760, + "time_per_iteration": 2.506772518157959 + }, + { + "auxiliary_loss_clip": 0.0640036, + "auxiliary_loss_mlp": 0.01262958, + "balance_loss_clip": 0.06270038, + "balance_loss_mlp": 0.01254065, + "epoch": 0.9476025852998647, + "flos": 20053864869120.0, + "grad_norm": 1.6429269418431312, + "language_loss": 0.72826153, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.80489469, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08880615, + "step": 15761, + "time_per_iteration": 2.461029291152954 + }, + { + "auxiliary_loss_clip": 0.06400488, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.0627092, + "balance_loss_mlp": 0.01254511, + "epoch": 0.9476627085525327, + "flos": 14980700217600.0, + "grad_norm": 2.0388938661384066, + "language_loss": 0.72076392, + "learning_rate": 2.863314050734722e-08, + "loss": 0.7974003, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08642578, + "step": 15762, + "time_per_iteration": 2.4437167644500732 + }, + { + "auxiliary_loss_clip": 0.06409766, + "auxiliary_loss_mlp": 0.01263153, + "balance_loss_clip": 0.0627232, + "balance_loss_mlp": 0.01253235, + "epoch": 0.9477228318052007, + "flos": 18703772357760.0, + "grad_norm": 1.9307223538038316, + "language_loss": 0.67410612, + "learning_rate": 2.856751208570518e-08, + "loss": 0.7508353, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09912109, + "step": 15763, + "time_per_iteration": 2.4636471271514893 + }, + { + "auxiliary_loss_clip": 0.06403434, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254378, + "epoch": 0.9477829550578686, + "flos": 23881295669760.0, + "grad_norm": 1.6268798558288402, + "language_loss": 0.70511979, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.78178561, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.08764648, + "step": 15764, + "time_per_iteration": 2.507150888442993 + }, + { + "auxiliary_loss_clip": 0.06395758, + "auxiliary_loss_mlp": 0.0126393, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01255937, + "epoch": 0.9478430783105366, + "flos": 22569119930880.0, + "grad_norm": 1.638940250411441, + "language_loss": 0.71428376, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.79088062, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.07983398, + "step": 15765, + "time_per_iteration": 2.507747173309326 + }, + { + "auxiliary_loss_clip": 0.06314638, + "auxiliary_loss_mlp": 0.01249169, + "balance_loss_clip": 0.06260315, + "balance_loss_mlp": 0.01248279, + "epoch": 0.9479032015632046, + "flos": 60874103802240.0, + "grad_norm": 0.7940134593806808, + "language_loss": 0.58885753, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.66449559, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00889587, + "step": 15766, + "time_per_iteration": 2.895747184753418 + }, + { + "auxiliary_loss_clip": 0.0640133, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06271998, + "balance_loss_mlp": 0.01255641, + "epoch": 0.9479633248158725, + "flos": 14689105608960.0, + "grad_norm": 2.0710351232242337, + "language_loss": 0.74133766, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.81799787, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09057617, + "step": 15767, + "time_per_iteration": 2.4537556171417236 + }, + { + "auxiliary_loss_clip": 0.06410235, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06274644, + "balance_loss_mlp": 0.01254445, + "epoch": 0.9480234480685406, + "flos": 20339170421760.0, + "grad_norm": 2.3853256310763684, + "language_loss": 0.73483276, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.81157696, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09735107, + "step": 15768, + "time_per_iteration": 2.5297107696533203 + }, + { + "auxiliary_loss_clip": 0.06308576, + "auxiliary_loss_mlp": 0.01250161, + "balance_loss_clip": 0.06254381, + "balance_loss_mlp": 0.01249126, + "epoch": 0.9480835713212085, + "flos": 70314548382720.0, + "grad_norm": 0.7196411504801323, + "language_loss": 0.55233341, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.62792081, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01035309, + "step": 15769, + "time_per_iteration": 3.135312557220459 + }, + { + "auxiliary_loss_clip": 0.06404492, + "auxiliary_loss_mlp": 0.01262015, + "balance_loss_clip": 0.06270418, + "balance_loss_mlp": 0.01253131, + "epoch": 0.9481436945738765, + "flos": 25457211734400.0, + "grad_norm": 1.291960686791139, + "language_loss": 0.77551377, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.85217881, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08895874, + "step": 15770, + "time_per_iteration": 2.549916982650757 + }, + { + "auxiliary_loss_clip": 0.06405759, + "auxiliary_loss_mlp": 0.01265581, + "balance_loss_clip": 0.06275308, + "balance_loss_mlp": 0.01255514, + "epoch": 0.9482038178265444, + "flos": 26987244888960.0, + "grad_norm": 1.726653277690328, + "language_loss": 0.80475664, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.88147008, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10058594, + "step": 15771, + "time_per_iteration": 2.5613114833831787 + }, + { + "auxiliary_loss_clip": 0.06398853, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06269822, + "balance_loss_mlp": 0.01254696, + "epoch": 0.9482639410792124, + "flos": 17791239467520.0, + "grad_norm": 1.9987146967466614, + "language_loss": 0.70112687, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.77775192, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08959961, + "step": 15772, + "time_per_iteration": 3.8485605716705322 + }, + { + "auxiliary_loss_clip": 0.06399487, + "auxiliary_loss_mlp": 0.01261828, + "balance_loss_clip": 0.06269841, + "balance_loss_mlp": 0.01252583, + "epoch": 0.9483240643318803, + "flos": 21003098647680.0, + "grad_norm": 1.447529838975947, + "language_loss": 0.74107957, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.8176927, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09240723, + "step": 15773, + "time_per_iteration": 2.500173807144165 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01263968, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01253555, + "epoch": 0.9483841875845483, + "flos": 20089349873280.0, + "grad_norm": 2.1086250224803806, + "language_loss": 0.63228577, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.70896089, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10412598, + "step": 15774, + "time_per_iteration": 2.585265636444092 + }, + { + "auxiliary_loss_clip": 0.0640205, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.01254056, + "epoch": 0.9484443108372163, + "flos": 20819929622400.0, + "grad_norm": 1.7035210571527313, + "language_loss": 0.59463555, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.67129385, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09735107, + "step": 15775, + "time_per_iteration": 2.563870906829834 + }, + { + "auxiliary_loss_clip": 0.06404445, + "auxiliary_loss_mlp": 0.01266804, + "balance_loss_clip": 0.06271166, + "balance_loss_mlp": 0.01257125, + "epoch": 0.9485044340898843, + "flos": 36438018249600.0, + "grad_norm": 1.4654814011520536, + "language_loss": 0.61937261, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6960851, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09674072, + "step": 15776, + "time_per_iteration": 2.62610125541687 + }, + { + "auxiliary_loss_clip": 0.06404588, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01255132, + "epoch": 0.9485645573425522, + "flos": 22609300763520.0, + "grad_norm": 1.9419474034086324, + "language_loss": 0.73911107, + "learning_rate": 2.765656478622458e-08, + "loss": 0.81580293, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09466553, + "step": 15777, + "time_per_iteration": 2.5099053382873535 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06272923, + "balance_loss_mlp": 0.01255365, + "epoch": 0.9486246805952202, + "flos": 22024266756480.0, + "grad_norm": 2.4570684024376885, + "language_loss": 0.71977055, + "learning_rate": 2.759205797806441e-08, + "loss": 0.79657233, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10272217, + "step": 15778, + "time_per_iteration": 2.4870026111602783 + }, + { + "auxiliary_loss_clip": 0.06396791, + "auxiliary_loss_mlp": 0.01265306, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01257277, + "epoch": 0.9486848038478882, + "flos": 16514297170560.0, + "grad_norm": 1.9713243247520542, + "language_loss": 0.69818199, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.77480304, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.08032227, + "step": 15779, + "time_per_iteration": 2.5228939056396484 + }, + { + "auxiliary_loss_clip": 0.06406988, + "auxiliary_loss_mlp": 0.01263384, + "balance_loss_clip": 0.06274127, + "balance_loss_mlp": 0.01253204, + "epoch": 0.9487449271005561, + "flos": 19250344540800.0, + "grad_norm": 2.158437031271148, + "language_loss": 0.7843678, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.86107153, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10174561, + "step": 15780, + "time_per_iteration": 4.039035081863403 + }, + { + "auxiliary_loss_clip": 0.06399371, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06269532, + "balance_loss_mlp": 0.01258738, + "epoch": 0.9488050503532242, + "flos": 21769205328000.0, + "grad_norm": 1.652828919215293, + "language_loss": 0.66618556, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.74285996, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09326172, + "step": 15781, + "time_per_iteration": 2.5114023685455322 + }, + { + "auxiliary_loss_clip": 0.06399278, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01260156, + "epoch": 0.9488651736058921, + "flos": 18374764101120.0, + "grad_norm": 1.8456931190486248, + "language_loss": 0.80244529, + "learning_rate": 2.733477870890999e-08, + "loss": 0.87913531, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09564209, + "step": 15782, + "time_per_iteration": 2.523489236831665 + }, + { + "auxiliary_loss_clip": 0.0630802, + "auxiliary_loss_mlp": 0.01249376, + "balance_loss_clip": 0.06253742, + "balance_loss_mlp": 0.01248354, + "epoch": 0.9489252968585601, + "flos": 70107130800000.0, + "grad_norm": 0.7092659629806969, + "language_loss": 0.59900188, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.67457592, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01021576, + "step": 15783, + "time_per_iteration": 3.2024121284484863 + }, + { + "auxiliary_loss_clip": 0.06402528, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.06270333, + "balance_loss_mlp": 0.01256651, + "epoch": 0.948985420111228, + "flos": 27862909182720.0, + "grad_norm": 1.627858945896465, + "language_loss": 0.74303591, + "learning_rate": 2.720658788656105e-08, + "loss": 0.81972712, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09936523, + "step": 15784, + "time_per_iteration": 2.545043468475342 + }, + { + "auxiliary_loss_clip": 0.06405601, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.01258413, + "epoch": 0.949045543363896, + "flos": 24322880286720.0, + "grad_norm": 1.7686500585497513, + "language_loss": 0.69748747, + "learning_rate": 2.714260468695806e-08, + "loss": 0.77422357, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0958252, + "step": 15785, + "time_per_iteration": 2.505894184112549 + }, + { + "auxiliary_loss_clip": 0.06406058, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.0125712, + "epoch": 0.9491056666165639, + "flos": 24248262625920.0, + "grad_norm": 1.5913923023691325, + "language_loss": 0.7625891, + "learning_rate": 2.707869629830495e-08, + "loss": 0.83931398, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09313965, + "step": 15786, + "time_per_iteration": 3.9345221519470215 + }, + { + "auxiliary_loss_clip": 0.06399442, + "auxiliary_loss_mlp": 0.01264758, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01256109, + "epoch": 0.949165789869232, + "flos": 24537509320320.0, + "grad_norm": 2.558063223282522, + "language_loss": 0.79310948, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.86975145, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08642578, + "step": 15787, + "time_per_iteration": 2.5140228271484375 + }, + { + "auxiliary_loss_clip": 0.06398906, + "auxiliary_loss_mlp": 0.01263863, + "balance_loss_clip": 0.06272651, + "balance_loss_mlp": 0.01255003, + "epoch": 0.9492259131218999, + "flos": 22241662974720.0, + "grad_norm": 1.482913828210554, + "language_loss": 0.76110846, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.83773613, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08859253, + "step": 15788, + "time_per_iteration": 2.49965763092041 + }, + { + "auxiliary_loss_clip": 0.06405517, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06271803, + "balance_loss_mlp": 0.0125696, + "epoch": 0.9492860363745679, + "flos": 22972955483520.0, + "grad_norm": 1.5889024657895832, + "language_loss": 0.72189152, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.79860961, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09332275, + "step": 15789, + "time_per_iteration": 2.556658983230591 + }, + { + "auxiliary_loss_clip": 0.06401318, + "auxiliary_loss_mlp": 0.01266331, + "balance_loss_clip": 0.06272426, + "balance_loss_mlp": 0.0125649, + "epoch": 0.9493461596272358, + "flos": 18376357328640.0, + "grad_norm": 1.820508624210969, + "language_loss": 0.73197401, + "learning_rate": 2.682381090161989e-08, + "loss": 0.80865049, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09851074, + "step": 15790, + "time_per_iteration": 3.926544189453125 + }, + { + "auxiliary_loss_clip": 0.06403148, + "auxiliary_loss_mlp": 0.01263876, + "balance_loss_clip": 0.06268154, + "balance_loss_mlp": 0.01254185, + "epoch": 0.9494062828799038, + "flos": 20018002521600.0, + "grad_norm": 1.8246160541331784, + "language_loss": 0.77819729, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.85486752, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09698486, + "step": 15791, + "time_per_iteration": 2.4806320667266846 + }, + { + "auxiliary_loss_clip": 0.0640974, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.01254939, + "epoch": 0.9494664061325718, + "flos": 27234843304320.0, + "grad_norm": 1.8993527124962928, + "language_loss": 0.74267161, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.81942004, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10162354, + "step": 15792, + "time_per_iteration": 2.5601704120635986 + }, + { + "auxiliary_loss_clip": 0.06402216, + "auxiliary_loss_mlp": 0.01262243, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9495265293852397, + "flos": 18375812277120.0, + "grad_norm": 1.7327549003896519, + "language_loss": 0.78444892, + "learning_rate": 2.663343248754679e-08, + "loss": 0.86109352, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09405518, + "step": 15793, + "time_per_iteration": 2.4936344623565674 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01263265, + "balance_loss_clip": 0.06267807, + "balance_loss_mlp": 0.01253889, + "epoch": 0.9495866526379078, + "flos": 23082429242880.0, + "grad_norm": 1.6722001726685662, + "language_loss": 0.77888709, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.85551322, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09375, + "step": 15794, + "time_per_iteration": 2.497514247894287 + }, + { + "auxiliary_loss_clip": 0.06406198, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06271206, + "balance_loss_mlp": 0.01254897, + "epoch": 0.9496467758905757, + "flos": 17535632987520.0, + "grad_norm": 1.9049729517954086, + "language_loss": 0.61179888, + "learning_rate": 2.650688769211107e-08, + "loss": 0.6885097, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09979248, + "step": 15795, + "time_per_iteration": 2.5063045024871826 + }, + { + "auxiliary_loss_clip": 0.06395505, + "auxiliary_loss_mlp": 0.01265243, + "balance_loss_clip": 0.06269419, + "balance_loss_mlp": 0.01255897, + "epoch": 0.9497068991432437, + "flos": 24140759437440.0, + "grad_norm": 1.51218594053535, + "language_loss": 0.79580635, + "learning_rate": 2.644372754577895e-08, + "loss": 0.87241381, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09338379, + "step": 15796, + "time_per_iteration": 2.5217463970184326 + }, + { + "auxiliary_loss_clip": 0.06400493, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06268636, + "balance_loss_mlp": 0.01255793, + "epoch": 0.9497670223959116, + "flos": 20309597130240.0, + "grad_norm": 1.9588104868661271, + "language_loss": 0.75637573, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.83303994, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10137939, + "step": 15797, + "time_per_iteration": 2.510477066040039 + }, + { + "auxiliary_loss_clip": 0.06401858, + "auxiliary_loss_mlp": 0.01262483, + "balance_loss_clip": 0.06269763, + "balance_loss_mlp": 0.01253071, + "epoch": 0.9498271456485796, + "flos": 13704009482880.0, + "grad_norm": 2.197071076360675, + "language_loss": 0.66319734, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.73984075, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09417725, + "step": 15798, + "time_per_iteration": 2.466979503631592 + }, + { + "auxiliary_loss_clip": 0.06409442, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.06273577, + "balance_loss_mlp": 0.01255994, + "epoch": 0.9498872689012475, + "flos": 20820348892800.0, + "grad_norm": 1.7595466908543556, + "language_loss": 0.77202052, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.84877139, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09646606, + "step": 15799, + "time_per_iteration": 2.502589225769043 + }, + { + "auxiliary_loss_clip": 0.06398167, + "auxiliary_loss_mlp": 0.01263962, + "balance_loss_clip": 0.06270022, + "balance_loss_mlp": 0.01254795, + "epoch": 0.9499473921539155, + "flos": 21039044849280.0, + "grad_norm": 1.7937393457780948, + "language_loss": 0.71204829, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.78866959, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0916748, + "step": 15800, + "time_per_iteration": 2.496887683868408 + }, + { + "auxiliary_loss_clip": 0.06400058, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.06269508, + "balance_loss_mlp": 0.0125657, + "epoch": 0.9500075154065835, + "flos": 21005446561920.0, + "grad_norm": 1.7439251794642465, + "language_loss": 0.71854639, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.79520386, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09118652, + "step": 15801, + "time_per_iteration": 2.600644588470459 + }, + { + "auxiliary_loss_clip": 0.06403385, + "auxiliary_loss_mlp": 0.01263835, + "balance_loss_clip": 0.06271951, + "balance_loss_mlp": 0.01254578, + "epoch": 0.9500676386592515, + "flos": 25129461288960.0, + "grad_norm": 1.5167901940299169, + "language_loss": 0.81219077, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.88886297, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0925293, + "step": 15802, + "time_per_iteration": 2.5559613704681396 + }, + { + "auxiliary_loss_clip": 0.06404102, + "auxiliary_loss_mlp": 0.01264645, + "balance_loss_clip": 0.06270744, + "balance_loss_mlp": 0.01254799, + "epoch": 0.9501277619119194, + "flos": 27530462908800.0, + "grad_norm": 1.5351955934289538, + "language_loss": 0.67835546, + "learning_rate": 2.60037021038646e-08, + "loss": 0.75504291, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09838867, + "step": 15803, + "time_per_iteration": 2.5468993186950684 + }, + { + "auxiliary_loss_clip": 0.06397918, + "auxiliary_loss_mlp": 0.01264702, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01254885, + "epoch": 0.9501878851645874, + "flos": 20820306965760.0, + "grad_norm": 1.6488350985874107, + "language_loss": 0.76223731, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.83886349, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.0982666, + "step": 15804, + "time_per_iteration": 2.482729434967041 + }, + { + "auxiliary_loss_clip": 0.06402758, + "auxiliary_loss_mlp": 0.01265776, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.0125618, + "epoch": 0.9502480084172553, + "flos": 18375309152640.0, + "grad_norm": 3.607340173427983, + "language_loss": 0.73302132, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.80970663, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09606934, + "step": 15805, + "time_per_iteration": 2.4692134857177734 + }, + { + "auxiliary_loss_clip": 0.06401005, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06270203, + "balance_loss_mlp": 0.01258422, + "epoch": 0.9503081316699233, + "flos": 23556270481920.0, + "grad_norm": 1.4624608104842494, + "language_loss": 0.80504966, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.88174188, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09790039, + "step": 15806, + "time_per_iteration": 2.5002782344818115 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01266066, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.0125706, + "epoch": 0.9503682549225914, + "flos": 18046217041920.0, + "grad_norm": 1.909262236411516, + "language_loss": 0.82481933, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.90151823, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09008789, + "step": 15807, + "time_per_iteration": 2.4873461723327637 + }, + { + "auxiliary_loss_clip": 0.06400104, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01256564, + "epoch": 0.9504283781752593, + "flos": 25893429690240.0, + "grad_norm": 1.5955782807041765, + "language_loss": 0.7199322, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.79659086, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09204102, + "step": 15808, + "time_per_iteration": 2.540447473526001 + }, + { + "auxiliary_loss_clip": 0.06396027, + "auxiliary_loss_mlp": 0.01265833, + "balance_loss_clip": 0.06267738, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9504885014279273, + "flos": 22130009009280.0, + "grad_norm": 1.3816783547504883, + "language_loss": 0.69870842, + "learning_rate": 2.562945671948058e-08, + "loss": 0.77532703, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09216309, + "step": 15809, + "time_per_iteration": 2.4813284873962402 + }, + { + "auxiliary_loss_clip": 0.06396701, + "auxiliary_loss_mlp": 0.01261651, + "balance_loss_clip": 0.06267259, + "balance_loss_mlp": 0.01253027, + "epoch": 0.9505486246805952, + "flos": 21622317920640.0, + "grad_norm": 1.4773684576527446, + "language_loss": 0.75935221, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.83593571, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08630371, + "step": 15810, + "time_per_iteration": 2.4988956451416016 + }, + { + "auxiliary_loss_clip": 0.06400326, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 0.06269518, + "balance_loss_mlp": 0.01257865, + "epoch": 0.9506087479332632, + "flos": 22534766956800.0, + "grad_norm": 1.3339331298451294, + "language_loss": 0.80074775, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.87742716, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09753418, + "step": 15811, + "time_per_iteration": 3.8706562519073486 + }, + { + "auxiliary_loss_clip": 0.06399944, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.0626929, + "balance_loss_mlp": 0.01255168, + "epoch": 0.9506688711859311, + "flos": 27534823320960.0, + "grad_norm": 2.3021025111119133, + "language_loss": 0.70557272, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.78221905, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09509277, + "step": 15812, + "time_per_iteration": 2.5505876541137695 + }, + { + "auxiliary_loss_clip": 0.06405829, + "auxiliary_loss_mlp": 0.01262148, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01252826, + "epoch": 0.9507289944385992, + "flos": 19872037509120.0, + "grad_norm": 1.526419629738536, + "language_loss": 0.656178, + "learning_rate": 2.538145713158446e-08, + "loss": 0.73285776, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09320068, + "step": 15813, + "time_per_iteration": 2.504990816116333 + }, + { + "auxiliary_loss_clip": 0.06402929, + "auxiliary_loss_mlp": 0.01264397, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01254694, + "epoch": 0.9507891176912671, + "flos": 25200515151360.0, + "grad_norm": 1.3164663911360832, + "language_loss": 0.70462513, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.7812984, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.0970459, + "step": 15814, + "time_per_iteration": 2.5385372638702393 + }, + { + "auxiliary_loss_clip": 0.06397622, + "auxiliary_loss_mlp": 0.01262752, + "balance_loss_clip": 0.06269576, + "balance_loss_mlp": 0.01254253, + "epoch": 0.9508492409439351, + "flos": 24906446847360.0, + "grad_norm": 1.8852174609712755, + "language_loss": 0.63183349, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.7084372, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08496094, + "step": 15815, + "time_per_iteration": 2.53188419342041 + }, + { + "auxiliary_loss_clip": 0.06401452, + "auxiliary_loss_mlp": 0.01264924, + "balance_loss_clip": 0.06271219, + "balance_loss_mlp": 0.01255883, + "epoch": 0.950909364196603, + "flos": 29791033885440.0, + "grad_norm": 1.7527785707750094, + "language_loss": 0.59055346, + "learning_rate": 2.519624364862061e-08, + "loss": 0.66721725, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.0904541, + "step": 15816, + "time_per_iteration": 2.5678937435150146 + }, + { + "auxiliary_loss_clip": 0.06401551, + "auxiliary_loss_mlp": 0.01261651, + "balance_loss_clip": 0.0627007, + "balance_loss_mlp": 0.01252478, + "epoch": 0.950969487449271, + "flos": 24724745268480.0, + "grad_norm": 1.574607991311696, + "language_loss": 0.73901993, + "learning_rate": 2.513465558735994e-08, + "loss": 0.81565189, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09173584, + "step": 15817, + "time_per_iteration": 2.529062271118164 + }, + { + "auxiliary_loss_clip": 0.0640544, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.0627112, + "balance_loss_mlp": 0.0125611, + "epoch": 0.9510296107019389, + "flos": 13704302972160.0, + "grad_norm": 1.658723255681471, + "language_loss": 0.60563654, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.68235421, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10217285, + "step": 15818, + "time_per_iteration": 2.4677538871765137 + }, + { + "auxiliary_loss_clip": 0.06399883, + "auxiliary_loss_mlp": 0.01263447, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.01253767, + "epoch": 0.9510897339546069, + "flos": 17317691717760.0, + "grad_norm": 1.6828133029068784, + "language_loss": 0.69863963, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.77527297, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09680176, + "step": 15819, + "time_per_iteration": 3.972642421722412 + }, + { + "auxiliary_loss_clip": 0.0640963, + "auxiliary_loss_mlp": 0.0126202, + "balance_loss_clip": 0.06275742, + "balance_loss_mlp": 0.01253055, + "epoch": 0.951149857207275, + "flos": 14799292128000.0, + "grad_norm": 1.637089994669383, + "language_loss": 0.74310344, + "learning_rate": 2.49503407354561e-08, + "loss": 0.81981993, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.08966064, + "step": 15820, + "time_per_iteration": 2.514216184616089 + }, + { + "auxiliary_loss_clip": 0.06404862, + "auxiliary_loss_mlp": 0.0126351, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01253491, + "epoch": 0.9512099804599429, + "flos": 19397273875200.0, + "grad_norm": 1.8501796910784354, + "language_loss": 0.78652138, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.86320508, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10028076, + "step": 15821, + "time_per_iteration": 2.4915084838867188 + }, + { + "auxiliary_loss_clip": 0.06399101, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.0626865, + "balance_loss_mlp": 0.01252993, + "epoch": 0.9512701037126109, + "flos": 36766816871040.0, + "grad_norm": 1.4188367342021355, + "language_loss": 0.71510702, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.79172319, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09533691, + "step": 15822, + "time_per_iteration": 2.641385078430176 + }, + { + "auxiliary_loss_clip": 0.06399742, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01255202, + "epoch": 0.9513302269652788, + "flos": 22644911548800.0, + "grad_norm": 1.5230172306663716, + "language_loss": 0.6589359, + "learning_rate": 2.47666999302647e-08, + "loss": 0.73557216, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08691406, + "step": 15823, + "time_per_iteration": 2.6643285751342773 + }, + { + "auxiliary_loss_clip": 0.0639899, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06269787, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9513903502179468, + "flos": 22899847196160.0, + "grad_norm": 1.6545118844209308, + "language_loss": 0.77469099, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.85132086, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08959961, + "step": 15824, + "time_per_iteration": 2.560600757598877 + }, + { + "auxiliary_loss_clip": 0.06404689, + "auxiliary_loss_mlp": 0.01262938, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.0125274, + "epoch": 0.9514504734706147, + "flos": 27936143251200.0, + "grad_norm": 1.8708540735128236, + "language_loss": 0.74260736, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.8192836, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10192871, + "step": 15825, + "time_per_iteration": 2.566944122314453 + }, + { + "auxiliary_loss_clip": 0.06308633, + "auxiliary_loss_mlp": 0.01249825, + "balance_loss_clip": 0.06254488, + "balance_loss_mlp": 0.01248835, + "epoch": 0.9515105967232828, + "flos": 67386485381760.0, + "grad_norm": 0.8171627417310032, + "language_loss": 0.53219813, + "learning_rate": 2.458373323445806e-08, + "loss": 0.60778272, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00989532, + "step": 15826, + "time_per_iteration": 4.5212695598602295 + }, + { + "auxiliary_loss_clip": 0.06403694, + "auxiliary_loss_mlp": 0.01263494, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01253779, + "epoch": 0.9515707199759507, + "flos": 25853290784640.0, + "grad_norm": 1.7303662165905656, + "language_loss": 0.73298597, + "learning_rate": 2.452289414874076e-08, + "loss": 0.80965781, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09716797, + "step": 15827, + "time_per_iteration": 2.5447840690612793 + }, + { + "auxiliary_loss_clip": 0.06404355, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.0125593, + "epoch": 0.9516308432286187, + "flos": 21834389404800.0, + "grad_norm": 1.8023851639179382, + "language_loss": 0.74833316, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.82503736, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10131836, + "step": 15828, + "time_per_iteration": 2.4686501026153564 + }, + { + "auxiliary_loss_clip": 0.06403244, + "auxiliary_loss_mlp": 0.01265275, + "balance_loss_clip": 0.06274635, + "balance_loss_mlp": 0.01256406, + "epoch": 0.9516909664812866, + "flos": 27276617364480.0, + "grad_norm": 1.5487816970397665, + "language_loss": 0.73187357, + "learning_rate": 2.440144071047978e-08, + "loss": 0.80855876, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08874512, + "step": 15829, + "time_per_iteration": 2.542429208755493 + }, + { + "auxiliary_loss_clip": 0.06404226, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06272517, + "balance_loss_mlp": 0.01258043, + "epoch": 0.9517510897339546, + "flos": 21221752677120.0, + "grad_norm": 1.837415216575745, + "language_loss": 0.61719525, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.69390613, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08807373, + "step": 15830, + "time_per_iteration": 3.9777581691741943 + }, + { + "auxiliary_loss_clip": 0.06406231, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06272319, + "balance_loss_mlp": 0.0126011, + "epoch": 0.9518112129866225, + "flos": 18739928194560.0, + "grad_norm": 2.4162096913039286, + "language_loss": 0.73349452, + "learning_rate": 2.428028693179729e-08, + "loss": 0.81026161, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10357666, + "step": 15831, + "time_per_iteration": 2.5067529678344727 + }, + { + "auxiliary_loss_clip": 0.0640035, + "auxiliary_loss_mlp": 0.01262917, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01253935, + "epoch": 0.9518713362392905, + "flos": 16769274744960.0, + "grad_norm": 1.653127425404805, + "language_loss": 0.65777677, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.73440945, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08984375, + "step": 15832, + "time_per_iteration": 2.4970624446868896 + }, + { + "auxiliary_loss_clip": 0.06398977, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06273643, + "balance_loss_mlp": 0.01258475, + "epoch": 0.9519314594919586, + "flos": 15235887427200.0, + "grad_norm": 1.7378729185986037, + "language_loss": 0.7819438, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.85861361, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.09527588, + "step": 15833, + "time_per_iteration": 2.460865020751953 + }, + { + "auxiliary_loss_clip": 0.06399127, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.0627199, + "balance_loss_mlp": 0.01253261, + "epoch": 0.9519915827446265, + "flos": 19358770124160.0, + "grad_norm": 2.181001598505818, + "language_loss": 0.7522788, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.8288905, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08782959, + "step": 15834, + "time_per_iteration": 2.5118231773376465 + }, + { + "auxiliary_loss_clip": 0.06410512, + "auxiliary_loss_mlp": 0.01265298, + "balance_loss_clip": 0.0627307, + "balance_loss_mlp": 0.01255403, + "epoch": 0.9520517059972945, + "flos": 22271697463680.0, + "grad_norm": 1.899995669990022, + "language_loss": 0.76650679, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.84326494, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09899902, + "step": 15835, + "time_per_iteration": 2.519073486328125 + }, + { + "auxiliary_loss_clip": 0.06403553, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06271343, + "balance_loss_mlp": 0.01255876, + "epoch": 0.9521118292499624, + "flos": 14866907973120.0, + "grad_norm": 2.201120374190252, + "language_loss": 0.66960144, + "learning_rate": 2.397871361623238e-08, + "loss": 0.74628842, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09265137, + "step": 15836, + "time_per_iteration": 2.5229427814483643 + }, + { + "auxiliary_loss_clip": 0.06397817, + "auxiliary_loss_mlp": 0.01262274, + "balance_loss_clip": 0.06269939, + "balance_loss_mlp": 0.01253011, + "epoch": 0.9521719525026304, + "flos": 23514747984000.0, + "grad_norm": 3.4643899323136553, + "language_loss": 0.70896757, + "learning_rate": 2.391862373676057e-08, + "loss": 0.78556848, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0927124, + "step": 15837, + "time_per_iteration": 2.5268142223358154 + }, + { + "auxiliary_loss_clip": 0.06405401, + "auxiliary_loss_mlp": 0.01263888, + "balance_loss_clip": 0.06271522, + "balance_loss_mlp": 0.01253648, + "epoch": 0.9522320757552983, + "flos": 19720328492160.0, + "grad_norm": 2.1007938575310847, + "language_loss": 0.73421597, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.81090885, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10241699, + "step": 15838, + "time_per_iteration": 2.5051467418670654 + }, + { + "auxiliary_loss_clip": 0.06400177, + "auxiliary_loss_mlp": 0.0126335, + "balance_loss_clip": 0.06268861, + "balance_loss_mlp": 0.01254606, + "epoch": 0.9522921990079664, + "flos": 25928369642880.0, + "grad_norm": 1.8172457888979467, + "language_loss": 0.78152144, + "learning_rate": 2.379866877970449e-08, + "loss": 0.85815668, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08740234, + "step": 15839, + "time_per_iteration": 2.5397469997406006 + }, + { + "auxiliary_loss_clip": 0.06404764, + "auxiliary_loss_mlp": 0.01264586, + "balance_loss_clip": 0.06270763, + "balance_loss_mlp": 0.01255839, + "epoch": 0.9523523222606343, + "flos": 19214104849920.0, + "grad_norm": 1.5224815877407776, + "language_loss": 0.80422169, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.88091516, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.0874939, + "step": 15840, + "time_per_iteration": 2.50748872756958 + }, + { + "auxiliary_loss_clip": 0.06395362, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01255753, + "epoch": 0.9524124455133023, + "flos": 20927265102720.0, + "grad_norm": 1.8175470123467525, + "language_loss": 0.73156214, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.80815464, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08129883, + "step": 15841, + "time_per_iteration": 2.4872241020202637 + }, + { + "auxiliary_loss_clip": 0.06395878, + "auxiliary_loss_mlp": 0.0126485, + "balance_loss_clip": 0.0627192, + "balance_loss_mlp": 0.01256249, + "epoch": 0.9524725687659702, + "flos": 18849527735040.0, + "grad_norm": 1.7833255311576237, + "language_loss": 0.79193342, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.86854064, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.0859375, + "step": 15842, + "time_per_iteration": 2.486121654510498 + }, + { + "auxiliary_loss_clip": 0.06402968, + "auxiliary_loss_mlp": 0.01264831, + "balance_loss_clip": 0.06273231, + "balance_loss_mlp": 0.01255526, + "epoch": 0.9525326920186382, + "flos": 22681318947840.0, + "grad_norm": 1.6085240870156023, + "language_loss": 0.72762179, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.80429983, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09307861, + "step": 15843, + "time_per_iteration": 2.523148775100708 + }, + { + "auxiliary_loss_clip": 0.06404278, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01255058, + "epoch": 0.9525928152713061, + "flos": 22092469580160.0, + "grad_norm": 1.502432873794168, + "language_loss": 0.78351128, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.86020356, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09899902, + "step": 15844, + "time_per_iteration": 2.524716377258301 + }, + { + "auxiliary_loss_clip": 0.06406938, + "auxiliary_loss_mlp": 0.01266712, + "balance_loss_clip": 0.06270063, + "balance_loss_mlp": 0.01255578, + "epoch": 0.9526529385239741, + "flos": 20711084768640.0, + "grad_norm": 2.0263100699563488, + "language_loss": 0.70321971, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.77995622, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.11138916, + "step": 15845, + "time_per_iteration": 2.500941038131714 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06269988, + "balance_loss_mlp": 0.01256338, + "epoch": 0.9527130617766422, + "flos": 23374820465280.0, + "grad_norm": 1.3991687644307798, + "language_loss": 0.75763822, + "learning_rate": 2.338118708818282e-08, + "loss": 0.83434522, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09356689, + "step": 15846, + "time_per_iteration": 2.5281105041503906 + }, + { + "auxiliary_loss_clip": 0.06399485, + "auxiliary_loss_mlp": 0.01262481, + "balance_loss_clip": 0.06267849, + "balance_loss_mlp": 0.01253689, + "epoch": 0.9527731850293101, + "flos": 18991341970560.0, + "grad_norm": 1.6178897673715225, + "language_loss": 0.78373063, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.86035025, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.0880127, + "step": 15847, + "time_per_iteration": 2.481491804122925 + }, + { + "auxiliary_loss_clip": 0.06396569, + "auxiliary_loss_mlp": 0.01264523, + "balance_loss_clip": 0.06267966, + "balance_loss_mlp": 0.01255624, + "epoch": 0.9528333082819781, + "flos": 19324123660800.0, + "grad_norm": 1.5274665589358507, + "language_loss": 0.77939975, + "learning_rate": 2.326258115328672e-08, + "loss": 0.85601068, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08905029, + "step": 15848, + "time_per_iteration": 2.5218746662139893 + }, + { + "auxiliary_loss_clip": 0.06409523, + "auxiliary_loss_mlp": 0.01266597, + "balance_loss_clip": 0.06273653, + "balance_loss_mlp": 0.01256178, + "epoch": 0.952893431534646, + "flos": 23958135463680.0, + "grad_norm": 1.5417221900752704, + "language_loss": 0.72081304, + "learning_rate": 2.320339062183674e-08, + "loss": 0.79757422, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10424805, + "step": 15849, + "time_per_iteration": 2.5317416191101074 + }, + { + "auxiliary_loss_clip": 0.06408659, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.0627094, + "balance_loss_mlp": 0.01255829, + "epoch": 0.952953554787314, + "flos": 21036529226880.0, + "grad_norm": 1.660132090953839, + "language_loss": 0.75134432, + "learning_rate": 2.314427505071226e-08, + "loss": 0.8280946, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10546875, + "step": 15850, + "time_per_iteration": 3.9556667804718018 + }, + { + "auxiliary_loss_clip": 0.06401952, + "auxiliary_loss_mlp": 0.01264257, + "balance_loss_clip": 0.06270756, + "balance_loss_mlp": 0.01255198, + "epoch": 0.9530136780399819, + "flos": 22389472776960.0, + "grad_norm": 2.0028001807866973, + "language_loss": 0.72165865, + "learning_rate": 2.308523444215482e-08, + "loss": 0.79832071, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09063721, + "step": 15851, + "time_per_iteration": 2.546628952026367 + }, + { + "auxiliary_loss_clip": 0.06401463, + "auxiliary_loss_mlp": 0.01264181, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01255515, + "epoch": 0.95307380129265, + "flos": 22165452086400.0, + "grad_norm": 2.8670815366039606, + "language_loss": 0.79601598, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.87267244, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08666992, + "step": 15852, + "time_per_iteration": 2.514052152633667 + }, + { + "auxiliary_loss_clip": 0.06401996, + "auxiliary_loss_mlp": 0.01266531, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01257191, + "epoch": 0.9531339245453179, + "flos": 44033607486720.0, + "grad_norm": 2.022438359351555, + "language_loss": 0.59703016, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.67371547, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09344482, + "step": 15853, + "time_per_iteration": 2.73101806640625 + }, + { + "auxiliary_loss_clip": 0.06398737, + "auxiliary_loss_mlp": 0.01263116, + "balance_loss_clip": 0.06272894, + "balance_loss_mlp": 0.01254556, + "epoch": 0.9531940477979859, + "flos": 20272938168960.0, + "grad_norm": 1.6568412443068294, + "language_loss": 0.72921371, + "learning_rate": 2.290856241425998e-08, + "loss": 0.80583227, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08557129, + "step": 15854, + "time_per_iteration": 2.475628137588501 + }, + { + "auxiliary_loss_clip": 0.06404815, + "auxiliary_loss_mlp": 0.01262782, + "balance_loss_clip": 0.06271343, + "balance_loss_mlp": 0.01253573, + "epoch": 0.9532541710506538, + "flos": 25342413240960.0, + "grad_norm": 2.097114010753005, + "language_loss": 0.67732322, + "learning_rate": 2.284982167833127e-08, + "loss": 0.75399917, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09216309, + "step": 15855, + "time_per_iteration": 2.5460567474365234 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.01267791, + "balance_loss_clip": 0.06270517, + "balance_loss_mlp": 0.01258147, + "epoch": 0.9533142943033218, + "flos": 26476576980480.0, + "grad_norm": 1.5411782595098198, + "language_loss": 0.76690978, + "learning_rate": 2.279115591613556e-08, + "loss": 0.84360075, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09631348, + "step": 15856, + "time_per_iteration": 2.5271217823028564 + }, + { + "auxiliary_loss_clip": 0.06399896, + "auxiliary_loss_mlp": 0.0126262, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01253203, + "epoch": 0.9533744175559897, + "flos": 23663270545920.0, + "grad_norm": 1.480276533024058, + "language_loss": 0.77887392, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.85549903, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09423828, + "step": 15857, + "time_per_iteration": 2.526076555252075 + }, + { + "auxiliary_loss_clip": 0.06311148, + "auxiliary_loss_mlp": 0.01248159, + "balance_loss_clip": 0.06257018, + "balance_loss_mlp": 0.01247038, + "epoch": 0.9534345408086577, + "flos": 61070270209920.0, + "grad_norm": 0.6905807509758151, + "language_loss": 0.62508583, + "learning_rate": 2.267404932183803e-08, + "loss": 0.70067894, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01124573, + "step": 15858, + "time_per_iteration": 4.573625564575195 + }, + { + "auxiliary_loss_clip": 0.06398419, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06267797, + "balance_loss_mlp": 0.01254243, + "epoch": 0.9534946640613258, + "flos": 18957450193920.0, + "grad_norm": 1.4570268848956331, + "language_loss": 0.57324982, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.64986312, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08666992, + "step": 15859, + "time_per_iteration": 2.5377213954925537 + }, + { + "auxiliary_loss_clip": 0.06395449, + "auxiliary_loss_mlp": 0.01262921, + "balance_loss_clip": 0.06269926, + "balance_loss_mlp": 0.01254517, + "epoch": 0.9535547873139937, + "flos": 16659884839680.0, + "grad_norm": 1.9960585900313483, + "language_loss": 0.81999767, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.89658141, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08398438, + "step": 15860, + "time_per_iteration": 2.499480962753296 + }, + { + "auxiliary_loss_clip": 0.0640311, + "auxiliary_loss_mlp": 0.01263902, + "balance_loss_clip": 0.06270963, + "balance_loss_mlp": 0.01254842, + "epoch": 0.9536149105666617, + "flos": 20674048464000.0, + "grad_norm": 1.6914081967904189, + "language_loss": 0.67099893, + "learning_rate": 2.249895178891159e-08, + "loss": 0.74766904, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09057617, + "step": 15861, + "time_per_iteration": 2.5145528316497803 + }, + { + "auxiliary_loss_clip": 0.06404839, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06272271, + "balance_loss_mlp": 0.0125743, + "epoch": 0.9536750338193296, + "flos": 30708304531200.0, + "grad_norm": 1.7038056043376955, + "language_loss": 0.65918678, + "learning_rate": 2.244073591573037e-08, + "loss": 0.73590457, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09527588, + "step": 15862, + "time_per_iteration": 2.603203535079956 + }, + { + "auxiliary_loss_clip": 0.06399581, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274375, + "balance_loss_mlp": 0.01259566, + "epoch": 0.9537351570719976, + "flos": 20410559700480.0, + "grad_norm": 1.9688037838707206, + "language_loss": 0.67976749, + "learning_rate": 2.238259503179485e-08, + "loss": 0.75644457, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08566284, + "step": 15863, + "time_per_iteration": 2.4922752380371094 + }, + { + "auxiliary_loss_clip": 0.0639983, + "auxiliary_loss_mlp": 0.01266275, + "balance_loss_clip": 0.06269602, + "balance_loss_mlp": 0.01257543, + "epoch": 0.9537952803246655, + "flos": 29936076503040.0, + "grad_norm": 1.911779704928809, + "language_loss": 0.78732878, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.86398983, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08728027, + "step": 15864, + "time_per_iteration": 2.5733559131622314 + }, + { + "auxiliary_loss_clip": 0.06401516, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06272466, + "balance_loss_mlp": 0.01253886, + "epoch": 0.9538554035773336, + "flos": 20527580327040.0, + "grad_norm": 1.9107480949648576, + "language_loss": 0.59663749, + "learning_rate": 2.226653824047586e-08, + "loss": 0.67327815, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08660889, + "step": 15865, + "time_per_iteration": 2.497642993927002 + }, + { + "auxiliary_loss_clip": 0.06402111, + "auxiliary_loss_mlp": 0.01268229, + "balance_loss_clip": 0.06271199, + "balance_loss_mlp": 0.01259008, + "epoch": 0.9539155268300015, + "flos": 18412555092480.0, + "grad_norm": 1.675507337482719, + "language_loss": 0.69925714, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.77596056, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09222412, + "step": 15866, + "time_per_iteration": 4.007173299789429 + }, + { + "auxiliary_loss_clip": 0.06402818, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06272386, + "balance_loss_mlp": 0.01257846, + "epoch": 0.9539756500826695, + "flos": 26220425448960.0, + "grad_norm": 3.175329411462929, + "language_loss": 0.85554373, + "learning_rate": 2.215078143255855e-08, + "loss": 0.93224895, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09863281, + "step": 15867, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.06306315, + "auxiliary_loss_mlp": 0.01249791, + "balance_loss_clip": 0.06252342, + "balance_loss_mlp": 0.01248795, + "epoch": 0.9540357733353374, + "flos": 68310673989120.0, + "grad_norm": 0.7435794957212412, + "language_loss": 0.61859345, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.6941545, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00995636, + "step": 15868, + "time_per_iteration": 3.1682119369506836 + }, + { + "auxiliary_loss_clip": 0.06400545, + "auxiliary_loss_mlp": 0.01265566, + "balance_loss_clip": 0.06270404, + "balance_loss_mlp": 0.01256417, + "epoch": 0.9540958965880054, + "flos": 21294693256320.0, + "grad_norm": 1.9119613617330347, + "language_loss": 0.60321581, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.67987692, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0914917, + "step": 15869, + "time_per_iteration": 3.9212167263031006 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.01263992, + "balance_loss_clip": 0.06271598, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9541560198406733, + "flos": 19756819745280.0, + "grad_norm": 1.512796436338129, + "language_loss": 0.71245605, + "learning_rate": 2.197770872795579e-08, + "loss": 0.78911495, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08239746, + "step": 15870, + "time_per_iteration": 2.5445003509521484 + }, + { + "auxiliary_loss_clip": 0.06398092, + "auxiliary_loss_mlp": 0.01262742, + "balance_loss_clip": 0.06267514, + "balance_loss_mlp": 0.01253176, + "epoch": 0.9542161430933414, + "flos": 24722229646080.0, + "grad_norm": 2.7015684448513255, + "language_loss": 0.77101582, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.84762418, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09564209, + "step": 15871, + "time_per_iteration": 2.5978353023529053 + }, + { + "auxiliary_loss_clip": 0.06402687, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01256083, + "epoch": 0.9542762663460094, + "flos": 31073762113920.0, + "grad_norm": 1.836271204712955, + "language_loss": 0.58700699, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.663697, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10229492, + "step": 15872, + "time_per_iteration": 2.569619655609131 + }, + { + "auxiliary_loss_clip": 0.06405389, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06270651, + "balance_loss_mlp": 0.01255404, + "epoch": 0.9543363895986773, + "flos": 20782935244800.0, + "grad_norm": 1.4552113328660328, + "language_loss": 0.75296628, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.82967359, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09936523, + "step": 15873, + "time_per_iteration": 2.5089752674102783 + }, + { + "auxiliary_loss_clip": 0.06401756, + "auxiliary_loss_mlp": 0.01263022, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.01253205, + "epoch": 0.9543965128513453, + "flos": 24469725767040.0, + "grad_norm": 1.8570446909627079, + "language_loss": 0.62738776, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.70403558, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0980835, + "step": 15874, + "time_per_iteration": 2.529031276702881 + }, + { + "auxiliary_loss_clip": 0.06400266, + "auxiliary_loss_mlp": 0.01264719, + "balance_loss_clip": 0.06271619, + "balance_loss_mlp": 0.01255719, + "epoch": 0.9544566361040132, + "flos": 15265838062080.0, + "grad_norm": 2.078620235439226, + "language_loss": 0.89995027, + "learning_rate": 2.169075438538104e-08, + "loss": 0.97660017, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08990479, + "step": 15875, + "time_per_iteration": 2.4796183109283447 + }, + { + "auxiliary_loss_clip": 0.06407903, + "auxiliary_loss_mlp": 0.01264624, + "balance_loss_clip": 0.06271803, + "balance_loss_mlp": 0.01254408, + "epoch": 0.9545167593566812, + "flos": 25925434750080.0, + "grad_norm": 1.6683219273292442, + "language_loss": 0.67765808, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.75438333, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10211182, + "step": 15876, + "time_per_iteration": 2.5184824466705322 + }, + { + "auxiliary_loss_clip": 0.06402661, + "auxiliary_loss_mlp": 0.01266519, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9545768826093491, + "flos": 25635014098560.0, + "grad_norm": 1.790004894907314, + "language_loss": 0.69065815, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.76734996, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09967041, + "step": 15877, + "time_per_iteration": 2.5234038829803467 + }, + { + "auxiliary_loss_clip": 0.06403767, + "auxiliary_loss_mlp": 0.01263566, + "balance_loss_clip": 0.06269973, + "balance_loss_mlp": 0.01253815, + "epoch": 0.9546370058620172, + "flos": 22497982214400.0, + "grad_norm": 2.586759661224603, + "language_loss": 0.70764804, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.78432131, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09741211, + "step": 15878, + "time_per_iteration": 2.5088722705841064 + }, + { + "auxiliary_loss_clip": 0.06397127, + "auxiliary_loss_mlp": 0.01262444, + "balance_loss_clip": 0.06268129, + "balance_loss_mlp": 0.01253634, + "epoch": 0.9546971291146851, + "flos": 24616738955520.0, + "grad_norm": 1.3740237570513218, + "language_loss": 0.68291056, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.75950634, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08813477, + "step": 15879, + "time_per_iteration": 2.5728976726531982 + }, + { + "auxiliary_loss_clip": 0.06399859, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.01253951, + "epoch": 0.9547572523673531, + "flos": 28665297480960.0, + "grad_norm": 1.9338134404565663, + "language_loss": 0.85166907, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.92829913, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09210205, + "step": 15880, + "time_per_iteration": 2.682302713394165 + }, + { + "auxiliary_loss_clip": 0.06399159, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06267743, + "balance_loss_mlp": 0.01255326, + "epoch": 0.954817375620021, + "flos": 33811067295360.0, + "grad_norm": 1.7252221713052975, + "language_loss": 0.72050363, + "learning_rate": 2.134888478151753e-08, + "loss": 0.79714215, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09362793, + "step": 15881, + "time_per_iteration": 2.6338717937469482 + }, + { + "auxiliary_loss_clip": 0.06399329, + "auxiliary_loss_mlp": 0.01264091, + "balance_loss_clip": 0.06269658, + "balance_loss_mlp": 0.01254596, + "epoch": 0.954877498872689, + "flos": 14433373347840.0, + "grad_norm": 2.028539816265887, + "language_loss": 0.72078586, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.79742002, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09490967, + "step": 15882, + "time_per_iteration": 2.4859602451324463 + }, + { + "auxiliary_loss_clip": 0.06404308, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01255748, + "epoch": 0.9549376221253569, + "flos": 59282129681280.0, + "grad_norm": 1.6753503027814232, + "language_loss": 0.66631484, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.74300539, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09002686, + "step": 15883, + "time_per_iteration": 2.858281373977661 + }, + { + "auxiliary_loss_clip": 0.06403692, + "auxiliary_loss_mlp": 0.01264383, + "balance_loss_clip": 0.06270359, + "balance_loss_mlp": 0.01254167, + "epoch": 0.954997745378025, + "flos": 17280068434560.0, + "grad_norm": 2.018487507978806, + "language_loss": 0.77985692, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.85653764, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10223389, + "step": 15884, + "time_per_iteration": 2.4717769622802734 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01263008, + "balance_loss_clip": 0.06270074, + "balance_loss_mlp": 0.01253949, + "epoch": 0.955057868630693, + "flos": 13011472287360.0, + "grad_norm": 1.7174754271027919, + "language_loss": 0.7789489, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.85560703, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09057617, + "step": 15885, + "time_per_iteration": 2.47308087348938 + }, + { + "auxiliary_loss_clip": 0.06403592, + "auxiliary_loss_mlp": 0.01264253, + "balance_loss_clip": 0.062719, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9551179918833609, + "flos": 22644240716160.0, + "grad_norm": 1.6851003761813457, + "language_loss": 0.70151675, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.77819514, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08929443, + "step": 15886, + "time_per_iteration": 2.5218918323516846 + }, + { + "auxiliary_loss_clip": 0.0640685, + "auxiliary_loss_mlp": 0.01264104, + "balance_loss_clip": 0.0627156, + "balance_loss_mlp": 0.01253261, + "epoch": 0.9551781151360289, + "flos": 21549125779200.0, + "grad_norm": 1.64294120083005, + "language_loss": 0.72599673, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.8027063, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10852051, + "step": 15887, + "time_per_iteration": 2.5182785987854004 + }, + { + "auxiliary_loss_clip": 0.06395856, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.01254534, + "epoch": 0.9552382383886968, + "flos": 20708191802880.0, + "grad_norm": 1.9888087849985687, + "language_loss": 0.57053173, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.64712757, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.09197998, + "step": 15888, + "time_per_iteration": 2.4768011569976807 + }, + { + "auxiliary_loss_clip": 0.06306279, + "auxiliary_loss_mlp": 0.01249003, + "balance_loss_clip": 0.06252466, + "balance_loss_mlp": 0.01247993, + "epoch": 0.9552983616413648, + "flos": 67789859736960.0, + "grad_norm": 0.6952192032198299, + "language_loss": 0.57792616, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.65347898, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01010132, + "step": 15889, + "time_per_iteration": 3.172846794128418 + }, + { + "auxiliary_loss_clip": 0.06403498, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06267909, + "balance_loss_mlp": 0.01254278, + "epoch": 0.9553584848940327, + "flos": 21586413646080.0, + "grad_norm": 1.3421643090083555, + "language_loss": 0.66883469, + "learning_rate": 2.084114508877466e-08, + "loss": 0.74551147, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09906006, + "step": 15890, + "time_per_iteration": 3.8940742015838623 + }, + { + "auxiliary_loss_clip": 0.06402219, + "auxiliary_loss_mlp": 0.01263198, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.01254263, + "epoch": 0.9554186081467008, + "flos": 24215251317120.0, + "grad_norm": 1.4384385434971376, + "language_loss": 0.74144399, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.81809819, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08929443, + "step": 15891, + "time_per_iteration": 2.506944179534912 + }, + { + "auxiliary_loss_clip": 0.06399399, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06272076, + "balance_loss_mlp": 0.01254945, + "epoch": 0.9554787313993687, + "flos": 16256845900800.0, + "grad_norm": 1.9258127915032677, + "language_loss": 0.78508484, + "learning_rate": 2.072913954011435e-08, + "loss": 0.86171877, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09039307, + "step": 15892, + "time_per_iteration": 2.4849460124969482 + }, + { + "auxiliary_loss_clip": 0.06401937, + "auxiliary_loss_mlp": 0.01264788, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9555388546520367, + "flos": 23410850520960.0, + "grad_norm": 4.041459820212515, + "language_loss": 0.69976628, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.77643347, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09741211, + "step": 15893, + "time_per_iteration": 2.5242111682891846 + }, + { + "auxiliary_loss_clip": 0.06400245, + "auxiliary_loss_mlp": 0.01265117, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.0125474, + "epoch": 0.9555989779047046, + "flos": 14799417909120.0, + "grad_norm": 1.7085696744264771, + "language_loss": 0.66091406, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.73756772, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.10375977, + "step": 15894, + "time_per_iteration": 2.4788177013397217 + }, + { + "auxiliary_loss_clip": 0.06402315, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06269212, + "balance_loss_mlp": 0.01255175, + "epoch": 0.9556591011573726, + "flos": 22243298129280.0, + "grad_norm": 1.7483661442382448, + "language_loss": 0.82017207, + "learning_rate": 2.056169412853581e-08, + "loss": 0.89684647, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09954834, + "step": 15895, + "time_per_iteration": 2.498887777328491 + }, + { + "auxiliary_loss_clip": 0.06403477, + "auxiliary_loss_mlp": 0.0126659, + "balance_loss_clip": 0.06272532, + "balance_loss_mlp": 0.01257476, + "epoch": 0.9557192244100405, + "flos": 27862741474560.0, + "grad_norm": 1.5507506491352763, + "language_loss": 0.72899592, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.80569655, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09118652, + "step": 15896, + "time_per_iteration": 2.564551830291748 + }, + { + "auxiliary_loss_clip": 0.06398298, + "auxiliary_loss_mlp": 0.01264488, + "balance_loss_clip": 0.06269994, + "balance_loss_mlp": 0.0125488, + "epoch": 0.9557793476627086, + "flos": 17608531639680.0, + "grad_norm": 1.9204607289870128, + "language_loss": 0.79759163, + "learning_rate": 2.045043915311706e-08, + "loss": 0.87421948, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09613037, + "step": 15897, + "time_per_iteration": 3.9504964351654053 + }, + { + "auxiliary_loss_clip": 0.06402426, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06270665, + "balance_loss_mlp": 0.01253601, + "epoch": 0.9558394709153766, + "flos": 23881798794240.0, + "grad_norm": 1.5071236590809027, + "language_loss": 0.72668207, + "learning_rate": 2.03949242614303e-08, + "loss": 0.80334115, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09884644, + "step": 15898, + "time_per_iteration": 2.5182039737701416 + }, + { + "auxiliary_loss_clip": 0.06307501, + "auxiliary_loss_mlp": 0.01250726, + "balance_loss_clip": 0.06253572, + "balance_loss_mlp": 0.01249622, + "epoch": 0.9558995941680445, + "flos": 53698995152640.0, + "grad_norm": 0.8900999457262652, + "language_loss": 0.52336156, + "learning_rate": 2.033948443656652e-08, + "loss": 0.59894383, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01106262, + "step": 15899, + "time_per_iteration": 3.0710113048553467 + }, + { + "auxiliary_loss_clip": 0.06409016, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.0125565, + "epoch": 0.9559597174207125, + "flos": 13768355018880.0, + "grad_norm": 3.1416892180470533, + "language_loss": 0.69164026, + "learning_rate": 2.028411968062782e-08, + "loss": 0.76839364, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10662842, + "step": 15900, + "time_per_iteration": 2.4697251319885254 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.01254553, + "epoch": 0.9560198406733804, + "flos": 19942210903680.0, + "grad_norm": 1.799845968546889, + "language_loss": 0.83136785, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.9080174, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09637451, + "step": 15901, + "time_per_iteration": 2.4810070991516113 + }, + { + "auxiliary_loss_clip": 0.06309229, + "auxiliary_loss_mlp": 0.0125074, + "balance_loss_clip": 0.06255125, + "balance_loss_mlp": 0.01249663, + "epoch": 0.9560799639260484, + "flos": 57306388331520.0, + "grad_norm": 0.8422702355549128, + "language_loss": 0.54080284, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.61640251, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.54199219, + "router_z_loss_mlp": 0.01078033, + "step": 15902, + "time_per_iteration": 3.218306303024292 + }, + { + "auxiliary_loss_clip": 0.06393287, + "auxiliary_loss_mlp": 0.012633, + "balance_loss_clip": 0.06269377, + "balance_loss_mlp": 0.01255715, + "epoch": 0.9561400871787163, + "flos": 18923264928000.0, + "grad_norm": 1.5636157887301885, + "language_loss": 0.85598201, + "learning_rate": 2.01184758473425e-08, + "loss": 0.93254787, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.07583618, + "step": 15903, + "time_per_iteration": 2.540703773498535 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.0626982, + "balance_loss_mlp": 0.01256205, + "epoch": 0.9562002104313844, + "flos": 18044036835840.0, + "grad_norm": 2.1727192495909162, + "language_loss": 0.80775261, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.88440645, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08758545, + "step": 15904, + "time_per_iteration": 2.5238215923309326 + }, + { + "auxiliary_loss_clip": 0.06405573, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01253791, + "epoch": 0.9562603336840523, + "flos": 24724619487360.0, + "grad_norm": 2.309004230193841, + "language_loss": 0.60495961, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.68165493, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10180664, + "step": 15905, + "time_per_iteration": 2.5484659671783447 + }, + { + "auxiliary_loss_clip": 0.0639789, + "auxiliary_loss_mlp": 0.0126401, + "balance_loss_clip": 0.06268601, + "balance_loss_mlp": 0.01255028, + "epoch": 0.9563204569367203, + "flos": 21183332780160.0, + "grad_norm": 1.726875839834982, + "language_loss": 0.70595205, + "learning_rate": 1.995350770979254e-08, + "loss": 0.78257102, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08990479, + "step": 15906, + "time_per_iteration": 3.9245364665985107 + }, + { + "auxiliary_loss_clip": 0.06408137, + "auxiliary_loss_mlp": 0.01263171, + "balance_loss_clip": 0.06272523, + "balance_loss_mlp": 0.0125271, + "epoch": 0.9563805801893882, + "flos": 20235901864320.0, + "grad_norm": 1.7588326158627845, + "language_loss": 0.70970643, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.78641951, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10455322, + "step": 15907, + "time_per_iteration": 2.4972972869873047 + }, + { + "auxiliary_loss_clip": 0.06399702, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06271538, + "balance_loss_mlp": 0.01257567, + "epoch": 0.9564407034420562, + "flos": 25418079077760.0, + "grad_norm": 2.2506849509040543, + "language_loss": 0.70946819, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.78612697, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08605957, + "step": 15908, + "time_per_iteration": 2.5539722442626953 + }, + { + "auxiliary_loss_clip": 0.06401962, + "auxiliary_loss_mlp": 0.01264879, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.01256022, + "epoch": 0.9565008266947241, + "flos": 18629699748480.0, + "grad_norm": 2.0637627701483607, + "language_loss": 0.82866412, + "learning_rate": 1.978921532427802e-08, + "loss": 0.90533257, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08856201, + "step": 15909, + "time_per_iteration": 3.9678423404693604 + }, + { + "auxiliary_loss_clip": 0.06401636, + "auxiliary_loss_mlp": 0.01262877, + "balance_loss_clip": 0.06272514, + "balance_loss_mlp": 0.01253639, + "epoch": 0.9565609499473922, + "flos": 24868865491200.0, + "grad_norm": 1.7859019883624712, + "language_loss": 0.67964911, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.75629425, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09234619, + "step": 15910, + "time_per_iteration": 2.5445590019226074 + }, + { + "auxiliary_loss_clip": 0.06406734, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253886, + "epoch": 0.9566210732000601, + "flos": 21804858040320.0, + "grad_norm": 2.0219141580296256, + "language_loss": 0.74345183, + "learning_rate": 1.968006251276444e-08, + "loss": 0.82015193, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09393311, + "step": 15911, + "time_per_iteration": 2.5246856212615967 + }, + { + "auxiliary_loss_clip": 0.06402273, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06270364, + "balance_loss_mlp": 0.01257348, + "epoch": 0.9566811964527281, + "flos": 18703562722560.0, + "grad_norm": 1.7881819879076843, + "language_loss": 0.6983766, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.77506196, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.08911133, + "step": 15912, + "time_per_iteration": 2.4712233543395996 + }, + { + "auxiliary_loss_clip": 0.06402682, + "auxiliary_loss_mlp": 0.012665, + "balance_loss_clip": 0.06271908, + "balance_loss_mlp": 0.01257058, + "epoch": 0.9567413197053961, + "flos": 13004763960960.0, + "grad_norm": 3.3702578825008147, + "language_loss": 0.72631347, + "learning_rate": 1.95712100769696e-08, + "loss": 0.80300522, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09442139, + "step": 15913, + "time_per_iteration": 2.491908311843872 + }, + { + "auxiliary_loss_clip": 0.06399457, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06270806, + "balance_loss_mlp": 0.01257335, + "epoch": 0.956801442958064, + "flos": 19725401664000.0, + "grad_norm": 2.223834124894749, + "language_loss": 0.73728657, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.81394672, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09222412, + "step": 15914, + "time_per_iteration": 2.4964563846588135 + }, + { + "auxiliary_loss_clip": 0.06398837, + "auxiliary_loss_mlp": 0.01263467, + "balance_loss_clip": 0.06268872, + "balance_loss_mlp": 0.01253978, + "epoch": 0.956861566210732, + "flos": 18228631380480.0, + "grad_norm": 1.552289311371977, + "language_loss": 0.67290843, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.74953151, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09484863, + "step": 15915, + "time_per_iteration": 2.5190324783325195 + }, + { + "auxiliary_loss_clip": 0.06394604, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 0.06268029, + "balance_loss_mlp": 0.01255014, + "epoch": 0.9569216894634, + "flos": 22202949588480.0, + "grad_norm": 1.6620877394499343, + "language_loss": 0.64458013, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.72116756, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09124756, + "step": 15916, + "time_per_iteration": 2.491138219833374 + }, + { + "auxiliary_loss_clip": 0.06395577, + "auxiliary_loss_mlp": 0.01263704, + "balance_loss_clip": 0.0627159, + "balance_loss_mlp": 0.01255464, + "epoch": 0.956981812716068, + "flos": 21695719697280.0, + "grad_norm": 1.8254745953624876, + "language_loss": 0.80804276, + "learning_rate": 1.935440639853536e-08, + "loss": 0.88463557, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.08242798, + "step": 15917, + "time_per_iteration": 2.5050711631774902 + }, + { + "auxiliary_loss_clip": 0.06400816, + "auxiliary_loss_mlp": 0.01268269, + "balance_loss_clip": 0.06271309, + "balance_loss_mlp": 0.01258321, + "epoch": 0.9570419359687359, + "flos": 13996065288960.0, + "grad_norm": 1.542027352693381, + "language_loss": 0.73089451, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.80758536, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09960938, + "step": 15918, + "time_per_iteration": 2.4727392196655273 + }, + { + "auxiliary_loss_clip": 0.06310041, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06256156, + "balance_loss_mlp": 0.01249783, + "epoch": 0.9571020592214039, + "flos": 65219525015040.0, + "grad_norm": 0.6115592062767367, + "language_loss": 0.53111994, + "learning_rate": 1.924645518878032e-08, + "loss": 0.6067282, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0100174, + "step": 15919, + "time_per_iteration": 3.272456645965576 + }, + { + "auxiliary_loss_clip": 0.064109, + "auxiliary_loss_mlp": 0.01269147, + "balance_loss_clip": 0.06275045, + "balance_loss_mlp": 0.01258651, + "epoch": 0.9571621824740718, + "flos": 17389793756160.0, + "grad_norm": 2.6495483249351137, + "language_loss": 0.76336288, + "learning_rate": 1.919259224843972e-08, + "loss": 0.84016335, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1050415, + "step": 15920, + "time_per_iteration": 2.536787509918213 + }, + { + "auxiliary_loss_clip": 0.0640638, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06273204, + "balance_loss_mlp": 0.0125712, + "epoch": 0.9572223057267398, + "flos": 14543434085760.0, + "grad_norm": 1.7185782559349, + "language_loss": 0.79365337, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.87039036, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10198975, + "step": 15921, + "time_per_iteration": 2.5111634731292725 + }, + { + "auxiliary_loss_clip": 0.06408585, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01254396, + "epoch": 0.9572824289794077, + "flos": 33956151840000.0, + "grad_norm": 1.7702021043483893, + "language_loss": 0.5147, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.59143382, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10406494, + "step": 15922, + "time_per_iteration": 2.5946807861328125 + }, + { + "auxiliary_loss_clip": 0.06403722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06269565, + "balance_loss_mlp": 0.01255764, + "epoch": 0.9573425522320758, + "flos": 18700418194560.0, + "grad_norm": 1.9436710836250617, + "language_loss": 0.84095252, + "learning_rate": 1.903145411006557e-08, + "loss": 0.91764355, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09625244, + "step": 15923, + "time_per_iteration": 2.478198289871216 + }, + { + "auxiliary_loss_clip": 0.06399676, + "auxiliary_loss_mlp": 0.0126405, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01255187, + "epoch": 0.9574026754847437, + "flos": 28517571532800.0, + "grad_norm": 1.5492156766676946, + "language_loss": 0.7513963, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.82803351, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08862305, + "step": 15924, + "time_per_iteration": 2.5611090660095215 + }, + { + "auxiliary_loss_clip": 0.06402448, + "auxiliary_loss_mlp": 0.01262647, + "balance_loss_clip": 0.06271331, + "balance_loss_mlp": 0.01253594, + "epoch": 0.9574627987374117, + "flos": 24359203831680.0, + "grad_norm": 2.195724562368793, + "language_loss": 0.86041164, + "learning_rate": 1.892440427371711e-08, + "loss": 0.93706262, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09051514, + "step": 15925, + "time_per_iteration": 2.5580694675445557 + }, + { + "auxiliary_loss_clip": 0.06405063, + "auxiliary_loss_mlp": 0.01265178, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01255004, + "epoch": 0.9575229219900797, + "flos": 23516928190080.0, + "grad_norm": 1.83782139466113, + "language_loss": 0.76031494, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.8370173, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10174561, + "step": 15926, + "time_per_iteration": 2.6703908443450928 + }, + { + "auxiliary_loss_clip": 0.06407382, + "auxiliary_loss_mlp": 0.0126202, + "balance_loss_clip": 0.06274736, + "balance_loss_mlp": 0.01253759, + "epoch": 0.9575830452427476, + "flos": 22681486656000.0, + "grad_norm": 1.5772300841265903, + "language_loss": 0.78243768, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.85913169, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08276367, + "step": 15927, + "time_per_iteration": 2.569844961166382 + }, + { + "auxiliary_loss_clip": 0.06403775, + "auxiliary_loss_mlp": 0.01266085, + "balance_loss_clip": 0.06269503, + "balance_loss_mlp": 0.01255643, + "epoch": 0.9576431684954156, + "flos": 30493633570560.0, + "grad_norm": 1.749047653525374, + "language_loss": 0.68875557, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.76545417, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10449219, + "step": 15928, + "time_per_iteration": 2.6400134563446045 + }, + { + "auxiliary_loss_clip": 0.0640448, + "auxiliary_loss_mlp": 0.01264922, + "balance_loss_clip": 0.06272465, + "balance_loss_mlp": 0.01255296, + "epoch": 0.9577032917480836, + "flos": 21693497564160.0, + "grad_norm": 1.7657767995871196, + "language_loss": 0.82337755, + "learning_rate": 1.871120608822485e-08, + "loss": 0.90007156, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09631348, + "step": 15929, + "time_per_iteration": 2.537607431411743 + }, + { + "auxiliary_loss_clip": 0.06409724, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06272496, + "balance_loss_mlp": 0.01257663, + "epoch": 0.9577634150007516, + "flos": 29030838917760.0, + "grad_norm": 1.3603689969387036, + "language_loss": 0.72440124, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.80117309, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09802246, + "step": 15930, + "time_per_iteration": 4.076937198638916 + }, + { + "auxiliary_loss_clip": 0.06400728, + "auxiliary_loss_mlp": 0.01262169, + "balance_loss_clip": 0.0627092, + "balance_loss_mlp": 0.01253586, + "epoch": 0.9578235382534195, + "flos": 19288429021440.0, + "grad_norm": 1.7666162202134825, + "language_loss": 0.62475115, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.70138013, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08587646, + "step": 15931, + "time_per_iteration": 2.4878103733062744 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01256822, + "epoch": 0.9578836615060875, + "flos": 13704428753280.0, + "grad_norm": 3.5194186637129548, + "language_loss": 0.69838828, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.77502394, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08453369, + "step": 15932, + "time_per_iteration": 2.5196003913879395 + }, + { + "auxiliary_loss_clip": 0.06407235, + "auxiliary_loss_mlp": 0.01272005, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.0126152, + "epoch": 0.9579437847587554, + "flos": 17059988885760.0, + "grad_norm": 1.7465631161736164, + "language_loss": 0.75582886, + "learning_rate": 1.849920999338961e-08, + "loss": 0.83262122, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1048584, + "step": 15933, + "time_per_iteration": 2.5064492225646973 + }, + { + "auxiliary_loss_clip": 0.06308126, + "auxiliary_loss_mlp": 0.01248499, + "balance_loss_clip": 0.06254178, + "balance_loss_mlp": 0.01247536, + "epoch": 0.9580039080114234, + "flos": 60587875854720.0, + "grad_norm": 0.7159109651995939, + "language_loss": 0.57357532, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.64914161, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00962067, + "step": 15934, + "time_per_iteration": 3.2443442344665527 + }, + { + "auxiliary_loss_clip": 0.06307364, + "auxiliary_loss_mlp": 0.01247753, + "balance_loss_clip": 0.06253395, + "balance_loss_mlp": 0.01246772, + "epoch": 0.9580640312640913, + "flos": 66254837264640.0, + "grad_norm": 0.9651737078828977, + "language_loss": 0.65949249, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.7350437, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00979614, + "step": 15935, + "time_per_iteration": 3.065608501434326 + }, + { + "auxiliary_loss_clip": 0.0630898, + "auxiliary_loss_mlp": 0.01251402, + "balance_loss_clip": 0.06254997, + "balance_loss_mlp": 0.01250436, + "epoch": 0.9581241545167594, + "flos": 62236145520000.0, + "grad_norm": 0.7631981636188135, + "language_loss": 0.56839162, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.64399546, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00964355, + "step": 15936, + "time_per_iteration": 3.1163625717163086 + }, + { + "auxiliary_loss_clip": 0.06401271, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06268574, + "balance_loss_mlp": 0.01257687, + "epoch": 0.9581842777694273, + "flos": 23774714876160.0, + "grad_norm": 1.5239589067044021, + "language_loss": 0.78735429, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.86403823, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09448242, + "step": 15937, + "time_per_iteration": 4.0462646484375 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01254339, + "epoch": 0.9582444010220953, + "flos": 21219111273600.0, + "grad_norm": 1.677321670215532, + "language_loss": 0.68562138, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.76228583, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10064697, + "step": 15938, + "time_per_iteration": 2.497121572494507 + }, + { + "auxiliary_loss_clip": 0.06402011, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06271237, + "balance_loss_mlp": 0.01254625, + "epoch": 0.9583045242747633, + "flos": 23811876961920.0, + "grad_norm": 2.306411620688474, + "language_loss": 0.66241562, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.73907411, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09210205, + "step": 15939, + "time_per_iteration": 2.526710033416748 + }, + { + "auxiliary_loss_clip": 0.06401028, + "auxiliary_loss_mlp": 0.01261972, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01253108, + "epoch": 0.9583646475274312, + "flos": 24137908398720.0, + "grad_norm": 1.9566475767780982, + "language_loss": 0.73915648, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.81578648, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08868408, + "step": 15940, + "time_per_iteration": 2.561065673828125 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06271879, + "balance_loss_mlp": 0.0125512, + "epoch": 0.9584247707800992, + "flos": 20892954055680.0, + "grad_norm": 1.7935762593019313, + "language_loss": 0.73054647, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.80724633, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09997559, + "step": 15941, + "time_per_iteration": 2.5311267375946045 + }, + { + "auxiliary_loss_clip": 0.06403222, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06272411, + "balance_loss_mlp": 0.01256561, + "epoch": 0.9584848940327672, + "flos": 26074753925760.0, + "grad_norm": 2.6796518959373086, + "language_loss": 0.7163468, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.79303896, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09429932, + "step": 15942, + "time_per_iteration": 2.5477967262268066 + }, + { + "auxiliary_loss_clip": 0.06404561, + "auxiliary_loss_mlp": 0.0126497, + "balance_loss_clip": 0.06271345, + "balance_loss_mlp": 0.01254951, + "epoch": 0.9585450172854352, + "flos": 34501088868480.0, + "grad_norm": 1.640983954823699, + "language_loss": 0.72097212, + "learning_rate": 1.797447974521571e-08, + "loss": 0.79766738, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10021973, + "step": 15943, + "time_per_iteration": 2.6213395595550537 + }, + { + "auxiliary_loss_clip": 0.06406368, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06272337, + "balance_loss_mlp": 0.01256744, + "epoch": 0.9586051405381031, + "flos": 23117159560320.0, + "grad_norm": 2.4382664366899873, + "language_loss": 0.68584573, + "learning_rate": 1.792242006001965e-08, + "loss": 0.76257586, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09906006, + "step": 15944, + "time_per_iteration": 2.527688503265381 + }, + { + "auxiliary_loss_clip": 0.06400511, + "auxiliary_loss_mlp": 0.01265871, + "balance_loss_clip": 0.06268411, + "balance_loss_mlp": 0.01255964, + "epoch": 0.9586652637907711, + "flos": 19609135724160.0, + "grad_norm": 1.9938870353448976, + "language_loss": 0.66536617, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.74202991, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09912109, + "step": 15945, + "time_per_iteration": 3.923600673675537 + }, + { + "auxiliary_loss_clip": 0.06310786, + "auxiliary_loss_mlp": 0.01252735, + "balance_loss_clip": 0.06256623, + "balance_loss_mlp": 0.01251638, + "epoch": 0.958725387043439, + "flos": 72093815107200.0, + "grad_norm": 0.7394875290848417, + "language_loss": 0.61828369, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.69391894, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01098633, + "step": 15946, + "time_per_iteration": 3.237788438796997 + }, + { + "auxiliary_loss_clip": 0.06400455, + "auxiliary_loss_mlp": 0.01263062, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01254151, + "epoch": 0.958785510296107, + "flos": 28919310733440.0, + "grad_norm": 1.520574817813325, + "language_loss": 0.75433493, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.83097005, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08917236, + "step": 15947, + "time_per_iteration": 2.5902247428894043 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.0126601, + "balance_loss_clip": 0.06268102, + "balance_loss_mlp": 0.01256682, + "epoch": 0.958845633548775, + "flos": 18482854268160.0, + "grad_norm": 2.0981305077445676, + "language_loss": 0.70112932, + "learning_rate": 1.771493294473747e-08, + "loss": 0.77778053, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09320068, + "step": 15948, + "time_per_iteration": 3.957618236541748 + }, + { + "auxiliary_loss_clip": 0.06398233, + "auxiliary_loss_mlp": 0.01262024, + "balance_loss_clip": 0.06268825, + "balance_loss_mlp": 0.01252362, + "epoch": 0.958905756801443, + "flos": 24213783870720.0, + "grad_norm": 7.304958158083634, + "language_loss": 0.7873342, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.86393678, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09661865, + "step": 15949, + "time_per_iteration": 2.538614511489868 + }, + { + "auxiliary_loss_clip": 0.06403197, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.0627002, + "balance_loss_mlp": 0.01253629, + "epoch": 0.9589658800541109, + "flos": 25014662795520.0, + "grad_norm": 1.8198938167398784, + "language_loss": 0.69052678, + "learning_rate": 1.761164038992602e-08, + "loss": 0.76718783, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09283447, + "step": 15950, + "time_per_iteration": 2.5288169384002686 + }, + { + "auxiliary_loss_clip": 0.06401816, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06269851, + "balance_loss_mlp": 0.01259273, + "epoch": 0.9590260033067789, + "flos": 23521456310400.0, + "grad_norm": 1.6945586951033367, + "language_loss": 0.86529648, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.94199586, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08843994, + "step": 15951, + "time_per_iteration": 2.5392637252807617 + }, + { + "auxiliary_loss_clip": 0.06405854, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06270808, + "balance_loss_mlp": 0.01253703, + "epoch": 0.9590861265594469, + "flos": 25527427056000.0, + "grad_norm": 4.282815391208873, + "language_loss": 0.8056556, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.88235545, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10412598, + "step": 15952, + "time_per_iteration": 2.5456416606903076 + }, + { + "auxiliary_loss_clip": 0.06403787, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01256077, + "epoch": 0.9591462498121148, + "flos": 21185806475520.0, + "grad_norm": 1.9010894377049286, + "language_loss": 0.6990664, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.77576745, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10235596, + "step": 15953, + "time_per_iteration": 2.5646610260009766 + }, + { + "auxiliary_loss_clip": 0.06404779, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06272354, + "balance_loss_mlp": 0.01257177, + "epoch": 0.9592063730647828, + "flos": 21729024495360.0, + "grad_norm": 3.7840506918954557, + "language_loss": 0.58236861, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.65908968, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10150146, + "step": 15954, + "time_per_iteration": 2.501776933670044 + }, + { + "auxiliary_loss_clip": 0.06404117, + "auxiliary_loss_mlp": 0.01263181, + "balance_loss_clip": 0.06270336, + "balance_loss_mlp": 0.01253537, + "epoch": 0.9592664963174508, + "flos": 29897992022400.0, + "grad_norm": 2.9078911705966095, + "language_loss": 0.74191898, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.81859195, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09649658, + "step": 15955, + "time_per_iteration": 2.5696985721588135 + }, + { + "auxiliary_loss_clip": 0.0640043, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06268075, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9593266195701188, + "flos": 18004652616960.0, + "grad_norm": 1.8259803400807233, + "language_loss": 0.62581319, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.70248532, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10235596, + "step": 15956, + "time_per_iteration": 2.4732725620269775 + }, + { + "auxiliary_loss_clip": 0.06403741, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06271461, + "balance_loss_mlp": 0.01256088, + "epoch": 0.9593867428227867, + "flos": 18843364460160.0, + "grad_norm": 1.9369477994253566, + "language_loss": 0.60280073, + "learning_rate": 1.725248447997507e-08, + "loss": 0.67949355, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09454346, + "step": 15957, + "time_per_iteration": 2.504669427871704 + }, + { + "auxiliary_loss_clip": 0.0640239, + "auxiliary_loss_mlp": 0.01266865, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01255743, + "epoch": 0.9594468660754547, + "flos": 29574266572800.0, + "grad_norm": 2.0245547455705264, + "language_loss": 0.74410594, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.82079852, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.11120605, + "step": 15958, + "time_per_iteration": 2.5677356719970703 + }, + { + "auxiliary_loss_clip": 0.06397437, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06268004, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9595069893281226, + "flos": 20709365760000.0, + "grad_norm": 1.539498065951829, + "language_loss": 0.74628884, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.822923, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09417725, + "step": 15959, + "time_per_iteration": 2.536829710006714 + }, + { + "auxiliary_loss_clip": 0.06405512, + "auxiliary_loss_mlp": 0.01265512, + "balance_loss_clip": 0.06271296, + "balance_loss_mlp": 0.01255862, + "epoch": 0.9595671125807906, + "flos": 22459855806720.0, + "grad_norm": 1.9304133607099632, + "language_loss": 0.64810073, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.72481102, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09643555, + "step": 15960, + "time_per_iteration": 2.511936664581299 + }, + { + "auxiliary_loss_clip": 0.06398654, + "auxiliary_loss_mlp": 0.01263314, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.01254159, + "epoch": 0.9596272358334585, + "flos": 23922063480960.0, + "grad_norm": 1.6378255149464493, + "language_loss": 0.78098899, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.85760868, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09161377, + "step": 15961, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.06397168, + "auxiliary_loss_mlp": 0.01268616, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.0125961, + "epoch": 0.9596873590861266, + "flos": 17677740712320.0, + "grad_norm": 1.8261694186593203, + "language_loss": 0.76113975, + "learning_rate": 1.699820008484698e-08, + "loss": 0.83779764, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09008789, + "step": 15962, + "time_per_iteration": 2.462209939956665 + }, + { + "auxiliary_loss_clip": 0.06404586, + "auxiliary_loss_mlp": 0.01265561, + "balance_loss_clip": 0.06270142, + "balance_loss_mlp": 0.01255422, + "epoch": 0.9597474823387945, + "flos": 25815038595840.0, + "grad_norm": 2.1500884319333466, + "language_loss": 0.71985179, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.79655325, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10150146, + "step": 15963, + "time_per_iteration": 2.535642385482788 + }, + { + "auxiliary_loss_clip": 0.06392812, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06268126, + "balance_loss_mlp": 0.01256192, + "epoch": 0.9598076055914625, + "flos": 23775218000640.0, + "grad_norm": 1.3971515613610286, + "language_loss": 0.74030179, + "learning_rate": 1.689701268270527e-08, + "loss": 0.81687784, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.0859375, + "step": 15964, + "time_per_iteration": 2.52500581741333 + }, + { + "auxiliary_loss_clip": 0.06307586, + "auxiliary_loss_mlp": 0.0124987, + "balance_loss_clip": 0.06253596, + "balance_loss_mlp": 0.01248861, + "epoch": 0.9598677288441305, + "flos": 56531435045760.0, + "grad_norm": 0.8705968118534945, + "language_loss": 0.57773823, + "learning_rate": 1.684653177987161e-08, + "loss": 0.6533128, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01008606, + "step": 15965, + "time_per_iteration": 3.1062443256378174 + }, + { + "auxiliary_loss_clip": 0.06403217, + "auxiliary_loss_mlp": 0.01265006, + "balance_loss_clip": 0.06270359, + "balance_loss_mlp": 0.01255487, + "epoch": 0.9599278520967984, + "flos": 23003241534720.0, + "grad_norm": 1.6069333020666432, + "language_loss": 0.78958309, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.8662653, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09521484, + "step": 15966, + "time_per_iteration": 2.4983363151550293 + }, + { + "auxiliary_loss_clip": 0.06399991, + "auxiliary_loss_mlp": 0.01263589, + "balance_loss_clip": 0.06269903, + "balance_loss_mlp": 0.01254809, + "epoch": 0.9599879753494664, + "flos": 23046399187200.0, + "grad_norm": 1.7301576567619177, + "language_loss": 0.79460174, + "learning_rate": 1.674579558025102e-08, + "loss": 0.87123752, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08782959, + "step": 15967, + "time_per_iteration": 2.5906291007995605 + }, + { + "auxiliary_loss_clip": 0.06405335, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06271484, + "balance_loss_mlp": 0.01253546, + "epoch": 0.9600480986021344, + "flos": 16396731492480.0, + "grad_norm": 1.8178242289336397, + "language_loss": 0.80317146, + "learning_rate": 1.669554028728348e-08, + "loss": 0.87986517, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10491943, + "step": 15968, + "time_per_iteration": 2.5032947063446045 + }, + { + "auxiliary_loss_clip": 0.06406718, + "auxiliary_loss_mlp": 0.01266637, + "balance_loss_clip": 0.06270508, + "balance_loss_mlp": 0.01256469, + "epoch": 0.9601082218548024, + "flos": 24282741381120.0, + "grad_norm": 2.288236761604915, + "language_loss": 0.67642689, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.75316042, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10162354, + "step": 15969, + "time_per_iteration": 3.9625113010406494 + }, + { + "auxiliary_loss_clip": 0.06400546, + "auxiliary_loss_mlp": 0.01265001, + "balance_loss_clip": 0.06270244, + "balance_loss_mlp": 0.01255893, + "epoch": 0.9601683451074703, + "flos": 19616137539840.0, + "grad_norm": 2.845353279559271, + "language_loss": 0.79347444, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.87012994, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09112549, + "step": 15970, + "time_per_iteration": 2.5543136596679688 + }, + { + "auxiliary_loss_clip": 0.06397574, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01252437, + "epoch": 0.9602284683601383, + "flos": 26658320486400.0, + "grad_norm": 1.6064611852721693, + "language_loss": 0.77587306, + "learning_rate": 1.654522565861316e-08, + "loss": 0.85247523, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.10198975, + "step": 15971, + "time_per_iteration": 2.5803046226501465 + }, + { + "auxiliary_loss_clip": 0.0640654, + "auxiliary_loss_mlp": 0.0127055, + "balance_loss_clip": 0.06269947, + "balance_loss_mlp": 0.01260501, + "epoch": 0.9602885916128062, + "flos": 15558564700800.0, + "grad_norm": 1.7619680373804267, + "language_loss": 0.67380464, + "learning_rate": 1.64952712054669e-08, + "loss": 0.75057554, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10046387, + "step": 15972, + "time_per_iteration": 2.498838186264038 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06271423, + "balance_loss_mlp": 0.0125734, + "epoch": 0.9603487148654742, + "flos": 16506918011520.0, + "grad_norm": 2.00764116027108, + "language_loss": 0.76161063, + "learning_rate": 1.644539196701844e-08, + "loss": 0.83829796, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08862305, + "step": 15973, + "time_per_iteration": 2.4790399074554443 + }, + { + "auxiliary_loss_clip": 0.06398208, + "auxiliary_loss_mlp": 0.0126264, + "balance_loss_clip": 0.06269785, + "balance_loss_mlp": 0.01253265, + "epoch": 0.9604088381181421, + "flos": 20850844579200.0, + "grad_norm": 1.5560491123984277, + "language_loss": 0.6949749, + "learning_rate": 1.639558794515983e-08, + "loss": 0.77158332, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09368896, + "step": 15974, + "time_per_iteration": 2.5170116424560547 + }, + { + "auxiliary_loss_clip": 0.06401684, + "auxiliary_loss_mlp": 0.01263757, + "balance_loss_clip": 0.06267555, + "balance_loss_mlp": 0.01254149, + "epoch": 0.9604689613708102, + "flos": 19689287754240.0, + "grad_norm": 1.9711138139103617, + "language_loss": 0.6806975, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.75735193, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0960083, + "step": 15975, + "time_per_iteration": 2.5701375007629395 + }, + { + "auxiliary_loss_clip": 0.06393464, + "auxiliary_loss_mlp": 0.0126123, + "balance_loss_clip": 0.06268396, + "balance_loss_mlp": 0.01252844, + "epoch": 0.9605290846234781, + "flos": 24104435892480.0, + "grad_norm": 1.8738118251682123, + "language_loss": 0.55862868, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.63517565, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08392334, + "step": 15976, + "time_per_iteration": 4.048995494842529 + }, + { + "auxiliary_loss_clip": 0.06395699, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06268542, + "balance_loss_mlp": 0.0126107, + "epoch": 0.9605892078761461, + "flos": 27129394540800.0, + "grad_norm": 1.8917776879450527, + "language_loss": 0.6844517, + "learning_rate": 1.624662719799219e-08, + "loss": 0.76110613, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08679199, + "step": 15977, + "time_per_iteration": 2.530975103378296 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01254839, + "epoch": 0.9606493311288141, + "flos": 14142114155520.0, + "grad_norm": 1.6662861951181476, + "language_loss": 0.82018828, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.89683688, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0927124, + "step": 15978, + "time_per_iteration": 2.44873046875 + }, + { + "auxiliary_loss_clip": 0.06404868, + "auxiliary_loss_mlp": 0.01262098, + "balance_loss_clip": 0.06270764, + "balance_loss_mlp": 0.01251995, + "epoch": 0.960709454381482, + "flos": 15818489665920.0, + "grad_norm": 2.0740905644965864, + "language_loss": 0.83917105, + "learning_rate": 1.614769615070921e-08, + "loss": 0.91584074, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10107422, + "step": 15979, + "time_per_iteration": 2.4589617252349854 + }, + { + "auxiliary_loss_clip": 0.06404734, + "auxiliary_loss_mlp": 0.01263469, + "balance_loss_clip": 0.062713, + "balance_loss_mlp": 0.01254731, + "epoch": 0.96076957763415, + "flos": 22572054823680.0, + "grad_norm": 1.4954834953684717, + "language_loss": 0.79959273, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.87627476, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.08734131, + "step": 15980, + "time_per_iteration": 2.511533498764038 + }, + { + "auxiliary_loss_clip": 0.0640362, + "auxiliary_loss_mlp": 0.01263144, + "balance_loss_clip": 0.06268869, + "balance_loss_mlp": 0.01253697, + "epoch": 0.960829700886818, + "flos": 24688212088320.0, + "grad_norm": 1.903020531997726, + "language_loss": 0.68203151, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.75869906, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09442139, + "step": 15981, + "time_per_iteration": 2.520338535308838 + }, + { + "auxiliary_loss_clip": 0.06401807, + "auxiliary_loss_mlp": 0.01266037, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.01256608, + "epoch": 0.960889824139486, + "flos": 26549517559680.0, + "grad_norm": 1.448278163725355, + "language_loss": 0.70106196, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.77774036, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09429932, + "step": 15982, + "time_per_iteration": 2.5638973712921143 + }, + { + "auxiliary_loss_clip": 0.06308071, + "auxiliary_loss_mlp": 0.01250914, + "balance_loss_clip": 0.06253908, + "balance_loss_mlp": 0.01249987, + "epoch": 0.9609499473921539, + "flos": 71133638371200.0, + "grad_norm": 0.6588987615366447, + "language_loss": 0.53301847, + "learning_rate": 1.595073680563286e-08, + "loss": 0.60860837, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00924683, + "step": 15983, + "time_per_iteration": 3.2202537059783936 + }, + { + "auxiliary_loss_clip": 0.06403141, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06271559, + "balance_loss_mlp": 0.01255255, + "epoch": 0.9610100706448219, + "flos": 20557740597120.0, + "grad_norm": 2.132875740331415, + "language_loss": 0.67696095, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.75363725, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09243774, + "step": 15984, + "time_per_iteration": 2.4828972816467285 + }, + { + "auxiliary_loss_clip": 0.06395225, + "auxiliary_loss_mlp": 0.01264558, + "balance_loss_clip": 0.06270009, + "balance_loss_mlp": 0.01255748, + "epoch": 0.9610701938974898, + "flos": 14069425138560.0, + "grad_norm": 1.4813244917974475, + "language_loss": 0.6780051, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.75460297, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.0880127, + "step": 15985, + "time_per_iteration": 3.950624704360962 + }, + { + "auxiliary_loss_clip": 0.06401645, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06269781, + "balance_loss_mlp": 0.01256938, + "epoch": 0.9611303171501578, + "flos": 20236195353600.0, + "grad_norm": 1.7938469650350048, + "language_loss": 0.7897535, + "learning_rate": 1.580380726142283e-08, + "loss": 0.86643624, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09686279, + "step": 15986, + "time_per_iteration": 2.4934823513031006 + }, + { + "auxiliary_loss_clip": 0.06401192, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06271122, + "balance_loss_mlp": 0.012566, + "epoch": 0.9611904404028258, + "flos": 20955957926400.0, + "grad_norm": 2.0809357131228423, + "language_loss": 0.63982856, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.71650976, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10333252, + "step": 15987, + "time_per_iteration": 2.4966821670532227 + }, + { + "auxiliary_loss_clip": 0.06399138, + "auxiliary_loss_mlp": 0.01263515, + "balance_loss_clip": 0.06273428, + "balance_loss_mlp": 0.01255147, + "epoch": 0.9612505636554938, + "flos": 24834806006400.0, + "grad_norm": 1.5786304249382652, + "language_loss": 0.67162049, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.74824703, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08361816, + "step": 15988, + "time_per_iteration": 4.050363540649414 + }, + { + "auxiliary_loss_clip": 0.06401965, + "auxiliary_loss_mlp": 0.01266454, + "balance_loss_clip": 0.06269932, + "balance_loss_mlp": 0.01257531, + "epoch": 0.9613106869081617, + "flos": 17170636602240.0, + "grad_norm": 1.9067338568780405, + "language_loss": 0.7483418, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.82502604, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08929443, + "step": 15989, + "time_per_iteration": 2.4900639057159424 + }, + { + "auxiliary_loss_clip": 0.06311363, + "auxiliary_loss_mlp": 0.01250371, + "balance_loss_clip": 0.06257341, + "balance_loss_mlp": 0.01249386, + "epoch": 0.9613708101608297, + "flos": 61582279783680.0, + "grad_norm": 0.7995098975386216, + "language_loss": 0.63284862, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.70846593, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00983429, + "step": 15990, + "time_per_iteration": 3.012808322906494 + }, + { + "auxiliary_loss_clip": 0.06398995, + "auxiliary_loss_mlp": 0.01262542, + "balance_loss_clip": 0.06267406, + "balance_loss_mlp": 0.01253172, + "epoch": 0.9614309334134977, + "flos": 27425349561600.0, + "grad_norm": 1.9733105896619667, + "language_loss": 0.78653449, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.86314988, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09375, + "step": 15991, + "time_per_iteration": 2.575064182281494 + }, + { + "auxiliary_loss_clip": 0.06408799, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06271989, + "balance_loss_mlp": 0.01257107, + "epoch": 0.9614910566661656, + "flos": 22825564951680.0, + "grad_norm": 3.560030551697313, + "language_loss": 0.85130018, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.92806387, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10467529, + "step": 15992, + "time_per_iteration": 2.5204951763153076 + }, + { + "auxiliary_loss_clip": 0.06402579, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06270155, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9615511799188337, + "flos": 20674090391040.0, + "grad_norm": 1.9027763344253423, + "language_loss": 0.73045832, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.80715036, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10021973, + "step": 15993, + "time_per_iteration": 2.528385877609253 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01263253, + "balance_loss_clip": 0.06269389, + "balance_loss_mlp": 0.0125368, + "epoch": 0.9616113031715016, + "flos": 33158123953920.0, + "grad_norm": 1.5426026145316933, + "language_loss": 0.68642288, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.76307219, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09564209, + "step": 15994, + "time_per_iteration": 2.6216301918029785 + }, + { + "auxiliary_loss_clip": 0.06400389, + "auxiliary_loss_mlp": 0.01263471, + "balance_loss_clip": 0.06268929, + "balance_loss_mlp": 0.01254566, + "epoch": 0.9616714264241696, + "flos": 25016843001600.0, + "grad_norm": 1.8467550508155814, + "language_loss": 0.84644687, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.92308545, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08892822, + "step": 15995, + "time_per_iteration": 2.563554525375366 + }, + { + "auxiliary_loss_clip": 0.06411675, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01257221, + "epoch": 0.9617315496768375, + "flos": 13551629633280.0, + "grad_norm": 2.057298976603726, + "language_loss": 0.76097316, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.83776593, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10394287, + "step": 15996, + "time_per_iteration": 2.478343963623047 + }, + { + "auxiliary_loss_clip": 0.06398165, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06267761, + "balance_loss_mlp": 0.01255939, + "epoch": 0.9617916729295055, + "flos": 11259221304960.0, + "grad_norm": 1.8482484197146472, + "language_loss": 0.77136171, + "learning_rate": 1.52708595287494e-08, + "loss": 0.84799695, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09423828, + "step": 15997, + "time_per_iteration": 2.5085597038269043 + }, + { + "auxiliary_loss_clip": 0.06397088, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255344, + "epoch": 0.9618517961821734, + "flos": 22826235784320.0, + "grad_norm": 1.5938896462134406, + "language_loss": 0.67285407, + "learning_rate": 1.522286126505001e-08, + "loss": 0.7494663, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08789062, + "step": 15998, + "time_per_iteration": 2.5118253231048584 + }, + { + "auxiliary_loss_clip": 0.06399897, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255072, + "epoch": 0.9619119194348414, + "flos": 16622848535040.0, + "grad_norm": 1.496371845917081, + "language_loss": 0.72930491, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.80594993, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09527588, + "step": 15999, + "time_per_iteration": 2.506864547729492 + }, + { + "auxiliary_loss_clip": 0.06395978, + "auxiliary_loss_mlp": 0.01265996, + "balance_loss_clip": 0.06269437, + "balance_loss_mlp": 0.01257574, + "epoch": 0.9619720426875094, + "flos": 24542037440640.0, + "grad_norm": 1.6415628522989876, + "language_loss": 0.65517807, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.73179787, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08422852, + "step": 16000, + "time_per_iteration": 2.5260074138641357 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01263738, + "balance_loss_clip": 0.06270752, + "balance_loss_mlp": 0.01253295, + "epoch": 0.9620321659401774, + "flos": 20638647313920.0, + "grad_norm": 3.855036180657502, + "language_loss": 0.75523168, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.83189702, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10430908, + "step": 16001, + "time_per_iteration": 2.5201330184936523 + }, + { + "auxiliary_loss_clip": 0.06402686, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.06271547, + "balance_loss_mlp": 0.01256653, + "epoch": 0.9620922891928453, + "flos": 18521232238080.0, + "grad_norm": 1.6547442520201165, + "language_loss": 0.68397254, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.76066214, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09619141, + "step": 16002, + "time_per_iteration": 2.483081817626953 + }, + { + "auxiliary_loss_clip": 0.06401039, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01256532, + "epoch": 0.9621524124455133, + "flos": 28774980875520.0, + "grad_norm": 1.195438695245258, + "language_loss": 0.64683259, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.72350204, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09375, + "step": 16003, + "time_per_iteration": 2.586229085922241 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.0126463, + "balance_loss_clip": 0.06271525, + "balance_loss_mlp": 0.01255141, + "epoch": 0.9622125356981813, + "flos": 19104882652800.0, + "grad_norm": 1.7948469305878696, + "language_loss": 0.7638011, + "learning_rate": 1.493645226826512e-08, + "loss": 0.8404758, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09490967, + "step": 16004, + "time_per_iteration": 2.493025541305542 + }, + { + "auxiliary_loss_clip": 0.06399065, + "auxiliary_loss_mlp": 0.01264795, + "balance_loss_clip": 0.06270385, + "balance_loss_mlp": 0.01255223, + "epoch": 0.9622726589508492, + "flos": 20309010151680.0, + "grad_norm": 1.9981031350559504, + "language_loss": 0.79513681, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.87177539, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09570312, + "step": 16005, + "time_per_iteration": 2.5306947231292725 + }, + { + "auxiliary_loss_clip": 0.06400214, + "auxiliary_loss_mlp": 0.01262513, + "balance_loss_clip": 0.06271853, + "balance_loss_mlp": 0.01253435, + "epoch": 0.9623327822035173, + "flos": 54942060401280.0, + "grad_norm": 1.8020406678297956, + "language_loss": 0.68003178, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.75665909, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09075928, + "step": 16006, + "time_per_iteration": 2.816959857940674 + }, + { + "auxiliary_loss_clip": 0.06397587, + "auxiliary_loss_mlp": 0.012623, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01253276, + "epoch": 0.9623929054561852, + "flos": 21764928769920.0, + "grad_norm": 1.5013515092363827, + "language_loss": 0.78550291, + "learning_rate": 1.479426394188521e-08, + "loss": 0.86210179, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.09020996, + "step": 16007, + "time_per_iteration": 2.5247249603271484 + }, + { + "auxiliary_loss_clip": 0.0640254, + "auxiliary_loss_mlp": 0.01264076, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01254373, + "epoch": 0.9624530287088532, + "flos": 17937414115200.0, + "grad_norm": 1.816767417350666, + "language_loss": 0.67981744, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.75648361, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09698486, + "step": 16008, + "time_per_iteration": 2.4601643085479736 + }, + { + "auxiliary_loss_clip": 0.06404279, + "auxiliary_loss_mlp": 0.01265584, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01255248, + "epoch": 0.9625131519615211, + "flos": 23259686555520.0, + "grad_norm": 2.1142432172822456, + "language_loss": 0.73074311, + "learning_rate": 1.469984811730529e-08, + "loss": 0.80744171, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10345459, + "step": 16009, + "time_per_iteration": 3.9339191913604736 + }, + { + "auxiliary_loss_clip": 0.06400783, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06271462, + "balance_loss_mlp": 0.01254382, + "epoch": 0.9625732752141891, + "flos": 18922636022400.0, + "grad_norm": 2.192710915297057, + "language_loss": 0.7549693, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.83160961, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08856201, + "step": 16010, + "time_per_iteration": 2.5013561248779297 + }, + { + "auxiliary_loss_clip": 0.06405942, + "auxiliary_loss_mlp": 0.0126574, + "balance_loss_clip": 0.0627039, + "balance_loss_mlp": 0.01254319, + "epoch": 0.962633398466857, + "flos": 16258439128320.0, + "grad_norm": 1.712569944229846, + "language_loss": 0.69567752, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.77239436, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11419678, + "step": 16011, + "time_per_iteration": 2.4701602458953857 + }, + { + "auxiliary_loss_clip": 0.06400087, + "auxiliary_loss_mlp": 0.01263956, + "balance_loss_clip": 0.06270588, + "balance_loss_mlp": 0.01255069, + "epoch": 0.962693521719525, + "flos": 54209174664960.0, + "grad_norm": 1.8665711044506685, + "language_loss": 0.68777549, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.76441598, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08886719, + "step": 16012, + "time_per_iteration": 2.7930734157562256 + }, + { + "auxiliary_loss_clip": 0.06411394, + "auxiliary_loss_mlp": 0.01265092, + "balance_loss_clip": 0.06273941, + "balance_loss_mlp": 0.01254423, + "epoch": 0.962753644972193, + "flos": 33113540782080.0, + "grad_norm": 2.007287931479522, + "language_loss": 0.72470278, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.8014676, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10668945, + "step": 16013, + "time_per_iteration": 2.607010841369629 + }, + { + "auxiliary_loss_clip": 0.06398678, + "auxiliary_loss_mlp": 0.01263775, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01253821, + "epoch": 0.962813768224861, + "flos": 42240504839040.0, + "grad_norm": 2.1001634109531433, + "language_loss": 0.63370138, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.71032596, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09954834, + "step": 16014, + "time_per_iteration": 2.7006850242614746 + }, + { + "auxiliary_loss_clip": 0.06395663, + "auxiliary_loss_mlp": 0.01264971, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01256478, + "epoch": 0.9628738914775289, + "flos": 43954671340800.0, + "grad_norm": 1.4423438502368708, + "language_loss": 0.72028565, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.79689205, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08483887, + "step": 16015, + "time_per_iteration": 2.7649402618408203 + }, + { + "auxiliary_loss_clip": 0.06401665, + "auxiliary_loss_mlp": 0.01265296, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01255414, + "epoch": 0.9629340147301969, + "flos": 15601596572160.0, + "grad_norm": 1.9682425360643256, + "language_loss": 0.77071536, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.84738493, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09881592, + "step": 16016, + "time_per_iteration": 4.025376796722412 + }, + { + "auxiliary_loss_clip": 0.06310678, + "auxiliary_loss_mlp": 0.01250101, + "balance_loss_clip": 0.06256417, + "balance_loss_mlp": 0.01249044, + "epoch": 0.9629941379828649, + "flos": 62969827870080.0, + "grad_norm": 0.8541107533621018, + "language_loss": 0.63163006, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.70723784, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01058197, + "step": 16017, + "time_per_iteration": 3.1209259033203125 + }, + { + "auxiliary_loss_clip": 0.06404077, + "auxiliary_loss_mlp": 0.01264759, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01255103, + "epoch": 0.9630542612355328, + "flos": 29907006336000.0, + "grad_norm": 1.7708678376407427, + "language_loss": 0.67122102, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.74790937, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09649658, + "step": 16018, + "time_per_iteration": 2.6136341094970703 + }, + { + "auxiliary_loss_clip": 0.06403263, + "auxiliary_loss_mlp": 0.01262583, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01253011, + "epoch": 0.9631143844882009, + "flos": 17900335883520.0, + "grad_norm": 1.6914005371501741, + "language_loss": 0.79650891, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.8731674, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0958252, + "step": 16019, + "time_per_iteration": 2.6144886016845703 + }, + { + "auxiliary_loss_clip": 0.06397004, + "auxiliary_loss_mlp": 0.01263057, + "balance_loss_clip": 0.06269085, + "balance_loss_mlp": 0.01254641, + "epoch": 0.9631745077408688, + "flos": 26146101277440.0, + "grad_norm": 1.351412513525788, + "language_loss": 0.71868813, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.79528868, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08410645, + "step": 16020, + "time_per_iteration": 2.537116765975952 + }, + { + "auxiliary_loss_clip": 0.06400692, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06269675, + "balance_loss_mlp": 0.01257555, + "epoch": 0.9632346309935368, + "flos": 24980729091840.0, + "grad_norm": 1.6112903009597273, + "language_loss": 0.76956975, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.84624374, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09161377, + "step": 16021, + "time_per_iteration": 2.5795507431030273 + }, + { + "auxiliary_loss_clip": 0.06410046, + "auxiliary_loss_mlp": 0.0126309, + "balance_loss_clip": 0.06272537, + "balance_loss_mlp": 0.01253065, + "epoch": 0.9632947542462047, + "flos": 23623005859200.0, + "grad_norm": 2.052482591151295, + "language_loss": 0.65333438, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.73006582, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10028076, + "step": 16022, + "time_per_iteration": 2.4925694465637207 + }, + { + "auxiliary_loss_clip": 0.06400712, + "auxiliary_loss_mlp": 0.0126106, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01251887, + "epoch": 0.9633548774988727, + "flos": 26402755933440.0, + "grad_norm": 2.041932123027993, + "language_loss": 0.73429894, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.81091666, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09179688, + "step": 16023, + "time_per_iteration": 2.6148433685302734 + }, + { + "auxiliary_loss_clip": 0.06398109, + "auxiliary_loss_mlp": 0.012652, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256456, + "epoch": 0.9634150007515406, + "flos": 23774295605760.0, + "grad_norm": 1.3771901625449594, + "language_loss": 0.8138119, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.89044499, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08746338, + "step": 16024, + "time_per_iteration": 2.521254539489746 + }, + { + "auxiliary_loss_clip": 0.06408462, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06271751, + "balance_loss_mlp": 0.01252671, + "epoch": 0.9634751240042086, + "flos": 24142436519040.0, + "grad_norm": 1.3519204413028436, + "language_loss": 0.81720084, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.89391065, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09857178, + "step": 16025, + "time_per_iteration": 4.056759595870972 + }, + { + "auxiliary_loss_clip": 0.06406177, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01254421, + "epoch": 0.9635352472568766, + "flos": 24355346544000.0, + "grad_norm": 1.6633226224806905, + "language_loss": 0.76957327, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.8462714, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09210205, + "step": 16026, + "time_per_iteration": 2.5177974700927734 + }, + { + "auxiliary_loss_clip": 0.0640067, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06268805, + "balance_loss_mlp": 0.01260993, + "epoch": 0.9635953705095446, + "flos": 23991062918400.0, + "grad_norm": 1.6668938865230072, + "language_loss": 0.6339668, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.7106812, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09777832, + "step": 16027, + "time_per_iteration": 2.5560450553894043 + }, + { + "auxiliary_loss_clip": 0.06407472, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257462, + "epoch": 0.9636554937622125, + "flos": 19834540007040.0, + "grad_norm": 1.6733275013477416, + "language_loss": 0.87025476, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.94700098, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09692383, + "step": 16028, + "time_per_iteration": 3.9252398014068604 + }, + { + "auxiliary_loss_clip": 0.0631086, + "auxiliary_loss_mlp": 0.01252273, + "balance_loss_clip": 0.06256698, + "balance_loss_mlp": 0.01251267, + "epoch": 0.9637156170148805, + "flos": 67454520497280.0, + "grad_norm": 0.6687418840467081, + "language_loss": 0.53127611, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.60690737, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01006317, + "step": 16029, + "time_per_iteration": 3.0885190963745117 + }, + { + "auxiliary_loss_clip": 0.06406175, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06271643, + "balance_loss_mlp": 0.01253562, + "epoch": 0.9637757402675484, + "flos": 20306788018560.0, + "grad_norm": 1.5288285449125392, + "language_loss": 0.74157113, + "learning_rate": 1.372666546129797e-08, + "loss": 0.81826651, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09796143, + "step": 16030, + "time_per_iteration": 2.5154569149017334 + }, + { + "auxiliary_loss_clip": 0.0639952, + "auxiliary_loss_mlp": 0.0126644, + "balance_loss_clip": 0.06270611, + "balance_loss_mlp": 0.01257249, + "epoch": 0.9638358635202164, + "flos": 27241803192960.0, + "grad_norm": 1.8304305412759827, + "language_loss": 0.65878218, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.7354418, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09191895, + "step": 16031, + "time_per_iteration": 2.5666158199310303 + }, + { + "auxiliary_loss_clip": 0.06310733, + "auxiliary_loss_mlp": 0.01250007, + "balance_loss_clip": 0.06256757, + "balance_loss_mlp": 0.0124902, + "epoch": 0.9638959867728845, + "flos": 70309768700160.0, + "grad_norm": 0.855502378370066, + "language_loss": 0.60727084, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.68287826, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00986481, + "step": 16032, + "time_per_iteration": 3.1735146045684814 + }, + { + "auxiliary_loss_clip": 0.06392821, + "auxiliary_loss_mlp": 0.01264604, + "balance_loss_clip": 0.06269621, + "balance_loss_mlp": 0.01256319, + "epoch": 0.9639561100255524, + "flos": 25414012154880.0, + "grad_norm": 1.8239636455461146, + "language_loss": 0.66663599, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.74321026, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.08282471, + "step": 16033, + "time_per_iteration": 2.56622314453125 + }, + { + "auxiliary_loss_clip": 0.06402284, + "auxiliary_loss_mlp": 0.01261476, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01252363, + "epoch": 0.9640162332782204, + "flos": 18119744599680.0, + "grad_norm": 1.7952029192998942, + "language_loss": 0.65676892, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.73340648, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09112549, + "step": 16034, + "time_per_iteration": 2.4902241230010986 + }, + { + "auxiliary_loss_clip": 0.06397773, + "auxiliary_loss_mlp": 0.01263891, + "balance_loss_clip": 0.06269251, + "balance_loss_mlp": 0.01254539, + "epoch": 0.9640763565308883, + "flos": 23446964430720.0, + "grad_norm": 2.2171721620826665, + "language_loss": 0.74419838, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.82081503, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09356689, + "step": 16035, + "time_per_iteration": 2.5261058807373047 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01260621, + "balance_loss_clip": 0.06272241, + "balance_loss_mlp": 0.01252038, + "epoch": 0.9641364797835563, + "flos": 22425964030080.0, + "grad_norm": 1.85699593571715, + "language_loss": 0.82645416, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.90308243, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08575439, + "step": 16036, + "time_per_iteration": 2.5058321952819824 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01262795, + "balance_loss_clip": 0.0626888, + "balance_loss_mlp": 0.01253478, + "epoch": 0.9641966030362242, + "flos": 30629284531200.0, + "grad_norm": 1.8406886485490508, + "language_loss": 0.70395046, + "learning_rate": 1.340965177371789e-08, + "loss": 0.78057343, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09320068, + "step": 16037, + "time_per_iteration": 2.5934836864471436 + }, + { + "auxiliary_loss_clip": 0.06400578, + "auxiliary_loss_mlp": 0.01265146, + "balance_loss_clip": 0.06268116, + "balance_loss_mlp": 0.0125602, + "epoch": 0.9642567262888923, + "flos": 20958347767680.0, + "grad_norm": 2.2949598508589024, + "language_loss": 0.63063121, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.7072885, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09124756, + "step": 16038, + "time_per_iteration": 2.528991460800171 + }, + { + "auxiliary_loss_clip": 0.06402931, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06269977, + "balance_loss_mlp": 0.01256867, + "epoch": 0.9643168495415602, + "flos": 22646253214080.0, + "grad_norm": 2.710323469198111, + "language_loss": 0.70936692, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.78606176, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09692383, + "step": 16039, + "time_per_iteration": 2.559022903442383 + }, + { + "auxiliary_loss_clip": 0.06404687, + "auxiliary_loss_mlp": 0.0126847, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01258567, + "epoch": 0.9643769727942282, + "flos": 20272435044480.0, + "grad_norm": 1.9949930425544389, + "language_loss": 0.73979366, + "learning_rate": 1.327491870605657e-08, + "loss": 0.81652522, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09902954, + "step": 16040, + "time_per_iteration": 2.500765562057495 + }, + { + "auxiliary_loss_clip": 0.06403273, + "auxiliary_loss_mlp": 0.01263933, + "balance_loss_clip": 0.06270061, + "balance_loss_mlp": 0.01254777, + "epoch": 0.9644370960468961, + "flos": 13887052727040.0, + "grad_norm": 1.8870655198248234, + "language_loss": 0.72925007, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.80592215, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09155273, + "step": 16041, + "time_per_iteration": 2.4811394214630127 + }, + { + "auxiliary_loss_clip": 0.06392974, + "auxiliary_loss_mlp": 0.01262963, + "balance_loss_clip": 0.06269765, + "balance_loss_mlp": 0.01254207, + "epoch": 0.9644972192995641, + "flos": 17243912597760.0, + "grad_norm": 1.6302297136942336, + "language_loss": 0.72166139, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.79822075, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.08764648, + "step": 16042, + "time_per_iteration": 2.464141368865967 + }, + { + "auxiliary_loss_clip": 0.06405792, + "auxiliary_loss_mlp": 0.01262034, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01253404, + "epoch": 0.964557342552232, + "flos": 23846858841600.0, + "grad_norm": 1.7036888779753476, + "language_loss": 0.81625164, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.89292991, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.08636475, + "step": 16043, + "time_per_iteration": 2.5336403846740723 + }, + { + "auxiliary_loss_clip": 0.0640493, + "auxiliary_loss_mlp": 0.012649, + "balance_loss_clip": 0.06274771, + "balance_loss_mlp": 0.0125607, + "epoch": 0.9646174658049, + "flos": 21659605787520.0, + "grad_norm": 1.4250533671062502, + "language_loss": 0.71966612, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.79636443, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08831787, + "step": 16044, + "time_per_iteration": 2.4873435497283936 + }, + { + "auxiliary_loss_clip": 0.06398635, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06268857, + "balance_loss_mlp": 0.01251611, + "epoch": 0.9646775890575681, + "flos": 17135403160320.0, + "grad_norm": 1.6962750102757636, + "language_loss": 0.70311677, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.77971309, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09387207, + "step": 16045, + "time_per_iteration": 2.5005507469177246 + }, + { + "auxiliary_loss_clip": 0.06401645, + "auxiliary_loss_mlp": 0.01264346, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01255405, + "epoch": 0.964737712310236, + "flos": 13010717600640.0, + "grad_norm": 1.7958263286958636, + "language_loss": 0.75115418, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.82781404, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0894165, + "step": 16046, + "time_per_iteration": 2.4769492149353027 + }, + { + "auxiliary_loss_clip": 0.06402702, + "auxiliary_loss_mlp": 0.0126378, + "balance_loss_clip": 0.06269902, + "balance_loss_mlp": 0.0125391, + "epoch": 0.964797835562904, + "flos": 24286011690240.0, + "grad_norm": 1.5485557136808419, + "language_loss": 0.62918746, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.70585227, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09863281, + "step": 16047, + "time_per_iteration": 2.5291333198547363 + }, + { + "auxiliary_loss_clip": 0.06404603, + "auxiliary_loss_mlp": 0.01265766, + "balance_loss_clip": 0.06273589, + "balance_loss_mlp": 0.01256468, + "epoch": 0.9648579588155719, + "flos": 20529089700480.0, + "grad_norm": 1.7369231208534281, + "language_loss": 0.69178629, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.76849002, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09295654, + "step": 16048, + "time_per_iteration": 3.8918800354003906 + }, + { + "auxiliary_loss_clip": 0.06404486, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.0627009, + "balance_loss_mlp": 0.01257027, + "epoch": 0.9649180820682399, + "flos": 32162042943360.0, + "grad_norm": 1.8321763154478243, + "language_loss": 0.63903487, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.71575201, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10186768, + "step": 16049, + "time_per_iteration": 2.570418119430542 + }, + { + "auxiliary_loss_clip": 0.06402539, + "auxiliary_loss_mlp": 0.01261874, + "balance_loss_clip": 0.06269829, + "balance_loss_mlp": 0.01252886, + "epoch": 0.9649782053209078, + "flos": 20528963919360.0, + "grad_norm": 1.7027576373675015, + "language_loss": 0.71291816, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.78956234, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08990479, + "step": 16050, + "time_per_iteration": 2.5361132621765137 + }, + { + "auxiliary_loss_clip": 0.06408322, + "auxiliary_loss_mlp": 0.01263097, + "balance_loss_clip": 0.06272562, + "balance_loss_mlp": 0.01252988, + "epoch": 0.9650383285735759, + "flos": 43077623454720.0, + "grad_norm": 1.7842069676990906, + "language_loss": 0.70066154, + "learning_rate": 1.278669873970606e-08, + "loss": 0.77737582, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10107422, + "step": 16051, + "time_per_iteration": 2.677975654602051 + }, + { + "auxiliary_loss_clip": 0.06307529, + "auxiliary_loss_mlp": 0.01252916, + "balance_loss_clip": 0.06253548, + "balance_loss_mlp": 0.01251916, + "epoch": 0.9650984518262438, + "flos": 61767083963520.0, + "grad_norm": 0.8182337392431096, + "language_loss": 0.59232974, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.66793418, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00999451, + "step": 16052, + "time_per_iteration": 3.1726770401000977 + }, + { + "auxiliary_loss_clip": 0.063959, + "auxiliary_loss_mlp": 0.01262271, + "balance_loss_clip": 0.06268722, + "balance_loss_mlp": 0.01253068, + "epoch": 0.9651585750789118, + "flos": 29797155233280.0, + "grad_norm": 1.566863639244542, + "language_loss": 0.74622291, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.82280469, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09204102, + "step": 16053, + "time_per_iteration": 2.576726198196411 + }, + { + "auxiliary_loss_clip": 0.06400575, + "auxiliary_loss_mlp": 0.01262414, + "balance_loss_clip": 0.06268197, + "balance_loss_mlp": 0.01252883, + "epoch": 0.9652186983315797, + "flos": 16878664650240.0, + "grad_norm": 1.8893492919268848, + "language_loss": 0.68987983, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.76650977, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09527588, + "step": 16054, + "time_per_iteration": 2.479691505432129 + }, + { + "auxiliary_loss_clip": 0.064014, + "auxiliary_loss_mlp": 0.01263325, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01254968, + "epoch": 0.9652788215842477, + "flos": 31657831799040.0, + "grad_norm": 1.4060273362324986, + "language_loss": 0.62068862, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.69733584, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08361816, + "step": 16055, + "time_per_iteration": 4.018486499786377 + }, + { + "auxiliary_loss_clip": 0.06399205, + "auxiliary_loss_mlp": 0.01264446, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01255433, + "epoch": 0.9653389448369156, + "flos": 24761236521600.0, + "grad_norm": 1.998915754260937, + "language_loss": 0.76546788, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.84210438, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09008789, + "step": 16056, + "time_per_iteration": 2.551823854446411 + }, + { + "auxiliary_loss_clip": 0.06400546, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06270006, + "balance_loss_mlp": 0.01258729, + "epoch": 0.9653990680895836, + "flos": 20302511460480.0, + "grad_norm": 1.692625022004946, + "language_loss": 0.72081912, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.79749864, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08679199, + "step": 16057, + "time_per_iteration": 2.5352344512939453 + }, + { + "auxiliary_loss_clip": 0.06398775, + "auxiliary_loss_mlp": 0.01263303, + "balance_loss_clip": 0.06269361, + "balance_loss_mlp": 0.01254189, + "epoch": 0.9654591913422517, + "flos": 22535395862400.0, + "grad_norm": 1.7376745718681348, + "language_loss": 0.71854722, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.79516792, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09118652, + "step": 16058, + "time_per_iteration": 2.5178182125091553 + }, + { + "auxiliary_loss_clip": 0.06398124, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.0125408, + "epoch": 0.9655193145949196, + "flos": 26770645284480.0, + "grad_norm": 1.546516279721211, + "language_loss": 0.74440265, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.82101882, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09405518, + "step": 16059, + "time_per_iteration": 2.562965154647827 + }, + { + "auxiliary_loss_clip": 0.0640713, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06271156, + "balance_loss_mlp": 0.01256746, + "epoch": 0.9655794378475876, + "flos": 41979741333120.0, + "grad_norm": 1.699778030433775, + "language_loss": 0.73402834, + "learning_rate": 1.239402791721722e-08, + "loss": 0.8107639, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09680176, + "step": 16060, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.06393793, + "auxiliary_loss_mlp": 0.01261439, + "balance_loss_clip": 0.06268264, + "balance_loss_mlp": 0.01252915, + "epoch": 0.9656395611002555, + "flos": 27716860316160.0, + "grad_norm": 1.5503214965387115, + "language_loss": 0.7667194, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.84327173, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08526611, + "step": 16061, + "time_per_iteration": 2.5771090984344482 + }, + { + "auxiliary_loss_clip": 0.0630983, + "auxiliary_loss_mlp": 0.01248501, + "balance_loss_clip": 0.06255753, + "balance_loss_mlp": 0.01247496, + "epoch": 0.9656996843529235, + "flos": 68987949742080.0, + "grad_norm": 0.780210844217019, + "language_loss": 0.64234674, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.71793002, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01005554, + "step": 16062, + "time_per_iteration": 3.2013790607452393 + }, + { + "auxiliary_loss_clip": 0.06394887, + "auxiliary_loss_mlp": 0.01266088, + "balance_loss_clip": 0.06270029, + "balance_loss_mlp": 0.01257683, + "epoch": 0.9657598076055914, + "flos": 20637599137920.0, + "grad_norm": 2.251945173497628, + "language_loss": 0.934484, + "learning_rate": 1.226449424760867e-08, + "loss": 1.01109374, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08413696, + "step": 16063, + "time_per_iteration": 2.5948007106781006 + }, + { + "auxiliary_loss_clip": 0.06403172, + "auxiliary_loss_mlp": 0.01266989, + "balance_loss_clip": 0.06272347, + "balance_loss_mlp": 0.01257953, + "epoch": 0.9658199308582595, + "flos": 20454765528960.0, + "grad_norm": 1.8938965740794855, + "language_loss": 0.81982899, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.89653063, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09039307, + "step": 16064, + "time_per_iteration": 3.946958303451538 + }, + { + "auxiliary_loss_clip": 0.06401001, + "auxiliary_loss_mlp": 0.01261606, + "balance_loss_clip": 0.0627339, + "balance_loss_mlp": 0.01252772, + "epoch": 0.9658800541109274, + "flos": 24725038757760.0, + "grad_norm": 1.5779341158882096, + "language_loss": 0.84311408, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.91974014, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08837891, + "step": 16065, + "time_per_iteration": 2.647083282470703 + }, + { + "auxiliary_loss_clip": 0.06402124, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06270837, + "balance_loss_mlp": 0.01255623, + "epoch": 0.9659401773635954, + "flos": 21615399959040.0, + "grad_norm": 2.432738378484276, + "language_loss": 0.67548525, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.75215131, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08850098, + "step": 16066, + "time_per_iteration": 2.568356990814209 + }, + { + "auxiliary_loss_clip": 0.06398377, + "auxiliary_loss_mlp": 0.0126593, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01256897, + "epoch": 0.9660003006162633, + "flos": 20307123434880.0, + "grad_norm": 1.7910369908094568, + "language_loss": 0.82607502, + "learning_rate": 1.209283794752558e-08, + "loss": 0.90271813, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09036255, + "step": 16067, + "time_per_iteration": 2.5698952674865723 + }, + { + "auxiliary_loss_clip": 0.06401904, + "auxiliary_loss_mlp": 0.01264106, + "balance_loss_clip": 0.06271727, + "balance_loss_mlp": 0.01254325, + "epoch": 0.9660604238689313, + "flos": 24468803372160.0, + "grad_norm": 1.671137077977421, + "language_loss": 0.69428784, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.77094793, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09783936, + "step": 16068, + "time_per_iteration": 4.0155861377716064 + }, + { + "auxiliary_loss_clip": 0.0639426, + "auxiliary_loss_mlp": 0.01267688, + "balance_loss_clip": 0.06271375, + "balance_loss_mlp": 0.01259755, + "epoch": 0.9661205471215992, + "flos": 19869983084160.0, + "grad_norm": 1.7705169776652172, + "language_loss": 0.68107969, + "learning_rate": 1.20074620808146e-08, + "loss": 0.75769919, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.07928467, + "step": 16069, + "time_per_iteration": 2.496572256088257 + }, + { + "auxiliary_loss_clip": 0.06400932, + "auxiliary_loss_mlp": 0.01262822, + "balance_loss_clip": 0.06271296, + "balance_loss_mlp": 0.01253595, + "epoch": 0.9661806703742672, + "flos": 20564071580160.0, + "grad_norm": 1.710702523196639, + "language_loss": 0.89453393, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.9711715, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09228516, + "step": 16070, + "time_per_iteration": 2.58213472366333 + }, + { + "auxiliary_loss_clip": 0.06401291, + "auxiliary_loss_mlp": 0.01266178, + "balance_loss_clip": 0.06269821, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9662407936269353, + "flos": 21436842908160.0, + "grad_norm": 2.0425263157777604, + "language_loss": 0.77503681, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.85171151, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10290527, + "step": 16071, + "time_per_iteration": 2.5824472904205322 + }, + { + "auxiliary_loss_clip": 0.06395756, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06267762, + "balance_loss_mlp": 0.01253059, + "epoch": 0.9663009168796032, + "flos": 14908178908800.0, + "grad_norm": 1.734770632308268, + "language_loss": 0.66013038, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.73671985, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.10113525, + "step": 16072, + "time_per_iteration": 2.5067734718322754 + }, + { + "auxiliary_loss_clip": 0.06405023, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 0.06272658, + "balance_loss_mlp": 0.01253936, + "epoch": 0.9663610401322712, + "flos": 24316842792960.0, + "grad_norm": 1.5748682596234602, + "language_loss": 0.78113818, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.85781991, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09216309, + "step": 16073, + "time_per_iteration": 2.608790636062622 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01262702, + "balance_loss_clip": 0.06271721, + "balance_loss_mlp": 0.01252706, + "epoch": 0.9664211633849391, + "flos": 17643345811200.0, + "grad_norm": 1.9546716126874173, + "language_loss": 0.75967658, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.83637702, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09991455, + "step": 16074, + "time_per_iteration": 2.613964557647705 + }, + { + "auxiliary_loss_clip": 0.06400394, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01255223, + "epoch": 0.9664812866376071, + "flos": 29797239087360.0, + "grad_norm": 1.3986555912156662, + "language_loss": 0.75712979, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.83378559, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09960938, + "step": 16075, + "time_per_iteration": 2.7057979106903076 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06269626, + "balance_loss_mlp": 0.01256255, + "epoch": 0.966541409890275, + "flos": 14287450262400.0, + "grad_norm": 1.7774078486578757, + "language_loss": 0.78800076, + "learning_rate": 1.171102125547696e-08, + "loss": 0.86468703, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09204102, + "step": 16076, + "time_per_iteration": 2.5402417182922363 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01265198, + "balance_loss_clip": 0.06270036, + "balance_loss_mlp": 0.01255322, + "epoch": 0.9666015331429431, + "flos": 19865790380160.0, + "grad_norm": 1.5934368657490992, + "language_loss": 0.72737241, + "learning_rate": 1.166897413780532e-08, + "loss": 0.80403632, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09875488, + "step": 16077, + "time_per_iteration": 2.535360097885132 + }, + { + "auxiliary_loss_clip": 0.06399302, + "auxiliary_loss_mlp": 0.01263469, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01254129, + "epoch": 0.966661656395611, + "flos": 27133335682560.0, + "grad_norm": 1.8761219493118404, + "language_loss": 0.59630072, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.67292845, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09344482, + "step": 16078, + "time_per_iteration": 2.561671018600464 + }, + { + "auxiliary_loss_clip": 0.06405197, + "auxiliary_loss_mlp": 0.01262495, + "balance_loss_clip": 0.06270532, + "balance_loss_mlp": 0.01252428, + "epoch": 0.966721779648279, + "flos": 21514856659200.0, + "grad_norm": 1.7688121157900791, + "language_loss": 0.72058022, + "learning_rate": 1.158510609718899e-08, + "loss": 0.79725718, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10064697, + "step": 16079, + "time_per_iteration": 2.529829263687134 + }, + { + "auxiliary_loss_clip": 0.06397161, + "auxiliary_loss_mlp": 0.01264122, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01255199, + "epoch": 0.9667819029009469, + "flos": 23884859468160.0, + "grad_norm": 1.5564630804369735, + "language_loss": 0.72879219, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.80540496, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08917236, + "step": 16080, + "time_per_iteration": 2.5244600772857666 + }, + { + "auxiliary_loss_clip": 0.06398826, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06269746, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9668420261536149, + "flos": 21513682702080.0, + "grad_norm": 1.8020849522821436, + "language_loss": 0.74110532, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.81773674, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08990479, + "step": 16081, + "time_per_iteration": 2.4842236042022705 + }, + { + "auxiliary_loss_clip": 0.06397434, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.062673, + "balance_loss_mlp": 0.01255578, + "epoch": 0.9669021494062828, + "flos": 26694434396160.0, + "grad_norm": 1.538464840175787, + "language_loss": 0.67664808, + "learning_rate": 1.145986954691236e-08, + "loss": 0.75326991, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09173584, + "step": 16082, + "time_per_iteration": 2.5313684940338135 + }, + { + "auxiliary_loss_clip": 0.06400424, + "auxiliary_loss_mlp": 0.01264878, + "balance_loss_clip": 0.06270989, + "balance_loss_mlp": 0.01255556, + "epoch": 0.9669622726589508, + "flos": 29832724091520.0, + "grad_norm": 1.4347274539872106, + "language_loss": 0.7732228, + "learning_rate": 1.141827483932789e-08, + "loss": 0.84987581, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09320068, + "step": 16083, + "time_per_iteration": 2.6201815605163574 + }, + { + "auxiliary_loss_clip": 0.06402251, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06270413, + "balance_loss_mlp": 0.01255546, + "epoch": 0.9670223959116189, + "flos": 22927911114240.0, + "grad_norm": 1.9457609743548718, + "language_loss": 0.79789531, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.87456757, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09436035, + "step": 16084, + "time_per_iteration": 2.51789927482605 + }, + { + "auxiliary_loss_clip": 0.06404713, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06271202, + "balance_loss_mlp": 0.01252964, + "epoch": 0.9670825191642868, + "flos": 18630412508160.0, + "grad_norm": 2.4606761386831133, + "language_loss": 0.68396688, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.76064587, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10223389, + "step": 16085, + "time_per_iteration": 2.474874258041382 + }, + { + "auxiliary_loss_clip": 0.06406981, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.0125419, + "epoch": 0.9671426424169548, + "flos": 24504707646720.0, + "grad_norm": 2.075044751177439, + "language_loss": 0.68617862, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.76289505, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10473633, + "step": 16086, + "time_per_iteration": 2.534994125366211 + }, + { + "auxiliary_loss_clip": 0.06402737, + "auxiliary_loss_mlp": 0.01265826, + "balance_loss_clip": 0.06271712, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9672027656696227, + "flos": 20376625996800.0, + "grad_norm": 1.3946644640700947, + "language_loss": 0.7882064, + "learning_rate": 1.125265009690235e-08, + "loss": 0.86489207, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09942627, + "step": 16087, + "time_per_iteration": 2.4735782146453857 + }, + { + "auxiliary_loss_clip": 0.06399234, + "auxiliary_loss_mlp": 0.01261819, + "balance_loss_clip": 0.06268933, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9672628889222907, + "flos": 18886186696320.0, + "grad_norm": 1.8117496525637224, + "language_loss": 0.71433723, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.79094768, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08990479, + "step": 16088, + "time_per_iteration": 3.8949713706970215 + }, + { + "auxiliary_loss_clip": 0.06397194, + "auxiliary_loss_mlp": 0.01262715, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01253715, + "epoch": 0.9673230121749586, + "flos": 28702962691200.0, + "grad_norm": 1.577967984656714, + "language_loss": 0.70956695, + "learning_rate": 1.117029020040916e-08, + "loss": 0.78616601, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09002686, + "step": 16089, + "time_per_iteration": 2.5853075981140137 + }, + { + "auxiliary_loss_clip": 0.06403333, + "auxiliary_loss_mlp": 0.01264796, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01255235, + "epoch": 0.9673831354276267, + "flos": 20490544022400.0, + "grad_norm": 1.9844262982420549, + "language_loss": 0.75145471, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.82813597, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09558105, + "step": 16090, + "time_per_iteration": 2.469186544418335 + }, + { + "auxiliary_loss_clip": 0.06409271, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06270892, + "balance_loss_mlp": 0.01253899, + "epoch": 0.9674432586802946, + "flos": 26804872477440.0, + "grad_norm": 1.5950063142097652, + "language_loss": 0.68768305, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.76441574, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10101318, + "step": 16091, + "time_per_iteration": 2.563216209411621 + }, + { + "auxiliary_loss_clip": 0.06398912, + "auxiliary_loss_mlp": 0.01263687, + "balance_loss_clip": 0.06270887, + "balance_loss_mlp": 0.012539, + "epoch": 0.9675033819329626, + "flos": 22317706154880.0, + "grad_norm": 1.8144338488923422, + "language_loss": 0.77032447, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.84695041, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09783936, + "step": 16092, + "time_per_iteration": 2.528087854385376 + }, + { + "auxiliary_loss_clip": 0.06399173, + "auxiliary_loss_mlp": 0.01263601, + "balance_loss_clip": 0.06269817, + "balance_loss_mlp": 0.01255101, + "epoch": 0.9675635051856305, + "flos": 12680367678720.0, + "grad_norm": 1.8169609266887585, + "language_loss": 0.7681576, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.84478533, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08496094, + "step": 16093, + "time_per_iteration": 2.4785947799682617 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.0126625, + "balance_loss_clip": 0.06271917, + "balance_loss_mlp": 0.01256416, + "epoch": 0.9676236284382985, + "flos": 24615439217280.0, + "grad_norm": 1.550034543506878, + "language_loss": 0.69245452, + "learning_rate": 1.096571027726112e-08, + "loss": 0.76915407, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09832764, + "step": 16094, + "time_per_iteration": 2.531022310256958 + }, + { + "auxiliary_loss_clip": 0.06406316, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01257136, + "epoch": 0.9676837516909664, + "flos": 23373772289280.0, + "grad_norm": 2.0284619015774745, + "language_loss": 0.75801766, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.83474445, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09222412, + "step": 16095, + "time_per_iteration": 3.973430633544922 + }, + { + "auxiliary_loss_clip": 0.06406826, + "auxiliary_loss_mlp": 0.01263981, + "balance_loss_clip": 0.06270942, + "balance_loss_mlp": 0.01254546, + "epoch": 0.9677438749436345, + "flos": 20493395061120.0, + "grad_norm": 2.546128984208035, + "language_loss": 0.70797509, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.78468317, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09436035, + "step": 16096, + "time_per_iteration": 2.556928873062134 + }, + { + "auxiliary_loss_clip": 0.0640536, + "auxiliary_loss_mlp": 0.01263747, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01254085, + "epoch": 0.9678039981963025, + "flos": 47566341077760.0, + "grad_norm": 1.6741629416522243, + "language_loss": 0.71720374, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.79389483, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09661865, + "step": 16097, + "time_per_iteration": 2.7817232608795166 + }, + { + "auxiliary_loss_clip": 0.06399585, + "auxiliary_loss_mlp": 0.01266789, + "balance_loss_clip": 0.06271115, + "balance_loss_mlp": 0.01257449, + "epoch": 0.9678641214489704, + "flos": 25046542074240.0, + "grad_norm": 1.8782821270100718, + "language_loss": 0.78498095, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.86164474, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09338379, + "step": 16098, + "time_per_iteration": 2.5505471229553223 + }, + { + "auxiliary_loss_clip": 0.06397729, + "auxiliary_loss_mlp": 0.01262535, + "balance_loss_clip": 0.06268919, + "balance_loss_mlp": 0.01253708, + "epoch": 0.9679242447016384, + "flos": 19246319544960.0, + "grad_norm": 1.724025286564301, + "language_loss": 0.90831089, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.98491359, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08831787, + "step": 16099, + "time_per_iteration": 2.484697103500366 + }, + { + "auxiliary_loss_clip": 0.0640512, + "auxiliary_loss_mlp": 0.01264422, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01254641, + "epoch": 0.9679843679543063, + "flos": 33262943811840.0, + "grad_norm": 1.8533709433525063, + "language_loss": 0.66165268, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.73834813, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09777832, + "step": 16100, + "time_per_iteration": 2.6391396522521973 + }, + { + "auxiliary_loss_clip": 0.06401994, + "auxiliary_loss_mlp": 0.01264329, + "balance_loss_clip": 0.06270385, + "balance_loss_mlp": 0.01254805, + "epoch": 0.9680444912069743, + "flos": 22790205728640.0, + "grad_norm": 1.4418574001305366, + "language_loss": 0.73443776, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.81110096, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09533691, + "step": 16101, + "time_per_iteration": 2.4959254264831543 + }, + { + "auxiliary_loss_clip": 0.06401779, + "auxiliary_loss_mlp": 0.01263958, + "balance_loss_clip": 0.06270876, + "balance_loss_mlp": 0.012546, + "epoch": 0.9681046144596422, + "flos": 24030866407680.0, + "grad_norm": 1.476677590253325, + "language_loss": 0.73699975, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.8136571, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09356689, + "step": 16102, + "time_per_iteration": 2.527163505554199 + }, + { + "auxiliary_loss_clip": 0.06401537, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06270529, + "balance_loss_mlp": 0.01257899, + "epoch": 0.9681647377123103, + "flos": 23447802971520.0, + "grad_norm": 1.896455412966277, + "language_loss": 0.77483177, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.85152906, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10296631, + "step": 16103, + "time_per_iteration": 3.9508111476898193 + }, + { + "auxiliary_loss_clip": 0.06402817, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06272209, + "balance_loss_mlp": 0.01254427, + "epoch": 0.9682248609649782, + "flos": 22681780145280.0, + "grad_norm": 1.7473063951215217, + "language_loss": 0.80425286, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.88091195, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08654785, + "step": 16104, + "time_per_iteration": 2.5212666988372803 + }, + { + "auxiliary_loss_clip": 0.06398646, + "auxiliary_loss_mlp": 0.01262819, + "balance_loss_clip": 0.06269979, + "balance_loss_mlp": 0.01254284, + "epoch": 0.9682849842176462, + "flos": 24435750136320.0, + "grad_norm": 1.553672505568153, + "language_loss": 0.77860147, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.85521615, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08532715, + "step": 16105, + "time_per_iteration": 2.526402711868286 + }, + { + "auxiliary_loss_clip": 0.06307848, + "auxiliary_loss_mlp": 0.01250922, + "balance_loss_clip": 0.06253837, + "balance_loss_mlp": 0.01249911, + "epoch": 0.9683451074703141, + "flos": 60013365534720.0, + "grad_norm": 0.8157679586212945, + "language_loss": 0.56714195, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.64272964, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01011658, + "step": 16106, + "time_per_iteration": 3.1907763481140137 + }, + { + "auxiliary_loss_clip": 0.06308085, + "auxiliary_loss_mlp": 0.01250817, + "balance_loss_clip": 0.06253918, + "balance_loss_mlp": 0.01249806, + "epoch": 0.9684052307229821, + "flos": 52712850850560.0, + "grad_norm": 0.8588329806048718, + "language_loss": 0.61471093, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.69029999, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01010895, + "step": 16107, + "time_per_iteration": 3.0176451206207275 + }, + { + "auxiliary_loss_clip": 0.06401956, + "auxiliary_loss_mlp": 0.0126273, + "balance_loss_clip": 0.06270234, + "balance_loss_mlp": 0.01252859, + "epoch": 0.96846535397565, + "flos": 22790457290880.0, + "grad_norm": 2.0284139673557635, + "language_loss": 0.74127901, + "learning_rate": 1.040291854638875e-08, + "loss": 0.81792581, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09875488, + "step": 16108, + "time_per_iteration": 3.937136650085449 + }, + { + "auxiliary_loss_clip": 0.06403004, + "auxiliary_loss_mlp": 0.01261473, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01252359, + "epoch": 0.968525477228318, + "flos": 23329482606720.0, + "grad_norm": 2.048945101246752, + "language_loss": 0.57015377, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.64679849, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09112549, + "step": 16109, + "time_per_iteration": 2.5268101692199707 + }, + { + "auxiliary_loss_clip": 0.06305698, + "auxiliary_loss_mlp": 0.01249198, + "balance_loss_clip": 0.06251822, + "balance_loss_mlp": 0.01248142, + "epoch": 0.9685856004809861, + "flos": 67903651981440.0, + "grad_norm": 0.6567864126752433, + "language_loss": 0.54225814, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.61780703, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01056671, + "step": 16110, + "time_per_iteration": 3.203383207321167 + }, + { + "auxiliary_loss_clip": 0.0640424, + "auxiliary_loss_mlp": 0.01263261, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01253194, + "epoch": 0.968645723733654, + "flos": 33956277621120.0, + "grad_norm": 1.395578578385916, + "language_loss": 0.62541378, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.70208883, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10058594, + "step": 16111, + "time_per_iteration": 2.640765428543091 + }, + { + "auxiliary_loss_clip": 0.06400396, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06270713, + "balance_loss_mlp": 0.0125463, + "epoch": 0.968705846986322, + "flos": 18557388074880.0, + "grad_norm": 1.8951473791498206, + "language_loss": 0.74788642, + "learning_rate": 1.024483677309118e-08, + "loss": 0.82452452, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08776855, + "step": 16112, + "time_per_iteration": 2.5014288425445557 + }, + { + "auxiliary_loss_clip": 0.06395774, + "auxiliary_loss_mlp": 0.01264106, + "balance_loss_clip": 0.06268512, + "balance_loss_mlp": 0.01255517, + "epoch": 0.9687659702389899, + "flos": 17426704279680.0, + "grad_norm": 2.0013501762386072, + "language_loss": 0.67307127, + "learning_rate": 1.020550495531558e-08, + "loss": 0.74967003, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0859375, + "step": 16113, + "time_per_iteration": 2.479163646697998 + }, + { + "auxiliary_loss_clip": 0.06308687, + "auxiliary_loss_mlp": 0.01250527, + "balance_loss_clip": 0.06254673, + "balance_loss_mlp": 0.01249524, + "epoch": 0.9688260934916579, + "flos": 62067231688320.0, + "grad_norm": 0.785383139879687, + "language_loss": 0.56577516, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.64136732, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01003265, + "step": 16114, + "time_per_iteration": 3.1394646167755127 + }, + { + "auxiliary_loss_clip": 0.06402376, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06271395, + "balance_loss_mlp": 0.01258618, + "epoch": 0.9688862167443258, + "flos": 15080363049600.0, + "grad_norm": 1.8572842989291634, + "language_loss": 0.82534641, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.90205157, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09521484, + "step": 16115, + "time_per_iteration": 2.4412038326263428 + }, + { + "auxiliary_loss_clip": 0.06396943, + "auxiliary_loss_mlp": 0.01261817, + "balance_loss_clip": 0.06272493, + "balance_loss_mlp": 0.01253377, + "epoch": 0.9689463399969939, + "flos": 19944391109760.0, + "grad_norm": 1.4226958516999226, + "language_loss": 0.72081476, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.79740238, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08435059, + "step": 16116, + "time_per_iteration": 2.5535671710968018 + }, + { + "auxiliary_loss_clip": 0.06405754, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06272267, + "balance_loss_mlp": 0.01251587, + "epoch": 0.9690064632496618, + "flos": 19579101235200.0, + "grad_norm": 2.2415643926520614, + "language_loss": 0.75798059, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.83465052, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09661865, + "step": 16117, + "time_per_iteration": 2.483868360519409 + }, + { + "auxiliary_loss_clip": 0.06406679, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01254004, + "epoch": 0.9690665865023298, + "flos": 21878846795520.0, + "grad_norm": 2.007341004668209, + "language_loss": 0.77854973, + "learning_rate": 1.000997769426548e-08, + "loss": 0.8552506, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09399414, + "step": 16118, + "time_per_iteration": 2.5185434818267822 + }, + { + "auxiliary_loss_clip": 0.06402394, + "auxiliary_loss_mlp": 0.01264527, + "balance_loss_clip": 0.06269039, + "balance_loss_mlp": 0.01254758, + "epoch": 0.9691267097549977, + "flos": 21000541098240.0, + "grad_norm": 1.7099772377431646, + "language_loss": 0.78459924, + "learning_rate": 9.971098618001272e-09, + "loss": 0.86126846, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09771729, + "step": 16119, + "time_per_iteration": 2.612290859222412 + }, + { + "auxiliary_loss_clip": 0.06396645, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01254609, + "epoch": 0.9691868330076657, + "flos": 24285885909120.0, + "grad_norm": 1.3978893166659911, + "language_loss": 0.75944752, + "learning_rate": 9.932295003832747e-09, + "loss": 0.83604646, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08636475, + "step": 16120, + "time_per_iteration": 2.5401206016540527 + }, + { + "auxiliary_loss_clip": 0.0640015, + "auxiliary_loss_mlp": 0.01262274, + "balance_loss_clip": 0.06269264, + "balance_loss_mlp": 0.01252923, + "epoch": 0.9692469562603336, + "flos": 17681430291840.0, + "grad_norm": 1.946020897677594, + "language_loss": 0.69889534, + "learning_rate": 9.89356685323095e-09, + "loss": 0.77551961, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09344482, + "step": 16121, + "time_per_iteration": 2.4932589530944824 + }, + { + "auxiliary_loss_clip": 0.06398712, + "auxiliary_loss_mlp": 0.01261825, + "balance_loss_clip": 0.06269211, + "balance_loss_mlp": 0.01252211, + "epoch": 0.9693070795130017, + "flos": 26841783000960.0, + "grad_norm": 2.6697458666208007, + "language_loss": 0.6931926, + "learning_rate": 9.854914167664486e-09, + "loss": 0.76979792, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09613037, + "step": 16122, + "time_per_iteration": 2.5934178829193115 + }, + { + "auxiliary_loss_clip": 0.0640236, + "auxiliary_loss_mlp": 0.01261205, + "balance_loss_clip": 0.06270461, + "balance_loss_mlp": 0.01252849, + "epoch": 0.9693672027656697, + "flos": 18083127565440.0, + "grad_norm": 1.887635490879254, + "language_loss": 0.75718206, + "learning_rate": 9.81633694859907e-09, + "loss": 0.83381778, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08355713, + "step": 16123, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.0640337, + "auxiliary_loss_mlp": 0.01262305, + "balance_loss_clip": 0.06270259, + "balance_loss_mlp": 0.01252536, + "epoch": 0.9694273260183376, + "flos": 21769582671360.0, + "grad_norm": 1.3729033080387363, + "language_loss": 0.74643373, + "learning_rate": 9.777835197497753e-09, + "loss": 0.82309043, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09771729, + "step": 16124, + "time_per_iteration": 2.551767349243164 + }, + { + "auxiliary_loss_clip": 0.06402075, + "auxiliary_loss_mlp": 0.01262872, + "balance_loss_clip": 0.06270434, + "balance_loss_mlp": 0.01253716, + "epoch": 0.9694874492710056, + "flos": 24433066805760.0, + "grad_norm": 1.95841723109516, + "language_loss": 0.74200714, + "learning_rate": 9.739408915820258e-09, + "loss": 0.81865656, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09155273, + "step": 16125, + "time_per_iteration": 2.55316424369812 + }, + { + "auxiliary_loss_clip": 0.06305213, + "auxiliary_loss_mlp": 0.0125144, + "balance_loss_clip": 0.06251328, + "balance_loss_mlp": 0.01250412, + "epoch": 0.9695475725236735, + "flos": 67669191457920.0, + "grad_norm": 0.8771800111615311, + "language_loss": 0.61598706, + "learning_rate": 9.70105810502364e-09, + "loss": 0.69155359, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01027679, + "step": 16126, + "time_per_iteration": 3.0755326747894287 + }, + { + "auxiliary_loss_clip": 0.06398349, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01255293, + "epoch": 0.9696076957763415, + "flos": 19134330163200.0, + "grad_norm": 1.5311536279147961, + "language_loss": 0.75146884, + "learning_rate": 9.662782766562738e-09, + "loss": 0.82809746, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09222412, + "step": 16127, + "time_per_iteration": 3.9446663856506348 + }, + { + "auxiliary_loss_clip": 0.06405523, + "auxiliary_loss_mlp": 0.01262243, + "balance_loss_clip": 0.06270227, + "balance_loss_mlp": 0.01252146, + "epoch": 0.9696678190290094, + "flos": 15492248593920.0, + "grad_norm": 1.533562341751804, + "language_loss": 0.69545048, + "learning_rate": 9.62458290188839e-09, + "loss": 0.77212816, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10101318, + "step": 16128, + "time_per_iteration": 2.4981887340545654 + }, + { + "auxiliary_loss_clip": 0.06400339, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06270851, + "balance_loss_mlp": 0.01256761, + "epoch": 0.9697279422816775, + "flos": 36217225941120.0, + "grad_norm": 1.5209597540885744, + "language_loss": 0.65483963, + "learning_rate": 9.586458512449213e-09, + "loss": 0.73150343, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09277344, + "step": 16129, + "time_per_iteration": 2.6251938343048096 + }, + { + "auxiliary_loss_clip": 0.06407736, + "auxiliary_loss_mlp": 0.0126171, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.01252024, + "epoch": 0.9697880655343454, + "flos": 25491103511040.0, + "grad_norm": 1.8080137782892927, + "language_loss": 0.63748336, + "learning_rate": 9.548409599691166e-09, + "loss": 0.71417773, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09692383, + "step": 16130, + "time_per_iteration": 2.534078359603882 + }, + { + "auxiliary_loss_clip": 0.06406684, + "auxiliary_loss_mlp": 0.01266248, + "balance_loss_clip": 0.06270098, + "balance_loss_mlp": 0.01256336, + "epoch": 0.9698481887870134, + "flos": 15337688538240.0, + "grad_norm": 2.3021960280258718, + "language_loss": 0.70279443, + "learning_rate": 9.510436165056867e-09, + "loss": 0.77952373, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09912109, + "step": 16131, + "time_per_iteration": 2.4530463218688965 + }, + { + "auxiliary_loss_clip": 0.06404746, + "auxiliary_loss_mlp": 0.01267276, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01257173, + "epoch": 0.9699083120396813, + "flos": 21988907533440.0, + "grad_norm": 2.025844934607916, + "language_loss": 0.76757103, + "learning_rate": 9.472538209986058e-09, + "loss": 0.84429133, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10107422, + "step": 16132, + "time_per_iteration": 2.5047919750213623 + }, + { + "auxiliary_loss_clip": 0.0640052, + "auxiliary_loss_mlp": 0.01265217, + "balance_loss_clip": 0.06269385, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9699684352923493, + "flos": 15668625438720.0, + "grad_norm": 2.7063973551454263, + "language_loss": 0.79410255, + "learning_rate": 9.434715735916477e-09, + "loss": 0.8707599, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10168457, + "step": 16133, + "time_per_iteration": 2.4512226581573486 + }, + { + "auxiliary_loss_clip": 0.06397133, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06269794, + "balance_loss_mlp": 0.01258476, + "epoch": 0.9700285585450172, + "flos": 21914876851200.0, + "grad_norm": 1.5611198022203323, + "language_loss": 0.64911574, + "learning_rate": 9.396968744281863e-09, + "loss": 0.72575808, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08618164, + "step": 16134, + "time_per_iteration": 2.500866413116455 + }, + { + "auxiliary_loss_clip": 0.06402618, + "auxiliary_loss_mlp": 0.01262072, + "balance_loss_clip": 0.0627054, + "balance_loss_mlp": 0.01252786, + "epoch": 0.9700886817976853, + "flos": 23921686137600.0, + "grad_norm": 1.8077102580122415, + "language_loss": 0.80706894, + "learning_rate": 9.359297236513519e-09, + "loss": 0.88371587, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09289551, + "step": 16135, + "time_per_iteration": 3.9117238521575928 + }, + { + "auxiliary_loss_clip": 0.06405312, + "auxiliary_loss_mlp": 0.01263739, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01253267, + "epoch": 0.9701488050503532, + "flos": 25454989601280.0, + "grad_norm": 1.6284393285017646, + "language_loss": 0.73501408, + "learning_rate": 9.321701214040079e-09, + "loss": 0.81170464, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10473633, + "step": 16136, + "time_per_iteration": 2.5779073238372803 + }, + { + "auxiliary_loss_clip": 0.06398432, + "auxiliary_loss_mlp": 0.01267079, + "balance_loss_clip": 0.06269141, + "balance_loss_mlp": 0.01257644, + "epoch": 0.9702089283030212, + "flos": 20596453983360.0, + "grad_norm": 1.492877171392222, + "language_loss": 0.76563627, + "learning_rate": 9.28418067828729e-09, + "loss": 0.84229136, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09442139, + "step": 16137, + "time_per_iteration": 2.538085460662842 + }, + { + "auxiliary_loss_clip": 0.06306077, + "auxiliary_loss_mlp": 0.01249847, + "balance_loss_clip": 0.06252094, + "balance_loss_mlp": 0.01248849, + "epoch": 0.9702690515556892, + "flos": 70671955973760.0, + "grad_norm": 0.821661417803752, + "language_loss": 0.5493418, + "learning_rate": 9.246735630678015e-09, + "loss": 0.62490106, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00997925, + "step": 16138, + "time_per_iteration": 3.2206809520721436 + }, + { + "auxiliary_loss_clip": 0.06400603, + "auxiliary_loss_mlp": 0.01266479, + "balance_loss_clip": 0.06268343, + "balance_loss_mlp": 0.01257031, + "epoch": 0.9703291748083571, + "flos": 35890104401280.0, + "grad_norm": 1.6919399068394998, + "language_loss": 0.70817888, + "learning_rate": 9.209366072632007e-09, + "loss": 0.78484976, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09448242, + "step": 16139, + "time_per_iteration": 2.685359239578247 + }, + { + "auxiliary_loss_clip": 0.06405871, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06271709, + "balance_loss_mlp": 0.01255846, + "epoch": 0.9703892980610251, + "flos": 24323383411200.0, + "grad_norm": 1.4852004067198157, + "language_loss": 0.72197908, + "learning_rate": 9.172072005566134e-09, + "loss": 0.79869711, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10089111, + "step": 16140, + "time_per_iteration": 2.558115005493164 + }, + { + "auxiliary_loss_clip": 0.06405499, + "auxiliary_loss_mlp": 0.01266107, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01256433, + "epoch": 0.970449421313693, + "flos": 18009474226560.0, + "grad_norm": 3.19764117051917, + "language_loss": 0.69224846, + "learning_rate": 9.13485343089504e-09, + "loss": 0.76896447, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09674072, + "step": 16141, + "time_per_iteration": 2.482884168624878 + }, + { + "auxiliary_loss_clip": 0.06398399, + "auxiliary_loss_mlp": 0.01262865, + "balance_loss_clip": 0.06271194, + "balance_loss_mlp": 0.01253692, + "epoch": 0.9705095445663611, + "flos": 25345054644480.0, + "grad_norm": 1.7252528313404465, + "language_loss": 0.68293542, + "learning_rate": 9.097710350029597e-09, + "loss": 0.75954807, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0916748, + "step": 16142, + "time_per_iteration": 4.058878183364868 + }, + { + "auxiliary_loss_clip": 0.06401761, + "auxiliary_loss_mlp": 0.01262507, + "balance_loss_clip": 0.0626963, + "balance_loss_mlp": 0.01253132, + "epoch": 0.970569667819029, + "flos": 26840860606080.0, + "grad_norm": 1.8571958847472876, + "language_loss": 0.55470061, + "learning_rate": 9.060642764378457e-09, + "loss": 0.63134331, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09375, + "step": 16143, + "time_per_iteration": 2.5692148208618164 + }, + { + "auxiliary_loss_clip": 0.06405912, + "auxiliary_loss_mlp": 0.01267612, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01258742, + "epoch": 0.970629791071697, + "flos": 25855764480000.0, + "grad_norm": 2.158347081633599, + "language_loss": 0.67963922, + "learning_rate": 9.023650675347382e-09, + "loss": 0.75637448, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.08874512, + "step": 16144, + "time_per_iteration": 2.5477588176727295 + }, + { + "auxiliary_loss_clip": 0.06398851, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.0125611, + "epoch": 0.9706899143243649, + "flos": 36549294871680.0, + "grad_norm": 1.7214087229077903, + "language_loss": 0.72277164, + "learning_rate": 8.986734084339253e-09, + "loss": 0.79941171, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.0904541, + "step": 16145, + "time_per_iteration": 2.6755943298339844 + }, + { + "auxiliary_loss_clip": 0.0640352, + "auxiliary_loss_mlp": 0.01263869, + "balance_loss_clip": 0.06269689, + "balance_loss_mlp": 0.01253522, + "epoch": 0.9707500375770329, + "flos": 12271794370560.0, + "grad_norm": 2.6855467217537488, + "language_loss": 0.80483818, + "learning_rate": 8.949892992753395e-09, + "loss": 0.88151205, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10351562, + "step": 16146, + "time_per_iteration": 2.4482696056365967 + }, + { + "auxiliary_loss_clip": 0.063061, + "auxiliary_loss_mlp": 0.01250418, + "balance_loss_clip": 0.06252153, + "balance_loss_mlp": 0.01249364, + "epoch": 0.9708101608297008, + "flos": 60874550271360.0, + "grad_norm": 0.8926605376395859, + "language_loss": 0.546646, + "learning_rate": 8.91312740198713e-09, + "loss": 0.62221122, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01055145, + "step": 16147, + "time_per_iteration": 3.1784896850585938 + }, + { + "auxiliary_loss_clip": 0.06404494, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01255663, + "epoch": 0.9708702840823689, + "flos": 27131952090240.0, + "grad_norm": 4.377042255553633, + "language_loss": 0.61389154, + "learning_rate": 8.876437313434682e-09, + "loss": 0.69059098, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09783936, + "step": 16148, + "time_per_iteration": 3.9833836555480957 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01263798, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01254553, + "epoch": 0.9709304073350368, + "flos": 20784067274880.0, + "grad_norm": 1.8003493724827047, + "language_loss": 0.73550653, + "learning_rate": 8.839822728487155e-09, + "loss": 0.81213915, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09246826, + "step": 16149, + "time_per_iteration": 2.518012046813965 + }, + { + "auxiliary_loss_clip": 0.06402236, + "auxiliary_loss_mlp": 0.01263658, + "balance_loss_clip": 0.06271193, + "balance_loss_mlp": 0.01254408, + "epoch": 0.9709905305877048, + "flos": 41943627423360.0, + "grad_norm": 1.959430214101398, + "language_loss": 0.75053811, + "learning_rate": 8.803283648533222e-09, + "loss": 0.82719702, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09246826, + "step": 16150, + "time_per_iteration": 2.6981914043426514 + }, + { + "auxiliary_loss_clip": 0.0641038, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01257408, + "epoch": 0.9710506538403728, + "flos": 17171349361920.0, + "grad_norm": 2.6505663185230803, + "language_loss": 0.73947191, + "learning_rate": 8.766820074958214e-09, + "loss": 0.81626534, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11547852, + "step": 16151, + "time_per_iteration": 2.4698150157928467 + }, + { + "auxiliary_loss_clip": 0.0639576, + "auxiliary_loss_mlp": 0.01262487, + "balance_loss_clip": 0.06268339, + "balance_loss_mlp": 0.01253153, + "epoch": 0.9711107770930407, + "flos": 21178972368000.0, + "grad_norm": 1.7020963339660558, + "language_loss": 0.74932683, + "learning_rate": 8.730432009145027e-09, + "loss": 0.82590926, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09332275, + "step": 16152, + "time_per_iteration": 2.5061516761779785 + }, + { + "auxiliary_loss_clip": 0.06401396, + "auxiliary_loss_mlp": 0.01263582, + "balance_loss_clip": 0.06271546, + "balance_loss_mlp": 0.0125398, + "epoch": 0.9711709003457087, + "flos": 22243675472640.0, + "grad_norm": 1.8409380245762448, + "language_loss": 0.67063367, + "learning_rate": 8.694119452473448e-09, + "loss": 0.74728346, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0960083, + "step": 16153, + "time_per_iteration": 2.5174050331115723 + }, + { + "auxiliary_loss_clip": 0.06401861, + "auxiliary_loss_mlp": 0.01268174, + "balance_loss_clip": 0.06270944, + "balance_loss_mlp": 0.01259204, + "epoch": 0.9712310235983767, + "flos": 26221096281600.0, + "grad_norm": 1.5163475252585155, + "language_loss": 0.70737278, + "learning_rate": 8.65788240632037e-09, + "loss": 0.78407311, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08978271, + "step": 16154, + "time_per_iteration": 2.55505633354187 + }, + { + "auxiliary_loss_clip": 0.06405511, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01255082, + "epoch": 0.9712911468510447, + "flos": 20674509661440.0, + "grad_norm": 1.7710831738309059, + "language_loss": 0.81191093, + "learning_rate": 8.621720872059812e-09, + "loss": 0.88862026, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10327148, + "step": 16155, + "time_per_iteration": 2.4887568950653076 + }, + { + "auxiliary_loss_clip": 0.06409426, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01256958, + "epoch": 0.9713512701037126, + "flos": 13557960616320.0, + "grad_norm": 1.9435807645982621, + "language_loss": 0.67513001, + "learning_rate": 8.58563485106334e-09, + "loss": 0.75189221, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0982666, + "step": 16156, + "time_per_iteration": 2.4993584156036377 + }, + { + "auxiliary_loss_clip": 0.06404352, + "auxiliary_loss_mlp": 0.01263135, + "balance_loss_clip": 0.06270174, + "balance_loss_mlp": 0.01254081, + "epoch": 0.9714113933563806, + "flos": 25855890261120.0, + "grad_norm": 2.458858040967428, + "language_loss": 0.91195989, + "learning_rate": 8.54962434469919e-09, + "loss": 0.98863471, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0904541, + "step": 16157, + "time_per_iteration": 2.5206339359283447 + }, + { + "auxiliary_loss_clip": 0.06405168, + "auxiliary_loss_mlp": 0.01261509, + "balance_loss_clip": 0.06270272, + "balance_loss_mlp": 0.01252938, + "epoch": 0.9714715166090485, + "flos": 12746809566720.0, + "grad_norm": 2.148569057457713, + "language_loss": 0.72731894, + "learning_rate": 8.513689354332721e-09, + "loss": 0.80398571, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.08575439, + "step": 16158, + "time_per_iteration": 2.4993045330047607 + }, + { + "auxiliary_loss_clip": 0.06398468, + "auxiliary_loss_mlp": 0.01263592, + "balance_loss_clip": 0.0626895, + "balance_loss_mlp": 0.0125443, + "epoch": 0.9715316398617165, + "flos": 18411423062400.0, + "grad_norm": 2.253671983046757, + "language_loss": 0.6065799, + "learning_rate": 8.477829881326836e-09, + "loss": 0.68320048, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0916748, + "step": 16159, + "time_per_iteration": 2.5027124881744385 + }, + { + "auxiliary_loss_clip": 0.0639558, + "auxiliary_loss_mlp": 0.01264017, + "balance_loss_clip": 0.06269194, + "balance_loss_mlp": 0.01255434, + "epoch": 0.9715917631143844, + "flos": 28921490939520.0, + "grad_norm": 1.595247357103686, + "language_loss": 0.78944242, + "learning_rate": 8.44204592704112e-09, + "loss": 0.86603844, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08575439, + "step": 16160, + "time_per_iteration": 2.5898780822753906 + }, + { + "auxiliary_loss_clip": 0.06308243, + "auxiliary_loss_mlp": 0.01251149, + "balance_loss_clip": 0.06254422, + "balance_loss_mlp": 0.01250153, + "epoch": 0.9716518863670525, + "flos": 65958504900480.0, + "grad_norm": 0.7522955925244894, + "language_loss": 0.54286468, + "learning_rate": 8.406337492832704e-09, + "loss": 0.61845851, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00994873, + "step": 16161, + "time_per_iteration": 3.1553361415863037 + }, + { + "auxiliary_loss_clip": 0.06398444, + "auxiliary_loss_mlp": 0.01263413, + "balance_loss_clip": 0.06270605, + "balance_loss_mlp": 0.01254282, + "epoch": 0.9717120096197204, + "flos": 17718592377600.0, + "grad_norm": 1.69794740323834, + "language_loss": 0.71924436, + "learning_rate": 8.3707045800554e-09, + "loss": 0.79586291, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09130859, + "step": 16162, + "time_per_iteration": 2.529026746749878 + }, + { + "auxiliary_loss_clip": 0.063986, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06268875, + "balance_loss_mlp": 0.0125622, + "epoch": 0.9717721328723884, + "flos": 24470522380800.0, + "grad_norm": 1.5641682606376985, + "language_loss": 0.78791863, + "learning_rate": 8.335147190060787e-09, + "loss": 0.86456132, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09454346, + "step": 16163, + "time_per_iteration": 2.5521621704101562 + }, + { + "auxiliary_loss_clip": 0.06400799, + "auxiliary_loss_mlp": 0.01263838, + "balance_loss_clip": 0.0627103, + "balance_loss_mlp": 0.01254832, + "epoch": 0.9718322561250564, + "flos": 20782641755520.0, + "grad_norm": 1.9434386776023218, + "language_loss": 0.73329967, + "learning_rate": 8.299665324196903e-09, + "loss": 0.80994606, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09008789, + "step": 16164, + "time_per_iteration": 2.5233001708984375 + }, + { + "auxiliary_loss_clip": 0.06404097, + "auxiliary_loss_mlp": 0.0126725, + "balance_loss_clip": 0.0627019, + "balance_loss_mlp": 0.01257225, + "epoch": 0.9718923793777243, + "flos": 19031900146560.0, + "grad_norm": 2.0895359758091194, + "language_loss": 0.84477919, + "learning_rate": 8.264258983809114e-09, + "loss": 0.92149264, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10015869, + "step": 16165, + "time_per_iteration": 2.4885025024414062 + }, + { + "auxiliary_loss_clip": 0.06401068, + "auxiliary_loss_mlp": 0.01261942, + "balance_loss_clip": 0.06270339, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9719525026303923, + "flos": 21878175962880.0, + "grad_norm": 1.4925569897983804, + "language_loss": 0.79246068, + "learning_rate": 8.228928170240345e-09, + "loss": 0.8690908, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.07977295, + "step": 16166, + "time_per_iteration": 2.505911111831665 + }, + { + "auxiliary_loss_clip": 0.0639866, + "auxiliary_loss_mlp": 0.01263924, + "balance_loss_clip": 0.06269057, + "balance_loss_mlp": 0.01254107, + "epoch": 0.9720126258830603, + "flos": 14434631159040.0, + "grad_norm": 1.7340216606889713, + "language_loss": 0.71028543, + "learning_rate": 8.193672884830195e-09, + "loss": 0.78691125, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0982666, + "step": 16167, + "time_per_iteration": 4.039773941040039 + }, + { + "auxiliary_loss_clip": 0.06401287, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06272106, + "balance_loss_mlp": 0.0125432, + "epoch": 0.9720727491357283, + "flos": 26258551856640.0, + "grad_norm": 1.4905836885557386, + "language_loss": 0.76212865, + "learning_rate": 8.158493128915812e-09, + "loss": 0.83877814, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09350586, + "step": 16168, + "time_per_iteration": 2.571298837661743 + }, + { + "auxiliary_loss_clip": 0.06404977, + "auxiliary_loss_mlp": 0.01264172, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01254564, + "epoch": 0.9721328723883962, + "flos": 22680648115200.0, + "grad_norm": 2.0966560068036073, + "language_loss": 0.72333491, + "learning_rate": 8.123388903830797e-09, + "loss": 0.80002642, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0960083, + "step": 16169, + "time_per_iteration": 2.514556646347046 + }, + { + "auxiliary_loss_clip": 0.06403787, + "auxiliary_loss_mlp": 0.01263177, + "balance_loss_clip": 0.06268648, + "balance_loss_mlp": 0.01253354, + "epoch": 0.9721929956410642, + "flos": 28081647066240.0, + "grad_norm": 1.657160830557666, + "language_loss": 0.57263756, + "learning_rate": 8.088360210906309e-09, + "loss": 0.64930725, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09820557, + "step": 16170, + "time_per_iteration": 2.5566329956054688 + }, + { + "auxiliary_loss_clip": 0.06402764, + "auxiliary_loss_mlp": 0.01265099, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01255258, + "epoch": 0.9722531188937321, + "flos": 21002595523200.0, + "grad_norm": 1.5645513876953863, + "language_loss": 0.71513534, + "learning_rate": 8.053407051471062e-09, + "loss": 0.79181397, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09832764, + "step": 16171, + "time_per_iteration": 2.521963119506836 + }, + { + "auxiliary_loss_clip": 0.06400986, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269605, + "balance_loss_mlp": 0.01256069, + "epoch": 0.9723132421464001, + "flos": 16076108643840.0, + "grad_norm": 1.684444185792019, + "language_loss": 0.68665528, + "learning_rate": 8.018529426850218e-09, + "loss": 0.76332319, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09747314, + "step": 16172, + "time_per_iteration": 2.4726855754852295 + }, + { + "auxiliary_loss_clip": 0.06400435, + "auxiliary_loss_mlp": 0.01263752, + "balance_loss_clip": 0.06272088, + "balance_loss_mlp": 0.01255044, + "epoch": 0.972373365399068, + "flos": 27753183861120.0, + "grad_norm": 1.7449556340792685, + "language_loss": 0.86100602, + "learning_rate": 7.983727338366274e-09, + "loss": 0.93764794, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08703613, + "step": 16173, + "time_per_iteration": 2.5892083644866943 + }, + { + "auxiliary_loss_clip": 0.06409517, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06271982, + "balance_loss_mlp": 0.0125527, + "epoch": 0.9724334886517361, + "flos": 23009614444800.0, + "grad_norm": 1.7640837556867108, + "language_loss": 0.64575619, + "learning_rate": 7.949000787339289e-09, + "loss": 0.72251511, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11090088, + "step": 16174, + "time_per_iteration": 3.989103317260742 + }, + { + "auxiliary_loss_clip": 0.06399212, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01258275, + "epoch": 0.972493611904404, + "flos": 25454067206400.0, + "grad_norm": 1.712366988133228, + "language_loss": 0.78392601, + "learning_rate": 7.914349775085538e-09, + "loss": 0.86058748, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08654785, + "step": 16175, + "time_per_iteration": 2.548630475997925 + }, + { + "auxiliary_loss_clip": 0.06401244, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.01256337, + "epoch": 0.972553735157072, + "flos": 16988767315200.0, + "grad_norm": 2.017456752421388, + "language_loss": 0.57784498, + "learning_rate": 7.879774302919307e-09, + "loss": 0.65451986, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09906006, + "step": 16176, + "time_per_iteration": 2.4894320964813232 + }, + { + "auxiliary_loss_clip": 0.06400141, + "auxiliary_loss_mlp": 0.01263307, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01254569, + "epoch": 0.97261385840974, + "flos": 26111916011520.0, + "grad_norm": 2.620974908086474, + "language_loss": 0.72649771, + "learning_rate": 7.845274372151545e-09, + "loss": 0.80313218, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08740234, + "step": 16177, + "time_per_iteration": 2.536285400390625 + }, + { + "auxiliary_loss_clip": 0.06406036, + "auxiliary_loss_mlp": 0.01265412, + "balance_loss_clip": 0.06271951, + "balance_loss_mlp": 0.0125618, + "epoch": 0.9726739816624079, + "flos": 25455031528320.0, + "grad_norm": 1.6608985876914684, + "language_loss": 0.68600643, + "learning_rate": 7.810849984090984e-09, + "loss": 0.76272094, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09228516, + "step": 16178, + "time_per_iteration": 2.54495906829834 + }, + { + "auxiliary_loss_clip": 0.06405666, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01254405, + "epoch": 0.972734104915076, + "flos": 29021237625600.0, + "grad_norm": 2.148587612037516, + "language_loss": 0.6748485, + "learning_rate": 7.776501140042358e-09, + "loss": 0.75155145, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10229492, + "step": 16179, + "time_per_iteration": 2.5600404739379883 + }, + { + "auxiliary_loss_clip": 0.06396864, + "auxiliary_loss_mlp": 0.0126201, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01253212, + "epoch": 0.9727942281677439, + "flos": 23443861829760.0, + "grad_norm": 1.8043958106995313, + "language_loss": 0.77263665, + "learning_rate": 7.742227841308624e-09, + "loss": 0.8492254, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.0880127, + "step": 16180, + "time_per_iteration": 2.521084785461426 + }, + { + "auxiliary_loss_clip": 0.06407119, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06269898, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9728543514204119, + "flos": 31732994511360.0, + "grad_norm": 1.4950380620703005, + "language_loss": 0.76710343, + "learning_rate": 7.708030089189188e-09, + "loss": 0.84384269, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10180664, + "step": 16181, + "time_per_iteration": 2.5928866863250732 + }, + { + "auxiliary_loss_clip": 0.06401683, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01254003, + "epoch": 0.9729144746730798, + "flos": 16294888454400.0, + "grad_norm": 1.3587136174189807, + "language_loss": 0.6363312, + "learning_rate": 7.67390788498079e-09, + "loss": 0.71297884, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09075928, + "step": 16182, + "time_per_iteration": 3.9371418952941895 + }, + { + "auxiliary_loss_clip": 0.06401983, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06269817, + "balance_loss_mlp": 0.01255512, + "epoch": 0.9729745979257478, + "flos": 25047632177280.0, + "grad_norm": 1.6902434550844887, + "language_loss": 0.62347919, + "learning_rate": 7.639861229977507e-09, + "loss": 0.70014942, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09527588, + "step": 16183, + "time_per_iteration": 2.5505123138427734 + }, + { + "auxiliary_loss_clip": 0.06394369, + "auxiliary_loss_mlp": 0.01265951, + "balance_loss_clip": 0.06267164, + "balance_loss_mlp": 0.01256623, + "epoch": 0.9730347211784157, + "flos": 22645456600320.0, + "grad_norm": 2.073017408654554, + "language_loss": 0.77957594, + "learning_rate": 7.605890125470527e-09, + "loss": 0.85617918, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09326172, + "step": 16184, + "time_per_iteration": 2.5804505348205566 + }, + { + "auxiliary_loss_clip": 0.06400636, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01255625, + "epoch": 0.9730948444310837, + "flos": 11003195554560.0, + "grad_norm": 2.1007472833639764, + "language_loss": 0.79576832, + "learning_rate": 7.571994572747709e-09, + "loss": 0.87242138, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09051514, + "step": 16185, + "time_per_iteration": 2.4700310230255127 + }, + { + "auxiliary_loss_clip": 0.06404022, + "auxiliary_loss_mlp": 0.0126446, + "balance_loss_clip": 0.06270284, + "balance_loss_mlp": 0.01255167, + "epoch": 0.9731549676837516, + "flos": 16804969384320.0, + "grad_norm": 1.7281880541829828, + "language_loss": 0.77737701, + "learning_rate": 7.538174573094469e-09, + "loss": 0.85406184, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09289551, + "step": 16186, + "time_per_iteration": 2.495136022567749 + }, + { + "auxiliary_loss_clip": 0.06399482, + "auxiliary_loss_mlp": 0.01261887, + "balance_loss_clip": 0.06269419, + "balance_loss_mlp": 0.01252344, + "epoch": 0.9732150909364197, + "flos": 21148057411200.0, + "grad_norm": 1.5535957867301606, + "language_loss": 0.65284431, + "learning_rate": 7.504430127793337e-09, + "loss": 0.72945803, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09539795, + "step": 16187, + "time_per_iteration": 2.518338680267334 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01264685, + "balance_loss_clip": 0.06269566, + "balance_loss_mlp": 0.01255297, + "epoch": 0.9732752141890876, + "flos": 33735401458560.0, + "grad_norm": 1.82910578171191, + "language_loss": 0.80486286, + "learning_rate": 7.47076123812418e-09, + "loss": 0.88150704, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09387207, + "step": 16188, + "time_per_iteration": 4.078651666641235 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06272013, + "balance_loss_mlp": 0.01257331, + "epoch": 0.9733353374417556, + "flos": 23411144010240.0, + "grad_norm": 1.9709286631587892, + "language_loss": 0.79032779, + "learning_rate": 7.437167905363084e-09, + "loss": 0.86698174, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0848999, + "step": 16189, + "time_per_iteration": 2.5257105827331543 + }, + { + "auxiliary_loss_clip": 0.06399654, + "auxiliary_loss_mlp": 0.01264485, + "balance_loss_clip": 0.06269268, + "balance_loss_mlp": 0.01254859, + "epoch": 0.9733954606944236, + "flos": 39175113795840.0, + "grad_norm": 1.7501353346003765, + "language_loss": 0.5154829, + "learning_rate": 7.403650130784367e-09, + "loss": 0.59212422, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09619141, + "step": 16190, + "time_per_iteration": 2.6552765369415283 + }, + { + "auxiliary_loss_clip": 0.06401493, + "auxiliary_loss_mlp": 0.0126365, + "balance_loss_clip": 0.06270113, + "balance_loss_mlp": 0.01254399, + "epoch": 0.9734555839470915, + "flos": 21988404408960.0, + "grad_norm": 1.6917761337688713, + "language_loss": 0.80587709, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.88252854, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09240723, + "step": 16191, + "time_per_iteration": 2.544218063354492 + }, + { + "auxiliary_loss_clip": 0.06401005, + "auxiliary_loss_mlp": 0.01263985, + "balance_loss_clip": 0.06270884, + "balance_loss_mlp": 0.01255462, + "epoch": 0.9735157071997596, + "flos": 16580152080000.0, + "grad_norm": 1.6445033626278693, + "language_loss": 0.8313902, + "learning_rate": 7.336841261255111e-09, + "loss": 0.90804017, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08526611, + "step": 16192, + "time_per_iteration": 2.4879636764526367 + }, + { + "auxiliary_loss_clip": 0.0640302, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01255596, + "epoch": 0.9735758304524275, + "flos": 20228313070080.0, + "grad_norm": 1.7244487674572468, + "language_loss": 0.75065506, + "learning_rate": 7.303550168837658e-09, + "loss": 0.82733583, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09472656, + "step": 16193, + "time_per_iteration": 2.4931979179382324 + }, + { + "auxiliary_loss_clip": 0.06397454, + "auxiliary_loss_mlp": 0.01262104, + "balance_loss_clip": 0.06270149, + "balance_loss_mlp": 0.01253688, + "epoch": 0.9736359537050955, + "flos": 23659077841920.0, + "grad_norm": 1.629712416735138, + "language_loss": 0.85322011, + "learning_rate": 7.270334639669417e-09, + "loss": 0.92981565, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08416748, + "step": 16194, + "time_per_iteration": 2.505967140197754 + }, + { + "auxiliary_loss_clip": 0.06396167, + "auxiliary_loss_mlp": 0.01264562, + "balance_loss_clip": 0.06270817, + "balance_loss_mlp": 0.01255919, + "epoch": 0.9736960769577634, + "flos": 15565692297600.0, + "grad_norm": 1.4618204477527796, + "language_loss": 0.76054919, + "learning_rate": 7.237194675009828e-09, + "loss": 0.83715641, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08648682, + "step": 16195, + "time_per_iteration": 2.4902737140655518 + }, + { + "auxiliary_loss_clip": 0.0630816, + "auxiliary_loss_mlp": 0.01249959, + "balance_loss_clip": 0.06254224, + "balance_loss_mlp": 0.01248933, + "epoch": 0.9737562002104314, + "flos": 65369781313920.0, + "grad_norm": 0.7068967034804419, + "language_loss": 0.52516842, + "learning_rate": 7.204130276115439e-09, + "loss": 0.60074961, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01026154, + "step": 16196, + "time_per_iteration": 3.0891356468200684 + }, + { + "auxiliary_loss_clip": 0.06402862, + "auxiliary_loss_mlp": 0.01264517, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01255028, + "epoch": 0.9738163234630993, + "flos": 27203760639360.0, + "grad_norm": 1.5067079067640303, + "language_loss": 0.76304662, + "learning_rate": 7.171141444240136e-09, + "loss": 0.83972049, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09484863, + "step": 16197, + "time_per_iteration": 2.5539703369140625 + }, + { + "auxiliary_loss_clip": 0.06407809, + "auxiliary_loss_mlp": 0.01266448, + "balance_loss_clip": 0.06270401, + "balance_loss_mlp": 0.01256124, + "epoch": 0.9738764467157673, + "flos": 21075745737600.0, + "grad_norm": 1.7086384340605625, + "language_loss": 0.67975712, + "learning_rate": 7.13822818063492e-09, + "loss": 0.75649977, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10327148, + "step": 16198, + "time_per_iteration": 2.503563165664673 + }, + { + "auxiliary_loss_clip": 0.06400761, + "auxiliary_loss_mlp": 0.01264048, + "balance_loss_clip": 0.06268206, + "balance_loss_mlp": 0.01254678, + "epoch": 0.9739365699684353, + "flos": 21367633835520.0, + "grad_norm": 1.6722273103700527, + "language_loss": 0.77999789, + "learning_rate": 7.10539048654768e-09, + "loss": 0.85664594, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09362793, + "step": 16199, + "time_per_iteration": 2.5150656700134277 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01264046, + "balance_loss_clip": 0.06271003, + "balance_loss_mlp": 0.0125411, + "epoch": 0.9739966932211033, + "flos": 21907497692160.0, + "grad_norm": 1.5607608988910977, + "language_loss": 0.79645491, + "learning_rate": 7.072628363223865e-09, + "loss": 0.87312341, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09936523, + "step": 16200, + "time_per_iteration": 2.5212936401367188 + }, + { + "auxiliary_loss_clip": 0.06407085, + "auxiliary_loss_mlp": 0.01263577, + "balance_loss_clip": 0.06268042, + "balance_loss_mlp": 0.01253474, + "epoch": 0.9740568164737712, + "flos": 24834344808960.0, + "grad_norm": 2.2264646235457937, + "language_loss": 0.69207859, + "learning_rate": 7.039941811905592e-09, + "loss": 0.76878524, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10113525, + "step": 16201, + "time_per_iteration": 2.5361874103546143 + }, + { + "auxiliary_loss_clip": 0.06404103, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06272092, + "balance_loss_mlp": 0.01256105, + "epoch": 0.9741169397264392, + "flos": 23630426945280.0, + "grad_norm": 1.5091663740328265, + "language_loss": 0.72960538, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.80629796, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09051514, + "step": 16202, + "time_per_iteration": 2.53006649017334 + }, + { + "auxiliary_loss_clip": 0.064046, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01255236, + "epoch": 0.9741770629791072, + "flos": 18846718623360.0, + "grad_norm": 1.822554423323346, + "language_loss": 0.72919339, + "learning_rate": 6.974795430241265e-09, + "loss": 0.80588603, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09423828, + "step": 16203, + "time_per_iteration": 2.504948616027832 + }, + { + "auxiliary_loss_clip": 0.06402065, + "auxiliary_loss_mlp": 0.01262649, + "balance_loss_clip": 0.06270267, + "balance_loss_mlp": 0.0125347, + "epoch": 0.9742371862317751, + "flos": 22352813815680.0, + "grad_norm": 2.4570819002926303, + "language_loss": 0.77505815, + "learning_rate": 6.942335602365235e-09, + "loss": 0.85170531, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09179688, + "step": 16204, + "time_per_iteration": 2.499577760696411 + }, + { + "auxiliary_loss_clip": 0.06406648, + "auxiliary_loss_mlp": 0.01266659, + "balance_loss_clip": 0.06274957, + "balance_loss_mlp": 0.01257093, + "epoch": 0.9742973094844432, + "flos": 21769289182080.0, + "grad_norm": 1.965411642233907, + "language_loss": 0.79419672, + "learning_rate": 6.909951351435905e-09, + "loss": 0.87092984, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09564209, + "step": 16205, + "time_per_iteration": 2.4995784759521484 + }, + { + "auxiliary_loss_clip": 0.06399336, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06269155, + "balance_loss_mlp": 0.01256147, + "epoch": 0.9743574327371111, + "flos": 26255700817920.0, + "grad_norm": 1.508831100662547, + "language_loss": 0.7445184, + "learning_rate": 6.87764267868074e-09, + "loss": 0.82116306, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08984375, + "step": 16206, + "time_per_iteration": 4.032231092453003 + }, + { + "auxiliary_loss_clip": 0.06402233, + "auxiliary_loss_mlp": 0.01262179, + "balance_loss_clip": 0.06268986, + "balance_loss_mlp": 0.01252487, + "epoch": 0.9744175559897791, + "flos": 12354252387840.0, + "grad_norm": 2.280007782311689, + "language_loss": 0.84424287, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.92088699, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09686279, + "step": 16207, + "time_per_iteration": 2.6139605045318604 + }, + { + "auxiliary_loss_clip": 0.06399205, + "auxiliary_loss_mlp": 0.01262873, + "balance_loss_clip": 0.06271303, + "balance_loss_mlp": 0.01254231, + "epoch": 0.974477679242447, + "flos": 28404575902080.0, + "grad_norm": 1.4963528987347634, + "language_loss": 0.71026999, + "learning_rate": 6.813252072591425e-09, + "loss": 0.78689075, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08648682, + "step": 16208, + "time_per_iteration": 2.599848747253418 + }, + { + "auxiliary_loss_clip": 0.06394268, + "auxiliary_loss_mlp": 0.01262607, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.0125409, + "epoch": 0.974537802495115, + "flos": 17791155613440.0, + "grad_norm": 1.6815172078173168, + "language_loss": 0.77535599, + "learning_rate": 6.781170141698878e-09, + "loss": 0.85192478, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.08526611, + "step": 16209, + "time_per_iteration": 2.4785659313201904 + }, + { + "auxiliary_loss_clip": 0.06402382, + "auxiliary_loss_mlp": 0.01263455, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.0125365, + "epoch": 0.9745979257477829, + "flos": 23849164828800.0, + "grad_norm": 1.5681531369172674, + "language_loss": 0.79805732, + "learning_rate": 6.749163793864144e-09, + "loss": 0.87471569, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09802246, + "step": 16210, + "time_per_iteration": 2.525526285171509 + }, + { + "auxiliary_loss_clip": 0.06400919, + "auxiliary_loss_mlp": 0.01262256, + "balance_loss_clip": 0.06269119, + "balance_loss_mlp": 0.0125294, + "epoch": 0.9746580490004509, + "flos": 27023484579840.0, + "grad_norm": 2.075547249109443, + "language_loss": 0.78150928, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.85814106, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09307861, + "step": 16211, + "time_per_iteration": 2.5355217456817627 + }, + { + "auxiliary_loss_clip": 0.06411395, + "auxiliary_loss_mlp": 0.01265327, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.01255027, + "epoch": 0.9747181722531189, + "flos": 19798132608000.0, + "grad_norm": 1.9008085045696146, + "language_loss": 0.7795732, + "learning_rate": 6.685377852219787e-09, + "loss": 0.85634041, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10296631, + "step": 16212, + "time_per_iteration": 2.506300926208496 + }, + { + "auxiliary_loss_clip": 0.06398016, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06269819, + "balance_loss_mlp": 0.01256008, + "epoch": 0.9747782955057869, + "flos": 31438590791040.0, + "grad_norm": 1.3851280595823252, + "language_loss": 0.80251986, + "learning_rate": 6.653598260829118e-09, + "loss": 0.87914777, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08764648, + "step": 16213, + "time_per_iteration": 2.5735127925872803 + }, + { + "auxiliary_loss_clip": 0.0640009, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06269902, + "balance_loss_mlp": 0.01254558, + "epoch": 0.9748384187584548, + "flos": 15966802592640.0, + "grad_norm": 1.8081777723616046, + "language_loss": 0.66367626, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.7403146, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09185791, + "step": 16214, + "time_per_iteration": 4.007796049118042 + }, + { + "auxiliary_loss_clip": 0.06407678, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01261251, + "epoch": 0.9748985420111228, + "flos": 20565035902080.0, + "grad_norm": 3.5974058234157056, + "language_loss": 0.74614125, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.82292747, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09692383, + "step": 16215, + "time_per_iteration": 2.621452808380127 + }, + { + "auxiliary_loss_clip": 0.06399758, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06270334, + "balance_loss_mlp": 0.01254696, + "epoch": 0.9749586652637908, + "flos": 36730577180160.0, + "grad_norm": 1.6258391416497984, + "language_loss": 0.66849625, + "learning_rate": 6.558713018834483e-09, + "loss": 0.74513459, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09381104, + "step": 16216, + "time_per_iteration": 2.61350417137146 + }, + { + "auxiliary_loss_clip": 0.06405714, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.062713, + "balance_loss_mlp": 0.01255393, + "epoch": 0.9750187885164587, + "flos": 11003908314240.0, + "grad_norm": 1.786638757254164, + "language_loss": 0.72343373, + "learning_rate": 6.527235786226937e-09, + "loss": 0.80013883, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09399414, + "step": 16217, + "time_per_iteration": 2.466787576675415 + }, + { + "auxiliary_loss_clip": 0.06400132, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06270897, + "balance_loss_mlp": 0.01253667, + "epoch": 0.9750789117691268, + "flos": 25746668064000.0, + "grad_norm": 1.4800942983039718, + "language_loss": 0.78881538, + "learning_rate": 6.495834146306167e-09, + "loss": 0.86543876, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08532715, + "step": 16218, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.01261833, + "balance_loss_clip": 0.06271155, + "balance_loss_mlp": 0.01252458, + "epoch": 0.9751390350217947, + "flos": 13338971170560.0, + "grad_norm": 1.8880651410649392, + "language_loss": 0.77665508, + "learning_rate": 6.464508100263222e-09, + "loss": 0.85325623, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09362793, + "step": 16219, + "time_per_iteration": 2.511852741241455 + }, + { + "auxiliary_loss_clip": 0.06405136, + "auxiliary_loss_mlp": 0.01262829, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.0125393, + "epoch": 0.9751991582744627, + "flos": 22827283960320.0, + "grad_norm": 1.5654377531659194, + "language_loss": 0.81504959, + "learning_rate": 6.433257649285817e-09, + "loss": 0.89172924, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08905029, + "step": 16220, + "time_per_iteration": 2.478729009628296 + }, + { + "auxiliary_loss_clip": 0.06398819, + "auxiliary_loss_mlp": 0.01262589, + "balance_loss_clip": 0.06270699, + "balance_loss_mlp": 0.01253696, + "epoch": 0.9752592815271306, + "flos": 19652293376640.0, + "grad_norm": 1.7313417854694155, + "language_loss": 0.75431448, + "learning_rate": 6.402082794559227e-09, + "loss": 0.83092856, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08892822, + "step": 16221, + "time_per_iteration": 2.4944918155670166 + }, + { + "auxiliary_loss_clip": 0.06398918, + "auxiliary_loss_mlp": 0.01265498, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.0125623, + "epoch": 0.9753194047797986, + "flos": 26698165902720.0, + "grad_norm": 1.457397211257543, + "language_loss": 0.66733098, + "learning_rate": 6.370983537265395e-09, + "loss": 0.74397516, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09265137, + "step": 16222, + "time_per_iteration": 3.96037220954895 + }, + { + "auxiliary_loss_clip": 0.06399057, + "auxiliary_loss_mlp": 0.01263788, + "balance_loss_clip": 0.06270253, + "balance_loss_mlp": 0.01254787, + "epoch": 0.9753795280324665, + "flos": 23228478109440.0, + "grad_norm": 1.713022931639831, + "language_loss": 0.88554835, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.9621768, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08996582, + "step": 16223, + "time_per_iteration": 2.514981269836426 + }, + { + "auxiliary_loss_clip": 0.06396091, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06269166, + "balance_loss_mlp": 0.01257177, + "epoch": 0.9754396512851345, + "flos": 19469920965120.0, + "grad_norm": 1.6965637319333444, + "language_loss": 0.74798816, + "learning_rate": 6.309011819690457e-09, + "loss": 0.82460868, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08764648, + "step": 16224, + "time_per_iteration": 2.4790241718292236 + }, + { + "auxiliary_loss_clip": 0.06309325, + "auxiliary_loss_mlp": 0.01249123, + "balance_loss_clip": 0.06255152, + "balance_loss_mlp": 0.012481, + "epoch": 0.9754997745378025, + "flos": 68478875061120.0, + "grad_norm": 0.7927113550551911, + "language_loss": 0.59015584, + "learning_rate": 6.278139361759249e-09, + "loss": 0.66574037, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01023102, + "step": 16225, + "time_per_iteration": 3.09687876701355 + }, + { + "auxiliary_loss_clip": 0.06404333, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06274021, + "balance_loss_mlp": 0.01261505, + "epoch": 0.9755598977904705, + "flos": 26402252808960.0, + "grad_norm": 1.669263937257646, + "language_loss": 0.68925965, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7660107, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09265137, + "step": 16226, + "time_per_iteration": 2.5773234367370605 + }, + { + "auxiliary_loss_clip": 0.06400628, + "auxiliary_loss_mlp": 0.01261945, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01252522, + "epoch": 0.9756200210431384, + "flos": 16623225878400.0, + "grad_norm": 1.6660576711306636, + "language_loss": 0.83624262, + "learning_rate": 6.216621253462894e-09, + "loss": 0.91286826, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09417725, + "step": 16227, + "time_per_iteration": 2.486311435699463 + }, + { + "auxiliary_loss_clip": 0.06398968, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01255321, + "epoch": 0.9756801442958064, + "flos": 23629798039680.0, + "grad_norm": 1.652694974526233, + "language_loss": 0.77968043, + "learning_rate": 6.185975605430549e-09, + "loss": 0.85631275, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0894165, + "step": 16228, + "time_per_iteration": 3.98093843460083 + }, + { + "auxiliary_loss_clip": 0.06308308, + "auxiliary_loss_mlp": 0.01248433, + "balance_loss_clip": 0.06254362, + "balance_loss_mlp": 0.01247415, + "epoch": 0.9757402675484744, + "flos": 61642432615680.0, + "grad_norm": 0.8144485911431966, + "language_loss": 0.55775505, + "learning_rate": 6.155405563025962e-09, + "loss": 0.63332248, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01017761, + "step": 16229, + "time_per_iteration": 3.1203420162200928 + }, + { + "auxiliary_loss_clip": 0.06401952, + "auxiliary_loss_mlp": 0.01267662, + "balance_loss_clip": 0.06270453, + "balance_loss_mlp": 0.01258298, + "epoch": 0.9758003908011423, + "flos": 24065470944000.0, + "grad_norm": 1.894418364311992, + "language_loss": 0.7524991, + "learning_rate": 6.124911127407984e-09, + "loss": 0.82919526, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09362793, + "step": 16230, + "time_per_iteration": 2.5575931072235107 + }, + { + "auxiliary_loss_clip": 0.06396811, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06271554, + "balance_loss_mlp": 0.01255773, + "epoch": 0.9758605140538104, + "flos": 17498764391040.0, + "grad_norm": 1.8422769218162587, + "language_loss": 0.71889436, + "learning_rate": 6.094492299733245e-09, + "loss": 0.79550505, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08483887, + "step": 16231, + "time_per_iteration": 2.5055992603302 + }, + { + "auxiliary_loss_clip": 0.06407274, + "auxiliary_loss_mlp": 0.01266757, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.01256463, + "epoch": 0.9759206373064783, + "flos": 24833883611520.0, + "grad_norm": 1.7197145025092386, + "language_loss": 0.76920104, + "learning_rate": 6.064149081155267e-09, + "loss": 0.84594142, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10296631, + "step": 16232, + "time_per_iteration": 2.5294857025146484 + }, + { + "auxiliary_loss_clip": 0.0630935, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_clip": 0.06255519, + "balance_loss_mlp": 0.01248793, + "epoch": 0.9759807605591463, + "flos": 68179649731200.0, + "grad_norm": 0.719631552631875, + "language_loss": 0.53778744, + "learning_rate": 6.033881472824465e-09, + "loss": 0.61337841, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00958252, + "step": 16233, + "time_per_iteration": 3.017638683319092 + }, + { + "auxiliary_loss_clip": 0.0640213, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 0.06271942, + "balance_loss_mlp": 0.01256853, + "epoch": 0.9760408838118142, + "flos": 18995199258240.0, + "grad_norm": 1.757221153024699, + "language_loss": 0.71420014, + "learning_rate": 6.003689475888807e-09, + "loss": 0.79088712, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0970459, + "step": 16234, + "time_per_iteration": 2.493136167526245 + }, + { + "auxiliary_loss_clip": 0.06408353, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06272238, + "balance_loss_mlp": 0.01257104, + "epoch": 0.9761010070644822, + "flos": 17131210456320.0, + "grad_norm": 2.3283739707112354, + "language_loss": 0.79285693, + "learning_rate": 5.973573091493156e-09, + "loss": 0.86960596, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09448242, + "step": 16235, + "time_per_iteration": 2.48677921295166 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 0.06271134, + "balance_loss_mlp": 0.01255166, + "epoch": 0.9761611303171501, + "flos": 22058829365760.0, + "grad_norm": 1.763069638375242, + "language_loss": 0.77298689, + "learning_rate": 5.943532320779265e-09, + "loss": 0.84964001, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09954834, + "step": 16236, + "time_per_iteration": 2.5670228004455566 + }, + { + "auxiliary_loss_clip": 0.06401862, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01256682, + "epoch": 0.9762212535698181, + "flos": 21763167834240.0, + "grad_norm": 1.9679872991470369, + "language_loss": 0.75770509, + "learning_rate": 5.913567164886446e-09, + "loss": 0.83437711, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08654785, + "step": 16237, + "time_per_iteration": 2.51847505569458 + }, + { + "auxiliary_loss_clip": 0.06401821, + "auxiliary_loss_mlp": 0.01266592, + "balance_loss_clip": 0.06269572, + "balance_loss_mlp": 0.01255786, + "epoch": 0.9762813768224861, + "flos": 25928746986240.0, + "grad_norm": 1.5570589919233344, + "language_loss": 0.73076248, + "learning_rate": 5.8836776249509e-09, + "loss": 0.8074466, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10803223, + "step": 16238, + "time_per_iteration": 2.527402877807617 + }, + { + "auxiliary_loss_clip": 0.06403423, + "auxiliary_loss_mlp": 0.01265456, + "balance_loss_clip": 0.06271146, + "balance_loss_mlp": 0.01256283, + "epoch": 0.9763415000751541, + "flos": 24057169390080.0, + "grad_norm": 2.218643213238991, + "language_loss": 0.84103715, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.91772586, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09173584, + "step": 16239, + "time_per_iteration": 2.5126121044158936 + }, + { + "auxiliary_loss_clip": 0.06400665, + "auxiliary_loss_mlp": 0.01266419, + "balance_loss_clip": 0.06270189, + "balance_loss_mlp": 0.01257031, + "epoch": 0.976401623327822, + "flos": 17024252319360.0, + "grad_norm": 2.8876025020508265, + "language_loss": 0.60672832, + "learning_rate": 5.824125397483115e-09, + "loss": 0.6833992, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09387207, + "step": 16240, + "time_per_iteration": 2.463484287261963 + }, + { + "auxiliary_loss_clip": 0.06397688, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06269402, + "balance_loss_mlp": 0.01256286, + "epoch": 0.97646174658049, + "flos": 16112432188800.0, + "grad_norm": 2.071519660187613, + "language_loss": 0.82556367, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.90219802, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09460449, + "step": 16241, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06403396, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06272305, + "balance_loss_mlp": 0.01255433, + "epoch": 0.9765218698331579, + "flos": 21259292106240.0, + "grad_norm": 1.6838503485732548, + "language_loss": 0.83407527, + "learning_rate": 5.764875647408463e-09, + "loss": 0.91075718, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09362793, + "step": 16242, + "time_per_iteration": 2.504279136657715 + }, + { + "auxiliary_loss_clip": 0.06404735, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06273401, + "balance_loss_mlp": 0.01255939, + "epoch": 0.9765819930858259, + "flos": 18593963182080.0, + "grad_norm": 1.7428652843510748, + "language_loss": 0.75944352, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.83614349, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09320068, + "step": 16243, + "time_per_iteration": 2.5364439487457275 + }, + { + "auxiliary_loss_clip": 0.0640022, + "auxiliary_loss_mlp": 0.01267394, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01257899, + "epoch": 0.976642116338494, + "flos": 20273105877120.0, + "grad_norm": 1.6450832165857223, + "language_loss": 0.6998055, + "learning_rate": 5.705928383713754e-09, + "loss": 0.77648169, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09503174, + "step": 16244, + "time_per_iteration": 2.503443717956543 + }, + { + "auxiliary_loss_clip": 0.06406413, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06273812, + "balance_loss_mlp": 0.01259357, + "epoch": 0.9767022395911619, + "flos": 25556497223040.0, + "grad_norm": 1.7598332350638926, + "language_loss": 0.84103727, + "learning_rate": 5.676568187055197e-09, + "loss": 0.91779447, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09942627, + "step": 16245, + "time_per_iteration": 2.5492780208587646 + }, + { + "auxiliary_loss_clip": 0.06397044, + "auxiliary_loss_mlp": 0.01262033, + "balance_loss_clip": 0.06267294, + "balance_loss_mlp": 0.01252812, + "epoch": 0.9767623628438299, + "flos": 21769163400960.0, + "grad_norm": 1.4065715679155657, + "language_loss": 0.78878963, + "learning_rate": 5.647283615340726e-09, + "loss": 0.86538041, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09222412, + "step": 16246, + "time_per_iteration": 4.017332315444946 + }, + { + "auxiliary_loss_clip": 0.06389856, + "auxiliary_loss_mlp": 0.01259694, + "balance_loss_clip": 0.06268258, + "balance_loss_mlp": 0.01251588, + "epoch": 0.9768224860964978, + "flos": 15856490292480.0, + "grad_norm": 1.4347284082361575, + "language_loss": 0.74287903, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.81937456, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.08105469, + "step": 16247, + "time_per_iteration": 2.4851341247558594 + }, + { + "auxiliary_loss_clip": 0.0640361, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06272487, + "balance_loss_mlp": 0.0125505, + "epoch": 0.9768826093491658, + "flos": 25157441352960.0, + "grad_norm": 1.4750714987336841, + "language_loss": 0.80060053, + "learning_rate": 5.58894135118404e-09, + "loss": 0.8772766, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08947754, + "step": 16248, + "time_per_iteration": 2.538630485534668 + }, + { + "auxiliary_loss_clip": 0.06412353, + "auxiliary_loss_mlp": 0.01267958, + "balance_loss_clip": 0.06277192, + "balance_loss_mlp": 0.01257765, + "epoch": 0.9769427326018337, + "flos": 22973794024320.0, + "grad_norm": 1.7015435546437248, + "language_loss": 0.79519981, + "learning_rate": 5.559883660954278e-09, + "loss": 0.87200296, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10180664, + "step": 16249, + "time_per_iteration": 2.5262768268585205 + }, + { + "auxiliary_loss_clip": 0.06397509, + "auxiliary_loss_mlp": 0.01267019, + "balance_loss_clip": 0.06270598, + "balance_loss_mlp": 0.01257393, + "epoch": 0.9770028558545018, + "flos": 15269066444160.0, + "grad_norm": 1.8482758647978654, + "language_loss": 0.66747582, + "learning_rate": 5.530901600093507e-09, + "loss": 0.74412113, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09613037, + "step": 16250, + "time_per_iteration": 2.475498914718628 + }, + { + "auxiliary_loss_clip": 0.0631108, + "auxiliary_loss_mlp": 0.01248906, + "balance_loss_clip": 0.06257159, + "balance_loss_mlp": 0.01247916, + "epoch": 0.9770629791071697, + "flos": 71470277349120.0, + "grad_norm": 0.766535928446672, + "language_loss": 0.59739006, + "learning_rate": 5.501995169700846e-09, + "loss": 0.6729899, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0098877, + "step": 16251, + "time_per_iteration": 3.171131134033203 + }, + { + "auxiliary_loss_clip": 0.06401361, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01256032, + "epoch": 0.9771231023598377, + "flos": 22418375235840.0, + "grad_norm": 1.6976848540118503, + "language_loss": 0.78588271, + "learning_rate": 5.473164370872307e-09, + "loss": 0.86254865, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09197998, + "step": 16252, + "time_per_iteration": 2.5451128482818604 + }, + { + "auxiliary_loss_clip": 0.06400634, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06269819, + "balance_loss_mlp": 0.01253623, + "epoch": 0.9771832256125056, + "flos": 19031942073600.0, + "grad_norm": 3.8752836290944774, + "language_loss": 0.65360057, + "learning_rate": 5.444409204701461e-09, + "loss": 0.73024035, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09729004, + "step": 16253, + "time_per_iteration": 3.9955294132232666 + }, + { + "auxiliary_loss_clip": 0.06406756, + "auxiliary_loss_mlp": 0.01265874, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255592, + "epoch": 0.9772433488651736, + "flos": 17827982282880.0, + "grad_norm": 2.0997041921444652, + "language_loss": 0.77016485, + "learning_rate": 5.415729672278324e-09, + "loss": 0.84689116, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10272217, + "step": 16254, + "time_per_iteration": 2.4991238117218018 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01266948, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01256881, + "epoch": 0.9773034721178415, + "flos": 37638246533760.0, + "grad_norm": 2.3865763339015618, + "language_loss": 0.64227772, + "learning_rate": 5.387125774690471e-09, + "loss": 0.71899939, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10064697, + "step": 16255, + "time_per_iteration": 2.8432374000549316 + }, + { + "auxiliary_loss_clip": 0.06406088, + "auxiliary_loss_mlp": 0.01265056, + "balance_loss_clip": 0.06270261, + "balance_loss_mlp": 0.01254858, + "epoch": 0.9773635953705095, + "flos": 20308590881280.0, + "grad_norm": 1.9209330151147832, + "language_loss": 0.7554915, + "learning_rate": 5.358597513023033e-09, + "loss": 0.83220291, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10192871, + "step": 16256, + "time_per_iteration": 2.539581537246704 + }, + { + "auxiliary_loss_clip": 0.06402241, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06274899, + "balance_loss_mlp": 0.01258186, + "epoch": 0.9774237186231776, + "flos": 22315735584000.0, + "grad_norm": 2.134374282243183, + "language_loss": 0.78430331, + "learning_rate": 5.330144888357369e-09, + "loss": 0.86100471, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0970459, + "step": 16257, + "time_per_iteration": 2.521059513092041 + }, + { + "auxiliary_loss_clip": 0.06404999, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01255106, + "epoch": 0.9774838418758455, + "flos": 24211435956480.0, + "grad_norm": 2.254901577298529, + "language_loss": 0.75327086, + "learning_rate": 5.301767901772391e-09, + "loss": 0.82996702, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09509277, + "step": 16258, + "time_per_iteration": 2.5600156784057617 + }, + { + "auxiliary_loss_clip": 0.06308343, + "auxiliary_loss_mlp": 0.01249899, + "balance_loss_clip": 0.06254452, + "balance_loss_mlp": 0.01248971, + "epoch": 0.9775439651285135, + "flos": 66378691998720.0, + "grad_norm": 0.6729555007121276, + "language_loss": 0.59753788, + "learning_rate": 5.273466554344353e-09, + "loss": 0.67312038, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00926208, + "step": 16259, + "time_per_iteration": 3.2042317390441895 + }, + { + "auxiliary_loss_clip": 0.06408554, + "auxiliary_loss_mlp": 0.01265358, + "balance_loss_clip": 0.06274059, + "balance_loss_mlp": 0.01255547, + "epoch": 0.9776040883811814, + "flos": 22608168733440.0, + "grad_norm": 1.5933188792012458, + "language_loss": 0.7377913, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.81453043, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.0980835, + "step": 16260, + "time_per_iteration": 2.520371675491333 + }, + { + "auxiliary_loss_clip": 0.06402797, + "auxiliary_loss_mlp": 0.0126442, + "balance_loss_clip": 0.06271645, + "balance_loss_mlp": 0.01254412, + "epoch": 0.9776642116338494, + "flos": 18448082023680.0, + "grad_norm": 1.9636602337481959, + "language_loss": 0.80066824, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.87734044, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10009766, + "step": 16261, + "time_per_iteration": 3.925679922103882 + }, + { + "auxiliary_loss_clip": 0.06401169, + "auxiliary_loss_mlp": 0.01262925, + "balance_loss_clip": 0.06267936, + "balance_loss_mlp": 0.01253269, + "epoch": 0.9777243348865173, + "flos": 22645121184000.0, + "grad_norm": 2.276585327345628, + "language_loss": 0.74144262, + "learning_rate": 5.189016357718845e-09, + "loss": 0.81808358, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09649658, + "step": 16262, + "time_per_iteration": 2.5040698051452637 + }, + { + "auxiliary_loss_clip": 0.06405801, + "auxiliary_loss_mlp": 0.0126505, + "balance_loss_clip": 0.06272787, + "balance_loss_mlp": 0.01254345, + "epoch": 0.9777844581391854, + "flos": 31329410520960.0, + "grad_norm": 2.244508140891946, + "language_loss": 0.7062791, + "learning_rate": 5.16101757762133e-09, + "loss": 0.78298759, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1071167, + "step": 16263, + "time_per_iteration": 2.6393070220947266 + }, + { + "auxiliary_loss_clip": 0.06403024, + "auxiliary_loss_mlp": 0.0126253, + "balance_loss_clip": 0.06270716, + "balance_loss_mlp": 0.01253053, + "epoch": 0.9778445813918533, + "flos": 23045728354560.0, + "grad_norm": 1.6974232351495746, + "language_loss": 0.66375017, + "learning_rate": 5.133094442018038e-09, + "loss": 0.74040568, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09484863, + "step": 16264, + "time_per_iteration": 2.505544900894165 + }, + { + "auxiliary_loss_clip": 0.06409594, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01255602, + "epoch": 0.9779047046445213, + "flos": 17572082313600.0, + "grad_norm": 2.0688603545679585, + "language_loss": 0.73281831, + "learning_rate": 5.105246951967679e-09, + "loss": 0.80956852, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.09820557, + "step": 16265, + "time_per_iteration": 2.477476119995117 + }, + { + "auxiliary_loss_clip": 0.06397505, + "auxiliary_loss_mlp": 0.01262251, + "balance_loss_clip": 0.06270322, + "balance_loss_mlp": 0.01253298, + "epoch": 0.9779648278971892, + "flos": 20747492167680.0, + "grad_norm": 1.8532261221017665, + "language_loss": 0.68805051, + "learning_rate": 5.077475108526297e-09, + "loss": 0.76464808, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08953857, + "step": 16266, + "time_per_iteration": 2.505934238433838 + }, + { + "auxiliary_loss_clip": 0.06398112, + "auxiliary_loss_mlp": 0.01264596, + "balance_loss_clip": 0.06272861, + "balance_loss_mlp": 0.01255992, + "epoch": 0.9780249511498572, + "flos": 21032336522880.0, + "grad_norm": 2.6886691630763884, + "language_loss": 0.8669281, + "learning_rate": 5.049778912747049e-09, + "loss": 0.94355524, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08602905, + "step": 16267, + "time_per_iteration": 2.510568141937256 + }, + { + "auxiliary_loss_clip": 0.06402868, + "auxiliary_loss_mlp": 0.01263569, + "balance_loss_clip": 0.0627014, + "balance_loss_mlp": 0.01253591, + "epoch": 0.9780850744025251, + "flos": 30782167505280.0, + "grad_norm": 1.6577621473420363, + "language_loss": 0.70518297, + "learning_rate": 5.022158365679985e-09, + "loss": 0.78184736, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09967041, + "step": 16268, + "time_per_iteration": 4.031549453735352 + }, + { + "auxiliary_loss_clip": 0.06402364, + "auxiliary_loss_mlp": 0.0126831, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.01258832, + "epoch": 0.9781451976551931, + "flos": 20309219786880.0, + "grad_norm": 1.5149963209120108, + "language_loss": 0.74065733, + "learning_rate": 4.994613468372711e-09, + "loss": 0.8173641, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09472656, + "step": 16269, + "time_per_iteration": 2.4883556365966797 + }, + { + "auxiliary_loss_clip": 0.06404697, + "auxiliary_loss_mlp": 0.0126611, + "balance_loss_clip": 0.06272128, + "balance_loss_mlp": 0.01256036, + "epoch": 0.9782053209078612, + "flos": 24323383411200.0, + "grad_norm": 2.11112255910788, + "language_loss": 0.70834357, + "learning_rate": 4.967144221869501e-09, + "loss": 0.78505164, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10076904, + "step": 16270, + "time_per_iteration": 2.5375027656555176 + }, + { + "auxiliary_loss_clip": 0.06403029, + "auxiliary_loss_mlp": 0.01263166, + "balance_loss_clip": 0.0627147, + "balance_loss_mlp": 0.01253874, + "epoch": 0.9782654441605291, + "flos": 32497717599360.0, + "grad_norm": 3.0838275536528705, + "language_loss": 0.64377117, + "learning_rate": 4.939750627212191e-09, + "loss": 0.72043312, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09301758, + "step": 16271, + "time_per_iteration": 2.5905959606170654 + }, + { + "auxiliary_loss_clip": 0.06396818, + "auxiliary_loss_mlp": 0.01263415, + "balance_loss_clip": 0.06269811, + "balance_loss_mlp": 0.01253783, + "epoch": 0.9783255674131971, + "flos": 26986280567040.0, + "grad_norm": 1.9658813772061734, + "language_loss": 0.70980221, + "learning_rate": 4.912432685439505e-09, + "loss": 0.78640461, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09631348, + "step": 16272, + "time_per_iteration": 2.5623769760131836 + }, + { + "auxiliary_loss_clip": 0.06402478, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06270878, + "balance_loss_mlp": 0.01257736, + "epoch": 0.978385690665865, + "flos": 23118920496000.0, + "grad_norm": 3.4786188165318648, + "language_loss": 0.67056668, + "learning_rate": 4.88519039758728e-09, + "loss": 0.74726236, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09356689, + "step": 16273, + "time_per_iteration": 2.516294002532959 + }, + { + "auxiliary_loss_clip": 0.06402078, + "auxiliary_loss_mlp": 0.01264409, + "balance_loss_clip": 0.06269372, + "balance_loss_mlp": 0.01255099, + "epoch": 0.978445813918533, + "flos": 25416527777280.0, + "grad_norm": 1.5271544085655164, + "language_loss": 0.74288815, + "learning_rate": 4.85802376468869e-09, + "loss": 0.81955302, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09313965, + "step": 16274, + "time_per_iteration": 2.5984392166137695 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01265008, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01255793, + "epoch": 0.9785059371712009, + "flos": 23556983241600.0, + "grad_norm": 1.707553695357098, + "language_loss": 0.7783469, + "learning_rate": 4.830932787773579e-09, + "loss": 0.85501283, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09210205, + "step": 16275, + "time_per_iteration": 2.5418179035186768 + }, + { + "auxiliary_loss_clip": 0.06406128, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253469, + "epoch": 0.978566060423869, + "flos": 34359945465600.0, + "grad_norm": 1.5276794434381622, + "language_loss": 0.71135265, + "learning_rate": 4.803917467869567e-09, + "loss": 0.7880404, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09179688, + "step": 16276, + "time_per_iteration": 2.622631311416626 + }, + { + "auxiliary_loss_clip": 0.06400249, + "auxiliary_loss_mlp": 0.01263911, + "balance_loss_clip": 0.06272748, + "balance_loss_mlp": 0.01255346, + "epoch": 0.9786261836765369, + "flos": 11623546857600.0, + "grad_norm": 3.17667163989501, + "language_loss": 0.85745251, + "learning_rate": 4.776977806000726e-09, + "loss": 0.93409419, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08563232, + "step": 16277, + "time_per_iteration": 2.4804911613464355 + }, + { + "auxiliary_loss_clip": 0.06398945, + "auxiliary_loss_mlp": 0.01262406, + "balance_loss_clip": 0.06271117, + "balance_loss_mlp": 0.01253317, + "epoch": 0.9786863069292049, + "flos": 17426746206720.0, + "grad_norm": 1.7095033174168577, + "language_loss": 0.71152186, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.78813535, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09088135, + "step": 16278, + "time_per_iteration": 2.488579750061035 + }, + { + "auxiliary_loss_clip": 0.06398286, + "auxiliary_loss_mlp": 0.01261989, + "balance_loss_clip": 0.06268737, + "balance_loss_mlp": 0.01252339, + "epoch": 0.9787464301818728, + "flos": 20850341454720.0, + "grad_norm": 1.68580975011962, + "language_loss": 0.84887272, + "learning_rate": 4.723325460453065e-09, + "loss": 0.92547548, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09649658, + "step": 16279, + "time_per_iteration": 2.543829917907715 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06267536, + "balance_loss_mlp": 0.01254275, + "epoch": 0.9788065534345408, + "flos": 18228757161600.0, + "grad_norm": 1.79953849939751, + "language_loss": 0.7903899, + "learning_rate": 4.696612778808395e-09, + "loss": 0.86701441, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09283447, + "step": 16280, + "time_per_iteration": 2.5347559452056885 + }, + { + "auxiliary_loss_clip": 0.06397119, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06271647, + "balance_loss_mlp": 0.01256973, + "epoch": 0.9788666766872087, + "flos": 21584359221120.0, + "grad_norm": 1.5249645071415179, + "language_loss": 0.79882574, + "learning_rate": 4.669975759268085e-09, + "loss": 0.87545323, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08666992, + "step": 16281, + "time_per_iteration": 2.5423452854156494 + }, + { + "auxiliary_loss_clip": 0.06401223, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06269599, + "balance_loss_mlp": 0.01256965, + "epoch": 0.9789267999398767, + "flos": 24907536950400.0, + "grad_norm": 1.5917480809235194, + "language_loss": 0.80182159, + "learning_rate": 4.643414402842216e-09, + "loss": 0.87850136, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09796143, + "step": 16282, + "time_per_iteration": 2.5288219451904297 + }, + { + "auxiliary_loss_clip": 0.06399183, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 0.06268679, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9789869231925448, + "flos": 19579185089280.0, + "grad_norm": 2.068232253290479, + "language_loss": 0.8363508, + "learning_rate": 4.616928710538204e-09, + "loss": 0.91299808, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10137939, + "step": 16283, + "time_per_iteration": 2.475937604904175 + }, + { + "auxiliary_loss_clip": 0.06399857, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 0.06268431, + "balance_loss_mlp": 0.01254245, + "epoch": 0.9790470464452127, + "flos": 16801657148160.0, + "grad_norm": 1.6072240404976843, + "language_loss": 0.72103167, + "learning_rate": 4.590518683360134e-09, + "loss": 0.79766691, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09411621, + "step": 16284, + "time_per_iteration": 2.473494529724121 + }, + { + "auxiliary_loss_clip": 0.06401023, + "auxiliary_loss_mlp": 0.01266157, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01257723, + "epoch": 0.9791071696978807, + "flos": 18375267225600.0, + "grad_norm": 1.7801771621665499, + "language_loss": 0.64641076, + "learning_rate": 4.56418432230965e-09, + "loss": 0.72308254, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08435059, + "step": 16285, + "time_per_iteration": 4.014649391174316 + }, + { + "auxiliary_loss_clip": 0.06402356, + "auxiliary_loss_mlp": 0.01266814, + "balance_loss_clip": 0.06273103, + "balance_loss_mlp": 0.01257664, + "epoch": 0.9791672929505486, + "flos": 24177166836480.0, + "grad_norm": 1.4206006238516855, + "language_loss": 0.70657575, + "learning_rate": 4.537925628385286e-09, + "loss": 0.78326744, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09143066, + "step": 16286, + "time_per_iteration": 2.5511789321899414 + }, + { + "auxiliary_loss_clip": 0.06395744, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06267752, + "balance_loss_mlp": 0.01255583, + "epoch": 0.9792274162032166, + "flos": 24361216329600.0, + "grad_norm": 1.3312898540617772, + "language_loss": 0.58715498, + "learning_rate": 4.511742602582691e-09, + "loss": 0.66376424, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09613037, + "step": 16287, + "time_per_iteration": 2.5384435653686523 + }, + { + "auxiliary_loss_clip": 0.06399453, + "auxiliary_loss_mlp": 0.01262835, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01253811, + "epoch": 0.9792875394558845, + "flos": 26402965568640.0, + "grad_norm": 1.63050988384962, + "language_loss": 0.81943876, + "learning_rate": 4.485635245894626e-09, + "loss": 0.89606166, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09020996, + "step": 16288, + "time_per_iteration": 2.5366978645324707 + }, + { + "auxiliary_loss_clip": 0.06400405, + "auxiliary_loss_mlp": 0.01265614, + "balance_loss_clip": 0.06270171, + "balance_loss_mlp": 0.01255815, + "epoch": 0.9793476627085526, + "flos": 28155635821440.0, + "grad_norm": 1.4168880450273769, + "language_loss": 0.71902084, + "learning_rate": 4.459603559311631e-09, + "loss": 0.795681, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09796143, + "step": 16289, + "time_per_iteration": 2.5798122882843018 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.01262223, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01253199, + "epoch": 0.9794077859612205, + "flos": 16769568234240.0, + "grad_norm": 4.451777244467701, + "language_loss": 0.75933874, + "learning_rate": 4.43364754382003e-09, + "loss": 0.83595598, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09020996, + "step": 16290, + "time_per_iteration": 2.4823756217956543 + }, + { + "auxiliary_loss_clip": 0.06403105, + "auxiliary_loss_mlp": 0.01263116, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01253108, + "epoch": 0.9794679092138885, + "flos": 19286793866880.0, + "grad_norm": 1.4953530744469736, + "language_loss": 0.67339337, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.75005561, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10015869, + "step": 16291, + "time_per_iteration": 2.5128190517425537 + }, + { + "auxiliary_loss_clip": 0.06406611, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01254866, + "epoch": 0.9795280324665564, + "flos": 32164139295360.0, + "grad_norm": 1.837132230394904, + "language_loss": 0.62766051, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.70437813, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10284424, + "step": 16292, + "time_per_iteration": 2.6618642807006836 + }, + { + "auxiliary_loss_clip": 0.06400578, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269545, + "balance_loss_mlp": 0.01256063, + "epoch": 0.9795881557192244, + "flos": 19066714318080.0, + "grad_norm": 1.6297309965936324, + "language_loss": 0.73538578, + "learning_rate": 4.356233533724829e-09, + "loss": 0.8120513, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09912109, + "step": 16293, + "time_per_iteration": 3.9606332778930664 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01262473, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01252799, + "epoch": 0.9796482789718923, + "flos": 28337505108480.0, + "grad_norm": 1.6383548808431236, + "language_loss": 0.84130985, + "learning_rate": 4.330580212414503e-09, + "loss": 0.91797256, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09674072, + "step": 16294, + "time_per_iteration": 2.6089725494384766 + }, + { + "auxiliary_loss_clip": 0.06393368, + "auxiliary_loss_mlp": 0.01267559, + "balance_loss_clip": 0.06268513, + "balance_loss_mlp": 0.01259262, + "epoch": 0.9797084022245603, + "flos": 17973821514240.0, + "grad_norm": 1.8121690447623178, + "language_loss": 0.71849918, + "learning_rate": 4.305002567088767e-09, + "loss": 0.79510844, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08294678, + "step": 16295, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.06407996, + "auxiliary_loss_mlp": 0.01266646, + "balance_loss_clip": 0.06274095, + "balance_loss_mlp": 0.01256567, + "epoch": 0.9797685254772284, + "flos": 20272980096000.0, + "grad_norm": 1.689190760934112, + "language_loss": 0.80868363, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.8854301, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10083008, + "step": 16296, + "time_per_iteration": 2.5353195667266846 + }, + { + "auxiliary_loss_clip": 0.06396893, + "auxiliary_loss_mlp": 0.01263523, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01254499, + "epoch": 0.9798286487298963, + "flos": 26914513944960.0, + "grad_norm": 1.8507340964976773, + "language_loss": 0.75629425, + "learning_rate": 4.254074308266853e-09, + "loss": 0.83289838, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09020996, + "step": 16297, + "time_per_iteration": 2.566253185272217 + }, + { + "auxiliary_loss_clip": 0.06409726, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01256253, + "epoch": 0.9798887719825643, + "flos": 27168233708160.0, + "grad_norm": 1.5228355519225918, + "language_loss": 0.78694081, + "learning_rate": 4.228723696702019e-09, + "loss": 0.86369449, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09399414, + "step": 16298, + "time_per_iteration": 2.635408639907837 + }, + { + "auxiliary_loss_clip": 0.06396599, + "auxiliary_loss_mlp": 0.0126188, + "balance_loss_clip": 0.06269842, + "balance_loss_mlp": 0.01252785, + "epoch": 0.9799488952352322, + "flos": 20674803150720.0, + "grad_norm": 1.6048617132975538, + "language_loss": 0.73147827, + "learning_rate": 4.203448764984019e-09, + "loss": 0.80806303, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09088135, + "step": 16299, + "time_per_iteration": 2.5388383865356445 + }, + { + "auxiliary_loss_clip": 0.06401886, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06268089, + "balance_loss_mlp": 0.01254105, + "epoch": 0.9800090184879002, + "flos": 21987691649280.0, + "grad_norm": 2.0565505689040795, + "language_loss": 0.89151061, + "learning_rate": 4.178249514071419e-09, + "loss": 0.96816331, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0927124, + "step": 16300, + "time_per_iteration": 2.510451316833496 + }, + { + "auxiliary_loss_clip": 0.06408317, + "auxiliary_loss_mlp": 0.01265306, + "balance_loss_clip": 0.06273539, + "balance_loss_mlp": 0.01255155, + "epoch": 0.9800691417405681, + "flos": 21294860964480.0, + "grad_norm": 1.950668796450147, + "language_loss": 0.78290796, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.85964411, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10150146, + "step": 16301, + "time_per_iteration": 3.9293153285980225 + }, + { + "auxiliary_loss_clip": 0.06402375, + "auxiliary_loss_mlp": 0.01266486, + "balance_loss_clip": 0.06270753, + "balance_loss_mlp": 0.01256502, + "epoch": 0.9801292649932362, + "flos": 18445398693120.0, + "grad_norm": 2.786273844322341, + "language_loss": 0.75642586, + "learning_rate": 4.128078058480921e-09, + "loss": 0.83311445, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09985352, + "step": 16302, + "time_per_iteration": 2.5075223445892334 + }, + { + "auxiliary_loss_clip": 0.06404446, + "auxiliary_loss_mlp": 0.0126291, + "balance_loss_clip": 0.06273034, + "balance_loss_mlp": 0.01253045, + "epoch": 0.9801893882459041, + "flos": 25053418108800.0, + "grad_norm": 1.6673066496570457, + "language_loss": 0.79480714, + "learning_rate": 4.103105855705724e-09, + "loss": 0.8714807, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09851074, + "step": 16303, + "time_per_iteration": 2.628279209136963 + }, + { + "auxiliary_loss_clip": 0.06405927, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01253062, + "epoch": 0.9802495114985721, + "flos": 18516787971840.0, + "grad_norm": 1.8702096510195432, + "language_loss": 0.83911574, + "learning_rate": 4.078209337540883e-09, + "loss": 0.91580522, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09967041, + "step": 16304, + "time_per_iteration": 2.5042169094085693 + }, + { + "auxiliary_loss_clip": 0.06394262, + "auxiliary_loss_mlp": 0.01262398, + "balance_loss_clip": 0.06268616, + "balance_loss_mlp": 0.01253875, + "epoch": 0.98030963475124, + "flos": 21476143272960.0, + "grad_norm": 1.8927432348814315, + "language_loss": 0.70325917, + "learning_rate": 4.053388504930089e-09, + "loss": 0.77982581, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08526611, + "step": 16305, + "time_per_iteration": 2.5113353729248047 + }, + { + "auxiliary_loss_clip": 0.06407525, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06273907, + "balance_loss_mlp": 0.01254578, + "epoch": 0.980369758003908, + "flos": 20418483911040.0, + "grad_norm": 1.8068750092854737, + "language_loss": 0.72213495, + "learning_rate": 4.028643358815032e-09, + "loss": 0.79885519, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09918213, + "step": 16306, + "time_per_iteration": 2.5188653469085693 + }, + { + "auxiliary_loss_clip": 0.06395418, + "auxiliary_loss_mlp": 0.0126193, + "balance_loss_clip": 0.06268764, + "balance_loss_mlp": 0.01253502, + "epoch": 0.9804298812565759, + "flos": 23405064589440.0, + "grad_norm": 1.5213209869306519, + "language_loss": 0.73565251, + "learning_rate": 4.00397390013385e-09, + "loss": 0.81222594, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08422852, + "step": 16307, + "time_per_iteration": 3.985133171081543 + }, + { + "auxiliary_loss_clip": 0.06392866, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.06268162, + "balance_loss_mlp": 0.01253899, + "epoch": 0.980490004509244, + "flos": 23299028847360.0, + "grad_norm": 1.3292657797175953, + "language_loss": 0.7521047, + "learning_rate": 3.979380129822018e-09, + "loss": 0.82865375, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08135986, + "step": 16308, + "time_per_iteration": 2.545912265777588 + }, + { + "auxiliary_loss_clip": 0.06303553, + "auxiliary_loss_mlp": 0.01251644, + "balance_loss_clip": 0.06249667, + "balance_loss_mlp": 0.01250615, + "epoch": 0.980550127761912, + "flos": 56067991712640.0, + "grad_norm": 0.8050036087826854, + "language_loss": 0.57682216, + "learning_rate": 3.954862048811902e-09, + "loss": 0.65237415, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01029968, + "step": 16309, + "time_per_iteration": 2.9991166591644287 + }, + { + "auxiliary_loss_clip": 0.06399623, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06267844, + "balance_loss_mlp": 0.01256194, + "epoch": 0.9806102510145799, + "flos": 25339562202240.0, + "grad_norm": 1.6272757671722682, + "language_loss": 0.66520619, + "learning_rate": 3.930419658033646e-09, + "loss": 0.7418564, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09191895, + "step": 16310, + "time_per_iteration": 2.5256764888763428 + }, + { + "auxiliary_loss_clip": 0.06307549, + "auxiliary_loss_mlp": 0.01249123, + "balance_loss_clip": 0.06253639, + "balance_loss_mlp": 0.01248124, + "epoch": 0.9806703742672479, + "flos": 67297472017920.0, + "grad_norm": 0.8343331868495012, + "language_loss": 0.54504246, + "learning_rate": 3.906052958413841e-09, + "loss": 0.62060916, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00998688, + "step": 16311, + "time_per_iteration": 3.235471248626709 + }, + { + "auxiliary_loss_clip": 0.06400076, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06269625, + "balance_loss_mlp": 0.01253004, + "epoch": 0.9807304975199158, + "flos": 25236084009600.0, + "grad_norm": 1.569735113945606, + "language_loss": 0.79639947, + "learning_rate": 3.881761950876638e-09, + "loss": 0.87302244, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09216309, + "step": 16312, + "time_per_iteration": 2.5837817192077637 + }, + { + "auxiliary_loss_clip": 0.06399613, + "auxiliary_loss_mlp": 0.01263333, + "balance_loss_clip": 0.06270465, + "balance_loss_mlp": 0.01255012, + "epoch": 0.9807906207725838, + "flos": 17462021575680.0, + "grad_norm": 1.8369595786658577, + "language_loss": 0.6327976, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.70942706, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08325195, + "step": 16313, + "time_per_iteration": 2.485778570175171 + }, + { + "auxiliary_loss_clip": 0.06398313, + "auxiliary_loss_mlp": 0.01263511, + "balance_loss_clip": 0.06268698, + "balance_loss_mlp": 0.0125407, + "epoch": 0.9808507440252517, + "flos": 21038709432960.0, + "grad_norm": 1.8257284943536598, + "language_loss": 0.72914076, + "learning_rate": 3.833407015731316e-09, + "loss": 0.80575901, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09442139, + "step": 16314, + "time_per_iteration": 2.522977590560913 + }, + { + "auxiliary_loss_clip": 0.06308355, + "auxiliary_loss_mlp": 0.01248498, + "balance_loss_clip": 0.06254214, + "balance_loss_mlp": 0.01247535, + "epoch": 0.9809108672779198, + "flos": 64063307652480.0, + "grad_norm": 0.6727311068794228, + "language_loss": 0.51654154, + "learning_rate": 3.80934308995684e-09, + "loss": 0.5921101, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00961304, + "step": 16315, + "time_per_iteration": 3.1521832942962646 + }, + { + "auxiliary_loss_clip": 0.06402422, + "auxiliary_loss_mlp": 0.01263871, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01255019, + "epoch": 0.9809709905305877, + "flos": 22786683857280.0, + "grad_norm": 1.2900137630224915, + "language_loss": 0.69874811, + "learning_rate": 3.785354859932033e-09, + "loss": 0.77541101, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08843994, + "step": 16316, + "time_per_iteration": 2.5589540004730225 + }, + { + "auxiliary_loss_clip": 0.06403589, + "auxiliary_loss_mlp": 0.01263012, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.01254274, + "epoch": 0.9810311137832557, + "flos": 37022423351040.0, + "grad_norm": 1.7188947170249258, + "language_loss": 0.55401117, + "learning_rate": 3.76144232656661e-09, + "loss": 0.6306771, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08734131, + "step": 16317, + "time_per_iteration": 2.6282479763031006 + }, + { + "auxiliary_loss_clip": 0.06401145, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06272769, + "balance_loss_mlp": 0.01260905, + "epoch": 0.9810912370359236, + "flos": 18922258679040.0, + "grad_norm": 1.5547999119596, + "language_loss": 0.73396158, + "learning_rate": 3.737605490767404e-09, + "loss": 0.81067568, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09350586, + "step": 16318, + "time_per_iteration": 2.5051674842834473 + }, + { + "auxiliary_loss_clip": 0.06401484, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06274587, + "balance_loss_mlp": 0.01256397, + "epoch": 0.9811513602885916, + "flos": 18447411191040.0, + "grad_norm": 1.9997801159399626, + "language_loss": 0.82393742, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.90060955, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09332275, + "step": 16319, + "time_per_iteration": 2.4961817264556885 + }, + { + "auxiliary_loss_clip": 0.06305759, + "auxiliary_loss_mlp": 0.01249631, + "balance_loss_clip": 0.06251506, + "balance_loss_mlp": 0.01248486, + "epoch": 0.9812114835412595, + "flos": 68078603306880.0, + "grad_norm": 0.7284136479958665, + "language_loss": 0.53509539, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.61064935, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01144409, + "step": 16320, + "time_per_iteration": 3.0633654594421387 + }, + { + "auxiliary_loss_clip": 0.06399468, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.0626857, + "balance_loss_mlp": 0.01255969, + "epoch": 0.9812716067939276, + "flos": 25379826888960.0, + "grad_norm": 1.768638435944613, + "language_loss": 0.73735636, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.81400257, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09185791, + "step": 16321, + "time_per_iteration": 2.566575527191162 + }, + { + "auxiliary_loss_clip": 0.06396846, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254444, + "epoch": 0.9813317300465956, + "flos": 22863439797120.0, + "grad_norm": 1.5226563597520282, + "language_loss": 0.79048485, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.86708951, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09173584, + "step": 16322, + "time_per_iteration": 2.549025535583496 + }, + { + "auxiliary_loss_clip": 0.06402303, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01257437, + "epoch": 0.9813918532992635, + "flos": 23593767984000.0, + "grad_norm": 1.4992713181150594, + "language_loss": 0.80698186, + "learning_rate": 3.619556806799595e-09, + "loss": 0.88367027, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09100342, + "step": 16323, + "time_per_iteration": 2.542644739151001 + }, + { + "auxiliary_loss_clip": 0.06404383, + "auxiliary_loss_mlp": 0.01265912, + "balance_loss_clip": 0.06270544, + "balance_loss_mlp": 0.0125637, + "epoch": 0.9814519765519315, + "flos": 19611860981760.0, + "grad_norm": 1.9971610080835347, + "language_loss": 0.84855533, + "learning_rate": 3.596174175278799e-09, + "loss": 0.92525834, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09552002, + "step": 16324, + "time_per_iteration": 3.8597731590270996 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06270885, + "balance_loss_mlp": 0.01254921, + "epoch": 0.9815120998045994, + "flos": 33954390904320.0, + "grad_norm": 1.403861865593316, + "language_loss": 0.74789631, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.82455075, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09216309, + "step": 16325, + "time_per_iteration": 2.6116855144500732 + }, + { + "auxiliary_loss_clip": 0.06395521, + "auxiliary_loss_mlp": 0.01263546, + "balance_loss_clip": 0.0626988, + "balance_loss_mlp": 0.01254653, + "epoch": 0.9815722230572674, + "flos": 20856295094400.0, + "grad_norm": 1.6689629805802655, + "language_loss": 0.76699644, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.8435871, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08892822, + "step": 16326, + "time_per_iteration": 2.4666171073913574 + }, + { + "auxiliary_loss_clip": 0.06404449, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06272893, + "balance_loss_mlp": 0.01254698, + "epoch": 0.9816323463099353, + "flos": 22901356569600.0, + "grad_norm": 1.675938455983492, + "language_loss": 0.67456639, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.75125796, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10003662, + "step": 16327, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.06408646, + "auxiliary_loss_mlp": 0.01265428, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01255361, + "epoch": 0.9816924695626034, + "flos": 31547351790720.0, + "grad_norm": 1.412267833838127, + "language_loss": 0.73885894, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.81559968, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10070801, + "step": 16328, + "time_per_iteration": 2.5715060234069824 + }, + { + "auxiliary_loss_clip": 0.06412687, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.0627219, + "balance_loss_mlp": 0.01257988, + "epoch": 0.9817525928152713, + "flos": 21513305358720.0, + "grad_norm": 1.842030012246621, + "language_loss": 0.81347609, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.89028519, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10235596, + "step": 16329, + "time_per_iteration": 2.5269720554351807 + }, + { + "auxiliary_loss_clip": 0.06407592, + "auxiliary_loss_mlp": 0.01266637, + "balance_loss_clip": 0.06271036, + "balance_loss_mlp": 0.0125626, + "epoch": 0.9818127160679393, + "flos": 25556539150080.0, + "grad_norm": 1.7679142745259233, + "language_loss": 0.76537704, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.8421194, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10375977, + "step": 16330, + "time_per_iteration": 2.5097715854644775 + }, + { + "auxiliary_loss_clip": 0.06417432, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06274364, + "balance_loss_mlp": 0.01254118, + "epoch": 0.9818728393206072, + "flos": 28811220566400.0, + "grad_norm": 3.7665228770401518, + "language_loss": 0.66577238, + "learning_rate": 3.434615511252126e-09, + "loss": 0.74260449, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11645508, + "step": 16331, + "time_per_iteration": 2.594588041305542 + }, + { + "auxiliary_loss_clip": 0.0640003, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06271006, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9819329625732752, + "flos": 23229023160960.0, + "grad_norm": 1.8265104369584833, + "language_loss": 0.73624349, + "learning_rate": 3.411838534981948e-09, + "loss": 0.81288654, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08862305, + "step": 16332, + "time_per_iteration": 3.9030022621154785 + }, + { + "auxiliary_loss_clip": 0.06402284, + "auxiliary_loss_mlp": 0.01265638, + "balance_loss_clip": 0.06271557, + "balance_loss_mlp": 0.01256882, + "epoch": 0.9819930858259431, + "flos": 17536261893120.0, + "grad_norm": 1.7201228746182549, + "language_loss": 0.76839882, + "learning_rate": 3.389137269534936e-09, + "loss": 0.84507805, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08752441, + "step": 16333, + "time_per_iteration": 2.5092711448669434 + }, + { + "auxiliary_loss_clip": 0.06401891, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01254305, + "epoch": 0.9820532090786112, + "flos": 12534570374400.0, + "grad_norm": 1.9033760890389273, + "language_loss": 0.73437434, + "learning_rate": 3.366511715771958e-09, + "loss": 0.81102788, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09161377, + "step": 16334, + "time_per_iteration": 2.4836056232452393 + }, + { + "auxiliary_loss_clip": 0.06403394, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06271391, + "balance_loss_mlp": 0.01255435, + "epoch": 0.9821133323312792, + "flos": 18845586593280.0, + "grad_norm": 1.7621498824539008, + "language_loss": 0.78639001, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.86307669, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09838867, + "step": 16335, + "time_per_iteration": 2.494976043701172 + }, + { + "auxiliary_loss_clip": 0.06405871, + "auxiliary_loss_mlp": 0.01267908, + "balance_loss_clip": 0.0627166, + "balance_loss_mlp": 0.01257406, + "epoch": 0.9821734555839471, + "flos": 34832612747520.0, + "grad_norm": 2.0448132834813593, + "language_loss": 0.6420033, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.71874112, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1050415, + "step": 16336, + "time_per_iteration": 2.605154514312744 + }, + { + "auxiliary_loss_clip": 0.06409524, + "auxiliary_loss_mlp": 0.01263456, + "balance_loss_clip": 0.0627144, + "balance_loss_mlp": 0.0125327, + "epoch": 0.9822335788366151, + "flos": 17133768005760.0, + "grad_norm": 2.4121534627506183, + "language_loss": 0.73113394, + "learning_rate": 3.299089333152372e-09, + "loss": 0.80786371, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10186768, + "step": 16337, + "time_per_iteration": 2.492018222808838 + }, + { + "auxiliary_loss_clip": 0.06404828, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01256196, + "epoch": 0.982293702089283, + "flos": 20819468424960.0, + "grad_norm": 1.6227440209505577, + "language_loss": 0.73327523, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.80998421, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09881592, + "step": 16338, + "time_per_iteration": 2.511469602584839 + }, + { + "auxiliary_loss_clip": 0.06401011, + "auxiliary_loss_mlp": 0.01264448, + "balance_loss_clip": 0.06268863, + "balance_loss_mlp": 0.01255096, + "epoch": 0.982353825341951, + "flos": 24687708963840.0, + "grad_norm": 1.5242122575774386, + "language_loss": 0.81808567, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.89474022, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09350586, + "step": 16339, + "time_per_iteration": 2.5525383949279785 + }, + { + "auxiliary_loss_clip": 0.06396718, + "auxiliary_loss_mlp": 0.01263936, + "balance_loss_clip": 0.06269798, + "balance_loss_mlp": 0.01254888, + "epoch": 0.982413948594619, + "flos": 20856840145920.0, + "grad_norm": 7.946616687424166, + "language_loss": 0.63168478, + "learning_rate": 3.232348386403405e-09, + "loss": 0.70829129, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09051514, + "step": 16340, + "time_per_iteration": 3.950870990753174 + }, + { + "auxiliary_loss_clip": 0.06404588, + "auxiliary_loss_mlp": 0.01262603, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.0125318, + "epoch": 0.982474071847287, + "flos": 15382774834560.0, + "grad_norm": 2.1427722252854626, + "language_loss": 0.85878891, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.93546081, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09417725, + "step": 16341, + "time_per_iteration": 2.4532127380371094 + }, + { + "auxiliary_loss_clip": 0.06396417, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06270136, + "balance_loss_mlp": 0.01258371, + "epoch": 0.9825341950999549, + "flos": 23782471378560.0, + "grad_norm": 1.3471902958727353, + "language_loss": 0.67131615, + "learning_rate": 3.188233008645014e-09, + "loss": 0.74795365, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08959961, + "step": 16342, + "time_per_iteration": 2.611873149871826 + }, + { + "auxiliary_loss_clip": 0.06402282, + "auxiliary_loss_mlp": 0.01265067, + "balance_loss_clip": 0.0626906, + "balance_loss_mlp": 0.01256055, + "epoch": 0.9825943183526229, + "flos": 22752708226560.0, + "grad_norm": 1.4818959973540065, + "language_loss": 0.77696526, + "learning_rate": 3.16628889830195e-09, + "loss": 0.85363877, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09008789, + "step": 16343, + "time_per_iteration": 2.5588226318359375 + }, + { + "auxiliary_loss_clip": 0.06398541, + "auxiliary_loss_mlp": 0.01262034, + "balance_loss_clip": 0.06269187, + "balance_loss_mlp": 0.01253642, + "epoch": 0.9826544416052908, + "flos": 27717489221760.0, + "grad_norm": 1.4769865198658847, + "language_loss": 0.75333172, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.82993752, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08392334, + "step": 16344, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06400666, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.0626943, + "balance_loss_mlp": 0.01253376, + "epoch": 0.9827145648579588, + "flos": 26948699210880.0, + "grad_norm": 1.922930318885977, + "language_loss": 0.67135489, + "learning_rate": 3.122627838848313e-09, + "loss": 0.74799621, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10107422, + "step": 16345, + "time_per_iteration": 2.533918857574463 + }, + { + "auxiliary_loss_clip": 0.06396809, + "auxiliary_loss_mlp": 0.01261827, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01253537, + "epoch": 0.9827746881106267, + "flos": 21872138469120.0, + "grad_norm": 1.3537926665164286, + "language_loss": 0.79563165, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.87221801, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08288574, + "step": 16346, + "time_per_iteration": 3.958854913711548 + }, + { + "auxiliary_loss_clip": 0.06413849, + "auxiliary_loss_mlp": 0.0126616, + "balance_loss_clip": 0.06275063, + "balance_loss_mlp": 0.01256176, + "epoch": 0.9828348113632948, + "flos": 20857175562240.0, + "grad_norm": 1.985745822904642, + "language_loss": 0.75521713, + "learning_rate": 3.079269666552031e-09, + "loss": 0.83201724, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09991455, + "step": 16347, + "time_per_iteration": 2.535245656967163 + }, + { + "auxiliary_loss_clip": 0.06396177, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01254695, + "epoch": 0.9828949346159628, + "flos": 34577886735360.0, + "grad_norm": 2.740882984240411, + "language_loss": 0.6695146, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.74610847, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08514404, + "step": 16348, + "time_per_iteration": 2.6063122749328613 + }, + { + "auxiliary_loss_clip": 0.06400393, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.01254721, + "epoch": 0.9829550578686307, + "flos": 24463562492160.0, + "grad_norm": 1.859593683804768, + "language_loss": 0.69546545, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.7721175, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.10095215, + "step": 16349, + "time_per_iteration": 2.538785934448242 + }, + { + "auxiliary_loss_clip": 0.06395674, + "auxiliary_loss_mlp": 0.01264209, + "balance_loss_clip": 0.06270944, + "balance_loss_mlp": 0.01255292, + "epoch": 0.9830151811212987, + "flos": 16915784808960.0, + "grad_norm": 1.6752687286579624, + "language_loss": 0.75853312, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.835132, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08905029, + "step": 16350, + "time_per_iteration": 2.46547269821167 + }, + { + "auxiliary_loss_clip": 0.06401215, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06269281, + "balance_loss_mlp": 0.01256854, + "epoch": 0.9830753043739666, + "flos": 21294735183360.0, + "grad_norm": 1.9963831633917941, + "language_loss": 0.84258103, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.91925597, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09429932, + "step": 16351, + "time_per_iteration": 2.572175979614258 + }, + { + "auxiliary_loss_clip": 0.06398397, + "auxiliary_loss_mlp": 0.01260592, + "balance_loss_clip": 0.06267038, + "balance_loss_mlp": 0.01251526, + "epoch": 0.9831354276266346, + "flos": 31731736700160.0, + "grad_norm": 1.4942934246036372, + "language_loss": 0.6857751, + "learning_rate": 2.972199410170795e-09, + "loss": 0.76236498, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09069824, + "step": 16352, + "time_per_iteration": 2.5960402488708496 + }, + { + "auxiliary_loss_clip": 0.06403258, + "auxiliary_loss_mlp": 0.01261007, + "balance_loss_clip": 0.06273116, + "balance_loss_mlp": 0.01252054, + "epoch": 0.9831955508793025, + "flos": 21625923646080.0, + "grad_norm": 1.3954339265151765, + "language_loss": 0.6703254, + "learning_rate": 2.951012538143782e-09, + "loss": 0.74696803, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08944702, + "step": 16353, + "time_per_iteration": 2.5353140830993652 + }, + { + "auxiliary_loss_clip": 0.06395429, + "auxiliary_loss_mlp": 0.01264292, + "balance_loss_clip": 0.06268495, + "balance_loss_mlp": 0.0125559, + "epoch": 0.9832556741319706, + "flos": 22975177616640.0, + "grad_norm": 1.6379749253440405, + "language_loss": 0.74751425, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.82411146, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08703613, + "step": 16354, + "time_per_iteration": 2.508065700531006 + }, + { + "auxiliary_loss_clip": 0.06398819, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.0125468, + "epoch": 0.9833157973846385, + "flos": 21330178260480.0, + "grad_norm": 2.1076687660597644, + "language_loss": 0.77908456, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.85571158, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09216309, + "step": 16355, + "time_per_iteration": 2.508748769760132 + }, + { + "auxiliary_loss_clip": 0.06398673, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 0.06269018, + "balance_loss_mlp": 0.01255743, + "epoch": 0.9833759206373065, + "flos": 21074991050880.0, + "grad_norm": 1.7510865399500044, + "language_loss": 0.73771065, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.81434226, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08746338, + "step": 16356, + "time_per_iteration": 2.4964609146118164 + }, + { + "auxiliary_loss_clip": 0.06397355, + "auxiliary_loss_mlp": 0.01264905, + "balance_loss_clip": 0.06268449, + "balance_loss_mlp": 0.01256, + "epoch": 0.9834360438899744, + "flos": 18703227306240.0, + "grad_norm": 1.7922829245383989, + "language_loss": 0.76294625, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.83956885, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08911133, + "step": 16357, + "time_per_iteration": 2.508512258529663 + }, + { + "auxiliary_loss_clip": 0.0640227, + "auxiliary_loss_mlp": 0.01263006, + "balance_loss_clip": 0.06272359, + "balance_loss_mlp": 0.01253743, + "epoch": 0.9834961671426424, + "flos": 21111524231040.0, + "grad_norm": 2.0027677805382953, + "language_loss": 0.80176306, + "learning_rate": 2.846214118442436e-09, + "loss": 0.87841582, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09259033, + "step": 16358, + "time_per_iteration": 2.4842851161956787 + }, + { + "auxiliary_loss_clip": 0.06400406, + "auxiliary_loss_mlp": 0.01262987, + "balance_loss_clip": 0.06269883, + "balance_loss_mlp": 0.01254094, + "epoch": 0.9835562903953103, + "flos": 26694853666560.0, + "grad_norm": 2.106405637853541, + "language_loss": 0.67995811, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.75659204, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08886719, + "step": 16359, + "time_per_iteration": 2.543684720993042 + }, + { + "auxiliary_loss_clip": 0.06396379, + "auxiliary_loss_mlp": 0.01260995, + "balance_loss_clip": 0.06268568, + "balance_loss_mlp": 0.01252264, + "epoch": 0.9836164136479784, + "flos": 22096578430080.0, + "grad_norm": 1.5676577636482238, + "language_loss": 0.69622505, + "learning_rate": 2.804824870920264e-09, + "loss": 0.77279884, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08728027, + "step": 16360, + "time_per_iteration": 2.5693228244781494 + }, + { + "auxiliary_loss_clip": 0.06402056, + "auxiliary_loss_mlp": 0.01263576, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.0125389, + "epoch": 0.9836765369006463, + "flos": 23885194884480.0, + "grad_norm": 1.682194458725563, + "language_loss": 0.8439554, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.92061168, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09674072, + "step": 16361, + "time_per_iteration": 2.560330390930176 + }, + { + "auxiliary_loss_clip": 0.06402538, + "auxiliary_loss_mlp": 0.01263822, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01255251, + "epoch": 0.9837366601533143, + "flos": 25851529848960.0, + "grad_norm": 1.6385001954034184, + "language_loss": 0.7628051, + "learning_rate": 2.76373855876022e-09, + "loss": 0.83946872, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08575439, + "step": 16362, + "time_per_iteration": 2.5176074504852295 + }, + { + "auxiliary_loss_clip": 0.06398503, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.0125685, + "epoch": 0.9837967834059823, + "flos": 21363902328960.0, + "grad_norm": 1.5985135435768925, + "language_loss": 0.71467978, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.79133034, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.0970459, + "step": 16363, + "time_per_iteration": 3.905139684677124 + }, + { + "auxiliary_loss_clip": 0.06394857, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01256219, + "epoch": 0.9838569066586502, + "flos": 18521819216640.0, + "grad_norm": 1.6859812607317168, + "language_loss": 0.63076383, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.70735937, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08477783, + "step": 16364, + "time_per_iteration": 2.5008041858673096 + }, + { + "auxiliary_loss_clip": 0.06401549, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01253793, + "epoch": 0.9839170299113182, + "flos": 22458430287360.0, + "grad_norm": 1.9025940336475926, + "language_loss": 0.75345969, + "learning_rate": 2.702677107943252e-09, + "loss": 0.83009791, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0847168, + "step": 16365, + "time_per_iteration": 2.552847146987915 + }, + { + "auxiliary_loss_clip": 0.06399475, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.0627087, + "balance_loss_mlp": 0.01255554, + "epoch": 0.9839771531639862, + "flos": 27899861633280.0, + "grad_norm": 1.6224580462196883, + "language_loss": 0.76744139, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.84407938, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08776855, + "step": 16366, + "time_per_iteration": 2.5814366340637207 + }, + { + "auxiliary_loss_clip": 0.06397621, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06269, + "balance_loss_mlp": 0.01255752, + "epoch": 0.9840372764166542, + "flos": 28221071460480.0, + "grad_norm": 1.5360929282556393, + "language_loss": 0.77089232, + "learning_rate": 2.662348161352357e-09, + "loss": 0.84751683, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09075928, + "step": 16367, + "time_per_iteration": 2.548718214035034 + }, + { + "auxiliary_loss_clip": 0.06398439, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06268852, + "balance_loss_mlp": 0.01254933, + "epoch": 0.9840973996693221, + "flos": 23410682812800.0, + "grad_norm": 1.451840758159792, + "language_loss": 0.61724389, + "learning_rate": 2.642297296540974e-09, + "loss": 0.69386816, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09057617, + "step": 16368, + "time_per_iteration": 2.532034158706665 + }, + { + "auxiliary_loss_clip": 0.06396456, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06270956, + "balance_loss_mlp": 0.01258698, + "epoch": 0.9841575229219901, + "flos": 21401986809600.0, + "grad_norm": 1.5041768156140347, + "language_loss": 0.6552428, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.73187768, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08343506, + "step": 16369, + "time_per_iteration": 2.520037889480591 + }, + { + "auxiliary_loss_clip": 0.06402774, + "auxiliary_loss_mlp": 0.0126442, + "balance_loss_clip": 0.06270762, + "balance_loss_mlp": 0.01254126, + "epoch": 0.984217646174658, + "flos": 24471277067520.0, + "grad_norm": 1.753415617022144, + "language_loss": 0.68846416, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.76513612, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10296631, + "step": 16370, + "time_per_iteration": 2.540699005126953 + }, + { + "auxiliary_loss_clip": 0.0640409, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06269194, + "balance_loss_mlp": 0.01256246, + "epoch": 0.984277769427326, + "flos": 16440559977600.0, + "grad_norm": 2.157249724896927, + "language_loss": 0.73935145, + "learning_rate": 2.582599145159792e-09, + "loss": 0.8160584, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10357666, + "step": 16371, + "time_per_iteration": 2.454529047012329 + }, + { + "auxiliary_loss_clip": 0.06309754, + "auxiliary_loss_mlp": 0.01249704, + "balance_loss_clip": 0.06255664, + "balance_loss_mlp": 0.01248747, + "epoch": 0.9843378926799939, + "flos": 64551487939200.0, + "grad_norm": 0.7685676506536336, + "language_loss": 0.64979422, + "learning_rate": 2.562851244898745e-09, + "loss": 0.72538882, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00955963, + "step": 16372, + "time_per_iteration": 4.500819206237793 + }, + { + "auxiliary_loss_clip": 0.0639531, + "auxiliary_loss_mlp": 0.0126257, + "balance_loss_clip": 0.0626704, + "balance_loss_mlp": 0.01253796, + "epoch": 0.984398015932662, + "flos": 17388326309760.0, + "grad_norm": 1.6860490980606475, + "language_loss": 0.71169502, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.78827381, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.087677, + "step": 16373, + "time_per_iteration": 2.457385301589966 + }, + { + "auxiliary_loss_clip": 0.06398892, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01256857, + "epoch": 0.9844581391853299, + "flos": 23885991498240.0, + "grad_norm": 1.5447539198468738, + "language_loss": 0.81465459, + "learning_rate": 2.523582674173186e-09, + "loss": 0.89130032, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08825684, + "step": 16374, + "time_per_iteration": 2.5521185398101807 + }, + { + "auxiliary_loss_clip": 0.06403422, + "auxiliary_loss_mlp": 0.01265136, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01256016, + "epoch": 0.9845182624379979, + "flos": 19871534384640.0, + "grad_norm": 1.7001768463921554, + "language_loss": 0.69477171, + "learning_rate": 2.504062005197927e-09, + "loss": 0.77145725, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09112549, + "step": 16375, + "time_per_iteration": 2.4824092388153076 + }, + { + "auxiliary_loss_clip": 0.06405924, + "auxiliary_loss_mlp": 0.01263771, + "balance_loss_clip": 0.06271198, + "balance_loss_mlp": 0.01254246, + "epoch": 0.9845783856906659, + "flos": 28261839271680.0, + "grad_norm": 1.9798268500878542, + "language_loss": 0.80762142, + "learning_rate": 2.484617081468521e-09, + "loss": 0.88431835, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09515381, + "step": 16376, + "time_per_iteration": 2.564424753189087 + }, + { + "auxiliary_loss_clip": 0.06399219, + "auxiliary_loss_mlp": 0.01263402, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9846385089433338, + "flos": 28335702245760.0, + "grad_norm": 1.4082081602945489, + "language_loss": 0.62552863, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.70215487, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09429932, + "step": 16377, + "time_per_iteration": 2.58390212059021 + }, + { + "auxiliary_loss_clip": 0.06403971, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01256588, + "epoch": 0.9846986321960018, + "flos": 24323718827520.0, + "grad_norm": 1.541732057428472, + "language_loss": 0.73141658, + "learning_rate": 2.445954472695133e-09, + "loss": 0.80811405, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09173584, + "step": 16378, + "time_per_iteration": 2.5272939205169678 + }, + { + "auxiliary_loss_clip": 0.06401136, + "auxiliary_loss_mlp": 0.01265891, + "balance_loss_clip": 0.06269161, + "balance_loss_mlp": 0.01256461, + "epoch": 0.9847587554486698, + "flos": 27279426476160.0, + "grad_norm": 1.6453729293875299, + "language_loss": 0.71287769, + "learning_rate": 2.426736789116868e-09, + "loss": 0.78954792, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09429932, + "step": 16379, + "time_per_iteration": 3.9946951866149902 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01264316, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01254589, + "epoch": 0.9848188787013378, + "flos": 16547937384960.0, + "grad_norm": 1.675981927204607, + "language_loss": 0.68351865, + "learning_rate": 2.407594853716999e-09, + "loss": 0.76019073, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09719849, + "step": 16380, + "time_per_iteration": 2.525541305541992 + }, + { + "auxiliary_loss_clip": 0.06406681, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01255358, + "epoch": 0.9848790019540057, + "flos": 20199871808640.0, + "grad_norm": 1.8917613358360588, + "language_loss": 0.78484976, + "learning_rate": 2.38852866722139e-09, + "loss": 0.86156762, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09741211, + "step": 16381, + "time_per_iteration": 2.5284276008605957 + }, + { + "auxiliary_loss_clip": 0.06401529, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 0.06269079, + "balance_loss_mlp": 0.01254101, + "epoch": 0.9849391252066737, + "flos": 28267750984320.0, + "grad_norm": 1.3772384607089387, + "language_loss": 0.82476425, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.90141863, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0980835, + "step": 16382, + "time_per_iteration": 2.5919766426086426 + }, + { + "auxiliary_loss_clip": 0.06408627, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06271975, + "balance_loss_mlp": 0.0125492, + "epoch": 0.9849992484593416, + "flos": 22461407107200.0, + "grad_norm": 1.6817529475209232, + "language_loss": 0.74892008, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.82564998, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09436035, + "step": 16383, + "time_per_iteration": 2.514427900314331 + }, + { + "auxiliary_loss_clip": 0.06402783, + "auxiliary_loss_mlp": 0.01265978, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.01256644, + "epoch": 0.9850593717120096, + "flos": 34505994332160.0, + "grad_norm": 1.417993131097162, + "language_loss": 0.66312635, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.73981392, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09332275, + "step": 16384, + "time_per_iteration": 2.6268670558929443 + }, + { + "auxiliary_loss_clip": 0.06408171, + "auxiliary_loss_mlp": 0.01266699, + "balance_loss_clip": 0.0627324, + "balance_loss_mlp": 0.01256679, + "epoch": 0.9851194949646775, + "flos": 38846524809600.0, + "grad_norm": 2.0407585132753474, + "language_loss": 0.70862484, + "learning_rate": 2.313021424697359e-09, + "loss": 0.78537351, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10021973, + "step": 16385, + "time_per_iteration": 2.644968032836914 + }, + { + "auxiliary_loss_clip": 0.06406495, + "auxiliary_loss_mlp": 0.01267976, + "balance_loss_clip": 0.06273443, + "balance_loss_mlp": 0.01258511, + "epoch": 0.9851796182173456, + "flos": 17718215034240.0, + "grad_norm": 1.8403638705762766, + "language_loss": 0.81630373, + "learning_rate": 2.294333993509978e-09, + "loss": 0.89304841, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09460449, + "step": 16386, + "time_per_iteration": 3.917997360229492 + }, + { + "auxiliary_loss_clip": 0.06405159, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01255127, + "epoch": 0.9852397414700135, + "flos": 27461756960640.0, + "grad_norm": 1.9733443741817431, + "language_loss": 0.67915004, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.75585318, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10040283, + "step": 16387, + "time_per_iteration": 2.5964622497558594 + }, + { + "auxiliary_loss_clip": 0.06393988, + "auxiliary_loss_mlp": 0.01264067, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01255752, + "epoch": 0.9852998647226815, + "flos": 18302662062720.0, + "grad_norm": 1.6277320463659288, + "language_loss": 0.74601555, + "learning_rate": 2.257186391438237e-09, + "loss": 0.82259607, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08312988, + "step": 16388, + "time_per_iteration": 2.5200042724609375 + }, + { + "auxiliary_loss_clip": 0.06399764, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06269439, + "balance_loss_mlp": 0.01257291, + "epoch": 0.9853599879753495, + "flos": 19648058745600.0, + "grad_norm": 1.5789948007972028, + "language_loss": 0.82318109, + "learning_rate": 2.238726221962528e-09, + "loss": 0.89984477, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09313965, + "step": 16389, + "time_per_iteration": 2.5028319358825684 + }, + { + "auxiliary_loss_clip": 0.06399673, + "auxiliary_loss_mlp": 0.012661, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01257118, + "epoch": 0.9854201112280174, + "flos": 23848745558400.0, + "grad_norm": 1.9542914856542009, + "language_loss": 0.67416507, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.75082278, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08978271, + "step": 16390, + "time_per_iteration": 2.524301052093506 + }, + { + "auxiliary_loss_clip": 0.06404354, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.0627258, + "balance_loss_mlp": 0.01253514, + "epoch": 0.9854802344806854, + "flos": 30088330571520.0, + "grad_norm": 1.5456986712452574, + "language_loss": 0.77386737, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.85054678, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10076904, + "step": 16391, + "time_per_iteration": 2.5559659004211426 + }, + { + "auxiliary_loss_clip": 0.06395002, + "auxiliary_loss_mlp": 0.01266032, + "balance_loss_clip": 0.06271442, + "balance_loss_mlp": 0.01257074, + "epoch": 0.9855403577333534, + "flos": 21913744821120.0, + "grad_norm": 1.673902135646454, + "language_loss": 0.68136293, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.75797331, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.08966064, + "step": 16392, + "time_per_iteration": 2.5317015647888184 + }, + { + "auxiliary_loss_clip": 0.06408426, + "auxiliary_loss_mlp": 0.01263266, + "balance_loss_clip": 0.06270889, + "balance_loss_mlp": 0.01252859, + "epoch": 0.9856004809860214, + "flos": 15419182233600.0, + "grad_norm": 1.7111314079552304, + "language_loss": 0.56011736, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.63683426, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10400391, + "step": 16393, + "time_per_iteration": 2.476515293121338 + }, + { + "auxiliary_loss_clip": 0.06411494, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06273687, + "balance_loss_mlp": 0.01255861, + "epoch": 0.9856606042386893, + "flos": 13656742980480.0, + "grad_norm": 3.1315340219077794, + "language_loss": 0.79706287, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.87383127, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.09484863, + "step": 16394, + "time_per_iteration": 2.4626893997192383 + }, + { + "auxiliary_loss_clip": 0.06404269, + "auxiliary_loss_mlp": 0.01262883, + "balance_loss_clip": 0.06270118, + "balance_loss_mlp": 0.01253221, + "epoch": 0.9857207274913573, + "flos": 23486222868480.0, + "grad_norm": 1.467210916610673, + "language_loss": 0.76540744, + "learning_rate": 2.129556090869178e-09, + "loss": 0.84207892, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09667969, + "step": 16395, + "time_per_iteration": 2.525965452194214 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.06270809, + "balance_loss_mlp": 0.01254554, + "epoch": 0.9857808507440252, + "flos": 21071217617280.0, + "grad_norm": 1.8150456310357506, + "language_loss": 0.75683588, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.83347464, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08813477, + "step": 16396, + "time_per_iteration": 2.567934513092041 + }, + { + "auxiliary_loss_clip": 0.06400032, + "auxiliary_loss_mlp": 0.01263122, + "balance_loss_clip": 0.06269535, + "balance_loss_mlp": 0.01254121, + "epoch": 0.9858409739966932, + "flos": 25308395683200.0, + "grad_norm": 1.3662476334903952, + "language_loss": 0.71217585, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.78880739, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08996582, + "step": 16397, + "time_per_iteration": 2.5428028106689453 + }, + { + "auxiliary_loss_clip": 0.06395601, + "auxiliary_loss_mlp": 0.01262092, + "balance_loss_clip": 0.06271599, + "balance_loss_mlp": 0.01253616, + "epoch": 0.9859010972493611, + "flos": 20565077829120.0, + "grad_norm": 1.5361350556521405, + "language_loss": 0.71496713, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.79154408, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.08477783, + "step": 16398, + "time_per_iteration": 2.5399317741394043 + }, + { + "auxiliary_loss_clip": 0.06398591, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06270011, + "balance_loss_mlp": 0.01253434, + "epoch": 0.9859612205020292, + "flos": 24762075062400.0, + "grad_norm": 1.3521426462373807, + "language_loss": 0.74462658, + "learning_rate": 2.058291183208771e-09, + "loss": 0.82123244, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08563232, + "step": 16399, + "time_per_iteration": 2.5816903114318848 + }, + { + "auxiliary_loss_clip": 0.06400129, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06268509, + "balance_loss_mlp": 0.0125535, + "epoch": 0.9860213437546971, + "flos": 21112236990720.0, + "grad_norm": 3.4281947603629495, + "language_loss": 0.57744968, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.65409541, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09106445, + "step": 16400, + "time_per_iteration": 2.5620059967041016 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.0627275, + "balance_loss_mlp": 0.01254585, + "epoch": 0.9860814670073651, + "flos": 19142212446720.0, + "grad_norm": 1.6519096165686342, + "language_loss": 0.81009173, + "learning_rate": 2.023113299582491e-09, + "loss": 0.88686335, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10308838, + "step": 16401, + "time_per_iteration": 2.6548011302948 + }, + { + "auxiliary_loss_clip": 0.06398042, + "auxiliary_loss_mlp": 0.01263271, + "balance_loss_clip": 0.06269659, + "balance_loss_mlp": 0.01253371, + "epoch": 0.9861415902600331, + "flos": 17242570932480.0, + "grad_norm": 1.9964613223358685, + "language_loss": 0.78200734, + "learning_rate": 2.005638002662069e-09, + "loss": 0.85862046, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09899902, + "step": 16402, + "time_per_iteration": 2.594348430633545 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.01262163, + "balance_loss_clip": 0.06270587, + "balance_loss_mlp": 0.01252978, + "epoch": 0.986201713512701, + "flos": 27790052457600.0, + "grad_norm": 1.7160674070535198, + "language_loss": 0.70323497, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.77987915, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09185791, + "step": 16403, + "time_per_iteration": 4.039694547653198 + }, + { + "auxiliary_loss_clip": 0.06401487, + "auxiliary_loss_mlp": 0.01264425, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01255705, + "epoch": 0.986261836765369, + "flos": 28737902643840.0, + "grad_norm": 1.7955118608228118, + "language_loss": 0.74658298, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.82324219, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08721924, + "step": 16404, + "time_per_iteration": 2.539034366607666 + }, + { + "auxiliary_loss_clip": 0.06399557, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06267157, + "balance_loss_mlp": 0.01255934, + "epoch": 0.986321960018037, + "flos": 34322028693120.0, + "grad_norm": 1.643480275660223, + "language_loss": 0.70353627, + "learning_rate": 1.953666699415768e-09, + "loss": 0.78018463, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09344482, + "step": 16405, + "time_per_iteration": 2.6273982524871826 + }, + { + "auxiliary_loss_clip": 0.06396019, + "auxiliary_loss_mlp": 0.01263846, + "balance_loss_clip": 0.06269442, + "balance_loss_mlp": 0.01255764, + "epoch": 0.986382083270705, + "flos": 25196406301440.0, + "grad_norm": 1.6118450408666642, + "language_loss": 0.69949228, + "learning_rate": 1.93649446302846e-09, + "loss": 0.77609086, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08087158, + "step": 16406, + "time_per_iteration": 2.5140862464904785 + }, + { + "auxiliary_loss_clip": 0.06398158, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06270266, + "balance_loss_mlp": 0.01258655, + "epoch": 0.9864422065233729, + "flos": 11028953485440.0, + "grad_norm": 2.6977989926594597, + "language_loss": 0.75664067, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.83330011, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09130859, + "step": 16407, + "time_per_iteration": 2.4719793796539307 + }, + { + "auxiliary_loss_clip": 0.06401893, + "auxiliary_loss_mlp": 0.0126585, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01256402, + "epoch": 0.9865023297760409, + "flos": 16551291548160.0, + "grad_norm": 1.7687262607764567, + "language_loss": 0.78086448, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.85754192, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09448242, + "step": 16408, + "time_per_iteration": 2.484081983566284 + }, + { + "auxiliary_loss_clip": 0.06408665, + "auxiliary_loss_mlp": 0.01263338, + "balance_loss_clip": 0.06272249, + "balance_loss_mlp": 0.0125243, + "epoch": 0.9865624530287088, + "flos": 18886186696320.0, + "grad_norm": 1.6749403374040852, + "language_loss": 0.68618417, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.76290423, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10900879, + "step": 16409, + "time_per_iteration": 2.57738995552063 + }, + { + "auxiliary_loss_clip": 0.06307763, + "auxiliary_loss_mlp": 0.01249973, + "balance_loss_clip": 0.06253904, + "balance_loss_mlp": 0.01248934, + "epoch": 0.9866225762813768, + "flos": 68905869068160.0, + "grad_norm": 0.7792419194004762, + "language_loss": 0.60673237, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.68230969, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039124, + "step": 16410, + "time_per_iteration": 3.1789920330047607 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06270561, + "balance_loss_mlp": 0.01256507, + "epoch": 0.9866826995340447, + "flos": 29030796990720.0, + "grad_norm": 1.9710910309404892, + "language_loss": 0.66693377, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.74362266, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09552002, + "step": 16411, + "time_per_iteration": 2.593170166015625 + }, + { + "auxiliary_loss_clip": 0.06309229, + "auxiliary_loss_mlp": 0.012507, + "balance_loss_clip": 0.06255028, + "balance_loss_mlp": 0.01249633, + "epoch": 0.9867428227867128, + "flos": 65399004460800.0, + "grad_norm": 0.7100176404553015, + "language_loss": 0.56223959, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.63783884, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01068115, + "step": 16412, + "time_per_iteration": 4.65021014213562 + }, + { + "auxiliary_loss_clip": 0.06410616, + "auxiliary_loss_mlp": 0.01264203, + "balance_loss_clip": 0.06273398, + "balance_loss_mlp": 0.01253856, + "epoch": 0.9868029460393807, + "flos": 26513697139200.0, + "grad_norm": 1.4737285322847526, + "language_loss": 0.73170412, + "learning_rate": 1.818410313934926e-09, + "loss": 0.80845225, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10345459, + "step": 16413, + "time_per_iteration": 2.5816121101379395 + }, + { + "auxiliary_loss_clip": 0.0640188, + "auxiliary_loss_mlp": 0.01265077, + "balance_loss_clip": 0.06269288, + "balance_loss_mlp": 0.01255695, + "epoch": 0.9868630692920487, + "flos": 22974087513600.0, + "grad_norm": 1.3119989471392648, + "language_loss": 0.71715784, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.79382741, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09381104, + "step": 16414, + "time_per_iteration": 2.621397018432617 + }, + { + "auxiliary_loss_clip": 0.0639587, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06269387, + "balance_loss_mlp": 0.01256461, + "epoch": 0.9869231925447167, + "flos": 19834833496320.0, + "grad_norm": 1.641333270842883, + "language_loss": 0.70467007, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.78128219, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08880615, + "step": 16415, + "time_per_iteration": 2.472790241241455 + }, + { + "auxiliary_loss_clip": 0.06392305, + "auxiliary_loss_mlp": 0.01261183, + "balance_loss_clip": 0.06268395, + "balance_loss_mlp": 0.01252636, + "epoch": 0.9869833157973846, + "flos": 20201716598400.0, + "grad_norm": 1.4440519411439314, + "language_loss": 0.75557512, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.83210999, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.08551025, + "step": 16416, + "time_per_iteration": 2.5069968700408936 + }, + { + "auxiliary_loss_clip": 0.06400134, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01256843, + "epoch": 0.9870434390500527, + "flos": 16103753291520.0, + "grad_norm": 1.8261025745727175, + "language_loss": 0.70445406, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.78111756, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09368896, + "step": 16417, + "time_per_iteration": 2.4517784118652344 + }, + { + "auxiliary_loss_clip": 0.06412635, + "auxiliary_loss_mlp": 0.01265538, + "balance_loss_clip": 0.06275108, + "balance_loss_mlp": 0.01255513, + "epoch": 0.9871035623027206, + "flos": 21766941267840.0, + "grad_norm": 1.4145230092930503, + "language_loss": 0.70816773, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.78494942, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10021973, + "step": 16418, + "time_per_iteration": 2.5196893215179443 + }, + { + "auxiliary_loss_clip": 0.06306736, + "auxiliary_loss_mlp": 0.01250685, + "balance_loss_clip": 0.06252833, + "balance_loss_mlp": 0.0124971, + "epoch": 0.9871636855553886, + "flos": 70240936970880.0, + "grad_norm": 0.6409677987917212, + "language_loss": 0.53744692, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.61302114, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00975037, + "step": 16419, + "time_per_iteration": 4.717554330825806 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06268717, + "balance_loss_mlp": 0.01254464, + "epoch": 0.9872238088080565, + "flos": 25052789203200.0, + "grad_norm": 2.3110174767600635, + "language_loss": 0.78357494, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.86026323, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0994873, + "step": 16420, + "time_per_iteration": 2.522343635559082 + }, + { + "auxiliary_loss_clip": 0.06399032, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06271401, + "balance_loss_mlp": 0.01256126, + "epoch": 0.9872839320607245, + "flos": 19472268879360.0, + "grad_norm": 1.4612255153298364, + "language_loss": 0.7113086, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.7879504, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09020996, + "step": 16421, + "time_per_iteration": 2.5539984703063965 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01266413, + "balance_loss_clip": 0.06271101, + "balance_loss_mlp": 0.0125659, + "epoch": 0.9873440553133924, + "flos": 26950166657280.0, + "grad_norm": 1.8769291751528816, + "language_loss": 0.82184935, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.89858687, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09820557, + "step": 16422, + "time_per_iteration": 2.5518367290496826 + }, + { + "auxiliary_loss_clip": 0.06399193, + "auxiliary_loss_mlp": 0.01263419, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01254884, + "epoch": 0.9874041785660604, + "flos": 19068181764480.0, + "grad_norm": 1.559911203458106, + "language_loss": 0.85809267, + "learning_rate": 1.656159280223779e-09, + "loss": 0.93471885, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08538818, + "step": 16423, + "time_per_iteration": 2.511932849884033 + }, + { + "auxiliary_loss_clip": 0.06401457, + "auxiliary_loss_mlp": 0.01264252, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01255144, + "epoch": 0.9874643018187284, + "flos": 21112195063680.0, + "grad_norm": 1.7455614219935738, + "language_loss": 0.70705903, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.78371602, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09106445, + "step": 16424, + "time_per_iteration": 2.5012552738189697 + }, + { + "auxiliary_loss_clip": 0.0640029, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.062686, + "balance_loss_mlp": 0.01257197, + "epoch": 0.9875244250713964, + "flos": 24432982951680.0, + "grad_norm": 1.9007162164582931, + "language_loss": 0.81031597, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.88698548, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09466553, + "step": 16425, + "time_per_iteration": 2.5602309703826904 + }, + { + "auxiliary_loss_clip": 0.06403489, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06270744, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9875845483240643, + "flos": 25124388117120.0, + "grad_norm": 1.972243539520393, + "language_loss": 0.80218101, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.87887526, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10046387, + "step": 16426, + "time_per_iteration": 4.0034801959991455 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01268645, + "balance_loss_clip": 0.06273003, + "balance_loss_mlp": 0.01258763, + "epoch": 0.9876446715767323, + "flos": 16587447384960.0, + "grad_norm": 1.730891223738535, + "language_loss": 0.84535158, + "learning_rate": 1.593380599750338e-09, + "loss": 0.92205, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09887695, + "step": 16427, + "time_per_iteration": 2.5224578380584717 + }, + { + "auxiliary_loss_clip": 0.06397066, + "auxiliary_loss_mlp": 0.01263748, + "balance_loss_clip": 0.06267832, + "balance_loss_mlp": 0.01254218, + "epoch": 0.9877047948294003, + "flos": 21622527555840.0, + "grad_norm": 1.6150790821834389, + "language_loss": 0.70599663, + "learning_rate": 1.577875377599458e-09, + "loss": 0.78260475, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09527588, + "step": 16428, + "time_per_iteration": 2.530439615249634 + }, + { + "auxiliary_loss_clip": 0.06398337, + "auxiliary_loss_mlp": 0.01265208, + "balance_loss_clip": 0.06270449, + "balance_loss_mlp": 0.01256386, + "epoch": 0.9877649180820682, + "flos": 21184842153600.0, + "grad_norm": 1.9151863241472484, + "language_loss": 0.80755043, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.88418591, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 16429, + "time_per_iteration": 2.5248844623565674 + }, + { + "auxiliary_loss_clip": 0.06398588, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.06267557, + "balance_loss_mlp": 0.01256724, + "epoch": 0.9878250413347363, + "flos": 39758596502400.0, + "grad_norm": 1.529650874257726, + "language_loss": 0.62086964, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.69751823, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09545898, + "step": 16430, + "time_per_iteration": 2.6865828037261963 + }, + { + "auxiliary_loss_clip": 0.06405398, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06273668, + "balance_loss_mlp": 0.01259044, + "epoch": 0.9878851645874042, + "flos": 29433584367360.0, + "grad_norm": 1.294361870195289, + "language_loss": 0.73193979, + "learning_rate": 1.531814395687725e-09, + "loss": 0.8086769, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09277344, + "step": 16431, + "time_per_iteration": 2.584623336791992 + }, + { + "auxiliary_loss_clip": 0.06408115, + "auxiliary_loss_mlp": 0.01268719, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01259021, + "epoch": 0.9879452878400722, + "flos": 15810230039040.0, + "grad_norm": 2.003563247379043, + "language_loss": 0.80578899, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.88255733, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09698486, + "step": 16432, + "time_per_iteration": 2.4977803230285645 + }, + { + "auxiliary_loss_clip": 0.0639713, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06268157, + "balance_loss_mlp": 0.01255121, + "epoch": 0.9880054110927401, + "flos": 22239985893120.0, + "grad_norm": 1.8780022898088136, + "language_loss": 0.80855387, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.88515973, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08331299, + "step": 16433, + "time_per_iteration": 2.559974431991577 + }, + { + "auxiliary_loss_clip": 0.06398477, + "auxiliary_loss_mlp": 0.01263192, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01254067, + "epoch": 0.9880655343454081, + "flos": 28770830098560.0, + "grad_norm": 3.071762614653828, + "language_loss": 0.65055972, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.72717643, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09130859, + "step": 16434, + "time_per_iteration": 2.5954465866088867 + }, + { + "auxiliary_loss_clip": 0.06404148, + "auxiliary_loss_mlp": 0.01263004, + "balance_loss_clip": 0.06270909, + "balance_loss_mlp": 0.01253104, + "epoch": 0.988125657598076, + "flos": 32861581954560.0, + "grad_norm": 1.5114449517285122, + "language_loss": 0.69690335, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.77357489, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09899902, + "step": 16435, + "time_per_iteration": 2.6048479080200195 + }, + { + "auxiliary_loss_clip": 0.06401417, + "auxiliary_loss_mlp": 0.01263505, + "balance_loss_clip": 0.0627191, + "balance_loss_mlp": 0.01253915, + "epoch": 0.988185780850744, + "flos": 19396728823680.0, + "grad_norm": 1.5491204598191355, + "language_loss": 0.75873798, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.83538723, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09594727, + "step": 16436, + "time_per_iteration": 2.5019142627716064 + }, + { + "auxiliary_loss_clip": 0.06399369, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06268755, + "balance_loss_mlp": 0.012549, + "epoch": 0.988245904103412, + "flos": 22534976592000.0, + "grad_norm": 2.2922063156337216, + "language_loss": 0.74628437, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.82292199, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09503174, + "step": 16437, + "time_per_iteration": 2.4945950508117676 + }, + { + "auxiliary_loss_clip": 0.06395677, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06269895, + "balance_loss_mlp": 0.0125623, + "epoch": 0.98830602735608, + "flos": 28666974562560.0, + "grad_norm": 1.3644693930192495, + "language_loss": 0.60571569, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.68232322, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08850098, + "step": 16438, + "time_per_iteration": 2.5724501609802246 + }, + { + "auxiliary_loss_clip": 0.06400715, + "auxiliary_loss_mlp": 0.01265196, + "balance_loss_clip": 0.06271615, + "balance_loss_mlp": 0.01255546, + "epoch": 0.9883661506087479, + "flos": 21002343960960.0, + "grad_norm": 1.7275853151179177, + "language_loss": 0.71979439, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.79645348, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09649658, + "step": 16439, + "time_per_iteration": 2.4901344776153564 + }, + { + "auxiliary_loss_clip": 0.06396712, + "auxiliary_loss_mlp": 0.01265241, + "balance_loss_clip": 0.06268465, + "balance_loss_mlp": 0.01256145, + "epoch": 0.9884262738614159, + "flos": 32714065641600.0, + "grad_norm": 1.7534107682801081, + "language_loss": 0.60016227, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.67678177, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09100342, + "step": 16440, + "time_per_iteration": 2.6009092330932617 + }, + { + "auxiliary_loss_clip": 0.06403635, + "auxiliary_loss_mlp": 0.01262738, + "balance_loss_clip": 0.06269899, + "balance_loss_mlp": 0.01253243, + "epoch": 0.9884863971140839, + "flos": 17570153669760.0, + "grad_norm": 2.2528495077342634, + "language_loss": 0.76208878, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.83875251, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09503174, + "step": 16441, + "time_per_iteration": 2.520890712738037 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.01265503, + "balance_loss_clip": 0.06271541, + "balance_loss_mlp": 0.01256348, + "epoch": 0.9885465203667518, + "flos": 40562116830720.0, + "grad_norm": 1.8751614088289563, + "language_loss": 0.6817615, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.75845104, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09155273, + "step": 16442, + "time_per_iteration": 2.7019503116607666 + }, + { + "auxiliary_loss_clip": 0.06398676, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270617, + "balance_loss_mlp": 0.01257142, + "epoch": 0.9886066436194199, + "flos": 13813022044800.0, + "grad_norm": 2.4056325615728693, + "language_loss": 0.74363172, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.82027876, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08892822, + "step": 16443, + "time_per_iteration": 3.9015562534332275 + }, + { + "auxiliary_loss_clip": 0.06400269, + "auxiliary_loss_mlp": 0.01268202, + "balance_loss_clip": 0.0626837, + "balance_loss_mlp": 0.01258326, + "epoch": 0.9886667668720878, + "flos": 23330824272000.0, + "grad_norm": 1.903669663592203, + "language_loss": 0.7392866, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.81597131, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09875488, + "step": 16444, + "time_per_iteration": 2.555422067642212 + }, + { + "auxiliary_loss_clip": 0.06397615, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01256238, + "epoch": 0.9887268901247558, + "flos": 22711814634240.0, + "grad_norm": 2.125613653372287, + "language_loss": 0.69637549, + "learning_rate": 1.325881465858547e-09, + "loss": 0.77299786, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08380127, + "step": 16445, + "time_per_iteration": 2.561236619949341 + }, + { + "auxiliary_loss_clip": 0.06407273, + "auxiliary_loss_mlp": 0.01262681, + "balance_loss_clip": 0.06277097, + "balance_loss_mlp": 0.01253269, + "epoch": 0.9887870133774237, + "flos": 13046118750720.0, + "grad_norm": 3.18173440901386, + "language_loss": 0.60854781, + "learning_rate": 1.311740377491155e-09, + "loss": 0.68524736, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09411621, + "step": 16446, + "time_per_iteration": 2.4627370834350586 + }, + { + "auxiliary_loss_clip": 0.06401445, + "auxiliary_loss_mlp": 0.0126252, + "balance_loss_clip": 0.06271827, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9888471366300917, + "flos": 15164288513280.0, + "grad_norm": 2.1657095582443797, + "language_loss": 0.71381092, + "learning_rate": 1.297675079582783e-09, + "loss": 0.79045057, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08544922, + "step": 16447, + "time_per_iteration": 2.516580104827881 + }, + { + "auxiliary_loss_clip": 0.06397137, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06267823, + "balance_loss_mlp": 0.01255445, + "epoch": 0.9889072598827596, + "flos": 25125771709440.0, + "grad_norm": 1.6408411032004997, + "language_loss": 0.83849478, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.91510838, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08776855, + "step": 16448, + "time_per_iteration": 2.6334075927734375 + }, + { + "auxiliary_loss_clip": 0.06398049, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.0627088, + "balance_loss_mlp": 0.01255378, + "epoch": 0.9889673831354276, + "flos": 16734502500480.0, + "grad_norm": 2.1555382523852766, + "language_loss": 0.70484287, + "learning_rate": 1.26977185727406e-09, + "loss": 0.78146207, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.0848999, + "step": 16449, + "time_per_iteration": 2.533296823501587 + }, + { + "auxiliary_loss_clip": 0.06404455, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06269993, + "balance_loss_mlp": 0.01254764, + "epoch": 0.9890275063880956, + "flos": 35593059277440.0, + "grad_norm": 2.393318173005223, + "language_loss": 0.74060148, + "learning_rate": 1.25593393393153e-09, + "loss": 0.81728518, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09143066, + "step": 16450, + "time_per_iteration": 2.622335195541382 + }, + { + "auxiliary_loss_clip": 0.06403831, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9890876296407636, + "flos": 18958246807680.0, + "grad_norm": 1.56920034871992, + "language_loss": 0.79678428, + "learning_rate": 1.242171803164549e-09, + "loss": 0.8734746, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09454346, + "step": 16451, + "time_per_iteration": 2.5179364681243896 + }, + { + "auxiliary_loss_clip": 0.06404501, + "auxiliary_loss_mlp": 0.01263638, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01254179, + "epoch": 0.9891477528934315, + "flos": 23776140395520.0, + "grad_norm": 2.1825746418947283, + "language_loss": 0.70112723, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.77780861, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09460449, + "step": 16452, + "time_per_iteration": 3.9754366874694824 + }, + { + "auxiliary_loss_clip": 0.06395538, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01254999, + "epoch": 0.9892078761460995, + "flos": 20778490978560.0, + "grad_norm": 1.550723942339921, + "language_loss": 0.74353349, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.82011908, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08013916, + "step": 16453, + "time_per_iteration": 2.5287036895751953 + }, + { + "auxiliary_loss_clip": 0.06401984, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01256003, + "epoch": 0.9892679993987675, + "flos": 23374568903040.0, + "grad_norm": 2.164886509887776, + "language_loss": 0.70232868, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.77900863, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10009766, + "step": 16454, + "time_per_iteration": 2.5466010570526123 + }, + { + "auxiliary_loss_clip": 0.06396247, + "auxiliary_loss_mlp": 0.01268105, + "balance_loss_clip": 0.06270434, + "balance_loss_mlp": 0.01259206, + "epoch": 0.9893281226514354, + "flos": 22711101874560.0, + "grad_norm": 1.889448765409953, + "language_loss": 0.75790614, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.83454967, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08898926, + "step": 16455, + "time_per_iteration": 2.5108723640441895 + }, + { + "auxiliary_loss_clip": 0.06395634, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.0626895, + "balance_loss_mlp": 0.0125793, + "epoch": 0.9893882459041035, + "flos": 21802761688320.0, + "grad_norm": 1.755990040999191, + "language_loss": 0.65666765, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.73329145, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0881958, + "step": 16456, + "time_per_iteration": 2.6398427486419678 + }, + { + "auxiliary_loss_clip": 0.06405662, + "auxiliary_loss_mlp": 0.0126407, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01254074, + "epoch": 0.9894483691567714, + "flos": 18119618818560.0, + "grad_norm": 1.8518732955546615, + "language_loss": 0.74572182, + "learning_rate": 1.161190691666203e-09, + "loss": 0.82241917, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09997559, + "step": 16457, + "time_per_iteration": 2.546666383743286 + }, + { + "auxiliary_loss_clip": 0.06405069, + "auxiliary_loss_mlp": 0.01261015, + "balance_loss_clip": 0.06272469, + "balance_loss_mlp": 0.01251633, + "epoch": 0.9895084924094394, + "flos": 31219559418240.0, + "grad_norm": 4.713405572654526, + "language_loss": 0.69061947, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.76728028, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09387207, + "step": 16458, + "time_per_iteration": 4.058138847351074 + }, + { + "auxiliary_loss_clip": 0.06397022, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.0626925, + "balance_loss_mlp": 0.01255836, + "epoch": 0.9895686156621073, + "flos": 19683376041600.0, + "grad_norm": 1.6011210328127727, + "language_loss": 0.79420429, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.87082392, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09106445, + "step": 16459, + "time_per_iteration": 2.4854841232299805 + }, + { + "auxiliary_loss_clip": 0.06404197, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01253587, + "epoch": 0.9896287389147753, + "flos": 23587604709120.0, + "grad_norm": 1.7868867036072664, + "language_loss": 0.71253073, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.78920174, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09320068, + "step": 16460, + "time_per_iteration": 2.5770578384399414 + }, + { + "auxiliary_loss_clip": 0.06403832, + "auxiliary_loss_mlp": 0.01263939, + "balance_loss_clip": 0.06271418, + "balance_loss_mlp": 0.01254564, + "epoch": 0.9896888621674432, + "flos": 29612854177920.0, + "grad_norm": 1.4614514408304804, + "language_loss": 0.8714518, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.94812953, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09381104, + "step": 16461, + "time_per_iteration": 2.5683257579803467 + }, + { + "auxiliary_loss_clip": 0.06397907, + "auxiliary_loss_mlp": 0.01267148, + "balance_loss_clip": 0.06267931, + "balance_loss_mlp": 0.01256991, + "epoch": 0.9897489854201112, + "flos": 23701648515840.0, + "grad_norm": 1.9559550168181632, + "language_loss": 0.63296109, + "learning_rate": 1.09579082189315e-09, + "loss": 0.70961165, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10162354, + "step": 16462, + "time_per_iteration": 2.5388832092285156 + }, + { + "auxiliary_loss_clip": 0.06400032, + "auxiliary_loss_mlp": 0.01263191, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01254602, + "epoch": 0.9898091086727792, + "flos": 13230252097920.0, + "grad_norm": 1.655754614262544, + "language_loss": 0.73308957, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.80972171, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08581543, + "step": 16463, + "time_per_iteration": 2.4658756256103516 + }, + { + "auxiliary_loss_clip": 0.06401134, + "auxiliary_loss_mlp": 0.01265976, + "balance_loss_clip": 0.06269235, + "balance_loss_mlp": 0.01256022, + "epoch": 0.9898692319254472, + "flos": 22937135063040.0, + "grad_norm": 7.43106423326432, + "language_loss": 0.7060079, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.78267902, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09960938, + "step": 16464, + "time_per_iteration": 2.5187556743621826 + }, + { + "auxiliary_loss_clip": 0.06401595, + "auxiliary_loss_mlp": 0.01263658, + "balance_loss_clip": 0.06268543, + "balance_loss_mlp": 0.01254116, + "epoch": 0.9899293551781151, + "flos": 12463223022720.0, + "grad_norm": 1.840444252233611, + "language_loss": 0.73403418, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.81068671, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09545898, + "step": 16465, + "time_per_iteration": 3.9173574447631836 + }, + { + "auxiliary_loss_clip": 0.06399283, + "auxiliary_loss_mlp": 0.0126429, + "balance_loss_clip": 0.06270085, + "balance_loss_mlp": 0.01255648, + "epoch": 0.9899894784307831, + "flos": 26878567743360.0, + "grad_norm": 1.5836016872401681, + "language_loss": 0.86692631, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.94356197, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08642578, + "step": 16466, + "time_per_iteration": 2.5749897956848145 + }, + { + "auxiliary_loss_clip": 0.06399287, + "auxiliary_loss_mlp": 0.01264079, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01254775, + "epoch": 0.990049601683451, + "flos": 21548287238400.0, + "grad_norm": 1.6828167464492951, + "language_loss": 0.7183401, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.79497385, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09301758, + "step": 16467, + "time_per_iteration": 2.5681228637695312 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01265476, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256428, + "epoch": 0.990109724936119, + "flos": 28780137901440.0, + "grad_norm": 1.2060811454546625, + "language_loss": 0.65264559, + "learning_rate": 1.019812338686643e-09, + "loss": 0.7293123, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09039307, + "step": 16468, + "time_per_iteration": 2.5990076065063477 + }, + { + "auxiliary_loss_clip": 0.06405121, + "auxiliary_loss_mlp": 0.01263189, + "balance_loss_clip": 0.06269127, + "balance_loss_mlp": 0.0125342, + "epoch": 0.9901698481887871, + "flos": 29281288371840.0, + "grad_norm": 1.6622205655344582, + "language_loss": 0.62186044, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.69854355, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09765625, + "step": 16469, + "time_per_iteration": 2.6046206951141357 + }, + { + "auxiliary_loss_clip": 0.06401995, + "auxiliary_loss_mlp": 0.01264459, + "balance_loss_clip": 0.06269933, + "balance_loss_mlp": 0.01255072, + "epoch": 0.990229971441455, + "flos": 15964161189120.0, + "grad_norm": 2.6347119694953007, + "language_loss": 0.70456368, + "learning_rate": 9.950925847685976e-10, + "loss": 0.78122824, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09393311, + "step": 16470, + "time_per_iteration": 2.4915683269500732 + }, + { + "auxiliary_loss_clip": 0.06312285, + "auxiliary_loss_mlp": 0.01249711, + "balance_loss_clip": 0.06258221, + "balance_loss_mlp": 0.01248712, + "epoch": 0.990290094694123, + "flos": 69801322924800.0, + "grad_norm": 0.6553477289574845, + "language_loss": 0.55503154, + "learning_rate": 9.828464112755509e-10, + "loss": 0.63065147, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.54003906, + "router_z_loss_mlp": 0.00998688, + "step": 16471, + "time_per_iteration": 3.337892770767212 + }, + { + "auxiliary_loss_clip": 0.06401256, + "auxiliary_loss_mlp": 0.0126515, + "balance_loss_clip": 0.06271065, + "balance_loss_mlp": 0.0125612, + "epoch": 0.9903502179467909, + "flos": 16257894076800.0, + "grad_norm": 1.8227264770016582, + "language_loss": 0.84216011, + "learning_rate": 9.706760407131032e-10, + "loss": 0.9188242, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09033203, + "step": 16472, + "time_per_iteration": 2.5153591632843018 + }, + { + "auxiliary_loss_clip": 0.06403252, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06271196, + "balance_loss_mlp": 0.01255965, + "epoch": 0.9904103411994589, + "flos": 21694671521280.0, + "grad_norm": 1.7363396784721263, + "language_loss": 0.86251837, + "learning_rate": 9.585814735431075e-10, + "loss": 0.93920588, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09527588, + "step": 16473, + "time_per_iteration": 2.514474630355835 + }, + { + "auxiliary_loss_clip": 0.0639959, + "auxiliary_loss_mlp": 0.01267324, + "balance_loss_clip": 0.0626872, + "balance_loss_mlp": 0.01258724, + "epoch": 0.9904704644521268, + "flos": 25746584209920.0, + "grad_norm": 1.9148437433101497, + "language_loss": 0.84488249, + "learning_rate": 9.465627102240859e-10, + "loss": 0.9215517, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08605957, + "step": 16474, + "time_per_iteration": 2.561305284500122 + }, + { + "auxiliary_loss_clip": 0.06397276, + "auxiliary_loss_mlp": 0.01263163, + "balance_loss_clip": 0.06266478, + "balance_loss_mlp": 0.01254049, + "epoch": 0.9905305877047949, + "flos": 21914834924160.0, + "grad_norm": 2.553445622723368, + "language_loss": 0.76806021, + "learning_rate": 9.346197512116738e-10, + "loss": 0.84466457, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09112549, + "step": 16475, + "time_per_iteration": 2.5100929737091064 + }, + { + "auxiliary_loss_clip": 0.06403254, + "auxiliary_loss_mlp": 0.01262449, + "balance_loss_clip": 0.06270232, + "balance_loss_mlp": 0.01252871, + "epoch": 0.9905907109574628, + "flos": 21397961813760.0, + "grad_norm": 1.4250465308129456, + "language_loss": 0.7599352, + "learning_rate": 9.227525969588423e-10, + "loss": 0.8365922, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09576416, + "step": 16476, + "time_per_iteration": 2.5377602577209473 + }, + { + "auxiliary_loss_clip": 0.06410898, + "auxiliary_loss_mlp": 0.0126517, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.01254298, + "epoch": 0.9906508342101308, + "flos": 20527831889280.0, + "grad_norm": 2.5556456243776684, + "language_loss": 0.67784524, + "learning_rate": 9.109612479154538e-10, + "loss": 0.75460589, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10876465, + "step": 16477, + "time_per_iteration": 2.500948667526245 + }, + { + "auxiliary_loss_clip": 0.06406661, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01257113, + "epoch": 0.9907109574627987, + "flos": 21367633835520.0, + "grad_norm": 1.8026145726768161, + "language_loss": 0.71967936, + "learning_rate": 8.992457045289282e-10, + "loss": 0.79642379, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10656738, + "step": 16478, + "time_per_iteration": 2.5416836738586426 + }, + { + "auxiliary_loss_clip": 0.0640207, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01255707, + "epoch": 0.9907710807154667, + "flos": 17342820743040.0, + "grad_norm": 2.2389355543560874, + "language_loss": 0.81408846, + "learning_rate": 8.876059672433545e-10, + "loss": 0.89076257, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09631348, + "step": 16479, + "time_per_iteration": 2.4918854236602783 + }, + { + "auxiliary_loss_clip": 0.06405993, + "auxiliary_loss_mlp": 0.01266846, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01257417, + "epoch": 0.9908312039681346, + "flos": 28629518987520.0, + "grad_norm": 1.7024929779820783, + "language_loss": 0.6656878, + "learning_rate": 8.760420364999355e-10, + "loss": 0.7424162, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09429932, + "step": 16480, + "time_per_iteration": 2.5911026000976562 + }, + { + "auxiliary_loss_clip": 0.06397465, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06268594, + "balance_loss_mlp": 0.01255424, + "epoch": 0.9908913272208026, + "flos": 35779079341440.0, + "grad_norm": 1.9992383349547551, + "language_loss": 0.72380996, + "learning_rate": 8.645539127374313e-10, + "loss": 0.80042958, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09069824, + "step": 16481, + "time_per_iteration": 2.6130805015563965 + }, + { + "auxiliary_loss_clip": 0.06397593, + "auxiliary_loss_mlp": 0.0126149, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01252591, + "epoch": 0.9909514504734707, + "flos": 19908444908160.0, + "grad_norm": 2.088225556047704, + "language_loss": 0.77833641, + "learning_rate": 8.531415963912713e-10, + "loss": 0.8549273, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08886719, + "step": 16482, + "time_per_iteration": 2.500314235687256 + }, + { + "auxiliary_loss_clip": 0.06400485, + "auxiliary_loss_mlp": 0.01263677, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01254331, + "epoch": 0.9910115737261386, + "flos": 20009910602880.0, + "grad_norm": 1.7779031696268206, + "language_loss": 0.75710553, + "learning_rate": 8.418050878944427e-10, + "loss": 0.83374715, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09344482, + "step": 16483, + "time_per_iteration": 3.906383752822876 + }, + { + "auxiliary_loss_clip": 0.06312563, + "auxiliary_loss_mlp": 0.01253629, + "balance_loss_clip": 0.0625831, + "balance_loss_mlp": 0.01252545, + "epoch": 0.9910716969788066, + "flos": 70708950351360.0, + "grad_norm": 0.6833139744850949, + "language_loss": 0.53665406, + "learning_rate": 8.305443876768237e-10, + "loss": 0.61231595, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.010849, + "step": 16484, + "time_per_iteration": 3.2748491764068604 + }, + { + "auxiliary_loss_clip": 0.06397069, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254608, + "epoch": 0.9911318202314745, + "flos": 21440448633600.0, + "grad_norm": 1.584141486996251, + "language_loss": 0.8189832, + "learning_rate": 8.19359496165184e-10, + "loss": 0.89558661, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08666992, + "step": 16485, + "time_per_iteration": 2.522608757019043 + }, + { + "auxiliary_loss_clip": 0.0639887, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06270351, + "balance_loss_mlp": 0.01257402, + "epoch": 0.9911919434841425, + "flos": 19832653290240.0, + "grad_norm": 1.523507059973884, + "language_loss": 0.81901872, + "learning_rate": 8.082504137836288e-10, + "loss": 0.89567471, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09332275, + "step": 16486, + "time_per_iteration": 2.5003557205200195 + }, + { + "auxiliary_loss_clip": 0.06405136, + "auxiliary_loss_mlp": 0.01263298, + "balance_loss_clip": 0.06271749, + "balance_loss_mlp": 0.01253887, + "epoch": 0.9912520667368104, + "flos": 41729040316800.0, + "grad_norm": 1.374674132460458, + "language_loss": 0.66326475, + "learning_rate": 7.972171409538209e-10, + "loss": 0.73994911, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09417725, + "step": 16487, + "time_per_iteration": 2.6989879608154297 + }, + { + "auxiliary_loss_clip": 0.06396048, + "auxiliary_loss_mlp": 0.01260581, + "balance_loss_clip": 0.06269118, + "balance_loss_mlp": 0.01252671, + "epoch": 0.9913121899894785, + "flos": 23776559665920.0, + "grad_norm": 1.5105370838435217, + "language_loss": 0.77039683, + "learning_rate": 7.862596780936481e-10, + "loss": 0.84696317, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.07913208, + "step": 16488, + "time_per_iteration": 2.5589473247528076 + }, + { + "auxiliary_loss_clip": 0.06408311, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 0.06270079, + "balance_loss_mlp": 0.01251982, + "epoch": 0.9913723132421464, + "flos": 23776559665920.0, + "grad_norm": 4.081303895397492, + "language_loss": 0.68999302, + "learning_rate": 7.753780256190001e-10, + "loss": 0.76669514, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09912109, + "step": 16489, + "time_per_iteration": 2.5339298248291016 + }, + { + "auxiliary_loss_clip": 0.06312118, + "auxiliary_loss_mlp": 0.0125074, + "balance_loss_clip": 0.06257981, + "balance_loss_mlp": 0.01249744, + "epoch": 0.9914324364948144, + "flos": 71287234104960.0, + "grad_norm": 0.5966014121504264, + "language_loss": 0.52483445, + "learning_rate": 7.645721839424357e-10, + "loss": 0.60046303, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 16490, + "time_per_iteration": 3.2971177101135254 + }, + { + "auxiliary_loss_clip": 0.06410297, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06273858, + "balance_loss_mlp": 0.012588, + "epoch": 0.9914925597474823, + "flos": 23702109713280.0, + "grad_norm": 1.5769147749467787, + "language_loss": 0.75964558, + "learning_rate": 7.538421534734052e-10, + "loss": 0.83643979, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10327148, + "step": 16491, + "time_per_iteration": 2.5335545539855957 + }, + { + "auxiliary_loss_clip": 0.06408622, + "auxiliary_loss_mlp": 0.01265093, + "balance_loss_clip": 0.06274007, + "balance_loss_mlp": 0.01254478, + "epoch": 0.9915526830001503, + "flos": 13437250410240.0, + "grad_norm": 2.0325070946840644, + "language_loss": 0.70255387, + "learning_rate": 7.431879346191383e-10, + "loss": 0.77929103, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10626221, + "step": 16492, + "time_per_iteration": 3.9161179065704346 + }, + { + "auxiliary_loss_clip": 0.06401629, + "auxiliary_loss_mlp": 0.01263978, + "balance_loss_clip": 0.06271149, + "balance_loss_mlp": 0.01254191, + "epoch": 0.9916128062528182, + "flos": 20747282532480.0, + "grad_norm": 1.742564772152948, + "language_loss": 0.68796587, + "learning_rate": 7.326095277837563e-10, + "loss": 0.76462197, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09783936, + "step": 16493, + "time_per_iteration": 2.5178070068359375 + }, + { + "auxiliary_loss_clip": 0.06404144, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.0125478, + "epoch": 0.9916729295054862, + "flos": 22492825188480.0, + "grad_norm": 1.6130531837005415, + "language_loss": 0.71639037, + "learning_rate": 7.221069333678276e-10, + "loss": 0.79307514, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09552002, + "step": 16494, + "time_per_iteration": 2.538949728012085 + }, + { + "auxiliary_loss_clip": 0.06406216, + "auxiliary_loss_mlp": 0.0126424, + "balance_loss_clip": 0.06271614, + "balance_loss_mlp": 0.01253755, + "epoch": 0.9917330527581543, + "flos": 14797573119360.0, + "grad_norm": 1.963098186344062, + "language_loss": 0.68285948, + "learning_rate": 7.116801517701443e-10, + "loss": 0.75956404, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10491943, + "step": 16495, + "time_per_iteration": 2.4931821823120117 + }, + { + "auxiliary_loss_clip": 0.06310745, + "auxiliary_loss_mlp": 0.01252706, + "balance_loss_clip": 0.06256633, + "balance_loss_mlp": 0.01251622, + "epoch": 0.9917931760108222, + "flos": 59209551717120.0, + "grad_norm": 0.6971695961276645, + "language_loss": 0.5343821, + "learning_rate": 7.013291833859458e-10, + "loss": 0.61001664, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01085663, + "step": 16496, + "time_per_iteration": 3.313877820968628 + }, + { + "auxiliary_loss_clip": 0.0640336, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01255054, + "epoch": 0.9918532992634902, + "flos": 26769052056960.0, + "grad_norm": 1.686792956138552, + "language_loss": 0.71729428, + "learning_rate": 6.91054028607585e-10, + "loss": 0.7939744, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09613037, + "step": 16497, + "time_per_iteration": 4.06347918510437 + }, + { + "auxiliary_loss_clip": 0.06407828, + "auxiliary_loss_mlp": 0.01265363, + "balance_loss_clip": 0.06272013, + "balance_loss_mlp": 0.01255547, + "epoch": 0.9919134225161581, + "flos": 14980993706880.0, + "grad_norm": 2.091155080212875, + "language_loss": 0.82478547, + "learning_rate": 6.808546878249721e-10, + "loss": 0.90151739, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.0982666, + "step": 16498, + "time_per_iteration": 2.5037145614624023 + }, + { + "auxiliary_loss_clip": 0.06402234, + "auxiliary_loss_mlp": 0.01266692, + "balance_loss_clip": 0.06271948, + "balance_loss_mlp": 0.01257448, + "epoch": 0.9919735457688261, + "flos": 27825537461760.0, + "grad_norm": 3.5794951967468447, + "language_loss": 0.68476105, + "learning_rate": 6.707311614246869e-10, + "loss": 0.76145029, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09246826, + "step": 16499, + "time_per_iteration": 2.5629689693450928 + }, + { + "auxiliary_loss_clip": 0.06405471, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01253792, + "epoch": 0.992033669021494, + "flos": 22568994149760.0, + "grad_norm": 2.4469510189518684, + "language_loss": 0.82463717, + "learning_rate": 6.606834497904223e-10, + "loss": 0.90132344, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09368896, + "step": 16500, + "time_per_iteration": 2.5607094764709473 + }, + { + "auxiliary_loss_clip": 0.06403733, + "auxiliary_loss_mlp": 0.0126595, + "balance_loss_clip": 0.06271171, + "balance_loss_mlp": 0.01256121, + "epoch": 0.9920937922741621, + "flos": 25381671678720.0, + "grad_norm": 5.293314511420753, + "language_loss": 0.82256448, + "learning_rate": 6.507115533036511e-10, + "loss": 0.89926136, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09832764, + "step": 16501, + "time_per_iteration": 2.547940731048584 + }, + { + "auxiliary_loss_clip": 0.06401893, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06269954, + "balance_loss_mlp": 0.01257897, + "epoch": 0.99215391552683, + "flos": 22061009571840.0, + "grad_norm": 2.044596215484759, + "language_loss": 0.7750001, + "learning_rate": 6.408154723420711e-10, + "loss": 0.85169148, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09338379, + "step": 16502, + "time_per_iteration": 2.52785325050354 + }, + { + "auxiliary_loss_clip": 0.06407385, + "auxiliary_loss_mlp": 0.0126289, + "balance_loss_clip": 0.06270929, + "balance_loss_mlp": 0.01252549, + "epoch": 0.992214038779498, + "flos": 15419349941760.0, + "grad_norm": 2.2650147973319337, + "language_loss": 0.71174729, + "learning_rate": 6.309952072811597e-10, + "loss": 0.78845006, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10333252, + "step": 16503, + "time_per_iteration": 2.479231595993042 + }, + { + "auxiliary_loss_clip": 0.06309342, + "auxiliary_loss_mlp": 0.01248757, + "balance_loss_clip": 0.06255215, + "balance_loss_mlp": 0.01247744, + "epoch": 0.9922741620321659, + "flos": 62035184701440.0, + "grad_norm": 0.6268759345910434, + "language_loss": 0.55145812, + "learning_rate": 6.212507584932858e-10, + "loss": 0.62703907, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.01013184, + "step": 16504, + "time_per_iteration": 3.2505059242248535 + }, + { + "auxiliary_loss_clip": 0.06399435, + "auxiliary_loss_mlp": 0.01265661, + "balance_loss_clip": 0.06268956, + "balance_loss_mlp": 0.01257209, + "epoch": 0.9923342852848339, + "flos": 17171223580800.0, + "grad_norm": 1.6208802676549345, + "language_loss": 0.69611251, + "learning_rate": 6.115821263481536e-10, + "loss": 0.77276349, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08441162, + "step": 16505, + "time_per_iteration": 3.9143412113189697 + }, + { + "auxiliary_loss_clip": 0.06404525, + "auxiliary_loss_mlp": 0.01263876, + "balance_loss_clip": 0.06269157, + "balance_loss_mlp": 0.01253356, + "epoch": 0.9923944085375018, + "flos": 23189555088000.0, + "grad_norm": 1.923670918802994, + "language_loss": 0.66283721, + "learning_rate": 6.019893112119146e-10, + "loss": 0.73952121, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10522461, + "step": 16506, + "time_per_iteration": 2.4962158203125 + }, + { + "auxiliary_loss_clip": 0.0640049, + "auxiliary_loss_mlp": 0.01263896, + "balance_loss_clip": 0.06270368, + "balance_loss_mlp": 0.01254461, + "epoch": 0.9924545317901698, + "flos": 20820181184640.0, + "grad_norm": 2.195088142816573, + "language_loss": 0.63749093, + "learning_rate": 5.924723134487219e-10, + "loss": 0.71413481, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09442139, + "step": 16507, + "time_per_iteration": 2.4816720485687256 + }, + { + "auxiliary_loss_clip": 0.06400108, + "auxiliary_loss_mlp": 0.01262795, + "balance_loss_clip": 0.06268136, + "balance_loss_mlp": 0.01253461, + "epoch": 0.9925146550428379, + "flos": 20089517581440.0, + "grad_norm": 2.0367572587682714, + "language_loss": 0.72877479, + "learning_rate": 5.830311334193983e-10, + "loss": 0.80540383, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09332275, + "step": 16508, + "time_per_iteration": 2.481667995452881 + }, + { + "auxiliary_loss_clip": 0.06402674, + "auxiliary_loss_mlp": 0.01264359, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01254245, + "epoch": 0.9925747782955058, + "flos": 24980812945920.0, + "grad_norm": 1.4154056439024716, + "language_loss": 0.70592123, + "learning_rate": 5.736657714818793e-10, + "loss": 0.78259158, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10113525, + "step": 16509, + "time_per_iteration": 2.5196590423583984 + }, + { + "auxiliary_loss_clip": 0.06405061, + "auxiliary_loss_mlp": 0.01265677, + "balance_loss_clip": 0.0627228, + "balance_loss_mlp": 0.01255789, + "epoch": 0.9926349015481738, + "flos": 60485250931200.0, + "grad_norm": 1.8295494813147601, + "language_loss": 0.68665648, + "learning_rate": 5.643762279912146e-10, + "loss": 0.76336384, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09893799, + "step": 16510, + "time_per_iteration": 2.8475050926208496 + }, + { + "auxiliary_loss_clip": 0.06405565, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06273197, + "balance_loss_mlp": 0.01254261, + "epoch": 0.9926950248008417, + "flos": 20748163000320.0, + "grad_norm": 2.178338500168841, + "language_loss": 0.81844068, + "learning_rate": 5.551625032997886e-10, + "loss": 0.89513707, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09814453, + "step": 16511, + "time_per_iteration": 2.5016791820526123 + }, + { + "auxiliary_loss_clip": 0.06398685, + "auxiliary_loss_mlp": 0.01262596, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01254126, + "epoch": 0.9927551480535097, + "flos": 24359874664320.0, + "grad_norm": 1.862945910053827, + "language_loss": 0.91819113, + "learning_rate": 5.460245977570998e-10, + "loss": 0.99480402, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08459473, + "step": 16512, + "time_per_iteration": 2.534518003463745 + }, + { + "auxiliary_loss_clip": 0.06313323, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06259029, + "balance_loss_mlp": 0.01261737, + "epoch": 0.9928152713061776, + "flos": 71296751543040.0, + "grad_norm": 0.6913965440802265, + "language_loss": 0.55126524, + "learning_rate": 5.369625117095378e-10, + "loss": 0.62702584, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.00999451, + "step": 16513, + "time_per_iteration": 3.224245071411133 + }, + { + "auxiliary_loss_clip": 0.06400467, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06269906, + "balance_loss_mlp": 0.0125437, + "epoch": 0.9928753945588457, + "flos": 57821850650880.0, + "grad_norm": 1.4693700782931527, + "language_loss": 0.6477679, + "learning_rate": 5.279762455006054e-10, + "loss": 0.72441554, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.0993042, + "step": 16514, + "time_per_iteration": 2.8541526794433594 + }, + { + "auxiliary_loss_clip": 0.06402757, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01257296, + "epoch": 0.9929355178115136, + "flos": 19574363479680.0, + "grad_norm": 1.8661171060371329, + "language_loss": 0.73515117, + "learning_rate": 5.190657994713632e-10, + "loss": 0.81185454, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10296631, + "step": 16515, + "time_per_iteration": 2.4777932167053223 + }, + { + "auxiliary_loss_clip": 0.06404781, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06273709, + "balance_loss_mlp": 0.01256732, + "epoch": 0.9929956410641816, + "flos": 22971026839680.0, + "grad_norm": 1.348064631886549, + "language_loss": 0.77389991, + "learning_rate": 5.102311739593191e-10, + "loss": 0.85061419, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09912109, + "step": 16516, + "time_per_iteration": 2.5608432292938232 + }, + { + "auxiliary_loss_clip": 0.06398392, + "auxiliary_loss_mlp": 0.01266123, + "balance_loss_clip": 0.06268544, + "balance_loss_mlp": 0.01256968, + "epoch": 0.9930557643168495, + "flos": 22573228780800.0, + "grad_norm": 1.58329129583989, + "language_loss": 0.78152323, + "learning_rate": 5.014723692997602e-10, + "loss": 0.85816836, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09161377, + "step": 16517, + "time_per_iteration": 2.528740167617798 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01267301, + "balance_loss_clip": 0.06271128, + "balance_loss_mlp": 0.01257741, + "epoch": 0.9931158875695175, + "flos": 17206624730880.0, + "grad_norm": 1.9618850991719492, + "language_loss": 0.67701828, + "learning_rate": 4.927893858248655e-10, + "loss": 0.75376302, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09564209, + "step": 16518, + "time_per_iteration": 2.548466205596924 + }, + { + "auxiliary_loss_clip": 0.06309474, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06255181, + "balance_loss_mlp": 0.01252109, + "epoch": 0.9931760108221854, + "flos": 63729142369920.0, + "grad_norm": 0.7167826797108764, + "language_loss": 0.53387469, + "learning_rate": 4.84182223863483e-10, + "loss": 0.60950041, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 16519, + "time_per_iteration": 3.033399820327759 + }, + { + "auxiliary_loss_clip": 0.06400052, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01256076, + "epoch": 0.9932361340748534, + "flos": 15310253525760.0, + "grad_norm": 1.8743335975768634, + "language_loss": 0.60528129, + "learning_rate": 4.756508837426842e-10, + "loss": 0.68193108, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08850098, + "step": 16520, + "time_per_iteration": 2.517338514328003 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01264223, + "balance_loss_clip": 0.06270006, + "balance_loss_mlp": 0.01254859, + "epoch": 0.9932962573275215, + "flos": 36073776551040.0, + "grad_norm": 1.8911026139940599, + "language_loss": 0.62225491, + "learning_rate": 4.671953657853223e-10, + "loss": 0.69891393, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09356689, + "step": 16521, + "time_per_iteration": 2.6538894176483154 + }, + { + "auxiliary_loss_clip": 0.06403658, + "auxiliary_loss_mlp": 0.01264234, + "balance_loss_clip": 0.0626916, + "balance_loss_mlp": 0.01254268, + "epoch": 0.9933563805801894, + "flos": 21476939886720.0, + "grad_norm": 1.7541359047868907, + "language_loss": 0.74881208, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.82549095, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09973145, + "step": 16522, + "time_per_iteration": 3.9640562534332275 + }, + { + "auxiliary_loss_clip": 0.06399375, + "auxiliary_loss_mlp": 0.01265263, + "balance_loss_clip": 0.06270382, + "balance_loss_mlp": 0.01255673, + "epoch": 0.9934165038328574, + "flos": 23993117343360.0, + "grad_norm": 1.410603102343642, + "language_loss": 0.73254204, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.80918843, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0960083, + "step": 16523, + "time_per_iteration": 2.5403733253479004 + }, + { + "auxiliary_loss_clip": 0.06399913, + "auxiliary_loss_mlp": 0.01262714, + "balance_loss_clip": 0.06268643, + "balance_loss_mlp": 0.0125332, + "epoch": 0.9934766270855253, + "flos": 21914206018560.0, + "grad_norm": 1.45594715483847, + "language_loss": 0.71754086, + "learning_rate": 4.422837480875241e-10, + "loss": 0.79416716, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09399414, + "step": 16524, + "time_per_iteration": 2.49554181098938 + }, + { + "auxiliary_loss_clip": 0.06401279, + "auxiliary_loss_mlp": 0.01261178, + "balance_loss_clip": 0.06269774, + "balance_loss_mlp": 0.01252035, + "epoch": 0.9935367503381933, + "flos": 17134900035840.0, + "grad_norm": 1.9388609047910152, + "language_loss": 0.79724878, + "learning_rate": 4.341315219624775e-10, + "loss": 0.87387335, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09143066, + "step": 16525, + "time_per_iteration": 2.4689719676971436 + }, + { + "auxiliary_loss_clip": 0.06404391, + "auxiliary_loss_mlp": 0.0126308, + "balance_loss_clip": 0.06273483, + "balance_loss_mlp": 0.01253841, + "epoch": 0.9935968735908612, + "flos": 22352813815680.0, + "grad_norm": 2.8533353027739246, + "language_loss": 0.74970055, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.82637525, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09240723, + "step": 16526, + "time_per_iteration": 2.5054593086242676 + }, + { + "auxiliary_loss_clip": 0.06396805, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06269647, + "balance_loss_mlp": 0.0125765, + "epoch": 0.9936569968435293, + "flos": 29468230830720.0, + "grad_norm": 1.4052771435638536, + "language_loss": 0.72989619, + "learning_rate": 4.180545412333369e-10, + "loss": 0.8065325, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.0916748, + "step": 16527, + "time_per_iteration": 2.5682404041290283 + }, + { + "auxiliary_loss_clip": 0.06403709, + "auxiliary_loss_mlp": 0.01263759, + "balance_loss_clip": 0.06269115, + "balance_loss_mlp": 0.0125427, + "epoch": 0.9937171200961972, + "flos": 16549488685440.0, + "grad_norm": 2.193816392359614, + "language_loss": 0.7689482, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.8456229, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09490967, + "step": 16528, + "time_per_iteration": 2.4579873085021973 + }, + { + "auxiliary_loss_clip": 0.06405492, + "auxiliary_loss_mlp": 0.01264731, + "balance_loss_clip": 0.06270828, + "balance_loss_mlp": 0.01253919, + "epoch": 0.9937772433488652, + "flos": 24397330239360.0, + "grad_norm": 2.1223276204344494, + "language_loss": 0.68164897, + "learning_rate": 4.022808578922898e-10, + "loss": 0.75835121, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10809326, + "step": 16529, + "time_per_iteration": 2.5395190715789795 + }, + { + "auxiliary_loss_clip": 0.06410487, + "auxiliary_loss_mlp": 0.01266372, + "balance_loss_clip": 0.062738, + "balance_loss_mlp": 0.01255357, + "epoch": 0.9938373666015331, + "flos": 15675459546240.0, + "grad_norm": 1.9586531091846018, + "language_loss": 0.65134317, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.7281118, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11016846, + "step": 16530, + "time_per_iteration": 2.4729955196380615 + }, + { + "auxiliary_loss_clip": 0.06402886, + "auxiliary_loss_mlp": 0.0126345, + "balance_loss_clip": 0.06270776, + "balance_loss_mlp": 0.01254307, + "epoch": 0.9938974898542011, + "flos": 19501590608640.0, + "grad_norm": 1.9185750704175901, + "language_loss": 0.71495968, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.79162306, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.0914917, + "step": 16531, + "time_per_iteration": 2.512540340423584 + }, + { + "auxiliary_loss_clip": 0.06404379, + "auxiliary_loss_mlp": 0.01270128, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01260485, + "epoch": 0.993957613106869, + "flos": 26914220455680.0, + "grad_norm": 1.3658354956475158, + "language_loss": 0.74276423, + "learning_rate": 3.791890207045512e-10, + "loss": 0.81950933, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09643555, + "step": 16532, + "time_per_iteration": 4.052224397659302 + }, + { + "auxiliary_loss_clip": 0.06394548, + "auxiliary_loss_mlp": 0.01260602, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.01252836, + "epoch": 0.994017736359537, + "flos": 14944921724160.0, + "grad_norm": 1.627443205614894, + "language_loss": 0.70665741, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.78320897, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.07769775, + "step": 16533, + "time_per_iteration": 2.5302624702453613 + }, + { + "auxiliary_loss_clip": 0.0640518, + "auxiliary_loss_mlp": 0.01263424, + "balance_loss_clip": 0.06270548, + "balance_loss_mlp": 0.01253088, + "epoch": 0.9940778596122051, + "flos": 15383361813120.0, + "grad_norm": 2.0547818206893362, + "language_loss": 0.84855843, + "learning_rate": 3.641735912007782e-10, + "loss": 0.92524445, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10321045, + "step": 16534, + "time_per_iteration": 2.528353452682495 + }, + { + "auxiliary_loss_clip": 0.06395555, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06271446, + "balance_loss_mlp": 0.01255118, + "epoch": 0.994137982864873, + "flos": 25235077760640.0, + "grad_norm": 1.3590448936998143, + "language_loss": 0.66083765, + "learning_rate": 3.567796158934211e-10, + "loss": 0.73743457, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.09020996, + "step": 16535, + "time_per_iteration": 2.5445032119750977 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01261695, + "balance_loss_clip": 0.06271672, + "balance_loss_mlp": 0.01253261, + "epoch": 0.994198106117541, + "flos": 18448040096640.0, + "grad_norm": 1.492382097158509, + "language_loss": 0.64826763, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.72488862, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08435059, + "step": 16536, + "time_per_iteration": 2.4790172576904297 + }, + { + "auxiliary_loss_clip": 0.063953, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06267138, + "balance_loss_mlp": 0.01257325, + "epoch": 0.9942582293702089, + "flos": 16659675204480.0, + "grad_norm": 1.6373933785699804, + "language_loss": 0.79013014, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.86674595, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08959961, + "step": 16537, + "time_per_iteration": 3.9477853775024414 + }, + { + "auxiliary_loss_clip": 0.06408946, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01255249, + "epoch": 0.9943183526228769, + "flos": 21951032688000.0, + "grad_norm": 1.5863603537176718, + "language_loss": 0.68719506, + "learning_rate": 3.35052651107004e-10, + "loss": 0.76394421, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10717773, + "step": 16538, + "time_per_iteration": 2.560777187347412 + }, + { + "auxiliary_loss_clip": 0.06395986, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06270739, + "balance_loss_mlp": 0.01255543, + "epoch": 0.9943784758755448, + "flos": 23849458318080.0, + "grad_norm": 1.9320392025007822, + "language_loss": 0.75314772, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.82975346, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.0904541, + "step": 16539, + "time_per_iteration": 2.5456387996673584 + }, + { + "auxiliary_loss_clip": 0.06405414, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01254414, + "epoch": 0.9944385991282129, + "flos": 21476310981120.0, + "grad_norm": 2.0585320600581287, + "language_loss": 0.70989531, + "learning_rate": 3.209471449341361e-10, + "loss": 0.78658462, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09100342, + "step": 16540, + "time_per_iteration": 2.56339955329895 + }, + { + "auxiliary_loss_clip": 0.06397563, + "auxiliary_loss_mlp": 0.01263176, + "balance_loss_clip": 0.06268452, + "balance_loss_mlp": 0.01254253, + "epoch": 0.9944987223808808, + "flos": 22933193921280.0, + "grad_norm": 1.9177356075251677, + "language_loss": 0.75796914, + "learning_rate": 3.140081337600353e-10, + "loss": 0.83457649, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08935547, + "step": 16541, + "time_per_iteration": 2.5349810123443604 + }, + { + "auxiliary_loss_clip": 0.06401087, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01254931, + "epoch": 0.9945588456335488, + "flos": 22389640485120.0, + "grad_norm": 1.8943263701308943, + "language_loss": 0.76886356, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.84552217, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09851074, + "step": 16542, + "time_per_iteration": 2.5378565788269043 + }, + { + "auxiliary_loss_clip": 0.06405424, + "auxiliary_loss_mlp": 0.01264121, + "balance_loss_clip": 0.06272386, + "balance_loss_mlp": 0.01253923, + "epoch": 0.9946189688862167, + "flos": 21403915453440.0, + "grad_norm": 1.9598697762283788, + "language_loss": 0.75353408, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.83022952, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10205078, + "step": 16543, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06407975, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06271738, + "balance_loss_mlp": 0.01253195, + "epoch": 0.9946790921388847, + "flos": 12420526567680.0, + "grad_norm": 3.189833149975994, + "language_loss": 0.81971997, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.89643455, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10296631, + "step": 16544, + "time_per_iteration": 2.595747470855713 + }, + { + "auxiliary_loss_clip": 0.06400429, + "auxiliary_loss_mlp": 0.01262665, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01253426, + "epoch": 0.9947392153915526, + "flos": 19063611717120.0, + "grad_norm": 1.7015698654881692, + "language_loss": 0.79186726, + "learning_rate": 2.870103745831187e-10, + "loss": 0.86849821, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09234619, + "step": 16545, + "time_per_iteration": 3.9479551315307617 + }, + { + "auxiliary_loss_clip": 0.06405969, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.06272364, + "balance_loss_mlp": 0.01253288, + "epoch": 0.9947993386442207, + "flos": 27316295072640.0, + "grad_norm": 1.803677846508674, + "language_loss": 0.72430396, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.80098879, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09225464, + "step": 16546, + "time_per_iteration": 2.5391626358032227 + }, + { + "auxiliary_loss_clip": 0.064002, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06271345, + "balance_loss_mlp": 0.01253336, + "epoch": 0.9948594618968887, + "flos": 20811586141440.0, + "grad_norm": 1.9161103078847286, + "language_loss": 0.77849805, + "learning_rate": 2.739664698798716e-10, + "loss": 0.85512525, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09185791, + "step": 16547, + "time_per_iteration": 2.5525412559509277 + }, + { + "auxiliary_loss_clip": 0.06404825, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.06271931, + "balance_loss_mlp": 0.01255348, + "epoch": 0.9949195851495566, + "flos": 23299364263680.0, + "grad_norm": 2.386588561491637, + "language_loss": 0.70458543, + "learning_rate": 2.67558262122769e-10, + "loss": 0.78127539, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08825684, + "step": 16548, + "time_per_iteration": 2.487410306930542 + }, + { + "auxiliary_loss_clip": 0.06400785, + "auxiliary_loss_mlp": 0.01262532, + "balance_loss_clip": 0.06270401, + "balance_loss_mlp": 0.01253395, + "epoch": 0.9949797084022246, + "flos": 18521441873280.0, + "grad_norm": 1.807359351493948, + "language_loss": 0.75424373, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.83087683, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09136963, + "step": 16549, + "time_per_iteration": 2.479980230331421 + }, + { + "auxiliary_loss_clip": 0.06405661, + "auxiliary_loss_mlp": 0.01265691, + "balance_loss_clip": 0.06271679, + "balance_loss_mlp": 0.01255302, + "epoch": 0.9950398316548925, + "flos": 30415326330240.0, + "grad_norm": 3.759297696668105, + "language_loss": 0.74710596, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.82381952, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1038208, + "step": 16550, + "time_per_iteration": 2.5653908252716064 + }, + { + "auxiliary_loss_clip": 0.06400557, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06270449, + "balance_loss_mlp": 0.01254406, + "epoch": 0.9950999549075605, + "flos": 19906893607680.0, + "grad_norm": 1.505447061940754, + "language_loss": 0.78210282, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.85874045, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08795166, + "step": 16551, + "time_per_iteration": 2.502298593521118 + }, + { + "auxiliary_loss_clip": 0.06394917, + "auxiliary_loss_mlp": 0.01266008, + "balance_loss_clip": 0.06272274, + "balance_loss_mlp": 0.01258039, + "epoch": 0.9951600781602284, + "flos": 17609412107520.0, + "grad_norm": 1.3321877426988011, + "language_loss": 0.66736692, + "learning_rate": 2.426837340270271e-10, + "loss": 0.74397612, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.07971191, + "step": 16552, + "time_per_iteration": 2.5603482723236084 + }, + { + "auxiliary_loss_clip": 0.06401337, + "auxiliary_loss_mlp": 0.01263383, + "balance_loss_clip": 0.06268856, + "balance_loss_mlp": 0.01254255, + "epoch": 0.9952202014128965, + "flos": 28958485317120.0, + "grad_norm": 1.2833907558121627, + "language_loss": 0.81770164, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.89434886, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09127808, + "step": 16553, + "time_per_iteration": 2.576486825942993 + }, + { + "auxiliary_loss_clip": 0.06314038, + "auxiliary_loss_mlp": 0.01256617, + "balance_loss_clip": 0.06259907, + "balance_loss_mlp": 0.01255608, + "epoch": 0.9952803246655644, + "flos": 70833014720640.0, + "grad_norm": 0.732372532890913, + "language_loss": 0.57316744, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.64887393, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.54199219, + "router_z_loss_mlp": 0.01009369, + "step": 16554, + "time_per_iteration": 3.206148147583008 + }, + { + "auxiliary_loss_clip": 0.0640728, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06274211, + "balance_loss_mlp": 0.01258307, + "epoch": 0.9953404479182324, + "flos": 21805570800000.0, + "grad_norm": 1.762448547669116, + "language_loss": 0.77524269, + "learning_rate": 2.24824062597051e-10, + "loss": 0.85199058, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09197998, + "step": 16555, + "time_per_iteration": 2.519479274749756 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06269628, + "balance_loss_mlp": 0.01258355, + "epoch": 0.9954005711709003, + "flos": 21942647280000.0, + "grad_norm": 2.0814748850322156, + "language_loss": 0.86322951, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.93991506, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09399414, + "step": 16556, + "time_per_iteration": 2.5571491718292236 + }, + { + "auxiliary_loss_clip": 0.06396689, + "auxiliary_loss_mlp": 0.0126468, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01255734, + "epoch": 0.9954606944235683, + "flos": 19360656840960.0, + "grad_norm": 1.6249222072461627, + "language_loss": 0.72927034, + "learning_rate": 2.132967729762125e-10, + "loss": 0.805884, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08947754, + "step": 16557, + "time_per_iteration": 2.5323092937469482 + }, + { + "auxiliary_loss_clip": 0.06396444, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06270406, + "balance_loss_mlp": 0.01255772, + "epoch": 0.9955208176762362, + "flos": 30526477171200.0, + "grad_norm": 1.7597019969018155, + "language_loss": 0.7678116, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.84441775, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08395386, + "step": 16558, + "time_per_iteration": 2.5865776538848877 + }, + { + "auxiliary_loss_clip": 0.06402529, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06270144, + "balance_loss_mlp": 0.0125397, + "epoch": 0.9955809409289043, + "flos": 30016102752000.0, + "grad_norm": 1.795429364473874, + "language_loss": 0.63227272, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.70893526, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09759521, + "step": 16559, + "time_per_iteration": 2.5742897987365723 + }, + { + "auxiliary_loss_clip": 0.06398577, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01257189, + "epoch": 0.9956410641815723, + "flos": 21549670830720.0, + "grad_norm": 1.7103757872781167, + "language_loss": 0.7445935, + "learning_rate": 1.965745799148433e-10, + "loss": 0.82124352, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09240723, + "step": 16560, + "time_per_iteration": 2.4979653358459473 + }, + { + "auxiliary_loss_clip": 0.06398787, + "auxiliary_loss_mlp": 0.01262797, + "balance_loss_clip": 0.06268584, + "balance_loss_mlp": 0.01253695, + "epoch": 0.9957011874342402, + "flos": 21695929332480.0, + "grad_norm": 1.6604206822913, + "language_loss": 0.79359847, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.87021428, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 16561, + "time_per_iteration": 2.5015368461608887 + }, + { + "auxiliary_loss_clip": 0.06396727, + "auxiliary_loss_mlp": 0.01263664, + "balance_loss_clip": 0.06269149, + "balance_loss_mlp": 0.01255087, + "epoch": 0.9957613106869082, + "flos": 17706810879360.0, + "grad_norm": 2.6002951438446718, + "language_loss": 0.65660673, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.73321062, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08569336, + "step": 16562, + "time_per_iteration": 3.870602607727051 + }, + { + "auxiliary_loss_clip": 0.06403091, + "auxiliary_loss_mlp": 0.01267098, + "balance_loss_clip": 0.06268853, + "balance_loss_mlp": 0.0125743, + "epoch": 0.9958214339395761, + "flos": 30564016600320.0, + "grad_norm": 1.8613517918936233, + "language_loss": 0.64495075, + "learning_rate": 1.805348815528962e-10, + "loss": 0.72165263, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09655762, + "step": 16563, + "time_per_iteration": 2.5799973011016846 + }, + { + "auxiliary_loss_clip": 0.06400756, + "auxiliary_loss_mlp": 0.01266447, + "balance_loss_clip": 0.06270421, + "balance_loss_mlp": 0.01257149, + "epoch": 0.9958815571922441, + "flos": 24175825171200.0, + "grad_norm": 1.467683459705596, + "language_loss": 0.65106744, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.72773945, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09301758, + "step": 16564, + "time_per_iteration": 2.638465404510498 + }, + { + "auxiliary_loss_clip": 0.06400171, + "auxiliary_loss_mlp": 0.01265945, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01256485, + "epoch": 0.995941680444912, + "flos": 15492458229120.0, + "grad_norm": 1.759207175120335, + "language_loss": 0.74907964, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.82574081, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09454346, + "step": 16565, + "time_per_iteration": 2.513090133666992 + }, + { + "auxiliary_loss_clip": 0.06399176, + "auxiliary_loss_mlp": 0.0126437, + "balance_loss_clip": 0.06268928, + "balance_loss_mlp": 0.01255316, + "epoch": 0.9960018036975801, + "flos": 18626597147520.0, + "grad_norm": 1.999585355447059, + "language_loss": 0.79579604, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.87243158, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.0904541, + "step": 16566, + "time_per_iteration": 2.4897356033325195 + }, + { + "auxiliary_loss_clip": 0.0639536, + "auxiliary_loss_mlp": 0.01264477, + "balance_loss_clip": 0.0626775, + "balance_loss_mlp": 0.0125579, + "epoch": 0.996061926950248, + "flos": 20090314195200.0, + "grad_norm": 1.6960254260383738, + "language_loss": 0.71283329, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.78943169, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08688354, + "step": 16567, + "time_per_iteration": 2.4799892902374268 + }, + { + "auxiliary_loss_clip": 0.06404319, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01257883, + "epoch": 0.996122050202916, + "flos": 24353879097600.0, + "grad_norm": 2.5978628938543085, + "language_loss": 0.78895438, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.86567795, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10150146, + "step": 16568, + "time_per_iteration": 2.606168270111084 + }, + { + "auxiliary_loss_clip": 0.06396884, + "auxiliary_loss_mlp": 0.01265522, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.01257404, + "epoch": 0.9961821734555839, + "flos": 24204895338240.0, + "grad_norm": 1.7013100229361442, + "language_loss": 0.82422203, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.90084612, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08123779, + "step": 16569, + "time_per_iteration": 2.7738900184631348 + }, + { + "auxiliary_loss_clip": 0.06398392, + "auxiliary_loss_mlp": 0.01264989, + "balance_loss_clip": 0.06272103, + "balance_loss_mlp": 0.01256334, + "epoch": 0.9962422967082519, + "flos": 22639628741760.0, + "grad_norm": 1.7744118698102032, + "language_loss": 0.70764375, + "learning_rate": 1.457630950747468e-10, + "loss": 0.78427756, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08654785, + "step": 16570, + "time_per_iteration": 2.5719547271728516 + }, + { + "auxiliary_loss_clip": 0.06403951, + "auxiliary_loss_mlp": 0.01267572, + "balance_loss_clip": 0.06273632, + "balance_loss_mlp": 0.01257731, + "epoch": 0.9963024199609198, + "flos": 26403259057920.0, + "grad_norm": 1.528477322587173, + "language_loss": 0.7513268, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.82804203, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09838867, + "step": 16571, + "time_per_iteration": 2.541731595993042 + }, + { + "auxiliary_loss_clip": 0.06398408, + "auxiliary_loss_mlp": 0.01263982, + "balance_loss_clip": 0.06269142, + "balance_loss_mlp": 0.0125497, + "epoch": 0.9963625432135879, + "flos": 16587153895680.0, + "grad_norm": 2.215286054451634, + "language_loss": 0.79922211, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.87584603, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09014893, + "step": 16572, + "time_per_iteration": 3.920409917831421 + }, + { + "auxiliary_loss_clip": 0.06398214, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06270143, + "balance_loss_mlp": 0.01255115, + "epoch": 0.9964226664662559, + "flos": 26475696512640.0, + "grad_norm": 1.736067517515339, + "language_loss": 0.70695126, + "learning_rate": 1.3199841727074e-10, + "loss": 0.78357947, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09484863, + "step": 16573, + "time_per_iteration": 2.518183469772339 + }, + { + "auxiliary_loss_clip": 0.0640582, + "auxiliary_loss_mlp": 0.01268788, + "balance_loss_clip": 0.06269335, + "balance_loss_mlp": 0.01258667, + "epoch": 0.9964827897189238, + "flos": 27454755144960.0, + "grad_norm": 1.5539558414743522, + "language_loss": 0.63445759, + "learning_rate": 1.275618614968721e-10, + "loss": 0.71120363, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10119629, + "step": 16574, + "time_per_iteration": 2.5316076278686523 + }, + { + "auxiliary_loss_clip": 0.06409708, + "auxiliary_loss_mlp": 0.01264743, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.01254437, + "epoch": 0.9965429129715918, + "flos": 11725138333440.0, + "grad_norm": 2.458562193325811, + "language_loss": 0.76547927, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.84222376, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10308838, + "step": 16575, + "time_per_iteration": 2.4830782413482666 + }, + { + "auxiliary_loss_clip": 0.06404927, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06273346, + "balance_loss_mlp": 0.0125569, + "epoch": 0.9966030362242597, + "flos": 19762186406400.0, + "grad_norm": 1.68476172893604, + "language_loss": 0.70171261, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.77841902, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10021973, + "step": 16576, + "time_per_iteration": 2.4806809425354004 + }, + { + "auxiliary_loss_clip": 0.06396693, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06271367, + "balance_loss_mlp": 0.01257466, + "epoch": 0.9966631594769277, + "flos": 23922021553920.0, + "grad_norm": 1.514407622643374, + "language_loss": 0.72368443, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.80031562, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08966064, + "step": 16577, + "time_per_iteration": 3.9655070304870605 + }, + { + "auxiliary_loss_clip": 0.06402753, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_clip": 0.06271574, + "balance_loss_mlp": 0.01258356, + "epoch": 0.9967232827295956, + "flos": 15564979537920.0, + "grad_norm": 1.8728768870401162, + "language_loss": 0.79020208, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.86690605, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09295654, + "step": 16578, + "time_per_iteration": 2.465836524963379 + }, + { + "auxiliary_loss_clip": 0.06401658, + "auxiliary_loss_mlp": 0.01263584, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254334, + "epoch": 0.9967834059822637, + "flos": 20819216862720.0, + "grad_norm": 1.5291705366711337, + "language_loss": 0.7613309, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.83798331, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09246826, + "step": 16579, + "time_per_iteration": 2.563567876815796 + }, + { + "auxiliary_loss_clip": 0.06405777, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01257476, + "epoch": 0.9968435292349316, + "flos": 36727809995520.0, + "grad_norm": 2.038574869304339, + "language_loss": 0.69993865, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.77667433, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10321045, + "step": 16580, + "time_per_iteration": 2.6372199058532715 + }, + { + "auxiliary_loss_clip": 0.06400767, + "auxiliary_loss_mlp": 0.01267201, + "balance_loss_clip": 0.06268618, + "balance_loss_mlp": 0.01257908, + "epoch": 0.9969036524875996, + "flos": 26768213516160.0, + "grad_norm": 1.8631774429365007, + "language_loss": 0.80034542, + "learning_rate": 9.862937031113184e-11, + "loss": 0.87702513, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09295654, + "step": 16581, + "time_per_iteration": 2.586275815963745 + }, + { + "auxiliary_loss_clip": 0.06398311, + "auxiliary_loss_mlp": 0.01263701, + "balance_loss_clip": 0.06269346, + "balance_loss_mlp": 0.01254796, + "epoch": 0.9969637757402675, + "flos": 24834219027840.0, + "grad_norm": 1.5224635632541534, + "language_loss": 0.80819917, + "learning_rate": 9.479950191249031e-11, + "loss": 0.88481927, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08898926, + "step": 16582, + "time_per_iteration": 2.5423171520233154 + }, + { + "auxiliary_loss_clip": 0.06398583, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_clip": 0.06271505, + "balance_loss_mlp": 0.01253851, + "epoch": 0.9970238989929355, + "flos": 23045309084160.0, + "grad_norm": 1.6463581574005606, + "language_loss": 0.60997719, + "learning_rate": 9.104547011951069e-11, + "loss": 0.68659008, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08856201, + "step": 16583, + "time_per_iteration": 2.5455894470214844 + }, + { + "auxiliary_loss_clip": 0.06403083, + "auxiliary_loss_mlp": 0.01263359, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01254263, + "epoch": 0.9970840222456034, + "flos": 25305418863360.0, + "grad_norm": 1.6487266342882827, + "language_loss": 0.78016913, + "learning_rate": 8.736727507452357e-11, + "loss": 0.85683358, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09094238, + "step": 16584, + "time_per_iteration": 3.9929842948913574 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01257322, + "epoch": 0.9971441454982715, + "flos": 21621898650240.0, + "grad_norm": 1.4117034682008287, + "language_loss": 0.69645995, + "learning_rate": 8.376491691697297e-11, + "loss": 0.77313489, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.0859375, + "step": 16585, + "time_per_iteration": 2.5167651176452637 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01263258, + "balance_loss_clip": 0.06271613, + "balance_loss_mlp": 0.01253739, + "epoch": 0.9972042687509394, + "flos": 14980867925760.0, + "grad_norm": 2.0755557682308963, + "language_loss": 0.81635392, + "learning_rate": 8.023839578363834e-11, + "loss": 0.89298004, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09515381, + "step": 16586, + "time_per_iteration": 2.5256056785583496 + }, + { + "auxiliary_loss_clip": 0.06401335, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01253904, + "epoch": 0.9972643920036074, + "flos": 25812858389760.0, + "grad_norm": 1.749230535961165, + "language_loss": 0.78177583, + "learning_rate": 7.678771180796851e-11, + "loss": 0.85841757, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08929443, + "step": 16587, + "time_per_iteration": 2.542367696762085 + }, + { + "auxiliary_loss_clip": 0.06403758, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06272189, + "balance_loss_mlp": 0.01260162, + "epoch": 0.9973245152562754, + "flos": 23332124010240.0, + "grad_norm": 1.844090752894055, + "language_loss": 0.72692442, + "learning_rate": 7.341286512074773e-11, + "loss": 0.8036567, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09307861, + "step": 16588, + "time_per_iteration": 2.5101404190063477 + }, + { + "auxiliary_loss_clip": 0.06406671, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.0626927, + "balance_loss_mlp": 0.01255702, + "epoch": 0.9973846385089433, + "flos": 12170999508480.0, + "grad_norm": 2.447200723458138, + "language_loss": 0.82740468, + "learning_rate": 7.011385585031781e-11, + "loss": 0.90412831, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09991455, + "step": 16589, + "time_per_iteration": 2.452146053314209 + }, + { + "auxiliary_loss_clip": 0.06408519, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253317, + "epoch": 0.9974447617616113, + "flos": 20050929976320.0, + "grad_norm": 2.015417296795279, + "language_loss": 0.70627606, + "learning_rate": 6.689068412168986e-11, + "loss": 0.78300673, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11236572, + "step": 16590, + "time_per_iteration": 2.5470008850097656 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01263511, + "balance_loss_clip": 0.06271853, + "balance_loss_mlp": 0.01253945, + "epoch": 0.9975048850142793, + "flos": 32022744330240.0, + "grad_norm": 1.7156925678226993, + "language_loss": 0.63968062, + "learning_rate": 6.374335005676634e-11, + "loss": 0.71636689, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09570312, + "step": 16591, + "time_per_iteration": 2.5790483951568604 + }, + { + "auxiliary_loss_clip": 0.06401144, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06268914, + "balance_loss_mlp": 0.01258552, + "epoch": 0.9975650082669473, + "flos": 36941600488320.0, + "grad_norm": 1.6209737833273146, + "language_loss": 0.7318058, + "learning_rate": 6.067185377522933e-11, + "loss": 0.80849206, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08929443, + "step": 16592, + "time_per_iteration": 2.6874001026153564 + }, + { + "auxiliary_loss_clip": 0.06400564, + "auxiliary_loss_mlp": 0.01264326, + "balance_loss_clip": 0.06268974, + "balance_loss_mlp": 0.01254747, + "epoch": 0.9976251315196152, + "flos": 16477722063360.0, + "grad_norm": 1.4238943744939072, + "language_loss": 0.8514542, + "learning_rate": 5.767619539343016e-11, + "loss": 0.92810309, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0958252, + "step": 16593, + "time_per_iteration": 2.500425338745117 + }, + { + "auxiliary_loss_clip": 0.06400873, + "auxiliary_loss_mlp": 0.01266904, + "balance_loss_clip": 0.06271567, + "balance_loss_mlp": 0.01258059, + "epoch": 0.9976852547722832, + "flos": 19653048063360.0, + "grad_norm": 1.552542866202301, + "language_loss": 0.69804668, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.77472448, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08837891, + "step": 16594, + "time_per_iteration": 2.499431610107422 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01267202, + "balance_loss_clip": 0.06269734, + "balance_loss_mlp": 0.01257862, + "epoch": 0.9977453780249511, + "flos": 20454597820800.0, + "grad_norm": 1.975113527631894, + "language_loss": 0.73193353, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.80864441, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09338379, + "step": 16595, + "time_per_iteration": 2.4891738891601562 + }, + { + "auxiliary_loss_clip": 0.06308937, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06254812, + "balance_loss_mlp": 0.01249741, + "epoch": 0.9978055012776191, + "flos": 65472085549440.0, + "grad_norm": 0.766579678458714, + "language_loss": 0.60467255, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.68026978, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01044464, + "step": 16596, + "time_per_iteration": 3.0071284770965576 + }, + { + "auxiliary_loss_clip": 0.06399873, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.06268875, + "balance_loss_mlp": 0.01256431, + "epoch": 0.997865624530287, + "flos": 20637808773120.0, + "grad_norm": 1.6512537501923108, + "language_loss": 0.77633482, + "learning_rate": 4.645194309227385e-11, + "loss": 0.8529911, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09320068, + "step": 16597, + "time_per_iteration": 2.4939262866973877 + }, + { + "auxiliary_loss_clip": 0.06402931, + "auxiliary_loss_mlp": 0.0126284, + "balance_loss_clip": 0.06270836, + "balance_loss_mlp": 0.01253089, + "epoch": 0.9979257477829551, + "flos": 29394703272960.0, + "grad_norm": 1.6822966575262215, + "language_loss": 0.82273138, + "learning_rate": 4.383547585562475e-11, + "loss": 0.89938903, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09753418, + "step": 16598, + "time_per_iteration": 2.57281494140625 + }, + { + "auxiliary_loss_clip": 0.06406281, + "auxiliary_loss_mlp": 0.01268438, + "balance_loss_clip": 0.06270172, + "balance_loss_mlp": 0.01257631, + "epoch": 0.997985871035623, + "flos": 22641180042240.0, + "grad_norm": 1.9442107163563487, + "language_loss": 0.65055943, + "learning_rate": 4.129484715709175e-11, + "loss": 0.7273066, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10803223, + "step": 16599, + "time_per_iteration": 2.5110907554626465 + }, + { + "auxiliary_loss_clip": 0.06311208, + "auxiliary_loss_mlp": 0.01254339, + "balance_loss_clip": 0.06257115, + "balance_loss_mlp": 0.0125329, + "epoch": 0.998045994288291, + "flos": 61823421434880.0, + "grad_norm": 0.8427819693945304, + "language_loss": 0.62358809, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.69924355, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01049805, + "step": 16600, + "time_per_iteration": 3.0938379764556885 + }, + { + "auxiliary_loss_clip": 0.06400381, + "auxiliary_loss_mlp": 0.01262242, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.01253212, + "epoch": 0.998106117540959, + "flos": 19251686206080.0, + "grad_norm": 1.6103653898018497, + "language_loss": 0.78675622, + "learning_rate": 3.644110575717896e-11, + "loss": 0.86338246, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09033203, + "step": 16601, + "time_per_iteration": 2.6078760623931885 + }, + { + "auxiliary_loss_clip": 0.06409572, + "auxiliary_loss_mlp": 0.01264878, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01255484, + "epoch": 0.9981662407936269, + "flos": 21112656261120.0, + "grad_norm": 1.8622477211411699, + "language_loss": 0.82537067, + "learning_rate": 3.412799323987414e-11, + "loss": 0.90211511, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09393311, + "step": 16602, + "time_per_iteration": 3.946537494659424 + }, + { + "auxiliary_loss_clip": 0.06402224, + "auxiliary_loss_mlp": 0.01264728, + "balance_loss_clip": 0.06271221, + "balance_loss_mlp": 0.01255478, + "epoch": 0.998226364046295, + "flos": 24323802681600.0, + "grad_norm": 2.0035158293659663, + "language_loss": 0.62724072, + "learning_rate": 3.189071962883538e-11, + "loss": 0.70391023, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0925293, + "step": 16603, + "time_per_iteration": 2.5347180366516113 + }, + { + "auxiliary_loss_clip": 0.06403463, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06271768, + "balance_loss_mlp": 0.01253836, + "epoch": 0.9982864872989629, + "flos": 23842246867200.0, + "grad_norm": 1.8205508857856618, + "language_loss": 0.71622694, + "learning_rate": 2.972928500866168e-11, + "loss": 0.79289639, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09637451, + "step": 16604, + "time_per_iteration": 2.5248515605926514 + }, + { + "auxiliary_loss_clip": 0.06401166, + "auxiliary_loss_mlp": 0.01260416, + "balance_loss_clip": 0.06269663, + "balance_loss_mlp": 0.0125101, + "epoch": 0.9983466105516309, + "flos": 18339069461760.0, + "grad_norm": 1.486707520198961, + "language_loss": 0.64735997, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.72397572, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09405518, + "step": 16605, + "time_per_iteration": 2.5203909873962402 + }, + { + "auxiliary_loss_clip": 0.06399968, + "auxiliary_loss_mlp": 0.01262227, + "balance_loss_clip": 0.06270541, + "balance_loss_mlp": 0.01252654, + "epoch": 0.9984067338042988, + "flos": 17242235516160.0, + "grad_norm": 1.6138823205609316, + "language_loss": 0.71377051, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.7903924, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09570312, + "step": 16606, + "time_per_iteration": 2.472602128982544 + }, + { + "auxiliary_loss_clip": 0.06399357, + "auxiliary_loss_mlp": 0.01262803, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01253666, + "epoch": 0.9984668570569668, + "flos": 20674174245120.0, + "grad_norm": 1.8806503380972919, + "language_loss": 0.82498664, + "learning_rate": 2.370001590090709e-11, + "loss": 0.90160817, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0914917, + "step": 16607, + "time_per_iteration": 2.511127471923828 + }, + { + "auxiliary_loss_clip": 0.06407319, + "auxiliary_loss_mlp": 0.01264489, + "balance_loss_clip": 0.06272326, + "balance_loss_mlp": 0.01254362, + "epoch": 0.9985269803096347, + "flos": 30270241785600.0, + "grad_norm": 1.8869176334897872, + "language_loss": 0.66939551, + "learning_rate": 2.184193803622669e-11, + "loss": 0.7461136, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10125732, + "step": 16608, + "time_per_iteration": 2.5830514430999756 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01264125, + "balance_loss_clip": 0.06271875, + "balance_loss_mlp": 0.01254887, + "epoch": 0.9985871035623027, + "flos": 10565510152320.0, + "grad_norm": 2.1287331538283936, + "language_loss": 0.80895412, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.88563633, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09240723, + "step": 16609, + "time_per_iteration": 2.485151529312134 + }, + { + "auxiliary_loss_clip": 0.06403465, + "auxiliary_loss_mlp": 0.01265215, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01255684, + "epoch": 0.9986472268149706, + "flos": 16879125847680.0, + "grad_norm": 1.3951775563827955, + "language_loss": 0.62941349, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.70610029, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09527588, + "step": 16610, + "time_per_iteration": 2.5005035400390625 + }, + { + "auxiliary_loss_clip": 0.06397352, + "auxiliary_loss_mlp": 0.01264817, + "balance_loss_clip": 0.06267397, + "balance_loss_mlp": 0.0125596, + "epoch": 0.9987073500676387, + "flos": 22061093425920.0, + "grad_norm": 1.999034741081423, + "language_loss": 0.67834997, + "learning_rate": 1.672274094288717e-11, + "loss": 0.75497168, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08862305, + "step": 16611, + "time_per_iteration": 2.514544725418091 + }, + { + "auxiliary_loss_clip": 0.06401592, + "auxiliary_loss_mlp": 0.01263398, + "balance_loss_clip": 0.06270025, + "balance_loss_mlp": 0.0125382, + "epoch": 0.9987674733203066, + "flos": 30490866385920.0, + "grad_norm": 2.8198538577186265, + "language_loss": 0.70121431, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.77786428, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09570312, + "step": 16612, + "time_per_iteration": 4.045984268188477 + }, + { + "auxiliary_loss_clip": 0.06396015, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01255772, + "epoch": 0.9988275965729746, + "flos": 27752554955520.0, + "grad_norm": 1.4374108761182864, + "language_loss": 0.74011898, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.81672174, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08483887, + "step": 16613, + "time_per_iteration": 2.565751314163208 + }, + { + "auxiliary_loss_clip": 0.06403059, + "auxiliary_loss_mlp": 0.01267726, + "balance_loss_clip": 0.06270773, + "balance_loss_mlp": 0.01257641, + "epoch": 0.9988877198256426, + "flos": 17528966588160.0, + "grad_norm": 2.032430631725315, + "language_loss": 0.73772359, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.81443143, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10083008, + "step": 16614, + "time_per_iteration": 2.455465078353882 + }, + { + "auxiliary_loss_clip": 0.06402258, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.012546, + "epoch": 0.9989478430783105, + "flos": 21002889012480.0, + "grad_norm": 1.9940582429405083, + "language_loss": 0.73076797, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.80743194, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09539795, + "step": 16615, + "time_per_iteration": 2.5054819583892822 + }, + { + "auxiliary_loss_clip": 0.06406209, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_clip": 0.06272269, + "balance_loss_mlp": 0.01254479, + "epoch": 0.9990079663309785, + "flos": 13375672058880.0, + "grad_norm": 3.418620590990309, + "language_loss": 0.77891582, + "learning_rate": 9.70753783247069e-12, + "loss": 0.8556174, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09466553, + "step": 16616, + "time_per_iteration": 3.982736825942993 + }, + { + "auxiliary_loss_clip": 0.06401001, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01260153, + "epoch": 0.9990680895836465, + "flos": 17315805000960.0, + "grad_norm": 1.7493662985892016, + "language_loss": 0.83197755, + "learning_rate": 8.532016508855378e-12, + "loss": 0.90867996, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0909729, + "step": 16617, + "time_per_iteration": 2.481229782104492 + }, + { + "auxiliary_loss_clip": 0.06399592, + "auxiliary_loss_mlp": 0.0126193, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01253162, + "epoch": 0.9991282128363145, + "flos": 24215041681920.0, + "grad_norm": 1.5472149441524297, + "language_loss": 0.78848952, + "learning_rate": 7.43233506206309e-12, + "loss": 0.86510473, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08764648, + "step": 16618, + "time_per_iteration": 2.5298168659210205 + }, + { + "auxiliary_loss_clip": 0.06397195, + "auxiliary_loss_mlp": 0.01262323, + "balance_loss_clip": 0.06267681, + "balance_loss_mlp": 0.01252799, + "epoch": 0.9991883360889824, + "flos": 21181110647040.0, + "grad_norm": 1.6455695651366786, + "language_loss": 0.7489872, + "learning_rate": 6.408493534060255e-12, + "loss": 0.82558239, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09527588, + "step": 16619, + "time_per_iteration": 2.5116331577301025 + }, + { + "auxiliary_loss_clip": 0.06394048, + "auxiliary_loss_mlp": 0.01264154, + "balance_loss_clip": 0.06267348, + "balance_loss_mlp": 0.01255571, + "epoch": 0.9992484593416504, + "flos": 19907229024000.0, + "grad_norm": 1.8478849238967225, + "language_loss": 0.86866474, + "learning_rate": 5.460491963260594e-12, + "loss": 0.9452467, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08581543, + "step": 16620, + "time_per_iteration": 2.4666316509246826 + }, + { + "auxiliary_loss_clip": 0.06398039, + "auxiliary_loss_mlp": 0.01263946, + "balance_loss_clip": 0.06269631, + "balance_loss_mlp": 0.01255381, + "epoch": 0.9993085825943183, + "flos": 24863834246400.0, + "grad_norm": 2.0773440241084855, + "language_loss": 0.7270844, + "learning_rate": 4.58833038607942e-12, + "loss": 0.80370426, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08569336, + "step": 16621, + "time_per_iteration": 2.542825698852539 + }, + { + "auxiliary_loss_clip": 0.06309964, + "auxiliary_loss_mlp": 0.0125154, + "balance_loss_clip": 0.06255855, + "balance_loss_mlp": 0.01250484, + "epoch": 0.9993687058469863, + "flos": 71307149448960.0, + "grad_norm": 0.7280436002919584, + "language_loss": 0.56537503, + "learning_rate": 3.79200883515729e-12, + "loss": 0.64099008, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01057434, + "step": 16622, + "time_per_iteration": 3.3803882598876953 + }, + { + "auxiliary_loss_clip": 0.06399865, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06268437, + "balance_loss_mlp": 0.01253949, + "epoch": 0.9994288290996542, + "flos": 12203843109120.0, + "grad_norm": 1.9127246932088902, + "language_loss": 0.71968305, + "learning_rate": 3.071527340914315e-12, + "loss": 0.79631841, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.097229, + "step": 16623, + "time_per_iteration": 2.5517661571502686 + }, + { + "auxiliary_loss_clip": 0.06400504, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06271066, + "balance_loss_mlp": 0.01255801, + "epoch": 0.9994889523523223, + "flos": 17894927295360.0, + "grad_norm": 1.794645940520366, + "language_loss": 0.74947834, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.82614297, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.10168457, + "step": 16624, + "time_per_iteration": 3.9927942752838135 + }, + { + "auxiliary_loss_clip": 0.06401871, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 0.0626986, + "balance_loss_mlp": 0.01255384, + "epoch": 0.9995490756049902, + "flos": 26586218448000.0, + "grad_norm": 2.153450022332739, + "language_loss": 0.73763341, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.81429684, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09094238, + "step": 16625, + "time_per_iteration": 2.5666937828063965 + }, + { + "auxiliary_loss_clip": 0.06395371, + "auxiliary_loss_mlp": 0.01264271, + "balance_loss_clip": 0.06267975, + "balance_loss_mlp": 0.01254293, + "epoch": 0.9996091988576582, + "flos": 22206555313920.0, + "grad_norm": 1.9802842228291273, + "language_loss": 0.78101254, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.85760903, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09979248, + "step": 16626, + "time_per_iteration": 2.5068161487579346 + }, + { + "auxiliary_loss_clip": 0.06398835, + "auxiliary_loss_mlp": 0.01267262, + "balance_loss_clip": 0.0627013, + "balance_loss_mlp": 0.01257898, + "epoch": 0.9996693221103262, + "flos": 27379257016320.0, + "grad_norm": 1.598569345061047, + "language_loss": 0.82122838, + "learning_rate": 9.480024334429515e-13, + "loss": 0.89788932, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09356689, + "step": 16627, + "time_per_iteration": 2.522557258605957 + }, + { + "auxiliary_loss_clip": 0.06405565, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06270921, + "balance_loss_mlp": 0.01255958, + "epoch": 0.9997294453629941, + "flos": 26877729202560.0, + "grad_norm": 1.7858605797788545, + "language_loss": 0.70790946, + "learning_rate": 6.067215747584952e-13, + "loss": 0.78462732, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.1026001, + "step": 16628, + "time_per_iteration": 2.5772440433502197 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.0626926, + "balance_loss_mlp": 0.01257023, + "epoch": 0.9997895686156621, + "flos": 23483707246080.0, + "grad_norm": 1.3163404239979697, + "language_loss": 0.75694299, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.83361858, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09448242, + "step": 16629, + "time_per_iteration": 2.538621187210083 + }, + { + "auxiliary_loss_clip": 0.06406366, + "auxiliary_loss_mlp": 0.01265595, + "balance_loss_clip": 0.06269718, + "balance_loss_mlp": 0.0125557, + "epoch": 0.9998496918683301, + "flos": 20230325568000.0, + "grad_norm": 1.543408158505846, + "language_loss": 0.6084404, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.68516004, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10021973, + "step": 16630, + "time_per_iteration": 2.5081424713134766 + }, + { + "auxiliary_loss_clip": 0.06402503, + "auxiliary_loss_mlp": 0.01264058, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01254819, + "epoch": 0.9999098151209981, + "flos": 21659354225280.0, + "grad_norm": 1.8304152411760084, + "language_loss": 0.60664153, + "learning_rate": 3.792010017100722e-14, + "loss": 0.68330717, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09246826, + "step": 16631, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.0639651, + "auxiliary_loss_mlp": 0.01262988, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01254816, + "epoch": 0.999969938373666, + "flos": 11549054977920.0, + "grad_norm": 1.7303148261606152, + "language_loss": 0.73035192, + "learning_rate": 0.0, + "loss": 0.80694693, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08172607, + "step": 16632, + "time_per_iteration": 2.4620893001556396 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.399648566653223e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/training_args.bin b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-16632/zero_to_fp32.py b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-16632/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/added_tokens.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/config.json new file mode 100644 index 0000000000000000000000000000000000000000..da3b0c65c0ef1d3a1c68ffdd7565996d4dd85a33 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/generation_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/latest b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/latest new file mode 100644 index 0000000000000000000000000000000000000000..2c27d5aabecd1a20f5d8e01a05251ed2cf0a7fec --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/latest @@ -0,0 +1 @@ +global_step3328 \ No newline at end of file diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0689162692a32151f262cf5dcc6a88e6551e70de --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50d089ef346f03db3e356d69aed49b5492296fb30aa2f31bb508c5ef8a5f1f7 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..32f48deb9efc48c694537c3ff3efeeefe2d10cdf --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f76d19a47292f1533d007cce40b169a416c6395d1d99ccb5ce4103424086fc1 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30c95b38b6076775d35defe835ee2184504e459d --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2e6a26479b9e7b98234cec06f91e73ae08c7c66158dc42aab22e63711920ff +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_0.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9231f69f5fd461899867106a669ce247e70c72c2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f23d807f0e704f4ca79670a6631cbff43189cf7f8ff4e1fc0a4330e636a798 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_1.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..19fe2dcc766f192ea5de79cec4dcff17172a10f7 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37f92f6aea5386e84d2d64a1a25d6ef96a10b3bbbfe63627981604c8934076 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_2.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfe492519c6b79b07a8d68b98c5f3d0c073667aa --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667ebf727735115f00a6bdbe090344e9846c726d11bb555cdc201c415f27ad85 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_3.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d42ad13e30851fdbd1d8801738a4106a9ce8b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d306f8c511cba8a225e3b723c5fa79d8a6ecc922f834da914ff0780c78b1fc +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer.model b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/trainer_state.json b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e2c099dc6a1b5f73da9e8b34b4dc711ff0aa9f25 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/trainer_state.json @@ -0,0 +1,56609 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20009018487900196, + "eval_steps": 500, + "global_step": 3328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 71900940, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8075942270625382e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/training_args.bin b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-3328/zero_to_fp32.py b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-3328/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/added_tokens.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/config.json new file mode 100644 index 0000000000000000000000000000000000000000..da3b0c65c0ef1d3a1c68ffdd7565996d4dd85a33 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/generation_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/latest b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/latest new file mode 100644 index 0000000000000000000000000000000000000000..24f37f789c4e6eb86270647db8ff45788e484aa2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/latest @@ -0,0 +1 @@ +global_step6656 \ No newline at end of file diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4aa330bd84e29cede0979b5dbd2ea4ab1b85ccde --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6efa60c70039c41cddffdfb12f0551007bdbde1abc4c13c449828ea3895e66b6 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07bffad53fec0229a5951c6365b8df5918934fe6 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1f2966c11d02886578cb3e5450ae73e19a298f8ba9a0a73caa43b50efc6850 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dcdd9f7d94d8c65a67428855bc99966270cea6ab --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8b2c011e503fcba73e315e31a812633f89e9a904ced51c96ada217e9b47afc +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_0.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9231f69f5fd461899867106a669ce247e70c72c2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f23d807f0e704f4ca79670a6631cbff43189cf7f8ff4e1fc0a4330e636a798 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_1.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..19fe2dcc766f192ea5de79cec4dcff17172a10f7 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37f92f6aea5386e84d2d64a1a25d6ef96a10b3bbbfe63627981604c8934076 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_2.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfe492519c6b79b07a8d68b98c5f3d0c073667aa --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667ebf727735115f00a6bdbe090344e9846c726d11bb555cdc201c415f27ad85 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_3.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d42ad13e30851fdbd1d8801738a4106a9ce8b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d306f8c511cba8a225e3b723c5fa79d8a6ecc922f834da914ff0780c78b1fc +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer.model b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/trainer_state.json b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..185d19827ebb06ecee76addf8e27c228281ec30b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/trainer_state.json @@ -0,0 +1,113185 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001803697580039, + "eval_steps": 500, + "global_step": 6656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + }, + { + "auxiliary_loss_clip": 0.06559211, + "auxiliary_loss_mlp": 0.01288822, + "balance_loss_clip": 0.06314486, + "balance_loss_mlp": 0.01265338, + "epoch": 0.20015030813166992, + "flos": 28119186495360.0, + "grad_norm": 1.7739956363125764, + "language_loss": 0.6938678, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.77234817, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23474121, + "step": 3329, + "time_per_iteration": 2.790318489074707 + }, + { + "auxiliary_loss_clip": 0.06562928, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 0.06310034, + "balance_loss_mlp": 0.01263396, + "epoch": 0.2002104313843379, + "flos": 26074250801280.0, + "grad_norm": 1.6222638892170962, + "language_loss": 0.81793886, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.896456, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.25415039, + "step": 3330, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.06561245, + "auxiliary_loss_mlp": 0.01293061, + "balance_loss_clip": 0.06310615, + "balance_loss_mlp": 0.01268874, + "epoch": 0.20027055463700585, + "flos": 22973332826880.0, + "grad_norm": 3.6220429921180877, + "language_loss": 0.7808395, + "learning_rate": 3.703502390349417e-06, + "loss": 0.85938263, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.24194336, + "step": 3331, + "time_per_iteration": 4.07051157951355 + }, + { + "auxiliary_loss_clip": 0.06564473, + "auxiliary_loss_mlp": 0.01290798, + "balance_loss_clip": 0.06310149, + "balance_loss_mlp": 0.01266014, + "epoch": 0.20033067788967382, + "flos": 17171433216000.0, + "grad_norm": 1.7477664730796658, + "language_loss": 0.79863441, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24780273, + "step": 3332, + "time_per_iteration": 2.5321452617645264 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01253551, + "epoch": 0.2003908011423418, + "flos": 60842476085760.0, + "grad_norm": 0.9021189232739572, + "language_loss": 0.61913729, + "learning_rate": 3.703094147020776e-06, + "loss": 0.69584543, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08105469, + "step": 3333, + "time_per_iteration": 4.713933706283569 + }, + { + "auxiliary_loss_clip": 0.06552575, + "auxiliary_loss_mlp": 0.0128469, + "balance_loss_clip": 0.06299093, + "balance_loss_mlp": 0.0126123, + "epoch": 0.20045092439500978, + "flos": 24212987256960.0, + "grad_norm": 1.8847951547254278, + "language_loss": 0.82181144, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.90018404, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.23461914, + "step": 3334, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.0654801, + "auxiliary_loss_mlp": 0.01282898, + "balance_loss_clip": 0.06293298, + "balance_loss_mlp": 0.01256874, + "epoch": 0.20051104764767774, + "flos": 29395290251520.0, + "grad_norm": 2.256626356817437, + "language_loss": 0.7536357, + "learning_rate": 3.702685645366134e-06, + "loss": 0.83194482, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26049805, + "step": 3335, + "time_per_iteration": 2.5860390663146973 + }, + { + "auxiliary_loss_clip": 0.06552432, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06300009, + "balance_loss_mlp": 0.0125632, + "epoch": 0.2005711709003457, + "flos": 23520575842560.0, + "grad_norm": 6.047041669068293, + "language_loss": 0.80452931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.88285786, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.24108887, + "step": 3336, + "time_per_iteration": 2.662705898284912 + }, + { + "auxiliary_loss_clip": 0.06555694, + "auxiliary_loss_mlp": 0.01283807, + "balance_loss_clip": 0.06297083, + "balance_loss_mlp": 0.01258045, + "epoch": 0.20063129415301367, + "flos": 22529106806400.0, + "grad_norm": 1.88296777376126, + "language_loss": 0.78839928, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.86679429, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25756836, + "step": 3337, + "time_per_iteration": 2.541239023208618 + }, + { + "auxiliary_loss_clip": 0.06548997, + "auxiliary_loss_mlp": 0.01282446, + "balance_loss_clip": 0.06296889, + "balance_loss_mlp": 0.01258389, + "epoch": 0.20069141740568164, + "flos": 25965405947520.0, + "grad_norm": 2.093788516709133, + "language_loss": 0.69608915, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77440357, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.24072266, + "step": 3338, + "time_per_iteration": 4.011674165725708 + }, + { + "auxiliary_loss_clip": 0.06553162, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06298589, + "balance_loss_mlp": 0.01261703, + "epoch": 0.2007515406583496, + "flos": 24797560066560.0, + "grad_norm": 2.5614555335728375, + "language_loss": 0.70278549, + "learning_rate": 3.701867867326735e-06, + "loss": 0.78117526, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3339, + "time_per_iteration": 4.021097183227539 + }, + { + "auxiliary_loss_clip": 0.06558233, + "auxiliary_loss_mlp": 0.01288707, + "balance_loss_clip": 0.06300814, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2008116639110176, + "flos": 37934746606080.0, + "grad_norm": 2.4782874615073265, + "language_loss": 0.67773008, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.75619948, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.24291992, + "step": 3340, + "time_per_iteration": 2.741156816482544 + }, + { + "auxiliary_loss_clip": 0.06555235, + "auxiliary_loss_mlp": 0.01284766, + "balance_loss_clip": 0.06297287, + "balance_loss_mlp": 0.01258122, + "epoch": 0.20087178716368556, + "flos": 20746779408000.0, + "grad_norm": 2.067820693237163, + "language_loss": 0.74698186, + "learning_rate": 3.701458591066019e-06, + "loss": 0.82538182, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26623535, + "step": 3341, + "time_per_iteration": 2.564480781555176 + }, + { + "auxiliary_loss_clip": 0.06547385, + "auxiliary_loss_mlp": 0.01280207, + "balance_loss_clip": 0.06298249, + "balance_loss_mlp": 0.01256532, + "epoch": 0.20093191041635353, + "flos": 23849122901760.0, + "grad_norm": 1.820842392943319, + "language_loss": 0.7265389, + "learning_rate": 3.70125385615256e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.23657227, + "step": 3342, + "time_per_iteration": 2.5828449726104736 + }, + { + "auxiliary_loss_clip": 0.065575, + "auxiliary_loss_mlp": 0.01288338, + "balance_loss_clip": 0.06302083, + "balance_loss_mlp": 0.01264174, + "epoch": 0.2009920336690215, + "flos": 21797395027200.0, + "grad_norm": 1.987813203177408, + "language_loss": 0.73357129, + "learning_rate": 3.701049056727384e-06, + "loss": 0.81202972, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.24169922, + "step": 3343, + "time_per_iteration": 2.547868490219116 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.012954, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01269865, + "epoch": 0.20105215692168946, + "flos": 26366390461440.0, + "grad_norm": 2.115251797604865, + "language_loss": 0.81433517, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.89283836, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25524902, + "step": 3344, + "time_per_iteration": 2.6067302227020264 + }, + { + "auxiliary_loss_clip": 0.06556335, + "auxiliary_loss_mlp": 0.01281302, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01258426, + "epoch": 0.20111228017435742, + "flos": 18813288044160.0, + "grad_norm": 4.0042293338609385, + "language_loss": 0.84618676, + "learning_rate": 3.700639264372948e-06, + "loss": 0.92456311, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.2286377, + "step": 3345, + "time_per_iteration": 2.554713726043701 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01295407, + "balance_loss_clip": 0.0629687, + "balance_loss_mlp": 0.01272697, + "epoch": 0.20117240342702541, + "flos": 19981301633280.0, + "grad_norm": 2.1108086187654025, + "language_loss": 0.68437809, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.76276147, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.22705078, + "step": 3346, + "time_per_iteration": 2.5748066902160645 + }, + { + "auxiliary_loss_clip": 0.06553109, + "auxiliary_loss_mlp": 0.01283392, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01258739, + "epoch": 0.20123252667969338, + "flos": 23148368006400.0, + "grad_norm": 1.9426154174848713, + "language_loss": 0.73952061, + "learning_rate": 3.70022921406487e-06, + "loss": 0.81788564, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24682617, + "step": 3347, + "time_per_iteration": 2.5353236198425293 + }, + { + "auxiliary_loss_clip": 0.06546339, + "auxiliary_loss_mlp": 0.01287781, + "balance_loss_clip": 0.0629671, + "balance_loss_mlp": 0.01263487, + "epoch": 0.20129264993236134, + "flos": 23228352328320.0, + "grad_norm": 1.557023243146552, + "language_loss": 0.87284029, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95118147, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.24316406, + "step": 3348, + "time_per_iteration": 2.5943105220794678 + }, + { + "auxiliary_loss_clip": 0.06550047, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01269034, + "epoch": 0.2013527731850293, + "flos": 21877882473600.0, + "grad_norm": 1.6966939322149492, + "language_loss": 0.71502012, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7934612, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25061035, + "step": 3349, + "time_per_iteration": 2.5671966075897217 + }, + { + "auxiliary_loss_clip": 0.06552055, + "auxiliary_loss_mlp": 0.01290022, + "balance_loss_clip": 0.06301533, + "balance_loss_mlp": 0.01263486, + "epoch": 0.20141289643769728, + "flos": 18046636312320.0, + "grad_norm": 1.7460886195435679, + "language_loss": 0.72473693, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80315775, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26501465, + "step": 3350, + "time_per_iteration": 2.558486223220825 + }, + { + "auxiliary_loss_clip": 0.06561922, + "auxiliary_loss_mlp": 0.01282894, + "balance_loss_clip": 0.0630732, + "balance_loss_mlp": 0.01256728, + "epoch": 0.20147301969036524, + "flos": 23958219317760.0, + "grad_norm": 2.4285458765514623, + "language_loss": 0.76773715, + "learning_rate": 3.69940833983661e-06, + "loss": 0.84618533, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26135254, + "step": 3351, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.0657143, + "auxiliary_loss_mlp": 0.01289916, + "balance_loss_clip": 0.06311074, + "balance_loss_mlp": 0.01260638, + "epoch": 0.2015331429430332, + "flos": 25594749411840.0, + "grad_norm": 1.6280311670130643, + "language_loss": 0.81367022, + "learning_rate": 3.699202960155748e-06, + "loss": 0.89228368, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.29248047, + "step": 3352, + "time_per_iteration": 2.603740692138672 + }, + { + "auxiliary_loss_clip": 0.06557955, + "auxiliary_loss_mlp": 0.01286544, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01258458, + "epoch": 0.2015932661957012, + "flos": 26732351168640.0, + "grad_norm": 2.001275007108419, + "language_loss": 0.81670761, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.89515263, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28063965, + "step": 3353, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.06555627, + "auxiliary_loss_mlp": 0.01278407, + "balance_loss_clip": 0.0630668, + "balance_loss_mlp": 0.01253206, + "epoch": 0.20165338944836916, + "flos": 15638632876800.0, + "grad_norm": 1.8574199324884482, + "language_loss": 0.9049592, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.98329961, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.2520752, + "step": 3354, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.06439115, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.06305242, + "balance_loss_mlp": 0.01268129, + "epoch": 0.20171351270103713, + "flos": 57929926089600.0, + "grad_norm": 0.8202677442032412, + "language_loss": 0.55840385, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.63554633, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07012939, + "step": 3355, + "time_per_iteration": 3.118603229522705 + }, + { + "auxiliary_loss_clip": 0.06557105, + "auxiliary_loss_mlp": 0.01281149, + "balance_loss_clip": 0.06309459, + "balance_loss_mlp": 0.01257474, + "epoch": 0.2017736359537051, + "flos": 20820768163200.0, + "grad_norm": 1.5861142309185163, + "language_loss": 0.84845644, + "learning_rate": 3.698380797170751e-06, + "loss": 0.92683893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.23669434, + "step": 3356, + "time_per_iteration": 2.5407068729400635 + }, + { + "auxiliary_loss_clip": 0.06578876, + "auxiliary_loss_mlp": 0.01283859, + "balance_loss_clip": 0.06314196, + "balance_loss_mlp": 0.01255344, + "epoch": 0.20183375920637306, + "flos": 17097696023040.0, + "grad_norm": 3.7689574240726147, + "language_loss": 0.71072245, + "learning_rate": 3.698175095398085e-06, + "loss": 0.78934979, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.28515625, + "step": 3357, + "time_per_iteration": 2.4921233654022217 + }, + { + "auxiliary_loss_clip": 0.065685, + "auxiliary_loss_mlp": 0.01288812, + "balance_loss_clip": 0.0631017, + "balance_loss_mlp": 0.01263206, + "epoch": 0.20189388245904102, + "flos": 18667323031680.0, + "grad_norm": 2.064581487792546, + "language_loss": 0.72707927, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.80565238, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25585938, + "step": 3358, + "time_per_iteration": 2.531280040740967 + }, + { + "auxiliary_loss_clip": 0.06550319, + "auxiliary_loss_mlp": 0.0128707, + "balance_loss_clip": 0.06304348, + "balance_loss_mlp": 0.01263633, + "epoch": 0.20195400571170902, + "flos": 16802705324160.0, + "grad_norm": 1.761827203655194, + "language_loss": 0.83542818, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91380209, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.234375, + "step": 3359, + "time_per_iteration": 2.5004122257232666 + }, + { + "auxiliary_loss_clip": 0.06415485, + "auxiliary_loss_mlp": 0.01275385, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01269109, + "epoch": 0.20201412896437698, + "flos": 67192792669440.0, + "grad_norm": 0.7763137973079639, + "language_loss": 0.58718604, + "learning_rate": 3.697557603741482e-06, + "loss": 0.66409475, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.06274414, + "step": 3360, + "time_per_iteration": 3.202280282974243 + }, + { + "auxiliary_loss_clip": 0.06567518, + "auxiliary_loss_mlp": 0.01281863, + "balance_loss_clip": 0.06312253, + "balance_loss_mlp": 0.01257055, + "epoch": 0.20207425221704495, + "flos": 21331477998720.0, + "grad_norm": 2.7701451368403767, + "language_loss": 0.63371557, + "learning_rate": 3.697351644435763e-06, + "loss": 0.71220934, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24841309, + "step": 3361, + "time_per_iteration": 2.591505527496338 + }, + { + "auxiliary_loss_clip": 0.06556661, + "auxiliary_loss_mlp": 0.01280295, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01257049, + "epoch": 0.2021343754697129, + "flos": 22533509145600.0, + "grad_norm": 1.837331842396403, + "language_loss": 0.76495373, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23254395, + "step": 3362, + "time_per_iteration": 2.5748798847198486 + }, + { + "auxiliary_loss_clip": 0.06552652, + "auxiliary_loss_mlp": 0.01281781, + "balance_loss_clip": 0.06300291, + "balance_loss_mlp": 0.01257379, + "epoch": 0.20219449872238088, + "flos": 19068852597120.0, + "grad_norm": 1.6506097934595576, + "language_loss": 0.77716577, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85551012, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.24365234, + "step": 3363, + "time_per_iteration": 2.5682361125946045 + }, + { + "auxiliary_loss_clip": 0.06556462, + "auxiliary_loss_mlp": 0.01285372, + "balance_loss_clip": 0.06303493, + "balance_loss_mlp": 0.01262198, + "epoch": 0.20225462197504884, + "flos": 24723864800640.0, + "grad_norm": 1.5662342973814338, + "language_loss": 0.75767177, + "learning_rate": 3.696733380367391e-06, + "loss": 0.83609009, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23181152, + "step": 3364, + "time_per_iteration": 2.620352029800415 + }, + { + "auxiliary_loss_clip": 0.06564072, + "auxiliary_loss_mlp": 0.01282858, + "balance_loss_clip": 0.06306748, + "balance_loss_mlp": 0.01259374, + "epoch": 0.2023147452277168, + "flos": 22024895662080.0, + "grad_norm": 2.684464985384485, + "language_loss": 0.72232616, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.80079544, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23474121, + "step": 3365, + "time_per_iteration": 2.6884727478027344 + }, + { + "auxiliary_loss_clip": 0.06551654, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.0629961, + "balance_loss_mlp": 0.01256336, + "epoch": 0.2023748684803848, + "flos": 17750555510400.0, + "grad_norm": 1.8865204005259733, + "language_loss": 0.86329257, + "learning_rate": 3.696320882607286e-06, + "loss": 0.94160658, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.23425293, + "step": 3366, + "time_per_iteration": 2.541398525238037 + }, + { + "auxiliary_loss_clip": 0.06552443, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01254698, + "epoch": 0.20243499173305277, + "flos": 31146912328320.0, + "grad_norm": 1.6069123477498997, + "language_loss": 0.69763649, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77593338, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.22558594, + "step": 3367, + "time_per_iteration": 2.674370527267456 + }, + { + "auxiliary_loss_clip": 0.06562914, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 0.06300482, + "balance_loss_mlp": 0.01257777, + "epoch": 0.20249511498572073, + "flos": 33847726256640.0, + "grad_norm": 1.76028679400595, + "language_loss": 0.69152057, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.27819824, + "step": 3368, + "time_per_iteration": 2.6662635803222656 + }, + { + "auxiliary_loss_clip": 0.06551345, + "auxiliary_loss_mlp": 0.0128738, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263657, + "epoch": 0.2025552382383887, + "flos": 21222088093440.0, + "grad_norm": 1.819755421756695, + "language_loss": 0.78064144, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.8590287, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23718262, + "step": 3369, + "time_per_iteration": 2.5846660137176514 + }, + { + "auxiliary_loss_clip": 0.06560668, + "auxiliary_loss_mlp": 0.01282514, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01257492, + "epoch": 0.20261536149105666, + "flos": 14652614355840.0, + "grad_norm": 3.2010156823618687, + "language_loss": 0.66533637, + "learning_rate": 3.695495115253795e-06, + "loss": 0.74376816, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25024414, + "step": 3370, + "time_per_iteration": 3.953664541244507 + }, + { + "auxiliary_loss_clip": 0.06420556, + "auxiliary_loss_mlp": 0.01256354, + "balance_loss_clip": 0.06284036, + "balance_loss_mlp": 0.01249797, + "epoch": 0.20267548474372463, + "flos": 66803380018560.0, + "grad_norm": 0.6606134365812599, + "language_loss": 0.58273321, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.65950233, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.06567383, + "step": 3371, + "time_per_iteration": 3.2517025470733643 + }, + { + "auxiliary_loss_clip": 0.06555597, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 0.06300298, + "balance_loss_mlp": 0.01257944, + "epoch": 0.2027356079963926, + "flos": 24687667036800.0, + "grad_norm": 1.6416079718190109, + "language_loss": 0.92020303, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99859619, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.25769043, + "step": 3372, + "time_per_iteration": 4.108370065689087 + }, + { + "auxiliary_loss_clip": 0.06555616, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 0.06298956, + "balance_loss_mlp": 0.01258672, + "epoch": 0.20279573124906058, + "flos": 26399443697280.0, + "grad_norm": 1.769817073167301, + "language_loss": 0.79293168, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87131846, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.24414062, + "step": 3373, + "time_per_iteration": 2.6076717376708984 + }, + { + "auxiliary_loss_clip": 0.06543471, + "auxiliary_loss_mlp": 0.01280674, + "balance_loss_clip": 0.06296648, + "balance_loss_mlp": 0.01256343, + "epoch": 0.20285585450172855, + "flos": 33808006621440.0, + "grad_norm": 3.4143342380796255, + "language_loss": 0.72364163, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.8018831, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.24328613, + "step": 3374, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.06419748, + "auxiliary_loss_mlp": 0.01258876, + "balance_loss_clip": 0.06284177, + "balance_loss_mlp": 0.01252266, + "epoch": 0.20291597775439651, + "flos": 71185768410240.0, + "grad_norm": 1.0120800133799934, + "language_loss": 0.62520474, + "learning_rate": 3.694461459520516e-06, + "loss": 0.70199096, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06622314, + "step": 3375, + "time_per_iteration": 3.159513473510742 + }, + { + "auxiliary_loss_clip": 0.06548455, + "auxiliary_loss_mlp": 0.01283408, + "balance_loss_clip": 0.06294296, + "balance_loss_mlp": 0.0125891, + "epoch": 0.20297610100706448, + "flos": 19499368475520.0, + "grad_norm": 1.6178559610323104, + "language_loss": 0.82908762, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.90740621, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24499512, + "step": 3376, + "time_per_iteration": 2.5366275310516357 + }, + { + "auxiliary_loss_clip": 0.06553418, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.0125854, + "epoch": 0.20303622425973245, + "flos": 25050944413440.0, + "grad_norm": 2.015544075965587, + "language_loss": 0.82464767, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90302449, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25720215, + "step": 3377, + "time_per_iteration": 2.579468250274658 + }, + { + "auxiliary_loss_clip": 0.06554671, + "auxiliary_loss_mlp": 0.01287763, + "balance_loss_clip": 0.06300091, + "balance_loss_mlp": 0.01261453, + "epoch": 0.2030963475124004, + "flos": 21986266129920.0, + "grad_norm": 1.7361857812490578, + "language_loss": 0.7745406, + "learning_rate": 3.69384049496805e-06, + "loss": 0.85296494, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.26306152, + "step": 3378, + "time_per_iteration": 3.999164342880249 + }, + { + "auxiliary_loss_clip": 0.06557525, + "auxiliary_loss_mlp": 0.01285912, + "balance_loss_clip": 0.06298093, + "balance_loss_mlp": 0.01259423, + "epoch": 0.2031564707650684, + "flos": 19506496072320.0, + "grad_norm": 1.7814270376711854, + "language_loss": 0.80552137, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.88395572, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.26525879, + "step": 3379, + "time_per_iteration": 3.94376277923584 + }, + { + "auxiliary_loss_clip": 0.06547987, + "auxiliary_loss_mlp": 0.01283987, + "balance_loss_clip": 0.06298195, + "balance_loss_mlp": 0.01259799, + "epoch": 0.20321659401773637, + "flos": 22753630621440.0, + "grad_norm": 1.8399421212903948, + "language_loss": 0.87578034, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.95410013, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24206543, + "step": 3380, + "time_per_iteration": 2.5826356410980225 + }, + { + "auxiliary_loss_clip": 0.06554954, + "auxiliary_loss_mlp": 0.01300173, + "balance_loss_clip": 0.06299303, + "balance_loss_mlp": 0.01274817, + "epoch": 0.20327671727040433, + "flos": 22462455283200.0, + "grad_norm": 2.147675917051705, + "language_loss": 0.75801265, + "learning_rate": 3.693218952340186e-06, + "loss": 0.83656389, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.2532959, + "step": 3381, + "time_per_iteration": 2.580035924911499 + }, + { + "auxiliary_loss_clip": 0.06559204, + "auxiliary_loss_mlp": 0.0128659, + "balance_loss_clip": 0.06297147, + "balance_loss_mlp": 0.01260198, + "epoch": 0.2033368405230723, + "flos": 19540807119360.0, + "grad_norm": 1.8225171591496117, + "language_loss": 0.79701936, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.87547731, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.26391602, + "step": 3382, + "time_per_iteration": 2.743842601776123 + }, + { + "auxiliary_loss_clip": 0.06551235, + "auxiliary_loss_mlp": 0.01283934, + "balance_loss_clip": 0.06293041, + "balance_loss_mlp": 0.01258745, + "epoch": 0.20339696377574026, + "flos": 13814489491200.0, + "grad_norm": 1.712325191768153, + "language_loss": 0.80308962, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8814413, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25195312, + "step": 3383, + "time_per_iteration": 2.6428067684173584 + }, + { + "auxiliary_loss_clip": 0.06548008, + "auxiliary_loss_mlp": 0.01285433, + "balance_loss_clip": 0.06295451, + "balance_loss_mlp": 0.01259541, + "epoch": 0.20345708702840823, + "flos": 20345627185920.0, + "grad_norm": 1.7809184522678074, + "language_loss": 0.75199848, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.83033288, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.25891113, + "step": 3384, + "time_per_iteration": 2.5601112842559814 + }, + { + "auxiliary_loss_clip": 0.06573269, + "auxiliary_loss_mlp": 0.01282943, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01256229, + "epoch": 0.2035172102810762, + "flos": 20339254275840.0, + "grad_norm": 2.5841350087074852, + "language_loss": 0.77226508, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.85082722, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.26745605, + "step": 3385, + "time_per_iteration": 2.527583122253418 + }, + { + "auxiliary_loss_clip": 0.06553946, + "auxiliary_loss_mlp": 0.01288968, + "balance_loss_clip": 0.06300423, + "balance_loss_mlp": 0.01263934, + "epoch": 0.2035773335337442, + "flos": 23337658379520.0, + "grad_norm": 1.6683994830989402, + "language_loss": 0.70000219, + "learning_rate": 3.692181763924639e-06, + "loss": 0.7784313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25048828, + "step": 3386, + "time_per_iteration": 2.583940029144287 + }, + { + "auxiliary_loss_clip": 0.06550556, + "auxiliary_loss_mlp": 0.01289862, + "balance_loss_clip": 0.0629431, + "balance_loss_mlp": 0.01265495, + "epoch": 0.20363745678641215, + "flos": 28337924378880.0, + "grad_norm": 1.2744067098921972, + "language_loss": 0.81998229, + "learning_rate": 3.691974133706947e-06, + "loss": 0.89838648, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.24365234, + "step": 3387, + "time_per_iteration": 2.624765634536743 + }, + { + "auxiliary_loss_clip": 0.06543861, + "auxiliary_loss_mlp": 0.01285642, + "balance_loss_clip": 0.06297304, + "balance_loss_mlp": 0.01261705, + "epoch": 0.20369758003908012, + "flos": 18921503992320.0, + "grad_norm": 2.338231566069276, + "language_loss": 0.80333674, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.88163185, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23925781, + "step": 3388, + "time_per_iteration": 2.565795421600342 + }, + { + "auxiliary_loss_clip": 0.06553982, + "auxiliary_loss_mlp": 0.01281213, + "balance_loss_clip": 0.06297579, + "balance_loss_mlp": 0.0125693, + "epoch": 0.20375770329174808, + "flos": 19212218133120.0, + "grad_norm": 1.8814817968190891, + "language_loss": 0.72894287, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.80729485, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.24279785, + "step": 3389, + "time_per_iteration": 2.5263590812683105 + }, + { + "auxiliary_loss_clip": 0.06544612, + "auxiliary_loss_mlp": 0.01286594, + "balance_loss_clip": 0.06296231, + "balance_loss_mlp": 0.01262204, + "epoch": 0.20381782654441605, + "flos": 19397106167040.0, + "grad_norm": 2.5524619095037626, + "language_loss": 0.88214552, + "learning_rate": 3.691350858126404e-06, + "loss": 0.96045768, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3390, + "time_per_iteration": 2.5450997352600098 + }, + { + "auxiliary_loss_clip": 0.06546676, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06297011, + "balance_loss_mlp": 0.01260683, + "epoch": 0.203877949797084, + "flos": 24834764079360.0, + "grad_norm": 2.430374095532116, + "language_loss": 0.71690643, + "learning_rate": 3.691142971316662e-06, + "loss": 0.79521036, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 3391, + "time_per_iteration": 2.5983424186706543 + }, + { + "auxiliary_loss_clip": 0.06548478, + "auxiliary_loss_mlp": 0.01287319, + "balance_loss_clip": 0.06300271, + "balance_loss_mlp": 0.01263799, + "epoch": 0.20393807304975198, + "flos": 18009432299520.0, + "grad_norm": 3.271459971820983, + "language_loss": 0.87029123, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.94864917, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2355957, + "step": 3392, + "time_per_iteration": 2.5094432830810547 + }, + { + "auxiliary_loss_clip": 0.06555735, + "auxiliary_loss_mlp": 0.01288889, + "balance_loss_clip": 0.06302007, + "balance_loss_mlp": 0.0126432, + "epoch": 0.20399819630241997, + "flos": 24213867724800.0, + "grad_norm": 1.4298747009925739, + "language_loss": 0.8143822, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8928284, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 3393, + "time_per_iteration": 2.674898147583008 + }, + { + "auxiliary_loss_clip": 0.06555712, + "auxiliary_loss_mlp": 0.01283361, + "balance_loss_clip": 0.0630876, + "balance_loss_mlp": 0.01260747, + "epoch": 0.20405831955508794, + "flos": 20783396442240.0, + "grad_norm": 2.2973425083766377, + "language_loss": 0.87181509, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.9502058, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.22619629, + "step": 3394, + "time_per_iteration": 2.5489470958709717 + }, + { + "auxiliary_loss_clip": 0.06548424, + "auxiliary_loss_mlp": 0.0128548, + "balance_loss_clip": 0.06299029, + "balance_loss_mlp": 0.01262448, + "epoch": 0.2041184428077559, + "flos": 15492332448000.0, + "grad_norm": 2.1306464149991027, + "language_loss": 0.8456347, + "learning_rate": 3.69031078287345e-06, + "loss": 0.92397374, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23034668, + "step": 3395, + "time_per_iteration": 2.5297558307647705 + }, + { + "auxiliary_loss_clip": 0.06554371, + "auxiliary_loss_mlp": 0.01288203, + "balance_loss_clip": 0.06299008, + "balance_loss_mlp": 0.0126448, + "epoch": 0.20417856606042387, + "flos": 15592582258560.0, + "grad_norm": 1.9297262637725432, + "language_loss": 0.84104818, + "learning_rate": 3.690102575501033e-06, + "loss": 0.91947389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23730469, + "step": 3396, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0654766, + "auxiliary_loss_mlp": 0.01296047, + "balance_loss_clip": 0.06301443, + "balance_loss_mlp": 0.01272706, + "epoch": 0.20423868931309183, + "flos": 24286137471360.0, + "grad_norm": 2.084884773893835, + "language_loss": 0.7751056, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.85354269, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2331543, + "step": 3397, + "time_per_iteration": 2.5621836185455322 + }, + { + "auxiliary_loss_clip": 0.06547033, + "auxiliary_loss_mlp": 0.01291146, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01268067, + "epoch": 0.2042988125657598, + "flos": 18619176061440.0, + "grad_norm": 3.401004534017878, + "language_loss": 0.88746947, + "learning_rate": 3.689685968497518e-06, + "loss": 0.96585131, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23083496, + "step": 3398, + "time_per_iteration": 2.4821889400482178 + }, + { + "auxiliary_loss_clip": 0.06555858, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 0.06305312, + "balance_loss_mlp": 0.01263361, + "epoch": 0.2043589358184278, + "flos": 17855836565760.0, + "grad_norm": 2.044777021305177, + "language_loss": 0.79053116, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8689605, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23706055, + "step": 3399, + "time_per_iteration": 2.5007028579711914 + }, + { + "auxiliary_loss_clip": 0.06554085, + "auxiliary_loss_mlp": 0.01288353, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01264678, + "epoch": 0.20441905907109575, + "flos": 21441832225920.0, + "grad_norm": 3.4484144890832327, + "language_loss": 0.77263522, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85105962, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23669434, + "step": 3400, + "time_per_iteration": 2.524930715560913 + }, + { + "auxiliary_loss_clip": 0.06546277, + "auxiliary_loss_mlp": 0.0128369, + "balance_loss_clip": 0.0630067, + "balance_loss_mlp": 0.01262423, + "epoch": 0.20447918232376372, + "flos": 27714847818240.0, + "grad_norm": 1.566944783994086, + "language_loss": 0.7976017, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87590134, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21264648, + "step": 3401, + "time_per_iteration": 2.5868172645568848 + }, + { + "auxiliary_loss_clip": 0.06547564, + "auxiliary_loss_mlp": 0.01287222, + "balance_loss_clip": 0.06297088, + "balance_loss_mlp": 0.01263833, + "epoch": 0.20453930557643168, + "flos": 30533017789440.0, + "grad_norm": 1.6743436404675067, + "language_loss": 0.69998658, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7783345, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23400879, + "step": 3402, + "time_per_iteration": 2.664961099624634 + }, + { + "auxiliary_loss_clip": 0.06561718, + "auxiliary_loss_mlp": 0.01282309, + "balance_loss_clip": 0.06309628, + "balance_loss_mlp": 0.01259981, + "epoch": 0.20459942882909965, + "flos": 18993480249600.0, + "grad_norm": 2.0207590642868736, + "language_loss": 0.82498461, + "learning_rate": 3.688643329848496e-06, + "loss": 0.90342486, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2232666, + "step": 3403, + "time_per_iteration": 2.527240514755249 + }, + { + "auxiliary_loss_clip": 0.0655287, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 0.06304024, + "balance_loss_mlp": 0.01260256, + "epoch": 0.20465955208176762, + "flos": 20345207915520.0, + "grad_norm": 1.870475930372837, + "language_loss": 0.83792305, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.91628289, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.22900391, + "step": 3404, + "time_per_iteration": 2.5108580589294434 + }, + { + "auxiliary_loss_clip": 0.06555478, + "auxiliary_loss_mlp": 0.01280254, + "balance_loss_clip": 0.06302839, + "balance_loss_mlp": 0.0125671, + "epoch": 0.20471967533443558, + "flos": 21257615024640.0, + "grad_norm": 1.9668153962924477, + "language_loss": 0.86568373, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.94404107, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2355957, + "step": 3405, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.06551084, + "auxiliary_loss_mlp": 0.0128024, + "balance_loss_clip": 0.06302287, + "balance_loss_mlp": 0.01257256, + "epoch": 0.20477979858710357, + "flos": 14506775124480.0, + "grad_norm": 2.695451734790842, + "language_loss": 0.85318458, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93149781, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.22973633, + "step": 3406, + "time_per_iteration": 2.490360975265503 + }, + { + "auxiliary_loss_clip": 0.06551544, + "auxiliary_loss_mlp": 0.01279954, + "balance_loss_clip": 0.06302837, + "balance_loss_mlp": 0.01256768, + "epoch": 0.20483992183977154, + "flos": 11405018609280.0, + "grad_norm": 8.923539759508978, + "language_loss": 0.69000643, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.76832145, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23193359, + "step": 3407, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06549555, + "auxiliary_loss_mlp": 0.01280964, + "balance_loss_clip": 0.06303824, + "balance_loss_mlp": 0.01258374, + "epoch": 0.2049000450924395, + "flos": 19065917704320.0, + "grad_norm": 2.112423962078429, + "language_loss": 0.85367447, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93197966, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.22583008, + "step": 3408, + "time_per_iteration": 2.5491087436676025 + }, + { + "auxiliary_loss_clip": 0.06564584, + "auxiliary_loss_mlp": 0.0128728, + "balance_loss_clip": 0.06310433, + "balance_loss_mlp": 0.0126314, + "epoch": 0.20496016834510747, + "flos": 14579799557760.0, + "grad_norm": 2.4221013711544876, + "language_loss": 0.65169537, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.730214, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3409, + "time_per_iteration": 2.5570828914642334 + }, + { + "auxiliary_loss_clip": 0.06553619, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06302843, + "balance_loss_mlp": 0.01259029, + "epoch": 0.20502029159777543, + "flos": 22133069683200.0, + "grad_norm": 1.5677004994493864, + "language_loss": 0.81331646, + "learning_rate": 3.687180946553745e-06, + "loss": 0.89167136, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.22851562, + "step": 3410, + "time_per_iteration": 3.9941341876983643 + }, + { + "auxiliary_loss_clip": 0.06562116, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 0.06316169, + "balance_loss_mlp": 0.01256252, + "epoch": 0.2050804148504434, + "flos": 25373873249280.0, + "grad_norm": 2.231323409005704, + "language_loss": 0.76898587, + "learning_rate": 3.686971778678803e-06, + "loss": 0.84738749, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21801758, + "step": 3411, + "time_per_iteration": 2.557502031326294 + }, + { + "auxiliary_loss_clip": 0.06566584, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 0.06318649, + "balance_loss_mlp": 0.01260567, + "epoch": 0.2051405381031114, + "flos": 23626443876480.0, + "grad_norm": 1.9814328821552187, + "language_loss": 0.73997778, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81847459, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.22521973, + "step": 3412, + "time_per_iteration": 4.038960695266724 + }, + { + "auxiliary_loss_clip": 0.06568237, + "auxiliary_loss_mlp": 0.01280941, + "balance_loss_clip": 0.06316938, + "balance_loss_mlp": 0.01257183, + "epoch": 0.20520066135577936, + "flos": 19570338483840.0, + "grad_norm": 2.4438525241528963, + "language_loss": 0.79063112, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.86912292, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23754883, + "step": 3413, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.0655475, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06315412, + "balance_loss_mlp": 0.01259423, + "epoch": 0.20526078460844732, + "flos": 17682184978560.0, + "grad_norm": 1.8594099787920526, + "language_loss": 0.85324407, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.93161035, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2244873, + "step": 3414, + "time_per_iteration": 2.51891827583313 + }, + { + "auxiliary_loss_clip": 0.06556672, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 0.0631127, + "balance_loss_mlp": 0.01261451, + "epoch": 0.2053209078611153, + "flos": 21505632710400.0, + "grad_norm": 1.8989416463636506, + "language_loss": 0.8139196, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.89232612, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22521973, + "step": 3415, + "time_per_iteration": 2.534064769744873 + }, + { + "auxiliary_loss_clip": 0.06545444, + "auxiliary_loss_mlp": 0.01280017, + "balance_loss_clip": 0.06300274, + "balance_loss_mlp": 0.01259048, + "epoch": 0.20538103111378325, + "flos": 25670163686400.0, + "grad_norm": 1.9272907146050138, + "language_loss": 0.73450923, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.81276381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.20959473, + "step": 3416, + "time_per_iteration": 2.5862622261047363 + }, + { + "auxiliary_loss_clip": 0.06555279, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06309061, + "balance_loss_mlp": 0.01256342, + "epoch": 0.20544115436645122, + "flos": 23155663311360.0, + "grad_norm": 3.21470343355828, + "language_loss": 0.79731691, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87565553, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.22253418, + "step": 3417, + "time_per_iteration": 2.5488288402557373 + }, + { + "auxiliary_loss_clip": 0.06553051, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01258248, + "epoch": 0.20550127761911918, + "flos": 19396435334400.0, + "grad_norm": 3.2012221600430744, + "language_loss": 0.88593423, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96428442, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23681641, + "step": 3418, + "time_per_iteration": 5.385840177536011 + }, + { + "auxiliary_loss_clip": 0.06553373, + "auxiliary_loss_mlp": 0.01284895, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.0125998, + "epoch": 0.20556140087178718, + "flos": 22899721415040.0, + "grad_norm": 2.325256215928591, + "language_loss": 0.63040721, + "learning_rate": 3.685296133421035e-06, + "loss": 0.70878994, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24926758, + "step": 3419, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.06563735, + "auxiliary_loss_mlp": 0.01291649, + "balance_loss_clip": 0.06310479, + "balance_loss_mlp": 0.01265554, + "epoch": 0.20562152412445514, + "flos": 19795365423360.0, + "grad_norm": 1.7732270709951168, + "language_loss": 0.86988509, + "learning_rate": 3.685086390100674e-06, + "loss": 0.948439, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26098633, + "step": 3420, + "time_per_iteration": 2.5364928245544434 + }, + { + "auxiliary_loss_clip": 0.06546585, + "auxiliary_loss_mlp": 0.01284653, + "balance_loss_clip": 0.0630153, + "balance_loss_mlp": 0.01261109, + "epoch": 0.2056816473771231, + "flos": 31509728507520.0, + "grad_norm": 10.333340616962191, + "language_loss": 0.71886712, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79717946, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.2355957, + "step": 3421, + "time_per_iteration": 2.6350786685943604 + }, + { + "auxiliary_loss_clip": 0.06544094, + "auxiliary_loss_mlp": 0.01288814, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0126564, + "epoch": 0.20574177062979107, + "flos": 23265095143680.0, + "grad_norm": 2.122387036588777, + "language_loss": 0.72175372, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.8000828, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23168945, + "step": 3422, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.06409879, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.01263078, + "epoch": 0.20580189388245904, + "flos": 70331124291840.0, + "grad_norm": 0.7131964126658911, + "language_loss": 0.551377, + "learning_rate": 3.684456776779548e-06, + "loss": 0.62817442, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06799316, + "step": 3423, + "time_per_iteration": 3.2106337547302246 + }, + { + "auxiliary_loss_clip": 0.06548166, + "auxiliary_loss_mlp": 0.0128448, + "balance_loss_clip": 0.06301543, + "balance_loss_mlp": 0.01261091, + "epoch": 0.205862017135127, + "flos": 30745802033280.0, + "grad_norm": 1.8660135712145316, + "language_loss": 0.72238076, + "learning_rate": 3.684246777912353e-06, + "loss": 0.80070728, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23400879, + "step": 3424, + "time_per_iteration": 2.614389181137085 + }, + { + "auxiliary_loss_clip": 0.06544662, + "auxiliary_loss_mlp": 0.01287262, + "balance_loss_clip": 0.06303795, + "balance_loss_mlp": 0.01263229, + "epoch": 0.20592214038779497, + "flos": 21330932947200.0, + "grad_norm": 1.6926765615616197, + "language_loss": 0.75646138, + "learning_rate": 3.684036715178351e-06, + "loss": 0.83478063, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.24023438, + "step": 3425, + "time_per_iteration": 2.5351436138153076 + }, + { + "auxiliary_loss_clip": 0.06546403, + "auxiliary_loss_mlp": 0.01289796, + "balance_loss_clip": 0.06304145, + "balance_loss_mlp": 0.01266813, + "epoch": 0.20598226364046296, + "flos": 22898002406400.0, + "grad_norm": 1.848184132977354, + "language_loss": 0.88618112, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9645431, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22998047, + "step": 3426, + "time_per_iteration": 2.604752779006958 + }, + { + "auxiliary_loss_clip": 0.06551787, + "auxiliary_loss_mlp": 0.01284615, + "balance_loss_clip": 0.06311674, + "balance_loss_mlp": 0.01261226, + "epoch": 0.20604238689313092, + "flos": 23885362592640.0, + "grad_norm": 1.5517486951437824, + "language_loss": 0.77144063, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8498047, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.23376465, + "step": 3427, + "time_per_iteration": 2.5526115894317627 + }, + { + "auxiliary_loss_clip": 0.06556956, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 0.06309945, + "balance_loss_mlp": 0.01264661, + "epoch": 0.2061025101457989, + "flos": 22498024141440.0, + "grad_norm": 1.8896972045039995, + "language_loss": 0.74443614, + "learning_rate": 3.683406143855174e-06, + "loss": 0.822878, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3428, + "time_per_iteration": 2.5644474029541016 + }, + { + "auxiliary_loss_clip": 0.06552382, + "auxiliary_loss_mlp": 0.01283805, + "balance_loss_clip": 0.06304047, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20616263339846685, + "flos": 22784713286400.0, + "grad_norm": 1.96097325322206, + "language_loss": 0.74164659, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.82000846, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3429, + "time_per_iteration": 2.5337913036346436 + }, + { + "auxiliary_loss_clip": 0.06551956, + "auxiliary_loss_mlp": 0.01286455, + "balance_loss_clip": 0.06304303, + "balance_loss_mlp": 0.01263126, + "epoch": 0.20622275665113482, + "flos": 20887755102720.0, + "grad_norm": 2.9642283368918863, + "language_loss": 0.86220586, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.94058996, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.23327637, + "step": 3430, + "time_per_iteration": 2.5939443111419678 + }, + { + "auxiliary_loss_clip": 0.06546243, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06300765, + "balance_loss_mlp": 0.01257607, + "epoch": 0.20628287990380278, + "flos": 19360489132800.0, + "grad_norm": 1.6588894263331828, + "language_loss": 0.70011377, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77838504, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.23278809, + "step": 3431, + "time_per_iteration": 2.565840482711792 + }, + { + "auxiliary_loss_clip": 0.06410907, + "auxiliary_loss_mlp": 0.0126731, + "balance_loss_clip": 0.06280327, + "balance_loss_mlp": 0.01261215, + "epoch": 0.20634300315647078, + "flos": 71536970799360.0, + "grad_norm": 0.791675242165557, + "language_loss": 0.60400987, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.68079197, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0609436, + "step": 3432, + "time_per_iteration": 3.305082082748413 + }, + { + "auxiliary_loss_clip": 0.06552991, + "auxiliary_loss_mlp": 0.01280414, + "balance_loss_clip": 0.06308176, + "balance_loss_mlp": 0.01257561, + "epoch": 0.20640312640913874, + "flos": 21730072671360.0, + "grad_norm": 1.5897016059046762, + "language_loss": 0.72477019, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80310422, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.22875977, + "step": 3433, + "time_per_iteration": 2.564393997192383 + }, + { + "auxiliary_loss_clip": 0.06561184, + "auxiliary_loss_mlp": 0.01281531, + "balance_loss_clip": 0.06312474, + "balance_loss_mlp": 0.01258512, + "epoch": 0.2064632496618067, + "flos": 20560256219520.0, + "grad_norm": 1.7877531320590552, + "language_loss": 0.87141019, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.94983733, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23010254, + "step": 3434, + "time_per_iteration": 2.5466108322143555 + }, + { + "auxiliary_loss_clip": 0.06556005, + "auxiliary_loss_mlp": 0.01283316, + "balance_loss_clip": 0.06305495, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20652337291447467, + "flos": 29830669666560.0, + "grad_norm": 1.6526860814470912, + "language_loss": 0.6970489, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77544212, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2388916, + "step": 3435, + "time_per_iteration": 2.613896369934082 + }, + { + "auxiliary_loss_clip": 0.06545977, + "auxiliary_loss_mlp": 0.01289312, + "balance_loss_clip": 0.0630382, + "balance_loss_mlp": 0.01264325, + "epoch": 0.20658349616714264, + "flos": 26220844719360.0, + "grad_norm": 1.7674379542335852, + "language_loss": 0.89957321, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97792608, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.24975586, + "step": 3436, + "time_per_iteration": 2.590360641479492 + }, + { + "auxiliary_loss_clip": 0.06548543, + "auxiliary_loss_mlp": 0.01277538, + "balance_loss_clip": 0.06303848, + "balance_loss_mlp": 0.01254209, + "epoch": 0.2066436194198106, + "flos": 26001477930240.0, + "grad_norm": 1.7140409089026185, + "language_loss": 0.77244872, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.8507095, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.23339844, + "step": 3437, + "time_per_iteration": 2.6068568229675293 + }, + { + "auxiliary_loss_clip": 0.06548648, + "auxiliary_loss_mlp": 0.01280201, + "balance_loss_clip": 0.06300757, + "balance_loss_mlp": 0.01257682, + "epoch": 0.20670374267247857, + "flos": 21367466127360.0, + "grad_norm": 2.0146667208247355, + "language_loss": 0.78725338, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86554188, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.22521973, + "step": 3438, + "time_per_iteration": 2.567963123321533 + }, + { + "auxiliary_loss_clip": 0.06407821, + "auxiliary_loss_mlp": 0.01263014, + "balance_loss_clip": 0.06278364, + "balance_loss_mlp": 0.01257164, + "epoch": 0.20676386592514656, + "flos": 66403108264320.0, + "grad_norm": 0.8029327028802032, + "language_loss": 0.66817588, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.74488425, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05844116, + "step": 3439, + "time_per_iteration": 3.1231849193573 + }, + { + "auxiliary_loss_clip": 0.06557775, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 0.06302103, + "balance_loss_mlp": 0.01260423, + "epoch": 0.20682398917781453, + "flos": 17280278069760.0, + "grad_norm": 1.9287299109512155, + "language_loss": 0.8404541, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.91886795, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23168945, + "step": 3440, + "time_per_iteration": 2.496563196182251 + }, + { + "auxiliary_loss_clip": 0.06545421, + "auxiliary_loss_mlp": 0.01282262, + "balance_loss_clip": 0.06298509, + "balance_loss_mlp": 0.0126028, + "epoch": 0.2068841124304825, + "flos": 18083127565440.0, + "grad_norm": 3.100665935871663, + "language_loss": 0.85299611, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93127292, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2199707, + "step": 3441, + "time_per_iteration": 2.528823137283325 + }, + { + "auxiliary_loss_clip": 0.06546343, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06300771, + "balance_loss_mlp": 0.01258958, + "epoch": 0.20694423568315046, + "flos": 27354798823680.0, + "grad_norm": 1.6487564578537555, + "language_loss": 0.86298448, + "learning_rate": 3.680455884806959e-06, + "loss": 0.94127464, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.23693848, + "step": 3442, + "time_per_iteration": 2.5904433727264404 + }, + { + "auxiliary_loss_clip": 0.06553168, + "auxiliary_loss_mlp": 0.0128107, + "balance_loss_clip": 0.06302296, + "balance_loss_mlp": 0.01256298, + "epoch": 0.20700435893581842, + "flos": 20236027645440.0, + "grad_norm": 1.991917549605425, + "language_loss": 0.74110967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81945205, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24755859, + "step": 3443, + "time_per_iteration": 2.546297311782837 + }, + { + "auxiliary_loss_clip": 0.06540793, + "auxiliary_loss_mlp": 0.01282, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2070644821884864, + "flos": 20637347575680.0, + "grad_norm": 5.522598582225395, + "language_loss": 0.86263227, + "learning_rate": 3.680033399147797e-06, + "loss": 0.94086015, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22814941, + "step": 3444, + "time_per_iteration": 2.5644776821136475 + }, + { + "auxiliary_loss_clip": 0.06396829, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06267206, + "balance_loss_mlp": 0.01264399, + "epoch": 0.20712460544115438, + "flos": 65960098128000.0, + "grad_norm": 0.6752802627643808, + "language_loss": 0.56895542, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.64562953, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06185913, + "step": 3445, + "time_per_iteration": 3.133159637451172 + }, + { + "auxiliary_loss_clip": 0.06550106, + "auxiliary_loss_mlp": 0.0128273, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01259412, + "epoch": 0.20718472869382235, + "flos": 19431542995200.0, + "grad_norm": 1.845349461285762, + "language_loss": 0.78388685, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.86221522, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23327637, + "step": 3446, + "time_per_iteration": 2.5563149452209473 + }, + { + "auxiliary_loss_clip": 0.06562304, + "auxiliary_loss_mlp": 0.0128875, + "balance_loss_clip": 0.06302087, + "balance_loss_mlp": 0.01263215, + "epoch": 0.2072448519464903, + "flos": 24506007384960.0, + "grad_norm": 2.528724295630225, + "language_loss": 0.63215572, + "learning_rate": 3.679399192876334e-06, + "loss": 0.7106663, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 3447, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.06550243, + "auxiliary_loss_mlp": 0.01285454, + "balance_loss_clip": 0.06302016, + "balance_loss_mlp": 0.01261624, + "epoch": 0.20730497519915828, + "flos": 23082345388800.0, + "grad_norm": 1.7246458475869415, + "language_loss": 0.87330115, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95165813, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.23840332, + "step": 3448, + "time_per_iteration": 2.5367424488067627 + }, + { + "auxiliary_loss_clip": 0.06547908, + "auxiliary_loss_mlp": 0.01287375, + "balance_loss_clip": 0.06301224, + "balance_loss_mlp": 0.0126407, + "epoch": 0.20736509845182624, + "flos": 21075368394240.0, + "grad_norm": 2.238353970842136, + "language_loss": 0.75934261, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.83769548, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23291016, + "step": 3449, + "time_per_iteration": 3.94480562210083 + }, + { + "auxiliary_loss_clip": 0.06557415, + "auxiliary_loss_mlp": 0.01291462, + "balance_loss_clip": 0.06305711, + "balance_loss_mlp": 0.01267262, + "epoch": 0.2074252217044942, + "flos": 17638021077120.0, + "grad_norm": 1.9890451191355467, + "language_loss": 0.77508813, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.8535769, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24243164, + "step": 3450, + "time_per_iteration": 2.545430898666382 + }, + { + "auxiliary_loss_clip": 0.06561074, + "auxiliary_loss_mlp": 0.01294493, + "balance_loss_clip": 0.06309673, + "balance_loss_mlp": 0.01270579, + "epoch": 0.20748534495716217, + "flos": 23553209808000.0, + "grad_norm": 2.274256725147599, + "language_loss": 0.823879, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90243471, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23913574, + "step": 3451, + "time_per_iteration": 4.003388404846191 + }, + { + "auxiliary_loss_clip": 0.0640305, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 0.06273949, + "balance_loss_mlp": 0.01248494, + "epoch": 0.20754546820983016, + "flos": 52268666757120.0, + "grad_norm": 0.7675919354914552, + "language_loss": 0.56549037, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.64206523, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05941772, + "step": 3452, + "time_per_iteration": 3.0660083293914795 + }, + { + "auxiliary_loss_clip": 0.06557937, + "auxiliary_loss_mlp": 0.01287582, + "balance_loss_clip": 0.06309802, + "balance_loss_mlp": 0.01264956, + "epoch": 0.20760559146249813, + "flos": 20418609692160.0, + "grad_norm": 1.8872949255610445, + "language_loss": 0.88967919, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9681344, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.22619629, + "step": 3453, + "time_per_iteration": 2.581430673599243 + }, + { + "auxiliary_loss_clip": 0.06554953, + "auxiliary_loss_mlp": 0.01287205, + "balance_loss_clip": 0.06307904, + "balance_loss_mlp": 0.01263256, + "epoch": 0.2076657147151661, + "flos": 23192825397120.0, + "grad_norm": 1.4776896143180385, + "language_loss": 0.80720532, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88562691, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23962402, + "step": 3454, + "time_per_iteration": 2.5793018341064453 + }, + { + "auxiliary_loss_clip": 0.06549348, + "auxiliary_loss_mlp": 0.01286388, + "balance_loss_clip": 0.06301847, + "balance_loss_mlp": 0.01263476, + "epoch": 0.20772583796783406, + "flos": 18298595139840.0, + "grad_norm": 4.241833159654324, + "language_loss": 0.78446364, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.86282104, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.22912598, + "step": 3455, + "time_per_iteration": 2.5377535820007324 + }, + { + "auxiliary_loss_clip": 0.0654678, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01256547, + "epoch": 0.20778596122050202, + "flos": 17608531639680.0, + "grad_norm": 1.6321737814924744, + "language_loss": 0.81251496, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.89077407, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22595215, + "step": 3456, + "time_per_iteration": 2.5125768184661865 + }, + { + "auxiliary_loss_clip": 0.06554688, + "auxiliary_loss_mlp": 0.01282924, + "balance_loss_clip": 0.06304802, + "balance_loss_mlp": 0.01259893, + "epoch": 0.20784608447317, + "flos": 23812380086400.0, + "grad_norm": 2.3276439316102695, + "language_loss": 0.79071975, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.86909586, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.23022461, + "step": 3457, + "time_per_iteration": 5.41590428352356 + }, + { + "auxiliary_loss_clip": 0.06553855, + "auxiliary_loss_mlp": 0.01279092, + "balance_loss_clip": 0.0630386, + "balance_loss_mlp": 0.01255739, + "epoch": 0.20790620772583795, + "flos": 17645022892800.0, + "grad_norm": 1.9963286729709264, + "language_loss": 0.84664595, + "learning_rate": 3.677068867939333e-06, + "loss": 0.9249754, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23364258, + "step": 3458, + "time_per_iteration": 2.610107183456421 + }, + { + "auxiliary_loss_clip": 0.06541788, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06299603, + "balance_loss_mlp": 0.01254289, + "epoch": 0.20796633097850595, + "flos": 27680997968640.0, + "grad_norm": 1.7522329071194311, + "language_loss": 0.76853168, + "learning_rate": 3.676856638489272e-06, + "loss": 0.8467201, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.2277832, + "step": 3459, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.06543219, + "auxiliary_loss_mlp": 0.01279579, + "balance_loss_clip": 0.06299554, + "balance_loss_mlp": 0.01257024, + "epoch": 0.2080264542311739, + "flos": 19251770060160.0, + "grad_norm": 1.8057193688460893, + "language_loss": 0.77803749, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.85626543, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22570801, + "step": 3460, + "time_per_iteration": 2.5500359535217285 + }, + { + "auxiliary_loss_clip": 0.06544735, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06297737, + "balance_loss_mlp": 0.01255315, + "epoch": 0.20808657748384188, + "flos": 27533146239360.0, + "grad_norm": 1.865214089074118, + "language_loss": 0.76152873, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.8397454, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21618652, + "step": 3461, + "time_per_iteration": 2.575975179672241 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.01279751, + "balance_loss_clip": 0.06301013, + "balance_loss_mlp": 0.01256183, + "epoch": 0.20814670073650984, + "flos": 26914262382720.0, + "grad_norm": 2.229402903272821, + "language_loss": 0.89438462, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97273135, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23571777, + "step": 3462, + "time_per_iteration": 2.5732173919677734 + }, + { + "auxiliary_loss_clip": 0.06402825, + "auxiliary_loss_mlp": 0.01283843, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01278395, + "epoch": 0.2082068239891778, + "flos": 70195850674560.0, + "grad_norm": 0.9150130859854356, + "language_loss": 0.59001637, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.66688299, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.05456543, + "step": 3463, + "time_per_iteration": 3.269202709197998 + }, + { + "auxiliary_loss_clip": 0.06550549, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01257929, + "epoch": 0.20826694724184577, + "flos": 24614978019840.0, + "grad_norm": 2.6522237220698663, + "language_loss": 0.66949397, + "learning_rate": 3.675794537601429e-06, + "loss": 0.74782729, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.2487793, + "step": 3464, + "time_per_iteration": 2.5638158321380615 + }, + { + "auxiliary_loss_clip": 0.06556059, + "auxiliary_loss_mlp": 0.01287892, + "balance_loss_clip": 0.06307128, + "balance_loss_mlp": 0.01263299, + "epoch": 0.20832707049451377, + "flos": 12897218845440.0, + "grad_norm": 2.2476817474527913, + "language_loss": 0.84321886, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.9216584, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.24609375, + "step": 3465, + "time_per_iteration": 2.5794646739959717 + }, + { + "auxiliary_loss_clip": 0.06542073, + "auxiliary_loss_mlp": 0.01282156, + "balance_loss_clip": 0.06295872, + "balance_loss_mlp": 0.01258326, + "epoch": 0.20838719374718173, + "flos": 22205129794560.0, + "grad_norm": 3.281235222185926, + "language_loss": 0.82741451, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.90565681, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.23828125, + "step": 3466, + "time_per_iteration": 2.540011405944824 + }, + { + "auxiliary_loss_clip": 0.06540319, + "auxiliary_loss_mlp": 0.01287937, + "balance_loss_clip": 0.06300111, + "balance_loss_mlp": 0.01267243, + "epoch": 0.2084473169998497, + "flos": 15164036951040.0, + "grad_norm": 2.490655035944783, + "language_loss": 0.82892549, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90720803, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.20690918, + "step": 3467, + "time_per_iteration": 2.54622745513916 + }, + { + "auxiliary_loss_clip": 0.06540733, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 0.06303266, + "balance_loss_mlp": 0.01268167, + "epoch": 0.20850744025251766, + "flos": 17462482773120.0, + "grad_norm": 1.8114532422505003, + "language_loss": 0.82299387, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90129268, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2097168, + "step": 3468, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06553383, + "auxiliary_loss_mlp": 0.01280357, + "balance_loss_clip": 0.06302625, + "balance_loss_mlp": 0.01257158, + "epoch": 0.20856756350518563, + "flos": 25705439055360.0, + "grad_norm": 1.667306072143411, + "language_loss": 0.9042781, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98261553, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23217773, + "step": 3469, + "time_per_iteration": 2.6107866764068604 + }, + { + "auxiliary_loss_clip": 0.0655106, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 0.06308927, + "balance_loss_mlp": 0.01259872, + "epoch": 0.2086276867578536, + "flos": 37898213425920.0, + "grad_norm": 1.9476878714472061, + "language_loss": 0.77294397, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85127008, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21679688, + "step": 3470, + "time_per_iteration": 2.7083425521850586 + }, + { + "auxiliary_loss_clip": 0.06547298, + "auxiliary_loss_mlp": 0.01289218, + "balance_loss_clip": 0.06307482, + "balance_loss_mlp": 0.01266283, + "epoch": 0.20868781001052156, + "flos": 25564169871360.0, + "grad_norm": 1.8036684586339249, + "language_loss": 0.76289082, + "learning_rate": 3.674304927640011e-06, + "loss": 0.84125602, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.22937012, + "step": 3471, + "time_per_iteration": 2.589884042739868 + }, + { + "auxiliary_loss_clip": 0.06554438, + "auxiliary_loss_mlp": 0.01280867, + "balance_loss_clip": 0.06303854, + "balance_loss_mlp": 0.01259028, + "epoch": 0.20874793326318955, + "flos": 27536961600000.0, + "grad_norm": 1.6381609540737498, + "language_loss": 0.76341867, + "learning_rate": 3.67409187219312e-06, + "loss": 0.84177172, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.21813965, + "step": 3472, + "time_per_iteration": 2.610260009765625 + }, + { + "auxiliary_loss_clip": 0.06544036, + "auxiliary_loss_mlp": 0.01279562, + "balance_loss_clip": 0.06302247, + "balance_loss_mlp": 0.01259022, + "epoch": 0.20880805651585752, + "flos": 18554243546880.0, + "grad_norm": 2.073955911698539, + "language_loss": 0.85418117, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93241715, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.20532227, + "step": 3473, + "time_per_iteration": 2.5741372108459473 + }, + { + "auxiliary_loss_clip": 0.06431094, + "auxiliary_loss_mlp": 0.01255526, + "balance_loss_clip": 0.06305239, + "balance_loss_mlp": 0.01250132, + "epoch": 0.20886817976852548, + "flos": 65966596819200.0, + "grad_norm": 0.8661888314681573, + "language_loss": 0.63746876, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.71433502, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.05401611, + "step": 3474, + "time_per_iteration": 3.061617612838745 + }, + { + "auxiliary_loss_clip": 0.06545534, + "auxiliary_loss_mlp": 0.01278543, + "balance_loss_clip": 0.06299987, + "balance_loss_mlp": 0.01255751, + "epoch": 0.20892830302119345, + "flos": 36548120914560.0, + "grad_norm": 1.9594452651536962, + "language_loss": 0.70746702, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.78570777, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22790527, + "step": 3475, + "time_per_iteration": 2.7295854091644287 + }, + { + "auxiliary_loss_clip": 0.06544538, + "auxiliary_loss_mlp": 0.01277403, + "balance_loss_clip": 0.06299123, + "balance_loss_mlp": 0.01255754, + "epoch": 0.2089884262738614, + "flos": 20962582398720.0, + "grad_norm": 1.6086426160627472, + "language_loss": 0.70801485, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21643066, + "step": 3476, + "time_per_iteration": 2.6065874099731445 + }, + { + "auxiliary_loss_clip": 0.06538086, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06299278, + "balance_loss_mlp": 0.0125523, + "epoch": 0.20904854952652938, + "flos": 22790666926080.0, + "grad_norm": 1.9785394209574967, + "language_loss": 0.90003526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9781692, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.20080566, + "step": 3477, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.06542666, + "auxiliary_loss_mlp": 0.01278801, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257594, + "epoch": 0.20910867277919734, + "flos": 27309838308480.0, + "grad_norm": 2.554960999675803, + "language_loss": 0.69433093, + "learning_rate": 3.672812206678344e-06, + "loss": 0.77254558, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.21203613, + "step": 3478, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.0654031, + "auxiliary_loss_mlp": 0.01282288, + "balance_loss_clip": 0.06298592, + "balance_loss_mlp": 0.01260461, + "epoch": 0.20916879603186533, + "flos": 14324444640000.0, + "grad_norm": 1.9959140715838508, + "language_loss": 0.85550553, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93373156, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21813965, + "step": 3479, + "time_per_iteration": 2.5808637142181396 + }, + { + "auxiliary_loss_clip": 0.06542581, + "auxiliary_loss_mlp": 0.01279649, + "balance_loss_clip": 0.06299447, + "balance_loss_mlp": 0.01258072, + "epoch": 0.2092289192845333, + "flos": 22279537820160.0, + "grad_norm": 2.3833241848820372, + "language_loss": 0.75129831, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.82952058, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21569824, + "step": 3480, + "time_per_iteration": 2.519789218902588 + }, + { + "auxiliary_loss_clip": 0.06546038, + "auxiliary_loss_mlp": 0.01278892, + "balance_loss_clip": 0.06306421, + "balance_loss_mlp": 0.01258495, + "epoch": 0.20928904253720126, + "flos": 14836118797440.0, + "grad_norm": 2.1621149118450163, + "language_loss": 0.7689389, + "learning_rate": 3.67217151746346e-06, + "loss": 0.84718817, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20410156, + "step": 3481, + "time_per_iteration": 2.541019916534424 + }, + { + "auxiliary_loss_clip": 0.06542054, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.06299154, + "balance_loss_mlp": 0.01257718, + "epoch": 0.20934916578986923, + "flos": 23266017538560.0, + "grad_norm": 1.9029543431357738, + "language_loss": 0.85756385, + "learning_rate": 3.671957827563209e-06, + "loss": 0.93578184, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.22021484, + "step": 3482, + "time_per_iteration": 2.57550048828125 + }, + { + "auxiliary_loss_clip": 0.06538534, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 0.0629866, + "balance_loss_mlp": 0.01260237, + "epoch": 0.2094092890425372, + "flos": 32022492768000.0, + "grad_norm": 2.0122422455266076, + "language_loss": 0.71876764, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.79696846, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.21325684, + "step": 3483, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.06543796, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125567, + "epoch": 0.20946941229520516, + "flos": 20016744710400.0, + "grad_norm": 1.623254768822543, + "language_loss": 0.75620067, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.83441281, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21728516, + "step": 3484, + "time_per_iteration": 2.537745714187622 + }, + { + "auxiliary_loss_clip": 0.06537648, + "auxiliary_loss_mlp": 0.01274667, + "balance_loss_clip": 0.0629506, + "balance_loss_mlp": 0.01252733, + "epoch": 0.20952953554787315, + "flos": 30748401509760.0, + "grad_norm": 1.6710062021876058, + "language_loss": 0.71473777, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.79286093, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21936035, + "step": 3485, + "time_per_iteration": 2.6310439109802246 + }, + { + "auxiliary_loss_clip": 0.0654947, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06304678, + "balance_loss_mlp": 0.01258517, + "epoch": 0.20958965880054112, + "flos": 27055950837120.0, + "grad_norm": 1.7793136829828902, + "language_loss": 0.83105123, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90936482, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23376465, + "step": 3486, + "time_per_iteration": 2.5819222927093506 + }, + { + "auxiliary_loss_clip": 0.06539689, + "auxiliary_loss_mlp": 0.01279221, + "balance_loss_clip": 0.06297638, + "balance_loss_mlp": 0.01257978, + "epoch": 0.20964978205320908, + "flos": 34212680714880.0, + "grad_norm": 2.582218695391969, + "language_loss": 0.87821579, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.95640486, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21240234, + "step": 3487, + "time_per_iteration": 2.639369487762451 + }, + { + "auxiliary_loss_clip": 0.06538714, + "auxiliary_loss_mlp": 0.01279661, + "balance_loss_clip": 0.06298582, + "balance_loss_mlp": 0.01258227, + "epoch": 0.20970990530587705, + "flos": 23484168443520.0, + "grad_norm": 2.287931950731532, + "language_loss": 0.72719586, + "learning_rate": 3.670674357028504e-06, + "loss": 0.80537963, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21411133, + "step": 3488, + "time_per_iteration": 3.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.06540683, + "auxiliary_loss_mlp": 0.01275293, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01255123, + "epoch": 0.209770028558545, + "flos": 18557346147840.0, + "grad_norm": 2.67396224290917, + "language_loss": 0.81189376, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89005351, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20178223, + "step": 3489, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.0654545, + "auxiliary_loss_mlp": 0.01278304, + "balance_loss_clip": 0.06303608, + "balance_loss_mlp": 0.0125724, + "epoch": 0.20983015181121298, + "flos": 21623533804800.0, + "grad_norm": 2.0567102060198743, + "language_loss": 0.73407692, + "learning_rate": 3.670246026613266e-06, + "loss": 0.81231445, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21057129, + "step": 3490, + "time_per_iteration": 2.5622947216033936 + }, + { + "auxiliary_loss_clip": 0.06534347, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06300151, + "balance_loss_mlp": 0.01260128, + "epoch": 0.20989027506388094, + "flos": 16619787861120.0, + "grad_norm": 1.7677892351641744, + "language_loss": 0.71503973, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.7931931, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20849609, + "step": 3491, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.06542461, + "auxiliary_loss_mlp": 0.01283797, + "balance_loss_clip": 0.0629908, + "balance_loss_mlp": 0.01260957, + "epoch": 0.20995039831654894, + "flos": 23222692177920.0, + "grad_norm": 2.702657778988086, + "language_loss": 0.80329478, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88155735, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22839355, + "step": 3492, + "time_per_iteration": 2.5376975536346436 + }, + { + "auxiliary_loss_clip": 0.06546506, + "auxiliary_loss_mlp": 0.01283519, + "balance_loss_clip": 0.06307527, + "balance_loss_mlp": 0.01262741, + "epoch": 0.2100105215692169, + "flos": 18152881689600.0, + "grad_norm": 1.9319737068083613, + "language_loss": 0.87613726, + "learning_rate": 3.669603055991502e-06, + "loss": 0.95443749, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20800781, + "step": 3493, + "time_per_iteration": 2.5462660789489746 + }, + { + "auxiliary_loss_clip": 0.06538918, + "auxiliary_loss_mlp": 0.01283808, + "balance_loss_clip": 0.06303683, + "balance_loss_mlp": 0.01262673, + "epoch": 0.21007064482188487, + "flos": 15967179936000.0, + "grad_norm": 1.7380368048158776, + "language_loss": 0.69753766, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.77576494, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.21130371, + "step": 3494, + "time_per_iteration": 2.523575782775879 + }, + { + "auxiliary_loss_clip": 0.0654956, + "auxiliary_loss_mlp": 0.0128408, + "balance_loss_clip": 0.06306064, + "balance_loss_mlp": 0.01262598, + "epoch": 0.21013076807455283, + "flos": 32242614243840.0, + "grad_norm": 1.6795437076377473, + "language_loss": 0.79639518, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.87473154, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21472168, + "step": 3495, + "time_per_iteration": 2.679564952850342 + }, + { + "auxiliary_loss_clip": 0.06543255, + "auxiliary_loss_mlp": 0.01280683, + "balance_loss_clip": 0.06300748, + "balance_loss_mlp": 0.01258832, + "epoch": 0.2101908913272208, + "flos": 23703493305600.0, + "grad_norm": 2.110842443067005, + "language_loss": 0.77733672, + "learning_rate": 3.668959515566116e-06, + "loss": 0.85557616, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21862793, + "step": 3496, + "time_per_iteration": 2.5728261470794678 + }, + { + "auxiliary_loss_clip": 0.06546371, + "auxiliary_loss_mlp": 0.01280297, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257993, + "epoch": 0.21025101457988876, + "flos": 20381992657920.0, + "grad_norm": 2.1840810602746643, + "language_loss": 0.82214069, + "learning_rate": 3.668744875505915e-06, + "loss": 0.90040743, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22302246, + "step": 3497, + "time_per_iteration": 5.435751438140869 + }, + { + "auxiliary_loss_clip": 0.06554863, + "auxiliary_loss_mlp": 0.01281759, + "balance_loss_clip": 0.06307989, + "balance_loss_mlp": 0.01259205, + "epoch": 0.21031113783255675, + "flos": 25782740046720.0, + "grad_norm": 1.9653925911520136, + "language_loss": 0.68009126, + "learning_rate": 3.668530172166741e-06, + "loss": 0.75845742, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3498, + "time_per_iteration": 2.6047511100769043 + }, + { + "auxiliary_loss_clip": 0.06550896, + "auxiliary_loss_mlp": 0.01291723, + "balance_loss_clip": 0.06304521, + "balance_loss_mlp": 0.01269789, + "epoch": 0.21037126108522472, + "flos": 22024769880960.0, + "grad_norm": 1.5964372308761317, + "language_loss": 0.81248403, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.89091027, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21948242, + "step": 3499, + "time_per_iteration": 2.5279107093811035 + }, + { + "auxiliary_loss_clip": 0.06537838, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 0.06300277, + "balance_loss_mlp": 0.01257911, + "epoch": 0.21043138433789269, + "flos": 25340861940480.0, + "grad_norm": 2.3111316875342274, + "language_loss": 0.78733355, + "learning_rate": 3.668100575684043e-06, + "loss": 0.86549306, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20214844, + "step": 3500, + "time_per_iteration": 2.5789358615875244 + }, + { + "auxiliary_loss_clip": 0.06548081, + "auxiliary_loss_mlp": 0.01281815, + "balance_loss_clip": 0.06307902, + "balance_loss_mlp": 0.01259809, + "epoch": 0.21049150759056065, + "flos": 25563708673920.0, + "grad_norm": 1.5222387073827752, + "language_loss": 0.74519855, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82349753, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.22021484, + "step": 3501, + "time_per_iteration": 2.5740344524383545 + }, + { + "auxiliary_loss_clip": 0.06532234, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06293183, + "balance_loss_mlp": 0.01258521, + "epoch": 0.21055163084322862, + "flos": 24501982389120.0, + "grad_norm": 1.5726278305934103, + "language_loss": 0.75732303, + "learning_rate": 3.667670726183183e-06, + "loss": 0.83544195, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.21142578, + "step": 3502, + "time_per_iteration": 2.564650535583496 + }, + { + "auxiliary_loss_clip": 0.06532737, + "auxiliary_loss_mlp": 0.01282141, + "balance_loss_clip": 0.06294994, + "balance_loss_mlp": 0.01260731, + "epoch": 0.21061175409589658, + "flos": 25746123012480.0, + "grad_norm": 2.0578640076956165, + "language_loss": 0.78642297, + "learning_rate": 3.667455706571316e-06, + "loss": 0.86457181, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.21411133, + "step": 3503, + "time_per_iteration": 2.5651087760925293 + }, + { + "auxiliary_loss_clip": 0.06548393, + "auxiliary_loss_mlp": 0.01287579, + "balance_loss_clip": 0.06300595, + "balance_loss_mlp": 0.01262426, + "epoch": 0.21067187734856455, + "flos": 18995115404160.0, + "grad_norm": 2.3829290271278363, + "language_loss": 0.79109055, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.86945021, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.25134277, + "step": 3504, + "time_per_iteration": 2.5907576084136963 + }, + { + "auxiliary_loss_clip": 0.06540846, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 0.06295908, + "balance_loss_mlp": 0.012561, + "epoch": 0.21073200060123254, + "flos": 24688337869440.0, + "grad_norm": 2.6276986020802386, + "language_loss": 0.77414715, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.85233212, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.21557617, + "step": 3505, + "time_per_iteration": 2.564504861831665 + }, + { + "auxiliary_loss_clip": 0.06529057, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06294015, + "balance_loss_mlp": 0.01257186, + "epoch": 0.2107921238539005, + "flos": 28557039605760.0, + "grad_norm": 2.0513581673642434, + "language_loss": 0.64351165, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.721578, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.20397949, + "step": 3506, + "time_per_iteration": 2.641390323638916 + }, + { + "auxiliary_loss_clip": 0.06535215, + "auxiliary_loss_mlp": 0.01278768, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01257656, + "epoch": 0.21085224710656847, + "flos": 25893094273920.0, + "grad_norm": 2.3889311598286436, + "language_loss": 0.82716179, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90530163, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21105957, + "step": 3507, + "time_per_iteration": 2.5718142986297607 + }, + { + "auxiliary_loss_clip": 0.06534198, + "auxiliary_loss_mlp": 0.01280018, + "balance_loss_clip": 0.06294642, + "balance_loss_mlp": 0.0125769, + "epoch": 0.21091237035923643, + "flos": 14981664539520.0, + "grad_norm": 1.9856074738329712, + "language_loss": 0.76547742, + "learning_rate": 3.666379660223824e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22338867, + "step": 3508, + "time_per_iteration": 2.5104117393493652 + }, + { + "auxiliary_loss_clip": 0.06543706, + "auxiliary_loss_mlp": 0.01282498, + "balance_loss_clip": 0.06299506, + "balance_loss_mlp": 0.01261159, + "epoch": 0.2109724936119044, + "flos": 16368080595840.0, + "grad_norm": 2.529935640705384, + "language_loss": 0.86242574, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.94068778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.21325684, + "step": 3509, + "time_per_iteration": 2.508370876312256 + }, + { + "auxiliary_loss_clip": 0.06541994, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 0.06295836, + "balance_loss_mlp": 0.01258679, + "epoch": 0.21103261686457236, + "flos": 31510315486080.0, + "grad_norm": 1.7053981088389916, + "language_loss": 0.68853724, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.76676404, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22009277, + "step": 3510, + "time_per_iteration": 2.6452746391296387 + }, + { + "auxiliary_loss_clip": 0.06542882, + "auxiliary_loss_mlp": 0.01284418, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01263259, + "epoch": 0.21109274011724033, + "flos": 27351360806400.0, + "grad_norm": 1.7932280077203222, + "language_loss": 0.7352736, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.8135466, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.21154785, + "step": 3511, + "time_per_iteration": 2.6538095474243164 + }, + { + "auxiliary_loss_clip": 0.06553793, + "auxiliary_loss_mlp": 0.01288613, + "balance_loss_clip": 0.06308056, + "balance_loss_mlp": 0.01265546, + "epoch": 0.21115286336990832, + "flos": 17825927857920.0, + "grad_norm": 2.4490749473958577, + "language_loss": 0.70309734, + "learning_rate": 3.665517685689794e-06, + "loss": 0.78152132, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.23071289, + "step": 3512, + "time_per_iteration": 2.5178020000457764 + }, + { + "auxiliary_loss_clip": 0.06542063, + "auxiliary_loss_mlp": 0.01280138, + "balance_loss_clip": 0.06299283, + "balance_loss_mlp": 0.01257739, + "epoch": 0.2112129866225763, + "flos": 27205228085760.0, + "grad_norm": 1.580176351931222, + "language_loss": 0.73930323, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81752527, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22412109, + "step": 3513, + "time_per_iteration": 2.62662410736084 + }, + { + "auxiliary_loss_clip": 0.06537203, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 0.06301522, + "balance_loss_mlp": 0.01260303, + "epoch": 0.21127310987524425, + "flos": 23737846279680.0, + "grad_norm": 1.7494748899805272, + "language_loss": 0.75353736, + "learning_rate": 3.665086319450502e-06, + "loss": 0.8317222, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20983887, + "step": 3514, + "time_per_iteration": 2.584502696990967 + }, + { + "auxiliary_loss_clip": 0.06546184, + "auxiliary_loss_mlp": 0.01281455, + "balance_loss_clip": 0.06301809, + "balance_loss_mlp": 0.01261309, + "epoch": 0.21133323312791222, + "flos": 18338356702080.0, + "grad_norm": 1.6761924057980855, + "language_loss": 0.77322358, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.85149997, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20141602, + "step": 3515, + "time_per_iteration": 2.552231550216675 + }, + { + "auxiliary_loss_clip": 0.06544478, + "auxiliary_loss_mlp": 0.0128088, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.01260865, + "epoch": 0.21139335638058018, + "flos": 17936994844800.0, + "grad_norm": 2.0687526262765212, + "language_loss": 0.69083852, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.76909214, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19995117, + "step": 3516, + "time_per_iteration": 2.535282611846924 + }, + { + "auxiliary_loss_clip": 0.0654862, + "auxiliary_loss_mlp": 0.01279905, + "balance_loss_clip": 0.0630609, + "balance_loss_mlp": 0.01257756, + "epoch": 0.21145347963324815, + "flos": 24579073745280.0, + "grad_norm": 1.818548989117399, + "language_loss": 0.85523438, + "learning_rate": 3.664438796560225e-06, + "loss": 0.93351966, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.22155762, + "step": 3517, + "time_per_iteration": 2.5862202644348145 + }, + { + "auxiliary_loss_clip": 0.06554718, + "auxiliary_loss_mlp": 0.01280908, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01260368, + "epoch": 0.21151360288591614, + "flos": 35854787105280.0, + "grad_norm": 2.178791897783965, + "language_loss": 0.6333189, + "learning_rate": 3.664222829354512e-06, + "loss": 0.71167523, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.20532227, + "step": 3518, + "time_per_iteration": 2.6618587970733643 + }, + { + "auxiliary_loss_clip": 0.0654604, + "auxiliary_loss_mlp": 0.0129195, + "balance_loss_clip": 0.06306089, + "balance_loss_mlp": 0.01271625, + "epoch": 0.2115737261385841, + "flos": 24647989328640.0, + "grad_norm": 1.8588369306942552, + "language_loss": 0.90024757, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97862744, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20336914, + "step": 3519, + "time_per_iteration": 2.5962281227111816 + }, + { + "auxiliary_loss_clip": 0.06553498, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01268945, + "epoch": 0.21163384939125207, + "flos": 25233652241280.0, + "grad_norm": 1.74321759448714, + "language_loss": 0.81933582, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.89777905, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.21862793, + "step": 3520, + "time_per_iteration": 2.6036746501922607 + }, + { + "auxiliary_loss_clip": 0.06544603, + "auxiliary_loss_mlp": 0.0127827, + "balance_loss_clip": 0.0630887, + "balance_loss_mlp": 0.01257576, + "epoch": 0.21169397264392004, + "flos": 26074670071680.0, + "grad_norm": 1.5989262406015683, + "language_loss": 0.76731956, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.84554833, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20690918, + "step": 3521, + "time_per_iteration": 2.613945960998535 + }, + { + "auxiliary_loss_clip": 0.06548078, + "auxiliary_loss_mlp": 0.01281462, + "balance_loss_clip": 0.06310651, + "balance_loss_mlp": 0.01261364, + "epoch": 0.211754095896588, + "flos": 23114266594560.0, + "grad_norm": 2.104686387571933, + "language_loss": 0.75886559, + "learning_rate": 3.663358329538626e-06, + "loss": 0.83716094, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.20092773, + "step": 3522, + "time_per_iteration": 2.530388355255127 + }, + { + "auxiliary_loss_clip": 0.06550008, + "auxiliary_loss_mlp": 0.01276271, + "balance_loss_clip": 0.06309568, + "balance_loss_mlp": 0.01255994, + "epoch": 0.21181421914925597, + "flos": 27928806019200.0, + "grad_norm": 2.55069435165465, + "language_loss": 0.71218652, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79044926, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.20288086, + "step": 3523, + "time_per_iteration": 2.6448264122009277 + }, + { + "auxiliary_loss_clip": 0.06544726, + "auxiliary_loss_mlp": 0.01276969, + "balance_loss_clip": 0.06308427, + "balance_loss_mlp": 0.01256191, + "epoch": 0.21187434240192393, + "flos": 17134313057280.0, + "grad_norm": 2.0846198886990566, + "language_loss": 0.77930927, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8575263, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20788574, + "step": 3524, + "time_per_iteration": 2.527096748352051 + }, + { + "auxiliary_loss_clip": 0.06557429, + "auxiliary_loss_mlp": 0.01277075, + "balance_loss_clip": 0.0631334, + "balance_loss_mlp": 0.01255045, + "epoch": 0.21193446565459192, + "flos": 22354071626880.0, + "grad_norm": 2.138137470282545, + "language_loss": 0.82111794, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89946306, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22033691, + "step": 3525, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.06547971, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 0.06308704, + "balance_loss_mlp": 0.01254519, + "epoch": 0.2119945889072599, + "flos": 27206779386240.0, + "grad_norm": 1.7514877674009408, + "language_loss": 0.75671291, + "learning_rate": 3.662492820527356e-06, + "loss": 0.83494115, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20324707, + "step": 3526, + "time_per_iteration": 2.56286883354187 + }, + { + "auxiliary_loss_clip": 0.06556675, + "auxiliary_loss_mlp": 0.01279028, + "balance_loss_clip": 0.0631361, + "balance_loss_mlp": 0.01258107, + "epoch": 0.21205471215992786, + "flos": 20997480424320.0, + "grad_norm": 1.9989732630407808, + "language_loss": 0.77276337, + "learning_rate": 3.662276285649284e-06, + "loss": 0.85112035, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.20910645, + "step": 3527, + "time_per_iteration": 2.7162973880767822 + }, + { + "auxiliary_loss_clip": 0.06551696, + "auxiliary_loss_mlp": 0.01279873, + "balance_loss_clip": 0.06314081, + "balance_loss_mlp": 0.01258224, + "epoch": 0.21211483541259582, + "flos": 20784025347840.0, + "grad_norm": 2.0427089539116783, + "language_loss": 0.78184944, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86016512, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21643066, + "step": 3528, + "time_per_iteration": 3.990530490875244 + }, + { + "auxiliary_loss_clip": 0.06551792, + "auxiliary_loss_mlp": 0.01277875, + "balance_loss_clip": 0.06313196, + "balance_loss_mlp": 0.01257025, + "epoch": 0.21217495866526379, + "flos": 18996079726080.0, + "grad_norm": 1.942993331862389, + "language_loss": 0.82054245, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89883912, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20861816, + "step": 3529, + "time_per_iteration": 2.564383029937744 + }, + { + "auxiliary_loss_clip": 0.06555474, + "auxiliary_loss_mlp": 0.01278138, + "balance_loss_clip": 0.06313926, + "balance_loss_mlp": 0.01257134, + "epoch": 0.21223508191793175, + "flos": 20673503412480.0, + "grad_norm": 2.2777790477523236, + "language_loss": 0.77694297, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.85527909, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21008301, + "step": 3530, + "time_per_iteration": 2.576662540435791 + }, + { + "auxiliary_loss_clip": 0.06550869, + "auxiliary_loss_mlp": 0.01274157, + "balance_loss_clip": 0.06314521, + "balance_loss_mlp": 0.01254106, + "epoch": 0.21229520517059972, + "flos": 21622904899200.0, + "grad_norm": 2.3150689342230644, + "language_loss": 0.83926791, + "learning_rate": 3.661409515882308e-06, + "loss": 0.91751814, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20043945, + "step": 3531, + "time_per_iteration": 4.092180252075195 + }, + { + "auxiliary_loss_clip": 0.06553733, + "auxiliary_loss_mlp": 0.01280648, + "balance_loss_clip": 0.06313696, + "balance_loss_mlp": 0.0125888, + "epoch": 0.2123553284232677, + "flos": 13996232997120.0, + "grad_norm": 2.2553338764718145, + "language_loss": 0.74256229, + "learning_rate": 3.661192665917977e-06, + "loss": 0.82090604, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21777344, + "step": 3532, + "time_per_iteration": 2.5215070247650146 + }, + { + "auxiliary_loss_clip": 0.06549011, + "auxiliary_loss_mlp": 0.01276957, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.01255714, + "epoch": 0.21241545167593567, + "flos": 18302745916800.0, + "grad_norm": 1.8963653738624293, + "language_loss": 0.74378759, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82204729, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21252441, + "step": 3533, + "time_per_iteration": 2.5286645889282227 + }, + { + "auxiliary_loss_clip": 0.06554842, + "auxiliary_loss_mlp": 0.01279741, + "balance_loss_clip": 0.06312128, + "balance_loss_mlp": 0.01257341, + "epoch": 0.21247557492860364, + "flos": 34721461906560.0, + "grad_norm": 1.8118406193913599, + "language_loss": 0.71620667, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79455251, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22399902, + "step": 3534, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.06548804, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01262586, + "epoch": 0.2125356981812716, + "flos": 22060254885120.0, + "grad_norm": 2.3502862502903046, + "language_loss": 0.72866982, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.80699402, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21032715, + "step": 3535, + "time_per_iteration": 2.5843448638916016 + }, + { + "auxiliary_loss_clip": 0.06546953, + "auxiliary_loss_mlp": 0.01279722, + "balance_loss_clip": 0.06307133, + "balance_loss_mlp": 0.01257621, + "epoch": 0.21259582143393957, + "flos": 28555865648640.0, + "grad_norm": 2.199655139190772, + "language_loss": 0.70759106, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7858578, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22106934, + "step": 3536, + "time_per_iteration": 4.056318998336792 + }, + { + "auxiliary_loss_clip": 0.06557733, + "auxiliary_loss_mlp": 0.01286072, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.0126415, + "epoch": 0.21265594468660753, + "flos": 20127140864640.0, + "grad_norm": 2.2134041941920897, + "language_loss": 0.8820163, + "learning_rate": 3.660107471371981e-06, + "loss": 0.96045434, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.21911621, + "step": 3537, + "time_per_iteration": 2.6233468055725098 + }, + { + "auxiliary_loss_clip": 0.06541121, + "auxiliary_loss_mlp": 0.01278147, + "balance_loss_clip": 0.06304413, + "balance_loss_mlp": 0.01256094, + "epoch": 0.21271606793927553, + "flos": 23082890440320.0, + "grad_norm": 1.7848498720134809, + "language_loss": 0.81086004, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88905263, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22058105, + "step": 3538, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.06545715, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 0.06305592, + "balance_loss_mlp": 0.01263981, + "epoch": 0.2127761911919435, + "flos": 26394118963200.0, + "grad_norm": 2.023826748108625, + "language_loss": 0.87817419, + "learning_rate": 3.659672952835863e-06, + "loss": 0.95646858, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.19763184, + "step": 3539, + "time_per_iteration": 2.6115527153015137 + }, + { + "auxiliary_loss_clip": 0.06554011, + "auxiliary_loss_mlp": 0.01284638, + "balance_loss_clip": 0.06309317, + "balance_loss_mlp": 0.01264277, + "epoch": 0.21283631444461146, + "flos": 20234182855680.0, + "grad_norm": 3.1687626880856667, + "language_loss": 0.59144789, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66983438, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20361328, + "step": 3540, + "time_per_iteration": 2.525139570236206 + }, + { + "auxiliary_loss_clip": 0.06543202, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.0630211, + "balance_loss_mlp": 0.01256557, + "epoch": 0.21289643769727942, + "flos": 13522140195840.0, + "grad_norm": 1.940296770056649, + "language_loss": 0.7721082, + "learning_rate": 3.659238182559888e-06, + "loss": 0.85032547, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21972656, + "step": 3541, + "time_per_iteration": 2.563164234161377 + }, + { + "auxiliary_loss_clip": 0.06542824, + "auxiliary_loss_mlp": 0.01283205, + "balance_loss_clip": 0.06305471, + "balance_loss_mlp": 0.01262486, + "epoch": 0.2129565609499474, + "flos": 24833967465600.0, + "grad_norm": 1.7979798329536472, + "language_loss": 0.69596064, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.77422094, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20703125, + "step": 3542, + "time_per_iteration": 2.6213386058807373 + }, + { + "auxiliary_loss_clip": 0.06542216, + "auxiliary_loss_mlp": 0.01284362, + "balance_loss_clip": 0.0630642, + "balance_loss_mlp": 0.01264692, + "epoch": 0.21301668420261535, + "flos": 23665953876480.0, + "grad_norm": 1.8238030340304547, + "language_loss": 0.77012485, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84839058, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19677734, + "step": 3543, + "time_per_iteration": 2.5654232501983643 + }, + { + "auxiliary_loss_clip": 0.0654586, + "auxiliary_loss_mlp": 0.01282767, + "balance_loss_clip": 0.0630815, + "balance_loss_mlp": 0.01261488, + "epoch": 0.21307680745528332, + "flos": 16368416012160.0, + "grad_norm": 2.0315626098903468, + "language_loss": 0.67305464, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75134087, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2130127, + "step": 3544, + "time_per_iteration": 2.513288736343384 + }, + { + "auxiliary_loss_clip": 0.06542834, + "auxiliary_loss_mlp": 0.01284, + "balance_loss_clip": 0.06304078, + "balance_loss_mlp": 0.01264223, + "epoch": 0.2131369307079513, + "flos": 19105092288000.0, + "grad_norm": 1.7034786511890583, + "language_loss": 0.71322483, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.19763184, + "step": 3545, + "time_per_iteration": 2.5347442626953125 + }, + { + "auxiliary_loss_clip": 0.06549121, + "auxiliary_loss_mlp": 0.01288311, + "balance_loss_clip": 0.06306408, + "balance_loss_mlp": 0.01268224, + "epoch": 0.21319705396061928, + "flos": 30380050961280.0, + "grad_norm": 2.304335172733059, + "language_loss": 0.73178399, + "learning_rate": 3.658150155940946e-06, + "loss": 0.81015837, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.20092773, + "step": 3546, + "time_per_iteration": 2.6647720336914062 + }, + { + "auxiliary_loss_clip": 0.0655164, + "auxiliary_loss_mlp": 0.01278696, + "balance_loss_clip": 0.06310475, + "balance_loss_mlp": 0.01258609, + "epoch": 0.21325717721328724, + "flos": 21761616533760.0, + "grad_norm": 1.9338253687785023, + "language_loss": 0.81206107, + "learning_rate": 3.657932361952479e-06, + "loss": 0.89036447, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20092773, + "step": 3547, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06547703, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 0.06302875, + "balance_loss_mlp": 0.01259127, + "epoch": 0.2133173004659552, + "flos": 28738447695360.0, + "grad_norm": 3.206018032759459, + "language_loss": 0.74960929, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22521973, + "step": 3548, + "time_per_iteration": 2.605151414871216 + }, + { + "auxiliary_loss_clip": 0.06554648, + "auxiliary_loss_mlp": 0.01281207, + "balance_loss_clip": 0.06309359, + "balance_loss_mlp": 0.01259236, + "epoch": 0.21337742371862317, + "flos": 16842760375680.0, + "grad_norm": 2.056331081084102, + "language_loss": 0.74889886, + "learning_rate": 3.657496585376922e-06, + "loss": 0.82725745, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21972656, + "step": 3549, + "time_per_iteration": 2.518305540084839 + }, + { + "auxiliary_loss_clip": 0.06547625, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01261278, + "epoch": 0.21343754697129114, + "flos": 24431683213440.0, + "grad_norm": 1.7052192349692608, + "language_loss": 0.8095907, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88787764, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19787598, + "step": 3550, + "time_per_iteration": 2.621840715408325 + }, + { + "auxiliary_loss_clip": 0.06544942, + "auxiliary_loss_mlp": 0.01278049, + "balance_loss_clip": 0.06309815, + "balance_loss_mlp": 0.01258653, + "epoch": 0.21349767022395913, + "flos": 19283271995520.0, + "grad_norm": 1.8011583081598594, + "language_loss": 0.88582718, + "learning_rate": 3.657060557391621e-06, + "loss": 0.96405709, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.19384766, + "step": 3551, + "time_per_iteration": 2.5354909896850586 + }, + { + "auxiliary_loss_clip": 0.06541884, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06304973, + "balance_loss_mlp": 0.01256635, + "epoch": 0.2135577934766271, + "flos": 17353260576000.0, + "grad_norm": 1.8291964059748265, + "language_loss": 0.83669794, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91488564, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20275879, + "step": 3552, + "time_per_iteration": 2.5428099632263184 + }, + { + "auxiliary_loss_clip": 0.06543534, + "auxiliary_loss_mlp": 0.01282655, + "balance_loss_clip": 0.06305505, + "balance_loss_mlp": 0.01261329, + "epoch": 0.21361791672929506, + "flos": 24063416519040.0, + "grad_norm": 1.71251087169846, + "language_loss": 0.77181637, + "learning_rate": 3.656624278062713e-06, + "loss": 0.85007823, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21325684, + "step": 3553, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.06546006, + "auxiliary_loss_mlp": 0.01280965, + "balance_loss_clip": 0.06308904, + "balance_loss_mlp": 0.01260556, + "epoch": 0.21367803998196302, + "flos": 22168596614400.0, + "grad_norm": 1.6386548216082337, + "language_loss": 0.72918522, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.80745488, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20397949, + "step": 3554, + "time_per_iteration": 2.610447883605957 + }, + { + "auxiliary_loss_clip": 0.06543835, + "auxiliary_loss_mlp": 0.01296522, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01276483, + "epoch": 0.213738163234631, + "flos": 20893205617920.0, + "grad_norm": 2.167468133085416, + "language_loss": 0.6838634, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.76226699, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20043945, + "step": 3555, + "time_per_iteration": 2.6348068714141846 + }, + { + "auxiliary_loss_clip": 0.06544648, + "auxiliary_loss_mlp": 0.01283651, + "balance_loss_clip": 0.06303324, + "balance_loss_mlp": 0.01262861, + "epoch": 0.21379828648729896, + "flos": 28410739176960.0, + "grad_norm": 1.8068010568670265, + "language_loss": 0.6581043, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73638725, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.20800781, + "step": 3556, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.06542179, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.01269905, + "epoch": 0.21385840973996692, + "flos": 25486030339200.0, + "grad_norm": 1.6965425102308196, + "language_loss": 0.73263884, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20617676, + "step": 3557, + "time_per_iteration": 2.5850143432617188 + }, + { + "auxiliary_loss_clip": 0.06555384, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06310774, + "balance_loss_mlp": 0.01260814, + "epoch": 0.2139185329926349, + "flos": 28081772847360.0, + "grad_norm": 1.6861756161591135, + "language_loss": 0.67894918, + "learning_rate": 3.655532480546528e-06, + "loss": 0.75732636, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.21508789, + "step": 3558, + "time_per_iteration": 2.6937482357025146 + }, + { + "auxiliary_loss_clip": 0.06554736, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06306359, + "balance_loss_mlp": 0.0125905, + "epoch": 0.21397865624530288, + "flos": 19614628166400.0, + "grad_norm": 2.1418574307637575, + "language_loss": 0.81358159, + "learning_rate": 3.655313932676286e-06, + "loss": 0.89191854, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.19909668, + "step": 3559, + "time_per_iteration": 2.5145814418792725 + }, + { + "auxiliary_loss_clip": 0.06551723, + "auxiliary_loss_mlp": 0.01281472, + "balance_loss_clip": 0.06314635, + "balance_loss_mlp": 0.01262899, + "epoch": 0.21403877949797084, + "flos": 24688463650560.0, + "grad_norm": 1.6715073288493136, + "language_loss": 0.68710625, + "learning_rate": 3.655095322036373e-06, + "loss": 0.7654382, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.18554688, + "step": 3560, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06554615, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 0.0631121, + "balance_loss_mlp": 0.01259313, + "epoch": 0.2140989027506388, + "flos": 19866628920960.0, + "grad_norm": 1.9885830979576231, + "language_loss": 0.73618603, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81452787, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.20263672, + "step": 3561, + "time_per_iteration": 2.5286123752593994 + }, + { + "auxiliary_loss_clip": 0.06553814, + "auxiliary_loss_mlp": 0.01282143, + "balance_loss_clip": 0.06311779, + "balance_loss_mlp": 0.01262402, + "epoch": 0.21415902600330677, + "flos": 19141331978880.0, + "grad_norm": 2.350872095274855, + "language_loss": 0.78756285, + "learning_rate": 3.654657912480698e-06, + "loss": 0.86592233, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.19763184, + "step": 3562, + "time_per_iteration": 2.608041286468506 + }, + { + "auxiliary_loss_clip": 0.06546983, + "auxiliary_loss_mlp": 0.01281911, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01261788, + "epoch": 0.21421914925597474, + "flos": 22279076622720.0, + "grad_norm": 1.5018972458321598, + "language_loss": 0.85257983, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.93086874, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20117188, + "step": 3563, + "time_per_iteration": 2.5593912601470947 + }, + { + "auxiliary_loss_clip": 0.06548097, + "auxiliary_loss_mlp": 0.01281509, + "balance_loss_clip": 0.06308593, + "balance_loss_mlp": 0.01262531, + "epoch": 0.2142792725086427, + "flos": 33883504750080.0, + "grad_norm": 1.9248219523503745, + "language_loss": 0.76925778, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.84755385, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.18981934, + "step": 3564, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.06542072, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305549, + "balance_loss_mlp": 0.01261181, + "epoch": 0.2143393957613107, + "flos": 19865538817920.0, + "grad_norm": 1.690691453330226, + "language_loss": 0.89139843, + "learning_rate": 3.654001327581981e-06, + "loss": 0.9696207, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.18969727, + "step": 3565, + "time_per_iteration": 2.660306215286255 + }, + { + "auxiliary_loss_clip": 0.06436334, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 0.06303974, + "balance_loss_mlp": 0.01279924, + "epoch": 0.21439951901397866, + "flos": 68549300017920.0, + "grad_norm": 0.8225285981700966, + "language_loss": 0.52211988, + "learning_rate": 3.653782340498215e-06, + "loss": 0.59934968, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.06738281, + "step": 3566, + "time_per_iteration": 3.0845720767974854 + }, + { + "auxiliary_loss_clip": 0.06539588, + "auxiliary_loss_mlp": 0.01284533, + "balance_loss_clip": 0.06306818, + "balance_loss_mlp": 0.0126478, + "epoch": 0.21445964226664663, + "flos": 19689161973120.0, + "grad_norm": 1.8060006281631265, + "language_loss": 0.68295264, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.76119387, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19775391, + "step": 3567, + "time_per_iteration": 2.5250415802001953 + }, + { + "auxiliary_loss_clip": 0.06543978, + "auxiliary_loss_mlp": 0.01283364, + "balance_loss_clip": 0.06310168, + "balance_loss_mlp": 0.012641, + "epoch": 0.2145197655193146, + "flos": 31116039298560.0, + "grad_norm": 2.0548954423707753, + "language_loss": 0.75150776, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.82978123, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19250488, + "step": 3568, + "time_per_iteration": 4.018412113189697 + }, + { + "auxiliary_loss_clip": 0.06538366, + "auxiliary_loss_mlp": 0.01282205, + "balance_loss_clip": 0.063043, + "balance_loss_mlp": 0.01261773, + "epoch": 0.21457988877198256, + "flos": 20127015083520.0, + "grad_norm": 2.3975687399079284, + "language_loss": 0.78487438, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.86308008, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20446777, + "step": 3569, + "time_per_iteration": 2.6051042079925537 + }, + { + "auxiliary_loss_clip": 0.06554128, + "auxiliary_loss_mlp": 0.01283223, + "balance_loss_clip": 0.06309038, + "balance_loss_mlp": 0.01262183, + "epoch": 0.21464001202465052, + "flos": 18593963182080.0, + "grad_norm": 2.5916710851503173, + "language_loss": 0.7048617, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.78323519, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21032715, + "step": 3570, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.06548594, + "auxiliary_loss_mlp": 0.01293921, + "balance_loss_clip": 0.06305287, + "balance_loss_mlp": 0.01274621, + "epoch": 0.21470013527731852, + "flos": 21841600855680.0, + "grad_norm": 3.519297534980699, + "language_loss": 0.79412138, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.87254649, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.19311523, + "step": 3571, + "time_per_iteration": 3.984830141067505 + }, + { + "auxiliary_loss_clip": 0.06547887, + "auxiliary_loss_mlp": 0.01283536, + "balance_loss_clip": 0.06306981, + "balance_loss_mlp": 0.01263413, + "epoch": 0.21476025852998648, + "flos": 17608992837120.0, + "grad_norm": 2.1137138833129114, + "language_loss": 0.83417559, + "learning_rate": 3.652467101342991e-06, + "loss": 0.91248989, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20117188, + "step": 3572, + "time_per_iteration": 2.550900459289551 + }, + { + "auxiliary_loss_clip": 0.06544446, + "auxiliary_loss_mlp": 0.01290796, + "balance_loss_clip": 0.06300403, + "balance_loss_mlp": 0.01271114, + "epoch": 0.21482038178265445, + "flos": 24835267203840.0, + "grad_norm": 5.91831897424108, + "language_loss": 0.6534397, + "learning_rate": 3.652247675452598e-06, + "loss": 0.73179209, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.19677734, + "step": 3573, + "time_per_iteration": 2.574037551879883 + }, + { + "auxiliary_loss_clip": 0.06536618, + "auxiliary_loss_mlp": 0.01287357, + "balance_loss_clip": 0.06305118, + "balance_loss_mlp": 0.0126814, + "epoch": 0.2148805050353224, + "flos": 23264927435520.0, + "grad_norm": 1.8228372560216166, + "language_loss": 0.76129293, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83953267, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.1920166, + "step": 3574, + "time_per_iteration": 2.610541343688965 + }, + { + "auxiliary_loss_clip": 0.06537417, + "auxiliary_loss_mlp": 0.01280783, + "balance_loss_clip": 0.06298707, + "balance_loss_mlp": 0.0126066, + "epoch": 0.21494062828799038, + "flos": 21326907951360.0, + "grad_norm": 2.0935140233911644, + "language_loss": 0.72909325, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.8072753, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.20117188, + "step": 3575, + "time_per_iteration": 2.581932306289673 + }, + { + "auxiliary_loss_clip": 0.06537387, + "auxiliary_loss_mlp": 0.01288909, + "balance_loss_clip": 0.06302074, + "balance_loss_mlp": 0.01269657, + "epoch": 0.21500075154065834, + "flos": 18849276172800.0, + "grad_norm": 2.2103119968131986, + "language_loss": 0.6923548, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.77061772, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.19262695, + "step": 3576, + "time_per_iteration": 5.394233703613281 + }, + { + "auxiliary_loss_clip": 0.06547244, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 0.06304461, + "balance_loss_mlp": 0.0126069, + "epoch": 0.2150608747933263, + "flos": 18447872388480.0, + "grad_norm": 1.9274083971527407, + "language_loss": 0.89371777, + "learning_rate": 3.651369345440292e-06, + "loss": 0.97201031, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21337891, + "step": 3577, + "time_per_iteration": 2.5629777908325195 + }, + { + "auxiliary_loss_clip": 0.06425267, + "auxiliary_loss_mlp": 0.01303124, + "balance_loss_clip": 0.06298774, + "balance_loss_mlp": 0.01297548, + "epoch": 0.2151209980459943, + "flos": 66617443808640.0, + "grad_norm": 0.7978427219987446, + "language_loss": 0.56304139, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.64032531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.05581665, + "step": 3578, + "time_per_iteration": 3.0982370376586914 + }, + { + "auxiliary_loss_clip": 0.06546376, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 0.06309081, + "balance_loss_mlp": 0.0126729, + "epoch": 0.21518112129866226, + "flos": 21581633963520.0, + "grad_norm": 1.7619248126111737, + "language_loss": 0.89097106, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.19555664, + "step": 3579, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.06544919, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06304899, + "balance_loss_mlp": 0.01260498, + "epoch": 0.21524124455133023, + "flos": 20053822942080.0, + "grad_norm": 1.8548300822509616, + "language_loss": 0.78671825, + "learning_rate": 3.650709940390972e-06, + "loss": 0.86497748, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20507812, + "step": 3580, + "time_per_iteration": 2.538740634918213 + }, + { + "auxiliary_loss_clip": 0.06547832, + "auxiliary_loss_mlp": 0.01284221, + "balance_loss_clip": 0.06311843, + "balance_loss_mlp": 0.01265279, + "epoch": 0.2153013678039982, + "flos": 23958680515200.0, + "grad_norm": 2.0040984242528905, + "language_loss": 0.73520374, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.81352425, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.18933105, + "step": 3581, + "time_per_iteration": 2.5783493518829346 + }, + { + "auxiliary_loss_clip": 0.06544261, + "auxiliary_loss_mlp": 0.01283002, + "balance_loss_clip": 0.06307264, + "balance_loss_mlp": 0.01262438, + "epoch": 0.21536149105666616, + "flos": 20601107884800.0, + "grad_norm": 2.9043222851567574, + "language_loss": 0.71477044, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.79304302, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20556641, + "step": 3582, + "time_per_iteration": 2.5253281593322754 + }, + { + "auxiliary_loss_clip": 0.06553562, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06315581, + "balance_loss_mlp": 0.01262209, + "epoch": 0.21542161430933413, + "flos": 12865046077440.0, + "grad_norm": 2.5916269023447795, + "language_loss": 0.85900396, + "learning_rate": 3.650049971985889e-06, + "loss": 0.93736756, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20568848, + "step": 3583, + "time_per_iteration": 2.580411434173584 + }, + { + "auxiliary_loss_clip": 0.0655268, + "auxiliary_loss_mlp": 0.01295505, + "balance_loss_clip": 0.06312086, + "balance_loss_mlp": 0.01275561, + "epoch": 0.21548173756200212, + "flos": 26111077470720.0, + "grad_norm": 2.720923149453336, + "language_loss": 0.83510441, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.91358626, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19934082, + "step": 3584, + "time_per_iteration": 2.587843179702759 + }, + { + "auxiliary_loss_clip": 0.06549002, + "auxiliary_loss_mlp": 0.01288111, + "balance_loss_clip": 0.06314336, + "balance_loss_mlp": 0.01267667, + "epoch": 0.21554186081467008, + "flos": 22170315623040.0, + "grad_norm": 2.7712372256622357, + "language_loss": 0.91010725, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9884783, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20446777, + "step": 3585, + "time_per_iteration": 2.5638017654418945 + }, + { + "auxiliary_loss_clip": 0.06548285, + "auxiliary_loss_mlp": 0.0129374, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.012745, + "epoch": 0.21560198406733805, + "flos": 22973458608000.0, + "grad_norm": 2.0799258962001548, + "language_loss": 0.75285476, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83127499, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.19238281, + "step": 3586, + "time_per_iteration": 2.5816385746002197 + }, + { + "auxiliary_loss_clip": 0.06560329, + "auxiliary_loss_mlp": 0.01301548, + "balance_loss_clip": 0.06317623, + "balance_loss_mlp": 0.012817, + "epoch": 0.215662107320006, + "flos": 22790708853120.0, + "grad_norm": 1.7819627104594034, + "language_loss": 0.83628035, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.91489911, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.19848633, + "step": 3587, + "time_per_iteration": 2.5768468379974365 + }, + { + "auxiliary_loss_clip": 0.06549525, + "auxiliary_loss_mlp": 0.01284104, + "balance_loss_clip": 0.06311873, + "balance_loss_mlp": 0.01265114, + "epoch": 0.21572223057267398, + "flos": 30891850899840.0, + "grad_norm": 2.819752743062096, + "language_loss": 0.764575, + "learning_rate": 3.648948773354224e-06, + "loss": 0.8429113, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.18981934, + "step": 3588, + "time_per_iteration": 2.6578357219696045 + }, + { + "auxiliary_loss_clip": 0.06557232, + "auxiliary_loss_mlp": 0.01294163, + "balance_loss_clip": 0.06316121, + "balance_loss_mlp": 0.01274494, + "epoch": 0.21578235382534194, + "flos": 26918413159680.0, + "grad_norm": 3.674353356251158, + "language_loss": 0.8181411, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89665502, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.19689941, + "step": 3589, + "time_per_iteration": 2.6730964183807373 + }, + { + "auxiliary_loss_clip": 0.06560542, + "auxiliary_loss_mlp": 0.01287343, + "balance_loss_clip": 0.06321919, + "balance_loss_mlp": 0.01267959, + "epoch": 0.2158424770780099, + "flos": 24432605608320.0, + "grad_norm": 2.119721317496626, + "language_loss": 0.73323047, + "learning_rate": 3.648507856144961e-06, + "loss": 0.81170928, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.19384766, + "step": 3590, + "time_per_iteration": 2.5885848999023438 + }, + { + "auxiliary_loss_clip": 0.06554762, + "auxiliary_loss_mlp": 0.0128494, + "balance_loss_clip": 0.06310897, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2159026003306779, + "flos": 23956542236160.0, + "grad_norm": 2.0666561712978813, + "language_loss": 0.84929311, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92769015, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20544434, + "step": 3591, + "time_per_iteration": 2.5598154067993164 + }, + { + "auxiliary_loss_clip": 0.0656037, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01272776, + "epoch": 0.21596272358334587, + "flos": 30048191665920.0, + "grad_norm": 1.8943006547331833, + "language_loss": 0.69118065, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.76972699, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.21496582, + "step": 3592, + "time_per_iteration": 2.623124599456787 + }, + { + "auxiliary_loss_clip": 0.06558264, + "auxiliary_loss_mlp": 0.01284651, + "balance_loss_clip": 0.06314576, + "balance_loss_mlp": 0.01264218, + "epoch": 0.21602284683601383, + "flos": 20382495782400.0, + "grad_norm": 3.2836833125469753, + "language_loss": 0.84947151, + "learning_rate": 3.647846011515108e-06, + "loss": 0.92790061, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2043457, + "step": 3593, + "time_per_iteration": 2.5159051418304443 + }, + { + "auxiliary_loss_clip": 0.06551524, + "auxiliary_loss_mlp": 0.01289729, + "balance_loss_clip": 0.06309479, + "balance_loss_mlp": 0.01267615, + "epoch": 0.2160829700886818, + "flos": 20783648004480.0, + "grad_norm": 2.6962087820066567, + "language_loss": 0.76424301, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.84265554, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.22119141, + "step": 3594, + "time_per_iteration": 2.530874490737915 + }, + { + "auxiliary_loss_clip": 0.06549954, + "auxiliary_loss_mlp": 0.01280574, + "balance_loss_clip": 0.06313863, + "balance_loss_mlp": 0.01260189, + "epoch": 0.21614309334134976, + "flos": 22316322562560.0, + "grad_norm": 1.5622924015328905, + "language_loss": 0.80828846, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.88659382, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20385742, + "step": 3595, + "time_per_iteration": 2.5720436573028564 + }, + { + "auxiliary_loss_clip": 0.0655812, + "auxiliary_loss_mlp": 0.01282788, + "balance_loss_clip": 0.06310599, + "balance_loss_mlp": 0.01261962, + "epoch": 0.21620321659401773, + "flos": 19615592488320.0, + "grad_norm": 2.071968351759389, + "language_loss": 0.79120421, + "learning_rate": 3.647183604506897e-06, + "loss": 0.86961329, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.20825195, + "step": 3596, + "time_per_iteration": 2.529978036880493 + }, + { + "auxiliary_loss_clip": 0.06547653, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 0.06309111, + "balance_loss_mlp": 0.01258615, + "epoch": 0.2162633398466857, + "flos": 18850701692160.0, + "grad_norm": 1.8098333997433065, + "language_loss": 0.83728772, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.91556245, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.2121582, + "step": 3597, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.06559294, + "auxiliary_loss_mlp": 0.01284022, + "balance_loss_clip": 0.06315802, + "balance_loss_mlp": 0.01262421, + "epoch": 0.21632346309935369, + "flos": 18774490803840.0, + "grad_norm": 2.0845397374343655, + "language_loss": 0.81213892, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.89057213, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21606445, + "step": 3598, + "time_per_iteration": 2.517596960067749 + }, + { + "auxiliary_loss_clip": 0.06554621, + "auxiliary_loss_mlp": 0.01287936, + "balance_loss_clip": 0.06312433, + "balance_loss_mlp": 0.01265072, + "epoch": 0.21638358635202165, + "flos": 26331576289920.0, + "grad_norm": 1.6266226591192001, + "language_loss": 0.82318664, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.90161228, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22851562, + "step": 3599, + "time_per_iteration": 2.567528486251831 + }, + { + "auxiliary_loss_clip": 0.06553015, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06314674, + "balance_loss_mlp": 0.01263107, + "epoch": 0.21644370960468962, + "flos": 20747156751360.0, + "grad_norm": 2.0891036476830585, + "language_loss": 0.76652539, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.84490293, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21618652, + "step": 3600, + "time_per_iteration": 2.5642178058624268 + }, + { + "auxiliary_loss_clip": 0.06555548, + "auxiliary_loss_mlp": 0.01287253, + "balance_loss_clip": 0.06316924, + "balance_loss_mlp": 0.01267512, + "epoch": 0.21650383285735758, + "flos": 23959183639680.0, + "grad_norm": 1.8375873098897355, + "language_loss": 0.80812716, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.88655519, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.19726562, + "step": 3601, + "time_per_iteration": 2.536790132522583 + }, + { + "auxiliary_loss_clip": 0.06558496, + "auxiliary_loss_mlp": 0.01286287, + "balance_loss_clip": 0.06317312, + "balance_loss_mlp": 0.01265783, + "epoch": 0.21656395611002555, + "flos": 23702864400000.0, + "grad_norm": 1.8593805820505158, + "language_loss": 0.84205902, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2052002, + "step": 3602, + "time_per_iteration": 2.5919816493988037 + }, + { + "auxiliary_loss_clip": 0.06553967, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06313825, + "balance_loss_mlp": 0.01265371, + "epoch": 0.2166240793626935, + "flos": 20672035966080.0, + "grad_norm": 1.6537912100509087, + "language_loss": 0.75107038, + "learning_rate": 3.645635802397693e-06, + "loss": 0.82946962, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.20581055, + "step": 3603, + "time_per_iteration": 2.5602827072143555 + }, + { + "auxiliary_loss_clip": 0.06545025, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06314509, + "balance_loss_mlp": 0.0125996, + "epoch": 0.2166842026153615, + "flos": 21586916770560.0, + "grad_norm": 1.9607230977514314, + "language_loss": 0.75016356, + "learning_rate": 3.645414438132855e-06, + "loss": 0.82841063, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.1973877, + "step": 3604, + "time_per_iteration": 2.7099287509918213 + }, + { + "auxiliary_loss_clip": 0.06550605, + "auxiliary_loss_mlp": 0.01283396, + "balance_loss_clip": 0.06315283, + "balance_loss_mlp": 0.01263881, + "epoch": 0.21674432586802947, + "flos": 25637068523520.0, + "grad_norm": 1.5948705207891358, + "language_loss": 0.80732697, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.88566697, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19506836, + "step": 3605, + "time_per_iteration": 2.601269483566284 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01314575, + "balance_loss_clip": 0.0633797, + "balance_loss_mlp": 0.01307596, + "epoch": 0.21680444912069743, + "flos": 56435126376960.0, + "grad_norm": 0.68181157035555, + "language_loss": 0.58316016, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.66095698, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.06994629, + "step": 3606, + "time_per_iteration": 3.2531886100769043 + }, + { + "auxiliary_loss_clip": 0.06547002, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 0.06303971, + "balance_loss_mlp": 0.01264502, + "epoch": 0.2168645723733654, + "flos": 23885823790080.0, + "grad_norm": 1.8693102201830953, + "language_loss": 0.73682618, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22155762, + "step": 3607, + "time_per_iteration": 4.0285868644714355 + }, + { + "auxiliary_loss_clip": 0.06548688, + "auxiliary_loss_mlp": 0.01281672, + "balance_loss_clip": 0.06306184, + "balance_loss_mlp": 0.01259595, + "epoch": 0.21692469562603336, + "flos": 16951814864640.0, + "grad_norm": 1.845726065350227, + "language_loss": 0.78116572, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85946935, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22070312, + "step": 3608, + "time_per_iteration": 2.4997665882110596 + }, + { + "auxiliary_loss_clip": 0.06549841, + "auxiliary_loss_mlp": 0.01279583, + "balance_loss_clip": 0.06307275, + "balance_loss_mlp": 0.01260248, + "epoch": 0.21698481887870133, + "flos": 25126065198720.0, + "grad_norm": 2.052249511327834, + "language_loss": 0.74638152, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.82467568, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.19335938, + "step": 3609, + "time_per_iteration": 2.5834193229675293 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 0.06301089, + "balance_loss_mlp": 0.01258221, + "epoch": 0.2170449421313693, + "flos": 17900461664640.0, + "grad_norm": 2.066668805909691, + "language_loss": 0.8888129, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96701467, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21374512, + "step": 3610, + "time_per_iteration": 3.978980302810669 + }, + { + "auxiliary_loss_clip": 0.06540407, + "auxiliary_loss_mlp": 0.01284961, + "balance_loss_clip": 0.06302356, + "balance_loss_mlp": 0.01264457, + "epoch": 0.2171050653840373, + "flos": 22645121184000.0, + "grad_norm": 2.4524698956279978, + "language_loss": 0.78034103, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.85859472, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20507812, + "step": 3611, + "time_per_iteration": 2.537783622741699 + }, + { + "auxiliary_loss_clip": 0.06539893, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06301216, + "balance_loss_mlp": 0.0125619, + "epoch": 0.21716518863670525, + "flos": 19506034874880.0, + "grad_norm": 1.9372172398113192, + "language_loss": 0.63866782, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71684164, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21313477, + "step": 3612, + "time_per_iteration": 2.5200283527374268 + }, + { + "auxiliary_loss_clip": 0.06543254, + "auxiliary_loss_mlp": 0.01280194, + "balance_loss_clip": 0.06303414, + "balance_loss_mlp": 0.01259761, + "epoch": 0.21722531188937322, + "flos": 19798132608000.0, + "grad_norm": 1.7866878621114652, + "language_loss": 0.76463711, + "learning_rate": 3.643419353014776e-06, + "loss": 0.84287155, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2043457, + "step": 3613, + "time_per_iteration": 2.536395311355591 + }, + { + "auxiliary_loss_clip": 0.06540725, + "auxiliary_loss_mlp": 0.01277778, + "balance_loss_clip": 0.06303174, + "balance_loss_mlp": 0.01256165, + "epoch": 0.21728543514204118, + "flos": 13339474295040.0, + "grad_norm": 1.8023674067133515, + "language_loss": 0.72213733, + "learning_rate": 3.643197365185261e-06, + "loss": 0.80032235, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21582031, + "step": 3614, + "time_per_iteration": 2.5000360012054443 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 0.06304483, + "balance_loss_mlp": 0.01256973, + "epoch": 0.21734555839470915, + "flos": 15237312946560.0, + "grad_norm": 2.7303590898197463, + "language_loss": 0.73928845, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81749594, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.20849609, + "step": 3615, + "time_per_iteration": 3.924616813659668 + }, + { + "auxiliary_loss_clip": 0.0654763, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_clip": 0.06301322, + "balance_loss_mlp": 0.0125694, + "epoch": 0.2174056816473771, + "flos": 19980043822080.0, + "grad_norm": 2.1391350951981467, + "language_loss": 0.913239, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.99150848, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22387695, + "step": 3616, + "time_per_iteration": 3.9379403591156006 + }, + { + "auxiliary_loss_clip": 0.06540038, + "auxiliary_loss_mlp": 0.01284656, + "balance_loss_clip": 0.06298746, + "balance_loss_mlp": 0.01263163, + "epoch": 0.21746580490004508, + "flos": 16692309169920.0, + "grad_norm": 2.057861674488091, + "language_loss": 0.81572813, + "learning_rate": 3.642531027869148e-06, + "loss": 0.89397502, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21496582, + "step": 3617, + "time_per_iteration": 2.5517330169677734 + }, + { + "auxiliary_loss_clip": 0.06543958, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06300673, + "balance_loss_mlp": 0.01258881, + "epoch": 0.21752592815271307, + "flos": 25778840832000.0, + "grad_norm": 1.7475820668036919, + "language_loss": 0.76030993, + "learning_rate": 3.642308790849329e-06, + "loss": 0.83855915, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2208252, + "step": 3618, + "time_per_iteration": 2.5874650478363037 + }, + { + "auxiliary_loss_clip": 0.06542084, + "auxiliary_loss_mlp": 0.01277743, + "balance_loss_clip": 0.06299525, + "balance_loss_mlp": 0.01255928, + "epoch": 0.21758605140538104, + "flos": 11259430940160.0, + "grad_norm": 1.9309868599682727, + "language_loss": 0.69592559, + "learning_rate": 3.642086491552996e-06, + "loss": 0.77412391, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21826172, + "step": 3619, + "time_per_iteration": 2.5259079933166504 + }, + { + "auxiliary_loss_clip": 0.06549741, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06307657, + "balance_loss_mlp": 0.01264906, + "epoch": 0.217646174658049, + "flos": 19248290115840.0, + "grad_norm": 1.6696593228851853, + "language_loss": 0.78744078, + "learning_rate": 3.641864129988579e-06, + "loss": 0.86581242, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22521973, + "step": 3620, + "time_per_iteration": 2.5225844383239746 + }, + { + "auxiliary_loss_clip": 0.06542689, + "auxiliary_loss_mlp": 0.01283495, + "balance_loss_clip": 0.06306273, + "balance_loss_mlp": 0.01263349, + "epoch": 0.21770629791071697, + "flos": 21951619666560.0, + "grad_norm": 1.6751510482296663, + "language_loss": 0.80184436, + "learning_rate": 3.641641706164509e-06, + "loss": 0.88010621, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20141602, + "step": 3621, + "time_per_iteration": 2.5528457164764404 + }, + { + "auxiliary_loss_clip": 0.0654473, + "auxiliary_loss_mlp": 0.01278712, + "balance_loss_clip": 0.06305254, + "balance_loss_mlp": 0.012594, + "epoch": 0.21776642116338493, + "flos": 24943776641280.0, + "grad_norm": 1.5217586163816694, + "language_loss": 0.87951142, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95774585, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.19299316, + "step": 3622, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.06559718, + "auxiliary_loss_mlp": 0.01277107, + "balance_loss_clip": 0.06313318, + "balance_loss_mlp": 0.01254445, + "epoch": 0.2178265444160529, + "flos": 17827017960960.0, + "grad_norm": 3.34018590012949, + "language_loss": 0.77879506, + "learning_rate": 3.641196671771152e-06, + "loss": 0.85716331, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22668457, + "step": 3623, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.0655373, + "auxiliary_loss_mlp": 0.01283267, + "balance_loss_clip": 0.06310436, + "balance_loss_mlp": 0.0126132, + "epoch": 0.2178866676687209, + "flos": 17718760085760.0, + "grad_norm": 2.118806527220675, + "language_loss": 0.85078007, + "learning_rate": 3.640974061218741e-06, + "loss": 0.92914999, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21936035, + "step": 3624, + "time_per_iteration": 2.4991443157196045 + }, + { + "auxiliary_loss_clip": 0.06544428, + "auxiliary_loss_mlp": 0.01281962, + "balance_loss_clip": 0.06301346, + "balance_loss_mlp": 0.01259014, + "epoch": 0.21794679092138886, + "flos": 16951437521280.0, + "grad_norm": 2.3785715622769357, + "language_loss": 0.7814458, + "learning_rate": 3.640751388440429e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22961426, + "step": 3625, + "time_per_iteration": 2.5113301277160645 + }, + { + "auxiliary_loss_clip": 0.06435797, + "auxiliary_loss_mlp": 0.01281105, + "balance_loss_clip": 0.0630773, + "balance_loss_mlp": 0.01275631, + "epoch": 0.21800691417405682, + "flos": 63737737413120.0, + "grad_norm": 0.7732492376258139, + "language_loss": 0.60674119, + "learning_rate": 3.64052865344466e-06, + "loss": 0.68391013, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.05477905, + "step": 3626, + "time_per_iteration": 3.230576992034912 + }, + { + "auxiliary_loss_clip": 0.06551459, + "auxiliary_loss_mlp": 0.01275255, + "balance_loss_clip": 0.06306285, + "balance_loss_mlp": 0.01252271, + "epoch": 0.21806703742672479, + "flos": 21622821045120.0, + "grad_norm": 2.0426080259896664, + "language_loss": 0.91217983, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.99044704, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22961426, + "step": 3627, + "time_per_iteration": 2.571704149246216 + }, + { + "auxiliary_loss_clip": 0.06549745, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06307864, + "balance_loss_mlp": 0.01257313, + "epoch": 0.21812716067939275, + "flos": 19361034184320.0, + "grad_norm": 1.8240036323551578, + "language_loss": 0.74830574, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82659948, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.2232666, + "step": 3628, + "time_per_iteration": 2.5547990798950195 + }, + { + "auxiliary_loss_clip": 0.06543273, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06304347, + "balance_loss_mlp": 0.01257039, + "epoch": 0.21818728393206072, + "flos": 23554467619200.0, + "grad_norm": 1.7805187473711719, + "language_loss": 0.77940357, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.85763204, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2253418, + "step": 3629, + "time_per_iteration": 2.5777294635772705 + }, + { + "auxiliary_loss_clip": 0.06540327, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.06302765, + "balance_loss_mlp": 0.01257822, + "epoch": 0.21824740718472868, + "flos": 30233289335040.0, + "grad_norm": 1.6105707802077895, + "language_loss": 0.72294879, + "learning_rate": 3.63963709145597e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20703125, + "step": 3630, + "time_per_iteration": 2.6015560626983643 + }, + { + "auxiliary_loss_clip": 0.06535304, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06303381, + "balance_loss_mlp": 0.01259364, + "epoch": 0.21830753043739667, + "flos": 26140860397440.0, + "grad_norm": 1.9295675894773927, + "language_loss": 0.77031553, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8484655, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.203125, + "step": 3631, + "time_per_iteration": 2.5712599754333496 + }, + { + "auxiliary_loss_clip": 0.06546577, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 0.06304416, + "balance_loss_mlp": 0.01265274, + "epoch": 0.21836765369006464, + "flos": 21726299237760.0, + "grad_norm": 24.58992261392957, + "language_loss": 0.76358086, + "learning_rate": 3.639190937376594e-06, + "loss": 0.84191024, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21081543, + "step": 3632, + "time_per_iteration": 2.5312108993530273 + }, + { + "auxiliary_loss_clip": 0.06541382, + "auxiliary_loss_mlp": 0.01277975, + "balance_loss_clip": 0.06306228, + "balance_loss_mlp": 0.01258008, + "epoch": 0.2184277769427326, + "flos": 19943678350080.0, + "grad_norm": 2.014902514553352, + "language_loss": 0.8455261, + "learning_rate": 3.638967767095249e-06, + "loss": 0.9237197, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19958496, + "step": 3633, + "time_per_iteration": 2.5392541885375977 + }, + { + "auxiliary_loss_clip": 0.06536385, + "auxiliary_loss_mlp": 0.01279679, + "balance_loss_clip": 0.06300621, + "balance_loss_mlp": 0.0125821, + "epoch": 0.21848790019540057, + "flos": 20346591507840.0, + "grad_norm": 2.269088705731375, + "language_loss": 0.82069844, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.89885902, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.21484375, + "step": 3634, + "time_per_iteration": 2.5536303520202637 + }, + { + "auxiliary_loss_clip": 0.06544928, + "auxiliary_loss_mlp": 0.01275115, + "balance_loss_clip": 0.063034, + "balance_loss_mlp": 0.01254063, + "epoch": 0.21854802344806853, + "flos": 15456302392320.0, + "grad_norm": 2.1744892406337133, + "language_loss": 0.75276726, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83096772, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21044922, + "step": 3635, + "time_per_iteration": 2.5158851146698 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 0.06301719, + "balance_loss_mlp": 0.01259018, + "epoch": 0.2186081467007365, + "flos": 16325384140800.0, + "grad_norm": 1.9753193728837781, + "language_loss": 0.88470638, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.96285218, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19836426, + "step": 3636, + "time_per_iteration": 2.5056772232055664 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01285934, + "balance_loss_clip": 0.06300446, + "balance_loss_mlp": 0.01264798, + "epoch": 0.2186682699534045, + "flos": 21695677770240.0, + "grad_norm": 1.933426681732421, + "language_loss": 0.76219505, + "learning_rate": 3.638074464556311e-06, + "loss": 0.84042412, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21118164, + "step": 3637, + "time_per_iteration": 2.5159406661987305 + }, + { + "auxiliary_loss_clip": 0.06547473, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.06303671, + "balance_loss_mlp": 0.0125726, + "epoch": 0.21872839320607246, + "flos": 17743427913600.0, + "grad_norm": 3.0066644559057867, + "language_loss": 0.90341294, + "learning_rate": 3.63785098361053e-06, + "loss": 0.98168921, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22888184, + "step": 3638, + "time_per_iteration": 2.475271224975586 + }, + { + "auxiliary_loss_clip": 0.06535378, + "auxiliary_loss_mlp": 0.01286586, + "balance_loss_clip": 0.06297417, + "balance_loss_mlp": 0.01264318, + "epoch": 0.21878851645874042, + "flos": 18656757417600.0, + "grad_norm": 3.417327747399998, + "language_loss": 0.90034223, + "learning_rate": 3.637627440557275e-06, + "loss": 0.97856188, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22265625, + "step": 3639, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.06531254, + "auxiliary_loss_mlp": 0.01281993, + "balance_loss_clip": 0.06296734, + "balance_loss_mlp": 0.01262264, + "epoch": 0.2188486397114084, + "flos": 25564463360640.0, + "grad_norm": 1.6695470201966474, + "language_loss": 0.7997371, + "learning_rate": 3.637403835405024e-06, + "loss": 0.87786961, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.19726562, + "step": 3640, + "time_per_iteration": 2.5905494689941406 + }, + { + "auxiliary_loss_clip": 0.06541579, + "auxiliary_loss_mlp": 0.01284166, + "balance_loss_clip": 0.06302525, + "balance_loss_mlp": 0.01260074, + "epoch": 0.21890876296407635, + "flos": 17897400990720.0, + "grad_norm": 8.732271245188107, + "language_loss": 0.72940969, + "learning_rate": 3.637180168162255e-06, + "loss": 0.80766714, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.24084473, + "step": 3641, + "time_per_iteration": 2.5452075004577637 + }, + { + "auxiliary_loss_clip": 0.06541288, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.06304857, + "balance_loss_mlp": 0.01259619, + "epoch": 0.21896888621674432, + "flos": 17754915922560.0, + "grad_norm": 1.8801395061290727, + "language_loss": 0.81693721, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89515489, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20874023, + "step": 3642, + "time_per_iteration": 2.5234179496765137 + }, + { + "auxiliary_loss_clip": 0.06550857, + "auxiliary_loss_mlp": 0.01284985, + "balance_loss_clip": 0.06311135, + "balance_loss_mlp": 0.01262204, + "epoch": 0.21902900946941228, + "flos": 23082890440320.0, + "grad_norm": 1.5963488152753738, + "language_loss": 0.71952182, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.79788017, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2277832, + "step": 3643, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.06535246, + "auxiliary_loss_mlp": 0.01285725, + "balance_loss_clip": 0.06298445, + "balance_loss_mlp": 0.01264506, + "epoch": 0.21908913272208028, + "flos": 48189501492480.0, + "grad_norm": 1.9271022520918928, + "language_loss": 0.69055694, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.76876664, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.21228027, + "step": 3644, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.06548485, + "auxiliary_loss_mlp": 0.01283418, + "balance_loss_clip": 0.06302129, + "balance_loss_mlp": 0.01261531, + "epoch": 0.21914925597474824, + "flos": 22243298129280.0, + "grad_norm": 2.4423330778710937, + "language_loss": 0.78728521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.86560422, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21911621, + "step": 3645, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06531754, + "auxiliary_loss_mlp": 0.01275201, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01254936, + "epoch": 0.2192093792274162, + "flos": 22131853799040.0, + "grad_norm": 1.5020846701532837, + "language_loss": 0.82847381, + "learning_rate": 3.636060900887582e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20263672, + "step": 3646, + "time_per_iteration": 2.569216012954712 + }, + { + "auxiliary_loss_clip": 0.06536786, + "auxiliary_loss_mlp": 0.01283667, + "balance_loss_clip": 0.06302559, + "balance_loss_mlp": 0.01263449, + "epoch": 0.21926950248008417, + "flos": 15674914494720.0, + "grad_norm": 1.6949719683005162, + "language_loss": 0.83080441, + "learning_rate": 3.635836861279901e-06, + "loss": 0.90900892, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20227051, + "step": 3647, + "time_per_iteration": 3.9349160194396973 + }, + { + "auxiliary_loss_clip": 0.06534994, + "auxiliary_loss_mlp": 0.01281644, + "balance_loss_clip": 0.06301765, + "balance_loss_mlp": 0.01261105, + "epoch": 0.21932962573275214, + "flos": 30270199858560.0, + "grad_norm": 1.587891801710132, + "language_loss": 0.7257458, + "learning_rate": 3.635612759641123e-06, + "loss": 0.80391216, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20532227, + "step": 3648, + "time_per_iteration": 2.6465656757354736 + }, + { + "auxiliary_loss_clip": 0.06545104, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 0.06304809, + "balance_loss_mlp": 0.01263434, + "epoch": 0.2193897489854201, + "flos": 10784751160320.0, + "grad_norm": 3.088861131276654, + "language_loss": 0.74724281, + "learning_rate": 3.635388595979745e-06, + "loss": 0.8255477, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21960449, + "step": 3649, + "time_per_iteration": 2.510040283203125 + }, + { + "auxiliary_loss_clip": 0.06531087, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 0.06299826, + "balance_loss_mlp": 0.01274752, + "epoch": 0.21944987223808807, + "flos": 19138984064640.0, + "grad_norm": 4.303407628828735, + "language_loss": 0.86915123, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94741207, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20251465, + "step": 3650, + "time_per_iteration": 3.93752384185791 + }, + { + "auxiliary_loss_clip": 0.06543732, + "auxiliary_loss_mlp": 0.01294843, + "balance_loss_clip": 0.06307691, + "balance_loss_mlp": 0.01273422, + "epoch": 0.21950999549075606, + "flos": 22717726346880.0, + "grad_norm": 2.457938069648898, + "language_loss": 0.8456791, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.92406487, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2142334, + "step": 3651, + "time_per_iteration": 2.7058322429656982 + }, + { + "auxiliary_loss_clip": 0.06539044, + "auxiliary_loss_mlp": 0.01290725, + "balance_loss_clip": 0.06304742, + "balance_loss_mlp": 0.01270257, + "epoch": 0.21957011874342403, + "flos": 10565929422720.0, + "grad_norm": 1.8310150193660448, + "language_loss": 0.74885792, + "learning_rate": 3.634715732945027e-06, + "loss": 0.82715559, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20458984, + "step": 3652, + "time_per_iteration": 2.512620210647583 + }, + { + "auxiliary_loss_clip": 0.06458014, + "auxiliary_loss_mlp": 0.01487979, + "balance_loss_clip": 0.06335165, + "balance_loss_mlp": 0.01477775, + "epoch": 0.219630241996092, + "flos": 65765105677440.0, + "grad_norm": 0.8085744951241601, + "language_loss": 0.51588702, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.59534693, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.10205078, + "step": 3653, + "time_per_iteration": 3.156705617904663 + }, + { + "auxiliary_loss_clip": 0.06532414, + "auxiliary_loss_mlp": 0.01292976, + "balance_loss_clip": 0.06300488, + "balance_loss_mlp": 0.01271685, + "epoch": 0.21969036524875996, + "flos": 23703367524480.0, + "grad_norm": 2.2498105533123467, + "language_loss": 0.7598449, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.83809876, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21289062, + "step": 3654, + "time_per_iteration": 2.5549349784851074 + }, + { + "auxiliary_loss_clip": 0.06539033, + "auxiliary_loss_mlp": 0.01287688, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265277, + "epoch": 0.21975048850142792, + "flos": 19646130101760.0, + "grad_norm": 1.856190016757107, + "language_loss": 0.72937429, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80764157, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.22412109, + "step": 3655, + "time_per_iteration": 5.397899866104126 + }, + { + "auxiliary_loss_clip": 0.06537225, + "auxiliary_loss_mlp": 0.01285968, + "balance_loss_clip": 0.06301227, + "balance_loss_mlp": 0.01265667, + "epoch": 0.21981061175409589, + "flos": 22453944094080.0, + "grad_norm": 1.6446350088012902, + "language_loss": 0.81351042, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.89174235, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20300293, + "step": 3656, + "time_per_iteration": 2.53308367729187 + }, + { + "auxiliary_loss_clip": 0.06536204, + "auxiliary_loss_mlp": 0.01286139, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01265027, + "epoch": 0.21987073500676388, + "flos": 18157032466560.0, + "grad_norm": 2.081609460517537, + "language_loss": 0.86280632, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94102979, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21105957, + "step": 3657, + "time_per_iteration": 2.5165464878082275 + }, + { + "auxiliary_loss_clip": 0.06534712, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 0.0629717, + "balance_loss_mlp": 0.01263439, + "epoch": 0.21993085825943184, + "flos": 25632666184320.0, + "grad_norm": 1.606816904846988, + "language_loss": 0.80728716, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.88547069, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20202637, + "step": 3658, + "time_per_iteration": 2.5528533458709717 + }, + { + "auxiliary_loss_clip": 0.06407537, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.0628604, + "balance_loss_mlp": 0.01250839, + "epoch": 0.2199909815120998, + "flos": 70946429621760.0, + "grad_norm": 0.7593962827668586, + "language_loss": 0.58126092, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06103516, + "step": 3659, + "time_per_iteration": 3.237276077270508 + }, + { + "auxiliary_loss_clip": 0.06524363, + "auxiliary_loss_mlp": 0.01284023, + "balance_loss_clip": 0.06293888, + "balance_loss_mlp": 0.01264091, + "epoch": 0.22005110476476777, + "flos": 21549964320000.0, + "grad_norm": 2.05919214646248, + "language_loss": 0.75117528, + "learning_rate": 3.632918704645772e-06, + "loss": 0.82925916, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19946289, + "step": 3660, + "time_per_iteration": 2.5259556770324707 + }, + { + "auxiliary_loss_clip": 0.06528022, + "auxiliary_loss_mlp": 0.01287991, + "balance_loss_clip": 0.06292684, + "balance_loss_mlp": 0.01267976, + "epoch": 0.22011122801743574, + "flos": 22061051498880.0, + "grad_norm": 2.4805712407940645, + "language_loss": 0.81579179, + "learning_rate": 3.632693797376893e-06, + "loss": 0.89395189, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.20019531, + "step": 3661, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.06527096, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.06295218, + "balance_loss_mlp": 0.01264039, + "epoch": 0.2201713512701037, + "flos": 26694811739520.0, + "grad_norm": 2.4209612671003993, + "language_loss": 0.73935246, + "learning_rate": 3.632468828196102e-06, + "loss": 0.81745958, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.19567871, + "step": 3662, + "time_per_iteration": 2.594336986541748 + }, + { + "auxiliary_loss_clip": 0.06524752, + "auxiliary_loss_mlp": 0.01286026, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01266976, + "epoch": 0.22023147452277167, + "flos": 22168470833280.0, + "grad_norm": 1.5979135918213576, + "language_loss": 0.79490995, + "learning_rate": 3.632243797111929e-06, + "loss": 0.87301779, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19042969, + "step": 3663, + "time_per_iteration": 2.6437172889709473 + }, + { + "auxiliary_loss_clip": 0.06536885, + "auxiliary_loss_mlp": 0.01285417, + "balance_loss_clip": 0.06298422, + "balance_loss_mlp": 0.01264627, + "epoch": 0.22029159777543966, + "flos": 22528981025280.0, + "grad_norm": 1.9228872111745317, + "language_loss": 0.81154871, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8897717, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20800781, + "step": 3664, + "time_per_iteration": 2.551218271255493 + }, + { + "auxiliary_loss_clip": 0.06543128, + "auxiliary_loss_mlp": 0.01279618, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257457, + "epoch": 0.22035172102810763, + "flos": 13047502343040.0, + "grad_norm": 2.388837963421245, + "language_loss": 0.77563322, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.22167969, + "step": 3665, + "time_per_iteration": 2.5317838191986084 + }, + { + "auxiliary_loss_clip": 0.06533245, + "auxiliary_loss_mlp": 0.0128412, + "balance_loss_clip": 0.06298798, + "balance_loss_mlp": 0.01263616, + "epoch": 0.2204118442807756, + "flos": 12170538311040.0, + "grad_norm": 5.328131395204355, + "language_loss": 0.98459631, + "learning_rate": 3.631568332524466e-06, + "loss": 1.06277001, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.20507812, + "step": 3666, + "time_per_iteration": 2.500293254852295 + }, + { + "auxiliary_loss_clip": 0.06531642, + "auxiliary_loss_mlp": 0.01281342, + "balance_loss_clip": 0.06297208, + "balance_loss_mlp": 0.01260767, + "epoch": 0.22047196753344356, + "flos": 40117345758720.0, + "grad_norm": 2.0087807452217143, + "language_loss": 0.81544572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20568848, + "step": 3667, + "time_per_iteration": 2.7539899349212646 + }, + { + "auxiliary_loss_clip": 0.06542197, + "auxiliary_loss_mlp": 0.0128155, + "balance_loss_clip": 0.06300189, + "balance_loss_mlp": 0.01258363, + "epoch": 0.22053209078611152, + "flos": 20706892064640.0, + "grad_norm": 2.631241235852179, + "language_loss": 0.77648765, + "learning_rate": 3.631117713439087e-06, + "loss": 0.85472512, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.23168945, + "step": 3668, + "time_per_iteration": 2.524740695953369 + }, + { + "auxiliary_loss_clip": 0.06534266, + "auxiliary_loss_mlp": 0.01279226, + "balance_loss_clip": 0.06300663, + "balance_loss_mlp": 0.01258758, + "epoch": 0.2205922140387795, + "flos": 24723026259840.0, + "grad_norm": 2.1996350177899386, + "language_loss": 0.72024125, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7983762, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.20471191, + "step": 3669, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.06540591, + "auxiliary_loss_mlp": 0.01281842, + "balance_loss_clip": 0.06304247, + "balance_loss_mlp": 0.01261398, + "epoch": 0.22065233729144745, + "flos": 23484000735360.0, + "grad_norm": 1.708018932230371, + "language_loss": 0.85830641, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.93653071, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20422363, + "step": 3670, + "time_per_iteration": 2.6102726459503174 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01279884, + "balance_loss_clip": 0.06300244, + "balance_loss_mlp": 0.01259678, + "epoch": 0.22071246054411545, + "flos": 35234268094080.0, + "grad_norm": 1.8596418583208814, + "language_loss": 0.77398729, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85218084, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20202637, + "step": 3671, + "time_per_iteration": 2.6463472843170166 + }, + { + "auxiliary_loss_clip": 0.06536315, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06302021, + "balance_loss_mlp": 0.01256934, + "epoch": 0.2207725837967834, + "flos": 18156151998720.0, + "grad_norm": 3.3605951725525807, + "language_loss": 0.81071377, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.88883519, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18896484, + "step": 3672, + "time_per_iteration": 2.522409200668335 + }, + { + "auxiliary_loss_clip": 0.06541845, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.01262086, + "epoch": 0.22083270704945138, + "flos": 20484967726080.0, + "grad_norm": 2.0276751679318905, + "language_loss": 0.74039209, + "learning_rate": 3.629990083462682e-06, + "loss": 0.8186394, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20800781, + "step": 3673, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.06537451, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258154, + "epoch": 0.22089283030211934, + "flos": 34133451079680.0, + "grad_norm": 2.1113123853963223, + "language_loss": 0.77576697, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.85393184, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2088623, + "step": 3674, + "time_per_iteration": 2.6212525367736816 + }, + { + "auxiliary_loss_clip": 0.06539989, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06303889, + "balance_loss_mlp": 0.01255349, + "epoch": 0.2209529535547873, + "flos": 18083043711360.0, + "grad_norm": 2.9913121905850213, + "language_loss": 0.7632584, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.84143209, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22033691, + "step": 3675, + "time_per_iteration": 2.529346466064453 + }, + { + "auxiliary_loss_clip": 0.06540923, + "auxiliary_loss_mlp": 0.01279311, + "balance_loss_clip": 0.06303286, + "balance_loss_mlp": 0.01258592, + "epoch": 0.22101307680745527, + "flos": 27242725587840.0, + "grad_norm": 1.8493496269427605, + "language_loss": 0.8074736, + "learning_rate": 3.629312763695772e-06, + "loss": 0.88567591, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20727539, + "step": 3676, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.06539683, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06299066, + "balance_loss_mlp": 0.01260637, + "epoch": 0.22107320006012326, + "flos": 16548566290560.0, + "grad_norm": 2.695197102889201, + "language_loss": 0.76204234, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.84025168, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2064209, + "step": 3677, + "time_per_iteration": 2.5165653228759766 + }, + { + "auxiliary_loss_clip": 0.0653778, + "auxiliary_loss_mlp": 0.01277642, + "balance_loss_clip": 0.06301221, + "balance_loss_mlp": 0.01257889, + "epoch": 0.22113332331279123, + "flos": 22061009571840.0, + "grad_norm": 1.9269573452829223, + "language_loss": 0.84673274, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92488694, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.19750977, + "step": 3678, + "time_per_iteration": 2.5460638999938965 + }, + { + "auxiliary_loss_clip": 0.06537814, + "auxiliary_loss_mlp": 0.01282989, + "balance_loss_clip": 0.06304095, + "balance_loss_mlp": 0.01262354, + "epoch": 0.2211934465654592, + "flos": 26619690954240.0, + "grad_norm": 2.1729831488916327, + "language_loss": 0.89362311, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.9718312, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20629883, + "step": 3679, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.06542142, + "auxiliary_loss_mlp": 0.01291632, + "balance_loss_clip": 0.06301068, + "balance_loss_mlp": 0.01269817, + "epoch": 0.22125356981812716, + "flos": 16365564973440.0, + "grad_norm": 3.197923457760992, + "language_loss": 0.87311327, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.95145106, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21801758, + "step": 3680, + "time_per_iteration": 2.507798433303833 + }, + { + "auxiliary_loss_clip": 0.06534758, + "auxiliary_loss_mlp": 0.01279239, + "balance_loss_clip": 0.06302372, + "balance_loss_mlp": 0.01258211, + "epoch": 0.22131369307079513, + "flos": 21657257873280.0, + "grad_norm": 1.8058433539562604, + "language_loss": 0.81643963, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89457959, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.21032715, + "step": 3681, + "time_per_iteration": 2.536559820175171 + }, + { + "auxiliary_loss_clip": 0.06530598, + "auxiliary_loss_mlp": 0.01283453, + "balance_loss_clip": 0.06302136, + "balance_loss_mlp": 0.01264344, + "epoch": 0.2213738163234631, + "flos": 19615592488320.0, + "grad_norm": 3.0843961282743138, + "language_loss": 0.80613208, + "learning_rate": 3.62795645623335e-06, + "loss": 0.88427258, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.19116211, + "step": 3682, + "time_per_iteration": 2.5523715019226074 + }, + { + "auxiliary_loss_clip": 0.06540116, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06302039, + "balance_loss_mlp": 0.01261933, + "epoch": 0.22143393957613106, + "flos": 23630217310080.0, + "grad_norm": 1.560467578099588, + "language_loss": 0.78323001, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86147785, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22729492, + "step": 3683, + "time_per_iteration": 2.563915491104126 + }, + { + "auxiliary_loss_clip": 0.06546305, + "auxiliary_loss_mlp": 0.01292128, + "balance_loss_clip": 0.06304266, + "balance_loss_mlp": 0.01270801, + "epoch": 0.22149406282879905, + "flos": 26185108152960.0, + "grad_norm": 2.3659446396904276, + "language_loss": 0.73827177, + "learning_rate": 3.627503859796234e-06, + "loss": 0.81665611, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21337891, + "step": 3684, + "time_per_iteration": 2.5829403400421143 + }, + { + "auxiliary_loss_clip": 0.06539842, + "auxiliary_loss_mlp": 0.01288295, + "balance_loss_clip": 0.06303138, + "balance_loss_mlp": 0.01266396, + "epoch": 0.221554186081467, + "flos": 14544104918400.0, + "grad_norm": 1.9346272357304948, + "language_loss": 0.81055164, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.88883299, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21899414, + "step": 3685, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.06531791, + "auxiliary_loss_mlp": 0.0128599, + "balance_loss_clip": 0.06302623, + "balance_loss_mlp": 0.01266607, + "epoch": 0.22161430933413498, + "flos": 22245059064960.0, + "grad_norm": 1.5947500054188823, + "language_loss": 0.87523818, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.95341599, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 3686, + "time_per_iteration": 4.0018064975738525 + }, + { + "auxiliary_loss_clip": 0.06530964, + "auxiliary_loss_mlp": 0.01294037, + "balance_loss_clip": 0.06297237, + "balance_loss_mlp": 0.01272198, + "epoch": 0.22167443258680294, + "flos": 23483162194560.0, + "grad_norm": 2.0272053301197186, + "language_loss": 0.78420949, + "learning_rate": 3.626824502298707e-06, + "loss": 0.86245942, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21826172, + "step": 3687, + "time_per_iteration": 2.543321132659912 + }, + { + "auxiliary_loss_clip": 0.06551681, + "auxiliary_loss_mlp": 0.01283958, + "balance_loss_clip": 0.0630649, + "balance_loss_mlp": 0.01261177, + "epoch": 0.2217345558394709, + "flos": 23227723422720.0, + "grad_norm": 1.7957197826329643, + "language_loss": 0.85492283, + "learning_rate": 3.626597926409383e-06, + "loss": 0.93327922, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.2277832, + "step": 3688, + "time_per_iteration": 2.5456702709198 + }, + { + "auxiliary_loss_clip": 0.06557921, + "auxiliary_loss_mlp": 0.01283081, + "balance_loss_clip": 0.0631456, + "balance_loss_mlp": 0.01260812, + "epoch": 0.22179467909213887, + "flos": 20017247834880.0, + "grad_norm": 1.8193279444648072, + "language_loss": 0.81821239, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89662236, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.22265625, + "step": 3689, + "time_per_iteration": 4.073091506958008 + }, + { + "auxiliary_loss_clip": 0.06540284, + "auxiliary_loss_mlp": 0.0128456, + "balance_loss_clip": 0.06304172, + "balance_loss_mlp": 0.01263269, + "epoch": 0.22185480234480687, + "flos": 19689203900160.0, + "grad_norm": 2.302195520769192, + "language_loss": 0.70934272, + "learning_rate": 3.626144589597061e-06, + "loss": 0.7875911, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2130127, + "step": 3690, + "time_per_iteration": 2.5177161693573 + }, + { + "auxiliary_loss_clip": 0.06548303, + "auxiliary_loss_mlp": 0.01286756, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01264416, + "epoch": 0.22191492559747483, + "flos": 21987817430400.0, + "grad_norm": 2.3084892961245576, + "language_loss": 0.7285862, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.80693686, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.22338867, + "step": 3691, + "time_per_iteration": 2.545271873474121 + }, + { + "auxiliary_loss_clip": 0.0654895, + "auxiliary_loss_mlp": 0.01283693, + "balance_loss_clip": 0.06313456, + "balance_loss_mlp": 0.01261771, + "epoch": 0.2219750488501428, + "flos": 23228813525760.0, + "grad_norm": 2.0680633952732195, + "language_loss": 0.71962094, + "learning_rate": 3.625691006130477e-06, + "loss": 0.79794735, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21936035, + "step": 3692, + "time_per_iteration": 2.543306350708008 + }, + { + "auxiliary_loss_clip": 0.06558576, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_clip": 0.06317012, + "balance_loss_mlp": 0.01258394, + "epoch": 0.22203517210281076, + "flos": 22459939660800.0, + "grad_norm": 1.9780142392305156, + "language_loss": 0.87528688, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95367974, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.22338867, + "step": 3693, + "time_per_iteration": 2.571045398712158 + }, + { + "auxiliary_loss_clip": 0.06534213, + "auxiliary_loss_mlp": 0.01282043, + "balance_loss_clip": 0.06303744, + "balance_loss_mlp": 0.01261122, + "epoch": 0.22209529535547873, + "flos": 17569985961600.0, + "grad_norm": 2.4004359049860824, + "language_loss": 0.86418116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.94234371, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20922852, + "step": 3694, + "time_per_iteration": 4.03299617767334 + }, + { + "auxiliary_loss_clip": 0.06554222, + "auxiliary_loss_mlp": 0.0127962, + "balance_loss_clip": 0.06307386, + "balance_loss_mlp": 0.0125815, + "epoch": 0.2221554186081467, + "flos": 21475178951040.0, + "grad_norm": 1.7692850214061204, + "language_loss": 0.69924927, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.77758765, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21472168, + "step": 3695, + "time_per_iteration": 3.989173412322998 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01283487, + "balance_loss_clip": 0.0630603, + "balance_loss_mlp": 0.01262781, + "epoch": 0.22221554186081466, + "flos": 27680956041600.0, + "grad_norm": 1.7088419756312998, + "language_loss": 0.72215462, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.80035925, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20715332, + "step": 3696, + "time_per_iteration": 2.6339590549468994 + }, + { + "auxiliary_loss_clip": 0.06543445, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06307454, + "balance_loss_mlp": 0.01257825, + "epoch": 0.22227566511348265, + "flos": 25966202561280.0, + "grad_norm": 1.8417969407055101, + "language_loss": 0.88068652, + "learning_rate": 3.624555968803217e-06, + "loss": 0.95891678, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21740723, + "step": 3697, + "time_per_iteration": 2.5599191188812256 + }, + { + "auxiliary_loss_clip": 0.06533489, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01265042, + "epoch": 0.22233578836615062, + "flos": 39213240203520.0, + "grad_norm": 2.5935528152985867, + "language_loss": 0.6687606, + "learning_rate": 3.624328776493346e-06, + "loss": 0.74694455, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.1986084, + "step": 3698, + "time_per_iteration": 2.812140703201294 + }, + { + "auxiliary_loss_clip": 0.06546268, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 0.06307642, + "balance_loss_mlp": 0.01260216, + "epoch": 0.22239591161881858, + "flos": 36292682142720.0, + "grad_norm": 1.853195446284453, + "language_loss": 0.82990527, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.90819019, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22009277, + "step": 3699, + "time_per_iteration": 2.667423725128174 + }, + { + "auxiliary_loss_clip": 0.06537004, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 0.06302205, + "balance_loss_mlp": 0.01260014, + "epoch": 0.22245603487148655, + "flos": 19725779007360.0, + "grad_norm": 1.45021308141165, + "language_loss": 0.80335897, + "learning_rate": 3.62387420709809e-06, + "loss": 0.88154227, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21313477, + "step": 3700, + "time_per_iteration": 2.5526716709136963 + }, + { + "auxiliary_loss_clip": 0.06548695, + "auxiliary_loss_mlp": 0.01279557, + "balance_loss_clip": 0.06306358, + "balance_loss_mlp": 0.01257885, + "epoch": 0.2225161581241545, + "flos": 46290950081280.0, + "grad_norm": 3.047641549556173, + "language_loss": 0.73186177, + "learning_rate": 3.623646830029943e-06, + "loss": 0.81014431, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21655273, + "step": 3701, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.06535295, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06300849, + "balance_loss_mlp": 0.01259734, + "epoch": 0.22257628137682248, + "flos": 23702990181120.0, + "grad_norm": 4.404280219854046, + "language_loss": 0.80455184, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.88270885, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20666504, + "step": 3702, + "time_per_iteration": 2.5657999515533447 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01274253, + "balance_loss_clip": 0.06297488, + "balance_loss_mlp": 0.01253331, + "epoch": 0.22263640462949044, + "flos": 19359986008320.0, + "grad_norm": 3.4101413472023405, + "language_loss": 0.78629804, + "learning_rate": 3.623191891195723e-06, + "loss": 0.86428618, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20910645, + "step": 3703, + "time_per_iteration": 2.550189971923828 + }, + { + "auxiliary_loss_clip": 0.06541737, + "auxiliary_loss_mlp": 0.01279602, + "balance_loss_clip": 0.06300878, + "balance_loss_mlp": 0.01257084, + "epoch": 0.22269652788215843, + "flos": 20782138631040.0, + "grad_norm": 2.0986231414271828, + "language_loss": 0.75210625, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.83031964, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.22509766, + "step": 3704, + "time_per_iteration": 2.5540754795074463 + }, + { + "auxiliary_loss_clip": 0.06527826, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06299336, + "balance_loss_mlp": 0.01268682, + "epoch": 0.2227566511348264, + "flos": 47969631578880.0, + "grad_norm": 1.891044771341396, + "language_loss": 0.65108556, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.72925317, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20239258, + "step": 3705, + "time_per_iteration": 2.8109097480773926 + }, + { + "auxiliary_loss_clip": 0.06438605, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.0631493, + "balance_loss_mlp": 0.012611, + "epoch": 0.22281677438749437, + "flos": 66235676607360.0, + "grad_norm": 1.322453387614222, + "language_loss": 0.65218806, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.72923827, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.05322266, + "step": 3706, + "time_per_iteration": 3.059636354446411 + }, + { + "auxiliary_loss_clip": 0.06534128, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 0.06297205, + "balance_loss_mlp": 0.01266274, + "epoch": 0.22287689764016233, + "flos": 21878050181760.0, + "grad_norm": 2.374246987916323, + "language_loss": 0.80905002, + "learning_rate": 3.622281274977141e-06, + "loss": 0.88725626, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20202637, + "step": 3707, + "time_per_iteration": 2.5891129970550537 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01280313, + "balance_loss_clip": 0.06298505, + "balance_loss_mlp": 0.01257854, + "epoch": 0.2229370208928303, + "flos": 27679824011520.0, + "grad_norm": 1.802742500055583, + "language_loss": 0.79219007, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.87031698, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2244873, + "step": 3708, + "time_per_iteration": 2.5907180309295654 + }, + { + "auxiliary_loss_clip": 0.06539932, + "auxiliary_loss_mlp": 0.01293698, + "balance_loss_clip": 0.06300655, + "balance_loss_mlp": 0.01271525, + "epoch": 0.22299714414549826, + "flos": 30162612816000.0, + "grad_norm": 1.9019649120082793, + "language_loss": 0.81583631, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.89417267, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.22167969, + "step": 3709, + "time_per_iteration": 2.658768892288208 + }, + { + "auxiliary_loss_clip": 0.06540084, + "auxiliary_loss_mlp": 0.01295766, + "balance_loss_clip": 0.0630019, + "balance_loss_mlp": 0.01274464, + "epoch": 0.22305726739816625, + "flos": 23148871130880.0, + "grad_norm": 2.9556041497723236, + "language_loss": 0.69413233, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.77249086, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 3710, + "time_per_iteration": 2.603476047515869 + }, + { + "auxiliary_loss_clip": 0.06536471, + "auxiliary_loss_mlp": 0.01286054, + "balance_loss_clip": 0.06297636, + "balance_loss_mlp": 0.01264429, + "epoch": 0.22311739065083422, + "flos": 19178116721280.0, + "grad_norm": 2.184897161331363, + "language_loss": 0.91282266, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.99104792, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.21606445, + "step": 3711, + "time_per_iteration": 2.6093854904174805 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01298084, + "balance_loss_clip": 0.06302293, + "balance_loss_mlp": 0.01275911, + "epoch": 0.22317751390350218, + "flos": 13621467611520.0, + "grad_norm": 2.3638705243519142, + "language_loss": 0.89271343, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.97108901, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.22192383, + "step": 3712, + "time_per_iteration": 2.5170199871063232 + }, + { + "auxiliary_loss_clip": 0.06530519, + "auxiliary_loss_mlp": 0.01292247, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01271481, + "epoch": 0.22323763715617015, + "flos": 11032643064960.0, + "grad_norm": 2.927785991832361, + "language_loss": 0.74880064, + "learning_rate": 3.620913505310117e-06, + "loss": 0.82702827, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2076416, + "step": 3713, + "time_per_iteration": 2.521813154220581 + }, + { + "auxiliary_loss_clip": 0.06534518, + "auxiliary_loss_mlp": 0.0130023, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.01277556, + "epoch": 0.22329776040883811, + "flos": 41360647841280.0, + "grad_norm": 2.458794372685298, + "language_loss": 0.62675929, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.70510674, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22668457, + "step": 3714, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.06529912, + "auxiliary_loss_mlp": 0.01289936, + "balance_loss_clip": 0.06295826, + "balance_loss_mlp": 0.01267906, + "epoch": 0.22335788366150608, + "flos": 25126568323200.0, + "grad_norm": 1.757427072944695, + "language_loss": 0.79499549, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.87319398, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22009277, + "step": 3715, + "time_per_iteration": 2.571711301803589 + }, + { + "auxiliary_loss_clip": 0.06527971, + "auxiliary_loss_mlp": 0.01294287, + "balance_loss_clip": 0.06293058, + "balance_loss_mlp": 0.0127302, + "epoch": 0.22341800691417404, + "flos": 16989144658560.0, + "grad_norm": 1.5961840175356918, + "language_loss": 0.77329421, + "learning_rate": 3.620228790579645e-06, + "loss": 0.85151684, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21276855, + "step": 3716, + "time_per_iteration": 2.502037286758423 + }, + { + "auxiliary_loss_clip": 0.06529684, + "auxiliary_loss_mlp": 0.0129404, + "balance_loss_clip": 0.06297298, + "balance_loss_mlp": 0.01273977, + "epoch": 0.22347813016684204, + "flos": 14141904520320.0, + "grad_norm": 2.4369226344025665, + "language_loss": 0.80004126, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.87827849, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20068359, + "step": 3717, + "time_per_iteration": 2.5208563804626465 + }, + { + "auxiliary_loss_clip": 0.065373, + "auxiliary_loss_mlp": 0.01297317, + "balance_loss_clip": 0.06298472, + "balance_loss_mlp": 0.01275215, + "epoch": 0.22353825341951, + "flos": 23589323717760.0, + "grad_norm": 2.564573329936102, + "language_loss": 0.68781847, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.76616466, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22106934, + "step": 3718, + "time_per_iteration": 2.6491305828094482 + }, + { + "auxiliary_loss_clip": 0.06536659, + "auxiliary_loss_mlp": 0.01296292, + "balance_loss_clip": 0.06298986, + "balance_loss_mlp": 0.01271187, + "epoch": 0.22359837667217797, + "flos": 29831759769600.0, + "grad_norm": 1.515297493499622, + "language_loss": 0.80957985, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88790929, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25085449, + "step": 3719, + "time_per_iteration": 2.6334550380706787 + }, + { + "auxiliary_loss_clip": 0.06540611, + "auxiliary_loss_mlp": 0.01300766, + "balance_loss_clip": 0.06299402, + "balance_loss_mlp": 0.01276793, + "epoch": 0.22365849992484593, + "flos": 17608867056000.0, + "grad_norm": 2.352033480486632, + "language_loss": 0.87360144, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.95201522, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.23962402, + "step": 3720, + "time_per_iteration": 2.5415003299713135 + }, + { + "auxiliary_loss_clip": 0.06526608, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 0.06295964, + "balance_loss_mlp": 0.01271672, + "epoch": 0.2237186231775139, + "flos": 22717558638720.0, + "grad_norm": 1.8478771577440833, + "language_loss": 0.75151736, + "learning_rate": 3.619086370692945e-06, + "loss": 0.8297134, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21325684, + "step": 3721, + "time_per_iteration": 2.548450469970703 + }, + { + "auxiliary_loss_clip": 0.06540586, + "auxiliary_loss_mlp": 0.0129148, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01269105, + "epoch": 0.22377874643018186, + "flos": 13376720234880.0, + "grad_norm": 2.2094798322640736, + "language_loss": 0.79352558, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.87184626, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.22375488, + "step": 3722, + "time_per_iteration": 2.519277572631836 + }, + { + "auxiliary_loss_clip": 0.06531984, + "auxiliary_loss_mlp": 0.01288897, + "balance_loss_clip": 0.06299505, + "balance_loss_mlp": 0.01267964, + "epoch": 0.22383886968284986, + "flos": 17900797080960.0, + "grad_norm": 2.2930078409484196, + "language_loss": 0.83410442, + "learning_rate": 3.618628972906178e-06, + "loss": 0.91231328, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20922852, + "step": 3723, + "time_per_iteration": 2.5086076259613037 + }, + { + "auxiliary_loss_clip": 0.06544059, + "auxiliary_loss_mlp": 0.01285781, + "balance_loss_clip": 0.06305651, + "balance_loss_mlp": 0.01263834, + "epoch": 0.22389899293551782, + "flos": 23886033425280.0, + "grad_norm": 4.429276920778782, + "language_loss": 0.84606177, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.92436016, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.21960449, + "step": 3724, + "time_per_iteration": 2.574178695678711 + }, + { + "auxiliary_loss_clip": 0.06534179, + "auxiliary_loss_mlp": 0.01287846, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01267211, + "epoch": 0.2239591161881858, + "flos": 27279929600640.0, + "grad_norm": 1.978846940821608, + "language_loss": 0.79885381, + "learning_rate": 3.618171329605121e-06, + "loss": 0.87707412, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.20617676, + "step": 3725, + "time_per_iteration": 2.589184522628784 + }, + { + "auxiliary_loss_clip": 0.06541407, + "auxiliary_loss_mlp": 0.01289084, + "balance_loss_clip": 0.06307919, + "balance_loss_mlp": 0.01267197, + "epoch": 0.22401923944085375, + "flos": 22243423910400.0, + "grad_norm": 1.7178260071510263, + "language_loss": 0.78001326, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.85831815, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21875, + "step": 3726, + "time_per_iteration": 3.980494976043701 + }, + { + "auxiliary_loss_clip": 0.06552388, + "auxiliary_loss_mlp": 0.01297244, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.01272175, + "epoch": 0.22407936269352172, + "flos": 12057920023680.0, + "grad_norm": 3.478702992871699, + "language_loss": 0.73437679, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.81287301, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25097656, + "step": 3727, + "time_per_iteration": 2.4799015522003174 + }, + { + "auxiliary_loss_clip": 0.06549139, + "auxiliary_loss_mlp": 0.01296668, + "balance_loss_clip": 0.06309944, + "balance_loss_mlp": 0.0127341, + "epoch": 0.22413948594618968, + "flos": 19359482883840.0, + "grad_norm": 2.179866459674304, + "language_loss": 0.8799302, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.95838827, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23254395, + "step": 3728, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.0653842, + "auxiliary_loss_mlp": 0.01294185, + "balance_loss_clip": 0.06303863, + "balance_loss_mlp": 0.0126989, + "epoch": 0.22419960919885765, + "flos": 24176789493120.0, + "grad_norm": 1.9160734665449493, + "language_loss": 0.80446088, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.88278687, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24328613, + "step": 3729, + "time_per_iteration": 4.021615266799927 + }, + { + "auxiliary_loss_clip": 0.06533324, + "auxiliary_loss_mlp": 0.01292111, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01271237, + "epoch": 0.22425973245152564, + "flos": 27386007269760.0, + "grad_norm": 1.6841051152750983, + "language_loss": 0.87170112, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.94995546, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2088623, + "step": 3730, + "time_per_iteration": 2.598576307296753 + }, + { + "auxiliary_loss_clip": 0.0653019, + "auxiliary_loss_mlp": 0.01298076, + "balance_loss_clip": 0.06300467, + "balance_loss_mlp": 0.01276403, + "epoch": 0.2243198557041936, + "flos": 13740794225280.0, + "grad_norm": 2.088554635044429, + "language_loss": 0.73449922, + "learning_rate": 3.616796927310559e-06, + "loss": 0.81278187, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21655273, + "step": 3731, + "time_per_iteration": 2.5361716747283936 + }, + { + "auxiliary_loss_clip": 0.06541456, + "auxiliary_loss_mlp": 0.01292681, + "balance_loss_clip": 0.06301124, + "balance_loss_mlp": 0.01267933, + "epoch": 0.22437997895686157, + "flos": 19535775874560.0, + "grad_norm": 5.172507402775724, + "language_loss": 0.75803339, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.83637482, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.24755859, + "step": 3732, + "time_per_iteration": 2.5423076152801514 + }, + { + "auxiliary_loss_clip": 0.06533462, + "auxiliary_loss_mlp": 0.01296517, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01273664, + "epoch": 0.22444010220952954, + "flos": 23703032108160.0, + "grad_norm": 1.6752991374876018, + "language_loss": 0.89338291, + "learning_rate": 3.616338302646873e-06, + "loss": 0.97168273, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2286377, + "step": 3733, + "time_per_iteration": 4.021088123321533 + }, + { + "auxiliary_loss_clip": 0.065323, + "auxiliary_loss_mlp": 0.01294952, + "balance_loss_clip": 0.06298727, + "balance_loss_mlp": 0.01270193, + "epoch": 0.2245002254621975, + "flos": 22389514704000.0, + "grad_norm": 1.4651206016819107, + "language_loss": 0.85422146, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.93249398, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.24780273, + "step": 3734, + "time_per_iteration": 2.5562949180603027 + }, + { + "auxiliary_loss_clip": 0.06539299, + "auxiliary_loss_mlp": 0.01283537, + "balance_loss_clip": 0.06303868, + "balance_loss_mlp": 0.01261113, + "epoch": 0.22456034871486547, + "flos": 26949453897600.0, + "grad_norm": 1.579737554219585, + "language_loss": 0.77332962, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.85155803, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22436523, + "step": 3735, + "time_per_iteration": 4.016703367233276 + }, + { + "auxiliary_loss_clip": 0.06526705, + "auxiliary_loss_mlp": 0.01290552, + "balance_loss_clip": 0.06298478, + "balance_loss_mlp": 0.01269559, + "epoch": 0.22462047196753343, + "flos": 28990700012160.0, + "grad_norm": 1.885472064442235, + "language_loss": 0.84907603, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92724866, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.21008301, + "step": 3736, + "time_per_iteration": 2.6118290424346924 + }, + { + "auxiliary_loss_clip": 0.06536424, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.0630133, + "balance_loss_mlp": 0.01261922, + "epoch": 0.22468059522020142, + "flos": 20017541324160.0, + "grad_norm": 1.5290746464045628, + "language_loss": 0.87103891, + "learning_rate": 3.615420317888586e-06, + "loss": 0.94926155, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23913574, + "step": 3737, + "time_per_iteration": 2.5211808681488037 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288351, + "balance_loss_clip": 0.06294889, + "balance_loss_mlp": 0.01263949, + "epoch": 0.2247407184728694, + "flos": 29321846547840.0, + "grad_norm": 1.8581473098744326, + "language_loss": 0.80131769, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87954295, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24389648, + "step": 3738, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.06530435, + "auxiliary_loss_mlp": 0.01285051, + "balance_loss_clip": 0.06293893, + "balance_loss_mlp": 0.01263295, + "epoch": 0.22480084172553735, + "flos": 22317035322240.0, + "grad_norm": 1.7432458267253939, + "language_loss": 0.77190316, + "learning_rate": 3.614960957933224e-06, + "loss": 0.85005802, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.21728516, + "step": 3739, + "time_per_iteration": 2.540266275405884 + }, + { + "auxiliary_loss_clip": 0.06531328, + "auxiliary_loss_mlp": 0.01283134, + "balance_loss_clip": 0.06295189, + "balance_loss_mlp": 0.01260091, + "epoch": 0.22486096497820532, + "flos": 25598019720960.0, + "grad_norm": 4.441094103460663, + "language_loss": 0.74799633, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.82614094, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.23022461, + "step": 3740, + "time_per_iteration": 2.640592575073242 + }, + { + "auxiliary_loss_clip": 0.06520827, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06289122, + "balance_loss_mlp": 0.01256681, + "epoch": 0.22492108823087328, + "flos": 17645651798400.0, + "grad_norm": 2.0040821388775285, + "language_loss": 0.75983584, + "learning_rate": 3.614501353019939e-06, + "loss": 0.83783156, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.22070312, + "step": 3741, + "time_per_iteration": 2.513965129852295 + }, + { + "auxiliary_loss_clip": 0.06526901, + "auxiliary_loss_mlp": 0.01283674, + "balance_loss_clip": 0.06296658, + "balance_loss_mlp": 0.0126224, + "epoch": 0.22498121148354125, + "flos": 16040246296320.0, + "grad_norm": 1.702368757801579, + "language_loss": 0.87747514, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.95558089, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21435547, + "step": 3742, + "time_per_iteration": 2.5164167881011963 + }, + { + "auxiliary_loss_clip": 0.0652426, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01259389, + "epoch": 0.22504133473620924, + "flos": 24030489064320.0, + "grad_norm": 1.7109022824395175, + "language_loss": 0.82010657, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.22473145, + "step": 3743, + "time_per_iteration": 2.5486276149749756 + }, + { + "auxiliary_loss_clip": 0.06524298, + "auxiliary_loss_mlp": 0.0127565, + "balance_loss_clip": 0.06291372, + "balance_loss_mlp": 0.01254562, + "epoch": 0.2251014579888772, + "flos": 16769610161280.0, + "grad_norm": 2.126207867209009, + "language_loss": 0.64185399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7198534, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2109375, + "step": 3744, + "time_per_iteration": 2.535020351409912 + }, + { + "auxiliary_loss_clip": 0.06527244, + "auxiliary_loss_mlp": 0.01277496, + "balance_loss_clip": 0.06293654, + "balance_loss_mlp": 0.01256372, + "epoch": 0.22516158124154517, + "flos": 13996191070080.0, + "grad_norm": 3.1643825534304684, + "language_loss": 0.76886272, + "learning_rate": 3.613581408598489e-06, + "loss": 0.84691012, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21105957, + "step": 3745, + "time_per_iteration": 2.5233495235443115 + }, + { + "auxiliary_loss_clip": 0.06522205, + "auxiliary_loss_mlp": 0.01281406, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01260675, + "epoch": 0.22522170449421314, + "flos": 14394869596800.0, + "grad_norm": 1.6969236990578618, + "language_loss": 0.80721819, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88525426, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20739746, + "step": 3746, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.06533524, + "auxiliary_loss_mlp": 0.01280566, + "balance_loss_clip": 0.06296681, + "balance_loss_mlp": 0.0125881, + "epoch": 0.2252818277468811, + "flos": 23812338159360.0, + "grad_norm": 2.077776202364112, + "language_loss": 0.86226261, + "learning_rate": 3.613121069229862e-06, + "loss": 0.94040346, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21765137, + "step": 3747, + "time_per_iteration": 2.5834550857543945 + }, + { + "auxiliary_loss_clip": 0.06530412, + "auxiliary_loss_mlp": 0.01275087, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01255095, + "epoch": 0.22534195099954907, + "flos": 24725038757760.0, + "grad_norm": 1.8595393434505574, + "language_loss": 0.76982796, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.84788299, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.1998291, + "step": 3748, + "time_per_iteration": 2.5877788066864014 + }, + { + "auxiliary_loss_clip": 0.0652978, + "auxiliary_loss_mlp": 0.0128313, + "balance_loss_clip": 0.06296694, + "balance_loss_mlp": 0.01261768, + "epoch": 0.22540207425221703, + "flos": 21038625578880.0, + "grad_norm": 1.5282192474331018, + "language_loss": 0.80547005, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.88359916, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.21374512, + "step": 3749, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06526259, + "auxiliary_loss_mlp": 0.01273546, + "balance_loss_clip": 0.06298405, + "balance_loss_mlp": 0.01253698, + "epoch": 0.22546219750488503, + "flos": 19396351480320.0, + "grad_norm": 1.5225090015602234, + "language_loss": 0.80070651, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.87870455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19848633, + "step": 3750, + "time_per_iteration": 2.524614095687866 + }, + { + "auxiliary_loss_clip": 0.06532078, + "auxiliary_loss_mlp": 0.01279372, + "balance_loss_clip": 0.06297495, + "balance_loss_mlp": 0.01258117, + "epoch": 0.225522320757553, + "flos": 25199760464640.0, + "grad_norm": 5.336084937176506, + "language_loss": 0.8300491, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.90816361, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21264648, + "step": 3751, + "time_per_iteration": 2.5638771057128906 + }, + { + "auxiliary_loss_clip": 0.06527963, + "auxiliary_loss_mlp": 0.01280546, + "balance_loss_clip": 0.06296829, + "balance_loss_mlp": 0.01260149, + "epoch": 0.22558244401022096, + "flos": 17168456396160.0, + "grad_norm": 1.7246902184661286, + "language_loss": 0.8427825, + "learning_rate": 3.611969150491165e-06, + "loss": 0.92086762, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20385742, + "step": 3752, + "time_per_iteration": 2.5650362968444824 + }, + { + "auxiliary_loss_clip": 0.06527157, + "auxiliary_loss_mlp": 0.01275092, + "balance_loss_clip": 0.06298538, + "balance_loss_mlp": 0.01254839, + "epoch": 0.22564256726288892, + "flos": 15236306697600.0, + "grad_norm": 1.7312534305272433, + "language_loss": 0.78620666, + "learning_rate": 3.611738583330375e-06, + "loss": 0.8642292, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20251465, + "step": 3753, + "time_per_iteration": 2.510344982147217 + }, + { + "auxiliary_loss_clip": 0.06525348, + "auxiliary_loss_mlp": 0.01279816, + "balance_loss_clip": 0.06296748, + "balance_loss_mlp": 0.01257869, + "epoch": 0.2257026905155569, + "flos": 34577215902720.0, + "grad_norm": 1.9706921359503449, + "language_loss": 0.79448152, + "learning_rate": 3.611507955052295e-06, + "loss": 0.8725332, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21948242, + "step": 3754, + "time_per_iteration": 2.6429665088653564 + }, + { + "auxiliary_loss_clip": 0.06526577, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06299241, + "balance_loss_mlp": 0.01259835, + "epoch": 0.22576281376822485, + "flos": 19944642672000.0, + "grad_norm": 1.7667035857085684, + "language_loss": 0.70640147, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.78447914, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.21374512, + "step": 3755, + "time_per_iteration": 2.5482447147369385 + }, + { + "auxiliary_loss_clip": 0.06530152, + "auxiliary_loss_mlp": 0.01282078, + "balance_loss_clip": 0.06295566, + "balance_loss_mlp": 0.01261085, + "epoch": 0.22582293702089282, + "flos": 24607892350080.0, + "grad_norm": 2.6955819116528588, + "language_loss": 0.77899122, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.85711348, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21008301, + "step": 3756, + "time_per_iteration": 2.573639392852783 + }, + { + "auxiliary_loss_clip": 0.06536651, + "auxiliary_loss_mlp": 0.01278842, + "balance_loss_clip": 0.0629873, + "balance_loss_mlp": 0.01255394, + "epoch": 0.2258830602735608, + "flos": 23041451796480.0, + "grad_norm": 2.9460656412940405, + "language_loss": 0.82867002, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.90682495, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23461914, + "step": 3757, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06538612, + "auxiliary_loss_mlp": 0.01279229, + "balance_loss_clip": 0.06302969, + "balance_loss_mlp": 0.01257164, + "epoch": 0.22594318352622877, + "flos": 22164068494080.0, + "grad_norm": 3.099441845199118, + "language_loss": 0.73941171, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.81759018, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2208252, + "step": 3758, + "time_per_iteration": 2.506148099899292 + }, + { + "auxiliary_loss_clip": 0.06531477, + "auxiliary_loss_mlp": 0.01288595, + "balance_loss_clip": 0.06296086, + "balance_loss_mlp": 0.01266816, + "epoch": 0.22600330677889674, + "flos": 20600478979200.0, + "grad_norm": 2.4125098710516117, + "language_loss": 0.77881908, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.85701978, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21777344, + "step": 3759, + "time_per_iteration": 2.5171775817871094 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288917, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01266267, + "epoch": 0.2260634300315647, + "flos": 35667970427520.0, + "grad_norm": 1.6851914496917324, + "language_loss": 0.7921207, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.87035167, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.22644043, + "step": 3760, + "time_per_iteration": 2.6410677433013916 + }, + { + "auxiliary_loss_clip": 0.06433272, + "auxiliary_loss_mlp": 0.01258557, + "balance_loss_clip": 0.06311189, + "balance_loss_mlp": 0.01252156, + "epoch": 0.22612355328423267, + "flos": 72107707685760.0, + "grad_norm": 0.875668320300708, + "language_loss": 0.60230321, + "learning_rate": 3.609891846556569e-06, + "loss": 0.67922151, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06408691, + "step": 3761, + "time_per_iteration": 3.1083786487579346 + }, + { + "auxiliary_loss_clip": 0.06545433, + "auxiliary_loss_mlp": 0.01288291, + "balance_loss_clip": 0.06303856, + "balance_loss_mlp": 0.01267012, + "epoch": 0.22618367653690064, + "flos": 22790373436800.0, + "grad_norm": 3.0022983434583783, + "language_loss": 0.77876461, + "learning_rate": 3.609660729655211e-06, + "loss": 0.8571018, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21289062, + "step": 3762, + "time_per_iteration": 2.5256128311157227 + }, + { + "auxiliary_loss_clip": 0.06531228, + "auxiliary_loss_mlp": 0.01280361, + "balance_loss_clip": 0.06294668, + "balance_loss_mlp": 0.01258343, + "epoch": 0.22624379978956863, + "flos": 20454388185600.0, + "grad_norm": 1.959767281760525, + "language_loss": 0.79828411, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.87639999, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22033691, + "step": 3763, + "time_per_iteration": 2.528965950012207 + }, + { + "auxiliary_loss_clip": 0.06540731, + "auxiliary_loss_mlp": 0.01291635, + "balance_loss_clip": 0.06300753, + "balance_loss_mlp": 0.01268949, + "epoch": 0.2263039230422366, + "flos": 17500189910400.0, + "grad_norm": 1.5800574189561347, + "language_loss": 0.91907668, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99740022, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22705078, + "step": 3764, + "time_per_iteration": 2.5012450218200684 + }, + { + "auxiliary_loss_clip": 0.06527007, + "auxiliary_loss_mlp": 0.01291683, + "balance_loss_clip": 0.06295396, + "balance_loss_mlp": 0.01271001, + "epoch": 0.22636404629490456, + "flos": 28337295473280.0, + "grad_norm": 3.379650672619254, + "language_loss": 0.75542498, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.83361191, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20690918, + "step": 3765, + "time_per_iteration": 2.6149775981903076 + }, + { + "auxiliary_loss_clip": 0.06519896, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01268256, + "epoch": 0.22642416954757252, + "flos": 17494152416640.0, + "grad_norm": 2.1325205607667526, + "language_loss": 0.90732884, + "learning_rate": 3.608735651752494e-06, + "loss": 0.98543364, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22338867, + "step": 3766, + "time_per_iteration": 3.925321340560913 + }, + { + "auxiliary_loss_clip": 0.06520344, + "auxiliary_loss_mlp": 0.01279841, + "balance_loss_clip": 0.0629393, + "balance_loss_mlp": 0.0125756, + "epoch": 0.2264842928002405, + "flos": 24390621912960.0, + "grad_norm": 1.5335844294501488, + "language_loss": 0.74866152, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22290039, + "step": 3767, + "time_per_iteration": 2.585827589035034 + }, + { + "auxiliary_loss_clip": 0.06526411, + "auxiliary_loss_mlp": 0.01285323, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01262816, + "epoch": 0.22654441605290845, + "flos": 19836971775360.0, + "grad_norm": 1.5156609478299474, + "language_loss": 0.72064531, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.79876268, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.22521973, + "step": 3768, + "time_per_iteration": 3.9932377338409424 + }, + { + "auxiliary_loss_clip": 0.06525982, + "auxiliary_loss_mlp": 0.01291355, + "balance_loss_clip": 0.06294759, + "balance_loss_mlp": 0.01268347, + "epoch": 0.22660453930557642, + "flos": 27462050449920.0, + "grad_norm": 1.8227506475765343, + "language_loss": 0.78781188, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.86598527, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.22998047, + "step": 3769, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.06531481, + "auxiliary_loss_mlp": 0.01287446, + "balance_loss_clip": 0.06292526, + "balance_loss_mlp": 0.01265428, + "epoch": 0.2266646625582444, + "flos": 23995004060160.0, + "grad_norm": 2.604534401291856, + "language_loss": 0.69374454, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22021484, + "step": 3770, + "time_per_iteration": 2.6160407066345215 + }, + { + "auxiliary_loss_clip": 0.065291, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06292273, + "balance_loss_mlp": 0.01269077, + "epoch": 0.22672478581091238, + "flos": 26034698874240.0, + "grad_norm": 1.4830972618629188, + "language_loss": 0.8083868, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88657784, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20922852, + "step": 3771, + "time_per_iteration": 2.576948642730713 + }, + { + "auxiliary_loss_clip": 0.06521479, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012613, + "epoch": 0.22678490906358034, + "flos": 23848577850240.0, + "grad_norm": 1.5694676435300003, + "language_loss": 0.79189658, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86994874, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22436523, + "step": 3772, + "time_per_iteration": 4.012827396392822 + }, + { + "auxiliary_loss_clip": 0.06410234, + "auxiliary_loss_mlp": 0.01286376, + "balance_loss_clip": 0.06287075, + "balance_loss_mlp": 0.01280571, + "epoch": 0.2268450323162483, + "flos": 65070163912320.0, + "grad_norm": 0.6415690360853892, + "language_loss": 0.53899318, + "learning_rate": 3.607114417129261e-06, + "loss": 0.61595929, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.0579834, + "step": 3773, + "time_per_iteration": 3.249551773071289 + }, + { + "auxiliary_loss_clip": 0.06526346, + "auxiliary_loss_mlp": 0.01287624, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01266238, + "epoch": 0.22690515556891627, + "flos": 22532251334400.0, + "grad_norm": 1.8359701531623327, + "language_loss": 0.70997107, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.78811073, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21386719, + "step": 3774, + "time_per_iteration": 2.558279275894165 + }, + { + "auxiliary_loss_clip": 0.06521672, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06291246, + "balance_loss_mlp": 0.01266857, + "epoch": 0.22696527882158424, + "flos": 18229344140160.0, + "grad_norm": 2.047907778931267, + "language_loss": 0.75449002, + "learning_rate": 3.606650658627658e-06, + "loss": 0.83258545, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21008301, + "step": 3775, + "time_per_iteration": 3.928666353225708 + }, + { + "auxiliary_loss_clip": 0.06524701, + "auxiliary_loss_mlp": 0.01286732, + "balance_loss_clip": 0.06292307, + "balance_loss_mlp": 0.01266168, + "epoch": 0.22702540207425223, + "flos": 17024923152000.0, + "grad_norm": 2.031895062113734, + "language_loss": 0.82818532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.90629965, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20581055, + "step": 3776, + "time_per_iteration": 2.5941483974456787 + }, + { + "auxiliary_loss_clip": 0.06528914, + "auxiliary_loss_mlp": 0.01279846, + "balance_loss_clip": 0.06293055, + "balance_loss_mlp": 0.01259222, + "epoch": 0.2270855253269202, + "flos": 21332316539520.0, + "grad_norm": 1.645158938946052, + "language_loss": 0.83362442, + "learning_rate": 3.606186656428641e-06, + "loss": 0.91171205, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20617676, + "step": 3777, + "time_per_iteration": 2.5177228450775146 + }, + { + "auxiliary_loss_clip": 0.06532624, + "auxiliary_loss_mlp": 0.01278936, + "balance_loss_clip": 0.06296799, + "balance_loss_mlp": 0.01257002, + "epoch": 0.22714564857958816, + "flos": 23557276730880.0, + "grad_norm": 1.8837878269403912, + "language_loss": 0.73246169, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.81057739, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21948242, + "step": 3778, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.06530988, + "auxiliary_loss_mlp": 0.01275867, + "balance_loss_clip": 0.06293572, + "balance_loss_mlp": 0.01255673, + "epoch": 0.22720577183225613, + "flos": 25996237050240.0, + "grad_norm": 2.9659284448048555, + "language_loss": 0.65779513, + "learning_rate": 3.605722410602591e-06, + "loss": 0.73586369, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20178223, + "step": 3779, + "time_per_iteration": 2.543818950653076 + }, + { + "auxiliary_loss_clip": 0.06525169, + "auxiliary_loss_mlp": 0.01276701, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.01255982, + "epoch": 0.2272658950849241, + "flos": 20820432746880.0, + "grad_norm": 1.7825989229768946, + "language_loss": 0.70823693, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7862556, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20703125, + "step": 3780, + "time_per_iteration": 2.558850049972534 + }, + { + "auxiliary_loss_clip": 0.06528573, + "auxiliary_loss_mlp": 0.01280577, + "balance_loss_clip": 0.06296494, + "balance_loss_mlp": 0.01257927, + "epoch": 0.22732601833759206, + "flos": 23915187446400.0, + "grad_norm": 1.6463040629853982, + "language_loss": 0.89639765, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97448915, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2265625, + "step": 3781, + "time_per_iteration": 2.527230739593506 + }, + { + "auxiliary_loss_clip": 0.06532317, + "auxiliary_loss_mlp": 0.01280346, + "balance_loss_clip": 0.06296034, + "balance_loss_mlp": 0.01257672, + "epoch": 0.22738614159026002, + "flos": 15929850142080.0, + "grad_norm": 2.4692396393453016, + "language_loss": 0.75309098, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.83121765, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2265625, + "step": 3782, + "time_per_iteration": 2.4901020526885986 + }, + { + "auxiliary_loss_clip": 0.06532567, + "auxiliary_loss_mlp": 0.01278379, + "balance_loss_clip": 0.06300219, + "balance_loss_mlp": 0.01257959, + "epoch": 0.22744626484292801, + "flos": 24212148716160.0, + "grad_norm": 1.7681967435875452, + "language_loss": 0.8314634, + "learning_rate": 3.604793188351095e-06, + "loss": 0.90957284, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20422363, + "step": 3783, + "time_per_iteration": 2.559361696243286 + }, + { + "auxiliary_loss_clip": 0.06539755, + "auxiliary_loss_mlp": 0.0128451, + "balance_loss_clip": 0.06305835, + "balance_loss_mlp": 0.01262266, + "epoch": 0.22750638809559598, + "flos": 24798734023680.0, + "grad_norm": 1.794476113807414, + "language_loss": 0.76757884, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8458215, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22229004, + "step": 3784, + "time_per_iteration": 2.6693339347839355 + }, + { + "auxiliary_loss_clip": 0.06533188, + "auxiliary_loss_mlp": 0.012806, + "balance_loss_clip": 0.06299379, + "balance_loss_mlp": 0.01257998, + "epoch": 0.22756651134826394, + "flos": 22243004640000.0, + "grad_norm": 1.5985438146538498, + "language_loss": 0.71667248, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79481035, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22583008, + "step": 3785, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.06421004, + "auxiliary_loss_mlp": 0.0127133, + "balance_loss_clip": 0.0629871, + "balance_loss_mlp": 0.01265915, + "epoch": 0.2276266346009319, + "flos": 62728225021440.0, + "grad_norm": 1.545506426452605, + "language_loss": 0.63058448, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.70750785, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05422974, + "step": 3786, + "time_per_iteration": 3.1247661113739014 + }, + { + "auxiliary_loss_clip": 0.06538717, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06302891, + "balance_loss_mlp": 0.01254299, + "epoch": 0.22768675785359987, + "flos": 18618085958400.0, + "grad_norm": 2.466113986800572, + "language_loss": 0.8751514, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.95331335, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23156738, + "step": 3787, + "time_per_iteration": 2.488539457321167 + }, + { + "auxiliary_loss_clip": 0.06537791, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305036, + "balance_loss_mlp": 0.01259488, + "epoch": 0.22774688110626784, + "flos": 26877477640320.0, + "grad_norm": 2.053207704033697, + "language_loss": 0.73054254, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.80872202, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20678711, + "step": 3788, + "time_per_iteration": 2.5763657093048096 + }, + { + "auxiliary_loss_clip": 0.06534025, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 0.06303776, + "balance_loss_mlp": 0.01260971, + "epoch": 0.2278070043589358, + "flos": 15557977722240.0, + "grad_norm": 4.57361945380841, + "language_loss": 0.68007839, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.75824702, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21850586, + "step": 3789, + "time_per_iteration": 2.4907443523406982 + }, + { + "auxiliary_loss_clip": 0.0653897, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 0.06308074, + "balance_loss_mlp": 0.0126115, + "epoch": 0.2278671276116038, + "flos": 22422987210240.0, + "grad_norm": 2.4388022002275243, + "language_loss": 0.76775718, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.84598166, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22338867, + "step": 3790, + "time_per_iteration": 2.5787651538848877 + }, + { + "auxiliary_loss_clip": 0.06540109, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06309578, + "balance_loss_mlp": 0.01259252, + "epoch": 0.22792725086427176, + "flos": 20637641064960.0, + "grad_norm": 1.9300771626575046, + "language_loss": 0.91910696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.99733061, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.23010254, + "step": 3791, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.06538808, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 0.06302848, + "balance_loss_mlp": 0.01256893, + "epoch": 0.22798737411693973, + "flos": 31436662147200.0, + "grad_norm": 1.9637481556258098, + "language_loss": 0.83064067, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.9088037, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20617676, + "step": 3792, + "time_per_iteration": 2.6190388202667236 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 0.06289717, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2280474973696077, + "flos": 52412074220160.0, + "grad_norm": 1.1033671526650368, + "language_loss": 0.65792358, + "learning_rate": 3.602465874182981e-06, + "loss": 0.73471832, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05432129, + "step": 3793, + "time_per_iteration": 2.9110665321350098 + }, + { + "auxiliary_loss_clip": 0.0654863, + "auxiliary_loss_mlp": 0.01287304, + "balance_loss_clip": 0.06306019, + "balance_loss_mlp": 0.01261889, + "epoch": 0.22810762062227566, + "flos": 26403300984960.0, + "grad_norm": 1.9908643306499119, + "language_loss": 0.78207439, + "learning_rate": 3.602232808409293e-06, + "loss": 0.8604337, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.25415039, + "step": 3794, + "time_per_iteration": 2.5911734104156494 + }, + { + "auxiliary_loss_clip": 0.06544799, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 0.06310074, + "balance_loss_mlp": 0.01262412, + "epoch": 0.22816774387494362, + "flos": 25637445866880.0, + "grad_norm": 3.443157636284035, + "language_loss": 0.81285226, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89115357, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22912598, + "step": 3795, + "time_per_iteration": 2.6825528144836426 + }, + { + "auxiliary_loss_clip": 0.06536914, + "auxiliary_loss_mlp": 0.0128896, + "balance_loss_clip": 0.06306744, + "balance_loss_mlp": 0.01267586, + "epoch": 0.22822786712761162, + "flos": 22457507892480.0, + "grad_norm": 1.703568435651106, + "language_loss": 0.77948368, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.85774243, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21362305, + "step": 3796, + "time_per_iteration": 2.5418922901153564 + }, + { + "auxiliary_loss_clip": 0.06535624, + "auxiliary_loss_mlp": 0.01278994, + "balance_loss_clip": 0.06302401, + "balance_loss_mlp": 0.01258692, + "epoch": 0.22828799038027958, + "flos": 12207323053440.0, + "grad_norm": 2.5041816771456076, + "language_loss": 0.96305406, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.04120016, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20324707, + "step": 3797, + "time_per_iteration": 2.5794107913970947 + }, + { + "auxiliary_loss_clip": 0.06537494, + "auxiliary_loss_mlp": 0.01281478, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01260057, + "epoch": 0.22834811363294755, + "flos": 22091379477120.0, + "grad_norm": 1.517581709018558, + "language_loss": 0.82277977, + "learning_rate": 3.601299937834666e-06, + "loss": 0.90096951, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2142334, + "step": 3798, + "time_per_iteration": 2.618784189224243 + }, + { + "auxiliary_loss_clip": 0.06536907, + "auxiliary_loss_mlp": 0.01279844, + "balance_loss_clip": 0.06300005, + "balance_loss_mlp": 0.01257146, + "epoch": 0.2284082368856155, + "flos": 24867104555520.0, + "grad_norm": 1.8603662335211264, + "language_loss": 0.79381669, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.87198418, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22705078, + "step": 3799, + "time_per_iteration": 2.591053009033203 + }, + { + "auxiliary_loss_clip": 0.06534393, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06300979, + "balance_loss_mlp": 0.01258646, + "epoch": 0.22846836013828348, + "flos": 23299280409600.0, + "grad_norm": 1.5152328596048934, + "language_loss": 0.75782096, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.83597749, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22619629, + "step": 3800, + "time_per_iteration": 2.5370395183563232 + }, + { + "auxiliary_loss_clip": 0.06535068, + "auxiliary_loss_mlp": 0.01279113, + "balance_loss_clip": 0.06302812, + "balance_loss_mlp": 0.01258001, + "epoch": 0.22852848339095144, + "flos": 27423462844800.0, + "grad_norm": 1.9420817073182375, + "language_loss": 0.64685607, + "learning_rate": 3.600599647297484e-06, + "loss": 0.72499788, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21105957, + "step": 3801, + "time_per_iteration": 2.6190593242645264 + }, + { + "auxiliary_loss_clip": 0.06524718, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06296816, + "balance_loss_mlp": 0.01257835, + "epoch": 0.2285886066436194, + "flos": 26328054418560.0, + "grad_norm": 1.6808395254049295, + "language_loss": 0.81957126, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89760411, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20727539, + "step": 3802, + "time_per_iteration": 2.554079055786133 + }, + { + "auxiliary_loss_clip": 0.06534229, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06299631, + "balance_loss_mlp": 0.0126415, + "epoch": 0.2286487298962874, + "flos": 29724298508160.0, + "grad_norm": 1.6760491170738747, + "language_loss": 0.79838073, + "learning_rate": 3.600132483450114e-06, + "loss": 0.87659228, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22790527, + "step": 3803, + "time_per_iteration": 2.6287641525268555 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01279074, + "balance_loss_clip": 0.06296768, + "balance_loss_mlp": 0.012559, + "epoch": 0.22870885314895537, + "flos": 21293435445120.0, + "grad_norm": 1.7238152987334623, + "language_loss": 0.86273003, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94087803, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.23168945, + "step": 3804, + "time_per_iteration": 2.511462450027466 + }, + { + "auxiliary_loss_clip": 0.06539486, + "auxiliary_loss_mlp": 0.01279472, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01257537, + "epoch": 0.22876897640162333, + "flos": 14944754016000.0, + "grad_norm": 1.89266353651555, + "language_loss": 0.76854289, + "learning_rate": 3.59966507689401e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21923828, + "step": 3805, + "time_per_iteration": 3.929358959197998 + }, + { + "auxiliary_loss_clip": 0.0654166, + "auxiliary_loss_mlp": 0.01280204, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257542, + "epoch": 0.2288290996542913, + "flos": 18119786526720.0, + "grad_norm": 2.0123502787071073, + "language_loss": 0.79403114, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.87224978, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.22680664, + "step": 3806, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.06540429, + "auxiliary_loss_mlp": 0.01282432, + "balance_loss_clip": 0.06303287, + "balance_loss_mlp": 0.01259878, + "epoch": 0.22888922290695926, + "flos": 39864296828160.0, + "grad_norm": 1.8839046523975558, + "language_loss": 0.70310783, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.78133643, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22546387, + "step": 3807, + "time_per_iteration": 4.134840488433838 + }, + { + "auxiliary_loss_clip": 0.06550615, + "auxiliary_loss_mlp": 0.01290274, + "balance_loss_clip": 0.06307966, + "balance_loss_mlp": 0.01265121, + "epoch": 0.22894934615962723, + "flos": 23410431250560.0, + "grad_norm": 2.1946772997431103, + "language_loss": 0.65960705, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.73801601, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25183105, + "step": 3808, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.06539108, + "auxiliary_loss_mlp": 0.01281064, + "balance_loss_clip": 0.06300798, + "balance_loss_mlp": 0.01259154, + "epoch": 0.22900946941229522, + "flos": 18848898829440.0, + "grad_norm": 1.7761632941249064, + "language_loss": 0.75198555, + "learning_rate": 3.598729535939222e-06, + "loss": 0.83018732, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21899414, + "step": 3809, + "time_per_iteration": 2.490895986557007 + }, + { + "auxiliary_loss_clip": 0.06533305, + "auxiliary_loss_mlp": 0.0127892, + "balance_loss_clip": 0.06299955, + "balance_loss_mlp": 0.01257331, + "epoch": 0.22906959266496318, + "flos": 22935961105920.0, + "grad_norm": 1.4656596651362013, + "language_loss": 0.82576305, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.90388525, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21606445, + "step": 3810, + "time_per_iteration": 2.5684924125671387 + }, + { + "auxiliary_loss_clip": 0.06535805, + "auxiliary_loss_mlp": 0.01278794, + "balance_loss_clip": 0.06303711, + "balance_loss_mlp": 0.01259041, + "epoch": 0.22912971591763115, + "flos": 19360614913920.0, + "grad_norm": 1.8664104481323773, + "language_loss": 0.79914212, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8772881, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19750977, + "step": 3811, + "time_per_iteration": 3.9766526222229004 + }, + { + "auxiliary_loss_clip": 0.0653518, + "auxiliary_loss_mlp": 0.01280553, + "balance_loss_clip": 0.06300636, + "balance_loss_mlp": 0.01258976, + "epoch": 0.22918983917029911, + "flos": 19938940594560.0, + "grad_norm": 1.7476175457386653, + "language_loss": 0.83391893, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.91207623, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21569824, + "step": 3812, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.0655017, + "auxiliary_loss_mlp": 0.01288002, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01264673, + "epoch": 0.22924996242296708, + "flos": 16696501873920.0, + "grad_norm": 2.3839142545709886, + "language_loss": 0.8400377, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.91841948, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2331543, + "step": 3813, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06538843, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301966, + "balance_loss_mlp": 0.01255456, + "epoch": 0.22931008567563504, + "flos": 33044457490560.0, + "grad_norm": 1.6858267943586043, + "language_loss": 0.70580167, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78395313, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20861816, + "step": 3814, + "time_per_iteration": 2.6764509677886963 + }, + { + "auxiliary_loss_clip": 0.06536946, + "auxiliary_loss_mlp": 0.01276372, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01256786, + "epoch": 0.229370208928303, + "flos": 23337322963200.0, + "grad_norm": 2.8831118113675114, + "language_loss": 0.67954975, + "learning_rate": 3.597324405965139e-06, + "loss": 0.75768292, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.19604492, + "step": 3815, + "time_per_iteration": 3.9759562015533447 + }, + { + "auxiliary_loss_clip": 0.06547147, + "auxiliary_loss_mlp": 0.01282792, + "balance_loss_clip": 0.06311129, + "balance_loss_mlp": 0.01259952, + "epoch": 0.229430332180971, + "flos": 28624068472320.0, + "grad_norm": 1.7261339214380451, + "language_loss": 0.83511633, + "learning_rate": 3.597090005586848e-06, + "loss": 0.91341567, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.22839355, + "step": 3816, + "time_per_iteration": 2.6059420108795166 + }, + { + "auxiliary_loss_clip": 0.06539545, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06303526, + "balance_loss_mlp": 0.01253302, + "epoch": 0.22949045543363897, + "flos": 17243912597760.0, + "grad_norm": 2.759151157832335, + "language_loss": 0.87850988, + "learning_rate": 3.596855544646742e-06, + "loss": 0.95666116, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22290039, + "step": 3817, + "time_per_iteration": 2.4830808639526367 + }, + { + "auxiliary_loss_clip": 0.06543944, + "auxiliary_loss_mlp": 0.01278311, + "balance_loss_clip": 0.06306894, + "balance_loss_mlp": 0.01256412, + "epoch": 0.22955057868630693, + "flos": 27496654986240.0, + "grad_norm": 1.6534336608142677, + "language_loss": 0.75343978, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.83166242, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.21899414, + "step": 3818, + "time_per_iteration": 2.634387969970703 + }, + { + "auxiliary_loss_clip": 0.06541272, + "auxiliary_loss_mlp": 0.01278617, + "balance_loss_clip": 0.06305389, + "balance_loss_mlp": 0.0125524, + "epoch": 0.2296107019389749, + "flos": 23483036413440.0, + "grad_norm": 1.7338201278327374, + "language_loss": 0.75486314, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83306205, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.23376465, + "step": 3819, + "time_per_iteration": 2.593780279159546 + }, + { + "auxiliary_loss_clip": 0.06542156, + "auxiliary_loss_mlp": 0.01283095, + "balance_loss_clip": 0.06305272, + "balance_loss_mlp": 0.01263009, + "epoch": 0.22967082519164286, + "flos": 31293212757120.0, + "grad_norm": 1.753994919034331, + "language_loss": 0.8208195, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.89907205, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20092773, + "step": 3820, + "time_per_iteration": 2.6047699451446533 + }, + { + "auxiliary_loss_clip": 0.06548945, + "auxiliary_loss_mlp": 0.0128207, + "balance_loss_clip": 0.06306617, + "balance_loss_mlp": 0.0125892, + "epoch": 0.22973094844431083, + "flos": 14647415402880.0, + "grad_norm": 4.329935521611207, + "language_loss": 0.70069146, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77900159, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23156738, + "step": 3821, + "time_per_iteration": 2.479454517364502 + }, + { + "auxiliary_loss_clip": 0.06540461, + "auxiliary_loss_mlp": 0.01284444, + "balance_loss_clip": 0.06305948, + "balance_loss_mlp": 0.0126177, + "epoch": 0.2297910716969788, + "flos": 22831057393920.0, + "grad_norm": 2.1026243527938897, + "language_loss": 0.83607674, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.91432583, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22668457, + "step": 3822, + "time_per_iteration": 2.6070644855499268 + }, + { + "auxiliary_loss_clip": 0.06532617, + "auxiliary_loss_mlp": 0.01279894, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01256637, + "epoch": 0.2298511949496468, + "flos": 23045644500480.0, + "grad_norm": 1.4679532921797136, + "language_loss": 0.66860032, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74672538, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23266602, + "step": 3823, + "time_per_iteration": 2.5421886444091797 + }, + { + "auxiliary_loss_clip": 0.06414426, + "auxiliary_loss_mlp": 0.01282472, + "balance_loss_clip": 0.062925, + "balance_loss_mlp": 0.01277524, + "epoch": 0.22991131820231475, + "flos": 66910296228480.0, + "grad_norm": 0.7674542175482253, + "language_loss": 0.56982124, + "learning_rate": 3.595212623082357e-06, + "loss": 0.64679027, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.04943848, + "step": 3824, + "time_per_iteration": 3.2466728687286377 + }, + { + "auxiliary_loss_clip": 0.06530097, + "auxiliary_loss_mlp": 0.0127961, + "balance_loss_clip": 0.06299412, + "balance_loss_mlp": 0.01258975, + "epoch": 0.22997144145498272, + "flos": 17891782767360.0, + "grad_norm": 2.0818696062092643, + "language_loss": 0.73658061, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81467766, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2064209, + "step": 3825, + "time_per_iteration": 2.4705512523651123 + }, + { + "auxiliary_loss_clip": 0.06534772, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06299614, + "balance_loss_mlp": 0.01257432, + "epoch": 0.23003156470765068, + "flos": 24683055062400.0, + "grad_norm": 2.356013632504241, + "language_loss": 0.88289648, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96104205, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22351074, + "step": 3826, + "time_per_iteration": 2.5636119842529297 + }, + { + "auxiliary_loss_clip": 0.06540347, + "auxiliary_loss_mlp": 0.01282145, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0125897, + "epoch": 0.23009168796031865, + "flos": 15819412060800.0, + "grad_norm": 2.476820030154751, + "language_loss": 0.81866372, + "learning_rate": 3.594507606303083e-06, + "loss": 0.89688861, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.23181152, + "step": 3827, + "time_per_iteration": 2.4817094802856445 + }, + { + "auxiliary_loss_clip": 0.06527712, + "auxiliary_loss_mlp": 0.01278643, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2301518112129866, + "flos": 16217755171200.0, + "grad_norm": 1.7308897820243296, + "language_loss": 0.87303799, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95110154, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21716309, + "step": 3828, + "time_per_iteration": 2.517916202545166 + }, + { + "auxiliary_loss_clip": 0.06537049, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06300969, + "balance_loss_mlp": 0.01260686, + "epoch": 0.2302119344656546, + "flos": 20601820644480.0, + "grad_norm": 2.1621841127041668, + "language_loss": 0.71223086, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79042029, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21191406, + "step": 3829, + "time_per_iteration": 2.5232293605804443 + }, + { + "auxiliary_loss_clip": 0.06527743, + "auxiliary_loss_mlp": 0.01278561, + "balance_loss_clip": 0.06299868, + "balance_loss_mlp": 0.01258629, + "epoch": 0.23027205771832257, + "flos": 26804117790720.0, + "grad_norm": 1.5730479724984117, + "language_loss": 0.84944689, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19934082, + "step": 3830, + "time_per_iteration": 2.6153595447540283 + }, + { + "auxiliary_loss_clip": 0.0653088, + "auxiliary_loss_mlp": 0.01278488, + "balance_loss_clip": 0.06299009, + "balance_loss_mlp": 0.01256863, + "epoch": 0.23033218097099054, + "flos": 43883365916160.0, + "grad_norm": 2.1076872960056834, + "language_loss": 0.67121679, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.74931049, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21630859, + "step": 3831, + "time_per_iteration": 2.7302401065826416 + }, + { + "auxiliary_loss_clip": 0.06528492, + "auxiliary_loss_mlp": 0.0127826, + "balance_loss_clip": 0.06295311, + "balance_loss_mlp": 0.01255944, + "epoch": 0.2303923042236585, + "flos": 26074837779840.0, + "grad_norm": 2.0679638399971525, + "language_loss": 0.7580992, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.83616674, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2232666, + "step": 3832, + "time_per_iteration": 2.5789363384246826 + }, + { + "auxiliary_loss_clip": 0.06538022, + "auxiliary_loss_mlp": 0.01277154, + "balance_loss_clip": 0.06301656, + "balance_loss_mlp": 0.01254731, + "epoch": 0.23045242747632647, + "flos": 18302284719360.0, + "grad_norm": 1.9809188001289737, + "language_loss": 0.88229948, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96045125, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 3833, + "time_per_iteration": 2.4890406131744385 + }, + { + "auxiliary_loss_clip": 0.06526786, + "auxiliary_loss_mlp": 0.01275622, + "balance_loss_clip": 0.06291149, + "balance_loss_mlp": 0.01253295, + "epoch": 0.23051255072899443, + "flos": 25527636691200.0, + "grad_norm": 1.751792699614105, + "language_loss": 0.75447762, + "learning_rate": 3.592860451331624e-06, + "loss": 0.83250165, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2232666, + "step": 3834, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.06528607, + "auxiliary_loss_mlp": 0.0128462, + "balance_loss_clip": 0.06295913, + "balance_loss_mlp": 0.01262089, + "epoch": 0.2305726739816624, + "flos": 21221584968960.0, + "grad_norm": 2.065687600185831, + "language_loss": 0.86859775, + "learning_rate": 3.592624901801432e-06, + "loss": 0.94673002, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2253418, + "step": 3835, + "time_per_iteration": 2.5243782997131348 + }, + { + "auxiliary_loss_clip": 0.06531255, + "auxiliary_loss_mlp": 0.01277066, + "balance_loss_clip": 0.06292518, + "balance_loss_mlp": 0.01255489, + "epoch": 0.2306327972343304, + "flos": 23337826087680.0, + "grad_norm": 2.699164056519065, + "language_loss": 0.8346436, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.91272676, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21594238, + "step": 3836, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06530184, + "auxiliary_loss_mlp": 0.01278505, + "balance_loss_clip": 0.0629724, + "balance_loss_mlp": 0.01257918, + "epoch": 0.23069292048699835, + "flos": 20672832579840.0, + "grad_norm": 1.5308621387149557, + "language_loss": 0.80123997, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.87932694, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20581055, + "step": 3837, + "time_per_iteration": 2.5265891551971436 + }, + { + "auxiliary_loss_clip": 0.06398934, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06276935, + "balance_loss_mlp": 0.01257871, + "epoch": 0.23075304373966632, + "flos": 70472854673280.0, + "grad_norm": 0.8661269137999401, + "language_loss": 0.65425092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.73087507, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05606079, + "step": 3838, + "time_per_iteration": 3.0690691471099854 + }, + { + "auxiliary_loss_clip": 0.06529964, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 0.0629662, + "balance_loss_mlp": 0.01260592, + "epoch": 0.23081316699233428, + "flos": 16623603221760.0, + "grad_norm": 1.9712307402798914, + "language_loss": 0.76919234, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84731126, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21337891, + "step": 3839, + "time_per_iteration": 2.507899522781372 + }, + { + "auxiliary_loss_clip": 0.06539556, + "auxiliary_loss_mlp": 0.01283771, + "balance_loss_clip": 0.06303147, + "balance_loss_mlp": 0.01261873, + "epoch": 0.23087329024500225, + "flos": 13303192677120.0, + "grad_norm": 1.9535711626830803, + "language_loss": 0.6973604, + "learning_rate": 3.591446248441752e-06, + "loss": 0.77559364, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21899414, + "step": 3840, + "time_per_iteration": 2.507403612136841 + }, + { + "auxiliary_loss_clip": 0.06524121, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01261994, + "epoch": 0.23093341349767021, + "flos": 17791574883840.0, + "grad_norm": 2.1010490795203967, + "language_loss": 0.79679501, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87487352, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21740723, + "step": 3841, + "time_per_iteration": 2.542506456375122 + }, + { + "auxiliary_loss_clip": 0.06525128, + "auxiliary_loss_mlp": 0.0128577, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.0126591, + "epoch": 0.23099353675033818, + "flos": 23994920206080.0, + "grad_norm": 2.202794692504719, + "language_loss": 0.83472121, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91283023, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.19873047, + "step": 3842, + "time_per_iteration": 2.5885045528411865 + }, + { + "auxiliary_loss_clip": 0.06525495, + "auxiliary_loss_mlp": 0.01277864, + "balance_loss_clip": 0.06294134, + "balance_loss_mlp": 0.01256251, + "epoch": 0.23105366000300617, + "flos": 36004567478400.0, + "grad_norm": 1.5198018897685672, + "language_loss": 0.66582537, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.74385899, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.21630859, + "step": 3843, + "time_per_iteration": 2.7418570518493652 + }, + { + "auxiliary_loss_clip": 0.06517389, + "auxiliary_loss_mlp": 0.01282302, + "balance_loss_clip": 0.06289946, + "balance_loss_mlp": 0.01261667, + "epoch": 0.23111378325567414, + "flos": 31252822289280.0, + "grad_norm": 2.0273673860648613, + "language_loss": 0.77953953, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85753644, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2064209, + "step": 3844, + "time_per_iteration": 2.697105884552002 + }, + { + "auxiliary_loss_clip": 0.0652685, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.0629425, + "balance_loss_mlp": 0.01258618, + "epoch": 0.2311739065083421, + "flos": 19214230631040.0, + "grad_norm": 1.5733936305181, + "language_loss": 0.78526026, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86331779, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20275879, + "step": 3845, + "time_per_iteration": 3.9081645011901855 + }, + { + "auxiliary_loss_clip": 0.06512116, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06288872, + "balance_loss_mlp": 0.01256323, + "epoch": 0.23123402976101007, + "flos": 23365638443520.0, + "grad_norm": 2.144369954512039, + "language_loss": 0.7696318, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84750825, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.1920166, + "step": 3846, + "time_per_iteration": 2.5204334259033203 + }, + { + "auxiliary_loss_clip": 0.06530652, + "auxiliary_loss_mlp": 0.01280785, + "balance_loss_clip": 0.06296441, + "balance_loss_mlp": 0.01258946, + "epoch": 0.23129415301367803, + "flos": 13740458808960.0, + "grad_norm": 2.058546116129278, + "language_loss": 0.70736533, + "learning_rate": 3.589793599381304e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21862793, + "step": 3847, + "time_per_iteration": 3.955061197280884 + }, + { + "auxiliary_loss_clip": 0.06395237, + "auxiliary_loss_mlp": 0.01270099, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01264553, + "epoch": 0.231354276266346, + "flos": 69756907461120.0, + "grad_norm": 0.7764718422559022, + "language_loss": 0.60909712, + "learning_rate": 3.589557265446198e-06, + "loss": 0.68575048, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.05557251, + "step": 3848, + "time_per_iteration": 3.0406246185302734 + }, + { + "auxiliary_loss_clip": 0.0652846, + "auxiliary_loss_mlp": 0.0128118, + "balance_loss_clip": 0.06295802, + "balance_loss_mlp": 0.01259925, + "epoch": 0.231414399519014, + "flos": 18840597275520.0, + "grad_norm": 2.051565204924659, + "language_loss": 0.79345453, + "learning_rate": 3.589320871234923e-06, + "loss": 0.87155092, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21252441, + "step": 3849, + "time_per_iteration": 2.508357048034668 + }, + { + "auxiliary_loss_clip": 0.06525768, + "auxiliary_loss_mlp": 0.01279584, + "balance_loss_clip": 0.06294318, + "balance_loss_mlp": 0.01257995, + "epoch": 0.23147452277168196, + "flos": 36143949945600.0, + "grad_norm": 1.9799304996672493, + "language_loss": 0.72033536, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7983889, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.21594238, + "step": 3850, + "time_per_iteration": 2.6283209323883057 + }, + { + "auxiliary_loss_clip": 0.06522007, + "auxiliary_loss_mlp": 0.012814, + "balance_loss_clip": 0.06293751, + "balance_loss_mlp": 0.01260562, + "epoch": 0.23153464602434992, + "flos": 20819091081600.0, + "grad_norm": 2.1585980033328216, + "language_loss": 0.76770389, + "learning_rate": 3.588847902019718e-06, + "loss": 0.84573799, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20825195, + "step": 3851, + "time_per_iteration": 3.9542527198791504 + }, + { + "auxiliary_loss_clip": 0.06522575, + "auxiliary_loss_mlp": 0.01285563, + "balance_loss_clip": 0.06294242, + "balance_loss_mlp": 0.01264367, + "epoch": 0.2315947692770179, + "flos": 19945606993920.0, + "grad_norm": 4.396515099862161, + "language_loss": 0.70780337, + "learning_rate": 3.588611327033723e-06, + "loss": 0.78588474, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21191406, + "step": 3852, + "time_per_iteration": 2.5292365550994873 + }, + { + "auxiliary_loss_clip": 0.06530476, + "auxiliary_loss_mlp": 0.01287483, + "balance_loss_clip": 0.0629744, + "balance_loss_mlp": 0.01267027, + "epoch": 0.23165489252968585, + "flos": 12859805197440.0, + "grad_norm": 2.0519661349019906, + "language_loss": 0.68142366, + "learning_rate": 3.588374691807428e-06, + "loss": 0.75960326, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20471191, + "step": 3853, + "time_per_iteration": 2.524214267730713 + }, + { + "auxiliary_loss_clip": 0.06532255, + "auxiliary_loss_mlp": 0.0127975, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.01258579, + "epoch": 0.23171501578235382, + "flos": 30636202492800.0, + "grad_norm": 2.067759569090495, + "language_loss": 0.80620718, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88432729, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21166992, + "step": 3854, + "time_per_iteration": 3.9913628101348877 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.0128392, + "balance_loss_clip": 0.06299743, + "balance_loss_mlp": 0.0126201, + "epoch": 0.23177513903502178, + "flos": 23849709880320.0, + "grad_norm": 1.9679065377847755, + "language_loss": 0.66096866, + "learning_rate": 3.587901240669831e-06, + "loss": 0.73921382, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.21899414, + "step": 3855, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.06526054, + "auxiliary_loss_mlp": 0.0129156, + "balance_loss_clip": 0.06295231, + "balance_loss_mlp": 0.0126972, + "epoch": 0.23183526228768978, + "flos": 29578040006400.0, + "grad_norm": 1.903884891832667, + "language_loss": 0.71179903, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.78997517, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21838379, + "step": 3856, + "time_per_iteration": 2.602130174636841 + }, + { + "auxiliary_loss_clip": 0.06526691, + "auxiliary_loss_mlp": 0.01281572, + "balance_loss_clip": 0.06295416, + "balance_loss_mlp": 0.01261032, + "epoch": 0.23189538554035774, + "flos": 34467155164800.0, + "grad_norm": 1.5724941960823864, + "language_loss": 0.77830631, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85638893, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20532227, + "step": 3857, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.06534412, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06299518, + "balance_loss_mlp": 0.01259813, + "epoch": 0.2319555087930257, + "flos": 18009558080640.0, + "grad_norm": 2.2572913357008804, + "language_loss": 0.91563249, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99379921, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2244873, + "step": 3858, + "time_per_iteration": 2.532270908355713 + }, + { + "auxiliary_loss_clip": 0.06524485, + "auxiliary_loss_mlp": 0.01281992, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01261833, + "epoch": 0.23201563204569367, + "flos": 23149709671680.0, + "grad_norm": 2.204043049012761, + "language_loss": 0.77328205, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85134679, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20153809, + "step": 3859, + "time_per_iteration": 2.539982318878174 + }, + { + "auxiliary_loss_clip": 0.06526206, + "auxiliary_loss_mlp": 0.01282174, + "balance_loss_clip": 0.0629694, + "balance_loss_mlp": 0.01261098, + "epoch": 0.23207575529836164, + "flos": 20674300026240.0, + "grad_norm": 1.845949683873727, + "language_loss": 0.84980345, + "learning_rate": 3.58671655924898e-06, + "loss": 0.9278872, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21057129, + "step": 3860, + "time_per_iteration": 2.5464277267456055 + }, + { + "auxiliary_loss_clip": 0.06522566, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 0.06296555, + "balance_loss_mlp": 0.01254927, + "epoch": 0.2321358785510296, + "flos": 16477805917440.0, + "grad_norm": 2.2860023761203423, + "language_loss": 0.83316106, + "learning_rate": 3.586479442423508e-06, + "loss": 0.91114187, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.20581055, + "step": 3861, + "time_per_iteration": 2.611527681350708 + }, + { + "auxiliary_loss_clip": 0.06526297, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06296666, + "balance_loss_mlp": 0.01261198, + "epoch": 0.2321960018036976, + "flos": 21622737191040.0, + "grad_norm": 1.932164160561112, + "language_loss": 0.86100018, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93908012, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2052002, + "step": 3862, + "time_per_iteration": 2.599078893661499 + }, + { + "auxiliary_loss_clip": 0.06517789, + "auxiliary_loss_mlp": 0.01277863, + "balance_loss_clip": 0.0629621, + "balance_loss_mlp": 0.0125898, + "epoch": 0.23225612505636556, + "flos": 22277734957440.0, + "grad_norm": 1.8279700206037066, + "language_loss": 0.75524014, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.83319664, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18884277, + "step": 3863, + "time_per_iteration": 2.5592801570892334 + }, + { + "auxiliary_loss_clip": 0.06518993, + "auxiliary_loss_mlp": 0.01279608, + "balance_loss_clip": 0.06295245, + "balance_loss_mlp": 0.01260237, + "epoch": 0.23231624830903352, + "flos": 17057431336320.0, + "grad_norm": 1.8656538002376628, + "language_loss": 0.7504397, + "learning_rate": 3.58576773102631e-06, + "loss": 0.82842577, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.19372559, + "step": 3864, + "time_per_iteration": 2.549480438232422 + }, + { + "auxiliary_loss_clip": 0.06521947, + "auxiliary_loss_mlp": 0.01276148, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255572, + "epoch": 0.2323763715617015, + "flos": 34648353619200.0, + "grad_norm": 2.1960138476201023, + "language_loss": 0.70505309, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.78303403, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20568848, + "step": 3865, + "time_per_iteration": 2.6358752250671387 + }, + { + "auxiliary_loss_clip": 0.06539118, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06299968, + "balance_loss_mlp": 0.01256464, + "epoch": 0.23243649481436945, + "flos": 25557922742400.0, + "grad_norm": 1.8533317501805489, + "language_loss": 0.95648015, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.03467083, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23510742, + "step": 3866, + "time_per_iteration": 2.5805771350860596 + }, + { + "auxiliary_loss_clip": 0.06523386, + "auxiliary_loss_mlp": 0.0128215, + "balance_loss_clip": 0.06294955, + "balance_loss_mlp": 0.01260561, + "epoch": 0.23249661806703742, + "flos": 20489411992320.0, + "grad_norm": 3.3036871554572285, + "language_loss": 0.74161094, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.81966627, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21569824, + "step": 3867, + "time_per_iteration": 2.485872268676758 + }, + { + "auxiliary_loss_clip": 0.06527717, + "auxiliary_loss_mlp": 0.01278812, + "balance_loss_clip": 0.06298171, + "balance_loss_mlp": 0.01257271, + "epoch": 0.23255674131970538, + "flos": 20382956979840.0, + "grad_norm": 1.7596317335066716, + "language_loss": 0.82912898, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90719432, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2154541, + "step": 3868, + "time_per_iteration": 2.5404841899871826 + }, + { + "auxiliary_loss_clip": 0.06518516, + "auxiliary_loss_mlp": 0.01279395, + "balance_loss_clip": 0.0629604, + "balance_loss_mlp": 0.01260321, + "epoch": 0.23261686457237338, + "flos": 17061833675520.0, + "grad_norm": 1.6597028261056146, + "language_loss": 0.73686016, + "learning_rate": 3.58458034283495e-06, + "loss": 0.81483924, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.1907959, + "step": 3869, + "time_per_iteration": 2.4850685596466064 + }, + { + "auxiliary_loss_clip": 0.06524374, + "auxiliary_loss_mlp": 0.01289937, + "balance_loss_clip": 0.06296247, + "balance_loss_mlp": 0.01268241, + "epoch": 0.23267698782504134, + "flos": 29177726325120.0, + "grad_norm": 1.8030595092782438, + "language_loss": 0.8079325, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.88607562, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21716309, + "step": 3870, + "time_per_iteration": 2.5915870666503906 + }, + { + "auxiliary_loss_clip": 0.06532744, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.0126178, + "epoch": 0.2327371110777093, + "flos": 21180355960320.0, + "grad_norm": 1.9640097574691695, + "language_loss": 0.71693742, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.79509664, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21411133, + "step": 3871, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.065286, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 0.06295659, + "balance_loss_mlp": 0.01260034, + "epoch": 0.23279723433037727, + "flos": 24869997521280.0, + "grad_norm": 2.5352867939179484, + "language_loss": 0.69289309, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.77098656, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20715332, + "step": 3872, + "time_per_iteration": 2.5636072158813477 + }, + { + "auxiliary_loss_clip": 0.06535204, + "auxiliary_loss_mlp": 0.01285984, + "balance_loss_clip": 0.06299452, + "balance_loss_mlp": 0.01263894, + "epoch": 0.23285735758304524, + "flos": 38809823921280.0, + "grad_norm": 2.0709139139802497, + "language_loss": 0.78303361, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.86124545, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.22094727, + "step": 3873, + "time_per_iteration": 2.671551465988159 + }, + { + "auxiliary_loss_clip": 0.06419215, + "auxiliary_loss_mlp": 0.01286246, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01280601, + "epoch": 0.2329174808357132, + "flos": 53962274280960.0, + "grad_norm": 0.8377063316545934, + "language_loss": 0.60286367, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.67991829, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05636597, + "step": 3874, + "time_per_iteration": 3.087822675704956 + }, + { + "auxiliary_loss_clip": 0.06525364, + "auxiliary_loss_mlp": 0.01281697, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.012608, + "epoch": 0.23297760408838117, + "flos": 21222549290880.0, + "grad_norm": 2.3064833177652773, + "language_loss": 0.81324208, + "learning_rate": 3.583153494218927e-06, + "loss": 0.89131272, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.20898438, + "step": 3875, + "time_per_iteration": 2.560511589050293 + }, + { + "auxiliary_loss_clip": 0.06520373, + "auxiliary_loss_mlp": 0.01275593, + "balance_loss_clip": 0.06294609, + "balance_loss_mlp": 0.01255983, + "epoch": 0.23303772734104916, + "flos": 28410613395840.0, + "grad_norm": 2.285945976693144, + "language_loss": 0.62077069, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.69873035, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19628906, + "step": 3876, + "time_per_iteration": 2.63901948928833 + }, + { + "auxiliary_loss_clip": 0.06525883, + "auxiliary_loss_mlp": 0.01277799, + "balance_loss_clip": 0.06296121, + "balance_loss_mlp": 0.01258034, + "epoch": 0.23309785059371713, + "flos": 24321328986240.0, + "grad_norm": 1.9984006432494335, + "language_loss": 0.71087664, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.78891349, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19787598, + "step": 3877, + "time_per_iteration": 2.533858299255371 + }, + { + "auxiliary_loss_clip": 0.06524412, + "auxiliary_loss_mlp": 0.01274037, + "balance_loss_clip": 0.06297307, + "balance_loss_mlp": 0.01253485, + "epoch": 0.2331579738463851, + "flos": 15997633695360.0, + "grad_norm": 2.4085120625047143, + "language_loss": 0.81286502, + "learning_rate": 3.582439259339073e-06, + "loss": 0.89084947, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20556641, + "step": 3878, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06534204, + "auxiliary_loss_mlp": 0.01280932, + "balance_loss_clip": 0.06299698, + "balance_loss_mlp": 0.0126013, + "epoch": 0.23321809709905306, + "flos": 36435418773120.0, + "grad_norm": 2.3738521781051207, + "language_loss": 0.75046253, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82861388, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20788574, + "step": 3879, + "time_per_iteration": 2.6389944553375244 + }, + { + "auxiliary_loss_clip": 0.06528227, + "auxiliary_loss_mlp": 0.01279465, + "balance_loss_clip": 0.06299725, + "balance_loss_mlp": 0.01257972, + "epoch": 0.23327822035172102, + "flos": 21331184509440.0, + "grad_norm": 4.081669167605711, + "language_loss": 0.90526301, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.98333991, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.21496582, + "step": 3880, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.06530303, + "auxiliary_loss_mlp": 0.01278258, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125841, + "epoch": 0.233338343604389, + "flos": 19177907086080.0, + "grad_norm": 1.8856968798779488, + "language_loss": 0.72716117, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80524671, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.19848633, + "step": 3881, + "time_per_iteration": 2.528083324432373 + }, + { + "auxiliary_loss_clip": 0.0653114, + "auxiliary_loss_mlp": 0.01278184, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0125805, + "epoch": 0.23339846685705698, + "flos": 26915939464320.0, + "grad_norm": 1.6578041146422486, + "language_loss": 0.68699455, + "learning_rate": 3.581486106120537e-06, + "loss": 0.76508778, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20129395, + "step": 3882, + "time_per_iteration": 2.575275182723999 + }, + { + "auxiliary_loss_clip": 0.06529698, + "auxiliary_loss_mlp": 0.0127867, + "balance_loss_clip": 0.0629693, + "balance_loss_mlp": 0.01258226, + "epoch": 0.23345859010972494, + "flos": 32351375243520.0, + "grad_norm": 2.0584115637368767, + "language_loss": 0.77458596, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.8526696, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 3883, + "time_per_iteration": 2.626533269882202 + }, + { + "auxiliary_loss_clip": 0.06405331, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01257663, + "epoch": 0.2335187133623929, + "flos": 58505805273600.0, + "grad_norm": 0.7704933603606158, + "language_loss": 0.59193355, + "learning_rate": 3.58100916965445e-06, + "loss": 0.66861278, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.04925537, + "step": 3884, + "time_per_iteration": 4.6365087032318115 + }, + { + "auxiliary_loss_clip": 0.06533933, + "auxiliary_loss_mlp": 0.01280044, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01260017, + "epoch": 0.23357883661506088, + "flos": 24509822745600.0, + "grad_norm": 1.6610169782824564, + "language_loss": 0.80755335, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.88569313, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.20031738, + "step": 3885, + "time_per_iteration": 2.6180286407470703 + }, + { + "auxiliary_loss_clip": 0.06523974, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 0.06296945, + "balance_loss_mlp": 0.01265687, + "epoch": 0.23363895986772884, + "flos": 18953760614400.0, + "grad_norm": 2.3207575064623613, + "language_loss": 0.88500953, + "learning_rate": 3.580531993380261e-06, + "loss": 0.96311754, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21130371, + "step": 3886, + "time_per_iteration": 2.5116477012634277 + }, + { + "auxiliary_loss_clip": 0.06532702, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 0.06302926, + "balance_loss_mlp": 0.01262518, + "epoch": 0.2336990831203968, + "flos": 31694993884800.0, + "grad_norm": 1.8877154320423692, + "language_loss": 0.74203557, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.82019114, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20336914, + "step": 3887, + "time_per_iteration": 4.024793863296509 + }, + { + "auxiliary_loss_clip": 0.06531121, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01261206, + "epoch": 0.23375920637306477, + "flos": 27717237659520.0, + "grad_norm": 1.8176198265631485, + "language_loss": 0.84478307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.92290735, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20092773, + "step": 3888, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01283639, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01263934, + "epoch": 0.23381932962573276, + "flos": 17681346437760.0, + "grad_norm": 2.056965631559896, + "language_loss": 0.88319886, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.96128076, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19689941, + "step": 3889, + "time_per_iteration": 2.524937152862549 + }, + { + "auxiliary_loss_clip": 0.06524722, + "auxiliary_loss_mlp": 0.01282198, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01262708, + "epoch": 0.23387945287840073, + "flos": 14395833918720.0, + "grad_norm": 2.5361674913720487, + "language_loss": 0.7777229, + "learning_rate": 3.579576921697125e-06, + "loss": 0.85579211, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19470215, + "step": 3890, + "time_per_iteration": 4.02982497215271 + }, + { + "auxiliary_loss_clip": 0.06526545, + "auxiliary_loss_mlp": 0.01284178, + "balance_loss_clip": 0.06297928, + "balance_loss_mlp": 0.01264008, + "epoch": 0.2339395761310687, + "flos": 46108451888640.0, + "grad_norm": 1.897831891943022, + "language_loss": 0.74213481, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82024205, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20166016, + "step": 3891, + "time_per_iteration": 2.7951042652130127 + }, + { + "auxiliary_loss_clip": 0.06524959, + "auxiliary_loss_mlp": 0.01281513, + "balance_loss_clip": 0.06301059, + "balance_loss_mlp": 0.01262821, + "epoch": 0.23399969938373666, + "flos": 22388508455040.0, + "grad_norm": 1.6273389699862264, + "language_loss": 0.82863498, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.90669972, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18688965, + "step": 3892, + "time_per_iteration": 2.530782461166382 + }, + { + "auxiliary_loss_clip": 0.06531358, + "auxiliary_loss_mlp": 0.01281181, + "balance_loss_clip": 0.06301633, + "balance_loss_mlp": 0.01260951, + "epoch": 0.23405982263640462, + "flos": 43518746874240.0, + "grad_norm": 1.4575042253356143, + "language_loss": 0.65593249, + "learning_rate": 3.578859988977082e-06, + "loss": 0.7340579, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20227051, + "step": 3893, + "time_per_iteration": 4.212572813034058 + }, + { + "auxiliary_loss_clip": 0.06519544, + "auxiliary_loss_mlp": 0.01283369, + "balance_loss_clip": 0.06297972, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2341199458890726, + "flos": 22571216282880.0, + "grad_norm": 2.0084649252152564, + "language_loss": 0.79620147, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.87423062, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.20117188, + "step": 3894, + "time_per_iteration": 2.580109119415283 + }, + { + "auxiliary_loss_clip": 0.06524212, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 0.06300013, + "balance_loss_mlp": 0.01257763, + "epoch": 0.23418006914174055, + "flos": 25641764352000.0, + "grad_norm": 1.5130292757453454, + "language_loss": 0.82681906, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.90482563, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18664551, + "step": 3895, + "time_per_iteration": 2.583759069442749 + }, + { + "auxiliary_loss_clip": 0.06520028, + "auxiliary_loss_mlp": 0.01278233, + "balance_loss_clip": 0.06295593, + "balance_loss_mlp": 0.0125885, + "epoch": 0.23424019239440855, + "flos": 13549826770560.0, + "grad_norm": 2.4592405022159496, + "language_loss": 0.81334293, + "learning_rate": 3.578142517422292e-06, + "loss": 0.89132559, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.19372559, + "step": 3896, + "time_per_iteration": 2.536252021789551 + }, + { + "auxiliary_loss_clip": 0.06530771, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06299435, + "balance_loss_mlp": 0.01264253, + "epoch": 0.2343003156470765, + "flos": 22426131738240.0, + "grad_norm": 3.0940729647414598, + "language_loss": 0.83988011, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91805482, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 3897, + "time_per_iteration": 2.572230577468872 + }, + { + "auxiliary_loss_clip": 0.06528857, + "auxiliary_loss_mlp": 0.01279177, + "balance_loss_clip": 0.06296414, + "balance_loss_mlp": 0.01258626, + "epoch": 0.23436043889974448, + "flos": 14795644475520.0, + "grad_norm": 2.317273344502078, + "language_loss": 0.79819012, + "learning_rate": 3.577663903820705e-06, + "loss": 0.87627041, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20544434, + "step": 3898, + "time_per_iteration": 2.5207583904266357 + }, + { + "auxiliary_loss_clip": 0.0651897, + "auxiliary_loss_mlp": 0.01278878, + "balance_loss_clip": 0.06297988, + "balance_loss_mlp": 0.0126021, + "epoch": 0.23442056215241244, + "flos": 22972242723840.0, + "grad_norm": 1.88849810547605, + "language_loss": 0.7476474, + "learning_rate": 3.577424507277614e-06, + "loss": 0.82562584, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18676758, + "step": 3899, + "time_per_iteration": 2.535256862640381 + }, + { + "auxiliary_loss_clip": 0.06525272, + "auxiliary_loss_mlp": 0.01280019, + "balance_loss_clip": 0.06296974, + "balance_loss_mlp": 0.01259515, + "epoch": 0.2344806854050804, + "flos": 23077901122560.0, + "grad_norm": 1.7218865416029, + "language_loss": 0.75599915, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.83405209, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20507812, + "step": 3900, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.06524841, + "auxiliary_loss_mlp": 0.01281356, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01260959, + "epoch": 0.23454080865774837, + "flos": 16332805226880.0, + "grad_norm": 2.155964713283421, + "language_loss": 0.67468774, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75274968, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20410156, + "step": 3901, + "time_per_iteration": 2.536736249923706 + }, + { + "auxiliary_loss_clip": 0.06415819, + "auxiliary_loss_mlp": 0.01256149, + "balance_loss_clip": 0.06299057, + "balance_loss_mlp": 0.01251181, + "epoch": 0.23460093191041637, + "flos": 67779545685120.0, + "grad_norm": 0.7514179301091559, + "language_loss": 0.58278525, + "learning_rate": 3.576705958788091e-06, + "loss": 0.65950489, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.0496521, + "step": 3902, + "time_per_iteration": 3.134718894958496 + }, + { + "auxiliary_loss_clip": 0.06519462, + "auxiliary_loss_mlp": 0.01278211, + "balance_loss_clip": 0.06292997, + "balance_loss_mlp": 0.01258375, + "epoch": 0.23466105516308433, + "flos": 20082725400960.0, + "grad_norm": 4.781089560028637, + "language_loss": 0.80931306, + "learning_rate": 3.576466323035108e-06, + "loss": 0.88728976, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19836426, + "step": 3903, + "time_per_iteration": 2.525059938430786 + }, + { + "auxiliary_loss_clip": 0.06522641, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06295069, + "balance_loss_mlp": 0.01258955, + "epoch": 0.2347211784157523, + "flos": 24542708273280.0, + "grad_norm": 1.8578223556950417, + "language_loss": 0.82988703, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90790236, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.19909668, + "step": 3904, + "time_per_iteration": 2.5903875827789307 + }, + { + "auxiliary_loss_clip": 0.0652332, + "auxiliary_loss_mlp": 0.01285911, + "balance_loss_clip": 0.06295672, + "balance_loss_mlp": 0.01265562, + "epoch": 0.23478130166842026, + "flos": 23811751180800.0, + "grad_norm": 1.985666710181995, + "language_loss": 0.7223646, + "learning_rate": 3.57598687219895e-06, + "loss": 0.80045688, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20361328, + "step": 3905, + "time_per_iteration": 2.5441884994506836 + }, + { + "auxiliary_loss_clip": 0.06517074, + "auxiliary_loss_mlp": 0.01274876, + "balance_loss_clip": 0.06294023, + "balance_loss_mlp": 0.01255564, + "epoch": 0.23484142492108823, + "flos": 24099823918080.0, + "grad_norm": 2.433861192511871, + "language_loss": 0.71703601, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.19311523, + "step": 3906, + "time_per_iteration": 2.698309898376465 + }, + { + "auxiliary_loss_clip": 0.06533175, + "auxiliary_loss_mlp": 0.01285298, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01264341, + "epoch": 0.2349015481737562, + "flos": 29103486007680.0, + "grad_norm": 2.7858195598302014, + "language_loss": 0.74089986, + "learning_rate": 3.575507182316473e-06, + "loss": 0.81908458, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20959473, + "step": 3907, + "time_per_iteration": 2.578900098800659 + }, + { + "auxiliary_loss_clip": 0.06524273, + "auxiliary_loss_mlp": 0.01280946, + "balance_loss_clip": 0.06294693, + "balance_loss_mlp": 0.01260418, + "epoch": 0.23496167142642416, + "flos": 18922258679040.0, + "grad_norm": 2.1308722973133385, + "language_loss": 0.73705935, + "learning_rate": 3.575267247755601e-06, + "loss": 0.81511152, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2052002, + "step": 3908, + "time_per_iteration": 2.599888801574707 + }, + { + "auxiliary_loss_clip": 0.06415461, + "auxiliary_loss_mlp": 0.01265268, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01259901, + "epoch": 0.23502179467909215, + "flos": 55884906541440.0, + "grad_norm": 1.2475277524680826, + "language_loss": 0.73364127, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.81044865, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05374146, + "step": 3909, + "time_per_iteration": 2.9221227169036865 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 0.06297419, + "balance_loss_mlp": 0.01265013, + "epoch": 0.23508191793176011, + "flos": 23408083336320.0, + "grad_norm": 1.6005271399570604, + "language_loss": 0.88581395, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9639076, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20288086, + "step": 3910, + "time_per_iteration": 2.571974277496338 + }, + { + "auxiliary_loss_clip": 0.06520193, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01258658, + "epoch": 0.23514204118442808, + "flos": 20053864869120.0, + "grad_norm": 1.9643755437340527, + "language_loss": 0.76589572, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.84388608, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2019043, + "step": 3911, + "time_per_iteration": 2.5159506797790527 + }, + { + "auxiliary_loss_clip": 0.06514487, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 0.06293596, + "balance_loss_mlp": 0.01272568, + "epoch": 0.23520216443709605, + "flos": 21587126405760.0, + "grad_norm": 1.5390832092388007, + "language_loss": 0.82200038, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.90005672, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 3912, + "time_per_iteration": 2.53330135345459 + }, + { + "auxiliary_loss_clip": 0.06515642, + "auxiliary_loss_mlp": 0.01288785, + "balance_loss_clip": 0.06294793, + "balance_loss_mlp": 0.01269604, + "epoch": 0.235262287689764, + "flos": 23192573834880.0, + "grad_norm": 1.8330232089961167, + "language_loss": 0.72023201, + "learning_rate": 3.574066679118909e-06, + "loss": 0.79827625, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19177246, + "step": 3913, + "time_per_iteration": 2.5643818378448486 + }, + { + "auxiliary_loss_clip": 0.06528541, + "auxiliary_loss_mlp": 0.01277731, + "balance_loss_clip": 0.0629672, + "balance_loss_mlp": 0.01257238, + "epoch": 0.23532241094243198, + "flos": 23191903002240.0, + "grad_norm": 1.784539383466316, + "language_loss": 0.76976919, + "learning_rate": 3.57382638628884e-06, + "loss": 0.84783185, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20483398, + "step": 3914, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.06525879, + "auxiliary_loss_mlp": 0.01279953, + "balance_loss_clip": 0.06294835, + "balance_loss_mlp": 0.01259759, + "epoch": 0.23538253419509997, + "flos": 17025007006080.0, + "grad_norm": 2.4875564397369745, + "language_loss": 0.90170735, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.97976559, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2019043, + "step": 3915, + "time_per_iteration": 2.563430070877075 + }, + { + "auxiliary_loss_clip": 0.06418007, + "auxiliary_loss_mlp": 0.01258116, + "balance_loss_clip": 0.06301998, + "balance_loss_mlp": 0.0125336, + "epoch": 0.23544265744776793, + "flos": 63465276263040.0, + "grad_norm": 0.7933859009920101, + "language_loss": 0.59378946, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6705507, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04748535, + "step": 3916, + "time_per_iteration": 3.0965490341186523 + }, + { + "auxiliary_loss_clip": 0.06410776, + "auxiliary_loss_mlp": 0.01260488, + "balance_loss_clip": 0.06295535, + "balance_loss_mlp": 0.01255756, + "epoch": 0.2355027807004359, + "flos": 70537395116160.0, + "grad_norm": 0.7426668339088592, + "language_loss": 0.49443412, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.57114673, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04724121, + "step": 3917, + "time_per_iteration": 3.180136203765869 + }, + { + "auxiliary_loss_clip": 0.06525698, + "auxiliary_loss_mlp": 0.01279416, + "balance_loss_clip": 0.06297344, + "balance_loss_mlp": 0.01259687, + "epoch": 0.23556290395310386, + "flos": 21440742122880.0, + "grad_norm": 2.189382839240281, + "language_loss": 0.77017808, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.84822929, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19714355, + "step": 3918, + "time_per_iteration": 2.546833038330078 + }, + { + "auxiliary_loss_clip": 0.0652653, + "auxiliary_loss_mlp": 0.01274201, + "balance_loss_clip": 0.06294574, + "balance_loss_mlp": 0.01254353, + "epoch": 0.23562302720577183, + "flos": 18192223981440.0, + "grad_norm": 2.402769767514051, + "language_loss": 0.70165813, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.77966547, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.1986084, + "step": 3919, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.06516096, + "auxiliary_loss_mlp": 0.01279326, + "balance_loss_clip": 0.06294449, + "balance_loss_mlp": 0.0125999, + "epoch": 0.2356831504584398, + "flos": 33739091038080.0, + "grad_norm": 1.6359966895302622, + "language_loss": 0.71094656, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.78890085, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19335938, + "step": 3920, + "time_per_iteration": 2.672703504562378 + }, + { + "auxiliary_loss_clip": 0.065192, + "auxiliary_loss_mlp": 0.0127625, + "balance_loss_clip": 0.06295229, + "balance_loss_mlp": 0.0125707, + "epoch": 0.23574327371110776, + "flos": 24939122739840.0, + "grad_norm": 1.9300596293530992, + "language_loss": 0.77833009, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85628462, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.19189453, + "step": 3921, + "time_per_iteration": 2.5823934078216553 + }, + { + "auxiliary_loss_clip": 0.06519832, + "auxiliary_loss_mlp": 0.01273471, + "balance_loss_clip": 0.06293498, + "balance_loss_mlp": 0.01254898, + "epoch": 0.23580339696377575, + "flos": 17827940355840.0, + "grad_norm": 2.282195745019935, + "language_loss": 0.76750088, + "learning_rate": 3.571901895946612e-06, + "loss": 0.84543383, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18566895, + "step": 3922, + "time_per_iteration": 2.5005834102630615 + }, + { + "auxiliary_loss_clip": 0.06518443, + "auxiliary_loss_mlp": 0.01276376, + "balance_loss_clip": 0.06292558, + "balance_loss_mlp": 0.01257255, + "epoch": 0.23586352021644372, + "flos": 26293827225600.0, + "grad_norm": 2.0102031772622277, + "language_loss": 0.80626559, + "learning_rate": 3.571661066327956e-06, + "loss": 0.88421381, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19128418, + "step": 3923, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0652013, + "auxiliary_loss_mlp": 0.01275781, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01256326, + "epoch": 0.23592364346911168, + "flos": 14251965258240.0, + "grad_norm": 1.780788070615976, + "language_loss": 0.7507394, + "learning_rate": 3.571420177111754e-06, + "loss": 0.82869852, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3924, + "time_per_iteration": 3.9297289848327637 + }, + { + "auxiliary_loss_clip": 0.06516001, + "auxiliary_loss_mlp": 0.01276934, + "balance_loss_clip": 0.06293369, + "balance_loss_mlp": 0.01258039, + "epoch": 0.23598376672177965, + "flos": 18593837400960.0, + "grad_norm": 1.7528516859224217, + "language_loss": 0.83231425, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.91024363, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 3925, + "time_per_iteration": 2.5267770290374756 + }, + { + "auxiliary_loss_clip": 0.06520985, + "auxiliary_loss_mlp": 0.01279855, + "balance_loss_clip": 0.06293195, + "balance_loss_mlp": 0.01259673, + "epoch": 0.2360438899744476, + "flos": 22682325196800.0, + "grad_norm": 1.753261892654821, + "language_loss": 0.60038519, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20178223, + "step": 3926, + "time_per_iteration": 4.023118257522583 + }, + { + "auxiliary_loss_clip": 0.06514051, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06293727, + "balance_loss_mlp": 0.01257735, + "epoch": 0.23610401322711558, + "flos": 29577872298240.0, + "grad_norm": 1.9607796947198142, + "language_loss": 0.72402066, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80192792, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1895752, + "step": 3927, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.06515504, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01256373, + "epoch": 0.23616413647978354, + "flos": 17864347754880.0, + "grad_norm": 2.08357001670468, + "language_loss": 0.75570691, + "learning_rate": 3.570456024454221e-06, + "loss": 0.83361489, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18920898, + "step": 3928, + "time_per_iteration": 2.601884365081787 + }, + { + "auxiliary_loss_clip": 0.06522287, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06293722, + "balance_loss_mlp": 0.01260338, + "epoch": 0.23622425973245154, + "flos": 11039393318400.0, + "grad_norm": 3.3378461006384788, + "language_loss": 0.82518888, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.903216, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20080566, + "step": 3929, + "time_per_iteration": 3.9035136699676514 + }, + { + "auxiliary_loss_clip": 0.0652993, + "auxiliary_loss_mlp": 0.01281554, + "balance_loss_clip": 0.06295136, + "balance_loss_mlp": 0.01261228, + "epoch": 0.2362843829851195, + "flos": 23410766666880.0, + "grad_norm": 2.0127268398029607, + "language_loss": 0.7229315, + "learning_rate": 3.569973590777789e-06, + "loss": 0.80104637, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.203125, + "step": 3930, + "time_per_iteration": 2.5537455081939697 + }, + { + "auxiliary_loss_clip": 0.06516138, + "auxiliary_loss_mlp": 0.01275778, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01257312, + "epoch": 0.23634450623778747, + "flos": 39539103932160.0, + "grad_norm": 1.8975533795335693, + "language_loss": 0.74476141, + "learning_rate": 3.569732284634665e-06, + "loss": 0.82268059, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.18444824, + "step": 3931, + "time_per_iteration": 2.6975677013397217 + }, + { + "auxiliary_loss_clip": 0.06517775, + "auxiliary_loss_mlp": 0.01279269, + "balance_loss_clip": 0.06291172, + "balance_loss_mlp": 0.01260208, + "epoch": 0.23640462949045543, + "flos": 24214077360000.0, + "grad_norm": 2.102820580807434, + "language_loss": 0.8105433, + "learning_rate": 3.569490918967136e-06, + "loss": 0.88851368, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19055176, + "step": 3932, + "time_per_iteration": 2.539280652999878 + }, + { + "auxiliary_loss_clip": 0.06510118, + "auxiliary_loss_mlp": 0.01272436, + "balance_loss_clip": 0.06289183, + "balance_loss_mlp": 0.01254949, + "epoch": 0.2364647527431234, + "flos": 26184898517760.0, + "grad_norm": 1.6370407311570319, + "language_loss": 0.85819322, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.93601882, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.17480469, + "step": 3933, + "time_per_iteration": 4.0140979290008545 + }, + { + "auxiliary_loss_clip": 0.06528582, + "auxiliary_loss_mlp": 0.01277532, + "balance_loss_clip": 0.06296912, + "balance_loss_mlp": 0.01257314, + "epoch": 0.23652487599579136, + "flos": 22643444102400.0, + "grad_norm": 3.233125821654351, + "language_loss": 0.83709848, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.91515964, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.20214844, + "step": 3934, + "time_per_iteration": 2.542692184448242 + }, + { + "auxiliary_loss_clip": 0.06519171, + "auxiliary_loss_mlp": 0.01281493, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01262896, + "epoch": 0.23658499924845935, + "flos": 21768702203520.0, + "grad_norm": 1.7174434370199074, + "language_loss": 0.7898351, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.86784172, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.18615723, + "step": 3935, + "time_per_iteration": 2.5311288833618164 + }, + { + "auxiliary_loss_clip": 0.0651848, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06296465, + "balance_loss_mlp": 0.01258533, + "epoch": 0.23664512250112732, + "flos": 21805486945920.0, + "grad_norm": 1.7511193987533888, + "language_loss": 0.80239666, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.88034987, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1829834, + "step": 3936, + "time_per_iteration": 2.5497477054595947 + }, + { + "auxiliary_loss_clip": 0.06513149, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06288509, + "balance_loss_mlp": 0.01256593, + "epoch": 0.23670524575379528, + "flos": 22644450351360.0, + "grad_norm": 1.4782770271817958, + "language_loss": 0.79820013, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8760916, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19396973, + "step": 3937, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.06515164, + "auxiliary_loss_mlp": 0.0127913, + "balance_loss_clip": 0.06294726, + "balance_loss_mlp": 0.01261487, + "epoch": 0.23676536900646325, + "flos": 16730225942400.0, + "grad_norm": 2.2850190898814686, + "language_loss": 0.85810506, + "learning_rate": 3.568041475462147e-06, + "loss": 0.93604803, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.1763916, + "step": 3938, + "time_per_iteration": 2.568195343017578 + }, + { + "auxiliary_loss_clip": 0.06509314, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06288411, + "balance_loss_mlp": 0.01259393, + "epoch": 0.23682549225913122, + "flos": 11138720734080.0, + "grad_norm": 3.1023600205020876, + "language_loss": 0.94564033, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.02351999, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19287109, + "step": 3939, + "time_per_iteration": 2.4615180492401123 + }, + { + "auxiliary_loss_clip": 0.0652378, + "auxiliary_loss_mlp": 0.01277473, + "balance_loss_clip": 0.06294175, + "balance_loss_mlp": 0.0125803, + "epoch": 0.23688561551179918, + "flos": 22564843372800.0, + "grad_norm": 5.475058210638743, + "language_loss": 0.82803464, + "learning_rate": 3.567557851847088e-06, + "loss": 0.90604717, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19445801, + "step": 3940, + "time_per_iteration": 2.573552131652832 + }, + { + "auxiliary_loss_clip": 0.06531326, + "auxiliary_loss_mlp": 0.01276996, + "balance_loss_clip": 0.06295921, + "balance_loss_mlp": 0.0125679, + "epoch": 0.23694573876446715, + "flos": 18520771040640.0, + "grad_norm": 2.098492916494123, + "language_loss": 0.8946867, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.97276992, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2019043, + "step": 3941, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.06529268, + "auxiliary_loss_mlp": 0.01286958, + "balance_loss_clip": 0.06297106, + "balance_loss_mlp": 0.01267503, + "epoch": 0.23700586201713514, + "flos": 15340246087680.0, + "grad_norm": 1.8886698836060631, + "language_loss": 0.84989077, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.92805308, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19458008, + "step": 3942, + "time_per_iteration": 2.56052827835083 + }, + { + "auxiliary_loss_clip": 0.06538361, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 0.06307331, + "balance_loss_mlp": 0.01265492, + "epoch": 0.2370659852698031, + "flos": 23953775051520.0, + "grad_norm": 2.0845511028002197, + "language_loss": 0.81156456, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.88980681, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20373535, + "step": 3943, + "time_per_iteration": 2.539264678955078 + }, + { + "auxiliary_loss_clip": 0.06543057, + "auxiliary_loss_mlp": 0.01292355, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01271494, + "epoch": 0.23712610852247107, + "flos": 15336514581120.0, + "grad_norm": 2.5863771047568926, + "language_loss": 0.682428, + "learning_rate": 3.566589891386959e-06, + "loss": 0.76078212, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20861816, + "step": 3944, + "time_per_iteration": 2.520453929901123 + }, + { + "auxiliary_loss_clip": 0.06529288, + "auxiliary_loss_mlp": 0.01297026, + "balance_loss_clip": 0.06299931, + "balance_loss_mlp": 0.01276963, + "epoch": 0.23718623177513903, + "flos": 19688658848640.0, + "grad_norm": 1.6926271274644824, + "language_loss": 0.76068223, + "learning_rate": 3.566347752735866e-06, + "loss": 0.83894539, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.20043945, + "step": 3945, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.06535566, + "auxiliary_loss_mlp": 0.01288141, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.0126859, + "epoch": 0.237246355027807, + "flos": 24980351748480.0, + "grad_norm": 1.7408538946114391, + "language_loss": 0.63962567, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.71786278, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19555664, + "step": 3946, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.06535441, + "auxiliary_loss_mlp": 0.01289697, + "balance_loss_clip": 0.06306995, + "balance_loss_mlp": 0.01269324, + "epoch": 0.23730647828047496, + "flos": 15382816761600.0, + "grad_norm": 3.1254224655104252, + "language_loss": 0.77114201, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.84939343, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20385742, + "step": 3947, + "time_per_iteration": 2.495837926864624 + }, + { + "auxiliary_loss_clip": 0.06540522, + "auxiliary_loss_mlp": 0.01290208, + "balance_loss_clip": 0.06311937, + "balance_loss_mlp": 0.01270431, + "epoch": 0.23736660153314296, + "flos": 28158738422400.0, + "grad_norm": 1.595292591120463, + "language_loss": 0.80941439, + "learning_rate": 3.565620980442944e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19775391, + "step": 3948, + "time_per_iteration": 2.6460211277008057 + }, + { + "auxiliary_loss_clip": 0.06542704, + "auxiliary_loss_mlp": 0.01297731, + "balance_loss_clip": 0.06312679, + "balance_loss_mlp": 0.01277025, + "epoch": 0.23742672478581092, + "flos": 22092385726080.0, + "grad_norm": 1.753357741589714, + "language_loss": 0.80419362, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.88259804, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.20715332, + "step": 3949, + "time_per_iteration": 2.5428664684295654 + }, + { + "auxiliary_loss_clip": 0.06549721, + "auxiliary_loss_mlp": 0.01294419, + "balance_loss_clip": 0.06317213, + "balance_loss_mlp": 0.012732, + "epoch": 0.2374868480384789, + "flos": 19543238887680.0, + "grad_norm": 1.6923054699564082, + "language_loss": 0.73375976, + "learning_rate": 3.565136168723163e-06, + "loss": 0.81220114, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2121582, + "step": 3950, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.06527583, + "auxiliary_loss_mlp": 0.01288007, + "balance_loss_clip": 0.06302388, + "balance_loss_mlp": 0.01268957, + "epoch": 0.23754697129114685, + "flos": 19427769561600.0, + "grad_norm": 1.893051910973559, + "language_loss": 0.73254943, + "learning_rate": 3.564893673833495e-06, + "loss": 0.8107053, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.1907959, + "step": 3951, + "time_per_iteration": 2.501091957092285 + }, + { + "auxiliary_loss_clip": 0.06543966, + "auxiliary_loss_mlp": 0.01301622, + "balance_loss_clip": 0.06315006, + "balance_loss_mlp": 0.01280332, + "epoch": 0.23760709454381482, + "flos": 19507208832000.0, + "grad_norm": 1.727887568846887, + "language_loss": 0.7427932, + "learning_rate": 3.564651119602903e-06, + "loss": 0.82124901, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2130127, + "step": 3952, + "time_per_iteration": 2.5467019081115723 + }, + { + "auxiliary_loss_clip": 0.06536686, + "auxiliary_loss_mlp": 0.01292988, + "balance_loss_clip": 0.0630881, + "balance_loss_mlp": 0.01273379, + "epoch": 0.23766721779648278, + "flos": 27644045518080.0, + "grad_norm": 3.105577179216311, + "language_loss": 0.71633041, + "learning_rate": 3.564408506040583e-06, + "loss": 0.79462719, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.19604492, + "step": 3953, + "time_per_iteration": 2.599946975708008 + }, + { + "auxiliary_loss_clip": 0.06537458, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06305911, + "balance_loss_mlp": 0.01272673, + "epoch": 0.23772734104915075, + "flos": 23411102083200.0, + "grad_norm": 6.547469437533346, + "language_loss": 0.82534778, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.90365064, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20166016, + "step": 3954, + "time_per_iteration": 2.595163583755493 + }, + { + "auxiliary_loss_clip": 0.06538694, + "auxiliary_loss_mlp": 0.01291334, + "balance_loss_clip": 0.0630859, + "balance_loss_mlp": 0.01271486, + "epoch": 0.23778746430181874, + "flos": 15710902623360.0, + "grad_norm": 2.2065720754909606, + "language_loss": 0.66202033, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.74032056, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.19848633, + "step": 3955, + "time_per_iteration": 2.5345511436462402 + }, + { + "auxiliary_loss_clip": 0.06527859, + "auxiliary_loss_mlp": 0.01285762, + "balance_loss_clip": 0.06301668, + "balance_loss_mlp": 0.01266081, + "epoch": 0.2378475875544867, + "flos": 19432381536000.0, + "grad_norm": 1.4478942147045952, + "language_loss": 0.84203303, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.92016923, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19689941, + "step": 3956, + "time_per_iteration": 2.5458483695983887 + }, + { + "auxiliary_loss_clip": 0.06526335, + "auxiliary_loss_mlp": 0.01287929, + "balance_loss_clip": 0.06303546, + "balance_loss_mlp": 0.01268438, + "epoch": 0.23790771080715467, + "flos": 22274338867200.0, + "grad_norm": 2.194064451149358, + "language_loss": 0.8561964, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.93433905, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.19494629, + "step": 3957, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.06532466, + "auxiliary_loss_mlp": 0.01283677, + "balance_loss_clip": 0.0630599, + "balance_loss_mlp": 0.01264008, + "epoch": 0.23796783405982264, + "flos": 20053445598720.0, + "grad_norm": 2.4454692262909856, + "language_loss": 0.7073434, + "learning_rate": 3.563194548575151e-06, + "loss": 0.78550482, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19665527, + "step": 3958, + "time_per_iteration": 2.556201219558716 + }, + { + "auxiliary_loss_clip": 0.06533751, + "auxiliary_loss_mlp": 0.01277914, + "balance_loss_clip": 0.06301822, + "balance_loss_mlp": 0.01257303, + "epoch": 0.2380279573124906, + "flos": 14251084790400.0, + "grad_norm": 4.548053192599961, + "language_loss": 0.66760004, + "learning_rate": 3.562951579215745e-06, + "loss": 0.74571669, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.20617676, + "step": 3959, + "time_per_iteration": 2.491999626159668 + }, + { + "auxiliary_loss_clip": 0.06529753, + "auxiliary_loss_mlp": 0.01278003, + "balance_loss_clip": 0.06303047, + "balance_loss_mlp": 0.01259228, + "epoch": 0.23808808056515857, + "flos": 21185638767360.0, + "grad_norm": 1.7806564555446132, + "language_loss": 0.72341377, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.80149138, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18774414, + "step": 3960, + "time_per_iteration": 2.523761034011841 + }, + { + "auxiliary_loss_clip": 0.0652384, + "auxiliary_loss_mlp": 0.0127522, + "balance_loss_clip": 0.06296217, + "balance_loss_mlp": 0.01255169, + "epoch": 0.23814820381782653, + "flos": 22534850810880.0, + "grad_norm": 1.610971251516654, + "language_loss": 0.7476449, + "learning_rate": 3.562465462704307e-06, + "loss": 0.82563543, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20031738, + "step": 3961, + "time_per_iteration": 2.5350120067596436 + }, + { + "auxiliary_loss_clip": 0.06528293, + "auxiliary_loss_mlp": 0.01283237, + "balance_loss_clip": 0.06297825, + "balance_loss_mlp": 0.01261505, + "epoch": 0.23820832707049452, + "flos": 22309991579520.0, + "grad_norm": 2.008938617955162, + "language_loss": 0.66267157, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.74078679, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.21728516, + "step": 3962, + "time_per_iteration": 2.554936170578003 + }, + { + "auxiliary_loss_clip": 0.06522447, + "auxiliary_loss_mlp": 0.01279056, + "balance_loss_clip": 0.0629696, + "balance_loss_mlp": 0.0126009, + "epoch": 0.2382684503231625, + "flos": 24871297259520.0, + "grad_norm": 1.868964177707197, + "language_loss": 0.75134146, + "learning_rate": 3.561979109197483e-06, + "loss": 0.82935649, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18969727, + "step": 3963, + "time_per_iteration": 3.9841935634613037 + }, + { + "auxiliary_loss_clip": 0.0652955, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.01257428, + "epoch": 0.23832857357583045, + "flos": 21878050181760.0, + "grad_norm": 2.083636930734351, + "language_loss": 0.77508426, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.85316432, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.21032715, + "step": 3964, + "time_per_iteration": 2.546093463897705 + }, + { + "auxiliary_loss_clip": 0.06513681, + "auxiliary_loss_mlp": 0.01275741, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01256275, + "epoch": 0.23838869682849842, + "flos": 21294441694080.0, + "grad_norm": 2.0070777911568207, + "language_loss": 0.72507781, + "learning_rate": 3.561492518769045e-06, + "loss": 0.80297208, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3965, + "time_per_iteration": 2.605717182159424 + }, + { + "auxiliary_loss_clip": 0.06518564, + "auxiliary_loss_mlp": 0.012776, + "balance_loss_clip": 0.06293208, + "balance_loss_mlp": 0.01258181, + "epoch": 0.23844882008116638, + "flos": 16186211308800.0, + "grad_norm": 2.069567415104782, + "language_loss": 0.79030257, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8682642, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.19396973, + "step": 3966, + "time_per_iteration": 3.980722427368164 + }, + { + "auxiliary_loss_clip": 0.06517511, + "auxiliary_loss_mlp": 0.01283232, + "balance_loss_clip": 0.06290257, + "balance_loss_mlp": 0.01264647, + "epoch": 0.23850894333383435, + "flos": 21076165008000.0, + "grad_norm": 3.0015774693629433, + "language_loss": 0.69417417, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77218163, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.18579102, + "step": 3967, + "time_per_iteration": 2.542595386505127 + }, + { + "auxiliary_loss_clip": 0.06523537, + "auxiliary_loss_mlp": 0.01278611, + "balance_loss_clip": 0.0629587, + "balance_loss_mlp": 0.01257821, + "epoch": 0.23856906658650234, + "flos": 17207295563520.0, + "grad_norm": 1.9959497275253817, + "language_loss": 0.68410718, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.76212859, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.20800781, + "step": 3968, + "time_per_iteration": 2.5275728702545166 + }, + { + "auxiliary_loss_clip": 0.06526159, + "auxiliary_loss_mlp": 0.01279655, + "balance_loss_clip": 0.0629804, + "balance_loss_mlp": 0.01261392, + "epoch": 0.2386291898391703, + "flos": 29501451774720.0, + "grad_norm": 2.0078802263631994, + "language_loss": 0.77147222, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.84953034, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.18261719, + "step": 3969, + "time_per_iteration": 4.006864547729492 + }, + { + "auxiliary_loss_clip": 0.06514208, + "auxiliary_loss_mlp": 0.01292793, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01274602, + "epoch": 0.23868931309183827, + "flos": 21148854024960.0, + "grad_norm": 1.9717404660495825, + "language_loss": 0.76892555, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.84699559, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.18188477, + "step": 3970, + "time_per_iteration": 2.558915615081787 + }, + { + "auxiliary_loss_clip": 0.06523073, + "auxiliary_loss_mlp": 0.0128602, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01265969, + "epoch": 0.23874943634450624, + "flos": 25665342076800.0, + "grad_norm": 2.212795121423013, + "language_loss": 0.85452002, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.93261099, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20043945, + "step": 3971, + "time_per_iteration": 2.5621652603149414 + }, + { + "auxiliary_loss_clip": 0.06391954, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06279661, + "balance_loss_mlp": 0.01251122, + "epoch": 0.2388095595971742, + "flos": 59006871889920.0, + "grad_norm": 0.7183517633018239, + "language_loss": 0.62744105, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.70391893, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04696655, + "step": 3972, + "time_per_iteration": 4.643376350402832 + }, + { + "auxiliary_loss_clip": 0.06515118, + "auxiliary_loss_mlp": 0.01277926, + "balance_loss_clip": 0.06290536, + "balance_loss_mlp": 0.01258399, + "epoch": 0.23886968284984217, + "flos": 16805975633280.0, + "grad_norm": 3.0192177240020976, + "language_loss": 0.81866533, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.89659578, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19543457, + "step": 3973, + "time_per_iteration": 2.5597283840179443 + }, + { + "auxiliary_loss_clip": 0.06517763, + "auxiliary_loss_mlp": 0.01283675, + "balance_loss_clip": 0.06291795, + "balance_loss_mlp": 0.01265162, + "epoch": 0.23892980610251013, + "flos": 22389221214720.0, + "grad_norm": 1.829209898292947, + "language_loss": 0.79696077, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8749752, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.18518066, + "step": 3974, + "time_per_iteration": 2.5331227779388428 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01291591, + "balance_loss_clip": 0.06296244, + "balance_loss_mlp": 0.01272279, + "epoch": 0.23898992935517813, + "flos": 12828135553920.0, + "grad_norm": 6.773745042238101, + "language_loss": 0.85156423, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.92972875, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19311523, + "step": 3975, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.06513388, + "auxiliary_loss_mlp": 0.01278416, + "balance_loss_clip": 0.06290747, + "balance_loss_mlp": 0.01260117, + "epoch": 0.2390500526078461, + "flos": 22352142983040.0, + "grad_norm": 3.375355565005516, + "language_loss": 0.84191501, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.91983294, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1829834, + "step": 3976, + "time_per_iteration": 2.5339527130126953 + }, + { + "auxiliary_loss_clip": 0.06511909, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06290296, + "balance_loss_mlp": 0.01264111, + "epoch": 0.23911017586051406, + "flos": 22641263896320.0, + "grad_norm": 3.0704844059493497, + "language_loss": 0.74960983, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.82755029, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18029785, + "step": 3977, + "time_per_iteration": 2.5528597831726074 + }, + { + "auxiliary_loss_clip": 0.06524444, + "auxiliary_loss_mlp": 0.01281803, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01261478, + "epoch": 0.23917029911318202, + "flos": 23658993987840.0, + "grad_norm": 3.246082679368102, + "language_loss": 0.7235828, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80164528, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.203125, + "step": 3978, + "time_per_iteration": 2.548459768295288 + }, + { + "auxiliary_loss_clip": 0.06536747, + "auxiliary_loss_mlp": 0.01279264, + "balance_loss_clip": 0.06306014, + "balance_loss_mlp": 0.0125994, + "epoch": 0.23923042236585, + "flos": 22790163801600.0, + "grad_norm": 2.3394422136849875, + "language_loss": 0.79264927, + "learning_rate": 3.558079758168997e-06, + "loss": 0.87080932, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.1932373, + "step": 3979, + "time_per_iteration": 2.5696120262145996 + }, + { + "auxiliary_loss_clip": 0.06521225, + "auxiliary_loss_mlp": 0.01282521, + "balance_loss_clip": 0.06295727, + "balance_loss_mlp": 0.01263185, + "epoch": 0.23929054561851795, + "flos": 28155300405120.0, + "grad_norm": 1.7900268576070866, + "language_loss": 0.81971824, + "learning_rate": 3.557835546134977e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.1932373, + "step": 3980, + "time_per_iteration": 2.587286949157715 + }, + { + "auxiliary_loss_clip": 0.06519361, + "auxiliary_loss_mlp": 0.01281001, + "balance_loss_clip": 0.06296664, + "balance_loss_mlp": 0.01261891, + "epoch": 0.23935066887118592, + "flos": 21692491315200.0, + "grad_norm": 1.7930077111492302, + "language_loss": 0.84270984, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92071348, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19091797, + "step": 3981, + "time_per_iteration": 2.550725221633911 + }, + { + "auxiliary_loss_clip": 0.06535558, + "auxiliary_loss_mlp": 0.01280601, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01260669, + "epoch": 0.2394107921238539, + "flos": 32130121737600.0, + "grad_norm": 2.0248039039910393, + "language_loss": 0.77712274, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.85528433, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.19934082, + "step": 3982, + "time_per_iteration": 2.594698667526245 + }, + { + "auxiliary_loss_clip": 0.06530322, + "auxiliary_loss_mlp": 0.01280321, + "balance_loss_clip": 0.06304529, + "balance_loss_mlp": 0.01261307, + "epoch": 0.23947091537652188, + "flos": 17024839297920.0, + "grad_norm": 1.9623565914246572, + "language_loss": 0.7809152, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.85902166, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19006348, + "step": 3983, + "time_per_iteration": 2.537132740020752 + }, + { + "auxiliary_loss_clip": 0.06527262, + "auxiliary_loss_mlp": 0.01280803, + "balance_loss_clip": 0.0630171, + "balance_loss_mlp": 0.01261956, + "epoch": 0.23953103862918984, + "flos": 20599640438400.0, + "grad_norm": 2.137172968887566, + "language_loss": 0.73945713, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81753772, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18835449, + "step": 3984, + "time_per_iteration": 2.538221836090088 + }, + { + "auxiliary_loss_clip": 0.06531888, + "auxiliary_loss_mlp": 0.01281613, + "balance_loss_clip": 0.06302323, + "balance_loss_mlp": 0.01262587, + "epoch": 0.2395911618818578, + "flos": 20710707425280.0, + "grad_norm": 1.9765684717262704, + "language_loss": 0.7965889, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.87472391, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19030762, + "step": 3985, + "time_per_iteration": 2.551649570465088 + }, + { + "auxiliary_loss_clip": 0.06532246, + "auxiliary_loss_mlp": 0.0127953, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01259265, + "epoch": 0.23965128513452577, + "flos": 27060982081920.0, + "grad_norm": 1.916737509209056, + "language_loss": 0.73610401, + "learning_rate": 3.556369033716254e-06, + "loss": 0.8142218, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20263672, + "step": 3986, + "time_per_iteration": 2.710397481918335 + }, + { + "auxiliary_loss_clip": 0.06540911, + "auxiliary_loss_mlp": 0.01281338, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.01261, + "epoch": 0.23971140838719374, + "flos": 23150254723200.0, + "grad_norm": 1.785192597796332, + "language_loss": 0.88325328, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96147585, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20336914, + "step": 3987, + "time_per_iteration": 2.6331911087036133 + }, + { + "auxiliary_loss_clip": 0.06529854, + "auxiliary_loss_mlp": 0.01278502, + "balance_loss_clip": 0.06312454, + "balance_loss_mlp": 0.0126043, + "epoch": 0.23977153163986173, + "flos": 18039341007360.0, + "grad_norm": 2.2552133940915224, + "language_loss": 0.84056735, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.91865093, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18078613, + "step": 3988, + "time_per_iteration": 2.5413994789123535 + }, + { + "auxiliary_loss_clip": 0.06533512, + "auxiliary_loss_mlp": 0.01288032, + "balance_loss_clip": 0.06306052, + "balance_loss_mlp": 0.01267052, + "epoch": 0.2398316548925297, + "flos": 18119157621120.0, + "grad_norm": 1.6232739060807335, + "language_loss": 0.85473406, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.93294942, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2097168, + "step": 3989, + "time_per_iteration": 2.528348207473755 + }, + { + "auxiliary_loss_clip": 0.06527147, + "auxiliary_loss_mlp": 0.01286562, + "balance_loss_clip": 0.06305796, + "balance_loss_mlp": 0.01266642, + "epoch": 0.23989177814519766, + "flos": 12572612928000.0, + "grad_norm": 2.695913709141839, + "language_loss": 0.8517406, + "learning_rate": 3.555390178293477e-06, + "loss": 0.92987764, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19909668, + "step": 3990, + "time_per_iteration": 2.52915358543396 + }, + { + "auxiliary_loss_clip": 0.06527729, + "auxiliary_loss_mlp": 0.01283435, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.01264064, + "epoch": 0.23995190139786562, + "flos": 25271569013760.0, + "grad_norm": 1.4267230320219149, + "language_loss": 0.76345301, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.84156466, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.19372559, + "step": 3991, + "time_per_iteration": 2.556820869445801 + }, + { + "auxiliary_loss_clip": 0.06413993, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06298733, + "balance_loss_mlp": 0.01262789, + "epoch": 0.2400120246505336, + "flos": 61978107271680.0, + "grad_norm": 0.8724678757997124, + "language_loss": 0.6358996, + "learning_rate": 3.554900396661656e-06, + "loss": 0.71272099, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.05368042, + "step": 3992, + "time_per_iteration": 3.0817418098449707 + }, + { + "auxiliary_loss_clip": 0.06411353, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 0.06297012, + "balance_loss_mlp": 0.01259121, + "epoch": 0.24007214790320155, + "flos": 66727923816960.0, + "grad_norm": 0.7394753945990321, + "language_loss": 0.62864375, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.70539963, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.05117798, + "step": 3993, + "time_per_iteration": 3.2552971839904785 + }, + { + "auxiliary_loss_clip": 0.0652933, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 0.062997, + "balance_loss_mlp": 0.0125886, + "epoch": 0.24013227115586952, + "flos": 25815667501440.0, + "grad_norm": 1.8775036450716396, + "language_loss": 0.77610862, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.85420227, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.21154785, + "step": 3994, + "time_per_iteration": 2.6225738525390625 + }, + { + "auxiliary_loss_clip": 0.06526788, + "auxiliary_loss_mlp": 0.01288387, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.01266822, + "epoch": 0.2401923944085375, + "flos": 25564672995840.0, + "grad_norm": 1.626402048760673, + "language_loss": 0.78733414, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.86548591, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21557617, + "step": 3995, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.06395802, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01260685, + "epoch": 0.24025251766120548, + "flos": 54961457892480.0, + "grad_norm": 0.8928130340410044, + "language_loss": 0.63566971, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.71228325, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.04858398, + "step": 3996, + "time_per_iteration": 3.232227087020874 + }, + { + "auxiliary_loss_clip": 0.06522241, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06290409, + "balance_loss_mlp": 0.0126328, + "epoch": 0.24031264091387344, + "flos": 20637305648640.0, + "grad_norm": 2.8724335092069864, + "language_loss": 0.71121502, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.78926873, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19848633, + "step": 3997, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.06510898, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 0.06285729, + "balance_loss_mlp": 0.01265473, + "epoch": 0.2403727641665414, + "flos": 20892492858240.0, + "grad_norm": 1.7909711234465908, + "language_loss": 0.87516266, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.9531287, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20227051, + "step": 3998, + "time_per_iteration": 2.563215970993042 + }, + { + "auxiliary_loss_clip": 0.06526193, + "auxiliary_loss_mlp": 0.01279159, + "balance_loss_clip": 0.06292593, + "balance_loss_mlp": 0.01258762, + "epoch": 0.24043288741920937, + "flos": 22826613127680.0, + "grad_norm": 1.593528116777893, + "language_loss": 0.76414531, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.84219879, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.20410156, + "step": 3999, + "time_per_iteration": 2.5577592849731445 + }, + { + "auxiliary_loss_clip": 0.06511137, + "auxiliary_loss_mlp": 0.01275527, + "balance_loss_clip": 0.0628795, + "balance_loss_mlp": 0.01256716, + "epoch": 0.24049301067187734, + "flos": 27966261594240.0, + "grad_norm": 2.3407253335254086, + "language_loss": 0.73292184, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81078851, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.18823242, + "step": 4000, + "time_per_iteration": 2.583524703979492 + }, + { + "auxiliary_loss_clip": 0.06528921, + "auxiliary_loss_mlp": 0.01283655, + "balance_loss_clip": 0.06293923, + "balance_loss_mlp": 0.01261935, + "epoch": 0.24055313392454533, + "flos": 27458360870400.0, + "grad_norm": 2.671051655318694, + "language_loss": 0.67159665, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74972242, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21728516, + "step": 4001, + "time_per_iteration": 2.6188552379608154 + }, + { + "auxiliary_loss_clip": 0.06522354, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.06293849, + "balance_loss_mlp": 0.01257703, + "epoch": 0.2406132571772133, + "flos": 25563666746880.0, + "grad_norm": 5.034242823707272, + "language_loss": 0.83152658, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.90954471, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21740723, + "step": 4002, + "time_per_iteration": 3.9769785404205322 + }, + { + "auxiliary_loss_clip": 0.06519094, + "auxiliary_loss_mlp": 0.01282536, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01262032, + "epoch": 0.24067338042988126, + "flos": 24798482461440.0, + "grad_norm": 2.0463487498067323, + "language_loss": 0.83599687, + "learning_rate": 3.552202383898897e-06, + "loss": 0.91401321, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20483398, + "step": 4003, + "time_per_iteration": 2.581669569015503 + }, + { + "auxiliary_loss_clip": 0.06526292, + "auxiliary_loss_mlp": 0.01281725, + "balance_loss_clip": 0.06295015, + "balance_loss_mlp": 0.01261412, + "epoch": 0.24073350368254923, + "flos": 21184171320960.0, + "grad_norm": 2.0670244348036646, + "language_loss": 0.87907362, + "learning_rate": 3.551956756667215e-06, + "loss": 0.9571538, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20300293, + "step": 4004, + "time_per_iteration": 2.514268636703491 + }, + { + "auxiliary_loss_clip": 0.06526911, + "auxiliary_loss_mlp": 0.01282868, + "balance_loss_clip": 0.06294513, + "balance_loss_mlp": 0.01261815, + "epoch": 0.2407936269352172, + "flos": 22501252523520.0, + "grad_norm": 3.538522770409821, + "language_loss": 0.78168321, + "learning_rate": 3.551711070585177e-06, + "loss": 0.85978097, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21057129, + "step": 4005, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.0651572, + "auxiliary_loss_mlp": 0.01283457, + "balance_loss_clip": 0.06293365, + "balance_loss_mlp": 0.01263084, + "epoch": 0.24085375018788516, + "flos": 18556968804480.0, + "grad_norm": 2.371719422478697, + "language_loss": 0.79360878, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.87160051, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.20373535, + "step": 4006, + "time_per_iteration": 4.034858465194702 + }, + { + "auxiliary_loss_clip": 0.0653493, + "auxiliary_loss_mlp": 0.01283621, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01260709, + "epoch": 0.24091387344055312, + "flos": 24177418398720.0, + "grad_norm": 1.8737477168573817, + "language_loss": 0.71813238, + "learning_rate": 3.551219521907302e-06, + "loss": 0.79631788, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22912598, + "step": 4007, + "time_per_iteration": 2.5730202198028564 + }, + { + "auxiliary_loss_clip": 0.06518448, + "auxiliary_loss_mlp": 0.01300708, + "balance_loss_clip": 0.06295364, + "balance_loss_mlp": 0.01278773, + "epoch": 0.24097399669322112, + "flos": 11041112327040.0, + "grad_norm": 6.473369852788927, + "language_loss": 0.76978099, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84797251, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21936035, + "step": 4008, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.06518552, + "auxiliary_loss_mlp": 0.01286303, + "balance_loss_clip": 0.062894, + "balance_loss_mlp": 0.01264928, + "epoch": 0.24103411994588908, + "flos": 17170762383360.0, + "grad_norm": 2.1979472110907556, + "language_loss": 0.75080305, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.82885164, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21398926, + "step": 4009, + "time_per_iteration": 3.957920551300049 + }, + { + "auxiliary_loss_clip": 0.06521554, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 0.06293823, + "balance_loss_mlp": 0.01279869, + "epoch": 0.24109424319855705, + "flos": 20674258099200.0, + "grad_norm": 1.5898496231384156, + "language_loss": 0.80111217, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8793391, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21264648, + "step": 4010, + "time_per_iteration": 2.5475916862487793 + }, + { + "auxiliary_loss_clip": 0.06527252, + "auxiliary_loss_mlp": 0.01291864, + "balance_loss_clip": 0.06297424, + "balance_loss_mlp": 0.01268964, + "epoch": 0.241154366451225, + "flos": 28188982546560.0, + "grad_norm": 2.0856120841249366, + "language_loss": 0.70933908, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78753024, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.22912598, + "step": 4011, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.06528456, + "auxiliary_loss_mlp": 0.0128714, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01265766, + "epoch": 0.24121448970389298, + "flos": 21696222821760.0, + "grad_norm": 1.7418824634594252, + "language_loss": 0.694484, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.21362305, + "step": 4012, + "time_per_iteration": 3.988281726837158 + }, + { + "auxiliary_loss_clip": 0.06528036, + "auxiliary_loss_mlp": 0.01287792, + "balance_loss_clip": 0.06296879, + "balance_loss_mlp": 0.01264391, + "epoch": 0.24127461295656094, + "flos": 39685530142080.0, + "grad_norm": 1.5971840931497265, + "language_loss": 0.74512959, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23388672, + "step": 4013, + "time_per_iteration": 2.7159719467163086 + }, + { + "auxiliary_loss_clip": 0.06531674, + "auxiliary_loss_mlp": 0.01283711, + "balance_loss_clip": 0.0630402, + "balance_loss_mlp": 0.01263231, + "epoch": 0.2413347362092289, + "flos": 19141960884480.0, + "grad_norm": 1.667652232266074, + "language_loss": 0.89031768, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.96847153, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20483398, + "step": 4014, + "time_per_iteration": 2.5638303756713867 + }, + { + "auxiliary_loss_clip": 0.06538786, + "auxiliary_loss_mlp": 0.01289681, + "balance_loss_clip": 0.06304225, + "balance_loss_mlp": 0.01268831, + "epoch": 0.2413948594618969, + "flos": 26946099734400.0, + "grad_norm": 1.9521080560444544, + "language_loss": 0.95043075, + "learning_rate": 3.549250975045952e-06, + "loss": 1.02871537, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20849609, + "step": 4015, + "time_per_iteration": 2.5697052478790283 + }, + { + "auxiliary_loss_clip": 0.0653477, + "auxiliary_loss_mlp": 0.01278309, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01257781, + "epoch": 0.24145498271456486, + "flos": 25235077760640.0, + "grad_norm": 1.8045004389175856, + "language_loss": 0.83243644, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.91056728, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2052002, + "step": 4016, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.06532364, + "auxiliary_loss_mlp": 0.01285254, + "balance_loss_clip": 0.06311545, + "balance_loss_mlp": 0.0126463, + "epoch": 0.24151510596723283, + "flos": 40671339027840.0, + "grad_norm": 2.079467312298135, + "language_loss": 0.69439638, + "learning_rate": 3.54875825066639e-06, + "loss": 0.77257252, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20617676, + "step": 4017, + "time_per_iteration": 2.6893186569213867 + }, + { + "auxiliary_loss_clip": 0.06536807, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06305309, + "balance_loss_mlp": 0.01266286, + "epoch": 0.2415752292199008, + "flos": 18151917367680.0, + "grad_norm": 1.6840714927615923, + "language_loss": 0.84970623, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.92796361, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2265625, + "step": 4018, + "time_per_iteration": 2.521129608154297 + }, + { + "auxiliary_loss_clip": 0.06448493, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 0.06334345, + "balance_loss_mlp": 0.01253335, + "epoch": 0.24163535247256876, + "flos": 67307213819520.0, + "grad_norm": 1.2396896293086193, + "language_loss": 0.6054306, + "learning_rate": 3.548265291370558e-06, + "loss": 0.68249303, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04425049, + "step": 4019, + "time_per_iteration": 3.2191333770751953 + }, + { + "auxiliary_loss_clip": 0.06539527, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06310145, + "balance_loss_mlp": 0.01253983, + "epoch": 0.24169547572523672, + "flos": 24935810503680.0, + "grad_norm": 1.839335570686334, + "language_loss": 0.73635018, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81447685, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19140625, + "step": 4020, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.06547633, + "auxiliary_loss_mlp": 0.01279706, + "balance_loss_clip": 0.06321433, + "balance_loss_mlp": 0.01259094, + "epoch": 0.24175559897790472, + "flos": 18733303722240.0, + "grad_norm": 1.757855043925666, + "language_loss": 0.81927264, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.89754599, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.20617676, + "step": 4021, + "time_per_iteration": 2.516295909881592 + }, + { + "auxiliary_loss_clip": 0.06542306, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 0.06314138, + "balance_loss_mlp": 0.01255201, + "epoch": 0.24181572223057268, + "flos": 23045937989760.0, + "grad_norm": 1.9677245364232816, + "language_loss": 0.76831293, + "learning_rate": 3.547525412122378e-06, + "loss": 0.84652191, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23388672, + "step": 4022, + "time_per_iteration": 2.560833692550659 + }, + { + "auxiliary_loss_clip": 0.0655847, + "auxiliary_loss_mlp": 0.01279281, + "balance_loss_clip": 0.06321847, + "balance_loss_mlp": 0.01257477, + "epoch": 0.24187584548324065, + "flos": 20382411928320.0, + "grad_norm": 1.7589452517035808, + "language_loss": 0.75334597, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.83172357, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21789551, + "step": 4023, + "time_per_iteration": 2.5414137840270996 + }, + { + "auxiliary_loss_clip": 0.06554291, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06325305, + "balance_loss_mlp": 0.01258466, + "epoch": 0.2419359687359086, + "flos": 21403915453440.0, + "grad_norm": 1.837159559636974, + "language_loss": 0.82581335, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.90414816, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20751953, + "step": 4024, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.06544912, + "auxiliary_loss_mlp": 0.01281053, + "balance_loss_clip": 0.06319256, + "balance_loss_mlp": 0.01260394, + "epoch": 0.24199609198857658, + "flos": 18375309152640.0, + "grad_norm": 1.8763334718563411, + "language_loss": 0.86724782, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.94550753, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20654297, + "step": 4025, + "time_per_iteration": 2.507725715637207 + }, + { + "auxiliary_loss_clip": 0.0654591, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06312732, + "balance_loss_mlp": 0.01261905, + "epoch": 0.24205621524124454, + "flos": 19469962892160.0, + "grad_norm": 2.105058685916829, + "language_loss": 0.72386706, + "learning_rate": 3.546538084949365e-06, + "loss": 0.80214572, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.20068359, + "step": 4026, + "time_per_iteration": 2.573822498321533 + }, + { + "auxiliary_loss_clip": 0.06536272, + "auxiliary_loss_mlp": 0.01278576, + "balance_loss_clip": 0.06314979, + "balance_loss_mlp": 0.01258191, + "epoch": 0.2421163384939125, + "flos": 14981706466560.0, + "grad_norm": 5.331027510747572, + "language_loss": 0.64474452, + "learning_rate": 3.546291106520509e-06, + "loss": 0.722893, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20397949, + "step": 4027, + "time_per_iteration": 2.5038652420043945 + }, + { + "auxiliary_loss_clip": 0.06553975, + "auxiliary_loss_mlp": 0.01291382, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01271069, + "epoch": 0.2421764617465805, + "flos": 18668161572480.0, + "grad_norm": 2.149571528027882, + "language_loss": 0.70816404, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.78661758, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.203125, + "step": 4028, + "time_per_iteration": 2.5707366466522217 + }, + { + "auxiliary_loss_clip": 0.06448589, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 0.06335288, + "balance_loss_mlp": 0.01254865, + "epoch": 0.24223658499924847, + "flos": 64368025424640.0, + "grad_norm": 0.8397041896242922, + "language_loss": 0.55315495, + "learning_rate": 3.545796973765623e-06, + "loss": 0.63025129, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.06170654, + "step": 4029, + "time_per_iteration": 3.149601936340332 + }, + { + "auxiliary_loss_clip": 0.06557409, + "auxiliary_loss_mlp": 0.01307587, + "balance_loss_clip": 0.06331506, + "balance_loss_mlp": 0.01284615, + "epoch": 0.24229670825191643, + "flos": 25782278849280.0, + "grad_norm": 2.2612571716693664, + "language_loss": 0.75111073, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82976073, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.22998047, + "step": 4030, + "time_per_iteration": 2.5939297676086426 + }, + { + "auxiliary_loss_clip": 0.0654521, + "auxiliary_loss_mlp": 0.0130894, + "balance_loss_clip": 0.06321512, + "balance_loss_mlp": 0.01287733, + "epoch": 0.2423568315045844, + "flos": 20673251850240.0, + "grad_norm": 1.8607136485921192, + "language_loss": 0.77126729, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.84980875, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2121582, + "step": 4031, + "time_per_iteration": 2.5886638164520264 + }, + { + "auxiliary_loss_clip": 0.06556953, + "auxiliary_loss_mlp": 0.01312472, + "balance_loss_clip": 0.06323709, + "balance_loss_mlp": 0.01290252, + "epoch": 0.24241695475725236, + "flos": 22422987210240.0, + "grad_norm": 1.956173023936914, + "language_loss": 0.66108859, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.22216797, + "step": 4032, + "time_per_iteration": 2.5665037631988525 + }, + { + "auxiliary_loss_clip": 0.06539695, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 0.06316876, + "balance_loss_mlp": 0.0128751, + "epoch": 0.24247707800992033, + "flos": 17134732327680.0, + "grad_norm": 3.4494454498841725, + "language_loss": 0.81464761, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.89313877, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21911621, + "step": 4033, + "time_per_iteration": 2.5237317085266113 + }, + { + "auxiliary_loss_clip": 0.06538171, + "auxiliary_loss_mlp": 0.01328283, + "balance_loss_clip": 0.06318024, + "balance_loss_mlp": 0.01305359, + "epoch": 0.2425372012625883, + "flos": 31621885597440.0, + "grad_norm": 1.909836856098088, + "language_loss": 0.69935066, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7780152, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22900391, + "step": 4034, + "time_per_iteration": 2.713991641998291 + }, + { + "auxiliary_loss_clip": 0.06546839, + "auxiliary_loss_mlp": 0.01319063, + "balance_loss_clip": 0.06324256, + "balance_loss_mlp": 0.01298273, + "epoch": 0.24259732451525629, + "flos": 16331589342720.0, + "grad_norm": 2.1729941621503532, + "language_loss": 0.96340013, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04205918, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.20776367, + "step": 4035, + "time_per_iteration": 2.532848596572876 + }, + { + "auxiliary_loss_clip": 0.06537193, + "auxiliary_loss_mlp": 0.01327475, + "balance_loss_clip": 0.06319901, + "balance_loss_mlp": 0.01307447, + "epoch": 0.24265744776792425, + "flos": 22863230161920.0, + "grad_norm": 1.6992215283488847, + "language_loss": 0.78653824, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20019531, + "step": 4036, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.06539825, + "auxiliary_loss_mlp": 0.01304693, + "balance_loss_clip": 0.06315397, + "balance_loss_mlp": 0.01282806, + "epoch": 0.24271757102059222, + "flos": 21878008254720.0, + "grad_norm": 1.624872867937933, + "language_loss": 0.74970233, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82814753, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.21887207, + "step": 4037, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06526245, + "auxiliary_loss_mlp": 0.01308805, + "balance_loss_clip": 0.06302498, + "balance_loss_mlp": 0.01287539, + "epoch": 0.24277769427326018, + "flos": 19214649901440.0, + "grad_norm": 4.15075765155633, + "language_loss": 0.76952362, + "learning_rate": 3.543570475921171e-06, + "loss": 0.84787416, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.21276855, + "step": 4038, + "time_per_iteration": 2.514899492263794 + }, + { + "auxiliary_loss_clip": 0.06539176, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 0.06314565, + "balance_loss_mlp": 0.01272992, + "epoch": 0.24283781752592815, + "flos": 19505909093760.0, + "grad_norm": 2.116114626089979, + "language_loss": 0.72802031, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.22167969, + "step": 4039, + "time_per_iteration": 2.603787422180176 + }, + { + "auxiliary_loss_clip": 0.06537706, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 0.06312682, + "balance_loss_mlp": 0.01269372, + "epoch": 0.2428979407785961, + "flos": 19908444908160.0, + "grad_norm": 1.7691638050154863, + "language_loss": 0.78818536, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86647218, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.21606445, + "step": 4040, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.06530759, + "auxiliary_loss_mlp": 0.01283615, + "balance_loss_clip": 0.06313588, + "balance_loss_mlp": 0.01265162, + "epoch": 0.2429580640312641, + "flos": 24722523135360.0, + "grad_norm": 1.6907745152184719, + "language_loss": 0.81039703, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8885408, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18444824, + "step": 4041, + "time_per_iteration": 2.5693795680999756 + }, + { + "auxiliary_loss_clip": 0.06532191, + "auxiliary_loss_mlp": 0.01286793, + "balance_loss_clip": 0.06311769, + "balance_loss_mlp": 0.01267529, + "epoch": 0.24301818728393207, + "flos": 25637529720960.0, + "grad_norm": 3.2457124561568, + "language_loss": 0.77433085, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8525207, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19262695, + "step": 4042, + "time_per_iteration": 3.9626972675323486 + }, + { + "auxiliary_loss_clip": 0.0653407, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06316316, + "balance_loss_mlp": 0.01260652, + "epoch": 0.24307831053660003, + "flos": 26148700753920.0, + "grad_norm": 1.8532279658121147, + "language_loss": 0.82188201, + "learning_rate": 3.542331483604246e-06, + "loss": 0.90002131, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19226074, + "step": 4043, + "time_per_iteration": 2.598202705383301 + }, + { + "auxiliary_loss_clip": 0.06538229, + "auxiliary_loss_mlp": 0.0127841, + "balance_loss_clip": 0.06309159, + "balance_loss_mlp": 0.01256594, + "epoch": 0.243138433789268, + "flos": 14977136419200.0, + "grad_norm": 2.775508644952731, + "language_loss": 0.73897892, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.81714529, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21801758, + "step": 4044, + "time_per_iteration": 2.483752489089966 + }, + { + "auxiliary_loss_clip": 0.06534028, + "auxiliary_loss_mlp": 0.01284645, + "balance_loss_clip": 0.0631184, + "balance_loss_mlp": 0.01263629, + "epoch": 0.24319855704193596, + "flos": 25198670361600.0, + "grad_norm": 2.3685654829247227, + "language_loss": 0.83778739, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.91597402, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.21020508, + "step": 4045, + "time_per_iteration": 2.60435152053833 + }, + { + "auxiliary_loss_clip": 0.06529962, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 0.06307946, + "balance_loss_mlp": 0.01260323, + "epoch": 0.24325868029460393, + "flos": 22133740515840.0, + "grad_norm": 1.834350653864789, + "language_loss": 0.87040859, + "learning_rate": 3.541587386314541e-06, + "loss": 0.94850671, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19519043, + "step": 4046, + "time_per_iteration": 3.990011692047119 + }, + { + "auxiliary_loss_clip": 0.0652798, + "auxiliary_loss_mlp": 0.01281438, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01260922, + "epoch": 0.2433188035472719, + "flos": 23588107833600.0, + "grad_norm": 2.274532821816236, + "language_loss": 0.72945291, + "learning_rate": 3.5413392369578e-06, + "loss": 0.80754709, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.20495605, + "step": 4047, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.06530058, + "auxiliary_loss_mlp": 0.01284969, + "balance_loss_clip": 0.06306041, + "balance_loss_mlp": 0.01263666, + "epoch": 0.2433789267999399, + "flos": 24469809621120.0, + "grad_norm": 3.993347012147321, + "language_loss": 0.74453223, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.21325684, + "step": 4048, + "time_per_iteration": 4.027734279632568 + }, + { + "auxiliary_loss_clip": 0.06529407, + "auxiliary_loss_mlp": 0.01275879, + "balance_loss_clip": 0.06309648, + "balance_loss_mlp": 0.0125671, + "epoch": 0.24343905005260785, + "flos": 16733622032640.0, + "grad_norm": 2.185429514920852, + "language_loss": 0.73832756, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.81638038, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19165039, + "step": 4049, + "time_per_iteration": 2.5527403354644775 + }, + { + "auxiliary_loss_clip": 0.06525055, + "auxiliary_loss_mlp": 0.01275563, + "balance_loss_clip": 0.06306046, + "balance_loss_mlp": 0.01256084, + "epoch": 0.24349917330527582, + "flos": 20049294821760.0, + "grad_norm": 1.6558681415401064, + "language_loss": 0.74824917, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82625538, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19482422, + "step": 4050, + "time_per_iteration": 2.517671585083008 + }, + { + "auxiliary_loss_clip": 0.06520879, + "auxiliary_loss_mlp": 0.0127856, + "balance_loss_clip": 0.06303313, + "balance_loss_mlp": 0.01258187, + "epoch": 0.24355929655794378, + "flos": 17426285009280.0, + "grad_norm": 2.447710360159803, + "language_loss": 0.75780261, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.83579695, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20361328, + "step": 4051, + "time_per_iteration": 3.961841583251953 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01277824, + "balance_loss_clip": 0.06310124, + "balance_loss_mlp": 0.01257343, + "epoch": 0.24361941981061175, + "flos": 25417995223680.0, + "grad_norm": 2.289221862828171, + "language_loss": 0.71344352, + "learning_rate": 3.540097613646296e-06, + "loss": 0.79154545, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20471191, + "step": 4052, + "time_per_iteration": 2.5851869583129883 + }, + { + "auxiliary_loss_clip": 0.06524909, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.06306259, + "balance_loss_mlp": 0.01258583, + "epoch": 0.2436795430632797, + "flos": 22827493595520.0, + "grad_norm": 1.7731467261886882, + "language_loss": 0.82073057, + "learning_rate": 3.539849113744351e-06, + "loss": 0.89876068, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.6217734813690186 + }, + { + "auxiliary_loss_clip": 0.06533736, + "auxiliary_loss_mlp": 0.01278722, + "balance_loss_clip": 0.06309207, + "balance_loss_mlp": 0.01260126, + "epoch": 0.2437396663159477, + "flos": 15163030702080.0, + "grad_norm": 1.5690390746940162, + "language_loss": 0.78588867, + "learning_rate": 3.539600555451172e-06, + "loss": 0.86401325, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.18615723, + "step": 4054, + "time_per_iteration": 2.513720750808716 + }, + { + "auxiliary_loss_clip": 0.06529565, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06307493, + "balance_loss_mlp": 0.01263111, + "epoch": 0.24379978956861567, + "flos": 22097710460160.0, + "grad_norm": 1.7039269278884617, + "language_loss": 0.84417951, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92229491, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1887207, + "step": 4055, + "time_per_iteration": 2.557584524154663 + }, + { + "auxiliary_loss_clip": 0.06542832, + "auxiliary_loss_mlp": 0.01280691, + "balance_loss_clip": 0.06312343, + "balance_loss_mlp": 0.01259508, + "epoch": 0.24385991282128364, + "flos": 31475878657920.0, + "grad_norm": 2.786051029634521, + "language_loss": 0.56684959, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.6450848, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21179199, + "step": 4056, + "time_per_iteration": 2.6548893451690674 + }, + { + "auxiliary_loss_clip": 0.06533613, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 0.06307291, + "balance_loss_mlp": 0.01262321, + "epoch": 0.2439200360739516, + "flos": 23845055978880.0, + "grad_norm": 2.215401064957846, + "language_loss": 0.80586845, + "learning_rate": 3.538854530318506e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.21520996, + "step": 4057, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.06533922, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 0.06311886, + "balance_loss_mlp": 0.01261009, + "epoch": 0.24398015932661957, + "flos": 19175684952960.0, + "grad_norm": 1.7331406857586058, + "language_loss": 0.79934907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87748623, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18786621, + "step": 4058, + "time_per_iteration": 2.5552098751068115 + }, + { + "auxiliary_loss_clip": 0.06541391, + "auxiliary_loss_mlp": 0.01280168, + "balance_loss_clip": 0.06312001, + "balance_loss_mlp": 0.01259772, + "epoch": 0.24404028257928753, + "flos": 25269095318400.0, + "grad_norm": 1.7324044437804977, + "language_loss": 0.86104828, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93926388, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20410156, + "step": 4059, + "time_per_iteration": 2.575345754623413 + }, + { + "auxiliary_loss_clip": 0.06538763, + "auxiliary_loss_mlp": 0.01274337, + "balance_loss_clip": 0.06318676, + "balance_loss_mlp": 0.01255621, + "epoch": 0.2441004058319555, + "flos": 26474606409600.0, + "grad_norm": 1.5285193147278118, + "language_loss": 0.74698234, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.8251133, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18713379, + "step": 4060, + "time_per_iteration": 2.6277999877929688 + }, + { + "auxiliary_loss_clip": 0.06560756, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 0.06327853, + "balance_loss_mlp": 0.01259469, + "epoch": 0.2441605290846235, + "flos": 26767752318720.0, + "grad_norm": 1.6858410849727605, + "language_loss": 0.73894358, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.81735957, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.21374512, + "step": 4061, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.06538899, + "auxiliary_loss_mlp": 0.01273593, + "balance_loss_clip": 0.0631846, + "balance_loss_mlp": 0.01254103, + "epoch": 0.24422065233729146, + "flos": 21112236990720.0, + "grad_norm": 1.7809128746808311, + "language_loss": 0.76782405, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84594905, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19494629, + "step": 4062, + "time_per_iteration": 2.5655109882354736 + }, + { + "auxiliary_loss_clip": 0.06538436, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 0.06319936, + "balance_loss_mlp": 0.01258019, + "epoch": 0.24428077558995942, + "flos": 25269891932160.0, + "grad_norm": 1.624722619478305, + "language_loss": 0.84975201, + "learning_rate": 3.537360904763011e-06, + "loss": 0.92791933, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.20263672, + "step": 4063, + "time_per_iteration": 2.569420576095581 + }, + { + "auxiliary_loss_clip": 0.06559969, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06327148, + "balance_loss_mlp": 0.01254459, + "epoch": 0.24434089884262739, + "flos": 20491508344320.0, + "grad_norm": 2.099790248638241, + "language_loss": 0.68837494, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.76673138, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2121582, + "step": 4064, + "time_per_iteration": 2.560065984725952 + }, + { + "auxiliary_loss_clip": 0.06547809, + "auxiliary_loss_mlp": 0.01276127, + "balance_loss_clip": 0.06317605, + "balance_loss_mlp": 0.01255349, + "epoch": 0.24440102209529535, + "flos": 23628456374400.0, + "grad_norm": 1.7607893449036869, + "language_loss": 0.70700729, + "learning_rate": 3.536862563102088e-06, + "loss": 0.78524667, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20788574, + "step": 4065, + "time_per_iteration": 2.5619614124298096 + }, + { + "auxiliary_loss_clip": 0.06554856, + "auxiliary_loss_mlp": 0.0127847, + "balance_loss_clip": 0.06322616, + "balance_loss_mlp": 0.01256726, + "epoch": 0.24446114534796332, + "flos": 20560382000640.0, + "grad_norm": 2.0639555504298372, + "language_loss": 0.84639663, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.92472994, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21765137, + "step": 4066, + "time_per_iteration": 2.5640382766723633 + }, + { + "auxiliary_loss_clip": 0.0647334, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06356817, + "balance_loss_mlp": 0.01260456, + "epoch": 0.24452126860063128, + "flos": 60406719327360.0, + "grad_norm": 0.7224646734980834, + "language_loss": 0.52123713, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.59863508, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.05990601, + "step": 4067, + "time_per_iteration": 3.067857503890991 + }, + { + "auxiliary_loss_clip": 0.06549152, + "auxiliary_loss_mlp": 0.01275932, + "balance_loss_clip": 0.063198, + "balance_loss_mlp": 0.01255106, + "epoch": 0.24458139185329927, + "flos": 15126958719360.0, + "grad_norm": 4.582785635832698, + "language_loss": 0.72625411, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.80450499, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20825195, + "step": 4068, + "time_per_iteration": 2.5490705966949463 + }, + { + "auxiliary_loss_clip": 0.06542531, + "auxiliary_loss_mlp": 0.0127677, + "balance_loss_clip": 0.06318012, + "balance_loss_mlp": 0.01256111, + "epoch": 0.24464151510596724, + "flos": 28005771594240.0, + "grad_norm": 1.4744908303961997, + "language_loss": 0.7839663, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86215931, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.20654297, + "step": 4069, + "time_per_iteration": 2.6064302921295166 + }, + { + "auxiliary_loss_clip": 0.06535528, + "auxiliary_loss_mlp": 0.01277448, + "balance_loss_clip": 0.06312935, + "balance_loss_mlp": 0.01257493, + "epoch": 0.2447016383586352, + "flos": 19799138856960.0, + "grad_norm": 1.9167348410225946, + "language_loss": 0.80741036, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88554007, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19946289, + "step": 4070, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.06543916, + "auxiliary_loss_mlp": 0.0127809, + "balance_loss_clip": 0.06317008, + "balance_loss_mlp": 0.01258825, + "epoch": 0.24476176161130317, + "flos": 26074460436480.0, + "grad_norm": 1.476613235331205, + "language_loss": 0.8444066, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.92262667, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19262695, + "step": 4071, + "time_per_iteration": 2.6165285110473633 + }, + { + "auxiliary_loss_clip": 0.06545337, + "auxiliary_loss_mlp": 0.01275719, + "balance_loss_clip": 0.06310376, + "balance_loss_mlp": 0.01254679, + "epoch": 0.24482188486397113, + "flos": 18849527735040.0, + "grad_norm": 2.1913275656577857, + "language_loss": 0.8027429, + "learning_rate": 3.535116532028798e-06, + "loss": 0.88095343, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21032715, + "step": 4072, + "time_per_iteration": 2.580077648162842 + }, + { + "auxiliary_loss_clip": 0.06531823, + "auxiliary_loss_mlp": 0.01275557, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01257031, + "epoch": 0.2448820081166391, + "flos": 21258202003200.0, + "grad_norm": 1.4781582217057618, + "language_loss": 0.7076053, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18505859, + "step": 4073, + "time_per_iteration": 2.5430707931518555 + }, + { + "auxiliary_loss_clip": 0.06525481, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 0.06303517, + "balance_loss_mlp": 0.01260921, + "epoch": 0.2449421313693071, + "flos": 23957254995840.0, + "grad_norm": 2.412576467354098, + "language_loss": 0.67577648, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.75382745, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.18676758, + "step": 4074, + "time_per_iteration": 2.5616037845611572 + }, + { + "auxiliary_loss_clip": 0.06435025, + "auxiliary_loss_mlp": 0.01257107, + "balance_loss_clip": 0.06320108, + "balance_loss_mlp": 0.01251907, + "epoch": 0.24500225462197506, + "flos": 60705902730240.0, + "grad_norm": 0.8764237694402175, + "language_loss": 0.68656927, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.76349056, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.05203247, + "step": 4075, + "time_per_iteration": 3.2623581886291504 + }, + { + "auxiliary_loss_clip": 0.06527948, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06305515, + "balance_loss_mlp": 0.01257414, + "epoch": 0.24506237787464302, + "flos": 26291018113920.0, + "grad_norm": 2.301278269127432, + "language_loss": 0.79781568, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.87586164, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19226074, + "step": 4076, + "time_per_iteration": 2.6342012882232666 + }, + { + "auxiliary_loss_clip": 0.06535772, + "auxiliary_loss_mlp": 0.01280909, + "balance_loss_clip": 0.06304428, + "balance_loss_mlp": 0.01258462, + "epoch": 0.245122501127311, + "flos": 20557530961920.0, + "grad_norm": 1.9232761502629154, + "language_loss": 0.82461953, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90278631, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 4077, + "time_per_iteration": 2.5863101482391357 + }, + { + "auxiliary_loss_clip": 0.06532669, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06305817, + "balance_loss_mlp": 0.01257774, + "epoch": 0.24518262437997895, + "flos": 29140312677120.0, + "grad_norm": 2.8377644839815357, + "language_loss": 0.63268852, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71080685, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21398926, + "step": 4078, + "time_per_iteration": 2.6045711040496826 + }, + { + "auxiliary_loss_clip": 0.06522519, + "auxiliary_loss_mlp": 0.01277179, + "balance_loss_clip": 0.06301752, + "balance_loss_mlp": 0.01258249, + "epoch": 0.24524274763264692, + "flos": 23483623392000.0, + "grad_norm": 1.4700896000405594, + "language_loss": 0.75762683, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.8356238, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18933105, + "step": 4079, + "time_per_iteration": 2.6327531337738037 + }, + { + "auxiliary_loss_clip": 0.06521107, + "auxiliary_loss_mlp": 0.01276139, + "balance_loss_clip": 0.06297373, + "balance_loss_mlp": 0.01256171, + "epoch": 0.24530287088531488, + "flos": 17206792439040.0, + "grad_norm": 1.743597814486786, + "language_loss": 0.75652814, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.1998291, + "step": 4080, + "time_per_iteration": 2.5027806758880615 + }, + { + "auxiliary_loss_clip": 0.06517033, + "auxiliary_loss_mlp": 0.01282693, + "balance_loss_clip": 0.06296979, + "balance_loss_mlp": 0.01262129, + "epoch": 0.24536299413798288, + "flos": 14872903539840.0, + "grad_norm": 1.7999885027482954, + "language_loss": 0.83532149, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91331875, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20544434, + "step": 4081, + "time_per_iteration": 3.9672679901123047 + }, + { + "auxiliary_loss_clip": 0.06524678, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06300613, + "balance_loss_mlp": 0.01257458, + "epoch": 0.24542311739065084, + "flos": 35270759347200.0, + "grad_norm": 2.0934334924975797, + "language_loss": 0.7376107, + "learning_rate": 3.532617254729267e-06, + "loss": 0.81562507, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19311523, + "step": 4082, + "time_per_iteration": 2.687596559524536 + }, + { + "auxiliary_loss_clip": 0.06520141, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06301866, + "balance_loss_mlp": 0.01254334, + "epoch": 0.2454832406433188, + "flos": 21508903019520.0, + "grad_norm": 4.081398895882933, + "language_loss": 0.72681344, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.8047362, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.17810059, + "step": 4083, + "time_per_iteration": 2.5715560913085938 + }, + { + "auxiliary_loss_clip": 0.06531677, + "auxiliary_loss_mlp": 0.01285124, + "balance_loss_clip": 0.06304878, + "balance_loss_mlp": 0.01263404, + "epoch": 0.24554336389598677, + "flos": 14761878480000.0, + "grad_norm": 2.078496591548884, + "language_loss": 0.75461411, + "learning_rate": 3.532116701561919e-06, + "loss": 0.83278215, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21704102, + "step": 4084, + "time_per_iteration": 2.527059316635132 + }, + { + "auxiliary_loss_clip": 0.06521569, + "auxiliary_loss_mlp": 0.01278312, + "balance_loss_clip": 0.06299873, + "balance_loss_mlp": 0.01259238, + "epoch": 0.24560348714865474, + "flos": 14981790320640.0, + "grad_norm": 1.9240939687866982, + "language_loss": 0.85311353, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93111229, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19055176, + "step": 4085, + "time_per_iteration": 4.107008695602417 + }, + { + "auxiliary_loss_clip": 0.06523392, + "auxiliary_loss_mlp": 0.01277742, + "balance_loss_clip": 0.06299591, + "balance_loss_mlp": 0.0125725, + "epoch": 0.2456636104013227, + "flos": 22682073634560.0, + "grad_norm": 1.671481131781836, + "language_loss": 0.79073685, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.86874819, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20495605, + "step": 4086, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.06519614, + "auxiliary_loss_mlp": 0.01278477, + "balance_loss_clip": 0.06300113, + "balance_loss_mlp": 0.01260107, + "epoch": 0.2457237336539907, + "flos": 27425307634560.0, + "grad_norm": 1.6115503736345718, + "language_loss": 0.75352013, + "learning_rate": 3.531365436099496e-06, + "loss": 0.83150113, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18359375, + "step": 4087, + "time_per_iteration": 4.046957015991211 + }, + { + "auxiliary_loss_clip": 0.06525059, + "auxiliary_loss_mlp": 0.01276774, + "balance_loss_clip": 0.06299827, + "balance_loss_mlp": 0.0125633, + "epoch": 0.24578385690665866, + "flos": 20418609692160.0, + "grad_norm": 2.7081304915573914, + "language_loss": 0.79987848, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.87789685, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20458984, + "step": 4088, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.06519316, + "auxiliary_loss_mlp": 0.01276403, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01258152, + "epoch": 0.24584398015932662, + "flos": 23922273116160.0, + "grad_norm": 2.802199957042034, + "language_loss": 0.77758735, + "learning_rate": 3.5308643020944e-06, + "loss": 0.85554451, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18249512, + "step": 4089, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.06525148, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01261021, + "epoch": 0.2459041034119946, + "flos": 41505313115520.0, + "grad_norm": 1.8031915906993192, + "language_loss": 0.81701422, + "learning_rate": 3.530613648011309e-06, + "loss": 0.89507812, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20214844, + "step": 4090, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.065328, + "auxiliary_loss_mlp": 0.01279305, + "balance_loss_clip": 0.06309135, + "balance_loss_mlp": 0.01258861, + "epoch": 0.24596422666466256, + "flos": 19942755955200.0, + "grad_norm": 2.438516046551743, + "language_loss": 0.73629344, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.8144145, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.20446777, + "step": 4091, + "time_per_iteration": 3.961276054382324 + }, + { + "auxiliary_loss_clip": 0.06539448, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 0.06316313, + "balance_loss_mlp": 0.0126148, + "epoch": 0.24602434991733052, + "flos": 21550970568960.0, + "grad_norm": 2.2480658521871897, + "language_loss": 0.77723873, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85543197, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18408203, + "step": 4092, + "time_per_iteration": 2.5494375228881836 + }, + { + "auxiliary_loss_clip": 0.06537454, + "auxiliary_loss_mlp": 0.01278374, + "balance_loss_clip": 0.06307742, + "balance_loss_mlp": 0.01258907, + "epoch": 0.24608447316999849, + "flos": 23191735294080.0, + "grad_norm": 2.380112015735871, + "language_loss": 0.82381165, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.90196991, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.19470215, + "step": 4093, + "time_per_iteration": 2.5551040172576904 + }, + { + "auxiliary_loss_clip": 0.06532703, + "auxiliary_loss_mlp": 0.01285, + "balance_loss_clip": 0.06305315, + "balance_loss_mlp": 0.01264412, + "epoch": 0.24614459642266648, + "flos": 19647345985920.0, + "grad_norm": 21.11973952887688, + "language_loss": 0.87671578, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95489287, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20581055, + "step": 4094, + "time_per_iteration": 2.534127712249756 + }, + { + "auxiliary_loss_clip": 0.06404499, + "auxiliary_loss_mlp": 0.01293713, + "balance_loss_clip": 0.06291573, + "balance_loss_mlp": 0.01289332, + "epoch": 0.24620471967533444, + "flos": 61757231109120.0, + "grad_norm": 0.7533459551406883, + "language_loss": 0.57023478, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.64721692, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04388428, + "step": 4095, + "time_per_iteration": 3.238482713699341 + }, + { + "auxiliary_loss_clip": 0.06404348, + "auxiliary_loss_mlp": 0.01286038, + "balance_loss_clip": 0.06290346, + "balance_loss_mlp": 0.01281767, + "epoch": 0.2462648429280024, + "flos": 69174431003520.0, + "grad_norm": 0.6365745764429788, + "language_loss": 0.56240451, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.63930833, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04275513, + "step": 4096, + "time_per_iteration": 3.3192596435546875 + }, + { + "auxiliary_loss_clip": 0.06545975, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06318395, + "balance_loss_mlp": 0.01262143, + "epoch": 0.24632496618067037, + "flos": 29467140727680.0, + "grad_norm": 1.505356285132213, + "language_loss": 0.78075927, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85903859, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19812012, + "step": 4097, + "time_per_iteration": 2.617108106613159 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01279842, + "balance_loss_clip": 0.06315026, + "balance_loss_mlp": 0.01259993, + "epoch": 0.24638508943333834, + "flos": 24323341484160.0, + "grad_norm": 2.0372573834811267, + "language_loss": 0.77321315, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.85148549, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.1986084, + "step": 4098, + "time_per_iteration": 2.6069419384002686 + }, + { + "auxiliary_loss_clip": 0.06542017, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.0631687, + "balance_loss_mlp": 0.01257341, + "epoch": 0.2464452126860063, + "flos": 26620236005760.0, + "grad_norm": 2.17921698337753, + "language_loss": 0.69183016, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.1817627, + "step": 4099, + "time_per_iteration": 2.655956506729126 + }, + { + "auxiliary_loss_clip": 0.06525709, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01256062, + "epoch": 0.24650533593867427, + "flos": 31220481813120.0, + "grad_norm": 2.2743270797915076, + "language_loss": 0.67268491, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.75068748, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18493652, + "step": 4100, + "time_per_iteration": 2.6497559547424316 + }, + { + "auxiliary_loss_clip": 0.0641202, + "auxiliary_loss_mlp": 0.01258309, + "balance_loss_clip": 0.06296985, + "balance_loss_mlp": 0.01253758, + "epoch": 0.24656545919134226, + "flos": 68513269962240.0, + "grad_norm": 0.6889590379062642, + "language_loss": 0.61607081, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.69277412, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.04559326, + "step": 4101, + "time_per_iteration": 3.2961082458496094 + }, + { + "auxiliary_loss_clip": 0.06538613, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06317261, + "balance_loss_mlp": 0.01259, + "epoch": 0.24662558244401023, + "flos": 20090398049280.0, + "grad_norm": 1.6193028382456236, + "language_loss": 0.73591036, + "learning_rate": 3.527601274535012e-06, + "loss": 0.81407589, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18945312, + "step": 4102, + "time_per_iteration": 2.542275905609131 + }, + { + "auxiliary_loss_clip": 0.0654332, + "auxiliary_loss_mlp": 0.01273749, + "balance_loss_clip": 0.06317908, + "balance_loss_mlp": 0.01255152, + "epoch": 0.2466857056966782, + "flos": 30709310780160.0, + "grad_norm": 2.0137613654817854, + "language_loss": 0.76325667, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.84142733, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18603516, + "step": 4103, + "time_per_iteration": 2.6544189453125 + }, + { + "auxiliary_loss_clip": 0.06542745, + "auxiliary_loss_mlp": 0.01273413, + "balance_loss_clip": 0.06315098, + "balance_loss_mlp": 0.01253159, + "epoch": 0.24674582894934616, + "flos": 22535102373120.0, + "grad_norm": 2.0816413841430697, + "language_loss": 0.79265451, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87081611, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20275879, + "step": 4104, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.06525403, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06306183, + "balance_loss_mlp": 0.01257251, + "epoch": 0.24680595220201412, + "flos": 20710581644160.0, + "grad_norm": 1.7450607123984514, + "language_loss": 0.83681756, + "learning_rate": 3.526846877170133e-06, + "loss": 0.9148404, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19641113, + "step": 4105, + "time_per_iteration": 2.553579330444336 + }, + { + "auxiliary_loss_clip": 0.06533727, + "auxiliary_loss_mlp": 0.01273598, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01255371, + "epoch": 0.2468660754546821, + "flos": 21836946954240.0, + "grad_norm": 1.9208859898797113, + "language_loss": 0.77469373, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85276699, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18212891, + "step": 4106, + "time_per_iteration": 2.5389256477355957 + }, + { + "auxiliary_loss_clip": 0.06534247, + "auxiliary_loss_mlp": 0.01276275, + "balance_loss_clip": 0.06310344, + "balance_loss_mlp": 0.01257463, + "epoch": 0.24692619870735008, + "flos": 15273049512960.0, + "grad_norm": 2.4615103155960485, + "language_loss": 0.73436344, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.81246865, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18798828, + "step": 4107, + "time_per_iteration": 2.5545566082000732 + }, + { + "auxiliary_loss_clip": 0.06538644, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06314194, + "balance_loss_mlp": 0.01256745, + "epoch": 0.24698632196001805, + "flos": 29687933036160.0, + "grad_norm": 2.1377324014009504, + "language_loss": 0.66432422, + "learning_rate": 3.526091958721587e-06, + "loss": 0.7424612, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18322754, + "step": 4108, + "time_per_iteration": 2.6196486949920654 + }, + { + "auxiliary_loss_clip": 0.06540007, + "auxiliary_loss_mlp": 0.01277779, + "balance_loss_clip": 0.06313555, + "balance_loss_mlp": 0.01259623, + "epoch": 0.247046445212686, + "flos": 39174736452480.0, + "grad_norm": 2.010829594577025, + "language_loss": 0.73608756, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18151855, + "step": 4109, + "time_per_iteration": 2.764406442642212 + }, + { + "auxiliary_loss_clip": 0.06534623, + "auxiliary_loss_mlp": 0.01277352, + "balance_loss_clip": 0.06311052, + "balance_loss_mlp": 0.01259077, + "epoch": 0.24710656846535398, + "flos": 23004834762240.0, + "grad_norm": 1.68605601916547, + "language_loss": 0.79419786, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.87231761, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.18249512, + "step": 4110, + "time_per_iteration": 2.5460774898529053 + }, + { + "auxiliary_loss_clip": 0.06540776, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 0.06313831, + "balance_loss_mlp": 0.01257032, + "epoch": 0.24716669171802194, + "flos": 26440085727360.0, + "grad_norm": 2.6454329848736604, + "language_loss": 0.81789577, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.89607012, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.19628906, + "step": 4111, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.06537174, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06311068, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2472268149706899, + "flos": 23336358641280.0, + "grad_norm": 1.983709335436533, + "language_loss": 0.75390071, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.83201408, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18115234, + "step": 4112, + "time_per_iteration": 2.5546083450317383 + }, + { + "auxiliary_loss_clip": 0.06533875, + "auxiliary_loss_mlp": 0.01274467, + "balance_loss_clip": 0.06308994, + "balance_loss_mlp": 0.01255548, + "epoch": 0.24728693822335787, + "flos": 23775469562880.0, + "grad_norm": 2.380234182887367, + "language_loss": 0.83472633, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91280973, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.18920898, + "step": 4113, + "time_per_iteration": 2.6223254203796387 + }, + { + "auxiliary_loss_clip": 0.06540644, + "auxiliary_loss_mlp": 0.01276865, + "balance_loss_clip": 0.06315883, + "balance_loss_mlp": 0.01257279, + "epoch": 0.24734706147602586, + "flos": 19323494755200.0, + "grad_norm": 2.0367731486494636, + "language_loss": 0.87924093, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.95741606, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19580078, + "step": 4114, + "time_per_iteration": 2.5495545864105225 + }, + { + "auxiliary_loss_clip": 0.06532501, + "auxiliary_loss_mlp": 0.01273212, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01255474, + "epoch": 0.24740718472869383, + "flos": 28044275345280.0, + "grad_norm": 1.9170399047542779, + "language_loss": 0.75640035, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.17736816, + "step": 4115, + "time_per_iteration": 2.6333982944488525 + }, + { + "auxiliary_loss_clip": 0.0642873, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06315603, + "balance_loss_mlp": 0.01259151, + "epoch": 0.2474673079813618, + "flos": 68129265899520.0, + "grad_norm": 0.63897767002188, + "language_loss": 0.58004332, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.65697974, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.05752563, + "step": 4116, + "time_per_iteration": 3.251235246658325 + }, + { + "auxiliary_loss_clip": 0.06532618, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 0.063094, + "balance_loss_mlp": 0.01257022, + "epoch": 0.24752743123402976, + "flos": 29470075620480.0, + "grad_norm": 1.407143363910891, + "language_loss": 0.8425988, + "learning_rate": 3.523824079451235e-06, + "loss": 0.92068678, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19152832, + "step": 4117, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.06425081, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0631275, + "balance_loss_mlp": 0.01262089, + "epoch": 0.24758755448669773, + "flos": 58367946908160.0, + "grad_norm": 0.8764773034828885, + "language_loss": 0.63508207, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.71200383, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.05001831, + "step": 4118, + "time_per_iteration": 3.052507162094116 + }, + { + "auxiliary_loss_clip": 0.0652981, + "auxiliary_loss_mlp": 0.01277419, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01258203, + "epoch": 0.2476476777393657, + "flos": 20490502095360.0, + "grad_norm": 1.7262960547494681, + "language_loss": 0.80051601, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87858826, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.19238281, + "step": 4119, + "time_per_iteration": 2.554318428039551 + }, + { + "auxiliary_loss_clip": 0.06530587, + "auxiliary_loss_mlp": 0.01282865, + "balance_loss_clip": 0.06310613, + "balance_loss_mlp": 0.01265198, + "epoch": 0.24770780099203366, + "flos": 20492179176960.0, + "grad_norm": 2.4192345138137386, + "language_loss": 0.74556476, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.8236993, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.17663574, + "step": 4120, + "time_per_iteration": 3.996234655380249 + }, + { + "auxiliary_loss_clip": 0.06531808, + "auxiliary_loss_mlp": 0.01276043, + "balance_loss_clip": 0.06307146, + "balance_loss_mlp": 0.01256362, + "epoch": 0.24776792424470165, + "flos": 15157915603200.0, + "grad_norm": 2.13486110959629, + "language_loss": 0.89734054, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97541904, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19689941, + "step": 4121, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.06540959, + "auxiliary_loss_mlp": 0.01278306, + "balance_loss_clip": 0.06314932, + "balance_loss_mlp": 0.01258744, + "epoch": 0.2478280474973696, + "flos": 21731833607040.0, + "grad_norm": 2.0829104418917646, + "language_loss": 0.69792116, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.77611381, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19580078, + "step": 4122, + "time_per_iteration": 2.5857455730438232 + }, + { + "auxiliary_loss_clip": 0.06535036, + "auxiliary_loss_mlp": 0.01273779, + "balance_loss_clip": 0.0630946, + "balance_loss_mlp": 0.01254729, + "epoch": 0.24788817075003758, + "flos": 20418400056960.0, + "grad_norm": 2.5894895086667264, + "language_loss": 0.80832231, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88641047, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.533696174621582 + }, + { + "auxiliary_loss_clip": 0.06528741, + "auxiliary_loss_mlp": 0.01276684, + "balance_loss_clip": 0.06306656, + "balance_loss_mlp": 0.01259625, + "epoch": 0.24794829400270554, + "flos": 22599867179520.0, + "grad_norm": 2.45373622595604, + "language_loss": 0.75091624, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.82897043, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1706543, + "step": 4124, + "time_per_iteration": 2.5478947162628174 + }, + { + "auxiliary_loss_clip": 0.06523614, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01258708, + "epoch": 0.2480084172553735, + "flos": 39685362433920.0, + "grad_norm": 1.4066224864196382, + "language_loss": 0.74510413, + "learning_rate": 3.521804257268357e-06, + "loss": 0.82310236, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.17504883, + "step": 4125, + "time_per_iteration": 4.164500951766968 + }, + { + "auxiliary_loss_clip": 0.06546921, + "auxiliary_loss_mlp": 0.01279637, + "balance_loss_clip": 0.06313127, + "balance_loss_mlp": 0.01260599, + "epoch": 0.24806854050804147, + "flos": 22060129104000.0, + "grad_norm": 1.9518521214536066, + "language_loss": 0.69807184, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.77633739, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.19030762, + "step": 4126, + "time_per_iteration": 2.520550489425659 + }, + { + "auxiliary_loss_clip": 0.06526291, + "auxiliary_loss_mlp": 0.0127589, + "balance_loss_clip": 0.06304894, + "balance_loss_mlp": 0.01257281, + "epoch": 0.24812866376070947, + "flos": 15492164739840.0, + "grad_norm": 2.6036079521490834, + "language_loss": 0.81805199, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.89607382, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18615723, + "step": 4127, + "time_per_iteration": 4.052755832672119 + }, + { + "auxiliary_loss_clip": 0.06533966, + "auxiliary_loss_mlp": 0.012739, + "balance_loss_clip": 0.06306454, + "balance_loss_mlp": 0.01255494, + "epoch": 0.24818878701337743, + "flos": 14762758947840.0, + "grad_norm": 2.4130643839940746, + "language_loss": 0.85122234, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.92930102, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.18395996, + "step": 4128, + "time_per_iteration": 2.5801029205322266 + }, + { + "auxiliary_loss_clip": 0.06541854, + "auxiliary_loss_mlp": 0.01278965, + "balance_loss_clip": 0.06316209, + "balance_loss_mlp": 0.01260821, + "epoch": 0.2482489102660454, + "flos": 27096886356480.0, + "grad_norm": 2.0112959815575713, + "language_loss": 0.66149813, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.73970628, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18151855, + "step": 4129, + "time_per_iteration": 2.5865726470947266 + }, + { + "auxiliary_loss_clip": 0.06528358, + "auxiliary_loss_mlp": 0.01276243, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.01257444, + "epoch": 0.24830903351871336, + "flos": 26474522555520.0, + "grad_norm": 1.7021812681223303, + "language_loss": 0.75761282, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83565885, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18811035, + "step": 4130, + "time_per_iteration": 2.6659512519836426 + }, + { + "auxiliary_loss_clip": 0.06526491, + "auxiliary_loss_mlp": 0.01274514, + "balance_loss_clip": 0.06302534, + "balance_loss_mlp": 0.01255, + "epoch": 0.24836915677138133, + "flos": 10232225337600.0, + "grad_norm": 2.0871707802719004, + "language_loss": 0.77625716, + "learning_rate": 3.520286966670535e-06, + "loss": 0.85426718, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.19519043, + "step": 4131, + "time_per_iteration": 3.906522274017334 + }, + { + "auxiliary_loss_clip": 0.06519566, + "auxiliary_loss_mlp": 0.01270892, + "balance_loss_clip": 0.0630278, + "balance_loss_mlp": 0.01253582, + "epoch": 0.2484292800240493, + "flos": 30088162863360.0, + "grad_norm": 1.7622390062278706, + "language_loss": 0.84475207, + "learning_rate": 3.520033883075255e-06, + "loss": 0.92265671, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.17297363, + "step": 4132, + "time_per_iteration": 2.6436057090759277 + }, + { + "auxiliary_loss_clip": 0.06525066, + "auxiliary_loss_mlp": 0.01275924, + "balance_loss_clip": 0.06302708, + "balance_loss_mlp": 0.01256779, + "epoch": 0.24848940327671726, + "flos": 13447899878400.0, + "grad_norm": 1.545647189211169, + "language_loss": 0.71393758, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19152832, + "step": 4133, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.06542444, + "auxiliary_loss_mlp": 0.01275489, + "balance_loss_clip": 0.06309851, + "balance_loss_mlp": 0.01255116, + "epoch": 0.24854952652938525, + "flos": 19975683409920.0, + "grad_norm": 2.3352452144714513, + "language_loss": 0.6286931, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.70687246, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20373535, + "step": 4134, + "time_per_iteration": 2.571525812149048 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01258883, + "epoch": 0.24860964978205322, + "flos": 18156026217600.0, + "grad_norm": 1.960513817978903, + "language_loss": 0.79140246, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86942399, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18383789, + "step": 4135, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06524552, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06303368, + "balance_loss_mlp": 0.01256294, + "epoch": 0.24866977303472118, + "flos": 11733397960320.0, + "grad_norm": 2.2852251503119234, + "language_loss": 0.8410641, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.9190594, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18676758, + "step": 4136, + "time_per_iteration": 2.497654676437378 + }, + { + "auxiliary_loss_clip": 0.06524116, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06297501, + "balance_loss_mlp": 0.01254521, + "epoch": 0.24872989628738915, + "flos": 34832109623040.0, + "grad_norm": 1.7046352309858128, + "language_loss": 0.71601558, + "learning_rate": 3.518767600693314e-06, + "loss": 0.79399109, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18908691, + "step": 4137, + "time_per_iteration": 2.732480764389038 + }, + { + "auxiliary_loss_clip": 0.06525281, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 0.06299166, + "balance_loss_mlp": 0.01255549, + "epoch": 0.2487900195400571, + "flos": 13704512607360.0, + "grad_norm": 2.5230361612400296, + "language_loss": 0.67583597, + "learning_rate": 3.518514171403042e-06, + "loss": 0.7538265, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.18212891, + "step": 4138, + "time_per_iteration": 2.531855583190918 + }, + { + "auxiliary_loss_clip": 0.06519014, + "auxiliary_loss_mlp": 0.01272692, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.01254501, + "epoch": 0.24885014279272508, + "flos": 25344845009280.0, + "grad_norm": 1.9341473695701388, + "language_loss": 0.83479851, + "learning_rate": 3.51826068453056e-06, + "loss": 0.91271555, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.18188477, + "step": 4139, + "time_per_iteration": 2.6051557064056396 + }, + { + "auxiliary_loss_clip": 0.06528804, + "auxiliary_loss_mlp": 0.01275882, + "balance_loss_clip": 0.06300579, + "balance_loss_mlp": 0.01255711, + "epoch": 0.24891026604539307, + "flos": 20637724919040.0, + "grad_norm": 1.6977646822397727, + "language_loss": 0.79297662, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20178223, + "step": 4140, + "time_per_iteration": 2.5448291301727295 + }, + { + "auxiliary_loss_clip": 0.0641291, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06303305, + "balance_loss_mlp": 0.0126555, + "epoch": 0.24897038929806103, + "flos": 66979086030720.0, + "grad_norm": 0.8107945435966392, + "language_loss": 0.60717231, + "learning_rate": 3.51775353807742e-06, + "loss": 0.68400407, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.04705811, + "step": 4141, + "time_per_iteration": 3.2685940265655518 + }, + { + "auxiliary_loss_clip": 0.06525983, + "auxiliary_loss_mlp": 0.01275717, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01256894, + "epoch": 0.249030512550729, + "flos": 36401359288320.0, + "grad_norm": 1.7802793710753735, + "language_loss": 0.72871864, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.80673563, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18823242, + "step": 4142, + "time_per_iteration": 2.6564056873321533 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01276725, + "balance_loss_clip": 0.06302793, + "balance_loss_mlp": 0.0125789, + "epoch": 0.24909063580339696, + "flos": 20160361808640.0, + "grad_norm": 1.9535741137498925, + "language_loss": 0.81280798, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8908, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18823242, + "step": 4143, + "time_per_iteration": 2.5795881748199463 + }, + { + "auxiliary_loss_clip": 0.06522508, + "auxiliary_loss_mlp": 0.01275624, + "balance_loss_clip": 0.06301625, + "balance_loss_mlp": 0.01257039, + "epoch": 0.24915075905606493, + "flos": 26403887963520.0, + "grad_norm": 1.964912825826696, + "language_loss": 0.59448719, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.67246854, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18579102, + "step": 4144, + "time_per_iteration": 2.5888898372650146 + }, + { + "auxiliary_loss_clip": 0.06520054, + "auxiliary_loss_mlp": 0.01279478, + "balance_loss_clip": 0.06300642, + "balance_loss_mlp": 0.01260608, + "epoch": 0.2492108823087329, + "flos": 27534655612800.0, + "grad_norm": 2.2926576094039253, + "language_loss": 0.79198605, + "learning_rate": 3.516738554607708e-06, + "loss": 0.86998141, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18859863, + "step": 4145, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.06539698, + "auxiliary_loss_mlp": 0.01282889, + "balance_loss_clip": 0.06307465, + "balance_loss_mlp": 0.01262587, + "epoch": 0.24927100556140086, + "flos": 16697088852480.0, + "grad_norm": 2.388513156986414, + "language_loss": 0.65914291, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.73736882, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20300293, + "step": 4146, + "time_per_iteration": 2.550225019454956 + }, + { + "auxiliary_loss_clip": 0.06418058, + "auxiliary_loss_mlp": 0.01257626, + "balance_loss_clip": 0.06307501, + "balance_loss_mlp": 0.01252389, + "epoch": 0.24933112881406885, + "flos": 62791899724800.0, + "grad_norm": 0.9255702942051489, + "language_loss": 0.67495543, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.75171226, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05239868, + "step": 4147, + "time_per_iteration": 3.2676596641540527 + }, + { + "auxiliary_loss_clip": 0.06525366, + "auxiliary_loss_mlp": 0.01281982, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01261764, + "epoch": 0.24939125206673682, + "flos": 26659242881280.0, + "grad_norm": 1.678024692441642, + "language_loss": 0.89250457, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.97057807, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.20214844, + "step": 4148, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06529681, + "auxiliary_loss_mlp": 0.01281757, + "balance_loss_clip": 0.06300169, + "balance_loss_mlp": 0.0125968, + "epoch": 0.24945137531940478, + "flos": 20710623571200.0, + "grad_norm": 1.8952521518004763, + "language_loss": 0.68350649, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.76162088, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22070312, + "step": 4149, + "time_per_iteration": 2.52567720413208 + }, + { + "auxiliary_loss_clip": 0.06528307, + "auxiliary_loss_mlp": 0.0128627, + "balance_loss_clip": 0.06306647, + "balance_loss_mlp": 0.01266398, + "epoch": 0.24951149857207275, + "flos": 23775385708800.0, + "grad_norm": 1.639238516163445, + "language_loss": 0.71759897, + "learning_rate": 3.515468531258095e-06, + "loss": 0.79574472, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1986084, + "step": 4150, + "time_per_iteration": 2.580000877380371 + }, + { + "auxiliary_loss_clip": 0.06529218, + "auxiliary_loss_mlp": 0.01284871, + "balance_loss_clip": 0.06303831, + "balance_loss_mlp": 0.01264129, + "epoch": 0.2495716218247407, + "flos": 15669589760640.0, + "grad_norm": 1.939767404293352, + "language_loss": 0.73002028, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80816114, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20739746, + "step": 4151, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.06534886, + "auxiliary_loss_mlp": 0.01281273, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01261055, + "epoch": 0.24963174507740868, + "flos": 24057924076800.0, + "grad_norm": 4.265592628376469, + "language_loss": 0.64070994, + "learning_rate": 3.514960119583781e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20227051, + "step": 4152, + "time_per_iteration": 2.5687365531921387 + }, + { + "auxiliary_loss_clip": 0.06516105, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 0.06296911, + "balance_loss_mlp": 0.01259979, + "epoch": 0.24969186833007664, + "flos": 21806073924480.0, + "grad_norm": 2.335025994250793, + "language_loss": 0.7798419, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85780108, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19812012, + "step": 4153, + "time_per_iteration": 2.5565860271453857 + }, + { + "auxiliary_loss_clip": 0.06523906, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06304043, + "balance_loss_mlp": 0.01257806, + "epoch": 0.24975199158274464, + "flos": 19944307255680.0, + "grad_norm": 2.3946475317027978, + "language_loss": 0.77287221, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19152832, + "step": 4154, + "time_per_iteration": 2.5327064990997314 + }, + { + "auxiliary_loss_clip": 0.06533594, + "auxiliary_loss_mlp": 0.0128089, + "balance_loss_clip": 0.06299926, + "balance_loss_mlp": 0.01258145, + "epoch": 0.2498121148354126, + "flos": 25345515841920.0, + "grad_norm": 1.7912237432514402, + "language_loss": 0.71052945, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78867429, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22766113, + "step": 4155, + "time_per_iteration": 2.566044330596924 + }, + { + "auxiliary_loss_clip": 0.06528749, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06300025, + "balance_loss_mlp": 0.01257809, + "epoch": 0.24987223808808057, + "flos": 20565119756160.0, + "grad_norm": 1.6974291352944781, + "language_loss": 0.75592315, + "learning_rate": 3.513942606943036e-06, + "loss": 0.83399028, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20141602, + "step": 4156, + "time_per_iteration": 2.5388355255126953 + }, + { + "auxiliary_loss_clip": 0.06524897, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.0125842, + "epoch": 0.24993236134074853, + "flos": 19754052560640.0, + "grad_norm": 3.125892113983293, + "language_loss": 0.77757698, + "learning_rate": 3.513688085236591e-06, + "loss": 0.85561097, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.20068359, + "step": 4157, + "time_per_iteration": 2.5327329635620117 + }, + { + "auxiliary_loss_clip": 0.06527505, + "auxiliary_loss_mlp": 0.012775, + "balance_loss_clip": 0.06301083, + "balance_loss_mlp": 0.01257068, + "epoch": 0.2499924845934165, + "flos": 18776209812480.0, + "grad_norm": 1.8891569690037928, + "language_loss": 0.82203197, + "learning_rate": 3.513433506130942e-06, + "loss": 0.90008199, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20422363, + "step": 4158, + "time_per_iteration": 2.5894827842712402 + }, + { + "auxiliary_loss_clip": 0.06518973, + "auxiliary_loss_mlp": 0.01272913, + "balance_loss_clip": 0.06295922, + "balance_loss_mlp": 0.012544, + "epoch": 0.25005260784608446, + "flos": 16877658401280.0, + "grad_norm": 2.206587551308884, + "language_loss": 0.75718945, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.83510834, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18505859, + "step": 4159, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.06529576, + "auxiliary_loss_mlp": 0.01278956, + "balance_loss_clip": 0.06300279, + "balance_loss_mlp": 0.01258142, + "epoch": 0.2501127310987524, + "flos": 22131057185280.0, + "grad_norm": 2.1699031495969354, + "language_loss": 0.71598893, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7940743, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.20812988, + "step": 4160, + "time_per_iteration": 3.9746532440185547 + }, + { + "auxiliary_loss_clip": 0.06424317, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06313459, + "balance_loss_mlp": 0.01263326, + "epoch": 0.2501728543514204, + "flos": 69480071170560.0, + "grad_norm": 0.7438462037708533, + "language_loss": 0.56844532, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.64536446, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.04278564, + "step": 4161, + "time_per_iteration": 3.233760356903076 + }, + { + "auxiliary_loss_clip": 0.06530809, + "auxiliary_loss_mlp": 0.01282686, + "balance_loss_clip": 0.06298731, + "balance_loss_mlp": 0.01261848, + "epoch": 0.25023297760408836, + "flos": 16295601214080.0, + "grad_norm": 2.49700797922569, + "language_loss": 0.8179751, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.89611006, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20849609, + "step": 4162, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.0652239, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 0.06294353, + "balance_loss_mlp": 0.01260358, + "epoch": 0.2502931008567563, + "flos": 12242598422400.0, + "grad_norm": 2.2503072324763616, + "language_loss": 0.88019562, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.95822597, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.203125, + "step": 4163, + "time_per_iteration": 2.531467914581299 + }, + { + "auxiliary_loss_clip": 0.06520548, + "auxiliary_loss_mlp": 0.01277405, + "balance_loss_clip": 0.06293885, + "balance_loss_mlp": 0.01257092, + "epoch": 0.25035322410942434, + "flos": 23188003787520.0, + "grad_norm": 1.6365124228332002, + "language_loss": 0.83867121, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91665077, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20300293, + "step": 4164, + "time_per_iteration": 4.068189382553101 + }, + { + "auxiliary_loss_clip": 0.06509531, + "auxiliary_loss_mlp": 0.01280667, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01262106, + "epoch": 0.2504133473620923, + "flos": 20922904690560.0, + "grad_norm": 1.788160941639295, + "language_loss": 0.7460506, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18579102, + "step": 4165, + "time_per_iteration": 2.568701982498169 + }, + { + "auxiliary_loss_clip": 0.06526586, + "auxiliary_loss_mlp": 0.01278077, + "balance_loss_clip": 0.06293961, + "balance_loss_mlp": 0.01257883, + "epoch": 0.2504734706147603, + "flos": 20782725609600.0, + "grad_norm": 1.8100288551258081, + "language_loss": 0.74429101, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.82233763, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2019043, + "step": 4166, + "time_per_iteration": 3.989368438720703 + }, + { + "auxiliary_loss_clip": 0.065147, + "auxiliary_loss_mlp": 0.01277163, + "balance_loss_clip": 0.06293219, + "balance_loss_mlp": 0.0125778, + "epoch": 0.25053359386742824, + "flos": 24355681960320.0, + "grad_norm": 1.5960764456675967, + "language_loss": 0.82469785, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.90261644, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19384766, + "step": 4167, + "time_per_iteration": 2.554733991622925 + }, + { + "auxiliary_loss_clip": 0.06513357, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 0.06292276, + "balance_loss_mlp": 0.01260614, + "epoch": 0.2505937171200962, + "flos": 21220578720000.0, + "grad_norm": 1.9887592956808484, + "language_loss": 0.80394876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.88188636, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19799805, + "step": 4168, + "time_per_iteration": 2.576969623565674 + }, + { + "auxiliary_loss_clip": 0.06531397, + "auxiliary_loss_mlp": 0.01277594, + "balance_loss_clip": 0.06300385, + "balance_loss_mlp": 0.01256196, + "epoch": 0.25065384037276417, + "flos": 41436816802560.0, + "grad_norm": 4.930314721126017, + "language_loss": 0.69985271, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7779426, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21386719, + "step": 4169, + "time_per_iteration": 2.709149122238159 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01277868, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.0125827, + "epoch": 0.25071396362543213, + "flos": 26109274608000.0, + "grad_norm": 1.904216953279787, + "language_loss": 0.77927327, + "learning_rate": 3.510374083241361e-06, + "loss": 0.85716957, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19592285, + "step": 4170, + "time_per_iteration": 4.016170024871826 + }, + { + "auxiliary_loss_clip": 0.0651409, + "auxiliary_loss_mlp": 0.01278168, + "balance_loss_clip": 0.06291165, + "balance_loss_mlp": 0.01258975, + "epoch": 0.2507740868781001, + "flos": 19105008433920.0, + "grad_norm": 2.5077494433812966, + "language_loss": 0.76900339, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1920166, + "step": 4171, + "time_per_iteration": 2.5651609897613525 + }, + { + "auxiliary_loss_clip": 0.06406491, + "auxiliary_loss_mlp": 0.01262132, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01257083, + "epoch": 0.25083421013076806, + "flos": 64361652514560.0, + "grad_norm": 0.8214086964760371, + "language_loss": 0.6006844, + "learning_rate": 3.509863377145458e-06, + "loss": 0.67737067, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05047607, + "step": 4172, + "time_per_iteration": 3.1837103366851807 + }, + { + "auxiliary_loss_clip": 0.06520402, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.012603, + "epoch": 0.25089433338343603, + "flos": 24286430960640.0, + "grad_norm": 1.3489665028935822, + "language_loss": 0.79424238, + "learning_rate": 3.509607938211409e-06, + "loss": 0.87225777, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20849609, + "step": 4173, + "time_per_iteration": 2.6214826107025146 + }, + { + "auxiliary_loss_clip": 0.06513289, + "auxiliary_loss_mlp": 0.01273745, + "balance_loss_clip": 0.06291197, + "balance_loss_mlp": 0.01254398, + "epoch": 0.250954456636104, + "flos": 14726896600320.0, + "grad_norm": 1.8312177549547823, + "language_loss": 0.83930022, + "learning_rate": 3.509352442032875e-06, + "loss": 0.91717052, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19360352, + "step": 4174, + "time_per_iteration": 2.5973377227783203 + }, + { + "auxiliary_loss_clip": 0.06519122, + "auxiliary_loss_mlp": 0.0127901, + "balance_loss_clip": 0.0629285, + "balance_loss_mlp": 0.01259341, + "epoch": 0.25101457988877196, + "flos": 22280208652800.0, + "grad_norm": 2.088546315652338, + "language_loss": 0.71558678, + "learning_rate": 3.509096888619545e-06, + "loss": 0.79356813, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19665527, + "step": 4175, + "time_per_iteration": 2.6718719005584717 + }, + { + "auxiliary_loss_clip": 0.06522886, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06295571, + "balance_loss_mlp": 0.01256502, + "epoch": 0.2510747031414399, + "flos": 25195441979520.0, + "grad_norm": 1.9595604726907228, + "language_loss": 0.81335604, + "learning_rate": 3.50884127798111e-06, + "loss": 0.891361, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2109375, + "step": 4176, + "time_per_iteration": 2.5455691814422607 + }, + { + "auxiliary_loss_clip": 0.06515132, + "auxiliary_loss_mlp": 0.01279504, + "balance_loss_clip": 0.06292217, + "balance_loss_mlp": 0.01257319, + "epoch": 0.25113482639410795, + "flos": 20710455863040.0, + "grad_norm": 1.8805810902271358, + "language_loss": 0.83346581, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.91141224, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.22167969, + "step": 4177, + "time_per_iteration": 2.5471949577331543 + }, + { + "auxiliary_loss_clip": 0.06520942, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 0.06300486, + "balance_loss_mlp": 0.01256375, + "epoch": 0.2511949496467759, + "flos": 21513347285760.0, + "grad_norm": 2.081094632338002, + "language_loss": 0.83410418, + "learning_rate": 3.508329885067698e-06, + "loss": 0.91207987, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20251465, + "step": 4178, + "time_per_iteration": 2.5352370738983154 + }, + { + "auxiliary_loss_clip": 0.06514454, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01255949, + "epoch": 0.2512550728994439, + "flos": 20707898313600.0, + "grad_norm": 2.160080340734635, + "language_loss": 0.75744665, + "learning_rate": 3.508074102812112e-06, + "loss": 0.83533603, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.18554688, + "step": 4179, + "time_per_iteration": 2.560995578765869 + }, + { + "auxiliary_loss_clip": 0.0652363, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06298499, + "balance_loss_mlp": 0.01261053, + "epoch": 0.25131519615211184, + "flos": 18484531349760.0, + "grad_norm": 2.0850842878171347, + "language_loss": 0.70515448, + "learning_rate": 3.507818263370206e-06, + "loss": 0.78321338, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2121582, + "step": 4180, + "time_per_iteration": 2.510233163833618 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01275296, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.0125565, + "epoch": 0.2513753194047798, + "flos": 20491131000960.0, + "grad_norm": 1.8144815234901748, + "language_loss": 0.86591852, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.94378912, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19628906, + "step": 4181, + "time_per_iteration": 2.546736240386963 + }, + { + "auxiliary_loss_clip": 0.06519435, + "auxiliary_loss_mlp": 0.01276165, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01256555, + "epoch": 0.25143544265744777, + "flos": 37679182053120.0, + "grad_norm": 1.8572714108551465, + "language_loss": 0.68626046, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76421642, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19616699, + "step": 4182, + "time_per_iteration": 2.6632721424102783 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.0630056, + "balance_loss_mlp": 0.012679, + "epoch": 0.25149556591011574, + "flos": 69386502487680.0, + "grad_norm": 0.837431587640593, + "language_loss": 0.70118701, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.77799207, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.03881836, + "step": 4183, + "time_per_iteration": 3.194293737411499 + }, + { + "auxiliary_loss_clip": 0.0651418, + "auxiliary_loss_mlp": 0.01278526, + "balance_loss_clip": 0.06292195, + "balance_loss_mlp": 0.01258725, + "epoch": 0.2515556891627837, + "flos": 13995478310400.0, + "grad_norm": 2.4106350957321805, + "language_loss": 0.74627292, + "learning_rate": 3.506794333933431e-06, + "loss": 0.82419991, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.19799805, + "step": 4184, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.0652144, + "auxiliary_loss_mlp": 0.01279322, + "balance_loss_clip": 0.06299628, + "balance_loss_mlp": 0.01258496, + "epoch": 0.25161581241545167, + "flos": 22170022133760.0, + "grad_norm": 2.9216799071507964, + "language_loss": 0.83484751, + "learning_rate": 3.506538208705484e-06, + "loss": 0.91285515, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.20837402, + "step": 4185, + "time_per_iteration": 2.5535552501678467 + }, + { + "auxiliary_loss_clip": 0.06393237, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06284703, + "balance_loss_mlp": 0.01258632, + "epoch": 0.25167593566811963, + "flos": 69375936873600.0, + "grad_norm": 0.7619629684954553, + "language_loss": 0.61517715, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.69173163, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.03567505, + "step": 4186, + "time_per_iteration": 3.0749270915985107 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06296861, + "balance_loss_mlp": 0.01256946, + "epoch": 0.2517360589207876, + "flos": 13266533715840.0, + "grad_norm": 1.9855339768496567, + "language_loss": 0.79795682, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.87589443, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.1940918, + "step": 4187, + "time_per_iteration": 2.507354974746704 + }, + { + "auxiliary_loss_clip": 0.06517795, + "auxiliary_loss_mlp": 0.01276527, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01257001, + "epoch": 0.25179618217345556, + "flos": 20383208542080.0, + "grad_norm": 1.642205422551737, + "language_loss": 0.80147833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87942159, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4188, + "time_per_iteration": 2.5763680934906006 + }, + { + "auxiliary_loss_clip": 0.06512115, + "auxiliary_loss_mlp": 0.01281194, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261405, + "epoch": 0.25185630542612353, + "flos": 27670767770880.0, + "grad_norm": 1.9118309511671905, + "language_loss": 0.75198257, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.8299157, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 4189, + "time_per_iteration": 2.5764901638031006 + }, + { + "auxiliary_loss_clip": 0.06511948, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06296545, + "balance_loss_mlp": 0.01255253, + "epoch": 0.25191642867879155, + "flos": 21002805158400.0, + "grad_norm": 1.9652552730181423, + "language_loss": 0.84938216, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92722976, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17565918, + "step": 4190, + "time_per_iteration": 2.592289447784424 + }, + { + "auxiliary_loss_clip": 0.06519347, + "auxiliary_loss_mlp": 0.01277887, + "balance_loss_clip": 0.0629743, + "balance_loss_mlp": 0.01256513, + "epoch": 0.2519765519314595, + "flos": 21112027355520.0, + "grad_norm": 3.618444667756858, + "language_loss": 0.7581113, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.83608365, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21386719, + "step": 4191, + "time_per_iteration": 2.526263952255249 + }, + { + "auxiliary_loss_clip": 0.06391463, + "auxiliary_loss_mlp": 0.01256383, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252372, + "epoch": 0.2520366751841275, + "flos": 62765932158720.0, + "grad_norm": 0.7119135795788611, + "language_loss": 0.56952, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.64599848, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0401001, + "step": 4192, + "time_per_iteration": 3.271810531616211 + }, + { + "auxiliary_loss_clip": 0.06513695, + "auxiliary_loss_mlp": 0.01277171, + "balance_loss_clip": 0.06298056, + "balance_loss_mlp": 0.01257835, + "epoch": 0.25209679843679544, + "flos": 22236254386560.0, + "grad_norm": 1.9003966807864532, + "language_loss": 0.77017993, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84808856, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19335938, + "step": 4193, + "time_per_iteration": 2.57377028465271 + }, + { + "auxiliary_loss_clip": 0.06516427, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06290127, + "balance_loss_mlp": 0.01254573, + "epoch": 0.2521569216894634, + "flos": 12171502632960.0, + "grad_norm": 10.029516736128722, + "language_loss": 0.84954166, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.92744958, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19787598, + "step": 4194, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.06517825, + "auxiliary_loss_mlp": 0.01277837, + "balance_loss_clip": 0.06293463, + "balance_loss_mlp": 0.01258668, + "epoch": 0.2522170449421314, + "flos": 23707182885120.0, + "grad_norm": 1.454284137617771, + "language_loss": 0.88584, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.96379662, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19165039, + "step": 4195, + "time_per_iteration": 2.576735734939575 + }, + { + "auxiliary_loss_clip": 0.06516481, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01258258, + "epoch": 0.25227716819479934, + "flos": 20961073025280.0, + "grad_norm": 2.023401186655312, + "language_loss": 0.86073804, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93870831, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.22290039, + "step": 4196, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.06519768, + "auxiliary_loss_mlp": 0.01277786, + "balance_loss_clip": 0.06297043, + "balance_loss_mlp": 0.01258486, + "epoch": 0.2523372914474673, + "flos": 23338077649920.0, + "grad_norm": 1.7735111095668046, + "language_loss": 0.8382597, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.91623521, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19299316, + "step": 4197, + "time_per_iteration": 2.606966018676758 + }, + { + "auxiliary_loss_clip": 0.06523669, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06298, + "balance_loss_mlp": 0.01260898, + "epoch": 0.25239741470013527, + "flos": 36978217522560.0, + "grad_norm": 2.239450775339409, + "language_loss": 0.72922301, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.80727994, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.21130371, + "step": 4198, + "time_per_iteration": 2.6708526611328125 + }, + { + "auxiliary_loss_clip": 0.06527208, + "auxiliary_loss_mlp": 0.012804, + "balance_loss_clip": 0.06297485, + "balance_loss_mlp": 0.01258967, + "epoch": 0.25245753795280323, + "flos": 18521777289600.0, + "grad_norm": 2.0891954597653055, + "language_loss": 0.77475321, + "learning_rate": 3.50294646148888e-06, + "loss": 0.85282922, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.21447754, + "step": 4199, + "time_per_iteration": 3.9535269737243652 + }, + { + "auxiliary_loss_clip": 0.06522667, + "auxiliary_loss_mlp": 0.01277202, + "balance_loss_clip": 0.06296766, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2525176612054712, + "flos": 32353387741440.0, + "grad_norm": 1.7804914051128766, + "language_loss": 0.74169135, + "learning_rate": 3.502689480360739e-06, + "loss": 0.81969011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19714355, + "step": 4200, + "time_per_iteration": 2.637592315673828 + }, + { + "auxiliary_loss_clip": 0.06517747, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06294595, + "balance_loss_mlp": 0.01255602, + "epoch": 0.25257778445813917, + "flos": 45268440307200.0, + "grad_norm": 1.5897560976370495, + "language_loss": 0.82704282, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.90497398, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19775391, + "step": 4201, + "time_per_iteration": 2.740555763244629 + }, + { + "auxiliary_loss_clip": 0.06520839, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.06295383, + "balance_loss_mlp": 0.01259048, + "epoch": 0.25263790771080713, + "flos": 23374526976000.0, + "grad_norm": 1.712909977397354, + "language_loss": 0.75193971, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20361328, + "step": 4202, + "time_per_iteration": 2.55350661277771 + }, + { + "auxiliary_loss_clip": 0.06512797, + "auxiliary_loss_mlp": 0.01277812, + "balance_loss_clip": 0.06294158, + "balance_loss_mlp": 0.01258226, + "epoch": 0.25269803096347515, + "flos": 18520938748800.0, + "grad_norm": 3.10045167794265, + "language_loss": 0.73924601, + "learning_rate": 3.501918195122491e-06, + "loss": 0.81715208, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19592285, + "step": 4203, + "time_per_iteration": 2.539475917816162 + }, + { + "auxiliary_loss_clip": 0.06523657, + "auxiliary_loss_mlp": 0.01272979, + "balance_loss_clip": 0.0629805, + "balance_loss_mlp": 0.01252964, + "epoch": 0.2527581542161431, + "flos": 24617870985600.0, + "grad_norm": 1.4931409888350198, + "language_loss": 0.78306639, + "learning_rate": 3.501660986124297e-06, + "loss": 0.86103272, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20007324, + "step": 4204, + "time_per_iteration": 4.058368682861328 + }, + { + "auxiliary_loss_clip": 0.0651952, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06294288, + "balance_loss_mlp": 0.01258427, + "epoch": 0.2528182774688111, + "flos": 12646266266880.0, + "grad_norm": 2.5678524165435928, + "language_loss": 0.72629768, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.80427349, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19616699, + "step": 4205, + "time_per_iteration": 2.503054618835449 + }, + { + "auxiliary_loss_clip": 0.06508891, + "auxiliary_loss_mlp": 0.01281235, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01264164, + "epoch": 0.25287840072147905, + "flos": 46947331440000.0, + "grad_norm": 1.3326329418173375, + "language_loss": 0.76355231, + "learning_rate": 3.50114639730826e-06, + "loss": 0.84145361, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.17077637, + "step": 4206, + "time_per_iteration": 4.097341537475586 + }, + { + "auxiliary_loss_clip": 0.06516857, + "auxiliary_loss_mlp": 0.01278993, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.0126042, + "epoch": 0.252938523974147, + "flos": 18885641644800.0, + "grad_norm": 1.8849973173990275, + "language_loss": 0.79775047, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.875709, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18579102, + "step": 4207, + "time_per_iteration": 2.545203447341919 + }, + { + "auxiliary_loss_clip": 0.06511112, + "auxiliary_loss_mlp": 0.01280475, + "balance_loss_clip": 0.06293532, + "balance_loss_mlp": 0.01261628, + "epoch": 0.252998647226815, + "flos": 21441245247360.0, + "grad_norm": 1.449056492648579, + "language_loss": 0.76862776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84654361, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18859863, + "step": 4208, + "time_per_iteration": 2.540531873703003 + }, + { + "auxiliary_loss_clip": 0.06512551, + "auxiliary_loss_mlp": 0.01282266, + "balance_loss_clip": 0.06295963, + "balance_loss_mlp": 0.01264098, + "epoch": 0.25305877047948294, + "flos": 25448365128960.0, + "grad_norm": 1.8025422596027827, + "language_loss": 0.70108622, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77903438, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.1817627, + "step": 4209, + "time_per_iteration": 2.586179256439209 + }, + { + "auxiliary_loss_clip": 0.06401253, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06294125, + "balance_loss_mlp": 0.01251663, + "epoch": 0.2531188937321509, + "flos": 60205213457280.0, + "grad_norm": 0.7328516672129679, + "language_loss": 0.55096745, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.62754166, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.0451355, + "step": 4210, + "time_per_iteration": 4.676252841949463 + }, + { + "auxiliary_loss_clip": 0.06515378, + "auxiliary_loss_mlp": 0.01285614, + "balance_loss_clip": 0.06294395, + "balance_loss_mlp": 0.01265861, + "epoch": 0.25317901698481887, + "flos": 19688449213440.0, + "grad_norm": 2.0935195986224837, + "language_loss": 0.81166065, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.88967055, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19763184, + "step": 4211, + "time_per_iteration": 2.5251474380493164 + }, + { + "auxiliary_loss_clip": 0.06513076, + "auxiliary_loss_mlp": 0.01275756, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01258554, + "epoch": 0.25323914023748684, + "flos": 24431012380800.0, + "grad_norm": 1.7184165713115493, + "language_loss": 0.78543985, + "learning_rate": 3.499601265005622e-06, + "loss": 0.86332822, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4212, + "time_per_iteration": 2.609750986099243 + }, + { + "auxiliary_loss_clip": 0.06514729, + "auxiliary_loss_mlp": 0.01278491, + "balance_loss_clip": 0.06293602, + "balance_loss_mlp": 0.0125912, + "epoch": 0.2532992634901548, + "flos": 25454528403840.0, + "grad_norm": 1.862422609084939, + "language_loss": 0.53407073, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.61200291, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19384766, + "step": 4213, + "time_per_iteration": 2.5825159549713135 + }, + { + "auxiliary_loss_clip": 0.06517738, + "auxiliary_loss_mlp": 0.01278881, + "balance_loss_clip": 0.06296406, + "balance_loss_mlp": 0.01259832, + "epoch": 0.25335938674282277, + "flos": 18886605966720.0, + "grad_norm": 2.428420926128805, + "language_loss": 0.65041012, + "learning_rate": 3.499085765880308e-06, + "loss": 0.72837627, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19030762, + "step": 4214, + "time_per_iteration": 2.567539930343628 + }, + { + "auxiliary_loss_clip": 0.06391697, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01253702, + "epoch": 0.25341950999549073, + "flos": 53079692025600.0, + "grad_norm": 0.8253897319773601, + "language_loss": 0.57886475, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.65535849, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.03970337, + "step": 4215, + "time_per_iteration": 2.941021680831909 + }, + { + "auxiliary_loss_clip": 0.06512114, + "auxiliary_loss_mlp": 0.01274398, + "balance_loss_clip": 0.0629489, + "balance_loss_mlp": 0.0125604, + "epoch": 0.2534796332481587, + "flos": 39029609980800.0, + "grad_norm": 1.6071125602920209, + "language_loss": 0.84078032, + "learning_rate": 3.498570039373066e-06, + "loss": 0.9186455, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18359375, + "step": 4216, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.06509562, + "auxiliary_loss_mlp": 0.0127764, + "balance_loss_clip": 0.06290903, + "balance_loss_mlp": 0.01259294, + "epoch": 0.2535397565008267, + "flos": 23593809911040.0, + "grad_norm": 1.7865601815504963, + "language_loss": 0.81036615, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88823819, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.18371582, + "step": 4217, + "time_per_iteration": 2.5606398582458496 + }, + { + "auxiliary_loss_clip": 0.06514265, + "auxiliary_loss_mlp": 0.01279769, + "balance_loss_clip": 0.06294704, + "balance_loss_mlp": 0.01260255, + "epoch": 0.2535998797534947, + "flos": 19287422772480.0, + "grad_norm": 2.529157470409933, + "language_loss": 0.761132, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.83907235, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19519043, + "step": 4218, + "time_per_iteration": 2.623429298400879 + }, + { + "auxiliary_loss_clip": 0.06516235, + "auxiliary_loss_mlp": 0.01282224, + "balance_loss_clip": 0.06296211, + "balance_loss_mlp": 0.01262757, + "epoch": 0.25366000300616265, + "flos": 24031201824000.0, + "grad_norm": 1.721807278316132, + "language_loss": 0.75063616, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.82862079, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19482422, + "step": 4219, + "time_per_iteration": 2.564220428466797 + }, + { + "auxiliary_loss_clip": 0.06520407, + "auxiliary_loss_mlp": 0.0127968, + "balance_loss_clip": 0.06298073, + "balance_loss_mlp": 0.01259713, + "epoch": 0.2537201262588306, + "flos": 16294888454400.0, + "grad_norm": 1.6804083546431516, + "language_loss": 0.81834626, + "learning_rate": 3.497537904525736e-06, + "loss": 0.89634717, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19970703, + "step": 4220, + "time_per_iteration": 2.576335906982422 + }, + { + "auxiliary_loss_clip": 0.0652357, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.06301299, + "balance_loss_mlp": 0.01256936, + "epoch": 0.2537802495114986, + "flos": 23301376761600.0, + "grad_norm": 2.4535775533256796, + "language_loss": 0.71752739, + "learning_rate": 3.497279728822468e-06, + "loss": 0.79551834, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18579102, + "step": 4221, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.06528511, + "auxiliary_loss_mlp": 0.01279389, + "balance_loss_clip": 0.0630452, + "balance_loss_mlp": 0.01259148, + "epoch": 0.25384037276416654, + "flos": 17644855184640.0, + "grad_norm": 1.5017476973585115, + "language_loss": 0.62507772, + "learning_rate": 3.497021496342202e-06, + "loss": 0.70315671, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20239258, + "step": 4222, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.06520825, + "auxiliary_loss_mlp": 0.01278393, + "balance_loss_clip": 0.06297866, + "balance_loss_mlp": 0.0125864, + "epoch": 0.2539004960168345, + "flos": 21513473066880.0, + "grad_norm": 1.6064438591236823, + "language_loss": 0.75066334, + "learning_rate": 3.496763207094731e-06, + "loss": 0.82865554, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19763184, + "step": 4223, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.06514867, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06297616, + "balance_loss_mlp": 0.01260101, + "epoch": 0.2539606192695025, + "flos": 23957632339200.0, + "grad_norm": 1.753259760034452, + "language_loss": 0.80341679, + "learning_rate": 3.49650486108985e-06, + "loss": 0.88134897, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18261719, + "step": 4224, + "time_per_iteration": 2.6002583503723145 + }, + { + "auxiliary_loss_clip": 0.06515887, + "auxiliary_loss_mlp": 0.01281311, + "balance_loss_clip": 0.0629767, + "balance_loss_mlp": 0.01261999, + "epoch": 0.25402074252217044, + "flos": 24176537930880.0, + "grad_norm": 1.4707313275482783, + "language_loss": 0.78211224, + "learning_rate": 3.496246458337354e-06, + "loss": 0.8600843, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19299316, + "step": 4225, + "time_per_iteration": 2.5527138710021973 + }, + { + "auxiliary_loss_clip": 0.06521728, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01263013, + "epoch": 0.2540808657748384, + "flos": 22309320746880.0, + "grad_norm": 1.6188569007516582, + "language_loss": 0.85543132, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.93347526, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.1965332, + "step": 4226, + "time_per_iteration": 2.5676872730255127 + }, + { + "auxiliary_loss_clip": 0.06515788, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 0.06296097, + "balance_loss_mlp": 0.01258883, + "epoch": 0.25414098902750637, + "flos": 27606883432320.0, + "grad_norm": 1.6805883261517605, + "language_loss": 0.71414381, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79207766, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18713379, + "step": 4227, + "time_per_iteration": 2.5918691158294678 + }, + { + "auxiliary_loss_clip": 0.06387169, + "auxiliary_loss_mlp": 0.01261576, + "balance_loss_clip": 0.06279954, + "balance_loss_mlp": 0.01257166, + "epoch": 0.25420111228017434, + "flos": 58188760951680.0, + "grad_norm": 0.9697801274632529, + "language_loss": 0.61857057, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.69505799, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04418945, + "step": 4228, + "time_per_iteration": 3.01169490814209 + }, + { + "auxiliary_loss_clip": 0.06514917, + "auxiliary_loss_mlp": 0.01279347, + "balance_loss_clip": 0.0629469, + "balance_loss_mlp": 0.01258235, + "epoch": 0.2542612355328423, + "flos": 11467645136640.0, + "grad_norm": 2.3876652287650577, + "language_loss": 0.8721081, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.95005071, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21130371, + "step": 4229, + "time_per_iteration": 2.5960769653320312 + }, + { + "auxiliary_loss_clip": 0.06519967, + "auxiliary_loss_mlp": 0.01277589, + "balance_loss_clip": 0.06299049, + "balance_loss_mlp": 0.01257836, + "epoch": 0.2543213587855103, + "flos": 22972452359040.0, + "grad_norm": 2.100172466954555, + "language_loss": 0.78119314, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85916877, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19775391, + "step": 4230, + "time_per_iteration": 2.5483899116516113 + }, + { + "auxiliary_loss_clip": 0.06511904, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 0.06292608, + "balance_loss_mlp": 0.01257622, + "epoch": 0.2543814820381783, + "flos": 18257953109760.0, + "grad_norm": 2.00545114565419, + "language_loss": 0.75687885, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.83477509, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4231, + "time_per_iteration": 2.566326379776001 + }, + { + "auxiliary_loss_clip": 0.06520282, + "auxiliary_loss_mlp": 0.01278584, + "balance_loss_clip": 0.06300422, + "balance_loss_mlp": 0.01259761, + "epoch": 0.25444160529084625, + "flos": 15638129752320.0, + "grad_norm": 1.7887257039808522, + "language_loss": 0.74637282, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82436144, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18823242, + "step": 4232, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0652221, + "auxiliary_loss_mlp": 0.01293975, + "balance_loss_clip": 0.06303085, + "balance_loss_mlp": 0.01272947, + "epoch": 0.2545017285435142, + "flos": 24607431152640.0, + "grad_norm": 1.8617746927090988, + "language_loss": 0.87183899, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.95000088, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21032715, + "step": 4233, + "time_per_iteration": 2.6281485557556152 + }, + { + "auxiliary_loss_clip": 0.06505871, + "auxiliary_loss_mlp": 0.01278753, + "balance_loss_clip": 0.06294682, + "balance_loss_mlp": 0.01260442, + "epoch": 0.2545618517961822, + "flos": 24685654538880.0, + "grad_norm": 1.601433299567329, + "language_loss": 0.75604707, + "learning_rate": 3.493918281539737e-06, + "loss": 0.8338933, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18322754, + "step": 4234, + "time_per_iteration": 2.596642017364502 + }, + { + "auxiliary_loss_clip": 0.06514844, + "auxiliary_loss_mlp": 0.01287463, + "balance_loss_clip": 0.06292339, + "balance_loss_mlp": 0.01268938, + "epoch": 0.25462197504885015, + "flos": 23921937699840.0, + "grad_norm": 1.4560099290474922, + "language_loss": 0.75372213, + "learning_rate": 3.493659311850379e-06, + "loss": 0.83174521, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18518066, + "step": 4235, + "time_per_iteration": 2.592942953109741 + }, + { + "auxiliary_loss_clip": 0.06532556, + "auxiliary_loss_mlp": 0.01283911, + "balance_loss_clip": 0.06299181, + "balance_loss_mlp": 0.01261797, + "epoch": 0.2546820983015181, + "flos": 24796134547200.0, + "grad_norm": 1.9414760170646592, + "language_loss": 0.65519691, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.73336154, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22131348, + "step": 4236, + "time_per_iteration": 2.5583407878875732 + }, + { + "auxiliary_loss_clip": 0.06512251, + "auxiliary_loss_mlp": 0.01281938, + "balance_loss_clip": 0.06294776, + "balance_loss_mlp": 0.01262984, + "epoch": 0.2547422215541861, + "flos": 18740095902720.0, + "grad_norm": 1.5016735811799797, + "language_loss": 0.678509, + "learning_rate": 3.493141202562354e-06, + "loss": 0.75645095, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18945312, + "step": 4237, + "time_per_iteration": 2.5650389194488525 + }, + { + "auxiliary_loss_clip": 0.0651492, + "auxiliary_loss_mlp": 0.01282053, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01261394, + "epoch": 0.25480234480685404, + "flos": 21038751360000.0, + "grad_norm": 2.061881611294133, + "language_loss": 0.75628269, + "learning_rate": 3.492882062983333e-06, + "loss": 0.83425242, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20654297, + "step": 4238, + "time_per_iteration": 2.529883861541748 + }, + { + "auxiliary_loss_clip": 0.06513957, + "auxiliary_loss_mlp": 0.0127785, + "balance_loss_clip": 0.06292559, + "balance_loss_mlp": 0.01258287, + "epoch": 0.254862468059522, + "flos": 25089112748160.0, + "grad_norm": 1.8905919191970875, + "language_loss": 0.81253731, + "learning_rate": 3.492622866794074e-06, + "loss": 0.89045537, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19555664, + "step": 4239, + "time_per_iteration": 4.02100944519043 + }, + { + "auxiliary_loss_clip": 0.06508629, + "auxiliary_loss_mlp": 0.01294237, + "balance_loss_clip": 0.06291452, + "balance_loss_mlp": 0.01273471, + "epoch": 0.25492259131219, + "flos": 20564658558720.0, + "grad_norm": 1.7183169382614727, + "language_loss": 0.7800405, + "learning_rate": 3.492363614004407e-06, + "loss": 0.85806918, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2076416, + "step": 4240, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.06515411, + "auxiliary_loss_mlp": 0.01282684, + "balance_loss_clip": 0.06290809, + "balance_loss_mlp": 0.01262037, + "epoch": 0.25498271456485794, + "flos": 25048889988480.0, + "grad_norm": 1.7684080721058644, + "language_loss": 0.83764112, + "learning_rate": 3.492104304624162e-06, + "loss": 0.915622, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.20629883, + "step": 4241, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.06511963, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 0.06292334, + "balance_loss_mlp": 0.01262676, + "epoch": 0.2550428378175259, + "flos": 26185820912640.0, + "grad_norm": 1.7847215082139707, + "language_loss": 0.73873413, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.81667781, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4242, + "time_per_iteration": 2.6289515495300293 + }, + { + "auxiliary_loss_clip": 0.06517772, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 0.06296564, + "balance_loss_mlp": 0.01260398, + "epoch": 0.2551029610701939, + "flos": 15272420607360.0, + "grad_norm": 2.4567533637161896, + "language_loss": 0.72771823, + "learning_rate": 3.491585516131273e-06, + "loss": 0.80569565, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19567871, + "step": 4243, + "time_per_iteration": 3.9432499408721924 + }, + { + "auxiliary_loss_clip": 0.06515735, + "auxiliary_loss_mlp": 0.0127996, + "balance_loss_clip": 0.06295779, + "balance_loss_mlp": 0.01260195, + "epoch": 0.2551630843228619, + "flos": 18117774028800.0, + "grad_norm": 1.7474968125895491, + "language_loss": 0.82239074, + "learning_rate": 3.491326037038301e-06, + "loss": 0.90034771, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4244, + "time_per_iteration": 2.6024672985076904 + }, + { + "auxiliary_loss_clip": 0.06397872, + "auxiliary_loss_mlp": 0.01258297, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01253388, + "epoch": 0.25522320757552985, + "flos": 70543055266560.0, + "grad_norm": 0.6771353060664416, + "language_loss": 0.57579219, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.65235388, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04904175, + "step": 4245, + "time_per_iteration": 4.687421083450317 + }, + { + "auxiliary_loss_clip": 0.06516664, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 0.06290803, + "balance_loss_mlp": 0.01256628, + "epoch": 0.2552833308281978, + "flos": 22899679488000.0, + "grad_norm": 2.827648139992037, + "language_loss": 0.65781415, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20593262, + "step": 4246, + "time_per_iteration": 2.542945384979248 + }, + { + "auxiliary_loss_clip": 0.06504452, + "auxiliary_loss_mlp": 0.01278422, + "balance_loss_clip": 0.06290503, + "balance_loss_mlp": 0.01258455, + "epoch": 0.2553434540808658, + "flos": 22060003322880.0, + "grad_norm": 2.2137811054544003, + "language_loss": 0.82470047, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.90252924, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19970703, + "step": 4247, + "time_per_iteration": 2.5786685943603516 + }, + { + "auxiliary_loss_clip": 0.06521233, + "auxiliary_loss_mlp": 0.01279993, + "balance_loss_clip": 0.062906, + "balance_loss_mlp": 0.01257271, + "epoch": 0.25540357733353375, + "flos": 16549656393600.0, + "grad_norm": 2.135954108256579, + "language_loss": 0.83991635, + "learning_rate": 3.490287555252514e-06, + "loss": 0.91792852, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.22729492, + "step": 4248, + "time_per_iteration": 2.5408127307891846 + }, + { + "auxiliary_loss_clip": 0.06511332, + "auxiliary_loss_mlp": 0.01273979, + "balance_loss_clip": 0.062884, + "balance_loss_mlp": 0.01253773, + "epoch": 0.2554637005862017, + "flos": 17570531013120.0, + "grad_norm": 2.3193810219262585, + "language_loss": 0.84631854, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92417163, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.20202637, + "step": 4249, + "time_per_iteration": 4.003984212875366 + }, + { + "auxiliary_loss_clip": 0.06380495, + "auxiliary_loss_mlp": 0.01253384, + "balance_loss_clip": 0.06276014, + "balance_loss_mlp": 0.01249388, + "epoch": 0.2555238238388697, + "flos": 72263441698560.0, + "grad_norm": 0.7365466774710785, + "language_loss": 0.56168175, + "learning_rate": 3.489767975249115e-06, + "loss": 0.63802058, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03994751, + "step": 4250, + "time_per_iteration": 3.169614553451538 + }, + { + "auxiliary_loss_clip": 0.06511974, + "auxiliary_loss_mlp": 0.01277356, + "balance_loss_clip": 0.06289789, + "balance_loss_mlp": 0.01255433, + "epoch": 0.25558394709153764, + "flos": 24396323990400.0, + "grad_norm": 2.4378887831258527, + "language_loss": 0.81129342, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.88918668, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21936035, + "step": 4251, + "time_per_iteration": 2.576631784439087 + }, + { + "auxiliary_loss_clip": 0.06382731, + "auxiliary_loss_mlp": 0.01258719, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01254794, + "epoch": 0.2556440703442056, + "flos": 69251857776000.0, + "grad_norm": 0.7756464213587903, + "language_loss": 0.66132653, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.73774105, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03921509, + "step": 4252, + "time_per_iteration": 3.2080140113830566 + }, + { + "auxiliary_loss_clip": 0.06505658, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 0.06288829, + "balance_loss_mlp": 0.01255922, + "epoch": 0.2557041935968736, + "flos": 24870919916160.0, + "grad_norm": 1.8769862610793295, + "language_loss": 0.74028432, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.81808746, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18737793, + "step": 4253, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.06509089, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.01261746, + "epoch": 0.25576431684954154, + "flos": 22498694974080.0, + "grad_norm": 4.507455095580577, + "language_loss": 0.742535, + "learning_rate": 3.488728137415357e-06, + "loss": 0.82045132, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20800781, + "step": 4254, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.0651402, + "auxiliary_loss_mlp": 0.0127796, + "balance_loss_clip": 0.06292839, + "balance_loss_mlp": 0.01257253, + "epoch": 0.2558244401022095, + "flos": 19832569436160.0, + "grad_norm": 1.7853658258569405, + "language_loss": 0.81599152, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20703125, + "step": 4255, + "time_per_iteration": 2.5198400020599365 + }, + { + "auxiliary_loss_clip": 0.06507239, + "auxiliary_loss_mlp": 0.01282593, + "balance_loss_clip": 0.06290218, + "balance_loss_mlp": 0.01262304, + "epoch": 0.2558845633548775, + "flos": 23226968736000.0, + "grad_norm": 1.3889535500711463, + "language_loss": 0.85781598, + "learning_rate": 3.488207879742721e-06, + "loss": 0.93571424, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20275879, + "step": 4256, + "time_per_iteration": 2.6466193199157715 + }, + { + "auxiliary_loss_clip": 0.06518268, + "auxiliary_loss_mlp": 0.01279996, + "balance_loss_clip": 0.06292354, + "balance_loss_mlp": 0.01259432, + "epoch": 0.2559446866075455, + "flos": 16843682770560.0, + "grad_norm": 2.0395659723156814, + "language_loss": 0.75505483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.83303738, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20556641, + "step": 4257, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.06380453, + "auxiliary_loss_mlp": 0.01254162, + "balance_loss_clip": 0.06277193, + "balance_loss_mlp": 0.01249772, + "epoch": 0.25600480986021346, + "flos": 57612741258240.0, + "grad_norm": 0.7838298602570629, + "language_loss": 0.65205377, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.72839993, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04397583, + "step": 4258, + "time_per_iteration": 3.1310055255889893 + }, + { + "auxiliary_loss_clip": 0.06504042, + "auxiliary_loss_mlp": 0.01278745, + "balance_loss_clip": 0.06291071, + "balance_loss_mlp": 0.01257192, + "epoch": 0.2560649331128814, + "flos": 27827088762240.0, + "grad_norm": 1.6413095395992356, + "language_loss": 0.76769841, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2154541, + "step": 4259, + "time_per_iteration": 2.6200387477874756 + }, + { + "auxiliary_loss_clip": 0.06386054, + "auxiliary_loss_mlp": 0.01255029, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01250824, + "epoch": 0.2561250563655494, + "flos": 70972187552640.0, + "grad_norm": 0.7732791072218576, + "language_loss": 0.58378285, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.66019368, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04208374, + "step": 4260, + "time_per_iteration": 3.2671031951904297 + }, + { + "auxiliary_loss_clip": 0.06510498, + "auxiliary_loss_mlp": 0.01277826, + "balance_loss_clip": 0.06290598, + "balance_loss_mlp": 0.0125824, + "epoch": 0.25618517961821735, + "flos": 27018998386560.0, + "grad_norm": 1.6762593333812295, + "language_loss": 0.77063274, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84851599, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19580078, + "step": 4261, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.06510883, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261534, + "epoch": 0.2562453028708853, + "flos": 23073708418560.0, + "grad_norm": 1.5026397479094624, + "language_loss": 0.83196223, + "learning_rate": 3.486645752648842e-06, + "loss": 0.90988725, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20080566, + "step": 4262, + "time_per_iteration": 2.606386661529541 + }, + { + "auxiliary_loss_clip": 0.06520962, + "auxiliary_loss_mlp": 0.01278022, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.0125778, + "epoch": 0.2563054261235533, + "flos": 15126120178560.0, + "grad_norm": 2.976746783245639, + "language_loss": 0.7460134, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.82400322, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20239258, + "step": 4263, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.06511976, + "auxiliary_loss_mlp": 0.01275308, + "balance_loss_clip": 0.0629802, + "balance_loss_mlp": 0.01256008, + "epoch": 0.25636554937622125, + "flos": 27862238350080.0, + "grad_norm": 1.7189236473805392, + "language_loss": 0.83209884, + "learning_rate": 3.486124592522163e-06, + "loss": 0.90997171, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19299316, + "step": 4264, + "time_per_iteration": 2.5768978595733643 + }, + { + "auxiliary_loss_clip": 0.06522107, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06300539, + "balance_loss_mlp": 0.01255403, + "epoch": 0.2564256726288892, + "flos": 28912979750400.0, + "grad_norm": 2.7518222985569247, + "language_loss": 0.75264466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20202637, + "step": 4265, + "time_per_iteration": 2.6022770404815674 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01276084, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01256701, + "epoch": 0.2564857958815572, + "flos": 18520812967680.0, + "grad_norm": 2.7205564726060754, + "language_loss": 0.82059085, + "learning_rate": 3.485603206979513e-06, + "loss": 0.89849925, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19396973, + "step": 4266, + "time_per_iteration": 2.5768039226531982 + }, + { + "auxiliary_loss_clip": 0.06513181, + "auxiliary_loss_mlp": 0.01282165, + "balance_loss_clip": 0.06295994, + "balance_loss_mlp": 0.01263199, + "epoch": 0.25654591913422514, + "flos": 25814745106560.0, + "grad_norm": 2.256505464235654, + "language_loss": 0.79590619, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8738597, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.1895752, + "step": 4267, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.06512932, + "auxiliary_loss_mlp": 0.01282882, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01263439, + "epoch": 0.2566060423868931, + "flos": 19105805047680.0, + "grad_norm": 1.7450924080459818, + "language_loss": 0.79543281, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.87339091, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19421387, + "step": 4268, + "time_per_iteration": 2.532245635986328 + }, + { + "auxiliary_loss_clip": 0.06515032, + "auxiliary_loss_mlp": 0.01281336, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01261166, + "epoch": 0.25666616563956113, + "flos": 23849584099200.0, + "grad_norm": 1.6329297187056233, + "language_loss": 0.69106698, + "learning_rate": 3.484820706183595e-06, + "loss": 0.76903057, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.20153809, + "step": 4269, + "time_per_iteration": 2.7064032554626465 + }, + { + "auxiliary_loss_clip": 0.06520134, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01259016, + "epoch": 0.2567262888922291, + "flos": 14608366600320.0, + "grad_norm": 2.976489070793836, + "language_loss": 0.79361498, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8716023, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19580078, + "step": 4270, + "time_per_iteration": 2.5247366428375244 + }, + { + "auxiliary_loss_clip": 0.06528008, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.0630113, + "balance_loss_mlp": 0.0125899, + "epoch": 0.25678641214489706, + "flos": 32930791027200.0, + "grad_norm": 2.0785991894062104, + "language_loss": 0.68438745, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.76248461, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22717285, + "step": 4271, + "time_per_iteration": 2.6327364444732666 + }, + { + "auxiliary_loss_clip": 0.06521121, + "auxiliary_loss_mlp": 0.01277495, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01256395, + "epoch": 0.256846535397565, + "flos": 24106029120000.0, + "grad_norm": 1.3298745054932861, + "language_loss": 0.87827712, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.9562633, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2109375, + "step": 4272, + "time_per_iteration": 2.5886576175689697 + }, + { + "auxiliary_loss_clip": 0.06520741, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 0.06299604, + "balance_loss_mlp": 0.01256204, + "epoch": 0.256906658650233, + "flos": 19724437342080.0, + "grad_norm": 1.6471317846086577, + "language_loss": 0.8228811, + "learning_rate": 3.483776583571541e-06, + "loss": 0.90087312, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.22253418, + "step": 4273, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.06513067, + "auxiliary_loss_mlp": 0.0127658, + "balance_loss_clip": 0.06299708, + "balance_loss_mlp": 0.01257638, + "epoch": 0.25696678190290095, + "flos": 22932019964160.0, + "grad_norm": 1.4706338186359442, + "language_loss": 0.77439249, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.85228896, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18933105, + "step": 4274, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06508841, + "auxiliary_loss_mlp": 0.01274973, + "balance_loss_clip": 0.06295496, + "balance_loss_mlp": 0.0125435, + "epoch": 0.2570269051555689, + "flos": 27315163042560.0, + "grad_norm": 1.5809391622925344, + "language_loss": 0.84101403, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91885215, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20629883, + "step": 4275, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.0652002, + "auxiliary_loss_mlp": 0.01273541, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01252965, + "epoch": 0.2570870284082369, + "flos": 27570811449600.0, + "grad_norm": 2.3295240533415016, + "language_loss": 0.78590673, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.86384231, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4276, + "time_per_iteration": 2.631866216659546 + }, + { + "auxiliary_loss_clip": 0.06515533, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06298599, + "balance_loss_mlp": 0.01260237, + "epoch": 0.25714715166090485, + "flos": 28738405768320.0, + "grad_norm": 1.6396366021430353, + "language_loss": 0.79803967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8759945, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19714355, + "step": 4277, + "time_per_iteration": 2.5990161895751953 + }, + { + "auxiliary_loss_clip": 0.06513472, + "auxiliary_loss_mlp": 0.01272259, + "balance_loss_clip": 0.06296529, + "balance_loss_mlp": 0.01254377, + "epoch": 0.2572072749135728, + "flos": 20121606495360.0, + "grad_norm": 1.9596681746733369, + "language_loss": 0.78998482, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17883301, + "step": 4278, + "time_per_iteration": 4.02304744720459 + }, + { + "auxiliary_loss_clip": 0.06522302, + "auxiliary_loss_mlp": 0.01278536, + "balance_loss_clip": 0.06301469, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2572673981662408, + "flos": 26037969183360.0, + "grad_norm": 2.3063853220673067, + "language_loss": 0.75400203, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83201039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21618652, + "step": 4279, + "time_per_iteration": 2.5523123741149902 + }, + { + "auxiliary_loss_clip": 0.06516609, + "auxiliary_loss_mlp": 0.0128394, + "balance_loss_clip": 0.06297271, + "balance_loss_mlp": 0.01262721, + "epoch": 0.25732752141890874, + "flos": 16112054845440.0, + "grad_norm": 3.423283610494841, + "language_loss": 0.85997081, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.9379763, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2121582, + "step": 4280, + "time_per_iteration": 2.5104546546936035 + }, + { + "auxiliary_loss_clip": 0.06517641, + "auxiliary_loss_mlp": 0.01282108, + "balance_loss_clip": 0.06295675, + "balance_loss_mlp": 0.0126133, + "epoch": 0.2573876446715767, + "flos": 22530322690560.0, + "grad_norm": 2.5830483171875955, + "language_loss": 0.78735828, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.86535579, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20788574, + "step": 4281, + "time_per_iteration": 2.511723279953003 + }, + { + "auxiliary_loss_clip": 0.06512952, + "auxiliary_loss_mlp": 0.01285256, + "balance_loss_clip": 0.06294534, + "balance_loss_mlp": 0.01264048, + "epoch": 0.2574477679242447, + "flos": 23957548485120.0, + "grad_norm": 1.8266556980022217, + "language_loss": 0.87782013, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.9558022, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.21203613, + "step": 4282, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06509817, + "auxiliary_loss_mlp": 0.0128236, + "balance_loss_clip": 0.06291438, + "balance_loss_mlp": 0.01262905, + "epoch": 0.2575078911769127, + "flos": 21988278627840.0, + "grad_norm": 1.3881538001933933, + "language_loss": 0.71042287, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.78834462, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19470215, + "step": 4283, + "time_per_iteration": 3.9826109409332275 + }, + { + "auxiliary_loss_clip": 0.06500088, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 0.06290558, + "balance_loss_mlp": 0.01271051, + "epoch": 0.25756801442958066, + "flos": 21951997009920.0, + "grad_norm": 1.9398744879334104, + "language_loss": 0.80991805, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.88781703, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18762207, + "step": 4284, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.06508928, + "auxiliary_loss_mlp": 0.01294414, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01274923, + "epoch": 0.2576281376822486, + "flos": 35270675493120.0, + "grad_norm": 2.158245566426343, + "language_loss": 0.70814562, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.78617907, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19494629, + "step": 4285, + "time_per_iteration": 4.088344097137451 + }, + { + "auxiliary_loss_clip": 0.06504595, + "auxiliary_loss_mlp": 0.0128171, + "balance_loss_clip": 0.06288387, + "balance_loss_mlp": 0.01262505, + "epoch": 0.2576882609349166, + "flos": 14136705567360.0, + "grad_norm": 1.771877130646751, + "language_loss": 0.58818436, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.66604745, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.1920166, + "step": 4286, + "time_per_iteration": 2.5344176292419434 + }, + { + "auxiliary_loss_clip": 0.0650837, + "auxiliary_loss_mlp": 0.01278621, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01259118, + "epoch": 0.25774838418758456, + "flos": 23265053216640.0, + "grad_norm": 2.057811055203196, + "language_loss": 0.6464054, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72427529, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19494629, + "step": 4287, + "time_per_iteration": 2.5958328247070312 + }, + { + "auxiliary_loss_clip": 0.0650748, + "auxiliary_loss_mlp": 0.01286721, + "balance_loss_clip": 0.06287187, + "balance_loss_mlp": 0.01265824, + "epoch": 0.2578085074402525, + "flos": 22608378368640.0, + "grad_norm": 1.9946373780944937, + "language_loss": 0.7222265, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.80016851, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2088623, + "step": 4288, + "time_per_iteration": 2.5767109394073486 + }, + { + "auxiliary_loss_clip": 0.06504134, + "auxiliary_loss_mlp": 0.01288175, + "balance_loss_clip": 0.06288374, + "balance_loss_mlp": 0.01268851, + "epoch": 0.2578686306929205, + "flos": 24578780256000.0, + "grad_norm": 1.4737569046844996, + "language_loss": 0.77657092, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.85449398, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1932373, + "step": 4289, + "time_per_iteration": 3.9734480381011963 + }, + { + "auxiliary_loss_clip": 0.0651005, + "auxiliary_loss_mlp": 0.01285951, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012641, + "epoch": 0.25792875394558845, + "flos": 18119828453760.0, + "grad_norm": 2.192134211179858, + "language_loss": 0.8580482, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.93600821, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.21838379, + "step": 4290, + "time_per_iteration": 2.5564229488372803 + }, + { + "auxiliary_loss_clip": 0.0651224, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 0.06293762, + "balance_loss_mlp": 0.01263573, + "epoch": 0.2579888771982564, + "flos": 17718760085760.0, + "grad_norm": 2.0247866667145344, + "language_loss": 0.73390263, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81186378, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.20300293, + "step": 4291, + "time_per_iteration": 2.497671365737915 + }, + { + "auxiliary_loss_clip": 0.06508101, + "auxiliary_loss_mlp": 0.01275245, + "balance_loss_clip": 0.06287237, + "balance_loss_mlp": 0.01255647, + "epoch": 0.2580490004509244, + "flos": 16440350342400.0, + "grad_norm": 2.23272675200871, + "language_loss": 0.82139969, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.8992331, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19604492, + "step": 4292, + "time_per_iteration": 2.5467498302459717 + }, + { + "auxiliary_loss_clip": 0.06505652, + "auxiliary_loss_mlp": 0.01282583, + "balance_loss_clip": 0.06289525, + "balance_loss_mlp": 0.01262532, + "epoch": 0.25810912370359235, + "flos": 33842946574080.0, + "grad_norm": 1.9023591833174374, + "language_loss": 0.67644775, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.7543301, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20043945, + "step": 4293, + "time_per_iteration": 2.626880168914795 + }, + { + "auxiliary_loss_clip": 0.06507371, + "auxiliary_loss_mlp": 0.01275889, + "balance_loss_clip": 0.06295517, + "balance_loss_mlp": 0.01257244, + "epoch": 0.2581692469562603, + "flos": 25199257340160.0, + "grad_norm": 2.9603548878770387, + "language_loss": 0.76158464, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83941722, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18640137, + "step": 4294, + "time_per_iteration": 2.5711581707000732 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06293358, + "balance_loss_mlp": 0.01257866, + "epoch": 0.2582293702089283, + "flos": 34940619060480.0, + "grad_norm": 2.382767918587226, + "language_loss": 0.81769538, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8955487, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1920166, + "step": 4295, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.06519823, + "auxiliary_loss_mlp": 0.01276702, + "balance_loss_clip": 0.06295969, + "balance_loss_mlp": 0.01256496, + "epoch": 0.2582894934615963, + "flos": 26841028314240.0, + "grad_norm": 1.964012337767824, + "language_loss": 0.72949934, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80746454, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.20214844, + "step": 4296, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06514452, + "auxiliary_loss_mlp": 0.01277621, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.0125732, + "epoch": 0.25834961671426426, + "flos": 23522252924160.0, + "grad_norm": 1.7245670135783875, + "language_loss": 0.87440747, + "learning_rate": 3.477492965085067e-06, + "loss": 0.95232815, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20288086, + "step": 4297, + "time_per_iteration": 2.5871896743774414 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01260558, + "epoch": 0.25840973996693223, + "flos": 22456837059840.0, + "grad_norm": 2.9037965134923076, + "language_loss": 0.84894854, + "learning_rate": 3.477230446361943e-06, + "loss": 0.9268465, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.18469238, + "step": 4298, + "time_per_iteration": 2.5290613174438477 + }, + { + "auxiliary_loss_clip": 0.06510766, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 0.06292143, + "balance_loss_mlp": 0.01256158, + "epoch": 0.2584698632196002, + "flos": 11295544849920.0, + "grad_norm": 2.12928453409433, + "language_loss": 0.83727312, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.91514087, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.1986084, + "step": 4299, + "time_per_iteration": 2.5314571857452393 + }, + { + "auxiliary_loss_clip": 0.06506392, + "auxiliary_loss_mlp": 0.01272204, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01253214, + "epoch": 0.25852998647226816, + "flos": 17935569325440.0, + "grad_norm": 2.08690605682093, + "language_loss": 0.83303946, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91082543, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18981934, + "step": 4300, + "time_per_iteration": 2.494170904159546 + }, + { + "auxiliary_loss_clip": 0.06507458, + "auxiliary_loss_mlp": 0.01272704, + "balance_loss_clip": 0.06287713, + "balance_loss_mlp": 0.01254012, + "epoch": 0.2585901097249361, + "flos": 33264620893440.0, + "grad_norm": 3.3706811216639307, + "language_loss": 0.67941749, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.75721914, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18688965, + "step": 4301, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.06512292, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 0.06289004, + "balance_loss_mlp": 0.01257009, + "epoch": 0.2586502329776041, + "flos": 18447033847680.0, + "grad_norm": 2.7819934823512282, + "language_loss": 0.83073664, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.90861952, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18994141, + "step": 4302, + "time_per_iteration": 2.5102365016937256 + }, + { + "auxiliary_loss_clip": 0.06508462, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01257358, + "epoch": 0.25871035623027205, + "flos": 17973989222400.0, + "grad_norm": 1.7107484291097332, + "language_loss": 0.91874599, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99659652, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.19238281, + "step": 4303, + "time_per_iteration": 2.5386602878570557 + }, + { + "auxiliary_loss_clip": 0.06508803, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.0629281, + "balance_loss_mlp": 0.01258569, + "epoch": 0.25877047948294, + "flos": 27784392307200.0, + "grad_norm": 1.7938003883067368, + "language_loss": 0.67601281, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75387681, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19018555, + "step": 4304, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06507856, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 0.06286401, + "balance_loss_mlp": 0.01257477, + "epoch": 0.258830602735608, + "flos": 27133209901440.0, + "grad_norm": 2.1929382614593242, + "language_loss": 0.73436916, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1940918, + "step": 4305, + "time_per_iteration": 2.5877888202667236 + }, + { + "auxiliary_loss_clip": 0.06515621, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01255992, + "epoch": 0.25889072598827595, + "flos": 17896730158080.0, + "grad_norm": 1.8662067033328453, + "language_loss": 0.76418924, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.84211433, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20898438, + "step": 4306, + "time_per_iteration": 2.482933282852173 + }, + { + "auxiliary_loss_clip": 0.06403579, + "auxiliary_loss_mlp": 0.01258203, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01253566, + "epoch": 0.2589508492409439, + "flos": 53951582885760.0, + "grad_norm": 0.8023409981232837, + "language_loss": 0.56592381, + "learning_rate": 3.474865258296403e-06, + "loss": 0.64254159, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.04629517, + "step": 4307, + "time_per_iteration": 3.1265084743499756 + }, + { + "auxiliary_loss_clip": 0.06500413, + "auxiliary_loss_mlp": 0.0127407, + "balance_loss_clip": 0.06289256, + "balance_loss_mlp": 0.01256105, + "epoch": 0.2590109724936119, + "flos": 22132063434240.0, + "grad_norm": 1.735104377472534, + "language_loss": 0.71851504, + "learning_rate": 3.474602179854327e-06, + "loss": 0.79625988, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17956543, + "step": 4308, + "time_per_iteration": 2.5442304611206055 + }, + { + "auxiliary_loss_clip": 0.06513858, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01258993, + "epoch": 0.2590710957462799, + "flos": 13478395564800.0, + "grad_norm": 2.8033587428294657, + "language_loss": 0.84278727, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.92071199, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19628906, + "step": 4309, + "time_per_iteration": 2.546034336090088 + }, + { + "auxiliary_loss_clip": 0.06504438, + "auxiliary_loss_mlp": 0.01276588, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.01258814, + "epoch": 0.25913121899894787, + "flos": 22313219961600.0, + "grad_norm": 1.5400127324827177, + "language_loss": 0.84972912, + "learning_rate": 3.474075855228966e-06, + "loss": 0.92753935, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.17785645, + "step": 4310, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.06511362, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.06293052, + "balance_loss_mlp": 0.01254533, + "epoch": 0.25919134225161583, + "flos": 25818770102400.0, + "grad_norm": 1.8118221315599161, + "language_loss": 0.78088975, + "learning_rate": 3.473812609065639e-06, + "loss": 0.85874081, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19213867, + "step": 4311, + "time_per_iteration": 2.6044604778289795 + }, + { + "auxiliary_loss_clip": 0.06511068, + "auxiliary_loss_mlp": 0.01275144, + "balance_loss_clip": 0.06293963, + "balance_loss_mlp": 0.01256666, + "epoch": 0.2592514655042838, + "flos": 31220314104960.0, + "grad_norm": 4.381167674093932, + "language_loss": 0.73062587, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.80848801, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18469238, + "step": 4312, + "time_per_iteration": 2.587942600250244 + }, + { + "auxiliary_loss_clip": 0.06508243, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 0.06291987, + "balance_loss_mlp": 0.012569, + "epoch": 0.25931158875695176, + "flos": 18480296718720.0, + "grad_norm": 1.7543304647253515, + "language_loss": 0.70305753, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78089976, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.19091797, + "step": 4313, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.06508952, + "auxiliary_loss_mlp": 0.01278616, + "balance_loss_clip": 0.06293979, + "balance_loss_mlp": 0.0125971, + "epoch": 0.2593717120096197, + "flos": 19213895214720.0, + "grad_norm": 1.751562510714179, + "language_loss": 0.81158572, + "learning_rate": 3.473022535292867e-06, + "loss": 0.8894614, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.18908691, + "step": 4314, + "time_per_iteration": 2.5584335327148438 + }, + { + "auxiliary_loss_clip": 0.06515148, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 0.06292658, + "balance_loss_mlp": 0.01257359, + "epoch": 0.2594318352622877, + "flos": 31256050671360.0, + "grad_norm": 1.9178095473181331, + "language_loss": 0.67283171, + "learning_rate": 3.472759065640968e-06, + "loss": 0.7507664, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20959473, + "step": 4315, + "time_per_iteration": 2.6295278072357178 + }, + { + "auxiliary_loss_clip": 0.06506292, + "auxiliary_loss_mlp": 0.01277654, + "balance_loss_clip": 0.06292329, + "balance_loss_mlp": 0.01259463, + "epoch": 0.25949195851495566, + "flos": 22243759326720.0, + "grad_norm": 1.412764147956583, + "language_loss": 0.80242419, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.88026369, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18212891, + "step": 4316, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.06510989, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06290686, + "balance_loss_mlp": 0.01256781, + "epoch": 0.2595520817676236, + "flos": 28083449928960.0, + "grad_norm": 1.6660208675023864, + "language_loss": 0.78127223, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.85915792, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20812988, + "step": 4317, + "time_per_iteration": 2.6210665702819824 + }, + { + "auxiliary_loss_clip": 0.06507257, + "auxiliary_loss_mlp": 0.01281581, + "balance_loss_clip": 0.06291957, + "balance_loss_mlp": 0.01262054, + "epoch": 0.2596122050202916, + "flos": 20196727280640.0, + "grad_norm": 2.4040812102587377, + "language_loss": 0.78420109, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.86208946, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19519043, + "step": 4318, + "time_per_iteration": 3.9600155353546143 + }, + { + "auxiliary_loss_clip": 0.06505568, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 0.06290057, + "balance_loss_mlp": 0.01256637, + "epoch": 0.25967232827295955, + "flos": 22534431540480.0, + "grad_norm": 2.66294558684285, + "language_loss": 0.77022719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.84805143, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20227051, + "step": 4319, + "time_per_iteration": 2.544752836227417 + }, + { + "auxiliary_loss_clip": 0.0650554, + "auxiliary_loss_mlp": 0.01280509, + "balance_loss_clip": 0.06290743, + "balance_loss_mlp": 0.01261555, + "epoch": 0.2597324515256275, + "flos": 21074445999360.0, + "grad_norm": 1.7925219732685136, + "language_loss": 0.77426791, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.85212845, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.18945312, + "step": 4320, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.06508496, + "auxiliary_loss_mlp": 0.01273671, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01254634, + "epoch": 0.2597925747782955, + "flos": 22055810618880.0, + "grad_norm": 1.593385908573569, + "language_loss": 0.71533716, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79315877, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19042969, + "step": 4321, + "time_per_iteration": 2.5314829349517822 + }, + { + "auxiliary_loss_clip": 0.0650996, + "auxiliary_loss_mlp": 0.01274348, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01254011, + "epoch": 0.2598526980309635, + "flos": 19543071179520.0, + "grad_norm": 2.282331155451991, + "language_loss": 0.75262189, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.83046496, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20336914, + "step": 4322, + "time_per_iteration": 2.525724411010742 + }, + { + "auxiliary_loss_clip": 0.06509394, + "auxiliary_loss_mlp": 0.01275417, + "balance_loss_clip": 0.06289983, + "balance_loss_mlp": 0.0125533, + "epoch": 0.25991282128363147, + "flos": 24501521191680.0, + "grad_norm": 2.623736611083137, + "language_loss": 0.7442928, + "learning_rate": 3.470649298767278e-06, + "loss": 0.82214087, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4323, + "time_per_iteration": 3.957674026489258 + }, + { + "auxiliary_loss_clip": 0.06515582, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01258893, + "epoch": 0.25997294453629943, + "flos": 24207410960640.0, + "grad_norm": 1.7976461796423409, + "language_loss": 0.68052149, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.75847143, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20495605, + "step": 4324, + "time_per_iteration": 4.001135349273682 + }, + { + "auxiliary_loss_clip": 0.06505544, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01254608, + "epoch": 0.2600330677889674, + "flos": 31439597040000.0, + "grad_norm": 1.7946989584541546, + "language_loss": 0.71402133, + "learning_rate": 3.470121299177082e-06, + "loss": 0.79180264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1796875, + "step": 4325, + "time_per_iteration": 2.6213603019714355 + }, + { + "auxiliary_loss_clip": 0.06501837, + "auxiliary_loss_mlp": 0.01274613, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01255004, + "epoch": 0.26009319104163536, + "flos": 32274116179200.0, + "grad_norm": 1.826124228611905, + "language_loss": 0.73262805, + "learning_rate": 3.469857215756257e-06, + "loss": 0.81039256, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4326, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.06500994, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06288173, + "balance_loss_mlp": 0.01258051, + "epoch": 0.26015331429430333, + "flos": 26293994933760.0, + "grad_norm": 1.858424121782002, + "language_loss": 0.8722446, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.95002341, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18835449, + "step": 4327, + "time_per_iteration": 2.5950510501861572 + }, + { + "auxiliary_loss_clip": 0.06508228, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06287643, + "balance_loss_mlp": 0.01254271, + "epoch": 0.2602134375469713, + "flos": 21148728243840.0, + "grad_norm": 1.765295937421399, + "language_loss": 0.8100785, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.88790172, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19824219, + "step": 4328, + "time_per_iteration": 3.923682928085327 + }, + { + "auxiliary_loss_clip": 0.06502862, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01258704, + "epoch": 0.26027356079963926, + "flos": 25928411569920.0, + "grad_norm": 1.3948699622732248, + "language_loss": 0.88172936, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.95952845, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18347168, + "step": 4329, + "time_per_iteration": 2.5685267448425293 + }, + { + "auxiliary_loss_clip": 0.06502585, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.0125327, + "epoch": 0.2603336840523072, + "flos": 26366390461440.0, + "grad_norm": 1.8811175805050973, + "language_loss": 0.77705932, + "learning_rate": 3.468800324801802e-06, + "loss": 0.85479975, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18188477, + "step": 4330, + "time_per_iteration": 2.6185224056243896 + }, + { + "auxiliary_loss_clip": 0.06508863, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 0.06289242, + "balance_loss_mlp": 0.0125826, + "epoch": 0.2603938073049752, + "flos": 23520408134400.0, + "grad_norm": 1.5596482888270802, + "language_loss": 0.76200908, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.8398701, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18981934, + "step": 4331, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.06507871, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06292268, + "balance_loss_mlp": 0.01254527, + "epoch": 0.26045393055764315, + "flos": 25381336262400.0, + "grad_norm": 1.426884348550376, + "language_loss": 0.69540298, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.77320385, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.17700195, + "step": 4332, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.06511752, + "auxiliary_loss_mlp": 0.01275479, + "balance_loss_clip": 0.0629351, + "balance_loss_mlp": 0.0125693, + "epoch": 0.2605140538103111, + "flos": 27642494217600.0, + "grad_norm": 1.8844860211449586, + "language_loss": 0.79951644, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.87738872, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.1854248, + "step": 4333, + "time_per_iteration": 2.5523998737335205 + }, + { + "auxiliary_loss_clip": 0.06501235, + "auxiliary_loss_mlp": 0.01272154, + "balance_loss_clip": 0.06290703, + "balance_loss_mlp": 0.01254714, + "epoch": 0.2605741770629791, + "flos": 13774602147840.0, + "grad_norm": 1.6726919145500945, + "language_loss": 0.81128466, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8890186, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.17443848, + "step": 4334, + "time_per_iteration": 2.522210121154785 + }, + { + "auxiliary_loss_clip": 0.06510483, + "auxiliary_loss_mlp": 0.01278802, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01259859, + "epoch": 0.26063430031564705, + "flos": 26038933505280.0, + "grad_norm": 1.7438742011205015, + "language_loss": 0.80170292, + "learning_rate": 3.46747795800024e-06, + "loss": 0.87959582, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18945312, + "step": 4335, + "time_per_iteration": 2.582817792892456 + }, + { + "auxiliary_loss_clip": 0.06403506, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 0.06297, + "balance_loss_mlp": 0.01252544, + "epoch": 0.26069442356831507, + "flos": 62463143030400.0, + "grad_norm": 0.8284851894367303, + "language_loss": 0.60816151, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6847688, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04672241, + "step": 4336, + "time_per_iteration": 3.2036406993865967 + }, + { + "auxiliary_loss_clip": 0.0651319, + "auxiliary_loss_mlp": 0.0127574, + "balance_loss_clip": 0.06294517, + "balance_loss_mlp": 0.01257405, + "epoch": 0.26075454682098304, + "flos": 13631530101120.0, + "grad_norm": 1.8662385080657846, + "language_loss": 0.78028893, + "learning_rate": 3.46694862168102e-06, + "loss": 0.85817826, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18322754, + "step": 4337, + "time_per_iteration": 2.4899747371673584 + }, + { + "auxiliary_loss_clip": 0.06515083, + "auxiliary_loss_mlp": 0.01276173, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01256289, + "epoch": 0.260814670073651, + "flos": 12130776748800.0, + "grad_norm": 2.165940638299647, + "language_loss": 0.74851859, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.82643116, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19897461, + "step": 4338, + "time_per_iteration": 2.5323259830474854 + }, + { + "auxiliary_loss_clip": 0.06522977, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 0.0629933, + "balance_loss_mlp": 0.01255039, + "epoch": 0.26087479332631897, + "flos": 15127964968320.0, + "grad_norm": 2.9662822483112388, + "language_loss": 0.81419933, + "learning_rate": 3.466419062854447e-06, + "loss": 0.89217252, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19287109, + "step": 4339, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.06514673, + "auxiliary_loss_mlp": 0.0127648, + "balance_loss_clip": 0.06300991, + "balance_loss_mlp": 0.01259278, + "epoch": 0.26093491657898693, + "flos": 24687834744960.0, + "grad_norm": 1.5467473582016638, + "language_loss": 0.77106607, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.84897768, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4340, + "time_per_iteration": 2.570777416229248 + }, + { + "auxiliary_loss_clip": 0.06513949, + "auxiliary_loss_mlp": 0.01274956, + "balance_loss_clip": 0.062961, + "balance_loss_mlp": 0.01255788, + "epoch": 0.2609950398316549, + "flos": 25122669108480.0, + "grad_norm": 1.4533527138525517, + "language_loss": 0.82740015, + "learning_rate": 3.465889281600845e-06, + "loss": 0.90528917, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19177246, + "step": 4341, + "time_per_iteration": 2.5946342945098877 + }, + { + "auxiliary_loss_clip": 0.06519589, + "auxiliary_loss_mlp": 0.01282035, + "balance_loss_clip": 0.06303687, + "balance_loss_mlp": 0.01261794, + "epoch": 0.26105516308432286, + "flos": 28556159137920.0, + "grad_norm": 1.7858700463590271, + "language_loss": 0.77163744, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84965372, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20251465, + "step": 4342, + "time_per_iteration": 2.5742342472076416 + }, + { + "auxiliary_loss_clip": 0.06521034, + "auxiliary_loss_mlp": 0.01277248, + "balance_loss_clip": 0.06303718, + "balance_loss_mlp": 0.01258115, + "epoch": 0.2611152863369908, + "flos": 39539984400000.0, + "grad_norm": 1.7100835603344944, + "language_loss": 0.66681403, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.74479687, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19140625, + "step": 4343, + "time_per_iteration": 2.662271738052368 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.0630408, + "balance_loss_mlp": 0.01261917, + "epoch": 0.2611754095896588, + "flos": 13740416881920.0, + "grad_norm": 1.8127929734390111, + "language_loss": 0.74220115, + "learning_rate": 3.465094192845553e-06, + "loss": 0.82024956, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18200684, + "step": 4344, + "time_per_iteration": 2.5201361179351807 + }, + { + "auxiliary_loss_clip": 0.06524797, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06307752, + "balance_loss_mlp": 0.01264484, + "epoch": 0.26123553284232676, + "flos": 21513011869440.0, + "grad_norm": 2.1854473316742338, + "language_loss": 0.8696478, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94774491, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20422363, + "step": 4345, + "time_per_iteration": 2.510000228881836 + }, + { + "auxiliary_loss_clip": 0.06521724, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 0.06307776, + "balance_loss_mlp": 0.01258293, + "epoch": 0.2612956560949947, + "flos": 21145751424000.0, + "grad_norm": 2.0739898036059095, + "language_loss": 0.76897335, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.18371582, + "step": 4346, + "time_per_iteration": 2.5322000980377197 + }, + { + "auxiliary_loss_clip": 0.06522055, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06305227, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2613557793476627, + "flos": 25126023271680.0, + "grad_norm": 1.5562871556893731, + "language_loss": 0.76140273, + "learning_rate": 3.464298604081606e-06, + "loss": 0.83937496, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.18249512, + "step": 4347, + "time_per_iteration": 2.557077169418335 + }, + { + "auxiliary_loss_clip": 0.06522661, + "auxiliary_loss_mlp": 0.01286127, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01267208, + "epoch": 0.26141590260033065, + "flos": 26074879706880.0, + "grad_norm": 1.3369896368920637, + "language_loss": 0.7377249, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81581283, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.18920898, + "step": 4348, + "time_per_iteration": 2.5915603637695312 + }, + { + "auxiliary_loss_clip": 0.06527912, + "auxiliary_loss_mlp": 0.01280562, + "balance_loss_clip": 0.06309946, + "balance_loss_mlp": 0.01260881, + "epoch": 0.2614760258529987, + "flos": 25708415875200.0, + "grad_norm": 1.876318754691465, + "language_loss": 0.9123491, + "learning_rate": 3.463767933923799e-06, + "loss": 0.99043381, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19689941, + "step": 4349, + "time_per_iteration": 2.594332218170166 + }, + { + "auxiliary_loss_clip": 0.06524529, + "auxiliary_loss_mlp": 0.01276126, + "balance_loss_clip": 0.0631379, + "balance_loss_mlp": 0.01256695, + "epoch": 0.26153614910566664, + "flos": 17462902043520.0, + "grad_norm": 1.601755901803269, + "language_loss": 0.80459869, + "learning_rate": 3.463502515580524e-06, + "loss": 0.8826052, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19433594, + "step": 4350, + "time_per_iteration": 2.509274482727051 + }, + { + "auxiliary_loss_clip": 0.06520928, + "auxiliary_loss_mlp": 0.01277683, + "balance_loss_clip": 0.0631097, + "balance_loss_mlp": 0.01259063, + "epoch": 0.2615962723583346, + "flos": 17718676231680.0, + "grad_norm": 1.8928977658247819, + "language_loss": 0.62482548, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.7028116, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18615723, + "step": 4351, + "time_per_iteration": 2.522862672805786 + }, + { + "auxiliary_loss_clip": 0.06526107, + "auxiliary_loss_mlp": 0.01278827, + "balance_loss_clip": 0.06308405, + "balance_loss_mlp": 0.01259396, + "epoch": 0.26165639561100257, + "flos": 23264340456960.0, + "grad_norm": 2.4783042039829546, + "language_loss": 0.84264326, + "learning_rate": 3.462971512415555e-06, + "loss": 0.92069256, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19445801, + "step": 4352, + "time_per_iteration": 2.5326311588287354 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01261209, + "balance_loss_clip": 0.06294002, + "balance_loss_mlp": 0.01256817, + "epoch": 0.26171651886367053, + "flos": 66756155443200.0, + "grad_norm": 0.7669563885543124, + "language_loss": 0.7057451, + "learning_rate": 3.462705927613996e-06, + "loss": 0.78234154, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04397583, + "step": 4353, + "time_per_iteration": 3.093543529510498 + }, + { + "auxiliary_loss_clip": 0.06517833, + "auxiliary_loss_mlp": 0.01279039, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01259619, + "epoch": 0.2617766421163385, + "flos": 22356713030400.0, + "grad_norm": 1.943198757771125, + "language_loss": 0.77770078, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8556695, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19433594, + "step": 4354, + "time_per_iteration": 2.5782573223114014 + }, + { + "auxiliary_loss_clip": 0.06522856, + "auxiliary_loss_mlp": 0.01279183, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01259907, + "epoch": 0.26183676536900646, + "flos": 26074208874240.0, + "grad_norm": 2.16382169558429, + "language_loss": 0.68941987, + "learning_rate": 3.462174591623085e-06, + "loss": 0.7674402, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19274902, + "step": 4355, + "time_per_iteration": 2.608482599258423 + }, + { + "auxiliary_loss_clip": 0.06517249, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06301509, + "balance_loss_mlp": 0.01260889, + "epoch": 0.26189688862167443, + "flos": 21002847085440.0, + "grad_norm": 2.1598133279644554, + "language_loss": 0.68533909, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.76333642, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.21594238, + "step": 4356, + "time_per_iteration": 2.526376247406006 + }, + { + "auxiliary_loss_clip": 0.06398848, + "auxiliary_loss_mlp": 0.01254107, + "balance_loss_clip": 0.06295048, + "balance_loss_mlp": 0.01249723, + "epoch": 0.2619570118743424, + "flos": 65817780768000.0, + "grad_norm": 0.6753767209108164, + "language_loss": 0.5316326, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.60816211, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04391479, + "step": 4357, + "time_per_iteration": 4.58653450012207 + }, + { + "auxiliary_loss_clip": 0.065238, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01261183, + "epoch": 0.26201713512701036, + "flos": 28774310042880.0, + "grad_norm": 1.9589657113609436, + "language_loss": 0.85308599, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.93112528, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18933105, + "step": 4358, + "time_per_iteration": 2.65427303314209 + }, + { + "auxiliary_loss_clip": 0.0652793, + "auxiliary_loss_mlp": 0.0127535, + "balance_loss_clip": 0.06300082, + "balance_loss_mlp": 0.01254917, + "epoch": 0.2620772583796783, + "flos": 26439750311040.0, + "grad_norm": 2.2013035586341663, + "language_loss": 0.68206531, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7600981, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20422363, + "step": 4359, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.06517753, + "auxiliary_loss_mlp": 0.01278599, + "balance_loss_clip": 0.06299832, + "balance_loss_mlp": 0.01258763, + "epoch": 0.2621373816323463, + "flos": 20162667795840.0, + "grad_norm": 1.9413360196767273, + "language_loss": 0.7857362, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.86369967, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19848633, + "step": 4360, + "time_per_iteration": 2.5442395210266113 + }, + { + "auxiliary_loss_clip": 0.06513859, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06305451, + "balance_loss_mlp": 0.01262839, + "epoch": 0.26219750488501425, + "flos": 28628764300800.0, + "grad_norm": 1.9016418571028826, + "language_loss": 0.68632245, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.76428491, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.19519043, + "step": 4361, + "time_per_iteration": 2.5506739616394043 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01277506, + "balance_loss_clip": 0.06302515, + "balance_loss_mlp": 0.01256298, + "epoch": 0.2622576281376823, + "flos": 15046806689280.0, + "grad_norm": 1.72568625675014, + "language_loss": 0.84433615, + "learning_rate": 3.46031316964119e-06, + "loss": 0.92233592, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21179199, + "step": 4362, + "time_per_iteration": 3.9455041885375977 + }, + { + "auxiliary_loss_clip": 0.06516212, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01254914, + "epoch": 0.26231775139035024, + "flos": 26403426766080.0, + "grad_norm": 1.7310155723144771, + "language_loss": 0.65182602, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.72972858, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19140625, + "step": 4363, + "time_per_iteration": 2.5710229873657227 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01270336, + "balance_loss_clip": 0.06313097, + "balance_loss_mlp": 0.01263804, + "epoch": 0.2623778746430182, + "flos": 65430380615040.0, + "grad_norm": 0.9022976396731897, + "language_loss": 0.61189461, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.68877506, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06542969, + "step": 4364, + "time_per_iteration": 4.728578805923462 + }, + { + "auxiliary_loss_clip": 0.06528256, + "auxiliary_loss_mlp": 0.01280703, + "balance_loss_clip": 0.06308191, + "balance_loss_mlp": 0.01260402, + "epoch": 0.26243799789568617, + "flos": 12609104181120.0, + "grad_norm": 2.531531320883944, + "language_loss": 0.72247571, + "learning_rate": 3.459514586533184e-06, + "loss": 0.80056524, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20300293, + "step": 4365, + "time_per_iteration": 2.5567469596862793 + }, + { + "auxiliary_loss_clip": 0.06519997, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06307054, + "balance_loss_mlp": 0.01257146, + "epoch": 0.26249812114835414, + "flos": 28631783047680.0, + "grad_norm": 1.7351756990107399, + "language_loss": 0.78023124, + "learning_rate": 3.459248281460509e-06, + "loss": 0.85819209, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18945312, + "step": 4366, + "time_per_iteration": 2.6212668418884277 + }, + { + "auxiliary_loss_clip": 0.06522524, + "auxiliary_loss_mlp": 0.01276459, + "balance_loss_clip": 0.06305946, + "balance_loss_mlp": 0.01258351, + "epoch": 0.2625582444010221, + "flos": 14470661214720.0, + "grad_norm": 1.579355851615032, + "language_loss": 0.77007079, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.84806067, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18103027, + "step": 4367, + "time_per_iteration": 2.602072238922119 + }, + { + "auxiliary_loss_clip": 0.06517363, + "auxiliary_loss_mlp": 0.01271186, + "balance_loss_clip": 0.06304537, + "balance_loss_mlp": 0.01253471, + "epoch": 0.26261836765369007, + "flos": 16617984998400.0, + "grad_norm": 1.5269013949985815, + "language_loss": 0.70157337, + "learning_rate": 3.458715505320736e-06, + "loss": 0.77945888, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.17700195, + "step": 4368, + "time_per_iteration": 4.012764930725098 + }, + { + "auxiliary_loss_clip": 0.06516206, + "auxiliary_loss_mlp": 0.01278713, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01256635, + "epoch": 0.26267849090635803, + "flos": 20525861318400.0, + "grad_norm": 1.916794033771568, + "language_loss": 0.79240829, + "learning_rate": 3.458449034273841e-06, + "loss": 0.87035751, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22070312, + "step": 4369, + "time_per_iteration": 2.51906418800354 + }, + { + "auxiliary_loss_clip": 0.06514631, + "auxiliary_loss_mlp": 0.01276005, + "balance_loss_clip": 0.06301987, + "balance_loss_mlp": 0.01256883, + "epoch": 0.262738614159026, + "flos": 21330220187520.0, + "grad_norm": 3.2285566965587873, + "language_loss": 0.83905816, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.91696453, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19116211, + "step": 4370, + "time_per_iteration": 2.562302589416504 + }, + { + "auxiliary_loss_clip": 0.06520583, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01253972, + "epoch": 0.26279873741169396, + "flos": 17609454034560.0, + "grad_norm": 1.7096089610285066, + "language_loss": 0.71678042, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.79473758, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21179199, + "step": 4371, + "time_per_iteration": 2.4965152740478516 + }, + { + "auxiliary_loss_clip": 0.06398421, + "auxiliary_loss_mlp": 0.01256739, + "balance_loss_clip": 0.0629326, + "balance_loss_mlp": 0.01252516, + "epoch": 0.2628588606643619, + "flos": 60969139931520.0, + "grad_norm": 0.666639264120038, + "language_loss": 0.56056166, + "learning_rate": 3.457649289346384e-06, + "loss": 0.63711321, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04226685, + "step": 4372, + "time_per_iteration": 3.2867443561553955 + }, + { + "auxiliary_loss_clip": 0.06512036, + "auxiliary_loss_mlp": 0.01277679, + "balance_loss_clip": 0.06298684, + "balance_loss_mlp": 0.01259178, + "epoch": 0.2629189839170299, + "flos": 27023652288000.0, + "grad_norm": 1.5439358769508327, + "language_loss": 0.78190762, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85980475, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18505859, + "step": 4373, + "time_per_iteration": 2.577479362487793 + }, + { + "auxiliary_loss_clip": 0.06510606, + "auxiliary_loss_mlp": 0.01278833, + "balance_loss_clip": 0.06297645, + "balance_loss_mlp": 0.01260427, + "epoch": 0.26297910716969786, + "flos": 17025635911680.0, + "grad_norm": 2.1443132622279664, + "language_loss": 0.723768, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.80166239, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18383789, + "step": 4374, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.06517059, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.0630156, + "balance_loss_mlp": 0.01258505, + "epoch": 0.2630392304223659, + "flos": 24903889297920.0, + "grad_norm": 2.1190930293084933, + "language_loss": 0.81199759, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.88995719, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20373535, + "step": 4375, + "time_per_iteration": 2.591381311416626 + }, + { + "auxiliary_loss_clip": 0.0651055, + "auxiliary_loss_mlp": 0.01275326, + "balance_loss_clip": 0.0629838, + "balance_loss_mlp": 0.01257289, + "epoch": 0.26309935367503384, + "flos": 32862336641280.0, + "grad_norm": 1.9139045559413268, + "language_loss": 0.66626596, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74412477, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18041992, + "step": 4376, + "time_per_iteration": 2.643944025039673 + }, + { + "auxiliary_loss_clip": 0.06515232, + "auxiliary_loss_mlp": 0.01276237, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2631594769277018, + "flos": 15893400816000.0, + "grad_norm": 1.6251454157029055, + "language_loss": 0.70145154, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77936625, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.1875, + "step": 4377, + "time_per_iteration": 2.5593788623809814 + }, + { + "auxiliary_loss_clip": 0.06513406, + "auxiliary_loss_mlp": 0.01274994, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255408, + "epoch": 0.2632196001803698, + "flos": 50816242811520.0, + "grad_norm": 1.6666327452584295, + "language_loss": 0.80235565, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.88023967, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4378, + "time_per_iteration": 2.794290065765381 + }, + { + "auxiliary_loss_clip": 0.0651051, + "auxiliary_loss_mlp": 0.01272396, + "balance_loss_clip": 0.06297652, + "balance_loss_mlp": 0.0125492, + "epoch": 0.26327972343303774, + "flos": 13737733551360.0, + "grad_norm": 2.7188396998417548, + "language_loss": 0.77230549, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17480469, + "step": 4379, + "time_per_iteration": 2.542442560195923 + }, + { + "auxiliary_loss_clip": 0.06519607, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06299821, + "balance_loss_mlp": 0.01255084, + "epoch": 0.2633398466857057, + "flos": 23775846906240.0, + "grad_norm": 1.9724368576120554, + "language_loss": 0.78418016, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.86212587, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19897461, + "step": 4380, + "time_per_iteration": 2.529573440551758 + }, + { + "auxiliary_loss_clip": 0.06516172, + "auxiliary_loss_mlp": 0.012759, + "balance_loss_clip": 0.06296928, + "balance_loss_mlp": 0.01257518, + "epoch": 0.26339996993837367, + "flos": 27607680046080.0, + "grad_norm": 1.9046534185934374, + "language_loss": 0.6460917, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.72401243, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18383789, + "step": 4381, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.06511073, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06295128, + "balance_loss_mlp": 0.01255394, + "epoch": 0.26346009319104163, + "flos": 16951982572800.0, + "grad_norm": 1.8115834165165374, + "language_loss": 0.8293367, + "learning_rate": 3.454979881632595e-06, + "loss": 0.90718591, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18444824, + "step": 4382, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06526808, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06304507, + "balance_loss_mlp": 0.0126196, + "epoch": 0.2635202164437096, + "flos": 37241245088640.0, + "grad_norm": 2.8611377763647363, + "language_loss": 0.70728219, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78537577, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4383, + "time_per_iteration": 2.7256851196289062 + }, + { + "auxiliary_loss_clip": 0.06511825, + "auxiliary_loss_mlp": 0.01278143, + "balance_loss_clip": 0.0629648, + "balance_loss_mlp": 0.01260214, + "epoch": 0.26358033969637756, + "flos": 21002721304320.0, + "grad_norm": 1.8636489890531567, + "language_loss": 0.69725919, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.77515888, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 4384, + "time_per_iteration": 2.526306629180908 + }, + { + "auxiliary_loss_clip": 0.06514609, + "auxiliary_loss_mlp": 0.01274952, + "balance_loss_clip": 0.06301568, + "balance_loss_mlp": 0.01256355, + "epoch": 0.26364046294904553, + "flos": 27753561204480.0, + "grad_norm": 2.704228439938978, + "language_loss": 0.70769042, + "learning_rate": 3.45417798298451e-06, + "loss": 0.785586, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 4385, + "time_per_iteration": 2.6091294288635254 + }, + { + "auxiliary_loss_clip": 0.06510788, + "auxiliary_loss_mlp": 0.01275036, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01255903, + "epoch": 0.2637005862017135, + "flos": 22899679488000.0, + "grad_norm": 1.8400483569046413, + "language_loss": 0.85200071, + "learning_rate": 3.453910573136482e-06, + "loss": 0.92985892, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19116211, + "step": 4386, + "time_per_iteration": 2.5284476280212402 + }, + { + "auxiliary_loss_clip": 0.06516191, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 0.06302508, + "balance_loss_mlp": 0.01255759, + "epoch": 0.26376070945438146, + "flos": 15054143921280.0, + "grad_norm": 1.9881194524454247, + "language_loss": 0.77597183, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.85388696, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19567871, + "step": 4387, + "time_per_iteration": 2.522135019302368 + }, + { + "auxiliary_loss_clip": 0.0651316, + "auxiliary_loss_mlp": 0.01278261, + "balance_loss_clip": 0.06301039, + "balance_loss_mlp": 0.01259378, + "epoch": 0.2638208327070494, + "flos": 21148141265280.0, + "grad_norm": 2.1303107819849316, + "language_loss": 0.76193964, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83985388, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1887207, + "step": 4388, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.06516623, + "auxiliary_loss_mlp": 0.01271478, + "balance_loss_clip": 0.06302176, + "balance_loss_mlp": 0.01253681, + "epoch": 0.26388095595971744, + "flos": 21732001315200.0, + "grad_norm": 2.125202232596161, + "language_loss": 0.86967361, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94755471, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.17785645, + "step": 4389, + "time_per_iteration": 2.570643901824951 + }, + { + "auxiliary_loss_clip": 0.06416489, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.0630957, + "balance_loss_mlp": 0.01263464, + "epoch": 0.2639410792123854, + "flos": 65536542138240.0, + "grad_norm": 0.8199197454978128, + "language_loss": 0.60138249, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6782288, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04666138, + "step": 4390, + "time_per_iteration": 3.174226999282837 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01274153, + "balance_loss_clip": 0.06302064, + "balance_loss_mlp": 0.01255008, + "epoch": 0.2640012024650534, + "flos": 23954907081600.0, + "grad_norm": 1.739207981028, + "language_loss": 0.77995527, + "learning_rate": 3.4525726971127e-06, + "loss": 0.85793746, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19152832, + "step": 4391, + "time_per_iteration": 2.5869362354278564 + }, + { + "auxiliary_loss_clip": 0.06415629, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06309642, + "balance_loss_mlp": 0.0126082, + "epoch": 0.26406132571772134, + "flos": 56462420880000.0, + "grad_norm": 0.8885893091984226, + "language_loss": 0.58835375, + "learning_rate": 3.45230495662224e-06, + "loss": 0.66516447, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04620361, + "step": 4392, + "time_per_iteration": 3.1856343746185303 + }, + { + "auxiliary_loss_clip": 0.0652501, + "auxiliary_loss_mlp": 0.0127481, + "balance_loss_clip": 0.06303259, + "balance_loss_mlp": 0.01256631, + "epoch": 0.2641214489703893, + "flos": 22097039627520.0, + "grad_norm": 1.7095674260711007, + "language_loss": 0.69284153, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77083969, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.1817627, + "step": 4393, + "time_per_iteration": 2.5519895553588867 + }, + { + "auxiliary_loss_clip": 0.06526117, + "auxiliary_loss_mlp": 0.01277548, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01255959, + "epoch": 0.26418157222305727, + "flos": 16550327226240.0, + "grad_norm": 2.304177456685855, + "language_loss": 0.84805501, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.92609167, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.21594238, + "step": 4394, + "time_per_iteration": 2.5253031253814697 + }, + { + "auxiliary_loss_clip": 0.06528334, + "auxiliary_loss_mlp": 0.01280976, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01260066, + "epoch": 0.26424169547572524, + "flos": 18008006780160.0, + "grad_norm": 1.9555526734650441, + "language_loss": 0.70342916, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.78152227, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.20910645, + "step": 4395, + "time_per_iteration": 2.5117664337158203 + }, + { + "auxiliary_loss_clip": 0.06512758, + "auxiliary_loss_mlp": 0.01272399, + "balance_loss_clip": 0.06300145, + "balance_loss_mlp": 0.01253171, + "epoch": 0.2643018187283932, + "flos": 16988893096320.0, + "grad_norm": 1.791387622967983, + "language_loss": 0.87312353, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19238281, + "step": 4396, + "time_per_iteration": 2.566774368286133 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01257464, + "balance_loss_clip": 0.06300922, + "balance_loss_mlp": 0.01252997, + "epoch": 0.26436194198106117, + "flos": 59682135144960.0, + "grad_norm": 0.7723405564107855, + "language_loss": 0.54990101, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.62652469, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04473877, + "step": 4397, + "time_per_iteration": 4.373678684234619 + }, + { + "auxiliary_loss_clip": 0.06510547, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 0.06297219, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26442206523372913, + "flos": 32928694675200.0, + "grad_norm": 2.4292177107300224, + "language_loss": 0.78606653, + "learning_rate": 3.450697357532435e-06, + "loss": 0.86391467, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1862793, + "step": 4398, + "time_per_iteration": 2.6890292167663574 + }, + { + "auxiliary_loss_clip": 0.06511252, + "auxiliary_loss_mlp": 0.01279415, + "balance_loss_clip": 0.06294377, + "balance_loss_mlp": 0.01259244, + "epoch": 0.2644821884863971, + "flos": 21037409694720.0, + "grad_norm": 1.6698754866149341, + "language_loss": 0.67733896, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.75524557, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20178223, + "step": 4399, + "time_per_iteration": 2.5403761863708496 + }, + { + "auxiliary_loss_clip": 0.06507229, + "auxiliary_loss_mlp": 0.01274507, + "balance_loss_clip": 0.06301808, + "balance_loss_mlp": 0.01256841, + "epoch": 0.26454231173906506, + "flos": 20783019098880.0, + "grad_norm": 1.5093240378212085, + "language_loss": 0.8695311, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.94734848, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.17675781, + "step": 4400, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.06514899, + "auxiliary_loss_mlp": 0.01275157, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01255404, + "epoch": 0.264602434991733, + "flos": 16624399835520.0, + "grad_norm": 2.9592381962347076, + "language_loss": 0.77008456, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.84798515, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19750977, + "step": 4401, + "time_per_iteration": 4.000045537948608 + }, + { + "auxiliary_loss_clip": 0.06515318, + "auxiliary_loss_mlp": 0.01277892, + "balance_loss_clip": 0.0629567, + "balance_loss_mlp": 0.01257149, + "epoch": 0.26466255824440105, + "flos": 19068726816000.0, + "grad_norm": 1.7667226788610035, + "language_loss": 0.88791883, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96585095, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20739746, + "step": 4402, + "time_per_iteration": 2.504951000213623 + }, + { + "auxiliary_loss_clip": 0.06514971, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 0.06299384, + "balance_loss_mlp": 0.01261203, + "epoch": 0.264722681497069, + "flos": 22645246965120.0, + "grad_norm": 2.1016866817380944, + "language_loss": 0.78604829, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.86399865, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18884277, + "step": 4403, + "time_per_iteration": 3.9830996990203857 + }, + { + "auxiliary_loss_clip": 0.06513863, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 0.0629956, + "balance_loss_mlp": 0.01254322, + "epoch": 0.264782804749737, + "flos": 22498862682240.0, + "grad_norm": 2.2718142403423887, + "language_loss": 0.88776851, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.96563816, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18774414, + "step": 4404, + "time_per_iteration": 2.5655670166015625 + }, + { + "auxiliary_loss_clip": 0.06512003, + "auxiliary_loss_mlp": 0.01272083, + "balance_loss_clip": 0.06294957, + "balance_loss_mlp": 0.01253666, + "epoch": 0.26484292800240494, + "flos": 16805891779200.0, + "grad_norm": 1.6853243703943699, + "language_loss": 0.77144921, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84929001, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18408203, + "step": 4405, + "time_per_iteration": 2.5151660442352295 + }, + { + "auxiliary_loss_clip": 0.06518488, + "auxiliary_loss_mlp": 0.01280263, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.0126113, + "epoch": 0.2649030512550729, + "flos": 20455939486080.0, + "grad_norm": 1.6552463254663874, + "language_loss": 0.70570582, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78369337, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19152832, + "step": 4406, + "time_per_iteration": 2.5817081928253174 + }, + { + "auxiliary_loss_clip": 0.06515051, + "auxiliary_loss_mlp": 0.01282775, + "balance_loss_clip": 0.06304015, + "balance_loss_mlp": 0.01264071, + "epoch": 0.2649631745077409, + "flos": 22422190596480.0, + "grad_norm": 1.6043271976664373, + "language_loss": 0.84213567, + "learning_rate": 3.448282246369912e-06, + "loss": 0.92011392, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18701172, + "step": 4407, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.06506669, + "auxiliary_loss_mlp": 0.01274017, + "balance_loss_clip": 0.06294346, + "balance_loss_mlp": 0.01255384, + "epoch": 0.26502329776040884, + "flos": 35124794334720.0, + "grad_norm": 1.8863485028384246, + "language_loss": 0.76080608, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83861291, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18615723, + "step": 4408, + "time_per_iteration": 4.144388675689697 + }, + { + "auxiliary_loss_clip": 0.06504838, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06293095, + "balance_loss_mlp": 0.01259765, + "epoch": 0.2650834210130768, + "flos": 38696073603840.0, + "grad_norm": 1.6572856868324277, + "language_loss": 0.71237993, + "learning_rate": 3.447744950630084e-06, + "loss": 0.79021394, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18786621, + "step": 4409, + "time_per_iteration": 2.6830790042877197 + }, + { + "auxiliary_loss_clip": 0.06513892, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06296389, + "balance_loss_mlp": 0.01258513, + "epoch": 0.26514354426574477, + "flos": 24723655165440.0, + "grad_norm": 1.9985850932403133, + "language_loss": 0.74335337, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.82127184, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19445801, + "step": 4410, + "time_per_iteration": 2.5640783309936523 + }, + { + "auxiliary_loss_clip": 0.06510055, + "auxiliary_loss_mlp": 0.01275315, + "balance_loss_clip": 0.06293881, + "balance_loss_mlp": 0.01256873, + "epoch": 0.26520366751841273, + "flos": 20346381872640.0, + "grad_norm": 1.7362440314024254, + "language_loss": 0.74604267, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.82389635, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18457031, + "step": 4411, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.06503807, + "auxiliary_loss_mlp": 0.0127974, + "balance_loss_clip": 0.06292095, + "balance_loss_mlp": 0.01260941, + "epoch": 0.2652637907710807, + "flos": 22350046631040.0, + "grad_norm": 1.9068391403977176, + "language_loss": 0.83043784, + "learning_rate": 3.446938595306071e-06, + "loss": 0.90827328, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18798828, + "step": 4412, + "time_per_iteration": 2.570462942123413 + }, + { + "auxiliary_loss_clip": 0.06509882, + "auxiliary_loss_mlp": 0.01280008, + "balance_loss_clip": 0.0629638, + "balance_loss_mlp": 0.01260327, + "epoch": 0.26532391402374866, + "flos": 19360279497600.0, + "grad_norm": 1.6015505507863077, + "language_loss": 0.75010121, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.82800013, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19677734, + "step": 4413, + "time_per_iteration": 2.5575060844421387 + }, + { + "auxiliary_loss_clip": 0.06392879, + "auxiliary_loss_mlp": 0.01259819, + "balance_loss_clip": 0.06288524, + "balance_loss_mlp": 0.01255307, + "epoch": 0.26538403727641663, + "flos": 44804479121280.0, + "grad_norm": 0.9088609657061584, + "language_loss": 0.57055008, + "learning_rate": 3.446400750732793e-06, + "loss": 0.64707708, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04522705, + "step": 4414, + "time_per_iteration": 3.090242624282837 + }, + { + "auxiliary_loss_clip": 0.06501576, + "auxiliary_loss_mlp": 0.01278206, + "balance_loss_clip": 0.06294522, + "balance_loss_mlp": 0.01260587, + "epoch": 0.26544416052908465, + "flos": 28189359889920.0, + "grad_norm": 1.5322949912702364, + "language_loss": 0.74997067, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.82776845, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17626953, + "step": 4415, + "time_per_iteration": 2.6143665313720703 + }, + { + "auxiliary_loss_clip": 0.06505995, + "auxiliary_loss_mlp": 0.01278176, + "balance_loss_clip": 0.06289595, + "balance_loss_mlp": 0.0125791, + "epoch": 0.2655042837817526, + "flos": 17570824502400.0, + "grad_norm": 4.108925661978825, + "language_loss": 0.87716872, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95501041, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20263672, + "step": 4416, + "time_per_iteration": 2.4974279403686523 + }, + { + "auxiliary_loss_clip": 0.06510112, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0629703, + "balance_loss_mlp": 0.0126094, + "epoch": 0.2655644070344206, + "flos": 23411437499520.0, + "grad_norm": 1.4955026126411677, + "language_loss": 0.77089638, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84879971, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19274902, + "step": 4417, + "time_per_iteration": 2.576826572418213 + }, + { + "auxiliary_loss_clip": 0.0650158, + "auxiliary_loss_mlp": 0.01274734, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.01255946, + "epoch": 0.26562453028708854, + "flos": 26475612658560.0, + "grad_norm": 1.3751463134954343, + "language_loss": 0.80062425, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87838733, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 4418, + "time_per_iteration": 2.573490619659424 + }, + { + "auxiliary_loss_clip": 0.06510676, + "auxiliary_loss_mlp": 0.01274316, + "balance_loss_clip": 0.06295326, + "balance_loss_mlp": 0.01254945, + "epoch": 0.2656846535397565, + "flos": 19213475944320.0, + "grad_norm": 2.092556142181657, + "language_loss": 0.67613918, + "learning_rate": 3.445055179644071e-06, + "loss": 0.7539891, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19372559, + "step": 4419, + "time_per_iteration": 2.5705552101135254 + }, + { + "auxiliary_loss_clip": 0.06507199, + "auxiliary_loss_mlp": 0.01281966, + "balance_loss_clip": 0.06293494, + "balance_loss_mlp": 0.01262153, + "epoch": 0.2657447767924245, + "flos": 30558566085120.0, + "grad_norm": 1.8356097714997412, + "language_loss": 0.79905182, + "learning_rate": 3.444785900995585e-06, + "loss": 0.87694353, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19799805, + "step": 4420, + "time_per_iteration": 2.5966663360595703 + }, + { + "auxiliary_loss_clip": 0.06514539, + "auxiliary_loss_mlp": 0.01276693, + "balance_loss_clip": 0.06294198, + "balance_loss_mlp": 0.01256367, + "epoch": 0.26580490004509244, + "flos": 20928984111360.0, + "grad_norm": 2.015825119850129, + "language_loss": 0.81966692, + "learning_rate": 3.444516567560673e-06, + "loss": 0.89757919, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.20324707, + "step": 4421, + "time_per_iteration": 2.5285565853118896 + }, + { + "auxiliary_loss_clip": 0.06503608, + "auxiliary_loss_mlp": 0.01277509, + "balance_loss_clip": 0.06293386, + "balance_loss_mlp": 0.01259341, + "epoch": 0.2658650232977604, + "flos": 43955845297920.0, + "grad_norm": 1.6494646012937118, + "language_loss": 0.66448712, + "learning_rate": 3.444247179349548e-06, + "loss": 0.74229831, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1817627, + "step": 4422, + "time_per_iteration": 2.715272903442383 + }, + { + "auxiliary_loss_clip": 0.0650918, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 0.06296968, + "balance_loss_mlp": 0.01257011, + "epoch": 0.26592514655042837, + "flos": 29724256581120.0, + "grad_norm": 6.571308072686312, + "language_loss": 0.75332773, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18029785, + "step": 4423, + "time_per_iteration": 2.5891942977905273 + }, + { + "auxiliary_loss_clip": 0.06514621, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 0.06297594, + "balance_loss_mlp": 0.01257619, + "epoch": 0.26598526980309634, + "flos": 46687616110080.0, + "grad_norm": 1.5716819541281883, + "language_loss": 0.78054529, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85846502, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19726562, + "step": 4424, + "time_per_iteration": 2.731308698654175 + }, + { + "auxiliary_loss_clip": 0.06513417, + "auxiliary_loss_mlp": 0.01282972, + "balance_loss_clip": 0.06298374, + "balance_loss_mlp": 0.01263147, + "epoch": 0.2660453930557643, + "flos": 11514115025280.0, + "grad_norm": 1.8953438163908696, + "language_loss": 0.7980895, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.87605333, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19824219, + "step": 4425, + "time_per_iteration": 2.536639928817749 + }, + { + "auxiliary_loss_clip": 0.0650531, + "auxiliary_loss_mlp": 0.01275945, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01257837, + "epoch": 0.26610551630843227, + "flos": 24798692096640.0, + "grad_norm": 1.624984400061838, + "language_loss": 0.81150436, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.88931698, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4426, + "time_per_iteration": 2.55570912361145 + }, + { + "auxiliary_loss_clip": 0.06512492, + "auxiliary_loss_mlp": 0.01281328, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01262302, + "epoch": 0.26616563956110023, + "flos": 27643793955840.0, + "grad_norm": 1.6446869519549492, + "language_loss": 0.77695107, + "learning_rate": 3.442899417008333e-06, + "loss": 0.85488927, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19042969, + "step": 4427, + "time_per_iteration": 2.609236001968384 + }, + { + "auxiliary_loss_clip": 0.06512281, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06306126, + "balance_loss_mlp": 0.01257588, + "epoch": 0.26622576281376825, + "flos": 28369887511680.0, + "grad_norm": 1.5754757805335664, + "language_loss": 0.77615106, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.85402417, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17443848, + "step": 4428, + "time_per_iteration": 2.5886542797088623 + }, + { + "auxiliary_loss_clip": 0.06507164, + "auxiliary_loss_mlp": 0.01273818, + "balance_loss_clip": 0.06292614, + "balance_loss_mlp": 0.0125627, + "epoch": 0.2662858860664362, + "flos": 18047265217920.0, + "grad_norm": 1.9210496781424948, + "language_loss": 0.83184117, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.90965092, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.17541504, + "step": 4429, + "time_per_iteration": 2.5387768745422363 + }, + { + "auxiliary_loss_clip": 0.06512052, + "auxiliary_loss_mlp": 0.01276801, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01256762, + "epoch": 0.2663460093191042, + "flos": 22752163175040.0, + "grad_norm": 1.799497911690532, + "language_loss": 0.73120302, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80909157, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.20043945, + "step": 4430, + "time_per_iteration": 2.6026084423065186 + }, + { + "auxiliary_loss_clip": 0.06508531, + "auxiliary_loss_mlp": 0.0127429, + "balance_loss_clip": 0.06296858, + "balance_loss_mlp": 0.012548, + "epoch": 0.26640613257177215, + "flos": 16514422951680.0, + "grad_norm": 2.040164300856009, + "language_loss": 0.83262235, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91045058, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19482422, + "step": 4431, + "time_per_iteration": 2.5464959144592285 + }, + { + "auxiliary_loss_clip": 0.0651544, + "auxiliary_loss_mlp": 0.01281122, + "balance_loss_clip": 0.06296271, + "balance_loss_mlp": 0.01261488, + "epoch": 0.2664662558244401, + "flos": 23082638878080.0, + "grad_norm": 2.4012085548553537, + "language_loss": 0.76319212, + "learning_rate": 3.44155028679496e-06, + "loss": 0.84115773, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19641113, + "step": 4432, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.06513382, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 0.0629918, + "balance_loss_mlp": 0.01259011, + "epoch": 0.2665263790771081, + "flos": 23776098468480.0, + "grad_norm": 1.7645797084145118, + "language_loss": 0.8352288, + "learning_rate": 3.441280296720154e-06, + "loss": 0.91315603, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.20324707, + "step": 4433, + "time_per_iteration": 2.5431323051452637 + }, + { + "auxiliary_loss_clip": 0.06506403, + "auxiliary_loss_mlp": 0.01279917, + "balance_loss_clip": 0.06294529, + "balance_loss_mlp": 0.01260248, + "epoch": 0.26658650232977604, + "flos": 28008748414080.0, + "grad_norm": 2.0130085710694097, + "language_loss": 0.77006185, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84792507, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.19677734, + "step": 4434, + "time_per_iteration": 2.626286268234253 + }, + { + "auxiliary_loss_clip": 0.06505096, + "auxiliary_loss_mlp": 0.01274565, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255563, + "epoch": 0.266646625582444, + "flos": 22170147914880.0, + "grad_norm": 1.9216331890087734, + "language_loss": 0.82643783, + "learning_rate": 3.440740152620301e-06, + "loss": 0.90423441, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.18994141, + "step": 4435, + "time_per_iteration": 2.519731283187866 + }, + { + "auxiliary_loss_clip": 0.06515168, + "auxiliary_loss_mlp": 0.01287569, + "balance_loss_clip": 0.06296054, + "balance_loss_mlp": 0.01267065, + "epoch": 0.266706748835112, + "flos": 27860687049600.0, + "grad_norm": 2.5550616111147257, + "language_loss": 0.88173652, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95976388, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2052002, + "step": 4436, + "time_per_iteration": 2.5790481567382812 + }, + { + "auxiliary_loss_clip": 0.0650726, + "auxiliary_loss_mlp": 0.01276794, + "balance_loss_clip": 0.0629128, + "balance_loss_mlp": 0.01258507, + "epoch": 0.26676687208777994, + "flos": 25819231299840.0, + "grad_norm": 5.920609689832761, + "language_loss": 0.79025435, + "learning_rate": 3.440199789988407e-06, + "loss": 0.86809486, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1829834, + "step": 4437, + "time_per_iteration": 3.9761762619018555 + }, + { + "auxiliary_loss_clip": 0.06508271, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06295269, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2668269953404479, + "flos": 36073399207680.0, + "grad_norm": 3.5501154130665333, + "language_loss": 0.64866304, + "learning_rate": 3.439929526748556e-06, + "loss": 0.72648954, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18322754, + "step": 4438, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.0650841, + "auxiliary_loss_mlp": 0.01282243, + "balance_loss_clip": 0.0629243, + "balance_loss_mlp": 0.01263015, + "epoch": 0.26688711859311587, + "flos": 26576994499200.0, + "grad_norm": 1.9779853569110368, + "language_loss": 0.76120412, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83911061, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1920166, + "step": 4439, + "time_per_iteration": 2.5468099117279053 + }, + { + "auxiliary_loss_clip": 0.06509372, + "auxiliary_loss_mlp": 0.01279302, + "balance_loss_clip": 0.06293344, + "balance_loss_mlp": 0.01259156, + "epoch": 0.26694724184578383, + "flos": 26768968202880.0, + "grad_norm": 1.7452542153948158, + "language_loss": 0.71747917, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.79536593, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20141602, + "step": 4440, + "time_per_iteration": 2.5845727920532227 + }, + { + "auxiliary_loss_clip": 0.06513558, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06297302, + "balance_loss_mlp": 0.01258003, + "epoch": 0.2670073650984518, + "flos": 20965894634880.0, + "grad_norm": 2.018310090260772, + "language_loss": 0.67180222, + "learning_rate": 3.439118409456376e-06, + "loss": 0.74972624, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.20837402, + "step": 4441, + "time_per_iteration": 4.018662691116333 + }, + { + "auxiliary_loss_clip": 0.06511593, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06295494, + "balance_loss_mlp": 0.01260692, + "epoch": 0.2670674883511198, + "flos": 28373577091200.0, + "grad_norm": 1.7028334543675463, + "language_loss": 0.77360296, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.8515327, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20690918, + "step": 4442, + "time_per_iteration": 2.613529682159424 + }, + { + "auxiliary_loss_clip": 0.06397913, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06295023, + "balance_loss_mlp": 0.01259818, + "epoch": 0.2671276116037878, + "flos": 58989010970880.0, + "grad_norm": 0.9159689493293411, + "language_loss": 0.61561328, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.6922372, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04653931, + "step": 4443, + "time_per_iteration": 4.460381031036377 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01278926, + "balance_loss_clip": 0.06294855, + "balance_loss_mlp": 0.0126021, + "epoch": 0.26718773485645575, + "flos": 43955132538240.0, + "grad_norm": 8.593795125602613, + "language_loss": 0.76795793, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.845855, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.18725586, + "step": 4444, + "time_per_iteration": 2.7442104816436768 + }, + { + "auxiliary_loss_clip": 0.06512623, + "auxiliary_loss_mlp": 0.0127732, + "balance_loss_clip": 0.06297334, + "balance_loss_mlp": 0.01256255, + "epoch": 0.2672478581091237, + "flos": 25235329322880.0, + "grad_norm": 2.0392997213265867, + "language_loss": 0.81111336, + "learning_rate": 3.438036155780158e-06, + "loss": 0.88901269, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21057129, + "step": 4445, + "time_per_iteration": 2.5493359565734863 + }, + { + "auxiliary_loss_clip": 0.06511448, + "auxiliary_loss_mlp": 0.01275318, + "balance_loss_clip": 0.0629541, + "balance_loss_mlp": 0.01256054, + "epoch": 0.2673079813617917, + "flos": 15273594564480.0, + "grad_norm": 1.8279407549944744, + "language_loss": 0.89906365, + "learning_rate": 3.43776545600926e-06, + "loss": 0.97693127, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19262695, + "step": 4446, + "time_per_iteration": 2.536916971206665 + }, + { + "auxiliary_loss_clip": 0.06512347, + "auxiliary_loss_mlp": 0.01275408, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256894, + "epoch": 0.26736810461445965, + "flos": 25819944059520.0, + "grad_norm": 1.8969857257431861, + "language_loss": 0.68977708, + "learning_rate": 3.437494701718153e-06, + "loss": 0.76765466, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18518066, + "step": 4447, + "time_per_iteration": 4.071701526641846 + }, + { + "auxiliary_loss_clip": 0.06511723, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 0.06295793, + "balance_loss_mlp": 0.01259116, + "epoch": 0.2674282278671276, + "flos": 24318981072000.0, + "grad_norm": 1.8615578685879888, + "language_loss": 0.83522677, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.91313618, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2010498, + "step": 4448, + "time_per_iteration": 2.581207036972046 + }, + { + "auxiliary_loss_clip": 0.06506026, + "auxiliary_loss_mlp": 0.0127612, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2674883511197956, + "flos": 22821330320640.0, + "grad_norm": 1.5806903023960923, + "language_loss": 0.84385109, + "learning_rate": 3.436953029616378e-06, + "loss": 0.92167258, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19262695, + "step": 4449, + "time_per_iteration": 2.556368827819824 + }, + { + "auxiliary_loss_clip": 0.06523807, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 0.06298804, + "balance_loss_mlp": 0.01256679, + "epoch": 0.26754847437246354, + "flos": 25376514652800.0, + "grad_norm": 2.5106466446094275, + "language_loss": 0.84170121, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.91972435, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 4450, + "time_per_iteration": 2.540792465209961 + }, + { + "auxiliary_loss_clip": 0.06503032, + "auxiliary_loss_mlp": 0.01274274, + "balance_loss_clip": 0.06293193, + "balance_loss_mlp": 0.01255248, + "epoch": 0.2676085976251315, + "flos": 20236698478080.0, + "grad_norm": 1.7838817445044992, + "language_loss": 0.81239712, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8901701, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19042969, + "step": 4451, + "time_per_iteration": 2.552764892578125 + }, + { + "auxiliary_loss_clip": 0.06515267, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06304526, + "balance_loss_mlp": 0.01257324, + "epoch": 0.26766872087779947, + "flos": 28045784718720.0, + "grad_norm": 1.859886698365648, + "language_loss": 0.87156057, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94947314, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 4452, + "time_per_iteration": 2.580838918685913 + }, + { + "auxiliary_loss_clip": 0.06515863, + "auxiliary_loss_mlp": 0.01278142, + "balance_loss_clip": 0.06301846, + "balance_loss_mlp": 0.01258377, + "epoch": 0.26772884413046744, + "flos": 18329803585920.0, + "grad_norm": 2.0572254627861577, + "language_loss": 0.84003425, + "learning_rate": 3.435869031622194e-06, + "loss": 0.91797435, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19775391, + "step": 4453, + "time_per_iteration": 2.5120368003845215 + }, + { + "auxiliary_loss_clip": 0.06513035, + "auxiliary_loss_mlp": 0.01281566, + "balance_loss_clip": 0.06298169, + "balance_loss_mlp": 0.01261992, + "epoch": 0.2677889673831354, + "flos": 22134075932160.0, + "grad_norm": 1.66096029715733, + "language_loss": 0.79950684, + "learning_rate": 3.435597895977208e-06, + "loss": 0.87745285, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19580078, + "step": 4454, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.06518991, + "auxiliary_loss_mlp": 0.0127963, + "balance_loss_clip": 0.0630191, + "balance_loss_mlp": 0.01259949, + "epoch": 0.2678490906358034, + "flos": 23736001489920.0, + "grad_norm": 1.4726826789128313, + "language_loss": 0.72626883, + "learning_rate": 3.435326705894206e-06, + "loss": 0.80425501, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.19689941, + "step": 4455, + "time_per_iteration": 2.600341558456421 + }, + { + "auxiliary_loss_clip": 0.0650526, + "auxiliary_loss_mlp": 0.01280807, + "balance_loss_clip": 0.06295176, + "balance_loss_mlp": 0.01262675, + "epoch": 0.2679092138884714, + "flos": 21769414963200.0, + "grad_norm": 1.6724393178855028, + "language_loss": 0.74066579, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81852639, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18139648, + "step": 4456, + "time_per_iteration": 2.5469894409179688 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01278452, + "balance_loss_clip": 0.06300029, + "balance_loss_mlp": 0.01258127, + "epoch": 0.26796933714113935, + "flos": 19866670848000.0, + "grad_norm": 2.417277333537857, + "language_loss": 0.71260488, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.79059041, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20324707, + "step": 4457, + "time_per_iteration": 2.592397451400757 + }, + { + "auxiliary_loss_clip": 0.06517951, + "auxiliary_loss_mlp": 0.01279854, + "balance_loss_clip": 0.06301091, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2680294603938073, + "flos": 20054116431360.0, + "grad_norm": 2.0107664890053143, + "language_loss": 0.79466271, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87264079, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20666504, + "step": 4458, + "time_per_iteration": 2.5134661197662354 + }, + { + "auxiliary_loss_clip": 0.06383923, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06281242, + "balance_loss_mlp": 0.01258718, + "epoch": 0.2680895836464753, + "flos": 72134918334720.0, + "grad_norm": 0.8734266993254428, + "language_loss": 0.5870322, + "learning_rate": 3.434241401387739e-06, + "loss": 0.66350281, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.04437256, + "step": 4459, + "time_per_iteration": 3.2277050018310547 + }, + { + "auxiliary_loss_clip": 0.06506394, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06292672, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26814970689914325, + "flos": 20455310580480.0, + "grad_norm": 1.8403982609946155, + "language_loss": 0.85477257, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.93258202, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.18920898, + "step": 4460, + "time_per_iteration": 2.513317346572876 + }, + { + "auxiliary_loss_clip": 0.06504844, + "auxiliary_loss_mlp": 0.01281285, + "balance_loss_clip": 0.06292892, + "balance_loss_mlp": 0.01261866, + "epoch": 0.2682098301518112, + "flos": 17572459656960.0, + "grad_norm": 1.8133404743184358, + "language_loss": 0.69389015, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 4461, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.06506921, + "auxiliary_loss_mlp": 0.01281085, + "balance_loss_clip": 0.06293105, + "balance_loss_mlp": 0.01260152, + "epoch": 0.2682699534044792, + "flos": 18339237169920.0, + "grad_norm": 1.6584506269914416, + "language_loss": 0.67031932, + "learning_rate": 3.43342685191282e-06, + "loss": 0.74819934, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.20935059, + "step": 4462, + "time_per_iteration": 2.5427775382995605 + }, + { + "auxiliary_loss_clip": 0.06508102, + "auxiliary_loss_mlp": 0.01282385, + "balance_loss_clip": 0.0629629, + "balance_loss_mlp": 0.01263287, + "epoch": 0.26833007665714714, + "flos": 25308311829120.0, + "grad_norm": 1.7808644454945033, + "language_loss": 0.69747704, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.77538192, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19116211, + "step": 4463, + "time_per_iteration": 2.6194493770599365 + }, + { + "auxiliary_loss_clip": 0.06508362, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06291216, + "balance_loss_mlp": 0.0126092, + "epoch": 0.2683901999098151, + "flos": 16104046780800.0, + "grad_norm": 2.9245690778148465, + "language_loss": 0.78600121, + "learning_rate": 3.432883547133931e-06, + "loss": 0.86389446, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20056152, + "step": 4464, + "time_per_iteration": 2.463418483734131 + }, + { + "auxiliary_loss_clip": 0.06508331, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06294504, + "balance_loss_mlp": 0.01262154, + "epoch": 0.2684503231624831, + "flos": 27315414604800.0, + "grad_norm": 1.7531136867378412, + "language_loss": 0.71091688, + "learning_rate": 3.432611813236704e-06, + "loss": 0.78881842, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19665527, + "step": 4465, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.06379254, + "auxiliary_loss_mlp": 0.01259677, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01255094, + "epoch": 0.26851044641515104, + "flos": 71879060292480.0, + "grad_norm": 0.6551429372657154, + "language_loss": 0.52683848, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.60322779, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.04577637, + "step": 4466, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.06507096, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_clip": 0.06291512, + "balance_loss_mlp": 0.01263105, + "epoch": 0.268570569667819, + "flos": 18739676632320.0, + "grad_norm": 10.994589827837663, + "language_loss": 0.74195564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81986099, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20324707, + "step": 4467, + "time_per_iteration": 2.4971463680267334 + }, + { + "auxiliary_loss_clip": 0.06517448, + "auxiliary_loss_mlp": 0.01283031, + "balance_loss_clip": 0.06297839, + "balance_loss_mlp": 0.01264005, + "epoch": 0.268630692920487, + "flos": 18182832324480.0, + "grad_norm": 2.2391086352503504, + "language_loss": 0.81577581, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.89378059, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19042969, + "step": 4468, + "time_per_iteration": 2.547626256942749 + }, + { + "auxiliary_loss_clip": 0.06377872, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06275174, + "balance_loss_mlp": 0.01259552, + "epoch": 0.268690816173155, + "flos": 68754229176960.0, + "grad_norm": 0.8279608156690638, + "language_loss": 0.59413958, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.67056012, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.0461731, + "step": 4469, + "time_per_iteration": 3.2565419673919678 + }, + { + "auxiliary_loss_clip": 0.06507242, + "auxiliary_loss_mlp": 0.01284548, + "balance_loss_clip": 0.06292132, + "balance_loss_mlp": 0.01263304, + "epoch": 0.26875093942582295, + "flos": 23300160877440.0, + "grad_norm": 1.9707129205098373, + "language_loss": 0.8163017, + "learning_rate": 3.431252329084972e-06, + "loss": 0.89421958, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21240234, + "step": 4470, + "time_per_iteration": 2.542893171310425 + }, + { + "auxiliary_loss_clip": 0.06497125, + "auxiliary_loss_mlp": 0.0128145, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.012619, + "epoch": 0.2688110626784909, + "flos": 21549880465920.0, + "grad_norm": 1.5945085425671264, + "language_loss": 0.83326346, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.91104919, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19555664, + "step": 4471, + "time_per_iteration": 2.5213489532470703 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01284463, + "balance_loss_clip": 0.06289607, + "balance_loss_mlp": 0.01264365, + "epoch": 0.2688711859311589, + "flos": 28407804284160.0, + "grad_norm": 1.9607526414443455, + "language_loss": 0.70046443, + "learning_rate": 3.43070815543947e-06, + "loss": 0.77828562, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.20092773, + "step": 4472, + "time_per_iteration": 2.6251678466796875 + }, + { + "auxiliary_loss_clip": 0.06504884, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 0.06293008, + "balance_loss_mlp": 0.01263112, + "epoch": 0.26893130918382685, + "flos": 26002148762880.0, + "grad_norm": 1.9293915951077794, + "language_loss": 0.68364072, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.76151299, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.19213867, + "step": 4473, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.06499921, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01259467, + "epoch": 0.2689914324364948, + "flos": 20345878748160.0, + "grad_norm": 1.608174101079712, + "language_loss": 0.83682281, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91461158, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.19470215, + "step": 4474, + "time_per_iteration": 2.554151773452759 + }, + { + "auxiliary_loss_clip": 0.06502855, + "auxiliary_loss_mlp": 0.01275806, + "balance_loss_clip": 0.06296148, + "balance_loss_mlp": 0.01256482, + "epoch": 0.2690515556891628, + "flos": 19470759505920.0, + "grad_norm": 1.847749203594977, + "language_loss": 0.70725596, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78504252, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.19348145, + "step": 4475, + "time_per_iteration": 2.5116677284240723 + }, + { + "auxiliary_loss_clip": 0.06503256, + "auxiliary_loss_mlp": 0.01277275, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01257188, + "epoch": 0.26911167894183075, + "flos": 18151875440640.0, + "grad_norm": 2.2814450019498236, + "language_loss": 0.73125452, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.80905986, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20092773, + "step": 4476, + "time_per_iteration": 3.923501968383789 + }, + { + "auxiliary_loss_clip": 0.0650249, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06291398, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2691718021944987, + "flos": 19981385487360.0, + "grad_norm": 1.4862356596427981, + "language_loss": 0.80676347, + "learning_rate": 3.429346772085922e-06, + "loss": 0.88453096, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18762207, + "step": 4477, + "time_per_iteration": 2.562681198120117 + }, + { + "auxiliary_loss_clip": 0.06506729, + "auxiliary_loss_mlp": 0.01275723, + "balance_loss_clip": 0.06289821, + "balance_loss_mlp": 0.01254873, + "epoch": 0.2692319254471667, + "flos": 37455622560000.0, + "grad_norm": 1.8507584096301994, + "language_loss": 0.65612036, + "learning_rate": 3.429074332770984e-06, + "loss": 0.73394483, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20861816, + "step": 4478, + "time_per_iteration": 2.6743321418762207 + }, + { + "auxiliary_loss_clip": 0.06505084, + "auxiliary_loss_mlp": 0.01278495, + "balance_loss_clip": 0.06291381, + "balance_loss_mlp": 0.01259242, + "epoch": 0.26929204869983464, + "flos": 22134411348480.0, + "grad_norm": 2.2415663972983864, + "language_loss": 0.81841063, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89624637, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19250488, + "step": 4479, + "time_per_iteration": 2.563365936279297 + }, + { + "auxiliary_loss_clip": 0.06510025, + "auxiliary_loss_mlp": 0.01277354, + "balance_loss_clip": 0.06295313, + "balance_loss_mlp": 0.01258305, + "epoch": 0.2693521719525026, + "flos": 19799055002880.0, + "grad_norm": 1.97047433874797, + "language_loss": 0.81362212, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.89149588, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.19055176, + "step": 4480, + "time_per_iteration": 2.505098342895508 + }, + { + "auxiliary_loss_clip": 0.06504171, + "auxiliary_loss_mlp": 0.01276381, + "balance_loss_clip": 0.06296593, + "balance_loss_mlp": 0.01257677, + "epoch": 0.2694122952051706, + "flos": 21000415317120.0, + "grad_norm": 1.6210366032838512, + "language_loss": 0.7826978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.86050338, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18713379, + "step": 4481, + "time_per_iteration": 4.100890874862671 + }, + { + "auxiliary_loss_clip": 0.06511036, + "auxiliary_loss_mlp": 0.01275006, + "balance_loss_clip": 0.06298155, + "balance_loss_mlp": 0.01254192, + "epoch": 0.2694724184578386, + "flos": 25856519166720.0, + "grad_norm": 1.8924674974759383, + "language_loss": 0.74293458, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.820795, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.20788574, + "step": 4482, + "time_per_iteration": 4.145740747451782 + }, + { + "auxiliary_loss_clip": 0.06511661, + "auxiliary_loss_mlp": 0.01276613, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01256836, + "epoch": 0.26953254171050656, + "flos": 21733594542720.0, + "grad_norm": 2.48131981073459, + "language_loss": 0.72700799, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.80489069, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19763184, + "step": 4483, + "time_per_iteration": 2.5375680923461914 + }, + { + "auxiliary_loss_clip": 0.06523035, + "auxiliary_loss_mlp": 0.01278438, + "balance_loss_clip": 0.0630566, + "balance_loss_mlp": 0.01257994, + "epoch": 0.2695926649631745, + "flos": 19689078119040.0, + "grad_norm": 2.054691934345778, + "language_loss": 0.87485874, + "learning_rate": 3.427438559239605e-06, + "loss": 0.95287347, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20446777, + "step": 4484, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.06515766, + "auxiliary_loss_mlp": 0.01278738, + "balance_loss_clip": 0.06300886, + "balance_loss_mlp": 0.01259474, + "epoch": 0.2696527882158425, + "flos": 32894257847040.0, + "grad_norm": 2.0183728032076966, + "language_loss": 0.66971946, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74766451, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19262695, + "step": 4485, + "time_per_iteration": 2.623896598815918 + }, + { + "auxiliary_loss_clip": 0.06514997, + "auxiliary_loss_mlp": 0.01282999, + "balance_loss_clip": 0.06301111, + "balance_loss_mlp": 0.01262877, + "epoch": 0.26971291146851045, + "flos": 12128806177920.0, + "grad_norm": 3.3281733059389498, + "language_loss": 0.74281263, + "learning_rate": 3.426892868256604e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2010498, + "step": 4486, + "time_per_iteration": 2.525820016860962 + }, + { + "auxiliary_loss_clip": 0.06519947, + "auxiliary_loss_mlp": 0.01289409, + "balance_loss_clip": 0.06302445, + "balance_loss_mlp": 0.01268846, + "epoch": 0.2697730347211784, + "flos": 22640467282560.0, + "grad_norm": 2.8316541967285183, + "language_loss": 0.84592897, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.92402256, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20556641, + "step": 4487, + "time_per_iteration": 3.936244249343872 + }, + { + "auxiliary_loss_clip": 0.06520635, + "auxiliary_loss_mlp": 0.01285695, + "balance_loss_clip": 0.06303369, + "balance_loss_mlp": 0.01264845, + "epoch": 0.2698331579738464, + "flos": 23519695374720.0, + "grad_norm": 2.431656191901387, + "language_loss": 0.73194599, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.81000936, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20861816, + "step": 4488, + "time_per_iteration": 2.522861957550049 + }, + { + "auxiliary_loss_clip": 0.06516892, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 0.06303044, + "balance_loss_mlp": 0.01258681, + "epoch": 0.26989328122651435, + "flos": 24647360423040.0, + "grad_norm": 1.6427618857215789, + "language_loss": 0.84162384, + "learning_rate": 3.426073925998578e-06, + "loss": 0.91957808, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.1986084, + "step": 4489, + "time_per_iteration": 2.558133602142334 + }, + { + "auxiliary_loss_clip": 0.06523076, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01265821, + "epoch": 0.2699534044791823, + "flos": 10775904554880.0, + "grad_norm": 2.0847356564254014, + "language_loss": 0.90199494, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.98009604, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.21228027, + "step": 4490, + "time_per_iteration": 2.461840867996216 + }, + { + "auxiliary_loss_clip": 0.06505966, + "auxiliary_loss_mlp": 0.01275421, + "balance_loss_clip": 0.06297465, + "balance_loss_mlp": 0.01256288, + "epoch": 0.2700135277318503, + "flos": 36180021928320.0, + "grad_norm": 2.13129158363681, + "language_loss": 0.73836827, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.81618214, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19128418, + "step": 4491, + "time_per_iteration": 2.6479640007019043 + }, + { + "auxiliary_loss_clip": 0.06516409, + "auxiliary_loss_mlp": 0.01284517, + "balance_loss_clip": 0.06303698, + "balance_loss_mlp": 0.01264788, + "epoch": 0.27007365098451824, + "flos": 17424020949120.0, + "grad_norm": 2.8438546283757793, + "language_loss": 0.74296927, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.82097852, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 4492, + "time_per_iteration": 2.462226629257202 + }, + { + "auxiliary_loss_clip": 0.06510016, + "auxiliary_loss_mlp": 0.01279369, + "balance_loss_clip": 0.06300159, + "balance_loss_mlp": 0.01259926, + "epoch": 0.2701337742371862, + "flos": 23192448053760.0, + "grad_norm": 1.7359009481863723, + "language_loss": 0.88954818, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.96744204, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19445801, + "step": 4493, + "time_per_iteration": 2.5385639667510986 + }, + { + "auxiliary_loss_clip": 0.06509903, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 0.06296834, + "balance_loss_mlp": 0.01265201, + "epoch": 0.2701938974898542, + "flos": 24396365917440.0, + "grad_norm": 1.3961943163888275, + "language_loss": 0.71571529, + "learning_rate": 3.424707940835998e-06, + "loss": 0.79365045, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1842041, + "step": 4494, + "time_per_iteration": 2.542644500732422 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01282381, + "balance_loss_clip": 0.0629191, + "balance_loss_mlp": 0.01263713, + "epoch": 0.2702540207425222, + "flos": 26221641333120.0, + "grad_norm": 2.6689304552375366, + "language_loss": 0.8697859, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94760156, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.18652344, + "step": 4495, + "time_per_iteration": 2.6052844524383545 + }, + { + "auxiliary_loss_clip": 0.06507061, + "auxiliary_loss_mlp": 0.01284126, + "balance_loss_clip": 0.06293719, + "balance_loss_mlp": 0.01263944, + "epoch": 0.27031414399519016, + "flos": 22932439234560.0, + "grad_norm": 1.7866659337876034, + "language_loss": 0.76608586, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84399772, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20166016, + "step": 4496, + "time_per_iteration": 2.5191855430603027 + }, + { + "auxiliary_loss_clip": 0.06445029, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06340651, + "balance_loss_mlp": 0.01257498, + "epoch": 0.2703742672478581, + "flos": 63037904912640.0, + "grad_norm": 0.6591771406427821, + "language_loss": 0.49976462, + "learning_rate": 3.423887701354754e-06, + "loss": 0.57683551, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.0456543, + "step": 4497, + "time_per_iteration": 3.2403736114501953 + }, + { + "auxiliary_loss_clip": 0.06506558, + "auxiliary_loss_mlp": 0.01283587, + "balance_loss_clip": 0.06295481, + "balance_loss_mlp": 0.01266039, + "epoch": 0.2704343905005261, + "flos": 18846341280000.0, + "grad_norm": 2.8639988273107657, + "language_loss": 0.72431815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80221957, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17553711, + "step": 4498, + "time_per_iteration": 2.509298086166382 + }, + { + "auxiliary_loss_clip": 0.06432115, + "auxiliary_loss_mlp": 0.01259251, + "balance_loss_clip": 0.06327531, + "balance_loss_mlp": 0.01254679, + "epoch": 0.27049451375319405, + "flos": 71253635817600.0, + "grad_norm": 0.9422572009255263, + "language_loss": 0.5900467, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.66696036, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04577637, + "step": 4499, + "time_per_iteration": 3.2116270065307617 + }, + { + "auxiliary_loss_clip": 0.06502165, + "auxiliary_loss_mlp": 0.01281307, + "balance_loss_clip": 0.06292122, + "balance_loss_mlp": 0.01261422, + "epoch": 0.270554637005862, + "flos": 24285257003520.0, + "grad_norm": 2.589715304320551, + "language_loss": 0.73975158, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19897461, + "step": 4500, + "time_per_iteration": 2.537710189819336 + }, + { + "auxiliary_loss_clip": 0.06501484, + "auxiliary_loss_mlp": 0.01276741, + "balance_loss_clip": 0.06289591, + "balance_loss_mlp": 0.01257965, + "epoch": 0.27061476025853, + "flos": 17636889047040.0, + "grad_norm": 2.788947169536346, + "language_loss": 0.81470346, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.89248574, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18774414, + "step": 4501, + "time_per_iteration": 2.5423648357391357 + }, + { + "auxiliary_loss_clip": 0.06510358, + "auxiliary_loss_mlp": 0.01287368, + "balance_loss_clip": 0.06294559, + "balance_loss_mlp": 0.01267579, + "epoch": 0.27067488351119795, + "flos": 22716594316800.0, + "grad_norm": 1.5278818221734496, + "language_loss": 0.7303015, + "learning_rate": 3.422519555811735e-06, + "loss": 0.8082788, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.19775391, + "step": 4502, + "time_per_iteration": 2.5804011821746826 + }, + { + "auxiliary_loss_clip": 0.06507368, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06289332, + "balance_loss_mlp": 0.01258576, + "epoch": 0.2707350067638659, + "flos": 41729333806080.0, + "grad_norm": 1.6949775973694576, + "language_loss": 0.69090897, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.76876605, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19763184, + "step": 4503, + "time_per_iteration": 2.740292549133301 + }, + { + "auxiliary_loss_clip": 0.06502387, + "auxiliary_loss_mlp": 0.0128307, + "balance_loss_clip": 0.06290283, + "balance_loss_mlp": 0.01263746, + "epoch": 0.2707951300165339, + "flos": 20199159048960.0, + "grad_norm": 1.9752400870870641, + "language_loss": 0.69172543, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.76958001, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1932373, + "step": 4504, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.06502561, + "auxiliary_loss_mlp": 0.0128216, + "balance_loss_clip": 0.06291538, + "balance_loss_mlp": 0.01263492, + "epoch": 0.27085525326920185, + "flos": 21440364779520.0, + "grad_norm": 2.9855030089462993, + "language_loss": 0.76122642, + "learning_rate": 3.421698021097902e-06, + "loss": 0.8390736, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18652344, + "step": 4505, + "time_per_iteration": 2.527165651321411 + }, + { + "auxiliary_loss_clip": 0.06505956, + "auxiliary_loss_mlp": 0.0128432, + "balance_loss_clip": 0.06289993, + "balance_loss_mlp": 0.01264459, + "epoch": 0.2709153765218698, + "flos": 17680885240320.0, + "grad_norm": 2.0693026918396487, + "language_loss": 0.73959178, + "learning_rate": 3.42142406835758e-06, + "loss": 0.81749451, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1986084, + "step": 4506, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.0650361, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 0.06290495, + "balance_loss_mlp": 0.01258595, + "epoch": 0.2709754997745378, + "flos": 24462136972800.0, + "grad_norm": 1.8128724600792683, + "language_loss": 0.81647539, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89429414, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1965332, + "step": 4507, + "time_per_iteration": 2.684535503387451 + }, + { + "auxiliary_loss_clip": 0.06395597, + "auxiliary_loss_mlp": 0.01254395, + "balance_loss_clip": 0.0629042, + "balance_loss_mlp": 0.01250205, + "epoch": 0.2710356230272058, + "flos": 65229602232960.0, + "grad_norm": 0.712447813073055, + "language_loss": 0.50718415, + "learning_rate": 3.420876001185698e-06, + "loss": 0.58368409, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04193115, + "step": 4508, + "time_per_iteration": 3.111752986907959 + }, + { + "auxiliary_loss_clip": 0.0649793, + "auxiliary_loss_mlp": 0.01272465, + "balance_loss_clip": 0.06289998, + "balance_loss_mlp": 0.01255263, + "epoch": 0.27109574627987376, + "flos": 25491606635520.0, + "grad_norm": 2.0258218163980213, + "language_loss": 0.75015354, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.82785749, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.171875, + "step": 4509, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.06495094, + "auxiliary_loss_mlp": 0.01275639, + "balance_loss_clip": 0.06289092, + "balance_loss_mlp": 0.01256947, + "epoch": 0.2711558695325417, + "flos": 19688910410880.0, + "grad_norm": 2.3712253737099767, + "language_loss": 0.71864915, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.79635644, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18688965, + "step": 4510, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06499062, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.0629103, + "balance_loss_mlp": 0.012608, + "epoch": 0.2712159927852097, + "flos": 18593627765760.0, + "grad_norm": 2.5496745820614515, + "language_loss": 0.71357799, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.791363, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.1862793, + "step": 4511, + "time_per_iteration": 2.483739137649536 + }, + { + "auxiliary_loss_clip": 0.06505338, + "auxiliary_loss_mlp": 0.01274141, + "balance_loss_clip": 0.06292383, + "balance_loss_mlp": 0.01254817, + "epoch": 0.27127611603787766, + "flos": 25637403939840.0, + "grad_norm": 1.9202075405224084, + "language_loss": 0.81604505, + "learning_rate": 3.419779220367979e-06, + "loss": 0.89383984, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1932373, + "step": 4512, + "time_per_iteration": 2.593388795852661 + }, + { + "auxiliary_loss_clip": 0.06503928, + "auxiliary_loss_mlp": 0.01273233, + "balance_loss_clip": 0.06296667, + "balance_loss_mlp": 0.01255554, + "epoch": 0.2713362392905456, + "flos": 23155663311360.0, + "grad_norm": 1.8072498717910284, + "language_loss": 0.809147, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88691866, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.17663574, + "step": 4513, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.06501831, + "auxiliary_loss_mlp": 0.01278947, + "balance_loss_clip": 0.0628939, + "balance_loss_mlp": 0.01261018, + "epoch": 0.2713963625432136, + "flos": 18371409937920.0, + "grad_norm": 3.81368034370299, + "language_loss": 0.88867396, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.96648169, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17932129, + "step": 4514, + "time_per_iteration": 2.54484224319458 + }, + { + "auxiliary_loss_clip": 0.06502509, + "auxiliary_loss_mlp": 0.01277056, + "balance_loss_clip": 0.06292502, + "balance_loss_mlp": 0.01258709, + "epoch": 0.27145648579588155, + "flos": 22498275703680.0, + "grad_norm": 1.610354502574947, + "language_loss": 0.92402363, + "learning_rate": 3.418956069417517e-06, + "loss": 1.00181937, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18347168, + "step": 4515, + "time_per_iteration": 2.5121350288391113 + }, + { + "auxiliary_loss_clip": 0.06511631, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06296228, + "balance_loss_mlp": 0.01259669, + "epoch": 0.2715166090485495, + "flos": 19244265120000.0, + "grad_norm": 2.423654901761582, + "language_loss": 0.73979908, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.81772685, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21435547, + "step": 4516, + "time_per_iteration": 3.917318344116211 + }, + { + "auxiliary_loss_clip": 0.06498563, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289151, + "balance_loss_mlp": 0.01253627, + "epoch": 0.2715767323012175, + "flos": 17714902798080.0, + "grad_norm": 1.854313921742246, + "language_loss": 0.76927733, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84699214, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19287109, + "step": 4517, + "time_per_iteration": 2.576723098754883 + }, + { + "auxiliary_loss_clip": 0.06500702, + "auxiliary_loss_mlp": 0.01276287, + "balance_loss_clip": 0.06291518, + "balance_loss_mlp": 0.01256701, + "epoch": 0.27163685555388545, + "flos": 22389430849920.0, + "grad_norm": 2.0334929641517956, + "language_loss": 0.7833634, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.86113334, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19592285, + "step": 4518, + "time_per_iteration": 2.5335004329681396 + }, + { + "auxiliary_loss_clip": 0.06502728, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06292961, + "balance_loss_mlp": 0.0125925, + "epoch": 0.2716969788065534, + "flos": 22353358867200.0, + "grad_norm": 1.6261203259974584, + "language_loss": 0.68873644, + "learning_rate": 3.41785778156811e-06, + "loss": 0.76653063, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17443848, + "step": 4519, + "time_per_iteration": 2.60939359664917 + }, + { + "auxiliary_loss_clip": 0.06500532, + "auxiliary_loss_mlp": 0.0127723, + "balance_loss_clip": 0.06291862, + "balance_loss_mlp": 0.01260302, + "epoch": 0.2717571020592214, + "flos": 25235497031040.0, + "grad_norm": 1.9620818548787327, + "language_loss": 0.75925875, + "learning_rate": 3.417583075166451e-06, + "loss": 0.83703637, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16931152, + "step": 4520, + "time_per_iteration": 3.988518238067627 + }, + { + "auxiliary_loss_clip": 0.06503896, + "auxiliary_loss_mlp": 0.012736, + "balance_loss_clip": 0.06291716, + "balance_loss_mlp": 0.01253942, + "epoch": 0.2718172253118894, + "flos": 20195343688320.0, + "grad_norm": 3.05783023991908, + "language_loss": 0.76690799, + "learning_rate": 3.4173083150099e-06, + "loss": 0.84468293, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1965332, + "step": 4521, + "time_per_iteration": 3.9463987350463867 + }, + { + "auxiliary_loss_clip": 0.0650706, + "auxiliary_loss_mlp": 0.0127528, + "balance_loss_clip": 0.06291709, + "balance_loss_mlp": 0.01255432, + "epoch": 0.27187734856455736, + "flos": 14324318858880.0, + "grad_norm": 2.0792585055499435, + "language_loss": 0.74927616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.82709956, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19824219, + "step": 4522, + "time_per_iteration": 2.576792001724243 + }, + { + "auxiliary_loss_clip": 0.06503602, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06291734, + "balance_loss_mlp": 0.01258884, + "epoch": 0.27193747181722533, + "flos": 21114375269760.0, + "grad_norm": 1.7974712998396492, + "language_loss": 0.73055947, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80836433, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17993164, + "step": 4523, + "time_per_iteration": 2.5116758346557617 + }, + { + "auxiliary_loss_clip": 0.06493908, + "auxiliary_loss_mlp": 0.01278011, + "balance_loss_clip": 0.06286807, + "balance_loss_mlp": 0.01259665, + "epoch": 0.2719975950698933, + "flos": 19688910410880.0, + "grad_norm": 1.3231652709358832, + "language_loss": 0.74779463, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82551384, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.18334961, + "step": 4524, + "time_per_iteration": 2.5318901538848877 + }, + { + "auxiliary_loss_clip": 0.06503987, + "auxiliary_loss_mlp": 0.01277059, + "balance_loss_clip": 0.06291917, + "balance_loss_mlp": 0.01258248, + "epoch": 0.27205771832256126, + "flos": 24761488083840.0, + "grad_norm": 2.222226091972884, + "language_loss": 0.76783192, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.84564239, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18811035, + "step": 4525, + "time_per_iteration": 2.594209909439087 + }, + { + "auxiliary_loss_clip": 0.06492639, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.0628486, + "balance_loss_mlp": 0.01254712, + "epoch": 0.2721178415752292, + "flos": 21760903774080.0, + "grad_norm": 1.8877793172534498, + "language_loss": 0.82166058, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.89930463, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17041016, + "step": 4526, + "time_per_iteration": 3.9739785194396973 + }, + { + "auxiliary_loss_clip": 0.06510428, + "auxiliary_loss_mlp": 0.01273954, + "balance_loss_clip": 0.06292043, + "balance_loss_mlp": 0.01254189, + "epoch": 0.2721779648278972, + "flos": 12681667416960.0, + "grad_norm": 2.608637418907724, + "language_loss": 0.77407986, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.8519237, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19763184, + "step": 4527, + "time_per_iteration": 2.5017969608306885 + }, + { + "auxiliary_loss_clip": 0.06502572, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.0629287, + "balance_loss_mlp": 0.01260194, + "epoch": 0.27223808808056515, + "flos": 16258774544640.0, + "grad_norm": 2.1231016049423608, + "language_loss": 0.82676923, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90457952, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18249512, + "step": 4528, + "time_per_iteration": 2.5011186599731445 + }, + { + "auxiliary_loss_clip": 0.06500327, + "auxiliary_loss_mlp": 0.012781, + "balance_loss_clip": 0.06293638, + "balance_loss_mlp": 0.01260064, + "epoch": 0.2722982113332331, + "flos": 27753225788160.0, + "grad_norm": 1.6573852241711216, + "language_loss": 0.77553773, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85332191, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18041992, + "step": 4529, + "time_per_iteration": 2.5810396671295166 + }, + { + "auxiliary_loss_clip": 0.06499013, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01259055, + "epoch": 0.2723583345859011, + "flos": 21732756001920.0, + "grad_norm": 2.1115027178358354, + "language_loss": 0.82665265, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.90441489, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18164062, + "step": 4530, + "time_per_iteration": 2.586454391479492 + }, + { + "auxiliary_loss_clip": 0.06502904, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06295159, + "balance_loss_mlp": 0.0126379, + "epoch": 0.27241845783856905, + "flos": 17352925159680.0, + "grad_norm": 2.154635693147181, + "language_loss": 0.92694783, + "learning_rate": 3.4145577592184838e-06, + "loss": 1.0048002, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18530273, + "step": 4531, + "time_per_iteration": 2.5160703659057617 + }, + { + "auxiliary_loss_clip": 0.06501545, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01257928, + "epoch": 0.272478581091237, + "flos": 24761278448640.0, + "grad_norm": 1.903467624841223, + "language_loss": 0.76781744, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84559143, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17919922, + "step": 4532, + "time_per_iteration": 2.568319082260132 + }, + { + "auxiliary_loss_clip": 0.06500092, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.0125448, + "epoch": 0.272538704343905, + "flos": 17895723909120.0, + "grad_norm": 2.5230523304945685, + "language_loss": 0.89717656, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97489792, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17565918, + "step": 4533, + "time_per_iteration": 2.538637399673462 + }, + { + "auxiliary_loss_clip": 0.06497633, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06294405, + "balance_loss_mlp": 0.01255559, + "epoch": 0.272598827596573, + "flos": 22939021779840.0, + "grad_norm": 1.9282389689502992, + "language_loss": 0.72213519, + "learning_rate": 3.413731546022929e-06, + "loss": 0.79983306, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16589355, + "step": 4534, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.06500763, + "auxiliary_loss_mlp": 0.01275564, + "balance_loss_clip": 0.06290451, + "balance_loss_mlp": 0.01255847, + "epoch": 0.27265895084924097, + "flos": 24244447265280.0, + "grad_norm": 1.8514773269853142, + "language_loss": 0.91784394, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99560714, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.19702148, + "step": 4535, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.06506651, + "auxiliary_loss_mlp": 0.01276542, + "balance_loss_clip": 0.06297188, + "balance_loss_mlp": 0.01258768, + "epoch": 0.27271907410190893, + "flos": 27019962708480.0, + "grad_norm": 1.7799258806344853, + "language_loss": 0.73195565, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80978757, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.17773438, + "step": 4536, + "time_per_iteration": 2.5590782165527344 + }, + { + "auxiliary_loss_clip": 0.06502935, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.0629502, + "balance_loss_mlp": 0.01257351, + "epoch": 0.2727791973545769, + "flos": 34460027568000.0, + "grad_norm": 1.8462150885541477, + "language_loss": 0.72167033, + "learning_rate": 3.41290485034781e-06, + "loss": 0.79945225, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17907715, + "step": 4537, + "time_per_iteration": 2.680515766143799 + }, + { + "auxiliary_loss_clip": 0.06501988, + "auxiliary_loss_mlp": 0.01276469, + "balance_loss_clip": 0.06293489, + "balance_loss_mlp": 0.0125829, + "epoch": 0.27283932060724486, + "flos": 15045842367360.0, + "grad_norm": 2.3888098238231503, + "language_loss": 0.78421736, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8620019, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.1817627, + "step": 4538, + "time_per_iteration": 2.4626059532165527 + }, + { + "auxiliary_loss_clip": 0.06506806, + "auxiliary_loss_mlp": 0.01275863, + "balance_loss_clip": 0.06298484, + "balance_loss_mlp": 0.01258566, + "epoch": 0.2728994438599128, + "flos": 21658767246720.0, + "grad_norm": 1.6357140094020364, + "language_loss": 0.90640903, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9842357, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17297363, + "step": 4539, + "time_per_iteration": 2.5629584789276123 + }, + { + "auxiliary_loss_clip": 0.06501281, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06294584, + "balance_loss_mlp": 0.01253778, + "epoch": 0.2729595671125808, + "flos": 17493313875840.0, + "grad_norm": 1.7229738452441967, + "language_loss": 0.88610893, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.96385098, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.19140625, + "step": 4540, + "time_per_iteration": 2.4959304332733154 + }, + { + "auxiliary_loss_clip": 0.06504017, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06294081, + "balance_loss_mlp": 0.0125744, + "epoch": 0.27301969036524876, + "flos": 19324249441920.0, + "grad_norm": 2.2191409784662, + "language_loss": 0.8242712, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.17053223, + "step": 4541, + "time_per_iteration": 2.550239086151123 + }, + { + "auxiliary_loss_clip": 0.06500127, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06291916, + "balance_loss_mlp": 0.01260431, + "epoch": 0.2730798136179167, + "flos": 21071427252480.0, + "grad_norm": 2.3060281935178795, + "language_loss": 0.80131608, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18664551, + "step": 4542, + "time_per_iteration": 2.519717216491699 + }, + { + "auxiliary_loss_clip": 0.06509651, + "auxiliary_loss_mlp": 0.01276731, + "balance_loss_clip": 0.06301565, + "balance_loss_mlp": 0.01258599, + "epoch": 0.2731399368705847, + "flos": 19177739377920.0, + "grad_norm": 1.9524817452008785, + "language_loss": 0.89606124, + "learning_rate": 3.411250012687582e-06, + "loss": 0.97392499, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18139648, + "step": 4543, + "time_per_iteration": 2.5182156562805176 + }, + { + "auxiliary_loss_clip": 0.06509942, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06297313, + "balance_loss_mlp": 0.012604, + "epoch": 0.27320006012325265, + "flos": 18294989414400.0, + "grad_norm": 2.101118642115193, + "language_loss": 0.64112943, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7190212, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.18823242, + "step": 4544, + "time_per_iteration": 2.482348918914795 + }, + { + "auxiliary_loss_clip": 0.06504791, + "auxiliary_loss_mlp": 0.01282982, + "balance_loss_clip": 0.06296986, + "balance_loss_mlp": 0.01264231, + "epoch": 0.2732601833759206, + "flos": 34869607125120.0, + "grad_norm": 1.6845842729353224, + "language_loss": 0.70290005, + "learning_rate": 3.410697971904651e-06, + "loss": 0.78077781, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.1875, + "step": 4545, + "time_per_iteration": 2.6779940128326416 + }, + { + "auxiliary_loss_clip": 0.06375119, + "auxiliary_loss_mlp": 0.01256033, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01252296, + "epoch": 0.2733203066285886, + "flos": 53929514534400.0, + "grad_norm": 0.7176798913576009, + "language_loss": 0.61676908, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6930806, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03729248, + "step": 4546, + "time_per_iteration": 3.1508243083953857 + }, + { + "auxiliary_loss_clip": 0.06510071, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06301852, + "balance_loss_mlp": 0.01258843, + "epoch": 0.2733804298812566, + "flos": 20665411493760.0, + "grad_norm": 1.9095347334938924, + "language_loss": 0.65170372, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72958136, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.1887207, + "step": 4547, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.06498976, + "auxiliary_loss_mlp": 0.0127425, + "balance_loss_clip": 0.06296893, + "balance_loss_mlp": 0.01257799, + "epoch": 0.27344055313392457, + "flos": 25891333338240.0, + "grad_norm": 2.438857151480637, + "language_loss": 0.78365928, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.86139154, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.16455078, + "step": 4548, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.0650417, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06295689, + "balance_loss_mlp": 0.01259785, + "epoch": 0.27350067638659253, + "flos": 22936380376320.0, + "grad_norm": 2.3129649243249157, + "language_loss": 0.83350241, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91131258, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17053223, + "step": 4549, + "time_per_iteration": 2.560349941253662 + }, + { + "auxiliary_loss_clip": 0.06503863, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06292209, + "balance_loss_mlp": 0.0125707, + "epoch": 0.2735607996392605, + "flos": 16579313539200.0, + "grad_norm": 2.1355332193902568, + "language_loss": 0.71687186, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.79468852, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.20727539, + "step": 4550, + "time_per_iteration": 2.4829771518707275 + }, + { + "auxiliary_loss_clip": 0.06503724, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06298332, + "balance_loss_mlp": 0.01253435, + "epoch": 0.27362092289192846, + "flos": 19651245200640.0, + "grad_norm": 2.4590448673698546, + "language_loss": 0.79561722, + "learning_rate": 3.409040566039563e-06, + "loss": 0.87337267, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.18371582, + "step": 4551, + "time_per_iteration": 2.5074269771575928 + }, + { + "auxiliary_loss_clip": 0.06500211, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 0.06290769, + "balance_loss_mlp": 0.01263565, + "epoch": 0.27368104614459643, + "flos": 17644855184640.0, + "grad_norm": 2.2858009613836465, + "language_loss": 0.71362597, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.79144663, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.18286133, + "step": 4552, + "time_per_iteration": 2.478208541870117 + }, + { + "auxiliary_loss_clip": 0.0650662, + "auxiliary_loss_mlp": 0.01277463, + "balance_loss_clip": 0.06295393, + "balance_loss_mlp": 0.01258759, + "epoch": 0.2737411693972644, + "flos": 21586455573120.0, + "grad_norm": 1.8660820035104149, + "language_loss": 0.71756262, + "learning_rate": 3.408487669858431e-06, + "loss": 0.79540348, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18701172, + "step": 4553, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.0650337, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06293483, + "balance_loss_mlp": 0.01255738, + "epoch": 0.27380129264993236, + "flos": 25491145438080.0, + "grad_norm": 1.7561499880950933, + "language_loss": 0.60065031, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.67843306, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.19177246, + "step": 4554, + "time_per_iteration": 2.5836522579193115 + }, + { + "auxiliary_loss_clip": 0.06509934, + "auxiliary_loss_mlp": 0.01281174, + "balance_loss_clip": 0.06291255, + "balance_loss_mlp": 0.01261838, + "epoch": 0.2738614159026003, + "flos": 18667155323520.0, + "grad_norm": 1.5632450212680145, + "language_loss": 0.74850649, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.82641757, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1932373, + "step": 4555, + "time_per_iteration": 3.9590039253234863 + }, + { + "auxiliary_loss_clip": 0.06511028, + "auxiliary_loss_mlp": 0.01279514, + "balance_loss_clip": 0.0629926, + "balance_loss_mlp": 0.0125982, + "epoch": 0.2739215391552683, + "flos": 23483874954240.0, + "grad_norm": 6.994475758797384, + "language_loss": 0.7822473, + "learning_rate": 3.407657925038002e-06, + "loss": 0.86015272, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19677734, + "step": 4556, + "time_per_iteration": 2.5688674449920654 + }, + { + "auxiliary_loss_clip": 0.06517123, + "auxiliary_loss_mlp": 0.01280796, + "balance_loss_clip": 0.06293104, + "balance_loss_mlp": 0.01260125, + "epoch": 0.27398166240793626, + "flos": 17134313057280.0, + "grad_norm": 1.8677949115203087, + "language_loss": 0.83077759, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.90875673, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.20690918, + "step": 4557, + "time_per_iteration": 2.490562915802002 + }, + { + "auxiliary_loss_clip": 0.06504503, + "auxiliary_loss_mlp": 0.01276773, + "balance_loss_clip": 0.06292793, + "balance_loss_mlp": 0.01256292, + "epoch": 0.2740417856606042, + "flos": 23411563280640.0, + "grad_norm": 1.9738441909854203, + "language_loss": 0.73066616, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.80847895, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.20483398, + "step": 4558, + "time_per_iteration": 2.5761232376098633 + }, + { + "auxiliary_loss_clip": 0.06504066, + "auxiliary_loss_mlp": 0.01276845, + "balance_loss_clip": 0.06292865, + "balance_loss_mlp": 0.01256651, + "epoch": 0.2741019089132722, + "flos": 12784307068800.0, + "grad_norm": 2.149984670873407, + "language_loss": 0.68751299, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76532209, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.2019043, + "step": 4559, + "time_per_iteration": 2.4976439476013184 + }, + { + "auxiliary_loss_clip": 0.06501673, + "auxiliary_loss_mlp": 0.01278249, + "balance_loss_clip": 0.0629222, + "balance_loss_mlp": 0.01259676, + "epoch": 0.27416203216594015, + "flos": 20637850700160.0, + "grad_norm": 1.7403202614473876, + "language_loss": 0.72741163, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.80521083, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18566895, + "step": 4560, + "time_per_iteration": 4.005557537078857 + }, + { + "auxiliary_loss_clip": 0.06501405, + "auxiliary_loss_mlp": 0.01278052, + "balance_loss_clip": 0.06289977, + "balance_loss_mlp": 0.01259718, + "epoch": 0.27422215541860817, + "flos": 26548762872960.0, + "grad_norm": 1.7791790627265829, + "language_loss": 0.82245278, + "learning_rate": 3.406273949573303e-06, + "loss": 0.90024734, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18334961, + "step": 4561, + "time_per_iteration": 4.059048652648926 + }, + { + "auxiliary_loss_clip": 0.06510133, + "auxiliary_loss_mlp": 0.01276094, + "balance_loss_clip": 0.06296331, + "balance_loss_mlp": 0.012564, + "epoch": 0.27428227867127614, + "flos": 23337868014720.0, + "grad_norm": 1.9098162884662422, + "language_loss": 0.75760031, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83546257, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19702148, + "step": 4562, + "time_per_iteration": 2.558397054672241 + }, + { + "auxiliary_loss_clip": 0.06506505, + "auxiliary_loss_mlp": 0.01277189, + "balance_loss_clip": 0.06293164, + "balance_loss_mlp": 0.01258092, + "epoch": 0.2743424019239441, + "flos": 23041074453120.0, + "grad_norm": 1.577834756327151, + "language_loss": 0.75198597, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.8298229, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19091797, + "step": 4563, + "time_per_iteration": 2.5698354244232178 + }, + { + "auxiliary_loss_clip": 0.06524341, + "auxiliary_loss_mlp": 0.01283879, + "balance_loss_clip": 0.06305183, + "balance_loss_mlp": 0.01262302, + "epoch": 0.27440252517661207, + "flos": 21987565868160.0, + "grad_norm": 2.0193615345580085, + "language_loss": 0.6348893, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.71297145, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21569824, + "step": 4564, + "time_per_iteration": 2.545741558074951 + }, + { + "auxiliary_loss_clip": 0.06513885, + "auxiliary_loss_mlp": 0.01280066, + "balance_loss_clip": 0.06299828, + "balance_loss_mlp": 0.01260647, + "epoch": 0.27446264842928003, + "flos": 40196952737280.0, + "grad_norm": 2.2005709679787153, + "language_loss": 0.7878077, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.86574721, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 4565, + "time_per_iteration": 2.7061169147491455 + }, + { + "auxiliary_loss_clip": 0.0650921, + "auxiliary_loss_mlp": 0.01277346, + "balance_loss_clip": 0.06296623, + "balance_loss_mlp": 0.01257903, + "epoch": 0.274522771681948, + "flos": 13484684620800.0, + "grad_norm": 1.9604173340299715, + "language_loss": 0.69729757, + "learning_rate": 3.404888640957477e-06, + "loss": 0.77516317, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19458008, + "step": 4566, + "time_per_iteration": 3.9156126976013184 + }, + { + "auxiliary_loss_clip": 0.06511474, + "auxiliary_loss_mlp": 0.0128161, + "balance_loss_clip": 0.06300822, + "balance_loss_mlp": 0.0126318, + "epoch": 0.27458289493461596, + "flos": 28629812476800.0, + "grad_norm": 1.605297231279352, + "language_loss": 0.61699307, + "learning_rate": 3.404611419371723e-06, + "loss": 0.69492388, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18432617, + "step": 4567, + "time_per_iteration": 2.5721306800842285 + }, + { + "auxiliary_loss_clip": 0.06514515, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06299441, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2746430181872839, + "flos": 20125883053440.0, + "grad_norm": 1.9422441687055725, + "language_loss": 0.83055782, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.90845764, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19970703, + "step": 4568, + "time_per_iteration": 2.5616700649261475 + }, + { + "auxiliary_loss_clip": 0.06521738, + "auxiliary_loss_mlp": 0.01275653, + "balance_loss_clip": 0.06304733, + "balance_loss_mlp": 0.01255709, + "epoch": 0.2747031414399519, + "flos": 20199662173440.0, + "grad_norm": 2.1285143693034367, + "language_loss": 0.6896143, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.76758814, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19934082, + "step": 4569, + "time_per_iteration": 2.531096935272217 + }, + { + "auxiliary_loss_clip": 0.06517979, + "auxiliary_loss_mlp": 0.01281496, + "balance_loss_clip": 0.06303072, + "balance_loss_mlp": 0.0126216, + "epoch": 0.27476326469261986, + "flos": 13521385509120.0, + "grad_norm": 2.4613635331126926, + "language_loss": 0.71897286, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.79696763, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19360352, + "step": 4570, + "time_per_iteration": 2.5235774517059326 + }, + { + "auxiliary_loss_clip": 0.06414898, + "auxiliary_loss_mlp": 0.01257276, + "balance_loss_clip": 0.06312878, + "balance_loss_mlp": 0.01253897, + "epoch": 0.2748233879452878, + "flos": 65955486153600.0, + "grad_norm": 0.6977768363268191, + "language_loss": 0.5577414, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.63446319, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03387451, + "step": 4571, + "time_per_iteration": 3.234433889389038 + }, + { + "auxiliary_loss_clip": 0.06526154, + "auxiliary_loss_mlp": 0.01279423, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01260326, + "epoch": 0.2748835111979558, + "flos": 17389961464320.0, + "grad_norm": 2.165338105639142, + "language_loss": 0.78105313, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.85910892, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19104004, + "step": 4572, + "time_per_iteration": 2.562450647354126 + }, + { + "auxiliary_loss_clip": 0.06506811, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01261711, + "epoch": 0.27494363445062375, + "flos": 23594480743680.0, + "grad_norm": 2.0912194071895014, + "language_loss": 0.81855798, + "learning_rate": 3.402946971702147e-06, + "loss": 0.89641118, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.16809082, + "step": 4573, + "time_per_iteration": 2.575467824935913 + }, + { + "auxiliary_loss_clip": 0.06512269, + "auxiliary_loss_mlp": 0.01277933, + "balance_loss_clip": 0.06303579, + "balance_loss_mlp": 0.01258585, + "epoch": 0.2750037577032918, + "flos": 17170175404800.0, + "grad_norm": 1.5550185346959569, + "language_loss": 0.79688454, + "learning_rate": 3.402669377496223e-06, + "loss": 0.87478662, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19360352, + "step": 4574, + "time_per_iteration": 2.522381067276001 + }, + { + "auxiliary_loss_clip": 0.06514049, + "auxiliary_loss_mlp": 0.012813, + "balance_loss_clip": 0.06300252, + "balance_loss_mlp": 0.01263383, + "epoch": 0.27506388095595974, + "flos": 24497663904000.0, + "grad_norm": 1.9638366231768782, + "language_loss": 0.75217533, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83012879, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.17907715, + "step": 4575, + "time_per_iteration": 2.564023971557617 + }, + { + "auxiliary_loss_clip": 0.06513455, + "auxiliary_loss_mlp": 0.01285217, + "balance_loss_clip": 0.06304657, + "balance_loss_mlp": 0.01267562, + "epoch": 0.2751240042086277, + "flos": 38774003500800.0, + "grad_norm": 1.5894976166299741, + "language_loss": 0.71788073, + "learning_rate": 3.402114029526814e-06, + "loss": 0.79586744, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17663574, + "step": 4576, + "time_per_iteration": 2.6856141090393066 + }, + { + "auxiliary_loss_clip": 0.06515673, + "auxiliary_loss_mlp": 0.01294199, + "balance_loss_clip": 0.06304252, + "balance_loss_mlp": 0.0127447, + "epoch": 0.27518412746129567, + "flos": 26914388163840.0, + "grad_norm": 1.693116107866749, + "language_loss": 0.73358452, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81168324, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19726562, + "step": 4577, + "time_per_iteration": 2.5795719623565674 + }, + { + "auxiliary_loss_clip": 0.06517484, + "auxiliary_loss_mlp": 0.01279945, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01260514, + "epoch": 0.27524425071396363, + "flos": 24907578877440.0, + "grad_norm": 1.9498672791378742, + "language_loss": 0.76234132, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84031564, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19433594, + "step": 4578, + "time_per_iteration": 2.5547378063201904 + }, + { + "auxiliary_loss_clip": 0.06518476, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01265255, + "epoch": 0.2753043739666316, + "flos": 26295504307200.0, + "grad_norm": 1.3718100748583155, + "language_loss": 0.66504484, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.74309289, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21069336, + "step": 4579, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.06516613, + "auxiliary_loss_mlp": 0.01291851, + "balance_loss_clip": 0.06301446, + "balance_loss_mlp": 0.01271753, + "epoch": 0.27536449721929956, + "flos": 24213616162560.0, + "grad_norm": 3.1986582184359853, + "language_loss": 0.80722374, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88530838, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2010498, + "step": 4580, + "time_per_iteration": 2.571364164352417 + }, + { + "auxiliary_loss_clip": 0.06513728, + "auxiliary_loss_mlp": 0.01285107, + "balance_loss_clip": 0.06304168, + "balance_loss_mlp": 0.01264305, + "epoch": 0.27542462047196753, + "flos": 19543448522880.0, + "grad_norm": 1.580662182314359, + "language_loss": 0.68234229, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76033062, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.20788574, + "step": 4581, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.06515522, + "auxiliary_loss_mlp": 0.01276377, + "balance_loss_clip": 0.06298342, + "balance_loss_mlp": 0.01258448, + "epoch": 0.2754847437246355, + "flos": 14324360785920.0, + "grad_norm": 1.5474830525473977, + "language_loss": 0.78408682, + "learning_rate": 3.400446709916392e-06, + "loss": 0.86200583, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17919922, + "step": 4582, + "time_per_iteration": 2.511134624481201 + }, + { + "auxiliary_loss_clip": 0.06505451, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06298563, + "balance_loss_mlp": 0.01266605, + "epoch": 0.27554486697730346, + "flos": 18843951438720.0, + "grad_norm": 1.627014419094476, + "language_loss": 0.84829235, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92618936, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17663574, + "step": 4583, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.065156, + "auxiliary_loss_mlp": 0.01295136, + "balance_loss_clip": 0.0629985, + "balance_loss_mlp": 0.01274799, + "epoch": 0.2756049902299714, + "flos": 22388801944320.0, + "grad_norm": 2.5216327683147104, + "language_loss": 0.67592049, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.75402784, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20349121, + "step": 4584, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.06508277, + "auxiliary_loss_mlp": 0.01286302, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01268385, + "epoch": 0.2756651134826394, + "flos": 19580107484160.0, + "grad_norm": 1.7056038485870715, + "language_loss": 0.77640843, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8543542, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17919922, + "step": 4585, + "time_per_iteration": 2.5581910610198975 + }, + { + "auxiliary_loss_clip": 0.06520131, + "auxiliary_loss_mlp": 0.01290999, + "balance_loss_clip": 0.06302814, + "balance_loss_mlp": 0.01271151, + "epoch": 0.27572523673530736, + "flos": 23593306786560.0, + "grad_norm": 1.6012607614221503, + "language_loss": 0.72652835, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8046397, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.1986084, + "step": 4586, + "time_per_iteration": 2.5581955909729004 + }, + { + "auxiliary_loss_clip": 0.06512299, + "auxiliary_loss_mlp": 0.01283131, + "balance_loss_clip": 0.06300563, + "balance_loss_mlp": 0.01264475, + "epoch": 0.2757853599879754, + "flos": 22826696981760.0, + "grad_norm": 1.4211606049909042, + "language_loss": 0.8102116, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88816595, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18664551, + "step": 4587, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.0651072, + "auxiliary_loss_mlp": 0.01292397, + "balance_loss_clip": 0.06300361, + "balance_loss_mlp": 0.01273037, + "epoch": 0.27584548324064334, + "flos": 18557639637120.0, + "grad_norm": 2.3677019636161716, + "language_loss": 0.83699477, + "learning_rate": 3.398777478523316e-06, + "loss": 0.91502589, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.19348145, + "step": 4588, + "time_per_iteration": 2.5100526809692383 + }, + { + "auxiliary_loss_clip": 0.06502403, + "auxiliary_loss_mlp": 0.01287014, + "balance_loss_clip": 0.06294176, + "balance_loss_mlp": 0.0126856, + "epoch": 0.2759056064933113, + "flos": 23776811228160.0, + "grad_norm": 1.8520309888563375, + "language_loss": 0.76066566, + "learning_rate": 3.398499087583342e-06, + "loss": 0.83855987, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.18457031, + "step": 4589, + "time_per_iteration": 2.5906028747558594 + }, + { + "auxiliary_loss_clip": 0.06503198, + "auxiliary_loss_mlp": 0.01281135, + "balance_loss_clip": 0.06293473, + "balance_loss_mlp": 0.01261703, + "epoch": 0.27596572974597927, + "flos": 24289114291200.0, + "grad_norm": 1.7619688929899446, + "language_loss": 0.88857687, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96642017, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19421387, + "step": 4590, + "time_per_iteration": 2.5526933670043945 + }, + { + "auxiliary_loss_clip": 0.0650104, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06291595, + "balance_loss_mlp": 0.01261041, + "epoch": 0.27602585299864724, + "flos": 35049296206080.0, + "grad_norm": 1.573202994920717, + "language_loss": 0.71835011, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79615998, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.18908691, + "step": 4591, + "time_per_iteration": 2.659573554992676 + }, + { + "auxiliary_loss_clip": 0.06502488, + "auxiliary_loss_mlp": 0.01277501, + "balance_loss_clip": 0.06290874, + "balance_loss_mlp": 0.01258964, + "epoch": 0.2760859762513152, + "flos": 24315123784320.0, + "grad_norm": 2.0980893762293866, + "language_loss": 0.80327255, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.8810724, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.18530273, + "step": 4592, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06302959, + "balance_loss_mlp": 0.0126841, + "epoch": 0.27614609950398317, + "flos": 71279435675520.0, + "grad_norm": 0.6848268802880488, + "language_loss": 0.6162945, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.69306767, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03717041, + "step": 4593, + "time_per_iteration": 3.127192735671997 + }, + { + "auxiliary_loss_clip": 0.06506699, + "auxiliary_loss_mlp": 0.01276217, + "balance_loss_clip": 0.0629646, + "balance_loss_mlp": 0.01256881, + "epoch": 0.27620622275665113, + "flos": 29681811688320.0, + "grad_norm": 2.6081053554454363, + "language_loss": 0.77380788, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.85163713, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1932373, + "step": 4594, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06503148, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06295307, + "balance_loss_mlp": 0.01255138, + "epoch": 0.2762663460093191, + "flos": 15383571448320.0, + "grad_norm": 1.4453472339612206, + "language_loss": 0.9229176, + "learning_rate": 3.3968276286573866e-06, + "loss": 1.00068069, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18029785, + "step": 4595, + "time_per_iteration": 3.9466536045074463 + }, + { + "auxiliary_loss_clip": 0.06509015, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.06294905, + "balance_loss_mlp": 0.01261592, + "epoch": 0.27632646926198706, + "flos": 20710330081920.0, + "grad_norm": 1.8151181533722092, + "language_loss": 0.69491673, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.77282476, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2019043, + "step": 4596, + "time_per_iteration": 2.552893877029419 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.0629788, + "balance_loss_mlp": 0.0125382, + "epoch": 0.276386592514655, + "flos": 32820981851520.0, + "grad_norm": 1.6734752779014743, + "language_loss": 0.64091378, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.71881258, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.18652344, + "step": 4597, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.06500123, + "auxiliary_loss_mlp": 0.01279427, + "balance_loss_clip": 0.0629456, + "balance_loss_mlp": 0.01260616, + "epoch": 0.276446715767323, + "flos": 18557639637120.0, + "grad_norm": 1.8925825739150304, + "language_loss": 0.86690855, + "learning_rate": 3.395991183985887e-06, + "loss": 0.94470406, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18835449, + "step": 4598, + "time_per_iteration": 2.5411598682403564 + }, + { + "auxiliary_loss_clip": 0.0650408, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.06291056, + "balance_loss_mlp": 0.01256554, + "epoch": 0.27650683901999096, + "flos": 22826110003200.0, + "grad_norm": 2.378506410601605, + "language_loss": 0.79588032, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8736738, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18725586, + "step": 4599, + "time_per_iteration": 2.515411138534546 + }, + { + "auxiliary_loss_clip": 0.06518425, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06301137, + "balance_loss_mlp": 0.01259756, + "epoch": 0.276566962272659, + "flos": 21368011178880.0, + "grad_norm": 2.1602669865212487, + "language_loss": 0.80043805, + "learning_rate": 3.395433289506639e-06, + "loss": 0.87841463, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.19482422, + "step": 4600, + "time_per_iteration": 5.317862033843994 + }, + { + "auxiliary_loss_clip": 0.06511359, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06296661, + "balance_loss_mlp": 0.01258843, + "epoch": 0.27662708552532694, + "flos": 17716076755200.0, + "grad_norm": 12.932121146702709, + "language_loss": 0.73461431, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.81249541, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.17907715, + "step": 4601, + "time_per_iteration": 2.5192854404449463 + }, + { + "auxiliary_loss_clip": 0.0650773, + "auxiliary_loss_mlp": 0.01282643, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01263676, + "epoch": 0.2766872087779949, + "flos": 21259292106240.0, + "grad_norm": 1.833059055741047, + "language_loss": 0.8051585, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.88306224, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18981934, + "step": 4602, + "time_per_iteration": 2.635265350341797 + }, + { + "auxiliary_loss_clip": 0.06517955, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06297721, + "balance_loss_mlp": 0.01259749, + "epoch": 0.2767473320306629, + "flos": 12936728845440.0, + "grad_norm": 2.082735068257359, + "language_loss": 0.7691201, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.8470962, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.19921875, + "step": 4603, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.06506386, + "auxiliary_loss_mlp": 0.01276601, + "balance_loss_clip": 0.06300791, + "balance_loss_mlp": 0.01259017, + "epoch": 0.27680745528333084, + "flos": 15018239646720.0, + "grad_norm": 1.5173997695974415, + "language_loss": 0.81704807, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89487797, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17578125, + "step": 4604, + "time_per_iteration": 2.5022366046905518 + }, + { + "auxiliary_loss_clip": 0.06510165, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 0.06295862, + "balance_loss_mlp": 0.01261367, + "epoch": 0.2768675785359988, + "flos": 22644408424320.0, + "grad_norm": 1.8407701121062605, + "language_loss": 0.70736969, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.78526795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.18310547, + "step": 4605, + "time_per_iteration": 4.068409442901611 + }, + { + "auxiliary_loss_clip": 0.06402105, + "auxiliary_loss_mlp": 0.01269906, + "balance_loss_clip": 0.0629937, + "balance_loss_mlp": 0.01266097, + "epoch": 0.27692770178866677, + "flos": 66150772093440.0, + "grad_norm": 0.7075303746126435, + "language_loss": 0.57218695, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.64890707, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.0380249, + "step": 4606, + "time_per_iteration": 3.269275426864624 + }, + { + "auxiliary_loss_clip": 0.06516754, + "auxiliary_loss_mlp": 0.01286288, + "balance_loss_clip": 0.06299627, + "balance_loss_mlp": 0.01266118, + "epoch": 0.27698782504133473, + "flos": 26471545735680.0, + "grad_norm": 1.9632725808751148, + "language_loss": 0.69427574, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77230614, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20153809, + "step": 4607, + "time_per_iteration": 2.566908836364746 + }, + { + "auxiliary_loss_clip": 0.06512889, + "auxiliary_loss_mlp": 0.01276778, + "balance_loss_clip": 0.06304939, + "balance_loss_mlp": 0.01258849, + "epoch": 0.2770479482940027, + "flos": 25891878389760.0, + "grad_norm": 1.6636880421304368, + "language_loss": 0.70338356, + "learning_rate": 3.393199595837555e-06, + "loss": 0.78128028, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17919922, + "step": 4608, + "time_per_iteration": 2.709989309310913 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 0.06298438, + "balance_loss_mlp": 0.01260781, + "epoch": 0.27710807154667066, + "flos": 22863942921600.0, + "grad_norm": 1.8326330841759049, + "language_loss": 0.73323762, + "learning_rate": 3.392920146281499e-06, + "loss": 0.81118023, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.18725586, + "step": 4609, + "time_per_iteration": 2.530625581741333 + }, + { + "auxiliary_loss_clip": 0.06522895, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.0125749, + "epoch": 0.27716819479933863, + "flos": 17716621806720.0, + "grad_norm": 2.1915868475112714, + "language_loss": 0.84688777, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92488557, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19396973, + "step": 4610, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.06521606, + "auxiliary_loss_mlp": 0.01280928, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260054, + "epoch": 0.2772283180520066, + "flos": 19652125668480.0, + "grad_norm": 1.9738462991775114, + "language_loss": 0.69718874, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.77521408, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20874023, + "step": 4611, + "time_per_iteration": 2.5499660968780518 + }, + { + "auxiliary_loss_clip": 0.0651576, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06309414, + "balance_loss_mlp": 0.01254997, + "epoch": 0.27728844130467456, + "flos": 21038960995200.0, + "grad_norm": 1.8677227151172762, + "language_loss": 0.74507141, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82296044, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18151855, + "step": 4612, + "time_per_iteration": 2.567218065261841 + }, + { + "auxiliary_loss_clip": 0.06522087, + "auxiliary_loss_mlp": 0.01282319, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01263067, + "epoch": 0.2773485645573425, + "flos": 18995157331200.0, + "grad_norm": 2.3882423035535063, + "language_loss": 0.67084455, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.19250488, + "step": 4613, + "time_per_iteration": 2.5458126068115234 + }, + { + "auxiliary_loss_clip": 0.06515062, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06304698, + "balance_loss_mlp": 0.0125577, + "epoch": 0.27740868781001055, + "flos": 21474508118400.0, + "grad_norm": 1.6100748666203144, + "language_loss": 0.79936564, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87727129, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19750977, + "step": 4614, + "time_per_iteration": 2.5586962699890137 + }, + { + "auxiliary_loss_clip": 0.06520429, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01257884, + "epoch": 0.2774688110626785, + "flos": 19833827247360.0, + "grad_norm": 2.249482091575283, + "language_loss": 0.81082475, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.88881981, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21179199, + "step": 4615, + "time_per_iteration": 2.5192136764526367 + }, + { + "auxiliary_loss_clip": 0.0652476, + "auxiliary_loss_mlp": 0.0127518, + "balance_loss_clip": 0.06306368, + "balance_loss_mlp": 0.01256655, + "epoch": 0.2775289343153465, + "flos": 18220916805120.0, + "grad_norm": 2.6879454427381715, + "language_loss": 0.64382082, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.72182024, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.18518066, + "step": 4616, + "time_per_iteration": 2.528766393661499 + }, + { + "auxiliary_loss_clip": 0.06523173, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 0.06308753, + "balance_loss_mlp": 0.0126377, + "epoch": 0.27758905756801444, + "flos": 16478141333760.0, + "grad_norm": 2.0768832102625296, + "language_loss": 0.82857239, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.90664852, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.20678711, + "step": 4617, + "time_per_iteration": 2.5130555629730225 + }, + { + "auxiliary_loss_clip": 0.06522305, + "auxiliary_loss_mlp": 0.01278739, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01260059, + "epoch": 0.2776491808206824, + "flos": 18733219868160.0, + "grad_norm": 2.583119020836192, + "language_loss": 0.77338278, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85139322, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18676758, + "step": 4618, + "time_per_iteration": 2.5491156578063965 + }, + { + "auxiliary_loss_clip": 0.06524394, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 0.06309742, + "balance_loss_mlp": 0.01260191, + "epoch": 0.27770930407335037, + "flos": 28045742791680.0, + "grad_norm": 1.764934716544716, + "language_loss": 0.85733759, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93535626, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.17297363, + "step": 4619, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.06514929, + "auxiliary_loss_mlp": 0.01285121, + "balance_loss_clip": 0.06308962, + "balance_loss_mlp": 0.01266798, + "epoch": 0.27776942732601834, + "flos": 23556522044160.0, + "grad_norm": 1.4813387132666624, + "language_loss": 0.77092409, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84892452, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18322754, + "step": 4620, + "time_per_iteration": 2.690934658050537 + }, + { + "auxiliary_loss_clip": 0.0651743, + "auxiliary_loss_mlp": 0.01277569, + "balance_loss_clip": 0.06309397, + "balance_loss_mlp": 0.0125821, + "epoch": 0.2778295505786863, + "flos": 23914474686720.0, + "grad_norm": 1.8907472710416175, + "language_loss": 0.78585863, + "learning_rate": 3.389562634707122e-06, + "loss": 0.86380863, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.19360352, + "step": 4621, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.06522836, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.0630835, + "balance_loss_mlp": 0.01259701, + "epoch": 0.27788967383135427, + "flos": 25561276905600.0, + "grad_norm": 2.170367430288875, + "language_loss": 0.88217753, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96019584, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.1932373, + "step": 4622, + "time_per_iteration": 2.6036407947540283 + }, + { + "auxiliary_loss_clip": 0.06512653, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06299745, + "balance_loss_mlp": 0.01254919, + "epoch": 0.27794979708402223, + "flos": 16258103712000.0, + "grad_norm": 2.5896700244630018, + "language_loss": 0.81515396, + "learning_rate": 3.389002311256369e-06, + "loss": 0.89301395, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18432617, + "step": 4623, + "time_per_iteration": 2.539655923843384 + }, + { + "auxiliary_loss_clip": 0.06518189, + "auxiliary_loss_mlp": 0.01278229, + "balance_loss_clip": 0.06306686, + "balance_loss_mlp": 0.01258941, + "epoch": 0.2780099203366902, + "flos": 20673880755840.0, + "grad_norm": 1.9609752985345037, + "language_loss": 0.82099682, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.89896095, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.19274902, + "step": 4624, + "time_per_iteration": 2.5662107467651367 + }, + { + "auxiliary_loss_clip": 0.06512089, + "auxiliary_loss_mlp": 0.01276338, + "balance_loss_clip": 0.06303106, + "balance_loss_mlp": 0.01258004, + "epoch": 0.27807004358935816, + "flos": 17743805256960.0, + "grad_norm": 3.013190567677447, + "language_loss": 0.77269506, + "learning_rate": 3.388441777121191e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.18322754, + "step": 4625, + "time_per_iteration": 2.5685927867889404 + }, + { + "auxiliary_loss_clip": 0.06507699, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06299223, + "balance_loss_mlp": 0.01253658, + "epoch": 0.2781301668420261, + "flos": 16732699637760.0, + "grad_norm": 1.9769276375727096, + "language_loss": 0.70884871, + "learning_rate": 3.388161431073511e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17883301, + "step": 4626, + "time_per_iteration": 2.527975559234619 + }, + { + "auxiliary_loss_clip": 0.06520554, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06304689, + "balance_loss_mlp": 0.01254798, + "epoch": 0.27819029009469415, + "flos": 13849848714240.0, + "grad_norm": 2.4481240639566013, + "language_loss": 0.93016249, + "learning_rate": 3.38788103238661e-06, + "loss": 1.00810015, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.18432617, + "step": 4627, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.06514014, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06298277, + "balance_loss_mlp": 0.01258364, + "epoch": 0.2782504133473621, + "flos": 27096634794240.0, + "grad_norm": 1.6603793888564844, + "language_loss": 0.85558021, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93348801, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1842041, + "step": 4628, + "time_per_iteration": 2.56680965423584 + }, + { + "auxiliary_loss_clip": 0.06511193, + "auxiliary_loss_mlp": 0.01275379, + "balance_loss_clip": 0.06301076, + "balance_loss_mlp": 0.01257569, + "epoch": 0.2783105366000301, + "flos": 21075116832000.0, + "grad_norm": 1.7183700627805243, + "language_loss": 0.79370463, + "learning_rate": 3.387320077137679e-06, + "loss": 0.87157035, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17810059, + "step": 4629, + "time_per_iteration": 2.579024076461792 + }, + { + "auxiliary_loss_clip": 0.06504764, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06300465, + "balance_loss_mlp": 0.01259699, + "epoch": 0.27837065985269804, + "flos": 26508456259200.0, + "grad_norm": 2.4632649346037856, + "language_loss": 0.84664094, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.92446071, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17529297, + "step": 4630, + "time_per_iteration": 2.568190336227417 + }, + { + "auxiliary_loss_clip": 0.06516108, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 0.06302783, + "balance_loss_mlp": 0.01253395, + "epoch": 0.278430783105366, + "flos": 20228271143040.0, + "grad_norm": 1.8872458968592738, + "language_loss": 0.80858278, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8864556, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17773438, + "step": 4631, + "time_per_iteration": 2.5658912658691406 + }, + { + "auxiliary_loss_clip": 0.06512441, + "auxiliary_loss_mlp": 0.01275522, + "balance_loss_clip": 0.06299636, + "balance_loss_mlp": 0.01256866, + "epoch": 0.278490906358034, + "flos": 25599906437760.0, + "grad_norm": 2.407277572133289, + "language_loss": 0.715128, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.79300761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18652344, + "step": 4632, + "time_per_iteration": 2.620729446411133 + }, + { + "auxiliary_loss_clip": 0.06502309, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296511, + "balance_loss_mlp": 0.01253502, + "epoch": 0.27855102961070194, + "flos": 16175645694720.0, + "grad_norm": 1.8302171024684264, + "language_loss": 0.82394838, + "learning_rate": 3.386197535437145e-06, + "loss": 0.9016794, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17297363, + "step": 4633, + "time_per_iteration": 2.513705015182495 + }, + { + "auxiliary_loss_clip": 0.06511516, + "auxiliary_loss_mlp": 0.01278904, + "balance_loss_clip": 0.06299913, + "balance_loss_mlp": 0.012597, + "epoch": 0.2786111528633699, + "flos": 22933864753920.0, + "grad_norm": 1.5843012688553681, + "language_loss": 0.8872478, + "learning_rate": 3.385916768573529e-06, + "loss": 0.96515197, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19213867, + "step": 4634, + "time_per_iteration": 2.5471088886260986 + }, + { + "auxiliary_loss_clip": 0.06514788, + "auxiliary_loss_mlp": 0.01276007, + "balance_loss_clip": 0.06301814, + "balance_loss_mlp": 0.01256588, + "epoch": 0.27867127611603787, + "flos": 23410934375040.0, + "grad_norm": 1.5369483246730489, + "language_loss": 0.77466059, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85256851, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19433594, + "step": 4635, + "time_per_iteration": 3.9016311168670654 + }, + { + "auxiliary_loss_clip": 0.06508552, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06295648, + "balance_loss_mlp": 0.01254859, + "epoch": 0.27873139936870583, + "flos": 19835210839680.0, + "grad_norm": 1.7801998538005617, + "language_loss": 0.66571766, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74353385, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18188477, + "step": 4636, + "time_per_iteration": 2.5264599323272705 + }, + { + "auxiliary_loss_clip": 0.06519878, + "auxiliary_loss_mlp": 0.01275894, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01256392, + "epoch": 0.2787915226213738, + "flos": 17712638737920.0, + "grad_norm": 2.933733922484583, + "language_loss": 0.83255613, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.91051382, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19506836, + "step": 4637, + "time_per_iteration": 2.5344014167785645 + }, + { + "auxiliary_loss_clip": 0.06505676, + "auxiliary_loss_mlp": 0.01276787, + "balance_loss_clip": 0.06297021, + "balance_loss_mlp": 0.01258918, + "epoch": 0.27885164587404176, + "flos": 22097039627520.0, + "grad_norm": 1.4932909871395708, + "language_loss": 0.76038569, + "learning_rate": 3.384793175684533e-06, + "loss": 0.83821034, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17871094, + "step": 4638, + "time_per_iteration": 2.544187068939209 + }, + { + "auxiliary_loss_clip": 0.06510019, + "auxiliary_loss_mlp": 0.01280274, + "balance_loss_clip": 0.06297282, + "balance_loss_mlp": 0.01262511, + "epoch": 0.27891176912670973, + "flos": 19213601725440.0, + "grad_norm": 2.235877812045319, + "language_loss": 0.72492748, + "learning_rate": 3.38451214615691e-06, + "loss": 0.8028304, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17749023, + "step": 4639, + "time_per_iteration": 4.002680063247681 + }, + { + "auxiliary_loss_clip": 0.06515414, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.01254813, + "epoch": 0.27897189237937775, + "flos": 27607428483840.0, + "grad_norm": 1.8877142592522154, + "language_loss": 0.66217673, + "learning_rate": 3.384231064128447e-06, + "loss": 0.74006808, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.18896484, + "step": 4640, + "time_per_iteration": 4.054874420166016 + }, + { + "auxiliary_loss_clip": 0.0651349, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06301108, + "balance_loss_mlp": 0.01254654, + "epoch": 0.2790320156320457, + "flos": 21184506737280.0, + "grad_norm": 2.077527470737851, + "language_loss": 0.72818768, + "learning_rate": 3.383949929609804e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.1796875, + "step": 4641, + "time_per_iteration": 2.566758155822754 + }, + { + "auxiliary_loss_clip": 0.06517549, + "auxiliary_loss_mlp": 0.01276062, + "balance_loss_clip": 0.06298883, + "balance_loss_mlp": 0.01256488, + "epoch": 0.2790921388847137, + "flos": 22790541144960.0, + "grad_norm": 1.8548696214163785, + "language_loss": 0.75277239, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8307085, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19567871, + "step": 4642, + "time_per_iteration": 2.5531389713287354 + }, + { + "auxiliary_loss_clip": 0.0651103, + "auxiliary_loss_mlp": 0.01281312, + "balance_loss_clip": 0.06296819, + "balance_loss_mlp": 0.01261631, + "epoch": 0.27915226213738165, + "flos": 23406783598080.0, + "grad_norm": 1.8301300365045747, + "language_loss": 0.85787475, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93579817, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19689941, + "step": 4643, + "time_per_iteration": 2.561692714691162 + }, + { + "auxiliary_loss_clip": 0.06505755, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06292956, + "balance_loss_mlp": 0.01262572, + "epoch": 0.2792123853900496, + "flos": 22754469162240.0, + "grad_norm": 2.128449816262669, + "language_loss": 0.83027583, + "learning_rate": 3.383106211219407e-06, + "loss": 0.9081434, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1842041, + "step": 4644, + "time_per_iteration": 2.5298962593078613 + }, + { + "auxiliary_loss_clip": 0.06505448, + "auxiliary_loss_mlp": 0.01273805, + "balance_loss_clip": 0.0629155, + "balance_loss_mlp": 0.01256174, + "epoch": 0.2792725086427176, + "flos": 15054772826880.0, + "grad_norm": 1.7497246062339578, + "language_loss": 0.79546082, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.87325335, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.17626953, + "step": 4645, + "time_per_iteration": 3.9172677993774414 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01254208, + "balance_loss_clip": 0.0631457, + "balance_loss_mlp": 0.0125017, + "epoch": 0.27933263189538554, + "flos": 62562805862400.0, + "grad_norm": 0.7707831229317741, + "language_loss": 0.62136066, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.6980933, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04037476, + "step": 4646, + "time_per_iteration": 3.1527390480041504 + }, + { + "auxiliary_loss_clip": 0.06500821, + "auxiliary_loss_mlp": 0.01275319, + "balance_loss_clip": 0.0629313, + "balance_loss_mlp": 0.01257581, + "epoch": 0.2793927551480535, + "flos": 25125268584960.0, + "grad_norm": 1.6018723981737446, + "language_loss": 0.89582062, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.97358203, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17736816, + "step": 4647, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06509704, + "auxiliary_loss_mlp": 0.01277108, + "balance_loss_clip": 0.06292088, + "balance_loss_mlp": 0.01258142, + "epoch": 0.27945287840072147, + "flos": 21330974874240.0, + "grad_norm": 1.6381839497334347, + "language_loss": 0.87525821, + "learning_rate": 3.381980519149988e-06, + "loss": 0.95312631, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.1895752, + "step": 4648, + "time_per_iteration": 2.5516953468322754 + }, + { + "auxiliary_loss_clip": 0.06507549, + "auxiliary_loss_mlp": 0.01274847, + "balance_loss_clip": 0.06291072, + "balance_loss_mlp": 0.01256643, + "epoch": 0.27951300165338944, + "flos": 27457354621440.0, + "grad_norm": 2.652634800411286, + "language_loss": 0.73020303, + "learning_rate": 3.38169896509385e-06, + "loss": 0.80802703, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18212891, + "step": 4649, + "time_per_iteration": 2.5767719745635986 + }, + { + "auxiliary_loss_clip": 0.06508242, + "auxiliary_loss_mlp": 0.01277361, + "balance_loss_clip": 0.0629622, + "balance_loss_mlp": 0.01259003, + "epoch": 0.2795731249060574, + "flos": 15164456221440.0, + "grad_norm": 2.110277953429804, + "language_loss": 0.81314564, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8910017, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18347168, + "step": 4650, + "time_per_iteration": 2.663588285446167 + }, + { + "auxiliary_loss_clip": 0.06406052, + "auxiliary_loss_mlp": 0.01252705, + "balance_loss_clip": 0.06303374, + "balance_loss_mlp": 0.01248944, + "epoch": 0.27963324815872537, + "flos": 60140951775360.0, + "grad_norm": 0.800089640521837, + "language_loss": 0.5874877, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.66407531, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03753662, + "step": 4651, + "time_per_iteration": 3.205563545227051 + }, + { + "auxiliary_loss_clip": 0.06513405, + "auxiliary_loss_mlp": 0.01276159, + "balance_loss_clip": 0.06293929, + "balance_loss_mlp": 0.01257205, + "epoch": 0.27969337141139333, + "flos": 21773020688640.0, + "grad_norm": 1.70848848544609, + "language_loss": 0.74928713, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82718277, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18945312, + "step": 4652, + "time_per_iteration": 2.620284080505371 + }, + { + "auxiliary_loss_clip": 0.06513481, + "auxiliary_loss_mlp": 0.01277362, + "balance_loss_clip": 0.06297033, + "balance_loss_mlp": 0.01259517, + "epoch": 0.27975349466406135, + "flos": 39859559072640.0, + "grad_norm": 2.257859492249039, + "language_loss": 0.81193566, + "learning_rate": 3.380572225034461e-06, + "loss": 0.88984406, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.17834473, + "step": 4653, + "time_per_iteration": 2.6902103424072266 + }, + { + "auxiliary_loss_clip": 0.06505801, + "auxiliary_loss_mlp": 0.01275903, + "balance_loss_clip": 0.06293398, + "balance_loss_mlp": 0.01257939, + "epoch": 0.2798136179167293, + "flos": 21586204010880.0, + "grad_norm": 2.2005279612587647, + "language_loss": 0.78939915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.86721623, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17956543, + "step": 4654, + "time_per_iteration": 2.5862321853637695 + }, + { + "auxiliary_loss_clip": 0.06514826, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06294681, + "balance_loss_mlp": 0.01256457, + "epoch": 0.2798737411693973, + "flos": 21543130212480.0, + "grad_norm": 2.786817882874951, + "language_loss": 0.81491858, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.89283288, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20153809, + "step": 4655, + "time_per_iteration": 2.5335962772369385 + }, + { + "auxiliary_loss_clip": 0.06503223, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06287771, + "balance_loss_mlp": 0.0125778, + "epoch": 0.27993386442206525, + "flos": 26988586554240.0, + "grad_norm": 1.7572759264995625, + "language_loss": 0.82015479, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.89795309, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4656, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0650457, + "auxiliary_loss_mlp": 0.01280726, + "balance_loss_clip": 0.06291523, + "balance_loss_mlp": 0.01261319, + "epoch": 0.2799939876747332, + "flos": 24356268938880.0, + "grad_norm": 1.602501989097996, + "language_loss": 0.83292782, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.91078079, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19396973, + "step": 4657, + "time_per_iteration": 2.546698808670044 + }, + { + "auxiliary_loss_clip": 0.06501682, + "auxiliary_loss_mlp": 0.01283943, + "balance_loss_clip": 0.06287715, + "balance_loss_mlp": 0.01265847, + "epoch": 0.2800541109274012, + "flos": 33665479626240.0, + "grad_norm": 2.056920585114217, + "language_loss": 0.64474404, + "learning_rate": 3.379162622133105e-06, + "loss": 0.72260022, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18103027, + "step": 4658, + "time_per_iteration": 2.633352041244507 + }, + { + "auxiliary_loss_clip": 0.0650496, + "auxiliary_loss_mlp": 0.01278289, + "balance_loss_clip": 0.06292152, + "balance_loss_mlp": 0.01258298, + "epoch": 0.28011423418006914, + "flos": 21620515057920.0, + "grad_norm": 1.9139831777919125, + "language_loss": 0.78200769, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.85984015, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19995117, + "step": 4659, + "time_per_iteration": 2.5146000385284424 + }, + { + "auxiliary_loss_clip": 0.06512548, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 0.06298335, + "balance_loss_mlp": 0.01260582, + "epoch": 0.2801743574327371, + "flos": 23119130131200.0, + "grad_norm": 1.8180566150817747, + "language_loss": 0.79711032, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.87503254, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.1907959, + "step": 4660, + "time_per_iteration": 2.5558273792266846 + }, + { + "auxiliary_loss_clip": 0.06502102, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06293646, + "balance_loss_mlp": 0.01257732, + "epoch": 0.2802344806854051, + "flos": 12646433975040.0, + "grad_norm": 2.0195446081970685, + "language_loss": 0.8127892, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.89057004, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18237305, + "step": 4661, + "time_per_iteration": 2.475562572479248 + }, + { + "auxiliary_loss_clip": 0.06508808, + "auxiliary_loss_mlp": 0.01277709, + "balance_loss_clip": 0.06296618, + "balance_loss_mlp": 0.01258898, + "epoch": 0.28029460393807304, + "flos": 37276772019840.0, + "grad_norm": 2.0240330571158904, + "language_loss": 0.79226935, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87013447, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18823242, + "step": 4662, + "time_per_iteration": 2.6644277572631836 + }, + { + "auxiliary_loss_clip": 0.06515819, + "auxiliary_loss_mlp": 0.01277387, + "balance_loss_clip": 0.06296565, + "balance_loss_mlp": 0.01258349, + "epoch": 0.280354727190741, + "flos": 20747450240640.0, + "grad_norm": 1.722651872041065, + "language_loss": 0.70744783, + "learning_rate": 3.377751711782227e-06, + "loss": 0.78537989, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.19042969, + "step": 4663, + "time_per_iteration": 2.5365068912506104 + }, + { + "auxiliary_loss_clip": 0.06510712, + "auxiliary_loss_mlp": 0.01280818, + "balance_loss_clip": 0.06293653, + "balance_loss_mlp": 0.01259312, + "epoch": 0.28041485044340897, + "flos": 21477526865280.0, + "grad_norm": 1.8007469711633386, + "language_loss": 0.77919745, + "learning_rate": 3.377469372935791e-06, + "loss": 0.85711277, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.21520996, + "step": 4664, + "time_per_iteration": 2.578552484512329 + }, + { + "auxiliary_loss_clip": 0.06500383, + "auxiliary_loss_mlp": 0.01277041, + "balance_loss_clip": 0.06293675, + "balance_loss_mlp": 0.01259374, + "epoch": 0.28047497369607693, + "flos": 14799669471360.0, + "grad_norm": 1.9758280924180103, + "language_loss": 0.80386382, + "learning_rate": 3.377186981855578e-06, + "loss": 0.88163805, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17675781, + "step": 4665, + "time_per_iteration": 2.5088212490081787 + }, + { + "auxiliary_loss_clip": 0.06506059, + "auxiliary_loss_mlp": 0.01274647, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01257397, + "epoch": 0.2805350969487449, + "flos": 23076559457280.0, + "grad_norm": 2.052054159073397, + "language_loss": 0.81109238, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.88889945, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17236328, + "step": 4666, + "time_per_iteration": 2.5765438079833984 + }, + { + "auxiliary_loss_clip": 0.06505027, + "auxiliary_loss_mlp": 0.01282246, + "balance_loss_clip": 0.0629367, + "balance_loss_mlp": 0.01263149, + "epoch": 0.2805952202014129, + "flos": 20485177361280.0, + "grad_norm": 2.1346617464039395, + "language_loss": 0.84940714, + "learning_rate": 3.376622043036658e-06, + "loss": 0.92727995, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19104004, + "step": 4667, + "time_per_iteration": 2.536466360092163 + }, + { + "auxiliary_loss_clip": 0.06510031, + "auxiliary_loss_mlp": 0.01284991, + "balance_loss_clip": 0.0629562, + "balance_loss_mlp": 0.0126581, + "epoch": 0.2806553434540809, + "flos": 27424678728960.0, + "grad_norm": 1.8168022919289022, + "language_loss": 0.80077279, + "learning_rate": 3.376339495319373e-06, + "loss": 0.87872303, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.19177246, + "step": 4668, + "time_per_iteration": 2.620793581008911 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01279574, + "balance_loss_clip": 0.06290744, + "balance_loss_mlp": 0.0126124, + "epoch": 0.28071546670674885, + "flos": 26512187765760.0, + "grad_norm": 1.3575587104794173, + "language_loss": 0.76748574, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.84536183, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18334961, + "step": 4669, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01281258, + "balance_loss_clip": 0.06298456, + "balance_loss_mlp": 0.01263376, + "epoch": 0.2807755899594168, + "flos": 20564993975040.0, + "grad_norm": 1.8976620486576934, + "language_loss": 0.79953671, + "learning_rate": 3.375774243322725e-06, + "loss": 0.87746012, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17883301, + "step": 4670, + "time_per_iteration": 2.630960702896118 + }, + { + "auxiliary_loss_clip": 0.06512859, + "auxiliary_loss_mlp": 0.0128758, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.0126859, + "epoch": 0.2808357132120848, + "flos": 24319693831680.0, + "grad_norm": 2.1242803821214915, + "language_loss": 0.79548872, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.87349308, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 4671, + "time_per_iteration": 2.5943963527679443 + }, + { + "auxiliary_loss_clip": 0.06499608, + "auxiliary_loss_mlp": 0.01282791, + "balance_loss_clip": 0.06293108, + "balance_loss_mlp": 0.01265124, + "epoch": 0.28089583646475275, + "flos": 26439624529920.0, + "grad_norm": 1.773606658736433, + "language_loss": 0.75789028, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83571434, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17663574, + "step": 4672, + "time_per_iteration": 2.5819919109344482 + }, + { + "auxiliary_loss_clip": 0.06515782, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06299746, + "balance_loss_mlp": 0.01260412, + "epoch": 0.2809559597174207, + "flos": 23118417371520.0, + "grad_norm": 2.723902952009536, + "language_loss": 0.76012361, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.83808959, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20410156, + "step": 4673, + "time_per_iteration": 2.579460859298706 + }, + { + "auxiliary_loss_clip": 0.06510463, + "auxiliary_loss_mlp": 0.01285315, + "balance_loss_clip": 0.06297876, + "balance_loss_mlp": 0.0126704, + "epoch": 0.2810160829700887, + "flos": 20929864579200.0, + "grad_norm": 1.8153863613356214, + "language_loss": 0.72824192, + "learning_rate": 3.374643113381237e-06, + "loss": 0.80619967, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18261719, + "step": 4674, + "time_per_iteration": 4.0586278438568115 + }, + { + "auxiliary_loss_clip": 0.06522093, + "auxiliary_loss_mlp": 0.01283708, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.0126405, + "epoch": 0.28107620622275664, + "flos": 14361145528320.0, + "grad_norm": 1.8954321480679195, + "language_loss": 0.77875817, + "learning_rate": 3.374360200552541e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1965332, + "step": 4675, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.06512761, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06296991, + "balance_loss_mlp": 0.01269531, + "epoch": 0.2811363294754246, + "flos": 20924707553280.0, + "grad_norm": 3.9789590396078784, + "language_loss": 0.70705891, + "learning_rate": 3.374077235607968e-06, + "loss": 0.78507614, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19433594, + "step": 4676, + "time_per_iteration": 2.519028425216675 + }, + { + "auxiliary_loss_clip": 0.06504105, + "auxiliary_loss_mlp": 0.01278874, + "balance_loss_clip": 0.0629884, + "balance_loss_mlp": 0.01260611, + "epoch": 0.28119645272809257, + "flos": 20601107884800.0, + "grad_norm": 1.5779309471284284, + "language_loss": 0.70529211, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.78312188, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18286133, + "step": 4677, + "time_per_iteration": 2.5834195613861084 + }, + { + "auxiliary_loss_clip": 0.06516379, + "auxiliary_loss_mlp": 0.01281791, + "balance_loss_clip": 0.06302937, + "balance_loss_mlp": 0.0126193, + "epoch": 0.28125657598076054, + "flos": 25344383811840.0, + "grad_norm": 1.5021857900224345, + "language_loss": 0.64105308, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.71903479, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1986084, + "step": 4678, + "time_per_iteration": 2.618948221206665 + }, + { + "auxiliary_loss_clip": 0.06517099, + "auxiliary_loss_mlp": 0.01278079, + "balance_loss_clip": 0.06306246, + "balance_loss_mlp": 0.01259947, + "epoch": 0.2813166992334285, + "flos": 24834051319680.0, + "grad_norm": 1.437486997447774, + "language_loss": 0.71167207, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18139648, + "step": 4679, + "time_per_iteration": 5.466668128967285 + }, + { + "auxiliary_loss_clip": 0.06520079, + "auxiliary_loss_mlp": 0.0127734, + "balance_loss_clip": 0.06306013, + "balance_loss_mlp": 0.01257491, + "epoch": 0.2813768224860965, + "flos": 21766941267840.0, + "grad_norm": 1.8819388160659554, + "language_loss": 0.75122017, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.82919437, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19848633, + "step": 4680, + "time_per_iteration": 2.5146636962890625 + }, + { + "auxiliary_loss_clip": 0.06519224, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 0.06307293, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2814369457387645, + "flos": 24323760754560.0, + "grad_norm": 2.4475033368931984, + "language_loss": 0.77670574, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.8546586, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18103027, + "step": 4681, + "time_per_iteration": 2.576263189315796 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06309941, + "balance_loss_mlp": 0.01259208, + "epoch": 0.28149706899143245, + "flos": 18521274165120.0, + "grad_norm": 2.513172937911882, + "language_loss": 0.7420646, + "learning_rate": 3.372378352108146e-06, + "loss": 0.82008791, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18383789, + "step": 4682, + "time_per_iteration": 2.5019047260284424 + }, + { + "auxiliary_loss_clip": 0.06516165, + "auxiliary_loss_mlp": 0.01280522, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01262879, + "epoch": 0.2815571922441004, + "flos": 24870165229440.0, + "grad_norm": 1.4634735151261165, + "language_loss": 0.81619561, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89416242, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17626953, + "step": 4683, + "time_per_iteration": 2.6108040809631348 + }, + { + "auxiliary_loss_clip": 0.06511167, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 0.06297079, + "balance_loss_mlp": 0.01258771, + "epoch": 0.2816173154967684, + "flos": 19907774075520.0, + "grad_norm": 1.6126473409715323, + "language_loss": 0.76514447, + "learning_rate": 3.371811641167852e-06, + "loss": 0.8430298, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18579102, + "step": 4684, + "time_per_iteration": 3.9593515396118164 + }, + { + "auxiliary_loss_clip": 0.06509569, + "auxiliary_loss_mlp": 0.0127644, + "balance_loss_clip": 0.06298888, + "balance_loss_mlp": 0.01257474, + "epoch": 0.28167743874943635, + "flos": 17496709966080.0, + "grad_norm": 1.741664239740996, + "language_loss": 0.76634955, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.84420967, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4685, + "time_per_iteration": 2.533033847808838 + }, + { + "auxiliary_loss_clip": 0.06512235, + "auxiliary_loss_mlp": 0.01277016, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.01258002, + "epoch": 0.2817375620021043, + "flos": 25309276151040.0, + "grad_norm": 1.5379443905684582, + "language_loss": 0.76075816, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.8386507, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19006348, + "step": 4686, + "time_per_iteration": 2.5632452964782715 + }, + { + "auxiliary_loss_clip": 0.0651376, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 0.06298173, + "balance_loss_mlp": 0.01262705, + "epoch": 0.2817976852547723, + "flos": 18698447623680.0, + "grad_norm": 3.4763910689128945, + "language_loss": 0.63974833, + "learning_rate": 3.370961184640025e-06, + "loss": 0.71771336, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.20043945, + "step": 4687, + "time_per_iteration": 2.5520877838134766 + }, + { + "auxiliary_loss_clip": 0.0651626, + "auxiliary_loss_mlp": 0.01278308, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01258889, + "epoch": 0.28185780850744024, + "flos": 22748012398080.0, + "grad_norm": 2.5451270798344208, + "language_loss": 0.76514482, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.84309042, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1940918, + "step": 4688, + "time_per_iteration": 2.5427582263946533 + }, + { + "auxiliary_loss_clip": 0.06506021, + "auxiliary_loss_mlp": 0.01276039, + "balance_loss_clip": 0.06297493, + "balance_loss_mlp": 0.01258622, + "epoch": 0.2819179317601082, + "flos": 14938297251840.0, + "grad_norm": 2.0673048339937394, + "language_loss": 0.79160047, + "learning_rate": 3.37039395366863e-06, + "loss": 0.86942106, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17407227, + "step": 4689, + "time_per_iteration": 2.514857769012451 + }, + { + "auxiliary_loss_clip": 0.06505655, + "auxiliary_loss_mlp": 0.01279731, + "balance_loss_clip": 0.06295724, + "balance_loss_mlp": 0.0126098, + "epoch": 0.2819780550127762, + "flos": 23151428680320.0, + "grad_norm": 2.0480677905828664, + "language_loss": 0.78403682, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86189067, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18762207, + "step": 4690, + "time_per_iteration": 2.5567362308502197 + }, + { + "auxiliary_loss_clip": 0.06514366, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06301816, + "balance_loss_mlp": 0.01256981, + "epoch": 0.28203817826544414, + "flos": 21622779118080.0, + "grad_norm": 2.5530247222146976, + "language_loss": 0.87619591, + "learning_rate": 3.369826514835332e-06, + "loss": 0.95409369, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18432617, + "step": 4691, + "time_per_iteration": 2.5987935066223145 + }, + { + "auxiliary_loss_clip": 0.0651565, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.0629878, + "balance_loss_mlp": 0.01258787, + "epoch": 0.2820983015181121, + "flos": 24034010935680.0, + "grad_norm": 1.7719901211447804, + "language_loss": 0.82443225, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.90235984, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18322754, + "step": 4692, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.06515577, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06304249, + "balance_loss_mlp": 0.01259921, + "epoch": 0.2821584247707801, + "flos": 30015725408640.0, + "grad_norm": 1.5203777397001885, + "language_loss": 0.74437934, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82232404, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.1895752, + "step": 4693, + "time_per_iteration": 2.6104559898376465 + }, + { + "auxiliary_loss_clip": 0.06512225, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06298921, + "balance_loss_mlp": 0.01255593, + "epoch": 0.2822185480234481, + "flos": 21403034985600.0, + "grad_norm": 1.7641787467317929, + "language_loss": 0.77641487, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.85428035, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 4694, + "time_per_iteration": 2.5619184970855713 + }, + { + "auxiliary_loss_clip": 0.06513312, + "auxiliary_loss_mlp": 0.01274888, + "balance_loss_clip": 0.0630666, + "balance_loss_mlp": 0.01255898, + "epoch": 0.28227867127611606, + "flos": 27459996024960.0, + "grad_norm": 2.064814820064932, + "language_loss": 0.67270994, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.75059193, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18969727, + "step": 4695, + "time_per_iteration": 2.5849459171295166 + }, + { + "auxiliary_loss_clip": 0.06524754, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06312457, + "balance_loss_mlp": 0.01255914, + "epoch": 0.282338794528784, + "flos": 22599028638720.0, + "grad_norm": 2.3022925444863747, + "language_loss": 0.75992346, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.83794391, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.21362305, + "step": 4696, + "time_per_iteration": 2.5599312782287598 + }, + { + "auxiliary_loss_clip": 0.06528555, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06319815, + "balance_loss_mlp": 0.01257915, + "epoch": 0.282398917781452, + "flos": 42020592998400.0, + "grad_norm": 1.6923608864022255, + "language_loss": 0.62607121, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70412022, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.1842041, + "step": 4697, + "time_per_iteration": 2.719783067703247 + }, + { + "auxiliary_loss_clip": 0.0651894, + "auxiliary_loss_mlp": 0.01278397, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.01260564, + "epoch": 0.28245904103411995, + "flos": 23231916126720.0, + "grad_norm": 1.330125700327103, + "language_loss": 0.73835146, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.81632483, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17834473, + "step": 4698, + "time_per_iteration": 2.671154260635376 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01274177, + "balance_loss_clip": 0.06314629, + "balance_loss_mlp": 0.01255699, + "epoch": 0.2825191642867879, + "flos": 25381713605760.0, + "grad_norm": 1.8806904568543696, + "language_loss": 0.75498992, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.83293265, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.18481445, + "step": 4699, + "time_per_iteration": 2.749073028564453 + }, + { + "auxiliary_loss_clip": 0.06532586, + "auxiliary_loss_mlp": 0.0127858, + "balance_loss_clip": 0.06318063, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2825792875394559, + "flos": 17242277443200.0, + "grad_norm": 2.5468251061801697, + "language_loss": 0.80103695, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.87914866, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.20617676, + "step": 4700, + "time_per_iteration": 2.539794683456421 + }, + { + "auxiliary_loss_clip": 0.06516679, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06314512, + "balance_loss_mlp": 0.01257006, + "epoch": 0.28263941079212385, + "flos": 26731177211520.0, + "grad_norm": 2.1068022199140213, + "language_loss": 0.8243857, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.90229392, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17114258, + "step": 4701, + "time_per_iteration": 2.5763485431671143 + }, + { + "auxiliary_loss_clip": 0.06520683, + "auxiliary_loss_mlp": 0.01274057, + "balance_loss_clip": 0.06312392, + "balance_loss_mlp": 0.01256116, + "epoch": 0.2826995340447918, + "flos": 25928411569920.0, + "grad_norm": 2.2990609650841276, + "language_loss": 0.73153478, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.80948216, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17956543, + "step": 4702, + "time_per_iteration": 2.5968289375305176 + }, + { + "auxiliary_loss_clip": 0.06520355, + "auxiliary_loss_mlp": 0.01275823, + "balance_loss_clip": 0.06316096, + "balance_loss_mlp": 0.01258848, + "epoch": 0.2827596572974598, + "flos": 22385783197440.0, + "grad_norm": 1.6603391807745085, + "language_loss": 0.78883457, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86679637, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1697998, + "step": 4703, + "time_per_iteration": 2.56088924407959 + }, + { + "auxiliary_loss_clip": 0.06518066, + "auxiliary_loss_mlp": 0.01281519, + "balance_loss_clip": 0.06307587, + "balance_loss_mlp": 0.01261885, + "epoch": 0.28281978055012774, + "flos": 33555544669440.0, + "grad_norm": 1.530922589206002, + "language_loss": 0.69937778, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.19628906, + "step": 4704, + "time_per_iteration": 2.725234031677246 + }, + { + "auxiliary_loss_clip": 0.0652602, + "auxiliary_loss_mlp": 0.01283133, + "balance_loss_clip": 0.06319317, + "balance_loss_mlp": 0.01264119, + "epoch": 0.2828799038027957, + "flos": 23447635263360.0, + "grad_norm": 1.9265232828394878, + "language_loss": 0.70927215, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.78736377, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.19006348, + "step": 4705, + "time_per_iteration": 2.5391383171081543 + }, + { + "auxiliary_loss_clip": 0.06482799, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06378852, + "balance_loss_mlp": 0.01263947, + "epoch": 0.2829400270554637, + "flos": 69892055297280.0, + "grad_norm": 0.9159756060868983, + "language_loss": 0.59201139, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.66952819, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04928589, + "step": 4706, + "time_per_iteration": 3.219618797302246 + }, + { + "auxiliary_loss_clip": 0.06512764, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 0.06312177, + "balance_loss_mlp": 0.01260547, + "epoch": 0.2830001503081317, + "flos": 24795715276800.0, + "grad_norm": 1.373077415158703, + "language_loss": 0.82380199, + "learning_rate": 3.365279531475407e-06, + "loss": 0.90170658, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.17138672, + "step": 4707, + "time_per_iteration": 2.5680840015411377 + }, + { + "auxiliary_loss_clip": 0.06518079, + "auxiliary_loss_mlp": 0.01276357, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01257391, + "epoch": 0.28306027356079966, + "flos": 27676218286080.0, + "grad_norm": 1.5569970524845527, + "language_loss": 0.81077999, + "learning_rate": 3.36499490449902e-06, + "loss": 0.88872433, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18969727, + "step": 4708, + "time_per_iteration": 2.643389940261841 + }, + { + "auxiliary_loss_clip": 0.06443536, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 0.06339511, + "balance_loss_mlp": 0.01264025, + "epoch": 0.2831203968134676, + "flos": 60543837734400.0, + "grad_norm": 0.8586282544888121, + "language_loss": 0.62812036, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.7052421, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.04608154, + "step": 4709, + "time_per_iteration": 3.0554397106170654 + }, + { + "auxiliary_loss_clip": 0.06507774, + "auxiliary_loss_mlp": 0.01270408, + "balance_loss_clip": 0.06301016, + "balance_loss_mlp": 0.01253015, + "epoch": 0.2831805200661356, + "flos": 22061386915200.0, + "grad_norm": 1.4201642822404892, + "language_loss": 0.74412584, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.82190764, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1739502, + "step": 4710, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.06514937, + "auxiliary_loss_mlp": 0.01275331, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01256627, + "epoch": 0.28324064331880355, + "flos": 22607120557440.0, + "grad_norm": 1.9767009095982746, + "language_loss": 0.8018595, + "learning_rate": 3.364140713048579e-06, + "loss": 0.87976217, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18713379, + "step": 4711, + "time_per_iteration": 2.610027313232422 + }, + { + "auxiliary_loss_clip": 0.06509729, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06300638, + "balance_loss_mlp": 0.01260385, + "epoch": 0.2833007665714715, + "flos": 30411133626240.0, + "grad_norm": 1.982526263820073, + "language_loss": 0.70604694, + "learning_rate": 3.363855879093996e-06, + "loss": 0.78392917, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4712, + "time_per_iteration": 2.602795124053955 + }, + { + "auxiliary_loss_clip": 0.06508194, + "auxiliary_loss_mlp": 0.01282495, + "balance_loss_clip": 0.06299947, + "balance_loss_mlp": 0.01262992, + "epoch": 0.2833608898241395, + "flos": 23556144700800.0, + "grad_norm": 1.7823239687069516, + "language_loss": 0.8193841, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.89729095, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19494629, + "step": 4713, + "time_per_iteration": 2.6088523864746094 + }, + { + "auxiliary_loss_clip": 0.06512519, + "auxiliary_loss_mlp": 0.01275048, + "balance_loss_clip": 0.06304006, + "balance_loss_mlp": 0.01255236, + "epoch": 0.28342101307680745, + "flos": 20272980096000.0, + "grad_norm": 2.6212370689858493, + "language_loss": 0.75431275, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.83218849, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19799805, + "step": 4714, + "time_per_iteration": 3.986696243286133 + }, + { + "auxiliary_loss_clip": 0.06505996, + "auxiliary_loss_mlp": 0.01276776, + "balance_loss_clip": 0.06297115, + "balance_loss_mlp": 0.01259324, + "epoch": 0.2834811363294754, + "flos": 30854982303360.0, + "grad_norm": 1.3268888753773178, + "language_loss": 0.78198218, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.85980994, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17468262, + "step": 4715, + "time_per_iteration": 2.652470111846924 + }, + { + "auxiliary_loss_clip": 0.06506517, + "auxiliary_loss_mlp": 0.01277278, + "balance_loss_clip": 0.06300199, + "balance_loss_mlp": 0.01260088, + "epoch": 0.2835412595821434, + "flos": 22717642492800.0, + "grad_norm": 1.6173599581374518, + "language_loss": 0.74551272, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.82335067, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17175293, + "step": 4716, + "time_per_iteration": 2.597083806991577 + }, + { + "auxiliary_loss_clip": 0.06516325, + "auxiliary_loss_mlp": 0.01281584, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.0126189, + "epoch": 0.28360138283481134, + "flos": 18083630689920.0, + "grad_norm": 2.1150039301458112, + "language_loss": 0.75477433, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.83275348, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.19702148, + "step": 4717, + "time_per_iteration": 2.5648136138916016 + }, + { + "auxiliary_loss_clip": 0.06514253, + "auxiliary_loss_mlp": 0.01277656, + "balance_loss_clip": 0.06302426, + "balance_loss_mlp": 0.01258606, + "epoch": 0.2836615060874793, + "flos": 17859987342720.0, + "grad_norm": 1.540618458402471, + "language_loss": 0.67445159, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75237072, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19042969, + "step": 4718, + "time_per_iteration": 3.962265968322754 + }, + { + "auxiliary_loss_clip": 0.06507722, + "auxiliary_loss_mlp": 0.01278787, + "balance_loss_clip": 0.06295013, + "balance_loss_mlp": 0.01258772, + "epoch": 0.2837216293401473, + "flos": 25747590458880.0, + "grad_norm": 1.8038295919740834, + "language_loss": 0.73164374, + "learning_rate": 3.361860593925566e-06, + "loss": 0.8095088, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20007324, + "step": 4719, + "time_per_iteration": 4.095008134841919 + }, + { + "auxiliary_loss_clip": 0.0650832, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 0.06301163, + "balance_loss_mlp": 0.01259386, + "epoch": 0.2837817525928153, + "flos": 20929906506240.0, + "grad_norm": 1.8981156672354917, + "language_loss": 0.80600828, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.88386989, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18444824, + "step": 4720, + "time_per_iteration": 2.53869366645813 + }, + { + "auxiliary_loss_clip": 0.06515027, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06304276, + "balance_loss_mlp": 0.01261687, + "epoch": 0.28384187584548326, + "flos": 18922719876480.0, + "grad_norm": 1.7940545446838874, + "language_loss": 0.7966662, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.87462288, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18945312, + "step": 4721, + "time_per_iteration": 2.5736734867095947 + }, + { + "auxiliary_loss_clip": 0.06507237, + "auxiliary_loss_mlp": 0.01272866, + "balance_loss_clip": 0.06298702, + "balance_loss_mlp": 0.01254996, + "epoch": 0.2839019990981512, + "flos": 27351235025280.0, + "grad_norm": 1.8504915753410351, + "language_loss": 0.83238685, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17871094, + "step": 4722, + "time_per_iteration": 2.5798823833465576 + }, + { + "auxiliary_loss_clip": 0.06511718, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01255547, + "epoch": 0.2839621223508192, + "flos": 18120247724160.0, + "grad_norm": 1.9056364243243222, + "language_loss": 0.71157932, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18225098, + "step": 4723, + "time_per_iteration": 2.5472381114959717 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01279225, + "balance_loss_clip": 0.06299602, + "balance_loss_mlp": 0.01259937, + "epoch": 0.28402224560348716, + "flos": 26365384212480.0, + "grad_norm": 1.5487216964387416, + "language_loss": 0.7882036, + "learning_rate": 3.360433840760998e-06, + "loss": 0.86608005, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.19299316, + "step": 4724, + "time_per_iteration": 4.039300203323364 + }, + { + "auxiliary_loss_clip": 0.0650482, + "auxiliary_loss_mlp": 0.01275588, + "balance_loss_clip": 0.06294143, + "balance_loss_mlp": 0.0125754, + "epoch": 0.2840823688561551, + "flos": 24067609223040.0, + "grad_norm": 1.5786087270385247, + "language_loss": 0.92781484, + "learning_rate": 3.36014833532143e-06, + "loss": 1.00561893, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18066406, + "step": 4725, + "time_per_iteration": 2.5839502811431885 + }, + { + "auxiliary_loss_clip": 0.06504668, + "auxiliary_loss_mlp": 0.01283756, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01263097, + "epoch": 0.2841424921088231, + "flos": 29467392289920.0, + "grad_norm": 1.5513315701194426, + "language_loss": 0.89446843, + "learning_rate": 3.3598627783049e-06, + "loss": 0.97235262, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20666504, + "step": 4726, + "time_per_iteration": 2.617002010345459 + }, + { + "auxiliary_loss_clip": 0.06507252, + "auxiliary_loss_mlp": 0.01284138, + "balance_loss_clip": 0.0629679, + "balance_loss_mlp": 0.01264409, + "epoch": 0.28420261536149105, + "flos": 48110439565440.0, + "grad_norm": 2.259876030173266, + "language_loss": 0.79337573, + "learning_rate": 3.359577169722238e-06, + "loss": 0.87128961, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19763184, + "step": 4727, + "time_per_iteration": 2.774508476257324 + }, + { + "auxiliary_loss_clip": 0.06499238, + "auxiliary_loss_mlp": 0.01275292, + "balance_loss_clip": 0.06294493, + "balance_loss_mlp": 0.01257483, + "epoch": 0.284262738614159, + "flos": 25673224360320.0, + "grad_norm": 2.051338722061539, + "language_loss": 0.67073631, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.74848163, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17810059, + "step": 4728, + "time_per_iteration": 2.614614725112915 + }, + { + "auxiliary_loss_clip": 0.06494898, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06287634, + "balance_loss_mlp": 0.01255702, + "epoch": 0.284322861866827, + "flos": 19725066247680.0, + "grad_norm": 2.0236031999203132, + "language_loss": 0.76682353, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84451514, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.18554688, + "step": 4729, + "time_per_iteration": 2.542400360107422 + }, + { + "auxiliary_loss_clip": 0.06505589, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.0125414, + "epoch": 0.28438298511949495, + "flos": 23922105408000.0, + "grad_norm": 1.7626205541686495, + "language_loss": 0.67443657, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.75222254, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1887207, + "step": 4730, + "time_per_iteration": 2.6005139350891113 + }, + { + "auxiliary_loss_clip": 0.06503962, + "auxiliary_loss_mlp": 0.01275972, + "balance_loss_clip": 0.06292562, + "balance_loss_mlp": 0.01256219, + "epoch": 0.2844431083721629, + "flos": 26074460436480.0, + "grad_norm": 1.9951841893982447, + "language_loss": 0.74777246, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.82557184, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.1973877, + "step": 4731, + "time_per_iteration": 2.571259021759033 + }, + { + "auxiliary_loss_clip": 0.06501718, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06291741, + "balance_loss_mlp": 0.01257384, + "epoch": 0.2845032316248309, + "flos": 25817260728960.0, + "grad_norm": 1.5216025808612688, + "language_loss": 0.8435545, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.92132688, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18139648, + "step": 4732, + "time_per_iteration": 2.604717254638672 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277146, + "balance_loss_clip": 0.06295733, + "balance_loss_mlp": 0.01256082, + "epoch": 0.2845633548774989, + "flos": 19828418659200.0, + "grad_norm": 1.722472955192697, + "language_loss": 0.79522747, + "learning_rate": 3.357862435944109e-06, + "loss": 0.87308168, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.21069336, + "step": 4733, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06511072, + "auxiliary_loss_mlp": 0.01275761, + "balance_loss_clip": 0.06296709, + "balance_loss_mlp": 0.01256878, + "epoch": 0.28462347813016686, + "flos": 23189093890560.0, + "grad_norm": 2.336729990473161, + "language_loss": 0.72093451, + "learning_rate": 3.357576466701875e-06, + "loss": 0.79880273, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.1887207, + "step": 4734, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.06501292, + "auxiliary_loss_mlp": 0.01274129, + "balance_loss_clip": 0.06292972, + "balance_loss_mlp": 0.01256283, + "epoch": 0.2846836013828348, + "flos": 18666316782720.0, + "grad_norm": 1.7839237241912007, + "language_loss": 0.74739748, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.1784668, + "step": 4735, + "time_per_iteration": 2.5192623138427734 + }, + { + "auxiliary_loss_clip": 0.06500865, + "auxiliary_loss_mlp": 0.01274478, + "balance_loss_clip": 0.06291883, + "balance_loss_mlp": 0.01256096, + "epoch": 0.2847437246355028, + "flos": 14178731189760.0, + "grad_norm": 1.8549790130823454, + "language_loss": 0.81047934, + "learning_rate": 3.357004373789946e-06, + "loss": 0.88823277, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18383789, + "step": 4736, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.06503595, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01256285, + "epoch": 0.28480384788817076, + "flos": 29286068054400.0, + "grad_norm": 3.1700593253391895, + "language_loss": 0.60580242, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.68358433, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18310547, + "step": 4737, + "time_per_iteration": 2.591672897338867 + }, + { + "auxiliary_loss_clip": 0.06501776, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06295541, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2848639711408387, + "flos": 22607875244160.0, + "grad_norm": 1.8212806326874897, + "language_loss": 0.86685491, + "learning_rate": 3.356432075047052e-06, + "loss": 0.94461757, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.1763916, + "step": 4738, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.06504256, + "auxiliary_loss_mlp": 0.01280924, + "balance_loss_clip": 0.06291994, + "balance_loss_mlp": 0.01260575, + "epoch": 0.2849240943935067, + "flos": 17604632424960.0, + "grad_norm": 2.187311269731562, + "language_loss": 0.90640962, + "learning_rate": 3.356145848516118e-06, + "loss": 0.98426139, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20336914, + "step": 4739, + "time_per_iteration": 2.491391897201538 + }, + { + "auxiliary_loss_clip": 0.06502014, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06294325, + "balance_loss_mlp": 0.01254363, + "epoch": 0.28498421764617465, + "flos": 24869368615680.0, + "grad_norm": 1.2838984451042732, + "language_loss": 0.72652215, + "learning_rate": 3.355859570559998e-06, + "loss": 0.80426115, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17529297, + "step": 4740, + "time_per_iteration": 2.628420352935791 + }, + { + "auxiliary_loss_clip": 0.06497836, + "auxiliary_loss_mlp": 0.01273023, + "balance_loss_clip": 0.06293581, + "balance_loss_mlp": 0.01254069, + "epoch": 0.2850443408988426, + "flos": 22788947917440.0, + "grad_norm": 1.7372555552312992, + "language_loss": 0.77982342, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.85753202, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1895752, + "step": 4741, + "time_per_iteration": 2.5205776691436768 + }, + { + "auxiliary_loss_clip": 0.06505083, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06290049, + "balance_loss_mlp": 0.01260278, + "epoch": 0.2851044641515106, + "flos": 18850114713600.0, + "grad_norm": 2.3624012556043246, + "language_loss": 0.7702412, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.84808373, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18896484, + "step": 4742, + "time_per_iteration": 2.5852768421173096 + }, + { + "auxiliary_loss_clip": 0.06507465, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06292667, + "balance_loss_mlp": 0.01260252, + "epoch": 0.28516458740417855, + "flos": 18886564039680.0, + "grad_norm": 2.066213096861692, + "language_loss": 0.57976151, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65764809, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.20959473, + "step": 4743, + "time_per_iteration": 2.562298059463501 + }, + { + "auxiliary_loss_clip": 0.06507643, + "auxiliary_loss_mlp": 0.01278324, + "balance_loss_clip": 0.06297275, + "balance_loss_mlp": 0.01259787, + "epoch": 0.2852247106568465, + "flos": 25306592820480.0, + "grad_norm": 1.602300087654556, + "language_loss": 0.75013685, + "learning_rate": 3.354713944700797e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1854248, + "step": 4744, + "time_per_iteration": 2.610302209854126 + }, + { + "auxiliary_loss_clip": 0.06500175, + "auxiliary_loss_mlp": 0.01276557, + "balance_loss_clip": 0.06292172, + "balance_loss_mlp": 0.01258794, + "epoch": 0.2852848339095145, + "flos": 11660080037760.0, + "grad_norm": 2.2644691376510844, + "language_loss": 0.78515136, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86291873, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17749023, + "step": 4745, + "time_per_iteration": 2.5170419216156006 + }, + { + "auxiliary_loss_clip": 0.06491117, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 0.06290857, + "balance_loss_mlp": 0.01254836, + "epoch": 0.2853449571621825, + "flos": 12938280145920.0, + "grad_norm": 1.7221704990089022, + "language_loss": 0.83220983, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.9098506, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18127441, + "step": 4746, + "time_per_iteration": 2.6257071495056152 + }, + { + "auxiliary_loss_clip": 0.06514393, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.06295399, + "balance_loss_mlp": 0.01257943, + "epoch": 0.28540508041485046, + "flos": 20016660856320.0, + "grad_norm": 1.8084134515670756, + "language_loss": 0.80507863, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.88300824, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20617676, + "step": 4747, + "time_per_iteration": 2.5699074268341064 + }, + { + "auxiliary_loss_clip": 0.06375369, + "auxiliary_loss_mlp": 0.0127529, + "balance_loss_clip": 0.0627491, + "balance_loss_mlp": 0.01269043, + "epoch": 0.28546520366751843, + "flos": 68160264710400.0, + "grad_norm": 0.7514031277524565, + "language_loss": 0.60153103, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.67803764, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.06237793, + "step": 4748, + "time_per_iteration": 3.1155877113342285 + }, + { + "auxiliary_loss_clip": 0.06492989, + "auxiliary_loss_mlp": 0.01272874, + "balance_loss_clip": 0.06285426, + "balance_loss_mlp": 0.01255791, + "epoch": 0.2855253269201864, + "flos": 13254961852800.0, + "grad_norm": 2.1744647780903352, + "language_loss": 0.80643219, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.88409078, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17089844, + "step": 4749, + "time_per_iteration": 2.5422439575195312 + }, + { + "auxiliary_loss_clip": 0.06506198, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 0.06295547, + "balance_loss_mlp": 0.0126011, + "epoch": 0.28558545017285436, + "flos": 28628345030400.0, + "grad_norm": 1.9900791940744995, + "language_loss": 0.70889151, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78674042, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18579102, + "step": 4750, + "time_per_iteration": 2.6223177909851074 + }, + { + "auxiliary_loss_clip": 0.06498066, + "auxiliary_loss_mlp": 0.01278692, + "balance_loss_clip": 0.06294224, + "balance_loss_mlp": 0.01261562, + "epoch": 0.2856455734255223, + "flos": 34138901594880.0, + "grad_norm": 1.523200352045364, + "language_loss": 0.82438904, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.90215659, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17138672, + "step": 4751, + "time_per_iteration": 2.710822582244873 + }, + { + "auxiliary_loss_clip": 0.06498431, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01258424, + "epoch": 0.2857056966781903, + "flos": 39795590880000.0, + "grad_norm": 1.6833478059847915, + "language_loss": 0.80598158, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88373208, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1817627, + "step": 4752, + "time_per_iteration": 2.685669422149658 + }, + { + "auxiliary_loss_clip": 0.0649987, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06292621, + "balance_loss_mlp": 0.01254223, + "epoch": 0.28576581993085826, + "flos": 21878846795520.0, + "grad_norm": 1.793038640961372, + "language_loss": 0.79062063, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86834359, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18200684, + "step": 4753, + "time_per_iteration": 2.612639904022217 + }, + { + "auxiliary_loss_clip": 0.06511062, + "auxiliary_loss_mlp": 0.01278051, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01257523, + "epoch": 0.2858259431835262, + "flos": 19096455317760.0, + "grad_norm": 2.5775982542053963, + "language_loss": 0.89774185, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.97563303, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.20532227, + "step": 4754, + "time_per_iteration": 3.914802312850952 + }, + { + "auxiliary_loss_clip": 0.06494384, + "auxiliary_loss_mlp": 0.01278048, + "balance_loss_clip": 0.06293342, + "balance_loss_mlp": 0.01259988, + "epoch": 0.2858860664361942, + "flos": 20339673546240.0, + "grad_norm": 1.9874166310668562, + "language_loss": 0.82672411, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.90444839, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18066406, + "step": 4755, + "time_per_iteration": 2.673158884048462 + }, + { + "auxiliary_loss_clip": 0.06498866, + "auxiliary_loss_mlp": 0.01274185, + "balance_loss_clip": 0.06291682, + "balance_loss_mlp": 0.0125721, + "epoch": 0.28594618968886215, + "flos": 24468551809920.0, + "grad_norm": 1.6562500913369433, + "language_loss": 0.83843541, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91616589, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.16992188, + "step": 4756, + "time_per_iteration": 2.6029391288757324 + }, + { + "auxiliary_loss_clip": 0.06377822, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01256002, + "epoch": 0.2860063129415301, + "flos": 71676170830080.0, + "grad_norm": 1.4612509113917642, + "language_loss": 0.6086607, + "learning_rate": 3.350984987779142e-06, + "loss": 0.68506116, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.06207275, + "step": 4757, + "time_per_iteration": 3.326833963394165 + }, + { + "auxiliary_loss_clip": 0.0650306, + "auxiliary_loss_mlp": 0.01277184, + "balance_loss_clip": 0.06298901, + "balance_loss_mlp": 0.01260459, + "epoch": 0.2860664361941981, + "flos": 20564993975040.0, + "grad_norm": 2.5468639815388996, + "language_loss": 0.66759324, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.74539566, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1673584, + "step": 4758, + "time_per_iteration": 5.454218626022339 + }, + { + "auxiliary_loss_clip": 0.06503905, + "auxiliary_loss_mlp": 0.01277556, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01258817, + "epoch": 0.2861265594468661, + "flos": 36005992997760.0, + "grad_norm": 1.4420872105733484, + "language_loss": 0.63405287, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.71186751, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.1875, + "step": 4759, + "time_per_iteration": 2.745704174041748 + }, + { + "auxiliary_loss_clip": 0.06510226, + "auxiliary_loss_mlp": 0.01276918, + "balance_loss_clip": 0.06302258, + "balance_loss_mlp": 0.01257892, + "epoch": 0.28618668269953407, + "flos": 20053571379840.0, + "grad_norm": 2.14199936751817, + "language_loss": 0.74684435, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82471573, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19030762, + "step": 4760, + "time_per_iteration": 2.541759490966797 + }, + { + "auxiliary_loss_clip": 0.06496474, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01260482, + "epoch": 0.28624680595220203, + "flos": 24978632739840.0, + "grad_norm": 1.8333731861449165, + "language_loss": 0.72652757, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80425525, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.15795898, + "step": 4761, + "time_per_iteration": 2.57940673828125 + }, + { + "auxiliary_loss_clip": 0.06509258, + "auxiliary_loss_mlp": 0.01273154, + "balance_loss_clip": 0.06299996, + "balance_loss_mlp": 0.01256095, + "epoch": 0.28630692920487, + "flos": 22498862682240.0, + "grad_norm": 1.9183655494362113, + "language_loss": 0.74669504, + "learning_rate": 3.349548466945793e-06, + "loss": 0.82451922, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1706543, + "step": 4762, + "time_per_iteration": 2.5321590900421143 + }, + { + "auxiliary_loss_clip": 0.06505883, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 0.06301434, + "balance_loss_mlp": 0.0125694, + "epoch": 0.28636705245753796, + "flos": 21255979870080.0, + "grad_norm": 2.6303759088840413, + "language_loss": 0.76297629, + "learning_rate": 3.349261009210496e-06, + "loss": 0.84077883, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17443848, + "step": 4763, + "time_per_iteration": 3.979782819747925 + }, + { + "auxiliary_loss_clip": 0.06506684, + "auxiliary_loss_mlp": 0.01275654, + "balance_loss_clip": 0.06298703, + "balance_loss_mlp": 0.012572, + "epoch": 0.28642717571020593, + "flos": 24102339540480.0, + "grad_norm": 1.7484925103151405, + "language_loss": 0.77499843, + "learning_rate": 3.348973500311086e-06, + "loss": 0.85282177, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18444824, + "step": 4764, + "time_per_iteration": 2.6036336421966553 + }, + { + "auxiliary_loss_clip": 0.0651267, + "auxiliary_loss_mlp": 0.01277486, + "balance_loss_clip": 0.06302905, + "balance_loss_mlp": 0.01257829, + "epoch": 0.2864872989628739, + "flos": 22607959098240.0, + "grad_norm": 5.154577786286556, + "language_loss": 0.71671587, + "learning_rate": 3.348685940258466e-06, + "loss": 0.79461741, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1965332, + "step": 4765, + "time_per_iteration": 2.5488131046295166 + }, + { + "auxiliary_loss_clip": 0.0651048, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01255684, + "epoch": 0.28654742221554186, + "flos": 32753449860480.0, + "grad_norm": 1.504395922922802, + "language_loss": 0.7630865, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.84091872, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17053223, + "step": 4766, + "time_per_iteration": 2.659499406814575 + }, + { + "auxiliary_loss_clip": 0.0650377, + "auxiliary_loss_mlp": 0.01271145, + "balance_loss_clip": 0.0630042, + "balance_loss_mlp": 0.01254277, + "epoch": 0.2866075454682098, + "flos": 26989257386880.0, + "grad_norm": 2.0841406955827075, + "language_loss": 0.78443938, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86218858, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.16870117, + "step": 4767, + "time_per_iteration": 2.5891125202178955 + }, + { + "auxiliary_loss_clip": 0.06511022, + "auxiliary_loss_mlp": 0.01279425, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01261746, + "epoch": 0.2866676687208778, + "flos": 23259812336640.0, + "grad_norm": 2.0448044221544737, + "language_loss": 0.65430236, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.73220682, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17675781, + "step": 4768, + "time_per_iteration": 2.572230815887451 + }, + { + "auxiliary_loss_clip": 0.0651636, + "auxiliary_loss_mlp": 0.01271508, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01253782, + "epoch": 0.28672779197354575, + "flos": 21586120156800.0, + "grad_norm": 1.6016626643500549, + "language_loss": 0.71173406, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.78961271, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17724609, + "step": 4769, + "time_per_iteration": 2.5180304050445557 + }, + { + "auxiliary_loss_clip": 0.06513099, + "auxiliary_loss_mlp": 0.01273812, + "balance_loss_clip": 0.06304821, + "balance_loss_mlp": 0.01256562, + "epoch": 0.2867879152262137, + "flos": 19871785946880.0, + "grad_norm": 1.7128041826885096, + "language_loss": 0.75347042, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83133948, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17248535, + "step": 4770, + "time_per_iteration": 2.575993537902832 + }, + { + "auxiliary_loss_clip": 0.06514675, + "auxiliary_loss_mlp": 0.01275884, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257967, + "epoch": 0.2868480384788817, + "flos": 28219687868160.0, + "grad_norm": 4.606069071133779, + "language_loss": 0.68064034, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17907715, + "step": 4771, + "time_per_iteration": 2.5533907413482666 + }, + { + "auxiliary_loss_clip": 0.06411134, + "auxiliary_loss_mlp": 0.0125763, + "balance_loss_clip": 0.06311508, + "balance_loss_mlp": 0.01253345, + "epoch": 0.2869081617315497, + "flos": 65442218768640.0, + "grad_norm": 0.7478629548239109, + "language_loss": 0.56696546, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.64365304, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.04290771, + "step": 4772, + "time_per_iteration": 3.1295437812805176 + }, + { + "auxiliary_loss_clip": 0.06515288, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06305212, + "balance_loss_mlp": 0.01256165, + "epoch": 0.28696828498421767, + "flos": 18666610272000.0, + "grad_norm": 3.729070810615603, + "language_loss": 0.84013474, + "learning_rate": 3.346383619630856e-06, + "loss": 0.91803479, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1854248, + "step": 4773, + "time_per_iteration": 2.5181708335876465 + }, + { + "auxiliary_loss_clip": 0.06518447, + "auxiliary_loss_mlp": 0.01274166, + "balance_loss_clip": 0.06306095, + "balance_loss_mlp": 0.01254985, + "epoch": 0.28702840823688563, + "flos": 23666540855040.0, + "grad_norm": 2.856350636496585, + "language_loss": 0.78241181, + "learning_rate": 3.34609559969027e-06, + "loss": 0.86033797, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19177246, + "step": 4774, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06519175, + "auxiliary_loss_mlp": 0.01275468, + "balance_loss_clip": 0.06307949, + "balance_loss_mlp": 0.01255703, + "epoch": 0.2870885314895536, + "flos": 13809248611200.0, + "grad_norm": 1.8762920881530476, + "language_loss": 0.74056339, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.81850982, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.19763184, + "step": 4775, + "time_per_iteration": 2.505293369293213 + }, + { + "auxiliary_loss_clip": 0.06520346, + "auxiliary_loss_mlp": 0.01275844, + "balance_loss_clip": 0.06309157, + "balance_loss_mlp": 0.01258142, + "epoch": 0.28714865474222157, + "flos": 17792790768000.0, + "grad_norm": 1.8823617406689648, + "language_loss": 0.88338864, + "learning_rate": 3.34551940668778e-06, + "loss": 0.96135056, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17687988, + "step": 4776, + "time_per_iteration": 2.5638997554779053 + }, + { + "auxiliary_loss_clip": 0.06511634, + "auxiliary_loss_mlp": 0.01275769, + "balance_loss_clip": 0.06302971, + "balance_loss_mlp": 0.01258269, + "epoch": 0.28720877799488953, + "flos": 16002958429440.0, + "grad_norm": 2.648093963017482, + "language_loss": 0.74451852, + "learning_rate": 3.345231233647726e-06, + "loss": 0.82239252, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17492676, + "step": 4777, + "time_per_iteration": 2.5142223834991455 + }, + { + "auxiliary_loss_clip": 0.06527238, + "auxiliary_loss_mlp": 0.01280106, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01259924, + "epoch": 0.2872689012475575, + "flos": 20929445308800.0, + "grad_norm": 2.200879096052639, + "language_loss": 0.80539143, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.88346487, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20202637, + "step": 4778, + "time_per_iteration": 2.563994884490967 + }, + { + "auxiliary_loss_clip": 0.06511427, + "auxiliary_loss_mlp": 0.01281129, + "balance_loss_clip": 0.06304548, + "balance_loss_mlp": 0.01263223, + "epoch": 0.28732902450022546, + "flos": 21331603779840.0, + "grad_norm": 1.7996465112645923, + "language_loss": 0.73886508, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.8167907, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17895508, + "step": 4779, + "time_per_iteration": 2.5394158363342285 + }, + { + "auxiliary_loss_clip": 0.06518923, + "auxiliary_loss_mlp": 0.01275383, + "balance_loss_clip": 0.06307982, + "balance_loss_mlp": 0.01255379, + "epoch": 0.2873891477528934, + "flos": 20856714364800.0, + "grad_norm": 1.509851280453794, + "language_loss": 0.76844704, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.84639007, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19995117, + "step": 4780, + "time_per_iteration": 2.5928425788879395 + }, + { + "auxiliary_loss_clip": 0.06507713, + "auxiliary_loss_mlp": 0.01271777, + "balance_loss_clip": 0.06302975, + "balance_loss_mlp": 0.01254014, + "epoch": 0.2874492710055614, + "flos": 17425698030720.0, + "grad_norm": 1.6471362454858889, + "language_loss": 0.81874287, + "learning_rate": 3.344078031483784e-06, + "loss": 0.89653778, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17773438, + "step": 4781, + "time_per_iteration": 2.6121537685394287 + }, + { + "auxiliary_loss_clip": 0.06521222, + "auxiliary_loss_mlp": 0.0127902, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01257002, + "epoch": 0.28750939425822936, + "flos": 13411827895680.0, + "grad_norm": 2.0671181517724966, + "language_loss": 0.86987036, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.94787276, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22009277, + "step": 4782, + "time_per_iteration": 2.554326057434082 + }, + { + "auxiliary_loss_clip": 0.06525762, + "auxiliary_loss_mlp": 0.01282396, + "balance_loss_clip": 0.06310341, + "balance_loss_mlp": 0.01262238, + "epoch": 0.2875695175108973, + "flos": 21876205392000.0, + "grad_norm": 1.4282255381090248, + "language_loss": 0.71525908, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20153809, + "step": 4783, + "time_per_iteration": 2.5632100105285645 + }, + { + "auxiliary_loss_clip": 0.06514136, + "auxiliary_loss_mlp": 0.01279499, + "balance_loss_clip": 0.06305264, + "balance_loss_mlp": 0.01259186, + "epoch": 0.2876296407635653, + "flos": 26251885457280.0, + "grad_norm": 1.5568964680804804, + "language_loss": 0.77152872, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84946513, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.203125, + "step": 4784, + "time_per_iteration": 2.589073657989502 + }, + { + "auxiliary_loss_clip": 0.06506136, + "auxiliary_loss_mlp": 0.01278073, + "balance_loss_clip": 0.06301259, + "balance_loss_mlp": 0.01257914, + "epoch": 0.28768976401623325, + "flos": 25380581575680.0, + "grad_norm": 1.5725877671574655, + "language_loss": 0.76106405, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.83890617, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.20153809, + "step": 4785, + "time_per_iteration": 2.6051061153411865 + }, + { + "auxiliary_loss_clip": 0.06513079, + "auxiliary_loss_mlp": 0.0127873, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01259394, + "epoch": 0.28774988726890127, + "flos": 30672232548480.0, + "grad_norm": 2.246179731229797, + "language_loss": 0.83339965, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91131771, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19348145, + "step": 4786, + "time_per_iteration": 2.6064071655273438 + }, + { + "auxiliary_loss_clip": 0.06512371, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06303188, + "balance_loss_mlp": 0.01258934, + "epoch": 0.28781001052156924, + "flos": 20601820644480.0, + "grad_norm": 2.4876341958211037, + "language_loss": 0.80607671, + "learning_rate": 3.342346699429516e-06, + "loss": 0.88398409, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19433594, + "step": 4787, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.06516974, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.0630367, + "balance_loss_mlp": 0.01260191, + "epoch": 0.2878701337742372, + "flos": 26549643340800.0, + "grad_norm": 1.713934654291453, + "language_loss": 0.84188497, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.91985947, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.20288086, + "step": 4788, + "time_per_iteration": 2.610520362854004 + }, + { + "auxiliary_loss_clip": 0.06528202, + "auxiliary_loss_mlp": 0.01278372, + "balance_loss_clip": 0.06311956, + "balance_loss_mlp": 0.01257594, + "epoch": 0.28793025702690517, + "flos": 28154294156160.0, + "grad_norm": 1.8819133496848792, + "language_loss": 0.73887986, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.81694555, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2076416, + "step": 4789, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.06504419, + "auxiliary_loss_mlp": 0.0127649, + "balance_loss_clip": 0.06300576, + "balance_loss_mlp": 0.01259014, + "epoch": 0.28799038027957313, + "flos": 23812254305280.0, + "grad_norm": 1.6484379512289788, + "language_loss": 0.84411776, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92192692, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17492676, + "step": 4790, + "time_per_iteration": 2.5587222576141357 + }, + { + "auxiliary_loss_clip": 0.06518544, + "auxiliary_loss_mlp": 0.01278217, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01259728, + "epoch": 0.2880505035322411, + "flos": 22350340120320.0, + "grad_norm": 1.9872780385985664, + "language_loss": 0.78222489, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.86019248, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18481445, + "step": 4791, + "time_per_iteration": 2.624457359313965 + }, + { + "auxiliary_loss_clip": 0.06518695, + "auxiliary_loss_mlp": 0.01277015, + "balance_loss_clip": 0.06302316, + "balance_loss_mlp": 0.01257286, + "epoch": 0.28811062678490906, + "flos": 18010061205120.0, + "grad_norm": 3.7561845310327002, + "language_loss": 0.71278274, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.79073977, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19726562, + "step": 4792, + "time_per_iteration": 2.5208675861358643 + }, + { + "auxiliary_loss_clip": 0.06512474, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06301394, + "balance_loss_mlp": 0.01258391, + "epoch": 0.28817075003757703, + "flos": 22097416970880.0, + "grad_norm": 1.8001054572072859, + "language_loss": 0.80413318, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.88202471, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18286133, + "step": 4793, + "time_per_iteration": 4.170284271240234 + }, + { + "auxiliary_loss_clip": 0.06499149, + "auxiliary_loss_mlp": 0.01283104, + "balance_loss_clip": 0.06297339, + "balance_loss_mlp": 0.01264484, + "epoch": 0.288230873290245, + "flos": 41692842552960.0, + "grad_norm": 1.6709200510021447, + "language_loss": 0.78107667, + "learning_rate": 3.340324496161797e-06, + "loss": 0.85889918, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.18603516, + "step": 4794, + "time_per_iteration": 2.8557510375976562 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 0.06298079, + "balance_loss_mlp": 0.01260882, + "epoch": 0.28829099654291296, + "flos": 18630328654080.0, + "grad_norm": 2.1208293695579608, + "language_loss": 0.83245766, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91035557, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18652344, + "step": 4795, + "time_per_iteration": 2.535163164138794 + }, + { + "auxiliary_loss_clip": 0.06498718, + "auxiliary_loss_mlp": 0.0128311, + "balance_loss_clip": 0.06297053, + "balance_loss_mlp": 0.01266099, + "epoch": 0.2883511197955809, + "flos": 24680707148160.0, + "grad_norm": 2.078774389913416, + "language_loss": 0.75219119, + "learning_rate": 3.339746266208074e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17004395, + "step": 4796, + "time_per_iteration": 2.567488670349121 + }, + { + "auxiliary_loss_clip": 0.06509424, + "auxiliary_loss_mlp": 0.01276979, + "balance_loss_clip": 0.06296358, + "balance_loss_mlp": 0.01257798, + "epoch": 0.2884112430482489, + "flos": 23118794714880.0, + "grad_norm": 2.1968759883463513, + "language_loss": 0.73290622, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.81077027, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.19189453, + "step": 4797, + "time_per_iteration": 3.975389242172241 + }, + { + "auxiliary_loss_clip": 0.06507025, + "auxiliary_loss_mlp": 0.01273799, + "balance_loss_clip": 0.0629791, + "balance_loss_mlp": 0.0125556, + "epoch": 0.28847136630091685, + "flos": 16879000066560.0, + "grad_norm": 2.2937655739300373, + "language_loss": 0.74862409, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.82643229, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.18212891, + "step": 4798, + "time_per_iteration": 3.9849729537963867 + }, + { + "auxiliary_loss_clip": 0.06517179, + "auxiliary_loss_mlp": 0.01285883, + "balance_loss_clip": 0.06306559, + "balance_loss_mlp": 0.01266381, + "epoch": 0.2885314895535849, + "flos": 25663161870720.0, + "grad_norm": 2.626807334731923, + "language_loss": 0.65891635, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.736947, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19494629, + "step": 4799, + "time_per_iteration": 2.6063008308410645 + }, + { + "auxiliary_loss_clip": 0.06513311, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06300591, + "balance_loss_mlp": 0.01260013, + "epoch": 0.28859161280625284, + "flos": 21113872145280.0, + "grad_norm": 1.5942901452973643, + "language_loss": 0.82659006, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.9045099, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18664551, + "step": 4800, + "time_per_iteration": 2.5522704124450684 + }, + { + "auxiliary_loss_clip": 0.06498213, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 0.06294428, + "balance_loss_mlp": 0.01260609, + "epoch": 0.2886517360589208, + "flos": 26476870469760.0, + "grad_norm": 1.7957021177556654, + "language_loss": 0.91005886, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98781872, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17175293, + "step": 4801, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.06509861, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01260722, + "epoch": 0.28871185931158877, + "flos": 25272365627520.0, + "grad_norm": 1.8432796050129874, + "language_loss": 0.74294543, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82083023, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17895508, + "step": 4802, + "time_per_iteration": 2.5519795417785645 + }, + { + "auxiliary_loss_clip": 0.0639186, + "auxiliary_loss_mlp": 0.01290861, + "balance_loss_clip": 0.06293292, + "balance_loss_mlp": 0.01286456, + "epoch": 0.28877198256425674, + "flos": 66683676061440.0, + "grad_norm": 0.7742675136744124, + "language_loss": 0.62925327, + "learning_rate": 3.337720861641558e-06, + "loss": 0.70608056, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04412842, + "step": 4803, + "time_per_iteration": 4.557742595672607 + }, + { + "auxiliary_loss_clip": 0.06504417, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 0.06297504, + "balance_loss_mlp": 0.01256721, + "epoch": 0.2888321058169247, + "flos": 20309261713920.0, + "grad_norm": 2.312081796144873, + "language_loss": 0.71418971, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.79197359, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17248535, + "step": 4804, + "time_per_iteration": 2.5679221153259277 + }, + { + "auxiliary_loss_clip": 0.06511839, + "auxiliary_loss_mlp": 0.01276786, + "balance_loss_clip": 0.06299883, + "balance_loss_mlp": 0.01258892, + "epoch": 0.28889222906959267, + "flos": 25523192424960.0, + "grad_norm": 2.035708939634364, + "language_loss": 0.68254268, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76042891, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17907715, + "step": 4805, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06510667, + "auxiliary_loss_mlp": 0.01276264, + "balance_loss_clip": 0.06300112, + "balance_loss_mlp": 0.01258955, + "epoch": 0.28895235232226063, + "flos": 32679544959360.0, + "grad_norm": 1.67836402891337, + "language_loss": 0.69622278, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.77409214, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1730957, + "step": 4806, + "time_per_iteration": 2.6661036014556885 + }, + { + "auxiliary_loss_clip": 0.06499489, + "auxiliary_loss_mlp": 0.01273073, + "balance_loss_clip": 0.06297253, + "balance_loss_mlp": 0.01256133, + "epoch": 0.2890124755749286, + "flos": 29722202156160.0, + "grad_norm": 1.5048672267596763, + "language_loss": 0.71718901, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7949146, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16931152, + "step": 4807, + "time_per_iteration": 2.6082210540771484 + }, + { + "auxiliary_loss_clip": 0.06506096, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06298453, + "balance_loss_mlp": 0.01255769, + "epoch": 0.28907259882759656, + "flos": 22681067385600.0, + "grad_norm": 1.6103433555287536, + "language_loss": 0.8189373, + "learning_rate": 3.336272622079382e-06, + "loss": 0.89672995, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17407227, + "step": 4808, + "time_per_iteration": 2.575005292892456 + }, + { + "auxiliary_loss_clip": 0.0649471, + "auxiliary_loss_mlp": 0.01279377, + "balance_loss_clip": 0.06293811, + "balance_loss_mlp": 0.01261543, + "epoch": 0.2891327220802645, + "flos": 22572809510400.0, + "grad_norm": 1.6658984409983257, + "language_loss": 0.79128641, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86902726, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17834473, + "step": 4809, + "time_per_iteration": 2.563202142715454 + }, + { + "auxiliary_loss_clip": 0.06509645, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.06294866, + "balance_loss_mlp": 0.01256411, + "epoch": 0.2891928453329325, + "flos": 21659228444160.0, + "grad_norm": 1.9154470794900575, + "language_loss": 0.79370517, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8715474, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18151855, + "step": 4810, + "time_per_iteration": 2.555290460586548 + }, + { + "auxiliary_loss_clip": 0.06499892, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06295595, + "balance_loss_mlp": 0.01259259, + "epoch": 0.28925296858560046, + "flos": 23228855452800.0, + "grad_norm": 1.5886971021791327, + "language_loss": 0.77595514, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.85371131, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.16467285, + "step": 4811, + "time_per_iteration": 2.5522642135620117 + }, + { + "auxiliary_loss_clip": 0.06509165, + "auxiliary_loss_mlp": 0.01277164, + "balance_loss_clip": 0.06302579, + "balance_loss_mlp": 0.01259497, + "epoch": 0.2893130918382685, + "flos": 28629267425280.0, + "grad_norm": 1.704164513062304, + "language_loss": 0.78002596, + "learning_rate": 3.335113118275117e-06, + "loss": 0.85788929, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17675781, + "step": 4812, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.06384769, + "auxiliary_loss_mlp": 0.01270413, + "balance_loss_clip": 0.06288065, + "balance_loss_mlp": 0.01266965, + "epoch": 0.28937321509093644, + "flos": 72323328240000.0, + "grad_norm": 0.7614773045430072, + "language_loss": 0.60086656, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.67741829, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03457642, + "step": 4813, + "time_per_iteration": 3.3377795219421387 + }, + { + "auxiliary_loss_clip": 0.06503347, + "auxiliary_loss_mlp": 0.01279669, + "balance_loss_clip": 0.0629978, + "balance_loss_mlp": 0.01262253, + "epoch": 0.2894333383436044, + "flos": 16221905948160.0, + "grad_norm": 2.095142654160917, + "language_loss": 0.83059847, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.90842861, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.17407227, + "step": 4814, + "time_per_iteration": 2.519822120666504 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.0128276, + "balance_loss_clip": 0.06297985, + "balance_loss_mlp": 0.01264389, + "epoch": 0.2894934615962724, + "flos": 24835434912000.0, + "grad_norm": 1.4921373382431753, + "language_loss": 0.72583377, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.80376399, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18371582, + "step": 4815, + "time_per_iteration": 2.613424301147461 + }, + { + "auxiliary_loss_clip": 0.06496876, + "auxiliary_loss_mlp": 0.01270189, + "balance_loss_clip": 0.06299625, + "balance_loss_mlp": 0.01253858, + "epoch": 0.28955358484894034, + "flos": 20456400683520.0, + "grad_norm": 1.478095248571898, + "language_loss": 0.71455014, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79222083, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16345215, + "step": 4816, + "time_per_iteration": 2.523789644241333 + }, + { + "auxiliary_loss_clip": 0.0651416, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06301913, + "balance_loss_mlp": 0.01256007, + "epoch": 0.2896137081016083, + "flos": 22571803261440.0, + "grad_norm": 2.1886400582799643, + "language_loss": 0.75928313, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.83716327, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.17834473, + "step": 4817, + "time_per_iteration": 2.5829625129699707 + }, + { + "auxiliary_loss_clip": 0.0650699, + "auxiliary_loss_mlp": 0.0127444, + "balance_loss_clip": 0.06299114, + "balance_loss_mlp": 0.01255486, + "epoch": 0.28967383135427627, + "flos": 26695231009920.0, + "grad_norm": 2.009148210409016, + "language_loss": 0.77384543, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85165972, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18933105, + "step": 4818, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.06511898, + "auxiliary_loss_mlp": 0.01274642, + "balance_loss_clip": 0.063049, + "balance_loss_mlp": 0.01257833, + "epoch": 0.28973395460694423, + "flos": 15563428237440.0, + "grad_norm": 1.8180363278883531, + "language_loss": 0.80166686, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.87953222, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16833496, + "step": 4819, + "time_per_iteration": 2.58598256111145 + }, + { + "auxiliary_loss_clip": 0.06512412, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 0.06301294, + "balance_loss_mlp": 0.01256543, + "epoch": 0.2897940778596122, + "flos": 18703395014400.0, + "grad_norm": 1.8889731698350438, + "language_loss": 0.79784238, + "learning_rate": 3.332791681244776e-06, + "loss": 0.87571859, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18664551, + "step": 4820, + "time_per_iteration": 2.514738082885742 + }, + { + "auxiliary_loss_clip": 0.06519003, + "auxiliary_loss_mlp": 0.01272112, + "balance_loss_clip": 0.06309246, + "balance_loss_mlp": 0.01254612, + "epoch": 0.28985420111228016, + "flos": 18776209812480.0, + "grad_norm": 1.948801074603747, + "language_loss": 0.73537958, + "learning_rate": 3.332501274072231e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17492676, + "step": 4821, + "time_per_iteration": 2.6552352905273438 + }, + { + "auxiliary_loss_clip": 0.06509826, + "auxiliary_loss_mlp": 0.01279091, + "balance_loss_clip": 0.06303322, + "balance_loss_mlp": 0.01260733, + "epoch": 0.28991432436494813, + "flos": 23075511281280.0, + "grad_norm": 1.9415887628712303, + "language_loss": 0.7256397, + "learning_rate": 3.332210816371104e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18347168, + "step": 4822, + "time_per_iteration": 2.5311806201934814 + }, + { + "auxiliary_loss_clip": 0.06508678, + "auxiliary_loss_mlp": 0.0127532, + "balance_loss_clip": 0.06304502, + "balance_loss_mlp": 0.01258237, + "epoch": 0.2899744476176161, + "flos": 17608992837120.0, + "grad_norm": 1.6868082855094653, + "language_loss": 0.66498971, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17077637, + "step": 4823, + "time_per_iteration": 2.5582497119903564 + }, + { + "auxiliary_loss_clip": 0.06507877, + "auxiliary_loss_mlp": 0.0127093, + "balance_loss_clip": 0.06303018, + "balance_loss_mlp": 0.01253728, + "epoch": 0.29003457087028406, + "flos": 22315861365120.0, + "grad_norm": 2.007628710478466, + "language_loss": 0.81589168, + "learning_rate": 3.331629749427164e-06, + "loss": 0.89367974, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.171875, + "step": 4824, + "time_per_iteration": 2.5258595943450928 + }, + { + "auxiliary_loss_clip": 0.06510833, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301483, + "balance_loss_mlp": 0.01258376, + "epoch": 0.2900946941229521, + "flos": 21951493885440.0, + "grad_norm": 1.837693758429887, + "language_loss": 0.73192668, + "learning_rate": 3.331339140206385e-06, + "loss": 0.80979806, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.17932129, + "step": 4825, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.0651435, + "auxiliary_loss_mlp": 0.01275324, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01257049, + "epoch": 0.29015481737562004, + "flos": 17938126874880.0, + "grad_norm": 2.202818652908599, + "language_loss": 0.7426061, + "learning_rate": 3.331048480501092e-06, + "loss": 0.82050288, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18273926, + "step": 4826, + "time_per_iteration": 2.497711420059204 + }, + { + "auxiliary_loss_clip": 0.06516986, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01262141, + "epoch": 0.290214940628288, + "flos": 22790079947520.0, + "grad_norm": 1.934932602801083, + "language_loss": 0.69077051, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.76872945, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.16748047, + "step": 4827, + "time_per_iteration": 2.5729641914367676 + }, + { + "auxiliary_loss_clip": 0.06517433, + "auxiliary_loss_mlp": 0.0127379, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01255646, + "epoch": 0.290275063880956, + "flos": 20011881173760.0, + "grad_norm": 1.8047855406998587, + "language_loss": 0.80766201, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88557422, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.18151855, + "step": 4828, + "time_per_iteration": 2.5190348625183105 + }, + { + "auxiliary_loss_clip": 0.0651058, + "auxiliary_loss_mlp": 0.01278642, + "balance_loss_clip": 0.06308287, + "balance_loss_mlp": 0.01260809, + "epoch": 0.29033518713362394, + "flos": 22060003322880.0, + "grad_norm": 1.646725141321262, + "language_loss": 0.80908686, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8869791, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17822266, + "step": 4829, + "time_per_iteration": 2.564837694168091 + }, + { + "auxiliary_loss_clip": 0.06503877, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.0126059, + "epoch": 0.2903953103862919, + "flos": 25637194304640.0, + "grad_norm": 1.4271698228137566, + "language_loss": 0.82616186, + "learning_rate": 3.329885337055249e-06, + "loss": 0.90397674, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 4830, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.0652103, + "auxiliary_loss_mlp": 0.01280335, + "balance_loss_clip": 0.06313583, + "balance_loss_mlp": 0.01262036, + "epoch": 0.29045543363895987, + "flos": 16951437521280.0, + "grad_norm": 2.247105417787089, + "language_loss": 0.79901475, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.87702841, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18310547, + "step": 4831, + "time_per_iteration": 2.5306637287139893 + }, + { + "auxiliary_loss_clip": 0.06507042, + "auxiliary_loss_mlp": 0.01277723, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01261392, + "epoch": 0.29051555689162784, + "flos": 26402630152320.0, + "grad_norm": 2.3059080747570775, + "language_loss": 0.75331926, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.83116686, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16333008, + "step": 4832, + "time_per_iteration": 2.5603439807891846 + }, + { + "auxiliary_loss_clip": 0.06503655, + "auxiliary_loss_mlp": 0.01283448, + "balance_loss_clip": 0.06302731, + "balance_loss_mlp": 0.01267271, + "epoch": 0.2905756801442958, + "flos": 21109931003520.0, + "grad_norm": 1.626645949157208, + "language_loss": 0.76312864, + "learning_rate": 3.329012449923736e-06, + "loss": 0.8409996, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16162109, + "step": 4833, + "time_per_iteration": 4.029958963394165 + }, + { + "auxiliary_loss_clip": 0.06504881, + "auxiliary_loss_mlp": 0.01280243, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01263363, + "epoch": 0.29063580339696377, + "flos": 15711573456000.0, + "grad_norm": 1.645904053352059, + "language_loss": 0.65383506, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.73168635, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.16882324, + "step": 4834, + "time_per_iteration": 2.5233187675476074 + }, + { + "auxiliary_loss_clip": 0.06499655, + "auxiliary_loss_mlp": 0.01274915, + "balance_loss_clip": 0.06299647, + "balance_loss_mlp": 0.01258893, + "epoch": 0.29069592664963173, + "flos": 24651972397440.0, + "grad_norm": 1.808411103531711, + "language_loss": 0.71914709, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.79689276, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16027832, + "step": 4835, + "time_per_iteration": 2.555670738220215 + }, + { + "auxiliary_loss_clip": 0.06500543, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06299368, + "balance_loss_mlp": 0.01259536, + "epoch": 0.2907560499022997, + "flos": 24980854872960.0, + "grad_norm": 1.750724607078226, + "language_loss": 0.80319953, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88096082, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16052246, + "step": 4836, + "time_per_iteration": 3.9953579902648926 + }, + { + "auxiliary_loss_clip": 0.0650623, + "auxiliary_loss_mlp": 0.01276306, + "balance_loss_clip": 0.06305872, + "balance_loss_mlp": 0.01260236, + "epoch": 0.29081617315496766, + "flos": 18662836838400.0, + "grad_norm": 1.8282626295265978, + "language_loss": 0.81337535, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.89120078, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16064453, + "step": 4837, + "time_per_iteration": 3.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.06508449, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.06305645, + "balance_loss_mlp": 0.01257362, + "epoch": 0.2908762964076356, + "flos": 35339087952000.0, + "grad_norm": 1.819350457328488, + "language_loss": 0.67809796, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75593495, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17895508, + "step": 4838, + "time_per_iteration": 2.6575772762298584 + }, + { + "auxiliary_loss_clip": 0.06511781, + "auxiliary_loss_mlp": 0.01274117, + "balance_loss_clip": 0.06305051, + "balance_loss_mlp": 0.01256688, + "epoch": 0.29093641966030365, + "flos": 23083058148480.0, + "grad_norm": 2.3112745331966185, + "language_loss": 0.71775508, + "learning_rate": 3.327265315259095e-06, + "loss": 0.79561406, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17419434, + "step": 4839, + "time_per_iteration": 2.6057844161987305 + }, + { + "auxiliary_loss_clip": 0.06504601, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258071, + "epoch": 0.2909965429129716, + "flos": 35964260864640.0, + "grad_norm": 1.8988017352340443, + "language_loss": 0.75792682, + "learning_rate": 3.326973949928776e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16503906, + "step": 4840, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.06503059, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06299757, + "balance_loss_mlp": 0.01255417, + "epoch": 0.2910566661656396, + "flos": 30887616268800.0, + "grad_norm": 1.8129671702232821, + "language_loss": 0.60949063, + "learning_rate": 3.326682534279471e-06, + "loss": 0.68724, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16479492, + "step": 4841, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.06506652, + "auxiliary_loss_mlp": 0.01272342, + "balance_loss_clip": 0.06303366, + "balance_loss_mlp": 0.01255021, + "epoch": 0.29111678941830754, + "flos": 30018366812160.0, + "grad_norm": 1.3487344136639734, + "language_loss": 0.71762401, + "learning_rate": 3.326391068322232e-06, + "loss": 0.79541385, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17333984, + "step": 4842, + "time_per_iteration": 4.036385774612427 + }, + { + "auxiliary_loss_clip": 0.06507391, + "auxiliary_loss_mlp": 0.01271836, + "balance_loss_clip": 0.06304808, + "balance_loss_mlp": 0.01256423, + "epoch": 0.2911769126709755, + "flos": 22864110629760.0, + "grad_norm": 1.4808705717301018, + "language_loss": 0.74052906, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.81832135, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.1541748, + "step": 4843, + "time_per_iteration": 2.565093755722046 + }, + { + "auxiliary_loss_clip": 0.06510359, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.06305443, + "balance_loss_mlp": 0.01256742, + "epoch": 0.2912370359236435, + "flos": 21656545113600.0, + "grad_norm": 3.6041214714298806, + "language_loss": 0.5879783, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.66580796, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.15856934, + "step": 4844, + "time_per_iteration": 2.636667490005493 + }, + { + "auxiliary_loss_clip": 0.06518383, + "auxiliary_loss_mlp": 0.01278792, + "balance_loss_clip": 0.06309091, + "balance_loss_mlp": 0.01261566, + "epoch": 0.29129715917631144, + "flos": 22899972977280.0, + "grad_norm": 1.9195914149996331, + "language_loss": 0.86846137, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.94643313, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.17224121, + "step": 4845, + "time_per_iteration": 2.549297571182251 + }, + { + "auxiliary_loss_clip": 0.06508736, + "auxiliary_loss_mlp": 0.01273322, + "balance_loss_clip": 0.06304652, + "balance_loss_mlp": 0.01256144, + "epoch": 0.2913572824289794, + "flos": 22681067385600.0, + "grad_norm": 1.8711717874469986, + "language_loss": 0.67698014, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75480074, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17175293, + "step": 4846, + "time_per_iteration": 2.607025146484375 + }, + { + "auxiliary_loss_clip": 0.06502484, + "auxiliary_loss_mlp": 0.01275425, + "balance_loss_clip": 0.06301165, + "balance_loss_mlp": 0.01258771, + "epoch": 0.29141740568164737, + "flos": 23113260345600.0, + "grad_norm": 4.990917175371688, + "language_loss": 0.708718, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.78649712, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16674805, + "step": 4847, + "time_per_iteration": 2.5293991565704346 + }, + { + "auxiliary_loss_clip": 0.06504785, + "auxiliary_loss_mlp": 0.01278673, + "balance_loss_clip": 0.06301495, + "balance_loss_mlp": 0.01261877, + "epoch": 0.29147752893431533, + "flos": 23593851838080.0, + "grad_norm": 1.4565796817402286, + "language_loss": 0.74258435, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82041889, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16796875, + "step": 4848, + "time_per_iteration": 2.585296630859375 + }, + { + "auxiliary_loss_clip": 0.06502895, + "auxiliary_loss_mlp": 0.01276049, + "balance_loss_clip": 0.06298006, + "balance_loss_mlp": 0.01259729, + "epoch": 0.2915376521869833, + "flos": 20597753721600.0, + "grad_norm": 2.1223800155182624, + "language_loss": 0.77561575, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.85340518, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.16333008, + "step": 4849, + "time_per_iteration": 2.4936819076538086 + }, + { + "auxiliary_loss_clip": 0.06514408, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01257723, + "epoch": 0.29159777543965126, + "flos": 20817414000000.0, + "grad_norm": 1.652469266745217, + "language_loss": 0.79415965, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87204546, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16442871, + "step": 4850, + "time_per_iteration": 2.55340313911438 + }, + { + "auxiliary_loss_clip": 0.06494947, + "auxiliary_loss_mlp": 0.0127524, + "balance_loss_clip": 0.06295137, + "balance_loss_mlp": 0.01258479, + "epoch": 0.29165789869231923, + "flos": 24251155591680.0, + "grad_norm": 1.7747423674847125, + "language_loss": 0.76365012, + "learning_rate": 3.323765612674296e-06, + "loss": 0.84135199, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16748047, + "step": 4851, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.06499958, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06300404, + "balance_loss_mlp": 0.01256929, + "epoch": 0.29171802194498725, + "flos": 28957562922240.0, + "grad_norm": 1.3481127708223366, + "language_loss": 0.7781775, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.85590267, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.15612793, + "step": 4852, + "time_per_iteration": 2.6266329288482666 + }, + { + "auxiliary_loss_clip": 0.06501517, + "auxiliary_loss_mlp": 0.0127959, + "balance_loss_clip": 0.06297216, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2917781451976552, + "flos": 22604269518720.0, + "grad_norm": 1.5006442804531215, + "language_loss": 0.78676021, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.86457133, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17883301, + "step": 4853, + "time_per_iteration": 2.5417568683624268 + }, + { + "auxiliary_loss_clip": 0.06501997, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06296347, + "balance_loss_mlp": 0.01253818, + "epoch": 0.2918382684503232, + "flos": 21579956881920.0, + "grad_norm": 4.190137743849971, + "language_loss": 0.88580358, + "learning_rate": 3.322889556841445e-06, + "loss": 0.96353114, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.16943359, + "step": 4854, + "time_per_iteration": 2.537247896194458 + }, + { + "auxiliary_loss_clip": 0.06492339, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01255517, + "epoch": 0.29189839170299114, + "flos": 24360503569920.0, + "grad_norm": 1.79615422427109, + "language_loss": 0.86863208, + "learning_rate": 3.322597437887519e-06, + "loss": 0.94629866, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18798828, + "step": 4855, + "time_per_iteration": 2.5408217906951904 + }, + { + "auxiliary_loss_clip": 0.06394155, + "auxiliary_loss_mlp": 0.01254999, + "balance_loss_clip": 0.0629582, + "balance_loss_mlp": 0.01250765, + "epoch": 0.2919585149556591, + "flos": 71338693311360.0, + "grad_norm": 0.8469602753394808, + "language_loss": 0.60232264, + "learning_rate": 3.322305268780566e-06, + "loss": 0.67881417, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.04238892, + "step": 4856, + "time_per_iteration": 3.245720863342285 + }, + { + "auxiliary_loss_clip": 0.06496054, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 0.06293447, + "balance_loss_mlp": 0.01254966, + "epoch": 0.2920186382083271, + "flos": 15638716730880.0, + "grad_norm": 1.9340338412348166, + "language_loss": 0.69134986, + "learning_rate": 3.322013049531664e-06, + "loss": 0.76902497, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.16479492, + "step": 4857, + "time_per_iteration": 2.492515802383423 + }, + { + "auxiliary_loss_clip": 0.0649875, + "auxiliary_loss_mlp": 0.01275648, + "balance_loss_clip": 0.06298544, + "balance_loss_mlp": 0.01258863, + "epoch": 0.29207876146099504, + "flos": 28373535164160.0, + "grad_norm": 2.0544380804392346, + "language_loss": 0.84425288, + "learning_rate": 3.321720780151895e-06, + "loss": 0.92199689, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16772461, + "step": 4858, + "time_per_iteration": 2.596036434173584 + }, + { + "auxiliary_loss_clip": 0.06500848, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06300872, + "balance_loss_mlp": 0.01257879, + "epoch": 0.292138884713663, + "flos": 21877295495040.0, + "grad_norm": 1.6880642207641439, + "language_loss": 0.781169, + "learning_rate": 3.321428460652342e-06, + "loss": 0.85892725, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17102051, + "step": 4859, + "time_per_iteration": 2.5885818004608154 + }, + { + "auxiliary_loss_clip": 0.06508546, + "auxiliary_loss_mlp": 0.01274065, + "balance_loss_clip": 0.06301034, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29219900796633097, + "flos": 20998277038080.0, + "grad_norm": 2.276956308498861, + "language_loss": 0.68823123, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.76605731, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17224121, + "step": 4860, + "time_per_iteration": 2.6006133556365967 + }, + { + "auxiliary_loss_clip": 0.06497137, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01256743, + "epoch": 0.29225913121899894, + "flos": 35012930734080.0, + "grad_norm": 1.9621079535677741, + "language_loss": 0.75927335, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83697826, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16625977, + "step": 4861, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06498605, + "auxiliary_loss_mlp": 0.01278705, + "balance_loss_clip": 0.06298269, + "balance_loss_mlp": 0.0126229, + "epoch": 0.2923192544716669, + "flos": 13520588895360.0, + "grad_norm": 2.4944662876521027, + "language_loss": 0.91953582, + "learning_rate": 3.320551201545832e-06, + "loss": 0.99730897, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16418457, + "step": 4862, + "time_per_iteration": 2.523393392562866 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01258325, + "epoch": 0.29237937772433487, + "flos": 19469543621760.0, + "grad_norm": 2.367835349845546, + "language_loss": 0.74302417, + "learning_rate": 3.320258681678008e-06, + "loss": 0.82076436, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16809082, + "step": 4863, + "time_per_iteration": 2.5615665912628174 + }, + { + "auxiliary_loss_clip": 0.06495367, + "auxiliary_loss_mlp": 0.01274458, + "balance_loss_clip": 0.06298485, + "balance_loss_mlp": 0.01257041, + "epoch": 0.29243950097700283, + "flos": 20856965927040.0, + "grad_norm": 1.6096808438714836, + "language_loss": 0.78180861, + "learning_rate": 3.319966111745842e-06, + "loss": 0.85950685, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.17419434, + "step": 4864, + "time_per_iteration": 2.543239116668701 + }, + { + "auxiliary_loss_clip": 0.06506015, + "auxiliary_loss_mlp": 0.01278091, + "balance_loss_clip": 0.06299396, + "balance_loss_mlp": 0.01260127, + "epoch": 0.29249962422967085, + "flos": 23590581528960.0, + "grad_norm": 1.7200803595236853, + "language_loss": 0.82166076, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8995018, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1796875, + "step": 4865, + "time_per_iteration": 2.6162562370300293 + }, + { + "auxiliary_loss_clip": 0.06504746, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06300808, + "balance_loss_mlp": 0.01258783, + "epoch": 0.2925597474823388, + "flos": 22279915163520.0, + "grad_norm": 1.8207973709117147, + "language_loss": 0.85861242, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.93643779, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18994141, + "step": 4866, + "time_per_iteration": 2.5991125106811523 + }, + { + "auxiliary_loss_clip": 0.06498669, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06298468, + "balance_loss_mlp": 0.0126005, + "epoch": 0.2926198707350068, + "flos": 34464136417920.0, + "grad_norm": 1.677629799943763, + "language_loss": 0.76065934, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17578125, + "step": 4867, + "time_per_iteration": 2.652083396911621 + }, + { + "auxiliary_loss_clip": 0.06508122, + "auxiliary_loss_mlp": 0.01277995, + "balance_loss_clip": 0.06302974, + "balance_loss_mlp": 0.01260483, + "epoch": 0.29267999398767475, + "flos": 20710413936000.0, + "grad_norm": 2.5581846543962197, + "language_loss": 0.73412025, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81198144, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.1751709, + "step": 4868, + "time_per_iteration": 2.5104074478149414 + }, + { + "auxiliary_loss_clip": 0.06504919, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 0.06304781, + "balance_loss_mlp": 0.01256558, + "epoch": 0.2927401172403427, + "flos": 18374470611840.0, + "grad_norm": 1.376823387605754, + "language_loss": 0.74768585, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.17260742, + "step": 4869, + "time_per_iteration": 2.517545461654663 + }, + { + "auxiliary_loss_clip": 0.06509744, + "auxiliary_loss_mlp": 0.01275578, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01258627, + "epoch": 0.2928002404930107, + "flos": 26111203251840.0, + "grad_norm": 1.453461002371515, + "language_loss": 0.76538026, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84323347, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.16931152, + "step": 4870, + "time_per_iteration": 2.571554183959961 + }, + { + "auxiliary_loss_clip": 0.06512202, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06304315, + "balance_loss_mlp": 0.01255967, + "epoch": 0.29286036374567864, + "flos": 21331142582400.0, + "grad_norm": 3.1299518178223726, + "language_loss": 0.67793286, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.75579637, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18188477, + "step": 4871, + "time_per_iteration": 2.5867390632629395 + }, + { + "auxiliary_loss_clip": 0.06504084, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 0.0630291, + "balance_loss_mlp": 0.01256973, + "epoch": 0.2929204869983466, + "flos": 29577117611520.0, + "grad_norm": 1.7840080197301964, + "language_loss": 0.78071094, + "learning_rate": 3.317623751303933e-06, + "loss": 0.85847723, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.15588379, + "step": 4872, + "time_per_iteration": 2.598357915878296 + }, + { + "auxiliary_loss_clip": 0.06511893, + "auxiliary_loss_mlp": 0.01279899, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260313, + "epoch": 0.2929806102510146, + "flos": 19063569790080.0, + "grad_norm": 1.7763964443019538, + "language_loss": 0.72879624, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.19580078, + "step": 4873, + "time_per_iteration": 3.9404540061950684 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01274077, + "balance_loss_clip": 0.06303495, + "balance_loss_mlp": 0.01256386, + "epoch": 0.29304073350368254, + "flos": 21950613417600.0, + "grad_norm": 1.85182595241139, + "language_loss": 0.79023468, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86808634, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17675781, + "step": 4874, + "time_per_iteration": 2.523942470550537 + }, + { + "auxiliary_loss_clip": 0.06517696, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 0.06305568, + "balance_loss_mlp": 0.01255315, + "epoch": 0.2931008567563505, + "flos": 15456302392320.0, + "grad_norm": 2.3441988108556377, + "language_loss": 0.7791701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.85707539, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17504883, + "step": 4875, + "time_per_iteration": 2.4990556240081787 + }, + { + "auxiliary_loss_clip": 0.06506883, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 0.06301031, + "balance_loss_mlp": 0.01263252, + "epoch": 0.29316098000901847, + "flos": 16988893096320.0, + "grad_norm": 1.859745338516673, + "language_loss": 0.70031023, + "learning_rate": 3.316451371581431e-06, + "loss": 0.77818477, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17321777, + "step": 4876, + "time_per_iteration": 5.4681243896484375 + }, + { + "auxiliary_loss_clip": 0.06504045, + "auxiliary_loss_mlp": 0.01275518, + "balance_loss_clip": 0.06302452, + "balance_loss_mlp": 0.01259174, + "epoch": 0.29322110326168643, + "flos": 16362462372480.0, + "grad_norm": 1.8247622937841679, + "language_loss": 0.82480925, + "learning_rate": 3.316158151823096e-06, + "loss": 0.90260488, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16345215, + "step": 4877, + "time_per_iteration": 2.5517635345458984 + }, + { + "auxiliary_loss_clip": 0.06509132, + "auxiliary_loss_mlp": 0.01278665, + "balance_loss_clip": 0.06299806, + "balance_loss_mlp": 0.0126064, + "epoch": 0.29328122651435445, + "flos": 13996023361920.0, + "grad_norm": 2.6416558700601334, + "language_loss": 0.6810987, + "learning_rate": 3.315864882155911e-06, + "loss": 0.75897658, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18017578, + "step": 4878, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.0649902, + "auxiliary_loss_mlp": 0.01275226, + "balance_loss_clip": 0.06298085, + "balance_loss_mlp": 0.01257697, + "epoch": 0.2933413497670224, + "flos": 25271569013760.0, + "grad_norm": 1.8820124674491874, + "language_loss": 0.74030542, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.81804794, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17510986, + "step": 4879, + "time_per_iteration": 2.6044318675994873 + }, + { + "auxiliary_loss_clip": 0.06501681, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.0629803, + "balance_loss_mlp": 0.01259187, + "epoch": 0.2934014730196904, + "flos": 32131840746240.0, + "grad_norm": 2.9151820016542183, + "language_loss": 0.67178017, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.7495712, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18237305, + "step": 4880, + "time_per_iteration": 2.603761672973633 + }, + { + "auxiliary_loss_clip": 0.06503071, + "auxiliary_loss_mlp": 0.01271949, + "balance_loss_clip": 0.0629775, + "balance_loss_mlp": 0.01255367, + "epoch": 0.29346159627235835, + "flos": 24359329612800.0, + "grad_norm": 2.6105900749093633, + "language_loss": 0.71260536, + "learning_rate": 3.314984773812481e-06, + "loss": 0.79035556, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.16577148, + "step": 4881, + "time_per_iteration": 2.593226432800293 + }, + { + "auxiliary_loss_clip": 0.06502824, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298223, + "balance_loss_mlp": 0.01256603, + "epoch": 0.2935217195250263, + "flos": 22753253278080.0, + "grad_norm": 1.6618295774620153, + "language_loss": 0.83893931, + "learning_rate": 3.314691304621127e-06, + "loss": 0.91672039, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18688965, + "step": 4882, + "time_per_iteration": 3.9488399028778076 + }, + { + "auxiliary_loss_clip": 0.06502259, + "auxiliary_loss_mlp": 0.01273532, + "balance_loss_clip": 0.06293593, + "balance_loss_mlp": 0.01255961, + "epoch": 0.2935818427776943, + "flos": 21731959388160.0, + "grad_norm": 4.210124979545191, + "language_loss": 0.72920972, + "learning_rate": 3.314397785576548e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17565918, + "step": 4883, + "time_per_iteration": 2.557283878326416 + }, + { + "auxiliary_loss_clip": 0.06496279, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 0.06292833, + "balance_loss_mlp": 0.01257103, + "epoch": 0.29364196603036224, + "flos": 23811667326720.0, + "grad_norm": 2.0649535872154217, + "language_loss": 0.93051624, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.00822163, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.17150879, + "step": 4884, + "time_per_iteration": 2.5359458923339844 + }, + { + "auxiliary_loss_clip": 0.06506841, + "auxiliary_loss_mlp": 0.01273123, + "balance_loss_clip": 0.06302871, + "balance_loss_mlp": 0.01255409, + "epoch": 0.2937020892830302, + "flos": 23475615327360.0, + "grad_norm": 2.6201562161688017, + "language_loss": 0.73813069, + "learning_rate": 3.313810597972234e-06, + "loss": 0.81593031, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17712402, + "step": 4885, + "time_per_iteration": 2.547731637954712 + }, + { + "auxiliary_loss_clip": 0.06506574, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01253936, + "epoch": 0.2937622125356982, + "flos": 24278422896000.0, + "grad_norm": 2.0067568315745907, + "language_loss": 0.8568837, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93466175, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.1730957, + "step": 4886, + "time_per_iteration": 2.5345749855041504 + }, + { + "auxiliary_loss_clip": 0.06516494, + "auxiliary_loss_mlp": 0.01282352, + "balance_loss_clip": 0.06309356, + "balance_loss_mlp": 0.01266223, + "epoch": 0.29382233578836614, + "flos": 20667843262080.0, + "grad_norm": 2.2972144011917863, + "language_loss": 0.7819618, + "learning_rate": 3.313223211088603e-06, + "loss": 0.85995024, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16137695, + "step": 4887, + "time_per_iteration": 2.5718464851379395 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01281343, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263962, + "epoch": 0.2938824590410341, + "flos": 16550662642560.0, + "grad_norm": 2.5346543108244366, + "language_loss": 0.80135798, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.87925565, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.1739502, + "step": 4888, + "time_per_iteration": 2.5823678970336914 + }, + { + "auxiliary_loss_clip": 0.06512221, + "auxiliary_loss_mlp": 0.01274662, + "balance_loss_clip": 0.06308408, + "balance_loss_mlp": 0.01257878, + "epoch": 0.29394258229370207, + "flos": 37934620824960.0, + "grad_norm": 1.521834171262281, + "language_loss": 0.55984998, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63771886, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16784668, + "step": 4889, + "time_per_iteration": 2.6925320625305176 + }, + { + "auxiliary_loss_clip": 0.06519246, + "auxiliary_loss_mlp": 0.01278013, + "balance_loss_clip": 0.06313413, + "balance_loss_mlp": 0.0126056, + "epoch": 0.29400270554637004, + "flos": 20050384924800.0, + "grad_norm": 1.7589662768394465, + "language_loss": 0.85257453, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93054712, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17443848, + "step": 4890, + "time_per_iteration": 2.546391010284424 + }, + { + "auxiliary_loss_clip": 0.06513973, + "auxiliary_loss_mlp": 0.01284253, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01266288, + "epoch": 0.294062828799038, + "flos": 15271498212480.0, + "grad_norm": 1.9077501912209676, + "language_loss": 0.73679662, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.81477886, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.17956543, + "step": 4891, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.06519526, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_clip": 0.06312989, + "balance_loss_mlp": 0.01267468, + "epoch": 0.294122952051706, + "flos": 22753714475520.0, + "grad_norm": 1.802215562222595, + "language_loss": 0.77636111, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.85441071, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17974854, + "step": 4892, + "time_per_iteration": 2.556626796722412 + }, + { + "auxiliary_loss_clip": 0.06508264, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06305899, + "balance_loss_mlp": 0.01257096, + "epoch": 0.294183075304374, + "flos": 24979848624000.0, + "grad_norm": 1.857019535889917, + "language_loss": 0.78546309, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.86329335, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.17675781, + "step": 4893, + "time_per_iteration": 2.5583088397979736 + }, + { + "auxiliary_loss_clip": 0.06512541, + "auxiliary_loss_mlp": 0.01279131, + "balance_loss_clip": 0.06308632, + "balance_loss_mlp": 0.01262764, + "epoch": 0.29424319855704195, + "flos": 30960347212800.0, + "grad_norm": 7.778949224672863, + "language_loss": 0.85594332, + "learning_rate": 3.311165788957864e-06, + "loss": 0.93386006, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16357422, + "step": 4894, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.06515005, + "auxiliary_loss_mlp": 0.01277674, + "balance_loss_clip": 0.06308285, + "balance_loss_mlp": 0.01260639, + "epoch": 0.2943033218097099, + "flos": 15236977530240.0, + "grad_norm": 2.7328127009682617, + "language_loss": 0.91485763, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99278444, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17028809, + "step": 4895, + "time_per_iteration": 2.499884605407715 + }, + { + "auxiliary_loss_clip": 0.06521617, + "auxiliary_loss_mlp": 0.01275591, + "balance_loss_clip": 0.06309959, + "balance_loss_mlp": 0.01257519, + "epoch": 0.2943634450623779, + "flos": 21732336731520.0, + "grad_norm": 1.9156960384195119, + "language_loss": 0.86768568, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.94565773, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18078613, + "step": 4896, + "time_per_iteration": 2.5482704639434814 + }, + { + "auxiliary_loss_clip": 0.06512056, + "auxiliary_loss_mlp": 0.01275376, + "balance_loss_clip": 0.06306215, + "balance_loss_mlp": 0.01257996, + "epoch": 0.29442356831504585, + "flos": 22608797639040.0, + "grad_norm": 2.0283086901116354, + "language_loss": 0.73915696, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.81703126, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.17382812, + "step": 4897, + "time_per_iteration": 2.5434658527374268 + }, + { + "auxiliary_loss_clip": 0.0652054, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06307404, + "balance_loss_mlp": 0.01262378, + "epoch": 0.2944836915677138, + "flos": 20017625178240.0, + "grad_norm": 1.9321922101744466, + "language_loss": 0.74697995, + "learning_rate": 3.309989025093813e-06, + "loss": 0.82499176, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18261719, + "step": 4898, + "time_per_iteration": 2.5770161151885986 + }, + { + "auxiliary_loss_clip": 0.06516017, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 0.06305353, + "balance_loss_mlp": 0.01259586, + "epoch": 0.2945438148203818, + "flos": 20051768517120.0, + "grad_norm": 2.462097706840479, + "language_loss": 0.71617198, + "learning_rate": 3.309694709912618e-06, + "loss": 0.79411781, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4899, + "time_per_iteration": 2.5297536849975586 + }, + { + "auxiliary_loss_clip": 0.06510775, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06304912, + "balance_loss_mlp": 0.01259727, + "epoch": 0.29460393807304974, + "flos": 23740487683200.0, + "grad_norm": 9.70716698994663, + "language_loss": 0.79828262, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.87617099, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18322754, + "step": 4900, + "time_per_iteration": 2.589350461959839 + }, + { + "auxiliary_loss_clip": 0.06501958, + "auxiliary_loss_mlp": 0.01277561, + "balance_loss_clip": 0.06297968, + "balance_loss_mlp": 0.01260025, + "epoch": 0.2946640613257177, + "flos": 14981412977280.0, + "grad_norm": 1.6788003410312407, + "language_loss": 0.81419849, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89199364, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.1751709, + "step": 4901, + "time_per_iteration": 2.4958457946777344 + }, + { + "auxiliary_loss_clip": 0.06498285, + "auxiliary_loss_mlp": 0.01276891, + "balance_loss_clip": 0.0630265, + "balance_loss_mlp": 0.01261095, + "epoch": 0.2947241845783857, + "flos": 24250862102400.0, + "grad_norm": 2.051988062923015, + "language_loss": 0.58211619, + "learning_rate": 3.308811466431157e-06, + "loss": 0.659868, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.15783691, + "step": 4902, + "time_per_iteration": 2.5867393016815186 + }, + { + "auxiliary_loss_clip": 0.06509895, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 0.06304582, + "balance_loss_mlp": 0.01261825, + "epoch": 0.29478430783105364, + "flos": 19944600744960.0, + "grad_norm": 1.670035021285574, + "language_loss": 0.75883406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.83671534, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.16418457, + "step": 4903, + "time_per_iteration": 2.5120930671691895 + }, + { + "auxiliary_loss_clip": 0.06499215, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01255612, + "epoch": 0.2948444310837216, + "flos": 27388774454400.0, + "grad_norm": 1.8166217426315454, + "language_loss": 0.6305517, + "learning_rate": 3.3082223892736e-06, + "loss": 0.7082777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17773438, + "step": 4904, + "time_per_iteration": 2.610600709915161 + }, + { + "auxiliary_loss_clip": 0.06509106, + "auxiliary_loss_mlp": 0.01272684, + "balance_loss_clip": 0.06301488, + "balance_loss_mlp": 0.01255983, + "epoch": 0.2949045543363896, + "flos": 23412401821440.0, + "grad_norm": 1.721115639485294, + "language_loss": 0.73724848, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.8150664, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.16711426, + "step": 4905, + "time_per_iteration": 2.5330429077148438 + }, + { + "auxiliary_loss_clip": 0.06501255, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06297939, + "balance_loss_mlp": 0.01252508, + "epoch": 0.2949646775890576, + "flos": 23958303171840.0, + "grad_norm": 1.607284793713989, + "language_loss": 0.81930244, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.89701641, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17614746, + "step": 4906, + "time_per_iteration": 2.5717568397521973 + }, + { + "auxiliary_loss_clip": 0.06499709, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06300811, + "balance_loss_mlp": 0.01254051, + "epoch": 0.29502480084172555, + "flos": 22791002342400.0, + "grad_norm": 1.8767623479937394, + "language_loss": 0.88041449, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95811397, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 4907, + "time_per_iteration": 2.532233238220215 + }, + { + "auxiliary_loss_clip": 0.06504819, + "auxiliary_loss_mlp": 0.01277393, + "balance_loss_clip": 0.06294614, + "balance_loss_mlp": 0.01257592, + "epoch": 0.2950849240943935, + "flos": 19652838428160.0, + "grad_norm": 2.2863974346720837, + "language_loss": 0.82530308, + "learning_rate": 3.307043639752782e-06, + "loss": 0.90312517, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.19812012, + "step": 4908, + "time_per_iteration": 2.6338536739349365 + }, + { + "auxiliary_loss_clip": 0.06393203, + "auxiliary_loss_mlp": 0.01256311, + "balance_loss_clip": 0.06296152, + "balance_loss_mlp": 0.01251251, + "epoch": 0.2951450473470615, + "flos": 71021062010880.0, + "grad_norm": 0.749349843123412, + "language_loss": 0.57384133, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.65033644, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.05059814, + "step": 4909, + "time_per_iteration": 3.0084846019744873 + }, + { + "auxiliary_loss_clip": 0.06500423, + "auxiliary_loss_mlp": 0.01279147, + "balance_loss_clip": 0.06298146, + "balance_loss_mlp": 0.0126278, + "epoch": 0.29520517059972945, + "flos": 22972955483520.0, + "grad_norm": 1.5167904233162786, + "language_loss": 0.87274551, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.9505412, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16381836, + "step": 4910, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.06494174, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06294993, + "balance_loss_mlp": 0.01255017, + "epoch": 0.2952652938523974, + "flos": 20491969541760.0, + "grad_norm": 1.9871602841434197, + "language_loss": 0.72998595, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.80764621, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.16821289, + "step": 4911, + "time_per_iteration": 2.5274527072906494 + }, + { + "auxiliary_loss_clip": 0.06493053, + "auxiliary_loss_mlp": 0.01276167, + "balance_loss_clip": 0.06295265, + "balance_loss_mlp": 0.01260122, + "epoch": 0.2953254171050654, + "flos": 19652754574080.0, + "grad_norm": 1.8153147203758204, + "language_loss": 0.90350848, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.98120075, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16040039, + "step": 4912, + "time_per_iteration": 4.015045881271362 + }, + { + "auxiliary_loss_clip": 0.06500725, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06298609, + "balance_loss_mlp": 0.01256474, + "epoch": 0.29538554035773334, + "flos": 22754678797440.0, + "grad_norm": 1.456675217678442, + "language_loss": 0.83491737, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.91266304, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17370605, + "step": 4913, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01271149, + "balance_loss_clip": 0.06297807, + "balance_loss_mlp": 0.01255163, + "epoch": 0.2954456636104013, + "flos": 21878343671040.0, + "grad_norm": 1.7751266266229593, + "language_loss": 0.77296054, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85066384, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.15991211, + "step": 4914, + "time_per_iteration": 2.5379679203033447 + }, + { + "auxiliary_loss_clip": 0.06494316, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01259563, + "epoch": 0.2955057868630693, + "flos": 40452056092800.0, + "grad_norm": 1.8412710776020966, + "language_loss": 0.81848276, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89618844, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16674805, + "step": 4915, + "time_per_iteration": 4.123507261276245 + }, + { + "auxiliary_loss_clip": 0.06504083, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01260707, + "epoch": 0.29556591011573724, + "flos": 22571006647680.0, + "grad_norm": 1.7265680083109098, + "language_loss": 0.85337454, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.93119645, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1739502, + "step": 4916, + "time_per_iteration": 3.964902400970459 + }, + { + "auxiliary_loss_clip": 0.06496175, + "auxiliary_loss_mlp": 0.01273483, + "balance_loss_clip": 0.06292706, + "balance_loss_mlp": 0.01257187, + "epoch": 0.2956260333684052, + "flos": 22095572181120.0, + "grad_norm": 2.6877460244099254, + "language_loss": 0.71410239, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.79179895, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16296387, + "step": 4917, + "time_per_iteration": 2.510061502456665 + }, + { + "auxiliary_loss_clip": 0.06495264, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01255771, + "epoch": 0.2956861566210732, + "flos": 16441063102080.0, + "grad_norm": 1.9904514264943383, + "language_loss": 0.9154985, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.99318182, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.1730957, + "step": 4918, + "time_per_iteration": 2.5177812576293945 + }, + { + "auxiliary_loss_clip": 0.06500694, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.0629639, + "balance_loss_mlp": 0.01252887, + "epoch": 0.2957462798737412, + "flos": 25819189372800.0, + "grad_norm": 2.9632565132584587, + "language_loss": 0.73171133, + "learning_rate": 3.303797991757425e-06, + "loss": 0.80942631, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.17919922, + "step": 4919, + "time_per_iteration": 2.548271656036377 + }, + { + "auxiliary_loss_clip": 0.06494663, + "auxiliary_loss_mlp": 0.01276246, + "balance_loss_clip": 0.062939, + "balance_loss_mlp": 0.01259104, + "epoch": 0.29580640312640916, + "flos": 16696459946880.0, + "grad_norm": 2.067015346809242, + "language_loss": 0.76653767, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84424675, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17138672, + "step": 4920, + "time_per_iteration": 2.5283315181732178 + }, + { + "auxiliary_loss_clip": 0.06505087, + "auxiliary_loss_mlp": 0.01280613, + "balance_loss_clip": 0.06298134, + "balance_loss_mlp": 0.01262886, + "epoch": 0.2958665263790771, + "flos": 23951427137280.0, + "grad_norm": 2.1683803944953786, + "language_loss": 0.69314063, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.77099764, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17724609, + "step": 4921, + "time_per_iteration": 3.9904286861419678 + }, + { + "auxiliary_loss_clip": 0.06507339, + "auxiliary_loss_mlp": 0.01279047, + "balance_loss_clip": 0.06297763, + "balance_loss_mlp": 0.01261023, + "epoch": 0.2959266496317451, + "flos": 18484279787520.0, + "grad_norm": 1.8551497184563221, + "language_loss": 0.75478184, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.83264565, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18017578, + "step": 4922, + "time_per_iteration": 2.5025644302368164 + }, + { + "auxiliary_loss_clip": 0.06508595, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06298192, + "balance_loss_mlp": 0.01258051, + "epoch": 0.29598677288441305, + "flos": 25964525479680.0, + "grad_norm": 1.7877276864194063, + "language_loss": 0.77317607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85103309, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19067383, + "step": 4923, + "time_per_iteration": 2.57328462600708 + }, + { + "auxiliary_loss_clip": 0.06498858, + "auxiliary_loss_mlp": 0.01279587, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01262016, + "epoch": 0.296046896137081, + "flos": 25163101503360.0, + "grad_norm": 2.2992847921393174, + "language_loss": 0.8687042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94648862, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17565918, + "step": 4924, + "time_per_iteration": 2.569819450378418 + }, + { + "auxiliary_loss_clip": 0.06495638, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 0.06293976, + "balance_loss_mlp": 0.01256891, + "epoch": 0.296107019389749, + "flos": 21767402465280.0, + "grad_norm": 1.4490170840920502, + "language_loss": 0.823627, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.90132689, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17468262, + "step": 4925, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.06496158, + "auxiliary_loss_mlp": 0.01278426, + "balance_loss_clip": 0.06294197, + "balance_loss_mlp": 0.01261415, + "epoch": 0.29616714264241695, + "flos": 17964555638400.0, + "grad_norm": 3.115838377994743, + "language_loss": 0.87332439, + "learning_rate": 3.301729463727452e-06, + "loss": 0.95107025, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17016602, + "step": 4926, + "time_per_iteration": 2.480851411819458 + }, + { + "auxiliary_loss_clip": 0.06502646, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06295682, + "balance_loss_mlp": 0.0125995, + "epoch": 0.2962272658950849, + "flos": 15018155792640.0, + "grad_norm": 2.5897634799766296, + "language_loss": 0.86097062, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.93876898, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17236328, + "step": 4927, + "time_per_iteration": 2.524277687072754 + }, + { + "auxiliary_loss_clip": 0.06496821, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06295302, + "balance_loss_mlp": 0.01256545, + "epoch": 0.2962873891477529, + "flos": 14726183840640.0, + "grad_norm": 1.628327768422068, + "language_loss": 0.80864251, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.88634396, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16772461, + "step": 4928, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06510531, + "auxiliary_loss_mlp": 0.01280378, + "balance_loss_clip": 0.0629655, + "balance_loss_mlp": 0.012609, + "epoch": 0.29634751240042084, + "flos": 26730967576320.0, + "grad_norm": 3.186979474193142, + "language_loss": 0.72557974, + "learning_rate": 3.300842211064773e-06, + "loss": 0.80348885, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19482422, + "step": 4929, + "time_per_iteration": 2.5845630168914795 + }, + { + "auxiliary_loss_clip": 0.06503193, + "auxiliary_loss_mlp": 0.01287506, + "balance_loss_clip": 0.06293295, + "balance_loss_mlp": 0.01268456, + "epoch": 0.2964076356530888, + "flos": 14575984197120.0, + "grad_norm": 2.811052251549286, + "language_loss": 0.73200721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.80991417, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19042969, + "step": 4930, + "time_per_iteration": 2.488785982131958 + }, + { + "auxiliary_loss_clip": 0.06387739, + "auxiliary_loss_mlp": 0.01269345, + "balance_loss_clip": 0.06290003, + "balance_loss_mlp": 0.0126519, + "epoch": 0.29646775890575683, + "flos": 63124387925760.0, + "grad_norm": 0.773484435694784, + "language_loss": 0.60626972, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.68284053, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.04156494, + "step": 4931, + "time_per_iteration": 3.1399567127227783 + }, + { + "auxiliary_loss_clip": 0.06390411, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06293079, + "balance_loss_mlp": 0.0126054, + "epoch": 0.2965278821584248, + "flos": 63087728964480.0, + "grad_norm": 0.7260178151779769, + "language_loss": 0.52335358, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.59990156, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.03839111, + "step": 4932, + "time_per_iteration": 3.0242393016815186 + }, + { + "auxiliary_loss_clip": 0.06496995, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06294326, + "balance_loss_mlp": 0.01260368, + "epoch": 0.29658800541109276, + "flos": 23775469562880.0, + "grad_norm": 1.6744964780290639, + "language_loss": 0.82042706, + "learning_rate": 3.299658516973972e-06, + "loss": 0.89817077, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17028809, + "step": 4933, + "time_per_iteration": 2.5955240726470947 + }, + { + "auxiliary_loss_clip": 0.06493178, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06293809, + "balance_loss_mlp": 0.01256377, + "epoch": 0.2966481286637607, + "flos": 23995465257600.0, + "grad_norm": 1.8381459517159284, + "language_loss": 0.75639498, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83405566, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.16503906, + "step": 4934, + "time_per_iteration": 2.5714681148529053 + }, + { + "auxiliary_loss_clip": 0.06508597, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06299804, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2967082519164287, + "flos": 17170846237440.0, + "grad_norm": 1.723450067314057, + "language_loss": 0.63127494, + "learning_rate": 3.299066374184594e-06, + "loss": 0.70916504, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.18713379, + "step": 4935, + "time_per_iteration": 2.513557195663452 + }, + { + "auxiliary_loss_clip": 0.06500618, + "auxiliary_loss_mlp": 0.01281806, + "balance_loss_clip": 0.06298316, + "balance_loss_mlp": 0.01263424, + "epoch": 0.29676837516909665, + "flos": 29395416032640.0, + "grad_norm": 1.6887254989691298, + "language_loss": 0.80239189, + "learning_rate": 3.2987702288932e-06, + "loss": 0.88021612, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.18383789, + "step": 4936, + "time_per_iteration": 2.6222426891326904 + }, + { + "auxiliary_loss_clip": 0.06510909, + "auxiliary_loss_mlp": 0.0128109, + "balance_loss_clip": 0.06301413, + "balance_loss_mlp": 0.01261444, + "epoch": 0.2968284984217646, + "flos": 34759839876480.0, + "grad_norm": 1.4826285887608224, + "language_loss": 0.74831104, + "learning_rate": 3.298474034352309e-06, + "loss": 0.826231, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19665527, + "step": 4937, + "time_per_iteration": 2.7231242656707764 + }, + { + "auxiliary_loss_clip": 0.06501779, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06297591, + "balance_loss_mlp": 0.01256768, + "epoch": 0.2968886216744326, + "flos": 21550635152640.0, + "grad_norm": 1.507706154697653, + "language_loss": 0.78372371, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.86148536, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17614746, + "step": 4938, + "time_per_iteration": 2.564958095550537 + }, + { + "auxiliary_loss_clip": 0.06506119, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.06296918, + "balance_loss_mlp": 0.01260643, + "epoch": 0.29694874492710055, + "flos": 12792357060480.0, + "grad_norm": 3.019574533594622, + "language_loss": 0.76788878, + "learning_rate": 3.297881497566964e-06, + "loss": 0.84574002, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18359375, + "step": 4939, + "time_per_iteration": 2.514143943786621 + }, + { + "auxiliary_loss_clip": 0.06509334, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06296703, + "balance_loss_mlp": 0.01254259, + "epoch": 0.2970088681797685, + "flos": 24576600049920.0, + "grad_norm": 1.687046897883716, + "language_loss": 0.78335512, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86116844, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17736816, + "step": 4940, + "time_per_iteration": 2.570279359817505 + }, + { + "auxiliary_loss_clip": 0.06508817, + "auxiliary_loss_mlp": 0.01275865, + "balance_loss_clip": 0.06300067, + "balance_loss_mlp": 0.01257113, + "epoch": 0.2970689914324365, + "flos": 23665870022400.0, + "grad_norm": 1.5281741947741105, + "language_loss": 0.75415564, + "learning_rate": 3.297288763918435e-06, + "loss": 0.8320024, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.1875, + "step": 4941, + "time_per_iteration": 2.549976348876953 + }, + { + "auxiliary_loss_clip": 0.06509985, + "auxiliary_loss_mlp": 0.01274098, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01254667, + "epoch": 0.29712911468510445, + "flos": 39678654107520.0, + "grad_norm": 2.245999939669129, + "language_loss": 0.74959898, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.82743979, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19445801, + "step": 4942, + "time_per_iteration": 2.7199416160583496 + }, + { + "auxiliary_loss_clip": 0.0651295, + "auxiliary_loss_mlp": 0.01282177, + "balance_loss_clip": 0.06299168, + "balance_loss_mlp": 0.01261744, + "epoch": 0.2971892379377724, + "flos": 26402420517120.0, + "grad_norm": 1.727137408051059, + "language_loss": 0.70931113, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.78726244, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2043457, + "step": 4943, + "time_per_iteration": 2.5410006046295166 + }, + { + "auxiliary_loss_clip": 0.06508674, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06296329, + "balance_loss_mlp": 0.01255599, + "epoch": 0.2972493611904404, + "flos": 17608992837120.0, + "grad_norm": 2.280832061666768, + "language_loss": 0.8012532, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.87908292, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.18725586, + "step": 4944, + "time_per_iteration": 2.5628697872161865 + }, + { + "auxiliary_loss_clip": 0.06495067, + "auxiliary_loss_mlp": 0.01272551, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01255194, + "epoch": 0.2973094844431084, + "flos": 20419070889600.0, + "grad_norm": 2.0196449856406704, + "language_loss": 0.83490258, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.91257876, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17346191, + "step": 4945, + "time_per_iteration": 2.5184381008148193 + }, + { + "auxiliary_loss_clip": 0.06494735, + "auxiliary_loss_mlp": 0.01274271, + "balance_loss_clip": 0.0629338, + "balance_loss_mlp": 0.01257081, + "epoch": 0.29736960769577636, + "flos": 17499225588480.0, + "grad_norm": 1.8481246337269472, + "language_loss": 0.67665654, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.75434661, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.171875, + "step": 4946, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.06500807, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06294695, + "balance_loss_mlp": 0.01255462, + "epoch": 0.2974297309484443, + "flos": 26111119397760.0, + "grad_norm": 1.9041348906467674, + "language_loss": 0.74493206, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82266927, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17443848, + "step": 4947, + "time_per_iteration": 2.55096435546875 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.01255396, + "epoch": 0.2974898542011123, + "flos": 25673559776640.0, + "grad_norm": 5.5840313105791894, + "language_loss": 0.73332673, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.81115007, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18896484, + "step": 4948, + "time_per_iteration": 2.604213237762451 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06292598, + "balance_loss_mlp": 0.01258687, + "epoch": 0.29754997745378026, + "flos": 18667323031680.0, + "grad_norm": 1.916403484704169, + "language_loss": 0.84057009, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91826856, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.1661377, + "step": 4949, + "time_per_iteration": 2.4725756645202637 + }, + { + "auxiliary_loss_clip": 0.06495193, + "auxiliary_loss_mlp": 0.01276752, + "balance_loss_clip": 0.06291104, + "balance_loss_mlp": 0.01258692, + "epoch": 0.2976101007064482, + "flos": 22281382609920.0, + "grad_norm": 2.0864257908602464, + "language_loss": 0.71227181, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.78999126, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18078613, + "step": 4950, + "time_per_iteration": 2.5644164085388184 + }, + { + "auxiliary_loss_clip": 0.06486266, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06290439, + "balance_loss_mlp": 0.01256308, + "epoch": 0.2976702239591162, + "flos": 21952290499200.0, + "grad_norm": 2.1576156011429597, + "language_loss": 0.83112931, + "learning_rate": 3.294322145875789e-06, + "loss": 0.9087199, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.16467285, + "step": 4951, + "time_per_iteration": 2.5149009227752686 + }, + { + "auxiliary_loss_clip": 0.06493516, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 0.06287138, + "balance_loss_mlp": 0.01257248, + "epoch": 0.29773034721178415, + "flos": 24642874229760.0, + "grad_norm": 2.538162384222029, + "language_loss": 0.73777694, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.81545866, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.17407227, + "step": 4952, + "time_per_iteration": 3.9977774620056152 + }, + { + "auxiliary_loss_clip": 0.06494328, + "auxiliary_loss_mlp": 0.01279914, + "balance_loss_clip": 0.06291338, + "balance_loss_mlp": 0.01261472, + "epoch": 0.2977904704644521, + "flos": 20563694236800.0, + "grad_norm": 1.830993802630573, + "language_loss": 0.8420608, + "learning_rate": 3.293728232937228e-06, + "loss": 0.91980314, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.18444824, + "step": 4953, + "time_per_iteration": 2.556278944015503 + }, + { + "auxiliary_loss_clip": 0.0649702, + "auxiliary_loss_mlp": 0.01271138, + "balance_loss_clip": 0.06289494, + "balance_loss_mlp": 0.01254246, + "epoch": 0.2978505937171201, + "flos": 18922426387200.0, + "grad_norm": 2.0824874332629113, + "language_loss": 0.74276727, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.82044888, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.16894531, + "step": 4954, + "time_per_iteration": 3.9108667373657227 + }, + { + "auxiliary_loss_clip": 0.06489201, + "auxiliary_loss_mlp": 0.01275174, + "balance_loss_clip": 0.06286507, + "balance_loss_mlp": 0.01259164, + "epoch": 0.29791071696978805, + "flos": 19323788244480.0, + "grad_norm": 1.865430683209025, + "language_loss": 0.75582623, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83346999, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.16003418, + "step": 4955, + "time_per_iteration": 4.034101724624634 + }, + { + "auxiliary_loss_clip": 0.06493168, + "auxiliary_loss_mlp": 0.01273359, + "balance_loss_clip": 0.06285557, + "balance_loss_mlp": 0.0125593, + "epoch": 0.297970840222456, + "flos": 18812742992640.0, + "grad_norm": 1.8893942834003292, + "language_loss": 0.72569048, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80335575, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17419434, + "step": 4956, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01272155, + "balance_loss_clip": 0.06287451, + "balance_loss_mlp": 0.01253141, + "epoch": 0.298030963475124, + "flos": 22858702041600.0, + "grad_norm": 1.7093127439145954, + "language_loss": 0.79588521, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.87359571, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19006348, + "step": 4957, + "time_per_iteration": 2.5350780487060547 + }, + { + "auxiliary_loss_clip": 0.0648672, + "auxiliary_loss_mlp": 0.01278155, + "balance_loss_clip": 0.06281397, + "balance_loss_mlp": 0.01261084, + "epoch": 0.298091086727792, + "flos": 21874402529280.0, + "grad_norm": 1.5033412482034976, + "language_loss": 0.70601791, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.78366661, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.17077637, + "step": 4958, + "time_per_iteration": 2.52998948097229 + }, + { + "auxiliary_loss_clip": 0.06484255, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06283475, + "balance_loss_mlp": 0.01256954, + "epoch": 0.29815120998045996, + "flos": 21180775230720.0, + "grad_norm": 1.4471916983062794, + "language_loss": 0.78955591, + "learning_rate": 3.291945317082743e-06, + "loss": 0.86715317, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18505859, + "step": 4959, + "time_per_iteration": 2.5247116088867188 + }, + { + "auxiliary_loss_clip": 0.06484501, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_clip": 0.06281502, + "balance_loss_mlp": 0.01258183, + "epoch": 0.29821133323312793, + "flos": 19901526946560.0, + "grad_norm": 1.8097637226237389, + "language_loss": 0.79637736, + "learning_rate": 3.291647992907147e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17578125, + "step": 4960, + "time_per_iteration": 2.544517755508423 + }, + { + "auxiliary_loss_clip": 0.06493803, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06284714, + "balance_loss_mlp": 0.01254483, + "epoch": 0.2982714564857959, + "flos": 12755781953280.0, + "grad_norm": 2.226713674353186, + "language_loss": 0.74493575, + "learning_rate": 3.291350619752129e-06, + "loss": 0.82260078, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.18225098, + "step": 4961, + "time_per_iteration": 3.9662065505981445 + }, + { + "auxiliary_loss_clip": 0.06486452, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.062804, + "balance_loss_mlp": 0.01256756, + "epoch": 0.29833157973846386, + "flos": 22278238081920.0, + "grad_norm": 2.8000667311611167, + "language_loss": 0.62968349, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70729387, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.17810059, + "step": 4962, + "time_per_iteration": 2.533984661102295 + }, + { + "auxiliary_loss_clip": 0.06485053, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06281514, + "balance_loss_mlp": 0.01259596, + "epoch": 0.2983917029911318, + "flos": 15377659735680.0, + "grad_norm": 1.6706058401186525, + "language_loss": 0.83686638, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.91448379, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17102051, + "step": 4963, + "time_per_iteration": 2.524486780166626 + }, + { + "auxiliary_loss_clip": 0.0648464, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 0.06283776, + "balance_loss_mlp": 0.01259572, + "epoch": 0.2984518262437998, + "flos": 15383068323840.0, + "grad_norm": 2.213795741630968, + "language_loss": 0.66932309, + "learning_rate": 3.290458206523322e-06, + "loss": 0.74693739, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17224121, + "step": 4964, + "time_per_iteration": 2.5100491046905518 + }, + { + "auxiliary_loss_clip": 0.06485043, + "auxiliary_loss_mlp": 0.01273472, + "balance_loss_clip": 0.06283367, + "balance_loss_mlp": 0.01257701, + "epoch": 0.29851194949646775, + "flos": 18113413616640.0, + "grad_norm": 1.8232440195867097, + "language_loss": 0.72163451, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79921961, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15771484, + "step": 4965, + "time_per_iteration": 2.5180373191833496 + }, + { + "auxiliary_loss_clip": 0.06490128, + "auxiliary_loss_mlp": 0.01278877, + "balance_loss_clip": 0.06284484, + "balance_loss_mlp": 0.01261139, + "epoch": 0.2985720727491357, + "flos": 22024811808000.0, + "grad_norm": 1.7919900337102326, + "language_loss": 0.66928089, + "learning_rate": 3.289863019680461e-06, + "loss": 0.74697095, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17724609, + "step": 4966, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06492805, + "auxiliary_loss_mlp": 0.01279859, + "balance_loss_clip": 0.06288783, + "balance_loss_mlp": 0.01262026, + "epoch": 0.2986321960018037, + "flos": 13046202604800.0, + "grad_norm": 2.9983208236286862, + "language_loss": 0.74761832, + "learning_rate": 3.289565352885785e-06, + "loss": 0.82534492, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17822266, + "step": 4967, + "time_per_iteration": 2.5119001865386963 + }, + { + "auxiliary_loss_clip": 0.06492577, + "auxiliary_loss_mlp": 0.01276602, + "balance_loss_clip": 0.06288804, + "balance_loss_mlp": 0.01260294, + "epoch": 0.29869231925447165, + "flos": 14470241944320.0, + "grad_norm": 1.9901449284839132, + "language_loss": 0.72232509, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80001682, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16308594, + "step": 4968, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.06497695, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 0.06290321, + "balance_loss_mlp": 0.01261007, + "epoch": 0.2987524425071396, + "flos": 31658376850560.0, + "grad_norm": 1.780098836704026, + "language_loss": 0.76775402, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84551913, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.17810059, + "step": 4969, + "time_per_iteration": 2.677133321762085 + }, + { + "auxiliary_loss_clip": 0.0649517, + "auxiliary_loss_mlp": 0.01279823, + "balance_loss_clip": 0.06290856, + "balance_loss_mlp": 0.0126355, + "epoch": 0.2988125657598076, + "flos": 21439735873920.0, + "grad_norm": 1.6530964666677603, + "language_loss": 0.702811, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.78056097, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.16271973, + "step": 4970, + "time_per_iteration": 2.542041301727295 + }, + { + "auxiliary_loss_clip": 0.06501894, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06289935, + "balance_loss_mlp": 0.01260336, + "epoch": 0.2988726890124756, + "flos": 18082750222080.0, + "grad_norm": 2.836679638175962, + "language_loss": 0.84790057, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.92571044, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.18737793, + "step": 4971, + "time_per_iteration": 2.5460052490234375 + }, + { + "auxiliary_loss_clip": 0.06490934, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06292243, + "balance_loss_mlp": 0.01257691, + "epoch": 0.29893281226514357, + "flos": 21760987628160.0, + "grad_norm": 1.7104631490326472, + "language_loss": 0.79530191, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16314697, + "step": 4972, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.0650093, + "auxiliary_loss_mlp": 0.01282709, + "balance_loss_clip": 0.06297094, + "balance_loss_mlp": 0.01266234, + "epoch": 0.29899293551781153, + "flos": 16842341105280.0, + "grad_norm": 1.7682293865220609, + "language_loss": 0.85643351, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93426991, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16467285, + "step": 4973, + "time_per_iteration": 2.546552896499634 + }, + { + "auxiliary_loss_clip": 0.06486042, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291717, + "balance_loss_mlp": 0.01263539, + "epoch": 0.2990530587704795, + "flos": 11734068792960.0, + "grad_norm": 1.5403026658154284, + "language_loss": 0.78163445, + "learning_rate": 3.287480316742863e-06, + "loss": 0.85930026, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17004395, + "step": 4974, + "time_per_iteration": 2.519416093826294 + }, + { + "auxiliary_loss_clip": 0.06492939, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06288281, + "balance_loss_mlp": 0.01257001, + "epoch": 0.29911318202314746, + "flos": 28047713362560.0, + "grad_norm": 1.767842246111843, + "language_loss": 0.73036933, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80804002, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17126465, + "step": 4975, + "time_per_iteration": 2.6099252700805664 + }, + { + "auxiliary_loss_clip": 0.0649198, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 0.06288506, + "balance_loss_mlp": 0.0126163, + "epoch": 0.2991733052758154, + "flos": 18739425070080.0, + "grad_norm": 3.7568061887968374, + "language_loss": 0.76564699, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84335011, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16711426, + "step": 4976, + "time_per_iteration": 2.4865057468414307 + }, + { + "auxiliary_loss_clip": 0.0649081, + "auxiliary_loss_mlp": 0.01274025, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257574, + "epoch": 0.2992334285284834, + "flos": 15564476413440.0, + "grad_norm": 2.0027584051633256, + "language_loss": 0.86547983, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.94312823, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 4977, + "time_per_iteration": 2.5564377307891846 + }, + { + "auxiliary_loss_clip": 0.06492308, + "auxiliary_loss_mlp": 0.01273791, + "balance_loss_clip": 0.06289831, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29929355178115136, + "flos": 21803809864320.0, + "grad_norm": 1.498415139231663, + "language_loss": 0.69035208, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.76801312, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.16943359, + "step": 4978, + "time_per_iteration": 2.519927978515625 + }, + { + "auxiliary_loss_clip": 0.06498158, + "auxiliary_loss_mlp": 0.01273756, + "balance_loss_clip": 0.06295491, + "balance_loss_mlp": 0.0125634, + "epoch": 0.2993536750338193, + "flos": 21184884080640.0, + "grad_norm": 2.2981139003330924, + "language_loss": 0.76821494, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.84593409, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17407227, + "step": 4979, + "time_per_iteration": 2.5783658027648926 + }, + { + "auxiliary_loss_clip": 0.06495501, + "auxiliary_loss_mlp": 0.01275001, + "balance_loss_clip": 0.06288472, + "balance_loss_mlp": 0.0125762, + "epoch": 0.2994137982864873, + "flos": 32129954029440.0, + "grad_norm": 1.9038495469030372, + "language_loss": 0.69286489, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77056986, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17382812, + "step": 4980, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.06490306, + "auxiliary_loss_mlp": 0.01274236, + "balance_loss_clip": 0.06288646, + "balance_loss_mlp": 0.01257177, + "epoch": 0.29947392153915525, + "flos": 21111733866240.0, + "grad_norm": 1.7308746684442236, + "language_loss": 0.74001658, + "learning_rate": 3.285392888352555e-06, + "loss": 0.817662, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17053223, + "step": 4981, + "time_per_iteration": 2.580580711364746 + }, + { + "auxiliary_loss_clip": 0.06490904, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.0125635, + "epoch": 0.2995340447918232, + "flos": 21548916144000.0, + "grad_norm": 1.9422940804684126, + "language_loss": 0.86877131, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.94642013, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17626953, + "step": 4982, + "time_per_iteration": 2.4962990283966064 + }, + { + "auxiliary_loss_clip": 0.06497963, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06287588, + "balance_loss_mlp": 0.01257241, + "epoch": 0.2995941680444912, + "flos": 16730393650560.0, + "grad_norm": 2.5640920256819886, + "language_loss": 0.87797368, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95569938, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17382812, + "step": 4983, + "time_per_iteration": 2.5295448303222656 + }, + { + "auxiliary_loss_clip": 0.0649021, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 0.06287163, + "balance_loss_mlp": 0.012569, + "epoch": 0.2996542912971592, + "flos": 20929864579200.0, + "grad_norm": 2.1931631477553943, + "language_loss": 0.78985476, + "learning_rate": 3.284497544825668e-06, + "loss": 0.86749053, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16467285, + "step": 4984, + "time_per_iteration": 2.510861873626709 + }, + { + "auxiliary_loss_clip": 0.06490169, + "auxiliary_loss_mlp": 0.01276988, + "balance_loss_clip": 0.06284384, + "balance_loss_mlp": 0.01259702, + "epoch": 0.29971441454982717, + "flos": 25086429417600.0, + "grad_norm": 1.6549542244227224, + "language_loss": 0.78558743, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86325896, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17285156, + "step": 4985, + "time_per_iteration": 2.6011219024658203 + }, + { + "auxiliary_loss_clip": 0.06501257, + "auxiliary_loss_mlp": 0.01278562, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.0125968, + "epoch": 0.29977453780249513, + "flos": 52567445617920.0, + "grad_norm": 2.1128232330624757, + "language_loss": 0.71929544, + "learning_rate": 3.283900405580837e-06, + "loss": 0.79709363, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1887207, + "step": 4986, + "time_per_iteration": 2.8261890411376953 + }, + { + "auxiliary_loss_clip": 0.06496918, + "auxiliary_loss_mlp": 0.01277715, + "balance_loss_clip": 0.06288348, + "balance_loss_mlp": 0.0125981, + "epoch": 0.2998346610551631, + "flos": 22243759326720.0, + "grad_norm": 2.0495005677193703, + "language_loss": 0.73353851, + "learning_rate": 3.283601762924312e-06, + "loss": 0.81128478, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17907715, + "step": 4987, + "time_per_iteration": 2.5969009399414062 + }, + { + "auxiliary_loss_clip": 0.06487568, + "auxiliary_loss_mlp": 0.01277048, + "balance_loss_clip": 0.06283796, + "balance_loss_mlp": 0.01260561, + "epoch": 0.29989478430783106, + "flos": 16878832358400.0, + "grad_norm": 1.677350703029162, + "language_loss": 0.80982405, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88747025, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16479492, + "step": 4988, + "time_per_iteration": 2.4802756309509277 + }, + { + "auxiliary_loss_clip": 0.06489251, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06285515, + "balance_loss_mlp": 0.0125759, + "epoch": 0.29995490756049903, + "flos": 23775637271040.0, + "grad_norm": 1.830625198484136, + "language_loss": 0.7097913, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7874254, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16577148, + "step": 4989, + "time_per_iteration": 2.5968902111053467 + }, + { + "auxiliary_loss_clip": 0.06498987, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01264948, + "epoch": 0.300015030813167, + "flos": 14470577360640.0, + "grad_norm": 2.8004651200920576, + "language_loss": 0.85787904, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93570256, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18408203, + "step": 4990, + "time_per_iteration": 2.4837355613708496 + }, + { + "auxiliary_loss_clip": 0.06499861, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06287368, + "balance_loss_mlp": 0.01260204, + "epoch": 0.30007515406583496, + "flos": 25199005777920.0, + "grad_norm": 1.6608247288012334, + "language_loss": 0.67339301, + "learning_rate": 3.28240670566841e-06, + "loss": 0.75117278, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17919922, + "step": 4991, + "time_per_iteration": 4.060553312301636 + }, + { + "auxiliary_loss_clip": 0.0649571, + "auxiliary_loss_mlp": 0.01277369, + "balance_loss_clip": 0.06284688, + "balance_loss_mlp": 0.01259022, + "epoch": 0.3001352773185029, + "flos": 19397315802240.0, + "grad_norm": 1.7545259775845383, + "language_loss": 0.79479051, + "learning_rate": 3.28210781975363e-06, + "loss": 0.87252128, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18347168, + "step": 4992, + "time_per_iteration": 2.5394246578216553 + }, + { + "auxiliary_loss_clip": 0.06496455, + "auxiliary_loss_mlp": 0.01272727, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01255061, + "epoch": 0.3001954005711709, + "flos": 21550341663360.0, + "grad_norm": 1.8174225064451806, + "language_loss": 0.83191693, + "learning_rate": 3.281808885221193e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17675781, + "step": 4993, + "time_per_iteration": 2.536900520324707 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.0127659, + "balance_loss_clip": 0.06290129, + "balance_loss_mlp": 0.01257051, + "epoch": 0.30025552382383885, + "flos": 17390087245440.0, + "grad_norm": 2.3964724385856955, + "language_loss": 0.8713994, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.94919133, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.1953125, + "step": 4994, + "time_per_iteration": 5.451568603515625 + }, + { + "auxiliary_loss_clip": 0.06500117, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06293428, + "balance_loss_mlp": 0.01255696, + "epoch": 0.3003156470765068, + "flos": 29541003701760.0, + "grad_norm": 1.492375768993242, + "language_loss": 0.81277597, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.89050424, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17016602, + "step": 4995, + "time_per_iteration": 2.6498701572418213 + }, + { + "auxiliary_loss_clip": 0.06495272, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01257818, + "epoch": 0.3003757703291748, + "flos": 43655278302720.0, + "grad_norm": 1.561088997277918, + "language_loss": 0.67591625, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.75363255, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.18530273, + "step": 4996, + "time_per_iteration": 2.6940386295318604 + }, + { + "auxiliary_loss_clip": 0.06490915, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 0.06287466, + "balance_loss_mlp": 0.0125985, + "epoch": 0.30043589358184275, + "flos": 22534934664960.0, + "grad_norm": 1.8202769971321224, + "language_loss": 0.76585484, + "learning_rate": 3.280612661141615e-06, + "loss": 0.84354323, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18054199, + "step": 4997, + "time_per_iteration": 2.551025629043579 + }, + { + "auxiliary_loss_clip": 0.06488951, + "auxiliary_loss_mlp": 0.01282226, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01264785, + "epoch": 0.30049601683451077, + "flos": 21002176252800.0, + "grad_norm": 1.7136041248753544, + "language_loss": 0.78929758, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.86700928, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17443848, + "step": 4998, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.06495959, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06296599, + "balance_loss_mlp": 0.0126104, + "epoch": 0.30055614008717874, + "flos": 23922985875840.0, + "grad_norm": 1.6408959445510187, + "language_loss": 0.73985869, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81759465, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.16589355, + "step": 4999, + "time_per_iteration": 2.565272331237793 + }, + { + "auxiliary_loss_clip": 0.06497648, + "auxiliary_loss_mlp": 0.01276599, + "balance_loss_clip": 0.06290608, + "balance_loss_mlp": 0.01258837, + "epoch": 0.3006162633398467, + "flos": 19175475317760.0, + "grad_norm": 1.6585129963537202, + "language_loss": 0.76246512, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84020758, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.1776123, + "step": 5000, + "time_per_iteration": 3.978001117706299 + }, + { + "auxiliary_loss_clip": 0.06488875, + "auxiliary_loss_mlp": 0.01280464, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.0126244, + "epoch": 0.30067638659251467, + "flos": 14683697020800.0, + "grad_norm": 1.838860389970219, + "language_loss": 0.81972182, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.89741528, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.18041992, + "step": 5001, + "time_per_iteration": 2.4995031356811523 + }, + { + "auxiliary_loss_clip": 0.06495227, + "auxiliary_loss_mlp": 0.01279132, + "balance_loss_clip": 0.06291329, + "balance_loss_mlp": 0.01261322, + "epoch": 0.30073650984518263, + "flos": 23374778538240.0, + "grad_norm": 1.6002838962292127, + "language_loss": 0.81160742, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.88935101, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17797852, + "step": 5002, + "time_per_iteration": 2.549882650375366 + }, + { + "auxiliary_loss_clip": 0.06502556, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01255728, + "epoch": 0.3007966330978506, + "flos": 22973332826880.0, + "grad_norm": 1.7018817575326768, + "language_loss": 0.71524274, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.79300046, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17504883, + "step": 5003, + "time_per_iteration": 2.537760019302368 + }, + { + "auxiliary_loss_clip": 0.06502316, + "auxiliary_loss_mlp": 0.01275597, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01257441, + "epoch": 0.30085675635051856, + "flos": 27825830951040.0, + "grad_norm": 1.9954765529899763, + "language_loss": 0.706792, + "learning_rate": 3.27851739984233e-06, + "loss": 0.78457117, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18151855, + "step": 5004, + "time_per_iteration": 2.6357674598693848 + }, + { + "auxiliary_loss_clip": 0.06504735, + "auxiliary_loss_mlp": 0.01282861, + "balance_loss_clip": 0.06296123, + "balance_loss_mlp": 0.01263513, + "epoch": 0.3009168796031865, + "flos": 10886216855040.0, + "grad_norm": 2.7451882694975662, + "language_loss": 0.81914413, + "learning_rate": 3.278217882782715e-06, + "loss": 0.89702016, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19335938, + "step": 5005, + "time_per_iteration": 2.4386463165283203 + }, + { + "auxiliary_loss_clip": 0.06497307, + "auxiliary_loss_mlp": 0.01278667, + "balance_loss_clip": 0.06293161, + "balance_loss_mlp": 0.01261179, + "epoch": 0.3009770028558545, + "flos": 23812170451200.0, + "grad_norm": 3.689468326241579, + "language_loss": 0.74513727, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.82289702, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17492676, + "step": 5006, + "time_per_iteration": 2.6309902667999268 + }, + { + "auxiliary_loss_clip": 0.06490835, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06288077, + "balance_loss_mlp": 0.01255247, + "epoch": 0.30103712610852246, + "flos": 26475319169280.0, + "grad_norm": 1.9837745378518294, + "language_loss": 0.71514297, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.79279143, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.18762207, + "step": 5007, + "time_per_iteration": 2.5425140857696533 + }, + { + "auxiliary_loss_clip": 0.06499007, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01258961, + "epoch": 0.3010972493611904, + "flos": 22863020526720.0, + "grad_norm": 2.135948160193648, + "language_loss": 0.76715112, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.84491682, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18579102, + "step": 5008, + "time_per_iteration": 2.560136556625366 + }, + { + "auxiliary_loss_clip": 0.06498778, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.06291865, + "balance_loss_mlp": 0.01258959, + "epoch": 0.3011573726138584, + "flos": 24059307669120.0, + "grad_norm": 1.8647165617813573, + "language_loss": 0.85181898, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.92957842, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18200684, + "step": 5009, + "time_per_iteration": 2.5235841274261475 + }, + { + "auxiliary_loss_clip": 0.06506295, + "auxiliary_loss_mlp": 0.01281474, + "balance_loss_clip": 0.06291408, + "balance_loss_mlp": 0.0126041, + "epoch": 0.30121749586652635, + "flos": 20264762396160.0, + "grad_norm": 1.8315766872525614, + "language_loss": 0.84202898, + "learning_rate": 3.276719570659604e-06, + "loss": 0.91990662, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21069336, + "step": 5010, + "time_per_iteration": 2.5768747329711914 + }, + { + "auxiliary_loss_clip": 0.06499103, + "auxiliary_loss_mlp": 0.01276454, + "balance_loss_clip": 0.06292678, + "balance_loss_mlp": 0.01258728, + "epoch": 0.3012776191191944, + "flos": 26950334365440.0, + "grad_norm": 2.3479091749479593, + "language_loss": 0.85299456, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.93075019, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17724609, + "step": 5011, + "time_per_iteration": 2.5496773719787598 + }, + { + "auxiliary_loss_clip": 0.06498772, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06287067, + "balance_loss_mlp": 0.01258472, + "epoch": 0.30133774237186234, + "flos": 20418525838080.0, + "grad_norm": 2.2969937551574615, + "language_loss": 0.73043567, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18017578, + "step": 5012, + "time_per_iteration": 2.5352632999420166 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.01275987, + "balance_loss_clip": 0.06294451, + "balance_loss_mlp": 0.01257581, + "epoch": 0.3013978656245303, + "flos": 19798635732480.0, + "grad_norm": 2.0714365992737247, + "language_loss": 0.88282806, + "learning_rate": 3.275820002334819e-06, + "loss": 0.96061397, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.1842041, + "step": 5013, + "time_per_iteration": 2.5217273235321045 + }, + { + "auxiliary_loss_clip": 0.06510235, + "auxiliary_loss_mlp": 0.01281959, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01261956, + "epoch": 0.30145798887719827, + "flos": 16254623767680.0, + "grad_norm": 2.0397198762739253, + "language_loss": 0.8413021, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.91922402, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 5014, + "time_per_iteration": 2.543929100036621 + }, + { + "auxiliary_loss_clip": 0.06496109, + "auxiliary_loss_mlp": 0.01278136, + "balance_loss_clip": 0.06295025, + "balance_loss_mlp": 0.01260934, + "epoch": 0.30151811212986623, + "flos": 24578654474880.0, + "grad_norm": 1.6793816963153507, + "language_loss": 0.68929201, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76703441, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17199707, + "step": 5015, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.06498226, + "auxiliary_loss_mlp": 0.01282599, + "balance_loss_clip": 0.06293575, + "balance_loss_mlp": 0.01262989, + "epoch": 0.3015782353825342, + "flos": 21878595233280.0, + "grad_norm": 2.19954780338382, + "language_loss": 0.75070626, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.82851446, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.19604492, + "step": 5016, + "time_per_iteration": 2.6430094242095947 + }, + { + "auxiliary_loss_clip": 0.06498955, + "auxiliary_loss_mlp": 0.01278069, + "balance_loss_clip": 0.06290609, + "balance_loss_mlp": 0.01260009, + "epoch": 0.30163835863520216, + "flos": 28777244935680.0, + "grad_norm": 1.487936670829871, + "language_loss": 0.657938, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.73570824, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18041992, + "step": 5017, + "time_per_iteration": 2.62882661819458 + }, + { + "auxiliary_loss_clip": 0.06504996, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06297189, + "balance_loss_mlp": 0.01258019, + "epoch": 0.30169848188787013, + "flos": 22972829702400.0, + "grad_norm": 1.7163502989136974, + "language_loss": 0.68538272, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76318979, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17675781, + "step": 5018, + "time_per_iteration": 2.5743629932403564 + }, + { + "auxiliary_loss_clip": 0.06490742, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01263102, + "epoch": 0.3017586051405381, + "flos": 21841726636800.0, + "grad_norm": 1.8632302123292983, + "language_loss": 0.79424834, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.87196445, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 5019, + "time_per_iteration": 2.490190029144287 + }, + { + "auxiliary_loss_clip": 0.06497257, + "auxiliary_loss_mlp": 0.01272585, + "balance_loss_clip": 0.06291286, + "balance_loss_mlp": 0.01255932, + "epoch": 0.30181872839320606, + "flos": 22166374481280.0, + "grad_norm": 1.9171916392208899, + "language_loss": 0.70839167, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78609014, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.16650391, + "step": 5020, + "time_per_iteration": 2.5635480880737305 + }, + { + "auxiliary_loss_clip": 0.06504546, + "auxiliary_loss_mlp": 0.01281398, + "balance_loss_clip": 0.06293903, + "balance_loss_mlp": 0.01263063, + "epoch": 0.301878851645874, + "flos": 18120080016000.0, + "grad_norm": 1.792157390717078, + "language_loss": 0.78276378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86062324, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18347168, + "step": 5021, + "time_per_iteration": 2.4956390857696533 + }, + { + "auxiliary_loss_clip": 0.06497782, + "auxiliary_loss_mlp": 0.01276425, + "balance_loss_clip": 0.06289995, + "balance_loss_mlp": 0.01258758, + "epoch": 0.301938974898542, + "flos": 17607860807040.0, + "grad_norm": 2.1405998927344774, + "language_loss": 0.77019519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.84793723, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17663574, + "step": 5022, + "time_per_iteration": 2.5157957077026367 + }, + { + "auxiliary_loss_clip": 0.06495966, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.0628897, + "balance_loss_mlp": 0.01258766, + "epoch": 0.30199909815120995, + "flos": 11185861455360.0, + "grad_norm": 1.768248661027107, + "language_loss": 0.70051187, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.77823544, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17626953, + "step": 5023, + "time_per_iteration": 2.466554641723633 + }, + { + "auxiliary_loss_clip": 0.06500031, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.0628899, + "balance_loss_mlp": 0.0125586, + "epoch": 0.302059221403878, + "flos": 21914247945600.0, + "grad_norm": 1.9915350532209553, + "language_loss": 0.72159773, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7993241, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.16748047, + "step": 5024, + "time_per_iteration": 2.550529956817627 + }, + { + "auxiliary_loss_clip": 0.06490807, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 0.06288145, + "balance_loss_mlp": 0.01259068, + "epoch": 0.30211934465654594, + "flos": 26403678328320.0, + "grad_norm": 1.894121412902458, + "language_loss": 0.74805325, + "learning_rate": 3.272217377978061e-06, + "loss": 0.8257302, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17822266, + "step": 5025, + "time_per_iteration": 2.566805124282837 + }, + { + "auxiliary_loss_clip": 0.06489006, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06288895, + "balance_loss_mlp": 0.01260649, + "epoch": 0.3021794679092139, + "flos": 23406573962880.0, + "grad_norm": 1.5421556017832176, + "language_loss": 0.67708206, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.75474703, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16845703, + "step": 5026, + "time_per_iteration": 2.5388495922088623 + }, + { + "auxiliary_loss_clip": 0.06496219, + "auxiliary_loss_mlp": 0.01276315, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.0125829, + "epoch": 0.30223959116188187, + "flos": 20266271769600.0, + "grad_norm": 1.7822947119811494, + "language_loss": 0.851165, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.92889023, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.18017578, + "step": 5027, + "time_per_iteration": 2.4944281578063965 + }, + { + "auxiliary_loss_clip": 0.06486274, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06286463, + "balance_loss_mlp": 0.012555, + "epoch": 0.30229971441454984, + "flos": 26695105228800.0, + "grad_norm": 1.4959542036115716, + "language_loss": 0.79103637, + "learning_rate": 3.271315635661351e-06, + "loss": 0.86862409, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17004395, + "step": 5028, + "time_per_iteration": 2.559110403060913 + }, + { + "auxiliary_loss_clip": 0.06488896, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06286621, + "balance_loss_mlp": 0.01255114, + "epoch": 0.3023598376672178, + "flos": 34353111358080.0, + "grad_norm": 2.034560710438702, + "language_loss": 0.777421, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.8550368, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17553711, + "step": 5029, + "time_per_iteration": 2.616746187210083 + }, + { + "auxiliary_loss_clip": 0.06491397, + "auxiliary_loss_mlp": 0.012793, + "balance_loss_clip": 0.06285096, + "balance_loss_mlp": 0.0126112, + "epoch": 0.30241996091988577, + "flos": 23118794714880.0, + "grad_norm": 1.8709670039612754, + "language_loss": 0.83096594, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.90867293, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.1817627, + "step": 5030, + "time_per_iteration": 2.56754994392395 + }, + { + "auxiliary_loss_clip": 0.06496526, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06289787, + "balance_loss_mlp": 0.01252817, + "epoch": 0.30248008417255373, + "flos": 19395932209920.0, + "grad_norm": 1.6009792224367259, + "language_loss": 0.70107001, + "learning_rate": 3.270413459468905e-06, + "loss": 0.77873379, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17028809, + "step": 5031, + "time_per_iteration": 3.9598355293273926 + }, + { + "auxiliary_loss_clip": 0.06489968, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06286315, + "balance_loss_mlp": 0.01254843, + "epoch": 0.3025402074252217, + "flos": 23776601592960.0, + "grad_norm": 1.6577801639127376, + "language_loss": 0.83241403, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.91004276, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.18066406, + "step": 5032, + "time_per_iteration": 2.5589263439178467 + }, + { + "auxiliary_loss_clip": 0.064991, + "auxiliary_loss_mlp": 0.01275787, + "balance_loss_clip": 0.06290475, + "balance_loss_mlp": 0.01257846, + "epoch": 0.30260033067788966, + "flos": 26001184440960.0, + "grad_norm": 2.284722647008976, + "language_loss": 0.73521686, + "learning_rate": 3.269811767783906e-06, + "loss": 0.81296575, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17956543, + "step": 5033, + "time_per_iteration": 4.029735088348389 + }, + { + "auxiliary_loss_clip": 0.06487451, + "auxiliary_loss_mlp": 0.01273985, + "balance_loss_clip": 0.06287168, + "balance_loss_mlp": 0.01257201, + "epoch": 0.3026604539305576, + "flos": 25381629751680.0, + "grad_norm": 1.972268943863271, + "language_loss": 0.74434245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16784668, + "step": 5034, + "time_per_iteration": 4.0717785358428955 + }, + { + "auxiliary_loss_clip": 0.06489293, + "auxiliary_loss_mlp": 0.01272883, + "balance_loss_clip": 0.06285236, + "balance_loss_mlp": 0.01253785, + "epoch": 0.3027205771832256, + "flos": 25819944059520.0, + "grad_norm": 2.1341895685230434, + "language_loss": 0.72872615, + "learning_rate": 3.269209883493352e-06, + "loss": 0.80634785, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.19104004, + "step": 5035, + "time_per_iteration": 2.552910804748535 + }, + { + "auxiliary_loss_clip": 0.06487517, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06287874, + "balance_loss_mlp": 0.01255545, + "epoch": 0.30278070043589356, + "flos": 27351905857920.0, + "grad_norm": 2.3429469920607384, + "language_loss": 0.87837774, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95597875, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17041016, + "step": 5036, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.06487815, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06288295, + "balance_loss_mlp": 0.0125574, + "epoch": 0.3028408236885616, + "flos": 24792444967680.0, + "grad_norm": 1.4626052772561229, + "language_loss": 0.77969307, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85730845, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.17980957, + "step": 5037, + "time_per_iteration": 2.556859016418457 + }, + { + "auxiliary_loss_clip": 0.06492691, + "auxiliary_loss_mlp": 0.01276846, + "balance_loss_clip": 0.06287664, + "balance_loss_mlp": 0.01258381, + "epoch": 0.30290094694122954, + "flos": 12937399678080.0, + "grad_norm": 2.1717737457337236, + "language_loss": 0.78095227, + "learning_rate": 3.268306696121816e-06, + "loss": 0.85864764, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18469238, + "step": 5038, + "time_per_iteration": 2.534095525741577 + }, + { + "auxiliary_loss_clip": 0.06487858, + "auxiliary_loss_mlp": 0.01274285, + "balance_loss_clip": 0.06289861, + "balance_loss_mlp": 0.01257631, + "epoch": 0.3029610701938975, + "flos": 25922709492480.0, + "grad_norm": 1.6864855803341283, + "language_loss": 0.74257523, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82019669, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16650391, + "step": 5039, + "time_per_iteration": 3.9620656967163086 + }, + { + "auxiliary_loss_clip": 0.06482661, + "auxiliary_loss_mlp": 0.01275025, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.0125923, + "epoch": 0.3030211934465655, + "flos": 21987440087040.0, + "grad_norm": 1.8054159725903498, + "language_loss": 0.80141723, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87899411, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.15795898, + "step": 5040, + "time_per_iteration": 2.5038623809814453 + }, + { + "auxiliary_loss_clip": 0.06493679, + "auxiliary_loss_mlp": 0.01273287, + "balance_loss_clip": 0.06295684, + "balance_loss_mlp": 0.01256705, + "epoch": 0.30308131669923344, + "flos": 20997606205440.0, + "grad_norm": 1.5545793881611087, + "language_loss": 0.82498085, + "learning_rate": 3.267403075901438e-06, + "loss": 0.90265048, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 5041, + "time_per_iteration": 2.5619800090789795 + }, + { + "auxiliary_loss_clip": 0.06388037, + "auxiliary_loss_mlp": 0.01273694, + "balance_loss_clip": 0.062912, + "balance_loss_mlp": 0.012703, + "epoch": 0.3031414399519014, + "flos": 60568281198720.0, + "grad_norm": 0.7609258494567089, + "language_loss": 0.59132683, + "learning_rate": 3.267101773025978e-06, + "loss": 0.66794419, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.0340271, + "step": 5042, + "time_per_iteration": 3.2389016151428223 + }, + { + "auxiliary_loss_clip": 0.06493344, + "auxiliary_loss_mlp": 0.01274817, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.0125808, + "epoch": 0.30320156320456937, + "flos": 21914038310400.0, + "grad_norm": 1.8743682054895758, + "language_loss": 0.71638298, + "learning_rate": 3.266800422101892e-06, + "loss": 0.79406464, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.1673584, + "step": 5043, + "time_per_iteration": 2.5684726238250732 + }, + { + "auxiliary_loss_clip": 0.06492111, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06289819, + "balance_loss_mlp": 0.01258121, + "epoch": 0.30326168645723733, + "flos": 21659186517120.0, + "grad_norm": 1.7052050019212173, + "language_loss": 0.70087332, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7785424, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16699219, + "step": 5044, + "time_per_iteration": 2.517548084259033 + }, + { + "auxiliary_loss_clip": 0.06487354, + "auxiliary_loss_mlp": 0.01273722, + "balance_loss_clip": 0.06289065, + "balance_loss_mlp": 0.01257641, + "epoch": 0.3033218097099053, + "flos": 21877672838400.0, + "grad_norm": 1.4072868323237386, + "language_loss": 0.77798641, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.85559714, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.16088867, + "step": 5045, + "time_per_iteration": 2.5525407791137695 + }, + { + "auxiliary_loss_clip": 0.06487602, + "auxiliary_loss_mlp": 0.01277286, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01260096, + "epoch": 0.30338193296257326, + "flos": 27097137918720.0, + "grad_norm": 1.6677605508610576, + "language_loss": 0.72664404, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80429292, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.5747427940368652 + }, + { + "auxiliary_loss_clip": 0.06495762, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125897, + "epoch": 0.30344205621524123, + "flos": 19540052432640.0, + "grad_norm": 1.932306391246397, + "language_loss": 0.81483316, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89255798, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.1776123, + "step": 5047, + "time_per_iteration": 2.5763392448425293 + }, + { + "auxiliary_loss_clip": 0.0648682, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06287121, + "balance_loss_mlp": 0.01255568, + "epoch": 0.3035021794679092, + "flos": 23917116090240.0, + "grad_norm": 1.635585540948891, + "language_loss": 0.72204739, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7996307, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.15942383, + "step": 5048, + "time_per_iteration": 2.5134665966033936 + }, + { + "auxiliary_loss_clip": 0.06488065, + "auxiliary_loss_mlp": 0.01279017, + "balance_loss_clip": 0.0628863, + "balance_loss_mlp": 0.0126296, + "epoch": 0.30356230272057716, + "flos": 16149133077120.0, + "grad_norm": 2.0386560470204804, + "language_loss": 0.75622666, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83389747, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16052246, + "step": 5049, + "time_per_iteration": 2.516463279724121 + }, + { + "auxiliary_loss_clip": 0.06494351, + "auxiliary_loss_mlp": 0.01274287, + "balance_loss_clip": 0.06289351, + "balance_loss_mlp": 0.01257597, + "epoch": 0.3036224259732452, + "flos": 28922539115520.0, + "grad_norm": 1.525083803020086, + "language_loss": 0.82698894, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.90467536, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.16687012, + "step": 5050, + "time_per_iteration": 2.558199405670166 + }, + { + "auxiliary_loss_clip": 0.0649763, + "auxiliary_loss_mlp": 0.01273759, + "balance_loss_clip": 0.06295735, + "balance_loss_mlp": 0.01256617, + "epoch": 0.30368254922591315, + "flos": 21111943501440.0, + "grad_norm": 2.311701267026144, + "language_loss": 0.74346399, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82117784, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17150879, + "step": 5051, + "time_per_iteration": 2.530457019805908 + }, + { + "auxiliary_loss_clip": 0.06494159, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06292571, + "balance_loss_mlp": 0.01260339, + "epoch": 0.3037426724785811, + "flos": 23008859758080.0, + "grad_norm": 1.7255753861859113, + "language_loss": 0.76444, + "learning_rate": 3.264086103483033e-06, + "loss": 0.84215784, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17297363, + "step": 5052, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.06501957, + "auxiliary_loss_mlp": 0.01280226, + "balance_loss_clip": 0.06295583, + "balance_loss_mlp": 0.01262332, + "epoch": 0.3038027957312491, + "flos": 15638129752320.0, + "grad_norm": 1.9820354931454651, + "language_loss": 0.83096367, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.90878546, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17871094, + "step": 5053, + "time_per_iteration": 2.5384886264801025 + }, + { + "auxiliary_loss_clip": 0.06489826, + "auxiliary_loss_mlp": 0.0127909, + "balance_loss_clip": 0.06288566, + "balance_loss_mlp": 0.01262174, + "epoch": 0.30386291898391704, + "flos": 12718955283840.0, + "grad_norm": 1.6755872357210637, + "language_loss": 0.7197504, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.79743958, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16906738, + "step": 5054, + "time_per_iteration": 2.4787559509277344 + }, + { + "auxiliary_loss_clip": 0.06500221, + "auxiliary_loss_mlp": 0.01282757, + "balance_loss_clip": 0.06298432, + "balance_loss_mlp": 0.01265805, + "epoch": 0.303923042236585, + "flos": 26366642023680.0, + "grad_norm": 1.8480883425842163, + "language_loss": 0.70137346, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.77920318, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16943359, + "step": 5055, + "time_per_iteration": 2.5929152965545654 + }, + { + "auxiliary_loss_clip": 0.06494389, + "auxiliary_loss_mlp": 0.01279452, + "balance_loss_clip": 0.0629337, + "balance_loss_mlp": 0.01262488, + "epoch": 0.30398316548925297, + "flos": 19725359736960.0, + "grad_norm": 2.1405790356583516, + "language_loss": 0.68347496, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.7612133, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16955566, + "step": 5056, + "time_per_iteration": 2.531677007675171 + }, + { + "auxiliary_loss_clip": 0.06490116, + "auxiliary_loss_mlp": 0.01281162, + "balance_loss_clip": 0.06292629, + "balance_loss_mlp": 0.01264377, + "epoch": 0.30404328874192094, + "flos": 24246124346880.0, + "grad_norm": 1.6503197514246037, + "language_loss": 0.83083463, + "learning_rate": 3.262576470461507e-06, + "loss": 0.9085474, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16796875, + "step": 5057, + "time_per_iteration": 2.5836069583892822 + }, + { + "auxiliary_loss_clip": 0.06484263, + "auxiliary_loss_mlp": 0.01272995, + "balance_loss_clip": 0.06286788, + "balance_loss_mlp": 0.01256603, + "epoch": 0.3041034119945889, + "flos": 24505881603840.0, + "grad_norm": 1.6860023663091837, + "language_loss": 0.89784855, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.97542113, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 5058, + "time_per_iteration": 2.589932918548584 + }, + { + "auxiliary_loss_clip": 0.06495658, + "auxiliary_loss_mlp": 0.01274369, + "balance_loss_clip": 0.06294262, + "balance_loss_mlp": 0.01256524, + "epoch": 0.30416353524725687, + "flos": 28295689121280.0, + "grad_norm": 2.5117349508823392, + "language_loss": 0.71471179, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.79241204, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17858887, + "step": 5059, + "time_per_iteration": 2.5827505588531494 + }, + { + "auxiliary_loss_clip": 0.06486548, + "auxiliary_loss_mlp": 0.01273567, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01257367, + "epoch": 0.30422365849992483, + "flos": 23667295541760.0, + "grad_norm": 1.868956784724377, + "language_loss": 0.73344606, + "learning_rate": 3.26167011603268e-06, + "loss": 0.8110472, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16174316, + "step": 5060, + "time_per_iteration": 2.624408006668091 + }, + { + "auxiliary_loss_clip": 0.06490071, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06289257, + "balance_loss_mlp": 0.01257451, + "epoch": 0.3042837817525928, + "flos": 23004750908160.0, + "grad_norm": 1.75217091558972, + "language_loss": 0.7751621, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.85279948, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.16210938, + "step": 5061, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.06496524, + "auxiliary_loss_mlp": 0.01274148, + "balance_loss_clip": 0.06292392, + "balance_loss_mlp": 0.01256362, + "epoch": 0.30434390500526076, + "flos": 22087438335360.0, + "grad_norm": 2.647933932315435, + "language_loss": 0.8275395, + "learning_rate": 3.261065640514415e-06, + "loss": 0.90524626, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17773438, + "step": 5062, + "time_per_iteration": 2.5313212871551514 + }, + { + "auxiliary_loss_clip": 0.06485732, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06286077, + "balance_loss_mlp": 0.01253689, + "epoch": 0.3044040282579287, + "flos": 25490516532480.0, + "grad_norm": 1.803893214603413, + "language_loss": 0.74348861, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.82104707, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16394043, + "step": 5063, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.0649004, + "auxiliary_loss_mlp": 0.01274813, + "balance_loss_clip": 0.06291289, + "balance_loss_mlp": 0.01256753, + "epoch": 0.30446415151059675, + "flos": 21952080864000.0, + "grad_norm": 1.6090072895521823, + "language_loss": 0.84824491, + "learning_rate": 3.26046097371721e-06, + "loss": 0.92589343, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.18066406, + "step": 5064, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.06490266, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 0.06290541, + "balance_loss_mlp": 0.0125644, + "epoch": 0.3045242747632647, + "flos": 16440979248000.0, + "grad_norm": 2.1763674367183965, + "language_loss": 0.76565492, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17578125, + "step": 5065, + "time_per_iteration": 2.50644588470459 + }, + { + "auxiliary_loss_clip": 0.06490786, + "auxiliary_loss_mlp": 0.01279051, + "balance_loss_clip": 0.06288782, + "balance_loss_mlp": 0.01260586, + "epoch": 0.3045843980159327, + "flos": 31548399966720.0, + "grad_norm": 1.8114152917186497, + "language_loss": 0.62859941, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.70629776, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.18469238, + "step": 5066, + "time_per_iteration": 2.6319751739501953 + }, + { + "auxiliary_loss_clip": 0.06499436, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 0.0629437, + "balance_loss_mlp": 0.01255602, + "epoch": 0.30464452126860064, + "flos": 17858645677440.0, + "grad_norm": 2.0549933694905653, + "language_loss": 0.82941914, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.90714514, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17565918, + "step": 5067, + "time_per_iteration": 2.483863592147827 + }, + { + "auxiliary_loss_clip": 0.06485019, + "auxiliary_loss_mlp": 0.0127176, + "balance_loss_clip": 0.06289113, + "balance_loss_mlp": 0.01255643, + "epoch": 0.3047046445212686, + "flos": 20637682992000.0, + "grad_norm": 1.9234738451458053, + "language_loss": 0.63749218, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71506, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.16113281, + "step": 5068, + "time_per_iteration": 2.5133988857269287 + }, + { + "auxiliary_loss_clip": 0.06487909, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 0.06291264, + "balance_loss_mlp": 0.01256884, + "epoch": 0.3047647677739366, + "flos": 21293896642560.0, + "grad_norm": 1.767828765686575, + "language_loss": 0.75521863, + "learning_rate": 3.258948470480793e-06, + "loss": 0.8328287, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.1619873, + "step": 5069, + "time_per_iteration": 2.5039985179901123 + }, + { + "auxiliary_loss_clip": 0.06492448, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01255047, + "epoch": 0.30482489102660454, + "flos": 21002218179840.0, + "grad_norm": 2.053197356954631, + "language_loss": 0.76551294, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84314346, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.15551758, + "step": 5070, + "time_per_iteration": 2.56703519821167 + }, + { + "auxiliary_loss_clip": 0.06501058, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 0.06296416, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3048850142792725, + "flos": 26298732689280.0, + "grad_norm": 1.581704774716999, + "language_loss": 0.82567108, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.90344059, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.18139648, + "step": 5071, + "time_per_iteration": 3.9534900188446045 + }, + { + "auxiliary_loss_clip": 0.06502657, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06296133, + "balance_loss_mlp": 0.01253374, + "epoch": 0.30494513753194047, + "flos": 22352813815680.0, + "grad_norm": 1.6603887086526505, + "language_loss": 0.76386344, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.84159869, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17492676, + "step": 5072, + "time_per_iteration": 3.9736859798431396 + }, + { + "auxiliary_loss_clip": 0.06492919, + "auxiliary_loss_mlp": 0.01277102, + "balance_loss_clip": 0.06293403, + "balance_loss_mlp": 0.01260544, + "epoch": 0.30500526078460843, + "flos": 19543909720320.0, + "grad_norm": 1.870095200943675, + "language_loss": 0.71741343, + "learning_rate": 3.257737608512723e-06, + "loss": 0.79511362, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16564941, + "step": 5073, + "time_per_iteration": 3.961787700653076 + }, + { + "auxiliary_loss_clip": 0.064973, + "auxiliary_loss_mlp": 0.01276358, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259752, + "epoch": 0.3050653840372764, + "flos": 14470577360640.0, + "grad_norm": 2.0196062448027843, + "language_loss": 0.76699424, + "learning_rate": 3.257434773758163e-06, + "loss": 0.84473085, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16601562, + "step": 5074, + "time_per_iteration": 2.498986005783081 + }, + { + "auxiliary_loss_clip": 0.06498405, + "auxiliary_loss_mlp": 0.01271199, + "balance_loss_clip": 0.06298129, + "balance_loss_mlp": 0.01254534, + "epoch": 0.30512550728994436, + "flos": 24250736321280.0, + "grad_norm": 2.0830863268570496, + "language_loss": 0.75075227, + "learning_rate": 3.25713189132155e-06, + "loss": 0.8284483, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16662598, + "step": 5075, + "time_per_iteration": 2.586857557296753 + }, + { + "auxiliary_loss_clip": 0.06500411, + "auxiliary_loss_mlp": 0.01274386, + "balance_loss_clip": 0.06294686, + "balance_loss_mlp": 0.01256004, + "epoch": 0.30518563054261233, + "flos": 16365774608640.0, + "grad_norm": 1.8100237719305525, + "language_loss": 0.75655556, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.8343035, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.18371582, + "step": 5076, + "time_per_iteration": 2.4945309162139893 + }, + { + "auxiliary_loss_clip": 0.06496741, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06296699, + "balance_loss_mlp": 0.01252712, + "epoch": 0.30524575379528035, + "flos": 21585952448640.0, + "grad_norm": 4.173383760279569, + "language_loss": 0.79782987, + "learning_rate": 3.25652598344811e-06, + "loss": 0.87550437, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17993164, + "step": 5077, + "time_per_iteration": 2.534932851791382 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01270916, + "balance_loss_clip": 0.06295882, + "balance_loss_mlp": 0.01254012, + "epoch": 0.3053058770479483, + "flos": 16550872277760.0, + "grad_norm": 2.5701417949840146, + "language_loss": 0.7555238, + "learning_rate": 3.256222958034259e-06, + "loss": 0.83312857, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16894531, + "step": 5078, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.06495726, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06297612, + "balance_loss_mlp": 0.01262487, + "epoch": 0.3053660003006163, + "flos": 12317844988800.0, + "grad_norm": 1.8416681282179364, + "language_loss": 0.67517591, + "learning_rate": 3.255919884984307e-06, + "loss": 0.75292945, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.17126465, + "step": 5079, + "time_per_iteration": 3.8981266021728516 + }, + { + "auxiliary_loss_clip": 0.06496017, + "auxiliary_loss_mlp": 0.01271448, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.01253757, + "epoch": 0.30542612355328425, + "flos": 23118962423040.0, + "grad_norm": 1.7235884914338329, + "language_loss": 0.8044346, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.88210917, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17687988, + "step": 5080, + "time_per_iteration": 2.562946081161499 + }, + { + "auxiliary_loss_clip": 0.06497588, + "auxiliary_loss_mlp": 0.01276495, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.01259377, + "epoch": 0.3054862468059522, + "flos": 24396365917440.0, + "grad_norm": 2.5665035909877725, + "language_loss": 0.81653202, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89427292, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17114258, + "step": 5081, + "time_per_iteration": 2.6026763916015625 + }, + { + "auxiliary_loss_clip": 0.06490453, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01253962, + "epoch": 0.3055463700586202, + "flos": 29393529315840.0, + "grad_norm": 1.580638075296793, + "language_loss": 0.72516012, + "learning_rate": 3.255010380132783e-06, + "loss": 0.80277044, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16619873, + "step": 5082, + "time_per_iteration": 2.650310516357422 + }, + { + "auxiliary_loss_clip": 0.06499462, + "auxiliary_loss_mlp": 0.01274957, + "balance_loss_clip": 0.06293429, + "balance_loss_mlp": 0.01257159, + "epoch": 0.30560649331128814, + "flos": 25598606699520.0, + "grad_norm": 2.3807589086926533, + "language_loss": 0.73733467, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.81507885, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.595439910888672 + }, + { + "auxiliary_loss_clip": 0.06488115, + "auxiliary_loss_mlp": 0.01272372, + "balance_loss_clip": 0.0628676, + "balance_loss_mlp": 0.01254729, + "epoch": 0.3056666165639561, + "flos": 19133156206080.0, + "grad_norm": 1.8141392710911106, + "language_loss": 0.71165347, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78925836, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17626953, + "step": 5084, + "time_per_iteration": 2.499873161315918 + }, + { + "auxiliary_loss_clip": 0.06505337, + "auxiliary_loss_mlp": 0.01276239, + "balance_loss_clip": 0.063004, + "balance_loss_mlp": 0.01260194, + "epoch": 0.30572673981662407, + "flos": 15529368752640.0, + "grad_norm": 2.0821129981034567, + "language_loss": 0.79337353, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.87118936, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.16027832, + "step": 5085, + "time_per_iteration": 2.479790449142456 + }, + { + "auxiliary_loss_clip": 0.06486039, + "auxiliary_loss_mlp": 0.01278912, + "balance_loss_clip": 0.06289506, + "balance_loss_mlp": 0.01260602, + "epoch": 0.30578686306929204, + "flos": 21512886088320.0, + "grad_norm": 2.123366644532801, + "language_loss": 0.78524947, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.86289901, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.18310547, + "step": 5086, + "time_per_iteration": 2.5372772216796875 + }, + { + "auxiliary_loss_clip": 0.06487311, + "auxiliary_loss_mlp": 0.01277834, + "balance_loss_clip": 0.06289313, + "balance_loss_mlp": 0.01259797, + "epoch": 0.30584698632196, + "flos": 20959689432960.0, + "grad_norm": 1.7535206397091907, + "language_loss": 0.77160186, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8492533, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18041992, + "step": 5087, + "time_per_iteration": 2.4971578121185303 + }, + { + "auxiliary_loss_clip": 0.06492934, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06288779, + "balance_loss_mlp": 0.01258154, + "epoch": 0.30590710957462797, + "flos": 24688044380160.0, + "grad_norm": 1.802467786704899, + "language_loss": 0.7266196, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.80432141, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.19091797, + "step": 5088, + "time_per_iteration": 2.5416259765625 + }, + { + "auxiliary_loss_clip": 0.06501624, + "auxiliary_loss_mlp": 0.0127311, + "balance_loss_clip": 0.06292014, + "balance_loss_mlp": 0.01253893, + "epoch": 0.30596723282729593, + "flos": 17091700456320.0, + "grad_norm": 2.3226252492467037, + "language_loss": 0.79702371, + "learning_rate": 3.252886537028521e-06, + "loss": 0.874771, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19226074, + "step": 5089, + "time_per_iteration": 2.4745559692382812 + }, + { + "auxiliary_loss_clip": 0.06491631, + "auxiliary_loss_mlp": 0.01275196, + "balance_loss_clip": 0.06291364, + "balance_loss_mlp": 0.01256981, + "epoch": 0.30602735607996395, + "flos": 22863775213440.0, + "grad_norm": 6.857787253608019, + "language_loss": 0.77299303, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.85066134, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 5090, + "time_per_iteration": 2.5330631732940674 + }, + { + "auxiliary_loss_clip": 0.06500913, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.06295903, + "balance_loss_mlp": 0.01260773, + "epoch": 0.3060874793326319, + "flos": 29869173417600.0, + "grad_norm": 1.854909004407163, + "language_loss": 0.76970392, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.84750324, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18237305, + "step": 5091, + "time_per_iteration": 2.561894178390503 + }, + { + "auxiliary_loss_clip": 0.06491988, + "auxiliary_loss_mlp": 0.01272552, + "balance_loss_clip": 0.06287533, + "balance_loss_mlp": 0.01254551, + "epoch": 0.3061476025852999, + "flos": 20454765528960.0, + "grad_norm": 1.7300285931862276, + "language_loss": 0.72878456, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18005371, + "step": 5092, + "time_per_iteration": 2.5661561489105225 + }, + { + "auxiliary_loss_clip": 0.06495406, + "auxiliary_loss_mlp": 0.01276172, + "balance_loss_clip": 0.06294402, + "balance_loss_mlp": 0.01258696, + "epoch": 0.30620772583796785, + "flos": 19397651218560.0, + "grad_norm": 1.8286917674158676, + "language_loss": 0.83293521, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.91065109, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.17468262, + "step": 5093, + "time_per_iteration": 2.49686336517334 + }, + { + "auxiliary_loss_clip": 0.06495437, + "auxiliary_loss_mlp": 0.01277069, + "balance_loss_clip": 0.06295857, + "balance_loss_mlp": 0.01259652, + "epoch": 0.3062678490906358, + "flos": 24031411459200.0, + "grad_norm": 1.7386581048181018, + "language_loss": 0.74963737, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82736242, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17419434, + "step": 5094, + "time_per_iteration": 2.5497004985809326 + }, + { + "auxiliary_loss_clip": 0.06491575, + "auxiliary_loss_mlp": 0.01272234, + "balance_loss_clip": 0.06293601, + "balance_loss_mlp": 0.01255735, + "epoch": 0.3063279723433038, + "flos": 19760593178880.0, + "grad_norm": 1.8971341227661025, + "language_loss": 0.76389223, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84153032, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16503906, + "step": 5095, + "time_per_iteration": 2.493479013442993 + }, + { + "auxiliary_loss_clip": 0.06485657, + "auxiliary_loss_mlp": 0.0128124, + "balance_loss_clip": 0.06288686, + "balance_loss_mlp": 0.01262727, + "epoch": 0.30638809559597174, + "flos": 22455663102720.0, + "grad_norm": 1.6310889817091494, + "language_loss": 0.81246006, + "learning_rate": 3.250760365955042e-06, + "loss": 0.89012897, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.18518066, + "step": 5096, + "time_per_iteration": 2.606100559234619 + }, + { + "auxiliary_loss_clip": 0.06500001, + "auxiliary_loss_mlp": 0.01286183, + "balance_loss_clip": 0.06297529, + "balance_loss_mlp": 0.01269947, + "epoch": 0.3064482188486397, + "flos": 17170846237440.0, + "grad_norm": 2.1701963694762862, + "language_loss": 0.81871414, + "learning_rate": 3.250456437422258e-06, + "loss": 0.89657605, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.16235352, + "step": 5097, + "time_per_iteration": 2.506908893585205 + }, + { + "auxiliary_loss_clip": 0.06498241, + "auxiliary_loss_mlp": 0.01288982, + "balance_loss_clip": 0.06297113, + "balance_loss_mlp": 0.01269647, + "epoch": 0.3065083421013077, + "flos": 23775176073600.0, + "grad_norm": 2.1266024193404385, + "language_loss": 0.7855283, + "learning_rate": 3.250152461472041e-06, + "loss": 0.86340058, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.19335938, + "step": 5098, + "time_per_iteration": 2.546875238418579 + }, + { + "auxiliary_loss_clip": 0.06494713, + "auxiliary_loss_mlp": 0.01291897, + "balance_loss_clip": 0.06296527, + "balance_loss_mlp": 0.0127367, + "epoch": 0.30656846535397564, + "flos": 26438953697280.0, + "grad_norm": 1.8261556885246946, + "language_loss": 0.84430897, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92217511, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.18225098, + "step": 5099, + "time_per_iteration": 2.5726583003997803 + }, + { + "auxiliary_loss_clip": 0.06498358, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06295489, + "balance_loss_mlp": 0.01268434, + "epoch": 0.3066285886066436, + "flos": 26659117100160.0, + "grad_norm": 1.588615118025773, + "language_loss": 0.86241573, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.94027227, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.18859863, + "step": 5100, + "time_per_iteration": 2.5711421966552734 + }, + { + "auxiliary_loss_clip": 0.06496789, + "auxiliary_loss_mlp": 0.01283562, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01264345, + "epoch": 0.30668871185931157, + "flos": 15055443659520.0, + "grad_norm": 1.7244173580954059, + "language_loss": 0.79369497, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87149858, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.19226074, + "step": 5101, + "time_per_iteration": 2.539132833480835 + }, + { + "auxiliary_loss_clip": 0.0650195, + "auxiliary_loss_mlp": 0.01287055, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01268172, + "epoch": 0.30674883511197953, + "flos": 20087966280960.0, + "grad_norm": 1.7739241542858428, + "language_loss": 0.80435872, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.88224876, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.1887207, + "step": 5102, + "time_per_iteration": 2.5558016300201416 + }, + { + "auxiliary_loss_clip": 0.06503183, + "auxiliary_loss_mlp": 0.01284648, + "balance_loss_clip": 0.06301928, + "balance_loss_mlp": 0.01265253, + "epoch": 0.30680895836464755, + "flos": 22900518028800.0, + "grad_norm": 1.6865927559982214, + "language_loss": 0.89335668, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.97123504, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19396973, + "step": 5103, + "time_per_iteration": 2.542555570602417 + }, + { + "auxiliary_loss_clip": 0.06501935, + "auxiliary_loss_mlp": 0.01286618, + "balance_loss_clip": 0.06302223, + "balance_loss_mlp": 0.0126876, + "epoch": 0.3068690816173155, + "flos": 23702948254080.0, + "grad_norm": 2.119732369805114, + "language_loss": 0.74448419, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82236969, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17871094, + "step": 5104, + "time_per_iteration": 2.560253143310547 + }, + { + "auxiliary_loss_clip": 0.06502049, + "auxiliary_loss_mlp": 0.01274873, + "balance_loss_clip": 0.06295487, + "balance_loss_mlp": 0.01257552, + "epoch": 0.3069292048699835, + "flos": 23557947563520.0, + "grad_norm": 1.7334515387821061, + "language_loss": 0.72909176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80686092, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17321777, + "step": 5105, + "time_per_iteration": 2.5751454830169678 + }, + { + "auxiliary_loss_clip": 0.06498945, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 0.06297372, + "balance_loss_mlp": 0.01263907, + "epoch": 0.30698932812265145, + "flos": 24537970517760.0, + "grad_norm": 2.0977567017321608, + "language_loss": 0.87578112, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.95359075, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18103027, + "step": 5106, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.06503764, + "auxiliary_loss_mlp": 0.01279082, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01261189, + "epoch": 0.3070494513753194, + "flos": 21002805158400.0, + "grad_norm": 2.310425767564757, + "language_loss": 0.72092319, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.79875165, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17883301, + "step": 5107, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.06493405, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06294269, + "balance_loss_mlp": 0.01256735, + "epoch": 0.3071095746279874, + "flos": 19031942073600.0, + "grad_norm": 1.99593781887154, + "language_loss": 0.72653455, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80422449, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.18847656, + "step": 5108, + "time_per_iteration": 2.497788190841675 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 0.06297708, + "balance_loss_mlp": 0.01259533, + "epoch": 0.30716969788065535, + "flos": 21221962312320.0, + "grad_norm": 1.48656392648579, + "language_loss": 0.86441541, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94217712, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17578125, + "step": 5109, + "time_per_iteration": 2.563480854034424 + }, + { + "auxiliary_loss_clip": 0.06501789, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.063005, + "balance_loss_mlp": 0.01260541, + "epoch": 0.3072298211333233, + "flos": 25779385883520.0, + "grad_norm": 1.8235353484155168, + "language_loss": 0.67904091, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.75684446, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18029785, + "step": 5110, + "time_per_iteration": 3.9785540103912354 + }, + { + "auxiliary_loss_clip": 0.06493396, + "auxiliary_loss_mlp": 0.01273369, + "balance_loss_clip": 0.06295427, + "balance_loss_mlp": 0.01256727, + "epoch": 0.3072899443859913, + "flos": 25856099896320.0, + "grad_norm": 1.4123986071879864, + "language_loss": 0.76984161, + "learning_rate": 3.246196464379919e-06, + "loss": 0.84750926, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16638184, + "step": 5111, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.06498265, + "auxiliary_loss_mlp": 0.01277301, + "balance_loss_clip": 0.06293567, + "balance_loss_mlp": 0.01258585, + "epoch": 0.30735006763865924, + "flos": 25930130578560.0, + "grad_norm": 2.349951455822933, + "language_loss": 0.67755288, + "learning_rate": 3.245891825796765e-06, + "loss": 0.75530857, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 5112, + "time_per_iteration": 3.963136672973633 + }, + { + "auxiliary_loss_clip": 0.0650286, + "auxiliary_loss_mlp": 0.01277737, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01257614, + "epoch": 0.3074101908913272, + "flos": 30924442938240.0, + "grad_norm": 2.270303220058131, + "language_loss": 0.79939896, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.87720484, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.20117188, + "step": 5113, + "time_per_iteration": 4.084795236587524 + }, + { + "auxiliary_loss_clip": 0.06502695, + "auxiliary_loss_mlp": 0.01276516, + "balance_loss_clip": 0.06297943, + "balance_loss_mlp": 0.01258599, + "epoch": 0.30747031414399517, + "flos": 18406182182400.0, + "grad_norm": 2.072714063381377, + "language_loss": 0.77269047, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.85048258, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17919922, + "step": 5114, + "time_per_iteration": 2.4906773567199707 + }, + { + "auxiliary_loss_clip": 0.06498024, + "auxiliary_loss_mlp": 0.01283612, + "balance_loss_clip": 0.06298083, + "balance_loss_mlp": 0.01265087, + "epoch": 0.30753043739666314, + "flos": 22638957909120.0, + "grad_norm": 1.8131309248321845, + "language_loss": 0.62640405, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70422041, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.18530273, + "step": 5115, + "time_per_iteration": 2.5328574180603027 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06295817, + "balance_loss_mlp": 0.0125513, + "epoch": 0.3075905606493311, + "flos": 27351360806400.0, + "grad_norm": 1.7894066300170501, + "language_loss": 0.83589995, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91363406, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.19213867, + "step": 5116, + "time_per_iteration": 2.562014102935791 + }, + { + "auxiliary_loss_clip": 0.06500115, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06298394, + "balance_loss_mlp": 0.0125512, + "epoch": 0.3076506839019991, + "flos": 22097333116800.0, + "grad_norm": 1.8649453582041782, + "language_loss": 0.76016742, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.18322754, + "step": 5117, + "time_per_iteration": 2.5509209632873535 + }, + { + "auxiliary_loss_clip": 0.06498168, + "auxiliary_loss_mlp": 0.01274202, + "balance_loss_clip": 0.0629583, + "balance_loss_mlp": 0.01256142, + "epoch": 0.3077108071546671, + "flos": 21296160702720.0, + "grad_norm": 2.167097847201453, + "language_loss": 0.72108531, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79880905, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.18054199, + "step": 5118, + "time_per_iteration": 2.5190913677215576 + }, + { + "auxiliary_loss_clip": 0.06502286, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01258198, + "epoch": 0.30777093040733505, + "flos": 21436884835200.0, + "grad_norm": 2.760855389686565, + "language_loss": 0.74956095, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82734126, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17553711, + "step": 5119, + "time_per_iteration": 3.973721981048584 + }, + { + "auxiliary_loss_clip": 0.06494488, + "auxiliary_loss_mlp": 0.01279388, + "balance_loss_clip": 0.06289928, + "balance_loss_mlp": 0.01259814, + "epoch": 0.307831053660003, + "flos": 23156040654720.0, + "grad_norm": 1.7924264386276263, + "language_loss": 0.80264926, + "learning_rate": 3.243453017305926e-06, + "loss": 0.88038802, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 5120, + "time_per_iteration": 2.54705548286438 + }, + { + "auxiliary_loss_clip": 0.06492078, + "auxiliary_loss_mlp": 0.01273208, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01255445, + "epoch": 0.307891176912671, + "flos": 17025510130560.0, + "grad_norm": 1.642273509687288, + "language_loss": 0.80521786, + "learning_rate": 3.24314795393977e-06, + "loss": 0.88287073, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 5121, + "time_per_iteration": 2.515054702758789 + }, + { + "auxiliary_loss_clip": 0.06496292, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06298114, + "balance_loss_mlp": 0.01256875, + "epoch": 0.30795130016533895, + "flos": 27711745217280.0, + "grad_norm": 1.3913461280715187, + "language_loss": 0.82847351, + "learning_rate": 3.242842843433319e-06, + "loss": 0.90618169, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17651367, + "step": 5122, + "time_per_iteration": 2.5832252502441406 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.0632116, + "balance_loss_mlp": 0.01249526, + "epoch": 0.3080114234180069, + "flos": 69080973373440.0, + "grad_norm": 0.7221499072225652, + "language_loss": 0.58650029, + "learning_rate": 3.242537685798143e-06, + "loss": 0.66319263, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.03341675, + "step": 5123, + "time_per_iteration": 3.3316402435302734 + }, + { + "auxiliary_loss_clip": 0.06503562, + "auxiliary_loss_mlp": 0.01279925, + "balance_loss_clip": 0.06296872, + "balance_loss_mlp": 0.01260744, + "epoch": 0.3080715466706749, + "flos": 24066938390400.0, + "grad_norm": 1.6584153298959496, + "language_loss": 0.83586073, + "learning_rate": 3.242232481045813e-06, + "loss": 0.91369557, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1920166, + "step": 5124, + "time_per_iteration": 2.589906930923462 + }, + { + "auxiliary_loss_clip": 0.06498908, + "auxiliary_loss_mlp": 0.01271737, + "balance_loss_clip": 0.06294107, + "balance_loss_mlp": 0.01253629, + "epoch": 0.30813166992334284, + "flos": 25855806407040.0, + "grad_norm": 2.061271988083176, + "language_loss": 0.79248756, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.87019402, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.1809082, + "step": 5125, + "time_per_iteration": 2.550884485244751 + }, + { + "auxiliary_loss_clip": 0.06501068, + "auxiliary_loss_mlp": 0.012774, + "balance_loss_clip": 0.06292764, + "balance_loss_mlp": 0.01258374, + "epoch": 0.3081917931760108, + "flos": 20455981413120.0, + "grad_norm": 2.085029494567846, + "language_loss": 0.64930958, + "learning_rate": 3.241621930235989e-06, + "loss": 0.72709423, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.19018555, + "step": 5126, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.06490224, + "auxiliary_loss_mlp": 0.01277045, + "balance_loss_clip": 0.06294391, + "balance_loss_mlp": 0.01259533, + "epoch": 0.3082519164286788, + "flos": 22173208588800.0, + "grad_norm": 1.5681866965441809, + "language_loss": 0.87117672, + "learning_rate": 3.241316584201646e-06, + "loss": 0.94884944, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.17504883, + "step": 5127, + "time_per_iteration": 2.567615270614624 + }, + { + "auxiliary_loss_clip": 0.0649047, + "auxiliary_loss_mlp": 0.01273562, + "balance_loss_clip": 0.06291968, + "balance_loss_mlp": 0.0125593, + "epoch": 0.30831203968134674, + "flos": 28921029742080.0, + "grad_norm": 1.4544126326452276, + "language_loss": 0.69282925, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.77046961, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.1763916, + "step": 5128, + "time_per_iteration": 2.6129322052001953 + }, + { + "auxiliary_loss_clip": 0.06499469, + "auxiliary_loss_mlp": 0.01276178, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01257843, + "epoch": 0.3083721629340147, + "flos": 25675069150080.0, + "grad_norm": 2.0282558045061396, + "language_loss": 0.7195785, + "learning_rate": 3.240705750931993e-06, + "loss": 0.79733503, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 5129, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.06388761, + "auxiliary_loss_mlp": 0.01275431, + "balance_loss_clip": 0.06292662, + "balance_loss_mlp": 0.01271816, + "epoch": 0.3084322861866827, + "flos": 68233666487040.0, + "grad_norm": 0.8077979927321801, + "language_loss": 0.58935201, + "learning_rate": 3.240400263719846e-06, + "loss": 0.66599393, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03607178, + "step": 5130, + "time_per_iteration": 3.2353098392486572 + }, + { + "auxiliary_loss_clip": 0.06498231, + "auxiliary_loss_mlp": 0.012758, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01258443, + "epoch": 0.3084924094393507, + "flos": 20301630992640.0, + "grad_norm": 2.071340626605126, + "language_loss": 0.73298538, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.81072569, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17370605, + "step": 5131, + "time_per_iteration": 2.523510456085205 + }, + { + "auxiliary_loss_clip": 0.06487547, + "auxiliary_loss_mlp": 0.01274811, + "balance_loss_clip": 0.06290068, + "balance_loss_mlp": 0.01257728, + "epoch": 0.30855253269201866, + "flos": 23956374528000.0, + "grad_norm": 1.6208223340220833, + "language_loss": 0.71358359, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.79120713, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17077637, + "step": 5132, + "time_per_iteration": 2.581470012664795 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01273323, + "balance_loss_clip": 0.06290212, + "balance_loss_mlp": 0.01255262, + "epoch": 0.3086126559446866, + "flos": 19288009751040.0, + "grad_norm": 1.7801590489825803, + "language_loss": 0.90374929, + "learning_rate": 3.239483519913136e-06, + "loss": 0.98135513, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.18066406, + "step": 5133, + "time_per_iteration": 2.5197763442993164 + }, + { + "auxiliary_loss_clip": 0.06499831, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06295495, + "balance_loss_mlp": 0.01257105, + "epoch": 0.3086727791973546, + "flos": 33768328913280.0, + "grad_norm": 1.8524807236065886, + "language_loss": 0.67443442, + "learning_rate": 3.239177844626102e-06, + "loss": 0.75218379, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 5134, + "time_per_iteration": 2.664303779602051 + }, + { + "auxiliary_loss_clip": 0.06498815, + "auxiliary_loss_mlp": 0.01275704, + "balance_loss_clip": 0.06293166, + "balance_loss_mlp": 0.01257167, + "epoch": 0.30873290245002255, + "flos": 16039659317760.0, + "grad_norm": 1.8927812104332384, + "language_loss": 0.83517784, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91292304, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18518066, + "step": 5135, + "time_per_iteration": 2.505138397216797 + }, + { + "auxiliary_loss_clip": 0.06377634, + "auxiliary_loss_mlp": 0.01258895, + "balance_loss_clip": 0.06282344, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3087930257026905, + "flos": 65070415474560.0, + "grad_norm": 0.6863645266912056, + "language_loss": 0.55337238, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.62973773, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.0329895, + "step": 5136, + "time_per_iteration": 3.179166555404663 + }, + { + "auxiliary_loss_clip": 0.06488921, + "auxiliary_loss_mlp": 0.01274465, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.01257085, + "epoch": 0.3088531489553585, + "flos": 74754001733760.0, + "grad_norm": 1.8635236180899502, + "language_loss": 0.76610464, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.8437385, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.1739502, + "step": 5137, + "time_per_iteration": 2.9993999004364014 + }, + { + "auxiliary_loss_clip": 0.06489644, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01255458, + "epoch": 0.30891327220802645, + "flos": 21148686316800.0, + "grad_norm": 1.7480087539569926, + "language_loss": 0.80450445, + "learning_rate": 3.237954673696424e-06, + "loss": 0.882128, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17248535, + "step": 5138, + "time_per_iteration": 2.531916856765747 + }, + { + "auxiliary_loss_clip": 0.06496161, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06294001, + "balance_loss_mlp": 0.01258896, + "epoch": 0.3089733954606944, + "flos": 25671295716480.0, + "grad_norm": 1.629930216805369, + "language_loss": 0.81626344, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.89398789, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.1739502, + "step": 5139, + "time_per_iteration": 2.585380792617798 + }, + { + "auxiliary_loss_clip": 0.06501773, + "auxiliary_loss_mlp": 0.01277306, + "balance_loss_clip": 0.06292425, + "balance_loss_mlp": 0.01258817, + "epoch": 0.3090335187133624, + "flos": 19433429712000.0, + "grad_norm": 2.0033599705043854, + "language_loss": 0.77724934, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.85504013, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18481445, + "step": 5140, + "time_per_iteration": 2.504387617111206 + }, + { + "auxiliary_loss_clip": 0.06482549, + "auxiliary_loss_mlp": 0.01272919, + "balance_loss_clip": 0.06290817, + "balance_loss_mlp": 0.0125741, + "epoch": 0.30909364196603034, + "flos": 20017541324160.0, + "grad_norm": 1.9132937458234096, + "language_loss": 0.78916645, + "learning_rate": 3.237036802553252e-06, + "loss": 0.86672109, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 5141, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.06494773, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 0.06291379, + "balance_loss_mlp": 0.01260543, + "epoch": 0.3091537652186983, + "flos": 19682830990080.0, + "grad_norm": 2.2087235088394728, + "language_loss": 0.8789897, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.95671201, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16906738, + "step": 5142, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06498981, + "auxiliary_loss_mlp": 0.01276818, + "balance_loss_clip": 0.06294474, + "balance_loss_mlp": 0.01259438, + "epoch": 0.3092138884713663, + "flos": 17025845546880.0, + "grad_norm": 2.3473661014686984, + "language_loss": 0.7985431, + "learning_rate": 3.23642465389567e-06, + "loss": 0.87630117, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.17382812, + "step": 5143, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.06489455, + "auxiliary_loss_mlp": 0.01277055, + "balance_loss_clip": 0.06291586, + "balance_loss_mlp": 0.01260378, + "epoch": 0.3092740117240343, + "flos": 25017052636800.0, + "grad_norm": 1.6187717199492768, + "language_loss": 0.72479737, + "learning_rate": 3.236118509233055e-06, + "loss": 0.8024624, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16662598, + "step": 5144, + "time_per_iteration": 2.547358989715576 + }, + { + "auxiliary_loss_clip": 0.06496169, + "auxiliary_loss_mlp": 0.01272398, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01256138, + "epoch": 0.30933413497670226, + "flos": 25597013472000.0, + "grad_norm": 2.2714150562550466, + "language_loss": 0.74676621, + "learning_rate": 3.235812317696702e-06, + "loss": 0.82445192, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16271973, + "step": 5145, + "time_per_iteration": 2.6273365020751953 + }, + { + "auxiliary_loss_clip": 0.06490701, + "auxiliary_loss_mlp": 0.01273039, + "balance_loss_clip": 0.06289125, + "balance_loss_mlp": 0.01256296, + "epoch": 0.3093942582293702, + "flos": 24396617479680.0, + "grad_norm": 1.731689317121935, + "language_loss": 0.76830649, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.84594393, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.16729736, + "step": 5146, + "time_per_iteration": 2.5352702140808105 + }, + { + "auxiliary_loss_clip": 0.06485911, + "auxiliary_loss_mlp": 0.01273533, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.0125707, + "epoch": 0.3094543814820382, + "flos": 19652586865920.0, + "grad_norm": 1.8011449994622988, + "language_loss": 0.66675043, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.74434483, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16467285, + "step": 5147, + "time_per_iteration": 2.545940637588501 + }, + { + "auxiliary_loss_clip": 0.06492072, + "auxiliary_loss_mlp": 0.01271267, + "balance_loss_clip": 0.0628895, + "balance_loss_mlp": 0.01253731, + "epoch": 0.30951450473470615, + "flos": 25670499102720.0, + "grad_norm": 1.8580519203508368, + "language_loss": 0.74971956, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.82735288, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17529297, + "step": 5148, + "time_per_iteration": 2.5673537254333496 + }, + { + "auxiliary_loss_clip": 0.06501722, + "auxiliary_loss_mlp": 0.01278545, + "balance_loss_clip": 0.06290632, + "balance_loss_mlp": 0.01260342, + "epoch": 0.3095746279873741, + "flos": 12025202204160.0, + "grad_norm": 2.1335435485893166, + "language_loss": 0.73367, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.18212891, + "step": 5149, + "time_per_iteration": 2.682609796524048 + }, + { + "auxiliary_loss_clip": 0.06497431, + "auxiliary_loss_mlp": 0.01277143, + "balance_loss_clip": 0.06292653, + "balance_loss_mlp": 0.01258534, + "epoch": 0.3096347512400421, + "flos": 23629798039680.0, + "grad_norm": 1.913638713978071, + "language_loss": 0.85296845, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.93071413, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.18591309, + "step": 5150, + "time_per_iteration": 3.9813008308410645 + }, + { + "auxiliary_loss_clip": 0.06483387, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06285527, + "balance_loss_mlp": 0.01256815, + "epoch": 0.30969487449271005, + "flos": 22536024768000.0, + "grad_norm": 1.8960829077128427, + "language_loss": 0.79181123, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86938894, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.17565918, + "step": 5151, + "time_per_iteration": 2.5336477756500244 + }, + { + "auxiliary_loss_clip": 0.06493182, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.06291731, + "balance_loss_mlp": 0.01257426, + "epoch": 0.309754997745378, + "flos": 15273301075200.0, + "grad_norm": 2.079664023782487, + "language_loss": 0.67843604, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.75611162, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16931152, + "step": 5152, + "time_per_iteration": 5.332815647125244 + }, + { + "auxiliary_loss_clip": 0.06492282, + "auxiliary_loss_mlp": 0.01278303, + "balance_loss_clip": 0.06293005, + "balance_loss_mlp": 0.01261888, + "epoch": 0.309815120998046, + "flos": 26986532129280.0, + "grad_norm": 1.9990242894688834, + "language_loss": 0.83170605, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.90941191, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16394043, + "step": 5153, + "time_per_iteration": 2.5944862365722656 + }, + { + "auxiliary_loss_clip": 0.06488585, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.0125709, + "epoch": 0.30987524425071394, + "flos": 21149692565760.0, + "grad_norm": 1.7708804151784365, + "language_loss": 0.74136615, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.81899732, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.529526948928833 + }, + { + "auxiliary_loss_clip": 0.0648791, + "auxiliary_loss_mlp": 0.01284436, + "balance_loss_clip": 0.06292189, + "balance_loss_mlp": 0.01267544, + "epoch": 0.3099353675033819, + "flos": 15273720345600.0, + "grad_norm": 2.7515131151360763, + "language_loss": 0.76419097, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84191442, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16882324, + "step": 5155, + "time_per_iteration": 2.5338993072509766 + }, + { + "auxiliary_loss_clip": 0.06490543, + "auxiliary_loss_mlp": 0.01273122, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01256373, + "epoch": 0.30999549075604993, + "flos": 15419182233600.0, + "grad_norm": 1.684257178792462, + "language_loss": 0.79886794, + "learning_rate": 3.232441120452094e-06, + "loss": 0.87650466, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1673584, + "step": 5156, + "time_per_iteration": 2.5190272331237793 + }, + { + "auxiliary_loss_clip": 0.06493768, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 0.06290715, + "balance_loss_mlp": 0.01264821, + "epoch": 0.3100556140087179, + "flos": 23191106388480.0, + "grad_norm": 2.1803769191775197, + "language_loss": 0.74967813, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82743037, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16625977, + "step": 5157, + "time_per_iteration": 2.59045147895813 + }, + { + "auxiliary_loss_clip": 0.06486322, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06289537, + "balance_loss_mlp": 0.01258921, + "epoch": 0.31011573726138586, + "flos": 25749770664960.0, + "grad_norm": 2.4337865277632065, + "language_loss": 0.69860423, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7762109, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1541748, + "step": 5158, + "time_per_iteration": 4.041999578475952 + }, + { + "auxiliary_loss_clip": 0.06488799, + "auxiliary_loss_mlp": 0.0127365, + "balance_loss_clip": 0.0629247, + "balance_loss_mlp": 0.0125795, + "epoch": 0.3101758605140538, + "flos": 20017541324160.0, + "grad_norm": 2.0387737109261477, + "language_loss": 0.84883308, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92645758, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.15673828, + "step": 5159, + "time_per_iteration": 2.5081369876861572 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127455, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01257002, + "epoch": 0.3102359837667218, + "flos": 19141751249280.0, + "grad_norm": 1.926707434190644, + "language_loss": 0.85498118, + "learning_rate": 3.231213827702462e-06, + "loss": 0.93264508, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17529297, + "step": 5160, + "time_per_iteration": 2.5466468334198 + }, + { + "auxiliary_loss_clip": 0.06486624, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.06291263, + "balance_loss_mlp": 0.01253945, + "epoch": 0.31029610701938976, + "flos": 22270649287680.0, + "grad_norm": 1.6869427612303989, + "language_loss": 0.75787026, + "learning_rate": 3.230906887766584e-06, + "loss": 0.83543712, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.16113281, + "step": 5161, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.06491208, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 0.06289751, + "balance_loss_mlp": 0.01256915, + "epoch": 0.3103562302720577, + "flos": 20810244476160.0, + "grad_norm": 2.463900279304932, + "language_loss": 0.8222912, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.89995265, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.18029785, + "step": 5162, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.06485277, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06289959, + "balance_loss_mlp": 0.01253594, + "epoch": 0.3104163535247257, + "flos": 22350382047360.0, + "grad_norm": 1.4717884967200954, + "language_loss": 0.83087295, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.90841573, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.15423584, + "step": 5163, + "time_per_iteration": 2.542052745819092 + }, + { + "auxiliary_loss_clip": 0.06490193, + "auxiliary_loss_mlp": 0.01271791, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125559, + "epoch": 0.31047647677739365, + "flos": 21695803551360.0, + "grad_norm": 1.756895513371669, + "language_loss": 0.76630449, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84392428, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16186523, + "step": 5164, + "time_per_iteration": 2.5616652965545654 + }, + { + "auxiliary_loss_clip": 0.06486434, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.0628885, + "balance_loss_mlp": 0.01258331, + "epoch": 0.3105366000300616, + "flos": 18923390709120.0, + "grad_norm": 1.866784827400394, + "language_loss": 0.75307393, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83068419, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16271973, + "step": 5165, + "time_per_iteration": 2.5190699100494385 + }, + { + "auxiliary_loss_clip": 0.06483215, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.062862, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3105967232827296, + "flos": 18266380444800.0, + "grad_norm": 1.5432274368627708, + "language_loss": 0.76476973, + "learning_rate": 3.229371488178348e-06, + "loss": 0.84231985, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.16699219, + "step": 5166, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.06486712, + "auxiliary_loss_mlp": 0.01273485, + "balance_loss_clip": 0.06287863, + "balance_loss_mlp": 0.01256796, + "epoch": 0.31065684653539755, + "flos": 17677279514880.0, + "grad_norm": 2.119255684006569, + "language_loss": 0.74129677, + "learning_rate": 3.229064268360444e-06, + "loss": 0.81889874, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.16687012, + "step": 5167, + "time_per_iteration": 2.5039737224578857 + }, + { + "auxiliary_loss_clip": 0.06378125, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06284033, + "balance_loss_mlp": 0.01258356, + "epoch": 0.3107169697880655, + "flos": 68551522151040.0, + "grad_norm": 0.7172817016896729, + "language_loss": 0.53065968, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.60705864, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.03417969, + "step": 5168, + "time_per_iteration": 3.211498737335205 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127061, + "balance_loss_clip": 0.06290184, + "balance_loss_mlp": 0.01254052, + "epoch": 0.3107770930407335, + "flos": 13193844698880.0, + "grad_norm": 1.7226101243088363, + "language_loss": 0.79536855, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87299311, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16552734, + "step": 5169, + "time_per_iteration": 2.526906728744507 + }, + { + "auxiliary_loss_clip": 0.06491011, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.01254328, + "epoch": 0.3108372162934015, + "flos": 31589587048320.0, + "grad_norm": 1.7384868970357352, + "language_loss": 0.6439994, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.17077637, + "step": 5170, + "time_per_iteration": 2.659008264541626 + }, + { + "auxiliary_loss_clip": 0.06488822, + "auxiliary_loss_mlp": 0.01276189, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01258927, + "epoch": 0.31089733954606946, + "flos": 28737231811200.0, + "grad_norm": 2.2754975952460086, + "language_loss": 0.77238673, + "learning_rate": 3.22783492314295e-06, + "loss": 0.8500368, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17260742, + "step": 5171, + "time_per_iteration": 2.5726847648620605 + }, + { + "auxiliary_loss_clip": 0.06489364, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 0.06290348, + "balance_loss_mlp": 0.01258294, + "epoch": 0.3109574627987374, + "flos": 19689455462400.0, + "grad_norm": 1.774750718996553, + "language_loss": 0.84023309, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.91787583, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16625977, + "step": 5172, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.06485899, + "auxiliary_loss_mlp": 0.01271683, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3110175860514054, + "flos": 14689231390080.0, + "grad_norm": 2.444929493076507, + "language_loss": 0.8466565, + "learning_rate": 3.227219971129842e-06, + "loss": 0.92423236, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17199707, + "step": 5173, + "time_per_iteration": 2.477851629257202 + }, + { + "auxiliary_loss_clip": 0.06478094, + "auxiliary_loss_mlp": 0.01270979, + "balance_loss_clip": 0.06285643, + "balance_loss_mlp": 0.01255279, + "epoch": 0.31107770930407336, + "flos": 25746835772160.0, + "grad_norm": 1.6684709759498597, + "language_loss": 0.83928138, + "learning_rate": 3.226912425313001e-06, + "loss": 0.91677213, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 5174, + "time_per_iteration": 2.6188318729400635 + }, + { + "auxiliary_loss_clip": 0.06483682, + "auxiliary_loss_mlp": 0.0127308, + "balance_loss_clip": 0.06284115, + "balance_loss_mlp": 0.01256057, + "epoch": 0.3111378325567413, + "flos": 19214272558080.0, + "grad_norm": 2.0188284806938945, + "language_loss": 0.85820258, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.93577021, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 5175, + "time_per_iteration": 2.489356756210327 + }, + { + "auxiliary_loss_clip": 0.06477995, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3111979558094093, + "flos": 23703199816320.0, + "grad_norm": 1.907748003287586, + "language_loss": 0.84357607, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92110729, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17773438, + "step": 5176, + "time_per_iteration": 2.599229574203491 + }, + { + "auxiliary_loss_clip": 0.06476277, + "auxiliary_loss_mlp": 0.01273206, + "balance_loss_clip": 0.06279132, + "balance_loss_mlp": 0.01255468, + "epoch": 0.31125807906207725, + "flos": 21039422192640.0, + "grad_norm": 2.9714078029027977, + "language_loss": 0.80720133, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.88469613, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.17736816, + "step": 5177, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.06482373, + "auxiliary_loss_mlp": 0.01272639, + "balance_loss_clip": 0.06283157, + "balance_loss_mlp": 0.01255353, + "epoch": 0.3113182023147452, + "flos": 23083435491840.0, + "grad_norm": 1.9531801027744504, + "language_loss": 0.81037831, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.88792837, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17285156, + "step": 5178, + "time_per_iteration": 2.6086864471435547 + }, + { + "auxiliary_loss_clip": 0.06483644, + "auxiliary_loss_mlp": 0.01276661, + "balance_loss_clip": 0.06283852, + "balance_loss_mlp": 0.01259316, + "epoch": 0.3113783255674132, + "flos": 11843919895680.0, + "grad_norm": 1.9055325557306373, + "language_loss": 0.81524587, + "learning_rate": 3.225373998592471e-06, + "loss": 0.89284897, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.17346191, + "step": 5179, + "time_per_iteration": 2.4582295417785645 + }, + { + "auxiliary_loss_clip": 0.06482498, + "auxiliary_loss_mlp": 0.01272412, + "balance_loss_clip": 0.06285708, + "balance_loss_mlp": 0.01255926, + "epoch": 0.31143844882008115, + "flos": 16295098089600.0, + "grad_norm": 1.625598326664227, + "language_loss": 0.78714401, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.86469316, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.16491699, + "step": 5180, + "time_per_iteration": 2.4980807304382324 + }, + { + "auxiliary_loss_clip": 0.06486566, + "auxiliary_loss_mlp": 0.01274849, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3114985720727491, + "flos": 23223824208000.0, + "grad_norm": 4.8505374097148595, + "language_loss": 0.83649975, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91411394, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.17102051, + "step": 5181, + "time_per_iteration": 2.519810438156128 + }, + { + "auxiliary_loss_clip": 0.0648061, + "auxiliary_loss_mlp": 0.01273344, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01258348, + "epoch": 0.3115586953254171, + "flos": 30052468224000.0, + "grad_norm": 1.6592506395593873, + "language_loss": 0.74442661, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.82196611, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.15002441, + "step": 5182, + "time_per_iteration": 2.6227729320526123 + }, + { + "auxiliary_loss_clip": 0.06490366, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06288615, + "balance_loss_mlp": 0.01254362, + "epoch": 0.3116188185780851, + "flos": 25673433995520.0, + "grad_norm": 2.0195817263542852, + "language_loss": 0.70974112, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.78735352, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16503906, + "step": 5183, + "time_per_iteration": 2.5801775455474854 + }, + { + "auxiliary_loss_clip": 0.06369011, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.0126376, + "epoch": 0.31167894183075306, + "flos": 69528568285440.0, + "grad_norm": 0.9410725627351464, + "language_loss": 0.59133947, + "learning_rate": 3.223834410214408e-06, + "loss": 0.66769892, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.03182983, + "step": 5184, + "time_per_iteration": 3.1446807384490967 + }, + { + "auxiliary_loss_clip": 0.06488199, + "auxiliary_loss_mlp": 0.01277241, + "balance_loss_clip": 0.06288702, + "balance_loss_mlp": 0.01260206, + "epoch": 0.31173906508342103, + "flos": 14945215213440.0, + "grad_norm": 2.5697318046341424, + "language_loss": 0.69689488, + "learning_rate": 3.223526353268311e-06, + "loss": 0.77454925, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17041016, + "step": 5185, + "time_per_iteration": 2.51505446434021 + }, + { + "auxiliary_loss_clip": 0.06492566, + "auxiliary_loss_mlp": 0.01273506, + "balance_loss_clip": 0.06291321, + "balance_loss_mlp": 0.01256507, + "epoch": 0.311799188336089, + "flos": 16180886574720.0, + "grad_norm": 2.500262239817252, + "language_loss": 0.63946617, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.71712691, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.17004395, + "step": 5186, + "time_per_iteration": 2.505030870437622 + }, + { + "auxiliary_loss_clip": 0.06492127, + "auxiliary_loss_mlp": 0.01277284, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01258592, + "epoch": 0.31185931158875696, + "flos": 25016633366400.0, + "grad_norm": 2.1681671670490603, + "language_loss": 0.86641979, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.94411391, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18688965, + "step": 5187, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.06487665, + "auxiliary_loss_mlp": 0.01281669, + "balance_loss_clip": 0.06287494, + "balance_loss_mlp": 0.01264527, + "epoch": 0.3119194348414249, + "flos": 37242041702400.0, + "grad_norm": 1.4465041932602023, + "language_loss": 0.6305244, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70821768, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 5188, + "time_per_iteration": 2.7036139965057373 + }, + { + "auxiliary_loss_clip": 0.06486794, + "auxiliary_loss_mlp": 0.01278194, + "balance_loss_clip": 0.06286722, + "balance_loss_mlp": 0.01261397, + "epoch": 0.3119795580940929, + "flos": 15018155792640.0, + "grad_norm": 2.1005201528303683, + "language_loss": 0.83722234, + "learning_rate": 3.222293661638346e-06, + "loss": 0.91487223, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 5189, + "time_per_iteration": 3.933061361312866 + }, + { + "auxiliary_loss_clip": 0.06481164, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01262602, + "epoch": 0.31203968134676086, + "flos": 16003755043200.0, + "grad_norm": 2.4405990352060862, + "language_loss": 0.79429829, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87189662, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.16064453, + "step": 5190, + "time_per_iteration": 2.479335308074951 + }, + { + "auxiliary_loss_clip": 0.0648755, + "auxiliary_loss_mlp": 0.01275874, + "balance_loss_clip": 0.06287287, + "balance_loss_mlp": 0.01259292, + "epoch": 0.3120998045994288, + "flos": 23843378897280.0, + "grad_norm": 1.451249914697294, + "language_loss": 0.75502658, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.83266091, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16589355, + "step": 5191, + "time_per_iteration": 3.997621536254883 + }, + { + "auxiliary_loss_clip": 0.06364973, + "auxiliary_loss_mlp": 0.01267778, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01264178, + "epoch": 0.3121599278520968, + "flos": 69203081900160.0, + "grad_norm": 0.8286054534369729, + "language_loss": 0.63964236, + "learning_rate": 3.221368656205247e-06, + "loss": 0.71596992, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.03594971, + "step": 5192, + "time_per_iteration": 4.631687879562378 + }, + { + "auxiliary_loss_clip": 0.06487048, + "auxiliary_loss_mlp": 0.01274026, + "balance_loss_clip": 0.06284614, + "balance_loss_mlp": 0.01254916, + "epoch": 0.31222005110476475, + "flos": 23813302481280.0, + "grad_norm": 1.6272414578256373, + "language_loss": 0.80280936, + "learning_rate": 3.221060228416446e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.19116211, + "step": 5193, + "time_per_iteration": 2.5469777584075928 + }, + { + "auxiliary_loss_clip": 0.06487141, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06286725, + "balance_loss_mlp": 0.01255244, + "epoch": 0.3122801743574327, + "flos": 25232771773440.0, + "grad_norm": 1.8740192083695482, + "language_loss": 0.72266662, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.80028057, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.19006348, + "step": 5194, + "time_per_iteration": 2.5416929721832275 + }, + { + "auxiliary_loss_clip": 0.06483766, + "auxiliary_loss_mlp": 0.01273792, + "balance_loss_clip": 0.06285778, + "balance_loss_mlp": 0.01257604, + "epoch": 0.3123402976101007, + "flos": 22973165118720.0, + "grad_norm": 1.4810805631902553, + "language_loss": 0.77076054, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.8483361, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16186523, + "step": 5195, + "time_per_iteration": 2.5890305042266846 + }, + { + "auxiliary_loss_clip": 0.06489303, + "auxiliary_loss_mlp": 0.01273064, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256268, + "epoch": 0.3124004208627687, + "flos": 25199131559040.0, + "grad_norm": 1.3828607146804377, + "language_loss": 0.78218812, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85981178, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16796875, + "step": 5196, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.06360652, + "auxiliary_loss_mlp": 0.0126022, + "balance_loss_clip": 0.06268834, + "balance_loss_mlp": 0.01256831, + "epoch": 0.31246054411543667, + "flos": 67506398974080.0, + "grad_norm": 0.7576873975695796, + "language_loss": 0.54860902, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.62481773, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.03396606, + "step": 5197, + "time_per_iteration": 4.588749170303345 + }, + { + "auxiliary_loss_clip": 0.06482677, + "auxiliary_loss_mlp": 0.0127766, + "balance_loss_clip": 0.06286696, + "balance_loss_mlp": 0.01261424, + "epoch": 0.31252066736810463, + "flos": 17864347754880.0, + "grad_norm": 1.7824095594325715, + "language_loss": 0.67078102, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74838442, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.16247559, + "step": 5198, + "time_per_iteration": 2.5304651260375977 + }, + { + "auxiliary_loss_clip": 0.06490927, + "auxiliary_loss_mlp": 0.01280145, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261596, + "epoch": 0.3125807906207726, + "flos": 18480338645760.0, + "grad_norm": 2.4146329055675264, + "language_loss": 0.70401263, + "learning_rate": 3.219208689735857e-06, + "loss": 0.78172338, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 2.5358517169952393 + }, + { + "auxiliary_loss_clip": 0.06486207, + "auxiliary_loss_mlp": 0.01275953, + "balance_loss_clip": 0.06286721, + "balance_loss_mlp": 0.01258751, + "epoch": 0.31264091387344056, + "flos": 18951454627200.0, + "grad_norm": 1.7917967449154466, + "language_loss": 0.79258394, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.87020558, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.17211914, + "step": 5200, + "time_per_iteration": 2.5519278049468994 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 0.06284697, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3127010371261085, + "flos": 21474591972480.0, + "grad_norm": 1.8808343302197998, + "language_loss": 0.83758473, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.91515636, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.16711426, + "step": 5201, + "time_per_iteration": 2.509331226348877 + }, + { + "auxiliary_loss_clip": 0.06487838, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 0.06288306, + "balance_loss_mlp": 0.01262006, + "epoch": 0.3127611603787765, + "flos": 15340623431040.0, + "grad_norm": 2.173524859167814, + "language_loss": 0.69690537, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17224121, + "step": 5202, + "time_per_iteration": 2.52652907371521 + }, + { + "auxiliary_loss_clip": 0.06486704, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06286184, + "balance_loss_mlp": 0.01261257, + "epoch": 0.31282128363144446, + "flos": 17608741274880.0, + "grad_norm": 2.6038382996561604, + "language_loss": 0.83874559, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.91639626, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.17077637, + "step": 5203, + "time_per_iteration": 2.502721071243286 + }, + { + "auxiliary_loss_clip": 0.06488604, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 0.06287186, + "balance_loss_mlp": 0.01256604, + "epoch": 0.3128814068841124, + "flos": 26763349979520.0, + "grad_norm": 2.412675439541041, + "language_loss": 0.61310971, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69073772, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17602539, + "step": 5204, + "time_per_iteration": 2.62591814994812 + }, + { + "auxiliary_loss_clip": 0.06482827, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 0.0628654, + "balance_loss_mlp": 0.01259553, + "epoch": 0.3129415301367804, + "flos": 22278783133440.0, + "grad_norm": 1.7324044566720012, + "language_loss": 0.66418731, + "learning_rate": 3.217355486684887e-06, + "loss": 0.74176717, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.15582275, + "step": 5205, + "time_per_iteration": 2.512777328491211 + }, + { + "auxiliary_loss_clip": 0.06487758, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 0.06287788, + "balance_loss_mlp": 0.01260021, + "epoch": 0.31300165338944835, + "flos": 26471461881600.0, + "grad_norm": 1.8344199627772577, + "language_loss": 0.77298087, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.85063475, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17614746, + "step": 5206, + "time_per_iteration": 2.5712244510650635 + }, + { + "auxiliary_loss_clip": 0.06485735, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06288184, + "balance_loss_mlp": 0.01255488, + "epoch": 0.3130617766421163, + "flos": 21951116542080.0, + "grad_norm": 2.0121384013718226, + "language_loss": 0.83184564, + "learning_rate": 3.216737382911672e-06, + "loss": 0.90941995, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16210938, + "step": 5207, + "time_per_iteration": 2.5004825592041016 + }, + { + "auxiliary_loss_clip": 0.06481713, + "auxiliary_loss_mlp": 0.01271341, + "balance_loss_clip": 0.06286129, + "balance_loss_mlp": 0.0125489, + "epoch": 0.3131218998947843, + "flos": 23299154628480.0, + "grad_norm": 2.0890442442793478, + "language_loss": 0.71795774, + "learning_rate": 3.216428261810999e-06, + "loss": 0.79548824, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 5208, + "time_per_iteration": 2.5763585567474365 + }, + { + "auxiliary_loss_clip": 0.06485837, + "auxiliary_loss_mlp": 0.01275661, + "balance_loss_clip": 0.06287587, + "balance_loss_mlp": 0.0125927, + "epoch": 0.3131820231474523, + "flos": 21145583715840.0, + "grad_norm": 1.890905451265213, + "language_loss": 0.74832964, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82594466, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.1640625, + "step": 5209, + "time_per_iteration": 2.510582685470581 + }, + { + "auxiliary_loss_clip": 0.06483819, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06284019, + "balance_loss_mlp": 0.01255678, + "epoch": 0.31324214640012027, + "flos": 23915816352000.0, + "grad_norm": 1.8368712630160764, + "language_loss": 0.77846575, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.85602105, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16027832, + "step": 5210, + "time_per_iteration": 2.5457394123077393 + }, + { + "auxiliary_loss_clip": 0.06472643, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01262963, + "epoch": 0.31330226965278823, + "flos": 22243507764480.0, + "grad_norm": 1.7690758446531836, + "language_loss": 0.79563594, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.87314838, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15643311, + "step": 5211, + "time_per_iteration": 2.5383517742156982 + }, + { + "auxiliary_loss_clip": 0.0648172, + "auxiliary_loss_mlp": 0.01270065, + "balance_loss_clip": 0.06285914, + "balance_loss_mlp": 0.01254699, + "epoch": 0.3133623929054562, + "flos": 19759838492160.0, + "grad_norm": 1.6892345584465767, + "language_loss": 0.79993588, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.87745374, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.15368652, + "step": 5212, + "time_per_iteration": 2.5550856590270996 + }, + { + "auxiliary_loss_clip": 0.06489062, + "auxiliary_loss_mlp": 0.01276168, + "balance_loss_clip": 0.06287421, + "balance_loss_mlp": 0.01258919, + "epoch": 0.31342251615812416, + "flos": 27169617300480.0, + "grad_norm": 2.030797991853156, + "language_loss": 0.71651685, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.79416913, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.17248535, + "step": 5213, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.06486979, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06285015, + "balance_loss_mlp": 0.01258763, + "epoch": 0.31348263941079213, + "flos": 20235985718400.0, + "grad_norm": 2.164105834219518, + "language_loss": 0.77949297, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.85711956, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16918945, + "step": 5214, + "time_per_iteration": 2.539149761199951 + }, + { + "auxiliary_loss_clip": 0.06478322, + "auxiliary_loss_mlp": 0.0127674, + "balance_loss_clip": 0.06285194, + "balance_loss_mlp": 0.01261267, + "epoch": 0.3135427626634601, + "flos": 24614474895360.0, + "grad_norm": 1.5354860146289633, + "language_loss": 0.82935429, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90690494, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.15472412, + "step": 5215, + "time_per_iteration": 2.541269302368164 + }, + { + "auxiliary_loss_clip": 0.06486098, + "auxiliary_loss_mlp": 0.01273565, + "balance_loss_clip": 0.06288007, + "balance_loss_mlp": 0.01257186, + "epoch": 0.31360288591612806, + "flos": 20966230051200.0, + "grad_norm": 1.8278899125375987, + "language_loss": 0.79790628, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87550294, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16369629, + "step": 5216, + "time_per_iteration": 2.5465261936187744 + }, + { + "auxiliary_loss_clip": 0.06489767, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.06286536, + "balance_loss_mlp": 0.01258722, + "epoch": 0.313663009168796, + "flos": 26987957648640.0, + "grad_norm": 1.8964979694160957, + "language_loss": 0.68953168, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76720947, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.19299316, + "step": 5217, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.06480299, + "auxiliary_loss_mlp": 0.01275451, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01259298, + "epoch": 0.313723132421464, + "flos": 18046762093440.0, + "grad_norm": 1.6389262097165689, + "language_loss": 0.80772746, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.88528496, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16149902, + "step": 5218, + "time_per_iteration": 2.5255727767944336 + }, + { + "auxiliary_loss_clip": 0.06485314, + "auxiliary_loss_mlp": 0.0127641, + "balance_loss_clip": 0.06285116, + "balance_loss_mlp": 0.01259363, + "epoch": 0.31378325567413196, + "flos": 22494963467520.0, + "grad_norm": 2.253901481236794, + "language_loss": 0.70057523, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.17047119, + "step": 5219, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.06483484, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06284904, + "balance_loss_mlp": 0.01259181, + "epoch": 0.3138433789267999, + "flos": 22425838248960.0, + "grad_norm": 1.9320324134388631, + "language_loss": 0.80156839, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.17736816, + "step": 5220, + "time_per_iteration": 2.5364530086517334 + }, + { + "auxiliary_loss_clip": 0.06484166, + "auxiliary_loss_mlp": 0.01276534, + "balance_loss_clip": 0.06287254, + "balance_loss_mlp": 0.01260751, + "epoch": 0.3139035021794679, + "flos": 13010927235840.0, + "grad_norm": 1.8390249578816682, + "language_loss": 0.73235905, + "learning_rate": 3.212405494206986e-06, + "loss": 0.80996603, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.15771484, + "step": 5221, + "time_per_iteration": 2.477369546890259 + }, + { + "auxiliary_loss_clip": 0.06480553, + "auxiliary_loss_mlp": 0.0127616, + "balance_loss_clip": 0.0628504, + "balance_loss_mlp": 0.0125996, + "epoch": 0.31396362543213585, + "flos": 16951605229440.0, + "grad_norm": 1.9354629264259422, + "language_loss": 0.81906354, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.89663064, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16223145, + "step": 5222, + "time_per_iteration": 2.5057129859924316 + }, + { + "auxiliary_loss_clip": 0.06490297, + "auxiliary_loss_mlp": 0.01284294, + "balance_loss_clip": 0.06288279, + "balance_loss_mlp": 0.01266555, + "epoch": 0.31402374868480387, + "flos": 20162877431040.0, + "grad_norm": 1.9084075298763516, + "language_loss": 0.70490289, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.78264874, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17749023, + "step": 5223, + "time_per_iteration": 2.4747233390808105 + }, + { + "auxiliary_loss_clip": 0.06484593, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06287414, + "balance_loss_mlp": 0.01259718, + "epoch": 0.31408387193747184, + "flos": 21257363462400.0, + "grad_norm": 1.5262001080385015, + "language_loss": 0.80608702, + "learning_rate": 3.211476058893379e-06, + "loss": 0.88369542, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.1652832, + "step": 5224, + "time_per_iteration": 2.576864004135132 + }, + { + "auxiliary_loss_clip": 0.06497495, + "auxiliary_loss_mlp": 0.01279621, + "balance_loss_clip": 0.06291461, + "balance_loss_mlp": 0.01261632, + "epoch": 0.3141439951901398, + "flos": 27490617492480.0, + "grad_norm": 2.962077450034062, + "language_loss": 0.58624607, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66401726, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17993164, + "step": 5225, + "time_per_iteration": 2.558159828186035 + }, + { + "auxiliary_loss_clip": 0.06482717, + "auxiliary_loss_mlp": 0.0128044, + "balance_loss_clip": 0.06289019, + "balance_loss_mlp": 0.0126505, + "epoch": 0.31420411844280777, + "flos": 17857010522880.0, + "grad_norm": 1.7568792542410607, + "language_loss": 0.81975454, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.89738619, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.15380859, + "step": 5226, + "time_per_iteration": 2.5197925567626953 + }, + { + "auxiliary_loss_clip": 0.06493273, + "auxiliary_loss_mlp": 0.01283534, + "balance_loss_clip": 0.0629416, + "balance_loss_mlp": 0.01265998, + "epoch": 0.31426424169547573, + "flos": 21623491877760.0, + "grad_norm": 1.9094319640845634, + "language_loss": 0.74358761, + "learning_rate": 3.210546210126141e-06, + "loss": 0.8213557, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17529297, + "step": 5227, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.06493893, + "auxiliary_loss_mlp": 0.01287677, + "balance_loss_clip": 0.0629607, + "balance_loss_mlp": 0.01270392, + "epoch": 0.3143243649481437, + "flos": 30928677569280.0, + "grad_norm": 1.9492252245216757, + "language_loss": 0.68802202, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76583767, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.17297363, + "step": 5228, + "time_per_iteration": 2.724705934524536 + }, + { + "auxiliary_loss_clip": 0.06488988, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01265044, + "epoch": 0.31438448820081166, + "flos": 22828206355200.0, + "grad_norm": 1.7089427628420442, + "language_loss": 0.80276144, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88046199, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16015625, + "step": 5229, + "time_per_iteration": 4.091265678405762 + }, + { + "auxiliary_loss_clip": 0.06481495, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.01259428, + "epoch": 0.3144446114534796, + "flos": 23298399941760.0, + "grad_norm": 1.658320923858175, + "language_loss": 0.70112014, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7786932, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.16381836, + "step": 5230, + "time_per_iteration": 2.5652499198913574 + }, + { + "auxiliary_loss_clip": 0.06489812, + "auxiliary_loss_mlp": 0.01281571, + "balance_loss_clip": 0.06291179, + "balance_loss_mlp": 0.01264572, + "epoch": 0.3145047347061476, + "flos": 31363679640960.0, + "grad_norm": 2.930398163442548, + "language_loss": 0.80236816, + "learning_rate": 3.209305769168239e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.17004395, + "step": 5231, + "time_per_iteration": 5.461926698684692 + }, + { + "auxiliary_loss_clip": 0.06483024, + "auxiliary_loss_mlp": 0.01279077, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01262912, + "epoch": 0.31456485795881556, + "flos": 10894182992640.0, + "grad_norm": 3.377505802107346, + "language_loss": 0.85102671, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92864776, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16149902, + "step": 5232, + "time_per_iteration": 2.549555778503418 + }, + { + "auxiliary_loss_clip": 0.06479923, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01269779, + "epoch": 0.3146249812114835, + "flos": 17098157220480.0, + "grad_norm": 1.5771176865385883, + "language_loss": 0.80666757, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.88433212, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5233, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.06496342, + "auxiliary_loss_mlp": 0.01276742, + "balance_loss_clip": 0.06294576, + "balance_loss_mlp": 0.01260768, + "epoch": 0.3146851044641515, + "flos": 55303283352960.0, + "grad_norm": 1.6501859452394316, + "language_loss": 0.71124518, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.78897607, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15966797, + "step": 5234, + "time_per_iteration": 2.8301026821136475 + }, + { + "auxiliary_loss_clip": 0.06491733, + "auxiliary_loss_mlp": 0.01276589, + "balance_loss_clip": 0.06292239, + "balance_loss_mlp": 0.01259566, + "epoch": 0.31474522771681945, + "flos": 27023149163520.0, + "grad_norm": 1.9231261360365097, + "language_loss": 0.73437119, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8120544, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17004395, + "step": 5235, + "time_per_iteration": 2.543799638748169 + }, + { + "auxiliary_loss_clip": 0.0648193, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.0628682, + "balance_loss_mlp": 0.01259308, + "epoch": 0.3148053509694875, + "flos": 21258369711360.0, + "grad_norm": 1.9283939280374622, + "language_loss": 0.79554284, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.87311482, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.15942383, + "step": 5236, + "time_per_iteration": 2.5356431007385254 + }, + { + "auxiliary_loss_clip": 0.06493077, + "auxiliary_loss_mlp": 0.01277667, + "balance_loss_clip": 0.06288847, + "balance_loss_mlp": 0.01260942, + "epoch": 0.31486547422215544, + "flos": 31256721504000.0, + "grad_norm": 2.880510555000243, + "language_loss": 0.76337612, + "learning_rate": 3.207443732256881e-06, + "loss": 0.84108353, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16723633, + "step": 5237, + "time_per_iteration": 4.129598379135132 + }, + { + "auxiliary_loss_clip": 0.0648271, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06291585, + "balance_loss_mlp": 0.01262843, + "epoch": 0.3149255974748234, + "flos": 19834749642240.0, + "grad_norm": 1.6736027402410734, + "language_loss": 0.7951014, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.87270594, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.14916992, + "step": 5238, + "time_per_iteration": 2.504612445831299 + }, + { + "auxiliary_loss_clip": 0.06376656, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06282751, + "balance_loss_mlp": 0.01263604, + "epoch": 0.31498572072749137, + "flos": 67701867350400.0, + "grad_norm": 0.8276402478045692, + "language_loss": 0.68007928, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.75652325, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.04141235, + "step": 5239, + "time_per_iteration": 3.174287796020508 + }, + { + "auxiliary_loss_clip": 0.06498836, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 0.06292844, + "balance_loss_mlp": 0.01256376, + "epoch": 0.31504584398015933, + "flos": 19799432346240.0, + "grad_norm": 2.176171670908613, + "language_loss": 0.82951081, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.9072417, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17883301, + "step": 5240, + "time_per_iteration": 2.509793996810913 + }, + { + "auxiliary_loss_clip": 0.06485248, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06288239, + "balance_loss_mlp": 0.01262125, + "epoch": 0.3151059672328273, + "flos": 26622751628160.0, + "grad_norm": 1.8077188253124041, + "language_loss": 0.81193888, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88957721, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.16455078, + "step": 5241, + "time_per_iteration": 2.571192502975464 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01277268, + "balance_loss_clip": 0.06291743, + "balance_loss_mlp": 0.01260912, + "epoch": 0.31516609048549526, + "flos": 24210890904960.0, + "grad_norm": 1.4478120037649602, + "language_loss": 0.74484038, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82243454, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16357422, + "step": 5242, + "time_per_iteration": 2.526357650756836 + }, + { + "auxiliary_loss_clip": 0.06487267, + "auxiliary_loss_mlp": 0.01275494, + "balance_loss_clip": 0.06292535, + "balance_loss_mlp": 0.01259163, + "epoch": 0.31522621373816323, + "flos": 25965950999040.0, + "grad_norm": 1.6442244241642663, + "language_loss": 0.73668325, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81431091, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.16320801, + "step": 5243, + "time_per_iteration": 2.606276273727417 + }, + { + "auxiliary_loss_clip": 0.06485401, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.0628818, + "balance_loss_mlp": 0.0125713, + "epoch": 0.3152863369908312, + "flos": 21915379975680.0, + "grad_norm": 1.7357669101009914, + "language_loss": 0.64914608, + "learning_rate": 3.205269272758513e-06, + "loss": 0.72673857, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16711426, + "step": 5244, + "time_per_iteration": 2.5950305461883545 + }, + { + "auxiliary_loss_clip": 0.06492754, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 0.06292984, + "balance_loss_mlp": 0.01257743, + "epoch": 0.31534646024349916, + "flos": 16285203308160.0, + "grad_norm": 2.8540583379791005, + "language_loss": 0.91357732, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.99124765, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16540527, + "step": 5245, + "time_per_iteration": 2.510085105895996 + }, + { + "auxiliary_loss_clip": 0.06488977, + "auxiliary_loss_mlp": 0.01277309, + "balance_loss_clip": 0.06291293, + "balance_loss_mlp": 0.01260596, + "epoch": 0.3154065834961671, + "flos": 24724116362880.0, + "grad_norm": 1.9445780779956967, + "language_loss": 0.75699973, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.83466256, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.1673584, + "step": 5246, + "time_per_iteration": 2.543600559234619 + }, + { + "auxiliary_loss_clip": 0.06488622, + "auxiliary_loss_mlp": 0.01279725, + "balance_loss_clip": 0.06290317, + "balance_loss_mlp": 0.01262833, + "epoch": 0.3154667067488351, + "flos": 35379813836160.0, + "grad_norm": 1.6152414177037249, + "language_loss": 0.61608225, + "learning_rate": 3.204336675750321e-06, + "loss": 0.69376576, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16882324, + "step": 5247, + "time_per_iteration": 2.6849827766418457 + }, + { + "auxiliary_loss_clip": 0.06491058, + "auxiliary_loss_mlp": 0.01281873, + "balance_loss_clip": 0.06290263, + "balance_loss_mlp": 0.0126417, + "epoch": 0.31552683000150306, + "flos": 17462105429760.0, + "grad_norm": 2.6938697298202667, + "language_loss": 0.82848823, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.90621758, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.17687988, + "step": 5248, + "time_per_iteration": 2.4956586360931396 + }, + { + "auxiliary_loss_clip": 0.06488842, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06291078, + "balance_loss_mlp": 0.01264121, + "epoch": 0.3155869532541711, + "flos": 18411674624640.0, + "grad_norm": 4.654519722073602, + "language_loss": 0.85721719, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.93492711, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.18029785, + "step": 5249, + "time_per_iteration": 2.568054437637329 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 0.06290483, + "balance_loss_mlp": 0.01261198, + "epoch": 0.31564707650683904, + "flos": 21586162083840.0, + "grad_norm": 1.7795262086342007, + "language_loss": 0.86067384, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.93837023, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1887207, + "step": 5250, + "time_per_iteration": 2.508528709411621 + }, + { + "auxiliary_loss_clip": 0.06486481, + "auxiliary_loss_mlp": 0.01279989, + "balance_loss_clip": 0.06289366, + "balance_loss_mlp": 0.01262334, + "epoch": 0.315707199759507, + "flos": 21037032351360.0, + "grad_norm": 2.1261014211455063, + "language_loss": 0.6942147, + "learning_rate": 3.203092573767835e-06, + "loss": 0.77187943, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1763916, + "step": 5251, + "time_per_iteration": 2.526685953140259 + }, + { + "auxiliary_loss_clip": 0.06487083, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06288725, + "balance_loss_mlp": 0.01255586, + "epoch": 0.31576732301217497, + "flos": 26835326236800.0, + "grad_norm": 2.019211823887184, + "language_loss": 0.78895354, + "learning_rate": 3.202781434189246e-06, + "loss": 0.86655623, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17602539, + "step": 5252, + "time_per_iteration": 2.570160150527954 + }, + { + "auxiliary_loss_clip": 0.06486022, + "auxiliary_loss_mlp": 0.01277329, + "balance_loss_clip": 0.06289184, + "balance_loss_mlp": 0.01261664, + "epoch": 0.31582744626484294, + "flos": 22717810200960.0, + "grad_norm": 1.5436537660689573, + "language_loss": 0.74377203, + "learning_rate": 3.202470249001066e-06, + "loss": 0.82140553, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.15661621, + "step": 5253, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.06489179, + "auxiliary_loss_mlp": 0.01281773, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01264309, + "epoch": 0.3158875695175109, + "flos": 23958806296320.0, + "grad_norm": 1.6773864910066614, + "language_loss": 0.73971915, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 5254, + "time_per_iteration": 2.588543653488159 + }, + { + "auxiliary_loss_clip": 0.06491473, + "auxiliary_loss_mlp": 0.01275265, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01257408, + "epoch": 0.31594769277017887, + "flos": 13267036840320.0, + "grad_norm": 2.7381317978754933, + "language_loss": 0.78115344, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85882092, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17858887, + "step": 5255, + "time_per_iteration": 2.5159435272216797 + }, + { + "auxiliary_loss_clip": 0.0648552, + "auxiliary_loss_mlp": 0.01275031, + "balance_loss_clip": 0.06288838, + "balance_loss_mlp": 0.01255921, + "epoch": 0.31600781602284683, + "flos": 23375072027520.0, + "grad_norm": 2.9601180138118286, + "language_loss": 0.78838313, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.86598861, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.19104004, + "step": 5256, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.06480406, + "auxiliary_loss_mlp": 0.01272902, + "balance_loss_clip": 0.06291319, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3160679392755148, + "flos": 19834707715200.0, + "grad_norm": 1.443888473305352, + "language_loss": 0.71476674, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.79229981, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15533447, + "step": 5257, + "time_per_iteration": 2.515044927597046 + }, + { + "auxiliary_loss_clip": 0.06490695, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 0.06291541, + "balance_loss_mlp": 0.01257787, + "epoch": 0.31612806252818276, + "flos": 20199368684160.0, + "grad_norm": 3.1125237193001967, + "language_loss": 0.77181315, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.84947205, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17419434, + "step": 5258, + "time_per_iteration": 2.544926166534424 + }, + { + "auxiliary_loss_clip": 0.06484105, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 0.06286652, + "balance_loss_mlp": 0.01258624, + "epoch": 0.31618818578085073, + "flos": 24241596226560.0, + "grad_norm": 2.554871248122792, + "language_loss": 0.73012489, + "learning_rate": 3.200602180731467e-06, + "loss": 0.80772901, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.17675781, + "step": 5259, + "time_per_iteration": 2.5244109630584717 + }, + { + "auxiliary_loss_clip": 0.06490766, + "auxiliary_loss_mlp": 0.01272581, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01256106, + "epoch": 0.3162483090335187, + "flos": 25088735404800.0, + "grad_norm": 2.502439629336286, + "language_loss": 0.66774327, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74537671, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16455078, + "step": 5260, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.06481651, + "auxiliary_loss_mlp": 0.01272837, + "balance_loss_clip": 0.06285223, + "balance_loss_mlp": 0.01256386, + "epoch": 0.31630843228618666, + "flos": 26330653895040.0, + "grad_norm": 2.0766337978972023, + "language_loss": 0.72817439, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80571926, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16455078, + "step": 5261, + "time_per_iteration": 2.559112548828125 + }, + { + "auxiliary_loss_clip": 0.06366719, + "auxiliary_loss_mlp": 0.01254616, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01250792, + "epoch": 0.3163685555388547, + "flos": 66780053856000.0, + "grad_norm": 0.7132570662369885, + "language_loss": 0.50697625, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.58318961, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03817749, + "step": 5262, + "time_per_iteration": 3.1381468772888184 + }, + { + "auxiliary_loss_clip": 0.06487425, + "auxiliary_loss_mlp": 0.01273056, + "balance_loss_clip": 0.06289163, + "balance_loss_mlp": 0.01256224, + "epoch": 0.31642867879152264, + "flos": 26002987303680.0, + "grad_norm": 1.713052875923359, + "language_loss": 0.85966682, + "learning_rate": 3.19935589118856e-06, + "loss": 0.9372716, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.16833496, + "step": 5263, + "time_per_iteration": 2.5385844707489014 + }, + { + "auxiliary_loss_clip": 0.0647549, + "auxiliary_loss_mlp": 0.01273956, + "balance_loss_clip": 0.06283621, + "balance_loss_mlp": 0.01257695, + "epoch": 0.3164888020441906, + "flos": 25781943432960.0, + "grad_norm": 1.4697461293234868, + "language_loss": 0.82077682, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.89827132, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.16247559, + "step": 5264, + "time_per_iteration": 2.558708429336548 + }, + { + "auxiliary_loss_clip": 0.06488511, + "auxiliary_loss_mlp": 0.01271533, + "balance_loss_clip": 0.06288397, + "balance_loss_mlp": 0.01254117, + "epoch": 0.3165489252968586, + "flos": 19762437968640.0, + "grad_norm": 1.8601211050375244, + "language_loss": 0.80259931, + "learning_rate": 3.19873247349167e-06, + "loss": 0.88019973, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17419434, + "step": 5265, + "time_per_iteration": 2.492342948913574 + }, + { + "auxiliary_loss_clip": 0.06481829, + "auxiliary_loss_mlp": 0.01275233, + "balance_loss_clip": 0.06283312, + "balance_loss_mlp": 0.01257148, + "epoch": 0.31660904854952654, + "flos": 23190393628800.0, + "grad_norm": 2.032053662698869, + "language_loss": 0.75410831, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.83167893, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1809082, + "step": 5266, + "time_per_iteration": 2.5563931465148926 + }, + { + "auxiliary_loss_clip": 0.06488708, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 0.06287502, + "balance_loss_mlp": 0.01258308, + "epoch": 0.3166691718021945, + "flos": 20414081571840.0, + "grad_norm": 2.020882594632444, + "language_loss": 0.79489279, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.87254804, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.18518066, + "step": 5267, + "time_per_iteration": 2.509413242340088 + }, + { + "auxiliary_loss_clip": 0.06371635, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01251359, + "epoch": 0.31672929505486247, + "flos": 70165816185600.0, + "grad_norm": 1.145238273522293, + "language_loss": 0.57623893, + "learning_rate": 3.197797006055478e-06, + "loss": 0.65250397, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03518677, + "step": 5268, + "time_per_iteration": 4.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.06486145, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06287054, + "balance_loss_mlp": 0.01253884, + "epoch": 0.31678941830753043, + "flos": 14360977820160.0, + "grad_norm": 2.2953322915245784, + "language_loss": 0.73492396, + "learning_rate": 3.197485092719815e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.17651367, + "step": 5269, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.06490922, + "auxiliary_loss_mlp": 0.01279355, + "balance_loss_clip": 0.06295022, + "balance_loss_mlp": 0.01261652, + "epoch": 0.3168495415601984, + "flos": 22754385308160.0, + "grad_norm": 1.8930521062253438, + "language_loss": 0.80391312, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.88161588, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.17700195, + "step": 5270, + "time_per_iteration": 4.030852794647217 + }, + { + "auxiliary_loss_clip": 0.0648749, + "auxiliary_loss_mlp": 0.01275027, + "balance_loss_clip": 0.06288311, + "balance_loss_mlp": 0.01257742, + "epoch": 0.31690966481286637, + "flos": 20120558319360.0, + "grad_norm": 2.0275703030815744, + "language_loss": 0.79860884, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87623405, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17285156, + "step": 5271, + "time_per_iteration": 3.963491201400757 + }, + { + "auxiliary_loss_clip": 0.06485552, + "auxiliary_loss_mlp": 0.01274595, + "balance_loss_clip": 0.06286864, + "balance_loss_mlp": 0.01256344, + "epoch": 0.31696978806553433, + "flos": 21185345278080.0, + "grad_norm": 2.0532864997035616, + "language_loss": 0.7348994, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.18237305, + "step": 5272, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01275072, + "balance_loss_clip": 0.06294467, + "balance_loss_mlp": 0.01255629, + "epoch": 0.3170299113182023, + "flos": 43007030789760.0, + "grad_norm": 2.3636013379780083, + "language_loss": 0.69916022, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.19458008, + "step": 5273, + "time_per_iteration": 2.8313193321228027 + }, + { + "auxiliary_loss_clip": 0.0648469, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06288255, + "balance_loss_mlp": 0.01255954, + "epoch": 0.31709003457087026, + "flos": 24466707020160.0, + "grad_norm": 3.373298123766896, + "language_loss": 0.68486917, + "learning_rate": 3.195924845146795e-06, + "loss": 0.76244098, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 5274, + "time_per_iteration": 2.5647053718566895 + }, + { + "auxiliary_loss_clip": 0.06486842, + "auxiliary_loss_mlp": 0.01272159, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.01256114, + "epoch": 0.3171501578235382, + "flos": 24142394592000.0, + "grad_norm": 1.437173314012816, + "language_loss": 0.8105545, + "learning_rate": 3.195612659536081e-06, + "loss": 0.88814449, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.16052246, + "step": 5275, + "time_per_iteration": 2.545689821243286 + }, + { + "auxiliary_loss_clip": 0.06496362, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.0629561, + "balance_loss_mlp": 0.01254296, + "epoch": 0.31721028107620625, + "flos": 18885641644800.0, + "grad_norm": 1.7797970991839078, + "language_loss": 0.73459136, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81228, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18212891, + "step": 5276, + "time_per_iteration": 3.978994131088257 + }, + { + "auxiliary_loss_clip": 0.06480486, + "auxiliary_loss_mlp": 0.01276369, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01259811, + "epoch": 0.3172704043288742, + "flos": 23154405500160.0, + "grad_norm": 1.4192945576637652, + "language_loss": 0.78409082, + "learning_rate": 3.194988152313236e-06, + "loss": 0.86165935, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.16552734, + "step": 5277, + "time_per_iteration": 2.6181840896606445 + }, + { + "auxiliary_loss_clip": 0.06493685, + "auxiliary_loss_mlp": 0.01273951, + "balance_loss_clip": 0.06294833, + "balance_loss_mlp": 0.01256653, + "epoch": 0.3173305275815422, + "flos": 17864347754880.0, + "grad_norm": 1.9934204528772321, + "language_loss": 0.79709554, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87477195, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17297363, + "step": 5278, + "time_per_iteration": 2.4955894947052 + }, + { + "auxiliary_loss_clip": 0.06380783, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01260886, + "epoch": 0.31739065083421014, + "flos": 59988083529600.0, + "grad_norm": 0.841903886868049, + "language_loss": 0.62797457, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.7044335, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.04226685, + "step": 5279, + "time_per_iteration": 2.920987367630005 + }, + { + "auxiliary_loss_clip": 0.06489395, + "auxiliary_loss_mlp": 0.01285376, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01265265, + "epoch": 0.3174507740868781, + "flos": 23807013425280.0, + "grad_norm": 2.0709232065681475, + "language_loss": 0.81487882, + "learning_rate": 3.194051051653053e-06, + "loss": 0.89262652, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.2010498, + "step": 5280, + "time_per_iteration": 2.537612199783325 + }, + { + "auxiliary_loss_clip": 0.06483282, + "auxiliary_loss_mlp": 0.01281645, + "balance_loss_clip": 0.06291374, + "balance_loss_mlp": 0.01264276, + "epoch": 0.31751089733954607, + "flos": 27646728848640.0, + "grad_norm": 1.437826441265799, + "language_loss": 0.78464299, + "learning_rate": 3.19373859419346e-06, + "loss": 0.86229229, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.17358398, + "step": 5281, + "time_per_iteration": 2.6482186317443848 + }, + { + "auxiliary_loss_clip": 0.06485789, + "auxiliary_loss_mlp": 0.01283007, + "balance_loss_clip": 0.06290175, + "balance_loss_mlp": 0.01265424, + "epoch": 0.31757102059221404, + "flos": 23776098468480.0, + "grad_norm": 1.5338111796323235, + "language_loss": 0.78882301, + "learning_rate": 3.193426091467179e-06, + "loss": 0.86651099, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17590332, + "step": 5282, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.06494205, + "auxiliary_loss_mlp": 0.01276135, + "balance_loss_clip": 0.0629286, + "balance_loss_mlp": 0.01258373, + "epoch": 0.317631143844882, + "flos": 25271485159680.0, + "grad_norm": 2.0006947857157753, + "language_loss": 0.67952389, + "learning_rate": 3.193113543486061e-06, + "loss": 0.7572273, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1776123, + "step": 5283, + "time_per_iteration": 2.565925359725952 + }, + { + "auxiliary_loss_clip": 0.06373101, + "auxiliary_loss_mlp": 0.01271528, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01267352, + "epoch": 0.31769126709754997, + "flos": 55841832743040.0, + "grad_norm": 0.7241871595116953, + "language_loss": 0.52631503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.60276127, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04177856, + "step": 5284, + "time_per_iteration": 3.1037213802337646 + }, + { + "auxiliary_loss_clip": 0.0649649, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 0.06291351, + "balance_loss_mlp": 0.01257225, + "epoch": 0.31775139035021793, + "flos": 16696124530560.0, + "grad_norm": 2.2460762000689294, + "language_loss": 0.70842284, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.78613091, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.17102051, + "step": 5285, + "time_per_iteration": 2.5407655239105225 + }, + { + "auxiliary_loss_clip": 0.06366412, + "auxiliary_loss_mlp": 0.01262401, + "balance_loss_clip": 0.06274283, + "balance_loss_mlp": 0.01258384, + "epoch": 0.3178115136028859, + "flos": 64246141261440.0, + "grad_norm": 1.0137073922687154, + "language_loss": 0.60545647, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.68174458, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04016113, + "step": 5286, + "time_per_iteration": 3.1833202838897705 + }, + { + "auxiliary_loss_clip": 0.06498363, + "auxiliary_loss_mlp": 0.01284909, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01267051, + "epoch": 0.31787163685555386, + "flos": 18703395014400.0, + "grad_norm": 1.7319286904547555, + "language_loss": 0.72404122, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80187392, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17871094, + "step": 5287, + "time_per_iteration": 2.50571608543396 + }, + { + "auxiliary_loss_clip": 0.06495041, + "auxiliary_loss_mlp": 0.01276683, + "balance_loss_clip": 0.06292516, + "balance_loss_mlp": 0.012578, + "epoch": 0.31793176010822183, + "flos": 21331184509440.0, + "grad_norm": 1.978321388726588, + "language_loss": 0.76231503, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84003228, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.18884277, + "step": 5288, + "time_per_iteration": 2.5568416118621826 + }, + { + "auxiliary_loss_clip": 0.06485806, + "auxiliary_loss_mlp": 0.01283528, + "balance_loss_clip": 0.06293501, + "balance_loss_mlp": 0.01267816, + "epoch": 0.31799188336088985, + "flos": 20964846458880.0, + "grad_norm": 1.7076221862053031, + "language_loss": 0.88265222, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.96034551, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.15710449, + "step": 5289, + "time_per_iteration": 2.5359349250793457 + }, + { + "auxiliary_loss_clip": 0.06488061, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 0.06295781, + "balance_loss_mlp": 0.01269724, + "epoch": 0.3180520066135578, + "flos": 22498485338880.0, + "grad_norm": 1.4069348748047803, + "language_loss": 0.68210149, + "learning_rate": 3.190924441478572e-06, + "loss": 0.75984859, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16906738, + "step": 5290, + "time_per_iteration": 2.5393311977386475 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.0128386, + "balance_loss_clip": 0.06290419, + "balance_loss_mlp": 0.01265788, + "epoch": 0.3181121298662258, + "flos": 27242725587840.0, + "grad_norm": 3.4346413288346, + "language_loss": 0.79944348, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.87722754, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18066406, + "step": 5291, + "time_per_iteration": 2.564091444015503 + }, + { + "auxiliary_loss_clip": 0.06485635, + "auxiliary_loss_mlp": 0.01278435, + "balance_loss_clip": 0.06287642, + "balance_loss_mlp": 0.01259361, + "epoch": 0.31817225311889374, + "flos": 23185991289600.0, + "grad_norm": 2.0451390273410004, + "language_loss": 0.79931051, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.87695122, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.19067383, + "step": 5292, + "time_per_iteration": 2.743156671524048 + }, + { + "auxiliary_loss_clip": 0.06476898, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06287324, + "balance_loss_mlp": 0.01258044, + "epoch": 0.3182323763715617, + "flos": 23265598268160.0, + "grad_norm": 1.819133879513315, + "language_loss": 0.75602406, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8335436, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17004395, + "step": 5293, + "time_per_iteration": 2.523386001586914 + }, + { + "auxiliary_loss_clip": 0.06482453, + "auxiliary_loss_mlp": 0.01276012, + "balance_loss_clip": 0.06290737, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3182924996242297, + "flos": 29023292050560.0, + "grad_norm": 2.0524562129349526, + "language_loss": 0.75145984, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82904446, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15808105, + "step": 5294, + "time_per_iteration": 2.607849597930908 + }, + { + "auxiliary_loss_clip": 0.06489888, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259201, + "epoch": 0.31835262287689764, + "flos": 20455478288640.0, + "grad_norm": 2.029675905915872, + "language_loss": 0.76497674, + "learning_rate": 3.189359442151152e-06, + "loss": 0.84265351, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.18591309, + "step": 5295, + "time_per_iteration": 2.4980461597442627 + }, + { + "auxiliary_loss_clip": 0.06494178, + "auxiliary_loss_mlp": 0.01278535, + "balance_loss_clip": 0.06293284, + "balance_loss_mlp": 0.01261166, + "epoch": 0.3184127461295656, + "flos": 25126568323200.0, + "grad_norm": 2.03182891885516, + "language_loss": 0.70142519, + "learning_rate": 3.189046306936296e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17358398, + "step": 5296, + "time_per_iteration": 2.610671043395996 + }, + { + "auxiliary_loss_clip": 0.06483515, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 0.0628704, + "balance_loss_mlp": 0.01258371, + "epoch": 0.31847286938223357, + "flos": 25557377690880.0, + "grad_norm": 1.5251920176335134, + "language_loss": 0.77957898, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85716307, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 5297, + "time_per_iteration": 2.539649486541748 + }, + { + "auxiliary_loss_clip": 0.06479752, + "auxiliary_loss_mlp": 0.01272766, + "balance_loss_clip": 0.06283344, + "balance_loss_mlp": 0.01255516, + "epoch": 0.31853299263490154, + "flos": 27789926676480.0, + "grad_norm": 1.8177911904554251, + "language_loss": 0.80074358, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.87826872, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17248535, + "step": 5298, + "time_per_iteration": 2.6127634048461914 + }, + { + "auxiliary_loss_clip": 0.06487016, + "auxiliary_loss_mlp": 0.0127216, + "balance_loss_clip": 0.06284906, + "balance_loss_mlp": 0.01254815, + "epoch": 0.3185931158875695, + "flos": 22712653175040.0, + "grad_norm": 1.6158824069779534, + "language_loss": 0.74615932, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.82375109, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.17346191, + "step": 5299, + "time_per_iteration": 2.570178508758545 + }, + { + "auxiliary_loss_clip": 0.06491919, + "auxiliary_loss_mlp": 0.01275355, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.01258249, + "epoch": 0.31865323914023747, + "flos": 24578402912640.0, + "grad_norm": 1.9760141697724851, + "language_loss": 0.78568625, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86335897, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17102051, + "step": 5300, + "time_per_iteration": 2.7260777950286865 + }, + { + "auxiliary_loss_clip": 0.06483838, + "auxiliary_loss_mlp": 0.01272854, + "balance_loss_clip": 0.06287212, + "balance_loss_mlp": 0.01254495, + "epoch": 0.31871336239290543, + "flos": 18192391689600.0, + "grad_norm": 2.1538981188283195, + "language_loss": 0.84250915, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92007607, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.18347168, + "step": 5301, + "time_per_iteration": 2.485152244567871 + }, + { + "auxiliary_loss_clip": 0.06484723, + "auxiliary_loss_mlp": 0.01274861, + "balance_loss_clip": 0.06291914, + "balance_loss_mlp": 0.01256777, + "epoch": 0.31877348564557345, + "flos": 21831789928320.0, + "grad_norm": 2.0482094969798696, + "language_loss": 0.7812382, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85883403, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.18066406, + "step": 5302, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01275814, + "balance_loss_clip": 0.06290714, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188336088982414, + "flos": 22021331863680.0, + "grad_norm": 1.6144767194600491, + "language_loss": 0.79736584, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8749572, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17651367, + "step": 5303, + "time_per_iteration": 2.5235095024108887 + }, + { + "auxiliary_loss_clip": 0.06497993, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290174, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188937321509094, + "flos": 20054116431360.0, + "grad_norm": 1.7320090718032515, + "language_loss": 0.73529422, + "learning_rate": 3.186539603020047e-06, + "loss": 0.81304312, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18737793, + "step": 5304, + "time_per_iteration": 2.5141329765319824 + }, + { + "auxiliary_loss_clip": 0.06481734, + "auxiliary_loss_mlp": 0.01278154, + "balance_loss_clip": 0.06290816, + "balance_loss_mlp": 0.01260928, + "epoch": 0.31895385540357735, + "flos": 25855135574400.0, + "grad_norm": 1.8091269764667626, + "language_loss": 0.72548914, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80308801, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.17236328, + "step": 5305, + "time_per_iteration": 2.5648975372314453 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06292576, + "balance_loss_mlp": 0.01254603, + "epoch": 0.3190139786562453, + "flos": 23484545786880.0, + "grad_norm": 2.116447005947582, + "language_loss": 0.64815247, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.72573221, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.16113281, + "step": 5306, + "time_per_iteration": 2.5745668411254883 + }, + { + "auxiliary_loss_clip": 0.06483987, + "auxiliary_loss_mlp": 0.01282676, + "balance_loss_clip": 0.0628574, + "balance_loss_mlp": 0.01264413, + "epoch": 0.3190741019089133, + "flos": 29103150591360.0, + "grad_norm": 2.0084949709877726, + "language_loss": 0.79260421, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.87027091, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18273926, + "step": 5307, + "time_per_iteration": 2.557509183883667 + }, + { + "auxiliary_loss_clip": 0.06481419, + "auxiliary_loss_mlp": 0.01278653, + "balance_loss_clip": 0.06289747, + "balance_loss_mlp": 0.01260736, + "epoch": 0.31913422516158124, + "flos": 17135361233280.0, + "grad_norm": 3.9021838038471097, + "language_loss": 0.78660965, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86421037, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17907715, + "step": 5308, + "time_per_iteration": 3.906280994415283 + }, + { + "auxiliary_loss_clip": 0.06493698, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06287338, + "balance_loss_mlp": 0.01259408, + "epoch": 0.3191943484142492, + "flos": 16075228176000.0, + "grad_norm": 3.1945469837170215, + "language_loss": 0.74758154, + "learning_rate": 3.184971450390961e-06, + "loss": 0.82530349, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.19091797, + "step": 5309, + "time_per_iteration": 2.4796438217163086 + }, + { + "auxiliary_loss_clip": 0.06480245, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06283399, + "balance_loss_mlp": 0.01257954, + "epoch": 0.3192544716669172, + "flos": 22972787775360.0, + "grad_norm": 1.6995242114780418, + "language_loss": 0.83242565, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90997577, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.16809082, + "step": 5310, + "time_per_iteration": 5.470219373703003 + }, + { + "auxiliary_loss_clip": 0.06475915, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06281388, + "balance_loss_mlp": 0.01255868, + "epoch": 0.31931459491958514, + "flos": 26877645348480.0, + "grad_norm": 1.407923936832892, + "language_loss": 0.78906345, + "learning_rate": 3.184343874716412e-06, + "loss": 0.86654651, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.1652832, + "step": 5311, + "time_per_iteration": 2.546112298965454 + }, + { + "auxiliary_loss_clip": 0.06477334, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.06282097, + "balance_loss_mlp": 0.01255254, + "epoch": 0.3193747181722531, + "flos": 21843194083200.0, + "grad_norm": 1.8192899238067177, + "language_loss": 0.84889889, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92639416, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16943359, + "step": 5312, + "time_per_iteration": 2.5534987449645996 + }, + { + "auxiliary_loss_clip": 0.06489489, + "auxiliary_loss_mlp": 0.01274677, + "balance_loss_clip": 0.06284228, + "balance_loss_mlp": 0.012567, + "epoch": 0.31943484142492107, + "flos": 18329593950720.0, + "grad_norm": 3.1557419136729536, + "language_loss": 0.79280984, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.87045145, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17980957, + "step": 5313, + "time_per_iteration": 2.47098445892334 + }, + { + "auxiliary_loss_clip": 0.06477478, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 0.06281047, + "balance_loss_mlp": 0.01256618, + "epoch": 0.31949496467758903, + "flos": 21622150212480.0, + "grad_norm": 2.7721598847405584, + "language_loss": 0.86245549, + "learning_rate": 3.183402174406057e-06, + "loss": 0.93997484, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17834473, + "step": 5314, + "time_per_iteration": 2.531196117401123 + }, + { + "auxiliary_loss_clip": 0.0647811, + "auxiliary_loss_mlp": 0.0127239, + "balance_loss_clip": 0.06281686, + "balance_loss_mlp": 0.01255188, + "epoch": 0.31955508793025705, + "flos": 21766312362240.0, + "grad_norm": 1.712027342879292, + "language_loss": 0.80238831, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8798933, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17199707, + "step": 5315, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.06485026, + "auxiliary_loss_mlp": 0.01283831, + "balance_loss_clip": 0.06286455, + "balance_loss_mlp": 0.01265854, + "epoch": 0.319615211182925, + "flos": 17169881915520.0, + "grad_norm": 2.687676993792702, + "language_loss": 0.67569852, + "learning_rate": 3.18277414980567e-06, + "loss": 0.75338709, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17980957, + "step": 5316, + "time_per_iteration": 3.943110942840576 + }, + { + "auxiliary_loss_clip": 0.0648303, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01255566, + "epoch": 0.319675334435593, + "flos": 28120653941760.0, + "grad_norm": 1.5692381446514811, + "language_loss": 0.69637752, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.77392983, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16650391, + "step": 5317, + "time_per_iteration": 2.642251491546631 + }, + { + "auxiliary_loss_clip": 0.06377298, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06285109, + "balance_loss_mlp": 0.01288716, + "epoch": 0.31973545768826095, + "flos": 69524235072000.0, + "grad_norm": 0.7198160842036254, + "language_loss": 0.5281924, + "learning_rate": 3.182145945801628e-06, + "loss": 0.60489094, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.03839111, + "step": 5318, + "time_per_iteration": 3.2718679904937744 + }, + { + "auxiliary_loss_clip": 0.06479475, + "auxiliary_loss_mlp": 0.01271921, + "balance_loss_clip": 0.0628712, + "balance_loss_mlp": 0.01254969, + "epoch": 0.3197955809409289, + "flos": 13704344899200.0, + "grad_norm": 1.5995609143402318, + "language_loss": 0.84504628, + "learning_rate": 3.181831776553012e-06, + "loss": 0.92256021, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.16955566, + "step": 5319, + "time_per_iteration": 2.5372629165649414 + }, + { + "auxiliary_loss_clip": 0.06480815, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 0.06286162, + "balance_loss_mlp": 0.01261199, + "epoch": 0.3198557041935969, + "flos": 33226368704640.0, + "grad_norm": 1.6136244255626262, + "language_loss": 0.64208525, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71968812, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.18273926, + "step": 5320, + "time_per_iteration": 2.675477981567383 + }, + { + "auxiliary_loss_clip": 0.0648189, + "auxiliary_loss_mlp": 0.01271878, + "balance_loss_clip": 0.06280586, + "balance_loss_mlp": 0.01254402, + "epoch": 0.31991582744626484, + "flos": 23738726747520.0, + "grad_norm": 1.9696222638037655, + "language_loss": 0.71059012, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.78812778, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.17480469, + "step": 5321, + "time_per_iteration": 2.6383230686187744 + }, + { + "auxiliary_loss_clip": 0.06491005, + "auxiliary_loss_mlp": 0.01288903, + "balance_loss_clip": 0.06286187, + "balance_loss_mlp": 0.01270318, + "epoch": 0.3199759506989328, + "flos": 18556633388160.0, + "grad_norm": 2.30981924299517, + "language_loss": 0.86988461, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94768369, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.18591309, + "step": 5322, + "time_per_iteration": 2.4862442016601562 + }, + { + "auxiliary_loss_clip": 0.0648296, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06285054, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3200360739516008, + "flos": 22425418978560.0, + "grad_norm": 1.6041292280722281, + "language_loss": 0.83380175, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.91136217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16369629, + "step": 5323, + "time_per_iteration": 2.5262420177459717 + }, + { + "auxiliary_loss_clip": 0.06476378, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01258529, + "epoch": 0.32009619720426874, + "flos": 20601569082240.0, + "grad_norm": 1.775654796490425, + "language_loss": 0.78471839, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.86226195, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.19433594, + "step": 5324, + "time_per_iteration": 2.492380380630493 + }, + { + "auxiliary_loss_clip": 0.06478705, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06283212, + "balance_loss_mlp": 0.01256042, + "epoch": 0.3201563204569367, + "flos": 18153049397760.0, + "grad_norm": 1.7224742254360714, + "language_loss": 0.80742848, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.88495719, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.18139648, + "step": 5325, + "time_per_iteration": 2.4962642192840576 + }, + { + "auxiliary_loss_clip": 0.06478769, + "auxiliary_loss_mlp": 0.01277308, + "balance_loss_clip": 0.06280222, + "balance_loss_mlp": 0.01259701, + "epoch": 0.32021644370960467, + "flos": 31691975137920.0, + "grad_norm": 1.8321318923341703, + "language_loss": 0.75898254, + "learning_rate": 3.179631337655037e-06, + "loss": 0.83654332, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17602539, + "step": 5326, + "time_per_iteration": 2.5752692222595215 + }, + { + "auxiliary_loss_clip": 0.06472234, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.01260918, + "epoch": 0.32027656696227264, + "flos": 26872488322560.0, + "grad_norm": 1.458996564995821, + "language_loss": 0.81400204, + "learning_rate": 3.179316810218701e-06, + "loss": 0.89150548, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.171875, + "step": 5327, + "time_per_iteration": 2.5635383129119873 + }, + { + "auxiliary_loss_clip": 0.06486546, + "auxiliary_loss_mlp": 0.01273421, + "balance_loss_clip": 0.062847, + "balance_loss_mlp": 0.01256207, + "epoch": 0.32033669021494066, + "flos": 24176705639040.0, + "grad_norm": 1.3787000535244864, + "language_loss": 0.77910948, + "learning_rate": 3.179002238062554e-06, + "loss": 0.85670912, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17211914, + "step": 5328, + "time_per_iteration": 2.514646053314209 + }, + { + "auxiliary_loss_clip": 0.06484267, + "auxiliary_loss_mlp": 0.01278516, + "balance_loss_clip": 0.06287045, + "balance_loss_mlp": 0.0125992, + "epoch": 0.3203968134676086, + "flos": 24467419779840.0, + "grad_norm": 1.5501370939230803, + "language_loss": 0.74267161, + "learning_rate": 3.178687621198524e-06, + "loss": 0.82029939, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.18591309, + "step": 5329, + "time_per_iteration": 2.5436654090881348 + }, + { + "auxiliary_loss_clip": 0.06471072, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06282842, + "balance_loss_mlp": 0.01262434, + "epoch": 0.3204569367202766, + "flos": 18010606256640.0, + "grad_norm": 1.7046636031855489, + "language_loss": 0.71222955, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.78972626, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16162109, + "step": 5330, + "time_per_iteration": 2.479647397994995 + }, + { + "auxiliary_loss_clip": 0.06485157, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.0125791, + "epoch": 0.32051705997294455, + "flos": 30597237544320.0, + "grad_norm": 1.705143811074938, + "language_loss": 0.80496192, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.88258511, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.19250488, + "step": 5331, + "time_per_iteration": 2.5741958618164062 + }, + { + "auxiliary_loss_clip": 0.06384323, + "auxiliary_loss_mlp": 0.0125803, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.01253741, + "epoch": 0.3205771832256125, + "flos": 68436723657600.0, + "grad_norm": 0.7949538218297083, + "language_loss": 0.5776577, + "learning_rate": 3.177743502478447e-06, + "loss": 0.65408123, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04293823, + "step": 5332, + "time_per_iteration": 3.084747314453125 + }, + { + "auxiliary_loss_clip": 0.06488422, + "auxiliary_loss_mlp": 0.01272523, + "balance_loss_clip": 0.06286052, + "balance_loss_mlp": 0.01255154, + "epoch": 0.3206373064782805, + "flos": 30451524094080.0, + "grad_norm": 1.5377704746044631, + "language_loss": 0.73702615, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81463563, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17358398, + "step": 5333, + "time_per_iteration": 2.6130683422088623 + }, + { + "auxiliary_loss_clip": 0.06480561, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 0.06284031, + "balance_loss_mlp": 0.01256246, + "epoch": 0.32069742973094845, + "flos": 22061051498880.0, + "grad_norm": 1.6882238799892797, + "language_loss": 0.70957875, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.78712052, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17382812, + "step": 5334, + "time_per_iteration": 2.5501654148101807 + }, + { + "auxiliary_loss_clip": 0.06476508, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06281763, + "balance_loss_mlp": 0.01257947, + "epoch": 0.3207575529836164, + "flos": 22060464520320.0, + "grad_norm": 1.723674002448169, + "language_loss": 0.77349097, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.85101908, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.18347168, + "step": 5335, + "time_per_iteration": 2.5194711685180664 + }, + { + "auxiliary_loss_clip": 0.06479798, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06285612, + "balance_loss_mlp": 0.0125889, + "epoch": 0.3208176762362844, + "flos": 34065961015680.0, + "grad_norm": 1.52521333905674, + "language_loss": 0.68891776, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.76647282, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.16809082, + "step": 5336, + "time_per_iteration": 2.6550848484039307 + }, + { + "auxiliary_loss_clip": 0.06481949, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06286713, + "balance_loss_mlp": 0.01268343, + "epoch": 0.32087779948895234, + "flos": 21805151529600.0, + "grad_norm": 1.6666772631518172, + "language_loss": 0.79367507, + "learning_rate": 3.176169078234487e-06, + "loss": 0.87135273, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17480469, + "step": 5337, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.06473362, + "auxiliary_loss_mlp": 0.01277197, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01260865, + "epoch": 0.3209379227416203, + "flos": 21440532487680.0, + "grad_norm": 1.6244255970978692, + "language_loss": 0.75145769, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.82896328, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16320801, + "step": 5338, + "time_per_iteration": 2.526841402053833 + }, + { + "auxiliary_loss_clip": 0.06482957, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 0.06285477, + "balance_loss_mlp": 0.01260216, + "epoch": 0.3209980459942883, + "flos": 25856267604480.0, + "grad_norm": 1.7965894601451369, + "language_loss": 0.63241929, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.7100262, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17504883, + "step": 5339, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.06482022, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06283947, + "balance_loss_mlp": 0.01255151, + "epoch": 0.32105816924695624, + "flos": 19105218069120.0, + "grad_norm": 2.418138513897033, + "language_loss": 0.81912339, + "learning_rate": 3.175223888387192e-06, + "loss": 0.89666009, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16491699, + "step": 5340, + "time_per_iteration": 2.5764145851135254 + }, + { + "auxiliary_loss_clip": 0.06475554, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06281976, + "balance_loss_mlp": 0.01254774, + "epoch": 0.3211182924996242, + "flos": 16587531239040.0, + "grad_norm": 1.7719401771551753, + "language_loss": 0.76604897, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.84352368, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.17150879, + "step": 5341, + "time_per_iteration": 2.505668878555298 + }, + { + "auxiliary_loss_clip": 0.06474154, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.0628191, + "balance_loss_mlp": 0.01255969, + "epoch": 0.3211784157522922, + "flos": 22678425982080.0, + "grad_norm": 1.4764530250267398, + "language_loss": 0.79422891, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.87169659, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16662598, + "step": 5342, + "time_per_iteration": 2.5391595363616943 + }, + { + "auxiliary_loss_clip": 0.06483465, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01260141, + "epoch": 0.3212385390049602, + "flos": 20565119756160.0, + "grad_norm": 2.45787142613039, + "language_loss": 0.75074786, + "learning_rate": 3.174278297458438e-06, + "loss": 0.82835722, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17321777, + "step": 5343, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.06479985, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01255043, + "epoch": 0.32129866225762815, + "flos": 24798188972160.0, + "grad_norm": 1.5494427093400844, + "language_loss": 0.82596725, + "learning_rate": 3.173963011408748e-06, + "loss": 0.9034878, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5344, + "time_per_iteration": 2.5672519207000732 + }, + { + "auxiliary_loss_clip": 0.06478736, + "auxiliary_loss_mlp": 0.01273821, + "balance_loss_clip": 0.06282513, + "balance_loss_mlp": 0.0125731, + "epoch": 0.3213587855102961, + "flos": 18372374259840.0, + "grad_norm": 1.9111940233558649, + "language_loss": 0.80321491, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8807404, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.16516113, + "step": 5345, + "time_per_iteration": 2.479442834854126 + }, + { + "auxiliary_loss_clip": 0.06478975, + "auxiliary_loss_mlp": 0.01271046, + "balance_loss_clip": 0.06283471, + "balance_loss_mlp": 0.01254321, + "epoch": 0.3214189087629641, + "flos": 27023274944640.0, + "grad_norm": 1.7019036305222461, + "language_loss": 0.83604348, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.9135437, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.16723633, + "step": 5346, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.0648382, + "auxiliary_loss_mlp": 0.01272196, + "balance_loss_clip": 0.06285056, + "balance_loss_mlp": 0.0125528, + "epoch": 0.32147903201563205, + "flos": 23154866697600.0, + "grad_norm": 1.4545038816344273, + "language_loss": 0.81656283, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.89412296, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16918945, + "step": 5347, + "time_per_iteration": 2.5096054077148438 + }, + { + "auxiliary_loss_clip": 0.06480029, + "auxiliary_loss_mlp": 0.01274054, + "balance_loss_clip": 0.06286772, + "balance_loss_mlp": 0.01256673, + "epoch": 0.3215391552683, + "flos": 16586231500800.0, + "grad_norm": 2.536962878441814, + "language_loss": 0.80386555, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.88140643, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.1739502, + "step": 5348, + "time_per_iteration": 3.9639015197753906 + }, + { + "auxiliary_loss_clip": 0.06474565, + "auxiliary_loss_mlp": 0.01276371, + "balance_loss_clip": 0.06280862, + "balance_loss_mlp": 0.01259431, + "epoch": 0.321599278520968, + "flos": 17827604939520.0, + "grad_norm": 2.026618804026968, + "language_loss": 0.85758352, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93509287, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.16943359, + "step": 5349, + "time_per_iteration": 3.8848202228546143 + }, + { + "auxiliary_loss_clip": 0.06481349, + "auxiliary_loss_mlp": 0.01274724, + "balance_loss_clip": 0.06286412, + "balance_loss_mlp": 0.01257022, + "epoch": 0.32165940177363594, + "flos": 16257097463040.0, + "grad_norm": 1.7607877661370477, + "language_loss": 0.8123306, + "learning_rate": 3.172070360676475e-06, + "loss": 0.88989133, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17700195, + "step": 5350, + "time_per_iteration": 3.9589500427246094 + }, + { + "auxiliary_loss_clip": 0.06471309, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06282239, + "balance_loss_mlp": 0.01255055, + "epoch": 0.3217195250263039, + "flos": 27607302702720.0, + "grad_norm": 1.8529018663543275, + "language_loss": 0.80116528, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.87858802, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.562232732772827 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06284767, + "balance_loss_mlp": 0.01256668, + "epoch": 0.3217796482789719, + "flos": 21477023740800.0, + "grad_norm": 2.0321110975992562, + "language_loss": 0.7641573, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84167361, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.18249512, + "step": 5352, + "time_per_iteration": 2.5320773124694824 + }, + { + "auxiliary_loss_clip": 0.0648407, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06291708, + "balance_loss_mlp": 0.0126133, + "epoch": 0.32183977153163984, + "flos": 21222046166400.0, + "grad_norm": 1.9188598206640457, + "language_loss": 0.82159722, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.89922154, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.17028809, + "step": 5353, + "time_per_iteration": 2.5061802864074707 + }, + { + "auxiliary_loss_clip": 0.06480308, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125837, + "epoch": 0.3218998947843078, + "flos": 24615103800960.0, + "grad_norm": 1.8505936463490174, + "language_loss": 0.74125177, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.81881344, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.17480469, + "step": 5354, + "time_per_iteration": 2.5725185871124268 + }, + { + "auxiliary_loss_clip": 0.06479903, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259951, + "epoch": 0.3219600180369758, + "flos": 22276686781440.0, + "grad_norm": 2.612968571970558, + "language_loss": 0.83769405, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.91526389, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5355, + "time_per_iteration": 3.985846757888794 + }, + { + "auxiliary_loss_clip": 0.0647967, + "auxiliary_loss_mlp": 0.01272253, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01255528, + "epoch": 0.3220201412896438, + "flos": 14944376672640.0, + "grad_norm": 1.8959584470465125, + "language_loss": 0.71344721, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.79096651, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.1673584, + "step": 5356, + "time_per_iteration": 2.5644400119781494 + }, + { + "auxiliary_loss_clip": 0.06494904, + "auxiliary_loss_mlp": 0.01280986, + "balance_loss_clip": 0.0629259, + "balance_loss_mlp": 0.01263367, + "epoch": 0.32208026454231176, + "flos": 22672807758720.0, + "grad_norm": 2.5335154176231525, + "language_loss": 0.67879629, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.7565552, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.546654224395752 + }, + { + "auxiliary_loss_clip": 0.06384487, + "auxiliary_loss_mlp": 0.01261366, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257649, + "epoch": 0.3221403877949797, + "flos": 64626273308160.0, + "grad_norm": 0.6824166316331671, + "language_loss": 0.58314437, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.65960288, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.03707886, + "step": 5358, + "time_per_iteration": 3.2290756702423096 + }, + { + "auxiliary_loss_clip": 0.06481851, + "auxiliary_loss_mlp": 0.01282518, + "balance_loss_clip": 0.06287378, + "balance_loss_mlp": 0.01264839, + "epoch": 0.3222005110476477, + "flos": 20163212847360.0, + "grad_norm": 1.9186908993809755, + "language_loss": 0.84190667, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.91955042, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.17675781, + "step": 5359, + "time_per_iteration": 2.531033754348755 + }, + { + "auxiliary_loss_clip": 0.06480163, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 0.06287846, + "balance_loss_mlp": 0.01260051, + "epoch": 0.32226063430031565, + "flos": 22680731969280.0, + "grad_norm": 1.6695480137557102, + "language_loss": 0.79997146, + "learning_rate": 3.168912388464595e-06, + "loss": 0.87754452, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17077637, + "step": 5360, + "time_per_iteration": 2.544461727142334 + }, + { + "auxiliary_loss_clip": 0.06382456, + "auxiliary_loss_mlp": 0.01256795, + "balance_loss_clip": 0.06292457, + "balance_loss_mlp": 0.01253353, + "epoch": 0.3223207575529836, + "flos": 63847798151040.0, + "grad_norm": 0.6356253914940931, + "language_loss": 0.56731617, + "learning_rate": 3.168596347256737e-06, + "loss": 0.64370871, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03451538, + "step": 5361, + "time_per_iteration": 3.0336568355560303 + }, + { + "auxiliary_loss_clip": 0.06478466, + "auxiliary_loss_mlp": 0.01277797, + "balance_loss_clip": 0.06288562, + "balance_loss_mlp": 0.01261346, + "epoch": 0.3223808808056516, + "flos": 26877393786240.0, + "grad_norm": 2.167930910708006, + "language_loss": 0.71792114, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79548371, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.16442871, + "step": 5362, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.06483887, + "auxiliary_loss_mlp": 0.01279203, + "balance_loss_clip": 0.06293412, + "balance_loss_mlp": 0.01262692, + "epoch": 0.32244100405831955, + "flos": 26768716640640.0, + "grad_norm": 1.5327886568658977, + "language_loss": 0.73854291, + "learning_rate": 3.167964131913135e-06, + "loss": 0.81617379, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.16503906, + "step": 5363, + "time_per_iteration": 2.583064556121826 + }, + { + "auxiliary_loss_clip": 0.06489229, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06291971, + "balance_loss_mlp": 0.01258717, + "epoch": 0.3225011273109875, + "flos": 23809403266560.0, + "grad_norm": 2.354374584633167, + "language_loss": 0.76664144, + "learning_rate": 3.167647957801365e-06, + "loss": 0.84428835, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16748047, + "step": 5364, + "time_per_iteration": 2.5177268981933594 + }, + { + "auxiliary_loss_clip": 0.06479897, + "auxiliary_loss_mlp": 0.01275674, + "balance_loss_clip": 0.06290577, + "balance_loss_mlp": 0.01259473, + "epoch": 0.3225612505636555, + "flos": 17280194215680.0, + "grad_norm": 2.1891061142162327, + "language_loss": 0.7715044, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.84906018, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.1619873, + "step": 5365, + "time_per_iteration": 2.5122928619384766 + }, + { + "auxiliary_loss_clip": 0.06484331, + "auxiliary_loss_mlp": 0.01277663, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.01260711, + "epoch": 0.32262137381632344, + "flos": 23372724113280.0, + "grad_norm": 2.314444268247813, + "language_loss": 0.77153468, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.84915465, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.16955566, + "step": 5366, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.06481092, + "auxiliary_loss_mlp": 0.01280366, + "balance_loss_clip": 0.0629226, + "balance_loss_mlp": 0.0126388, + "epoch": 0.3226814970689914, + "flos": 23265598268160.0, + "grad_norm": 1.8642315088319754, + "language_loss": 0.72423649, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80185115, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.16491699, + "step": 5367, + "time_per_iteration": 2.544145345687866 + }, + { + "auxiliary_loss_clip": 0.06480073, + "auxiliary_loss_mlp": 0.01278287, + "balance_loss_clip": 0.06290721, + "balance_loss_mlp": 0.01262248, + "epoch": 0.32274162032165943, + "flos": 16400127582720.0, + "grad_norm": 1.9542840286813894, + "language_loss": 0.74559301, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82317662, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16033936, + "step": 5368, + "time_per_iteration": 2.4653942584991455 + }, + { + "auxiliary_loss_clip": 0.06481207, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01255874, + "epoch": 0.3228017435743274, + "flos": 27862489912320.0, + "grad_norm": 2.016369988637382, + "language_loss": 0.79033995, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86786628, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.15533447, + "step": 5369, + "time_per_iteration": 2.6923141479492188 + }, + { + "auxiliary_loss_clip": 0.06471382, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.0628759, + "balance_loss_mlp": 0.01264567, + "epoch": 0.32286186682699536, + "flos": 19614712020480.0, + "grad_norm": 1.8619928029866217, + "language_loss": 0.83607441, + "learning_rate": 3.16574998372661e-06, + "loss": 0.91358972, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15576172, + "step": 5370, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.06481104, + "auxiliary_loss_mlp": 0.01278081, + "balance_loss_clip": 0.062904, + "balance_loss_mlp": 0.01262703, + "epoch": 0.3229219900796633, + "flos": 24140885218560.0, + "grad_norm": 2.7780356443351146, + "language_loss": 0.83346975, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.91106164, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15368652, + "step": 5371, + "time_per_iteration": 2.554034948348999 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 0.0629211, + "balance_loss_mlp": 0.01260434, + "epoch": 0.3229821133323313, + "flos": 17754454725120.0, + "grad_norm": 2.279534384310274, + "language_loss": 0.89153087, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96917808, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17663574, + "step": 5372, + "time_per_iteration": 2.468693971633911 + }, + { + "auxiliary_loss_clip": 0.06478924, + "auxiliary_loss_mlp": 0.01278448, + "balance_loss_clip": 0.06288313, + "balance_loss_mlp": 0.01261843, + "epoch": 0.32304223658499925, + "flos": 22352562253440.0, + "grad_norm": 1.986067660558338, + "language_loss": 0.730793, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.80836678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16601562, + "step": 5373, + "time_per_iteration": 2.5757906436920166 + }, + { + "auxiliary_loss_clip": 0.06476311, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06293686, + "balance_loss_mlp": 0.01258227, + "epoch": 0.3231023598376672, + "flos": 18484154006400.0, + "grad_norm": 2.1970042176000963, + "language_loss": 0.82592154, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.90342778, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.1607666, + "step": 5374, + "time_per_iteration": 2.4853713512420654 + }, + { + "auxiliary_loss_clip": 0.06474404, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06291121, + "balance_loss_mlp": 0.0125544, + "epoch": 0.3231624830903352, + "flos": 27643710101760.0, + "grad_norm": 1.9120740622639463, + "language_loss": 0.88405079, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.96150708, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15783691, + "step": 5375, + "time_per_iteration": 2.58644700050354 + }, + { + "auxiliary_loss_clip": 0.06483716, + "auxiliary_loss_mlp": 0.01275166, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01258799, + "epoch": 0.32322260634300315, + "flos": 21732965637120.0, + "grad_norm": 2.2884949024183983, + "language_loss": 0.76224899, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.83983773, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.16345215, + "step": 5376, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.0647772, + "auxiliary_loss_mlp": 0.01272254, + "balance_loss_clip": 0.06289793, + "balance_loss_mlp": 0.01256649, + "epoch": 0.3232827295956711, + "flos": 22644198789120.0, + "grad_norm": 1.5259481118475857, + "language_loss": 0.67275858, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.75025833, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.15594482, + "step": 5377, + "time_per_iteration": 2.592737913131714 + }, + { + "auxiliary_loss_clip": 0.06482306, + "auxiliary_loss_mlp": 0.01279693, + "balance_loss_clip": 0.06294581, + "balance_loss_mlp": 0.01262158, + "epoch": 0.3233428528483391, + "flos": 26329731500160.0, + "grad_norm": 1.747214931760967, + "language_loss": 0.73022175, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.80784178, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17541504, + "step": 5378, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.06476232, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 0.06289409, + "balance_loss_mlp": 0.01256598, + "epoch": 0.32340297610100704, + "flos": 28592818099200.0, + "grad_norm": 2.0362074337070832, + "language_loss": 0.82332939, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90081334, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5379, + "time_per_iteration": 2.661787986755371 + }, + { + "auxiliary_loss_clip": 0.06481552, + "auxiliary_loss_mlp": 0.01276474, + "balance_loss_clip": 0.06288823, + "balance_loss_mlp": 0.01260548, + "epoch": 0.323463099353675, + "flos": 30781664380800.0, + "grad_norm": 1.6212615798097256, + "language_loss": 0.78942055, + "learning_rate": 3.162583158454388e-06, + "loss": 0.86700082, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15942383, + "step": 5380, + "time_per_iteration": 2.593618631362915 + }, + { + "auxiliary_loss_clip": 0.06489569, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 0.06298643, + "balance_loss_mlp": 0.01255368, + "epoch": 0.32352322260634303, + "flos": 25235664739200.0, + "grad_norm": 1.685322069138263, + "language_loss": 0.77853882, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.85615522, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16699219, + "step": 5381, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01269308, + "balance_loss_clip": 0.06292967, + "balance_loss_mlp": 0.01255438, + "epoch": 0.323583345859011, + "flos": 23337071400960.0, + "grad_norm": 1.9004028984655497, + "language_loss": 0.72391021, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80136859, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.13848877, + "step": 5382, + "time_per_iteration": 2.5095293521881104 + }, + { + "auxiliary_loss_clip": 0.06488711, + "auxiliary_loss_mlp": 0.01277606, + "balance_loss_clip": 0.06295708, + "balance_loss_mlp": 0.01262157, + "epoch": 0.32364346911167896, + "flos": 26213675195520.0, + "grad_norm": 2.3447859303702883, + "language_loss": 0.71528596, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.15441895, + "step": 5383, + "time_per_iteration": 2.5806562900543213 + }, + { + "auxiliary_loss_clip": 0.06476977, + "auxiliary_loss_mlp": 0.01276799, + "balance_loss_clip": 0.06292375, + "balance_loss_mlp": 0.01261564, + "epoch": 0.3237035923643469, + "flos": 23702487056640.0, + "grad_norm": 1.948915226701978, + "language_loss": 0.78857487, + "learning_rate": 3.161315193285283e-06, + "loss": 0.86611259, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.15234375, + "step": 5384, + "time_per_iteration": 2.548797369003296 + }, + { + "auxiliary_loss_clip": 0.06481218, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06288576, + "balance_loss_mlp": 0.0125793, + "epoch": 0.3237637156170149, + "flos": 14433960326400.0, + "grad_norm": 1.885180362402172, + "language_loss": 0.75034815, + "learning_rate": 3.16099809186998e-06, + "loss": 0.82790792, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16821289, + "step": 5385, + "time_per_iteration": 2.577547073364258 + }, + { + "auxiliary_loss_clip": 0.06486371, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01255032, + "epoch": 0.32382383886968286, + "flos": 31070449877760.0, + "grad_norm": 1.8174179211363362, + "language_loss": 0.72224641, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.79981083, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.15032959, + "step": 5386, + "time_per_iteration": 2.585822820663452 + }, + { + "auxiliary_loss_clip": 0.06485418, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.06292341, + "balance_loss_mlp": 0.01256803, + "epoch": 0.3238839621223508, + "flos": 23263418062080.0, + "grad_norm": 3.182973165751226, + "language_loss": 0.95573068, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.03331804, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16503906, + "step": 5387, + "time_per_iteration": 4.075104236602783 + }, + { + "auxiliary_loss_clip": 0.06490889, + "auxiliary_loss_mlp": 0.01270509, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.0125376, + "epoch": 0.3239440853750188, + "flos": 22971026839680.0, + "grad_norm": 2.142304582151843, + "language_loss": 0.78141761, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.85903162, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5388, + "time_per_iteration": 2.623976707458496 + }, + { + "auxiliary_loss_clip": 0.06478786, + "auxiliary_loss_mlp": 0.01276501, + "balance_loss_clip": 0.06289905, + "balance_loss_mlp": 0.01259704, + "epoch": 0.32400420862768675, + "flos": 36255394275840.0, + "grad_norm": 1.9954909505528162, + "language_loss": 0.71735168, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79490453, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16796875, + "step": 5389, + "time_per_iteration": 4.133269309997559 + }, + { + "auxiliary_loss_clip": 0.06479806, + "auxiliary_loss_mlp": 0.01273464, + "balance_loss_clip": 0.06294239, + "balance_loss_mlp": 0.01257872, + "epoch": 0.3240643318803547, + "flos": 21622946826240.0, + "grad_norm": 1.7464997421167434, + "language_loss": 0.81443554, + "learning_rate": 3.159411924656557e-06, + "loss": 0.89196825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15588379, + "step": 5390, + "time_per_iteration": 3.9378364086151123 + }, + { + "auxiliary_loss_clip": 0.06491944, + "auxiliary_loss_mlp": 0.01278594, + "balance_loss_clip": 0.06301276, + "balance_loss_mlp": 0.01261296, + "epoch": 0.3241244551330227, + "flos": 23302466864640.0, + "grad_norm": 1.9807661160762629, + "language_loss": 0.73182476, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80953014, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.1730957, + "step": 5391, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.06482222, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.06294864, + "balance_loss_mlp": 0.0126241, + "epoch": 0.32418457838569065, + "flos": 14101891395840.0, + "grad_norm": 1.5457442510257688, + "language_loss": 0.77541089, + "learning_rate": 3.158777149931855e-06, + "loss": 0.85302216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16491699, + "step": 5392, + "time_per_iteration": 2.486161470413208 + }, + { + "auxiliary_loss_clip": 0.06490408, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.0629712, + "balance_loss_mlp": 0.01261411, + "epoch": 0.3242447016383586, + "flos": 29760454344960.0, + "grad_norm": 1.849936210081937, + "language_loss": 0.63213563, + "learning_rate": 3.158459696652067e-06, + "loss": 0.70982158, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.16760254, + "step": 5393, + "time_per_iteration": 2.5853707790374756 + }, + { + "auxiliary_loss_clip": 0.06489256, + "auxiliary_loss_mlp": 0.01282677, + "balance_loss_clip": 0.06301466, + "balance_loss_mlp": 0.01266011, + "epoch": 0.3243048248910266, + "flos": 24357820239360.0, + "grad_norm": 1.7023503315224988, + "language_loss": 0.82889545, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90661478, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16674805, + "step": 5394, + "time_per_iteration": 3.946955680847168 + }, + { + "auxiliary_loss_clip": 0.06480435, + "auxiliary_loss_mlp": 0.01285084, + "balance_loss_clip": 0.06298714, + "balance_loss_mlp": 0.01269825, + "epoch": 0.3243649481436946, + "flos": 24359958518400.0, + "grad_norm": 2.1573093021253333, + "language_loss": 0.82280314, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90045834, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15270996, + "step": 5395, + "time_per_iteration": 2.537313222885132 + }, + { + "auxiliary_loss_clip": 0.06480338, + "auxiliary_loss_mlp": 0.01292267, + "balance_loss_clip": 0.06300412, + "balance_loss_mlp": 0.01276424, + "epoch": 0.32442507139636256, + "flos": 22931097569280.0, + "grad_norm": 1.7302006802896392, + "language_loss": 0.839818, + "learning_rate": 3.157507073287417e-06, + "loss": 0.91754401, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15844727, + "step": 5396, + "time_per_iteration": 2.6440067291259766 + }, + { + "auxiliary_loss_clip": 0.06491997, + "auxiliary_loss_mlp": 0.01291538, + "balance_loss_clip": 0.06299315, + "balance_loss_mlp": 0.01274121, + "epoch": 0.32448519464903053, + "flos": 22206723022080.0, + "grad_norm": 1.8684779143202024, + "language_loss": 0.76113403, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.83896935, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.17419434, + "step": 5397, + "time_per_iteration": 2.506601095199585 + }, + { + "auxiliary_loss_clip": 0.06473789, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06290997, + "balance_loss_mlp": 0.0127387, + "epoch": 0.3245453179016985, + "flos": 18843574095360.0, + "grad_norm": 2.304762567896747, + "language_loss": 0.67975587, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.75739866, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1661377, + "step": 5398, + "time_per_iteration": 2.50168514251709 + }, + { + "auxiliary_loss_clip": 0.06478744, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06293125, + "balance_loss_mlp": 0.01272189, + "epoch": 0.32460544115436646, + "flos": 21184716372480.0, + "grad_norm": 1.3685049489713428, + "language_loss": 0.73232323, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16748047, + "step": 5399, + "time_per_iteration": 2.5114216804504395 + }, + { + "auxiliary_loss_clip": 0.0648094, + "auxiliary_loss_mlp": 0.01289931, + "balance_loss_clip": 0.06293677, + "balance_loss_mlp": 0.01273241, + "epoch": 0.3246655644070344, + "flos": 21987607795200.0, + "grad_norm": 2.072173153822147, + "language_loss": 0.71044981, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.78815848, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16687012, + "step": 5400, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.06480449, + "auxiliary_loss_mlp": 0.01279651, + "balance_loss_clip": 0.06289301, + "balance_loss_mlp": 0.01263355, + "epoch": 0.3247256876597024, + "flos": 32167745020800.0, + "grad_norm": 2.104371315429844, + "language_loss": 0.80626661, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88386756, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16296387, + "step": 5401, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.06483636, + "auxiliary_loss_mlp": 0.01281263, + "balance_loss_clip": 0.06294005, + "balance_loss_mlp": 0.01264073, + "epoch": 0.32478581091237035, + "flos": 21004104896640.0, + "grad_norm": 1.4796090680940444, + "language_loss": 0.87935805, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.95700705, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.17175293, + "step": 5402, + "time_per_iteration": 2.5548956394195557 + }, + { + "auxiliary_loss_clip": 0.06474966, + "auxiliary_loss_mlp": 0.0127368, + "balance_loss_clip": 0.06291528, + "balance_loss_mlp": 0.01258767, + "epoch": 0.3248459341650383, + "flos": 17929741466880.0, + "grad_norm": 2.584856005153906, + "language_loss": 0.85243386, + "learning_rate": 3.155282749751332e-06, + "loss": 0.92992032, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14904785, + "step": 5403, + "time_per_iteration": 2.479205369949341 + }, + { + "auxiliary_loss_clip": 0.06468324, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01262667, + "epoch": 0.3249060574177063, + "flos": 24542582492160.0, + "grad_norm": 2.1052258035485214, + "language_loss": 0.8828373, + "learning_rate": 3.154964813916007e-06, + "loss": 0.96029389, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14672852, + "step": 5404, + "time_per_iteration": 2.5845093727111816 + }, + { + "auxiliary_loss_clip": 0.06473936, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01259368, + "epoch": 0.32496618067037425, + "flos": 26001939127680.0, + "grad_norm": 1.6833557203411496, + "language_loss": 0.72900558, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80650264, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1640625, + "step": 5405, + "time_per_iteration": 2.542433500289917 + }, + { + "auxiliary_loss_clip": 0.06474283, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06290333, + "balance_loss_mlp": 0.01258264, + "epoch": 0.3250263039230422, + "flos": 19579939776000.0, + "grad_norm": 1.7320098663924197, + "language_loss": 0.83355331, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.91103297, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15435791, + "step": 5406, + "time_per_iteration": 2.591207265853882 + }, + { + "auxiliary_loss_clip": 0.06474167, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06290454, + "balance_loss_mlp": 0.01254949, + "epoch": 0.3250864271757102, + "flos": 16769232817920.0, + "grad_norm": 2.13827452533593, + "language_loss": 0.87879711, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.95623994, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15161133, + "step": 5407, + "time_per_iteration": 2.4856173992156982 + }, + { + "auxiliary_loss_clip": 0.06469748, + "auxiliary_loss_mlp": 0.01276836, + "balance_loss_clip": 0.06284758, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3251465504283782, + "flos": 27827004908160.0, + "grad_norm": 2.430972813034592, + "language_loss": 0.69975567, + "learning_rate": 3.153692632731479e-06, + "loss": 0.77722144, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15649414, + "step": 5408, + "time_per_iteration": 2.5838799476623535 + }, + { + "auxiliary_loss_clip": 0.06481153, + "auxiliary_loss_mlp": 0.01282988, + "balance_loss_clip": 0.06286341, + "balance_loss_mlp": 0.01267396, + "epoch": 0.32520667368104617, + "flos": 19069271867520.0, + "grad_norm": 3.909403651515765, + "language_loss": 0.78053123, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85817266, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.15588379, + "step": 5409, + "time_per_iteration": 2.5178377628326416 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01272582, + "balance_loss_clip": 0.06286227, + "balance_loss_mlp": 0.01256202, + "epoch": 0.32526679693371413, + "flos": 29388917341440.0, + "grad_norm": 1.8050072916987376, + "language_loss": 0.83473468, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.91219985, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16381836, + "step": 5410, + "time_per_iteration": 2.5948092937469482 + }, + { + "auxiliary_loss_clip": 0.06466505, + "auxiliary_loss_mlp": 0.01275621, + "balance_loss_clip": 0.06286819, + "balance_loss_mlp": 0.01261274, + "epoch": 0.3253269201863821, + "flos": 20710833206400.0, + "grad_norm": 1.580323990141508, + "language_loss": 0.72005814, + "learning_rate": 3.152738037445405e-06, + "loss": 0.79747939, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14349365, + "step": 5411, + "time_per_iteration": 2.515542507171631 + }, + { + "auxiliary_loss_clip": 0.06472497, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06287136, + "balance_loss_mlp": 0.01261632, + "epoch": 0.32538704343905006, + "flos": 29101515436800.0, + "grad_norm": 1.470162471805647, + "language_loss": 0.83496881, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.91246504, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15490723, + "step": 5412, + "time_per_iteration": 2.55008602142334 + }, + { + "auxiliary_loss_clip": 0.06476887, + "auxiliary_loss_mlp": 0.01277617, + "balance_loss_clip": 0.06287435, + "balance_loss_mlp": 0.01260904, + "epoch": 0.325447166691718, + "flos": 24682216521600.0, + "grad_norm": 1.5504273053971407, + "language_loss": 0.8129071, + "learning_rate": 3.152101422008203e-06, + "loss": 0.89045215, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16711426, + "step": 5413, + "time_per_iteration": 2.54195499420166 + }, + { + "auxiliary_loss_clip": 0.06477104, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.0628976, + "balance_loss_mlp": 0.01261643, + "epoch": 0.325507289944386, + "flos": 21549503122560.0, + "grad_norm": 1.5527044192655586, + "language_loss": 0.76985061, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84740174, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16363525, + "step": 5414, + "time_per_iteration": 2.5435919761657715 + }, + { + "auxiliary_loss_clip": 0.063807, + "auxiliary_loss_mlp": 0.01284661, + "balance_loss_clip": 0.06291388, + "balance_loss_mlp": 0.01280793, + "epoch": 0.32556741319705396, + "flos": 71537893194240.0, + "grad_norm": 0.9015335749308697, + "language_loss": 0.64095414, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.71760774, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.03863525, + "step": 5415, + "time_per_iteration": 3.0875957012176514 + }, + { + "auxiliary_loss_clip": 0.0647157, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06285933, + "balance_loss_mlp": 0.01258845, + "epoch": 0.3256275364497219, + "flos": 23739187944960.0, + "grad_norm": 1.4815485577141352, + "language_loss": 0.74123245, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16235352, + "step": 5416, + "time_per_iteration": 2.5792665481567383 + }, + { + "auxiliary_loss_clip": 0.06381539, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01262769, + "epoch": 0.3256876597023899, + "flos": 67308136214400.0, + "grad_norm": 0.7704887993649999, + "language_loss": 0.57850802, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.65498912, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.03793335, + "step": 5417, + "time_per_iteration": 3.2770884037017822 + }, + { + "auxiliary_loss_clip": 0.06373264, + "auxiliary_loss_mlp": 0.01258837, + "balance_loss_clip": 0.06284805, + "balance_loss_mlp": 0.01254933, + "epoch": 0.32574778295505785, + "flos": 71304633826560.0, + "grad_norm": 0.8775074523137479, + "language_loss": 0.63674986, + "learning_rate": 3.150509119089975e-06, + "loss": 0.71307087, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03900146, + "step": 5418, + "time_per_iteration": 3.315948724746704 + }, + { + "auxiliary_loss_clip": 0.06476019, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 0.06290952, + "balance_loss_mlp": 0.01258111, + "epoch": 0.3258079062077258, + "flos": 20782515974400.0, + "grad_norm": 1.8847025208507953, + "language_loss": 0.6957128, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.77320766, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.15344238, + "step": 5419, + "time_per_iteration": 2.5722780227661133 + }, + { + "auxiliary_loss_clip": 0.06480842, + "auxiliary_loss_mlp": 0.01275789, + "balance_loss_clip": 0.06291591, + "balance_loss_mlp": 0.01260006, + "epoch": 0.3258680294603938, + "flos": 22241788755840.0, + "grad_norm": 2.023173952709465, + "language_loss": 0.77398664, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.85155296, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.15783691, + "step": 5420, + "time_per_iteration": 2.5199873447418213 + }, + { + "auxiliary_loss_clip": 0.06478356, + "auxiliary_loss_mlp": 0.0127343, + "balance_loss_clip": 0.06290038, + "balance_loss_mlp": 0.0125798, + "epoch": 0.3259281527130618, + "flos": 26987328743040.0, + "grad_norm": 1.5124533627457746, + "language_loss": 0.80826706, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.88578492, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15441895, + "step": 5421, + "time_per_iteration": 2.6014363765716553 + }, + { + "auxiliary_loss_clip": 0.06476312, + "auxiliary_loss_mlp": 0.0127337, + "balance_loss_clip": 0.06293876, + "balance_loss_mlp": 0.01258982, + "epoch": 0.32598827596572977, + "flos": 26221557479040.0, + "grad_norm": 1.4846059645471, + "language_loss": 0.76098251, + "learning_rate": 3.149234491389381e-06, + "loss": 0.8384794, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1439209, + "step": 5422, + "time_per_iteration": 2.5738978385925293 + }, + { + "auxiliary_loss_clip": 0.06480287, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06288645, + "balance_loss_mlp": 0.01255095, + "epoch": 0.32604839921839773, + "flos": 17645567944320.0, + "grad_norm": 2.282982793788361, + "language_loss": 0.63826233, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.71577179, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 5423, + "time_per_iteration": 2.5513644218444824 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01273816, + "balance_loss_clip": 0.06290927, + "balance_loss_mlp": 0.01258748, + "epoch": 0.3261085224710657, + "flos": 23629420696320.0, + "grad_norm": 1.6690467832946037, + "language_loss": 0.75170749, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82914186, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1505127, + "step": 5424, + "time_per_iteration": 2.546074151992798 + }, + { + "auxiliary_loss_clip": 0.06470636, + "auxiliary_loss_mlp": 0.01274311, + "balance_loss_clip": 0.06288706, + "balance_loss_mlp": 0.01258945, + "epoch": 0.32616864572373366, + "flos": 23267526912000.0, + "grad_norm": 1.6415169459291201, + "language_loss": 0.7718606, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.84931004, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15368652, + "step": 5425, + "time_per_iteration": 2.5883710384368896 + }, + { + "auxiliary_loss_clip": 0.06476015, + "auxiliary_loss_mlp": 0.01273254, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32622876897640163, + "flos": 25600535343360.0, + "grad_norm": 2.4681515054731924, + "language_loss": 0.78599709, + "learning_rate": 3.147959166423428e-06, + "loss": 0.86348987, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.16650391, + "step": 5426, + "time_per_iteration": 2.569566488265991 + }, + { + "auxiliary_loss_clip": 0.06473041, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06286261, + "balance_loss_mlp": 0.0125749, + "epoch": 0.3262888922290696, + "flos": 22425544759680.0, + "grad_norm": 1.6671872965592953, + "language_loss": 0.74719262, + "learning_rate": 3.147640226324893e-06, + "loss": 0.82465363, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5427, + "time_per_iteration": 3.941770315170288 + }, + { + "auxiliary_loss_clip": 0.06474692, + "auxiliary_loss_mlp": 0.0127251, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256154, + "epoch": 0.32634901548173756, + "flos": 19724982393600.0, + "grad_norm": 2.0508761677602965, + "language_loss": 0.79472262, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.87219465, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16357422, + "step": 5428, + "time_per_iteration": 3.9950850009918213 + }, + { + "auxiliary_loss_clip": 0.06475013, + "auxiliary_loss_mlp": 0.01275116, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01259309, + "epoch": 0.3264091387344055, + "flos": 16148336463360.0, + "grad_norm": 1.5445825374219135, + "language_loss": 0.71770716, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79520845, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15808105, + "step": 5429, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06468233, + "auxiliary_loss_mlp": 0.01269844, + "balance_loss_clip": 0.06283497, + "balance_loss_mlp": 0.01254466, + "epoch": 0.3264692619870735, + "flos": 16404655703040.0, + "grad_norm": 1.5791835311639297, + "language_loss": 0.78689212, + "learning_rate": 3.146683144965881e-06, + "loss": 0.86427283, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5430, + "time_per_iteration": 2.4873790740966797 + }, + { + "auxiliary_loss_clip": 0.06468185, + "auxiliary_loss_mlp": 0.0127668, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01259561, + "epoch": 0.32652938523974145, + "flos": 22388843871360.0, + "grad_norm": 1.9481749952405665, + "language_loss": 0.84556186, + "learning_rate": 3.146364030865399e-06, + "loss": 0.92301053, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17126465, + "step": 5431, + "time_per_iteration": 2.522075653076172 + }, + { + "auxiliary_loss_clip": 0.06468672, + "auxiliary_loss_mlp": 0.01274085, + "balance_loss_clip": 0.06286903, + "balance_loss_mlp": 0.01259327, + "epoch": 0.3265895084924094, + "flos": 21914499507840.0, + "grad_norm": 1.6266920997971765, + "language_loss": 0.71123517, + "learning_rate": 3.146044873294678e-06, + "loss": 0.78866279, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14758301, + "step": 5432, + "time_per_iteration": 2.513209104537964 + }, + { + "auxiliary_loss_clip": 0.06469099, + "auxiliary_loss_mlp": 0.01272277, + "balance_loss_clip": 0.06282821, + "balance_loss_mlp": 0.01257424, + "epoch": 0.3266496317450774, + "flos": 16072083648000.0, + "grad_norm": 1.3982751613904698, + "language_loss": 0.84207368, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91948748, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.14855957, + "step": 5433, + "time_per_iteration": 2.5324172973632812 + }, + { + "auxiliary_loss_clip": 0.06463822, + "auxiliary_loss_mlp": 0.01279207, + "balance_loss_clip": 0.06283711, + "balance_loss_mlp": 0.01264049, + "epoch": 0.3267097549977454, + "flos": 22534766956800.0, + "grad_norm": 1.4562075652627795, + "language_loss": 0.85916972, + "learning_rate": 3.145406427790931e-06, + "loss": 0.93660003, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15155029, + "step": 5434, + "time_per_iteration": 3.9434614181518555 + }, + { + "auxiliary_loss_clip": 0.06468898, + "auxiliary_loss_mlp": 0.01277076, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.0126134, + "epoch": 0.32676987825041337, + "flos": 27277581686400.0, + "grad_norm": 1.6909362765146225, + "language_loss": 0.88470823, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.96216792, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1574707, + "step": 5435, + "time_per_iteration": 2.5430006980895996 + }, + { + "auxiliary_loss_clip": 0.06469613, + "auxiliary_loss_mlp": 0.01271625, + "balance_loss_clip": 0.06283396, + "balance_loss_mlp": 0.01256306, + "epoch": 0.32683000150308134, + "flos": 11512731432960.0, + "grad_norm": 2.3091497119382733, + "language_loss": 0.77129918, + "learning_rate": 3.144767808551479e-06, + "loss": 0.84871155, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 5436, + "time_per_iteration": 2.486003875732422 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01277236, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01261977, + "epoch": 0.3268901247557493, + "flos": 25637362012800.0, + "grad_norm": 1.5303988762112921, + "language_loss": 0.72448635, + "learning_rate": 3.144448433811134e-06, + "loss": 0.80190074, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15270996, + "step": 5437, + "time_per_iteration": 2.545548915863037 + }, + { + "auxiliary_loss_clip": 0.06472606, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06282267, + "balance_loss_mlp": 0.01258253, + "epoch": 0.32695024800841727, + "flos": 24867356117760.0, + "grad_norm": 1.604360978002023, + "language_loss": 0.64194709, + "learning_rate": 3.144129015673189e-06, + "loss": 0.71942323, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16760254, + "step": 5438, + "time_per_iteration": 2.5657694339752197 + }, + { + "auxiliary_loss_clip": 0.06462848, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01257246, + "epoch": 0.32701037126108523, + "flos": 28846663643520.0, + "grad_norm": 1.637174889107761, + "language_loss": 0.74795192, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.82531083, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15795898, + "step": 5439, + "time_per_iteration": 2.5655689239501953 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01257087, + "epoch": 0.3270704945137532, + "flos": 27972592577280.0, + "grad_norm": 1.745503595629167, + "language_loss": 0.74950606, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1541748, + "step": 5440, + "time_per_iteration": 2.601821184158325 + }, + { + "auxiliary_loss_clip": 0.06460315, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06277528, + "balance_loss_mlp": 0.01254947, + "epoch": 0.32713061776642116, + "flos": 23696575344000.0, + "grad_norm": 1.95462638600934, + "language_loss": 0.84695202, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.92425048, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.14599609, + "step": 5441, + "time_per_iteration": 2.5020570755004883 + }, + { + "auxiliary_loss_clip": 0.06466734, + "auxiliary_loss_mlp": 0.01272021, + "balance_loss_clip": 0.06280614, + "balance_loss_mlp": 0.01256798, + "epoch": 0.3271907410190891, + "flos": 22462203720960.0, + "grad_norm": 1.9620532707625304, + "language_loss": 0.86928713, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.9466747, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15222168, + "step": 5442, + "time_per_iteration": 2.5388059616088867 + }, + { + "auxiliary_loss_clip": 0.06470812, + "auxiliary_loss_mlp": 0.0126936, + "balance_loss_clip": 0.06282146, + "balance_loss_mlp": 0.01254399, + "epoch": 0.3272508642717571, + "flos": 22826696981760.0, + "grad_norm": 1.5979656279548642, + "language_loss": 0.77388418, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.85128593, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.1496582, + "step": 5443, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.0646731, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06280384, + "balance_loss_mlp": 0.01255518, + "epoch": 0.32731098752442506, + "flos": 11806086977280.0, + "grad_norm": 2.2200780771744073, + "language_loss": 0.82818562, + "learning_rate": 3.142211596174343e-06, + "loss": 0.90556955, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15551758, + "step": 5444, + "time_per_iteration": 2.5514841079711914 + }, + { + "auxiliary_loss_clip": 0.06468201, + "auxiliary_loss_mlp": 0.01274937, + "balance_loss_clip": 0.06282412, + "balance_loss_mlp": 0.01258295, + "epoch": 0.327371110777093, + "flos": 21033300844800.0, + "grad_norm": 2.365977713323657, + "language_loss": 0.59248179, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66991317, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16638184, + "step": 5445, + "time_per_iteration": 2.5325539112091064 + }, + { + "auxiliary_loss_clip": 0.06469189, + "auxiliary_loss_mlp": 0.01278146, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01261278, + "epoch": 0.327431234029761, + "flos": 19068055983360.0, + "grad_norm": 2.7570820492615886, + "language_loss": 0.89260846, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.97008175, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.16870117, + "step": 5446, + "time_per_iteration": 2.576833724975586 + }, + { + "auxiliary_loss_clip": 0.06476346, + "auxiliary_loss_mlp": 0.01274903, + "balance_loss_clip": 0.06282137, + "balance_loss_mlp": 0.01257403, + "epoch": 0.32749135728242895, + "flos": 25856435312640.0, + "grad_norm": 1.9641165872810087, + "language_loss": 0.79404771, + "learning_rate": 3.141252301538802e-06, + "loss": 0.87156022, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 5447, + "time_per_iteration": 2.5539090633392334 + }, + { + "auxiliary_loss_clip": 0.06462374, + "auxiliary_loss_mlp": 0.01278273, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263277, + "epoch": 0.327551480535097, + "flos": 20126721594240.0, + "grad_norm": 1.953936246680755, + "language_loss": 0.73150277, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80890924, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.14990234, + "step": 5448, + "time_per_iteration": 2.633612871170044 + }, + { + "auxiliary_loss_clip": 0.06464307, + "auxiliary_loss_mlp": 0.01272265, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01256291, + "epoch": 0.32761160378776494, + "flos": 28811094785280.0, + "grad_norm": 1.3623614976773524, + "language_loss": 0.67002481, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.74739063, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15966797, + "step": 5449, + "time_per_iteration": 2.5777859687805176 + }, + { + "auxiliary_loss_clip": 0.0647198, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3276717270404329, + "flos": 26944171090560.0, + "grad_norm": 1.378619651715801, + "language_loss": 0.65736711, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.73478758, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15576172, + "step": 5450, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.06468028, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06280884, + "balance_loss_mlp": 0.01258509, + "epoch": 0.32773185029310087, + "flos": 25345557768960.0, + "grad_norm": 7.041147023955008, + "language_loss": 0.77832162, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.85575354, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16662598, + "step": 5451, + "time_per_iteration": 2.572112560272217 + }, + { + "auxiliary_loss_clip": 0.06472664, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06283467, + "balance_loss_mlp": 0.01262042, + "epoch": 0.32779197354576883, + "flos": 26398227813120.0, + "grad_norm": 1.9495025825112327, + "language_loss": 0.70696288, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78447533, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16540527, + "step": 5452, + "time_per_iteration": 2.6081676483154297 + }, + { + "auxiliary_loss_clip": 0.0646618, + "auxiliary_loss_mlp": 0.01272924, + "balance_loss_clip": 0.06283787, + "balance_loss_mlp": 0.01256938, + "epoch": 0.3278520967984368, + "flos": 24906237212160.0, + "grad_norm": 1.6132254933408041, + "language_loss": 0.7924304, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.86982143, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15979004, + "step": 5453, + "time_per_iteration": 2.5893869400024414 + }, + { + "auxiliary_loss_clip": 0.06469721, + "auxiliary_loss_mlp": 0.01274795, + "balance_loss_clip": 0.06282013, + "balance_loss_mlp": 0.01259309, + "epoch": 0.32791222005110476, + "flos": 29760831688320.0, + "grad_norm": 2.0442879632543476, + "language_loss": 0.758448, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.83589315, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.15478516, + "step": 5454, + "time_per_iteration": 2.590080499649048 + }, + { + "auxiliary_loss_clip": 0.06461332, + "auxiliary_loss_mlp": 0.01271865, + "balance_loss_clip": 0.06280516, + "balance_loss_mlp": 0.01257536, + "epoch": 0.32797234330377273, + "flos": 16513584410880.0, + "grad_norm": 2.183253633037468, + "language_loss": 0.77119774, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.8485297, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14318848, + "step": 5455, + "time_per_iteration": 2.4873318672180176 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01285817, + "balance_loss_clip": 0.06290287, + "balance_loss_mlp": 0.01268377, + "epoch": 0.3280324665564407, + "flos": 26585086417920.0, + "grad_norm": 1.6915080932551223, + "language_loss": 0.74407738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.82175708, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.17443848, + "step": 5456, + "time_per_iteration": 2.593258857727051 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01277637, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.01261306, + "epoch": 0.32809258980910866, + "flos": 22936631938560.0, + "grad_norm": 1.4862092693082851, + "language_loss": 0.78666067, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.8641206, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16345215, + "step": 5457, + "time_per_iteration": 2.523540496826172 + }, + { + "auxiliary_loss_clip": 0.06473868, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06281006, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3281527130617766, + "flos": 22790457290880.0, + "grad_norm": 2.0769759307730644, + "language_loss": 0.78958774, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.86707151, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.15795898, + "step": 5458, + "time_per_iteration": 2.552680015563965 + }, + { + "auxiliary_loss_clip": 0.06469774, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06284518, + "balance_loss_mlp": 0.01258215, + "epoch": 0.3282128363144446, + "flos": 21256902264960.0, + "grad_norm": 1.5512978296749391, + "language_loss": 0.73655844, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.8140012, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.1628418, + "step": 5459, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.0647283, + "auxiliary_loss_mlp": 0.01274033, + "balance_loss_clip": 0.0628351, + "balance_loss_mlp": 0.01257761, + "epoch": 0.32827295956711255, + "flos": 30850328401920.0, + "grad_norm": 2.2277675097031993, + "language_loss": 0.84476066, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.92222929, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.16271973, + "step": 5460, + "time_per_iteration": 2.6067721843719482 + }, + { + "auxiliary_loss_clip": 0.06469227, + "auxiliary_loss_mlp": 0.01276293, + "balance_loss_clip": 0.06282166, + "balance_loss_mlp": 0.01260319, + "epoch": 0.3283330828197806, + "flos": 25921032410880.0, + "grad_norm": 2.3722751928185297, + "language_loss": 0.78114808, + "learning_rate": 3.136770448642288e-06, + "loss": 0.8586033, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15991211, + "step": 5461, + "time_per_iteration": 2.550417184829712 + }, + { + "auxiliary_loss_clip": 0.06469681, + "auxiliary_loss_mlp": 0.01279493, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01261361, + "epoch": 0.32839320607244854, + "flos": 38591295672960.0, + "grad_norm": 1.5965953358146812, + "language_loss": 0.62925887, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.70675063, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.18115234, + "step": 5462, + "time_per_iteration": 2.7004194259643555 + }, + { + "auxiliary_loss_clip": 0.06467308, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 0.06284478, + "balance_loss_mlp": 0.01265077, + "epoch": 0.3284533293251165, + "flos": 26658068924160.0, + "grad_norm": 1.3126719376538145, + "language_loss": 0.78502059, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.86250222, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15783691, + "step": 5463, + "time_per_iteration": 2.6072070598602295 + }, + { + "auxiliary_loss_clip": 0.0647091, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06283993, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32851345257778447, + "flos": 15309498839040.0, + "grad_norm": 1.727782559794916, + "language_loss": 0.70068884, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77812445, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.16064453, + "step": 5464, + "time_per_iteration": 2.519319534301758 + }, + { + "auxiliary_loss_clip": 0.06466094, + "auxiliary_loss_mlp": 0.01275271, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01257938, + "epoch": 0.32857357583045244, + "flos": 23520491988480.0, + "grad_norm": 1.6619431416557902, + "language_loss": 0.72759986, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.80501354, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.17333984, + "step": 5465, + "time_per_iteration": 2.573444366455078 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01265509, + "epoch": 0.3286336990831204, + "flos": 21001379639040.0, + "grad_norm": 1.5232981833560715, + "language_loss": 0.82967317, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.90722907, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16271973, + "step": 5466, + "time_per_iteration": 4.012515306472778 + }, + { + "auxiliary_loss_clip": 0.0647275, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.06285034, + "balance_loss_mlp": 0.01254932, + "epoch": 0.32869382233578837, + "flos": 23665450752000.0, + "grad_norm": 1.6606265994221874, + "language_loss": 0.79192597, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86936402, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5467, + "time_per_iteration": 4.000247955322266 + }, + { + "auxiliary_loss_clip": 0.06467809, + "auxiliary_loss_mlp": 0.01271951, + "balance_loss_clip": 0.06279044, + "balance_loss_mlp": 0.01255333, + "epoch": 0.32875394558845633, + "flos": 25343335635840.0, + "grad_norm": 1.5510134892276737, + "language_loss": 0.74865687, + "learning_rate": 3.134526351787587e-06, + "loss": 0.82605445, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.16601562, + "step": 5468, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.06474267, + "auxiliary_loss_mlp": 0.01276703, + "balance_loss_clip": 0.0628129, + "balance_loss_mlp": 0.01259108, + "epoch": 0.3288140688411243, + "flos": 14908430471040.0, + "grad_norm": 1.672146103500693, + "language_loss": 0.78728724, + "learning_rate": 3.134205594339942e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.17614746, + "step": 5469, + "time_per_iteration": 3.955373525619507 + }, + { + "auxiliary_loss_clip": 0.06466976, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01257224, + "epoch": 0.32887419209379226, + "flos": 18557220366720.0, + "grad_norm": 1.6018901390748483, + "language_loss": 0.82183433, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89923656, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16015625, + "step": 5470, + "time_per_iteration": 2.5481319427490234 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01271427, + "balance_loss_clip": 0.06279681, + "balance_loss_mlp": 0.01254869, + "epoch": 0.3289343153464602, + "flos": 48116560913280.0, + "grad_norm": 1.6166643495117736, + "language_loss": 0.68441176, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76180226, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.16564941, + "step": 5471, + "time_per_iteration": 2.780454158782959 + }, + { + "auxiliary_loss_clip": 0.06479289, + "auxiliary_loss_mlp": 0.012789, + "balance_loss_clip": 0.06285035, + "balance_loss_mlp": 0.01260637, + "epoch": 0.3289944385991282, + "flos": 27607763900160.0, + "grad_norm": 1.5078842371471577, + "language_loss": 0.65564525, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.73322713, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.18273926, + "step": 5472, + "time_per_iteration": 2.580644369125366 + }, + { + "auxiliary_loss_clip": 0.06472386, + "auxiliary_loss_mlp": 0.01277133, + "balance_loss_clip": 0.06281875, + "balance_loss_mlp": 0.01259144, + "epoch": 0.32905456185179616, + "flos": 20126470032000.0, + "grad_norm": 1.614198879205061, + "language_loss": 0.88538003, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96287525, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17993164, + "step": 5473, + "time_per_iteration": 4.021254062652588 + }, + { + "auxiliary_loss_clip": 0.06475069, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285396, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3291146851044642, + "flos": 23186075143680.0, + "grad_norm": 1.7643015597930078, + "language_loss": 0.78719336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86464679, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16552734, + "step": 5474, + "time_per_iteration": 2.5416688919067383 + }, + { + "auxiliary_loss_clip": 0.06379573, + "auxiliary_loss_mlp": 0.0134405, + "balance_loss_clip": 0.06291323, + "balance_loss_mlp": 0.01340224, + "epoch": 0.32917480835713214, + "flos": 67641630664320.0, + "grad_norm": 0.8577160187921843, + "language_loss": 0.60258645, + "learning_rate": 3.132280146886911e-06, + "loss": 0.67982268, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03820801, + "step": 5475, + "time_per_iteration": 3.1267805099487305 + }, + { + "auxiliary_loss_clip": 0.06479369, + "auxiliary_loss_mlp": 0.01279647, + "balance_loss_clip": 0.06284596, + "balance_loss_mlp": 0.01261599, + "epoch": 0.3292349316098001, + "flos": 27971963671680.0, + "grad_norm": 3.252822648856248, + "language_loss": 0.7712574, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84884757, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.18041992, + "step": 5476, + "time_per_iteration": 2.5819692611694336 + }, + { + "auxiliary_loss_clip": 0.06469015, + "auxiliary_loss_mlp": 0.01275163, + "balance_loss_clip": 0.06282525, + "balance_loss_mlp": 0.01258956, + "epoch": 0.3292950548624681, + "flos": 20269416297600.0, + "grad_norm": 1.7333439092472165, + "language_loss": 0.7556808, + "learning_rate": 3.131637987449997e-06, + "loss": 0.83312255, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1619873, + "step": 5477, + "time_per_iteration": 2.532106637954712 + }, + { + "auxiliary_loss_clip": 0.06470291, + "auxiliary_loss_mlp": 0.01275718, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01259541, + "epoch": 0.32935517811513604, + "flos": 20819174935680.0, + "grad_norm": 2.104456143380591, + "language_loss": 0.75728148, + "learning_rate": 3.131316843357713e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16174316, + "step": 5478, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.06470281, + "auxiliary_loss_mlp": 0.01278094, + "balance_loss_clip": 0.06287058, + "balance_loss_mlp": 0.01261631, + "epoch": 0.329415301367804, + "flos": 18447704680320.0, + "grad_norm": 2.368560120299576, + "language_loss": 0.80772918, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8852129, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16455078, + "step": 5479, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.06362775, + "auxiliary_loss_mlp": 0.01272199, + "balance_loss_clip": 0.06275004, + "balance_loss_mlp": 0.01268579, + "epoch": 0.32947542462047197, + "flos": 66344967930240.0, + "grad_norm": 0.7366188072531391, + "language_loss": 0.56333017, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.63967991, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.03616333, + "step": 5480, + "time_per_iteration": 3.2369706630706787 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.01278618, + "balance_loss_clip": 0.06290235, + "balance_loss_mlp": 0.01262179, + "epoch": 0.32953554787313993, + "flos": 23228268474240.0, + "grad_norm": 1.631877255513098, + "language_loss": 0.7736274, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.85118574, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16442871, + "step": 5481, + "time_per_iteration": 2.5206968784332275 + }, + { + "auxiliary_loss_clip": 0.06479073, + "auxiliary_loss_mlp": 0.01277292, + "balance_loss_clip": 0.0628771, + "balance_loss_mlp": 0.01260686, + "epoch": 0.3295956711258079, + "flos": 27015686150400.0, + "grad_norm": 1.3752047504599005, + "language_loss": 0.78639877, + "learning_rate": 3.130031838113899e-06, + "loss": 0.86396235, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.16601562, + "step": 5482, + "time_per_iteration": 2.604720115661621 + }, + { + "auxiliary_loss_clip": 0.06475698, + "auxiliary_loss_mlp": 0.01274916, + "balance_loss_clip": 0.06286834, + "balance_loss_mlp": 0.01258274, + "epoch": 0.32965579437847586, + "flos": 19177697450880.0, + "grad_norm": 2.0027782692889358, + "language_loss": 0.74399549, + "learning_rate": 3.129710479645185e-06, + "loss": 0.82150161, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16662598, + "step": 5483, + "time_per_iteration": 2.5124409198760986 + }, + { + "auxiliary_loss_clip": 0.06472629, + "auxiliary_loss_mlp": 0.01273838, + "balance_loss_clip": 0.06286867, + "balance_loss_mlp": 0.01258472, + "epoch": 0.32971591763114383, + "flos": 30490447115520.0, + "grad_norm": 1.7640387903996015, + "language_loss": 0.7588225, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83628714, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15368652, + "step": 5484, + "time_per_iteration": 2.64021635055542 + }, + { + "auxiliary_loss_clip": 0.06469439, + "auxiliary_loss_mlp": 0.01274788, + "balance_loss_clip": 0.06284587, + "balance_loss_mlp": 0.01259232, + "epoch": 0.3297760408838118, + "flos": 16295140016640.0, + "grad_norm": 1.7787654746377481, + "language_loss": 0.72680974, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15563965, + "step": 5485, + "time_per_iteration": 2.516080379486084 + }, + { + "auxiliary_loss_clip": 0.06466281, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06281459, + "balance_loss_mlp": 0.0125991, + "epoch": 0.32983616413647976, + "flos": 29538194590080.0, + "grad_norm": 2.336444213272706, + "language_loss": 0.80720758, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8846184, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5486, + "time_per_iteration": 2.633730173110962 + }, + { + "auxiliary_loss_clip": 0.06467714, + "auxiliary_loss_mlp": 0.01276658, + "balance_loss_clip": 0.06283799, + "balance_loss_mlp": 0.01261828, + "epoch": 0.3298962873891478, + "flos": 20637682992000.0, + "grad_norm": 1.9361428819205904, + "language_loss": 0.84726417, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.92470789, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14819336, + "step": 5487, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.06473765, + "auxiliary_loss_mlp": 0.01275689, + "balance_loss_clip": 0.06283425, + "balance_loss_mlp": 0.01258845, + "epoch": 0.32995641064181574, + "flos": 14981329123200.0, + "grad_norm": 2.0510786453666707, + "language_loss": 0.74805683, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.82555139, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16833496, + "step": 5488, + "time_per_iteration": 2.5195999145507812 + }, + { + "auxiliary_loss_clip": 0.06475645, + "auxiliary_loss_mlp": 0.01276585, + "balance_loss_clip": 0.06288432, + "balance_loss_mlp": 0.012611, + "epoch": 0.3300165338944837, + "flos": 18667448812800.0, + "grad_norm": 2.2567239989743912, + "language_loss": 0.73048651, + "learning_rate": 3.127781429646098e-06, + "loss": 0.80800879, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.15466309, + "step": 5489, + "time_per_iteration": 2.489529609680176 + }, + { + "auxiliary_loss_clip": 0.06468415, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 0.06282636, + "balance_loss_mlp": 0.01260987, + "epoch": 0.3300766571471517, + "flos": 25589215042560.0, + "grad_norm": 2.1838257682132256, + "language_loss": 0.89381063, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97125351, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.14898682, + "step": 5490, + "time_per_iteration": 2.583505153656006 + }, + { + "auxiliary_loss_clip": 0.06470391, + "auxiliary_loss_mlp": 0.01273693, + "balance_loss_clip": 0.06285221, + "balance_loss_mlp": 0.01258339, + "epoch": 0.33013678039981964, + "flos": 11368150012800.0, + "grad_norm": 1.8708534793530802, + "language_loss": 0.82974613, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90718699, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15344238, + "step": 5491, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.06473103, + "auxiliary_loss_mlp": 0.01274646, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.01258589, + "epoch": 0.3301969036524876, + "flos": 24827175285120.0, + "grad_norm": 1.8609460693795263, + "language_loss": 0.77910721, + "learning_rate": 3.126816327146554e-06, + "loss": 0.85658479, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16052246, + "step": 5492, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.06478797, + "auxiliary_loss_mlp": 0.01277822, + "balance_loss_clip": 0.06287751, + "balance_loss_mlp": 0.01261324, + "epoch": 0.33025702690515557, + "flos": 15966634884480.0, + "grad_norm": 2.4722908606070875, + "language_loss": 0.75614154, + "learning_rate": 3.12649454083913e-06, + "loss": 0.83370769, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16503906, + "step": 5493, + "time_per_iteration": 2.489143133163452 + }, + { + "auxiliary_loss_clip": 0.06366986, + "auxiliary_loss_mlp": 0.01258616, + "balance_loss_clip": 0.06280049, + "balance_loss_mlp": 0.0125515, + "epoch": 0.33031715015782354, + "flos": 59435794540800.0, + "grad_norm": 0.7878547289977352, + "language_loss": 0.54030049, + "learning_rate": 3.12617271181492e-06, + "loss": 0.61655653, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.03475952, + "step": 5494, + "time_per_iteration": 3.0869832038879395 + }, + { + "auxiliary_loss_clip": 0.06482484, + "auxiliary_loss_mlp": 0.01281394, + "balance_loss_clip": 0.0629174, + "balance_loss_mlp": 0.01264753, + "epoch": 0.3303772734104915, + "flos": 23190896753280.0, + "grad_norm": 1.4215593277180028, + "language_loss": 0.87367666, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9513154, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16625977, + "step": 5495, + "time_per_iteration": 2.5188820362091064 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01260038, + "epoch": 0.33043739666315947, + "flos": 33080068275840.0, + "grad_norm": 2.0083800771900995, + "language_loss": 0.74168754, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.81923461, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17443848, + "step": 5496, + "time_per_iteration": 2.6151347160339355 + }, + { + "auxiliary_loss_clip": 0.06470463, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.0628539, + "balance_loss_mlp": 0.01256434, + "epoch": 0.33049751991582743, + "flos": 24901625237760.0, + "grad_norm": 1.9468549986980455, + "language_loss": 0.72676557, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80419219, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15759277, + "step": 5497, + "time_per_iteration": 2.51874041557312 + }, + { + "auxiliary_loss_clip": 0.06472345, + "auxiliary_loss_mlp": 0.0127459, + "balance_loss_clip": 0.06286049, + "balance_loss_mlp": 0.01259343, + "epoch": 0.3305576431684954, + "flos": 29468272757760.0, + "grad_norm": 1.8137955115189202, + "language_loss": 0.80825889, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88572824, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 5498, + "time_per_iteration": 2.6010656356811523 + }, + { + "auxiliary_loss_clip": 0.06476308, + "auxiliary_loss_mlp": 0.0127559, + "balance_loss_clip": 0.0628619, + "balance_loss_mlp": 0.01258281, + "epoch": 0.33061776642116336, + "flos": 22637951660160.0, + "grad_norm": 1.8227647554707032, + "language_loss": 0.76843095, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84594989, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.1730957, + "step": 5499, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06475572, + "auxiliary_loss_mlp": 0.01277032, + "balance_loss_clip": 0.06287447, + "balance_loss_mlp": 0.01260832, + "epoch": 0.3306778896738313, + "flos": 25783536660480.0, + "grad_norm": 1.5377855738322084, + "language_loss": 0.79203349, + "learning_rate": 3.124240841300681e-06, + "loss": 0.86955953, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16186523, + "step": 5500, + "time_per_iteration": 2.5970370769500732 + }, + { + "auxiliary_loss_clip": 0.0648918, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298861, + "balance_loss_mlp": 0.01257544, + "epoch": 0.33073801292649935, + "flos": 36949566625920.0, + "grad_norm": 1.9211086255091194, + "language_loss": 0.66916561, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7468102, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17724609, + "step": 5501, + "time_per_iteration": 2.687847375869751 + }, + { + "auxiliary_loss_clip": 0.06481969, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06291866, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3307981361791673, + "flos": 12972465411840.0, + "grad_norm": 2.0893698607967957, + "language_loss": 0.77978551, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.85733795, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.17504883, + "step": 5502, + "time_per_iteration": 2.500303268432617 + }, + { + "auxiliary_loss_clip": 0.06481159, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06290131, + "balance_loss_mlp": 0.01256424, + "epoch": 0.3308582594318353, + "flos": 25381420116480.0, + "grad_norm": 1.7450780858535315, + "language_loss": 0.72841054, + "learning_rate": 3.123274330355824e-06, + "loss": 0.80596423, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.17773438, + "step": 5503, + "time_per_iteration": 2.5851874351501465 + }, + { + "auxiliary_loss_clip": 0.06475106, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.06287622, + "balance_loss_mlp": 0.01257769, + "epoch": 0.33091838268450324, + "flos": 26475738439680.0, + "grad_norm": 1.4901464435255347, + "language_loss": 0.7565586, + "learning_rate": 3.12295207483523e-06, + "loss": 0.83405411, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16674805, + "step": 5504, + "time_per_iteration": 2.5670559406280518 + }, + { + "auxiliary_loss_clip": 0.06476955, + "auxiliary_loss_mlp": 0.01276594, + "balance_loss_clip": 0.06289346, + "balance_loss_mlp": 0.01261025, + "epoch": 0.3309785059371712, + "flos": 24977836126080.0, + "grad_norm": 1.5646403370775293, + "language_loss": 0.70214427, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.77967972, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15545654, + "step": 5505, + "time_per_iteration": 2.628267288208008 + }, + { + "auxiliary_loss_clip": 0.06474259, + "auxiliary_loss_mlp": 0.01275018, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01258543, + "epoch": 0.3310386291898392, + "flos": 20452585322880.0, + "grad_norm": 1.7982072656373813, + "language_loss": 0.8240785, + "learning_rate": 3.122307436058899e-06, + "loss": 0.90157127, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.16467285, + "step": 5506, + "time_per_iteration": 4.10949444770813 + }, + { + "auxiliary_loss_clip": 0.06476486, + "auxiliary_loss_mlp": 0.01275135, + "balance_loss_clip": 0.0628888, + "balance_loss_mlp": 0.01258428, + "epoch": 0.33109875244250714, + "flos": 23188926182400.0, + "grad_norm": 1.740251919086934, + "language_loss": 0.79860532, + "learning_rate": 3.121985052827606e-06, + "loss": 0.87612152, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16705322, + "step": 5507, + "time_per_iteration": 4.12217903137207 + }, + { + "auxiliary_loss_clip": 0.06468768, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06281893, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3311588756951751, + "flos": 24174902776320.0, + "grad_norm": 1.6433149866128014, + "language_loss": 0.71967649, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.79713166, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.1628418, + "step": 5508, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.06468692, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 0.06284875, + "balance_loss_mlp": 0.01256788, + "epoch": 0.33121899894784307, + "flos": 28152994417920.0, + "grad_norm": 1.6757523088462936, + "language_loss": 0.71588784, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79329687, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15429688, + "step": 5509, + "time_per_iteration": 3.976996660232544 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06286702, + "balance_loss_mlp": 0.01264396, + "epoch": 0.33127912220051103, + "flos": 29574979332480.0, + "grad_norm": 1.5753317257606638, + "language_loss": 0.73806137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.81557631, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15460205, + "step": 5510, + "time_per_iteration": 2.576838731765747 + }, + { + "auxiliary_loss_clip": 0.06473264, + "auxiliary_loss_mlp": 0.01276647, + "balance_loss_clip": 0.06286872, + "balance_loss_mlp": 0.01261019, + "epoch": 0.331339245453179, + "flos": 14434086107520.0, + "grad_norm": 2.529546935928515, + "language_loss": 0.88507652, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96257567, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15612793, + "step": 5511, + "time_per_iteration": 2.550442695617676 + }, + { + "auxiliary_loss_clip": 0.06464168, + "auxiliary_loss_mlp": 0.01275515, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01260494, + "epoch": 0.33139936870584696, + "flos": 20893499107200.0, + "grad_norm": 1.6341387009287651, + "language_loss": 0.73559558, + "learning_rate": 3.12037249872891e-06, + "loss": 0.81299245, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15026855, + "step": 5512, + "time_per_iteration": 2.5596871376037598 + }, + { + "auxiliary_loss_clip": 0.06468001, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06286225, + "balance_loss_mlp": 0.01262438, + "epoch": 0.33145949195851493, + "flos": 36293352975360.0, + "grad_norm": 1.8738374179289, + "language_loss": 0.72677827, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.80424166, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15905762, + "step": 5513, + "time_per_iteration": 4.148774147033691 + }, + { + "auxiliary_loss_clip": 0.06472933, + "auxiliary_loss_mlp": 0.01275876, + "balance_loss_clip": 0.06284368, + "balance_loss_mlp": 0.0125958, + "epoch": 0.33151961521118295, + "flos": 14284431515520.0, + "grad_norm": 1.8311253656567958, + "language_loss": 0.69026303, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.7677511, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16296387, + "step": 5514, + "time_per_iteration": 2.486818313598633 + }, + { + "auxiliary_loss_clip": 0.06477968, + "auxiliary_loss_mlp": 0.0127816, + "balance_loss_clip": 0.06291951, + "balance_loss_mlp": 0.01261089, + "epoch": 0.3315797384638509, + "flos": 20780126133120.0, + "grad_norm": 1.9656560392088134, + "language_loss": 0.66393441, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.74149573, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.17053223, + "step": 5515, + "time_per_iteration": 2.531658411026001 + }, + { + "auxiliary_loss_clip": 0.06473279, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01258885, + "epoch": 0.3316398617165189, + "flos": 24686115736320.0, + "grad_norm": 3.8914339391091732, + "language_loss": 0.69369388, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.77117789, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16235352, + "step": 5516, + "time_per_iteration": 2.5392425060272217 + }, + { + "auxiliary_loss_clip": 0.06476592, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.0125959, + "epoch": 0.33169998496918685, + "flos": 18593879328000.0, + "grad_norm": 2.757231582138207, + "language_loss": 0.80914545, + "learning_rate": 3.118758882514359e-06, + "loss": 0.88666099, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.15368652, + "step": 5517, + "time_per_iteration": 2.4851818084716797 + }, + { + "auxiliary_loss_clip": 0.06465174, + "auxiliary_loss_mlp": 0.01279818, + "balance_loss_clip": 0.06284687, + "balance_loss_mlp": 0.01264142, + "epoch": 0.3317601082218548, + "flos": 20199871808640.0, + "grad_norm": 1.6705032998917397, + "language_loss": 0.74656814, + "learning_rate": 3.118436031952143e-06, + "loss": 0.82401806, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15686035, + "step": 5518, + "time_per_iteration": 2.518036127090454 + }, + { + "auxiliary_loss_clip": 0.06372921, + "auxiliary_loss_mlp": 0.01283465, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01279764, + "epoch": 0.3318202314745228, + "flos": 68995119265920.0, + "grad_norm": 0.7149144856696655, + "language_loss": 0.54263318, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.61919701, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03692627, + "step": 5519, + "time_per_iteration": 3.246586322784424 + }, + { + "auxiliary_loss_clip": 0.06472577, + "auxiliary_loss_mlp": 0.01276695, + "balance_loss_clip": 0.06288108, + "balance_loss_mlp": 0.0125966, + "epoch": 0.33188035472719074, + "flos": 21505381148160.0, + "grad_norm": 2.182658812554146, + "language_loss": 0.79452467, + "learning_rate": 3.117790203606336e-06, + "loss": 0.87201744, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.17028809, + "step": 5520, + "time_per_iteration": 2.517853260040283 + }, + { + "auxiliary_loss_clip": 0.06465811, + "auxiliary_loss_mlp": 0.01271287, + "balance_loss_clip": 0.06283027, + "balance_loss_mlp": 0.01256279, + "epoch": 0.3319404779798587, + "flos": 28877033548800.0, + "grad_norm": 1.8300903967069966, + "language_loss": 0.77067709, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.84804809, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15002441, + "step": 5521, + "time_per_iteration": 2.555697441101074 + }, + { + "auxiliary_loss_clip": 0.06478226, + "auxiliary_loss_mlp": 0.01278256, + "balance_loss_clip": 0.06288885, + "balance_loss_mlp": 0.01261542, + "epoch": 0.33200060123252667, + "flos": 23083770908160.0, + "grad_norm": 1.9119948906690396, + "language_loss": 0.70441258, + "learning_rate": 3.117144205713664e-06, + "loss": 0.78197736, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16699219, + "step": 5522, + "time_per_iteration": 2.5673933029174805 + }, + { + "auxiliary_loss_clip": 0.06474358, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06290573, + "balance_loss_mlp": 0.01255255, + "epoch": 0.33206072448519464, + "flos": 21148895952000.0, + "grad_norm": 1.6906348218339255, + "language_loss": 0.74640656, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82386148, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 5523, + "time_per_iteration": 2.516275405883789 + }, + { + "auxiliary_loss_clip": 0.06473421, + "auxiliary_loss_mlp": 0.01271212, + "balance_loss_clip": 0.06292297, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3321208477378626, + "flos": 13084161304320.0, + "grad_norm": 2.1726495268835024, + "language_loss": 0.82172406, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8991704, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15454102, + "step": 5524, + "time_per_iteration": 2.557941198348999 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289522, + "balance_loss_mlp": 0.01257251, + "epoch": 0.33218097099053057, + "flos": 21221836531200.0, + "grad_norm": 1.6566666481357326, + "language_loss": 0.83100772, + "learning_rate": 3.116174891188636e-06, + "loss": 0.90847051, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15649414, + "step": 5525, + "time_per_iteration": 2.527944564819336 + }, + { + "auxiliary_loss_clip": 0.06379532, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06292765, + "balance_loss_mlp": 0.01261484, + "epoch": 0.33224109424319853, + "flos": 64369954068480.0, + "grad_norm": 0.7407224947932968, + "language_loss": 0.52533764, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.60178727, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03945923, + "step": 5526, + "time_per_iteration": 3.1679162979125977 + }, + { + "auxiliary_loss_clip": 0.0647909, + "auxiliary_loss_mlp": 0.01274604, + "balance_loss_clip": 0.06291543, + "balance_loss_mlp": 0.01258391, + "epoch": 0.33230121749586655, + "flos": 17351457713280.0, + "grad_norm": 1.970764365513445, + "language_loss": 0.79041827, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86795521, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 5527, + "time_per_iteration": 2.5327274799346924 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01278538, + "balance_loss_clip": 0.06294803, + "balance_loss_mlp": 0.01263458, + "epoch": 0.3323613407485345, + "flos": 21003517918080.0, + "grad_norm": 1.6591522480418575, + "language_loss": 0.72383821, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80139363, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15063477, + "step": 5528, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.06477713, + "auxiliary_loss_mlp": 0.01274869, + "balance_loss_clip": 0.06292165, + "balance_loss_mlp": 0.01259396, + "epoch": 0.3324214640012025, + "flos": 13157688862080.0, + "grad_norm": 1.8543805866880412, + "language_loss": 0.8336091, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.91113496, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.15466309, + "step": 5529, + "time_per_iteration": 2.5001087188720703 + }, + { + "auxiliary_loss_clip": 0.06479646, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254587, + "epoch": 0.33248158725387045, + "flos": 22280124798720.0, + "grad_norm": 1.7380748666321508, + "language_loss": 0.70133483, + "learning_rate": 3.114558520634423e-06, + "loss": 0.77883273, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.15551758, + "step": 5530, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01275357, + "balance_loss_clip": 0.06291899, + "balance_loss_mlp": 0.01258751, + "epoch": 0.3325417105065384, + "flos": 20747324459520.0, + "grad_norm": 2.7342028000668552, + "language_loss": 0.77694213, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.85449082, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16589355, + "step": 5531, + "time_per_iteration": 2.5307323932647705 + }, + { + "auxiliary_loss_clip": 0.06477839, + "auxiliary_loss_mlp": 0.01280766, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01263552, + "epoch": 0.3326018337592064, + "flos": 24797476212480.0, + "grad_norm": 1.9473942094883194, + "language_loss": 0.73779702, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.81538308, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17211914, + "step": 5532, + "time_per_iteration": 2.5989890098571777 + }, + { + "auxiliary_loss_clip": 0.06472681, + "auxiliary_loss_mlp": 0.01278728, + "balance_loss_clip": 0.06288014, + "balance_loss_mlp": 0.01263147, + "epoch": 0.33266195701187434, + "flos": 14506942832640.0, + "grad_norm": 1.825417572799306, + "language_loss": 0.66042602, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.73794013, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15576172, + "step": 5533, + "time_per_iteration": 2.47566294670105 + }, + { + "auxiliary_loss_clip": 0.06474279, + "auxiliary_loss_mlp": 0.012755, + "balance_loss_clip": 0.06289338, + "balance_loss_mlp": 0.01258954, + "epoch": 0.3327220802645423, + "flos": 15309792328320.0, + "grad_norm": 1.6677538876536442, + "language_loss": 0.71568084, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79317868, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16552734, + "step": 5534, + "time_per_iteration": 2.5140762329101562 + }, + { + "auxiliary_loss_clip": 0.06474573, + "auxiliary_loss_mlp": 0.01273002, + "balance_loss_clip": 0.06290095, + "balance_loss_mlp": 0.01257088, + "epoch": 0.3327822035172103, + "flos": 23484336151680.0, + "grad_norm": 1.635346823223845, + "language_loss": 0.67885029, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.75632608, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15917969, + "step": 5535, + "time_per_iteration": 2.522270917892456 + }, + { + "auxiliary_loss_clip": 0.0647034, + "auxiliary_loss_mlp": 0.01273438, + "balance_loss_clip": 0.06284929, + "balance_loss_mlp": 0.01257547, + "epoch": 0.33284232676987824, + "flos": 25381587824640.0, + "grad_norm": 2.3715726564419155, + "language_loss": 0.72782886, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80526668, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5536, + "time_per_iteration": 2.5831825733184814 + }, + { + "auxiliary_loss_clip": 0.06470598, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01255464, + "epoch": 0.3329024500225462, + "flos": 23700851902080.0, + "grad_norm": 1.6831469867631554, + "language_loss": 0.81958938, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89700401, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15405273, + "step": 5537, + "time_per_iteration": 2.520211935043335 + }, + { + "auxiliary_loss_clip": 0.06473641, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06284811, + "balance_loss_mlp": 0.01253799, + "epoch": 0.33296257327521417, + "flos": 31731317429760.0, + "grad_norm": 1.8576028267218818, + "language_loss": 0.71933794, + "learning_rate": 3.111970130648789e-06, + "loss": 0.79677737, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16491699, + "step": 5538, + "time_per_iteration": 2.6061229705810547 + }, + { + "auxiliary_loss_clip": 0.06466128, + "auxiliary_loss_mlp": 0.01271828, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01256784, + "epoch": 0.33302269652788213, + "flos": 22750863436800.0, + "grad_norm": 1.8542539639588682, + "language_loss": 0.75063813, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82801771, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15039062, + "step": 5539, + "time_per_iteration": 2.5176634788513184 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06284824, + "balance_loss_mlp": 0.01255739, + "epoch": 0.33308281978055015, + "flos": 11478546167040.0, + "grad_norm": 1.8040392528519402, + "language_loss": 0.71489209, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79235446, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16699219, + "step": 5540, + "time_per_iteration": 2.536752939224243 + }, + { + "auxiliary_loss_clip": 0.06462967, + "auxiliary_loss_mlp": 0.01271775, + "balance_loss_clip": 0.06280267, + "balance_loss_mlp": 0.01256576, + "epoch": 0.3331429430332181, + "flos": 38222274291840.0, + "grad_norm": 3.095851444688792, + "language_loss": 0.60970843, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.68705589, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15197754, + "step": 5541, + "time_per_iteration": 2.6592354774475098 + }, + { + "auxiliary_loss_clip": 0.06472225, + "auxiliary_loss_mlp": 0.01276024, + "balance_loss_clip": 0.06284402, + "balance_loss_mlp": 0.01259872, + "epoch": 0.3332030662858861, + "flos": 22535270081280.0, + "grad_norm": 1.770287690308821, + "language_loss": 0.69711685, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77459931, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16149902, + "step": 5542, + "time_per_iteration": 2.5427184104919434 + }, + { + "auxiliary_loss_clip": 0.06473213, + "auxiliary_loss_mlp": 0.01276881, + "balance_loss_clip": 0.06286451, + "balance_loss_mlp": 0.01261658, + "epoch": 0.33326318953855405, + "flos": 16003293845760.0, + "grad_norm": 1.6729265705607443, + "language_loss": 0.75927889, + "learning_rate": 3.110351016113414e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15222168, + "step": 5543, + "time_per_iteration": 2.4745616912841797 + }, + { + "auxiliary_loss_clip": 0.06475509, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06287046, + "balance_loss_mlp": 0.01260281, + "epoch": 0.333323312791222, + "flos": 25600661124480.0, + "grad_norm": 1.7242995092969657, + "language_loss": 0.75332278, + "learning_rate": 3.110027066843348e-06, + "loss": 0.83084685, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.16601562, + "step": 5544, + "time_per_iteration": 2.565572738647461 + }, + { + "auxiliary_loss_clip": 0.06467521, + "auxiliary_loss_mlp": 0.01270286, + "balance_loss_clip": 0.06283619, + "balance_loss_mlp": 0.01254848, + "epoch": 0.33338343604389, + "flos": 25126652177280.0, + "grad_norm": 1.4364166263140996, + "language_loss": 0.71556139, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.1541748, + "step": 5545, + "time_per_iteration": 3.9951117038726807 + }, + { + "auxiliary_loss_clip": 0.0646642, + "auxiliary_loss_mlp": 0.01275763, + "balance_loss_clip": 0.0628425, + "balance_loss_mlp": 0.01260063, + "epoch": 0.33344355929655795, + "flos": 16953114602880.0, + "grad_norm": 1.5928525652704049, + "language_loss": 0.69892073, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.77634251, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15722656, + "step": 5546, + "time_per_iteration": 4.069552659988403 + }, + { + "auxiliary_loss_clip": 0.06469481, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 0.06280591, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3335036825492259, + "flos": 27896675178240.0, + "grad_norm": 1.5973320112543803, + "language_loss": 0.65030676, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.72773933, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16455078, + "step": 5547, + "time_per_iteration": 2.578320026397705 + }, + { + "auxiliary_loss_clip": 0.06468174, + "auxiliary_loss_mlp": 0.01274769, + "balance_loss_clip": 0.06284153, + "balance_loss_mlp": 0.01258736, + "epoch": 0.3335638058018939, + "flos": 16184995424640.0, + "grad_norm": 1.9789366990729325, + "language_loss": 0.85645819, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.9338876, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.16040039, + "step": 5548, + "time_per_iteration": 3.917346477508545 + }, + { + "auxiliary_loss_clip": 0.06473708, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06283803, + "balance_loss_mlp": 0.01259264, + "epoch": 0.33362392905456184, + "flos": 39905651617920.0, + "grad_norm": 1.927393858225298, + "language_loss": 0.74956143, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.82705271, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16149902, + "step": 5549, + "time_per_iteration": 2.662152051925659 + }, + { + "auxiliary_loss_clip": 0.0647629, + "auxiliary_loss_mlp": 0.01276829, + "balance_loss_clip": 0.06287523, + "balance_loss_mlp": 0.0125946, + "epoch": 0.3336840523072298, + "flos": 44280954339840.0, + "grad_norm": 3.284743863263659, + "language_loss": 0.68874133, + "learning_rate": 3.108082487713921e-06, + "loss": 0.76627254, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.17370605, + "step": 5550, + "time_per_iteration": 2.703099250793457 + }, + { + "auxiliary_loss_clip": 0.06476407, + "auxiliary_loss_mlp": 0.01275354, + "balance_loss_clip": 0.06290508, + "balance_loss_mlp": 0.01259488, + "epoch": 0.33374417555989777, + "flos": 15091054444800.0, + "grad_norm": 2.6465919002896436, + "language_loss": 0.60992151, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6874392, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5551, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.06471356, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06287605, + "balance_loss_mlp": 0.01259985, + "epoch": 0.33380429881256574, + "flos": 15854226232320.0, + "grad_norm": 1.6170207033712265, + "language_loss": 0.71155131, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.78901786, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15307617, + "step": 5552, + "time_per_iteration": 4.0786826610565186 + }, + { + "auxiliary_loss_clip": 0.06476602, + "auxiliary_loss_mlp": 0.01270143, + "balance_loss_clip": 0.06291272, + "balance_loss_mlp": 0.01255182, + "epoch": 0.33386442206523376, + "flos": 13485439307520.0, + "grad_norm": 2.244029622012826, + "language_loss": 0.83864999, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91611743, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.1496582, + "step": 5553, + "time_per_iteration": 2.603986978530884 + }, + { + "auxiliary_loss_clip": 0.06474789, + "auxiliary_loss_mlp": 0.0127187, + "balance_loss_clip": 0.06288507, + "balance_loss_mlp": 0.01255562, + "epoch": 0.3339245453179017, + "flos": 16696250311680.0, + "grad_norm": 2.098616423404285, + "language_loss": 0.81424135, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89170802, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16320801, + "step": 5554, + "time_per_iteration": 2.4884121417999268 + }, + { + "auxiliary_loss_clip": 0.06477922, + "auxiliary_loss_mlp": 0.01277907, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01261647, + "epoch": 0.3339846685705697, + "flos": 24617954839680.0, + "grad_norm": 1.4369599322997015, + "language_loss": 0.81866252, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89622086, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.16259766, + "step": 5555, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.06478396, + "auxiliary_loss_mlp": 0.01271619, + "balance_loss_clip": 0.06292441, + "balance_loss_mlp": 0.01256325, + "epoch": 0.33404479182323765, + "flos": 30961311534720.0, + "grad_norm": 1.7387044564853729, + "language_loss": 0.74836755, + "learning_rate": 3.106136395915099e-06, + "loss": 0.82586771, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.1529541, + "step": 5556, + "time_per_iteration": 2.5936899185180664 + }, + { + "auxiliary_loss_clip": 0.06476042, + "auxiliary_loss_mlp": 0.01275785, + "balance_loss_clip": 0.06293188, + "balance_loss_mlp": 0.01260562, + "epoch": 0.3341049150759056, + "flos": 23519988864000.0, + "grad_norm": 1.3815052276914728, + "language_loss": 0.82545519, + "learning_rate": 3.105811900403391e-06, + "loss": 0.90297353, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15222168, + "step": 5557, + "time_per_iteration": 2.5862598419189453 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01279505, + "balance_loss_clip": 0.0629133, + "balance_loss_mlp": 0.01264067, + "epoch": 0.3341650383285736, + "flos": 24034052862720.0, + "grad_norm": 2.760917503655681, + "language_loss": 0.80188966, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.87946206, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.15429688, + "step": 5558, + "time_per_iteration": 2.596344232559204 + }, + { + "auxiliary_loss_clip": 0.06475051, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06287208, + "balance_loss_mlp": 0.01267646, + "epoch": 0.33422516158124155, + "flos": 24909255959040.0, + "grad_norm": 1.7423955567809428, + "language_loss": 0.81954122, + "learning_rate": 3.105162783594788e-06, + "loss": 0.8971197, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1517334, + "step": 5559, + "time_per_iteration": 2.587005376815796 + }, + { + "auxiliary_loss_clip": 0.06467593, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 0.06286522, + "balance_loss_mlp": 0.01265224, + "epoch": 0.3342852848339095, + "flos": 18339404878080.0, + "grad_norm": 2.1220335034517093, + "language_loss": 0.72058392, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.79805756, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.14550781, + "step": 5560, + "time_per_iteration": 2.536546230316162 + }, + { + "auxiliary_loss_clip": 0.06481705, + "auxiliary_loss_mlp": 0.01285397, + "balance_loss_clip": 0.06292065, + "balance_loss_mlp": 0.01269458, + "epoch": 0.3343454080865775, + "flos": 30054690357120.0, + "grad_norm": 1.596178779859494, + "language_loss": 0.75386882, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.83153981, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.15930176, + "step": 5561, + "time_per_iteration": 2.672700881958008 + }, + { + "auxiliary_loss_clip": 0.06477022, + "auxiliary_loss_mlp": 0.01277798, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01262551, + "epoch": 0.33440553133924544, + "flos": 16404362213760.0, + "grad_norm": 1.6462526862455489, + "language_loss": 0.70108986, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77863806, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15246582, + "step": 5562, + "time_per_iteration": 2.501317024230957 + }, + { + "auxiliary_loss_clip": 0.06472157, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 0.06287345, + "balance_loss_mlp": 0.01265396, + "epoch": 0.3344656545919134, + "flos": 24248723823360.0, + "grad_norm": 1.5361546803562123, + "language_loss": 0.65648419, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.7340101, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15026855, + "step": 5563, + "time_per_iteration": 2.5564165115356445 + }, + { + "auxiliary_loss_clip": 0.06477885, + "auxiliary_loss_mlp": 0.01282181, + "balance_loss_clip": 0.06290222, + "balance_loss_mlp": 0.01264752, + "epoch": 0.3345257778445814, + "flos": 52130431048320.0, + "grad_norm": 1.3531042812140452, + "language_loss": 0.74246049, + "learning_rate": 3.103539258400766e-06, + "loss": 0.82006115, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.17431641, + "step": 5564, + "time_per_iteration": 2.810534715652466 + }, + { + "auxiliary_loss_clip": 0.06356741, + "auxiliary_loss_mlp": 0.01295627, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01291562, + "epoch": 0.33458590109724934, + "flos": 68066528319360.0, + "grad_norm": 0.78222915395806, + "language_loss": 0.55275309, + "learning_rate": 3.103214427773745e-06, + "loss": 0.62927675, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04064941, + "step": 5565, + "time_per_iteration": 3.1279821395874023 + }, + { + "auxiliary_loss_clip": 0.06471252, + "auxiliary_loss_mlp": 0.01279791, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01264163, + "epoch": 0.3346460243499173, + "flos": 37423869062400.0, + "grad_norm": 1.705115292174207, + "language_loss": 0.65565574, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73316622, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15625, + "step": 5566, + "time_per_iteration": 2.712435245513916 + }, + { + "auxiliary_loss_clip": 0.0647177, + "auxiliary_loss_mlp": 0.01282122, + "balance_loss_clip": 0.06289912, + "balance_loss_mlp": 0.01266529, + "epoch": 0.3347061476025853, + "flos": 18703269233280.0, + "grad_norm": 1.6655571733561654, + "language_loss": 0.77372861, + "learning_rate": 3.102564641030016e-06, + "loss": 0.85126758, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.15588379, + "step": 5567, + "time_per_iteration": 2.4871251583099365 + }, + { + "auxiliary_loss_clip": 0.06471208, + "auxiliary_loss_mlp": 0.01275703, + "balance_loss_clip": 0.06285998, + "balance_loss_mlp": 0.01259491, + "epoch": 0.3347662708552533, + "flos": 13922957001600.0, + "grad_norm": 1.6558873666299474, + "language_loss": 0.77099127, + "learning_rate": 3.102239684937949e-06, + "loss": 0.84846038, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16223145, + "step": 5568, + "time_per_iteration": 2.5343427658081055 + }, + { + "auxiliary_loss_clip": 0.06472506, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06286565, + "balance_loss_mlp": 0.01262136, + "epoch": 0.33482639410792125, + "flos": 19755645788160.0, + "grad_norm": 1.9310298365294178, + "language_loss": 0.71334505, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7908479, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15643311, + "step": 5569, + "time_per_iteration": 2.5091118812561035 + }, + { + "auxiliary_loss_clip": 0.06473939, + "auxiliary_loss_mlp": 0.01271857, + "balance_loss_clip": 0.06285448, + "balance_loss_mlp": 0.01256479, + "epoch": 0.3348865173605892, + "flos": 16107820214400.0, + "grad_norm": 1.931700529164995, + "language_loss": 0.90211284, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97957081, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15380859, + "step": 5570, + "time_per_iteration": 2.6067447662353516 + }, + { + "auxiliary_loss_clip": 0.06465288, + "auxiliary_loss_mlp": 0.01272678, + "balance_loss_clip": 0.06282274, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3349466406132572, + "flos": 25015836752640.0, + "grad_norm": 1.5216158426421846, + "language_loss": 0.79890078, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87628049, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15405273, + "step": 5571, + "time_per_iteration": 2.5423781871795654 + }, + { + "auxiliary_loss_clip": 0.06342317, + "auxiliary_loss_mlp": 0.01254883, + "balance_loss_clip": 0.06257176, + "balance_loss_mlp": 0.01251411, + "epoch": 0.33500676386592515, + "flos": 54340058413440.0, + "grad_norm": 0.8278358272998855, + "language_loss": 0.55695772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.63292974, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.03482056, + "step": 5572, + "time_per_iteration": 3.1027615070343018 + }, + { + "auxiliary_loss_clip": 0.06472763, + "auxiliary_loss_mlp": 0.0127696, + "balance_loss_clip": 0.06287524, + "balance_loss_mlp": 0.01261677, + "epoch": 0.3350668871185931, + "flos": 26804620915200.0, + "grad_norm": 1.9863197052332227, + "language_loss": 0.78856999, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.86606717, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15283203, + "step": 5573, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.06473139, + "auxiliary_loss_mlp": 0.01274748, + "balance_loss_clip": 0.06286675, + "balance_loss_mlp": 0.01257999, + "epoch": 0.3351270103712611, + "flos": 33518885708160.0, + "grad_norm": 2.2174625445936256, + "language_loss": 0.72959399, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.80707288, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16723633, + "step": 5574, + "time_per_iteration": 2.660301923751831 + }, + { + "auxiliary_loss_clip": 0.06465638, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284496, + "balance_loss_mlp": 0.01256042, + "epoch": 0.33518713362392905, + "flos": 26513613285120.0, + "grad_norm": 1.6818935039401424, + "language_loss": 0.88364851, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.96102208, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15661621, + "step": 5575, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.0648465, + "auxiliary_loss_mlp": 0.01276363, + "balance_loss_clip": 0.06290504, + "balance_loss_mlp": 0.01259316, + "epoch": 0.335247256876597, + "flos": 17237078490240.0, + "grad_norm": 1.9893319880263207, + "language_loss": 0.83043218, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.90804225, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17053223, + "step": 5576, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.06478332, + "auxiliary_loss_mlp": 0.01275534, + "balance_loss_clip": 0.06288211, + "balance_loss_mlp": 0.01259095, + "epoch": 0.335307380129265, + "flos": 25636397690880.0, + "grad_norm": 2.0001339744496622, + "language_loss": 0.73279572, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.16442871, + "step": 5577, + "time_per_iteration": 2.575026750564575 + }, + { + "auxiliary_loss_clip": 0.06475031, + "auxiliary_loss_mlp": 0.01274987, + "balance_loss_clip": 0.0628825, + "balance_loss_mlp": 0.01257689, + "epoch": 0.33536750338193294, + "flos": 19685765882880.0, + "grad_norm": 1.6019428598408136, + "language_loss": 0.82233781, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.89983797, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17297363, + "step": 5578, + "time_per_iteration": 2.544978380203247 + }, + { + "auxiliary_loss_clip": 0.06461956, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01262907, + "epoch": 0.3354276266346009, + "flos": 18338482483200.0, + "grad_norm": 1.788420802177993, + "language_loss": 0.72050315, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.79790771, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15582275, + "step": 5579, + "time_per_iteration": 2.50080943107605 + }, + { + "auxiliary_loss_clip": 0.06478497, + "auxiliary_loss_mlp": 0.01282646, + "balance_loss_clip": 0.06290549, + "balance_loss_mlp": 0.01266898, + "epoch": 0.3354877498872689, + "flos": 17864389681920.0, + "grad_norm": 2.052679713623706, + "language_loss": 0.81401342, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.89162487, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15734863, + "step": 5580, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.06473458, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 0.06284851, + "balance_loss_mlp": 0.01263691, + "epoch": 0.3355478731399369, + "flos": 24724703341440.0, + "grad_norm": 1.6024353673136869, + "language_loss": 0.78190315, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.85943961, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.16485596, + "step": 5581, + "time_per_iteration": 2.539208173751831 + }, + { + "auxiliary_loss_clip": 0.06482114, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06289735, + "balance_loss_mlp": 0.01259084, + "epoch": 0.33560799639260486, + "flos": 16879628972160.0, + "grad_norm": 2.359779356701633, + "language_loss": 0.74923486, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.8268224, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.17565918, + "step": 5582, + "time_per_iteration": 2.5489563941955566 + }, + { + "auxiliary_loss_clip": 0.06478906, + "auxiliary_loss_mlp": 0.01276582, + "balance_loss_clip": 0.06287926, + "balance_loss_mlp": 0.01260191, + "epoch": 0.3356681196452728, + "flos": 18339530659200.0, + "grad_norm": 1.5985505462491367, + "language_loss": 0.82591236, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90346718, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.16369629, + "step": 5583, + "time_per_iteration": 2.4985439777374268 + }, + { + "auxiliary_loss_clip": 0.06466989, + "auxiliary_loss_mlp": 0.01276424, + "balance_loss_clip": 0.06282677, + "balance_loss_mlp": 0.01260664, + "epoch": 0.3357282428979408, + "flos": 34759127116800.0, + "grad_norm": 1.8261350586664176, + "language_loss": 0.77844834, + "learning_rate": 3.097034711451581e-06, + "loss": 0.85588253, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15771484, + "step": 5584, + "time_per_iteration": 2.649090051651001 + }, + { + "auxiliary_loss_clip": 0.06475179, + "auxiliary_loss_mlp": 0.01274752, + "balance_loss_clip": 0.06285385, + "balance_loss_mlp": 0.01259427, + "epoch": 0.33578836615060875, + "flos": 21586539427200.0, + "grad_norm": 1.6814695059799305, + "language_loss": 0.76339197, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84089124, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.15313721, + "step": 5585, + "time_per_iteration": 5.408076763153076 + }, + { + "auxiliary_loss_clip": 0.06463687, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.0126054, + "epoch": 0.3358484894032767, + "flos": 24536377290240.0, + "grad_norm": 1.7085225722674646, + "language_loss": 0.78121984, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.85862964, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16760254, + "step": 5586, + "time_per_iteration": 2.5785536766052246 + }, + { + "auxiliary_loss_clip": 0.06482486, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06290784, + "balance_loss_mlp": 0.01254902, + "epoch": 0.3359086126559447, + "flos": 22462161793920.0, + "grad_norm": 1.9607494340110725, + "language_loss": 0.81952178, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.89705908, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.16357422, + "step": 5587, + "time_per_iteration": 3.9456732273101807 + }, + { + "auxiliary_loss_clip": 0.06460288, + "auxiliary_loss_mlp": 0.01274939, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01259692, + "epoch": 0.33596873590861265, + "flos": 16549069415040.0, + "grad_norm": 1.7386991231776667, + "language_loss": 0.67118108, + "learning_rate": 3.095731802118677e-06, + "loss": 0.74853337, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 5588, + "time_per_iteration": 2.6328773498535156 + }, + { + "auxiliary_loss_clip": 0.06471635, + "auxiliary_loss_mlp": 0.01272286, + "balance_loss_clip": 0.0628484, + "balance_loss_mlp": 0.01255215, + "epoch": 0.3360288591612806, + "flos": 31183864778880.0, + "grad_norm": 2.547244730124186, + "language_loss": 0.70319438, + "learning_rate": 3.095405970878919e-06, + "loss": 0.78063357, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17077637, + "step": 5589, + "time_per_iteration": 2.631972074508667 + }, + { + "auxiliary_loss_clip": 0.06473772, + "auxiliary_loss_mlp": 0.01270331, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3360889824139486, + "flos": 23703828721920.0, + "grad_norm": 1.7722032929069027, + "language_loss": 0.67818141, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.75562239, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15258789, + "step": 5590, + "time_per_iteration": 2.582160711288452 + }, + { + "auxiliary_loss_clip": 0.0646477, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 0.06283349, + "balance_loss_mlp": 0.01257972, + "epoch": 0.33614910566661654, + "flos": 19324207514880.0, + "grad_norm": 1.8733623292805037, + "language_loss": 0.73821473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.81559563, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15344238, + "step": 5591, + "time_per_iteration": 2.5325355529785156 + }, + { + "auxiliary_loss_clip": 0.06462986, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06280106, + "balance_loss_mlp": 0.01254945, + "epoch": 0.3362092289192845, + "flos": 16477889771520.0, + "grad_norm": 3.0838875929044036, + "language_loss": 0.70195794, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.77929366, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15637207, + "step": 5592, + "time_per_iteration": 3.919609546661377 + }, + { + "auxiliary_loss_clip": 0.06466913, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06283789, + "balance_loss_mlp": 0.01257014, + "epoch": 0.33626935217195253, + "flos": 24250484759040.0, + "grad_norm": 2.017741256836838, + "language_loss": 0.76621854, + "learning_rate": 3.094102230664423e-06, + "loss": 0.8436048, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14697266, + "step": 5593, + "time_per_iteration": 2.582902431488037 + }, + { + "auxiliary_loss_clip": 0.06476289, + "auxiliary_loss_mlp": 0.01272909, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3363294754246205, + "flos": 19724814685440.0, + "grad_norm": 3.212319882003512, + "language_loss": 0.72710228, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80459422, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.17456055, + "step": 5594, + "time_per_iteration": 2.495196580886841 + }, + { + "auxiliary_loss_clip": 0.06477273, + "auxiliary_loss_mlp": 0.01272377, + "balance_loss_clip": 0.06289684, + "balance_loss_mlp": 0.01256379, + "epoch": 0.33638959867728846, + "flos": 22602005458560.0, + "grad_norm": 1.7565144487218112, + "language_loss": 0.8009572, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.87845373, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16003418, + "step": 5595, + "time_per_iteration": 2.5639891624450684 + }, + { + "auxiliary_loss_clip": 0.06468762, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06285411, + "balance_loss_mlp": 0.01256691, + "epoch": 0.3364497219299564, + "flos": 21000834587520.0, + "grad_norm": 1.6187307873664143, + "language_loss": 0.81718135, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.89458185, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.14587402, + "step": 5596, + "time_per_iteration": 2.579089403152466 + }, + { + "auxiliary_loss_clip": 0.06470582, + "auxiliary_loss_mlp": 0.01270351, + "balance_loss_clip": 0.06285384, + "balance_loss_mlp": 0.01256034, + "epoch": 0.3365098451826244, + "flos": 25235664739200.0, + "grad_norm": 1.5539796133352632, + "language_loss": 0.76225436, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.83966368, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.14318848, + "step": 5597, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.06473622, + "auxiliary_loss_mlp": 0.01271725, + "balance_loss_clip": 0.06290761, + "balance_loss_mlp": 0.01257206, + "epoch": 0.33656996843529235, + "flos": 24578612547840.0, + "grad_norm": 1.67554812607641, + "language_loss": 0.78886169, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86631513, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14520264, + "step": 5598, + "time_per_iteration": 2.54971981048584 + }, + { + "auxiliary_loss_clip": 0.06487022, + "auxiliary_loss_mlp": 0.01275679, + "balance_loss_clip": 0.0629402, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3366300916879603, + "flos": 44101223331840.0, + "grad_norm": 1.966389459711274, + "language_loss": 0.64792764, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.7255547, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.16326904, + "step": 5599, + "time_per_iteration": 2.741544723510742 + }, + { + "auxiliary_loss_clip": 0.06483869, + "auxiliary_loss_mlp": 0.01276046, + "balance_loss_clip": 0.06290758, + "balance_loss_mlp": 0.01259118, + "epoch": 0.3366902149406283, + "flos": 13884746739840.0, + "grad_norm": 2.857086104177812, + "language_loss": 0.82787466, + "learning_rate": 3.091819088459249e-06, + "loss": 0.90547383, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.16906738, + "step": 5600, + "time_per_iteration": 2.4761526584625244 + }, + { + "auxiliary_loss_clip": 0.06480727, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289887, + "balance_loss_mlp": 0.01257255, + "epoch": 0.33675033819329625, + "flos": 16258648763520.0, + "grad_norm": 2.1921833677853853, + "language_loss": 0.83268821, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.91022456, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15649414, + "step": 5601, + "time_per_iteration": 2.5205788612365723 + }, + { + "auxiliary_loss_clip": 0.06469133, + "auxiliary_loss_mlp": 0.01269312, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01255382, + "epoch": 0.3368104614459642, + "flos": 17061498259200.0, + "grad_norm": 1.6270640398275205, + "language_loss": 0.83791035, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91529477, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.1394043, + "step": 5602, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.06479525, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06294133, + "balance_loss_mlp": 0.01258645, + "epoch": 0.3368705846986322, + "flos": 17864473536000.0, + "grad_norm": 2.666791314538914, + "language_loss": 0.69934028, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77687562, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15380859, + "step": 5603, + "time_per_iteration": 2.5512561798095703 + }, + { + "auxiliary_loss_clip": 0.0648806, + "auxiliary_loss_mlp": 0.01271029, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01255376, + "epoch": 0.33693070795130015, + "flos": 22936086887040.0, + "grad_norm": 1.5393691582180518, + "language_loss": 0.83336604, + "learning_rate": 3.090513524656898e-06, + "loss": 0.91095686, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.15661621, + "step": 5604, + "time_per_iteration": 2.542419910430908 + }, + { + "auxiliary_loss_clip": 0.06487563, + "auxiliary_loss_mlp": 0.01271201, + "balance_loss_clip": 0.06296179, + "balance_loss_mlp": 0.01255, + "epoch": 0.3369908312039681, + "flos": 22023889413120.0, + "grad_norm": 1.7290560496085086, + "language_loss": 0.74166059, + "learning_rate": 3.090187030294409e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.1619873, + "step": 5605, + "time_per_iteration": 2.551250696182251 + }, + { + "auxiliary_loss_clip": 0.0648852, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 0.06295876, + "balance_loss_mlp": 0.01253347, + "epoch": 0.33705095445663613, + "flos": 11806799736960.0, + "grad_norm": 2.683910051705504, + "language_loss": 0.84068418, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91825807, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.15515137, + "step": 5606, + "time_per_iteration": 2.4841489791870117 + }, + { + "auxiliary_loss_clip": 0.0647673, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254721, + "epoch": 0.3371110777093041, + "flos": 25053460035840.0, + "grad_norm": 1.669780314791874, + "language_loss": 0.68210214, + "learning_rate": 3.089533917561809e-06, + "loss": 0.7595638, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.14709473, + "step": 5607, + "time_per_iteration": 2.6018009185791016 + }, + { + "auxiliary_loss_clip": 0.0648887, + "auxiliary_loss_mlp": 0.01274582, + "balance_loss_clip": 0.06295381, + "balance_loss_mlp": 0.01258131, + "epoch": 0.33717120096197206, + "flos": 26586386156160.0, + "grad_norm": 1.643709475435958, + "language_loss": 0.71566343, + "learning_rate": 3.089207299216464e-06, + "loss": 0.79329789, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5608, + "time_per_iteration": 2.5980639457702637 + }, + { + "auxiliary_loss_clip": 0.06479236, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 0.06291037, + "balance_loss_mlp": 0.01258712, + "epoch": 0.33723132421464, + "flos": 15163911169920.0, + "grad_norm": 1.8781248289320855, + "language_loss": 0.79662472, + "learning_rate": 3.088880639568269e-06, + "loss": 0.87416643, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16223145, + "step": 5609, + "time_per_iteration": 2.6196935176849365 + }, + { + "auxiliary_loss_clip": 0.06480544, + "auxiliary_loss_mlp": 0.01274048, + "balance_loss_clip": 0.06290779, + "balance_loss_mlp": 0.01256262, + "epoch": 0.337291447467308, + "flos": 23442058967040.0, + "grad_norm": 1.7293742366408622, + "language_loss": 0.83075953, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.90830547, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17785645, + "step": 5610, + "time_per_iteration": 2.53485369682312 + }, + { + "auxiliary_loss_clip": 0.06471263, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06288794, + "balance_loss_mlp": 0.01254097, + "epoch": 0.33735157071997596, + "flos": 17243870670720.0, + "grad_norm": 1.916021570377688, + "language_loss": 0.82657987, + "learning_rate": 3.088227196412879e-06, + "loss": 0.90398765, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1541748, + "step": 5611, + "time_per_iteration": 2.5164084434509277 + }, + { + "auxiliary_loss_clip": 0.06478009, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.0629037, + "balance_loss_mlp": 0.01260005, + "epoch": 0.3374116939726439, + "flos": 28265025726720.0, + "grad_norm": 3.0042840390827106, + "language_loss": 0.79815799, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.87571925, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.18084717, + "step": 5612, + "time_per_iteration": 2.582742929458618 + }, + { + "auxiliary_loss_clip": 0.06476334, + "auxiliary_loss_mlp": 0.0127707, + "balance_loss_clip": 0.06288031, + "balance_loss_mlp": 0.01261597, + "epoch": 0.3374718172253119, + "flos": 35928314663040.0, + "grad_norm": 2.3711016444568003, + "language_loss": 0.69757682, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7751109, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15466309, + "step": 5613, + "time_per_iteration": 2.6553308963775635 + }, + { + "auxiliary_loss_clip": 0.06477948, + "auxiliary_loss_mlp": 0.01274833, + "balance_loss_clip": 0.06288674, + "balance_loss_mlp": 0.01259181, + "epoch": 0.33753194047797985, + "flos": 18192517470720.0, + "grad_norm": 1.7341744507496721, + "language_loss": 0.80043244, + "learning_rate": 3.087246722218144e-06, + "loss": 0.87796032, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15673828, + "step": 5614, + "time_per_iteration": 2.5162055492401123 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01274123, + "balance_loss_clip": 0.06289384, + "balance_loss_mlp": 0.01257684, + "epoch": 0.3375920637306478, + "flos": 23155621384320.0, + "grad_norm": 1.8737965791301845, + "language_loss": 0.91138643, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98892087, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16430664, + "step": 5615, + "time_per_iteration": 2.5491819381713867 + }, + { + "auxiliary_loss_clip": 0.0646698, + "auxiliary_loss_mlp": 0.01277747, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01261857, + "epoch": 0.3376521869833158, + "flos": 23118878568960.0, + "grad_norm": 1.8899714235087088, + "language_loss": 0.81227732, + "learning_rate": 3.086592866591809e-06, + "loss": 0.88972461, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.15881348, + "step": 5616, + "time_per_iteration": 2.551891803741455 + }, + { + "auxiliary_loss_clip": 0.0647929, + "auxiliary_loss_mlp": 0.01281624, + "balance_loss_clip": 0.06285349, + "balance_loss_mlp": 0.01263576, + "epoch": 0.33771231023598375, + "flos": 19279498561920.0, + "grad_norm": 1.7280186066143421, + "language_loss": 0.84097004, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91857922, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.18054199, + "step": 5617, + "time_per_iteration": 2.532703161239624 + }, + { + "auxiliary_loss_clip": 0.06466082, + "auxiliary_loss_mlp": 0.01273548, + "balance_loss_clip": 0.06279126, + "balance_loss_mlp": 0.01257073, + "epoch": 0.3377724334886517, + "flos": 18156026217600.0, + "grad_norm": 1.631465963150073, + "language_loss": 0.80857313, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8859694, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.16467285, + "step": 5618, + "time_per_iteration": 2.5592081546783447 + }, + { + "auxiliary_loss_clip": 0.06473768, + "auxiliary_loss_mlp": 0.01275311, + "balance_loss_clip": 0.06286047, + "balance_loss_mlp": 0.01258514, + "epoch": 0.3378325567413197, + "flos": 25783159317120.0, + "grad_norm": 2.0305417192076267, + "language_loss": 0.71181929, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7893101, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16809082, + "step": 5619, + "time_per_iteration": 2.5726358890533447 + }, + { + "auxiliary_loss_clip": 0.06476114, + "auxiliary_loss_mlp": 0.01271613, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01256688, + "epoch": 0.3378926799939877, + "flos": 21322254049920.0, + "grad_norm": 2.6280659122339496, + "language_loss": 0.70615005, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78362733, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.14929199, + "step": 5620, + "time_per_iteration": 2.604161500930786 + }, + { + "auxiliary_loss_clip": 0.06467394, + "auxiliary_loss_mlp": 0.0127348, + "balance_loss_clip": 0.0628472, + "balance_loss_mlp": 0.01258054, + "epoch": 0.33795280324665566, + "flos": 24906991898880.0, + "grad_norm": 2.3940060195146384, + "language_loss": 0.6847257, + "learning_rate": 3.084957506678058e-06, + "loss": 0.76213443, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1541748, + "step": 5621, + "time_per_iteration": 2.559730052947998 + }, + { + "auxiliary_loss_clip": 0.06469798, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06287812, + "balance_loss_mlp": 0.0125914, + "epoch": 0.33801292649932363, + "flos": 24760859178240.0, + "grad_norm": 1.8671152624425502, + "language_loss": 0.82685888, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.90429658, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.1484375, + "step": 5622, + "time_per_iteration": 2.5722928047180176 + }, + { + "auxiliary_loss_clip": 0.06466316, + "auxiliary_loss_mlp": 0.01274625, + "balance_loss_clip": 0.06284748, + "balance_loss_mlp": 0.01260564, + "epoch": 0.3380730497519916, + "flos": 26731177211520.0, + "grad_norm": 1.4865849557607265, + "language_loss": 0.74114043, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.81854987, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14056396, + "step": 5623, + "time_per_iteration": 2.5830907821655273 + }, + { + "auxiliary_loss_clip": 0.06389539, + "auxiliary_loss_mlp": 0.01273334, + "balance_loss_clip": 0.06299451, + "balance_loss_mlp": 0.01265943, + "epoch": 0.33813317300465956, + "flos": 70056845550720.0, + "grad_norm": 0.7132848624035326, + "language_loss": 0.54856884, + "learning_rate": 3.083975796930215e-06, + "loss": 0.62519753, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.07373047, + "step": 5624, + "time_per_iteration": 4.680114030838013 + }, + { + "auxiliary_loss_clip": 0.06475174, + "auxiliary_loss_mlp": 0.01272775, + "balance_loss_clip": 0.06285602, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3381932962573275, + "flos": 24104142403200.0, + "grad_norm": 3.6042241236842267, + "language_loss": 0.73496938, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81244886, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16687012, + "step": 5625, + "time_per_iteration": 4.002846956253052 + }, + { + "auxiliary_loss_clip": 0.06480759, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06288841, + "balance_loss_mlp": 0.01257021, + "epoch": 0.3382534195099955, + "flos": 19283775120000.0, + "grad_norm": 1.9831743515273117, + "language_loss": 0.7176404, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.79519677, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17858887, + "step": 5626, + "time_per_iteration": 2.4999427795410156 + }, + { + "auxiliary_loss_clip": 0.06468458, + "auxiliary_loss_mlp": 0.01272986, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01257739, + "epoch": 0.33831354276266346, + "flos": 25232897554560.0, + "grad_norm": 2.987617225478933, + "language_loss": 0.81275499, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.8901695, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15246582, + "step": 5627, + "time_per_iteration": 3.951984405517578 + }, + { + "auxiliary_loss_clip": 0.06478465, + "auxiliary_loss_mlp": 0.01272976, + "balance_loss_clip": 0.06288861, + "balance_loss_mlp": 0.0125668, + "epoch": 0.3383736660153314, + "flos": 23118627006720.0, + "grad_norm": 1.844905449272807, + "language_loss": 0.80405974, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.88157415, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16296387, + "step": 5628, + "time_per_iteration": 2.5670697689056396 + }, + { + "auxiliary_loss_clip": 0.06477988, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06288996, + "balance_loss_mlp": 0.0125457, + "epoch": 0.3384337892679994, + "flos": 23483874954240.0, + "grad_norm": 2.662319374226008, + "language_loss": 0.77757806, + "learning_rate": 3.082338792093254e-06, + "loss": 0.85506529, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16174316, + "step": 5629, + "time_per_iteration": 2.5463128089904785 + }, + { + "auxiliary_loss_clip": 0.06482605, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06291752, + "balance_loss_mlp": 0.01262758, + "epoch": 0.33849391252066735, + "flos": 19431626849280.0, + "grad_norm": 1.826421419331283, + "language_loss": 0.85789764, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.9355278, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.17663574, + "step": 5630, + "time_per_iteration": 2.5818262100219727 + }, + { + "auxiliary_loss_clip": 0.06476109, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06290477, + "balance_loss_mlp": 0.01260073, + "epoch": 0.3385540357733353, + "flos": 21070462930560.0, + "grad_norm": 2.179516256809373, + "language_loss": 0.72520673, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80271661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.14807129, + "step": 5631, + "time_per_iteration": 3.9340498447418213 + }, + { + "auxiliary_loss_clip": 0.06388511, + "auxiliary_loss_mlp": 0.01280567, + "balance_loss_clip": 0.06298131, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3386141590260033, + "flos": 69224772908160.0, + "grad_norm": 0.8339652565495183, + "language_loss": 0.56105018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.63774097, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.08361816, + "step": 5632, + "time_per_iteration": 3.215395450592041 + }, + { + "auxiliary_loss_clip": 0.06477562, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 0.06290288, + "balance_loss_mlp": 0.01256573, + "epoch": 0.3386742822786713, + "flos": 25526420807040.0, + "grad_norm": 3.459768837753136, + "language_loss": 0.81030583, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.88779831, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15112305, + "step": 5633, + "time_per_iteration": 2.6278936862945557 + }, + { + "auxiliary_loss_clip": 0.06473435, + "auxiliary_loss_mlp": 0.01274796, + "balance_loss_clip": 0.06287597, + "balance_loss_mlp": 0.01258942, + "epoch": 0.33873440553133927, + "flos": 23629881893760.0, + "grad_norm": 2.634738846372382, + "language_loss": 0.59410667, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.67158902, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5634, + "time_per_iteration": 2.565622091293335 + }, + { + "auxiliary_loss_clip": 0.06475686, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.01255216, + "epoch": 0.33879452878400723, + "flos": 17094006443520.0, + "grad_norm": 1.81394172090833, + "language_loss": 0.92877531, + "learning_rate": 3.080373032026589e-06, + "loss": 1.00623596, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15161133, + "step": 5635, + "time_per_iteration": 2.539051055908203 + }, + { + "auxiliary_loss_clip": 0.06470082, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 0.0629005, + "balance_loss_mlp": 0.01257457, + "epoch": 0.3388546520366752, + "flos": 15747477730560.0, + "grad_norm": 1.8703432540182672, + "language_loss": 0.75823128, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.83566296, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15618896, + "step": 5636, + "time_per_iteration": 2.4998726844787598 + }, + { + "auxiliary_loss_clip": 0.064714, + "auxiliary_loss_mlp": 0.01275037, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.01258848, + "epoch": 0.33891477528934316, + "flos": 22425251270400.0, + "grad_norm": 1.6981405891584176, + "language_loss": 0.83775222, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91521657, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1619873, + "step": 5637, + "time_per_iteration": 2.551074981689453 + }, + { + "auxiliary_loss_clip": 0.06474115, + "auxiliary_loss_mlp": 0.01272331, + "balance_loss_clip": 0.06286962, + "balance_loss_mlp": 0.01254736, + "epoch": 0.3389748985420111, + "flos": 17280571559040.0, + "grad_norm": 1.787045955061502, + "language_loss": 0.70609659, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78356105, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.17590332, + "step": 5638, + "time_per_iteration": 2.5479955673217773 + }, + { + "auxiliary_loss_clip": 0.06478329, + "auxiliary_loss_mlp": 0.01289332, + "balance_loss_clip": 0.06293231, + "balance_loss_mlp": 0.01272404, + "epoch": 0.3390350217946791, + "flos": 27752261466240.0, + "grad_norm": 1.7018866339003167, + "language_loss": 0.81276166, + "learning_rate": 3.079061705792765e-06, + "loss": 0.89043832, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16931152, + "step": 5639, + "time_per_iteration": 2.614819288253784 + }, + { + "auxiliary_loss_clip": 0.06487049, + "auxiliary_loss_mlp": 0.01288743, + "balance_loss_clip": 0.06296147, + "balance_loss_mlp": 0.01270635, + "epoch": 0.33909514504734706, + "flos": 20346088383360.0, + "grad_norm": 6.449374256721531, + "language_loss": 0.68149316, + "learning_rate": 3.078733771907907e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.18103027, + "step": 5640, + "time_per_iteration": 2.496300220489502 + }, + { + "auxiliary_loss_clip": 0.06471096, + "auxiliary_loss_mlp": 0.01277542, + "balance_loss_clip": 0.06286727, + "balance_loss_mlp": 0.0125978, + "epoch": 0.339155268300015, + "flos": 14835322183680.0, + "grad_norm": 1.7549267997867504, + "language_loss": 0.70165765, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77914405, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.1776123, + "step": 5641, + "time_per_iteration": 2.524548053741455 + }, + { + "auxiliary_loss_clip": 0.0647646, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01264611, + "epoch": 0.339215391552683, + "flos": 26075173196160.0, + "grad_norm": 2.2643311920206592, + "language_loss": 0.88204467, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95961982, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16430664, + "step": 5642, + "time_per_iteration": 2.551790237426758 + }, + { + "auxiliary_loss_clip": 0.06466684, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06289211, + "balance_loss_mlp": 0.01258195, + "epoch": 0.33927551480535095, + "flos": 14579967265920.0, + "grad_norm": 2.023061860440481, + "language_loss": 0.84285331, + "learning_rate": 3.077749724868924e-06, + "loss": 0.92024505, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1428833, + "step": 5643, + "time_per_iteration": 2.542921304702759 + }, + { + "auxiliary_loss_clip": 0.06468654, + "auxiliary_loss_mlp": 0.01272873, + "balance_loss_clip": 0.06285787, + "balance_loss_mlp": 0.01256708, + "epoch": 0.3393356380580189, + "flos": 23812380086400.0, + "grad_norm": 6.736940029896959, + "language_loss": 0.77634799, + "learning_rate": 3.077421627435922e-06, + "loss": 0.85376322, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.16162109, + "step": 5644, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.06472027, + "auxiliary_loss_mlp": 0.01274584, + "balance_loss_clip": 0.06288091, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3393957613106869, + "flos": 17353637919360.0, + "grad_norm": 2.9654561398927752, + "language_loss": 0.6324017, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.70986784, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15856934, + "step": 5645, + "time_per_iteration": 2.51273775100708 + }, + { + "auxiliary_loss_clip": 0.06466414, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256284, + "epoch": 0.3394558845633549, + "flos": 28440647884800.0, + "grad_norm": 2.089100449350665, + "language_loss": 0.77295536, + "learning_rate": 3.076765310014552e-06, + "loss": 0.8503449, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16247559, + "step": 5646, + "time_per_iteration": 2.5461859703063965 + }, + { + "auxiliary_loss_clip": 0.06477356, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01257568, + "epoch": 0.33951600781602287, + "flos": 22092804996480.0, + "grad_norm": 2.533529984962848, + "language_loss": 0.79702288, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87454283, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17077637, + "step": 5647, + "time_per_iteration": 2.5699684619903564 + }, + { + "auxiliary_loss_clip": 0.0647471, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06288452, + "balance_loss_mlp": 0.01256067, + "epoch": 0.33957613106869083, + "flos": 23885027176320.0, + "grad_norm": 2.1454269075726535, + "language_loss": 0.78001738, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.85749137, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16625977, + "step": 5648, + "time_per_iteration": 2.5294926166534424 + }, + { + "auxiliary_loss_clip": 0.063921, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0630298, + "balance_loss_mlp": 0.01254759, + "epoch": 0.3396362543213588, + "flos": 71264411066880.0, + "grad_norm": 0.7604552176896413, + "language_loss": 0.56109136, + "learning_rate": 3.075780527680754e-06, + "loss": 0.63763207, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.07196045, + "step": 5649, + "time_per_iteration": 3.2003703117370605 + }, + { + "auxiliary_loss_clip": 0.06473398, + "auxiliary_loss_mlp": 0.01280094, + "balance_loss_clip": 0.06287606, + "balance_loss_mlp": 0.01263274, + "epoch": 0.33969637757402676, + "flos": 25928746986240.0, + "grad_norm": 1.4812234353432667, + "language_loss": 0.85783911, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.93537402, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.16821289, + "step": 5650, + "time_per_iteration": 2.551633834838867 + }, + { + "auxiliary_loss_clip": 0.06475022, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06292272, + "balance_loss_mlp": 0.01261841, + "epoch": 0.33975650082669473, + "flos": 35270382003840.0, + "grad_norm": 3.382903843955623, + "language_loss": 0.71404934, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.79157567, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15759277, + "step": 5651, + "time_per_iteration": 2.665083885192871 + }, + { + "auxiliary_loss_clip": 0.06471914, + "auxiliary_loss_mlp": 0.01278706, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261922, + "epoch": 0.3398166240793627, + "flos": 16651373650560.0, + "grad_norm": 4.478617872089092, + "language_loss": 0.81850624, + "learning_rate": 3.074795378203616e-06, + "loss": 0.89601243, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16772461, + "step": 5652, + "time_per_iteration": 2.5136160850524902 + }, + { + "auxiliary_loss_clip": 0.06483054, + "auxiliary_loss_mlp": 0.01281024, + "balance_loss_clip": 0.06293614, + "balance_loss_mlp": 0.0126344, + "epoch": 0.33987674733203066, + "flos": 24069244377600.0, + "grad_norm": 3.0225456344203088, + "language_loss": 0.77707815, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.85471892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.17590332, + "step": 5653, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.06478614, + "auxiliary_loss_mlp": 0.01275428, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01259788, + "epoch": 0.3399368705846986, + "flos": 13253955603840.0, + "grad_norm": 4.6454995512067745, + "language_loss": 0.86809218, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.94563264, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15625, + "step": 5654, + "time_per_iteration": 2.4661965370178223 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01267584, + "epoch": 0.3399969938373666, + "flos": 27019585365120.0, + "grad_norm": 2.782601809339298, + "language_loss": 0.65974486, + "learning_rate": 3.073809861919351e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16369629, + "step": 5655, + "time_per_iteration": 2.555647611618042 + }, + { + "auxiliary_loss_clip": 0.06478781, + "auxiliary_loss_mlp": 0.01275484, + "balance_loss_clip": 0.06293027, + "balance_loss_mlp": 0.01259558, + "epoch": 0.34005711709003456, + "flos": 28557920073600.0, + "grad_norm": 1.4106761603755547, + "language_loss": 0.76612461, + "learning_rate": 3.073481275036697e-06, + "loss": 0.84366733, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15917969, + "step": 5656, + "time_per_iteration": 2.644866466522217 + }, + { + "auxiliary_loss_clip": 0.06484362, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06293096, + "balance_loss_mlp": 0.01260436, + "epoch": 0.3401172403427025, + "flos": 21623533804800.0, + "grad_norm": 1.950261924987131, + "language_loss": 0.83422613, + "learning_rate": 3.073152647447525e-06, + "loss": 0.9118408, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16674805, + "step": 5657, + "time_per_iteration": 2.701688051223755 + }, + { + "auxiliary_loss_clip": 0.06477939, + "auxiliary_loss_mlp": 0.01276671, + "balance_loss_clip": 0.06292981, + "balance_loss_mlp": 0.01259851, + "epoch": 0.3401773635953705, + "flos": 25893010419840.0, + "grad_norm": 5.064784702806917, + "language_loss": 0.86277437, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.94032043, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.16833496, + "step": 5658, + "time_per_iteration": 2.622107744216919 + }, + { + "auxiliary_loss_clip": 0.06400045, + "auxiliary_loss_mlp": 0.01275632, + "balance_loss_clip": 0.06310016, + "balance_loss_mlp": 0.01268671, + "epoch": 0.3402374868480385, + "flos": 65527737459840.0, + "grad_norm": 0.8082747939523138, + "language_loss": 0.59960568, + "learning_rate": 3.072495270199477e-06, + "loss": 0.67636251, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.06970215, + "step": 5659, + "time_per_iteration": 3.1002566814422607 + }, + { + "auxiliary_loss_clip": 0.0647618, + "auxiliary_loss_mlp": 0.01281423, + "balance_loss_clip": 0.06294397, + "balance_loss_mlp": 0.01264591, + "epoch": 0.34029761010070647, + "flos": 24067357660800.0, + "grad_norm": 2.7764582815625514, + "language_loss": 0.68693221, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76450825, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16821289, + "step": 5660, + "time_per_iteration": 2.620135545730591 + }, + { + "auxiliary_loss_clip": 0.06473149, + "auxiliary_loss_mlp": 0.01278369, + "balance_loss_clip": 0.06289428, + "balance_loss_mlp": 0.01262157, + "epoch": 0.34035773335337444, + "flos": 27607093067520.0, + "grad_norm": 2.0682817387265477, + "language_loss": 0.6727913, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16210938, + "step": 5661, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06469939, + "auxiliary_loss_mlp": 0.01280149, + "balance_loss_clip": 0.06289508, + "balance_loss_mlp": 0.01264175, + "epoch": 0.3404178566060424, + "flos": 20818923373440.0, + "grad_norm": 1.802665197928241, + "language_loss": 0.79380333, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87130427, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15966797, + "step": 5662, + "time_per_iteration": 2.552755832672119 + }, + { + "auxiliary_loss_clip": 0.06474, + "auxiliary_loss_mlp": 0.01278156, + "balance_loss_clip": 0.06290844, + "balance_loss_mlp": 0.01260454, + "epoch": 0.34047797985871037, + "flos": 26840818679040.0, + "grad_norm": 2.1558050020889894, + "language_loss": 0.73809367, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.8156153, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.17700195, + "step": 5663, + "time_per_iteration": 2.5490622520446777 + }, + { + "auxiliary_loss_clip": 0.06470126, + "auxiliary_loss_mlp": 0.01281986, + "balance_loss_clip": 0.06290488, + "balance_loss_mlp": 0.01265714, + "epoch": 0.34053810311137833, + "flos": 19688742702720.0, + "grad_norm": 1.852400144955729, + "language_loss": 0.86839676, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.94591784, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16259766, + "step": 5664, + "time_per_iteration": 5.419060707092285 + }, + { + "auxiliary_loss_clip": 0.06483276, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06295361, + "balance_loss_mlp": 0.01257423, + "epoch": 0.3405982263640463, + "flos": 21732169023360.0, + "grad_norm": 1.8640809787797845, + "language_loss": 0.69509971, + "learning_rate": 3.070522162795235e-06, + "loss": 0.77267611, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16943359, + "step": 5665, + "time_per_iteration": 2.547194719314575 + }, + { + "auxiliary_loss_clip": 0.06482168, + "auxiliary_loss_mlp": 0.01274659, + "balance_loss_clip": 0.0629427, + "balance_loss_mlp": 0.01257648, + "epoch": 0.34065834961671426, + "flos": 18047600634240.0, + "grad_norm": 2.6257214905883237, + "language_loss": 0.73526829, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.81283653, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.17016602, + "step": 5666, + "time_per_iteration": 2.527994155883789 + }, + { + "auxiliary_loss_clip": 0.06482688, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255373, + "epoch": 0.3407184728693822, + "flos": 21403705818240.0, + "grad_norm": 1.661941695135435, + "language_loss": 0.74005675, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.81760579, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.1685791, + "step": 5667, + "time_per_iteration": 4.029574155807495 + }, + { + "auxiliary_loss_clip": 0.06378959, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06290369, + "balance_loss_mlp": 0.01260898, + "epoch": 0.3407785961220502, + "flos": 68709352515840.0, + "grad_norm": 0.8062084259911544, + "language_loss": 0.63318539, + "learning_rate": 3.069535060901597e-06, + "loss": 0.70965815, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.07397461, + "step": 5668, + "time_per_iteration": 3.3641560077667236 + }, + { + "auxiliary_loss_clip": 0.06472414, + "auxiliary_loss_mlp": 0.01272754, + "balance_loss_clip": 0.0628752, + "balance_loss_mlp": 0.01256863, + "epoch": 0.34083871937471816, + "flos": 14069634773760.0, + "grad_norm": 2.007810831329869, + "language_loss": 0.73127198, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.80872369, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15893555, + "step": 5669, + "time_per_iteration": 2.4918038845062256 + }, + { + "auxiliary_loss_clip": 0.06479842, + "auxiliary_loss_mlp": 0.0127954, + "balance_loss_clip": 0.06292197, + "balance_loss_mlp": 0.01263423, + "epoch": 0.3408988426273861, + "flos": 17089981447680.0, + "grad_norm": 2.0642744441347287, + "language_loss": 0.80626565, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.88385952, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5670, + "time_per_iteration": 2.5270040035247803 + }, + { + "auxiliary_loss_clip": 0.06481062, + "auxiliary_loss_mlp": 0.01275164, + "balance_loss_clip": 0.06291522, + "balance_loss_mlp": 0.0125838, + "epoch": 0.3409589658800541, + "flos": 24031411459200.0, + "grad_norm": 1.863009265742361, + "language_loss": 0.77916187, + "learning_rate": 3.068547593996078e-06, + "loss": 0.85672414, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16784668, + "step": 5671, + "time_per_iteration": 4.039815664291382 + }, + { + "auxiliary_loss_clip": 0.06473973, + "auxiliary_loss_mlp": 0.01276984, + "balance_loss_clip": 0.06289308, + "balance_loss_mlp": 0.01260712, + "epoch": 0.34101908913272205, + "flos": 21148350900480.0, + "grad_norm": 1.9142883162018633, + "language_loss": 0.74626315, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.82377267, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16259766, + "step": 5672, + "time_per_iteration": 2.564887762069702 + }, + { + "auxiliary_loss_clip": 0.06475951, + "auxiliary_loss_mlp": 0.01275656, + "balance_loss_clip": 0.06287946, + "balance_loss_mlp": 0.01259265, + "epoch": 0.3410792123853901, + "flos": 15706835700480.0, + "grad_norm": 1.714309741158987, + "language_loss": 0.73791027, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81542635, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16394043, + "step": 5673, + "time_per_iteration": 2.540194511413574 + }, + { + "auxiliary_loss_clip": 0.06466323, + "auxiliary_loss_mlp": 0.01283225, + "balance_loss_clip": 0.06284231, + "balance_loss_mlp": 0.01266584, + "epoch": 0.34113933563805804, + "flos": 23042122629120.0, + "grad_norm": 1.8379615104267257, + "language_loss": 0.7978701, + "learning_rate": 3.067559762415682e-06, + "loss": 0.87536556, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16638184, + "step": 5674, + "time_per_iteration": 2.5462148189544678 + }, + { + "auxiliary_loss_clip": 0.06364837, + "auxiliary_loss_mlp": 0.01262017, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255442, + "epoch": 0.341199458890726, + "flos": 69631878769920.0, + "grad_norm": 0.7752872762952348, + "language_loss": 0.56147063, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.63773918, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.06585693, + "step": 5675, + "time_per_iteration": 3.370281457901001 + }, + { + "auxiliary_loss_clip": 0.0645988, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06281768, + "balance_loss_mlp": 0.01257398, + "epoch": 0.34125958214339397, + "flos": 22352939596800.0, + "grad_norm": 2.600205708544321, + "language_loss": 0.79689062, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.87422335, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16003418, + "step": 5676, + "time_per_iteration": 2.5312321186065674 + }, + { + "auxiliary_loss_clip": 0.06470488, + "auxiliary_loss_mlp": 0.01271752, + "balance_loss_clip": 0.06286064, + "balance_loss_mlp": 0.01255051, + "epoch": 0.34131970539606193, + "flos": 21878427525120.0, + "grad_norm": 2.203551534393157, + "language_loss": 0.8601976, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.93761992, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.16699219, + "step": 5677, + "time_per_iteration": 2.555037260055542 + }, + { + "auxiliary_loss_clip": 0.06463757, + "auxiliary_loss_mlp": 0.01274207, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3413798286487299, + "flos": 24942560757120.0, + "grad_norm": 2.786164717546535, + "language_loss": 0.80252033, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87989998, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16955566, + "step": 5678, + "time_per_iteration": 2.6321489810943604 + }, + { + "auxiliary_loss_clip": 0.06467854, + "auxiliary_loss_mlp": 0.01270663, + "balance_loss_clip": 0.06282793, + "balance_loss_mlp": 0.01255404, + "epoch": 0.34143995190139786, + "flos": 25381420116480.0, + "grad_norm": 1.8772848902338297, + "language_loss": 0.75927806, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.83666325, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15246582, + "step": 5679, + "time_per_iteration": 2.5981781482696533 + }, + { + "auxiliary_loss_clip": 0.06365222, + "auxiliary_loss_mlp": 0.01260685, + "balance_loss_clip": 0.06278291, + "balance_loss_mlp": 0.01253538, + "epoch": 0.34150007515406583, + "flos": 67804785763200.0, + "grad_norm": 0.7019635675964923, + "language_loss": 0.59521842, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.67147756, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.0713501, + "step": 5680, + "time_per_iteration": 3.2768852710723877 + }, + { + "auxiliary_loss_clip": 0.06464119, + "auxiliary_loss_mlp": 0.01271493, + "balance_loss_clip": 0.06282759, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3415601984067338, + "flos": 20308548954240.0, + "grad_norm": 1.756785442101194, + "language_loss": 0.72804415, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80540025, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15881348, + "step": 5681, + "time_per_iteration": 2.540839195251465 + }, + { + "auxiliary_loss_clip": 0.06462204, + "auxiliary_loss_mlp": 0.01272244, + "balance_loss_clip": 0.06283034, + "balance_loss_mlp": 0.01256806, + "epoch": 0.34162032165940176, + "flos": 26038346526720.0, + "grad_norm": 5.204332383129175, + "language_loss": 0.71220171, + "learning_rate": 3.064923764577233e-06, + "loss": 0.78954625, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15454102, + "step": 5682, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.06466864, + "auxiliary_loss_mlp": 0.0127503, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01258711, + "epoch": 0.3416804449120697, + "flos": 28810843223040.0, + "grad_norm": 1.4703350638010875, + "language_loss": 0.83879244, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.91621137, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.16320801, + "step": 5683, + "time_per_iteration": 2.595921277999878 + }, + { + "auxiliary_loss_clip": 0.06468399, + "auxiliary_loss_mlp": 0.01274924, + "balance_loss_clip": 0.06284815, + "balance_loss_mlp": 0.01258354, + "epoch": 0.3417405681647377, + "flos": 22608210660480.0, + "grad_norm": 1.8188343464074745, + "language_loss": 0.71334541, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.79077864, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 5684, + "time_per_iteration": 2.5821194648742676 + }, + { + "auxiliary_loss_clip": 0.06462076, + "auxiliary_loss_mlp": 0.01268234, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01253816, + "epoch": 0.34180069141740566, + "flos": 24722942405760.0, + "grad_norm": 1.4943065575919134, + "language_loss": 0.75352108, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.8308242, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.144104, + "step": 5685, + "time_per_iteration": 2.545419216156006 + }, + { + "auxiliary_loss_clip": 0.06457227, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06281762, + "balance_loss_mlp": 0.0125501, + "epoch": 0.3418608146700737, + "flos": 30526644879360.0, + "grad_norm": 1.8907916568784255, + "language_loss": 0.70833004, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7856074, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.1550293, + "step": 5686, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.06467415, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06284459, + "balance_loss_mlp": 0.01253407, + "epoch": 0.34192093792274164, + "flos": 15127755333120.0, + "grad_norm": 2.1973050683231303, + "language_loss": 0.77864039, + "learning_rate": 3.06327495310661e-06, + "loss": 0.85600907, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.16052246, + "step": 5687, + "time_per_iteration": 2.501957654953003 + }, + { + "auxiliary_loss_clip": 0.06462508, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01257435, + "epoch": 0.3419810611754096, + "flos": 13192754595840.0, + "grad_norm": 1.8198375176693335, + "language_loss": 0.87159389, + "learning_rate": 3.062945069803981e-06, + "loss": 0.94895893, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.16552734, + "step": 5688, + "time_per_iteration": 2.514558792114258 + }, + { + "auxiliary_loss_clip": 0.06470017, + "auxiliary_loss_mlp": 0.01272882, + "balance_loss_clip": 0.06283651, + "balance_loss_mlp": 0.01255025, + "epoch": 0.34204118442807757, + "flos": 19542274565760.0, + "grad_norm": 1.9150705307332732, + "language_loss": 0.80177575, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87920475, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.17858887, + "step": 5689, + "time_per_iteration": 2.4941842555999756 + }, + { + "auxiliary_loss_clip": 0.06471369, + "auxiliary_loss_mlp": 0.01270545, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01254476, + "epoch": 0.34210130768074554, + "flos": 15200192787840.0, + "grad_norm": 1.8413075326603192, + "language_loss": 0.74004579, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81746483, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.1607666, + "step": 5690, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.06470567, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06288044, + "balance_loss_mlp": 0.01254854, + "epoch": 0.3421614309334135, + "flos": 24943147735680.0, + "grad_norm": 2.8439157619722666, + "language_loss": 0.76563686, + "learning_rate": 3.061955178104237e-06, + "loss": 0.84305, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15893555, + "step": 5691, + "time_per_iteration": 2.5346477031707764 + }, + { + "auxiliary_loss_clip": 0.06465675, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06286939, + "balance_loss_mlp": 0.01254395, + "epoch": 0.34222155418608147, + "flos": 21915170340480.0, + "grad_norm": 1.7269103068173344, + "language_loss": 0.6888957, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1472168, + "step": 5692, + "time_per_iteration": 2.544475793838501 + }, + { + "auxiliary_loss_clip": 0.06469652, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06286649, + "balance_loss_mlp": 0.01259069, + "epoch": 0.34228167743874943, + "flos": 18119954234880.0, + "grad_norm": 2.5543870280075494, + "language_loss": 0.72691154, + "learning_rate": 3.06129504893632e-06, + "loss": 0.80436993, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.17126465, + "step": 5693, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.06469734, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06291726, + "balance_loss_mlp": 0.01253049, + "epoch": 0.3423418006914174, + "flos": 21295070599680.0, + "grad_norm": 1.6526919771326485, + "language_loss": 0.76433146, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.84170949, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15008545, + "step": 5694, + "time_per_iteration": 2.5759999752044678 + }, + { + "auxiliary_loss_clip": 0.06469683, + "auxiliary_loss_mlp": 0.01269733, + "balance_loss_clip": 0.06292015, + "balance_loss_mlp": 0.01254498, + "epoch": 0.34240192394408536, + "flos": 19828754075520.0, + "grad_norm": 1.7073290043069882, + "language_loss": 0.80359411, + "learning_rate": 3.060634758790747e-06, + "loss": 0.88098824, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15222168, + "step": 5695, + "time_per_iteration": 2.53019118309021 + }, + { + "auxiliary_loss_clip": 0.06473886, + "auxiliary_loss_mlp": 0.01274215, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01257335, + "epoch": 0.3424620471967533, + "flos": 24542498638080.0, + "grad_norm": 2.150928833794339, + "language_loss": 0.74189723, + "learning_rate": 3.060304553382635e-06, + "loss": 0.81937826, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16882324, + "step": 5696, + "time_per_iteration": 2.6046504974365234 + }, + { + "auxiliary_loss_clip": 0.06472932, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.062935, + "balance_loss_mlp": 0.0125786, + "epoch": 0.3425221704494213, + "flos": 25856057969280.0, + "grad_norm": 1.9268953245740004, + "language_loss": 0.71419311, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.79166162, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.16052246, + "step": 5697, + "time_per_iteration": 2.565295696258545 + }, + { + "auxiliary_loss_clip": 0.06469944, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06292768, + "balance_loss_mlp": 0.01254293, + "epoch": 0.34258229370208926, + "flos": 21546442448640.0, + "grad_norm": 1.77565898086167, + "language_loss": 0.82456839, + "learning_rate": 3.05964402195837e-06, + "loss": 0.90196872, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15795898, + "step": 5698, + "time_per_iteration": 2.636547327041626 + }, + { + "auxiliary_loss_clip": 0.06476933, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06293021, + "balance_loss_mlp": 0.01260573, + "epoch": 0.3426424169547573, + "flos": 23658407009280.0, + "grad_norm": 1.9460205950694964, + "language_loss": 0.69722092, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.77476966, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.17358398, + "step": 5699, + "time_per_iteration": 2.523766040802002 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.0127405, + "balance_loss_clip": 0.06289239, + "balance_loss_mlp": 0.01257719, + "epoch": 0.34270254020742524, + "flos": 24651846616320.0, + "grad_norm": 2.105384484263751, + "language_loss": 0.72511256, + "learning_rate": 3.058983329806877e-06, + "loss": 0.80255234, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 5700, + "time_per_iteration": 2.57511568069458 + }, + { + "auxiliary_loss_clip": 0.06467311, + "auxiliary_loss_mlp": 0.01271093, + "balance_loss_clip": 0.06288276, + "balance_loss_mlp": 0.01254273, + "epoch": 0.3427626634600932, + "flos": 21003182501760.0, + "grad_norm": 2.114283139984186, + "language_loss": 0.82378924, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.90117323, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.16821289, + "step": 5701, + "time_per_iteration": 2.496392250061035 + }, + { + "auxiliary_loss_clip": 0.06469429, + "auxiliary_loss_mlp": 0.0127326, + "balance_loss_clip": 0.06287375, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3428227867127612, + "flos": 21440155144320.0, + "grad_norm": 1.6330699344557849, + "language_loss": 0.71898985, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.79641676, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16821289, + "step": 5702, + "time_per_iteration": 2.566856861114502 + }, + { + "auxiliary_loss_clip": 0.06377172, + "auxiliary_loss_mlp": 0.01259818, + "balance_loss_clip": 0.06290582, + "balance_loss_mlp": 0.01252552, + "epoch": 0.34288290996542914, + "flos": 55750219902720.0, + "grad_norm": 0.7671857510805999, + "language_loss": 0.56708395, + "learning_rate": 3.057991990435309e-06, + "loss": 0.64345384, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.07244873, + "step": 5703, + "time_per_iteration": 4.447732925415039 + }, + { + "auxiliary_loss_clip": 0.06465772, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06283242, + "balance_loss_mlp": 0.01255207, + "epoch": 0.3429430332180971, + "flos": 20162961285120.0, + "grad_norm": 1.88810633796735, + "language_loss": 0.74954486, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82692933, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.17468262, + "step": 5704, + "time_per_iteration": 4.062070608139038 + }, + { + "auxiliary_loss_clip": 0.06463447, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259716, + "epoch": 0.34300315647076507, + "flos": 17971347818880.0, + "grad_norm": 2.0890845856962565, + "language_loss": 0.73438597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.81177545, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15795898, + "step": 5705, + "time_per_iteration": 2.5125277042388916 + }, + { + "auxiliary_loss_clip": 0.06466857, + "auxiliary_loss_mlp": 0.01271633, + "balance_loss_clip": 0.0628458, + "balance_loss_mlp": 0.01255194, + "epoch": 0.34306327972343303, + "flos": 22092679215360.0, + "grad_norm": 2.3658652894382075, + "language_loss": 0.80144984, + "learning_rate": 3.057000289991289e-06, + "loss": 0.87883472, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16430664, + "step": 5706, + "time_per_iteration": 2.524531364440918 + }, + { + "auxiliary_loss_clip": 0.06468605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06282079, + "balance_loss_mlp": 0.0125493, + "epoch": 0.343123402976101, + "flos": 18448669002240.0, + "grad_norm": 1.9272208577124825, + "language_loss": 0.83210528, + "learning_rate": 3.056669642996787e-06, + "loss": 0.90951264, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17199707, + "step": 5707, + "time_per_iteration": 4.017935514450073 + }, + { + "auxiliary_loss_clip": 0.06464301, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06283538, + "balance_loss_mlp": 0.01259544, + "epoch": 0.34318352622876896, + "flos": 17169127228800.0, + "grad_norm": 1.5274992455100316, + "language_loss": 0.74774885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16442871, + "step": 5708, + "time_per_iteration": 2.6189568042755127 + }, + { + "auxiliary_loss_clip": 0.06460952, + "auxiliary_loss_mlp": 0.01273078, + "balance_loss_clip": 0.06282704, + "balance_loss_mlp": 0.01256365, + "epoch": 0.34324364948143693, + "flos": 26695482572160.0, + "grad_norm": 1.5717787719434457, + "language_loss": 0.80904007, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88638043, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.16711426, + "step": 5709, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06471742, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 0.06286193, + "balance_loss_mlp": 0.01260685, + "epoch": 0.3433037727341049, + "flos": 21257950440960.0, + "grad_norm": 2.571520261591023, + "language_loss": 0.79460347, + "learning_rate": 3.055677461649329e-06, + "loss": 0.87212193, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.1940918, + "step": 5710, + "time_per_iteration": 2.5515291690826416 + }, + { + "auxiliary_loss_clip": 0.06468266, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 0.06282788, + "balance_loss_mlp": 0.01254334, + "epoch": 0.34336389598677286, + "flos": 20635377004800.0, + "grad_norm": 1.916674758610419, + "language_loss": 0.70532334, + "learning_rate": 3.055346654453996e-06, + "loss": 0.78272408, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.17468262, + "step": 5711, + "time_per_iteration": 3.958890914916992 + }, + { + "auxiliary_loss_clip": 0.06467056, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 0.0628437, + "balance_loss_mlp": 0.01256909, + "epoch": 0.3434240192394409, + "flos": 14543895283200.0, + "grad_norm": 2.810027228242578, + "language_loss": 0.67786914, + "learning_rate": 3.055015807239812e-06, + "loss": 0.75527865, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16992188, + "step": 5712, + "time_per_iteration": 2.4752726554870605 + }, + { + "auxiliary_loss_clip": 0.06366295, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01254685, + "epoch": 0.34348414249210885, + "flos": 58067799183360.0, + "grad_norm": 0.8383081559544242, + "language_loss": 0.58214718, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.65843868, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.08172607, + "step": 5713, + "time_per_iteration": 3.11580491065979 + }, + { + "auxiliary_loss_clip": 0.06465655, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06281169, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3435442657447768, + "flos": 20710749352320.0, + "grad_norm": 1.8141637433077298, + "language_loss": 0.81045675, + "learning_rate": 3.054353992805076e-06, + "loss": 0.88785917, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.17272949, + "step": 5714, + "time_per_iteration": 2.510929822921753 + }, + { + "auxiliary_loss_clip": 0.0646632, + "auxiliary_loss_mlp": 0.01276019, + "balance_loss_clip": 0.06283875, + "balance_loss_mlp": 0.01260045, + "epoch": 0.3436043889974448, + "flos": 22936967354880.0, + "grad_norm": 2.602776673257047, + "language_loss": 0.72001171, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79743505, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15991211, + "step": 5715, + "time_per_iteration": 2.5680224895477295 + }, + { + "auxiliary_loss_clip": 0.06365244, + "auxiliary_loss_mlp": 0.01259148, + "balance_loss_clip": 0.06280053, + "balance_loss_mlp": 0.01251191, + "epoch": 0.34366451225011274, + "flos": 58423514964480.0, + "grad_norm": 0.8879413605742031, + "language_loss": 0.65628481, + "learning_rate": 3.053692018445505e-06, + "loss": 0.73252875, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.07952881, + "step": 5716, + "time_per_iteration": 3.184952735900879 + }, + { + "auxiliary_loss_clip": 0.06463662, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.0628469, + "balance_loss_mlp": 0.01264509, + "epoch": 0.3437246355027807, + "flos": 15601722353280.0, + "grad_norm": 1.9800950186090778, + "language_loss": 0.74289393, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15252686, + "step": 5717, + "time_per_iteration": 2.5220494270324707 + }, + { + "auxiliary_loss_clip": 0.06466433, + "auxiliary_loss_mlp": 0.01278824, + "balance_loss_clip": 0.0628383, + "balance_loss_mlp": 0.01262946, + "epoch": 0.34378475875544867, + "flos": 27679572449280.0, + "grad_norm": 1.8348085520910409, + "language_loss": 0.75694019, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.83439279, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15869141, + "step": 5718, + "time_per_iteration": 2.5983147621154785 + }, + { + "auxiliary_loss_clip": 0.06468937, + "auxiliary_loss_mlp": 0.01273829, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01256829, + "epoch": 0.34384488200811664, + "flos": 31439638967040.0, + "grad_norm": 1.8816683210791167, + "language_loss": 0.6437763, + "learning_rate": 3.052698757266734e-06, + "loss": 0.72120392, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.17004395, + "step": 5719, + "time_per_iteration": 2.7075517177581787 + }, + { + "auxiliary_loss_clip": 0.06472047, + "auxiliary_loss_mlp": 0.0127673, + "balance_loss_clip": 0.06285335, + "balance_loss_mlp": 0.012596, + "epoch": 0.3439050052607846, + "flos": 24906866117760.0, + "grad_norm": 1.6709560385881974, + "language_loss": 0.73730874, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.81479651, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.17150879, + "step": 5720, + "time_per_iteration": 2.5936295986175537 + }, + { + "auxiliary_loss_clip": 0.06469208, + "auxiliary_loss_mlp": 0.01280833, + "balance_loss_clip": 0.06286804, + "balance_loss_mlp": 0.01264072, + "epoch": 0.34396512851345257, + "flos": 18155900436480.0, + "grad_norm": 1.8909667336437188, + "language_loss": 0.74550021, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82300061, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16748047, + "step": 5721, + "time_per_iteration": 2.5109763145446777 + }, + { + "auxiliary_loss_clip": 0.06468637, + "auxiliary_loss_mlp": 0.01276688, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.01260208, + "epoch": 0.34402525176612053, + "flos": 16039994734080.0, + "grad_norm": 3.7669546448597497, + "language_loss": 0.80102623, + "learning_rate": 3.051705136821992e-06, + "loss": 0.87847948, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 5722, + "time_per_iteration": 2.5231471061706543 + }, + { + "auxiliary_loss_clip": 0.06467631, + "auxiliary_loss_mlp": 0.01281232, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01265806, + "epoch": 0.3440853750187885, + "flos": 21185009861760.0, + "grad_norm": 1.9591310013999468, + "language_loss": 0.82034022, + "learning_rate": 3.051373850228801e-06, + "loss": 0.89782888, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.1541748, + "step": 5723, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.06471531, + "auxiliary_loss_mlp": 0.01281521, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.0126588, + "epoch": 0.34414549827145646, + "flos": 12682883301120.0, + "grad_norm": 1.867182825140108, + "language_loss": 0.8172524, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8947829, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15661621, + "step": 5724, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.06476942, + "auxiliary_loss_mlp": 0.01281282, + "balance_loss_clip": 0.06292838, + "balance_loss_mlp": 0.01265237, + "epoch": 0.3442056215241244, + "flos": 31292458070400.0, + "grad_norm": 1.852126712281853, + "language_loss": 0.69186389, + "learning_rate": 3.05071115745038e-06, + "loss": 0.76944625, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.16040039, + "step": 5725, + "time_per_iteration": 2.6253697872161865 + }, + { + "auxiliary_loss_clip": 0.06482734, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06293113, + "balance_loss_mlp": 0.01266462, + "epoch": 0.34426574477679245, + "flos": 23373939997440.0, + "grad_norm": 1.5373453518160676, + "language_loss": 0.69532049, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.77299035, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.17785645, + "step": 5726, + "time_per_iteration": 2.5495173931121826 + }, + { + "auxiliary_loss_clip": 0.06477433, + "auxiliary_loss_mlp": 0.01281684, + "balance_loss_clip": 0.06292193, + "balance_loss_mlp": 0.01265948, + "epoch": 0.3443258680294604, + "flos": 24542372856960.0, + "grad_norm": 3.3735616171284453, + "language_loss": 0.73631704, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81390822, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15734863, + "step": 5727, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.06474276, + "auxiliary_loss_mlp": 0.01274594, + "balance_loss_clip": 0.06292102, + "balance_loss_mlp": 0.01259049, + "epoch": 0.3443859912821284, + "flos": 20236363061760.0, + "grad_norm": 1.756953821036591, + "language_loss": 0.88303459, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.96052337, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15551758, + "step": 5728, + "time_per_iteration": 2.5943620204925537 + }, + { + "auxiliary_loss_clip": 0.06472028, + "auxiliary_loss_mlp": 0.01275786, + "balance_loss_clip": 0.06289984, + "balance_loss_mlp": 0.01259382, + "epoch": 0.34444611453479634, + "flos": 24323425338240.0, + "grad_norm": 1.9801243778486481, + "language_loss": 0.70532095, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.78279907, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.1640625, + "step": 5729, + "time_per_iteration": 2.5122504234313965 + }, + { + "auxiliary_loss_clip": 0.06472413, + "auxiliary_loss_mlp": 0.01278834, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01263123, + "epoch": 0.3445062377874643, + "flos": 16989186585600.0, + "grad_norm": 2.065738946159642, + "language_loss": 0.74902749, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.82653993, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15710449, + "step": 5730, + "time_per_iteration": 2.4971024990081787 + }, + { + "auxiliary_loss_clip": 0.06477457, + "auxiliary_loss_mlp": 0.01272788, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.01256921, + "epoch": 0.3445663610401323, + "flos": 20308884370560.0, + "grad_norm": 2.25692333978076, + "language_loss": 0.79881716, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87631959, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.15869141, + "step": 5731, + "time_per_iteration": 2.5055606365203857 + }, + { + "auxiliary_loss_clip": 0.0647382, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06289574, + "balance_loss_mlp": 0.01256532, + "epoch": 0.34462648429280024, + "flos": 15893568524160.0, + "grad_norm": 2.0529883798711586, + "language_loss": 0.78536034, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.86281943, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15545654, + "step": 5732, + "time_per_iteration": 2.58428692817688 + }, + { + "auxiliary_loss_clip": 0.06393671, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06309536, + "balance_loss_mlp": 0.01263571, + "epoch": 0.3446866075454682, + "flos": 59330681193600.0, + "grad_norm": 0.7296400398421587, + "language_loss": 0.53166986, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.60830045, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.05813599, + "step": 5733, + "time_per_iteration": 3.1921679973602295 + }, + { + "auxiliary_loss_clip": 0.06473544, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 0.06292105, + "balance_loss_mlp": 0.01259248, + "epoch": 0.34474673079813617, + "flos": 22349962776960.0, + "grad_norm": 1.6143563972241732, + "language_loss": 0.83787543, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91536903, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16564941, + "step": 5734, + "time_per_iteration": 2.5630810260772705 + }, + { + "auxiliary_loss_clip": 0.06472072, + "auxiliary_loss_mlp": 0.01278915, + "balance_loss_clip": 0.0628967, + "balance_loss_mlp": 0.01262834, + "epoch": 0.34480685405080413, + "flos": 27677098753920.0, + "grad_norm": 1.7144738343554842, + "language_loss": 0.93389094, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.01140082, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.1607666, + "step": 5735, + "time_per_iteration": 2.5621798038482666 + }, + { + "auxiliary_loss_clip": 0.06471383, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01259273, + "epoch": 0.3448669773034721, + "flos": 22462664918400.0, + "grad_norm": 1.7840822264419087, + "language_loss": 0.77095437, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84843659, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.17578125, + "step": 5736, + "time_per_iteration": 2.5377349853515625 + }, + { + "auxiliary_loss_clip": 0.06471781, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01255326, + "epoch": 0.34492710055614006, + "flos": 24943105808640.0, + "grad_norm": 1.6287034776462515, + "language_loss": 0.79113513, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.86855936, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15319824, + "step": 5737, + "time_per_iteration": 2.5471904277801514 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06285917, + "balance_loss_mlp": 0.0125976, + "epoch": 0.34498722380880803, + "flos": 20127057010560.0, + "grad_norm": 2.191814396638409, + "language_loss": 0.72072059, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.79821849, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16723633, + "step": 5738, + "time_per_iteration": 2.540442943572998 + }, + { + "auxiliary_loss_clip": 0.06471272, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01255821, + "epoch": 0.34504734706147605, + "flos": 28445511421440.0, + "grad_norm": 1.9413212194180998, + "language_loss": 0.82238245, + "learning_rate": 3.046067851209389e-06, + "loss": 0.89982325, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16992188, + "step": 5739, + "time_per_iteration": 2.57327938079834 + }, + { + "auxiliary_loss_clip": 0.06469989, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06284826, + "balance_loss_mlp": 0.01261862, + "epoch": 0.345107470314144, + "flos": 22681067385600.0, + "grad_norm": 1.914547064909644, + "language_loss": 0.83564734, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.91313767, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.171875, + "step": 5740, + "time_per_iteration": 2.5514895915985107 + }, + { + "auxiliary_loss_clip": 0.06466584, + "auxiliary_loss_mlp": 0.01275646, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01258921, + "epoch": 0.345167593566812, + "flos": 20636886378240.0, + "grad_norm": 2.1474795597791734, + "language_loss": 0.76802379, + "learning_rate": 3.045403886269181e-06, + "loss": 0.84544611, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16723633, + "step": 5741, + "time_per_iteration": 2.511997699737549 + }, + { + "auxiliary_loss_clip": 0.06466299, + "auxiliary_loss_mlp": 0.0127053, + "balance_loss_clip": 0.06279384, + "balance_loss_mlp": 0.01254544, + "epoch": 0.34522771681947995, + "flos": 26221683260160.0, + "grad_norm": 1.6006732343467382, + "language_loss": 0.77803171, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85540009, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15966797, + "step": 5742, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06281982, + "balance_loss_mlp": 0.01256074, + "epoch": 0.3452878400721479, + "flos": 19068349472640.0, + "grad_norm": 2.2544306863162538, + "language_loss": 0.76459014, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.84196126, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16821289, + "step": 5743, + "time_per_iteration": 3.996267557144165 + }, + { + "auxiliary_loss_clip": 0.06462429, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06281956, + "balance_loss_mlp": 0.01255118, + "epoch": 0.3453479633248159, + "flos": 27937442989440.0, + "grad_norm": 1.578255214465821, + "language_loss": 0.7080915, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.78541422, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14727783, + "step": 5744, + "time_per_iteration": 2.5594234466552734 + }, + { + "auxiliary_loss_clip": 0.06462625, + "auxiliary_loss_mlp": 0.01272389, + "balance_loss_clip": 0.0628416, + "balance_loss_mlp": 0.01256523, + "epoch": 0.34540808657748384, + "flos": 19611609419520.0, + "grad_norm": 1.8945383960499247, + "language_loss": 0.79877782, + "learning_rate": 3.044075480787665e-06, + "loss": 0.87612802, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15881348, + "step": 5745, + "time_per_iteration": 2.5577902793884277 + }, + { + "auxiliary_loss_clip": 0.0646376, + "auxiliary_loss_mlp": 0.0127446, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01258343, + "epoch": 0.3454682098301518, + "flos": 20417771151360.0, + "grad_norm": 2.2215207406176063, + "language_loss": 0.90027881, + "learning_rate": 3.043743280407182e-06, + "loss": 0.97766101, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16113281, + "step": 5746, + "time_per_iteration": 4.126953840255737 + }, + { + "auxiliary_loss_clip": 0.06469168, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01254648, + "epoch": 0.34552833308281977, + "flos": 21331603779840.0, + "grad_norm": 1.8420175913064167, + "language_loss": 0.65233189, + "learning_rate": 3.043411040447849e-06, + "loss": 0.72973943, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16931152, + "step": 5747, + "time_per_iteration": 2.6445960998535156 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01274425, + "balance_loss_clip": 0.06279166, + "balance_loss_mlp": 0.01259166, + "epoch": 0.34558845633548774, + "flos": 36251914331520.0, + "grad_norm": 1.6152983170909512, + "language_loss": 0.72912234, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15246582, + "step": 5748, + "time_per_iteration": 2.668628692626953 + }, + { + "auxiliary_loss_clip": 0.0646018, + "auxiliary_loss_mlp": 0.01271906, + "balance_loss_clip": 0.06281725, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3456485795881557, + "flos": 22456292008320.0, + "grad_norm": 2.139365243179929, + "language_loss": 0.75935584, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83667672, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.14648438, + "step": 5749, + "time_per_iteration": 2.533357620239258 + }, + { + "auxiliary_loss_clip": 0.06372777, + "auxiliary_loss_mlp": 0.01259534, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.0125392, + "epoch": 0.34570870284082367, + "flos": 62023277422080.0, + "grad_norm": 0.8741398929973155, + "language_loss": 0.62861037, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.70493352, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.05612183, + "step": 5750, + "time_per_iteration": 4.42021369934082 + }, + { + "auxiliary_loss_clip": 0.06455849, + "auxiliary_loss_mlp": 0.0126761, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01253383, + "epoch": 0.34576882609349163, + "flos": 22788528647040.0, + "grad_norm": 2.5604939014714043, + "language_loss": 0.80745482, + "learning_rate": 3.042081685074012e-06, + "loss": 0.88468945, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14227295, + "step": 5751, + "time_per_iteration": 2.610229730606079 + }, + { + "auxiliary_loss_clip": 0.06461278, + "auxiliary_loss_mlp": 0.01273124, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.01258199, + "epoch": 0.34582894934615965, + "flos": 12353665409280.0, + "grad_norm": 2.333174149642167, + "language_loss": 0.85112172, + "learning_rate": 3.041749247409439e-06, + "loss": 0.92846578, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14904785, + "step": 5752, + "time_per_iteration": 2.49895977973938 + }, + { + "auxiliary_loss_clip": 0.06379203, + "auxiliary_loss_mlp": 0.01260282, + "balance_loss_clip": 0.06296635, + "balance_loss_mlp": 0.01254092, + "epoch": 0.3458890725988276, + "flos": 70186459017600.0, + "grad_norm": 0.7233537791569425, + "language_loss": 0.63163221, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.70802706, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06185913, + "step": 5753, + "time_per_iteration": 3.0605263710021973 + }, + { + "auxiliary_loss_clip": 0.06463367, + "auxiliary_loss_mlp": 0.01274407, + "balance_loss_clip": 0.06282756, + "balance_loss_mlp": 0.01258498, + "epoch": 0.3459491958514956, + "flos": 17098324928640.0, + "grad_norm": 2.0282181813946116, + "language_loss": 0.71483171, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.79220951, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15899658, + "step": 5754, + "time_per_iteration": 2.499213457107544 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01271896, + "balance_loss_clip": 0.06282809, + "balance_loss_mlp": 0.01255898, + "epoch": 0.34600931910416355, + "flos": 16655985624960.0, + "grad_norm": 2.0834630321372534, + "language_loss": 0.7328862, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.81031251, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15979004, + "step": 5755, + "time_per_iteration": 2.540292263031006 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.01272619, + "balance_loss_clip": 0.06280342, + "balance_loss_mlp": 0.01257801, + "epoch": 0.3460694423568315, + "flos": 38555517179520.0, + "grad_norm": 1.432388080922509, + "language_loss": 0.7255426, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80286932, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14813232, + "step": 5756, + "time_per_iteration": 2.679203510284424 + }, + { + "auxiliary_loss_clip": 0.06371044, + "auxiliary_loss_mlp": 0.01257585, + "balance_loss_clip": 0.06288835, + "balance_loss_mlp": 0.01251058, + "epoch": 0.3461295656094995, + "flos": 72103332545280.0, + "grad_norm": 0.6902951700774806, + "language_loss": 0.62318385, + "learning_rate": 3.040086466790207e-06, + "loss": 0.69947016, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.06536865, + "step": 5757, + "time_per_iteration": 3.209688901901245 + }, + { + "auxiliary_loss_clip": 0.06363717, + "auxiliary_loss_mlp": 0.01259824, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01253244, + "epoch": 0.34618968886216744, + "flos": 65477913408000.0, + "grad_norm": 0.8114970964410039, + "language_loss": 0.59130025, + "learning_rate": 3.039753792295362e-06, + "loss": 0.66753566, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06591797, + "step": 5758, + "time_per_iteration": 3.139495372772217 + }, + { + "auxiliary_loss_clip": 0.06467785, + "auxiliary_loss_mlp": 0.01274731, + "balance_loss_clip": 0.06288655, + "balance_loss_mlp": 0.01259747, + "epoch": 0.3462498121148354, + "flos": 23478508293120.0, + "grad_norm": 1.7665020183034759, + "language_loss": 0.72321635, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.80064148, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5759, + "time_per_iteration": 2.575479745864868 + }, + { + "auxiliary_loss_clip": 0.06456805, + "auxiliary_loss_mlp": 0.01274415, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01258632, + "epoch": 0.3463099353675034, + "flos": 24177711888000.0, + "grad_norm": 1.8760422141660649, + "language_loss": 0.83568478, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.91299695, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15771484, + "step": 5760, + "time_per_iteration": 2.5610272884368896 + }, + { + "auxiliary_loss_clip": 0.06358143, + "auxiliary_loss_mlp": 0.01257449, + "balance_loss_clip": 0.06276596, + "balance_loss_mlp": 0.0125125, + "epoch": 0.34637005862017134, + "flos": 63716773893120.0, + "grad_norm": 0.8043642187655193, + "language_loss": 0.56576806, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.64192402, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.06195068, + "step": 5761, + "time_per_iteration": 3.2343695163726807 + }, + { + "auxiliary_loss_clip": 0.06453449, + "auxiliary_loss_mlp": 0.01270941, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01256457, + "epoch": 0.3464301818728393, + "flos": 13149513089280.0, + "grad_norm": 1.936786863895872, + "language_loss": 0.9549523, + "learning_rate": 3.038422700166474e-06, + "loss": 1.03219616, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14477539, + "step": 5762, + "time_per_iteration": 2.496039390563965 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01276759, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01260928, + "epoch": 0.34649030512550727, + "flos": 29322936650880.0, + "grad_norm": 1.870020160295256, + "language_loss": 0.69913763, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77657849, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.15820312, + "step": 5763, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.06466965, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06278971, + "balance_loss_mlp": 0.01253922, + "epoch": 0.34655042837817523, + "flos": 23737385082240.0, + "grad_norm": 1.7922805842181977, + "language_loss": 0.83863467, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.9160139, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17028809, + "step": 5764, + "time_per_iteration": 2.634692668914795 + }, + { + "auxiliary_loss_clip": 0.06459094, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06279744, + "balance_loss_mlp": 0.01259263, + "epoch": 0.34661055163084326, + "flos": 22060716082560.0, + "grad_norm": 2.9007104109569943, + "language_loss": 0.67647815, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.75381392, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15234375, + "step": 5765, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.06460512, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06280708, + "balance_loss_mlp": 0.01262233, + "epoch": 0.3466706748835112, + "flos": 21805738508160.0, + "grad_norm": 3.5961884004183426, + "language_loss": 0.77947313, + "learning_rate": 3.03709097800413e-06, + "loss": 0.85684741, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.14709473, + "step": 5766, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06460432, + "auxiliary_loss_mlp": 0.01274096, + "balance_loss_clip": 0.06278767, + "balance_loss_mlp": 0.01260614, + "epoch": 0.3467307981361792, + "flos": 19467405342720.0, + "grad_norm": 1.5497773141022704, + "language_loss": 0.73886019, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.8162055, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.13500977, + "step": 5767, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.06461183, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06279645, + "balance_loss_mlp": 0.01260107, + "epoch": 0.34679092138884715, + "flos": 24834470590080.0, + "grad_norm": 2.0350854996297696, + "language_loss": 0.78955162, + "learning_rate": 3.036424880912893e-06, + "loss": 0.86692369, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15917969, + "step": 5768, + "time_per_iteration": 2.5747995376586914 + }, + { + "auxiliary_loss_clip": 0.06369781, + "auxiliary_loss_mlp": 0.01257254, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.01251723, + "epoch": 0.3468510446415151, + "flos": 63253791757440.0, + "grad_norm": 0.7431238132649503, + "language_loss": 0.57319033, + "learning_rate": 3.036091773408956e-06, + "loss": 0.64946061, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.05535889, + "step": 5769, + "time_per_iteration": 3.176074981689453 + }, + { + "auxiliary_loss_clip": 0.06479758, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 0.06285711, + "balance_loss_mlp": 0.01260212, + "epoch": 0.3469111678941831, + "flos": 12123984568320.0, + "grad_norm": 2.4016361546378158, + "language_loss": 0.85419703, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.93176699, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5770, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.06372644, + "auxiliary_loss_mlp": 0.01258777, + "balance_loss_clip": 0.0629043, + "balance_loss_mlp": 0.01253087, + "epoch": 0.34697129114685105, + "flos": 65951964282240.0, + "grad_norm": 0.7493725348793998, + "language_loss": 0.59862447, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.67493868, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.05685425, + "step": 5771, + "time_per_iteration": 2.938957691192627 + }, + { + "auxiliary_loss_clip": 0.0646434, + "auxiliary_loss_mlp": 0.012787, + "balance_loss_clip": 0.06282143, + "balance_loss_mlp": 0.01263572, + "epoch": 0.347031414399519, + "flos": 34461914284800.0, + "grad_norm": 1.9396999801577832, + "language_loss": 0.72527683, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80270731, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15136719, + "step": 5772, + "time_per_iteration": 2.6529078483581543 + }, + { + "auxiliary_loss_clip": 0.06462972, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06281382, + "balance_loss_mlp": 0.01256246, + "epoch": 0.347091537652187, + "flos": 26951592176640.0, + "grad_norm": 1.5709710398058576, + "language_loss": 0.76695967, + "learning_rate": 3.034758950632507e-06, + "loss": 0.84431112, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15924072, + "step": 5773, + "time_per_iteration": 2.5785317420959473 + }, + { + "auxiliary_loss_clip": 0.06466497, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.06280655, + "balance_loss_mlp": 0.01255366, + "epoch": 0.34715166090485494, + "flos": 21148602462720.0, + "grad_norm": 2.4326309651076463, + "language_loss": 0.70796078, + "learning_rate": 3.034425646811396e-06, + "loss": 0.78533834, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15893555, + "step": 5774, + "time_per_iteration": 2.5585873126983643 + }, + { + "auxiliary_loss_clip": 0.06458526, + "auxiliary_loss_mlp": 0.01271942, + "balance_loss_clip": 0.06278332, + "balance_loss_mlp": 0.01256707, + "epoch": 0.3472117841575229, + "flos": 23484881203200.0, + "grad_norm": 2.2084812675777474, + "language_loss": 0.76485682, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.84216148, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15228271, + "step": 5775, + "time_per_iteration": 2.5899477005004883 + }, + { + "auxiliary_loss_clip": 0.06472419, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281743, + "balance_loss_mlp": 0.01255163, + "epoch": 0.34727190741019087, + "flos": 17498428974720.0, + "grad_norm": 2.2070819655775282, + "language_loss": 0.7869916, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.86442757, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16009521, + "step": 5776, + "time_per_iteration": 2.5874037742614746 + }, + { + "auxiliary_loss_clip": 0.0636313, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06280468, + "balance_loss_mlp": 0.0125983, + "epoch": 0.34733203066285884, + "flos": 65287350495360.0, + "grad_norm": 0.8333293277096808, + "language_loss": 0.63448966, + "learning_rate": 3.033425500045478e-06, + "loss": 0.710774, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.05477905, + "step": 5777, + "time_per_iteration": 3.168325185775757 + }, + { + "auxiliary_loss_clip": 0.0646584, + "auxiliary_loss_mlp": 0.01270867, + "balance_loss_clip": 0.06279471, + "balance_loss_mlp": 0.01255048, + "epoch": 0.3473921539155268, + "flos": 28666429511040.0, + "grad_norm": 3.258496862714712, + "language_loss": 0.65075529, + "learning_rate": 3.033092039398119e-06, + "loss": 0.72812235, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15808105, + "step": 5778, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.06467149, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 0.06278305, + "balance_loss_mlp": 0.0125633, + "epoch": 0.3474522771681948, + "flos": 40845284104320.0, + "grad_norm": 1.7195764072446118, + "language_loss": 0.722601, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.79998595, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.15008545, + "step": 5779, + "time_per_iteration": 2.6901330947875977 + }, + { + "auxiliary_loss_clip": 0.06474127, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06282836, + "balance_loss_mlp": 0.01259092, + "epoch": 0.3475124004208628, + "flos": 24615564998400.0, + "grad_norm": 2.601451729132101, + "language_loss": 0.62399209, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.70149052, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.1663208, + "step": 5780, + "time_per_iteration": 2.5493476390838623 + }, + { + "auxiliary_loss_clip": 0.0647147, + "auxiliary_loss_mlp": 0.01271785, + "balance_loss_clip": 0.06285025, + "balance_loss_mlp": 0.01256264, + "epoch": 0.34757252367353075, + "flos": 22717977909120.0, + "grad_norm": 3.4183593986527043, + "language_loss": 0.72164977, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.79908228, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.15515137, + "step": 5781, + "time_per_iteration": 2.610198974609375 + }, + { + "auxiliary_loss_clip": 0.06471756, + "auxiliary_loss_mlp": 0.01273476, + "balance_loss_clip": 0.06282213, + "balance_loss_mlp": 0.01257228, + "epoch": 0.3476326469261987, + "flos": 19834246517760.0, + "grad_norm": 2.4264406265191325, + "language_loss": 0.77686667, + "learning_rate": 3.031757805185612e-06, + "loss": 0.85431898, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16235352, + "step": 5782, + "time_per_iteration": 3.918602705001831 + }, + { + "auxiliary_loss_clip": 0.06470296, + "auxiliary_loss_mlp": 0.01277549, + "balance_loss_clip": 0.0628626, + "balance_loss_mlp": 0.01262695, + "epoch": 0.3476927701788667, + "flos": 19944265328640.0, + "grad_norm": 2.639685157679876, + "language_loss": 0.63410383, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7115823, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14855957, + "step": 5783, + "time_per_iteration": 4.021190881729126 + }, + { + "auxiliary_loss_clip": 0.06469369, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 0.06290524, + "balance_loss_mlp": 0.01264121, + "epoch": 0.34775289343153465, + "flos": 20740448424960.0, + "grad_norm": 1.686879732071426, + "language_loss": 0.89054763, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9680202, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 5784, + "time_per_iteration": 2.553847074508667 + }, + { + "auxiliary_loss_clip": 0.06470798, + "auxiliary_loss_mlp": 0.01275566, + "balance_loss_clip": 0.06289466, + "balance_loss_mlp": 0.01260903, + "epoch": 0.3478130166842026, + "flos": 19360992257280.0, + "grad_norm": 1.643062521609265, + "language_loss": 0.82068878, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89815247, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.14672852, + "step": 5785, + "time_per_iteration": 2.5452024936676025 + }, + { + "auxiliary_loss_clip": 0.06472684, + "auxiliary_loss_mlp": 0.01281071, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01267004, + "epoch": 0.3478731399368706, + "flos": 22057194211200.0, + "grad_norm": 1.6654216237849466, + "language_loss": 0.80731958, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.88485718, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.14074707, + "step": 5786, + "time_per_iteration": 4.040801286697388 + }, + { + "auxiliary_loss_clip": 0.06468868, + "auxiliary_loss_mlp": 0.01275893, + "balance_loss_clip": 0.06289011, + "balance_loss_mlp": 0.01260515, + "epoch": 0.34793326318953854, + "flos": 18047390999040.0, + "grad_norm": 1.5833193798509506, + "language_loss": 0.75743961, + "learning_rate": 3.030089132216836e-06, + "loss": 0.83488721, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15368652, + "step": 5787, + "time_per_iteration": 2.5231845378875732 + }, + { + "auxiliary_loss_clip": 0.06470607, + "auxiliary_loss_mlp": 0.01273428, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01259111, + "epoch": 0.3479933864422065, + "flos": 29322349672320.0, + "grad_norm": 1.5447805606313796, + "language_loss": 0.81661141, + "learning_rate": 3.029755280389203e-06, + "loss": 0.89405167, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14312744, + "step": 5788, + "time_per_iteration": 2.5828304290771484 + }, + { + "auxiliary_loss_clip": 0.064804, + "auxiliary_loss_mlp": 0.01277805, + "balance_loss_clip": 0.06290662, + "balance_loss_mlp": 0.01261831, + "epoch": 0.3480535096948745, + "flos": 20126931229440.0, + "grad_norm": 1.9688082680528027, + "language_loss": 0.85984367, + "learning_rate": 3.029421389513147e-06, + "loss": 0.93742573, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.15979004, + "step": 5789, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127695, + "balance_loss_clip": 0.06292568, + "balance_loss_mlp": 0.0126178, + "epoch": 0.34811363294754244, + "flos": 18554453182080.0, + "grad_norm": 1.6869236803506542, + "language_loss": 0.84773821, + "learning_rate": 3.029087459601328e-06, + "loss": 0.92530012, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15185547, + "step": 5790, + "time_per_iteration": 3.942929983139038 + }, + { + "auxiliary_loss_clip": 0.06469919, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.0628828, + "balance_loss_mlp": 0.01259465, + "epoch": 0.3481737562002104, + "flos": 26877603421440.0, + "grad_norm": 1.9257745343225423, + "language_loss": 0.81410027, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.89154327, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14904785, + "step": 5791, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.06478444, + "auxiliary_loss_mlp": 0.01278573, + "balance_loss_clip": 0.0629065, + "balance_loss_mlp": 0.01263356, + "epoch": 0.3482338794528784, + "flos": 28915495372800.0, + "grad_norm": 1.656722788090249, + "language_loss": 0.78119808, + "learning_rate": 3.028419482721056e-06, + "loss": 0.85876822, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.15216064, + "step": 5792, + "time_per_iteration": 2.5784294605255127 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01255989, + "epoch": 0.3482940027055464, + "flos": 22207393854720.0, + "grad_norm": 1.5928062225109956, + "language_loss": 0.82187879, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89930081, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.14575195, + "step": 5793, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.06472721, + "auxiliary_loss_mlp": 0.01275633, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01259438, + "epoch": 0.34835412595821436, + "flos": 20308884370560.0, + "grad_norm": 1.8552979095996294, + "language_loss": 0.7616328, + "learning_rate": 3.027751349849706e-06, + "loss": 0.83911633, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.1618042, + "step": 5794, + "time_per_iteration": 2.548841953277588 + }, + { + "auxiliary_loss_clip": 0.06468202, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 0.06286102, + "balance_loss_mlp": 0.01262271, + "epoch": 0.3484142492108823, + "flos": 20456065267200.0, + "grad_norm": 2.5979910850639336, + "language_loss": 0.57406038, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65151387, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.14868164, + "step": 5795, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.06465806, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 0.06285395, + "balance_loss_mlp": 0.01257469, + "epoch": 0.3484743724635503, + "flos": 24359832737280.0, + "grad_norm": 1.8988060542741243, + "language_loss": 0.83093596, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90830439, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.13580322, + "step": 5796, + "time_per_iteration": 2.5901992321014404 + }, + { + "auxiliary_loss_clip": 0.06459932, + "auxiliary_loss_mlp": 0.01272067, + "balance_loss_clip": 0.06285086, + "balance_loss_mlp": 0.01258692, + "epoch": 0.34853449571621825, + "flos": 24359916591360.0, + "grad_norm": 1.6441838604480552, + "language_loss": 0.83544898, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.91276896, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13378906, + "step": 5797, + "time_per_iteration": 2.5595455169677734 + }, + { + "auxiliary_loss_clip": 0.06466283, + "auxiliary_loss_mlp": 0.01269705, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01256055, + "epoch": 0.3485946189688862, + "flos": 27274395231360.0, + "grad_norm": 1.5517160717894904, + "language_loss": 0.73727238, + "learning_rate": 3.026414616539167e-06, + "loss": 0.81463224, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13647461, + "step": 5798, + "time_per_iteration": 2.716830015182495 + }, + { + "auxiliary_loss_clip": 0.06466942, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.0125618, + "epoch": 0.3486547422215542, + "flos": 20162835504000.0, + "grad_norm": 1.8098383323780278, + "language_loss": 0.76806593, + "learning_rate": 3.026080335875485e-06, + "loss": 0.84544736, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15014648, + "step": 5799, + "time_per_iteration": 2.550356149673462 + }, + { + "auxiliary_loss_clip": 0.06464861, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06284796, + "balance_loss_mlp": 0.01253735, + "epoch": 0.34871486547422215, + "flos": 20236614624000.0, + "grad_norm": 2.6888551620055363, + "language_loss": 0.75880742, + "learning_rate": 3.025746016302734e-06, + "loss": 0.83612871, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 5800, + "time_per_iteration": 2.559406042098999 + }, + { + "auxiliary_loss_clip": 0.06468332, + "auxiliary_loss_mlp": 0.01272895, + "balance_loss_clip": 0.06284243, + "balance_loss_mlp": 0.01258375, + "epoch": 0.3487749887268901, + "flos": 44063096924160.0, + "grad_norm": 1.6752863637060063, + "language_loss": 0.67620414, + "learning_rate": 3.025411657833591e-06, + "loss": 0.75361645, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14538574, + "step": 5801, + "time_per_iteration": 2.7286293506622314 + }, + { + "auxiliary_loss_clip": 0.064619, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01253406, + "epoch": 0.3488351119795581, + "flos": 23301921813120.0, + "grad_norm": 1.7427843167651098, + "language_loss": 0.76900619, + "learning_rate": 3.025077260480735e-06, + "loss": 0.84630978, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15075684, + "step": 5802, + "time_per_iteration": 2.5632455348968506 + }, + { + "auxiliary_loss_clip": 0.0645422, + "auxiliary_loss_mlp": 0.01273067, + "balance_loss_clip": 0.06281535, + "balance_loss_mlp": 0.01260109, + "epoch": 0.34889523523222604, + "flos": 19940449968000.0, + "grad_norm": 1.7168444943641856, + "language_loss": 0.79347479, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87074769, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12957764, + "step": 5803, + "time_per_iteration": 2.5202274322509766 + }, + { + "auxiliary_loss_clip": 0.06462935, + "auxiliary_loss_mlp": 0.01269017, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01255212, + "epoch": 0.348955358484894, + "flos": 30454123570560.0, + "grad_norm": 2.672940484210586, + "language_loss": 0.67680007, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.75411958, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.13812256, + "step": 5804, + "time_per_iteration": 2.636371374130249 + }, + { + "auxiliary_loss_clip": 0.06455779, + "auxiliary_loss_mlp": 0.01267233, + "balance_loss_clip": 0.06282568, + "balance_loss_mlp": 0.01253989, + "epoch": 0.349015481737562, + "flos": 18005071887360.0, + "grad_norm": 1.776416664420285, + "language_loss": 0.76608741, + "learning_rate": 3.024073835246702e-06, + "loss": 0.84331751, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 5805, + "time_per_iteration": 2.4746642112731934 + }, + { + "auxiliary_loss_clip": 0.06461459, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06281143, + "balance_loss_mlp": 0.0125568, + "epoch": 0.34907560499023, + "flos": 27205815064320.0, + "grad_norm": 2.094620432718779, + "language_loss": 0.67626035, + "learning_rate": 3.023739282485814e-06, + "loss": 0.7535736, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14178467, + "step": 5806, + "time_per_iteration": 2.6109619140625 + }, + { + "auxiliary_loss_clip": 0.06461781, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254596, + "epoch": 0.34913572824289796, + "flos": 30234714854400.0, + "grad_norm": 1.7462714312606824, + "language_loss": 0.71972066, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7970227, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1383667, + "step": 5807, + "time_per_iteration": 2.6023621559143066 + }, + { + "auxiliary_loss_clip": 0.06464535, + "auxiliary_loss_mlp": 0.01272433, + "balance_loss_clip": 0.06279333, + "balance_loss_mlp": 0.01257425, + "epoch": 0.3491958514955659, + "flos": 29979779207040.0, + "grad_norm": 2.0002365662223727, + "language_loss": 0.74799109, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15002441, + "step": 5808, + "time_per_iteration": 2.661327362060547 + }, + { + "auxiliary_loss_clip": 0.0645329, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06278954, + "balance_loss_mlp": 0.0125828, + "epoch": 0.3492559747482339, + "flos": 22789786458240.0, + "grad_norm": 1.539446612060682, + "language_loss": 0.84555626, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.92281115, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.13922119, + "step": 5809, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.06454454, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.0125755, + "epoch": 0.34931609800090185, + "flos": 26075257050240.0, + "grad_norm": 1.9706347482771516, + "language_loss": 0.80724359, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.88449275, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.12921143, + "step": 5810, + "time_per_iteration": 2.583709955215454 + }, + { + "auxiliary_loss_clip": 0.06460047, + "auxiliary_loss_mlp": 0.01274437, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3493762212535698, + "flos": 29249744509440.0, + "grad_norm": 1.580057936247994, + "language_loss": 0.75975537, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.83710015, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.1395874, + "step": 5811, + "time_per_iteration": 2.6304807662963867 + }, + { + "auxiliary_loss_clip": 0.06459605, + "auxiliary_loss_mlp": 0.01268711, + "balance_loss_clip": 0.06280548, + "balance_loss_mlp": 0.01254746, + "epoch": 0.3494363445062378, + "flos": 27133461463680.0, + "grad_norm": 1.6291603050336358, + "language_loss": 0.80527401, + "learning_rate": 3.021731151138386e-06, + "loss": 0.88255721, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.1395874, + "step": 5812, + "time_per_iteration": 2.657989025115967 + }, + { + "auxiliary_loss_clip": 0.06462281, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.0628228, + "balance_loss_mlp": 0.01257179, + "epoch": 0.34949646775890575, + "flos": 12281102173440.0, + "grad_norm": 2.0118644405033463, + "language_loss": 0.701132, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7784636, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.137146, + "step": 5813, + "time_per_iteration": 2.47231388092041 + }, + { + "auxiliary_loss_clip": 0.06457584, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06281666, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3495565910115737, + "flos": 17171265507840.0, + "grad_norm": 1.9224367307793844, + "language_loss": 0.76310062, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.8403852, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.13482666, + "step": 5814, + "time_per_iteration": 2.4967095851898193 + }, + { + "auxiliary_loss_clip": 0.06471042, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.06288652, + "balance_loss_mlp": 0.01257076, + "epoch": 0.3496167142642417, + "flos": 26472342349440.0, + "grad_norm": 1.8186936331307002, + "language_loss": 0.85099685, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92842519, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1472168, + "step": 5815, + "time_per_iteration": 2.597399950027466 + }, + { + "auxiliary_loss_clip": 0.06466906, + "auxiliary_loss_mlp": 0.01275707, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01261712, + "epoch": 0.34967683751690964, + "flos": 17419618609920.0, + "grad_norm": 2.3640337842934565, + "language_loss": 0.78006089, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85748702, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 5816, + "time_per_iteration": 2.5164036750793457 + }, + { + "auxiliary_loss_clip": 0.0646984, + "auxiliary_loss_mlp": 0.01273456, + "balance_loss_clip": 0.06286636, + "balance_loss_mlp": 0.01258692, + "epoch": 0.3497369607695776, + "flos": 22606365870720.0, + "grad_norm": 1.8515414586733512, + "language_loss": 0.59787703, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6753099, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.14764404, + "step": 5817, + "time_per_iteration": 2.542877674102783 + }, + { + "auxiliary_loss_clip": 0.06358884, + "auxiliary_loss_mlp": 0.01261904, + "balance_loss_clip": 0.06277611, + "balance_loss_mlp": 0.01257669, + "epoch": 0.34979708402224563, + "flos": 68548461477120.0, + "grad_norm": 0.858700346008579, + "language_loss": 0.59824663, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.67445457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04238892, + "step": 5818, + "time_per_iteration": 3.1992976665496826 + }, + { + "auxiliary_loss_clip": 0.06459703, + "auxiliary_loss_mlp": 0.01271152, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01257109, + "epoch": 0.3498572072749136, + "flos": 18995660455680.0, + "grad_norm": 1.926998914600137, + "language_loss": 0.83806789, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91537642, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14031982, + "step": 5819, + "time_per_iteration": 2.5241613388061523 + }, + { + "auxiliary_loss_clip": 0.06466879, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.0125493, + "epoch": 0.34991733052758156, + "flos": 27826334075520.0, + "grad_norm": 2.092302610514248, + "language_loss": 0.71273863, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.79009914, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14245605, + "step": 5820, + "time_per_iteration": 2.569838762283325 + }, + { + "auxiliary_loss_clip": 0.06470378, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 0.06288413, + "balance_loss_mlp": 0.01256292, + "epoch": 0.3499774537802495, + "flos": 33592706755200.0, + "grad_norm": 2.4345068466865083, + "language_loss": 0.70581877, + "learning_rate": 3.018716339744759e-06, + "loss": 0.78322828, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14294434, + "step": 5821, + "time_per_iteration": 2.6535534858703613 + }, + { + "auxiliary_loss_clip": 0.06479154, + "auxiliary_loss_mlp": 0.0127118, + "balance_loss_clip": 0.06291604, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3500375770329175, + "flos": 23483413756800.0, + "grad_norm": 1.9533795991074365, + "language_loss": 0.74227631, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.81977963, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16650391, + "step": 5822, + "time_per_iteration": 5.406672716140747 + }, + { + "auxiliary_loss_clip": 0.06470097, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 0.06285684, + "balance_loss_mlp": 0.01257588, + "epoch": 0.35009770028558546, + "flos": 19032067854720.0, + "grad_norm": 2.646032233627204, + "language_loss": 0.7905609, + "learning_rate": 3.018045956403094e-06, + "loss": 0.86799276, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15496826, + "step": 5823, + "time_per_iteration": 2.5048515796661377 + }, + { + "auxiliary_loss_clip": 0.06353101, + "auxiliary_loss_mlp": 0.01254576, + "balance_loss_clip": 0.06271273, + "balance_loss_mlp": 0.01249748, + "epoch": 0.3501578235382534, + "flos": 68371749216000.0, + "grad_norm": 0.6915411290730273, + "language_loss": 0.58945203, + "learning_rate": 3.017710706819298e-06, + "loss": 0.66552877, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.04821777, + "step": 5824, + "time_per_iteration": 3.209726333618164 + }, + { + "auxiliary_loss_clip": 0.06465952, + "auxiliary_loss_mlp": 0.01274281, + "balance_loss_clip": 0.06284555, + "balance_loss_mlp": 0.01258045, + "epoch": 0.3502179467909214, + "flos": 21257153827200.0, + "grad_norm": 3.0621504018438164, + "language_loss": 0.85168576, + "learning_rate": 3.017375418643811e-06, + "loss": 0.92908812, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16223145, + "step": 5825, + "time_per_iteration": 2.513498067855835 + }, + { + "auxiliary_loss_clip": 0.06462917, + "auxiliary_loss_mlp": 0.01268842, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01254275, + "epoch": 0.35027807004358935, + "flos": 11946978817920.0, + "grad_norm": 2.498923152973308, + "language_loss": 0.83643848, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.91375613, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14556885, + "step": 5826, + "time_per_iteration": 3.9313511848449707 + }, + { + "auxiliary_loss_clip": 0.06470059, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284411, + "balance_loss_mlp": 0.01254956, + "epoch": 0.3503381932962573, + "flos": 21477401084160.0, + "grad_norm": 2.100708343809493, + "language_loss": 0.81216669, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88958883, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.17211914, + "step": 5827, + "time_per_iteration": 2.556704044342041 + }, + { + "auxiliary_loss_clip": 0.06462219, + "auxiliary_loss_mlp": 0.01272255, + "balance_loss_clip": 0.06283772, + "balance_loss_mlp": 0.01257473, + "epoch": 0.3503983165489253, + "flos": 21257405389440.0, + "grad_norm": 2.0166313071454858, + "language_loss": 0.71145403, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.78879881, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.14794922, + "step": 5828, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.06467165, + "auxiliary_loss_mlp": 0.01274622, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01257539, + "epoch": 0.35045843980159325, + "flos": 27822644496000.0, + "grad_norm": 1.678964319221545, + "language_loss": 0.79897165, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8763895, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.17077637, + "step": 5829, + "time_per_iteration": 4.086450099945068 + }, + { + "auxiliary_loss_clip": 0.06475446, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 0.06286699, + "balance_loss_mlp": 0.01257988, + "epoch": 0.3505185630542612, + "flos": 25928201934720.0, + "grad_norm": 1.7428196933402165, + "language_loss": 0.72440839, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.80191517, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.17248535, + "step": 5830, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.06461293, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06283247, + "balance_loss_mlp": 0.01259633, + "epoch": 0.35057868630692923, + "flos": 20527999597440.0, + "grad_norm": 2.5118715805025884, + "language_loss": 0.88613749, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.96348894, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14221191, + "step": 5831, + "time_per_iteration": 2.577260732650757 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01258672, + "epoch": 0.3506388095595972, + "flos": 20454849383040.0, + "grad_norm": 2.013142681723478, + "language_loss": 0.78719735, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.86459637, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14868164, + "step": 5832, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.06470136, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06284127, + "balance_loss_mlp": 0.01258536, + "epoch": 0.35069893281226516, + "flos": 23115901749120.0, + "grad_norm": 3.869403317005625, + "language_loss": 0.71628016, + "learning_rate": 3.014691725465008e-06, + "loss": 0.79373109, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.1640625, + "step": 5833, + "time_per_iteration": 2.559213161468506 + }, + { + "auxiliary_loss_clip": 0.06462866, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.01256291, + "epoch": 0.35075905606493313, + "flos": 27279426476160.0, + "grad_norm": 2.081089463640026, + "language_loss": 0.80963689, + "learning_rate": 3.014356090536606e-06, + "loss": 0.88697743, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14892578, + "step": 5834, + "time_per_iteration": 2.6462955474853516 + }, + { + "auxiliary_loss_clip": 0.06469317, + "auxiliary_loss_mlp": 0.0127505, + "balance_loss_clip": 0.06288308, + "balance_loss_mlp": 0.01258634, + "epoch": 0.3508191793176011, + "flos": 19133491622400.0, + "grad_norm": 2.5340357013843566, + "language_loss": 0.84608614, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.92352986, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.1640625, + "step": 5835, + "time_per_iteration": 2.5068061351776123 + }, + { + "auxiliary_loss_clip": 0.06463549, + "auxiliary_loss_mlp": 0.01274357, + "balance_loss_clip": 0.0628426, + "balance_loss_mlp": 0.01259122, + "epoch": 0.35087930257026906, + "flos": 25564798776960.0, + "grad_norm": 1.6798272602016127, + "language_loss": 0.77162683, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.84900588, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15234375, + "step": 5836, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.06462973, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.06285001, + "balance_loss_mlp": 0.01268767, + "epoch": 0.350939425822937, + "flos": 18010061205120.0, + "grad_norm": 1.7914903677000888, + "language_loss": 0.7777887, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.85525942, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15325928, + "step": 5837, + "time_per_iteration": 2.4906866550445557 + }, + { + "auxiliary_loss_clip": 0.06464779, + "auxiliary_loss_mlp": 0.0127724, + "balance_loss_clip": 0.0628402, + "balance_loss_mlp": 0.01261575, + "epoch": 0.350999549075605, + "flos": 22279747455360.0, + "grad_norm": 2.3774474075228995, + "language_loss": 0.68712002, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7645402, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15649414, + "step": 5838, + "time_per_iteration": 2.616330862045288 + }, + { + "auxiliary_loss_clip": 0.06463079, + "auxiliary_loss_mlp": 0.01275242, + "balance_loss_clip": 0.0628327, + "balance_loss_mlp": 0.01259554, + "epoch": 0.35105967232827295, + "flos": 14397511000320.0, + "grad_norm": 2.135026117356547, + "language_loss": 0.83941519, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.91679841, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15686035, + "step": 5839, + "time_per_iteration": 2.475210428237915 + }, + { + "auxiliary_loss_clip": 0.06472797, + "auxiliary_loss_mlp": 0.01274732, + "balance_loss_clip": 0.06285894, + "balance_loss_mlp": 0.01258376, + "epoch": 0.3511197955809409, + "flos": 25089322383360.0, + "grad_norm": 2.313381638226651, + "language_loss": 0.58970249, + "learning_rate": 3.012341473657572e-06, + "loss": 0.6671778, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.5654497146606445 + }, + { + "auxiliary_loss_clip": 0.06465258, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06280696, + "balance_loss_mlp": 0.0126174, + "epoch": 0.3511799188336089, + "flos": 25891123703040.0, + "grad_norm": 2.5798747861510254, + "language_loss": 0.87567091, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.9531014, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.16040039, + "step": 5841, + "time_per_iteration": 2.5275204181671143 + }, + { + "auxiliary_loss_clip": 0.06473795, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.0628502, + "balance_loss_mlp": 0.01261038, + "epoch": 0.35124004208627685, + "flos": 20089852997760.0, + "grad_norm": 1.7442007932185601, + "language_loss": 0.7546367, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.83215564, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.17077637, + "step": 5842, + "time_per_iteration": 2.5876784324645996 + }, + { + "auxiliary_loss_clip": 0.06465417, + "auxiliary_loss_mlp": 0.01280375, + "balance_loss_clip": 0.06280544, + "balance_loss_mlp": 0.01265105, + "epoch": 0.3513001653389448, + "flos": 17788891553280.0, + "grad_norm": 2.704982383226077, + "language_loss": 0.68951106, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.76696897, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15258789, + "step": 5843, + "time_per_iteration": 2.4710304737091064 + }, + { + "auxiliary_loss_clip": 0.06466319, + "auxiliary_loss_mlp": 0.01285229, + "balance_loss_clip": 0.0628369, + "balance_loss_mlp": 0.01268892, + "epoch": 0.3513602885916128, + "flos": 29394745200000.0, + "grad_norm": 2.1140022916881525, + "language_loss": 0.66181982, + "learning_rate": 3.010997627806655e-06, + "loss": 0.7393353, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.16326904, + "step": 5844, + "time_per_iteration": 2.585793972015381 + }, + { + "auxiliary_loss_clip": 0.06472903, + "auxiliary_loss_mlp": 0.01282408, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01265761, + "epoch": 0.3514204118442808, + "flos": 16185372768000.0, + "grad_norm": 2.0590361589883206, + "language_loss": 0.75743866, + "learning_rate": 3.010661570469245e-06, + "loss": 0.83499175, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.1663208, + "step": 5845, + "time_per_iteration": 2.50748348236084 + }, + { + "auxiliary_loss_clip": 0.06463686, + "auxiliary_loss_mlp": 0.01285129, + "balance_loss_clip": 0.06284383, + "balance_loss_mlp": 0.01270102, + "epoch": 0.35148053509694877, + "flos": 23840234369280.0, + "grad_norm": 5.020955850717412, + "language_loss": 0.73988718, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.8173753, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15032959, + "step": 5846, + "time_per_iteration": 2.626898765563965 + }, + { + "auxiliary_loss_clip": 0.06470932, + "auxiliary_loss_mlp": 0.01280544, + "balance_loss_clip": 0.06285631, + "balance_loss_mlp": 0.01265482, + "epoch": 0.35154065834961673, + "flos": 20996809591680.0, + "grad_norm": 1.7410870567887373, + "language_loss": 0.75501883, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.8325336, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1506958, + "step": 5847, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.06472816, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284919, + "balance_loss_mlp": 0.01257316, + "epoch": 0.3516007816022847, + "flos": 33263866206720.0, + "grad_norm": 1.8955744454716683, + "language_loss": 0.72774404, + "learning_rate": 3.009653168561666e-06, + "loss": 0.80519378, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1484375, + "step": 5848, + "time_per_iteration": 2.6645965576171875 + }, + { + "auxiliary_loss_clip": 0.06467354, + "auxiliary_loss_mlp": 0.01280776, + "balance_loss_clip": 0.06280826, + "balance_loss_mlp": 0.01265124, + "epoch": 0.35166090485495266, + "flos": 11731427389440.0, + "grad_norm": 2.1922530808110983, + "language_loss": 0.90064394, + "learning_rate": 3.009316958003178e-06, + "loss": 0.97812521, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15655518, + "step": 5849, + "time_per_iteration": 2.4567575454711914 + }, + { + "auxiliary_loss_clip": 0.06464183, + "auxiliary_loss_mlp": 0.01272929, + "balance_loss_clip": 0.06281896, + "balance_loss_mlp": 0.01257461, + "epoch": 0.3517210281076206, + "flos": 22645121184000.0, + "grad_norm": 2.4964624006606946, + "language_loss": 0.75405449, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.83142555, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15472412, + "step": 5850, + "time_per_iteration": 2.5980029106140137 + }, + { + "auxiliary_loss_clip": 0.06463099, + "auxiliary_loss_mlp": 0.01274678, + "balance_loss_clip": 0.06282984, + "balance_loss_mlp": 0.01259842, + "epoch": 0.3517811513602886, + "flos": 21328836595200.0, + "grad_norm": 2.0250770904548303, + "language_loss": 0.76385641, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.14825439, + "step": 5851, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.06463097, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06281513, + "balance_loss_mlp": 0.01258933, + "epoch": 0.35184127461295656, + "flos": 21039254484480.0, + "grad_norm": 1.95256002439052, + "language_loss": 0.88133335, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.95871449, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.1607666, + "step": 5852, + "time_per_iteration": 2.571439266204834 + }, + { + "auxiliary_loss_clip": 0.06461711, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01258087, + "epoch": 0.3519013978656245, + "flos": 22461784450560.0, + "grad_norm": 2.1690150127965038, + "language_loss": 0.68480182, + "learning_rate": 3.007971733162737e-06, + "loss": 0.76214981, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5853, + "time_per_iteration": 2.5121214389801025 + }, + { + "auxiliary_loss_clip": 0.06466305, + "auxiliary_loss_mlp": 0.0127272, + "balance_loss_clip": 0.06282477, + "balance_loss_mlp": 0.01256972, + "epoch": 0.3519615211182925, + "flos": 13120317141120.0, + "grad_norm": 2.1084516189193403, + "language_loss": 0.81284809, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.89023829, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15734863, + "step": 5854, + "time_per_iteration": 2.644672155380249 + }, + { + "auxiliary_loss_clip": 0.06456967, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01253481, + "epoch": 0.35202164437096045, + "flos": 19141122343680.0, + "grad_norm": 1.5283351736697255, + "language_loss": 0.73366165, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.13970947, + "step": 5855, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.06458069, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01256717, + "epoch": 0.3520817676236284, + "flos": 26549475632640.0, + "grad_norm": 1.8023400431296785, + "language_loss": 0.71055883, + "learning_rate": 3.006962413152691e-06, + "loss": 0.78785008, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14337158, + "step": 5856, + "time_per_iteration": 2.5643463134765625 + }, + { + "auxiliary_loss_clip": 0.064651, + "auxiliary_loss_mlp": 0.01271649, + "balance_loss_clip": 0.062787, + "balance_loss_mlp": 0.01255663, + "epoch": 0.3521418908762964, + "flos": 44903653557120.0, + "grad_norm": 1.9243906825553334, + "language_loss": 0.61456323, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.69193071, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16003418, + "step": 5857, + "time_per_iteration": 2.723026752471924 + }, + { + "auxiliary_loss_clip": 0.06463988, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 0.06281644, + "balance_loss_mlp": 0.01253569, + "epoch": 0.3522020141289644, + "flos": 20192576503680.0, + "grad_norm": 1.9490734994800325, + "language_loss": 0.73682863, + "learning_rate": 3.006289342204152e-06, + "loss": 0.8141619, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15765381, + "step": 5858, + "time_per_iteration": 2.5245583057403564 + }, + { + "auxiliary_loss_clip": 0.0646653, + "auxiliary_loss_mlp": 0.01270245, + "balance_loss_clip": 0.06283493, + "balance_loss_mlp": 0.01255368, + "epoch": 0.35226213738163237, + "flos": 27571398428160.0, + "grad_norm": 1.5191641480211209, + "language_loss": 0.76385832, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.8412261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.14880371, + "step": 5859, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.06474233, + "auxiliary_loss_mlp": 0.01272168, + "balance_loss_clip": 0.06283402, + "balance_loss_mlp": 0.01256862, + "epoch": 0.35232226063430033, + "flos": 22972955483520.0, + "grad_norm": 2.0210321352313305, + "language_loss": 0.72436023, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.80182427, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.15307617, + "step": 5860, + "time_per_iteration": 2.557419776916504 + }, + { + "auxiliary_loss_clip": 0.06468037, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06280215, + "balance_loss_mlp": 0.01257304, + "epoch": 0.3523823838869683, + "flos": 19173714382080.0, + "grad_norm": 2.1675794505809076, + "language_loss": 0.66646308, + "learning_rate": 3.005279449623811e-06, + "loss": 0.74387354, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.15704346, + "step": 5861, + "time_per_iteration": 5.330287218093872 + }, + { + "auxiliary_loss_clip": 0.06464717, + "auxiliary_loss_mlp": 0.01272322, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01257331, + "epoch": 0.35244250713963626, + "flos": 17936743282560.0, + "grad_norm": 1.8073030876467324, + "language_loss": 0.67339319, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.7507636, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.15002441, + "step": 5862, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.06465253, + "auxiliary_loss_mlp": 0.01277428, + "balance_loss_clip": 0.06279148, + "balance_loss_mlp": 0.01260775, + "epoch": 0.35250263039230423, + "flos": 21438687697920.0, + "grad_norm": 2.06594301339393, + "language_loss": 0.76956195, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.8469888, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16650391, + "step": 5863, + "time_per_iteration": 2.5614800453186035 + }, + { + "auxiliary_loss_clip": 0.06466909, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 0.06283094, + "balance_loss_mlp": 0.01255846, + "epoch": 0.3525627536449722, + "flos": 27424133677440.0, + "grad_norm": 1.7204880099735786, + "language_loss": 0.75455201, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83192563, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.14605713, + "step": 5864, + "time_per_iteration": 2.590428113937378 + }, + { + "auxiliary_loss_clip": 0.06465425, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06283714, + "balance_loss_mlp": 0.01257306, + "epoch": 0.35262287689764016, + "flos": 24796637671680.0, + "grad_norm": 2.274548371802061, + "language_loss": 0.79325253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.87062526, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14550781, + "step": 5865, + "time_per_iteration": 4.090251922607422 + }, + { + "auxiliary_loss_clip": 0.06479216, + "auxiliary_loss_mlp": 0.01273849, + "balance_loss_clip": 0.06290671, + "balance_loss_mlp": 0.01257935, + "epoch": 0.3526830001503081, + "flos": 17827353377280.0, + "grad_norm": 3.6346687905375155, + "language_loss": 0.81561065, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.89314139, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15917969, + "step": 5866, + "time_per_iteration": 2.5417611598968506 + }, + { + "auxiliary_loss_clip": 0.06481875, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06287797, + "balance_loss_mlp": 0.01255226, + "epoch": 0.3527431234029761, + "flos": 18084091887360.0, + "grad_norm": 2.1275369997353692, + "language_loss": 0.84947896, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9270227, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17272949, + "step": 5867, + "time_per_iteration": 2.487138509750366 + }, + { + "auxiliary_loss_clip": 0.06472977, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 0.06285943, + "balance_loss_mlp": 0.01254431, + "epoch": 0.35280324665564405, + "flos": 19433429712000.0, + "grad_norm": 2.157782607866355, + "language_loss": 0.74828005, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82571352, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15942383, + "step": 5868, + "time_per_iteration": 2.536522150039673 + }, + { + "auxiliary_loss_clip": 0.06471637, + "auxiliary_loss_mlp": 0.01277122, + "balance_loss_clip": 0.06284134, + "balance_loss_mlp": 0.01260277, + "epoch": 0.352863369908312, + "flos": 21509951195520.0, + "grad_norm": 2.023756469283546, + "language_loss": 0.6153, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.69278765, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16845703, + "step": 5869, + "time_per_iteration": 3.977250099182129 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06282457, + "balance_loss_mlp": 0.01259985, + "epoch": 0.35292349316098, + "flos": 22316029073280.0, + "grad_norm": 3.8155591266042173, + "language_loss": 0.75253737, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.1541748, + "step": 5870, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.06466261, + "auxiliary_loss_mlp": 0.01271259, + "balance_loss_clip": 0.06282211, + "balance_loss_mlp": 0.01255964, + "epoch": 0.352983616413648, + "flos": 33118152756480.0, + "grad_norm": 1.8217533687724534, + "language_loss": 0.72204906, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79942429, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.1529541, + "step": 5871, + "time_per_iteration": 2.660351037979126 + }, + { + "auxiliary_loss_clip": 0.06463222, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.0628562, + "balance_loss_mlp": 0.01257389, + "epoch": 0.35304373966631597, + "flos": 18702388765440.0, + "grad_norm": 1.8432981727531608, + "language_loss": 0.73899144, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.81633162, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.13415527, + "step": 5872, + "time_per_iteration": 2.501868724822998 + }, + { + "auxiliary_loss_clip": 0.06467956, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255163, + "epoch": 0.35310386291898394, + "flos": 23371214739840.0, + "grad_norm": 1.6596154000518588, + "language_loss": 0.83059716, + "learning_rate": 3.001236451924089e-06, + "loss": 0.90797222, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.14404297, + "step": 5873, + "time_per_iteration": 2.6044130325317383 + }, + { + "auxiliary_loss_clip": 0.06475792, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 0.06285458, + "balance_loss_mlp": 0.0125879, + "epoch": 0.3531639861716519, + "flos": 24468803372160.0, + "grad_norm": 2.6977932070351183, + "language_loss": 0.65726781, + "learning_rate": 3.000899288359104e-06, + "loss": 0.73477674, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16308594, + "step": 5874, + "time_per_iteration": 2.558915138244629 + }, + { + "auxiliary_loss_clip": 0.06370112, + "auxiliary_loss_mlp": 0.01273024, + "balance_loss_clip": 0.06287491, + "balance_loss_mlp": 0.01268941, + "epoch": 0.35322410942431987, + "flos": 70331040437760.0, + "grad_norm": 0.7490717453474699, + "language_loss": 0.616135, + "learning_rate": 3.000562086839917e-06, + "loss": 0.69256639, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.04083252, + "step": 5875, + "time_per_iteration": 3.1286721229553223 + }, + { + "auxiliary_loss_clip": 0.06475496, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.06289661, + "balance_loss_mlp": 0.01262086, + "epoch": 0.35328423267698783, + "flos": 19825735328640.0, + "grad_norm": 2.073373185113386, + "language_loss": 0.8042345, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.88176548, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15509033, + "step": 5876, + "time_per_iteration": 2.5174875259399414 + }, + { + "auxiliary_loss_clip": 0.063563, + "auxiliary_loss_mlp": 0.01261292, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.01257364, + "epoch": 0.3533443559296558, + "flos": 60843398480640.0, + "grad_norm": 0.6578323239794136, + "language_loss": 0.56720114, + "learning_rate": 2.999887569990088e-06, + "loss": 0.64337707, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.03924561, + "step": 5877, + "time_per_iteration": 3.239800214767456 + }, + { + "auxiliary_loss_clip": 0.0647119, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06286252, + "balance_loss_mlp": 0.01259301, + "epoch": 0.35340447918232376, + "flos": 24762997457280.0, + "grad_norm": 1.7728898292153, + "language_loss": 0.72425848, + "learning_rate": 2.999550254685024e-06, + "loss": 0.80172646, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16308594, + "step": 5878, + "time_per_iteration": 2.576354742050171 + }, + { + "auxiliary_loss_clip": 0.06470102, + "auxiliary_loss_mlp": 0.01272441, + "balance_loss_clip": 0.06286008, + "balance_loss_mlp": 0.01256789, + "epoch": 0.3534646024349917, + "flos": 21802342417920.0, + "grad_norm": 2.4353464978664494, + "language_loss": 0.78682542, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.86425084, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.15649414, + "step": 5879, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.06481053, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 0.0628894, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3535247256876597, + "flos": 20018463719040.0, + "grad_norm": 2.0590866059314035, + "language_loss": 0.63551295, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.71304053, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.17980957, + "step": 5880, + "time_per_iteration": 2.5576937198638916 + }, + { + "auxiliary_loss_clip": 0.06481048, + "auxiliary_loss_mlp": 0.01274855, + "balance_loss_clip": 0.06292346, + "balance_loss_mlp": 0.01258035, + "epoch": 0.35358484894032766, + "flos": 18193984917120.0, + "grad_norm": 2.6506562916801273, + "language_loss": 0.66346908, + "learning_rate": 2.998538081402727e-06, + "loss": 0.74102807, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16821289, + "step": 5881, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06465093, + "auxiliary_loss_mlp": 0.01272514, + "balance_loss_clip": 0.06285467, + "balance_loss_mlp": 0.0125818, + "epoch": 0.3536449721929956, + "flos": 22826990471040.0, + "grad_norm": 1.7415962616346485, + "language_loss": 0.75838578, + "learning_rate": 2.998200614562239e-06, + "loss": 0.8357619, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14337158, + "step": 5882, + "time_per_iteration": 2.546163558959961 + }, + { + "auxiliary_loss_clip": 0.06472618, + "auxiliary_loss_mlp": 0.01271877, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01256189, + "epoch": 0.3537050954456636, + "flos": 26439540675840.0, + "grad_norm": 2.210270342508568, + "language_loss": 0.70790988, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78535485, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.15692139, + "step": 5883, + "time_per_iteration": 2.5813896656036377 + }, + { + "auxiliary_loss_clip": 0.06481725, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01258105, + "epoch": 0.3537652186983316, + "flos": 17202096610560.0, + "grad_norm": 3.5308447991949348, + "language_loss": 0.7912811, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.86884505, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.16564941, + "step": 5884, + "time_per_iteration": 2.564178228378296 + }, + { + "auxiliary_loss_clip": 0.06469014, + "auxiliary_loss_mlp": 0.01273424, + "balance_loss_clip": 0.06285414, + "balance_loss_mlp": 0.01258142, + "epoch": 0.3538253419509996, + "flos": 19542861544320.0, + "grad_norm": 3.0890260502514173, + "language_loss": 0.76079619, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83822054, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15283203, + "step": 5885, + "time_per_iteration": 2.5860350131988525 + }, + { + "auxiliary_loss_clip": 0.06473316, + "auxiliary_loss_mlp": 0.01274145, + "balance_loss_clip": 0.06285691, + "balance_loss_mlp": 0.01257766, + "epoch": 0.35388546520366754, + "flos": 12133166590080.0, + "grad_norm": 4.983567417880078, + "language_loss": 0.83563066, + "learning_rate": 2.996850368809606e-06, + "loss": 0.91310525, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16357422, + "step": 5886, + "time_per_iteration": 2.549227714538574 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.0125851, + "epoch": 0.3539455884563355, + "flos": 19683501822720.0, + "grad_norm": 3.219387216821374, + "language_loss": 0.78429639, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.86168945, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16088867, + "step": 5887, + "time_per_iteration": 2.523743152618408 + }, + { + "auxiliary_loss_clip": 0.0646676, + "auxiliary_loss_mlp": 0.0127383, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01258631, + "epoch": 0.35400571170900347, + "flos": 18077006217600.0, + "grad_norm": 1.8956957640615841, + "language_loss": 0.66116667, + "learning_rate": 2.996175019078089e-06, + "loss": 0.7385726, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15197754, + "step": 5888, + "time_per_iteration": 2.5279300212860107 + }, + { + "auxiliary_loss_clip": 0.06467725, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01256185, + "epoch": 0.35406583496167143, + "flos": 26075298977280.0, + "grad_norm": 2.3097601077816443, + "language_loss": 0.76721621, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.84461069, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15527344, + "step": 5889, + "time_per_iteration": 2.564761161804199 + }, + { + "auxiliary_loss_clip": 0.06465457, + "auxiliary_loss_mlp": 0.01270164, + "balance_loss_clip": 0.06283142, + "balance_loss_mlp": 0.01254357, + "epoch": 0.3541259582143394, + "flos": 19798635732480.0, + "grad_norm": 2.1640548649274116, + "language_loss": 0.81408846, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.89144462, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15808105, + "step": 5890, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.06466024, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06285816, + "balance_loss_mlp": 0.01256094, + "epoch": 0.35418608146700736, + "flos": 24028518493440.0, + "grad_norm": 1.6495661544524922, + "language_loss": 0.80017459, + "learning_rate": 2.99516171119991e-06, + "loss": 0.87753654, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.14074707, + "step": 5891, + "time_per_iteration": 2.553158760070801 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01282427, + "balance_loss_clip": 0.06289162, + "balance_loss_mlp": 0.01265928, + "epoch": 0.35424620471967533, + "flos": 12390701713920.0, + "grad_norm": 1.7694155250203176, + "language_loss": 0.73450041, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81204116, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16516113, + "step": 5892, + "time_per_iteration": 2.529136896133423 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01274401, + "balance_loss_clip": 0.06286078, + "balance_loss_mlp": 0.01259059, + "epoch": 0.3543063279723433, + "flos": 19678219015680.0, + "grad_norm": 3.019670501918518, + "language_loss": 0.67408991, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.75154132, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15344238, + "step": 5893, + "time_per_iteration": 2.507456064224243 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.01274247, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01257545, + "epoch": 0.35436645122501126, + "flos": 21915841173120.0, + "grad_norm": 1.8801549379271045, + "language_loss": 0.70079887, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.77824062, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16711426, + "step": 5894, + "time_per_iteration": 2.5596466064453125 + }, + { + "auxiliary_loss_clip": 0.0646911, + "auxiliary_loss_mlp": 0.0127714, + "balance_loss_clip": 0.06291118, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3544265744776792, + "flos": 21724915645440.0, + "grad_norm": 1.8040348457355686, + "language_loss": 0.74516678, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.82262927, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14569092, + "step": 5895, + "time_per_iteration": 2.602884531021118 + }, + { + "auxiliary_loss_clip": 0.06476314, + "auxiliary_loss_mlp": 0.01274747, + "balance_loss_clip": 0.06292941, + "balance_loss_mlp": 0.01259643, + "epoch": 0.3544866977303472, + "flos": 21219278981760.0, + "grad_norm": 1.7647167527567422, + "language_loss": 0.83600783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.91351843, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.15100098, + "step": 5896, + "time_per_iteration": 2.5642035007476807 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01278933, + "balance_loss_clip": 0.06292751, + "balance_loss_mlp": 0.01261576, + "epoch": 0.35454682098301515, + "flos": 29318534311680.0, + "grad_norm": 1.8515152904238923, + "language_loss": 0.70294917, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7804631, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.17346191, + "step": 5897, + "time_per_iteration": 2.613032341003418 + }, + { + "auxiliary_loss_clip": 0.06475735, + "auxiliary_loss_mlp": 0.01274261, + "balance_loss_clip": 0.06293957, + "balance_loss_mlp": 0.01259205, + "epoch": 0.3546069442356832, + "flos": 24323509192320.0, + "grad_norm": 1.6960731630978507, + "language_loss": 0.81964374, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89714372, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15063477, + "step": 5898, + "time_per_iteration": 2.6033098697662354 + }, + { + "auxiliary_loss_clip": 0.06471986, + "auxiliary_loss_mlp": 0.01279895, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.01265173, + "epoch": 0.35466706748835114, + "flos": 22863984848640.0, + "grad_norm": 1.4933011631381068, + "language_loss": 0.74405515, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.82157397, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14727783, + "step": 5899, + "time_per_iteration": 2.5492894649505615 + }, + { + "auxiliary_loss_clip": 0.0647797, + "auxiliary_loss_mlp": 0.01272872, + "balance_loss_clip": 0.06294148, + "balance_loss_mlp": 0.01257196, + "epoch": 0.3547271907410191, + "flos": 28337714743680.0, + "grad_norm": 3.4583325446366673, + "language_loss": 0.80211669, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.87962508, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15698242, + "step": 5900, + "time_per_iteration": 5.435121774673462 + }, + { + "auxiliary_loss_clip": 0.06478105, + "auxiliary_loss_mlp": 0.01279951, + "balance_loss_clip": 0.06296446, + "balance_loss_mlp": 0.01263607, + "epoch": 0.35478731399368707, + "flos": 23520911258880.0, + "grad_norm": 2.0942596894242533, + "language_loss": 0.8216058, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89918637, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16357422, + "step": 5901, + "time_per_iteration": 2.603769540786743 + }, + { + "auxiliary_loss_clip": 0.06480999, + "auxiliary_loss_mlp": 0.01277169, + "balance_loss_clip": 0.06295676, + "balance_loss_mlp": 0.01261899, + "epoch": 0.35484743724635504, + "flos": 18630202872960.0, + "grad_norm": 2.2545917554681663, + "language_loss": 0.75979805, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.83737969, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.152771, + "step": 5902, + "time_per_iteration": 2.5356359481811523 + }, + { + "auxiliary_loss_clip": 0.06482422, + "auxiliary_loss_mlp": 0.01280542, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265331, + "epoch": 0.354907560499023, + "flos": 17390296880640.0, + "grad_norm": 1.6908684001073404, + "language_loss": 0.70729327, + "learning_rate": 2.991105086850381e-06, + "loss": 0.78492296, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15209961, + "step": 5903, + "time_per_iteration": 2.52494478225708 + }, + { + "auxiliary_loss_clip": 0.06482972, + "auxiliary_loss_mlp": 0.01276075, + "balance_loss_clip": 0.06297173, + "balance_loss_mlp": 0.0125929, + "epoch": 0.35496768375169097, + "flos": 19214607974400.0, + "grad_norm": 2.9744492269587153, + "language_loss": 0.75001359, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.82760406, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16784668, + "step": 5904, + "time_per_iteration": 2.5316994190216064 + }, + { + "auxiliary_loss_clip": 0.0648163, + "auxiliary_loss_mlp": 0.01277137, + "balance_loss_clip": 0.06297497, + "balance_loss_mlp": 0.01261902, + "epoch": 0.35502780700435893, + "flos": 18338692118400.0, + "grad_norm": 2.2144866791488536, + "language_loss": 0.78981996, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.86740756, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15246582, + "step": 5905, + "time_per_iteration": 3.9867374897003174 + }, + { + "auxiliary_loss_clip": 0.06472038, + "auxiliary_loss_mlp": 0.01276232, + "balance_loss_clip": 0.06301226, + "balance_loss_mlp": 0.01262249, + "epoch": 0.3550879302570269, + "flos": 15453660988800.0, + "grad_norm": 1.8340819850757704, + "language_loss": 0.72531646, + "learning_rate": 2.990090084284356e-06, + "loss": 0.80279917, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13977051, + "step": 5906, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.06491787, + "auxiliary_loss_mlp": 0.01272032, + "balance_loss_clip": 0.06306198, + "balance_loss_mlp": 0.01256046, + "epoch": 0.35514805350969486, + "flos": 21985343735040.0, + "grad_norm": 1.9483914182465616, + "language_loss": 0.75052631, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.82816458, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15991211, + "step": 5907, + "time_per_iteration": 2.526137113571167 + }, + { + "auxiliary_loss_clip": 0.06486456, + "auxiliary_loss_mlp": 0.01280245, + "balance_loss_clip": 0.06305459, + "balance_loss_mlp": 0.01264271, + "epoch": 0.3552081767623628, + "flos": 29869718469120.0, + "grad_norm": 2.2786495725258424, + "language_loss": 0.76563632, + "learning_rate": 2.989413228164047e-06, + "loss": 0.84330332, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15966797, + "step": 5908, + "time_per_iteration": 4.063998222351074 + }, + { + "auxiliary_loss_clip": 0.06491728, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06310974, + "balance_loss_mlp": 0.0126146, + "epoch": 0.3552683000150308, + "flos": 26439456821760.0, + "grad_norm": 2.352503484530038, + "language_loss": 0.68572766, + "learning_rate": 2.989074743819502e-06, + "loss": 0.76341379, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15429688, + "step": 5909, + "time_per_iteration": 2.6902143955230713 + }, + { + "auxiliary_loss_clip": 0.0648414, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06310885, + "balance_loss_mlp": 0.01268061, + "epoch": 0.35532842326769876, + "flos": 19791088865280.0, + "grad_norm": 1.9680680199916993, + "language_loss": 0.79103023, + "learning_rate": 2.988736221969144e-06, + "loss": 0.86869311, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14086914, + "step": 5910, + "time_per_iteration": 2.535050630569458 + }, + { + "auxiliary_loss_clip": 0.06495271, + "auxiliary_loss_mlp": 0.01274944, + "balance_loss_clip": 0.06310071, + "balance_loss_mlp": 0.0125841, + "epoch": 0.3553885465203668, + "flos": 17245170408960.0, + "grad_norm": 1.607302447744311, + "language_loss": 0.7130779, + "learning_rate": 2.98839766262581e-06, + "loss": 0.79078007, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1652832, + "step": 5911, + "time_per_iteration": 2.572942018508911 + }, + { + "auxiliary_loss_clip": 0.06485709, + "auxiliary_loss_mlp": 0.01272785, + "balance_loss_clip": 0.06309631, + "balance_loss_mlp": 0.01258313, + "epoch": 0.35544866977303474, + "flos": 14938800376320.0, + "grad_norm": 2.1423891041027514, + "language_loss": 0.87973344, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.95731837, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14477539, + "step": 5912, + "time_per_iteration": 2.4826059341430664 + }, + { + "auxiliary_loss_clip": 0.0648666, + "auxiliary_loss_mlp": 0.01278679, + "balance_loss_clip": 0.0630875, + "balance_loss_mlp": 0.0126441, + "epoch": 0.3555087930257027, + "flos": 19762228333440.0, + "grad_norm": 2.0928412919366477, + "language_loss": 0.77506435, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.8527177, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14251709, + "step": 5913, + "time_per_iteration": 2.577362060546875 + }, + { + "auxiliary_loss_clip": 0.06486008, + "auxiliary_loss_mlp": 0.01273445, + "balance_loss_clip": 0.06311025, + "balance_loss_mlp": 0.01258789, + "epoch": 0.3555689162783707, + "flos": 21074445999360.0, + "grad_norm": 5.920108951080063, + "language_loss": 0.82525283, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.90284735, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14660645, + "step": 5914, + "time_per_iteration": 2.521756649017334 + }, + { + "auxiliary_loss_clip": 0.06490604, + "auxiliary_loss_mlp": 0.01268632, + "balance_loss_clip": 0.06310836, + "balance_loss_mlp": 0.01254118, + "epoch": 0.35562903953103864, + "flos": 33077426872320.0, + "grad_norm": 3.2692214801304686, + "language_loss": 0.7113682, + "learning_rate": 2.98704305057949e-06, + "loss": 0.78896052, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14508057, + "step": 5915, + "time_per_iteration": 2.6931562423706055 + }, + { + "auxiliary_loss_clip": 0.06477264, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06297429, + "balance_loss_mlp": 0.01254814, + "epoch": 0.3556891627837066, + "flos": 20564029653120.0, + "grad_norm": 4.458093980019367, + "language_loss": 0.76718718, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84465492, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14697266, + "step": 5916, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.06487325, + "auxiliary_loss_mlp": 0.01272059, + "balance_loss_clip": 0.06307879, + "balance_loss_mlp": 0.01256651, + "epoch": 0.35574928603637457, + "flos": 20709449614080.0, + "grad_norm": 1.674174142445476, + "language_loss": 0.88208687, + "learning_rate": 2.986365519932332e-06, + "loss": 0.95968074, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.1539917, + "step": 5917, + "time_per_iteration": 2.6043195724487305 + }, + { + "auxiliary_loss_clip": 0.0649041, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.0631107, + "balance_loss_mlp": 0.01254289, + "epoch": 0.35580940928904253, + "flos": 15199899298560.0, + "grad_norm": 3.6980401889874086, + "language_loss": 0.75538862, + "learning_rate": 2.98602669849771e-06, + "loss": 0.83299077, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15515137, + "step": 5918, + "time_per_iteration": 2.5186190605163574 + }, + { + "auxiliary_loss_clip": 0.06461592, + "auxiliary_loss_mlp": 0.01285001, + "balance_loss_clip": 0.06381316, + "balance_loss_mlp": 0.01279086, + "epoch": 0.3558695325417105, + "flos": 58656145426560.0, + "grad_norm": 0.8458689331650495, + "language_loss": 0.63255095, + "learning_rate": 2.985687839672857e-06, + "loss": 0.71001691, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.05911255, + "step": 5919, + "time_per_iteration": 2.9552297592163086 + }, + { + "auxiliary_loss_clip": 0.06485933, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 0.06302524, + "balance_loss_mlp": 0.01255998, + "epoch": 0.35592965579437846, + "flos": 22024811808000.0, + "grad_norm": 2.2679396062128188, + "language_loss": 0.74402696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.82160461, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.1583252, + "step": 5920, + "time_per_iteration": 2.54848313331604 + }, + { + "auxiliary_loss_clip": 0.06483243, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01262638, + "epoch": 0.35598977904704643, + "flos": 23374401194880.0, + "grad_norm": 3.1552684799501733, + "language_loss": 0.77735227, + "learning_rate": 2.985010009903857e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15332031, + "step": 5921, + "time_per_iteration": 2.6517810821533203 + }, + { + "auxiliary_loss_clip": 0.06490617, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 0.06309058, + "balance_loss_mlp": 0.01261329, + "epoch": 0.3560499022997144, + "flos": 17791113686400.0, + "grad_norm": 2.349487021583332, + "language_loss": 0.6770314, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.75470436, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15332031, + "step": 5922, + "time_per_iteration": 2.525566577911377 + }, + { + "auxiliary_loss_clip": 0.06484485, + "auxiliary_loss_mlp": 0.0127389, + "balance_loss_clip": 0.06306913, + "balance_loss_mlp": 0.01258524, + "epoch": 0.35611002555238236, + "flos": 20746695553920.0, + "grad_norm": 2.231194122260979, + "language_loss": 0.79304701, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.87063074, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15380859, + "step": 5923, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.06479051, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 0.06301268, + "balance_loss_mlp": 0.01257579, + "epoch": 0.3561701488050504, + "flos": 19468034248320.0, + "grad_norm": 1.61778925366919, + "language_loss": 0.8543126, + "learning_rate": 2.983992985144908e-06, + "loss": 0.93183035, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15148926, + "step": 5924, + "time_per_iteration": 2.524949312210083 + }, + { + "auxiliary_loss_clip": 0.06478724, + "auxiliary_loss_mlp": 0.01271843, + "balance_loss_clip": 0.06301951, + "balance_loss_mlp": 0.01255797, + "epoch": 0.35623027205771834, + "flos": 30783006046080.0, + "grad_norm": 1.9504196686726267, + "language_loss": 0.77609557, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.85360122, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.16033936, + "step": 5925, + "time_per_iteration": 2.6268069744110107 + }, + { + "auxiliary_loss_clip": 0.06472521, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06292735, + "balance_loss_mlp": 0.01258291, + "epoch": 0.3562903953103863, + "flos": 16986461328000.0, + "grad_norm": 1.8072288436418724, + "language_loss": 0.76488966, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.15441895, + "step": 5926, + "time_per_iteration": 2.492009401321411 + }, + { + "auxiliary_loss_clip": 0.064781, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 0.06293385, + "balance_loss_mlp": 0.01255478, + "epoch": 0.3563505185630543, + "flos": 23846271863040.0, + "grad_norm": 2.038892178711472, + "language_loss": 0.69665909, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.77415526, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16046143, + "step": 5927, + "time_per_iteration": 2.555192708969116 + }, + { + "auxiliary_loss_clip": 0.06471409, + "auxiliary_loss_mlp": 0.01273845, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.0125889, + "epoch": 0.35641064181572224, + "flos": 22280040944640.0, + "grad_norm": 1.7768317666214009, + "language_loss": 0.79454333, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.87199581, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.1494751, + "step": 5928, + "time_per_iteration": 2.5192928314208984 + }, + { + "auxiliary_loss_clip": 0.06473258, + "auxiliary_loss_mlp": 0.01271381, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3564707650683902, + "flos": 23007643873920.0, + "grad_norm": 1.230692465633979, + "language_loss": 0.8197661, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89721251, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1494751, + "step": 5929, + "time_per_iteration": 2.6044368743896484 + }, + { + "auxiliary_loss_clip": 0.0646459, + "auxiliary_loss_mlp": 0.01268428, + "balance_loss_clip": 0.06289564, + "balance_loss_mlp": 0.01253765, + "epoch": 0.35653088832105817, + "flos": 14689566806400.0, + "grad_norm": 1.5209281639747478, + "language_loss": 0.70385516, + "learning_rate": 2.981957928520201e-06, + "loss": 0.78118533, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14685059, + "step": 5930, + "time_per_iteration": 2.498253107070923 + }, + { + "auxiliary_loss_clip": 0.06473252, + "auxiliary_loss_mlp": 0.01273096, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01256943, + "epoch": 0.35659101157372614, + "flos": 23483791100160.0, + "grad_norm": 2.174064041384607, + "language_loss": 0.68760598, + "learning_rate": 2.981618622015244e-06, + "loss": 0.76506943, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16162109, + "step": 5931, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.06463969, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06288578, + "balance_loss_mlp": 0.01253788, + "epoch": 0.3566511348263941, + "flos": 26585966885760.0, + "grad_norm": 1.5444695234240167, + "language_loss": 0.68331707, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76064122, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06465189, + "auxiliary_loss_mlp": 0.01272147, + "balance_loss_clip": 0.06290227, + "balance_loss_mlp": 0.01257854, + "epoch": 0.35671125807906207, + "flos": 13119981724800.0, + "grad_norm": 2.4744838507658917, + "language_loss": 0.79635656, + "learning_rate": 2.980939897348969e-06, + "loss": 0.87372994, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14294434, + "step": 5933, + "time_per_iteration": 2.573812961578369 + }, + { + "auxiliary_loss_clip": 0.06470121, + "auxiliary_loss_mlp": 0.01270309, + "balance_loss_clip": 0.06288668, + "balance_loss_mlp": 0.01255372, + "epoch": 0.35677138133173003, + "flos": 33009014413440.0, + "grad_norm": 1.4096936090904761, + "language_loss": 0.69970256, + "learning_rate": 2.980600479213388e-06, + "loss": 0.77710688, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14929199, + "step": 5934, + "time_per_iteration": 2.6381173133850098 + }, + { + "auxiliary_loss_clip": 0.06481285, + "auxiliary_loss_mlp": 0.01277705, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01260741, + "epoch": 0.356831504584398, + "flos": 20784234983040.0, + "grad_norm": 2.103415594097178, + "language_loss": 0.72006869, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.79765862, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16967773, + "step": 5935, + "time_per_iteration": 2.620471954345703 + }, + { + "auxiliary_loss_clip": 0.06467808, + "auxiliary_loss_mlp": 0.01275583, + "balance_loss_clip": 0.06287988, + "balance_loss_mlp": 0.01261004, + "epoch": 0.35689162783706596, + "flos": 12170244821760.0, + "grad_norm": 2.011082803426264, + "language_loss": 0.78423738, + "learning_rate": 2.979921531401692e-06, + "loss": 0.86167133, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14569092, + "step": 5936, + "time_per_iteration": 2.4827091693878174 + }, + { + "auxiliary_loss_clip": 0.06466486, + "auxiliary_loss_mlp": 0.01273239, + "balance_loss_clip": 0.06289199, + "balance_loss_mlp": 0.01258147, + "epoch": 0.356951751089734, + "flos": 23848200506880.0, + "grad_norm": 1.8250890312079233, + "language_loss": 0.64893055, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.72632784, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15100098, + "step": 5937, + "time_per_iteration": 2.5968148708343506 + }, + { + "auxiliary_loss_clip": 0.06470716, + "auxiliary_loss_mlp": 0.01277052, + "balance_loss_clip": 0.06291182, + "balance_loss_mlp": 0.01261644, + "epoch": 0.35701187434240195, + "flos": 11725851093120.0, + "grad_norm": 3.2825373138133633, + "language_loss": 0.79029787, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.86777556, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15429688, + "step": 5938, + "time_per_iteration": 2.4724228382110596 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.01275118, + "balance_loss_clip": 0.06289655, + "balance_loss_mlp": 0.01259835, + "epoch": 0.3570719975950699, + "flos": 24905650233600.0, + "grad_norm": 2.3707612213619624, + "language_loss": 0.80684471, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88429582, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 5939, + "time_per_iteration": 4.067660331726074 + }, + { + "auxiliary_loss_clip": 0.06474897, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.01263357, + "epoch": 0.3571321208477379, + "flos": 26002022981760.0, + "grad_norm": 1.7209958005115653, + "language_loss": 0.79509544, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.8726303, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15228271, + "step": 5940, + "time_per_iteration": 3.961956262588501 + }, + { + "auxiliary_loss_clip": 0.06472583, + "auxiliary_loss_mlp": 0.01274024, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01258223, + "epoch": 0.35719224410040584, + "flos": 14506900905600.0, + "grad_norm": 2.455654522420387, + "language_loss": 0.72918689, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.80665296, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15802002, + "step": 5941, + "time_per_iteration": 2.529376745223999 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06291723, + "balance_loss_mlp": 0.01258577, + "epoch": 0.3572523673530738, + "flos": 31183445508480.0, + "grad_norm": 1.9522398224767823, + "language_loss": 0.64961332, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.72705185, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15667725, + "step": 5942, + "time_per_iteration": 2.6694955825805664 + }, + { + "auxiliary_loss_clip": 0.06470639, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06291504, + "balance_loss_mlp": 0.01257124, + "epoch": 0.3573124906057418, + "flos": 15857496541440.0, + "grad_norm": 1.9232266262089555, + "language_loss": 0.7463761, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.82381314, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.1595459, + "step": 5943, + "time_per_iteration": 2.5988807678222656 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01259877, + "balance_loss_clip": 0.06336363, + "balance_loss_mlp": 0.01254631, + "epoch": 0.35737261385840974, + "flos": 60839163849600.0, + "grad_norm": 0.8122274991603828, + "language_loss": 0.60684133, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.68360829, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.05249023, + "step": 5944, + "time_per_iteration": 3.2639529705047607 + }, + { + "auxiliary_loss_clip": 0.06467592, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06290887, + "balance_loss_mlp": 0.01259464, + "epoch": 0.3574327371110777, + "flos": 18849779297280.0, + "grad_norm": 1.8477550360079977, + "language_loss": 0.7280755, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80549395, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14782715, + "step": 5945, + "time_per_iteration": 3.974971294403076 + }, + { + "auxiliary_loss_clip": 0.06464474, + "auxiliary_loss_mlp": 0.01274521, + "balance_loss_clip": 0.06288721, + "balance_loss_mlp": 0.01259619, + "epoch": 0.35749286036374567, + "flos": 23556354336000.0, + "grad_norm": 1.6530257311602492, + "language_loss": 0.8152287, + "learning_rate": 2.976524564880326e-06, + "loss": 0.89261866, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14892578, + "step": 5946, + "time_per_iteration": 2.567702531814575 + }, + { + "auxiliary_loss_clip": 0.06472433, + "auxiliary_loss_mlp": 0.01275229, + "balance_loss_clip": 0.06292298, + "balance_loss_mlp": 0.01260036, + "epoch": 0.35755298361641363, + "flos": 21111817720320.0, + "grad_norm": 1.4004407917222146, + "language_loss": 0.69023073, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76770723, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.15209961, + "step": 5947, + "time_per_iteration": 2.531938076019287 + }, + { + "auxiliary_loss_clip": 0.06458312, + "auxiliary_loss_mlp": 0.01270008, + "balance_loss_clip": 0.06284653, + "balance_loss_mlp": 0.01256109, + "epoch": 0.3576131068690816, + "flos": 19251099227520.0, + "grad_norm": 2.059659188145791, + "language_loss": 0.75891036, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83619356, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13916016, + "step": 5948, + "time_per_iteration": 3.9236361980438232 + }, + { + "auxiliary_loss_clip": 0.06466205, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 0.06287337, + "balance_loss_mlp": 0.01261462, + "epoch": 0.35767323012174956, + "flos": 28661733682560.0, + "grad_norm": 1.6908098548641093, + "language_loss": 0.71228039, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.78970701, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15002441, + "step": 5949, + "time_per_iteration": 2.56809663772583 + }, + { + "auxiliary_loss_clip": 0.06464282, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06286816, + "balance_loss_mlp": 0.01266995, + "epoch": 0.35773335337441753, + "flos": 17089897593600.0, + "grad_norm": 1.7763817610233048, + "language_loss": 0.77821207, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.85567343, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1484375, + "step": 5950, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.06465182, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.0628643, + "balance_loss_mlp": 0.01261331, + "epoch": 0.35779347662708555, + "flos": 15894155502720.0, + "grad_norm": 2.1549260339424725, + "language_loss": 0.73109937, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.80851334, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14892578, + "step": 5951, + "time_per_iteration": 2.5201168060302734 + }, + { + "auxiliary_loss_clip": 0.06470691, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 0.06287189, + "balance_loss_mlp": 0.01262181, + "epoch": 0.3578535998797535, + "flos": 28666555292160.0, + "grad_norm": 1.9784791605149854, + "language_loss": 0.7026071, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.78009284, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15698242, + "step": 5952, + "time_per_iteration": 2.5931434631347656 + }, + { + "auxiliary_loss_clip": 0.0646029, + "auxiliary_loss_mlp": 0.01277333, + "balance_loss_clip": 0.06284408, + "balance_loss_mlp": 0.01263069, + "epoch": 0.3579137231324215, + "flos": 37861554464640.0, + "grad_norm": 1.6267089711440414, + "language_loss": 0.69578886, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77316511, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14276123, + "step": 5953, + "time_per_iteration": 2.668464422225952 + }, + { + "auxiliary_loss_clip": 0.0645823, + "auxiliary_loss_mlp": 0.01275685, + "balance_loss_clip": 0.06282876, + "balance_loss_mlp": 0.01261117, + "epoch": 0.35797384638508944, + "flos": 22353526575360.0, + "grad_norm": 1.5719996722989455, + "language_loss": 0.67333478, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.75067389, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14562988, + "step": 5954, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.06459846, + "auxiliary_loss_mlp": 0.01278708, + "balance_loss_clip": 0.06287006, + "balance_loss_mlp": 0.0126414, + "epoch": 0.3580339696377574, + "flos": 13594829212800.0, + "grad_norm": 1.8066455981447187, + "language_loss": 0.75335681, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.83074236, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14556885, + "step": 5955, + "time_per_iteration": 2.5049943923950195 + }, + { + "auxiliary_loss_clip": 0.06458074, + "auxiliary_loss_mlp": 0.01270596, + "balance_loss_clip": 0.06286005, + "balance_loss_mlp": 0.01256595, + "epoch": 0.3580940928904254, + "flos": 23774882584320.0, + "grad_norm": 1.7018331496498176, + "language_loss": 0.76155579, + "learning_rate": 2.973123895369182e-06, + "loss": 0.83884245, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14007568, + "step": 5956, + "time_per_iteration": 2.565455675125122 + }, + { + "auxiliary_loss_clip": 0.06456999, + "auxiliary_loss_mlp": 0.01278066, + "balance_loss_clip": 0.06286499, + "balance_loss_mlp": 0.01263415, + "epoch": 0.35815421614309334, + "flos": 19469962892160.0, + "grad_norm": 1.5319401259692025, + "language_loss": 0.73558611, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81293678, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14642334, + "step": 5957, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.064648, + "auxiliary_loss_mlp": 0.01274688, + "balance_loss_clip": 0.06291045, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3582143393957613, + "flos": 23374988173440.0, + "grad_norm": 2.1285308943055727, + "language_loss": 0.71748459, + "learning_rate": 2.972443318242726e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14477539, + "step": 5958, + "time_per_iteration": 2.566181182861328 + }, + { + "auxiliary_loss_clip": 0.06459813, + "auxiliary_loss_mlp": 0.01267621, + "balance_loss_clip": 0.06289116, + "balance_loss_mlp": 0.0125415, + "epoch": 0.35827446264842927, + "flos": 26330528113920.0, + "grad_norm": 1.6357791647016078, + "language_loss": 0.88725436, + "learning_rate": 2.972102974360324e-06, + "loss": 0.96452874, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13452148, + "step": 5959, + "time_per_iteration": 2.6218011379241943 + }, + { + "auxiliary_loss_clip": 0.06463417, + "auxiliary_loss_mlp": 0.01271505, + "balance_loss_clip": 0.06288788, + "balance_loss_mlp": 0.0125816, + "epoch": 0.35833458590109724, + "flos": 30454626695040.0, + "grad_norm": 1.5143701220572547, + "language_loss": 0.58769095, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66504014, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.13348389, + "step": 5960, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.06462947, + "auxiliary_loss_mlp": 0.01269103, + "balance_loss_clip": 0.06286879, + "balance_loss_mlp": 0.01253469, + "epoch": 0.3583947091537652, + "flos": 14835154475520.0, + "grad_norm": 2.541265940729937, + "language_loss": 0.76686686, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.84418738, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15631104, + "step": 5961, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.06464821, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06287968, + "balance_loss_mlp": 0.01261324, + "epoch": 0.35845483240643317, + "flos": 34249213895040.0, + "grad_norm": 1.6475679018941416, + "language_loss": 0.70478481, + "learning_rate": 2.971081721591294e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14099121, + "step": 5962, + "time_per_iteration": 2.6199357509613037 + }, + { + "auxiliary_loss_clip": 0.06464063, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01255207, + "epoch": 0.35851495565910113, + "flos": 20966481613440.0, + "grad_norm": 1.6496872805273144, + "language_loss": 0.75120842, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.82854319, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14221191, + "step": 5963, + "time_per_iteration": 2.5526950359344482 + }, + { + "auxiliary_loss_clip": 0.06467253, + "auxiliary_loss_mlp": 0.01271151, + "balance_loss_clip": 0.06291784, + "balance_loss_mlp": 0.01256322, + "epoch": 0.35857507891176915, + "flos": 22316448343680.0, + "grad_norm": 1.675466861885377, + "language_loss": 0.78945208, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.86683613, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14831543, + "step": 5964, + "time_per_iteration": 2.5257983207702637 + }, + { + "auxiliary_loss_clip": 0.0647264, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06292663, + "balance_loss_mlp": 0.0125726, + "epoch": 0.3586352021644371, + "flos": 23374610830080.0, + "grad_norm": 3.2898914726182684, + "language_loss": 0.667786, + "learning_rate": 2.970060137410626e-06, + "loss": 0.74523282, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.14764404, + "step": 5965, + "time_per_iteration": 2.5664315223693848 + }, + { + "auxiliary_loss_clip": 0.06463271, + "auxiliary_loss_mlp": 0.01271526, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.01256773, + "epoch": 0.3586953254171051, + "flos": 27855655804800.0, + "grad_norm": 1.5935311272675807, + "language_loss": 0.79428947, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87163734, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14746094, + "step": 5966, + "time_per_iteration": 2.576537609100342 + }, + { + "auxiliary_loss_clip": 0.06467331, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01254717, + "epoch": 0.35875544866977305, + "flos": 19506621853440.0, + "grad_norm": 2.077713447457672, + "language_loss": 0.91477883, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.99213958, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14044189, + "step": 5967, + "time_per_iteration": 2.553084135055542 + }, + { + "auxiliary_loss_clip": 0.06466691, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06288824, + "balance_loss_mlp": 0.01257261, + "epoch": 0.358815571922441, + "flos": 21477652646400.0, + "grad_norm": 1.8463229992001005, + "language_loss": 0.80835712, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.88575101, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15435791, + "step": 5968, + "time_per_iteration": 2.526298761367798 + }, + { + "auxiliary_loss_clip": 0.06467028, + "auxiliary_loss_mlp": 0.0127428, + "balance_loss_clip": 0.06287041, + "balance_loss_mlp": 0.012587, + "epoch": 0.358875695175109, + "flos": 21841894344960.0, + "grad_norm": 1.8179824378655614, + "language_loss": 0.84621, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.92362314, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15588379, + "step": 5969, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.0646342, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06288599, + "balance_loss_mlp": 0.01258664, + "epoch": 0.35893581842777694, + "flos": 32019264385920.0, + "grad_norm": 1.8505987075691241, + "language_loss": 0.72233456, + "learning_rate": 2.968356761586202e-06, + "loss": 0.79968911, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13366699, + "step": 5970, + "time_per_iteration": 2.581071615219116 + }, + { + "auxiliary_loss_clip": 0.06468321, + "auxiliary_loss_mlp": 0.01272468, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01258056, + "epoch": 0.3589959416804449, + "flos": 20492137249920.0, + "grad_norm": 1.5610077365233734, + "language_loss": 0.79753757, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.87494546, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14422607, + "step": 5971, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.0646906, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06288019, + "balance_loss_mlp": 0.01255006, + "epoch": 0.3590560649331129, + "flos": 16186295162880.0, + "grad_norm": 1.6291573791515084, + "language_loss": 0.78869599, + "learning_rate": 2.967675154124696e-06, + "loss": 0.86608684, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15026855, + "step": 5972, + "time_per_iteration": 2.4778740406036377 + }, + { + "auxiliary_loss_clip": 0.06465904, + "auxiliary_loss_mlp": 0.01274602, + "balance_loss_clip": 0.06286226, + "balance_loss_mlp": 0.01260201, + "epoch": 0.35911618818578084, + "flos": 20381531460480.0, + "grad_norm": 2.0141455740295875, + "language_loss": 0.81742013, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.89482516, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1439209, + "step": 5973, + "time_per_iteration": 2.532027006149292 + }, + { + "auxiliary_loss_clip": 0.06404248, + "auxiliary_loss_mlp": 0.01258065, + "balance_loss_clip": 0.06324309, + "balance_loss_mlp": 0.01254096, + "epoch": 0.3591763114384488, + "flos": 41250991645440.0, + "grad_norm": 0.9082562918021452, + "language_loss": 0.56514442, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.64176756, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03967285, + "step": 5974, + "time_per_iteration": 3.0029375553131104 + }, + { + "auxiliary_loss_clip": 0.06464389, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06286667, + "balance_loss_mlp": 0.01257781, + "epoch": 0.35923643469111677, + "flos": 18701047100160.0, + "grad_norm": 1.9591615340661908, + "language_loss": 0.69342583, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77078998, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.14227295, + "step": 5975, + "time_per_iteration": 2.5330698490142822 + }, + { + "auxiliary_loss_clip": 0.06462636, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06286036, + "balance_loss_mlp": 0.0125325, + "epoch": 0.35929655794378473, + "flos": 25017010709760.0, + "grad_norm": 1.597565036747504, + "language_loss": 0.8049522, + "learning_rate": 2.96631149897303e-06, + "loss": 0.88225687, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14575195, + "step": 5976, + "time_per_iteration": 2.5599968433380127 + }, + { + "auxiliary_loss_clip": 0.0646351, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06286681, + "balance_loss_mlp": 0.01253489, + "epoch": 0.35935668119645275, + "flos": 14980825998720.0, + "grad_norm": 1.8019140268476472, + "language_loss": 0.79171205, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.86903155, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1496582, + "step": 5977, + "time_per_iteration": 2.4876949787139893 + }, + { + "auxiliary_loss_clip": 0.06459211, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 0.0628271, + "balance_loss_mlp": 0.0125324, + "epoch": 0.3594168044491207, + "flos": 21184422883200.0, + "grad_norm": 1.897291031169604, + "language_loss": 0.80843097, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13342285, + "step": 5978, + "time_per_iteration": 2.5270771980285645 + }, + { + "auxiliary_loss_clip": 0.06458849, + "auxiliary_loss_mlp": 0.01272545, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01257703, + "epoch": 0.3594769277017887, + "flos": 27679446668160.0, + "grad_norm": 1.6570486295636508, + "language_loss": 0.67797875, + "learning_rate": 2.965288372816436e-06, + "loss": 0.75529265, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14819336, + "step": 5979, + "time_per_iteration": 5.427239179611206 + }, + { + "auxiliary_loss_clip": 0.06460471, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06282781, + "balance_loss_mlp": 0.01256323, + "epoch": 0.35953705095445665, + "flos": 23008901685120.0, + "grad_norm": 2.1534655116077928, + "language_loss": 0.67667198, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.75397921, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.13928223, + "step": 5980, + "time_per_iteration": 2.538149833679199 + }, + { + "auxiliary_loss_clip": 0.0647162, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06285568, + "balance_loss_mlp": 0.01257146, + "epoch": 0.3595971742071246, + "flos": 25520005969920.0, + "grad_norm": 2.2162969460708597, + "language_loss": 0.71122372, + "learning_rate": 2.964606105671327e-06, + "loss": 0.78867209, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16064453, + "step": 5981, + "time_per_iteration": 2.5711326599121094 + }, + { + "auxiliary_loss_clip": 0.06464566, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 0.06283125, + "balance_loss_mlp": 0.01256709, + "epoch": 0.3596572974597926, + "flos": 29870431228800.0, + "grad_norm": 2.0278025655936958, + "language_loss": 0.71914935, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.7965194, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.1572876, + "step": 5982, + "time_per_iteration": 2.6292126178741455 + }, + { + "auxiliary_loss_clip": 0.06458835, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 0.06286852, + "balance_loss_mlp": 0.0125428, + "epoch": 0.35971742071246054, + "flos": 23119255912320.0, + "grad_norm": 1.6791573126106523, + "language_loss": 0.7649492, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.84221637, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13598633, + "step": 5983, + "time_per_iteration": 2.540801763534546 + }, + { + "auxiliary_loss_clip": 0.06468493, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01257206, + "epoch": 0.3597775439651285, + "flos": 16730645212800.0, + "grad_norm": 1.651729152091261, + "language_loss": 0.77260226, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85001981, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16052246, + "step": 5984, + "time_per_iteration": 2.5278737545013428 + }, + { + "auxiliary_loss_clip": 0.06458455, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.0628411, + "balance_loss_mlp": 0.01256155, + "epoch": 0.3598376672177965, + "flos": 19725653226240.0, + "grad_norm": 2.0268922239891163, + "language_loss": 0.87093443, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.94822395, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.14355469, + "step": 5985, + "time_per_iteration": 3.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.06461216, + "auxiliary_loss_mlp": 0.01272807, + "balance_loss_clip": 0.06284203, + "balance_loss_mlp": 0.01258109, + "epoch": 0.35989779047046444, + "flos": 17317314374400.0, + "grad_norm": 1.4939910635791536, + "language_loss": 0.72980917, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80714941, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14709473, + "step": 5986, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.06469383, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06283881, + "balance_loss_mlp": 0.01254761, + "epoch": 0.3599579137231324, + "flos": 22717894055040.0, + "grad_norm": 2.903112824764454, + "language_loss": 0.73792106, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.81531143, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.14892578, + "step": 5987, + "time_per_iteration": 3.961486339569092 + }, + { + "auxiliary_loss_clip": 0.06467381, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06286356, + "balance_loss_mlp": 0.01255347, + "epoch": 0.36001803697580037, + "flos": 20966230051200.0, + "grad_norm": 1.8945086710394061, + "language_loss": 0.69721663, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.77459043, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.14648438, + "step": 5988, + "time_per_iteration": 2.5483100414276123 + }, + { + "auxiliary_loss_clip": 0.0647547, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_clip": 0.06292704, + "balance_loss_mlp": 0.01258209, + "epoch": 0.36007816022846834, + "flos": 20491843760640.0, + "grad_norm": 1.7927951606002523, + "language_loss": 0.7305057, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80799592, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15344238, + "step": 5989, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.06467338, + "auxiliary_loss_mlp": 0.01268061, + "balance_loss_clip": 0.06289014, + "balance_loss_mlp": 0.01254173, + "epoch": 0.36013828348113636, + "flos": 28008706487040.0, + "grad_norm": 1.4999082498201763, + "language_loss": 0.80117184, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87852585, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.13897705, + "step": 5990, + "time_per_iteration": 2.6733410358428955 + }, + { + "auxiliary_loss_clip": 0.06464024, + "auxiliary_loss_mlp": 0.01270971, + "balance_loss_clip": 0.0628631, + "balance_loss_mlp": 0.01255938, + "epoch": 0.3601984067338043, + "flos": 20088050135040.0, + "grad_norm": 1.799909646769202, + "language_loss": 0.84338784, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92073774, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15032959, + "step": 5991, + "time_per_iteration": 2.518554925918579 + }, + { + "auxiliary_loss_clip": 0.06474696, + "auxiliary_loss_mlp": 0.01276578, + "balance_loss_clip": 0.06292041, + "balance_loss_mlp": 0.01261367, + "epoch": 0.3602585299864723, + "flos": 18622362516480.0, + "grad_norm": 1.891276760716041, + "language_loss": 0.76406145, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.84157419, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1519165, + "step": 5992, + "time_per_iteration": 2.5224106311798096 + }, + { + "auxiliary_loss_clip": 0.06471405, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06294376, + "balance_loss_mlp": 0.01258496, + "epoch": 0.36031865323914025, + "flos": 19579059308160.0, + "grad_norm": 2.086772991356176, + "language_loss": 0.78120929, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8586548, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14648438, + "step": 5993, + "time_per_iteration": 2.5155129432678223 + }, + { + "auxiliary_loss_clip": 0.06474859, + "auxiliary_loss_mlp": 0.01271898, + "balance_loss_clip": 0.06293729, + "balance_loss_mlp": 0.01257807, + "epoch": 0.3603787764918082, + "flos": 17495871425280.0, + "grad_norm": 1.6487847999674183, + "language_loss": 0.74534261, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.82281017, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14086914, + "step": 5994, + "time_per_iteration": 2.647794723510742 + }, + { + "auxiliary_loss_clip": 0.06474246, + "auxiliary_loss_mlp": 0.01268785, + "balance_loss_clip": 0.06290799, + "balance_loss_mlp": 0.01254415, + "epoch": 0.3604388997444762, + "flos": 15528823701120.0, + "grad_norm": 1.8873654318884407, + "language_loss": 0.69500113, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14361572, + "step": 5995, + "time_per_iteration": 2.501981019973755 + }, + { + "auxiliary_loss_clip": 0.06479774, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06295834, + "balance_loss_mlp": 0.01261688, + "epoch": 0.36049902299714415, + "flos": 17316559687680.0, + "grad_norm": 1.8201062799427143, + "language_loss": 0.8309989, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.90856004, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14642334, + "step": 5996, + "time_per_iteration": 2.551095962524414 + }, + { + "auxiliary_loss_clip": 0.06472808, + "auxiliary_loss_mlp": 0.01275418, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01260493, + "epoch": 0.3605591462498121, + "flos": 17061749821440.0, + "grad_norm": 2.2503529028172804, + "language_loss": 0.73762429, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81510657, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14910889, + "step": 5997, + "time_per_iteration": 2.493100881576538 + }, + { + "auxiliary_loss_clip": 0.06465439, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06288476, + "balance_loss_mlp": 0.0125944, + "epoch": 0.3606192695024801, + "flos": 16842508813440.0, + "grad_norm": 2.0075843423569326, + "language_loss": 0.69582814, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.77322465, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14758301, + "step": 5998, + "time_per_iteration": 2.54227352142334 + }, + { + "auxiliary_loss_clip": 0.06468997, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06292363, + "balance_loss_mlp": 0.01258243, + "epoch": 0.36067939275514804, + "flos": 12134424401280.0, + "grad_norm": 2.607888629955908, + "language_loss": 0.77566224, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.8530767, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 5999, + "time_per_iteration": 2.456887722015381 + }, + { + "auxiliary_loss_clip": 0.06474666, + "auxiliary_loss_mlp": 0.01272087, + "balance_loss_clip": 0.06294585, + "balance_loss_mlp": 0.01257776, + "epoch": 0.360739516007816, + "flos": 18047390999040.0, + "grad_norm": 1.725953097254869, + "language_loss": 0.78777629, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.86524385, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14300537, + "step": 6000, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.06471578, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06294253, + "balance_loss_mlp": 0.01257854, + "epoch": 0.360799639260484, + "flos": 18555417504000.0, + "grad_norm": 1.7389483603698193, + "language_loss": 0.78602117, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86345226, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13671875, + "step": 6001, + "time_per_iteration": 2.4887304306030273 + }, + { + "auxiliary_loss_clip": 0.06462014, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.012549, + "epoch": 0.36085976251315194, + "flos": 19688029943040.0, + "grad_norm": 2.5640130860082206, + "language_loss": 0.83264118, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.90995204, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14178467, + "step": 6002, + "time_per_iteration": 2.523263931274414 + }, + { + "auxiliary_loss_clip": 0.06462792, + "auxiliary_loss_mlp": 0.01274754, + "balance_loss_clip": 0.06293326, + "balance_loss_mlp": 0.01262034, + "epoch": 0.3609198857658199, + "flos": 24204476067840.0, + "grad_norm": 2.058215255218527, + "language_loss": 0.91365647, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.991032, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.12713623, + "step": 6003, + "time_per_iteration": 2.5147922039031982 + }, + { + "auxiliary_loss_clip": 0.06424739, + "auxiliary_loss_mlp": 0.01257394, + "balance_loss_clip": 0.06345953, + "balance_loss_mlp": 0.01254351, + "epoch": 0.3609800090184879, + "flos": 57134288044800.0, + "grad_norm": 0.8495896975763515, + "language_loss": 0.53457719, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.61139846, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03041077, + "step": 6004, + "time_per_iteration": 3.1006038188934326 + }, + { + "auxiliary_loss_clip": 0.06473242, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06291834, + "balance_loss_mlp": 0.0125549, + "epoch": 0.3610401322711559, + "flos": 20817120510720.0, + "grad_norm": 1.7032625156204924, + "language_loss": 0.78291458, + "learning_rate": 2.956407517225883e-06, + "loss": 0.86035228, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15026855, + "step": 6005, + "time_per_iteration": 2.507681369781494 + }, + { + "auxiliary_loss_clip": 0.06466124, + "auxiliary_loss_mlp": 0.01274708, + "balance_loss_clip": 0.06289654, + "balance_loss_mlp": 0.01260373, + "epoch": 0.36110025552382385, + "flos": 13704302972160.0, + "grad_norm": 1.9788670063291258, + "language_loss": 0.79365236, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87106061, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14349365, + "step": 6006, + "time_per_iteration": 2.6221675872802734 + }, + { + "auxiliary_loss_clip": 0.06467897, + "auxiliary_loss_mlp": 0.01276481, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01260317, + "epoch": 0.3611603787764918, + "flos": 22461490961280.0, + "grad_norm": 1.8947484153914913, + "language_loss": 0.84532005, + "learning_rate": 2.955723356106876e-06, + "loss": 0.92276382, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.16162109, + "step": 6007, + "time_per_iteration": 2.5697944164276123 + }, + { + "auxiliary_loss_clip": 0.06477423, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06289505, + "balance_loss_mlp": 0.0126018, + "epoch": 0.3612205020291598, + "flos": 20892954055680.0, + "grad_norm": 2.2451481952848953, + "language_loss": 0.73192191, + "learning_rate": 2.955381221179198e-06, + "loss": 0.80945194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.1541748, + "step": 6008, + "time_per_iteration": 2.5410661697387695 + }, + { + "auxiliary_loss_clip": 0.06468849, + "auxiliary_loss_mlp": 0.01276747, + "balance_loss_clip": 0.06288531, + "balance_loss_mlp": 0.01262036, + "epoch": 0.36128062528182775, + "flos": 15747393876480.0, + "grad_norm": 2.0636796050179194, + "language_loss": 0.83194089, + "learning_rate": 2.955039050023368e-06, + "loss": 0.90939683, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1472168, + "step": 6009, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.06467466, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.012553, + "epoch": 0.3613407485344957, + "flos": 16770239066880.0, + "grad_norm": 1.996577445690206, + "language_loss": 0.7613554, + "learning_rate": 2.954696842652362e-06, + "loss": 0.83873594, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15258789, + "step": 6010, + "time_per_iteration": 2.501328468322754 + }, + { + "auxiliary_loss_clip": 0.064712, + "auxiliary_loss_mlp": 0.0127317, + "balance_loss_clip": 0.06292284, + "balance_loss_mlp": 0.01258734, + "epoch": 0.3614008717871637, + "flos": 20376625996800.0, + "grad_norm": 1.7565456089129825, + "language_loss": 0.8353886, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.91283226, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14440918, + "step": 6011, + "time_per_iteration": 2.5080785751342773 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01273027, + "balance_loss_clip": 0.06288376, + "balance_loss_mlp": 0.0125784, + "epoch": 0.36146099503983165, + "flos": 22782071882880.0, + "grad_norm": 2.5852128775447536, + "language_loss": 0.62982023, + "learning_rate": 2.954012319316727e-06, + "loss": 0.70728415, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15185547, + "step": 6012, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.06468817, + "auxiliary_loss_mlp": 0.01279391, + "balance_loss_clip": 0.06292222, + "balance_loss_mlp": 0.01264728, + "epoch": 0.3615211182924996, + "flos": 23002277212800.0, + "grad_norm": 2.060645495819417, + "language_loss": 0.83850408, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.91598618, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14648438, + "step": 6013, + "time_per_iteration": 2.511187791824341 + }, + { + "auxiliary_loss_clip": 0.06469796, + "auxiliary_loss_mlp": 0.01276155, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01259501, + "epoch": 0.3615812415451676, + "flos": 16652631461760.0, + "grad_norm": 1.9072870373759168, + "language_loss": 0.92107058, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.99853015, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.16638184, + "step": 6014, + "time_per_iteration": 2.498011350631714 + }, + { + "auxiliary_loss_clip": 0.06466013, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01260097, + "epoch": 0.36164136479783554, + "flos": 21325733994240.0, + "grad_norm": 8.045361949377702, + "language_loss": 0.73973721, + "learning_rate": 2.95298526302391e-06, + "loss": 0.81715214, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15393066, + "step": 6015, + "time_per_iteration": 2.5139665603637695 + }, + { + "auxiliary_loss_clip": 0.0646963, + "auxiliary_loss_mlp": 0.01277804, + "balance_loss_clip": 0.06291166, + "balance_loss_mlp": 0.01262151, + "epoch": 0.3617014880505035, + "flos": 24176286368640.0, + "grad_norm": 1.9455925595590893, + "language_loss": 0.65181047, + "learning_rate": 2.9526428386344e-06, + "loss": 0.72928476, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15637207, + "step": 6016, + "time_per_iteration": 2.5485315322875977 + }, + { + "auxiliary_loss_clip": 0.06469464, + "auxiliary_loss_mlp": 0.01276058, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.01259261, + "epoch": 0.3617616113031715, + "flos": 39023278997760.0, + "grad_norm": 1.6846943976812254, + "language_loss": 0.72102833, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.79848349, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16784668, + "step": 6017, + "time_per_iteration": 2.6685996055603027 + }, + { + "auxiliary_loss_clip": 0.06470844, + "auxiliary_loss_mlp": 0.01272479, + "balance_loss_clip": 0.06287402, + "balance_loss_mlp": 0.01256886, + "epoch": 0.3618217345558395, + "flos": 12135807993600.0, + "grad_norm": 2.3155685522099962, + "language_loss": 0.74387789, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15600586, + "step": 6018, + "time_per_iteration": 3.93249249458313 + }, + { + "auxiliary_loss_clip": 0.06458628, + "auxiliary_loss_mlp": 0.01273986, + "balance_loss_clip": 0.06287278, + "balance_loss_mlp": 0.0125856, + "epoch": 0.36188185780850746, + "flos": 24941722216320.0, + "grad_norm": 2.406612181934337, + "language_loss": 0.69554305, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.77286923, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1541748, + "step": 6019, + "time_per_iteration": 4.000872373580933 + }, + { + "auxiliary_loss_clip": 0.06472806, + "auxiliary_loss_mlp": 0.01271681, + "balance_loss_clip": 0.0628852, + "balance_loss_mlp": 0.01255815, + "epoch": 0.3619419810611754, + "flos": 20965014167040.0, + "grad_norm": 2.953778610066193, + "language_loss": 0.76874363, + "learning_rate": 2.95127277996311e-06, + "loss": 0.84618843, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15869141, + "step": 6020, + "time_per_iteration": 2.5465614795684814 + }, + { + "auxiliary_loss_clip": 0.06471147, + "auxiliary_loss_mlp": 0.01273965, + "balance_loss_clip": 0.06288891, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3620021043138434, + "flos": 22535521643520.0, + "grad_norm": 2.2311166939070097, + "language_loss": 0.74090236, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81835353, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16687012, + "step": 6021, + "time_per_iteration": 2.57817006111145 + }, + { + "auxiliary_loss_clip": 0.06467178, + "auxiliary_loss_mlp": 0.01270658, + "balance_loss_clip": 0.0628859, + "balance_loss_mlp": 0.01255685, + "epoch": 0.36206222756651135, + "flos": 15602183550720.0, + "grad_norm": 5.238961551513005, + "language_loss": 0.81591839, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.89329672, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1496582, + "step": 6022, + "time_per_iteration": 2.5385305881500244 + }, + { + "auxiliary_loss_clip": 0.06457289, + "auxiliary_loss_mlp": 0.0127544, + "balance_loss_clip": 0.06285636, + "balance_loss_mlp": 0.01260349, + "epoch": 0.3621223508191793, + "flos": 23594019546240.0, + "grad_norm": 2.318322058767841, + "language_loss": 0.81707698, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89440429, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15075684, + "step": 6023, + "time_per_iteration": 2.604048013687134 + }, + { + "auxiliary_loss_clip": 0.0647051, + "auxiliary_loss_mlp": 0.01276448, + "balance_loss_clip": 0.06288643, + "balance_loss_mlp": 0.01259795, + "epoch": 0.3621824740718473, + "flos": 22316490270720.0, + "grad_norm": 2.4056275848880038, + "language_loss": 0.80008531, + "learning_rate": 2.9499021441341e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16650391, + "step": 6024, + "time_per_iteration": 3.9998557567596436 + }, + { + "auxiliary_loss_clip": 0.06462081, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06288754, + "balance_loss_mlp": 0.01258599, + "epoch": 0.36224259732451525, + "flos": 16769232817920.0, + "grad_norm": 2.2201652107227354, + "language_loss": 0.75149572, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.82885349, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15112305, + "step": 6025, + "time_per_iteration": 2.5139317512512207 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01274175, + "balance_loss_clip": 0.06290425, + "balance_loss_mlp": 0.01260198, + "epoch": 0.3623027205771832, + "flos": 23156585706240.0, + "grad_norm": 1.704945166995659, + "language_loss": 0.72471905, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.80212557, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13989258, + "step": 6026, + "time_per_iteration": 3.974848985671997 + }, + { + "auxiliary_loss_clip": 0.06476888, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01260233, + "epoch": 0.3623628438298512, + "flos": 28556829970560.0, + "grad_norm": 1.945563554904942, + "language_loss": 0.79502189, + "learning_rate": 2.948873789002833e-06, + "loss": 0.87255979, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16662598, + "step": 6027, + "time_per_iteration": 2.614713430404663 + }, + { + "auxiliary_loss_clip": 0.06469107, + "auxiliary_loss_mlp": 0.01272818, + "balance_loss_clip": 0.06288799, + "balance_loss_mlp": 0.01256427, + "epoch": 0.36242296708251914, + "flos": 25492193614080.0, + "grad_norm": 4.95803648299326, + "language_loss": 0.68042505, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.75784421, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16381836, + "step": 6028, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.06467344, + "auxiliary_loss_mlp": 0.01275782, + "balance_loss_clip": 0.0629041, + "balance_loss_mlp": 0.01260273, + "epoch": 0.3624830903351871, + "flos": 16296062411520.0, + "grad_norm": 2.2968183263714983, + "language_loss": 0.85463655, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.93206775, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1550293, + "step": 6029, + "time_per_iteration": 2.519960403442383 + }, + { + "auxiliary_loss_clip": 0.06462874, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.0628645, + "balance_loss_mlp": 0.01255107, + "epoch": 0.36254321358785513, + "flos": 18302200865280.0, + "grad_norm": 1.7460468862336926, + "language_loss": 0.72888201, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.80621189, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15008545, + "step": 6030, + "time_per_iteration": 2.480053663253784 + }, + { + "auxiliary_loss_clip": 0.06476077, + "auxiliary_loss_mlp": 0.01274605, + "balance_loss_clip": 0.06291036, + "balance_loss_mlp": 0.01257558, + "epoch": 0.3626033368405231, + "flos": 14870387917440.0, + "grad_norm": 3.30241855147188, + "language_loss": 0.75249928, + "learning_rate": 2.94750214514905e-06, + "loss": 0.83000606, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.17041016, + "step": 6031, + "time_per_iteration": 2.4887540340423584 + }, + { + "auxiliary_loss_clip": 0.06465365, + "auxiliary_loss_mlp": 0.01279599, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01264245, + "epoch": 0.36266346009319106, + "flos": 22312632983040.0, + "grad_norm": 2.377019393957944, + "language_loss": 0.73490477, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81235439, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15344238, + "step": 6032, + "time_per_iteration": 2.5194106101989746 + }, + { + "auxiliary_loss_clip": 0.06471337, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06290144, + "balance_loss_mlp": 0.01258776, + "epoch": 0.362723583345859, + "flos": 18228044401920.0, + "grad_norm": 1.8908046818451942, + "language_loss": 0.78089464, + "learning_rate": 2.946816107593884e-06, + "loss": 0.85834849, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15270996, + "step": 6033, + "time_per_iteration": 2.6062612533569336 + }, + { + "auxiliary_loss_clip": 0.06434236, + "auxiliary_loss_mlp": 0.01267532, + "balance_loss_clip": 0.06350702, + "balance_loss_mlp": 0.01264055, + "epoch": 0.362783706598527, + "flos": 68519307456000.0, + "grad_norm": 0.7613876705351186, + "language_loss": 0.64809752, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.72511524, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.03485107, + "step": 6034, + "time_per_iteration": 3.216454267501831 + }, + { + "auxiliary_loss_clip": 0.06466131, + "auxiliary_loss_mlp": 0.01276184, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01260222, + "epoch": 0.36284382985119495, + "flos": 26583535117440.0, + "grad_norm": 2.053623051898619, + "language_loss": 0.89456552, + "learning_rate": 2.946129926425273e-06, + "loss": 0.97198874, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.15966797, + "step": 6035, + "time_per_iteration": 2.5606629848480225 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01272395, + "balance_loss_clip": 0.06295764, + "balance_loss_mlp": 0.0125592, + "epoch": 0.3629039531038629, + "flos": 20162919358080.0, + "grad_norm": 1.7740824971358589, + "language_loss": 0.73855877, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.81607592, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.16455078, + "step": 6036, + "time_per_iteration": 2.5144500732421875 + }, + { + "auxiliary_loss_clip": 0.06482191, + "auxiliary_loss_mlp": 0.01272832, + "balance_loss_clip": 0.06296846, + "balance_loss_mlp": 0.01256823, + "epoch": 0.3629640763565309, + "flos": 18631838027520.0, + "grad_norm": 1.8050884717083873, + "language_loss": 0.76438695, + "learning_rate": 2.945443601747297e-06, + "loss": 0.84193718, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16015625, + "step": 6037, + "time_per_iteration": 2.5286643505096436 + }, + { + "auxiliary_loss_clip": 0.06467965, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06292737, + "balance_loss_mlp": 0.01262546, + "epoch": 0.36302419960919885, + "flos": 19577256445440.0, + "grad_norm": 1.633141884703147, + "language_loss": 0.78871524, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86617458, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1541748, + "step": 6038, + "time_per_iteration": 2.5062947273254395 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 0.06318134, + "balance_loss_mlp": 0.01257723, + "epoch": 0.3630843228618668, + "flos": 63817805589120.0, + "grad_norm": 0.8140528620617334, + "language_loss": 0.63225597, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.70887518, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.03512573, + "step": 6039, + "time_per_iteration": 3.269761323928833 + }, + { + "auxiliary_loss_clip": 0.06467007, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06289599, + "balance_loss_mlp": 0.01253932, + "epoch": 0.3631444461145348, + "flos": 21841600855680.0, + "grad_norm": 2.592040544468795, + "language_loss": 0.71409321, + "learning_rate": 2.944413845878002e-06, + "loss": 0.79146034, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15783691, + "step": 6040, + "time_per_iteration": 2.5549709796905518 + }, + { + "auxiliary_loss_clip": 0.06477243, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 0.06293249, + "balance_loss_mlp": 0.01260277, + "epoch": 0.36320456936720275, + "flos": 21727850538240.0, + "grad_norm": 1.6745525965006305, + "language_loss": 0.81387192, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89140832, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.16113281, + "step": 6041, + "time_per_iteration": 2.529555320739746 + }, + { + "auxiliary_loss_clip": 0.06473525, + "auxiliary_loss_mlp": 0.01278326, + "balance_loss_clip": 0.0629223, + "balance_loss_mlp": 0.01261291, + "epoch": 0.3632646926198707, + "flos": 17024713516800.0, + "grad_norm": 3.0330286867158547, + "language_loss": 0.8477391, + "learning_rate": 2.943727162882107e-06, + "loss": 0.92525762, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.17016602, + "step": 6042, + "time_per_iteration": 2.52242112159729 + }, + { + "auxiliary_loss_clip": 0.06469671, + "auxiliary_loss_mlp": 0.01277961, + "balance_loss_clip": 0.06290909, + "balance_loss_mlp": 0.01261892, + "epoch": 0.36332481587253873, + "flos": 23337868014720.0, + "grad_norm": 1.7311470578574424, + "language_loss": 0.78563523, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.86311156, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.16088867, + "step": 6043, + "time_per_iteration": 2.5507187843322754 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.0127573, + "balance_loss_clip": 0.06289753, + "balance_loss_mlp": 0.01258755, + "epoch": 0.3633849391252067, + "flos": 10748134126080.0, + "grad_norm": 2.0752100798218245, + "language_loss": 0.66141021, + "learning_rate": 2.943040336741298e-06, + "loss": 0.73882145, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16967773, + "step": 6044, + "time_per_iteration": 2.5431315898895264 + }, + { + "auxiliary_loss_clip": 0.06470387, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06293066, + "balance_loss_mlp": 0.01258794, + "epoch": 0.36344506237787466, + "flos": 25856351458560.0, + "grad_norm": 1.7019744870222642, + "language_loss": 0.81317604, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89061964, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15185547, + "step": 6045, + "time_per_iteration": 2.578608274459839 + }, + { + "auxiliary_loss_clip": 0.06471765, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 0.06291001, + "balance_loss_mlp": 0.01260977, + "epoch": 0.3635051856305426, + "flos": 30161900056320.0, + "grad_norm": 1.9031490691130954, + "language_loss": 0.64869618, + "learning_rate": 2.942353367559755e-06, + "loss": 0.72618413, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16064453, + "step": 6046, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.06469898, + "auxiliary_loss_mlp": 0.01279877, + "balance_loss_clip": 0.06291277, + "balance_loss_mlp": 0.01264082, + "epoch": 0.3635653088832106, + "flos": 22204626670080.0, + "grad_norm": 1.4883910134219482, + "language_loss": 0.77790976, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.85540754, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15783691, + "step": 6047, + "time_per_iteration": 2.59384822845459 + }, + { + "auxiliary_loss_clip": 0.06482202, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 0.0629375, + "balance_loss_mlp": 0.01259794, + "epoch": 0.36362543213587856, + "flos": 24793409289600.0, + "grad_norm": 2.402065763679051, + "language_loss": 0.79315472, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87075114, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.1763916, + "step": 6048, + "time_per_iteration": 2.586355447769165 + }, + { + "auxiliary_loss_clip": 0.06388409, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06308184, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3636855553885465, + "flos": 62547320056320.0, + "grad_norm": 0.756250652706744, + "language_loss": 0.52505761, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.6017015, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.03759766, + "step": 6049, + "time_per_iteration": 3.1991608142852783 + }, + { + "auxiliary_loss_clip": 0.06471006, + "auxiliary_loss_mlp": 0.01281005, + "balance_loss_clip": 0.06289691, + "balance_loss_mlp": 0.01264518, + "epoch": 0.3637456786412145, + "flos": 24067441514880.0, + "grad_norm": 1.9518715754512581, + "language_loss": 0.8677333, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.94525343, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16467285, + "step": 6050, + "time_per_iteration": 2.619880437850952 + }, + { + "auxiliary_loss_clip": 0.06465575, + "auxiliary_loss_mlp": 0.01288294, + "balance_loss_clip": 0.06288004, + "balance_loss_mlp": 0.01271784, + "epoch": 0.36380580189388245, + "flos": 16697214633600.0, + "grad_norm": 2.0514222430242937, + "language_loss": 0.78671187, + "learning_rate": 2.940635319486546e-06, + "loss": 0.86425054, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.16491699, + "step": 6051, + "time_per_iteration": 2.5192694664001465 + }, + { + "auxiliary_loss_clip": 0.064697, + "auxiliary_loss_mlp": 0.0128748, + "balance_loss_clip": 0.06289212, + "balance_loss_mlp": 0.01271315, + "epoch": 0.3638659251465504, + "flos": 25120279267200.0, + "grad_norm": 2.1218426019343943, + "language_loss": 0.82423818, + "learning_rate": 2.940291602812822e-06, + "loss": 0.90180993, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16174316, + "step": 6052, + "time_per_iteration": 2.6190178394317627 + }, + { + "auxiliary_loss_clip": 0.06462704, + "auxiliary_loss_mlp": 0.01293914, + "balance_loss_clip": 0.06289209, + "balance_loss_mlp": 0.0127831, + "epoch": 0.3639260483992184, + "flos": 23009698298880.0, + "grad_norm": 1.6976848198598335, + "language_loss": 0.72702307, + "learning_rate": 2.939947850483145e-06, + "loss": 0.80458927, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.15588379, + "step": 6053, + "time_per_iteration": 2.5632545948028564 + }, + { + "auxiliary_loss_clip": 0.0637124, + "auxiliary_loss_mlp": 0.0126271, + "balance_loss_clip": 0.06291765, + "balance_loss_mlp": 0.01258046, + "epoch": 0.36398617165188635, + "flos": 70735043698560.0, + "grad_norm": 0.7367280535398725, + "language_loss": 0.61109686, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.68743634, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.04656982, + "step": 6054, + "time_per_iteration": 3.1670703887939453 + }, + { + "auxiliary_loss_clip": 0.06468257, + "auxiliary_loss_mlp": 0.01284514, + "balance_loss_clip": 0.06288631, + "balance_loss_mlp": 0.01267062, + "epoch": 0.3640462949045543, + "flos": 22241788755840.0, + "grad_norm": 2.4941401517388795, + "language_loss": 0.76399368, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.84152138, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.17456055, + "step": 6055, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.06463572, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01264023, + "epoch": 0.3641064181572223, + "flos": 21549964320000.0, + "grad_norm": 1.5003458585655993, + "language_loss": 0.75247842, + "learning_rate": 2.938916379688765e-06, + "loss": 0.82992232, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16784668, + "step": 6056, + "time_per_iteration": 2.548563241958618 + }, + { + "auxiliary_loss_clip": 0.06463505, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 0.06286436, + "balance_loss_mlp": 0.01271805, + "epoch": 0.3641665414098903, + "flos": 22279873236480.0, + "grad_norm": 1.8427248639079936, + "language_loss": 0.80231911, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.87983549, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16320801, + "step": 6057, + "time_per_iteration": 2.590890645980835 + }, + { + "auxiliary_loss_clip": 0.06463237, + "auxiliary_loss_mlp": 0.01288366, + "balance_loss_clip": 0.06286855, + "balance_loss_mlp": 0.01271259, + "epoch": 0.36422666466255826, + "flos": 28337211619200.0, + "grad_norm": 2.0267495677395106, + "language_loss": 0.80895132, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.88646734, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.17114258, + "step": 6058, + "time_per_iteration": 3.9912350177764893 + }, + { + "auxiliary_loss_clip": 0.06462751, + "auxiliary_loss_mlp": 0.01284352, + "balance_loss_clip": 0.06282878, + "balance_loss_mlp": 0.01267794, + "epoch": 0.36428678791522623, + "flos": 24177376471680.0, + "grad_norm": 1.829086801108262, + "language_loss": 0.84467566, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.9221468, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16552734, + "step": 6059, + "time_per_iteration": 3.9484288692474365 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01290231, + "balance_loss_clip": 0.06284287, + "balance_loss_mlp": 0.01274006, + "epoch": 0.3643469111678942, + "flos": 22535018519040.0, + "grad_norm": 1.8662633122766634, + "language_loss": 0.88296366, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96050501, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16223145, + "step": 6060, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.06469811, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.06287585, + "balance_loss_mlp": 0.01260611, + "epoch": 0.36440703442056216, + "flos": 19432549244160.0, + "grad_norm": 2.050716636944588, + "language_loss": 0.66968513, + "learning_rate": 2.937196549795971e-06, + "loss": 0.74716496, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.17578125, + "step": 6061, + "time_per_iteration": 2.4934303760528564 + }, + { + "auxiliary_loss_clip": 0.06472699, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06290831, + "balance_loss_mlp": 0.01259283, + "epoch": 0.3644671576732301, + "flos": 18046300896000.0, + "grad_norm": 2.6099029342135838, + "language_loss": 0.76223081, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.83971971, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16918945, + "step": 6062, + "time_per_iteration": 2.5342442989349365 + }, + { + "auxiliary_loss_clip": 0.06462175, + "auxiliary_loss_mlp": 0.01277866, + "balance_loss_clip": 0.06284274, + "balance_loss_mlp": 0.01261844, + "epoch": 0.3645272809258981, + "flos": 21549125779200.0, + "grad_norm": 1.679264330509425, + "language_loss": 0.7250427, + "learning_rate": 2.936508368977432e-06, + "loss": 0.80244315, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16027832, + "step": 6063, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.06463223, + "auxiliary_loss_mlp": 0.01278838, + "balance_loss_clip": 0.0628884, + "balance_loss_mlp": 0.0126256, + "epoch": 0.36458740417856605, + "flos": 22753379059200.0, + "grad_norm": 1.9927269992491163, + "language_loss": 0.67982519, + "learning_rate": 2.936164225292901e-06, + "loss": 0.75724578, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.16265869, + "step": 6064, + "time_per_iteration": 4.001475095748901 + }, + { + "auxiliary_loss_clip": 0.06469691, + "auxiliary_loss_mlp": 0.01281677, + "balance_loss_clip": 0.06288914, + "balance_loss_mlp": 0.01265131, + "epoch": 0.364647527431234, + "flos": 26147862213120.0, + "grad_norm": 2.2981357468080725, + "language_loss": 0.75006247, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.82757616, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16540527, + "step": 6065, + "time_per_iteration": 2.557175397872925 + }, + { + "auxiliary_loss_clip": 0.06475934, + "auxiliary_loss_mlp": 0.01274844, + "balance_loss_clip": 0.06292161, + "balance_loss_mlp": 0.01257487, + "epoch": 0.364707650683902, + "flos": 31037941693440.0, + "grad_norm": 1.8804228270875918, + "language_loss": 0.75913531, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.8366431, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.17370605, + "step": 6066, + "time_per_iteration": 4.028696537017822 + }, + { + "auxiliary_loss_clip": 0.06465262, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06290717, + "balance_loss_mlp": 0.01260124, + "epoch": 0.36476777393656995, + "flos": 19578933527040.0, + "grad_norm": 2.1324188585544293, + "language_loss": 0.77645338, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.85385728, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15014648, + "step": 6067, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.06463823, + "auxiliary_loss_mlp": 0.01273764, + "balance_loss_clip": 0.06289702, + "balance_loss_mlp": 0.01258684, + "epoch": 0.3648278971892379, + "flos": 17754622433280.0, + "grad_norm": 1.930394247385299, + "language_loss": 0.71678597, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7941618, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15063477, + "step": 6068, + "time_per_iteration": 2.4845492839813232 + }, + { + "auxiliary_loss_clip": 0.06473656, + "auxiliary_loss_mlp": 0.0127485, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.01258005, + "epoch": 0.3648880204419059, + "flos": 17936952917760.0, + "grad_norm": 1.8532098574136342, + "language_loss": 0.73989958, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.8173846, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16845703, + "step": 6069, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.06469753, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 0.06287999, + "balance_loss_mlp": 0.01261684, + "epoch": 0.3649481436945739, + "flos": 22644911548800.0, + "grad_norm": 1.9157179359535086, + "language_loss": 0.66736126, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.74483597, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.16027832, + "step": 6070, + "time_per_iteration": 2.516735076904297 + }, + { + "auxiliary_loss_clip": 0.06467332, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06291667, + "balance_loss_mlp": 0.01259169, + "epoch": 0.36500826694724187, + "flos": 21586036302720.0, + "grad_norm": 1.8858284323375742, + "language_loss": 0.7453323, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.82274926, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.1519165, + "step": 6071, + "time_per_iteration": 2.566274642944336 + }, + { + "auxiliary_loss_clip": 0.06468312, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.0629068, + "balance_loss_mlp": 0.0125703, + "epoch": 0.36506839019990983, + "flos": 13777746675840.0, + "grad_norm": 1.7184690359068113, + "language_loss": 0.88681865, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.96422982, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15771484, + "step": 6072, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.06471045, + "auxiliary_loss_mlp": 0.01276068, + "balance_loss_clip": 0.06292107, + "balance_loss_mlp": 0.01260285, + "epoch": 0.3651285134525778, + "flos": 17280739267200.0, + "grad_norm": 2.591250971390436, + "language_loss": 0.72601849, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.80348963, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15771484, + "step": 6073, + "time_per_iteration": 2.5448079109191895 + }, + { + "auxiliary_loss_clip": 0.06476631, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06296042, + "balance_loss_mlp": 0.0125422, + "epoch": 0.36518863670524576, + "flos": 21914415653760.0, + "grad_norm": 2.188049192517554, + "language_loss": 0.66876209, + "learning_rate": 2.932720838132236e-06, + "loss": 0.74623442, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16394043, + "step": 6074, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.06466351, + "auxiliary_loss_mlp": 0.01270864, + "balance_loss_clip": 0.06289779, + "balance_loss_mlp": 0.01255319, + "epoch": 0.3652487599579137, + "flos": 27128933343360.0, + "grad_norm": 1.455377552522792, + "language_loss": 0.73552799, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.81290013, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15551758, + "step": 6075, + "time_per_iteration": 2.5611414909362793 + }, + { + "auxiliary_loss_clip": 0.06476435, + "auxiliary_loss_mlp": 0.01270879, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01255107, + "epoch": 0.3653088832105817, + "flos": 19761683281920.0, + "grad_norm": 3.551310730384351, + "language_loss": 0.89872956, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97620273, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 6076, + "time_per_iteration": 2.491070508956909 + }, + { + "auxiliary_loss_clip": 0.06471214, + "auxiliary_loss_mlp": 0.01269524, + "balance_loss_clip": 0.06294619, + "balance_loss_mlp": 0.01253782, + "epoch": 0.36536900646324966, + "flos": 13119981724800.0, + "grad_norm": 1.9522812947590364, + "language_loss": 0.69894624, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7763536, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15740967, + "step": 6077, + "time_per_iteration": 2.5298445224761963 + }, + { + "auxiliary_loss_clip": 0.06367216, + "auxiliary_loss_mlp": 0.01255974, + "balance_loss_clip": 0.06288684, + "balance_loss_mlp": 0.0125196, + "epoch": 0.3654291297159176, + "flos": 71122848393600.0, + "grad_norm": 0.715882721223993, + "language_loss": 0.61670828, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.69294018, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.04013062, + "step": 6078, + "time_per_iteration": 3.245680093765259 + }, + { + "auxiliary_loss_clip": 0.06468864, + "auxiliary_loss_mlp": 0.01269715, + "balance_loss_clip": 0.0628942, + "balance_loss_mlp": 0.01254217, + "epoch": 0.3654892529685856, + "flos": 23623299348480.0, + "grad_norm": 2.6954686860737427, + "language_loss": 0.78565228, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86303806, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1550293, + "step": 6079, + "time_per_iteration": 2.5243916511535645 + }, + { + "auxiliary_loss_clip": 0.06474455, + "auxiliary_loss_mlp": 0.0127227, + "balance_loss_clip": 0.06293908, + "balance_loss_mlp": 0.01255557, + "epoch": 0.36554937622125355, + "flos": 43480788174720.0, + "grad_norm": 2.827080544182906, + "language_loss": 0.62854588, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.70601308, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16711426, + "step": 6080, + "time_per_iteration": 2.755979299545288 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06292675, + "balance_loss_mlp": 0.012568, + "epoch": 0.3656094994739215, + "flos": 23301334834560.0, + "grad_norm": 2.0380719718304046, + "language_loss": 0.68215913, + "learning_rate": 2.930308361895352e-06, + "loss": 0.75963295, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16674805, + "step": 6081, + "time_per_iteration": 2.5318713188171387 + }, + { + "auxiliary_loss_clip": 0.06476995, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 0.06289314, + "balance_loss_mlp": 0.01267021, + "epoch": 0.3656696227265895, + "flos": 24578947964160.0, + "grad_norm": 1.6214502004720641, + "language_loss": 0.75242162, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.83002377, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 6082, + "time_per_iteration": 2.614473819732666 + }, + { + "auxiliary_loss_clip": 0.06474194, + "auxiliary_loss_mlp": 0.0127049, + "balance_loss_clip": 0.06295186, + "balance_loss_mlp": 0.01255851, + "epoch": 0.3657297459792575, + "flos": 27935849761920.0, + "grad_norm": 4.519769037138984, + "language_loss": 0.83192384, + "learning_rate": 2.929618765277987e-06, + "loss": 0.90937066, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.14660645, + "step": 6083, + "time_per_iteration": 2.569382429122925 + }, + { + "auxiliary_loss_clip": 0.06373743, + "auxiliary_loss_mlp": 0.01258609, + "balance_loss_clip": 0.06293802, + "balance_loss_mlp": 0.01254855, + "epoch": 0.36578986923192547, + "flos": 67410566231040.0, + "grad_norm": 0.7897440828264927, + "language_loss": 0.59315842, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.66948193, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03747559, + "step": 6084, + "time_per_iteration": 3.2453150749206543 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06292025, + "balance_loss_mlp": 0.01258801, + "epoch": 0.36584999248459343, + "flos": 20233302387840.0, + "grad_norm": 1.9605927592145687, + "language_loss": 0.73469806, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81214333, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15734863, + "step": 6085, + "time_per_iteration": 2.5149080753326416 + }, + { + "auxiliary_loss_clip": 0.06475443, + "auxiliary_loss_mlp": 0.01272781, + "balance_loss_clip": 0.06296027, + "balance_loss_mlp": 0.01256974, + "epoch": 0.3659101157372614, + "flos": 19068475253760.0, + "grad_norm": 1.7755618246241633, + "language_loss": 0.78367889, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86116111, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15802002, + "step": 6086, + "time_per_iteration": 2.6959855556488037 + }, + { + "auxiliary_loss_clip": 0.06460601, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.0628686, + "balance_loss_mlp": 0.01262449, + "epoch": 0.36597023898992936, + "flos": 30818658758400.0, + "grad_norm": 2.7333963743808387, + "language_loss": 0.77419388, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.85157609, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15185547, + "step": 6087, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.06470397, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06288096, + "balance_loss_mlp": 0.0126543, + "epoch": 0.36603036224259733, + "flos": 20528041524480.0, + "grad_norm": 2.0856395013908005, + "language_loss": 0.70779794, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.78531569, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15948486, + "step": 6088, + "time_per_iteration": 2.5904111862182617 + }, + { + "auxiliary_loss_clip": 0.064822, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.06290494, + "balance_loss_mlp": 0.01258835, + "epoch": 0.3660904854952653, + "flos": 38339043356160.0, + "grad_norm": 1.5018444157956148, + "language_loss": 0.8073988, + "learning_rate": 2.92754912981472e-06, + "loss": 0.88499188, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.18273926, + "step": 6089, + "time_per_iteration": 2.695387125015259 + }, + { + "auxiliary_loss_clip": 0.06466638, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06289521, + "balance_loss_mlp": 0.01254065, + "epoch": 0.36615060874793326, + "flos": 21842062053120.0, + "grad_norm": 1.783943984741075, + "language_loss": 0.71745276, + "learning_rate": 2.927204067389884e-06, + "loss": 0.79480195, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14208984, + "step": 6090, + "time_per_iteration": 2.5730583667755127 + }, + { + "auxiliary_loss_clip": 0.06467035, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06292006, + "balance_loss_mlp": 0.01254585, + "epoch": 0.3662107320006012, + "flos": 16587153895680.0, + "grad_norm": 1.8168526275922985, + "language_loss": 0.74269617, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82006675, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.1541748, + "step": 6091, + "time_per_iteration": 2.5094668865203857 + }, + { + "auxiliary_loss_clip": 0.06470925, + "auxiliary_loss_mlp": 0.01271934, + "balance_loss_clip": 0.062924, + "balance_loss_mlp": 0.01256699, + "epoch": 0.3662708552532692, + "flos": 20964469115520.0, + "grad_norm": 2.9410218249320796, + "language_loss": 0.72888803, + "learning_rate": 2.926513837074284e-06, + "loss": 0.80631661, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15234375, + "step": 6092, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.06472248, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01260833, + "epoch": 0.36633097850593715, + "flos": 21908252378880.0, + "grad_norm": 2.382181592286333, + "language_loss": 0.78829455, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.86578685, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.16174316, + "step": 6093, + "time_per_iteration": 2.519925355911255 + }, + { + "auxiliary_loss_clip": 0.06470528, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06288898, + "balance_loss_mlp": 0.0125743, + "epoch": 0.3663911017586051, + "flos": 32862462422400.0, + "grad_norm": 1.6789792555665461, + "language_loss": 0.74561131, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82304573, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15478516, + "step": 6094, + "time_per_iteration": 2.6374077796936035 + }, + { + "auxiliary_loss_clip": 0.06470601, + "auxiliary_loss_mlp": 0.01277645, + "balance_loss_clip": 0.06289363, + "balance_loss_mlp": 0.01261421, + "epoch": 0.3664512250112731, + "flos": 27279132986880.0, + "grad_norm": 1.6273421100585188, + "language_loss": 0.7975142, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87499666, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16223145, + "step": 6095, + "time_per_iteration": 2.565009117126465 + }, + { + "auxiliary_loss_clip": 0.06480707, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06295107, + "balance_loss_mlp": 0.01258552, + "epoch": 0.3665113482639411, + "flos": 17790065510400.0, + "grad_norm": 2.4875649346087725, + "language_loss": 0.73963505, + "learning_rate": 2.925132954945834e-06, + "loss": 0.81719339, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16577148, + "step": 6096, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06474067, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 0.06288943, + "balance_loss_mlp": 0.01255901, + "epoch": 0.36657147151660907, + "flos": 27861944860800.0, + "grad_norm": 1.9533584433338151, + "language_loss": 0.67592847, + "learning_rate": 2.924787646678155e-06, + "loss": 0.75338453, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15649414, + "step": 6097, + "time_per_iteration": 4.085919618606567 + }, + { + "auxiliary_loss_clip": 0.06474558, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06292384, + "balance_loss_mlp": 0.01257204, + "epoch": 0.36663159476927704, + "flos": 25381000846080.0, + "grad_norm": 1.4284875999183062, + "language_loss": 0.77924675, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85672289, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15856934, + "step": 6098, + "time_per_iteration": 4.075935363769531 + }, + { + "auxiliary_loss_clip": 0.06469452, + "auxiliary_loss_mlp": 0.01270135, + "balance_loss_clip": 0.06291129, + "balance_loss_mlp": 0.01254751, + "epoch": 0.366691718021945, + "flos": 21362979934080.0, + "grad_norm": 2.6338542151665862, + "language_loss": 0.73907244, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15386963, + "step": 6099, + "time_per_iteration": 2.5343947410583496 + }, + { + "auxiliary_loss_clip": 0.06462912, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06286579, + "balance_loss_mlp": 0.01256695, + "epoch": 0.36675184127461297, + "flos": 16806017560320.0, + "grad_norm": 1.7024924966611934, + "language_loss": 0.84795189, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.92529464, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14660645, + "step": 6100, + "time_per_iteration": 2.5503897666931152 + }, + { + "auxiliary_loss_clip": 0.06478457, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06293124, + "balance_loss_mlp": 0.0125216, + "epoch": 0.36681196452728093, + "flos": 21912696645120.0, + "grad_norm": 2.268106387872694, + "language_loss": 0.712331, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.78979969, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.16235352, + "step": 6101, + "time_per_iteration": 2.5698294639587402 + }, + { + "auxiliary_loss_clip": 0.06474541, + "auxiliary_loss_mlp": 0.01273553, + "balance_loss_clip": 0.0629383, + "balance_loss_mlp": 0.01257137, + "epoch": 0.3668720877799489, + "flos": 17718215034240.0, + "grad_norm": 2.179497141372214, + "language_loss": 0.76701671, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.84449768, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16418457, + "step": 6102, + "time_per_iteration": 2.653047561645508 + }, + { + "auxiliary_loss_clip": 0.06477299, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.06290299, + "balance_loss_mlp": 0.01262099, + "epoch": 0.36693221103261686, + "flos": 47055882804480.0, + "grad_norm": 1.641444039565929, + "language_loss": 0.70188046, + "learning_rate": 2.922715061101625e-06, + "loss": 0.77944791, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17333984, + "step": 6103, + "time_per_iteration": 2.7502424716949463 + }, + { + "auxiliary_loss_clip": 0.06472746, + "auxiliary_loss_mlp": 0.01272056, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.01255581, + "epoch": 0.3669923342852848, + "flos": 15966383322240.0, + "grad_norm": 1.6662921664183201, + "language_loss": 0.71920598, + "learning_rate": 2.922369507632716e-06, + "loss": 0.79665399, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.16467285, + "step": 6104, + "time_per_iteration": 3.993805408477783 + }, + { + "auxiliary_loss_clip": 0.0647142, + "auxiliary_loss_mlp": 0.01272456, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01256494, + "epoch": 0.3670524575379528, + "flos": 19980630800640.0, + "grad_norm": 1.7978052174853272, + "language_loss": 0.81448174, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.89192045, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15966797, + "step": 6105, + "time_per_iteration": 3.907820463180542 + }, + { + "auxiliary_loss_clip": 0.06477002, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06288886, + "balance_loss_mlp": 0.01254896, + "epoch": 0.36711258079062076, + "flos": 25710092956800.0, + "grad_norm": 1.7139492182529468, + "language_loss": 0.81421959, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89171767, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17919922, + "step": 6106, + "time_per_iteration": 2.5623860359191895 + }, + { + "auxiliary_loss_clip": 0.06422871, + "auxiliary_loss_mlp": 0.01259281, + "balance_loss_clip": 0.06342293, + "balance_loss_mlp": 0.01254903, + "epoch": 0.3671727040432887, + "flos": 60793014648960.0, + "grad_norm": 0.6928078159632836, + "language_loss": 0.59215379, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.66897523, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04385376, + "step": 6107, + "time_per_iteration": 3.2451207637786865 + }, + { + "auxiliary_loss_clip": 0.06468046, + "auxiliary_loss_mlp": 0.01273048, + "balance_loss_clip": 0.06291793, + "balance_loss_mlp": 0.01257396, + "epoch": 0.3672328272959567, + "flos": 18667281104640.0, + "grad_norm": 1.5826982165866754, + "language_loss": 0.74750638, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82491726, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15631104, + "step": 6108, + "time_per_iteration": 2.5317509174346924 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06289458, + "balance_loss_mlp": 0.01260482, + "epoch": 0.3672929505486247, + "flos": 15054395483520.0, + "grad_norm": 2.0251921146130547, + "language_loss": 0.74524188, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.82272649, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.15490723, + "step": 6109, + "time_per_iteration": 2.530214309692383 + }, + { + "auxiliary_loss_clip": 0.06464404, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 0.06286883, + "balance_loss_mlp": 0.0125503, + "epoch": 0.3673530738012927, + "flos": 20594693047680.0, + "grad_norm": 1.6431777634434088, + "language_loss": 0.53560948, + "learning_rate": 2.920295452774744e-06, + "loss": 0.61295497, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15112305, + "step": 6110, + "time_per_iteration": 2.5247035026550293 + }, + { + "auxiliary_loss_clip": 0.06459565, + "auxiliary_loss_mlp": 0.01275062, + "balance_loss_clip": 0.06284792, + "balance_loss_mlp": 0.01258957, + "epoch": 0.36741319705396064, + "flos": 21696348602880.0, + "grad_norm": 1.814369900920369, + "language_loss": 0.80767608, + "learning_rate": 2.919949654746672e-06, + "loss": 0.8850224, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.16088867, + "step": 6111, + "time_per_iteration": 2.6213719844818115 + }, + { + "auxiliary_loss_clip": 0.06459287, + "auxiliary_loss_mlp": 0.01273038, + "balance_loss_clip": 0.06284556, + "balance_loss_mlp": 0.01256861, + "epoch": 0.3674733203066286, + "flos": 29870011958400.0, + "grad_norm": 1.7131296557309772, + "language_loss": 0.72860467, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80592787, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.16174316, + "step": 6112, + "time_per_iteration": 2.656101703643799 + }, + { + "auxiliary_loss_clip": 0.06459092, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06283998, + "balance_loss_mlp": 0.01257866, + "epoch": 0.36753344355929657, + "flos": 18262439303040.0, + "grad_norm": 1.5099687925303509, + "language_loss": 0.85667342, + "learning_rate": 2.919257954049892e-06, + "loss": 0.93399429, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15124512, + "step": 6113, + "time_per_iteration": 2.5230536460876465 + }, + { + "auxiliary_loss_clip": 0.06460717, + "auxiliary_loss_mlp": 0.01276985, + "balance_loss_clip": 0.06281444, + "balance_loss_mlp": 0.01260439, + "epoch": 0.36759356681196453, + "flos": 25308144120960.0, + "grad_norm": 1.9025835930032806, + "language_loss": 0.78706479, + "learning_rate": 2.918912051407413e-06, + "loss": 0.86444181, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.16540527, + "step": 6114, + "time_per_iteration": 2.6091229915618896 + }, + { + "auxiliary_loss_clip": 0.06466475, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.0628548, + "balance_loss_mlp": 0.01255725, + "epoch": 0.3676536900646325, + "flos": 21039338338560.0, + "grad_norm": 1.6305517572579116, + "language_loss": 0.67626929, + "learning_rate": 2.918566113919698e-06, + "loss": 0.75366318, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.17199707, + "step": 6115, + "time_per_iteration": 2.5226221084594727 + }, + { + "auxiliary_loss_clip": 0.06454025, + "auxiliary_loss_mlp": 0.01272139, + "balance_loss_clip": 0.06280309, + "balance_loss_mlp": 0.01257077, + "epoch": 0.36771381331730046, + "flos": 16293882205440.0, + "grad_norm": 2.2835896682412105, + "language_loss": 0.76996851, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.84723008, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15063477, + "step": 6116, + "time_per_iteration": 2.504951238632202 + }, + { + "auxiliary_loss_clip": 0.06459618, + "auxiliary_loss_mlp": 0.01274615, + "balance_loss_clip": 0.06282905, + "balance_loss_mlp": 0.01259153, + "epoch": 0.36777393656996843, + "flos": 22316574124800.0, + "grad_norm": 1.8264539284878285, + "language_loss": 0.62890095, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.70624328, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15454102, + "step": 6117, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.06458353, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06285255, + "balance_loss_mlp": 0.01254749, + "epoch": 0.3678340598226364, + "flos": 26841405657600.0, + "grad_norm": 1.7359331247938332, + "language_loss": 0.73532575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.81261057, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6118, + "time_per_iteration": 2.6261374950408936 + }, + { + "auxiliary_loss_clip": 0.06469986, + "auxiliary_loss_mlp": 0.01276003, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01259707, + "epoch": 0.36789418307530436, + "flos": 21768073297920.0, + "grad_norm": 1.5781425493049515, + "language_loss": 0.73047614, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.80793607, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 6119, + "time_per_iteration": 2.5320048332214355 + }, + { + "auxiliary_loss_clip": 0.06466002, + "auxiliary_loss_mlp": 0.0127303, + "balance_loss_clip": 0.06290065, + "balance_loss_mlp": 0.0125789, + "epoch": 0.3679543063279723, + "flos": 15929598579840.0, + "grad_norm": 2.0565678381587307, + "language_loss": 0.8018201, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.87921047, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15136719, + "step": 6120, + "time_per_iteration": 2.5085418224334717 + }, + { + "auxiliary_loss_clip": 0.06467941, + "auxiliary_loss_mlp": 0.01276389, + "balance_loss_clip": 0.0629365, + "balance_loss_mlp": 0.01260868, + "epoch": 0.3680144295806403, + "flos": 24281693205120.0, + "grad_norm": 2.0719591239633703, + "language_loss": 0.64803445, + "learning_rate": 2.916489757978126e-06, + "loss": 0.72547781, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15515137, + "step": 6121, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.06466727, + "auxiliary_loss_mlp": 0.01268749, + "balance_loss_clip": 0.06293779, + "balance_loss_mlp": 0.01254527, + "epoch": 0.36807455283330826, + "flos": 26111329032960.0, + "grad_norm": 1.9648479350594452, + "language_loss": 0.71416938, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.79152405, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14221191, + "step": 6122, + "time_per_iteration": 2.5836074352264404 + }, + { + "auxiliary_loss_clip": 0.06461313, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.0125831, + "epoch": 0.3681346760859763, + "flos": 24651972397440.0, + "grad_norm": 1.8972357597085572, + "language_loss": 0.69858962, + "learning_rate": 2.915797361163875e-06, + "loss": 0.77593577, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15002441, + "step": 6123, + "time_per_iteration": 2.5574307441711426 + }, + { + "auxiliary_loss_clip": 0.06474412, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06293641, + "balance_loss_mlp": 0.01256094, + "epoch": 0.36819479933864424, + "flos": 23885152957440.0, + "grad_norm": 2.796866262853862, + "language_loss": 0.74766016, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.8251307, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.16540527, + "step": 6124, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.06470435, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.01258116, + "epoch": 0.3682549225913122, + "flos": 25560606072960.0, + "grad_norm": 3.2532876436035236, + "language_loss": 0.74467599, + "learning_rate": 2.915104825441114e-06, + "loss": 0.82212794, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16625977, + "step": 6125, + "time_per_iteration": 2.5822880268096924 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296605, + "balance_loss_mlp": 0.01253967, + "epoch": 0.36831504584398017, + "flos": 16952317989120.0, + "grad_norm": 1.938795434914092, + "language_loss": 0.7843706, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.86184579, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16809082, + "step": 6126, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.06483818, + "auxiliary_loss_mlp": 0.01275366, + "balance_loss_clip": 0.06301596, + "balance_loss_mlp": 0.01257413, + "epoch": 0.36837516909664814, + "flos": 19871198968320.0, + "grad_norm": 2.3034543329783173, + "language_loss": 0.66139042, + "learning_rate": 2.914412150914888e-06, + "loss": 0.73898232, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.17980957, + "step": 6127, + "time_per_iteration": 2.5208253860473633 + }, + { + "auxiliary_loss_clip": 0.06475674, + "auxiliary_loss_mlp": 0.01272228, + "balance_loss_clip": 0.06294744, + "balance_loss_mlp": 0.01256409, + "epoch": 0.3684352923493161, + "flos": 37634976224640.0, + "grad_norm": 1.7597572196634643, + "language_loss": 0.70472896, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.78220791, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.15808105, + "step": 6128, + "time_per_iteration": 2.6984474658966064 + }, + { + "auxiliary_loss_clip": 0.06467833, + "auxiliary_loss_mlp": 0.01270944, + "balance_loss_clip": 0.06293194, + "balance_loss_mlp": 0.01255613, + "epoch": 0.36849541560198407, + "flos": 14470786995840.0, + "grad_norm": 1.6868142680460214, + "language_loss": 0.7591843, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.83657211, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15332031, + "step": 6129, + "time_per_iteration": 2.49924898147583 + }, + { + "auxiliary_loss_clip": 0.06473218, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 0.06296876, + "balance_loss_mlp": 0.01255844, + "epoch": 0.36855553885465203, + "flos": 25777037969280.0, + "grad_norm": 1.6502765336301308, + "language_loss": 0.85087365, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.92831397, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.1496582, + "step": 6130, + "time_per_iteration": 2.604851484298706 + }, + { + "auxiliary_loss_clip": 0.06391466, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06313837, + "balance_loss_mlp": 0.01261091, + "epoch": 0.36861566210732, + "flos": 65071715212800.0, + "grad_norm": 0.7916436629428728, + "language_loss": 0.60275888, + "learning_rate": 2.913026385872321e-06, + "loss": 0.67931175, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02740479, + "step": 6131, + "time_per_iteration": 3.228571891784668 + }, + { + "auxiliary_loss_clip": 0.0647023, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06296837, + "balance_loss_mlp": 0.01255332, + "epoch": 0.36867578535998796, + "flos": 30962108148480.0, + "grad_norm": 1.7580055354180455, + "language_loss": 0.73204952, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.8094579, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.152771, + "step": 6132, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.06478602, + "auxiliary_loss_mlp": 0.01273616, + "balance_loss_clip": 0.0629575, + "balance_loss_mlp": 0.0125738, + "epoch": 0.3687359086126559, + "flos": 28845154270080.0, + "grad_norm": 1.8077518075699008, + "language_loss": 0.7455107, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.82303286, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16235352, + "step": 6133, + "time_per_iteration": 2.6024398803710938 + }, + { + "auxiliary_loss_clip": 0.06463782, + "auxiliary_loss_mlp": 0.0127464, + "balance_loss_clip": 0.06292324, + "balance_loss_mlp": 0.01258618, + "epoch": 0.3687960318653239, + "flos": 21403076912640.0, + "grad_norm": 1.7721182564640174, + "language_loss": 0.7199074, + "learning_rate": 2.911986698512874e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.16027832, + "step": 6134, + "time_per_iteration": 2.646097421646118 + }, + { + "auxiliary_loss_clip": 0.0646476, + "auxiliary_loss_mlp": 0.0126875, + "balance_loss_clip": 0.06289706, + "balance_loss_mlp": 0.01252288, + "epoch": 0.36885615511799186, + "flos": 20272183482240.0, + "grad_norm": 4.124945820193244, + "language_loss": 0.7570188, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83435392, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.16455078, + "step": 6135, + "time_per_iteration": 2.6019539833068848 + }, + { + "auxiliary_loss_clip": 0.06382909, + "auxiliary_loss_mlp": 0.01256883, + "balance_loss_clip": 0.06304377, + "balance_loss_mlp": 0.0125392, + "epoch": 0.3689162783706599, + "flos": 63106317371520.0, + "grad_norm": 0.7816734524389999, + "language_loss": 0.58664352, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.66304147, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.02960205, + "step": 6136, + "time_per_iteration": 3.139789342880249 + }, + { + "auxiliary_loss_clip": 0.06465235, + "auxiliary_loss_mlp": 0.01270986, + "balance_loss_clip": 0.06292487, + "balance_loss_mlp": 0.012563, + "epoch": 0.36897640162332784, + "flos": 10966536593280.0, + "grad_norm": 2.7370945268269806, + "language_loss": 0.79547632, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8728385, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14678955, + "step": 6137, + "time_per_iteration": 3.937328577041626 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271273, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01255764, + "epoch": 0.3690365248759958, + "flos": 20710581644160.0, + "grad_norm": 1.9257362559650297, + "language_loss": 0.74479491, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.82222939, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15515137, + "step": 6138, + "time_per_iteration": 4.004723072052002 + }, + { + "auxiliary_loss_clip": 0.06475753, + "auxiliary_loss_mlp": 0.01270871, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01255827, + "epoch": 0.3690966481286638, + "flos": 31833495884160.0, + "grad_norm": 1.986271481109943, + "language_loss": 0.65762347, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.73508972, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1505127, + "step": 6139, + "time_per_iteration": 2.621832847595215 + }, + { + "auxiliary_loss_clip": 0.06460394, + "auxiliary_loss_mlp": 0.01271698, + "balance_loss_clip": 0.06290884, + "balance_loss_mlp": 0.0125626, + "epoch": 0.36915677138133174, + "flos": 13119897870720.0, + "grad_norm": 1.9334180469367421, + "language_loss": 0.72060692, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7979278, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15429688, + "step": 6140, + "time_per_iteration": 2.542410135269165 + }, + { + "auxiliary_loss_clip": 0.06370358, + "auxiliary_loss_mlp": 0.01255246, + "balance_loss_clip": 0.06292184, + "balance_loss_mlp": 0.01252388, + "epoch": 0.3692168946339997, + "flos": 68707926996480.0, + "grad_norm": 0.7297912869343693, + "language_loss": 0.59210759, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.66836369, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02853394, + "step": 6141, + "time_per_iteration": 3.242342710494995 + }, + { + "auxiliary_loss_clip": 0.06465677, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06289662, + "balance_loss_mlp": 0.01256336, + "epoch": 0.36927701788666767, + "flos": 22024392537600.0, + "grad_norm": 1.6449420117919953, + "language_loss": 0.75489783, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.16149902, + "step": 6142, + "time_per_iteration": 2.552541732788086 + }, + { + "auxiliary_loss_clip": 0.06459697, + "auxiliary_loss_mlp": 0.01271426, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01256883, + "epoch": 0.36933714113933563, + "flos": 21842103980160.0, + "grad_norm": 2.1834908331499694, + "language_loss": 0.77180201, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84911323, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14544678, + "step": 6143, + "time_per_iteration": 3.990859031677246 + }, + { + "auxiliary_loss_clip": 0.06461622, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06289461, + "balance_loss_mlp": 0.01251565, + "epoch": 0.3693972643920036, + "flos": 23697749301120.0, + "grad_norm": 1.9416354027972629, + "language_loss": 0.82307315, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.9003436, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13867188, + "step": 6144, + "time_per_iteration": 2.5504705905914307 + }, + { + "auxiliary_loss_clip": 0.06462898, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06287374, + "balance_loss_mlp": 0.01255586, + "epoch": 0.36945738764467156, + "flos": 22863355943040.0, + "grad_norm": 2.172105123479451, + "language_loss": 0.78995448, + "learning_rate": 2.908171851365593e-06, + "loss": 0.86728209, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14282227, + "step": 6145, + "time_per_iteration": 3.9733781814575195 + }, + { + "auxiliary_loss_clip": 0.06468924, + "auxiliary_loss_mlp": 0.01271457, + "balance_loss_clip": 0.06291068, + "balance_loss_mlp": 0.01256067, + "epoch": 0.36951751089733953, + "flos": 16621213380480.0, + "grad_norm": 1.6722610276638135, + "language_loss": 0.77129662, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8487004, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15380859, + "step": 6146, + "time_per_iteration": 2.5411174297332764 + }, + { + "auxiliary_loss_clip": 0.06466483, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 0.06289164, + "balance_loss_mlp": 0.01260419, + "epoch": 0.3695776341500075, + "flos": 18920204254080.0, + "grad_norm": 1.6293394058894772, + "language_loss": 0.81346822, + "learning_rate": 2.907477794586761e-06, + "loss": 0.89089251, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1550293, + "step": 6147, + "time_per_iteration": 2.5456924438476562 + }, + { + "auxiliary_loss_clip": 0.06463629, + "auxiliary_loss_mlp": 0.01275917, + "balance_loss_clip": 0.06286413, + "balance_loss_mlp": 0.01261684, + "epoch": 0.36963775740267546, + "flos": 20813892128640.0, + "grad_norm": 1.8090658573318705, + "language_loss": 0.83484954, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14227295, + "step": 6148, + "time_per_iteration": 2.6318178176879883 + }, + { + "auxiliary_loss_clip": 0.06458767, + "auxiliary_loss_mlp": 0.01266964, + "balance_loss_clip": 0.06284354, + "balance_loss_mlp": 0.01252814, + "epoch": 0.3696978806553435, + "flos": 26068087526400.0, + "grad_norm": 2.191330684134815, + "language_loss": 0.74277508, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.82003242, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14147949, + "step": 6149, + "time_per_iteration": 2.6037940979003906 + }, + { + "auxiliary_loss_clip": 0.06464496, + "auxiliary_loss_mlp": 0.01271867, + "balance_loss_clip": 0.06287233, + "balance_loss_mlp": 0.01256203, + "epoch": 0.36975800390801145, + "flos": 26841237949440.0, + "grad_norm": 2.856714094904378, + "language_loss": 0.71066409, + "learning_rate": 2.906436451364054e-06, + "loss": 0.78802776, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15667725, + "step": 6150, + "time_per_iteration": 2.612860918045044 + }, + { + "auxiliary_loss_clip": 0.06457143, + "auxiliary_loss_mlp": 0.01270306, + "balance_loss_clip": 0.06283612, + "balance_loss_mlp": 0.01256341, + "epoch": 0.3698181271606794, + "flos": 21149063660160.0, + "grad_norm": 1.8423166255946122, + "language_loss": 0.81970799, + "learning_rate": 2.906089268194611e-06, + "loss": 0.89698249, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1395874, + "step": 6151, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.0635625, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06277541, + "balance_loss_mlp": 0.01262752, + "epoch": 0.3698782504133474, + "flos": 66761605958400.0, + "grad_norm": 0.7660918799950965, + "language_loss": 0.63089043, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.70711315, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03274536, + "step": 6152, + "time_per_iteration": 3.27481746673584 + }, + { + "auxiliary_loss_clip": 0.06456928, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289765, + "balance_loss_mlp": 0.01256709, + "epoch": 0.36993837366601534, + "flos": 24317597479680.0, + "grad_norm": 2.4460843976292455, + "language_loss": 0.7067228, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.78398836, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.12921143, + "step": 6153, + "time_per_iteration": 2.561366319656372 + }, + { + "auxiliary_loss_clip": 0.06461591, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.0125796, + "epoch": 0.3699984969186833, + "flos": 24355472325120.0, + "grad_norm": 1.7390512131477307, + "language_loss": 0.72820848, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.80554867, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14459229, + "step": 6154, + "time_per_iteration": 2.6359784603118896 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06290819, + "balance_loss_mlp": 0.01256468, + "epoch": 0.37005862017135127, + "flos": 19835378547840.0, + "grad_norm": 1.7720975153034155, + "language_loss": 0.68251342, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.75985944, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1361084, + "step": 6155, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.06462097, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06290478, + "balance_loss_mlp": 0.01261551, + "epoch": 0.37011874342401924, + "flos": 19579981703040.0, + "grad_norm": 1.763175663447542, + "language_loss": 0.68228447, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.75965828, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13745117, + "step": 6156, + "time_per_iteration": 2.5805797576904297 + }, + { + "auxiliary_loss_clip": 0.06460856, + "auxiliary_loss_mlp": 0.01276122, + "balance_loss_clip": 0.06292138, + "balance_loss_mlp": 0.01263051, + "epoch": 0.3701788666766872, + "flos": 20380315576320.0, + "grad_norm": 2.4756712581972673, + "language_loss": 0.82280111, + "learning_rate": 2.904005448099916e-06, + "loss": 0.9001708, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13061523, + "step": 6157, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.06472905, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 0.06294029, + "balance_loss_mlp": 0.0126136, + "epoch": 0.37023898992935517, + "flos": 15346325508480.0, + "grad_norm": 2.1879647979069055, + "language_loss": 0.77007514, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15142822, + "step": 6158, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.06472066, + "auxiliary_loss_mlp": 0.01273585, + "balance_loss_clip": 0.0629342, + "balance_loss_mlp": 0.0125872, + "epoch": 0.37029911318202313, + "flos": 19580149411200.0, + "grad_norm": 1.9796058392103062, + "language_loss": 0.68833315, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.76578963, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14880371, + "step": 6159, + "time_per_iteration": 2.4941582679748535 + }, + { + "auxiliary_loss_clip": 0.06464109, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 0.06292266, + "balance_loss_mlp": 0.01261986, + "epoch": 0.3703592364346911, + "flos": 26220509303040.0, + "grad_norm": 1.9367461088396363, + "language_loss": 0.71322787, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.79061961, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13079834, + "step": 6160, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.06465742, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06295532, + "balance_loss_mlp": 0.0125958, + "epoch": 0.37041935968735906, + "flos": 20054619555840.0, + "grad_norm": 1.6534007301448785, + "language_loss": 0.78978807, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86717302, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1317749, + "step": 6161, + "time_per_iteration": 2.5337588787078857 + }, + { + "auxiliary_loss_clip": 0.06465232, + "auxiliary_loss_mlp": 0.01270423, + "balance_loss_clip": 0.06291839, + "balance_loss_mlp": 0.01255837, + "epoch": 0.3704794829400271, + "flos": 24140633656320.0, + "grad_norm": 1.7631614273732186, + "language_loss": 0.79746109, + "learning_rate": 2.902267988534295e-06, + "loss": 0.87481761, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14587402, + "step": 6162, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.06466715, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 0.06292939, + "balance_loss_mlp": 0.01260717, + "epoch": 0.37053960619269505, + "flos": 14872232707200.0, + "grad_norm": 1.8866019587111915, + "language_loss": 0.80318987, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.88060015, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13580322, + "step": 6163, + "time_per_iteration": 2.501971483230591 + }, + { + "auxiliary_loss_clip": 0.06466764, + "auxiliary_loss_mlp": 0.01273928, + "balance_loss_clip": 0.0629348, + "balance_loss_mlp": 0.01260315, + "epoch": 0.370599729445363, + "flos": 21367969251840.0, + "grad_norm": 1.81392406825425, + "language_loss": 0.68857837, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.76598537, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13598633, + "step": 6164, + "time_per_iteration": 2.557870388031006 + }, + { + "auxiliary_loss_clip": 0.06463528, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.0126064, + "epoch": 0.370659852698031, + "flos": 26835535872000.0, + "grad_norm": 2.3609289004256984, + "language_loss": 0.83364576, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.91103643, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14916992, + "step": 6165, + "time_per_iteration": 2.5597267150878906 + }, + { + "auxiliary_loss_clip": 0.06475651, + "auxiliary_loss_mlp": 0.01276631, + "balance_loss_clip": 0.06294797, + "balance_loss_mlp": 0.01261086, + "epoch": 0.37071997595069894, + "flos": 19105050360960.0, + "grad_norm": 1.8212520052796557, + "language_loss": 0.69703627, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.77455908, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15551758, + "step": 6166, + "time_per_iteration": 2.7443737983703613 + }, + { + "auxiliary_loss_clip": 0.06351966, + "auxiliary_loss_mlp": 0.01259396, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01255936, + "epoch": 0.3707800992033669, + "flos": 52193839461120.0, + "grad_norm": 0.7767712005900987, + "language_loss": 0.55992532, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.6360389, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.03469849, + "step": 6167, + "time_per_iteration": 3.122786045074463 + }, + { + "auxiliary_loss_clip": 0.06470326, + "auxiliary_loss_mlp": 0.01270542, + "balance_loss_clip": 0.06298738, + "balance_loss_mlp": 0.01256553, + "epoch": 0.3708402224560349, + "flos": 19908025637760.0, + "grad_norm": 1.887650816435161, + "language_loss": 0.75851792, + "learning_rate": 2.900181908135584e-06, + "loss": 0.83592659, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13983154, + "step": 6168, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.06462339, + "auxiliary_loss_mlp": 0.01269774, + "balance_loss_clip": 0.0628986, + "balance_loss_mlp": 0.01255833, + "epoch": 0.37090034570870284, + "flos": 20013222839040.0, + "grad_norm": 1.688087532093935, + "language_loss": 0.74697542, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82429659, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13946533, + "step": 6169, + "time_per_iteration": 2.571059226989746 + }, + { + "auxiliary_loss_clip": 0.06462043, + "auxiliary_loss_mlp": 0.01269285, + "balance_loss_clip": 0.06291892, + "balance_loss_mlp": 0.0125526, + "epoch": 0.3709604689613708, + "flos": 24141681832320.0, + "grad_norm": 1.6120375976718775, + "language_loss": 0.79462636, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87193966, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14007568, + "step": 6170, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.06461793, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06289523, + "balance_loss_mlp": 0.01254183, + "epoch": 0.37102059221403877, + "flos": 23882469626880.0, + "grad_norm": 1.7170622011660002, + "language_loss": 0.76363444, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.84094131, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14703369, + "step": 6171, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.06464403, + "auxiliary_loss_mlp": 0.01269741, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.0125568, + "epoch": 0.37108071546670673, + "flos": 14506439708160.0, + "grad_norm": 2.2434941236901222, + "language_loss": 0.80974334, + "learning_rate": 2.898790504994232e-06, + "loss": 0.88708472, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.140625, + "step": 6172, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.06468061, + "auxiliary_loss_mlp": 0.01272991, + "balance_loss_clip": 0.06291698, + "balance_loss_mlp": 0.01258352, + "epoch": 0.3711408387193747, + "flos": 34570172160000.0, + "grad_norm": 1.701200983183655, + "language_loss": 0.59536189, + "learning_rate": 2.89844256897035e-06, + "loss": 0.67277241, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14648438, + "step": 6173, + "time_per_iteration": 2.68860125541687 + }, + { + "auxiliary_loss_clip": 0.06465948, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06291407, + "balance_loss_mlp": 0.01252825, + "epoch": 0.37120096197204266, + "flos": 17316350052480.0, + "grad_norm": 3.482738270256764, + "language_loss": 0.81161231, + "learning_rate": 2.898094598877435e-06, + "loss": 0.88894391, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1439209, + "step": 6174, + "time_per_iteration": 2.498631238937378 + }, + { + "auxiliary_loss_clip": 0.06459825, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06290745, + "balance_loss_mlp": 0.01253826, + "epoch": 0.37126108522471063, + "flos": 30671855205120.0, + "grad_norm": 1.7762050826086826, + "language_loss": 0.79733562, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.87460476, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13275146, + "step": 6175, + "time_per_iteration": 2.6155989170074463 + }, + { + "auxiliary_loss_clip": 0.06469794, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06296568, + "balance_loss_mlp": 0.01253926, + "epoch": 0.37132120847737865, + "flos": 25162682232960.0, + "grad_norm": 2.183025760433602, + "language_loss": 0.8886646, + "learning_rate": 2.89739855653729e-06, + "loss": 0.96605068, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14880371, + "step": 6176, + "time_per_iteration": 3.9855380058288574 + }, + { + "auxiliary_loss_clip": 0.06463525, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.01252331, + "epoch": 0.3713813317300466, + "flos": 21219572471040.0, + "grad_norm": 1.8377156327305517, + "language_loss": 0.73693877, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.8142367, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13952637, + "step": 6177, + "time_per_iteration": 2.584007501602173 + }, + { + "auxiliary_loss_clip": 0.06460603, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3714414549827146, + "flos": 21623114534400.0, + "grad_norm": 3.348536242845292, + "language_loss": 0.75657964, + "learning_rate": 2.896702378079374e-06, + "loss": 0.83389515, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14227295, + "step": 6178, + "time_per_iteration": 4.047810077667236 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01253796, + "epoch": 0.37150157823538255, + "flos": 19978073251200.0, + "grad_norm": 1.677068577007521, + "language_loss": 0.7243154, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14465332, + "step": 6179, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.06464912, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06289817, + "balance_loss_mlp": 0.01254506, + "epoch": 0.3715617014880505, + "flos": 24867020701440.0, + "grad_norm": 1.5744290711880986, + "language_loss": 0.70164317, + "learning_rate": 2.896006063609283e-06, + "loss": 0.77898097, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14361572, + "step": 6180, + "time_per_iteration": 2.564251661300659 + }, + { + "auxiliary_loss_clip": 0.06459807, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.0628929, + "balance_loss_mlp": 0.01255173, + "epoch": 0.3716218247407185, + "flos": 20455352507520.0, + "grad_norm": 1.6669585833251956, + "language_loss": 0.78357702, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.86087286, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6181, + "time_per_iteration": 2.5857934951782227 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 0.06286183, + "balance_loss_mlp": 0.01254195, + "epoch": 0.37168194799338644, + "flos": 24140256312960.0, + "grad_norm": 1.7806049549646892, + "language_loss": 0.78926349, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.86651719, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14520264, + "step": 6182, + "time_per_iteration": 2.572563409805298 + }, + { + "auxiliary_loss_clip": 0.0637676, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 0.06297279, + "balance_loss_mlp": 0.01253508, + "epoch": 0.3717420712460544, + "flos": 67429601107200.0, + "grad_norm": 0.7782169453066291, + "language_loss": 0.57265592, + "learning_rate": 2.894961337112362e-06, + "loss": 0.64899027, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.03170776, + "step": 6183, + "time_per_iteration": 4.616533279418945 + }, + { + "auxiliary_loss_clip": 0.06460768, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01258059, + "epoch": 0.37180219449872237, + "flos": 22382512888320.0, + "grad_norm": 2.288371354177028, + "language_loss": 0.77116179, + "learning_rate": 2.894613027055066e-06, + "loss": 0.84849966, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.1496582, + "step": 6184, + "time_per_iteration": 2.5182292461395264 + }, + { + "auxiliary_loss_clip": 0.06457444, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.0628842, + "balance_loss_mlp": 0.01255739, + "epoch": 0.37186231775139034, + "flos": 21876037683840.0, + "grad_norm": 2.2342830987852023, + "language_loss": 0.72608167, + "learning_rate": 2.894264683073954e-06, + "loss": 0.80335367, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14007568, + "step": 6185, + "time_per_iteration": 3.928272247314453 + }, + { + "auxiliary_loss_clip": 0.06453837, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06286646, + "balance_loss_mlp": 0.01253075, + "epoch": 0.3719224410040583, + "flos": 22421142420480.0, + "grad_norm": 1.6056881027286982, + "language_loss": 0.77329034, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85050094, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14160156, + "step": 6186, + "time_per_iteration": 2.549499988555908 + }, + { + "auxiliary_loss_clip": 0.0646092, + "auxiliary_loss_mlp": 0.01274226, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01258121, + "epoch": 0.37198256425672627, + "flos": 25157525207040.0, + "grad_norm": 1.8763954627941488, + "language_loss": 0.84227252, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.91962403, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.16101074, + "step": 6187, + "time_per_iteration": 2.542978048324585 + }, + { + "auxiliary_loss_clip": 0.06456143, + "auxiliary_loss_mlp": 0.01269651, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255919, + "epoch": 0.37204268750939423, + "flos": 21144032415360.0, + "grad_norm": 2.100791898470326, + "language_loss": 0.84696567, + "learning_rate": 2.893219447719824e-06, + "loss": 0.9242236, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13726807, + "step": 6188, + "time_per_iteration": 2.626126766204834 + }, + { + "auxiliary_loss_clip": 0.06458837, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06288396, + "balance_loss_mlp": 0.01256232, + "epoch": 0.37210281076206225, + "flos": 21513221504640.0, + "grad_norm": 2.2586863759616564, + "language_loss": 0.66390121, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.74118853, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13653564, + "step": 6189, + "time_per_iteration": 2.5793135166168213 + }, + { + "auxiliary_loss_clip": 0.06460261, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.0628726, + "balance_loss_mlp": 0.01255926, + "epoch": 0.3721629340147302, + "flos": 17353595992320.0, + "grad_norm": 2.971940637043147, + "language_loss": 0.84218514, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.91950166, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15466309, + "step": 6190, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.06464738, + "auxiliary_loss_mlp": 0.01270544, + "balance_loss_clip": 0.06287063, + "balance_loss_mlp": 0.01255905, + "epoch": 0.3722230572673982, + "flos": 16437457376640.0, + "grad_norm": 2.7368484374177076, + "language_loss": 0.89274895, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.97010183, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.14648438, + "step": 6191, + "time_per_iteration": 2.4786319732666016 + }, + { + "auxiliary_loss_clip": 0.06463645, + "auxiliary_loss_mlp": 0.01271285, + "balance_loss_clip": 0.06286322, + "balance_loss_mlp": 0.01254465, + "epoch": 0.37228318052006615, + "flos": 22681360874880.0, + "grad_norm": 2.1321020045013577, + "language_loss": 0.74374199, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82109123, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.16790771, + "step": 6192, + "time_per_iteration": 2.6107547283172607 + }, + { + "auxiliary_loss_clip": 0.06461145, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06288278, + "balance_loss_mlp": 0.0125493, + "epoch": 0.3723433037727341, + "flos": 25272617189760.0, + "grad_norm": 2.3785606336548124, + "language_loss": 0.79934001, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.87664223, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14154053, + "step": 6193, + "time_per_iteration": 2.5584514141082764 + }, + { + "auxiliary_loss_clip": 0.06469596, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 0.06293128, + "balance_loss_mlp": 0.01255594, + "epoch": 0.3724034270254021, + "flos": 10529228534400.0, + "grad_norm": 1.7620775512614164, + "language_loss": 0.84889179, + "learning_rate": 2.891128062852194e-06, + "loss": 0.92628884, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14526367, + "step": 6194, + "time_per_iteration": 2.5419061183929443 + }, + { + "auxiliary_loss_clip": 0.06460975, + "auxiliary_loss_mlp": 0.01266847, + "balance_loss_clip": 0.06288271, + "balance_loss_mlp": 0.01253317, + "epoch": 0.37246355027807004, + "flos": 20272393117440.0, + "grad_norm": 2.226391461709797, + "language_loss": 0.78030515, + "learning_rate": 2.890779380359646e-06, + "loss": 0.85758334, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13543701, + "step": 6195, + "time_per_iteration": 2.51361346244812 + }, + { + "auxiliary_loss_clip": 0.06459115, + "auxiliary_loss_mlp": 0.01274112, + "balance_loss_clip": 0.06288831, + "balance_loss_mlp": 0.01258955, + "epoch": 0.372523673530738, + "flos": 19506705707520.0, + "grad_norm": 1.8216220923823887, + "language_loss": 0.79924363, + "learning_rate": 2.890430664088655e-06, + "loss": 0.87657595, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15155029, + "step": 6196, + "time_per_iteration": 2.6005568504333496 + }, + { + "auxiliary_loss_clip": 0.06458211, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01256888, + "epoch": 0.372583796783406, + "flos": 16769945577600.0, + "grad_norm": 2.2795878215352396, + "language_loss": 0.84059894, + "learning_rate": 2.890081914052443e-06, + "loss": 0.91788948, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13952637, + "step": 6197, + "time_per_iteration": 2.538058042526245 + }, + { + "auxiliary_loss_clip": 0.06456813, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06289704, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37264392003607394, + "flos": 22644576132480.0, + "grad_norm": 1.7143100919816474, + "language_loss": 0.64964151, + "learning_rate": 2.889733130264237e-06, + "loss": 0.72691035, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14971924, + "step": 6198, + "time_per_iteration": 2.5891072750091553 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 0.0628581, + "balance_loss_mlp": 0.01258367, + "epoch": 0.3727040432887419, + "flos": 19979037573120.0, + "grad_norm": 1.4303592099178044, + "language_loss": 0.74534631, + "learning_rate": 2.889384312737261e-06, + "loss": 0.82261217, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13977051, + "step": 6199, + "time_per_iteration": 2.5612289905548096 + }, + { + "auxiliary_loss_clip": 0.06453978, + "auxiliary_loss_mlp": 0.01269323, + "balance_loss_clip": 0.06284302, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37276416654140987, + "flos": 63911906853120.0, + "grad_norm": 1.6001689252403943, + "language_loss": 0.81250614, + "learning_rate": 2.889035461484742e-06, + "loss": 0.88973916, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14227295, + "step": 6200, + "time_per_iteration": 2.9802377223968506 + }, + { + "auxiliary_loss_clip": 0.06452343, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.0125907, + "epoch": 0.37282428979407783, + "flos": 39795381244800.0, + "grad_norm": 2.0282879733455776, + "language_loss": 0.61128068, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68853581, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.6786048412323 + }, + { + "auxiliary_loss_clip": 0.06460309, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.06284842, + "balance_loss_mlp": 0.01257166, + "epoch": 0.37288441304674586, + "flos": 22715336505600.0, + "grad_norm": 1.562126243298772, + "language_loss": 0.73424393, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.81156611, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14746094, + "step": 6202, + "time_per_iteration": 2.5774593353271484 + }, + { + "auxiliary_loss_clip": 0.06450565, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01253697, + "epoch": 0.3729445362994138, + "flos": 18776209812480.0, + "grad_norm": 3.8476229642649895, + "language_loss": 0.73690808, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.81410116, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1505127, + "step": 6203, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.06448745, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01253402, + "epoch": 0.3730046595520818, + "flos": 22462874553600.0, + "grad_norm": 1.6222639611717555, + "language_loss": 0.82113981, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.89829516, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13391113, + "step": 6204, + "time_per_iteration": 2.5474419593811035 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01267649, + "balance_loss_clip": 0.06282973, + "balance_loss_mlp": 0.01253094, + "epoch": 0.37306478280474975, + "flos": 24323257630080.0, + "grad_norm": 1.5013454609640156, + "language_loss": 0.75699729, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14562988, + "step": 6205, + "time_per_iteration": 2.5284838676452637 + }, + { + "auxiliary_loss_clip": 0.06453846, + "auxiliary_loss_mlp": 0.01269403, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3731249060574177, + "flos": 15820627944960.0, + "grad_norm": 2.409990557003708, + "language_loss": 0.78042793, + "learning_rate": 2.886941646474128e-06, + "loss": 0.85766041, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14868164, + "step": 6206, + "time_per_iteration": 2.5130996704101562 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01253085, + "epoch": 0.3731850293100857, + "flos": 19834120736640.0, + "grad_norm": 3.8358433201526334, + "language_loss": 0.93966329, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01690984, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15734863, + "step": 6207, + "time_per_iteration": 2.4994020462036133 + }, + { + "auxiliary_loss_clip": 0.06459471, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06283936, + "balance_loss_mlp": 0.01254561, + "epoch": 0.37324515256275365, + "flos": 19068349472640.0, + "grad_norm": 2.1400449567396826, + "language_loss": 0.82643408, + "learning_rate": 2.886243438932759e-06, + "loss": 0.90372002, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14575195, + "step": 6208, + "time_per_iteration": 2.5359628200531006 + }, + { + "auxiliary_loss_clip": 0.06460227, + "auxiliary_loss_mlp": 0.01272188, + "balance_loss_clip": 0.06285752, + "balance_loss_mlp": 0.01255904, + "epoch": 0.3733052758154216, + "flos": 20710623571200.0, + "grad_norm": 2.148305950788212, + "language_loss": 0.73528939, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.81261349, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1628418, + "step": 6209, + "time_per_iteration": 2.499209403991699 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01273959, + "balance_loss_clip": 0.06285547, + "balance_loss_mlp": 0.01258593, + "epoch": 0.3733653990680896, + "flos": 20199704100480.0, + "grad_norm": 2.014449395888949, + "language_loss": 0.71212471, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.78942245, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.15368652, + "step": 6210, + "time_per_iteration": 2.5324270725250244 + }, + { + "auxiliary_loss_clip": 0.06468424, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.06295058, + "balance_loss_mlp": 0.01253631, + "epoch": 0.37342552232075754, + "flos": 20345920675200.0, + "grad_norm": 1.543701660359285, + "language_loss": 0.7823801, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.85975003, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.1494751, + "step": 6211, + "time_per_iteration": 2.5388078689575195 + }, + { + "auxiliary_loss_clip": 0.06464606, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06291494, + "balance_loss_mlp": 0.0125347, + "epoch": 0.3734856455734255, + "flos": 35526701243520.0, + "grad_norm": 1.6765525733287814, + "language_loss": 0.73612988, + "learning_rate": 2.884846620678668e-06, + "loss": 0.81346345, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15283203, + "step": 6212, + "time_per_iteration": 2.663950204849243 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06294222, + "balance_loss_mlp": 0.01256345, + "epoch": 0.37354576882609347, + "flos": 21148686316800.0, + "grad_norm": 1.865900947954382, + "language_loss": 0.82430422, + "learning_rate": 2.884497332198356e-06, + "loss": 0.90180945, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16455078, + "step": 6213, + "time_per_iteration": 2.541431427001953 + }, + { + "auxiliary_loss_clip": 0.06467836, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.01255623, + "epoch": 0.37360589207876144, + "flos": 21513179577600.0, + "grad_norm": 2.345206885791162, + "language_loss": 0.7896657, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.86705506, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15466309, + "step": 6214, + "time_per_iteration": 2.545792579650879 + }, + { + "auxiliary_loss_clip": 0.06466322, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06297071, + "balance_loss_mlp": 0.01255981, + "epoch": 0.37366601533142946, + "flos": 38444953317120.0, + "grad_norm": 1.6116656191599898, + "language_loss": 0.85112274, + "learning_rate": 2.883798654630296e-06, + "loss": 0.92849338, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14758301, + "step": 6215, + "time_per_iteration": 2.70700740814209 + }, + { + "auxiliary_loss_clip": 0.06472297, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06296762, + "balance_loss_mlp": 0.01254044, + "epoch": 0.3737261385840974, + "flos": 18446908066560.0, + "grad_norm": 1.6510257786225762, + "language_loss": 0.6833967, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.76082057, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16040039, + "step": 6216, + "time_per_iteration": 3.941821575164795 + }, + { + "auxiliary_loss_clip": 0.06466141, + "auxiliary_loss_mlp": 0.01276294, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01260224, + "epoch": 0.3737862618367654, + "flos": 22936506157440.0, + "grad_norm": 2.1208446300989983, + "language_loss": 0.6621505, + "learning_rate": 2.883099843007303e-06, + "loss": 0.73957485, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.1607666, + "step": 6217, + "time_per_iteration": 4.067852258682251 + }, + { + "auxiliary_loss_clip": 0.06468368, + "auxiliary_loss_mlp": 0.01272371, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.0125772, + "epoch": 0.37384638508943335, + "flos": 15414360624000.0, + "grad_norm": 1.5564133784357135, + "language_loss": 0.80760753, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.1463623, + "step": 6218, + "time_per_iteration": 2.5253372192382812 + }, + { + "auxiliary_loss_clip": 0.06465785, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06298652, + "balance_loss_mlp": 0.01256661, + "epoch": 0.3739065083421013, + "flos": 24287856480000.0, + "grad_norm": 2.4835018506755566, + "language_loss": 0.79185957, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86923778, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.15380859, + "step": 6219, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.06464131, + "auxiliary_loss_mlp": 0.01274727, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260177, + "epoch": 0.3739666315947693, + "flos": 23009488663680.0, + "grad_norm": 2.098390778414135, + "language_loss": 0.77614415, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.85353279, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14538574, + "step": 6220, + "time_per_iteration": 2.5899298191070557 + }, + { + "auxiliary_loss_clip": 0.06466513, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.0125541, + "epoch": 0.37402675484743725, + "flos": 19397231948160.0, + "grad_norm": 1.5821121915867322, + "language_loss": 0.83564717, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91301888, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15240479, + "step": 6221, + "time_per_iteration": 2.540102481842041 + }, + { + "auxiliary_loss_clip": 0.06464627, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06293692, + "balance_loss_mlp": 0.01262647, + "epoch": 0.3740868781001052, + "flos": 17131420091520.0, + "grad_norm": 1.6401420513761291, + "language_loss": 0.76738596, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84480345, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14477539, + "step": 6222, + "time_per_iteration": 4.020254850387573 + }, + { + "auxiliary_loss_clip": 0.06466988, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06296736, + "balance_loss_mlp": 0.01263467, + "epoch": 0.3741470013527732, + "flos": 20049001332480.0, + "grad_norm": 1.799306271558528, + "language_loss": 0.70768011, + "learning_rate": 2.881002604868789e-06, + "loss": 0.785128, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14349365, + "step": 6223, + "time_per_iteration": 2.6146726608276367 + }, + { + "auxiliary_loss_clip": 0.0646846, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01258954, + "epoch": 0.37420712460544114, + "flos": 36905151162240.0, + "grad_norm": 1.9191598081110601, + "language_loss": 0.69292819, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77033412, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1317749, + "step": 6224, + "time_per_iteration": 4.144296407699585 + }, + { + "auxiliary_loss_clip": 0.06463895, + "auxiliary_loss_mlp": 0.01274949, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260126, + "epoch": 0.3742672478581091, + "flos": 22207896979200.0, + "grad_norm": 1.811742579086715, + "language_loss": 0.70166373, + "learning_rate": 2.880303258086228e-06, + "loss": 0.77905214, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14819336, + "step": 6225, + "time_per_iteration": 2.562023162841797 + }, + { + "auxiliary_loss_clip": 0.06462345, + "auxiliary_loss_mlp": 0.0127698, + "balance_loss_clip": 0.06296264, + "balance_loss_mlp": 0.01262257, + "epoch": 0.3743273711107771, + "flos": 24688547504640.0, + "grad_norm": 2.0306145345851614, + "language_loss": 0.79386592, + "learning_rate": 2.879953534616536e-06, + "loss": 0.87125921, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14715576, + "step": 6226, + "time_per_iteration": 2.5372707843780518 + }, + { + "auxiliary_loss_clip": 0.06464548, + "auxiliary_loss_mlp": 0.01273743, + "balance_loss_clip": 0.0629389, + "balance_loss_mlp": 0.01259021, + "epoch": 0.37438749436344504, + "flos": 24466078114560.0, + "grad_norm": 1.6346435650910545, + "language_loss": 0.68240035, + "learning_rate": 2.879603777778917e-06, + "loss": 0.75978327, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14733887, + "step": 6227, + "time_per_iteration": 2.5752079486846924 + }, + { + "auxiliary_loss_clip": 0.06464467, + "auxiliary_loss_mlp": 0.01270066, + "balance_loss_clip": 0.06297411, + "balance_loss_mlp": 0.0125588, + "epoch": 0.374447617616113, + "flos": 21805193456640.0, + "grad_norm": 1.6298548281431393, + "language_loss": 0.83520573, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91255105, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14190674, + "step": 6228, + "time_per_iteration": 2.605607748031616 + }, + { + "auxiliary_loss_clip": 0.06458256, + "auxiliary_loss_mlp": 0.01270458, + "balance_loss_clip": 0.06288552, + "balance_loss_mlp": 0.01256033, + "epoch": 0.374507740868781, + "flos": 17974073076480.0, + "grad_norm": 1.5343038876343353, + "language_loss": 0.75450277, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83178985, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14428711, + "step": 6229, + "time_per_iteration": 2.607506036758423 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.012714, + "balance_loss_clip": 0.06293011, + "balance_loss_mlp": 0.01256249, + "epoch": 0.374567864121449, + "flos": 16111132450560.0, + "grad_norm": 3.0205318355467083, + "language_loss": 0.84065855, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.91801792, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15142822, + "step": 6230, + "time_per_iteration": 2.4964523315429688 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01275239, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01259569, + "epoch": 0.37462798737411696, + "flos": 25779847080960.0, + "grad_norm": 1.7178487844900587, + "language_loss": 0.73793018, + "learning_rate": 2.878204417014456e-06, + "loss": 0.81532168, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15667725, + "step": 6231, + "time_per_iteration": 2.589771270751953 + }, + { + "auxiliary_loss_clip": 0.06465879, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01255298, + "epoch": 0.3746881106267849, + "flos": 16660136401920.0, + "grad_norm": 1.8762806294571872, + "language_loss": 0.74086344, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.81822443, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14929199, + "step": 6232, + "time_per_iteration": 2.483219861984253 + }, + { + "auxiliary_loss_clip": 0.06463014, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.0125605, + "epoch": 0.3747482338794529, + "flos": 26185317788160.0, + "grad_norm": 1.743409558247901, + "language_loss": 0.77404612, + "learning_rate": 2.877504536769561e-06, + "loss": 0.85138428, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14758301, + "step": 6233, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.06463634, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06292734, + "balance_loss_mlp": 0.01255432, + "epoch": 0.37480835713212085, + "flos": 12025956890880.0, + "grad_norm": 1.7958128584553208, + "language_loss": 0.69650698, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.77383471, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13690186, + "step": 6234, + "time_per_iteration": 2.524226188659668 + }, + { + "auxiliary_loss_clip": 0.06464471, + "auxiliary_loss_mlp": 0.01267248, + "balance_loss_clip": 0.06295948, + "balance_loss_mlp": 0.0125311, + "epoch": 0.3748684803847888, + "flos": 19684801560960.0, + "grad_norm": 2.1537876510353597, + "language_loss": 0.83551729, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91283447, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14135742, + "step": 6235, + "time_per_iteration": 2.5380606651306152 + }, + { + "auxiliary_loss_clip": 0.06462481, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06289958, + "balance_loss_mlp": 0.0125222, + "epoch": 0.3749286036374568, + "flos": 20527328764800.0, + "grad_norm": 1.8434440291752416, + "language_loss": 0.78213942, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8594358, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14941406, + "step": 6236, + "time_per_iteration": 2.507180690765381 + }, + { + "auxiliary_loss_clip": 0.06465082, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 0.06288011, + "balance_loss_mlp": 0.0125616, + "epoch": 0.37498872689012475, + "flos": 20710958987520.0, + "grad_norm": 1.9437086154972172, + "language_loss": 0.73305297, + "learning_rate": 2.876104377085234e-06, + "loss": 0.81043607, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.17077637, + "step": 6237, + "time_per_iteration": 2.5545706748962402 + }, + { + "auxiliary_loss_clip": 0.06460923, + "auxiliary_loss_mlp": 0.01271336, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01256548, + "epoch": 0.3750488501427927, + "flos": 21580418079360.0, + "grad_norm": 2.5847168840400787, + "language_loss": 0.93616223, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01348472, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14788818, + "step": 6238, + "time_per_iteration": 2.544524669647217 + }, + { + "auxiliary_loss_clip": 0.06457306, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06286643, + "balance_loss_mlp": 0.01257127, + "epoch": 0.3751089733954607, + "flos": 15929221236480.0, + "grad_norm": 2.2437121352489093, + "language_loss": 0.71661341, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79390883, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15106201, + "step": 6239, + "time_per_iteration": 2.519807815551758 + }, + { + "auxiliary_loss_clip": 0.06461261, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 0.06287319, + "balance_loss_mlp": 0.01256485, + "epoch": 0.37516909664812864, + "flos": 36293688391680.0, + "grad_norm": 1.5212724151961043, + "language_loss": 0.65758455, + "learning_rate": 2.875053908444895e-06, + "loss": 0.73491299, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15118408, + "step": 6240, + "time_per_iteration": 2.6838748455047607 + }, + { + "auxiliary_loss_clip": 0.06461462, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06288624, + "balance_loss_mlp": 0.01251258, + "epoch": 0.3752292199007966, + "flos": 13520882384640.0, + "grad_norm": 2.454894337240739, + "language_loss": 0.76209545, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.83936143, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13867188, + "step": 6241, + "time_per_iteration": 2.498286008834839 + }, + { + "auxiliary_loss_clip": 0.06461808, + "auxiliary_loss_mlp": 0.01268507, + "balance_loss_clip": 0.06289176, + "balance_loss_mlp": 0.01253206, + "epoch": 0.3752893431534646, + "flos": 27205353866880.0, + "grad_norm": 2.0832931967812853, + "language_loss": 0.84671998, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92402315, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15313721, + "step": 6242, + "time_per_iteration": 2.6289877891540527 + }, + { + "auxiliary_loss_clip": 0.06457841, + "auxiliary_loss_mlp": 0.01272178, + "balance_loss_clip": 0.06285247, + "balance_loss_mlp": 0.01257379, + "epoch": 0.3753494664061326, + "flos": 30015431919360.0, + "grad_norm": 2.6434313807577112, + "language_loss": 0.68551457, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.76281476, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14813232, + "step": 6243, + "time_per_iteration": 2.7211153507232666 + }, + { + "auxiliary_loss_clip": 0.0645824, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01254482, + "epoch": 0.37540958965880056, + "flos": 24468803372160.0, + "grad_norm": 1.7478523324296555, + "language_loss": 0.8397631, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.91704839, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15808105, + "step": 6244, + "time_per_iteration": 2.5738887786865234 + }, + { + "auxiliary_loss_clip": 0.0645659, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01252842, + "epoch": 0.3754697129114685, + "flos": 16513961754240.0, + "grad_norm": 3.8447339818169257, + "language_loss": 0.83823436, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.91546631, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13751221, + "step": 6245, + "time_per_iteration": 2.5320816040039062 + }, + { + "auxiliary_loss_clip": 0.06453504, + "auxiliary_loss_mlp": 0.0127263, + "balance_loss_clip": 0.06282875, + "balance_loss_mlp": 0.01257633, + "epoch": 0.3755298361641365, + "flos": 19396980385920.0, + "grad_norm": 2.4621620681348295, + "language_loss": 0.64685225, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.72411358, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14990234, + "step": 6246, + "time_per_iteration": 2.58577561378479 + }, + { + "auxiliary_loss_clip": 0.06466524, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01262428, + "epoch": 0.37558995941680445, + "flos": 14725638789120.0, + "grad_norm": 2.3474335464279648, + "language_loss": 0.75348055, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.83092844, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.1583252, + "step": 6247, + "time_per_iteration": 2.47930908203125 + }, + { + "auxiliary_loss_clip": 0.06456453, + "auxiliary_loss_mlp": 0.012715, + "balance_loss_clip": 0.06282347, + "balance_loss_mlp": 0.01255503, + "epoch": 0.3756500826694724, + "flos": 21696432456960.0, + "grad_norm": 3.5646784592424017, + "language_loss": 0.55380279, + "learning_rate": 2.872251199697598e-06, + "loss": 0.6310823, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.16003418, + "step": 6248, + "time_per_iteration": 2.5266313552856445 + }, + { + "auxiliary_loss_clip": 0.06453443, + "auxiliary_loss_mlp": 0.01268535, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01253109, + "epoch": 0.3757102059221404, + "flos": 26512942452480.0, + "grad_norm": 1.7302245846967215, + "language_loss": 0.84781861, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92503834, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.15429688, + "step": 6249, + "time_per_iteration": 2.5590078830718994 + }, + { + "auxiliary_loss_clip": 0.06456596, + "auxiliary_loss_mlp": 0.01267858, + "balance_loss_clip": 0.0628508, + "balance_loss_mlp": 0.01253481, + "epoch": 0.37577032917480835, + "flos": 37346526144000.0, + "grad_norm": 1.6299752789251518, + "language_loss": 0.68482721, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.76207179, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14361572, + "step": 6250, + "time_per_iteration": 2.6926450729370117 + }, + { + "auxiliary_loss_clip": 0.06454285, + "auxiliary_loss_mlp": 0.01268088, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01254099, + "epoch": 0.3758304524274763, + "flos": 21915128413440.0, + "grad_norm": 2.0147801854845895, + "language_loss": 0.78550422, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.862728, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13995361, + "step": 6251, + "time_per_iteration": 2.5072193145751953 + }, + { + "auxiliary_loss_clip": 0.06455163, + "auxiliary_loss_mlp": 0.01271265, + "balance_loss_clip": 0.06285167, + "balance_loss_mlp": 0.01257139, + "epoch": 0.3758905756801443, + "flos": 36577233008640.0, + "grad_norm": 2.2428429985343543, + "language_loss": 0.58560276, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.66286701, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14111328, + "step": 6252, + "time_per_iteration": 2.684899091720581 + }, + { + "auxiliary_loss_clip": 0.06456266, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.01255649, + "epoch": 0.37595069893281224, + "flos": 24534616354560.0, + "grad_norm": 1.5871699178816958, + "language_loss": 0.8998009, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.97707891, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15869141, + "step": 6253, + "time_per_iteration": 2.539088010787964 + }, + { + "auxiliary_loss_clip": 0.0645566, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06288448, + "balance_loss_mlp": 0.01255523, + "epoch": 0.3760108221854802, + "flos": 16440518050560.0, + "grad_norm": 2.3821241740713086, + "language_loss": 0.77027023, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.84752858, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.14648438, + "step": 6254, + "time_per_iteration": 2.545330047607422 + }, + { + "auxiliary_loss_clip": 0.06454843, + "auxiliary_loss_mlp": 0.01270718, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01254386, + "epoch": 0.37607094543814823, + "flos": 13776824280960.0, + "grad_norm": 2.2494955117694007, + "language_loss": 0.62504637, + "learning_rate": 2.869797092829169e-06, + "loss": 0.70230198, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.16333008, + "step": 6255, + "time_per_iteration": 3.937791109085083 + }, + { + "auxiliary_loss_clip": 0.06456207, + "auxiliary_loss_mlp": 0.0127009, + "balance_loss_clip": 0.06282066, + "balance_loss_mlp": 0.01253758, + "epoch": 0.3761310686908162, + "flos": 19862855487360.0, + "grad_norm": 2.2501042164391634, + "language_loss": 0.74801397, + "learning_rate": 2.869446374096135e-06, + "loss": 0.82527697, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16345215, + "step": 6256, + "time_per_iteration": 2.52768611907959 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01270671, + "balance_loss_clip": 0.06281887, + "balance_loss_mlp": 0.01254637, + "epoch": 0.37619119194348416, + "flos": 12755823880320.0, + "grad_norm": 1.8167076240371511, + "language_loss": 0.70818299, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.78545058, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16040039, + "step": 6257, + "time_per_iteration": 4.052328824996948 + }, + { + "auxiliary_loss_clip": 0.06452011, + "auxiliary_loss_mlp": 0.01268418, + "balance_loss_clip": 0.0628053, + "balance_loss_mlp": 0.01253743, + "epoch": 0.3762513151961521, + "flos": 17536387674240.0, + "grad_norm": 1.6926603581335775, + "language_loss": 0.85114312, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92834735, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14672852, + "step": 6258, + "time_per_iteration": 2.50252366065979 + }, + { + "auxiliary_loss_clip": 0.06455131, + "auxiliary_loss_mlp": 0.0127104, + "balance_loss_clip": 0.06282814, + "balance_loss_mlp": 0.01256503, + "epoch": 0.3763114384488201, + "flos": 23623215494400.0, + "grad_norm": 1.3678719492617617, + "language_loss": 0.81156051, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8888222, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14532471, + "step": 6259, + "time_per_iteration": 2.5430314540863037 + }, + { + "auxiliary_loss_clip": 0.06458686, + "auxiliary_loss_mlp": 0.01274293, + "balance_loss_clip": 0.06282908, + "balance_loss_mlp": 0.0125696, + "epoch": 0.37637156170148806, + "flos": 25413383249280.0, + "grad_norm": 1.809326583941318, + "language_loss": 0.71774137, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.79507113, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.17321777, + "step": 6260, + "time_per_iteration": 2.566267490386963 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.0127871, + "balance_loss_clip": 0.06279852, + "balance_loss_mlp": 0.01262128, + "epoch": 0.376431684954156, + "flos": 23447677190400.0, + "grad_norm": 1.8475234283885087, + "language_loss": 0.78925788, + "learning_rate": 2.867692286154594e-06, + "loss": 0.86660182, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.16589355, + "step": 6261, + "time_per_iteration": 2.5848124027252197 + }, + { + "auxiliary_loss_clip": 0.06455033, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06278862, + "balance_loss_mlp": 0.01257607, + "epoch": 0.376491808206824, + "flos": 34213099985280.0, + "grad_norm": 2.1653724604475255, + "language_loss": 0.80626601, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88354641, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15405273, + "step": 6262, + "time_per_iteration": 4.146479368209839 + }, + { + "auxiliary_loss_clip": 0.06453078, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 0.06282018, + "balance_loss_mlp": 0.01253799, + "epoch": 0.37655193145949195, + "flos": 35193793772160.0, + "grad_norm": 1.6953841761456194, + "language_loss": 0.81274903, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88996559, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14794922, + "step": 6263, + "time_per_iteration": 2.6529650688171387 + }, + { + "auxiliary_loss_clip": 0.06460523, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 0.06286405, + "balance_loss_mlp": 0.01261172, + "epoch": 0.3766120547121599, + "flos": 16767136465920.0, + "grad_norm": 1.8888627452248796, + "language_loss": 0.79794824, + "learning_rate": 2.866639438447501e-06, + "loss": 0.87531358, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14831543, + "step": 6264, + "time_per_iteration": 3.9715349674224854 + }, + { + "auxiliary_loss_clip": 0.06455237, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06284397, + "balance_loss_mlp": 0.0125396, + "epoch": 0.3766721779648279, + "flos": 23557150949760.0, + "grad_norm": 1.690336708132248, + "language_loss": 0.7363869, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.81363189, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6265, + "time_per_iteration": 2.5544657707214355 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06283864, + "balance_loss_mlp": 0.01262486, + "epoch": 0.37673230121749585, + "flos": 29136329608320.0, + "grad_norm": 1.6256668529315172, + "language_loss": 0.6925773, + "learning_rate": 2.865937375638654e-06, + "loss": 0.76985407, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1361084, + "step": 6266, + "time_per_iteration": 2.5735552310943604 + }, + { + "auxiliary_loss_clip": 0.06456051, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06279004, + "balance_loss_mlp": 0.01258825, + "epoch": 0.3767924244701638, + "flos": 28154210302080.0, + "grad_norm": 2.361518747365002, + "language_loss": 0.63358176, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.7108832, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15270996, + "step": 6267, + "time_per_iteration": 2.6408746242523193 + }, + { + "auxiliary_loss_clip": 0.0637848, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.0630175, + "balance_loss_mlp": 0.01263043, + "epoch": 0.37685254772283183, + "flos": 60815460343680.0, + "grad_norm": 0.7019670976586264, + "language_loss": 0.58932841, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.66576976, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02612305, + "step": 6268, + "time_per_iteration": 3.3041250705718994 + }, + { + "auxiliary_loss_clip": 0.06448595, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277184, + "balance_loss_mlp": 0.01256756, + "epoch": 0.3769126709754998, + "flos": 26039939754240.0, + "grad_norm": 1.4401012750228117, + "language_loss": 0.65166855, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72888005, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15795898, + "step": 6269, + "time_per_iteration": 2.654707670211792 + }, + { + "auxiliary_loss_clip": 0.06454687, + "auxiliary_loss_mlp": 0.01276662, + "balance_loss_clip": 0.06286559, + "balance_loss_mlp": 0.01261296, + "epoch": 0.37697279422816776, + "flos": 23585508357120.0, + "grad_norm": 1.4576669810179597, + "language_loss": 0.71144199, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.78875554, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15362549, + "step": 6270, + "time_per_iteration": 2.5369231700897217 + }, + { + "auxiliary_loss_clip": 0.06374384, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01266305, + "epoch": 0.3770329174808357, + "flos": 64766242753920.0, + "grad_norm": 0.6950430831807741, + "language_loss": 0.56232381, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.63876635, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03561401, + "step": 6271, + "time_per_iteration": 3.1599924564361572 + }, + { + "auxiliary_loss_clip": 0.06448443, + "auxiliary_loss_mlp": 0.01272708, + "balance_loss_clip": 0.06279441, + "balance_loss_mlp": 0.0125696, + "epoch": 0.3770930407335037, + "flos": 21841768563840.0, + "grad_norm": 1.6801171250404496, + "language_loss": 0.80461442, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.88182592, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1574707, + "step": 6272, + "time_per_iteration": 2.524846076965332 + }, + { + "auxiliary_loss_clip": 0.06450769, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06283743, + "balance_loss_mlp": 0.01258329, + "epoch": 0.37715316398617166, + "flos": 22754594943360.0, + "grad_norm": 1.6672783573066894, + "language_loss": 0.74972034, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82696146, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15026855, + "step": 6273, + "time_per_iteration": 2.5571129322052 + }, + { + "auxiliary_loss_clip": 0.06449255, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257148, + "epoch": 0.3772132872388396, + "flos": 18920246181120.0, + "grad_norm": 1.32773283576084, + "language_loss": 0.72241038, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79962015, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14569092, + "step": 6274, + "time_per_iteration": 2.4966516494750977 + }, + { + "auxiliary_loss_clip": 0.06454083, + "auxiliary_loss_mlp": 0.01271444, + "balance_loss_clip": 0.06282286, + "balance_loss_mlp": 0.01257467, + "epoch": 0.3772734104915076, + "flos": 17351709275520.0, + "grad_norm": 1.8983068498635614, + "language_loss": 0.84638643, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.92364168, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13983154, + "step": 6275, + "time_per_iteration": 2.534308910369873 + }, + { + "auxiliary_loss_clip": 0.06448515, + "auxiliary_loss_mlp": 0.01272502, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01258865, + "epoch": 0.37733353374417555, + "flos": 32350452848640.0, + "grad_norm": 1.3669254528099, + "language_loss": 0.75387293, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.83108306, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13641357, + "step": 6276, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.06453335, + "auxiliary_loss_mlp": 0.0127286, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.0125803, + "epoch": 0.3773936569968435, + "flos": 23366225422080.0, + "grad_norm": 1.9054341571687776, + "language_loss": 0.86016738, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93742937, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1484375, + "step": 6277, + "time_per_iteration": 2.6153500080108643 + }, + { + "auxiliary_loss_clip": 0.06448077, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.0628462, + "balance_loss_mlp": 0.01257488, + "epoch": 0.3774537802495115, + "flos": 21472579474560.0, + "grad_norm": 1.5956300393708251, + "language_loss": 0.78636366, + "learning_rate": 2.861722244253818e-06, + "loss": 0.86356354, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14428711, + "step": 6278, + "time_per_iteration": 2.564234495162964 + }, + { + "auxiliary_loss_clip": 0.06459187, + "auxiliary_loss_mlp": 0.01270608, + "balance_loss_clip": 0.06284142, + "balance_loss_mlp": 0.01255075, + "epoch": 0.37751390350217945, + "flos": 24980812945920.0, + "grad_norm": 1.8067410295121689, + "language_loss": 0.8371948, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.91449273, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.15527344, + "step": 6279, + "time_per_iteration": 2.6134567260742188 + }, + { + "auxiliary_loss_clip": 0.06454675, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.06282948, + "balance_loss_mlp": 0.01257117, + "epoch": 0.3775740267548474, + "flos": 27826585637760.0, + "grad_norm": 1.84994794715845, + "language_loss": 0.74995327, + "learning_rate": 2.861019264262269e-06, + "loss": 0.82721412, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1428833, + "step": 6280, + "time_per_iteration": 2.6029937267303467 + }, + { + "auxiliary_loss_clip": 0.06448464, + "auxiliary_loss_mlp": 0.01272763, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01259156, + "epoch": 0.3776341500075154, + "flos": 22571845188480.0, + "grad_norm": 1.3018494364650444, + "language_loss": 0.76205039, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.83926266, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13592529, + "step": 6281, + "time_per_iteration": 2.524489641189575 + }, + { + "auxiliary_loss_clip": 0.06448536, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.0125718, + "epoch": 0.3776942732601834, + "flos": 23084148251520.0, + "grad_norm": 1.5306913056637732, + "language_loss": 0.84658033, + "learning_rate": 2.860316153670974e-06, + "loss": 0.92377913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14160156, + "step": 6282, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.06449918, + "auxiliary_loss_mlp": 0.01269426, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.0125555, + "epoch": 0.37775439651285136, + "flos": 21730617722880.0, + "grad_norm": 1.840636786741823, + "language_loss": 0.70143461, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.77862805, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13879395, + "step": 6283, + "time_per_iteration": 2.555816411972046 + }, + { + "auxiliary_loss_clip": 0.06452499, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 0.06285429, + "balance_loss_mlp": 0.01259957, + "epoch": 0.37781451976551933, + "flos": 23994542862720.0, + "grad_norm": 1.743481736886233, + "language_loss": 0.76856482, + "learning_rate": 2.859612912586581e-06, + "loss": 0.8458361, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14648438, + "step": 6284, + "time_per_iteration": 2.560770034790039 + }, + { + "auxiliary_loss_clip": 0.06464045, + "auxiliary_loss_mlp": 0.01271283, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01254725, + "epoch": 0.3778746430181873, + "flos": 13731821838720.0, + "grad_norm": 2.746966655353194, + "language_loss": 0.85536617, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.93271947, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.16564941, + "step": 6285, + "time_per_iteration": 2.5006392002105713 + }, + { + "auxiliary_loss_clip": 0.06451872, + "auxiliary_loss_mlp": 0.01271139, + "balance_loss_clip": 0.06279811, + "balance_loss_mlp": 0.01256065, + "epoch": 0.37793476627085526, + "flos": 19466021750400.0, + "grad_norm": 1.7632018529100697, + "language_loss": 0.84913701, + "learning_rate": 2.858909541115758e-06, + "loss": 0.9263671, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1506958, + "step": 6286, + "time_per_iteration": 2.566092014312744 + }, + { + "auxiliary_loss_clip": 0.06452557, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06281806, + "balance_loss_mlp": 0.01254182, + "epoch": 0.3779948895235232, + "flos": 10711600945920.0, + "grad_norm": 1.9010574176879877, + "language_loss": 0.823708, + "learning_rate": 2.858557806518775e-06, + "loss": 0.90092808, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15258789, + "step": 6287, + "time_per_iteration": 2.4892444610595703 + }, + { + "auxiliary_loss_clip": 0.06454234, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01258408, + "epoch": 0.3780550127761912, + "flos": 22316616051840.0, + "grad_norm": 2.1030531862013584, + "language_loss": 0.7330361, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15679932, + "step": 6288, + "time_per_iteration": 2.5415592193603516 + }, + { + "auxiliary_loss_clip": 0.06453485, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01254359, + "epoch": 0.37811513602885916, + "flos": 28958401463040.0, + "grad_norm": 1.6277535048544236, + "language_loss": 0.75782627, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83505249, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14770508, + "step": 6289, + "time_per_iteration": 2.5579047203063965 + }, + { + "auxiliary_loss_clip": 0.06454412, + "auxiliary_loss_mlp": 0.01273518, + "balance_loss_clip": 0.06284275, + "balance_loss_mlp": 0.01257925, + "epoch": 0.3781752592815271, + "flos": 23119717109760.0, + "grad_norm": 1.945372772068441, + "language_loss": 0.74155736, + "learning_rate": 2.857502407441593e-06, + "loss": 0.81883669, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15588379, + "step": 6290, + "time_per_iteration": 2.5697786808013916 + }, + { + "auxiliary_loss_clip": 0.06458094, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06281058, + "balance_loss_mlp": 0.0125653, + "epoch": 0.3782353825341951, + "flos": 19762102552320.0, + "grad_norm": 2.4066647483264596, + "language_loss": 0.80529308, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.88260764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16833496, + "step": 6291, + "time_per_iteration": 2.4970998764038086 + }, + { + "auxiliary_loss_clip": 0.06456125, + "auxiliary_loss_mlp": 0.01270776, + "balance_loss_clip": 0.06283687, + "balance_loss_mlp": 0.01254933, + "epoch": 0.37829550578686305, + "flos": 22056774940800.0, + "grad_norm": 1.7419894192909393, + "language_loss": 0.76369846, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.84096742, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1583252, + "step": 6292, + "time_per_iteration": 2.572916030883789 + }, + { + "auxiliary_loss_clip": 0.06452248, + "auxiliary_loss_mlp": 0.0127064, + "balance_loss_clip": 0.06281239, + "balance_loss_mlp": 0.01255631, + "epoch": 0.378355629039531, + "flos": 16475667638400.0, + "grad_norm": 1.682972265329385, + "language_loss": 0.70006013, + "learning_rate": 2.856446715715224e-06, + "loss": 0.77728903, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.15014648, + "step": 6293, + "time_per_iteration": 2.5161240100860596 + }, + { + "auxiliary_loss_clip": 0.06449296, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06281447, + "balance_loss_mlp": 0.01255934, + "epoch": 0.378415752292199, + "flos": 19981050071040.0, + "grad_norm": 1.9898859900525039, + "language_loss": 0.7173214, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.79452682, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.15332031, + "step": 6294, + "time_per_iteration": 3.9304022789001465 + }, + { + "auxiliary_loss_clip": 0.06465693, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 0.06285857, + "balance_loss_mlp": 0.01264068, + "epoch": 0.378475875544867, + "flos": 14652614355840.0, + "grad_norm": 2.57033704665896, + "language_loss": 0.83215445, + "learning_rate": 2.855742758826011e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.15655518, + "step": 6295, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.06459963, + "auxiliary_loss_mlp": 0.01268811, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01253255, + "epoch": 0.37853599879753497, + "flos": 26658194705280.0, + "grad_norm": 1.6154959379599871, + "language_loss": 0.71442378, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.79171151, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15563965, + "step": 6296, + "time_per_iteration": 4.0578773021698 + }, + { + "auxiliary_loss_clip": 0.06454356, + "auxiliary_loss_mlp": 0.01274534, + "balance_loss_clip": 0.06287888, + "balance_loss_mlp": 0.01260455, + "epoch": 0.37859612205020293, + "flos": 17317817498880.0, + "grad_norm": 1.7695984237012152, + "language_loss": 0.77514613, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85243499, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14074707, + "step": 6297, + "time_per_iteration": 2.54968523979187 + }, + { + "auxiliary_loss_clip": 0.06462398, + "auxiliary_loss_mlp": 0.01275228, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01259481, + "epoch": 0.3786562453028709, + "flos": 18225780341760.0, + "grad_norm": 1.977165612519376, + "language_loss": 0.80132794, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87870419, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1574707, + "step": 6298, + "time_per_iteration": 2.5013349056243896 + }, + { + "auxiliary_loss_clip": 0.06454945, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06285203, + "balance_loss_mlp": 0.01255711, + "epoch": 0.37871636855553886, + "flos": 21221207625600.0, + "grad_norm": 1.480969598733767, + "language_loss": 0.8501091, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92736673, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15087891, + "step": 6299, + "time_per_iteration": 2.5749709606170654 + }, + { + "auxiliary_loss_clip": 0.06460874, + "auxiliary_loss_mlp": 0.01272586, + "balance_loss_clip": 0.06288288, + "balance_loss_mlp": 0.01256844, + "epoch": 0.3787764918082068, + "flos": 20957886570240.0, + "grad_norm": 2.4357425027716895, + "language_loss": 0.77022231, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.84755683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15740967, + "step": 6300, + "time_per_iteration": 2.521772623062134 + }, + { + "auxiliary_loss_clip": 0.06472084, + "auxiliary_loss_mlp": 0.01275415, + "balance_loss_clip": 0.06293886, + "balance_loss_mlp": 0.01258177, + "epoch": 0.3788366150608748, + "flos": 17313205524480.0, + "grad_norm": 1.8143586204861406, + "language_loss": 0.83141446, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.90888953, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 3.982780933380127 + }, + { + "auxiliary_loss_clip": 0.0646001, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06287184, + "balance_loss_mlp": 0.0125428, + "epoch": 0.37889673831354276, + "flos": 24317094355200.0, + "grad_norm": 1.8203378599779103, + "language_loss": 0.68096328, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.75826812, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.16186523, + "step": 6302, + "time_per_iteration": 2.5983002185821533 + }, + { + "auxiliary_loss_clip": 0.06455475, + "auxiliary_loss_mlp": 0.01270441, + "balance_loss_clip": 0.06284864, + "balance_loss_mlp": 0.01255718, + "epoch": 0.3789568615662107, + "flos": 26690157838080.0, + "grad_norm": 2.521279180058548, + "language_loss": 0.68357861, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76083779, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1472168, + "step": 6303, + "time_per_iteration": 2.5610175132751465 + }, + { + "auxiliary_loss_clip": 0.06458124, + "auxiliary_loss_mlp": 0.01272095, + "balance_loss_clip": 0.06285581, + "balance_loss_mlp": 0.01257265, + "epoch": 0.3790169848188787, + "flos": 23591713559040.0, + "grad_norm": 1.604251878296904, + "language_loss": 0.78095663, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.85825884, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14807129, + "step": 6304, + "time_per_iteration": 3.994072437286377 + }, + { + "auxiliary_loss_clip": 0.06468576, + "auxiliary_loss_mlp": 0.01269708, + "balance_loss_clip": 0.06292479, + "balance_loss_mlp": 0.01253806, + "epoch": 0.37907710807154665, + "flos": 18442547654400.0, + "grad_norm": 1.8924180649319282, + "language_loss": 0.80524492, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.88262779, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15881348, + "step": 6305, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.06370047, + "auxiliary_loss_mlp": 0.01262008, + "balance_loss_clip": 0.06291789, + "balance_loss_mlp": 0.01258527, + "epoch": 0.3791372313242146, + "flos": 50123690887680.0, + "grad_norm": 0.9538902579511545, + "language_loss": 0.64400995, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.72033048, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03491211, + "step": 6306, + "time_per_iteration": 3.106515645980835 + }, + { + "auxiliary_loss_clip": 0.06464424, + "auxiliary_loss_mlp": 0.01273174, + "balance_loss_clip": 0.06292081, + "balance_loss_mlp": 0.01257683, + "epoch": 0.3791973545768826, + "flos": 24323467265280.0, + "grad_norm": 1.5167178412192643, + "language_loss": 0.73534656, + "learning_rate": 2.851516295441817e-06, + "loss": 0.8127225, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15484619, + "step": 6307, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.06462627, + "auxiliary_loss_mlp": 0.01270499, + "balance_loss_clip": 0.06287986, + "balance_loss_mlp": 0.0125505, + "epoch": 0.3792574778295506, + "flos": 21586329792000.0, + "grad_norm": 1.8539993286062635, + "language_loss": 0.78603798, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86336923, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15441895, + "step": 6308, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.06459265, + "auxiliary_loss_mlp": 0.01272841, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01257028, + "epoch": 0.37931760108221857, + "flos": 22279202403840.0, + "grad_norm": 4.0253147283534, + "language_loss": 0.73503512, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.81235617, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.15814209, + "step": 6309, + "time_per_iteration": 2.539158344268799 + }, + { + "auxiliary_loss_clip": 0.06457806, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.0125963, + "epoch": 0.37937772433488653, + "flos": 19689161973120.0, + "grad_norm": 1.3654110952225158, + "language_loss": 0.79184294, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.86916614, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14886475, + "step": 6310, + "time_per_iteration": 2.4997847080230713 + }, + { + "auxiliary_loss_clip": 0.06457442, + "auxiliary_loss_mlp": 0.01268809, + "balance_loss_clip": 0.06285986, + "balance_loss_mlp": 0.01253586, + "epoch": 0.3794378475875545, + "flos": 19105469631360.0, + "grad_norm": 1.8573579951480166, + "language_loss": 0.76741791, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.84468043, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15222168, + "step": 6311, + "time_per_iteration": 2.5216546058654785 + }, + { + "auxiliary_loss_clip": 0.06457929, + "auxiliary_loss_mlp": 0.01276784, + "balance_loss_clip": 0.06287444, + "balance_loss_mlp": 0.01261746, + "epoch": 0.37949797084022246, + "flos": 20345920675200.0, + "grad_norm": 1.4012846072012495, + "language_loss": 0.71063423, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78798139, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15032959, + "step": 6312, + "time_per_iteration": 2.4909064769744873 + }, + { + "auxiliary_loss_clip": 0.06361144, + "auxiliary_loss_mlp": 0.01254908, + "balance_loss_clip": 0.06283364, + "balance_loss_mlp": 0.01251185, + "epoch": 0.37955809409289043, + "flos": 63991121760000.0, + "grad_norm": 0.7457914665340521, + "language_loss": 0.55941355, + "learning_rate": 2.849401318669608e-06, + "loss": 0.63557404, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03713989, + "step": 6313, + "time_per_iteration": 3.1312170028686523 + }, + { + "auxiliary_loss_clip": 0.06457204, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06285529, + "balance_loss_mlp": 0.01258211, + "epoch": 0.3796182173455584, + "flos": 31548777310080.0, + "grad_norm": 1.7202421351204062, + "language_loss": 0.71222353, + "learning_rate": 2.849048709730083e-06, + "loss": 0.78952008, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14233398, + "step": 6314, + "time_per_iteration": 2.5876691341400146 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.01270992, + "balance_loss_clip": 0.06290812, + "balance_loss_mlp": 0.01254922, + "epoch": 0.37967834059822636, + "flos": 12135766066560.0, + "grad_norm": 2.8019471516683985, + "language_loss": 0.74203241, + "learning_rate": 2.848696068594545e-06, + "loss": 0.81939626, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.16064453, + "step": 6315, + "time_per_iteration": 2.5312654972076416 + }, + { + "auxiliary_loss_clip": 0.06455735, + "auxiliary_loss_mlp": 0.01269414, + "balance_loss_clip": 0.0628659, + "balance_loss_mlp": 0.01253512, + "epoch": 0.3797384638508943, + "flos": 39357989331840.0, + "grad_norm": 5.544256779510487, + "language_loss": 0.7095021, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.78675354, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15905762, + "step": 6316, + "time_per_iteration": 2.642946481704712 + }, + { + "auxiliary_loss_clip": 0.06458603, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06288237, + "balance_loss_mlp": 0.01255991, + "epoch": 0.3797985871035623, + "flos": 34061852165760.0, + "grad_norm": 2.4477129072331656, + "language_loss": 0.65612113, + "learning_rate": 2.847990689788923e-06, + "loss": 0.7334165, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1496582, + "step": 6317, + "time_per_iteration": 2.634066104888916 + }, + { + "auxiliary_loss_clip": 0.0645286, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06285463, + "balance_loss_mlp": 0.0125702, + "epoch": 0.37985871035623026, + "flos": 23228939306880.0, + "grad_norm": 1.9893651635894969, + "language_loss": 0.86348939, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.94072783, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13964844, + "step": 6318, + "time_per_iteration": 2.50665545463562 + }, + { + "auxiliary_loss_clip": 0.06460046, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.06287004, + "balance_loss_mlp": 0.01257675, + "epoch": 0.3799188336088982, + "flos": 18121002410880.0, + "grad_norm": 2.356531700065532, + "language_loss": 0.76647675, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.84380764, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6319, + "time_per_iteration": 2.50382137298584 + }, + { + "auxiliary_loss_clip": 0.06453398, + "auxiliary_loss_mlp": 0.01272745, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01258082, + "epoch": 0.3799789568615662, + "flos": 21878385598080.0, + "grad_norm": 6.804259628026359, + "language_loss": 0.6451484, + "learning_rate": 2.846932380444744e-06, + "loss": 0.72240984, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14660645, + "step": 6320, + "time_per_iteration": 2.516150712966919 + }, + { + "auxiliary_loss_clip": 0.06456275, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06285265, + "balance_loss_mlp": 0.01252846, + "epoch": 0.3800390801142342, + "flos": 32971181495040.0, + "grad_norm": 1.7343317020382172, + "language_loss": 0.71855223, + "learning_rate": 2.846579546413992e-06, + "loss": 0.79579961, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.15612793, + "step": 6321, + "time_per_iteration": 2.6204988956451416 + }, + { + "auxiliary_loss_clip": 0.06458073, + "auxiliary_loss_mlp": 0.01268703, + "balance_loss_clip": 0.06285845, + "balance_loss_mlp": 0.01253784, + "epoch": 0.38009920336690217, + "flos": 26914430090880.0, + "grad_norm": 1.8398392312515923, + "language_loss": 0.75578612, + "learning_rate": 2.846226680280859e-06, + "loss": 0.83305389, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14923096, + "step": 6322, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01271033, + "balance_loss_clip": 0.06285781, + "balance_loss_mlp": 0.01256823, + "epoch": 0.38015932661957014, + "flos": 22494963467520.0, + "grad_norm": 1.8201003599281902, + "language_loss": 0.85709381, + "learning_rate": 2.845873782058725e-06, + "loss": 0.93435031, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14215088, + "step": 6323, + "time_per_iteration": 2.4927124977111816 + }, + { + "auxiliary_loss_clip": 0.06458908, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06286593, + "balance_loss_mlp": 0.01254596, + "epoch": 0.3802194498722381, + "flos": 21987440087040.0, + "grad_norm": 2.2452863694907426, + "language_loss": 0.73932886, + "learning_rate": 2.845520851760973e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.16027832, + "step": 6324, + "time_per_iteration": 2.4913861751556396 + }, + { + "auxiliary_loss_clip": 0.06464465, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06288414, + "balance_loss_mlp": 0.01257724, + "epoch": 0.38027957312490607, + "flos": 21331310290560.0, + "grad_norm": 1.7884051563809298, + "language_loss": 0.84122628, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.91860014, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15203857, + "step": 6325, + "time_per_iteration": 2.6119046211242676 + }, + { + "auxiliary_loss_clip": 0.06455745, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06285073, + "balance_loss_mlp": 0.01252712, + "epoch": 0.38033969637757403, + "flos": 16696921144320.0, + "grad_norm": 2.2200302984742915, + "language_loss": 0.79868543, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.87591028, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14019775, + "step": 6326, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06286497, + "balance_loss_mlp": 0.01255242, + "epoch": 0.380399819630242, + "flos": 36219741563520.0, + "grad_norm": 3.3742704435112025, + "language_loss": 0.73389304, + "learning_rate": 2.844461868547842e-06, + "loss": 0.81115204, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14978027, + "step": 6327, + "time_per_iteration": 2.649383783340454 + }, + { + "auxiliary_loss_clip": 0.06459647, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06290785, + "balance_loss_mlp": 0.01255145, + "epoch": 0.38045994288290996, + "flos": 21295364088960.0, + "grad_norm": 1.4936601975654378, + "language_loss": 0.83229524, + "learning_rate": 2.844108810081459e-06, + "loss": 0.90958202, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13867188, + "step": 6328, + "time_per_iteration": 2.527261972427368 + }, + { + "auxiliary_loss_clip": 0.06452741, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06281206, + "balance_loss_mlp": 0.01253755, + "epoch": 0.38052006613557793, + "flos": 20929151819520.0, + "grad_norm": 1.5056942690240434, + "language_loss": 0.61757982, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69479483, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.15008545, + "step": 6329, + "time_per_iteration": 2.54025936126709 + }, + { + "auxiliary_loss_clip": 0.0645529, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06283917, + "balance_loss_mlp": 0.01254037, + "epoch": 0.3805801893882459, + "flos": 20996138759040.0, + "grad_norm": 2.0488191193117316, + "language_loss": 0.56127822, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.63851297, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14160156, + "step": 6330, + "time_per_iteration": 2.4913628101348877 + }, + { + "auxiliary_loss_clip": 0.06449446, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06282543, + "balance_loss_mlp": 0.01255781, + "epoch": 0.38064031264091386, + "flos": 25565972734080.0, + "grad_norm": 1.4483276491856993, + "language_loss": 0.65912807, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73631942, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13903809, + "step": 6331, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.0645493, + "auxiliary_loss_mlp": 0.01269934, + "balance_loss_clip": 0.06284193, + "balance_loss_mlp": 0.01254312, + "epoch": 0.3807004358935818, + "flos": 15091264080000.0, + "grad_norm": 1.528944840420101, + "language_loss": 0.7597304, + "learning_rate": 2.842696256262919e-06, + "loss": 0.83697909, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15618896, + "step": 6332, + "time_per_iteration": 2.4808928966522217 + }, + { + "auxiliary_loss_clip": 0.06456427, + "auxiliary_loss_mlp": 0.01273089, + "balance_loss_clip": 0.06283183, + "balance_loss_mlp": 0.01257943, + "epoch": 0.3807605591462498, + "flos": 16405033046400.0, + "grad_norm": 2.2042220893600226, + "language_loss": 0.82397389, + "learning_rate": 2.842343037886987e-06, + "loss": 0.90126908, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15142822, + "step": 6333, + "time_per_iteration": 2.5033013820648193 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254655, + "epoch": 0.3808206823989178, + "flos": 29064353351040.0, + "grad_norm": 1.4831969327294916, + "language_loss": 0.86723578, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.9444741, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.14538574, + "step": 6334, + "time_per_iteration": 4.024240493774414 + }, + { + "auxiliary_loss_clip": 0.06455058, + "auxiliary_loss_mlp": 0.01270467, + "balance_loss_clip": 0.06282362, + "balance_loss_mlp": 0.01255155, + "epoch": 0.3808808056515858, + "flos": 15711321893760.0, + "grad_norm": 2.3448311359770795, + "language_loss": 0.79450226, + "learning_rate": 2.841636505323321e-06, + "loss": 0.87175757, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15301514, + "step": 6335, + "time_per_iteration": 2.4698357582092285 + }, + { + "auxiliary_loss_clip": 0.06453745, + "auxiliary_loss_mlp": 0.0127096, + "balance_loss_clip": 0.06281872, + "balance_loss_mlp": 0.0125517, + "epoch": 0.38094092890425374, + "flos": 20710917060480.0, + "grad_norm": 1.9128487431319638, + "language_loss": 0.72795898, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15802002, + "step": 6336, + "time_per_iteration": 3.9780919551849365 + }, + { + "auxiliary_loss_clip": 0.06449959, + "auxiliary_loss_mlp": 0.01267203, + "balance_loss_clip": 0.06281384, + "balance_loss_mlp": 0.01252826, + "epoch": 0.3810010521569217, + "flos": 20674258099200.0, + "grad_norm": 2.2277206975915362, + "language_loss": 0.69756234, + "learning_rate": 2.840929845099894e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14373779, + "step": 6337, + "time_per_iteration": 2.5475378036499023 + }, + { + "auxiliary_loss_clip": 0.06454941, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06282912, + "balance_loss_mlp": 0.012579, + "epoch": 0.38106117540958967, + "flos": 31834963330560.0, + "grad_norm": 1.987280020069696, + "language_loss": 0.64026022, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71754032, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1517334, + "step": 6338, + "time_per_iteration": 2.5795555114746094 + }, + { + "auxiliary_loss_clip": 0.06456137, + "auxiliary_loss_mlp": 0.01271603, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01255772, + "epoch": 0.38112129866225763, + "flos": 16907231692800.0, + "grad_norm": 1.6550535893348008, + "language_loss": 0.69685936, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.77413678, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15856934, + "step": 6339, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06281533, + "balance_loss_mlp": 0.01253913, + "epoch": 0.3811814219149256, + "flos": 20893624888320.0, + "grad_norm": 2.252585455539085, + "language_loss": 0.68345773, + "learning_rate": 2.839869615637177e-06, + "loss": 0.76065207, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13519287, + "step": 6340, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.06456652, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06282599, + "balance_loss_mlp": 0.01260083, + "epoch": 0.38124154516759357, + "flos": 16696418019840.0, + "grad_norm": 2.4997436549257754, + "language_loss": 0.89721388, + "learning_rate": 2.839516142102522e-06, + "loss": 0.97453463, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15332031, + "step": 6341, + "time_per_iteration": 4.08266806602478 + }, + { + "auxiliary_loss_clip": 0.06461132, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06284279, + "balance_loss_mlp": 0.01255427, + "epoch": 0.38130166842026153, + "flos": 19687946088960.0, + "grad_norm": 1.4891162994718032, + "language_loss": 0.75298452, + "learning_rate": 2.83916263673333e-06, + "loss": 0.83032143, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.17138672, + "step": 6342, + "time_per_iteration": 2.496697425842285 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.0125646, + "epoch": 0.3813617916729295, + "flos": 22204668597120.0, + "grad_norm": 1.7145643847071266, + "language_loss": 0.83785719, + "learning_rate": 2.838809099543007e-06, + "loss": 0.91510159, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14599609, + "step": 6343, + "time_per_iteration": 4.049302339553833 + }, + { + "auxiliary_loss_clip": 0.0645491, + "auxiliary_loss_mlp": 0.01269585, + "balance_loss_clip": 0.06281073, + "balance_loss_mlp": 0.01254905, + "epoch": 0.38142191492559746, + "flos": 19102576665600.0, + "grad_norm": 1.619462393744454, + "language_loss": 0.77529186, + "learning_rate": 2.838455530544959e-06, + "loss": 0.8525368, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14678955, + "step": 6344, + "time_per_iteration": 2.579394817352295 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01271203, + "balance_loss_clip": 0.06285504, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3814820381782654, + "flos": 24104645527680.0, + "grad_norm": 1.8871239884396722, + "language_loss": 0.74166036, + "learning_rate": 2.838101929752593e-06, + "loss": 0.81893921, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14587402, + "step": 6345, + "time_per_iteration": 2.5367093086242676 + }, + { + "auxiliary_loss_clip": 0.06457509, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01257765, + "epoch": 0.3815421614309334, + "flos": 15783927056640.0, + "grad_norm": 1.7118462514914357, + "language_loss": 0.69868183, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7759757, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14111328, + "step": 6346, + "time_per_iteration": 2.5815930366516113 + }, + { + "auxiliary_loss_clip": 0.06466204, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06290555, + "balance_loss_mlp": 0.01257236, + "epoch": 0.38160228468360136, + "flos": 19905593869440.0, + "grad_norm": 1.781545419456976, + "language_loss": 0.7611326, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.83852088, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15374756, + "step": 6347, + "time_per_iteration": 2.5027284622192383 + }, + { + "auxiliary_loss_clip": 0.06456521, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.012553, + "epoch": 0.3816624079362694, + "flos": 19287045429120.0, + "grad_norm": 1.488288802844173, + "language_loss": 0.75192666, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13702393, + "step": 6348, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01270391, + "balance_loss_clip": 0.06286097, + "balance_loss_mlp": 0.01256599, + "epoch": 0.38172253118893734, + "flos": 21183752050560.0, + "grad_norm": 1.729316797973715, + "language_loss": 0.88237411, + "learning_rate": 2.836687208908142e-06, + "loss": 0.95967764, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6349, + "time_per_iteration": 2.525542974472046 + }, + { + "auxiliary_loss_clip": 0.06453095, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06281723, + "balance_loss_mlp": 0.01255149, + "epoch": 0.3817826544416053, + "flos": 17534836373760.0, + "grad_norm": 1.7576595366031973, + "language_loss": 0.76939785, + "learning_rate": 2.836333449345341e-06, + "loss": 0.84662628, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14593506, + "step": 6350, + "time_per_iteration": 2.532376289367676 + }, + { + "auxiliary_loss_clip": 0.06458531, + "auxiliary_loss_mlp": 0.01273484, + "balance_loss_clip": 0.06286063, + "balance_loss_mlp": 0.01258231, + "epoch": 0.38184277769427327, + "flos": 16332176321280.0, + "grad_norm": 2.21296257119241, + "language_loss": 0.77054518, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.84786528, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15264893, + "step": 6351, + "time_per_iteration": 2.4930031299591064 + }, + { + "auxiliary_loss_clip": 0.06457832, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.0628476, + "balance_loss_mlp": 0.012577, + "epoch": 0.38190290094694124, + "flos": 30450937115520.0, + "grad_norm": 2.2550067272061254, + "language_loss": 0.74895489, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.82626581, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15563965, + "step": 6352, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.01270341, + "balance_loss_clip": 0.06283389, + "balance_loss_mlp": 0.0125659, + "epoch": 0.3819630241996092, + "flos": 14215138588800.0, + "grad_norm": 2.0554991668998777, + "language_loss": 0.63961715, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.71684647, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6353, + "time_per_iteration": 2.476759433746338 + }, + { + "auxiliary_loss_clip": 0.06456264, + "auxiliary_loss_mlp": 0.01279815, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01266112, + "epoch": 0.38202314745227717, + "flos": 25016717220480.0, + "grad_norm": 1.720129608989886, + "language_loss": 0.83556378, + "learning_rate": 2.834918094089816e-06, + "loss": 0.91292459, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13702393, + "step": 6354, + "time_per_iteration": 2.5726418495178223 + }, + { + "auxiliary_loss_clip": 0.06456912, + "auxiliary_loss_mlp": 0.01271961, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125911, + "epoch": 0.38208327070494513, + "flos": 20820935871360.0, + "grad_norm": 1.6482101436629937, + "language_loss": 0.81480742, + "learning_rate": 2.834564176091943e-06, + "loss": 0.89209616, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.12854004, + "step": 6355, + "time_per_iteration": 2.5225114822387695 + }, + { + "auxiliary_loss_clip": 0.06459523, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06289364, + "balance_loss_mlp": 0.01259179, + "epoch": 0.3821433939576131, + "flos": 22644282643200.0, + "grad_norm": 1.8808367718392982, + "language_loss": 0.75647783, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.83380532, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 6356, + "time_per_iteration": 2.5584537982940674 + }, + { + "auxiliary_loss_clip": 0.0646046, + "auxiliary_loss_mlp": 0.01272045, + "balance_loss_clip": 0.06287301, + "balance_loss_mlp": 0.01257645, + "epoch": 0.38220351721028106, + "flos": 26877100296960.0, + "grad_norm": 1.8976132208861074, + "language_loss": 0.82161039, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89893544, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14398193, + "step": 6357, + "time_per_iteration": 2.546190023422241 + }, + { + "auxiliary_loss_clip": 0.06463508, + "auxiliary_loss_mlp": 0.01275628, + "balance_loss_clip": 0.0629019, + "balance_loss_mlp": 0.01260035, + "epoch": 0.38226364046294903, + "flos": 23374149632640.0, + "grad_norm": 1.7334885634957151, + "language_loss": 0.78531659, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.86270791, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15612793, + "step": 6358, + "time_per_iteration": 2.5330071449279785 + }, + { + "auxiliary_loss_clip": 0.06462916, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01256086, + "epoch": 0.382323763715617, + "flos": 19652335303680.0, + "grad_norm": 1.9007754709735623, + "language_loss": 0.79191673, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.86925954, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15270996, + "step": 6359, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.06457044, + "auxiliary_loss_mlp": 0.01275796, + "balance_loss_clip": 0.06287733, + "balance_loss_mlp": 0.01261884, + "epoch": 0.38238388696828496, + "flos": 54136527575040.0, + "grad_norm": 1.6591220194179586, + "language_loss": 0.70001733, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.77734572, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13903809, + "step": 6360, + "time_per_iteration": 2.8067054748535156 + }, + { + "auxiliary_loss_clip": 0.06461466, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01262923, + "epoch": 0.382444010220953, + "flos": 24943105808640.0, + "grad_norm": 1.5737902616354833, + "language_loss": 0.79093289, + "learning_rate": 2.83244000399261e-06, + "loss": 0.86832535, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14849854, + "step": 6361, + "time_per_iteration": 2.558579683303833 + }, + { + "auxiliary_loss_clip": 0.0645285, + "auxiliary_loss_mlp": 0.01272146, + "balance_loss_clip": 0.06286099, + "balance_loss_mlp": 0.01257996, + "epoch": 0.38250413347362094, + "flos": 42346750216320.0, + "grad_norm": 1.4645255919949542, + "language_loss": 0.65580732, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73305726, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14154053, + "step": 6362, + "time_per_iteration": 2.709390878677368 + }, + { + "auxiliary_loss_clip": 0.06459438, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06287294, + "balance_loss_mlp": 0.01255415, + "epoch": 0.3825642567262889, + "flos": 16294720746240.0, + "grad_norm": 1.6166481183320216, + "language_loss": 0.8211807, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.89848268, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15332031, + "step": 6363, + "time_per_iteration": 2.468846559524536 + }, + { + "auxiliary_loss_clip": 0.06453779, + "auxiliary_loss_mlp": 0.01274743, + "balance_loss_clip": 0.06286556, + "balance_loss_mlp": 0.01259401, + "epoch": 0.3826243799789569, + "flos": 45664267795200.0, + "grad_norm": 1.6258867054195516, + "language_loss": 0.59107661, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.6683619, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15356445, + "step": 6364, + "time_per_iteration": 2.745589256286621 + }, + { + "auxiliary_loss_clip": 0.06465845, + "auxiliary_loss_mlp": 0.0127531, + "balance_loss_clip": 0.06290866, + "balance_loss_mlp": 0.01261058, + "epoch": 0.38268450323162484, + "flos": 25308647245440.0, + "grad_norm": 2.2940920681906873, + "language_loss": 0.6951021, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.77251363, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14257812, + "step": 6365, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.06461614, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06285347, + "balance_loss_mlp": 0.0125451, + "epoch": 0.3827446264842928, + "flos": 21842607104640.0, + "grad_norm": 2.2040506714686208, + "language_loss": 0.73211187, + "learning_rate": 2.830668992382758e-06, + "loss": 0.8094269, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15374756, + "step": 6366, + "time_per_iteration": 2.527252435684204 + }, + { + "auxiliary_loss_clip": 0.06455328, + "auxiliary_loss_mlp": 0.01270912, + "balance_loss_clip": 0.06284537, + "balance_loss_mlp": 0.0125703, + "epoch": 0.38280474973696077, + "flos": 25740924059520.0, + "grad_norm": 2.537372436592335, + "language_loss": 0.69208872, + "learning_rate": 2.830314695509902e-06, + "loss": 0.76935112, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13885498, + "step": 6367, + "time_per_iteration": 2.563174247741699 + }, + { + "auxiliary_loss_clip": 0.06445135, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06281811, + "balance_loss_mlp": 0.01253482, + "epoch": 0.38286487298962874, + "flos": 24902212216320.0, + "grad_norm": 2.529219827632029, + "language_loss": 0.64519894, + "learning_rate": 2.82996036715143e-06, + "loss": 0.72232389, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13897705, + "step": 6368, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.0644632, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01255111, + "epoch": 0.3829249962422967, + "flos": 28550457060480.0, + "grad_norm": 1.3073196657605344, + "language_loss": 0.68441451, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76156569, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13677979, + "step": 6369, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.06452611, + "auxiliary_loss_mlp": 0.01268713, + "balance_loss_clip": 0.0628352, + "balance_loss_mlp": 0.01254724, + "epoch": 0.38298511949496467, + "flos": 21477736500480.0, + "grad_norm": 1.6896603918496267, + "language_loss": 0.79100078, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86821401, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13995361, + "step": 6370, + "time_per_iteration": 2.5265746116638184 + }, + { + "auxiliary_loss_clip": 0.06451623, + "auxiliary_loss_mlp": 0.0127085, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01256265, + "epoch": 0.38304524274763263, + "flos": 31687027747200.0, + "grad_norm": 2.908092380852583, + "language_loss": 0.651667, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.72889173, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.06459577, + "auxiliary_loss_mlp": 0.01272301, + "balance_loss_clip": 0.06283382, + "balance_loss_mlp": 0.01257543, + "epoch": 0.3831053660003006, + "flos": 25082865619200.0, + "grad_norm": 2.362243450203488, + "language_loss": 0.73142469, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.80874348, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14746094, + "step": 6372, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01266707, + "balance_loss_clip": 0.06282556, + "balance_loss_mlp": 0.01252485, + "epoch": 0.38316548925296856, + "flos": 23265849830400.0, + "grad_norm": 1.5439174716844835, + "language_loss": 0.85255867, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92977273, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14221191, + "step": 6373, + "time_per_iteration": 4.056765794754028 + }, + { + "auxiliary_loss_clip": 0.0645606, + "auxiliary_loss_mlp": 0.01272183, + "balance_loss_clip": 0.06281903, + "balance_loss_mlp": 0.01257431, + "epoch": 0.3832256125056366, + "flos": 34432131358080.0, + "grad_norm": 8.29118461423438, + "language_loss": 0.75127506, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.82855743, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14758301, + "step": 6374, + "time_per_iteration": 2.739825963973999 + }, + { + "auxiliary_loss_clip": 0.06457414, + "auxiliary_loss_mlp": 0.01272454, + "balance_loss_clip": 0.0628335, + "balance_loss_mlp": 0.01258042, + "epoch": 0.38328573575830455, + "flos": 21769289182080.0, + "grad_norm": 1.9434329018980874, + "language_loss": 0.76033717, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.83763582, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14416504, + "step": 6375, + "time_per_iteration": 2.521092176437378 + }, + { + "auxiliary_loss_clip": 0.06457017, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06283681, + "balance_loss_mlp": 0.01252541, + "epoch": 0.3833458590109725, + "flos": 17385056000640.0, + "grad_norm": 2.081333613596134, + "language_loss": 0.73067588, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.80791855, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1472168, + "step": 6376, + "time_per_iteration": 3.913828134536743 + }, + { + "auxiliary_loss_clip": 0.06451094, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06282462, + "balance_loss_mlp": 0.01251294, + "epoch": 0.3834059822636405, + "flos": 29432326556160.0, + "grad_norm": 1.6469866452188906, + "language_loss": 0.68444526, + "learning_rate": 2.826769997289796e-06, + "loss": 0.76161826, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14916992, + "step": 6377, + "time_per_iteration": 2.552703857421875 + }, + { + "auxiliary_loss_clip": 0.0646103, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01253413, + "epoch": 0.38346610551630844, + "flos": 21477191448960.0, + "grad_norm": 1.937210921117629, + "language_loss": 0.73608565, + "learning_rate": 2.826415354814344e-06, + "loss": 0.8133859, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15582275, + "step": 6378, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.06455162, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283469, + "balance_loss_mlp": 0.01257661, + "epoch": 0.3835262287689764, + "flos": 27568253900160.0, + "grad_norm": 1.6187724503548255, + "language_loss": 0.69142127, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76869053, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14099121, + "step": 6379, + "time_per_iteration": 2.540184736251831 + }, + { + "auxiliary_loss_clip": 0.06449591, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01258209, + "epoch": 0.3835863520216444, + "flos": 15529201044480.0, + "grad_norm": 1.7677581121541173, + "language_loss": 0.8420229, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.91923743, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13659668, + "step": 6380, + "time_per_iteration": 3.9425628185272217 + }, + { + "auxiliary_loss_clip": 0.06454644, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06286694, + "balance_loss_mlp": 0.01255786, + "epoch": 0.38364647527431234, + "flos": 21910851855360.0, + "grad_norm": 1.4264464063638025, + "language_loss": 0.81255281, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.88980293, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14569092, + "step": 6381, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.06363897, + "auxiliary_loss_mlp": 0.0126892, + "balance_loss_clip": 0.06286111, + "balance_loss_mlp": 0.01265082, + "epoch": 0.3837065985269803, + "flos": 65553076120320.0, + "grad_norm": 0.8198763586735168, + "language_loss": 0.60085058, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.67717874, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 6382, + "time_per_iteration": 3.1118690967559814 + }, + { + "auxiliary_loss_clip": 0.06458844, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06285119, + "balance_loss_mlp": 0.01257375, + "epoch": 0.38376672177964827, + "flos": 28264103331840.0, + "grad_norm": 2.361672223919581, + "language_loss": 0.67004663, + "learning_rate": 2.824641672639794e-06, + "loss": 0.74736154, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15264893, + "step": 6383, + "time_per_iteration": 3.949587345123291 + }, + { + "auxiliary_loss_clip": 0.06458098, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285569, + "balance_loss_mlp": 0.01255919, + "epoch": 0.38382684503231623, + "flos": 20637641064960.0, + "grad_norm": 1.580160930907899, + "language_loss": 0.75169957, + "learning_rate": 2.824286842339587e-06, + "loss": 0.82898319, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14355469, + "step": 6384, + "time_per_iteration": 2.5578341484069824 + }, + { + "auxiliary_loss_clip": 0.0645394, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06286485, + "balance_loss_mlp": 0.01259819, + "epoch": 0.3838869682849842, + "flos": 19611274003200.0, + "grad_norm": 1.4416039952500834, + "language_loss": 0.76348937, + "learning_rate": 2.823931980782341e-06, + "loss": 0.84075809, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6385, + "time_per_iteration": 2.5225770473480225 + }, + { + "auxiliary_loss_clip": 0.06357871, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06280675, + "balance_loss_mlp": 0.01261296, + "epoch": 0.38394709153765216, + "flos": 56572202856960.0, + "grad_norm": 1.1093406194632214, + "language_loss": 0.67841589, + "learning_rate": 2.82357708798151e-06, + "loss": 0.75464916, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.04168701, + "step": 6386, + "time_per_iteration": 3.0481390953063965 + }, + { + "auxiliary_loss_clip": 0.06453113, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06286535, + "balance_loss_mlp": 0.01254777, + "epoch": 0.3840072147903202, + "flos": 15894323210880.0, + "grad_norm": 1.5665063027995272, + "language_loss": 0.72740716, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.80462623, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6387, + "time_per_iteration": 2.514692783355713 + }, + { + "auxiliary_loss_clip": 0.06447147, + "auxiliary_loss_mlp": 0.01275854, + "balance_loss_clip": 0.06283197, + "balance_loss_mlp": 0.0126187, + "epoch": 0.38406733804298815, + "flos": 28225180310400.0, + "grad_norm": 2.2869557055676095, + "language_loss": 0.81707162, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89430165, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13989258, + "step": 6388, + "time_per_iteration": 2.6592257022857666 + }, + { + "auxiliary_loss_clip": 0.06454118, + "auxiliary_loss_mlp": 0.01267752, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01253888, + "epoch": 0.3841274612956561, + "flos": 18229511848320.0, + "grad_norm": 1.6912658906890043, + "language_loss": 0.76762819, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84484684, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.13873291, + "step": 6389, + "time_per_iteration": 2.5315403938293457 + }, + { + "auxiliary_loss_clip": 0.06454799, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06281878, + "balance_loss_mlp": 0.01254847, + "epoch": 0.3841875845483241, + "flos": 19799138856960.0, + "grad_norm": 1.6723623276481432, + "language_loss": 0.76991975, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.84717548, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15905762, + "step": 6390, + "time_per_iteration": 2.5315029621124268 + }, + { + "auxiliary_loss_clip": 0.0646126, + "auxiliary_loss_mlp": 0.01271779, + "balance_loss_clip": 0.06286746, + "balance_loss_mlp": 0.01255572, + "epoch": 0.38424770780099204, + "flos": 29906670919680.0, + "grad_norm": 1.876202489708209, + "language_loss": 0.70321602, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1618042, + "step": 6391, + "time_per_iteration": 2.6110270023345947 + }, + { + "auxiliary_loss_clip": 0.06455616, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.06284156, + "balance_loss_mlp": 0.01258499, + "epoch": 0.38430783105366, + "flos": 20820013476480.0, + "grad_norm": 1.8135855175826887, + "language_loss": 0.83923954, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.91652524, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14459229, + "step": 6392, + "time_per_iteration": 2.5735576152801514 + }, + { + "auxiliary_loss_clip": 0.06461488, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01255162, + "epoch": 0.384367954306328, + "flos": 11003153627520.0, + "grad_norm": 1.9242234625767662, + "language_loss": 0.61454862, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.69185179, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13677979, + "step": 6393, + "time_per_iteration": 2.4626450538635254 + }, + { + "auxiliary_loss_clip": 0.06467697, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06290497, + "balance_loss_mlp": 0.01256071, + "epoch": 0.38442807755899594, + "flos": 25345096571520.0, + "grad_norm": 2.1306446802295325, + "language_loss": 0.71410203, + "learning_rate": 2.820736822421029e-06, + "loss": 0.79149896, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15905762, + "step": 6394, + "time_per_iteration": 2.5997071266174316 + }, + { + "auxiliary_loss_clip": 0.06463788, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0628664, + "balance_loss_mlp": 0.01254082, + "epoch": 0.3844882008116639, + "flos": 21076206935040.0, + "grad_norm": 1.9216116882295546, + "language_loss": 0.82087183, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.89820337, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1529541, + "step": 6395, + "time_per_iteration": 2.517411470413208 + }, + { + "auxiliary_loss_clip": 0.06460339, + "auxiliary_loss_mlp": 0.01275993, + "balance_loss_clip": 0.06287727, + "balance_loss_mlp": 0.01261831, + "epoch": 0.38454832406433187, + "flos": 17968287144960.0, + "grad_norm": 2.112818402600052, + "language_loss": 0.70801687, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.78538024, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14160156, + "step": 6396, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.06365301, + "auxiliary_loss_mlp": 0.01257609, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01253767, + "epoch": 0.38460844731699984, + "flos": 67946641925760.0, + "grad_norm": 0.873922952794391, + "language_loss": 0.59863293, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.67486203, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0383606, + "step": 6397, + "time_per_iteration": 3.206678628921509 + }, + { + "auxiliary_loss_clip": 0.06450997, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06284742, + "balance_loss_mlp": 0.0126187, + "epoch": 0.3846685705696678, + "flos": 25856267604480.0, + "grad_norm": 1.772406293141946, + "language_loss": 0.85227352, + "learning_rate": 2.819315942271794e-06, + "loss": 0.92954701, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14477539, + "step": 6398, + "time_per_iteration": 2.5761947631835938 + }, + { + "auxiliary_loss_clip": 0.06453151, + "auxiliary_loss_mlp": 0.01277177, + "balance_loss_clip": 0.06285614, + "balance_loss_mlp": 0.01262467, + "epoch": 0.38472869382233577, + "flos": 16295852776320.0, + "grad_norm": 2.386881726324987, + "language_loss": 0.80489028, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.88219357, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14715576, + "step": 6399, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.06455526, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06283697, + "balance_loss_mlp": 0.01263592, + "epoch": 0.38478881707500373, + "flos": 19358979759360.0, + "grad_norm": 1.8772073039605681, + "language_loss": 0.67565721, + "learning_rate": 2.818605315732038e-06, + "loss": 0.75300437, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15588379, + "step": 6400, + "time_per_iteration": 2.5162830352783203 + }, + { + "auxiliary_loss_clip": 0.06460319, + "auxiliary_loss_mlp": 0.01269914, + "balance_loss_clip": 0.06288355, + "balance_loss_mlp": 0.01255454, + "epoch": 0.38484894032767175, + "flos": 24867356117760.0, + "grad_norm": 1.6933093627789975, + "language_loss": 0.7382642, + "learning_rate": 2.81824995589303e-06, + "loss": 0.81556654, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14459229, + "step": 6401, + "time_per_iteration": 2.5274739265441895 + }, + { + "auxiliary_loss_clip": 0.06457724, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 0.06285743, + "balance_loss_mlp": 0.01262296, + "epoch": 0.3849090635803397, + "flos": 14507068613760.0, + "grad_norm": 1.836175131611194, + "language_loss": 0.72368169, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.80103827, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15637207, + "step": 6402, + "time_per_iteration": 2.509624481201172 + }, + { + "auxiliary_loss_clip": 0.06455728, + "auxiliary_loss_mlp": 0.01275333, + "balance_loss_clip": 0.06288305, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3849691868330077, + "flos": 18521903070720.0, + "grad_norm": 1.8063322577059318, + "language_loss": 0.83321881, + "learning_rate": 2.817539143144128e-06, + "loss": 0.91052943, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14147949, + "step": 6403, + "time_per_iteration": 2.469576835632324 + }, + { + "auxiliary_loss_clip": 0.06451748, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06283461, + "balance_loss_mlp": 0.01259813, + "epoch": 0.38502931008567565, + "flos": 21622821045120.0, + "grad_norm": 1.901744090638215, + "language_loss": 0.83685166, + "learning_rate": 2.817183690261189e-06, + "loss": 0.91411054, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14331055, + "step": 6404, + "time_per_iteration": 2.53399920463562 + }, + { + "auxiliary_loss_clip": 0.06460617, + "auxiliary_loss_mlp": 0.01279935, + "balance_loss_clip": 0.06287636, + "balance_loss_mlp": 0.01265844, + "epoch": 0.3850894333383436, + "flos": 25423152249600.0, + "grad_norm": 1.4804001380923333, + "language_loss": 0.70053053, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77793604, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14105225, + "step": 6405, + "time_per_iteration": 2.577394485473633 + }, + { + "auxiliary_loss_clip": 0.06446706, + "auxiliary_loss_mlp": 0.01276604, + "balance_loss_clip": 0.06280848, + "balance_loss_mlp": 0.01263628, + "epoch": 0.3851495565910116, + "flos": 20233721658240.0, + "grad_norm": 1.9002503642999313, + "language_loss": 0.7926501, + "learning_rate": 2.816472691545729e-06, + "loss": 0.86988324, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12976074, + "step": 6406, + "time_per_iteration": 2.491785764694214 + }, + { + "auxiliary_loss_clip": 0.06454885, + "auxiliary_loss_mlp": 0.01271692, + "balance_loss_clip": 0.06282916, + "balance_loss_mlp": 0.01256516, + "epoch": 0.38520967984367954, + "flos": 16514045608320.0, + "grad_norm": 2.2453520034380463, + "language_loss": 0.84628403, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.92354977, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1517334, + "step": 6407, + "time_per_iteration": 2.461927890777588 + }, + { + "auxiliary_loss_clip": 0.06351051, + "auxiliary_loss_mlp": 0.01274061, + "balance_loss_clip": 0.06273395, + "balance_loss_mlp": 0.01270625, + "epoch": 0.3852698030963475, + "flos": 61333088140800.0, + "grad_norm": 0.7518927461814024, + "language_loss": 0.64829391, + "learning_rate": 2.815761568987365e-06, + "loss": 0.72454506, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03445435, + "step": 6408, + "time_per_iteration": 3.195535659790039 + }, + { + "auxiliary_loss_clip": 0.06454469, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06283102, + "balance_loss_mlp": 0.01256383, + "epoch": 0.3853299263490155, + "flos": 22899595633920.0, + "grad_norm": 1.3862214198415879, + "language_loss": 0.73785079, + "learning_rate": 2.8154059613008e-06, + "loss": 0.8151083, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14904785, + "step": 6409, + "time_per_iteration": 2.5463829040527344 + }, + { + "auxiliary_loss_clip": 0.06465833, + "auxiliary_loss_mlp": 0.01272782, + "balance_loss_clip": 0.06287792, + "balance_loss_mlp": 0.01257667, + "epoch": 0.38539004960168344, + "flos": 20053655233920.0, + "grad_norm": 2.2638026574615076, + "language_loss": 0.70597708, + "learning_rate": 2.81505032269396e-06, + "loss": 0.78336322, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15100098, + "step": 6410, + "time_per_iteration": 2.4989383220672607 + }, + { + "auxiliary_loss_clip": 0.06347367, + "auxiliary_loss_mlp": 0.01259072, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01255689, + "epoch": 0.3854501728543514, + "flos": 68752971365760.0, + "grad_norm": 0.6472142759451909, + "language_loss": 0.6009953, + "learning_rate": 2.81469465318033e-06, + "loss": 0.67705965, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03390503, + "step": 6411, + "time_per_iteration": 3.221977472305298 + }, + { + "auxiliary_loss_clip": 0.06456396, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06285078, + "balance_loss_mlp": 0.01257266, + "epoch": 0.38551029610701937, + "flos": 20491214855040.0, + "grad_norm": 1.7976443608036217, + "language_loss": 0.78197634, + "learning_rate": 2.814338952773397e-06, + "loss": 0.85925543, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.14245605, + "step": 6412, + "time_per_iteration": 2.5103437900543213 + }, + { + "auxiliary_loss_clip": 0.06460511, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 0.06287103, + "balance_loss_mlp": 0.01255267, + "epoch": 0.38557041935968733, + "flos": 23477627825280.0, + "grad_norm": 1.8586112834781277, + "language_loss": 0.78031844, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.85764652, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.17041016, + "step": 6413, + "time_per_iteration": 3.933619499206543 + }, + { + "auxiliary_loss_clip": 0.06342902, + "auxiliary_loss_mlp": 0.01258937, + "balance_loss_clip": 0.06265719, + "balance_loss_mlp": 0.01255421, + "epoch": 0.38563054261235535, + "flos": 63984623068800.0, + "grad_norm": 0.7920557210391271, + "language_loss": 0.61310911, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6891275, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03527832, + "step": 6414, + "time_per_iteration": 3.063016891479492 + }, + { + "auxiliary_loss_clip": 0.06460327, + "auxiliary_loss_mlp": 0.0126994, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01255552, + "epoch": 0.3856906658650233, + "flos": 23994584789760.0, + "grad_norm": 1.981122511442252, + "language_loss": 0.78303337, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.86033607, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14379883, + "step": 6415, + "time_per_iteration": 3.915883779525757 + }, + { + "auxiliary_loss_clip": 0.06448652, + "auxiliary_loss_mlp": 0.0126708, + "balance_loss_clip": 0.06285002, + "balance_loss_mlp": 0.01253842, + "epoch": 0.3857507891176913, + "flos": 25014075816960.0, + "grad_norm": 1.7132059772930233, + "language_loss": 0.8030045, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.88016176, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13244629, + "step": 6416, + "time_per_iteration": 2.5699849128723145 + }, + { + "auxiliary_loss_clip": 0.06451176, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01256353, + "epoch": 0.38581091237035925, + "flos": 21542082036480.0, + "grad_norm": 1.7425936217489657, + "language_loss": 0.79650658, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.87372106, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13909912, + "step": 6417, + "time_per_iteration": 2.490114450454712 + }, + { + "auxiliary_loss_clip": 0.06448381, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01252602, + "epoch": 0.3858710356230272, + "flos": 17389584120960.0, + "grad_norm": 1.6880082960892822, + "language_loss": 0.80518526, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.88233447, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13922119, + "step": 6418, + "time_per_iteration": 2.5246312618255615 + }, + { + "auxiliary_loss_clip": 0.06443715, + "auxiliary_loss_mlp": 0.01268216, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01254662, + "epoch": 0.3859311588756952, + "flos": 20345836821120.0, + "grad_norm": 1.685120659988575, + "language_loss": 0.79909503, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.87621439, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13568115, + "step": 6419, + "time_per_iteration": 3.9288835525512695 + }, + { + "auxiliary_loss_clip": 0.06446663, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06280138, + "balance_loss_mlp": 0.01254745, + "epoch": 0.38599128212836314, + "flos": 26328054418560.0, + "grad_norm": 1.9252922162684358, + "language_loss": 0.67831242, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.75548029, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15362549, + "step": 6420, + "time_per_iteration": 2.5568132400512695 + }, + { + "auxiliary_loss_clip": 0.06447464, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06282772, + "balance_loss_mlp": 0.01260267, + "epoch": 0.3860514053810311, + "flos": 13559050719360.0, + "grad_norm": 1.8138727093850848, + "language_loss": 0.81903851, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89625287, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13690186, + "step": 6421, + "time_per_iteration": 2.6095190048217773 + }, + { + "auxiliary_loss_clip": 0.06448883, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01254654, + "epoch": 0.3861115286336991, + "flos": 20959689432960.0, + "grad_norm": 1.9472147710185277, + "language_loss": 0.72463268, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.80182374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15576172, + "step": 6422, + "time_per_iteration": 3.9032654762268066 + }, + { + "auxiliary_loss_clip": 0.06443937, + "auxiliary_loss_mlp": 0.01268443, + "balance_loss_clip": 0.06280221, + "balance_loss_mlp": 0.01254925, + "epoch": 0.38617165188636704, + "flos": 16368290231040.0, + "grad_norm": 1.6312257254810183, + "language_loss": 0.66935605, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.74647987, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13531494, + "step": 6423, + "time_per_iteration": 2.4858603477478027 + }, + { + "auxiliary_loss_clip": 0.06452656, + "auxiliary_loss_mlp": 0.01269446, + "balance_loss_clip": 0.06281117, + "balance_loss_mlp": 0.01254771, + "epoch": 0.386231775139035, + "flos": 34795828005120.0, + "grad_norm": 1.7836916741722195, + "language_loss": 0.69448572, + "learning_rate": 2.810068143123449e-06, + "loss": 0.77170676, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14685059, + "step": 6424, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.06446116, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 0.0628031, + "balance_loss_mlp": 0.0125616, + "epoch": 0.38629189839170297, + "flos": 21732672147840.0, + "grad_norm": 1.4876753960050375, + "language_loss": 0.72829968, + "learning_rate": 2.809712042331429e-06, + "loss": 0.80545902, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13677979, + "step": 6425, + "time_per_iteration": 2.520872116088867 + }, + { + "auxiliary_loss_clip": 0.06454374, + "auxiliary_loss_mlp": 0.01269159, + "balance_loss_clip": 0.06279134, + "balance_loss_mlp": 0.01254383, + "epoch": 0.38635202164437094, + "flos": 27930315392640.0, + "grad_norm": 3.253764220801107, + "language_loss": 0.8113848, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88862014, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14752197, + "step": 6426, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.06458677, + "auxiliary_loss_mlp": 0.01277199, + "balance_loss_clip": 0.06288534, + "balance_loss_mlp": 0.01261797, + "epoch": 0.38641214489703896, + "flos": 23593390640640.0, + "grad_norm": 1.9966810796758758, + "language_loss": 0.75299263, + "learning_rate": 2.80899974864781e-06, + "loss": 0.83035141, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15393066, + "step": 6427, + "time_per_iteration": 2.538494825363159 + }, + { + "auxiliary_loss_clip": 0.06449243, + "auxiliary_loss_mlp": 0.01269948, + "balance_loss_clip": 0.0627961, + "balance_loss_mlp": 0.01255512, + "epoch": 0.3864722681497069, + "flos": 12646224339840.0, + "grad_norm": 1.7399599530073546, + "language_loss": 0.70451963, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78171146, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 6428, + "time_per_iteration": 2.501620292663574 + }, + { + "auxiliary_loss_clip": 0.06450263, + "auxiliary_loss_mlp": 0.01273584, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.01259517, + "epoch": 0.3865323914023749, + "flos": 17604003519360.0, + "grad_norm": 1.9791686977360912, + "language_loss": 0.84605539, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.92329377, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14074707, + "step": 6429, + "time_per_iteration": 2.4769797325134277 + }, + { + "auxiliary_loss_clip": 0.06453393, + "auxiliary_loss_mlp": 0.01272687, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01258679, + "epoch": 0.38659251465504285, + "flos": 18484908693120.0, + "grad_norm": 1.8799663311521415, + "language_loss": 0.81149292, + "learning_rate": 2.807931078076015e-06, + "loss": 0.88875371, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13995361, + "step": 6430, + "time_per_iteration": 2.552243232727051 + }, + { + "auxiliary_loss_clip": 0.06342202, + "auxiliary_loss_mlp": 0.0126596, + "balance_loss_clip": 0.06266356, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3866526379077108, + "flos": 64186533480960.0, + "grad_norm": 0.7018569193916078, + "language_loss": 0.58841789, + "learning_rate": 2.807574793260416e-06, + "loss": 0.66449958, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03408813, + "step": 6431, + "time_per_iteration": 3.1865365505218506 + }, + { + "auxiliary_loss_clip": 0.06457522, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06283836, + "balance_loss_mlp": 0.01253464, + "epoch": 0.3867127611603788, + "flos": 14392857098880.0, + "grad_norm": 1.8389423140015868, + "language_loss": 0.79719216, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.87445116, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14910889, + "step": 6432, + "time_per_iteration": 2.5060834884643555 + }, + { + "auxiliary_loss_clip": 0.06456694, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 0.06279335, + "balance_loss_mlp": 0.01259217, + "epoch": 0.38677288441304675, + "flos": 20016870491520.0, + "grad_norm": 2.041684818915054, + "language_loss": 0.80982423, + "learning_rate": 2.806862131772779e-06, + "loss": 0.88713682, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15350342, + "step": 6433, + "time_per_iteration": 2.4978644847869873 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01268045, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01251725, + "epoch": 0.3868330076657147, + "flos": 22243465837440.0, + "grad_norm": 1.5518308416482827, + "language_loss": 0.71316475, + "learning_rate": 2.806505755127765e-06, + "loss": 0.79036534, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.16308594, + "step": 6434, + "time_per_iteration": 2.5623676776885986 + }, + { + "auxiliary_loss_clip": 0.06457677, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01254547, + "epoch": 0.3868931309183827, + "flos": 16733076981120.0, + "grad_norm": 1.5292505515468358, + "language_loss": 0.77740347, + "learning_rate": 2.806149347899972e-06, + "loss": 0.85467923, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15350342, + "step": 6435, + "time_per_iteration": 2.4930777549743652 + }, + { + "auxiliary_loss_clip": 0.06446007, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 0.0627854, + "balance_loss_mlp": 0.01257594, + "epoch": 0.38695325417105064, + "flos": 22681360874880.0, + "grad_norm": 2.334489182765127, + "language_loss": 0.79902756, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87621707, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15362549, + "step": 6436, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.06446151, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628051, + "balance_loss_mlp": 0.01255312, + "epoch": 0.3870133774237186, + "flos": 23118668933760.0, + "grad_norm": 1.736913277816888, + "language_loss": 0.77232099, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.84947503, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13934326, + "step": 6437, + "time_per_iteration": 2.6555299758911133 + }, + { + "auxiliary_loss_clip": 0.064465, + "auxiliary_loss_mlp": 0.01272869, + "balance_loss_clip": 0.06279578, + "balance_loss_mlp": 0.01259422, + "epoch": 0.3870735006763866, + "flos": 17681430291840.0, + "grad_norm": 2.573442514460841, + "language_loss": 0.81961322, + "learning_rate": 2.805079942855074e-06, + "loss": 0.89680696, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13452148, + "step": 6438, + "time_per_iteration": 2.55658221244812 + }, + { + "auxiliary_loss_clip": 0.06449786, + "auxiliary_loss_mlp": 0.01268651, + "balance_loss_clip": 0.06278464, + "balance_loss_mlp": 0.01253869, + "epoch": 0.38713362392905454, + "flos": 23302676499840.0, + "grad_norm": 1.3535213690135137, + "language_loss": 0.75684851, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83403289, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14782715, + "step": 6439, + "time_per_iteration": 2.5023999214172363 + }, + { + "auxiliary_loss_clip": 0.06452194, + "auxiliary_loss_mlp": 0.01275332, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.0126083, + "epoch": 0.38719374718172256, + "flos": 21037283913600.0, + "grad_norm": 2.8624272787557556, + "language_loss": 0.74227071, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81954598, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1449585, + "step": 6440, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.06454886, + "auxiliary_loss_mlp": 0.01272767, + "balance_loss_clip": 0.06279822, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3872538704343905, + "flos": 19615885977600.0, + "grad_norm": 1.8472167429080706, + "language_loss": 0.82205182, + "learning_rate": 2.804010263051774e-06, + "loss": 0.89932835, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15368652, + "step": 6441, + "time_per_iteration": 2.4829154014587402 + }, + { + "auxiliary_loss_clip": 0.06449816, + "auxiliary_loss_mlp": 0.01273448, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01258833, + "epoch": 0.3873139936870585, + "flos": 17535800695680.0, + "grad_norm": 2.061540845511299, + "language_loss": 0.80687004, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8841027, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14593506, + "step": 6442, + "time_per_iteration": 2.5348403453826904 + }, + { + "auxiliary_loss_clip": 0.0645024, + "auxiliary_loss_mlp": 0.01274941, + "balance_loss_clip": 0.0628161, + "balance_loss_mlp": 0.01260302, + "epoch": 0.38737411693972645, + "flos": 17792539205760.0, + "grad_norm": 1.5850563005203315, + "language_loss": 0.84242606, + "learning_rate": 2.803296990719624e-06, + "loss": 0.91967785, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14642334, + "step": 6443, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.06346577, + "auxiliary_loss_mlp": 0.01257136, + "balance_loss_clip": 0.06270638, + "balance_loss_mlp": 0.01253804, + "epoch": 0.3874342401923944, + "flos": 58320554624640.0, + "grad_norm": 0.7460963165264183, + "language_loss": 0.5025984, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.57863545, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03338623, + "step": 6444, + "time_per_iteration": 3.146993398666382 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01267857, + "balance_loss_clip": 0.0627708, + "balance_loss_mlp": 0.01254088, + "epoch": 0.3874943634450624, + "flos": 17717628055680.0, + "grad_norm": 1.4103476418524727, + "language_loss": 0.79081571, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86789179, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 6445, + "time_per_iteration": 2.4769954681396484 + }, + { + "auxiliary_loss_clip": 0.06442489, + "auxiliary_loss_mlp": 0.01275349, + "balance_loss_clip": 0.06277544, + "balance_loss_mlp": 0.01261497, + "epoch": 0.38755448669773035, + "flos": 19250889592320.0, + "grad_norm": 1.890349589911811, + "language_loss": 0.81530821, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.89248657, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13861084, + "step": 6446, + "time_per_iteration": 2.5224525928497314 + }, + { + "auxiliary_loss_clip": 0.06442682, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01262489, + "epoch": 0.3876146099503983, + "flos": 20600437052160.0, + "grad_norm": 2.019397578580159, + "language_loss": 0.77555805, + "learning_rate": 2.801870080630306e-06, + "loss": 0.85275555, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14575195, + "step": 6447, + "time_per_iteration": 2.4808783531188965 + }, + { + "auxiliary_loss_clip": 0.06441282, + "auxiliary_loss_mlp": 0.01273458, + "balance_loss_clip": 0.06277911, + "balance_loss_mlp": 0.01259355, + "epoch": 0.3876747332030663, + "flos": 19287129283200.0, + "grad_norm": 1.5926200346390118, + "language_loss": 0.76299512, + "learning_rate": 2.801513277056671e-06, + "loss": 0.84014249, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14099121, + "step": 6448, + "time_per_iteration": 2.532101631164551 + }, + { + "auxiliary_loss_clip": 0.06445228, + "auxiliary_loss_mlp": 0.01276025, + "balance_loss_clip": 0.06280892, + "balance_loss_mlp": 0.01262363, + "epoch": 0.38773485645573424, + "flos": 18950699940480.0, + "grad_norm": 1.5288018173805344, + "language_loss": 0.76734072, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.84455323, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13647461, + "step": 6449, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.06448871, + "auxiliary_loss_mlp": 0.01273884, + "balance_loss_clip": 0.0627744, + "balance_loss_mlp": 0.01258673, + "epoch": 0.3877949797084022, + "flos": 23077272216960.0, + "grad_norm": 1.7542495709483765, + "language_loss": 0.78832948, + "learning_rate": 2.800799578742542e-06, + "loss": 0.86555696, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15209961, + "step": 6450, + "time_per_iteration": 2.5662050247192383 + }, + { + "auxiliary_loss_clip": 0.06452119, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.06276712, + "balance_loss_mlp": 0.01261317, + "epoch": 0.3878551029610702, + "flos": 29103150591360.0, + "grad_norm": 2.1638461576043095, + "language_loss": 0.78188771, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8591727, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.15063477, + "step": 6451, + "time_per_iteration": 2.5734686851501465 + }, + { + "auxiliary_loss_clip": 0.06442447, + "auxiliary_loss_mlp": 0.01277813, + "balance_loss_clip": 0.06278168, + "balance_loss_mlp": 0.01263967, + "epoch": 0.38791522621373814, + "flos": 21002763231360.0, + "grad_norm": 1.7745661107883532, + "language_loss": 0.76657486, + "learning_rate": 2.800085758962812e-06, + "loss": 0.84377748, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13842773, + "step": 6452, + "time_per_iteration": 4.083965301513672 + }, + { + "auxiliary_loss_clip": 0.06445795, + "auxiliary_loss_mlp": 0.01272941, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01258457, + "epoch": 0.3879753494664061, + "flos": 15492248593920.0, + "grad_norm": 1.5775897118958155, + "language_loss": 0.80075014, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87793756, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14483643, + "step": 6453, + "time_per_iteration": 2.5186924934387207 + }, + { + "auxiliary_loss_clip": 0.06452494, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06277925, + "balance_loss_mlp": 0.01258472, + "epoch": 0.3880354727190741, + "flos": 22060422593280.0, + "grad_norm": 1.7271767654368522, + "language_loss": 0.71748114, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.79473794, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14697266, + "step": 6454, + "time_per_iteration": 2.516023635864258 + }, + { + "auxiliary_loss_clip": 0.0645522, + "auxiliary_loss_mlp": 0.01280556, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01263986, + "epoch": 0.3880955959717421, + "flos": 20346675361920.0, + "grad_norm": 2.0562500360548452, + "language_loss": 0.77941358, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.85677135, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.16577148, + "step": 6455, + "time_per_iteration": 3.9251530170440674 + }, + { + "auxiliary_loss_clip": 0.0644723, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 0.062791, + "balance_loss_mlp": 0.01257804, + "epoch": 0.38815571922441006, + "flos": 23082009972480.0, + "grad_norm": 1.5355571660803105, + "language_loss": 0.76081556, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14196777, + "step": 6456, + "time_per_iteration": 2.5377979278564453 + }, + { + "auxiliary_loss_clip": 0.064498, + "auxiliary_loss_mlp": 0.01279611, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01264811, + "epoch": 0.388215842477078, + "flos": 20783186807040.0, + "grad_norm": 2.2521174172947838, + "language_loss": 0.60975528, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.68704933, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14801025, + "step": 6457, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.06447765, + "auxiliary_loss_mlp": 0.01274853, + "balance_loss_clip": 0.06275971, + "balance_loss_mlp": 0.01259308, + "epoch": 0.388275965729746, + "flos": 20454304331520.0, + "grad_norm": 3.4499577756661384, + "language_loss": 0.80527538, + "learning_rate": 2.797943571912841e-06, + "loss": 0.88250154, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15551758, + "step": 6458, + "time_per_iteration": 2.5349881649017334 + }, + { + "auxiliary_loss_clip": 0.06448271, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06278434, + "balance_loss_mlp": 0.0125938, + "epoch": 0.38833608898241395, + "flos": 27899945487360.0, + "grad_norm": 3.532155031934189, + "language_loss": 0.8156774, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89290321, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14941406, + "step": 6459, + "time_per_iteration": 4.015187978744507 + }, + { + "auxiliary_loss_clip": 0.0644253, + "auxiliary_loss_mlp": 0.01277266, + "balance_loss_clip": 0.06278129, + "balance_loss_mlp": 0.01263789, + "epoch": 0.3883962122350819, + "flos": 18082079389440.0, + "grad_norm": 1.6405749509561738, + "language_loss": 0.62564123, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.7028392, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13470459, + "step": 6460, + "time_per_iteration": 2.497053861618042 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01273315, + "balance_loss_clip": 0.06277992, + "balance_loss_mlp": 0.01259374, + "epoch": 0.3884563354877499, + "flos": 23628875644800.0, + "grad_norm": 1.560750838950793, + "language_loss": 0.86785483, + "learning_rate": 2.796872069720717e-06, + "loss": 0.94503951, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1394043, + "step": 6461, + "time_per_iteration": 2.5308427810668945 + }, + { + "auxiliary_loss_clip": 0.06442384, + "auxiliary_loss_mlp": 0.01273139, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01258369, + "epoch": 0.38851645874041785, + "flos": 27460834565760.0, + "grad_norm": 2.5738865735247285, + "language_loss": 0.71770304, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.79485828, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14782715, + "step": 6462, + "time_per_iteration": 3.942819833755493 + }, + { + "auxiliary_loss_clip": 0.06442184, + "auxiliary_loss_mlp": 0.01271045, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256036, + "epoch": 0.3885765819930858, + "flos": 25235035833600.0, + "grad_norm": 2.2250707690072886, + "language_loss": 0.76693827, + "learning_rate": 2.796157583816052e-06, + "loss": 0.84407055, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15014648, + "step": 6463, + "time_per_iteration": 2.577254056930542 + }, + { + "auxiliary_loss_clip": 0.06458563, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01259441, + "epoch": 0.3886367052457538, + "flos": 16952317989120.0, + "grad_norm": 2.5235079856597196, + "language_loss": 0.70838499, + "learning_rate": 2.795800295571382e-06, + "loss": 0.78572428, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15930176, + "step": 6464, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.06442419, + "auxiliary_loss_mlp": 0.01270994, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01255699, + "epoch": 0.38869682849842174, + "flos": 27160141789440.0, + "grad_norm": 1.8571499226781363, + "language_loss": 0.69473737, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77187151, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.15301514, + "step": 6465, + "time_per_iteration": 2.6060595512390137 + }, + { + "auxiliary_loss_clip": 0.06446355, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06276145, + "balance_loss_mlp": 0.01257271, + "epoch": 0.3887569517510897, + "flos": 21069037411200.0, + "grad_norm": 2.3078416168388243, + "language_loss": 0.78628361, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.86347771, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.15771484, + "step": 6466, + "time_per_iteration": 2.503218650817871 + }, + { + "auxiliary_loss_clip": 0.06447446, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01255, + "epoch": 0.38881707500375773, + "flos": 29505141354240.0, + "grad_norm": 1.7748655394270907, + "language_loss": 0.695912, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77307892, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1427002, + "step": 6467, + "time_per_iteration": 2.6156952381134033 + }, + { + "auxiliary_loss_clip": 0.0644877, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277345, + "balance_loss_mlp": 0.01255403, + "epoch": 0.3888771982564257, + "flos": 17493146167680.0, + "grad_norm": 2.2278384059050285, + "language_loss": 0.83988351, + "learning_rate": 2.794370840959936e-06, + "loss": 0.91706932, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14404297, + "step": 6468, + "time_per_iteration": 2.446979522705078 + }, + { + "auxiliary_loss_clip": 0.0644114, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06273733, + "balance_loss_mlp": 0.01254628, + "epoch": 0.38893732150909366, + "flos": 21948517065600.0, + "grad_norm": 2.4269891965149837, + "language_loss": 0.84667963, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92377871, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14141846, + "step": 6469, + "time_per_iteration": 2.6123251914978027 + }, + { + "auxiliary_loss_clip": 0.06445388, + "auxiliary_loss_mlp": 0.01267071, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01252575, + "epoch": 0.3889974447617616, + "flos": 24282657527040.0, + "grad_norm": 1.7885497899924685, + "language_loss": 0.75114912, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82827377, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 6470, + "time_per_iteration": 2.5293121337890625 + }, + { + "auxiliary_loss_clip": 0.06447375, + "auxiliary_loss_mlp": 0.01272376, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01257785, + "epoch": 0.3890575680144296, + "flos": 25674356390400.0, + "grad_norm": 2.975621998510204, + "language_loss": 0.75126278, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.8284604, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14575195, + "step": 6471, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.0644885, + "auxiliary_loss_mlp": 0.01268799, + "balance_loss_clip": 0.06277963, + "balance_loss_mlp": 0.01254291, + "epoch": 0.38911769126709755, + "flos": 22861636934400.0, + "grad_norm": 1.6871762941495017, + "language_loss": 0.68158531, + "learning_rate": 2.792940904386562e-06, + "loss": 0.75876176, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1451416, + "step": 6472, + "time_per_iteration": 2.5192203521728516 + }, + { + "auxiliary_loss_clip": 0.06449802, + "auxiliary_loss_mlp": 0.01271384, + "balance_loss_clip": 0.06278318, + "balance_loss_mlp": 0.01256739, + "epoch": 0.3891778145197655, + "flos": 25454612257920.0, + "grad_norm": 1.6537492711017865, + "language_loss": 0.76761287, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84482473, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14654541, + "step": 6473, + "time_per_iteration": 2.588179349899292 + }, + { + "auxiliary_loss_clip": 0.06451473, + "auxiliary_loss_mlp": 0.01269072, + "balance_loss_clip": 0.0627984, + "balance_loss_mlp": 0.01254803, + "epoch": 0.3892379377724335, + "flos": 14033227374720.0, + "grad_norm": 1.8453216957475485, + "language_loss": 0.71886337, + "learning_rate": 2.792225755635257e-06, + "loss": 0.79606879, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1427002, + "step": 6474, + "time_per_iteration": 2.5054657459259033 + }, + { + "auxiliary_loss_clip": 0.06452703, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.06280853, + "balance_loss_mlp": 0.01252945, + "epoch": 0.38929806102510145, + "flos": 20163715971840.0, + "grad_norm": 1.4152146042292184, + "language_loss": 0.68943882, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76663172, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1362915, + "step": 6475, + "time_per_iteration": 2.5646328926086426 + }, + { + "auxiliary_loss_clip": 0.06459899, + "auxiliary_loss_mlp": 0.01272247, + "balance_loss_clip": 0.06281739, + "balance_loss_mlp": 0.01257107, + "epoch": 0.3893581842777694, + "flos": 22170525258240.0, + "grad_norm": 1.7897820076570896, + "language_loss": 0.75474584, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15142822, + "step": 6476, + "time_per_iteration": 2.515145778656006 + }, + { + "auxiliary_loss_clip": 0.06356712, + "auxiliary_loss_mlp": 0.01262119, + "balance_loss_clip": 0.06275933, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3894183075304374, + "flos": 67322936459520.0, + "grad_norm": 0.7612569916112396, + "language_loss": 0.58157814, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.65776634, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.0276947, + "step": 6477, + "time_per_iteration": 3.147226572036743 + }, + { + "auxiliary_loss_clip": 0.06461065, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 0.06287047, + "balance_loss_mlp": 0.01258711, + "epoch": 0.38947843078310534, + "flos": 18552734173440.0, + "grad_norm": 2.207057593016708, + "language_loss": 0.77832031, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.85566759, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14953613, + "step": 6478, + "time_per_iteration": 2.5238850116729736 + }, + { + "auxiliary_loss_clip": 0.06450923, + "auxiliary_loss_mlp": 0.01273895, + "balance_loss_clip": 0.06281843, + "balance_loss_mlp": 0.01260162, + "epoch": 0.3895385540357733, + "flos": 14610253317120.0, + "grad_norm": 2.187508322407885, + "language_loss": 0.83306336, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6479, + "time_per_iteration": 2.5355920791625977 + }, + { + "auxiliary_loss_clip": 0.06451993, + "auxiliary_loss_mlp": 0.0126931, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.01254414, + "epoch": 0.38959867728844133, + "flos": 19981469341440.0, + "grad_norm": 1.7759645272954405, + "language_loss": 0.80297941, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8801924, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14892578, + "step": 6480, + "time_per_iteration": 2.51645565032959 + }, + { + "auxiliary_loss_clip": 0.06447603, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06278986, + "balance_loss_mlp": 0.01256924, + "epoch": 0.3896588005411093, + "flos": 22678342128000.0, + "grad_norm": 1.6438066173178132, + "language_loss": 0.83259583, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.90978175, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.140625, + "step": 6481, + "time_per_iteration": 2.542642116546631 + }, + { + "auxiliary_loss_clip": 0.06446713, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.0628217, + "balance_loss_mlp": 0.01257204, + "epoch": 0.38971892379377726, + "flos": 21002343960960.0, + "grad_norm": 1.5951406272778517, + "language_loss": 0.75640547, + "learning_rate": 2.789363960063863e-06, + "loss": 0.83357906, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13458252, + "step": 6482, + "time_per_iteration": 2.5500056743621826 + }, + { + "auxiliary_loss_clip": 0.06452929, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06281099, + "balance_loss_mlp": 0.01254853, + "epoch": 0.3897790470464452, + "flos": 22535060446080.0, + "grad_norm": 1.9197222218969183, + "language_loss": 0.78993875, + "learning_rate": 2.78900610077756e-06, + "loss": 0.86715591, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6483, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.06452915, + "auxiliary_loss_mlp": 0.01271475, + "balance_loss_clip": 0.06281908, + "balance_loss_mlp": 0.01256157, + "epoch": 0.3898391702991132, + "flos": 26216484307200.0, + "grad_norm": 1.4915682478636534, + "language_loss": 0.80430162, + "learning_rate": 2.788648211572067e-06, + "loss": 0.88154554, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6484, + "time_per_iteration": 2.582933187484741 + }, + { + "auxiliary_loss_clip": 0.06455952, + "auxiliary_loss_mlp": 0.01270999, + "balance_loss_clip": 0.06285131, + "balance_loss_mlp": 0.01255347, + "epoch": 0.38989929355178116, + "flos": 21071301471360.0, + "grad_norm": 1.959559170578303, + "language_loss": 0.7792083, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8564778, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15637207, + "step": 6485, + "time_per_iteration": 2.532944917678833 + }, + { + "auxiliary_loss_clip": 0.06453831, + "auxiliary_loss_mlp": 0.01268339, + "balance_loss_clip": 0.06280229, + "balance_loss_mlp": 0.01253444, + "epoch": 0.3899594168044491, + "flos": 25491229292160.0, + "grad_norm": 2.289645436499478, + "language_loss": 0.84979439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.92701602, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14898682, + "step": 6486, + "time_per_iteration": 2.5743820667266846 + }, + { + "auxiliary_loss_clip": 0.06453397, + "auxiliary_loss_mlp": 0.01267827, + "balance_loss_clip": 0.06278502, + "balance_loss_mlp": 0.01253141, + "epoch": 0.3900195400571171, + "flos": 31147415452800.0, + "grad_norm": 1.9273192838933928, + "language_loss": 0.85622168, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.93343389, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14672852, + "step": 6487, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.06449067, + "auxiliary_loss_mlp": 0.01273707, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01259121, + "epoch": 0.39007966330978505, + "flos": 20236111499520.0, + "grad_norm": 1.468779525903349, + "language_loss": 0.73436427, + "learning_rate": 2.787216355829633e-06, + "loss": 0.81159198, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14569092, + "step": 6488, + "time_per_iteration": 2.54925274848938 + }, + { + "auxiliary_loss_clip": 0.06455337, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06281433, + "balance_loss_mlp": 0.01255072, + "epoch": 0.390139786562453, + "flos": 22535353935360.0, + "grad_norm": 1.7339556546984902, + "language_loss": 0.68455738, + "learning_rate": 2.786858317231779e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 2.529337167739868 + }, + { + "auxiliary_loss_clip": 0.06445001, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01256079, + "epoch": 0.390199909815121, + "flos": 26440211508480.0, + "grad_norm": 1.5752653046558913, + "language_loss": 0.81221771, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88936543, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13690186, + "step": 6490, + "time_per_iteration": 2.580287218093872 + }, + { + "auxiliary_loss_clip": 0.06445351, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255784, + "epoch": 0.39026003306778895, + "flos": 17280278069760.0, + "grad_norm": 1.8612382479767444, + "language_loss": 0.89715946, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.97431856, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14782715, + "step": 6491, + "time_per_iteration": 2.476393461227417 + }, + { + "auxiliary_loss_clip": 0.06446734, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01256325, + "epoch": 0.3903201563204569, + "flos": 24539354110080.0, + "grad_norm": 1.7715634168525083, + "language_loss": 0.78570807, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.86288601, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 3.918022871017456 + }, + { + "auxiliary_loss_clip": 0.06448489, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06278895, + "balance_loss_mlp": 0.01255528, + "epoch": 0.39038027957312493, + "flos": 23774547168000.0, + "grad_norm": 1.9649032306705667, + "language_loss": 0.74995399, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.82713962, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14544678, + "step": 6493, + "time_per_iteration": 2.5337636470794678 + }, + { + "auxiliary_loss_clip": 0.06457585, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06280027, + "balance_loss_mlp": 0.0125341, + "epoch": 0.3904404028257929, + "flos": 14105832537600.0, + "grad_norm": 2.4323863844033498, + "language_loss": 0.76480663, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.84207416, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15771484, + "step": 6494, + "time_per_iteration": 3.9828202724456787 + }, + { + "auxiliary_loss_clip": 0.06461826, + "auxiliary_loss_mlp": 0.01272307, + "balance_loss_clip": 0.06279928, + "balance_loss_mlp": 0.01255582, + "epoch": 0.39050052607846086, + "flos": 16915742881920.0, + "grad_norm": 1.9306711407360488, + "language_loss": 0.74818373, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.82552505, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.16723633, + "step": 6495, + "time_per_iteration": 2.5104000568389893 + }, + { + "auxiliary_loss_clip": 0.06450078, + "auxiliary_loss_mlp": 0.01273142, + "balance_loss_clip": 0.06281738, + "balance_loss_mlp": 0.01257358, + "epoch": 0.39056064933112883, + "flos": 25921912878720.0, + "grad_norm": 2.748187950361319, + "language_loss": 0.68202364, + "learning_rate": 2.784351212350352e-06, + "loss": 0.75925589, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15783691, + "step": 6496, + "time_per_iteration": 2.550957202911377 + }, + { + "auxiliary_loss_clip": 0.0637021, + "auxiliary_loss_mlp": 0.01254222, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01251394, + "epoch": 0.3906207725837968, + "flos": 60046125281280.0, + "grad_norm": 0.6447698339715318, + "language_loss": 0.53706288, + "learning_rate": 2.783992935430775e-06, + "loss": 0.61330724, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02824402, + "step": 6497, + "time_per_iteration": 3.2988505363464355 + }, + { + "auxiliary_loss_clip": 0.06453034, + "auxiliary_loss_mlp": 0.01276113, + "balance_loss_clip": 0.06281406, + "balance_loss_mlp": 0.01261265, + "epoch": 0.39068089583646476, + "flos": 21074949123840.0, + "grad_norm": 2.0090604178847795, + "language_loss": 0.68947327, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.76676476, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14837646, + "step": 6498, + "time_per_iteration": 3.9722609519958496 + }, + { + "auxiliary_loss_clip": 0.06365327, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01252178, + "epoch": 0.3907410190891327, + "flos": 70468269897600.0, + "grad_norm": 0.719858085665683, + "language_loss": 0.51721394, + "learning_rate": 2.783276292417936e-06, + "loss": 0.59341711, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02807617, + "step": 6499, + "time_per_iteration": 3.209885835647583 + }, + { + "auxiliary_loss_clip": 0.06452541, + "auxiliary_loss_mlp": 0.01273785, + "balance_loss_clip": 0.06277416, + "balance_loss_mlp": 0.0125681, + "epoch": 0.3908011423418007, + "flos": 27969531903360.0, + "grad_norm": 1.5964691032272669, + "language_loss": 0.7347858, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81204903, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16992188, + "step": 6500, + "time_per_iteration": 2.5915534496307373 + }, + { + "auxiliary_loss_clip": 0.06456988, + "auxiliary_loss_mlp": 0.01269402, + "balance_loss_clip": 0.06284038, + "balance_loss_mlp": 0.01254728, + "epoch": 0.39086126559446865, + "flos": 24468971080320.0, + "grad_norm": 2.170342944486325, + "language_loss": 0.68858671, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7658506, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14691162, + "step": 6501, + "time_per_iteration": 3.948155164718628 + }, + { + "auxiliary_loss_clip": 0.06445958, + "auxiliary_loss_mlp": 0.01271431, + "balance_loss_clip": 0.06277448, + "balance_loss_mlp": 0.01256327, + "epoch": 0.3909213888471366, + "flos": 16946406276480.0, + "grad_norm": 1.631531331045391, + "language_loss": 0.78994954, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86712337, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15100098, + "step": 6502, + "time_per_iteration": 2.505021810531616 + }, + { + "auxiliary_loss_clip": 0.06451446, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.01259133, + "epoch": 0.3909815120998046, + "flos": 29286109981440.0, + "grad_norm": 4.8026818588998115, + "language_loss": 0.80286908, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.88011116, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13623047, + "step": 6503, + "time_per_iteration": 2.6041667461395264 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06278107, + "balance_loss_mlp": 0.01253574, + "epoch": 0.39104163535247255, + "flos": 18956947069440.0, + "grad_norm": 1.8714653526076386, + "language_loss": 0.71717298, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.79429626, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14379883, + "step": 6504, + "time_per_iteration": 2.499645471572876 + }, + { + "auxiliary_loss_clip": 0.06449269, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06279607, + "balance_loss_mlp": 0.0125379, + "epoch": 0.3911017586051405, + "flos": 26330611968000.0, + "grad_norm": 1.7094242767760466, + "language_loss": 0.83403468, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.91120219, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.137146, + "step": 6505, + "time_per_iteration": 2.5698060989379883 + }, + { + "auxiliary_loss_clip": 0.06447234, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06281015, + "balance_loss_mlp": 0.01253022, + "epoch": 0.3911618818578085, + "flos": 21842313615360.0, + "grad_norm": 2.3254017668705083, + "language_loss": 0.71427596, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.7914232, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14465332, + "step": 6506, + "time_per_iteration": 2.4988996982574463 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 0.0628104, + "balance_loss_mlp": 0.01258149, + "epoch": 0.3912220051104765, + "flos": 16364768359680.0, + "grad_norm": 2.639532414168514, + "language_loss": 0.75588799, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.83303547, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13348389, + "step": 6507, + "time_per_iteration": 2.506723403930664 + }, + { + "auxiliary_loss_clip": 0.06355534, + "auxiliary_loss_mlp": 0.01255368, + "balance_loss_clip": 0.0627788, + "balance_loss_mlp": 0.01252429, + "epoch": 0.39128212836314447, + "flos": 71071179552000.0, + "grad_norm": 0.751869236178363, + "language_loss": 0.56649405, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.64260316, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02935791, + "step": 6508, + "time_per_iteration": 3.282604455947876 + }, + { + "auxiliary_loss_clip": 0.06448714, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01254294, + "epoch": 0.39134225161581243, + "flos": 20336948288640.0, + "grad_norm": 1.8618605672003898, + "language_loss": 0.76758552, + "learning_rate": 2.779691297413471e-06, + "loss": 0.84475839, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14276123, + "step": 6509, + "time_per_iteration": 2.5330445766448975 + }, + { + "auxiliary_loss_clip": 0.0644654, + "auxiliary_loss_mlp": 0.01272023, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01256073, + "epoch": 0.3914023748684804, + "flos": 17023916903040.0, + "grad_norm": 3.0317271524647427, + "language_loss": 0.83418059, + "learning_rate": 2.779332635075825e-06, + "loss": 0.91136616, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1595459, + "step": 6510, + "time_per_iteration": 2.484149217605591 + }, + { + "auxiliary_loss_clip": 0.06450167, + "auxiliary_loss_mlp": 0.01268149, + "balance_loss_clip": 0.06277542, + "balance_loss_mlp": 0.01254463, + "epoch": 0.39146249812114836, + "flos": 18411045719040.0, + "grad_norm": 1.8343195842354416, + "language_loss": 0.77659726, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.85378045, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13684082, + "step": 6511, + "time_per_iteration": 2.493088722229004 + }, + { + "auxiliary_loss_clip": 0.06343137, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06266295, + "balance_loss_mlp": 0.01258513, + "epoch": 0.3915226213738163, + "flos": 67659659291520.0, + "grad_norm": 0.7080449531762238, + "language_loss": 0.57720256, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.65324628, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02726746, + "step": 6512, + "time_per_iteration": 3.217658042907715 + }, + { + "auxiliary_loss_clip": 0.06445479, + "auxiliary_loss_mlp": 0.01273045, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257452, + "epoch": 0.3915827446264843, + "flos": 26366516242560.0, + "grad_norm": 1.5252758876056967, + "language_loss": 0.69950658, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.77669179, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15600586, + "step": 6513, + "time_per_iteration": 2.560802936553955 + }, + { + "auxiliary_loss_clip": 0.06451759, + "auxiliary_loss_mlp": 0.01273121, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01258208, + "epoch": 0.39164286787915226, + "flos": 21950236074240.0, + "grad_norm": 2.7587511630204777, + "language_loss": 0.76322639, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.8404752, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14916992, + "step": 6514, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.0644438, + "auxiliary_loss_mlp": 0.01269565, + "balance_loss_clip": 0.06276566, + "balance_loss_mlp": 0.0125619, + "epoch": 0.3917029911318202, + "flos": 16405536170880.0, + "grad_norm": 1.811906351936664, + "language_loss": 0.782359, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.8594985, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13378906, + "step": 6515, + "time_per_iteration": 2.5104947090148926 + }, + { + "auxiliary_loss_clip": 0.06443886, + "auxiliary_loss_mlp": 0.01270163, + "balance_loss_clip": 0.06277545, + "balance_loss_mlp": 0.0125705, + "epoch": 0.3917631143844882, + "flos": 26218580659200.0, + "grad_norm": 1.4298617884300358, + "language_loss": 0.79790455, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.87504506, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13122559, + "step": 6516, + "time_per_iteration": 2.5912764072418213 + }, + { + "auxiliary_loss_clip": 0.06446922, + "auxiliary_loss_mlp": 0.0126951, + "balance_loss_clip": 0.06278265, + "balance_loss_mlp": 0.0125511, + "epoch": 0.39182323763715615, + "flos": 18553740422400.0, + "grad_norm": 1.8457537699229483, + "language_loss": 0.70234001, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7795043, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14404297, + "step": 6517, + "time_per_iteration": 2.630155324935913 + }, + { + "auxiliary_loss_clip": 0.06449963, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06279542, + "balance_loss_mlp": 0.01254905, + "epoch": 0.3918833608898241, + "flos": 34322112547200.0, + "grad_norm": 1.6944592538331644, + "language_loss": 0.72209281, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79928982, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1484375, + "step": 6518, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.06453219, + "auxiliary_loss_mlp": 0.0127268, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.0125751, + "epoch": 0.3919434841424921, + "flos": 36948434595840.0, + "grad_norm": 1.7409198797741048, + "language_loss": 0.62180024, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.69905925, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15179443, + "step": 6519, + "time_per_iteration": 2.6407580375671387 + }, + { + "auxiliary_loss_clip": 0.06457552, + "auxiliary_loss_mlp": 0.01269986, + "balance_loss_clip": 0.06280086, + "balance_loss_mlp": 0.01253535, + "epoch": 0.3920036073951601, + "flos": 23514915692160.0, + "grad_norm": 2.3243103288051485, + "language_loss": 0.6728406, + "learning_rate": 2.775744388563563e-06, + "loss": 0.75011599, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16442871, + "step": 6520, + "time_per_iteration": 2.557736396789551 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01272672, + "balance_loss_clip": 0.06281003, + "balance_loss_mlp": 0.0125845, + "epoch": 0.39206373064782807, + "flos": 18412051968000.0, + "grad_norm": 5.792319014223258, + "language_loss": 0.79119205, + "learning_rate": 2.775385401898104e-06, + "loss": 0.86843884, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14233398, + "step": 6521, + "time_per_iteration": 2.487144947052002 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06282392, + "balance_loss_mlp": 0.01255297, + "epoch": 0.39212385390049603, + "flos": 12318012696960.0, + "grad_norm": 2.63137671789129, + "language_loss": 0.70893902, + "learning_rate": 2.775026385829952e-06, + "loss": 0.78623831, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.16473389, + "step": 6522, + "time_per_iteration": 2.501777410507202 + }, + { + "auxiliary_loss_clip": 0.06455532, + "auxiliary_loss_mlp": 0.01272148, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.01257693, + "epoch": 0.392183977153164, + "flos": 19725275882880.0, + "grad_norm": 2.1277990565539087, + "language_loss": 0.77424598, + "learning_rate": 2.774667340372722e-06, + "loss": 0.8515228, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14453125, + "step": 6523, + "time_per_iteration": 2.494900941848755 + }, + { + "auxiliary_loss_clip": 0.0645543, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06282179, + "balance_loss_mlp": 0.01258769, + "epoch": 0.39224410040583196, + "flos": 33153092709120.0, + "grad_norm": 2.7826558407508855, + "language_loss": 0.62314886, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70043033, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13964844, + "step": 6524, + "time_per_iteration": 2.6380085945129395 + }, + { + "auxiliary_loss_clip": 0.06452876, + "auxiliary_loss_mlp": 0.01268165, + "balance_loss_clip": 0.06281661, + "balance_loss_mlp": 0.01252895, + "epoch": 0.39230422365849993, + "flos": 27789884749440.0, + "grad_norm": 1.7105729654368218, + "language_loss": 0.74638754, + "learning_rate": 2.773949161345489e-06, + "loss": 0.82359803, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15264893, + "step": 6525, + "time_per_iteration": 2.5430080890655518 + }, + { + "auxiliary_loss_clip": 0.06454577, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06280737, + "balance_loss_mlp": 0.01253863, + "epoch": 0.3923643469111679, + "flos": 17937497969280.0, + "grad_norm": 2.1060109606385673, + "language_loss": 0.8182255, + "learning_rate": 2.773590027802719e-06, + "loss": 0.89545369, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14367676, + "step": 6526, + "time_per_iteration": 2.4994354248046875 + }, + { + "auxiliary_loss_clip": 0.06454204, + "auxiliary_loss_mlp": 0.01269978, + "balance_loss_clip": 0.06281518, + "balance_loss_mlp": 0.01255482, + "epoch": 0.39242447016383586, + "flos": 24066141776640.0, + "grad_norm": 1.5927090967738864, + "language_loss": 0.70157206, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77881384, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14501953, + "step": 6527, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.06452368, + "auxiliary_loss_mlp": 0.01268854, + "balance_loss_clip": 0.06281934, + "balance_loss_mlp": 0.01254245, + "epoch": 0.3924845934165038, + "flos": 10667562825600.0, + "grad_norm": 3.256824520755738, + "language_loss": 0.82039493, + "learning_rate": 2.772871672726965e-06, + "loss": 0.89760715, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6528, + "time_per_iteration": 2.498852014541626 + }, + { + "auxiliary_loss_clip": 0.06450985, + "auxiliary_loss_mlp": 0.0127277, + "balance_loss_clip": 0.06284485, + "balance_loss_mlp": 0.01258048, + "epoch": 0.3925447166691718, + "flos": 31253493121920.0, + "grad_norm": 1.712128770360143, + "language_loss": 0.68666142, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14733887, + "step": 6529, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.06454393, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01252213, + "epoch": 0.39260483992183975, + "flos": 29421215890560.0, + "grad_norm": 2.512935177473184, + "language_loss": 0.80622673, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8834424, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14959717, + "step": 6530, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.06449011, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.0125252, + "epoch": 0.3926649631745077, + "flos": 22864571827200.0, + "grad_norm": 1.8446830755174628, + "language_loss": 0.76176864, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.83893287, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14892578, + "step": 6531, + "time_per_iteration": 3.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.06348795, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01253434, + "epoch": 0.3927250864271757, + "flos": 63911892124800.0, + "grad_norm": 0.7987882767963658, + "language_loss": 0.6030035, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.67905223, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.02648926, + "step": 6532, + "time_per_iteration": 3.023615598678589 + }, + { + "auxiliary_loss_clip": 0.06346735, + "auxiliary_loss_mlp": 0.01258162, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3927852096798437, + "flos": 68931486489600.0, + "grad_norm": 0.7618686105615924, + "language_loss": 0.55496854, + "learning_rate": 2.771075272396981e-06, + "loss": 0.63101745, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02720642, + "step": 6533, + "time_per_iteration": 3.2504148483276367 + }, + { + "auxiliary_loss_clip": 0.06452841, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06277935, + "balance_loss_mlp": 0.01254557, + "epoch": 0.39284533293251167, + "flos": 29723711529600.0, + "grad_norm": 1.823371664681604, + "language_loss": 0.76552856, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.84275657, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.15405273, + "step": 6534, + "time_per_iteration": 4.098775148391724 + }, + { + "auxiliary_loss_clip": 0.06459314, + "auxiliary_loss_mlp": 0.01269352, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.01253974, + "epoch": 0.39290545618517964, + "flos": 18558016980480.0, + "grad_norm": 2.2164588420846267, + "language_loss": 0.78656316, + "learning_rate": 2.770356507494851e-06, + "loss": 0.86384982, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15380859, + "step": 6535, + "time_per_iteration": 2.4923341274261475 + }, + { + "auxiliary_loss_clip": 0.06449763, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01253592, + "epoch": 0.3929655794378476, + "flos": 26256581285760.0, + "grad_norm": 2.2738959430224326, + "language_loss": 0.69076276, + "learning_rate": 2.769997081218978e-06, + "loss": 0.76792771, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1315918, + "step": 6536, + "time_per_iteration": 2.5980727672576904 + }, + { + "auxiliary_loss_clip": 0.06448898, + "auxiliary_loss_mlp": 0.0127095, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01257265, + "epoch": 0.39302570269051557, + "flos": 29285564929920.0, + "grad_norm": 1.8741537429596062, + "language_loss": 0.69716197, + "learning_rate": 2.769637625744738e-06, + "loss": 0.77436042, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13684082, + "step": 6537, + "time_per_iteration": 4.096014499664307 + }, + { + "auxiliary_loss_clip": 0.064602, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06288625, + "balance_loss_mlp": 0.01255432, + "epoch": 0.39308582594318353, + "flos": 17353134794880.0, + "grad_norm": 1.7942703591990323, + "language_loss": 0.79606509, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8733629, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14129639, + "step": 6538, + "time_per_iteration": 2.578815221786499 + }, + { + "auxiliary_loss_clip": 0.06359898, + "auxiliary_loss_mlp": 0.01255927, + "balance_loss_clip": 0.06283404, + "balance_loss_mlp": 0.0125297, + "epoch": 0.3931459491958515, + "flos": 61023884175360.0, + "grad_norm": 0.7947880980854773, + "language_loss": 0.61826062, + "learning_rate": 2.768918627255683e-06, + "loss": 0.69441885, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02955627, + "step": 6539, + "time_per_iteration": 2.9553403854370117 + }, + { + "auxiliary_loss_clip": 0.06458268, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06289513, + "balance_loss_mlp": 0.01257339, + "epoch": 0.39320607244851946, + "flos": 39024662590080.0, + "grad_norm": 2.4294685123961295, + "language_loss": 0.68263721, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.75994635, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15307617, + "step": 6540, + "time_per_iteration": 2.732541799545288 + }, + { + "auxiliary_loss_clip": 0.06455955, + "auxiliary_loss_mlp": 0.01271651, + "balance_loss_clip": 0.06287128, + "balance_loss_mlp": 0.0125613, + "epoch": 0.3932661957011874, + "flos": 24686451152640.0, + "grad_norm": 1.7600019176005988, + "language_loss": 0.72681171, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.80408776, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15527344, + "step": 6541, + "time_per_iteration": 4.03834342956543 + }, + { + "auxiliary_loss_clip": 0.06358681, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06282184, + "balance_loss_mlp": 0.01259297, + "epoch": 0.3933263189538554, + "flos": 70115614790400.0, + "grad_norm": 0.7938144397826515, + "language_loss": 0.60408866, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6802969, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02844238, + "step": 6542, + "time_per_iteration": 3.0015151500701904 + }, + { + "auxiliary_loss_clip": 0.06453243, + "auxiliary_loss_mlp": 0.01279318, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.01265305, + "epoch": 0.39338644220652336, + "flos": 22935583762560.0, + "grad_norm": 1.4413337304531033, + "language_loss": 0.82278919, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90011483, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14013672, + "step": 6543, + "time_per_iteration": 2.6289048194885254 + }, + { + "auxiliary_loss_clip": 0.06454003, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06284549, + "balance_loss_mlp": 0.01255768, + "epoch": 0.3934465654591913, + "flos": 30856282041600.0, + "grad_norm": 1.7408174737933344, + "language_loss": 0.69224536, + "learning_rate": 2.767120621015908e-06, + "loss": 0.76948798, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14489746, + "step": 6544, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01274712, + "balance_loss_clip": 0.06291823, + "balance_loss_mlp": 0.01258524, + "epoch": 0.3935066887118593, + "flos": 29243329672320.0, + "grad_norm": 2.0329338261061887, + "language_loss": 0.75462705, + "learning_rate": 2.76676093244553e-06, + "loss": 0.83203781, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1619873, + "step": 6545, + "time_per_iteration": 2.606234312057495 + }, + { + "auxiliary_loss_clip": 0.06446254, + "auxiliary_loss_mlp": 0.01275344, + "balance_loss_clip": 0.06285709, + "balance_loss_mlp": 0.01262309, + "epoch": 0.3935668119645273, + "flos": 19141290051840.0, + "grad_norm": 1.4467327313094591, + "language_loss": 0.75122333, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82843935, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13043213, + "step": 6546, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.06461848, + "auxiliary_loss_mlp": 0.01270617, + "balance_loss_clip": 0.06285486, + "balance_loss_mlp": 0.01254822, + "epoch": 0.3936269352171953, + "flos": 18522196560000.0, + "grad_norm": 2.187625212538507, + "language_loss": 0.82285661, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90018129, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15783691, + "step": 6547, + "time_per_iteration": 2.536921501159668 + }, + { + "auxiliary_loss_clip": 0.06454909, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01259685, + "epoch": 0.39368705846986324, + "flos": 15638255533440.0, + "grad_norm": 1.8611217813328955, + "language_loss": 0.84309554, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.92037535, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1340332, + "step": 6548, + "time_per_iteration": 2.586596727371216 + }, + { + "auxiliary_loss_clip": 0.06451154, + "auxiliary_loss_mlp": 0.01275141, + "balance_loss_clip": 0.06285168, + "balance_loss_mlp": 0.01261325, + "epoch": 0.3937471817225312, + "flos": 21332442320640.0, + "grad_norm": 1.5541020214417252, + "language_loss": 0.7306931, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.8079561, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13842773, + "step": 6549, + "time_per_iteration": 2.5176355838775635 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01258398, + "epoch": 0.39380730497519917, + "flos": 20782893317760.0, + "grad_norm": 1.443831260247086, + "language_loss": 0.77958995, + "learning_rate": 2.764962053731699e-06, + "loss": 0.85687554, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.16204834, + "step": 6550, + "time_per_iteration": 2.5665266513824463 + }, + { + "auxiliary_loss_clip": 0.06449334, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.0628082, + "balance_loss_mlp": 0.01254455, + "epoch": 0.39386742822786713, + "flos": 21615106469760.0, + "grad_norm": 1.5479702434138036, + "language_loss": 0.81395853, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.89113748, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14129639, + "step": 6551, + "time_per_iteration": 2.509472370147705 + }, + { + "auxiliary_loss_clip": 0.06452134, + "auxiliary_loss_mlp": 0.01274621, + "balance_loss_clip": 0.06282679, + "balance_loss_mlp": 0.01259791, + "epoch": 0.3939275514805351, + "flos": 12418304434560.0, + "grad_norm": 2.3772322810911892, + "language_loss": 0.80163503, + "learning_rate": 2.764242299098596e-06, + "loss": 0.87890255, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14825439, + "step": 6552, + "time_per_iteration": 2.512632369995117 + }, + { + "auxiliary_loss_clip": 0.06458388, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06285821, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39398767473320306, + "flos": 18558016980480.0, + "grad_norm": 1.9836463121020687, + "language_loss": 0.71468151, + "learning_rate": 2.763882378305003e-06, + "loss": 0.79198349, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14996338, + "step": 6553, + "time_per_iteration": 2.4973459243774414 + }, + { + "auxiliary_loss_clip": 0.06447914, + "auxiliary_loss_mlp": 0.01269169, + "balance_loss_clip": 0.06280744, + "balance_loss_mlp": 0.0125422, + "epoch": 0.39404779798587103, + "flos": 29315599418880.0, + "grad_norm": 1.8230931816174483, + "language_loss": 0.64176017, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71893102, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14941406, + "step": 6554, + "time_per_iteration": 2.6340816020965576 + }, + { + "auxiliary_loss_clip": 0.06448209, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281387, + "balance_loss_mlp": 0.0125561, + "epoch": 0.394107921238539, + "flos": 34905679107840.0, + "grad_norm": 1.8577413865682035, + "language_loss": 0.79801202, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8751896, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13934326, + "step": 6555, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.06451041, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06280783, + "balance_loss_mlp": 0.01252748, + "epoch": 0.39416804449120696, + "flos": 25088232280320.0, + "grad_norm": 1.8326733466575391, + "language_loss": 0.72028196, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.79746938, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1496582, + "step": 6556, + "time_per_iteration": 2.572880744934082 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01268731, + "balance_loss_clip": 0.06281175, + "balance_loss_mlp": 0.01254348, + "epoch": 0.3942281677438749, + "flos": 32314842063360.0, + "grad_norm": 2.2262653228658666, + "language_loss": 0.83903825, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.91621351, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14373779, + "step": 6557, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01272636, + "balance_loss_clip": 0.06281336, + "balance_loss_mlp": 0.01258671, + "epoch": 0.3942882909965429, + "flos": 24943608933120.0, + "grad_norm": 2.1784611950300605, + "language_loss": 0.80248392, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.87968874, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1395874, + "step": 6558, + "time_per_iteration": 2.5902092456817627 + }, + { + "auxiliary_loss_clip": 0.06445447, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01253816, + "epoch": 0.39434841424921085, + "flos": 11879614535040.0, + "grad_norm": 2.1357186014692546, + "language_loss": 0.71689725, + "learning_rate": 2.761722245724792e-06, + "loss": 0.79402852, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13873291, + "step": 6559, + "time_per_iteration": 2.4894917011260986 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628094, + "balance_loss_mlp": 0.01254622, + "epoch": 0.3944085375018789, + "flos": 16367032419840.0, + "grad_norm": 2.0841749511208705, + "language_loss": 0.81285572, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.89011705, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14630127, + "step": 6560, + "time_per_iteration": 2.522434711456299 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06282307, + "balance_loss_mlp": 0.01254078, + "epoch": 0.39446866075454684, + "flos": 10637821825920.0, + "grad_norm": 3.641985825462619, + "language_loss": 0.83127379, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.90848899, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15386963, + "step": 6561, + "time_per_iteration": 2.4804983139038086 + }, + { + "auxiliary_loss_clip": 0.06450383, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 0.06283262, + "balance_loss_mlp": 0.01257102, + "epoch": 0.3945287840072148, + "flos": 18193481792640.0, + "grad_norm": 2.043086634933395, + "language_loss": 0.80616236, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.88336933, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13208008, + "step": 6562, + "time_per_iteration": 2.5335006713867188 + }, + { + "auxiliary_loss_clip": 0.06448314, + "auxiliary_loss_mlp": 0.01268686, + "balance_loss_clip": 0.06283693, + "balance_loss_mlp": 0.01254476, + "epoch": 0.39458890725988277, + "flos": 23046650749440.0, + "grad_norm": 1.5717146465742573, + "language_loss": 0.81509531, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.89226532, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14215088, + "step": 6563, + "time_per_iteration": 2.5315918922424316 + }, + { + "auxiliary_loss_clip": 0.06453238, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.0628344, + "balance_loss_mlp": 0.0125608, + "epoch": 0.39464903051255074, + "flos": 17163718640640.0, + "grad_norm": 1.8608988788141587, + "language_loss": 0.70080984, + "learning_rate": 2.759921340790127e-06, + "loss": 0.77804577, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14257812, + "step": 6564, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.06449583, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06281252, + "balance_loss_mlp": 0.01254648, + "epoch": 0.3947091537652187, + "flos": 15894616700160.0, + "grad_norm": 2.288586168499947, + "language_loss": 0.83967394, + "learning_rate": 2.759561073299676e-06, + "loss": 0.91686368, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14746094, + "step": 6565, + "time_per_iteration": 2.5438666343688965 + }, + { + "auxiliary_loss_clip": 0.06447474, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06280743, + "balance_loss_mlp": 0.01255229, + "epoch": 0.39476927701788667, + "flos": 18550386259200.0, + "grad_norm": 2.0020652066074285, + "language_loss": 0.83519006, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.91235834, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14129639, + "step": 6566, + "time_per_iteration": 2.550548791885376 + }, + { + "auxiliary_loss_clip": 0.06459671, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06282969, + "balance_loss_mlp": 0.01255072, + "epoch": 0.39482940027055463, + "flos": 22282682348160.0, + "grad_norm": 1.770017298907609, + "language_loss": 0.77499187, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.85229909, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.15979004, + "step": 6567, + "time_per_iteration": 2.535980463027954 + }, + { + "auxiliary_loss_clip": 0.0644526, + "auxiliary_loss_mlp": 0.01270792, + "balance_loss_clip": 0.06283294, + "balance_loss_mlp": 0.01257851, + "epoch": 0.3948895235232226, + "flos": 14763010510080.0, + "grad_norm": 1.9280900707618294, + "language_loss": 0.80259991, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87976044, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.12945557, + "step": 6568, + "time_per_iteration": 2.56528639793396 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06283959, + "balance_loss_mlp": 0.01258356, + "epoch": 0.39494964677589056, + "flos": 22572474094080.0, + "grad_norm": 2.8189067544408166, + "language_loss": 0.84836519, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.9256081, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1451416, + "step": 6569, + "time_per_iteration": 2.512678623199463 + }, + { + "auxiliary_loss_clip": 0.06448043, + "auxiliary_loss_mlp": 0.01269688, + "balance_loss_clip": 0.06284526, + "balance_loss_mlp": 0.01255538, + "epoch": 0.3950097700285585, + "flos": 22969307831040.0, + "grad_norm": 1.7602858722639216, + "language_loss": 0.74665594, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.82383323, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.14147949, + "step": 6570, + "time_per_iteration": 2.611072063446045 + }, + { + "auxiliary_loss_clip": 0.06447589, + "auxiliary_loss_mlp": 0.01270515, + "balance_loss_clip": 0.06279834, + "balance_loss_mlp": 0.01256305, + "epoch": 0.3950698932812265, + "flos": 20601569082240.0, + "grad_norm": 1.9769080404363342, + "language_loss": 0.80472994, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88191104, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14196777, + "step": 6571, + "time_per_iteration": 4.037761688232422 + }, + { + "auxiliary_loss_clip": 0.06446905, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01257022, + "epoch": 0.39513001653389446, + "flos": 20381992657920.0, + "grad_norm": 1.599556952476494, + "language_loss": 0.78081018, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8579852, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13574219, + "step": 6572, + "time_per_iteration": 2.542388439178467 + }, + { + "auxiliary_loss_clip": 0.06450671, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06281148, + "balance_loss_mlp": 0.01253991, + "epoch": 0.3951901397865625, + "flos": 26469994435200.0, + "grad_norm": 1.9679034095416588, + "language_loss": 0.74861181, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.8258028, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14434814, + "step": 6573, + "time_per_iteration": 3.9954564571380615 + }, + { + "auxiliary_loss_clip": 0.06447303, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.0125492, + "epoch": 0.39525026303923044, + "flos": 43848845233920.0, + "grad_norm": 1.4348738267970096, + "language_loss": 0.67874503, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.75589502, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.12799072, + "step": 6574, + "time_per_iteration": 2.75056791305542 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01270185, + "balance_loss_clip": 0.06284595, + "balance_loss_mlp": 0.01255832, + "epoch": 0.3953103862918984, + "flos": 18046636312320.0, + "grad_norm": 3.0759560063082736, + "language_loss": 0.72770178, + "learning_rate": 2.755956816505072e-06, + "loss": 0.80492353, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14355469, + "step": 6575, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.06452627, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.0628259, + "balance_loss_mlp": 0.01256015, + "epoch": 0.3953705095445664, + "flos": 16980549615360.0, + "grad_norm": 2.3956956088423382, + "language_loss": 0.73929548, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.816526, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1439209, + "step": 6576, + "time_per_iteration": 2.4877238273620605 + }, + { + "auxiliary_loss_clip": 0.06453596, + "auxiliary_loss_mlp": 0.01269813, + "balance_loss_clip": 0.06286615, + "balance_loss_mlp": 0.0125704, + "epoch": 0.39543063279723434, + "flos": 17415300124800.0, + "grad_norm": 2.3089155525157397, + "language_loss": 0.8424108, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.91964483, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12786865, + "step": 6577, + "time_per_iteration": 3.9026546478271484 + }, + { + "auxiliary_loss_clip": 0.06447916, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06283568, + "balance_loss_mlp": 0.01255788, + "epoch": 0.3954907560499023, + "flos": 22790876561280.0, + "grad_norm": 2.6090797034217603, + "language_loss": 0.90399998, + "learning_rate": 2.75487497985853e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1338501, + "step": 6578, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.06451896, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06281315, + "balance_loss_mlp": 0.01254284, + "epoch": 0.39555087930257027, + "flos": 21950823052800.0, + "grad_norm": 1.8247592517251146, + "language_loss": 0.78543842, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.86265075, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15063477, + "step": 6579, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.06456701, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01258492, + "epoch": 0.39561100255523823, + "flos": 20409553451520.0, + "grad_norm": 2.1653293739232753, + "language_loss": 0.68659246, + "learning_rate": 2.754153612280037e-06, + "loss": 0.76389658, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15216064, + "step": 6580, + "time_per_iteration": 4.038321495056152 + }, + { + "auxiliary_loss_clip": 0.06448758, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3956711258079062, + "flos": 27972005598720.0, + "grad_norm": 1.867170796056586, + "language_loss": 0.58577931, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.6629765, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14318848, + "step": 6581, + "time_per_iteration": 2.618917942047119 + }, + { + "auxiliary_loss_clip": 0.0645448, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 0.06288571, + "balance_loss_mlp": 0.01256413, + "epoch": 0.39573124906057416, + "flos": 14433457201920.0, + "grad_norm": 2.002939068333409, + "language_loss": 0.69910431, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.77636254, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14929199, + "step": 6582, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.06451949, + "auxiliary_loss_mlp": 0.01273006, + "balance_loss_clip": 0.06283893, + "balance_loss_mlp": 0.01258546, + "epoch": 0.39579137231324213, + "flos": 18739592778240.0, + "grad_norm": 2.2302551557868457, + "language_loss": 0.76587689, + "learning_rate": 2.753071346464642e-06, + "loss": 0.84312642, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14453125, + "step": 6583, + "time_per_iteration": 2.5276317596435547 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.0127002, + "balance_loss_clip": 0.06284047, + "balance_loss_mlp": 0.01256562, + "epoch": 0.3958514955659101, + "flos": 17682268832640.0, + "grad_norm": 1.926047340176765, + "language_loss": 0.66262352, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.73984963, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13458252, + "step": 6584, + "time_per_iteration": 2.501209259033203 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01270923, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39591161881857806, + "flos": 29315850981120.0, + "grad_norm": 1.992954295318491, + "language_loss": 0.72398281, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.8012588, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 6585, + "time_per_iteration": 2.617694616317749 + }, + { + "auxiliary_loss_clip": 0.06457305, + "auxiliary_loss_mlp": 0.0127182, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01257336, + "epoch": 0.3959717420712461, + "flos": 25778295780480.0, + "grad_norm": 1.6889684303793513, + "language_loss": 0.73472714, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.81201839, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14477539, + "step": 6586, + "time_per_iteration": 2.565883159637451 + }, + { + "auxiliary_loss_clip": 0.06454571, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06286268, + "balance_loss_mlp": 0.01252969, + "epoch": 0.39603186532391405, + "flos": 20930199995520.0, + "grad_norm": 1.6150585752618039, + "language_loss": 0.71662915, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79384637, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14160156, + "step": 6587, + "time_per_iteration": 2.5788414478302 + }, + { + "auxiliary_loss_clip": 0.06362241, + "auxiliary_loss_mlp": 0.01254401, + "balance_loss_clip": 0.06286076, + "balance_loss_mlp": 0.01251419, + "epoch": 0.396091988576582, + "flos": 54897336720000.0, + "grad_norm": 0.8108180128275717, + "language_loss": 0.60705078, + "learning_rate": 2.751266999157285e-06, + "loss": 0.68321717, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.02980042, + "step": 6588, + "time_per_iteration": 2.973475217819214 + }, + { + "auxiliary_loss_clip": 0.06457016, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06285909, + "balance_loss_mlp": 0.01251873, + "epoch": 0.39615211182925, + "flos": 20708946489600.0, + "grad_norm": 1.752385405351709, + "language_loss": 0.81335068, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.89058518, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14575195, + "step": 6589, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.06456019, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06286196, + "balance_loss_mlp": 0.01254431, + "epoch": 0.39621223508191794, + "flos": 21000331463040.0, + "grad_norm": 1.8508577793480634, + "language_loss": 0.71167219, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7889303, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15368652, + "step": 6590, + "time_per_iteration": 2.5155017375946045 + }, + { + "auxiliary_loss_clip": 0.06451933, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06285245, + "balance_loss_mlp": 0.01253284, + "epoch": 0.3962723583345859, + "flos": 23375742860160.0, + "grad_norm": 1.6853348593397999, + "language_loss": 0.75984478, + "learning_rate": 2.750184048805956e-06, + "loss": 0.83702791, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13098145, + "step": 6591, + "time_per_iteration": 2.569958448410034 + }, + { + "auxiliary_loss_clip": 0.06454425, + "auxiliary_loss_mlp": 0.01268025, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01254215, + "epoch": 0.39633248158725387, + "flos": 25122040202880.0, + "grad_norm": 1.5542594066551045, + "language_loss": 0.78422546, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8614499, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13806152, + "step": 6592, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06448938, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.062861, + "balance_loss_mlp": 0.01256615, + "epoch": 0.39639260483992184, + "flos": 39797309888640.0, + "grad_norm": 1.716432087396327, + "language_loss": 0.69405383, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.77124685, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13751221, + "step": 6593, + "time_per_iteration": 2.742421865463257 + }, + { + "auxiliary_loss_clip": 0.06455009, + "auxiliary_loss_mlp": 0.01268833, + "balance_loss_clip": 0.06285039, + "balance_loss_mlp": 0.01253896, + "epoch": 0.3964527280925898, + "flos": 17352673597440.0, + "grad_norm": 2.6756229463225134, + "language_loss": 0.78082192, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.85806036, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14929199, + "step": 6594, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.06345355, + "auxiliary_loss_mlp": 0.01253278, + "balance_loss_clip": 0.06269702, + "balance_loss_mlp": 0.0125056, + "epoch": 0.39651285134525777, + "flos": 71739845533440.0, + "grad_norm": 0.9367359782969226, + "language_loss": 0.6293599, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.70534623, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.195411205291748 + }, + { + "auxiliary_loss_clip": 0.06455558, + "auxiliary_loss_mlp": 0.01273293, + "balance_loss_clip": 0.0628309, + "balance_loss_mlp": 0.0125714, + "epoch": 0.39657297459792573, + "flos": 25782823900800.0, + "grad_norm": 2.0629727816625656, + "language_loss": 0.63503623, + "learning_rate": 2.748378562795223e-06, + "loss": 0.71232474, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16149902, + "step": 6596, + "time_per_iteration": 2.564436197280884 + }, + { + "auxiliary_loss_clip": 0.06445512, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.0628349, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3966330978505937, + "flos": 20272267336320.0, + "grad_norm": 3.0845696935228646, + "language_loss": 0.79033494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.86749279, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.14202881, + "step": 6597, + "time_per_iteration": 2.5187220573425293 + }, + { + "auxiliary_loss_clip": 0.0645806, + "auxiliary_loss_mlp": 0.01272047, + "balance_loss_clip": 0.06285266, + "balance_loss_mlp": 0.01257259, + "epoch": 0.39669322110326166, + "flos": 20637431429760.0, + "grad_norm": 1.9127598273467419, + "language_loss": 0.67675543, + "learning_rate": 2.747656169644941e-06, + "loss": 0.75405657, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14776611, + "step": 6598, + "time_per_iteration": 2.5287654399871826 + }, + { + "auxiliary_loss_clip": 0.06448894, + "auxiliary_loss_mlp": 0.01270917, + "balance_loss_clip": 0.06280929, + "balance_loss_mlp": 0.01257643, + "epoch": 0.3967533443559297, + "flos": 21732546366720.0, + "grad_norm": 1.6941457063111416, + "language_loss": 0.79130334, + "learning_rate": 2.747294930536157e-06, + "loss": 0.86850142, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13269043, + "step": 6599, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.06447926, + "auxiliary_loss_mlp": 0.01270436, + "balance_loss_clip": 0.06279482, + "balance_loss_mlp": 0.01254289, + "epoch": 0.39681346760859765, + "flos": 25491271219200.0, + "grad_norm": 1.7355689440790156, + "language_loss": 0.72895992, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.80614352, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.16149902, + "step": 6600, + "time_per_iteration": 2.6141197681427 + }, + { + "auxiliary_loss_clip": 0.06448444, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06280382, + "balance_loss_mlp": 0.01261045, + "epoch": 0.3968735908612656, + "flos": 20965894634880.0, + "grad_norm": 1.918502465070546, + "language_loss": 0.85902363, + "learning_rate": 2.746572367319791e-06, + "loss": 0.9362576, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13909912, + "step": 6601, + "time_per_iteration": 2.539337396621704 + }, + { + "auxiliary_loss_clip": 0.06455625, + "auxiliary_loss_mlp": 0.01273924, + "balance_loss_clip": 0.06281834, + "balance_loss_mlp": 0.0125773, + "epoch": 0.3969337141139336, + "flos": 10711684800000.0, + "grad_norm": 2.4177834123100412, + "language_loss": 0.70406669, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.78136218, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16192627, + "step": 6602, + "time_per_iteration": 2.5344958305358887 + }, + { + "auxiliary_loss_clip": 0.06450728, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.01257583, + "epoch": 0.39699383736660154, + "flos": 17597924098560.0, + "grad_norm": 4.3880896635048865, + "language_loss": 0.84332073, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.92054927, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14550781, + "step": 6603, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.06445679, + "auxiliary_loss_mlp": 0.01276756, + "balance_loss_clip": 0.06278397, + "balance_loss_mlp": 0.01263017, + "epoch": 0.3970539606192695, + "flos": 17791826446080.0, + "grad_norm": 1.5258003920697418, + "language_loss": 0.7302916, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13751221, + "step": 6604, + "time_per_iteration": 2.525475025177002 + }, + { + "auxiliary_loss_clip": 0.06437713, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.0125609, + "epoch": 0.3971140838719375, + "flos": 24796260328320.0, + "grad_norm": 1.5312177971095886, + "language_loss": 0.82809514, + "learning_rate": 2.745126901275491e-06, + "loss": 0.90516913, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.13598633, + "step": 6605, + "time_per_iteration": 2.5601069927215576 + }, + { + "auxiliary_loss_clip": 0.06439412, + "auxiliary_loss_mlp": 0.01269635, + "balance_loss_clip": 0.06274941, + "balance_loss_mlp": 0.01256337, + "epoch": 0.39717420712460544, + "flos": 24250484759040.0, + "grad_norm": 1.721474173213711, + "language_loss": 0.74617773, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.82326818, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13293457, + "step": 6606, + "time_per_iteration": 2.570338726043701 + }, + { + "auxiliary_loss_clip": 0.06450282, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06279129, + "balance_loss_mlp": 0.01255343, + "epoch": 0.3972343303772734, + "flos": 25891752608640.0, + "grad_norm": 1.7826498780228273, + "language_loss": 0.74625784, + "learning_rate": 2.744403998666805e-06, + "loss": 0.8234452, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13122559, + "step": 6607, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.0628166, + "balance_loss_mlp": 0.01257366, + "epoch": 0.39729445362994137, + "flos": 45634107525120.0, + "grad_norm": 2.013518755058626, + "language_loss": 0.68503535, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76226741, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1427002, + "step": 6608, + "time_per_iteration": 2.814741611480713 + }, + { + "auxiliary_loss_clip": 0.06453016, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06280445, + "balance_loss_mlp": 0.01256496, + "epoch": 0.39735457688260933, + "flos": 20200249152000.0, + "grad_norm": 2.238404873213265, + "language_loss": 0.74168068, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.818919, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14318848, + "step": 6609, + "time_per_iteration": 2.549020767211914 + }, + { + "auxiliary_loss_clip": 0.06450722, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06281993, + "balance_loss_mlp": 0.0125424, + "epoch": 0.3974147001352773, + "flos": 23337868014720.0, + "grad_norm": 1.4758458837885644, + "language_loss": 0.71468556, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.79187685, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14154053, + "step": 6610, + "time_per_iteration": 3.985957622528076 + }, + { + "auxiliary_loss_clip": 0.06440872, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256559, + "epoch": 0.39747482338794526, + "flos": 21694965010560.0, + "grad_norm": 1.555692262156073, + "language_loss": 0.7854501, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.86256385, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13934326, + "step": 6611, + "time_per_iteration": 2.5972208976745605 + }, + { + "auxiliary_loss_clip": 0.06447503, + "auxiliary_loss_mlp": 0.01268941, + "balance_loss_clip": 0.06280762, + "balance_loss_mlp": 0.01255065, + "epoch": 0.3975349466406133, + "flos": 30995957998080.0, + "grad_norm": 2.19308398220208, + "language_loss": 0.79606485, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.87322932, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13891602, + "step": 6612, + "time_per_iteration": 2.6106274127960205 + }, + { + "auxiliary_loss_clip": 0.0634682, + "auxiliary_loss_mlp": 0.01253265, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01250469, + "epoch": 0.39759506989328125, + "flos": 63703426366080.0, + "grad_norm": 0.8245936024085626, + "language_loss": 0.6463905, + "learning_rate": 2.742234613810459e-06, + "loss": 0.72239137, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02796936, + "step": 6613, + "time_per_iteration": 4.473678112030029 + }, + { + "auxiliary_loss_clip": 0.06450668, + "auxiliary_loss_mlp": 0.01269678, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01255367, + "epoch": 0.3976551931459492, + "flos": 23702570910720.0, + "grad_norm": 2.448614415916545, + "language_loss": 0.72596258, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80316603, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14312744, + "step": 6614, + "time_per_iteration": 2.5691444873809814 + }, + { + "auxiliary_loss_clip": 0.06449673, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256051, + "epoch": 0.3977153163986172, + "flos": 15675166056960.0, + "grad_norm": 2.2284862441621995, + "language_loss": 0.81666011, + "learning_rate": 2.741511260213862e-06, + "loss": 0.89385748, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14013672, + "step": 6615, + "time_per_iteration": 2.55078387260437 + }, + { + "auxiliary_loss_clip": 0.06452717, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01255679, + "epoch": 0.39777543965128515, + "flos": 14070012117120.0, + "grad_norm": 1.96274897748641, + "language_loss": 0.67687142, + "learning_rate": 2.741149541231434e-06, + "loss": 0.75409389, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13842773, + "step": 6616, + "time_per_iteration": 2.533982992172241 + }, + { + "auxiliary_loss_clip": 0.06455097, + "auxiliary_loss_mlp": 0.0126897, + "balance_loss_clip": 0.06281532, + "balance_loss_mlp": 0.01253986, + "epoch": 0.3978355629039531, + "flos": 23374149632640.0, + "grad_norm": 2.1811174101900552, + "language_loss": 0.8396368, + "learning_rate": 2.740787794144541e-06, + "loss": 0.91687751, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14978027, + "step": 6617, + "time_per_iteration": 3.9742090702056885 + }, + { + "auxiliary_loss_clip": 0.06446042, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06283504, + "balance_loss_mlp": 0.01255556, + "epoch": 0.3978956861566211, + "flos": 19068852597120.0, + "grad_norm": 1.7253210008214133, + "language_loss": 0.73000187, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80714333, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12536621, + "step": 6618, + "time_per_iteration": 2.562913179397583 + }, + { + "auxiliary_loss_clip": 0.06454587, + "auxiliary_loss_mlp": 0.01274299, + "balance_loss_clip": 0.06285769, + "balance_loss_mlp": 0.01258576, + "epoch": 0.39795580940928904, + "flos": 30235679176320.0, + "grad_norm": 1.6365941861062427, + "language_loss": 0.65343797, + "learning_rate": 2.740064215712231e-06, + "loss": 0.73072684, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15710449, + "step": 6619, + "time_per_iteration": 2.598667860031128 + }, + { + "auxiliary_loss_clip": 0.06341819, + "auxiliary_loss_mlp": 0.01254465, + "balance_loss_clip": 0.06266081, + "balance_loss_mlp": 0.01251738, + "epoch": 0.398015932661957, + "flos": 69867261688320.0, + "grad_norm": 0.7579483566665592, + "language_loss": 0.582268, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.65823084, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02731323, + "step": 6620, + "time_per_iteration": 4.528149604797363 + }, + { + "auxiliary_loss_clip": 0.06446633, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06280729, + "balance_loss_mlp": 0.01256858, + "epoch": 0.39807605591462497, + "flos": 20164093315200.0, + "grad_norm": 1.5024608902652035, + "language_loss": 0.79499102, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87215811, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13232422, + "step": 6621, + "time_per_iteration": 2.559305191040039 + }, + { + "auxiliary_loss_clip": 0.06445563, + "auxiliary_loss_mlp": 0.01270989, + "balance_loss_clip": 0.06279579, + "balance_loss_mlp": 0.01257435, + "epoch": 0.39813617916729294, + "flos": 21148057411200.0, + "grad_norm": 1.7591122738615637, + "language_loss": 0.78347874, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86064428, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13568115, + "step": 6622, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.06444648, + "auxiliary_loss_mlp": 0.01270694, + "balance_loss_clip": 0.06278688, + "balance_loss_mlp": 0.01255948, + "epoch": 0.3981963024199609, + "flos": 18994318790400.0, + "grad_norm": 9.51473607747463, + "language_loss": 0.75430334, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83145678, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14733887, + "step": 6623, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.06449074, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01260551, + "epoch": 0.39825642567262887, + "flos": 16579648955520.0, + "grad_norm": 1.7143371951380526, + "language_loss": 0.79926246, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.87649894, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6624, + "time_per_iteration": 2.509500026702881 + }, + { + "auxiliary_loss_clip": 0.06454292, + "auxiliary_loss_mlp": 0.01269994, + "balance_loss_clip": 0.06280515, + "balance_loss_mlp": 0.01254234, + "epoch": 0.39831654892529683, + "flos": 22206303751680.0, + "grad_norm": 2.195062259081814, + "language_loss": 0.84314877, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.92039162, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15759277, + "step": 6625, + "time_per_iteration": 2.5617175102233887 + }, + { + "auxiliary_loss_clip": 0.06446299, + "auxiliary_loss_mlp": 0.01272387, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01258517, + "epoch": 0.39837667217796485, + "flos": 10492485719040.0, + "grad_norm": 1.8250293636172175, + "language_loss": 0.8709324, + "learning_rate": 2.737530807925321e-06, + "loss": 0.94811928, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13867188, + "step": 6626, + "time_per_iteration": 2.72031307220459 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01271086, + "balance_loss_clip": 0.0627908, + "balance_loss_mlp": 0.01256531, + "epoch": 0.3984367954306328, + "flos": 17970676986240.0, + "grad_norm": 2.760632977827581, + "language_loss": 0.84402627, + "learning_rate": 2.737168780548417e-06, + "loss": 0.9212113, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14575195, + "step": 6627, + "time_per_iteration": 2.6228654384613037 + }, + { + "auxiliary_loss_clip": 0.06445234, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.0627917, + "balance_loss_mlp": 0.01255443, + "epoch": 0.3984969186833008, + "flos": 22717684419840.0, + "grad_norm": 3.2429830324928095, + "language_loss": 0.83402491, + "learning_rate": 2.736806725217998e-06, + "loss": 0.91116416, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13250732, + "step": 6628, + "time_per_iteration": 2.6287484169006348 + }, + { + "auxiliary_loss_clip": 0.06449139, + "auxiliary_loss_mlp": 0.01271852, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01256981, + "epoch": 0.39855704193596875, + "flos": 23412779164800.0, + "grad_norm": 1.5731823007903518, + "language_loss": 0.71793973, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.79514968, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14868164, + "step": 6629, + "time_per_iteration": 2.5752875804901123 + }, + { + "auxiliary_loss_clip": 0.06441505, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06280406, + "balance_loss_mlp": 0.01254834, + "epoch": 0.3986171651886367, + "flos": 21258369711360.0, + "grad_norm": 2.035566678796665, + "language_loss": 0.80905473, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.88615453, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1362915, + "step": 6630, + "time_per_iteration": 2.5329513549804688 + }, + { + "auxiliary_loss_clip": 0.06445715, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06276714, + "balance_loss_mlp": 0.01255693, + "epoch": 0.3986772884413047, + "flos": 12463642293120.0, + "grad_norm": 2.1251751047068783, + "language_loss": 0.75146663, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.82862258, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14190674, + "step": 6631, + "time_per_iteration": 2.5500082969665527 + }, + { + "auxiliary_loss_clip": 0.06448178, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279311, + "balance_loss_mlp": 0.0125505, + "epoch": 0.39873741169397264, + "flos": 19652209522560.0, + "grad_norm": 1.6915315525927903, + "language_loss": 0.71496904, + "learning_rate": 2.735358224635783e-06, + "loss": 0.79214191, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.140625, + "step": 6632, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.06444843, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255955, + "epoch": 0.3987975349466406, + "flos": 21690436890240.0, + "grad_norm": 1.8116978167005697, + "language_loss": 0.75623924, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83338219, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13494873, + "step": 6633, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.06449188, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06280442, + "balance_loss_mlp": 0.0125846, + "epoch": 0.3988576581993086, + "flos": 23920721815680.0, + "grad_norm": 1.9002609831735993, + "language_loss": 0.81678545, + "learning_rate": 2.7346338069806e-06, + "loss": 0.89400202, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14001465, + "step": 6634, + "time_per_iteration": 2.539128065109253 + }, + { + "auxiliary_loss_clip": 0.06453361, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06283009, + "balance_loss_mlp": 0.01255449, + "epoch": 0.39891778145197654, + "flos": 18155690801280.0, + "grad_norm": 1.9946050359209588, + "language_loss": 0.7547667, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.83199799, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14306641, + "step": 6635, + "time_per_iteration": 2.5426242351531982 + }, + { + "auxiliary_loss_clip": 0.06468328, + "auxiliary_loss_mlp": 0.01272826, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01256053, + "epoch": 0.3989779047046445, + "flos": 22600831501440.0, + "grad_norm": 1.9740114535883675, + "language_loss": 0.66474432, + "learning_rate": 2.733909277895868e-06, + "loss": 0.74215585, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.16760254, + "step": 6636, + "time_per_iteration": 2.5290956497192383 + }, + { + "auxiliary_loss_clip": 0.06452767, + "auxiliary_loss_mlp": 0.01270258, + "balance_loss_clip": 0.06285115, + "balance_loss_mlp": 0.01255012, + "epoch": 0.39903802795731247, + "flos": 18083043711360.0, + "grad_norm": 1.6936131920640751, + "language_loss": 0.82211542, + "learning_rate": 2.733546971601763e-06, + "loss": 0.89934564, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.15246582, + "step": 6637, + "time_per_iteration": 2.516279458999634 + }, + { + "auxiliary_loss_clip": 0.06353697, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 0.06278069, + "balance_loss_mlp": 0.01250418, + "epoch": 0.39909815120998043, + "flos": 70463238652800.0, + "grad_norm": 0.7262189478909644, + "language_loss": 0.531524, + "learning_rate": 2.733184637491484e-06, + "loss": 0.60758889, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.0236969, + "step": 6638, + "time_per_iteration": 3.2179603576660156 + }, + { + "auxiliary_loss_clip": 0.06449973, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06279011, + "balance_loss_mlp": 0.0126304, + "epoch": 0.39915827446264845, + "flos": 18554788598400.0, + "grad_norm": 1.4980640352775056, + "language_loss": 0.75670731, + "learning_rate": 2.732822275578769e-06, + "loss": 0.83398449, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14715576, + "step": 6639, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.06442601, + "auxiliary_loss_mlp": 0.01272751, + "balance_loss_clip": 0.0627881, + "balance_loss_mlp": 0.01258249, + "epoch": 0.3992183977153164, + "flos": 29904826129920.0, + "grad_norm": 2.014095124557279, + "language_loss": 0.76376802, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84092152, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1451416, + "step": 6640, + "time_per_iteration": 2.642223834991455 + }, + { + "auxiliary_loss_clip": 0.06449724, + "auxiliary_loss_mlp": 0.01270265, + "balance_loss_clip": 0.06280393, + "balance_loss_mlp": 0.01255757, + "epoch": 0.3992785209679844, + "flos": 22571677480320.0, + "grad_norm": 2.238528881986372, + "language_loss": 0.8211664, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.89836633, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14501953, + "step": 6641, + "time_per_iteration": 2.530189275741577 + }, + { + "auxiliary_loss_clip": 0.06456075, + "auxiliary_loss_mlp": 0.01270045, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01254971, + "epoch": 0.39933864422065235, + "flos": 19688784629760.0, + "grad_norm": 1.8306704082742173, + "language_loss": 0.77208257, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.84934378, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15081787, + "step": 6642, + "time_per_iteration": 2.5495219230651855 + }, + { + "auxiliary_loss_clip": 0.06453043, + "auxiliary_loss_mlp": 0.01270555, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01255564, + "epoch": 0.3993987674733203, + "flos": 23045015594880.0, + "grad_norm": 2.242078242091602, + "language_loss": 0.72883618, + "learning_rate": 2.731372550178393e-06, + "loss": 0.80607212, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.14984131, + "step": 6643, + "time_per_iteration": 2.521857500076294 + }, + { + "auxiliary_loss_clip": 0.06456347, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01259317, + "epoch": 0.3994588907259883, + "flos": 19396896531840.0, + "grad_norm": 1.7649027305896348, + "language_loss": 0.66785717, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74516022, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14642334, + "step": 6644, + "time_per_iteration": 2.571690320968628 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.0127806, + "balance_loss_clip": 0.06282313, + "balance_loss_mlp": 0.01263737, + "epoch": 0.39951901397865625, + "flos": 13739326778880.0, + "grad_norm": 1.9095077452421072, + "language_loss": 0.78757256, + "learning_rate": 2.730647521020907e-06, + "loss": 0.86489946, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14312744, + "step": 6645, + "time_per_iteration": 2.499361753463745 + }, + { + "auxiliary_loss_clip": 0.06458238, + "auxiliary_loss_mlp": 0.01274341, + "balance_loss_clip": 0.06283879, + "balance_loss_mlp": 0.01259321, + "epoch": 0.3995791372313242, + "flos": 23593181005440.0, + "grad_norm": 1.5926569767996783, + "language_loss": 0.7044934, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78181922, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15026855, + "step": 6646, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.06456489, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06284152, + "balance_loss_mlp": 0.01257103, + "epoch": 0.3996392604839922, + "flos": 21361428633600.0, + "grad_norm": 2.2667385155288917, + "language_loss": 0.72035694, + "learning_rate": 2.729922381038513e-06, + "loss": 0.79763949, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14660645, + "step": 6647, + "time_per_iteration": 2.58251953125 + }, + { + "auxiliary_loss_clip": 0.06449988, + "auxiliary_loss_mlp": 0.01272061, + "balance_loss_clip": 0.06284988, + "balance_loss_mlp": 0.01257195, + "epoch": 0.39969938373666014, + "flos": 26039604337920.0, + "grad_norm": 1.4692875023338006, + "language_loss": 0.74830031, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.82552081, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14849854, + "step": 6648, + "time_per_iteration": 2.7020201683044434 + }, + { + "auxiliary_loss_clip": 0.06453955, + "auxiliary_loss_mlp": 0.0126884, + "balance_loss_clip": 0.06283584, + "balance_loss_mlp": 0.0125472, + "epoch": 0.3997595069893281, + "flos": 20121858057600.0, + "grad_norm": 2.0106261298514907, + "language_loss": 0.65986454, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.73709244, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14117432, + "step": 6649, + "time_per_iteration": 3.9323928356170654 + }, + { + "auxiliary_loss_clip": 0.06463098, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06290667, + "balance_loss_mlp": 0.01260774, + "epoch": 0.39981963024199607, + "flos": 27791016779520.0, + "grad_norm": 1.831691866077207, + "language_loss": 0.75774682, + "learning_rate": 2.728834463508826e-06, + "loss": 0.83514905, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16357422, + "step": 6650, + "time_per_iteration": 2.6374714374542236 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01257782, + "epoch": 0.39987975349466404, + "flos": 21950864979840.0, + "grad_norm": 1.4608995971033776, + "language_loss": 0.7199676, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79724216, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14831543, + "step": 6651, + "time_per_iteration": 2.5789706707000732 + }, + { + "auxiliary_loss_clip": 0.06457064, + "auxiliary_loss_mlp": 0.01269592, + "balance_loss_clip": 0.06283179, + "balance_loss_mlp": 0.01255245, + "epoch": 0.39993987674733206, + "flos": 20710707425280.0, + "grad_norm": 1.930350074981486, + "language_loss": 0.73724478, + "learning_rate": 2.728109046945403e-06, + "loss": 0.8145113, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14331055, + "step": 6652, + "time_per_iteration": 3.9592838287353516 + }, + { + "auxiliary_loss_clip": 0.06347093, + "auxiliary_loss_mlp": 0.01255075, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01252878, + "epoch": 0.4, + "flos": 61543566397440.0, + "grad_norm": 0.8159851457251004, + "language_loss": 0.60542929, + "learning_rate": 2.727746297241862e-06, + "loss": 0.68145096, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.02201843, + "step": 6653, + "time_per_iteration": 3.0700466632843018 + }, + { + "auxiliary_loss_clip": 0.06454087, + "auxiliary_loss_mlp": 0.01272182, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.01257698, + "epoch": 0.400060123252668, + "flos": 14507655592320.0, + "grad_norm": 1.9278074838902122, + "language_loss": 0.66929328, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.74655592, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14477539, + "step": 6654, + "time_per_iteration": 2.5292413234710693 + }, + { + "auxiliary_loss_clip": 0.06457023, + "auxiliary_loss_mlp": 0.01271182, + "balance_loss_clip": 0.06287654, + "balance_loss_mlp": 0.01257396, + "epoch": 0.40012024650533595, + "flos": 19098383961600.0, + "grad_norm": 1.998304088554008, + "language_loss": 0.90550762, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98278964, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13775635, + "step": 6655, + "time_per_iteration": 2.529496192932129 + }, + { + "auxiliary_loss_clip": 0.06450539, + "auxiliary_loss_mlp": 0.012675, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01254899, + "epoch": 0.4001803697580039, + "flos": 29358673217280.0, + "grad_norm": 1.6559902316252946, + "language_loss": 0.73729336, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.81447375, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1260376, + "step": 6656, + "time_per_iteration": 4.062687158584595 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 142978040, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.577520215501046e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/training_args.bin b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-6656/zero_to_fp32.py b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-6656/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/added_tokens.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/config.json new file mode 100644 index 0000000000000000000000000000000000000000..da3b0c65c0ef1d3a1c68ffdd7565996d4dd85a33 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/generation_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/latest b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c2bd85bc7c6d33e172c9c565d8517bade9572ca --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/latest @@ -0,0 +1 @@ +global_step9984 \ No newline at end of file diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8724b50dd2e7d6319abb32d497aac60a5ecbab81 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae11d99abd25ebe8362bc3f1ce339699e0b94d95a94a09ba9a9b5272794acb7 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..67c34113efd91c1abcbdc4a3757455d5e5d6ca44 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb46f1e2c7e1436a55dcca4188ad0d96d5d34eec77b40e74159f51dd8c0fb2c1 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cd227c7092a58b41b99dd4f8ac6884abc0065d5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe999d11ffa7d055ecac36c9327a1bb7628bbdf23563c7517491ab40c6fb70c +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_0.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9231f69f5fd461899867106a669ce247e70c72c2 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f23d807f0e704f4ca79670a6631cbff43189cf7f8ff4e1fc0a4330e636a798 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_1.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..19fe2dcc766f192ea5de79cec4dcff17172a10f7 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37f92f6aea5386e84d2d64a1a25d6ef96a10b3bbbfe63627981604c8934076 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_2.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfe492519c6b79b07a8d68b98c5f3d0c073667aa --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667ebf727735115f00a6bdbe090344e9846c726d11bb555cdc201c415f27ad85 +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_3.pth b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d42ad13e30851fdbd1d8801738a4106a9ce8b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d306f8c511cba8a225e3b723c5fa79d8a6ecc922f834da914ff0780c78b1fc +size 14960 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer.model b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/trainer_state.json b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e38467d3699cd5765e88df730bfbfc95a1609a85 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/trainer_state.json @@ -0,0 +1,169761 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002705546370058, + "eval_steps": 500, + "global_step": 9984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + }, + { + "auxiliary_loss_clip": 0.06559211, + "auxiliary_loss_mlp": 0.01288822, + "balance_loss_clip": 0.06314486, + "balance_loss_mlp": 0.01265338, + "epoch": 0.20015030813166992, + "flos": 28119186495360.0, + "grad_norm": 1.7739956363125764, + "language_loss": 0.6938678, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.77234817, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23474121, + "step": 3329, + "time_per_iteration": 2.790318489074707 + }, + { + "auxiliary_loss_clip": 0.06562928, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 0.06310034, + "balance_loss_mlp": 0.01263396, + "epoch": 0.2002104313843379, + "flos": 26074250801280.0, + "grad_norm": 1.6222638892170962, + "language_loss": 0.81793886, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.896456, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.25415039, + "step": 3330, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.06561245, + "auxiliary_loss_mlp": 0.01293061, + "balance_loss_clip": 0.06310615, + "balance_loss_mlp": 0.01268874, + "epoch": 0.20027055463700585, + "flos": 22973332826880.0, + "grad_norm": 3.6220429921180877, + "language_loss": 0.7808395, + "learning_rate": 3.703502390349417e-06, + "loss": 0.85938263, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.24194336, + "step": 3331, + "time_per_iteration": 4.07051157951355 + }, + { + "auxiliary_loss_clip": 0.06564473, + "auxiliary_loss_mlp": 0.01290798, + "balance_loss_clip": 0.06310149, + "balance_loss_mlp": 0.01266014, + "epoch": 0.20033067788967382, + "flos": 17171433216000.0, + "grad_norm": 1.7477664730796658, + "language_loss": 0.79863441, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24780273, + "step": 3332, + "time_per_iteration": 2.5321452617645264 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01253551, + "epoch": 0.2003908011423418, + "flos": 60842476085760.0, + "grad_norm": 0.9021189232739572, + "language_loss": 0.61913729, + "learning_rate": 3.703094147020776e-06, + "loss": 0.69584543, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08105469, + "step": 3333, + "time_per_iteration": 4.713933706283569 + }, + { + "auxiliary_loss_clip": 0.06552575, + "auxiliary_loss_mlp": 0.0128469, + "balance_loss_clip": 0.06299093, + "balance_loss_mlp": 0.0126123, + "epoch": 0.20045092439500978, + "flos": 24212987256960.0, + "grad_norm": 1.8847951547254278, + "language_loss": 0.82181144, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.90018404, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.23461914, + "step": 3334, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.0654801, + "auxiliary_loss_mlp": 0.01282898, + "balance_loss_clip": 0.06293298, + "balance_loss_mlp": 0.01256874, + "epoch": 0.20051104764767774, + "flos": 29395290251520.0, + "grad_norm": 2.256626356817437, + "language_loss": 0.7536357, + "learning_rate": 3.702685645366134e-06, + "loss": 0.83194482, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26049805, + "step": 3335, + "time_per_iteration": 2.5860390663146973 + }, + { + "auxiliary_loss_clip": 0.06552432, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06300009, + "balance_loss_mlp": 0.0125632, + "epoch": 0.2005711709003457, + "flos": 23520575842560.0, + "grad_norm": 6.047041669068293, + "language_loss": 0.80452931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.88285786, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.24108887, + "step": 3336, + "time_per_iteration": 2.662705898284912 + }, + { + "auxiliary_loss_clip": 0.06555694, + "auxiliary_loss_mlp": 0.01283807, + "balance_loss_clip": 0.06297083, + "balance_loss_mlp": 0.01258045, + "epoch": 0.20063129415301367, + "flos": 22529106806400.0, + "grad_norm": 1.88296777376126, + "language_loss": 0.78839928, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.86679429, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25756836, + "step": 3337, + "time_per_iteration": 2.541239023208618 + }, + { + "auxiliary_loss_clip": 0.06548997, + "auxiliary_loss_mlp": 0.01282446, + "balance_loss_clip": 0.06296889, + "balance_loss_mlp": 0.01258389, + "epoch": 0.20069141740568164, + "flos": 25965405947520.0, + "grad_norm": 2.093788516709133, + "language_loss": 0.69608915, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77440357, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.24072266, + "step": 3338, + "time_per_iteration": 4.011674165725708 + }, + { + "auxiliary_loss_clip": 0.06553162, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06298589, + "balance_loss_mlp": 0.01261703, + "epoch": 0.2007515406583496, + "flos": 24797560066560.0, + "grad_norm": 2.5614555335728375, + "language_loss": 0.70278549, + "learning_rate": 3.701867867326735e-06, + "loss": 0.78117526, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3339, + "time_per_iteration": 4.021097183227539 + }, + { + "auxiliary_loss_clip": 0.06558233, + "auxiliary_loss_mlp": 0.01288707, + "balance_loss_clip": 0.06300814, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2008116639110176, + "flos": 37934746606080.0, + "grad_norm": 2.4782874615073265, + "language_loss": 0.67773008, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.75619948, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.24291992, + "step": 3340, + "time_per_iteration": 2.741156816482544 + }, + { + "auxiliary_loss_clip": 0.06555235, + "auxiliary_loss_mlp": 0.01284766, + "balance_loss_clip": 0.06297287, + "balance_loss_mlp": 0.01258122, + "epoch": 0.20087178716368556, + "flos": 20746779408000.0, + "grad_norm": 2.067820693237163, + "language_loss": 0.74698186, + "learning_rate": 3.701458591066019e-06, + "loss": 0.82538182, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26623535, + "step": 3341, + "time_per_iteration": 2.564480781555176 + }, + { + "auxiliary_loss_clip": 0.06547385, + "auxiliary_loss_mlp": 0.01280207, + "balance_loss_clip": 0.06298249, + "balance_loss_mlp": 0.01256532, + "epoch": 0.20093191041635353, + "flos": 23849122901760.0, + "grad_norm": 1.820842392943319, + "language_loss": 0.7265389, + "learning_rate": 3.70125385615256e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.23657227, + "step": 3342, + "time_per_iteration": 2.5828449726104736 + }, + { + "auxiliary_loss_clip": 0.065575, + "auxiliary_loss_mlp": 0.01288338, + "balance_loss_clip": 0.06302083, + "balance_loss_mlp": 0.01264174, + "epoch": 0.2009920336690215, + "flos": 21797395027200.0, + "grad_norm": 1.987813203177408, + "language_loss": 0.73357129, + "learning_rate": 3.701049056727384e-06, + "loss": 0.81202972, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.24169922, + "step": 3343, + "time_per_iteration": 2.547868490219116 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.012954, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01269865, + "epoch": 0.20105215692168946, + "flos": 26366390461440.0, + "grad_norm": 2.115251797604865, + "language_loss": 0.81433517, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.89283836, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25524902, + "step": 3344, + "time_per_iteration": 2.6067302227020264 + }, + { + "auxiliary_loss_clip": 0.06556335, + "auxiliary_loss_mlp": 0.01281302, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01258426, + "epoch": 0.20111228017435742, + "flos": 18813288044160.0, + "grad_norm": 4.0042293338609385, + "language_loss": 0.84618676, + "learning_rate": 3.700639264372948e-06, + "loss": 0.92456311, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.2286377, + "step": 3345, + "time_per_iteration": 2.554713726043701 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01295407, + "balance_loss_clip": 0.0629687, + "balance_loss_mlp": 0.01272697, + "epoch": 0.20117240342702541, + "flos": 19981301633280.0, + "grad_norm": 2.1108086187654025, + "language_loss": 0.68437809, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.76276147, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.22705078, + "step": 3346, + "time_per_iteration": 2.5748066902160645 + }, + { + "auxiliary_loss_clip": 0.06553109, + "auxiliary_loss_mlp": 0.01283392, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01258739, + "epoch": 0.20123252667969338, + "flos": 23148368006400.0, + "grad_norm": 1.9426154174848713, + "language_loss": 0.73952061, + "learning_rate": 3.70022921406487e-06, + "loss": 0.81788564, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24682617, + "step": 3347, + "time_per_iteration": 2.5353236198425293 + }, + { + "auxiliary_loss_clip": 0.06546339, + "auxiliary_loss_mlp": 0.01287781, + "balance_loss_clip": 0.0629671, + "balance_loss_mlp": 0.01263487, + "epoch": 0.20129264993236134, + "flos": 23228352328320.0, + "grad_norm": 1.557023243146552, + "language_loss": 0.87284029, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95118147, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.24316406, + "step": 3348, + "time_per_iteration": 2.5943105220794678 + }, + { + "auxiliary_loss_clip": 0.06550047, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01269034, + "epoch": 0.2013527731850293, + "flos": 21877882473600.0, + "grad_norm": 1.6966939322149492, + "language_loss": 0.71502012, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7934612, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25061035, + "step": 3349, + "time_per_iteration": 2.5671966075897217 + }, + { + "auxiliary_loss_clip": 0.06552055, + "auxiliary_loss_mlp": 0.01290022, + "balance_loss_clip": 0.06301533, + "balance_loss_mlp": 0.01263486, + "epoch": 0.20141289643769728, + "flos": 18046636312320.0, + "grad_norm": 1.7460886195435679, + "language_loss": 0.72473693, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80315775, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26501465, + "step": 3350, + "time_per_iteration": 2.558486223220825 + }, + { + "auxiliary_loss_clip": 0.06561922, + "auxiliary_loss_mlp": 0.01282894, + "balance_loss_clip": 0.0630732, + "balance_loss_mlp": 0.01256728, + "epoch": 0.20147301969036524, + "flos": 23958219317760.0, + "grad_norm": 2.4285458765514623, + "language_loss": 0.76773715, + "learning_rate": 3.69940833983661e-06, + "loss": 0.84618533, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26135254, + "step": 3351, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.0657143, + "auxiliary_loss_mlp": 0.01289916, + "balance_loss_clip": 0.06311074, + "balance_loss_mlp": 0.01260638, + "epoch": 0.2015331429430332, + "flos": 25594749411840.0, + "grad_norm": 1.6280311670130643, + "language_loss": 0.81367022, + "learning_rate": 3.699202960155748e-06, + "loss": 0.89228368, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.29248047, + "step": 3352, + "time_per_iteration": 2.603740692138672 + }, + { + "auxiliary_loss_clip": 0.06557955, + "auxiliary_loss_mlp": 0.01286544, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01258458, + "epoch": 0.2015932661957012, + "flos": 26732351168640.0, + "grad_norm": 2.001275007108419, + "language_loss": 0.81670761, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.89515263, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28063965, + "step": 3353, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.06555627, + "auxiliary_loss_mlp": 0.01278407, + "balance_loss_clip": 0.0630668, + "balance_loss_mlp": 0.01253206, + "epoch": 0.20165338944836916, + "flos": 15638632876800.0, + "grad_norm": 1.8574199324884482, + "language_loss": 0.9049592, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.98329961, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.2520752, + "step": 3354, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.06439115, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.06305242, + "balance_loss_mlp": 0.01268129, + "epoch": 0.20171351270103713, + "flos": 57929926089600.0, + "grad_norm": 0.8202677442032412, + "language_loss": 0.55840385, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.63554633, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07012939, + "step": 3355, + "time_per_iteration": 3.118603229522705 + }, + { + "auxiliary_loss_clip": 0.06557105, + "auxiliary_loss_mlp": 0.01281149, + "balance_loss_clip": 0.06309459, + "balance_loss_mlp": 0.01257474, + "epoch": 0.2017736359537051, + "flos": 20820768163200.0, + "grad_norm": 1.5861142309185163, + "language_loss": 0.84845644, + "learning_rate": 3.698380797170751e-06, + "loss": 0.92683893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.23669434, + "step": 3356, + "time_per_iteration": 2.5407068729400635 + }, + { + "auxiliary_loss_clip": 0.06578876, + "auxiliary_loss_mlp": 0.01283859, + "balance_loss_clip": 0.06314196, + "balance_loss_mlp": 0.01255344, + "epoch": 0.20183375920637306, + "flos": 17097696023040.0, + "grad_norm": 3.7689574240726147, + "language_loss": 0.71072245, + "learning_rate": 3.698175095398085e-06, + "loss": 0.78934979, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.28515625, + "step": 3357, + "time_per_iteration": 2.4921233654022217 + }, + { + "auxiliary_loss_clip": 0.065685, + "auxiliary_loss_mlp": 0.01288812, + "balance_loss_clip": 0.0631017, + "balance_loss_mlp": 0.01263206, + "epoch": 0.20189388245904102, + "flos": 18667323031680.0, + "grad_norm": 2.064581487792546, + "language_loss": 0.72707927, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.80565238, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25585938, + "step": 3358, + "time_per_iteration": 2.531280040740967 + }, + { + "auxiliary_loss_clip": 0.06550319, + "auxiliary_loss_mlp": 0.0128707, + "balance_loss_clip": 0.06304348, + "balance_loss_mlp": 0.01263633, + "epoch": 0.20195400571170902, + "flos": 16802705324160.0, + "grad_norm": 1.761827203655194, + "language_loss": 0.83542818, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91380209, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.234375, + "step": 3359, + "time_per_iteration": 2.5004122257232666 + }, + { + "auxiliary_loss_clip": 0.06415485, + "auxiliary_loss_mlp": 0.01275385, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01269109, + "epoch": 0.20201412896437698, + "flos": 67192792669440.0, + "grad_norm": 0.7763137973079639, + "language_loss": 0.58718604, + "learning_rate": 3.697557603741482e-06, + "loss": 0.66409475, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.06274414, + "step": 3360, + "time_per_iteration": 3.202280282974243 + }, + { + "auxiliary_loss_clip": 0.06567518, + "auxiliary_loss_mlp": 0.01281863, + "balance_loss_clip": 0.06312253, + "balance_loss_mlp": 0.01257055, + "epoch": 0.20207425221704495, + "flos": 21331477998720.0, + "grad_norm": 2.7701451368403767, + "language_loss": 0.63371557, + "learning_rate": 3.697351644435763e-06, + "loss": 0.71220934, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24841309, + "step": 3361, + "time_per_iteration": 2.591505527496338 + }, + { + "auxiliary_loss_clip": 0.06556661, + "auxiliary_loss_mlp": 0.01280295, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01257049, + "epoch": 0.2021343754697129, + "flos": 22533509145600.0, + "grad_norm": 1.837331842396403, + "language_loss": 0.76495373, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23254395, + "step": 3362, + "time_per_iteration": 2.5748798847198486 + }, + { + "auxiliary_loss_clip": 0.06552652, + "auxiliary_loss_mlp": 0.01281781, + "balance_loss_clip": 0.06300291, + "balance_loss_mlp": 0.01257379, + "epoch": 0.20219449872238088, + "flos": 19068852597120.0, + "grad_norm": 1.6506097934595576, + "language_loss": 0.77716577, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85551012, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.24365234, + "step": 3363, + "time_per_iteration": 2.5682361125946045 + }, + { + "auxiliary_loss_clip": 0.06556462, + "auxiliary_loss_mlp": 0.01285372, + "balance_loss_clip": 0.06303493, + "balance_loss_mlp": 0.01262198, + "epoch": 0.20225462197504884, + "flos": 24723864800640.0, + "grad_norm": 1.5662342973814338, + "language_loss": 0.75767177, + "learning_rate": 3.696733380367391e-06, + "loss": 0.83609009, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23181152, + "step": 3364, + "time_per_iteration": 2.620352029800415 + }, + { + "auxiliary_loss_clip": 0.06564072, + "auxiliary_loss_mlp": 0.01282858, + "balance_loss_clip": 0.06306748, + "balance_loss_mlp": 0.01259374, + "epoch": 0.2023147452277168, + "flos": 22024895662080.0, + "grad_norm": 2.684464985384485, + "language_loss": 0.72232616, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.80079544, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23474121, + "step": 3365, + "time_per_iteration": 2.6884727478027344 + }, + { + "auxiliary_loss_clip": 0.06551654, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.0629961, + "balance_loss_mlp": 0.01256336, + "epoch": 0.2023748684803848, + "flos": 17750555510400.0, + "grad_norm": 1.8865204005259733, + "language_loss": 0.86329257, + "learning_rate": 3.696320882607286e-06, + "loss": 0.94160658, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.23425293, + "step": 3366, + "time_per_iteration": 2.541398525238037 + }, + { + "auxiliary_loss_clip": 0.06552443, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01254698, + "epoch": 0.20243499173305277, + "flos": 31146912328320.0, + "grad_norm": 1.6069123477498997, + "language_loss": 0.69763649, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77593338, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.22558594, + "step": 3367, + "time_per_iteration": 2.674370527267456 + }, + { + "auxiliary_loss_clip": 0.06562914, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 0.06300482, + "balance_loss_mlp": 0.01257777, + "epoch": 0.20249511498572073, + "flos": 33847726256640.0, + "grad_norm": 1.76028679400595, + "language_loss": 0.69152057, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.27819824, + "step": 3368, + "time_per_iteration": 2.6662635803222656 + }, + { + "auxiliary_loss_clip": 0.06551345, + "auxiliary_loss_mlp": 0.0128738, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263657, + "epoch": 0.2025552382383887, + "flos": 21222088093440.0, + "grad_norm": 1.819755421756695, + "language_loss": 0.78064144, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.8590287, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23718262, + "step": 3369, + "time_per_iteration": 2.5846660137176514 + }, + { + "auxiliary_loss_clip": 0.06560668, + "auxiliary_loss_mlp": 0.01282514, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01257492, + "epoch": 0.20261536149105666, + "flos": 14652614355840.0, + "grad_norm": 3.2010156823618687, + "language_loss": 0.66533637, + "learning_rate": 3.695495115253795e-06, + "loss": 0.74376816, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25024414, + "step": 3370, + "time_per_iteration": 3.953664541244507 + }, + { + "auxiliary_loss_clip": 0.06420556, + "auxiliary_loss_mlp": 0.01256354, + "balance_loss_clip": 0.06284036, + "balance_loss_mlp": 0.01249797, + "epoch": 0.20267548474372463, + "flos": 66803380018560.0, + "grad_norm": 0.6606134365812599, + "language_loss": 0.58273321, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.65950233, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.06567383, + "step": 3371, + "time_per_iteration": 3.2517025470733643 + }, + { + "auxiliary_loss_clip": 0.06555597, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 0.06300298, + "balance_loss_mlp": 0.01257944, + "epoch": 0.2027356079963926, + "flos": 24687667036800.0, + "grad_norm": 1.6416079718190109, + "language_loss": 0.92020303, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99859619, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.25769043, + "step": 3372, + "time_per_iteration": 4.108370065689087 + }, + { + "auxiliary_loss_clip": 0.06555616, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 0.06298956, + "balance_loss_mlp": 0.01258672, + "epoch": 0.20279573124906058, + "flos": 26399443697280.0, + "grad_norm": 1.769817073167301, + "language_loss": 0.79293168, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87131846, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.24414062, + "step": 3373, + "time_per_iteration": 2.6076717376708984 + }, + { + "auxiliary_loss_clip": 0.06543471, + "auxiliary_loss_mlp": 0.01280674, + "balance_loss_clip": 0.06296648, + "balance_loss_mlp": 0.01256343, + "epoch": 0.20285585450172855, + "flos": 33808006621440.0, + "grad_norm": 3.4143342380796255, + "language_loss": 0.72364163, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.8018831, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.24328613, + "step": 3374, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.06419748, + "auxiliary_loss_mlp": 0.01258876, + "balance_loss_clip": 0.06284177, + "balance_loss_mlp": 0.01252266, + "epoch": 0.20291597775439651, + "flos": 71185768410240.0, + "grad_norm": 1.0120800133799934, + "language_loss": 0.62520474, + "learning_rate": 3.694461459520516e-06, + "loss": 0.70199096, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06622314, + "step": 3375, + "time_per_iteration": 3.159513473510742 + }, + { + "auxiliary_loss_clip": 0.06548455, + "auxiliary_loss_mlp": 0.01283408, + "balance_loss_clip": 0.06294296, + "balance_loss_mlp": 0.0125891, + "epoch": 0.20297610100706448, + "flos": 19499368475520.0, + "grad_norm": 1.6178559610323104, + "language_loss": 0.82908762, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.90740621, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24499512, + "step": 3376, + "time_per_iteration": 2.5366275310516357 + }, + { + "auxiliary_loss_clip": 0.06553418, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.0125854, + "epoch": 0.20303622425973245, + "flos": 25050944413440.0, + "grad_norm": 2.015544075965587, + "language_loss": 0.82464767, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90302449, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25720215, + "step": 3377, + "time_per_iteration": 2.579468250274658 + }, + { + "auxiliary_loss_clip": 0.06554671, + "auxiliary_loss_mlp": 0.01287763, + "balance_loss_clip": 0.06300091, + "balance_loss_mlp": 0.01261453, + "epoch": 0.2030963475124004, + "flos": 21986266129920.0, + "grad_norm": 1.7361857812490578, + "language_loss": 0.7745406, + "learning_rate": 3.69384049496805e-06, + "loss": 0.85296494, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.26306152, + "step": 3378, + "time_per_iteration": 3.999164342880249 + }, + { + "auxiliary_loss_clip": 0.06557525, + "auxiliary_loss_mlp": 0.01285912, + "balance_loss_clip": 0.06298093, + "balance_loss_mlp": 0.01259423, + "epoch": 0.2031564707650684, + "flos": 19506496072320.0, + "grad_norm": 1.7814270376711854, + "language_loss": 0.80552137, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.88395572, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.26525879, + "step": 3379, + "time_per_iteration": 3.94376277923584 + }, + { + "auxiliary_loss_clip": 0.06547987, + "auxiliary_loss_mlp": 0.01283987, + "balance_loss_clip": 0.06298195, + "balance_loss_mlp": 0.01259799, + "epoch": 0.20321659401773637, + "flos": 22753630621440.0, + "grad_norm": 1.8399421212903948, + "language_loss": 0.87578034, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.95410013, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24206543, + "step": 3380, + "time_per_iteration": 2.5826356410980225 + }, + { + "auxiliary_loss_clip": 0.06554954, + "auxiliary_loss_mlp": 0.01300173, + "balance_loss_clip": 0.06299303, + "balance_loss_mlp": 0.01274817, + "epoch": 0.20327671727040433, + "flos": 22462455283200.0, + "grad_norm": 2.147675917051705, + "language_loss": 0.75801265, + "learning_rate": 3.693218952340186e-06, + "loss": 0.83656389, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.2532959, + "step": 3381, + "time_per_iteration": 2.580035924911499 + }, + { + "auxiliary_loss_clip": 0.06559204, + "auxiliary_loss_mlp": 0.0128659, + "balance_loss_clip": 0.06297147, + "balance_loss_mlp": 0.01260198, + "epoch": 0.2033368405230723, + "flos": 19540807119360.0, + "grad_norm": 1.8225171591496117, + "language_loss": 0.79701936, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.87547731, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.26391602, + "step": 3382, + "time_per_iteration": 2.743842601776123 + }, + { + "auxiliary_loss_clip": 0.06551235, + "auxiliary_loss_mlp": 0.01283934, + "balance_loss_clip": 0.06293041, + "balance_loss_mlp": 0.01258745, + "epoch": 0.20339696377574026, + "flos": 13814489491200.0, + "grad_norm": 1.712325191768153, + "language_loss": 0.80308962, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8814413, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25195312, + "step": 3383, + "time_per_iteration": 2.6428067684173584 + }, + { + "auxiliary_loss_clip": 0.06548008, + "auxiliary_loss_mlp": 0.01285433, + "balance_loss_clip": 0.06295451, + "balance_loss_mlp": 0.01259541, + "epoch": 0.20345708702840823, + "flos": 20345627185920.0, + "grad_norm": 1.7809184522678074, + "language_loss": 0.75199848, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.83033288, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.25891113, + "step": 3384, + "time_per_iteration": 2.5601112842559814 + }, + { + "auxiliary_loss_clip": 0.06573269, + "auxiliary_loss_mlp": 0.01282943, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01256229, + "epoch": 0.2035172102810762, + "flos": 20339254275840.0, + "grad_norm": 2.5841350087074852, + "language_loss": 0.77226508, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.85082722, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.26745605, + "step": 3385, + "time_per_iteration": 2.527583122253418 + }, + { + "auxiliary_loss_clip": 0.06553946, + "auxiliary_loss_mlp": 0.01288968, + "balance_loss_clip": 0.06300423, + "balance_loss_mlp": 0.01263934, + "epoch": 0.2035773335337442, + "flos": 23337658379520.0, + "grad_norm": 1.6683994830989402, + "language_loss": 0.70000219, + "learning_rate": 3.692181763924639e-06, + "loss": 0.7784313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25048828, + "step": 3386, + "time_per_iteration": 2.583940029144287 + }, + { + "auxiliary_loss_clip": 0.06550556, + "auxiliary_loss_mlp": 0.01289862, + "balance_loss_clip": 0.0629431, + "balance_loss_mlp": 0.01265495, + "epoch": 0.20363745678641215, + "flos": 28337924378880.0, + "grad_norm": 1.2744067098921972, + "language_loss": 0.81998229, + "learning_rate": 3.691974133706947e-06, + "loss": 0.89838648, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.24365234, + "step": 3387, + "time_per_iteration": 2.624765634536743 + }, + { + "auxiliary_loss_clip": 0.06543861, + "auxiliary_loss_mlp": 0.01285642, + "balance_loss_clip": 0.06297304, + "balance_loss_mlp": 0.01261705, + "epoch": 0.20369758003908012, + "flos": 18921503992320.0, + "grad_norm": 2.338231566069276, + "language_loss": 0.80333674, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.88163185, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23925781, + "step": 3388, + "time_per_iteration": 2.565795421600342 + }, + { + "auxiliary_loss_clip": 0.06553982, + "auxiliary_loss_mlp": 0.01281213, + "balance_loss_clip": 0.06297579, + "balance_loss_mlp": 0.0125693, + "epoch": 0.20375770329174808, + "flos": 19212218133120.0, + "grad_norm": 1.8814817968190891, + "language_loss": 0.72894287, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.80729485, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.24279785, + "step": 3389, + "time_per_iteration": 2.5263590812683105 + }, + { + "auxiliary_loss_clip": 0.06544612, + "auxiliary_loss_mlp": 0.01286594, + "balance_loss_clip": 0.06296231, + "balance_loss_mlp": 0.01262204, + "epoch": 0.20381782654441605, + "flos": 19397106167040.0, + "grad_norm": 2.5524619095037626, + "language_loss": 0.88214552, + "learning_rate": 3.691350858126404e-06, + "loss": 0.96045768, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3390, + "time_per_iteration": 2.5450997352600098 + }, + { + "auxiliary_loss_clip": 0.06546676, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06297011, + "balance_loss_mlp": 0.01260683, + "epoch": 0.203877949797084, + "flos": 24834764079360.0, + "grad_norm": 2.430374095532116, + "language_loss": 0.71690643, + "learning_rate": 3.691142971316662e-06, + "loss": 0.79521036, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 3391, + "time_per_iteration": 2.5983424186706543 + }, + { + "auxiliary_loss_clip": 0.06548478, + "auxiliary_loss_mlp": 0.01287319, + "balance_loss_clip": 0.06300271, + "balance_loss_mlp": 0.01263799, + "epoch": 0.20393807304975198, + "flos": 18009432299520.0, + "grad_norm": 3.271459971820983, + "language_loss": 0.87029123, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.94864917, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2355957, + "step": 3392, + "time_per_iteration": 2.5094432830810547 + }, + { + "auxiliary_loss_clip": 0.06555735, + "auxiliary_loss_mlp": 0.01288889, + "balance_loss_clip": 0.06302007, + "balance_loss_mlp": 0.0126432, + "epoch": 0.20399819630241997, + "flos": 24213867724800.0, + "grad_norm": 1.4298747009925739, + "language_loss": 0.8143822, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8928284, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 3393, + "time_per_iteration": 2.674898147583008 + }, + { + "auxiliary_loss_clip": 0.06555712, + "auxiliary_loss_mlp": 0.01283361, + "balance_loss_clip": 0.0630876, + "balance_loss_mlp": 0.01260747, + "epoch": 0.20405831955508794, + "flos": 20783396442240.0, + "grad_norm": 2.2973425083766377, + "language_loss": 0.87181509, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.9502058, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.22619629, + "step": 3394, + "time_per_iteration": 2.5489470958709717 + }, + { + "auxiliary_loss_clip": 0.06548424, + "auxiliary_loss_mlp": 0.0128548, + "balance_loss_clip": 0.06299029, + "balance_loss_mlp": 0.01262448, + "epoch": 0.2041184428077559, + "flos": 15492332448000.0, + "grad_norm": 2.1306464149991027, + "language_loss": 0.8456347, + "learning_rate": 3.69031078287345e-06, + "loss": 0.92397374, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23034668, + "step": 3395, + "time_per_iteration": 2.5297558307647705 + }, + { + "auxiliary_loss_clip": 0.06554371, + "auxiliary_loss_mlp": 0.01288203, + "balance_loss_clip": 0.06299008, + "balance_loss_mlp": 0.0126448, + "epoch": 0.20417856606042387, + "flos": 15592582258560.0, + "grad_norm": 1.9297262637725432, + "language_loss": 0.84104818, + "learning_rate": 3.690102575501033e-06, + "loss": 0.91947389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23730469, + "step": 3396, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0654766, + "auxiliary_loss_mlp": 0.01296047, + "balance_loss_clip": 0.06301443, + "balance_loss_mlp": 0.01272706, + "epoch": 0.20423868931309183, + "flos": 24286137471360.0, + "grad_norm": 2.084884773893835, + "language_loss": 0.7751056, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.85354269, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2331543, + "step": 3397, + "time_per_iteration": 2.5621836185455322 + }, + { + "auxiliary_loss_clip": 0.06547033, + "auxiliary_loss_mlp": 0.01291146, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01268067, + "epoch": 0.2042988125657598, + "flos": 18619176061440.0, + "grad_norm": 3.401004534017878, + "language_loss": 0.88746947, + "learning_rate": 3.689685968497518e-06, + "loss": 0.96585131, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23083496, + "step": 3398, + "time_per_iteration": 2.4821889400482178 + }, + { + "auxiliary_loss_clip": 0.06555858, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 0.06305312, + "balance_loss_mlp": 0.01263361, + "epoch": 0.2043589358184278, + "flos": 17855836565760.0, + "grad_norm": 2.044777021305177, + "language_loss": 0.79053116, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8689605, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23706055, + "step": 3399, + "time_per_iteration": 2.5007028579711914 + }, + { + "auxiliary_loss_clip": 0.06554085, + "auxiliary_loss_mlp": 0.01288353, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01264678, + "epoch": 0.20441905907109575, + "flos": 21441832225920.0, + "grad_norm": 3.4484144890832327, + "language_loss": 0.77263522, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85105962, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23669434, + "step": 3400, + "time_per_iteration": 2.524930715560913 + }, + { + "auxiliary_loss_clip": 0.06546277, + "auxiliary_loss_mlp": 0.0128369, + "balance_loss_clip": 0.0630067, + "balance_loss_mlp": 0.01262423, + "epoch": 0.20447918232376372, + "flos": 27714847818240.0, + "grad_norm": 1.566944783994086, + "language_loss": 0.7976017, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87590134, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21264648, + "step": 3401, + "time_per_iteration": 2.5868172645568848 + }, + { + "auxiliary_loss_clip": 0.06547564, + "auxiliary_loss_mlp": 0.01287222, + "balance_loss_clip": 0.06297088, + "balance_loss_mlp": 0.01263833, + "epoch": 0.20453930557643168, + "flos": 30533017789440.0, + "grad_norm": 1.6743436404675067, + "language_loss": 0.69998658, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7783345, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23400879, + "step": 3402, + "time_per_iteration": 2.664961099624634 + }, + { + "auxiliary_loss_clip": 0.06561718, + "auxiliary_loss_mlp": 0.01282309, + "balance_loss_clip": 0.06309628, + "balance_loss_mlp": 0.01259981, + "epoch": 0.20459942882909965, + "flos": 18993480249600.0, + "grad_norm": 2.0207590642868736, + "language_loss": 0.82498461, + "learning_rate": 3.688643329848496e-06, + "loss": 0.90342486, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2232666, + "step": 3403, + "time_per_iteration": 2.527240514755249 + }, + { + "auxiliary_loss_clip": 0.0655287, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 0.06304024, + "balance_loss_mlp": 0.01260256, + "epoch": 0.20465955208176762, + "flos": 20345207915520.0, + "grad_norm": 1.870475930372837, + "language_loss": 0.83792305, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.91628289, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.22900391, + "step": 3404, + "time_per_iteration": 2.5108580589294434 + }, + { + "auxiliary_loss_clip": 0.06555478, + "auxiliary_loss_mlp": 0.01280254, + "balance_loss_clip": 0.06302839, + "balance_loss_mlp": 0.0125671, + "epoch": 0.20471967533443558, + "flos": 21257615024640.0, + "grad_norm": 1.9668153962924477, + "language_loss": 0.86568373, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.94404107, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2355957, + "step": 3405, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.06551084, + "auxiliary_loss_mlp": 0.0128024, + "balance_loss_clip": 0.06302287, + "balance_loss_mlp": 0.01257256, + "epoch": 0.20477979858710357, + "flos": 14506775124480.0, + "grad_norm": 2.695451734790842, + "language_loss": 0.85318458, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93149781, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.22973633, + "step": 3406, + "time_per_iteration": 2.490360975265503 + }, + { + "auxiliary_loss_clip": 0.06551544, + "auxiliary_loss_mlp": 0.01279954, + "balance_loss_clip": 0.06302837, + "balance_loss_mlp": 0.01256768, + "epoch": 0.20483992183977154, + "flos": 11405018609280.0, + "grad_norm": 8.923539759508978, + "language_loss": 0.69000643, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.76832145, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23193359, + "step": 3407, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06549555, + "auxiliary_loss_mlp": 0.01280964, + "balance_loss_clip": 0.06303824, + "balance_loss_mlp": 0.01258374, + "epoch": 0.2049000450924395, + "flos": 19065917704320.0, + "grad_norm": 2.112423962078429, + "language_loss": 0.85367447, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93197966, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.22583008, + "step": 3408, + "time_per_iteration": 2.5491087436676025 + }, + { + "auxiliary_loss_clip": 0.06564584, + "auxiliary_loss_mlp": 0.0128728, + "balance_loss_clip": 0.06310433, + "balance_loss_mlp": 0.0126314, + "epoch": 0.20496016834510747, + "flos": 14579799557760.0, + "grad_norm": 2.4221013711544876, + "language_loss": 0.65169537, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.730214, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3409, + "time_per_iteration": 2.5570828914642334 + }, + { + "auxiliary_loss_clip": 0.06553619, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06302843, + "balance_loss_mlp": 0.01259029, + "epoch": 0.20502029159777543, + "flos": 22133069683200.0, + "grad_norm": 1.5677004994493864, + "language_loss": 0.81331646, + "learning_rate": 3.687180946553745e-06, + "loss": 0.89167136, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.22851562, + "step": 3410, + "time_per_iteration": 3.9941341876983643 + }, + { + "auxiliary_loss_clip": 0.06562116, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 0.06316169, + "balance_loss_mlp": 0.01256252, + "epoch": 0.2050804148504434, + "flos": 25373873249280.0, + "grad_norm": 2.231323409005704, + "language_loss": 0.76898587, + "learning_rate": 3.686971778678803e-06, + "loss": 0.84738749, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21801758, + "step": 3411, + "time_per_iteration": 2.557502031326294 + }, + { + "auxiliary_loss_clip": 0.06566584, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 0.06318649, + "balance_loss_mlp": 0.01260567, + "epoch": 0.2051405381031114, + "flos": 23626443876480.0, + "grad_norm": 1.9814328821552187, + "language_loss": 0.73997778, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81847459, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.22521973, + "step": 3412, + "time_per_iteration": 4.038960695266724 + }, + { + "auxiliary_loss_clip": 0.06568237, + "auxiliary_loss_mlp": 0.01280941, + "balance_loss_clip": 0.06316938, + "balance_loss_mlp": 0.01257183, + "epoch": 0.20520066135577936, + "flos": 19570338483840.0, + "grad_norm": 2.4438525241528963, + "language_loss": 0.79063112, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.86912292, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23754883, + "step": 3413, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.0655475, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06315412, + "balance_loss_mlp": 0.01259423, + "epoch": 0.20526078460844732, + "flos": 17682184978560.0, + "grad_norm": 1.8594099787920526, + "language_loss": 0.85324407, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.93161035, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2244873, + "step": 3414, + "time_per_iteration": 2.51891827583313 + }, + { + "auxiliary_loss_clip": 0.06556672, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 0.0631127, + "balance_loss_mlp": 0.01261451, + "epoch": 0.2053209078611153, + "flos": 21505632710400.0, + "grad_norm": 1.8989416463636506, + "language_loss": 0.8139196, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.89232612, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22521973, + "step": 3415, + "time_per_iteration": 2.534064769744873 + }, + { + "auxiliary_loss_clip": 0.06545444, + "auxiliary_loss_mlp": 0.01280017, + "balance_loss_clip": 0.06300274, + "balance_loss_mlp": 0.01259048, + "epoch": 0.20538103111378325, + "flos": 25670163686400.0, + "grad_norm": 1.9272907146050138, + "language_loss": 0.73450923, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.81276381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.20959473, + "step": 3416, + "time_per_iteration": 2.5862622261047363 + }, + { + "auxiliary_loss_clip": 0.06555279, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06309061, + "balance_loss_mlp": 0.01256342, + "epoch": 0.20544115436645122, + "flos": 23155663311360.0, + "grad_norm": 3.21470343355828, + "language_loss": 0.79731691, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87565553, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.22253418, + "step": 3417, + "time_per_iteration": 2.5488288402557373 + }, + { + "auxiliary_loss_clip": 0.06553051, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01258248, + "epoch": 0.20550127761911918, + "flos": 19396435334400.0, + "grad_norm": 3.2012221600430744, + "language_loss": 0.88593423, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96428442, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23681641, + "step": 3418, + "time_per_iteration": 5.385840177536011 + }, + { + "auxiliary_loss_clip": 0.06553373, + "auxiliary_loss_mlp": 0.01284895, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.0125998, + "epoch": 0.20556140087178718, + "flos": 22899721415040.0, + "grad_norm": 2.325256215928591, + "language_loss": 0.63040721, + "learning_rate": 3.685296133421035e-06, + "loss": 0.70878994, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24926758, + "step": 3419, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.06563735, + "auxiliary_loss_mlp": 0.01291649, + "balance_loss_clip": 0.06310479, + "balance_loss_mlp": 0.01265554, + "epoch": 0.20562152412445514, + "flos": 19795365423360.0, + "grad_norm": 1.7732270709951168, + "language_loss": 0.86988509, + "learning_rate": 3.685086390100674e-06, + "loss": 0.948439, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26098633, + "step": 3420, + "time_per_iteration": 2.5364928245544434 + }, + { + "auxiliary_loss_clip": 0.06546585, + "auxiliary_loss_mlp": 0.01284653, + "balance_loss_clip": 0.0630153, + "balance_loss_mlp": 0.01261109, + "epoch": 0.2056816473771231, + "flos": 31509728507520.0, + "grad_norm": 10.333340616962191, + "language_loss": 0.71886712, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79717946, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.2355957, + "step": 3421, + "time_per_iteration": 2.6350786685943604 + }, + { + "auxiliary_loss_clip": 0.06544094, + "auxiliary_loss_mlp": 0.01288814, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0126564, + "epoch": 0.20574177062979107, + "flos": 23265095143680.0, + "grad_norm": 2.122387036588777, + "language_loss": 0.72175372, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.8000828, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23168945, + "step": 3422, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.06409879, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.01263078, + "epoch": 0.20580189388245904, + "flos": 70331124291840.0, + "grad_norm": 0.7131964126658911, + "language_loss": 0.551377, + "learning_rate": 3.684456776779548e-06, + "loss": 0.62817442, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06799316, + "step": 3423, + "time_per_iteration": 3.2106337547302246 + }, + { + "auxiliary_loss_clip": 0.06548166, + "auxiliary_loss_mlp": 0.0128448, + "balance_loss_clip": 0.06301543, + "balance_loss_mlp": 0.01261091, + "epoch": 0.205862017135127, + "flos": 30745802033280.0, + "grad_norm": 1.8660135712145316, + "language_loss": 0.72238076, + "learning_rate": 3.684246777912353e-06, + "loss": 0.80070728, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23400879, + "step": 3424, + "time_per_iteration": 2.614389181137085 + }, + { + "auxiliary_loss_clip": 0.06544662, + "auxiliary_loss_mlp": 0.01287262, + "balance_loss_clip": 0.06303795, + "balance_loss_mlp": 0.01263229, + "epoch": 0.20592214038779497, + "flos": 21330932947200.0, + "grad_norm": 1.6926765615616197, + "language_loss": 0.75646138, + "learning_rate": 3.684036715178351e-06, + "loss": 0.83478063, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.24023438, + "step": 3425, + "time_per_iteration": 2.5351436138153076 + }, + { + "auxiliary_loss_clip": 0.06546403, + "auxiliary_loss_mlp": 0.01289796, + "balance_loss_clip": 0.06304145, + "balance_loss_mlp": 0.01266813, + "epoch": 0.20598226364046296, + "flos": 22898002406400.0, + "grad_norm": 1.848184132977354, + "language_loss": 0.88618112, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9645431, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22998047, + "step": 3426, + "time_per_iteration": 2.604752779006958 + }, + { + "auxiliary_loss_clip": 0.06551787, + "auxiliary_loss_mlp": 0.01284615, + "balance_loss_clip": 0.06311674, + "balance_loss_mlp": 0.01261226, + "epoch": 0.20604238689313092, + "flos": 23885362592640.0, + "grad_norm": 1.5517486951437824, + "language_loss": 0.77144063, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8498047, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.23376465, + "step": 3427, + "time_per_iteration": 2.5526115894317627 + }, + { + "auxiliary_loss_clip": 0.06556956, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 0.06309945, + "balance_loss_mlp": 0.01264661, + "epoch": 0.2061025101457989, + "flos": 22498024141440.0, + "grad_norm": 1.8896972045039995, + "language_loss": 0.74443614, + "learning_rate": 3.683406143855174e-06, + "loss": 0.822878, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3428, + "time_per_iteration": 2.5644474029541016 + }, + { + "auxiliary_loss_clip": 0.06552382, + "auxiliary_loss_mlp": 0.01283805, + "balance_loss_clip": 0.06304047, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20616263339846685, + "flos": 22784713286400.0, + "grad_norm": 1.96097325322206, + "language_loss": 0.74164659, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.82000846, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3429, + "time_per_iteration": 2.5337913036346436 + }, + { + "auxiliary_loss_clip": 0.06551956, + "auxiliary_loss_mlp": 0.01286455, + "balance_loss_clip": 0.06304303, + "balance_loss_mlp": 0.01263126, + "epoch": 0.20622275665113482, + "flos": 20887755102720.0, + "grad_norm": 2.9642283368918863, + "language_loss": 0.86220586, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.94058996, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.23327637, + "step": 3430, + "time_per_iteration": 2.5939443111419678 + }, + { + "auxiliary_loss_clip": 0.06546243, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06300765, + "balance_loss_mlp": 0.01257607, + "epoch": 0.20628287990380278, + "flos": 19360489132800.0, + "grad_norm": 1.6588894263331828, + "language_loss": 0.70011377, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77838504, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.23278809, + "step": 3431, + "time_per_iteration": 2.565840482711792 + }, + { + "auxiliary_loss_clip": 0.06410907, + "auxiliary_loss_mlp": 0.0126731, + "balance_loss_clip": 0.06280327, + "balance_loss_mlp": 0.01261215, + "epoch": 0.20634300315647078, + "flos": 71536970799360.0, + "grad_norm": 0.791675242165557, + "language_loss": 0.60400987, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.68079197, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0609436, + "step": 3432, + "time_per_iteration": 3.305082082748413 + }, + { + "auxiliary_loss_clip": 0.06552991, + "auxiliary_loss_mlp": 0.01280414, + "balance_loss_clip": 0.06308176, + "balance_loss_mlp": 0.01257561, + "epoch": 0.20640312640913874, + "flos": 21730072671360.0, + "grad_norm": 1.5897016059046762, + "language_loss": 0.72477019, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80310422, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.22875977, + "step": 3433, + "time_per_iteration": 2.564393997192383 + }, + { + "auxiliary_loss_clip": 0.06561184, + "auxiliary_loss_mlp": 0.01281531, + "balance_loss_clip": 0.06312474, + "balance_loss_mlp": 0.01258512, + "epoch": 0.2064632496618067, + "flos": 20560256219520.0, + "grad_norm": 1.7877531320590552, + "language_loss": 0.87141019, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.94983733, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23010254, + "step": 3434, + "time_per_iteration": 2.5466108322143555 + }, + { + "auxiliary_loss_clip": 0.06556005, + "auxiliary_loss_mlp": 0.01283316, + "balance_loss_clip": 0.06305495, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20652337291447467, + "flos": 29830669666560.0, + "grad_norm": 1.6526860814470912, + "language_loss": 0.6970489, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77544212, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2388916, + "step": 3435, + "time_per_iteration": 2.613896369934082 + }, + { + "auxiliary_loss_clip": 0.06545977, + "auxiliary_loss_mlp": 0.01289312, + "balance_loss_clip": 0.0630382, + "balance_loss_mlp": 0.01264325, + "epoch": 0.20658349616714264, + "flos": 26220844719360.0, + "grad_norm": 1.7674379542335852, + "language_loss": 0.89957321, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97792608, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.24975586, + "step": 3436, + "time_per_iteration": 2.590360641479492 + }, + { + "auxiliary_loss_clip": 0.06548543, + "auxiliary_loss_mlp": 0.01277538, + "balance_loss_clip": 0.06303848, + "balance_loss_mlp": 0.01254209, + "epoch": 0.2066436194198106, + "flos": 26001477930240.0, + "grad_norm": 1.7140409089026185, + "language_loss": 0.77244872, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.8507095, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.23339844, + "step": 3437, + "time_per_iteration": 2.6068568229675293 + }, + { + "auxiliary_loss_clip": 0.06548648, + "auxiliary_loss_mlp": 0.01280201, + "balance_loss_clip": 0.06300757, + "balance_loss_mlp": 0.01257682, + "epoch": 0.20670374267247857, + "flos": 21367466127360.0, + "grad_norm": 2.0146667208247355, + "language_loss": 0.78725338, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86554188, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.22521973, + "step": 3438, + "time_per_iteration": 2.567963123321533 + }, + { + "auxiliary_loss_clip": 0.06407821, + "auxiliary_loss_mlp": 0.01263014, + "balance_loss_clip": 0.06278364, + "balance_loss_mlp": 0.01257164, + "epoch": 0.20676386592514656, + "flos": 66403108264320.0, + "grad_norm": 0.8029327028802032, + "language_loss": 0.66817588, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.74488425, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05844116, + "step": 3439, + "time_per_iteration": 3.1231849193573 + }, + { + "auxiliary_loss_clip": 0.06557775, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 0.06302103, + "balance_loss_mlp": 0.01260423, + "epoch": 0.20682398917781453, + "flos": 17280278069760.0, + "grad_norm": 1.9287299109512155, + "language_loss": 0.8404541, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.91886795, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23168945, + "step": 3440, + "time_per_iteration": 2.496563196182251 + }, + { + "auxiliary_loss_clip": 0.06545421, + "auxiliary_loss_mlp": 0.01282262, + "balance_loss_clip": 0.06298509, + "balance_loss_mlp": 0.0126028, + "epoch": 0.2068841124304825, + "flos": 18083127565440.0, + "grad_norm": 3.100665935871663, + "language_loss": 0.85299611, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93127292, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2199707, + "step": 3441, + "time_per_iteration": 2.528823137283325 + }, + { + "auxiliary_loss_clip": 0.06546343, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06300771, + "balance_loss_mlp": 0.01258958, + "epoch": 0.20694423568315046, + "flos": 27354798823680.0, + "grad_norm": 1.6487564578537555, + "language_loss": 0.86298448, + "learning_rate": 3.680455884806959e-06, + "loss": 0.94127464, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.23693848, + "step": 3442, + "time_per_iteration": 2.5904433727264404 + }, + { + "auxiliary_loss_clip": 0.06553168, + "auxiliary_loss_mlp": 0.0128107, + "balance_loss_clip": 0.06302296, + "balance_loss_mlp": 0.01256298, + "epoch": 0.20700435893581842, + "flos": 20236027645440.0, + "grad_norm": 1.991917549605425, + "language_loss": 0.74110967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81945205, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24755859, + "step": 3443, + "time_per_iteration": 2.546297311782837 + }, + { + "auxiliary_loss_clip": 0.06540793, + "auxiliary_loss_mlp": 0.01282, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2070644821884864, + "flos": 20637347575680.0, + "grad_norm": 5.522598582225395, + "language_loss": 0.86263227, + "learning_rate": 3.680033399147797e-06, + "loss": 0.94086015, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22814941, + "step": 3444, + "time_per_iteration": 2.5644776821136475 + }, + { + "auxiliary_loss_clip": 0.06396829, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06267206, + "balance_loss_mlp": 0.01264399, + "epoch": 0.20712460544115438, + "flos": 65960098128000.0, + "grad_norm": 0.6752802627643808, + "language_loss": 0.56895542, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.64562953, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06185913, + "step": 3445, + "time_per_iteration": 3.133159637451172 + }, + { + "auxiliary_loss_clip": 0.06550106, + "auxiliary_loss_mlp": 0.0128273, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01259412, + "epoch": 0.20718472869382235, + "flos": 19431542995200.0, + "grad_norm": 1.845349461285762, + "language_loss": 0.78388685, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.86221522, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23327637, + "step": 3446, + "time_per_iteration": 2.5563149452209473 + }, + { + "auxiliary_loss_clip": 0.06562304, + "auxiliary_loss_mlp": 0.0128875, + "balance_loss_clip": 0.06302087, + "balance_loss_mlp": 0.01263215, + "epoch": 0.2072448519464903, + "flos": 24506007384960.0, + "grad_norm": 2.528724295630225, + "language_loss": 0.63215572, + "learning_rate": 3.679399192876334e-06, + "loss": 0.7106663, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 3447, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.06550243, + "auxiliary_loss_mlp": 0.01285454, + "balance_loss_clip": 0.06302016, + "balance_loss_mlp": 0.01261624, + "epoch": 0.20730497519915828, + "flos": 23082345388800.0, + "grad_norm": 1.7246458475869415, + "language_loss": 0.87330115, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95165813, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.23840332, + "step": 3448, + "time_per_iteration": 2.5367424488067627 + }, + { + "auxiliary_loss_clip": 0.06547908, + "auxiliary_loss_mlp": 0.01287375, + "balance_loss_clip": 0.06301224, + "balance_loss_mlp": 0.0126407, + "epoch": 0.20736509845182624, + "flos": 21075368394240.0, + "grad_norm": 2.238353970842136, + "language_loss": 0.75934261, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.83769548, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23291016, + "step": 3449, + "time_per_iteration": 3.94480562210083 + }, + { + "auxiliary_loss_clip": 0.06557415, + "auxiliary_loss_mlp": 0.01291462, + "balance_loss_clip": 0.06305711, + "balance_loss_mlp": 0.01267262, + "epoch": 0.2074252217044942, + "flos": 17638021077120.0, + "grad_norm": 1.9890451191355467, + "language_loss": 0.77508813, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.8535769, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24243164, + "step": 3450, + "time_per_iteration": 2.545430898666382 + }, + { + "auxiliary_loss_clip": 0.06561074, + "auxiliary_loss_mlp": 0.01294493, + "balance_loss_clip": 0.06309673, + "balance_loss_mlp": 0.01270579, + "epoch": 0.20748534495716217, + "flos": 23553209808000.0, + "grad_norm": 2.274256725147599, + "language_loss": 0.823879, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90243471, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23913574, + "step": 3451, + "time_per_iteration": 4.003388404846191 + }, + { + "auxiliary_loss_clip": 0.0640305, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 0.06273949, + "balance_loss_mlp": 0.01248494, + "epoch": 0.20754546820983016, + "flos": 52268666757120.0, + "grad_norm": 0.7675919354914552, + "language_loss": 0.56549037, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.64206523, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05941772, + "step": 3452, + "time_per_iteration": 3.0660083293914795 + }, + { + "auxiliary_loss_clip": 0.06557937, + "auxiliary_loss_mlp": 0.01287582, + "balance_loss_clip": 0.06309802, + "balance_loss_mlp": 0.01264956, + "epoch": 0.20760559146249813, + "flos": 20418609692160.0, + "grad_norm": 1.8872949255610445, + "language_loss": 0.88967919, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9681344, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.22619629, + "step": 3453, + "time_per_iteration": 2.581430673599243 + }, + { + "auxiliary_loss_clip": 0.06554953, + "auxiliary_loss_mlp": 0.01287205, + "balance_loss_clip": 0.06307904, + "balance_loss_mlp": 0.01263256, + "epoch": 0.2076657147151661, + "flos": 23192825397120.0, + "grad_norm": 1.4776896143180385, + "language_loss": 0.80720532, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88562691, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23962402, + "step": 3454, + "time_per_iteration": 2.5793018341064453 + }, + { + "auxiliary_loss_clip": 0.06549348, + "auxiliary_loss_mlp": 0.01286388, + "balance_loss_clip": 0.06301847, + "balance_loss_mlp": 0.01263476, + "epoch": 0.20772583796783406, + "flos": 18298595139840.0, + "grad_norm": 4.241833159654324, + "language_loss": 0.78446364, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.86282104, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.22912598, + "step": 3455, + "time_per_iteration": 2.5377535820007324 + }, + { + "auxiliary_loss_clip": 0.0654678, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01256547, + "epoch": 0.20778596122050202, + "flos": 17608531639680.0, + "grad_norm": 1.6321737814924744, + "language_loss": 0.81251496, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.89077407, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22595215, + "step": 3456, + "time_per_iteration": 2.5125768184661865 + }, + { + "auxiliary_loss_clip": 0.06554688, + "auxiliary_loss_mlp": 0.01282924, + "balance_loss_clip": 0.06304802, + "balance_loss_mlp": 0.01259893, + "epoch": 0.20784608447317, + "flos": 23812380086400.0, + "grad_norm": 2.3276439316102695, + "language_loss": 0.79071975, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.86909586, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.23022461, + "step": 3457, + "time_per_iteration": 5.41590428352356 + }, + { + "auxiliary_loss_clip": 0.06553855, + "auxiliary_loss_mlp": 0.01279092, + "balance_loss_clip": 0.0630386, + "balance_loss_mlp": 0.01255739, + "epoch": 0.20790620772583795, + "flos": 17645022892800.0, + "grad_norm": 1.9963286729709264, + "language_loss": 0.84664595, + "learning_rate": 3.677068867939333e-06, + "loss": 0.9249754, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23364258, + "step": 3458, + "time_per_iteration": 2.610107183456421 + }, + { + "auxiliary_loss_clip": 0.06541788, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06299603, + "balance_loss_mlp": 0.01254289, + "epoch": 0.20796633097850595, + "flos": 27680997968640.0, + "grad_norm": 1.7522329071194311, + "language_loss": 0.76853168, + "learning_rate": 3.676856638489272e-06, + "loss": 0.8467201, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.2277832, + "step": 3459, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.06543219, + "auxiliary_loss_mlp": 0.01279579, + "balance_loss_clip": 0.06299554, + "balance_loss_mlp": 0.01257024, + "epoch": 0.2080264542311739, + "flos": 19251770060160.0, + "grad_norm": 1.8057193688460893, + "language_loss": 0.77803749, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.85626543, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22570801, + "step": 3460, + "time_per_iteration": 2.5500359535217285 + }, + { + "auxiliary_loss_clip": 0.06544735, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06297737, + "balance_loss_mlp": 0.01255315, + "epoch": 0.20808657748384188, + "flos": 27533146239360.0, + "grad_norm": 1.865214089074118, + "language_loss": 0.76152873, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.8397454, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21618652, + "step": 3461, + "time_per_iteration": 2.575975179672241 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.01279751, + "balance_loss_clip": 0.06301013, + "balance_loss_mlp": 0.01256183, + "epoch": 0.20814670073650984, + "flos": 26914262382720.0, + "grad_norm": 2.229402903272821, + "language_loss": 0.89438462, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97273135, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23571777, + "step": 3462, + "time_per_iteration": 2.5732173919677734 + }, + { + "auxiliary_loss_clip": 0.06402825, + "auxiliary_loss_mlp": 0.01283843, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01278395, + "epoch": 0.2082068239891778, + "flos": 70195850674560.0, + "grad_norm": 0.9150130859854356, + "language_loss": 0.59001637, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.66688299, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.05456543, + "step": 3463, + "time_per_iteration": 3.269202709197998 + }, + { + "auxiliary_loss_clip": 0.06550549, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01257929, + "epoch": 0.20826694724184577, + "flos": 24614978019840.0, + "grad_norm": 2.6522237220698663, + "language_loss": 0.66949397, + "learning_rate": 3.675794537601429e-06, + "loss": 0.74782729, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.2487793, + "step": 3464, + "time_per_iteration": 2.5638158321380615 + }, + { + "auxiliary_loss_clip": 0.06556059, + "auxiliary_loss_mlp": 0.01287892, + "balance_loss_clip": 0.06307128, + "balance_loss_mlp": 0.01263299, + "epoch": 0.20832707049451377, + "flos": 12897218845440.0, + "grad_norm": 2.2476817474527913, + "language_loss": 0.84321886, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.9216584, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.24609375, + "step": 3465, + "time_per_iteration": 2.5794646739959717 + }, + { + "auxiliary_loss_clip": 0.06542073, + "auxiliary_loss_mlp": 0.01282156, + "balance_loss_clip": 0.06295872, + "balance_loss_mlp": 0.01258326, + "epoch": 0.20838719374718173, + "flos": 22205129794560.0, + "grad_norm": 3.281235222185926, + "language_loss": 0.82741451, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.90565681, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.23828125, + "step": 3466, + "time_per_iteration": 2.540011405944824 + }, + { + "auxiliary_loss_clip": 0.06540319, + "auxiliary_loss_mlp": 0.01287937, + "balance_loss_clip": 0.06300111, + "balance_loss_mlp": 0.01267243, + "epoch": 0.2084473169998497, + "flos": 15164036951040.0, + "grad_norm": 2.490655035944783, + "language_loss": 0.82892549, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90720803, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.20690918, + "step": 3467, + "time_per_iteration": 2.54622745513916 + }, + { + "auxiliary_loss_clip": 0.06540733, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 0.06303266, + "balance_loss_mlp": 0.01268167, + "epoch": 0.20850744025251766, + "flos": 17462482773120.0, + "grad_norm": 1.8114532422505003, + "language_loss": 0.82299387, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90129268, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2097168, + "step": 3468, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06553383, + "auxiliary_loss_mlp": 0.01280357, + "balance_loss_clip": 0.06302625, + "balance_loss_mlp": 0.01257158, + "epoch": 0.20856756350518563, + "flos": 25705439055360.0, + "grad_norm": 1.667306072143411, + "language_loss": 0.9042781, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98261553, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23217773, + "step": 3469, + "time_per_iteration": 2.6107866764068604 + }, + { + "auxiliary_loss_clip": 0.0655106, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 0.06308927, + "balance_loss_mlp": 0.01259872, + "epoch": 0.2086276867578536, + "flos": 37898213425920.0, + "grad_norm": 1.9476878714472061, + "language_loss": 0.77294397, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85127008, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21679688, + "step": 3470, + "time_per_iteration": 2.7083425521850586 + }, + { + "auxiliary_loss_clip": 0.06547298, + "auxiliary_loss_mlp": 0.01289218, + "balance_loss_clip": 0.06307482, + "balance_loss_mlp": 0.01266283, + "epoch": 0.20868781001052156, + "flos": 25564169871360.0, + "grad_norm": 1.8036684586339249, + "language_loss": 0.76289082, + "learning_rate": 3.674304927640011e-06, + "loss": 0.84125602, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.22937012, + "step": 3471, + "time_per_iteration": 2.589884042739868 + }, + { + "auxiliary_loss_clip": 0.06554438, + "auxiliary_loss_mlp": 0.01280867, + "balance_loss_clip": 0.06303854, + "balance_loss_mlp": 0.01259028, + "epoch": 0.20874793326318955, + "flos": 27536961600000.0, + "grad_norm": 1.6381609540737498, + "language_loss": 0.76341867, + "learning_rate": 3.67409187219312e-06, + "loss": 0.84177172, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.21813965, + "step": 3472, + "time_per_iteration": 2.610260009765625 + }, + { + "auxiliary_loss_clip": 0.06544036, + "auxiliary_loss_mlp": 0.01279562, + "balance_loss_clip": 0.06302247, + "balance_loss_mlp": 0.01259022, + "epoch": 0.20880805651585752, + "flos": 18554243546880.0, + "grad_norm": 2.073955911698539, + "language_loss": 0.85418117, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93241715, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.20532227, + "step": 3473, + "time_per_iteration": 2.5741372108459473 + }, + { + "auxiliary_loss_clip": 0.06431094, + "auxiliary_loss_mlp": 0.01255526, + "balance_loss_clip": 0.06305239, + "balance_loss_mlp": 0.01250132, + "epoch": 0.20886817976852548, + "flos": 65966596819200.0, + "grad_norm": 0.8661888314681573, + "language_loss": 0.63746876, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.71433502, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.05401611, + "step": 3474, + "time_per_iteration": 3.061617612838745 + }, + { + "auxiliary_loss_clip": 0.06545534, + "auxiliary_loss_mlp": 0.01278543, + "balance_loss_clip": 0.06299987, + "balance_loss_mlp": 0.01255751, + "epoch": 0.20892830302119345, + "flos": 36548120914560.0, + "grad_norm": 1.9594452651536962, + "language_loss": 0.70746702, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.78570777, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22790527, + "step": 3475, + "time_per_iteration": 2.7295854091644287 + }, + { + "auxiliary_loss_clip": 0.06544538, + "auxiliary_loss_mlp": 0.01277403, + "balance_loss_clip": 0.06299123, + "balance_loss_mlp": 0.01255754, + "epoch": 0.2089884262738614, + "flos": 20962582398720.0, + "grad_norm": 1.6086426160627472, + "language_loss": 0.70801485, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21643066, + "step": 3476, + "time_per_iteration": 2.6065874099731445 + }, + { + "auxiliary_loss_clip": 0.06538086, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06299278, + "balance_loss_mlp": 0.0125523, + "epoch": 0.20904854952652938, + "flos": 22790666926080.0, + "grad_norm": 1.9785394209574967, + "language_loss": 0.90003526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9781692, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.20080566, + "step": 3477, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.06542666, + "auxiliary_loss_mlp": 0.01278801, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257594, + "epoch": 0.20910867277919734, + "flos": 27309838308480.0, + "grad_norm": 2.554960999675803, + "language_loss": 0.69433093, + "learning_rate": 3.672812206678344e-06, + "loss": 0.77254558, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.21203613, + "step": 3478, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.0654031, + "auxiliary_loss_mlp": 0.01282288, + "balance_loss_clip": 0.06298592, + "balance_loss_mlp": 0.01260461, + "epoch": 0.20916879603186533, + "flos": 14324444640000.0, + "grad_norm": 1.9959140715838508, + "language_loss": 0.85550553, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93373156, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21813965, + "step": 3479, + "time_per_iteration": 2.5808637142181396 + }, + { + "auxiliary_loss_clip": 0.06542581, + "auxiliary_loss_mlp": 0.01279649, + "balance_loss_clip": 0.06299447, + "balance_loss_mlp": 0.01258072, + "epoch": 0.2092289192845333, + "flos": 22279537820160.0, + "grad_norm": 2.3833241848820372, + "language_loss": 0.75129831, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.82952058, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21569824, + "step": 3480, + "time_per_iteration": 2.519789218902588 + }, + { + "auxiliary_loss_clip": 0.06546038, + "auxiliary_loss_mlp": 0.01278892, + "balance_loss_clip": 0.06306421, + "balance_loss_mlp": 0.01258495, + "epoch": 0.20928904253720126, + "flos": 14836118797440.0, + "grad_norm": 2.1621149118450163, + "language_loss": 0.7689389, + "learning_rate": 3.67217151746346e-06, + "loss": 0.84718817, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20410156, + "step": 3481, + "time_per_iteration": 2.541019916534424 + }, + { + "auxiliary_loss_clip": 0.06542054, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.06299154, + "balance_loss_mlp": 0.01257718, + "epoch": 0.20934916578986923, + "flos": 23266017538560.0, + "grad_norm": 1.9029543431357738, + "language_loss": 0.85756385, + "learning_rate": 3.671957827563209e-06, + "loss": 0.93578184, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.22021484, + "step": 3482, + "time_per_iteration": 2.57550048828125 + }, + { + "auxiliary_loss_clip": 0.06538534, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 0.0629866, + "balance_loss_mlp": 0.01260237, + "epoch": 0.2094092890425372, + "flos": 32022492768000.0, + "grad_norm": 2.0122422455266076, + "language_loss": 0.71876764, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.79696846, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.21325684, + "step": 3483, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.06543796, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125567, + "epoch": 0.20946941229520516, + "flos": 20016744710400.0, + "grad_norm": 1.623254768822543, + "language_loss": 0.75620067, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.83441281, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21728516, + "step": 3484, + "time_per_iteration": 2.537745714187622 + }, + { + "auxiliary_loss_clip": 0.06537648, + "auxiliary_loss_mlp": 0.01274667, + "balance_loss_clip": 0.0629506, + "balance_loss_mlp": 0.01252733, + "epoch": 0.20952953554787315, + "flos": 30748401509760.0, + "grad_norm": 1.6710062021876058, + "language_loss": 0.71473777, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.79286093, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21936035, + "step": 3485, + "time_per_iteration": 2.6310439109802246 + }, + { + "auxiliary_loss_clip": 0.0654947, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06304678, + "balance_loss_mlp": 0.01258517, + "epoch": 0.20958965880054112, + "flos": 27055950837120.0, + "grad_norm": 1.7793136829828902, + "language_loss": 0.83105123, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90936482, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23376465, + "step": 3486, + "time_per_iteration": 2.5819222927093506 + }, + { + "auxiliary_loss_clip": 0.06539689, + "auxiliary_loss_mlp": 0.01279221, + "balance_loss_clip": 0.06297638, + "balance_loss_mlp": 0.01257978, + "epoch": 0.20964978205320908, + "flos": 34212680714880.0, + "grad_norm": 2.582218695391969, + "language_loss": 0.87821579, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.95640486, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21240234, + "step": 3487, + "time_per_iteration": 2.639369487762451 + }, + { + "auxiliary_loss_clip": 0.06538714, + "auxiliary_loss_mlp": 0.01279661, + "balance_loss_clip": 0.06298582, + "balance_loss_mlp": 0.01258227, + "epoch": 0.20970990530587705, + "flos": 23484168443520.0, + "grad_norm": 2.287931950731532, + "language_loss": 0.72719586, + "learning_rate": 3.670674357028504e-06, + "loss": 0.80537963, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21411133, + "step": 3488, + "time_per_iteration": 3.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.06540683, + "auxiliary_loss_mlp": 0.01275293, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01255123, + "epoch": 0.209770028558545, + "flos": 18557346147840.0, + "grad_norm": 2.67396224290917, + "language_loss": 0.81189376, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89005351, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20178223, + "step": 3489, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.0654545, + "auxiliary_loss_mlp": 0.01278304, + "balance_loss_clip": 0.06303608, + "balance_loss_mlp": 0.0125724, + "epoch": 0.20983015181121298, + "flos": 21623533804800.0, + "grad_norm": 2.0567102060198743, + "language_loss": 0.73407692, + "learning_rate": 3.670246026613266e-06, + "loss": 0.81231445, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21057129, + "step": 3490, + "time_per_iteration": 2.5622947216033936 + }, + { + "auxiliary_loss_clip": 0.06534347, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06300151, + "balance_loss_mlp": 0.01260128, + "epoch": 0.20989027506388094, + "flos": 16619787861120.0, + "grad_norm": 1.7677892351641744, + "language_loss": 0.71503973, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.7931931, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20849609, + "step": 3491, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.06542461, + "auxiliary_loss_mlp": 0.01283797, + "balance_loss_clip": 0.0629908, + "balance_loss_mlp": 0.01260957, + "epoch": 0.20995039831654894, + "flos": 23222692177920.0, + "grad_norm": 2.702657778988086, + "language_loss": 0.80329478, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88155735, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22839355, + "step": 3492, + "time_per_iteration": 2.5376975536346436 + }, + { + "auxiliary_loss_clip": 0.06546506, + "auxiliary_loss_mlp": 0.01283519, + "balance_loss_clip": 0.06307527, + "balance_loss_mlp": 0.01262741, + "epoch": 0.2100105215692169, + "flos": 18152881689600.0, + "grad_norm": 1.9319737068083613, + "language_loss": 0.87613726, + "learning_rate": 3.669603055991502e-06, + "loss": 0.95443749, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20800781, + "step": 3493, + "time_per_iteration": 2.5462660789489746 + }, + { + "auxiliary_loss_clip": 0.06538918, + "auxiliary_loss_mlp": 0.01283808, + "balance_loss_clip": 0.06303683, + "balance_loss_mlp": 0.01262673, + "epoch": 0.21007064482188487, + "flos": 15967179936000.0, + "grad_norm": 1.7380368048158776, + "language_loss": 0.69753766, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.77576494, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.21130371, + "step": 3494, + "time_per_iteration": 2.523575782775879 + }, + { + "auxiliary_loss_clip": 0.0654956, + "auxiliary_loss_mlp": 0.0128408, + "balance_loss_clip": 0.06306064, + "balance_loss_mlp": 0.01262598, + "epoch": 0.21013076807455283, + "flos": 32242614243840.0, + "grad_norm": 1.6795437076377473, + "language_loss": 0.79639518, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.87473154, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21472168, + "step": 3495, + "time_per_iteration": 2.679564952850342 + }, + { + "auxiliary_loss_clip": 0.06543255, + "auxiliary_loss_mlp": 0.01280683, + "balance_loss_clip": 0.06300748, + "balance_loss_mlp": 0.01258832, + "epoch": 0.2101908913272208, + "flos": 23703493305600.0, + "grad_norm": 2.110842443067005, + "language_loss": 0.77733672, + "learning_rate": 3.668959515566116e-06, + "loss": 0.85557616, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21862793, + "step": 3496, + "time_per_iteration": 2.5728261470794678 + }, + { + "auxiliary_loss_clip": 0.06546371, + "auxiliary_loss_mlp": 0.01280297, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257993, + "epoch": 0.21025101457988876, + "flos": 20381992657920.0, + "grad_norm": 2.1840810602746643, + "language_loss": 0.82214069, + "learning_rate": 3.668744875505915e-06, + "loss": 0.90040743, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22302246, + "step": 3497, + "time_per_iteration": 5.435751438140869 + }, + { + "auxiliary_loss_clip": 0.06554863, + "auxiliary_loss_mlp": 0.01281759, + "balance_loss_clip": 0.06307989, + "balance_loss_mlp": 0.01259205, + "epoch": 0.21031113783255675, + "flos": 25782740046720.0, + "grad_norm": 1.9653925911520136, + "language_loss": 0.68009126, + "learning_rate": 3.668530172166741e-06, + "loss": 0.75845742, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3498, + "time_per_iteration": 2.6047511100769043 + }, + { + "auxiliary_loss_clip": 0.06550896, + "auxiliary_loss_mlp": 0.01291723, + "balance_loss_clip": 0.06304521, + "balance_loss_mlp": 0.01269789, + "epoch": 0.21037126108522472, + "flos": 22024769880960.0, + "grad_norm": 1.5964372308761317, + "language_loss": 0.81248403, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.89091027, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21948242, + "step": 3499, + "time_per_iteration": 2.5279107093811035 + }, + { + "auxiliary_loss_clip": 0.06537838, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 0.06300277, + "balance_loss_mlp": 0.01257911, + "epoch": 0.21043138433789269, + "flos": 25340861940480.0, + "grad_norm": 2.3111316875342274, + "language_loss": 0.78733355, + "learning_rate": 3.668100575684043e-06, + "loss": 0.86549306, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20214844, + "step": 3500, + "time_per_iteration": 2.5789358615875244 + }, + { + "auxiliary_loss_clip": 0.06548081, + "auxiliary_loss_mlp": 0.01281815, + "balance_loss_clip": 0.06307902, + "balance_loss_mlp": 0.01259809, + "epoch": 0.21049150759056065, + "flos": 25563708673920.0, + "grad_norm": 1.5222387073827752, + "language_loss": 0.74519855, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82349753, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.22021484, + "step": 3501, + "time_per_iteration": 2.5740344524383545 + }, + { + "auxiliary_loss_clip": 0.06532234, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06293183, + "balance_loss_mlp": 0.01258521, + "epoch": 0.21055163084322862, + "flos": 24501982389120.0, + "grad_norm": 1.5726278305934103, + "language_loss": 0.75732303, + "learning_rate": 3.667670726183183e-06, + "loss": 0.83544195, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.21142578, + "step": 3502, + "time_per_iteration": 2.564650535583496 + }, + { + "auxiliary_loss_clip": 0.06532737, + "auxiliary_loss_mlp": 0.01282141, + "balance_loss_clip": 0.06294994, + "balance_loss_mlp": 0.01260731, + "epoch": 0.21061175409589658, + "flos": 25746123012480.0, + "grad_norm": 2.0578640076956165, + "language_loss": 0.78642297, + "learning_rate": 3.667455706571316e-06, + "loss": 0.86457181, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.21411133, + "step": 3503, + "time_per_iteration": 2.5651087760925293 + }, + { + "auxiliary_loss_clip": 0.06548393, + "auxiliary_loss_mlp": 0.01287579, + "balance_loss_clip": 0.06300595, + "balance_loss_mlp": 0.01262426, + "epoch": 0.21067187734856455, + "flos": 18995115404160.0, + "grad_norm": 2.3829290271278363, + "language_loss": 0.79109055, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.86945021, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.25134277, + "step": 3504, + "time_per_iteration": 2.5907576084136963 + }, + { + "auxiliary_loss_clip": 0.06540846, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 0.06295908, + "balance_loss_mlp": 0.012561, + "epoch": 0.21073200060123254, + "flos": 24688337869440.0, + "grad_norm": 2.6276986020802386, + "language_loss": 0.77414715, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.85233212, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.21557617, + "step": 3505, + "time_per_iteration": 2.564504861831665 + }, + { + "auxiliary_loss_clip": 0.06529057, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06294015, + "balance_loss_mlp": 0.01257186, + "epoch": 0.2107921238539005, + "flos": 28557039605760.0, + "grad_norm": 2.0513581673642434, + "language_loss": 0.64351165, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.721578, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.20397949, + "step": 3506, + "time_per_iteration": 2.641390323638916 + }, + { + "auxiliary_loss_clip": 0.06535215, + "auxiliary_loss_mlp": 0.01278768, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01257656, + "epoch": 0.21085224710656847, + "flos": 25893094273920.0, + "grad_norm": 2.3889311598286436, + "language_loss": 0.82716179, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90530163, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21105957, + "step": 3507, + "time_per_iteration": 2.5718142986297607 + }, + { + "auxiliary_loss_clip": 0.06534198, + "auxiliary_loss_mlp": 0.01280018, + "balance_loss_clip": 0.06294642, + "balance_loss_mlp": 0.0125769, + "epoch": 0.21091237035923643, + "flos": 14981664539520.0, + "grad_norm": 1.9856074738329712, + "language_loss": 0.76547742, + "learning_rate": 3.666379660223824e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22338867, + "step": 3508, + "time_per_iteration": 2.5104117393493652 + }, + { + "auxiliary_loss_clip": 0.06543706, + "auxiliary_loss_mlp": 0.01282498, + "balance_loss_clip": 0.06299506, + "balance_loss_mlp": 0.01261159, + "epoch": 0.2109724936119044, + "flos": 16368080595840.0, + "grad_norm": 2.529935640705384, + "language_loss": 0.86242574, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.94068778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.21325684, + "step": 3509, + "time_per_iteration": 2.508370876312256 + }, + { + "auxiliary_loss_clip": 0.06541994, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 0.06295836, + "balance_loss_mlp": 0.01258679, + "epoch": 0.21103261686457236, + "flos": 31510315486080.0, + "grad_norm": 1.7053981088389916, + "language_loss": 0.68853724, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.76676404, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22009277, + "step": 3510, + "time_per_iteration": 2.6452746391296387 + }, + { + "auxiliary_loss_clip": 0.06542882, + "auxiliary_loss_mlp": 0.01284418, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01263259, + "epoch": 0.21109274011724033, + "flos": 27351360806400.0, + "grad_norm": 1.7932280077203222, + "language_loss": 0.7352736, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.8135466, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.21154785, + "step": 3511, + "time_per_iteration": 2.6538095474243164 + }, + { + "auxiliary_loss_clip": 0.06553793, + "auxiliary_loss_mlp": 0.01288613, + "balance_loss_clip": 0.06308056, + "balance_loss_mlp": 0.01265546, + "epoch": 0.21115286336990832, + "flos": 17825927857920.0, + "grad_norm": 2.4490749473958577, + "language_loss": 0.70309734, + "learning_rate": 3.665517685689794e-06, + "loss": 0.78152132, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.23071289, + "step": 3512, + "time_per_iteration": 2.5178020000457764 + }, + { + "auxiliary_loss_clip": 0.06542063, + "auxiliary_loss_mlp": 0.01280138, + "balance_loss_clip": 0.06299283, + "balance_loss_mlp": 0.01257739, + "epoch": 0.2112129866225763, + "flos": 27205228085760.0, + "grad_norm": 1.580176351931222, + "language_loss": 0.73930323, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81752527, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22412109, + "step": 3513, + "time_per_iteration": 2.62662410736084 + }, + { + "auxiliary_loss_clip": 0.06537203, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 0.06301522, + "balance_loss_mlp": 0.01260303, + "epoch": 0.21127310987524425, + "flos": 23737846279680.0, + "grad_norm": 1.7494748899805272, + "language_loss": 0.75353736, + "learning_rate": 3.665086319450502e-06, + "loss": 0.8317222, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20983887, + "step": 3514, + "time_per_iteration": 2.584502696990967 + }, + { + "auxiliary_loss_clip": 0.06546184, + "auxiliary_loss_mlp": 0.01281455, + "balance_loss_clip": 0.06301809, + "balance_loss_mlp": 0.01261309, + "epoch": 0.21133323312791222, + "flos": 18338356702080.0, + "grad_norm": 1.6761924057980855, + "language_loss": 0.77322358, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.85149997, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20141602, + "step": 3515, + "time_per_iteration": 2.552231550216675 + }, + { + "auxiliary_loss_clip": 0.06544478, + "auxiliary_loss_mlp": 0.0128088, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.01260865, + "epoch": 0.21139335638058018, + "flos": 17936994844800.0, + "grad_norm": 2.0687526262765212, + "language_loss": 0.69083852, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.76909214, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19995117, + "step": 3516, + "time_per_iteration": 2.535282611846924 + }, + { + "auxiliary_loss_clip": 0.0654862, + "auxiliary_loss_mlp": 0.01279905, + "balance_loss_clip": 0.0630609, + "balance_loss_mlp": 0.01257756, + "epoch": 0.21145347963324815, + "flos": 24579073745280.0, + "grad_norm": 1.818548989117399, + "language_loss": 0.85523438, + "learning_rate": 3.664438796560225e-06, + "loss": 0.93351966, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.22155762, + "step": 3517, + "time_per_iteration": 2.5862202644348145 + }, + { + "auxiliary_loss_clip": 0.06554718, + "auxiliary_loss_mlp": 0.01280908, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01260368, + "epoch": 0.21151360288591614, + "flos": 35854787105280.0, + "grad_norm": 2.178791897783965, + "language_loss": 0.6333189, + "learning_rate": 3.664222829354512e-06, + "loss": 0.71167523, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.20532227, + "step": 3518, + "time_per_iteration": 2.6618587970733643 + }, + { + "auxiliary_loss_clip": 0.0654604, + "auxiliary_loss_mlp": 0.0129195, + "balance_loss_clip": 0.06306089, + "balance_loss_mlp": 0.01271625, + "epoch": 0.2115737261385841, + "flos": 24647989328640.0, + "grad_norm": 1.8588369306942552, + "language_loss": 0.90024757, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97862744, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20336914, + "step": 3519, + "time_per_iteration": 2.5962281227111816 + }, + { + "auxiliary_loss_clip": 0.06553498, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01268945, + "epoch": 0.21163384939125207, + "flos": 25233652241280.0, + "grad_norm": 1.74321759448714, + "language_loss": 0.81933582, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.89777905, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.21862793, + "step": 3520, + "time_per_iteration": 2.6036746501922607 + }, + { + "auxiliary_loss_clip": 0.06544603, + "auxiliary_loss_mlp": 0.0127827, + "balance_loss_clip": 0.0630887, + "balance_loss_mlp": 0.01257576, + "epoch": 0.21169397264392004, + "flos": 26074670071680.0, + "grad_norm": 1.5989262406015683, + "language_loss": 0.76731956, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.84554833, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20690918, + "step": 3521, + "time_per_iteration": 2.613945960998535 + }, + { + "auxiliary_loss_clip": 0.06548078, + "auxiliary_loss_mlp": 0.01281462, + "balance_loss_clip": 0.06310651, + "balance_loss_mlp": 0.01261364, + "epoch": 0.211754095896588, + "flos": 23114266594560.0, + "grad_norm": 2.104686387571933, + "language_loss": 0.75886559, + "learning_rate": 3.663358329538626e-06, + "loss": 0.83716094, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.20092773, + "step": 3522, + "time_per_iteration": 2.530388355255127 + }, + { + "auxiliary_loss_clip": 0.06550008, + "auxiliary_loss_mlp": 0.01276271, + "balance_loss_clip": 0.06309568, + "balance_loss_mlp": 0.01255994, + "epoch": 0.21181421914925597, + "flos": 27928806019200.0, + "grad_norm": 2.55069435165465, + "language_loss": 0.71218652, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79044926, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.20288086, + "step": 3523, + "time_per_iteration": 2.6448264122009277 + }, + { + "auxiliary_loss_clip": 0.06544726, + "auxiliary_loss_mlp": 0.01276969, + "balance_loss_clip": 0.06308427, + "balance_loss_mlp": 0.01256191, + "epoch": 0.21187434240192393, + "flos": 17134313057280.0, + "grad_norm": 2.0846198886990566, + "language_loss": 0.77930927, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8575263, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20788574, + "step": 3524, + "time_per_iteration": 2.527096748352051 + }, + { + "auxiliary_loss_clip": 0.06557429, + "auxiliary_loss_mlp": 0.01277075, + "balance_loss_clip": 0.0631334, + "balance_loss_mlp": 0.01255045, + "epoch": 0.21193446565459192, + "flos": 22354071626880.0, + "grad_norm": 2.138137470282545, + "language_loss": 0.82111794, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89946306, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22033691, + "step": 3525, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.06547971, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 0.06308704, + "balance_loss_mlp": 0.01254519, + "epoch": 0.2119945889072599, + "flos": 27206779386240.0, + "grad_norm": 1.7514877674009408, + "language_loss": 0.75671291, + "learning_rate": 3.662492820527356e-06, + "loss": 0.83494115, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20324707, + "step": 3526, + "time_per_iteration": 2.56286883354187 + }, + { + "auxiliary_loss_clip": 0.06556675, + "auxiliary_loss_mlp": 0.01279028, + "balance_loss_clip": 0.0631361, + "balance_loss_mlp": 0.01258107, + "epoch": 0.21205471215992786, + "flos": 20997480424320.0, + "grad_norm": 1.9989732630407808, + "language_loss": 0.77276337, + "learning_rate": 3.662276285649284e-06, + "loss": 0.85112035, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.20910645, + "step": 3527, + "time_per_iteration": 2.7162973880767822 + }, + { + "auxiliary_loss_clip": 0.06551696, + "auxiliary_loss_mlp": 0.01279873, + "balance_loss_clip": 0.06314081, + "balance_loss_mlp": 0.01258224, + "epoch": 0.21211483541259582, + "flos": 20784025347840.0, + "grad_norm": 2.0427089539116783, + "language_loss": 0.78184944, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86016512, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21643066, + "step": 3528, + "time_per_iteration": 3.990530490875244 + }, + { + "auxiliary_loss_clip": 0.06551792, + "auxiliary_loss_mlp": 0.01277875, + "balance_loss_clip": 0.06313196, + "balance_loss_mlp": 0.01257025, + "epoch": 0.21217495866526379, + "flos": 18996079726080.0, + "grad_norm": 1.942993331862389, + "language_loss": 0.82054245, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89883912, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20861816, + "step": 3529, + "time_per_iteration": 2.564383029937744 + }, + { + "auxiliary_loss_clip": 0.06555474, + "auxiliary_loss_mlp": 0.01278138, + "balance_loss_clip": 0.06313926, + "balance_loss_mlp": 0.01257134, + "epoch": 0.21223508191793175, + "flos": 20673503412480.0, + "grad_norm": 2.2777790477523236, + "language_loss": 0.77694297, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.85527909, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21008301, + "step": 3530, + "time_per_iteration": 2.576662540435791 + }, + { + "auxiliary_loss_clip": 0.06550869, + "auxiliary_loss_mlp": 0.01274157, + "balance_loss_clip": 0.06314521, + "balance_loss_mlp": 0.01254106, + "epoch": 0.21229520517059972, + "flos": 21622904899200.0, + "grad_norm": 2.3150689342230644, + "language_loss": 0.83926791, + "learning_rate": 3.661409515882308e-06, + "loss": 0.91751814, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20043945, + "step": 3531, + "time_per_iteration": 4.092180252075195 + }, + { + "auxiliary_loss_clip": 0.06553733, + "auxiliary_loss_mlp": 0.01280648, + "balance_loss_clip": 0.06313696, + "balance_loss_mlp": 0.0125888, + "epoch": 0.2123553284232677, + "flos": 13996232997120.0, + "grad_norm": 2.2553338764718145, + "language_loss": 0.74256229, + "learning_rate": 3.661192665917977e-06, + "loss": 0.82090604, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21777344, + "step": 3532, + "time_per_iteration": 2.5215070247650146 + }, + { + "auxiliary_loss_clip": 0.06549011, + "auxiliary_loss_mlp": 0.01276957, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.01255714, + "epoch": 0.21241545167593567, + "flos": 18302745916800.0, + "grad_norm": 1.8963653738624293, + "language_loss": 0.74378759, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82204729, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21252441, + "step": 3533, + "time_per_iteration": 2.5286645889282227 + }, + { + "auxiliary_loss_clip": 0.06554842, + "auxiliary_loss_mlp": 0.01279741, + "balance_loss_clip": 0.06312128, + "balance_loss_mlp": 0.01257341, + "epoch": 0.21247557492860364, + "flos": 34721461906560.0, + "grad_norm": 1.8118406193913599, + "language_loss": 0.71620667, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79455251, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22399902, + "step": 3534, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.06548804, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01262586, + "epoch": 0.2125356981812716, + "flos": 22060254885120.0, + "grad_norm": 2.3502862502903046, + "language_loss": 0.72866982, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.80699402, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21032715, + "step": 3535, + "time_per_iteration": 2.5843448638916016 + }, + { + "auxiliary_loss_clip": 0.06546953, + "auxiliary_loss_mlp": 0.01279722, + "balance_loss_clip": 0.06307133, + "balance_loss_mlp": 0.01257621, + "epoch": 0.21259582143393957, + "flos": 28555865648640.0, + "grad_norm": 2.199655139190772, + "language_loss": 0.70759106, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7858578, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22106934, + "step": 3536, + "time_per_iteration": 4.056318998336792 + }, + { + "auxiliary_loss_clip": 0.06557733, + "auxiliary_loss_mlp": 0.01286072, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.0126415, + "epoch": 0.21265594468660753, + "flos": 20127140864640.0, + "grad_norm": 2.2134041941920897, + "language_loss": 0.8820163, + "learning_rate": 3.660107471371981e-06, + "loss": 0.96045434, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.21911621, + "step": 3537, + "time_per_iteration": 2.6233468055725098 + }, + { + "auxiliary_loss_clip": 0.06541121, + "auxiliary_loss_mlp": 0.01278147, + "balance_loss_clip": 0.06304413, + "balance_loss_mlp": 0.01256094, + "epoch": 0.21271606793927553, + "flos": 23082890440320.0, + "grad_norm": 1.7848498720134809, + "language_loss": 0.81086004, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88905263, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22058105, + "step": 3538, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.06545715, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 0.06305592, + "balance_loss_mlp": 0.01263981, + "epoch": 0.2127761911919435, + "flos": 26394118963200.0, + "grad_norm": 2.023826748108625, + "language_loss": 0.87817419, + "learning_rate": 3.659672952835863e-06, + "loss": 0.95646858, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.19763184, + "step": 3539, + "time_per_iteration": 2.6115527153015137 + }, + { + "auxiliary_loss_clip": 0.06554011, + "auxiliary_loss_mlp": 0.01284638, + "balance_loss_clip": 0.06309317, + "balance_loss_mlp": 0.01264277, + "epoch": 0.21283631444461146, + "flos": 20234182855680.0, + "grad_norm": 3.1687626880856667, + "language_loss": 0.59144789, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66983438, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20361328, + "step": 3540, + "time_per_iteration": 2.525139570236206 + }, + { + "auxiliary_loss_clip": 0.06543202, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.0630211, + "balance_loss_mlp": 0.01256557, + "epoch": 0.21289643769727942, + "flos": 13522140195840.0, + "grad_norm": 1.940296770056649, + "language_loss": 0.7721082, + "learning_rate": 3.659238182559888e-06, + "loss": 0.85032547, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21972656, + "step": 3541, + "time_per_iteration": 2.563164234161377 + }, + { + "auxiliary_loss_clip": 0.06542824, + "auxiliary_loss_mlp": 0.01283205, + "balance_loss_clip": 0.06305471, + "balance_loss_mlp": 0.01262486, + "epoch": 0.2129565609499474, + "flos": 24833967465600.0, + "grad_norm": 1.7979798329536472, + "language_loss": 0.69596064, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.77422094, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20703125, + "step": 3542, + "time_per_iteration": 2.6213386058807373 + }, + { + "auxiliary_loss_clip": 0.06542216, + "auxiliary_loss_mlp": 0.01284362, + "balance_loss_clip": 0.0630642, + "balance_loss_mlp": 0.01264692, + "epoch": 0.21301668420261535, + "flos": 23665953876480.0, + "grad_norm": 1.8238030340304547, + "language_loss": 0.77012485, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84839058, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19677734, + "step": 3543, + "time_per_iteration": 2.5654232501983643 + }, + { + "auxiliary_loss_clip": 0.0654586, + "auxiliary_loss_mlp": 0.01282767, + "balance_loss_clip": 0.0630815, + "balance_loss_mlp": 0.01261488, + "epoch": 0.21307680745528332, + "flos": 16368416012160.0, + "grad_norm": 2.0315626098903468, + "language_loss": 0.67305464, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75134087, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2130127, + "step": 3544, + "time_per_iteration": 2.513288736343384 + }, + { + "auxiliary_loss_clip": 0.06542834, + "auxiliary_loss_mlp": 0.01284, + "balance_loss_clip": 0.06304078, + "balance_loss_mlp": 0.01264223, + "epoch": 0.2131369307079513, + "flos": 19105092288000.0, + "grad_norm": 1.7034786511890583, + "language_loss": 0.71322483, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.19763184, + "step": 3545, + "time_per_iteration": 2.5347442626953125 + }, + { + "auxiliary_loss_clip": 0.06549121, + "auxiliary_loss_mlp": 0.01288311, + "balance_loss_clip": 0.06306408, + "balance_loss_mlp": 0.01268224, + "epoch": 0.21319705396061928, + "flos": 30380050961280.0, + "grad_norm": 2.304335172733059, + "language_loss": 0.73178399, + "learning_rate": 3.658150155940946e-06, + "loss": 0.81015837, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.20092773, + "step": 3546, + "time_per_iteration": 2.6647720336914062 + }, + { + "auxiliary_loss_clip": 0.0655164, + "auxiliary_loss_mlp": 0.01278696, + "balance_loss_clip": 0.06310475, + "balance_loss_mlp": 0.01258609, + "epoch": 0.21325717721328724, + "flos": 21761616533760.0, + "grad_norm": 1.9338253687785023, + "language_loss": 0.81206107, + "learning_rate": 3.657932361952479e-06, + "loss": 0.89036447, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20092773, + "step": 3547, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06547703, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 0.06302875, + "balance_loss_mlp": 0.01259127, + "epoch": 0.2133173004659552, + "flos": 28738447695360.0, + "grad_norm": 3.206018032759459, + "language_loss": 0.74960929, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22521973, + "step": 3548, + "time_per_iteration": 2.605151414871216 + }, + { + "auxiliary_loss_clip": 0.06554648, + "auxiliary_loss_mlp": 0.01281207, + "balance_loss_clip": 0.06309359, + "balance_loss_mlp": 0.01259236, + "epoch": 0.21337742371862317, + "flos": 16842760375680.0, + "grad_norm": 2.056331081084102, + "language_loss": 0.74889886, + "learning_rate": 3.657496585376922e-06, + "loss": 0.82725745, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21972656, + "step": 3549, + "time_per_iteration": 2.518305540084839 + }, + { + "auxiliary_loss_clip": 0.06547625, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01261278, + "epoch": 0.21343754697129114, + "flos": 24431683213440.0, + "grad_norm": 1.7052192349692608, + "language_loss": 0.8095907, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88787764, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19787598, + "step": 3550, + "time_per_iteration": 2.621840715408325 + }, + { + "auxiliary_loss_clip": 0.06544942, + "auxiliary_loss_mlp": 0.01278049, + "balance_loss_clip": 0.06309815, + "balance_loss_mlp": 0.01258653, + "epoch": 0.21349767022395913, + "flos": 19283271995520.0, + "grad_norm": 1.8011583081598594, + "language_loss": 0.88582718, + "learning_rate": 3.657060557391621e-06, + "loss": 0.96405709, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.19384766, + "step": 3551, + "time_per_iteration": 2.5354909896850586 + }, + { + "auxiliary_loss_clip": 0.06541884, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06304973, + "balance_loss_mlp": 0.01256635, + "epoch": 0.2135577934766271, + "flos": 17353260576000.0, + "grad_norm": 1.8291964059748265, + "language_loss": 0.83669794, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91488564, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20275879, + "step": 3552, + "time_per_iteration": 2.5428099632263184 + }, + { + "auxiliary_loss_clip": 0.06543534, + "auxiliary_loss_mlp": 0.01282655, + "balance_loss_clip": 0.06305505, + "balance_loss_mlp": 0.01261329, + "epoch": 0.21361791672929506, + "flos": 24063416519040.0, + "grad_norm": 1.71251087169846, + "language_loss": 0.77181637, + "learning_rate": 3.656624278062713e-06, + "loss": 0.85007823, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21325684, + "step": 3553, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.06546006, + "auxiliary_loss_mlp": 0.01280965, + "balance_loss_clip": 0.06308904, + "balance_loss_mlp": 0.01260556, + "epoch": 0.21367803998196302, + "flos": 22168596614400.0, + "grad_norm": 1.6386548216082337, + "language_loss": 0.72918522, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.80745488, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20397949, + "step": 3554, + "time_per_iteration": 2.610447883605957 + }, + { + "auxiliary_loss_clip": 0.06543835, + "auxiliary_loss_mlp": 0.01296522, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01276483, + "epoch": 0.213738163234631, + "flos": 20893205617920.0, + "grad_norm": 2.167468133085416, + "language_loss": 0.6838634, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.76226699, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20043945, + "step": 3555, + "time_per_iteration": 2.6348068714141846 + }, + { + "auxiliary_loss_clip": 0.06544648, + "auxiliary_loss_mlp": 0.01283651, + "balance_loss_clip": 0.06303324, + "balance_loss_mlp": 0.01262861, + "epoch": 0.21379828648729896, + "flos": 28410739176960.0, + "grad_norm": 1.8068010568670265, + "language_loss": 0.6581043, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73638725, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.20800781, + "step": 3556, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.06542179, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.01269905, + "epoch": 0.21385840973996692, + "flos": 25486030339200.0, + "grad_norm": 1.6965425102308196, + "language_loss": 0.73263884, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20617676, + "step": 3557, + "time_per_iteration": 2.5850143432617188 + }, + { + "auxiliary_loss_clip": 0.06555384, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06310774, + "balance_loss_mlp": 0.01260814, + "epoch": 0.2139185329926349, + "flos": 28081772847360.0, + "grad_norm": 1.6861756161591135, + "language_loss": 0.67894918, + "learning_rate": 3.655532480546528e-06, + "loss": 0.75732636, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.21508789, + "step": 3558, + "time_per_iteration": 2.6937482357025146 + }, + { + "auxiliary_loss_clip": 0.06554736, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06306359, + "balance_loss_mlp": 0.0125905, + "epoch": 0.21397865624530288, + "flos": 19614628166400.0, + "grad_norm": 2.1418574307637575, + "language_loss": 0.81358159, + "learning_rate": 3.655313932676286e-06, + "loss": 0.89191854, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.19909668, + "step": 3559, + "time_per_iteration": 2.5145814418792725 + }, + { + "auxiliary_loss_clip": 0.06551723, + "auxiliary_loss_mlp": 0.01281472, + "balance_loss_clip": 0.06314635, + "balance_loss_mlp": 0.01262899, + "epoch": 0.21403877949797084, + "flos": 24688463650560.0, + "grad_norm": 1.6715073288493136, + "language_loss": 0.68710625, + "learning_rate": 3.655095322036373e-06, + "loss": 0.7654382, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.18554688, + "step": 3560, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06554615, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 0.0631121, + "balance_loss_mlp": 0.01259313, + "epoch": 0.2140989027506388, + "flos": 19866628920960.0, + "grad_norm": 1.9885830979576231, + "language_loss": 0.73618603, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81452787, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.20263672, + "step": 3561, + "time_per_iteration": 2.5286123752593994 + }, + { + "auxiliary_loss_clip": 0.06553814, + "auxiliary_loss_mlp": 0.01282143, + "balance_loss_clip": 0.06311779, + "balance_loss_mlp": 0.01262402, + "epoch": 0.21415902600330677, + "flos": 19141331978880.0, + "grad_norm": 2.350872095274855, + "language_loss": 0.78756285, + "learning_rate": 3.654657912480698e-06, + "loss": 0.86592233, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.19763184, + "step": 3562, + "time_per_iteration": 2.608041286468506 + }, + { + "auxiliary_loss_clip": 0.06546983, + "auxiliary_loss_mlp": 0.01281911, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01261788, + "epoch": 0.21421914925597474, + "flos": 22279076622720.0, + "grad_norm": 1.5018972458321598, + "language_loss": 0.85257983, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.93086874, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20117188, + "step": 3563, + "time_per_iteration": 2.5593912601470947 + }, + { + "auxiliary_loss_clip": 0.06548097, + "auxiliary_loss_mlp": 0.01281509, + "balance_loss_clip": 0.06308593, + "balance_loss_mlp": 0.01262531, + "epoch": 0.2142792725086427, + "flos": 33883504750080.0, + "grad_norm": 1.9248219523503745, + "language_loss": 0.76925778, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.84755385, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.18981934, + "step": 3564, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.06542072, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305549, + "balance_loss_mlp": 0.01261181, + "epoch": 0.2143393957613107, + "flos": 19865538817920.0, + "grad_norm": 1.690691453330226, + "language_loss": 0.89139843, + "learning_rate": 3.654001327581981e-06, + "loss": 0.9696207, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.18969727, + "step": 3565, + "time_per_iteration": 2.660306215286255 + }, + { + "auxiliary_loss_clip": 0.06436334, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 0.06303974, + "balance_loss_mlp": 0.01279924, + "epoch": 0.21439951901397866, + "flos": 68549300017920.0, + "grad_norm": 0.8225285981700966, + "language_loss": 0.52211988, + "learning_rate": 3.653782340498215e-06, + "loss": 0.59934968, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.06738281, + "step": 3566, + "time_per_iteration": 3.0845720767974854 + }, + { + "auxiliary_loss_clip": 0.06539588, + "auxiliary_loss_mlp": 0.01284533, + "balance_loss_clip": 0.06306818, + "balance_loss_mlp": 0.0126478, + "epoch": 0.21445964226664663, + "flos": 19689161973120.0, + "grad_norm": 1.8060006281631265, + "language_loss": 0.68295264, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.76119387, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19775391, + "step": 3567, + "time_per_iteration": 2.5250415802001953 + }, + { + "auxiliary_loss_clip": 0.06543978, + "auxiliary_loss_mlp": 0.01283364, + "balance_loss_clip": 0.06310168, + "balance_loss_mlp": 0.012641, + "epoch": 0.2145197655193146, + "flos": 31116039298560.0, + "grad_norm": 2.0548954423707753, + "language_loss": 0.75150776, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.82978123, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19250488, + "step": 3568, + "time_per_iteration": 4.018412113189697 + }, + { + "auxiliary_loss_clip": 0.06538366, + "auxiliary_loss_mlp": 0.01282205, + "balance_loss_clip": 0.063043, + "balance_loss_mlp": 0.01261773, + "epoch": 0.21457988877198256, + "flos": 20127015083520.0, + "grad_norm": 2.3975687399079284, + "language_loss": 0.78487438, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.86308008, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20446777, + "step": 3569, + "time_per_iteration": 2.6051042079925537 + }, + { + "auxiliary_loss_clip": 0.06554128, + "auxiliary_loss_mlp": 0.01283223, + "balance_loss_clip": 0.06309038, + "balance_loss_mlp": 0.01262183, + "epoch": 0.21464001202465052, + "flos": 18593963182080.0, + "grad_norm": 2.5916710851503173, + "language_loss": 0.7048617, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.78323519, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21032715, + "step": 3570, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.06548594, + "auxiliary_loss_mlp": 0.01293921, + "balance_loss_clip": 0.06305287, + "balance_loss_mlp": 0.01274621, + "epoch": 0.21470013527731852, + "flos": 21841600855680.0, + "grad_norm": 3.519297534980699, + "language_loss": 0.79412138, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.87254649, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.19311523, + "step": 3571, + "time_per_iteration": 3.984830141067505 + }, + { + "auxiliary_loss_clip": 0.06547887, + "auxiliary_loss_mlp": 0.01283536, + "balance_loss_clip": 0.06306981, + "balance_loss_mlp": 0.01263413, + "epoch": 0.21476025852998648, + "flos": 17608992837120.0, + "grad_norm": 2.1137138833129114, + "language_loss": 0.83417559, + "learning_rate": 3.652467101342991e-06, + "loss": 0.91248989, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20117188, + "step": 3572, + "time_per_iteration": 2.550900459289551 + }, + { + "auxiliary_loss_clip": 0.06544446, + "auxiliary_loss_mlp": 0.01290796, + "balance_loss_clip": 0.06300403, + "balance_loss_mlp": 0.01271114, + "epoch": 0.21482038178265445, + "flos": 24835267203840.0, + "grad_norm": 5.91831897424108, + "language_loss": 0.6534397, + "learning_rate": 3.652247675452598e-06, + "loss": 0.73179209, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.19677734, + "step": 3573, + "time_per_iteration": 2.574037551879883 + }, + { + "auxiliary_loss_clip": 0.06536618, + "auxiliary_loss_mlp": 0.01287357, + "balance_loss_clip": 0.06305118, + "balance_loss_mlp": 0.0126814, + "epoch": 0.2148805050353224, + "flos": 23264927435520.0, + "grad_norm": 1.8228372560216166, + "language_loss": 0.76129293, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83953267, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.1920166, + "step": 3574, + "time_per_iteration": 2.610541343688965 + }, + { + "auxiliary_loss_clip": 0.06537417, + "auxiliary_loss_mlp": 0.01280783, + "balance_loss_clip": 0.06298707, + "balance_loss_mlp": 0.0126066, + "epoch": 0.21494062828799038, + "flos": 21326907951360.0, + "grad_norm": 2.0935140233911644, + "language_loss": 0.72909325, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.8072753, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.20117188, + "step": 3575, + "time_per_iteration": 2.581932306289673 + }, + { + "auxiliary_loss_clip": 0.06537387, + "auxiliary_loss_mlp": 0.01288909, + "balance_loss_clip": 0.06302074, + "balance_loss_mlp": 0.01269657, + "epoch": 0.21500075154065834, + "flos": 18849276172800.0, + "grad_norm": 2.2103119968131986, + "language_loss": 0.6923548, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.77061772, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.19262695, + "step": 3576, + "time_per_iteration": 5.394233703613281 + }, + { + "auxiliary_loss_clip": 0.06547244, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 0.06304461, + "balance_loss_mlp": 0.0126069, + "epoch": 0.2150608747933263, + "flos": 18447872388480.0, + "grad_norm": 1.9274083971527407, + "language_loss": 0.89371777, + "learning_rate": 3.651369345440292e-06, + "loss": 0.97201031, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21337891, + "step": 3577, + "time_per_iteration": 2.5629777908325195 + }, + { + "auxiliary_loss_clip": 0.06425267, + "auxiliary_loss_mlp": 0.01303124, + "balance_loss_clip": 0.06298774, + "balance_loss_mlp": 0.01297548, + "epoch": 0.2151209980459943, + "flos": 66617443808640.0, + "grad_norm": 0.7978427219987446, + "language_loss": 0.56304139, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.64032531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.05581665, + "step": 3578, + "time_per_iteration": 3.0982370376586914 + }, + { + "auxiliary_loss_clip": 0.06546376, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 0.06309081, + "balance_loss_mlp": 0.0126729, + "epoch": 0.21518112129866226, + "flos": 21581633963520.0, + "grad_norm": 1.7619248126111737, + "language_loss": 0.89097106, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.19555664, + "step": 3579, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.06544919, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06304899, + "balance_loss_mlp": 0.01260498, + "epoch": 0.21524124455133023, + "flos": 20053822942080.0, + "grad_norm": 1.8548300822509616, + "language_loss": 0.78671825, + "learning_rate": 3.650709940390972e-06, + "loss": 0.86497748, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20507812, + "step": 3580, + "time_per_iteration": 2.538740634918213 + }, + { + "auxiliary_loss_clip": 0.06547832, + "auxiliary_loss_mlp": 0.01284221, + "balance_loss_clip": 0.06311843, + "balance_loss_mlp": 0.01265279, + "epoch": 0.2153013678039982, + "flos": 23958680515200.0, + "grad_norm": 2.0040984242528905, + "language_loss": 0.73520374, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.81352425, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.18933105, + "step": 3581, + "time_per_iteration": 2.5783493518829346 + }, + { + "auxiliary_loss_clip": 0.06544261, + "auxiliary_loss_mlp": 0.01283002, + "balance_loss_clip": 0.06307264, + "balance_loss_mlp": 0.01262438, + "epoch": 0.21536149105666616, + "flos": 20601107884800.0, + "grad_norm": 2.9043222851567574, + "language_loss": 0.71477044, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.79304302, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20556641, + "step": 3582, + "time_per_iteration": 2.5253281593322754 + }, + { + "auxiliary_loss_clip": 0.06553562, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06315581, + "balance_loss_mlp": 0.01262209, + "epoch": 0.21542161430933413, + "flos": 12865046077440.0, + "grad_norm": 2.5916269023447795, + "language_loss": 0.85900396, + "learning_rate": 3.650049971985889e-06, + "loss": 0.93736756, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20568848, + "step": 3583, + "time_per_iteration": 2.580411434173584 + }, + { + "auxiliary_loss_clip": 0.0655268, + "auxiliary_loss_mlp": 0.01295505, + "balance_loss_clip": 0.06312086, + "balance_loss_mlp": 0.01275561, + "epoch": 0.21548173756200212, + "flos": 26111077470720.0, + "grad_norm": 2.720923149453336, + "language_loss": 0.83510441, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.91358626, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19934082, + "step": 3584, + "time_per_iteration": 2.587843179702759 + }, + { + "auxiliary_loss_clip": 0.06549002, + "auxiliary_loss_mlp": 0.01288111, + "balance_loss_clip": 0.06314336, + "balance_loss_mlp": 0.01267667, + "epoch": 0.21554186081467008, + "flos": 22170315623040.0, + "grad_norm": 2.7712372256622357, + "language_loss": 0.91010725, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9884783, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20446777, + "step": 3585, + "time_per_iteration": 2.5638017654418945 + }, + { + "auxiliary_loss_clip": 0.06548285, + "auxiliary_loss_mlp": 0.0129374, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.012745, + "epoch": 0.21560198406733805, + "flos": 22973458608000.0, + "grad_norm": 2.0799258962001548, + "language_loss": 0.75285476, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83127499, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.19238281, + "step": 3586, + "time_per_iteration": 2.5816385746002197 + }, + { + "auxiliary_loss_clip": 0.06560329, + "auxiliary_loss_mlp": 0.01301548, + "balance_loss_clip": 0.06317623, + "balance_loss_mlp": 0.012817, + "epoch": 0.215662107320006, + "flos": 22790708853120.0, + "grad_norm": 1.7819627104594034, + "language_loss": 0.83628035, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.91489911, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.19848633, + "step": 3587, + "time_per_iteration": 2.5768468379974365 + }, + { + "auxiliary_loss_clip": 0.06549525, + "auxiliary_loss_mlp": 0.01284104, + "balance_loss_clip": 0.06311873, + "balance_loss_mlp": 0.01265114, + "epoch": 0.21572223057267398, + "flos": 30891850899840.0, + "grad_norm": 2.819752743062096, + "language_loss": 0.764575, + "learning_rate": 3.648948773354224e-06, + "loss": 0.8429113, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.18981934, + "step": 3588, + "time_per_iteration": 2.6578357219696045 + }, + { + "auxiliary_loss_clip": 0.06557232, + "auxiliary_loss_mlp": 0.01294163, + "balance_loss_clip": 0.06316121, + "balance_loss_mlp": 0.01274494, + "epoch": 0.21578235382534194, + "flos": 26918413159680.0, + "grad_norm": 3.674353356251158, + "language_loss": 0.8181411, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89665502, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.19689941, + "step": 3589, + "time_per_iteration": 2.6730964183807373 + }, + { + "auxiliary_loss_clip": 0.06560542, + "auxiliary_loss_mlp": 0.01287343, + "balance_loss_clip": 0.06321919, + "balance_loss_mlp": 0.01267959, + "epoch": 0.2158424770780099, + "flos": 24432605608320.0, + "grad_norm": 2.119721317496626, + "language_loss": 0.73323047, + "learning_rate": 3.648507856144961e-06, + "loss": 0.81170928, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.19384766, + "step": 3590, + "time_per_iteration": 2.5885848999023438 + }, + { + "auxiliary_loss_clip": 0.06554762, + "auxiliary_loss_mlp": 0.0128494, + "balance_loss_clip": 0.06310897, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2159026003306779, + "flos": 23956542236160.0, + "grad_norm": 2.0666561712978813, + "language_loss": 0.84929311, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92769015, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20544434, + "step": 3591, + "time_per_iteration": 2.5598154067993164 + }, + { + "auxiliary_loss_clip": 0.0656037, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01272776, + "epoch": 0.21596272358334587, + "flos": 30048191665920.0, + "grad_norm": 1.8943006547331833, + "language_loss": 0.69118065, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.76972699, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.21496582, + "step": 3592, + "time_per_iteration": 2.623124599456787 + }, + { + "auxiliary_loss_clip": 0.06558264, + "auxiliary_loss_mlp": 0.01284651, + "balance_loss_clip": 0.06314576, + "balance_loss_mlp": 0.01264218, + "epoch": 0.21602284683601383, + "flos": 20382495782400.0, + "grad_norm": 3.2836833125469753, + "language_loss": 0.84947151, + "learning_rate": 3.647846011515108e-06, + "loss": 0.92790061, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2043457, + "step": 3593, + "time_per_iteration": 2.5159051418304443 + }, + { + "auxiliary_loss_clip": 0.06551524, + "auxiliary_loss_mlp": 0.01289729, + "balance_loss_clip": 0.06309479, + "balance_loss_mlp": 0.01267615, + "epoch": 0.2160829700886818, + "flos": 20783648004480.0, + "grad_norm": 2.6962087820066567, + "language_loss": 0.76424301, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.84265554, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.22119141, + "step": 3594, + "time_per_iteration": 2.530874490737915 + }, + { + "auxiliary_loss_clip": 0.06549954, + "auxiliary_loss_mlp": 0.01280574, + "balance_loss_clip": 0.06313863, + "balance_loss_mlp": 0.01260189, + "epoch": 0.21614309334134976, + "flos": 22316322562560.0, + "grad_norm": 1.5622924015328905, + "language_loss": 0.80828846, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.88659382, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20385742, + "step": 3595, + "time_per_iteration": 2.5720436573028564 + }, + { + "auxiliary_loss_clip": 0.0655812, + "auxiliary_loss_mlp": 0.01282788, + "balance_loss_clip": 0.06310599, + "balance_loss_mlp": 0.01261962, + "epoch": 0.21620321659401773, + "flos": 19615592488320.0, + "grad_norm": 2.071968351759389, + "language_loss": 0.79120421, + "learning_rate": 3.647183604506897e-06, + "loss": 0.86961329, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.20825195, + "step": 3596, + "time_per_iteration": 2.529978036880493 + }, + { + "auxiliary_loss_clip": 0.06547653, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 0.06309111, + "balance_loss_mlp": 0.01258615, + "epoch": 0.2162633398466857, + "flos": 18850701692160.0, + "grad_norm": 1.8098333997433065, + "language_loss": 0.83728772, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.91556245, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.2121582, + "step": 3597, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.06559294, + "auxiliary_loss_mlp": 0.01284022, + "balance_loss_clip": 0.06315802, + "balance_loss_mlp": 0.01262421, + "epoch": 0.21632346309935369, + "flos": 18774490803840.0, + "grad_norm": 2.0845397374343655, + "language_loss": 0.81213892, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.89057213, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21606445, + "step": 3598, + "time_per_iteration": 2.517596960067749 + }, + { + "auxiliary_loss_clip": 0.06554621, + "auxiliary_loss_mlp": 0.01287936, + "balance_loss_clip": 0.06312433, + "balance_loss_mlp": 0.01265072, + "epoch": 0.21638358635202165, + "flos": 26331576289920.0, + "grad_norm": 1.6266226591192001, + "language_loss": 0.82318664, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.90161228, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22851562, + "step": 3599, + "time_per_iteration": 2.567528486251831 + }, + { + "auxiliary_loss_clip": 0.06553015, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06314674, + "balance_loss_mlp": 0.01263107, + "epoch": 0.21644370960468962, + "flos": 20747156751360.0, + "grad_norm": 2.0891036476830585, + "language_loss": 0.76652539, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.84490293, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21618652, + "step": 3600, + "time_per_iteration": 2.5642178058624268 + }, + { + "auxiliary_loss_clip": 0.06555548, + "auxiliary_loss_mlp": 0.01287253, + "balance_loss_clip": 0.06316924, + "balance_loss_mlp": 0.01267512, + "epoch": 0.21650383285735758, + "flos": 23959183639680.0, + "grad_norm": 1.8375873098897355, + "language_loss": 0.80812716, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.88655519, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.19726562, + "step": 3601, + "time_per_iteration": 2.536790132522583 + }, + { + "auxiliary_loss_clip": 0.06558496, + "auxiliary_loss_mlp": 0.01286287, + "balance_loss_clip": 0.06317312, + "balance_loss_mlp": 0.01265783, + "epoch": 0.21656395611002555, + "flos": 23702864400000.0, + "grad_norm": 1.8593805820505158, + "language_loss": 0.84205902, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2052002, + "step": 3602, + "time_per_iteration": 2.5919816493988037 + }, + { + "auxiliary_loss_clip": 0.06553967, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06313825, + "balance_loss_mlp": 0.01265371, + "epoch": 0.2166240793626935, + "flos": 20672035966080.0, + "grad_norm": 1.6537912100509087, + "language_loss": 0.75107038, + "learning_rate": 3.645635802397693e-06, + "loss": 0.82946962, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.20581055, + "step": 3603, + "time_per_iteration": 2.5602827072143555 + }, + { + "auxiliary_loss_clip": 0.06545025, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06314509, + "balance_loss_mlp": 0.0125996, + "epoch": 0.2166842026153615, + "flos": 21586916770560.0, + "grad_norm": 1.9607230977514314, + "language_loss": 0.75016356, + "learning_rate": 3.645414438132855e-06, + "loss": 0.82841063, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.1973877, + "step": 3604, + "time_per_iteration": 2.7099287509918213 + }, + { + "auxiliary_loss_clip": 0.06550605, + "auxiliary_loss_mlp": 0.01283396, + "balance_loss_clip": 0.06315283, + "balance_loss_mlp": 0.01263881, + "epoch": 0.21674432586802947, + "flos": 25637068523520.0, + "grad_norm": 1.5948705207891358, + "language_loss": 0.80732697, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.88566697, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19506836, + "step": 3605, + "time_per_iteration": 2.601269483566284 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01314575, + "balance_loss_clip": 0.0633797, + "balance_loss_mlp": 0.01307596, + "epoch": 0.21680444912069743, + "flos": 56435126376960.0, + "grad_norm": 0.68181157035555, + "language_loss": 0.58316016, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.66095698, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.06994629, + "step": 3606, + "time_per_iteration": 3.2531886100769043 + }, + { + "auxiliary_loss_clip": 0.06547002, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 0.06303971, + "balance_loss_mlp": 0.01264502, + "epoch": 0.2168645723733654, + "flos": 23885823790080.0, + "grad_norm": 1.8693102201830953, + "language_loss": 0.73682618, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22155762, + "step": 3607, + "time_per_iteration": 4.0285868644714355 + }, + { + "auxiliary_loss_clip": 0.06548688, + "auxiliary_loss_mlp": 0.01281672, + "balance_loss_clip": 0.06306184, + "balance_loss_mlp": 0.01259595, + "epoch": 0.21692469562603336, + "flos": 16951814864640.0, + "grad_norm": 1.845726065350227, + "language_loss": 0.78116572, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85946935, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22070312, + "step": 3608, + "time_per_iteration": 2.4997665882110596 + }, + { + "auxiliary_loss_clip": 0.06549841, + "auxiliary_loss_mlp": 0.01279583, + "balance_loss_clip": 0.06307275, + "balance_loss_mlp": 0.01260248, + "epoch": 0.21698481887870133, + "flos": 25126065198720.0, + "grad_norm": 2.052249511327834, + "language_loss": 0.74638152, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.82467568, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.19335938, + "step": 3609, + "time_per_iteration": 2.5834193229675293 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 0.06301089, + "balance_loss_mlp": 0.01258221, + "epoch": 0.2170449421313693, + "flos": 17900461664640.0, + "grad_norm": 2.066668805909691, + "language_loss": 0.8888129, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96701467, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21374512, + "step": 3610, + "time_per_iteration": 3.978980302810669 + }, + { + "auxiliary_loss_clip": 0.06540407, + "auxiliary_loss_mlp": 0.01284961, + "balance_loss_clip": 0.06302356, + "balance_loss_mlp": 0.01264457, + "epoch": 0.2171050653840373, + "flos": 22645121184000.0, + "grad_norm": 2.4524698956279978, + "language_loss": 0.78034103, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.85859472, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20507812, + "step": 3611, + "time_per_iteration": 2.537783622741699 + }, + { + "auxiliary_loss_clip": 0.06539893, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06301216, + "balance_loss_mlp": 0.0125619, + "epoch": 0.21716518863670525, + "flos": 19506034874880.0, + "grad_norm": 1.9372172398113192, + "language_loss": 0.63866782, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71684164, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21313477, + "step": 3612, + "time_per_iteration": 2.5200283527374268 + }, + { + "auxiliary_loss_clip": 0.06543254, + "auxiliary_loss_mlp": 0.01280194, + "balance_loss_clip": 0.06303414, + "balance_loss_mlp": 0.01259761, + "epoch": 0.21722531188937322, + "flos": 19798132608000.0, + "grad_norm": 1.7866878621114652, + "language_loss": 0.76463711, + "learning_rate": 3.643419353014776e-06, + "loss": 0.84287155, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2043457, + "step": 3613, + "time_per_iteration": 2.536395311355591 + }, + { + "auxiliary_loss_clip": 0.06540725, + "auxiliary_loss_mlp": 0.01277778, + "balance_loss_clip": 0.06303174, + "balance_loss_mlp": 0.01256165, + "epoch": 0.21728543514204118, + "flos": 13339474295040.0, + "grad_norm": 1.8023674067133515, + "language_loss": 0.72213733, + "learning_rate": 3.643197365185261e-06, + "loss": 0.80032235, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21582031, + "step": 3614, + "time_per_iteration": 2.5000360012054443 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 0.06304483, + "balance_loss_mlp": 0.01256973, + "epoch": 0.21734555839470915, + "flos": 15237312946560.0, + "grad_norm": 2.7303590898197463, + "language_loss": 0.73928845, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81749594, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.20849609, + "step": 3615, + "time_per_iteration": 3.924616813659668 + }, + { + "auxiliary_loss_clip": 0.0654763, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_clip": 0.06301322, + "balance_loss_mlp": 0.0125694, + "epoch": 0.2174056816473771, + "flos": 19980043822080.0, + "grad_norm": 2.1391350951981467, + "language_loss": 0.913239, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.99150848, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22387695, + "step": 3616, + "time_per_iteration": 3.9379403591156006 + }, + { + "auxiliary_loss_clip": 0.06540038, + "auxiliary_loss_mlp": 0.01284656, + "balance_loss_clip": 0.06298746, + "balance_loss_mlp": 0.01263163, + "epoch": 0.21746580490004508, + "flos": 16692309169920.0, + "grad_norm": 2.057861674488091, + "language_loss": 0.81572813, + "learning_rate": 3.642531027869148e-06, + "loss": 0.89397502, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21496582, + "step": 3617, + "time_per_iteration": 2.5517330169677734 + }, + { + "auxiliary_loss_clip": 0.06543958, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06300673, + "balance_loss_mlp": 0.01258881, + "epoch": 0.21752592815271307, + "flos": 25778840832000.0, + "grad_norm": 1.7475820668036919, + "language_loss": 0.76030993, + "learning_rate": 3.642308790849329e-06, + "loss": 0.83855915, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2208252, + "step": 3618, + "time_per_iteration": 2.5874650478363037 + }, + { + "auxiliary_loss_clip": 0.06542084, + "auxiliary_loss_mlp": 0.01277743, + "balance_loss_clip": 0.06299525, + "balance_loss_mlp": 0.01255928, + "epoch": 0.21758605140538104, + "flos": 11259430940160.0, + "grad_norm": 1.9309868599682727, + "language_loss": 0.69592559, + "learning_rate": 3.642086491552996e-06, + "loss": 0.77412391, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21826172, + "step": 3619, + "time_per_iteration": 2.5259079933166504 + }, + { + "auxiliary_loss_clip": 0.06549741, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06307657, + "balance_loss_mlp": 0.01264906, + "epoch": 0.217646174658049, + "flos": 19248290115840.0, + "grad_norm": 1.6696593228851853, + "language_loss": 0.78744078, + "learning_rate": 3.641864129988579e-06, + "loss": 0.86581242, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22521973, + "step": 3620, + "time_per_iteration": 2.5225844383239746 + }, + { + "auxiliary_loss_clip": 0.06542689, + "auxiliary_loss_mlp": 0.01283495, + "balance_loss_clip": 0.06306273, + "balance_loss_mlp": 0.01263349, + "epoch": 0.21770629791071697, + "flos": 21951619666560.0, + "grad_norm": 1.6751510482296663, + "language_loss": 0.80184436, + "learning_rate": 3.641641706164509e-06, + "loss": 0.88010621, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20141602, + "step": 3621, + "time_per_iteration": 2.5528457164764404 + }, + { + "auxiliary_loss_clip": 0.0654473, + "auxiliary_loss_mlp": 0.01278712, + "balance_loss_clip": 0.06305254, + "balance_loss_mlp": 0.012594, + "epoch": 0.21776642116338493, + "flos": 24943776641280.0, + "grad_norm": 1.5217586163816694, + "language_loss": 0.87951142, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95774585, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.19299316, + "step": 3622, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.06559718, + "auxiliary_loss_mlp": 0.01277107, + "balance_loss_clip": 0.06313318, + "balance_loss_mlp": 0.01254445, + "epoch": 0.2178265444160529, + "flos": 17827017960960.0, + "grad_norm": 3.34018590012949, + "language_loss": 0.77879506, + "learning_rate": 3.641196671771152e-06, + "loss": 0.85716331, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22668457, + "step": 3623, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.0655373, + "auxiliary_loss_mlp": 0.01283267, + "balance_loss_clip": 0.06310436, + "balance_loss_mlp": 0.0126132, + "epoch": 0.2178866676687209, + "flos": 17718760085760.0, + "grad_norm": 2.118806527220675, + "language_loss": 0.85078007, + "learning_rate": 3.640974061218741e-06, + "loss": 0.92914999, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21936035, + "step": 3624, + "time_per_iteration": 2.4991443157196045 + }, + { + "auxiliary_loss_clip": 0.06544428, + "auxiliary_loss_mlp": 0.01281962, + "balance_loss_clip": 0.06301346, + "balance_loss_mlp": 0.01259014, + "epoch": 0.21794679092138886, + "flos": 16951437521280.0, + "grad_norm": 2.3785715622769357, + "language_loss": 0.7814458, + "learning_rate": 3.640751388440429e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22961426, + "step": 3625, + "time_per_iteration": 2.5113301277160645 + }, + { + "auxiliary_loss_clip": 0.06435797, + "auxiliary_loss_mlp": 0.01281105, + "balance_loss_clip": 0.0630773, + "balance_loss_mlp": 0.01275631, + "epoch": 0.21800691417405682, + "flos": 63737737413120.0, + "grad_norm": 0.7732492376258139, + "language_loss": 0.60674119, + "learning_rate": 3.64052865344466e-06, + "loss": 0.68391013, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.05477905, + "step": 3626, + "time_per_iteration": 3.230576992034912 + }, + { + "auxiliary_loss_clip": 0.06551459, + "auxiliary_loss_mlp": 0.01275255, + "balance_loss_clip": 0.06306285, + "balance_loss_mlp": 0.01252271, + "epoch": 0.21806703742672479, + "flos": 21622821045120.0, + "grad_norm": 2.0426080259896664, + "language_loss": 0.91217983, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.99044704, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22961426, + "step": 3627, + "time_per_iteration": 2.571704149246216 + }, + { + "auxiliary_loss_clip": 0.06549745, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06307864, + "balance_loss_mlp": 0.01257313, + "epoch": 0.21812716067939275, + "flos": 19361034184320.0, + "grad_norm": 1.8240036323551578, + "language_loss": 0.74830574, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82659948, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.2232666, + "step": 3628, + "time_per_iteration": 2.5547990798950195 + }, + { + "auxiliary_loss_clip": 0.06543273, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06304347, + "balance_loss_mlp": 0.01257039, + "epoch": 0.21818728393206072, + "flos": 23554467619200.0, + "grad_norm": 1.7805187473711719, + "language_loss": 0.77940357, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.85763204, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2253418, + "step": 3629, + "time_per_iteration": 2.5777294635772705 + }, + { + "auxiliary_loss_clip": 0.06540327, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.06302765, + "balance_loss_mlp": 0.01257822, + "epoch": 0.21824740718472868, + "flos": 30233289335040.0, + "grad_norm": 1.6105707802077895, + "language_loss": 0.72294879, + "learning_rate": 3.63963709145597e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20703125, + "step": 3630, + "time_per_iteration": 2.6015560626983643 + }, + { + "auxiliary_loss_clip": 0.06535304, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06303381, + "balance_loss_mlp": 0.01259364, + "epoch": 0.21830753043739667, + "flos": 26140860397440.0, + "grad_norm": 1.9295675894773927, + "language_loss": 0.77031553, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8484655, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.203125, + "step": 3631, + "time_per_iteration": 2.5712599754333496 + }, + { + "auxiliary_loss_clip": 0.06546577, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 0.06304416, + "balance_loss_mlp": 0.01265274, + "epoch": 0.21836765369006464, + "flos": 21726299237760.0, + "grad_norm": 24.58992261392957, + "language_loss": 0.76358086, + "learning_rate": 3.639190937376594e-06, + "loss": 0.84191024, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21081543, + "step": 3632, + "time_per_iteration": 2.5312108993530273 + }, + { + "auxiliary_loss_clip": 0.06541382, + "auxiliary_loss_mlp": 0.01277975, + "balance_loss_clip": 0.06306228, + "balance_loss_mlp": 0.01258008, + "epoch": 0.2184277769427326, + "flos": 19943678350080.0, + "grad_norm": 2.014902514553352, + "language_loss": 0.8455261, + "learning_rate": 3.638967767095249e-06, + "loss": 0.9237197, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19958496, + "step": 3633, + "time_per_iteration": 2.5392541885375977 + }, + { + "auxiliary_loss_clip": 0.06536385, + "auxiliary_loss_mlp": 0.01279679, + "balance_loss_clip": 0.06300621, + "balance_loss_mlp": 0.0125821, + "epoch": 0.21848790019540057, + "flos": 20346591507840.0, + "grad_norm": 2.269088705731375, + "language_loss": 0.82069844, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.89885902, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.21484375, + "step": 3634, + "time_per_iteration": 2.5536303520202637 + }, + { + "auxiliary_loss_clip": 0.06544928, + "auxiliary_loss_mlp": 0.01275115, + "balance_loss_clip": 0.063034, + "balance_loss_mlp": 0.01254063, + "epoch": 0.21854802344806853, + "flos": 15456302392320.0, + "grad_norm": 2.1744892406337133, + "language_loss": 0.75276726, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83096772, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21044922, + "step": 3635, + "time_per_iteration": 2.5158851146698 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 0.06301719, + "balance_loss_mlp": 0.01259018, + "epoch": 0.2186081467007365, + "flos": 16325384140800.0, + "grad_norm": 1.9753193728837781, + "language_loss": 0.88470638, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.96285218, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19836426, + "step": 3636, + "time_per_iteration": 2.5056772232055664 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01285934, + "balance_loss_clip": 0.06300446, + "balance_loss_mlp": 0.01264798, + "epoch": 0.2186682699534045, + "flos": 21695677770240.0, + "grad_norm": 1.933426681732421, + "language_loss": 0.76219505, + "learning_rate": 3.638074464556311e-06, + "loss": 0.84042412, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21118164, + "step": 3637, + "time_per_iteration": 2.5159406661987305 + }, + { + "auxiliary_loss_clip": 0.06547473, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.06303671, + "balance_loss_mlp": 0.0125726, + "epoch": 0.21872839320607246, + "flos": 17743427913600.0, + "grad_norm": 3.0066644559057867, + "language_loss": 0.90341294, + "learning_rate": 3.63785098361053e-06, + "loss": 0.98168921, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22888184, + "step": 3638, + "time_per_iteration": 2.475271224975586 + }, + { + "auxiliary_loss_clip": 0.06535378, + "auxiliary_loss_mlp": 0.01286586, + "balance_loss_clip": 0.06297417, + "balance_loss_mlp": 0.01264318, + "epoch": 0.21878851645874042, + "flos": 18656757417600.0, + "grad_norm": 3.417327747399998, + "language_loss": 0.90034223, + "learning_rate": 3.637627440557275e-06, + "loss": 0.97856188, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22265625, + "step": 3639, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.06531254, + "auxiliary_loss_mlp": 0.01281993, + "balance_loss_clip": 0.06296734, + "balance_loss_mlp": 0.01262264, + "epoch": 0.2188486397114084, + "flos": 25564463360640.0, + "grad_norm": 1.6695470201966474, + "language_loss": 0.7997371, + "learning_rate": 3.637403835405024e-06, + "loss": 0.87786961, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.19726562, + "step": 3640, + "time_per_iteration": 2.5905494689941406 + }, + { + "auxiliary_loss_clip": 0.06541579, + "auxiliary_loss_mlp": 0.01284166, + "balance_loss_clip": 0.06302525, + "balance_loss_mlp": 0.01260074, + "epoch": 0.21890876296407635, + "flos": 17897400990720.0, + "grad_norm": 8.732271245188107, + "language_loss": 0.72940969, + "learning_rate": 3.637180168162255e-06, + "loss": 0.80766714, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.24084473, + "step": 3641, + "time_per_iteration": 2.5452075004577637 + }, + { + "auxiliary_loss_clip": 0.06541288, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.06304857, + "balance_loss_mlp": 0.01259619, + "epoch": 0.21896888621674432, + "flos": 17754915922560.0, + "grad_norm": 1.8801395061290727, + "language_loss": 0.81693721, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89515489, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20874023, + "step": 3642, + "time_per_iteration": 2.5234179496765137 + }, + { + "auxiliary_loss_clip": 0.06550857, + "auxiliary_loss_mlp": 0.01284985, + "balance_loss_clip": 0.06311135, + "balance_loss_mlp": 0.01262204, + "epoch": 0.21902900946941228, + "flos": 23082890440320.0, + "grad_norm": 1.5963488152753738, + "language_loss": 0.71952182, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.79788017, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2277832, + "step": 3643, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.06535246, + "auxiliary_loss_mlp": 0.01285725, + "balance_loss_clip": 0.06298445, + "balance_loss_mlp": 0.01264506, + "epoch": 0.21908913272208028, + "flos": 48189501492480.0, + "grad_norm": 1.9271022520918928, + "language_loss": 0.69055694, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.76876664, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.21228027, + "step": 3644, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.06548485, + "auxiliary_loss_mlp": 0.01283418, + "balance_loss_clip": 0.06302129, + "balance_loss_mlp": 0.01261531, + "epoch": 0.21914925597474824, + "flos": 22243298129280.0, + "grad_norm": 2.4423330778710937, + "language_loss": 0.78728521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.86560422, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21911621, + "step": 3645, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06531754, + "auxiliary_loss_mlp": 0.01275201, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01254936, + "epoch": 0.2192093792274162, + "flos": 22131853799040.0, + "grad_norm": 1.5020846701532837, + "language_loss": 0.82847381, + "learning_rate": 3.636060900887582e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20263672, + "step": 3646, + "time_per_iteration": 2.569216012954712 + }, + { + "auxiliary_loss_clip": 0.06536786, + "auxiliary_loss_mlp": 0.01283667, + "balance_loss_clip": 0.06302559, + "balance_loss_mlp": 0.01263449, + "epoch": 0.21926950248008417, + "flos": 15674914494720.0, + "grad_norm": 1.6949719683005162, + "language_loss": 0.83080441, + "learning_rate": 3.635836861279901e-06, + "loss": 0.90900892, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20227051, + "step": 3647, + "time_per_iteration": 3.9349160194396973 + }, + { + "auxiliary_loss_clip": 0.06534994, + "auxiliary_loss_mlp": 0.01281644, + "balance_loss_clip": 0.06301765, + "balance_loss_mlp": 0.01261105, + "epoch": 0.21932962573275214, + "flos": 30270199858560.0, + "grad_norm": 1.587891801710132, + "language_loss": 0.7257458, + "learning_rate": 3.635612759641123e-06, + "loss": 0.80391216, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20532227, + "step": 3648, + "time_per_iteration": 2.6465656757354736 + }, + { + "auxiliary_loss_clip": 0.06545104, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 0.06304809, + "balance_loss_mlp": 0.01263434, + "epoch": 0.2193897489854201, + "flos": 10784751160320.0, + "grad_norm": 3.088861131276654, + "language_loss": 0.74724281, + "learning_rate": 3.635388595979745e-06, + "loss": 0.8255477, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21960449, + "step": 3649, + "time_per_iteration": 2.510040283203125 + }, + { + "auxiliary_loss_clip": 0.06531087, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 0.06299826, + "balance_loss_mlp": 0.01274752, + "epoch": 0.21944987223808807, + "flos": 19138984064640.0, + "grad_norm": 4.303407628828735, + "language_loss": 0.86915123, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94741207, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20251465, + "step": 3650, + "time_per_iteration": 3.93752384185791 + }, + { + "auxiliary_loss_clip": 0.06543732, + "auxiliary_loss_mlp": 0.01294843, + "balance_loss_clip": 0.06307691, + "balance_loss_mlp": 0.01273422, + "epoch": 0.21950999549075606, + "flos": 22717726346880.0, + "grad_norm": 2.457938069648898, + "language_loss": 0.8456791, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.92406487, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2142334, + "step": 3651, + "time_per_iteration": 2.7058322429656982 + }, + { + "auxiliary_loss_clip": 0.06539044, + "auxiliary_loss_mlp": 0.01290725, + "balance_loss_clip": 0.06304742, + "balance_loss_mlp": 0.01270257, + "epoch": 0.21957011874342403, + "flos": 10565929422720.0, + "grad_norm": 1.8310150193660448, + "language_loss": 0.74885792, + "learning_rate": 3.634715732945027e-06, + "loss": 0.82715559, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20458984, + "step": 3652, + "time_per_iteration": 2.512620210647583 + }, + { + "auxiliary_loss_clip": 0.06458014, + "auxiliary_loss_mlp": 0.01487979, + "balance_loss_clip": 0.06335165, + "balance_loss_mlp": 0.01477775, + "epoch": 0.219630241996092, + "flos": 65765105677440.0, + "grad_norm": 0.8085744951241601, + "language_loss": 0.51588702, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.59534693, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.10205078, + "step": 3653, + "time_per_iteration": 3.156705617904663 + }, + { + "auxiliary_loss_clip": 0.06532414, + "auxiliary_loss_mlp": 0.01292976, + "balance_loss_clip": 0.06300488, + "balance_loss_mlp": 0.01271685, + "epoch": 0.21969036524875996, + "flos": 23703367524480.0, + "grad_norm": 2.2498105533123467, + "language_loss": 0.7598449, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.83809876, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21289062, + "step": 3654, + "time_per_iteration": 2.5549349784851074 + }, + { + "auxiliary_loss_clip": 0.06539033, + "auxiliary_loss_mlp": 0.01287688, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265277, + "epoch": 0.21975048850142792, + "flos": 19646130101760.0, + "grad_norm": 1.856190016757107, + "language_loss": 0.72937429, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80764157, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.22412109, + "step": 3655, + "time_per_iteration": 5.397899866104126 + }, + { + "auxiliary_loss_clip": 0.06537225, + "auxiliary_loss_mlp": 0.01285968, + "balance_loss_clip": 0.06301227, + "balance_loss_mlp": 0.01265667, + "epoch": 0.21981061175409589, + "flos": 22453944094080.0, + "grad_norm": 1.6446350088012902, + "language_loss": 0.81351042, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.89174235, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20300293, + "step": 3656, + "time_per_iteration": 2.53308367729187 + }, + { + "auxiliary_loss_clip": 0.06536204, + "auxiliary_loss_mlp": 0.01286139, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01265027, + "epoch": 0.21987073500676388, + "flos": 18157032466560.0, + "grad_norm": 2.081609460517537, + "language_loss": 0.86280632, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94102979, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21105957, + "step": 3657, + "time_per_iteration": 2.5165464878082275 + }, + { + "auxiliary_loss_clip": 0.06534712, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 0.0629717, + "balance_loss_mlp": 0.01263439, + "epoch": 0.21993085825943184, + "flos": 25632666184320.0, + "grad_norm": 1.606816904846988, + "language_loss": 0.80728716, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.88547069, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20202637, + "step": 3658, + "time_per_iteration": 2.5528533458709717 + }, + { + "auxiliary_loss_clip": 0.06407537, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.0628604, + "balance_loss_mlp": 0.01250839, + "epoch": 0.2199909815120998, + "flos": 70946429621760.0, + "grad_norm": 0.7593962827668586, + "language_loss": 0.58126092, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06103516, + "step": 3659, + "time_per_iteration": 3.237276077270508 + }, + { + "auxiliary_loss_clip": 0.06524363, + "auxiliary_loss_mlp": 0.01284023, + "balance_loss_clip": 0.06293888, + "balance_loss_mlp": 0.01264091, + "epoch": 0.22005110476476777, + "flos": 21549964320000.0, + "grad_norm": 2.05919214646248, + "language_loss": 0.75117528, + "learning_rate": 3.632918704645772e-06, + "loss": 0.82925916, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19946289, + "step": 3660, + "time_per_iteration": 2.5259556770324707 + }, + { + "auxiliary_loss_clip": 0.06528022, + "auxiliary_loss_mlp": 0.01287991, + "balance_loss_clip": 0.06292684, + "balance_loss_mlp": 0.01267976, + "epoch": 0.22011122801743574, + "flos": 22061051498880.0, + "grad_norm": 2.4805712407940645, + "language_loss": 0.81579179, + "learning_rate": 3.632693797376893e-06, + "loss": 0.89395189, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.20019531, + "step": 3661, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.06527096, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.06295218, + "balance_loss_mlp": 0.01264039, + "epoch": 0.2201713512701037, + "flos": 26694811739520.0, + "grad_norm": 2.4209612671003993, + "language_loss": 0.73935246, + "learning_rate": 3.632468828196102e-06, + "loss": 0.81745958, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.19567871, + "step": 3662, + "time_per_iteration": 2.594336986541748 + }, + { + "auxiliary_loss_clip": 0.06524752, + "auxiliary_loss_mlp": 0.01286026, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01266976, + "epoch": 0.22023147452277167, + "flos": 22168470833280.0, + "grad_norm": 1.5979135918213576, + "language_loss": 0.79490995, + "learning_rate": 3.632243797111929e-06, + "loss": 0.87301779, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19042969, + "step": 3663, + "time_per_iteration": 2.6437172889709473 + }, + { + "auxiliary_loss_clip": 0.06536885, + "auxiliary_loss_mlp": 0.01285417, + "balance_loss_clip": 0.06298422, + "balance_loss_mlp": 0.01264627, + "epoch": 0.22029159777543966, + "flos": 22528981025280.0, + "grad_norm": 1.9228872111745317, + "language_loss": 0.81154871, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8897717, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20800781, + "step": 3664, + "time_per_iteration": 2.551218271255493 + }, + { + "auxiliary_loss_clip": 0.06543128, + "auxiliary_loss_mlp": 0.01279618, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257457, + "epoch": 0.22035172102810763, + "flos": 13047502343040.0, + "grad_norm": 2.388837963421245, + "language_loss": 0.77563322, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.22167969, + "step": 3665, + "time_per_iteration": 2.5317838191986084 + }, + { + "auxiliary_loss_clip": 0.06533245, + "auxiliary_loss_mlp": 0.0128412, + "balance_loss_clip": 0.06298798, + "balance_loss_mlp": 0.01263616, + "epoch": 0.2204118442807756, + "flos": 12170538311040.0, + "grad_norm": 5.328131395204355, + "language_loss": 0.98459631, + "learning_rate": 3.631568332524466e-06, + "loss": 1.06277001, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.20507812, + "step": 3666, + "time_per_iteration": 2.500293254852295 + }, + { + "auxiliary_loss_clip": 0.06531642, + "auxiliary_loss_mlp": 0.01281342, + "balance_loss_clip": 0.06297208, + "balance_loss_mlp": 0.01260767, + "epoch": 0.22047196753344356, + "flos": 40117345758720.0, + "grad_norm": 2.0087807452217143, + "language_loss": 0.81544572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20568848, + "step": 3667, + "time_per_iteration": 2.7539899349212646 + }, + { + "auxiliary_loss_clip": 0.06542197, + "auxiliary_loss_mlp": 0.0128155, + "balance_loss_clip": 0.06300189, + "balance_loss_mlp": 0.01258363, + "epoch": 0.22053209078611152, + "flos": 20706892064640.0, + "grad_norm": 2.631241235852179, + "language_loss": 0.77648765, + "learning_rate": 3.631117713439087e-06, + "loss": 0.85472512, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.23168945, + "step": 3668, + "time_per_iteration": 2.524740695953369 + }, + { + "auxiliary_loss_clip": 0.06534266, + "auxiliary_loss_mlp": 0.01279226, + "balance_loss_clip": 0.06300663, + "balance_loss_mlp": 0.01258758, + "epoch": 0.2205922140387795, + "flos": 24723026259840.0, + "grad_norm": 2.1996350177899386, + "language_loss": 0.72024125, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7983762, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.20471191, + "step": 3669, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.06540591, + "auxiliary_loss_mlp": 0.01281842, + "balance_loss_clip": 0.06304247, + "balance_loss_mlp": 0.01261398, + "epoch": 0.22065233729144745, + "flos": 23484000735360.0, + "grad_norm": 1.708018932230371, + "language_loss": 0.85830641, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.93653071, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20422363, + "step": 3670, + "time_per_iteration": 2.6102726459503174 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01279884, + "balance_loss_clip": 0.06300244, + "balance_loss_mlp": 0.01259678, + "epoch": 0.22071246054411545, + "flos": 35234268094080.0, + "grad_norm": 1.8596418583208814, + "language_loss": 0.77398729, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85218084, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20202637, + "step": 3671, + "time_per_iteration": 2.6463472843170166 + }, + { + "auxiliary_loss_clip": 0.06536315, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06302021, + "balance_loss_mlp": 0.01256934, + "epoch": 0.2207725837967834, + "flos": 18156151998720.0, + "grad_norm": 3.3605951725525807, + "language_loss": 0.81071377, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.88883519, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18896484, + "step": 3672, + "time_per_iteration": 2.522409200668335 + }, + { + "auxiliary_loss_clip": 0.06541845, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.01262086, + "epoch": 0.22083270704945138, + "flos": 20484967726080.0, + "grad_norm": 2.0276751679318905, + "language_loss": 0.74039209, + "learning_rate": 3.629990083462682e-06, + "loss": 0.8186394, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20800781, + "step": 3673, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.06537451, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258154, + "epoch": 0.22089283030211934, + "flos": 34133451079680.0, + "grad_norm": 2.1113123853963223, + "language_loss": 0.77576697, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.85393184, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2088623, + "step": 3674, + "time_per_iteration": 2.6212525367736816 + }, + { + "auxiliary_loss_clip": 0.06539989, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06303889, + "balance_loss_mlp": 0.01255349, + "epoch": 0.2209529535547873, + "flos": 18083043711360.0, + "grad_norm": 2.9913121905850213, + "language_loss": 0.7632584, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.84143209, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22033691, + "step": 3675, + "time_per_iteration": 2.529346466064453 + }, + { + "auxiliary_loss_clip": 0.06540923, + "auxiliary_loss_mlp": 0.01279311, + "balance_loss_clip": 0.06303286, + "balance_loss_mlp": 0.01258592, + "epoch": 0.22101307680745527, + "flos": 27242725587840.0, + "grad_norm": 1.8493496269427605, + "language_loss": 0.8074736, + "learning_rate": 3.629312763695772e-06, + "loss": 0.88567591, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20727539, + "step": 3676, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.06539683, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06299066, + "balance_loss_mlp": 0.01260637, + "epoch": 0.22107320006012326, + "flos": 16548566290560.0, + "grad_norm": 2.695197102889201, + "language_loss": 0.76204234, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.84025168, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2064209, + "step": 3677, + "time_per_iteration": 2.5165653228759766 + }, + { + "auxiliary_loss_clip": 0.0653778, + "auxiliary_loss_mlp": 0.01277642, + "balance_loss_clip": 0.06301221, + "balance_loss_mlp": 0.01257889, + "epoch": 0.22113332331279123, + "flos": 22061009571840.0, + "grad_norm": 1.9269573452829223, + "language_loss": 0.84673274, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92488694, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.19750977, + "step": 3678, + "time_per_iteration": 2.5460638999938965 + }, + { + "auxiliary_loss_clip": 0.06537814, + "auxiliary_loss_mlp": 0.01282989, + "balance_loss_clip": 0.06304095, + "balance_loss_mlp": 0.01262354, + "epoch": 0.2211934465654592, + "flos": 26619690954240.0, + "grad_norm": 2.1729831488916327, + "language_loss": 0.89362311, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.9718312, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20629883, + "step": 3679, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.06542142, + "auxiliary_loss_mlp": 0.01291632, + "balance_loss_clip": 0.06301068, + "balance_loss_mlp": 0.01269817, + "epoch": 0.22125356981812716, + "flos": 16365564973440.0, + "grad_norm": 3.197923457760992, + "language_loss": 0.87311327, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.95145106, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21801758, + "step": 3680, + "time_per_iteration": 2.507798433303833 + }, + { + "auxiliary_loss_clip": 0.06534758, + "auxiliary_loss_mlp": 0.01279239, + "balance_loss_clip": 0.06302372, + "balance_loss_mlp": 0.01258211, + "epoch": 0.22131369307079513, + "flos": 21657257873280.0, + "grad_norm": 1.8058433539562604, + "language_loss": 0.81643963, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89457959, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.21032715, + "step": 3681, + "time_per_iteration": 2.536559820175171 + }, + { + "auxiliary_loss_clip": 0.06530598, + "auxiliary_loss_mlp": 0.01283453, + "balance_loss_clip": 0.06302136, + "balance_loss_mlp": 0.01264344, + "epoch": 0.2213738163234631, + "flos": 19615592488320.0, + "grad_norm": 3.0843961282743138, + "language_loss": 0.80613208, + "learning_rate": 3.62795645623335e-06, + "loss": 0.88427258, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.19116211, + "step": 3682, + "time_per_iteration": 2.5523715019226074 + }, + { + "auxiliary_loss_clip": 0.06540116, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06302039, + "balance_loss_mlp": 0.01261933, + "epoch": 0.22143393957613106, + "flos": 23630217310080.0, + "grad_norm": 1.560467578099588, + "language_loss": 0.78323001, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86147785, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22729492, + "step": 3683, + "time_per_iteration": 2.563915491104126 + }, + { + "auxiliary_loss_clip": 0.06546305, + "auxiliary_loss_mlp": 0.01292128, + "balance_loss_clip": 0.06304266, + "balance_loss_mlp": 0.01270801, + "epoch": 0.22149406282879905, + "flos": 26185108152960.0, + "grad_norm": 2.3659446396904276, + "language_loss": 0.73827177, + "learning_rate": 3.627503859796234e-06, + "loss": 0.81665611, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21337891, + "step": 3684, + "time_per_iteration": 2.5829403400421143 + }, + { + "auxiliary_loss_clip": 0.06539842, + "auxiliary_loss_mlp": 0.01288295, + "balance_loss_clip": 0.06303138, + "balance_loss_mlp": 0.01266396, + "epoch": 0.221554186081467, + "flos": 14544104918400.0, + "grad_norm": 1.9346272357304948, + "language_loss": 0.81055164, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.88883299, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21899414, + "step": 3685, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.06531791, + "auxiliary_loss_mlp": 0.0128599, + "balance_loss_clip": 0.06302623, + "balance_loss_mlp": 0.01266607, + "epoch": 0.22161430933413498, + "flos": 22245059064960.0, + "grad_norm": 1.5947500054188823, + "language_loss": 0.87523818, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.95341599, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 3686, + "time_per_iteration": 4.0018064975738525 + }, + { + "auxiliary_loss_clip": 0.06530964, + "auxiliary_loss_mlp": 0.01294037, + "balance_loss_clip": 0.06297237, + "balance_loss_mlp": 0.01272198, + "epoch": 0.22167443258680294, + "flos": 23483162194560.0, + "grad_norm": 2.0272053301197186, + "language_loss": 0.78420949, + "learning_rate": 3.626824502298707e-06, + "loss": 0.86245942, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21826172, + "step": 3687, + "time_per_iteration": 2.543321132659912 + }, + { + "auxiliary_loss_clip": 0.06551681, + "auxiliary_loss_mlp": 0.01283958, + "balance_loss_clip": 0.0630649, + "balance_loss_mlp": 0.01261177, + "epoch": 0.2217345558394709, + "flos": 23227723422720.0, + "grad_norm": 1.7957197826329643, + "language_loss": 0.85492283, + "learning_rate": 3.626597926409383e-06, + "loss": 0.93327922, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.2277832, + "step": 3688, + "time_per_iteration": 2.5456702709198 + }, + { + "auxiliary_loss_clip": 0.06557921, + "auxiliary_loss_mlp": 0.01283081, + "balance_loss_clip": 0.0631456, + "balance_loss_mlp": 0.01260812, + "epoch": 0.22179467909213887, + "flos": 20017247834880.0, + "grad_norm": 1.8193279444648072, + "language_loss": 0.81821239, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89662236, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.22265625, + "step": 3689, + "time_per_iteration": 4.073091506958008 + }, + { + "auxiliary_loss_clip": 0.06540284, + "auxiliary_loss_mlp": 0.0128456, + "balance_loss_clip": 0.06304172, + "balance_loss_mlp": 0.01263269, + "epoch": 0.22185480234480687, + "flos": 19689203900160.0, + "grad_norm": 2.302195520769192, + "language_loss": 0.70934272, + "learning_rate": 3.626144589597061e-06, + "loss": 0.7875911, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2130127, + "step": 3690, + "time_per_iteration": 2.5177161693573 + }, + { + "auxiliary_loss_clip": 0.06548303, + "auxiliary_loss_mlp": 0.01286756, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01264416, + "epoch": 0.22191492559747483, + "flos": 21987817430400.0, + "grad_norm": 2.3084892961245576, + "language_loss": 0.7285862, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.80693686, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.22338867, + "step": 3691, + "time_per_iteration": 2.545271873474121 + }, + { + "auxiliary_loss_clip": 0.0654895, + "auxiliary_loss_mlp": 0.01283693, + "balance_loss_clip": 0.06313456, + "balance_loss_mlp": 0.01261771, + "epoch": 0.2219750488501428, + "flos": 23228813525760.0, + "grad_norm": 2.0680633952732195, + "language_loss": 0.71962094, + "learning_rate": 3.625691006130477e-06, + "loss": 0.79794735, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21936035, + "step": 3692, + "time_per_iteration": 2.543306350708008 + }, + { + "auxiliary_loss_clip": 0.06558576, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_clip": 0.06317012, + "balance_loss_mlp": 0.01258394, + "epoch": 0.22203517210281076, + "flos": 22459939660800.0, + "grad_norm": 1.9780142392305156, + "language_loss": 0.87528688, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95367974, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.22338867, + "step": 3693, + "time_per_iteration": 2.571045398712158 + }, + { + "auxiliary_loss_clip": 0.06534213, + "auxiliary_loss_mlp": 0.01282043, + "balance_loss_clip": 0.06303744, + "balance_loss_mlp": 0.01261122, + "epoch": 0.22209529535547873, + "flos": 17569985961600.0, + "grad_norm": 2.4004359049860824, + "language_loss": 0.86418116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.94234371, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20922852, + "step": 3694, + "time_per_iteration": 4.03299617767334 + }, + { + "auxiliary_loss_clip": 0.06554222, + "auxiliary_loss_mlp": 0.0127962, + "balance_loss_clip": 0.06307386, + "balance_loss_mlp": 0.0125815, + "epoch": 0.2221554186081467, + "flos": 21475178951040.0, + "grad_norm": 1.7692850214061204, + "language_loss": 0.69924927, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.77758765, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21472168, + "step": 3695, + "time_per_iteration": 3.989173412322998 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01283487, + "balance_loss_clip": 0.0630603, + "balance_loss_mlp": 0.01262781, + "epoch": 0.22221554186081466, + "flos": 27680956041600.0, + "grad_norm": 1.7088419756312998, + "language_loss": 0.72215462, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.80035925, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20715332, + "step": 3696, + "time_per_iteration": 2.6339590549468994 + }, + { + "auxiliary_loss_clip": 0.06543445, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06307454, + "balance_loss_mlp": 0.01257825, + "epoch": 0.22227566511348265, + "flos": 25966202561280.0, + "grad_norm": 1.8417969407055101, + "language_loss": 0.88068652, + "learning_rate": 3.624555968803217e-06, + "loss": 0.95891678, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21740723, + "step": 3697, + "time_per_iteration": 2.5599191188812256 + }, + { + "auxiliary_loss_clip": 0.06533489, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01265042, + "epoch": 0.22233578836615062, + "flos": 39213240203520.0, + "grad_norm": 2.5935528152985867, + "language_loss": 0.6687606, + "learning_rate": 3.624328776493346e-06, + "loss": 0.74694455, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.1986084, + "step": 3698, + "time_per_iteration": 2.812140703201294 + }, + { + "auxiliary_loss_clip": 0.06546268, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 0.06307642, + "balance_loss_mlp": 0.01260216, + "epoch": 0.22239591161881858, + "flos": 36292682142720.0, + "grad_norm": 1.853195446284453, + "language_loss": 0.82990527, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.90819019, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22009277, + "step": 3699, + "time_per_iteration": 2.667423725128174 + }, + { + "auxiliary_loss_clip": 0.06537004, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 0.06302205, + "balance_loss_mlp": 0.01260014, + "epoch": 0.22245603487148655, + "flos": 19725779007360.0, + "grad_norm": 1.45021308141165, + "language_loss": 0.80335897, + "learning_rate": 3.62387420709809e-06, + "loss": 0.88154227, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21313477, + "step": 3700, + "time_per_iteration": 2.5526716709136963 + }, + { + "auxiliary_loss_clip": 0.06548695, + "auxiliary_loss_mlp": 0.01279557, + "balance_loss_clip": 0.06306358, + "balance_loss_mlp": 0.01257885, + "epoch": 0.2225161581241545, + "flos": 46290950081280.0, + "grad_norm": 3.047641549556173, + "language_loss": 0.73186177, + "learning_rate": 3.623646830029943e-06, + "loss": 0.81014431, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21655273, + "step": 3701, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.06535295, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06300849, + "balance_loss_mlp": 0.01259734, + "epoch": 0.22257628137682248, + "flos": 23702990181120.0, + "grad_norm": 4.404280219854046, + "language_loss": 0.80455184, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.88270885, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20666504, + "step": 3702, + "time_per_iteration": 2.5657999515533447 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01274253, + "balance_loss_clip": 0.06297488, + "balance_loss_mlp": 0.01253331, + "epoch": 0.22263640462949044, + "flos": 19359986008320.0, + "grad_norm": 3.4101413472023405, + "language_loss": 0.78629804, + "learning_rate": 3.623191891195723e-06, + "loss": 0.86428618, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20910645, + "step": 3703, + "time_per_iteration": 2.550189971923828 + }, + { + "auxiliary_loss_clip": 0.06541737, + "auxiliary_loss_mlp": 0.01279602, + "balance_loss_clip": 0.06300878, + "balance_loss_mlp": 0.01257084, + "epoch": 0.22269652788215843, + "flos": 20782138631040.0, + "grad_norm": 2.0986231414271828, + "language_loss": 0.75210625, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.83031964, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.22509766, + "step": 3704, + "time_per_iteration": 2.5540754795074463 + }, + { + "auxiliary_loss_clip": 0.06527826, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06299336, + "balance_loss_mlp": 0.01268682, + "epoch": 0.2227566511348264, + "flos": 47969631578880.0, + "grad_norm": 1.891044771341396, + "language_loss": 0.65108556, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.72925317, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20239258, + "step": 3705, + "time_per_iteration": 2.8109097480773926 + }, + { + "auxiliary_loss_clip": 0.06438605, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.0631493, + "balance_loss_mlp": 0.012611, + "epoch": 0.22281677438749437, + "flos": 66235676607360.0, + "grad_norm": 1.322453387614222, + "language_loss": 0.65218806, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.72923827, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.05322266, + "step": 3706, + "time_per_iteration": 3.059636354446411 + }, + { + "auxiliary_loss_clip": 0.06534128, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 0.06297205, + "balance_loss_mlp": 0.01266274, + "epoch": 0.22287689764016233, + "flos": 21878050181760.0, + "grad_norm": 2.374246987916323, + "language_loss": 0.80905002, + "learning_rate": 3.622281274977141e-06, + "loss": 0.88725626, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20202637, + "step": 3707, + "time_per_iteration": 2.5891129970550537 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01280313, + "balance_loss_clip": 0.06298505, + "balance_loss_mlp": 0.01257854, + "epoch": 0.2229370208928303, + "flos": 27679824011520.0, + "grad_norm": 1.802742500055583, + "language_loss": 0.79219007, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.87031698, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2244873, + "step": 3708, + "time_per_iteration": 2.5907180309295654 + }, + { + "auxiliary_loss_clip": 0.06539932, + "auxiliary_loss_mlp": 0.01293698, + "balance_loss_clip": 0.06300655, + "balance_loss_mlp": 0.01271525, + "epoch": 0.22299714414549826, + "flos": 30162612816000.0, + "grad_norm": 1.9019649120082793, + "language_loss": 0.81583631, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.89417267, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.22167969, + "step": 3709, + "time_per_iteration": 2.658768892288208 + }, + { + "auxiliary_loss_clip": 0.06540084, + "auxiliary_loss_mlp": 0.01295766, + "balance_loss_clip": 0.0630019, + "balance_loss_mlp": 0.01274464, + "epoch": 0.22305726739816625, + "flos": 23148871130880.0, + "grad_norm": 2.9556041497723236, + "language_loss": 0.69413233, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.77249086, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 3710, + "time_per_iteration": 2.603476047515869 + }, + { + "auxiliary_loss_clip": 0.06536471, + "auxiliary_loss_mlp": 0.01286054, + "balance_loss_clip": 0.06297636, + "balance_loss_mlp": 0.01264429, + "epoch": 0.22311739065083422, + "flos": 19178116721280.0, + "grad_norm": 2.184897161331363, + "language_loss": 0.91282266, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.99104792, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.21606445, + "step": 3711, + "time_per_iteration": 2.6093854904174805 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01298084, + "balance_loss_clip": 0.06302293, + "balance_loss_mlp": 0.01275911, + "epoch": 0.22317751390350218, + "flos": 13621467611520.0, + "grad_norm": 2.3638705243519142, + "language_loss": 0.89271343, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.97108901, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.22192383, + "step": 3712, + "time_per_iteration": 2.5170199871063232 + }, + { + "auxiliary_loss_clip": 0.06530519, + "auxiliary_loss_mlp": 0.01292247, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01271481, + "epoch": 0.22323763715617015, + "flos": 11032643064960.0, + "grad_norm": 2.927785991832361, + "language_loss": 0.74880064, + "learning_rate": 3.620913505310117e-06, + "loss": 0.82702827, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2076416, + "step": 3713, + "time_per_iteration": 2.521813154220581 + }, + { + "auxiliary_loss_clip": 0.06534518, + "auxiliary_loss_mlp": 0.0130023, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.01277556, + "epoch": 0.22329776040883811, + "flos": 41360647841280.0, + "grad_norm": 2.458794372685298, + "language_loss": 0.62675929, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.70510674, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22668457, + "step": 3714, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.06529912, + "auxiliary_loss_mlp": 0.01289936, + "balance_loss_clip": 0.06295826, + "balance_loss_mlp": 0.01267906, + "epoch": 0.22335788366150608, + "flos": 25126568323200.0, + "grad_norm": 1.757427072944695, + "language_loss": 0.79499549, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.87319398, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22009277, + "step": 3715, + "time_per_iteration": 2.571711301803589 + }, + { + "auxiliary_loss_clip": 0.06527971, + "auxiliary_loss_mlp": 0.01294287, + "balance_loss_clip": 0.06293058, + "balance_loss_mlp": 0.0127302, + "epoch": 0.22341800691417404, + "flos": 16989144658560.0, + "grad_norm": 1.5961840175356918, + "language_loss": 0.77329421, + "learning_rate": 3.620228790579645e-06, + "loss": 0.85151684, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21276855, + "step": 3716, + "time_per_iteration": 2.502037286758423 + }, + { + "auxiliary_loss_clip": 0.06529684, + "auxiliary_loss_mlp": 0.0129404, + "balance_loss_clip": 0.06297298, + "balance_loss_mlp": 0.01273977, + "epoch": 0.22347813016684204, + "flos": 14141904520320.0, + "grad_norm": 2.4369226344025665, + "language_loss": 0.80004126, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.87827849, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20068359, + "step": 3717, + "time_per_iteration": 2.5208563804626465 + }, + { + "auxiliary_loss_clip": 0.065373, + "auxiliary_loss_mlp": 0.01297317, + "balance_loss_clip": 0.06298472, + "balance_loss_mlp": 0.01275215, + "epoch": 0.22353825341951, + "flos": 23589323717760.0, + "grad_norm": 2.564573329936102, + "language_loss": 0.68781847, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.76616466, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22106934, + "step": 3718, + "time_per_iteration": 2.6491305828094482 + }, + { + "auxiliary_loss_clip": 0.06536659, + "auxiliary_loss_mlp": 0.01296292, + "balance_loss_clip": 0.06298986, + "balance_loss_mlp": 0.01271187, + "epoch": 0.22359837667217797, + "flos": 29831759769600.0, + "grad_norm": 1.515297493499622, + "language_loss": 0.80957985, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88790929, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25085449, + "step": 3719, + "time_per_iteration": 2.6334550380706787 + }, + { + "auxiliary_loss_clip": 0.06540611, + "auxiliary_loss_mlp": 0.01300766, + "balance_loss_clip": 0.06299402, + "balance_loss_mlp": 0.01276793, + "epoch": 0.22365849992484593, + "flos": 17608867056000.0, + "grad_norm": 2.352033480486632, + "language_loss": 0.87360144, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.95201522, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.23962402, + "step": 3720, + "time_per_iteration": 2.5415003299713135 + }, + { + "auxiliary_loss_clip": 0.06526608, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 0.06295964, + "balance_loss_mlp": 0.01271672, + "epoch": 0.2237186231775139, + "flos": 22717558638720.0, + "grad_norm": 1.8478771577440833, + "language_loss": 0.75151736, + "learning_rate": 3.619086370692945e-06, + "loss": 0.8297134, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21325684, + "step": 3721, + "time_per_iteration": 2.548450469970703 + }, + { + "auxiliary_loss_clip": 0.06540586, + "auxiliary_loss_mlp": 0.0129148, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01269105, + "epoch": 0.22377874643018186, + "flos": 13376720234880.0, + "grad_norm": 2.2094798322640736, + "language_loss": 0.79352558, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.87184626, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.22375488, + "step": 3722, + "time_per_iteration": 2.519277572631836 + }, + { + "auxiliary_loss_clip": 0.06531984, + "auxiliary_loss_mlp": 0.01288897, + "balance_loss_clip": 0.06299505, + "balance_loss_mlp": 0.01267964, + "epoch": 0.22383886968284986, + "flos": 17900797080960.0, + "grad_norm": 2.2930078409484196, + "language_loss": 0.83410442, + "learning_rate": 3.618628972906178e-06, + "loss": 0.91231328, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20922852, + "step": 3723, + "time_per_iteration": 2.5086076259613037 + }, + { + "auxiliary_loss_clip": 0.06544059, + "auxiliary_loss_mlp": 0.01285781, + "balance_loss_clip": 0.06305651, + "balance_loss_mlp": 0.01263834, + "epoch": 0.22389899293551782, + "flos": 23886033425280.0, + "grad_norm": 4.429276920778782, + "language_loss": 0.84606177, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.92436016, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.21960449, + "step": 3724, + "time_per_iteration": 2.574178695678711 + }, + { + "auxiliary_loss_clip": 0.06534179, + "auxiliary_loss_mlp": 0.01287846, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01267211, + "epoch": 0.2239591161881858, + "flos": 27279929600640.0, + "grad_norm": 1.978846940821608, + "language_loss": 0.79885381, + "learning_rate": 3.618171329605121e-06, + "loss": 0.87707412, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.20617676, + "step": 3725, + "time_per_iteration": 2.589184522628784 + }, + { + "auxiliary_loss_clip": 0.06541407, + "auxiliary_loss_mlp": 0.01289084, + "balance_loss_clip": 0.06307919, + "balance_loss_mlp": 0.01267197, + "epoch": 0.22401923944085375, + "flos": 22243423910400.0, + "grad_norm": 1.7178260071510263, + "language_loss": 0.78001326, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.85831815, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21875, + "step": 3726, + "time_per_iteration": 3.980494976043701 + }, + { + "auxiliary_loss_clip": 0.06552388, + "auxiliary_loss_mlp": 0.01297244, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.01272175, + "epoch": 0.22407936269352172, + "flos": 12057920023680.0, + "grad_norm": 3.478702992871699, + "language_loss": 0.73437679, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.81287301, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25097656, + "step": 3727, + "time_per_iteration": 2.4799015522003174 + }, + { + "auxiliary_loss_clip": 0.06549139, + "auxiliary_loss_mlp": 0.01296668, + "balance_loss_clip": 0.06309944, + "balance_loss_mlp": 0.0127341, + "epoch": 0.22413948594618968, + "flos": 19359482883840.0, + "grad_norm": 2.179866459674304, + "language_loss": 0.8799302, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.95838827, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23254395, + "step": 3728, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.0653842, + "auxiliary_loss_mlp": 0.01294185, + "balance_loss_clip": 0.06303863, + "balance_loss_mlp": 0.0126989, + "epoch": 0.22419960919885765, + "flos": 24176789493120.0, + "grad_norm": 1.9160734665449493, + "language_loss": 0.80446088, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.88278687, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24328613, + "step": 3729, + "time_per_iteration": 4.021615266799927 + }, + { + "auxiliary_loss_clip": 0.06533324, + "auxiliary_loss_mlp": 0.01292111, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01271237, + "epoch": 0.22425973245152564, + "flos": 27386007269760.0, + "grad_norm": 1.6841051152750983, + "language_loss": 0.87170112, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.94995546, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2088623, + "step": 3730, + "time_per_iteration": 2.598576307296753 + }, + { + "auxiliary_loss_clip": 0.0653019, + "auxiliary_loss_mlp": 0.01298076, + "balance_loss_clip": 0.06300467, + "balance_loss_mlp": 0.01276403, + "epoch": 0.2243198557041936, + "flos": 13740794225280.0, + "grad_norm": 2.088554635044429, + "language_loss": 0.73449922, + "learning_rate": 3.616796927310559e-06, + "loss": 0.81278187, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21655273, + "step": 3731, + "time_per_iteration": 2.5361716747283936 + }, + { + "auxiliary_loss_clip": 0.06541456, + "auxiliary_loss_mlp": 0.01292681, + "balance_loss_clip": 0.06301124, + "balance_loss_mlp": 0.01267933, + "epoch": 0.22437997895686157, + "flos": 19535775874560.0, + "grad_norm": 5.172507402775724, + "language_loss": 0.75803339, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.83637482, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.24755859, + "step": 3732, + "time_per_iteration": 2.5423076152801514 + }, + { + "auxiliary_loss_clip": 0.06533462, + "auxiliary_loss_mlp": 0.01296517, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01273664, + "epoch": 0.22444010220952954, + "flos": 23703032108160.0, + "grad_norm": 1.6752991374876018, + "language_loss": 0.89338291, + "learning_rate": 3.616338302646873e-06, + "loss": 0.97168273, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2286377, + "step": 3733, + "time_per_iteration": 4.021088123321533 + }, + { + "auxiliary_loss_clip": 0.065323, + "auxiliary_loss_mlp": 0.01294952, + "balance_loss_clip": 0.06298727, + "balance_loss_mlp": 0.01270193, + "epoch": 0.2245002254621975, + "flos": 22389514704000.0, + "grad_norm": 1.4651206016819107, + "language_loss": 0.85422146, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.93249398, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.24780273, + "step": 3734, + "time_per_iteration": 2.5562949180603027 + }, + { + "auxiliary_loss_clip": 0.06539299, + "auxiliary_loss_mlp": 0.01283537, + "balance_loss_clip": 0.06303868, + "balance_loss_mlp": 0.01261113, + "epoch": 0.22456034871486547, + "flos": 26949453897600.0, + "grad_norm": 1.579737554219585, + "language_loss": 0.77332962, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.85155803, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22436523, + "step": 3735, + "time_per_iteration": 4.016703367233276 + }, + { + "auxiliary_loss_clip": 0.06526705, + "auxiliary_loss_mlp": 0.01290552, + "balance_loss_clip": 0.06298478, + "balance_loss_mlp": 0.01269559, + "epoch": 0.22462047196753343, + "flos": 28990700012160.0, + "grad_norm": 1.885472064442235, + "language_loss": 0.84907603, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92724866, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.21008301, + "step": 3736, + "time_per_iteration": 2.6118290424346924 + }, + { + "auxiliary_loss_clip": 0.06536424, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.0630133, + "balance_loss_mlp": 0.01261922, + "epoch": 0.22468059522020142, + "flos": 20017541324160.0, + "grad_norm": 1.5290746464045628, + "language_loss": 0.87103891, + "learning_rate": 3.615420317888586e-06, + "loss": 0.94926155, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23913574, + "step": 3737, + "time_per_iteration": 2.5211808681488037 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288351, + "balance_loss_clip": 0.06294889, + "balance_loss_mlp": 0.01263949, + "epoch": 0.2247407184728694, + "flos": 29321846547840.0, + "grad_norm": 1.8581473098744326, + "language_loss": 0.80131769, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87954295, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24389648, + "step": 3738, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.06530435, + "auxiliary_loss_mlp": 0.01285051, + "balance_loss_clip": 0.06293893, + "balance_loss_mlp": 0.01263295, + "epoch": 0.22480084172553735, + "flos": 22317035322240.0, + "grad_norm": 1.7432458267253939, + "language_loss": 0.77190316, + "learning_rate": 3.614960957933224e-06, + "loss": 0.85005802, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.21728516, + "step": 3739, + "time_per_iteration": 2.540266275405884 + }, + { + "auxiliary_loss_clip": 0.06531328, + "auxiliary_loss_mlp": 0.01283134, + "balance_loss_clip": 0.06295189, + "balance_loss_mlp": 0.01260091, + "epoch": 0.22486096497820532, + "flos": 25598019720960.0, + "grad_norm": 4.441094103460663, + "language_loss": 0.74799633, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.82614094, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.23022461, + "step": 3740, + "time_per_iteration": 2.640592575073242 + }, + { + "auxiliary_loss_clip": 0.06520827, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06289122, + "balance_loss_mlp": 0.01256681, + "epoch": 0.22492108823087328, + "flos": 17645651798400.0, + "grad_norm": 2.0040821388775285, + "language_loss": 0.75983584, + "learning_rate": 3.614501353019939e-06, + "loss": 0.83783156, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.22070312, + "step": 3741, + "time_per_iteration": 2.513965129852295 + }, + { + "auxiliary_loss_clip": 0.06526901, + "auxiliary_loss_mlp": 0.01283674, + "balance_loss_clip": 0.06296658, + "balance_loss_mlp": 0.0126224, + "epoch": 0.22498121148354125, + "flos": 16040246296320.0, + "grad_norm": 1.702368757801579, + "language_loss": 0.87747514, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.95558089, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21435547, + "step": 3742, + "time_per_iteration": 2.5164167881011963 + }, + { + "auxiliary_loss_clip": 0.0652426, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01259389, + "epoch": 0.22504133473620924, + "flos": 24030489064320.0, + "grad_norm": 1.7109022824395175, + "language_loss": 0.82010657, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.22473145, + "step": 3743, + "time_per_iteration": 2.5486276149749756 + }, + { + "auxiliary_loss_clip": 0.06524298, + "auxiliary_loss_mlp": 0.0127565, + "balance_loss_clip": 0.06291372, + "balance_loss_mlp": 0.01254562, + "epoch": 0.2251014579888772, + "flos": 16769610161280.0, + "grad_norm": 2.126207867209009, + "language_loss": 0.64185399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7198534, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2109375, + "step": 3744, + "time_per_iteration": 2.535020351409912 + }, + { + "auxiliary_loss_clip": 0.06527244, + "auxiliary_loss_mlp": 0.01277496, + "balance_loss_clip": 0.06293654, + "balance_loss_mlp": 0.01256372, + "epoch": 0.22516158124154517, + "flos": 13996191070080.0, + "grad_norm": 3.1643825534304684, + "language_loss": 0.76886272, + "learning_rate": 3.613581408598489e-06, + "loss": 0.84691012, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21105957, + "step": 3745, + "time_per_iteration": 2.5233495235443115 + }, + { + "auxiliary_loss_clip": 0.06522205, + "auxiliary_loss_mlp": 0.01281406, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01260675, + "epoch": 0.22522170449421314, + "flos": 14394869596800.0, + "grad_norm": 1.6969236990578618, + "language_loss": 0.80721819, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88525426, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20739746, + "step": 3746, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.06533524, + "auxiliary_loss_mlp": 0.01280566, + "balance_loss_clip": 0.06296681, + "balance_loss_mlp": 0.0125881, + "epoch": 0.2252818277468811, + "flos": 23812338159360.0, + "grad_norm": 2.077776202364112, + "language_loss": 0.86226261, + "learning_rate": 3.613121069229862e-06, + "loss": 0.94040346, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21765137, + "step": 3747, + "time_per_iteration": 2.5834550857543945 + }, + { + "auxiliary_loss_clip": 0.06530412, + "auxiliary_loss_mlp": 0.01275087, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01255095, + "epoch": 0.22534195099954907, + "flos": 24725038757760.0, + "grad_norm": 1.8595393434505574, + "language_loss": 0.76982796, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.84788299, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.1998291, + "step": 3748, + "time_per_iteration": 2.5877788066864014 + }, + { + "auxiliary_loss_clip": 0.0652978, + "auxiliary_loss_mlp": 0.0128313, + "balance_loss_clip": 0.06296694, + "balance_loss_mlp": 0.01261768, + "epoch": 0.22540207425221703, + "flos": 21038625578880.0, + "grad_norm": 1.5282192474331018, + "language_loss": 0.80547005, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.88359916, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.21374512, + "step": 3749, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06526259, + "auxiliary_loss_mlp": 0.01273546, + "balance_loss_clip": 0.06298405, + "balance_loss_mlp": 0.01253698, + "epoch": 0.22546219750488503, + "flos": 19396351480320.0, + "grad_norm": 1.5225090015602234, + "language_loss": 0.80070651, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.87870455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19848633, + "step": 3750, + "time_per_iteration": 2.524614095687866 + }, + { + "auxiliary_loss_clip": 0.06532078, + "auxiliary_loss_mlp": 0.01279372, + "balance_loss_clip": 0.06297495, + "balance_loss_mlp": 0.01258117, + "epoch": 0.225522320757553, + "flos": 25199760464640.0, + "grad_norm": 5.336084937176506, + "language_loss": 0.8300491, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.90816361, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21264648, + "step": 3751, + "time_per_iteration": 2.5638771057128906 + }, + { + "auxiliary_loss_clip": 0.06527963, + "auxiliary_loss_mlp": 0.01280546, + "balance_loss_clip": 0.06296829, + "balance_loss_mlp": 0.01260149, + "epoch": 0.22558244401022096, + "flos": 17168456396160.0, + "grad_norm": 1.7246902184661286, + "language_loss": 0.8427825, + "learning_rate": 3.611969150491165e-06, + "loss": 0.92086762, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20385742, + "step": 3752, + "time_per_iteration": 2.5650362968444824 + }, + { + "auxiliary_loss_clip": 0.06527157, + "auxiliary_loss_mlp": 0.01275092, + "balance_loss_clip": 0.06298538, + "balance_loss_mlp": 0.01254839, + "epoch": 0.22564256726288892, + "flos": 15236306697600.0, + "grad_norm": 1.7312534305272433, + "language_loss": 0.78620666, + "learning_rate": 3.611738583330375e-06, + "loss": 0.8642292, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20251465, + "step": 3753, + "time_per_iteration": 2.510344982147217 + }, + { + "auxiliary_loss_clip": 0.06525348, + "auxiliary_loss_mlp": 0.01279816, + "balance_loss_clip": 0.06296748, + "balance_loss_mlp": 0.01257869, + "epoch": 0.2257026905155569, + "flos": 34577215902720.0, + "grad_norm": 1.9706921359503449, + "language_loss": 0.79448152, + "learning_rate": 3.611507955052295e-06, + "loss": 0.8725332, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21948242, + "step": 3754, + "time_per_iteration": 2.6429665088653564 + }, + { + "auxiliary_loss_clip": 0.06526577, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06299241, + "balance_loss_mlp": 0.01259835, + "epoch": 0.22576281376822485, + "flos": 19944642672000.0, + "grad_norm": 1.7667035857085684, + "language_loss": 0.70640147, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.78447914, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.21374512, + "step": 3755, + "time_per_iteration": 2.5482447147369385 + }, + { + "auxiliary_loss_clip": 0.06530152, + "auxiliary_loss_mlp": 0.01282078, + "balance_loss_clip": 0.06295566, + "balance_loss_mlp": 0.01261085, + "epoch": 0.22582293702089282, + "flos": 24607892350080.0, + "grad_norm": 2.6955819116528588, + "language_loss": 0.77899122, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.85711348, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21008301, + "step": 3756, + "time_per_iteration": 2.573639392852783 + }, + { + "auxiliary_loss_clip": 0.06536651, + "auxiliary_loss_mlp": 0.01278842, + "balance_loss_clip": 0.0629873, + "balance_loss_mlp": 0.01255394, + "epoch": 0.2258830602735608, + "flos": 23041451796480.0, + "grad_norm": 2.9460656412940405, + "language_loss": 0.82867002, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.90682495, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23461914, + "step": 3757, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06538612, + "auxiliary_loss_mlp": 0.01279229, + "balance_loss_clip": 0.06302969, + "balance_loss_mlp": 0.01257164, + "epoch": 0.22594318352622877, + "flos": 22164068494080.0, + "grad_norm": 3.099441845199118, + "language_loss": 0.73941171, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.81759018, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2208252, + "step": 3758, + "time_per_iteration": 2.506148099899292 + }, + { + "auxiliary_loss_clip": 0.06531477, + "auxiliary_loss_mlp": 0.01288595, + "balance_loss_clip": 0.06296086, + "balance_loss_mlp": 0.01266816, + "epoch": 0.22600330677889674, + "flos": 20600478979200.0, + "grad_norm": 2.4125098710516117, + "language_loss": 0.77881908, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.85701978, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21777344, + "step": 3759, + "time_per_iteration": 2.5171775817871094 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288917, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01266267, + "epoch": 0.2260634300315647, + "flos": 35667970427520.0, + "grad_norm": 1.6851914496917324, + "language_loss": 0.7921207, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.87035167, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.22644043, + "step": 3760, + "time_per_iteration": 2.6410677433013916 + }, + { + "auxiliary_loss_clip": 0.06433272, + "auxiliary_loss_mlp": 0.01258557, + "balance_loss_clip": 0.06311189, + "balance_loss_mlp": 0.01252156, + "epoch": 0.22612355328423267, + "flos": 72107707685760.0, + "grad_norm": 0.875668320300708, + "language_loss": 0.60230321, + "learning_rate": 3.609891846556569e-06, + "loss": 0.67922151, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06408691, + "step": 3761, + "time_per_iteration": 3.1083786487579346 + }, + { + "auxiliary_loss_clip": 0.06545433, + "auxiliary_loss_mlp": 0.01288291, + "balance_loss_clip": 0.06303856, + "balance_loss_mlp": 0.01267012, + "epoch": 0.22618367653690064, + "flos": 22790373436800.0, + "grad_norm": 3.0022983434583783, + "language_loss": 0.77876461, + "learning_rate": 3.609660729655211e-06, + "loss": 0.8571018, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21289062, + "step": 3762, + "time_per_iteration": 2.5256128311157227 + }, + { + "auxiliary_loss_clip": 0.06531228, + "auxiliary_loss_mlp": 0.01280361, + "balance_loss_clip": 0.06294668, + "balance_loss_mlp": 0.01258343, + "epoch": 0.22624379978956863, + "flos": 20454388185600.0, + "grad_norm": 1.959767281760525, + "language_loss": 0.79828411, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.87639999, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22033691, + "step": 3763, + "time_per_iteration": 2.528965950012207 + }, + { + "auxiliary_loss_clip": 0.06540731, + "auxiliary_loss_mlp": 0.01291635, + "balance_loss_clip": 0.06300753, + "balance_loss_mlp": 0.01268949, + "epoch": 0.2263039230422366, + "flos": 17500189910400.0, + "grad_norm": 1.5800574189561347, + "language_loss": 0.91907668, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99740022, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22705078, + "step": 3764, + "time_per_iteration": 2.5012450218200684 + }, + { + "auxiliary_loss_clip": 0.06527007, + "auxiliary_loss_mlp": 0.01291683, + "balance_loss_clip": 0.06295396, + "balance_loss_mlp": 0.01271001, + "epoch": 0.22636404629490456, + "flos": 28337295473280.0, + "grad_norm": 3.379650672619254, + "language_loss": 0.75542498, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.83361191, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20690918, + "step": 3765, + "time_per_iteration": 2.6149775981903076 + }, + { + "auxiliary_loss_clip": 0.06519896, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01268256, + "epoch": 0.22642416954757252, + "flos": 17494152416640.0, + "grad_norm": 2.1325205607667526, + "language_loss": 0.90732884, + "learning_rate": 3.608735651752494e-06, + "loss": 0.98543364, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22338867, + "step": 3766, + "time_per_iteration": 3.925321340560913 + }, + { + "auxiliary_loss_clip": 0.06520344, + "auxiliary_loss_mlp": 0.01279841, + "balance_loss_clip": 0.0629393, + "balance_loss_mlp": 0.0125756, + "epoch": 0.2264842928002405, + "flos": 24390621912960.0, + "grad_norm": 1.5335844294501488, + "language_loss": 0.74866152, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22290039, + "step": 3767, + "time_per_iteration": 2.585827589035034 + }, + { + "auxiliary_loss_clip": 0.06526411, + "auxiliary_loss_mlp": 0.01285323, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01262816, + "epoch": 0.22654441605290845, + "flos": 19836971775360.0, + "grad_norm": 1.5156609478299474, + "language_loss": 0.72064531, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.79876268, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.22521973, + "step": 3768, + "time_per_iteration": 3.9932377338409424 + }, + { + "auxiliary_loss_clip": 0.06525982, + "auxiliary_loss_mlp": 0.01291355, + "balance_loss_clip": 0.06294759, + "balance_loss_mlp": 0.01268347, + "epoch": 0.22660453930557642, + "flos": 27462050449920.0, + "grad_norm": 1.8227506475765343, + "language_loss": 0.78781188, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.86598527, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.22998047, + "step": 3769, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.06531481, + "auxiliary_loss_mlp": 0.01287446, + "balance_loss_clip": 0.06292526, + "balance_loss_mlp": 0.01265428, + "epoch": 0.2266646625582444, + "flos": 23995004060160.0, + "grad_norm": 2.604534401291856, + "language_loss": 0.69374454, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22021484, + "step": 3770, + "time_per_iteration": 2.6160407066345215 + }, + { + "auxiliary_loss_clip": 0.065291, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06292273, + "balance_loss_mlp": 0.01269077, + "epoch": 0.22672478581091238, + "flos": 26034698874240.0, + "grad_norm": 1.4830972618629188, + "language_loss": 0.8083868, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88657784, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20922852, + "step": 3771, + "time_per_iteration": 2.576948642730713 + }, + { + "auxiliary_loss_clip": 0.06521479, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012613, + "epoch": 0.22678490906358034, + "flos": 23848577850240.0, + "grad_norm": 1.5694676435300003, + "language_loss": 0.79189658, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86994874, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22436523, + "step": 3772, + "time_per_iteration": 4.012827396392822 + }, + { + "auxiliary_loss_clip": 0.06410234, + "auxiliary_loss_mlp": 0.01286376, + "balance_loss_clip": 0.06287075, + "balance_loss_mlp": 0.01280571, + "epoch": 0.2268450323162483, + "flos": 65070163912320.0, + "grad_norm": 0.6415690360853892, + "language_loss": 0.53899318, + "learning_rate": 3.607114417129261e-06, + "loss": 0.61595929, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.0579834, + "step": 3773, + "time_per_iteration": 3.249551773071289 + }, + { + "auxiliary_loss_clip": 0.06526346, + "auxiliary_loss_mlp": 0.01287624, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01266238, + "epoch": 0.22690515556891627, + "flos": 22532251334400.0, + "grad_norm": 1.8359701531623327, + "language_loss": 0.70997107, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.78811073, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21386719, + "step": 3774, + "time_per_iteration": 2.558279275894165 + }, + { + "auxiliary_loss_clip": 0.06521672, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06291246, + "balance_loss_mlp": 0.01266857, + "epoch": 0.22696527882158424, + "flos": 18229344140160.0, + "grad_norm": 2.047907778931267, + "language_loss": 0.75449002, + "learning_rate": 3.606650658627658e-06, + "loss": 0.83258545, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21008301, + "step": 3775, + "time_per_iteration": 3.928666353225708 + }, + { + "auxiliary_loss_clip": 0.06524701, + "auxiliary_loss_mlp": 0.01286732, + "balance_loss_clip": 0.06292307, + "balance_loss_mlp": 0.01266168, + "epoch": 0.22702540207425223, + "flos": 17024923152000.0, + "grad_norm": 2.031895062113734, + "language_loss": 0.82818532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.90629965, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20581055, + "step": 3776, + "time_per_iteration": 2.5941483974456787 + }, + { + "auxiliary_loss_clip": 0.06528914, + "auxiliary_loss_mlp": 0.01279846, + "balance_loss_clip": 0.06293055, + "balance_loss_mlp": 0.01259222, + "epoch": 0.2270855253269202, + "flos": 21332316539520.0, + "grad_norm": 1.645158938946052, + "language_loss": 0.83362442, + "learning_rate": 3.606186656428641e-06, + "loss": 0.91171205, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20617676, + "step": 3777, + "time_per_iteration": 2.5177228450775146 + }, + { + "auxiliary_loss_clip": 0.06532624, + "auxiliary_loss_mlp": 0.01278936, + "balance_loss_clip": 0.06296799, + "balance_loss_mlp": 0.01257002, + "epoch": 0.22714564857958816, + "flos": 23557276730880.0, + "grad_norm": 1.8837878269403912, + "language_loss": 0.73246169, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.81057739, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21948242, + "step": 3778, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.06530988, + "auxiliary_loss_mlp": 0.01275867, + "balance_loss_clip": 0.06293572, + "balance_loss_mlp": 0.01255673, + "epoch": 0.22720577183225613, + "flos": 25996237050240.0, + "grad_norm": 2.9659284448048555, + "language_loss": 0.65779513, + "learning_rate": 3.605722410602591e-06, + "loss": 0.73586369, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20178223, + "step": 3779, + "time_per_iteration": 2.543818950653076 + }, + { + "auxiliary_loss_clip": 0.06525169, + "auxiliary_loss_mlp": 0.01276701, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.01255982, + "epoch": 0.2272658950849241, + "flos": 20820432746880.0, + "grad_norm": 1.7825989229768946, + "language_loss": 0.70823693, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7862556, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20703125, + "step": 3780, + "time_per_iteration": 2.558850049972534 + }, + { + "auxiliary_loss_clip": 0.06528573, + "auxiliary_loss_mlp": 0.01280577, + "balance_loss_clip": 0.06296494, + "balance_loss_mlp": 0.01257927, + "epoch": 0.22732601833759206, + "flos": 23915187446400.0, + "grad_norm": 1.6463040629853982, + "language_loss": 0.89639765, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97448915, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2265625, + "step": 3781, + "time_per_iteration": 2.527230739593506 + }, + { + "auxiliary_loss_clip": 0.06532317, + "auxiliary_loss_mlp": 0.01280346, + "balance_loss_clip": 0.06296034, + "balance_loss_mlp": 0.01257672, + "epoch": 0.22738614159026002, + "flos": 15929850142080.0, + "grad_norm": 2.4692396393453016, + "language_loss": 0.75309098, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.83121765, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2265625, + "step": 3782, + "time_per_iteration": 2.4901020526885986 + }, + { + "auxiliary_loss_clip": 0.06532567, + "auxiliary_loss_mlp": 0.01278379, + "balance_loss_clip": 0.06300219, + "balance_loss_mlp": 0.01257959, + "epoch": 0.22744626484292801, + "flos": 24212148716160.0, + "grad_norm": 1.7681967435875452, + "language_loss": 0.8314634, + "learning_rate": 3.604793188351095e-06, + "loss": 0.90957284, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20422363, + "step": 3783, + "time_per_iteration": 2.559361696243286 + }, + { + "auxiliary_loss_clip": 0.06539755, + "auxiliary_loss_mlp": 0.0128451, + "balance_loss_clip": 0.06305835, + "balance_loss_mlp": 0.01262266, + "epoch": 0.22750638809559598, + "flos": 24798734023680.0, + "grad_norm": 1.794476113807414, + "language_loss": 0.76757884, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8458215, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22229004, + "step": 3784, + "time_per_iteration": 2.6693339347839355 + }, + { + "auxiliary_loss_clip": 0.06533188, + "auxiliary_loss_mlp": 0.012806, + "balance_loss_clip": 0.06299379, + "balance_loss_mlp": 0.01257998, + "epoch": 0.22756651134826394, + "flos": 22243004640000.0, + "grad_norm": 1.5985438146538498, + "language_loss": 0.71667248, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79481035, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22583008, + "step": 3785, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.06421004, + "auxiliary_loss_mlp": 0.0127133, + "balance_loss_clip": 0.0629871, + "balance_loss_mlp": 0.01265915, + "epoch": 0.2276266346009319, + "flos": 62728225021440.0, + "grad_norm": 1.545506426452605, + "language_loss": 0.63058448, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.70750785, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05422974, + "step": 3786, + "time_per_iteration": 3.1247661113739014 + }, + { + "auxiliary_loss_clip": 0.06538717, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06302891, + "balance_loss_mlp": 0.01254299, + "epoch": 0.22768675785359987, + "flos": 18618085958400.0, + "grad_norm": 2.466113986800572, + "language_loss": 0.8751514, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.95331335, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23156738, + "step": 3787, + "time_per_iteration": 2.488539457321167 + }, + { + "auxiliary_loss_clip": 0.06537791, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305036, + "balance_loss_mlp": 0.01259488, + "epoch": 0.22774688110626784, + "flos": 26877477640320.0, + "grad_norm": 2.053207704033697, + "language_loss": 0.73054254, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.80872202, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20678711, + "step": 3788, + "time_per_iteration": 2.5763657093048096 + }, + { + "auxiliary_loss_clip": 0.06534025, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 0.06303776, + "balance_loss_mlp": 0.01260971, + "epoch": 0.2278070043589358, + "flos": 15557977722240.0, + "grad_norm": 4.57361945380841, + "language_loss": 0.68007839, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.75824702, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21850586, + "step": 3789, + "time_per_iteration": 2.4907443523406982 + }, + { + "auxiliary_loss_clip": 0.0653897, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 0.06308074, + "balance_loss_mlp": 0.0126115, + "epoch": 0.2278671276116038, + "flos": 22422987210240.0, + "grad_norm": 2.4388022002275243, + "language_loss": 0.76775718, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.84598166, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22338867, + "step": 3790, + "time_per_iteration": 2.5787651538848877 + }, + { + "auxiliary_loss_clip": 0.06540109, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06309578, + "balance_loss_mlp": 0.01259252, + "epoch": 0.22792725086427176, + "flos": 20637641064960.0, + "grad_norm": 1.9300771626575046, + "language_loss": 0.91910696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.99733061, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.23010254, + "step": 3791, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.06538808, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 0.06302848, + "balance_loss_mlp": 0.01256893, + "epoch": 0.22798737411693973, + "flos": 31436662147200.0, + "grad_norm": 1.9637481556258098, + "language_loss": 0.83064067, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.9088037, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20617676, + "step": 3792, + "time_per_iteration": 2.6190388202667236 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 0.06289717, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2280474973696077, + "flos": 52412074220160.0, + "grad_norm": 1.1033671526650368, + "language_loss": 0.65792358, + "learning_rate": 3.602465874182981e-06, + "loss": 0.73471832, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05432129, + "step": 3793, + "time_per_iteration": 2.9110665321350098 + }, + { + "auxiliary_loss_clip": 0.0654863, + "auxiliary_loss_mlp": 0.01287304, + "balance_loss_clip": 0.06306019, + "balance_loss_mlp": 0.01261889, + "epoch": 0.22810762062227566, + "flos": 26403300984960.0, + "grad_norm": 1.9908643306499119, + "language_loss": 0.78207439, + "learning_rate": 3.602232808409293e-06, + "loss": 0.8604337, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.25415039, + "step": 3794, + "time_per_iteration": 2.5911734104156494 + }, + { + "auxiliary_loss_clip": 0.06544799, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 0.06310074, + "balance_loss_mlp": 0.01262412, + "epoch": 0.22816774387494362, + "flos": 25637445866880.0, + "grad_norm": 3.443157636284035, + "language_loss": 0.81285226, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89115357, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22912598, + "step": 3795, + "time_per_iteration": 2.6825528144836426 + }, + { + "auxiliary_loss_clip": 0.06536914, + "auxiliary_loss_mlp": 0.0128896, + "balance_loss_clip": 0.06306744, + "balance_loss_mlp": 0.01267586, + "epoch": 0.22822786712761162, + "flos": 22457507892480.0, + "grad_norm": 1.703568435651106, + "language_loss": 0.77948368, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.85774243, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21362305, + "step": 3796, + "time_per_iteration": 2.5418922901153564 + }, + { + "auxiliary_loss_clip": 0.06535624, + "auxiliary_loss_mlp": 0.01278994, + "balance_loss_clip": 0.06302401, + "balance_loss_mlp": 0.01258692, + "epoch": 0.22828799038027958, + "flos": 12207323053440.0, + "grad_norm": 2.5041816771456076, + "language_loss": 0.96305406, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.04120016, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20324707, + "step": 3797, + "time_per_iteration": 2.5794107913970947 + }, + { + "auxiliary_loss_clip": 0.06537494, + "auxiliary_loss_mlp": 0.01281478, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01260057, + "epoch": 0.22834811363294755, + "flos": 22091379477120.0, + "grad_norm": 1.517581709018558, + "language_loss": 0.82277977, + "learning_rate": 3.601299937834666e-06, + "loss": 0.90096951, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2142334, + "step": 3798, + "time_per_iteration": 2.618784189224243 + }, + { + "auxiliary_loss_clip": 0.06536907, + "auxiliary_loss_mlp": 0.01279844, + "balance_loss_clip": 0.06300005, + "balance_loss_mlp": 0.01257146, + "epoch": 0.2284082368856155, + "flos": 24867104555520.0, + "grad_norm": 1.8603662335211264, + "language_loss": 0.79381669, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.87198418, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22705078, + "step": 3799, + "time_per_iteration": 2.591053009033203 + }, + { + "auxiliary_loss_clip": 0.06534393, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06300979, + "balance_loss_mlp": 0.01258646, + "epoch": 0.22846836013828348, + "flos": 23299280409600.0, + "grad_norm": 1.5152328596048934, + "language_loss": 0.75782096, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.83597749, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22619629, + "step": 3800, + "time_per_iteration": 2.5370395183563232 + }, + { + "auxiliary_loss_clip": 0.06535068, + "auxiliary_loss_mlp": 0.01279113, + "balance_loss_clip": 0.06302812, + "balance_loss_mlp": 0.01258001, + "epoch": 0.22852848339095144, + "flos": 27423462844800.0, + "grad_norm": 1.9420817073182375, + "language_loss": 0.64685607, + "learning_rate": 3.600599647297484e-06, + "loss": 0.72499788, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21105957, + "step": 3801, + "time_per_iteration": 2.6190593242645264 + }, + { + "auxiliary_loss_clip": 0.06524718, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06296816, + "balance_loss_mlp": 0.01257835, + "epoch": 0.2285886066436194, + "flos": 26328054418560.0, + "grad_norm": 1.6808395254049295, + "language_loss": 0.81957126, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89760411, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20727539, + "step": 3802, + "time_per_iteration": 2.554079055786133 + }, + { + "auxiliary_loss_clip": 0.06534229, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06299631, + "balance_loss_mlp": 0.0126415, + "epoch": 0.2286487298962874, + "flos": 29724298508160.0, + "grad_norm": 1.6760491170738747, + "language_loss": 0.79838073, + "learning_rate": 3.600132483450114e-06, + "loss": 0.87659228, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22790527, + "step": 3803, + "time_per_iteration": 2.6287641525268555 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01279074, + "balance_loss_clip": 0.06296768, + "balance_loss_mlp": 0.012559, + "epoch": 0.22870885314895537, + "flos": 21293435445120.0, + "grad_norm": 1.7238152987334623, + "language_loss": 0.86273003, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94087803, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.23168945, + "step": 3804, + "time_per_iteration": 2.511462450027466 + }, + { + "auxiliary_loss_clip": 0.06539486, + "auxiliary_loss_mlp": 0.01279472, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01257537, + "epoch": 0.22876897640162333, + "flos": 14944754016000.0, + "grad_norm": 1.89266353651555, + "language_loss": 0.76854289, + "learning_rate": 3.59966507689401e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21923828, + "step": 3805, + "time_per_iteration": 3.929358959197998 + }, + { + "auxiliary_loss_clip": 0.0654166, + "auxiliary_loss_mlp": 0.01280204, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257542, + "epoch": 0.2288290996542913, + "flos": 18119786526720.0, + "grad_norm": 2.0123502787071073, + "language_loss": 0.79403114, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.87224978, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.22680664, + "step": 3806, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.06540429, + "auxiliary_loss_mlp": 0.01282432, + "balance_loss_clip": 0.06303287, + "balance_loss_mlp": 0.01259878, + "epoch": 0.22888922290695926, + "flos": 39864296828160.0, + "grad_norm": 1.8839046523975558, + "language_loss": 0.70310783, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.78133643, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22546387, + "step": 3807, + "time_per_iteration": 4.134840488433838 + }, + { + "auxiliary_loss_clip": 0.06550615, + "auxiliary_loss_mlp": 0.01290274, + "balance_loss_clip": 0.06307966, + "balance_loss_mlp": 0.01265121, + "epoch": 0.22894934615962723, + "flos": 23410431250560.0, + "grad_norm": 2.1946772997431103, + "language_loss": 0.65960705, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.73801601, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25183105, + "step": 3808, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.06539108, + "auxiliary_loss_mlp": 0.01281064, + "balance_loss_clip": 0.06300798, + "balance_loss_mlp": 0.01259154, + "epoch": 0.22900946941229522, + "flos": 18848898829440.0, + "grad_norm": 1.7761632941249064, + "language_loss": 0.75198555, + "learning_rate": 3.598729535939222e-06, + "loss": 0.83018732, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21899414, + "step": 3809, + "time_per_iteration": 2.490895986557007 + }, + { + "auxiliary_loss_clip": 0.06533305, + "auxiliary_loss_mlp": 0.0127892, + "balance_loss_clip": 0.06299955, + "balance_loss_mlp": 0.01257331, + "epoch": 0.22906959266496318, + "flos": 22935961105920.0, + "grad_norm": 1.4656596651362013, + "language_loss": 0.82576305, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.90388525, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21606445, + "step": 3810, + "time_per_iteration": 2.5684924125671387 + }, + { + "auxiliary_loss_clip": 0.06535805, + "auxiliary_loss_mlp": 0.01278794, + "balance_loss_clip": 0.06303711, + "balance_loss_mlp": 0.01259041, + "epoch": 0.22912971591763115, + "flos": 19360614913920.0, + "grad_norm": 1.8664104481323773, + "language_loss": 0.79914212, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8772881, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19750977, + "step": 3811, + "time_per_iteration": 3.9766526222229004 + }, + { + "auxiliary_loss_clip": 0.0653518, + "auxiliary_loss_mlp": 0.01280553, + "balance_loss_clip": 0.06300636, + "balance_loss_mlp": 0.01258976, + "epoch": 0.22918983917029911, + "flos": 19938940594560.0, + "grad_norm": 1.7476175457386653, + "language_loss": 0.83391893, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.91207623, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21569824, + "step": 3812, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.0655017, + "auxiliary_loss_mlp": 0.01288002, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01264673, + "epoch": 0.22924996242296708, + "flos": 16696501873920.0, + "grad_norm": 2.3839142545709886, + "language_loss": 0.8400377, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.91841948, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2331543, + "step": 3813, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06538843, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301966, + "balance_loss_mlp": 0.01255456, + "epoch": 0.22931008567563504, + "flos": 33044457490560.0, + "grad_norm": 1.6858267943586043, + "language_loss": 0.70580167, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78395313, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20861816, + "step": 3814, + "time_per_iteration": 2.6764509677886963 + }, + { + "auxiliary_loss_clip": 0.06536946, + "auxiliary_loss_mlp": 0.01276372, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01256786, + "epoch": 0.229370208928303, + "flos": 23337322963200.0, + "grad_norm": 2.8831118113675114, + "language_loss": 0.67954975, + "learning_rate": 3.597324405965139e-06, + "loss": 0.75768292, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.19604492, + "step": 3815, + "time_per_iteration": 3.9759562015533447 + }, + { + "auxiliary_loss_clip": 0.06547147, + "auxiliary_loss_mlp": 0.01282792, + "balance_loss_clip": 0.06311129, + "balance_loss_mlp": 0.01259952, + "epoch": 0.229430332180971, + "flos": 28624068472320.0, + "grad_norm": 1.7261339214380451, + "language_loss": 0.83511633, + "learning_rate": 3.597090005586848e-06, + "loss": 0.91341567, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.22839355, + "step": 3816, + "time_per_iteration": 2.6059420108795166 + }, + { + "auxiliary_loss_clip": 0.06539545, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06303526, + "balance_loss_mlp": 0.01253302, + "epoch": 0.22949045543363897, + "flos": 17243912597760.0, + "grad_norm": 2.759151157832335, + "language_loss": 0.87850988, + "learning_rate": 3.596855544646742e-06, + "loss": 0.95666116, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22290039, + "step": 3817, + "time_per_iteration": 2.4830808639526367 + }, + { + "auxiliary_loss_clip": 0.06543944, + "auxiliary_loss_mlp": 0.01278311, + "balance_loss_clip": 0.06306894, + "balance_loss_mlp": 0.01256412, + "epoch": 0.22955057868630693, + "flos": 27496654986240.0, + "grad_norm": 1.6534336608142677, + "language_loss": 0.75343978, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.83166242, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.21899414, + "step": 3818, + "time_per_iteration": 2.634387969970703 + }, + { + "auxiliary_loss_clip": 0.06541272, + "auxiliary_loss_mlp": 0.01278617, + "balance_loss_clip": 0.06305389, + "balance_loss_mlp": 0.0125524, + "epoch": 0.2296107019389749, + "flos": 23483036413440.0, + "grad_norm": 1.7338201278327374, + "language_loss": 0.75486314, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83306205, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.23376465, + "step": 3819, + "time_per_iteration": 2.593780279159546 + }, + { + "auxiliary_loss_clip": 0.06542156, + "auxiliary_loss_mlp": 0.01283095, + "balance_loss_clip": 0.06305272, + "balance_loss_mlp": 0.01263009, + "epoch": 0.22967082519164286, + "flos": 31293212757120.0, + "grad_norm": 1.753994919034331, + "language_loss": 0.8208195, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.89907205, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20092773, + "step": 3820, + "time_per_iteration": 2.6047699451446533 + }, + { + "auxiliary_loss_clip": 0.06548945, + "auxiliary_loss_mlp": 0.0128207, + "balance_loss_clip": 0.06306617, + "balance_loss_mlp": 0.0125892, + "epoch": 0.22973094844431083, + "flos": 14647415402880.0, + "grad_norm": 4.329935521611207, + "language_loss": 0.70069146, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77900159, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23156738, + "step": 3821, + "time_per_iteration": 2.479454517364502 + }, + { + "auxiliary_loss_clip": 0.06540461, + "auxiliary_loss_mlp": 0.01284444, + "balance_loss_clip": 0.06305948, + "balance_loss_mlp": 0.0126177, + "epoch": 0.2297910716969788, + "flos": 22831057393920.0, + "grad_norm": 2.1026243527938897, + "language_loss": 0.83607674, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.91432583, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22668457, + "step": 3822, + "time_per_iteration": 2.6070644855499268 + }, + { + "auxiliary_loss_clip": 0.06532617, + "auxiliary_loss_mlp": 0.01279894, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01256637, + "epoch": 0.2298511949496468, + "flos": 23045644500480.0, + "grad_norm": 1.4679532921797136, + "language_loss": 0.66860032, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74672538, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23266602, + "step": 3823, + "time_per_iteration": 2.5421886444091797 + }, + { + "auxiliary_loss_clip": 0.06414426, + "auxiliary_loss_mlp": 0.01282472, + "balance_loss_clip": 0.062925, + "balance_loss_mlp": 0.01277524, + "epoch": 0.22991131820231475, + "flos": 66910296228480.0, + "grad_norm": 0.7674542175482253, + "language_loss": 0.56982124, + "learning_rate": 3.595212623082357e-06, + "loss": 0.64679027, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.04943848, + "step": 3824, + "time_per_iteration": 3.2466728687286377 + }, + { + "auxiliary_loss_clip": 0.06530097, + "auxiliary_loss_mlp": 0.0127961, + "balance_loss_clip": 0.06299412, + "balance_loss_mlp": 0.01258975, + "epoch": 0.22997144145498272, + "flos": 17891782767360.0, + "grad_norm": 2.0818696062092643, + "language_loss": 0.73658061, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81467766, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2064209, + "step": 3825, + "time_per_iteration": 2.4705512523651123 + }, + { + "auxiliary_loss_clip": 0.06534772, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06299614, + "balance_loss_mlp": 0.01257432, + "epoch": 0.23003156470765068, + "flos": 24683055062400.0, + "grad_norm": 2.356013632504241, + "language_loss": 0.88289648, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96104205, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22351074, + "step": 3826, + "time_per_iteration": 2.5636119842529297 + }, + { + "auxiliary_loss_clip": 0.06540347, + "auxiliary_loss_mlp": 0.01282145, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0125897, + "epoch": 0.23009168796031865, + "flos": 15819412060800.0, + "grad_norm": 2.476820030154751, + "language_loss": 0.81866372, + "learning_rate": 3.594507606303083e-06, + "loss": 0.89688861, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.23181152, + "step": 3827, + "time_per_iteration": 2.4817094802856445 + }, + { + "auxiliary_loss_clip": 0.06527712, + "auxiliary_loss_mlp": 0.01278643, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2301518112129866, + "flos": 16217755171200.0, + "grad_norm": 1.7308897820243296, + "language_loss": 0.87303799, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95110154, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21716309, + "step": 3828, + "time_per_iteration": 2.517916202545166 + }, + { + "auxiliary_loss_clip": 0.06537049, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06300969, + "balance_loss_mlp": 0.01260686, + "epoch": 0.2302119344656546, + "flos": 20601820644480.0, + "grad_norm": 2.1621841127041668, + "language_loss": 0.71223086, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79042029, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21191406, + "step": 3829, + "time_per_iteration": 2.5232293605804443 + }, + { + "auxiliary_loss_clip": 0.06527743, + "auxiliary_loss_mlp": 0.01278561, + "balance_loss_clip": 0.06299868, + "balance_loss_mlp": 0.01258629, + "epoch": 0.23027205771832257, + "flos": 26804117790720.0, + "grad_norm": 1.5730479724984117, + "language_loss": 0.84944689, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19934082, + "step": 3830, + "time_per_iteration": 2.6153595447540283 + }, + { + "auxiliary_loss_clip": 0.0653088, + "auxiliary_loss_mlp": 0.01278488, + "balance_loss_clip": 0.06299009, + "balance_loss_mlp": 0.01256863, + "epoch": 0.23033218097099054, + "flos": 43883365916160.0, + "grad_norm": 2.1076872960056834, + "language_loss": 0.67121679, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.74931049, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21630859, + "step": 3831, + "time_per_iteration": 2.7302401065826416 + }, + { + "auxiliary_loss_clip": 0.06528492, + "auxiliary_loss_mlp": 0.0127826, + "balance_loss_clip": 0.06295311, + "balance_loss_mlp": 0.01255944, + "epoch": 0.2303923042236585, + "flos": 26074837779840.0, + "grad_norm": 2.0679638399971525, + "language_loss": 0.7580992, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.83616674, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2232666, + "step": 3832, + "time_per_iteration": 2.5789363384246826 + }, + { + "auxiliary_loss_clip": 0.06538022, + "auxiliary_loss_mlp": 0.01277154, + "balance_loss_clip": 0.06301656, + "balance_loss_mlp": 0.01254731, + "epoch": 0.23045242747632647, + "flos": 18302284719360.0, + "grad_norm": 1.9809188001289737, + "language_loss": 0.88229948, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96045125, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 3833, + "time_per_iteration": 2.4890406131744385 + }, + { + "auxiliary_loss_clip": 0.06526786, + "auxiliary_loss_mlp": 0.01275622, + "balance_loss_clip": 0.06291149, + "balance_loss_mlp": 0.01253295, + "epoch": 0.23051255072899443, + "flos": 25527636691200.0, + "grad_norm": 1.751792699614105, + "language_loss": 0.75447762, + "learning_rate": 3.592860451331624e-06, + "loss": 0.83250165, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2232666, + "step": 3834, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.06528607, + "auxiliary_loss_mlp": 0.0128462, + "balance_loss_clip": 0.06295913, + "balance_loss_mlp": 0.01262089, + "epoch": 0.2305726739816624, + "flos": 21221584968960.0, + "grad_norm": 2.065687600185831, + "language_loss": 0.86859775, + "learning_rate": 3.592624901801432e-06, + "loss": 0.94673002, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2253418, + "step": 3835, + "time_per_iteration": 2.5243782997131348 + }, + { + "auxiliary_loss_clip": 0.06531255, + "auxiliary_loss_mlp": 0.01277066, + "balance_loss_clip": 0.06292518, + "balance_loss_mlp": 0.01255489, + "epoch": 0.2306327972343304, + "flos": 23337826087680.0, + "grad_norm": 2.699164056519065, + "language_loss": 0.8346436, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.91272676, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21594238, + "step": 3836, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06530184, + "auxiliary_loss_mlp": 0.01278505, + "balance_loss_clip": 0.0629724, + "balance_loss_mlp": 0.01257918, + "epoch": 0.23069292048699835, + "flos": 20672832579840.0, + "grad_norm": 1.5308621387149557, + "language_loss": 0.80123997, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.87932694, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20581055, + "step": 3837, + "time_per_iteration": 2.5265891551971436 + }, + { + "auxiliary_loss_clip": 0.06398934, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06276935, + "balance_loss_mlp": 0.01257871, + "epoch": 0.23075304373966632, + "flos": 70472854673280.0, + "grad_norm": 0.8661269137999401, + "language_loss": 0.65425092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.73087507, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05606079, + "step": 3838, + "time_per_iteration": 3.0690691471099854 + }, + { + "auxiliary_loss_clip": 0.06529964, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 0.0629662, + "balance_loss_mlp": 0.01260592, + "epoch": 0.23081316699233428, + "flos": 16623603221760.0, + "grad_norm": 1.9712307402798914, + "language_loss": 0.76919234, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84731126, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21337891, + "step": 3839, + "time_per_iteration": 2.507899522781372 + }, + { + "auxiliary_loss_clip": 0.06539556, + "auxiliary_loss_mlp": 0.01283771, + "balance_loss_clip": 0.06303147, + "balance_loss_mlp": 0.01261873, + "epoch": 0.23087329024500225, + "flos": 13303192677120.0, + "grad_norm": 1.9535711626830803, + "language_loss": 0.6973604, + "learning_rate": 3.591446248441752e-06, + "loss": 0.77559364, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21899414, + "step": 3840, + "time_per_iteration": 2.507403612136841 + }, + { + "auxiliary_loss_clip": 0.06524121, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01261994, + "epoch": 0.23093341349767021, + "flos": 17791574883840.0, + "grad_norm": 2.1010490795203967, + "language_loss": 0.79679501, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87487352, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21740723, + "step": 3841, + "time_per_iteration": 2.542506456375122 + }, + { + "auxiliary_loss_clip": 0.06525128, + "auxiliary_loss_mlp": 0.0128577, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.0126591, + "epoch": 0.23099353675033818, + "flos": 23994920206080.0, + "grad_norm": 2.202794692504719, + "language_loss": 0.83472121, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91283023, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.19873047, + "step": 3842, + "time_per_iteration": 2.5885045528411865 + }, + { + "auxiliary_loss_clip": 0.06525495, + "auxiliary_loss_mlp": 0.01277864, + "balance_loss_clip": 0.06294134, + "balance_loss_mlp": 0.01256251, + "epoch": 0.23105366000300617, + "flos": 36004567478400.0, + "grad_norm": 1.5198018897685672, + "language_loss": 0.66582537, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.74385899, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.21630859, + "step": 3843, + "time_per_iteration": 2.7418570518493652 + }, + { + "auxiliary_loss_clip": 0.06517389, + "auxiliary_loss_mlp": 0.01282302, + "balance_loss_clip": 0.06289946, + "balance_loss_mlp": 0.01261667, + "epoch": 0.23111378325567414, + "flos": 31252822289280.0, + "grad_norm": 2.0273673860648613, + "language_loss": 0.77953953, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85753644, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2064209, + "step": 3844, + "time_per_iteration": 2.697105884552002 + }, + { + "auxiliary_loss_clip": 0.0652685, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.0629425, + "balance_loss_mlp": 0.01258618, + "epoch": 0.2311739065083421, + "flos": 19214230631040.0, + "grad_norm": 1.5733936305181, + "language_loss": 0.78526026, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86331779, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20275879, + "step": 3845, + "time_per_iteration": 3.9081645011901855 + }, + { + "auxiliary_loss_clip": 0.06512116, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06288872, + "balance_loss_mlp": 0.01256323, + "epoch": 0.23123402976101007, + "flos": 23365638443520.0, + "grad_norm": 2.144369954512039, + "language_loss": 0.7696318, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84750825, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.1920166, + "step": 3846, + "time_per_iteration": 2.5204334259033203 + }, + { + "auxiliary_loss_clip": 0.06530652, + "auxiliary_loss_mlp": 0.01280785, + "balance_loss_clip": 0.06296441, + "balance_loss_mlp": 0.01258946, + "epoch": 0.23129415301367803, + "flos": 13740458808960.0, + "grad_norm": 2.058546116129278, + "language_loss": 0.70736533, + "learning_rate": 3.589793599381304e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21862793, + "step": 3847, + "time_per_iteration": 3.955061197280884 + }, + { + "auxiliary_loss_clip": 0.06395237, + "auxiliary_loss_mlp": 0.01270099, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01264553, + "epoch": 0.231354276266346, + "flos": 69756907461120.0, + "grad_norm": 0.7764718422559022, + "language_loss": 0.60909712, + "learning_rate": 3.589557265446198e-06, + "loss": 0.68575048, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.05557251, + "step": 3848, + "time_per_iteration": 3.0406246185302734 + }, + { + "auxiliary_loss_clip": 0.0652846, + "auxiliary_loss_mlp": 0.0128118, + "balance_loss_clip": 0.06295802, + "balance_loss_mlp": 0.01259925, + "epoch": 0.231414399519014, + "flos": 18840597275520.0, + "grad_norm": 2.051565204924659, + "language_loss": 0.79345453, + "learning_rate": 3.589320871234923e-06, + "loss": 0.87155092, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21252441, + "step": 3849, + "time_per_iteration": 2.508357048034668 + }, + { + "auxiliary_loss_clip": 0.06525768, + "auxiliary_loss_mlp": 0.01279584, + "balance_loss_clip": 0.06294318, + "balance_loss_mlp": 0.01257995, + "epoch": 0.23147452277168196, + "flos": 36143949945600.0, + "grad_norm": 1.9799304996672493, + "language_loss": 0.72033536, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7983889, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.21594238, + "step": 3850, + "time_per_iteration": 2.6283209323883057 + }, + { + "auxiliary_loss_clip": 0.06522007, + "auxiliary_loss_mlp": 0.012814, + "balance_loss_clip": 0.06293751, + "balance_loss_mlp": 0.01260562, + "epoch": 0.23153464602434992, + "flos": 20819091081600.0, + "grad_norm": 2.1585980033328216, + "language_loss": 0.76770389, + "learning_rate": 3.588847902019718e-06, + "loss": 0.84573799, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20825195, + "step": 3851, + "time_per_iteration": 3.9542527198791504 + }, + { + "auxiliary_loss_clip": 0.06522575, + "auxiliary_loss_mlp": 0.01285563, + "balance_loss_clip": 0.06294242, + "balance_loss_mlp": 0.01264367, + "epoch": 0.2315947692770179, + "flos": 19945606993920.0, + "grad_norm": 4.396515099862161, + "language_loss": 0.70780337, + "learning_rate": 3.588611327033723e-06, + "loss": 0.78588474, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21191406, + "step": 3852, + "time_per_iteration": 2.5292365550994873 + }, + { + "auxiliary_loss_clip": 0.06530476, + "auxiliary_loss_mlp": 0.01287483, + "balance_loss_clip": 0.0629744, + "balance_loss_mlp": 0.01267027, + "epoch": 0.23165489252968585, + "flos": 12859805197440.0, + "grad_norm": 2.0519661349019906, + "language_loss": 0.68142366, + "learning_rate": 3.588374691807428e-06, + "loss": 0.75960326, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20471191, + "step": 3853, + "time_per_iteration": 2.524214267730713 + }, + { + "auxiliary_loss_clip": 0.06532255, + "auxiliary_loss_mlp": 0.0127975, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.01258579, + "epoch": 0.23171501578235382, + "flos": 30636202492800.0, + "grad_norm": 2.067759569090495, + "language_loss": 0.80620718, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88432729, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21166992, + "step": 3854, + "time_per_iteration": 3.9913628101348877 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.0128392, + "balance_loss_clip": 0.06299743, + "balance_loss_mlp": 0.0126201, + "epoch": 0.23177513903502178, + "flos": 23849709880320.0, + "grad_norm": 1.9679065377847755, + "language_loss": 0.66096866, + "learning_rate": 3.587901240669831e-06, + "loss": 0.73921382, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.21899414, + "step": 3855, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.06526054, + "auxiliary_loss_mlp": 0.0129156, + "balance_loss_clip": 0.06295231, + "balance_loss_mlp": 0.0126972, + "epoch": 0.23183526228768978, + "flos": 29578040006400.0, + "grad_norm": 1.903884891832667, + "language_loss": 0.71179903, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.78997517, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21838379, + "step": 3856, + "time_per_iteration": 2.602130174636841 + }, + { + "auxiliary_loss_clip": 0.06526691, + "auxiliary_loss_mlp": 0.01281572, + "balance_loss_clip": 0.06295416, + "balance_loss_mlp": 0.01261032, + "epoch": 0.23189538554035774, + "flos": 34467155164800.0, + "grad_norm": 1.5724941960823864, + "language_loss": 0.77830631, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85638893, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20532227, + "step": 3857, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.06534412, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06299518, + "balance_loss_mlp": 0.01259813, + "epoch": 0.2319555087930257, + "flos": 18009558080640.0, + "grad_norm": 2.2572913357008804, + "language_loss": 0.91563249, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99379921, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2244873, + "step": 3858, + "time_per_iteration": 2.532270908355713 + }, + { + "auxiliary_loss_clip": 0.06524485, + "auxiliary_loss_mlp": 0.01281992, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01261833, + "epoch": 0.23201563204569367, + "flos": 23149709671680.0, + "grad_norm": 2.204043049012761, + "language_loss": 0.77328205, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85134679, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20153809, + "step": 3859, + "time_per_iteration": 2.539982318878174 + }, + { + "auxiliary_loss_clip": 0.06526206, + "auxiliary_loss_mlp": 0.01282174, + "balance_loss_clip": 0.0629694, + "balance_loss_mlp": 0.01261098, + "epoch": 0.23207575529836164, + "flos": 20674300026240.0, + "grad_norm": 1.845949683873727, + "language_loss": 0.84980345, + "learning_rate": 3.58671655924898e-06, + "loss": 0.9278872, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21057129, + "step": 3860, + "time_per_iteration": 2.5464277267456055 + }, + { + "auxiliary_loss_clip": 0.06522566, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 0.06296555, + "balance_loss_mlp": 0.01254927, + "epoch": 0.2321358785510296, + "flos": 16477805917440.0, + "grad_norm": 2.2860023761203423, + "language_loss": 0.83316106, + "learning_rate": 3.586479442423508e-06, + "loss": 0.91114187, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.20581055, + "step": 3861, + "time_per_iteration": 2.611527681350708 + }, + { + "auxiliary_loss_clip": 0.06526297, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06296666, + "balance_loss_mlp": 0.01261198, + "epoch": 0.2321960018036976, + "flos": 21622737191040.0, + "grad_norm": 1.932164160561112, + "language_loss": 0.86100018, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93908012, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2052002, + "step": 3862, + "time_per_iteration": 2.599078893661499 + }, + { + "auxiliary_loss_clip": 0.06517789, + "auxiliary_loss_mlp": 0.01277863, + "balance_loss_clip": 0.0629621, + "balance_loss_mlp": 0.0125898, + "epoch": 0.23225612505636556, + "flos": 22277734957440.0, + "grad_norm": 1.8279700206037066, + "language_loss": 0.75524014, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.83319664, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18884277, + "step": 3863, + "time_per_iteration": 2.5592801570892334 + }, + { + "auxiliary_loss_clip": 0.06518993, + "auxiliary_loss_mlp": 0.01279608, + "balance_loss_clip": 0.06295245, + "balance_loss_mlp": 0.01260237, + "epoch": 0.23231624830903352, + "flos": 17057431336320.0, + "grad_norm": 1.8656538002376628, + "language_loss": 0.7504397, + "learning_rate": 3.58576773102631e-06, + "loss": 0.82842577, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.19372559, + "step": 3864, + "time_per_iteration": 2.549480438232422 + }, + { + "auxiliary_loss_clip": 0.06521947, + "auxiliary_loss_mlp": 0.01276148, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255572, + "epoch": 0.2323763715617015, + "flos": 34648353619200.0, + "grad_norm": 2.1960138476201023, + "language_loss": 0.70505309, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.78303403, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20568848, + "step": 3865, + "time_per_iteration": 2.6358752250671387 + }, + { + "auxiliary_loss_clip": 0.06539118, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06299968, + "balance_loss_mlp": 0.01256464, + "epoch": 0.23243649481436945, + "flos": 25557922742400.0, + "grad_norm": 1.8533317501805489, + "language_loss": 0.95648015, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.03467083, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23510742, + "step": 3866, + "time_per_iteration": 2.5805771350860596 + }, + { + "auxiliary_loss_clip": 0.06523386, + "auxiliary_loss_mlp": 0.0128215, + "balance_loss_clip": 0.06294955, + "balance_loss_mlp": 0.01260561, + "epoch": 0.23249661806703742, + "flos": 20489411992320.0, + "grad_norm": 3.3036871554572285, + "language_loss": 0.74161094, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.81966627, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21569824, + "step": 3867, + "time_per_iteration": 2.485872268676758 + }, + { + "auxiliary_loss_clip": 0.06527717, + "auxiliary_loss_mlp": 0.01278812, + "balance_loss_clip": 0.06298171, + "balance_loss_mlp": 0.01257271, + "epoch": 0.23255674131970538, + "flos": 20382956979840.0, + "grad_norm": 1.7596317335066716, + "language_loss": 0.82912898, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90719432, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2154541, + "step": 3868, + "time_per_iteration": 2.5404841899871826 + }, + { + "auxiliary_loss_clip": 0.06518516, + "auxiliary_loss_mlp": 0.01279395, + "balance_loss_clip": 0.0629604, + "balance_loss_mlp": 0.01260321, + "epoch": 0.23261686457237338, + "flos": 17061833675520.0, + "grad_norm": 1.6597028261056146, + "language_loss": 0.73686016, + "learning_rate": 3.58458034283495e-06, + "loss": 0.81483924, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.1907959, + "step": 3869, + "time_per_iteration": 2.4850685596466064 + }, + { + "auxiliary_loss_clip": 0.06524374, + "auxiliary_loss_mlp": 0.01289937, + "balance_loss_clip": 0.06296247, + "balance_loss_mlp": 0.01268241, + "epoch": 0.23267698782504134, + "flos": 29177726325120.0, + "grad_norm": 1.8030595092782438, + "language_loss": 0.8079325, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.88607562, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21716309, + "step": 3870, + "time_per_iteration": 2.5915870666503906 + }, + { + "auxiliary_loss_clip": 0.06532744, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.0126178, + "epoch": 0.2327371110777093, + "flos": 21180355960320.0, + "grad_norm": 1.9640097574691695, + "language_loss": 0.71693742, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.79509664, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21411133, + "step": 3871, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.065286, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 0.06295659, + "balance_loss_mlp": 0.01260034, + "epoch": 0.23279723433037727, + "flos": 24869997521280.0, + "grad_norm": 2.5352867939179484, + "language_loss": 0.69289309, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.77098656, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20715332, + "step": 3872, + "time_per_iteration": 2.5636072158813477 + }, + { + "auxiliary_loss_clip": 0.06535204, + "auxiliary_loss_mlp": 0.01285984, + "balance_loss_clip": 0.06299452, + "balance_loss_mlp": 0.01263894, + "epoch": 0.23285735758304524, + "flos": 38809823921280.0, + "grad_norm": 2.0709139139802497, + "language_loss": 0.78303361, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.86124545, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.22094727, + "step": 3873, + "time_per_iteration": 2.671551465988159 + }, + { + "auxiliary_loss_clip": 0.06419215, + "auxiliary_loss_mlp": 0.01286246, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01280601, + "epoch": 0.2329174808357132, + "flos": 53962274280960.0, + "grad_norm": 0.8377063316545934, + "language_loss": 0.60286367, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.67991829, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05636597, + "step": 3874, + "time_per_iteration": 3.087822675704956 + }, + { + "auxiliary_loss_clip": 0.06525364, + "auxiliary_loss_mlp": 0.01281697, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.012608, + "epoch": 0.23297760408838117, + "flos": 21222549290880.0, + "grad_norm": 2.3064833177652773, + "language_loss": 0.81324208, + "learning_rate": 3.583153494218927e-06, + "loss": 0.89131272, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.20898438, + "step": 3875, + "time_per_iteration": 2.560511589050293 + }, + { + "auxiliary_loss_clip": 0.06520373, + "auxiliary_loss_mlp": 0.01275593, + "balance_loss_clip": 0.06294609, + "balance_loss_mlp": 0.01255983, + "epoch": 0.23303772734104916, + "flos": 28410613395840.0, + "grad_norm": 2.285945976693144, + "language_loss": 0.62077069, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.69873035, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19628906, + "step": 3876, + "time_per_iteration": 2.63901948928833 + }, + { + "auxiliary_loss_clip": 0.06525883, + "auxiliary_loss_mlp": 0.01277799, + "balance_loss_clip": 0.06296121, + "balance_loss_mlp": 0.01258034, + "epoch": 0.23309785059371713, + "flos": 24321328986240.0, + "grad_norm": 1.9984006432494335, + "language_loss": 0.71087664, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.78891349, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19787598, + "step": 3877, + "time_per_iteration": 2.533858299255371 + }, + { + "auxiliary_loss_clip": 0.06524412, + "auxiliary_loss_mlp": 0.01274037, + "balance_loss_clip": 0.06297307, + "balance_loss_mlp": 0.01253485, + "epoch": 0.2331579738463851, + "flos": 15997633695360.0, + "grad_norm": 2.4085120625047143, + "language_loss": 0.81286502, + "learning_rate": 3.582439259339073e-06, + "loss": 0.89084947, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20556641, + "step": 3878, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06534204, + "auxiliary_loss_mlp": 0.01280932, + "balance_loss_clip": 0.06299698, + "balance_loss_mlp": 0.0126013, + "epoch": 0.23321809709905306, + "flos": 36435418773120.0, + "grad_norm": 2.3738521781051207, + "language_loss": 0.75046253, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82861388, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20788574, + "step": 3879, + "time_per_iteration": 2.6389944553375244 + }, + { + "auxiliary_loss_clip": 0.06528227, + "auxiliary_loss_mlp": 0.01279465, + "balance_loss_clip": 0.06299725, + "balance_loss_mlp": 0.01257972, + "epoch": 0.23327822035172102, + "flos": 21331184509440.0, + "grad_norm": 4.081669167605711, + "language_loss": 0.90526301, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.98333991, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.21496582, + "step": 3880, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.06530303, + "auxiliary_loss_mlp": 0.01278258, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125841, + "epoch": 0.233338343604389, + "flos": 19177907086080.0, + "grad_norm": 1.8856968798779488, + "language_loss": 0.72716117, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80524671, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.19848633, + "step": 3881, + "time_per_iteration": 2.528083324432373 + }, + { + "auxiliary_loss_clip": 0.0653114, + "auxiliary_loss_mlp": 0.01278184, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0125805, + "epoch": 0.23339846685705698, + "flos": 26915939464320.0, + "grad_norm": 1.6578041146422486, + "language_loss": 0.68699455, + "learning_rate": 3.581486106120537e-06, + "loss": 0.76508778, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20129395, + "step": 3882, + "time_per_iteration": 2.575275182723999 + }, + { + "auxiliary_loss_clip": 0.06529698, + "auxiliary_loss_mlp": 0.0127867, + "balance_loss_clip": 0.0629693, + "balance_loss_mlp": 0.01258226, + "epoch": 0.23345859010972494, + "flos": 32351375243520.0, + "grad_norm": 2.0584115637368767, + "language_loss": 0.77458596, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.8526696, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 3883, + "time_per_iteration": 2.626533269882202 + }, + { + "auxiliary_loss_clip": 0.06405331, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01257663, + "epoch": 0.2335187133623929, + "flos": 58505805273600.0, + "grad_norm": 0.7704933603606158, + "language_loss": 0.59193355, + "learning_rate": 3.58100916965445e-06, + "loss": 0.66861278, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.04925537, + "step": 3884, + "time_per_iteration": 4.6365087032318115 + }, + { + "auxiliary_loss_clip": 0.06533933, + "auxiliary_loss_mlp": 0.01280044, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01260017, + "epoch": 0.23357883661506088, + "flos": 24509822745600.0, + "grad_norm": 1.6610169782824564, + "language_loss": 0.80755335, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.88569313, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.20031738, + "step": 3885, + "time_per_iteration": 2.6180286407470703 + }, + { + "auxiliary_loss_clip": 0.06523974, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 0.06296945, + "balance_loss_mlp": 0.01265687, + "epoch": 0.23363895986772884, + "flos": 18953760614400.0, + "grad_norm": 2.3207575064623613, + "language_loss": 0.88500953, + "learning_rate": 3.580531993380261e-06, + "loss": 0.96311754, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21130371, + "step": 3886, + "time_per_iteration": 2.5116477012634277 + }, + { + "auxiliary_loss_clip": 0.06532702, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 0.06302926, + "balance_loss_mlp": 0.01262518, + "epoch": 0.2336990831203968, + "flos": 31694993884800.0, + "grad_norm": 1.8877154320423692, + "language_loss": 0.74203557, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.82019114, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20336914, + "step": 3887, + "time_per_iteration": 4.024793863296509 + }, + { + "auxiliary_loss_clip": 0.06531121, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01261206, + "epoch": 0.23375920637306477, + "flos": 27717237659520.0, + "grad_norm": 1.8176198265631485, + "language_loss": 0.84478307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.92290735, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20092773, + "step": 3888, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01283639, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01263934, + "epoch": 0.23381932962573276, + "flos": 17681346437760.0, + "grad_norm": 2.056965631559896, + "language_loss": 0.88319886, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.96128076, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19689941, + "step": 3889, + "time_per_iteration": 2.524937152862549 + }, + { + "auxiliary_loss_clip": 0.06524722, + "auxiliary_loss_mlp": 0.01282198, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01262708, + "epoch": 0.23387945287840073, + "flos": 14395833918720.0, + "grad_norm": 2.5361674913720487, + "language_loss": 0.7777229, + "learning_rate": 3.579576921697125e-06, + "loss": 0.85579211, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19470215, + "step": 3890, + "time_per_iteration": 4.02982497215271 + }, + { + "auxiliary_loss_clip": 0.06526545, + "auxiliary_loss_mlp": 0.01284178, + "balance_loss_clip": 0.06297928, + "balance_loss_mlp": 0.01264008, + "epoch": 0.2339395761310687, + "flos": 46108451888640.0, + "grad_norm": 1.897831891943022, + "language_loss": 0.74213481, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82024205, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20166016, + "step": 3891, + "time_per_iteration": 2.7951042652130127 + }, + { + "auxiliary_loss_clip": 0.06524959, + "auxiliary_loss_mlp": 0.01281513, + "balance_loss_clip": 0.06301059, + "balance_loss_mlp": 0.01262821, + "epoch": 0.23399969938373666, + "flos": 22388508455040.0, + "grad_norm": 1.6273389699862264, + "language_loss": 0.82863498, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.90669972, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18688965, + "step": 3892, + "time_per_iteration": 2.530782461166382 + }, + { + "auxiliary_loss_clip": 0.06531358, + "auxiliary_loss_mlp": 0.01281181, + "balance_loss_clip": 0.06301633, + "balance_loss_mlp": 0.01260951, + "epoch": 0.23405982263640462, + "flos": 43518746874240.0, + "grad_norm": 1.4575042253356143, + "language_loss": 0.65593249, + "learning_rate": 3.578859988977082e-06, + "loss": 0.7340579, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20227051, + "step": 3893, + "time_per_iteration": 4.212572813034058 + }, + { + "auxiliary_loss_clip": 0.06519544, + "auxiliary_loss_mlp": 0.01283369, + "balance_loss_clip": 0.06297972, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2341199458890726, + "flos": 22571216282880.0, + "grad_norm": 2.0084649252152564, + "language_loss": 0.79620147, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.87423062, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.20117188, + "step": 3894, + "time_per_iteration": 2.580109119415283 + }, + { + "auxiliary_loss_clip": 0.06524212, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 0.06300013, + "balance_loss_mlp": 0.01257763, + "epoch": 0.23418006914174055, + "flos": 25641764352000.0, + "grad_norm": 1.5130292757453454, + "language_loss": 0.82681906, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.90482563, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18664551, + "step": 3895, + "time_per_iteration": 2.583759069442749 + }, + { + "auxiliary_loss_clip": 0.06520028, + "auxiliary_loss_mlp": 0.01278233, + "balance_loss_clip": 0.06295593, + "balance_loss_mlp": 0.0125885, + "epoch": 0.23424019239440855, + "flos": 13549826770560.0, + "grad_norm": 2.4592405022159496, + "language_loss": 0.81334293, + "learning_rate": 3.578142517422292e-06, + "loss": 0.89132559, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.19372559, + "step": 3896, + "time_per_iteration": 2.536252021789551 + }, + { + "auxiliary_loss_clip": 0.06530771, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06299435, + "balance_loss_mlp": 0.01264253, + "epoch": 0.2343003156470765, + "flos": 22426131738240.0, + "grad_norm": 3.0940729647414598, + "language_loss": 0.83988011, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91805482, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 3897, + "time_per_iteration": 2.572230577468872 + }, + { + "auxiliary_loss_clip": 0.06528857, + "auxiliary_loss_mlp": 0.01279177, + "balance_loss_clip": 0.06296414, + "balance_loss_mlp": 0.01258626, + "epoch": 0.23436043889974448, + "flos": 14795644475520.0, + "grad_norm": 2.317273344502078, + "language_loss": 0.79819012, + "learning_rate": 3.577663903820705e-06, + "loss": 0.87627041, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20544434, + "step": 3898, + "time_per_iteration": 2.5207583904266357 + }, + { + "auxiliary_loss_clip": 0.0651897, + "auxiliary_loss_mlp": 0.01278878, + "balance_loss_clip": 0.06297988, + "balance_loss_mlp": 0.0126021, + "epoch": 0.23442056215241244, + "flos": 22972242723840.0, + "grad_norm": 1.88849810547605, + "language_loss": 0.7476474, + "learning_rate": 3.577424507277614e-06, + "loss": 0.82562584, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18676758, + "step": 3899, + "time_per_iteration": 2.535256862640381 + }, + { + "auxiliary_loss_clip": 0.06525272, + "auxiliary_loss_mlp": 0.01280019, + "balance_loss_clip": 0.06296974, + "balance_loss_mlp": 0.01259515, + "epoch": 0.2344806854050804, + "flos": 23077901122560.0, + "grad_norm": 1.7218865416029, + "language_loss": 0.75599915, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.83405209, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20507812, + "step": 3900, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.06524841, + "auxiliary_loss_mlp": 0.01281356, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01260959, + "epoch": 0.23454080865774837, + "flos": 16332805226880.0, + "grad_norm": 2.155964713283421, + "language_loss": 0.67468774, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75274968, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20410156, + "step": 3901, + "time_per_iteration": 2.536736249923706 + }, + { + "auxiliary_loss_clip": 0.06415819, + "auxiliary_loss_mlp": 0.01256149, + "balance_loss_clip": 0.06299057, + "balance_loss_mlp": 0.01251181, + "epoch": 0.23460093191041637, + "flos": 67779545685120.0, + "grad_norm": 0.7514179301091559, + "language_loss": 0.58278525, + "learning_rate": 3.576705958788091e-06, + "loss": 0.65950489, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.0496521, + "step": 3902, + "time_per_iteration": 3.134718894958496 + }, + { + "auxiliary_loss_clip": 0.06519462, + "auxiliary_loss_mlp": 0.01278211, + "balance_loss_clip": 0.06292997, + "balance_loss_mlp": 0.01258375, + "epoch": 0.23466105516308433, + "flos": 20082725400960.0, + "grad_norm": 4.781089560028637, + "language_loss": 0.80931306, + "learning_rate": 3.576466323035108e-06, + "loss": 0.88728976, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19836426, + "step": 3903, + "time_per_iteration": 2.525059938430786 + }, + { + "auxiliary_loss_clip": 0.06522641, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06295069, + "balance_loss_mlp": 0.01258955, + "epoch": 0.2347211784157523, + "flos": 24542708273280.0, + "grad_norm": 1.8578223556950417, + "language_loss": 0.82988703, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90790236, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.19909668, + "step": 3904, + "time_per_iteration": 2.5903875827789307 + }, + { + "auxiliary_loss_clip": 0.0652332, + "auxiliary_loss_mlp": 0.01285911, + "balance_loss_clip": 0.06295672, + "balance_loss_mlp": 0.01265562, + "epoch": 0.23478130166842026, + "flos": 23811751180800.0, + "grad_norm": 1.985666710181995, + "language_loss": 0.7223646, + "learning_rate": 3.57598687219895e-06, + "loss": 0.80045688, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20361328, + "step": 3905, + "time_per_iteration": 2.5441884994506836 + }, + { + "auxiliary_loss_clip": 0.06517074, + "auxiliary_loss_mlp": 0.01274876, + "balance_loss_clip": 0.06294023, + "balance_loss_mlp": 0.01255564, + "epoch": 0.23484142492108823, + "flos": 24099823918080.0, + "grad_norm": 2.433861192511871, + "language_loss": 0.71703601, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.19311523, + "step": 3906, + "time_per_iteration": 2.698309898376465 + }, + { + "auxiliary_loss_clip": 0.06533175, + "auxiliary_loss_mlp": 0.01285298, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01264341, + "epoch": 0.2349015481737562, + "flos": 29103486007680.0, + "grad_norm": 2.7858195598302014, + "language_loss": 0.74089986, + "learning_rate": 3.575507182316473e-06, + "loss": 0.81908458, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20959473, + "step": 3907, + "time_per_iteration": 2.578900098800659 + }, + { + "auxiliary_loss_clip": 0.06524273, + "auxiliary_loss_mlp": 0.01280946, + "balance_loss_clip": 0.06294693, + "balance_loss_mlp": 0.01260418, + "epoch": 0.23496167142642416, + "flos": 18922258679040.0, + "grad_norm": 2.1308722973133385, + "language_loss": 0.73705935, + "learning_rate": 3.575267247755601e-06, + "loss": 0.81511152, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2052002, + "step": 3908, + "time_per_iteration": 2.599888801574707 + }, + { + "auxiliary_loss_clip": 0.06415461, + "auxiliary_loss_mlp": 0.01265268, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01259901, + "epoch": 0.23502179467909215, + "flos": 55884906541440.0, + "grad_norm": 1.2475277524680826, + "language_loss": 0.73364127, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.81044865, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05374146, + "step": 3909, + "time_per_iteration": 2.9221227169036865 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 0.06297419, + "balance_loss_mlp": 0.01265013, + "epoch": 0.23508191793176011, + "flos": 23408083336320.0, + "grad_norm": 1.6005271399570604, + "language_loss": 0.88581395, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9639076, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20288086, + "step": 3910, + "time_per_iteration": 2.571974277496338 + }, + { + "auxiliary_loss_clip": 0.06520193, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01258658, + "epoch": 0.23514204118442808, + "flos": 20053864869120.0, + "grad_norm": 1.9643755437340527, + "language_loss": 0.76589572, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.84388608, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2019043, + "step": 3911, + "time_per_iteration": 2.5159506797790527 + }, + { + "auxiliary_loss_clip": 0.06514487, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 0.06293596, + "balance_loss_mlp": 0.01272568, + "epoch": 0.23520216443709605, + "flos": 21587126405760.0, + "grad_norm": 1.5390832092388007, + "language_loss": 0.82200038, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.90005672, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 3912, + "time_per_iteration": 2.53330135345459 + }, + { + "auxiliary_loss_clip": 0.06515642, + "auxiliary_loss_mlp": 0.01288785, + "balance_loss_clip": 0.06294793, + "balance_loss_mlp": 0.01269604, + "epoch": 0.235262287689764, + "flos": 23192573834880.0, + "grad_norm": 1.8330232089961167, + "language_loss": 0.72023201, + "learning_rate": 3.574066679118909e-06, + "loss": 0.79827625, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19177246, + "step": 3913, + "time_per_iteration": 2.5643818378448486 + }, + { + "auxiliary_loss_clip": 0.06528541, + "auxiliary_loss_mlp": 0.01277731, + "balance_loss_clip": 0.0629672, + "balance_loss_mlp": 0.01257238, + "epoch": 0.23532241094243198, + "flos": 23191903002240.0, + "grad_norm": 1.784539383466316, + "language_loss": 0.76976919, + "learning_rate": 3.57382638628884e-06, + "loss": 0.84783185, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20483398, + "step": 3914, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.06525879, + "auxiliary_loss_mlp": 0.01279953, + "balance_loss_clip": 0.06294835, + "balance_loss_mlp": 0.01259759, + "epoch": 0.23538253419509997, + "flos": 17025007006080.0, + "grad_norm": 2.4875564397369745, + "language_loss": 0.90170735, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.97976559, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2019043, + "step": 3915, + "time_per_iteration": 2.563430070877075 + }, + { + "auxiliary_loss_clip": 0.06418007, + "auxiliary_loss_mlp": 0.01258116, + "balance_loss_clip": 0.06301998, + "balance_loss_mlp": 0.0125336, + "epoch": 0.23544265744776793, + "flos": 63465276263040.0, + "grad_norm": 0.7933859009920101, + "language_loss": 0.59378946, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6705507, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04748535, + "step": 3916, + "time_per_iteration": 3.0965490341186523 + }, + { + "auxiliary_loss_clip": 0.06410776, + "auxiliary_loss_mlp": 0.01260488, + "balance_loss_clip": 0.06295535, + "balance_loss_mlp": 0.01255756, + "epoch": 0.2355027807004359, + "flos": 70537395116160.0, + "grad_norm": 0.7426668339088592, + "language_loss": 0.49443412, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.57114673, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04724121, + "step": 3917, + "time_per_iteration": 3.180136203765869 + }, + { + "auxiliary_loss_clip": 0.06525698, + "auxiliary_loss_mlp": 0.01279416, + "balance_loss_clip": 0.06297344, + "balance_loss_mlp": 0.01259687, + "epoch": 0.23556290395310386, + "flos": 21440742122880.0, + "grad_norm": 2.189382839240281, + "language_loss": 0.77017808, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.84822929, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19714355, + "step": 3918, + "time_per_iteration": 2.546833038330078 + }, + { + "auxiliary_loss_clip": 0.0652653, + "auxiliary_loss_mlp": 0.01274201, + "balance_loss_clip": 0.06294574, + "balance_loss_mlp": 0.01254353, + "epoch": 0.23562302720577183, + "flos": 18192223981440.0, + "grad_norm": 2.402769767514051, + "language_loss": 0.70165813, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.77966547, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.1986084, + "step": 3919, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.06516096, + "auxiliary_loss_mlp": 0.01279326, + "balance_loss_clip": 0.06294449, + "balance_loss_mlp": 0.0125999, + "epoch": 0.2356831504584398, + "flos": 33739091038080.0, + "grad_norm": 1.6359966895302622, + "language_loss": 0.71094656, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.78890085, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19335938, + "step": 3920, + "time_per_iteration": 2.672703504562378 + }, + { + "auxiliary_loss_clip": 0.065192, + "auxiliary_loss_mlp": 0.0127625, + "balance_loss_clip": 0.06295229, + "balance_loss_mlp": 0.0125707, + "epoch": 0.23574327371110776, + "flos": 24939122739840.0, + "grad_norm": 1.9300596293530992, + "language_loss": 0.77833009, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85628462, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.19189453, + "step": 3921, + "time_per_iteration": 2.5823934078216553 + }, + { + "auxiliary_loss_clip": 0.06519832, + "auxiliary_loss_mlp": 0.01273471, + "balance_loss_clip": 0.06293498, + "balance_loss_mlp": 0.01254898, + "epoch": 0.23580339696377575, + "flos": 17827940355840.0, + "grad_norm": 2.282195745019935, + "language_loss": 0.76750088, + "learning_rate": 3.571901895946612e-06, + "loss": 0.84543383, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18566895, + "step": 3922, + "time_per_iteration": 2.5005834102630615 + }, + { + "auxiliary_loss_clip": 0.06518443, + "auxiliary_loss_mlp": 0.01276376, + "balance_loss_clip": 0.06292558, + "balance_loss_mlp": 0.01257255, + "epoch": 0.23586352021644372, + "flos": 26293827225600.0, + "grad_norm": 2.0102031772622277, + "language_loss": 0.80626559, + "learning_rate": 3.571661066327956e-06, + "loss": 0.88421381, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19128418, + "step": 3923, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0652013, + "auxiliary_loss_mlp": 0.01275781, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01256326, + "epoch": 0.23592364346911168, + "flos": 14251965258240.0, + "grad_norm": 1.780788070615976, + "language_loss": 0.7507394, + "learning_rate": 3.571420177111754e-06, + "loss": 0.82869852, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3924, + "time_per_iteration": 3.9297289848327637 + }, + { + "auxiliary_loss_clip": 0.06516001, + "auxiliary_loss_mlp": 0.01276934, + "balance_loss_clip": 0.06293369, + "balance_loss_mlp": 0.01258039, + "epoch": 0.23598376672177965, + "flos": 18593837400960.0, + "grad_norm": 1.7528516859224217, + "language_loss": 0.83231425, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.91024363, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 3925, + "time_per_iteration": 2.5267770290374756 + }, + { + "auxiliary_loss_clip": 0.06520985, + "auxiliary_loss_mlp": 0.01279855, + "balance_loss_clip": 0.06293195, + "balance_loss_mlp": 0.01259673, + "epoch": 0.2360438899744476, + "flos": 22682325196800.0, + "grad_norm": 1.753261892654821, + "language_loss": 0.60038519, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20178223, + "step": 3926, + "time_per_iteration": 4.023118257522583 + }, + { + "auxiliary_loss_clip": 0.06514051, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06293727, + "balance_loss_mlp": 0.01257735, + "epoch": 0.23610401322711558, + "flos": 29577872298240.0, + "grad_norm": 1.9607796947198142, + "language_loss": 0.72402066, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80192792, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1895752, + "step": 3927, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.06515504, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01256373, + "epoch": 0.23616413647978354, + "flos": 17864347754880.0, + "grad_norm": 2.08357001670468, + "language_loss": 0.75570691, + "learning_rate": 3.570456024454221e-06, + "loss": 0.83361489, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18920898, + "step": 3928, + "time_per_iteration": 2.601884365081787 + }, + { + "auxiliary_loss_clip": 0.06522287, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06293722, + "balance_loss_mlp": 0.01260338, + "epoch": 0.23622425973245154, + "flos": 11039393318400.0, + "grad_norm": 3.3378461006384788, + "language_loss": 0.82518888, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.903216, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20080566, + "step": 3929, + "time_per_iteration": 3.9035136699676514 + }, + { + "auxiliary_loss_clip": 0.0652993, + "auxiliary_loss_mlp": 0.01281554, + "balance_loss_clip": 0.06295136, + "balance_loss_mlp": 0.01261228, + "epoch": 0.2362843829851195, + "flos": 23410766666880.0, + "grad_norm": 2.0127268398029607, + "language_loss": 0.7229315, + "learning_rate": 3.569973590777789e-06, + "loss": 0.80104637, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.203125, + "step": 3930, + "time_per_iteration": 2.5537455081939697 + }, + { + "auxiliary_loss_clip": 0.06516138, + "auxiliary_loss_mlp": 0.01275778, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01257312, + "epoch": 0.23634450623778747, + "flos": 39539103932160.0, + "grad_norm": 1.8975533795335693, + "language_loss": 0.74476141, + "learning_rate": 3.569732284634665e-06, + "loss": 0.82268059, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.18444824, + "step": 3931, + "time_per_iteration": 2.6975677013397217 + }, + { + "auxiliary_loss_clip": 0.06517775, + "auxiliary_loss_mlp": 0.01279269, + "balance_loss_clip": 0.06291172, + "balance_loss_mlp": 0.01260208, + "epoch": 0.23640462949045543, + "flos": 24214077360000.0, + "grad_norm": 2.102820580807434, + "language_loss": 0.8105433, + "learning_rate": 3.569490918967136e-06, + "loss": 0.88851368, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19055176, + "step": 3932, + "time_per_iteration": 2.539280652999878 + }, + { + "auxiliary_loss_clip": 0.06510118, + "auxiliary_loss_mlp": 0.01272436, + "balance_loss_clip": 0.06289183, + "balance_loss_mlp": 0.01254949, + "epoch": 0.2364647527431234, + "flos": 26184898517760.0, + "grad_norm": 1.6370407311570319, + "language_loss": 0.85819322, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.93601882, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.17480469, + "step": 3933, + "time_per_iteration": 4.0140979290008545 + }, + { + "auxiliary_loss_clip": 0.06528582, + "auxiliary_loss_mlp": 0.01277532, + "balance_loss_clip": 0.06296912, + "balance_loss_mlp": 0.01257314, + "epoch": 0.23652487599579136, + "flos": 22643444102400.0, + "grad_norm": 3.233125821654351, + "language_loss": 0.83709848, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.91515964, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.20214844, + "step": 3934, + "time_per_iteration": 2.542692184448242 + }, + { + "auxiliary_loss_clip": 0.06519171, + "auxiliary_loss_mlp": 0.01281493, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01262896, + "epoch": 0.23658499924845935, + "flos": 21768702203520.0, + "grad_norm": 1.7174434370199074, + "language_loss": 0.7898351, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.86784172, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.18615723, + "step": 3935, + "time_per_iteration": 2.5311288833618164 + }, + { + "auxiliary_loss_clip": 0.0651848, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06296465, + "balance_loss_mlp": 0.01258533, + "epoch": 0.23664512250112732, + "flos": 21805486945920.0, + "grad_norm": 1.7511193987533888, + "language_loss": 0.80239666, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.88034987, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1829834, + "step": 3936, + "time_per_iteration": 2.5497477054595947 + }, + { + "auxiliary_loss_clip": 0.06513149, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06288509, + "balance_loss_mlp": 0.01256593, + "epoch": 0.23670524575379528, + "flos": 22644450351360.0, + "grad_norm": 1.4782770271817958, + "language_loss": 0.79820013, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8760916, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19396973, + "step": 3937, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.06515164, + "auxiliary_loss_mlp": 0.0127913, + "balance_loss_clip": 0.06294726, + "balance_loss_mlp": 0.01261487, + "epoch": 0.23676536900646325, + "flos": 16730225942400.0, + "grad_norm": 2.2850190898814686, + "language_loss": 0.85810506, + "learning_rate": 3.568041475462147e-06, + "loss": 0.93604803, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.1763916, + "step": 3938, + "time_per_iteration": 2.568195343017578 + }, + { + "auxiliary_loss_clip": 0.06509314, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06288411, + "balance_loss_mlp": 0.01259393, + "epoch": 0.23682549225913122, + "flos": 11138720734080.0, + "grad_norm": 3.1023600205020876, + "language_loss": 0.94564033, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.02351999, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19287109, + "step": 3939, + "time_per_iteration": 2.4615180492401123 + }, + { + "auxiliary_loss_clip": 0.0652378, + "auxiliary_loss_mlp": 0.01277473, + "balance_loss_clip": 0.06294175, + "balance_loss_mlp": 0.0125803, + "epoch": 0.23688561551179918, + "flos": 22564843372800.0, + "grad_norm": 5.475058210638743, + "language_loss": 0.82803464, + "learning_rate": 3.567557851847088e-06, + "loss": 0.90604717, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19445801, + "step": 3940, + "time_per_iteration": 2.573552131652832 + }, + { + "auxiliary_loss_clip": 0.06531326, + "auxiliary_loss_mlp": 0.01276996, + "balance_loss_clip": 0.06295921, + "balance_loss_mlp": 0.0125679, + "epoch": 0.23694573876446715, + "flos": 18520771040640.0, + "grad_norm": 2.098492916494123, + "language_loss": 0.8946867, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.97276992, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2019043, + "step": 3941, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.06529268, + "auxiliary_loss_mlp": 0.01286958, + "balance_loss_clip": 0.06297106, + "balance_loss_mlp": 0.01267503, + "epoch": 0.23700586201713514, + "flos": 15340246087680.0, + "grad_norm": 1.8886698836060631, + "language_loss": 0.84989077, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.92805308, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19458008, + "step": 3942, + "time_per_iteration": 2.56052827835083 + }, + { + "auxiliary_loss_clip": 0.06538361, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 0.06307331, + "balance_loss_mlp": 0.01265492, + "epoch": 0.2370659852698031, + "flos": 23953775051520.0, + "grad_norm": 2.0845511028002197, + "language_loss": 0.81156456, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.88980681, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20373535, + "step": 3943, + "time_per_iteration": 2.539264678955078 + }, + { + "auxiliary_loss_clip": 0.06543057, + "auxiliary_loss_mlp": 0.01292355, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01271494, + "epoch": 0.23712610852247107, + "flos": 15336514581120.0, + "grad_norm": 2.5863771047568926, + "language_loss": 0.682428, + "learning_rate": 3.566589891386959e-06, + "loss": 0.76078212, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20861816, + "step": 3944, + "time_per_iteration": 2.520453929901123 + }, + { + "auxiliary_loss_clip": 0.06529288, + "auxiliary_loss_mlp": 0.01297026, + "balance_loss_clip": 0.06299931, + "balance_loss_mlp": 0.01276963, + "epoch": 0.23718623177513903, + "flos": 19688658848640.0, + "grad_norm": 1.6926271274644824, + "language_loss": 0.76068223, + "learning_rate": 3.566347752735866e-06, + "loss": 0.83894539, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.20043945, + "step": 3945, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.06535566, + "auxiliary_loss_mlp": 0.01288141, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.0126859, + "epoch": 0.237246355027807, + "flos": 24980351748480.0, + "grad_norm": 1.7408538946114391, + "language_loss": 0.63962567, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.71786278, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19555664, + "step": 3946, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.06535441, + "auxiliary_loss_mlp": 0.01289697, + "balance_loss_clip": 0.06306995, + "balance_loss_mlp": 0.01269324, + "epoch": 0.23730647828047496, + "flos": 15382816761600.0, + "grad_norm": 3.1254224655104252, + "language_loss": 0.77114201, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.84939343, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20385742, + "step": 3947, + "time_per_iteration": 2.495837926864624 + }, + { + "auxiliary_loss_clip": 0.06540522, + "auxiliary_loss_mlp": 0.01290208, + "balance_loss_clip": 0.06311937, + "balance_loss_mlp": 0.01270431, + "epoch": 0.23736660153314296, + "flos": 28158738422400.0, + "grad_norm": 1.595292591120463, + "language_loss": 0.80941439, + "learning_rate": 3.565620980442944e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19775391, + "step": 3948, + "time_per_iteration": 2.6460211277008057 + }, + { + "auxiliary_loss_clip": 0.06542704, + "auxiliary_loss_mlp": 0.01297731, + "balance_loss_clip": 0.06312679, + "balance_loss_mlp": 0.01277025, + "epoch": 0.23742672478581092, + "flos": 22092385726080.0, + "grad_norm": 1.753357741589714, + "language_loss": 0.80419362, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.88259804, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.20715332, + "step": 3949, + "time_per_iteration": 2.5428664684295654 + }, + { + "auxiliary_loss_clip": 0.06549721, + "auxiliary_loss_mlp": 0.01294419, + "balance_loss_clip": 0.06317213, + "balance_loss_mlp": 0.012732, + "epoch": 0.2374868480384789, + "flos": 19543238887680.0, + "grad_norm": 1.6923054699564082, + "language_loss": 0.73375976, + "learning_rate": 3.565136168723163e-06, + "loss": 0.81220114, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2121582, + "step": 3950, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.06527583, + "auxiliary_loss_mlp": 0.01288007, + "balance_loss_clip": 0.06302388, + "balance_loss_mlp": 0.01268957, + "epoch": 0.23754697129114685, + "flos": 19427769561600.0, + "grad_norm": 1.893051910973559, + "language_loss": 0.73254943, + "learning_rate": 3.564893673833495e-06, + "loss": 0.8107053, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.1907959, + "step": 3951, + "time_per_iteration": 2.501091957092285 + }, + { + "auxiliary_loss_clip": 0.06543966, + "auxiliary_loss_mlp": 0.01301622, + "balance_loss_clip": 0.06315006, + "balance_loss_mlp": 0.01280332, + "epoch": 0.23760709454381482, + "flos": 19507208832000.0, + "grad_norm": 1.727887568846887, + "language_loss": 0.7427932, + "learning_rate": 3.564651119602903e-06, + "loss": 0.82124901, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2130127, + "step": 3952, + "time_per_iteration": 2.5467019081115723 + }, + { + "auxiliary_loss_clip": 0.06536686, + "auxiliary_loss_mlp": 0.01292988, + "balance_loss_clip": 0.0630881, + "balance_loss_mlp": 0.01273379, + "epoch": 0.23766721779648278, + "flos": 27644045518080.0, + "grad_norm": 3.105577179216311, + "language_loss": 0.71633041, + "learning_rate": 3.564408506040583e-06, + "loss": 0.79462719, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.19604492, + "step": 3953, + "time_per_iteration": 2.599946975708008 + }, + { + "auxiliary_loss_clip": 0.06537458, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06305911, + "balance_loss_mlp": 0.01272673, + "epoch": 0.23772734104915075, + "flos": 23411102083200.0, + "grad_norm": 6.547469437533346, + "language_loss": 0.82534778, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.90365064, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20166016, + "step": 3954, + "time_per_iteration": 2.595163583755493 + }, + { + "auxiliary_loss_clip": 0.06538694, + "auxiliary_loss_mlp": 0.01291334, + "balance_loss_clip": 0.0630859, + "balance_loss_mlp": 0.01271486, + "epoch": 0.23778746430181874, + "flos": 15710902623360.0, + "grad_norm": 2.2065720754909606, + "language_loss": 0.66202033, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.74032056, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.19848633, + "step": 3955, + "time_per_iteration": 2.5345511436462402 + }, + { + "auxiliary_loss_clip": 0.06527859, + "auxiliary_loss_mlp": 0.01285762, + "balance_loss_clip": 0.06301668, + "balance_loss_mlp": 0.01266081, + "epoch": 0.2378475875544867, + "flos": 19432381536000.0, + "grad_norm": 1.4478942147045952, + "language_loss": 0.84203303, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.92016923, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19689941, + "step": 3956, + "time_per_iteration": 2.5458483695983887 + }, + { + "auxiliary_loss_clip": 0.06526335, + "auxiliary_loss_mlp": 0.01287929, + "balance_loss_clip": 0.06303546, + "balance_loss_mlp": 0.01268438, + "epoch": 0.23790771080715467, + "flos": 22274338867200.0, + "grad_norm": 2.194064451149358, + "language_loss": 0.8561964, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.93433905, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.19494629, + "step": 3957, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.06532466, + "auxiliary_loss_mlp": 0.01283677, + "balance_loss_clip": 0.0630599, + "balance_loss_mlp": 0.01264008, + "epoch": 0.23796783405982264, + "flos": 20053445598720.0, + "grad_norm": 2.4454692262909856, + "language_loss": 0.7073434, + "learning_rate": 3.563194548575151e-06, + "loss": 0.78550482, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19665527, + "step": 3958, + "time_per_iteration": 2.556201219558716 + }, + { + "auxiliary_loss_clip": 0.06533751, + "auxiliary_loss_mlp": 0.01277914, + "balance_loss_clip": 0.06301822, + "balance_loss_mlp": 0.01257303, + "epoch": 0.2380279573124906, + "flos": 14251084790400.0, + "grad_norm": 4.548053192599961, + "language_loss": 0.66760004, + "learning_rate": 3.562951579215745e-06, + "loss": 0.74571669, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.20617676, + "step": 3959, + "time_per_iteration": 2.491999626159668 + }, + { + "auxiliary_loss_clip": 0.06529753, + "auxiliary_loss_mlp": 0.01278003, + "balance_loss_clip": 0.06303047, + "balance_loss_mlp": 0.01259228, + "epoch": 0.23808808056515857, + "flos": 21185638767360.0, + "grad_norm": 1.7806564555446132, + "language_loss": 0.72341377, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.80149138, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18774414, + "step": 3960, + "time_per_iteration": 2.523761034011841 + }, + { + "auxiliary_loss_clip": 0.0652384, + "auxiliary_loss_mlp": 0.0127522, + "balance_loss_clip": 0.06296217, + "balance_loss_mlp": 0.01255169, + "epoch": 0.23814820381782653, + "flos": 22534850810880.0, + "grad_norm": 1.610971251516654, + "language_loss": 0.7476449, + "learning_rate": 3.562465462704307e-06, + "loss": 0.82563543, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20031738, + "step": 3961, + "time_per_iteration": 2.5350120067596436 + }, + { + "auxiliary_loss_clip": 0.06528293, + "auxiliary_loss_mlp": 0.01283237, + "balance_loss_clip": 0.06297825, + "balance_loss_mlp": 0.01261505, + "epoch": 0.23820832707049452, + "flos": 22309991579520.0, + "grad_norm": 2.008938617955162, + "language_loss": 0.66267157, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.74078679, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.21728516, + "step": 3962, + "time_per_iteration": 2.554936170578003 + }, + { + "auxiliary_loss_clip": 0.06522447, + "auxiliary_loss_mlp": 0.01279056, + "balance_loss_clip": 0.0629696, + "balance_loss_mlp": 0.0126009, + "epoch": 0.2382684503231625, + "flos": 24871297259520.0, + "grad_norm": 1.868964177707197, + "language_loss": 0.75134146, + "learning_rate": 3.561979109197483e-06, + "loss": 0.82935649, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18969727, + "step": 3963, + "time_per_iteration": 3.9841935634613037 + }, + { + "auxiliary_loss_clip": 0.0652955, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.01257428, + "epoch": 0.23832857357583045, + "flos": 21878050181760.0, + "grad_norm": 2.083636930734351, + "language_loss": 0.77508426, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.85316432, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.21032715, + "step": 3964, + "time_per_iteration": 2.546093463897705 + }, + { + "auxiliary_loss_clip": 0.06513681, + "auxiliary_loss_mlp": 0.01275741, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01256275, + "epoch": 0.23838869682849842, + "flos": 21294441694080.0, + "grad_norm": 2.0070777911568207, + "language_loss": 0.72507781, + "learning_rate": 3.561492518769045e-06, + "loss": 0.80297208, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3965, + "time_per_iteration": 2.605717182159424 + }, + { + "auxiliary_loss_clip": 0.06518564, + "auxiliary_loss_mlp": 0.012776, + "balance_loss_clip": 0.06293208, + "balance_loss_mlp": 0.01258181, + "epoch": 0.23844882008116638, + "flos": 16186211308800.0, + "grad_norm": 2.069567415104782, + "language_loss": 0.79030257, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8682642, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.19396973, + "step": 3966, + "time_per_iteration": 3.980722427368164 + }, + { + "auxiliary_loss_clip": 0.06517511, + "auxiliary_loss_mlp": 0.01283232, + "balance_loss_clip": 0.06290257, + "balance_loss_mlp": 0.01264647, + "epoch": 0.23850894333383435, + "flos": 21076165008000.0, + "grad_norm": 3.0015774693629433, + "language_loss": 0.69417417, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77218163, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.18579102, + "step": 3967, + "time_per_iteration": 2.542595386505127 + }, + { + "auxiliary_loss_clip": 0.06523537, + "auxiliary_loss_mlp": 0.01278611, + "balance_loss_clip": 0.0629587, + "balance_loss_mlp": 0.01257821, + "epoch": 0.23856906658650234, + "flos": 17207295563520.0, + "grad_norm": 1.9959497275253817, + "language_loss": 0.68410718, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.76212859, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.20800781, + "step": 3968, + "time_per_iteration": 2.5275728702545166 + }, + { + "auxiliary_loss_clip": 0.06526159, + "auxiliary_loss_mlp": 0.01279655, + "balance_loss_clip": 0.0629804, + "balance_loss_mlp": 0.01261392, + "epoch": 0.2386291898391703, + "flos": 29501451774720.0, + "grad_norm": 2.0078802263631994, + "language_loss": 0.77147222, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.84953034, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.18261719, + "step": 3969, + "time_per_iteration": 4.006864547729492 + }, + { + "auxiliary_loss_clip": 0.06514208, + "auxiliary_loss_mlp": 0.01292793, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01274602, + "epoch": 0.23868931309183827, + "flos": 21148854024960.0, + "grad_norm": 1.9717404660495825, + "language_loss": 0.76892555, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.84699559, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.18188477, + "step": 3970, + "time_per_iteration": 2.558915615081787 + }, + { + "auxiliary_loss_clip": 0.06523073, + "auxiliary_loss_mlp": 0.0128602, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01265969, + "epoch": 0.23874943634450624, + "flos": 25665342076800.0, + "grad_norm": 2.212795121423013, + "language_loss": 0.85452002, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.93261099, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20043945, + "step": 3971, + "time_per_iteration": 2.5621652603149414 + }, + { + "auxiliary_loss_clip": 0.06391954, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06279661, + "balance_loss_mlp": 0.01251122, + "epoch": 0.2388095595971742, + "flos": 59006871889920.0, + "grad_norm": 0.7183517633018239, + "language_loss": 0.62744105, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.70391893, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04696655, + "step": 3972, + "time_per_iteration": 4.643376350402832 + }, + { + "auxiliary_loss_clip": 0.06515118, + "auxiliary_loss_mlp": 0.01277926, + "balance_loss_clip": 0.06290536, + "balance_loss_mlp": 0.01258399, + "epoch": 0.23886968284984217, + "flos": 16805975633280.0, + "grad_norm": 3.0192177240020976, + "language_loss": 0.81866533, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.89659578, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19543457, + "step": 3973, + "time_per_iteration": 2.5597283840179443 + }, + { + "auxiliary_loss_clip": 0.06517763, + "auxiliary_loss_mlp": 0.01283675, + "balance_loss_clip": 0.06291795, + "balance_loss_mlp": 0.01265162, + "epoch": 0.23892980610251013, + "flos": 22389221214720.0, + "grad_norm": 1.829209898292947, + "language_loss": 0.79696077, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8749752, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.18518066, + "step": 3974, + "time_per_iteration": 2.5331227779388428 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01291591, + "balance_loss_clip": 0.06296244, + "balance_loss_mlp": 0.01272279, + "epoch": 0.23898992935517813, + "flos": 12828135553920.0, + "grad_norm": 6.773745042238101, + "language_loss": 0.85156423, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.92972875, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19311523, + "step": 3975, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.06513388, + "auxiliary_loss_mlp": 0.01278416, + "balance_loss_clip": 0.06290747, + "balance_loss_mlp": 0.01260117, + "epoch": 0.2390500526078461, + "flos": 22352142983040.0, + "grad_norm": 3.375355565005516, + "language_loss": 0.84191501, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.91983294, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1829834, + "step": 3976, + "time_per_iteration": 2.5339527130126953 + }, + { + "auxiliary_loss_clip": 0.06511909, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06290296, + "balance_loss_mlp": 0.01264111, + "epoch": 0.23911017586051406, + "flos": 22641263896320.0, + "grad_norm": 3.0704844059493497, + "language_loss": 0.74960983, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.82755029, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18029785, + "step": 3977, + "time_per_iteration": 2.5528597831726074 + }, + { + "auxiliary_loss_clip": 0.06524444, + "auxiliary_loss_mlp": 0.01281803, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01261478, + "epoch": 0.23917029911318202, + "flos": 23658993987840.0, + "grad_norm": 3.246082679368102, + "language_loss": 0.7235828, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80164528, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.203125, + "step": 3978, + "time_per_iteration": 2.548459768295288 + }, + { + "auxiliary_loss_clip": 0.06536747, + "auxiliary_loss_mlp": 0.01279264, + "balance_loss_clip": 0.06306014, + "balance_loss_mlp": 0.0125994, + "epoch": 0.23923042236585, + "flos": 22790163801600.0, + "grad_norm": 2.3394422136849875, + "language_loss": 0.79264927, + "learning_rate": 3.558079758168997e-06, + "loss": 0.87080932, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.1932373, + "step": 3979, + "time_per_iteration": 2.5696120262145996 + }, + { + "auxiliary_loss_clip": 0.06521225, + "auxiliary_loss_mlp": 0.01282521, + "balance_loss_clip": 0.06295727, + "balance_loss_mlp": 0.01263185, + "epoch": 0.23929054561851795, + "flos": 28155300405120.0, + "grad_norm": 1.7900268576070866, + "language_loss": 0.81971824, + "learning_rate": 3.557835546134977e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.1932373, + "step": 3980, + "time_per_iteration": 2.587286949157715 + }, + { + "auxiliary_loss_clip": 0.06519361, + "auxiliary_loss_mlp": 0.01281001, + "balance_loss_clip": 0.06296664, + "balance_loss_mlp": 0.01261891, + "epoch": 0.23935066887118592, + "flos": 21692491315200.0, + "grad_norm": 1.7930077111492302, + "language_loss": 0.84270984, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92071348, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19091797, + "step": 3981, + "time_per_iteration": 2.550725221633911 + }, + { + "auxiliary_loss_clip": 0.06535558, + "auxiliary_loss_mlp": 0.01280601, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01260669, + "epoch": 0.2394107921238539, + "flos": 32130121737600.0, + "grad_norm": 2.0248039039910393, + "language_loss": 0.77712274, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.85528433, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.19934082, + "step": 3982, + "time_per_iteration": 2.594698667526245 + }, + { + "auxiliary_loss_clip": 0.06530322, + "auxiliary_loss_mlp": 0.01280321, + "balance_loss_clip": 0.06304529, + "balance_loss_mlp": 0.01261307, + "epoch": 0.23947091537652188, + "flos": 17024839297920.0, + "grad_norm": 1.9623565914246572, + "language_loss": 0.7809152, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.85902166, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19006348, + "step": 3983, + "time_per_iteration": 2.537132740020752 + }, + { + "auxiliary_loss_clip": 0.06527262, + "auxiliary_loss_mlp": 0.01280803, + "balance_loss_clip": 0.0630171, + "balance_loss_mlp": 0.01261956, + "epoch": 0.23953103862918984, + "flos": 20599640438400.0, + "grad_norm": 2.137172968887566, + "language_loss": 0.73945713, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81753772, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18835449, + "step": 3984, + "time_per_iteration": 2.538221836090088 + }, + { + "auxiliary_loss_clip": 0.06531888, + "auxiliary_loss_mlp": 0.01281613, + "balance_loss_clip": 0.06302323, + "balance_loss_mlp": 0.01262587, + "epoch": 0.2395911618818578, + "flos": 20710707425280.0, + "grad_norm": 1.9765684717262704, + "language_loss": 0.7965889, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.87472391, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19030762, + "step": 3985, + "time_per_iteration": 2.551649570465088 + }, + { + "auxiliary_loss_clip": 0.06532246, + "auxiliary_loss_mlp": 0.0127953, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01259265, + "epoch": 0.23965128513452577, + "flos": 27060982081920.0, + "grad_norm": 1.916737509209056, + "language_loss": 0.73610401, + "learning_rate": 3.556369033716254e-06, + "loss": 0.8142218, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20263672, + "step": 3986, + "time_per_iteration": 2.710397481918335 + }, + { + "auxiliary_loss_clip": 0.06540911, + "auxiliary_loss_mlp": 0.01281338, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.01261, + "epoch": 0.23971140838719374, + "flos": 23150254723200.0, + "grad_norm": 1.785192597796332, + "language_loss": 0.88325328, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96147585, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20336914, + "step": 3987, + "time_per_iteration": 2.6331911087036133 + }, + { + "auxiliary_loss_clip": 0.06529854, + "auxiliary_loss_mlp": 0.01278502, + "balance_loss_clip": 0.06312454, + "balance_loss_mlp": 0.0126043, + "epoch": 0.23977153163986173, + "flos": 18039341007360.0, + "grad_norm": 2.2552133940915224, + "language_loss": 0.84056735, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.91865093, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18078613, + "step": 3988, + "time_per_iteration": 2.5413994789123535 + }, + { + "auxiliary_loss_clip": 0.06533512, + "auxiliary_loss_mlp": 0.01288032, + "balance_loss_clip": 0.06306052, + "balance_loss_mlp": 0.01267052, + "epoch": 0.2398316548925297, + "flos": 18119157621120.0, + "grad_norm": 1.6232739060807335, + "language_loss": 0.85473406, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.93294942, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2097168, + "step": 3989, + "time_per_iteration": 2.528348207473755 + }, + { + "auxiliary_loss_clip": 0.06527147, + "auxiliary_loss_mlp": 0.01286562, + "balance_loss_clip": 0.06305796, + "balance_loss_mlp": 0.01266642, + "epoch": 0.23989177814519766, + "flos": 12572612928000.0, + "grad_norm": 2.695913709141839, + "language_loss": 0.8517406, + "learning_rate": 3.555390178293477e-06, + "loss": 0.92987764, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19909668, + "step": 3990, + "time_per_iteration": 2.52915358543396 + }, + { + "auxiliary_loss_clip": 0.06527729, + "auxiliary_loss_mlp": 0.01283435, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.01264064, + "epoch": 0.23995190139786562, + "flos": 25271569013760.0, + "grad_norm": 1.4267230320219149, + "language_loss": 0.76345301, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.84156466, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.19372559, + "step": 3991, + "time_per_iteration": 2.556820869445801 + }, + { + "auxiliary_loss_clip": 0.06413993, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06298733, + "balance_loss_mlp": 0.01262789, + "epoch": 0.2400120246505336, + "flos": 61978107271680.0, + "grad_norm": 0.8724678757997124, + "language_loss": 0.6358996, + "learning_rate": 3.554900396661656e-06, + "loss": 0.71272099, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.05368042, + "step": 3992, + "time_per_iteration": 3.0817418098449707 + }, + { + "auxiliary_loss_clip": 0.06411353, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 0.06297012, + "balance_loss_mlp": 0.01259121, + "epoch": 0.24007214790320155, + "flos": 66727923816960.0, + "grad_norm": 0.7394753945990321, + "language_loss": 0.62864375, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.70539963, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.05117798, + "step": 3993, + "time_per_iteration": 3.2552971839904785 + }, + { + "auxiliary_loss_clip": 0.0652933, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 0.062997, + "balance_loss_mlp": 0.0125886, + "epoch": 0.24013227115586952, + "flos": 25815667501440.0, + "grad_norm": 1.8775036450716396, + "language_loss": 0.77610862, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.85420227, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.21154785, + "step": 3994, + "time_per_iteration": 2.6225738525390625 + }, + { + "auxiliary_loss_clip": 0.06526788, + "auxiliary_loss_mlp": 0.01288387, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.01266822, + "epoch": 0.2401923944085375, + "flos": 25564672995840.0, + "grad_norm": 1.626402048760673, + "language_loss": 0.78733414, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.86548591, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21557617, + "step": 3995, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.06395802, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01260685, + "epoch": 0.24025251766120548, + "flos": 54961457892480.0, + "grad_norm": 0.8928130340410044, + "language_loss": 0.63566971, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.71228325, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.04858398, + "step": 3996, + "time_per_iteration": 3.232227087020874 + }, + { + "auxiliary_loss_clip": 0.06522241, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06290409, + "balance_loss_mlp": 0.0126328, + "epoch": 0.24031264091387344, + "flos": 20637305648640.0, + "grad_norm": 2.8724335092069864, + "language_loss": 0.71121502, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.78926873, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19848633, + "step": 3997, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.06510898, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 0.06285729, + "balance_loss_mlp": 0.01265473, + "epoch": 0.2403727641665414, + "flos": 20892492858240.0, + "grad_norm": 1.7909711234465908, + "language_loss": 0.87516266, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.9531287, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20227051, + "step": 3998, + "time_per_iteration": 2.563215970993042 + }, + { + "auxiliary_loss_clip": 0.06526193, + "auxiliary_loss_mlp": 0.01279159, + "balance_loss_clip": 0.06292593, + "balance_loss_mlp": 0.01258762, + "epoch": 0.24043288741920937, + "flos": 22826613127680.0, + "grad_norm": 1.593528116777893, + "language_loss": 0.76414531, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.84219879, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.20410156, + "step": 3999, + "time_per_iteration": 2.5577592849731445 + }, + { + "auxiliary_loss_clip": 0.06511137, + "auxiliary_loss_mlp": 0.01275527, + "balance_loss_clip": 0.0628795, + "balance_loss_mlp": 0.01256716, + "epoch": 0.24049301067187734, + "flos": 27966261594240.0, + "grad_norm": 2.3407253335254086, + "language_loss": 0.73292184, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81078851, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.18823242, + "step": 4000, + "time_per_iteration": 2.583524703979492 + }, + { + "auxiliary_loss_clip": 0.06528921, + "auxiliary_loss_mlp": 0.01283655, + "balance_loss_clip": 0.06293923, + "balance_loss_mlp": 0.01261935, + "epoch": 0.24055313392454533, + "flos": 27458360870400.0, + "grad_norm": 2.671051655318694, + "language_loss": 0.67159665, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74972242, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21728516, + "step": 4001, + "time_per_iteration": 2.6188552379608154 + }, + { + "auxiliary_loss_clip": 0.06522354, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.06293849, + "balance_loss_mlp": 0.01257703, + "epoch": 0.2406132571772133, + "flos": 25563666746880.0, + "grad_norm": 5.034242823707272, + "language_loss": 0.83152658, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.90954471, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21740723, + "step": 4002, + "time_per_iteration": 3.9769785404205322 + }, + { + "auxiliary_loss_clip": 0.06519094, + "auxiliary_loss_mlp": 0.01282536, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01262032, + "epoch": 0.24067338042988126, + "flos": 24798482461440.0, + "grad_norm": 2.0463487498067323, + "language_loss": 0.83599687, + "learning_rate": 3.552202383898897e-06, + "loss": 0.91401321, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20483398, + "step": 4003, + "time_per_iteration": 2.581669569015503 + }, + { + "auxiliary_loss_clip": 0.06526292, + "auxiliary_loss_mlp": 0.01281725, + "balance_loss_clip": 0.06295015, + "balance_loss_mlp": 0.01261412, + "epoch": 0.24073350368254923, + "flos": 21184171320960.0, + "grad_norm": 2.0670244348036646, + "language_loss": 0.87907362, + "learning_rate": 3.551956756667215e-06, + "loss": 0.9571538, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20300293, + "step": 4004, + "time_per_iteration": 2.514268636703491 + }, + { + "auxiliary_loss_clip": 0.06526911, + "auxiliary_loss_mlp": 0.01282868, + "balance_loss_clip": 0.06294513, + "balance_loss_mlp": 0.01261815, + "epoch": 0.2407936269352172, + "flos": 22501252523520.0, + "grad_norm": 3.538522770409821, + "language_loss": 0.78168321, + "learning_rate": 3.551711070585177e-06, + "loss": 0.85978097, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21057129, + "step": 4005, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.0651572, + "auxiliary_loss_mlp": 0.01283457, + "balance_loss_clip": 0.06293365, + "balance_loss_mlp": 0.01263084, + "epoch": 0.24085375018788516, + "flos": 18556968804480.0, + "grad_norm": 2.371719422478697, + "language_loss": 0.79360878, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.87160051, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.20373535, + "step": 4006, + "time_per_iteration": 4.034858465194702 + }, + { + "auxiliary_loss_clip": 0.0653493, + "auxiliary_loss_mlp": 0.01283621, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01260709, + "epoch": 0.24091387344055312, + "flos": 24177418398720.0, + "grad_norm": 1.8737477168573817, + "language_loss": 0.71813238, + "learning_rate": 3.551219521907302e-06, + "loss": 0.79631788, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22912598, + "step": 4007, + "time_per_iteration": 2.5730202198028564 + }, + { + "auxiliary_loss_clip": 0.06518448, + "auxiliary_loss_mlp": 0.01300708, + "balance_loss_clip": 0.06295364, + "balance_loss_mlp": 0.01278773, + "epoch": 0.24097399669322112, + "flos": 11041112327040.0, + "grad_norm": 6.473369852788927, + "language_loss": 0.76978099, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84797251, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21936035, + "step": 4008, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.06518552, + "auxiliary_loss_mlp": 0.01286303, + "balance_loss_clip": 0.062894, + "balance_loss_mlp": 0.01264928, + "epoch": 0.24103411994588908, + "flos": 17170762383360.0, + "grad_norm": 2.1979472110907556, + "language_loss": 0.75080305, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.82885164, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21398926, + "step": 4009, + "time_per_iteration": 3.957920551300049 + }, + { + "auxiliary_loss_clip": 0.06521554, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 0.06293823, + "balance_loss_mlp": 0.01279869, + "epoch": 0.24109424319855705, + "flos": 20674258099200.0, + "grad_norm": 1.5898496231384156, + "language_loss": 0.80111217, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8793391, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21264648, + "step": 4010, + "time_per_iteration": 2.5475916862487793 + }, + { + "auxiliary_loss_clip": 0.06527252, + "auxiliary_loss_mlp": 0.01291864, + "balance_loss_clip": 0.06297424, + "balance_loss_mlp": 0.01268964, + "epoch": 0.241154366451225, + "flos": 28188982546560.0, + "grad_norm": 2.0856120841249366, + "language_loss": 0.70933908, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78753024, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.22912598, + "step": 4011, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.06528456, + "auxiliary_loss_mlp": 0.0128714, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01265766, + "epoch": 0.24121448970389298, + "flos": 21696222821760.0, + "grad_norm": 1.7418824634594252, + "language_loss": 0.694484, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.21362305, + "step": 4012, + "time_per_iteration": 3.988281726837158 + }, + { + "auxiliary_loss_clip": 0.06528036, + "auxiliary_loss_mlp": 0.01287792, + "balance_loss_clip": 0.06296879, + "balance_loss_mlp": 0.01264391, + "epoch": 0.24127461295656094, + "flos": 39685530142080.0, + "grad_norm": 1.5971840931497265, + "language_loss": 0.74512959, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23388672, + "step": 4013, + "time_per_iteration": 2.7159719467163086 + }, + { + "auxiliary_loss_clip": 0.06531674, + "auxiliary_loss_mlp": 0.01283711, + "balance_loss_clip": 0.0630402, + "balance_loss_mlp": 0.01263231, + "epoch": 0.2413347362092289, + "flos": 19141960884480.0, + "grad_norm": 1.667652232266074, + "language_loss": 0.89031768, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.96847153, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20483398, + "step": 4014, + "time_per_iteration": 2.5638303756713867 + }, + { + "auxiliary_loss_clip": 0.06538786, + "auxiliary_loss_mlp": 0.01289681, + "balance_loss_clip": 0.06304225, + "balance_loss_mlp": 0.01268831, + "epoch": 0.2413948594618969, + "flos": 26946099734400.0, + "grad_norm": 1.9521080560444544, + "language_loss": 0.95043075, + "learning_rate": 3.549250975045952e-06, + "loss": 1.02871537, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20849609, + "step": 4015, + "time_per_iteration": 2.5697052478790283 + }, + { + "auxiliary_loss_clip": 0.0653477, + "auxiliary_loss_mlp": 0.01278309, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01257781, + "epoch": 0.24145498271456486, + "flos": 25235077760640.0, + "grad_norm": 1.8045004389175856, + "language_loss": 0.83243644, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.91056728, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2052002, + "step": 4016, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.06532364, + "auxiliary_loss_mlp": 0.01285254, + "balance_loss_clip": 0.06311545, + "balance_loss_mlp": 0.0126463, + "epoch": 0.24151510596723283, + "flos": 40671339027840.0, + "grad_norm": 2.079467312298135, + "language_loss": 0.69439638, + "learning_rate": 3.54875825066639e-06, + "loss": 0.77257252, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20617676, + "step": 4017, + "time_per_iteration": 2.6893186569213867 + }, + { + "auxiliary_loss_clip": 0.06536807, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06305309, + "balance_loss_mlp": 0.01266286, + "epoch": 0.2415752292199008, + "flos": 18151917367680.0, + "grad_norm": 1.6840714927615923, + "language_loss": 0.84970623, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.92796361, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2265625, + "step": 4018, + "time_per_iteration": 2.521129608154297 + }, + { + "auxiliary_loss_clip": 0.06448493, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 0.06334345, + "balance_loss_mlp": 0.01253335, + "epoch": 0.24163535247256876, + "flos": 67307213819520.0, + "grad_norm": 1.2396896293086193, + "language_loss": 0.6054306, + "learning_rate": 3.548265291370558e-06, + "loss": 0.68249303, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04425049, + "step": 4019, + "time_per_iteration": 3.2191333770751953 + }, + { + "auxiliary_loss_clip": 0.06539527, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06310145, + "balance_loss_mlp": 0.01253983, + "epoch": 0.24169547572523672, + "flos": 24935810503680.0, + "grad_norm": 1.839335570686334, + "language_loss": 0.73635018, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81447685, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19140625, + "step": 4020, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.06547633, + "auxiliary_loss_mlp": 0.01279706, + "balance_loss_clip": 0.06321433, + "balance_loss_mlp": 0.01259094, + "epoch": 0.24175559897790472, + "flos": 18733303722240.0, + "grad_norm": 1.757855043925666, + "language_loss": 0.81927264, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.89754599, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.20617676, + "step": 4021, + "time_per_iteration": 2.516295909881592 + }, + { + "auxiliary_loss_clip": 0.06542306, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 0.06314138, + "balance_loss_mlp": 0.01255201, + "epoch": 0.24181572223057268, + "flos": 23045937989760.0, + "grad_norm": 1.9677245364232816, + "language_loss": 0.76831293, + "learning_rate": 3.547525412122378e-06, + "loss": 0.84652191, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23388672, + "step": 4022, + "time_per_iteration": 2.560833692550659 + }, + { + "auxiliary_loss_clip": 0.0655847, + "auxiliary_loss_mlp": 0.01279281, + "balance_loss_clip": 0.06321847, + "balance_loss_mlp": 0.01257477, + "epoch": 0.24187584548324065, + "flos": 20382411928320.0, + "grad_norm": 1.7589452517035808, + "language_loss": 0.75334597, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.83172357, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21789551, + "step": 4023, + "time_per_iteration": 2.5414137840270996 + }, + { + "auxiliary_loss_clip": 0.06554291, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06325305, + "balance_loss_mlp": 0.01258466, + "epoch": 0.2419359687359086, + "flos": 21403915453440.0, + "grad_norm": 1.837159559636974, + "language_loss": 0.82581335, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.90414816, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20751953, + "step": 4024, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.06544912, + "auxiliary_loss_mlp": 0.01281053, + "balance_loss_clip": 0.06319256, + "balance_loss_mlp": 0.01260394, + "epoch": 0.24199609198857658, + "flos": 18375309152640.0, + "grad_norm": 1.8763334718563411, + "language_loss": 0.86724782, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.94550753, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20654297, + "step": 4025, + "time_per_iteration": 2.507725715637207 + }, + { + "auxiliary_loss_clip": 0.0654591, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06312732, + "balance_loss_mlp": 0.01261905, + "epoch": 0.24205621524124454, + "flos": 19469962892160.0, + "grad_norm": 2.105058685916829, + "language_loss": 0.72386706, + "learning_rate": 3.546538084949365e-06, + "loss": 0.80214572, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.20068359, + "step": 4026, + "time_per_iteration": 2.573822498321533 + }, + { + "auxiliary_loss_clip": 0.06536272, + "auxiliary_loss_mlp": 0.01278576, + "balance_loss_clip": 0.06314979, + "balance_loss_mlp": 0.01258191, + "epoch": 0.2421163384939125, + "flos": 14981706466560.0, + "grad_norm": 5.331027510747572, + "language_loss": 0.64474452, + "learning_rate": 3.546291106520509e-06, + "loss": 0.722893, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20397949, + "step": 4027, + "time_per_iteration": 2.5038652420043945 + }, + { + "auxiliary_loss_clip": 0.06553975, + "auxiliary_loss_mlp": 0.01291382, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01271069, + "epoch": 0.2421764617465805, + "flos": 18668161572480.0, + "grad_norm": 2.149571528027882, + "language_loss": 0.70816404, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.78661758, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.203125, + "step": 4028, + "time_per_iteration": 2.5707366466522217 + }, + { + "auxiliary_loss_clip": 0.06448589, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 0.06335288, + "balance_loss_mlp": 0.01254865, + "epoch": 0.24223658499924847, + "flos": 64368025424640.0, + "grad_norm": 0.8397041896242922, + "language_loss": 0.55315495, + "learning_rate": 3.545796973765623e-06, + "loss": 0.63025129, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.06170654, + "step": 4029, + "time_per_iteration": 3.149601936340332 + }, + { + "auxiliary_loss_clip": 0.06557409, + "auxiliary_loss_mlp": 0.01307587, + "balance_loss_clip": 0.06331506, + "balance_loss_mlp": 0.01284615, + "epoch": 0.24229670825191643, + "flos": 25782278849280.0, + "grad_norm": 2.2612571716693664, + "language_loss": 0.75111073, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82976073, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.22998047, + "step": 4030, + "time_per_iteration": 2.5939297676086426 + }, + { + "auxiliary_loss_clip": 0.0654521, + "auxiliary_loss_mlp": 0.0130894, + "balance_loss_clip": 0.06321512, + "balance_loss_mlp": 0.01287733, + "epoch": 0.2423568315045844, + "flos": 20673251850240.0, + "grad_norm": 1.8607136485921192, + "language_loss": 0.77126729, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.84980875, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2121582, + "step": 4031, + "time_per_iteration": 2.5886638164520264 + }, + { + "auxiliary_loss_clip": 0.06556953, + "auxiliary_loss_mlp": 0.01312472, + "balance_loss_clip": 0.06323709, + "balance_loss_mlp": 0.01290252, + "epoch": 0.24241695475725236, + "flos": 22422987210240.0, + "grad_norm": 1.956173023936914, + "language_loss": 0.66108859, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.22216797, + "step": 4032, + "time_per_iteration": 2.5665037631988525 + }, + { + "auxiliary_loss_clip": 0.06539695, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 0.06316876, + "balance_loss_mlp": 0.0128751, + "epoch": 0.24247707800992033, + "flos": 17134732327680.0, + "grad_norm": 3.4494454498841725, + "language_loss": 0.81464761, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.89313877, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21911621, + "step": 4033, + "time_per_iteration": 2.5237317085266113 + }, + { + "auxiliary_loss_clip": 0.06538171, + "auxiliary_loss_mlp": 0.01328283, + "balance_loss_clip": 0.06318024, + "balance_loss_mlp": 0.01305359, + "epoch": 0.2425372012625883, + "flos": 31621885597440.0, + "grad_norm": 1.909836856098088, + "language_loss": 0.69935066, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7780152, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22900391, + "step": 4034, + "time_per_iteration": 2.713991641998291 + }, + { + "auxiliary_loss_clip": 0.06546839, + "auxiliary_loss_mlp": 0.01319063, + "balance_loss_clip": 0.06324256, + "balance_loss_mlp": 0.01298273, + "epoch": 0.24259732451525629, + "flos": 16331589342720.0, + "grad_norm": 2.1729941621503532, + "language_loss": 0.96340013, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04205918, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.20776367, + "step": 4035, + "time_per_iteration": 2.532848596572876 + }, + { + "auxiliary_loss_clip": 0.06537193, + "auxiliary_loss_mlp": 0.01327475, + "balance_loss_clip": 0.06319901, + "balance_loss_mlp": 0.01307447, + "epoch": 0.24265744776792425, + "flos": 22863230161920.0, + "grad_norm": 1.6992215283488847, + "language_loss": 0.78653824, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20019531, + "step": 4036, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.06539825, + "auxiliary_loss_mlp": 0.01304693, + "balance_loss_clip": 0.06315397, + "balance_loss_mlp": 0.01282806, + "epoch": 0.24271757102059222, + "flos": 21878008254720.0, + "grad_norm": 1.624872867937933, + "language_loss": 0.74970233, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82814753, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.21887207, + "step": 4037, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06526245, + "auxiliary_loss_mlp": 0.01308805, + "balance_loss_clip": 0.06302498, + "balance_loss_mlp": 0.01287539, + "epoch": 0.24277769427326018, + "flos": 19214649901440.0, + "grad_norm": 4.15075765155633, + "language_loss": 0.76952362, + "learning_rate": 3.543570475921171e-06, + "loss": 0.84787416, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.21276855, + "step": 4038, + "time_per_iteration": 2.514899492263794 + }, + { + "auxiliary_loss_clip": 0.06539176, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 0.06314565, + "balance_loss_mlp": 0.01272992, + "epoch": 0.24283781752592815, + "flos": 19505909093760.0, + "grad_norm": 2.116114626089979, + "language_loss": 0.72802031, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.22167969, + "step": 4039, + "time_per_iteration": 2.603787422180176 + }, + { + "auxiliary_loss_clip": 0.06537706, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 0.06312682, + "balance_loss_mlp": 0.01269372, + "epoch": 0.2428979407785961, + "flos": 19908444908160.0, + "grad_norm": 1.7691638050154863, + "language_loss": 0.78818536, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86647218, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.21606445, + "step": 4040, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.06530759, + "auxiliary_loss_mlp": 0.01283615, + "balance_loss_clip": 0.06313588, + "balance_loss_mlp": 0.01265162, + "epoch": 0.2429580640312641, + "flos": 24722523135360.0, + "grad_norm": 1.6907745152184719, + "language_loss": 0.81039703, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8885408, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18444824, + "step": 4041, + "time_per_iteration": 2.5693795680999756 + }, + { + "auxiliary_loss_clip": 0.06532191, + "auxiliary_loss_mlp": 0.01286793, + "balance_loss_clip": 0.06311769, + "balance_loss_mlp": 0.01267529, + "epoch": 0.24301818728393207, + "flos": 25637529720960.0, + "grad_norm": 3.2457124561568, + "language_loss": 0.77433085, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8525207, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19262695, + "step": 4042, + "time_per_iteration": 3.9626972675323486 + }, + { + "auxiliary_loss_clip": 0.0653407, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06316316, + "balance_loss_mlp": 0.01260652, + "epoch": 0.24307831053660003, + "flos": 26148700753920.0, + "grad_norm": 1.8532279658121147, + "language_loss": 0.82188201, + "learning_rate": 3.542331483604246e-06, + "loss": 0.90002131, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19226074, + "step": 4043, + "time_per_iteration": 2.598202705383301 + }, + { + "auxiliary_loss_clip": 0.06538229, + "auxiliary_loss_mlp": 0.0127841, + "balance_loss_clip": 0.06309159, + "balance_loss_mlp": 0.01256594, + "epoch": 0.243138433789268, + "flos": 14977136419200.0, + "grad_norm": 2.775508644952731, + "language_loss": 0.73897892, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.81714529, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21801758, + "step": 4044, + "time_per_iteration": 2.483752489089966 + }, + { + "auxiliary_loss_clip": 0.06534028, + "auxiliary_loss_mlp": 0.01284645, + "balance_loss_clip": 0.0631184, + "balance_loss_mlp": 0.01263629, + "epoch": 0.24319855704193596, + "flos": 25198670361600.0, + "grad_norm": 2.3685654829247227, + "language_loss": 0.83778739, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.91597402, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.21020508, + "step": 4045, + "time_per_iteration": 2.60435152053833 + }, + { + "auxiliary_loss_clip": 0.06529962, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 0.06307946, + "balance_loss_mlp": 0.01260323, + "epoch": 0.24325868029460393, + "flos": 22133740515840.0, + "grad_norm": 1.834350653864789, + "language_loss": 0.87040859, + "learning_rate": 3.541587386314541e-06, + "loss": 0.94850671, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19519043, + "step": 4046, + "time_per_iteration": 3.990011692047119 + }, + { + "auxiliary_loss_clip": 0.0652798, + "auxiliary_loss_mlp": 0.01281438, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01260922, + "epoch": 0.2433188035472719, + "flos": 23588107833600.0, + "grad_norm": 2.274532821816236, + "language_loss": 0.72945291, + "learning_rate": 3.5413392369578e-06, + "loss": 0.80754709, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.20495605, + "step": 4047, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.06530058, + "auxiliary_loss_mlp": 0.01284969, + "balance_loss_clip": 0.06306041, + "balance_loss_mlp": 0.01263666, + "epoch": 0.2433789267999399, + "flos": 24469809621120.0, + "grad_norm": 3.993347012147321, + "language_loss": 0.74453223, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.21325684, + "step": 4048, + "time_per_iteration": 4.027734279632568 + }, + { + "auxiliary_loss_clip": 0.06529407, + "auxiliary_loss_mlp": 0.01275879, + "balance_loss_clip": 0.06309648, + "balance_loss_mlp": 0.0125671, + "epoch": 0.24343905005260785, + "flos": 16733622032640.0, + "grad_norm": 2.185429514920852, + "language_loss": 0.73832756, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.81638038, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19165039, + "step": 4049, + "time_per_iteration": 2.5527403354644775 + }, + { + "auxiliary_loss_clip": 0.06525055, + "auxiliary_loss_mlp": 0.01275563, + "balance_loss_clip": 0.06306046, + "balance_loss_mlp": 0.01256084, + "epoch": 0.24349917330527582, + "flos": 20049294821760.0, + "grad_norm": 1.6558681415401064, + "language_loss": 0.74824917, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82625538, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19482422, + "step": 4050, + "time_per_iteration": 2.517671585083008 + }, + { + "auxiliary_loss_clip": 0.06520879, + "auxiliary_loss_mlp": 0.0127856, + "balance_loss_clip": 0.06303313, + "balance_loss_mlp": 0.01258187, + "epoch": 0.24355929655794378, + "flos": 17426285009280.0, + "grad_norm": 2.447710360159803, + "language_loss": 0.75780261, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.83579695, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20361328, + "step": 4051, + "time_per_iteration": 3.961841583251953 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01277824, + "balance_loss_clip": 0.06310124, + "balance_loss_mlp": 0.01257343, + "epoch": 0.24361941981061175, + "flos": 25417995223680.0, + "grad_norm": 2.289221862828171, + "language_loss": 0.71344352, + "learning_rate": 3.540097613646296e-06, + "loss": 0.79154545, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20471191, + "step": 4052, + "time_per_iteration": 2.5851869583129883 + }, + { + "auxiliary_loss_clip": 0.06524909, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.06306259, + "balance_loss_mlp": 0.01258583, + "epoch": 0.2436795430632797, + "flos": 22827493595520.0, + "grad_norm": 1.7731467261886882, + "language_loss": 0.82073057, + "learning_rate": 3.539849113744351e-06, + "loss": 0.89876068, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.6217734813690186 + }, + { + "auxiliary_loss_clip": 0.06533736, + "auxiliary_loss_mlp": 0.01278722, + "balance_loss_clip": 0.06309207, + "balance_loss_mlp": 0.01260126, + "epoch": 0.2437396663159477, + "flos": 15163030702080.0, + "grad_norm": 1.5690390746940162, + "language_loss": 0.78588867, + "learning_rate": 3.539600555451172e-06, + "loss": 0.86401325, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.18615723, + "step": 4054, + "time_per_iteration": 2.513720750808716 + }, + { + "auxiliary_loss_clip": 0.06529565, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06307493, + "balance_loss_mlp": 0.01263111, + "epoch": 0.24379978956861567, + "flos": 22097710460160.0, + "grad_norm": 1.7039269278884617, + "language_loss": 0.84417951, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92229491, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1887207, + "step": 4055, + "time_per_iteration": 2.557584524154663 + }, + { + "auxiliary_loss_clip": 0.06542832, + "auxiliary_loss_mlp": 0.01280691, + "balance_loss_clip": 0.06312343, + "balance_loss_mlp": 0.01259508, + "epoch": 0.24385991282128364, + "flos": 31475878657920.0, + "grad_norm": 2.786051029634521, + "language_loss": 0.56684959, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.6450848, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21179199, + "step": 4056, + "time_per_iteration": 2.6548893451690674 + }, + { + "auxiliary_loss_clip": 0.06533613, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 0.06307291, + "balance_loss_mlp": 0.01262321, + "epoch": 0.2439200360739516, + "flos": 23845055978880.0, + "grad_norm": 2.215401064957846, + "language_loss": 0.80586845, + "learning_rate": 3.538854530318506e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.21520996, + "step": 4057, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.06533922, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 0.06311886, + "balance_loss_mlp": 0.01261009, + "epoch": 0.24398015932661957, + "flos": 19175684952960.0, + "grad_norm": 1.7331406857586058, + "language_loss": 0.79934907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87748623, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18786621, + "step": 4058, + "time_per_iteration": 2.5552098751068115 + }, + { + "auxiliary_loss_clip": 0.06541391, + "auxiliary_loss_mlp": 0.01280168, + "balance_loss_clip": 0.06312001, + "balance_loss_mlp": 0.01259772, + "epoch": 0.24404028257928753, + "flos": 25269095318400.0, + "grad_norm": 1.7324044437804977, + "language_loss": 0.86104828, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93926388, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20410156, + "step": 4059, + "time_per_iteration": 2.575345754623413 + }, + { + "auxiliary_loss_clip": 0.06538763, + "auxiliary_loss_mlp": 0.01274337, + "balance_loss_clip": 0.06318676, + "balance_loss_mlp": 0.01255621, + "epoch": 0.2441004058319555, + "flos": 26474606409600.0, + "grad_norm": 1.5285193147278118, + "language_loss": 0.74698234, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.8251133, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18713379, + "step": 4060, + "time_per_iteration": 2.6277999877929688 + }, + { + "auxiliary_loss_clip": 0.06560756, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 0.06327853, + "balance_loss_mlp": 0.01259469, + "epoch": 0.2441605290846235, + "flos": 26767752318720.0, + "grad_norm": 1.6858410849727605, + "language_loss": 0.73894358, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.81735957, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.21374512, + "step": 4061, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.06538899, + "auxiliary_loss_mlp": 0.01273593, + "balance_loss_clip": 0.0631846, + "balance_loss_mlp": 0.01254103, + "epoch": 0.24422065233729146, + "flos": 21112236990720.0, + "grad_norm": 1.7809128746808311, + "language_loss": 0.76782405, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84594905, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19494629, + "step": 4062, + "time_per_iteration": 2.5655109882354736 + }, + { + "auxiliary_loss_clip": 0.06538436, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 0.06319936, + "balance_loss_mlp": 0.01258019, + "epoch": 0.24428077558995942, + "flos": 25269891932160.0, + "grad_norm": 1.624722619478305, + "language_loss": 0.84975201, + "learning_rate": 3.537360904763011e-06, + "loss": 0.92791933, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.20263672, + "step": 4063, + "time_per_iteration": 2.569420576095581 + }, + { + "auxiliary_loss_clip": 0.06559969, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06327148, + "balance_loss_mlp": 0.01254459, + "epoch": 0.24434089884262739, + "flos": 20491508344320.0, + "grad_norm": 2.099790248638241, + "language_loss": 0.68837494, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.76673138, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2121582, + "step": 4064, + "time_per_iteration": 2.560065984725952 + }, + { + "auxiliary_loss_clip": 0.06547809, + "auxiliary_loss_mlp": 0.01276127, + "balance_loss_clip": 0.06317605, + "balance_loss_mlp": 0.01255349, + "epoch": 0.24440102209529535, + "flos": 23628456374400.0, + "grad_norm": 1.7607893449036869, + "language_loss": 0.70700729, + "learning_rate": 3.536862563102088e-06, + "loss": 0.78524667, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20788574, + "step": 4065, + "time_per_iteration": 2.5619614124298096 + }, + { + "auxiliary_loss_clip": 0.06554856, + "auxiliary_loss_mlp": 0.0127847, + "balance_loss_clip": 0.06322616, + "balance_loss_mlp": 0.01256726, + "epoch": 0.24446114534796332, + "flos": 20560382000640.0, + "grad_norm": 2.0639555504298372, + "language_loss": 0.84639663, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.92472994, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21765137, + "step": 4066, + "time_per_iteration": 2.5640382766723633 + }, + { + "auxiliary_loss_clip": 0.0647334, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06356817, + "balance_loss_mlp": 0.01260456, + "epoch": 0.24452126860063128, + "flos": 60406719327360.0, + "grad_norm": 0.7224646734980834, + "language_loss": 0.52123713, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.59863508, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.05990601, + "step": 4067, + "time_per_iteration": 3.067857503890991 + }, + { + "auxiliary_loss_clip": 0.06549152, + "auxiliary_loss_mlp": 0.01275932, + "balance_loss_clip": 0.063198, + "balance_loss_mlp": 0.01255106, + "epoch": 0.24458139185329927, + "flos": 15126958719360.0, + "grad_norm": 4.582785635832698, + "language_loss": 0.72625411, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.80450499, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20825195, + "step": 4068, + "time_per_iteration": 2.5490705966949463 + }, + { + "auxiliary_loss_clip": 0.06542531, + "auxiliary_loss_mlp": 0.0127677, + "balance_loss_clip": 0.06318012, + "balance_loss_mlp": 0.01256111, + "epoch": 0.24464151510596724, + "flos": 28005771594240.0, + "grad_norm": 1.4744908303961997, + "language_loss": 0.7839663, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86215931, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.20654297, + "step": 4069, + "time_per_iteration": 2.6064302921295166 + }, + { + "auxiliary_loss_clip": 0.06535528, + "auxiliary_loss_mlp": 0.01277448, + "balance_loss_clip": 0.06312935, + "balance_loss_mlp": 0.01257493, + "epoch": 0.2447016383586352, + "flos": 19799138856960.0, + "grad_norm": 1.9167348410225946, + "language_loss": 0.80741036, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88554007, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19946289, + "step": 4070, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.06543916, + "auxiliary_loss_mlp": 0.0127809, + "balance_loss_clip": 0.06317008, + "balance_loss_mlp": 0.01258825, + "epoch": 0.24476176161130317, + "flos": 26074460436480.0, + "grad_norm": 1.476613235331205, + "language_loss": 0.8444066, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.92262667, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19262695, + "step": 4071, + "time_per_iteration": 2.6165285110473633 + }, + { + "auxiliary_loss_clip": 0.06545337, + "auxiliary_loss_mlp": 0.01275719, + "balance_loss_clip": 0.06310376, + "balance_loss_mlp": 0.01254679, + "epoch": 0.24482188486397113, + "flos": 18849527735040.0, + "grad_norm": 2.1913275656577857, + "language_loss": 0.8027429, + "learning_rate": 3.535116532028798e-06, + "loss": 0.88095343, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21032715, + "step": 4072, + "time_per_iteration": 2.580077648162842 + }, + { + "auxiliary_loss_clip": 0.06531823, + "auxiliary_loss_mlp": 0.01275557, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01257031, + "epoch": 0.2448820081166391, + "flos": 21258202003200.0, + "grad_norm": 1.4781582217057618, + "language_loss": 0.7076053, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18505859, + "step": 4073, + "time_per_iteration": 2.5430707931518555 + }, + { + "auxiliary_loss_clip": 0.06525481, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 0.06303517, + "balance_loss_mlp": 0.01260921, + "epoch": 0.2449421313693071, + "flos": 23957254995840.0, + "grad_norm": 2.412576467354098, + "language_loss": 0.67577648, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.75382745, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.18676758, + "step": 4074, + "time_per_iteration": 2.5616037845611572 + }, + { + "auxiliary_loss_clip": 0.06435025, + "auxiliary_loss_mlp": 0.01257107, + "balance_loss_clip": 0.06320108, + "balance_loss_mlp": 0.01251907, + "epoch": 0.24500225462197506, + "flos": 60705902730240.0, + "grad_norm": 0.8764237694402175, + "language_loss": 0.68656927, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.76349056, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.05203247, + "step": 4075, + "time_per_iteration": 3.2623581886291504 + }, + { + "auxiliary_loss_clip": 0.06527948, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06305515, + "balance_loss_mlp": 0.01257414, + "epoch": 0.24506237787464302, + "flos": 26291018113920.0, + "grad_norm": 2.301278269127432, + "language_loss": 0.79781568, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.87586164, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19226074, + "step": 4076, + "time_per_iteration": 2.6342012882232666 + }, + { + "auxiliary_loss_clip": 0.06535772, + "auxiliary_loss_mlp": 0.01280909, + "balance_loss_clip": 0.06304428, + "balance_loss_mlp": 0.01258462, + "epoch": 0.245122501127311, + "flos": 20557530961920.0, + "grad_norm": 1.9232761502629154, + "language_loss": 0.82461953, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90278631, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 4077, + "time_per_iteration": 2.5863101482391357 + }, + { + "auxiliary_loss_clip": 0.06532669, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06305817, + "balance_loss_mlp": 0.01257774, + "epoch": 0.24518262437997895, + "flos": 29140312677120.0, + "grad_norm": 2.8377644839815357, + "language_loss": 0.63268852, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71080685, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21398926, + "step": 4078, + "time_per_iteration": 2.6045711040496826 + }, + { + "auxiliary_loss_clip": 0.06522519, + "auxiliary_loss_mlp": 0.01277179, + "balance_loss_clip": 0.06301752, + "balance_loss_mlp": 0.01258249, + "epoch": 0.24524274763264692, + "flos": 23483623392000.0, + "grad_norm": 1.4700896000405594, + "language_loss": 0.75762683, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.8356238, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18933105, + "step": 4079, + "time_per_iteration": 2.6327531337738037 + }, + { + "auxiliary_loss_clip": 0.06521107, + "auxiliary_loss_mlp": 0.01276139, + "balance_loss_clip": 0.06297373, + "balance_loss_mlp": 0.01256171, + "epoch": 0.24530287088531488, + "flos": 17206792439040.0, + "grad_norm": 1.743597814486786, + "language_loss": 0.75652814, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.1998291, + "step": 4080, + "time_per_iteration": 2.5027806758880615 + }, + { + "auxiliary_loss_clip": 0.06517033, + "auxiliary_loss_mlp": 0.01282693, + "balance_loss_clip": 0.06296979, + "balance_loss_mlp": 0.01262129, + "epoch": 0.24536299413798288, + "flos": 14872903539840.0, + "grad_norm": 1.7999885027482954, + "language_loss": 0.83532149, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91331875, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20544434, + "step": 4081, + "time_per_iteration": 3.9672679901123047 + }, + { + "auxiliary_loss_clip": 0.06524678, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06300613, + "balance_loss_mlp": 0.01257458, + "epoch": 0.24542311739065084, + "flos": 35270759347200.0, + "grad_norm": 2.0934334924975797, + "language_loss": 0.7376107, + "learning_rate": 3.532617254729267e-06, + "loss": 0.81562507, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19311523, + "step": 4082, + "time_per_iteration": 2.687596559524536 + }, + { + "auxiliary_loss_clip": 0.06520141, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06301866, + "balance_loss_mlp": 0.01254334, + "epoch": 0.2454832406433188, + "flos": 21508903019520.0, + "grad_norm": 4.081398895882933, + "language_loss": 0.72681344, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.8047362, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.17810059, + "step": 4083, + "time_per_iteration": 2.5715560913085938 + }, + { + "auxiliary_loss_clip": 0.06531677, + "auxiliary_loss_mlp": 0.01285124, + "balance_loss_clip": 0.06304878, + "balance_loss_mlp": 0.01263404, + "epoch": 0.24554336389598677, + "flos": 14761878480000.0, + "grad_norm": 2.078496591548884, + "language_loss": 0.75461411, + "learning_rate": 3.532116701561919e-06, + "loss": 0.83278215, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21704102, + "step": 4084, + "time_per_iteration": 2.527059316635132 + }, + { + "auxiliary_loss_clip": 0.06521569, + "auxiliary_loss_mlp": 0.01278312, + "balance_loss_clip": 0.06299873, + "balance_loss_mlp": 0.01259238, + "epoch": 0.24560348714865474, + "flos": 14981790320640.0, + "grad_norm": 1.9240939687866982, + "language_loss": 0.85311353, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93111229, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19055176, + "step": 4085, + "time_per_iteration": 4.107008695602417 + }, + { + "auxiliary_loss_clip": 0.06523392, + "auxiliary_loss_mlp": 0.01277742, + "balance_loss_clip": 0.06299591, + "balance_loss_mlp": 0.0125725, + "epoch": 0.2456636104013227, + "flos": 22682073634560.0, + "grad_norm": 1.671481131781836, + "language_loss": 0.79073685, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.86874819, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20495605, + "step": 4086, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.06519614, + "auxiliary_loss_mlp": 0.01278477, + "balance_loss_clip": 0.06300113, + "balance_loss_mlp": 0.01260107, + "epoch": 0.2457237336539907, + "flos": 27425307634560.0, + "grad_norm": 1.6115503736345718, + "language_loss": 0.75352013, + "learning_rate": 3.531365436099496e-06, + "loss": 0.83150113, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18359375, + "step": 4087, + "time_per_iteration": 4.046957015991211 + }, + { + "auxiliary_loss_clip": 0.06525059, + "auxiliary_loss_mlp": 0.01276774, + "balance_loss_clip": 0.06299827, + "balance_loss_mlp": 0.0125633, + "epoch": 0.24578385690665866, + "flos": 20418609692160.0, + "grad_norm": 2.7081304915573914, + "language_loss": 0.79987848, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.87789685, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20458984, + "step": 4088, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.06519316, + "auxiliary_loss_mlp": 0.01276403, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01258152, + "epoch": 0.24584398015932662, + "flos": 23922273116160.0, + "grad_norm": 2.802199957042034, + "language_loss": 0.77758735, + "learning_rate": 3.5308643020944e-06, + "loss": 0.85554451, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18249512, + "step": 4089, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.06525148, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01261021, + "epoch": 0.2459041034119946, + "flos": 41505313115520.0, + "grad_norm": 1.8031915906993192, + "language_loss": 0.81701422, + "learning_rate": 3.530613648011309e-06, + "loss": 0.89507812, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20214844, + "step": 4090, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.065328, + "auxiliary_loss_mlp": 0.01279305, + "balance_loss_clip": 0.06309135, + "balance_loss_mlp": 0.01258861, + "epoch": 0.24596422666466256, + "flos": 19942755955200.0, + "grad_norm": 2.438516046551743, + "language_loss": 0.73629344, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.8144145, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.20446777, + "step": 4091, + "time_per_iteration": 3.961276054382324 + }, + { + "auxiliary_loss_clip": 0.06539448, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 0.06316313, + "balance_loss_mlp": 0.0126148, + "epoch": 0.24602434991733052, + "flos": 21550970568960.0, + "grad_norm": 2.2480658521871897, + "language_loss": 0.77723873, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85543197, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18408203, + "step": 4092, + "time_per_iteration": 2.5494375228881836 + }, + { + "auxiliary_loss_clip": 0.06537454, + "auxiliary_loss_mlp": 0.01278374, + "balance_loss_clip": 0.06307742, + "balance_loss_mlp": 0.01258907, + "epoch": 0.24608447316999849, + "flos": 23191735294080.0, + "grad_norm": 2.380112015735871, + "language_loss": 0.82381165, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.90196991, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.19470215, + "step": 4093, + "time_per_iteration": 2.5551040172576904 + }, + { + "auxiliary_loss_clip": 0.06532703, + "auxiliary_loss_mlp": 0.01285, + "balance_loss_clip": 0.06305315, + "balance_loss_mlp": 0.01264412, + "epoch": 0.24614459642266648, + "flos": 19647345985920.0, + "grad_norm": 21.11973952887688, + "language_loss": 0.87671578, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95489287, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20581055, + "step": 4094, + "time_per_iteration": 2.534127712249756 + }, + { + "auxiliary_loss_clip": 0.06404499, + "auxiliary_loss_mlp": 0.01293713, + "balance_loss_clip": 0.06291573, + "balance_loss_mlp": 0.01289332, + "epoch": 0.24620471967533444, + "flos": 61757231109120.0, + "grad_norm": 0.7533459551406883, + "language_loss": 0.57023478, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.64721692, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04388428, + "step": 4095, + "time_per_iteration": 3.238482713699341 + }, + { + "auxiliary_loss_clip": 0.06404348, + "auxiliary_loss_mlp": 0.01286038, + "balance_loss_clip": 0.06290346, + "balance_loss_mlp": 0.01281767, + "epoch": 0.2462648429280024, + "flos": 69174431003520.0, + "grad_norm": 0.6365745764429788, + "language_loss": 0.56240451, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.63930833, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04275513, + "step": 4096, + "time_per_iteration": 3.3192596435546875 + }, + { + "auxiliary_loss_clip": 0.06545975, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06318395, + "balance_loss_mlp": 0.01262143, + "epoch": 0.24632496618067037, + "flos": 29467140727680.0, + "grad_norm": 1.505356285132213, + "language_loss": 0.78075927, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85903859, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19812012, + "step": 4097, + "time_per_iteration": 2.617108106613159 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01279842, + "balance_loss_clip": 0.06315026, + "balance_loss_mlp": 0.01259993, + "epoch": 0.24638508943333834, + "flos": 24323341484160.0, + "grad_norm": 2.0372573834811267, + "language_loss": 0.77321315, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.85148549, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.1986084, + "step": 4098, + "time_per_iteration": 2.6069419384002686 + }, + { + "auxiliary_loss_clip": 0.06542017, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.0631687, + "balance_loss_mlp": 0.01257341, + "epoch": 0.2464452126860063, + "flos": 26620236005760.0, + "grad_norm": 2.17921698337753, + "language_loss": 0.69183016, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.1817627, + "step": 4099, + "time_per_iteration": 2.655956506729126 + }, + { + "auxiliary_loss_clip": 0.06525709, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01256062, + "epoch": 0.24650533593867427, + "flos": 31220481813120.0, + "grad_norm": 2.2743270797915076, + "language_loss": 0.67268491, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.75068748, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18493652, + "step": 4100, + "time_per_iteration": 2.6497559547424316 + }, + { + "auxiliary_loss_clip": 0.0641202, + "auxiliary_loss_mlp": 0.01258309, + "balance_loss_clip": 0.06296985, + "balance_loss_mlp": 0.01253758, + "epoch": 0.24656545919134226, + "flos": 68513269962240.0, + "grad_norm": 0.6889590379062642, + "language_loss": 0.61607081, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.69277412, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.04559326, + "step": 4101, + "time_per_iteration": 3.2961082458496094 + }, + { + "auxiliary_loss_clip": 0.06538613, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06317261, + "balance_loss_mlp": 0.01259, + "epoch": 0.24662558244401023, + "flos": 20090398049280.0, + "grad_norm": 1.6193028382456236, + "language_loss": 0.73591036, + "learning_rate": 3.527601274535012e-06, + "loss": 0.81407589, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18945312, + "step": 4102, + "time_per_iteration": 2.542275905609131 + }, + { + "auxiliary_loss_clip": 0.0654332, + "auxiliary_loss_mlp": 0.01273749, + "balance_loss_clip": 0.06317908, + "balance_loss_mlp": 0.01255152, + "epoch": 0.2466857056966782, + "flos": 30709310780160.0, + "grad_norm": 2.0137613654817854, + "language_loss": 0.76325667, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.84142733, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18603516, + "step": 4103, + "time_per_iteration": 2.6544189453125 + }, + { + "auxiliary_loss_clip": 0.06542745, + "auxiliary_loss_mlp": 0.01273413, + "balance_loss_clip": 0.06315098, + "balance_loss_mlp": 0.01253159, + "epoch": 0.24674582894934616, + "flos": 22535102373120.0, + "grad_norm": 2.0816413841430697, + "language_loss": 0.79265451, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87081611, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20275879, + "step": 4104, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.06525403, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06306183, + "balance_loss_mlp": 0.01257251, + "epoch": 0.24680595220201412, + "flos": 20710581644160.0, + "grad_norm": 1.7450607123984514, + "language_loss": 0.83681756, + "learning_rate": 3.526846877170133e-06, + "loss": 0.9148404, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19641113, + "step": 4105, + "time_per_iteration": 2.553579330444336 + }, + { + "auxiliary_loss_clip": 0.06533727, + "auxiliary_loss_mlp": 0.01273598, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01255371, + "epoch": 0.2468660754546821, + "flos": 21836946954240.0, + "grad_norm": 1.9208859898797113, + "language_loss": 0.77469373, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85276699, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18212891, + "step": 4106, + "time_per_iteration": 2.5389256477355957 + }, + { + "auxiliary_loss_clip": 0.06534247, + "auxiliary_loss_mlp": 0.01276275, + "balance_loss_clip": 0.06310344, + "balance_loss_mlp": 0.01257463, + "epoch": 0.24692619870735008, + "flos": 15273049512960.0, + "grad_norm": 2.4615103155960485, + "language_loss": 0.73436344, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.81246865, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18798828, + "step": 4107, + "time_per_iteration": 2.5545566082000732 + }, + { + "auxiliary_loss_clip": 0.06538644, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06314194, + "balance_loss_mlp": 0.01256745, + "epoch": 0.24698632196001805, + "flos": 29687933036160.0, + "grad_norm": 2.1377324014009504, + "language_loss": 0.66432422, + "learning_rate": 3.526091958721587e-06, + "loss": 0.7424612, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18322754, + "step": 4108, + "time_per_iteration": 2.6196486949920654 + }, + { + "auxiliary_loss_clip": 0.06540007, + "auxiliary_loss_mlp": 0.01277779, + "balance_loss_clip": 0.06313555, + "balance_loss_mlp": 0.01259623, + "epoch": 0.247046445212686, + "flos": 39174736452480.0, + "grad_norm": 2.010829594577025, + "language_loss": 0.73608756, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18151855, + "step": 4109, + "time_per_iteration": 2.764406442642212 + }, + { + "auxiliary_loss_clip": 0.06534623, + "auxiliary_loss_mlp": 0.01277352, + "balance_loss_clip": 0.06311052, + "balance_loss_mlp": 0.01259077, + "epoch": 0.24710656846535398, + "flos": 23004834762240.0, + "grad_norm": 1.68605601916547, + "language_loss": 0.79419786, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.87231761, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.18249512, + "step": 4110, + "time_per_iteration": 2.5460774898529053 + }, + { + "auxiliary_loss_clip": 0.06540776, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 0.06313831, + "balance_loss_mlp": 0.01257032, + "epoch": 0.24716669171802194, + "flos": 26440085727360.0, + "grad_norm": 2.6454329848736604, + "language_loss": 0.81789577, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.89607012, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.19628906, + "step": 4111, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.06537174, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06311068, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2472268149706899, + "flos": 23336358641280.0, + "grad_norm": 1.983709335436533, + "language_loss": 0.75390071, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.83201408, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18115234, + "step": 4112, + "time_per_iteration": 2.5546083450317383 + }, + { + "auxiliary_loss_clip": 0.06533875, + "auxiliary_loss_mlp": 0.01274467, + "balance_loss_clip": 0.06308994, + "balance_loss_mlp": 0.01255548, + "epoch": 0.24728693822335787, + "flos": 23775469562880.0, + "grad_norm": 2.380234182887367, + "language_loss": 0.83472633, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91280973, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.18920898, + "step": 4113, + "time_per_iteration": 2.6223254203796387 + }, + { + "auxiliary_loss_clip": 0.06540644, + "auxiliary_loss_mlp": 0.01276865, + "balance_loss_clip": 0.06315883, + "balance_loss_mlp": 0.01257279, + "epoch": 0.24734706147602586, + "flos": 19323494755200.0, + "grad_norm": 2.0367731486494636, + "language_loss": 0.87924093, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.95741606, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19580078, + "step": 4114, + "time_per_iteration": 2.5495545864105225 + }, + { + "auxiliary_loss_clip": 0.06532501, + "auxiliary_loss_mlp": 0.01273212, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01255474, + "epoch": 0.24740718472869383, + "flos": 28044275345280.0, + "grad_norm": 1.9170399047542779, + "language_loss": 0.75640035, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.17736816, + "step": 4115, + "time_per_iteration": 2.6333982944488525 + }, + { + "auxiliary_loss_clip": 0.0642873, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06315603, + "balance_loss_mlp": 0.01259151, + "epoch": 0.2474673079813618, + "flos": 68129265899520.0, + "grad_norm": 0.63897767002188, + "language_loss": 0.58004332, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.65697974, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.05752563, + "step": 4116, + "time_per_iteration": 3.251235246658325 + }, + { + "auxiliary_loss_clip": 0.06532618, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 0.063094, + "balance_loss_mlp": 0.01257022, + "epoch": 0.24752743123402976, + "flos": 29470075620480.0, + "grad_norm": 1.407143363910891, + "language_loss": 0.8425988, + "learning_rate": 3.523824079451235e-06, + "loss": 0.92068678, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19152832, + "step": 4117, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.06425081, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0631275, + "balance_loss_mlp": 0.01262089, + "epoch": 0.24758755448669773, + "flos": 58367946908160.0, + "grad_norm": 0.8764773034828885, + "language_loss": 0.63508207, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.71200383, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.05001831, + "step": 4118, + "time_per_iteration": 3.052507162094116 + }, + { + "auxiliary_loss_clip": 0.0652981, + "auxiliary_loss_mlp": 0.01277419, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01258203, + "epoch": 0.2476476777393657, + "flos": 20490502095360.0, + "grad_norm": 1.7262960547494681, + "language_loss": 0.80051601, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87858826, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.19238281, + "step": 4119, + "time_per_iteration": 2.554318428039551 + }, + { + "auxiliary_loss_clip": 0.06530587, + "auxiliary_loss_mlp": 0.01282865, + "balance_loss_clip": 0.06310613, + "balance_loss_mlp": 0.01265198, + "epoch": 0.24770780099203366, + "flos": 20492179176960.0, + "grad_norm": 2.4192345138137386, + "language_loss": 0.74556476, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.8236993, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.17663574, + "step": 4120, + "time_per_iteration": 3.996234655380249 + }, + { + "auxiliary_loss_clip": 0.06531808, + "auxiliary_loss_mlp": 0.01276043, + "balance_loss_clip": 0.06307146, + "balance_loss_mlp": 0.01256362, + "epoch": 0.24776792424470165, + "flos": 15157915603200.0, + "grad_norm": 2.13486110959629, + "language_loss": 0.89734054, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97541904, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19689941, + "step": 4121, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.06540959, + "auxiliary_loss_mlp": 0.01278306, + "balance_loss_clip": 0.06314932, + "balance_loss_mlp": 0.01258744, + "epoch": 0.2478280474973696, + "flos": 21731833607040.0, + "grad_norm": 2.0829104418917646, + "language_loss": 0.69792116, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.77611381, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19580078, + "step": 4122, + "time_per_iteration": 2.5857455730438232 + }, + { + "auxiliary_loss_clip": 0.06535036, + "auxiliary_loss_mlp": 0.01273779, + "balance_loss_clip": 0.0630946, + "balance_loss_mlp": 0.01254729, + "epoch": 0.24788817075003758, + "flos": 20418400056960.0, + "grad_norm": 2.5894895086667264, + "language_loss": 0.80832231, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88641047, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.533696174621582 + }, + { + "auxiliary_loss_clip": 0.06528741, + "auxiliary_loss_mlp": 0.01276684, + "balance_loss_clip": 0.06306656, + "balance_loss_mlp": 0.01259625, + "epoch": 0.24794829400270554, + "flos": 22599867179520.0, + "grad_norm": 2.45373622595604, + "language_loss": 0.75091624, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.82897043, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1706543, + "step": 4124, + "time_per_iteration": 2.5478947162628174 + }, + { + "auxiliary_loss_clip": 0.06523614, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01258708, + "epoch": 0.2480084172553735, + "flos": 39685362433920.0, + "grad_norm": 1.4066224864196382, + "language_loss": 0.74510413, + "learning_rate": 3.521804257268357e-06, + "loss": 0.82310236, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.17504883, + "step": 4125, + "time_per_iteration": 4.164500951766968 + }, + { + "auxiliary_loss_clip": 0.06546921, + "auxiliary_loss_mlp": 0.01279637, + "balance_loss_clip": 0.06313127, + "balance_loss_mlp": 0.01260599, + "epoch": 0.24806854050804147, + "flos": 22060129104000.0, + "grad_norm": 1.9518521214536066, + "language_loss": 0.69807184, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.77633739, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.19030762, + "step": 4126, + "time_per_iteration": 2.520550489425659 + }, + { + "auxiliary_loss_clip": 0.06526291, + "auxiliary_loss_mlp": 0.0127589, + "balance_loss_clip": 0.06304894, + "balance_loss_mlp": 0.01257281, + "epoch": 0.24812866376070947, + "flos": 15492164739840.0, + "grad_norm": 2.6036079521490834, + "language_loss": 0.81805199, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.89607382, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18615723, + "step": 4127, + "time_per_iteration": 4.052755832672119 + }, + { + "auxiliary_loss_clip": 0.06533966, + "auxiliary_loss_mlp": 0.012739, + "balance_loss_clip": 0.06306454, + "balance_loss_mlp": 0.01255494, + "epoch": 0.24818878701337743, + "flos": 14762758947840.0, + "grad_norm": 2.4130643839940746, + "language_loss": 0.85122234, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.92930102, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.18395996, + "step": 4128, + "time_per_iteration": 2.5801029205322266 + }, + { + "auxiliary_loss_clip": 0.06541854, + "auxiliary_loss_mlp": 0.01278965, + "balance_loss_clip": 0.06316209, + "balance_loss_mlp": 0.01260821, + "epoch": 0.2482489102660454, + "flos": 27096886356480.0, + "grad_norm": 2.0112959815575713, + "language_loss": 0.66149813, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.73970628, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18151855, + "step": 4129, + "time_per_iteration": 2.5865726470947266 + }, + { + "auxiliary_loss_clip": 0.06528358, + "auxiliary_loss_mlp": 0.01276243, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.01257444, + "epoch": 0.24830903351871336, + "flos": 26474522555520.0, + "grad_norm": 1.7021812681223303, + "language_loss": 0.75761282, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83565885, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18811035, + "step": 4130, + "time_per_iteration": 2.6659512519836426 + }, + { + "auxiliary_loss_clip": 0.06526491, + "auxiliary_loss_mlp": 0.01274514, + "balance_loss_clip": 0.06302534, + "balance_loss_mlp": 0.01255, + "epoch": 0.24836915677138133, + "flos": 10232225337600.0, + "grad_norm": 2.0871707802719004, + "language_loss": 0.77625716, + "learning_rate": 3.520286966670535e-06, + "loss": 0.85426718, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.19519043, + "step": 4131, + "time_per_iteration": 3.906522274017334 + }, + { + "auxiliary_loss_clip": 0.06519566, + "auxiliary_loss_mlp": 0.01270892, + "balance_loss_clip": 0.0630278, + "balance_loss_mlp": 0.01253582, + "epoch": 0.2484292800240493, + "flos": 30088162863360.0, + "grad_norm": 1.7622390062278706, + "language_loss": 0.84475207, + "learning_rate": 3.520033883075255e-06, + "loss": 0.92265671, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.17297363, + "step": 4132, + "time_per_iteration": 2.6436057090759277 + }, + { + "auxiliary_loss_clip": 0.06525066, + "auxiliary_loss_mlp": 0.01275924, + "balance_loss_clip": 0.06302708, + "balance_loss_mlp": 0.01256779, + "epoch": 0.24848940327671726, + "flos": 13447899878400.0, + "grad_norm": 1.545647189211169, + "language_loss": 0.71393758, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19152832, + "step": 4133, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.06542444, + "auxiliary_loss_mlp": 0.01275489, + "balance_loss_clip": 0.06309851, + "balance_loss_mlp": 0.01255116, + "epoch": 0.24854952652938525, + "flos": 19975683409920.0, + "grad_norm": 2.3352452144714513, + "language_loss": 0.6286931, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.70687246, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20373535, + "step": 4134, + "time_per_iteration": 2.571525812149048 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01258883, + "epoch": 0.24860964978205322, + "flos": 18156026217600.0, + "grad_norm": 1.960513817978903, + "language_loss": 0.79140246, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86942399, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18383789, + "step": 4135, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06524552, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06303368, + "balance_loss_mlp": 0.01256294, + "epoch": 0.24866977303472118, + "flos": 11733397960320.0, + "grad_norm": 2.2852251503119234, + "language_loss": 0.8410641, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.9190594, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18676758, + "step": 4136, + "time_per_iteration": 2.497654676437378 + }, + { + "auxiliary_loss_clip": 0.06524116, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06297501, + "balance_loss_mlp": 0.01254521, + "epoch": 0.24872989628738915, + "flos": 34832109623040.0, + "grad_norm": 1.7046352309858128, + "language_loss": 0.71601558, + "learning_rate": 3.518767600693314e-06, + "loss": 0.79399109, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18908691, + "step": 4137, + "time_per_iteration": 2.732480764389038 + }, + { + "auxiliary_loss_clip": 0.06525281, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 0.06299166, + "balance_loss_mlp": 0.01255549, + "epoch": 0.2487900195400571, + "flos": 13704512607360.0, + "grad_norm": 2.5230361612400296, + "language_loss": 0.67583597, + "learning_rate": 3.518514171403042e-06, + "loss": 0.7538265, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.18212891, + "step": 4138, + "time_per_iteration": 2.531855583190918 + }, + { + "auxiliary_loss_clip": 0.06519014, + "auxiliary_loss_mlp": 0.01272692, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.01254501, + "epoch": 0.24885014279272508, + "flos": 25344845009280.0, + "grad_norm": 1.9341473695701388, + "language_loss": 0.83479851, + "learning_rate": 3.51826068453056e-06, + "loss": 0.91271555, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.18188477, + "step": 4139, + "time_per_iteration": 2.6051557064056396 + }, + { + "auxiliary_loss_clip": 0.06528804, + "auxiliary_loss_mlp": 0.01275882, + "balance_loss_clip": 0.06300579, + "balance_loss_mlp": 0.01255711, + "epoch": 0.24891026604539307, + "flos": 20637724919040.0, + "grad_norm": 1.6977646822397727, + "language_loss": 0.79297662, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20178223, + "step": 4140, + "time_per_iteration": 2.5448291301727295 + }, + { + "auxiliary_loss_clip": 0.0641291, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06303305, + "balance_loss_mlp": 0.0126555, + "epoch": 0.24897038929806103, + "flos": 66979086030720.0, + "grad_norm": 0.8107945435966392, + "language_loss": 0.60717231, + "learning_rate": 3.51775353807742e-06, + "loss": 0.68400407, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.04705811, + "step": 4141, + "time_per_iteration": 3.2685940265655518 + }, + { + "auxiliary_loss_clip": 0.06525983, + "auxiliary_loss_mlp": 0.01275717, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01256894, + "epoch": 0.249030512550729, + "flos": 36401359288320.0, + "grad_norm": 1.7802793710753735, + "language_loss": 0.72871864, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.80673563, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18823242, + "step": 4142, + "time_per_iteration": 2.6564056873321533 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01276725, + "balance_loss_clip": 0.06302793, + "balance_loss_mlp": 0.0125789, + "epoch": 0.24909063580339696, + "flos": 20160361808640.0, + "grad_norm": 1.9535741137498925, + "language_loss": 0.81280798, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8908, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18823242, + "step": 4143, + "time_per_iteration": 2.5795881748199463 + }, + { + "auxiliary_loss_clip": 0.06522508, + "auxiliary_loss_mlp": 0.01275624, + "balance_loss_clip": 0.06301625, + "balance_loss_mlp": 0.01257039, + "epoch": 0.24915075905606493, + "flos": 26403887963520.0, + "grad_norm": 1.964912825826696, + "language_loss": 0.59448719, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.67246854, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18579102, + "step": 4144, + "time_per_iteration": 2.5888898372650146 + }, + { + "auxiliary_loss_clip": 0.06520054, + "auxiliary_loss_mlp": 0.01279478, + "balance_loss_clip": 0.06300642, + "balance_loss_mlp": 0.01260608, + "epoch": 0.2492108823087329, + "flos": 27534655612800.0, + "grad_norm": 2.2926576094039253, + "language_loss": 0.79198605, + "learning_rate": 3.516738554607708e-06, + "loss": 0.86998141, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18859863, + "step": 4145, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.06539698, + "auxiliary_loss_mlp": 0.01282889, + "balance_loss_clip": 0.06307465, + "balance_loss_mlp": 0.01262587, + "epoch": 0.24927100556140086, + "flos": 16697088852480.0, + "grad_norm": 2.388513156986414, + "language_loss": 0.65914291, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.73736882, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20300293, + "step": 4146, + "time_per_iteration": 2.550225019454956 + }, + { + "auxiliary_loss_clip": 0.06418058, + "auxiliary_loss_mlp": 0.01257626, + "balance_loss_clip": 0.06307501, + "balance_loss_mlp": 0.01252389, + "epoch": 0.24933112881406885, + "flos": 62791899724800.0, + "grad_norm": 0.9255702942051489, + "language_loss": 0.67495543, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.75171226, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05239868, + "step": 4147, + "time_per_iteration": 3.2676596641540527 + }, + { + "auxiliary_loss_clip": 0.06525366, + "auxiliary_loss_mlp": 0.01281982, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01261764, + "epoch": 0.24939125206673682, + "flos": 26659242881280.0, + "grad_norm": 1.678024692441642, + "language_loss": 0.89250457, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.97057807, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.20214844, + "step": 4148, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06529681, + "auxiliary_loss_mlp": 0.01281757, + "balance_loss_clip": 0.06300169, + "balance_loss_mlp": 0.0125968, + "epoch": 0.24945137531940478, + "flos": 20710623571200.0, + "grad_norm": 1.8952521518004763, + "language_loss": 0.68350649, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.76162088, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22070312, + "step": 4149, + "time_per_iteration": 2.52567720413208 + }, + { + "auxiliary_loss_clip": 0.06528307, + "auxiliary_loss_mlp": 0.0128627, + "balance_loss_clip": 0.06306647, + "balance_loss_mlp": 0.01266398, + "epoch": 0.24951149857207275, + "flos": 23775385708800.0, + "grad_norm": 1.639238516163445, + "language_loss": 0.71759897, + "learning_rate": 3.515468531258095e-06, + "loss": 0.79574472, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1986084, + "step": 4150, + "time_per_iteration": 2.580000877380371 + }, + { + "auxiliary_loss_clip": 0.06529218, + "auxiliary_loss_mlp": 0.01284871, + "balance_loss_clip": 0.06303831, + "balance_loss_mlp": 0.01264129, + "epoch": 0.2495716218247407, + "flos": 15669589760640.0, + "grad_norm": 1.939767404293352, + "language_loss": 0.73002028, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80816114, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20739746, + "step": 4151, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.06534886, + "auxiliary_loss_mlp": 0.01281273, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01261055, + "epoch": 0.24963174507740868, + "flos": 24057924076800.0, + "grad_norm": 4.265592628376469, + "language_loss": 0.64070994, + "learning_rate": 3.514960119583781e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20227051, + "step": 4152, + "time_per_iteration": 2.5687365531921387 + }, + { + "auxiliary_loss_clip": 0.06516105, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 0.06296911, + "balance_loss_mlp": 0.01259979, + "epoch": 0.24969186833007664, + "flos": 21806073924480.0, + "grad_norm": 2.335025994250793, + "language_loss": 0.7798419, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85780108, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19812012, + "step": 4153, + "time_per_iteration": 2.5565860271453857 + }, + { + "auxiliary_loss_clip": 0.06523906, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06304043, + "balance_loss_mlp": 0.01257806, + "epoch": 0.24975199158274464, + "flos": 19944307255680.0, + "grad_norm": 2.3946475317027978, + "language_loss": 0.77287221, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19152832, + "step": 4154, + "time_per_iteration": 2.5327064990997314 + }, + { + "auxiliary_loss_clip": 0.06533594, + "auxiliary_loss_mlp": 0.0128089, + "balance_loss_clip": 0.06299926, + "balance_loss_mlp": 0.01258145, + "epoch": 0.2498121148354126, + "flos": 25345515841920.0, + "grad_norm": 1.7912237432514402, + "language_loss": 0.71052945, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78867429, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22766113, + "step": 4155, + "time_per_iteration": 2.566044330596924 + }, + { + "auxiliary_loss_clip": 0.06528749, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06300025, + "balance_loss_mlp": 0.01257809, + "epoch": 0.24987223808808057, + "flos": 20565119756160.0, + "grad_norm": 1.6974291352944781, + "language_loss": 0.75592315, + "learning_rate": 3.513942606943036e-06, + "loss": 0.83399028, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20141602, + "step": 4156, + "time_per_iteration": 2.5388355255126953 + }, + { + "auxiliary_loss_clip": 0.06524897, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.0125842, + "epoch": 0.24993236134074853, + "flos": 19754052560640.0, + "grad_norm": 3.125892113983293, + "language_loss": 0.77757698, + "learning_rate": 3.513688085236591e-06, + "loss": 0.85561097, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.20068359, + "step": 4157, + "time_per_iteration": 2.5327329635620117 + }, + { + "auxiliary_loss_clip": 0.06527505, + "auxiliary_loss_mlp": 0.012775, + "balance_loss_clip": 0.06301083, + "balance_loss_mlp": 0.01257068, + "epoch": 0.2499924845934165, + "flos": 18776209812480.0, + "grad_norm": 1.8891569690037928, + "language_loss": 0.82203197, + "learning_rate": 3.513433506130942e-06, + "loss": 0.90008199, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20422363, + "step": 4158, + "time_per_iteration": 2.5894827842712402 + }, + { + "auxiliary_loss_clip": 0.06518973, + "auxiliary_loss_mlp": 0.01272913, + "balance_loss_clip": 0.06295922, + "balance_loss_mlp": 0.012544, + "epoch": 0.25005260784608446, + "flos": 16877658401280.0, + "grad_norm": 2.206587551308884, + "language_loss": 0.75718945, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.83510834, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18505859, + "step": 4159, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.06529576, + "auxiliary_loss_mlp": 0.01278956, + "balance_loss_clip": 0.06300279, + "balance_loss_mlp": 0.01258142, + "epoch": 0.2501127310987524, + "flos": 22131057185280.0, + "grad_norm": 2.1699031495969354, + "language_loss": 0.71598893, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7940743, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.20812988, + "step": 4160, + "time_per_iteration": 3.9746532440185547 + }, + { + "auxiliary_loss_clip": 0.06424317, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06313459, + "balance_loss_mlp": 0.01263326, + "epoch": 0.2501728543514204, + "flos": 69480071170560.0, + "grad_norm": 0.7438462037708533, + "language_loss": 0.56844532, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.64536446, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.04278564, + "step": 4161, + "time_per_iteration": 3.233760356903076 + }, + { + "auxiliary_loss_clip": 0.06530809, + "auxiliary_loss_mlp": 0.01282686, + "balance_loss_clip": 0.06298731, + "balance_loss_mlp": 0.01261848, + "epoch": 0.25023297760408836, + "flos": 16295601214080.0, + "grad_norm": 2.49700797922569, + "language_loss": 0.8179751, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.89611006, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20849609, + "step": 4162, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.0652239, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 0.06294353, + "balance_loss_mlp": 0.01260358, + "epoch": 0.2502931008567563, + "flos": 12242598422400.0, + "grad_norm": 2.2503072324763616, + "language_loss": 0.88019562, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.95822597, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.203125, + "step": 4163, + "time_per_iteration": 2.531467914581299 + }, + { + "auxiliary_loss_clip": 0.06520548, + "auxiliary_loss_mlp": 0.01277405, + "balance_loss_clip": 0.06293885, + "balance_loss_mlp": 0.01257092, + "epoch": 0.25035322410942434, + "flos": 23188003787520.0, + "grad_norm": 1.6365124228332002, + "language_loss": 0.83867121, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91665077, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20300293, + "step": 4164, + "time_per_iteration": 4.068189382553101 + }, + { + "auxiliary_loss_clip": 0.06509531, + "auxiliary_loss_mlp": 0.01280667, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01262106, + "epoch": 0.2504133473620923, + "flos": 20922904690560.0, + "grad_norm": 1.788160941639295, + "language_loss": 0.7460506, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18579102, + "step": 4165, + "time_per_iteration": 2.568701982498169 + }, + { + "auxiliary_loss_clip": 0.06526586, + "auxiliary_loss_mlp": 0.01278077, + "balance_loss_clip": 0.06293961, + "balance_loss_mlp": 0.01257883, + "epoch": 0.2504734706147603, + "flos": 20782725609600.0, + "grad_norm": 1.8100288551258081, + "language_loss": 0.74429101, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.82233763, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2019043, + "step": 4166, + "time_per_iteration": 3.989368438720703 + }, + { + "auxiliary_loss_clip": 0.065147, + "auxiliary_loss_mlp": 0.01277163, + "balance_loss_clip": 0.06293219, + "balance_loss_mlp": 0.0125778, + "epoch": 0.25053359386742824, + "flos": 24355681960320.0, + "grad_norm": 1.5960764456675967, + "language_loss": 0.82469785, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.90261644, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19384766, + "step": 4167, + "time_per_iteration": 2.554733991622925 + }, + { + "auxiliary_loss_clip": 0.06513357, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 0.06292276, + "balance_loss_mlp": 0.01260614, + "epoch": 0.2505937171200962, + "flos": 21220578720000.0, + "grad_norm": 1.9887592956808484, + "language_loss": 0.80394876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.88188636, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19799805, + "step": 4168, + "time_per_iteration": 2.576969623565674 + }, + { + "auxiliary_loss_clip": 0.06531397, + "auxiliary_loss_mlp": 0.01277594, + "balance_loss_clip": 0.06300385, + "balance_loss_mlp": 0.01256196, + "epoch": 0.25065384037276417, + "flos": 41436816802560.0, + "grad_norm": 4.930314721126017, + "language_loss": 0.69985271, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7779426, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21386719, + "step": 4169, + "time_per_iteration": 2.709149122238159 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01277868, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.0125827, + "epoch": 0.25071396362543213, + "flos": 26109274608000.0, + "grad_norm": 1.904216953279787, + "language_loss": 0.77927327, + "learning_rate": 3.510374083241361e-06, + "loss": 0.85716957, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19592285, + "step": 4170, + "time_per_iteration": 4.016170024871826 + }, + { + "auxiliary_loss_clip": 0.0651409, + "auxiliary_loss_mlp": 0.01278168, + "balance_loss_clip": 0.06291165, + "balance_loss_mlp": 0.01258975, + "epoch": 0.2507740868781001, + "flos": 19105008433920.0, + "grad_norm": 2.5077494433812966, + "language_loss": 0.76900339, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1920166, + "step": 4171, + "time_per_iteration": 2.5651609897613525 + }, + { + "auxiliary_loss_clip": 0.06406491, + "auxiliary_loss_mlp": 0.01262132, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01257083, + "epoch": 0.25083421013076806, + "flos": 64361652514560.0, + "grad_norm": 0.8214086964760371, + "language_loss": 0.6006844, + "learning_rate": 3.509863377145458e-06, + "loss": 0.67737067, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05047607, + "step": 4172, + "time_per_iteration": 3.1837103366851807 + }, + { + "auxiliary_loss_clip": 0.06520402, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.012603, + "epoch": 0.25089433338343603, + "flos": 24286430960640.0, + "grad_norm": 1.3489665028935822, + "language_loss": 0.79424238, + "learning_rate": 3.509607938211409e-06, + "loss": 0.87225777, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20849609, + "step": 4173, + "time_per_iteration": 2.6214826107025146 + }, + { + "auxiliary_loss_clip": 0.06513289, + "auxiliary_loss_mlp": 0.01273745, + "balance_loss_clip": 0.06291197, + "balance_loss_mlp": 0.01254398, + "epoch": 0.250954456636104, + "flos": 14726896600320.0, + "grad_norm": 1.8312177549547823, + "language_loss": 0.83930022, + "learning_rate": 3.509352442032875e-06, + "loss": 0.91717052, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19360352, + "step": 4174, + "time_per_iteration": 2.5973377227783203 + }, + { + "auxiliary_loss_clip": 0.06519122, + "auxiliary_loss_mlp": 0.0127901, + "balance_loss_clip": 0.0629285, + "balance_loss_mlp": 0.01259341, + "epoch": 0.25101457988877196, + "flos": 22280208652800.0, + "grad_norm": 2.088546315652338, + "language_loss": 0.71558678, + "learning_rate": 3.509096888619545e-06, + "loss": 0.79356813, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19665527, + "step": 4175, + "time_per_iteration": 2.6718719005584717 + }, + { + "auxiliary_loss_clip": 0.06522886, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06295571, + "balance_loss_mlp": 0.01256502, + "epoch": 0.2510747031414399, + "flos": 25195441979520.0, + "grad_norm": 1.9595604726907228, + "language_loss": 0.81335604, + "learning_rate": 3.50884127798111e-06, + "loss": 0.891361, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2109375, + "step": 4176, + "time_per_iteration": 2.5455691814422607 + }, + { + "auxiliary_loss_clip": 0.06515132, + "auxiliary_loss_mlp": 0.01279504, + "balance_loss_clip": 0.06292217, + "balance_loss_mlp": 0.01257319, + "epoch": 0.25113482639410795, + "flos": 20710455863040.0, + "grad_norm": 1.8805810902271358, + "language_loss": 0.83346581, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.91141224, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.22167969, + "step": 4177, + "time_per_iteration": 2.5471949577331543 + }, + { + "auxiliary_loss_clip": 0.06520942, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 0.06300486, + "balance_loss_mlp": 0.01256375, + "epoch": 0.2511949496467759, + "flos": 21513347285760.0, + "grad_norm": 2.081094632338002, + "language_loss": 0.83410418, + "learning_rate": 3.508329885067698e-06, + "loss": 0.91207987, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20251465, + "step": 4178, + "time_per_iteration": 2.5352370738983154 + }, + { + "auxiliary_loss_clip": 0.06514454, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01255949, + "epoch": 0.2512550728994439, + "flos": 20707898313600.0, + "grad_norm": 2.160080340734635, + "language_loss": 0.75744665, + "learning_rate": 3.508074102812112e-06, + "loss": 0.83533603, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.18554688, + "step": 4179, + "time_per_iteration": 2.560995578765869 + }, + { + "auxiliary_loss_clip": 0.0652363, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06298499, + "balance_loss_mlp": 0.01261053, + "epoch": 0.25131519615211184, + "flos": 18484531349760.0, + "grad_norm": 2.0850842878171347, + "language_loss": 0.70515448, + "learning_rate": 3.507818263370206e-06, + "loss": 0.78321338, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2121582, + "step": 4180, + "time_per_iteration": 2.510233163833618 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01275296, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.0125565, + "epoch": 0.2513753194047798, + "flos": 20491131000960.0, + "grad_norm": 1.8144815234901748, + "language_loss": 0.86591852, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.94378912, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19628906, + "step": 4181, + "time_per_iteration": 2.546736240386963 + }, + { + "auxiliary_loss_clip": 0.06519435, + "auxiliary_loss_mlp": 0.01276165, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01256555, + "epoch": 0.25143544265744777, + "flos": 37679182053120.0, + "grad_norm": 1.8572714108551465, + "language_loss": 0.68626046, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76421642, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19616699, + "step": 4182, + "time_per_iteration": 2.6632721424102783 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.0630056, + "balance_loss_mlp": 0.012679, + "epoch": 0.25149556591011574, + "flos": 69386502487680.0, + "grad_norm": 0.837431587640593, + "language_loss": 0.70118701, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.77799207, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.03881836, + "step": 4183, + "time_per_iteration": 3.194293737411499 + }, + { + "auxiliary_loss_clip": 0.0651418, + "auxiliary_loss_mlp": 0.01278526, + "balance_loss_clip": 0.06292195, + "balance_loss_mlp": 0.01258725, + "epoch": 0.2515556891627837, + "flos": 13995478310400.0, + "grad_norm": 2.4106350957321805, + "language_loss": 0.74627292, + "learning_rate": 3.506794333933431e-06, + "loss": 0.82419991, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.19799805, + "step": 4184, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.0652144, + "auxiliary_loss_mlp": 0.01279322, + "balance_loss_clip": 0.06299628, + "balance_loss_mlp": 0.01258496, + "epoch": 0.25161581241545167, + "flos": 22170022133760.0, + "grad_norm": 2.9216799071507964, + "language_loss": 0.83484751, + "learning_rate": 3.506538208705484e-06, + "loss": 0.91285515, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.20837402, + "step": 4185, + "time_per_iteration": 2.5535552501678467 + }, + { + "auxiliary_loss_clip": 0.06393237, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06284703, + "balance_loss_mlp": 0.01258632, + "epoch": 0.25167593566811963, + "flos": 69375936873600.0, + "grad_norm": 0.7619629684954553, + "language_loss": 0.61517715, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.69173163, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.03567505, + "step": 4186, + "time_per_iteration": 3.0749270915985107 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06296861, + "balance_loss_mlp": 0.01256946, + "epoch": 0.2517360589207876, + "flos": 13266533715840.0, + "grad_norm": 1.9855339768496567, + "language_loss": 0.79795682, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.87589443, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.1940918, + "step": 4187, + "time_per_iteration": 2.507354974746704 + }, + { + "auxiliary_loss_clip": 0.06517795, + "auxiliary_loss_mlp": 0.01276527, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01257001, + "epoch": 0.25179618217345556, + "flos": 20383208542080.0, + "grad_norm": 1.642205422551737, + "language_loss": 0.80147833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87942159, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4188, + "time_per_iteration": 2.5763680934906006 + }, + { + "auxiliary_loss_clip": 0.06512115, + "auxiliary_loss_mlp": 0.01281194, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261405, + "epoch": 0.25185630542612353, + "flos": 27670767770880.0, + "grad_norm": 1.9118309511671905, + "language_loss": 0.75198257, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.8299157, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 4189, + "time_per_iteration": 2.5764901638031006 + }, + { + "auxiliary_loss_clip": 0.06511948, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06296545, + "balance_loss_mlp": 0.01255253, + "epoch": 0.25191642867879155, + "flos": 21002805158400.0, + "grad_norm": 1.9652552730181423, + "language_loss": 0.84938216, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92722976, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17565918, + "step": 4190, + "time_per_iteration": 2.592289447784424 + }, + { + "auxiliary_loss_clip": 0.06519347, + "auxiliary_loss_mlp": 0.01277887, + "balance_loss_clip": 0.0629743, + "balance_loss_mlp": 0.01256513, + "epoch": 0.2519765519314595, + "flos": 21112027355520.0, + "grad_norm": 3.618444667756858, + "language_loss": 0.7581113, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.83608365, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21386719, + "step": 4191, + "time_per_iteration": 2.526263952255249 + }, + { + "auxiliary_loss_clip": 0.06391463, + "auxiliary_loss_mlp": 0.01256383, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252372, + "epoch": 0.2520366751841275, + "flos": 62765932158720.0, + "grad_norm": 0.7119135795788611, + "language_loss": 0.56952, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.64599848, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0401001, + "step": 4192, + "time_per_iteration": 3.271810531616211 + }, + { + "auxiliary_loss_clip": 0.06513695, + "auxiliary_loss_mlp": 0.01277171, + "balance_loss_clip": 0.06298056, + "balance_loss_mlp": 0.01257835, + "epoch": 0.25209679843679544, + "flos": 22236254386560.0, + "grad_norm": 1.9003966807864532, + "language_loss": 0.77017993, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84808856, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19335938, + "step": 4193, + "time_per_iteration": 2.57377028465271 + }, + { + "auxiliary_loss_clip": 0.06516427, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06290127, + "balance_loss_mlp": 0.01254573, + "epoch": 0.2521569216894634, + "flos": 12171502632960.0, + "grad_norm": 10.029516736128722, + "language_loss": 0.84954166, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.92744958, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19787598, + "step": 4194, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.06517825, + "auxiliary_loss_mlp": 0.01277837, + "balance_loss_clip": 0.06293463, + "balance_loss_mlp": 0.01258668, + "epoch": 0.2522170449421314, + "flos": 23707182885120.0, + "grad_norm": 1.454284137617771, + "language_loss": 0.88584, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.96379662, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19165039, + "step": 4195, + "time_per_iteration": 2.576735734939575 + }, + { + "auxiliary_loss_clip": 0.06516481, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01258258, + "epoch": 0.25227716819479934, + "flos": 20961073025280.0, + "grad_norm": 2.023401186655312, + "language_loss": 0.86073804, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93870831, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.22290039, + "step": 4196, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.06519768, + "auxiliary_loss_mlp": 0.01277786, + "balance_loss_clip": 0.06297043, + "balance_loss_mlp": 0.01258486, + "epoch": 0.2523372914474673, + "flos": 23338077649920.0, + "grad_norm": 1.7735111095668046, + "language_loss": 0.8382597, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.91623521, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19299316, + "step": 4197, + "time_per_iteration": 2.606966018676758 + }, + { + "auxiliary_loss_clip": 0.06523669, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06298, + "balance_loss_mlp": 0.01260898, + "epoch": 0.25239741470013527, + "flos": 36978217522560.0, + "grad_norm": 2.239450775339409, + "language_loss": 0.72922301, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.80727994, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.21130371, + "step": 4198, + "time_per_iteration": 2.6708526611328125 + }, + { + "auxiliary_loss_clip": 0.06527208, + "auxiliary_loss_mlp": 0.012804, + "balance_loss_clip": 0.06297485, + "balance_loss_mlp": 0.01258967, + "epoch": 0.25245753795280323, + "flos": 18521777289600.0, + "grad_norm": 2.0891954597653055, + "language_loss": 0.77475321, + "learning_rate": 3.50294646148888e-06, + "loss": 0.85282922, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.21447754, + "step": 4199, + "time_per_iteration": 3.9535269737243652 + }, + { + "auxiliary_loss_clip": 0.06522667, + "auxiliary_loss_mlp": 0.01277202, + "balance_loss_clip": 0.06296766, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2525176612054712, + "flos": 32353387741440.0, + "grad_norm": 1.7804914051128766, + "language_loss": 0.74169135, + "learning_rate": 3.502689480360739e-06, + "loss": 0.81969011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19714355, + "step": 4200, + "time_per_iteration": 2.637592315673828 + }, + { + "auxiliary_loss_clip": 0.06517747, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06294595, + "balance_loss_mlp": 0.01255602, + "epoch": 0.25257778445813917, + "flos": 45268440307200.0, + "grad_norm": 1.5897560976370495, + "language_loss": 0.82704282, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.90497398, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19775391, + "step": 4201, + "time_per_iteration": 2.740555763244629 + }, + { + "auxiliary_loss_clip": 0.06520839, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.06295383, + "balance_loss_mlp": 0.01259048, + "epoch": 0.25263790771080713, + "flos": 23374526976000.0, + "grad_norm": 1.712909977397354, + "language_loss": 0.75193971, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20361328, + "step": 4202, + "time_per_iteration": 2.55350661277771 + }, + { + "auxiliary_loss_clip": 0.06512797, + "auxiliary_loss_mlp": 0.01277812, + "balance_loss_clip": 0.06294158, + "balance_loss_mlp": 0.01258226, + "epoch": 0.25269803096347515, + "flos": 18520938748800.0, + "grad_norm": 3.10045167794265, + "language_loss": 0.73924601, + "learning_rate": 3.501918195122491e-06, + "loss": 0.81715208, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19592285, + "step": 4203, + "time_per_iteration": 2.539475917816162 + }, + { + "auxiliary_loss_clip": 0.06523657, + "auxiliary_loss_mlp": 0.01272979, + "balance_loss_clip": 0.0629805, + "balance_loss_mlp": 0.01252964, + "epoch": 0.2527581542161431, + "flos": 24617870985600.0, + "grad_norm": 1.4931409888350198, + "language_loss": 0.78306639, + "learning_rate": 3.501660986124297e-06, + "loss": 0.86103272, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20007324, + "step": 4204, + "time_per_iteration": 4.058368682861328 + }, + { + "auxiliary_loss_clip": 0.0651952, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06294288, + "balance_loss_mlp": 0.01258427, + "epoch": 0.2528182774688111, + "flos": 12646266266880.0, + "grad_norm": 2.5678524165435928, + "language_loss": 0.72629768, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.80427349, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19616699, + "step": 4205, + "time_per_iteration": 2.503054618835449 + }, + { + "auxiliary_loss_clip": 0.06508891, + "auxiliary_loss_mlp": 0.01281235, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01264164, + "epoch": 0.25287840072147905, + "flos": 46947331440000.0, + "grad_norm": 1.3326329418173375, + "language_loss": 0.76355231, + "learning_rate": 3.50114639730826e-06, + "loss": 0.84145361, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.17077637, + "step": 4206, + "time_per_iteration": 4.097341537475586 + }, + { + "auxiliary_loss_clip": 0.06516857, + "auxiliary_loss_mlp": 0.01278993, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.0126042, + "epoch": 0.252938523974147, + "flos": 18885641644800.0, + "grad_norm": 1.8849973173990275, + "language_loss": 0.79775047, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.875709, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18579102, + "step": 4207, + "time_per_iteration": 2.545203447341919 + }, + { + "auxiliary_loss_clip": 0.06511112, + "auxiliary_loss_mlp": 0.01280475, + "balance_loss_clip": 0.06293532, + "balance_loss_mlp": 0.01261628, + "epoch": 0.252998647226815, + "flos": 21441245247360.0, + "grad_norm": 1.449056492648579, + "language_loss": 0.76862776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84654361, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18859863, + "step": 4208, + "time_per_iteration": 2.540531873703003 + }, + { + "auxiliary_loss_clip": 0.06512551, + "auxiliary_loss_mlp": 0.01282266, + "balance_loss_clip": 0.06295963, + "balance_loss_mlp": 0.01264098, + "epoch": 0.25305877047948294, + "flos": 25448365128960.0, + "grad_norm": 1.8025422596027827, + "language_loss": 0.70108622, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77903438, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.1817627, + "step": 4209, + "time_per_iteration": 2.586179256439209 + }, + { + "auxiliary_loss_clip": 0.06401253, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06294125, + "balance_loss_mlp": 0.01251663, + "epoch": 0.2531188937321509, + "flos": 60205213457280.0, + "grad_norm": 0.7328516672129679, + "language_loss": 0.55096745, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.62754166, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.0451355, + "step": 4210, + "time_per_iteration": 4.676252841949463 + }, + { + "auxiliary_loss_clip": 0.06515378, + "auxiliary_loss_mlp": 0.01285614, + "balance_loss_clip": 0.06294395, + "balance_loss_mlp": 0.01265861, + "epoch": 0.25317901698481887, + "flos": 19688449213440.0, + "grad_norm": 2.0935195986224837, + "language_loss": 0.81166065, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.88967055, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19763184, + "step": 4211, + "time_per_iteration": 2.5251474380493164 + }, + { + "auxiliary_loss_clip": 0.06513076, + "auxiliary_loss_mlp": 0.01275756, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01258554, + "epoch": 0.25323914023748684, + "flos": 24431012380800.0, + "grad_norm": 1.7184165713115493, + "language_loss": 0.78543985, + "learning_rate": 3.499601265005622e-06, + "loss": 0.86332822, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4212, + "time_per_iteration": 2.609750986099243 + }, + { + "auxiliary_loss_clip": 0.06514729, + "auxiliary_loss_mlp": 0.01278491, + "balance_loss_clip": 0.06293602, + "balance_loss_mlp": 0.0125912, + "epoch": 0.2532992634901548, + "flos": 25454528403840.0, + "grad_norm": 1.862422609084939, + "language_loss": 0.53407073, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.61200291, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19384766, + "step": 4213, + "time_per_iteration": 2.5825159549713135 + }, + { + "auxiliary_loss_clip": 0.06517738, + "auxiliary_loss_mlp": 0.01278881, + "balance_loss_clip": 0.06296406, + "balance_loss_mlp": 0.01259832, + "epoch": 0.25335938674282277, + "flos": 18886605966720.0, + "grad_norm": 2.428420926128805, + "language_loss": 0.65041012, + "learning_rate": 3.499085765880308e-06, + "loss": 0.72837627, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19030762, + "step": 4214, + "time_per_iteration": 2.567539930343628 + }, + { + "auxiliary_loss_clip": 0.06391697, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01253702, + "epoch": 0.25341950999549073, + "flos": 53079692025600.0, + "grad_norm": 0.8253897319773601, + "language_loss": 0.57886475, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.65535849, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.03970337, + "step": 4215, + "time_per_iteration": 2.941021680831909 + }, + { + "auxiliary_loss_clip": 0.06512114, + "auxiliary_loss_mlp": 0.01274398, + "balance_loss_clip": 0.0629489, + "balance_loss_mlp": 0.0125604, + "epoch": 0.2534796332481587, + "flos": 39029609980800.0, + "grad_norm": 1.6071125602920209, + "language_loss": 0.84078032, + "learning_rate": 3.498570039373066e-06, + "loss": 0.9186455, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18359375, + "step": 4216, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.06509562, + "auxiliary_loss_mlp": 0.0127764, + "balance_loss_clip": 0.06290903, + "balance_loss_mlp": 0.01259294, + "epoch": 0.2535397565008267, + "flos": 23593809911040.0, + "grad_norm": 1.7865601815504963, + "language_loss": 0.81036615, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88823819, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.18371582, + "step": 4217, + "time_per_iteration": 2.5606398582458496 + }, + { + "auxiliary_loss_clip": 0.06514265, + "auxiliary_loss_mlp": 0.01279769, + "balance_loss_clip": 0.06294704, + "balance_loss_mlp": 0.01260255, + "epoch": 0.2535998797534947, + "flos": 19287422772480.0, + "grad_norm": 2.529157470409933, + "language_loss": 0.761132, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.83907235, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19519043, + "step": 4218, + "time_per_iteration": 2.623429298400879 + }, + { + "auxiliary_loss_clip": 0.06516235, + "auxiliary_loss_mlp": 0.01282224, + "balance_loss_clip": 0.06296211, + "balance_loss_mlp": 0.01262757, + "epoch": 0.25366000300616265, + "flos": 24031201824000.0, + "grad_norm": 1.721807278316132, + "language_loss": 0.75063616, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.82862079, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19482422, + "step": 4219, + "time_per_iteration": 2.564220428466797 + }, + { + "auxiliary_loss_clip": 0.06520407, + "auxiliary_loss_mlp": 0.0127968, + "balance_loss_clip": 0.06298073, + "balance_loss_mlp": 0.01259713, + "epoch": 0.2537201262588306, + "flos": 16294888454400.0, + "grad_norm": 1.6804083546431516, + "language_loss": 0.81834626, + "learning_rate": 3.497537904525736e-06, + "loss": 0.89634717, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19970703, + "step": 4220, + "time_per_iteration": 2.576335906982422 + }, + { + "auxiliary_loss_clip": 0.0652357, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.06301299, + "balance_loss_mlp": 0.01256936, + "epoch": 0.2537802495114986, + "flos": 23301376761600.0, + "grad_norm": 2.4535775533256796, + "language_loss": 0.71752739, + "learning_rate": 3.497279728822468e-06, + "loss": 0.79551834, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18579102, + "step": 4221, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.06528511, + "auxiliary_loss_mlp": 0.01279389, + "balance_loss_clip": 0.0630452, + "balance_loss_mlp": 0.01259148, + "epoch": 0.25384037276416654, + "flos": 17644855184640.0, + "grad_norm": 1.5017476973585115, + "language_loss": 0.62507772, + "learning_rate": 3.497021496342202e-06, + "loss": 0.70315671, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20239258, + "step": 4222, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.06520825, + "auxiliary_loss_mlp": 0.01278393, + "balance_loss_clip": 0.06297866, + "balance_loss_mlp": 0.0125864, + "epoch": 0.2539004960168345, + "flos": 21513473066880.0, + "grad_norm": 1.6064438591236823, + "language_loss": 0.75066334, + "learning_rate": 3.496763207094731e-06, + "loss": 0.82865554, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19763184, + "step": 4223, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.06514867, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06297616, + "balance_loss_mlp": 0.01260101, + "epoch": 0.2539606192695025, + "flos": 23957632339200.0, + "grad_norm": 1.753259760034452, + "language_loss": 0.80341679, + "learning_rate": 3.49650486108985e-06, + "loss": 0.88134897, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18261719, + "step": 4224, + "time_per_iteration": 2.6002583503723145 + }, + { + "auxiliary_loss_clip": 0.06515887, + "auxiliary_loss_mlp": 0.01281311, + "balance_loss_clip": 0.0629767, + "balance_loss_mlp": 0.01261999, + "epoch": 0.25402074252217044, + "flos": 24176537930880.0, + "grad_norm": 1.4707313275482783, + "language_loss": 0.78211224, + "learning_rate": 3.496246458337354e-06, + "loss": 0.8600843, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19299316, + "step": 4225, + "time_per_iteration": 2.5527138710021973 + }, + { + "auxiliary_loss_clip": 0.06521728, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01263013, + "epoch": 0.2540808657748384, + "flos": 22309320746880.0, + "grad_norm": 1.6188569007516582, + "language_loss": 0.85543132, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.93347526, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.1965332, + "step": 4226, + "time_per_iteration": 2.5676872730255127 + }, + { + "auxiliary_loss_clip": 0.06515788, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 0.06296097, + "balance_loss_mlp": 0.01258883, + "epoch": 0.25414098902750637, + "flos": 27606883432320.0, + "grad_norm": 1.6805883261517605, + "language_loss": 0.71414381, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79207766, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18713379, + "step": 4227, + "time_per_iteration": 2.5918691158294678 + }, + { + "auxiliary_loss_clip": 0.06387169, + "auxiliary_loss_mlp": 0.01261576, + "balance_loss_clip": 0.06279954, + "balance_loss_mlp": 0.01257166, + "epoch": 0.25420111228017434, + "flos": 58188760951680.0, + "grad_norm": 0.9697801274632529, + "language_loss": 0.61857057, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.69505799, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04418945, + "step": 4228, + "time_per_iteration": 3.01169490814209 + }, + { + "auxiliary_loss_clip": 0.06514917, + "auxiliary_loss_mlp": 0.01279347, + "balance_loss_clip": 0.0629469, + "balance_loss_mlp": 0.01258235, + "epoch": 0.2542612355328423, + "flos": 11467645136640.0, + "grad_norm": 2.3876652287650577, + "language_loss": 0.8721081, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.95005071, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21130371, + "step": 4229, + "time_per_iteration": 2.5960769653320312 + }, + { + "auxiliary_loss_clip": 0.06519967, + "auxiliary_loss_mlp": 0.01277589, + "balance_loss_clip": 0.06299049, + "balance_loss_mlp": 0.01257836, + "epoch": 0.2543213587855103, + "flos": 22972452359040.0, + "grad_norm": 2.100172466954555, + "language_loss": 0.78119314, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85916877, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19775391, + "step": 4230, + "time_per_iteration": 2.5483899116516113 + }, + { + "auxiliary_loss_clip": 0.06511904, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 0.06292608, + "balance_loss_mlp": 0.01257622, + "epoch": 0.2543814820381783, + "flos": 18257953109760.0, + "grad_norm": 2.00545114565419, + "language_loss": 0.75687885, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.83477509, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4231, + "time_per_iteration": 2.566326379776001 + }, + { + "auxiliary_loss_clip": 0.06520282, + "auxiliary_loss_mlp": 0.01278584, + "balance_loss_clip": 0.06300422, + "balance_loss_mlp": 0.01259761, + "epoch": 0.25444160529084625, + "flos": 15638129752320.0, + "grad_norm": 1.7887257039808522, + "language_loss": 0.74637282, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82436144, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18823242, + "step": 4232, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0652221, + "auxiliary_loss_mlp": 0.01293975, + "balance_loss_clip": 0.06303085, + "balance_loss_mlp": 0.01272947, + "epoch": 0.2545017285435142, + "flos": 24607431152640.0, + "grad_norm": 1.8617746927090988, + "language_loss": 0.87183899, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.95000088, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21032715, + "step": 4233, + "time_per_iteration": 2.6281485557556152 + }, + { + "auxiliary_loss_clip": 0.06505871, + "auxiliary_loss_mlp": 0.01278753, + "balance_loss_clip": 0.06294682, + "balance_loss_mlp": 0.01260442, + "epoch": 0.2545618517961822, + "flos": 24685654538880.0, + "grad_norm": 1.601433299567329, + "language_loss": 0.75604707, + "learning_rate": 3.493918281539737e-06, + "loss": 0.8338933, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18322754, + "step": 4234, + "time_per_iteration": 2.596642017364502 + }, + { + "auxiliary_loss_clip": 0.06514844, + "auxiliary_loss_mlp": 0.01287463, + "balance_loss_clip": 0.06292339, + "balance_loss_mlp": 0.01268938, + "epoch": 0.25462197504885015, + "flos": 23921937699840.0, + "grad_norm": 1.4560099290474922, + "language_loss": 0.75372213, + "learning_rate": 3.493659311850379e-06, + "loss": 0.83174521, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18518066, + "step": 4235, + "time_per_iteration": 2.592942953109741 + }, + { + "auxiliary_loss_clip": 0.06532556, + "auxiliary_loss_mlp": 0.01283911, + "balance_loss_clip": 0.06299181, + "balance_loss_mlp": 0.01261797, + "epoch": 0.2546820983015181, + "flos": 24796134547200.0, + "grad_norm": 1.9414760170646592, + "language_loss": 0.65519691, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.73336154, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22131348, + "step": 4236, + "time_per_iteration": 2.5583407878875732 + }, + { + "auxiliary_loss_clip": 0.06512251, + "auxiliary_loss_mlp": 0.01281938, + "balance_loss_clip": 0.06294776, + "balance_loss_mlp": 0.01262984, + "epoch": 0.2547422215541861, + "flos": 18740095902720.0, + "grad_norm": 1.5016735811799797, + "language_loss": 0.678509, + "learning_rate": 3.493141202562354e-06, + "loss": 0.75645095, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18945312, + "step": 4237, + "time_per_iteration": 2.5650389194488525 + }, + { + "auxiliary_loss_clip": 0.0651492, + "auxiliary_loss_mlp": 0.01282053, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01261394, + "epoch": 0.25480234480685404, + "flos": 21038751360000.0, + "grad_norm": 2.061881611294133, + "language_loss": 0.75628269, + "learning_rate": 3.492882062983333e-06, + "loss": 0.83425242, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20654297, + "step": 4238, + "time_per_iteration": 2.529883861541748 + }, + { + "auxiliary_loss_clip": 0.06513957, + "auxiliary_loss_mlp": 0.0127785, + "balance_loss_clip": 0.06292559, + "balance_loss_mlp": 0.01258287, + "epoch": 0.254862468059522, + "flos": 25089112748160.0, + "grad_norm": 1.8905919191970875, + "language_loss": 0.81253731, + "learning_rate": 3.492622866794074e-06, + "loss": 0.89045537, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19555664, + "step": 4239, + "time_per_iteration": 4.02100944519043 + }, + { + "auxiliary_loss_clip": 0.06508629, + "auxiliary_loss_mlp": 0.01294237, + "balance_loss_clip": 0.06291452, + "balance_loss_mlp": 0.01273471, + "epoch": 0.25492259131219, + "flos": 20564658558720.0, + "grad_norm": 1.7183169382614727, + "language_loss": 0.7800405, + "learning_rate": 3.492363614004407e-06, + "loss": 0.85806918, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2076416, + "step": 4240, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.06515411, + "auxiliary_loss_mlp": 0.01282684, + "balance_loss_clip": 0.06290809, + "balance_loss_mlp": 0.01262037, + "epoch": 0.25498271456485794, + "flos": 25048889988480.0, + "grad_norm": 1.7684080721058644, + "language_loss": 0.83764112, + "learning_rate": 3.492104304624162e-06, + "loss": 0.915622, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.20629883, + "step": 4241, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.06511963, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 0.06292334, + "balance_loss_mlp": 0.01262676, + "epoch": 0.2550428378175259, + "flos": 26185820912640.0, + "grad_norm": 1.7847215082139707, + "language_loss": 0.73873413, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.81667781, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4242, + "time_per_iteration": 2.6289515495300293 + }, + { + "auxiliary_loss_clip": 0.06517772, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 0.06296564, + "balance_loss_mlp": 0.01260398, + "epoch": 0.2551029610701939, + "flos": 15272420607360.0, + "grad_norm": 2.4567533637161896, + "language_loss": 0.72771823, + "learning_rate": 3.491585516131273e-06, + "loss": 0.80569565, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19567871, + "step": 4243, + "time_per_iteration": 3.9432499408721924 + }, + { + "auxiliary_loss_clip": 0.06515735, + "auxiliary_loss_mlp": 0.0127996, + "balance_loss_clip": 0.06295779, + "balance_loss_mlp": 0.01260195, + "epoch": 0.2551630843228619, + "flos": 18117774028800.0, + "grad_norm": 1.7474968125895491, + "language_loss": 0.82239074, + "learning_rate": 3.491326037038301e-06, + "loss": 0.90034771, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4244, + "time_per_iteration": 2.6024672985076904 + }, + { + "auxiliary_loss_clip": 0.06397872, + "auxiliary_loss_mlp": 0.01258297, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01253388, + "epoch": 0.25522320757552985, + "flos": 70543055266560.0, + "grad_norm": 0.6771353060664416, + "language_loss": 0.57579219, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.65235388, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04904175, + "step": 4245, + "time_per_iteration": 4.687421083450317 + }, + { + "auxiliary_loss_clip": 0.06516664, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 0.06290803, + "balance_loss_mlp": 0.01256628, + "epoch": 0.2552833308281978, + "flos": 22899679488000.0, + "grad_norm": 2.827648139992037, + "language_loss": 0.65781415, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20593262, + "step": 4246, + "time_per_iteration": 2.542945384979248 + }, + { + "auxiliary_loss_clip": 0.06504452, + "auxiliary_loss_mlp": 0.01278422, + "balance_loss_clip": 0.06290503, + "balance_loss_mlp": 0.01258455, + "epoch": 0.2553434540808658, + "flos": 22060003322880.0, + "grad_norm": 2.2137811054544003, + "language_loss": 0.82470047, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.90252924, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19970703, + "step": 4247, + "time_per_iteration": 2.5786685943603516 + }, + { + "auxiliary_loss_clip": 0.06521233, + "auxiliary_loss_mlp": 0.01279993, + "balance_loss_clip": 0.062906, + "balance_loss_mlp": 0.01257271, + "epoch": 0.25540357733353375, + "flos": 16549656393600.0, + "grad_norm": 2.135954108256579, + "language_loss": 0.83991635, + "learning_rate": 3.490287555252514e-06, + "loss": 0.91792852, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.22729492, + "step": 4248, + "time_per_iteration": 2.5408127307891846 + }, + { + "auxiliary_loss_clip": 0.06511332, + "auxiliary_loss_mlp": 0.01273979, + "balance_loss_clip": 0.062884, + "balance_loss_mlp": 0.01253773, + "epoch": 0.2554637005862017, + "flos": 17570531013120.0, + "grad_norm": 2.3193810219262585, + "language_loss": 0.84631854, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92417163, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.20202637, + "step": 4249, + "time_per_iteration": 4.003984212875366 + }, + { + "auxiliary_loss_clip": 0.06380495, + "auxiliary_loss_mlp": 0.01253384, + "balance_loss_clip": 0.06276014, + "balance_loss_mlp": 0.01249388, + "epoch": 0.2555238238388697, + "flos": 72263441698560.0, + "grad_norm": 0.7365466774710785, + "language_loss": 0.56168175, + "learning_rate": 3.489767975249115e-06, + "loss": 0.63802058, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03994751, + "step": 4250, + "time_per_iteration": 3.169614553451538 + }, + { + "auxiliary_loss_clip": 0.06511974, + "auxiliary_loss_mlp": 0.01277356, + "balance_loss_clip": 0.06289789, + "balance_loss_mlp": 0.01255433, + "epoch": 0.25558394709153764, + "flos": 24396323990400.0, + "grad_norm": 2.4378887831258527, + "language_loss": 0.81129342, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.88918668, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21936035, + "step": 4251, + "time_per_iteration": 2.576631784439087 + }, + { + "auxiliary_loss_clip": 0.06382731, + "auxiliary_loss_mlp": 0.01258719, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01254794, + "epoch": 0.2556440703442056, + "flos": 69251857776000.0, + "grad_norm": 0.7756464213587903, + "language_loss": 0.66132653, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.73774105, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03921509, + "step": 4252, + "time_per_iteration": 3.2080140113830566 + }, + { + "auxiliary_loss_clip": 0.06505658, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 0.06288829, + "balance_loss_mlp": 0.01255922, + "epoch": 0.2557041935968736, + "flos": 24870919916160.0, + "grad_norm": 1.8769862610793295, + "language_loss": 0.74028432, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.81808746, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18737793, + "step": 4253, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.06509089, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.01261746, + "epoch": 0.25576431684954154, + "flos": 22498694974080.0, + "grad_norm": 4.507455095580577, + "language_loss": 0.742535, + "learning_rate": 3.488728137415357e-06, + "loss": 0.82045132, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20800781, + "step": 4254, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.0651402, + "auxiliary_loss_mlp": 0.0127796, + "balance_loss_clip": 0.06292839, + "balance_loss_mlp": 0.01257253, + "epoch": 0.2558244401022095, + "flos": 19832569436160.0, + "grad_norm": 1.7853658258569405, + "language_loss": 0.81599152, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20703125, + "step": 4255, + "time_per_iteration": 2.5198400020599365 + }, + { + "auxiliary_loss_clip": 0.06507239, + "auxiliary_loss_mlp": 0.01282593, + "balance_loss_clip": 0.06290218, + "balance_loss_mlp": 0.01262304, + "epoch": 0.2558845633548775, + "flos": 23226968736000.0, + "grad_norm": 1.3889535500711463, + "language_loss": 0.85781598, + "learning_rate": 3.488207879742721e-06, + "loss": 0.93571424, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20275879, + "step": 4256, + "time_per_iteration": 2.6466193199157715 + }, + { + "auxiliary_loss_clip": 0.06518268, + "auxiliary_loss_mlp": 0.01279996, + "balance_loss_clip": 0.06292354, + "balance_loss_mlp": 0.01259432, + "epoch": 0.2559446866075455, + "flos": 16843682770560.0, + "grad_norm": 2.0395659723156814, + "language_loss": 0.75505483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.83303738, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20556641, + "step": 4257, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.06380453, + "auxiliary_loss_mlp": 0.01254162, + "balance_loss_clip": 0.06277193, + "balance_loss_mlp": 0.01249772, + "epoch": 0.25600480986021346, + "flos": 57612741258240.0, + "grad_norm": 0.7838298602570629, + "language_loss": 0.65205377, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.72839993, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04397583, + "step": 4258, + "time_per_iteration": 3.1310055255889893 + }, + { + "auxiliary_loss_clip": 0.06504042, + "auxiliary_loss_mlp": 0.01278745, + "balance_loss_clip": 0.06291071, + "balance_loss_mlp": 0.01257192, + "epoch": 0.2560649331128814, + "flos": 27827088762240.0, + "grad_norm": 1.6413095395992356, + "language_loss": 0.76769841, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2154541, + "step": 4259, + "time_per_iteration": 2.6200387477874756 + }, + { + "auxiliary_loss_clip": 0.06386054, + "auxiliary_loss_mlp": 0.01255029, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01250824, + "epoch": 0.2561250563655494, + "flos": 70972187552640.0, + "grad_norm": 0.7732791072218576, + "language_loss": 0.58378285, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.66019368, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04208374, + "step": 4260, + "time_per_iteration": 3.2671031951904297 + }, + { + "auxiliary_loss_clip": 0.06510498, + "auxiliary_loss_mlp": 0.01277826, + "balance_loss_clip": 0.06290598, + "balance_loss_mlp": 0.0125824, + "epoch": 0.25618517961821735, + "flos": 27018998386560.0, + "grad_norm": 1.6762593333812295, + "language_loss": 0.77063274, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84851599, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19580078, + "step": 4261, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.06510883, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261534, + "epoch": 0.2562453028708853, + "flos": 23073708418560.0, + "grad_norm": 1.5026397479094624, + "language_loss": 0.83196223, + "learning_rate": 3.486645752648842e-06, + "loss": 0.90988725, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20080566, + "step": 4262, + "time_per_iteration": 2.606386661529541 + }, + { + "auxiliary_loss_clip": 0.06520962, + "auxiliary_loss_mlp": 0.01278022, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.0125778, + "epoch": 0.2563054261235533, + "flos": 15126120178560.0, + "grad_norm": 2.976746783245639, + "language_loss": 0.7460134, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.82400322, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20239258, + "step": 4263, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.06511976, + "auxiliary_loss_mlp": 0.01275308, + "balance_loss_clip": 0.0629802, + "balance_loss_mlp": 0.01256008, + "epoch": 0.25636554937622125, + "flos": 27862238350080.0, + "grad_norm": 1.7189236473805392, + "language_loss": 0.83209884, + "learning_rate": 3.486124592522163e-06, + "loss": 0.90997171, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19299316, + "step": 4264, + "time_per_iteration": 2.5768978595733643 + }, + { + "auxiliary_loss_clip": 0.06522107, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06300539, + "balance_loss_mlp": 0.01255403, + "epoch": 0.2564256726288892, + "flos": 28912979750400.0, + "grad_norm": 2.7518222985569247, + "language_loss": 0.75264466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20202637, + "step": 4265, + "time_per_iteration": 2.6022770404815674 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01276084, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01256701, + "epoch": 0.2564857958815572, + "flos": 18520812967680.0, + "grad_norm": 2.7205564726060754, + "language_loss": 0.82059085, + "learning_rate": 3.485603206979513e-06, + "loss": 0.89849925, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19396973, + "step": 4266, + "time_per_iteration": 2.5768039226531982 + }, + { + "auxiliary_loss_clip": 0.06513181, + "auxiliary_loss_mlp": 0.01282165, + "balance_loss_clip": 0.06295994, + "balance_loss_mlp": 0.01263199, + "epoch": 0.25654591913422514, + "flos": 25814745106560.0, + "grad_norm": 2.256505464235654, + "language_loss": 0.79590619, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8738597, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.1895752, + "step": 4267, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.06512932, + "auxiliary_loss_mlp": 0.01282882, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01263439, + "epoch": 0.2566060423868931, + "flos": 19105805047680.0, + "grad_norm": 1.7450924080459818, + "language_loss": 0.79543281, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.87339091, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19421387, + "step": 4268, + "time_per_iteration": 2.532245635986328 + }, + { + "auxiliary_loss_clip": 0.06515032, + "auxiliary_loss_mlp": 0.01281336, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01261166, + "epoch": 0.25666616563956113, + "flos": 23849584099200.0, + "grad_norm": 1.6329297187056233, + "language_loss": 0.69106698, + "learning_rate": 3.484820706183595e-06, + "loss": 0.76903057, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.20153809, + "step": 4269, + "time_per_iteration": 2.7064032554626465 + }, + { + "auxiliary_loss_clip": 0.06520134, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01259016, + "epoch": 0.2567262888922291, + "flos": 14608366600320.0, + "grad_norm": 2.976489070793836, + "language_loss": 0.79361498, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8716023, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19580078, + "step": 4270, + "time_per_iteration": 2.5247366428375244 + }, + { + "auxiliary_loss_clip": 0.06528008, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.0630113, + "balance_loss_mlp": 0.0125899, + "epoch": 0.25678641214489706, + "flos": 32930791027200.0, + "grad_norm": 2.0785991894062104, + "language_loss": 0.68438745, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.76248461, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22717285, + "step": 4271, + "time_per_iteration": 2.6327364444732666 + }, + { + "auxiliary_loss_clip": 0.06521121, + "auxiliary_loss_mlp": 0.01277495, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01256395, + "epoch": 0.256846535397565, + "flos": 24106029120000.0, + "grad_norm": 1.3298745054932861, + "language_loss": 0.87827712, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.9562633, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2109375, + "step": 4272, + "time_per_iteration": 2.5886576175689697 + }, + { + "auxiliary_loss_clip": 0.06520741, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 0.06299604, + "balance_loss_mlp": 0.01256204, + "epoch": 0.256906658650233, + "flos": 19724437342080.0, + "grad_norm": 1.6471317846086577, + "language_loss": 0.8228811, + "learning_rate": 3.483776583571541e-06, + "loss": 0.90087312, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.22253418, + "step": 4273, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.06513067, + "auxiliary_loss_mlp": 0.0127658, + "balance_loss_clip": 0.06299708, + "balance_loss_mlp": 0.01257638, + "epoch": 0.25696678190290095, + "flos": 22932019964160.0, + "grad_norm": 1.4706338186359442, + "language_loss": 0.77439249, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.85228896, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18933105, + "step": 4274, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06508841, + "auxiliary_loss_mlp": 0.01274973, + "balance_loss_clip": 0.06295496, + "balance_loss_mlp": 0.0125435, + "epoch": 0.2570269051555689, + "flos": 27315163042560.0, + "grad_norm": 1.5809391622925344, + "language_loss": 0.84101403, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91885215, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20629883, + "step": 4275, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.0652002, + "auxiliary_loss_mlp": 0.01273541, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01252965, + "epoch": 0.2570870284082369, + "flos": 27570811449600.0, + "grad_norm": 2.3295240533415016, + "language_loss": 0.78590673, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.86384231, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4276, + "time_per_iteration": 2.631866216659546 + }, + { + "auxiliary_loss_clip": 0.06515533, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06298599, + "balance_loss_mlp": 0.01260237, + "epoch": 0.25714715166090485, + "flos": 28738405768320.0, + "grad_norm": 1.6396366021430353, + "language_loss": 0.79803967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8759945, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19714355, + "step": 4277, + "time_per_iteration": 2.5990161895751953 + }, + { + "auxiliary_loss_clip": 0.06513472, + "auxiliary_loss_mlp": 0.01272259, + "balance_loss_clip": 0.06296529, + "balance_loss_mlp": 0.01254377, + "epoch": 0.2572072749135728, + "flos": 20121606495360.0, + "grad_norm": 1.9596681746733369, + "language_loss": 0.78998482, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17883301, + "step": 4278, + "time_per_iteration": 4.02304744720459 + }, + { + "auxiliary_loss_clip": 0.06522302, + "auxiliary_loss_mlp": 0.01278536, + "balance_loss_clip": 0.06301469, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2572673981662408, + "flos": 26037969183360.0, + "grad_norm": 2.3063853220673067, + "language_loss": 0.75400203, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83201039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21618652, + "step": 4279, + "time_per_iteration": 2.5523123741149902 + }, + { + "auxiliary_loss_clip": 0.06516609, + "auxiliary_loss_mlp": 0.0128394, + "balance_loss_clip": 0.06297271, + "balance_loss_mlp": 0.01262721, + "epoch": 0.25732752141890874, + "flos": 16112054845440.0, + "grad_norm": 3.423283610494841, + "language_loss": 0.85997081, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.9379763, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2121582, + "step": 4280, + "time_per_iteration": 2.5104546546936035 + }, + { + "auxiliary_loss_clip": 0.06517641, + "auxiliary_loss_mlp": 0.01282108, + "balance_loss_clip": 0.06295675, + "balance_loss_mlp": 0.0126133, + "epoch": 0.2573876446715767, + "flos": 22530322690560.0, + "grad_norm": 2.5830483171875955, + "language_loss": 0.78735828, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.86535579, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20788574, + "step": 4281, + "time_per_iteration": 2.511723279953003 + }, + { + "auxiliary_loss_clip": 0.06512952, + "auxiliary_loss_mlp": 0.01285256, + "balance_loss_clip": 0.06294534, + "balance_loss_mlp": 0.01264048, + "epoch": 0.2574477679242447, + "flos": 23957548485120.0, + "grad_norm": 1.8266556980022217, + "language_loss": 0.87782013, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.9558022, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.21203613, + "step": 4282, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06509817, + "auxiliary_loss_mlp": 0.0128236, + "balance_loss_clip": 0.06291438, + "balance_loss_mlp": 0.01262905, + "epoch": 0.2575078911769127, + "flos": 21988278627840.0, + "grad_norm": 1.3881538001933933, + "language_loss": 0.71042287, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.78834462, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19470215, + "step": 4283, + "time_per_iteration": 3.9826109409332275 + }, + { + "auxiliary_loss_clip": 0.06500088, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 0.06290558, + "balance_loss_mlp": 0.01271051, + "epoch": 0.25756801442958066, + "flos": 21951997009920.0, + "grad_norm": 1.9398744879334104, + "language_loss": 0.80991805, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.88781703, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18762207, + "step": 4284, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.06508928, + "auxiliary_loss_mlp": 0.01294414, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01274923, + "epoch": 0.2576281376822486, + "flos": 35270675493120.0, + "grad_norm": 2.158245566426343, + "language_loss": 0.70814562, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.78617907, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19494629, + "step": 4285, + "time_per_iteration": 4.088344097137451 + }, + { + "auxiliary_loss_clip": 0.06504595, + "auxiliary_loss_mlp": 0.0128171, + "balance_loss_clip": 0.06288387, + "balance_loss_mlp": 0.01262505, + "epoch": 0.2576882609349166, + "flos": 14136705567360.0, + "grad_norm": 1.771877130646751, + "language_loss": 0.58818436, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.66604745, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.1920166, + "step": 4286, + "time_per_iteration": 2.5344176292419434 + }, + { + "auxiliary_loss_clip": 0.0650837, + "auxiliary_loss_mlp": 0.01278621, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01259118, + "epoch": 0.25774838418758456, + "flos": 23265053216640.0, + "grad_norm": 2.057811055203196, + "language_loss": 0.6464054, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72427529, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19494629, + "step": 4287, + "time_per_iteration": 2.5958328247070312 + }, + { + "auxiliary_loss_clip": 0.0650748, + "auxiliary_loss_mlp": 0.01286721, + "balance_loss_clip": 0.06287187, + "balance_loss_mlp": 0.01265824, + "epoch": 0.2578085074402525, + "flos": 22608378368640.0, + "grad_norm": 1.9946373780944937, + "language_loss": 0.7222265, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.80016851, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2088623, + "step": 4288, + "time_per_iteration": 2.5767109394073486 + }, + { + "auxiliary_loss_clip": 0.06504134, + "auxiliary_loss_mlp": 0.01288175, + "balance_loss_clip": 0.06288374, + "balance_loss_mlp": 0.01268851, + "epoch": 0.2578686306929205, + "flos": 24578780256000.0, + "grad_norm": 1.4737569046844996, + "language_loss": 0.77657092, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.85449398, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1932373, + "step": 4289, + "time_per_iteration": 3.9734480381011963 + }, + { + "auxiliary_loss_clip": 0.0651005, + "auxiliary_loss_mlp": 0.01285951, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012641, + "epoch": 0.25792875394558845, + "flos": 18119828453760.0, + "grad_norm": 2.192134211179858, + "language_loss": 0.8580482, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.93600821, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.21838379, + "step": 4290, + "time_per_iteration": 2.5564229488372803 + }, + { + "auxiliary_loss_clip": 0.0651224, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 0.06293762, + "balance_loss_mlp": 0.01263573, + "epoch": 0.2579888771982564, + "flos": 17718760085760.0, + "grad_norm": 2.0247866667145344, + "language_loss": 0.73390263, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81186378, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.20300293, + "step": 4291, + "time_per_iteration": 2.497671365737915 + }, + { + "auxiliary_loss_clip": 0.06508101, + "auxiliary_loss_mlp": 0.01275245, + "balance_loss_clip": 0.06287237, + "balance_loss_mlp": 0.01255647, + "epoch": 0.2580490004509244, + "flos": 16440350342400.0, + "grad_norm": 2.23272675200871, + "language_loss": 0.82139969, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.8992331, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19604492, + "step": 4292, + "time_per_iteration": 2.5467498302459717 + }, + { + "auxiliary_loss_clip": 0.06505652, + "auxiliary_loss_mlp": 0.01282583, + "balance_loss_clip": 0.06289525, + "balance_loss_mlp": 0.01262532, + "epoch": 0.25810912370359235, + "flos": 33842946574080.0, + "grad_norm": 1.9023591833174374, + "language_loss": 0.67644775, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.7543301, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20043945, + "step": 4293, + "time_per_iteration": 2.626880168914795 + }, + { + "auxiliary_loss_clip": 0.06507371, + "auxiliary_loss_mlp": 0.01275889, + "balance_loss_clip": 0.06295517, + "balance_loss_mlp": 0.01257244, + "epoch": 0.2581692469562603, + "flos": 25199257340160.0, + "grad_norm": 2.9603548878770387, + "language_loss": 0.76158464, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83941722, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18640137, + "step": 4294, + "time_per_iteration": 2.5711581707000732 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06293358, + "balance_loss_mlp": 0.01257866, + "epoch": 0.2582293702089283, + "flos": 34940619060480.0, + "grad_norm": 2.382767918587226, + "language_loss": 0.81769538, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8955487, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1920166, + "step": 4295, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.06519823, + "auxiliary_loss_mlp": 0.01276702, + "balance_loss_clip": 0.06295969, + "balance_loss_mlp": 0.01256496, + "epoch": 0.2582894934615963, + "flos": 26841028314240.0, + "grad_norm": 1.964012337767824, + "language_loss": 0.72949934, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80746454, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.20214844, + "step": 4296, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06514452, + "auxiliary_loss_mlp": 0.01277621, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.0125732, + "epoch": 0.25834961671426426, + "flos": 23522252924160.0, + "grad_norm": 1.7245670135783875, + "language_loss": 0.87440747, + "learning_rate": 3.477492965085067e-06, + "loss": 0.95232815, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20288086, + "step": 4297, + "time_per_iteration": 2.5871896743774414 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01260558, + "epoch": 0.25840973996693223, + "flos": 22456837059840.0, + "grad_norm": 2.9037965134923076, + "language_loss": 0.84894854, + "learning_rate": 3.477230446361943e-06, + "loss": 0.9268465, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.18469238, + "step": 4298, + "time_per_iteration": 2.5290613174438477 + }, + { + "auxiliary_loss_clip": 0.06510766, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 0.06292143, + "balance_loss_mlp": 0.01256158, + "epoch": 0.2584698632196002, + "flos": 11295544849920.0, + "grad_norm": 2.12928453409433, + "language_loss": 0.83727312, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.91514087, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.1986084, + "step": 4299, + "time_per_iteration": 2.5314571857452393 + }, + { + "auxiliary_loss_clip": 0.06506392, + "auxiliary_loss_mlp": 0.01272204, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01253214, + "epoch": 0.25852998647226816, + "flos": 17935569325440.0, + "grad_norm": 2.08690605682093, + "language_loss": 0.83303946, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91082543, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18981934, + "step": 4300, + "time_per_iteration": 2.494170904159546 + }, + { + "auxiliary_loss_clip": 0.06507458, + "auxiliary_loss_mlp": 0.01272704, + "balance_loss_clip": 0.06287713, + "balance_loss_mlp": 0.01254012, + "epoch": 0.2585901097249361, + "flos": 33264620893440.0, + "grad_norm": 3.3706811216639307, + "language_loss": 0.67941749, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.75721914, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18688965, + "step": 4301, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.06512292, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 0.06289004, + "balance_loss_mlp": 0.01257009, + "epoch": 0.2586502329776041, + "flos": 18447033847680.0, + "grad_norm": 2.7819934823512282, + "language_loss": 0.83073664, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.90861952, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18994141, + "step": 4302, + "time_per_iteration": 2.5102365016937256 + }, + { + "auxiliary_loss_clip": 0.06508462, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01257358, + "epoch": 0.25871035623027205, + "flos": 17973989222400.0, + "grad_norm": 1.7107484291097332, + "language_loss": 0.91874599, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99659652, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.19238281, + "step": 4303, + "time_per_iteration": 2.5386602878570557 + }, + { + "auxiliary_loss_clip": 0.06508803, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.0629281, + "balance_loss_mlp": 0.01258569, + "epoch": 0.25877047948294, + "flos": 27784392307200.0, + "grad_norm": 1.7938003883067368, + "language_loss": 0.67601281, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75387681, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19018555, + "step": 4304, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06507856, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 0.06286401, + "balance_loss_mlp": 0.01257477, + "epoch": 0.258830602735608, + "flos": 27133209901440.0, + "grad_norm": 2.1929382614593242, + "language_loss": 0.73436916, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1940918, + "step": 4305, + "time_per_iteration": 2.5877888202667236 + }, + { + "auxiliary_loss_clip": 0.06515621, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01255992, + "epoch": 0.25889072598827595, + "flos": 17896730158080.0, + "grad_norm": 1.8662067033328453, + "language_loss": 0.76418924, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.84211433, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20898438, + "step": 4306, + "time_per_iteration": 2.482933282852173 + }, + { + "auxiliary_loss_clip": 0.06403579, + "auxiliary_loss_mlp": 0.01258203, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01253566, + "epoch": 0.2589508492409439, + "flos": 53951582885760.0, + "grad_norm": 0.8023409981232837, + "language_loss": 0.56592381, + "learning_rate": 3.474865258296403e-06, + "loss": 0.64254159, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.04629517, + "step": 4307, + "time_per_iteration": 3.1265084743499756 + }, + { + "auxiliary_loss_clip": 0.06500413, + "auxiliary_loss_mlp": 0.0127407, + "balance_loss_clip": 0.06289256, + "balance_loss_mlp": 0.01256105, + "epoch": 0.2590109724936119, + "flos": 22132063434240.0, + "grad_norm": 1.735104377472534, + "language_loss": 0.71851504, + "learning_rate": 3.474602179854327e-06, + "loss": 0.79625988, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17956543, + "step": 4308, + "time_per_iteration": 2.5442304611206055 + }, + { + "auxiliary_loss_clip": 0.06513858, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01258993, + "epoch": 0.2590710957462799, + "flos": 13478395564800.0, + "grad_norm": 2.8033587428294657, + "language_loss": 0.84278727, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.92071199, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19628906, + "step": 4309, + "time_per_iteration": 2.546034336090088 + }, + { + "auxiliary_loss_clip": 0.06504438, + "auxiliary_loss_mlp": 0.01276588, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.01258814, + "epoch": 0.25913121899894787, + "flos": 22313219961600.0, + "grad_norm": 1.5400127324827177, + "language_loss": 0.84972912, + "learning_rate": 3.474075855228966e-06, + "loss": 0.92753935, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.17785645, + "step": 4310, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.06511362, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.06293052, + "balance_loss_mlp": 0.01254533, + "epoch": 0.25919134225161583, + "flos": 25818770102400.0, + "grad_norm": 1.8118221315599161, + "language_loss": 0.78088975, + "learning_rate": 3.473812609065639e-06, + "loss": 0.85874081, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19213867, + "step": 4311, + "time_per_iteration": 2.6044604778289795 + }, + { + "auxiliary_loss_clip": 0.06511068, + "auxiliary_loss_mlp": 0.01275144, + "balance_loss_clip": 0.06293963, + "balance_loss_mlp": 0.01256666, + "epoch": 0.2592514655042838, + "flos": 31220314104960.0, + "grad_norm": 4.381167674093932, + "language_loss": 0.73062587, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.80848801, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18469238, + "step": 4312, + "time_per_iteration": 2.587942600250244 + }, + { + "auxiliary_loss_clip": 0.06508243, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 0.06291987, + "balance_loss_mlp": 0.012569, + "epoch": 0.25931158875695176, + "flos": 18480296718720.0, + "grad_norm": 1.7543304647253515, + "language_loss": 0.70305753, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78089976, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.19091797, + "step": 4313, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.06508952, + "auxiliary_loss_mlp": 0.01278616, + "balance_loss_clip": 0.06293979, + "balance_loss_mlp": 0.0125971, + "epoch": 0.2593717120096197, + "flos": 19213895214720.0, + "grad_norm": 1.751562510714179, + "language_loss": 0.81158572, + "learning_rate": 3.473022535292867e-06, + "loss": 0.8894614, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.18908691, + "step": 4314, + "time_per_iteration": 2.5584335327148438 + }, + { + "auxiliary_loss_clip": 0.06515148, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 0.06292658, + "balance_loss_mlp": 0.01257359, + "epoch": 0.2594318352622877, + "flos": 31256050671360.0, + "grad_norm": 1.9178095473181331, + "language_loss": 0.67283171, + "learning_rate": 3.472759065640968e-06, + "loss": 0.7507664, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20959473, + "step": 4315, + "time_per_iteration": 2.6295278072357178 + }, + { + "auxiliary_loss_clip": 0.06506292, + "auxiliary_loss_mlp": 0.01277654, + "balance_loss_clip": 0.06292329, + "balance_loss_mlp": 0.01259463, + "epoch": 0.25949195851495566, + "flos": 22243759326720.0, + "grad_norm": 1.412764147956583, + "language_loss": 0.80242419, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.88026369, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18212891, + "step": 4316, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.06510989, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06290686, + "balance_loss_mlp": 0.01256781, + "epoch": 0.2595520817676236, + "flos": 28083449928960.0, + "grad_norm": 1.6660208675023864, + "language_loss": 0.78127223, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.85915792, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20812988, + "step": 4317, + "time_per_iteration": 2.6210665702819824 + }, + { + "auxiliary_loss_clip": 0.06507257, + "auxiliary_loss_mlp": 0.01281581, + "balance_loss_clip": 0.06291957, + "balance_loss_mlp": 0.01262054, + "epoch": 0.2596122050202916, + "flos": 20196727280640.0, + "grad_norm": 2.4040812102587377, + "language_loss": 0.78420109, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.86208946, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19519043, + "step": 4318, + "time_per_iteration": 3.9600155353546143 + }, + { + "auxiliary_loss_clip": 0.06505568, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 0.06290057, + "balance_loss_mlp": 0.01256637, + "epoch": 0.25967232827295955, + "flos": 22534431540480.0, + "grad_norm": 2.66294558684285, + "language_loss": 0.77022719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.84805143, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20227051, + "step": 4319, + "time_per_iteration": 2.544752836227417 + }, + { + "auxiliary_loss_clip": 0.0650554, + "auxiliary_loss_mlp": 0.01280509, + "balance_loss_clip": 0.06290743, + "balance_loss_mlp": 0.01261555, + "epoch": 0.2597324515256275, + "flos": 21074445999360.0, + "grad_norm": 1.7925219732685136, + "language_loss": 0.77426791, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.85212845, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.18945312, + "step": 4320, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.06508496, + "auxiliary_loss_mlp": 0.01273671, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01254634, + "epoch": 0.2597925747782955, + "flos": 22055810618880.0, + "grad_norm": 1.593385908573569, + "language_loss": 0.71533716, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79315877, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19042969, + "step": 4321, + "time_per_iteration": 2.5314829349517822 + }, + { + "auxiliary_loss_clip": 0.0650996, + "auxiliary_loss_mlp": 0.01274348, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01254011, + "epoch": 0.2598526980309635, + "flos": 19543071179520.0, + "grad_norm": 2.282331155451991, + "language_loss": 0.75262189, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.83046496, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20336914, + "step": 4322, + "time_per_iteration": 2.525724411010742 + }, + { + "auxiliary_loss_clip": 0.06509394, + "auxiliary_loss_mlp": 0.01275417, + "balance_loss_clip": 0.06289983, + "balance_loss_mlp": 0.0125533, + "epoch": 0.25991282128363147, + "flos": 24501521191680.0, + "grad_norm": 2.623736611083137, + "language_loss": 0.7442928, + "learning_rate": 3.470649298767278e-06, + "loss": 0.82214087, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4323, + "time_per_iteration": 3.957674026489258 + }, + { + "auxiliary_loss_clip": 0.06515582, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01258893, + "epoch": 0.25997294453629943, + "flos": 24207410960640.0, + "grad_norm": 1.7976461796423409, + "language_loss": 0.68052149, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.75847143, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20495605, + "step": 4324, + "time_per_iteration": 4.001135349273682 + }, + { + "auxiliary_loss_clip": 0.06505544, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01254608, + "epoch": 0.2600330677889674, + "flos": 31439597040000.0, + "grad_norm": 1.7946989584541546, + "language_loss": 0.71402133, + "learning_rate": 3.470121299177082e-06, + "loss": 0.79180264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1796875, + "step": 4325, + "time_per_iteration": 2.6213603019714355 + }, + { + "auxiliary_loss_clip": 0.06501837, + "auxiliary_loss_mlp": 0.01274613, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01255004, + "epoch": 0.26009319104163536, + "flos": 32274116179200.0, + "grad_norm": 1.826124228611905, + "language_loss": 0.73262805, + "learning_rate": 3.469857215756257e-06, + "loss": 0.81039256, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4326, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.06500994, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06288173, + "balance_loss_mlp": 0.01258051, + "epoch": 0.26015331429430333, + "flos": 26293994933760.0, + "grad_norm": 1.858424121782002, + "language_loss": 0.8722446, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.95002341, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18835449, + "step": 4327, + "time_per_iteration": 2.5950510501861572 + }, + { + "auxiliary_loss_clip": 0.06508228, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06287643, + "balance_loss_mlp": 0.01254271, + "epoch": 0.2602134375469713, + "flos": 21148728243840.0, + "grad_norm": 1.765295937421399, + "language_loss": 0.8100785, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.88790172, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19824219, + "step": 4328, + "time_per_iteration": 3.923682928085327 + }, + { + "auxiliary_loss_clip": 0.06502862, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01258704, + "epoch": 0.26027356079963926, + "flos": 25928411569920.0, + "grad_norm": 1.3948699622732248, + "language_loss": 0.88172936, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.95952845, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18347168, + "step": 4329, + "time_per_iteration": 2.5685267448425293 + }, + { + "auxiliary_loss_clip": 0.06502585, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.0125327, + "epoch": 0.2603336840523072, + "flos": 26366390461440.0, + "grad_norm": 1.8811175805050973, + "language_loss": 0.77705932, + "learning_rate": 3.468800324801802e-06, + "loss": 0.85479975, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18188477, + "step": 4330, + "time_per_iteration": 2.6185224056243896 + }, + { + "auxiliary_loss_clip": 0.06508863, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 0.06289242, + "balance_loss_mlp": 0.0125826, + "epoch": 0.2603938073049752, + "flos": 23520408134400.0, + "grad_norm": 1.5596482888270802, + "language_loss": 0.76200908, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.8398701, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18981934, + "step": 4331, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.06507871, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06292268, + "balance_loss_mlp": 0.01254527, + "epoch": 0.26045393055764315, + "flos": 25381336262400.0, + "grad_norm": 1.426884348550376, + "language_loss": 0.69540298, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.77320385, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.17700195, + "step": 4332, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.06511752, + "auxiliary_loss_mlp": 0.01275479, + "balance_loss_clip": 0.0629351, + "balance_loss_mlp": 0.0125693, + "epoch": 0.2605140538103111, + "flos": 27642494217600.0, + "grad_norm": 1.8844860211449586, + "language_loss": 0.79951644, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.87738872, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.1854248, + "step": 4333, + "time_per_iteration": 2.5523998737335205 + }, + { + "auxiliary_loss_clip": 0.06501235, + "auxiliary_loss_mlp": 0.01272154, + "balance_loss_clip": 0.06290703, + "balance_loss_mlp": 0.01254714, + "epoch": 0.2605741770629791, + "flos": 13774602147840.0, + "grad_norm": 1.6726919145500945, + "language_loss": 0.81128466, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8890186, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.17443848, + "step": 4334, + "time_per_iteration": 2.522210121154785 + }, + { + "auxiliary_loss_clip": 0.06510483, + "auxiliary_loss_mlp": 0.01278802, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01259859, + "epoch": 0.26063430031564705, + "flos": 26038933505280.0, + "grad_norm": 1.7438742011205015, + "language_loss": 0.80170292, + "learning_rate": 3.46747795800024e-06, + "loss": 0.87959582, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18945312, + "step": 4335, + "time_per_iteration": 2.582817792892456 + }, + { + "auxiliary_loss_clip": 0.06403506, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 0.06297, + "balance_loss_mlp": 0.01252544, + "epoch": 0.26069442356831507, + "flos": 62463143030400.0, + "grad_norm": 0.8284851894367303, + "language_loss": 0.60816151, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6847688, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04672241, + "step": 4336, + "time_per_iteration": 3.2036406993865967 + }, + { + "auxiliary_loss_clip": 0.0651319, + "auxiliary_loss_mlp": 0.0127574, + "balance_loss_clip": 0.06294517, + "balance_loss_mlp": 0.01257405, + "epoch": 0.26075454682098304, + "flos": 13631530101120.0, + "grad_norm": 1.8662385080657846, + "language_loss": 0.78028893, + "learning_rate": 3.46694862168102e-06, + "loss": 0.85817826, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18322754, + "step": 4337, + "time_per_iteration": 2.4899747371673584 + }, + { + "auxiliary_loss_clip": 0.06515083, + "auxiliary_loss_mlp": 0.01276173, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01256289, + "epoch": 0.260814670073651, + "flos": 12130776748800.0, + "grad_norm": 2.165940638299647, + "language_loss": 0.74851859, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.82643116, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19897461, + "step": 4338, + "time_per_iteration": 2.5323259830474854 + }, + { + "auxiliary_loss_clip": 0.06522977, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 0.0629933, + "balance_loss_mlp": 0.01255039, + "epoch": 0.26087479332631897, + "flos": 15127964968320.0, + "grad_norm": 2.9662822483112388, + "language_loss": 0.81419933, + "learning_rate": 3.466419062854447e-06, + "loss": 0.89217252, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19287109, + "step": 4339, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.06514673, + "auxiliary_loss_mlp": 0.0127648, + "balance_loss_clip": 0.06300991, + "balance_loss_mlp": 0.01259278, + "epoch": 0.26093491657898693, + "flos": 24687834744960.0, + "grad_norm": 1.5467473582016638, + "language_loss": 0.77106607, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.84897768, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4340, + "time_per_iteration": 2.570777416229248 + }, + { + "auxiliary_loss_clip": 0.06513949, + "auxiliary_loss_mlp": 0.01274956, + "balance_loss_clip": 0.062961, + "balance_loss_mlp": 0.01255788, + "epoch": 0.2609950398316549, + "flos": 25122669108480.0, + "grad_norm": 1.4533527138525517, + "language_loss": 0.82740015, + "learning_rate": 3.465889281600845e-06, + "loss": 0.90528917, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19177246, + "step": 4341, + "time_per_iteration": 2.5946342945098877 + }, + { + "auxiliary_loss_clip": 0.06519589, + "auxiliary_loss_mlp": 0.01282035, + "balance_loss_clip": 0.06303687, + "balance_loss_mlp": 0.01261794, + "epoch": 0.26105516308432286, + "flos": 28556159137920.0, + "grad_norm": 1.7858700463590271, + "language_loss": 0.77163744, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84965372, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20251465, + "step": 4342, + "time_per_iteration": 2.5742342472076416 + }, + { + "auxiliary_loss_clip": 0.06521034, + "auxiliary_loss_mlp": 0.01277248, + "balance_loss_clip": 0.06303718, + "balance_loss_mlp": 0.01258115, + "epoch": 0.2611152863369908, + "flos": 39539984400000.0, + "grad_norm": 1.7100835603344944, + "language_loss": 0.66681403, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.74479687, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19140625, + "step": 4343, + "time_per_iteration": 2.662271738052368 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.0630408, + "balance_loss_mlp": 0.01261917, + "epoch": 0.2611754095896588, + "flos": 13740416881920.0, + "grad_norm": 1.8127929734390111, + "language_loss": 0.74220115, + "learning_rate": 3.465094192845553e-06, + "loss": 0.82024956, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18200684, + "step": 4344, + "time_per_iteration": 2.5201361179351807 + }, + { + "auxiliary_loss_clip": 0.06524797, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06307752, + "balance_loss_mlp": 0.01264484, + "epoch": 0.26123553284232676, + "flos": 21513011869440.0, + "grad_norm": 2.1854473316742338, + "language_loss": 0.8696478, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94774491, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20422363, + "step": 4345, + "time_per_iteration": 2.510000228881836 + }, + { + "auxiliary_loss_clip": 0.06521724, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 0.06307776, + "balance_loss_mlp": 0.01258293, + "epoch": 0.2612956560949947, + "flos": 21145751424000.0, + "grad_norm": 2.0739898036059095, + "language_loss": 0.76897335, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.18371582, + "step": 4346, + "time_per_iteration": 2.5322000980377197 + }, + { + "auxiliary_loss_clip": 0.06522055, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06305227, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2613557793476627, + "flos": 25126023271680.0, + "grad_norm": 1.5562871556893731, + "language_loss": 0.76140273, + "learning_rate": 3.464298604081606e-06, + "loss": 0.83937496, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.18249512, + "step": 4347, + "time_per_iteration": 2.557077169418335 + }, + { + "auxiliary_loss_clip": 0.06522661, + "auxiliary_loss_mlp": 0.01286127, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01267208, + "epoch": 0.26141590260033065, + "flos": 26074879706880.0, + "grad_norm": 1.3369896368920637, + "language_loss": 0.7377249, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81581283, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.18920898, + "step": 4348, + "time_per_iteration": 2.5915603637695312 + }, + { + "auxiliary_loss_clip": 0.06527912, + "auxiliary_loss_mlp": 0.01280562, + "balance_loss_clip": 0.06309946, + "balance_loss_mlp": 0.01260881, + "epoch": 0.2614760258529987, + "flos": 25708415875200.0, + "grad_norm": 1.876318754691465, + "language_loss": 0.9123491, + "learning_rate": 3.463767933923799e-06, + "loss": 0.99043381, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19689941, + "step": 4349, + "time_per_iteration": 2.594332218170166 + }, + { + "auxiliary_loss_clip": 0.06524529, + "auxiliary_loss_mlp": 0.01276126, + "balance_loss_clip": 0.0631379, + "balance_loss_mlp": 0.01256695, + "epoch": 0.26153614910566664, + "flos": 17462902043520.0, + "grad_norm": 1.601755901803269, + "language_loss": 0.80459869, + "learning_rate": 3.463502515580524e-06, + "loss": 0.8826052, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19433594, + "step": 4350, + "time_per_iteration": 2.509274482727051 + }, + { + "auxiliary_loss_clip": 0.06520928, + "auxiliary_loss_mlp": 0.01277683, + "balance_loss_clip": 0.0631097, + "balance_loss_mlp": 0.01259063, + "epoch": 0.2615962723583346, + "flos": 17718676231680.0, + "grad_norm": 1.8928977658247819, + "language_loss": 0.62482548, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.7028116, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18615723, + "step": 4351, + "time_per_iteration": 2.522862672805786 + }, + { + "auxiliary_loss_clip": 0.06526107, + "auxiliary_loss_mlp": 0.01278827, + "balance_loss_clip": 0.06308405, + "balance_loss_mlp": 0.01259396, + "epoch": 0.26165639561100257, + "flos": 23264340456960.0, + "grad_norm": 2.4783042039829546, + "language_loss": 0.84264326, + "learning_rate": 3.462971512415555e-06, + "loss": 0.92069256, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19445801, + "step": 4352, + "time_per_iteration": 2.5326311588287354 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01261209, + "balance_loss_clip": 0.06294002, + "balance_loss_mlp": 0.01256817, + "epoch": 0.26171651886367053, + "flos": 66756155443200.0, + "grad_norm": 0.7669563885543124, + "language_loss": 0.7057451, + "learning_rate": 3.462705927613996e-06, + "loss": 0.78234154, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04397583, + "step": 4353, + "time_per_iteration": 3.093543529510498 + }, + { + "auxiliary_loss_clip": 0.06517833, + "auxiliary_loss_mlp": 0.01279039, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01259619, + "epoch": 0.2617766421163385, + "flos": 22356713030400.0, + "grad_norm": 1.943198757771125, + "language_loss": 0.77770078, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8556695, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19433594, + "step": 4354, + "time_per_iteration": 2.5782573223114014 + }, + { + "auxiliary_loss_clip": 0.06522856, + "auxiliary_loss_mlp": 0.01279183, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01259907, + "epoch": 0.26183676536900646, + "flos": 26074208874240.0, + "grad_norm": 2.16382169558429, + "language_loss": 0.68941987, + "learning_rate": 3.462174591623085e-06, + "loss": 0.7674402, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19274902, + "step": 4355, + "time_per_iteration": 2.608482599258423 + }, + { + "auxiliary_loss_clip": 0.06517249, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06301509, + "balance_loss_mlp": 0.01260889, + "epoch": 0.26189688862167443, + "flos": 21002847085440.0, + "grad_norm": 2.1598133279644554, + "language_loss": 0.68533909, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.76333642, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.21594238, + "step": 4356, + "time_per_iteration": 2.526376247406006 + }, + { + "auxiliary_loss_clip": 0.06398848, + "auxiliary_loss_mlp": 0.01254107, + "balance_loss_clip": 0.06295048, + "balance_loss_mlp": 0.01249723, + "epoch": 0.2619570118743424, + "flos": 65817780768000.0, + "grad_norm": 0.6753767209108164, + "language_loss": 0.5316326, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.60816211, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04391479, + "step": 4357, + "time_per_iteration": 4.58653450012207 + }, + { + "auxiliary_loss_clip": 0.065238, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01261183, + "epoch": 0.26201713512701036, + "flos": 28774310042880.0, + "grad_norm": 1.9589657113609436, + "language_loss": 0.85308599, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.93112528, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18933105, + "step": 4358, + "time_per_iteration": 2.65427303314209 + }, + { + "auxiliary_loss_clip": 0.0652793, + "auxiliary_loss_mlp": 0.0127535, + "balance_loss_clip": 0.06300082, + "balance_loss_mlp": 0.01254917, + "epoch": 0.2620772583796783, + "flos": 26439750311040.0, + "grad_norm": 2.2013035586341663, + "language_loss": 0.68206531, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7600981, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20422363, + "step": 4359, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.06517753, + "auxiliary_loss_mlp": 0.01278599, + "balance_loss_clip": 0.06299832, + "balance_loss_mlp": 0.01258763, + "epoch": 0.2621373816323463, + "flos": 20162667795840.0, + "grad_norm": 1.9413360196767273, + "language_loss": 0.7857362, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.86369967, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19848633, + "step": 4360, + "time_per_iteration": 2.5442395210266113 + }, + { + "auxiliary_loss_clip": 0.06513859, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06305451, + "balance_loss_mlp": 0.01262839, + "epoch": 0.26219750488501425, + "flos": 28628764300800.0, + "grad_norm": 1.9016418571028826, + "language_loss": 0.68632245, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.76428491, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.19519043, + "step": 4361, + "time_per_iteration": 2.5506739616394043 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01277506, + "balance_loss_clip": 0.06302515, + "balance_loss_mlp": 0.01256298, + "epoch": 0.2622576281376823, + "flos": 15046806689280.0, + "grad_norm": 1.72568625675014, + "language_loss": 0.84433615, + "learning_rate": 3.46031316964119e-06, + "loss": 0.92233592, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21179199, + "step": 4362, + "time_per_iteration": 3.9455041885375977 + }, + { + "auxiliary_loss_clip": 0.06516212, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01254914, + "epoch": 0.26231775139035024, + "flos": 26403426766080.0, + "grad_norm": 1.7310155723144771, + "language_loss": 0.65182602, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.72972858, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19140625, + "step": 4363, + "time_per_iteration": 2.5710229873657227 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01270336, + "balance_loss_clip": 0.06313097, + "balance_loss_mlp": 0.01263804, + "epoch": 0.2623778746430182, + "flos": 65430380615040.0, + "grad_norm": 0.9022976396731897, + "language_loss": 0.61189461, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.68877506, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06542969, + "step": 4364, + "time_per_iteration": 4.728578805923462 + }, + { + "auxiliary_loss_clip": 0.06528256, + "auxiliary_loss_mlp": 0.01280703, + "balance_loss_clip": 0.06308191, + "balance_loss_mlp": 0.01260402, + "epoch": 0.26243799789568617, + "flos": 12609104181120.0, + "grad_norm": 2.531531320883944, + "language_loss": 0.72247571, + "learning_rate": 3.459514586533184e-06, + "loss": 0.80056524, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20300293, + "step": 4365, + "time_per_iteration": 2.5567469596862793 + }, + { + "auxiliary_loss_clip": 0.06519997, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06307054, + "balance_loss_mlp": 0.01257146, + "epoch": 0.26249812114835414, + "flos": 28631783047680.0, + "grad_norm": 1.7351756990107399, + "language_loss": 0.78023124, + "learning_rate": 3.459248281460509e-06, + "loss": 0.85819209, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18945312, + "step": 4366, + "time_per_iteration": 2.6212668418884277 + }, + { + "auxiliary_loss_clip": 0.06522524, + "auxiliary_loss_mlp": 0.01276459, + "balance_loss_clip": 0.06305946, + "balance_loss_mlp": 0.01258351, + "epoch": 0.2625582444010221, + "flos": 14470661214720.0, + "grad_norm": 1.579355851615032, + "language_loss": 0.77007079, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.84806067, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18103027, + "step": 4367, + "time_per_iteration": 2.602072238922119 + }, + { + "auxiliary_loss_clip": 0.06517363, + "auxiliary_loss_mlp": 0.01271186, + "balance_loss_clip": 0.06304537, + "balance_loss_mlp": 0.01253471, + "epoch": 0.26261836765369007, + "flos": 16617984998400.0, + "grad_norm": 1.5269013949985815, + "language_loss": 0.70157337, + "learning_rate": 3.458715505320736e-06, + "loss": 0.77945888, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.17700195, + "step": 4368, + "time_per_iteration": 4.012764930725098 + }, + { + "auxiliary_loss_clip": 0.06516206, + "auxiliary_loss_mlp": 0.01278713, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01256635, + "epoch": 0.26267849090635803, + "flos": 20525861318400.0, + "grad_norm": 1.916794033771568, + "language_loss": 0.79240829, + "learning_rate": 3.458449034273841e-06, + "loss": 0.87035751, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22070312, + "step": 4369, + "time_per_iteration": 2.51906418800354 + }, + { + "auxiliary_loss_clip": 0.06514631, + "auxiliary_loss_mlp": 0.01276005, + "balance_loss_clip": 0.06301987, + "balance_loss_mlp": 0.01256883, + "epoch": 0.262738614159026, + "flos": 21330220187520.0, + "grad_norm": 3.2285566965587873, + "language_loss": 0.83905816, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.91696453, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19116211, + "step": 4370, + "time_per_iteration": 2.562302589416504 + }, + { + "auxiliary_loss_clip": 0.06520583, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01253972, + "epoch": 0.26279873741169396, + "flos": 17609454034560.0, + "grad_norm": 1.7096089610285066, + "language_loss": 0.71678042, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.79473758, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21179199, + "step": 4371, + "time_per_iteration": 2.4965152740478516 + }, + { + "auxiliary_loss_clip": 0.06398421, + "auxiliary_loss_mlp": 0.01256739, + "balance_loss_clip": 0.0629326, + "balance_loss_mlp": 0.01252516, + "epoch": 0.2628588606643619, + "flos": 60969139931520.0, + "grad_norm": 0.666639264120038, + "language_loss": 0.56056166, + "learning_rate": 3.457649289346384e-06, + "loss": 0.63711321, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04226685, + "step": 4372, + "time_per_iteration": 3.2867443561553955 + }, + { + "auxiliary_loss_clip": 0.06512036, + "auxiliary_loss_mlp": 0.01277679, + "balance_loss_clip": 0.06298684, + "balance_loss_mlp": 0.01259178, + "epoch": 0.2629189839170299, + "flos": 27023652288000.0, + "grad_norm": 1.5439358769508327, + "language_loss": 0.78190762, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85980475, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18505859, + "step": 4373, + "time_per_iteration": 2.577479362487793 + }, + { + "auxiliary_loss_clip": 0.06510606, + "auxiliary_loss_mlp": 0.01278833, + "balance_loss_clip": 0.06297645, + "balance_loss_mlp": 0.01260427, + "epoch": 0.26297910716969786, + "flos": 17025635911680.0, + "grad_norm": 2.1443132622279664, + "language_loss": 0.723768, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.80166239, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18383789, + "step": 4374, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.06517059, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.0630156, + "balance_loss_mlp": 0.01258505, + "epoch": 0.2630392304223659, + "flos": 24903889297920.0, + "grad_norm": 2.1190930293084933, + "language_loss": 0.81199759, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.88995719, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20373535, + "step": 4375, + "time_per_iteration": 2.591381311416626 + }, + { + "auxiliary_loss_clip": 0.0651055, + "auxiliary_loss_mlp": 0.01275326, + "balance_loss_clip": 0.0629838, + "balance_loss_mlp": 0.01257289, + "epoch": 0.26309935367503384, + "flos": 32862336641280.0, + "grad_norm": 1.9139045559413268, + "language_loss": 0.66626596, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74412477, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18041992, + "step": 4376, + "time_per_iteration": 2.643944025039673 + }, + { + "auxiliary_loss_clip": 0.06515232, + "auxiliary_loss_mlp": 0.01276237, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2631594769277018, + "flos": 15893400816000.0, + "grad_norm": 1.6251454157029055, + "language_loss": 0.70145154, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77936625, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.1875, + "step": 4377, + "time_per_iteration": 2.5593788623809814 + }, + { + "auxiliary_loss_clip": 0.06513406, + "auxiliary_loss_mlp": 0.01274994, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255408, + "epoch": 0.2632196001803698, + "flos": 50816242811520.0, + "grad_norm": 1.6666327452584295, + "language_loss": 0.80235565, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.88023967, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4378, + "time_per_iteration": 2.794290065765381 + }, + { + "auxiliary_loss_clip": 0.0651051, + "auxiliary_loss_mlp": 0.01272396, + "balance_loss_clip": 0.06297652, + "balance_loss_mlp": 0.0125492, + "epoch": 0.26327972343303774, + "flos": 13737733551360.0, + "grad_norm": 2.7188396998417548, + "language_loss": 0.77230549, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17480469, + "step": 4379, + "time_per_iteration": 2.542442560195923 + }, + { + "auxiliary_loss_clip": 0.06519607, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06299821, + "balance_loss_mlp": 0.01255084, + "epoch": 0.2633398466857057, + "flos": 23775846906240.0, + "grad_norm": 1.9724368576120554, + "language_loss": 0.78418016, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.86212587, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19897461, + "step": 4380, + "time_per_iteration": 2.529573440551758 + }, + { + "auxiliary_loss_clip": 0.06516172, + "auxiliary_loss_mlp": 0.012759, + "balance_loss_clip": 0.06296928, + "balance_loss_mlp": 0.01257518, + "epoch": 0.26339996993837367, + "flos": 27607680046080.0, + "grad_norm": 1.9046534185934374, + "language_loss": 0.6460917, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.72401243, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18383789, + "step": 4381, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.06511073, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06295128, + "balance_loss_mlp": 0.01255394, + "epoch": 0.26346009319104163, + "flos": 16951982572800.0, + "grad_norm": 1.8115834165165374, + "language_loss": 0.8293367, + "learning_rate": 3.454979881632595e-06, + "loss": 0.90718591, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18444824, + "step": 4382, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06526808, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06304507, + "balance_loss_mlp": 0.0126196, + "epoch": 0.2635202164437096, + "flos": 37241245088640.0, + "grad_norm": 2.8611377763647363, + "language_loss": 0.70728219, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78537577, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4383, + "time_per_iteration": 2.7256851196289062 + }, + { + "auxiliary_loss_clip": 0.06511825, + "auxiliary_loss_mlp": 0.01278143, + "balance_loss_clip": 0.0629648, + "balance_loss_mlp": 0.01260214, + "epoch": 0.26358033969637756, + "flos": 21002721304320.0, + "grad_norm": 1.8636489890531567, + "language_loss": 0.69725919, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.77515888, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 4384, + "time_per_iteration": 2.526306629180908 + }, + { + "auxiliary_loss_clip": 0.06514609, + "auxiliary_loss_mlp": 0.01274952, + "balance_loss_clip": 0.06301568, + "balance_loss_mlp": 0.01256355, + "epoch": 0.26364046294904553, + "flos": 27753561204480.0, + "grad_norm": 2.704228439938978, + "language_loss": 0.70769042, + "learning_rate": 3.45417798298451e-06, + "loss": 0.785586, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 4385, + "time_per_iteration": 2.6091294288635254 + }, + { + "auxiliary_loss_clip": 0.06510788, + "auxiliary_loss_mlp": 0.01275036, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01255903, + "epoch": 0.2637005862017135, + "flos": 22899679488000.0, + "grad_norm": 1.8400483569046413, + "language_loss": 0.85200071, + "learning_rate": 3.453910573136482e-06, + "loss": 0.92985892, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19116211, + "step": 4386, + "time_per_iteration": 2.5284476280212402 + }, + { + "auxiliary_loss_clip": 0.06516191, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 0.06302508, + "balance_loss_mlp": 0.01255759, + "epoch": 0.26376070945438146, + "flos": 15054143921280.0, + "grad_norm": 1.9881194524454247, + "language_loss": 0.77597183, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.85388696, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19567871, + "step": 4387, + "time_per_iteration": 2.522135019302368 + }, + { + "auxiliary_loss_clip": 0.0651316, + "auxiliary_loss_mlp": 0.01278261, + "balance_loss_clip": 0.06301039, + "balance_loss_mlp": 0.01259378, + "epoch": 0.2638208327070494, + "flos": 21148141265280.0, + "grad_norm": 2.1303107819849316, + "language_loss": 0.76193964, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83985388, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1887207, + "step": 4388, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.06516623, + "auxiliary_loss_mlp": 0.01271478, + "balance_loss_clip": 0.06302176, + "balance_loss_mlp": 0.01253681, + "epoch": 0.26388095595971744, + "flos": 21732001315200.0, + "grad_norm": 2.125202232596161, + "language_loss": 0.86967361, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94755471, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.17785645, + "step": 4389, + "time_per_iteration": 2.570643901824951 + }, + { + "auxiliary_loss_clip": 0.06416489, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.0630957, + "balance_loss_mlp": 0.01263464, + "epoch": 0.2639410792123854, + "flos": 65536542138240.0, + "grad_norm": 0.8199197454978128, + "language_loss": 0.60138249, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6782288, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04666138, + "step": 4390, + "time_per_iteration": 3.174226999282837 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01274153, + "balance_loss_clip": 0.06302064, + "balance_loss_mlp": 0.01255008, + "epoch": 0.2640012024650534, + "flos": 23954907081600.0, + "grad_norm": 1.739207981028, + "language_loss": 0.77995527, + "learning_rate": 3.4525726971127e-06, + "loss": 0.85793746, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19152832, + "step": 4391, + "time_per_iteration": 2.5869362354278564 + }, + { + "auxiliary_loss_clip": 0.06415629, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06309642, + "balance_loss_mlp": 0.0126082, + "epoch": 0.26406132571772134, + "flos": 56462420880000.0, + "grad_norm": 0.8885893091984226, + "language_loss": 0.58835375, + "learning_rate": 3.45230495662224e-06, + "loss": 0.66516447, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04620361, + "step": 4392, + "time_per_iteration": 3.1856343746185303 + }, + { + "auxiliary_loss_clip": 0.0652501, + "auxiliary_loss_mlp": 0.0127481, + "balance_loss_clip": 0.06303259, + "balance_loss_mlp": 0.01256631, + "epoch": 0.2641214489703893, + "flos": 22097039627520.0, + "grad_norm": 1.7095674260711007, + "language_loss": 0.69284153, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77083969, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.1817627, + "step": 4393, + "time_per_iteration": 2.5519895553588867 + }, + { + "auxiliary_loss_clip": 0.06526117, + "auxiliary_loss_mlp": 0.01277548, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01255959, + "epoch": 0.26418157222305727, + "flos": 16550327226240.0, + "grad_norm": 2.304177456685855, + "language_loss": 0.84805501, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.92609167, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.21594238, + "step": 4394, + "time_per_iteration": 2.5253031253814697 + }, + { + "auxiliary_loss_clip": 0.06528334, + "auxiliary_loss_mlp": 0.01280976, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01260066, + "epoch": 0.26424169547572524, + "flos": 18008006780160.0, + "grad_norm": 1.9555526734650441, + "language_loss": 0.70342916, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.78152227, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.20910645, + "step": 4395, + "time_per_iteration": 2.5117664337158203 + }, + { + "auxiliary_loss_clip": 0.06512758, + "auxiliary_loss_mlp": 0.01272399, + "balance_loss_clip": 0.06300145, + "balance_loss_mlp": 0.01253171, + "epoch": 0.2643018187283932, + "flos": 16988893096320.0, + "grad_norm": 1.791387622967983, + "language_loss": 0.87312353, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19238281, + "step": 4396, + "time_per_iteration": 2.566774368286133 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01257464, + "balance_loss_clip": 0.06300922, + "balance_loss_mlp": 0.01252997, + "epoch": 0.26436194198106117, + "flos": 59682135144960.0, + "grad_norm": 0.7723405564107855, + "language_loss": 0.54990101, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.62652469, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04473877, + "step": 4397, + "time_per_iteration": 4.373678684234619 + }, + { + "auxiliary_loss_clip": 0.06510547, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 0.06297219, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26442206523372913, + "flos": 32928694675200.0, + "grad_norm": 2.4292177107300224, + "language_loss": 0.78606653, + "learning_rate": 3.450697357532435e-06, + "loss": 0.86391467, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1862793, + "step": 4398, + "time_per_iteration": 2.6890292167663574 + }, + { + "auxiliary_loss_clip": 0.06511252, + "auxiliary_loss_mlp": 0.01279415, + "balance_loss_clip": 0.06294377, + "balance_loss_mlp": 0.01259244, + "epoch": 0.2644821884863971, + "flos": 21037409694720.0, + "grad_norm": 1.6698754866149341, + "language_loss": 0.67733896, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.75524557, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20178223, + "step": 4399, + "time_per_iteration": 2.5403761863708496 + }, + { + "auxiliary_loss_clip": 0.06507229, + "auxiliary_loss_mlp": 0.01274507, + "balance_loss_clip": 0.06301808, + "balance_loss_mlp": 0.01256841, + "epoch": 0.26454231173906506, + "flos": 20783019098880.0, + "grad_norm": 1.5093240378212085, + "language_loss": 0.8695311, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.94734848, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.17675781, + "step": 4400, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.06514899, + "auxiliary_loss_mlp": 0.01275157, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01255404, + "epoch": 0.264602434991733, + "flos": 16624399835520.0, + "grad_norm": 2.9592381962347076, + "language_loss": 0.77008456, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.84798515, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19750977, + "step": 4401, + "time_per_iteration": 4.000045537948608 + }, + { + "auxiliary_loss_clip": 0.06515318, + "auxiliary_loss_mlp": 0.01277892, + "balance_loss_clip": 0.0629567, + "balance_loss_mlp": 0.01257149, + "epoch": 0.26466255824440105, + "flos": 19068726816000.0, + "grad_norm": 1.7667226788610035, + "language_loss": 0.88791883, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96585095, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20739746, + "step": 4402, + "time_per_iteration": 2.504951000213623 + }, + { + "auxiliary_loss_clip": 0.06514971, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 0.06299384, + "balance_loss_mlp": 0.01261203, + "epoch": 0.264722681497069, + "flos": 22645246965120.0, + "grad_norm": 2.1016866817380944, + "language_loss": 0.78604829, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.86399865, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18884277, + "step": 4403, + "time_per_iteration": 3.9830996990203857 + }, + { + "auxiliary_loss_clip": 0.06513863, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 0.0629956, + "balance_loss_mlp": 0.01254322, + "epoch": 0.264782804749737, + "flos": 22498862682240.0, + "grad_norm": 2.2718142403423887, + "language_loss": 0.88776851, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.96563816, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18774414, + "step": 4404, + "time_per_iteration": 2.5655670166015625 + }, + { + "auxiliary_loss_clip": 0.06512003, + "auxiliary_loss_mlp": 0.01272083, + "balance_loss_clip": 0.06294957, + "balance_loss_mlp": 0.01253666, + "epoch": 0.26484292800240494, + "flos": 16805891779200.0, + "grad_norm": 1.6853243703943699, + "language_loss": 0.77144921, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84929001, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18408203, + "step": 4405, + "time_per_iteration": 2.5151660442352295 + }, + { + "auxiliary_loss_clip": 0.06518488, + "auxiliary_loss_mlp": 0.01280263, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.0126113, + "epoch": 0.2649030512550729, + "flos": 20455939486080.0, + "grad_norm": 1.6552463254663874, + "language_loss": 0.70570582, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78369337, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19152832, + "step": 4406, + "time_per_iteration": 2.5817081928253174 + }, + { + "auxiliary_loss_clip": 0.06515051, + "auxiliary_loss_mlp": 0.01282775, + "balance_loss_clip": 0.06304015, + "balance_loss_mlp": 0.01264071, + "epoch": 0.2649631745077409, + "flos": 22422190596480.0, + "grad_norm": 1.6043271976664373, + "language_loss": 0.84213567, + "learning_rate": 3.448282246369912e-06, + "loss": 0.92011392, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18701172, + "step": 4407, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.06506669, + "auxiliary_loss_mlp": 0.01274017, + "balance_loss_clip": 0.06294346, + "balance_loss_mlp": 0.01255384, + "epoch": 0.26502329776040884, + "flos": 35124794334720.0, + "grad_norm": 1.8863485028384246, + "language_loss": 0.76080608, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83861291, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18615723, + "step": 4408, + "time_per_iteration": 4.144388675689697 + }, + { + "auxiliary_loss_clip": 0.06504838, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06293095, + "balance_loss_mlp": 0.01259765, + "epoch": 0.2650834210130768, + "flos": 38696073603840.0, + "grad_norm": 1.6572856868324277, + "language_loss": 0.71237993, + "learning_rate": 3.447744950630084e-06, + "loss": 0.79021394, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18786621, + "step": 4409, + "time_per_iteration": 2.6830790042877197 + }, + { + "auxiliary_loss_clip": 0.06513892, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06296389, + "balance_loss_mlp": 0.01258513, + "epoch": 0.26514354426574477, + "flos": 24723655165440.0, + "grad_norm": 1.9985850932403133, + "language_loss": 0.74335337, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.82127184, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19445801, + "step": 4410, + "time_per_iteration": 2.5640783309936523 + }, + { + "auxiliary_loss_clip": 0.06510055, + "auxiliary_loss_mlp": 0.01275315, + "balance_loss_clip": 0.06293881, + "balance_loss_mlp": 0.01256873, + "epoch": 0.26520366751841273, + "flos": 20346381872640.0, + "grad_norm": 1.7362440314024254, + "language_loss": 0.74604267, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.82389635, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18457031, + "step": 4411, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.06503807, + "auxiliary_loss_mlp": 0.0127974, + "balance_loss_clip": 0.06292095, + "balance_loss_mlp": 0.01260941, + "epoch": 0.2652637907710807, + "flos": 22350046631040.0, + "grad_norm": 1.9068391403977176, + "language_loss": 0.83043784, + "learning_rate": 3.446938595306071e-06, + "loss": 0.90827328, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18798828, + "step": 4412, + "time_per_iteration": 2.570462942123413 + }, + { + "auxiliary_loss_clip": 0.06509882, + "auxiliary_loss_mlp": 0.01280008, + "balance_loss_clip": 0.0629638, + "balance_loss_mlp": 0.01260327, + "epoch": 0.26532391402374866, + "flos": 19360279497600.0, + "grad_norm": 1.6015505507863077, + "language_loss": 0.75010121, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.82800013, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19677734, + "step": 4413, + "time_per_iteration": 2.5575060844421387 + }, + { + "auxiliary_loss_clip": 0.06392879, + "auxiliary_loss_mlp": 0.01259819, + "balance_loss_clip": 0.06288524, + "balance_loss_mlp": 0.01255307, + "epoch": 0.26538403727641663, + "flos": 44804479121280.0, + "grad_norm": 0.9088609657061584, + "language_loss": 0.57055008, + "learning_rate": 3.446400750732793e-06, + "loss": 0.64707708, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04522705, + "step": 4414, + "time_per_iteration": 3.090242624282837 + }, + { + "auxiliary_loss_clip": 0.06501576, + "auxiliary_loss_mlp": 0.01278206, + "balance_loss_clip": 0.06294522, + "balance_loss_mlp": 0.01260587, + "epoch": 0.26544416052908465, + "flos": 28189359889920.0, + "grad_norm": 1.5322949912702364, + "language_loss": 0.74997067, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.82776845, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17626953, + "step": 4415, + "time_per_iteration": 2.6143665313720703 + }, + { + "auxiliary_loss_clip": 0.06505995, + "auxiliary_loss_mlp": 0.01278176, + "balance_loss_clip": 0.06289595, + "balance_loss_mlp": 0.0125791, + "epoch": 0.2655042837817526, + "flos": 17570824502400.0, + "grad_norm": 4.108925661978825, + "language_loss": 0.87716872, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95501041, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20263672, + "step": 4416, + "time_per_iteration": 2.4974279403686523 + }, + { + "auxiliary_loss_clip": 0.06510112, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0629703, + "balance_loss_mlp": 0.0126094, + "epoch": 0.2655644070344206, + "flos": 23411437499520.0, + "grad_norm": 1.4955026126411677, + "language_loss": 0.77089638, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84879971, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19274902, + "step": 4417, + "time_per_iteration": 2.576826572418213 + }, + { + "auxiliary_loss_clip": 0.0650158, + "auxiliary_loss_mlp": 0.01274734, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.01255946, + "epoch": 0.26562453028708854, + "flos": 26475612658560.0, + "grad_norm": 1.3751463134954343, + "language_loss": 0.80062425, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87838733, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 4418, + "time_per_iteration": 2.573490619659424 + }, + { + "auxiliary_loss_clip": 0.06510676, + "auxiliary_loss_mlp": 0.01274316, + "balance_loss_clip": 0.06295326, + "balance_loss_mlp": 0.01254945, + "epoch": 0.2656846535397565, + "flos": 19213475944320.0, + "grad_norm": 2.092556142181657, + "language_loss": 0.67613918, + "learning_rate": 3.445055179644071e-06, + "loss": 0.7539891, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19372559, + "step": 4419, + "time_per_iteration": 2.5705552101135254 + }, + { + "auxiliary_loss_clip": 0.06507199, + "auxiliary_loss_mlp": 0.01281966, + "balance_loss_clip": 0.06293494, + "balance_loss_mlp": 0.01262153, + "epoch": 0.2657447767924245, + "flos": 30558566085120.0, + "grad_norm": 1.8356097714997412, + "language_loss": 0.79905182, + "learning_rate": 3.444785900995585e-06, + "loss": 0.87694353, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19799805, + "step": 4420, + "time_per_iteration": 2.5966663360595703 + }, + { + "auxiliary_loss_clip": 0.06514539, + "auxiliary_loss_mlp": 0.01276693, + "balance_loss_clip": 0.06294198, + "balance_loss_mlp": 0.01256367, + "epoch": 0.26580490004509244, + "flos": 20928984111360.0, + "grad_norm": 2.015825119850129, + "language_loss": 0.81966692, + "learning_rate": 3.444516567560673e-06, + "loss": 0.89757919, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.20324707, + "step": 4421, + "time_per_iteration": 2.5285565853118896 + }, + { + "auxiliary_loss_clip": 0.06503608, + "auxiliary_loss_mlp": 0.01277509, + "balance_loss_clip": 0.06293386, + "balance_loss_mlp": 0.01259341, + "epoch": 0.2658650232977604, + "flos": 43955845297920.0, + "grad_norm": 1.6494646012937118, + "language_loss": 0.66448712, + "learning_rate": 3.444247179349548e-06, + "loss": 0.74229831, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1817627, + "step": 4422, + "time_per_iteration": 2.715272903442383 + }, + { + "auxiliary_loss_clip": 0.0650918, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 0.06296968, + "balance_loss_mlp": 0.01257011, + "epoch": 0.26592514655042837, + "flos": 29724256581120.0, + "grad_norm": 6.571308072686312, + "language_loss": 0.75332773, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18029785, + "step": 4423, + "time_per_iteration": 2.5891942977905273 + }, + { + "auxiliary_loss_clip": 0.06514621, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 0.06297594, + "balance_loss_mlp": 0.01257619, + "epoch": 0.26598526980309634, + "flos": 46687616110080.0, + "grad_norm": 1.5716819541281883, + "language_loss": 0.78054529, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85846502, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19726562, + "step": 4424, + "time_per_iteration": 2.731308698654175 + }, + { + "auxiliary_loss_clip": 0.06513417, + "auxiliary_loss_mlp": 0.01282972, + "balance_loss_clip": 0.06298374, + "balance_loss_mlp": 0.01263147, + "epoch": 0.2660453930557643, + "flos": 11514115025280.0, + "grad_norm": 1.8953438163908696, + "language_loss": 0.7980895, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.87605333, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19824219, + "step": 4425, + "time_per_iteration": 2.536639928817749 + }, + { + "auxiliary_loss_clip": 0.0650531, + "auxiliary_loss_mlp": 0.01275945, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01257837, + "epoch": 0.26610551630843227, + "flos": 24798692096640.0, + "grad_norm": 1.624984400061838, + "language_loss": 0.81150436, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.88931698, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4426, + "time_per_iteration": 2.55570912361145 + }, + { + "auxiliary_loss_clip": 0.06512492, + "auxiliary_loss_mlp": 0.01281328, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01262302, + "epoch": 0.26616563956110023, + "flos": 27643793955840.0, + "grad_norm": 1.6446869519549492, + "language_loss": 0.77695107, + "learning_rate": 3.442899417008333e-06, + "loss": 0.85488927, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19042969, + "step": 4427, + "time_per_iteration": 2.609236001968384 + }, + { + "auxiliary_loss_clip": 0.06512281, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06306126, + "balance_loss_mlp": 0.01257588, + "epoch": 0.26622576281376825, + "flos": 28369887511680.0, + "grad_norm": 1.5754757805335664, + "language_loss": 0.77615106, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.85402417, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17443848, + "step": 4428, + "time_per_iteration": 2.5886542797088623 + }, + { + "auxiliary_loss_clip": 0.06507164, + "auxiliary_loss_mlp": 0.01273818, + "balance_loss_clip": 0.06292614, + "balance_loss_mlp": 0.0125627, + "epoch": 0.2662858860664362, + "flos": 18047265217920.0, + "grad_norm": 1.9210496781424948, + "language_loss": 0.83184117, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.90965092, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.17541504, + "step": 4429, + "time_per_iteration": 2.5387768745422363 + }, + { + "auxiliary_loss_clip": 0.06512052, + "auxiliary_loss_mlp": 0.01276801, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01256762, + "epoch": 0.2663460093191042, + "flos": 22752163175040.0, + "grad_norm": 1.799497911690532, + "language_loss": 0.73120302, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80909157, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.20043945, + "step": 4430, + "time_per_iteration": 2.6026084423065186 + }, + { + "auxiliary_loss_clip": 0.06508531, + "auxiliary_loss_mlp": 0.0127429, + "balance_loss_clip": 0.06296858, + "balance_loss_mlp": 0.012548, + "epoch": 0.26640613257177215, + "flos": 16514422951680.0, + "grad_norm": 2.040164300856009, + "language_loss": 0.83262235, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91045058, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19482422, + "step": 4431, + "time_per_iteration": 2.5464959144592285 + }, + { + "auxiliary_loss_clip": 0.0651544, + "auxiliary_loss_mlp": 0.01281122, + "balance_loss_clip": 0.06296271, + "balance_loss_mlp": 0.01261488, + "epoch": 0.2664662558244401, + "flos": 23082638878080.0, + "grad_norm": 2.4012085548553537, + "language_loss": 0.76319212, + "learning_rate": 3.44155028679496e-06, + "loss": 0.84115773, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19641113, + "step": 4432, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.06513382, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 0.0629918, + "balance_loss_mlp": 0.01259011, + "epoch": 0.2665263790771081, + "flos": 23776098468480.0, + "grad_norm": 1.7645797084145118, + "language_loss": 0.8352288, + "learning_rate": 3.441280296720154e-06, + "loss": 0.91315603, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.20324707, + "step": 4433, + "time_per_iteration": 2.5431323051452637 + }, + { + "auxiliary_loss_clip": 0.06506403, + "auxiliary_loss_mlp": 0.01279917, + "balance_loss_clip": 0.06294529, + "balance_loss_mlp": 0.01260248, + "epoch": 0.26658650232977604, + "flos": 28008748414080.0, + "grad_norm": 2.0130085710694097, + "language_loss": 0.77006185, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84792507, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.19677734, + "step": 4434, + "time_per_iteration": 2.626286268234253 + }, + { + "auxiliary_loss_clip": 0.06505096, + "auxiliary_loss_mlp": 0.01274565, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255563, + "epoch": 0.266646625582444, + "flos": 22170147914880.0, + "grad_norm": 1.9216331890087734, + "language_loss": 0.82643783, + "learning_rate": 3.440740152620301e-06, + "loss": 0.90423441, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.18994141, + "step": 4435, + "time_per_iteration": 2.519731283187866 + }, + { + "auxiliary_loss_clip": 0.06515168, + "auxiliary_loss_mlp": 0.01287569, + "balance_loss_clip": 0.06296054, + "balance_loss_mlp": 0.01267065, + "epoch": 0.266706748835112, + "flos": 27860687049600.0, + "grad_norm": 2.5550616111147257, + "language_loss": 0.88173652, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95976388, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2052002, + "step": 4436, + "time_per_iteration": 2.5790481567382812 + }, + { + "auxiliary_loss_clip": 0.0650726, + "auxiliary_loss_mlp": 0.01276794, + "balance_loss_clip": 0.0629128, + "balance_loss_mlp": 0.01258507, + "epoch": 0.26676687208777994, + "flos": 25819231299840.0, + "grad_norm": 5.920609689832761, + "language_loss": 0.79025435, + "learning_rate": 3.440199789988407e-06, + "loss": 0.86809486, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1829834, + "step": 4437, + "time_per_iteration": 3.9761762619018555 + }, + { + "auxiliary_loss_clip": 0.06508271, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06295269, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2668269953404479, + "flos": 36073399207680.0, + "grad_norm": 3.5501154130665333, + "language_loss": 0.64866304, + "learning_rate": 3.439929526748556e-06, + "loss": 0.72648954, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18322754, + "step": 4438, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.0650841, + "auxiliary_loss_mlp": 0.01282243, + "balance_loss_clip": 0.0629243, + "balance_loss_mlp": 0.01263015, + "epoch": 0.26688711859311587, + "flos": 26576994499200.0, + "grad_norm": 1.9779853569110368, + "language_loss": 0.76120412, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83911061, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1920166, + "step": 4439, + "time_per_iteration": 2.5468099117279053 + }, + { + "auxiliary_loss_clip": 0.06509372, + "auxiliary_loss_mlp": 0.01279302, + "balance_loss_clip": 0.06293344, + "balance_loss_mlp": 0.01259156, + "epoch": 0.26694724184578383, + "flos": 26768968202880.0, + "grad_norm": 1.7452542153948158, + "language_loss": 0.71747917, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.79536593, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20141602, + "step": 4440, + "time_per_iteration": 2.5845727920532227 + }, + { + "auxiliary_loss_clip": 0.06513558, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06297302, + "balance_loss_mlp": 0.01258003, + "epoch": 0.2670073650984518, + "flos": 20965894634880.0, + "grad_norm": 2.018310090260772, + "language_loss": 0.67180222, + "learning_rate": 3.439118409456376e-06, + "loss": 0.74972624, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.20837402, + "step": 4441, + "time_per_iteration": 4.018662691116333 + }, + { + "auxiliary_loss_clip": 0.06511593, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06295494, + "balance_loss_mlp": 0.01260692, + "epoch": 0.2670674883511198, + "flos": 28373577091200.0, + "grad_norm": 1.7028334543675463, + "language_loss": 0.77360296, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.8515327, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20690918, + "step": 4442, + "time_per_iteration": 2.613529682159424 + }, + { + "auxiliary_loss_clip": 0.06397913, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06295023, + "balance_loss_mlp": 0.01259818, + "epoch": 0.2671276116037878, + "flos": 58989010970880.0, + "grad_norm": 0.9159689493293411, + "language_loss": 0.61561328, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.6922372, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04653931, + "step": 4443, + "time_per_iteration": 4.460381031036377 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01278926, + "balance_loss_clip": 0.06294855, + "balance_loss_mlp": 0.0126021, + "epoch": 0.26718773485645575, + "flos": 43955132538240.0, + "grad_norm": 8.593795125602613, + "language_loss": 0.76795793, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.845855, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.18725586, + "step": 4444, + "time_per_iteration": 2.7442104816436768 + }, + { + "auxiliary_loss_clip": 0.06512623, + "auxiliary_loss_mlp": 0.0127732, + "balance_loss_clip": 0.06297334, + "balance_loss_mlp": 0.01256255, + "epoch": 0.2672478581091237, + "flos": 25235329322880.0, + "grad_norm": 2.0392997213265867, + "language_loss": 0.81111336, + "learning_rate": 3.438036155780158e-06, + "loss": 0.88901269, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21057129, + "step": 4445, + "time_per_iteration": 2.5493359565734863 + }, + { + "auxiliary_loss_clip": 0.06511448, + "auxiliary_loss_mlp": 0.01275318, + "balance_loss_clip": 0.0629541, + "balance_loss_mlp": 0.01256054, + "epoch": 0.2673079813617917, + "flos": 15273594564480.0, + "grad_norm": 1.8279407549944744, + "language_loss": 0.89906365, + "learning_rate": 3.43776545600926e-06, + "loss": 0.97693127, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19262695, + "step": 4446, + "time_per_iteration": 2.536916971206665 + }, + { + "auxiliary_loss_clip": 0.06512347, + "auxiliary_loss_mlp": 0.01275408, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256894, + "epoch": 0.26736810461445965, + "flos": 25819944059520.0, + "grad_norm": 1.8969857257431861, + "language_loss": 0.68977708, + "learning_rate": 3.437494701718153e-06, + "loss": 0.76765466, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18518066, + "step": 4447, + "time_per_iteration": 4.071701526641846 + }, + { + "auxiliary_loss_clip": 0.06511723, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 0.06295793, + "balance_loss_mlp": 0.01259116, + "epoch": 0.2674282278671276, + "flos": 24318981072000.0, + "grad_norm": 1.8615578685879888, + "language_loss": 0.83522677, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.91313618, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2010498, + "step": 4448, + "time_per_iteration": 2.581207036972046 + }, + { + "auxiliary_loss_clip": 0.06506026, + "auxiliary_loss_mlp": 0.0127612, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2674883511197956, + "flos": 22821330320640.0, + "grad_norm": 1.5806903023960923, + "language_loss": 0.84385109, + "learning_rate": 3.436953029616378e-06, + "loss": 0.92167258, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19262695, + "step": 4449, + "time_per_iteration": 2.556368827819824 + }, + { + "auxiliary_loss_clip": 0.06523807, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 0.06298804, + "balance_loss_mlp": 0.01256679, + "epoch": 0.26754847437246354, + "flos": 25376514652800.0, + "grad_norm": 2.5106466446094275, + "language_loss": 0.84170121, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.91972435, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 4450, + "time_per_iteration": 2.540792465209961 + }, + { + "auxiliary_loss_clip": 0.06503032, + "auxiliary_loss_mlp": 0.01274274, + "balance_loss_clip": 0.06293193, + "balance_loss_mlp": 0.01255248, + "epoch": 0.2676085976251315, + "flos": 20236698478080.0, + "grad_norm": 1.7838817445044992, + "language_loss": 0.81239712, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8901701, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19042969, + "step": 4451, + "time_per_iteration": 2.552764892578125 + }, + { + "auxiliary_loss_clip": 0.06515267, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06304526, + "balance_loss_mlp": 0.01257324, + "epoch": 0.26766872087779947, + "flos": 28045784718720.0, + "grad_norm": 1.859886698365648, + "language_loss": 0.87156057, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94947314, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 4452, + "time_per_iteration": 2.580838918685913 + }, + { + "auxiliary_loss_clip": 0.06515863, + "auxiliary_loss_mlp": 0.01278142, + "balance_loss_clip": 0.06301846, + "balance_loss_mlp": 0.01258377, + "epoch": 0.26772884413046744, + "flos": 18329803585920.0, + "grad_norm": 2.0572254627861577, + "language_loss": 0.84003425, + "learning_rate": 3.435869031622194e-06, + "loss": 0.91797435, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19775391, + "step": 4453, + "time_per_iteration": 2.5120368003845215 + }, + { + "auxiliary_loss_clip": 0.06513035, + "auxiliary_loss_mlp": 0.01281566, + "balance_loss_clip": 0.06298169, + "balance_loss_mlp": 0.01261992, + "epoch": 0.2677889673831354, + "flos": 22134075932160.0, + "grad_norm": 1.66096029715733, + "language_loss": 0.79950684, + "learning_rate": 3.435597895977208e-06, + "loss": 0.87745285, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19580078, + "step": 4454, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.06518991, + "auxiliary_loss_mlp": 0.0127963, + "balance_loss_clip": 0.0630191, + "balance_loss_mlp": 0.01259949, + "epoch": 0.2678490906358034, + "flos": 23736001489920.0, + "grad_norm": 1.4726826789128313, + "language_loss": 0.72626883, + "learning_rate": 3.435326705894206e-06, + "loss": 0.80425501, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.19689941, + "step": 4455, + "time_per_iteration": 2.600341558456421 + }, + { + "auxiliary_loss_clip": 0.0650526, + "auxiliary_loss_mlp": 0.01280807, + "balance_loss_clip": 0.06295176, + "balance_loss_mlp": 0.01262675, + "epoch": 0.2679092138884714, + "flos": 21769414963200.0, + "grad_norm": 1.6724393178855028, + "language_loss": 0.74066579, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81852639, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18139648, + "step": 4456, + "time_per_iteration": 2.5469894409179688 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01278452, + "balance_loss_clip": 0.06300029, + "balance_loss_mlp": 0.01258127, + "epoch": 0.26796933714113935, + "flos": 19866670848000.0, + "grad_norm": 2.417277333537857, + "language_loss": 0.71260488, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.79059041, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20324707, + "step": 4457, + "time_per_iteration": 2.592397451400757 + }, + { + "auxiliary_loss_clip": 0.06517951, + "auxiliary_loss_mlp": 0.01279854, + "balance_loss_clip": 0.06301091, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2680294603938073, + "flos": 20054116431360.0, + "grad_norm": 2.0107664890053143, + "language_loss": 0.79466271, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87264079, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20666504, + "step": 4458, + "time_per_iteration": 2.5134661197662354 + }, + { + "auxiliary_loss_clip": 0.06383923, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06281242, + "balance_loss_mlp": 0.01258718, + "epoch": 0.2680895836464753, + "flos": 72134918334720.0, + "grad_norm": 0.8734266993254428, + "language_loss": 0.5870322, + "learning_rate": 3.434241401387739e-06, + "loss": 0.66350281, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.04437256, + "step": 4459, + "time_per_iteration": 3.2277050018310547 + }, + { + "auxiliary_loss_clip": 0.06506394, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06292672, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26814970689914325, + "flos": 20455310580480.0, + "grad_norm": 1.8403982609946155, + "language_loss": 0.85477257, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.93258202, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.18920898, + "step": 4460, + "time_per_iteration": 2.513317346572876 + }, + { + "auxiliary_loss_clip": 0.06504844, + "auxiliary_loss_mlp": 0.01281285, + "balance_loss_clip": 0.06292892, + "balance_loss_mlp": 0.01261866, + "epoch": 0.2682098301518112, + "flos": 17572459656960.0, + "grad_norm": 1.8133404743184358, + "language_loss": 0.69389015, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 4461, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.06506921, + "auxiliary_loss_mlp": 0.01281085, + "balance_loss_clip": 0.06293105, + "balance_loss_mlp": 0.01260152, + "epoch": 0.2682699534044792, + "flos": 18339237169920.0, + "grad_norm": 1.6584506269914416, + "language_loss": 0.67031932, + "learning_rate": 3.43342685191282e-06, + "loss": 0.74819934, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.20935059, + "step": 4462, + "time_per_iteration": 2.5427775382995605 + }, + { + "auxiliary_loss_clip": 0.06508102, + "auxiliary_loss_mlp": 0.01282385, + "balance_loss_clip": 0.0629629, + "balance_loss_mlp": 0.01263287, + "epoch": 0.26833007665714714, + "flos": 25308311829120.0, + "grad_norm": 1.7808644454945033, + "language_loss": 0.69747704, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.77538192, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19116211, + "step": 4463, + "time_per_iteration": 2.6194493770599365 + }, + { + "auxiliary_loss_clip": 0.06508362, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06291216, + "balance_loss_mlp": 0.0126092, + "epoch": 0.2683901999098151, + "flos": 16104046780800.0, + "grad_norm": 2.9245690778148465, + "language_loss": 0.78600121, + "learning_rate": 3.432883547133931e-06, + "loss": 0.86389446, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20056152, + "step": 4464, + "time_per_iteration": 2.463418483734131 + }, + { + "auxiliary_loss_clip": 0.06508331, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06294504, + "balance_loss_mlp": 0.01262154, + "epoch": 0.2684503231624831, + "flos": 27315414604800.0, + "grad_norm": 1.7531136867378412, + "language_loss": 0.71091688, + "learning_rate": 3.432611813236704e-06, + "loss": 0.78881842, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19665527, + "step": 4465, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.06379254, + "auxiliary_loss_mlp": 0.01259677, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01255094, + "epoch": 0.26851044641515104, + "flos": 71879060292480.0, + "grad_norm": 0.6551429372657154, + "language_loss": 0.52683848, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.60322779, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.04577637, + "step": 4466, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.06507096, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_clip": 0.06291512, + "balance_loss_mlp": 0.01263105, + "epoch": 0.268570569667819, + "flos": 18739676632320.0, + "grad_norm": 10.994589827837663, + "language_loss": 0.74195564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81986099, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20324707, + "step": 4467, + "time_per_iteration": 2.4971463680267334 + }, + { + "auxiliary_loss_clip": 0.06517448, + "auxiliary_loss_mlp": 0.01283031, + "balance_loss_clip": 0.06297839, + "balance_loss_mlp": 0.01264005, + "epoch": 0.268630692920487, + "flos": 18182832324480.0, + "grad_norm": 2.2391086352503504, + "language_loss": 0.81577581, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.89378059, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19042969, + "step": 4468, + "time_per_iteration": 2.547626256942749 + }, + { + "auxiliary_loss_clip": 0.06377872, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06275174, + "balance_loss_mlp": 0.01259552, + "epoch": 0.268690816173155, + "flos": 68754229176960.0, + "grad_norm": 0.8279608156690638, + "language_loss": 0.59413958, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.67056012, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.0461731, + "step": 4469, + "time_per_iteration": 3.2565419673919678 + }, + { + "auxiliary_loss_clip": 0.06507242, + "auxiliary_loss_mlp": 0.01284548, + "balance_loss_clip": 0.06292132, + "balance_loss_mlp": 0.01263304, + "epoch": 0.26875093942582295, + "flos": 23300160877440.0, + "grad_norm": 1.9707129205098373, + "language_loss": 0.8163017, + "learning_rate": 3.431252329084972e-06, + "loss": 0.89421958, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21240234, + "step": 4470, + "time_per_iteration": 2.542893171310425 + }, + { + "auxiliary_loss_clip": 0.06497125, + "auxiliary_loss_mlp": 0.0128145, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.012619, + "epoch": 0.2688110626784909, + "flos": 21549880465920.0, + "grad_norm": 1.5945085425671264, + "language_loss": 0.83326346, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.91104919, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19555664, + "step": 4471, + "time_per_iteration": 2.5213489532470703 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01284463, + "balance_loss_clip": 0.06289607, + "balance_loss_mlp": 0.01264365, + "epoch": 0.2688711859311589, + "flos": 28407804284160.0, + "grad_norm": 1.9607526414443455, + "language_loss": 0.70046443, + "learning_rate": 3.43070815543947e-06, + "loss": 0.77828562, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.20092773, + "step": 4472, + "time_per_iteration": 2.6251678466796875 + }, + { + "auxiliary_loss_clip": 0.06504884, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 0.06293008, + "balance_loss_mlp": 0.01263112, + "epoch": 0.26893130918382685, + "flos": 26002148762880.0, + "grad_norm": 1.9293915951077794, + "language_loss": 0.68364072, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.76151299, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.19213867, + "step": 4473, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.06499921, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01259467, + "epoch": 0.2689914324364948, + "flos": 20345878748160.0, + "grad_norm": 1.608174101079712, + "language_loss": 0.83682281, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91461158, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.19470215, + "step": 4474, + "time_per_iteration": 2.554151773452759 + }, + { + "auxiliary_loss_clip": 0.06502855, + "auxiliary_loss_mlp": 0.01275806, + "balance_loss_clip": 0.06296148, + "balance_loss_mlp": 0.01256482, + "epoch": 0.2690515556891628, + "flos": 19470759505920.0, + "grad_norm": 1.847749203594977, + "language_loss": 0.70725596, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78504252, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.19348145, + "step": 4475, + "time_per_iteration": 2.5116677284240723 + }, + { + "auxiliary_loss_clip": 0.06503256, + "auxiliary_loss_mlp": 0.01277275, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01257188, + "epoch": 0.26911167894183075, + "flos": 18151875440640.0, + "grad_norm": 2.2814450019498236, + "language_loss": 0.73125452, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.80905986, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20092773, + "step": 4476, + "time_per_iteration": 3.923501968383789 + }, + { + "auxiliary_loss_clip": 0.0650249, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06291398, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2691718021944987, + "flos": 19981385487360.0, + "grad_norm": 1.4862356596427981, + "language_loss": 0.80676347, + "learning_rate": 3.429346772085922e-06, + "loss": 0.88453096, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18762207, + "step": 4477, + "time_per_iteration": 2.562681198120117 + }, + { + "auxiliary_loss_clip": 0.06506729, + "auxiliary_loss_mlp": 0.01275723, + "balance_loss_clip": 0.06289821, + "balance_loss_mlp": 0.01254873, + "epoch": 0.2692319254471667, + "flos": 37455622560000.0, + "grad_norm": 1.8507584096301994, + "language_loss": 0.65612036, + "learning_rate": 3.429074332770984e-06, + "loss": 0.73394483, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20861816, + "step": 4478, + "time_per_iteration": 2.6743321418762207 + }, + { + "auxiliary_loss_clip": 0.06505084, + "auxiliary_loss_mlp": 0.01278495, + "balance_loss_clip": 0.06291381, + "balance_loss_mlp": 0.01259242, + "epoch": 0.26929204869983464, + "flos": 22134411348480.0, + "grad_norm": 2.2415663972983864, + "language_loss": 0.81841063, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89624637, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19250488, + "step": 4479, + "time_per_iteration": 2.563365936279297 + }, + { + "auxiliary_loss_clip": 0.06510025, + "auxiliary_loss_mlp": 0.01277354, + "balance_loss_clip": 0.06295313, + "balance_loss_mlp": 0.01258305, + "epoch": 0.2693521719525026, + "flos": 19799055002880.0, + "grad_norm": 1.97047433874797, + "language_loss": 0.81362212, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.89149588, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.19055176, + "step": 4480, + "time_per_iteration": 2.505098342895508 + }, + { + "auxiliary_loss_clip": 0.06504171, + "auxiliary_loss_mlp": 0.01276381, + "balance_loss_clip": 0.06296593, + "balance_loss_mlp": 0.01257677, + "epoch": 0.2694122952051706, + "flos": 21000415317120.0, + "grad_norm": 1.6210366032838512, + "language_loss": 0.7826978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.86050338, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18713379, + "step": 4481, + "time_per_iteration": 4.100890874862671 + }, + { + "auxiliary_loss_clip": 0.06511036, + "auxiliary_loss_mlp": 0.01275006, + "balance_loss_clip": 0.06298155, + "balance_loss_mlp": 0.01254192, + "epoch": 0.2694724184578386, + "flos": 25856519166720.0, + "grad_norm": 1.8924674974759383, + "language_loss": 0.74293458, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.820795, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.20788574, + "step": 4482, + "time_per_iteration": 4.145740747451782 + }, + { + "auxiliary_loss_clip": 0.06511661, + "auxiliary_loss_mlp": 0.01276613, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01256836, + "epoch": 0.26953254171050656, + "flos": 21733594542720.0, + "grad_norm": 2.48131981073459, + "language_loss": 0.72700799, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.80489069, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19763184, + "step": 4483, + "time_per_iteration": 2.5375680923461914 + }, + { + "auxiliary_loss_clip": 0.06523035, + "auxiliary_loss_mlp": 0.01278438, + "balance_loss_clip": 0.0630566, + "balance_loss_mlp": 0.01257994, + "epoch": 0.2695926649631745, + "flos": 19689078119040.0, + "grad_norm": 2.054691934345778, + "language_loss": 0.87485874, + "learning_rate": 3.427438559239605e-06, + "loss": 0.95287347, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20446777, + "step": 4484, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.06515766, + "auxiliary_loss_mlp": 0.01278738, + "balance_loss_clip": 0.06300886, + "balance_loss_mlp": 0.01259474, + "epoch": 0.2696527882158425, + "flos": 32894257847040.0, + "grad_norm": 2.0183728032076966, + "language_loss": 0.66971946, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74766451, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19262695, + "step": 4485, + "time_per_iteration": 2.623896598815918 + }, + { + "auxiliary_loss_clip": 0.06514997, + "auxiliary_loss_mlp": 0.01282999, + "balance_loss_clip": 0.06301111, + "balance_loss_mlp": 0.01262877, + "epoch": 0.26971291146851045, + "flos": 12128806177920.0, + "grad_norm": 3.3281733059389498, + "language_loss": 0.74281263, + "learning_rate": 3.426892868256604e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2010498, + "step": 4486, + "time_per_iteration": 2.525820016860962 + }, + { + "auxiliary_loss_clip": 0.06519947, + "auxiliary_loss_mlp": 0.01289409, + "balance_loss_clip": 0.06302445, + "balance_loss_mlp": 0.01268846, + "epoch": 0.2697730347211784, + "flos": 22640467282560.0, + "grad_norm": 2.8316541967285183, + "language_loss": 0.84592897, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.92402256, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20556641, + "step": 4487, + "time_per_iteration": 3.936244249343872 + }, + { + "auxiliary_loss_clip": 0.06520635, + "auxiliary_loss_mlp": 0.01285695, + "balance_loss_clip": 0.06303369, + "balance_loss_mlp": 0.01264845, + "epoch": 0.2698331579738464, + "flos": 23519695374720.0, + "grad_norm": 2.431656191901387, + "language_loss": 0.73194599, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.81000936, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20861816, + "step": 4488, + "time_per_iteration": 2.522861957550049 + }, + { + "auxiliary_loss_clip": 0.06516892, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 0.06303044, + "balance_loss_mlp": 0.01258681, + "epoch": 0.26989328122651435, + "flos": 24647360423040.0, + "grad_norm": 1.6427618857215789, + "language_loss": 0.84162384, + "learning_rate": 3.426073925998578e-06, + "loss": 0.91957808, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.1986084, + "step": 4489, + "time_per_iteration": 2.558133602142334 + }, + { + "auxiliary_loss_clip": 0.06523076, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01265821, + "epoch": 0.2699534044791823, + "flos": 10775904554880.0, + "grad_norm": 2.0847356564254014, + "language_loss": 0.90199494, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.98009604, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.21228027, + "step": 4490, + "time_per_iteration": 2.461840867996216 + }, + { + "auxiliary_loss_clip": 0.06505966, + "auxiliary_loss_mlp": 0.01275421, + "balance_loss_clip": 0.06297465, + "balance_loss_mlp": 0.01256288, + "epoch": 0.2700135277318503, + "flos": 36180021928320.0, + "grad_norm": 2.13129158363681, + "language_loss": 0.73836827, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.81618214, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19128418, + "step": 4491, + "time_per_iteration": 2.6479640007019043 + }, + { + "auxiliary_loss_clip": 0.06516409, + "auxiliary_loss_mlp": 0.01284517, + "balance_loss_clip": 0.06303698, + "balance_loss_mlp": 0.01264788, + "epoch": 0.27007365098451824, + "flos": 17424020949120.0, + "grad_norm": 2.8438546283757793, + "language_loss": 0.74296927, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.82097852, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 4492, + "time_per_iteration": 2.462226629257202 + }, + { + "auxiliary_loss_clip": 0.06510016, + "auxiliary_loss_mlp": 0.01279369, + "balance_loss_clip": 0.06300159, + "balance_loss_mlp": 0.01259926, + "epoch": 0.2701337742371862, + "flos": 23192448053760.0, + "grad_norm": 1.7359009481863723, + "language_loss": 0.88954818, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.96744204, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19445801, + "step": 4493, + "time_per_iteration": 2.5385639667510986 + }, + { + "auxiliary_loss_clip": 0.06509903, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 0.06296834, + "balance_loss_mlp": 0.01265201, + "epoch": 0.2701938974898542, + "flos": 24396365917440.0, + "grad_norm": 1.3961943163888275, + "language_loss": 0.71571529, + "learning_rate": 3.424707940835998e-06, + "loss": 0.79365045, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1842041, + "step": 4494, + "time_per_iteration": 2.542644500732422 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01282381, + "balance_loss_clip": 0.0629191, + "balance_loss_mlp": 0.01263713, + "epoch": 0.2702540207425222, + "flos": 26221641333120.0, + "grad_norm": 2.6689304552375366, + "language_loss": 0.8697859, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94760156, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.18652344, + "step": 4495, + "time_per_iteration": 2.6052844524383545 + }, + { + "auxiliary_loss_clip": 0.06507061, + "auxiliary_loss_mlp": 0.01284126, + "balance_loss_clip": 0.06293719, + "balance_loss_mlp": 0.01263944, + "epoch": 0.27031414399519016, + "flos": 22932439234560.0, + "grad_norm": 1.7866659337876034, + "language_loss": 0.76608586, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84399772, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20166016, + "step": 4496, + "time_per_iteration": 2.5191855430603027 + }, + { + "auxiliary_loss_clip": 0.06445029, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06340651, + "balance_loss_mlp": 0.01257498, + "epoch": 0.2703742672478581, + "flos": 63037904912640.0, + "grad_norm": 0.6591771406427821, + "language_loss": 0.49976462, + "learning_rate": 3.423887701354754e-06, + "loss": 0.57683551, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.0456543, + "step": 4497, + "time_per_iteration": 3.2403736114501953 + }, + { + "auxiliary_loss_clip": 0.06506558, + "auxiliary_loss_mlp": 0.01283587, + "balance_loss_clip": 0.06295481, + "balance_loss_mlp": 0.01266039, + "epoch": 0.2704343905005261, + "flos": 18846341280000.0, + "grad_norm": 2.8639988273107657, + "language_loss": 0.72431815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80221957, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17553711, + "step": 4498, + "time_per_iteration": 2.509298086166382 + }, + { + "auxiliary_loss_clip": 0.06432115, + "auxiliary_loss_mlp": 0.01259251, + "balance_loss_clip": 0.06327531, + "balance_loss_mlp": 0.01254679, + "epoch": 0.27049451375319405, + "flos": 71253635817600.0, + "grad_norm": 0.9422572009255263, + "language_loss": 0.5900467, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.66696036, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04577637, + "step": 4499, + "time_per_iteration": 3.2116270065307617 + }, + { + "auxiliary_loss_clip": 0.06502165, + "auxiliary_loss_mlp": 0.01281307, + "balance_loss_clip": 0.06292122, + "balance_loss_mlp": 0.01261422, + "epoch": 0.270554637005862, + "flos": 24285257003520.0, + "grad_norm": 2.589715304320551, + "language_loss": 0.73975158, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19897461, + "step": 4500, + "time_per_iteration": 2.537710189819336 + }, + { + "auxiliary_loss_clip": 0.06501484, + "auxiliary_loss_mlp": 0.01276741, + "balance_loss_clip": 0.06289591, + "balance_loss_mlp": 0.01257965, + "epoch": 0.27061476025853, + "flos": 17636889047040.0, + "grad_norm": 2.788947169536346, + "language_loss": 0.81470346, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.89248574, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18774414, + "step": 4501, + "time_per_iteration": 2.5423648357391357 + }, + { + "auxiliary_loss_clip": 0.06510358, + "auxiliary_loss_mlp": 0.01287368, + "balance_loss_clip": 0.06294559, + "balance_loss_mlp": 0.01267579, + "epoch": 0.27067488351119795, + "flos": 22716594316800.0, + "grad_norm": 1.5278818221734496, + "language_loss": 0.7303015, + "learning_rate": 3.422519555811735e-06, + "loss": 0.8082788, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.19775391, + "step": 4502, + "time_per_iteration": 2.5804011821746826 + }, + { + "auxiliary_loss_clip": 0.06507368, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06289332, + "balance_loss_mlp": 0.01258576, + "epoch": 0.2707350067638659, + "flos": 41729333806080.0, + "grad_norm": 1.6949775973694576, + "language_loss": 0.69090897, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.76876605, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19763184, + "step": 4503, + "time_per_iteration": 2.740292549133301 + }, + { + "auxiliary_loss_clip": 0.06502387, + "auxiliary_loss_mlp": 0.0128307, + "balance_loss_clip": 0.06290283, + "balance_loss_mlp": 0.01263746, + "epoch": 0.2707951300165339, + "flos": 20199159048960.0, + "grad_norm": 1.9752400870870641, + "language_loss": 0.69172543, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.76958001, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1932373, + "step": 4504, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.06502561, + "auxiliary_loss_mlp": 0.0128216, + "balance_loss_clip": 0.06291538, + "balance_loss_mlp": 0.01263492, + "epoch": 0.27085525326920185, + "flos": 21440364779520.0, + "grad_norm": 2.9855030089462993, + "language_loss": 0.76122642, + "learning_rate": 3.421698021097902e-06, + "loss": 0.8390736, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18652344, + "step": 4505, + "time_per_iteration": 2.527165651321411 + }, + { + "auxiliary_loss_clip": 0.06505956, + "auxiliary_loss_mlp": 0.0128432, + "balance_loss_clip": 0.06289993, + "balance_loss_mlp": 0.01264459, + "epoch": 0.2709153765218698, + "flos": 17680885240320.0, + "grad_norm": 2.0693026918396487, + "language_loss": 0.73959178, + "learning_rate": 3.42142406835758e-06, + "loss": 0.81749451, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1986084, + "step": 4506, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.0650361, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 0.06290495, + "balance_loss_mlp": 0.01258595, + "epoch": 0.2709754997745378, + "flos": 24462136972800.0, + "grad_norm": 1.8128724600792683, + "language_loss": 0.81647539, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89429414, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1965332, + "step": 4507, + "time_per_iteration": 2.684535503387451 + }, + { + "auxiliary_loss_clip": 0.06395597, + "auxiliary_loss_mlp": 0.01254395, + "balance_loss_clip": 0.0629042, + "balance_loss_mlp": 0.01250205, + "epoch": 0.2710356230272058, + "flos": 65229602232960.0, + "grad_norm": 0.712447813073055, + "language_loss": 0.50718415, + "learning_rate": 3.420876001185698e-06, + "loss": 0.58368409, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04193115, + "step": 4508, + "time_per_iteration": 3.111752986907959 + }, + { + "auxiliary_loss_clip": 0.0649793, + "auxiliary_loss_mlp": 0.01272465, + "balance_loss_clip": 0.06289998, + "balance_loss_mlp": 0.01255263, + "epoch": 0.27109574627987376, + "flos": 25491606635520.0, + "grad_norm": 2.0258218163980213, + "language_loss": 0.75015354, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.82785749, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.171875, + "step": 4509, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.06495094, + "auxiliary_loss_mlp": 0.01275639, + "balance_loss_clip": 0.06289092, + "balance_loss_mlp": 0.01256947, + "epoch": 0.2711558695325417, + "flos": 19688910410880.0, + "grad_norm": 2.3712253737099767, + "language_loss": 0.71864915, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.79635644, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18688965, + "step": 4510, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06499062, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.0629103, + "balance_loss_mlp": 0.012608, + "epoch": 0.2712159927852097, + "flos": 18593627765760.0, + "grad_norm": 2.5496745820614515, + "language_loss": 0.71357799, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.791363, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.1862793, + "step": 4511, + "time_per_iteration": 2.483739137649536 + }, + { + "auxiliary_loss_clip": 0.06505338, + "auxiliary_loss_mlp": 0.01274141, + "balance_loss_clip": 0.06292383, + "balance_loss_mlp": 0.01254817, + "epoch": 0.27127611603787766, + "flos": 25637403939840.0, + "grad_norm": 1.9202075405224084, + "language_loss": 0.81604505, + "learning_rate": 3.419779220367979e-06, + "loss": 0.89383984, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1932373, + "step": 4512, + "time_per_iteration": 2.593388795852661 + }, + { + "auxiliary_loss_clip": 0.06503928, + "auxiliary_loss_mlp": 0.01273233, + "balance_loss_clip": 0.06296667, + "balance_loss_mlp": 0.01255554, + "epoch": 0.2713362392905456, + "flos": 23155663311360.0, + "grad_norm": 1.8072498717910284, + "language_loss": 0.809147, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88691866, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.17663574, + "step": 4513, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.06501831, + "auxiliary_loss_mlp": 0.01278947, + "balance_loss_clip": 0.0628939, + "balance_loss_mlp": 0.01261018, + "epoch": 0.2713963625432136, + "flos": 18371409937920.0, + "grad_norm": 3.81368034370299, + "language_loss": 0.88867396, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.96648169, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17932129, + "step": 4514, + "time_per_iteration": 2.54484224319458 + }, + { + "auxiliary_loss_clip": 0.06502509, + "auxiliary_loss_mlp": 0.01277056, + "balance_loss_clip": 0.06292502, + "balance_loss_mlp": 0.01258709, + "epoch": 0.27145648579588155, + "flos": 22498275703680.0, + "grad_norm": 1.610354502574947, + "language_loss": 0.92402363, + "learning_rate": 3.418956069417517e-06, + "loss": 1.00181937, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18347168, + "step": 4515, + "time_per_iteration": 2.5121350288391113 + }, + { + "auxiliary_loss_clip": 0.06511631, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06296228, + "balance_loss_mlp": 0.01259669, + "epoch": 0.2715166090485495, + "flos": 19244265120000.0, + "grad_norm": 2.423654901761582, + "language_loss": 0.73979908, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.81772685, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21435547, + "step": 4516, + "time_per_iteration": 3.917318344116211 + }, + { + "auxiliary_loss_clip": 0.06498563, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289151, + "balance_loss_mlp": 0.01253627, + "epoch": 0.2715767323012175, + "flos": 17714902798080.0, + "grad_norm": 1.854313921742246, + "language_loss": 0.76927733, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84699214, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19287109, + "step": 4517, + "time_per_iteration": 2.576723098754883 + }, + { + "auxiliary_loss_clip": 0.06500702, + "auxiliary_loss_mlp": 0.01276287, + "balance_loss_clip": 0.06291518, + "balance_loss_mlp": 0.01256701, + "epoch": 0.27163685555388545, + "flos": 22389430849920.0, + "grad_norm": 2.0334929641517956, + "language_loss": 0.7833634, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.86113334, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19592285, + "step": 4518, + "time_per_iteration": 2.5335004329681396 + }, + { + "auxiliary_loss_clip": 0.06502728, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06292961, + "balance_loss_mlp": 0.0125925, + "epoch": 0.2716969788065534, + "flos": 22353358867200.0, + "grad_norm": 1.6261203259974584, + "language_loss": 0.68873644, + "learning_rate": 3.41785778156811e-06, + "loss": 0.76653063, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17443848, + "step": 4519, + "time_per_iteration": 2.60939359664917 + }, + { + "auxiliary_loss_clip": 0.06500532, + "auxiliary_loss_mlp": 0.0127723, + "balance_loss_clip": 0.06291862, + "balance_loss_mlp": 0.01260302, + "epoch": 0.2717571020592214, + "flos": 25235497031040.0, + "grad_norm": 1.9620818548787327, + "language_loss": 0.75925875, + "learning_rate": 3.417583075166451e-06, + "loss": 0.83703637, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16931152, + "step": 4520, + "time_per_iteration": 3.988518238067627 + }, + { + "auxiliary_loss_clip": 0.06503896, + "auxiliary_loss_mlp": 0.012736, + "balance_loss_clip": 0.06291716, + "balance_loss_mlp": 0.01253942, + "epoch": 0.2718172253118894, + "flos": 20195343688320.0, + "grad_norm": 3.05783023991908, + "language_loss": 0.76690799, + "learning_rate": 3.4173083150099e-06, + "loss": 0.84468293, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1965332, + "step": 4521, + "time_per_iteration": 3.9463987350463867 + }, + { + "auxiliary_loss_clip": 0.0650706, + "auxiliary_loss_mlp": 0.0127528, + "balance_loss_clip": 0.06291709, + "balance_loss_mlp": 0.01255432, + "epoch": 0.27187734856455736, + "flos": 14324318858880.0, + "grad_norm": 2.0792585055499435, + "language_loss": 0.74927616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.82709956, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19824219, + "step": 4522, + "time_per_iteration": 2.576792001724243 + }, + { + "auxiliary_loss_clip": 0.06503602, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06291734, + "balance_loss_mlp": 0.01258884, + "epoch": 0.27193747181722533, + "flos": 21114375269760.0, + "grad_norm": 1.7974712998396492, + "language_loss": 0.73055947, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80836433, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17993164, + "step": 4523, + "time_per_iteration": 2.5116758346557617 + }, + { + "auxiliary_loss_clip": 0.06493908, + "auxiliary_loss_mlp": 0.01278011, + "balance_loss_clip": 0.06286807, + "balance_loss_mlp": 0.01259665, + "epoch": 0.2719975950698933, + "flos": 19688910410880.0, + "grad_norm": 1.3231652709358832, + "language_loss": 0.74779463, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82551384, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.18334961, + "step": 4524, + "time_per_iteration": 2.5318901538848877 + }, + { + "auxiliary_loss_clip": 0.06503987, + "auxiliary_loss_mlp": 0.01277059, + "balance_loss_clip": 0.06291917, + "balance_loss_mlp": 0.01258248, + "epoch": 0.27205771832256126, + "flos": 24761488083840.0, + "grad_norm": 2.222226091972884, + "language_loss": 0.76783192, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.84564239, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18811035, + "step": 4525, + "time_per_iteration": 2.594209909439087 + }, + { + "auxiliary_loss_clip": 0.06492639, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.0628486, + "balance_loss_mlp": 0.01254712, + "epoch": 0.2721178415752292, + "flos": 21760903774080.0, + "grad_norm": 1.8877793172534498, + "language_loss": 0.82166058, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.89930463, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17041016, + "step": 4526, + "time_per_iteration": 3.9739785194396973 + }, + { + "auxiliary_loss_clip": 0.06510428, + "auxiliary_loss_mlp": 0.01273954, + "balance_loss_clip": 0.06292043, + "balance_loss_mlp": 0.01254189, + "epoch": 0.2721779648278972, + "flos": 12681667416960.0, + "grad_norm": 2.608637418907724, + "language_loss": 0.77407986, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.8519237, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19763184, + "step": 4527, + "time_per_iteration": 2.5017969608306885 + }, + { + "auxiliary_loss_clip": 0.06502572, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.0629287, + "balance_loss_mlp": 0.01260194, + "epoch": 0.27223808808056515, + "flos": 16258774544640.0, + "grad_norm": 2.1231016049423608, + "language_loss": 0.82676923, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90457952, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18249512, + "step": 4528, + "time_per_iteration": 2.5011186599731445 + }, + { + "auxiliary_loss_clip": 0.06500327, + "auxiliary_loss_mlp": 0.012781, + "balance_loss_clip": 0.06293638, + "balance_loss_mlp": 0.01260064, + "epoch": 0.2722982113332331, + "flos": 27753225788160.0, + "grad_norm": 1.6573852241711216, + "language_loss": 0.77553773, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85332191, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18041992, + "step": 4529, + "time_per_iteration": 2.5810396671295166 + }, + { + "auxiliary_loss_clip": 0.06499013, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01259055, + "epoch": 0.2723583345859011, + "flos": 21732756001920.0, + "grad_norm": 2.1115027178358354, + "language_loss": 0.82665265, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.90441489, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18164062, + "step": 4530, + "time_per_iteration": 2.586454391479492 + }, + { + "auxiliary_loss_clip": 0.06502904, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06295159, + "balance_loss_mlp": 0.0126379, + "epoch": 0.27241845783856905, + "flos": 17352925159680.0, + "grad_norm": 2.154635693147181, + "language_loss": 0.92694783, + "learning_rate": 3.4145577592184838e-06, + "loss": 1.0048002, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18530273, + "step": 4531, + "time_per_iteration": 2.5160703659057617 + }, + { + "auxiliary_loss_clip": 0.06501545, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01257928, + "epoch": 0.272478581091237, + "flos": 24761278448640.0, + "grad_norm": 1.903467624841223, + "language_loss": 0.76781744, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84559143, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17919922, + "step": 4532, + "time_per_iteration": 2.568319082260132 + }, + { + "auxiliary_loss_clip": 0.06500092, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.0125448, + "epoch": 0.272538704343905, + "flos": 17895723909120.0, + "grad_norm": 2.5230523304945685, + "language_loss": 0.89717656, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97489792, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17565918, + "step": 4533, + "time_per_iteration": 2.538637399673462 + }, + { + "auxiliary_loss_clip": 0.06497633, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06294405, + "balance_loss_mlp": 0.01255559, + "epoch": 0.272598827596573, + "flos": 22939021779840.0, + "grad_norm": 1.9282389689502992, + "language_loss": 0.72213519, + "learning_rate": 3.413731546022929e-06, + "loss": 0.79983306, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16589355, + "step": 4534, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.06500763, + "auxiliary_loss_mlp": 0.01275564, + "balance_loss_clip": 0.06290451, + "balance_loss_mlp": 0.01255847, + "epoch": 0.27265895084924097, + "flos": 24244447265280.0, + "grad_norm": 1.8514773269853142, + "language_loss": 0.91784394, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99560714, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.19702148, + "step": 4535, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.06506651, + "auxiliary_loss_mlp": 0.01276542, + "balance_loss_clip": 0.06297188, + "balance_loss_mlp": 0.01258768, + "epoch": 0.27271907410190893, + "flos": 27019962708480.0, + "grad_norm": 1.7799258806344853, + "language_loss": 0.73195565, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80978757, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.17773438, + "step": 4536, + "time_per_iteration": 2.5590782165527344 + }, + { + "auxiliary_loss_clip": 0.06502935, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.0629502, + "balance_loss_mlp": 0.01257351, + "epoch": 0.2727791973545769, + "flos": 34460027568000.0, + "grad_norm": 1.8462150885541477, + "language_loss": 0.72167033, + "learning_rate": 3.41290485034781e-06, + "loss": 0.79945225, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17907715, + "step": 4537, + "time_per_iteration": 2.680515766143799 + }, + { + "auxiliary_loss_clip": 0.06501988, + "auxiliary_loss_mlp": 0.01276469, + "balance_loss_clip": 0.06293489, + "balance_loss_mlp": 0.0125829, + "epoch": 0.27283932060724486, + "flos": 15045842367360.0, + "grad_norm": 2.3888098238231503, + "language_loss": 0.78421736, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8620019, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.1817627, + "step": 4538, + "time_per_iteration": 2.4626059532165527 + }, + { + "auxiliary_loss_clip": 0.06506806, + "auxiliary_loss_mlp": 0.01275863, + "balance_loss_clip": 0.06298484, + "balance_loss_mlp": 0.01258566, + "epoch": 0.2728994438599128, + "flos": 21658767246720.0, + "grad_norm": 1.6357140094020364, + "language_loss": 0.90640903, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9842357, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17297363, + "step": 4539, + "time_per_iteration": 2.5629584789276123 + }, + { + "auxiliary_loss_clip": 0.06501281, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06294584, + "balance_loss_mlp": 0.01253778, + "epoch": 0.2729595671125808, + "flos": 17493313875840.0, + "grad_norm": 1.7229738452441967, + "language_loss": 0.88610893, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.96385098, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.19140625, + "step": 4540, + "time_per_iteration": 2.4959304332733154 + }, + { + "auxiliary_loss_clip": 0.06504017, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06294081, + "balance_loss_mlp": 0.0125744, + "epoch": 0.27301969036524876, + "flos": 19324249441920.0, + "grad_norm": 2.2191409784662, + "language_loss": 0.8242712, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.17053223, + "step": 4541, + "time_per_iteration": 2.550239086151123 + }, + { + "auxiliary_loss_clip": 0.06500127, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06291916, + "balance_loss_mlp": 0.01260431, + "epoch": 0.2730798136179167, + "flos": 21071427252480.0, + "grad_norm": 2.3060281935178795, + "language_loss": 0.80131608, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18664551, + "step": 4542, + "time_per_iteration": 2.519717216491699 + }, + { + "auxiliary_loss_clip": 0.06509651, + "auxiliary_loss_mlp": 0.01276731, + "balance_loss_clip": 0.06301565, + "balance_loss_mlp": 0.01258599, + "epoch": 0.2731399368705847, + "flos": 19177739377920.0, + "grad_norm": 1.9524817452008785, + "language_loss": 0.89606124, + "learning_rate": 3.411250012687582e-06, + "loss": 0.97392499, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18139648, + "step": 4543, + "time_per_iteration": 2.5182156562805176 + }, + { + "auxiliary_loss_clip": 0.06509942, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06297313, + "balance_loss_mlp": 0.012604, + "epoch": 0.27320006012325265, + "flos": 18294989414400.0, + "grad_norm": 2.101118642115193, + "language_loss": 0.64112943, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7190212, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.18823242, + "step": 4544, + "time_per_iteration": 2.482348918914795 + }, + { + "auxiliary_loss_clip": 0.06504791, + "auxiliary_loss_mlp": 0.01282982, + "balance_loss_clip": 0.06296986, + "balance_loss_mlp": 0.01264231, + "epoch": 0.2732601833759206, + "flos": 34869607125120.0, + "grad_norm": 1.6845842729353224, + "language_loss": 0.70290005, + "learning_rate": 3.410697971904651e-06, + "loss": 0.78077781, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.1875, + "step": 4545, + "time_per_iteration": 2.6779940128326416 + }, + { + "auxiliary_loss_clip": 0.06375119, + "auxiliary_loss_mlp": 0.01256033, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01252296, + "epoch": 0.2733203066285886, + "flos": 53929514534400.0, + "grad_norm": 0.7176798913576009, + "language_loss": 0.61676908, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6930806, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03729248, + "step": 4546, + "time_per_iteration": 3.1508243083953857 + }, + { + "auxiliary_loss_clip": 0.06510071, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06301852, + "balance_loss_mlp": 0.01258843, + "epoch": 0.2733804298812566, + "flos": 20665411493760.0, + "grad_norm": 1.9095347334938924, + "language_loss": 0.65170372, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72958136, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.1887207, + "step": 4547, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.06498976, + "auxiliary_loss_mlp": 0.0127425, + "balance_loss_clip": 0.06296893, + "balance_loss_mlp": 0.01257799, + "epoch": 0.27344055313392457, + "flos": 25891333338240.0, + "grad_norm": 2.438857151480637, + "language_loss": 0.78365928, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.86139154, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.16455078, + "step": 4548, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.0650417, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06295689, + "balance_loss_mlp": 0.01259785, + "epoch": 0.27350067638659253, + "flos": 22936380376320.0, + "grad_norm": 2.3129649243249157, + "language_loss": 0.83350241, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91131258, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17053223, + "step": 4549, + "time_per_iteration": 2.560349941253662 + }, + { + "auxiliary_loss_clip": 0.06503863, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06292209, + "balance_loss_mlp": 0.0125707, + "epoch": 0.2735607996392605, + "flos": 16579313539200.0, + "grad_norm": 2.1355332193902568, + "language_loss": 0.71687186, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.79468852, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.20727539, + "step": 4550, + "time_per_iteration": 2.4829771518707275 + }, + { + "auxiliary_loss_clip": 0.06503724, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06298332, + "balance_loss_mlp": 0.01253435, + "epoch": 0.27362092289192846, + "flos": 19651245200640.0, + "grad_norm": 2.4590448673698546, + "language_loss": 0.79561722, + "learning_rate": 3.409040566039563e-06, + "loss": 0.87337267, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.18371582, + "step": 4551, + "time_per_iteration": 2.5074269771575928 + }, + { + "auxiliary_loss_clip": 0.06500211, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 0.06290769, + "balance_loss_mlp": 0.01263565, + "epoch": 0.27368104614459643, + "flos": 17644855184640.0, + "grad_norm": 2.2858009613836465, + "language_loss": 0.71362597, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.79144663, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.18286133, + "step": 4552, + "time_per_iteration": 2.478208541870117 + }, + { + "auxiliary_loss_clip": 0.0650662, + "auxiliary_loss_mlp": 0.01277463, + "balance_loss_clip": 0.06295393, + "balance_loss_mlp": 0.01258759, + "epoch": 0.2737411693972644, + "flos": 21586455573120.0, + "grad_norm": 1.8660820035104149, + "language_loss": 0.71756262, + "learning_rate": 3.408487669858431e-06, + "loss": 0.79540348, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18701172, + "step": 4553, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.0650337, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06293483, + "balance_loss_mlp": 0.01255738, + "epoch": 0.27380129264993236, + "flos": 25491145438080.0, + "grad_norm": 1.7561499880950933, + "language_loss": 0.60065031, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.67843306, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.19177246, + "step": 4554, + "time_per_iteration": 2.5836522579193115 + }, + { + "auxiliary_loss_clip": 0.06509934, + "auxiliary_loss_mlp": 0.01281174, + "balance_loss_clip": 0.06291255, + "balance_loss_mlp": 0.01261838, + "epoch": 0.2738614159026003, + "flos": 18667155323520.0, + "grad_norm": 1.5632450212680145, + "language_loss": 0.74850649, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.82641757, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1932373, + "step": 4555, + "time_per_iteration": 3.9590039253234863 + }, + { + "auxiliary_loss_clip": 0.06511028, + "auxiliary_loss_mlp": 0.01279514, + "balance_loss_clip": 0.0629926, + "balance_loss_mlp": 0.0125982, + "epoch": 0.2739215391552683, + "flos": 23483874954240.0, + "grad_norm": 6.994475758797384, + "language_loss": 0.7822473, + "learning_rate": 3.407657925038002e-06, + "loss": 0.86015272, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19677734, + "step": 4556, + "time_per_iteration": 2.5688674449920654 + }, + { + "auxiliary_loss_clip": 0.06517123, + "auxiliary_loss_mlp": 0.01280796, + "balance_loss_clip": 0.06293104, + "balance_loss_mlp": 0.01260125, + "epoch": 0.27398166240793626, + "flos": 17134313057280.0, + "grad_norm": 1.8677949115203087, + "language_loss": 0.83077759, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.90875673, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.20690918, + "step": 4557, + "time_per_iteration": 2.490562915802002 + }, + { + "auxiliary_loss_clip": 0.06504503, + "auxiliary_loss_mlp": 0.01276773, + "balance_loss_clip": 0.06292793, + "balance_loss_mlp": 0.01256292, + "epoch": 0.2740417856606042, + "flos": 23411563280640.0, + "grad_norm": 1.9738441909854203, + "language_loss": 0.73066616, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.80847895, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.20483398, + "step": 4558, + "time_per_iteration": 2.5761232376098633 + }, + { + "auxiliary_loss_clip": 0.06504066, + "auxiliary_loss_mlp": 0.01276845, + "balance_loss_clip": 0.06292865, + "balance_loss_mlp": 0.01256651, + "epoch": 0.2741019089132722, + "flos": 12784307068800.0, + "grad_norm": 2.149984670873407, + "language_loss": 0.68751299, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76532209, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.2019043, + "step": 4559, + "time_per_iteration": 2.4976439476013184 + }, + { + "auxiliary_loss_clip": 0.06501673, + "auxiliary_loss_mlp": 0.01278249, + "balance_loss_clip": 0.0629222, + "balance_loss_mlp": 0.01259676, + "epoch": 0.27416203216594015, + "flos": 20637850700160.0, + "grad_norm": 1.7403202614473876, + "language_loss": 0.72741163, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.80521083, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18566895, + "step": 4560, + "time_per_iteration": 4.005557537078857 + }, + { + "auxiliary_loss_clip": 0.06501405, + "auxiliary_loss_mlp": 0.01278052, + "balance_loss_clip": 0.06289977, + "balance_loss_mlp": 0.01259718, + "epoch": 0.27422215541860817, + "flos": 26548762872960.0, + "grad_norm": 1.7791790627265829, + "language_loss": 0.82245278, + "learning_rate": 3.406273949573303e-06, + "loss": 0.90024734, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18334961, + "step": 4561, + "time_per_iteration": 4.059048652648926 + }, + { + "auxiliary_loss_clip": 0.06510133, + "auxiliary_loss_mlp": 0.01276094, + "balance_loss_clip": 0.06296331, + "balance_loss_mlp": 0.012564, + "epoch": 0.27428227867127614, + "flos": 23337868014720.0, + "grad_norm": 1.9098162884662422, + "language_loss": 0.75760031, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83546257, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19702148, + "step": 4562, + "time_per_iteration": 2.558397054672241 + }, + { + "auxiliary_loss_clip": 0.06506505, + "auxiliary_loss_mlp": 0.01277189, + "balance_loss_clip": 0.06293164, + "balance_loss_mlp": 0.01258092, + "epoch": 0.2743424019239441, + "flos": 23041074453120.0, + "grad_norm": 1.577834756327151, + "language_loss": 0.75198597, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.8298229, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19091797, + "step": 4563, + "time_per_iteration": 2.5698354244232178 + }, + { + "auxiliary_loss_clip": 0.06524341, + "auxiliary_loss_mlp": 0.01283879, + "balance_loss_clip": 0.06305183, + "balance_loss_mlp": 0.01262302, + "epoch": 0.27440252517661207, + "flos": 21987565868160.0, + "grad_norm": 2.0193615345580085, + "language_loss": 0.6348893, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.71297145, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21569824, + "step": 4564, + "time_per_iteration": 2.545741558074951 + }, + { + "auxiliary_loss_clip": 0.06513885, + "auxiliary_loss_mlp": 0.01280066, + "balance_loss_clip": 0.06299828, + "balance_loss_mlp": 0.01260647, + "epoch": 0.27446264842928003, + "flos": 40196952737280.0, + "grad_norm": 2.2005709679787153, + "language_loss": 0.7878077, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.86574721, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 4565, + "time_per_iteration": 2.7061169147491455 + }, + { + "auxiliary_loss_clip": 0.0650921, + "auxiliary_loss_mlp": 0.01277346, + "balance_loss_clip": 0.06296623, + "balance_loss_mlp": 0.01257903, + "epoch": 0.274522771681948, + "flos": 13484684620800.0, + "grad_norm": 1.9604173340299715, + "language_loss": 0.69729757, + "learning_rate": 3.404888640957477e-06, + "loss": 0.77516317, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19458008, + "step": 4566, + "time_per_iteration": 3.9156126976013184 + }, + { + "auxiliary_loss_clip": 0.06511474, + "auxiliary_loss_mlp": 0.0128161, + "balance_loss_clip": 0.06300822, + "balance_loss_mlp": 0.0126318, + "epoch": 0.27458289493461596, + "flos": 28629812476800.0, + "grad_norm": 1.605297231279352, + "language_loss": 0.61699307, + "learning_rate": 3.404611419371723e-06, + "loss": 0.69492388, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18432617, + "step": 4567, + "time_per_iteration": 2.5721306800842285 + }, + { + "auxiliary_loss_clip": 0.06514515, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06299441, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2746430181872839, + "flos": 20125883053440.0, + "grad_norm": 1.9422441687055725, + "language_loss": 0.83055782, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.90845764, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19970703, + "step": 4568, + "time_per_iteration": 2.5616700649261475 + }, + { + "auxiliary_loss_clip": 0.06521738, + "auxiliary_loss_mlp": 0.01275653, + "balance_loss_clip": 0.06304733, + "balance_loss_mlp": 0.01255709, + "epoch": 0.2747031414399519, + "flos": 20199662173440.0, + "grad_norm": 2.1285143693034367, + "language_loss": 0.6896143, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.76758814, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19934082, + "step": 4569, + "time_per_iteration": 2.531096935272217 + }, + { + "auxiliary_loss_clip": 0.06517979, + "auxiliary_loss_mlp": 0.01281496, + "balance_loss_clip": 0.06303072, + "balance_loss_mlp": 0.0126216, + "epoch": 0.27476326469261986, + "flos": 13521385509120.0, + "grad_norm": 2.4613635331126926, + "language_loss": 0.71897286, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.79696763, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19360352, + "step": 4570, + "time_per_iteration": 2.5235774517059326 + }, + { + "auxiliary_loss_clip": 0.06414898, + "auxiliary_loss_mlp": 0.01257276, + "balance_loss_clip": 0.06312878, + "balance_loss_mlp": 0.01253897, + "epoch": 0.2748233879452878, + "flos": 65955486153600.0, + "grad_norm": 0.6977768363268191, + "language_loss": 0.5577414, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.63446319, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03387451, + "step": 4571, + "time_per_iteration": 3.234433889389038 + }, + { + "auxiliary_loss_clip": 0.06526154, + "auxiliary_loss_mlp": 0.01279423, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01260326, + "epoch": 0.2748835111979558, + "flos": 17389961464320.0, + "grad_norm": 2.165338105639142, + "language_loss": 0.78105313, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.85910892, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19104004, + "step": 4572, + "time_per_iteration": 2.562450647354126 + }, + { + "auxiliary_loss_clip": 0.06506811, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01261711, + "epoch": 0.27494363445062375, + "flos": 23594480743680.0, + "grad_norm": 2.0912194071895014, + "language_loss": 0.81855798, + "learning_rate": 3.402946971702147e-06, + "loss": 0.89641118, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.16809082, + "step": 4573, + "time_per_iteration": 2.575467824935913 + }, + { + "auxiliary_loss_clip": 0.06512269, + "auxiliary_loss_mlp": 0.01277933, + "balance_loss_clip": 0.06303579, + "balance_loss_mlp": 0.01258585, + "epoch": 0.2750037577032918, + "flos": 17170175404800.0, + "grad_norm": 1.5550185346959569, + "language_loss": 0.79688454, + "learning_rate": 3.402669377496223e-06, + "loss": 0.87478662, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19360352, + "step": 4574, + "time_per_iteration": 2.522381067276001 + }, + { + "auxiliary_loss_clip": 0.06514049, + "auxiliary_loss_mlp": 0.012813, + "balance_loss_clip": 0.06300252, + "balance_loss_mlp": 0.01263383, + "epoch": 0.27506388095595974, + "flos": 24497663904000.0, + "grad_norm": 1.9638366231768782, + "language_loss": 0.75217533, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83012879, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.17907715, + "step": 4575, + "time_per_iteration": 2.564023971557617 + }, + { + "auxiliary_loss_clip": 0.06513455, + "auxiliary_loss_mlp": 0.01285217, + "balance_loss_clip": 0.06304657, + "balance_loss_mlp": 0.01267562, + "epoch": 0.2751240042086277, + "flos": 38774003500800.0, + "grad_norm": 1.5894976166299741, + "language_loss": 0.71788073, + "learning_rate": 3.402114029526814e-06, + "loss": 0.79586744, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17663574, + "step": 4576, + "time_per_iteration": 2.6856141090393066 + }, + { + "auxiliary_loss_clip": 0.06515673, + "auxiliary_loss_mlp": 0.01294199, + "balance_loss_clip": 0.06304252, + "balance_loss_mlp": 0.0127447, + "epoch": 0.27518412746129567, + "flos": 26914388163840.0, + "grad_norm": 1.693116107866749, + "language_loss": 0.73358452, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81168324, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19726562, + "step": 4577, + "time_per_iteration": 2.5795719623565674 + }, + { + "auxiliary_loss_clip": 0.06517484, + "auxiliary_loss_mlp": 0.01279945, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01260514, + "epoch": 0.27524425071396363, + "flos": 24907578877440.0, + "grad_norm": 1.9498672791378742, + "language_loss": 0.76234132, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84031564, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19433594, + "step": 4578, + "time_per_iteration": 2.5547378063201904 + }, + { + "auxiliary_loss_clip": 0.06518476, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01265255, + "epoch": 0.2753043739666316, + "flos": 26295504307200.0, + "grad_norm": 1.3718100748583155, + "language_loss": 0.66504484, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.74309289, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21069336, + "step": 4579, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.06516613, + "auxiliary_loss_mlp": 0.01291851, + "balance_loss_clip": 0.06301446, + "balance_loss_mlp": 0.01271753, + "epoch": 0.27536449721929956, + "flos": 24213616162560.0, + "grad_norm": 3.1986582184359853, + "language_loss": 0.80722374, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88530838, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2010498, + "step": 4580, + "time_per_iteration": 2.571364164352417 + }, + { + "auxiliary_loss_clip": 0.06513728, + "auxiliary_loss_mlp": 0.01285107, + "balance_loss_clip": 0.06304168, + "balance_loss_mlp": 0.01264305, + "epoch": 0.27542462047196753, + "flos": 19543448522880.0, + "grad_norm": 1.580662182314359, + "language_loss": 0.68234229, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76033062, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.20788574, + "step": 4581, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.06515522, + "auxiliary_loss_mlp": 0.01276377, + "balance_loss_clip": 0.06298342, + "balance_loss_mlp": 0.01258448, + "epoch": 0.2754847437246355, + "flos": 14324360785920.0, + "grad_norm": 1.5474830525473977, + "language_loss": 0.78408682, + "learning_rate": 3.400446709916392e-06, + "loss": 0.86200583, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17919922, + "step": 4582, + "time_per_iteration": 2.511134624481201 + }, + { + "auxiliary_loss_clip": 0.06505451, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06298563, + "balance_loss_mlp": 0.01266605, + "epoch": 0.27554486697730346, + "flos": 18843951438720.0, + "grad_norm": 1.627014419094476, + "language_loss": 0.84829235, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92618936, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17663574, + "step": 4583, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.065156, + "auxiliary_loss_mlp": 0.01295136, + "balance_loss_clip": 0.0629985, + "balance_loss_mlp": 0.01274799, + "epoch": 0.2756049902299714, + "flos": 22388801944320.0, + "grad_norm": 2.5216327683147104, + "language_loss": 0.67592049, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.75402784, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20349121, + "step": 4584, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.06508277, + "auxiliary_loss_mlp": 0.01286302, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01268385, + "epoch": 0.2756651134826394, + "flos": 19580107484160.0, + "grad_norm": 1.7056038485870715, + "language_loss": 0.77640843, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8543542, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17919922, + "step": 4585, + "time_per_iteration": 2.5581910610198975 + }, + { + "auxiliary_loss_clip": 0.06520131, + "auxiliary_loss_mlp": 0.01290999, + "balance_loss_clip": 0.06302814, + "balance_loss_mlp": 0.01271151, + "epoch": 0.27572523673530736, + "flos": 23593306786560.0, + "grad_norm": 1.6012607614221503, + "language_loss": 0.72652835, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8046397, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.1986084, + "step": 4586, + "time_per_iteration": 2.5581955909729004 + }, + { + "auxiliary_loss_clip": 0.06512299, + "auxiliary_loss_mlp": 0.01283131, + "balance_loss_clip": 0.06300563, + "balance_loss_mlp": 0.01264475, + "epoch": 0.2757853599879754, + "flos": 22826696981760.0, + "grad_norm": 1.4211606049909042, + "language_loss": 0.8102116, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88816595, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18664551, + "step": 4587, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.0651072, + "auxiliary_loss_mlp": 0.01292397, + "balance_loss_clip": 0.06300361, + "balance_loss_mlp": 0.01273037, + "epoch": 0.27584548324064334, + "flos": 18557639637120.0, + "grad_norm": 2.3677019636161716, + "language_loss": 0.83699477, + "learning_rate": 3.398777478523316e-06, + "loss": 0.91502589, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.19348145, + "step": 4588, + "time_per_iteration": 2.5100526809692383 + }, + { + "auxiliary_loss_clip": 0.06502403, + "auxiliary_loss_mlp": 0.01287014, + "balance_loss_clip": 0.06294176, + "balance_loss_mlp": 0.0126856, + "epoch": 0.2759056064933113, + "flos": 23776811228160.0, + "grad_norm": 1.8520309888563375, + "language_loss": 0.76066566, + "learning_rate": 3.398499087583342e-06, + "loss": 0.83855987, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.18457031, + "step": 4589, + "time_per_iteration": 2.5906028747558594 + }, + { + "auxiliary_loss_clip": 0.06503198, + "auxiliary_loss_mlp": 0.01281135, + "balance_loss_clip": 0.06293473, + "balance_loss_mlp": 0.01261703, + "epoch": 0.27596572974597927, + "flos": 24289114291200.0, + "grad_norm": 1.7619688929899446, + "language_loss": 0.88857687, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96642017, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19421387, + "step": 4590, + "time_per_iteration": 2.5526933670043945 + }, + { + "auxiliary_loss_clip": 0.0650104, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06291595, + "balance_loss_mlp": 0.01261041, + "epoch": 0.27602585299864724, + "flos": 35049296206080.0, + "grad_norm": 1.573202994920717, + "language_loss": 0.71835011, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79615998, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.18908691, + "step": 4591, + "time_per_iteration": 2.659573554992676 + }, + { + "auxiliary_loss_clip": 0.06502488, + "auxiliary_loss_mlp": 0.01277501, + "balance_loss_clip": 0.06290874, + "balance_loss_mlp": 0.01258964, + "epoch": 0.2760859762513152, + "flos": 24315123784320.0, + "grad_norm": 2.0980893762293866, + "language_loss": 0.80327255, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.8810724, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.18530273, + "step": 4592, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06302959, + "balance_loss_mlp": 0.0126841, + "epoch": 0.27614609950398317, + "flos": 71279435675520.0, + "grad_norm": 0.6848268802880488, + "language_loss": 0.6162945, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.69306767, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03717041, + "step": 4593, + "time_per_iteration": 3.127192735671997 + }, + { + "auxiliary_loss_clip": 0.06506699, + "auxiliary_loss_mlp": 0.01276217, + "balance_loss_clip": 0.0629646, + "balance_loss_mlp": 0.01256881, + "epoch": 0.27620622275665113, + "flos": 29681811688320.0, + "grad_norm": 2.6081053554454363, + "language_loss": 0.77380788, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.85163713, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1932373, + "step": 4594, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06503148, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06295307, + "balance_loss_mlp": 0.01255138, + "epoch": 0.2762663460093191, + "flos": 15383571448320.0, + "grad_norm": 1.4453472339612206, + "language_loss": 0.9229176, + "learning_rate": 3.3968276286573866e-06, + "loss": 1.00068069, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18029785, + "step": 4595, + "time_per_iteration": 3.9466536045074463 + }, + { + "auxiliary_loss_clip": 0.06509015, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.06294905, + "balance_loss_mlp": 0.01261592, + "epoch": 0.27632646926198706, + "flos": 20710330081920.0, + "grad_norm": 1.8151181533722092, + "language_loss": 0.69491673, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.77282476, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2019043, + "step": 4596, + "time_per_iteration": 2.552893877029419 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.0629788, + "balance_loss_mlp": 0.0125382, + "epoch": 0.276386592514655, + "flos": 32820981851520.0, + "grad_norm": 1.6734752779014743, + "language_loss": 0.64091378, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.71881258, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.18652344, + "step": 4597, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.06500123, + "auxiliary_loss_mlp": 0.01279427, + "balance_loss_clip": 0.0629456, + "balance_loss_mlp": 0.01260616, + "epoch": 0.276446715767323, + "flos": 18557639637120.0, + "grad_norm": 1.8925825739150304, + "language_loss": 0.86690855, + "learning_rate": 3.395991183985887e-06, + "loss": 0.94470406, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18835449, + "step": 4598, + "time_per_iteration": 2.5411598682403564 + }, + { + "auxiliary_loss_clip": 0.0650408, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.06291056, + "balance_loss_mlp": 0.01256554, + "epoch": 0.27650683901999096, + "flos": 22826110003200.0, + "grad_norm": 2.378506410601605, + "language_loss": 0.79588032, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8736738, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18725586, + "step": 4599, + "time_per_iteration": 2.515411138534546 + }, + { + "auxiliary_loss_clip": 0.06518425, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06301137, + "balance_loss_mlp": 0.01259756, + "epoch": 0.276566962272659, + "flos": 21368011178880.0, + "grad_norm": 2.1602669865212487, + "language_loss": 0.80043805, + "learning_rate": 3.395433289506639e-06, + "loss": 0.87841463, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.19482422, + "step": 4600, + "time_per_iteration": 5.317862033843994 + }, + { + "auxiliary_loss_clip": 0.06511359, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06296661, + "balance_loss_mlp": 0.01258843, + "epoch": 0.27662708552532694, + "flos": 17716076755200.0, + "grad_norm": 12.932121146702709, + "language_loss": 0.73461431, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.81249541, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.17907715, + "step": 4601, + "time_per_iteration": 2.5192854404449463 + }, + { + "auxiliary_loss_clip": 0.0650773, + "auxiliary_loss_mlp": 0.01282643, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01263676, + "epoch": 0.2766872087779949, + "flos": 21259292106240.0, + "grad_norm": 1.833059055741047, + "language_loss": 0.8051585, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.88306224, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18981934, + "step": 4602, + "time_per_iteration": 2.635265350341797 + }, + { + "auxiliary_loss_clip": 0.06517955, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06297721, + "balance_loss_mlp": 0.01259749, + "epoch": 0.2767473320306629, + "flos": 12936728845440.0, + "grad_norm": 2.082735068257359, + "language_loss": 0.7691201, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.8470962, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.19921875, + "step": 4603, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.06506386, + "auxiliary_loss_mlp": 0.01276601, + "balance_loss_clip": 0.06300791, + "balance_loss_mlp": 0.01259017, + "epoch": 0.27680745528333084, + "flos": 15018239646720.0, + "grad_norm": 1.5173997695974415, + "language_loss": 0.81704807, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89487797, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17578125, + "step": 4604, + "time_per_iteration": 2.5022366046905518 + }, + { + "auxiliary_loss_clip": 0.06510165, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 0.06295862, + "balance_loss_mlp": 0.01261367, + "epoch": 0.2768675785359988, + "flos": 22644408424320.0, + "grad_norm": 1.8407701121062605, + "language_loss": 0.70736969, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.78526795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.18310547, + "step": 4605, + "time_per_iteration": 4.068409442901611 + }, + { + "auxiliary_loss_clip": 0.06402105, + "auxiliary_loss_mlp": 0.01269906, + "balance_loss_clip": 0.0629937, + "balance_loss_mlp": 0.01266097, + "epoch": 0.27692770178866677, + "flos": 66150772093440.0, + "grad_norm": 0.7075303746126435, + "language_loss": 0.57218695, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.64890707, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.0380249, + "step": 4606, + "time_per_iteration": 3.269275426864624 + }, + { + "auxiliary_loss_clip": 0.06516754, + "auxiliary_loss_mlp": 0.01286288, + "balance_loss_clip": 0.06299627, + "balance_loss_mlp": 0.01266118, + "epoch": 0.27698782504133473, + "flos": 26471545735680.0, + "grad_norm": 1.9632725808751148, + "language_loss": 0.69427574, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77230614, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20153809, + "step": 4607, + "time_per_iteration": 2.566908836364746 + }, + { + "auxiliary_loss_clip": 0.06512889, + "auxiliary_loss_mlp": 0.01276778, + "balance_loss_clip": 0.06304939, + "balance_loss_mlp": 0.01258849, + "epoch": 0.2770479482940027, + "flos": 25891878389760.0, + "grad_norm": 1.6636880421304368, + "language_loss": 0.70338356, + "learning_rate": 3.393199595837555e-06, + "loss": 0.78128028, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17919922, + "step": 4608, + "time_per_iteration": 2.709989309310913 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 0.06298438, + "balance_loss_mlp": 0.01260781, + "epoch": 0.27710807154667066, + "flos": 22863942921600.0, + "grad_norm": 1.8326330841759049, + "language_loss": 0.73323762, + "learning_rate": 3.392920146281499e-06, + "loss": 0.81118023, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.18725586, + "step": 4609, + "time_per_iteration": 2.530625581741333 + }, + { + "auxiliary_loss_clip": 0.06522895, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.0125749, + "epoch": 0.27716819479933863, + "flos": 17716621806720.0, + "grad_norm": 2.1915868475112714, + "language_loss": 0.84688777, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92488557, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19396973, + "step": 4610, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.06521606, + "auxiliary_loss_mlp": 0.01280928, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260054, + "epoch": 0.2772283180520066, + "flos": 19652125668480.0, + "grad_norm": 1.9738462991775114, + "language_loss": 0.69718874, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.77521408, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20874023, + "step": 4611, + "time_per_iteration": 2.5499660968780518 + }, + { + "auxiliary_loss_clip": 0.0651576, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06309414, + "balance_loss_mlp": 0.01254997, + "epoch": 0.27728844130467456, + "flos": 21038960995200.0, + "grad_norm": 1.8677227151172762, + "language_loss": 0.74507141, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82296044, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18151855, + "step": 4612, + "time_per_iteration": 2.567218065261841 + }, + { + "auxiliary_loss_clip": 0.06522087, + "auxiliary_loss_mlp": 0.01282319, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01263067, + "epoch": 0.2773485645573425, + "flos": 18995157331200.0, + "grad_norm": 2.3882423035535063, + "language_loss": 0.67084455, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.19250488, + "step": 4613, + "time_per_iteration": 2.5458126068115234 + }, + { + "auxiliary_loss_clip": 0.06515062, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06304698, + "balance_loss_mlp": 0.0125577, + "epoch": 0.27740868781001055, + "flos": 21474508118400.0, + "grad_norm": 1.6100748666203144, + "language_loss": 0.79936564, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87727129, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19750977, + "step": 4614, + "time_per_iteration": 2.5586962699890137 + }, + { + "auxiliary_loss_clip": 0.06520429, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01257884, + "epoch": 0.2774688110626785, + "flos": 19833827247360.0, + "grad_norm": 2.249482091575283, + "language_loss": 0.81082475, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.88881981, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21179199, + "step": 4615, + "time_per_iteration": 2.5192136764526367 + }, + { + "auxiliary_loss_clip": 0.0652476, + "auxiliary_loss_mlp": 0.0127518, + "balance_loss_clip": 0.06306368, + "balance_loss_mlp": 0.01256655, + "epoch": 0.2775289343153465, + "flos": 18220916805120.0, + "grad_norm": 2.6879454427381715, + "language_loss": 0.64382082, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.72182024, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.18518066, + "step": 4616, + "time_per_iteration": 2.528766393661499 + }, + { + "auxiliary_loss_clip": 0.06523173, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 0.06308753, + "balance_loss_mlp": 0.0126377, + "epoch": 0.27758905756801444, + "flos": 16478141333760.0, + "grad_norm": 2.0768832102625296, + "language_loss": 0.82857239, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.90664852, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.20678711, + "step": 4617, + "time_per_iteration": 2.5130555629730225 + }, + { + "auxiliary_loss_clip": 0.06522305, + "auxiliary_loss_mlp": 0.01278739, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01260059, + "epoch": 0.2776491808206824, + "flos": 18733219868160.0, + "grad_norm": 2.583119020836192, + "language_loss": 0.77338278, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85139322, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18676758, + "step": 4618, + "time_per_iteration": 2.5491156578063965 + }, + { + "auxiliary_loss_clip": 0.06524394, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 0.06309742, + "balance_loss_mlp": 0.01260191, + "epoch": 0.27770930407335037, + "flos": 28045742791680.0, + "grad_norm": 1.764934716544716, + "language_loss": 0.85733759, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93535626, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.17297363, + "step": 4619, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.06514929, + "auxiliary_loss_mlp": 0.01285121, + "balance_loss_clip": 0.06308962, + "balance_loss_mlp": 0.01266798, + "epoch": 0.27776942732601834, + "flos": 23556522044160.0, + "grad_norm": 1.4813387132666624, + "language_loss": 0.77092409, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84892452, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18322754, + "step": 4620, + "time_per_iteration": 2.690934658050537 + }, + { + "auxiliary_loss_clip": 0.0651743, + "auxiliary_loss_mlp": 0.01277569, + "balance_loss_clip": 0.06309397, + "balance_loss_mlp": 0.0125821, + "epoch": 0.2778295505786863, + "flos": 23914474686720.0, + "grad_norm": 1.8907472710416175, + "language_loss": 0.78585863, + "learning_rate": 3.389562634707122e-06, + "loss": 0.86380863, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.19360352, + "step": 4621, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.06522836, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.0630835, + "balance_loss_mlp": 0.01259701, + "epoch": 0.27788967383135427, + "flos": 25561276905600.0, + "grad_norm": 2.170367430288875, + "language_loss": 0.88217753, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96019584, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.1932373, + "step": 4622, + "time_per_iteration": 2.6036407947540283 + }, + { + "auxiliary_loss_clip": 0.06512653, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06299745, + "balance_loss_mlp": 0.01254919, + "epoch": 0.27794979708402223, + "flos": 16258103712000.0, + "grad_norm": 2.5896700244630018, + "language_loss": 0.81515396, + "learning_rate": 3.389002311256369e-06, + "loss": 0.89301395, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18432617, + "step": 4623, + "time_per_iteration": 2.539655923843384 + }, + { + "auxiliary_loss_clip": 0.06518189, + "auxiliary_loss_mlp": 0.01278229, + "balance_loss_clip": 0.06306686, + "balance_loss_mlp": 0.01258941, + "epoch": 0.2780099203366902, + "flos": 20673880755840.0, + "grad_norm": 1.9609752985345037, + "language_loss": 0.82099682, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.89896095, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.19274902, + "step": 4624, + "time_per_iteration": 2.5662107467651367 + }, + { + "auxiliary_loss_clip": 0.06512089, + "auxiliary_loss_mlp": 0.01276338, + "balance_loss_clip": 0.06303106, + "balance_loss_mlp": 0.01258004, + "epoch": 0.27807004358935816, + "flos": 17743805256960.0, + "grad_norm": 3.013190567677447, + "language_loss": 0.77269506, + "learning_rate": 3.388441777121191e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.18322754, + "step": 4625, + "time_per_iteration": 2.5685927867889404 + }, + { + "auxiliary_loss_clip": 0.06507699, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06299223, + "balance_loss_mlp": 0.01253658, + "epoch": 0.2781301668420261, + "flos": 16732699637760.0, + "grad_norm": 1.9769276375727096, + "language_loss": 0.70884871, + "learning_rate": 3.388161431073511e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17883301, + "step": 4626, + "time_per_iteration": 2.527975559234619 + }, + { + "auxiliary_loss_clip": 0.06520554, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06304689, + "balance_loss_mlp": 0.01254798, + "epoch": 0.27819029009469415, + "flos": 13849848714240.0, + "grad_norm": 2.4481240639566013, + "language_loss": 0.93016249, + "learning_rate": 3.38788103238661e-06, + "loss": 1.00810015, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.18432617, + "step": 4627, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.06514014, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06298277, + "balance_loss_mlp": 0.01258364, + "epoch": 0.2782504133473621, + "flos": 27096634794240.0, + "grad_norm": 1.6603793888564844, + "language_loss": 0.85558021, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93348801, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1842041, + "step": 4628, + "time_per_iteration": 2.56680965423584 + }, + { + "auxiliary_loss_clip": 0.06511193, + "auxiliary_loss_mlp": 0.01275379, + "balance_loss_clip": 0.06301076, + "balance_loss_mlp": 0.01257569, + "epoch": 0.2783105366000301, + "flos": 21075116832000.0, + "grad_norm": 1.7183700627805243, + "language_loss": 0.79370463, + "learning_rate": 3.387320077137679e-06, + "loss": 0.87157035, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17810059, + "step": 4629, + "time_per_iteration": 2.579024076461792 + }, + { + "auxiliary_loss_clip": 0.06504764, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06300465, + "balance_loss_mlp": 0.01259699, + "epoch": 0.27837065985269804, + "flos": 26508456259200.0, + "grad_norm": 2.4632649346037856, + "language_loss": 0.84664094, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.92446071, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17529297, + "step": 4630, + "time_per_iteration": 2.568190336227417 + }, + { + "auxiliary_loss_clip": 0.06516108, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 0.06302783, + "balance_loss_mlp": 0.01253395, + "epoch": 0.278430783105366, + "flos": 20228271143040.0, + "grad_norm": 1.8872458968592738, + "language_loss": 0.80858278, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8864556, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17773438, + "step": 4631, + "time_per_iteration": 2.5658912658691406 + }, + { + "auxiliary_loss_clip": 0.06512441, + "auxiliary_loss_mlp": 0.01275522, + "balance_loss_clip": 0.06299636, + "balance_loss_mlp": 0.01256866, + "epoch": 0.278490906358034, + "flos": 25599906437760.0, + "grad_norm": 2.407277572133289, + "language_loss": 0.715128, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.79300761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18652344, + "step": 4632, + "time_per_iteration": 2.620729446411133 + }, + { + "auxiliary_loss_clip": 0.06502309, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296511, + "balance_loss_mlp": 0.01253502, + "epoch": 0.27855102961070194, + "flos": 16175645694720.0, + "grad_norm": 1.8302171024684264, + "language_loss": 0.82394838, + "learning_rate": 3.386197535437145e-06, + "loss": 0.9016794, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17297363, + "step": 4633, + "time_per_iteration": 2.513705015182495 + }, + { + "auxiliary_loss_clip": 0.06511516, + "auxiliary_loss_mlp": 0.01278904, + "balance_loss_clip": 0.06299913, + "balance_loss_mlp": 0.012597, + "epoch": 0.2786111528633699, + "flos": 22933864753920.0, + "grad_norm": 1.5843012688553681, + "language_loss": 0.8872478, + "learning_rate": 3.385916768573529e-06, + "loss": 0.96515197, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19213867, + "step": 4634, + "time_per_iteration": 2.5471088886260986 + }, + { + "auxiliary_loss_clip": 0.06514788, + "auxiliary_loss_mlp": 0.01276007, + "balance_loss_clip": 0.06301814, + "balance_loss_mlp": 0.01256588, + "epoch": 0.27867127611603787, + "flos": 23410934375040.0, + "grad_norm": 1.5369483246730489, + "language_loss": 0.77466059, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85256851, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19433594, + "step": 4635, + "time_per_iteration": 3.9016311168670654 + }, + { + "auxiliary_loss_clip": 0.06508552, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06295648, + "balance_loss_mlp": 0.01254859, + "epoch": 0.27873139936870583, + "flos": 19835210839680.0, + "grad_norm": 1.7801998538005617, + "language_loss": 0.66571766, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74353385, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18188477, + "step": 4636, + "time_per_iteration": 2.5264599323272705 + }, + { + "auxiliary_loss_clip": 0.06519878, + "auxiliary_loss_mlp": 0.01275894, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01256392, + "epoch": 0.2787915226213738, + "flos": 17712638737920.0, + "grad_norm": 2.933733922484583, + "language_loss": 0.83255613, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.91051382, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19506836, + "step": 4637, + "time_per_iteration": 2.5344014167785645 + }, + { + "auxiliary_loss_clip": 0.06505676, + "auxiliary_loss_mlp": 0.01276787, + "balance_loss_clip": 0.06297021, + "balance_loss_mlp": 0.01258918, + "epoch": 0.27885164587404176, + "flos": 22097039627520.0, + "grad_norm": 1.4932909871395708, + "language_loss": 0.76038569, + "learning_rate": 3.384793175684533e-06, + "loss": 0.83821034, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17871094, + "step": 4638, + "time_per_iteration": 2.544187068939209 + }, + { + "auxiliary_loss_clip": 0.06510019, + "auxiliary_loss_mlp": 0.01280274, + "balance_loss_clip": 0.06297282, + "balance_loss_mlp": 0.01262511, + "epoch": 0.27891176912670973, + "flos": 19213601725440.0, + "grad_norm": 2.235877812045319, + "language_loss": 0.72492748, + "learning_rate": 3.38451214615691e-06, + "loss": 0.8028304, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17749023, + "step": 4639, + "time_per_iteration": 4.002680063247681 + }, + { + "auxiliary_loss_clip": 0.06515414, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.01254813, + "epoch": 0.27897189237937775, + "flos": 27607428483840.0, + "grad_norm": 1.8877142592522154, + "language_loss": 0.66217673, + "learning_rate": 3.384231064128447e-06, + "loss": 0.74006808, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.18896484, + "step": 4640, + "time_per_iteration": 4.054874420166016 + }, + { + "auxiliary_loss_clip": 0.0651349, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06301108, + "balance_loss_mlp": 0.01254654, + "epoch": 0.2790320156320457, + "flos": 21184506737280.0, + "grad_norm": 2.077527470737851, + "language_loss": 0.72818768, + "learning_rate": 3.383949929609804e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.1796875, + "step": 4641, + "time_per_iteration": 2.566758155822754 + }, + { + "auxiliary_loss_clip": 0.06517549, + "auxiliary_loss_mlp": 0.01276062, + "balance_loss_clip": 0.06298883, + "balance_loss_mlp": 0.01256488, + "epoch": 0.2790921388847137, + "flos": 22790541144960.0, + "grad_norm": 1.8548696214163785, + "language_loss": 0.75277239, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8307085, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19567871, + "step": 4642, + "time_per_iteration": 2.5531389713287354 + }, + { + "auxiliary_loss_clip": 0.0651103, + "auxiliary_loss_mlp": 0.01281312, + "balance_loss_clip": 0.06296819, + "balance_loss_mlp": 0.01261631, + "epoch": 0.27915226213738165, + "flos": 23406783598080.0, + "grad_norm": 1.8301300365045747, + "language_loss": 0.85787475, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93579817, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19689941, + "step": 4643, + "time_per_iteration": 2.561692714691162 + }, + { + "auxiliary_loss_clip": 0.06505755, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06292956, + "balance_loss_mlp": 0.01262572, + "epoch": 0.2792123853900496, + "flos": 22754469162240.0, + "grad_norm": 2.128449816262669, + "language_loss": 0.83027583, + "learning_rate": 3.383106211219407e-06, + "loss": 0.9081434, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1842041, + "step": 4644, + "time_per_iteration": 2.5298962593078613 + }, + { + "auxiliary_loss_clip": 0.06505448, + "auxiliary_loss_mlp": 0.01273805, + "balance_loss_clip": 0.0629155, + "balance_loss_mlp": 0.01256174, + "epoch": 0.2792725086427176, + "flos": 15054772826880.0, + "grad_norm": 1.7497246062339578, + "language_loss": 0.79546082, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.87325335, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.17626953, + "step": 4645, + "time_per_iteration": 3.9172677993774414 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01254208, + "balance_loss_clip": 0.0631457, + "balance_loss_mlp": 0.0125017, + "epoch": 0.27933263189538554, + "flos": 62562805862400.0, + "grad_norm": 0.7707831229317741, + "language_loss": 0.62136066, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.6980933, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04037476, + "step": 4646, + "time_per_iteration": 3.1527390480041504 + }, + { + "auxiliary_loss_clip": 0.06500821, + "auxiliary_loss_mlp": 0.01275319, + "balance_loss_clip": 0.0629313, + "balance_loss_mlp": 0.01257581, + "epoch": 0.2793927551480535, + "flos": 25125268584960.0, + "grad_norm": 1.6018723981737446, + "language_loss": 0.89582062, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.97358203, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17736816, + "step": 4647, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06509704, + "auxiliary_loss_mlp": 0.01277108, + "balance_loss_clip": 0.06292088, + "balance_loss_mlp": 0.01258142, + "epoch": 0.27945287840072147, + "flos": 21330974874240.0, + "grad_norm": 1.6381839497334347, + "language_loss": 0.87525821, + "learning_rate": 3.381980519149988e-06, + "loss": 0.95312631, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.1895752, + "step": 4648, + "time_per_iteration": 2.5516953468322754 + }, + { + "auxiliary_loss_clip": 0.06507549, + "auxiliary_loss_mlp": 0.01274847, + "balance_loss_clip": 0.06291072, + "balance_loss_mlp": 0.01256643, + "epoch": 0.27951300165338944, + "flos": 27457354621440.0, + "grad_norm": 2.652634800411286, + "language_loss": 0.73020303, + "learning_rate": 3.38169896509385e-06, + "loss": 0.80802703, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18212891, + "step": 4649, + "time_per_iteration": 2.5767719745635986 + }, + { + "auxiliary_loss_clip": 0.06508242, + "auxiliary_loss_mlp": 0.01277361, + "balance_loss_clip": 0.0629622, + "balance_loss_mlp": 0.01259003, + "epoch": 0.2795731249060574, + "flos": 15164456221440.0, + "grad_norm": 2.110277953429804, + "language_loss": 0.81314564, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8910017, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18347168, + "step": 4650, + "time_per_iteration": 2.663588285446167 + }, + { + "auxiliary_loss_clip": 0.06406052, + "auxiliary_loss_mlp": 0.01252705, + "balance_loss_clip": 0.06303374, + "balance_loss_mlp": 0.01248944, + "epoch": 0.27963324815872537, + "flos": 60140951775360.0, + "grad_norm": 0.800089640521837, + "language_loss": 0.5874877, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.66407531, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03753662, + "step": 4651, + "time_per_iteration": 3.205563545227051 + }, + { + "auxiliary_loss_clip": 0.06513405, + "auxiliary_loss_mlp": 0.01276159, + "balance_loss_clip": 0.06293929, + "balance_loss_mlp": 0.01257205, + "epoch": 0.27969337141139333, + "flos": 21773020688640.0, + "grad_norm": 1.70848848544609, + "language_loss": 0.74928713, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82718277, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18945312, + "step": 4652, + "time_per_iteration": 2.620284080505371 + }, + { + "auxiliary_loss_clip": 0.06513481, + "auxiliary_loss_mlp": 0.01277362, + "balance_loss_clip": 0.06297033, + "balance_loss_mlp": 0.01259517, + "epoch": 0.27975349466406135, + "flos": 39859559072640.0, + "grad_norm": 2.257859492249039, + "language_loss": 0.81193566, + "learning_rate": 3.380572225034461e-06, + "loss": 0.88984406, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.17834473, + "step": 4653, + "time_per_iteration": 2.6902103424072266 + }, + { + "auxiliary_loss_clip": 0.06505801, + "auxiliary_loss_mlp": 0.01275903, + "balance_loss_clip": 0.06293398, + "balance_loss_mlp": 0.01257939, + "epoch": 0.2798136179167293, + "flos": 21586204010880.0, + "grad_norm": 2.2005279612587647, + "language_loss": 0.78939915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.86721623, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17956543, + "step": 4654, + "time_per_iteration": 2.5862321853637695 + }, + { + "auxiliary_loss_clip": 0.06514826, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06294681, + "balance_loss_mlp": 0.01256457, + "epoch": 0.2798737411693973, + "flos": 21543130212480.0, + "grad_norm": 2.786817882874951, + "language_loss": 0.81491858, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.89283288, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20153809, + "step": 4655, + "time_per_iteration": 2.5335962772369385 + }, + { + "auxiliary_loss_clip": 0.06503223, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06287771, + "balance_loss_mlp": 0.0125778, + "epoch": 0.27993386442206525, + "flos": 26988586554240.0, + "grad_norm": 1.7572759264995625, + "language_loss": 0.82015479, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.89795309, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4656, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0650457, + "auxiliary_loss_mlp": 0.01280726, + "balance_loss_clip": 0.06291523, + "balance_loss_mlp": 0.01261319, + "epoch": 0.2799939876747332, + "flos": 24356268938880.0, + "grad_norm": 1.602501989097996, + "language_loss": 0.83292782, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.91078079, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19396973, + "step": 4657, + "time_per_iteration": 2.546698808670044 + }, + { + "auxiliary_loss_clip": 0.06501682, + "auxiliary_loss_mlp": 0.01283943, + "balance_loss_clip": 0.06287715, + "balance_loss_mlp": 0.01265847, + "epoch": 0.2800541109274012, + "flos": 33665479626240.0, + "grad_norm": 2.056920585114217, + "language_loss": 0.64474404, + "learning_rate": 3.379162622133105e-06, + "loss": 0.72260022, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18103027, + "step": 4658, + "time_per_iteration": 2.633352041244507 + }, + { + "auxiliary_loss_clip": 0.0650496, + "auxiliary_loss_mlp": 0.01278289, + "balance_loss_clip": 0.06292152, + "balance_loss_mlp": 0.01258298, + "epoch": 0.28011423418006914, + "flos": 21620515057920.0, + "grad_norm": 1.9139831777919125, + "language_loss": 0.78200769, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.85984015, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19995117, + "step": 4659, + "time_per_iteration": 2.5146000385284424 + }, + { + "auxiliary_loss_clip": 0.06512548, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 0.06298335, + "balance_loss_mlp": 0.01260582, + "epoch": 0.2801743574327371, + "flos": 23119130131200.0, + "grad_norm": 1.8180566150817747, + "language_loss": 0.79711032, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.87503254, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.1907959, + "step": 4660, + "time_per_iteration": 2.5558273792266846 + }, + { + "auxiliary_loss_clip": 0.06502102, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06293646, + "balance_loss_mlp": 0.01257732, + "epoch": 0.2802344806854051, + "flos": 12646433975040.0, + "grad_norm": 2.0195446081970685, + "language_loss": 0.8127892, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.89057004, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18237305, + "step": 4661, + "time_per_iteration": 2.475562572479248 + }, + { + "auxiliary_loss_clip": 0.06508808, + "auxiliary_loss_mlp": 0.01277709, + "balance_loss_clip": 0.06296618, + "balance_loss_mlp": 0.01258898, + "epoch": 0.28029460393807304, + "flos": 37276772019840.0, + "grad_norm": 2.0240330571158904, + "language_loss": 0.79226935, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87013447, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18823242, + "step": 4662, + "time_per_iteration": 2.6644277572631836 + }, + { + "auxiliary_loss_clip": 0.06515819, + "auxiliary_loss_mlp": 0.01277387, + "balance_loss_clip": 0.06296565, + "balance_loss_mlp": 0.01258349, + "epoch": 0.280354727190741, + "flos": 20747450240640.0, + "grad_norm": 1.722651872041065, + "language_loss": 0.70744783, + "learning_rate": 3.377751711782227e-06, + "loss": 0.78537989, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.19042969, + "step": 4663, + "time_per_iteration": 2.5365068912506104 + }, + { + "auxiliary_loss_clip": 0.06510712, + "auxiliary_loss_mlp": 0.01280818, + "balance_loss_clip": 0.06293653, + "balance_loss_mlp": 0.01259312, + "epoch": 0.28041485044340897, + "flos": 21477526865280.0, + "grad_norm": 1.8007469711633386, + "language_loss": 0.77919745, + "learning_rate": 3.377469372935791e-06, + "loss": 0.85711277, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.21520996, + "step": 4664, + "time_per_iteration": 2.578552484512329 + }, + { + "auxiliary_loss_clip": 0.06500383, + "auxiliary_loss_mlp": 0.01277041, + "balance_loss_clip": 0.06293675, + "balance_loss_mlp": 0.01259374, + "epoch": 0.28047497369607693, + "flos": 14799669471360.0, + "grad_norm": 1.9758280924180103, + "language_loss": 0.80386382, + "learning_rate": 3.377186981855578e-06, + "loss": 0.88163805, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17675781, + "step": 4665, + "time_per_iteration": 2.5088212490081787 + }, + { + "auxiliary_loss_clip": 0.06506059, + "auxiliary_loss_mlp": 0.01274647, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01257397, + "epoch": 0.2805350969487449, + "flos": 23076559457280.0, + "grad_norm": 2.052054159073397, + "language_loss": 0.81109238, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.88889945, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17236328, + "step": 4666, + "time_per_iteration": 2.5765438079833984 + }, + { + "auxiliary_loss_clip": 0.06505027, + "auxiliary_loss_mlp": 0.01282246, + "balance_loss_clip": 0.0629367, + "balance_loss_mlp": 0.01263149, + "epoch": 0.2805952202014129, + "flos": 20485177361280.0, + "grad_norm": 2.1346617464039395, + "language_loss": 0.84940714, + "learning_rate": 3.376622043036658e-06, + "loss": 0.92727995, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19104004, + "step": 4667, + "time_per_iteration": 2.536466360092163 + }, + { + "auxiliary_loss_clip": 0.06510031, + "auxiliary_loss_mlp": 0.01284991, + "balance_loss_clip": 0.0629562, + "balance_loss_mlp": 0.0126581, + "epoch": 0.2806553434540809, + "flos": 27424678728960.0, + "grad_norm": 1.8168022919289022, + "language_loss": 0.80077279, + "learning_rate": 3.376339495319373e-06, + "loss": 0.87872303, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.19177246, + "step": 4668, + "time_per_iteration": 2.620793581008911 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01279574, + "balance_loss_clip": 0.06290744, + "balance_loss_mlp": 0.0126124, + "epoch": 0.28071546670674885, + "flos": 26512187765760.0, + "grad_norm": 1.3575587104794173, + "language_loss": 0.76748574, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.84536183, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18334961, + "step": 4669, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01281258, + "balance_loss_clip": 0.06298456, + "balance_loss_mlp": 0.01263376, + "epoch": 0.2807755899594168, + "flos": 20564993975040.0, + "grad_norm": 1.8976620486576934, + "language_loss": 0.79953671, + "learning_rate": 3.375774243322725e-06, + "loss": 0.87746012, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17883301, + "step": 4670, + "time_per_iteration": 2.630960702896118 + }, + { + "auxiliary_loss_clip": 0.06512859, + "auxiliary_loss_mlp": 0.0128758, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.0126859, + "epoch": 0.2808357132120848, + "flos": 24319693831680.0, + "grad_norm": 2.1242803821214915, + "language_loss": 0.79548872, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.87349308, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 4671, + "time_per_iteration": 2.5943963527679443 + }, + { + "auxiliary_loss_clip": 0.06499608, + "auxiliary_loss_mlp": 0.01282791, + "balance_loss_clip": 0.06293108, + "balance_loss_mlp": 0.01265124, + "epoch": 0.28089583646475275, + "flos": 26439624529920.0, + "grad_norm": 1.773606658736433, + "language_loss": 0.75789028, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83571434, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17663574, + "step": 4672, + "time_per_iteration": 2.5819919109344482 + }, + { + "auxiliary_loss_clip": 0.06515782, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06299746, + "balance_loss_mlp": 0.01260412, + "epoch": 0.2809559597174207, + "flos": 23118417371520.0, + "grad_norm": 2.723902952009536, + "language_loss": 0.76012361, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.83808959, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20410156, + "step": 4673, + "time_per_iteration": 2.579460859298706 + }, + { + "auxiliary_loss_clip": 0.06510463, + "auxiliary_loss_mlp": 0.01285315, + "balance_loss_clip": 0.06297876, + "balance_loss_mlp": 0.0126704, + "epoch": 0.2810160829700887, + "flos": 20929864579200.0, + "grad_norm": 1.8153863613356214, + "language_loss": 0.72824192, + "learning_rate": 3.374643113381237e-06, + "loss": 0.80619967, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18261719, + "step": 4674, + "time_per_iteration": 4.0586278438568115 + }, + { + "auxiliary_loss_clip": 0.06522093, + "auxiliary_loss_mlp": 0.01283708, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.0126405, + "epoch": 0.28107620622275664, + "flos": 14361145528320.0, + "grad_norm": 1.8954321480679195, + "language_loss": 0.77875817, + "learning_rate": 3.374360200552541e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1965332, + "step": 4675, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.06512761, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06296991, + "balance_loss_mlp": 0.01269531, + "epoch": 0.2811363294754246, + "flos": 20924707553280.0, + "grad_norm": 3.9789590396078784, + "language_loss": 0.70705891, + "learning_rate": 3.374077235607968e-06, + "loss": 0.78507614, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19433594, + "step": 4676, + "time_per_iteration": 2.519028425216675 + }, + { + "auxiliary_loss_clip": 0.06504105, + "auxiliary_loss_mlp": 0.01278874, + "balance_loss_clip": 0.0629884, + "balance_loss_mlp": 0.01260611, + "epoch": 0.28119645272809257, + "flos": 20601107884800.0, + "grad_norm": 1.5779309471284284, + "language_loss": 0.70529211, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.78312188, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18286133, + "step": 4677, + "time_per_iteration": 2.5834195613861084 + }, + { + "auxiliary_loss_clip": 0.06516379, + "auxiliary_loss_mlp": 0.01281791, + "balance_loss_clip": 0.06302937, + "balance_loss_mlp": 0.0126193, + "epoch": 0.28125657598076054, + "flos": 25344383811840.0, + "grad_norm": 1.5021857900224345, + "language_loss": 0.64105308, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.71903479, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1986084, + "step": 4678, + "time_per_iteration": 2.618948221206665 + }, + { + "auxiliary_loss_clip": 0.06517099, + "auxiliary_loss_mlp": 0.01278079, + "balance_loss_clip": 0.06306246, + "balance_loss_mlp": 0.01259947, + "epoch": 0.2813166992334285, + "flos": 24834051319680.0, + "grad_norm": 1.437486997447774, + "language_loss": 0.71167207, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18139648, + "step": 4679, + "time_per_iteration": 5.466668128967285 + }, + { + "auxiliary_loss_clip": 0.06520079, + "auxiliary_loss_mlp": 0.0127734, + "balance_loss_clip": 0.06306013, + "balance_loss_mlp": 0.01257491, + "epoch": 0.2813768224860965, + "flos": 21766941267840.0, + "grad_norm": 1.8819388160659554, + "language_loss": 0.75122017, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.82919437, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19848633, + "step": 4680, + "time_per_iteration": 2.5146636962890625 + }, + { + "auxiliary_loss_clip": 0.06519224, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 0.06307293, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2814369457387645, + "flos": 24323760754560.0, + "grad_norm": 2.4475033368931984, + "language_loss": 0.77670574, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.8546586, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18103027, + "step": 4681, + "time_per_iteration": 2.576263189315796 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06309941, + "balance_loss_mlp": 0.01259208, + "epoch": 0.28149706899143245, + "flos": 18521274165120.0, + "grad_norm": 2.513172937911882, + "language_loss": 0.7420646, + "learning_rate": 3.372378352108146e-06, + "loss": 0.82008791, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18383789, + "step": 4682, + "time_per_iteration": 2.5019047260284424 + }, + { + "auxiliary_loss_clip": 0.06516165, + "auxiliary_loss_mlp": 0.01280522, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01262879, + "epoch": 0.2815571922441004, + "flos": 24870165229440.0, + "grad_norm": 1.4634735151261165, + "language_loss": 0.81619561, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89416242, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17626953, + "step": 4683, + "time_per_iteration": 2.6108040809631348 + }, + { + "auxiliary_loss_clip": 0.06511167, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 0.06297079, + "balance_loss_mlp": 0.01258771, + "epoch": 0.2816173154967684, + "flos": 19907774075520.0, + "grad_norm": 1.6126473409715323, + "language_loss": 0.76514447, + "learning_rate": 3.371811641167852e-06, + "loss": 0.8430298, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18579102, + "step": 4684, + "time_per_iteration": 3.9593515396118164 + }, + { + "auxiliary_loss_clip": 0.06509569, + "auxiliary_loss_mlp": 0.0127644, + "balance_loss_clip": 0.06298888, + "balance_loss_mlp": 0.01257474, + "epoch": 0.28167743874943635, + "flos": 17496709966080.0, + "grad_norm": 1.741664239740996, + "language_loss": 0.76634955, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.84420967, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4685, + "time_per_iteration": 2.533033847808838 + }, + { + "auxiliary_loss_clip": 0.06512235, + "auxiliary_loss_mlp": 0.01277016, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.01258002, + "epoch": 0.2817375620021043, + "flos": 25309276151040.0, + "grad_norm": 1.5379443905684582, + "language_loss": 0.76075816, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.8386507, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19006348, + "step": 4686, + "time_per_iteration": 2.5632452964782715 + }, + { + "auxiliary_loss_clip": 0.0651376, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 0.06298173, + "balance_loss_mlp": 0.01262705, + "epoch": 0.2817976852547723, + "flos": 18698447623680.0, + "grad_norm": 3.4763910689128945, + "language_loss": 0.63974833, + "learning_rate": 3.370961184640025e-06, + "loss": 0.71771336, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.20043945, + "step": 4687, + "time_per_iteration": 2.5520877838134766 + }, + { + "auxiliary_loss_clip": 0.0651626, + "auxiliary_loss_mlp": 0.01278308, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01258889, + "epoch": 0.28185780850744024, + "flos": 22748012398080.0, + "grad_norm": 2.5451270798344208, + "language_loss": 0.76514482, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.84309042, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1940918, + "step": 4688, + "time_per_iteration": 2.5427582263946533 + }, + { + "auxiliary_loss_clip": 0.06506021, + "auxiliary_loss_mlp": 0.01276039, + "balance_loss_clip": 0.06297493, + "balance_loss_mlp": 0.01258622, + "epoch": 0.2819179317601082, + "flos": 14938297251840.0, + "grad_norm": 2.0673048339937394, + "language_loss": 0.79160047, + "learning_rate": 3.37039395366863e-06, + "loss": 0.86942106, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17407227, + "step": 4689, + "time_per_iteration": 2.514857769012451 + }, + { + "auxiliary_loss_clip": 0.06505655, + "auxiliary_loss_mlp": 0.01279731, + "balance_loss_clip": 0.06295724, + "balance_loss_mlp": 0.0126098, + "epoch": 0.2819780550127762, + "flos": 23151428680320.0, + "grad_norm": 2.0480677905828664, + "language_loss": 0.78403682, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86189067, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18762207, + "step": 4690, + "time_per_iteration": 2.5567362308502197 + }, + { + "auxiliary_loss_clip": 0.06514366, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06301816, + "balance_loss_mlp": 0.01256981, + "epoch": 0.28203817826544414, + "flos": 21622779118080.0, + "grad_norm": 2.5530247222146976, + "language_loss": 0.87619591, + "learning_rate": 3.369826514835332e-06, + "loss": 0.95409369, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18432617, + "step": 4691, + "time_per_iteration": 2.5987935066223145 + }, + { + "auxiliary_loss_clip": 0.0651565, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.0629878, + "balance_loss_mlp": 0.01258787, + "epoch": 0.2820983015181121, + "flos": 24034010935680.0, + "grad_norm": 1.7719901211447804, + "language_loss": 0.82443225, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.90235984, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18322754, + "step": 4692, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.06515577, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06304249, + "balance_loss_mlp": 0.01259921, + "epoch": 0.2821584247707801, + "flos": 30015725408640.0, + "grad_norm": 1.5203777397001885, + "language_loss": 0.74437934, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82232404, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.1895752, + "step": 4693, + "time_per_iteration": 2.6104559898376465 + }, + { + "auxiliary_loss_clip": 0.06512225, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06298921, + "balance_loss_mlp": 0.01255593, + "epoch": 0.2822185480234481, + "flos": 21403034985600.0, + "grad_norm": 1.7641787467317929, + "language_loss": 0.77641487, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.85428035, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 4694, + "time_per_iteration": 2.5619184970855713 + }, + { + "auxiliary_loss_clip": 0.06513312, + "auxiliary_loss_mlp": 0.01274888, + "balance_loss_clip": 0.0630666, + "balance_loss_mlp": 0.01255898, + "epoch": 0.28227867127611606, + "flos": 27459996024960.0, + "grad_norm": 2.064814820064932, + "language_loss": 0.67270994, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.75059193, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18969727, + "step": 4695, + "time_per_iteration": 2.5849459171295166 + }, + { + "auxiliary_loss_clip": 0.06524754, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06312457, + "balance_loss_mlp": 0.01255914, + "epoch": 0.282338794528784, + "flos": 22599028638720.0, + "grad_norm": 2.3022925444863747, + "language_loss": 0.75992346, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.83794391, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.21362305, + "step": 4696, + "time_per_iteration": 2.5599312782287598 + }, + { + "auxiliary_loss_clip": 0.06528555, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06319815, + "balance_loss_mlp": 0.01257915, + "epoch": 0.282398917781452, + "flos": 42020592998400.0, + "grad_norm": 1.6923608864022255, + "language_loss": 0.62607121, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70412022, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.1842041, + "step": 4697, + "time_per_iteration": 2.719783067703247 + }, + { + "auxiliary_loss_clip": 0.0651894, + "auxiliary_loss_mlp": 0.01278397, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.01260564, + "epoch": 0.28245904103411995, + "flos": 23231916126720.0, + "grad_norm": 1.330125700327103, + "language_loss": 0.73835146, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.81632483, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17834473, + "step": 4698, + "time_per_iteration": 2.671154260635376 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01274177, + "balance_loss_clip": 0.06314629, + "balance_loss_mlp": 0.01255699, + "epoch": 0.2825191642867879, + "flos": 25381713605760.0, + "grad_norm": 1.8806904568543696, + "language_loss": 0.75498992, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.83293265, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.18481445, + "step": 4699, + "time_per_iteration": 2.749073028564453 + }, + { + "auxiliary_loss_clip": 0.06532586, + "auxiliary_loss_mlp": 0.0127858, + "balance_loss_clip": 0.06318063, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2825792875394559, + "flos": 17242277443200.0, + "grad_norm": 2.5468251061801697, + "language_loss": 0.80103695, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.87914866, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.20617676, + "step": 4700, + "time_per_iteration": 2.539794683456421 + }, + { + "auxiliary_loss_clip": 0.06516679, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06314512, + "balance_loss_mlp": 0.01257006, + "epoch": 0.28263941079212385, + "flos": 26731177211520.0, + "grad_norm": 2.1068022199140213, + "language_loss": 0.8243857, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.90229392, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17114258, + "step": 4701, + "time_per_iteration": 2.5763485431671143 + }, + { + "auxiliary_loss_clip": 0.06520683, + "auxiliary_loss_mlp": 0.01274057, + "balance_loss_clip": 0.06312392, + "balance_loss_mlp": 0.01256116, + "epoch": 0.2826995340447918, + "flos": 25928411569920.0, + "grad_norm": 2.2990609650841276, + "language_loss": 0.73153478, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.80948216, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17956543, + "step": 4702, + "time_per_iteration": 2.5968289375305176 + }, + { + "auxiliary_loss_clip": 0.06520355, + "auxiliary_loss_mlp": 0.01275823, + "balance_loss_clip": 0.06316096, + "balance_loss_mlp": 0.01258848, + "epoch": 0.2827596572974598, + "flos": 22385783197440.0, + "grad_norm": 1.6603391807745085, + "language_loss": 0.78883457, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86679637, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1697998, + "step": 4703, + "time_per_iteration": 2.56088924407959 + }, + { + "auxiliary_loss_clip": 0.06518066, + "auxiliary_loss_mlp": 0.01281519, + "balance_loss_clip": 0.06307587, + "balance_loss_mlp": 0.01261885, + "epoch": 0.28281978055012774, + "flos": 33555544669440.0, + "grad_norm": 1.530922589206002, + "language_loss": 0.69937778, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.19628906, + "step": 4704, + "time_per_iteration": 2.725234031677246 + }, + { + "auxiliary_loss_clip": 0.0652602, + "auxiliary_loss_mlp": 0.01283133, + "balance_loss_clip": 0.06319317, + "balance_loss_mlp": 0.01264119, + "epoch": 0.2828799038027957, + "flos": 23447635263360.0, + "grad_norm": 1.9265232828394878, + "language_loss": 0.70927215, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.78736377, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.19006348, + "step": 4705, + "time_per_iteration": 2.5391383171081543 + }, + { + "auxiliary_loss_clip": 0.06482799, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06378852, + "balance_loss_mlp": 0.01263947, + "epoch": 0.2829400270554637, + "flos": 69892055297280.0, + "grad_norm": 0.9159756060868983, + "language_loss": 0.59201139, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.66952819, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04928589, + "step": 4706, + "time_per_iteration": 3.219618797302246 + }, + { + "auxiliary_loss_clip": 0.06512764, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 0.06312177, + "balance_loss_mlp": 0.01260547, + "epoch": 0.2830001503081317, + "flos": 24795715276800.0, + "grad_norm": 1.373077415158703, + "language_loss": 0.82380199, + "learning_rate": 3.365279531475407e-06, + "loss": 0.90170658, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.17138672, + "step": 4707, + "time_per_iteration": 2.5680840015411377 + }, + { + "auxiliary_loss_clip": 0.06518079, + "auxiliary_loss_mlp": 0.01276357, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01257391, + "epoch": 0.28306027356079966, + "flos": 27676218286080.0, + "grad_norm": 1.5569970524845527, + "language_loss": 0.81077999, + "learning_rate": 3.36499490449902e-06, + "loss": 0.88872433, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18969727, + "step": 4708, + "time_per_iteration": 2.643389940261841 + }, + { + "auxiliary_loss_clip": 0.06443536, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 0.06339511, + "balance_loss_mlp": 0.01264025, + "epoch": 0.2831203968134676, + "flos": 60543837734400.0, + "grad_norm": 0.8586282544888121, + "language_loss": 0.62812036, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.7052421, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.04608154, + "step": 4709, + "time_per_iteration": 3.0554397106170654 + }, + { + "auxiliary_loss_clip": 0.06507774, + "auxiliary_loss_mlp": 0.01270408, + "balance_loss_clip": 0.06301016, + "balance_loss_mlp": 0.01253015, + "epoch": 0.2831805200661356, + "flos": 22061386915200.0, + "grad_norm": 1.4201642822404892, + "language_loss": 0.74412584, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.82190764, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1739502, + "step": 4710, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.06514937, + "auxiliary_loss_mlp": 0.01275331, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01256627, + "epoch": 0.28324064331880355, + "flos": 22607120557440.0, + "grad_norm": 1.9767009095982746, + "language_loss": 0.8018595, + "learning_rate": 3.364140713048579e-06, + "loss": 0.87976217, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18713379, + "step": 4711, + "time_per_iteration": 2.610027313232422 + }, + { + "auxiliary_loss_clip": 0.06509729, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06300638, + "balance_loss_mlp": 0.01260385, + "epoch": 0.2833007665714715, + "flos": 30411133626240.0, + "grad_norm": 1.982526263820073, + "language_loss": 0.70604694, + "learning_rate": 3.363855879093996e-06, + "loss": 0.78392917, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4712, + "time_per_iteration": 2.602795124053955 + }, + { + "auxiliary_loss_clip": 0.06508194, + "auxiliary_loss_mlp": 0.01282495, + "balance_loss_clip": 0.06299947, + "balance_loss_mlp": 0.01262992, + "epoch": 0.2833608898241395, + "flos": 23556144700800.0, + "grad_norm": 1.7823239687069516, + "language_loss": 0.8193841, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.89729095, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19494629, + "step": 4713, + "time_per_iteration": 2.6088523864746094 + }, + { + "auxiliary_loss_clip": 0.06512519, + "auxiliary_loss_mlp": 0.01275048, + "balance_loss_clip": 0.06304006, + "balance_loss_mlp": 0.01255236, + "epoch": 0.28342101307680745, + "flos": 20272980096000.0, + "grad_norm": 2.6212370689858493, + "language_loss": 0.75431275, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.83218849, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19799805, + "step": 4714, + "time_per_iteration": 3.986696243286133 + }, + { + "auxiliary_loss_clip": 0.06505996, + "auxiliary_loss_mlp": 0.01276776, + "balance_loss_clip": 0.06297115, + "balance_loss_mlp": 0.01259324, + "epoch": 0.2834811363294754, + "flos": 30854982303360.0, + "grad_norm": 1.3268888753773178, + "language_loss": 0.78198218, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.85980994, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17468262, + "step": 4715, + "time_per_iteration": 2.652470111846924 + }, + { + "auxiliary_loss_clip": 0.06506517, + "auxiliary_loss_mlp": 0.01277278, + "balance_loss_clip": 0.06300199, + "balance_loss_mlp": 0.01260088, + "epoch": 0.2835412595821434, + "flos": 22717642492800.0, + "grad_norm": 1.6173599581374518, + "language_loss": 0.74551272, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.82335067, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17175293, + "step": 4716, + "time_per_iteration": 2.597083806991577 + }, + { + "auxiliary_loss_clip": 0.06516325, + "auxiliary_loss_mlp": 0.01281584, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.0126189, + "epoch": 0.28360138283481134, + "flos": 18083630689920.0, + "grad_norm": 2.1150039301458112, + "language_loss": 0.75477433, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.83275348, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.19702148, + "step": 4717, + "time_per_iteration": 2.5648136138916016 + }, + { + "auxiliary_loss_clip": 0.06514253, + "auxiliary_loss_mlp": 0.01277656, + "balance_loss_clip": 0.06302426, + "balance_loss_mlp": 0.01258606, + "epoch": 0.2836615060874793, + "flos": 17859987342720.0, + "grad_norm": 1.540618458402471, + "language_loss": 0.67445159, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75237072, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19042969, + "step": 4718, + "time_per_iteration": 3.962265968322754 + }, + { + "auxiliary_loss_clip": 0.06507722, + "auxiliary_loss_mlp": 0.01278787, + "balance_loss_clip": 0.06295013, + "balance_loss_mlp": 0.01258772, + "epoch": 0.2837216293401473, + "flos": 25747590458880.0, + "grad_norm": 1.8038295919740834, + "language_loss": 0.73164374, + "learning_rate": 3.361860593925566e-06, + "loss": 0.8095088, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20007324, + "step": 4719, + "time_per_iteration": 4.095008134841919 + }, + { + "auxiliary_loss_clip": 0.0650832, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 0.06301163, + "balance_loss_mlp": 0.01259386, + "epoch": 0.2837817525928153, + "flos": 20929906506240.0, + "grad_norm": 1.8981156672354917, + "language_loss": 0.80600828, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.88386989, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18444824, + "step": 4720, + "time_per_iteration": 2.53869366645813 + }, + { + "auxiliary_loss_clip": 0.06515027, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06304276, + "balance_loss_mlp": 0.01261687, + "epoch": 0.28384187584548326, + "flos": 18922719876480.0, + "grad_norm": 1.7940545446838874, + "language_loss": 0.7966662, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.87462288, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18945312, + "step": 4721, + "time_per_iteration": 2.5736734867095947 + }, + { + "auxiliary_loss_clip": 0.06507237, + "auxiliary_loss_mlp": 0.01272866, + "balance_loss_clip": 0.06298702, + "balance_loss_mlp": 0.01254996, + "epoch": 0.2839019990981512, + "flos": 27351235025280.0, + "grad_norm": 1.8504915753410351, + "language_loss": 0.83238685, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17871094, + "step": 4722, + "time_per_iteration": 2.5798823833465576 + }, + { + "auxiliary_loss_clip": 0.06511718, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01255547, + "epoch": 0.2839621223508192, + "flos": 18120247724160.0, + "grad_norm": 1.9056364243243222, + "language_loss": 0.71157932, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18225098, + "step": 4723, + "time_per_iteration": 2.5472381114959717 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01279225, + "balance_loss_clip": 0.06299602, + "balance_loss_mlp": 0.01259937, + "epoch": 0.28402224560348716, + "flos": 26365384212480.0, + "grad_norm": 1.5487216964387416, + "language_loss": 0.7882036, + "learning_rate": 3.360433840760998e-06, + "loss": 0.86608005, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.19299316, + "step": 4724, + "time_per_iteration": 4.039300203323364 + }, + { + "auxiliary_loss_clip": 0.0650482, + "auxiliary_loss_mlp": 0.01275588, + "balance_loss_clip": 0.06294143, + "balance_loss_mlp": 0.0125754, + "epoch": 0.2840823688561551, + "flos": 24067609223040.0, + "grad_norm": 1.5786087270385247, + "language_loss": 0.92781484, + "learning_rate": 3.36014833532143e-06, + "loss": 1.00561893, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18066406, + "step": 4725, + "time_per_iteration": 2.5839502811431885 + }, + { + "auxiliary_loss_clip": 0.06504668, + "auxiliary_loss_mlp": 0.01283756, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01263097, + "epoch": 0.2841424921088231, + "flos": 29467392289920.0, + "grad_norm": 1.5513315701194426, + "language_loss": 0.89446843, + "learning_rate": 3.3598627783049e-06, + "loss": 0.97235262, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20666504, + "step": 4726, + "time_per_iteration": 2.617002010345459 + }, + { + "auxiliary_loss_clip": 0.06507252, + "auxiliary_loss_mlp": 0.01284138, + "balance_loss_clip": 0.0629679, + "balance_loss_mlp": 0.01264409, + "epoch": 0.28420261536149105, + "flos": 48110439565440.0, + "grad_norm": 2.259876030173266, + "language_loss": 0.79337573, + "learning_rate": 3.359577169722238e-06, + "loss": 0.87128961, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19763184, + "step": 4727, + "time_per_iteration": 2.774508476257324 + }, + { + "auxiliary_loss_clip": 0.06499238, + "auxiliary_loss_mlp": 0.01275292, + "balance_loss_clip": 0.06294493, + "balance_loss_mlp": 0.01257483, + "epoch": 0.284262738614159, + "flos": 25673224360320.0, + "grad_norm": 2.051338722061539, + "language_loss": 0.67073631, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.74848163, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17810059, + "step": 4728, + "time_per_iteration": 2.614614725112915 + }, + { + "auxiliary_loss_clip": 0.06494898, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06287634, + "balance_loss_mlp": 0.01255702, + "epoch": 0.284322861866827, + "flos": 19725066247680.0, + "grad_norm": 2.0236031999203132, + "language_loss": 0.76682353, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84451514, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.18554688, + "step": 4729, + "time_per_iteration": 2.542400360107422 + }, + { + "auxiliary_loss_clip": 0.06505589, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.0125414, + "epoch": 0.28438298511949495, + "flos": 23922105408000.0, + "grad_norm": 1.7626205541686495, + "language_loss": 0.67443657, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.75222254, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1887207, + "step": 4730, + "time_per_iteration": 2.6005139350891113 + }, + { + "auxiliary_loss_clip": 0.06503962, + "auxiliary_loss_mlp": 0.01275972, + "balance_loss_clip": 0.06292562, + "balance_loss_mlp": 0.01256219, + "epoch": 0.2844431083721629, + "flos": 26074460436480.0, + "grad_norm": 1.9951841893982447, + "language_loss": 0.74777246, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.82557184, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.1973877, + "step": 4731, + "time_per_iteration": 2.571259021759033 + }, + { + "auxiliary_loss_clip": 0.06501718, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06291741, + "balance_loss_mlp": 0.01257384, + "epoch": 0.2845032316248309, + "flos": 25817260728960.0, + "grad_norm": 1.5216025808612688, + "language_loss": 0.8435545, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.92132688, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18139648, + "step": 4732, + "time_per_iteration": 2.604717254638672 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277146, + "balance_loss_clip": 0.06295733, + "balance_loss_mlp": 0.01256082, + "epoch": 0.2845633548774989, + "flos": 19828418659200.0, + "grad_norm": 1.722472955192697, + "language_loss": 0.79522747, + "learning_rate": 3.357862435944109e-06, + "loss": 0.87308168, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.21069336, + "step": 4733, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06511072, + "auxiliary_loss_mlp": 0.01275761, + "balance_loss_clip": 0.06296709, + "balance_loss_mlp": 0.01256878, + "epoch": 0.28462347813016686, + "flos": 23189093890560.0, + "grad_norm": 2.336729990473161, + "language_loss": 0.72093451, + "learning_rate": 3.357576466701875e-06, + "loss": 0.79880273, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.1887207, + "step": 4734, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.06501292, + "auxiliary_loss_mlp": 0.01274129, + "balance_loss_clip": 0.06292972, + "balance_loss_mlp": 0.01256283, + "epoch": 0.2846836013828348, + "flos": 18666316782720.0, + "grad_norm": 1.7839237241912007, + "language_loss": 0.74739748, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.1784668, + "step": 4735, + "time_per_iteration": 2.5192623138427734 + }, + { + "auxiliary_loss_clip": 0.06500865, + "auxiliary_loss_mlp": 0.01274478, + "balance_loss_clip": 0.06291883, + "balance_loss_mlp": 0.01256096, + "epoch": 0.2847437246355028, + "flos": 14178731189760.0, + "grad_norm": 1.8549790130823454, + "language_loss": 0.81047934, + "learning_rate": 3.357004373789946e-06, + "loss": 0.88823277, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18383789, + "step": 4736, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.06503595, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01256285, + "epoch": 0.28480384788817076, + "flos": 29286068054400.0, + "grad_norm": 3.1700593253391895, + "language_loss": 0.60580242, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.68358433, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18310547, + "step": 4737, + "time_per_iteration": 2.591672897338867 + }, + { + "auxiliary_loss_clip": 0.06501776, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06295541, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2848639711408387, + "flos": 22607875244160.0, + "grad_norm": 1.8212806326874897, + "language_loss": 0.86685491, + "learning_rate": 3.356432075047052e-06, + "loss": 0.94461757, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.1763916, + "step": 4738, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.06504256, + "auxiliary_loss_mlp": 0.01280924, + "balance_loss_clip": 0.06291994, + "balance_loss_mlp": 0.01260575, + "epoch": 0.2849240943935067, + "flos": 17604632424960.0, + "grad_norm": 2.187311269731562, + "language_loss": 0.90640962, + "learning_rate": 3.356145848516118e-06, + "loss": 0.98426139, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20336914, + "step": 4739, + "time_per_iteration": 2.491391897201538 + }, + { + "auxiliary_loss_clip": 0.06502014, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06294325, + "balance_loss_mlp": 0.01254363, + "epoch": 0.28498421764617465, + "flos": 24869368615680.0, + "grad_norm": 1.2838984451042732, + "language_loss": 0.72652215, + "learning_rate": 3.355859570559998e-06, + "loss": 0.80426115, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17529297, + "step": 4740, + "time_per_iteration": 2.628420352935791 + }, + { + "auxiliary_loss_clip": 0.06497836, + "auxiliary_loss_mlp": 0.01273023, + "balance_loss_clip": 0.06293581, + "balance_loss_mlp": 0.01254069, + "epoch": 0.2850443408988426, + "flos": 22788947917440.0, + "grad_norm": 1.7372555552312992, + "language_loss": 0.77982342, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.85753202, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1895752, + "step": 4741, + "time_per_iteration": 2.5205776691436768 + }, + { + "auxiliary_loss_clip": 0.06505083, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06290049, + "balance_loss_mlp": 0.01260278, + "epoch": 0.2851044641515106, + "flos": 18850114713600.0, + "grad_norm": 2.3624012556043246, + "language_loss": 0.7702412, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.84808373, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18896484, + "step": 4742, + "time_per_iteration": 2.5852768421173096 + }, + { + "auxiliary_loss_clip": 0.06507465, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06292667, + "balance_loss_mlp": 0.01260252, + "epoch": 0.28516458740417855, + "flos": 18886564039680.0, + "grad_norm": 2.066213096861692, + "language_loss": 0.57976151, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65764809, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.20959473, + "step": 4743, + "time_per_iteration": 2.562298059463501 + }, + { + "auxiliary_loss_clip": 0.06507643, + "auxiliary_loss_mlp": 0.01278324, + "balance_loss_clip": 0.06297275, + "balance_loss_mlp": 0.01259787, + "epoch": 0.2852247106568465, + "flos": 25306592820480.0, + "grad_norm": 1.602300087654556, + "language_loss": 0.75013685, + "learning_rate": 3.354713944700797e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1854248, + "step": 4744, + "time_per_iteration": 2.610302209854126 + }, + { + "auxiliary_loss_clip": 0.06500175, + "auxiliary_loss_mlp": 0.01276557, + "balance_loss_clip": 0.06292172, + "balance_loss_mlp": 0.01258794, + "epoch": 0.2852848339095145, + "flos": 11660080037760.0, + "grad_norm": 2.2644691376510844, + "language_loss": 0.78515136, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86291873, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17749023, + "step": 4745, + "time_per_iteration": 2.5170419216156006 + }, + { + "auxiliary_loss_clip": 0.06491117, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 0.06290857, + "balance_loss_mlp": 0.01254836, + "epoch": 0.2853449571621825, + "flos": 12938280145920.0, + "grad_norm": 1.7221704990089022, + "language_loss": 0.83220983, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.9098506, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18127441, + "step": 4746, + "time_per_iteration": 2.6257071495056152 + }, + { + "auxiliary_loss_clip": 0.06514393, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.06295399, + "balance_loss_mlp": 0.01257943, + "epoch": 0.28540508041485046, + "flos": 20016660856320.0, + "grad_norm": 1.8084134515670756, + "language_loss": 0.80507863, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.88300824, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20617676, + "step": 4747, + "time_per_iteration": 2.5699074268341064 + }, + { + "auxiliary_loss_clip": 0.06375369, + "auxiliary_loss_mlp": 0.0127529, + "balance_loss_clip": 0.0627491, + "balance_loss_mlp": 0.01269043, + "epoch": 0.28546520366751843, + "flos": 68160264710400.0, + "grad_norm": 0.7514031277524565, + "language_loss": 0.60153103, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.67803764, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.06237793, + "step": 4748, + "time_per_iteration": 3.1155877113342285 + }, + { + "auxiliary_loss_clip": 0.06492989, + "auxiliary_loss_mlp": 0.01272874, + "balance_loss_clip": 0.06285426, + "balance_loss_mlp": 0.01255791, + "epoch": 0.2855253269201864, + "flos": 13254961852800.0, + "grad_norm": 2.1744647780903352, + "language_loss": 0.80643219, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.88409078, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17089844, + "step": 4749, + "time_per_iteration": 2.5422439575195312 + }, + { + "auxiliary_loss_clip": 0.06506198, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 0.06295547, + "balance_loss_mlp": 0.0126011, + "epoch": 0.28558545017285436, + "flos": 28628345030400.0, + "grad_norm": 1.9900791940744995, + "language_loss": 0.70889151, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78674042, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18579102, + "step": 4750, + "time_per_iteration": 2.6223177909851074 + }, + { + "auxiliary_loss_clip": 0.06498066, + "auxiliary_loss_mlp": 0.01278692, + "balance_loss_clip": 0.06294224, + "balance_loss_mlp": 0.01261562, + "epoch": 0.2856455734255223, + "flos": 34138901594880.0, + "grad_norm": 1.523200352045364, + "language_loss": 0.82438904, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.90215659, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17138672, + "step": 4751, + "time_per_iteration": 2.710822582244873 + }, + { + "auxiliary_loss_clip": 0.06498431, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01258424, + "epoch": 0.2857056966781903, + "flos": 39795590880000.0, + "grad_norm": 1.6833478059847915, + "language_loss": 0.80598158, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88373208, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1817627, + "step": 4752, + "time_per_iteration": 2.685669422149658 + }, + { + "auxiliary_loss_clip": 0.0649987, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06292621, + "balance_loss_mlp": 0.01254223, + "epoch": 0.28576581993085826, + "flos": 21878846795520.0, + "grad_norm": 1.793038640961372, + "language_loss": 0.79062063, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86834359, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18200684, + "step": 4753, + "time_per_iteration": 2.612639904022217 + }, + { + "auxiliary_loss_clip": 0.06511062, + "auxiliary_loss_mlp": 0.01278051, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01257523, + "epoch": 0.2858259431835262, + "flos": 19096455317760.0, + "grad_norm": 2.5775982542053963, + "language_loss": 0.89774185, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.97563303, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.20532227, + "step": 4754, + "time_per_iteration": 3.914802312850952 + }, + { + "auxiliary_loss_clip": 0.06494384, + "auxiliary_loss_mlp": 0.01278048, + "balance_loss_clip": 0.06293342, + "balance_loss_mlp": 0.01259988, + "epoch": 0.2858860664361942, + "flos": 20339673546240.0, + "grad_norm": 1.9874166310668562, + "language_loss": 0.82672411, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.90444839, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18066406, + "step": 4755, + "time_per_iteration": 2.673158884048462 + }, + { + "auxiliary_loss_clip": 0.06498866, + "auxiliary_loss_mlp": 0.01274185, + "balance_loss_clip": 0.06291682, + "balance_loss_mlp": 0.0125721, + "epoch": 0.28594618968886215, + "flos": 24468551809920.0, + "grad_norm": 1.6562500913369433, + "language_loss": 0.83843541, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91616589, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.16992188, + "step": 4756, + "time_per_iteration": 2.6029391288757324 + }, + { + "auxiliary_loss_clip": 0.06377822, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01256002, + "epoch": 0.2860063129415301, + "flos": 71676170830080.0, + "grad_norm": 1.4612509113917642, + "language_loss": 0.6086607, + "learning_rate": 3.350984987779142e-06, + "loss": 0.68506116, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.06207275, + "step": 4757, + "time_per_iteration": 3.326833963394165 + }, + { + "auxiliary_loss_clip": 0.0650306, + "auxiliary_loss_mlp": 0.01277184, + "balance_loss_clip": 0.06298901, + "balance_loss_mlp": 0.01260459, + "epoch": 0.2860664361941981, + "flos": 20564993975040.0, + "grad_norm": 2.5468639815388996, + "language_loss": 0.66759324, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.74539566, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1673584, + "step": 4758, + "time_per_iteration": 5.454218626022339 + }, + { + "auxiliary_loss_clip": 0.06503905, + "auxiliary_loss_mlp": 0.01277556, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01258817, + "epoch": 0.2861265594468661, + "flos": 36005992997760.0, + "grad_norm": 1.4420872105733484, + "language_loss": 0.63405287, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.71186751, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.1875, + "step": 4759, + "time_per_iteration": 2.745704174041748 + }, + { + "auxiliary_loss_clip": 0.06510226, + "auxiliary_loss_mlp": 0.01276918, + "balance_loss_clip": 0.06302258, + "balance_loss_mlp": 0.01257892, + "epoch": 0.28618668269953407, + "flos": 20053571379840.0, + "grad_norm": 2.14199936751817, + "language_loss": 0.74684435, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82471573, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19030762, + "step": 4760, + "time_per_iteration": 2.541759490966797 + }, + { + "auxiliary_loss_clip": 0.06496474, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01260482, + "epoch": 0.28624680595220203, + "flos": 24978632739840.0, + "grad_norm": 1.8333731861449165, + "language_loss": 0.72652757, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80425525, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.15795898, + "step": 4761, + "time_per_iteration": 2.57940673828125 + }, + { + "auxiliary_loss_clip": 0.06509258, + "auxiliary_loss_mlp": 0.01273154, + "balance_loss_clip": 0.06299996, + "balance_loss_mlp": 0.01256095, + "epoch": 0.28630692920487, + "flos": 22498862682240.0, + "grad_norm": 1.9183655494362113, + "language_loss": 0.74669504, + "learning_rate": 3.349548466945793e-06, + "loss": 0.82451922, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1706543, + "step": 4762, + "time_per_iteration": 2.5321590900421143 + }, + { + "auxiliary_loss_clip": 0.06505883, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 0.06301434, + "balance_loss_mlp": 0.0125694, + "epoch": 0.28636705245753796, + "flos": 21255979870080.0, + "grad_norm": 2.6303759088840413, + "language_loss": 0.76297629, + "learning_rate": 3.349261009210496e-06, + "loss": 0.84077883, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17443848, + "step": 4763, + "time_per_iteration": 3.979782819747925 + }, + { + "auxiliary_loss_clip": 0.06506684, + "auxiliary_loss_mlp": 0.01275654, + "balance_loss_clip": 0.06298703, + "balance_loss_mlp": 0.012572, + "epoch": 0.28642717571020593, + "flos": 24102339540480.0, + "grad_norm": 1.7484925103151405, + "language_loss": 0.77499843, + "learning_rate": 3.348973500311086e-06, + "loss": 0.85282177, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18444824, + "step": 4764, + "time_per_iteration": 2.6036336421966553 + }, + { + "auxiliary_loss_clip": 0.0651267, + "auxiliary_loss_mlp": 0.01277486, + "balance_loss_clip": 0.06302905, + "balance_loss_mlp": 0.01257829, + "epoch": 0.2864872989628739, + "flos": 22607959098240.0, + "grad_norm": 5.154577786286556, + "language_loss": 0.71671587, + "learning_rate": 3.348685940258466e-06, + "loss": 0.79461741, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1965332, + "step": 4765, + "time_per_iteration": 2.5488131046295166 + }, + { + "auxiliary_loss_clip": 0.0651048, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01255684, + "epoch": 0.28654742221554186, + "flos": 32753449860480.0, + "grad_norm": 1.504395922922802, + "language_loss": 0.7630865, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.84091872, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17053223, + "step": 4766, + "time_per_iteration": 2.659499406814575 + }, + { + "auxiliary_loss_clip": 0.0650377, + "auxiliary_loss_mlp": 0.01271145, + "balance_loss_clip": 0.0630042, + "balance_loss_mlp": 0.01254277, + "epoch": 0.2866075454682098, + "flos": 26989257386880.0, + "grad_norm": 2.0841406955827075, + "language_loss": 0.78443938, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86218858, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.16870117, + "step": 4767, + "time_per_iteration": 2.5891125202178955 + }, + { + "auxiliary_loss_clip": 0.06511022, + "auxiliary_loss_mlp": 0.01279425, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01261746, + "epoch": 0.2866676687208778, + "flos": 23259812336640.0, + "grad_norm": 2.0448044221544737, + "language_loss": 0.65430236, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.73220682, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17675781, + "step": 4768, + "time_per_iteration": 2.572230815887451 + }, + { + "auxiliary_loss_clip": 0.0651636, + "auxiliary_loss_mlp": 0.01271508, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01253782, + "epoch": 0.28672779197354575, + "flos": 21586120156800.0, + "grad_norm": 1.6016626643500549, + "language_loss": 0.71173406, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.78961271, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17724609, + "step": 4769, + "time_per_iteration": 2.5180304050445557 + }, + { + "auxiliary_loss_clip": 0.06513099, + "auxiliary_loss_mlp": 0.01273812, + "balance_loss_clip": 0.06304821, + "balance_loss_mlp": 0.01256562, + "epoch": 0.2867879152262137, + "flos": 19871785946880.0, + "grad_norm": 1.7128041826885096, + "language_loss": 0.75347042, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83133948, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17248535, + "step": 4770, + "time_per_iteration": 2.575993537902832 + }, + { + "auxiliary_loss_clip": 0.06514675, + "auxiliary_loss_mlp": 0.01275884, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257967, + "epoch": 0.2868480384788817, + "flos": 28219687868160.0, + "grad_norm": 4.606069071133779, + "language_loss": 0.68064034, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17907715, + "step": 4771, + "time_per_iteration": 2.5533907413482666 + }, + { + "auxiliary_loss_clip": 0.06411134, + "auxiliary_loss_mlp": 0.0125763, + "balance_loss_clip": 0.06311508, + "balance_loss_mlp": 0.01253345, + "epoch": 0.2869081617315497, + "flos": 65442218768640.0, + "grad_norm": 0.7478629548239109, + "language_loss": 0.56696546, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.64365304, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.04290771, + "step": 4772, + "time_per_iteration": 3.1295437812805176 + }, + { + "auxiliary_loss_clip": 0.06515288, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06305212, + "balance_loss_mlp": 0.01256165, + "epoch": 0.28696828498421767, + "flos": 18666610272000.0, + "grad_norm": 3.729070810615603, + "language_loss": 0.84013474, + "learning_rate": 3.346383619630856e-06, + "loss": 0.91803479, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1854248, + "step": 4773, + "time_per_iteration": 2.5181708335876465 + }, + { + "auxiliary_loss_clip": 0.06518447, + "auxiliary_loss_mlp": 0.01274166, + "balance_loss_clip": 0.06306095, + "balance_loss_mlp": 0.01254985, + "epoch": 0.28702840823688563, + "flos": 23666540855040.0, + "grad_norm": 2.856350636496585, + "language_loss": 0.78241181, + "learning_rate": 3.34609559969027e-06, + "loss": 0.86033797, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19177246, + "step": 4774, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06519175, + "auxiliary_loss_mlp": 0.01275468, + "balance_loss_clip": 0.06307949, + "balance_loss_mlp": 0.01255703, + "epoch": 0.2870885314895536, + "flos": 13809248611200.0, + "grad_norm": 1.8762920881530476, + "language_loss": 0.74056339, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.81850982, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.19763184, + "step": 4775, + "time_per_iteration": 2.505293369293213 + }, + { + "auxiliary_loss_clip": 0.06520346, + "auxiliary_loss_mlp": 0.01275844, + "balance_loss_clip": 0.06309157, + "balance_loss_mlp": 0.01258142, + "epoch": 0.28714865474222157, + "flos": 17792790768000.0, + "grad_norm": 1.8823617406689648, + "language_loss": 0.88338864, + "learning_rate": 3.34551940668778e-06, + "loss": 0.96135056, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17687988, + "step": 4776, + "time_per_iteration": 2.5638997554779053 + }, + { + "auxiliary_loss_clip": 0.06511634, + "auxiliary_loss_mlp": 0.01275769, + "balance_loss_clip": 0.06302971, + "balance_loss_mlp": 0.01258269, + "epoch": 0.28720877799488953, + "flos": 16002958429440.0, + "grad_norm": 2.648093963017482, + "language_loss": 0.74451852, + "learning_rate": 3.345231233647726e-06, + "loss": 0.82239252, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17492676, + "step": 4777, + "time_per_iteration": 2.5142223834991455 + }, + { + "auxiliary_loss_clip": 0.06527238, + "auxiliary_loss_mlp": 0.01280106, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01259924, + "epoch": 0.2872689012475575, + "flos": 20929445308800.0, + "grad_norm": 2.200879096052639, + "language_loss": 0.80539143, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.88346487, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20202637, + "step": 4778, + "time_per_iteration": 2.563994884490967 + }, + { + "auxiliary_loss_clip": 0.06511427, + "auxiliary_loss_mlp": 0.01281129, + "balance_loss_clip": 0.06304548, + "balance_loss_mlp": 0.01263223, + "epoch": 0.28732902450022546, + "flos": 21331603779840.0, + "grad_norm": 1.7996465112645923, + "language_loss": 0.73886508, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.8167907, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17895508, + "step": 4779, + "time_per_iteration": 2.5394158363342285 + }, + { + "auxiliary_loss_clip": 0.06518923, + "auxiliary_loss_mlp": 0.01275383, + "balance_loss_clip": 0.06307982, + "balance_loss_mlp": 0.01255379, + "epoch": 0.2873891477528934, + "flos": 20856714364800.0, + "grad_norm": 1.509851280453794, + "language_loss": 0.76844704, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.84639007, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19995117, + "step": 4780, + "time_per_iteration": 2.5928425788879395 + }, + { + "auxiliary_loss_clip": 0.06507713, + "auxiliary_loss_mlp": 0.01271777, + "balance_loss_clip": 0.06302975, + "balance_loss_mlp": 0.01254014, + "epoch": 0.2874492710055614, + "flos": 17425698030720.0, + "grad_norm": 1.6471362454858889, + "language_loss": 0.81874287, + "learning_rate": 3.344078031483784e-06, + "loss": 0.89653778, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17773438, + "step": 4781, + "time_per_iteration": 2.6121537685394287 + }, + { + "auxiliary_loss_clip": 0.06521222, + "auxiliary_loss_mlp": 0.0127902, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01257002, + "epoch": 0.28750939425822936, + "flos": 13411827895680.0, + "grad_norm": 2.0671181517724966, + "language_loss": 0.86987036, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.94787276, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22009277, + "step": 4782, + "time_per_iteration": 2.554326057434082 + }, + { + "auxiliary_loss_clip": 0.06525762, + "auxiliary_loss_mlp": 0.01282396, + "balance_loss_clip": 0.06310341, + "balance_loss_mlp": 0.01262238, + "epoch": 0.2875695175108973, + "flos": 21876205392000.0, + "grad_norm": 1.4282255381090248, + "language_loss": 0.71525908, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20153809, + "step": 4783, + "time_per_iteration": 2.5632100105285645 + }, + { + "auxiliary_loss_clip": 0.06514136, + "auxiliary_loss_mlp": 0.01279499, + "balance_loss_clip": 0.06305264, + "balance_loss_mlp": 0.01259186, + "epoch": 0.2876296407635653, + "flos": 26251885457280.0, + "grad_norm": 1.5568964680804804, + "language_loss": 0.77152872, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84946513, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.203125, + "step": 4784, + "time_per_iteration": 2.589073657989502 + }, + { + "auxiliary_loss_clip": 0.06506136, + "auxiliary_loss_mlp": 0.01278073, + "balance_loss_clip": 0.06301259, + "balance_loss_mlp": 0.01257914, + "epoch": 0.28768976401623325, + "flos": 25380581575680.0, + "grad_norm": 1.5725877671574655, + "language_loss": 0.76106405, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.83890617, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.20153809, + "step": 4785, + "time_per_iteration": 2.6051061153411865 + }, + { + "auxiliary_loss_clip": 0.06513079, + "auxiliary_loss_mlp": 0.0127873, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01259394, + "epoch": 0.28774988726890127, + "flos": 30672232548480.0, + "grad_norm": 2.246179731229797, + "language_loss": 0.83339965, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91131771, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19348145, + "step": 4786, + "time_per_iteration": 2.6064071655273438 + }, + { + "auxiliary_loss_clip": 0.06512371, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06303188, + "balance_loss_mlp": 0.01258934, + "epoch": 0.28781001052156924, + "flos": 20601820644480.0, + "grad_norm": 2.4876341958211037, + "language_loss": 0.80607671, + "learning_rate": 3.342346699429516e-06, + "loss": 0.88398409, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19433594, + "step": 4787, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.06516974, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.0630367, + "balance_loss_mlp": 0.01260191, + "epoch": 0.2878701337742372, + "flos": 26549643340800.0, + "grad_norm": 1.713934654291453, + "language_loss": 0.84188497, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.91985947, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.20288086, + "step": 4788, + "time_per_iteration": 2.610520362854004 + }, + { + "auxiliary_loss_clip": 0.06528202, + "auxiliary_loss_mlp": 0.01278372, + "balance_loss_clip": 0.06311956, + "balance_loss_mlp": 0.01257594, + "epoch": 0.28793025702690517, + "flos": 28154294156160.0, + "grad_norm": 1.8819133496848792, + "language_loss": 0.73887986, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.81694555, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2076416, + "step": 4789, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.06504419, + "auxiliary_loss_mlp": 0.0127649, + "balance_loss_clip": 0.06300576, + "balance_loss_mlp": 0.01259014, + "epoch": 0.28799038027957313, + "flos": 23812254305280.0, + "grad_norm": 1.6484379512289788, + "language_loss": 0.84411776, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92192692, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17492676, + "step": 4790, + "time_per_iteration": 2.5587222576141357 + }, + { + "auxiliary_loss_clip": 0.06518544, + "auxiliary_loss_mlp": 0.01278217, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01259728, + "epoch": 0.2880505035322411, + "flos": 22350340120320.0, + "grad_norm": 1.9872780385985664, + "language_loss": 0.78222489, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.86019248, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18481445, + "step": 4791, + "time_per_iteration": 2.624457359313965 + }, + { + "auxiliary_loss_clip": 0.06518695, + "auxiliary_loss_mlp": 0.01277015, + "balance_loss_clip": 0.06302316, + "balance_loss_mlp": 0.01257286, + "epoch": 0.28811062678490906, + "flos": 18010061205120.0, + "grad_norm": 3.7561845310327002, + "language_loss": 0.71278274, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.79073977, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19726562, + "step": 4792, + "time_per_iteration": 2.5208675861358643 + }, + { + "auxiliary_loss_clip": 0.06512474, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06301394, + "balance_loss_mlp": 0.01258391, + "epoch": 0.28817075003757703, + "flos": 22097416970880.0, + "grad_norm": 1.8001054572072859, + "language_loss": 0.80413318, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.88202471, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18286133, + "step": 4793, + "time_per_iteration": 4.170284271240234 + }, + { + "auxiliary_loss_clip": 0.06499149, + "auxiliary_loss_mlp": 0.01283104, + "balance_loss_clip": 0.06297339, + "balance_loss_mlp": 0.01264484, + "epoch": 0.288230873290245, + "flos": 41692842552960.0, + "grad_norm": 1.6709200510021447, + "language_loss": 0.78107667, + "learning_rate": 3.340324496161797e-06, + "loss": 0.85889918, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.18603516, + "step": 4794, + "time_per_iteration": 2.8557510375976562 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 0.06298079, + "balance_loss_mlp": 0.01260882, + "epoch": 0.28829099654291296, + "flos": 18630328654080.0, + "grad_norm": 2.1208293695579608, + "language_loss": 0.83245766, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91035557, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18652344, + "step": 4795, + "time_per_iteration": 2.535163164138794 + }, + { + "auxiliary_loss_clip": 0.06498718, + "auxiliary_loss_mlp": 0.0128311, + "balance_loss_clip": 0.06297053, + "balance_loss_mlp": 0.01266099, + "epoch": 0.2883511197955809, + "flos": 24680707148160.0, + "grad_norm": 2.078774389913416, + "language_loss": 0.75219119, + "learning_rate": 3.339746266208074e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17004395, + "step": 4796, + "time_per_iteration": 2.567488670349121 + }, + { + "auxiliary_loss_clip": 0.06509424, + "auxiliary_loss_mlp": 0.01276979, + "balance_loss_clip": 0.06296358, + "balance_loss_mlp": 0.01257798, + "epoch": 0.2884112430482489, + "flos": 23118794714880.0, + "grad_norm": 2.1968759883463513, + "language_loss": 0.73290622, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.81077027, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.19189453, + "step": 4797, + "time_per_iteration": 3.975389242172241 + }, + { + "auxiliary_loss_clip": 0.06507025, + "auxiliary_loss_mlp": 0.01273799, + "balance_loss_clip": 0.0629791, + "balance_loss_mlp": 0.0125556, + "epoch": 0.28847136630091685, + "flos": 16879000066560.0, + "grad_norm": 2.2937655739300373, + "language_loss": 0.74862409, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.82643229, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.18212891, + "step": 4798, + "time_per_iteration": 3.9849729537963867 + }, + { + "auxiliary_loss_clip": 0.06517179, + "auxiliary_loss_mlp": 0.01285883, + "balance_loss_clip": 0.06306559, + "balance_loss_mlp": 0.01266381, + "epoch": 0.2885314895535849, + "flos": 25663161870720.0, + "grad_norm": 2.626807334731923, + "language_loss": 0.65891635, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.736947, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19494629, + "step": 4799, + "time_per_iteration": 2.6063008308410645 + }, + { + "auxiliary_loss_clip": 0.06513311, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06300591, + "balance_loss_mlp": 0.01260013, + "epoch": 0.28859161280625284, + "flos": 21113872145280.0, + "grad_norm": 1.5942901452973643, + "language_loss": 0.82659006, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.9045099, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18664551, + "step": 4800, + "time_per_iteration": 2.5522704124450684 + }, + { + "auxiliary_loss_clip": 0.06498213, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 0.06294428, + "balance_loss_mlp": 0.01260609, + "epoch": 0.2886517360589208, + "flos": 26476870469760.0, + "grad_norm": 1.7957021177556654, + "language_loss": 0.91005886, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98781872, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17175293, + "step": 4801, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.06509861, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01260722, + "epoch": 0.28871185931158877, + "flos": 25272365627520.0, + "grad_norm": 1.8432796050129874, + "language_loss": 0.74294543, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82083023, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17895508, + "step": 4802, + "time_per_iteration": 2.5519795417785645 + }, + { + "auxiliary_loss_clip": 0.0639186, + "auxiliary_loss_mlp": 0.01290861, + "balance_loss_clip": 0.06293292, + "balance_loss_mlp": 0.01286456, + "epoch": 0.28877198256425674, + "flos": 66683676061440.0, + "grad_norm": 0.7742675136744124, + "language_loss": 0.62925327, + "learning_rate": 3.337720861641558e-06, + "loss": 0.70608056, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04412842, + "step": 4803, + "time_per_iteration": 4.557742595672607 + }, + { + "auxiliary_loss_clip": 0.06504417, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 0.06297504, + "balance_loss_mlp": 0.01256721, + "epoch": 0.2888321058169247, + "flos": 20309261713920.0, + "grad_norm": 2.312081796144873, + "language_loss": 0.71418971, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.79197359, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17248535, + "step": 4804, + "time_per_iteration": 2.5679221153259277 + }, + { + "auxiliary_loss_clip": 0.06511839, + "auxiliary_loss_mlp": 0.01276786, + "balance_loss_clip": 0.06299883, + "balance_loss_mlp": 0.01258892, + "epoch": 0.28889222906959267, + "flos": 25523192424960.0, + "grad_norm": 2.035708939634364, + "language_loss": 0.68254268, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76042891, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17907715, + "step": 4805, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06510667, + "auxiliary_loss_mlp": 0.01276264, + "balance_loss_clip": 0.06300112, + "balance_loss_mlp": 0.01258955, + "epoch": 0.28895235232226063, + "flos": 32679544959360.0, + "grad_norm": 1.67836402891337, + "language_loss": 0.69622278, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.77409214, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1730957, + "step": 4806, + "time_per_iteration": 2.6661036014556885 + }, + { + "auxiliary_loss_clip": 0.06499489, + "auxiliary_loss_mlp": 0.01273073, + "balance_loss_clip": 0.06297253, + "balance_loss_mlp": 0.01256133, + "epoch": 0.2890124755749286, + "flos": 29722202156160.0, + "grad_norm": 1.5048672267596763, + "language_loss": 0.71718901, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7949146, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16931152, + "step": 4807, + "time_per_iteration": 2.6082210540771484 + }, + { + "auxiliary_loss_clip": 0.06506096, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06298453, + "balance_loss_mlp": 0.01255769, + "epoch": 0.28907259882759656, + "flos": 22681067385600.0, + "grad_norm": 1.6103433555287536, + "language_loss": 0.8189373, + "learning_rate": 3.336272622079382e-06, + "loss": 0.89672995, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17407227, + "step": 4808, + "time_per_iteration": 2.575005292892456 + }, + { + "auxiliary_loss_clip": 0.0649471, + "auxiliary_loss_mlp": 0.01279377, + "balance_loss_clip": 0.06293811, + "balance_loss_mlp": 0.01261543, + "epoch": 0.2891327220802645, + "flos": 22572809510400.0, + "grad_norm": 1.6658984409983257, + "language_loss": 0.79128641, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86902726, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17834473, + "step": 4809, + "time_per_iteration": 2.563202142715454 + }, + { + "auxiliary_loss_clip": 0.06509645, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.06294866, + "balance_loss_mlp": 0.01256411, + "epoch": 0.2891928453329325, + "flos": 21659228444160.0, + "grad_norm": 1.9154470794900575, + "language_loss": 0.79370517, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8715474, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18151855, + "step": 4810, + "time_per_iteration": 2.555290460586548 + }, + { + "auxiliary_loss_clip": 0.06499892, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06295595, + "balance_loss_mlp": 0.01259259, + "epoch": 0.28925296858560046, + "flos": 23228855452800.0, + "grad_norm": 1.5886971021791327, + "language_loss": 0.77595514, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.85371131, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.16467285, + "step": 4811, + "time_per_iteration": 2.5522642135620117 + }, + { + "auxiliary_loss_clip": 0.06509165, + "auxiliary_loss_mlp": 0.01277164, + "balance_loss_clip": 0.06302579, + "balance_loss_mlp": 0.01259497, + "epoch": 0.2893130918382685, + "flos": 28629267425280.0, + "grad_norm": 1.704164513062304, + "language_loss": 0.78002596, + "learning_rate": 3.335113118275117e-06, + "loss": 0.85788929, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17675781, + "step": 4812, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.06384769, + "auxiliary_loss_mlp": 0.01270413, + "balance_loss_clip": 0.06288065, + "balance_loss_mlp": 0.01266965, + "epoch": 0.28937321509093644, + "flos": 72323328240000.0, + "grad_norm": 0.7614773045430072, + "language_loss": 0.60086656, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.67741829, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03457642, + "step": 4813, + "time_per_iteration": 3.3377795219421387 + }, + { + "auxiliary_loss_clip": 0.06503347, + "auxiliary_loss_mlp": 0.01279669, + "balance_loss_clip": 0.0629978, + "balance_loss_mlp": 0.01262253, + "epoch": 0.2894333383436044, + "flos": 16221905948160.0, + "grad_norm": 2.095142654160917, + "language_loss": 0.83059847, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.90842861, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.17407227, + "step": 4814, + "time_per_iteration": 2.519822120666504 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.0128276, + "balance_loss_clip": 0.06297985, + "balance_loss_mlp": 0.01264389, + "epoch": 0.2894934615962724, + "flos": 24835434912000.0, + "grad_norm": 1.4921373382431753, + "language_loss": 0.72583377, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.80376399, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18371582, + "step": 4815, + "time_per_iteration": 2.613424301147461 + }, + { + "auxiliary_loss_clip": 0.06496876, + "auxiliary_loss_mlp": 0.01270189, + "balance_loss_clip": 0.06299625, + "balance_loss_mlp": 0.01253858, + "epoch": 0.28955358484894034, + "flos": 20456400683520.0, + "grad_norm": 1.478095248571898, + "language_loss": 0.71455014, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79222083, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16345215, + "step": 4816, + "time_per_iteration": 2.523789644241333 + }, + { + "auxiliary_loss_clip": 0.0651416, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06301913, + "balance_loss_mlp": 0.01256007, + "epoch": 0.2896137081016083, + "flos": 22571803261440.0, + "grad_norm": 2.1886400582799643, + "language_loss": 0.75928313, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.83716327, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.17834473, + "step": 4817, + "time_per_iteration": 2.5829625129699707 + }, + { + "auxiliary_loss_clip": 0.0650699, + "auxiliary_loss_mlp": 0.0127444, + "balance_loss_clip": 0.06299114, + "balance_loss_mlp": 0.01255486, + "epoch": 0.28967383135427627, + "flos": 26695231009920.0, + "grad_norm": 2.009148210409016, + "language_loss": 0.77384543, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85165972, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18933105, + "step": 4818, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.06511898, + "auxiliary_loss_mlp": 0.01274642, + "balance_loss_clip": 0.063049, + "balance_loss_mlp": 0.01257833, + "epoch": 0.28973395460694423, + "flos": 15563428237440.0, + "grad_norm": 1.8180363278883531, + "language_loss": 0.80166686, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.87953222, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16833496, + "step": 4819, + "time_per_iteration": 2.58598256111145 + }, + { + "auxiliary_loss_clip": 0.06512412, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 0.06301294, + "balance_loss_mlp": 0.01256543, + "epoch": 0.2897940778596122, + "flos": 18703395014400.0, + "grad_norm": 1.8889731698350438, + "language_loss": 0.79784238, + "learning_rate": 3.332791681244776e-06, + "loss": 0.87571859, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18664551, + "step": 4820, + "time_per_iteration": 2.514738082885742 + }, + { + "auxiliary_loss_clip": 0.06519003, + "auxiliary_loss_mlp": 0.01272112, + "balance_loss_clip": 0.06309246, + "balance_loss_mlp": 0.01254612, + "epoch": 0.28985420111228016, + "flos": 18776209812480.0, + "grad_norm": 1.948801074603747, + "language_loss": 0.73537958, + "learning_rate": 3.332501274072231e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17492676, + "step": 4821, + "time_per_iteration": 2.6552352905273438 + }, + { + "auxiliary_loss_clip": 0.06509826, + "auxiliary_loss_mlp": 0.01279091, + "balance_loss_clip": 0.06303322, + "balance_loss_mlp": 0.01260733, + "epoch": 0.28991432436494813, + "flos": 23075511281280.0, + "grad_norm": 1.9415887628712303, + "language_loss": 0.7256397, + "learning_rate": 3.332210816371104e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18347168, + "step": 4822, + "time_per_iteration": 2.5311806201934814 + }, + { + "auxiliary_loss_clip": 0.06508678, + "auxiliary_loss_mlp": 0.0127532, + "balance_loss_clip": 0.06304502, + "balance_loss_mlp": 0.01258237, + "epoch": 0.2899744476176161, + "flos": 17608992837120.0, + "grad_norm": 1.6868082855094653, + "language_loss": 0.66498971, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17077637, + "step": 4823, + "time_per_iteration": 2.5582497119903564 + }, + { + "auxiliary_loss_clip": 0.06507877, + "auxiliary_loss_mlp": 0.0127093, + "balance_loss_clip": 0.06303018, + "balance_loss_mlp": 0.01253728, + "epoch": 0.29003457087028406, + "flos": 22315861365120.0, + "grad_norm": 2.007628710478466, + "language_loss": 0.81589168, + "learning_rate": 3.331629749427164e-06, + "loss": 0.89367974, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.171875, + "step": 4824, + "time_per_iteration": 2.5258595943450928 + }, + { + "auxiliary_loss_clip": 0.06510833, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301483, + "balance_loss_mlp": 0.01258376, + "epoch": 0.2900946941229521, + "flos": 21951493885440.0, + "grad_norm": 1.837693758429887, + "language_loss": 0.73192668, + "learning_rate": 3.331339140206385e-06, + "loss": 0.80979806, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.17932129, + "step": 4825, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.0651435, + "auxiliary_loss_mlp": 0.01275324, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01257049, + "epoch": 0.29015481737562004, + "flos": 17938126874880.0, + "grad_norm": 2.202818652908599, + "language_loss": 0.7426061, + "learning_rate": 3.331048480501092e-06, + "loss": 0.82050288, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18273926, + "step": 4826, + "time_per_iteration": 2.497711420059204 + }, + { + "auxiliary_loss_clip": 0.06516986, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01262141, + "epoch": 0.290214940628288, + "flos": 22790079947520.0, + "grad_norm": 1.934932602801083, + "language_loss": 0.69077051, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.76872945, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.16748047, + "step": 4827, + "time_per_iteration": 2.5729641914367676 + }, + { + "auxiliary_loss_clip": 0.06517433, + "auxiliary_loss_mlp": 0.0127379, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01255646, + "epoch": 0.290275063880956, + "flos": 20011881173760.0, + "grad_norm": 1.8047855406998587, + "language_loss": 0.80766201, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88557422, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.18151855, + "step": 4828, + "time_per_iteration": 2.5190348625183105 + }, + { + "auxiliary_loss_clip": 0.0651058, + "auxiliary_loss_mlp": 0.01278642, + "balance_loss_clip": 0.06308287, + "balance_loss_mlp": 0.01260809, + "epoch": 0.29033518713362394, + "flos": 22060003322880.0, + "grad_norm": 1.646725141321262, + "language_loss": 0.80908686, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8869791, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17822266, + "step": 4829, + "time_per_iteration": 2.564837694168091 + }, + { + "auxiliary_loss_clip": 0.06503877, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.0126059, + "epoch": 0.2903953103862919, + "flos": 25637194304640.0, + "grad_norm": 1.4271698228137566, + "language_loss": 0.82616186, + "learning_rate": 3.329885337055249e-06, + "loss": 0.90397674, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 4830, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.0652103, + "auxiliary_loss_mlp": 0.01280335, + "balance_loss_clip": 0.06313583, + "balance_loss_mlp": 0.01262036, + "epoch": 0.29045543363895987, + "flos": 16951437521280.0, + "grad_norm": 2.247105417787089, + "language_loss": 0.79901475, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.87702841, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18310547, + "step": 4831, + "time_per_iteration": 2.5306637287139893 + }, + { + "auxiliary_loss_clip": 0.06507042, + "auxiliary_loss_mlp": 0.01277723, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01261392, + "epoch": 0.29051555689162784, + "flos": 26402630152320.0, + "grad_norm": 2.3059080747570775, + "language_loss": 0.75331926, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.83116686, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16333008, + "step": 4832, + "time_per_iteration": 2.5603439807891846 + }, + { + "auxiliary_loss_clip": 0.06503655, + "auxiliary_loss_mlp": 0.01283448, + "balance_loss_clip": 0.06302731, + "balance_loss_mlp": 0.01267271, + "epoch": 0.2905756801442958, + "flos": 21109931003520.0, + "grad_norm": 1.626645949157208, + "language_loss": 0.76312864, + "learning_rate": 3.329012449923736e-06, + "loss": 0.8409996, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16162109, + "step": 4833, + "time_per_iteration": 4.029958963394165 + }, + { + "auxiliary_loss_clip": 0.06504881, + "auxiliary_loss_mlp": 0.01280243, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01263363, + "epoch": 0.29063580339696377, + "flos": 15711573456000.0, + "grad_norm": 1.645904053352059, + "language_loss": 0.65383506, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.73168635, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.16882324, + "step": 4834, + "time_per_iteration": 2.5233187675476074 + }, + { + "auxiliary_loss_clip": 0.06499655, + "auxiliary_loss_mlp": 0.01274915, + "balance_loss_clip": 0.06299647, + "balance_loss_mlp": 0.01258893, + "epoch": 0.29069592664963173, + "flos": 24651972397440.0, + "grad_norm": 1.808411103531711, + "language_loss": 0.71914709, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.79689276, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16027832, + "step": 4835, + "time_per_iteration": 2.555670738220215 + }, + { + "auxiliary_loss_clip": 0.06500543, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06299368, + "balance_loss_mlp": 0.01259536, + "epoch": 0.2907560499022997, + "flos": 24980854872960.0, + "grad_norm": 1.750724607078226, + "language_loss": 0.80319953, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88096082, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16052246, + "step": 4836, + "time_per_iteration": 3.9953579902648926 + }, + { + "auxiliary_loss_clip": 0.0650623, + "auxiliary_loss_mlp": 0.01276306, + "balance_loss_clip": 0.06305872, + "balance_loss_mlp": 0.01260236, + "epoch": 0.29081617315496766, + "flos": 18662836838400.0, + "grad_norm": 1.8282626295265978, + "language_loss": 0.81337535, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.89120078, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16064453, + "step": 4837, + "time_per_iteration": 3.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.06508449, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.06305645, + "balance_loss_mlp": 0.01257362, + "epoch": 0.2908762964076356, + "flos": 35339087952000.0, + "grad_norm": 1.819350457328488, + "language_loss": 0.67809796, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75593495, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17895508, + "step": 4838, + "time_per_iteration": 2.6575772762298584 + }, + { + "auxiliary_loss_clip": 0.06511781, + "auxiliary_loss_mlp": 0.01274117, + "balance_loss_clip": 0.06305051, + "balance_loss_mlp": 0.01256688, + "epoch": 0.29093641966030365, + "flos": 23083058148480.0, + "grad_norm": 2.3112745331966185, + "language_loss": 0.71775508, + "learning_rate": 3.327265315259095e-06, + "loss": 0.79561406, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17419434, + "step": 4839, + "time_per_iteration": 2.6057844161987305 + }, + { + "auxiliary_loss_clip": 0.06504601, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258071, + "epoch": 0.2909965429129716, + "flos": 35964260864640.0, + "grad_norm": 1.8988017352340443, + "language_loss": 0.75792682, + "learning_rate": 3.326973949928776e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16503906, + "step": 4840, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.06503059, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06299757, + "balance_loss_mlp": 0.01255417, + "epoch": 0.2910566661656396, + "flos": 30887616268800.0, + "grad_norm": 1.8129671702232821, + "language_loss": 0.60949063, + "learning_rate": 3.326682534279471e-06, + "loss": 0.68724, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16479492, + "step": 4841, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.06506652, + "auxiliary_loss_mlp": 0.01272342, + "balance_loss_clip": 0.06303366, + "balance_loss_mlp": 0.01255021, + "epoch": 0.29111678941830754, + "flos": 30018366812160.0, + "grad_norm": 1.3487344136639734, + "language_loss": 0.71762401, + "learning_rate": 3.326391068322232e-06, + "loss": 0.79541385, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17333984, + "step": 4842, + "time_per_iteration": 4.036385774612427 + }, + { + "auxiliary_loss_clip": 0.06507391, + "auxiliary_loss_mlp": 0.01271836, + "balance_loss_clip": 0.06304808, + "balance_loss_mlp": 0.01256423, + "epoch": 0.2911769126709755, + "flos": 22864110629760.0, + "grad_norm": 1.4808705717301018, + "language_loss": 0.74052906, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.81832135, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.1541748, + "step": 4843, + "time_per_iteration": 2.565093755722046 + }, + { + "auxiliary_loss_clip": 0.06510359, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.06305443, + "balance_loss_mlp": 0.01256742, + "epoch": 0.2912370359236435, + "flos": 21656545113600.0, + "grad_norm": 3.6041214714298806, + "language_loss": 0.5879783, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.66580796, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.15856934, + "step": 4844, + "time_per_iteration": 2.636667490005493 + }, + { + "auxiliary_loss_clip": 0.06518383, + "auxiliary_loss_mlp": 0.01278792, + "balance_loss_clip": 0.06309091, + "balance_loss_mlp": 0.01261566, + "epoch": 0.29129715917631144, + "flos": 22899972977280.0, + "grad_norm": 1.9195914149996331, + "language_loss": 0.86846137, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.94643313, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.17224121, + "step": 4845, + "time_per_iteration": 2.549297571182251 + }, + { + "auxiliary_loss_clip": 0.06508736, + "auxiliary_loss_mlp": 0.01273322, + "balance_loss_clip": 0.06304652, + "balance_loss_mlp": 0.01256144, + "epoch": 0.2913572824289794, + "flos": 22681067385600.0, + "grad_norm": 1.8711717874469986, + "language_loss": 0.67698014, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75480074, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17175293, + "step": 4846, + "time_per_iteration": 2.607025146484375 + }, + { + "auxiliary_loss_clip": 0.06502484, + "auxiliary_loss_mlp": 0.01275425, + "balance_loss_clip": 0.06301165, + "balance_loss_mlp": 0.01258771, + "epoch": 0.29141740568164737, + "flos": 23113260345600.0, + "grad_norm": 4.990917175371688, + "language_loss": 0.708718, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.78649712, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16674805, + "step": 4847, + "time_per_iteration": 2.5293991565704346 + }, + { + "auxiliary_loss_clip": 0.06504785, + "auxiliary_loss_mlp": 0.01278673, + "balance_loss_clip": 0.06301495, + "balance_loss_mlp": 0.01261877, + "epoch": 0.29147752893431533, + "flos": 23593851838080.0, + "grad_norm": 1.4565796817402286, + "language_loss": 0.74258435, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82041889, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16796875, + "step": 4848, + "time_per_iteration": 2.585296630859375 + }, + { + "auxiliary_loss_clip": 0.06502895, + "auxiliary_loss_mlp": 0.01276049, + "balance_loss_clip": 0.06298006, + "balance_loss_mlp": 0.01259729, + "epoch": 0.2915376521869833, + "flos": 20597753721600.0, + "grad_norm": 2.1223800155182624, + "language_loss": 0.77561575, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.85340518, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.16333008, + "step": 4849, + "time_per_iteration": 2.4936819076538086 + }, + { + "auxiliary_loss_clip": 0.06514408, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01257723, + "epoch": 0.29159777543965126, + "flos": 20817414000000.0, + "grad_norm": 1.652469266745217, + "language_loss": 0.79415965, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87204546, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16442871, + "step": 4850, + "time_per_iteration": 2.55340313911438 + }, + { + "auxiliary_loss_clip": 0.06494947, + "auxiliary_loss_mlp": 0.0127524, + "balance_loss_clip": 0.06295137, + "balance_loss_mlp": 0.01258479, + "epoch": 0.29165789869231923, + "flos": 24251155591680.0, + "grad_norm": 1.7747423674847125, + "language_loss": 0.76365012, + "learning_rate": 3.323765612674296e-06, + "loss": 0.84135199, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16748047, + "step": 4851, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.06499958, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06300404, + "balance_loss_mlp": 0.01256929, + "epoch": 0.29171802194498725, + "flos": 28957562922240.0, + "grad_norm": 1.3481127708223366, + "language_loss": 0.7781775, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.85590267, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.15612793, + "step": 4852, + "time_per_iteration": 2.6266329288482666 + }, + { + "auxiliary_loss_clip": 0.06501517, + "auxiliary_loss_mlp": 0.0127959, + "balance_loss_clip": 0.06297216, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2917781451976552, + "flos": 22604269518720.0, + "grad_norm": 1.5006442804531215, + "language_loss": 0.78676021, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.86457133, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17883301, + "step": 4853, + "time_per_iteration": 2.5417568683624268 + }, + { + "auxiliary_loss_clip": 0.06501997, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06296347, + "balance_loss_mlp": 0.01253818, + "epoch": 0.2918382684503232, + "flos": 21579956881920.0, + "grad_norm": 4.190137743849971, + "language_loss": 0.88580358, + "learning_rate": 3.322889556841445e-06, + "loss": 0.96353114, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.16943359, + "step": 4854, + "time_per_iteration": 2.537247896194458 + }, + { + "auxiliary_loss_clip": 0.06492339, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01255517, + "epoch": 0.29189839170299114, + "flos": 24360503569920.0, + "grad_norm": 1.79615422427109, + "language_loss": 0.86863208, + "learning_rate": 3.322597437887519e-06, + "loss": 0.94629866, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18798828, + "step": 4855, + "time_per_iteration": 2.5408217906951904 + }, + { + "auxiliary_loss_clip": 0.06394155, + "auxiliary_loss_mlp": 0.01254999, + "balance_loss_clip": 0.0629582, + "balance_loss_mlp": 0.01250765, + "epoch": 0.2919585149556591, + "flos": 71338693311360.0, + "grad_norm": 0.8469602753394808, + "language_loss": 0.60232264, + "learning_rate": 3.322305268780566e-06, + "loss": 0.67881417, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.04238892, + "step": 4856, + "time_per_iteration": 3.245720863342285 + }, + { + "auxiliary_loss_clip": 0.06496054, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 0.06293447, + "balance_loss_mlp": 0.01254966, + "epoch": 0.2920186382083271, + "flos": 15638716730880.0, + "grad_norm": 1.9340338412348166, + "language_loss": 0.69134986, + "learning_rate": 3.322013049531664e-06, + "loss": 0.76902497, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.16479492, + "step": 4857, + "time_per_iteration": 2.492515802383423 + }, + { + "auxiliary_loss_clip": 0.0649875, + "auxiliary_loss_mlp": 0.01275648, + "balance_loss_clip": 0.06298544, + "balance_loss_mlp": 0.01258863, + "epoch": 0.29207876146099504, + "flos": 28373535164160.0, + "grad_norm": 2.0544380804392346, + "language_loss": 0.84425288, + "learning_rate": 3.321720780151895e-06, + "loss": 0.92199689, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16772461, + "step": 4858, + "time_per_iteration": 2.596036434173584 + }, + { + "auxiliary_loss_clip": 0.06500848, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06300872, + "balance_loss_mlp": 0.01257879, + "epoch": 0.292138884713663, + "flos": 21877295495040.0, + "grad_norm": 1.6880642207641439, + "language_loss": 0.781169, + "learning_rate": 3.321428460652342e-06, + "loss": 0.85892725, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17102051, + "step": 4859, + "time_per_iteration": 2.5885818004608154 + }, + { + "auxiliary_loss_clip": 0.06508546, + "auxiliary_loss_mlp": 0.01274065, + "balance_loss_clip": 0.06301034, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29219900796633097, + "flos": 20998277038080.0, + "grad_norm": 2.276956308498861, + "language_loss": 0.68823123, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.76605731, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17224121, + "step": 4860, + "time_per_iteration": 2.6006133556365967 + }, + { + "auxiliary_loss_clip": 0.06497137, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01256743, + "epoch": 0.29225913121899894, + "flos": 35012930734080.0, + "grad_norm": 1.9621079535677741, + "language_loss": 0.75927335, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83697826, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16625977, + "step": 4861, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06498605, + "auxiliary_loss_mlp": 0.01278705, + "balance_loss_clip": 0.06298269, + "balance_loss_mlp": 0.0126229, + "epoch": 0.2923192544716669, + "flos": 13520588895360.0, + "grad_norm": 2.4944662876521027, + "language_loss": 0.91953582, + "learning_rate": 3.320551201545832e-06, + "loss": 0.99730897, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16418457, + "step": 4862, + "time_per_iteration": 2.523393392562866 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01258325, + "epoch": 0.29237937772433487, + "flos": 19469543621760.0, + "grad_norm": 2.367835349845546, + "language_loss": 0.74302417, + "learning_rate": 3.320258681678008e-06, + "loss": 0.82076436, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16809082, + "step": 4863, + "time_per_iteration": 2.5615665912628174 + }, + { + "auxiliary_loss_clip": 0.06495367, + "auxiliary_loss_mlp": 0.01274458, + "balance_loss_clip": 0.06298485, + "balance_loss_mlp": 0.01257041, + "epoch": 0.29243950097700283, + "flos": 20856965927040.0, + "grad_norm": 1.6096808438714836, + "language_loss": 0.78180861, + "learning_rate": 3.319966111745842e-06, + "loss": 0.85950685, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.17419434, + "step": 4864, + "time_per_iteration": 2.543239116668701 + }, + { + "auxiliary_loss_clip": 0.06506015, + "auxiliary_loss_mlp": 0.01278091, + "balance_loss_clip": 0.06299396, + "balance_loss_mlp": 0.01260127, + "epoch": 0.29249962422967085, + "flos": 23590581528960.0, + "grad_norm": 1.7200803595236853, + "language_loss": 0.82166076, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8995018, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1796875, + "step": 4865, + "time_per_iteration": 2.6162562370300293 + }, + { + "auxiliary_loss_clip": 0.06504746, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06300808, + "balance_loss_mlp": 0.01258783, + "epoch": 0.2925597474823388, + "flos": 22279915163520.0, + "grad_norm": 1.8207973709117147, + "language_loss": 0.85861242, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.93643779, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18994141, + "step": 4866, + "time_per_iteration": 2.5991125106811523 + }, + { + "auxiliary_loss_clip": 0.06498669, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06298468, + "balance_loss_mlp": 0.0126005, + "epoch": 0.2926198707350068, + "flos": 34464136417920.0, + "grad_norm": 1.677629799943763, + "language_loss": 0.76065934, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17578125, + "step": 4867, + "time_per_iteration": 2.652083396911621 + }, + { + "auxiliary_loss_clip": 0.06508122, + "auxiliary_loss_mlp": 0.01277995, + "balance_loss_clip": 0.06302974, + "balance_loss_mlp": 0.01260483, + "epoch": 0.29267999398767475, + "flos": 20710413936000.0, + "grad_norm": 2.5581846543962197, + "language_loss": 0.73412025, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81198144, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.1751709, + "step": 4868, + "time_per_iteration": 2.5104074478149414 + }, + { + "auxiliary_loss_clip": 0.06504919, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 0.06304781, + "balance_loss_mlp": 0.01256558, + "epoch": 0.2927401172403427, + "flos": 18374470611840.0, + "grad_norm": 1.376823387605754, + "language_loss": 0.74768585, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.17260742, + "step": 4869, + "time_per_iteration": 2.517545461654663 + }, + { + "auxiliary_loss_clip": 0.06509744, + "auxiliary_loss_mlp": 0.01275578, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01258627, + "epoch": 0.2928002404930107, + "flos": 26111203251840.0, + "grad_norm": 1.453461002371515, + "language_loss": 0.76538026, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84323347, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.16931152, + "step": 4870, + "time_per_iteration": 2.571554183959961 + }, + { + "auxiliary_loss_clip": 0.06512202, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06304315, + "balance_loss_mlp": 0.01255967, + "epoch": 0.29286036374567864, + "flos": 21331142582400.0, + "grad_norm": 3.1299518178223726, + "language_loss": 0.67793286, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.75579637, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18188477, + "step": 4871, + "time_per_iteration": 2.5867390632629395 + }, + { + "auxiliary_loss_clip": 0.06504084, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 0.0630291, + "balance_loss_mlp": 0.01256973, + "epoch": 0.2929204869983466, + "flos": 29577117611520.0, + "grad_norm": 1.7840080197301964, + "language_loss": 0.78071094, + "learning_rate": 3.317623751303933e-06, + "loss": 0.85847723, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.15588379, + "step": 4872, + "time_per_iteration": 2.598357915878296 + }, + { + "auxiliary_loss_clip": 0.06511893, + "auxiliary_loss_mlp": 0.01279899, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260313, + "epoch": 0.2929806102510146, + "flos": 19063569790080.0, + "grad_norm": 1.7763964443019538, + "language_loss": 0.72879624, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.19580078, + "step": 4873, + "time_per_iteration": 3.9404540061950684 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01274077, + "balance_loss_clip": 0.06303495, + "balance_loss_mlp": 0.01256386, + "epoch": 0.29304073350368254, + "flos": 21950613417600.0, + "grad_norm": 1.85182595241139, + "language_loss": 0.79023468, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86808634, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17675781, + "step": 4874, + "time_per_iteration": 2.523942470550537 + }, + { + "auxiliary_loss_clip": 0.06517696, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 0.06305568, + "balance_loss_mlp": 0.01255315, + "epoch": 0.2931008567563505, + "flos": 15456302392320.0, + "grad_norm": 2.3441988108556377, + "language_loss": 0.7791701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.85707539, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17504883, + "step": 4875, + "time_per_iteration": 2.4990556240081787 + }, + { + "auxiliary_loss_clip": 0.06506883, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 0.06301031, + "balance_loss_mlp": 0.01263252, + "epoch": 0.29316098000901847, + "flos": 16988893096320.0, + "grad_norm": 1.859745338516673, + "language_loss": 0.70031023, + "learning_rate": 3.316451371581431e-06, + "loss": 0.77818477, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17321777, + "step": 4876, + "time_per_iteration": 5.4681243896484375 + }, + { + "auxiliary_loss_clip": 0.06504045, + "auxiliary_loss_mlp": 0.01275518, + "balance_loss_clip": 0.06302452, + "balance_loss_mlp": 0.01259174, + "epoch": 0.29322110326168643, + "flos": 16362462372480.0, + "grad_norm": 1.8247622937841679, + "language_loss": 0.82480925, + "learning_rate": 3.316158151823096e-06, + "loss": 0.90260488, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16345215, + "step": 4877, + "time_per_iteration": 2.5517635345458984 + }, + { + "auxiliary_loss_clip": 0.06509132, + "auxiliary_loss_mlp": 0.01278665, + "balance_loss_clip": 0.06299806, + "balance_loss_mlp": 0.0126064, + "epoch": 0.29328122651435445, + "flos": 13996023361920.0, + "grad_norm": 2.6416558700601334, + "language_loss": 0.6810987, + "learning_rate": 3.315864882155911e-06, + "loss": 0.75897658, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18017578, + "step": 4878, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.0649902, + "auxiliary_loss_mlp": 0.01275226, + "balance_loss_clip": 0.06298085, + "balance_loss_mlp": 0.01257697, + "epoch": 0.2933413497670224, + "flos": 25271569013760.0, + "grad_norm": 1.8820124674491874, + "language_loss": 0.74030542, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.81804794, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17510986, + "step": 4879, + "time_per_iteration": 2.6044318675994873 + }, + { + "auxiliary_loss_clip": 0.06501681, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.0629803, + "balance_loss_mlp": 0.01259187, + "epoch": 0.2934014730196904, + "flos": 32131840746240.0, + "grad_norm": 2.9151820016542183, + "language_loss": 0.67178017, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.7495712, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18237305, + "step": 4880, + "time_per_iteration": 2.603761672973633 + }, + { + "auxiliary_loss_clip": 0.06503071, + "auxiliary_loss_mlp": 0.01271949, + "balance_loss_clip": 0.0629775, + "balance_loss_mlp": 0.01255367, + "epoch": 0.29346159627235835, + "flos": 24359329612800.0, + "grad_norm": 2.6105900749093633, + "language_loss": 0.71260536, + "learning_rate": 3.314984773812481e-06, + "loss": 0.79035556, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.16577148, + "step": 4881, + "time_per_iteration": 2.593226432800293 + }, + { + "auxiliary_loss_clip": 0.06502824, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298223, + "balance_loss_mlp": 0.01256603, + "epoch": 0.2935217195250263, + "flos": 22753253278080.0, + "grad_norm": 1.6618295774620153, + "language_loss": 0.83893931, + "learning_rate": 3.314691304621127e-06, + "loss": 0.91672039, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18688965, + "step": 4882, + "time_per_iteration": 3.9488399028778076 + }, + { + "auxiliary_loss_clip": 0.06502259, + "auxiliary_loss_mlp": 0.01273532, + "balance_loss_clip": 0.06293593, + "balance_loss_mlp": 0.01255961, + "epoch": 0.2935818427776943, + "flos": 21731959388160.0, + "grad_norm": 4.210124979545191, + "language_loss": 0.72920972, + "learning_rate": 3.314397785576548e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17565918, + "step": 4883, + "time_per_iteration": 2.557283878326416 + }, + { + "auxiliary_loss_clip": 0.06496279, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 0.06292833, + "balance_loss_mlp": 0.01257103, + "epoch": 0.29364196603036224, + "flos": 23811667326720.0, + "grad_norm": 2.0649535872154217, + "language_loss": 0.93051624, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.00822163, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.17150879, + "step": 4884, + "time_per_iteration": 2.5359458923339844 + }, + { + "auxiliary_loss_clip": 0.06506841, + "auxiliary_loss_mlp": 0.01273123, + "balance_loss_clip": 0.06302871, + "balance_loss_mlp": 0.01255409, + "epoch": 0.2937020892830302, + "flos": 23475615327360.0, + "grad_norm": 2.6201562161688017, + "language_loss": 0.73813069, + "learning_rate": 3.313810597972234e-06, + "loss": 0.81593031, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17712402, + "step": 4885, + "time_per_iteration": 2.547731637954712 + }, + { + "auxiliary_loss_clip": 0.06506574, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01253936, + "epoch": 0.2937622125356982, + "flos": 24278422896000.0, + "grad_norm": 2.0067568315745907, + "language_loss": 0.8568837, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93466175, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.1730957, + "step": 4886, + "time_per_iteration": 2.5345749855041504 + }, + { + "auxiliary_loss_clip": 0.06516494, + "auxiliary_loss_mlp": 0.01282352, + "balance_loss_clip": 0.06309356, + "balance_loss_mlp": 0.01266223, + "epoch": 0.29382233578836614, + "flos": 20667843262080.0, + "grad_norm": 2.2972144011917863, + "language_loss": 0.7819618, + "learning_rate": 3.313223211088603e-06, + "loss": 0.85995024, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16137695, + "step": 4887, + "time_per_iteration": 2.5718464851379395 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01281343, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263962, + "epoch": 0.2938824590410341, + "flos": 16550662642560.0, + "grad_norm": 2.5346543108244366, + "language_loss": 0.80135798, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.87925565, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.1739502, + "step": 4888, + "time_per_iteration": 2.5823678970336914 + }, + { + "auxiliary_loss_clip": 0.06512221, + "auxiliary_loss_mlp": 0.01274662, + "balance_loss_clip": 0.06308408, + "balance_loss_mlp": 0.01257878, + "epoch": 0.29394258229370207, + "flos": 37934620824960.0, + "grad_norm": 1.521834171262281, + "language_loss": 0.55984998, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63771886, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16784668, + "step": 4889, + "time_per_iteration": 2.6925320625305176 + }, + { + "auxiliary_loss_clip": 0.06519246, + "auxiliary_loss_mlp": 0.01278013, + "balance_loss_clip": 0.06313413, + "balance_loss_mlp": 0.0126056, + "epoch": 0.29400270554637004, + "flos": 20050384924800.0, + "grad_norm": 1.7589662768394465, + "language_loss": 0.85257453, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93054712, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17443848, + "step": 4890, + "time_per_iteration": 2.546391010284424 + }, + { + "auxiliary_loss_clip": 0.06513973, + "auxiliary_loss_mlp": 0.01284253, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01266288, + "epoch": 0.294062828799038, + "flos": 15271498212480.0, + "grad_norm": 1.9077501912209676, + "language_loss": 0.73679662, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.81477886, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.17956543, + "step": 4891, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.06519526, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_clip": 0.06312989, + "balance_loss_mlp": 0.01267468, + "epoch": 0.294122952051706, + "flos": 22753714475520.0, + "grad_norm": 1.802215562222595, + "language_loss": 0.77636111, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.85441071, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17974854, + "step": 4892, + "time_per_iteration": 2.556626796722412 + }, + { + "auxiliary_loss_clip": 0.06508264, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06305899, + "balance_loss_mlp": 0.01257096, + "epoch": 0.294183075304374, + "flos": 24979848624000.0, + "grad_norm": 1.857019535889917, + "language_loss": 0.78546309, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.86329335, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.17675781, + "step": 4893, + "time_per_iteration": 2.5583088397979736 + }, + { + "auxiliary_loss_clip": 0.06512541, + "auxiliary_loss_mlp": 0.01279131, + "balance_loss_clip": 0.06308632, + "balance_loss_mlp": 0.01262764, + "epoch": 0.29424319855704195, + "flos": 30960347212800.0, + "grad_norm": 7.778949224672863, + "language_loss": 0.85594332, + "learning_rate": 3.311165788957864e-06, + "loss": 0.93386006, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16357422, + "step": 4894, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.06515005, + "auxiliary_loss_mlp": 0.01277674, + "balance_loss_clip": 0.06308285, + "balance_loss_mlp": 0.01260639, + "epoch": 0.2943033218097099, + "flos": 15236977530240.0, + "grad_norm": 2.7328127009682617, + "language_loss": 0.91485763, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99278444, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17028809, + "step": 4895, + "time_per_iteration": 2.499884605407715 + }, + { + "auxiliary_loss_clip": 0.06521617, + "auxiliary_loss_mlp": 0.01275591, + "balance_loss_clip": 0.06309959, + "balance_loss_mlp": 0.01257519, + "epoch": 0.2943634450623779, + "flos": 21732336731520.0, + "grad_norm": 1.9156960384195119, + "language_loss": 0.86768568, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.94565773, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18078613, + "step": 4896, + "time_per_iteration": 2.5482704639434814 + }, + { + "auxiliary_loss_clip": 0.06512056, + "auxiliary_loss_mlp": 0.01275376, + "balance_loss_clip": 0.06306215, + "balance_loss_mlp": 0.01257996, + "epoch": 0.29442356831504585, + "flos": 22608797639040.0, + "grad_norm": 2.0283086901116354, + "language_loss": 0.73915696, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.81703126, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.17382812, + "step": 4897, + "time_per_iteration": 2.5434658527374268 + }, + { + "auxiliary_loss_clip": 0.0652054, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06307404, + "balance_loss_mlp": 0.01262378, + "epoch": 0.2944836915677138, + "flos": 20017625178240.0, + "grad_norm": 1.9321922101744466, + "language_loss": 0.74697995, + "learning_rate": 3.309989025093813e-06, + "loss": 0.82499176, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18261719, + "step": 4898, + "time_per_iteration": 2.5770161151885986 + }, + { + "auxiliary_loss_clip": 0.06516017, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 0.06305353, + "balance_loss_mlp": 0.01259586, + "epoch": 0.2945438148203818, + "flos": 20051768517120.0, + "grad_norm": 2.462097706840479, + "language_loss": 0.71617198, + "learning_rate": 3.309694709912618e-06, + "loss": 0.79411781, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4899, + "time_per_iteration": 2.5297536849975586 + }, + { + "auxiliary_loss_clip": 0.06510775, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06304912, + "balance_loss_mlp": 0.01259727, + "epoch": 0.29460393807304974, + "flos": 23740487683200.0, + "grad_norm": 9.70716698994663, + "language_loss": 0.79828262, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.87617099, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18322754, + "step": 4900, + "time_per_iteration": 2.589350461959839 + }, + { + "auxiliary_loss_clip": 0.06501958, + "auxiliary_loss_mlp": 0.01277561, + "balance_loss_clip": 0.06297968, + "balance_loss_mlp": 0.01260025, + "epoch": 0.2946640613257177, + "flos": 14981412977280.0, + "grad_norm": 1.6788003410312407, + "language_loss": 0.81419849, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89199364, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.1751709, + "step": 4901, + "time_per_iteration": 2.4958457946777344 + }, + { + "auxiliary_loss_clip": 0.06498285, + "auxiliary_loss_mlp": 0.01276891, + "balance_loss_clip": 0.0630265, + "balance_loss_mlp": 0.01261095, + "epoch": 0.2947241845783857, + "flos": 24250862102400.0, + "grad_norm": 2.051988062923015, + "language_loss": 0.58211619, + "learning_rate": 3.308811466431157e-06, + "loss": 0.659868, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.15783691, + "step": 4902, + "time_per_iteration": 2.5867393016815186 + }, + { + "auxiliary_loss_clip": 0.06509895, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 0.06304582, + "balance_loss_mlp": 0.01261825, + "epoch": 0.29478430783105364, + "flos": 19944600744960.0, + "grad_norm": 1.670035021285574, + "language_loss": 0.75883406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.83671534, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.16418457, + "step": 4903, + "time_per_iteration": 2.5120930671691895 + }, + { + "auxiliary_loss_clip": 0.06499215, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01255612, + "epoch": 0.2948444310837216, + "flos": 27388774454400.0, + "grad_norm": 1.8166217426315454, + "language_loss": 0.6305517, + "learning_rate": 3.3082223892736e-06, + "loss": 0.7082777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17773438, + "step": 4904, + "time_per_iteration": 2.610600709915161 + }, + { + "auxiliary_loss_clip": 0.06509106, + "auxiliary_loss_mlp": 0.01272684, + "balance_loss_clip": 0.06301488, + "balance_loss_mlp": 0.01255983, + "epoch": 0.2949045543363896, + "flos": 23412401821440.0, + "grad_norm": 1.721115639485294, + "language_loss": 0.73724848, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.8150664, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.16711426, + "step": 4905, + "time_per_iteration": 2.5330429077148438 + }, + { + "auxiliary_loss_clip": 0.06501255, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06297939, + "balance_loss_mlp": 0.01252508, + "epoch": 0.2949646775890576, + "flos": 23958303171840.0, + "grad_norm": 1.607284793713989, + "language_loss": 0.81930244, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.89701641, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17614746, + "step": 4906, + "time_per_iteration": 2.5717568397521973 + }, + { + "auxiliary_loss_clip": 0.06499709, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06300811, + "balance_loss_mlp": 0.01254051, + "epoch": 0.29502480084172555, + "flos": 22791002342400.0, + "grad_norm": 1.8767623479937394, + "language_loss": 0.88041449, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95811397, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 4907, + "time_per_iteration": 2.532233238220215 + }, + { + "auxiliary_loss_clip": 0.06504819, + "auxiliary_loss_mlp": 0.01277393, + "balance_loss_clip": 0.06294614, + "balance_loss_mlp": 0.01257592, + "epoch": 0.2950849240943935, + "flos": 19652838428160.0, + "grad_norm": 2.2863974346720837, + "language_loss": 0.82530308, + "learning_rate": 3.307043639752782e-06, + "loss": 0.90312517, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.19812012, + "step": 4908, + "time_per_iteration": 2.6338536739349365 + }, + { + "auxiliary_loss_clip": 0.06393203, + "auxiliary_loss_mlp": 0.01256311, + "balance_loss_clip": 0.06296152, + "balance_loss_mlp": 0.01251251, + "epoch": 0.2951450473470615, + "flos": 71021062010880.0, + "grad_norm": 0.749349843123412, + "language_loss": 0.57384133, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.65033644, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.05059814, + "step": 4909, + "time_per_iteration": 3.0084846019744873 + }, + { + "auxiliary_loss_clip": 0.06500423, + "auxiliary_loss_mlp": 0.01279147, + "balance_loss_clip": 0.06298146, + "balance_loss_mlp": 0.0126278, + "epoch": 0.29520517059972945, + "flos": 22972955483520.0, + "grad_norm": 1.5167904233162786, + "language_loss": 0.87274551, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.9505412, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16381836, + "step": 4910, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.06494174, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06294993, + "balance_loss_mlp": 0.01255017, + "epoch": 0.2952652938523974, + "flos": 20491969541760.0, + "grad_norm": 1.9871602841434197, + "language_loss": 0.72998595, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.80764621, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.16821289, + "step": 4911, + "time_per_iteration": 2.5274527072906494 + }, + { + "auxiliary_loss_clip": 0.06493053, + "auxiliary_loss_mlp": 0.01276167, + "balance_loss_clip": 0.06295265, + "balance_loss_mlp": 0.01260122, + "epoch": 0.2953254171050654, + "flos": 19652754574080.0, + "grad_norm": 1.8153147203758204, + "language_loss": 0.90350848, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.98120075, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16040039, + "step": 4912, + "time_per_iteration": 4.015045881271362 + }, + { + "auxiliary_loss_clip": 0.06500725, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06298609, + "balance_loss_mlp": 0.01256474, + "epoch": 0.29538554035773334, + "flos": 22754678797440.0, + "grad_norm": 1.456675217678442, + "language_loss": 0.83491737, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.91266304, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17370605, + "step": 4913, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01271149, + "balance_loss_clip": 0.06297807, + "balance_loss_mlp": 0.01255163, + "epoch": 0.2954456636104013, + "flos": 21878343671040.0, + "grad_norm": 1.7751266266229593, + "language_loss": 0.77296054, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85066384, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.15991211, + "step": 4914, + "time_per_iteration": 2.5379679203033447 + }, + { + "auxiliary_loss_clip": 0.06494316, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01259563, + "epoch": 0.2955057868630693, + "flos": 40452056092800.0, + "grad_norm": 1.8412710776020966, + "language_loss": 0.81848276, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89618844, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16674805, + "step": 4915, + "time_per_iteration": 4.123507261276245 + }, + { + "auxiliary_loss_clip": 0.06504083, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01260707, + "epoch": 0.29556591011573724, + "flos": 22571006647680.0, + "grad_norm": 1.7265680083109098, + "language_loss": 0.85337454, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.93119645, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1739502, + "step": 4916, + "time_per_iteration": 3.964902400970459 + }, + { + "auxiliary_loss_clip": 0.06496175, + "auxiliary_loss_mlp": 0.01273483, + "balance_loss_clip": 0.06292706, + "balance_loss_mlp": 0.01257187, + "epoch": 0.2956260333684052, + "flos": 22095572181120.0, + "grad_norm": 2.6877460244099254, + "language_loss": 0.71410239, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.79179895, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16296387, + "step": 4917, + "time_per_iteration": 2.510061502456665 + }, + { + "auxiliary_loss_clip": 0.06495264, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01255771, + "epoch": 0.2956861566210732, + "flos": 16441063102080.0, + "grad_norm": 1.9904514264943383, + "language_loss": 0.9154985, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.99318182, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.1730957, + "step": 4918, + "time_per_iteration": 2.5177812576293945 + }, + { + "auxiliary_loss_clip": 0.06500694, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.0629639, + "balance_loss_mlp": 0.01252887, + "epoch": 0.2957462798737412, + "flos": 25819189372800.0, + "grad_norm": 2.9632565132584587, + "language_loss": 0.73171133, + "learning_rate": 3.303797991757425e-06, + "loss": 0.80942631, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.17919922, + "step": 4919, + "time_per_iteration": 2.548271656036377 + }, + { + "auxiliary_loss_clip": 0.06494663, + "auxiliary_loss_mlp": 0.01276246, + "balance_loss_clip": 0.062939, + "balance_loss_mlp": 0.01259104, + "epoch": 0.29580640312640916, + "flos": 16696459946880.0, + "grad_norm": 2.067015346809242, + "language_loss": 0.76653767, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84424675, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17138672, + "step": 4920, + "time_per_iteration": 2.5283315181732178 + }, + { + "auxiliary_loss_clip": 0.06505087, + "auxiliary_loss_mlp": 0.01280613, + "balance_loss_clip": 0.06298134, + "balance_loss_mlp": 0.01262886, + "epoch": 0.2958665263790771, + "flos": 23951427137280.0, + "grad_norm": 2.1683803944953786, + "language_loss": 0.69314063, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.77099764, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17724609, + "step": 4921, + "time_per_iteration": 3.9904286861419678 + }, + { + "auxiliary_loss_clip": 0.06507339, + "auxiliary_loss_mlp": 0.01279047, + "balance_loss_clip": 0.06297763, + "balance_loss_mlp": 0.01261023, + "epoch": 0.2959266496317451, + "flos": 18484279787520.0, + "grad_norm": 1.8551497184563221, + "language_loss": 0.75478184, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.83264565, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18017578, + "step": 4922, + "time_per_iteration": 2.5025644302368164 + }, + { + "auxiliary_loss_clip": 0.06508595, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06298192, + "balance_loss_mlp": 0.01258051, + "epoch": 0.29598677288441305, + "flos": 25964525479680.0, + "grad_norm": 1.7877276864194063, + "language_loss": 0.77317607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85103309, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19067383, + "step": 4923, + "time_per_iteration": 2.57328462600708 + }, + { + "auxiliary_loss_clip": 0.06498858, + "auxiliary_loss_mlp": 0.01279587, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01262016, + "epoch": 0.296046896137081, + "flos": 25163101503360.0, + "grad_norm": 2.2992847921393174, + "language_loss": 0.8687042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94648862, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17565918, + "step": 4924, + "time_per_iteration": 2.569819450378418 + }, + { + "auxiliary_loss_clip": 0.06495638, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 0.06293976, + "balance_loss_mlp": 0.01256891, + "epoch": 0.296107019389749, + "flos": 21767402465280.0, + "grad_norm": 1.4490170840920502, + "language_loss": 0.823627, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.90132689, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17468262, + "step": 4925, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.06496158, + "auxiliary_loss_mlp": 0.01278426, + "balance_loss_clip": 0.06294197, + "balance_loss_mlp": 0.01261415, + "epoch": 0.29616714264241695, + "flos": 17964555638400.0, + "grad_norm": 3.115838377994743, + "language_loss": 0.87332439, + "learning_rate": 3.301729463727452e-06, + "loss": 0.95107025, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17016602, + "step": 4926, + "time_per_iteration": 2.480851411819458 + }, + { + "auxiliary_loss_clip": 0.06502646, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06295682, + "balance_loss_mlp": 0.0125995, + "epoch": 0.2962272658950849, + "flos": 15018155792640.0, + "grad_norm": 2.5897634799766296, + "language_loss": 0.86097062, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.93876898, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17236328, + "step": 4927, + "time_per_iteration": 2.524277687072754 + }, + { + "auxiliary_loss_clip": 0.06496821, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06295302, + "balance_loss_mlp": 0.01256545, + "epoch": 0.2962873891477529, + "flos": 14726183840640.0, + "grad_norm": 1.628327768422068, + "language_loss": 0.80864251, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.88634396, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16772461, + "step": 4928, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06510531, + "auxiliary_loss_mlp": 0.01280378, + "balance_loss_clip": 0.0629655, + "balance_loss_mlp": 0.012609, + "epoch": 0.29634751240042084, + "flos": 26730967576320.0, + "grad_norm": 3.186979474193142, + "language_loss": 0.72557974, + "learning_rate": 3.300842211064773e-06, + "loss": 0.80348885, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19482422, + "step": 4929, + "time_per_iteration": 2.5845630168914795 + }, + { + "auxiliary_loss_clip": 0.06503193, + "auxiliary_loss_mlp": 0.01287506, + "balance_loss_clip": 0.06293295, + "balance_loss_mlp": 0.01268456, + "epoch": 0.2964076356530888, + "flos": 14575984197120.0, + "grad_norm": 2.811052251549286, + "language_loss": 0.73200721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.80991417, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19042969, + "step": 4930, + "time_per_iteration": 2.488785982131958 + }, + { + "auxiliary_loss_clip": 0.06387739, + "auxiliary_loss_mlp": 0.01269345, + "balance_loss_clip": 0.06290003, + "balance_loss_mlp": 0.0126519, + "epoch": 0.29646775890575683, + "flos": 63124387925760.0, + "grad_norm": 0.773484435694784, + "language_loss": 0.60626972, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.68284053, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.04156494, + "step": 4931, + "time_per_iteration": 3.1399567127227783 + }, + { + "auxiliary_loss_clip": 0.06390411, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06293079, + "balance_loss_mlp": 0.0126054, + "epoch": 0.2965278821584248, + "flos": 63087728964480.0, + "grad_norm": 0.7260178151779769, + "language_loss": 0.52335358, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.59990156, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.03839111, + "step": 4932, + "time_per_iteration": 3.0242393016815186 + }, + { + "auxiliary_loss_clip": 0.06496995, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06294326, + "balance_loss_mlp": 0.01260368, + "epoch": 0.29658800541109276, + "flos": 23775469562880.0, + "grad_norm": 1.6744964780290639, + "language_loss": 0.82042706, + "learning_rate": 3.299658516973972e-06, + "loss": 0.89817077, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17028809, + "step": 4933, + "time_per_iteration": 2.5955240726470947 + }, + { + "auxiliary_loss_clip": 0.06493178, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06293809, + "balance_loss_mlp": 0.01256377, + "epoch": 0.2966481286637607, + "flos": 23995465257600.0, + "grad_norm": 1.8381459517159284, + "language_loss": 0.75639498, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83405566, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.16503906, + "step": 4934, + "time_per_iteration": 2.5714681148529053 + }, + { + "auxiliary_loss_clip": 0.06508597, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06299804, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2967082519164287, + "flos": 17170846237440.0, + "grad_norm": 1.723450067314057, + "language_loss": 0.63127494, + "learning_rate": 3.299066374184594e-06, + "loss": 0.70916504, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.18713379, + "step": 4935, + "time_per_iteration": 2.513557195663452 + }, + { + "auxiliary_loss_clip": 0.06500618, + "auxiliary_loss_mlp": 0.01281806, + "balance_loss_clip": 0.06298316, + "balance_loss_mlp": 0.01263424, + "epoch": 0.29676837516909665, + "flos": 29395416032640.0, + "grad_norm": 1.6887254989691298, + "language_loss": 0.80239189, + "learning_rate": 3.2987702288932e-06, + "loss": 0.88021612, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.18383789, + "step": 4936, + "time_per_iteration": 2.6222426891326904 + }, + { + "auxiliary_loss_clip": 0.06510909, + "auxiliary_loss_mlp": 0.0128109, + "balance_loss_clip": 0.06301413, + "balance_loss_mlp": 0.01261444, + "epoch": 0.2968284984217646, + "flos": 34759839876480.0, + "grad_norm": 1.4826285887608224, + "language_loss": 0.74831104, + "learning_rate": 3.298474034352309e-06, + "loss": 0.826231, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19665527, + "step": 4937, + "time_per_iteration": 2.7231242656707764 + }, + { + "auxiliary_loss_clip": 0.06501779, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06297591, + "balance_loss_mlp": 0.01256768, + "epoch": 0.2968886216744326, + "flos": 21550635152640.0, + "grad_norm": 1.507706154697653, + "language_loss": 0.78372371, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.86148536, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17614746, + "step": 4938, + "time_per_iteration": 2.564958095550537 + }, + { + "auxiliary_loss_clip": 0.06506119, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.06296918, + "balance_loss_mlp": 0.01260643, + "epoch": 0.29694874492710055, + "flos": 12792357060480.0, + "grad_norm": 3.019574533594622, + "language_loss": 0.76788878, + "learning_rate": 3.297881497566964e-06, + "loss": 0.84574002, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18359375, + "step": 4939, + "time_per_iteration": 2.514143943786621 + }, + { + "auxiliary_loss_clip": 0.06509334, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06296703, + "balance_loss_mlp": 0.01254259, + "epoch": 0.2970088681797685, + "flos": 24576600049920.0, + "grad_norm": 1.687046897883716, + "language_loss": 0.78335512, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86116844, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17736816, + "step": 4940, + "time_per_iteration": 2.570279359817505 + }, + { + "auxiliary_loss_clip": 0.06508817, + "auxiliary_loss_mlp": 0.01275865, + "balance_loss_clip": 0.06300067, + "balance_loss_mlp": 0.01257113, + "epoch": 0.2970689914324365, + "flos": 23665870022400.0, + "grad_norm": 1.5281741947741105, + "language_loss": 0.75415564, + "learning_rate": 3.297288763918435e-06, + "loss": 0.8320024, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.1875, + "step": 4941, + "time_per_iteration": 2.549976348876953 + }, + { + "auxiliary_loss_clip": 0.06509985, + "auxiliary_loss_mlp": 0.01274098, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01254667, + "epoch": 0.29712911468510445, + "flos": 39678654107520.0, + "grad_norm": 2.245999939669129, + "language_loss": 0.74959898, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.82743979, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19445801, + "step": 4942, + "time_per_iteration": 2.7199416160583496 + }, + { + "auxiliary_loss_clip": 0.0651295, + "auxiliary_loss_mlp": 0.01282177, + "balance_loss_clip": 0.06299168, + "balance_loss_mlp": 0.01261744, + "epoch": 0.2971892379377724, + "flos": 26402420517120.0, + "grad_norm": 1.727137408051059, + "language_loss": 0.70931113, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.78726244, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2043457, + "step": 4943, + "time_per_iteration": 2.5410006046295166 + }, + { + "auxiliary_loss_clip": 0.06508674, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06296329, + "balance_loss_mlp": 0.01255599, + "epoch": 0.2972493611904404, + "flos": 17608992837120.0, + "grad_norm": 2.280832061666768, + "language_loss": 0.8012532, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.87908292, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.18725586, + "step": 4944, + "time_per_iteration": 2.5628697872161865 + }, + { + "auxiliary_loss_clip": 0.06495067, + "auxiliary_loss_mlp": 0.01272551, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01255194, + "epoch": 0.2973094844431084, + "flos": 20419070889600.0, + "grad_norm": 2.0196449856406704, + "language_loss": 0.83490258, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.91257876, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17346191, + "step": 4945, + "time_per_iteration": 2.5184381008148193 + }, + { + "auxiliary_loss_clip": 0.06494735, + "auxiliary_loss_mlp": 0.01274271, + "balance_loss_clip": 0.0629338, + "balance_loss_mlp": 0.01257081, + "epoch": 0.29736960769577636, + "flos": 17499225588480.0, + "grad_norm": 1.8481246337269472, + "language_loss": 0.67665654, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.75434661, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.171875, + "step": 4946, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.06500807, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06294695, + "balance_loss_mlp": 0.01255462, + "epoch": 0.2974297309484443, + "flos": 26111119397760.0, + "grad_norm": 1.9041348906467674, + "language_loss": 0.74493206, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82266927, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17443848, + "step": 4947, + "time_per_iteration": 2.55096435546875 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.01255396, + "epoch": 0.2974898542011123, + "flos": 25673559776640.0, + "grad_norm": 5.5840313105791894, + "language_loss": 0.73332673, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.81115007, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18896484, + "step": 4948, + "time_per_iteration": 2.604213237762451 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06292598, + "balance_loss_mlp": 0.01258687, + "epoch": 0.29754997745378026, + "flos": 18667323031680.0, + "grad_norm": 1.916403484704169, + "language_loss": 0.84057009, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91826856, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.1661377, + "step": 4949, + "time_per_iteration": 2.4725756645202637 + }, + { + "auxiliary_loss_clip": 0.06495193, + "auxiliary_loss_mlp": 0.01276752, + "balance_loss_clip": 0.06291104, + "balance_loss_mlp": 0.01258692, + "epoch": 0.2976101007064482, + "flos": 22281382609920.0, + "grad_norm": 2.0864257908602464, + "language_loss": 0.71227181, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.78999126, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18078613, + "step": 4950, + "time_per_iteration": 2.5644164085388184 + }, + { + "auxiliary_loss_clip": 0.06486266, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06290439, + "balance_loss_mlp": 0.01256308, + "epoch": 0.2976702239591162, + "flos": 21952290499200.0, + "grad_norm": 2.1576156011429597, + "language_loss": 0.83112931, + "learning_rate": 3.294322145875789e-06, + "loss": 0.9087199, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.16467285, + "step": 4951, + "time_per_iteration": 2.5149009227752686 + }, + { + "auxiliary_loss_clip": 0.06493516, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 0.06287138, + "balance_loss_mlp": 0.01257248, + "epoch": 0.29773034721178415, + "flos": 24642874229760.0, + "grad_norm": 2.538162384222029, + "language_loss": 0.73777694, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.81545866, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.17407227, + "step": 4952, + "time_per_iteration": 3.9977774620056152 + }, + { + "auxiliary_loss_clip": 0.06494328, + "auxiliary_loss_mlp": 0.01279914, + "balance_loss_clip": 0.06291338, + "balance_loss_mlp": 0.01261472, + "epoch": 0.2977904704644521, + "flos": 20563694236800.0, + "grad_norm": 1.830993802630573, + "language_loss": 0.8420608, + "learning_rate": 3.293728232937228e-06, + "loss": 0.91980314, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.18444824, + "step": 4953, + "time_per_iteration": 2.556278944015503 + }, + { + "auxiliary_loss_clip": 0.0649702, + "auxiliary_loss_mlp": 0.01271138, + "balance_loss_clip": 0.06289494, + "balance_loss_mlp": 0.01254246, + "epoch": 0.2978505937171201, + "flos": 18922426387200.0, + "grad_norm": 2.0824874332629113, + "language_loss": 0.74276727, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.82044888, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.16894531, + "step": 4954, + "time_per_iteration": 3.9108667373657227 + }, + { + "auxiliary_loss_clip": 0.06489201, + "auxiliary_loss_mlp": 0.01275174, + "balance_loss_clip": 0.06286507, + "balance_loss_mlp": 0.01259164, + "epoch": 0.29791071696978805, + "flos": 19323788244480.0, + "grad_norm": 1.865430683209025, + "language_loss": 0.75582623, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83346999, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.16003418, + "step": 4955, + "time_per_iteration": 4.034101724624634 + }, + { + "auxiliary_loss_clip": 0.06493168, + "auxiliary_loss_mlp": 0.01273359, + "balance_loss_clip": 0.06285557, + "balance_loss_mlp": 0.0125593, + "epoch": 0.297970840222456, + "flos": 18812742992640.0, + "grad_norm": 1.8893942834003292, + "language_loss": 0.72569048, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80335575, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17419434, + "step": 4956, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01272155, + "balance_loss_clip": 0.06287451, + "balance_loss_mlp": 0.01253141, + "epoch": 0.298030963475124, + "flos": 22858702041600.0, + "grad_norm": 1.7093127439145954, + "language_loss": 0.79588521, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.87359571, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19006348, + "step": 4957, + "time_per_iteration": 2.5350780487060547 + }, + { + "auxiliary_loss_clip": 0.0648672, + "auxiliary_loss_mlp": 0.01278155, + "balance_loss_clip": 0.06281397, + "balance_loss_mlp": 0.01261084, + "epoch": 0.298091086727792, + "flos": 21874402529280.0, + "grad_norm": 1.5033412482034976, + "language_loss": 0.70601791, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.78366661, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.17077637, + "step": 4958, + "time_per_iteration": 2.52998948097229 + }, + { + "auxiliary_loss_clip": 0.06484255, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06283475, + "balance_loss_mlp": 0.01256954, + "epoch": 0.29815120998045996, + "flos": 21180775230720.0, + "grad_norm": 1.4471916983062794, + "language_loss": 0.78955591, + "learning_rate": 3.291945317082743e-06, + "loss": 0.86715317, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18505859, + "step": 4959, + "time_per_iteration": 2.5247116088867188 + }, + { + "auxiliary_loss_clip": 0.06484501, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_clip": 0.06281502, + "balance_loss_mlp": 0.01258183, + "epoch": 0.29821133323312793, + "flos": 19901526946560.0, + "grad_norm": 1.8097637226237389, + "language_loss": 0.79637736, + "learning_rate": 3.291647992907147e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17578125, + "step": 4960, + "time_per_iteration": 2.544517755508423 + }, + { + "auxiliary_loss_clip": 0.06493803, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06284714, + "balance_loss_mlp": 0.01254483, + "epoch": 0.2982714564857959, + "flos": 12755781953280.0, + "grad_norm": 2.226713674353186, + "language_loss": 0.74493575, + "learning_rate": 3.291350619752129e-06, + "loss": 0.82260078, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.18225098, + "step": 4961, + "time_per_iteration": 3.9662065505981445 + }, + { + "auxiliary_loss_clip": 0.06486452, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.062804, + "balance_loss_mlp": 0.01256756, + "epoch": 0.29833157973846386, + "flos": 22278238081920.0, + "grad_norm": 2.8000667311611167, + "language_loss": 0.62968349, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70729387, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.17810059, + "step": 4962, + "time_per_iteration": 2.533984661102295 + }, + { + "auxiliary_loss_clip": 0.06485053, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06281514, + "balance_loss_mlp": 0.01259596, + "epoch": 0.2983917029911318, + "flos": 15377659735680.0, + "grad_norm": 1.6706058401186525, + "language_loss": 0.83686638, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.91448379, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17102051, + "step": 4963, + "time_per_iteration": 2.524486780166626 + }, + { + "auxiliary_loss_clip": 0.0648464, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 0.06283776, + "balance_loss_mlp": 0.01259572, + "epoch": 0.2984518262437998, + "flos": 15383068323840.0, + "grad_norm": 2.213795741630968, + "language_loss": 0.66932309, + "learning_rate": 3.290458206523322e-06, + "loss": 0.74693739, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17224121, + "step": 4964, + "time_per_iteration": 2.5100491046905518 + }, + { + "auxiliary_loss_clip": 0.06485043, + "auxiliary_loss_mlp": 0.01273472, + "balance_loss_clip": 0.06283367, + "balance_loss_mlp": 0.01257701, + "epoch": 0.29851194949646775, + "flos": 18113413616640.0, + "grad_norm": 1.8232440195867097, + "language_loss": 0.72163451, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79921961, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15771484, + "step": 4965, + "time_per_iteration": 2.5180373191833496 + }, + { + "auxiliary_loss_clip": 0.06490128, + "auxiliary_loss_mlp": 0.01278877, + "balance_loss_clip": 0.06284484, + "balance_loss_mlp": 0.01261139, + "epoch": 0.2985720727491357, + "flos": 22024811808000.0, + "grad_norm": 1.7919900337102326, + "language_loss": 0.66928089, + "learning_rate": 3.289863019680461e-06, + "loss": 0.74697095, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17724609, + "step": 4966, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06492805, + "auxiliary_loss_mlp": 0.01279859, + "balance_loss_clip": 0.06288783, + "balance_loss_mlp": 0.01262026, + "epoch": 0.2986321960018037, + "flos": 13046202604800.0, + "grad_norm": 2.9983208236286862, + "language_loss": 0.74761832, + "learning_rate": 3.289565352885785e-06, + "loss": 0.82534492, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17822266, + "step": 4967, + "time_per_iteration": 2.5119001865386963 + }, + { + "auxiliary_loss_clip": 0.06492577, + "auxiliary_loss_mlp": 0.01276602, + "balance_loss_clip": 0.06288804, + "balance_loss_mlp": 0.01260294, + "epoch": 0.29869231925447165, + "flos": 14470241944320.0, + "grad_norm": 1.9901449284839132, + "language_loss": 0.72232509, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80001682, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16308594, + "step": 4968, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.06497695, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 0.06290321, + "balance_loss_mlp": 0.01261007, + "epoch": 0.2987524425071396, + "flos": 31658376850560.0, + "grad_norm": 1.780098836704026, + "language_loss": 0.76775402, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84551913, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.17810059, + "step": 4969, + "time_per_iteration": 2.677133321762085 + }, + { + "auxiliary_loss_clip": 0.0649517, + "auxiliary_loss_mlp": 0.01279823, + "balance_loss_clip": 0.06290856, + "balance_loss_mlp": 0.0126355, + "epoch": 0.2988125657598076, + "flos": 21439735873920.0, + "grad_norm": 1.6530964666677603, + "language_loss": 0.702811, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.78056097, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.16271973, + "step": 4970, + "time_per_iteration": 2.542041301727295 + }, + { + "auxiliary_loss_clip": 0.06501894, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06289935, + "balance_loss_mlp": 0.01260336, + "epoch": 0.2988726890124756, + "flos": 18082750222080.0, + "grad_norm": 2.836679638175962, + "language_loss": 0.84790057, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.92571044, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.18737793, + "step": 4971, + "time_per_iteration": 2.5460052490234375 + }, + { + "auxiliary_loss_clip": 0.06490934, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06292243, + "balance_loss_mlp": 0.01257691, + "epoch": 0.29893281226514357, + "flos": 21760987628160.0, + "grad_norm": 1.7104631490326472, + "language_loss": 0.79530191, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16314697, + "step": 4972, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.0650093, + "auxiliary_loss_mlp": 0.01282709, + "balance_loss_clip": 0.06297094, + "balance_loss_mlp": 0.01266234, + "epoch": 0.29899293551781153, + "flos": 16842341105280.0, + "grad_norm": 1.7682293865220609, + "language_loss": 0.85643351, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93426991, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16467285, + "step": 4973, + "time_per_iteration": 2.546552896499634 + }, + { + "auxiliary_loss_clip": 0.06486042, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291717, + "balance_loss_mlp": 0.01263539, + "epoch": 0.2990530587704795, + "flos": 11734068792960.0, + "grad_norm": 1.5403026658154284, + "language_loss": 0.78163445, + "learning_rate": 3.287480316742863e-06, + "loss": 0.85930026, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17004395, + "step": 4974, + "time_per_iteration": 2.519416093826294 + }, + { + "auxiliary_loss_clip": 0.06492939, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06288281, + "balance_loss_mlp": 0.01257001, + "epoch": 0.29911318202314746, + "flos": 28047713362560.0, + "grad_norm": 1.767842246111843, + "language_loss": 0.73036933, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80804002, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17126465, + "step": 4975, + "time_per_iteration": 2.6099252700805664 + }, + { + "auxiliary_loss_clip": 0.0649198, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 0.06288506, + "balance_loss_mlp": 0.0126163, + "epoch": 0.2991733052758154, + "flos": 18739425070080.0, + "grad_norm": 3.7568061887968374, + "language_loss": 0.76564699, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84335011, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16711426, + "step": 4976, + "time_per_iteration": 2.4865057468414307 + }, + { + "auxiliary_loss_clip": 0.0649081, + "auxiliary_loss_mlp": 0.01274025, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257574, + "epoch": 0.2992334285284834, + "flos": 15564476413440.0, + "grad_norm": 2.0027584051633256, + "language_loss": 0.86547983, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.94312823, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 4977, + "time_per_iteration": 2.5564377307891846 + }, + { + "auxiliary_loss_clip": 0.06492308, + "auxiliary_loss_mlp": 0.01273791, + "balance_loss_clip": 0.06289831, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29929355178115136, + "flos": 21803809864320.0, + "grad_norm": 1.498415139231663, + "language_loss": 0.69035208, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.76801312, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.16943359, + "step": 4978, + "time_per_iteration": 2.519927978515625 + }, + { + "auxiliary_loss_clip": 0.06498158, + "auxiliary_loss_mlp": 0.01273756, + "balance_loss_clip": 0.06295491, + "balance_loss_mlp": 0.0125634, + "epoch": 0.2993536750338193, + "flos": 21184884080640.0, + "grad_norm": 2.2981139003330924, + "language_loss": 0.76821494, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.84593409, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17407227, + "step": 4979, + "time_per_iteration": 2.5783658027648926 + }, + { + "auxiliary_loss_clip": 0.06495501, + "auxiliary_loss_mlp": 0.01275001, + "balance_loss_clip": 0.06288472, + "balance_loss_mlp": 0.0125762, + "epoch": 0.2994137982864873, + "flos": 32129954029440.0, + "grad_norm": 1.9038495469030372, + "language_loss": 0.69286489, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77056986, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17382812, + "step": 4980, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.06490306, + "auxiliary_loss_mlp": 0.01274236, + "balance_loss_clip": 0.06288646, + "balance_loss_mlp": 0.01257177, + "epoch": 0.29947392153915525, + "flos": 21111733866240.0, + "grad_norm": 1.7308746684442236, + "language_loss": 0.74001658, + "learning_rate": 3.285392888352555e-06, + "loss": 0.817662, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17053223, + "step": 4981, + "time_per_iteration": 2.580580711364746 + }, + { + "auxiliary_loss_clip": 0.06490904, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.0125635, + "epoch": 0.2995340447918232, + "flos": 21548916144000.0, + "grad_norm": 1.9422940804684126, + "language_loss": 0.86877131, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.94642013, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17626953, + "step": 4982, + "time_per_iteration": 2.4962990283966064 + }, + { + "auxiliary_loss_clip": 0.06497963, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06287588, + "balance_loss_mlp": 0.01257241, + "epoch": 0.2995941680444912, + "flos": 16730393650560.0, + "grad_norm": 2.5640920256819886, + "language_loss": 0.87797368, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95569938, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17382812, + "step": 4983, + "time_per_iteration": 2.5295448303222656 + }, + { + "auxiliary_loss_clip": 0.0649021, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 0.06287163, + "balance_loss_mlp": 0.012569, + "epoch": 0.2996542912971592, + "flos": 20929864579200.0, + "grad_norm": 2.1931631477553943, + "language_loss": 0.78985476, + "learning_rate": 3.284497544825668e-06, + "loss": 0.86749053, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16467285, + "step": 4984, + "time_per_iteration": 2.510861873626709 + }, + { + "auxiliary_loss_clip": 0.06490169, + "auxiliary_loss_mlp": 0.01276988, + "balance_loss_clip": 0.06284384, + "balance_loss_mlp": 0.01259702, + "epoch": 0.29971441454982717, + "flos": 25086429417600.0, + "grad_norm": 1.6549542244227224, + "language_loss": 0.78558743, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86325896, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17285156, + "step": 4985, + "time_per_iteration": 2.6011219024658203 + }, + { + "auxiliary_loss_clip": 0.06501257, + "auxiliary_loss_mlp": 0.01278562, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.0125968, + "epoch": 0.29977453780249513, + "flos": 52567445617920.0, + "grad_norm": 2.1128232330624757, + "language_loss": 0.71929544, + "learning_rate": 3.283900405580837e-06, + "loss": 0.79709363, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1887207, + "step": 4986, + "time_per_iteration": 2.8261890411376953 + }, + { + "auxiliary_loss_clip": 0.06496918, + "auxiliary_loss_mlp": 0.01277715, + "balance_loss_clip": 0.06288348, + "balance_loss_mlp": 0.0125981, + "epoch": 0.2998346610551631, + "flos": 22243759326720.0, + "grad_norm": 2.0495005677193703, + "language_loss": 0.73353851, + "learning_rate": 3.283601762924312e-06, + "loss": 0.81128478, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17907715, + "step": 4987, + "time_per_iteration": 2.5969009399414062 + }, + { + "auxiliary_loss_clip": 0.06487568, + "auxiliary_loss_mlp": 0.01277048, + "balance_loss_clip": 0.06283796, + "balance_loss_mlp": 0.01260561, + "epoch": 0.29989478430783106, + "flos": 16878832358400.0, + "grad_norm": 1.677350703029162, + "language_loss": 0.80982405, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88747025, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16479492, + "step": 4988, + "time_per_iteration": 2.4802756309509277 + }, + { + "auxiliary_loss_clip": 0.06489251, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06285515, + "balance_loss_mlp": 0.0125759, + "epoch": 0.29995490756049903, + "flos": 23775637271040.0, + "grad_norm": 1.830625198484136, + "language_loss": 0.7097913, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7874254, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16577148, + "step": 4989, + "time_per_iteration": 2.5968902111053467 + }, + { + "auxiliary_loss_clip": 0.06498987, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01264948, + "epoch": 0.300015030813167, + "flos": 14470577360640.0, + "grad_norm": 2.8004651200920576, + "language_loss": 0.85787904, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93570256, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18408203, + "step": 4990, + "time_per_iteration": 2.4837355613708496 + }, + { + "auxiliary_loss_clip": 0.06499861, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06287368, + "balance_loss_mlp": 0.01260204, + "epoch": 0.30007515406583496, + "flos": 25199005777920.0, + "grad_norm": 1.6608247288012334, + "language_loss": 0.67339301, + "learning_rate": 3.28240670566841e-06, + "loss": 0.75117278, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17919922, + "step": 4991, + "time_per_iteration": 4.060553312301636 + }, + { + "auxiliary_loss_clip": 0.0649571, + "auxiliary_loss_mlp": 0.01277369, + "balance_loss_clip": 0.06284688, + "balance_loss_mlp": 0.01259022, + "epoch": 0.3001352773185029, + "flos": 19397315802240.0, + "grad_norm": 1.7545259775845383, + "language_loss": 0.79479051, + "learning_rate": 3.28210781975363e-06, + "loss": 0.87252128, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18347168, + "step": 4992, + "time_per_iteration": 2.5394246578216553 + }, + { + "auxiliary_loss_clip": 0.06496455, + "auxiliary_loss_mlp": 0.01272727, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01255061, + "epoch": 0.3001954005711709, + "flos": 21550341663360.0, + "grad_norm": 1.8174225064451806, + "language_loss": 0.83191693, + "learning_rate": 3.281808885221193e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17675781, + "step": 4993, + "time_per_iteration": 2.536900520324707 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.0127659, + "balance_loss_clip": 0.06290129, + "balance_loss_mlp": 0.01257051, + "epoch": 0.30025552382383885, + "flos": 17390087245440.0, + "grad_norm": 2.3964724385856955, + "language_loss": 0.8713994, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.94919133, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.1953125, + "step": 4994, + "time_per_iteration": 5.451568603515625 + }, + { + "auxiliary_loss_clip": 0.06500117, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06293428, + "balance_loss_mlp": 0.01255696, + "epoch": 0.3003156470765068, + "flos": 29541003701760.0, + "grad_norm": 1.492375768993242, + "language_loss": 0.81277597, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.89050424, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17016602, + "step": 4995, + "time_per_iteration": 2.6498701572418213 + }, + { + "auxiliary_loss_clip": 0.06495272, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01257818, + "epoch": 0.3003757703291748, + "flos": 43655278302720.0, + "grad_norm": 1.561088997277918, + "language_loss": 0.67591625, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.75363255, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.18530273, + "step": 4996, + "time_per_iteration": 2.6940386295318604 + }, + { + "auxiliary_loss_clip": 0.06490915, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 0.06287466, + "balance_loss_mlp": 0.0125985, + "epoch": 0.30043589358184275, + "flos": 22534934664960.0, + "grad_norm": 1.8202769971321224, + "language_loss": 0.76585484, + "learning_rate": 3.280612661141615e-06, + "loss": 0.84354323, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18054199, + "step": 4997, + "time_per_iteration": 2.551025629043579 + }, + { + "auxiliary_loss_clip": 0.06488951, + "auxiliary_loss_mlp": 0.01282226, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01264785, + "epoch": 0.30049601683451077, + "flos": 21002176252800.0, + "grad_norm": 1.7136041248753544, + "language_loss": 0.78929758, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.86700928, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17443848, + "step": 4998, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.06495959, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06296599, + "balance_loss_mlp": 0.0126104, + "epoch": 0.30055614008717874, + "flos": 23922985875840.0, + "grad_norm": 1.6408959445510187, + "language_loss": 0.73985869, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81759465, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.16589355, + "step": 4999, + "time_per_iteration": 2.565272331237793 + }, + { + "auxiliary_loss_clip": 0.06497648, + "auxiliary_loss_mlp": 0.01276599, + "balance_loss_clip": 0.06290608, + "balance_loss_mlp": 0.01258837, + "epoch": 0.3006162633398467, + "flos": 19175475317760.0, + "grad_norm": 1.6585129963537202, + "language_loss": 0.76246512, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84020758, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.1776123, + "step": 5000, + "time_per_iteration": 3.978001117706299 + }, + { + "auxiliary_loss_clip": 0.06488875, + "auxiliary_loss_mlp": 0.01280464, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.0126244, + "epoch": 0.30067638659251467, + "flos": 14683697020800.0, + "grad_norm": 1.838860389970219, + "language_loss": 0.81972182, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.89741528, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.18041992, + "step": 5001, + "time_per_iteration": 2.4995031356811523 + }, + { + "auxiliary_loss_clip": 0.06495227, + "auxiliary_loss_mlp": 0.01279132, + "balance_loss_clip": 0.06291329, + "balance_loss_mlp": 0.01261322, + "epoch": 0.30073650984518263, + "flos": 23374778538240.0, + "grad_norm": 1.6002838962292127, + "language_loss": 0.81160742, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.88935101, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17797852, + "step": 5002, + "time_per_iteration": 2.549882650375366 + }, + { + "auxiliary_loss_clip": 0.06502556, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01255728, + "epoch": 0.3007966330978506, + "flos": 22973332826880.0, + "grad_norm": 1.7018817575326768, + "language_loss": 0.71524274, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.79300046, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17504883, + "step": 5003, + "time_per_iteration": 2.537760019302368 + }, + { + "auxiliary_loss_clip": 0.06502316, + "auxiliary_loss_mlp": 0.01275597, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01257441, + "epoch": 0.30085675635051856, + "flos": 27825830951040.0, + "grad_norm": 1.9954765529899763, + "language_loss": 0.706792, + "learning_rate": 3.27851739984233e-06, + "loss": 0.78457117, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18151855, + "step": 5004, + "time_per_iteration": 2.6357674598693848 + }, + { + "auxiliary_loss_clip": 0.06504735, + "auxiliary_loss_mlp": 0.01282861, + "balance_loss_clip": 0.06296123, + "balance_loss_mlp": 0.01263513, + "epoch": 0.3009168796031865, + "flos": 10886216855040.0, + "grad_norm": 2.7451882694975662, + "language_loss": 0.81914413, + "learning_rate": 3.278217882782715e-06, + "loss": 0.89702016, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19335938, + "step": 5005, + "time_per_iteration": 2.4386463165283203 + }, + { + "auxiliary_loss_clip": 0.06497307, + "auxiliary_loss_mlp": 0.01278667, + "balance_loss_clip": 0.06293161, + "balance_loss_mlp": 0.01261179, + "epoch": 0.3009770028558545, + "flos": 23812170451200.0, + "grad_norm": 3.689468326241579, + "language_loss": 0.74513727, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.82289702, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17492676, + "step": 5006, + "time_per_iteration": 2.6309902667999268 + }, + { + "auxiliary_loss_clip": 0.06490835, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06288077, + "balance_loss_mlp": 0.01255247, + "epoch": 0.30103712610852246, + "flos": 26475319169280.0, + "grad_norm": 1.9837745378518294, + "language_loss": 0.71514297, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.79279143, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.18762207, + "step": 5007, + "time_per_iteration": 2.5425140857696533 + }, + { + "auxiliary_loss_clip": 0.06499007, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01258961, + "epoch": 0.3010972493611904, + "flos": 22863020526720.0, + "grad_norm": 2.135948160193648, + "language_loss": 0.76715112, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.84491682, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18579102, + "step": 5008, + "time_per_iteration": 2.560136556625366 + }, + { + "auxiliary_loss_clip": 0.06498778, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.06291865, + "balance_loss_mlp": 0.01258959, + "epoch": 0.3011573726138584, + "flos": 24059307669120.0, + "grad_norm": 1.8647165617813573, + "language_loss": 0.85181898, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.92957842, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18200684, + "step": 5009, + "time_per_iteration": 2.5235841274261475 + }, + { + "auxiliary_loss_clip": 0.06506295, + "auxiliary_loss_mlp": 0.01281474, + "balance_loss_clip": 0.06291408, + "balance_loss_mlp": 0.0126041, + "epoch": 0.30121749586652635, + "flos": 20264762396160.0, + "grad_norm": 1.8315766872525614, + "language_loss": 0.84202898, + "learning_rate": 3.276719570659604e-06, + "loss": 0.91990662, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21069336, + "step": 5010, + "time_per_iteration": 2.5768747329711914 + }, + { + "auxiliary_loss_clip": 0.06499103, + "auxiliary_loss_mlp": 0.01276454, + "balance_loss_clip": 0.06292678, + "balance_loss_mlp": 0.01258728, + "epoch": 0.3012776191191944, + "flos": 26950334365440.0, + "grad_norm": 2.3479091749479593, + "language_loss": 0.85299456, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.93075019, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17724609, + "step": 5011, + "time_per_iteration": 2.5496773719787598 + }, + { + "auxiliary_loss_clip": 0.06498772, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06287067, + "balance_loss_mlp": 0.01258472, + "epoch": 0.30133774237186234, + "flos": 20418525838080.0, + "grad_norm": 2.2969937551574615, + "language_loss": 0.73043567, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18017578, + "step": 5012, + "time_per_iteration": 2.5352632999420166 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.01275987, + "balance_loss_clip": 0.06294451, + "balance_loss_mlp": 0.01257581, + "epoch": 0.3013978656245303, + "flos": 19798635732480.0, + "grad_norm": 2.0714365992737247, + "language_loss": 0.88282806, + "learning_rate": 3.275820002334819e-06, + "loss": 0.96061397, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.1842041, + "step": 5013, + "time_per_iteration": 2.5217273235321045 + }, + { + "auxiliary_loss_clip": 0.06510235, + "auxiliary_loss_mlp": 0.01281959, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01261956, + "epoch": 0.30145798887719827, + "flos": 16254623767680.0, + "grad_norm": 2.0397198762739253, + "language_loss": 0.8413021, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.91922402, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 5014, + "time_per_iteration": 2.543929100036621 + }, + { + "auxiliary_loss_clip": 0.06496109, + "auxiliary_loss_mlp": 0.01278136, + "balance_loss_clip": 0.06295025, + "balance_loss_mlp": 0.01260934, + "epoch": 0.30151811212986623, + "flos": 24578654474880.0, + "grad_norm": 1.6793816963153507, + "language_loss": 0.68929201, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76703441, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17199707, + "step": 5015, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.06498226, + "auxiliary_loss_mlp": 0.01282599, + "balance_loss_clip": 0.06293575, + "balance_loss_mlp": 0.01262989, + "epoch": 0.3015782353825342, + "flos": 21878595233280.0, + "grad_norm": 2.19954780338382, + "language_loss": 0.75070626, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.82851446, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.19604492, + "step": 5016, + "time_per_iteration": 2.6430094242095947 + }, + { + "auxiliary_loss_clip": 0.06498955, + "auxiliary_loss_mlp": 0.01278069, + "balance_loss_clip": 0.06290609, + "balance_loss_mlp": 0.01260009, + "epoch": 0.30163835863520216, + "flos": 28777244935680.0, + "grad_norm": 1.487936670829871, + "language_loss": 0.657938, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.73570824, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18041992, + "step": 5017, + "time_per_iteration": 2.62882661819458 + }, + { + "auxiliary_loss_clip": 0.06504996, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06297189, + "balance_loss_mlp": 0.01258019, + "epoch": 0.30169848188787013, + "flos": 22972829702400.0, + "grad_norm": 1.7163502989136974, + "language_loss": 0.68538272, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76318979, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17675781, + "step": 5018, + "time_per_iteration": 2.5743629932403564 + }, + { + "auxiliary_loss_clip": 0.06490742, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01263102, + "epoch": 0.3017586051405381, + "flos": 21841726636800.0, + "grad_norm": 1.8632302123292983, + "language_loss": 0.79424834, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.87196445, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 5019, + "time_per_iteration": 2.490190029144287 + }, + { + "auxiliary_loss_clip": 0.06497257, + "auxiliary_loss_mlp": 0.01272585, + "balance_loss_clip": 0.06291286, + "balance_loss_mlp": 0.01255932, + "epoch": 0.30181872839320606, + "flos": 22166374481280.0, + "grad_norm": 1.9171916392208899, + "language_loss": 0.70839167, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78609014, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.16650391, + "step": 5020, + "time_per_iteration": 2.5635480880737305 + }, + { + "auxiliary_loss_clip": 0.06504546, + "auxiliary_loss_mlp": 0.01281398, + "balance_loss_clip": 0.06293903, + "balance_loss_mlp": 0.01263063, + "epoch": 0.301878851645874, + "flos": 18120080016000.0, + "grad_norm": 1.792157390717078, + "language_loss": 0.78276378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86062324, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18347168, + "step": 5021, + "time_per_iteration": 2.4956390857696533 + }, + { + "auxiliary_loss_clip": 0.06497782, + "auxiliary_loss_mlp": 0.01276425, + "balance_loss_clip": 0.06289995, + "balance_loss_mlp": 0.01258758, + "epoch": 0.301938974898542, + "flos": 17607860807040.0, + "grad_norm": 2.1405998927344774, + "language_loss": 0.77019519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.84793723, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17663574, + "step": 5022, + "time_per_iteration": 2.5157957077026367 + }, + { + "auxiliary_loss_clip": 0.06495966, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.0628897, + "balance_loss_mlp": 0.01258766, + "epoch": 0.30199909815120995, + "flos": 11185861455360.0, + "grad_norm": 1.768248661027107, + "language_loss": 0.70051187, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.77823544, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17626953, + "step": 5023, + "time_per_iteration": 2.466554641723633 + }, + { + "auxiliary_loss_clip": 0.06500031, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.0628899, + "balance_loss_mlp": 0.0125586, + "epoch": 0.302059221403878, + "flos": 21914247945600.0, + "grad_norm": 1.9915350532209553, + "language_loss": 0.72159773, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7993241, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.16748047, + "step": 5024, + "time_per_iteration": 2.550529956817627 + }, + { + "auxiliary_loss_clip": 0.06490807, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 0.06288145, + "balance_loss_mlp": 0.01259068, + "epoch": 0.30211934465654594, + "flos": 26403678328320.0, + "grad_norm": 1.894121412902458, + "language_loss": 0.74805325, + "learning_rate": 3.272217377978061e-06, + "loss": 0.8257302, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17822266, + "step": 5025, + "time_per_iteration": 2.566805124282837 + }, + { + "auxiliary_loss_clip": 0.06489006, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06288895, + "balance_loss_mlp": 0.01260649, + "epoch": 0.3021794679092139, + "flos": 23406573962880.0, + "grad_norm": 1.5421556017832176, + "language_loss": 0.67708206, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.75474703, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16845703, + "step": 5026, + "time_per_iteration": 2.5388495922088623 + }, + { + "auxiliary_loss_clip": 0.06496219, + "auxiliary_loss_mlp": 0.01276315, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.0125829, + "epoch": 0.30223959116188187, + "flos": 20266271769600.0, + "grad_norm": 1.7822947119811494, + "language_loss": 0.851165, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.92889023, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.18017578, + "step": 5027, + "time_per_iteration": 2.4944281578063965 + }, + { + "auxiliary_loss_clip": 0.06486274, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06286463, + "balance_loss_mlp": 0.012555, + "epoch": 0.30229971441454984, + "flos": 26695105228800.0, + "grad_norm": 1.4959542036115716, + "language_loss": 0.79103637, + "learning_rate": 3.271315635661351e-06, + "loss": 0.86862409, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17004395, + "step": 5028, + "time_per_iteration": 2.559110403060913 + }, + { + "auxiliary_loss_clip": 0.06488896, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06286621, + "balance_loss_mlp": 0.01255114, + "epoch": 0.3023598376672178, + "flos": 34353111358080.0, + "grad_norm": 2.034560710438702, + "language_loss": 0.777421, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.8550368, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17553711, + "step": 5029, + "time_per_iteration": 2.616746187210083 + }, + { + "auxiliary_loss_clip": 0.06491397, + "auxiliary_loss_mlp": 0.012793, + "balance_loss_clip": 0.06285096, + "balance_loss_mlp": 0.0126112, + "epoch": 0.30241996091988577, + "flos": 23118794714880.0, + "grad_norm": 1.8709670039612754, + "language_loss": 0.83096594, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.90867293, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.1817627, + "step": 5030, + "time_per_iteration": 2.56754994392395 + }, + { + "auxiliary_loss_clip": 0.06496526, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06289787, + "balance_loss_mlp": 0.01252817, + "epoch": 0.30248008417255373, + "flos": 19395932209920.0, + "grad_norm": 1.6009792224367259, + "language_loss": 0.70107001, + "learning_rate": 3.270413459468905e-06, + "loss": 0.77873379, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17028809, + "step": 5031, + "time_per_iteration": 3.9598355293273926 + }, + { + "auxiliary_loss_clip": 0.06489968, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06286315, + "balance_loss_mlp": 0.01254843, + "epoch": 0.3025402074252217, + "flos": 23776601592960.0, + "grad_norm": 1.6577801639127376, + "language_loss": 0.83241403, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.91004276, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.18066406, + "step": 5032, + "time_per_iteration": 2.5589263439178467 + }, + { + "auxiliary_loss_clip": 0.064991, + "auxiliary_loss_mlp": 0.01275787, + "balance_loss_clip": 0.06290475, + "balance_loss_mlp": 0.01257846, + "epoch": 0.30260033067788966, + "flos": 26001184440960.0, + "grad_norm": 2.284722647008976, + "language_loss": 0.73521686, + "learning_rate": 3.269811767783906e-06, + "loss": 0.81296575, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17956543, + "step": 5033, + "time_per_iteration": 4.029735088348389 + }, + { + "auxiliary_loss_clip": 0.06487451, + "auxiliary_loss_mlp": 0.01273985, + "balance_loss_clip": 0.06287168, + "balance_loss_mlp": 0.01257201, + "epoch": 0.3026604539305576, + "flos": 25381629751680.0, + "grad_norm": 1.972268943863271, + "language_loss": 0.74434245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16784668, + "step": 5034, + "time_per_iteration": 4.0717785358428955 + }, + { + "auxiliary_loss_clip": 0.06489293, + "auxiliary_loss_mlp": 0.01272883, + "balance_loss_clip": 0.06285236, + "balance_loss_mlp": 0.01253785, + "epoch": 0.3027205771832256, + "flos": 25819944059520.0, + "grad_norm": 2.1341895685230434, + "language_loss": 0.72872615, + "learning_rate": 3.269209883493352e-06, + "loss": 0.80634785, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.19104004, + "step": 5035, + "time_per_iteration": 2.552910804748535 + }, + { + "auxiliary_loss_clip": 0.06487517, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06287874, + "balance_loss_mlp": 0.01255545, + "epoch": 0.30278070043589356, + "flos": 27351905857920.0, + "grad_norm": 2.3429469920607384, + "language_loss": 0.87837774, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95597875, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17041016, + "step": 5036, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.06487815, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06288295, + "balance_loss_mlp": 0.0125574, + "epoch": 0.3028408236885616, + "flos": 24792444967680.0, + "grad_norm": 1.4626052772561229, + "language_loss": 0.77969307, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85730845, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.17980957, + "step": 5037, + "time_per_iteration": 2.556859016418457 + }, + { + "auxiliary_loss_clip": 0.06492691, + "auxiliary_loss_mlp": 0.01276846, + "balance_loss_clip": 0.06287664, + "balance_loss_mlp": 0.01258381, + "epoch": 0.30290094694122954, + "flos": 12937399678080.0, + "grad_norm": 2.1717737457337236, + "language_loss": 0.78095227, + "learning_rate": 3.268306696121816e-06, + "loss": 0.85864764, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18469238, + "step": 5038, + "time_per_iteration": 2.534095525741577 + }, + { + "auxiliary_loss_clip": 0.06487858, + "auxiliary_loss_mlp": 0.01274285, + "balance_loss_clip": 0.06289861, + "balance_loss_mlp": 0.01257631, + "epoch": 0.3029610701938975, + "flos": 25922709492480.0, + "grad_norm": 1.6864855803341283, + "language_loss": 0.74257523, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82019669, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16650391, + "step": 5039, + "time_per_iteration": 3.9620656967163086 + }, + { + "auxiliary_loss_clip": 0.06482661, + "auxiliary_loss_mlp": 0.01275025, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.0125923, + "epoch": 0.3030211934465655, + "flos": 21987440087040.0, + "grad_norm": 1.8054159725903498, + "language_loss": 0.80141723, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87899411, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.15795898, + "step": 5040, + "time_per_iteration": 2.5038623809814453 + }, + { + "auxiliary_loss_clip": 0.06493679, + "auxiliary_loss_mlp": 0.01273287, + "balance_loss_clip": 0.06295684, + "balance_loss_mlp": 0.01256705, + "epoch": 0.30308131669923344, + "flos": 20997606205440.0, + "grad_norm": 1.5545793881611087, + "language_loss": 0.82498085, + "learning_rate": 3.267403075901438e-06, + "loss": 0.90265048, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 5041, + "time_per_iteration": 2.5619800090789795 + }, + { + "auxiliary_loss_clip": 0.06388037, + "auxiliary_loss_mlp": 0.01273694, + "balance_loss_clip": 0.062912, + "balance_loss_mlp": 0.012703, + "epoch": 0.3031414399519014, + "flos": 60568281198720.0, + "grad_norm": 0.7609258494567089, + "language_loss": 0.59132683, + "learning_rate": 3.267101773025978e-06, + "loss": 0.66794419, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.0340271, + "step": 5042, + "time_per_iteration": 3.2389016151428223 + }, + { + "auxiliary_loss_clip": 0.06493344, + "auxiliary_loss_mlp": 0.01274817, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.0125808, + "epoch": 0.30320156320456937, + "flos": 21914038310400.0, + "grad_norm": 1.8743682054895758, + "language_loss": 0.71638298, + "learning_rate": 3.266800422101892e-06, + "loss": 0.79406464, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.1673584, + "step": 5043, + "time_per_iteration": 2.5684726238250732 + }, + { + "auxiliary_loss_clip": 0.06492111, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06289819, + "balance_loss_mlp": 0.01258121, + "epoch": 0.30326168645723733, + "flos": 21659186517120.0, + "grad_norm": 1.7052050019212173, + "language_loss": 0.70087332, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7785424, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16699219, + "step": 5044, + "time_per_iteration": 2.517548084259033 + }, + { + "auxiliary_loss_clip": 0.06487354, + "auxiliary_loss_mlp": 0.01273722, + "balance_loss_clip": 0.06289065, + "balance_loss_mlp": 0.01257641, + "epoch": 0.3033218097099053, + "flos": 21877672838400.0, + "grad_norm": 1.4072868323237386, + "language_loss": 0.77798641, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.85559714, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.16088867, + "step": 5045, + "time_per_iteration": 2.5525407791137695 + }, + { + "auxiliary_loss_clip": 0.06487602, + "auxiliary_loss_mlp": 0.01277286, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01260096, + "epoch": 0.30338193296257326, + "flos": 27097137918720.0, + "grad_norm": 1.6677605508610576, + "language_loss": 0.72664404, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80429292, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.5747427940368652 + }, + { + "auxiliary_loss_clip": 0.06495762, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125897, + "epoch": 0.30344205621524123, + "flos": 19540052432640.0, + "grad_norm": 1.932306391246397, + "language_loss": 0.81483316, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89255798, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.1776123, + "step": 5047, + "time_per_iteration": 2.5763392448425293 + }, + { + "auxiliary_loss_clip": 0.0648682, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06287121, + "balance_loss_mlp": 0.01255568, + "epoch": 0.3035021794679092, + "flos": 23917116090240.0, + "grad_norm": 1.635585540948891, + "language_loss": 0.72204739, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7996307, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.15942383, + "step": 5048, + "time_per_iteration": 2.5134665966033936 + }, + { + "auxiliary_loss_clip": 0.06488065, + "auxiliary_loss_mlp": 0.01279017, + "balance_loss_clip": 0.0628863, + "balance_loss_mlp": 0.0126296, + "epoch": 0.30356230272057716, + "flos": 16149133077120.0, + "grad_norm": 2.0386560470204804, + "language_loss": 0.75622666, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83389747, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16052246, + "step": 5049, + "time_per_iteration": 2.516463279724121 + }, + { + "auxiliary_loss_clip": 0.06494351, + "auxiliary_loss_mlp": 0.01274287, + "balance_loss_clip": 0.06289351, + "balance_loss_mlp": 0.01257597, + "epoch": 0.3036224259732452, + "flos": 28922539115520.0, + "grad_norm": 1.525083803020086, + "language_loss": 0.82698894, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.90467536, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.16687012, + "step": 5050, + "time_per_iteration": 2.558199405670166 + }, + { + "auxiliary_loss_clip": 0.0649763, + "auxiliary_loss_mlp": 0.01273759, + "balance_loss_clip": 0.06295735, + "balance_loss_mlp": 0.01256617, + "epoch": 0.30368254922591315, + "flos": 21111943501440.0, + "grad_norm": 2.311701267026144, + "language_loss": 0.74346399, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82117784, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17150879, + "step": 5051, + "time_per_iteration": 2.530457019805908 + }, + { + "auxiliary_loss_clip": 0.06494159, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06292571, + "balance_loss_mlp": 0.01260339, + "epoch": 0.3037426724785811, + "flos": 23008859758080.0, + "grad_norm": 1.7255753861859113, + "language_loss": 0.76444, + "learning_rate": 3.264086103483033e-06, + "loss": 0.84215784, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17297363, + "step": 5052, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.06501957, + "auxiliary_loss_mlp": 0.01280226, + "balance_loss_clip": 0.06295583, + "balance_loss_mlp": 0.01262332, + "epoch": 0.3038027957312491, + "flos": 15638129752320.0, + "grad_norm": 1.9820354931454651, + "language_loss": 0.83096367, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.90878546, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17871094, + "step": 5053, + "time_per_iteration": 2.5384886264801025 + }, + { + "auxiliary_loss_clip": 0.06489826, + "auxiliary_loss_mlp": 0.0127909, + "balance_loss_clip": 0.06288566, + "balance_loss_mlp": 0.01262174, + "epoch": 0.30386291898391704, + "flos": 12718955283840.0, + "grad_norm": 1.6755872357210637, + "language_loss": 0.7197504, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.79743958, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16906738, + "step": 5054, + "time_per_iteration": 2.4787559509277344 + }, + { + "auxiliary_loss_clip": 0.06500221, + "auxiliary_loss_mlp": 0.01282757, + "balance_loss_clip": 0.06298432, + "balance_loss_mlp": 0.01265805, + "epoch": 0.303923042236585, + "flos": 26366642023680.0, + "grad_norm": 1.8480883425842163, + "language_loss": 0.70137346, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.77920318, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16943359, + "step": 5055, + "time_per_iteration": 2.5929152965545654 + }, + { + "auxiliary_loss_clip": 0.06494389, + "auxiliary_loss_mlp": 0.01279452, + "balance_loss_clip": 0.0629337, + "balance_loss_mlp": 0.01262488, + "epoch": 0.30398316548925297, + "flos": 19725359736960.0, + "grad_norm": 2.1405790356583516, + "language_loss": 0.68347496, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.7612133, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16955566, + "step": 5056, + "time_per_iteration": 2.531677007675171 + }, + { + "auxiliary_loss_clip": 0.06490116, + "auxiliary_loss_mlp": 0.01281162, + "balance_loss_clip": 0.06292629, + "balance_loss_mlp": 0.01264377, + "epoch": 0.30404328874192094, + "flos": 24246124346880.0, + "grad_norm": 1.6503197514246037, + "language_loss": 0.83083463, + "learning_rate": 3.262576470461507e-06, + "loss": 0.9085474, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16796875, + "step": 5057, + "time_per_iteration": 2.5836069583892822 + }, + { + "auxiliary_loss_clip": 0.06484263, + "auxiliary_loss_mlp": 0.01272995, + "balance_loss_clip": 0.06286788, + "balance_loss_mlp": 0.01256603, + "epoch": 0.3041034119945889, + "flos": 24505881603840.0, + "grad_norm": 1.6860023663091837, + "language_loss": 0.89784855, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.97542113, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 5058, + "time_per_iteration": 2.589932918548584 + }, + { + "auxiliary_loss_clip": 0.06495658, + "auxiliary_loss_mlp": 0.01274369, + "balance_loss_clip": 0.06294262, + "balance_loss_mlp": 0.01256524, + "epoch": 0.30416353524725687, + "flos": 28295689121280.0, + "grad_norm": 2.5117349508823392, + "language_loss": 0.71471179, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.79241204, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17858887, + "step": 5059, + "time_per_iteration": 2.5827505588531494 + }, + { + "auxiliary_loss_clip": 0.06486548, + "auxiliary_loss_mlp": 0.01273567, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01257367, + "epoch": 0.30422365849992483, + "flos": 23667295541760.0, + "grad_norm": 1.868956784724377, + "language_loss": 0.73344606, + "learning_rate": 3.26167011603268e-06, + "loss": 0.8110472, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16174316, + "step": 5060, + "time_per_iteration": 2.624408006668091 + }, + { + "auxiliary_loss_clip": 0.06490071, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06289257, + "balance_loss_mlp": 0.01257451, + "epoch": 0.3042837817525928, + "flos": 23004750908160.0, + "grad_norm": 1.75217091558972, + "language_loss": 0.7751621, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.85279948, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.16210938, + "step": 5061, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.06496524, + "auxiliary_loss_mlp": 0.01274148, + "balance_loss_clip": 0.06292392, + "balance_loss_mlp": 0.01256362, + "epoch": 0.30434390500526076, + "flos": 22087438335360.0, + "grad_norm": 2.647933932315435, + "language_loss": 0.8275395, + "learning_rate": 3.261065640514415e-06, + "loss": 0.90524626, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17773438, + "step": 5062, + "time_per_iteration": 2.5313212871551514 + }, + { + "auxiliary_loss_clip": 0.06485732, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06286077, + "balance_loss_mlp": 0.01253689, + "epoch": 0.3044040282579287, + "flos": 25490516532480.0, + "grad_norm": 1.803893214603413, + "language_loss": 0.74348861, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.82104707, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16394043, + "step": 5063, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.0649004, + "auxiliary_loss_mlp": 0.01274813, + "balance_loss_clip": 0.06291289, + "balance_loss_mlp": 0.01256753, + "epoch": 0.30446415151059675, + "flos": 21952080864000.0, + "grad_norm": 1.6090072895521823, + "language_loss": 0.84824491, + "learning_rate": 3.26046097371721e-06, + "loss": 0.92589343, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.18066406, + "step": 5064, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.06490266, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 0.06290541, + "balance_loss_mlp": 0.0125644, + "epoch": 0.3045242747632647, + "flos": 16440979248000.0, + "grad_norm": 2.1763674367183965, + "language_loss": 0.76565492, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17578125, + "step": 5065, + "time_per_iteration": 2.50644588470459 + }, + { + "auxiliary_loss_clip": 0.06490786, + "auxiliary_loss_mlp": 0.01279051, + "balance_loss_clip": 0.06288782, + "balance_loss_mlp": 0.01260586, + "epoch": 0.3045843980159327, + "flos": 31548399966720.0, + "grad_norm": 1.8114152917186497, + "language_loss": 0.62859941, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.70629776, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.18469238, + "step": 5066, + "time_per_iteration": 2.6319751739501953 + }, + { + "auxiliary_loss_clip": 0.06499436, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 0.0629437, + "balance_loss_mlp": 0.01255602, + "epoch": 0.30464452126860064, + "flos": 17858645677440.0, + "grad_norm": 2.0549933694905653, + "language_loss": 0.82941914, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.90714514, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17565918, + "step": 5067, + "time_per_iteration": 2.483863592147827 + }, + { + "auxiliary_loss_clip": 0.06485019, + "auxiliary_loss_mlp": 0.0127176, + "balance_loss_clip": 0.06289113, + "balance_loss_mlp": 0.01255643, + "epoch": 0.3047046445212686, + "flos": 20637682992000.0, + "grad_norm": 1.9234738451458053, + "language_loss": 0.63749218, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71506, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.16113281, + "step": 5068, + "time_per_iteration": 2.5133988857269287 + }, + { + "auxiliary_loss_clip": 0.06487909, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 0.06291264, + "balance_loss_mlp": 0.01256884, + "epoch": 0.3047647677739366, + "flos": 21293896642560.0, + "grad_norm": 1.767828765686575, + "language_loss": 0.75521863, + "learning_rate": 3.258948470480793e-06, + "loss": 0.8328287, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.1619873, + "step": 5069, + "time_per_iteration": 2.5039985179901123 + }, + { + "auxiliary_loss_clip": 0.06492448, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01255047, + "epoch": 0.30482489102660454, + "flos": 21002218179840.0, + "grad_norm": 2.053197356954631, + "language_loss": 0.76551294, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84314346, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.15551758, + "step": 5070, + "time_per_iteration": 2.56703519821167 + }, + { + "auxiliary_loss_clip": 0.06501058, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 0.06296416, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3048850142792725, + "flos": 26298732689280.0, + "grad_norm": 1.581704774716999, + "language_loss": 0.82567108, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.90344059, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.18139648, + "step": 5071, + "time_per_iteration": 3.9534900188446045 + }, + { + "auxiliary_loss_clip": 0.06502657, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06296133, + "balance_loss_mlp": 0.01253374, + "epoch": 0.30494513753194047, + "flos": 22352813815680.0, + "grad_norm": 1.6603887086526505, + "language_loss": 0.76386344, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.84159869, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17492676, + "step": 5072, + "time_per_iteration": 3.9736859798431396 + }, + { + "auxiliary_loss_clip": 0.06492919, + "auxiliary_loss_mlp": 0.01277102, + "balance_loss_clip": 0.06293403, + "balance_loss_mlp": 0.01260544, + "epoch": 0.30500526078460843, + "flos": 19543909720320.0, + "grad_norm": 1.870095200943675, + "language_loss": 0.71741343, + "learning_rate": 3.257737608512723e-06, + "loss": 0.79511362, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16564941, + "step": 5073, + "time_per_iteration": 3.961787700653076 + }, + { + "auxiliary_loss_clip": 0.064973, + "auxiliary_loss_mlp": 0.01276358, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259752, + "epoch": 0.3050653840372764, + "flos": 14470577360640.0, + "grad_norm": 2.0196062448027843, + "language_loss": 0.76699424, + "learning_rate": 3.257434773758163e-06, + "loss": 0.84473085, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16601562, + "step": 5074, + "time_per_iteration": 2.498986005783081 + }, + { + "auxiliary_loss_clip": 0.06498405, + "auxiliary_loss_mlp": 0.01271199, + "balance_loss_clip": 0.06298129, + "balance_loss_mlp": 0.01254534, + "epoch": 0.30512550728994436, + "flos": 24250736321280.0, + "grad_norm": 2.0830863268570496, + "language_loss": 0.75075227, + "learning_rate": 3.25713189132155e-06, + "loss": 0.8284483, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16662598, + "step": 5075, + "time_per_iteration": 2.586857557296753 + }, + { + "auxiliary_loss_clip": 0.06500411, + "auxiliary_loss_mlp": 0.01274386, + "balance_loss_clip": 0.06294686, + "balance_loss_mlp": 0.01256004, + "epoch": 0.30518563054261233, + "flos": 16365774608640.0, + "grad_norm": 1.8100237719305525, + "language_loss": 0.75655556, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.8343035, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.18371582, + "step": 5076, + "time_per_iteration": 2.4945309162139893 + }, + { + "auxiliary_loss_clip": 0.06496741, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06296699, + "balance_loss_mlp": 0.01252712, + "epoch": 0.30524575379528035, + "flos": 21585952448640.0, + "grad_norm": 4.173383760279569, + "language_loss": 0.79782987, + "learning_rate": 3.25652598344811e-06, + "loss": 0.87550437, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17993164, + "step": 5077, + "time_per_iteration": 2.534932851791382 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01270916, + "balance_loss_clip": 0.06295882, + "balance_loss_mlp": 0.01254012, + "epoch": 0.3053058770479483, + "flos": 16550872277760.0, + "grad_norm": 2.5701417949840146, + "language_loss": 0.7555238, + "learning_rate": 3.256222958034259e-06, + "loss": 0.83312857, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16894531, + "step": 5078, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.06495726, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06297612, + "balance_loss_mlp": 0.01262487, + "epoch": 0.3053660003006163, + "flos": 12317844988800.0, + "grad_norm": 1.8416681282179364, + "language_loss": 0.67517591, + "learning_rate": 3.255919884984307e-06, + "loss": 0.75292945, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.17126465, + "step": 5079, + "time_per_iteration": 3.8981266021728516 + }, + { + "auxiliary_loss_clip": 0.06496017, + "auxiliary_loss_mlp": 0.01271448, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.01253757, + "epoch": 0.30542612355328425, + "flos": 23118962423040.0, + "grad_norm": 1.7235884914338329, + "language_loss": 0.8044346, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.88210917, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17687988, + "step": 5080, + "time_per_iteration": 2.562946081161499 + }, + { + "auxiliary_loss_clip": 0.06497588, + "auxiliary_loss_mlp": 0.01276495, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.01259377, + "epoch": 0.3054862468059522, + "flos": 24396365917440.0, + "grad_norm": 2.5665035909877725, + "language_loss": 0.81653202, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89427292, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17114258, + "step": 5081, + "time_per_iteration": 2.6026763916015625 + }, + { + "auxiliary_loss_clip": 0.06490453, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01253962, + "epoch": 0.3055463700586202, + "flos": 29393529315840.0, + "grad_norm": 1.580638075296793, + "language_loss": 0.72516012, + "learning_rate": 3.255010380132783e-06, + "loss": 0.80277044, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16619873, + "step": 5082, + "time_per_iteration": 2.650310516357422 + }, + { + "auxiliary_loss_clip": 0.06499462, + "auxiliary_loss_mlp": 0.01274957, + "balance_loss_clip": 0.06293429, + "balance_loss_mlp": 0.01257159, + "epoch": 0.30560649331128814, + "flos": 25598606699520.0, + "grad_norm": 2.3807589086926533, + "language_loss": 0.73733467, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.81507885, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.595439910888672 + }, + { + "auxiliary_loss_clip": 0.06488115, + "auxiliary_loss_mlp": 0.01272372, + "balance_loss_clip": 0.0628676, + "balance_loss_mlp": 0.01254729, + "epoch": 0.3056666165639561, + "flos": 19133156206080.0, + "grad_norm": 1.8141392710911106, + "language_loss": 0.71165347, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78925836, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17626953, + "step": 5084, + "time_per_iteration": 2.499873161315918 + }, + { + "auxiliary_loss_clip": 0.06505337, + "auxiliary_loss_mlp": 0.01276239, + "balance_loss_clip": 0.063004, + "balance_loss_mlp": 0.01260194, + "epoch": 0.30572673981662407, + "flos": 15529368752640.0, + "grad_norm": 2.0821129981034567, + "language_loss": 0.79337353, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.87118936, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.16027832, + "step": 5085, + "time_per_iteration": 2.479790449142456 + }, + { + "auxiliary_loss_clip": 0.06486039, + "auxiliary_loss_mlp": 0.01278912, + "balance_loss_clip": 0.06289506, + "balance_loss_mlp": 0.01260602, + "epoch": 0.30578686306929204, + "flos": 21512886088320.0, + "grad_norm": 2.123366644532801, + "language_loss": 0.78524947, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.86289901, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.18310547, + "step": 5086, + "time_per_iteration": 2.5372772216796875 + }, + { + "auxiliary_loss_clip": 0.06487311, + "auxiliary_loss_mlp": 0.01277834, + "balance_loss_clip": 0.06289313, + "balance_loss_mlp": 0.01259797, + "epoch": 0.30584698632196, + "flos": 20959689432960.0, + "grad_norm": 1.7535206397091907, + "language_loss": 0.77160186, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8492533, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18041992, + "step": 5087, + "time_per_iteration": 2.4971578121185303 + }, + { + "auxiliary_loss_clip": 0.06492934, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06288779, + "balance_loss_mlp": 0.01258154, + "epoch": 0.30590710957462797, + "flos": 24688044380160.0, + "grad_norm": 1.802467786704899, + "language_loss": 0.7266196, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.80432141, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.19091797, + "step": 5088, + "time_per_iteration": 2.5416259765625 + }, + { + "auxiliary_loss_clip": 0.06501624, + "auxiliary_loss_mlp": 0.0127311, + "balance_loss_clip": 0.06292014, + "balance_loss_mlp": 0.01253893, + "epoch": 0.30596723282729593, + "flos": 17091700456320.0, + "grad_norm": 2.3226252492467037, + "language_loss": 0.79702371, + "learning_rate": 3.252886537028521e-06, + "loss": 0.874771, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19226074, + "step": 5089, + "time_per_iteration": 2.4745559692382812 + }, + { + "auxiliary_loss_clip": 0.06491631, + "auxiliary_loss_mlp": 0.01275196, + "balance_loss_clip": 0.06291364, + "balance_loss_mlp": 0.01256981, + "epoch": 0.30602735607996395, + "flos": 22863775213440.0, + "grad_norm": 6.857787253608019, + "language_loss": 0.77299303, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.85066134, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 5090, + "time_per_iteration": 2.5330631732940674 + }, + { + "auxiliary_loss_clip": 0.06500913, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.06295903, + "balance_loss_mlp": 0.01260773, + "epoch": 0.3060874793326319, + "flos": 29869173417600.0, + "grad_norm": 1.854909004407163, + "language_loss": 0.76970392, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.84750324, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18237305, + "step": 5091, + "time_per_iteration": 2.561894178390503 + }, + { + "auxiliary_loss_clip": 0.06491988, + "auxiliary_loss_mlp": 0.01272552, + "balance_loss_clip": 0.06287533, + "balance_loss_mlp": 0.01254551, + "epoch": 0.3061476025852999, + "flos": 20454765528960.0, + "grad_norm": 1.7300285931862276, + "language_loss": 0.72878456, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18005371, + "step": 5092, + "time_per_iteration": 2.5661561489105225 + }, + { + "auxiliary_loss_clip": 0.06495406, + "auxiliary_loss_mlp": 0.01276172, + "balance_loss_clip": 0.06294402, + "balance_loss_mlp": 0.01258696, + "epoch": 0.30620772583796785, + "flos": 19397651218560.0, + "grad_norm": 1.8286917674158676, + "language_loss": 0.83293521, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.91065109, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.17468262, + "step": 5093, + "time_per_iteration": 2.49686336517334 + }, + { + "auxiliary_loss_clip": 0.06495437, + "auxiliary_loss_mlp": 0.01277069, + "balance_loss_clip": 0.06295857, + "balance_loss_mlp": 0.01259652, + "epoch": 0.3062678490906358, + "flos": 24031411459200.0, + "grad_norm": 1.7386581048181018, + "language_loss": 0.74963737, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82736242, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17419434, + "step": 5094, + "time_per_iteration": 2.5497004985809326 + }, + { + "auxiliary_loss_clip": 0.06491575, + "auxiliary_loss_mlp": 0.01272234, + "balance_loss_clip": 0.06293601, + "balance_loss_mlp": 0.01255735, + "epoch": 0.3063279723433038, + "flos": 19760593178880.0, + "grad_norm": 1.8971341227661025, + "language_loss": 0.76389223, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84153032, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16503906, + "step": 5095, + "time_per_iteration": 2.493479013442993 + }, + { + "auxiliary_loss_clip": 0.06485657, + "auxiliary_loss_mlp": 0.0128124, + "balance_loss_clip": 0.06288686, + "balance_loss_mlp": 0.01262727, + "epoch": 0.30638809559597174, + "flos": 22455663102720.0, + "grad_norm": 1.6310889817091494, + "language_loss": 0.81246006, + "learning_rate": 3.250760365955042e-06, + "loss": 0.89012897, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.18518066, + "step": 5096, + "time_per_iteration": 2.606100559234619 + }, + { + "auxiliary_loss_clip": 0.06500001, + "auxiliary_loss_mlp": 0.01286183, + "balance_loss_clip": 0.06297529, + "balance_loss_mlp": 0.01269947, + "epoch": 0.3064482188486397, + "flos": 17170846237440.0, + "grad_norm": 2.1701963694762862, + "language_loss": 0.81871414, + "learning_rate": 3.250456437422258e-06, + "loss": 0.89657605, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.16235352, + "step": 5097, + "time_per_iteration": 2.506908893585205 + }, + { + "auxiliary_loss_clip": 0.06498241, + "auxiliary_loss_mlp": 0.01288982, + "balance_loss_clip": 0.06297113, + "balance_loss_mlp": 0.01269647, + "epoch": 0.3065083421013077, + "flos": 23775176073600.0, + "grad_norm": 2.1266024193404385, + "language_loss": 0.7855283, + "learning_rate": 3.250152461472041e-06, + "loss": 0.86340058, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.19335938, + "step": 5098, + "time_per_iteration": 2.546875238418579 + }, + { + "auxiliary_loss_clip": 0.06494713, + "auxiliary_loss_mlp": 0.01291897, + "balance_loss_clip": 0.06296527, + "balance_loss_mlp": 0.0127367, + "epoch": 0.30656846535397564, + "flos": 26438953697280.0, + "grad_norm": 1.8261556885246946, + "language_loss": 0.84430897, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92217511, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.18225098, + "step": 5099, + "time_per_iteration": 2.5726583003997803 + }, + { + "auxiliary_loss_clip": 0.06498358, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06295489, + "balance_loss_mlp": 0.01268434, + "epoch": 0.3066285886066436, + "flos": 26659117100160.0, + "grad_norm": 1.588615118025773, + "language_loss": 0.86241573, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.94027227, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.18859863, + "step": 5100, + "time_per_iteration": 2.5711421966552734 + }, + { + "auxiliary_loss_clip": 0.06496789, + "auxiliary_loss_mlp": 0.01283562, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01264345, + "epoch": 0.30668871185931157, + "flos": 15055443659520.0, + "grad_norm": 1.7244173580954059, + "language_loss": 0.79369497, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87149858, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.19226074, + "step": 5101, + "time_per_iteration": 2.539132833480835 + }, + { + "auxiliary_loss_clip": 0.0650195, + "auxiliary_loss_mlp": 0.01287055, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01268172, + "epoch": 0.30674883511197953, + "flos": 20087966280960.0, + "grad_norm": 1.7739241542858428, + "language_loss": 0.80435872, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.88224876, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.1887207, + "step": 5102, + "time_per_iteration": 2.5558016300201416 + }, + { + "auxiliary_loss_clip": 0.06503183, + "auxiliary_loss_mlp": 0.01284648, + "balance_loss_clip": 0.06301928, + "balance_loss_mlp": 0.01265253, + "epoch": 0.30680895836464755, + "flos": 22900518028800.0, + "grad_norm": 1.6865927559982214, + "language_loss": 0.89335668, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.97123504, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19396973, + "step": 5103, + "time_per_iteration": 2.542555570602417 + }, + { + "auxiliary_loss_clip": 0.06501935, + "auxiliary_loss_mlp": 0.01286618, + "balance_loss_clip": 0.06302223, + "balance_loss_mlp": 0.0126876, + "epoch": 0.3068690816173155, + "flos": 23702948254080.0, + "grad_norm": 2.119732369805114, + "language_loss": 0.74448419, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82236969, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17871094, + "step": 5104, + "time_per_iteration": 2.560253143310547 + }, + { + "auxiliary_loss_clip": 0.06502049, + "auxiliary_loss_mlp": 0.01274873, + "balance_loss_clip": 0.06295487, + "balance_loss_mlp": 0.01257552, + "epoch": 0.3069292048699835, + "flos": 23557947563520.0, + "grad_norm": 1.7334515387821061, + "language_loss": 0.72909176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80686092, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17321777, + "step": 5105, + "time_per_iteration": 2.5751454830169678 + }, + { + "auxiliary_loss_clip": 0.06498945, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 0.06297372, + "balance_loss_mlp": 0.01263907, + "epoch": 0.30698932812265145, + "flos": 24537970517760.0, + "grad_norm": 2.0977567017321608, + "language_loss": 0.87578112, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.95359075, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18103027, + "step": 5106, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.06503764, + "auxiliary_loss_mlp": 0.01279082, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01261189, + "epoch": 0.3070494513753194, + "flos": 21002805158400.0, + "grad_norm": 2.310425767564757, + "language_loss": 0.72092319, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.79875165, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17883301, + "step": 5107, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.06493405, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06294269, + "balance_loss_mlp": 0.01256735, + "epoch": 0.3071095746279874, + "flos": 19031942073600.0, + "grad_norm": 1.99593781887154, + "language_loss": 0.72653455, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80422449, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.18847656, + "step": 5108, + "time_per_iteration": 2.497788190841675 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 0.06297708, + "balance_loss_mlp": 0.01259533, + "epoch": 0.30716969788065535, + "flos": 21221962312320.0, + "grad_norm": 1.48656392648579, + "language_loss": 0.86441541, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94217712, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17578125, + "step": 5109, + "time_per_iteration": 2.563480854034424 + }, + { + "auxiliary_loss_clip": 0.06501789, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.063005, + "balance_loss_mlp": 0.01260541, + "epoch": 0.3072298211333233, + "flos": 25779385883520.0, + "grad_norm": 1.8235353484155168, + "language_loss": 0.67904091, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.75684446, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18029785, + "step": 5110, + "time_per_iteration": 3.9785540103912354 + }, + { + "auxiliary_loss_clip": 0.06493396, + "auxiliary_loss_mlp": 0.01273369, + "balance_loss_clip": 0.06295427, + "balance_loss_mlp": 0.01256727, + "epoch": 0.3072899443859913, + "flos": 25856099896320.0, + "grad_norm": 1.4123986071879864, + "language_loss": 0.76984161, + "learning_rate": 3.246196464379919e-06, + "loss": 0.84750926, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16638184, + "step": 5111, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.06498265, + "auxiliary_loss_mlp": 0.01277301, + "balance_loss_clip": 0.06293567, + "balance_loss_mlp": 0.01258585, + "epoch": 0.30735006763865924, + "flos": 25930130578560.0, + "grad_norm": 2.349951455822933, + "language_loss": 0.67755288, + "learning_rate": 3.245891825796765e-06, + "loss": 0.75530857, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 5112, + "time_per_iteration": 3.963136672973633 + }, + { + "auxiliary_loss_clip": 0.0650286, + "auxiliary_loss_mlp": 0.01277737, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01257614, + "epoch": 0.3074101908913272, + "flos": 30924442938240.0, + "grad_norm": 2.270303220058131, + "language_loss": 0.79939896, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.87720484, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.20117188, + "step": 5113, + "time_per_iteration": 4.084795236587524 + }, + { + "auxiliary_loss_clip": 0.06502695, + "auxiliary_loss_mlp": 0.01276516, + "balance_loss_clip": 0.06297943, + "balance_loss_mlp": 0.01258599, + "epoch": 0.30747031414399517, + "flos": 18406182182400.0, + "grad_norm": 2.072714063381377, + "language_loss": 0.77269047, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.85048258, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17919922, + "step": 5114, + "time_per_iteration": 2.4906773567199707 + }, + { + "auxiliary_loss_clip": 0.06498024, + "auxiliary_loss_mlp": 0.01283612, + "balance_loss_clip": 0.06298083, + "balance_loss_mlp": 0.01265087, + "epoch": 0.30753043739666314, + "flos": 22638957909120.0, + "grad_norm": 1.8131309248321845, + "language_loss": 0.62640405, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70422041, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.18530273, + "step": 5115, + "time_per_iteration": 2.5328574180603027 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06295817, + "balance_loss_mlp": 0.0125513, + "epoch": 0.3075905606493311, + "flos": 27351360806400.0, + "grad_norm": 1.7894066300170501, + "language_loss": 0.83589995, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91363406, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.19213867, + "step": 5116, + "time_per_iteration": 2.562014102935791 + }, + { + "auxiliary_loss_clip": 0.06500115, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06298394, + "balance_loss_mlp": 0.0125512, + "epoch": 0.3076506839019991, + "flos": 22097333116800.0, + "grad_norm": 1.8649453582041782, + "language_loss": 0.76016742, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.18322754, + "step": 5117, + "time_per_iteration": 2.5509209632873535 + }, + { + "auxiliary_loss_clip": 0.06498168, + "auxiliary_loss_mlp": 0.01274202, + "balance_loss_clip": 0.0629583, + "balance_loss_mlp": 0.01256142, + "epoch": 0.3077108071546671, + "flos": 21296160702720.0, + "grad_norm": 2.167097847201453, + "language_loss": 0.72108531, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79880905, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.18054199, + "step": 5118, + "time_per_iteration": 2.5190913677215576 + }, + { + "auxiliary_loss_clip": 0.06502286, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01258198, + "epoch": 0.30777093040733505, + "flos": 21436884835200.0, + "grad_norm": 2.760855389686565, + "language_loss": 0.74956095, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82734126, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17553711, + "step": 5119, + "time_per_iteration": 3.973721981048584 + }, + { + "auxiliary_loss_clip": 0.06494488, + "auxiliary_loss_mlp": 0.01279388, + "balance_loss_clip": 0.06289928, + "balance_loss_mlp": 0.01259814, + "epoch": 0.307831053660003, + "flos": 23156040654720.0, + "grad_norm": 1.7924264386276263, + "language_loss": 0.80264926, + "learning_rate": 3.243453017305926e-06, + "loss": 0.88038802, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 5120, + "time_per_iteration": 2.54705548286438 + }, + { + "auxiliary_loss_clip": 0.06492078, + "auxiliary_loss_mlp": 0.01273208, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01255445, + "epoch": 0.307891176912671, + "flos": 17025510130560.0, + "grad_norm": 1.642273509687288, + "language_loss": 0.80521786, + "learning_rate": 3.24314795393977e-06, + "loss": 0.88287073, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 5121, + "time_per_iteration": 2.515054702758789 + }, + { + "auxiliary_loss_clip": 0.06496292, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06298114, + "balance_loss_mlp": 0.01256875, + "epoch": 0.30795130016533895, + "flos": 27711745217280.0, + "grad_norm": 1.3913461280715187, + "language_loss": 0.82847351, + "learning_rate": 3.242842843433319e-06, + "loss": 0.90618169, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17651367, + "step": 5122, + "time_per_iteration": 2.5832252502441406 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.0632116, + "balance_loss_mlp": 0.01249526, + "epoch": 0.3080114234180069, + "flos": 69080973373440.0, + "grad_norm": 0.7221499072225652, + "language_loss": 0.58650029, + "learning_rate": 3.242537685798143e-06, + "loss": 0.66319263, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.03341675, + "step": 5123, + "time_per_iteration": 3.3316402435302734 + }, + { + "auxiliary_loss_clip": 0.06503562, + "auxiliary_loss_mlp": 0.01279925, + "balance_loss_clip": 0.06296872, + "balance_loss_mlp": 0.01260744, + "epoch": 0.3080715466706749, + "flos": 24066938390400.0, + "grad_norm": 1.6584153298959496, + "language_loss": 0.83586073, + "learning_rate": 3.242232481045813e-06, + "loss": 0.91369557, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1920166, + "step": 5124, + "time_per_iteration": 2.589906930923462 + }, + { + "auxiliary_loss_clip": 0.06498908, + "auxiliary_loss_mlp": 0.01271737, + "balance_loss_clip": 0.06294107, + "balance_loss_mlp": 0.01253629, + "epoch": 0.30813166992334284, + "flos": 25855806407040.0, + "grad_norm": 2.061271988083176, + "language_loss": 0.79248756, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.87019402, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.1809082, + "step": 5125, + "time_per_iteration": 2.550884485244751 + }, + { + "auxiliary_loss_clip": 0.06501068, + "auxiliary_loss_mlp": 0.012774, + "balance_loss_clip": 0.06292764, + "balance_loss_mlp": 0.01258374, + "epoch": 0.3081917931760108, + "flos": 20455981413120.0, + "grad_norm": 2.085029494567846, + "language_loss": 0.64930958, + "learning_rate": 3.241621930235989e-06, + "loss": 0.72709423, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.19018555, + "step": 5126, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.06490224, + "auxiliary_loss_mlp": 0.01277045, + "balance_loss_clip": 0.06294391, + "balance_loss_mlp": 0.01259533, + "epoch": 0.3082519164286788, + "flos": 22173208588800.0, + "grad_norm": 1.5681866965441809, + "language_loss": 0.87117672, + "learning_rate": 3.241316584201646e-06, + "loss": 0.94884944, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.17504883, + "step": 5127, + "time_per_iteration": 2.567615270614624 + }, + { + "auxiliary_loss_clip": 0.0649047, + "auxiliary_loss_mlp": 0.01273562, + "balance_loss_clip": 0.06291968, + "balance_loss_mlp": 0.0125593, + "epoch": 0.30831203968134674, + "flos": 28921029742080.0, + "grad_norm": 1.4544126326452276, + "language_loss": 0.69282925, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.77046961, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.1763916, + "step": 5128, + "time_per_iteration": 2.6129322052001953 + }, + { + "auxiliary_loss_clip": 0.06499469, + "auxiliary_loss_mlp": 0.01276178, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01257843, + "epoch": 0.3083721629340147, + "flos": 25675069150080.0, + "grad_norm": 2.0282558045061396, + "language_loss": 0.7195785, + "learning_rate": 3.240705750931993e-06, + "loss": 0.79733503, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 5129, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.06388761, + "auxiliary_loss_mlp": 0.01275431, + "balance_loss_clip": 0.06292662, + "balance_loss_mlp": 0.01271816, + "epoch": 0.3084322861866827, + "flos": 68233666487040.0, + "grad_norm": 0.8077979927321801, + "language_loss": 0.58935201, + "learning_rate": 3.240400263719846e-06, + "loss": 0.66599393, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03607178, + "step": 5130, + "time_per_iteration": 3.2353098392486572 + }, + { + "auxiliary_loss_clip": 0.06498231, + "auxiliary_loss_mlp": 0.012758, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01258443, + "epoch": 0.3084924094393507, + "flos": 20301630992640.0, + "grad_norm": 2.071340626605126, + "language_loss": 0.73298538, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.81072569, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17370605, + "step": 5131, + "time_per_iteration": 2.523510456085205 + }, + { + "auxiliary_loss_clip": 0.06487547, + "auxiliary_loss_mlp": 0.01274811, + "balance_loss_clip": 0.06290068, + "balance_loss_mlp": 0.01257728, + "epoch": 0.30855253269201866, + "flos": 23956374528000.0, + "grad_norm": 1.6208223340220833, + "language_loss": 0.71358359, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.79120713, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17077637, + "step": 5132, + "time_per_iteration": 2.581470012664795 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01273323, + "balance_loss_clip": 0.06290212, + "balance_loss_mlp": 0.01255262, + "epoch": 0.3086126559446866, + "flos": 19288009751040.0, + "grad_norm": 1.7801590489825803, + "language_loss": 0.90374929, + "learning_rate": 3.239483519913136e-06, + "loss": 0.98135513, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.18066406, + "step": 5133, + "time_per_iteration": 2.5197763442993164 + }, + { + "auxiliary_loss_clip": 0.06499831, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06295495, + "balance_loss_mlp": 0.01257105, + "epoch": 0.3086727791973546, + "flos": 33768328913280.0, + "grad_norm": 1.8524807236065886, + "language_loss": 0.67443442, + "learning_rate": 3.239177844626102e-06, + "loss": 0.75218379, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 5134, + "time_per_iteration": 2.664303779602051 + }, + { + "auxiliary_loss_clip": 0.06498815, + "auxiliary_loss_mlp": 0.01275704, + "balance_loss_clip": 0.06293166, + "balance_loss_mlp": 0.01257167, + "epoch": 0.30873290245002255, + "flos": 16039659317760.0, + "grad_norm": 1.8927812104332384, + "language_loss": 0.83517784, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91292304, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18518066, + "step": 5135, + "time_per_iteration": 2.505138397216797 + }, + { + "auxiliary_loss_clip": 0.06377634, + "auxiliary_loss_mlp": 0.01258895, + "balance_loss_clip": 0.06282344, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3087930257026905, + "flos": 65070415474560.0, + "grad_norm": 0.6863645266912056, + "language_loss": 0.55337238, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.62973773, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.0329895, + "step": 5136, + "time_per_iteration": 3.179166555404663 + }, + { + "auxiliary_loss_clip": 0.06488921, + "auxiliary_loss_mlp": 0.01274465, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.01257085, + "epoch": 0.3088531489553585, + "flos": 74754001733760.0, + "grad_norm": 1.8635236180899502, + "language_loss": 0.76610464, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.8437385, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.1739502, + "step": 5137, + "time_per_iteration": 2.9993999004364014 + }, + { + "auxiliary_loss_clip": 0.06489644, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01255458, + "epoch": 0.30891327220802645, + "flos": 21148686316800.0, + "grad_norm": 1.7480087539569926, + "language_loss": 0.80450445, + "learning_rate": 3.237954673696424e-06, + "loss": 0.882128, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17248535, + "step": 5138, + "time_per_iteration": 2.531916856765747 + }, + { + "auxiliary_loss_clip": 0.06496161, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06294001, + "balance_loss_mlp": 0.01258896, + "epoch": 0.3089733954606944, + "flos": 25671295716480.0, + "grad_norm": 1.629930216805369, + "language_loss": 0.81626344, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.89398789, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.1739502, + "step": 5139, + "time_per_iteration": 2.585380792617798 + }, + { + "auxiliary_loss_clip": 0.06501773, + "auxiliary_loss_mlp": 0.01277306, + "balance_loss_clip": 0.06292425, + "balance_loss_mlp": 0.01258817, + "epoch": 0.3090335187133624, + "flos": 19433429712000.0, + "grad_norm": 2.0033599705043854, + "language_loss": 0.77724934, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.85504013, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18481445, + "step": 5140, + "time_per_iteration": 2.504387617111206 + }, + { + "auxiliary_loss_clip": 0.06482549, + "auxiliary_loss_mlp": 0.01272919, + "balance_loss_clip": 0.06290817, + "balance_loss_mlp": 0.0125741, + "epoch": 0.30909364196603034, + "flos": 20017541324160.0, + "grad_norm": 1.9132937458234096, + "language_loss": 0.78916645, + "learning_rate": 3.237036802553252e-06, + "loss": 0.86672109, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 5141, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.06494773, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 0.06291379, + "balance_loss_mlp": 0.01260543, + "epoch": 0.3091537652186983, + "flos": 19682830990080.0, + "grad_norm": 2.2087235088394728, + "language_loss": 0.8789897, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.95671201, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16906738, + "step": 5142, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06498981, + "auxiliary_loss_mlp": 0.01276818, + "balance_loss_clip": 0.06294474, + "balance_loss_mlp": 0.01259438, + "epoch": 0.3092138884713663, + "flos": 17025845546880.0, + "grad_norm": 2.3473661014686984, + "language_loss": 0.7985431, + "learning_rate": 3.23642465389567e-06, + "loss": 0.87630117, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.17382812, + "step": 5143, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.06489455, + "auxiliary_loss_mlp": 0.01277055, + "balance_loss_clip": 0.06291586, + "balance_loss_mlp": 0.01260378, + "epoch": 0.3092740117240343, + "flos": 25017052636800.0, + "grad_norm": 1.6187717199492768, + "language_loss": 0.72479737, + "learning_rate": 3.236118509233055e-06, + "loss": 0.8024624, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16662598, + "step": 5144, + "time_per_iteration": 2.547358989715576 + }, + { + "auxiliary_loss_clip": 0.06496169, + "auxiliary_loss_mlp": 0.01272398, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01256138, + "epoch": 0.30933413497670226, + "flos": 25597013472000.0, + "grad_norm": 2.2714150562550466, + "language_loss": 0.74676621, + "learning_rate": 3.235812317696702e-06, + "loss": 0.82445192, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16271973, + "step": 5145, + "time_per_iteration": 2.6273365020751953 + }, + { + "auxiliary_loss_clip": 0.06490701, + "auxiliary_loss_mlp": 0.01273039, + "balance_loss_clip": 0.06289125, + "balance_loss_mlp": 0.01256296, + "epoch": 0.3093942582293702, + "flos": 24396617479680.0, + "grad_norm": 1.731689317121935, + "language_loss": 0.76830649, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.84594393, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.16729736, + "step": 5146, + "time_per_iteration": 2.5352702140808105 + }, + { + "auxiliary_loss_clip": 0.06485911, + "auxiliary_loss_mlp": 0.01273533, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.0125707, + "epoch": 0.3094543814820382, + "flos": 19652586865920.0, + "grad_norm": 1.8011449994622988, + "language_loss": 0.66675043, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.74434483, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16467285, + "step": 5147, + "time_per_iteration": 2.545940637588501 + }, + { + "auxiliary_loss_clip": 0.06492072, + "auxiliary_loss_mlp": 0.01271267, + "balance_loss_clip": 0.0628895, + "balance_loss_mlp": 0.01253731, + "epoch": 0.30951450473470615, + "flos": 25670499102720.0, + "grad_norm": 1.8580519203508368, + "language_loss": 0.74971956, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.82735288, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17529297, + "step": 5148, + "time_per_iteration": 2.5673537254333496 + }, + { + "auxiliary_loss_clip": 0.06501722, + "auxiliary_loss_mlp": 0.01278545, + "balance_loss_clip": 0.06290632, + "balance_loss_mlp": 0.01260342, + "epoch": 0.3095746279873741, + "flos": 12025202204160.0, + "grad_norm": 2.1335435485893166, + "language_loss": 0.73367, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.18212891, + "step": 5149, + "time_per_iteration": 2.682609796524048 + }, + { + "auxiliary_loss_clip": 0.06497431, + "auxiliary_loss_mlp": 0.01277143, + "balance_loss_clip": 0.06292653, + "balance_loss_mlp": 0.01258534, + "epoch": 0.3096347512400421, + "flos": 23629798039680.0, + "grad_norm": 1.913638713978071, + "language_loss": 0.85296845, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.93071413, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.18591309, + "step": 5150, + "time_per_iteration": 3.9813008308410645 + }, + { + "auxiliary_loss_clip": 0.06483387, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06285527, + "balance_loss_mlp": 0.01256815, + "epoch": 0.30969487449271005, + "flos": 22536024768000.0, + "grad_norm": 1.8960829077128427, + "language_loss": 0.79181123, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86938894, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.17565918, + "step": 5151, + "time_per_iteration": 2.5336477756500244 + }, + { + "auxiliary_loss_clip": 0.06493182, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.06291731, + "balance_loss_mlp": 0.01257426, + "epoch": 0.309754997745378, + "flos": 15273301075200.0, + "grad_norm": 2.079664023782487, + "language_loss": 0.67843604, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.75611162, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16931152, + "step": 5152, + "time_per_iteration": 5.332815647125244 + }, + { + "auxiliary_loss_clip": 0.06492282, + "auxiliary_loss_mlp": 0.01278303, + "balance_loss_clip": 0.06293005, + "balance_loss_mlp": 0.01261888, + "epoch": 0.309815120998046, + "flos": 26986532129280.0, + "grad_norm": 1.9990242894688834, + "language_loss": 0.83170605, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.90941191, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16394043, + "step": 5153, + "time_per_iteration": 2.5944862365722656 + }, + { + "auxiliary_loss_clip": 0.06488585, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.0125709, + "epoch": 0.30987524425071394, + "flos": 21149692565760.0, + "grad_norm": 1.7708804151784365, + "language_loss": 0.74136615, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.81899732, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.529526948928833 + }, + { + "auxiliary_loss_clip": 0.0648791, + "auxiliary_loss_mlp": 0.01284436, + "balance_loss_clip": 0.06292189, + "balance_loss_mlp": 0.01267544, + "epoch": 0.3099353675033819, + "flos": 15273720345600.0, + "grad_norm": 2.7515131151360763, + "language_loss": 0.76419097, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84191442, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16882324, + "step": 5155, + "time_per_iteration": 2.5338993072509766 + }, + { + "auxiliary_loss_clip": 0.06490543, + "auxiliary_loss_mlp": 0.01273122, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01256373, + "epoch": 0.30999549075604993, + "flos": 15419182233600.0, + "grad_norm": 1.684257178792462, + "language_loss": 0.79886794, + "learning_rate": 3.232441120452094e-06, + "loss": 0.87650466, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1673584, + "step": 5156, + "time_per_iteration": 2.5190272331237793 + }, + { + "auxiliary_loss_clip": 0.06493768, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 0.06290715, + "balance_loss_mlp": 0.01264821, + "epoch": 0.3100556140087179, + "flos": 23191106388480.0, + "grad_norm": 2.1803769191775197, + "language_loss": 0.74967813, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82743037, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16625977, + "step": 5157, + "time_per_iteration": 2.59045147895813 + }, + { + "auxiliary_loss_clip": 0.06486322, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06289537, + "balance_loss_mlp": 0.01258921, + "epoch": 0.31011573726138586, + "flos": 25749770664960.0, + "grad_norm": 2.4337865277632065, + "language_loss": 0.69860423, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7762109, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1541748, + "step": 5158, + "time_per_iteration": 4.041999578475952 + }, + { + "auxiliary_loss_clip": 0.06488799, + "auxiliary_loss_mlp": 0.0127365, + "balance_loss_clip": 0.0629247, + "balance_loss_mlp": 0.0125795, + "epoch": 0.3101758605140538, + "flos": 20017541324160.0, + "grad_norm": 2.0387737109261477, + "language_loss": 0.84883308, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92645758, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.15673828, + "step": 5159, + "time_per_iteration": 2.5081369876861572 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127455, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01257002, + "epoch": 0.3102359837667218, + "flos": 19141751249280.0, + "grad_norm": 1.926707434190644, + "language_loss": 0.85498118, + "learning_rate": 3.231213827702462e-06, + "loss": 0.93264508, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17529297, + "step": 5160, + "time_per_iteration": 2.5466468334198 + }, + { + "auxiliary_loss_clip": 0.06486624, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.06291263, + "balance_loss_mlp": 0.01253945, + "epoch": 0.31029610701938976, + "flos": 22270649287680.0, + "grad_norm": 1.6869427612303989, + "language_loss": 0.75787026, + "learning_rate": 3.230906887766584e-06, + "loss": 0.83543712, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.16113281, + "step": 5161, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.06491208, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 0.06289751, + "balance_loss_mlp": 0.01256915, + "epoch": 0.3103562302720577, + "flos": 20810244476160.0, + "grad_norm": 2.463900279304932, + "language_loss": 0.8222912, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.89995265, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.18029785, + "step": 5162, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.06485277, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06289959, + "balance_loss_mlp": 0.01253594, + "epoch": 0.3104163535247257, + "flos": 22350382047360.0, + "grad_norm": 1.4717884967200954, + "language_loss": 0.83087295, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.90841573, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.15423584, + "step": 5163, + "time_per_iteration": 2.542052745819092 + }, + { + "auxiliary_loss_clip": 0.06490193, + "auxiliary_loss_mlp": 0.01271791, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125559, + "epoch": 0.31047647677739365, + "flos": 21695803551360.0, + "grad_norm": 1.756895513371669, + "language_loss": 0.76630449, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84392428, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16186523, + "step": 5164, + "time_per_iteration": 2.5616652965545654 + }, + { + "auxiliary_loss_clip": 0.06486434, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.0628885, + "balance_loss_mlp": 0.01258331, + "epoch": 0.3105366000300616, + "flos": 18923390709120.0, + "grad_norm": 1.866784827400394, + "language_loss": 0.75307393, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83068419, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16271973, + "step": 5165, + "time_per_iteration": 2.5190699100494385 + }, + { + "auxiliary_loss_clip": 0.06483215, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.062862, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3105967232827296, + "flos": 18266380444800.0, + "grad_norm": 1.5432274368627708, + "language_loss": 0.76476973, + "learning_rate": 3.229371488178348e-06, + "loss": 0.84231985, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.16699219, + "step": 5166, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.06486712, + "auxiliary_loss_mlp": 0.01273485, + "balance_loss_clip": 0.06287863, + "balance_loss_mlp": 0.01256796, + "epoch": 0.31065684653539755, + "flos": 17677279514880.0, + "grad_norm": 2.119255684006569, + "language_loss": 0.74129677, + "learning_rate": 3.229064268360444e-06, + "loss": 0.81889874, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.16687012, + "step": 5167, + "time_per_iteration": 2.5039737224578857 + }, + { + "auxiliary_loss_clip": 0.06378125, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06284033, + "balance_loss_mlp": 0.01258356, + "epoch": 0.3107169697880655, + "flos": 68551522151040.0, + "grad_norm": 0.7172817016896729, + "language_loss": 0.53065968, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.60705864, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.03417969, + "step": 5168, + "time_per_iteration": 3.211498737335205 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127061, + "balance_loss_clip": 0.06290184, + "balance_loss_mlp": 0.01254052, + "epoch": 0.3107770930407335, + "flos": 13193844698880.0, + "grad_norm": 1.7226101243088363, + "language_loss": 0.79536855, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87299311, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16552734, + "step": 5169, + "time_per_iteration": 2.526906728744507 + }, + { + "auxiliary_loss_clip": 0.06491011, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.01254328, + "epoch": 0.3108372162934015, + "flos": 31589587048320.0, + "grad_norm": 1.7384868970357352, + "language_loss": 0.6439994, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.17077637, + "step": 5170, + "time_per_iteration": 2.659008264541626 + }, + { + "auxiliary_loss_clip": 0.06488822, + "auxiliary_loss_mlp": 0.01276189, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01258927, + "epoch": 0.31089733954606946, + "flos": 28737231811200.0, + "grad_norm": 2.2754975952460086, + "language_loss": 0.77238673, + "learning_rate": 3.22783492314295e-06, + "loss": 0.8500368, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17260742, + "step": 5171, + "time_per_iteration": 2.5726847648620605 + }, + { + "auxiliary_loss_clip": 0.06489364, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 0.06290348, + "balance_loss_mlp": 0.01258294, + "epoch": 0.3109574627987374, + "flos": 19689455462400.0, + "grad_norm": 1.774750718996553, + "language_loss": 0.84023309, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.91787583, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16625977, + "step": 5172, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.06485899, + "auxiliary_loss_mlp": 0.01271683, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3110175860514054, + "flos": 14689231390080.0, + "grad_norm": 2.444929493076507, + "language_loss": 0.8466565, + "learning_rate": 3.227219971129842e-06, + "loss": 0.92423236, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17199707, + "step": 5173, + "time_per_iteration": 2.477851629257202 + }, + { + "auxiliary_loss_clip": 0.06478094, + "auxiliary_loss_mlp": 0.01270979, + "balance_loss_clip": 0.06285643, + "balance_loss_mlp": 0.01255279, + "epoch": 0.31107770930407336, + "flos": 25746835772160.0, + "grad_norm": 1.6684709759498597, + "language_loss": 0.83928138, + "learning_rate": 3.226912425313001e-06, + "loss": 0.91677213, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 5174, + "time_per_iteration": 2.6188318729400635 + }, + { + "auxiliary_loss_clip": 0.06483682, + "auxiliary_loss_mlp": 0.0127308, + "balance_loss_clip": 0.06284115, + "balance_loss_mlp": 0.01256057, + "epoch": 0.3111378325567413, + "flos": 19214272558080.0, + "grad_norm": 2.0188284806938945, + "language_loss": 0.85820258, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.93577021, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 5175, + "time_per_iteration": 2.489356756210327 + }, + { + "auxiliary_loss_clip": 0.06477995, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3111979558094093, + "flos": 23703199816320.0, + "grad_norm": 1.907748003287586, + "language_loss": 0.84357607, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92110729, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17773438, + "step": 5176, + "time_per_iteration": 2.599229574203491 + }, + { + "auxiliary_loss_clip": 0.06476277, + "auxiliary_loss_mlp": 0.01273206, + "balance_loss_clip": 0.06279132, + "balance_loss_mlp": 0.01255468, + "epoch": 0.31125807906207725, + "flos": 21039422192640.0, + "grad_norm": 2.9714078029027977, + "language_loss": 0.80720133, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.88469613, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.17736816, + "step": 5177, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.06482373, + "auxiliary_loss_mlp": 0.01272639, + "balance_loss_clip": 0.06283157, + "balance_loss_mlp": 0.01255353, + "epoch": 0.3113182023147452, + "flos": 23083435491840.0, + "grad_norm": 1.9531801027744504, + "language_loss": 0.81037831, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.88792837, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17285156, + "step": 5178, + "time_per_iteration": 2.6086864471435547 + }, + { + "auxiliary_loss_clip": 0.06483644, + "auxiliary_loss_mlp": 0.01276661, + "balance_loss_clip": 0.06283852, + "balance_loss_mlp": 0.01259316, + "epoch": 0.3113783255674132, + "flos": 11843919895680.0, + "grad_norm": 1.9055325557306373, + "language_loss": 0.81524587, + "learning_rate": 3.225373998592471e-06, + "loss": 0.89284897, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.17346191, + "step": 5179, + "time_per_iteration": 2.4582295417785645 + }, + { + "auxiliary_loss_clip": 0.06482498, + "auxiliary_loss_mlp": 0.01272412, + "balance_loss_clip": 0.06285708, + "balance_loss_mlp": 0.01255926, + "epoch": 0.31143844882008115, + "flos": 16295098089600.0, + "grad_norm": 1.625598326664227, + "language_loss": 0.78714401, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.86469316, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.16491699, + "step": 5180, + "time_per_iteration": 2.4980807304382324 + }, + { + "auxiliary_loss_clip": 0.06486566, + "auxiliary_loss_mlp": 0.01274849, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3114985720727491, + "flos": 23223824208000.0, + "grad_norm": 4.8505374097148595, + "language_loss": 0.83649975, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91411394, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.17102051, + "step": 5181, + "time_per_iteration": 2.519810438156128 + }, + { + "auxiliary_loss_clip": 0.0648061, + "auxiliary_loss_mlp": 0.01273344, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01258348, + "epoch": 0.3115586953254171, + "flos": 30052468224000.0, + "grad_norm": 1.6592506395593873, + "language_loss": 0.74442661, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.82196611, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.15002441, + "step": 5182, + "time_per_iteration": 2.6227729320526123 + }, + { + "auxiliary_loss_clip": 0.06490366, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06288615, + "balance_loss_mlp": 0.01254362, + "epoch": 0.3116188185780851, + "flos": 25673433995520.0, + "grad_norm": 2.0195817263542852, + "language_loss": 0.70974112, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.78735352, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16503906, + "step": 5183, + "time_per_iteration": 2.5801775455474854 + }, + { + "auxiliary_loss_clip": 0.06369011, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.0126376, + "epoch": 0.31167894183075306, + "flos": 69528568285440.0, + "grad_norm": 0.9410725627351464, + "language_loss": 0.59133947, + "learning_rate": 3.223834410214408e-06, + "loss": 0.66769892, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.03182983, + "step": 5184, + "time_per_iteration": 3.1446807384490967 + }, + { + "auxiliary_loss_clip": 0.06488199, + "auxiliary_loss_mlp": 0.01277241, + "balance_loss_clip": 0.06288702, + "balance_loss_mlp": 0.01260206, + "epoch": 0.31173906508342103, + "flos": 14945215213440.0, + "grad_norm": 2.5697318046341424, + "language_loss": 0.69689488, + "learning_rate": 3.223526353268311e-06, + "loss": 0.77454925, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17041016, + "step": 5185, + "time_per_iteration": 2.51505446434021 + }, + { + "auxiliary_loss_clip": 0.06492566, + "auxiliary_loss_mlp": 0.01273506, + "balance_loss_clip": 0.06291321, + "balance_loss_mlp": 0.01256507, + "epoch": 0.311799188336089, + "flos": 16180886574720.0, + "grad_norm": 2.500262239817252, + "language_loss": 0.63946617, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.71712691, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.17004395, + "step": 5186, + "time_per_iteration": 2.505030870437622 + }, + { + "auxiliary_loss_clip": 0.06492127, + "auxiliary_loss_mlp": 0.01277284, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01258592, + "epoch": 0.31185931158875696, + "flos": 25016633366400.0, + "grad_norm": 2.1681671670490603, + "language_loss": 0.86641979, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.94411391, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18688965, + "step": 5187, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.06487665, + "auxiliary_loss_mlp": 0.01281669, + "balance_loss_clip": 0.06287494, + "balance_loss_mlp": 0.01264527, + "epoch": 0.3119194348414249, + "flos": 37242041702400.0, + "grad_norm": 1.4465041932602023, + "language_loss": 0.6305244, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70821768, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 5188, + "time_per_iteration": 2.7036139965057373 + }, + { + "auxiliary_loss_clip": 0.06486794, + "auxiliary_loss_mlp": 0.01278194, + "balance_loss_clip": 0.06286722, + "balance_loss_mlp": 0.01261397, + "epoch": 0.3119795580940929, + "flos": 15018155792640.0, + "grad_norm": 2.1005201528303683, + "language_loss": 0.83722234, + "learning_rate": 3.222293661638346e-06, + "loss": 0.91487223, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 5189, + "time_per_iteration": 3.933061361312866 + }, + { + "auxiliary_loss_clip": 0.06481164, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01262602, + "epoch": 0.31203968134676086, + "flos": 16003755043200.0, + "grad_norm": 2.4405990352060862, + "language_loss": 0.79429829, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87189662, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.16064453, + "step": 5190, + "time_per_iteration": 2.479335308074951 + }, + { + "auxiliary_loss_clip": 0.0648755, + "auxiliary_loss_mlp": 0.01275874, + "balance_loss_clip": 0.06287287, + "balance_loss_mlp": 0.01259292, + "epoch": 0.3120998045994288, + "flos": 23843378897280.0, + "grad_norm": 1.451249914697294, + "language_loss": 0.75502658, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.83266091, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16589355, + "step": 5191, + "time_per_iteration": 3.997621536254883 + }, + { + "auxiliary_loss_clip": 0.06364973, + "auxiliary_loss_mlp": 0.01267778, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01264178, + "epoch": 0.3121599278520968, + "flos": 69203081900160.0, + "grad_norm": 0.8286054534369729, + "language_loss": 0.63964236, + "learning_rate": 3.221368656205247e-06, + "loss": 0.71596992, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.03594971, + "step": 5192, + "time_per_iteration": 4.631687879562378 + }, + { + "auxiliary_loss_clip": 0.06487048, + "auxiliary_loss_mlp": 0.01274026, + "balance_loss_clip": 0.06284614, + "balance_loss_mlp": 0.01254916, + "epoch": 0.31222005110476475, + "flos": 23813302481280.0, + "grad_norm": 1.6272414578256373, + "language_loss": 0.80280936, + "learning_rate": 3.221060228416446e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.19116211, + "step": 5193, + "time_per_iteration": 2.5469777584075928 + }, + { + "auxiliary_loss_clip": 0.06487141, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06286725, + "balance_loss_mlp": 0.01255244, + "epoch": 0.3122801743574327, + "flos": 25232771773440.0, + "grad_norm": 1.8740192083695482, + "language_loss": 0.72266662, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.80028057, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.19006348, + "step": 5194, + "time_per_iteration": 2.5416929721832275 + }, + { + "auxiliary_loss_clip": 0.06483766, + "auxiliary_loss_mlp": 0.01273792, + "balance_loss_clip": 0.06285778, + "balance_loss_mlp": 0.01257604, + "epoch": 0.3123402976101007, + "flos": 22973165118720.0, + "grad_norm": 1.4810805631902553, + "language_loss": 0.77076054, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.8483361, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16186523, + "step": 5195, + "time_per_iteration": 2.5890305042266846 + }, + { + "auxiliary_loss_clip": 0.06489303, + "auxiliary_loss_mlp": 0.01273064, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256268, + "epoch": 0.3124004208627687, + "flos": 25199131559040.0, + "grad_norm": 1.3828607146804377, + "language_loss": 0.78218812, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85981178, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16796875, + "step": 5196, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.06360652, + "auxiliary_loss_mlp": 0.0126022, + "balance_loss_clip": 0.06268834, + "balance_loss_mlp": 0.01256831, + "epoch": 0.31246054411543667, + "flos": 67506398974080.0, + "grad_norm": 0.7576873975695796, + "language_loss": 0.54860902, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.62481773, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.03396606, + "step": 5197, + "time_per_iteration": 4.588749170303345 + }, + { + "auxiliary_loss_clip": 0.06482677, + "auxiliary_loss_mlp": 0.0127766, + "balance_loss_clip": 0.06286696, + "balance_loss_mlp": 0.01261424, + "epoch": 0.31252066736810463, + "flos": 17864347754880.0, + "grad_norm": 1.7824095594325715, + "language_loss": 0.67078102, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74838442, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.16247559, + "step": 5198, + "time_per_iteration": 2.5304651260375977 + }, + { + "auxiliary_loss_clip": 0.06490927, + "auxiliary_loss_mlp": 0.01280145, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261596, + "epoch": 0.3125807906207726, + "flos": 18480338645760.0, + "grad_norm": 2.4146329055675264, + "language_loss": 0.70401263, + "learning_rate": 3.219208689735857e-06, + "loss": 0.78172338, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 2.5358517169952393 + }, + { + "auxiliary_loss_clip": 0.06486207, + "auxiliary_loss_mlp": 0.01275953, + "balance_loss_clip": 0.06286721, + "balance_loss_mlp": 0.01258751, + "epoch": 0.31264091387344056, + "flos": 18951454627200.0, + "grad_norm": 1.7917967449154466, + "language_loss": 0.79258394, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.87020558, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.17211914, + "step": 5200, + "time_per_iteration": 2.5519278049468994 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 0.06284697, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3127010371261085, + "flos": 21474591972480.0, + "grad_norm": 1.8808343302197998, + "language_loss": 0.83758473, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.91515636, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.16711426, + "step": 5201, + "time_per_iteration": 2.509331226348877 + }, + { + "auxiliary_loss_clip": 0.06487838, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 0.06288306, + "balance_loss_mlp": 0.01262006, + "epoch": 0.3127611603787765, + "flos": 15340623431040.0, + "grad_norm": 2.173524859167814, + "language_loss": 0.69690537, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17224121, + "step": 5202, + "time_per_iteration": 2.52652907371521 + }, + { + "auxiliary_loss_clip": 0.06486704, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06286184, + "balance_loss_mlp": 0.01261257, + "epoch": 0.31282128363144446, + "flos": 17608741274880.0, + "grad_norm": 2.6038382996561604, + "language_loss": 0.83874559, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.91639626, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.17077637, + "step": 5203, + "time_per_iteration": 2.502721071243286 + }, + { + "auxiliary_loss_clip": 0.06488604, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 0.06287186, + "balance_loss_mlp": 0.01256604, + "epoch": 0.3128814068841124, + "flos": 26763349979520.0, + "grad_norm": 2.412675439541041, + "language_loss": 0.61310971, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69073772, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17602539, + "step": 5204, + "time_per_iteration": 2.62591814994812 + }, + { + "auxiliary_loss_clip": 0.06482827, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 0.0628654, + "balance_loss_mlp": 0.01259553, + "epoch": 0.3129415301367804, + "flos": 22278783133440.0, + "grad_norm": 1.7324044566720012, + "language_loss": 0.66418731, + "learning_rate": 3.217355486684887e-06, + "loss": 0.74176717, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.15582275, + "step": 5205, + "time_per_iteration": 2.512777328491211 + }, + { + "auxiliary_loss_clip": 0.06487758, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 0.06287788, + "balance_loss_mlp": 0.01260021, + "epoch": 0.31300165338944835, + "flos": 26471461881600.0, + "grad_norm": 1.8344199627772577, + "language_loss": 0.77298087, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.85063475, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17614746, + "step": 5206, + "time_per_iteration": 2.5712244510650635 + }, + { + "auxiliary_loss_clip": 0.06485735, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06288184, + "balance_loss_mlp": 0.01255488, + "epoch": 0.3130617766421163, + "flos": 21951116542080.0, + "grad_norm": 2.0121384013718226, + "language_loss": 0.83184564, + "learning_rate": 3.216737382911672e-06, + "loss": 0.90941995, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16210938, + "step": 5207, + "time_per_iteration": 2.5004825592041016 + }, + { + "auxiliary_loss_clip": 0.06481713, + "auxiliary_loss_mlp": 0.01271341, + "balance_loss_clip": 0.06286129, + "balance_loss_mlp": 0.0125489, + "epoch": 0.3131218998947843, + "flos": 23299154628480.0, + "grad_norm": 2.0890442442793478, + "language_loss": 0.71795774, + "learning_rate": 3.216428261810999e-06, + "loss": 0.79548824, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 5208, + "time_per_iteration": 2.5763585567474365 + }, + { + "auxiliary_loss_clip": 0.06485837, + "auxiliary_loss_mlp": 0.01275661, + "balance_loss_clip": 0.06287587, + "balance_loss_mlp": 0.0125927, + "epoch": 0.3131820231474523, + "flos": 21145583715840.0, + "grad_norm": 1.890905451265213, + "language_loss": 0.74832964, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82594466, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.1640625, + "step": 5209, + "time_per_iteration": 2.510582685470581 + }, + { + "auxiliary_loss_clip": 0.06483819, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06284019, + "balance_loss_mlp": 0.01255678, + "epoch": 0.31324214640012027, + "flos": 23915816352000.0, + "grad_norm": 1.8368712630160764, + "language_loss": 0.77846575, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.85602105, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16027832, + "step": 5210, + "time_per_iteration": 2.5457394123077393 + }, + { + "auxiliary_loss_clip": 0.06472643, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01262963, + "epoch": 0.31330226965278823, + "flos": 22243507764480.0, + "grad_norm": 1.7690758446531836, + "language_loss": 0.79563594, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.87314838, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15643311, + "step": 5211, + "time_per_iteration": 2.5383517742156982 + }, + { + "auxiliary_loss_clip": 0.0648172, + "auxiliary_loss_mlp": 0.01270065, + "balance_loss_clip": 0.06285914, + "balance_loss_mlp": 0.01254699, + "epoch": 0.3133623929054562, + "flos": 19759838492160.0, + "grad_norm": 1.6892345584465767, + "language_loss": 0.79993588, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.87745374, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.15368652, + "step": 5212, + "time_per_iteration": 2.5550856590270996 + }, + { + "auxiliary_loss_clip": 0.06489062, + "auxiliary_loss_mlp": 0.01276168, + "balance_loss_clip": 0.06287421, + "balance_loss_mlp": 0.01258919, + "epoch": 0.31342251615812416, + "flos": 27169617300480.0, + "grad_norm": 2.030797991853156, + "language_loss": 0.71651685, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.79416913, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.17248535, + "step": 5213, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.06486979, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06285015, + "balance_loss_mlp": 0.01258763, + "epoch": 0.31348263941079213, + "flos": 20235985718400.0, + "grad_norm": 2.164105834219518, + "language_loss": 0.77949297, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.85711956, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16918945, + "step": 5214, + "time_per_iteration": 2.539149761199951 + }, + { + "auxiliary_loss_clip": 0.06478322, + "auxiliary_loss_mlp": 0.0127674, + "balance_loss_clip": 0.06285194, + "balance_loss_mlp": 0.01261267, + "epoch": 0.3135427626634601, + "flos": 24614474895360.0, + "grad_norm": 1.5354860146289633, + "language_loss": 0.82935429, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90690494, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.15472412, + "step": 5215, + "time_per_iteration": 2.541269302368164 + }, + { + "auxiliary_loss_clip": 0.06486098, + "auxiliary_loss_mlp": 0.01273565, + "balance_loss_clip": 0.06288007, + "balance_loss_mlp": 0.01257186, + "epoch": 0.31360288591612806, + "flos": 20966230051200.0, + "grad_norm": 1.8278899125375987, + "language_loss": 0.79790628, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87550294, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16369629, + "step": 5216, + "time_per_iteration": 2.5465261936187744 + }, + { + "auxiliary_loss_clip": 0.06489767, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.06286536, + "balance_loss_mlp": 0.01258722, + "epoch": 0.313663009168796, + "flos": 26987957648640.0, + "grad_norm": 1.8964979694160957, + "language_loss": 0.68953168, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76720947, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.19299316, + "step": 5217, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.06480299, + "auxiliary_loss_mlp": 0.01275451, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01259298, + "epoch": 0.313723132421464, + "flos": 18046762093440.0, + "grad_norm": 1.6389262097165689, + "language_loss": 0.80772746, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.88528496, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16149902, + "step": 5218, + "time_per_iteration": 2.5255727767944336 + }, + { + "auxiliary_loss_clip": 0.06485314, + "auxiliary_loss_mlp": 0.0127641, + "balance_loss_clip": 0.06285116, + "balance_loss_mlp": 0.01259363, + "epoch": 0.31378325567413196, + "flos": 22494963467520.0, + "grad_norm": 2.253901481236794, + "language_loss": 0.70057523, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.17047119, + "step": 5219, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.06483484, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06284904, + "balance_loss_mlp": 0.01259181, + "epoch": 0.3138433789267999, + "flos": 22425838248960.0, + "grad_norm": 1.9320324134388631, + "language_loss": 0.80156839, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.17736816, + "step": 5220, + "time_per_iteration": 2.5364530086517334 + }, + { + "auxiliary_loss_clip": 0.06484166, + "auxiliary_loss_mlp": 0.01276534, + "balance_loss_clip": 0.06287254, + "balance_loss_mlp": 0.01260751, + "epoch": 0.3139035021794679, + "flos": 13010927235840.0, + "grad_norm": 1.8390249578816682, + "language_loss": 0.73235905, + "learning_rate": 3.212405494206986e-06, + "loss": 0.80996603, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.15771484, + "step": 5221, + "time_per_iteration": 2.477369546890259 + }, + { + "auxiliary_loss_clip": 0.06480553, + "auxiliary_loss_mlp": 0.0127616, + "balance_loss_clip": 0.0628504, + "balance_loss_mlp": 0.0125996, + "epoch": 0.31396362543213585, + "flos": 16951605229440.0, + "grad_norm": 1.9354629264259422, + "language_loss": 0.81906354, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.89663064, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16223145, + "step": 5222, + "time_per_iteration": 2.5057129859924316 + }, + { + "auxiliary_loss_clip": 0.06490297, + "auxiliary_loss_mlp": 0.01284294, + "balance_loss_clip": 0.06288279, + "balance_loss_mlp": 0.01266555, + "epoch": 0.31402374868480387, + "flos": 20162877431040.0, + "grad_norm": 1.9084075298763516, + "language_loss": 0.70490289, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.78264874, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17749023, + "step": 5223, + "time_per_iteration": 2.4747233390808105 + }, + { + "auxiliary_loss_clip": 0.06484593, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06287414, + "balance_loss_mlp": 0.01259718, + "epoch": 0.31408387193747184, + "flos": 21257363462400.0, + "grad_norm": 1.5262001080385015, + "language_loss": 0.80608702, + "learning_rate": 3.211476058893379e-06, + "loss": 0.88369542, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.1652832, + "step": 5224, + "time_per_iteration": 2.576864004135132 + }, + { + "auxiliary_loss_clip": 0.06497495, + "auxiliary_loss_mlp": 0.01279621, + "balance_loss_clip": 0.06291461, + "balance_loss_mlp": 0.01261632, + "epoch": 0.3141439951901398, + "flos": 27490617492480.0, + "grad_norm": 2.962077450034062, + "language_loss": 0.58624607, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66401726, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17993164, + "step": 5225, + "time_per_iteration": 2.558159828186035 + }, + { + "auxiliary_loss_clip": 0.06482717, + "auxiliary_loss_mlp": 0.0128044, + "balance_loss_clip": 0.06289019, + "balance_loss_mlp": 0.0126505, + "epoch": 0.31420411844280777, + "flos": 17857010522880.0, + "grad_norm": 1.7568792542410607, + "language_loss": 0.81975454, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.89738619, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.15380859, + "step": 5226, + "time_per_iteration": 2.5197925567626953 + }, + { + "auxiliary_loss_clip": 0.06493273, + "auxiliary_loss_mlp": 0.01283534, + "balance_loss_clip": 0.0629416, + "balance_loss_mlp": 0.01265998, + "epoch": 0.31426424169547573, + "flos": 21623491877760.0, + "grad_norm": 1.9094319640845634, + "language_loss": 0.74358761, + "learning_rate": 3.210546210126141e-06, + "loss": 0.8213557, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17529297, + "step": 5227, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.06493893, + "auxiliary_loss_mlp": 0.01287677, + "balance_loss_clip": 0.0629607, + "balance_loss_mlp": 0.01270392, + "epoch": 0.3143243649481437, + "flos": 30928677569280.0, + "grad_norm": 1.9492252245216757, + "language_loss": 0.68802202, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76583767, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.17297363, + "step": 5228, + "time_per_iteration": 2.724705934524536 + }, + { + "auxiliary_loss_clip": 0.06488988, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01265044, + "epoch": 0.31438448820081166, + "flos": 22828206355200.0, + "grad_norm": 1.7089427628420442, + "language_loss": 0.80276144, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88046199, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16015625, + "step": 5229, + "time_per_iteration": 4.091265678405762 + }, + { + "auxiliary_loss_clip": 0.06481495, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.01259428, + "epoch": 0.3144446114534796, + "flos": 23298399941760.0, + "grad_norm": 1.658320923858175, + "language_loss": 0.70112014, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7786932, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.16381836, + "step": 5230, + "time_per_iteration": 2.5652499198913574 + }, + { + "auxiliary_loss_clip": 0.06489812, + "auxiliary_loss_mlp": 0.01281571, + "balance_loss_clip": 0.06291179, + "balance_loss_mlp": 0.01264572, + "epoch": 0.3145047347061476, + "flos": 31363679640960.0, + "grad_norm": 2.930398163442548, + "language_loss": 0.80236816, + "learning_rate": 3.209305769168239e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.17004395, + "step": 5231, + "time_per_iteration": 5.461926698684692 + }, + { + "auxiliary_loss_clip": 0.06483024, + "auxiliary_loss_mlp": 0.01279077, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01262912, + "epoch": 0.31456485795881556, + "flos": 10894182992640.0, + "grad_norm": 3.377505802107346, + "language_loss": 0.85102671, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92864776, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16149902, + "step": 5232, + "time_per_iteration": 2.549555778503418 + }, + { + "auxiliary_loss_clip": 0.06479923, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01269779, + "epoch": 0.3146249812114835, + "flos": 17098157220480.0, + "grad_norm": 1.5771176865385883, + "language_loss": 0.80666757, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.88433212, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5233, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.06496342, + "auxiliary_loss_mlp": 0.01276742, + "balance_loss_clip": 0.06294576, + "balance_loss_mlp": 0.01260768, + "epoch": 0.3146851044641515, + "flos": 55303283352960.0, + "grad_norm": 1.6501859452394316, + "language_loss": 0.71124518, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.78897607, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15966797, + "step": 5234, + "time_per_iteration": 2.8301026821136475 + }, + { + "auxiliary_loss_clip": 0.06491733, + "auxiliary_loss_mlp": 0.01276589, + "balance_loss_clip": 0.06292239, + "balance_loss_mlp": 0.01259566, + "epoch": 0.31474522771681945, + "flos": 27023149163520.0, + "grad_norm": 1.9231261360365097, + "language_loss": 0.73437119, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8120544, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17004395, + "step": 5235, + "time_per_iteration": 2.543799638748169 + }, + { + "auxiliary_loss_clip": 0.0648193, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.0628682, + "balance_loss_mlp": 0.01259308, + "epoch": 0.3148053509694875, + "flos": 21258369711360.0, + "grad_norm": 1.9283939280374622, + "language_loss": 0.79554284, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.87311482, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.15942383, + "step": 5236, + "time_per_iteration": 2.5356431007385254 + }, + { + "auxiliary_loss_clip": 0.06493077, + "auxiliary_loss_mlp": 0.01277667, + "balance_loss_clip": 0.06288847, + "balance_loss_mlp": 0.01260942, + "epoch": 0.31486547422215544, + "flos": 31256721504000.0, + "grad_norm": 2.880510555000243, + "language_loss": 0.76337612, + "learning_rate": 3.207443732256881e-06, + "loss": 0.84108353, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16723633, + "step": 5237, + "time_per_iteration": 4.129598379135132 + }, + { + "auxiliary_loss_clip": 0.0648271, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06291585, + "balance_loss_mlp": 0.01262843, + "epoch": 0.3149255974748234, + "flos": 19834749642240.0, + "grad_norm": 1.6736027402410734, + "language_loss": 0.7951014, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.87270594, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.14916992, + "step": 5238, + "time_per_iteration": 2.504612445831299 + }, + { + "auxiliary_loss_clip": 0.06376656, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06282751, + "balance_loss_mlp": 0.01263604, + "epoch": 0.31498572072749137, + "flos": 67701867350400.0, + "grad_norm": 0.8276402478045692, + "language_loss": 0.68007928, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.75652325, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.04141235, + "step": 5239, + "time_per_iteration": 3.174287796020508 + }, + { + "auxiliary_loss_clip": 0.06498836, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 0.06292844, + "balance_loss_mlp": 0.01256376, + "epoch": 0.31504584398015933, + "flos": 19799432346240.0, + "grad_norm": 2.176171670908613, + "language_loss": 0.82951081, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.9072417, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17883301, + "step": 5240, + "time_per_iteration": 2.509793996810913 + }, + { + "auxiliary_loss_clip": 0.06485248, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06288239, + "balance_loss_mlp": 0.01262125, + "epoch": 0.3151059672328273, + "flos": 26622751628160.0, + "grad_norm": 1.8077188253124041, + "language_loss": 0.81193888, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88957721, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.16455078, + "step": 5241, + "time_per_iteration": 2.571192502975464 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01277268, + "balance_loss_clip": 0.06291743, + "balance_loss_mlp": 0.01260912, + "epoch": 0.31516609048549526, + "flos": 24210890904960.0, + "grad_norm": 1.4478120037649602, + "language_loss": 0.74484038, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82243454, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16357422, + "step": 5242, + "time_per_iteration": 2.526357650756836 + }, + { + "auxiliary_loss_clip": 0.06487267, + "auxiliary_loss_mlp": 0.01275494, + "balance_loss_clip": 0.06292535, + "balance_loss_mlp": 0.01259163, + "epoch": 0.31522621373816323, + "flos": 25965950999040.0, + "grad_norm": 1.6442244241642663, + "language_loss": 0.73668325, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81431091, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.16320801, + "step": 5243, + "time_per_iteration": 2.606276273727417 + }, + { + "auxiliary_loss_clip": 0.06485401, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.0628818, + "balance_loss_mlp": 0.0125713, + "epoch": 0.3152863369908312, + "flos": 21915379975680.0, + "grad_norm": 1.7357669101009914, + "language_loss": 0.64914608, + "learning_rate": 3.205269272758513e-06, + "loss": 0.72673857, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16711426, + "step": 5244, + "time_per_iteration": 2.5950305461883545 + }, + { + "auxiliary_loss_clip": 0.06492754, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 0.06292984, + "balance_loss_mlp": 0.01257743, + "epoch": 0.31534646024349916, + "flos": 16285203308160.0, + "grad_norm": 2.8540583379791005, + "language_loss": 0.91357732, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.99124765, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16540527, + "step": 5245, + "time_per_iteration": 2.510085105895996 + }, + { + "auxiliary_loss_clip": 0.06488977, + "auxiliary_loss_mlp": 0.01277309, + "balance_loss_clip": 0.06291293, + "balance_loss_mlp": 0.01260596, + "epoch": 0.3154065834961671, + "flos": 24724116362880.0, + "grad_norm": 1.9445780779956967, + "language_loss": 0.75699973, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.83466256, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.1673584, + "step": 5246, + "time_per_iteration": 2.543600559234619 + }, + { + "auxiliary_loss_clip": 0.06488622, + "auxiliary_loss_mlp": 0.01279725, + "balance_loss_clip": 0.06290317, + "balance_loss_mlp": 0.01262833, + "epoch": 0.3154667067488351, + "flos": 35379813836160.0, + "grad_norm": 1.6152414177037249, + "language_loss": 0.61608225, + "learning_rate": 3.204336675750321e-06, + "loss": 0.69376576, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16882324, + "step": 5247, + "time_per_iteration": 2.6849827766418457 + }, + { + "auxiliary_loss_clip": 0.06491058, + "auxiliary_loss_mlp": 0.01281873, + "balance_loss_clip": 0.06290263, + "balance_loss_mlp": 0.0126417, + "epoch": 0.31552683000150306, + "flos": 17462105429760.0, + "grad_norm": 2.6938697298202667, + "language_loss": 0.82848823, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.90621758, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.17687988, + "step": 5248, + "time_per_iteration": 2.4956586360931396 + }, + { + "auxiliary_loss_clip": 0.06488842, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06291078, + "balance_loss_mlp": 0.01264121, + "epoch": 0.3155869532541711, + "flos": 18411674624640.0, + "grad_norm": 4.654519722073602, + "language_loss": 0.85721719, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.93492711, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.18029785, + "step": 5249, + "time_per_iteration": 2.568054437637329 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 0.06290483, + "balance_loss_mlp": 0.01261198, + "epoch": 0.31564707650683904, + "flos": 21586162083840.0, + "grad_norm": 1.7795262086342007, + "language_loss": 0.86067384, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.93837023, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1887207, + "step": 5250, + "time_per_iteration": 2.508528709411621 + }, + { + "auxiliary_loss_clip": 0.06486481, + "auxiliary_loss_mlp": 0.01279989, + "balance_loss_clip": 0.06289366, + "balance_loss_mlp": 0.01262334, + "epoch": 0.315707199759507, + "flos": 21037032351360.0, + "grad_norm": 2.1261014211455063, + "language_loss": 0.6942147, + "learning_rate": 3.203092573767835e-06, + "loss": 0.77187943, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1763916, + "step": 5251, + "time_per_iteration": 2.526685953140259 + }, + { + "auxiliary_loss_clip": 0.06487083, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06288725, + "balance_loss_mlp": 0.01255586, + "epoch": 0.31576732301217497, + "flos": 26835326236800.0, + "grad_norm": 2.019211823887184, + "language_loss": 0.78895354, + "learning_rate": 3.202781434189246e-06, + "loss": 0.86655623, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17602539, + "step": 5252, + "time_per_iteration": 2.570160150527954 + }, + { + "auxiliary_loss_clip": 0.06486022, + "auxiliary_loss_mlp": 0.01277329, + "balance_loss_clip": 0.06289184, + "balance_loss_mlp": 0.01261664, + "epoch": 0.31582744626484294, + "flos": 22717810200960.0, + "grad_norm": 1.5436537660689573, + "language_loss": 0.74377203, + "learning_rate": 3.202470249001066e-06, + "loss": 0.82140553, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.15661621, + "step": 5253, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.06489179, + "auxiliary_loss_mlp": 0.01281773, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01264309, + "epoch": 0.3158875695175109, + "flos": 23958806296320.0, + "grad_norm": 1.6773864910066614, + "language_loss": 0.73971915, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 5254, + "time_per_iteration": 2.588543653488159 + }, + { + "auxiliary_loss_clip": 0.06491473, + "auxiliary_loss_mlp": 0.01275265, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01257408, + "epoch": 0.31594769277017887, + "flos": 13267036840320.0, + "grad_norm": 2.7381317978754933, + "language_loss": 0.78115344, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85882092, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17858887, + "step": 5255, + "time_per_iteration": 2.5159435272216797 + }, + { + "auxiliary_loss_clip": 0.0648552, + "auxiliary_loss_mlp": 0.01275031, + "balance_loss_clip": 0.06288838, + "balance_loss_mlp": 0.01255921, + "epoch": 0.31600781602284683, + "flos": 23375072027520.0, + "grad_norm": 2.9601180138118286, + "language_loss": 0.78838313, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.86598861, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.19104004, + "step": 5256, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.06480406, + "auxiliary_loss_mlp": 0.01272902, + "balance_loss_clip": 0.06291319, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3160679392755148, + "flos": 19834707715200.0, + "grad_norm": 1.443888473305352, + "language_loss": 0.71476674, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.79229981, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15533447, + "step": 5257, + "time_per_iteration": 2.515044927597046 + }, + { + "auxiliary_loss_clip": 0.06490695, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 0.06291541, + "balance_loss_mlp": 0.01257787, + "epoch": 0.31612806252818276, + "flos": 20199368684160.0, + "grad_norm": 3.1125237193001967, + "language_loss": 0.77181315, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.84947205, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17419434, + "step": 5258, + "time_per_iteration": 2.544926166534424 + }, + { + "auxiliary_loss_clip": 0.06484105, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 0.06286652, + "balance_loss_mlp": 0.01258624, + "epoch": 0.31618818578085073, + "flos": 24241596226560.0, + "grad_norm": 2.554871248122792, + "language_loss": 0.73012489, + "learning_rate": 3.200602180731467e-06, + "loss": 0.80772901, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.17675781, + "step": 5259, + "time_per_iteration": 2.5244109630584717 + }, + { + "auxiliary_loss_clip": 0.06490766, + "auxiliary_loss_mlp": 0.01272581, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01256106, + "epoch": 0.3162483090335187, + "flos": 25088735404800.0, + "grad_norm": 2.502439629336286, + "language_loss": 0.66774327, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74537671, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16455078, + "step": 5260, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.06481651, + "auxiliary_loss_mlp": 0.01272837, + "balance_loss_clip": 0.06285223, + "balance_loss_mlp": 0.01256386, + "epoch": 0.31630843228618666, + "flos": 26330653895040.0, + "grad_norm": 2.0766337978972023, + "language_loss": 0.72817439, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80571926, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16455078, + "step": 5261, + "time_per_iteration": 2.559112548828125 + }, + { + "auxiliary_loss_clip": 0.06366719, + "auxiliary_loss_mlp": 0.01254616, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01250792, + "epoch": 0.3163685555388547, + "flos": 66780053856000.0, + "grad_norm": 0.7132570662369885, + "language_loss": 0.50697625, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.58318961, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03817749, + "step": 5262, + "time_per_iteration": 3.1381468772888184 + }, + { + "auxiliary_loss_clip": 0.06487425, + "auxiliary_loss_mlp": 0.01273056, + "balance_loss_clip": 0.06289163, + "balance_loss_mlp": 0.01256224, + "epoch": 0.31642867879152264, + "flos": 26002987303680.0, + "grad_norm": 1.713052875923359, + "language_loss": 0.85966682, + "learning_rate": 3.19935589118856e-06, + "loss": 0.9372716, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.16833496, + "step": 5263, + "time_per_iteration": 2.5385844707489014 + }, + { + "auxiliary_loss_clip": 0.0647549, + "auxiliary_loss_mlp": 0.01273956, + "balance_loss_clip": 0.06283621, + "balance_loss_mlp": 0.01257695, + "epoch": 0.3164888020441906, + "flos": 25781943432960.0, + "grad_norm": 1.4697461293234868, + "language_loss": 0.82077682, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.89827132, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.16247559, + "step": 5264, + "time_per_iteration": 2.558708429336548 + }, + { + "auxiliary_loss_clip": 0.06488511, + "auxiliary_loss_mlp": 0.01271533, + "balance_loss_clip": 0.06288397, + "balance_loss_mlp": 0.01254117, + "epoch": 0.3165489252968586, + "flos": 19762437968640.0, + "grad_norm": 1.8601211050375244, + "language_loss": 0.80259931, + "learning_rate": 3.19873247349167e-06, + "loss": 0.88019973, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17419434, + "step": 5265, + "time_per_iteration": 2.492342948913574 + }, + { + "auxiliary_loss_clip": 0.06481829, + "auxiliary_loss_mlp": 0.01275233, + "balance_loss_clip": 0.06283312, + "balance_loss_mlp": 0.01257148, + "epoch": 0.31660904854952654, + "flos": 23190393628800.0, + "grad_norm": 2.032053662698869, + "language_loss": 0.75410831, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.83167893, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1809082, + "step": 5266, + "time_per_iteration": 2.5563931465148926 + }, + { + "auxiliary_loss_clip": 0.06488708, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 0.06287502, + "balance_loss_mlp": 0.01258308, + "epoch": 0.3166691718021945, + "flos": 20414081571840.0, + "grad_norm": 2.020882594632444, + "language_loss": 0.79489279, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.87254804, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.18518066, + "step": 5267, + "time_per_iteration": 2.509413242340088 + }, + { + "auxiliary_loss_clip": 0.06371635, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01251359, + "epoch": 0.31672929505486247, + "flos": 70165816185600.0, + "grad_norm": 1.145238273522293, + "language_loss": 0.57623893, + "learning_rate": 3.197797006055478e-06, + "loss": 0.65250397, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03518677, + "step": 5268, + "time_per_iteration": 4.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.06486145, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06287054, + "balance_loss_mlp": 0.01253884, + "epoch": 0.31678941830753043, + "flos": 14360977820160.0, + "grad_norm": 2.2953322915245784, + "language_loss": 0.73492396, + "learning_rate": 3.197485092719815e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.17651367, + "step": 5269, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.06490922, + "auxiliary_loss_mlp": 0.01279355, + "balance_loss_clip": 0.06295022, + "balance_loss_mlp": 0.01261652, + "epoch": 0.3168495415601984, + "flos": 22754385308160.0, + "grad_norm": 1.8930521062253438, + "language_loss": 0.80391312, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.88161588, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.17700195, + "step": 5270, + "time_per_iteration": 4.030852794647217 + }, + { + "auxiliary_loss_clip": 0.0648749, + "auxiliary_loss_mlp": 0.01275027, + "balance_loss_clip": 0.06288311, + "balance_loss_mlp": 0.01257742, + "epoch": 0.31690966481286637, + "flos": 20120558319360.0, + "grad_norm": 2.0275703030815744, + "language_loss": 0.79860884, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87623405, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17285156, + "step": 5271, + "time_per_iteration": 3.963491201400757 + }, + { + "auxiliary_loss_clip": 0.06485552, + "auxiliary_loss_mlp": 0.01274595, + "balance_loss_clip": 0.06286864, + "balance_loss_mlp": 0.01256344, + "epoch": 0.31696978806553433, + "flos": 21185345278080.0, + "grad_norm": 2.0532864997035616, + "language_loss": 0.7348994, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.18237305, + "step": 5272, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01275072, + "balance_loss_clip": 0.06294467, + "balance_loss_mlp": 0.01255629, + "epoch": 0.3170299113182023, + "flos": 43007030789760.0, + "grad_norm": 2.3636013379780083, + "language_loss": 0.69916022, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.19458008, + "step": 5273, + "time_per_iteration": 2.8313193321228027 + }, + { + "auxiliary_loss_clip": 0.0648469, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06288255, + "balance_loss_mlp": 0.01255954, + "epoch": 0.31709003457087026, + "flos": 24466707020160.0, + "grad_norm": 3.373298123766896, + "language_loss": 0.68486917, + "learning_rate": 3.195924845146795e-06, + "loss": 0.76244098, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 5274, + "time_per_iteration": 2.5647053718566895 + }, + { + "auxiliary_loss_clip": 0.06486842, + "auxiliary_loss_mlp": 0.01272159, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.01256114, + "epoch": 0.3171501578235382, + "flos": 24142394592000.0, + "grad_norm": 1.437173314012816, + "language_loss": 0.8105545, + "learning_rate": 3.195612659536081e-06, + "loss": 0.88814449, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.16052246, + "step": 5275, + "time_per_iteration": 2.545689821243286 + }, + { + "auxiliary_loss_clip": 0.06496362, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.0629561, + "balance_loss_mlp": 0.01254296, + "epoch": 0.31721028107620625, + "flos": 18885641644800.0, + "grad_norm": 1.7797970991839078, + "language_loss": 0.73459136, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81228, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18212891, + "step": 5276, + "time_per_iteration": 3.978994131088257 + }, + { + "auxiliary_loss_clip": 0.06480486, + "auxiliary_loss_mlp": 0.01276369, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01259811, + "epoch": 0.3172704043288742, + "flos": 23154405500160.0, + "grad_norm": 1.4192945576637652, + "language_loss": 0.78409082, + "learning_rate": 3.194988152313236e-06, + "loss": 0.86165935, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.16552734, + "step": 5277, + "time_per_iteration": 2.6181840896606445 + }, + { + "auxiliary_loss_clip": 0.06493685, + "auxiliary_loss_mlp": 0.01273951, + "balance_loss_clip": 0.06294833, + "balance_loss_mlp": 0.01256653, + "epoch": 0.3173305275815422, + "flos": 17864347754880.0, + "grad_norm": 1.9934204528772321, + "language_loss": 0.79709554, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87477195, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17297363, + "step": 5278, + "time_per_iteration": 2.4955894947052 + }, + { + "auxiliary_loss_clip": 0.06380783, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01260886, + "epoch": 0.31739065083421014, + "flos": 59988083529600.0, + "grad_norm": 0.841903886868049, + "language_loss": 0.62797457, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.7044335, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.04226685, + "step": 5279, + "time_per_iteration": 2.920987367630005 + }, + { + "auxiliary_loss_clip": 0.06489395, + "auxiliary_loss_mlp": 0.01285376, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01265265, + "epoch": 0.3174507740868781, + "flos": 23807013425280.0, + "grad_norm": 2.0709232065681475, + "language_loss": 0.81487882, + "learning_rate": 3.194051051653053e-06, + "loss": 0.89262652, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.2010498, + "step": 5280, + "time_per_iteration": 2.537612199783325 + }, + { + "auxiliary_loss_clip": 0.06483282, + "auxiliary_loss_mlp": 0.01281645, + "balance_loss_clip": 0.06291374, + "balance_loss_mlp": 0.01264276, + "epoch": 0.31751089733954607, + "flos": 27646728848640.0, + "grad_norm": 1.437826441265799, + "language_loss": 0.78464299, + "learning_rate": 3.19373859419346e-06, + "loss": 0.86229229, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.17358398, + "step": 5281, + "time_per_iteration": 2.6482186317443848 + }, + { + "auxiliary_loss_clip": 0.06485789, + "auxiliary_loss_mlp": 0.01283007, + "balance_loss_clip": 0.06290175, + "balance_loss_mlp": 0.01265424, + "epoch": 0.31757102059221404, + "flos": 23776098468480.0, + "grad_norm": 1.5338111796323235, + "language_loss": 0.78882301, + "learning_rate": 3.193426091467179e-06, + "loss": 0.86651099, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17590332, + "step": 5282, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.06494205, + "auxiliary_loss_mlp": 0.01276135, + "balance_loss_clip": 0.0629286, + "balance_loss_mlp": 0.01258373, + "epoch": 0.317631143844882, + "flos": 25271485159680.0, + "grad_norm": 2.0006947857157753, + "language_loss": 0.67952389, + "learning_rate": 3.193113543486061e-06, + "loss": 0.7572273, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1776123, + "step": 5283, + "time_per_iteration": 2.565925359725952 + }, + { + "auxiliary_loss_clip": 0.06373101, + "auxiliary_loss_mlp": 0.01271528, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01267352, + "epoch": 0.31769126709754997, + "flos": 55841832743040.0, + "grad_norm": 0.7241871595116953, + "language_loss": 0.52631503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.60276127, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04177856, + "step": 5284, + "time_per_iteration": 3.1037213802337646 + }, + { + "auxiliary_loss_clip": 0.0649649, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 0.06291351, + "balance_loss_mlp": 0.01257225, + "epoch": 0.31775139035021793, + "flos": 16696124530560.0, + "grad_norm": 2.2460762000689294, + "language_loss": 0.70842284, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.78613091, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.17102051, + "step": 5285, + "time_per_iteration": 2.5407655239105225 + }, + { + "auxiliary_loss_clip": 0.06366412, + "auxiliary_loss_mlp": 0.01262401, + "balance_loss_clip": 0.06274283, + "balance_loss_mlp": 0.01258384, + "epoch": 0.3178115136028859, + "flos": 64246141261440.0, + "grad_norm": 1.0137073922687154, + "language_loss": 0.60545647, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.68174458, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04016113, + "step": 5286, + "time_per_iteration": 3.1833202838897705 + }, + { + "auxiliary_loss_clip": 0.06498363, + "auxiliary_loss_mlp": 0.01284909, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01267051, + "epoch": 0.31787163685555386, + "flos": 18703395014400.0, + "grad_norm": 1.7319286904547555, + "language_loss": 0.72404122, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80187392, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17871094, + "step": 5287, + "time_per_iteration": 2.50571608543396 + }, + { + "auxiliary_loss_clip": 0.06495041, + "auxiliary_loss_mlp": 0.01276683, + "balance_loss_clip": 0.06292516, + "balance_loss_mlp": 0.012578, + "epoch": 0.31793176010822183, + "flos": 21331184509440.0, + "grad_norm": 1.978321388726588, + "language_loss": 0.76231503, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84003228, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.18884277, + "step": 5288, + "time_per_iteration": 2.5568416118621826 + }, + { + "auxiliary_loss_clip": 0.06485806, + "auxiliary_loss_mlp": 0.01283528, + "balance_loss_clip": 0.06293501, + "balance_loss_mlp": 0.01267816, + "epoch": 0.31799188336088985, + "flos": 20964846458880.0, + "grad_norm": 1.7076221862053031, + "language_loss": 0.88265222, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.96034551, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.15710449, + "step": 5289, + "time_per_iteration": 2.5359349250793457 + }, + { + "auxiliary_loss_clip": 0.06488061, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 0.06295781, + "balance_loss_mlp": 0.01269724, + "epoch": 0.3180520066135578, + "flos": 22498485338880.0, + "grad_norm": 1.4069348748047803, + "language_loss": 0.68210149, + "learning_rate": 3.190924441478572e-06, + "loss": 0.75984859, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16906738, + "step": 5290, + "time_per_iteration": 2.5393311977386475 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.0128386, + "balance_loss_clip": 0.06290419, + "balance_loss_mlp": 0.01265788, + "epoch": 0.3181121298662258, + "flos": 27242725587840.0, + "grad_norm": 3.4346413288346, + "language_loss": 0.79944348, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.87722754, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18066406, + "step": 5291, + "time_per_iteration": 2.564091444015503 + }, + { + "auxiliary_loss_clip": 0.06485635, + "auxiliary_loss_mlp": 0.01278435, + "balance_loss_clip": 0.06287642, + "balance_loss_mlp": 0.01259361, + "epoch": 0.31817225311889374, + "flos": 23185991289600.0, + "grad_norm": 2.0451390273410004, + "language_loss": 0.79931051, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.87695122, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.19067383, + "step": 5292, + "time_per_iteration": 2.743156671524048 + }, + { + "auxiliary_loss_clip": 0.06476898, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06287324, + "balance_loss_mlp": 0.01258044, + "epoch": 0.3182323763715617, + "flos": 23265598268160.0, + "grad_norm": 1.819133879513315, + "language_loss": 0.75602406, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8335436, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17004395, + "step": 5293, + "time_per_iteration": 2.523386001586914 + }, + { + "auxiliary_loss_clip": 0.06482453, + "auxiliary_loss_mlp": 0.01276012, + "balance_loss_clip": 0.06290737, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3182924996242297, + "flos": 29023292050560.0, + "grad_norm": 2.0524562129349526, + "language_loss": 0.75145984, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82904446, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15808105, + "step": 5294, + "time_per_iteration": 2.607849597930908 + }, + { + "auxiliary_loss_clip": 0.06489888, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259201, + "epoch": 0.31835262287689764, + "flos": 20455478288640.0, + "grad_norm": 2.029675905915872, + "language_loss": 0.76497674, + "learning_rate": 3.189359442151152e-06, + "loss": 0.84265351, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.18591309, + "step": 5295, + "time_per_iteration": 2.4980461597442627 + }, + { + "auxiliary_loss_clip": 0.06494178, + "auxiliary_loss_mlp": 0.01278535, + "balance_loss_clip": 0.06293284, + "balance_loss_mlp": 0.01261166, + "epoch": 0.3184127461295656, + "flos": 25126568323200.0, + "grad_norm": 2.03182891885516, + "language_loss": 0.70142519, + "learning_rate": 3.189046306936296e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17358398, + "step": 5296, + "time_per_iteration": 2.610671043395996 + }, + { + "auxiliary_loss_clip": 0.06483515, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 0.0628704, + "balance_loss_mlp": 0.01258371, + "epoch": 0.31847286938223357, + "flos": 25557377690880.0, + "grad_norm": 1.5251920176335134, + "language_loss": 0.77957898, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85716307, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 5297, + "time_per_iteration": 2.539649486541748 + }, + { + "auxiliary_loss_clip": 0.06479752, + "auxiliary_loss_mlp": 0.01272766, + "balance_loss_clip": 0.06283344, + "balance_loss_mlp": 0.01255516, + "epoch": 0.31853299263490154, + "flos": 27789926676480.0, + "grad_norm": 1.8177911904554251, + "language_loss": 0.80074358, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.87826872, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17248535, + "step": 5298, + "time_per_iteration": 2.6127634048461914 + }, + { + "auxiliary_loss_clip": 0.06487016, + "auxiliary_loss_mlp": 0.0127216, + "balance_loss_clip": 0.06284906, + "balance_loss_mlp": 0.01254815, + "epoch": 0.3185931158875695, + "flos": 22712653175040.0, + "grad_norm": 1.6158824069779534, + "language_loss": 0.74615932, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.82375109, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.17346191, + "step": 5299, + "time_per_iteration": 2.570178508758545 + }, + { + "auxiliary_loss_clip": 0.06491919, + "auxiliary_loss_mlp": 0.01275355, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.01258249, + "epoch": 0.31865323914023747, + "flos": 24578402912640.0, + "grad_norm": 1.9760141697724851, + "language_loss": 0.78568625, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86335897, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17102051, + "step": 5300, + "time_per_iteration": 2.7260777950286865 + }, + { + "auxiliary_loss_clip": 0.06483838, + "auxiliary_loss_mlp": 0.01272854, + "balance_loss_clip": 0.06287212, + "balance_loss_mlp": 0.01254495, + "epoch": 0.31871336239290543, + "flos": 18192391689600.0, + "grad_norm": 2.1538981188283195, + "language_loss": 0.84250915, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92007607, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.18347168, + "step": 5301, + "time_per_iteration": 2.485152244567871 + }, + { + "auxiliary_loss_clip": 0.06484723, + "auxiliary_loss_mlp": 0.01274861, + "balance_loss_clip": 0.06291914, + "balance_loss_mlp": 0.01256777, + "epoch": 0.31877348564557345, + "flos": 21831789928320.0, + "grad_norm": 2.0482094969798696, + "language_loss": 0.7812382, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85883403, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.18066406, + "step": 5302, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01275814, + "balance_loss_clip": 0.06290714, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188336088982414, + "flos": 22021331863680.0, + "grad_norm": 1.6144767194600491, + "language_loss": 0.79736584, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8749572, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17651367, + "step": 5303, + "time_per_iteration": 2.5235095024108887 + }, + { + "auxiliary_loss_clip": 0.06497993, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290174, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188937321509094, + "flos": 20054116431360.0, + "grad_norm": 1.7320090718032515, + "language_loss": 0.73529422, + "learning_rate": 3.186539603020047e-06, + "loss": 0.81304312, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18737793, + "step": 5304, + "time_per_iteration": 2.5141329765319824 + }, + { + "auxiliary_loss_clip": 0.06481734, + "auxiliary_loss_mlp": 0.01278154, + "balance_loss_clip": 0.06290816, + "balance_loss_mlp": 0.01260928, + "epoch": 0.31895385540357735, + "flos": 25855135574400.0, + "grad_norm": 1.8091269764667626, + "language_loss": 0.72548914, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80308801, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.17236328, + "step": 5305, + "time_per_iteration": 2.5648975372314453 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06292576, + "balance_loss_mlp": 0.01254603, + "epoch": 0.3190139786562453, + "flos": 23484545786880.0, + "grad_norm": 2.116447005947582, + "language_loss": 0.64815247, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.72573221, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.16113281, + "step": 5306, + "time_per_iteration": 2.5745668411254883 + }, + { + "auxiliary_loss_clip": 0.06483987, + "auxiliary_loss_mlp": 0.01282676, + "balance_loss_clip": 0.0628574, + "balance_loss_mlp": 0.01264413, + "epoch": 0.3190741019089133, + "flos": 29103150591360.0, + "grad_norm": 2.0084949709877726, + "language_loss": 0.79260421, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.87027091, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18273926, + "step": 5307, + "time_per_iteration": 2.557509183883667 + }, + { + "auxiliary_loss_clip": 0.06481419, + "auxiliary_loss_mlp": 0.01278653, + "balance_loss_clip": 0.06289747, + "balance_loss_mlp": 0.01260736, + "epoch": 0.31913422516158124, + "flos": 17135361233280.0, + "grad_norm": 3.9021838038471097, + "language_loss": 0.78660965, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86421037, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17907715, + "step": 5308, + "time_per_iteration": 3.906280994415283 + }, + { + "auxiliary_loss_clip": 0.06493698, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06287338, + "balance_loss_mlp": 0.01259408, + "epoch": 0.3191943484142492, + "flos": 16075228176000.0, + "grad_norm": 3.1945469837170215, + "language_loss": 0.74758154, + "learning_rate": 3.184971450390961e-06, + "loss": 0.82530349, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.19091797, + "step": 5309, + "time_per_iteration": 2.4796438217163086 + }, + { + "auxiliary_loss_clip": 0.06480245, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06283399, + "balance_loss_mlp": 0.01257954, + "epoch": 0.3192544716669172, + "flos": 22972787775360.0, + "grad_norm": 1.6995242114780418, + "language_loss": 0.83242565, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90997577, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.16809082, + "step": 5310, + "time_per_iteration": 5.470219373703003 + }, + { + "auxiliary_loss_clip": 0.06475915, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06281388, + "balance_loss_mlp": 0.01255868, + "epoch": 0.31931459491958514, + "flos": 26877645348480.0, + "grad_norm": 1.407923936832892, + "language_loss": 0.78906345, + "learning_rate": 3.184343874716412e-06, + "loss": 0.86654651, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.1652832, + "step": 5311, + "time_per_iteration": 2.546112298965454 + }, + { + "auxiliary_loss_clip": 0.06477334, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.06282097, + "balance_loss_mlp": 0.01255254, + "epoch": 0.3193747181722531, + "flos": 21843194083200.0, + "grad_norm": 1.8192899238067177, + "language_loss": 0.84889889, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92639416, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16943359, + "step": 5312, + "time_per_iteration": 2.5534987449645996 + }, + { + "auxiliary_loss_clip": 0.06489489, + "auxiliary_loss_mlp": 0.01274677, + "balance_loss_clip": 0.06284228, + "balance_loss_mlp": 0.012567, + "epoch": 0.31943484142492107, + "flos": 18329593950720.0, + "grad_norm": 3.1557419136729536, + "language_loss": 0.79280984, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.87045145, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17980957, + "step": 5313, + "time_per_iteration": 2.47098445892334 + }, + { + "auxiliary_loss_clip": 0.06477478, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 0.06281047, + "balance_loss_mlp": 0.01256618, + "epoch": 0.31949496467758903, + "flos": 21622150212480.0, + "grad_norm": 2.7721598847405584, + "language_loss": 0.86245549, + "learning_rate": 3.183402174406057e-06, + "loss": 0.93997484, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17834473, + "step": 5314, + "time_per_iteration": 2.531196117401123 + }, + { + "auxiliary_loss_clip": 0.0647811, + "auxiliary_loss_mlp": 0.0127239, + "balance_loss_clip": 0.06281686, + "balance_loss_mlp": 0.01255188, + "epoch": 0.31955508793025705, + "flos": 21766312362240.0, + "grad_norm": 1.712027342879292, + "language_loss": 0.80238831, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8798933, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17199707, + "step": 5315, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.06485026, + "auxiliary_loss_mlp": 0.01283831, + "balance_loss_clip": 0.06286455, + "balance_loss_mlp": 0.01265854, + "epoch": 0.319615211182925, + "flos": 17169881915520.0, + "grad_norm": 2.687676993792702, + "language_loss": 0.67569852, + "learning_rate": 3.18277414980567e-06, + "loss": 0.75338709, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17980957, + "step": 5316, + "time_per_iteration": 3.943110942840576 + }, + { + "auxiliary_loss_clip": 0.0648303, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01255566, + "epoch": 0.319675334435593, + "flos": 28120653941760.0, + "grad_norm": 1.5692381446514811, + "language_loss": 0.69637752, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.77392983, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16650391, + "step": 5317, + "time_per_iteration": 2.642251491546631 + }, + { + "auxiliary_loss_clip": 0.06377298, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06285109, + "balance_loss_mlp": 0.01288716, + "epoch": 0.31973545768826095, + "flos": 69524235072000.0, + "grad_norm": 0.7198160842036254, + "language_loss": 0.5281924, + "learning_rate": 3.182145945801628e-06, + "loss": 0.60489094, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.03839111, + "step": 5318, + "time_per_iteration": 3.2718679904937744 + }, + { + "auxiliary_loss_clip": 0.06479475, + "auxiliary_loss_mlp": 0.01271921, + "balance_loss_clip": 0.0628712, + "balance_loss_mlp": 0.01254969, + "epoch": 0.3197955809409289, + "flos": 13704344899200.0, + "grad_norm": 1.5995609143402318, + "language_loss": 0.84504628, + "learning_rate": 3.181831776553012e-06, + "loss": 0.92256021, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.16955566, + "step": 5319, + "time_per_iteration": 2.5372629165649414 + }, + { + "auxiliary_loss_clip": 0.06480815, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 0.06286162, + "balance_loss_mlp": 0.01261199, + "epoch": 0.3198557041935969, + "flos": 33226368704640.0, + "grad_norm": 1.6136244255626262, + "language_loss": 0.64208525, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71968812, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.18273926, + "step": 5320, + "time_per_iteration": 2.675477981567383 + }, + { + "auxiliary_loss_clip": 0.0648189, + "auxiliary_loss_mlp": 0.01271878, + "balance_loss_clip": 0.06280586, + "balance_loss_mlp": 0.01254402, + "epoch": 0.31991582744626484, + "flos": 23738726747520.0, + "grad_norm": 1.9696222638037655, + "language_loss": 0.71059012, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.78812778, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.17480469, + "step": 5321, + "time_per_iteration": 2.6383230686187744 + }, + { + "auxiliary_loss_clip": 0.06491005, + "auxiliary_loss_mlp": 0.01288903, + "balance_loss_clip": 0.06286187, + "balance_loss_mlp": 0.01270318, + "epoch": 0.3199759506989328, + "flos": 18556633388160.0, + "grad_norm": 2.30981924299517, + "language_loss": 0.86988461, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94768369, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.18591309, + "step": 5322, + "time_per_iteration": 2.4862442016601562 + }, + { + "auxiliary_loss_clip": 0.0648296, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06285054, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3200360739516008, + "flos": 22425418978560.0, + "grad_norm": 1.6041292280722281, + "language_loss": 0.83380175, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.91136217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16369629, + "step": 5323, + "time_per_iteration": 2.5262420177459717 + }, + { + "auxiliary_loss_clip": 0.06476378, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01258529, + "epoch": 0.32009619720426874, + "flos": 20601569082240.0, + "grad_norm": 1.775654796490425, + "language_loss": 0.78471839, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.86226195, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.19433594, + "step": 5324, + "time_per_iteration": 2.492380380630493 + }, + { + "auxiliary_loss_clip": 0.06478705, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06283212, + "balance_loss_mlp": 0.01256042, + "epoch": 0.3201563204569367, + "flos": 18153049397760.0, + "grad_norm": 1.7224742254360714, + "language_loss": 0.80742848, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.88495719, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.18139648, + "step": 5325, + "time_per_iteration": 2.4962642192840576 + }, + { + "auxiliary_loss_clip": 0.06478769, + "auxiliary_loss_mlp": 0.01277308, + "balance_loss_clip": 0.06280222, + "balance_loss_mlp": 0.01259701, + "epoch": 0.32021644370960467, + "flos": 31691975137920.0, + "grad_norm": 1.8321318923341703, + "language_loss": 0.75898254, + "learning_rate": 3.179631337655037e-06, + "loss": 0.83654332, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17602539, + "step": 5326, + "time_per_iteration": 2.5752692222595215 + }, + { + "auxiliary_loss_clip": 0.06472234, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.01260918, + "epoch": 0.32027656696227264, + "flos": 26872488322560.0, + "grad_norm": 1.458996564995821, + "language_loss": 0.81400204, + "learning_rate": 3.179316810218701e-06, + "loss": 0.89150548, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.171875, + "step": 5327, + "time_per_iteration": 2.5635383129119873 + }, + { + "auxiliary_loss_clip": 0.06486546, + "auxiliary_loss_mlp": 0.01273421, + "balance_loss_clip": 0.062847, + "balance_loss_mlp": 0.01256207, + "epoch": 0.32033669021494066, + "flos": 24176705639040.0, + "grad_norm": 1.3787000535244864, + "language_loss": 0.77910948, + "learning_rate": 3.179002238062554e-06, + "loss": 0.85670912, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17211914, + "step": 5328, + "time_per_iteration": 2.514646053314209 + }, + { + "auxiliary_loss_clip": 0.06484267, + "auxiliary_loss_mlp": 0.01278516, + "balance_loss_clip": 0.06287045, + "balance_loss_mlp": 0.0125992, + "epoch": 0.3203968134676086, + "flos": 24467419779840.0, + "grad_norm": 1.5501370939230803, + "language_loss": 0.74267161, + "learning_rate": 3.178687621198524e-06, + "loss": 0.82029939, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.18591309, + "step": 5329, + "time_per_iteration": 2.5436654090881348 + }, + { + "auxiliary_loss_clip": 0.06471072, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06282842, + "balance_loss_mlp": 0.01262434, + "epoch": 0.3204569367202766, + "flos": 18010606256640.0, + "grad_norm": 1.7046636031855489, + "language_loss": 0.71222955, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.78972626, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16162109, + "step": 5330, + "time_per_iteration": 2.479647397994995 + }, + { + "auxiliary_loss_clip": 0.06485157, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.0125791, + "epoch": 0.32051705997294455, + "flos": 30597237544320.0, + "grad_norm": 1.705143811074938, + "language_loss": 0.80496192, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.88258511, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.19250488, + "step": 5331, + "time_per_iteration": 2.5741958618164062 + }, + { + "auxiliary_loss_clip": 0.06384323, + "auxiliary_loss_mlp": 0.0125803, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.01253741, + "epoch": 0.3205771832256125, + "flos": 68436723657600.0, + "grad_norm": 0.7949538218297083, + "language_loss": 0.5776577, + "learning_rate": 3.177743502478447e-06, + "loss": 0.65408123, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04293823, + "step": 5332, + "time_per_iteration": 3.084747314453125 + }, + { + "auxiliary_loss_clip": 0.06488422, + "auxiliary_loss_mlp": 0.01272523, + "balance_loss_clip": 0.06286052, + "balance_loss_mlp": 0.01255154, + "epoch": 0.3206373064782805, + "flos": 30451524094080.0, + "grad_norm": 1.5377704746044631, + "language_loss": 0.73702615, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81463563, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17358398, + "step": 5333, + "time_per_iteration": 2.6130683422088623 + }, + { + "auxiliary_loss_clip": 0.06480561, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 0.06284031, + "balance_loss_mlp": 0.01256246, + "epoch": 0.32069742973094845, + "flos": 22061051498880.0, + "grad_norm": 1.6882238799892797, + "language_loss": 0.70957875, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.78712052, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17382812, + "step": 5334, + "time_per_iteration": 2.5501654148101807 + }, + { + "auxiliary_loss_clip": 0.06476508, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06281763, + "balance_loss_mlp": 0.01257947, + "epoch": 0.3207575529836164, + "flos": 22060464520320.0, + "grad_norm": 1.723674002448169, + "language_loss": 0.77349097, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.85101908, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.18347168, + "step": 5335, + "time_per_iteration": 2.5194711685180664 + }, + { + "auxiliary_loss_clip": 0.06479798, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06285612, + "balance_loss_mlp": 0.0125889, + "epoch": 0.3208176762362844, + "flos": 34065961015680.0, + "grad_norm": 1.52521333905674, + "language_loss": 0.68891776, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.76647282, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.16809082, + "step": 5336, + "time_per_iteration": 2.6550848484039307 + }, + { + "auxiliary_loss_clip": 0.06481949, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06286713, + "balance_loss_mlp": 0.01268343, + "epoch": 0.32087779948895234, + "flos": 21805151529600.0, + "grad_norm": 1.6666772631518172, + "language_loss": 0.79367507, + "learning_rate": 3.176169078234487e-06, + "loss": 0.87135273, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17480469, + "step": 5337, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.06473362, + "auxiliary_loss_mlp": 0.01277197, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01260865, + "epoch": 0.3209379227416203, + "flos": 21440532487680.0, + "grad_norm": 1.6244255970978692, + "language_loss": 0.75145769, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.82896328, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16320801, + "step": 5338, + "time_per_iteration": 2.526841402053833 + }, + { + "auxiliary_loss_clip": 0.06482957, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 0.06285477, + "balance_loss_mlp": 0.01260216, + "epoch": 0.3209980459942883, + "flos": 25856267604480.0, + "grad_norm": 1.7965894601451369, + "language_loss": 0.63241929, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.7100262, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17504883, + "step": 5339, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.06482022, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06283947, + "balance_loss_mlp": 0.01255151, + "epoch": 0.32105816924695624, + "flos": 19105218069120.0, + "grad_norm": 2.418138513897033, + "language_loss": 0.81912339, + "learning_rate": 3.175223888387192e-06, + "loss": 0.89666009, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16491699, + "step": 5340, + "time_per_iteration": 2.5764145851135254 + }, + { + "auxiliary_loss_clip": 0.06475554, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06281976, + "balance_loss_mlp": 0.01254774, + "epoch": 0.3211182924996242, + "flos": 16587531239040.0, + "grad_norm": 1.7719401771551753, + "language_loss": 0.76604897, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.84352368, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.17150879, + "step": 5341, + "time_per_iteration": 2.505668878555298 + }, + { + "auxiliary_loss_clip": 0.06474154, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.0628191, + "balance_loss_mlp": 0.01255969, + "epoch": 0.3211784157522922, + "flos": 22678425982080.0, + "grad_norm": 1.4764530250267398, + "language_loss": 0.79422891, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.87169659, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16662598, + "step": 5342, + "time_per_iteration": 2.5391595363616943 + }, + { + "auxiliary_loss_clip": 0.06483465, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01260141, + "epoch": 0.3212385390049602, + "flos": 20565119756160.0, + "grad_norm": 2.45787142613039, + "language_loss": 0.75074786, + "learning_rate": 3.174278297458438e-06, + "loss": 0.82835722, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17321777, + "step": 5343, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.06479985, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01255043, + "epoch": 0.32129866225762815, + "flos": 24798188972160.0, + "grad_norm": 1.5494427093400844, + "language_loss": 0.82596725, + "learning_rate": 3.173963011408748e-06, + "loss": 0.9034878, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5344, + "time_per_iteration": 2.5672519207000732 + }, + { + "auxiliary_loss_clip": 0.06478736, + "auxiliary_loss_mlp": 0.01273821, + "balance_loss_clip": 0.06282513, + "balance_loss_mlp": 0.0125731, + "epoch": 0.3213587855102961, + "flos": 18372374259840.0, + "grad_norm": 1.9111940233558649, + "language_loss": 0.80321491, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8807404, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.16516113, + "step": 5345, + "time_per_iteration": 2.479442834854126 + }, + { + "auxiliary_loss_clip": 0.06478975, + "auxiliary_loss_mlp": 0.01271046, + "balance_loss_clip": 0.06283471, + "balance_loss_mlp": 0.01254321, + "epoch": 0.3214189087629641, + "flos": 27023274944640.0, + "grad_norm": 1.7019036305222461, + "language_loss": 0.83604348, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.9135437, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.16723633, + "step": 5346, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.0648382, + "auxiliary_loss_mlp": 0.01272196, + "balance_loss_clip": 0.06285056, + "balance_loss_mlp": 0.0125528, + "epoch": 0.32147903201563205, + "flos": 23154866697600.0, + "grad_norm": 1.4545038816344273, + "language_loss": 0.81656283, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.89412296, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16918945, + "step": 5347, + "time_per_iteration": 2.5096054077148438 + }, + { + "auxiliary_loss_clip": 0.06480029, + "auxiliary_loss_mlp": 0.01274054, + "balance_loss_clip": 0.06286772, + "balance_loss_mlp": 0.01256673, + "epoch": 0.3215391552683, + "flos": 16586231500800.0, + "grad_norm": 2.536962878441814, + "language_loss": 0.80386555, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.88140643, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.1739502, + "step": 5348, + "time_per_iteration": 3.9639015197753906 + }, + { + "auxiliary_loss_clip": 0.06474565, + "auxiliary_loss_mlp": 0.01276371, + "balance_loss_clip": 0.06280862, + "balance_loss_mlp": 0.01259431, + "epoch": 0.321599278520968, + "flos": 17827604939520.0, + "grad_norm": 2.026618804026968, + "language_loss": 0.85758352, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93509287, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.16943359, + "step": 5349, + "time_per_iteration": 3.8848202228546143 + }, + { + "auxiliary_loss_clip": 0.06481349, + "auxiliary_loss_mlp": 0.01274724, + "balance_loss_clip": 0.06286412, + "balance_loss_mlp": 0.01257022, + "epoch": 0.32165940177363594, + "flos": 16257097463040.0, + "grad_norm": 1.7607877661370477, + "language_loss": 0.8123306, + "learning_rate": 3.172070360676475e-06, + "loss": 0.88989133, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17700195, + "step": 5350, + "time_per_iteration": 3.9589500427246094 + }, + { + "auxiliary_loss_clip": 0.06471309, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06282239, + "balance_loss_mlp": 0.01255055, + "epoch": 0.3217195250263039, + "flos": 27607302702720.0, + "grad_norm": 1.8529018663543275, + "language_loss": 0.80116528, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.87858802, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.562232732772827 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06284767, + "balance_loss_mlp": 0.01256668, + "epoch": 0.3217796482789719, + "flos": 21477023740800.0, + "grad_norm": 2.0321110975992562, + "language_loss": 0.7641573, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84167361, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.18249512, + "step": 5352, + "time_per_iteration": 2.5320773124694824 + }, + { + "auxiliary_loss_clip": 0.0648407, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06291708, + "balance_loss_mlp": 0.0126133, + "epoch": 0.32183977153163984, + "flos": 21222046166400.0, + "grad_norm": 1.9188598206640457, + "language_loss": 0.82159722, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.89922154, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.17028809, + "step": 5353, + "time_per_iteration": 2.5061802864074707 + }, + { + "auxiliary_loss_clip": 0.06480308, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125837, + "epoch": 0.3218998947843078, + "flos": 24615103800960.0, + "grad_norm": 1.8505936463490174, + "language_loss": 0.74125177, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.81881344, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.17480469, + "step": 5354, + "time_per_iteration": 2.5725185871124268 + }, + { + "auxiliary_loss_clip": 0.06479903, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259951, + "epoch": 0.3219600180369758, + "flos": 22276686781440.0, + "grad_norm": 2.612968571970558, + "language_loss": 0.83769405, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.91526389, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5355, + "time_per_iteration": 3.985846757888794 + }, + { + "auxiliary_loss_clip": 0.0647967, + "auxiliary_loss_mlp": 0.01272253, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01255528, + "epoch": 0.3220201412896438, + "flos": 14944376672640.0, + "grad_norm": 1.8959584470465125, + "language_loss": 0.71344721, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.79096651, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.1673584, + "step": 5356, + "time_per_iteration": 2.5644400119781494 + }, + { + "auxiliary_loss_clip": 0.06494904, + "auxiliary_loss_mlp": 0.01280986, + "balance_loss_clip": 0.0629259, + "balance_loss_mlp": 0.01263367, + "epoch": 0.32208026454231176, + "flos": 22672807758720.0, + "grad_norm": 2.5335154176231525, + "language_loss": 0.67879629, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.7565552, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.546654224395752 + }, + { + "auxiliary_loss_clip": 0.06384487, + "auxiliary_loss_mlp": 0.01261366, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257649, + "epoch": 0.3221403877949797, + "flos": 64626273308160.0, + "grad_norm": 0.6824166316331671, + "language_loss": 0.58314437, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.65960288, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.03707886, + "step": 5358, + "time_per_iteration": 3.2290756702423096 + }, + { + "auxiliary_loss_clip": 0.06481851, + "auxiliary_loss_mlp": 0.01282518, + "balance_loss_clip": 0.06287378, + "balance_loss_mlp": 0.01264839, + "epoch": 0.3222005110476477, + "flos": 20163212847360.0, + "grad_norm": 1.9186908993809755, + "language_loss": 0.84190667, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.91955042, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.17675781, + "step": 5359, + "time_per_iteration": 2.531033754348755 + }, + { + "auxiliary_loss_clip": 0.06480163, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 0.06287846, + "balance_loss_mlp": 0.01260051, + "epoch": 0.32226063430031565, + "flos": 22680731969280.0, + "grad_norm": 1.6695480137557102, + "language_loss": 0.79997146, + "learning_rate": 3.168912388464595e-06, + "loss": 0.87754452, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17077637, + "step": 5360, + "time_per_iteration": 2.544461727142334 + }, + { + "auxiliary_loss_clip": 0.06382456, + "auxiliary_loss_mlp": 0.01256795, + "balance_loss_clip": 0.06292457, + "balance_loss_mlp": 0.01253353, + "epoch": 0.3223207575529836, + "flos": 63847798151040.0, + "grad_norm": 0.6356253914940931, + "language_loss": 0.56731617, + "learning_rate": 3.168596347256737e-06, + "loss": 0.64370871, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03451538, + "step": 5361, + "time_per_iteration": 3.0336568355560303 + }, + { + "auxiliary_loss_clip": 0.06478466, + "auxiliary_loss_mlp": 0.01277797, + "balance_loss_clip": 0.06288562, + "balance_loss_mlp": 0.01261346, + "epoch": 0.3223808808056516, + "flos": 26877393786240.0, + "grad_norm": 2.167930910708006, + "language_loss": 0.71792114, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79548371, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.16442871, + "step": 5362, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.06483887, + "auxiliary_loss_mlp": 0.01279203, + "balance_loss_clip": 0.06293412, + "balance_loss_mlp": 0.01262692, + "epoch": 0.32244100405831955, + "flos": 26768716640640.0, + "grad_norm": 1.5327886568658977, + "language_loss": 0.73854291, + "learning_rate": 3.167964131913135e-06, + "loss": 0.81617379, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.16503906, + "step": 5363, + "time_per_iteration": 2.583064556121826 + }, + { + "auxiliary_loss_clip": 0.06489229, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06291971, + "balance_loss_mlp": 0.01258717, + "epoch": 0.3225011273109875, + "flos": 23809403266560.0, + "grad_norm": 2.354374584633167, + "language_loss": 0.76664144, + "learning_rate": 3.167647957801365e-06, + "loss": 0.84428835, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16748047, + "step": 5364, + "time_per_iteration": 2.5177268981933594 + }, + { + "auxiliary_loss_clip": 0.06479897, + "auxiliary_loss_mlp": 0.01275674, + "balance_loss_clip": 0.06290577, + "balance_loss_mlp": 0.01259473, + "epoch": 0.3225612505636555, + "flos": 17280194215680.0, + "grad_norm": 2.1891061142162327, + "language_loss": 0.7715044, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.84906018, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.1619873, + "step": 5365, + "time_per_iteration": 2.5122928619384766 + }, + { + "auxiliary_loss_clip": 0.06484331, + "auxiliary_loss_mlp": 0.01277663, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.01260711, + "epoch": 0.32262137381632344, + "flos": 23372724113280.0, + "grad_norm": 2.314444268247813, + "language_loss": 0.77153468, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.84915465, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.16955566, + "step": 5366, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.06481092, + "auxiliary_loss_mlp": 0.01280366, + "balance_loss_clip": 0.0629226, + "balance_loss_mlp": 0.0126388, + "epoch": 0.3226814970689914, + "flos": 23265598268160.0, + "grad_norm": 1.8642315088319754, + "language_loss": 0.72423649, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80185115, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.16491699, + "step": 5367, + "time_per_iteration": 2.544145345687866 + }, + { + "auxiliary_loss_clip": 0.06480073, + "auxiliary_loss_mlp": 0.01278287, + "balance_loss_clip": 0.06290721, + "balance_loss_mlp": 0.01262248, + "epoch": 0.32274162032165943, + "flos": 16400127582720.0, + "grad_norm": 1.9542840286813894, + "language_loss": 0.74559301, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82317662, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16033936, + "step": 5368, + "time_per_iteration": 2.4653942584991455 + }, + { + "auxiliary_loss_clip": 0.06481207, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01255874, + "epoch": 0.3228017435743274, + "flos": 27862489912320.0, + "grad_norm": 2.016369988637382, + "language_loss": 0.79033995, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86786628, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.15533447, + "step": 5369, + "time_per_iteration": 2.6923141479492188 + }, + { + "auxiliary_loss_clip": 0.06471382, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.0628759, + "balance_loss_mlp": 0.01264567, + "epoch": 0.32286186682699536, + "flos": 19614712020480.0, + "grad_norm": 1.8619928029866217, + "language_loss": 0.83607441, + "learning_rate": 3.16574998372661e-06, + "loss": 0.91358972, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15576172, + "step": 5370, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.06481104, + "auxiliary_loss_mlp": 0.01278081, + "balance_loss_clip": 0.062904, + "balance_loss_mlp": 0.01262703, + "epoch": 0.3229219900796633, + "flos": 24140885218560.0, + "grad_norm": 2.7780356443351146, + "language_loss": 0.83346975, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.91106164, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15368652, + "step": 5371, + "time_per_iteration": 2.554034948348999 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 0.0629211, + "balance_loss_mlp": 0.01260434, + "epoch": 0.3229821133323313, + "flos": 17754454725120.0, + "grad_norm": 2.279534384310274, + "language_loss": 0.89153087, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96917808, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17663574, + "step": 5372, + "time_per_iteration": 2.468693971633911 + }, + { + "auxiliary_loss_clip": 0.06478924, + "auxiliary_loss_mlp": 0.01278448, + "balance_loss_clip": 0.06288313, + "balance_loss_mlp": 0.01261843, + "epoch": 0.32304223658499925, + "flos": 22352562253440.0, + "grad_norm": 1.986067660558338, + "language_loss": 0.730793, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.80836678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16601562, + "step": 5373, + "time_per_iteration": 2.5757906436920166 + }, + { + "auxiliary_loss_clip": 0.06476311, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06293686, + "balance_loss_mlp": 0.01258227, + "epoch": 0.3231023598376672, + "flos": 18484154006400.0, + "grad_norm": 2.1970042176000963, + "language_loss": 0.82592154, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.90342778, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.1607666, + "step": 5374, + "time_per_iteration": 2.4853713512420654 + }, + { + "auxiliary_loss_clip": 0.06474404, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06291121, + "balance_loss_mlp": 0.0125544, + "epoch": 0.3231624830903352, + "flos": 27643710101760.0, + "grad_norm": 1.9120740622639463, + "language_loss": 0.88405079, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.96150708, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15783691, + "step": 5375, + "time_per_iteration": 2.58644700050354 + }, + { + "auxiliary_loss_clip": 0.06483716, + "auxiliary_loss_mlp": 0.01275166, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01258799, + "epoch": 0.32322260634300315, + "flos": 21732965637120.0, + "grad_norm": 2.2884949024183983, + "language_loss": 0.76224899, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.83983773, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.16345215, + "step": 5376, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.0647772, + "auxiliary_loss_mlp": 0.01272254, + "balance_loss_clip": 0.06289793, + "balance_loss_mlp": 0.01256649, + "epoch": 0.3232827295956711, + "flos": 22644198789120.0, + "grad_norm": 1.5259481118475857, + "language_loss": 0.67275858, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.75025833, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.15594482, + "step": 5377, + "time_per_iteration": 2.592737913131714 + }, + { + "auxiliary_loss_clip": 0.06482306, + "auxiliary_loss_mlp": 0.01279693, + "balance_loss_clip": 0.06294581, + "balance_loss_mlp": 0.01262158, + "epoch": 0.3233428528483391, + "flos": 26329731500160.0, + "grad_norm": 1.747214931760967, + "language_loss": 0.73022175, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.80784178, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17541504, + "step": 5378, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.06476232, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 0.06289409, + "balance_loss_mlp": 0.01256598, + "epoch": 0.32340297610100704, + "flos": 28592818099200.0, + "grad_norm": 2.0362074337070832, + "language_loss": 0.82332939, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90081334, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5379, + "time_per_iteration": 2.661787986755371 + }, + { + "auxiliary_loss_clip": 0.06481552, + "auxiliary_loss_mlp": 0.01276474, + "balance_loss_clip": 0.06288823, + "balance_loss_mlp": 0.01260548, + "epoch": 0.323463099353675, + "flos": 30781664380800.0, + "grad_norm": 1.6212615798097256, + "language_loss": 0.78942055, + "learning_rate": 3.162583158454388e-06, + "loss": 0.86700082, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15942383, + "step": 5380, + "time_per_iteration": 2.593618631362915 + }, + { + "auxiliary_loss_clip": 0.06489569, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 0.06298643, + "balance_loss_mlp": 0.01255368, + "epoch": 0.32352322260634303, + "flos": 25235664739200.0, + "grad_norm": 1.685322069138263, + "language_loss": 0.77853882, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.85615522, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16699219, + "step": 5381, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01269308, + "balance_loss_clip": 0.06292967, + "balance_loss_mlp": 0.01255438, + "epoch": 0.323583345859011, + "flos": 23337071400960.0, + "grad_norm": 1.9004028984655497, + "language_loss": 0.72391021, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80136859, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.13848877, + "step": 5382, + "time_per_iteration": 2.5095293521881104 + }, + { + "auxiliary_loss_clip": 0.06488711, + "auxiliary_loss_mlp": 0.01277606, + "balance_loss_clip": 0.06295708, + "balance_loss_mlp": 0.01262157, + "epoch": 0.32364346911167896, + "flos": 26213675195520.0, + "grad_norm": 2.3447859303702883, + "language_loss": 0.71528596, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.15441895, + "step": 5383, + "time_per_iteration": 2.5806562900543213 + }, + { + "auxiliary_loss_clip": 0.06476977, + "auxiliary_loss_mlp": 0.01276799, + "balance_loss_clip": 0.06292375, + "balance_loss_mlp": 0.01261564, + "epoch": 0.3237035923643469, + "flos": 23702487056640.0, + "grad_norm": 1.948915226701978, + "language_loss": 0.78857487, + "learning_rate": 3.161315193285283e-06, + "loss": 0.86611259, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.15234375, + "step": 5384, + "time_per_iteration": 2.548797369003296 + }, + { + "auxiliary_loss_clip": 0.06481218, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06288576, + "balance_loss_mlp": 0.0125793, + "epoch": 0.3237637156170149, + "flos": 14433960326400.0, + "grad_norm": 1.885180362402172, + "language_loss": 0.75034815, + "learning_rate": 3.16099809186998e-06, + "loss": 0.82790792, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16821289, + "step": 5385, + "time_per_iteration": 2.577547073364258 + }, + { + "auxiliary_loss_clip": 0.06486371, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01255032, + "epoch": 0.32382383886968286, + "flos": 31070449877760.0, + "grad_norm": 1.8174179211363362, + "language_loss": 0.72224641, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.79981083, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.15032959, + "step": 5386, + "time_per_iteration": 2.585822820663452 + }, + { + "auxiliary_loss_clip": 0.06485418, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.06292341, + "balance_loss_mlp": 0.01256803, + "epoch": 0.3238839621223508, + "flos": 23263418062080.0, + "grad_norm": 3.182973165751226, + "language_loss": 0.95573068, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.03331804, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16503906, + "step": 5387, + "time_per_iteration": 4.075104236602783 + }, + { + "auxiliary_loss_clip": 0.06490889, + "auxiliary_loss_mlp": 0.01270509, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.0125376, + "epoch": 0.3239440853750188, + "flos": 22971026839680.0, + "grad_norm": 2.142304582151843, + "language_loss": 0.78141761, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.85903162, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5388, + "time_per_iteration": 2.623976707458496 + }, + { + "auxiliary_loss_clip": 0.06478786, + "auxiliary_loss_mlp": 0.01276501, + "balance_loss_clip": 0.06289905, + "balance_loss_mlp": 0.01259704, + "epoch": 0.32400420862768675, + "flos": 36255394275840.0, + "grad_norm": 1.9954909505528162, + "language_loss": 0.71735168, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79490453, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16796875, + "step": 5389, + "time_per_iteration": 4.133269309997559 + }, + { + "auxiliary_loss_clip": 0.06479806, + "auxiliary_loss_mlp": 0.01273464, + "balance_loss_clip": 0.06294239, + "balance_loss_mlp": 0.01257872, + "epoch": 0.3240643318803547, + "flos": 21622946826240.0, + "grad_norm": 1.7464997421167434, + "language_loss": 0.81443554, + "learning_rate": 3.159411924656557e-06, + "loss": 0.89196825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15588379, + "step": 5390, + "time_per_iteration": 3.9378364086151123 + }, + { + "auxiliary_loss_clip": 0.06491944, + "auxiliary_loss_mlp": 0.01278594, + "balance_loss_clip": 0.06301276, + "balance_loss_mlp": 0.01261296, + "epoch": 0.3241244551330227, + "flos": 23302466864640.0, + "grad_norm": 1.9807661160762629, + "language_loss": 0.73182476, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80953014, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.1730957, + "step": 5391, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.06482222, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.06294864, + "balance_loss_mlp": 0.0126241, + "epoch": 0.32418457838569065, + "flos": 14101891395840.0, + "grad_norm": 1.5457442510257688, + "language_loss": 0.77541089, + "learning_rate": 3.158777149931855e-06, + "loss": 0.85302216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16491699, + "step": 5392, + "time_per_iteration": 2.486161470413208 + }, + { + "auxiliary_loss_clip": 0.06490408, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.0629712, + "balance_loss_mlp": 0.01261411, + "epoch": 0.3242447016383586, + "flos": 29760454344960.0, + "grad_norm": 1.849936210081937, + "language_loss": 0.63213563, + "learning_rate": 3.158459696652067e-06, + "loss": 0.70982158, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.16760254, + "step": 5393, + "time_per_iteration": 2.5853707790374756 + }, + { + "auxiliary_loss_clip": 0.06489256, + "auxiliary_loss_mlp": 0.01282677, + "balance_loss_clip": 0.06301466, + "balance_loss_mlp": 0.01266011, + "epoch": 0.3243048248910266, + "flos": 24357820239360.0, + "grad_norm": 1.7023503315224988, + "language_loss": 0.82889545, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90661478, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16674805, + "step": 5394, + "time_per_iteration": 3.946955680847168 + }, + { + "auxiliary_loss_clip": 0.06480435, + "auxiliary_loss_mlp": 0.01285084, + "balance_loss_clip": 0.06298714, + "balance_loss_mlp": 0.01269825, + "epoch": 0.3243649481436946, + "flos": 24359958518400.0, + "grad_norm": 2.1573093021253333, + "language_loss": 0.82280314, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90045834, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15270996, + "step": 5395, + "time_per_iteration": 2.537313222885132 + }, + { + "auxiliary_loss_clip": 0.06480338, + "auxiliary_loss_mlp": 0.01292267, + "balance_loss_clip": 0.06300412, + "balance_loss_mlp": 0.01276424, + "epoch": 0.32442507139636256, + "flos": 22931097569280.0, + "grad_norm": 1.7302006802896392, + "language_loss": 0.839818, + "learning_rate": 3.157507073287417e-06, + "loss": 0.91754401, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15844727, + "step": 5396, + "time_per_iteration": 2.6440067291259766 + }, + { + "auxiliary_loss_clip": 0.06491997, + "auxiliary_loss_mlp": 0.01291538, + "balance_loss_clip": 0.06299315, + "balance_loss_mlp": 0.01274121, + "epoch": 0.32448519464903053, + "flos": 22206723022080.0, + "grad_norm": 1.8684779143202024, + "language_loss": 0.76113403, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.83896935, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.17419434, + "step": 5397, + "time_per_iteration": 2.506601095199585 + }, + { + "auxiliary_loss_clip": 0.06473789, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06290997, + "balance_loss_mlp": 0.0127387, + "epoch": 0.3245453179016985, + "flos": 18843574095360.0, + "grad_norm": 2.304762567896747, + "language_loss": 0.67975587, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.75739866, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1661377, + "step": 5398, + "time_per_iteration": 2.50168514251709 + }, + { + "auxiliary_loss_clip": 0.06478744, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06293125, + "balance_loss_mlp": 0.01272189, + "epoch": 0.32460544115436646, + "flos": 21184716372480.0, + "grad_norm": 1.3685049489713428, + "language_loss": 0.73232323, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16748047, + "step": 5399, + "time_per_iteration": 2.5114216804504395 + }, + { + "auxiliary_loss_clip": 0.0648094, + "auxiliary_loss_mlp": 0.01289931, + "balance_loss_clip": 0.06293677, + "balance_loss_mlp": 0.01273241, + "epoch": 0.3246655644070344, + "flos": 21987607795200.0, + "grad_norm": 2.072173153822147, + "language_loss": 0.71044981, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.78815848, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16687012, + "step": 5400, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.06480449, + "auxiliary_loss_mlp": 0.01279651, + "balance_loss_clip": 0.06289301, + "balance_loss_mlp": 0.01263355, + "epoch": 0.3247256876597024, + "flos": 32167745020800.0, + "grad_norm": 2.104371315429844, + "language_loss": 0.80626661, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88386756, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16296387, + "step": 5401, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.06483636, + "auxiliary_loss_mlp": 0.01281263, + "balance_loss_clip": 0.06294005, + "balance_loss_mlp": 0.01264073, + "epoch": 0.32478581091237035, + "flos": 21004104896640.0, + "grad_norm": 1.4796090680940444, + "language_loss": 0.87935805, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.95700705, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.17175293, + "step": 5402, + "time_per_iteration": 2.5548956394195557 + }, + { + "auxiliary_loss_clip": 0.06474966, + "auxiliary_loss_mlp": 0.0127368, + "balance_loss_clip": 0.06291528, + "balance_loss_mlp": 0.01258767, + "epoch": 0.3248459341650383, + "flos": 17929741466880.0, + "grad_norm": 2.584856005153906, + "language_loss": 0.85243386, + "learning_rate": 3.155282749751332e-06, + "loss": 0.92992032, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14904785, + "step": 5403, + "time_per_iteration": 2.479205369949341 + }, + { + "auxiliary_loss_clip": 0.06468324, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01262667, + "epoch": 0.3249060574177063, + "flos": 24542582492160.0, + "grad_norm": 2.1052258035485214, + "language_loss": 0.8828373, + "learning_rate": 3.154964813916007e-06, + "loss": 0.96029389, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14672852, + "step": 5404, + "time_per_iteration": 2.5845093727111816 + }, + { + "auxiliary_loss_clip": 0.06473936, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01259368, + "epoch": 0.32496618067037425, + "flos": 26001939127680.0, + "grad_norm": 1.6833557203411496, + "language_loss": 0.72900558, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80650264, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1640625, + "step": 5405, + "time_per_iteration": 2.542433500289917 + }, + { + "auxiliary_loss_clip": 0.06474283, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06290333, + "balance_loss_mlp": 0.01258264, + "epoch": 0.3250263039230422, + "flos": 19579939776000.0, + "grad_norm": 1.7320098663924197, + "language_loss": 0.83355331, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.91103297, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15435791, + "step": 5406, + "time_per_iteration": 2.591207265853882 + }, + { + "auxiliary_loss_clip": 0.06474167, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06290454, + "balance_loss_mlp": 0.01254949, + "epoch": 0.3250864271757102, + "flos": 16769232817920.0, + "grad_norm": 2.13827452533593, + "language_loss": 0.87879711, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.95623994, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15161133, + "step": 5407, + "time_per_iteration": 2.4856173992156982 + }, + { + "auxiliary_loss_clip": 0.06469748, + "auxiliary_loss_mlp": 0.01276836, + "balance_loss_clip": 0.06284758, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3251465504283782, + "flos": 27827004908160.0, + "grad_norm": 2.430972813034592, + "language_loss": 0.69975567, + "learning_rate": 3.153692632731479e-06, + "loss": 0.77722144, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15649414, + "step": 5408, + "time_per_iteration": 2.5838799476623535 + }, + { + "auxiliary_loss_clip": 0.06481153, + "auxiliary_loss_mlp": 0.01282988, + "balance_loss_clip": 0.06286341, + "balance_loss_mlp": 0.01267396, + "epoch": 0.32520667368104617, + "flos": 19069271867520.0, + "grad_norm": 3.909403651515765, + "language_loss": 0.78053123, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85817266, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.15588379, + "step": 5409, + "time_per_iteration": 2.5178377628326416 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01272582, + "balance_loss_clip": 0.06286227, + "balance_loss_mlp": 0.01256202, + "epoch": 0.32526679693371413, + "flos": 29388917341440.0, + "grad_norm": 1.8050072916987376, + "language_loss": 0.83473468, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.91219985, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16381836, + "step": 5410, + "time_per_iteration": 2.5948092937469482 + }, + { + "auxiliary_loss_clip": 0.06466505, + "auxiliary_loss_mlp": 0.01275621, + "balance_loss_clip": 0.06286819, + "balance_loss_mlp": 0.01261274, + "epoch": 0.3253269201863821, + "flos": 20710833206400.0, + "grad_norm": 1.580323990141508, + "language_loss": 0.72005814, + "learning_rate": 3.152738037445405e-06, + "loss": 0.79747939, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14349365, + "step": 5411, + "time_per_iteration": 2.515542507171631 + }, + { + "auxiliary_loss_clip": 0.06472497, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06287136, + "balance_loss_mlp": 0.01261632, + "epoch": 0.32538704343905006, + "flos": 29101515436800.0, + "grad_norm": 1.470162471805647, + "language_loss": 0.83496881, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.91246504, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15490723, + "step": 5412, + "time_per_iteration": 2.55008602142334 + }, + { + "auxiliary_loss_clip": 0.06476887, + "auxiliary_loss_mlp": 0.01277617, + "balance_loss_clip": 0.06287435, + "balance_loss_mlp": 0.01260904, + "epoch": 0.325447166691718, + "flos": 24682216521600.0, + "grad_norm": 1.5504273053971407, + "language_loss": 0.8129071, + "learning_rate": 3.152101422008203e-06, + "loss": 0.89045215, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16711426, + "step": 5413, + "time_per_iteration": 2.54195499420166 + }, + { + "auxiliary_loss_clip": 0.06477104, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.0628976, + "balance_loss_mlp": 0.01261643, + "epoch": 0.325507289944386, + "flos": 21549503122560.0, + "grad_norm": 1.5527044192655586, + "language_loss": 0.76985061, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84740174, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16363525, + "step": 5414, + "time_per_iteration": 2.5435919761657715 + }, + { + "auxiliary_loss_clip": 0.063807, + "auxiliary_loss_mlp": 0.01284661, + "balance_loss_clip": 0.06291388, + "balance_loss_mlp": 0.01280793, + "epoch": 0.32556741319705396, + "flos": 71537893194240.0, + "grad_norm": 0.9015335749308697, + "language_loss": 0.64095414, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.71760774, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.03863525, + "step": 5415, + "time_per_iteration": 3.0875957012176514 + }, + { + "auxiliary_loss_clip": 0.0647157, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06285933, + "balance_loss_mlp": 0.01258845, + "epoch": 0.3256275364497219, + "flos": 23739187944960.0, + "grad_norm": 1.4815485577141352, + "language_loss": 0.74123245, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16235352, + "step": 5416, + "time_per_iteration": 2.5792665481567383 + }, + { + "auxiliary_loss_clip": 0.06381539, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01262769, + "epoch": 0.3256876597023899, + "flos": 67308136214400.0, + "grad_norm": 0.7704887993649999, + "language_loss": 0.57850802, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.65498912, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.03793335, + "step": 5417, + "time_per_iteration": 3.2770884037017822 + }, + { + "auxiliary_loss_clip": 0.06373264, + "auxiliary_loss_mlp": 0.01258837, + "balance_loss_clip": 0.06284805, + "balance_loss_mlp": 0.01254933, + "epoch": 0.32574778295505785, + "flos": 71304633826560.0, + "grad_norm": 0.8775074523137479, + "language_loss": 0.63674986, + "learning_rate": 3.150509119089975e-06, + "loss": 0.71307087, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03900146, + "step": 5418, + "time_per_iteration": 3.315948724746704 + }, + { + "auxiliary_loss_clip": 0.06476019, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 0.06290952, + "balance_loss_mlp": 0.01258111, + "epoch": 0.3258079062077258, + "flos": 20782515974400.0, + "grad_norm": 1.8847025208507953, + "language_loss": 0.6957128, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.77320766, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.15344238, + "step": 5419, + "time_per_iteration": 2.5722780227661133 + }, + { + "auxiliary_loss_clip": 0.06480842, + "auxiliary_loss_mlp": 0.01275789, + "balance_loss_clip": 0.06291591, + "balance_loss_mlp": 0.01260006, + "epoch": 0.3258680294603938, + "flos": 22241788755840.0, + "grad_norm": 2.023173952709465, + "language_loss": 0.77398664, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.85155296, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.15783691, + "step": 5420, + "time_per_iteration": 2.5199873447418213 + }, + { + "auxiliary_loss_clip": 0.06478356, + "auxiliary_loss_mlp": 0.0127343, + "balance_loss_clip": 0.06290038, + "balance_loss_mlp": 0.0125798, + "epoch": 0.3259281527130618, + "flos": 26987328743040.0, + "grad_norm": 1.5124533627457746, + "language_loss": 0.80826706, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.88578492, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15441895, + "step": 5421, + "time_per_iteration": 2.6014363765716553 + }, + { + "auxiliary_loss_clip": 0.06476312, + "auxiliary_loss_mlp": 0.0127337, + "balance_loss_clip": 0.06293876, + "balance_loss_mlp": 0.01258982, + "epoch": 0.32598827596572977, + "flos": 26221557479040.0, + "grad_norm": 1.4846059645471, + "language_loss": 0.76098251, + "learning_rate": 3.149234491389381e-06, + "loss": 0.8384794, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1439209, + "step": 5422, + "time_per_iteration": 2.5738978385925293 + }, + { + "auxiliary_loss_clip": 0.06480287, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06288645, + "balance_loss_mlp": 0.01255095, + "epoch": 0.32604839921839773, + "flos": 17645567944320.0, + "grad_norm": 2.282982793788361, + "language_loss": 0.63826233, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.71577179, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 5423, + "time_per_iteration": 2.5513644218444824 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01273816, + "balance_loss_clip": 0.06290927, + "balance_loss_mlp": 0.01258748, + "epoch": 0.3261085224710657, + "flos": 23629420696320.0, + "grad_norm": 1.6690467832946037, + "language_loss": 0.75170749, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82914186, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1505127, + "step": 5424, + "time_per_iteration": 2.546074151992798 + }, + { + "auxiliary_loss_clip": 0.06470636, + "auxiliary_loss_mlp": 0.01274311, + "balance_loss_clip": 0.06288706, + "balance_loss_mlp": 0.01258945, + "epoch": 0.32616864572373366, + "flos": 23267526912000.0, + "grad_norm": 1.6415169459291201, + "language_loss": 0.7718606, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.84931004, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15368652, + "step": 5425, + "time_per_iteration": 2.5883710384368896 + }, + { + "auxiliary_loss_clip": 0.06476015, + "auxiliary_loss_mlp": 0.01273254, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32622876897640163, + "flos": 25600535343360.0, + "grad_norm": 2.4681515054731924, + "language_loss": 0.78599709, + "learning_rate": 3.147959166423428e-06, + "loss": 0.86348987, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.16650391, + "step": 5426, + "time_per_iteration": 2.569566488265991 + }, + { + "auxiliary_loss_clip": 0.06473041, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06286261, + "balance_loss_mlp": 0.0125749, + "epoch": 0.3262888922290696, + "flos": 22425544759680.0, + "grad_norm": 1.6671872965592953, + "language_loss": 0.74719262, + "learning_rate": 3.147640226324893e-06, + "loss": 0.82465363, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5427, + "time_per_iteration": 3.941770315170288 + }, + { + "auxiliary_loss_clip": 0.06474692, + "auxiliary_loss_mlp": 0.0127251, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256154, + "epoch": 0.32634901548173756, + "flos": 19724982393600.0, + "grad_norm": 2.0508761677602965, + "language_loss": 0.79472262, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.87219465, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16357422, + "step": 5428, + "time_per_iteration": 3.9950850009918213 + }, + { + "auxiliary_loss_clip": 0.06475013, + "auxiliary_loss_mlp": 0.01275116, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01259309, + "epoch": 0.3264091387344055, + "flos": 16148336463360.0, + "grad_norm": 1.5445825374219135, + "language_loss": 0.71770716, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79520845, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15808105, + "step": 5429, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06468233, + "auxiliary_loss_mlp": 0.01269844, + "balance_loss_clip": 0.06283497, + "balance_loss_mlp": 0.01254466, + "epoch": 0.3264692619870735, + "flos": 16404655703040.0, + "grad_norm": 1.5791835311639297, + "language_loss": 0.78689212, + "learning_rate": 3.146683144965881e-06, + "loss": 0.86427283, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5430, + "time_per_iteration": 2.4873790740966797 + }, + { + "auxiliary_loss_clip": 0.06468185, + "auxiliary_loss_mlp": 0.0127668, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01259561, + "epoch": 0.32652938523974145, + "flos": 22388843871360.0, + "grad_norm": 1.9481749952405665, + "language_loss": 0.84556186, + "learning_rate": 3.146364030865399e-06, + "loss": 0.92301053, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17126465, + "step": 5431, + "time_per_iteration": 2.522075653076172 + }, + { + "auxiliary_loss_clip": 0.06468672, + "auxiliary_loss_mlp": 0.01274085, + "balance_loss_clip": 0.06286903, + "balance_loss_mlp": 0.01259327, + "epoch": 0.3265895084924094, + "flos": 21914499507840.0, + "grad_norm": 1.6266920997971765, + "language_loss": 0.71123517, + "learning_rate": 3.146044873294678e-06, + "loss": 0.78866279, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14758301, + "step": 5432, + "time_per_iteration": 2.513209104537964 + }, + { + "auxiliary_loss_clip": 0.06469099, + "auxiliary_loss_mlp": 0.01272277, + "balance_loss_clip": 0.06282821, + "balance_loss_mlp": 0.01257424, + "epoch": 0.3266496317450774, + "flos": 16072083648000.0, + "grad_norm": 1.3982751613904698, + "language_loss": 0.84207368, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91948748, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.14855957, + "step": 5433, + "time_per_iteration": 2.5324172973632812 + }, + { + "auxiliary_loss_clip": 0.06463822, + "auxiliary_loss_mlp": 0.01279207, + "balance_loss_clip": 0.06283711, + "balance_loss_mlp": 0.01264049, + "epoch": 0.3267097549977454, + "flos": 22534766956800.0, + "grad_norm": 1.4562075652627795, + "language_loss": 0.85916972, + "learning_rate": 3.145406427790931e-06, + "loss": 0.93660003, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15155029, + "step": 5434, + "time_per_iteration": 3.9434614181518555 + }, + { + "auxiliary_loss_clip": 0.06468898, + "auxiliary_loss_mlp": 0.01277076, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.0126134, + "epoch": 0.32676987825041337, + "flos": 27277581686400.0, + "grad_norm": 1.6909362765146225, + "language_loss": 0.88470823, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.96216792, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1574707, + "step": 5435, + "time_per_iteration": 2.5430006980895996 + }, + { + "auxiliary_loss_clip": 0.06469613, + "auxiliary_loss_mlp": 0.01271625, + "balance_loss_clip": 0.06283396, + "balance_loss_mlp": 0.01256306, + "epoch": 0.32683000150308134, + "flos": 11512731432960.0, + "grad_norm": 2.3091497119382733, + "language_loss": 0.77129918, + "learning_rate": 3.144767808551479e-06, + "loss": 0.84871155, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 5436, + "time_per_iteration": 2.486003875732422 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01277236, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01261977, + "epoch": 0.3268901247557493, + "flos": 25637362012800.0, + "grad_norm": 1.5303988762112921, + "language_loss": 0.72448635, + "learning_rate": 3.144448433811134e-06, + "loss": 0.80190074, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15270996, + "step": 5437, + "time_per_iteration": 2.545548915863037 + }, + { + "auxiliary_loss_clip": 0.06472606, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06282267, + "balance_loss_mlp": 0.01258253, + "epoch": 0.32695024800841727, + "flos": 24867356117760.0, + "grad_norm": 1.604360978002023, + "language_loss": 0.64194709, + "learning_rate": 3.144129015673189e-06, + "loss": 0.71942323, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16760254, + "step": 5438, + "time_per_iteration": 2.5657694339752197 + }, + { + "auxiliary_loss_clip": 0.06462848, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01257246, + "epoch": 0.32701037126108523, + "flos": 28846663643520.0, + "grad_norm": 1.637174889107761, + "language_loss": 0.74795192, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.82531083, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15795898, + "step": 5439, + "time_per_iteration": 2.5655689239501953 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01257087, + "epoch": 0.3270704945137532, + "flos": 27972592577280.0, + "grad_norm": 1.745503595629167, + "language_loss": 0.74950606, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1541748, + "step": 5440, + "time_per_iteration": 2.601821184158325 + }, + { + "auxiliary_loss_clip": 0.06460315, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06277528, + "balance_loss_mlp": 0.01254947, + "epoch": 0.32713061776642116, + "flos": 23696575344000.0, + "grad_norm": 1.95462638600934, + "language_loss": 0.84695202, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.92425048, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.14599609, + "step": 5441, + "time_per_iteration": 2.5020570755004883 + }, + { + "auxiliary_loss_clip": 0.06466734, + "auxiliary_loss_mlp": 0.01272021, + "balance_loss_clip": 0.06280614, + "balance_loss_mlp": 0.01256798, + "epoch": 0.3271907410190891, + "flos": 22462203720960.0, + "grad_norm": 1.9620532707625304, + "language_loss": 0.86928713, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.9466747, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15222168, + "step": 5442, + "time_per_iteration": 2.5388059616088867 + }, + { + "auxiliary_loss_clip": 0.06470812, + "auxiliary_loss_mlp": 0.0126936, + "balance_loss_clip": 0.06282146, + "balance_loss_mlp": 0.01254399, + "epoch": 0.3272508642717571, + "flos": 22826696981760.0, + "grad_norm": 1.5979656279548642, + "language_loss": 0.77388418, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.85128593, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.1496582, + "step": 5443, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.0646731, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06280384, + "balance_loss_mlp": 0.01255518, + "epoch": 0.32731098752442506, + "flos": 11806086977280.0, + "grad_norm": 2.2200780771744073, + "language_loss": 0.82818562, + "learning_rate": 3.142211596174343e-06, + "loss": 0.90556955, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15551758, + "step": 5444, + "time_per_iteration": 2.5514841079711914 + }, + { + "auxiliary_loss_clip": 0.06468201, + "auxiliary_loss_mlp": 0.01274937, + "balance_loss_clip": 0.06282412, + "balance_loss_mlp": 0.01258295, + "epoch": 0.327371110777093, + "flos": 21033300844800.0, + "grad_norm": 2.365977713323657, + "language_loss": 0.59248179, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66991317, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16638184, + "step": 5445, + "time_per_iteration": 2.5325539112091064 + }, + { + "auxiliary_loss_clip": 0.06469189, + "auxiliary_loss_mlp": 0.01278146, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01261278, + "epoch": 0.327431234029761, + "flos": 19068055983360.0, + "grad_norm": 2.7570820492615886, + "language_loss": 0.89260846, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.97008175, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.16870117, + "step": 5446, + "time_per_iteration": 2.576833724975586 + }, + { + "auxiliary_loss_clip": 0.06476346, + "auxiliary_loss_mlp": 0.01274903, + "balance_loss_clip": 0.06282137, + "balance_loss_mlp": 0.01257403, + "epoch": 0.32749135728242895, + "flos": 25856435312640.0, + "grad_norm": 1.9641165872810087, + "language_loss": 0.79404771, + "learning_rate": 3.141252301538802e-06, + "loss": 0.87156022, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 5447, + "time_per_iteration": 2.5539090633392334 + }, + { + "auxiliary_loss_clip": 0.06462374, + "auxiliary_loss_mlp": 0.01278273, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263277, + "epoch": 0.327551480535097, + "flos": 20126721594240.0, + "grad_norm": 1.953936246680755, + "language_loss": 0.73150277, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80890924, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.14990234, + "step": 5448, + "time_per_iteration": 2.633612871170044 + }, + { + "auxiliary_loss_clip": 0.06464307, + "auxiliary_loss_mlp": 0.01272265, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01256291, + "epoch": 0.32761160378776494, + "flos": 28811094785280.0, + "grad_norm": 1.3623614976773524, + "language_loss": 0.67002481, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.74739063, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15966797, + "step": 5449, + "time_per_iteration": 2.5777859687805176 + }, + { + "auxiliary_loss_clip": 0.0647198, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3276717270404329, + "flos": 26944171090560.0, + "grad_norm": 1.378619651715801, + "language_loss": 0.65736711, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.73478758, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15576172, + "step": 5450, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.06468028, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06280884, + "balance_loss_mlp": 0.01258509, + "epoch": 0.32773185029310087, + "flos": 25345557768960.0, + "grad_norm": 7.041147023955008, + "language_loss": 0.77832162, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.85575354, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16662598, + "step": 5451, + "time_per_iteration": 2.572112560272217 + }, + { + "auxiliary_loss_clip": 0.06472664, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06283467, + "balance_loss_mlp": 0.01262042, + "epoch": 0.32779197354576883, + "flos": 26398227813120.0, + "grad_norm": 1.9495025825112327, + "language_loss": 0.70696288, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78447533, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16540527, + "step": 5452, + "time_per_iteration": 2.6081676483154297 + }, + { + "auxiliary_loss_clip": 0.0646618, + "auxiliary_loss_mlp": 0.01272924, + "balance_loss_clip": 0.06283787, + "balance_loss_mlp": 0.01256938, + "epoch": 0.3278520967984368, + "flos": 24906237212160.0, + "grad_norm": 1.6132254933408041, + "language_loss": 0.7924304, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.86982143, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15979004, + "step": 5453, + "time_per_iteration": 2.5893869400024414 + }, + { + "auxiliary_loss_clip": 0.06469721, + "auxiliary_loss_mlp": 0.01274795, + "balance_loss_clip": 0.06282013, + "balance_loss_mlp": 0.01259309, + "epoch": 0.32791222005110476, + "flos": 29760831688320.0, + "grad_norm": 2.0442879632543476, + "language_loss": 0.758448, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.83589315, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.15478516, + "step": 5454, + "time_per_iteration": 2.590080499649048 + }, + { + "auxiliary_loss_clip": 0.06461332, + "auxiliary_loss_mlp": 0.01271865, + "balance_loss_clip": 0.06280516, + "balance_loss_mlp": 0.01257536, + "epoch": 0.32797234330377273, + "flos": 16513584410880.0, + "grad_norm": 2.183253633037468, + "language_loss": 0.77119774, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.8485297, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14318848, + "step": 5455, + "time_per_iteration": 2.4873318672180176 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01285817, + "balance_loss_clip": 0.06290287, + "balance_loss_mlp": 0.01268377, + "epoch": 0.3280324665564407, + "flos": 26585086417920.0, + "grad_norm": 1.6915080932551223, + "language_loss": 0.74407738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.82175708, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.17443848, + "step": 5456, + "time_per_iteration": 2.593258857727051 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01277637, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.01261306, + "epoch": 0.32809258980910866, + "flos": 22936631938560.0, + "grad_norm": 1.4862092693082851, + "language_loss": 0.78666067, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.8641206, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16345215, + "step": 5457, + "time_per_iteration": 2.523540496826172 + }, + { + "auxiliary_loss_clip": 0.06473868, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06281006, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3281527130617766, + "flos": 22790457290880.0, + "grad_norm": 2.0769759307730644, + "language_loss": 0.78958774, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.86707151, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.15795898, + "step": 5458, + "time_per_iteration": 2.552680015563965 + }, + { + "auxiliary_loss_clip": 0.06469774, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06284518, + "balance_loss_mlp": 0.01258215, + "epoch": 0.3282128363144446, + "flos": 21256902264960.0, + "grad_norm": 1.5512978296749391, + "language_loss": 0.73655844, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.8140012, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.1628418, + "step": 5459, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.0647283, + "auxiliary_loss_mlp": 0.01274033, + "balance_loss_clip": 0.0628351, + "balance_loss_mlp": 0.01257761, + "epoch": 0.32827295956711255, + "flos": 30850328401920.0, + "grad_norm": 2.2277675097031993, + "language_loss": 0.84476066, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.92222929, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.16271973, + "step": 5460, + "time_per_iteration": 2.6067721843719482 + }, + { + "auxiliary_loss_clip": 0.06469227, + "auxiliary_loss_mlp": 0.01276293, + "balance_loss_clip": 0.06282166, + "balance_loss_mlp": 0.01260319, + "epoch": 0.3283330828197806, + "flos": 25921032410880.0, + "grad_norm": 2.3722751928185297, + "language_loss": 0.78114808, + "learning_rate": 3.136770448642288e-06, + "loss": 0.8586033, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15991211, + "step": 5461, + "time_per_iteration": 2.550417184829712 + }, + { + "auxiliary_loss_clip": 0.06469681, + "auxiliary_loss_mlp": 0.01279493, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01261361, + "epoch": 0.32839320607244854, + "flos": 38591295672960.0, + "grad_norm": 1.5965953358146812, + "language_loss": 0.62925887, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.70675063, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.18115234, + "step": 5462, + "time_per_iteration": 2.7004194259643555 + }, + { + "auxiliary_loss_clip": 0.06467308, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 0.06284478, + "balance_loss_mlp": 0.01265077, + "epoch": 0.3284533293251165, + "flos": 26658068924160.0, + "grad_norm": 1.3126719376538145, + "language_loss": 0.78502059, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.86250222, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15783691, + "step": 5463, + "time_per_iteration": 2.6072070598602295 + }, + { + "auxiliary_loss_clip": 0.0647091, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06283993, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32851345257778447, + "flos": 15309498839040.0, + "grad_norm": 1.727782559794916, + "language_loss": 0.70068884, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77812445, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.16064453, + "step": 5464, + "time_per_iteration": 2.519319534301758 + }, + { + "auxiliary_loss_clip": 0.06466094, + "auxiliary_loss_mlp": 0.01275271, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01257938, + "epoch": 0.32857357583045244, + "flos": 23520491988480.0, + "grad_norm": 1.6619431416557902, + "language_loss": 0.72759986, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.80501354, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.17333984, + "step": 5465, + "time_per_iteration": 2.573444366455078 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01265509, + "epoch": 0.3286336990831204, + "flos": 21001379639040.0, + "grad_norm": 1.5232981833560715, + "language_loss": 0.82967317, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.90722907, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16271973, + "step": 5466, + "time_per_iteration": 4.012515306472778 + }, + { + "auxiliary_loss_clip": 0.0647275, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.06285034, + "balance_loss_mlp": 0.01254932, + "epoch": 0.32869382233578837, + "flos": 23665450752000.0, + "grad_norm": 1.6606265994221874, + "language_loss": 0.79192597, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86936402, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5467, + "time_per_iteration": 4.000247955322266 + }, + { + "auxiliary_loss_clip": 0.06467809, + "auxiliary_loss_mlp": 0.01271951, + "balance_loss_clip": 0.06279044, + "balance_loss_mlp": 0.01255333, + "epoch": 0.32875394558845633, + "flos": 25343335635840.0, + "grad_norm": 1.5510134892276737, + "language_loss": 0.74865687, + "learning_rate": 3.134526351787587e-06, + "loss": 0.82605445, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.16601562, + "step": 5468, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.06474267, + "auxiliary_loss_mlp": 0.01276703, + "balance_loss_clip": 0.0628129, + "balance_loss_mlp": 0.01259108, + "epoch": 0.3288140688411243, + "flos": 14908430471040.0, + "grad_norm": 1.672146103500693, + "language_loss": 0.78728724, + "learning_rate": 3.134205594339942e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.17614746, + "step": 5469, + "time_per_iteration": 3.955373525619507 + }, + { + "auxiliary_loss_clip": 0.06466976, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01257224, + "epoch": 0.32887419209379226, + "flos": 18557220366720.0, + "grad_norm": 1.6018901390748483, + "language_loss": 0.82183433, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89923656, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16015625, + "step": 5470, + "time_per_iteration": 2.5481319427490234 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01271427, + "balance_loss_clip": 0.06279681, + "balance_loss_mlp": 0.01254869, + "epoch": 0.3289343153464602, + "flos": 48116560913280.0, + "grad_norm": 1.6166643495117736, + "language_loss": 0.68441176, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76180226, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.16564941, + "step": 5471, + "time_per_iteration": 2.780454158782959 + }, + { + "auxiliary_loss_clip": 0.06479289, + "auxiliary_loss_mlp": 0.012789, + "balance_loss_clip": 0.06285035, + "balance_loss_mlp": 0.01260637, + "epoch": 0.3289944385991282, + "flos": 27607763900160.0, + "grad_norm": 1.5078842371471577, + "language_loss": 0.65564525, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.73322713, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.18273926, + "step": 5472, + "time_per_iteration": 2.580644369125366 + }, + { + "auxiliary_loss_clip": 0.06472386, + "auxiliary_loss_mlp": 0.01277133, + "balance_loss_clip": 0.06281875, + "balance_loss_mlp": 0.01259144, + "epoch": 0.32905456185179616, + "flos": 20126470032000.0, + "grad_norm": 1.614198879205061, + "language_loss": 0.88538003, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96287525, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17993164, + "step": 5473, + "time_per_iteration": 4.021254062652588 + }, + { + "auxiliary_loss_clip": 0.06475069, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285396, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3291146851044642, + "flos": 23186075143680.0, + "grad_norm": 1.7643015597930078, + "language_loss": 0.78719336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86464679, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16552734, + "step": 5474, + "time_per_iteration": 2.5416688919067383 + }, + { + "auxiliary_loss_clip": 0.06379573, + "auxiliary_loss_mlp": 0.0134405, + "balance_loss_clip": 0.06291323, + "balance_loss_mlp": 0.01340224, + "epoch": 0.32917480835713214, + "flos": 67641630664320.0, + "grad_norm": 0.8577160187921843, + "language_loss": 0.60258645, + "learning_rate": 3.132280146886911e-06, + "loss": 0.67982268, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03820801, + "step": 5475, + "time_per_iteration": 3.1267805099487305 + }, + { + "auxiliary_loss_clip": 0.06479369, + "auxiliary_loss_mlp": 0.01279647, + "balance_loss_clip": 0.06284596, + "balance_loss_mlp": 0.01261599, + "epoch": 0.3292349316098001, + "flos": 27971963671680.0, + "grad_norm": 3.252822648856248, + "language_loss": 0.7712574, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84884757, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.18041992, + "step": 5476, + "time_per_iteration": 2.5819692611694336 + }, + { + "auxiliary_loss_clip": 0.06469015, + "auxiliary_loss_mlp": 0.01275163, + "balance_loss_clip": 0.06282525, + "balance_loss_mlp": 0.01258956, + "epoch": 0.3292950548624681, + "flos": 20269416297600.0, + "grad_norm": 1.7333439092472165, + "language_loss": 0.7556808, + "learning_rate": 3.131637987449997e-06, + "loss": 0.83312255, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1619873, + "step": 5477, + "time_per_iteration": 2.532106637954712 + }, + { + "auxiliary_loss_clip": 0.06470291, + "auxiliary_loss_mlp": 0.01275718, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01259541, + "epoch": 0.32935517811513604, + "flos": 20819174935680.0, + "grad_norm": 2.104456143380591, + "language_loss": 0.75728148, + "learning_rate": 3.131316843357713e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16174316, + "step": 5478, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.06470281, + "auxiliary_loss_mlp": 0.01278094, + "balance_loss_clip": 0.06287058, + "balance_loss_mlp": 0.01261631, + "epoch": 0.329415301367804, + "flos": 18447704680320.0, + "grad_norm": 2.368560120299576, + "language_loss": 0.80772918, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8852129, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16455078, + "step": 5479, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.06362775, + "auxiliary_loss_mlp": 0.01272199, + "balance_loss_clip": 0.06275004, + "balance_loss_mlp": 0.01268579, + "epoch": 0.32947542462047197, + "flos": 66344967930240.0, + "grad_norm": 0.7366188072531391, + "language_loss": 0.56333017, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.63967991, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.03616333, + "step": 5480, + "time_per_iteration": 3.2369706630706787 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.01278618, + "balance_loss_clip": 0.06290235, + "balance_loss_mlp": 0.01262179, + "epoch": 0.32953554787313993, + "flos": 23228268474240.0, + "grad_norm": 1.631877255513098, + "language_loss": 0.7736274, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.85118574, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16442871, + "step": 5481, + "time_per_iteration": 2.5206968784332275 + }, + { + "auxiliary_loss_clip": 0.06479073, + "auxiliary_loss_mlp": 0.01277292, + "balance_loss_clip": 0.0628771, + "balance_loss_mlp": 0.01260686, + "epoch": 0.3295956711258079, + "flos": 27015686150400.0, + "grad_norm": 1.3752047504599005, + "language_loss": 0.78639877, + "learning_rate": 3.130031838113899e-06, + "loss": 0.86396235, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.16601562, + "step": 5482, + "time_per_iteration": 2.604720115661621 + }, + { + "auxiliary_loss_clip": 0.06475698, + "auxiliary_loss_mlp": 0.01274916, + "balance_loss_clip": 0.06286834, + "balance_loss_mlp": 0.01258274, + "epoch": 0.32965579437847586, + "flos": 19177697450880.0, + "grad_norm": 2.0027782692889358, + "language_loss": 0.74399549, + "learning_rate": 3.129710479645185e-06, + "loss": 0.82150161, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16662598, + "step": 5483, + "time_per_iteration": 2.5124409198760986 + }, + { + "auxiliary_loss_clip": 0.06472629, + "auxiliary_loss_mlp": 0.01273838, + "balance_loss_clip": 0.06286867, + "balance_loss_mlp": 0.01258472, + "epoch": 0.32971591763114383, + "flos": 30490447115520.0, + "grad_norm": 1.7640387903996015, + "language_loss": 0.7588225, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83628714, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15368652, + "step": 5484, + "time_per_iteration": 2.64021635055542 + }, + { + "auxiliary_loss_clip": 0.06469439, + "auxiliary_loss_mlp": 0.01274788, + "balance_loss_clip": 0.06284587, + "balance_loss_mlp": 0.01259232, + "epoch": 0.3297760408838118, + "flos": 16295140016640.0, + "grad_norm": 1.7787654746377481, + "language_loss": 0.72680974, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15563965, + "step": 5485, + "time_per_iteration": 2.516080379486084 + }, + { + "auxiliary_loss_clip": 0.06466281, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06281459, + "balance_loss_mlp": 0.0125991, + "epoch": 0.32983616413647976, + "flos": 29538194590080.0, + "grad_norm": 2.336444213272706, + "language_loss": 0.80720758, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8846184, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5486, + "time_per_iteration": 2.633730173110962 + }, + { + "auxiliary_loss_clip": 0.06467714, + "auxiliary_loss_mlp": 0.01276658, + "balance_loss_clip": 0.06283799, + "balance_loss_mlp": 0.01261828, + "epoch": 0.3298962873891478, + "flos": 20637682992000.0, + "grad_norm": 1.9361428819205904, + "language_loss": 0.84726417, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.92470789, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14819336, + "step": 5487, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.06473765, + "auxiliary_loss_mlp": 0.01275689, + "balance_loss_clip": 0.06283425, + "balance_loss_mlp": 0.01258845, + "epoch": 0.32995641064181574, + "flos": 14981329123200.0, + "grad_norm": 2.0510786453666707, + "language_loss": 0.74805683, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.82555139, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16833496, + "step": 5488, + "time_per_iteration": 2.5195999145507812 + }, + { + "auxiliary_loss_clip": 0.06475645, + "auxiliary_loss_mlp": 0.01276585, + "balance_loss_clip": 0.06288432, + "balance_loss_mlp": 0.012611, + "epoch": 0.3300165338944837, + "flos": 18667448812800.0, + "grad_norm": 2.2567239989743912, + "language_loss": 0.73048651, + "learning_rate": 3.127781429646098e-06, + "loss": 0.80800879, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.15466309, + "step": 5489, + "time_per_iteration": 2.489529609680176 + }, + { + "auxiliary_loss_clip": 0.06468415, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 0.06282636, + "balance_loss_mlp": 0.01260987, + "epoch": 0.3300766571471517, + "flos": 25589215042560.0, + "grad_norm": 2.1838257682132256, + "language_loss": 0.89381063, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97125351, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.14898682, + "step": 5490, + "time_per_iteration": 2.583505153656006 + }, + { + "auxiliary_loss_clip": 0.06470391, + "auxiliary_loss_mlp": 0.01273693, + "balance_loss_clip": 0.06285221, + "balance_loss_mlp": 0.01258339, + "epoch": 0.33013678039981964, + "flos": 11368150012800.0, + "grad_norm": 1.8708534793530802, + "language_loss": 0.82974613, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90718699, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15344238, + "step": 5491, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.06473103, + "auxiliary_loss_mlp": 0.01274646, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.01258589, + "epoch": 0.3301969036524876, + "flos": 24827175285120.0, + "grad_norm": 1.8609460693795263, + "language_loss": 0.77910721, + "learning_rate": 3.126816327146554e-06, + "loss": 0.85658479, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16052246, + "step": 5492, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.06478797, + "auxiliary_loss_mlp": 0.01277822, + "balance_loss_clip": 0.06287751, + "balance_loss_mlp": 0.01261324, + "epoch": 0.33025702690515557, + "flos": 15966634884480.0, + "grad_norm": 2.4722908606070875, + "language_loss": 0.75614154, + "learning_rate": 3.12649454083913e-06, + "loss": 0.83370769, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16503906, + "step": 5493, + "time_per_iteration": 2.489143133163452 + }, + { + "auxiliary_loss_clip": 0.06366986, + "auxiliary_loss_mlp": 0.01258616, + "balance_loss_clip": 0.06280049, + "balance_loss_mlp": 0.0125515, + "epoch": 0.33031715015782354, + "flos": 59435794540800.0, + "grad_norm": 0.7878547289977352, + "language_loss": 0.54030049, + "learning_rate": 3.12617271181492e-06, + "loss": 0.61655653, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.03475952, + "step": 5494, + "time_per_iteration": 3.0869832038879395 + }, + { + "auxiliary_loss_clip": 0.06482484, + "auxiliary_loss_mlp": 0.01281394, + "balance_loss_clip": 0.0629174, + "balance_loss_mlp": 0.01264753, + "epoch": 0.3303772734104915, + "flos": 23190896753280.0, + "grad_norm": 1.4215593277180028, + "language_loss": 0.87367666, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9513154, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16625977, + "step": 5495, + "time_per_iteration": 2.5188820362091064 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01260038, + "epoch": 0.33043739666315947, + "flos": 33080068275840.0, + "grad_norm": 2.0083800771900995, + "language_loss": 0.74168754, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.81923461, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17443848, + "step": 5496, + "time_per_iteration": 2.6151347160339355 + }, + { + "auxiliary_loss_clip": 0.06470463, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.0628539, + "balance_loss_mlp": 0.01256434, + "epoch": 0.33049751991582743, + "flos": 24901625237760.0, + "grad_norm": 1.9468549986980455, + "language_loss": 0.72676557, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80419219, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15759277, + "step": 5497, + "time_per_iteration": 2.51874041557312 + }, + { + "auxiliary_loss_clip": 0.06472345, + "auxiliary_loss_mlp": 0.0127459, + "balance_loss_clip": 0.06286049, + "balance_loss_mlp": 0.01259343, + "epoch": 0.3305576431684954, + "flos": 29468272757760.0, + "grad_norm": 1.8137955115189202, + "language_loss": 0.80825889, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88572824, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 5498, + "time_per_iteration": 2.6010656356811523 + }, + { + "auxiliary_loss_clip": 0.06476308, + "auxiliary_loss_mlp": 0.0127559, + "balance_loss_clip": 0.0628619, + "balance_loss_mlp": 0.01258281, + "epoch": 0.33061776642116336, + "flos": 22637951660160.0, + "grad_norm": 1.8227647554707032, + "language_loss": 0.76843095, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84594989, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.1730957, + "step": 5499, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06475572, + "auxiliary_loss_mlp": 0.01277032, + "balance_loss_clip": 0.06287447, + "balance_loss_mlp": 0.01260832, + "epoch": 0.3306778896738313, + "flos": 25783536660480.0, + "grad_norm": 1.5377855738322084, + "language_loss": 0.79203349, + "learning_rate": 3.124240841300681e-06, + "loss": 0.86955953, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16186523, + "step": 5500, + "time_per_iteration": 2.5970370769500732 + }, + { + "auxiliary_loss_clip": 0.0648918, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298861, + "balance_loss_mlp": 0.01257544, + "epoch": 0.33073801292649935, + "flos": 36949566625920.0, + "grad_norm": 1.9211086255091194, + "language_loss": 0.66916561, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7468102, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17724609, + "step": 5501, + "time_per_iteration": 2.687847375869751 + }, + { + "auxiliary_loss_clip": 0.06481969, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06291866, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3307981361791673, + "flos": 12972465411840.0, + "grad_norm": 2.0893698607967957, + "language_loss": 0.77978551, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.85733795, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.17504883, + "step": 5502, + "time_per_iteration": 2.500303268432617 + }, + { + "auxiliary_loss_clip": 0.06481159, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06290131, + "balance_loss_mlp": 0.01256424, + "epoch": 0.3308582594318353, + "flos": 25381420116480.0, + "grad_norm": 1.7450780858535315, + "language_loss": 0.72841054, + "learning_rate": 3.123274330355824e-06, + "loss": 0.80596423, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.17773438, + "step": 5503, + "time_per_iteration": 2.5851874351501465 + }, + { + "auxiliary_loss_clip": 0.06475106, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.06287622, + "balance_loss_mlp": 0.01257769, + "epoch": 0.33091838268450324, + "flos": 26475738439680.0, + "grad_norm": 1.4901464435255347, + "language_loss": 0.7565586, + "learning_rate": 3.12295207483523e-06, + "loss": 0.83405411, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16674805, + "step": 5504, + "time_per_iteration": 2.5670559406280518 + }, + { + "auxiliary_loss_clip": 0.06476955, + "auxiliary_loss_mlp": 0.01276594, + "balance_loss_clip": 0.06289346, + "balance_loss_mlp": 0.01261025, + "epoch": 0.3309785059371712, + "flos": 24977836126080.0, + "grad_norm": 1.5646403370775293, + "language_loss": 0.70214427, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.77967972, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15545654, + "step": 5505, + "time_per_iteration": 2.628267288208008 + }, + { + "auxiliary_loss_clip": 0.06474259, + "auxiliary_loss_mlp": 0.01275018, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01258543, + "epoch": 0.3310386291898392, + "flos": 20452585322880.0, + "grad_norm": 1.7982072656373813, + "language_loss": 0.8240785, + "learning_rate": 3.122307436058899e-06, + "loss": 0.90157127, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.16467285, + "step": 5506, + "time_per_iteration": 4.10949444770813 + }, + { + "auxiliary_loss_clip": 0.06476486, + "auxiliary_loss_mlp": 0.01275135, + "balance_loss_clip": 0.0628888, + "balance_loss_mlp": 0.01258428, + "epoch": 0.33109875244250714, + "flos": 23188926182400.0, + "grad_norm": 1.740251919086934, + "language_loss": 0.79860532, + "learning_rate": 3.121985052827606e-06, + "loss": 0.87612152, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16705322, + "step": 5507, + "time_per_iteration": 4.12217903137207 + }, + { + "auxiliary_loss_clip": 0.06468768, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06281893, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3311588756951751, + "flos": 24174902776320.0, + "grad_norm": 1.6433149866128014, + "language_loss": 0.71967649, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.79713166, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.1628418, + "step": 5508, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.06468692, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 0.06284875, + "balance_loss_mlp": 0.01256788, + "epoch": 0.33121899894784307, + "flos": 28152994417920.0, + "grad_norm": 1.6757523088462936, + "language_loss": 0.71588784, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79329687, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15429688, + "step": 5509, + "time_per_iteration": 3.976996660232544 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06286702, + "balance_loss_mlp": 0.01264396, + "epoch": 0.33127912220051103, + "flos": 29574979332480.0, + "grad_norm": 1.5753317257606638, + "language_loss": 0.73806137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.81557631, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15460205, + "step": 5510, + "time_per_iteration": 2.576838731765747 + }, + { + "auxiliary_loss_clip": 0.06473264, + "auxiliary_loss_mlp": 0.01276647, + "balance_loss_clip": 0.06286872, + "balance_loss_mlp": 0.01261019, + "epoch": 0.331339245453179, + "flos": 14434086107520.0, + "grad_norm": 2.529546935928515, + "language_loss": 0.88507652, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96257567, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15612793, + "step": 5511, + "time_per_iteration": 2.550442695617676 + }, + { + "auxiliary_loss_clip": 0.06464168, + "auxiliary_loss_mlp": 0.01275515, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01260494, + "epoch": 0.33139936870584696, + "flos": 20893499107200.0, + "grad_norm": 1.6341387009287651, + "language_loss": 0.73559558, + "learning_rate": 3.12037249872891e-06, + "loss": 0.81299245, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15026855, + "step": 5512, + "time_per_iteration": 2.5596871376037598 + }, + { + "auxiliary_loss_clip": 0.06468001, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06286225, + "balance_loss_mlp": 0.01262438, + "epoch": 0.33145949195851493, + "flos": 36293352975360.0, + "grad_norm": 1.8738374179289, + "language_loss": 0.72677827, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.80424166, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15905762, + "step": 5513, + "time_per_iteration": 4.148774147033691 + }, + { + "auxiliary_loss_clip": 0.06472933, + "auxiliary_loss_mlp": 0.01275876, + "balance_loss_clip": 0.06284368, + "balance_loss_mlp": 0.0125958, + "epoch": 0.33151961521118295, + "flos": 14284431515520.0, + "grad_norm": 1.8311253656567958, + "language_loss": 0.69026303, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.7677511, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16296387, + "step": 5514, + "time_per_iteration": 2.486818313598633 + }, + { + "auxiliary_loss_clip": 0.06477968, + "auxiliary_loss_mlp": 0.0127816, + "balance_loss_clip": 0.06291951, + "balance_loss_mlp": 0.01261089, + "epoch": 0.3315797384638509, + "flos": 20780126133120.0, + "grad_norm": 1.9656560392088134, + "language_loss": 0.66393441, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.74149573, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.17053223, + "step": 5515, + "time_per_iteration": 2.531658411026001 + }, + { + "auxiliary_loss_clip": 0.06473279, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01258885, + "epoch": 0.3316398617165189, + "flos": 24686115736320.0, + "grad_norm": 3.8914339391091732, + "language_loss": 0.69369388, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.77117789, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16235352, + "step": 5516, + "time_per_iteration": 2.5392425060272217 + }, + { + "auxiliary_loss_clip": 0.06476592, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.0125959, + "epoch": 0.33169998496918685, + "flos": 18593879328000.0, + "grad_norm": 2.757231582138207, + "language_loss": 0.80914545, + "learning_rate": 3.118758882514359e-06, + "loss": 0.88666099, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.15368652, + "step": 5517, + "time_per_iteration": 2.4851818084716797 + }, + { + "auxiliary_loss_clip": 0.06465174, + "auxiliary_loss_mlp": 0.01279818, + "balance_loss_clip": 0.06284687, + "balance_loss_mlp": 0.01264142, + "epoch": 0.3317601082218548, + "flos": 20199871808640.0, + "grad_norm": 1.6705032998917397, + "language_loss": 0.74656814, + "learning_rate": 3.118436031952143e-06, + "loss": 0.82401806, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15686035, + "step": 5518, + "time_per_iteration": 2.518036127090454 + }, + { + "auxiliary_loss_clip": 0.06372921, + "auxiliary_loss_mlp": 0.01283465, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01279764, + "epoch": 0.3318202314745228, + "flos": 68995119265920.0, + "grad_norm": 0.7149144856696655, + "language_loss": 0.54263318, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.61919701, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03692627, + "step": 5519, + "time_per_iteration": 3.246586322784424 + }, + { + "auxiliary_loss_clip": 0.06472577, + "auxiliary_loss_mlp": 0.01276695, + "balance_loss_clip": 0.06288108, + "balance_loss_mlp": 0.0125966, + "epoch": 0.33188035472719074, + "flos": 21505381148160.0, + "grad_norm": 2.182658812554146, + "language_loss": 0.79452467, + "learning_rate": 3.117790203606336e-06, + "loss": 0.87201744, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.17028809, + "step": 5520, + "time_per_iteration": 2.517853260040283 + }, + { + "auxiliary_loss_clip": 0.06465811, + "auxiliary_loss_mlp": 0.01271287, + "balance_loss_clip": 0.06283027, + "balance_loss_mlp": 0.01256279, + "epoch": 0.3319404779798587, + "flos": 28877033548800.0, + "grad_norm": 1.8300903967069966, + "language_loss": 0.77067709, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.84804809, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15002441, + "step": 5521, + "time_per_iteration": 2.555697441101074 + }, + { + "auxiliary_loss_clip": 0.06478226, + "auxiliary_loss_mlp": 0.01278256, + "balance_loss_clip": 0.06288885, + "balance_loss_mlp": 0.01261542, + "epoch": 0.33200060123252667, + "flos": 23083770908160.0, + "grad_norm": 1.9119948906690396, + "language_loss": 0.70441258, + "learning_rate": 3.117144205713664e-06, + "loss": 0.78197736, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16699219, + "step": 5522, + "time_per_iteration": 2.5673933029174805 + }, + { + "auxiliary_loss_clip": 0.06474358, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06290573, + "balance_loss_mlp": 0.01255255, + "epoch": 0.33206072448519464, + "flos": 21148895952000.0, + "grad_norm": 1.6906348218339255, + "language_loss": 0.74640656, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82386148, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 5523, + "time_per_iteration": 2.516275405883789 + }, + { + "auxiliary_loss_clip": 0.06473421, + "auxiliary_loss_mlp": 0.01271212, + "balance_loss_clip": 0.06292297, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3321208477378626, + "flos": 13084161304320.0, + "grad_norm": 2.1726495268835024, + "language_loss": 0.82172406, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8991704, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15454102, + "step": 5524, + "time_per_iteration": 2.557941198348999 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289522, + "balance_loss_mlp": 0.01257251, + "epoch": 0.33218097099053057, + "flos": 21221836531200.0, + "grad_norm": 1.6566666481357326, + "language_loss": 0.83100772, + "learning_rate": 3.116174891188636e-06, + "loss": 0.90847051, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15649414, + "step": 5525, + "time_per_iteration": 2.527944564819336 + }, + { + "auxiliary_loss_clip": 0.06379532, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06292765, + "balance_loss_mlp": 0.01261484, + "epoch": 0.33224109424319853, + "flos": 64369954068480.0, + "grad_norm": 0.7407224947932968, + "language_loss": 0.52533764, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.60178727, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03945923, + "step": 5526, + "time_per_iteration": 3.1679162979125977 + }, + { + "auxiliary_loss_clip": 0.0647909, + "auxiliary_loss_mlp": 0.01274604, + "balance_loss_clip": 0.06291543, + "balance_loss_mlp": 0.01258391, + "epoch": 0.33230121749586655, + "flos": 17351457713280.0, + "grad_norm": 1.970764365513445, + "language_loss": 0.79041827, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86795521, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 5527, + "time_per_iteration": 2.5327274799346924 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01278538, + "balance_loss_clip": 0.06294803, + "balance_loss_mlp": 0.01263458, + "epoch": 0.3323613407485345, + "flos": 21003517918080.0, + "grad_norm": 1.6591522480418575, + "language_loss": 0.72383821, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80139363, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15063477, + "step": 5528, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.06477713, + "auxiliary_loss_mlp": 0.01274869, + "balance_loss_clip": 0.06292165, + "balance_loss_mlp": 0.01259396, + "epoch": 0.3324214640012025, + "flos": 13157688862080.0, + "grad_norm": 1.8543805866880412, + "language_loss": 0.8336091, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.91113496, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.15466309, + "step": 5529, + "time_per_iteration": 2.5001087188720703 + }, + { + "auxiliary_loss_clip": 0.06479646, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254587, + "epoch": 0.33248158725387045, + "flos": 22280124798720.0, + "grad_norm": 1.7380748666321508, + "language_loss": 0.70133483, + "learning_rate": 3.114558520634423e-06, + "loss": 0.77883273, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.15551758, + "step": 5530, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01275357, + "balance_loss_clip": 0.06291899, + "balance_loss_mlp": 0.01258751, + "epoch": 0.3325417105065384, + "flos": 20747324459520.0, + "grad_norm": 2.7342028000668552, + "language_loss": 0.77694213, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.85449082, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16589355, + "step": 5531, + "time_per_iteration": 2.5307323932647705 + }, + { + "auxiliary_loss_clip": 0.06477839, + "auxiliary_loss_mlp": 0.01280766, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01263552, + "epoch": 0.3326018337592064, + "flos": 24797476212480.0, + "grad_norm": 1.9473942094883194, + "language_loss": 0.73779702, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.81538308, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17211914, + "step": 5532, + "time_per_iteration": 2.5989890098571777 + }, + { + "auxiliary_loss_clip": 0.06472681, + "auxiliary_loss_mlp": 0.01278728, + "balance_loss_clip": 0.06288014, + "balance_loss_mlp": 0.01263147, + "epoch": 0.33266195701187434, + "flos": 14506942832640.0, + "grad_norm": 1.825417572799306, + "language_loss": 0.66042602, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.73794013, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15576172, + "step": 5533, + "time_per_iteration": 2.47566294670105 + }, + { + "auxiliary_loss_clip": 0.06474279, + "auxiliary_loss_mlp": 0.012755, + "balance_loss_clip": 0.06289338, + "balance_loss_mlp": 0.01258954, + "epoch": 0.3327220802645423, + "flos": 15309792328320.0, + "grad_norm": 1.6677538876536442, + "language_loss": 0.71568084, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79317868, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16552734, + "step": 5534, + "time_per_iteration": 2.5140762329101562 + }, + { + "auxiliary_loss_clip": 0.06474573, + "auxiliary_loss_mlp": 0.01273002, + "balance_loss_clip": 0.06290095, + "balance_loss_mlp": 0.01257088, + "epoch": 0.3327822035172103, + "flos": 23484336151680.0, + "grad_norm": 1.635346823223845, + "language_loss": 0.67885029, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.75632608, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15917969, + "step": 5535, + "time_per_iteration": 2.522270917892456 + }, + { + "auxiliary_loss_clip": 0.0647034, + "auxiliary_loss_mlp": 0.01273438, + "balance_loss_clip": 0.06284929, + "balance_loss_mlp": 0.01257547, + "epoch": 0.33284232676987824, + "flos": 25381587824640.0, + "grad_norm": 2.3715726564419155, + "language_loss": 0.72782886, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80526668, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5536, + "time_per_iteration": 2.5831825733184814 + }, + { + "auxiliary_loss_clip": 0.06470598, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01255464, + "epoch": 0.3329024500225462, + "flos": 23700851902080.0, + "grad_norm": 1.6831469867631554, + "language_loss": 0.81958938, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89700401, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15405273, + "step": 5537, + "time_per_iteration": 2.520211935043335 + }, + { + "auxiliary_loss_clip": 0.06473641, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06284811, + "balance_loss_mlp": 0.01253799, + "epoch": 0.33296257327521417, + "flos": 31731317429760.0, + "grad_norm": 1.8576028267218818, + "language_loss": 0.71933794, + "learning_rate": 3.111970130648789e-06, + "loss": 0.79677737, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16491699, + "step": 5538, + "time_per_iteration": 2.6061229705810547 + }, + { + "auxiliary_loss_clip": 0.06466128, + "auxiliary_loss_mlp": 0.01271828, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01256784, + "epoch": 0.33302269652788213, + "flos": 22750863436800.0, + "grad_norm": 1.8542539639588682, + "language_loss": 0.75063813, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82801771, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15039062, + "step": 5539, + "time_per_iteration": 2.5176634788513184 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06284824, + "balance_loss_mlp": 0.01255739, + "epoch": 0.33308281978055015, + "flos": 11478546167040.0, + "grad_norm": 1.8040392528519402, + "language_loss": 0.71489209, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79235446, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16699219, + "step": 5540, + "time_per_iteration": 2.536752939224243 + }, + { + "auxiliary_loss_clip": 0.06462967, + "auxiliary_loss_mlp": 0.01271775, + "balance_loss_clip": 0.06280267, + "balance_loss_mlp": 0.01256576, + "epoch": 0.3331429430332181, + "flos": 38222274291840.0, + "grad_norm": 3.095851444688792, + "language_loss": 0.60970843, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.68705589, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15197754, + "step": 5541, + "time_per_iteration": 2.6592354774475098 + }, + { + "auxiliary_loss_clip": 0.06472225, + "auxiliary_loss_mlp": 0.01276024, + "balance_loss_clip": 0.06284402, + "balance_loss_mlp": 0.01259872, + "epoch": 0.3332030662858861, + "flos": 22535270081280.0, + "grad_norm": 1.770287690308821, + "language_loss": 0.69711685, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77459931, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16149902, + "step": 5542, + "time_per_iteration": 2.5427184104919434 + }, + { + "auxiliary_loss_clip": 0.06473213, + "auxiliary_loss_mlp": 0.01276881, + "balance_loss_clip": 0.06286451, + "balance_loss_mlp": 0.01261658, + "epoch": 0.33326318953855405, + "flos": 16003293845760.0, + "grad_norm": 1.6729265705607443, + "language_loss": 0.75927889, + "learning_rate": 3.110351016113414e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15222168, + "step": 5543, + "time_per_iteration": 2.4745616912841797 + }, + { + "auxiliary_loss_clip": 0.06475509, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06287046, + "balance_loss_mlp": 0.01260281, + "epoch": 0.333323312791222, + "flos": 25600661124480.0, + "grad_norm": 1.7242995092969657, + "language_loss": 0.75332278, + "learning_rate": 3.110027066843348e-06, + "loss": 0.83084685, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.16601562, + "step": 5544, + "time_per_iteration": 2.565572738647461 + }, + { + "auxiliary_loss_clip": 0.06467521, + "auxiliary_loss_mlp": 0.01270286, + "balance_loss_clip": 0.06283619, + "balance_loss_mlp": 0.01254848, + "epoch": 0.33338343604389, + "flos": 25126652177280.0, + "grad_norm": 1.4364166263140996, + "language_loss": 0.71556139, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.1541748, + "step": 5545, + "time_per_iteration": 3.9951117038726807 + }, + { + "auxiliary_loss_clip": 0.0646642, + "auxiliary_loss_mlp": 0.01275763, + "balance_loss_clip": 0.0628425, + "balance_loss_mlp": 0.01260063, + "epoch": 0.33344355929655795, + "flos": 16953114602880.0, + "grad_norm": 1.5928525652704049, + "language_loss": 0.69892073, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.77634251, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15722656, + "step": 5546, + "time_per_iteration": 4.069552659988403 + }, + { + "auxiliary_loss_clip": 0.06469481, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 0.06280591, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3335036825492259, + "flos": 27896675178240.0, + "grad_norm": 1.5973320112543803, + "language_loss": 0.65030676, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.72773933, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16455078, + "step": 5547, + "time_per_iteration": 2.578320026397705 + }, + { + "auxiliary_loss_clip": 0.06468174, + "auxiliary_loss_mlp": 0.01274769, + "balance_loss_clip": 0.06284153, + "balance_loss_mlp": 0.01258736, + "epoch": 0.3335638058018939, + "flos": 16184995424640.0, + "grad_norm": 1.9789366990729325, + "language_loss": 0.85645819, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.9338876, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.16040039, + "step": 5548, + "time_per_iteration": 3.917346477508545 + }, + { + "auxiliary_loss_clip": 0.06473708, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06283803, + "balance_loss_mlp": 0.01259264, + "epoch": 0.33362392905456184, + "flos": 39905651617920.0, + "grad_norm": 1.927393858225298, + "language_loss": 0.74956143, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.82705271, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16149902, + "step": 5549, + "time_per_iteration": 2.662152051925659 + }, + { + "auxiliary_loss_clip": 0.0647629, + "auxiliary_loss_mlp": 0.01276829, + "balance_loss_clip": 0.06287523, + "balance_loss_mlp": 0.0125946, + "epoch": 0.3336840523072298, + "flos": 44280954339840.0, + "grad_norm": 3.284743863263659, + "language_loss": 0.68874133, + "learning_rate": 3.108082487713921e-06, + "loss": 0.76627254, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.17370605, + "step": 5550, + "time_per_iteration": 2.703099250793457 + }, + { + "auxiliary_loss_clip": 0.06476407, + "auxiliary_loss_mlp": 0.01275354, + "balance_loss_clip": 0.06290508, + "balance_loss_mlp": 0.01259488, + "epoch": 0.33374417555989777, + "flos": 15091054444800.0, + "grad_norm": 2.6465919002896436, + "language_loss": 0.60992151, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6874392, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5551, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.06471356, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06287605, + "balance_loss_mlp": 0.01259985, + "epoch": 0.33380429881256574, + "flos": 15854226232320.0, + "grad_norm": 1.6170207033712265, + "language_loss": 0.71155131, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.78901786, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15307617, + "step": 5552, + "time_per_iteration": 4.0786826610565186 + }, + { + "auxiliary_loss_clip": 0.06476602, + "auxiliary_loss_mlp": 0.01270143, + "balance_loss_clip": 0.06291272, + "balance_loss_mlp": 0.01255182, + "epoch": 0.33386442206523376, + "flos": 13485439307520.0, + "grad_norm": 2.244029622012826, + "language_loss": 0.83864999, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91611743, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.1496582, + "step": 5553, + "time_per_iteration": 2.603986978530884 + }, + { + "auxiliary_loss_clip": 0.06474789, + "auxiliary_loss_mlp": 0.0127187, + "balance_loss_clip": 0.06288507, + "balance_loss_mlp": 0.01255562, + "epoch": 0.3339245453179017, + "flos": 16696250311680.0, + "grad_norm": 2.098616423404285, + "language_loss": 0.81424135, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89170802, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16320801, + "step": 5554, + "time_per_iteration": 2.4884121417999268 + }, + { + "auxiliary_loss_clip": 0.06477922, + "auxiliary_loss_mlp": 0.01277907, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01261647, + "epoch": 0.3339846685705697, + "flos": 24617954839680.0, + "grad_norm": 1.4369599322997015, + "language_loss": 0.81866252, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89622086, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.16259766, + "step": 5555, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.06478396, + "auxiliary_loss_mlp": 0.01271619, + "balance_loss_clip": 0.06292441, + "balance_loss_mlp": 0.01256325, + "epoch": 0.33404479182323765, + "flos": 30961311534720.0, + "grad_norm": 1.7387044564853729, + "language_loss": 0.74836755, + "learning_rate": 3.106136395915099e-06, + "loss": 0.82586771, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.1529541, + "step": 5556, + "time_per_iteration": 2.5936899185180664 + }, + { + "auxiliary_loss_clip": 0.06476042, + "auxiliary_loss_mlp": 0.01275785, + "balance_loss_clip": 0.06293188, + "balance_loss_mlp": 0.01260562, + "epoch": 0.3341049150759056, + "flos": 23519988864000.0, + "grad_norm": 1.3815052276914728, + "language_loss": 0.82545519, + "learning_rate": 3.105811900403391e-06, + "loss": 0.90297353, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15222168, + "step": 5557, + "time_per_iteration": 2.5862598419189453 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01279505, + "balance_loss_clip": 0.0629133, + "balance_loss_mlp": 0.01264067, + "epoch": 0.3341650383285736, + "flos": 24034052862720.0, + "grad_norm": 2.760917503655681, + "language_loss": 0.80188966, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.87946206, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.15429688, + "step": 5558, + "time_per_iteration": 2.596344232559204 + }, + { + "auxiliary_loss_clip": 0.06475051, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06287208, + "balance_loss_mlp": 0.01267646, + "epoch": 0.33422516158124155, + "flos": 24909255959040.0, + "grad_norm": 1.7423955567809428, + "language_loss": 0.81954122, + "learning_rate": 3.105162783594788e-06, + "loss": 0.8971197, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1517334, + "step": 5559, + "time_per_iteration": 2.587005376815796 + }, + { + "auxiliary_loss_clip": 0.06467593, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 0.06286522, + "balance_loss_mlp": 0.01265224, + "epoch": 0.3342852848339095, + "flos": 18339404878080.0, + "grad_norm": 2.1220335034517093, + "language_loss": 0.72058392, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.79805756, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.14550781, + "step": 5560, + "time_per_iteration": 2.536546230316162 + }, + { + "auxiliary_loss_clip": 0.06481705, + "auxiliary_loss_mlp": 0.01285397, + "balance_loss_clip": 0.06292065, + "balance_loss_mlp": 0.01269458, + "epoch": 0.3343454080865775, + "flos": 30054690357120.0, + "grad_norm": 1.596178779859494, + "language_loss": 0.75386882, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.83153981, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.15930176, + "step": 5561, + "time_per_iteration": 2.672700881958008 + }, + { + "auxiliary_loss_clip": 0.06477022, + "auxiliary_loss_mlp": 0.01277798, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01262551, + "epoch": 0.33440553133924544, + "flos": 16404362213760.0, + "grad_norm": 1.6462526862455489, + "language_loss": 0.70108986, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77863806, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15246582, + "step": 5562, + "time_per_iteration": 2.501317024230957 + }, + { + "auxiliary_loss_clip": 0.06472157, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 0.06287345, + "balance_loss_mlp": 0.01265396, + "epoch": 0.3344656545919134, + "flos": 24248723823360.0, + "grad_norm": 1.5361546803562123, + "language_loss": 0.65648419, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.7340101, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15026855, + "step": 5563, + "time_per_iteration": 2.5564165115356445 + }, + { + "auxiliary_loss_clip": 0.06477885, + "auxiliary_loss_mlp": 0.01282181, + "balance_loss_clip": 0.06290222, + "balance_loss_mlp": 0.01264752, + "epoch": 0.3345257778445814, + "flos": 52130431048320.0, + "grad_norm": 1.3531042812140452, + "language_loss": 0.74246049, + "learning_rate": 3.103539258400766e-06, + "loss": 0.82006115, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.17431641, + "step": 5564, + "time_per_iteration": 2.810534715652466 + }, + { + "auxiliary_loss_clip": 0.06356741, + "auxiliary_loss_mlp": 0.01295627, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01291562, + "epoch": 0.33458590109724934, + "flos": 68066528319360.0, + "grad_norm": 0.78222915395806, + "language_loss": 0.55275309, + "learning_rate": 3.103214427773745e-06, + "loss": 0.62927675, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04064941, + "step": 5565, + "time_per_iteration": 3.1279821395874023 + }, + { + "auxiliary_loss_clip": 0.06471252, + "auxiliary_loss_mlp": 0.01279791, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01264163, + "epoch": 0.3346460243499173, + "flos": 37423869062400.0, + "grad_norm": 1.705115292174207, + "language_loss": 0.65565574, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73316622, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15625, + "step": 5566, + "time_per_iteration": 2.712435245513916 + }, + { + "auxiliary_loss_clip": 0.0647177, + "auxiliary_loss_mlp": 0.01282122, + "balance_loss_clip": 0.06289912, + "balance_loss_mlp": 0.01266529, + "epoch": 0.3347061476025853, + "flos": 18703269233280.0, + "grad_norm": 1.6655571733561654, + "language_loss": 0.77372861, + "learning_rate": 3.102564641030016e-06, + "loss": 0.85126758, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.15588379, + "step": 5567, + "time_per_iteration": 2.4871251583099365 + }, + { + "auxiliary_loss_clip": 0.06471208, + "auxiliary_loss_mlp": 0.01275703, + "balance_loss_clip": 0.06285998, + "balance_loss_mlp": 0.01259491, + "epoch": 0.3347662708552533, + "flos": 13922957001600.0, + "grad_norm": 1.6558873666299474, + "language_loss": 0.77099127, + "learning_rate": 3.102239684937949e-06, + "loss": 0.84846038, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16223145, + "step": 5568, + "time_per_iteration": 2.5343427658081055 + }, + { + "auxiliary_loss_clip": 0.06472506, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06286565, + "balance_loss_mlp": 0.01262136, + "epoch": 0.33482639410792125, + "flos": 19755645788160.0, + "grad_norm": 1.9310298365294178, + "language_loss": 0.71334505, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7908479, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15643311, + "step": 5569, + "time_per_iteration": 2.5091118812561035 + }, + { + "auxiliary_loss_clip": 0.06473939, + "auxiliary_loss_mlp": 0.01271857, + "balance_loss_clip": 0.06285448, + "balance_loss_mlp": 0.01256479, + "epoch": 0.3348865173605892, + "flos": 16107820214400.0, + "grad_norm": 1.931700529164995, + "language_loss": 0.90211284, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97957081, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15380859, + "step": 5570, + "time_per_iteration": 2.6067447662353516 + }, + { + "auxiliary_loss_clip": 0.06465288, + "auxiliary_loss_mlp": 0.01272678, + "balance_loss_clip": 0.06282274, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3349466406132572, + "flos": 25015836752640.0, + "grad_norm": 1.5216158426421846, + "language_loss": 0.79890078, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87628049, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15405273, + "step": 5571, + "time_per_iteration": 2.5423781871795654 + }, + { + "auxiliary_loss_clip": 0.06342317, + "auxiliary_loss_mlp": 0.01254883, + "balance_loss_clip": 0.06257176, + "balance_loss_mlp": 0.01251411, + "epoch": 0.33500676386592515, + "flos": 54340058413440.0, + "grad_norm": 0.8278358272998855, + "language_loss": 0.55695772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.63292974, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.03482056, + "step": 5572, + "time_per_iteration": 3.1027615070343018 + }, + { + "auxiliary_loss_clip": 0.06472763, + "auxiliary_loss_mlp": 0.0127696, + "balance_loss_clip": 0.06287524, + "balance_loss_mlp": 0.01261677, + "epoch": 0.3350668871185931, + "flos": 26804620915200.0, + "grad_norm": 1.9863197052332227, + "language_loss": 0.78856999, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.86606717, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15283203, + "step": 5573, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.06473139, + "auxiliary_loss_mlp": 0.01274748, + "balance_loss_clip": 0.06286675, + "balance_loss_mlp": 0.01257999, + "epoch": 0.3351270103712611, + "flos": 33518885708160.0, + "grad_norm": 2.2174625445936256, + "language_loss": 0.72959399, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.80707288, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16723633, + "step": 5574, + "time_per_iteration": 2.660301923751831 + }, + { + "auxiliary_loss_clip": 0.06465638, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284496, + "balance_loss_mlp": 0.01256042, + "epoch": 0.33518713362392905, + "flos": 26513613285120.0, + "grad_norm": 1.6818935039401424, + "language_loss": 0.88364851, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.96102208, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15661621, + "step": 5575, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.0648465, + "auxiliary_loss_mlp": 0.01276363, + "balance_loss_clip": 0.06290504, + "balance_loss_mlp": 0.01259316, + "epoch": 0.335247256876597, + "flos": 17237078490240.0, + "grad_norm": 1.9893319880263207, + "language_loss": 0.83043218, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.90804225, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17053223, + "step": 5576, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.06478332, + "auxiliary_loss_mlp": 0.01275534, + "balance_loss_clip": 0.06288211, + "balance_loss_mlp": 0.01259095, + "epoch": 0.335307380129265, + "flos": 25636397690880.0, + "grad_norm": 2.0001339744496622, + "language_loss": 0.73279572, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.16442871, + "step": 5577, + "time_per_iteration": 2.575026750564575 + }, + { + "auxiliary_loss_clip": 0.06475031, + "auxiliary_loss_mlp": 0.01274987, + "balance_loss_clip": 0.0628825, + "balance_loss_mlp": 0.01257689, + "epoch": 0.33536750338193294, + "flos": 19685765882880.0, + "grad_norm": 1.6019428598408136, + "language_loss": 0.82233781, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.89983797, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17297363, + "step": 5578, + "time_per_iteration": 2.544978380203247 + }, + { + "auxiliary_loss_clip": 0.06461956, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01262907, + "epoch": 0.3354276266346009, + "flos": 18338482483200.0, + "grad_norm": 1.788420802177993, + "language_loss": 0.72050315, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.79790771, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15582275, + "step": 5579, + "time_per_iteration": 2.50080943107605 + }, + { + "auxiliary_loss_clip": 0.06478497, + "auxiliary_loss_mlp": 0.01282646, + "balance_loss_clip": 0.06290549, + "balance_loss_mlp": 0.01266898, + "epoch": 0.3354877498872689, + "flos": 17864389681920.0, + "grad_norm": 2.052679713623706, + "language_loss": 0.81401342, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.89162487, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15734863, + "step": 5580, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.06473458, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 0.06284851, + "balance_loss_mlp": 0.01263691, + "epoch": 0.3355478731399369, + "flos": 24724703341440.0, + "grad_norm": 1.6024353673136869, + "language_loss": 0.78190315, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.85943961, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.16485596, + "step": 5581, + "time_per_iteration": 2.539208173751831 + }, + { + "auxiliary_loss_clip": 0.06482114, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06289735, + "balance_loss_mlp": 0.01259084, + "epoch": 0.33560799639260486, + "flos": 16879628972160.0, + "grad_norm": 2.359779356701633, + "language_loss": 0.74923486, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.8268224, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.17565918, + "step": 5582, + "time_per_iteration": 2.5489563941955566 + }, + { + "auxiliary_loss_clip": 0.06478906, + "auxiliary_loss_mlp": 0.01276582, + "balance_loss_clip": 0.06287926, + "balance_loss_mlp": 0.01260191, + "epoch": 0.3356681196452728, + "flos": 18339530659200.0, + "grad_norm": 1.5985505462491367, + "language_loss": 0.82591236, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90346718, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.16369629, + "step": 5583, + "time_per_iteration": 2.4985439777374268 + }, + { + "auxiliary_loss_clip": 0.06466989, + "auxiliary_loss_mlp": 0.01276424, + "balance_loss_clip": 0.06282677, + "balance_loss_mlp": 0.01260664, + "epoch": 0.3357282428979408, + "flos": 34759127116800.0, + "grad_norm": 1.8261350586664176, + "language_loss": 0.77844834, + "learning_rate": 3.097034711451581e-06, + "loss": 0.85588253, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15771484, + "step": 5584, + "time_per_iteration": 2.649090051651001 + }, + { + "auxiliary_loss_clip": 0.06475179, + "auxiliary_loss_mlp": 0.01274752, + "balance_loss_clip": 0.06285385, + "balance_loss_mlp": 0.01259427, + "epoch": 0.33578836615060875, + "flos": 21586539427200.0, + "grad_norm": 1.6814695059799305, + "language_loss": 0.76339197, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84089124, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.15313721, + "step": 5585, + "time_per_iteration": 5.408076763153076 + }, + { + "auxiliary_loss_clip": 0.06463687, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.0126054, + "epoch": 0.3358484894032767, + "flos": 24536377290240.0, + "grad_norm": 1.7085225722674646, + "language_loss": 0.78121984, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.85862964, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16760254, + "step": 5586, + "time_per_iteration": 2.5785536766052246 + }, + { + "auxiliary_loss_clip": 0.06482486, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06290784, + "balance_loss_mlp": 0.01254902, + "epoch": 0.3359086126559447, + "flos": 22462161793920.0, + "grad_norm": 1.9607494340110725, + "language_loss": 0.81952178, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.89705908, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.16357422, + "step": 5587, + "time_per_iteration": 3.9456732273101807 + }, + { + "auxiliary_loss_clip": 0.06460288, + "auxiliary_loss_mlp": 0.01274939, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01259692, + "epoch": 0.33596873590861265, + "flos": 16549069415040.0, + "grad_norm": 1.7386991231776667, + "language_loss": 0.67118108, + "learning_rate": 3.095731802118677e-06, + "loss": 0.74853337, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 5588, + "time_per_iteration": 2.6328773498535156 + }, + { + "auxiliary_loss_clip": 0.06471635, + "auxiliary_loss_mlp": 0.01272286, + "balance_loss_clip": 0.0628484, + "balance_loss_mlp": 0.01255215, + "epoch": 0.3360288591612806, + "flos": 31183864778880.0, + "grad_norm": 2.547244730124186, + "language_loss": 0.70319438, + "learning_rate": 3.095405970878919e-06, + "loss": 0.78063357, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17077637, + "step": 5589, + "time_per_iteration": 2.631972074508667 + }, + { + "auxiliary_loss_clip": 0.06473772, + "auxiliary_loss_mlp": 0.01270331, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3360889824139486, + "flos": 23703828721920.0, + "grad_norm": 1.7722032929069027, + "language_loss": 0.67818141, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.75562239, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15258789, + "step": 5590, + "time_per_iteration": 2.582160711288452 + }, + { + "auxiliary_loss_clip": 0.0646477, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 0.06283349, + "balance_loss_mlp": 0.01257972, + "epoch": 0.33614910566661654, + "flos": 19324207514880.0, + "grad_norm": 1.8733623292805037, + "language_loss": 0.73821473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.81559563, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15344238, + "step": 5591, + "time_per_iteration": 2.5325355529785156 + }, + { + "auxiliary_loss_clip": 0.06462986, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06280106, + "balance_loss_mlp": 0.01254945, + "epoch": 0.3362092289192845, + "flos": 16477889771520.0, + "grad_norm": 3.0838875929044036, + "language_loss": 0.70195794, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.77929366, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15637207, + "step": 5592, + "time_per_iteration": 3.919609546661377 + }, + { + "auxiliary_loss_clip": 0.06466913, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06283789, + "balance_loss_mlp": 0.01257014, + "epoch": 0.33626935217195253, + "flos": 24250484759040.0, + "grad_norm": 2.017741256836838, + "language_loss": 0.76621854, + "learning_rate": 3.094102230664423e-06, + "loss": 0.8436048, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14697266, + "step": 5593, + "time_per_iteration": 2.582902431488037 + }, + { + "auxiliary_loss_clip": 0.06476289, + "auxiliary_loss_mlp": 0.01272909, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3363294754246205, + "flos": 19724814685440.0, + "grad_norm": 3.212319882003512, + "language_loss": 0.72710228, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80459422, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.17456055, + "step": 5594, + "time_per_iteration": 2.495196580886841 + }, + { + "auxiliary_loss_clip": 0.06477273, + "auxiliary_loss_mlp": 0.01272377, + "balance_loss_clip": 0.06289684, + "balance_loss_mlp": 0.01256379, + "epoch": 0.33638959867728846, + "flos": 22602005458560.0, + "grad_norm": 1.7565144487218112, + "language_loss": 0.8009572, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.87845373, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16003418, + "step": 5595, + "time_per_iteration": 2.5639891624450684 + }, + { + "auxiliary_loss_clip": 0.06468762, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06285411, + "balance_loss_mlp": 0.01256691, + "epoch": 0.3364497219299564, + "flos": 21000834587520.0, + "grad_norm": 1.6187307873664143, + "language_loss": 0.81718135, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.89458185, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.14587402, + "step": 5596, + "time_per_iteration": 2.579089403152466 + }, + { + "auxiliary_loss_clip": 0.06470582, + "auxiliary_loss_mlp": 0.01270351, + "balance_loss_clip": 0.06285384, + "balance_loss_mlp": 0.01256034, + "epoch": 0.3365098451826244, + "flos": 25235664739200.0, + "grad_norm": 1.5539796133352632, + "language_loss": 0.76225436, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.83966368, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.14318848, + "step": 5597, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.06473622, + "auxiliary_loss_mlp": 0.01271725, + "balance_loss_clip": 0.06290761, + "balance_loss_mlp": 0.01257206, + "epoch": 0.33656996843529235, + "flos": 24578612547840.0, + "grad_norm": 1.67554812607641, + "language_loss": 0.78886169, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86631513, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14520264, + "step": 5598, + "time_per_iteration": 2.54971981048584 + }, + { + "auxiliary_loss_clip": 0.06487022, + "auxiliary_loss_mlp": 0.01275679, + "balance_loss_clip": 0.0629402, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3366300916879603, + "flos": 44101223331840.0, + "grad_norm": 1.966389459711274, + "language_loss": 0.64792764, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.7255547, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.16326904, + "step": 5599, + "time_per_iteration": 2.741544723510742 + }, + { + "auxiliary_loss_clip": 0.06483869, + "auxiliary_loss_mlp": 0.01276046, + "balance_loss_clip": 0.06290758, + "balance_loss_mlp": 0.01259118, + "epoch": 0.3366902149406283, + "flos": 13884746739840.0, + "grad_norm": 2.857086104177812, + "language_loss": 0.82787466, + "learning_rate": 3.091819088459249e-06, + "loss": 0.90547383, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.16906738, + "step": 5600, + "time_per_iteration": 2.4761526584625244 + }, + { + "auxiliary_loss_clip": 0.06480727, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289887, + "balance_loss_mlp": 0.01257255, + "epoch": 0.33675033819329625, + "flos": 16258648763520.0, + "grad_norm": 2.1921833677853853, + "language_loss": 0.83268821, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.91022456, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15649414, + "step": 5601, + "time_per_iteration": 2.5205788612365723 + }, + { + "auxiliary_loss_clip": 0.06469133, + "auxiliary_loss_mlp": 0.01269312, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01255382, + "epoch": 0.3368104614459642, + "flos": 17061498259200.0, + "grad_norm": 1.6270640398275205, + "language_loss": 0.83791035, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91529477, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.1394043, + "step": 5602, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.06479525, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06294133, + "balance_loss_mlp": 0.01258645, + "epoch": 0.3368705846986322, + "flos": 17864473536000.0, + "grad_norm": 2.666791314538914, + "language_loss": 0.69934028, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77687562, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15380859, + "step": 5603, + "time_per_iteration": 2.5512561798095703 + }, + { + "auxiliary_loss_clip": 0.0648806, + "auxiliary_loss_mlp": 0.01271029, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01255376, + "epoch": 0.33693070795130015, + "flos": 22936086887040.0, + "grad_norm": 1.5393691582180518, + "language_loss": 0.83336604, + "learning_rate": 3.090513524656898e-06, + "loss": 0.91095686, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.15661621, + "step": 5604, + "time_per_iteration": 2.542419910430908 + }, + { + "auxiliary_loss_clip": 0.06487563, + "auxiliary_loss_mlp": 0.01271201, + "balance_loss_clip": 0.06296179, + "balance_loss_mlp": 0.01255, + "epoch": 0.3369908312039681, + "flos": 22023889413120.0, + "grad_norm": 1.7290560496085086, + "language_loss": 0.74166059, + "learning_rate": 3.090187030294409e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.1619873, + "step": 5605, + "time_per_iteration": 2.551250696182251 + }, + { + "auxiliary_loss_clip": 0.0648852, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 0.06295876, + "balance_loss_mlp": 0.01253347, + "epoch": 0.33705095445663613, + "flos": 11806799736960.0, + "grad_norm": 2.683910051705504, + "language_loss": 0.84068418, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91825807, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.15515137, + "step": 5606, + "time_per_iteration": 2.4841489791870117 + }, + { + "auxiliary_loss_clip": 0.0647673, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254721, + "epoch": 0.3371110777093041, + "flos": 25053460035840.0, + "grad_norm": 1.669780314791874, + "language_loss": 0.68210214, + "learning_rate": 3.089533917561809e-06, + "loss": 0.7595638, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.14709473, + "step": 5607, + "time_per_iteration": 2.6018009185791016 + }, + { + "auxiliary_loss_clip": 0.0648887, + "auxiliary_loss_mlp": 0.01274582, + "balance_loss_clip": 0.06295381, + "balance_loss_mlp": 0.01258131, + "epoch": 0.33717120096197206, + "flos": 26586386156160.0, + "grad_norm": 1.643709475435958, + "language_loss": 0.71566343, + "learning_rate": 3.089207299216464e-06, + "loss": 0.79329789, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5608, + "time_per_iteration": 2.5980639457702637 + }, + { + "auxiliary_loss_clip": 0.06479236, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 0.06291037, + "balance_loss_mlp": 0.01258712, + "epoch": 0.33723132421464, + "flos": 15163911169920.0, + "grad_norm": 1.8781248289320855, + "language_loss": 0.79662472, + "learning_rate": 3.088880639568269e-06, + "loss": 0.87416643, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16223145, + "step": 5609, + "time_per_iteration": 2.6196935176849365 + }, + { + "auxiliary_loss_clip": 0.06480544, + "auxiliary_loss_mlp": 0.01274048, + "balance_loss_clip": 0.06290779, + "balance_loss_mlp": 0.01256262, + "epoch": 0.337291447467308, + "flos": 23442058967040.0, + "grad_norm": 1.7293742366408622, + "language_loss": 0.83075953, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.90830547, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17785645, + "step": 5610, + "time_per_iteration": 2.53485369682312 + }, + { + "auxiliary_loss_clip": 0.06471263, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06288794, + "balance_loss_mlp": 0.01254097, + "epoch": 0.33735157071997596, + "flos": 17243870670720.0, + "grad_norm": 1.916021570377688, + "language_loss": 0.82657987, + "learning_rate": 3.088227196412879e-06, + "loss": 0.90398765, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1541748, + "step": 5611, + "time_per_iteration": 2.5164084434509277 + }, + { + "auxiliary_loss_clip": 0.06478009, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.0629037, + "balance_loss_mlp": 0.01260005, + "epoch": 0.3374116939726439, + "flos": 28265025726720.0, + "grad_norm": 3.0042840390827106, + "language_loss": 0.79815799, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.87571925, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.18084717, + "step": 5612, + "time_per_iteration": 2.582742929458618 + }, + { + "auxiliary_loss_clip": 0.06476334, + "auxiliary_loss_mlp": 0.0127707, + "balance_loss_clip": 0.06288031, + "balance_loss_mlp": 0.01261597, + "epoch": 0.3374718172253119, + "flos": 35928314663040.0, + "grad_norm": 2.3711016444568003, + "language_loss": 0.69757682, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7751109, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15466309, + "step": 5613, + "time_per_iteration": 2.6553308963775635 + }, + { + "auxiliary_loss_clip": 0.06477948, + "auxiliary_loss_mlp": 0.01274833, + "balance_loss_clip": 0.06288674, + "balance_loss_mlp": 0.01259181, + "epoch": 0.33753194047797985, + "flos": 18192517470720.0, + "grad_norm": 1.7341744507496721, + "language_loss": 0.80043244, + "learning_rate": 3.087246722218144e-06, + "loss": 0.87796032, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15673828, + "step": 5614, + "time_per_iteration": 2.5162055492401123 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01274123, + "balance_loss_clip": 0.06289384, + "balance_loss_mlp": 0.01257684, + "epoch": 0.3375920637306478, + "flos": 23155621384320.0, + "grad_norm": 1.8737965791301845, + "language_loss": 0.91138643, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98892087, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16430664, + "step": 5615, + "time_per_iteration": 2.5491819381713867 + }, + { + "auxiliary_loss_clip": 0.0646698, + "auxiliary_loss_mlp": 0.01277747, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01261857, + "epoch": 0.3376521869833158, + "flos": 23118878568960.0, + "grad_norm": 1.8899714235087088, + "language_loss": 0.81227732, + "learning_rate": 3.086592866591809e-06, + "loss": 0.88972461, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.15881348, + "step": 5616, + "time_per_iteration": 2.551891803741455 + }, + { + "auxiliary_loss_clip": 0.0647929, + "auxiliary_loss_mlp": 0.01281624, + "balance_loss_clip": 0.06285349, + "balance_loss_mlp": 0.01263576, + "epoch": 0.33771231023598375, + "flos": 19279498561920.0, + "grad_norm": 1.7280186066143421, + "language_loss": 0.84097004, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91857922, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.18054199, + "step": 5617, + "time_per_iteration": 2.532703161239624 + }, + { + "auxiliary_loss_clip": 0.06466082, + "auxiliary_loss_mlp": 0.01273548, + "balance_loss_clip": 0.06279126, + "balance_loss_mlp": 0.01257073, + "epoch": 0.3377724334886517, + "flos": 18156026217600.0, + "grad_norm": 1.631465963150073, + "language_loss": 0.80857313, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8859694, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.16467285, + "step": 5618, + "time_per_iteration": 2.5592081546783447 + }, + { + "auxiliary_loss_clip": 0.06473768, + "auxiliary_loss_mlp": 0.01275311, + "balance_loss_clip": 0.06286047, + "balance_loss_mlp": 0.01258514, + "epoch": 0.3378325567413197, + "flos": 25783159317120.0, + "grad_norm": 2.0305417192076267, + "language_loss": 0.71181929, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7893101, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16809082, + "step": 5619, + "time_per_iteration": 2.5726358890533447 + }, + { + "auxiliary_loss_clip": 0.06476114, + "auxiliary_loss_mlp": 0.01271613, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01256688, + "epoch": 0.3378926799939877, + "flos": 21322254049920.0, + "grad_norm": 2.6280659122339496, + "language_loss": 0.70615005, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78362733, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.14929199, + "step": 5620, + "time_per_iteration": 2.604161500930786 + }, + { + "auxiliary_loss_clip": 0.06467394, + "auxiliary_loss_mlp": 0.0127348, + "balance_loss_clip": 0.0628472, + "balance_loss_mlp": 0.01258054, + "epoch": 0.33795280324665566, + "flos": 24906991898880.0, + "grad_norm": 2.3940060195146384, + "language_loss": 0.6847257, + "learning_rate": 3.084957506678058e-06, + "loss": 0.76213443, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1541748, + "step": 5621, + "time_per_iteration": 2.559730052947998 + }, + { + "auxiliary_loss_clip": 0.06469798, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06287812, + "balance_loss_mlp": 0.0125914, + "epoch": 0.33801292649932363, + "flos": 24760859178240.0, + "grad_norm": 1.8671152624425502, + "language_loss": 0.82685888, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.90429658, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.1484375, + "step": 5622, + "time_per_iteration": 2.5722928047180176 + }, + { + "auxiliary_loss_clip": 0.06466316, + "auxiliary_loss_mlp": 0.01274625, + "balance_loss_clip": 0.06284748, + "balance_loss_mlp": 0.01260564, + "epoch": 0.3380730497519916, + "flos": 26731177211520.0, + "grad_norm": 1.4865849557607265, + "language_loss": 0.74114043, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.81854987, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14056396, + "step": 5623, + "time_per_iteration": 2.5830907821655273 + }, + { + "auxiliary_loss_clip": 0.06389539, + "auxiliary_loss_mlp": 0.01273334, + "balance_loss_clip": 0.06299451, + "balance_loss_mlp": 0.01265943, + "epoch": 0.33813317300465956, + "flos": 70056845550720.0, + "grad_norm": 0.7132848624035326, + "language_loss": 0.54856884, + "learning_rate": 3.083975796930215e-06, + "loss": 0.62519753, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.07373047, + "step": 5624, + "time_per_iteration": 4.680114030838013 + }, + { + "auxiliary_loss_clip": 0.06475174, + "auxiliary_loss_mlp": 0.01272775, + "balance_loss_clip": 0.06285602, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3381932962573275, + "flos": 24104142403200.0, + "grad_norm": 3.6042241236842267, + "language_loss": 0.73496938, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81244886, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16687012, + "step": 5625, + "time_per_iteration": 4.002846956253052 + }, + { + "auxiliary_loss_clip": 0.06480759, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06288841, + "balance_loss_mlp": 0.01257021, + "epoch": 0.3382534195099955, + "flos": 19283775120000.0, + "grad_norm": 1.9831743515273117, + "language_loss": 0.7176404, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.79519677, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17858887, + "step": 5626, + "time_per_iteration": 2.4999427795410156 + }, + { + "auxiliary_loss_clip": 0.06468458, + "auxiliary_loss_mlp": 0.01272986, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01257739, + "epoch": 0.33831354276266346, + "flos": 25232897554560.0, + "grad_norm": 2.987617225478933, + "language_loss": 0.81275499, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.8901695, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15246582, + "step": 5627, + "time_per_iteration": 3.951984405517578 + }, + { + "auxiliary_loss_clip": 0.06478465, + "auxiliary_loss_mlp": 0.01272976, + "balance_loss_clip": 0.06288861, + "balance_loss_mlp": 0.0125668, + "epoch": 0.3383736660153314, + "flos": 23118627006720.0, + "grad_norm": 1.844905449272807, + "language_loss": 0.80405974, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.88157415, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16296387, + "step": 5628, + "time_per_iteration": 2.5670697689056396 + }, + { + "auxiliary_loss_clip": 0.06477988, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06288996, + "balance_loss_mlp": 0.0125457, + "epoch": 0.3384337892679994, + "flos": 23483874954240.0, + "grad_norm": 2.662319374226008, + "language_loss": 0.77757806, + "learning_rate": 3.082338792093254e-06, + "loss": 0.85506529, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16174316, + "step": 5629, + "time_per_iteration": 2.5463128089904785 + }, + { + "auxiliary_loss_clip": 0.06482605, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06291752, + "balance_loss_mlp": 0.01262758, + "epoch": 0.33849391252066735, + "flos": 19431626849280.0, + "grad_norm": 1.826421419331283, + "language_loss": 0.85789764, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.9355278, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.17663574, + "step": 5630, + "time_per_iteration": 2.5818262100219727 + }, + { + "auxiliary_loss_clip": 0.06476109, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06290477, + "balance_loss_mlp": 0.01260073, + "epoch": 0.3385540357733353, + "flos": 21070462930560.0, + "grad_norm": 2.179516256809373, + "language_loss": 0.72520673, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80271661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.14807129, + "step": 5631, + "time_per_iteration": 3.9340498447418213 + }, + { + "auxiliary_loss_clip": 0.06388511, + "auxiliary_loss_mlp": 0.01280567, + "balance_loss_clip": 0.06298131, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3386141590260033, + "flos": 69224772908160.0, + "grad_norm": 0.8339652565495183, + "language_loss": 0.56105018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.63774097, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.08361816, + "step": 5632, + "time_per_iteration": 3.215395450592041 + }, + { + "auxiliary_loss_clip": 0.06477562, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 0.06290288, + "balance_loss_mlp": 0.01256573, + "epoch": 0.3386742822786713, + "flos": 25526420807040.0, + "grad_norm": 3.459768837753136, + "language_loss": 0.81030583, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.88779831, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15112305, + "step": 5633, + "time_per_iteration": 2.6278936862945557 + }, + { + "auxiliary_loss_clip": 0.06473435, + "auxiliary_loss_mlp": 0.01274796, + "balance_loss_clip": 0.06287597, + "balance_loss_mlp": 0.01258942, + "epoch": 0.33873440553133927, + "flos": 23629881893760.0, + "grad_norm": 2.634738846372382, + "language_loss": 0.59410667, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.67158902, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5634, + "time_per_iteration": 2.565622091293335 + }, + { + "auxiliary_loss_clip": 0.06475686, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.01255216, + "epoch": 0.33879452878400723, + "flos": 17094006443520.0, + "grad_norm": 1.81394172090833, + "language_loss": 0.92877531, + "learning_rate": 3.080373032026589e-06, + "loss": 1.00623596, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15161133, + "step": 5635, + "time_per_iteration": 2.539051055908203 + }, + { + "auxiliary_loss_clip": 0.06470082, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 0.0629005, + "balance_loss_mlp": 0.01257457, + "epoch": 0.3388546520366752, + "flos": 15747477730560.0, + "grad_norm": 1.8703432540182672, + "language_loss": 0.75823128, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.83566296, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15618896, + "step": 5636, + "time_per_iteration": 2.4998726844787598 + }, + { + "auxiliary_loss_clip": 0.064714, + "auxiliary_loss_mlp": 0.01275037, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.01258848, + "epoch": 0.33891477528934316, + "flos": 22425251270400.0, + "grad_norm": 1.6981405891584176, + "language_loss": 0.83775222, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91521657, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1619873, + "step": 5637, + "time_per_iteration": 2.551074981689453 + }, + { + "auxiliary_loss_clip": 0.06474115, + "auxiliary_loss_mlp": 0.01272331, + "balance_loss_clip": 0.06286962, + "balance_loss_mlp": 0.01254736, + "epoch": 0.3389748985420111, + "flos": 17280571559040.0, + "grad_norm": 1.787045955061502, + "language_loss": 0.70609659, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78356105, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.17590332, + "step": 5638, + "time_per_iteration": 2.5479955673217773 + }, + { + "auxiliary_loss_clip": 0.06478329, + "auxiliary_loss_mlp": 0.01289332, + "balance_loss_clip": 0.06293231, + "balance_loss_mlp": 0.01272404, + "epoch": 0.3390350217946791, + "flos": 27752261466240.0, + "grad_norm": 1.7018866339003167, + "language_loss": 0.81276166, + "learning_rate": 3.079061705792765e-06, + "loss": 0.89043832, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16931152, + "step": 5639, + "time_per_iteration": 2.614819288253784 + }, + { + "auxiliary_loss_clip": 0.06487049, + "auxiliary_loss_mlp": 0.01288743, + "balance_loss_clip": 0.06296147, + "balance_loss_mlp": 0.01270635, + "epoch": 0.33909514504734706, + "flos": 20346088383360.0, + "grad_norm": 6.449374256721531, + "language_loss": 0.68149316, + "learning_rate": 3.078733771907907e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.18103027, + "step": 5640, + "time_per_iteration": 2.496300220489502 + }, + { + "auxiliary_loss_clip": 0.06471096, + "auxiliary_loss_mlp": 0.01277542, + "balance_loss_clip": 0.06286727, + "balance_loss_mlp": 0.0125978, + "epoch": 0.339155268300015, + "flos": 14835322183680.0, + "grad_norm": 1.7549267997867504, + "language_loss": 0.70165765, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77914405, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.1776123, + "step": 5641, + "time_per_iteration": 2.524548053741455 + }, + { + "auxiliary_loss_clip": 0.0647646, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01264611, + "epoch": 0.339215391552683, + "flos": 26075173196160.0, + "grad_norm": 2.2643311920206592, + "language_loss": 0.88204467, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95961982, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16430664, + "step": 5642, + "time_per_iteration": 2.551790237426758 + }, + { + "auxiliary_loss_clip": 0.06466684, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06289211, + "balance_loss_mlp": 0.01258195, + "epoch": 0.33927551480535095, + "flos": 14579967265920.0, + "grad_norm": 2.023061860440481, + "language_loss": 0.84285331, + "learning_rate": 3.077749724868924e-06, + "loss": 0.92024505, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1428833, + "step": 5643, + "time_per_iteration": 2.542921304702759 + }, + { + "auxiliary_loss_clip": 0.06468654, + "auxiliary_loss_mlp": 0.01272873, + "balance_loss_clip": 0.06285787, + "balance_loss_mlp": 0.01256708, + "epoch": 0.3393356380580189, + "flos": 23812380086400.0, + "grad_norm": 6.736940029896959, + "language_loss": 0.77634799, + "learning_rate": 3.077421627435922e-06, + "loss": 0.85376322, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.16162109, + "step": 5644, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.06472027, + "auxiliary_loss_mlp": 0.01274584, + "balance_loss_clip": 0.06288091, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3393957613106869, + "flos": 17353637919360.0, + "grad_norm": 2.9654561398927752, + "language_loss": 0.6324017, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.70986784, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15856934, + "step": 5645, + "time_per_iteration": 2.51273775100708 + }, + { + "auxiliary_loss_clip": 0.06466414, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256284, + "epoch": 0.3394558845633549, + "flos": 28440647884800.0, + "grad_norm": 2.089100449350665, + "language_loss": 0.77295536, + "learning_rate": 3.076765310014552e-06, + "loss": 0.8503449, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16247559, + "step": 5646, + "time_per_iteration": 2.5461859703063965 + }, + { + "auxiliary_loss_clip": 0.06477356, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01257568, + "epoch": 0.33951600781602287, + "flos": 22092804996480.0, + "grad_norm": 2.533529984962848, + "language_loss": 0.79702288, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87454283, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17077637, + "step": 5647, + "time_per_iteration": 2.5699684619903564 + }, + { + "auxiliary_loss_clip": 0.0647471, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06288452, + "balance_loss_mlp": 0.01256067, + "epoch": 0.33957613106869083, + "flos": 23885027176320.0, + "grad_norm": 2.1454269075726535, + "language_loss": 0.78001738, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.85749137, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16625977, + "step": 5648, + "time_per_iteration": 2.5294926166534424 + }, + { + "auxiliary_loss_clip": 0.063921, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0630298, + "balance_loss_mlp": 0.01254759, + "epoch": 0.3396362543213588, + "flos": 71264411066880.0, + "grad_norm": 0.7604552176896413, + "language_loss": 0.56109136, + "learning_rate": 3.075780527680754e-06, + "loss": 0.63763207, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.07196045, + "step": 5649, + "time_per_iteration": 3.2003703117370605 + }, + { + "auxiliary_loss_clip": 0.06473398, + "auxiliary_loss_mlp": 0.01280094, + "balance_loss_clip": 0.06287606, + "balance_loss_mlp": 0.01263274, + "epoch": 0.33969637757402676, + "flos": 25928746986240.0, + "grad_norm": 1.4812234353432667, + "language_loss": 0.85783911, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.93537402, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.16821289, + "step": 5650, + "time_per_iteration": 2.551633834838867 + }, + { + "auxiliary_loss_clip": 0.06475022, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06292272, + "balance_loss_mlp": 0.01261841, + "epoch": 0.33975650082669473, + "flos": 35270382003840.0, + "grad_norm": 3.382903843955623, + "language_loss": 0.71404934, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.79157567, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15759277, + "step": 5651, + "time_per_iteration": 2.665083885192871 + }, + { + "auxiliary_loss_clip": 0.06471914, + "auxiliary_loss_mlp": 0.01278706, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261922, + "epoch": 0.3398166240793627, + "flos": 16651373650560.0, + "grad_norm": 4.478617872089092, + "language_loss": 0.81850624, + "learning_rate": 3.074795378203616e-06, + "loss": 0.89601243, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16772461, + "step": 5652, + "time_per_iteration": 2.5136160850524902 + }, + { + "auxiliary_loss_clip": 0.06483054, + "auxiliary_loss_mlp": 0.01281024, + "balance_loss_clip": 0.06293614, + "balance_loss_mlp": 0.0126344, + "epoch": 0.33987674733203066, + "flos": 24069244377600.0, + "grad_norm": 3.0225456344203088, + "language_loss": 0.77707815, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.85471892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.17590332, + "step": 5653, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.06478614, + "auxiliary_loss_mlp": 0.01275428, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01259788, + "epoch": 0.3399368705846986, + "flos": 13253955603840.0, + "grad_norm": 4.6454995512067745, + "language_loss": 0.86809218, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.94563264, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15625, + "step": 5654, + "time_per_iteration": 2.4661965370178223 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01267584, + "epoch": 0.3399969938373666, + "flos": 27019585365120.0, + "grad_norm": 2.782601809339298, + "language_loss": 0.65974486, + "learning_rate": 3.073809861919351e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16369629, + "step": 5655, + "time_per_iteration": 2.555647611618042 + }, + { + "auxiliary_loss_clip": 0.06478781, + "auxiliary_loss_mlp": 0.01275484, + "balance_loss_clip": 0.06293027, + "balance_loss_mlp": 0.01259558, + "epoch": 0.34005711709003456, + "flos": 28557920073600.0, + "grad_norm": 1.4106761603755547, + "language_loss": 0.76612461, + "learning_rate": 3.073481275036697e-06, + "loss": 0.84366733, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15917969, + "step": 5656, + "time_per_iteration": 2.644866466522217 + }, + { + "auxiliary_loss_clip": 0.06484362, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06293096, + "balance_loss_mlp": 0.01260436, + "epoch": 0.3401172403427025, + "flos": 21623533804800.0, + "grad_norm": 1.950261924987131, + "language_loss": 0.83422613, + "learning_rate": 3.073152647447525e-06, + "loss": 0.9118408, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16674805, + "step": 5657, + "time_per_iteration": 2.701688051223755 + }, + { + "auxiliary_loss_clip": 0.06477939, + "auxiliary_loss_mlp": 0.01276671, + "balance_loss_clip": 0.06292981, + "balance_loss_mlp": 0.01259851, + "epoch": 0.3401773635953705, + "flos": 25893010419840.0, + "grad_norm": 5.064784702806917, + "language_loss": 0.86277437, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.94032043, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.16833496, + "step": 5658, + "time_per_iteration": 2.622107744216919 + }, + { + "auxiliary_loss_clip": 0.06400045, + "auxiliary_loss_mlp": 0.01275632, + "balance_loss_clip": 0.06310016, + "balance_loss_mlp": 0.01268671, + "epoch": 0.3402374868480385, + "flos": 65527737459840.0, + "grad_norm": 0.8082747939523138, + "language_loss": 0.59960568, + "learning_rate": 3.072495270199477e-06, + "loss": 0.67636251, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.06970215, + "step": 5659, + "time_per_iteration": 3.1002566814422607 + }, + { + "auxiliary_loss_clip": 0.0647618, + "auxiliary_loss_mlp": 0.01281423, + "balance_loss_clip": 0.06294397, + "balance_loss_mlp": 0.01264591, + "epoch": 0.34029761010070647, + "flos": 24067357660800.0, + "grad_norm": 2.7764582815625514, + "language_loss": 0.68693221, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76450825, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16821289, + "step": 5660, + "time_per_iteration": 2.620135545730591 + }, + { + "auxiliary_loss_clip": 0.06473149, + "auxiliary_loss_mlp": 0.01278369, + "balance_loss_clip": 0.06289428, + "balance_loss_mlp": 0.01262157, + "epoch": 0.34035773335337444, + "flos": 27607093067520.0, + "grad_norm": 2.0682817387265477, + "language_loss": 0.6727913, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16210938, + "step": 5661, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06469939, + "auxiliary_loss_mlp": 0.01280149, + "balance_loss_clip": 0.06289508, + "balance_loss_mlp": 0.01264175, + "epoch": 0.3404178566060424, + "flos": 20818923373440.0, + "grad_norm": 1.802665197928241, + "language_loss": 0.79380333, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87130427, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15966797, + "step": 5662, + "time_per_iteration": 2.552755832672119 + }, + { + "auxiliary_loss_clip": 0.06474, + "auxiliary_loss_mlp": 0.01278156, + "balance_loss_clip": 0.06290844, + "balance_loss_mlp": 0.01260454, + "epoch": 0.34047797985871037, + "flos": 26840818679040.0, + "grad_norm": 2.1558050020889894, + "language_loss": 0.73809367, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.8156153, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.17700195, + "step": 5663, + "time_per_iteration": 2.5490622520446777 + }, + { + "auxiliary_loss_clip": 0.06470126, + "auxiliary_loss_mlp": 0.01281986, + "balance_loss_clip": 0.06290488, + "balance_loss_mlp": 0.01265714, + "epoch": 0.34053810311137833, + "flos": 19688742702720.0, + "grad_norm": 1.852400144955729, + "language_loss": 0.86839676, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.94591784, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16259766, + "step": 5664, + "time_per_iteration": 5.419060707092285 + }, + { + "auxiliary_loss_clip": 0.06483276, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06295361, + "balance_loss_mlp": 0.01257423, + "epoch": 0.3405982263640463, + "flos": 21732169023360.0, + "grad_norm": 1.8640809787797845, + "language_loss": 0.69509971, + "learning_rate": 3.070522162795235e-06, + "loss": 0.77267611, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16943359, + "step": 5665, + "time_per_iteration": 2.547194719314575 + }, + { + "auxiliary_loss_clip": 0.06482168, + "auxiliary_loss_mlp": 0.01274659, + "balance_loss_clip": 0.0629427, + "balance_loss_mlp": 0.01257648, + "epoch": 0.34065834961671426, + "flos": 18047600634240.0, + "grad_norm": 2.6257214905883237, + "language_loss": 0.73526829, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.81283653, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.17016602, + "step": 5666, + "time_per_iteration": 2.527994155883789 + }, + { + "auxiliary_loss_clip": 0.06482688, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255373, + "epoch": 0.3407184728693822, + "flos": 21403705818240.0, + "grad_norm": 1.661941695135435, + "language_loss": 0.74005675, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.81760579, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.1685791, + "step": 5667, + "time_per_iteration": 4.029574155807495 + }, + { + "auxiliary_loss_clip": 0.06378959, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06290369, + "balance_loss_mlp": 0.01260898, + "epoch": 0.3407785961220502, + "flos": 68709352515840.0, + "grad_norm": 0.8062084259911544, + "language_loss": 0.63318539, + "learning_rate": 3.069535060901597e-06, + "loss": 0.70965815, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.07397461, + "step": 5668, + "time_per_iteration": 3.3641560077667236 + }, + { + "auxiliary_loss_clip": 0.06472414, + "auxiliary_loss_mlp": 0.01272754, + "balance_loss_clip": 0.0628752, + "balance_loss_mlp": 0.01256863, + "epoch": 0.34083871937471816, + "flos": 14069634773760.0, + "grad_norm": 2.007810831329869, + "language_loss": 0.73127198, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.80872369, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15893555, + "step": 5669, + "time_per_iteration": 2.4918038845062256 + }, + { + "auxiliary_loss_clip": 0.06479842, + "auxiliary_loss_mlp": 0.0127954, + "balance_loss_clip": 0.06292197, + "balance_loss_mlp": 0.01263423, + "epoch": 0.3408988426273861, + "flos": 17089981447680.0, + "grad_norm": 2.0642744441347287, + "language_loss": 0.80626565, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.88385952, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5670, + "time_per_iteration": 2.5270040035247803 + }, + { + "auxiliary_loss_clip": 0.06481062, + "auxiliary_loss_mlp": 0.01275164, + "balance_loss_clip": 0.06291522, + "balance_loss_mlp": 0.0125838, + "epoch": 0.3409589658800541, + "flos": 24031411459200.0, + "grad_norm": 1.863009265742361, + "language_loss": 0.77916187, + "learning_rate": 3.068547593996078e-06, + "loss": 0.85672414, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16784668, + "step": 5671, + "time_per_iteration": 4.039815664291382 + }, + { + "auxiliary_loss_clip": 0.06473973, + "auxiliary_loss_mlp": 0.01276984, + "balance_loss_clip": 0.06289308, + "balance_loss_mlp": 0.01260712, + "epoch": 0.34101908913272205, + "flos": 21148350900480.0, + "grad_norm": 1.9142883162018633, + "language_loss": 0.74626315, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.82377267, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16259766, + "step": 5672, + "time_per_iteration": 2.564887762069702 + }, + { + "auxiliary_loss_clip": 0.06475951, + "auxiliary_loss_mlp": 0.01275656, + "balance_loss_clip": 0.06287946, + "balance_loss_mlp": 0.01259265, + "epoch": 0.3410792123853901, + "flos": 15706835700480.0, + "grad_norm": 1.714309741158987, + "language_loss": 0.73791027, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81542635, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16394043, + "step": 5673, + "time_per_iteration": 2.540194511413574 + }, + { + "auxiliary_loss_clip": 0.06466323, + "auxiliary_loss_mlp": 0.01283225, + "balance_loss_clip": 0.06284231, + "balance_loss_mlp": 0.01266584, + "epoch": 0.34113933563805804, + "flos": 23042122629120.0, + "grad_norm": 1.8379615104267257, + "language_loss": 0.7978701, + "learning_rate": 3.067559762415682e-06, + "loss": 0.87536556, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16638184, + "step": 5674, + "time_per_iteration": 2.5462148189544678 + }, + { + "auxiliary_loss_clip": 0.06364837, + "auxiliary_loss_mlp": 0.01262017, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255442, + "epoch": 0.341199458890726, + "flos": 69631878769920.0, + "grad_norm": 0.7752872762952348, + "language_loss": 0.56147063, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.63773918, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.06585693, + "step": 5675, + "time_per_iteration": 3.370281457901001 + }, + { + "auxiliary_loss_clip": 0.0645988, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06281768, + "balance_loss_mlp": 0.01257398, + "epoch": 0.34125958214339397, + "flos": 22352939596800.0, + "grad_norm": 2.600205708544321, + "language_loss": 0.79689062, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.87422335, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16003418, + "step": 5676, + "time_per_iteration": 2.5312321186065674 + }, + { + "auxiliary_loss_clip": 0.06470488, + "auxiliary_loss_mlp": 0.01271752, + "balance_loss_clip": 0.06286064, + "balance_loss_mlp": 0.01255051, + "epoch": 0.34131970539606193, + "flos": 21878427525120.0, + "grad_norm": 2.203551534393157, + "language_loss": 0.8601976, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.93761992, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.16699219, + "step": 5677, + "time_per_iteration": 2.555037260055542 + }, + { + "auxiliary_loss_clip": 0.06463757, + "auxiliary_loss_mlp": 0.01274207, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3413798286487299, + "flos": 24942560757120.0, + "grad_norm": 2.786164717546535, + "language_loss": 0.80252033, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87989998, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16955566, + "step": 5678, + "time_per_iteration": 2.6321489810943604 + }, + { + "auxiliary_loss_clip": 0.06467854, + "auxiliary_loss_mlp": 0.01270663, + "balance_loss_clip": 0.06282793, + "balance_loss_mlp": 0.01255404, + "epoch": 0.34143995190139786, + "flos": 25381420116480.0, + "grad_norm": 1.8772848902338297, + "language_loss": 0.75927806, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.83666325, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15246582, + "step": 5679, + "time_per_iteration": 2.5981781482696533 + }, + { + "auxiliary_loss_clip": 0.06365222, + "auxiliary_loss_mlp": 0.01260685, + "balance_loss_clip": 0.06278291, + "balance_loss_mlp": 0.01253538, + "epoch": 0.34150007515406583, + "flos": 67804785763200.0, + "grad_norm": 0.7019635675964923, + "language_loss": 0.59521842, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.67147756, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.0713501, + "step": 5680, + "time_per_iteration": 3.2768852710723877 + }, + { + "auxiliary_loss_clip": 0.06464119, + "auxiliary_loss_mlp": 0.01271493, + "balance_loss_clip": 0.06282759, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3415601984067338, + "flos": 20308548954240.0, + "grad_norm": 1.756785442101194, + "language_loss": 0.72804415, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80540025, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15881348, + "step": 5681, + "time_per_iteration": 2.540839195251465 + }, + { + "auxiliary_loss_clip": 0.06462204, + "auxiliary_loss_mlp": 0.01272244, + "balance_loss_clip": 0.06283034, + "balance_loss_mlp": 0.01256806, + "epoch": 0.34162032165940176, + "flos": 26038346526720.0, + "grad_norm": 5.204332383129175, + "language_loss": 0.71220171, + "learning_rate": 3.064923764577233e-06, + "loss": 0.78954625, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15454102, + "step": 5682, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.06466864, + "auxiliary_loss_mlp": 0.0127503, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01258711, + "epoch": 0.3416804449120697, + "flos": 28810843223040.0, + "grad_norm": 1.4703350638010875, + "language_loss": 0.83879244, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.91621137, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.16320801, + "step": 5683, + "time_per_iteration": 2.595921277999878 + }, + { + "auxiliary_loss_clip": 0.06468399, + "auxiliary_loss_mlp": 0.01274924, + "balance_loss_clip": 0.06284815, + "balance_loss_mlp": 0.01258354, + "epoch": 0.3417405681647377, + "flos": 22608210660480.0, + "grad_norm": 1.8188343464074745, + "language_loss": 0.71334541, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.79077864, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 5684, + "time_per_iteration": 2.5821194648742676 + }, + { + "auxiliary_loss_clip": 0.06462076, + "auxiliary_loss_mlp": 0.01268234, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01253816, + "epoch": 0.34180069141740566, + "flos": 24722942405760.0, + "grad_norm": 1.4943065575919134, + "language_loss": 0.75352108, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.8308242, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.144104, + "step": 5685, + "time_per_iteration": 2.545419216156006 + }, + { + "auxiliary_loss_clip": 0.06457227, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06281762, + "balance_loss_mlp": 0.0125501, + "epoch": 0.3418608146700737, + "flos": 30526644879360.0, + "grad_norm": 1.8907916568784255, + "language_loss": 0.70833004, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7856074, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.1550293, + "step": 5686, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.06467415, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06284459, + "balance_loss_mlp": 0.01253407, + "epoch": 0.34192093792274164, + "flos": 15127755333120.0, + "grad_norm": 2.1973050683231303, + "language_loss": 0.77864039, + "learning_rate": 3.06327495310661e-06, + "loss": 0.85600907, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.16052246, + "step": 5687, + "time_per_iteration": 2.501957654953003 + }, + { + "auxiliary_loss_clip": 0.06462508, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01257435, + "epoch": 0.3419810611754096, + "flos": 13192754595840.0, + "grad_norm": 1.8198375176693335, + "language_loss": 0.87159389, + "learning_rate": 3.062945069803981e-06, + "loss": 0.94895893, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.16552734, + "step": 5688, + "time_per_iteration": 2.514558792114258 + }, + { + "auxiliary_loss_clip": 0.06470017, + "auxiliary_loss_mlp": 0.01272882, + "balance_loss_clip": 0.06283651, + "balance_loss_mlp": 0.01255025, + "epoch": 0.34204118442807757, + "flos": 19542274565760.0, + "grad_norm": 1.9150705307332732, + "language_loss": 0.80177575, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87920475, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.17858887, + "step": 5689, + "time_per_iteration": 2.4941842555999756 + }, + { + "auxiliary_loss_clip": 0.06471369, + "auxiliary_loss_mlp": 0.01270545, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01254476, + "epoch": 0.34210130768074554, + "flos": 15200192787840.0, + "grad_norm": 1.8413075326603192, + "language_loss": 0.74004579, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81746483, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.1607666, + "step": 5690, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.06470567, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06288044, + "balance_loss_mlp": 0.01254854, + "epoch": 0.3421614309334135, + "flos": 24943147735680.0, + "grad_norm": 2.8439157619722666, + "language_loss": 0.76563686, + "learning_rate": 3.061955178104237e-06, + "loss": 0.84305, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15893555, + "step": 5691, + "time_per_iteration": 2.5346477031707764 + }, + { + "auxiliary_loss_clip": 0.06465675, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06286939, + "balance_loss_mlp": 0.01254395, + "epoch": 0.34222155418608147, + "flos": 21915170340480.0, + "grad_norm": 1.7269103068173344, + "language_loss": 0.6888957, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1472168, + "step": 5692, + "time_per_iteration": 2.544475793838501 + }, + { + "auxiliary_loss_clip": 0.06469652, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06286649, + "balance_loss_mlp": 0.01259069, + "epoch": 0.34228167743874943, + "flos": 18119954234880.0, + "grad_norm": 2.5543870280075494, + "language_loss": 0.72691154, + "learning_rate": 3.06129504893632e-06, + "loss": 0.80436993, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.17126465, + "step": 5693, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.06469734, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06291726, + "balance_loss_mlp": 0.01253049, + "epoch": 0.3423418006914174, + "flos": 21295070599680.0, + "grad_norm": 1.6526919771326485, + "language_loss": 0.76433146, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.84170949, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15008545, + "step": 5694, + "time_per_iteration": 2.5759999752044678 + }, + { + "auxiliary_loss_clip": 0.06469683, + "auxiliary_loss_mlp": 0.01269733, + "balance_loss_clip": 0.06292015, + "balance_loss_mlp": 0.01254498, + "epoch": 0.34240192394408536, + "flos": 19828754075520.0, + "grad_norm": 1.7073290043069882, + "language_loss": 0.80359411, + "learning_rate": 3.060634758790747e-06, + "loss": 0.88098824, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15222168, + "step": 5695, + "time_per_iteration": 2.53019118309021 + }, + { + "auxiliary_loss_clip": 0.06473886, + "auxiliary_loss_mlp": 0.01274215, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01257335, + "epoch": 0.3424620471967533, + "flos": 24542498638080.0, + "grad_norm": 2.150928833794339, + "language_loss": 0.74189723, + "learning_rate": 3.060304553382635e-06, + "loss": 0.81937826, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16882324, + "step": 5696, + "time_per_iteration": 2.6046504974365234 + }, + { + "auxiliary_loss_clip": 0.06472932, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.062935, + "balance_loss_mlp": 0.0125786, + "epoch": 0.3425221704494213, + "flos": 25856057969280.0, + "grad_norm": 1.9268953245740004, + "language_loss": 0.71419311, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.79166162, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.16052246, + "step": 5697, + "time_per_iteration": 2.565295696258545 + }, + { + "auxiliary_loss_clip": 0.06469944, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06292768, + "balance_loss_mlp": 0.01254293, + "epoch": 0.34258229370208926, + "flos": 21546442448640.0, + "grad_norm": 1.77565898086167, + "language_loss": 0.82456839, + "learning_rate": 3.05964402195837e-06, + "loss": 0.90196872, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15795898, + "step": 5698, + "time_per_iteration": 2.636547327041626 + }, + { + "auxiliary_loss_clip": 0.06476933, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06293021, + "balance_loss_mlp": 0.01260573, + "epoch": 0.3426424169547573, + "flos": 23658407009280.0, + "grad_norm": 1.9460205950694964, + "language_loss": 0.69722092, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.77476966, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.17358398, + "step": 5699, + "time_per_iteration": 2.523766040802002 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.0127405, + "balance_loss_clip": 0.06289239, + "balance_loss_mlp": 0.01257719, + "epoch": 0.34270254020742524, + "flos": 24651846616320.0, + "grad_norm": 2.105384484263751, + "language_loss": 0.72511256, + "learning_rate": 3.058983329806877e-06, + "loss": 0.80255234, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 5700, + "time_per_iteration": 2.57511568069458 + }, + { + "auxiliary_loss_clip": 0.06467311, + "auxiliary_loss_mlp": 0.01271093, + "balance_loss_clip": 0.06288276, + "balance_loss_mlp": 0.01254273, + "epoch": 0.3427626634600932, + "flos": 21003182501760.0, + "grad_norm": 2.114283139984186, + "language_loss": 0.82378924, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.90117323, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.16821289, + "step": 5701, + "time_per_iteration": 2.496392250061035 + }, + { + "auxiliary_loss_clip": 0.06469429, + "auxiliary_loss_mlp": 0.0127326, + "balance_loss_clip": 0.06287375, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3428227867127612, + "flos": 21440155144320.0, + "grad_norm": 1.6330699344557849, + "language_loss": 0.71898985, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.79641676, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16821289, + "step": 5702, + "time_per_iteration": 2.566856861114502 + }, + { + "auxiliary_loss_clip": 0.06377172, + "auxiliary_loss_mlp": 0.01259818, + "balance_loss_clip": 0.06290582, + "balance_loss_mlp": 0.01252552, + "epoch": 0.34288290996542914, + "flos": 55750219902720.0, + "grad_norm": 0.7671857510805999, + "language_loss": 0.56708395, + "learning_rate": 3.057991990435309e-06, + "loss": 0.64345384, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.07244873, + "step": 5703, + "time_per_iteration": 4.447732925415039 + }, + { + "auxiliary_loss_clip": 0.06465772, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06283242, + "balance_loss_mlp": 0.01255207, + "epoch": 0.3429430332180971, + "flos": 20162961285120.0, + "grad_norm": 1.88810633796735, + "language_loss": 0.74954486, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82692933, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.17468262, + "step": 5704, + "time_per_iteration": 4.062070608139038 + }, + { + "auxiliary_loss_clip": 0.06463447, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259716, + "epoch": 0.34300315647076507, + "flos": 17971347818880.0, + "grad_norm": 2.0890845856962565, + "language_loss": 0.73438597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.81177545, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15795898, + "step": 5705, + "time_per_iteration": 2.5125277042388916 + }, + { + "auxiliary_loss_clip": 0.06466857, + "auxiliary_loss_mlp": 0.01271633, + "balance_loss_clip": 0.0628458, + "balance_loss_mlp": 0.01255194, + "epoch": 0.34306327972343303, + "flos": 22092679215360.0, + "grad_norm": 2.3658652894382075, + "language_loss": 0.80144984, + "learning_rate": 3.057000289991289e-06, + "loss": 0.87883472, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16430664, + "step": 5706, + "time_per_iteration": 2.524531364440918 + }, + { + "auxiliary_loss_clip": 0.06468605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06282079, + "balance_loss_mlp": 0.0125493, + "epoch": 0.343123402976101, + "flos": 18448669002240.0, + "grad_norm": 1.9272208577124825, + "language_loss": 0.83210528, + "learning_rate": 3.056669642996787e-06, + "loss": 0.90951264, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17199707, + "step": 5707, + "time_per_iteration": 4.017935514450073 + }, + { + "auxiliary_loss_clip": 0.06464301, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06283538, + "balance_loss_mlp": 0.01259544, + "epoch": 0.34318352622876896, + "flos": 17169127228800.0, + "grad_norm": 1.5274992455100316, + "language_loss": 0.74774885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16442871, + "step": 5708, + "time_per_iteration": 2.6189568042755127 + }, + { + "auxiliary_loss_clip": 0.06460952, + "auxiliary_loss_mlp": 0.01273078, + "balance_loss_clip": 0.06282704, + "balance_loss_mlp": 0.01256365, + "epoch": 0.34324364948143693, + "flos": 26695482572160.0, + "grad_norm": 1.5717787719434457, + "language_loss": 0.80904007, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88638043, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.16711426, + "step": 5709, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06471742, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 0.06286193, + "balance_loss_mlp": 0.01260685, + "epoch": 0.3433037727341049, + "flos": 21257950440960.0, + "grad_norm": 2.571520261591023, + "language_loss": 0.79460347, + "learning_rate": 3.055677461649329e-06, + "loss": 0.87212193, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.1940918, + "step": 5710, + "time_per_iteration": 2.5515291690826416 + }, + { + "auxiliary_loss_clip": 0.06468266, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 0.06282788, + "balance_loss_mlp": 0.01254334, + "epoch": 0.34336389598677286, + "flos": 20635377004800.0, + "grad_norm": 1.916674758610419, + "language_loss": 0.70532334, + "learning_rate": 3.055346654453996e-06, + "loss": 0.78272408, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.17468262, + "step": 5711, + "time_per_iteration": 3.958890914916992 + }, + { + "auxiliary_loss_clip": 0.06467056, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 0.0628437, + "balance_loss_mlp": 0.01256909, + "epoch": 0.3434240192394409, + "flos": 14543895283200.0, + "grad_norm": 2.810027228242578, + "language_loss": 0.67786914, + "learning_rate": 3.055015807239812e-06, + "loss": 0.75527865, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16992188, + "step": 5712, + "time_per_iteration": 2.4752726554870605 + }, + { + "auxiliary_loss_clip": 0.06366295, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01254685, + "epoch": 0.34348414249210885, + "flos": 58067799183360.0, + "grad_norm": 0.8383081559544242, + "language_loss": 0.58214718, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.65843868, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.08172607, + "step": 5713, + "time_per_iteration": 3.11580491065979 + }, + { + "auxiliary_loss_clip": 0.06465655, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06281169, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3435442657447768, + "flos": 20710749352320.0, + "grad_norm": 1.8141637433077298, + "language_loss": 0.81045675, + "learning_rate": 3.054353992805076e-06, + "loss": 0.88785917, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.17272949, + "step": 5714, + "time_per_iteration": 2.510929822921753 + }, + { + "auxiliary_loss_clip": 0.0646632, + "auxiliary_loss_mlp": 0.01276019, + "balance_loss_clip": 0.06283875, + "balance_loss_mlp": 0.01260045, + "epoch": 0.3436043889974448, + "flos": 22936967354880.0, + "grad_norm": 2.602776673257047, + "language_loss": 0.72001171, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79743505, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15991211, + "step": 5715, + "time_per_iteration": 2.5680224895477295 + }, + { + "auxiliary_loss_clip": 0.06365244, + "auxiliary_loss_mlp": 0.01259148, + "balance_loss_clip": 0.06280053, + "balance_loss_mlp": 0.01251191, + "epoch": 0.34366451225011274, + "flos": 58423514964480.0, + "grad_norm": 0.8879413605742031, + "language_loss": 0.65628481, + "learning_rate": 3.053692018445505e-06, + "loss": 0.73252875, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.07952881, + "step": 5716, + "time_per_iteration": 3.184952735900879 + }, + { + "auxiliary_loss_clip": 0.06463662, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.0628469, + "balance_loss_mlp": 0.01264509, + "epoch": 0.3437246355027807, + "flos": 15601722353280.0, + "grad_norm": 1.9800950186090778, + "language_loss": 0.74289393, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15252686, + "step": 5717, + "time_per_iteration": 2.5220494270324707 + }, + { + "auxiliary_loss_clip": 0.06466433, + "auxiliary_loss_mlp": 0.01278824, + "balance_loss_clip": 0.0628383, + "balance_loss_mlp": 0.01262946, + "epoch": 0.34378475875544867, + "flos": 27679572449280.0, + "grad_norm": 1.8348085520910409, + "language_loss": 0.75694019, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.83439279, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15869141, + "step": 5718, + "time_per_iteration": 2.5983147621154785 + }, + { + "auxiliary_loss_clip": 0.06468937, + "auxiliary_loss_mlp": 0.01273829, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01256829, + "epoch": 0.34384488200811664, + "flos": 31439638967040.0, + "grad_norm": 1.8816683210791167, + "language_loss": 0.6437763, + "learning_rate": 3.052698757266734e-06, + "loss": 0.72120392, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.17004395, + "step": 5719, + "time_per_iteration": 2.7075517177581787 + }, + { + "auxiliary_loss_clip": 0.06472047, + "auxiliary_loss_mlp": 0.0127673, + "balance_loss_clip": 0.06285335, + "balance_loss_mlp": 0.012596, + "epoch": 0.3439050052607846, + "flos": 24906866117760.0, + "grad_norm": 1.6709560385881974, + "language_loss": 0.73730874, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.81479651, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.17150879, + "step": 5720, + "time_per_iteration": 2.5936295986175537 + }, + { + "auxiliary_loss_clip": 0.06469208, + "auxiliary_loss_mlp": 0.01280833, + "balance_loss_clip": 0.06286804, + "balance_loss_mlp": 0.01264072, + "epoch": 0.34396512851345257, + "flos": 18155900436480.0, + "grad_norm": 1.8909667336437188, + "language_loss": 0.74550021, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82300061, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16748047, + "step": 5721, + "time_per_iteration": 2.5109763145446777 + }, + { + "auxiliary_loss_clip": 0.06468637, + "auxiliary_loss_mlp": 0.01276688, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.01260208, + "epoch": 0.34402525176612053, + "flos": 16039994734080.0, + "grad_norm": 3.7669546448597497, + "language_loss": 0.80102623, + "learning_rate": 3.051705136821992e-06, + "loss": 0.87847948, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 5722, + "time_per_iteration": 2.5231471061706543 + }, + { + "auxiliary_loss_clip": 0.06467631, + "auxiliary_loss_mlp": 0.01281232, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01265806, + "epoch": 0.3440853750187885, + "flos": 21185009861760.0, + "grad_norm": 1.9591310013999468, + "language_loss": 0.82034022, + "learning_rate": 3.051373850228801e-06, + "loss": 0.89782888, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.1541748, + "step": 5723, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.06471531, + "auxiliary_loss_mlp": 0.01281521, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.0126588, + "epoch": 0.34414549827145646, + "flos": 12682883301120.0, + "grad_norm": 1.867182825140108, + "language_loss": 0.8172524, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8947829, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15661621, + "step": 5724, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.06476942, + "auxiliary_loss_mlp": 0.01281282, + "balance_loss_clip": 0.06292838, + "balance_loss_mlp": 0.01265237, + "epoch": 0.3442056215241244, + "flos": 31292458070400.0, + "grad_norm": 1.852126712281853, + "language_loss": 0.69186389, + "learning_rate": 3.05071115745038e-06, + "loss": 0.76944625, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.16040039, + "step": 5725, + "time_per_iteration": 2.6253697872161865 + }, + { + "auxiliary_loss_clip": 0.06482734, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06293113, + "balance_loss_mlp": 0.01266462, + "epoch": 0.34426574477679245, + "flos": 23373939997440.0, + "grad_norm": 1.5373453518160676, + "language_loss": 0.69532049, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.77299035, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.17785645, + "step": 5726, + "time_per_iteration": 2.5495173931121826 + }, + { + "auxiliary_loss_clip": 0.06477433, + "auxiliary_loss_mlp": 0.01281684, + "balance_loss_clip": 0.06292193, + "balance_loss_mlp": 0.01265948, + "epoch": 0.3443258680294604, + "flos": 24542372856960.0, + "grad_norm": 3.3735616171284453, + "language_loss": 0.73631704, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81390822, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15734863, + "step": 5727, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.06474276, + "auxiliary_loss_mlp": 0.01274594, + "balance_loss_clip": 0.06292102, + "balance_loss_mlp": 0.01259049, + "epoch": 0.3443859912821284, + "flos": 20236363061760.0, + "grad_norm": 1.756953821036591, + "language_loss": 0.88303459, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.96052337, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15551758, + "step": 5728, + "time_per_iteration": 2.5943620204925537 + }, + { + "auxiliary_loss_clip": 0.06472028, + "auxiliary_loss_mlp": 0.01275786, + "balance_loss_clip": 0.06289984, + "balance_loss_mlp": 0.01259382, + "epoch": 0.34444611453479634, + "flos": 24323425338240.0, + "grad_norm": 1.9801243778486481, + "language_loss": 0.70532095, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.78279907, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.1640625, + "step": 5729, + "time_per_iteration": 2.5122504234313965 + }, + { + "auxiliary_loss_clip": 0.06472413, + "auxiliary_loss_mlp": 0.01278834, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01263123, + "epoch": 0.3445062377874643, + "flos": 16989186585600.0, + "grad_norm": 2.065738946159642, + "language_loss": 0.74902749, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.82653993, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15710449, + "step": 5730, + "time_per_iteration": 2.4971024990081787 + }, + { + "auxiliary_loss_clip": 0.06477457, + "auxiliary_loss_mlp": 0.01272788, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.01256921, + "epoch": 0.3445663610401323, + "flos": 20308884370560.0, + "grad_norm": 2.25692333978076, + "language_loss": 0.79881716, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87631959, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.15869141, + "step": 5731, + "time_per_iteration": 2.5055606365203857 + }, + { + "auxiliary_loss_clip": 0.0647382, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06289574, + "balance_loss_mlp": 0.01256532, + "epoch": 0.34462648429280024, + "flos": 15893568524160.0, + "grad_norm": 2.0529883798711586, + "language_loss": 0.78536034, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.86281943, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15545654, + "step": 5732, + "time_per_iteration": 2.58428692817688 + }, + { + "auxiliary_loss_clip": 0.06393671, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06309536, + "balance_loss_mlp": 0.01263571, + "epoch": 0.3446866075454682, + "flos": 59330681193600.0, + "grad_norm": 0.7296400398421587, + "language_loss": 0.53166986, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.60830045, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.05813599, + "step": 5733, + "time_per_iteration": 3.1921679973602295 + }, + { + "auxiliary_loss_clip": 0.06473544, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 0.06292105, + "balance_loss_mlp": 0.01259248, + "epoch": 0.34474673079813617, + "flos": 22349962776960.0, + "grad_norm": 1.6143563972241732, + "language_loss": 0.83787543, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91536903, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16564941, + "step": 5734, + "time_per_iteration": 2.5630810260772705 + }, + { + "auxiliary_loss_clip": 0.06472072, + "auxiliary_loss_mlp": 0.01278915, + "balance_loss_clip": 0.0628967, + "balance_loss_mlp": 0.01262834, + "epoch": 0.34480685405080413, + "flos": 27677098753920.0, + "grad_norm": 1.7144738343554842, + "language_loss": 0.93389094, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.01140082, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.1607666, + "step": 5735, + "time_per_iteration": 2.5621798038482666 + }, + { + "auxiliary_loss_clip": 0.06471383, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01259273, + "epoch": 0.3448669773034721, + "flos": 22462664918400.0, + "grad_norm": 1.7840822264419087, + "language_loss": 0.77095437, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84843659, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.17578125, + "step": 5736, + "time_per_iteration": 2.5377349853515625 + }, + { + "auxiliary_loss_clip": 0.06471781, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01255326, + "epoch": 0.34492710055614006, + "flos": 24943105808640.0, + "grad_norm": 1.6287034776462515, + "language_loss": 0.79113513, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.86855936, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15319824, + "step": 5737, + "time_per_iteration": 2.5471904277801514 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06285917, + "balance_loss_mlp": 0.0125976, + "epoch": 0.34498722380880803, + "flos": 20127057010560.0, + "grad_norm": 2.191814396638409, + "language_loss": 0.72072059, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.79821849, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16723633, + "step": 5738, + "time_per_iteration": 2.540442943572998 + }, + { + "auxiliary_loss_clip": 0.06471272, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01255821, + "epoch": 0.34504734706147605, + "flos": 28445511421440.0, + "grad_norm": 1.9413212194180998, + "language_loss": 0.82238245, + "learning_rate": 3.046067851209389e-06, + "loss": 0.89982325, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16992188, + "step": 5739, + "time_per_iteration": 2.57327938079834 + }, + { + "auxiliary_loss_clip": 0.06469989, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06284826, + "balance_loss_mlp": 0.01261862, + "epoch": 0.345107470314144, + "flos": 22681067385600.0, + "grad_norm": 1.914547064909644, + "language_loss": 0.83564734, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.91313767, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.171875, + "step": 5740, + "time_per_iteration": 2.5514895915985107 + }, + { + "auxiliary_loss_clip": 0.06466584, + "auxiliary_loss_mlp": 0.01275646, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01258921, + "epoch": 0.345167593566812, + "flos": 20636886378240.0, + "grad_norm": 2.1474795597791734, + "language_loss": 0.76802379, + "learning_rate": 3.045403886269181e-06, + "loss": 0.84544611, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16723633, + "step": 5741, + "time_per_iteration": 2.511997699737549 + }, + { + "auxiliary_loss_clip": 0.06466299, + "auxiliary_loss_mlp": 0.0127053, + "balance_loss_clip": 0.06279384, + "balance_loss_mlp": 0.01254544, + "epoch": 0.34522771681947995, + "flos": 26221683260160.0, + "grad_norm": 1.6006732343467382, + "language_loss": 0.77803171, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85540009, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15966797, + "step": 5742, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06281982, + "balance_loss_mlp": 0.01256074, + "epoch": 0.3452878400721479, + "flos": 19068349472640.0, + "grad_norm": 2.2544306863162538, + "language_loss": 0.76459014, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.84196126, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16821289, + "step": 5743, + "time_per_iteration": 3.996267557144165 + }, + { + "auxiliary_loss_clip": 0.06462429, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06281956, + "balance_loss_mlp": 0.01255118, + "epoch": 0.3453479633248159, + "flos": 27937442989440.0, + "grad_norm": 1.578255214465821, + "language_loss": 0.7080915, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.78541422, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14727783, + "step": 5744, + "time_per_iteration": 2.5594234466552734 + }, + { + "auxiliary_loss_clip": 0.06462625, + "auxiliary_loss_mlp": 0.01272389, + "balance_loss_clip": 0.0628416, + "balance_loss_mlp": 0.01256523, + "epoch": 0.34540808657748384, + "flos": 19611609419520.0, + "grad_norm": 1.8945383960499247, + "language_loss": 0.79877782, + "learning_rate": 3.044075480787665e-06, + "loss": 0.87612802, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15881348, + "step": 5745, + "time_per_iteration": 2.5577902793884277 + }, + { + "auxiliary_loss_clip": 0.0646376, + "auxiliary_loss_mlp": 0.0127446, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01258343, + "epoch": 0.3454682098301518, + "flos": 20417771151360.0, + "grad_norm": 2.2215207406176063, + "language_loss": 0.90027881, + "learning_rate": 3.043743280407182e-06, + "loss": 0.97766101, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16113281, + "step": 5746, + "time_per_iteration": 4.126953840255737 + }, + { + "auxiliary_loss_clip": 0.06469168, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01254648, + "epoch": 0.34552833308281977, + "flos": 21331603779840.0, + "grad_norm": 1.8420175913064167, + "language_loss": 0.65233189, + "learning_rate": 3.043411040447849e-06, + "loss": 0.72973943, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16931152, + "step": 5747, + "time_per_iteration": 2.6445960998535156 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01274425, + "balance_loss_clip": 0.06279166, + "balance_loss_mlp": 0.01259166, + "epoch": 0.34558845633548774, + "flos": 36251914331520.0, + "grad_norm": 1.6152983170909512, + "language_loss": 0.72912234, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15246582, + "step": 5748, + "time_per_iteration": 2.668628692626953 + }, + { + "auxiliary_loss_clip": 0.0646018, + "auxiliary_loss_mlp": 0.01271906, + "balance_loss_clip": 0.06281725, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3456485795881557, + "flos": 22456292008320.0, + "grad_norm": 2.139365243179929, + "language_loss": 0.75935584, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83667672, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.14648438, + "step": 5749, + "time_per_iteration": 2.533357620239258 + }, + { + "auxiliary_loss_clip": 0.06372777, + "auxiliary_loss_mlp": 0.01259534, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.0125392, + "epoch": 0.34570870284082367, + "flos": 62023277422080.0, + "grad_norm": 0.8741398929973155, + "language_loss": 0.62861037, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.70493352, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.05612183, + "step": 5750, + "time_per_iteration": 4.42021369934082 + }, + { + "auxiliary_loss_clip": 0.06455849, + "auxiliary_loss_mlp": 0.0126761, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01253383, + "epoch": 0.34576882609349163, + "flos": 22788528647040.0, + "grad_norm": 2.5604939014714043, + "language_loss": 0.80745482, + "learning_rate": 3.042081685074012e-06, + "loss": 0.88468945, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14227295, + "step": 5751, + "time_per_iteration": 2.610229730606079 + }, + { + "auxiliary_loss_clip": 0.06461278, + "auxiliary_loss_mlp": 0.01273124, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.01258199, + "epoch": 0.34582894934615965, + "flos": 12353665409280.0, + "grad_norm": 2.333174149642167, + "language_loss": 0.85112172, + "learning_rate": 3.041749247409439e-06, + "loss": 0.92846578, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14904785, + "step": 5752, + "time_per_iteration": 2.49895977973938 + }, + { + "auxiliary_loss_clip": 0.06379203, + "auxiliary_loss_mlp": 0.01260282, + "balance_loss_clip": 0.06296635, + "balance_loss_mlp": 0.01254092, + "epoch": 0.3458890725988276, + "flos": 70186459017600.0, + "grad_norm": 0.7233537791569425, + "language_loss": 0.63163221, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.70802706, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06185913, + "step": 5753, + "time_per_iteration": 3.0605263710021973 + }, + { + "auxiliary_loss_clip": 0.06463367, + "auxiliary_loss_mlp": 0.01274407, + "balance_loss_clip": 0.06282756, + "balance_loss_mlp": 0.01258498, + "epoch": 0.3459491958514956, + "flos": 17098324928640.0, + "grad_norm": 2.0282181813946116, + "language_loss": 0.71483171, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.79220951, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15899658, + "step": 5754, + "time_per_iteration": 2.499213457107544 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01271896, + "balance_loss_clip": 0.06282809, + "balance_loss_mlp": 0.01255898, + "epoch": 0.34600931910416355, + "flos": 16655985624960.0, + "grad_norm": 2.0834630321372534, + "language_loss": 0.7328862, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.81031251, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15979004, + "step": 5755, + "time_per_iteration": 2.540292263031006 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.01272619, + "balance_loss_clip": 0.06280342, + "balance_loss_mlp": 0.01257801, + "epoch": 0.3460694423568315, + "flos": 38555517179520.0, + "grad_norm": 1.432388080922509, + "language_loss": 0.7255426, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80286932, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14813232, + "step": 5756, + "time_per_iteration": 2.679203510284424 + }, + { + "auxiliary_loss_clip": 0.06371044, + "auxiliary_loss_mlp": 0.01257585, + "balance_loss_clip": 0.06288835, + "balance_loss_mlp": 0.01251058, + "epoch": 0.3461295656094995, + "flos": 72103332545280.0, + "grad_norm": 0.6902951700774806, + "language_loss": 0.62318385, + "learning_rate": 3.040086466790207e-06, + "loss": 0.69947016, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.06536865, + "step": 5757, + "time_per_iteration": 3.209688901901245 + }, + { + "auxiliary_loss_clip": 0.06363717, + "auxiliary_loss_mlp": 0.01259824, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01253244, + "epoch": 0.34618968886216744, + "flos": 65477913408000.0, + "grad_norm": 0.8114970964410039, + "language_loss": 0.59130025, + "learning_rate": 3.039753792295362e-06, + "loss": 0.66753566, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06591797, + "step": 5758, + "time_per_iteration": 3.139495372772217 + }, + { + "auxiliary_loss_clip": 0.06467785, + "auxiliary_loss_mlp": 0.01274731, + "balance_loss_clip": 0.06288655, + "balance_loss_mlp": 0.01259747, + "epoch": 0.3462498121148354, + "flos": 23478508293120.0, + "grad_norm": 1.7665020183034759, + "language_loss": 0.72321635, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.80064148, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5759, + "time_per_iteration": 2.575479745864868 + }, + { + "auxiliary_loss_clip": 0.06456805, + "auxiliary_loss_mlp": 0.01274415, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01258632, + "epoch": 0.3463099353675034, + "flos": 24177711888000.0, + "grad_norm": 1.8760422141660649, + "language_loss": 0.83568478, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.91299695, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15771484, + "step": 5760, + "time_per_iteration": 2.5610272884368896 + }, + { + "auxiliary_loss_clip": 0.06358143, + "auxiliary_loss_mlp": 0.01257449, + "balance_loss_clip": 0.06276596, + "balance_loss_mlp": 0.0125125, + "epoch": 0.34637005862017134, + "flos": 63716773893120.0, + "grad_norm": 0.8043642187655193, + "language_loss": 0.56576806, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.64192402, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.06195068, + "step": 5761, + "time_per_iteration": 3.2343695163726807 + }, + { + "auxiliary_loss_clip": 0.06453449, + "auxiliary_loss_mlp": 0.01270941, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01256457, + "epoch": 0.3464301818728393, + "flos": 13149513089280.0, + "grad_norm": 1.936786863895872, + "language_loss": 0.9549523, + "learning_rate": 3.038422700166474e-06, + "loss": 1.03219616, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14477539, + "step": 5762, + "time_per_iteration": 2.496039390563965 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01276759, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01260928, + "epoch": 0.34649030512550727, + "flos": 29322936650880.0, + "grad_norm": 1.870020160295256, + "language_loss": 0.69913763, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77657849, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.15820312, + "step": 5763, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.06466965, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06278971, + "balance_loss_mlp": 0.01253922, + "epoch": 0.34655042837817523, + "flos": 23737385082240.0, + "grad_norm": 1.7922805842181977, + "language_loss": 0.83863467, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.9160139, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17028809, + "step": 5764, + "time_per_iteration": 2.634692668914795 + }, + { + "auxiliary_loss_clip": 0.06459094, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06279744, + "balance_loss_mlp": 0.01259263, + "epoch": 0.34661055163084326, + "flos": 22060716082560.0, + "grad_norm": 2.9007104109569943, + "language_loss": 0.67647815, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.75381392, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15234375, + "step": 5765, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.06460512, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06280708, + "balance_loss_mlp": 0.01262233, + "epoch": 0.3466706748835112, + "flos": 21805738508160.0, + "grad_norm": 3.5961884004183426, + "language_loss": 0.77947313, + "learning_rate": 3.03709097800413e-06, + "loss": 0.85684741, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.14709473, + "step": 5766, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06460432, + "auxiliary_loss_mlp": 0.01274096, + "balance_loss_clip": 0.06278767, + "balance_loss_mlp": 0.01260614, + "epoch": 0.3467307981361792, + "flos": 19467405342720.0, + "grad_norm": 1.5497773141022704, + "language_loss": 0.73886019, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.8162055, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.13500977, + "step": 5767, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.06461183, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06279645, + "balance_loss_mlp": 0.01260107, + "epoch": 0.34679092138884715, + "flos": 24834470590080.0, + "grad_norm": 2.0350854996297696, + "language_loss": 0.78955162, + "learning_rate": 3.036424880912893e-06, + "loss": 0.86692369, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15917969, + "step": 5768, + "time_per_iteration": 2.5747995376586914 + }, + { + "auxiliary_loss_clip": 0.06369781, + "auxiliary_loss_mlp": 0.01257254, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.01251723, + "epoch": 0.3468510446415151, + "flos": 63253791757440.0, + "grad_norm": 0.7431238132649503, + "language_loss": 0.57319033, + "learning_rate": 3.036091773408956e-06, + "loss": 0.64946061, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.05535889, + "step": 5769, + "time_per_iteration": 3.176074981689453 + }, + { + "auxiliary_loss_clip": 0.06479758, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 0.06285711, + "balance_loss_mlp": 0.01260212, + "epoch": 0.3469111678941831, + "flos": 12123984568320.0, + "grad_norm": 2.4016361546378158, + "language_loss": 0.85419703, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.93176699, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5770, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.06372644, + "auxiliary_loss_mlp": 0.01258777, + "balance_loss_clip": 0.0629043, + "balance_loss_mlp": 0.01253087, + "epoch": 0.34697129114685105, + "flos": 65951964282240.0, + "grad_norm": 0.7493725348793998, + "language_loss": 0.59862447, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.67493868, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.05685425, + "step": 5771, + "time_per_iteration": 2.938957691192627 + }, + { + "auxiliary_loss_clip": 0.0646434, + "auxiliary_loss_mlp": 0.012787, + "balance_loss_clip": 0.06282143, + "balance_loss_mlp": 0.01263572, + "epoch": 0.347031414399519, + "flos": 34461914284800.0, + "grad_norm": 1.9396999801577832, + "language_loss": 0.72527683, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80270731, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15136719, + "step": 5772, + "time_per_iteration": 2.6529078483581543 + }, + { + "auxiliary_loss_clip": 0.06462972, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06281382, + "balance_loss_mlp": 0.01256246, + "epoch": 0.347091537652187, + "flos": 26951592176640.0, + "grad_norm": 1.5709710398058576, + "language_loss": 0.76695967, + "learning_rate": 3.034758950632507e-06, + "loss": 0.84431112, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15924072, + "step": 5773, + "time_per_iteration": 2.5785317420959473 + }, + { + "auxiliary_loss_clip": 0.06466497, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.06280655, + "balance_loss_mlp": 0.01255366, + "epoch": 0.34715166090485494, + "flos": 21148602462720.0, + "grad_norm": 2.4326309651076463, + "language_loss": 0.70796078, + "learning_rate": 3.034425646811396e-06, + "loss": 0.78533834, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15893555, + "step": 5774, + "time_per_iteration": 2.5585873126983643 + }, + { + "auxiliary_loss_clip": 0.06458526, + "auxiliary_loss_mlp": 0.01271942, + "balance_loss_clip": 0.06278332, + "balance_loss_mlp": 0.01256707, + "epoch": 0.3472117841575229, + "flos": 23484881203200.0, + "grad_norm": 2.2084812675777474, + "language_loss": 0.76485682, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.84216148, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15228271, + "step": 5775, + "time_per_iteration": 2.5899477005004883 + }, + { + "auxiliary_loss_clip": 0.06472419, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281743, + "balance_loss_mlp": 0.01255163, + "epoch": 0.34727190741019087, + "flos": 17498428974720.0, + "grad_norm": 2.2070819655775282, + "language_loss": 0.7869916, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.86442757, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16009521, + "step": 5776, + "time_per_iteration": 2.5874037742614746 + }, + { + "auxiliary_loss_clip": 0.0636313, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06280468, + "balance_loss_mlp": 0.0125983, + "epoch": 0.34733203066285884, + "flos": 65287350495360.0, + "grad_norm": 0.8333293277096808, + "language_loss": 0.63448966, + "learning_rate": 3.033425500045478e-06, + "loss": 0.710774, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.05477905, + "step": 5777, + "time_per_iteration": 3.168325185775757 + }, + { + "auxiliary_loss_clip": 0.0646584, + "auxiliary_loss_mlp": 0.01270867, + "balance_loss_clip": 0.06279471, + "balance_loss_mlp": 0.01255048, + "epoch": 0.3473921539155268, + "flos": 28666429511040.0, + "grad_norm": 3.258496862714712, + "language_loss": 0.65075529, + "learning_rate": 3.033092039398119e-06, + "loss": 0.72812235, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15808105, + "step": 5778, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.06467149, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 0.06278305, + "balance_loss_mlp": 0.0125633, + "epoch": 0.3474522771681948, + "flos": 40845284104320.0, + "grad_norm": 1.7195764072446118, + "language_loss": 0.722601, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.79998595, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.15008545, + "step": 5779, + "time_per_iteration": 2.6901330947875977 + }, + { + "auxiliary_loss_clip": 0.06474127, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06282836, + "balance_loss_mlp": 0.01259092, + "epoch": 0.3475124004208628, + "flos": 24615564998400.0, + "grad_norm": 2.601451729132101, + "language_loss": 0.62399209, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.70149052, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.1663208, + "step": 5780, + "time_per_iteration": 2.5493476390838623 + }, + { + "auxiliary_loss_clip": 0.0647147, + "auxiliary_loss_mlp": 0.01271785, + "balance_loss_clip": 0.06285025, + "balance_loss_mlp": 0.01256264, + "epoch": 0.34757252367353075, + "flos": 22717977909120.0, + "grad_norm": 3.4183593986527043, + "language_loss": 0.72164977, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.79908228, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.15515137, + "step": 5781, + "time_per_iteration": 2.610198974609375 + }, + { + "auxiliary_loss_clip": 0.06471756, + "auxiliary_loss_mlp": 0.01273476, + "balance_loss_clip": 0.06282213, + "balance_loss_mlp": 0.01257228, + "epoch": 0.3476326469261987, + "flos": 19834246517760.0, + "grad_norm": 2.4264406265191325, + "language_loss": 0.77686667, + "learning_rate": 3.031757805185612e-06, + "loss": 0.85431898, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16235352, + "step": 5782, + "time_per_iteration": 3.918602705001831 + }, + { + "auxiliary_loss_clip": 0.06470296, + "auxiliary_loss_mlp": 0.01277549, + "balance_loss_clip": 0.0628626, + "balance_loss_mlp": 0.01262695, + "epoch": 0.3476927701788667, + "flos": 19944265328640.0, + "grad_norm": 2.639685157679876, + "language_loss": 0.63410383, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7115823, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14855957, + "step": 5783, + "time_per_iteration": 4.021190881729126 + }, + { + "auxiliary_loss_clip": 0.06469369, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 0.06290524, + "balance_loss_mlp": 0.01264121, + "epoch": 0.34775289343153465, + "flos": 20740448424960.0, + "grad_norm": 1.686879732071426, + "language_loss": 0.89054763, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9680202, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 5784, + "time_per_iteration": 2.553847074508667 + }, + { + "auxiliary_loss_clip": 0.06470798, + "auxiliary_loss_mlp": 0.01275566, + "balance_loss_clip": 0.06289466, + "balance_loss_mlp": 0.01260903, + "epoch": 0.3478130166842026, + "flos": 19360992257280.0, + "grad_norm": 1.643062521609265, + "language_loss": 0.82068878, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89815247, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.14672852, + "step": 5785, + "time_per_iteration": 2.5452024936676025 + }, + { + "auxiliary_loss_clip": 0.06472684, + "auxiliary_loss_mlp": 0.01281071, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01267004, + "epoch": 0.3478731399368706, + "flos": 22057194211200.0, + "grad_norm": 1.6654216237849466, + "language_loss": 0.80731958, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.88485718, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.14074707, + "step": 5786, + "time_per_iteration": 4.040801286697388 + }, + { + "auxiliary_loss_clip": 0.06468868, + "auxiliary_loss_mlp": 0.01275893, + "balance_loss_clip": 0.06289011, + "balance_loss_mlp": 0.01260515, + "epoch": 0.34793326318953854, + "flos": 18047390999040.0, + "grad_norm": 1.5833193798509506, + "language_loss": 0.75743961, + "learning_rate": 3.030089132216836e-06, + "loss": 0.83488721, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15368652, + "step": 5787, + "time_per_iteration": 2.5231845378875732 + }, + { + "auxiliary_loss_clip": 0.06470607, + "auxiliary_loss_mlp": 0.01273428, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01259111, + "epoch": 0.3479933864422065, + "flos": 29322349672320.0, + "grad_norm": 1.5447805606313796, + "language_loss": 0.81661141, + "learning_rate": 3.029755280389203e-06, + "loss": 0.89405167, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14312744, + "step": 5788, + "time_per_iteration": 2.5828304290771484 + }, + { + "auxiliary_loss_clip": 0.064804, + "auxiliary_loss_mlp": 0.01277805, + "balance_loss_clip": 0.06290662, + "balance_loss_mlp": 0.01261831, + "epoch": 0.3480535096948745, + "flos": 20126931229440.0, + "grad_norm": 1.9688082680528027, + "language_loss": 0.85984367, + "learning_rate": 3.029421389513147e-06, + "loss": 0.93742573, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.15979004, + "step": 5789, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127695, + "balance_loss_clip": 0.06292568, + "balance_loss_mlp": 0.0126178, + "epoch": 0.34811363294754244, + "flos": 18554453182080.0, + "grad_norm": 1.6869236803506542, + "language_loss": 0.84773821, + "learning_rate": 3.029087459601328e-06, + "loss": 0.92530012, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15185547, + "step": 5790, + "time_per_iteration": 3.942929983139038 + }, + { + "auxiliary_loss_clip": 0.06469919, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.0628828, + "balance_loss_mlp": 0.01259465, + "epoch": 0.3481737562002104, + "flos": 26877603421440.0, + "grad_norm": 1.9257745343225423, + "language_loss": 0.81410027, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.89154327, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14904785, + "step": 5791, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.06478444, + "auxiliary_loss_mlp": 0.01278573, + "balance_loss_clip": 0.0629065, + "balance_loss_mlp": 0.01263356, + "epoch": 0.3482338794528784, + "flos": 28915495372800.0, + "grad_norm": 1.656722788090249, + "language_loss": 0.78119808, + "learning_rate": 3.028419482721056e-06, + "loss": 0.85876822, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.15216064, + "step": 5792, + "time_per_iteration": 2.5784294605255127 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01255989, + "epoch": 0.3482940027055464, + "flos": 22207393854720.0, + "grad_norm": 1.5928062225109956, + "language_loss": 0.82187879, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89930081, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.14575195, + "step": 5793, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.06472721, + "auxiliary_loss_mlp": 0.01275633, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01259438, + "epoch": 0.34835412595821436, + "flos": 20308884370560.0, + "grad_norm": 1.8552979095996294, + "language_loss": 0.7616328, + "learning_rate": 3.027751349849706e-06, + "loss": 0.83911633, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.1618042, + "step": 5794, + "time_per_iteration": 2.548841953277588 + }, + { + "auxiliary_loss_clip": 0.06468202, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 0.06286102, + "balance_loss_mlp": 0.01262271, + "epoch": 0.3484142492108823, + "flos": 20456065267200.0, + "grad_norm": 2.5979910850639336, + "language_loss": 0.57406038, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65151387, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.14868164, + "step": 5795, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.06465806, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 0.06285395, + "balance_loss_mlp": 0.01257469, + "epoch": 0.3484743724635503, + "flos": 24359832737280.0, + "grad_norm": 1.8988060542741243, + "language_loss": 0.83093596, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90830439, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.13580322, + "step": 5796, + "time_per_iteration": 2.5901992321014404 + }, + { + "auxiliary_loss_clip": 0.06459932, + "auxiliary_loss_mlp": 0.01272067, + "balance_loss_clip": 0.06285086, + "balance_loss_mlp": 0.01258692, + "epoch": 0.34853449571621825, + "flos": 24359916591360.0, + "grad_norm": 1.6441838604480552, + "language_loss": 0.83544898, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.91276896, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13378906, + "step": 5797, + "time_per_iteration": 2.5595455169677734 + }, + { + "auxiliary_loss_clip": 0.06466283, + "auxiliary_loss_mlp": 0.01269705, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01256055, + "epoch": 0.3485946189688862, + "flos": 27274395231360.0, + "grad_norm": 1.5517160717894904, + "language_loss": 0.73727238, + "learning_rate": 3.026414616539167e-06, + "loss": 0.81463224, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13647461, + "step": 5798, + "time_per_iteration": 2.716830015182495 + }, + { + "auxiliary_loss_clip": 0.06466942, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.0125618, + "epoch": 0.3486547422215542, + "flos": 20162835504000.0, + "grad_norm": 1.8098383323780278, + "language_loss": 0.76806593, + "learning_rate": 3.026080335875485e-06, + "loss": 0.84544736, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15014648, + "step": 5799, + "time_per_iteration": 2.550356149673462 + }, + { + "auxiliary_loss_clip": 0.06464861, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06284796, + "balance_loss_mlp": 0.01253735, + "epoch": 0.34871486547422215, + "flos": 20236614624000.0, + "grad_norm": 2.6888551620055363, + "language_loss": 0.75880742, + "learning_rate": 3.025746016302734e-06, + "loss": 0.83612871, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 5800, + "time_per_iteration": 2.559406042098999 + }, + { + "auxiliary_loss_clip": 0.06468332, + "auxiliary_loss_mlp": 0.01272895, + "balance_loss_clip": 0.06284243, + "balance_loss_mlp": 0.01258375, + "epoch": 0.3487749887268901, + "flos": 44063096924160.0, + "grad_norm": 1.6752863637060063, + "language_loss": 0.67620414, + "learning_rate": 3.025411657833591e-06, + "loss": 0.75361645, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14538574, + "step": 5801, + "time_per_iteration": 2.7286293506622314 + }, + { + "auxiliary_loss_clip": 0.064619, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01253406, + "epoch": 0.3488351119795581, + "flos": 23301921813120.0, + "grad_norm": 1.7427843167651098, + "language_loss": 0.76900619, + "learning_rate": 3.025077260480735e-06, + "loss": 0.84630978, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15075684, + "step": 5802, + "time_per_iteration": 2.5632455348968506 + }, + { + "auxiliary_loss_clip": 0.0645422, + "auxiliary_loss_mlp": 0.01273067, + "balance_loss_clip": 0.06281535, + "balance_loss_mlp": 0.01260109, + "epoch": 0.34889523523222604, + "flos": 19940449968000.0, + "grad_norm": 1.7168444943641856, + "language_loss": 0.79347479, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87074769, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12957764, + "step": 5803, + "time_per_iteration": 2.5202274322509766 + }, + { + "auxiliary_loss_clip": 0.06462935, + "auxiliary_loss_mlp": 0.01269017, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01255212, + "epoch": 0.348955358484894, + "flos": 30454123570560.0, + "grad_norm": 2.672940484210586, + "language_loss": 0.67680007, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.75411958, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.13812256, + "step": 5804, + "time_per_iteration": 2.636371374130249 + }, + { + "auxiliary_loss_clip": 0.06455779, + "auxiliary_loss_mlp": 0.01267233, + "balance_loss_clip": 0.06282568, + "balance_loss_mlp": 0.01253989, + "epoch": 0.349015481737562, + "flos": 18005071887360.0, + "grad_norm": 1.776416664420285, + "language_loss": 0.76608741, + "learning_rate": 3.024073835246702e-06, + "loss": 0.84331751, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 5805, + "time_per_iteration": 2.4746642112731934 + }, + { + "auxiliary_loss_clip": 0.06461459, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06281143, + "balance_loss_mlp": 0.0125568, + "epoch": 0.34907560499023, + "flos": 27205815064320.0, + "grad_norm": 2.094620432718779, + "language_loss": 0.67626035, + "learning_rate": 3.023739282485814e-06, + "loss": 0.7535736, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14178467, + "step": 5806, + "time_per_iteration": 2.6109619140625 + }, + { + "auxiliary_loss_clip": 0.06461781, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254596, + "epoch": 0.34913572824289796, + "flos": 30234714854400.0, + "grad_norm": 1.7462714312606824, + "language_loss": 0.71972066, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7970227, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1383667, + "step": 5807, + "time_per_iteration": 2.6023621559143066 + }, + { + "auxiliary_loss_clip": 0.06464535, + "auxiliary_loss_mlp": 0.01272433, + "balance_loss_clip": 0.06279333, + "balance_loss_mlp": 0.01257425, + "epoch": 0.3491958514955659, + "flos": 29979779207040.0, + "grad_norm": 2.0002365662223727, + "language_loss": 0.74799109, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15002441, + "step": 5808, + "time_per_iteration": 2.661327362060547 + }, + { + "auxiliary_loss_clip": 0.0645329, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06278954, + "balance_loss_mlp": 0.0125828, + "epoch": 0.3492559747482339, + "flos": 22789786458240.0, + "grad_norm": 1.539446612060682, + "language_loss": 0.84555626, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.92281115, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.13922119, + "step": 5809, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.06454454, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.0125755, + "epoch": 0.34931609800090185, + "flos": 26075257050240.0, + "grad_norm": 1.9706347482771516, + "language_loss": 0.80724359, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.88449275, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.12921143, + "step": 5810, + "time_per_iteration": 2.583709955215454 + }, + { + "auxiliary_loss_clip": 0.06460047, + "auxiliary_loss_mlp": 0.01274437, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3493762212535698, + "flos": 29249744509440.0, + "grad_norm": 1.580057936247994, + "language_loss": 0.75975537, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.83710015, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.1395874, + "step": 5811, + "time_per_iteration": 2.6304807662963867 + }, + { + "auxiliary_loss_clip": 0.06459605, + "auxiliary_loss_mlp": 0.01268711, + "balance_loss_clip": 0.06280548, + "balance_loss_mlp": 0.01254746, + "epoch": 0.3494363445062378, + "flos": 27133461463680.0, + "grad_norm": 1.6291603050336358, + "language_loss": 0.80527401, + "learning_rate": 3.021731151138386e-06, + "loss": 0.88255721, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.1395874, + "step": 5812, + "time_per_iteration": 2.657989025115967 + }, + { + "auxiliary_loss_clip": 0.06462281, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.0628228, + "balance_loss_mlp": 0.01257179, + "epoch": 0.34949646775890575, + "flos": 12281102173440.0, + "grad_norm": 2.0118644405033463, + "language_loss": 0.701132, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7784636, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.137146, + "step": 5813, + "time_per_iteration": 2.47231388092041 + }, + { + "auxiliary_loss_clip": 0.06457584, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06281666, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3495565910115737, + "flos": 17171265507840.0, + "grad_norm": 1.9224367307793844, + "language_loss": 0.76310062, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.8403852, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.13482666, + "step": 5814, + "time_per_iteration": 2.4967095851898193 + }, + { + "auxiliary_loss_clip": 0.06471042, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.06288652, + "balance_loss_mlp": 0.01257076, + "epoch": 0.3496167142642417, + "flos": 26472342349440.0, + "grad_norm": 1.8186936331307002, + "language_loss": 0.85099685, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92842519, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1472168, + "step": 5815, + "time_per_iteration": 2.597399950027466 + }, + { + "auxiliary_loss_clip": 0.06466906, + "auxiliary_loss_mlp": 0.01275707, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01261712, + "epoch": 0.34967683751690964, + "flos": 17419618609920.0, + "grad_norm": 2.3640337842934565, + "language_loss": 0.78006089, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85748702, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 5816, + "time_per_iteration": 2.5164036750793457 + }, + { + "auxiliary_loss_clip": 0.0646984, + "auxiliary_loss_mlp": 0.01273456, + "balance_loss_clip": 0.06286636, + "balance_loss_mlp": 0.01258692, + "epoch": 0.3497369607695776, + "flos": 22606365870720.0, + "grad_norm": 1.8515414586733512, + "language_loss": 0.59787703, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6753099, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.14764404, + "step": 5817, + "time_per_iteration": 2.542877674102783 + }, + { + "auxiliary_loss_clip": 0.06358884, + "auxiliary_loss_mlp": 0.01261904, + "balance_loss_clip": 0.06277611, + "balance_loss_mlp": 0.01257669, + "epoch": 0.34979708402224563, + "flos": 68548461477120.0, + "grad_norm": 0.858700346008579, + "language_loss": 0.59824663, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.67445457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04238892, + "step": 5818, + "time_per_iteration": 3.1992976665496826 + }, + { + "auxiliary_loss_clip": 0.06459703, + "auxiliary_loss_mlp": 0.01271152, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01257109, + "epoch": 0.3498572072749136, + "flos": 18995660455680.0, + "grad_norm": 1.926998914600137, + "language_loss": 0.83806789, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91537642, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14031982, + "step": 5819, + "time_per_iteration": 2.5241613388061523 + }, + { + "auxiliary_loss_clip": 0.06466879, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.0125493, + "epoch": 0.34991733052758156, + "flos": 27826334075520.0, + "grad_norm": 2.092302610514248, + "language_loss": 0.71273863, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.79009914, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14245605, + "step": 5820, + "time_per_iteration": 2.569838762283325 + }, + { + "auxiliary_loss_clip": 0.06470378, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 0.06288413, + "balance_loss_mlp": 0.01256292, + "epoch": 0.3499774537802495, + "flos": 33592706755200.0, + "grad_norm": 2.4345068466865083, + "language_loss": 0.70581877, + "learning_rate": 3.018716339744759e-06, + "loss": 0.78322828, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14294434, + "step": 5821, + "time_per_iteration": 2.6535534858703613 + }, + { + "auxiliary_loss_clip": 0.06479154, + "auxiliary_loss_mlp": 0.0127118, + "balance_loss_clip": 0.06291604, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3500375770329175, + "flos": 23483413756800.0, + "grad_norm": 1.9533795991074365, + "language_loss": 0.74227631, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.81977963, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16650391, + "step": 5822, + "time_per_iteration": 5.406672716140747 + }, + { + "auxiliary_loss_clip": 0.06470097, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 0.06285684, + "balance_loss_mlp": 0.01257588, + "epoch": 0.35009770028558546, + "flos": 19032067854720.0, + "grad_norm": 2.646032233627204, + "language_loss": 0.7905609, + "learning_rate": 3.018045956403094e-06, + "loss": 0.86799276, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15496826, + "step": 5823, + "time_per_iteration": 2.5048515796661377 + }, + { + "auxiliary_loss_clip": 0.06353101, + "auxiliary_loss_mlp": 0.01254576, + "balance_loss_clip": 0.06271273, + "balance_loss_mlp": 0.01249748, + "epoch": 0.3501578235382534, + "flos": 68371749216000.0, + "grad_norm": 0.6915411290730273, + "language_loss": 0.58945203, + "learning_rate": 3.017710706819298e-06, + "loss": 0.66552877, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.04821777, + "step": 5824, + "time_per_iteration": 3.209726333618164 + }, + { + "auxiliary_loss_clip": 0.06465952, + "auxiliary_loss_mlp": 0.01274281, + "balance_loss_clip": 0.06284555, + "balance_loss_mlp": 0.01258045, + "epoch": 0.3502179467909214, + "flos": 21257153827200.0, + "grad_norm": 3.0621504018438164, + "language_loss": 0.85168576, + "learning_rate": 3.017375418643811e-06, + "loss": 0.92908812, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16223145, + "step": 5825, + "time_per_iteration": 2.513498067855835 + }, + { + "auxiliary_loss_clip": 0.06462917, + "auxiliary_loss_mlp": 0.01268842, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01254275, + "epoch": 0.35027807004358935, + "flos": 11946978817920.0, + "grad_norm": 2.498923152973308, + "language_loss": 0.83643848, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.91375613, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14556885, + "step": 5826, + "time_per_iteration": 3.9313511848449707 + }, + { + "auxiliary_loss_clip": 0.06470059, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284411, + "balance_loss_mlp": 0.01254956, + "epoch": 0.3503381932962573, + "flos": 21477401084160.0, + "grad_norm": 2.100708343809493, + "language_loss": 0.81216669, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88958883, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.17211914, + "step": 5827, + "time_per_iteration": 2.556704044342041 + }, + { + "auxiliary_loss_clip": 0.06462219, + "auxiliary_loss_mlp": 0.01272255, + "balance_loss_clip": 0.06283772, + "balance_loss_mlp": 0.01257473, + "epoch": 0.3503983165489253, + "flos": 21257405389440.0, + "grad_norm": 2.0166313071454858, + "language_loss": 0.71145403, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.78879881, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.14794922, + "step": 5828, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.06467165, + "auxiliary_loss_mlp": 0.01274622, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01257539, + "epoch": 0.35045843980159325, + "flos": 27822644496000.0, + "grad_norm": 1.678964319221545, + "language_loss": 0.79897165, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8763895, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.17077637, + "step": 5829, + "time_per_iteration": 4.086450099945068 + }, + { + "auxiliary_loss_clip": 0.06475446, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 0.06286699, + "balance_loss_mlp": 0.01257988, + "epoch": 0.3505185630542612, + "flos": 25928201934720.0, + "grad_norm": 1.7428196933402165, + "language_loss": 0.72440839, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.80191517, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.17248535, + "step": 5830, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.06461293, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06283247, + "balance_loss_mlp": 0.01259633, + "epoch": 0.35057868630692923, + "flos": 20527999597440.0, + "grad_norm": 2.5118715805025884, + "language_loss": 0.88613749, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.96348894, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14221191, + "step": 5831, + "time_per_iteration": 2.577260732650757 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01258672, + "epoch": 0.3506388095595972, + "flos": 20454849383040.0, + "grad_norm": 2.013142681723478, + "language_loss": 0.78719735, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.86459637, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14868164, + "step": 5832, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.06470136, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06284127, + "balance_loss_mlp": 0.01258536, + "epoch": 0.35069893281226516, + "flos": 23115901749120.0, + "grad_norm": 3.869403317005625, + "language_loss": 0.71628016, + "learning_rate": 3.014691725465008e-06, + "loss": 0.79373109, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.1640625, + "step": 5833, + "time_per_iteration": 2.559213161468506 + }, + { + "auxiliary_loss_clip": 0.06462866, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.01256291, + "epoch": 0.35075905606493313, + "flos": 27279426476160.0, + "grad_norm": 2.081089463640026, + "language_loss": 0.80963689, + "learning_rate": 3.014356090536606e-06, + "loss": 0.88697743, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14892578, + "step": 5834, + "time_per_iteration": 2.6462955474853516 + }, + { + "auxiliary_loss_clip": 0.06469317, + "auxiliary_loss_mlp": 0.0127505, + "balance_loss_clip": 0.06288308, + "balance_loss_mlp": 0.01258634, + "epoch": 0.3508191793176011, + "flos": 19133491622400.0, + "grad_norm": 2.5340357013843566, + "language_loss": 0.84608614, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.92352986, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.1640625, + "step": 5835, + "time_per_iteration": 2.5068061351776123 + }, + { + "auxiliary_loss_clip": 0.06463549, + "auxiliary_loss_mlp": 0.01274357, + "balance_loss_clip": 0.0628426, + "balance_loss_mlp": 0.01259122, + "epoch": 0.35087930257026906, + "flos": 25564798776960.0, + "grad_norm": 1.6798272602016127, + "language_loss": 0.77162683, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.84900588, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15234375, + "step": 5836, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.06462973, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.06285001, + "balance_loss_mlp": 0.01268767, + "epoch": 0.350939425822937, + "flos": 18010061205120.0, + "grad_norm": 1.7914903677000888, + "language_loss": 0.7777887, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.85525942, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15325928, + "step": 5837, + "time_per_iteration": 2.4906866550445557 + }, + { + "auxiliary_loss_clip": 0.06464779, + "auxiliary_loss_mlp": 0.0127724, + "balance_loss_clip": 0.0628402, + "balance_loss_mlp": 0.01261575, + "epoch": 0.350999549075605, + "flos": 22279747455360.0, + "grad_norm": 2.3774474075228995, + "language_loss": 0.68712002, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7645402, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15649414, + "step": 5838, + "time_per_iteration": 2.616330862045288 + }, + { + "auxiliary_loss_clip": 0.06463079, + "auxiliary_loss_mlp": 0.01275242, + "balance_loss_clip": 0.0628327, + "balance_loss_mlp": 0.01259554, + "epoch": 0.35105967232827295, + "flos": 14397511000320.0, + "grad_norm": 2.135026117356547, + "language_loss": 0.83941519, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.91679841, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15686035, + "step": 5839, + "time_per_iteration": 2.475210428237915 + }, + { + "auxiliary_loss_clip": 0.06472797, + "auxiliary_loss_mlp": 0.01274732, + "balance_loss_clip": 0.06285894, + "balance_loss_mlp": 0.01258376, + "epoch": 0.3511197955809409, + "flos": 25089322383360.0, + "grad_norm": 2.313381638226651, + "language_loss": 0.58970249, + "learning_rate": 3.012341473657572e-06, + "loss": 0.6671778, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.5654497146606445 + }, + { + "auxiliary_loss_clip": 0.06465258, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06280696, + "balance_loss_mlp": 0.0126174, + "epoch": 0.3511799188336089, + "flos": 25891123703040.0, + "grad_norm": 2.5798747861510254, + "language_loss": 0.87567091, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.9531014, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.16040039, + "step": 5841, + "time_per_iteration": 2.5275204181671143 + }, + { + "auxiliary_loss_clip": 0.06473795, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.0628502, + "balance_loss_mlp": 0.01261038, + "epoch": 0.35124004208627685, + "flos": 20089852997760.0, + "grad_norm": 1.7442007932185601, + "language_loss": 0.7546367, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.83215564, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.17077637, + "step": 5842, + "time_per_iteration": 2.5876784324645996 + }, + { + "auxiliary_loss_clip": 0.06465417, + "auxiliary_loss_mlp": 0.01280375, + "balance_loss_clip": 0.06280544, + "balance_loss_mlp": 0.01265105, + "epoch": 0.3513001653389448, + "flos": 17788891553280.0, + "grad_norm": 2.704982383226077, + "language_loss": 0.68951106, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.76696897, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15258789, + "step": 5843, + "time_per_iteration": 2.4710304737091064 + }, + { + "auxiliary_loss_clip": 0.06466319, + "auxiliary_loss_mlp": 0.01285229, + "balance_loss_clip": 0.0628369, + "balance_loss_mlp": 0.01268892, + "epoch": 0.3513602885916128, + "flos": 29394745200000.0, + "grad_norm": 2.1140022916881525, + "language_loss": 0.66181982, + "learning_rate": 3.010997627806655e-06, + "loss": 0.7393353, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.16326904, + "step": 5844, + "time_per_iteration": 2.585793972015381 + }, + { + "auxiliary_loss_clip": 0.06472903, + "auxiliary_loss_mlp": 0.01282408, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01265761, + "epoch": 0.3514204118442808, + "flos": 16185372768000.0, + "grad_norm": 2.0590361589883206, + "language_loss": 0.75743866, + "learning_rate": 3.010661570469245e-06, + "loss": 0.83499175, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.1663208, + "step": 5845, + "time_per_iteration": 2.50748348236084 + }, + { + "auxiliary_loss_clip": 0.06463686, + "auxiliary_loss_mlp": 0.01285129, + "balance_loss_clip": 0.06284383, + "balance_loss_mlp": 0.01270102, + "epoch": 0.35148053509694877, + "flos": 23840234369280.0, + "grad_norm": 5.020955850717412, + "language_loss": 0.73988718, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.8173753, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15032959, + "step": 5846, + "time_per_iteration": 2.626898765563965 + }, + { + "auxiliary_loss_clip": 0.06470932, + "auxiliary_loss_mlp": 0.01280544, + "balance_loss_clip": 0.06285631, + "balance_loss_mlp": 0.01265482, + "epoch": 0.35154065834961673, + "flos": 20996809591680.0, + "grad_norm": 1.7410870567887373, + "language_loss": 0.75501883, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.8325336, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1506958, + "step": 5847, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.06472816, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284919, + "balance_loss_mlp": 0.01257316, + "epoch": 0.3516007816022847, + "flos": 33263866206720.0, + "grad_norm": 1.8955744454716683, + "language_loss": 0.72774404, + "learning_rate": 3.009653168561666e-06, + "loss": 0.80519378, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1484375, + "step": 5848, + "time_per_iteration": 2.6645965576171875 + }, + { + "auxiliary_loss_clip": 0.06467354, + "auxiliary_loss_mlp": 0.01280776, + "balance_loss_clip": 0.06280826, + "balance_loss_mlp": 0.01265124, + "epoch": 0.35166090485495266, + "flos": 11731427389440.0, + "grad_norm": 2.1922530808110983, + "language_loss": 0.90064394, + "learning_rate": 3.009316958003178e-06, + "loss": 0.97812521, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15655518, + "step": 5849, + "time_per_iteration": 2.4567575454711914 + }, + { + "auxiliary_loss_clip": 0.06464183, + "auxiliary_loss_mlp": 0.01272929, + "balance_loss_clip": 0.06281896, + "balance_loss_mlp": 0.01257461, + "epoch": 0.3517210281076206, + "flos": 22645121184000.0, + "grad_norm": 2.4964624006606946, + "language_loss": 0.75405449, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.83142555, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15472412, + "step": 5850, + "time_per_iteration": 2.5980029106140137 + }, + { + "auxiliary_loss_clip": 0.06463099, + "auxiliary_loss_mlp": 0.01274678, + "balance_loss_clip": 0.06282984, + "balance_loss_mlp": 0.01259842, + "epoch": 0.3517811513602886, + "flos": 21328836595200.0, + "grad_norm": 2.0250770904548303, + "language_loss": 0.76385641, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.14825439, + "step": 5851, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.06463097, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06281513, + "balance_loss_mlp": 0.01258933, + "epoch": 0.35184127461295656, + "flos": 21039254484480.0, + "grad_norm": 1.95256002439052, + "language_loss": 0.88133335, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.95871449, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.1607666, + "step": 5852, + "time_per_iteration": 2.571439266204834 + }, + { + "auxiliary_loss_clip": 0.06461711, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01258087, + "epoch": 0.3519013978656245, + "flos": 22461784450560.0, + "grad_norm": 2.1690150127965038, + "language_loss": 0.68480182, + "learning_rate": 3.007971733162737e-06, + "loss": 0.76214981, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5853, + "time_per_iteration": 2.5121214389801025 + }, + { + "auxiliary_loss_clip": 0.06466305, + "auxiliary_loss_mlp": 0.0127272, + "balance_loss_clip": 0.06282477, + "balance_loss_mlp": 0.01256972, + "epoch": 0.3519615211182925, + "flos": 13120317141120.0, + "grad_norm": 2.1084516189193403, + "language_loss": 0.81284809, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.89023829, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15734863, + "step": 5854, + "time_per_iteration": 2.644672155380249 + }, + { + "auxiliary_loss_clip": 0.06456967, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01253481, + "epoch": 0.35202164437096045, + "flos": 19141122343680.0, + "grad_norm": 1.5283351736697255, + "language_loss": 0.73366165, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.13970947, + "step": 5855, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.06458069, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01256717, + "epoch": 0.3520817676236284, + "flos": 26549475632640.0, + "grad_norm": 1.8023400431296785, + "language_loss": 0.71055883, + "learning_rate": 3.006962413152691e-06, + "loss": 0.78785008, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14337158, + "step": 5856, + "time_per_iteration": 2.5643463134765625 + }, + { + "auxiliary_loss_clip": 0.064651, + "auxiliary_loss_mlp": 0.01271649, + "balance_loss_clip": 0.062787, + "balance_loss_mlp": 0.01255663, + "epoch": 0.3521418908762964, + "flos": 44903653557120.0, + "grad_norm": 1.9243906825553334, + "language_loss": 0.61456323, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.69193071, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16003418, + "step": 5857, + "time_per_iteration": 2.723026752471924 + }, + { + "auxiliary_loss_clip": 0.06463988, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 0.06281644, + "balance_loss_mlp": 0.01253569, + "epoch": 0.3522020141289644, + "flos": 20192576503680.0, + "grad_norm": 1.9490734994800325, + "language_loss": 0.73682863, + "learning_rate": 3.006289342204152e-06, + "loss": 0.8141619, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15765381, + "step": 5858, + "time_per_iteration": 2.5245583057403564 + }, + { + "auxiliary_loss_clip": 0.0646653, + "auxiliary_loss_mlp": 0.01270245, + "balance_loss_clip": 0.06283493, + "balance_loss_mlp": 0.01255368, + "epoch": 0.35226213738163237, + "flos": 27571398428160.0, + "grad_norm": 1.5191641480211209, + "language_loss": 0.76385832, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.8412261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.14880371, + "step": 5859, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.06474233, + "auxiliary_loss_mlp": 0.01272168, + "balance_loss_clip": 0.06283402, + "balance_loss_mlp": 0.01256862, + "epoch": 0.35232226063430033, + "flos": 22972955483520.0, + "grad_norm": 2.0210321352313305, + "language_loss": 0.72436023, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.80182427, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.15307617, + "step": 5860, + "time_per_iteration": 2.557419776916504 + }, + { + "auxiliary_loss_clip": 0.06468037, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06280215, + "balance_loss_mlp": 0.01257304, + "epoch": 0.3523823838869683, + "flos": 19173714382080.0, + "grad_norm": 2.1675794505809076, + "language_loss": 0.66646308, + "learning_rate": 3.005279449623811e-06, + "loss": 0.74387354, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.15704346, + "step": 5861, + "time_per_iteration": 5.330287218093872 + }, + { + "auxiliary_loss_clip": 0.06464717, + "auxiliary_loss_mlp": 0.01272322, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01257331, + "epoch": 0.35244250713963626, + "flos": 17936743282560.0, + "grad_norm": 1.8073030876467324, + "language_loss": 0.67339319, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.7507636, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.15002441, + "step": 5862, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.06465253, + "auxiliary_loss_mlp": 0.01277428, + "balance_loss_clip": 0.06279148, + "balance_loss_mlp": 0.01260775, + "epoch": 0.35250263039230423, + "flos": 21438687697920.0, + "grad_norm": 2.06594301339393, + "language_loss": 0.76956195, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.8469888, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16650391, + "step": 5863, + "time_per_iteration": 2.5614800453186035 + }, + { + "auxiliary_loss_clip": 0.06466909, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 0.06283094, + "balance_loss_mlp": 0.01255846, + "epoch": 0.3525627536449722, + "flos": 27424133677440.0, + "grad_norm": 1.7204880099735786, + "language_loss": 0.75455201, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83192563, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.14605713, + "step": 5864, + "time_per_iteration": 2.590428113937378 + }, + { + "auxiliary_loss_clip": 0.06465425, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06283714, + "balance_loss_mlp": 0.01257306, + "epoch": 0.35262287689764016, + "flos": 24796637671680.0, + "grad_norm": 2.274548371802061, + "language_loss": 0.79325253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.87062526, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14550781, + "step": 5865, + "time_per_iteration": 4.090251922607422 + }, + { + "auxiliary_loss_clip": 0.06479216, + "auxiliary_loss_mlp": 0.01273849, + "balance_loss_clip": 0.06290671, + "balance_loss_mlp": 0.01257935, + "epoch": 0.3526830001503081, + "flos": 17827353377280.0, + "grad_norm": 3.6346687905375155, + "language_loss": 0.81561065, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.89314139, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15917969, + "step": 5866, + "time_per_iteration": 2.5417611598968506 + }, + { + "auxiliary_loss_clip": 0.06481875, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06287797, + "balance_loss_mlp": 0.01255226, + "epoch": 0.3527431234029761, + "flos": 18084091887360.0, + "grad_norm": 2.1275369997353692, + "language_loss": 0.84947896, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9270227, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17272949, + "step": 5867, + "time_per_iteration": 2.487138509750366 + }, + { + "auxiliary_loss_clip": 0.06472977, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 0.06285943, + "balance_loss_mlp": 0.01254431, + "epoch": 0.35280324665564405, + "flos": 19433429712000.0, + "grad_norm": 2.157782607866355, + "language_loss": 0.74828005, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82571352, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15942383, + "step": 5868, + "time_per_iteration": 2.536522150039673 + }, + { + "auxiliary_loss_clip": 0.06471637, + "auxiliary_loss_mlp": 0.01277122, + "balance_loss_clip": 0.06284134, + "balance_loss_mlp": 0.01260277, + "epoch": 0.352863369908312, + "flos": 21509951195520.0, + "grad_norm": 2.023756469283546, + "language_loss": 0.6153, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.69278765, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16845703, + "step": 5869, + "time_per_iteration": 3.977250099182129 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06282457, + "balance_loss_mlp": 0.01259985, + "epoch": 0.35292349316098, + "flos": 22316029073280.0, + "grad_norm": 3.8155591266042173, + "language_loss": 0.75253737, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.1541748, + "step": 5870, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.06466261, + "auxiliary_loss_mlp": 0.01271259, + "balance_loss_clip": 0.06282211, + "balance_loss_mlp": 0.01255964, + "epoch": 0.352983616413648, + "flos": 33118152756480.0, + "grad_norm": 1.8217533687724534, + "language_loss": 0.72204906, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79942429, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.1529541, + "step": 5871, + "time_per_iteration": 2.660351037979126 + }, + { + "auxiliary_loss_clip": 0.06463222, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.0628562, + "balance_loss_mlp": 0.01257389, + "epoch": 0.35304373966631597, + "flos": 18702388765440.0, + "grad_norm": 1.8432981727531608, + "language_loss": 0.73899144, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.81633162, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.13415527, + "step": 5872, + "time_per_iteration": 2.501868724822998 + }, + { + "auxiliary_loss_clip": 0.06467956, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255163, + "epoch": 0.35310386291898394, + "flos": 23371214739840.0, + "grad_norm": 1.6596154000518588, + "language_loss": 0.83059716, + "learning_rate": 3.001236451924089e-06, + "loss": 0.90797222, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.14404297, + "step": 5873, + "time_per_iteration": 2.6044130325317383 + }, + { + "auxiliary_loss_clip": 0.06475792, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 0.06285458, + "balance_loss_mlp": 0.0125879, + "epoch": 0.3531639861716519, + "flos": 24468803372160.0, + "grad_norm": 2.6977932070351183, + "language_loss": 0.65726781, + "learning_rate": 3.000899288359104e-06, + "loss": 0.73477674, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16308594, + "step": 5874, + "time_per_iteration": 2.558915138244629 + }, + { + "auxiliary_loss_clip": 0.06370112, + "auxiliary_loss_mlp": 0.01273024, + "balance_loss_clip": 0.06287491, + "balance_loss_mlp": 0.01268941, + "epoch": 0.35322410942431987, + "flos": 70331040437760.0, + "grad_norm": 0.7490717453474699, + "language_loss": 0.616135, + "learning_rate": 3.000562086839917e-06, + "loss": 0.69256639, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.04083252, + "step": 5875, + "time_per_iteration": 3.1286721229553223 + }, + { + "auxiliary_loss_clip": 0.06475496, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.06289661, + "balance_loss_mlp": 0.01262086, + "epoch": 0.35328423267698783, + "flos": 19825735328640.0, + "grad_norm": 2.073373185113386, + "language_loss": 0.8042345, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.88176548, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15509033, + "step": 5876, + "time_per_iteration": 2.5174875259399414 + }, + { + "auxiliary_loss_clip": 0.063563, + "auxiliary_loss_mlp": 0.01261292, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.01257364, + "epoch": 0.3533443559296558, + "flos": 60843398480640.0, + "grad_norm": 0.6578323239794136, + "language_loss": 0.56720114, + "learning_rate": 2.999887569990088e-06, + "loss": 0.64337707, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.03924561, + "step": 5877, + "time_per_iteration": 3.239800214767456 + }, + { + "auxiliary_loss_clip": 0.0647119, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06286252, + "balance_loss_mlp": 0.01259301, + "epoch": 0.35340447918232376, + "flos": 24762997457280.0, + "grad_norm": 1.7728898292153, + "language_loss": 0.72425848, + "learning_rate": 2.999550254685024e-06, + "loss": 0.80172646, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16308594, + "step": 5878, + "time_per_iteration": 2.576354742050171 + }, + { + "auxiliary_loss_clip": 0.06470102, + "auxiliary_loss_mlp": 0.01272441, + "balance_loss_clip": 0.06286008, + "balance_loss_mlp": 0.01256789, + "epoch": 0.3534646024349917, + "flos": 21802342417920.0, + "grad_norm": 2.4353464978664494, + "language_loss": 0.78682542, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.86425084, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.15649414, + "step": 5879, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.06481053, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 0.0628894, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3535247256876597, + "flos": 20018463719040.0, + "grad_norm": 2.0590866059314035, + "language_loss": 0.63551295, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.71304053, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.17980957, + "step": 5880, + "time_per_iteration": 2.5576937198638916 + }, + { + "auxiliary_loss_clip": 0.06481048, + "auxiliary_loss_mlp": 0.01274855, + "balance_loss_clip": 0.06292346, + "balance_loss_mlp": 0.01258035, + "epoch": 0.35358484894032766, + "flos": 18193984917120.0, + "grad_norm": 2.6506562916801273, + "language_loss": 0.66346908, + "learning_rate": 2.998538081402727e-06, + "loss": 0.74102807, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16821289, + "step": 5881, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06465093, + "auxiliary_loss_mlp": 0.01272514, + "balance_loss_clip": 0.06285467, + "balance_loss_mlp": 0.0125818, + "epoch": 0.3536449721929956, + "flos": 22826990471040.0, + "grad_norm": 1.7415962616346485, + "language_loss": 0.75838578, + "learning_rate": 2.998200614562239e-06, + "loss": 0.8357619, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14337158, + "step": 5882, + "time_per_iteration": 2.546163558959961 + }, + { + "auxiliary_loss_clip": 0.06472618, + "auxiliary_loss_mlp": 0.01271877, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01256189, + "epoch": 0.3537050954456636, + "flos": 26439540675840.0, + "grad_norm": 2.210270342508568, + "language_loss": 0.70790988, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78535485, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.15692139, + "step": 5883, + "time_per_iteration": 2.5813896656036377 + }, + { + "auxiliary_loss_clip": 0.06481725, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01258105, + "epoch": 0.3537652186983316, + "flos": 17202096610560.0, + "grad_norm": 3.5308447991949348, + "language_loss": 0.7912811, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.86884505, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.16564941, + "step": 5884, + "time_per_iteration": 2.564178228378296 + }, + { + "auxiliary_loss_clip": 0.06469014, + "auxiliary_loss_mlp": 0.01273424, + "balance_loss_clip": 0.06285414, + "balance_loss_mlp": 0.01258142, + "epoch": 0.3538253419509996, + "flos": 19542861544320.0, + "grad_norm": 3.0890260502514173, + "language_loss": 0.76079619, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83822054, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15283203, + "step": 5885, + "time_per_iteration": 2.5860350131988525 + }, + { + "auxiliary_loss_clip": 0.06473316, + "auxiliary_loss_mlp": 0.01274145, + "balance_loss_clip": 0.06285691, + "balance_loss_mlp": 0.01257766, + "epoch": 0.35388546520366754, + "flos": 12133166590080.0, + "grad_norm": 4.983567417880078, + "language_loss": 0.83563066, + "learning_rate": 2.996850368809606e-06, + "loss": 0.91310525, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16357422, + "step": 5886, + "time_per_iteration": 2.549227714538574 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.0125851, + "epoch": 0.3539455884563355, + "flos": 19683501822720.0, + "grad_norm": 3.219387216821374, + "language_loss": 0.78429639, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.86168945, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16088867, + "step": 5887, + "time_per_iteration": 2.523743152618408 + }, + { + "auxiliary_loss_clip": 0.0646676, + "auxiliary_loss_mlp": 0.0127383, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01258631, + "epoch": 0.35400571170900347, + "flos": 18077006217600.0, + "grad_norm": 1.8956957640615841, + "language_loss": 0.66116667, + "learning_rate": 2.996175019078089e-06, + "loss": 0.7385726, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15197754, + "step": 5888, + "time_per_iteration": 2.5279300212860107 + }, + { + "auxiliary_loss_clip": 0.06467725, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01256185, + "epoch": 0.35406583496167143, + "flos": 26075298977280.0, + "grad_norm": 2.3097601077816443, + "language_loss": 0.76721621, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.84461069, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15527344, + "step": 5889, + "time_per_iteration": 2.564761161804199 + }, + { + "auxiliary_loss_clip": 0.06465457, + "auxiliary_loss_mlp": 0.01270164, + "balance_loss_clip": 0.06283142, + "balance_loss_mlp": 0.01254357, + "epoch": 0.3541259582143394, + "flos": 19798635732480.0, + "grad_norm": 2.1640548649274116, + "language_loss": 0.81408846, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.89144462, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15808105, + "step": 5890, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.06466024, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06285816, + "balance_loss_mlp": 0.01256094, + "epoch": 0.35418608146700736, + "flos": 24028518493440.0, + "grad_norm": 1.6495661544524922, + "language_loss": 0.80017459, + "learning_rate": 2.99516171119991e-06, + "loss": 0.87753654, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.14074707, + "step": 5891, + "time_per_iteration": 2.553158760070801 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01282427, + "balance_loss_clip": 0.06289162, + "balance_loss_mlp": 0.01265928, + "epoch": 0.35424620471967533, + "flos": 12390701713920.0, + "grad_norm": 1.7694155250203176, + "language_loss": 0.73450041, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81204116, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16516113, + "step": 5892, + "time_per_iteration": 2.529136896133423 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01274401, + "balance_loss_clip": 0.06286078, + "balance_loss_mlp": 0.01259059, + "epoch": 0.3543063279723433, + "flos": 19678219015680.0, + "grad_norm": 3.019670501918518, + "language_loss": 0.67408991, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.75154132, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15344238, + "step": 5893, + "time_per_iteration": 2.507456064224243 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.01274247, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01257545, + "epoch": 0.35436645122501126, + "flos": 21915841173120.0, + "grad_norm": 1.8801549379271045, + "language_loss": 0.70079887, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.77824062, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16711426, + "step": 5894, + "time_per_iteration": 2.5596466064453125 + }, + { + "auxiliary_loss_clip": 0.0646911, + "auxiliary_loss_mlp": 0.0127714, + "balance_loss_clip": 0.06291118, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3544265744776792, + "flos": 21724915645440.0, + "grad_norm": 1.8040348457355686, + "language_loss": 0.74516678, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.82262927, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14569092, + "step": 5895, + "time_per_iteration": 2.602884531021118 + }, + { + "auxiliary_loss_clip": 0.06476314, + "auxiliary_loss_mlp": 0.01274747, + "balance_loss_clip": 0.06292941, + "balance_loss_mlp": 0.01259643, + "epoch": 0.3544866977303472, + "flos": 21219278981760.0, + "grad_norm": 1.7647167527567422, + "language_loss": 0.83600783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.91351843, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.15100098, + "step": 5896, + "time_per_iteration": 2.5642035007476807 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01278933, + "balance_loss_clip": 0.06292751, + "balance_loss_mlp": 0.01261576, + "epoch": 0.35454682098301515, + "flos": 29318534311680.0, + "grad_norm": 1.8515152904238923, + "language_loss": 0.70294917, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7804631, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.17346191, + "step": 5897, + "time_per_iteration": 2.613032341003418 + }, + { + "auxiliary_loss_clip": 0.06475735, + "auxiliary_loss_mlp": 0.01274261, + "balance_loss_clip": 0.06293957, + "balance_loss_mlp": 0.01259205, + "epoch": 0.3546069442356832, + "flos": 24323509192320.0, + "grad_norm": 1.6960731630978507, + "language_loss": 0.81964374, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89714372, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15063477, + "step": 5898, + "time_per_iteration": 2.6033098697662354 + }, + { + "auxiliary_loss_clip": 0.06471986, + "auxiliary_loss_mlp": 0.01279895, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.01265173, + "epoch": 0.35466706748835114, + "flos": 22863984848640.0, + "grad_norm": 1.4933011631381068, + "language_loss": 0.74405515, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.82157397, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14727783, + "step": 5899, + "time_per_iteration": 2.5492894649505615 + }, + { + "auxiliary_loss_clip": 0.0647797, + "auxiliary_loss_mlp": 0.01272872, + "balance_loss_clip": 0.06294148, + "balance_loss_mlp": 0.01257196, + "epoch": 0.3547271907410191, + "flos": 28337714743680.0, + "grad_norm": 3.4583325446366673, + "language_loss": 0.80211669, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.87962508, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15698242, + "step": 5900, + "time_per_iteration": 5.435121774673462 + }, + { + "auxiliary_loss_clip": 0.06478105, + "auxiliary_loss_mlp": 0.01279951, + "balance_loss_clip": 0.06296446, + "balance_loss_mlp": 0.01263607, + "epoch": 0.35478731399368707, + "flos": 23520911258880.0, + "grad_norm": 2.0942596894242533, + "language_loss": 0.8216058, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89918637, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16357422, + "step": 5901, + "time_per_iteration": 2.603769540786743 + }, + { + "auxiliary_loss_clip": 0.06480999, + "auxiliary_loss_mlp": 0.01277169, + "balance_loss_clip": 0.06295676, + "balance_loss_mlp": 0.01261899, + "epoch": 0.35484743724635504, + "flos": 18630202872960.0, + "grad_norm": 2.2545917554681663, + "language_loss": 0.75979805, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.83737969, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.152771, + "step": 5902, + "time_per_iteration": 2.5356359481811523 + }, + { + "auxiliary_loss_clip": 0.06482422, + "auxiliary_loss_mlp": 0.01280542, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265331, + "epoch": 0.354907560499023, + "flos": 17390296880640.0, + "grad_norm": 1.6908684001073404, + "language_loss": 0.70729327, + "learning_rate": 2.991105086850381e-06, + "loss": 0.78492296, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15209961, + "step": 5903, + "time_per_iteration": 2.52494478225708 + }, + { + "auxiliary_loss_clip": 0.06482972, + "auxiliary_loss_mlp": 0.01276075, + "balance_loss_clip": 0.06297173, + "balance_loss_mlp": 0.0125929, + "epoch": 0.35496768375169097, + "flos": 19214607974400.0, + "grad_norm": 2.9744492269587153, + "language_loss": 0.75001359, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.82760406, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16784668, + "step": 5904, + "time_per_iteration": 2.5316994190216064 + }, + { + "auxiliary_loss_clip": 0.0648163, + "auxiliary_loss_mlp": 0.01277137, + "balance_loss_clip": 0.06297497, + "balance_loss_mlp": 0.01261902, + "epoch": 0.35502780700435893, + "flos": 18338692118400.0, + "grad_norm": 2.2144866791488536, + "language_loss": 0.78981996, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.86740756, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15246582, + "step": 5905, + "time_per_iteration": 3.9867374897003174 + }, + { + "auxiliary_loss_clip": 0.06472038, + "auxiliary_loss_mlp": 0.01276232, + "balance_loss_clip": 0.06301226, + "balance_loss_mlp": 0.01262249, + "epoch": 0.3550879302570269, + "flos": 15453660988800.0, + "grad_norm": 1.8340819850757704, + "language_loss": 0.72531646, + "learning_rate": 2.990090084284356e-06, + "loss": 0.80279917, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13977051, + "step": 5906, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.06491787, + "auxiliary_loss_mlp": 0.01272032, + "balance_loss_clip": 0.06306198, + "balance_loss_mlp": 0.01256046, + "epoch": 0.35514805350969486, + "flos": 21985343735040.0, + "grad_norm": 1.9483914182465616, + "language_loss": 0.75052631, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.82816458, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15991211, + "step": 5907, + "time_per_iteration": 2.526137113571167 + }, + { + "auxiliary_loss_clip": 0.06486456, + "auxiliary_loss_mlp": 0.01280245, + "balance_loss_clip": 0.06305459, + "balance_loss_mlp": 0.01264271, + "epoch": 0.3552081767623628, + "flos": 29869718469120.0, + "grad_norm": 2.2786495725258424, + "language_loss": 0.76563632, + "learning_rate": 2.989413228164047e-06, + "loss": 0.84330332, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15966797, + "step": 5908, + "time_per_iteration": 4.063998222351074 + }, + { + "auxiliary_loss_clip": 0.06491728, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06310974, + "balance_loss_mlp": 0.0126146, + "epoch": 0.3552683000150308, + "flos": 26439456821760.0, + "grad_norm": 2.352503484530038, + "language_loss": 0.68572766, + "learning_rate": 2.989074743819502e-06, + "loss": 0.76341379, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15429688, + "step": 5909, + "time_per_iteration": 2.6902143955230713 + }, + { + "auxiliary_loss_clip": 0.0648414, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06310885, + "balance_loss_mlp": 0.01268061, + "epoch": 0.35532842326769876, + "flos": 19791088865280.0, + "grad_norm": 1.9680680199916993, + "language_loss": 0.79103023, + "learning_rate": 2.988736221969144e-06, + "loss": 0.86869311, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14086914, + "step": 5910, + "time_per_iteration": 2.535050630569458 + }, + { + "auxiliary_loss_clip": 0.06495271, + "auxiliary_loss_mlp": 0.01274944, + "balance_loss_clip": 0.06310071, + "balance_loss_mlp": 0.0125841, + "epoch": 0.3553885465203668, + "flos": 17245170408960.0, + "grad_norm": 1.607302447744311, + "language_loss": 0.7130779, + "learning_rate": 2.98839766262581e-06, + "loss": 0.79078007, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1652832, + "step": 5911, + "time_per_iteration": 2.572942018508911 + }, + { + "auxiliary_loss_clip": 0.06485709, + "auxiliary_loss_mlp": 0.01272785, + "balance_loss_clip": 0.06309631, + "balance_loss_mlp": 0.01258313, + "epoch": 0.35544866977303474, + "flos": 14938800376320.0, + "grad_norm": 2.1423891041027514, + "language_loss": 0.87973344, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.95731837, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14477539, + "step": 5912, + "time_per_iteration": 2.4826059341430664 + }, + { + "auxiliary_loss_clip": 0.0648666, + "auxiliary_loss_mlp": 0.01278679, + "balance_loss_clip": 0.0630875, + "balance_loss_mlp": 0.0126441, + "epoch": 0.3555087930257027, + "flos": 19762228333440.0, + "grad_norm": 2.0928412919366477, + "language_loss": 0.77506435, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.8527177, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14251709, + "step": 5913, + "time_per_iteration": 2.577362060546875 + }, + { + "auxiliary_loss_clip": 0.06486008, + "auxiliary_loss_mlp": 0.01273445, + "balance_loss_clip": 0.06311025, + "balance_loss_mlp": 0.01258789, + "epoch": 0.3555689162783707, + "flos": 21074445999360.0, + "grad_norm": 5.920108951080063, + "language_loss": 0.82525283, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.90284735, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14660645, + "step": 5914, + "time_per_iteration": 2.521756649017334 + }, + { + "auxiliary_loss_clip": 0.06490604, + "auxiliary_loss_mlp": 0.01268632, + "balance_loss_clip": 0.06310836, + "balance_loss_mlp": 0.01254118, + "epoch": 0.35562903953103864, + "flos": 33077426872320.0, + "grad_norm": 3.2692214801304686, + "language_loss": 0.7113682, + "learning_rate": 2.98704305057949e-06, + "loss": 0.78896052, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14508057, + "step": 5915, + "time_per_iteration": 2.6931562423706055 + }, + { + "auxiliary_loss_clip": 0.06477264, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06297429, + "balance_loss_mlp": 0.01254814, + "epoch": 0.3556891627837066, + "flos": 20564029653120.0, + "grad_norm": 4.458093980019367, + "language_loss": 0.76718718, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84465492, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14697266, + "step": 5916, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.06487325, + "auxiliary_loss_mlp": 0.01272059, + "balance_loss_clip": 0.06307879, + "balance_loss_mlp": 0.01256651, + "epoch": 0.35574928603637457, + "flos": 20709449614080.0, + "grad_norm": 1.674174142445476, + "language_loss": 0.88208687, + "learning_rate": 2.986365519932332e-06, + "loss": 0.95968074, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.1539917, + "step": 5917, + "time_per_iteration": 2.6043195724487305 + }, + { + "auxiliary_loss_clip": 0.0649041, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.0631107, + "balance_loss_mlp": 0.01254289, + "epoch": 0.35580940928904253, + "flos": 15199899298560.0, + "grad_norm": 3.6980401889874086, + "language_loss": 0.75538862, + "learning_rate": 2.98602669849771e-06, + "loss": 0.83299077, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15515137, + "step": 5918, + "time_per_iteration": 2.5186190605163574 + }, + { + "auxiliary_loss_clip": 0.06461592, + "auxiliary_loss_mlp": 0.01285001, + "balance_loss_clip": 0.06381316, + "balance_loss_mlp": 0.01279086, + "epoch": 0.3558695325417105, + "flos": 58656145426560.0, + "grad_norm": 0.8458689331650495, + "language_loss": 0.63255095, + "learning_rate": 2.985687839672857e-06, + "loss": 0.71001691, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.05911255, + "step": 5919, + "time_per_iteration": 2.9552297592163086 + }, + { + "auxiliary_loss_clip": 0.06485933, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 0.06302524, + "balance_loss_mlp": 0.01255998, + "epoch": 0.35592965579437846, + "flos": 22024811808000.0, + "grad_norm": 2.2679396062128188, + "language_loss": 0.74402696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.82160461, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.1583252, + "step": 5920, + "time_per_iteration": 2.54848313331604 + }, + { + "auxiliary_loss_clip": 0.06483243, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01262638, + "epoch": 0.35598977904704643, + "flos": 23374401194880.0, + "grad_norm": 3.1552684799501733, + "language_loss": 0.77735227, + "learning_rate": 2.985010009903857e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15332031, + "step": 5921, + "time_per_iteration": 2.6517810821533203 + }, + { + "auxiliary_loss_clip": 0.06490617, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 0.06309058, + "balance_loss_mlp": 0.01261329, + "epoch": 0.3560499022997144, + "flos": 17791113686400.0, + "grad_norm": 2.349487021583332, + "language_loss": 0.6770314, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.75470436, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15332031, + "step": 5922, + "time_per_iteration": 2.525566577911377 + }, + { + "auxiliary_loss_clip": 0.06484485, + "auxiliary_loss_mlp": 0.0127389, + "balance_loss_clip": 0.06306913, + "balance_loss_mlp": 0.01258524, + "epoch": 0.35611002555238236, + "flos": 20746695553920.0, + "grad_norm": 2.231194122260979, + "language_loss": 0.79304701, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.87063074, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15380859, + "step": 5923, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.06479051, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 0.06301268, + "balance_loss_mlp": 0.01257579, + "epoch": 0.3561701488050504, + "flos": 19468034248320.0, + "grad_norm": 1.61778925366919, + "language_loss": 0.8543126, + "learning_rate": 2.983992985144908e-06, + "loss": 0.93183035, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15148926, + "step": 5924, + "time_per_iteration": 2.524949312210083 + }, + { + "auxiliary_loss_clip": 0.06478724, + "auxiliary_loss_mlp": 0.01271843, + "balance_loss_clip": 0.06301951, + "balance_loss_mlp": 0.01255797, + "epoch": 0.35623027205771834, + "flos": 30783006046080.0, + "grad_norm": 1.9504196686726267, + "language_loss": 0.77609557, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.85360122, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.16033936, + "step": 5925, + "time_per_iteration": 2.6268069744110107 + }, + { + "auxiliary_loss_clip": 0.06472521, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06292735, + "balance_loss_mlp": 0.01258291, + "epoch": 0.3562903953103863, + "flos": 16986461328000.0, + "grad_norm": 1.8072288436418724, + "language_loss": 0.76488966, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.15441895, + "step": 5926, + "time_per_iteration": 2.492009401321411 + }, + { + "auxiliary_loss_clip": 0.064781, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 0.06293385, + "balance_loss_mlp": 0.01255478, + "epoch": 0.3563505185630543, + "flos": 23846271863040.0, + "grad_norm": 2.038892178711472, + "language_loss": 0.69665909, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.77415526, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16046143, + "step": 5927, + "time_per_iteration": 2.555192708969116 + }, + { + "auxiliary_loss_clip": 0.06471409, + "auxiliary_loss_mlp": 0.01273845, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.0125889, + "epoch": 0.35641064181572224, + "flos": 22280040944640.0, + "grad_norm": 1.7768317666214009, + "language_loss": 0.79454333, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.87199581, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.1494751, + "step": 5928, + "time_per_iteration": 2.5192928314208984 + }, + { + "auxiliary_loss_clip": 0.06473258, + "auxiliary_loss_mlp": 0.01271381, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3564707650683902, + "flos": 23007643873920.0, + "grad_norm": 1.230692465633979, + "language_loss": 0.8197661, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89721251, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1494751, + "step": 5929, + "time_per_iteration": 2.6044368743896484 + }, + { + "auxiliary_loss_clip": 0.0646459, + "auxiliary_loss_mlp": 0.01268428, + "balance_loss_clip": 0.06289564, + "balance_loss_mlp": 0.01253765, + "epoch": 0.35653088832105817, + "flos": 14689566806400.0, + "grad_norm": 1.5209281639747478, + "language_loss": 0.70385516, + "learning_rate": 2.981957928520201e-06, + "loss": 0.78118533, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14685059, + "step": 5930, + "time_per_iteration": 2.498253107070923 + }, + { + "auxiliary_loss_clip": 0.06473252, + "auxiliary_loss_mlp": 0.01273096, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01256943, + "epoch": 0.35659101157372614, + "flos": 23483791100160.0, + "grad_norm": 2.174064041384607, + "language_loss": 0.68760598, + "learning_rate": 2.981618622015244e-06, + "loss": 0.76506943, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16162109, + "step": 5931, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.06463969, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06288578, + "balance_loss_mlp": 0.01253788, + "epoch": 0.3566511348263941, + "flos": 26585966885760.0, + "grad_norm": 1.5444695234240167, + "language_loss": 0.68331707, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76064122, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06465189, + "auxiliary_loss_mlp": 0.01272147, + "balance_loss_clip": 0.06290227, + "balance_loss_mlp": 0.01257854, + "epoch": 0.35671125807906207, + "flos": 13119981724800.0, + "grad_norm": 2.4744838507658917, + "language_loss": 0.79635656, + "learning_rate": 2.980939897348969e-06, + "loss": 0.87372994, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14294434, + "step": 5933, + "time_per_iteration": 2.573812961578369 + }, + { + "auxiliary_loss_clip": 0.06470121, + "auxiliary_loss_mlp": 0.01270309, + "balance_loss_clip": 0.06288668, + "balance_loss_mlp": 0.01255372, + "epoch": 0.35677138133173003, + "flos": 33009014413440.0, + "grad_norm": 1.4096936090904761, + "language_loss": 0.69970256, + "learning_rate": 2.980600479213388e-06, + "loss": 0.77710688, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14929199, + "step": 5934, + "time_per_iteration": 2.6381173133850098 + }, + { + "auxiliary_loss_clip": 0.06481285, + "auxiliary_loss_mlp": 0.01277705, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01260741, + "epoch": 0.356831504584398, + "flos": 20784234983040.0, + "grad_norm": 2.103415594097178, + "language_loss": 0.72006869, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.79765862, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16967773, + "step": 5935, + "time_per_iteration": 2.620471954345703 + }, + { + "auxiliary_loss_clip": 0.06467808, + "auxiliary_loss_mlp": 0.01275583, + "balance_loss_clip": 0.06287988, + "balance_loss_mlp": 0.01261004, + "epoch": 0.35689162783706596, + "flos": 12170244821760.0, + "grad_norm": 2.011082803426264, + "language_loss": 0.78423738, + "learning_rate": 2.979921531401692e-06, + "loss": 0.86167133, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14569092, + "step": 5936, + "time_per_iteration": 2.4827091693878174 + }, + { + "auxiliary_loss_clip": 0.06466486, + "auxiliary_loss_mlp": 0.01273239, + "balance_loss_clip": 0.06289199, + "balance_loss_mlp": 0.01258147, + "epoch": 0.356951751089734, + "flos": 23848200506880.0, + "grad_norm": 1.8250890312079233, + "language_loss": 0.64893055, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.72632784, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15100098, + "step": 5937, + "time_per_iteration": 2.5968148708343506 + }, + { + "auxiliary_loss_clip": 0.06470716, + "auxiliary_loss_mlp": 0.01277052, + "balance_loss_clip": 0.06291182, + "balance_loss_mlp": 0.01261644, + "epoch": 0.35701187434240195, + "flos": 11725851093120.0, + "grad_norm": 3.2825373138133633, + "language_loss": 0.79029787, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.86777556, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15429688, + "step": 5938, + "time_per_iteration": 2.4724228382110596 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.01275118, + "balance_loss_clip": 0.06289655, + "balance_loss_mlp": 0.01259835, + "epoch": 0.3570719975950699, + "flos": 24905650233600.0, + "grad_norm": 2.3707612213619624, + "language_loss": 0.80684471, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88429582, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 5939, + "time_per_iteration": 4.067660331726074 + }, + { + "auxiliary_loss_clip": 0.06474897, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.01263357, + "epoch": 0.3571321208477379, + "flos": 26002022981760.0, + "grad_norm": 1.7209958005115653, + "language_loss": 0.79509544, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.8726303, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15228271, + "step": 5940, + "time_per_iteration": 3.961956262588501 + }, + { + "auxiliary_loss_clip": 0.06472583, + "auxiliary_loss_mlp": 0.01274024, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01258223, + "epoch": 0.35719224410040584, + "flos": 14506900905600.0, + "grad_norm": 2.455654522420387, + "language_loss": 0.72918689, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.80665296, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15802002, + "step": 5941, + "time_per_iteration": 2.529376745223999 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06291723, + "balance_loss_mlp": 0.01258577, + "epoch": 0.3572523673530738, + "flos": 31183445508480.0, + "grad_norm": 1.9522398224767823, + "language_loss": 0.64961332, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.72705185, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15667725, + "step": 5942, + "time_per_iteration": 2.6694955825805664 + }, + { + "auxiliary_loss_clip": 0.06470639, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06291504, + "balance_loss_mlp": 0.01257124, + "epoch": 0.3573124906057418, + "flos": 15857496541440.0, + "grad_norm": 1.9232266262089555, + "language_loss": 0.7463761, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.82381314, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.1595459, + "step": 5943, + "time_per_iteration": 2.5988807678222656 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01259877, + "balance_loss_clip": 0.06336363, + "balance_loss_mlp": 0.01254631, + "epoch": 0.35737261385840974, + "flos": 60839163849600.0, + "grad_norm": 0.8122274991603828, + "language_loss": 0.60684133, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.68360829, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.05249023, + "step": 5944, + "time_per_iteration": 3.2639529705047607 + }, + { + "auxiliary_loss_clip": 0.06467592, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06290887, + "balance_loss_mlp": 0.01259464, + "epoch": 0.3574327371110777, + "flos": 18849779297280.0, + "grad_norm": 1.8477550360079977, + "language_loss": 0.7280755, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80549395, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14782715, + "step": 5945, + "time_per_iteration": 3.974971294403076 + }, + { + "auxiliary_loss_clip": 0.06464474, + "auxiliary_loss_mlp": 0.01274521, + "balance_loss_clip": 0.06288721, + "balance_loss_mlp": 0.01259619, + "epoch": 0.35749286036374567, + "flos": 23556354336000.0, + "grad_norm": 1.6530257311602492, + "language_loss": 0.8152287, + "learning_rate": 2.976524564880326e-06, + "loss": 0.89261866, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14892578, + "step": 5946, + "time_per_iteration": 2.567702531814575 + }, + { + "auxiliary_loss_clip": 0.06472433, + "auxiliary_loss_mlp": 0.01275229, + "balance_loss_clip": 0.06292298, + "balance_loss_mlp": 0.01260036, + "epoch": 0.35755298361641363, + "flos": 21111817720320.0, + "grad_norm": 1.4004407917222146, + "language_loss": 0.69023073, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76770723, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.15209961, + "step": 5947, + "time_per_iteration": 2.531938076019287 + }, + { + "auxiliary_loss_clip": 0.06458312, + "auxiliary_loss_mlp": 0.01270008, + "balance_loss_clip": 0.06284653, + "balance_loss_mlp": 0.01256109, + "epoch": 0.3576131068690816, + "flos": 19251099227520.0, + "grad_norm": 2.059659188145791, + "language_loss": 0.75891036, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83619356, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13916016, + "step": 5948, + "time_per_iteration": 3.9236361980438232 + }, + { + "auxiliary_loss_clip": 0.06466205, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 0.06287337, + "balance_loss_mlp": 0.01261462, + "epoch": 0.35767323012174956, + "flos": 28661733682560.0, + "grad_norm": 1.6908098548641093, + "language_loss": 0.71228039, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.78970701, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15002441, + "step": 5949, + "time_per_iteration": 2.56809663772583 + }, + { + "auxiliary_loss_clip": 0.06464282, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06286816, + "balance_loss_mlp": 0.01266995, + "epoch": 0.35773335337441753, + "flos": 17089897593600.0, + "grad_norm": 1.7763817610233048, + "language_loss": 0.77821207, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.85567343, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1484375, + "step": 5950, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.06465182, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.0628643, + "balance_loss_mlp": 0.01261331, + "epoch": 0.35779347662708555, + "flos": 15894155502720.0, + "grad_norm": 2.1549260339424725, + "language_loss": 0.73109937, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.80851334, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14892578, + "step": 5951, + "time_per_iteration": 2.5201168060302734 + }, + { + "auxiliary_loss_clip": 0.06470691, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 0.06287189, + "balance_loss_mlp": 0.01262181, + "epoch": 0.3578535998797535, + "flos": 28666555292160.0, + "grad_norm": 1.9784791605149854, + "language_loss": 0.7026071, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.78009284, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15698242, + "step": 5952, + "time_per_iteration": 2.5931434631347656 + }, + { + "auxiliary_loss_clip": 0.0646029, + "auxiliary_loss_mlp": 0.01277333, + "balance_loss_clip": 0.06284408, + "balance_loss_mlp": 0.01263069, + "epoch": 0.3579137231324215, + "flos": 37861554464640.0, + "grad_norm": 1.6267089711440414, + "language_loss": 0.69578886, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77316511, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14276123, + "step": 5953, + "time_per_iteration": 2.668464422225952 + }, + { + "auxiliary_loss_clip": 0.0645823, + "auxiliary_loss_mlp": 0.01275685, + "balance_loss_clip": 0.06282876, + "balance_loss_mlp": 0.01261117, + "epoch": 0.35797384638508944, + "flos": 22353526575360.0, + "grad_norm": 1.5719996722989455, + "language_loss": 0.67333478, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.75067389, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14562988, + "step": 5954, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.06459846, + "auxiliary_loss_mlp": 0.01278708, + "balance_loss_clip": 0.06287006, + "balance_loss_mlp": 0.0126414, + "epoch": 0.3580339696377574, + "flos": 13594829212800.0, + "grad_norm": 1.8066455981447187, + "language_loss": 0.75335681, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.83074236, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14556885, + "step": 5955, + "time_per_iteration": 2.5049943923950195 + }, + { + "auxiliary_loss_clip": 0.06458074, + "auxiliary_loss_mlp": 0.01270596, + "balance_loss_clip": 0.06286005, + "balance_loss_mlp": 0.01256595, + "epoch": 0.3580940928904254, + "flos": 23774882584320.0, + "grad_norm": 1.7018331496498176, + "language_loss": 0.76155579, + "learning_rate": 2.973123895369182e-06, + "loss": 0.83884245, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14007568, + "step": 5956, + "time_per_iteration": 2.565455675125122 + }, + { + "auxiliary_loss_clip": 0.06456999, + "auxiliary_loss_mlp": 0.01278066, + "balance_loss_clip": 0.06286499, + "balance_loss_mlp": 0.01263415, + "epoch": 0.35815421614309334, + "flos": 19469962892160.0, + "grad_norm": 1.5319401259692025, + "language_loss": 0.73558611, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81293678, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14642334, + "step": 5957, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.064648, + "auxiliary_loss_mlp": 0.01274688, + "balance_loss_clip": 0.06291045, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3582143393957613, + "flos": 23374988173440.0, + "grad_norm": 2.1285308943055727, + "language_loss": 0.71748459, + "learning_rate": 2.972443318242726e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14477539, + "step": 5958, + "time_per_iteration": 2.566181182861328 + }, + { + "auxiliary_loss_clip": 0.06459813, + "auxiliary_loss_mlp": 0.01267621, + "balance_loss_clip": 0.06289116, + "balance_loss_mlp": 0.0125415, + "epoch": 0.35827446264842927, + "flos": 26330528113920.0, + "grad_norm": 1.6357791647016078, + "language_loss": 0.88725436, + "learning_rate": 2.972102974360324e-06, + "loss": 0.96452874, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13452148, + "step": 5959, + "time_per_iteration": 2.6218011379241943 + }, + { + "auxiliary_loss_clip": 0.06463417, + "auxiliary_loss_mlp": 0.01271505, + "balance_loss_clip": 0.06288788, + "balance_loss_mlp": 0.0125816, + "epoch": 0.35833458590109724, + "flos": 30454626695040.0, + "grad_norm": 1.5143701220572547, + "language_loss": 0.58769095, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66504014, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.13348389, + "step": 5960, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.06462947, + "auxiliary_loss_mlp": 0.01269103, + "balance_loss_clip": 0.06286879, + "balance_loss_mlp": 0.01253469, + "epoch": 0.3583947091537652, + "flos": 14835154475520.0, + "grad_norm": 2.541265940729937, + "language_loss": 0.76686686, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.84418738, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15631104, + "step": 5961, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.06464821, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06287968, + "balance_loss_mlp": 0.01261324, + "epoch": 0.35845483240643317, + "flos": 34249213895040.0, + "grad_norm": 1.6475679018941416, + "language_loss": 0.70478481, + "learning_rate": 2.971081721591294e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14099121, + "step": 5962, + "time_per_iteration": 2.6199357509613037 + }, + { + "auxiliary_loss_clip": 0.06464063, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01255207, + "epoch": 0.35851495565910113, + "flos": 20966481613440.0, + "grad_norm": 1.6496872805273144, + "language_loss": 0.75120842, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.82854319, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14221191, + "step": 5963, + "time_per_iteration": 2.5526950359344482 + }, + { + "auxiliary_loss_clip": 0.06467253, + "auxiliary_loss_mlp": 0.01271151, + "balance_loss_clip": 0.06291784, + "balance_loss_mlp": 0.01256322, + "epoch": 0.35857507891176915, + "flos": 22316448343680.0, + "grad_norm": 1.675466861885377, + "language_loss": 0.78945208, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.86683613, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14831543, + "step": 5964, + "time_per_iteration": 2.5257983207702637 + }, + { + "auxiliary_loss_clip": 0.0647264, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06292663, + "balance_loss_mlp": 0.0125726, + "epoch": 0.3586352021644371, + "flos": 23374610830080.0, + "grad_norm": 3.2898914726182684, + "language_loss": 0.667786, + "learning_rate": 2.970060137410626e-06, + "loss": 0.74523282, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.14764404, + "step": 5965, + "time_per_iteration": 2.5664315223693848 + }, + { + "auxiliary_loss_clip": 0.06463271, + "auxiliary_loss_mlp": 0.01271526, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.01256773, + "epoch": 0.3586953254171051, + "flos": 27855655804800.0, + "grad_norm": 1.5935311272675807, + "language_loss": 0.79428947, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87163734, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14746094, + "step": 5966, + "time_per_iteration": 2.576537609100342 + }, + { + "auxiliary_loss_clip": 0.06467331, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01254717, + "epoch": 0.35875544866977305, + "flos": 19506621853440.0, + "grad_norm": 2.077713447457672, + "language_loss": 0.91477883, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.99213958, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14044189, + "step": 5967, + "time_per_iteration": 2.553084135055542 + }, + { + "auxiliary_loss_clip": 0.06466691, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06288824, + "balance_loss_mlp": 0.01257261, + "epoch": 0.358815571922441, + "flos": 21477652646400.0, + "grad_norm": 1.8463229992001005, + "language_loss": 0.80835712, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.88575101, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15435791, + "step": 5968, + "time_per_iteration": 2.526298761367798 + }, + { + "auxiliary_loss_clip": 0.06467028, + "auxiliary_loss_mlp": 0.0127428, + "balance_loss_clip": 0.06287041, + "balance_loss_mlp": 0.012587, + "epoch": 0.358875695175109, + "flos": 21841894344960.0, + "grad_norm": 1.8179824378655614, + "language_loss": 0.84621, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.92362314, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15588379, + "step": 5969, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.0646342, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06288599, + "balance_loss_mlp": 0.01258664, + "epoch": 0.35893581842777694, + "flos": 32019264385920.0, + "grad_norm": 1.8505987075691241, + "language_loss": 0.72233456, + "learning_rate": 2.968356761586202e-06, + "loss": 0.79968911, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13366699, + "step": 5970, + "time_per_iteration": 2.581071615219116 + }, + { + "auxiliary_loss_clip": 0.06468321, + "auxiliary_loss_mlp": 0.01272468, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01258056, + "epoch": 0.3589959416804449, + "flos": 20492137249920.0, + "grad_norm": 1.5610077365233734, + "language_loss": 0.79753757, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.87494546, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14422607, + "step": 5971, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.0646906, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06288019, + "balance_loss_mlp": 0.01255006, + "epoch": 0.3590560649331129, + "flos": 16186295162880.0, + "grad_norm": 1.6291573791515084, + "language_loss": 0.78869599, + "learning_rate": 2.967675154124696e-06, + "loss": 0.86608684, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15026855, + "step": 5972, + "time_per_iteration": 2.4778740406036377 + }, + { + "auxiliary_loss_clip": 0.06465904, + "auxiliary_loss_mlp": 0.01274602, + "balance_loss_clip": 0.06286226, + "balance_loss_mlp": 0.01260201, + "epoch": 0.35911618818578084, + "flos": 20381531460480.0, + "grad_norm": 2.0141455740295875, + "language_loss": 0.81742013, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.89482516, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1439209, + "step": 5973, + "time_per_iteration": 2.532027006149292 + }, + { + "auxiliary_loss_clip": 0.06404248, + "auxiliary_loss_mlp": 0.01258065, + "balance_loss_clip": 0.06324309, + "balance_loss_mlp": 0.01254096, + "epoch": 0.3591763114384488, + "flos": 41250991645440.0, + "grad_norm": 0.9082562918021452, + "language_loss": 0.56514442, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.64176756, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03967285, + "step": 5974, + "time_per_iteration": 3.0029375553131104 + }, + { + "auxiliary_loss_clip": 0.06464389, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06286667, + "balance_loss_mlp": 0.01257781, + "epoch": 0.35923643469111677, + "flos": 18701047100160.0, + "grad_norm": 1.9591615340661908, + "language_loss": 0.69342583, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77078998, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.14227295, + "step": 5975, + "time_per_iteration": 2.5330698490142822 + }, + { + "auxiliary_loss_clip": 0.06462636, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06286036, + "balance_loss_mlp": 0.0125325, + "epoch": 0.35929655794378473, + "flos": 25017010709760.0, + "grad_norm": 1.597565036747504, + "language_loss": 0.8049522, + "learning_rate": 2.96631149897303e-06, + "loss": 0.88225687, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14575195, + "step": 5976, + "time_per_iteration": 2.5599968433380127 + }, + { + "auxiliary_loss_clip": 0.0646351, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06286681, + "balance_loss_mlp": 0.01253489, + "epoch": 0.35935668119645275, + "flos": 14980825998720.0, + "grad_norm": 1.8019140268476472, + "language_loss": 0.79171205, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.86903155, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1496582, + "step": 5977, + "time_per_iteration": 2.4876949787139893 + }, + { + "auxiliary_loss_clip": 0.06459211, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 0.0628271, + "balance_loss_mlp": 0.0125324, + "epoch": 0.3594168044491207, + "flos": 21184422883200.0, + "grad_norm": 1.897291031169604, + "language_loss": 0.80843097, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13342285, + "step": 5978, + "time_per_iteration": 2.5270771980285645 + }, + { + "auxiliary_loss_clip": 0.06458849, + "auxiliary_loss_mlp": 0.01272545, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01257703, + "epoch": 0.3594769277017887, + "flos": 27679446668160.0, + "grad_norm": 1.6570486295636508, + "language_loss": 0.67797875, + "learning_rate": 2.965288372816436e-06, + "loss": 0.75529265, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14819336, + "step": 5979, + "time_per_iteration": 5.427239179611206 + }, + { + "auxiliary_loss_clip": 0.06460471, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06282781, + "balance_loss_mlp": 0.01256323, + "epoch": 0.35953705095445665, + "flos": 23008901685120.0, + "grad_norm": 2.1534655116077928, + "language_loss": 0.67667198, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.75397921, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.13928223, + "step": 5980, + "time_per_iteration": 2.538149833679199 + }, + { + "auxiliary_loss_clip": 0.0647162, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06285568, + "balance_loss_mlp": 0.01257146, + "epoch": 0.3595971742071246, + "flos": 25520005969920.0, + "grad_norm": 2.2162969460708597, + "language_loss": 0.71122372, + "learning_rate": 2.964606105671327e-06, + "loss": 0.78867209, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16064453, + "step": 5981, + "time_per_iteration": 2.5711326599121094 + }, + { + "auxiliary_loss_clip": 0.06464566, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 0.06283125, + "balance_loss_mlp": 0.01256709, + "epoch": 0.3596572974597926, + "flos": 29870431228800.0, + "grad_norm": 2.0278025655936958, + "language_loss": 0.71914935, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.7965194, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.1572876, + "step": 5982, + "time_per_iteration": 2.6292126178741455 + }, + { + "auxiliary_loss_clip": 0.06458835, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 0.06286852, + "balance_loss_mlp": 0.0125428, + "epoch": 0.35971742071246054, + "flos": 23119255912320.0, + "grad_norm": 1.6791573126106523, + "language_loss": 0.7649492, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.84221637, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13598633, + "step": 5983, + "time_per_iteration": 2.540801763534546 + }, + { + "auxiliary_loss_clip": 0.06468493, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01257206, + "epoch": 0.3597775439651285, + "flos": 16730645212800.0, + "grad_norm": 1.651729152091261, + "language_loss": 0.77260226, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85001981, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16052246, + "step": 5984, + "time_per_iteration": 2.5278737545013428 + }, + { + "auxiliary_loss_clip": 0.06458455, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.0628411, + "balance_loss_mlp": 0.01256155, + "epoch": 0.3598376672177965, + "flos": 19725653226240.0, + "grad_norm": 2.0268922239891163, + "language_loss": 0.87093443, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.94822395, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.14355469, + "step": 5985, + "time_per_iteration": 3.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.06461216, + "auxiliary_loss_mlp": 0.01272807, + "balance_loss_clip": 0.06284203, + "balance_loss_mlp": 0.01258109, + "epoch": 0.35989779047046444, + "flos": 17317314374400.0, + "grad_norm": 1.4939910635791536, + "language_loss": 0.72980917, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80714941, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14709473, + "step": 5986, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.06469383, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06283881, + "balance_loss_mlp": 0.01254761, + "epoch": 0.3599579137231324, + "flos": 22717894055040.0, + "grad_norm": 2.903112824764454, + "language_loss": 0.73792106, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.81531143, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.14892578, + "step": 5987, + "time_per_iteration": 3.961486339569092 + }, + { + "auxiliary_loss_clip": 0.06467381, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06286356, + "balance_loss_mlp": 0.01255347, + "epoch": 0.36001803697580037, + "flos": 20966230051200.0, + "grad_norm": 1.8945086710394061, + "language_loss": 0.69721663, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.77459043, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.14648438, + "step": 5988, + "time_per_iteration": 2.5483100414276123 + }, + { + "auxiliary_loss_clip": 0.0647547, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_clip": 0.06292704, + "balance_loss_mlp": 0.01258209, + "epoch": 0.36007816022846834, + "flos": 20491843760640.0, + "grad_norm": 1.7927951606002523, + "language_loss": 0.7305057, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80799592, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15344238, + "step": 5989, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.06467338, + "auxiliary_loss_mlp": 0.01268061, + "balance_loss_clip": 0.06289014, + "balance_loss_mlp": 0.01254173, + "epoch": 0.36013828348113636, + "flos": 28008706487040.0, + "grad_norm": 1.4999082498201763, + "language_loss": 0.80117184, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87852585, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.13897705, + "step": 5990, + "time_per_iteration": 2.6733410358428955 + }, + { + "auxiliary_loss_clip": 0.06464024, + "auxiliary_loss_mlp": 0.01270971, + "balance_loss_clip": 0.0628631, + "balance_loss_mlp": 0.01255938, + "epoch": 0.3601984067338043, + "flos": 20088050135040.0, + "grad_norm": 1.799909646769202, + "language_loss": 0.84338784, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92073774, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15032959, + "step": 5991, + "time_per_iteration": 2.518554925918579 + }, + { + "auxiliary_loss_clip": 0.06474696, + "auxiliary_loss_mlp": 0.01276578, + "balance_loss_clip": 0.06292041, + "balance_loss_mlp": 0.01261367, + "epoch": 0.3602585299864723, + "flos": 18622362516480.0, + "grad_norm": 1.891276760716041, + "language_loss": 0.76406145, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.84157419, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1519165, + "step": 5992, + "time_per_iteration": 2.5224106311798096 + }, + { + "auxiliary_loss_clip": 0.06471405, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06294376, + "balance_loss_mlp": 0.01258496, + "epoch": 0.36031865323914025, + "flos": 19579059308160.0, + "grad_norm": 2.086772991356176, + "language_loss": 0.78120929, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8586548, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14648438, + "step": 5993, + "time_per_iteration": 2.5155129432678223 + }, + { + "auxiliary_loss_clip": 0.06474859, + "auxiliary_loss_mlp": 0.01271898, + "balance_loss_clip": 0.06293729, + "balance_loss_mlp": 0.01257807, + "epoch": 0.3603787764918082, + "flos": 17495871425280.0, + "grad_norm": 1.6487847999674183, + "language_loss": 0.74534261, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.82281017, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14086914, + "step": 5994, + "time_per_iteration": 2.647794723510742 + }, + { + "auxiliary_loss_clip": 0.06474246, + "auxiliary_loss_mlp": 0.01268785, + "balance_loss_clip": 0.06290799, + "balance_loss_mlp": 0.01254415, + "epoch": 0.3604388997444762, + "flos": 15528823701120.0, + "grad_norm": 1.8873654318884407, + "language_loss": 0.69500113, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14361572, + "step": 5995, + "time_per_iteration": 2.501981019973755 + }, + { + "auxiliary_loss_clip": 0.06479774, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06295834, + "balance_loss_mlp": 0.01261688, + "epoch": 0.36049902299714415, + "flos": 17316559687680.0, + "grad_norm": 1.8201062799427143, + "language_loss": 0.8309989, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.90856004, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14642334, + "step": 5996, + "time_per_iteration": 2.551095962524414 + }, + { + "auxiliary_loss_clip": 0.06472808, + "auxiliary_loss_mlp": 0.01275418, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01260493, + "epoch": 0.3605591462498121, + "flos": 17061749821440.0, + "grad_norm": 2.2503529028172804, + "language_loss": 0.73762429, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81510657, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14910889, + "step": 5997, + "time_per_iteration": 2.493100881576538 + }, + { + "auxiliary_loss_clip": 0.06465439, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06288476, + "balance_loss_mlp": 0.0125944, + "epoch": 0.3606192695024801, + "flos": 16842508813440.0, + "grad_norm": 2.0075843423569326, + "language_loss": 0.69582814, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.77322465, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14758301, + "step": 5998, + "time_per_iteration": 2.54227352142334 + }, + { + "auxiliary_loss_clip": 0.06468997, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06292363, + "balance_loss_mlp": 0.01258243, + "epoch": 0.36067939275514804, + "flos": 12134424401280.0, + "grad_norm": 2.607888629955908, + "language_loss": 0.77566224, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.8530767, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 5999, + "time_per_iteration": 2.456887722015381 + }, + { + "auxiliary_loss_clip": 0.06474666, + "auxiliary_loss_mlp": 0.01272087, + "balance_loss_clip": 0.06294585, + "balance_loss_mlp": 0.01257776, + "epoch": 0.360739516007816, + "flos": 18047390999040.0, + "grad_norm": 1.725953097254869, + "language_loss": 0.78777629, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.86524385, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14300537, + "step": 6000, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.06471578, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06294253, + "balance_loss_mlp": 0.01257854, + "epoch": 0.360799639260484, + "flos": 18555417504000.0, + "grad_norm": 1.7389483603698193, + "language_loss": 0.78602117, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86345226, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13671875, + "step": 6001, + "time_per_iteration": 2.4887304306030273 + }, + { + "auxiliary_loss_clip": 0.06462014, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.012549, + "epoch": 0.36085976251315194, + "flos": 19688029943040.0, + "grad_norm": 2.5640130860082206, + "language_loss": 0.83264118, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.90995204, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14178467, + "step": 6002, + "time_per_iteration": 2.523263931274414 + }, + { + "auxiliary_loss_clip": 0.06462792, + "auxiliary_loss_mlp": 0.01274754, + "balance_loss_clip": 0.06293326, + "balance_loss_mlp": 0.01262034, + "epoch": 0.3609198857658199, + "flos": 24204476067840.0, + "grad_norm": 2.058215255218527, + "language_loss": 0.91365647, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.991032, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.12713623, + "step": 6003, + "time_per_iteration": 2.5147922039031982 + }, + { + "auxiliary_loss_clip": 0.06424739, + "auxiliary_loss_mlp": 0.01257394, + "balance_loss_clip": 0.06345953, + "balance_loss_mlp": 0.01254351, + "epoch": 0.3609800090184879, + "flos": 57134288044800.0, + "grad_norm": 0.8495896975763515, + "language_loss": 0.53457719, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.61139846, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03041077, + "step": 6004, + "time_per_iteration": 3.1006038188934326 + }, + { + "auxiliary_loss_clip": 0.06473242, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06291834, + "balance_loss_mlp": 0.0125549, + "epoch": 0.3610401322711559, + "flos": 20817120510720.0, + "grad_norm": 1.7032625156204924, + "language_loss": 0.78291458, + "learning_rate": 2.956407517225883e-06, + "loss": 0.86035228, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15026855, + "step": 6005, + "time_per_iteration": 2.507681369781494 + }, + { + "auxiliary_loss_clip": 0.06466124, + "auxiliary_loss_mlp": 0.01274708, + "balance_loss_clip": 0.06289654, + "balance_loss_mlp": 0.01260373, + "epoch": 0.36110025552382385, + "flos": 13704302972160.0, + "grad_norm": 1.9788670063291258, + "language_loss": 0.79365236, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87106061, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14349365, + "step": 6006, + "time_per_iteration": 2.6221675872802734 + }, + { + "auxiliary_loss_clip": 0.06467897, + "auxiliary_loss_mlp": 0.01276481, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01260317, + "epoch": 0.3611603787764918, + "flos": 22461490961280.0, + "grad_norm": 1.8947484153914913, + "language_loss": 0.84532005, + "learning_rate": 2.955723356106876e-06, + "loss": 0.92276382, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.16162109, + "step": 6007, + "time_per_iteration": 2.5697944164276123 + }, + { + "auxiliary_loss_clip": 0.06477423, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06289505, + "balance_loss_mlp": 0.0126018, + "epoch": 0.3612205020291598, + "flos": 20892954055680.0, + "grad_norm": 2.2451481952848953, + "language_loss": 0.73192191, + "learning_rate": 2.955381221179198e-06, + "loss": 0.80945194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.1541748, + "step": 6008, + "time_per_iteration": 2.5410661697387695 + }, + { + "auxiliary_loss_clip": 0.06468849, + "auxiliary_loss_mlp": 0.01276747, + "balance_loss_clip": 0.06288531, + "balance_loss_mlp": 0.01262036, + "epoch": 0.36128062528182775, + "flos": 15747393876480.0, + "grad_norm": 2.0636796050179194, + "language_loss": 0.83194089, + "learning_rate": 2.955039050023368e-06, + "loss": 0.90939683, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1472168, + "step": 6009, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.06467466, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.012553, + "epoch": 0.3613407485344957, + "flos": 16770239066880.0, + "grad_norm": 1.996577445690206, + "language_loss": 0.7613554, + "learning_rate": 2.954696842652362e-06, + "loss": 0.83873594, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15258789, + "step": 6010, + "time_per_iteration": 2.501328468322754 + }, + { + "auxiliary_loss_clip": 0.064712, + "auxiliary_loss_mlp": 0.0127317, + "balance_loss_clip": 0.06292284, + "balance_loss_mlp": 0.01258734, + "epoch": 0.3614008717871637, + "flos": 20376625996800.0, + "grad_norm": 1.7565456089129825, + "language_loss": 0.8353886, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.91283226, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14440918, + "step": 6011, + "time_per_iteration": 2.5080785751342773 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01273027, + "balance_loss_clip": 0.06288376, + "balance_loss_mlp": 0.0125784, + "epoch": 0.36146099503983165, + "flos": 22782071882880.0, + "grad_norm": 2.5852128775447536, + "language_loss": 0.62982023, + "learning_rate": 2.954012319316727e-06, + "loss": 0.70728415, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15185547, + "step": 6012, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.06468817, + "auxiliary_loss_mlp": 0.01279391, + "balance_loss_clip": 0.06292222, + "balance_loss_mlp": 0.01264728, + "epoch": 0.3615211182924996, + "flos": 23002277212800.0, + "grad_norm": 2.060645495819417, + "language_loss": 0.83850408, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.91598618, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14648438, + "step": 6013, + "time_per_iteration": 2.511187791824341 + }, + { + "auxiliary_loss_clip": 0.06469796, + "auxiliary_loss_mlp": 0.01276155, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01259501, + "epoch": 0.3615812415451676, + "flos": 16652631461760.0, + "grad_norm": 1.9072870373759168, + "language_loss": 0.92107058, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.99853015, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.16638184, + "step": 6014, + "time_per_iteration": 2.498011350631714 + }, + { + "auxiliary_loss_clip": 0.06466013, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01260097, + "epoch": 0.36164136479783554, + "flos": 21325733994240.0, + "grad_norm": 8.045361949377702, + "language_loss": 0.73973721, + "learning_rate": 2.95298526302391e-06, + "loss": 0.81715214, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15393066, + "step": 6015, + "time_per_iteration": 2.5139665603637695 + }, + { + "auxiliary_loss_clip": 0.0646963, + "auxiliary_loss_mlp": 0.01277804, + "balance_loss_clip": 0.06291166, + "balance_loss_mlp": 0.01262151, + "epoch": 0.3617014880505035, + "flos": 24176286368640.0, + "grad_norm": 1.9455925595590893, + "language_loss": 0.65181047, + "learning_rate": 2.9526428386344e-06, + "loss": 0.72928476, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15637207, + "step": 6016, + "time_per_iteration": 2.5485315322875977 + }, + { + "auxiliary_loss_clip": 0.06469464, + "auxiliary_loss_mlp": 0.01276058, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.01259261, + "epoch": 0.3617616113031715, + "flos": 39023278997760.0, + "grad_norm": 1.6846943976812254, + "language_loss": 0.72102833, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.79848349, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16784668, + "step": 6017, + "time_per_iteration": 2.6685996055603027 + }, + { + "auxiliary_loss_clip": 0.06470844, + "auxiliary_loss_mlp": 0.01272479, + "balance_loss_clip": 0.06287402, + "balance_loss_mlp": 0.01256886, + "epoch": 0.3618217345558395, + "flos": 12135807993600.0, + "grad_norm": 2.3155685522099962, + "language_loss": 0.74387789, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15600586, + "step": 6018, + "time_per_iteration": 3.93249249458313 + }, + { + "auxiliary_loss_clip": 0.06458628, + "auxiliary_loss_mlp": 0.01273986, + "balance_loss_clip": 0.06287278, + "balance_loss_mlp": 0.0125856, + "epoch": 0.36188185780850746, + "flos": 24941722216320.0, + "grad_norm": 2.406612181934337, + "language_loss": 0.69554305, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.77286923, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1541748, + "step": 6019, + "time_per_iteration": 4.000872373580933 + }, + { + "auxiliary_loss_clip": 0.06472806, + "auxiliary_loss_mlp": 0.01271681, + "balance_loss_clip": 0.0628852, + "balance_loss_mlp": 0.01255815, + "epoch": 0.3619419810611754, + "flos": 20965014167040.0, + "grad_norm": 2.953778610066193, + "language_loss": 0.76874363, + "learning_rate": 2.95127277996311e-06, + "loss": 0.84618843, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15869141, + "step": 6020, + "time_per_iteration": 2.5465614795684814 + }, + { + "auxiliary_loss_clip": 0.06471147, + "auxiliary_loss_mlp": 0.01273965, + "balance_loss_clip": 0.06288891, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3620021043138434, + "flos": 22535521643520.0, + "grad_norm": 2.2311166939070097, + "language_loss": 0.74090236, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81835353, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16687012, + "step": 6021, + "time_per_iteration": 2.57817006111145 + }, + { + "auxiliary_loss_clip": 0.06467178, + "auxiliary_loss_mlp": 0.01270658, + "balance_loss_clip": 0.0628859, + "balance_loss_mlp": 0.01255685, + "epoch": 0.36206222756651135, + "flos": 15602183550720.0, + "grad_norm": 5.238961551513005, + "language_loss": 0.81591839, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.89329672, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1496582, + "step": 6022, + "time_per_iteration": 2.5385305881500244 + }, + { + "auxiliary_loss_clip": 0.06457289, + "auxiliary_loss_mlp": 0.0127544, + "balance_loss_clip": 0.06285636, + "balance_loss_mlp": 0.01260349, + "epoch": 0.3621223508191793, + "flos": 23594019546240.0, + "grad_norm": 2.318322058767841, + "language_loss": 0.81707698, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89440429, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15075684, + "step": 6023, + "time_per_iteration": 2.604048013687134 + }, + { + "auxiliary_loss_clip": 0.0647051, + "auxiliary_loss_mlp": 0.01276448, + "balance_loss_clip": 0.06288643, + "balance_loss_mlp": 0.01259795, + "epoch": 0.3621824740718473, + "flos": 22316490270720.0, + "grad_norm": 2.4056275848880038, + "language_loss": 0.80008531, + "learning_rate": 2.9499021441341e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16650391, + "step": 6024, + "time_per_iteration": 3.9998557567596436 + }, + { + "auxiliary_loss_clip": 0.06462081, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06288754, + "balance_loss_mlp": 0.01258599, + "epoch": 0.36224259732451525, + "flos": 16769232817920.0, + "grad_norm": 2.2201652107227354, + "language_loss": 0.75149572, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.82885349, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15112305, + "step": 6025, + "time_per_iteration": 2.5139317512512207 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01274175, + "balance_loss_clip": 0.06290425, + "balance_loss_mlp": 0.01260198, + "epoch": 0.3623027205771832, + "flos": 23156585706240.0, + "grad_norm": 1.704945166995659, + "language_loss": 0.72471905, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.80212557, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13989258, + "step": 6026, + "time_per_iteration": 3.974848985671997 + }, + { + "auxiliary_loss_clip": 0.06476888, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01260233, + "epoch": 0.3623628438298512, + "flos": 28556829970560.0, + "grad_norm": 1.945563554904942, + "language_loss": 0.79502189, + "learning_rate": 2.948873789002833e-06, + "loss": 0.87255979, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16662598, + "step": 6027, + "time_per_iteration": 2.614713430404663 + }, + { + "auxiliary_loss_clip": 0.06469107, + "auxiliary_loss_mlp": 0.01272818, + "balance_loss_clip": 0.06288799, + "balance_loss_mlp": 0.01256427, + "epoch": 0.36242296708251914, + "flos": 25492193614080.0, + "grad_norm": 4.95803648299326, + "language_loss": 0.68042505, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.75784421, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16381836, + "step": 6028, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.06467344, + "auxiliary_loss_mlp": 0.01275782, + "balance_loss_clip": 0.0629041, + "balance_loss_mlp": 0.01260273, + "epoch": 0.3624830903351871, + "flos": 16296062411520.0, + "grad_norm": 2.2968183263714983, + "language_loss": 0.85463655, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.93206775, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1550293, + "step": 6029, + "time_per_iteration": 2.519960403442383 + }, + { + "auxiliary_loss_clip": 0.06462874, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.0628645, + "balance_loss_mlp": 0.01255107, + "epoch": 0.36254321358785513, + "flos": 18302200865280.0, + "grad_norm": 1.7460468862336926, + "language_loss": 0.72888201, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.80621189, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15008545, + "step": 6030, + "time_per_iteration": 2.480053663253784 + }, + { + "auxiliary_loss_clip": 0.06476077, + "auxiliary_loss_mlp": 0.01274605, + "balance_loss_clip": 0.06291036, + "balance_loss_mlp": 0.01257558, + "epoch": 0.3626033368405231, + "flos": 14870387917440.0, + "grad_norm": 3.30241855147188, + "language_loss": 0.75249928, + "learning_rate": 2.94750214514905e-06, + "loss": 0.83000606, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.17041016, + "step": 6031, + "time_per_iteration": 2.4887540340423584 + }, + { + "auxiliary_loss_clip": 0.06465365, + "auxiliary_loss_mlp": 0.01279599, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01264245, + "epoch": 0.36266346009319106, + "flos": 22312632983040.0, + "grad_norm": 2.377019393957944, + "language_loss": 0.73490477, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81235439, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15344238, + "step": 6032, + "time_per_iteration": 2.5194106101989746 + }, + { + "auxiliary_loss_clip": 0.06471337, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06290144, + "balance_loss_mlp": 0.01258776, + "epoch": 0.362723583345859, + "flos": 18228044401920.0, + "grad_norm": 1.8908046818451942, + "language_loss": 0.78089464, + "learning_rate": 2.946816107593884e-06, + "loss": 0.85834849, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15270996, + "step": 6033, + "time_per_iteration": 2.6062612533569336 + }, + { + "auxiliary_loss_clip": 0.06434236, + "auxiliary_loss_mlp": 0.01267532, + "balance_loss_clip": 0.06350702, + "balance_loss_mlp": 0.01264055, + "epoch": 0.362783706598527, + "flos": 68519307456000.0, + "grad_norm": 0.7613876705351186, + "language_loss": 0.64809752, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.72511524, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.03485107, + "step": 6034, + "time_per_iteration": 3.216454267501831 + }, + { + "auxiliary_loss_clip": 0.06466131, + "auxiliary_loss_mlp": 0.01276184, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01260222, + "epoch": 0.36284382985119495, + "flos": 26583535117440.0, + "grad_norm": 2.053623051898619, + "language_loss": 0.89456552, + "learning_rate": 2.946129926425273e-06, + "loss": 0.97198874, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.15966797, + "step": 6035, + "time_per_iteration": 2.5606629848480225 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01272395, + "balance_loss_clip": 0.06295764, + "balance_loss_mlp": 0.0125592, + "epoch": 0.3629039531038629, + "flos": 20162919358080.0, + "grad_norm": 1.7740824971358589, + "language_loss": 0.73855877, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.81607592, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.16455078, + "step": 6036, + "time_per_iteration": 2.5144500732421875 + }, + { + "auxiliary_loss_clip": 0.06482191, + "auxiliary_loss_mlp": 0.01272832, + "balance_loss_clip": 0.06296846, + "balance_loss_mlp": 0.01256823, + "epoch": 0.3629640763565309, + "flos": 18631838027520.0, + "grad_norm": 1.8050884717083873, + "language_loss": 0.76438695, + "learning_rate": 2.945443601747297e-06, + "loss": 0.84193718, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16015625, + "step": 6037, + "time_per_iteration": 2.5286643505096436 + }, + { + "auxiliary_loss_clip": 0.06467965, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06292737, + "balance_loss_mlp": 0.01262546, + "epoch": 0.36302419960919885, + "flos": 19577256445440.0, + "grad_norm": 1.633141884703147, + "language_loss": 0.78871524, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86617458, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1541748, + "step": 6038, + "time_per_iteration": 2.5062947273254395 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 0.06318134, + "balance_loss_mlp": 0.01257723, + "epoch": 0.3630843228618668, + "flos": 63817805589120.0, + "grad_norm": 0.8140528620617334, + "language_loss": 0.63225597, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.70887518, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.03512573, + "step": 6039, + "time_per_iteration": 3.269761323928833 + }, + { + "auxiliary_loss_clip": 0.06467007, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06289599, + "balance_loss_mlp": 0.01253932, + "epoch": 0.3631444461145348, + "flos": 21841600855680.0, + "grad_norm": 2.592040544468795, + "language_loss": 0.71409321, + "learning_rate": 2.944413845878002e-06, + "loss": 0.79146034, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15783691, + "step": 6040, + "time_per_iteration": 2.5549709796905518 + }, + { + "auxiliary_loss_clip": 0.06477243, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 0.06293249, + "balance_loss_mlp": 0.01260277, + "epoch": 0.36320456936720275, + "flos": 21727850538240.0, + "grad_norm": 1.6745525965006305, + "language_loss": 0.81387192, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89140832, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.16113281, + "step": 6041, + "time_per_iteration": 2.529555320739746 + }, + { + "auxiliary_loss_clip": 0.06473525, + "auxiliary_loss_mlp": 0.01278326, + "balance_loss_clip": 0.0629223, + "balance_loss_mlp": 0.01261291, + "epoch": 0.3632646926198707, + "flos": 17024713516800.0, + "grad_norm": 3.0330286867158547, + "language_loss": 0.8477391, + "learning_rate": 2.943727162882107e-06, + "loss": 0.92525762, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.17016602, + "step": 6042, + "time_per_iteration": 2.52242112159729 + }, + { + "auxiliary_loss_clip": 0.06469671, + "auxiliary_loss_mlp": 0.01277961, + "balance_loss_clip": 0.06290909, + "balance_loss_mlp": 0.01261892, + "epoch": 0.36332481587253873, + "flos": 23337868014720.0, + "grad_norm": 1.7311470578574424, + "language_loss": 0.78563523, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.86311156, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.16088867, + "step": 6043, + "time_per_iteration": 2.5507187843322754 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.0127573, + "balance_loss_clip": 0.06289753, + "balance_loss_mlp": 0.01258755, + "epoch": 0.3633849391252067, + "flos": 10748134126080.0, + "grad_norm": 2.0752100798218245, + "language_loss": 0.66141021, + "learning_rate": 2.943040336741298e-06, + "loss": 0.73882145, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16967773, + "step": 6044, + "time_per_iteration": 2.5431315898895264 + }, + { + "auxiliary_loss_clip": 0.06470387, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06293066, + "balance_loss_mlp": 0.01258794, + "epoch": 0.36344506237787466, + "flos": 25856351458560.0, + "grad_norm": 1.7019744870222642, + "language_loss": 0.81317604, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89061964, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15185547, + "step": 6045, + "time_per_iteration": 2.578608274459839 + }, + { + "auxiliary_loss_clip": 0.06471765, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 0.06291001, + "balance_loss_mlp": 0.01260977, + "epoch": 0.3635051856305426, + "flos": 30161900056320.0, + "grad_norm": 1.9031490691130954, + "language_loss": 0.64869618, + "learning_rate": 2.942353367559755e-06, + "loss": 0.72618413, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16064453, + "step": 6046, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.06469898, + "auxiliary_loss_mlp": 0.01279877, + "balance_loss_clip": 0.06291277, + "balance_loss_mlp": 0.01264082, + "epoch": 0.3635653088832106, + "flos": 22204626670080.0, + "grad_norm": 1.4883910134219482, + "language_loss": 0.77790976, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.85540754, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15783691, + "step": 6047, + "time_per_iteration": 2.59384822845459 + }, + { + "auxiliary_loss_clip": 0.06482202, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 0.0629375, + "balance_loss_mlp": 0.01259794, + "epoch": 0.36362543213587856, + "flos": 24793409289600.0, + "grad_norm": 2.402065763679051, + "language_loss": 0.79315472, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87075114, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.1763916, + "step": 6048, + "time_per_iteration": 2.586355447769165 + }, + { + "auxiliary_loss_clip": 0.06388409, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06308184, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3636855553885465, + "flos": 62547320056320.0, + "grad_norm": 0.756250652706744, + "language_loss": 0.52505761, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.6017015, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.03759766, + "step": 6049, + "time_per_iteration": 3.1991608142852783 + }, + { + "auxiliary_loss_clip": 0.06471006, + "auxiliary_loss_mlp": 0.01281005, + "balance_loss_clip": 0.06289691, + "balance_loss_mlp": 0.01264518, + "epoch": 0.3637456786412145, + "flos": 24067441514880.0, + "grad_norm": 1.9518715754512581, + "language_loss": 0.8677333, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.94525343, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16467285, + "step": 6050, + "time_per_iteration": 2.619880437850952 + }, + { + "auxiliary_loss_clip": 0.06465575, + "auxiliary_loss_mlp": 0.01288294, + "balance_loss_clip": 0.06288004, + "balance_loss_mlp": 0.01271784, + "epoch": 0.36380580189388245, + "flos": 16697214633600.0, + "grad_norm": 2.0514222430242937, + "language_loss": 0.78671187, + "learning_rate": 2.940635319486546e-06, + "loss": 0.86425054, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.16491699, + "step": 6051, + "time_per_iteration": 2.5192694664001465 + }, + { + "auxiliary_loss_clip": 0.064697, + "auxiliary_loss_mlp": 0.0128748, + "balance_loss_clip": 0.06289212, + "balance_loss_mlp": 0.01271315, + "epoch": 0.3638659251465504, + "flos": 25120279267200.0, + "grad_norm": 2.1218426019343943, + "language_loss": 0.82423818, + "learning_rate": 2.940291602812822e-06, + "loss": 0.90180993, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16174316, + "step": 6052, + "time_per_iteration": 2.6190178394317627 + }, + { + "auxiliary_loss_clip": 0.06462704, + "auxiliary_loss_mlp": 0.01293914, + "balance_loss_clip": 0.06289209, + "balance_loss_mlp": 0.0127831, + "epoch": 0.3639260483992184, + "flos": 23009698298880.0, + "grad_norm": 1.6976848198598335, + "language_loss": 0.72702307, + "learning_rate": 2.939947850483145e-06, + "loss": 0.80458927, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.15588379, + "step": 6053, + "time_per_iteration": 2.5632545948028564 + }, + { + "auxiliary_loss_clip": 0.0637124, + "auxiliary_loss_mlp": 0.0126271, + "balance_loss_clip": 0.06291765, + "balance_loss_mlp": 0.01258046, + "epoch": 0.36398617165188635, + "flos": 70735043698560.0, + "grad_norm": 0.7367280535398725, + "language_loss": 0.61109686, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.68743634, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.04656982, + "step": 6054, + "time_per_iteration": 3.1670703887939453 + }, + { + "auxiliary_loss_clip": 0.06468257, + "auxiliary_loss_mlp": 0.01284514, + "balance_loss_clip": 0.06288631, + "balance_loss_mlp": 0.01267062, + "epoch": 0.3640462949045543, + "flos": 22241788755840.0, + "grad_norm": 2.4941401517388795, + "language_loss": 0.76399368, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.84152138, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.17456055, + "step": 6055, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.06463572, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01264023, + "epoch": 0.3641064181572223, + "flos": 21549964320000.0, + "grad_norm": 1.5003458585655993, + "language_loss": 0.75247842, + "learning_rate": 2.938916379688765e-06, + "loss": 0.82992232, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16784668, + "step": 6056, + "time_per_iteration": 2.548563241958618 + }, + { + "auxiliary_loss_clip": 0.06463505, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 0.06286436, + "balance_loss_mlp": 0.01271805, + "epoch": 0.3641665414098903, + "flos": 22279873236480.0, + "grad_norm": 1.8427248639079936, + "language_loss": 0.80231911, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.87983549, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16320801, + "step": 6057, + "time_per_iteration": 2.590890645980835 + }, + { + "auxiliary_loss_clip": 0.06463237, + "auxiliary_loss_mlp": 0.01288366, + "balance_loss_clip": 0.06286855, + "balance_loss_mlp": 0.01271259, + "epoch": 0.36422666466255826, + "flos": 28337211619200.0, + "grad_norm": 2.0267495677395106, + "language_loss": 0.80895132, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.88646734, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.17114258, + "step": 6058, + "time_per_iteration": 3.9912350177764893 + }, + { + "auxiliary_loss_clip": 0.06462751, + "auxiliary_loss_mlp": 0.01284352, + "balance_loss_clip": 0.06282878, + "balance_loss_mlp": 0.01267794, + "epoch": 0.36428678791522623, + "flos": 24177376471680.0, + "grad_norm": 1.829086801108262, + "language_loss": 0.84467566, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.9221468, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16552734, + "step": 6059, + "time_per_iteration": 3.9484288692474365 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01290231, + "balance_loss_clip": 0.06284287, + "balance_loss_mlp": 0.01274006, + "epoch": 0.3643469111678942, + "flos": 22535018519040.0, + "grad_norm": 1.8662633122766634, + "language_loss": 0.88296366, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96050501, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16223145, + "step": 6060, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.06469811, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.06287585, + "balance_loss_mlp": 0.01260611, + "epoch": 0.36440703442056216, + "flos": 19432549244160.0, + "grad_norm": 2.050716636944588, + "language_loss": 0.66968513, + "learning_rate": 2.937196549795971e-06, + "loss": 0.74716496, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.17578125, + "step": 6061, + "time_per_iteration": 2.4934303760528564 + }, + { + "auxiliary_loss_clip": 0.06472699, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06290831, + "balance_loss_mlp": 0.01259283, + "epoch": 0.3644671576732301, + "flos": 18046300896000.0, + "grad_norm": 2.6099029342135838, + "language_loss": 0.76223081, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.83971971, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16918945, + "step": 6062, + "time_per_iteration": 2.5342442989349365 + }, + { + "auxiliary_loss_clip": 0.06462175, + "auxiliary_loss_mlp": 0.01277866, + "balance_loss_clip": 0.06284274, + "balance_loss_mlp": 0.01261844, + "epoch": 0.3645272809258981, + "flos": 21549125779200.0, + "grad_norm": 1.679264330509425, + "language_loss": 0.7250427, + "learning_rate": 2.936508368977432e-06, + "loss": 0.80244315, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16027832, + "step": 6063, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.06463223, + "auxiliary_loss_mlp": 0.01278838, + "balance_loss_clip": 0.0628884, + "balance_loss_mlp": 0.0126256, + "epoch": 0.36458740417856605, + "flos": 22753379059200.0, + "grad_norm": 1.9927269992491163, + "language_loss": 0.67982519, + "learning_rate": 2.936164225292901e-06, + "loss": 0.75724578, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.16265869, + "step": 6064, + "time_per_iteration": 4.001475095748901 + }, + { + "auxiliary_loss_clip": 0.06469691, + "auxiliary_loss_mlp": 0.01281677, + "balance_loss_clip": 0.06288914, + "balance_loss_mlp": 0.01265131, + "epoch": 0.364647527431234, + "flos": 26147862213120.0, + "grad_norm": 2.2981357468080725, + "language_loss": 0.75006247, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.82757616, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16540527, + "step": 6065, + "time_per_iteration": 2.557175397872925 + }, + { + "auxiliary_loss_clip": 0.06475934, + "auxiliary_loss_mlp": 0.01274844, + "balance_loss_clip": 0.06292161, + "balance_loss_mlp": 0.01257487, + "epoch": 0.364707650683902, + "flos": 31037941693440.0, + "grad_norm": 1.8804228270875918, + "language_loss": 0.75913531, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.8366431, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.17370605, + "step": 6066, + "time_per_iteration": 4.028696537017822 + }, + { + "auxiliary_loss_clip": 0.06465262, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06290717, + "balance_loss_mlp": 0.01260124, + "epoch": 0.36476777393656995, + "flos": 19578933527040.0, + "grad_norm": 2.1324188585544293, + "language_loss": 0.77645338, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.85385728, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15014648, + "step": 6067, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.06463823, + "auxiliary_loss_mlp": 0.01273764, + "balance_loss_clip": 0.06289702, + "balance_loss_mlp": 0.01258684, + "epoch": 0.3648278971892379, + "flos": 17754622433280.0, + "grad_norm": 1.930394247385299, + "language_loss": 0.71678597, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7941618, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15063477, + "step": 6068, + "time_per_iteration": 2.4845492839813232 + }, + { + "auxiliary_loss_clip": 0.06473656, + "auxiliary_loss_mlp": 0.0127485, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.01258005, + "epoch": 0.3648880204419059, + "flos": 17936952917760.0, + "grad_norm": 1.8532098574136342, + "language_loss": 0.73989958, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.8173846, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16845703, + "step": 6069, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.06469753, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 0.06287999, + "balance_loss_mlp": 0.01261684, + "epoch": 0.3649481436945739, + "flos": 22644911548800.0, + "grad_norm": 1.9157179359535086, + "language_loss": 0.66736126, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.74483597, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.16027832, + "step": 6070, + "time_per_iteration": 2.516735076904297 + }, + { + "auxiliary_loss_clip": 0.06467332, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06291667, + "balance_loss_mlp": 0.01259169, + "epoch": 0.36500826694724187, + "flos": 21586036302720.0, + "grad_norm": 1.8858284323375742, + "language_loss": 0.7453323, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.82274926, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.1519165, + "step": 6071, + "time_per_iteration": 2.566274642944336 + }, + { + "auxiliary_loss_clip": 0.06468312, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.0629068, + "balance_loss_mlp": 0.0125703, + "epoch": 0.36506839019990983, + "flos": 13777746675840.0, + "grad_norm": 1.7184690359068113, + "language_loss": 0.88681865, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.96422982, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15771484, + "step": 6072, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.06471045, + "auxiliary_loss_mlp": 0.01276068, + "balance_loss_clip": 0.06292107, + "balance_loss_mlp": 0.01260285, + "epoch": 0.3651285134525778, + "flos": 17280739267200.0, + "grad_norm": 2.591250971390436, + "language_loss": 0.72601849, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.80348963, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15771484, + "step": 6073, + "time_per_iteration": 2.5448079109191895 + }, + { + "auxiliary_loss_clip": 0.06476631, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06296042, + "balance_loss_mlp": 0.0125422, + "epoch": 0.36518863670524576, + "flos": 21914415653760.0, + "grad_norm": 2.188049192517554, + "language_loss": 0.66876209, + "learning_rate": 2.932720838132236e-06, + "loss": 0.74623442, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16394043, + "step": 6074, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.06466351, + "auxiliary_loss_mlp": 0.01270864, + "balance_loss_clip": 0.06289779, + "balance_loss_mlp": 0.01255319, + "epoch": 0.3652487599579137, + "flos": 27128933343360.0, + "grad_norm": 1.455377552522792, + "language_loss": 0.73552799, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.81290013, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15551758, + "step": 6075, + "time_per_iteration": 2.5611414909362793 + }, + { + "auxiliary_loss_clip": 0.06476435, + "auxiliary_loss_mlp": 0.01270879, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01255107, + "epoch": 0.3653088832105817, + "flos": 19761683281920.0, + "grad_norm": 3.551310730384351, + "language_loss": 0.89872956, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97620273, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 6076, + "time_per_iteration": 2.491070508956909 + }, + { + "auxiliary_loss_clip": 0.06471214, + "auxiliary_loss_mlp": 0.01269524, + "balance_loss_clip": 0.06294619, + "balance_loss_mlp": 0.01253782, + "epoch": 0.36536900646324966, + "flos": 13119981724800.0, + "grad_norm": 1.9522812947590364, + "language_loss": 0.69894624, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7763536, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15740967, + "step": 6077, + "time_per_iteration": 2.5298445224761963 + }, + { + "auxiliary_loss_clip": 0.06367216, + "auxiliary_loss_mlp": 0.01255974, + "balance_loss_clip": 0.06288684, + "balance_loss_mlp": 0.0125196, + "epoch": 0.3654291297159176, + "flos": 71122848393600.0, + "grad_norm": 0.715882721223993, + "language_loss": 0.61670828, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.69294018, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.04013062, + "step": 6078, + "time_per_iteration": 3.245680093765259 + }, + { + "auxiliary_loss_clip": 0.06468864, + "auxiliary_loss_mlp": 0.01269715, + "balance_loss_clip": 0.0628942, + "balance_loss_mlp": 0.01254217, + "epoch": 0.3654892529685856, + "flos": 23623299348480.0, + "grad_norm": 2.6954686860737427, + "language_loss": 0.78565228, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86303806, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1550293, + "step": 6079, + "time_per_iteration": 2.5243916511535645 + }, + { + "auxiliary_loss_clip": 0.06474455, + "auxiliary_loss_mlp": 0.0127227, + "balance_loss_clip": 0.06293908, + "balance_loss_mlp": 0.01255557, + "epoch": 0.36554937622125355, + "flos": 43480788174720.0, + "grad_norm": 2.827080544182906, + "language_loss": 0.62854588, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.70601308, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16711426, + "step": 6080, + "time_per_iteration": 2.755979299545288 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06292675, + "balance_loss_mlp": 0.012568, + "epoch": 0.3656094994739215, + "flos": 23301334834560.0, + "grad_norm": 2.0380719718304046, + "language_loss": 0.68215913, + "learning_rate": 2.930308361895352e-06, + "loss": 0.75963295, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16674805, + "step": 6081, + "time_per_iteration": 2.5318713188171387 + }, + { + "auxiliary_loss_clip": 0.06476995, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 0.06289314, + "balance_loss_mlp": 0.01267021, + "epoch": 0.3656696227265895, + "flos": 24578947964160.0, + "grad_norm": 1.6214502004720641, + "language_loss": 0.75242162, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.83002377, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 6082, + "time_per_iteration": 2.614473819732666 + }, + { + "auxiliary_loss_clip": 0.06474194, + "auxiliary_loss_mlp": 0.0127049, + "balance_loss_clip": 0.06295186, + "balance_loss_mlp": 0.01255851, + "epoch": 0.3657297459792575, + "flos": 27935849761920.0, + "grad_norm": 4.519769037138984, + "language_loss": 0.83192384, + "learning_rate": 2.929618765277987e-06, + "loss": 0.90937066, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.14660645, + "step": 6083, + "time_per_iteration": 2.569382429122925 + }, + { + "auxiliary_loss_clip": 0.06373743, + "auxiliary_loss_mlp": 0.01258609, + "balance_loss_clip": 0.06293802, + "balance_loss_mlp": 0.01254855, + "epoch": 0.36578986923192547, + "flos": 67410566231040.0, + "grad_norm": 0.7897440828264927, + "language_loss": 0.59315842, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.66948193, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03747559, + "step": 6084, + "time_per_iteration": 3.2453150749206543 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06292025, + "balance_loss_mlp": 0.01258801, + "epoch": 0.36584999248459343, + "flos": 20233302387840.0, + "grad_norm": 1.9605927592145687, + "language_loss": 0.73469806, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81214333, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15734863, + "step": 6085, + "time_per_iteration": 2.5149080753326416 + }, + { + "auxiliary_loss_clip": 0.06475443, + "auxiliary_loss_mlp": 0.01272781, + "balance_loss_clip": 0.06296027, + "balance_loss_mlp": 0.01256974, + "epoch": 0.3659101157372614, + "flos": 19068475253760.0, + "grad_norm": 1.7755618246241633, + "language_loss": 0.78367889, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86116111, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15802002, + "step": 6086, + "time_per_iteration": 2.6959855556488037 + }, + { + "auxiliary_loss_clip": 0.06460601, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.0628686, + "balance_loss_mlp": 0.01262449, + "epoch": 0.36597023898992936, + "flos": 30818658758400.0, + "grad_norm": 2.7333963743808387, + "language_loss": 0.77419388, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.85157609, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15185547, + "step": 6087, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.06470397, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06288096, + "balance_loss_mlp": 0.0126543, + "epoch": 0.36603036224259733, + "flos": 20528041524480.0, + "grad_norm": 2.0856395013908005, + "language_loss": 0.70779794, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.78531569, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15948486, + "step": 6088, + "time_per_iteration": 2.5904111862182617 + }, + { + "auxiliary_loss_clip": 0.064822, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.06290494, + "balance_loss_mlp": 0.01258835, + "epoch": 0.3660904854952653, + "flos": 38339043356160.0, + "grad_norm": 1.5018444157956148, + "language_loss": 0.8073988, + "learning_rate": 2.92754912981472e-06, + "loss": 0.88499188, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.18273926, + "step": 6089, + "time_per_iteration": 2.695387125015259 + }, + { + "auxiliary_loss_clip": 0.06466638, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06289521, + "balance_loss_mlp": 0.01254065, + "epoch": 0.36615060874793326, + "flos": 21842062053120.0, + "grad_norm": 1.783943984741075, + "language_loss": 0.71745276, + "learning_rate": 2.927204067389884e-06, + "loss": 0.79480195, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14208984, + "step": 6090, + "time_per_iteration": 2.5730583667755127 + }, + { + "auxiliary_loss_clip": 0.06467035, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06292006, + "balance_loss_mlp": 0.01254585, + "epoch": 0.3662107320006012, + "flos": 16587153895680.0, + "grad_norm": 1.8168526275922985, + "language_loss": 0.74269617, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82006675, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.1541748, + "step": 6091, + "time_per_iteration": 2.5094668865203857 + }, + { + "auxiliary_loss_clip": 0.06470925, + "auxiliary_loss_mlp": 0.01271934, + "balance_loss_clip": 0.062924, + "balance_loss_mlp": 0.01256699, + "epoch": 0.3662708552532692, + "flos": 20964469115520.0, + "grad_norm": 2.9410218249320796, + "language_loss": 0.72888803, + "learning_rate": 2.926513837074284e-06, + "loss": 0.80631661, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15234375, + "step": 6092, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.06472248, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01260833, + "epoch": 0.36633097850593715, + "flos": 21908252378880.0, + "grad_norm": 2.382181592286333, + "language_loss": 0.78829455, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.86578685, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.16174316, + "step": 6093, + "time_per_iteration": 2.519925355911255 + }, + { + "auxiliary_loss_clip": 0.06470528, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06288898, + "balance_loss_mlp": 0.0125743, + "epoch": 0.3663911017586051, + "flos": 32862462422400.0, + "grad_norm": 1.6789792555665461, + "language_loss": 0.74561131, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82304573, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15478516, + "step": 6094, + "time_per_iteration": 2.6374077796936035 + }, + { + "auxiliary_loss_clip": 0.06470601, + "auxiliary_loss_mlp": 0.01277645, + "balance_loss_clip": 0.06289363, + "balance_loss_mlp": 0.01261421, + "epoch": 0.3664512250112731, + "flos": 27279132986880.0, + "grad_norm": 1.6273421100585188, + "language_loss": 0.7975142, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87499666, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16223145, + "step": 6095, + "time_per_iteration": 2.565009117126465 + }, + { + "auxiliary_loss_clip": 0.06480707, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06295107, + "balance_loss_mlp": 0.01258552, + "epoch": 0.3665113482639411, + "flos": 17790065510400.0, + "grad_norm": 2.4875649346087725, + "language_loss": 0.73963505, + "learning_rate": 2.925132954945834e-06, + "loss": 0.81719339, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16577148, + "step": 6096, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06474067, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 0.06288943, + "balance_loss_mlp": 0.01255901, + "epoch": 0.36657147151660907, + "flos": 27861944860800.0, + "grad_norm": 1.9533584433338151, + "language_loss": 0.67592847, + "learning_rate": 2.924787646678155e-06, + "loss": 0.75338453, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15649414, + "step": 6097, + "time_per_iteration": 4.085919618606567 + }, + { + "auxiliary_loss_clip": 0.06474558, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06292384, + "balance_loss_mlp": 0.01257204, + "epoch": 0.36663159476927704, + "flos": 25381000846080.0, + "grad_norm": 1.4284875999183062, + "language_loss": 0.77924675, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85672289, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15856934, + "step": 6098, + "time_per_iteration": 4.075935363769531 + }, + { + "auxiliary_loss_clip": 0.06469452, + "auxiliary_loss_mlp": 0.01270135, + "balance_loss_clip": 0.06291129, + "balance_loss_mlp": 0.01254751, + "epoch": 0.366691718021945, + "flos": 21362979934080.0, + "grad_norm": 2.6338542151665862, + "language_loss": 0.73907244, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15386963, + "step": 6099, + "time_per_iteration": 2.5343947410583496 + }, + { + "auxiliary_loss_clip": 0.06462912, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06286579, + "balance_loss_mlp": 0.01256695, + "epoch": 0.36675184127461297, + "flos": 16806017560320.0, + "grad_norm": 1.7024924966611934, + "language_loss": 0.84795189, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.92529464, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14660645, + "step": 6100, + "time_per_iteration": 2.5503897666931152 + }, + { + "auxiliary_loss_clip": 0.06478457, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06293124, + "balance_loss_mlp": 0.0125216, + "epoch": 0.36681196452728093, + "flos": 21912696645120.0, + "grad_norm": 2.268106387872694, + "language_loss": 0.712331, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.78979969, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.16235352, + "step": 6101, + "time_per_iteration": 2.5698294639587402 + }, + { + "auxiliary_loss_clip": 0.06474541, + "auxiliary_loss_mlp": 0.01273553, + "balance_loss_clip": 0.0629383, + "balance_loss_mlp": 0.01257137, + "epoch": 0.3668720877799489, + "flos": 17718215034240.0, + "grad_norm": 2.179497141372214, + "language_loss": 0.76701671, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.84449768, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16418457, + "step": 6102, + "time_per_iteration": 2.653047561645508 + }, + { + "auxiliary_loss_clip": 0.06477299, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.06290299, + "balance_loss_mlp": 0.01262099, + "epoch": 0.36693221103261686, + "flos": 47055882804480.0, + "grad_norm": 1.641444039565929, + "language_loss": 0.70188046, + "learning_rate": 2.922715061101625e-06, + "loss": 0.77944791, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17333984, + "step": 6103, + "time_per_iteration": 2.7502424716949463 + }, + { + "auxiliary_loss_clip": 0.06472746, + "auxiliary_loss_mlp": 0.01272056, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.01255581, + "epoch": 0.3669923342852848, + "flos": 15966383322240.0, + "grad_norm": 1.6662921664183201, + "language_loss": 0.71920598, + "learning_rate": 2.922369507632716e-06, + "loss": 0.79665399, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.16467285, + "step": 6104, + "time_per_iteration": 3.993805408477783 + }, + { + "auxiliary_loss_clip": 0.0647142, + "auxiliary_loss_mlp": 0.01272456, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01256494, + "epoch": 0.3670524575379528, + "flos": 19980630800640.0, + "grad_norm": 1.7978052174853272, + "language_loss": 0.81448174, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.89192045, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15966797, + "step": 6105, + "time_per_iteration": 3.907820463180542 + }, + { + "auxiliary_loss_clip": 0.06477002, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06288886, + "balance_loss_mlp": 0.01254896, + "epoch": 0.36711258079062076, + "flos": 25710092956800.0, + "grad_norm": 1.7139492182529468, + "language_loss": 0.81421959, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89171767, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17919922, + "step": 6106, + "time_per_iteration": 2.5623860359191895 + }, + { + "auxiliary_loss_clip": 0.06422871, + "auxiliary_loss_mlp": 0.01259281, + "balance_loss_clip": 0.06342293, + "balance_loss_mlp": 0.01254903, + "epoch": 0.3671727040432887, + "flos": 60793014648960.0, + "grad_norm": 0.6928078159632836, + "language_loss": 0.59215379, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.66897523, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04385376, + "step": 6107, + "time_per_iteration": 3.2451207637786865 + }, + { + "auxiliary_loss_clip": 0.06468046, + "auxiliary_loss_mlp": 0.01273048, + "balance_loss_clip": 0.06291793, + "balance_loss_mlp": 0.01257396, + "epoch": 0.3672328272959567, + "flos": 18667281104640.0, + "grad_norm": 1.5826982165866754, + "language_loss": 0.74750638, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82491726, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15631104, + "step": 6108, + "time_per_iteration": 2.5317509174346924 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06289458, + "balance_loss_mlp": 0.01260482, + "epoch": 0.3672929505486247, + "flos": 15054395483520.0, + "grad_norm": 2.0251921146130547, + "language_loss": 0.74524188, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.82272649, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.15490723, + "step": 6109, + "time_per_iteration": 2.530214309692383 + }, + { + "auxiliary_loss_clip": 0.06464404, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 0.06286883, + "balance_loss_mlp": 0.0125503, + "epoch": 0.3673530738012927, + "flos": 20594693047680.0, + "grad_norm": 1.6431777634434088, + "language_loss": 0.53560948, + "learning_rate": 2.920295452774744e-06, + "loss": 0.61295497, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15112305, + "step": 6110, + "time_per_iteration": 2.5247035026550293 + }, + { + "auxiliary_loss_clip": 0.06459565, + "auxiliary_loss_mlp": 0.01275062, + "balance_loss_clip": 0.06284792, + "balance_loss_mlp": 0.01258957, + "epoch": 0.36741319705396064, + "flos": 21696348602880.0, + "grad_norm": 1.814369900920369, + "language_loss": 0.80767608, + "learning_rate": 2.919949654746672e-06, + "loss": 0.8850224, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.16088867, + "step": 6111, + "time_per_iteration": 2.6213719844818115 + }, + { + "auxiliary_loss_clip": 0.06459287, + "auxiliary_loss_mlp": 0.01273038, + "balance_loss_clip": 0.06284556, + "balance_loss_mlp": 0.01256861, + "epoch": 0.3674733203066286, + "flos": 29870011958400.0, + "grad_norm": 1.7131296557309772, + "language_loss": 0.72860467, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80592787, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.16174316, + "step": 6112, + "time_per_iteration": 2.656101703643799 + }, + { + "auxiliary_loss_clip": 0.06459092, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06283998, + "balance_loss_mlp": 0.01257866, + "epoch": 0.36753344355929657, + "flos": 18262439303040.0, + "grad_norm": 1.5099687925303509, + "language_loss": 0.85667342, + "learning_rate": 2.919257954049892e-06, + "loss": 0.93399429, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15124512, + "step": 6113, + "time_per_iteration": 2.5230536460876465 + }, + { + "auxiliary_loss_clip": 0.06460717, + "auxiliary_loss_mlp": 0.01276985, + "balance_loss_clip": 0.06281444, + "balance_loss_mlp": 0.01260439, + "epoch": 0.36759356681196453, + "flos": 25308144120960.0, + "grad_norm": 1.9025835930032806, + "language_loss": 0.78706479, + "learning_rate": 2.918912051407413e-06, + "loss": 0.86444181, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.16540527, + "step": 6114, + "time_per_iteration": 2.6091229915618896 + }, + { + "auxiliary_loss_clip": 0.06466475, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.0628548, + "balance_loss_mlp": 0.01255725, + "epoch": 0.3676536900646325, + "flos": 21039338338560.0, + "grad_norm": 1.6305517572579116, + "language_loss": 0.67626929, + "learning_rate": 2.918566113919698e-06, + "loss": 0.75366318, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.17199707, + "step": 6115, + "time_per_iteration": 2.5226221084594727 + }, + { + "auxiliary_loss_clip": 0.06454025, + "auxiliary_loss_mlp": 0.01272139, + "balance_loss_clip": 0.06280309, + "balance_loss_mlp": 0.01257077, + "epoch": 0.36771381331730046, + "flos": 16293882205440.0, + "grad_norm": 2.2835896682412105, + "language_loss": 0.76996851, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.84723008, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15063477, + "step": 6116, + "time_per_iteration": 2.504951238632202 + }, + { + "auxiliary_loss_clip": 0.06459618, + "auxiliary_loss_mlp": 0.01274615, + "balance_loss_clip": 0.06282905, + "balance_loss_mlp": 0.01259153, + "epoch": 0.36777393656996843, + "flos": 22316574124800.0, + "grad_norm": 1.8264539284878285, + "language_loss": 0.62890095, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.70624328, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15454102, + "step": 6117, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.06458353, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06285255, + "balance_loss_mlp": 0.01254749, + "epoch": 0.3678340598226364, + "flos": 26841405657600.0, + "grad_norm": 1.7359331247938332, + "language_loss": 0.73532575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.81261057, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6118, + "time_per_iteration": 2.6261374950408936 + }, + { + "auxiliary_loss_clip": 0.06469986, + "auxiliary_loss_mlp": 0.01276003, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01259707, + "epoch": 0.36789418307530436, + "flos": 21768073297920.0, + "grad_norm": 1.5781425493049515, + "language_loss": 0.73047614, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.80793607, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 6119, + "time_per_iteration": 2.5320048332214355 + }, + { + "auxiliary_loss_clip": 0.06466002, + "auxiliary_loss_mlp": 0.0127303, + "balance_loss_clip": 0.06290065, + "balance_loss_mlp": 0.0125789, + "epoch": 0.3679543063279723, + "flos": 15929598579840.0, + "grad_norm": 2.0565678381587307, + "language_loss": 0.8018201, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.87921047, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15136719, + "step": 6120, + "time_per_iteration": 2.5085418224334717 + }, + { + "auxiliary_loss_clip": 0.06467941, + "auxiliary_loss_mlp": 0.01276389, + "balance_loss_clip": 0.0629365, + "balance_loss_mlp": 0.01260868, + "epoch": 0.3680144295806403, + "flos": 24281693205120.0, + "grad_norm": 2.0719591239633703, + "language_loss": 0.64803445, + "learning_rate": 2.916489757978126e-06, + "loss": 0.72547781, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15515137, + "step": 6121, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.06466727, + "auxiliary_loss_mlp": 0.01268749, + "balance_loss_clip": 0.06293779, + "balance_loss_mlp": 0.01254527, + "epoch": 0.36807455283330826, + "flos": 26111329032960.0, + "grad_norm": 1.9648479350594452, + "language_loss": 0.71416938, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.79152405, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14221191, + "step": 6122, + "time_per_iteration": 2.5836074352264404 + }, + { + "auxiliary_loss_clip": 0.06461313, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.0125831, + "epoch": 0.3681346760859763, + "flos": 24651972397440.0, + "grad_norm": 1.8972357597085572, + "language_loss": 0.69858962, + "learning_rate": 2.915797361163875e-06, + "loss": 0.77593577, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15002441, + "step": 6123, + "time_per_iteration": 2.5574307441711426 + }, + { + "auxiliary_loss_clip": 0.06474412, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06293641, + "balance_loss_mlp": 0.01256094, + "epoch": 0.36819479933864424, + "flos": 23885152957440.0, + "grad_norm": 2.796866262853862, + "language_loss": 0.74766016, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.8251307, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.16540527, + "step": 6124, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.06470435, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.01258116, + "epoch": 0.3682549225913122, + "flos": 25560606072960.0, + "grad_norm": 3.2532876436035236, + "language_loss": 0.74467599, + "learning_rate": 2.915104825441114e-06, + "loss": 0.82212794, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16625977, + "step": 6125, + "time_per_iteration": 2.5822880268096924 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296605, + "balance_loss_mlp": 0.01253967, + "epoch": 0.36831504584398017, + "flos": 16952317989120.0, + "grad_norm": 1.938795434914092, + "language_loss": 0.7843706, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.86184579, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16809082, + "step": 6126, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.06483818, + "auxiliary_loss_mlp": 0.01275366, + "balance_loss_clip": 0.06301596, + "balance_loss_mlp": 0.01257413, + "epoch": 0.36837516909664814, + "flos": 19871198968320.0, + "grad_norm": 2.3034543329783173, + "language_loss": 0.66139042, + "learning_rate": 2.914412150914888e-06, + "loss": 0.73898232, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.17980957, + "step": 6127, + "time_per_iteration": 2.5208253860473633 + }, + { + "auxiliary_loss_clip": 0.06475674, + "auxiliary_loss_mlp": 0.01272228, + "balance_loss_clip": 0.06294744, + "balance_loss_mlp": 0.01256409, + "epoch": 0.3684352923493161, + "flos": 37634976224640.0, + "grad_norm": 1.7597572196634643, + "language_loss": 0.70472896, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.78220791, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.15808105, + "step": 6128, + "time_per_iteration": 2.6984474658966064 + }, + { + "auxiliary_loss_clip": 0.06467833, + "auxiliary_loss_mlp": 0.01270944, + "balance_loss_clip": 0.06293194, + "balance_loss_mlp": 0.01255613, + "epoch": 0.36849541560198407, + "flos": 14470786995840.0, + "grad_norm": 1.6868142680460214, + "language_loss": 0.7591843, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.83657211, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15332031, + "step": 6129, + "time_per_iteration": 2.49924898147583 + }, + { + "auxiliary_loss_clip": 0.06473218, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 0.06296876, + "balance_loss_mlp": 0.01255844, + "epoch": 0.36855553885465203, + "flos": 25777037969280.0, + "grad_norm": 1.6502765336301308, + "language_loss": 0.85087365, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.92831397, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.1496582, + "step": 6130, + "time_per_iteration": 2.604851484298706 + }, + { + "auxiliary_loss_clip": 0.06391466, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06313837, + "balance_loss_mlp": 0.01261091, + "epoch": 0.36861566210732, + "flos": 65071715212800.0, + "grad_norm": 0.7916436629428728, + "language_loss": 0.60275888, + "learning_rate": 2.913026385872321e-06, + "loss": 0.67931175, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02740479, + "step": 6131, + "time_per_iteration": 3.228571891784668 + }, + { + "auxiliary_loss_clip": 0.0647023, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06296837, + "balance_loss_mlp": 0.01255332, + "epoch": 0.36867578535998796, + "flos": 30962108148480.0, + "grad_norm": 1.7580055354180455, + "language_loss": 0.73204952, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.8094579, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.152771, + "step": 6132, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.06478602, + "auxiliary_loss_mlp": 0.01273616, + "balance_loss_clip": 0.0629575, + "balance_loss_mlp": 0.0125738, + "epoch": 0.3687359086126559, + "flos": 28845154270080.0, + "grad_norm": 1.8077518075699008, + "language_loss": 0.7455107, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.82303286, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16235352, + "step": 6133, + "time_per_iteration": 2.6024398803710938 + }, + { + "auxiliary_loss_clip": 0.06463782, + "auxiliary_loss_mlp": 0.0127464, + "balance_loss_clip": 0.06292324, + "balance_loss_mlp": 0.01258618, + "epoch": 0.3687960318653239, + "flos": 21403076912640.0, + "grad_norm": 1.7721182564640174, + "language_loss": 0.7199074, + "learning_rate": 2.911986698512874e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.16027832, + "step": 6134, + "time_per_iteration": 2.646097421646118 + }, + { + "auxiliary_loss_clip": 0.0646476, + "auxiliary_loss_mlp": 0.0126875, + "balance_loss_clip": 0.06289706, + "balance_loss_mlp": 0.01252288, + "epoch": 0.36885615511799186, + "flos": 20272183482240.0, + "grad_norm": 4.124945820193244, + "language_loss": 0.7570188, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83435392, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.16455078, + "step": 6135, + "time_per_iteration": 2.6019539833068848 + }, + { + "auxiliary_loss_clip": 0.06382909, + "auxiliary_loss_mlp": 0.01256883, + "balance_loss_clip": 0.06304377, + "balance_loss_mlp": 0.0125392, + "epoch": 0.3689162783706599, + "flos": 63106317371520.0, + "grad_norm": 0.7816734524389999, + "language_loss": 0.58664352, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.66304147, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.02960205, + "step": 6136, + "time_per_iteration": 3.139789342880249 + }, + { + "auxiliary_loss_clip": 0.06465235, + "auxiliary_loss_mlp": 0.01270986, + "balance_loss_clip": 0.06292487, + "balance_loss_mlp": 0.012563, + "epoch": 0.36897640162332784, + "flos": 10966536593280.0, + "grad_norm": 2.7370945268269806, + "language_loss": 0.79547632, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8728385, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14678955, + "step": 6137, + "time_per_iteration": 3.937328577041626 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271273, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01255764, + "epoch": 0.3690365248759958, + "flos": 20710581644160.0, + "grad_norm": 1.9257362559650297, + "language_loss": 0.74479491, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.82222939, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15515137, + "step": 6138, + "time_per_iteration": 4.004723072052002 + }, + { + "auxiliary_loss_clip": 0.06475753, + "auxiliary_loss_mlp": 0.01270871, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01255827, + "epoch": 0.3690966481286638, + "flos": 31833495884160.0, + "grad_norm": 1.986271481109943, + "language_loss": 0.65762347, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.73508972, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1505127, + "step": 6139, + "time_per_iteration": 2.621832847595215 + }, + { + "auxiliary_loss_clip": 0.06460394, + "auxiliary_loss_mlp": 0.01271698, + "balance_loss_clip": 0.06290884, + "balance_loss_mlp": 0.0125626, + "epoch": 0.36915677138133174, + "flos": 13119897870720.0, + "grad_norm": 1.9334180469367421, + "language_loss": 0.72060692, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7979278, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15429688, + "step": 6140, + "time_per_iteration": 2.542410135269165 + }, + { + "auxiliary_loss_clip": 0.06370358, + "auxiliary_loss_mlp": 0.01255246, + "balance_loss_clip": 0.06292184, + "balance_loss_mlp": 0.01252388, + "epoch": 0.3692168946339997, + "flos": 68707926996480.0, + "grad_norm": 0.7297912869343693, + "language_loss": 0.59210759, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.66836369, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02853394, + "step": 6141, + "time_per_iteration": 3.242342710494995 + }, + { + "auxiliary_loss_clip": 0.06465677, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06289662, + "balance_loss_mlp": 0.01256336, + "epoch": 0.36927701788666767, + "flos": 22024392537600.0, + "grad_norm": 1.6449420117919953, + "language_loss": 0.75489783, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.16149902, + "step": 6142, + "time_per_iteration": 2.552541732788086 + }, + { + "auxiliary_loss_clip": 0.06459697, + "auxiliary_loss_mlp": 0.01271426, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01256883, + "epoch": 0.36933714113933563, + "flos": 21842103980160.0, + "grad_norm": 2.1834908331499694, + "language_loss": 0.77180201, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84911323, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14544678, + "step": 6143, + "time_per_iteration": 3.990859031677246 + }, + { + "auxiliary_loss_clip": 0.06461622, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06289461, + "balance_loss_mlp": 0.01251565, + "epoch": 0.3693972643920036, + "flos": 23697749301120.0, + "grad_norm": 1.9416354027972629, + "language_loss": 0.82307315, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.9003436, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13867188, + "step": 6144, + "time_per_iteration": 2.5504705905914307 + }, + { + "auxiliary_loss_clip": 0.06462898, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06287374, + "balance_loss_mlp": 0.01255586, + "epoch": 0.36945738764467156, + "flos": 22863355943040.0, + "grad_norm": 2.172105123479451, + "language_loss": 0.78995448, + "learning_rate": 2.908171851365593e-06, + "loss": 0.86728209, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14282227, + "step": 6145, + "time_per_iteration": 3.9733781814575195 + }, + { + "auxiliary_loss_clip": 0.06468924, + "auxiliary_loss_mlp": 0.01271457, + "balance_loss_clip": 0.06291068, + "balance_loss_mlp": 0.01256067, + "epoch": 0.36951751089733953, + "flos": 16621213380480.0, + "grad_norm": 1.6722610276638135, + "language_loss": 0.77129662, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8487004, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15380859, + "step": 6146, + "time_per_iteration": 2.5411174297332764 + }, + { + "auxiliary_loss_clip": 0.06466483, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 0.06289164, + "balance_loss_mlp": 0.01260419, + "epoch": 0.3695776341500075, + "flos": 18920204254080.0, + "grad_norm": 1.6293394058894772, + "language_loss": 0.81346822, + "learning_rate": 2.907477794586761e-06, + "loss": 0.89089251, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1550293, + "step": 6147, + "time_per_iteration": 2.5456924438476562 + }, + { + "auxiliary_loss_clip": 0.06463629, + "auxiliary_loss_mlp": 0.01275917, + "balance_loss_clip": 0.06286413, + "balance_loss_mlp": 0.01261684, + "epoch": 0.36963775740267546, + "flos": 20813892128640.0, + "grad_norm": 1.8090658573318705, + "language_loss": 0.83484954, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14227295, + "step": 6148, + "time_per_iteration": 2.6318178176879883 + }, + { + "auxiliary_loss_clip": 0.06458767, + "auxiliary_loss_mlp": 0.01266964, + "balance_loss_clip": 0.06284354, + "balance_loss_mlp": 0.01252814, + "epoch": 0.3696978806553435, + "flos": 26068087526400.0, + "grad_norm": 2.191330684134815, + "language_loss": 0.74277508, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.82003242, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14147949, + "step": 6149, + "time_per_iteration": 2.6037940979003906 + }, + { + "auxiliary_loss_clip": 0.06464496, + "auxiliary_loss_mlp": 0.01271867, + "balance_loss_clip": 0.06287233, + "balance_loss_mlp": 0.01256203, + "epoch": 0.36975800390801145, + "flos": 26841237949440.0, + "grad_norm": 2.856714094904378, + "language_loss": 0.71066409, + "learning_rate": 2.906436451364054e-06, + "loss": 0.78802776, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15667725, + "step": 6150, + "time_per_iteration": 2.612860918045044 + }, + { + "auxiliary_loss_clip": 0.06457143, + "auxiliary_loss_mlp": 0.01270306, + "balance_loss_clip": 0.06283612, + "balance_loss_mlp": 0.01256341, + "epoch": 0.3698181271606794, + "flos": 21149063660160.0, + "grad_norm": 1.8423166255946122, + "language_loss": 0.81970799, + "learning_rate": 2.906089268194611e-06, + "loss": 0.89698249, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1395874, + "step": 6151, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.0635625, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06277541, + "balance_loss_mlp": 0.01262752, + "epoch": 0.3698782504133474, + "flos": 66761605958400.0, + "grad_norm": 0.7660918799950965, + "language_loss": 0.63089043, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.70711315, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03274536, + "step": 6152, + "time_per_iteration": 3.27481746673584 + }, + { + "auxiliary_loss_clip": 0.06456928, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289765, + "balance_loss_mlp": 0.01256709, + "epoch": 0.36993837366601534, + "flos": 24317597479680.0, + "grad_norm": 2.4460843976292455, + "language_loss": 0.7067228, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.78398836, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.12921143, + "step": 6153, + "time_per_iteration": 2.561366319656372 + }, + { + "auxiliary_loss_clip": 0.06461591, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.0125796, + "epoch": 0.3699984969186833, + "flos": 24355472325120.0, + "grad_norm": 1.7390512131477307, + "language_loss": 0.72820848, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.80554867, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14459229, + "step": 6154, + "time_per_iteration": 2.6359784603118896 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06290819, + "balance_loss_mlp": 0.01256468, + "epoch": 0.37005862017135127, + "flos": 19835378547840.0, + "grad_norm": 1.7720975153034155, + "language_loss": 0.68251342, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.75985944, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1361084, + "step": 6155, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.06462097, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06290478, + "balance_loss_mlp": 0.01261551, + "epoch": 0.37011874342401924, + "flos": 19579981703040.0, + "grad_norm": 1.763175663447542, + "language_loss": 0.68228447, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.75965828, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13745117, + "step": 6156, + "time_per_iteration": 2.5805797576904297 + }, + { + "auxiliary_loss_clip": 0.06460856, + "auxiliary_loss_mlp": 0.01276122, + "balance_loss_clip": 0.06292138, + "balance_loss_mlp": 0.01263051, + "epoch": 0.3701788666766872, + "flos": 20380315576320.0, + "grad_norm": 2.4756712581972673, + "language_loss": 0.82280111, + "learning_rate": 2.904005448099916e-06, + "loss": 0.9001708, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13061523, + "step": 6157, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.06472905, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 0.06294029, + "balance_loss_mlp": 0.0126136, + "epoch": 0.37023898992935517, + "flos": 15346325508480.0, + "grad_norm": 2.1879647979069055, + "language_loss": 0.77007514, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15142822, + "step": 6158, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.06472066, + "auxiliary_loss_mlp": 0.01273585, + "balance_loss_clip": 0.0629342, + "balance_loss_mlp": 0.0125872, + "epoch": 0.37029911318202313, + "flos": 19580149411200.0, + "grad_norm": 1.9796058392103062, + "language_loss": 0.68833315, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.76578963, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14880371, + "step": 6159, + "time_per_iteration": 2.4941582679748535 + }, + { + "auxiliary_loss_clip": 0.06464109, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 0.06292266, + "balance_loss_mlp": 0.01261986, + "epoch": 0.3703592364346911, + "flos": 26220509303040.0, + "grad_norm": 1.9367461088396363, + "language_loss": 0.71322787, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.79061961, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13079834, + "step": 6160, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.06465742, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06295532, + "balance_loss_mlp": 0.0125958, + "epoch": 0.37041935968735906, + "flos": 20054619555840.0, + "grad_norm": 1.6534007301448785, + "language_loss": 0.78978807, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86717302, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1317749, + "step": 6161, + "time_per_iteration": 2.5337588787078857 + }, + { + "auxiliary_loss_clip": 0.06465232, + "auxiliary_loss_mlp": 0.01270423, + "balance_loss_clip": 0.06291839, + "balance_loss_mlp": 0.01255837, + "epoch": 0.3704794829400271, + "flos": 24140633656320.0, + "grad_norm": 1.7631614273732186, + "language_loss": 0.79746109, + "learning_rate": 2.902267988534295e-06, + "loss": 0.87481761, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14587402, + "step": 6162, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.06466715, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 0.06292939, + "balance_loss_mlp": 0.01260717, + "epoch": 0.37053960619269505, + "flos": 14872232707200.0, + "grad_norm": 1.8866019587111915, + "language_loss": 0.80318987, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.88060015, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13580322, + "step": 6163, + "time_per_iteration": 2.501971483230591 + }, + { + "auxiliary_loss_clip": 0.06466764, + "auxiliary_loss_mlp": 0.01273928, + "balance_loss_clip": 0.0629348, + "balance_loss_mlp": 0.01260315, + "epoch": 0.370599729445363, + "flos": 21367969251840.0, + "grad_norm": 1.81392406825425, + "language_loss": 0.68857837, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.76598537, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13598633, + "step": 6164, + "time_per_iteration": 2.557870388031006 + }, + { + "auxiliary_loss_clip": 0.06463528, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.0126064, + "epoch": 0.370659852698031, + "flos": 26835535872000.0, + "grad_norm": 2.3609289004256984, + "language_loss": 0.83364576, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.91103643, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14916992, + "step": 6165, + "time_per_iteration": 2.5597267150878906 + }, + { + "auxiliary_loss_clip": 0.06475651, + "auxiliary_loss_mlp": 0.01276631, + "balance_loss_clip": 0.06294797, + "balance_loss_mlp": 0.01261086, + "epoch": 0.37071997595069894, + "flos": 19105050360960.0, + "grad_norm": 1.8212520052796557, + "language_loss": 0.69703627, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.77455908, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15551758, + "step": 6166, + "time_per_iteration": 2.7443737983703613 + }, + { + "auxiliary_loss_clip": 0.06351966, + "auxiliary_loss_mlp": 0.01259396, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01255936, + "epoch": 0.3707800992033669, + "flos": 52193839461120.0, + "grad_norm": 0.7767712005900987, + "language_loss": 0.55992532, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.6360389, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.03469849, + "step": 6167, + "time_per_iteration": 3.122786045074463 + }, + { + "auxiliary_loss_clip": 0.06470326, + "auxiliary_loss_mlp": 0.01270542, + "balance_loss_clip": 0.06298738, + "balance_loss_mlp": 0.01256553, + "epoch": 0.3708402224560349, + "flos": 19908025637760.0, + "grad_norm": 1.887650816435161, + "language_loss": 0.75851792, + "learning_rate": 2.900181908135584e-06, + "loss": 0.83592659, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13983154, + "step": 6168, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.06462339, + "auxiliary_loss_mlp": 0.01269774, + "balance_loss_clip": 0.0628986, + "balance_loss_mlp": 0.01255833, + "epoch": 0.37090034570870284, + "flos": 20013222839040.0, + "grad_norm": 1.688087532093935, + "language_loss": 0.74697542, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82429659, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13946533, + "step": 6169, + "time_per_iteration": 2.571059226989746 + }, + { + "auxiliary_loss_clip": 0.06462043, + "auxiliary_loss_mlp": 0.01269285, + "balance_loss_clip": 0.06291892, + "balance_loss_mlp": 0.0125526, + "epoch": 0.3709604689613708, + "flos": 24141681832320.0, + "grad_norm": 1.6120375976718775, + "language_loss": 0.79462636, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87193966, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14007568, + "step": 6170, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.06461793, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06289523, + "balance_loss_mlp": 0.01254183, + "epoch": 0.37102059221403877, + "flos": 23882469626880.0, + "grad_norm": 1.7170622011660002, + "language_loss": 0.76363444, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.84094131, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14703369, + "step": 6171, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.06464403, + "auxiliary_loss_mlp": 0.01269741, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.0125568, + "epoch": 0.37108071546670673, + "flos": 14506439708160.0, + "grad_norm": 2.2434941236901222, + "language_loss": 0.80974334, + "learning_rate": 2.898790504994232e-06, + "loss": 0.88708472, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.140625, + "step": 6172, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.06468061, + "auxiliary_loss_mlp": 0.01272991, + "balance_loss_clip": 0.06291698, + "balance_loss_mlp": 0.01258352, + "epoch": 0.3711408387193747, + "flos": 34570172160000.0, + "grad_norm": 1.701200983183655, + "language_loss": 0.59536189, + "learning_rate": 2.89844256897035e-06, + "loss": 0.67277241, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14648438, + "step": 6173, + "time_per_iteration": 2.68860125541687 + }, + { + "auxiliary_loss_clip": 0.06465948, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06291407, + "balance_loss_mlp": 0.01252825, + "epoch": 0.37120096197204266, + "flos": 17316350052480.0, + "grad_norm": 3.482738270256764, + "language_loss": 0.81161231, + "learning_rate": 2.898094598877435e-06, + "loss": 0.88894391, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1439209, + "step": 6174, + "time_per_iteration": 2.498631238937378 + }, + { + "auxiliary_loss_clip": 0.06459825, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06290745, + "balance_loss_mlp": 0.01253826, + "epoch": 0.37126108522471063, + "flos": 30671855205120.0, + "grad_norm": 1.7762050826086826, + "language_loss": 0.79733562, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.87460476, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13275146, + "step": 6175, + "time_per_iteration": 2.6155989170074463 + }, + { + "auxiliary_loss_clip": 0.06469794, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06296568, + "balance_loss_mlp": 0.01253926, + "epoch": 0.37132120847737865, + "flos": 25162682232960.0, + "grad_norm": 2.183025760433602, + "language_loss": 0.8886646, + "learning_rate": 2.89739855653729e-06, + "loss": 0.96605068, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14880371, + "step": 6176, + "time_per_iteration": 3.9855380058288574 + }, + { + "auxiliary_loss_clip": 0.06463525, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.01252331, + "epoch": 0.3713813317300466, + "flos": 21219572471040.0, + "grad_norm": 1.8377156327305517, + "language_loss": 0.73693877, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.8142367, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13952637, + "step": 6177, + "time_per_iteration": 2.584007501602173 + }, + { + "auxiliary_loss_clip": 0.06460603, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3714414549827146, + "flos": 21623114534400.0, + "grad_norm": 3.348536242845292, + "language_loss": 0.75657964, + "learning_rate": 2.896702378079374e-06, + "loss": 0.83389515, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14227295, + "step": 6178, + "time_per_iteration": 4.047810077667236 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01253796, + "epoch": 0.37150157823538255, + "flos": 19978073251200.0, + "grad_norm": 1.677068577007521, + "language_loss": 0.7243154, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14465332, + "step": 6179, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.06464912, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06289817, + "balance_loss_mlp": 0.01254506, + "epoch": 0.3715617014880505, + "flos": 24867020701440.0, + "grad_norm": 1.5744290711880986, + "language_loss": 0.70164317, + "learning_rate": 2.896006063609283e-06, + "loss": 0.77898097, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14361572, + "step": 6180, + "time_per_iteration": 2.564251661300659 + }, + { + "auxiliary_loss_clip": 0.06459807, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.0628929, + "balance_loss_mlp": 0.01255173, + "epoch": 0.3716218247407185, + "flos": 20455352507520.0, + "grad_norm": 1.6669585833251956, + "language_loss": 0.78357702, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.86087286, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6181, + "time_per_iteration": 2.5857934951782227 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 0.06286183, + "balance_loss_mlp": 0.01254195, + "epoch": 0.37168194799338644, + "flos": 24140256312960.0, + "grad_norm": 1.7806049549646892, + "language_loss": 0.78926349, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.86651719, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14520264, + "step": 6182, + "time_per_iteration": 2.572563409805298 + }, + { + "auxiliary_loss_clip": 0.0637676, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 0.06297279, + "balance_loss_mlp": 0.01253508, + "epoch": 0.3717420712460544, + "flos": 67429601107200.0, + "grad_norm": 0.7782169453066291, + "language_loss": 0.57265592, + "learning_rate": 2.894961337112362e-06, + "loss": 0.64899027, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.03170776, + "step": 6183, + "time_per_iteration": 4.616533279418945 + }, + { + "auxiliary_loss_clip": 0.06460768, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01258059, + "epoch": 0.37180219449872237, + "flos": 22382512888320.0, + "grad_norm": 2.288371354177028, + "language_loss": 0.77116179, + "learning_rate": 2.894613027055066e-06, + "loss": 0.84849966, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.1496582, + "step": 6184, + "time_per_iteration": 2.5182292461395264 + }, + { + "auxiliary_loss_clip": 0.06457444, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.0628842, + "balance_loss_mlp": 0.01255739, + "epoch": 0.37186231775139034, + "flos": 21876037683840.0, + "grad_norm": 2.2342830987852023, + "language_loss": 0.72608167, + "learning_rate": 2.894264683073954e-06, + "loss": 0.80335367, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14007568, + "step": 6185, + "time_per_iteration": 3.928272247314453 + }, + { + "auxiliary_loss_clip": 0.06453837, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06286646, + "balance_loss_mlp": 0.01253075, + "epoch": 0.3719224410040583, + "flos": 22421142420480.0, + "grad_norm": 1.6056881027286982, + "language_loss": 0.77329034, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85050094, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14160156, + "step": 6186, + "time_per_iteration": 2.549499988555908 + }, + { + "auxiliary_loss_clip": 0.0646092, + "auxiliary_loss_mlp": 0.01274226, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01258121, + "epoch": 0.37198256425672627, + "flos": 25157525207040.0, + "grad_norm": 1.8763954627941488, + "language_loss": 0.84227252, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.91962403, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.16101074, + "step": 6187, + "time_per_iteration": 2.542978048324585 + }, + { + "auxiliary_loss_clip": 0.06456143, + "auxiliary_loss_mlp": 0.01269651, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255919, + "epoch": 0.37204268750939423, + "flos": 21144032415360.0, + "grad_norm": 2.100791898470326, + "language_loss": 0.84696567, + "learning_rate": 2.893219447719824e-06, + "loss": 0.9242236, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13726807, + "step": 6188, + "time_per_iteration": 2.626126766204834 + }, + { + "auxiliary_loss_clip": 0.06458837, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06288396, + "balance_loss_mlp": 0.01256232, + "epoch": 0.37210281076206225, + "flos": 21513221504640.0, + "grad_norm": 2.2586863759616564, + "language_loss": 0.66390121, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.74118853, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13653564, + "step": 6189, + "time_per_iteration": 2.5793135166168213 + }, + { + "auxiliary_loss_clip": 0.06460261, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.0628726, + "balance_loss_mlp": 0.01255926, + "epoch": 0.3721629340147302, + "flos": 17353595992320.0, + "grad_norm": 2.971940637043147, + "language_loss": 0.84218514, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.91950166, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15466309, + "step": 6190, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.06464738, + "auxiliary_loss_mlp": 0.01270544, + "balance_loss_clip": 0.06287063, + "balance_loss_mlp": 0.01255905, + "epoch": 0.3722230572673982, + "flos": 16437457376640.0, + "grad_norm": 2.7368484374177076, + "language_loss": 0.89274895, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.97010183, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.14648438, + "step": 6191, + "time_per_iteration": 2.4786319732666016 + }, + { + "auxiliary_loss_clip": 0.06463645, + "auxiliary_loss_mlp": 0.01271285, + "balance_loss_clip": 0.06286322, + "balance_loss_mlp": 0.01254465, + "epoch": 0.37228318052006615, + "flos": 22681360874880.0, + "grad_norm": 2.1321020045013577, + "language_loss": 0.74374199, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82109123, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.16790771, + "step": 6192, + "time_per_iteration": 2.6107547283172607 + }, + { + "auxiliary_loss_clip": 0.06461145, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06288278, + "balance_loss_mlp": 0.0125493, + "epoch": 0.3723433037727341, + "flos": 25272617189760.0, + "grad_norm": 2.3785606336548124, + "language_loss": 0.79934001, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.87664223, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14154053, + "step": 6193, + "time_per_iteration": 2.5584514141082764 + }, + { + "auxiliary_loss_clip": 0.06469596, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 0.06293128, + "balance_loss_mlp": 0.01255594, + "epoch": 0.3724034270254021, + "flos": 10529228534400.0, + "grad_norm": 1.7620775512614164, + "language_loss": 0.84889179, + "learning_rate": 2.891128062852194e-06, + "loss": 0.92628884, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14526367, + "step": 6194, + "time_per_iteration": 2.5419061183929443 + }, + { + "auxiliary_loss_clip": 0.06460975, + "auxiliary_loss_mlp": 0.01266847, + "balance_loss_clip": 0.06288271, + "balance_loss_mlp": 0.01253317, + "epoch": 0.37246355027807004, + "flos": 20272393117440.0, + "grad_norm": 2.226391461709797, + "language_loss": 0.78030515, + "learning_rate": 2.890779380359646e-06, + "loss": 0.85758334, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13543701, + "step": 6195, + "time_per_iteration": 2.51361346244812 + }, + { + "auxiliary_loss_clip": 0.06459115, + "auxiliary_loss_mlp": 0.01274112, + "balance_loss_clip": 0.06288831, + "balance_loss_mlp": 0.01258955, + "epoch": 0.372523673530738, + "flos": 19506705707520.0, + "grad_norm": 1.8216220923823887, + "language_loss": 0.79924363, + "learning_rate": 2.890430664088655e-06, + "loss": 0.87657595, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15155029, + "step": 6196, + "time_per_iteration": 2.6005568504333496 + }, + { + "auxiliary_loss_clip": 0.06458211, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01256888, + "epoch": 0.372583796783406, + "flos": 16769945577600.0, + "grad_norm": 2.2795878215352396, + "language_loss": 0.84059894, + "learning_rate": 2.890081914052443e-06, + "loss": 0.91788948, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13952637, + "step": 6197, + "time_per_iteration": 2.538058042526245 + }, + { + "auxiliary_loss_clip": 0.06456813, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06289704, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37264392003607394, + "flos": 22644576132480.0, + "grad_norm": 1.7143100919816474, + "language_loss": 0.64964151, + "learning_rate": 2.889733130264237e-06, + "loss": 0.72691035, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14971924, + "step": 6198, + "time_per_iteration": 2.5891072750091553 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 0.0628581, + "balance_loss_mlp": 0.01258367, + "epoch": 0.3727040432887419, + "flos": 19979037573120.0, + "grad_norm": 1.4303592099178044, + "language_loss": 0.74534631, + "learning_rate": 2.889384312737261e-06, + "loss": 0.82261217, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13977051, + "step": 6199, + "time_per_iteration": 2.5612289905548096 + }, + { + "auxiliary_loss_clip": 0.06453978, + "auxiliary_loss_mlp": 0.01269323, + "balance_loss_clip": 0.06284302, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37276416654140987, + "flos": 63911906853120.0, + "grad_norm": 1.6001689252403943, + "language_loss": 0.81250614, + "learning_rate": 2.889035461484742e-06, + "loss": 0.88973916, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14227295, + "step": 6200, + "time_per_iteration": 2.9802377223968506 + }, + { + "auxiliary_loss_clip": 0.06452343, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.0125907, + "epoch": 0.37282428979407783, + "flos": 39795381244800.0, + "grad_norm": 2.0282879733455776, + "language_loss": 0.61128068, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68853581, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.6786048412323 + }, + { + "auxiliary_loss_clip": 0.06460309, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.06284842, + "balance_loss_mlp": 0.01257166, + "epoch": 0.37288441304674586, + "flos": 22715336505600.0, + "grad_norm": 1.562126243298772, + "language_loss": 0.73424393, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.81156611, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14746094, + "step": 6202, + "time_per_iteration": 2.5774593353271484 + }, + { + "auxiliary_loss_clip": 0.06450565, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01253697, + "epoch": 0.3729445362994138, + "flos": 18776209812480.0, + "grad_norm": 3.8476229642649895, + "language_loss": 0.73690808, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.81410116, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1505127, + "step": 6203, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.06448745, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01253402, + "epoch": 0.3730046595520818, + "flos": 22462874553600.0, + "grad_norm": 1.6222639611717555, + "language_loss": 0.82113981, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.89829516, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13391113, + "step": 6204, + "time_per_iteration": 2.5474419593811035 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01267649, + "balance_loss_clip": 0.06282973, + "balance_loss_mlp": 0.01253094, + "epoch": 0.37306478280474975, + "flos": 24323257630080.0, + "grad_norm": 1.5013454609640156, + "language_loss": 0.75699729, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14562988, + "step": 6205, + "time_per_iteration": 2.5284838676452637 + }, + { + "auxiliary_loss_clip": 0.06453846, + "auxiliary_loss_mlp": 0.01269403, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3731249060574177, + "flos": 15820627944960.0, + "grad_norm": 2.409990557003708, + "language_loss": 0.78042793, + "learning_rate": 2.886941646474128e-06, + "loss": 0.85766041, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14868164, + "step": 6206, + "time_per_iteration": 2.5130996704101562 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01253085, + "epoch": 0.3731850293100857, + "flos": 19834120736640.0, + "grad_norm": 3.8358433201526334, + "language_loss": 0.93966329, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01690984, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15734863, + "step": 6207, + "time_per_iteration": 2.4994020462036133 + }, + { + "auxiliary_loss_clip": 0.06459471, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06283936, + "balance_loss_mlp": 0.01254561, + "epoch": 0.37324515256275365, + "flos": 19068349472640.0, + "grad_norm": 2.1400449567396826, + "language_loss": 0.82643408, + "learning_rate": 2.886243438932759e-06, + "loss": 0.90372002, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14575195, + "step": 6208, + "time_per_iteration": 2.5359628200531006 + }, + { + "auxiliary_loss_clip": 0.06460227, + "auxiliary_loss_mlp": 0.01272188, + "balance_loss_clip": 0.06285752, + "balance_loss_mlp": 0.01255904, + "epoch": 0.3733052758154216, + "flos": 20710623571200.0, + "grad_norm": 2.148305950788212, + "language_loss": 0.73528939, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.81261349, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1628418, + "step": 6209, + "time_per_iteration": 2.499209403991699 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01273959, + "balance_loss_clip": 0.06285547, + "balance_loss_mlp": 0.01258593, + "epoch": 0.3733653990680896, + "flos": 20199704100480.0, + "grad_norm": 2.014449395888949, + "language_loss": 0.71212471, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.78942245, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.15368652, + "step": 6210, + "time_per_iteration": 2.5324270725250244 + }, + { + "auxiliary_loss_clip": 0.06468424, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.06295058, + "balance_loss_mlp": 0.01253631, + "epoch": 0.37342552232075754, + "flos": 20345920675200.0, + "grad_norm": 1.543701660359285, + "language_loss": 0.7823801, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.85975003, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.1494751, + "step": 6211, + "time_per_iteration": 2.5388078689575195 + }, + { + "auxiliary_loss_clip": 0.06464606, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06291494, + "balance_loss_mlp": 0.0125347, + "epoch": 0.3734856455734255, + "flos": 35526701243520.0, + "grad_norm": 1.6765525733287814, + "language_loss": 0.73612988, + "learning_rate": 2.884846620678668e-06, + "loss": 0.81346345, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15283203, + "step": 6212, + "time_per_iteration": 2.663950204849243 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06294222, + "balance_loss_mlp": 0.01256345, + "epoch": 0.37354576882609347, + "flos": 21148686316800.0, + "grad_norm": 1.865900947954382, + "language_loss": 0.82430422, + "learning_rate": 2.884497332198356e-06, + "loss": 0.90180945, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16455078, + "step": 6213, + "time_per_iteration": 2.541431427001953 + }, + { + "auxiliary_loss_clip": 0.06467836, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.01255623, + "epoch": 0.37360589207876144, + "flos": 21513179577600.0, + "grad_norm": 2.345206885791162, + "language_loss": 0.7896657, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.86705506, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15466309, + "step": 6214, + "time_per_iteration": 2.545792579650879 + }, + { + "auxiliary_loss_clip": 0.06466322, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06297071, + "balance_loss_mlp": 0.01255981, + "epoch": 0.37366601533142946, + "flos": 38444953317120.0, + "grad_norm": 1.6116656191599898, + "language_loss": 0.85112274, + "learning_rate": 2.883798654630296e-06, + "loss": 0.92849338, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14758301, + "step": 6215, + "time_per_iteration": 2.70700740814209 + }, + { + "auxiliary_loss_clip": 0.06472297, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06296762, + "balance_loss_mlp": 0.01254044, + "epoch": 0.3737261385840974, + "flos": 18446908066560.0, + "grad_norm": 1.6510257786225762, + "language_loss": 0.6833967, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.76082057, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16040039, + "step": 6216, + "time_per_iteration": 3.941821575164795 + }, + { + "auxiliary_loss_clip": 0.06466141, + "auxiliary_loss_mlp": 0.01276294, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01260224, + "epoch": 0.3737862618367654, + "flos": 22936506157440.0, + "grad_norm": 2.1208446300989983, + "language_loss": 0.6621505, + "learning_rate": 2.883099843007303e-06, + "loss": 0.73957485, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.1607666, + "step": 6217, + "time_per_iteration": 4.067852258682251 + }, + { + "auxiliary_loss_clip": 0.06468368, + "auxiliary_loss_mlp": 0.01272371, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.0125772, + "epoch": 0.37384638508943335, + "flos": 15414360624000.0, + "grad_norm": 1.5564133784357135, + "language_loss": 0.80760753, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.1463623, + "step": 6218, + "time_per_iteration": 2.5253372192382812 + }, + { + "auxiliary_loss_clip": 0.06465785, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06298652, + "balance_loss_mlp": 0.01256661, + "epoch": 0.3739065083421013, + "flos": 24287856480000.0, + "grad_norm": 2.4835018506755566, + "language_loss": 0.79185957, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86923778, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.15380859, + "step": 6219, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.06464131, + "auxiliary_loss_mlp": 0.01274727, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260177, + "epoch": 0.3739666315947693, + "flos": 23009488663680.0, + "grad_norm": 2.098390778414135, + "language_loss": 0.77614415, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.85353279, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14538574, + "step": 6220, + "time_per_iteration": 2.5899298191070557 + }, + { + "auxiliary_loss_clip": 0.06466513, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.0125541, + "epoch": 0.37402675484743725, + "flos": 19397231948160.0, + "grad_norm": 1.5821121915867322, + "language_loss": 0.83564717, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91301888, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15240479, + "step": 6221, + "time_per_iteration": 2.540102481842041 + }, + { + "auxiliary_loss_clip": 0.06464627, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06293692, + "balance_loss_mlp": 0.01262647, + "epoch": 0.3740868781001052, + "flos": 17131420091520.0, + "grad_norm": 1.6401420513761291, + "language_loss": 0.76738596, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84480345, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14477539, + "step": 6222, + "time_per_iteration": 4.020254850387573 + }, + { + "auxiliary_loss_clip": 0.06466988, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06296736, + "balance_loss_mlp": 0.01263467, + "epoch": 0.3741470013527732, + "flos": 20049001332480.0, + "grad_norm": 1.799306271558528, + "language_loss": 0.70768011, + "learning_rate": 2.881002604868789e-06, + "loss": 0.785128, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14349365, + "step": 6223, + "time_per_iteration": 2.6146726608276367 + }, + { + "auxiliary_loss_clip": 0.0646846, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01258954, + "epoch": 0.37420712460544114, + "flos": 36905151162240.0, + "grad_norm": 1.9191598081110601, + "language_loss": 0.69292819, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77033412, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1317749, + "step": 6224, + "time_per_iteration": 4.144296407699585 + }, + { + "auxiliary_loss_clip": 0.06463895, + "auxiliary_loss_mlp": 0.01274949, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260126, + "epoch": 0.3742672478581091, + "flos": 22207896979200.0, + "grad_norm": 1.811742579086715, + "language_loss": 0.70166373, + "learning_rate": 2.880303258086228e-06, + "loss": 0.77905214, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14819336, + "step": 6225, + "time_per_iteration": 2.562023162841797 + }, + { + "auxiliary_loss_clip": 0.06462345, + "auxiliary_loss_mlp": 0.0127698, + "balance_loss_clip": 0.06296264, + "balance_loss_mlp": 0.01262257, + "epoch": 0.3743273711107771, + "flos": 24688547504640.0, + "grad_norm": 2.0306145345851614, + "language_loss": 0.79386592, + "learning_rate": 2.879953534616536e-06, + "loss": 0.87125921, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14715576, + "step": 6226, + "time_per_iteration": 2.5372707843780518 + }, + { + "auxiliary_loss_clip": 0.06464548, + "auxiliary_loss_mlp": 0.01273743, + "balance_loss_clip": 0.0629389, + "balance_loss_mlp": 0.01259021, + "epoch": 0.37438749436344504, + "flos": 24466078114560.0, + "grad_norm": 1.6346435650910545, + "language_loss": 0.68240035, + "learning_rate": 2.879603777778917e-06, + "loss": 0.75978327, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14733887, + "step": 6227, + "time_per_iteration": 2.5752079486846924 + }, + { + "auxiliary_loss_clip": 0.06464467, + "auxiliary_loss_mlp": 0.01270066, + "balance_loss_clip": 0.06297411, + "balance_loss_mlp": 0.0125588, + "epoch": 0.374447617616113, + "flos": 21805193456640.0, + "grad_norm": 1.6298548281431393, + "language_loss": 0.83520573, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91255105, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14190674, + "step": 6228, + "time_per_iteration": 2.605607748031616 + }, + { + "auxiliary_loss_clip": 0.06458256, + "auxiliary_loss_mlp": 0.01270458, + "balance_loss_clip": 0.06288552, + "balance_loss_mlp": 0.01256033, + "epoch": 0.374507740868781, + "flos": 17974073076480.0, + "grad_norm": 1.5343038876343353, + "language_loss": 0.75450277, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83178985, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14428711, + "step": 6229, + "time_per_iteration": 2.607506036758423 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.012714, + "balance_loss_clip": 0.06293011, + "balance_loss_mlp": 0.01256249, + "epoch": 0.374567864121449, + "flos": 16111132450560.0, + "grad_norm": 3.0205318355467083, + "language_loss": 0.84065855, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.91801792, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15142822, + "step": 6230, + "time_per_iteration": 2.4964523315429688 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01275239, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01259569, + "epoch": 0.37462798737411696, + "flos": 25779847080960.0, + "grad_norm": 1.7178487844900587, + "language_loss": 0.73793018, + "learning_rate": 2.878204417014456e-06, + "loss": 0.81532168, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15667725, + "step": 6231, + "time_per_iteration": 2.589771270751953 + }, + { + "auxiliary_loss_clip": 0.06465879, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01255298, + "epoch": 0.3746881106267849, + "flos": 16660136401920.0, + "grad_norm": 1.8762806294571872, + "language_loss": 0.74086344, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.81822443, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14929199, + "step": 6232, + "time_per_iteration": 2.483219861984253 + }, + { + "auxiliary_loss_clip": 0.06463014, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.0125605, + "epoch": 0.3747482338794529, + "flos": 26185317788160.0, + "grad_norm": 1.743409558247901, + "language_loss": 0.77404612, + "learning_rate": 2.877504536769561e-06, + "loss": 0.85138428, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14758301, + "step": 6233, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.06463634, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06292734, + "balance_loss_mlp": 0.01255432, + "epoch": 0.37480835713212085, + "flos": 12025956890880.0, + "grad_norm": 1.7958128584553208, + "language_loss": 0.69650698, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.77383471, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13690186, + "step": 6234, + "time_per_iteration": 2.524226188659668 + }, + { + "auxiliary_loss_clip": 0.06464471, + "auxiliary_loss_mlp": 0.01267248, + "balance_loss_clip": 0.06295948, + "balance_loss_mlp": 0.0125311, + "epoch": 0.3748684803847888, + "flos": 19684801560960.0, + "grad_norm": 2.1537876510353597, + "language_loss": 0.83551729, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91283447, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14135742, + "step": 6235, + "time_per_iteration": 2.5380606651306152 + }, + { + "auxiliary_loss_clip": 0.06462481, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06289958, + "balance_loss_mlp": 0.0125222, + "epoch": 0.3749286036374568, + "flos": 20527328764800.0, + "grad_norm": 1.8434440291752416, + "language_loss": 0.78213942, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8594358, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14941406, + "step": 6236, + "time_per_iteration": 2.507180690765381 + }, + { + "auxiliary_loss_clip": 0.06465082, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 0.06288011, + "balance_loss_mlp": 0.0125616, + "epoch": 0.37498872689012475, + "flos": 20710958987520.0, + "grad_norm": 1.9437086154972172, + "language_loss": 0.73305297, + "learning_rate": 2.876104377085234e-06, + "loss": 0.81043607, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.17077637, + "step": 6237, + "time_per_iteration": 2.5545706748962402 + }, + { + "auxiliary_loss_clip": 0.06460923, + "auxiliary_loss_mlp": 0.01271336, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01256548, + "epoch": 0.3750488501427927, + "flos": 21580418079360.0, + "grad_norm": 2.5847168840400787, + "language_loss": 0.93616223, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01348472, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14788818, + "step": 6238, + "time_per_iteration": 2.544524669647217 + }, + { + "auxiliary_loss_clip": 0.06457306, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06286643, + "balance_loss_mlp": 0.01257127, + "epoch": 0.3751089733954607, + "flos": 15929221236480.0, + "grad_norm": 2.2437121352489093, + "language_loss": 0.71661341, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79390883, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15106201, + "step": 6239, + "time_per_iteration": 2.519807815551758 + }, + { + "auxiliary_loss_clip": 0.06461261, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 0.06287319, + "balance_loss_mlp": 0.01256485, + "epoch": 0.37516909664812864, + "flos": 36293688391680.0, + "grad_norm": 1.5212724151961043, + "language_loss": 0.65758455, + "learning_rate": 2.875053908444895e-06, + "loss": 0.73491299, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15118408, + "step": 6240, + "time_per_iteration": 2.6838748455047607 + }, + { + "auxiliary_loss_clip": 0.06461462, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06288624, + "balance_loss_mlp": 0.01251258, + "epoch": 0.3752292199007966, + "flos": 13520882384640.0, + "grad_norm": 2.454894337240739, + "language_loss": 0.76209545, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.83936143, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13867188, + "step": 6241, + "time_per_iteration": 2.498286008834839 + }, + { + "auxiliary_loss_clip": 0.06461808, + "auxiliary_loss_mlp": 0.01268507, + "balance_loss_clip": 0.06289176, + "balance_loss_mlp": 0.01253206, + "epoch": 0.3752893431534646, + "flos": 27205353866880.0, + "grad_norm": 2.0832931967812853, + "language_loss": 0.84671998, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92402315, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15313721, + "step": 6242, + "time_per_iteration": 2.6289877891540527 + }, + { + "auxiliary_loss_clip": 0.06457841, + "auxiliary_loss_mlp": 0.01272178, + "balance_loss_clip": 0.06285247, + "balance_loss_mlp": 0.01257379, + "epoch": 0.3753494664061326, + "flos": 30015431919360.0, + "grad_norm": 2.6434313807577112, + "language_loss": 0.68551457, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.76281476, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14813232, + "step": 6243, + "time_per_iteration": 2.7211153507232666 + }, + { + "auxiliary_loss_clip": 0.0645824, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01254482, + "epoch": 0.37540958965880056, + "flos": 24468803372160.0, + "grad_norm": 1.7478523324296555, + "language_loss": 0.8397631, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.91704839, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15808105, + "step": 6244, + "time_per_iteration": 2.5738887786865234 + }, + { + "auxiliary_loss_clip": 0.0645659, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01252842, + "epoch": 0.3754697129114685, + "flos": 16513961754240.0, + "grad_norm": 3.8447339818169257, + "language_loss": 0.83823436, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.91546631, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13751221, + "step": 6245, + "time_per_iteration": 2.5320816040039062 + }, + { + "auxiliary_loss_clip": 0.06453504, + "auxiliary_loss_mlp": 0.0127263, + "balance_loss_clip": 0.06282875, + "balance_loss_mlp": 0.01257633, + "epoch": 0.3755298361641365, + "flos": 19396980385920.0, + "grad_norm": 2.4621620681348295, + "language_loss": 0.64685225, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.72411358, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14990234, + "step": 6246, + "time_per_iteration": 2.58577561378479 + }, + { + "auxiliary_loss_clip": 0.06466524, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01262428, + "epoch": 0.37558995941680445, + "flos": 14725638789120.0, + "grad_norm": 2.3474335464279648, + "language_loss": 0.75348055, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.83092844, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.1583252, + "step": 6247, + "time_per_iteration": 2.47930908203125 + }, + { + "auxiliary_loss_clip": 0.06456453, + "auxiliary_loss_mlp": 0.012715, + "balance_loss_clip": 0.06282347, + "balance_loss_mlp": 0.01255503, + "epoch": 0.3756500826694724, + "flos": 21696432456960.0, + "grad_norm": 3.5646784592424017, + "language_loss": 0.55380279, + "learning_rate": 2.872251199697598e-06, + "loss": 0.6310823, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.16003418, + "step": 6248, + "time_per_iteration": 2.5266313552856445 + }, + { + "auxiliary_loss_clip": 0.06453443, + "auxiliary_loss_mlp": 0.01268535, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01253109, + "epoch": 0.3757102059221404, + "flos": 26512942452480.0, + "grad_norm": 1.7302245846967215, + "language_loss": 0.84781861, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92503834, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.15429688, + "step": 6249, + "time_per_iteration": 2.5590078830718994 + }, + { + "auxiliary_loss_clip": 0.06456596, + "auxiliary_loss_mlp": 0.01267858, + "balance_loss_clip": 0.0628508, + "balance_loss_mlp": 0.01253481, + "epoch": 0.37577032917480835, + "flos": 37346526144000.0, + "grad_norm": 1.6299752789251518, + "language_loss": 0.68482721, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.76207179, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14361572, + "step": 6250, + "time_per_iteration": 2.6926450729370117 + }, + { + "auxiliary_loss_clip": 0.06454285, + "auxiliary_loss_mlp": 0.01268088, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01254099, + "epoch": 0.3758304524274763, + "flos": 21915128413440.0, + "grad_norm": 2.0147801854845895, + "language_loss": 0.78550422, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.862728, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13995361, + "step": 6251, + "time_per_iteration": 2.5072193145751953 + }, + { + "auxiliary_loss_clip": 0.06455163, + "auxiliary_loss_mlp": 0.01271265, + "balance_loss_clip": 0.06285167, + "balance_loss_mlp": 0.01257139, + "epoch": 0.3758905756801443, + "flos": 36577233008640.0, + "grad_norm": 2.2428429985343543, + "language_loss": 0.58560276, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.66286701, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14111328, + "step": 6252, + "time_per_iteration": 2.684899091720581 + }, + { + "auxiliary_loss_clip": 0.06456266, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.01255649, + "epoch": 0.37595069893281224, + "flos": 24534616354560.0, + "grad_norm": 1.5871699178816958, + "language_loss": 0.8998009, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.97707891, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15869141, + "step": 6253, + "time_per_iteration": 2.539088010787964 + }, + { + "auxiliary_loss_clip": 0.0645566, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06288448, + "balance_loss_mlp": 0.01255523, + "epoch": 0.3760108221854802, + "flos": 16440518050560.0, + "grad_norm": 2.3821241740713086, + "language_loss": 0.77027023, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.84752858, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.14648438, + "step": 6254, + "time_per_iteration": 2.545330047607422 + }, + { + "auxiliary_loss_clip": 0.06454843, + "auxiliary_loss_mlp": 0.01270718, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01254386, + "epoch": 0.37607094543814823, + "flos": 13776824280960.0, + "grad_norm": 2.2494955117694007, + "language_loss": 0.62504637, + "learning_rate": 2.869797092829169e-06, + "loss": 0.70230198, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.16333008, + "step": 6255, + "time_per_iteration": 3.937791109085083 + }, + { + "auxiliary_loss_clip": 0.06456207, + "auxiliary_loss_mlp": 0.0127009, + "balance_loss_clip": 0.06282066, + "balance_loss_mlp": 0.01253758, + "epoch": 0.3761310686908162, + "flos": 19862855487360.0, + "grad_norm": 2.2501042164391634, + "language_loss": 0.74801397, + "learning_rate": 2.869446374096135e-06, + "loss": 0.82527697, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16345215, + "step": 6256, + "time_per_iteration": 2.52768611907959 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01270671, + "balance_loss_clip": 0.06281887, + "balance_loss_mlp": 0.01254637, + "epoch": 0.37619119194348416, + "flos": 12755823880320.0, + "grad_norm": 1.8167076240371511, + "language_loss": 0.70818299, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.78545058, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16040039, + "step": 6257, + "time_per_iteration": 4.052328824996948 + }, + { + "auxiliary_loss_clip": 0.06452011, + "auxiliary_loss_mlp": 0.01268418, + "balance_loss_clip": 0.0628053, + "balance_loss_mlp": 0.01253743, + "epoch": 0.3762513151961521, + "flos": 17536387674240.0, + "grad_norm": 1.6926603581335775, + "language_loss": 0.85114312, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92834735, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14672852, + "step": 6258, + "time_per_iteration": 2.50252366065979 + }, + { + "auxiliary_loss_clip": 0.06455131, + "auxiliary_loss_mlp": 0.0127104, + "balance_loss_clip": 0.06282814, + "balance_loss_mlp": 0.01256503, + "epoch": 0.3763114384488201, + "flos": 23623215494400.0, + "grad_norm": 1.3678719492617617, + "language_loss": 0.81156051, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8888222, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14532471, + "step": 6259, + "time_per_iteration": 2.5430314540863037 + }, + { + "auxiliary_loss_clip": 0.06458686, + "auxiliary_loss_mlp": 0.01274293, + "balance_loss_clip": 0.06282908, + "balance_loss_mlp": 0.0125696, + "epoch": 0.37637156170148806, + "flos": 25413383249280.0, + "grad_norm": 1.809326583941318, + "language_loss": 0.71774137, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.79507113, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.17321777, + "step": 6260, + "time_per_iteration": 2.566267490386963 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.0127871, + "balance_loss_clip": 0.06279852, + "balance_loss_mlp": 0.01262128, + "epoch": 0.376431684954156, + "flos": 23447677190400.0, + "grad_norm": 1.8475234283885087, + "language_loss": 0.78925788, + "learning_rate": 2.867692286154594e-06, + "loss": 0.86660182, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.16589355, + "step": 6261, + "time_per_iteration": 2.5848124027252197 + }, + { + "auxiliary_loss_clip": 0.06455033, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06278862, + "balance_loss_mlp": 0.01257607, + "epoch": 0.376491808206824, + "flos": 34213099985280.0, + "grad_norm": 2.1653724604475255, + "language_loss": 0.80626601, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88354641, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15405273, + "step": 6262, + "time_per_iteration": 4.146479368209839 + }, + { + "auxiliary_loss_clip": 0.06453078, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 0.06282018, + "balance_loss_mlp": 0.01253799, + "epoch": 0.37655193145949195, + "flos": 35193793772160.0, + "grad_norm": 1.6953841761456194, + "language_loss": 0.81274903, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88996559, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14794922, + "step": 6263, + "time_per_iteration": 2.6529650688171387 + }, + { + "auxiliary_loss_clip": 0.06460523, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 0.06286405, + "balance_loss_mlp": 0.01261172, + "epoch": 0.3766120547121599, + "flos": 16767136465920.0, + "grad_norm": 1.8888627452248796, + "language_loss": 0.79794824, + "learning_rate": 2.866639438447501e-06, + "loss": 0.87531358, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14831543, + "step": 6264, + "time_per_iteration": 3.9715349674224854 + }, + { + "auxiliary_loss_clip": 0.06455237, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06284397, + "balance_loss_mlp": 0.0125396, + "epoch": 0.3766721779648279, + "flos": 23557150949760.0, + "grad_norm": 1.690336708132248, + "language_loss": 0.7363869, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.81363189, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6265, + "time_per_iteration": 2.5544657707214355 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06283864, + "balance_loss_mlp": 0.01262486, + "epoch": 0.37673230121749585, + "flos": 29136329608320.0, + "grad_norm": 1.6256668529315172, + "language_loss": 0.6925773, + "learning_rate": 2.865937375638654e-06, + "loss": 0.76985407, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1361084, + "step": 6266, + "time_per_iteration": 2.5735552310943604 + }, + { + "auxiliary_loss_clip": 0.06456051, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06279004, + "balance_loss_mlp": 0.01258825, + "epoch": 0.3767924244701638, + "flos": 28154210302080.0, + "grad_norm": 2.361518747365002, + "language_loss": 0.63358176, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.7108832, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15270996, + "step": 6267, + "time_per_iteration": 2.6408746242523193 + }, + { + "auxiliary_loss_clip": 0.0637848, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.0630175, + "balance_loss_mlp": 0.01263043, + "epoch": 0.37685254772283183, + "flos": 60815460343680.0, + "grad_norm": 0.7019670976586264, + "language_loss": 0.58932841, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.66576976, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02612305, + "step": 6268, + "time_per_iteration": 3.3041250705718994 + }, + { + "auxiliary_loss_clip": 0.06448595, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277184, + "balance_loss_mlp": 0.01256756, + "epoch": 0.3769126709754998, + "flos": 26039939754240.0, + "grad_norm": 1.4401012750228117, + "language_loss": 0.65166855, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72888005, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15795898, + "step": 6269, + "time_per_iteration": 2.654707670211792 + }, + { + "auxiliary_loss_clip": 0.06454687, + "auxiliary_loss_mlp": 0.01276662, + "balance_loss_clip": 0.06286559, + "balance_loss_mlp": 0.01261296, + "epoch": 0.37697279422816776, + "flos": 23585508357120.0, + "grad_norm": 1.4576669810179597, + "language_loss": 0.71144199, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.78875554, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15362549, + "step": 6270, + "time_per_iteration": 2.5369231700897217 + }, + { + "auxiliary_loss_clip": 0.06374384, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01266305, + "epoch": 0.3770329174808357, + "flos": 64766242753920.0, + "grad_norm": 0.6950430831807741, + "language_loss": 0.56232381, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.63876635, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03561401, + "step": 6271, + "time_per_iteration": 3.1599924564361572 + }, + { + "auxiliary_loss_clip": 0.06448443, + "auxiliary_loss_mlp": 0.01272708, + "balance_loss_clip": 0.06279441, + "balance_loss_mlp": 0.0125696, + "epoch": 0.3770930407335037, + "flos": 21841768563840.0, + "grad_norm": 1.6801171250404496, + "language_loss": 0.80461442, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.88182592, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1574707, + "step": 6272, + "time_per_iteration": 2.524846076965332 + }, + { + "auxiliary_loss_clip": 0.06450769, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06283743, + "balance_loss_mlp": 0.01258329, + "epoch": 0.37715316398617166, + "flos": 22754594943360.0, + "grad_norm": 1.6672783573066894, + "language_loss": 0.74972034, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82696146, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15026855, + "step": 6273, + "time_per_iteration": 2.5571129322052 + }, + { + "auxiliary_loss_clip": 0.06449255, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257148, + "epoch": 0.3772132872388396, + "flos": 18920246181120.0, + "grad_norm": 1.32773283576084, + "language_loss": 0.72241038, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79962015, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14569092, + "step": 6274, + "time_per_iteration": 2.4966516494750977 + }, + { + "auxiliary_loss_clip": 0.06454083, + "auxiliary_loss_mlp": 0.01271444, + "balance_loss_clip": 0.06282286, + "balance_loss_mlp": 0.01257467, + "epoch": 0.3772734104915076, + "flos": 17351709275520.0, + "grad_norm": 1.8983068498635614, + "language_loss": 0.84638643, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.92364168, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13983154, + "step": 6275, + "time_per_iteration": 2.534308910369873 + }, + { + "auxiliary_loss_clip": 0.06448515, + "auxiliary_loss_mlp": 0.01272502, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01258865, + "epoch": 0.37733353374417555, + "flos": 32350452848640.0, + "grad_norm": 1.3669254528099, + "language_loss": 0.75387293, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.83108306, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13641357, + "step": 6276, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.06453335, + "auxiliary_loss_mlp": 0.0127286, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.0125803, + "epoch": 0.3773936569968435, + "flos": 23366225422080.0, + "grad_norm": 1.9054341571687776, + "language_loss": 0.86016738, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93742937, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1484375, + "step": 6277, + "time_per_iteration": 2.6153500080108643 + }, + { + "auxiliary_loss_clip": 0.06448077, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.0628462, + "balance_loss_mlp": 0.01257488, + "epoch": 0.3774537802495115, + "flos": 21472579474560.0, + "grad_norm": 1.5956300393708251, + "language_loss": 0.78636366, + "learning_rate": 2.861722244253818e-06, + "loss": 0.86356354, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14428711, + "step": 6278, + "time_per_iteration": 2.564234495162964 + }, + { + "auxiliary_loss_clip": 0.06459187, + "auxiliary_loss_mlp": 0.01270608, + "balance_loss_clip": 0.06284142, + "balance_loss_mlp": 0.01255075, + "epoch": 0.37751390350217945, + "flos": 24980812945920.0, + "grad_norm": 1.8067410295121689, + "language_loss": 0.8371948, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.91449273, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.15527344, + "step": 6279, + "time_per_iteration": 2.6134567260742188 + }, + { + "auxiliary_loss_clip": 0.06454675, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.06282948, + "balance_loss_mlp": 0.01257117, + "epoch": 0.3775740267548474, + "flos": 27826585637760.0, + "grad_norm": 1.84994794715845, + "language_loss": 0.74995327, + "learning_rate": 2.861019264262269e-06, + "loss": 0.82721412, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1428833, + "step": 6280, + "time_per_iteration": 2.6029937267303467 + }, + { + "auxiliary_loss_clip": 0.06448464, + "auxiliary_loss_mlp": 0.01272763, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01259156, + "epoch": 0.3776341500075154, + "flos": 22571845188480.0, + "grad_norm": 1.3018494364650444, + "language_loss": 0.76205039, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.83926266, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13592529, + "step": 6281, + "time_per_iteration": 2.524489641189575 + }, + { + "auxiliary_loss_clip": 0.06448536, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.0125718, + "epoch": 0.3776942732601834, + "flos": 23084148251520.0, + "grad_norm": 1.5306913056637732, + "language_loss": 0.84658033, + "learning_rate": 2.860316153670974e-06, + "loss": 0.92377913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14160156, + "step": 6282, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.06449918, + "auxiliary_loss_mlp": 0.01269426, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.0125555, + "epoch": 0.37775439651285136, + "flos": 21730617722880.0, + "grad_norm": 1.840636786741823, + "language_loss": 0.70143461, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.77862805, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13879395, + "step": 6283, + "time_per_iteration": 2.555816411972046 + }, + { + "auxiliary_loss_clip": 0.06452499, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 0.06285429, + "balance_loss_mlp": 0.01259957, + "epoch": 0.37781451976551933, + "flos": 23994542862720.0, + "grad_norm": 1.743481736886233, + "language_loss": 0.76856482, + "learning_rate": 2.859612912586581e-06, + "loss": 0.8458361, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14648438, + "step": 6284, + "time_per_iteration": 2.560770034790039 + }, + { + "auxiliary_loss_clip": 0.06464045, + "auxiliary_loss_mlp": 0.01271283, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01254725, + "epoch": 0.3778746430181873, + "flos": 13731821838720.0, + "grad_norm": 2.746966655353194, + "language_loss": 0.85536617, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.93271947, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.16564941, + "step": 6285, + "time_per_iteration": 2.5006392002105713 + }, + { + "auxiliary_loss_clip": 0.06451872, + "auxiliary_loss_mlp": 0.01271139, + "balance_loss_clip": 0.06279811, + "balance_loss_mlp": 0.01256065, + "epoch": 0.37793476627085526, + "flos": 19466021750400.0, + "grad_norm": 1.7632018529100697, + "language_loss": 0.84913701, + "learning_rate": 2.858909541115758e-06, + "loss": 0.9263671, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1506958, + "step": 6286, + "time_per_iteration": 2.566092014312744 + }, + { + "auxiliary_loss_clip": 0.06452557, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06281806, + "balance_loss_mlp": 0.01254182, + "epoch": 0.3779948895235232, + "flos": 10711600945920.0, + "grad_norm": 1.9010574176879877, + "language_loss": 0.823708, + "learning_rate": 2.858557806518775e-06, + "loss": 0.90092808, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15258789, + "step": 6287, + "time_per_iteration": 2.4892444610595703 + }, + { + "auxiliary_loss_clip": 0.06454234, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01258408, + "epoch": 0.3780550127761912, + "flos": 22316616051840.0, + "grad_norm": 2.1030531862013584, + "language_loss": 0.7330361, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15679932, + "step": 6288, + "time_per_iteration": 2.5415592193603516 + }, + { + "auxiliary_loss_clip": 0.06453485, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01254359, + "epoch": 0.37811513602885916, + "flos": 28958401463040.0, + "grad_norm": 1.6277535048544236, + "language_loss": 0.75782627, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83505249, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14770508, + "step": 6289, + "time_per_iteration": 2.5579047203063965 + }, + { + "auxiliary_loss_clip": 0.06454412, + "auxiliary_loss_mlp": 0.01273518, + "balance_loss_clip": 0.06284275, + "balance_loss_mlp": 0.01257925, + "epoch": 0.3781752592815271, + "flos": 23119717109760.0, + "grad_norm": 1.945372772068441, + "language_loss": 0.74155736, + "learning_rate": 2.857502407441593e-06, + "loss": 0.81883669, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15588379, + "step": 6290, + "time_per_iteration": 2.5697786808013916 + }, + { + "auxiliary_loss_clip": 0.06458094, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06281058, + "balance_loss_mlp": 0.0125653, + "epoch": 0.3782353825341951, + "flos": 19762102552320.0, + "grad_norm": 2.4066647483264596, + "language_loss": 0.80529308, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.88260764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16833496, + "step": 6291, + "time_per_iteration": 2.4970998764038086 + }, + { + "auxiliary_loss_clip": 0.06456125, + "auxiliary_loss_mlp": 0.01270776, + "balance_loss_clip": 0.06283687, + "balance_loss_mlp": 0.01254933, + "epoch": 0.37829550578686305, + "flos": 22056774940800.0, + "grad_norm": 1.7419894192909393, + "language_loss": 0.76369846, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.84096742, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1583252, + "step": 6292, + "time_per_iteration": 2.572916030883789 + }, + { + "auxiliary_loss_clip": 0.06452248, + "auxiliary_loss_mlp": 0.0127064, + "balance_loss_clip": 0.06281239, + "balance_loss_mlp": 0.01255631, + "epoch": 0.378355629039531, + "flos": 16475667638400.0, + "grad_norm": 1.682972265329385, + "language_loss": 0.70006013, + "learning_rate": 2.856446715715224e-06, + "loss": 0.77728903, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.15014648, + "step": 6293, + "time_per_iteration": 2.5161240100860596 + }, + { + "auxiliary_loss_clip": 0.06449296, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06281447, + "balance_loss_mlp": 0.01255934, + "epoch": 0.378415752292199, + "flos": 19981050071040.0, + "grad_norm": 1.9898859900525039, + "language_loss": 0.7173214, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.79452682, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.15332031, + "step": 6294, + "time_per_iteration": 3.9304022789001465 + }, + { + "auxiliary_loss_clip": 0.06465693, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 0.06285857, + "balance_loss_mlp": 0.01264068, + "epoch": 0.378475875544867, + "flos": 14652614355840.0, + "grad_norm": 2.57033704665896, + "language_loss": 0.83215445, + "learning_rate": 2.855742758826011e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.15655518, + "step": 6295, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.06459963, + "auxiliary_loss_mlp": 0.01268811, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01253255, + "epoch": 0.37853599879753497, + "flos": 26658194705280.0, + "grad_norm": 1.6154959379599871, + "language_loss": 0.71442378, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.79171151, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15563965, + "step": 6296, + "time_per_iteration": 4.0578773021698 + }, + { + "auxiliary_loss_clip": 0.06454356, + "auxiliary_loss_mlp": 0.01274534, + "balance_loss_clip": 0.06287888, + "balance_loss_mlp": 0.01260455, + "epoch": 0.37859612205020293, + "flos": 17317817498880.0, + "grad_norm": 1.7695984237012152, + "language_loss": 0.77514613, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85243499, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14074707, + "step": 6297, + "time_per_iteration": 2.54968523979187 + }, + { + "auxiliary_loss_clip": 0.06462398, + "auxiliary_loss_mlp": 0.01275228, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01259481, + "epoch": 0.3786562453028709, + "flos": 18225780341760.0, + "grad_norm": 1.977165612519376, + "language_loss": 0.80132794, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87870419, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1574707, + "step": 6298, + "time_per_iteration": 2.5013349056243896 + }, + { + "auxiliary_loss_clip": 0.06454945, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06285203, + "balance_loss_mlp": 0.01255711, + "epoch": 0.37871636855553886, + "flos": 21221207625600.0, + "grad_norm": 1.480969598733767, + "language_loss": 0.8501091, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92736673, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15087891, + "step": 6299, + "time_per_iteration": 2.5749709606170654 + }, + { + "auxiliary_loss_clip": 0.06460874, + "auxiliary_loss_mlp": 0.01272586, + "balance_loss_clip": 0.06288288, + "balance_loss_mlp": 0.01256844, + "epoch": 0.3787764918082068, + "flos": 20957886570240.0, + "grad_norm": 2.4357425027716895, + "language_loss": 0.77022231, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.84755683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15740967, + "step": 6300, + "time_per_iteration": 2.521772623062134 + }, + { + "auxiliary_loss_clip": 0.06472084, + "auxiliary_loss_mlp": 0.01275415, + "balance_loss_clip": 0.06293886, + "balance_loss_mlp": 0.01258177, + "epoch": 0.3788366150608748, + "flos": 17313205524480.0, + "grad_norm": 1.8143586204861406, + "language_loss": 0.83141446, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.90888953, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 3.982780933380127 + }, + { + "auxiliary_loss_clip": 0.0646001, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06287184, + "balance_loss_mlp": 0.0125428, + "epoch": 0.37889673831354276, + "flos": 24317094355200.0, + "grad_norm": 1.8203378599779103, + "language_loss": 0.68096328, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.75826812, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.16186523, + "step": 6302, + "time_per_iteration": 2.5983002185821533 + }, + { + "auxiliary_loss_clip": 0.06455475, + "auxiliary_loss_mlp": 0.01270441, + "balance_loss_clip": 0.06284864, + "balance_loss_mlp": 0.01255718, + "epoch": 0.3789568615662107, + "flos": 26690157838080.0, + "grad_norm": 2.521279180058548, + "language_loss": 0.68357861, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76083779, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1472168, + "step": 6303, + "time_per_iteration": 2.5610175132751465 + }, + { + "auxiliary_loss_clip": 0.06458124, + "auxiliary_loss_mlp": 0.01272095, + "balance_loss_clip": 0.06285581, + "balance_loss_mlp": 0.01257265, + "epoch": 0.3790169848188787, + "flos": 23591713559040.0, + "grad_norm": 1.604251878296904, + "language_loss": 0.78095663, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.85825884, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14807129, + "step": 6304, + "time_per_iteration": 3.994072437286377 + }, + { + "auxiliary_loss_clip": 0.06468576, + "auxiliary_loss_mlp": 0.01269708, + "balance_loss_clip": 0.06292479, + "balance_loss_mlp": 0.01253806, + "epoch": 0.37907710807154665, + "flos": 18442547654400.0, + "grad_norm": 1.8924180649319282, + "language_loss": 0.80524492, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.88262779, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15881348, + "step": 6305, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.06370047, + "auxiliary_loss_mlp": 0.01262008, + "balance_loss_clip": 0.06291789, + "balance_loss_mlp": 0.01258527, + "epoch": 0.3791372313242146, + "flos": 50123690887680.0, + "grad_norm": 0.9538902579511545, + "language_loss": 0.64400995, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.72033048, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03491211, + "step": 6306, + "time_per_iteration": 3.106515645980835 + }, + { + "auxiliary_loss_clip": 0.06464424, + "auxiliary_loss_mlp": 0.01273174, + "balance_loss_clip": 0.06292081, + "balance_loss_mlp": 0.01257683, + "epoch": 0.3791973545768826, + "flos": 24323467265280.0, + "grad_norm": 1.5167178412192643, + "language_loss": 0.73534656, + "learning_rate": 2.851516295441817e-06, + "loss": 0.8127225, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15484619, + "step": 6307, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.06462627, + "auxiliary_loss_mlp": 0.01270499, + "balance_loss_clip": 0.06287986, + "balance_loss_mlp": 0.0125505, + "epoch": 0.3792574778295506, + "flos": 21586329792000.0, + "grad_norm": 1.8539993286062635, + "language_loss": 0.78603798, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86336923, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15441895, + "step": 6308, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.06459265, + "auxiliary_loss_mlp": 0.01272841, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01257028, + "epoch": 0.37931760108221857, + "flos": 22279202403840.0, + "grad_norm": 4.0253147283534, + "language_loss": 0.73503512, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.81235617, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.15814209, + "step": 6309, + "time_per_iteration": 2.539158344268799 + }, + { + "auxiliary_loss_clip": 0.06457806, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.0125963, + "epoch": 0.37937772433488653, + "flos": 19689161973120.0, + "grad_norm": 1.3654110952225158, + "language_loss": 0.79184294, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.86916614, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14886475, + "step": 6310, + "time_per_iteration": 2.4997847080230713 + }, + { + "auxiliary_loss_clip": 0.06457442, + "auxiliary_loss_mlp": 0.01268809, + "balance_loss_clip": 0.06285986, + "balance_loss_mlp": 0.01253586, + "epoch": 0.3794378475875545, + "flos": 19105469631360.0, + "grad_norm": 1.8573579951480166, + "language_loss": 0.76741791, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.84468043, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15222168, + "step": 6311, + "time_per_iteration": 2.5216546058654785 + }, + { + "auxiliary_loss_clip": 0.06457929, + "auxiliary_loss_mlp": 0.01276784, + "balance_loss_clip": 0.06287444, + "balance_loss_mlp": 0.01261746, + "epoch": 0.37949797084022246, + "flos": 20345920675200.0, + "grad_norm": 1.4012846072012495, + "language_loss": 0.71063423, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78798139, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15032959, + "step": 6312, + "time_per_iteration": 2.4909064769744873 + }, + { + "auxiliary_loss_clip": 0.06361144, + "auxiliary_loss_mlp": 0.01254908, + "balance_loss_clip": 0.06283364, + "balance_loss_mlp": 0.01251185, + "epoch": 0.37955809409289043, + "flos": 63991121760000.0, + "grad_norm": 0.7457914665340521, + "language_loss": 0.55941355, + "learning_rate": 2.849401318669608e-06, + "loss": 0.63557404, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03713989, + "step": 6313, + "time_per_iteration": 3.1312170028686523 + }, + { + "auxiliary_loss_clip": 0.06457204, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06285529, + "balance_loss_mlp": 0.01258211, + "epoch": 0.3796182173455584, + "flos": 31548777310080.0, + "grad_norm": 1.7202421351204062, + "language_loss": 0.71222353, + "learning_rate": 2.849048709730083e-06, + "loss": 0.78952008, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14233398, + "step": 6314, + "time_per_iteration": 2.5876691341400146 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.01270992, + "balance_loss_clip": 0.06290812, + "balance_loss_mlp": 0.01254922, + "epoch": 0.37967834059822636, + "flos": 12135766066560.0, + "grad_norm": 2.8019471516683985, + "language_loss": 0.74203241, + "learning_rate": 2.848696068594545e-06, + "loss": 0.81939626, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.16064453, + "step": 6315, + "time_per_iteration": 2.5312654972076416 + }, + { + "auxiliary_loss_clip": 0.06455735, + "auxiliary_loss_mlp": 0.01269414, + "balance_loss_clip": 0.0628659, + "balance_loss_mlp": 0.01253512, + "epoch": 0.3797384638508943, + "flos": 39357989331840.0, + "grad_norm": 5.544256779510487, + "language_loss": 0.7095021, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.78675354, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15905762, + "step": 6316, + "time_per_iteration": 2.642946481704712 + }, + { + "auxiliary_loss_clip": 0.06458603, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06288237, + "balance_loss_mlp": 0.01255991, + "epoch": 0.3797985871035623, + "flos": 34061852165760.0, + "grad_norm": 2.4477129072331656, + "language_loss": 0.65612113, + "learning_rate": 2.847990689788923e-06, + "loss": 0.7334165, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1496582, + "step": 6317, + "time_per_iteration": 2.634066104888916 + }, + { + "auxiliary_loss_clip": 0.0645286, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06285463, + "balance_loss_mlp": 0.0125702, + "epoch": 0.37985871035623026, + "flos": 23228939306880.0, + "grad_norm": 1.9893651635894969, + "language_loss": 0.86348939, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.94072783, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13964844, + "step": 6318, + "time_per_iteration": 2.50665545463562 + }, + { + "auxiliary_loss_clip": 0.06460046, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.06287004, + "balance_loss_mlp": 0.01257675, + "epoch": 0.3799188336088982, + "flos": 18121002410880.0, + "grad_norm": 2.356531700065532, + "language_loss": 0.76647675, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.84380764, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6319, + "time_per_iteration": 2.50382137298584 + }, + { + "auxiliary_loss_clip": 0.06453398, + "auxiliary_loss_mlp": 0.01272745, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01258082, + "epoch": 0.3799789568615662, + "flos": 21878385598080.0, + "grad_norm": 6.804259628026359, + "language_loss": 0.6451484, + "learning_rate": 2.846932380444744e-06, + "loss": 0.72240984, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14660645, + "step": 6320, + "time_per_iteration": 2.516150712966919 + }, + { + "auxiliary_loss_clip": 0.06456275, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06285265, + "balance_loss_mlp": 0.01252846, + "epoch": 0.3800390801142342, + "flos": 32971181495040.0, + "grad_norm": 1.7343317020382172, + "language_loss": 0.71855223, + "learning_rate": 2.846579546413992e-06, + "loss": 0.79579961, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.15612793, + "step": 6321, + "time_per_iteration": 2.6204988956451416 + }, + { + "auxiliary_loss_clip": 0.06458073, + "auxiliary_loss_mlp": 0.01268703, + "balance_loss_clip": 0.06285845, + "balance_loss_mlp": 0.01253784, + "epoch": 0.38009920336690217, + "flos": 26914430090880.0, + "grad_norm": 1.8398392312515923, + "language_loss": 0.75578612, + "learning_rate": 2.846226680280859e-06, + "loss": 0.83305389, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14923096, + "step": 6322, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01271033, + "balance_loss_clip": 0.06285781, + "balance_loss_mlp": 0.01256823, + "epoch": 0.38015932661957014, + "flos": 22494963467520.0, + "grad_norm": 1.8201003599281902, + "language_loss": 0.85709381, + "learning_rate": 2.845873782058725e-06, + "loss": 0.93435031, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14215088, + "step": 6323, + "time_per_iteration": 2.4927124977111816 + }, + { + "auxiliary_loss_clip": 0.06458908, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06286593, + "balance_loss_mlp": 0.01254596, + "epoch": 0.3802194498722381, + "flos": 21987440087040.0, + "grad_norm": 2.2452863694907426, + "language_loss": 0.73932886, + "learning_rate": 2.845520851760973e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.16027832, + "step": 6324, + "time_per_iteration": 2.4913861751556396 + }, + { + "auxiliary_loss_clip": 0.06464465, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06288414, + "balance_loss_mlp": 0.01257724, + "epoch": 0.38027957312490607, + "flos": 21331310290560.0, + "grad_norm": 1.7884051563809298, + "language_loss": 0.84122628, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.91860014, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15203857, + "step": 6325, + "time_per_iteration": 2.6119046211242676 + }, + { + "auxiliary_loss_clip": 0.06455745, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06285073, + "balance_loss_mlp": 0.01252712, + "epoch": 0.38033969637757403, + "flos": 16696921144320.0, + "grad_norm": 2.2200302984742915, + "language_loss": 0.79868543, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.87591028, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14019775, + "step": 6326, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06286497, + "balance_loss_mlp": 0.01255242, + "epoch": 0.380399819630242, + "flos": 36219741563520.0, + "grad_norm": 3.3742704435112025, + "language_loss": 0.73389304, + "learning_rate": 2.844461868547842e-06, + "loss": 0.81115204, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14978027, + "step": 6327, + "time_per_iteration": 2.649383783340454 + }, + { + "auxiliary_loss_clip": 0.06459647, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06290785, + "balance_loss_mlp": 0.01255145, + "epoch": 0.38045994288290996, + "flos": 21295364088960.0, + "grad_norm": 1.4936601975654378, + "language_loss": 0.83229524, + "learning_rate": 2.844108810081459e-06, + "loss": 0.90958202, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13867188, + "step": 6328, + "time_per_iteration": 2.527261972427368 + }, + { + "auxiliary_loss_clip": 0.06452741, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06281206, + "balance_loss_mlp": 0.01253755, + "epoch": 0.38052006613557793, + "flos": 20929151819520.0, + "grad_norm": 1.5056942690240434, + "language_loss": 0.61757982, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69479483, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.15008545, + "step": 6329, + "time_per_iteration": 2.54025936126709 + }, + { + "auxiliary_loss_clip": 0.0645529, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06283917, + "balance_loss_mlp": 0.01254037, + "epoch": 0.3805801893882459, + "flos": 20996138759040.0, + "grad_norm": 2.0488191193117316, + "language_loss": 0.56127822, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.63851297, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14160156, + "step": 6330, + "time_per_iteration": 2.4913628101348877 + }, + { + "auxiliary_loss_clip": 0.06449446, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06282543, + "balance_loss_mlp": 0.01255781, + "epoch": 0.38064031264091386, + "flos": 25565972734080.0, + "grad_norm": 1.4483276491856993, + "language_loss": 0.65912807, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73631942, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13903809, + "step": 6331, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.0645493, + "auxiliary_loss_mlp": 0.01269934, + "balance_loss_clip": 0.06284193, + "balance_loss_mlp": 0.01254312, + "epoch": 0.3807004358935818, + "flos": 15091264080000.0, + "grad_norm": 1.528944840420101, + "language_loss": 0.7597304, + "learning_rate": 2.842696256262919e-06, + "loss": 0.83697909, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15618896, + "step": 6332, + "time_per_iteration": 2.4808928966522217 + }, + { + "auxiliary_loss_clip": 0.06456427, + "auxiliary_loss_mlp": 0.01273089, + "balance_loss_clip": 0.06283183, + "balance_loss_mlp": 0.01257943, + "epoch": 0.3807605591462498, + "flos": 16405033046400.0, + "grad_norm": 2.2042220893600226, + "language_loss": 0.82397389, + "learning_rate": 2.842343037886987e-06, + "loss": 0.90126908, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15142822, + "step": 6333, + "time_per_iteration": 2.5033013820648193 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254655, + "epoch": 0.3808206823989178, + "flos": 29064353351040.0, + "grad_norm": 1.4831969327294916, + "language_loss": 0.86723578, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.9444741, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.14538574, + "step": 6334, + "time_per_iteration": 4.024240493774414 + }, + { + "auxiliary_loss_clip": 0.06455058, + "auxiliary_loss_mlp": 0.01270467, + "balance_loss_clip": 0.06282362, + "balance_loss_mlp": 0.01255155, + "epoch": 0.3808808056515858, + "flos": 15711321893760.0, + "grad_norm": 2.3448311359770795, + "language_loss": 0.79450226, + "learning_rate": 2.841636505323321e-06, + "loss": 0.87175757, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15301514, + "step": 6335, + "time_per_iteration": 2.4698357582092285 + }, + { + "auxiliary_loss_clip": 0.06453745, + "auxiliary_loss_mlp": 0.0127096, + "balance_loss_clip": 0.06281872, + "balance_loss_mlp": 0.0125517, + "epoch": 0.38094092890425374, + "flos": 20710917060480.0, + "grad_norm": 1.9128487431319638, + "language_loss": 0.72795898, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15802002, + "step": 6336, + "time_per_iteration": 3.9780919551849365 + }, + { + "auxiliary_loss_clip": 0.06449959, + "auxiliary_loss_mlp": 0.01267203, + "balance_loss_clip": 0.06281384, + "balance_loss_mlp": 0.01252826, + "epoch": 0.3810010521569217, + "flos": 20674258099200.0, + "grad_norm": 2.2277206975915362, + "language_loss": 0.69756234, + "learning_rate": 2.840929845099894e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14373779, + "step": 6337, + "time_per_iteration": 2.5475378036499023 + }, + { + "auxiliary_loss_clip": 0.06454941, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06282912, + "balance_loss_mlp": 0.012579, + "epoch": 0.38106117540958967, + "flos": 31834963330560.0, + "grad_norm": 1.987280020069696, + "language_loss": 0.64026022, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71754032, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1517334, + "step": 6338, + "time_per_iteration": 2.5795555114746094 + }, + { + "auxiliary_loss_clip": 0.06456137, + "auxiliary_loss_mlp": 0.01271603, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01255772, + "epoch": 0.38112129866225763, + "flos": 16907231692800.0, + "grad_norm": 1.6550535893348008, + "language_loss": 0.69685936, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.77413678, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15856934, + "step": 6339, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06281533, + "balance_loss_mlp": 0.01253913, + "epoch": 0.3811814219149256, + "flos": 20893624888320.0, + "grad_norm": 2.252585455539085, + "language_loss": 0.68345773, + "learning_rate": 2.839869615637177e-06, + "loss": 0.76065207, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13519287, + "step": 6340, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.06456652, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06282599, + "balance_loss_mlp": 0.01260083, + "epoch": 0.38124154516759357, + "flos": 16696418019840.0, + "grad_norm": 2.4997436549257754, + "language_loss": 0.89721388, + "learning_rate": 2.839516142102522e-06, + "loss": 0.97453463, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15332031, + "step": 6341, + "time_per_iteration": 4.08266806602478 + }, + { + "auxiliary_loss_clip": 0.06461132, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06284279, + "balance_loss_mlp": 0.01255427, + "epoch": 0.38130166842026153, + "flos": 19687946088960.0, + "grad_norm": 1.4891162994718032, + "language_loss": 0.75298452, + "learning_rate": 2.83916263673333e-06, + "loss": 0.83032143, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.17138672, + "step": 6342, + "time_per_iteration": 2.496697425842285 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.0125646, + "epoch": 0.3813617916729295, + "flos": 22204668597120.0, + "grad_norm": 1.7145643847071266, + "language_loss": 0.83785719, + "learning_rate": 2.838809099543007e-06, + "loss": 0.91510159, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14599609, + "step": 6343, + "time_per_iteration": 4.049302339553833 + }, + { + "auxiliary_loss_clip": 0.0645491, + "auxiliary_loss_mlp": 0.01269585, + "balance_loss_clip": 0.06281073, + "balance_loss_mlp": 0.01254905, + "epoch": 0.38142191492559746, + "flos": 19102576665600.0, + "grad_norm": 1.619462393744454, + "language_loss": 0.77529186, + "learning_rate": 2.838455530544959e-06, + "loss": 0.8525368, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14678955, + "step": 6344, + "time_per_iteration": 2.579394817352295 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01271203, + "balance_loss_clip": 0.06285504, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3814820381782654, + "flos": 24104645527680.0, + "grad_norm": 1.8871239884396722, + "language_loss": 0.74166036, + "learning_rate": 2.838101929752593e-06, + "loss": 0.81893921, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14587402, + "step": 6345, + "time_per_iteration": 2.5367093086242676 + }, + { + "auxiliary_loss_clip": 0.06457509, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01257765, + "epoch": 0.3815421614309334, + "flos": 15783927056640.0, + "grad_norm": 1.7118462514914357, + "language_loss": 0.69868183, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7759757, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14111328, + "step": 6346, + "time_per_iteration": 2.5815930366516113 + }, + { + "auxiliary_loss_clip": 0.06466204, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06290555, + "balance_loss_mlp": 0.01257236, + "epoch": 0.38160228468360136, + "flos": 19905593869440.0, + "grad_norm": 1.781545419456976, + "language_loss": 0.7611326, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.83852088, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15374756, + "step": 6347, + "time_per_iteration": 2.5027284622192383 + }, + { + "auxiliary_loss_clip": 0.06456521, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.012553, + "epoch": 0.3816624079362694, + "flos": 19287045429120.0, + "grad_norm": 1.488288802844173, + "language_loss": 0.75192666, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13702393, + "step": 6348, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01270391, + "balance_loss_clip": 0.06286097, + "balance_loss_mlp": 0.01256599, + "epoch": 0.38172253118893734, + "flos": 21183752050560.0, + "grad_norm": 1.729316797973715, + "language_loss": 0.88237411, + "learning_rate": 2.836687208908142e-06, + "loss": 0.95967764, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6349, + "time_per_iteration": 2.525542974472046 + }, + { + "auxiliary_loss_clip": 0.06453095, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06281723, + "balance_loss_mlp": 0.01255149, + "epoch": 0.3817826544416053, + "flos": 17534836373760.0, + "grad_norm": 1.7576595366031973, + "language_loss": 0.76939785, + "learning_rate": 2.836333449345341e-06, + "loss": 0.84662628, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14593506, + "step": 6350, + "time_per_iteration": 2.532376289367676 + }, + { + "auxiliary_loss_clip": 0.06458531, + "auxiliary_loss_mlp": 0.01273484, + "balance_loss_clip": 0.06286063, + "balance_loss_mlp": 0.01258231, + "epoch": 0.38184277769427327, + "flos": 16332176321280.0, + "grad_norm": 2.21296257119241, + "language_loss": 0.77054518, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.84786528, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15264893, + "step": 6351, + "time_per_iteration": 2.4930031299591064 + }, + { + "auxiliary_loss_clip": 0.06457832, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.0628476, + "balance_loss_mlp": 0.012577, + "epoch": 0.38190290094694124, + "flos": 30450937115520.0, + "grad_norm": 2.2550067272061254, + "language_loss": 0.74895489, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.82626581, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15563965, + "step": 6352, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.01270341, + "balance_loss_clip": 0.06283389, + "balance_loss_mlp": 0.0125659, + "epoch": 0.3819630241996092, + "flos": 14215138588800.0, + "grad_norm": 2.0554991668998777, + "language_loss": 0.63961715, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.71684647, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6353, + "time_per_iteration": 2.476759433746338 + }, + { + "auxiliary_loss_clip": 0.06456264, + "auxiliary_loss_mlp": 0.01279815, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01266112, + "epoch": 0.38202314745227717, + "flos": 25016717220480.0, + "grad_norm": 1.720129608989886, + "language_loss": 0.83556378, + "learning_rate": 2.834918094089816e-06, + "loss": 0.91292459, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13702393, + "step": 6354, + "time_per_iteration": 2.5726418495178223 + }, + { + "auxiliary_loss_clip": 0.06456912, + "auxiliary_loss_mlp": 0.01271961, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125911, + "epoch": 0.38208327070494513, + "flos": 20820935871360.0, + "grad_norm": 1.6482101436629937, + "language_loss": 0.81480742, + "learning_rate": 2.834564176091943e-06, + "loss": 0.89209616, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.12854004, + "step": 6355, + "time_per_iteration": 2.5225114822387695 + }, + { + "auxiliary_loss_clip": 0.06459523, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06289364, + "balance_loss_mlp": 0.01259179, + "epoch": 0.3821433939576131, + "flos": 22644282643200.0, + "grad_norm": 1.8808367718392982, + "language_loss": 0.75647783, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.83380532, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 6356, + "time_per_iteration": 2.5584537982940674 + }, + { + "auxiliary_loss_clip": 0.0646046, + "auxiliary_loss_mlp": 0.01272045, + "balance_loss_clip": 0.06287301, + "balance_loss_mlp": 0.01257645, + "epoch": 0.38220351721028106, + "flos": 26877100296960.0, + "grad_norm": 1.8976132208861074, + "language_loss": 0.82161039, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89893544, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14398193, + "step": 6357, + "time_per_iteration": 2.546190023422241 + }, + { + "auxiliary_loss_clip": 0.06463508, + "auxiliary_loss_mlp": 0.01275628, + "balance_loss_clip": 0.0629019, + "balance_loss_mlp": 0.01260035, + "epoch": 0.38226364046294903, + "flos": 23374149632640.0, + "grad_norm": 1.7334885634957151, + "language_loss": 0.78531659, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.86270791, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15612793, + "step": 6358, + "time_per_iteration": 2.5330071449279785 + }, + { + "auxiliary_loss_clip": 0.06462916, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01256086, + "epoch": 0.382323763715617, + "flos": 19652335303680.0, + "grad_norm": 1.9007754709735623, + "language_loss": 0.79191673, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.86925954, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15270996, + "step": 6359, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.06457044, + "auxiliary_loss_mlp": 0.01275796, + "balance_loss_clip": 0.06287733, + "balance_loss_mlp": 0.01261884, + "epoch": 0.38238388696828496, + "flos": 54136527575040.0, + "grad_norm": 1.6591220194179586, + "language_loss": 0.70001733, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.77734572, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13903809, + "step": 6360, + "time_per_iteration": 2.8067054748535156 + }, + { + "auxiliary_loss_clip": 0.06461466, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01262923, + "epoch": 0.382444010220953, + "flos": 24943105808640.0, + "grad_norm": 1.5737902616354833, + "language_loss": 0.79093289, + "learning_rate": 2.83244000399261e-06, + "loss": 0.86832535, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14849854, + "step": 6361, + "time_per_iteration": 2.558579683303833 + }, + { + "auxiliary_loss_clip": 0.0645285, + "auxiliary_loss_mlp": 0.01272146, + "balance_loss_clip": 0.06286099, + "balance_loss_mlp": 0.01257996, + "epoch": 0.38250413347362094, + "flos": 42346750216320.0, + "grad_norm": 1.4645255919949542, + "language_loss": 0.65580732, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73305726, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14154053, + "step": 6362, + "time_per_iteration": 2.709390878677368 + }, + { + "auxiliary_loss_clip": 0.06459438, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06287294, + "balance_loss_mlp": 0.01255415, + "epoch": 0.3825642567262889, + "flos": 16294720746240.0, + "grad_norm": 1.6166481183320216, + "language_loss": 0.8211807, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.89848268, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15332031, + "step": 6363, + "time_per_iteration": 2.468846559524536 + }, + { + "auxiliary_loss_clip": 0.06453779, + "auxiliary_loss_mlp": 0.01274743, + "balance_loss_clip": 0.06286556, + "balance_loss_mlp": 0.01259401, + "epoch": 0.3826243799789569, + "flos": 45664267795200.0, + "grad_norm": 1.6258867054195516, + "language_loss": 0.59107661, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.6683619, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15356445, + "step": 6364, + "time_per_iteration": 2.745589256286621 + }, + { + "auxiliary_loss_clip": 0.06465845, + "auxiliary_loss_mlp": 0.0127531, + "balance_loss_clip": 0.06290866, + "balance_loss_mlp": 0.01261058, + "epoch": 0.38268450323162484, + "flos": 25308647245440.0, + "grad_norm": 2.2940920681906873, + "language_loss": 0.6951021, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.77251363, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14257812, + "step": 6365, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.06461614, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06285347, + "balance_loss_mlp": 0.0125451, + "epoch": 0.3827446264842928, + "flos": 21842607104640.0, + "grad_norm": 2.2040506714686208, + "language_loss": 0.73211187, + "learning_rate": 2.830668992382758e-06, + "loss": 0.8094269, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15374756, + "step": 6366, + "time_per_iteration": 2.527252435684204 + }, + { + "auxiliary_loss_clip": 0.06455328, + "auxiliary_loss_mlp": 0.01270912, + "balance_loss_clip": 0.06284537, + "balance_loss_mlp": 0.0125703, + "epoch": 0.38280474973696077, + "flos": 25740924059520.0, + "grad_norm": 2.537372436592335, + "language_loss": 0.69208872, + "learning_rate": 2.830314695509902e-06, + "loss": 0.76935112, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13885498, + "step": 6367, + "time_per_iteration": 2.563174247741699 + }, + { + "auxiliary_loss_clip": 0.06445135, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06281811, + "balance_loss_mlp": 0.01253482, + "epoch": 0.38286487298962874, + "flos": 24902212216320.0, + "grad_norm": 2.529219827632029, + "language_loss": 0.64519894, + "learning_rate": 2.82996036715143e-06, + "loss": 0.72232389, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13897705, + "step": 6368, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.0644632, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01255111, + "epoch": 0.3829249962422967, + "flos": 28550457060480.0, + "grad_norm": 1.3073196657605344, + "language_loss": 0.68441451, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76156569, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13677979, + "step": 6369, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.06452611, + "auxiliary_loss_mlp": 0.01268713, + "balance_loss_clip": 0.0628352, + "balance_loss_mlp": 0.01254724, + "epoch": 0.38298511949496467, + "flos": 21477736500480.0, + "grad_norm": 1.6896603918496267, + "language_loss": 0.79100078, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86821401, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13995361, + "step": 6370, + "time_per_iteration": 2.5265746116638184 + }, + { + "auxiliary_loss_clip": 0.06451623, + "auxiliary_loss_mlp": 0.0127085, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01256265, + "epoch": 0.38304524274763263, + "flos": 31687027747200.0, + "grad_norm": 2.908092380852583, + "language_loss": 0.651667, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.72889173, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.06459577, + "auxiliary_loss_mlp": 0.01272301, + "balance_loss_clip": 0.06283382, + "balance_loss_mlp": 0.01257543, + "epoch": 0.3831053660003006, + "flos": 25082865619200.0, + "grad_norm": 2.362243450203488, + "language_loss": 0.73142469, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.80874348, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14746094, + "step": 6372, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01266707, + "balance_loss_clip": 0.06282556, + "balance_loss_mlp": 0.01252485, + "epoch": 0.38316548925296856, + "flos": 23265849830400.0, + "grad_norm": 1.5439174716844835, + "language_loss": 0.85255867, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92977273, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14221191, + "step": 6373, + "time_per_iteration": 4.056765794754028 + }, + { + "auxiliary_loss_clip": 0.0645606, + "auxiliary_loss_mlp": 0.01272183, + "balance_loss_clip": 0.06281903, + "balance_loss_mlp": 0.01257431, + "epoch": 0.3832256125056366, + "flos": 34432131358080.0, + "grad_norm": 8.29118461423438, + "language_loss": 0.75127506, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.82855743, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14758301, + "step": 6374, + "time_per_iteration": 2.739825963973999 + }, + { + "auxiliary_loss_clip": 0.06457414, + "auxiliary_loss_mlp": 0.01272454, + "balance_loss_clip": 0.0628335, + "balance_loss_mlp": 0.01258042, + "epoch": 0.38328573575830455, + "flos": 21769289182080.0, + "grad_norm": 1.9434329018980874, + "language_loss": 0.76033717, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.83763582, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14416504, + "step": 6375, + "time_per_iteration": 2.521092176437378 + }, + { + "auxiliary_loss_clip": 0.06457017, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06283681, + "balance_loss_mlp": 0.01252541, + "epoch": 0.3833458590109725, + "flos": 17385056000640.0, + "grad_norm": 2.081333613596134, + "language_loss": 0.73067588, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.80791855, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1472168, + "step": 6376, + "time_per_iteration": 3.913828134536743 + }, + { + "auxiliary_loss_clip": 0.06451094, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06282462, + "balance_loss_mlp": 0.01251294, + "epoch": 0.3834059822636405, + "flos": 29432326556160.0, + "grad_norm": 1.6469866452188906, + "language_loss": 0.68444526, + "learning_rate": 2.826769997289796e-06, + "loss": 0.76161826, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14916992, + "step": 6377, + "time_per_iteration": 2.552703857421875 + }, + { + "auxiliary_loss_clip": 0.0646103, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01253413, + "epoch": 0.38346610551630844, + "flos": 21477191448960.0, + "grad_norm": 1.937210921117629, + "language_loss": 0.73608565, + "learning_rate": 2.826415354814344e-06, + "loss": 0.8133859, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15582275, + "step": 6378, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.06455162, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283469, + "balance_loss_mlp": 0.01257661, + "epoch": 0.3835262287689764, + "flos": 27568253900160.0, + "grad_norm": 1.6187724503548255, + "language_loss": 0.69142127, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76869053, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14099121, + "step": 6379, + "time_per_iteration": 2.540184736251831 + }, + { + "auxiliary_loss_clip": 0.06449591, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01258209, + "epoch": 0.3835863520216444, + "flos": 15529201044480.0, + "grad_norm": 1.7677581121541173, + "language_loss": 0.8420229, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.91923743, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13659668, + "step": 6380, + "time_per_iteration": 3.9425628185272217 + }, + { + "auxiliary_loss_clip": 0.06454644, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06286694, + "balance_loss_mlp": 0.01255786, + "epoch": 0.38364647527431234, + "flos": 21910851855360.0, + "grad_norm": 1.4264464063638025, + "language_loss": 0.81255281, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.88980293, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14569092, + "step": 6381, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.06363897, + "auxiliary_loss_mlp": 0.0126892, + "balance_loss_clip": 0.06286111, + "balance_loss_mlp": 0.01265082, + "epoch": 0.3837065985269803, + "flos": 65553076120320.0, + "grad_norm": 0.8198763586735168, + "language_loss": 0.60085058, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.67717874, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 6382, + "time_per_iteration": 3.1118690967559814 + }, + { + "auxiliary_loss_clip": 0.06458844, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06285119, + "balance_loss_mlp": 0.01257375, + "epoch": 0.38376672177964827, + "flos": 28264103331840.0, + "grad_norm": 2.361672223919581, + "language_loss": 0.67004663, + "learning_rate": 2.824641672639794e-06, + "loss": 0.74736154, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15264893, + "step": 6383, + "time_per_iteration": 3.949587345123291 + }, + { + "auxiliary_loss_clip": 0.06458098, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285569, + "balance_loss_mlp": 0.01255919, + "epoch": 0.38382684503231623, + "flos": 20637641064960.0, + "grad_norm": 1.580160930907899, + "language_loss": 0.75169957, + "learning_rate": 2.824286842339587e-06, + "loss": 0.82898319, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14355469, + "step": 6384, + "time_per_iteration": 2.5578341484069824 + }, + { + "auxiliary_loss_clip": 0.0645394, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06286485, + "balance_loss_mlp": 0.01259819, + "epoch": 0.3838869682849842, + "flos": 19611274003200.0, + "grad_norm": 1.4416039952500834, + "language_loss": 0.76348937, + "learning_rate": 2.823931980782341e-06, + "loss": 0.84075809, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6385, + "time_per_iteration": 2.5225770473480225 + }, + { + "auxiliary_loss_clip": 0.06357871, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06280675, + "balance_loss_mlp": 0.01261296, + "epoch": 0.38394709153765216, + "flos": 56572202856960.0, + "grad_norm": 1.1093406194632214, + "language_loss": 0.67841589, + "learning_rate": 2.82357708798151e-06, + "loss": 0.75464916, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.04168701, + "step": 6386, + "time_per_iteration": 3.0481390953063965 + }, + { + "auxiliary_loss_clip": 0.06453113, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06286535, + "balance_loss_mlp": 0.01254777, + "epoch": 0.3840072147903202, + "flos": 15894323210880.0, + "grad_norm": 1.5665063027995272, + "language_loss": 0.72740716, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.80462623, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6387, + "time_per_iteration": 2.514692783355713 + }, + { + "auxiliary_loss_clip": 0.06447147, + "auxiliary_loss_mlp": 0.01275854, + "balance_loss_clip": 0.06283197, + "balance_loss_mlp": 0.0126187, + "epoch": 0.38406733804298815, + "flos": 28225180310400.0, + "grad_norm": 2.2869557055676095, + "language_loss": 0.81707162, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89430165, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13989258, + "step": 6388, + "time_per_iteration": 2.6592257022857666 + }, + { + "auxiliary_loss_clip": 0.06454118, + "auxiliary_loss_mlp": 0.01267752, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01253888, + "epoch": 0.3841274612956561, + "flos": 18229511848320.0, + "grad_norm": 1.6912658906890043, + "language_loss": 0.76762819, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84484684, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.13873291, + "step": 6389, + "time_per_iteration": 2.5315403938293457 + }, + { + "auxiliary_loss_clip": 0.06454799, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06281878, + "balance_loss_mlp": 0.01254847, + "epoch": 0.3841875845483241, + "flos": 19799138856960.0, + "grad_norm": 1.6723623276481432, + "language_loss": 0.76991975, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.84717548, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15905762, + "step": 6390, + "time_per_iteration": 2.5315029621124268 + }, + { + "auxiliary_loss_clip": 0.0646126, + "auxiliary_loss_mlp": 0.01271779, + "balance_loss_clip": 0.06286746, + "balance_loss_mlp": 0.01255572, + "epoch": 0.38424770780099204, + "flos": 29906670919680.0, + "grad_norm": 1.876202489708209, + "language_loss": 0.70321602, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1618042, + "step": 6391, + "time_per_iteration": 2.6110270023345947 + }, + { + "auxiliary_loss_clip": 0.06455616, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.06284156, + "balance_loss_mlp": 0.01258499, + "epoch": 0.38430783105366, + "flos": 20820013476480.0, + "grad_norm": 1.8135855175826887, + "language_loss": 0.83923954, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.91652524, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14459229, + "step": 6392, + "time_per_iteration": 2.5735576152801514 + }, + { + "auxiliary_loss_clip": 0.06461488, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01255162, + "epoch": 0.384367954306328, + "flos": 11003153627520.0, + "grad_norm": 1.9242234625767662, + "language_loss": 0.61454862, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.69185179, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13677979, + "step": 6393, + "time_per_iteration": 2.4626450538635254 + }, + { + "auxiliary_loss_clip": 0.06467697, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06290497, + "balance_loss_mlp": 0.01256071, + "epoch": 0.38442807755899594, + "flos": 25345096571520.0, + "grad_norm": 2.1306446802295325, + "language_loss": 0.71410203, + "learning_rate": 2.820736822421029e-06, + "loss": 0.79149896, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15905762, + "step": 6394, + "time_per_iteration": 2.5997071266174316 + }, + { + "auxiliary_loss_clip": 0.06463788, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0628664, + "balance_loss_mlp": 0.01254082, + "epoch": 0.3844882008116639, + "flos": 21076206935040.0, + "grad_norm": 1.9216116882295546, + "language_loss": 0.82087183, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.89820337, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1529541, + "step": 6395, + "time_per_iteration": 2.517411470413208 + }, + { + "auxiliary_loss_clip": 0.06460339, + "auxiliary_loss_mlp": 0.01275993, + "balance_loss_clip": 0.06287727, + "balance_loss_mlp": 0.01261831, + "epoch": 0.38454832406433187, + "flos": 17968287144960.0, + "grad_norm": 2.112818402600052, + "language_loss": 0.70801687, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.78538024, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14160156, + "step": 6396, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.06365301, + "auxiliary_loss_mlp": 0.01257609, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01253767, + "epoch": 0.38460844731699984, + "flos": 67946641925760.0, + "grad_norm": 0.873922952794391, + "language_loss": 0.59863293, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.67486203, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0383606, + "step": 6397, + "time_per_iteration": 3.206678628921509 + }, + { + "auxiliary_loss_clip": 0.06450997, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06284742, + "balance_loss_mlp": 0.0126187, + "epoch": 0.3846685705696678, + "flos": 25856267604480.0, + "grad_norm": 1.772406293141946, + "language_loss": 0.85227352, + "learning_rate": 2.819315942271794e-06, + "loss": 0.92954701, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14477539, + "step": 6398, + "time_per_iteration": 2.5761947631835938 + }, + { + "auxiliary_loss_clip": 0.06453151, + "auxiliary_loss_mlp": 0.01277177, + "balance_loss_clip": 0.06285614, + "balance_loss_mlp": 0.01262467, + "epoch": 0.38472869382233577, + "flos": 16295852776320.0, + "grad_norm": 2.386881726324987, + "language_loss": 0.80489028, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.88219357, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14715576, + "step": 6399, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.06455526, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06283697, + "balance_loss_mlp": 0.01263592, + "epoch": 0.38478881707500373, + "flos": 19358979759360.0, + "grad_norm": 1.8772073039605681, + "language_loss": 0.67565721, + "learning_rate": 2.818605315732038e-06, + "loss": 0.75300437, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15588379, + "step": 6400, + "time_per_iteration": 2.5162830352783203 + }, + { + "auxiliary_loss_clip": 0.06460319, + "auxiliary_loss_mlp": 0.01269914, + "balance_loss_clip": 0.06288355, + "balance_loss_mlp": 0.01255454, + "epoch": 0.38484894032767175, + "flos": 24867356117760.0, + "grad_norm": 1.6933093627789975, + "language_loss": 0.7382642, + "learning_rate": 2.81824995589303e-06, + "loss": 0.81556654, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14459229, + "step": 6401, + "time_per_iteration": 2.5274739265441895 + }, + { + "auxiliary_loss_clip": 0.06457724, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 0.06285743, + "balance_loss_mlp": 0.01262296, + "epoch": 0.3849090635803397, + "flos": 14507068613760.0, + "grad_norm": 1.836175131611194, + "language_loss": 0.72368169, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.80103827, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15637207, + "step": 6402, + "time_per_iteration": 2.509624481201172 + }, + { + "auxiliary_loss_clip": 0.06455728, + "auxiliary_loss_mlp": 0.01275333, + "balance_loss_clip": 0.06288305, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3849691868330077, + "flos": 18521903070720.0, + "grad_norm": 1.8063322577059318, + "language_loss": 0.83321881, + "learning_rate": 2.817539143144128e-06, + "loss": 0.91052943, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14147949, + "step": 6403, + "time_per_iteration": 2.469576835632324 + }, + { + "auxiliary_loss_clip": 0.06451748, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06283461, + "balance_loss_mlp": 0.01259813, + "epoch": 0.38502931008567565, + "flos": 21622821045120.0, + "grad_norm": 1.901744090638215, + "language_loss": 0.83685166, + "learning_rate": 2.817183690261189e-06, + "loss": 0.91411054, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14331055, + "step": 6404, + "time_per_iteration": 2.53399920463562 + }, + { + "auxiliary_loss_clip": 0.06460617, + "auxiliary_loss_mlp": 0.01279935, + "balance_loss_clip": 0.06287636, + "balance_loss_mlp": 0.01265844, + "epoch": 0.3850894333383436, + "flos": 25423152249600.0, + "grad_norm": 1.4804001380923333, + "language_loss": 0.70053053, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77793604, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14105225, + "step": 6405, + "time_per_iteration": 2.577394485473633 + }, + { + "auxiliary_loss_clip": 0.06446706, + "auxiliary_loss_mlp": 0.01276604, + "balance_loss_clip": 0.06280848, + "balance_loss_mlp": 0.01263628, + "epoch": 0.3851495565910116, + "flos": 20233721658240.0, + "grad_norm": 1.9002503642999313, + "language_loss": 0.7926501, + "learning_rate": 2.816472691545729e-06, + "loss": 0.86988324, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12976074, + "step": 6406, + "time_per_iteration": 2.491785764694214 + }, + { + "auxiliary_loss_clip": 0.06454885, + "auxiliary_loss_mlp": 0.01271692, + "balance_loss_clip": 0.06282916, + "balance_loss_mlp": 0.01256516, + "epoch": 0.38520967984367954, + "flos": 16514045608320.0, + "grad_norm": 2.2453520034380463, + "language_loss": 0.84628403, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.92354977, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1517334, + "step": 6407, + "time_per_iteration": 2.461927890777588 + }, + { + "auxiliary_loss_clip": 0.06351051, + "auxiliary_loss_mlp": 0.01274061, + "balance_loss_clip": 0.06273395, + "balance_loss_mlp": 0.01270625, + "epoch": 0.3852698030963475, + "flos": 61333088140800.0, + "grad_norm": 0.7518927461814024, + "language_loss": 0.64829391, + "learning_rate": 2.815761568987365e-06, + "loss": 0.72454506, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03445435, + "step": 6408, + "time_per_iteration": 3.195535659790039 + }, + { + "auxiliary_loss_clip": 0.06454469, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06283102, + "balance_loss_mlp": 0.01256383, + "epoch": 0.3853299263490155, + "flos": 22899595633920.0, + "grad_norm": 1.3862214198415879, + "language_loss": 0.73785079, + "learning_rate": 2.8154059613008e-06, + "loss": 0.8151083, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14904785, + "step": 6409, + "time_per_iteration": 2.5463829040527344 + }, + { + "auxiliary_loss_clip": 0.06465833, + "auxiliary_loss_mlp": 0.01272782, + "balance_loss_clip": 0.06287792, + "balance_loss_mlp": 0.01257667, + "epoch": 0.38539004960168344, + "flos": 20053655233920.0, + "grad_norm": 2.2638026574615076, + "language_loss": 0.70597708, + "learning_rate": 2.81505032269396e-06, + "loss": 0.78336322, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15100098, + "step": 6410, + "time_per_iteration": 2.4989383220672607 + }, + { + "auxiliary_loss_clip": 0.06347367, + "auxiliary_loss_mlp": 0.01259072, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01255689, + "epoch": 0.3854501728543514, + "flos": 68752971365760.0, + "grad_norm": 0.6472142759451909, + "language_loss": 0.6009953, + "learning_rate": 2.81469465318033e-06, + "loss": 0.67705965, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03390503, + "step": 6411, + "time_per_iteration": 3.221977472305298 + }, + { + "auxiliary_loss_clip": 0.06456396, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06285078, + "balance_loss_mlp": 0.01257266, + "epoch": 0.38551029610701937, + "flos": 20491214855040.0, + "grad_norm": 1.7976443608036217, + "language_loss": 0.78197634, + "learning_rate": 2.814338952773397e-06, + "loss": 0.85925543, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.14245605, + "step": 6412, + "time_per_iteration": 2.5103437900543213 + }, + { + "auxiliary_loss_clip": 0.06460511, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 0.06287103, + "balance_loss_mlp": 0.01255267, + "epoch": 0.38557041935968733, + "flos": 23477627825280.0, + "grad_norm": 1.8586112834781277, + "language_loss": 0.78031844, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.85764652, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.17041016, + "step": 6413, + "time_per_iteration": 3.933619499206543 + }, + { + "auxiliary_loss_clip": 0.06342902, + "auxiliary_loss_mlp": 0.01258937, + "balance_loss_clip": 0.06265719, + "balance_loss_mlp": 0.01255421, + "epoch": 0.38563054261235535, + "flos": 63984623068800.0, + "grad_norm": 0.7920557210391271, + "language_loss": 0.61310911, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6891275, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03527832, + "step": 6414, + "time_per_iteration": 3.063016891479492 + }, + { + "auxiliary_loss_clip": 0.06460327, + "auxiliary_loss_mlp": 0.0126994, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01255552, + "epoch": 0.3856906658650233, + "flos": 23994584789760.0, + "grad_norm": 1.981122511442252, + "language_loss": 0.78303337, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.86033607, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14379883, + "step": 6415, + "time_per_iteration": 3.915883779525757 + }, + { + "auxiliary_loss_clip": 0.06448652, + "auxiliary_loss_mlp": 0.0126708, + "balance_loss_clip": 0.06285002, + "balance_loss_mlp": 0.01253842, + "epoch": 0.3857507891176913, + "flos": 25014075816960.0, + "grad_norm": 1.7132059772930233, + "language_loss": 0.8030045, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.88016176, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13244629, + "step": 6416, + "time_per_iteration": 2.5699849128723145 + }, + { + "auxiliary_loss_clip": 0.06451176, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01256353, + "epoch": 0.38581091237035925, + "flos": 21542082036480.0, + "grad_norm": 1.7425936217489657, + "language_loss": 0.79650658, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.87372106, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13909912, + "step": 6417, + "time_per_iteration": 2.490114450454712 + }, + { + "auxiliary_loss_clip": 0.06448381, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01252602, + "epoch": 0.3858710356230272, + "flos": 17389584120960.0, + "grad_norm": 1.6880082960892822, + "language_loss": 0.80518526, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.88233447, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13922119, + "step": 6418, + "time_per_iteration": 2.5246312618255615 + }, + { + "auxiliary_loss_clip": 0.06443715, + "auxiliary_loss_mlp": 0.01268216, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01254662, + "epoch": 0.3859311588756952, + "flos": 20345836821120.0, + "grad_norm": 1.685120659988575, + "language_loss": 0.79909503, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.87621439, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13568115, + "step": 6419, + "time_per_iteration": 3.9288835525512695 + }, + { + "auxiliary_loss_clip": 0.06446663, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06280138, + "balance_loss_mlp": 0.01254745, + "epoch": 0.38599128212836314, + "flos": 26328054418560.0, + "grad_norm": 1.9252922162684358, + "language_loss": 0.67831242, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.75548029, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15362549, + "step": 6420, + "time_per_iteration": 2.5568132400512695 + }, + { + "auxiliary_loss_clip": 0.06447464, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06282772, + "balance_loss_mlp": 0.01260267, + "epoch": 0.3860514053810311, + "flos": 13559050719360.0, + "grad_norm": 1.8138727093850848, + "language_loss": 0.81903851, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89625287, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13690186, + "step": 6421, + "time_per_iteration": 2.6095190048217773 + }, + { + "auxiliary_loss_clip": 0.06448883, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01254654, + "epoch": 0.3861115286336991, + "flos": 20959689432960.0, + "grad_norm": 1.9472147710185277, + "language_loss": 0.72463268, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.80182374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15576172, + "step": 6422, + "time_per_iteration": 3.9032654762268066 + }, + { + "auxiliary_loss_clip": 0.06443937, + "auxiliary_loss_mlp": 0.01268443, + "balance_loss_clip": 0.06280221, + "balance_loss_mlp": 0.01254925, + "epoch": 0.38617165188636704, + "flos": 16368290231040.0, + "grad_norm": 1.6312257254810183, + "language_loss": 0.66935605, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.74647987, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13531494, + "step": 6423, + "time_per_iteration": 2.4858603477478027 + }, + { + "auxiliary_loss_clip": 0.06452656, + "auxiliary_loss_mlp": 0.01269446, + "balance_loss_clip": 0.06281117, + "balance_loss_mlp": 0.01254771, + "epoch": 0.386231775139035, + "flos": 34795828005120.0, + "grad_norm": 1.7836916741722195, + "language_loss": 0.69448572, + "learning_rate": 2.810068143123449e-06, + "loss": 0.77170676, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14685059, + "step": 6424, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.06446116, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 0.0628031, + "balance_loss_mlp": 0.0125616, + "epoch": 0.38629189839170297, + "flos": 21732672147840.0, + "grad_norm": 1.4876753960050375, + "language_loss": 0.72829968, + "learning_rate": 2.809712042331429e-06, + "loss": 0.80545902, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13677979, + "step": 6425, + "time_per_iteration": 2.520872116088867 + }, + { + "auxiliary_loss_clip": 0.06454374, + "auxiliary_loss_mlp": 0.01269159, + "balance_loss_clip": 0.06279134, + "balance_loss_mlp": 0.01254383, + "epoch": 0.38635202164437094, + "flos": 27930315392640.0, + "grad_norm": 3.253764220801107, + "language_loss": 0.8113848, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88862014, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14752197, + "step": 6426, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.06458677, + "auxiliary_loss_mlp": 0.01277199, + "balance_loss_clip": 0.06288534, + "balance_loss_mlp": 0.01261797, + "epoch": 0.38641214489703896, + "flos": 23593390640640.0, + "grad_norm": 1.9966810796758758, + "language_loss": 0.75299263, + "learning_rate": 2.80899974864781e-06, + "loss": 0.83035141, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15393066, + "step": 6427, + "time_per_iteration": 2.538494825363159 + }, + { + "auxiliary_loss_clip": 0.06449243, + "auxiliary_loss_mlp": 0.01269948, + "balance_loss_clip": 0.0627961, + "balance_loss_mlp": 0.01255512, + "epoch": 0.3864722681497069, + "flos": 12646224339840.0, + "grad_norm": 1.7399599530073546, + "language_loss": 0.70451963, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78171146, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 6428, + "time_per_iteration": 2.501620292663574 + }, + { + "auxiliary_loss_clip": 0.06450263, + "auxiliary_loss_mlp": 0.01273584, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.01259517, + "epoch": 0.3865323914023749, + "flos": 17604003519360.0, + "grad_norm": 1.9791686977360912, + "language_loss": 0.84605539, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.92329377, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14074707, + "step": 6429, + "time_per_iteration": 2.4769797325134277 + }, + { + "auxiliary_loss_clip": 0.06453393, + "auxiliary_loss_mlp": 0.01272687, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01258679, + "epoch": 0.38659251465504285, + "flos": 18484908693120.0, + "grad_norm": 1.8799663311521415, + "language_loss": 0.81149292, + "learning_rate": 2.807931078076015e-06, + "loss": 0.88875371, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13995361, + "step": 6430, + "time_per_iteration": 2.552243232727051 + }, + { + "auxiliary_loss_clip": 0.06342202, + "auxiliary_loss_mlp": 0.0126596, + "balance_loss_clip": 0.06266356, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3866526379077108, + "flos": 64186533480960.0, + "grad_norm": 0.7018569193916078, + "language_loss": 0.58841789, + "learning_rate": 2.807574793260416e-06, + "loss": 0.66449958, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03408813, + "step": 6431, + "time_per_iteration": 3.1865365505218506 + }, + { + "auxiliary_loss_clip": 0.06457522, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06283836, + "balance_loss_mlp": 0.01253464, + "epoch": 0.3867127611603788, + "flos": 14392857098880.0, + "grad_norm": 1.8389423140015868, + "language_loss": 0.79719216, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.87445116, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14910889, + "step": 6432, + "time_per_iteration": 2.5060834884643555 + }, + { + "auxiliary_loss_clip": 0.06456694, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 0.06279335, + "balance_loss_mlp": 0.01259217, + "epoch": 0.38677288441304675, + "flos": 20016870491520.0, + "grad_norm": 2.041684818915054, + "language_loss": 0.80982423, + "learning_rate": 2.806862131772779e-06, + "loss": 0.88713682, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15350342, + "step": 6433, + "time_per_iteration": 2.4978644847869873 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01268045, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01251725, + "epoch": 0.3868330076657147, + "flos": 22243465837440.0, + "grad_norm": 1.5518308416482827, + "language_loss": 0.71316475, + "learning_rate": 2.806505755127765e-06, + "loss": 0.79036534, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.16308594, + "step": 6434, + "time_per_iteration": 2.5623676776885986 + }, + { + "auxiliary_loss_clip": 0.06457677, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01254547, + "epoch": 0.3868931309183827, + "flos": 16733076981120.0, + "grad_norm": 1.5292505515468358, + "language_loss": 0.77740347, + "learning_rate": 2.806149347899972e-06, + "loss": 0.85467923, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15350342, + "step": 6435, + "time_per_iteration": 2.4930777549743652 + }, + { + "auxiliary_loss_clip": 0.06446007, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 0.0627854, + "balance_loss_mlp": 0.01257594, + "epoch": 0.38695325417105064, + "flos": 22681360874880.0, + "grad_norm": 2.334489182765127, + "language_loss": 0.79902756, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87621707, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15362549, + "step": 6436, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.06446151, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628051, + "balance_loss_mlp": 0.01255312, + "epoch": 0.3870133774237186, + "flos": 23118668933760.0, + "grad_norm": 1.736913277816888, + "language_loss": 0.77232099, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.84947503, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13934326, + "step": 6437, + "time_per_iteration": 2.6555299758911133 + }, + { + "auxiliary_loss_clip": 0.064465, + "auxiliary_loss_mlp": 0.01272869, + "balance_loss_clip": 0.06279578, + "balance_loss_mlp": 0.01259422, + "epoch": 0.3870735006763866, + "flos": 17681430291840.0, + "grad_norm": 2.573442514460841, + "language_loss": 0.81961322, + "learning_rate": 2.805079942855074e-06, + "loss": 0.89680696, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13452148, + "step": 6438, + "time_per_iteration": 2.55658221244812 + }, + { + "auxiliary_loss_clip": 0.06449786, + "auxiliary_loss_mlp": 0.01268651, + "balance_loss_clip": 0.06278464, + "balance_loss_mlp": 0.01253869, + "epoch": 0.38713362392905454, + "flos": 23302676499840.0, + "grad_norm": 1.3535213690135137, + "language_loss": 0.75684851, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83403289, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14782715, + "step": 6439, + "time_per_iteration": 2.5023999214172363 + }, + { + "auxiliary_loss_clip": 0.06452194, + "auxiliary_loss_mlp": 0.01275332, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.0126083, + "epoch": 0.38719374718172256, + "flos": 21037283913600.0, + "grad_norm": 2.8624272787557556, + "language_loss": 0.74227071, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81954598, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1449585, + "step": 6440, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.06454886, + "auxiliary_loss_mlp": 0.01272767, + "balance_loss_clip": 0.06279822, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3872538704343905, + "flos": 19615885977600.0, + "grad_norm": 1.8472167429080706, + "language_loss": 0.82205182, + "learning_rate": 2.804010263051774e-06, + "loss": 0.89932835, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15368652, + "step": 6441, + "time_per_iteration": 2.4829154014587402 + }, + { + "auxiliary_loss_clip": 0.06449816, + "auxiliary_loss_mlp": 0.01273448, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01258833, + "epoch": 0.3873139936870585, + "flos": 17535800695680.0, + "grad_norm": 2.061540845511299, + "language_loss": 0.80687004, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8841027, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14593506, + "step": 6442, + "time_per_iteration": 2.5348403453826904 + }, + { + "auxiliary_loss_clip": 0.0645024, + "auxiliary_loss_mlp": 0.01274941, + "balance_loss_clip": 0.0628161, + "balance_loss_mlp": 0.01260302, + "epoch": 0.38737411693972645, + "flos": 17792539205760.0, + "grad_norm": 1.5850563005203315, + "language_loss": 0.84242606, + "learning_rate": 2.803296990719624e-06, + "loss": 0.91967785, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14642334, + "step": 6443, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.06346577, + "auxiliary_loss_mlp": 0.01257136, + "balance_loss_clip": 0.06270638, + "balance_loss_mlp": 0.01253804, + "epoch": 0.3874342401923944, + "flos": 58320554624640.0, + "grad_norm": 0.7460963165264183, + "language_loss": 0.5025984, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.57863545, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03338623, + "step": 6444, + "time_per_iteration": 3.146993398666382 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01267857, + "balance_loss_clip": 0.0627708, + "balance_loss_mlp": 0.01254088, + "epoch": 0.3874943634450624, + "flos": 17717628055680.0, + "grad_norm": 1.4103476418524727, + "language_loss": 0.79081571, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86789179, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 6445, + "time_per_iteration": 2.4769954681396484 + }, + { + "auxiliary_loss_clip": 0.06442489, + "auxiliary_loss_mlp": 0.01275349, + "balance_loss_clip": 0.06277544, + "balance_loss_mlp": 0.01261497, + "epoch": 0.38755448669773035, + "flos": 19250889592320.0, + "grad_norm": 1.890349589911811, + "language_loss": 0.81530821, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.89248657, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13861084, + "step": 6446, + "time_per_iteration": 2.5224525928497314 + }, + { + "auxiliary_loss_clip": 0.06442682, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01262489, + "epoch": 0.3876146099503983, + "flos": 20600437052160.0, + "grad_norm": 2.019397578580159, + "language_loss": 0.77555805, + "learning_rate": 2.801870080630306e-06, + "loss": 0.85275555, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14575195, + "step": 6447, + "time_per_iteration": 2.4808783531188965 + }, + { + "auxiliary_loss_clip": 0.06441282, + "auxiliary_loss_mlp": 0.01273458, + "balance_loss_clip": 0.06277911, + "balance_loss_mlp": 0.01259355, + "epoch": 0.3876747332030663, + "flos": 19287129283200.0, + "grad_norm": 1.5926200346390118, + "language_loss": 0.76299512, + "learning_rate": 2.801513277056671e-06, + "loss": 0.84014249, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14099121, + "step": 6448, + "time_per_iteration": 2.532101631164551 + }, + { + "auxiliary_loss_clip": 0.06445228, + "auxiliary_loss_mlp": 0.01276025, + "balance_loss_clip": 0.06280892, + "balance_loss_mlp": 0.01262363, + "epoch": 0.38773485645573424, + "flos": 18950699940480.0, + "grad_norm": 1.5288018173805344, + "language_loss": 0.76734072, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.84455323, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13647461, + "step": 6449, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.06448871, + "auxiliary_loss_mlp": 0.01273884, + "balance_loss_clip": 0.0627744, + "balance_loss_mlp": 0.01258673, + "epoch": 0.3877949797084022, + "flos": 23077272216960.0, + "grad_norm": 1.7542495709483765, + "language_loss": 0.78832948, + "learning_rate": 2.800799578742542e-06, + "loss": 0.86555696, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15209961, + "step": 6450, + "time_per_iteration": 2.5662050247192383 + }, + { + "auxiliary_loss_clip": 0.06452119, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.06276712, + "balance_loss_mlp": 0.01261317, + "epoch": 0.3878551029610702, + "flos": 29103150591360.0, + "grad_norm": 2.1638461576043095, + "language_loss": 0.78188771, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8591727, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.15063477, + "step": 6451, + "time_per_iteration": 2.5734686851501465 + }, + { + "auxiliary_loss_clip": 0.06442447, + "auxiliary_loss_mlp": 0.01277813, + "balance_loss_clip": 0.06278168, + "balance_loss_mlp": 0.01263967, + "epoch": 0.38791522621373814, + "flos": 21002763231360.0, + "grad_norm": 1.7745661107883532, + "language_loss": 0.76657486, + "learning_rate": 2.800085758962812e-06, + "loss": 0.84377748, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13842773, + "step": 6452, + "time_per_iteration": 4.083965301513672 + }, + { + "auxiliary_loss_clip": 0.06445795, + "auxiliary_loss_mlp": 0.01272941, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01258457, + "epoch": 0.3879753494664061, + "flos": 15492248593920.0, + "grad_norm": 1.5775897118958155, + "language_loss": 0.80075014, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87793756, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14483643, + "step": 6453, + "time_per_iteration": 2.5186924934387207 + }, + { + "auxiliary_loss_clip": 0.06452494, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06277925, + "balance_loss_mlp": 0.01258472, + "epoch": 0.3880354727190741, + "flos": 22060422593280.0, + "grad_norm": 1.7271767654368522, + "language_loss": 0.71748114, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.79473794, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14697266, + "step": 6454, + "time_per_iteration": 2.516023635864258 + }, + { + "auxiliary_loss_clip": 0.0645522, + "auxiliary_loss_mlp": 0.01280556, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01263986, + "epoch": 0.3880955959717421, + "flos": 20346675361920.0, + "grad_norm": 2.0562500360548452, + "language_loss": 0.77941358, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.85677135, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.16577148, + "step": 6455, + "time_per_iteration": 3.9251530170440674 + }, + { + "auxiliary_loss_clip": 0.0644723, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 0.062791, + "balance_loss_mlp": 0.01257804, + "epoch": 0.38815571922441006, + "flos": 23082009972480.0, + "grad_norm": 1.5355571660803105, + "language_loss": 0.76081556, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14196777, + "step": 6456, + "time_per_iteration": 2.5377979278564453 + }, + { + "auxiliary_loss_clip": 0.064498, + "auxiliary_loss_mlp": 0.01279611, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01264811, + "epoch": 0.388215842477078, + "flos": 20783186807040.0, + "grad_norm": 2.2521174172947838, + "language_loss": 0.60975528, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.68704933, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14801025, + "step": 6457, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.06447765, + "auxiliary_loss_mlp": 0.01274853, + "balance_loss_clip": 0.06275971, + "balance_loss_mlp": 0.01259308, + "epoch": 0.388275965729746, + "flos": 20454304331520.0, + "grad_norm": 3.4499577756661384, + "language_loss": 0.80527538, + "learning_rate": 2.797943571912841e-06, + "loss": 0.88250154, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15551758, + "step": 6458, + "time_per_iteration": 2.5349881649017334 + }, + { + "auxiliary_loss_clip": 0.06448271, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06278434, + "balance_loss_mlp": 0.0125938, + "epoch": 0.38833608898241395, + "flos": 27899945487360.0, + "grad_norm": 3.532155031934189, + "language_loss": 0.8156774, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89290321, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14941406, + "step": 6459, + "time_per_iteration": 4.015187978744507 + }, + { + "auxiliary_loss_clip": 0.0644253, + "auxiliary_loss_mlp": 0.01277266, + "balance_loss_clip": 0.06278129, + "balance_loss_mlp": 0.01263789, + "epoch": 0.3883962122350819, + "flos": 18082079389440.0, + "grad_norm": 1.6405749509561738, + "language_loss": 0.62564123, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.7028392, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13470459, + "step": 6460, + "time_per_iteration": 2.497053861618042 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01273315, + "balance_loss_clip": 0.06277992, + "balance_loss_mlp": 0.01259374, + "epoch": 0.3884563354877499, + "flos": 23628875644800.0, + "grad_norm": 1.560750838950793, + "language_loss": 0.86785483, + "learning_rate": 2.796872069720717e-06, + "loss": 0.94503951, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1394043, + "step": 6461, + "time_per_iteration": 2.5308427810668945 + }, + { + "auxiliary_loss_clip": 0.06442384, + "auxiliary_loss_mlp": 0.01273139, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01258369, + "epoch": 0.38851645874041785, + "flos": 27460834565760.0, + "grad_norm": 2.5738865735247285, + "language_loss": 0.71770304, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.79485828, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14782715, + "step": 6462, + "time_per_iteration": 3.942819833755493 + }, + { + "auxiliary_loss_clip": 0.06442184, + "auxiliary_loss_mlp": 0.01271045, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256036, + "epoch": 0.3885765819930858, + "flos": 25235035833600.0, + "grad_norm": 2.2250707690072886, + "language_loss": 0.76693827, + "learning_rate": 2.796157583816052e-06, + "loss": 0.84407055, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15014648, + "step": 6463, + "time_per_iteration": 2.577254056930542 + }, + { + "auxiliary_loss_clip": 0.06458563, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01259441, + "epoch": 0.3886367052457538, + "flos": 16952317989120.0, + "grad_norm": 2.5235079856597196, + "language_loss": 0.70838499, + "learning_rate": 2.795800295571382e-06, + "loss": 0.78572428, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15930176, + "step": 6464, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.06442419, + "auxiliary_loss_mlp": 0.01270994, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01255699, + "epoch": 0.38869682849842174, + "flos": 27160141789440.0, + "grad_norm": 1.8571499226781363, + "language_loss": 0.69473737, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77187151, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.15301514, + "step": 6465, + "time_per_iteration": 2.6060595512390137 + }, + { + "auxiliary_loss_clip": 0.06446355, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06276145, + "balance_loss_mlp": 0.01257271, + "epoch": 0.3887569517510897, + "flos": 21069037411200.0, + "grad_norm": 2.3078416168388243, + "language_loss": 0.78628361, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.86347771, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.15771484, + "step": 6466, + "time_per_iteration": 2.503218650817871 + }, + { + "auxiliary_loss_clip": 0.06447446, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01255, + "epoch": 0.38881707500375773, + "flos": 29505141354240.0, + "grad_norm": 1.7748655394270907, + "language_loss": 0.695912, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77307892, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1427002, + "step": 6467, + "time_per_iteration": 2.6156952381134033 + }, + { + "auxiliary_loss_clip": 0.0644877, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277345, + "balance_loss_mlp": 0.01255403, + "epoch": 0.3888771982564257, + "flos": 17493146167680.0, + "grad_norm": 2.2278384059050285, + "language_loss": 0.83988351, + "learning_rate": 2.794370840959936e-06, + "loss": 0.91706932, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14404297, + "step": 6468, + "time_per_iteration": 2.446979522705078 + }, + { + "auxiliary_loss_clip": 0.0644114, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06273733, + "balance_loss_mlp": 0.01254628, + "epoch": 0.38893732150909366, + "flos": 21948517065600.0, + "grad_norm": 2.4269891965149837, + "language_loss": 0.84667963, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92377871, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14141846, + "step": 6469, + "time_per_iteration": 2.6123251914978027 + }, + { + "auxiliary_loss_clip": 0.06445388, + "auxiliary_loss_mlp": 0.01267071, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01252575, + "epoch": 0.3889974447617616, + "flos": 24282657527040.0, + "grad_norm": 1.7885497899924685, + "language_loss": 0.75114912, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82827377, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 6470, + "time_per_iteration": 2.5293121337890625 + }, + { + "auxiliary_loss_clip": 0.06447375, + "auxiliary_loss_mlp": 0.01272376, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01257785, + "epoch": 0.3890575680144296, + "flos": 25674356390400.0, + "grad_norm": 2.975621998510204, + "language_loss": 0.75126278, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.8284604, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14575195, + "step": 6471, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.0644885, + "auxiliary_loss_mlp": 0.01268799, + "balance_loss_clip": 0.06277963, + "balance_loss_mlp": 0.01254291, + "epoch": 0.38911769126709755, + "flos": 22861636934400.0, + "grad_norm": 1.6871762941495017, + "language_loss": 0.68158531, + "learning_rate": 2.792940904386562e-06, + "loss": 0.75876176, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1451416, + "step": 6472, + "time_per_iteration": 2.5192203521728516 + }, + { + "auxiliary_loss_clip": 0.06449802, + "auxiliary_loss_mlp": 0.01271384, + "balance_loss_clip": 0.06278318, + "balance_loss_mlp": 0.01256739, + "epoch": 0.3891778145197655, + "flos": 25454612257920.0, + "grad_norm": 1.6537492711017865, + "language_loss": 0.76761287, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84482473, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14654541, + "step": 6473, + "time_per_iteration": 2.588179349899292 + }, + { + "auxiliary_loss_clip": 0.06451473, + "auxiliary_loss_mlp": 0.01269072, + "balance_loss_clip": 0.0627984, + "balance_loss_mlp": 0.01254803, + "epoch": 0.3892379377724335, + "flos": 14033227374720.0, + "grad_norm": 1.8453216957475485, + "language_loss": 0.71886337, + "learning_rate": 2.792225755635257e-06, + "loss": 0.79606879, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1427002, + "step": 6474, + "time_per_iteration": 2.5054657459259033 + }, + { + "auxiliary_loss_clip": 0.06452703, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.06280853, + "balance_loss_mlp": 0.01252945, + "epoch": 0.38929806102510145, + "flos": 20163715971840.0, + "grad_norm": 1.4152146042292184, + "language_loss": 0.68943882, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76663172, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1362915, + "step": 6475, + "time_per_iteration": 2.5646328926086426 + }, + { + "auxiliary_loss_clip": 0.06459899, + "auxiliary_loss_mlp": 0.01272247, + "balance_loss_clip": 0.06281739, + "balance_loss_mlp": 0.01257107, + "epoch": 0.3893581842777694, + "flos": 22170525258240.0, + "grad_norm": 1.7897820076570896, + "language_loss": 0.75474584, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15142822, + "step": 6476, + "time_per_iteration": 2.515145778656006 + }, + { + "auxiliary_loss_clip": 0.06356712, + "auxiliary_loss_mlp": 0.01262119, + "balance_loss_clip": 0.06275933, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3894183075304374, + "flos": 67322936459520.0, + "grad_norm": 0.7612569916112396, + "language_loss": 0.58157814, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.65776634, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.0276947, + "step": 6477, + "time_per_iteration": 3.147226572036743 + }, + { + "auxiliary_loss_clip": 0.06461065, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 0.06287047, + "balance_loss_mlp": 0.01258711, + "epoch": 0.38947843078310534, + "flos": 18552734173440.0, + "grad_norm": 2.207057593016708, + "language_loss": 0.77832031, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.85566759, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14953613, + "step": 6478, + "time_per_iteration": 2.5238850116729736 + }, + { + "auxiliary_loss_clip": 0.06450923, + "auxiliary_loss_mlp": 0.01273895, + "balance_loss_clip": 0.06281843, + "balance_loss_mlp": 0.01260162, + "epoch": 0.3895385540357733, + "flos": 14610253317120.0, + "grad_norm": 2.187508322407885, + "language_loss": 0.83306336, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6479, + "time_per_iteration": 2.5355920791625977 + }, + { + "auxiliary_loss_clip": 0.06451993, + "auxiliary_loss_mlp": 0.0126931, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.01254414, + "epoch": 0.38959867728844133, + "flos": 19981469341440.0, + "grad_norm": 1.7759645272954405, + "language_loss": 0.80297941, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8801924, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14892578, + "step": 6480, + "time_per_iteration": 2.51645565032959 + }, + { + "auxiliary_loss_clip": 0.06447603, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06278986, + "balance_loss_mlp": 0.01256924, + "epoch": 0.3896588005411093, + "flos": 22678342128000.0, + "grad_norm": 1.6438066173178132, + "language_loss": 0.83259583, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.90978175, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.140625, + "step": 6481, + "time_per_iteration": 2.542642116546631 + }, + { + "auxiliary_loss_clip": 0.06446713, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.0628217, + "balance_loss_mlp": 0.01257204, + "epoch": 0.38971892379377726, + "flos": 21002343960960.0, + "grad_norm": 1.5951406272778517, + "language_loss": 0.75640547, + "learning_rate": 2.789363960063863e-06, + "loss": 0.83357906, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13458252, + "step": 6482, + "time_per_iteration": 2.5500056743621826 + }, + { + "auxiliary_loss_clip": 0.06452929, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06281099, + "balance_loss_mlp": 0.01254853, + "epoch": 0.3897790470464452, + "flos": 22535060446080.0, + "grad_norm": 1.9197222218969183, + "language_loss": 0.78993875, + "learning_rate": 2.78900610077756e-06, + "loss": 0.86715591, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6483, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.06452915, + "auxiliary_loss_mlp": 0.01271475, + "balance_loss_clip": 0.06281908, + "balance_loss_mlp": 0.01256157, + "epoch": 0.3898391702991132, + "flos": 26216484307200.0, + "grad_norm": 1.4915682478636534, + "language_loss": 0.80430162, + "learning_rate": 2.788648211572067e-06, + "loss": 0.88154554, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6484, + "time_per_iteration": 2.582933187484741 + }, + { + "auxiliary_loss_clip": 0.06455952, + "auxiliary_loss_mlp": 0.01270999, + "balance_loss_clip": 0.06285131, + "balance_loss_mlp": 0.01255347, + "epoch": 0.38989929355178116, + "flos": 21071301471360.0, + "grad_norm": 1.959559170578303, + "language_loss": 0.7792083, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8564778, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15637207, + "step": 6485, + "time_per_iteration": 2.532944917678833 + }, + { + "auxiliary_loss_clip": 0.06453831, + "auxiliary_loss_mlp": 0.01268339, + "balance_loss_clip": 0.06280229, + "balance_loss_mlp": 0.01253444, + "epoch": 0.3899594168044491, + "flos": 25491229292160.0, + "grad_norm": 2.289645436499478, + "language_loss": 0.84979439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.92701602, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14898682, + "step": 6486, + "time_per_iteration": 2.5743820667266846 + }, + { + "auxiliary_loss_clip": 0.06453397, + "auxiliary_loss_mlp": 0.01267827, + "balance_loss_clip": 0.06278502, + "balance_loss_mlp": 0.01253141, + "epoch": 0.3900195400571171, + "flos": 31147415452800.0, + "grad_norm": 1.9273192838933928, + "language_loss": 0.85622168, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.93343389, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14672852, + "step": 6487, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.06449067, + "auxiliary_loss_mlp": 0.01273707, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01259121, + "epoch": 0.39007966330978505, + "flos": 20236111499520.0, + "grad_norm": 1.468779525903349, + "language_loss": 0.73436427, + "learning_rate": 2.787216355829633e-06, + "loss": 0.81159198, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14569092, + "step": 6488, + "time_per_iteration": 2.54925274848938 + }, + { + "auxiliary_loss_clip": 0.06455337, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06281433, + "balance_loss_mlp": 0.01255072, + "epoch": 0.390139786562453, + "flos": 22535353935360.0, + "grad_norm": 1.7339556546984902, + "language_loss": 0.68455738, + "learning_rate": 2.786858317231779e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 2.529337167739868 + }, + { + "auxiliary_loss_clip": 0.06445001, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01256079, + "epoch": 0.390199909815121, + "flos": 26440211508480.0, + "grad_norm": 1.5752653046558913, + "language_loss": 0.81221771, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88936543, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13690186, + "step": 6490, + "time_per_iteration": 2.580287218093872 + }, + { + "auxiliary_loss_clip": 0.06445351, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255784, + "epoch": 0.39026003306778895, + "flos": 17280278069760.0, + "grad_norm": 1.8612382479767444, + "language_loss": 0.89715946, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.97431856, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14782715, + "step": 6491, + "time_per_iteration": 2.476393461227417 + }, + { + "auxiliary_loss_clip": 0.06446734, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01256325, + "epoch": 0.3903201563204569, + "flos": 24539354110080.0, + "grad_norm": 1.7715634168525083, + "language_loss": 0.78570807, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.86288601, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 3.918022871017456 + }, + { + "auxiliary_loss_clip": 0.06448489, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06278895, + "balance_loss_mlp": 0.01255528, + "epoch": 0.39038027957312493, + "flos": 23774547168000.0, + "grad_norm": 1.9649032306705667, + "language_loss": 0.74995399, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.82713962, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14544678, + "step": 6493, + "time_per_iteration": 2.5337636470794678 + }, + { + "auxiliary_loss_clip": 0.06457585, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06280027, + "balance_loss_mlp": 0.0125341, + "epoch": 0.3904404028257929, + "flos": 14105832537600.0, + "grad_norm": 2.4323863844033498, + "language_loss": 0.76480663, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.84207416, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15771484, + "step": 6494, + "time_per_iteration": 3.9828202724456787 + }, + { + "auxiliary_loss_clip": 0.06461826, + "auxiliary_loss_mlp": 0.01272307, + "balance_loss_clip": 0.06279928, + "balance_loss_mlp": 0.01255582, + "epoch": 0.39050052607846086, + "flos": 16915742881920.0, + "grad_norm": 1.9306711407360488, + "language_loss": 0.74818373, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.82552505, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.16723633, + "step": 6495, + "time_per_iteration": 2.5104000568389893 + }, + { + "auxiliary_loss_clip": 0.06450078, + "auxiliary_loss_mlp": 0.01273142, + "balance_loss_clip": 0.06281738, + "balance_loss_mlp": 0.01257358, + "epoch": 0.39056064933112883, + "flos": 25921912878720.0, + "grad_norm": 2.748187950361319, + "language_loss": 0.68202364, + "learning_rate": 2.784351212350352e-06, + "loss": 0.75925589, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15783691, + "step": 6496, + "time_per_iteration": 2.550957202911377 + }, + { + "auxiliary_loss_clip": 0.0637021, + "auxiliary_loss_mlp": 0.01254222, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01251394, + "epoch": 0.3906207725837968, + "flos": 60046125281280.0, + "grad_norm": 0.6447698339715318, + "language_loss": 0.53706288, + "learning_rate": 2.783992935430775e-06, + "loss": 0.61330724, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02824402, + "step": 6497, + "time_per_iteration": 3.2988505363464355 + }, + { + "auxiliary_loss_clip": 0.06453034, + "auxiliary_loss_mlp": 0.01276113, + "balance_loss_clip": 0.06281406, + "balance_loss_mlp": 0.01261265, + "epoch": 0.39068089583646476, + "flos": 21074949123840.0, + "grad_norm": 2.0090604178847795, + "language_loss": 0.68947327, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.76676476, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14837646, + "step": 6498, + "time_per_iteration": 3.9722609519958496 + }, + { + "auxiliary_loss_clip": 0.06365327, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01252178, + "epoch": 0.3907410190891327, + "flos": 70468269897600.0, + "grad_norm": 0.719858085665683, + "language_loss": 0.51721394, + "learning_rate": 2.783276292417936e-06, + "loss": 0.59341711, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02807617, + "step": 6499, + "time_per_iteration": 3.209885835647583 + }, + { + "auxiliary_loss_clip": 0.06452541, + "auxiliary_loss_mlp": 0.01273785, + "balance_loss_clip": 0.06277416, + "balance_loss_mlp": 0.0125681, + "epoch": 0.3908011423418007, + "flos": 27969531903360.0, + "grad_norm": 1.5964691032272669, + "language_loss": 0.7347858, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81204903, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16992188, + "step": 6500, + "time_per_iteration": 2.5915534496307373 + }, + { + "auxiliary_loss_clip": 0.06456988, + "auxiliary_loss_mlp": 0.01269402, + "balance_loss_clip": 0.06284038, + "balance_loss_mlp": 0.01254728, + "epoch": 0.39086126559446865, + "flos": 24468971080320.0, + "grad_norm": 2.170342944486325, + "language_loss": 0.68858671, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7658506, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14691162, + "step": 6501, + "time_per_iteration": 3.948155164718628 + }, + { + "auxiliary_loss_clip": 0.06445958, + "auxiliary_loss_mlp": 0.01271431, + "balance_loss_clip": 0.06277448, + "balance_loss_mlp": 0.01256327, + "epoch": 0.3909213888471366, + "flos": 16946406276480.0, + "grad_norm": 1.631531331045391, + "language_loss": 0.78994954, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86712337, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15100098, + "step": 6502, + "time_per_iteration": 2.505021810531616 + }, + { + "auxiliary_loss_clip": 0.06451446, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.01259133, + "epoch": 0.3909815120998046, + "flos": 29286109981440.0, + "grad_norm": 4.8026818588998115, + "language_loss": 0.80286908, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.88011116, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13623047, + "step": 6503, + "time_per_iteration": 2.6041667461395264 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06278107, + "balance_loss_mlp": 0.01253574, + "epoch": 0.39104163535247255, + "flos": 18956947069440.0, + "grad_norm": 1.8714653526076386, + "language_loss": 0.71717298, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.79429626, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14379883, + "step": 6504, + "time_per_iteration": 2.499645471572876 + }, + { + "auxiliary_loss_clip": 0.06449269, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06279607, + "balance_loss_mlp": 0.0125379, + "epoch": 0.3911017586051405, + "flos": 26330611968000.0, + "grad_norm": 1.7094242767760466, + "language_loss": 0.83403468, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.91120219, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.137146, + "step": 6505, + "time_per_iteration": 2.5698060989379883 + }, + { + "auxiliary_loss_clip": 0.06447234, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06281015, + "balance_loss_mlp": 0.01253022, + "epoch": 0.3911618818578085, + "flos": 21842313615360.0, + "grad_norm": 2.3254017668705083, + "language_loss": 0.71427596, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.7914232, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14465332, + "step": 6506, + "time_per_iteration": 2.4988996982574463 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 0.0628104, + "balance_loss_mlp": 0.01258149, + "epoch": 0.3912220051104765, + "flos": 16364768359680.0, + "grad_norm": 2.639532414168514, + "language_loss": 0.75588799, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.83303547, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13348389, + "step": 6507, + "time_per_iteration": 2.506723403930664 + }, + { + "auxiliary_loss_clip": 0.06355534, + "auxiliary_loss_mlp": 0.01255368, + "balance_loss_clip": 0.0627788, + "balance_loss_mlp": 0.01252429, + "epoch": 0.39128212836314447, + "flos": 71071179552000.0, + "grad_norm": 0.751869236178363, + "language_loss": 0.56649405, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.64260316, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02935791, + "step": 6508, + "time_per_iteration": 3.282604455947876 + }, + { + "auxiliary_loss_clip": 0.06448714, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01254294, + "epoch": 0.39134225161581243, + "flos": 20336948288640.0, + "grad_norm": 1.8618605672003898, + "language_loss": 0.76758552, + "learning_rate": 2.779691297413471e-06, + "loss": 0.84475839, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14276123, + "step": 6509, + "time_per_iteration": 2.5330445766448975 + }, + { + "auxiliary_loss_clip": 0.0644654, + "auxiliary_loss_mlp": 0.01272023, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01256073, + "epoch": 0.3914023748684804, + "flos": 17023916903040.0, + "grad_norm": 3.0317271524647427, + "language_loss": 0.83418059, + "learning_rate": 2.779332635075825e-06, + "loss": 0.91136616, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1595459, + "step": 6510, + "time_per_iteration": 2.484149217605591 + }, + { + "auxiliary_loss_clip": 0.06450167, + "auxiliary_loss_mlp": 0.01268149, + "balance_loss_clip": 0.06277542, + "balance_loss_mlp": 0.01254463, + "epoch": 0.39146249812114836, + "flos": 18411045719040.0, + "grad_norm": 1.8343195842354416, + "language_loss": 0.77659726, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.85378045, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13684082, + "step": 6511, + "time_per_iteration": 2.493088722229004 + }, + { + "auxiliary_loss_clip": 0.06343137, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06266295, + "balance_loss_mlp": 0.01258513, + "epoch": 0.3915226213738163, + "flos": 67659659291520.0, + "grad_norm": 0.7080449531762238, + "language_loss": 0.57720256, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.65324628, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02726746, + "step": 6512, + "time_per_iteration": 3.217658042907715 + }, + { + "auxiliary_loss_clip": 0.06445479, + "auxiliary_loss_mlp": 0.01273045, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257452, + "epoch": 0.3915827446264843, + "flos": 26366516242560.0, + "grad_norm": 1.5252758876056967, + "language_loss": 0.69950658, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.77669179, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15600586, + "step": 6513, + "time_per_iteration": 2.560802936553955 + }, + { + "auxiliary_loss_clip": 0.06451759, + "auxiliary_loss_mlp": 0.01273121, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01258208, + "epoch": 0.39164286787915226, + "flos": 21950236074240.0, + "grad_norm": 2.7587511630204777, + "language_loss": 0.76322639, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.8404752, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14916992, + "step": 6514, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.0644438, + "auxiliary_loss_mlp": 0.01269565, + "balance_loss_clip": 0.06276566, + "balance_loss_mlp": 0.0125619, + "epoch": 0.3917029911318202, + "flos": 16405536170880.0, + "grad_norm": 1.811906351936664, + "language_loss": 0.782359, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.8594985, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13378906, + "step": 6515, + "time_per_iteration": 2.5104947090148926 + }, + { + "auxiliary_loss_clip": 0.06443886, + "auxiliary_loss_mlp": 0.01270163, + "balance_loss_clip": 0.06277545, + "balance_loss_mlp": 0.0125705, + "epoch": 0.3917631143844882, + "flos": 26218580659200.0, + "grad_norm": 1.4298617884300358, + "language_loss": 0.79790455, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.87504506, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13122559, + "step": 6516, + "time_per_iteration": 2.5912764072418213 + }, + { + "auxiliary_loss_clip": 0.06446922, + "auxiliary_loss_mlp": 0.0126951, + "balance_loss_clip": 0.06278265, + "balance_loss_mlp": 0.0125511, + "epoch": 0.39182323763715615, + "flos": 18553740422400.0, + "grad_norm": 1.8457537699229483, + "language_loss": 0.70234001, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7795043, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14404297, + "step": 6517, + "time_per_iteration": 2.630155324935913 + }, + { + "auxiliary_loss_clip": 0.06449963, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06279542, + "balance_loss_mlp": 0.01254905, + "epoch": 0.3918833608898241, + "flos": 34322112547200.0, + "grad_norm": 1.6944592538331644, + "language_loss": 0.72209281, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79928982, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1484375, + "step": 6518, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.06453219, + "auxiliary_loss_mlp": 0.0127268, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.0125751, + "epoch": 0.3919434841424921, + "flos": 36948434595840.0, + "grad_norm": 1.7409198797741048, + "language_loss": 0.62180024, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.69905925, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15179443, + "step": 6519, + "time_per_iteration": 2.6407580375671387 + }, + { + "auxiliary_loss_clip": 0.06457552, + "auxiliary_loss_mlp": 0.01269986, + "balance_loss_clip": 0.06280086, + "balance_loss_mlp": 0.01253535, + "epoch": 0.3920036073951601, + "flos": 23514915692160.0, + "grad_norm": 2.3243103288051485, + "language_loss": 0.6728406, + "learning_rate": 2.775744388563563e-06, + "loss": 0.75011599, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16442871, + "step": 6520, + "time_per_iteration": 2.557736396789551 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01272672, + "balance_loss_clip": 0.06281003, + "balance_loss_mlp": 0.0125845, + "epoch": 0.39206373064782807, + "flos": 18412051968000.0, + "grad_norm": 5.792319014223258, + "language_loss": 0.79119205, + "learning_rate": 2.775385401898104e-06, + "loss": 0.86843884, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14233398, + "step": 6521, + "time_per_iteration": 2.487144947052002 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06282392, + "balance_loss_mlp": 0.01255297, + "epoch": 0.39212385390049603, + "flos": 12318012696960.0, + "grad_norm": 2.63137671789129, + "language_loss": 0.70893902, + "learning_rate": 2.775026385829952e-06, + "loss": 0.78623831, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.16473389, + "step": 6522, + "time_per_iteration": 2.501777410507202 + }, + { + "auxiliary_loss_clip": 0.06455532, + "auxiliary_loss_mlp": 0.01272148, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.01257693, + "epoch": 0.392183977153164, + "flos": 19725275882880.0, + "grad_norm": 2.1277990565539087, + "language_loss": 0.77424598, + "learning_rate": 2.774667340372722e-06, + "loss": 0.8515228, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14453125, + "step": 6523, + "time_per_iteration": 2.494900941848755 + }, + { + "auxiliary_loss_clip": 0.0645543, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06282179, + "balance_loss_mlp": 0.01258769, + "epoch": 0.39224410040583196, + "flos": 33153092709120.0, + "grad_norm": 2.7826558407508855, + "language_loss": 0.62314886, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70043033, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13964844, + "step": 6524, + "time_per_iteration": 2.6380085945129395 + }, + { + "auxiliary_loss_clip": 0.06452876, + "auxiliary_loss_mlp": 0.01268165, + "balance_loss_clip": 0.06281661, + "balance_loss_mlp": 0.01252895, + "epoch": 0.39230422365849993, + "flos": 27789884749440.0, + "grad_norm": 1.7105729654368218, + "language_loss": 0.74638754, + "learning_rate": 2.773949161345489e-06, + "loss": 0.82359803, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15264893, + "step": 6525, + "time_per_iteration": 2.5430080890655518 + }, + { + "auxiliary_loss_clip": 0.06454577, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06280737, + "balance_loss_mlp": 0.01253863, + "epoch": 0.3923643469111679, + "flos": 17937497969280.0, + "grad_norm": 2.1060109606385673, + "language_loss": 0.8182255, + "learning_rate": 2.773590027802719e-06, + "loss": 0.89545369, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14367676, + "step": 6526, + "time_per_iteration": 2.4994354248046875 + }, + { + "auxiliary_loss_clip": 0.06454204, + "auxiliary_loss_mlp": 0.01269978, + "balance_loss_clip": 0.06281518, + "balance_loss_mlp": 0.01255482, + "epoch": 0.39242447016383586, + "flos": 24066141776640.0, + "grad_norm": 1.5927090967738864, + "language_loss": 0.70157206, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77881384, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14501953, + "step": 6527, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.06452368, + "auxiliary_loss_mlp": 0.01268854, + "balance_loss_clip": 0.06281934, + "balance_loss_mlp": 0.01254245, + "epoch": 0.3924845934165038, + "flos": 10667562825600.0, + "grad_norm": 3.256824520755738, + "language_loss": 0.82039493, + "learning_rate": 2.772871672726965e-06, + "loss": 0.89760715, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6528, + "time_per_iteration": 2.498852014541626 + }, + { + "auxiliary_loss_clip": 0.06450985, + "auxiliary_loss_mlp": 0.0127277, + "balance_loss_clip": 0.06284485, + "balance_loss_mlp": 0.01258048, + "epoch": 0.3925447166691718, + "flos": 31253493121920.0, + "grad_norm": 1.712128770360143, + "language_loss": 0.68666142, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14733887, + "step": 6529, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.06454393, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01252213, + "epoch": 0.39260483992183975, + "flos": 29421215890560.0, + "grad_norm": 2.512935177473184, + "language_loss": 0.80622673, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8834424, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14959717, + "step": 6530, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.06449011, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.0125252, + "epoch": 0.3926649631745077, + "flos": 22864571827200.0, + "grad_norm": 1.8446830755174628, + "language_loss": 0.76176864, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.83893287, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14892578, + "step": 6531, + "time_per_iteration": 3.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.06348795, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01253434, + "epoch": 0.3927250864271757, + "flos": 63911892124800.0, + "grad_norm": 0.7987882767963658, + "language_loss": 0.6030035, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.67905223, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.02648926, + "step": 6532, + "time_per_iteration": 3.023615598678589 + }, + { + "auxiliary_loss_clip": 0.06346735, + "auxiliary_loss_mlp": 0.01258162, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3927852096798437, + "flos": 68931486489600.0, + "grad_norm": 0.7618686105615924, + "language_loss": 0.55496854, + "learning_rate": 2.771075272396981e-06, + "loss": 0.63101745, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02720642, + "step": 6533, + "time_per_iteration": 3.2504148483276367 + }, + { + "auxiliary_loss_clip": 0.06452841, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06277935, + "balance_loss_mlp": 0.01254557, + "epoch": 0.39284533293251167, + "flos": 29723711529600.0, + "grad_norm": 1.823371664681604, + "language_loss": 0.76552856, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.84275657, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.15405273, + "step": 6534, + "time_per_iteration": 4.098775148391724 + }, + { + "auxiliary_loss_clip": 0.06459314, + "auxiliary_loss_mlp": 0.01269352, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.01253974, + "epoch": 0.39290545618517964, + "flos": 18558016980480.0, + "grad_norm": 2.2164588420846267, + "language_loss": 0.78656316, + "learning_rate": 2.770356507494851e-06, + "loss": 0.86384982, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15380859, + "step": 6535, + "time_per_iteration": 2.4923341274261475 + }, + { + "auxiliary_loss_clip": 0.06449763, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01253592, + "epoch": 0.3929655794378476, + "flos": 26256581285760.0, + "grad_norm": 2.2738959430224326, + "language_loss": 0.69076276, + "learning_rate": 2.769997081218978e-06, + "loss": 0.76792771, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1315918, + "step": 6536, + "time_per_iteration": 2.5980727672576904 + }, + { + "auxiliary_loss_clip": 0.06448898, + "auxiliary_loss_mlp": 0.0127095, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01257265, + "epoch": 0.39302570269051557, + "flos": 29285564929920.0, + "grad_norm": 1.8741537429596062, + "language_loss": 0.69716197, + "learning_rate": 2.769637625744738e-06, + "loss": 0.77436042, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13684082, + "step": 6537, + "time_per_iteration": 4.096014499664307 + }, + { + "auxiliary_loss_clip": 0.064602, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06288625, + "balance_loss_mlp": 0.01255432, + "epoch": 0.39308582594318353, + "flos": 17353134794880.0, + "grad_norm": 1.7942703591990323, + "language_loss": 0.79606509, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8733629, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14129639, + "step": 6538, + "time_per_iteration": 2.578815221786499 + }, + { + "auxiliary_loss_clip": 0.06359898, + "auxiliary_loss_mlp": 0.01255927, + "balance_loss_clip": 0.06283404, + "balance_loss_mlp": 0.0125297, + "epoch": 0.3931459491958515, + "flos": 61023884175360.0, + "grad_norm": 0.7947880980854773, + "language_loss": 0.61826062, + "learning_rate": 2.768918627255683e-06, + "loss": 0.69441885, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02955627, + "step": 6539, + "time_per_iteration": 2.9553403854370117 + }, + { + "auxiliary_loss_clip": 0.06458268, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06289513, + "balance_loss_mlp": 0.01257339, + "epoch": 0.39320607244851946, + "flos": 39024662590080.0, + "grad_norm": 2.4294685123961295, + "language_loss": 0.68263721, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.75994635, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15307617, + "step": 6540, + "time_per_iteration": 2.732541799545288 + }, + { + "auxiliary_loss_clip": 0.06455955, + "auxiliary_loss_mlp": 0.01271651, + "balance_loss_clip": 0.06287128, + "balance_loss_mlp": 0.0125613, + "epoch": 0.3932661957011874, + "flos": 24686451152640.0, + "grad_norm": 1.7600019176005988, + "language_loss": 0.72681171, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.80408776, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15527344, + "step": 6541, + "time_per_iteration": 4.03834342956543 + }, + { + "auxiliary_loss_clip": 0.06358681, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06282184, + "balance_loss_mlp": 0.01259297, + "epoch": 0.3933263189538554, + "flos": 70115614790400.0, + "grad_norm": 0.7938144397826515, + "language_loss": 0.60408866, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6802969, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02844238, + "step": 6542, + "time_per_iteration": 3.0015151500701904 + }, + { + "auxiliary_loss_clip": 0.06453243, + "auxiliary_loss_mlp": 0.01279318, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.01265305, + "epoch": 0.39338644220652336, + "flos": 22935583762560.0, + "grad_norm": 1.4413337304531033, + "language_loss": 0.82278919, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90011483, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14013672, + "step": 6543, + "time_per_iteration": 2.6289048194885254 + }, + { + "auxiliary_loss_clip": 0.06454003, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06284549, + "balance_loss_mlp": 0.01255768, + "epoch": 0.3934465654591913, + "flos": 30856282041600.0, + "grad_norm": 1.7408174737933344, + "language_loss": 0.69224536, + "learning_rate": 2.767120621015908e-06, + "loss": 0.76948798, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14489746, + "step": 6544, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01274712, + "balance_loss_clip": 0.06291823, + "balance_loss_mlp": 0.01258524, + "epoch": 0.3935066887118593, + "flos": 29243329672320.0, + "grad_norm": 2.0329338261061887, + "language_loss": 0.75462705, + "learning_rate": 2.76676093244553e-06, + "loss": 0.83203781, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1619873, + "step": 6545, + "time_per_iteration": 2.606234312057495 + }, + { + "auxiliary_loss_clip": 0.06446254, + "auxiliary_loss_mlp": 0.01275344, + "balance_loss_clip": 0.06285709, + "balance_loss_mlp": 0.01262309, + "epoch": 0.3935668119645273, + "flos": 19141290051840.0, + "grad_norm": 1.4467327313094591, + "language_loss": 0.75122333, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82843935, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13043213, + "step": 6546, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.06461848, + "auxiliary_loss_mlp": 0.01270617, + "balance_loss_clip": 0.06285486, + "balance_loss_mlp": 0.01254822, + "epoch": 0.3936269352171953, + "flos": 18522196560000.0, + "grad_norm": 2.187625212538507, + "language_loss": 0.82285661, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90018129, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15783691, + "step": 6547, + "time_per_iteration": 2.536921501159668 + }, + { + "auxiliary_loss_clip": 0.06454909, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01259685, + "epoch": 0.39368705846986324, + "flos": 15638255533440.0, + "grad_norm": 1.8611217813328955, + "language_loss": 0.84309554, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.92037535, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1340332, + "step": 6548, + "time_per_iteration": 2.586596727371216 + }, + { + "auxiliary_loss_clip": 0.06451154, + "auxiliary_loss_mlp": 0.01275141, + "balance_loss_clip": 0.06285168, + "balance_loss_mlp": 0.01261325, + "epoch": 0.3937471817225312, + "flos": 21332442320640.0, + "grad_norm": 1.5541020214417252, + "language_loss": 0.7306931, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.8079561, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13842773, + "step": 6549, + "time_per_iteration": 2.5176355838775635 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01258398, + "epoch": 0.39380730497519917, + "flos": 20782893317760.0, + "grad_norm": 1.443831260247086, + "language_loss": 0.77958995, + "learning_rate": 2.764962053731699e-06, + "loss": 0.85687554, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.16204834, + "step": 6550, + "time_per_iteration": 2.5665266513824463 + }, + { + "auxiliary_loss_clip": 0.06449334, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.0628082, + "balance_loss_mlp": 0.01254455, + "epoch": 0.39386742822786713, + "flos": 21615106469760.0, + "grad_norm": 1.5479702434138036, + "language_loss": 0.81395853, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.89113748, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14129639, + "step": 6551, + "time_per_iteration": 2.509472370147705 + }, + { + "auxiliary_loss_clip": 0.06452134, + "auxiliary_loss_mlp": 0.01274621, + "balance_loss_clip": 0.06282679, + "balance_loss_mlp": 0.01259791, + "epoch": 0.3939275514805351, + "flos": 12418304434560.0, + "grad_norm": 2.3772322810911892, + "language_loss": 0.80163503, + "learning_rate": 2.764242299098596e-06, + "loss": 0.87890255, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14825439, + "step": 6552, + "time_per_iteration": 2.512632369995117 + }, + { + "auxiliary_loss_clip": 0.06458388, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06285821, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39398767473320306, + "flos": 18558016980480.0, + "grad_norm": 1.9836463121020687, + "language_loss": 0.71468151, + "learning_rate": 2.763882378305003e-06, + "loss": 0.79198349, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14996338, + "step": 6553, + "time_per_iteration": 2.4973459243774414 + }, + { + "auxiliary_loss_clip": 0.06447914, + "auxiliary_loss_mlp": 0.01269169, + "balance_loss_clip": 0.06280744, + "balance_loss_mlp": 0.0125422, + "epoch": 0.39404779798587103, + "flos": 29315599418880.0, + "grad_norm": 1.8230931816174483, + "language_loss": 0.64176017, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71893102, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14941406, + "step": 6554, + "time_per_iteration": 2.6340816020965576 + }, + { + "auxiliary_loss_clip": 0.06448209, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281387, + "balance_loss_mlp": 0.0125561, + "epoch": 0.394107921238539, + "flos": 34905679107840.0, + "grad_norm": 1.8577413865682035, + "language_loss": 0.79801202, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8751896, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13934326, + "step": 6555, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.06451041, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06280783, + "balance_loss_mlp": 0.01252748, + "epoch": 0.39416804449120696, + "flos": 25088232280320.0, + "grad_norm": 1.8326733466575391, + "language_loss": 0.72028196, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.79746938, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1496582, + "step": 6556, + "time_per_iteration": 2.572880744934082 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01268731, + "balance_loss_clip": 0.06281175, + "balance_loss_mlp": 0.01254348, + "epoch": 0.3942281677438749, + "flos": 32314842063360.0, + "grad_norm": 2.2262653228658666, + "language_loss": 0.83903825, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.91621351, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14373779, + "step": 6557, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01272636, + "balance_loss_clip": 0.06281336, + "balance_loss_mlp": 0.01258671, + "epoch": 0.3942882909965429, + "flos": 24943608933120.0, + "grad_norm": 2.1784611950300605, + "language_loss": 0.80248392, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.87968874, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1395874, + "step": 6558, + "time_per_iteration": 2.5902092456817627 + }, + { + "auxiliary_loss_clip": 0.06445447, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01253816, + "epoch": 0.39434841424921085, + "flos": 11879614535040.0, + "grad_norm": 2.1357186014692546, + "language_loss": 0.71689725, + "learning_rate": 2.761722245724792e-06, + "loss": 0.79402852, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13873291, + "step": 6559, + "time_per_iteration": 2.4894917011260986 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628094, + "balance_loss_mlp": 0.01254622, + "epoch": 0.3944085375018789, + "flos": 16367032419840.0, + "grad_norm": 2.0841749511208705, + "language_loss": 0.81285572, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.89011705, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14630127, + "step": 6560, + "time_per_iteration": 2.522434711456299 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06282307, + "balance_loss_mlp": 0.01254078, + "epoch": 0.39446866075454684, + "flos": 10637821825920.0, + "grad_norm": 3.641985825462619, + "language_loss": 0.83127379, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.90848899, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15386963, + "step": 6561, + "time_per_iteration": 2.4804983139038086 + }, + { + "auxiliary_loss_clip": 0.06450383, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 0.06283262, + "balance_loss_mlp": 0.01257102, + "epoch": 0.3945287840072148, + "flos": 18193481792640.0, + "grad_norm": 2.043086634933395, + "language_loss": 0.80616236, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.88336933, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13208008, + "step": 6562, + "time_per_iteration": 2.5335006713867188 + }, + { + "auxiliary_loss_clip": 0.06448314, + "auxiliary_loss_mlp": 0.01268686, + "balance_loss_clip": 0.06283693, + "balance_loss_mlp": 0.01254476, + "epoch": 0.39458890725988277, + "flos": 23046650749440.0, + "grad_norm": 1.5717146465742573, + "language_loss": 0.81509531, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.89226532, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14215088, + "step": 6563, + "time_per_iteration": 2.5315918922424316 + }, + { + "auxiliary_loss_clip": 0.06453238, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.0628344, + "balance_loss_mlp": 0.0125608, + "epoch": 0.39464903051255074, + "flos": 17163718640640.0, + "grad_norm": 1.8608988788141587, + "language_loss": 0.70080984, + "learning_rate": 2.759921340790127e-06, + "loss": 0.77804577, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14257812, + "step": 6564, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.06449583, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06281252, + "balance_loss_mlp": 0.01254648, + "epoch": 0.3947091537652187, + "flos": 15894616700160.0, + "grad_norm": 2.288586168499947, + "language_loss": 0.83967394, + "learning_rate": 2.759561073299676e-06, + "loss": 0.91686368, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14746094, + "step": 6565, + "time_per_iteration": 2.5438666343688965 + }, + { + "auxiliary_loss_clip": 0.06447474, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06280743, + "balance_loss_mlp": 0.01255229, + "epoch": 0.39476927701788667, + "flos": 18550386259200.0, + "grad_norm": 2.0020652066074285, + "language_loss": 0.83519006, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.91235834, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14129639, + "step": 6566, + "time_per_iteration": 2.550548791885376 + }, + { + "auxiliary_loss_clip": 0.06459671, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06282969, + "balance_loss_mlp": 0.01255072, + "epoch": 0.39482940027055463, + "flos": 22282682348160.0, + "grad_norm": 1.770017298907609, + "language_loss": 0.77499187, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.85229909, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.15979004, + "step": 6567, + "time_per_iteration": 2.535980463027954 + }, + { + "auxiliary_loss_clip": 0.0644526, + "auxiliary_loss_mlp": 0.01270792, + "balance_loss_clip": 0.06283294, + "balance_loss_mlp": 0.01257851, + "epoch": 0.3948895235232226, + "flos": 14763010510080.0, + "grad_norm": 1.9280900707618294, + "language_loss": 0.80259991, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87976044, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.12945557, + "step": 6568, + "time_per_iteration": 2.56528639793396 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06283959, + "balance_loss_mlp": 0.01258356, + "epoch": 0.39494964677589056, + "flos": 22572474094080.0, + "grad_norm": 2.8189067544408166, + "language_loss": 0.84836519, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.9256081, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1451416, + "step": 6569, + "time_per_iteration": 2.512678623199463 + }, + { + "auxiliary_loss_clip": 0.06448043, + "auxiliary_loss_mlp": 0.01269688, + "balance_loss_clip": 0.06284526, + "balance_loss_mlp": 0.01255538, + "epoch": 0.3950097700285585, + "flos": 22969307831040.0, + "grad_norm": 1.7602858722639216, + "language_loss": 0.74665594, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.82383323, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.14147949, + "step": 6570, + "time_per_iteration": 2.611072063446045 + }, + { + "auxiliary_loss_clip": 0.06447589, + "auxiliary_loss_mlp": 0.01270515, + "balance_loss_clip": 0.06279834, + "balance_loss_mlp": 0.01256305, + "epoch": 0.3950698932812265, + "flos": 20601569082240.0, + "grad_norm": 1.9769080404363342, + "language_loss": 0.80472994, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88191104, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14196777, + "step": 6571, + "time_per_iteration": 4.037761688232422 + }, + { + "auxiliary_loss_clip": 0.06446905, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01257022, + "epoch": 0.39513001653389446, + "flos": 20381992657920.0, + "grad_norm": 1.599556952476494, + "language_loss": 0.78081018, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8579852, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13574219, + "step": 6572, + "time_per_iteration": 2.542388439178467 + }, + { + "auxiliary_loss_clip": 0.06450671, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06281148, + "balance_loss_mlp": 0.01253991, + "epoch": 0.3951901397865625, + "flos": 26469994435200.0, + "grad_norm": 1.9679034095416588, + "language_loss": 0.74861181, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.8258028, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14434814, + "step": 6573, + "time_per_iteration": 3.9954564571380615 + }, + { + "auxiliary_loss_clip": 0.06447303, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.0125492, + "epoch": 0.39525026303923044, + "flos": 43848845233920.0, + "grad_norm": 1.4348738267970096, + "language_loss": 0.67874503, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.75589502, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.12799072, + "step": 6574, + "time_per_iteration": 2.75056791305542 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01270185, + "balance_loss_clip": 0.06284595, + "balance_loss_mlp": 0.01255832, + "epoch": 0.3953103862918984, + "flos": 18046636312320.0, + "grad_norm": 3.0759560063082736, + "language_loss": 0.72770178, + "learning_rate": 2.755956816505072e-06, + "loss": 0.80492353, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14355469, + "step": 6575, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.06452627, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.0628259, + "balance_loss_mlp": 0.01256015, + "epoch": 0.3953705095445664, + "flos": 16980549615360.0, + "grad_norm": 2.3956956088423382, + "language_loss": 0.73929548, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.816526, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1439209, + "step": 6576, + "time_per_iteration": 2.4877238273620605 + }, + { + "auxiliary_loss_clip": 0.06453596, + "auxiliary_loss_mlp": 0.01269813, + "balance_loss_clip": 0.06286615, + "balance_loss_mlp": 0.0125704, + "epoch": 0.39543063279723434, + "flos": 17415300124800.0, + "grad_norm": 2.3089155525157397, + "language_loss": 0.8424108, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.91964483, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12786865, + "step": 6577, + "time_per_iteration": 3.9026546478271484 + }, + { + "auxiliary_loss_clip": 0.06447916, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06283568, + "balance_loss_mlp": 0.01255788, + "epoch": 0.3954907560499023, + "flos": 22790876561280.0, + "grad_norm": 2.6090797034217603, + "language_loss": 0.90399998, + "learning_rate": 2.75487497985853e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1338501, + "step": 6578, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.06451896, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06281315, + "balance_loss_mlp": 0.01254284, + "epoch": 0.39555087930257027, + "flos": 21950823052800.0, + "grad_norm": 1.8247592517251146, + "language_loss": 0.78543842, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.86265075, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15063477, + "step": 6579, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.06456701, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01258492, + "epoch": 0.39561100255523823, + "flos": 20409553451520.0, + "grad_norm": 2.1653293739232753, + "language_loss": 0.68659246, + "learning_rate": 2.754153612280037e-06, + "loss": 0.76389658, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15216064, + "step": 6580, + "time_per_iteration": 4.038321495056152 + }, + { + "auxiliary_loss_clip": 0.06448758, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3956711258079062, + "flos": 27972005598720.0, + "grad_norm": 1.867170796056586, + "language_loss": 0.58577931, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.6629765, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14318848, + "step": 6581, + "time_per_iteration": 2.618917942047119 + }, + { + "auxiliary_loss_clip": 0.0645448, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 0.06288571, + "balance_loss_mlp": 0.01256413, + "epoch": 0.39573124906057416, + "flos": 14433457201920.0, + "grad_norm": 2.002939068333409, + "language_loss": 0.69910431, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.77636254, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14929199, + "step": 6582, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.06451949, + "auxiliary_loss_mlp": 0.01273006, + "balance_loss_clip": 0.06283893, + "balance_loss_mlp": 0.01258546, + "epoch": 0.39579137231324213, + "flos": 18739592778240.0, + "grad_norm": 2.2302551557868457, + "language_loss": 0.76587689, + "learning_rate": 2.753071346464642e-06, + "loss": 0.84312642, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14453125, + "step": 6583, + "time_per_iteration": 2.5276317596435547 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.0127002, + "balance_loss_clip": 0.06284047, + "balance_loss_mlp": 0.01256562, + "epoch": 0.3958514955659101, + "flos": 17682268832640.0, + "grad_norm": 1.926047340176765, + "language_loss": 0.66262352, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.73984963, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13458252, + "step": 6584, + "time_per_iteration": 2.501209259033203 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01270923, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39591161881857806, + "flos": 29315850981120.0, + "grad_norm": 1.992954295318491, + "language_loss": 0.72398281, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.8012588, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 6585, + "time_per_iteration": 2.617694616317749 + }, + { + "auxiliary_loss_clip": 0.06457305, + "auxiliary_loss_mlp": 0.0127182, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01257336, + "epoch": 0.3959717420712461, + "flos": 25778295780480.0, + "grad_norm": 1.6889684303793513, + "language_loss": 0.73472714, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.81201839, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14477539, + "step": 6586, + "time_per_iteration": 2.565883159637451 + }, + { + "auxiliary_loss_clip": 0.06454571, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06286268, + "balance_loss_mlp": 0.01252969, + "epoch": 0.39603186532391405, + "flos": 20930199995520.0, + "grad_norm": 1.6150585752618039, + "language_loss": 0.71662915, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79384637, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14160156, + "step": 6587, + "time_per_iteration": 2.5788414478302 + }, + { + "auxiliary_loss_clip": 0.06362241, + "auxiliary_loss_mlp": 0.01254401, + "balance_loss_clip": 0.06286076, + "balance_loss_mlp": 0.01251419, + "epoch": 0.396091988576582, + "flos": 54897336720000.0, + "grad_norm": 0.8108180128275717, + "language_loss": 0.60705078, + "learning_rate": 2.751266999157285e-06, + "loss": 0.68321717, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.02980042, + "step": 6588, + "time_per_iteration": 2.973475217819214 + }, + { + "auxiliary_loss_clip": 0.06457016, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06285909, + "balance_loss_mlp": 0.01251873, + "epoch": 0.39615211182925, + "flos": 20708946489600.0, + "grad_norm": 1.752385405351709, + "language_loss": 0.81335068, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.89058518, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14575195, + "step": 6589, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.06456019, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06286196, + "balance_loss_mlp": 0.01254431, + "epoch": 0.39621223508191794, + "flos": 21000331463040.0, + "grad_norm": 1.8508577793480634, + "language_loss": 0.71167219, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7889303, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15368652, + "step": 6590, + "time_per_iteration": 2.5155017375946045 + }, + { + "auxiliary_loss_clip": 0.06451933, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06285245, + "balance_loss_mlp": 0.01253284, + "epoch": 0.3962723583345859, + "flos": 23375742860160.0, + "grad_norm": 1.6853348593397999, + "language_loss": 0.75984478, + "learning_rate": 2.750184048805956e-06, + "loss": 0.83702791, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13098145, + "step": 6591, + "time_per_iteration": 2.569958448410034 + }, + { + "auxiliary_loss_clip": 0.06454425, + "auxiliary_loss_mlp": 0.01268025, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01254215, + "epoch": 0.39633248158725387, + "flos": 25122040202880.0, + "grad_norm": 1.5542594066551045, + "language_loss": 0.78422546, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8614499, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13806152, + "step": 6592, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06448938, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.062861, + "balance_loss_mlp": 0.01256615, + "epoch": 0.39639260483992184, + "flos": 39797309888640.0, + "grad_norm": 1.716432087396327, + "language_loss": 0.69405383, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.77124685, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13751221, + "step": 6593, + "time_per_iteration": 2.742421865463257 + }, + { + "auxiliary_loss_clip": 0.06455009, + "auxiliary_loss_mlp": 0.01268833, + "balance_loss_clip": 0.06285039, + "balance_loss_mlp": 0.01253896, + "epoch": 0.3964527280925898, + "flos": 17352673597440.0, + "grad_norm": 2.6756229463225134, + "language_loss": 0.78082192, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.85806036, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14929199, + "step": 6594, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.06345355, + "auxiliary_loss_mlp": 0.01253278, + "balance_loss_clip": 0.06269702, + "balance_loss_mlp": 0.0125056, + "epoch": 0.39651285134525777, + "flos": 71739845533440.0, + "grad_norm": 0.9367359782969226, + "language_loss": 0.6293599, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.70534623, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.195411205291748 + }, + { + "auxiliary_loss_clip": 0.06455558, + "auxiliary_loss_mlp": 0.01273293, + "balance_loss_clip": 0.0628309, + "balance_loss_mlp": 0.0125714, + "epoch": 0.39657297459792573, + "flos": 25782823900800.0, + "grad_norm": 2.0629727816625656, + "language_loss": 0.63503623, + "learning_rate": 2.748378562795223e-06, + "loss": 0.71232474, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16149902, + "step": 6596, + "time_per_iteration": 2.564436197280884 + }, + { + "auxiliary_loss_clip": 0.06445512, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.0628349, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3966330978505937, + "flos": 20272267336320.0, + "grad_norm": 3.0845696935228646, + "language_loss": 0.79033494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.86749279, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.14202881, + "step": 6597, + "time_per_iteration": 2.5187220573425293 + }, + { + "auxiliary_loss_clip": 0.0645806, + "auxiliary_loss_mlp": 0.01272047, + "balance_loss_clip": 0.06285266, + "balance_loss_mlp": 0.01257259, + "epoch": 0.39669322110326166, + "flos": 20637431429760.0, + "grad_norm": 1.9127598273467419, + "language_loss": 0.67675543, + "learning_rate": 2.747656169644941e-06, + "loss": 0.75405657, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14776611, + "step": 6598, + "time_per_iteration": 2.5287654399871826 + }, + { + "auxiliary_loss_clip": 0.06448894, + "auxiliary_loss_mlp": 0.01270917, + "balance_loss_clip": 0.06280929, + "balance_loss_mlp": 0.01257643, + "epoch": 0.3967533443559297, + "flos": 21732546366720.0, + "grad_norm": 1.6941457063111416, + "language_loss": 0.79130334, + "learning_rate": 2.747294930536157e-06, + "loss": 0.86850142, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13269043, + "step": 6599, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.06447926, + "auxiliary_loss_mlp": 0.01270436, + "balance_loss_clip": 0.06279482, + "balance_loss_mlp": 0.01254289, + "epoch": 0.39681346760859765, + "flos": 25491271219200.0, + "grad_norm": 1.7355689440790156, + "language_loss": 0.72895992, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.80614352, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.16149902, + "step": 6600, + "time_per_iteration": 2.6141197681427 + }, + { + "auxiliary_loss_clip": 0.06448444, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06280382, + "balance_loss_mlp": 0.01261045, + "epoch": 0.3968735908612656, + "flos": 20965894634880.0, + "grad_norm": 1.918502465070546, + "language_loss": 0.85902363, + "learning_rate": 2.746572367319791e-06, + "loss": 0.9362576, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13909912, + "step": 6601, + "time_per_iteration": 2.539337396621704 + }, + { + "auxiliary_loss_clip": 0.06455625, + "auxiliary_loss_mlp": 0.01273924, + "balance_loss_clip": 0.06281834, + "balance_loss_mlp": 0.0125773, + "epoch": 0.3969337141139336, + "flos": 10711684800000.0, + "grad_norm": 2.4177834123100412, + "language_loss": 0.70406669, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.78136218, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16192627, + "step": 6602, + "time_per_iteration": 2.5344958305358887 + }, + { + "auxiliary_loss_clip": 0.06450728, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.01257583, + "epoch": 0.39699383736660154, + "flos": 17597924098560.0, + "grad_norm": 4.3880896635048865, + "language_loss": 0.84332073, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.92054927, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14550781, + "step": 6603, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.06445679, + "auxiliary_loss_mlp": 0.01276756, + "balance_loss_clip": 0.06278397, + "balance_loss_mlp": 0.01263017, + "epoch": 0.3970539606192695, + "flos": 17791826446080.0, + "grad_norm": 1.5258003920697418, + "language_loss": 0.7302916, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13751221, + "step": 6604, + "time_per_iteration": 2.525475025177002 + }, + { + "auxiliary_loss_clip": 0.06437713, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.0125609, + "epoch": 0.3971140838719375, + "flos": 24796260328320.0, + "grad_norm": 1.5312177971095886, + "language_loss": 0.82809514, + "learning_rate": 2.745126901275491e-06, + "loss": 0.90516913, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.13598633, + "step": 6605, + "time_per_iteration": 2.5601069927215576 + }, + { + "auxiliary_loss_clip": 0.06439412, + "auxiliary_loss_mlp": 0.01269635, + "balance_loss_clip": 0.06274941, + "balance_loss_mlp": 0.01256337, + "epoch": 0.39717420712460544, + "flos": 24250484759040.0, + "grad_norm": 1.721474173213711, + "language_loss": 0.74617773, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.82326818, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13293457, + "step": 6606, + "time_per_iteration": 2.570338726043701 + }, + { + "auxiliary_loss_clip": 0.06450282, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06279129, + "balance_loss_mlp": 0.01255343, + "epoch": 0.3972343303772734, + "flos": 25891752608640.0, + "grad_norm": 1.7826498780228273, + "language_loss": 0.74625784, + "learning_rate": 2.744403998666805e-06, + "loss": 0.8234452, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13122559, + "step": 6607, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.0628166, + "balance_loss_mlp": 0.01257366, + "epoch": 0.39729445362994137, + "flos": 45634107525120.0, + "grad_norm": 2.013518755058626, + "language_loss": 0.68503535, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76226741, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1427002, + "step": 6608, + "time_per_iteration": 2.814741611480713 + }, + { + "auxiliary_loss_clip": 0.06453016, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06280445, + "balance_loss_mlp": 0.01256496, + "epoch": 0.39735457688260933, + "flos": 20200249152000.0, + "grad_norm": 2.238404873213265, + "language_loss": 0.74168068, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.818919, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14318848, + "step": 6609, + "time_per_iteration": 2.549020767211914 + }, + { + "auxiliary_loss_clip": 0.06450722, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06281993, + "balance_loss_mlp": 0.0125424, + "epoch": 0.3974147001352773, + "flos": 23337868014720.0, + "grad_norm": 1.4758458837885644, + "language_loss": 0.71468556, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.79187685, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14154053, + "step": 6610, + "time_per_iteration": 3.985957622528076 + }, + { + "auxiliary_loss_clip": 0.06440872, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256559, + "epoch": 0.39747482338794526, + "flos": 21694965010560.0, + "grad_norm": 1.555692262156073, + "language_loss": 0.7854501, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.86256385, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13934326, + "step": 6611, + "time_per_iteration": 2.5972208976745605 + }, + { + "auxiliary_loss_clip": 0.06447503, + "auxiliary_loss_mlp": 0.01268941, + "balance_loss_clip": 0.06280762, + "balance_loss_mlp": 0.01255065, + "epoch": 0.3975349466406133, + "flos": 30995957998080.0, + "grad_norm": 2.19308398220208, + "language_loss": 0.79606485, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.87322932, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13891602, + "step": 6612, + "time_per_iteration": 2.6106274127960205 + }, + { + "auxiliary_loss_clip": 0.0634682, + "auxiliary_loss_mlp": 0.01253265, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01250469, + "epoch": 0.39759506989328125, + "flos": 63703426366080.0, + "grad_norm": 0.8245936024085626, + "language_loss": 0.6463905, + "learning_rate": 2.742234613810459e-06, + "loss": 0.72239137, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02796936, + "step": 6613, + "time_per_iteration": 4.473678112030029 + }, + { + "auxiliary_loss_clip": 0.06450668, + "auxiliary_loss_mlp": 0.01269678, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01255367, + "epoch": 0.3976551931459492, + "flos": 23702570910720.0, + "grad_norm": 2.448614415916545, + "language_loss": 0.72596258, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80316603, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14312744, + "step": 6614, + "time_per_iteration": 2.5691444873809814 + }, + { + "auxiliary_loss_clip": 0.06449673, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256051, + "epoch": 0.3977153163986172, + "flos": 15675166056960.0, + "grad_norm": 2.2284862441621995, + "language_loss": 0.81666011, + "learning_rate": 2.741511260213862e-06, + "loss": 0.89385748, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14013672, + "step": 6615, + "time_per_iteration": 2.55078387260437 + }, + { + "auxiliary_loss_clip": 0.06452717, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01255679, + "epoch": 0.39777543965128515, + "flos": 14070012117120.0, + "grad_norm": 1.96274897748641, + "language_loss": 0.67687142, + "learning_rate": 2.741149541231434e-06, + "loss": 0.75409389, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13842773, + "step": 6616, + "time_per_iteration": 2.533982992172241 + }, + { + "auxiliary_loss_clip": 0.06455097, + "auxiliary_loss_mlp": 0.0126897, + "balance_loss_clip": 0.06281532, + "balance_loss_mlp": 0.01253986, + "epoch": 0.3978355629039531, + "flos": 23374149632640.0, + "grad_norm": 2.1811174101900552, + "language_loss": 0.8396368, + "learning_rate": 2.740787794144541e-06, + "loss": 0.91687751, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14978027, + "step": 6617, + "time_per_iteration": 3.9742090702056885 + }, + { + "auxiliary_loss_clip": 0.06446042, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06283504, + "balance_loss_mlp": 0.01255556, + "epoch": 0.3978956861566211, + "flos": 19068852597120.0, + "grad_norm": 1.7253210008214133, + "language_loss": 0.73000187, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80714333, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12536621, + "step": 6618, + "time_per_iteration": 2.562913179397583 + }, + { + "auxiliary_loss_clip": 0.06454587, + "auxiliary_loss_mlp": 0.01274299, + "balance_loss_clip": 0.06285769, + "balance_loss_mlp": 0.01258576, + "epoch": 0.39795580940928904, + "flos": 30235679176320.0, + "grad_norm": 1.6365941861062427, + "language_loss": 0.65343797, + "learning_rate": 2.740064215712231e-06, + "loss": 0.73072684, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15710449, + "step": 6619, + "time_per_iteration": 2.598667860031128 + }, + { + "auxiliary_loss_clip": 0.06341819, + "auxiliary_loss_mlp": 0.01254465, + "balance_loss_clip": 0.06266081, + "balance_loss_mlp": 0.01251738, + "epoch": 0.398015932661957, + "flos": 69867261688320.0, + "grad_norm": 0.7579483566665592, + "language_loss": 0.582268, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.65823084, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02731323, + "step": 6620, + "time_per_iteration": 4.528149604797363 + }, + { + "auxiliary_loss_clip": 0.06446633, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06280729, + "balance_loss_mlp": 0.01256858, + "epoch": 0.39807605591462497, + "flos": 20164093315200.0, + "grad_norm": 1.5024608902652035, + "language_loss": 0.79499102, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87215811, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13232422, + "step": 6621, + "time_per_iteration": 2.559305191040039 + }, + { + "auxiliary_loss_clip": 0.06445563, + "auxiliary_loss_mlp": 0.01270989, + "balance_loss_clip": 0.06279579, + "balance_loss_mlp": 0.01257435, + "epoch": 0.39813617916729294, + "flos": 21148057411200.0, + "grad_norm": 1.7591122738615637, + "language_loss": 0.78347874, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86064428, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13568115, + "step": 6622, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.06444648, + "auxiliary_loss_mlp": 0.01270694, + "balance_loss_clip": 0.06278688, + "balance_loss_mlp": 0.01255948, + "epoch": 0.3981963024199609, + "flos": 18994318790400.0, + "grad_norm": 9.51473607747463, + "language_loss": 0.75430334, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83145678, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14733887, + "step": 6623, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.06449074, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01260551, + "epoch": 0.39825642567262887, + "flos": 16579648955520.0, + "grad_norm": 1.7143371951380526, + "language_loss": 0.79926246, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.87649894, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6624, + "time_per_iteration": 2.509500026702881 + }, + { + "auxiliary_loss_clip": 0.06454292, + "auxiliary_loss_mlp": 0.01269994, + "balance_loss_clip": 0.06280515, + "balance_loss_mlp": 0.01254234, + "epoch": 0.39831654892529683, + "flos": 22206303751680.0, + "grad_norm": 2.195062259081814, + "language_loss": 0.84314877, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.92039162, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15759277, + "step": 6625, + "time_per_iteration": 2.5617175102233887 + }, + { + "auxiliary_loss_clip": 0.06446299, + "auxiliary_loss_mlp": 0.01272387, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01258517, + "epoch": 0.39837667217796485, + "flos": 10492485719040.0, + "grad_norm": 1.8250293636172175, + "language_loss": 0.8709324, + "learning_rate": 2.737530807925321e-06, + "loss": 0.94811928, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13867188, + "step": 6626, + "time_per_iteration": 2.72031307220459 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01271086, + "balance_loss_clip": 0.0627908, + "balance_loss_mlp": 0.01256531, + "epoch": 0.3984367954306328, + "flos": 17970676986240.0, + "grad_norm": 2.760632977827581, + "language_loss": 0.84402627, + "learning_rate": 2.737168780548417e-06, + "loss": 0.9212113, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14575195, + "step": 6627, + "time_per_iteration": 2.6228654384613037 + }, + { + "auxiliary_loss_clip": 0.06445234, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.0627917, + "balance_loss_mlp": 0.01255443, + "epoch": 0.3984969186833008, + "flos": 22717684419840.0, + "grad_norm": 3.2429830324928095, + "language_loss": 0.83402491, + "learning_rate": 2.736806725217998e-06, + "loss": 0.91116416, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13250732, + "step": 6628, + "time_per_iteration": 2.6287484169006348 + }, + { + "auxiliary_loss_clip": 0.06449139, + "auxiliary_loss_mlp": 0.01271852, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01256981, + "epoch": 0.39855704193596875, + "flos": 23412779164800.0, + "grad_norm": 1.5731823007903518, + "language_loss": 0.71793973, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.79514968, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14868164, + "step": 6629, + "time_per_iteration": 2.5752875804901123 + }, + { + "auxiliary_loss_clip": 0.06441505, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06280406, + "balance_loss_mlp": 0.01254834, + "epoch": 0.3986171651886367, + "flos": 21258369711360.0, + "grad_norm": 2.035566678796665, + "language_loss": 0.80905473, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.88615453, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1362915, + "step": 6630, + "time_per_iteration": 2.5329513549804688 + }, + { + "auxiliary_loss_clip": 0.06445715, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06276714, + "balance_loss_mlp": 0.01255693, + "epoch": 0.3986772884413047, + "flos": 12463642293120.0, + "grad_norm": 2.1251751047068783, + "language_loss": 0.75146663, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.82862258, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14190674, + "step": 6631, + "time_per_iteration": 2.5500082969665527 + }, + { + "auxiliary_loss_clip": 0.06448178, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279311, + "balance_loss_mlp": 0.0125505, + "epoch": 0.39873741169397264, + "flos": 19652209522560.0, + "grad_norm": 1.6915315525927903, + "language_loss": 0.71496904, + "learning_rate": 2.735358224635783e-06, + "loss": 0.79214191, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.140625, + "step": 6632, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.06444843, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255955, + "epoch": 0.3987975349466406, + "flos": 21690436890240.0, + "grad_norm": 1.8116978167005697, + "language_loss": 0.75623924, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83338219, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13494873, + "step": 6633, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.06449188, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06280442, + "balance_loss_mlp": 0.0125846, + "epoch": 0.3988576581993086, + "flos": 23920721815680.0, + "grad_norm": 1.9002609831735993, + "language_loss": 0.81678545, + "learning_rate": 2.7346338069806e-06, + "loss": 0.89400202, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14001465, + "step": 6634, + "time_per_iteration": 2.539128065109253 + }, + { + "auxiliary_loss_clip": 0.06453361, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06283009, + "balance_loss_mlp": 0.01255449, + "epoch": 0.39891778145197654, + "flos": 18155690801280.0, + "grad_norm": 1.9946050359209588, + "language_loss": 0.7547667, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.83199799, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14306641, + "step": 6635, + "time_per_iteration": 2.5426242351531982 + }, + { + "auxiliary_loss_clip": 0.06468328, + "auxiliary_loss_mlp": 0.01272826, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01256053, + "epoch": 0.3989779047046445, + "flos": 22600831501440.0, + "grad_norm": 1.9740114535883675, + "language_loss": 0.66474432, + "learning_rate": 2.733909277895868e-06, + "loss": 0.74215585, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.16760254, + "step": 6636, + "time_per_iteration": 2.5290956497192383 + }, + { + "auxiliary_loss_clip": 0.06452767, + "auxiliary_loss_mlp": 0.01270258, + "balance_loss_clip": 0.06285115, + "balance_loss_mlp": 0.01255012, + "epoch": 0.39903802795731247, + "flos": 18083043711360.0, + "grad_norm": 1.6936131920640751, + "language_loss": 0.82211542, + "learning_rate": 2.733546971601763e-06, + "loss": 0.89934564, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.15246582, + "step": 6637, + "time_per_iteration": 2.516279458999634 + }, + { + "auxiliary_loss_clip": 0.06353697, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 0.06278069, + "balance_loss_mlp": 0.01250418, + "epoch": 0.39909815120998043, + "flos": 70463238652800.0, + "grad_norm": 0.7262189478909644, + "language_loss": 0.531524, + "learning_rate": 2.733184637491484e-06, + "loss": 0.60758889, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.0236969, + "step": 6638, + "time_per_iteration": 3.2179603576660156 + }, + { + "auxiliary_loss_clip": 0.06449973, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06279011, + "balance_loss_mlp": 0.0126304, + "epoch": 0.39915827446264845, + "flos": 18554788598400.0, + "grad_norm": 1.4980640352775056, + "language_loss": 0.75670731, + "learning_rate": 2.732822275578769e-06, + "loss": 0.83398449, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14715576, + "step": 6639, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.06442601, + "auxiliary_loss_mlp": 0.01272751, + "balance_loss_clip": 0.0627881, + "balance_loss_mlp": 0.01258249, + "epoch": 0.3992183977153164, + "flos": 29904826129920.0, + "grad_norm": 2.014095124557279, + "language_loss": 0.76376802, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84092152, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1451416, + "step": 6640, + "time_per_iteration": 2.642223834991455 + }, + { + "auxiliary_loss_clip": 0.06449724, + "auxiliary_loss_mlp": 0.01270265, + "balance_loss_clip": 0.06280393, + "balance_loss_mlp": 0.01255757, + "epoch": 0.3992785209679844, + "flos": 22571677480320.0, + "grad_norm": 2.238528881986372, + "language_loss": 0.8211664, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.89836633, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14501953, + "step": 6641, + "time_per_iteration": 2.530189275741577 + }, + { + "auxiliary_loss_clip": 0.06456075, + "auxiliary_loss_mlp": 0.01270045, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01254971, + "epoch": 0.39933864422065235, + "flos": 19688784629760.0, + "grad_norm": 1.8306704082742173, + "language_loss": 0.77208257, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.84934378, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15081787, + "step": 6642, + "time_per_iteration": 2.5495219230651855 + }, + { + "auxiliary_loss_clip": 0.06453043, + "auxiliary_loss_mlp": 0.01270555, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01255564, + "epoch": 0.3993987674733203, + "flos": 23045015594880.0, + "grad_norm": 2.242078242091602, + "language_loss": 0.72883618, + "learning_rate": 2.731372550178393e-06, + "loss": 0.80607212, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.14984131, + "step": 6643, + "time_per_iteration": 2.521857500076294 + }, + { + "auxiliary_loss_clip": 0.06456347, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01259317, + "epoch": 0.3994588907259883, + "flos": 19396896531840.0, + "grad_norm": 1.7649027305896348, + "language_loss": 0.66785717, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74516022, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14642334, + "step": 6644, + "time_per_iteration": 2.571690320968628 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.0127806, + "balance_loss_clip": 0.06282313, + "balance_loss_mlp": 0.01263737, + "epoch": 0.39951901397865625, + "flos": 13739326778880.0, + "grad_norm": 1.9095077452421072, + "language_loss": 0.78757256, + "learning_rate": 2.730647521020907e-06, + "loss": 0.86489946, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14312744, + "step": 6645, + "time_per_iteration": 2.499361753463745 + }, + { + "auxiliary_loss_clip": 0.06458238, + "auxiliary_loss_mlp": 0.01274341, + "balance_loss_clip": 0.06283879, + "balance_loss_mlp": 0.01259321, + "epoch": 0.3995791372313242, + "flos": 23593181005440.0, + "grad_norm": 1.5926569767996783, + "language_loss": 0.7044934, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78181922, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15026855, + "step": 6646, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.06456489, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06284152, + "balance_loss_mlp": 0.01257103, + "epoch": 0.3996392604839922, + "flos": 21361428633600.0, + "grad_norm": 2.2667385155288917, + "language_loss": 0.72035694, + "learning_rate": 2.729922381038513e-06, + "loss": 0.79763949, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14660645, + "step": 6647, + "time_per_iteration": 2.58251953125 + }, + { + "auxiliary_loss_clip": 0.06449988, + "auxiliary_loss_mlp": 0.01272061, + "balance_loss_clip": 0.06284988, + "balance_loss_mlp": 0.01257195, + "epoch": 0.39969938373666014, + "flos": 26039604337920.0, + "grad_norm": 1.4692875023338006, + "language_loss": 0.74830031, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.82552081, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14849854, + "step": 6648, + "time_per_iteration": 2.7020201683044434 + }, + { + "auxiliary_loss_clip": 0.06453955, + "auxiliary_loss_mlp": 0.0126884, + "balance_loss_clip": 0.06283584, + "balance_loss_mlp": 0.0125472, + "epoch": 0.3997595069893281, + "flos": 20121858057600.0, + "grad_norm": 2.0106261298514907, + "language_loss": 0.65986454, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.73709244, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14117432, + "step": 6649, + "time_per_iteration": 3.9323928356170654 + }, + { + "auxiliary_loss_clip": 0.06463098, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06290667, + "balance_loss_mlp": 0.01260774, + "epoch": 0.39981963024199607, + "flos": 27791016779520.0, + "grad_norm": 1.831691866077207, + "language_loss": 0.75774682, + "learning_rate": 2.728834463508826e-06, + "loss": 0.83514905, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16357422, + "step": 6650, + "time_per_iteration": 2.6374714374542236 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01257782, + "epoch": 0.39987975349466404, + "flos": 21950864979840.0, + "grad_norm": 1.4608995971033776, + "language_loss": 0.7199676, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79724216, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14831543, + "step": 6651, + "time_per_iteration": 2.5789706707000732 + }, + { + "auxiliary_loss_clip": 0.06457064, + "auxiliary_loss_mlp": 0.01269592, + "balance_loss_clip": 0.06283179, + "balance_loss_mlp": 0.01255245, + "epoch": 0.39993987674733206, + "flos": 20710707425280.0, + "grad_norm": 1.930350074981486, + "language_loss": 0.73724478, + "learning_rate": 2.728109046945403e-06, + "loss": 0.8145113, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14331055, + "step": 6652, + "time_per_iteration": 3.9592838287353516 + }, + { + "auxiliary_loss_clip": 0.06347093, + "auxiliary_loss_mlp": 0.01255075, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01252878, + "epoch": 0.4, + "flos": 61543566397440.0, + "grad_norm": 0.8159851457251004, + "language_loss": 0.60542929, + "learning_rate": 2.727746297241862e-06, + "loss": 0.68145096, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.02201843, + "step": 6653, + "time_per_iteration": 3.0700466632843018 + }, + { + "auxiliary_loss_clip": 0.06454087, + "auxiliary_loss_mlp": 0.01272182, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.01257698, + "epoch": 0.400060123252668, + "flos": 14507655592320.0, + "grad_norm": 1.9278074838902122, + "language_loss": 0.66929328, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.74655592, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14477539, + "step": 6654, + "time_per_iteration": 2.5292413234710693 + }, + { + "auxiliary_loss_clip": 0.06457023, + "auxiliary_loss_mlp": 0.01271182, + "balance_loss_clip": 0.06287654, + "balance_loss_mlp": 0.01257396, + "epoch": 0.40012024650533595, + "flos": 19098383961600.0, + "grad_norm": 1.998304088554008, + "language_loss": 0.90550762, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98278964, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13775635, + "step": 6655, + "time_per_iteration": 2.529496192932129 + }, + { + "auxiliary_loss_clip": 0.06450539, + "auxiliary_loss_mlp": 0.012675, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01254899, + "epoch": 0.4001803697580039, + "flos": 29358673217280.0, + "grad_norm": 1.6559902316252946, + "language_loss": 0.73729336, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.81447375, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1260376, + "step": 6656, + "time_per_iteration": 4.062687158584595 + }, + { + "auxiliary_loss_clip": 0.0645894, + "auxiliary_loss_mlp": 0.01271003, + "balance_loss_clip": 0.06288408, + "balance_loss_mlp": 0.01255696, + "epoch": 0.4002404930106719, + "flos": 20925839583360.0, + "grad_norm": 1.4738199157728433, + "language_loss": 0.73207194, + "learning_rate": 2.726295022603144e-06, + "loss": 0.80937135, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15307617, + "step": 6657, + "time_per_iteration": 2.5996904373168945 + }, + { + "auxiliary_loss_clip": 0.06458808, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06288153, + "balance_loss_mlp": 0.01256506, + "epoch": 0.40030061626333985, + "flos": 28413799850880.0, + "grad_norm": 1.489557881553797, + "language_loss": 0.79247761, + "learning_rate": 2.725932135056117e-06, + "loss": 0.86978424, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15350342, + "step": 6658, + "time_per_iteration": 2.7172279357910156 + }, + { + "auxiliary_loss_clip": 0.06459276, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06289512, + "balance_loss_mlp": 0.01264084, + "epoch": 0.4003607395160078, + "flos": 25928746986240.0, + "grad_norm": 2.1209995886317956, + "language_loss": 0.77640641, + "learning_rate": 2.72556921998167e-06, + "loss": 0.85378039, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14050293, + "step": 6659, + "time_per_iteration": 4.3210484981536865 + }, + { + "auxiliary_loss_clip": 0.06450686, + "auxiliary_loss_mlp": 0.01279792, + "balance_loss_clip": 0.06291049, + "balance_loss_mlp": 0.01267442, + "epoch": 0.4004208627686758, + "flos": 20773501660800.0, + "grad_norm": 1.7380110296153854, + "language_loss": 0.73432875, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.81163359, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.12359619, + "step": 6660, + "time_per_iteration": 2.668088436126709 + }, + { + "auxiliary_loss_clip": 0.06457424, + "auxiliary_loss_mlp": 0.01270844, + "balance_loss_clip": 0.06287603, + "balance_loss_mlp": 0.01258077, + "epoch": 0.40048098602134374, + "flos": 24688170161280.0, + "grad_norm": 2.131845423391088, + "language_loss": 0.71318859, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.79047126, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.12786865, + "step": 6661, + "time_per_iteration": 2.5673065185546875 + }, + { + "auxiliary_loss_clip": 0.06462744, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_clip": 0.06291083, + "balance_loss_mlp": 0.01257889, + "epoch": 0.4005411092740117, + "flos": 23192448053760.0, + "grad_norm": 1.7831816831822005, + "language_loss": 0.75751495, + "learning_rate": 2.724480309731437e-06, + "loss": 0.83486485, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14361572, + "step": 6662, + "time_per_iteration": 2.5870559215545654 + }, + { + "auxiliary_loss_clip": 0.06461672, + "auxiliary_loss_mlp": 0.01271183, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4006012325266797, + "flos": 17526786382080.0, + "grad_norm": 2.241735466255753, + "language_loss": 0.66247231, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.73980081, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15014648, + "step": 6663, + "time_per_iteration": 2.5879623889923096 + }, + { + "auxiliary_loss_clip": 0.06461117, + "auxiliary_loss_mlp": 0.01271573, + "balance_loss_clip": 0.06290103, + "balance_loss_mlp": 0.01257316, + "epoch": 0.40066135577934764, + "flos": 19862016946560.0, + "grad_norm": 2.129058070747091, + "language_loss": 0.86377645, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.94110334, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14251709, + "step": 6664, + "time_per_iteration": 2.580240249633789 + }, + { + "auxiliary_loss_clip": 0.06459028, + "auxiliary_loss_mlp": 0.01272821, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01259064, + "epoch": 0.40072147903201566, + "flos": 18155816582400.0, + "grad_norm": 1.9805392577959038, + "language_loss": 0.84895325, + "learning_rate": 2.723391152229917e-06, + "loss": 0.92627168, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13751221, + "step": 6665, + "time_per_iteration": 2.50386381149292 + }, + { + "auxiliary_loss_clip": 0.06457423, + "auxiliary_loss_mlp": 0.01268968, + "balance_loss_clip": 0.06286919, + "balance_loss_mlp": 0.0125458, + "epoch": 0.4007816022846836, + "flos": 18667239177600.0, + "grad_norm": 1.826402815553393, + "language_loss": 0.78598213, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.86324608, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14404297, + "step": 6666, + "time_per_iteration": 2.5133461952209473 + }, + { + "auxiliary_loss_clip": 0.06465514, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06295928, + "balance_loss_mlp": 0.01253834, + "epoch": 0.4008417255373516, + "flos": 25710344519040.0, + "grad_norm": 1.8943268651740763, + "language_loss": 0.74139559, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.81873906, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14990234, + "step": 6667, + "time_per_iteration": 2.635195732116699 + }, + { + "auxiliary_loss_clip": 0.06460091, + "auxiliary_loss_mlp": 0.01273802, + "balance_loss_clip": 0.06287248, + "balance_loss_mlp": 0.01258519, + "epoch": 0.40090184879001955, + "flos": 22865536149120.0, + "grad_norm": 1.4912552700664468, + "language_loss": 0.75818384, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.83552277, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15270996, + "step": 6668, + "time_per_iteration": 2.567748546600342 + }, + { + "auxiliary_loss_clip": 0.06454465, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0628936, + "balance_loss_mlp": 0.01258572, + "epoch": 0.4009619720426875, + "flos": 29067581733120.0, + "grad_norm": 1.8066450616757106, + "language_loss": 0.82171971, + "learning_rate": 2.721938558257248e-06, + "loss": 0.89899051, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14050293, + "step": 6669, + "time_per_iteration": 2.614875555038452 + }, + { + "auxiliary_loss_clip": 0.06349576, + "auxiliary_loss_mlp": 0.01259788, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01257549, + "epoch": 0.4010220952953555, + "flos": 66080347136640.0, + "grad_norm": 0.6837113267664942, + "language_loss": 0.53268963, + "learning_rate": 2.721575341289695e-06, + "loss": 0.60878325, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.02243042, + "step": 6670, + "time_per_iteration": 3.2985219955444336 + }, + { + "auxiliary_loss_clip": 0.06453651, + "auxiliary_loss_mlp": 0.01274966, + "balance_loss_clip": 0.06286684, + "balance_loss_mlp": 0.01260405, + "epoch": 0.40108221854802345, + "flos": 29650519388160.0, + "grad_norm": 1.6370315093264123, + "language_loss": 0.88528681, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.96257305, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14556885, + "step": 6671, + "time_per_iteration": 2.6268246173858643 + }, + { + "auxiliary_loss_clip": 0.06460971, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06289764, + "balance_loss_mlp": 0.01256861, + "epoch": 0.4011423418006914, + "flos": 19934286693120.0, + "grad_norm": 1.7015153377224497, + "language_loss": 0.78868973, + "learning_rate": 2.720848825281736e-06, + "loss": 0.86602008, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.1519165, + "step": 6672, + "time_per_iteration": 2.4949698448181152 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06290099, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4012024650533594, + "flos": 20090523830400.0, + "grad_norm": 2.076088840896174, + "language_loss": 0.63474464, + "learning_rate": 2.72048552626888e-06, + "loss": 0.71204633, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1418457, + "step": 6673, + "time_per_iteration": 2.644050121307373 + }, + { + "auxiliary_loss_clip": 0.06458048, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01259827, + "epoch": 0.40126258830602735, + "flos": 21703224637440.0, + "grad_norm": 1.4478595936596839, + "language_loss": 0.80581552, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.88313353, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.13903809, + "step": 6674, + "time_per_iteration": 2.559034824371338 + }, + { + "auxiliary_loss_clip": 0.0646532, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_clip": 0.06289816, + "balance_loss_mlp": 0.01258269, + "epoch": 0.4013227115586953, + "flos": 12025160277120.0, + "grad_norm": 2.4455561687367195, + "language_loss": 0.82561237, + "learning_rate": 2.719758846294294e-06, + "loss": 0.90299457, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14624023, + "step": 6675, + "time_per_iteration": 2.5448951721191406 + }, + { + "auxiliary_loss_clip": 0.06465134, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01254106, + "epoch": 0.4013828348113633, + "flos": 25454612257920.0, + "grad_norm": 1.6408733853472015, + "language_loss": 0.93777156, + "learning_rate": 2.71939546536012e-06, + "loss": 1.01511002, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14581299, + "step": 6676, + "time_per_iteration": 2.5721349716186523 + }, + { + "auxiliary_loss_clip": 0.06469207, + "auxiliary_loss_mlp": 0.01274451, + "balance_loss_clip": 0.06291738, + "balance_loss_mlp": 0.01258274, + "epoch": 0.40144295806403124, + "flos": 18588009542400.0, + "grad_norm": 2.5026106137632222, + "language_loss": 0.80060673, + "learning_rate": 2.719032057146399e-06, + "loss": 0.87804335, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16186523, + "step": 6677, + "time_per_iteration": 2.5438191890716553 + }, + { + "auxiliary_loss_clip": 0.06455022, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.0628567, + "balance_loss_mlp": 0.01256934, + "epoch": 0.4015030813166992, + "flos": 22936925427840.0, + "grad_norm": 1.8567640541952835, + "language_loss": 0.83925951, + "learning_rate": 2.71866862166691e-06, + "loss": 0.9165169, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13793945, + "step": 6678, + "time_per_iteration": 2.5458457469940186 + }, + { + "auxiliary_loss_clip": 0.06455562, + "auxiliary_loss_mlp": 0.0127344, + "balance_loss_clip": 0.06287661, + "balance_loss_mlp": 0.01258325, + "epoch": 0.4015632045693672, + "flos": 20601359447040.0, + "grad_norm": 2.2595275456436767, + "language_loss": 0.6400671, + "learning_rate": 2.718305158935434e-06, + "loss": 0.7173571, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15124512, + "step": 6679, + "time_per_iteration": 2.553312063217163 + }, + { + "auxiliary_loss_clip": 0.0645475, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.06285992, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4016233278220352, + "flos": 23445371203200.0, + "grad_norm": 1.525723625053638, + "language_loss": 0.78686285, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.86411297, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14001465, + "step": 6680, + "time_per_iteration": 2.5376389026641846 + }, + { + "auxiliary_loss_clip": 0.0646753, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06289258, + "balance_loss_mlp": 0.01258008, + "epoch": 0.40168345107470316, + "flos": 21436968689280.0, + "grad_norm": 1.5038657697958466, + "language_loss": 0.76059246, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.83800501, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15710449, + "step": 6681, + "time_per_iteration": 2.532668352127075 + }, + { + "auxiliary_loss_clip": 0.06461542, + "auxiliary_loss_mlp": 0.01268459, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01254285, + "epoch": 0.4017435743273711, + "flos": 22863900994560.0, + "grad_norm": 2.212326324471445, + "language_loss": 0.6446861, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.72198606, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1416626, + "step": 6682, + "time_per_iteration": 2.585963010787964 + }, + { + "auxiliary_loss_clip": 0.06452938, + "auxiliary_loss_mlp": 0.01271302, + "balance_loss_clip": 0.06279296, + "balance_loss_mlp": 0.01257288, + "epoch": 0.4018036975800391, + "flos": 28630022112000.0, + "grad_norm": 1.839007150843812, + "language_loss": 0.73340857, + "learning_rate": 2.716851035765337e-06, + "loss": 0.81065094, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14013672, + "step": 6683, + "time_per_iteration": 2.5977652072906494 + }, + { + "auxiliary_loss_clip": 0.06452199, + "auxiliary_loss_mlp": 0.01270902, + "balance_loss_clip": 0.0628196, + "balance_loss_mlp": 0.01257252, + "epoch": 0.40186382083270705, + "flos": 26658446267520.0, + "grad_norm": 1.545951486041889, + "language_loss": 0.73326242, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.81049347, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13671875, + "step": 6684, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06341122, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06265609, + "balance_loss_mlp": 0.01260683, + "epoch": 0.401923944085375, + "flos": 59277167562240.0, + "grad_norm": 0.7966859396902427, + "language_loss": 0.60515714, + "learning_rate": 2.716123811026767e-06, + "loss": 0.68120408, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.02891541, + "step": 6685, + "time_per_iteration": 3.2738587856292725 + }, + { + "auxiliary_loss_clip": 0.06456321, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 0.06278493, + "balance_loss_mlp": 0.01255291, + "epoch": 0.401984067338043, + "flos": 16988473825920.0, + "grad_norm": 1.7615677724791905, + "language_loss": 0.70125616, + "learning_rate": 2.715760157917357e-06, + "loss": 0.77851576, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14343262, + "step": 6686, + "time_per_iteration": 2.565185070037842 + }, + { + "auxiliary_loss_clip": 0.06450202, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125554, + "epoch": 0.40204419059071095, + "flos": 24979387426560.0, + "grad_norm": 1.3440220766592053, + "language_loss": 0.74867636, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.82586932, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13549805, + "step": 6687, + "time_per_iteration": 2.6009433269500732 + }, + { + "auxiliary_loss_clip": 0.06451625, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06281097, + "balance_loss_mlp": 0.01261164, + "epoch": 0.4021043138433789, + "flos": 23484252297600.0, + "grad_norm": 1.7565801002117698, + "language_loss": 0.71198428, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.78925073, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13873291, + "step": 6688, + "time_per_iteration": 3.9550609588623047 + }, + { + "auxiliary_loss_clip": 0.06455014, + "auxiliary_loss_mlp": 0.01278979, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263506, + "epoch": 0.4021644370960469, + "flos": 26003155011840.0, + "grad_norm": 1.6503070586239919, + "language_loss": 0.64854121, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.7258811, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15478516, + "step": 6689, + "time_per_iteration": 2.552058458328247 + }, + { + "auxiliary_loss_clip": 0.06450799, + "auxiliary_loss_mlp": 0.01267992, + "balance_loss_clip": 0.06276366, + "balance_loss_mlp": 0.0125417, + "epoch": 0.40222456034871484, + "flos": 13592816714880.0, + "grad_norm": 1.9543405887805447, + "language_loss": 0.73594153, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.81312943, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13824463, + "step": 6690, + "time_per_iteration": 2.5484251976013184 + }, + { + "auxiliary_loss_clip": 0.06448495, + "auxiliary_loss_mlp": 0.0127057, + "balance_loss_clip": 0.06278096, + "balance_loss_mlp": 0.01256682, + "epoch": 0.4022846836013828, + "flos": 24284586170880.0, + "grad_norm": 1.722227920192768, + "language_loss": 0.74861401, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.82580471, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13903809, + "step": 6691, + "time_per_iteration": 3.9708051681518555 + }, + { + "auxiliary_loss_clip": 0.06451076, + "auxiliary_loss_mlp": 0.01277672, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01262151, + "epoch": 0.40234480685405083, + "flos": 20156881864320.0, + "grad_norm": 1.7761891830354823, + "language_loss": 0.72677463, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.80406213, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15515137, + "step": 6692, + "time_per_iteration": 2.5179357528686523 + }, + { + "auxiliary_loss_clip": 0.06447224, + "auxiliary_loss_mlp": 0.01270814, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.0125664, + "epoch": 0.4024049301067188, + "flos": 22936925427840.0, + "grad_norm": 1.7625804596819372, + "language_loss": 0.8401857, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.91736615, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1418457, + "step": 6693, + "time_per_iteration": 2.707941770553589 + }, + { + "auxiliary_loss_clip": 0.06452498, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06281643, + "balance_loss_mlp": 0.01252865, + "epoch": 0.40246505335938676, + "flos": 36037285297920.0, + "grad_norm": 1.8844808694168769, + "language_loss": 0.70966387, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.78685182, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13433838, + "step": 6694, + "time_per_iteration": 2.637481927871704 + }, + { + "auxiliary_loss_clip": 0.06444509, + "auxiliary_loss_mlp": 0.01272964, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01258552, + "epoch": 0.4025251766120547, + "flos": 20600478979200.0, + "grad_norm": 1.9746374404018712, + "language_loss": 0.68475246, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.76192719, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14428711, + "step": 6695, + "time_per_iteration": 3.9740405082702637 + }, + { + "auxiliary_loss_clip": 0.06446315, + "auxiliary_loss_mlp": 0.01270396, + "balance_loss_clip": 0.06276862, + "balance_loss_mlp": 0.01256484, + "epoch": 0.4025852998647227, + "flos": 64537582890240.0, + "grad_norm": 2.0865884556399363, + "language_loss": 0.79765463, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.87482178, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13897705, + "step": 6696, + "time_per_iteration": 3.0413708686828613 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.01269123, + "balance_loss_clip": 0.06281278, + "balance_loss_mlp": 0.01254473, + "epoch": 0.40264542311739066, + "flos": 20892534785280.0, + "grad_norm": 1.7976365729577468, + "language_loss": 0.71608603, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.79331958, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14660645, + "step": 6697, + "time_per_iteration": 2.5200350284576416 + }, + { + "auxiliary_loss_clip": 0.06445032, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06276169, + "balance_loss_mlp": 0.0125658, + "epoch": 0.4027055463700586, + "flos": 26257419826560.0, + "grad_norm": 1.9918981514977272, + "language_loss": 0.61230171, + "learning_rate": 2.711394207496984e-06, + "loss": 0.68945277, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13464355, + "step": 6698, + "time_per_iteration": 2.576472520828247 + }, + { + "auxiliary_loss_clip": 0.06449181, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06276856, + "balance_loss_mlp": 0.01252849, + "epoch": 0.4027656696227266, + "flos": 20637682992000.0, + "grad_norm": 2.0070875825685266, + "language_loss": 0.77479243, + "learning_rate": 2.711030202621491e-06, + "loss": 0.85195273, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14001465, + "step": 6699, + "time_per_iteration": 3.937375545501709 + }, + { + "auxiliary_loss_clip": 0.0644554, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01253977, + "epoch": 0.40282579287539455, + "flos": 22352855742720.0, + "grad_norm": 1.735185416550665, + "language_loss": 0.80698907, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.88412201, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13793945, + "step": 6700, + "time_per_iteration": 2.535510540008545 + }, + { + "auxiliary_loss_clip": 0.06459837, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01253157, + "epoch": 0.4028859161280625, + "flos": 29282126912640.0, + "grad_norm": 1.7653471156752092, + "language_loss": 0.74938649, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.82666814, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.1517334, + "step": 6701, + "time_per_iteration": 2.6509363651275635 + }, + { + "auxiliary_loss_clip": 0.06451308, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01257329, + "epoch": 0.4029460393807305, + "flos": 28630022112000.0, + "grad_norm": 1.48917022125432, + "language_loss": 0.66283298, + "learning_rate": 2.709938026276208e-06, + "loss": 0.74005556, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13641357, + "step": 6702, + "time_per_iteration": 2.6183536052703857 + }, + { + "auxiliary_loss_clip": 0.06460792, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06286055, + "balance_loss_mlp": 0.01259117, + "epoch": 0.40300616263339845, + "flos": 22608588003840.0, + "grad_norm": 1.5996325972429297, + "language_loss": 0.66632348, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.74367112, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.14849854, + "step": 6703, + "time_per_iteration": 2.583040237426758 + }, + { + "auxiliary_loss_clip": 0.06456298, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.012584, + "epoch": 0.4030662858860664, + "flos": 25527385128960.0, + "grad_norm": 1.7345540067512994, + "language_loss": 0.82398093, + "learning_rate": 2.709209774085071e-06, + "loss": 0.90127754, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14959717, + "step": 6704, + "time_per_iteration": 2.564052104949951 + }, + { + "auxiliary_loss_clip": 0.06457714, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01258332, + "epoch": 0.40312640913873443, + "flos": 23593474494720.0, + "grad_norm": 1.6434462448941187, + "language_loss": 0.73919153, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.81649286, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 6705, + "time_per_iteration": 2.609738349914551 + }, + { + "auxiliary_loss_clip": 0.06450006, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06282469, + "balance_loss_mlp": 0.01256481, + "epoch": 0.4031865323914024, + "flos": 20017205907840.0, + "grad_norm": 1.6242014521871173, + "language_loss": 0.66795284, + "learning_rate": 2.708481414320713e-06, + "loss": 0.74515378, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1361084, + "step": 6706, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.06452154, + "auxiliary_loss_mlp": 0.01268976, + "balance_loss_clip": 0.06282388, + "balance_loss_mlp": 0.0125513, + "epoch": 0.40324665564407036, + "flos": 21877840546560.0, + "grad_norm": 1.6449246324910813, + "language_loss": 0.71481538, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.79202664, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13842773, + "step": 6707, + "time_per_iteration": 2.5762581825256348 + }, + { + "auxiliary_loss_clip": 0.0644149, + "auxiliary_loss_mlp": 0.01271296, + "balance_loss_clip": 0.06278867, + "balance_loss_mlp": 0.01258379, + "epoch": 0.4033067788967383, + "flos": 23885572227840.0, + "grad_norm": 1.6148090336243837, + "language_loss": 0.80062628, + "learning_rate": 2.707752947093611e-06, + "loss": 0.87775409, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12908936, + "step": 6708, + "time_per_iteration": 2.5509586334228516 + }, + { + "auxiliary_loss_clip": 0.06459241, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.0628079, + "balance_loss_mlp": 0.01256133, + "epoch": 0.4033669021494063, + "flos": 17425530322560.0, + "grad_norm": 2.5431099630067435, + "language_loss": 0.8334195, + "learning_rate": 2.70738867321606e-06, + "loss": 0.91072428, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15100098, + "step": 6709, + "time_per_iteration": 2.5844790935516357 + }, + { + "auxiliary_loss_clip": 0.06454608, + "auxiliary_loss_mlp": 0.01274744, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01259211, + "epoch": 0.40342702540207426, + "flos": 29607277881600.0, + "grad_norm": 1.5307534200842645, + "language_loss": 0.71642667, + "learning_rate": 2.70702437251426e-06, + "loss": 0.79372019, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15527344, + "step": 6710, + "time_per_iteration": 2.5950214862823486 + }, + { + "auxiliary_loss_clip": 0.06448973, + "auxiliary_loss_mlp": 0.01270551, + "balance_loss_clip": 0.06280518, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4034871486547422, + "flos": 11288249544960.0, + "grad_norm": 5.632076524924719, + "language_loss": 0.85771239, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.93490767, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1451416, + "step": 6711, + "time_per_iteration": 2.530691146850586 + }, + { + "auxiliary_loss_clip": 0.06457499, + "auxiliary_loss_mlp": 0.01273198, + "balance_loss_clip": 0.0628542, + "balance_loss_mlp": 0.01258732, + "epoch": 0.4035472719074102, + "flos": 15557097254400.0, + "grad_norm": 2.360012043566648, + "language_loss": 0.76516247, + "learning_rate": 2.706295690693168e-06, + "loss": 0.84246945, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14471436, + "step": 6712, + "time_per_iteration": 2.485973358154297 + }, + { + "auxiliary_loss_clip": 0.06453355, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06282951, + "balance_loss_mlp": 0.01256249, + "epoch": 0.40360739516007815, + "flos": 24680162096640.0, + "grad_norm": 2.2673991582834803, + "language_loss": 0.80280489, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.88004464, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14379883, + "step": 6713, + "time_per_iteration": 2.604844093322754 + }, + { + "auxiliary_loss_clip": 0.06452335, + "auxiliary_loss_mlp": 0.01272867, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01258824, + "epoch": 0.4036675184127461, + "flos": 17308635477120.0, + "grad_norm": 2.487123438751718, + "language_loss": 0.88458717, + "learning_rate": 2.705566901740865e-06, + "loss": 0.9618392, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14038086, + "step": 6714, + "time_per_iteration": 2.4827568531036377 + }, + { + "auxiliary_loss_clip": 0.06454237, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06281483, + "balance_loss_mlp": 0.01254011, + "epoch": 0.4037276416654141, + "flos": 19869983084160.0, + "grad_norm": 1.5212273970247687, + "language_loss": 0.69752967, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.77475452, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14233398, + "step": 6715, + "time_per_iteration": 2.5602893829345703 + }, + { + "auxiliary_loss_clip": 0.06458366, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.0125541, + "epoch": 0.40378776491808205, + "flos": 18302158938240.0, + "grad_norm": 1.8718399277124913, + "language_loss": 0.78095776, + "learning_rate": 2.704838005767892e-06, + "loss": 0.85823905, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14367676, + "step": 6716, + "time_per_iteration": 2.4911210536956787 + }, + { + "auxiliary_loss_clip": 0.06449929, + "auxiliary_loss_mlp": 0.01275524, + "balance_loss_clip": 0.0628348, + "balance_loss_mlp": 0.01262185, + "epoch": 0.40384788817075, + "flos": 15054772826880.0, + "grad_norm": 1.8985450182353327, + "language_loss": 0.76491797, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.8421725, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 6717, + "time_per_iteration": 2.5457956790924072 + }, + { + "auxiliary_loss_clip": 0.0634857, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01254165, + "epoch": 0.40390801142341803, + "flos": 61948659761280.0, + "grad_norm": 0.8842261639057883, + "language_loss": 0.60140264, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.67745787, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02790833, + "step": 6718, + "time_per_iteration": 2.9733822345733643 + }, + { + "auxiliary_loss_clip": 0.06457312, + "auxiliary_loss_mlp": 0.0127584, + "balance_loss_clip": 0.06279647, + "balance_loss_mlp": 0.01260366, + "epoch": 0.403968134676086, + "flos": 22743945475200.0, + "grad_norm": 1.799198719667369, + "language_loss": 0.75286412, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.83019567, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15490723, + "step": 6719, + "time_per_iteration": 2.5417115688323975 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01269672, + "balance_loss_clip": 0.06282561, + "balance_loss_mlp": 0.01254592, + "epoch": 0.40402825792875396, + "flos": 19789244075520.0, + "grad_norm": 2.1951890128687257, + "language_loss": 0.81351668, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.89075512, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15100098, + "step": 6720, + "time_per_iteration": 2.4906880855560303 + }, + { + "auxiliary_loss_clip": 0.06453006, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01254742, + "epoch": 0.40408838118142193, + "flos": 19615298999040.0, + "grad_norm": 1.8273574705972042, + "language_loss": 0.77227581, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.84950233, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14904785, + "step": 6721, + "time_per_iteration": 2.5645196437835693 + }, + { + "auxiliary_loss_clip": 0.06447627, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06279851, + "balance_loss_mlp": 0.01251931, + "epoch": 0.4041485044340899, + "flos": 24432982951680.0, + "grad_norm": 1.7503779333013576, + "language_loss": 0.72784024, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.80496466, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.12884521, + "step": 6722, + "time_per_iteration": 2.5520758628845215 + }, + { + "auxiliary_loss_clip": 0.06450947, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01256972, + "epoch": 0.40420862768675786, + "flos": 16765207822080.0, + "grad_norm": 1.6533819858806273, + "language_loss": 0.65986466, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.73707551, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13165283, + "step": 6723, + "time_per_iteration": 2.5385141372680664 + }, + { + "auxiliary_loss_clip": 0.06456833, + "auxiliary_loss_mlp": 0.01276273, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01262039, + "epoch": 0.4042687509394258, + "flos": 22498066068480.0, + "grad_norm": 1.4281101192387737, + "language_loss": 0.74082482, + "learning_rate": 2.701921353880734e-06, + "loss": 0.81815588, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14227295, + "step": 6724, + "time_per_iteration": 2.5705087184906006 + }, + { + "auxiliary_loss_clip": 0.06445859, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06280795, + "balance_loss_mlp": 0.01256226, + "epoch": 0.4043288741920938, + "flos": 30343978978560.0, + "grad_norm": 1.716107680872733, + "language_loss": 0.75255632, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.8297112, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13409424, + "step": 6725, + "time_per_iteration": 2.6433653831481934 + }, + { + "auxiliary_loss_clip": 0.06451583, + "auxiliary_loss_mlp": 0.01271794, + "balance_loss_clip": 0.06282748, + "balance_loss_mlp": 0.01257054, + "epoch": 0.40438899744476176, + "flos": 46357978947840.0, + "grad_norm": 1.593616701788039, + "language_loss": 0.77198207, + "learning_rate": 2.701191924463126e-06, + "loss": 0.84921581, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14733887, + "step": 6726, + "time_per_iteration": 2.8469409942626953 + }, + { + "auxiliary_loss_clip": 0.06452948, + "auxiliary_loss_mlp": 0.0127047, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01256058, + "epoch": 0.4044491206974297, + "flos": 13338468046080.0, + "grad_norm": 2.072990787427281, + "language_loss": 0.82297921, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.90021348, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14404297, + "step": 6727, + "time_per_iteration": 2.5381619930267334 + }, + { + "auxiliary_loss_clip": 0.06453642, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06281026, + "balance_loss_mlp": 0.01252413, + "epoch": 0.4045092439500977, + "flos": 12098603980800.0, + "grad_norm": 2.0199249210029055, + "language_loss": 0.86119437, + "learning_rate": 2.700462388688447e-06, + "loss": 0.93839324, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13830566, + "step": 6728, + "time_per_iteration": 3.903547763824463 + }, + { + "auxiliary_loss_clip": 0.06450571, + "auxiliary_loss_mlp": 0.01275259, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01260567, + "epoch": 0.40456936720276565, + "flos": 21186225745920.0, + "grad_norm": 1.6307737524107195, + "language_loss": 0.82346553, + "learning_rate": 2.700097580951786e-06, + "loss": 0.90072381, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14697266, + "step": 6729, + "time_per_iteration": 2.5673158168792725 + }, + { + "auxiliary_loss_clip": 0.06454299, + "auxiliary_loss_mlp": 0.01268394, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4046294904554336, + "flos": 23922147335040.0, + "grad_norm": 1.7857320211804986, + "language_loss": 0.73840159, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.81562853, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14630127, + "step": 6730, + "time_per_iteration": 4.11122727394104 + }, + { + "auxiliary_loss_clip": 0.0645189, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06282154, + "balance_loss_mlp": 0.01254767, + "epoch": 0.4046896137081016, + "flos": 38080376202240.0, + "grad_norm": 1.7383158082611918, + "language_loss": 0.67290312, + "learning_rate": 2.699367885848985e-06, + "loss": 0.75010884, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13922119, + "step": 6731, + "time_per_iteration": 2.8046634197235107 + }, + { + "auxiliary_loss_clip": 0.06450266, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.0628126, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4047497369607696, + "flos": 23623047786240.0, + "grad_norm": 1.7716081402001673, + "language_loss": 0.74489558, + "learning_rate": 2.699002998510517e-06, + "loss": 0.8220998, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13977051, + "step": 6732, + "time_per_iteration": 2.608191728591919 + }, + { + "auxiliary_loss_clip": 0.06450449, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.06283008, + "balance_loss_mlp": 0.01255978, + "epoch": 0.40480986021343757, + "flos": 12828596751360.0, + "grad_norm": 1.6538752037468725, + "language_loss": 0.77253687, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.84973502, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13391113, + "step": 6733, + "time_per_iteration": 2.525399923324585 + }, + { + "auxiliary_loss_clip": 0.06457898, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.06280859, + "balance_loss_mlp": 0.01255176, + "epoch": 0.40486998346610553, + "flos": 23775511489920.0, + "grad_norm": 4.637374264151728, + "language_loss": 0.76891112, + "learning_rate": 2.698273144328627e-06, + "loss": 0.84619832, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15661621, + "step": 6734, + "time_per_iteration": 4.040409564971924 + }, + { + "auxiliary_loss_clip": 0.06455547, + "auxiliary_loss_mlp": 0.01267949, + "balance_loss_clip": 0.0627891, + "balance_loss_mlp": 0.0125421, + "epoch": 0.4049301067187735, + "flos": 22863439797120.0, + "grad_norm": 2.24732512167567, + "language_loss": 0.64935613, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.72659111, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.13745117, + "step": 6735, + "time_per_iteration": 2.5326993465423584 + }, + { + "auxiliary_loss_clip": 0.06448689, + "auxiliary_loss_mlp": 0.01271873, + "balance_loss_clip": 0.06279301, + "balance_loss_mlp": 0.01258849, + "epoch": 0.40499022997144146, + "flos": 22790624999040.0, + "grad_norm": 1.962844708798157, + "language_loss": 0.83769405, + "learning_rate": 2.697543184232387e-06, + "loss": 0.91489971, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13024902, + "step": 6736, + "time_per_iteration": 2.5863215923309326 + }, + { + "auxiliary_loss_clip": 0.06454039, + "auxiliary_loss_mlp": 0.01271412, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01256832, + "epoch": 0.4050503532241094, + "flos": 23046021843840.0, + "grad_norm": 1.714368942149708, + "language_loss": 0.75428641, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.83154088, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14569092, + "step": 6737, + "time_per_iteration": 2.6163716316223145 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01271121, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01257644, + "epoch": 0.4051104764767774, + "flos": 16652254118400.0, + "grad_norm": 4.810644037565116, + "language_loss": 0.72306561, + "learning_rate": 2.696813118332519e-06, + "loss": 0.80026174, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13470459, + "step": 6738, + "time_per_iteration": 4.0618274211883545 + }, + { + "auxiliary_loss_clip": 0.06449332, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.06280854, + "balance_loss_mlp": 0.01257399, + "epoch": 0.40517059972944536, + "flos": 16363929818880.0, + "grad_norm": 1.8147061411614016, + "language_loss": 0.75123262, + "learning_rate": 2.696448045740828e-06, + "loss": 0.82842994, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13000488, + "step": 6739, + "time_per_iteration": 2.489001512527466 + }, + { + "auxiliary_loss_clip": 0.06454495, + "auxiliary_loss_mlp": 0.0126968, + "balance_loss_clip": 0.06282163, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4052307229821133, + "flos": 28810885150080.0, + "grad_norm": 1.87280601387568, + "language_loss": 0.74278009, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.82002187, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14257812, + "step": 6740, + "time_per_iteration": 2.616560220718384 + }, + { + "auxiliary_loss_clip": 0.0644789, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01257076, + "epoch": 0.4052908462347813, + "flos": 21404334723840.0, + "grad_norm": 1.6527814212000655, + "language_loss": 0.77083528, + "learning_rate": 2.695717821343153e-06, + "loss": 0.84802353, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1385498, + "step": 6741, + "time_per_iteration": 2.5236477851867676 + }, + { + "auxiliary_loss_clip": 0.06449165, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 0.06278783, + "balance_loss_mlp": 0.01259606, + "epoch": 0.40535096948744925, + "flos": 22425628613760.0, + "grad_norm": 1.6285650306233073, + "language_loss": 0.7166388, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.79387373, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1472168, + "step": 6742, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.06454468, + "auxiliary_loss_mlp": 0.01273335, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01258016, + "epoch": 0.4054110927401172, + "flos": 17015028370560.0, + "grad_norm": 2.751799665484638, + "language_loss": 0.73206228, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.80934024, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15332031, + "step": 6743, + "time_per_iteration": 2.519907236099243 + }, + { + "auxiliary_loss_clip": 0.0645441, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06280394, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4054712159927852, + "flos": 21621018182400.0, + "grad_norm": 2.0068914143371623, + "language_loss": 0.7128458, + "learning_rate": 2.694622286918588e-06, + "loss": 0.79011208, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14733887, + "step": 6744, + "time_per_iteration": 2.641242742538452 + }, + { + "auxiliary_loss_clip": 0.06447047, + "auxiliary_loss_mlp": 0.01269556, + "balance_loss_clip": 0.06280165, + "balance_loss_mlp": 0.01255722, + "epoch": 0.4055313392454532, + "flos": 25819734424320.0, + "grad_norm": 1.5431481906112547, + "language_loss": 0.80460721, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.88177323, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13830566, + "step": 6745, + "time_per_iteration": 2.563445806503296 + }, + { + "auxiliary_loss_clip": 0.06450857, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06282623, + "balance_loss_mlp": 0.01255009, + "epoch": 0.40559146249812117, + "flos": 14142323790720.0, + "grad_norm": 1.9690336991849304, + "language_loss": 0.67176485, + "learning_rate": 2.693891798911731e-06, + "loss": 0.74896801, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14465332, + "step": 6746, + "time_per_iteration": 2.532186508178711 + }, + { + "auxiliary_loss_clip": 0.064533, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06283557, + "balance_loss_mlp": 0.01253272, + "epoch": 0.40565158575078913, + "flos": 41365259815680.0, + "grad_norm": 1.4380414737187444, + "language_loss": 0.57222033, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.64941883, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1328125, + "step": 6747, + "time_per_iteration": 2.7487149238586426 + }, + { + "auxiliary_loss_clip": 0.06454123, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06282702, + "balance_loss_mlp": 0.01255319, + "epoch": 0.4057117090034571, + "flos": 28551421382400.0, + "grad_norm": 2.093705794925994, + "language_loss": 0.84795344, + "learning_rate": 2.693161205655089e-06, + "loss": 0.92518532, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.13739014, + "step": 6748, + "time_per_iteration": 2.5967648029327393 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.01269749, + "balance_loss_clip": 0.06281549, + "balance_loss_mlp": 0.01254794, + "epoch": 0.40577183225612506, + "flos": 18009851569920.0, + "grad_norm": 1.9056349360303495, + "language_loss": 0.81943792, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.89667493, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14953613, + "step": 6749, + "time_per_iteration": 2.546419143676758 + }, + { + "auxiliary_loss_clip": 0.06450339, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01256762, + "epoch": 0.40583195550879303, + "flos": 19542819617280.0, + "grad_norm": 1.7354001752331154, + "language_loss": 0.75251377, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.82972294, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13824463, + "step": 6750, + "time_per_iteration": 2.633349895477295 + }, + { + "auxiliary_loss_clip": 0.06461279, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06282868, + "balance_loss_mlp": 0.01256441, + "epoch": 0.405892078761461, + "flos": 22315987146240.0, + "grad_norm": 2.3215315740209026, + "language_loss": 0.73715317, + "learning_rate": 2.692065118669195e-06, + "loss": 0.81447506, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.14459229, + "step": 6751, + "time_per_iteration": 2.579233169555664 + }, + { + "auxiliary_loss_clip": 0.06456044, + "auxiliary_loss_mlp": 0.01276434, + "balance_loss_clip": 0.06282923, + "balance_loss_mlp": 0.01261622, + "epoch": 0.40595220201412896, + "flos": 25491564708480.0, + "grad_norm": 1.5288716905414277, + "language_loss": 0.66520017, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.74252492, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14788818, + "step": 6752, + "time_per_iteration": 2.5768818855285645 + }, + { + "auxiliary_loss_clip": 0.06457777, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281942, + "balance_loss_mlp": 0.01259025, + "epoch": 0.4060123252667969, + "flos": 49867092887040.0, + "grad_norm": 1.7025851849816316, + "language_loss": 0.71210098, + "learning_rate": 2.691334262772948e-06, + "loss": 0.78942096, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15197754, + "step": 6753, + "time_per_iteration": 2.807713031768799 + }, + { + "auxiliary_loss_clip": 0.06455305, + "auxiliary_loss_mlp": 0.01268505, + "balance_loss_clip": 0.06281379, + "balance_loss_mlp": 0.01254736, + "epoch": 0.4060724485194649, + "flos": 21140720179200.0, + "grad_norm": 2.0551663576230657, + "language_loss": 0.72102135, + "learning_rate": 2.690968795494699e-06, + "loss": 0.7982595, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13763428, + "step": 6754, + "time_per_iteration": 2.5342867374420166 + }, + { + "auxiliary_loss_clip": 0.0645773, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06283537, + "balance_loss_mlp": 0.0125931, + "epoch": 0.40613257177213286, + "flos": 21763796739840.0, + "grad_norm": 1.762365568083109, + "language_loss": 0.83186102, + "learning_rate": 2.690603302014844e-06, + "loss": 0.90917671, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14520264, + "step": 6755, + "time_per_iteration": 2.6024997234344482 + }, + { + "auxiliary_loss_clip": 0.06461492, + "auxiliary_loss_mlp": 0.01268966, + "balance_loss_clip": 0.06283044, + "balance_loss_mlp": 0.01254047, + "epoch": 0.4061926950248008, + "flos": 25561863884160.0, + "grad_norm": 1.6099502444653784, + "language_loss": 0.71436989, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.79167449, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14923096, + "step": 6756, + "time_per_iteration": 2.5427916049957275 + }, + { + "auxiliary_loss_clip": 0.06455702, + "auxiliary_loss_mlp": 0.01272698, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01257726, + "epoch": 0.4062528182774688, + "flos": 23702528983680.0, + "grad_norm": 1.686471122095966, + "language_loss": 0.79134113, + "learning_rate": 2.689872236505755e-06, + "loss": 0.86862516, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14990234, + "step": 6757, + "time_per_iteration": 2.573546886444092 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01275677, + "balance_loss_clip": 0.0627944, + "balance_loss_mlp": 0.01260561, + "epoch": 0.4063129415301368, + "flos": 21732504439680.0, + "grad_norm": 1.6631673854083442, + "language_loss": 0.78665155, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.86392242, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15100098, + "step": 6758, + "time_per_iteration": 2.5283167362213135 + }, + { + "auxiliary_loss_clip": 0.06450847, + "auxiliary_loss_mlp": 0.01276876, + "balance_loss_clip": 0.06280972, + "balance_loss_mlp": 0.0126331, + "epoch": 0.40637306478280477, + "flos": 12792650549760.0, + "grad_norm": 2.0123521464099183, + "language_loss": 0.89116049, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.96843767, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13568115, + "step": 6759, + "time_per_iteration": 2.5211679935455322 + }, + { + "auxiliary_loss_clip": 0.06457647, + "auxiliary_loss_mlp": 0.01273439, + "balance_loss_clip": 0.06284226, + "balance_loss_mlp": 0.01259742, + "epoch": 0.40643318803547274, + "flos": 24031327605120.0, + "grad_norm": 2.379594130925159, + "language_loss": 0.64235389, + "learning_rate": 2.688775442076598e-06, + "loss": 0.71966481, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13690186, + "step": 6760, + "time_per_iteration": 2.546807050704956 + }, + { + "auxiliary_loss_clip": 0.0645775, + "auxiliary_loss_mlp": 0.01275543, + "balance_loss_clip": 0.06282319, + "balance_loss_mlp": 0.01260856, + "epoch": 0.4064933112881407, + "flos": 25599361386240.0, + "grad_norm": 1.4617486076979092, + "language_loss": 0.75530171, + "learning_rate": 2.688409791678193e-06, + "loss": 0.83263463, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14666748, + "step": 6761, + "time_per_iteration": 2.635345935821533 + }, + { + "auxiliary_loss_clip": 0.0645279, + "auxiliary_loss_mlp": 0.01275826, + "balance_loss_clip": 0.06285599, + "balance_loss_mlp": 0.01262183, + "epoch": 0.40655343454080867, + "flos": 22060841863680.0, + "grad_norm": 1.3772427401241372, + "language_loss": 0.70268184, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.77996796, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1362915, + "step": 6762, + "time_per_iteration": 2.5381741523742676 + }, + { + "auxiliary_loss_clip": 0.06454535, + "auxiliary_loss_mlp": 0.01269241, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01255532, + "epoch": 0.40661355779347663, + "flos": 26476115783040.0, + "grad_norm": 2.097586218934523, + "language_loss": 0.74072015, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.81795788, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13708496, + "step": 6763, + "time_per_iteration": 2.6068081855773926 + }, + { + "auxiliary_loss_clip": 0.06460483, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01257946, + "epoch": 0.4066736810461446, + "flos": 13266156372480.0, + "grad_norm": 1.6908157420926835, + "language_loss": 0.69497877, + "learning_rate": 2.687312683911033e-06, + "loss": 0.77231026, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14703369, + "step": 6764, + "time_per_iteration": 2.511901378631592 + }, + { + "auxiliary_loss_clip": 0.06461611, + "auxiliary_loss_mlp": 0.01272386, + "balance_loss_clip": 0.06284289, + "balance_loss_mlp": 0.01255995, + "epoch": 0.40673380429881256, + "flos": 28811178639360.0, + "grad_norm": 2.09874166778498, + "language_loss": 0.91354716, + "learning_rate": 2.686946929177557e-06, + "loss": 0.99088717, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16381836, + "step": 6765, + "time_per_iteration": 2.614131450653076 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_clip": 0.06289016, + "balance_loss_mlp": 0.01256959, + "epoch": 0.4067939275514805, + "flos": 12500301254400.0, + "grad_norm": 2.6861779086384945, + "language_loss": 0.7896508, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.86703956, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14599609, + "step": 6766, + "time_per_iteration": 2.5117299556732178 + }, + { + "auxiliary_loss_clip": 0.06462067, + "auxiliary_loss_mlp": 0.01273332, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.01258306, + "epoch": 0.4068540508041485, + "flos": 18776461374720.0, + "grad_norm": 40.22612567694579, + "language_loss": 0.77094513, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.84829921, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15020752, + "step": 6767, + "time_per_iteration": 2.5433967113494873 + }, + { + "auxiliary_loss_clip": 0.06456982, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06286283, + "balance_loss_mlp": 0.01260784, + "epoch": 0.40691417405681646, + "flos": 28520506425600.0, + "grad_norm": 1.6477494711234055, + "language_loss": 0.77846849, + "learning_rate": 2.685849508738034e-06, + "loss": 0.85578549, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1394043, + "step": 6768, + "time_per_iteration": 4.049299478530884 + }, + { + "auxiliary_loss_clip": 0.06460279, + "auxiliary_loss_mlp": 0.0127197, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01258213, + "epoch": 0.4069742973094844, + "flos": 20820390819840.0, + "grad_norm": 1.9557468193178857, + "language_loss": 0.87631512, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.9536376, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13757324, + "step": 6769, + "time_per_iteration": 2.540104389190674 + }, + { + "auxiliary_loss_clip": 0.06461371, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01259504, + "epoch": 0.4070344205621524, + "flos": 21476646397440.0, + "grad_norm": 2.001246026688969, + "language_loss": 0.80859989, + "learning_rate": 2.685117765051156e-06, + "loss": 0.88594282, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13421631, + "step": 6770, + "time_per_iteration": 3.9851884841918945 + }, + { + "auxiliary_loss_clip": 0.06465216, + "auxiliary_loss_mlp": 0.01270985, + "balance_loss_clip": 0.06288273, + "balance_loss_mlp": 0.01256203, + "epoch": 0.4070945438148204, + "flos": 26836709829120.0, + "grad_norm": 1.8007492597774561, + "language_loss": 0.80221689, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.87957895, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14770508, + "step": 6771, + "time_per_iteration": 2.5747835636138916 + }, + { + "auxiliary_loss_clip": 0.06460344, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06287014, + "balance_loss_mlp": 0.01256926, + "epoch": 0.4071546670674884, + "flos": 26360478748800.0, + "grad_norm": 1.364923552922522, + "language_loss": 0.7623316, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.83964121, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13696289, + "step": 6772, + "time_per_iteration": 2.628304958343506 + }, + { + "auxiliary_loss_clip": 0.06461407, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06287165, + "balance_loss_mlp": 0.01255471, + "epoch": 0.40721479032015634, + "flos": 17901300205440.0, + "grad_norm": 1.7629352970283074, + "language_loss": 0.81345379, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.89077097, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1484375, + "step": 6773, + "time_per_iteration": 2.5225751399993896 + }, + { + "auxiliary_loss_clip": 0.06368425, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06289985, + "balance_loss_mlp": 0.01259653, + "epoch": 0.4072749135728243, + "flos": 49871522424960.0, + "grad_norm": 0.8094154348681942, + "language_loss": 0.64365125, + "learning_rate": 2.683653966031597e-06, + "loss": 0.71997166, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03961182, + "step": 6774, + "time_per_iteration": 4.446218967437744 + }, + { + "auxiliary_loss_clip": 0.06460027, + "auxiliary_loss_mlp": 0.01268161, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01254481, + "epoch": 0.40733503682549227, + "flos": 27571063011840.0, + "grad_norm": 1.7398483222375367, + "language_loss": 0.7269184, + "learning_rate": 2.683287951431446e-06, + "loss": 0.80420029, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13659668, + "step": 6775, + "time_per_iteration": 2.599534511566162 + }, + { + "auxiliary_loss_clip": 0.0645956, + "auxiliary_loss_mlp": 0.01271281, + "balance_loss_clip": 0.06285449, + "balance_loss_mlp": 0.01257328, + "epoch": 0.40739516007816023, + "flos": 22133447026560.0, + "grad_norm": 1.36694346344043, + "language_loss": 0.78053248, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.8578409, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13946533, + "step": 6776, + "time_per_iteration": 2.6111807823181152 + }, + { + "auxiliary_loss_clip": 0.06466034, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06288318, + "balance_loss_mlp": 0.01254358, + "epoch": 0.4074552833308282, + "flos": 23849080974720.0, + "grad_norm": 2.6992343713036933, + "language_loss": 0.79444098, + "learning_rate": 2.682555844513981e-06, + "loss": 0.87178552, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14056396, + "step": 6777, + "time_per_iteration": 2.6968321800231934 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01254556, + "balance_loss_clip": 0.06276868, + "balance_loss_mlp": 0.01251499, + "epoch": 0.40751540658349616, + "flos": 58019847120000.0, + "grad_norm": 0.6740608536307336, + "language_loss": 0.53006828, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.60617012, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.0305481, + "step": 6778, + "time_per_iteration": 4.5793616771698 + }, + { + "auxiliary_loss_clip": 0.0645799, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01257996, + "epoch": 0.40757552983616413, + "flos": 21220956063360.0, + "grad_norm": 2.166644010842874, + "language_loss": 0.8325671, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.90987039, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14349365, + "step": 6779, + "time_per_iteration": 2.5122289657592773 + }, + { + "auxiliary_loss_clip": 0.06459656, + "auxiliary_loss_mlp": 0.01270176, + "balance_loss_clip": 0.06286415, + "balance_loss_mlp": 0.01255752, + "epoch": 0.4076356530888321, + "flos": 26840776752000.0, + "grad_norm": 1.555798351548063, + "language_loss": 0.76392281, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.84122109, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14440918, + "step": 6780, + "time_per_iteration": 2.5635926723480225 + }, + { + "auxiliary_loss_clip": 0.06453321, + "auxiliary_loss_mlp": 0.01268481, + "balance_loss_clip": 0.06285319, + "balance_loss_mlp": 0.01255964, + "epoch": 0.40769577634150006, + "flos": 12207868104960.0, + "grad_norm": 2.3318684771465388, + "language_loss": 0.66762495, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.74484301, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.12512207, + "step": 6781, + "time_per_iteration": 2.4998953342437744 + }, + { + "auxiliary_loss_clip": 0.06457075, + "auxiliary_loss_mlp": 0.01270756, + "balance_loss_clip": 0.06285501, + "balance_loss_mlp": 0.01257005, + "epoch": 0.407755899594168, + "flos": 33663467128320.0, + "grad_norm": 1.4801990709986605, + "language_loss": 0.71833825, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.79561651, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13757324, + "step": 6782, + "time_per_iteration": 2.6407761573791504 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01265619, + "balance_loss_clip": 0.06282325, + "balance_loss_mlp": 0.01252804, + "epoch": 0.407816022846836, + "flos": 20163590190720.0, + "grad_norm": 1.6531823939859909, + "language_loss": 0.82546687, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.90268028, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.12823486, + "step": 6783, + "time_per_iteration": 2.521007776260376 + }, + { + "auxiliary_loss_clip": 0.06456424, + "auxiliary_loss_mlp": 0.01272041, + "balance_loss_clip": 0.06284439, + "balance_loss_mlp": 0.01258504, + "epoch": 0.40787614609950396, + "flos": 21185219496960.0, + "grad_norm": 3.105146861858365, + "language_loss": 0.80980694, + "learning_rate": 2.679992655730283e-06, + "loss": 0.88709158, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13549805, + "step": 6784, + "time_per_iteration": 2.555502414703369 + }, + { + "auxiliary_loss_clip": 0.06462008, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.01254888, + "epoch": 0.407936269352172, + "flos": 20526699859200.0, + "grad_norm": 1.8248584482375538, + "language_loss": 0.65994555, + "learning_rate": 2.679626382651386e-06, + "loss": 0.73727089, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15661621, + "step": 6785, + "time_per_iteration": 2.5122246742248535 + }, + { + "auxiliary_loss_clip": 0.06453374, + "auxiliary_loss_mlp": 0.01270477, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01256505, + "epoch": 0.40799639260483994, + "flos": 20124709096320.0, + "grad_norm": 2.5052548980669487, + "language_loss": 0.80350053, + "learning_rate": 2.679260083800989e-06, + "loss": 0.88073903, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13970947, + "step": 6786, + "time_per_iteration": 2.554553985595703 + }, + { + "auxiliary_loss_clip": 0.0645851, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286281, + "balance_loss_mlp": 0.01258874, + "epoch": 0.4080565158575079, + "flos": 21003853334400.0, + "grad_norm": 1.5530341827396597, + "language_loss": 0.81621969, + "learning_rate": 2.678893759192982e-06, + "loss": 0.89353013, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13665771, + "step": 6787, + "time_per_iteration": 2.536215305328369 + }, + { + "auxiliary_loss_clip": 0.06458452, + "auxiliary_loss_mlp": 0.01268932, + "balance_loss_clip": 0.0628721, + "balance_loss_mlp": 0.01255623, + "epoch": 0.40811663911017587, + "flos": 19323746317440.0, + "grad_norm": 1.9049170263972377, + "language_loss": 0.6798445, + "learning_rate": 2.678527408841255e-06, + "loss": 0.75711828, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13323975, + "step": 6788, + "time_per_iteration": 2.533457040786743 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272482, + "balance_loss_clip": 0.06284444, + "balance_loss_mlp": 0.01258952, + "epoch": 0.40817676236284384, + "flos": 40634973555840.0, + "grad_norm": 1.8916550457168047, + "language_loss": 0.66478348, + "learning_rate": 2.678161032759701e-06, + "loss": 0.74207389, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.13537598, + "step": 6789, + "time_per_iteration": 2.726292371749878 + }, + { + "auxiliary_loss_clip": 0.06456382, + "auxiliary_loss_mlp": 0.01270282, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01256383, + "epoch": 0.4082368856155118, + "flos": 20528376940800.0, + "grad_norm": 1.5670896359254076, + "language_loss": 0.61192298, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.68918967, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13885498, + "step": 6790, + "time_per_iteration": 2.5437731742858887 + }, + { + "auxiliary_loss_clip": 0.06455828, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.062863, + "balance_loss_mlp": 0.01257928, + "epoch": 0.40829700886817977, + "flos": 11430944248320.0, + "grad_norm": 3.0698605132878076, + "language_loss": 0.69964224, + "learning_rate": 2.677428203462683e-06, + "loss": 0.77691442, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13452148, + "step": 6791, + "time_per_iteration": 2.4941210746765137 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01262815, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01259486, + "epoch": 0.40835713212084773, + "flos": 67350455326080.0, + "grad_norm": 0.7295736549212738, + "language_loss": 0.59295797, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.66914248, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03335571, + "step": 6792, + "time_per_iteration": 3.153479814529419 + }, + { + "auxiliary_loss_clip": 0.06459208, + "auxiliary_loss_mlp": 0.01270498, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01256193, + "epoch": 0.4084172553735157, + "flos": 21768408714240.0, + "grad_norm": 1.6689878199369865, + "language_loss": 0.80186534, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.87916243, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14306641, + "step": 6793, + "time_per_iteration": 2.562311887741089 + }, + { + "auxiliary_loss_clip": 0.06458702, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.06283591, + "balance_loss_mlp": 0.01258237, + "epoch": 0.40847737862618366, + "flos": 27424594874880.0, + "grad_norm": 3.9059129474249, + "language_loss": 0.85597503, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.93328679, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14227295, + "step": 6794, + "time_per_iteration": 2.558554172515869 + }, + { + "auxiliary_loss_clip": 0.06457786, + "auxiliary_loss_mlp": 0.01274296, + "balance_loss_clip": 0.0628652, + "balance_loss_mlp": 0.01259991, + "epoch": 0.4085375018788516, + "flos": 18593040787200.0, + "grad_norm": 1.7852935587618148, + "language_loss": 0.80216181, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.87948263, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14306641, + "step": 6795, + "time_per_iteration": 2.540349006652832 + }, + { + "auxiliary_loss_clip": 0.06465046, + "auxiliary_loss_mlp": 0.01270762, + "balance_loss_clip": 0.0628596, + "balance_loss_mlp": 0.01255718, + "epoch": 0.4085976251315196, + "flos": 15416834319360.0, + "grad_norm": 2.647671549267762, + "language_loss": 0.70204669, + "learning_rate": 2.675595680920792e-06, + "loss": 0.77940476, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15057373, + "step": 6796, + "time_per_iteration": 2.483670711517334 + }, + { + "auxiliary_loss_clip": 0.06458762, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06285367, + "balance_loss_mlp": 0.01256558, + "epoch": 0.40865774838418756, + "flos": 21258705127680.0, + "grad_norm": 1.5727118215642113, + "language_loss": 0.78255171, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.85983676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13189697, + "step": 6797, + "time_per_iteration": 2.580595016479492 + }, + { + "auxiliary_loss_clip": 0.06459324, + "auxiliary_loss_mlp": 0.01274053, + "balance_loss_clip": 0.06286809, + "balance_loss_mlp": 0.01260183, + "epoch": 0.4087178716368556, + "flos": 13777411259520.0, + "grad_norm": 1.8045279385790254, + "language_loss": 0.86005986, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.93739361, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13885498, + "step": 6798, + "time_per_iteration": 2.525223970413208 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271081, + "balance_loss_clip": 0.06287363, + "balance_loss_mlp": 0.01258308, + "epoch": 0.40877799488952354, + "flos": 23628288666240.0, + "grad_norm": 1.532136532380416, + "language_loss": 0.84202659, + "learning_rate": 2.674495859860601e-06, + "loss": 0.91931903, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.12774658, + "step": 6799, + "time_per_iteration": 2.5898637771606445 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06284514, + "balance_loss_mlp": 0.01256695, + "epoch": 0.4088381181421915, + "flos": 20924372136960.0, + "grad_norm": 3.2861641598601516, + "language_loss": 0.83725351, + "learning_rate": 2.6741292016681e-06, + "loss": 0.91453052, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14129639, + "step": 6800, + "time_per_iteration": 2.5050573348999023 + }, + { + "auxiliary_loss_clip": 0.06460495, + "auxiliary_loss_mlp": 0.0127488, + "balance_loss_clip": 0.06284706, + "balance_loss_mlp": 0.01260324, + "epoch": 0.4088982413948595, + "flos": 13302605698560.0, + "grad_norm": 2.1402246624759225, + "language_loss": 0.74944514, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.82679886, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14532471, + "step": 6801, + "time_per_iteration": 2.546226978302002 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06286253, + "balance_loss_mlp": 0.01256358, + "epoch": 0.40895836464752744, + "flos": 15273007585920.0, + "grad_norm": 2.8712837575861316, + "language_loss": 0.80348778, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8807894, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6802, + "time_per_iteration": 2.4804327487945557 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01256813, + "epoch": 0.4090184879001954, + "flos": 14506607416320.0, + "grad_norm": 2.1610413406346147, + "language_loss": 0.7616486, + "learning_rate": 2.673029073767934e-06, + "loss": 0.83899677, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14660645, + "step": 6803, + "time_per_iteration": 2.5792553424835205 + }, + { + "auxiliary_loss_clip": 0.06459032, + "auxiliary_loss_mlp": 0.01268618, + "balance_loss_clip": 0.06286538, + "balance_loss_mlp": 0.01255017, + "epoch": 0.40907861115286337, + "flos": 13886759237760.0, + "grad_norm": 1.7652651103072021, + "language_loss": 0.79160619, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.86888266, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.1361084, + "step": 6804, + "time_per_iteration": 2.489569902420044 + }, + { + "auxiliary_loss_clip": 0.06464031, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.0125919, + "epoch": 0.40913873440553133, + "flos": 28045071959040.0, + "grad_norm": 1.8644340771163777, + "language_loss": 0.75315928, + "learning_rate": 2.672295527537998e-06, + "loss": 0.83053064, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13909912, + "step": 6805, + "time_per_iteration": 2.6142778396606445 + }, + { + "auxiliary_loss_clip": 0.06465782, + "auxiliary_loss_mlp": 0.01272786, + "balance_loss_clip": 0.06288569, + "balance_loss_mlp": 0.01257957, + "epoch": 0.4091988576581993, + "flos": 21624917397120.0, + "grad_norm": 1.7712960163929097, + "language_loss": 0.7965951, + "learning_rate": 2.671928716175804e-06, + "loss": 0.87398076, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14825439, + "step": 6806, + "time_per_iteration": 2.567579984664917 + }, + { + "auxiliary_loss_clip": 0.06464592, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01254609, + "epoch": 0.40925898091086726, + "flos": 25230381932160.0, + "grad_norm": 1.8487150493759184, + "language_loss": 0.725999, + "learning_rate": 2.671561879334007e-06, + "loss": 0.80333263, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14147949, + "step": 6807, + "time_per_iteration": 4.0469160079956055 + }, + { + "auxiliary_loss_clip": 0.06359696, + "auxiliary_loss_mlp": 0.012552, + "balance_loss_clip": 0.06279803, + "balance_loss_mlp": 0.01251397, + "epoch": 0.40931910416353523, + "flos": 68949697553280.0, + "grad_norm": 0.8076862955861985, + "language_loss": 0.5884732, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.66462219, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.03796387, + "step": 6808, + "time_per_iteration": 3.236466407775879 + }, + { + "auxiliary_loss_clip": 0.0646228, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.0125511, + "epoch": 0.4093792274162032, + "flos": 20195092126080.0, + "grad_norm": 2.068974912031903, + "language_loss": 0.54879391, + "learning_rate": 2.670828129267242e-06, + "loss": 0.62610114, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13342285, + "step": 6809, + "time_per_iteration": 4.028552055358887 + }, + { + "auxiliary_loss_clip": 0.06460767, + "auxiliary_loss_mlp": 0.01271891, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.0125805, + "epoch": 0.40943935066887116, + "flos": 25235832447360.0, + "grad_norm": 1.6877735836202645, + "language_loss": 0.83297133, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.91029787, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13830566, + "step": 6810, + "time_per_iteration": 2.5688657760620117 + }, + { + "auxiliary_loss_clip": 0.06467541, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 0.06291755, + "balance_loss_mlp": 0.01260376, + "epoch": 0.4094994739215392, + "flos": 23261531345280.0, + "grad_norm": 2.1410482965152475, + "language_loss": 0.78002244, + "learning_rate": 2.670094277448999e-06, + "loss": 0.85744703, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14526367, + "step": 6811, + "time_per_iteration": 2.5859668254852295 + }, + { + "auxiliary_loss_clip": 0.06461761, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06286068, + "balance_loss_mlp": 0.01255705, + "epoch": 0.40955959717420715, + "flos": 17387571623040.0, + "grad_norm": 1.532323288412775, + "language_loss": 0.70159924, + "learning_rate": 2.669727313417857e-06, + "loss": 0.77892125, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.1472168, + "step": 6812, + "time_per_iteration": 2.5128583908081055 + }, + { + "auxiliary_loss_clip": 0.06459609, + "auxiliary_loss_mlp": 0.01271673, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01257689, + "epoch": 0.4096197204268751, + "flos": 25089406237440.0, + "grad_norm": 1.5016829758663763, + "language_loss": 0.6657182, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13989258, + "step": 6813, + "time_per_iteration": 4.086791515350342 + }, + { + "auxiliary_loss_clip": 0.06457571, + "auxiliary_loss_mlp": 0.01273443, + "balance_loss_clip": 0.06284814, + "balance_loss_mlp": 0.01259186, + "epoch": 0.4096798436795431, + "flos": 30593841454080.0, + "grad_norm": 3.468085127477164, + "language_loss": 0.74528515, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.82259536, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14282227, + "step": 6814, + "time_per_iteration": 2.6079764366149902 + }, + { + "auxiliary_loss_clip": 0.06469103, + "auxiliary_loss_mlp": 0.0126922, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01254927, + "epoch": 0.40973996693221104, + "flos": 24140424021120.0, + "grad_norm": 2.1723549744151573, + "language_loss": 0.66418713, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.74157035, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14276123, + "step": 6815, + "time_per_iteration": 2.574538469314575 + }, + { + "auxiliary_loss_clip": 0.06459038, + "auxiliary_loss_mlp": 0.01277533, + "balance_loss_clip": 0.06290913, + "balance_loss_mlp": 0.01264116, + "epoch": 0.409800090184879, + "flos": 23995968382080.0, + "grad_norm": 1.5545179592453178, + "language_loss": 0.76523387, + "learning_rate": 2.668259203471188e-06, + "loss": 0.84259957, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.13433838, + "step": 6816, + "time_per_iteration": 2.5691564083099365 + }, + { + "auxiliary_loss_clip": 0.06462897, + "auxiliary_loss_mlp": 0.01272633, + "balance_loss_clip": 0.06288977, + "balance_loss_mlp": 0.01258834, + "epoch": 0.40986021343754697, + "flos": 16149216931200.0, + "grad_norm": 2.0573498340626957, + "language_loss": 0.82244468, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8998, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13812256, + "step": 6817, + "time_per_iteration": 3.992452621459961 + }, + { + "auxiliary_loss_clip": 0.06471414, + "auxiliary_loss_mlp": 0.0127126, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.0125556, + "epoch": 0.40992033669021494, + "flos": 24797811628800.0, + "grad_norm": 1.5933135055943601, + "language_loss": 0.80022383, + "learning_rate": 2.667524996399444e-06, + "loss": 0.87765062, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15698242, + "step": 6818, + "time_per_iteration": 2.6226916313171387 + }, + { + "auxiliary_loss_clip": 0.06458658, + "auxiliary_loss_mlp": 0.01265615, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01252609, + "epoch": 0.4099804599428829, + "flos": 29649429285120.0, + "grad_norm": 1.5014418509343528, + "language_loss": 0.66358954, + "learning_rate": 2.66715785488769e-06, + "loss": 0.74083227, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13006592, + "step": 6819, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.06472912, + "auxiliary_loss_mlp": 0.01275099, + "balance_loss_clip": 0.06290931, + "balance_loss_mlp": 0.01259566, + "epoch": 0.41004058319555087, + "flos": 24833464341120.0, + "grad_norm": 1.4779477588129932, + "language_loss": 0.85265613, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.9301362, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15527344, + "step": 6820, + "time_per_iteration": 2.5997445583343506 + }, + { + "auxiliary_loss_clip": 0.06459977, + "auxiliary_loss_mlp": 0.01274929, + "balance_loss_clip": 0.06289133, + "balance_loss_mlp": 0.01261571, + "epoch": 0.41010070644821883, + "flos": 25744278222720.0, + "grad_norm": 1.6716831778372079, + "language_loss": 0.71520668, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.79255575, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13360596, + "step": 6821, + "time_per_iteration": 2.5686511993408203 + }, + { + "auxiliary_loss_clip": 0.06462038, + "auxiliary_loss_mlp": 0.01275085, + "balance_loss_clip": 0.06288444, + "balance_loss_mlp": 0.01262037, + "epoch": 0.4101608297008868, + "flos": 22352604180480.0, + "grad_norm": 1.920651769082741, + "language_loss": 0.74875939, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.82613057, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13049316, + "step": 6822, + "time_per_iteration": 2.5453121662139893 + }, + { + "auxiliary_loss_clip": 0.0646743, + "auxiliary_loss_mlp": 0.01270606, + "balance_loss_clip": 0.06293608, + "balance_loss_mlp": 0.01256408, + "epoch": 0.41022095295355476, + "flos": 21951619666560.0, + "grad_norm": 2.1329933375936045, + "language_loss": 0.75859648, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.83597684, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14208984, + "step": 6823, + "time_per_iteration": 2.514934539794922 + }, + { + "auxiliary_loss_clip": 0.06469562, + "auxiliary_loss_mlp": 0.01272535, + "balance_loss_clip": 0.06289219, + "balance_loss_mlp": 0.01257276, + "epoch": 0.4102810762062228, + "flos": 27457312694400.0, + "grad_norm": 5.1897859223278004, + "language_loss": 0.74005461, + "learning_rate": 2.665321768127001e-06, + "loss": 0.81747556, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15258789, + "step": 6824, + "time_per_iteration": 2.645362615585327 + }, + { + "auxiliary_loss_clip": 0.06472579, + "auxiliary_loss_mlp": 0.01268406, + "balance_loss_clip": 0.06292652, + "balance_loss_mlp": 0.01253589, + "epoch": 0.41034119945889075, + "flos": 24506258947200.0, + "grad_norm": 2.0548664701913215, + "language_loss": 0.72348672, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.80089658, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14788818, + "step": 6825, + "time_per_iteration": 2.5779926776885986 + }, + { + "auxiliary_loss_clip": 0.0646458, + "auxiliary_loss_mlp": 0.01269358, + "balance_loss_clip": 0.06292018, + "balance_loss_mlp": 0.01255822, + "epoch": 0.4104013227115587, + "flos": 24359497320960.0, + "grad_norm": 2.1141131447671, + "language_loss": 0.85571408, + "learning_rate": 2.664587156721768e-06, + "loss": 0.93305349, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13537598, + "step": 6826, + "time_per_iteration": 2.556445598602295 + }, + { + "auxiliary_loss_clip": 0.06462094, + "auxiliary_loss_mlp": 0.01278764, + "balance_loss_clip": 0.0629297, + "balance_loss_mlp": 0.0126468, + "epoch": 0.4104614459642267, + "flos": 23735582219520.0, + "grad_norm": 2.6430290167775037, + "language_loss": 0.6714378, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.74884635, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14080811, + "step": 6827, + "time_per_iteration": 2.55556058883667 + }, + { + "auxiliary_loss_clip": 0.06463977, + "auxiliary_loss_mlp": 0.01267684, + "balance_loss_clip": 0.06292337, + "balance_loss_mlp": 0.01254017, + "epoch": 0.41052156921689464, + "flos": 22134620983680.0, + "grad_norm": 1.346138162541555, + "language_loss": 0.72310138, + "learning_rate": 2.663852444511689e-06, + "loss": 0.80041802, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13665771, + "step": 6828, + "time_per_iteration": 2.6050894260406494 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01275424, + "balance_loss_clip": 0.06296174, + "balance_loss_mlp": 0.01259855, + "epoch": 0.4105816924695626, + "flos": 20090607684480.0, + "grad_norm": 2.1527229818824196, + "language_loss": 0.84003794, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.91756219, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15588379, + "step": 6829, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.06466494, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.0629379, + "balance_loss_mlp": 0.01259789, + "epoch": 0.4106418157222306, + "flos": 18082540586880.0, + "grad_norm": 1.474811924806309, + "language_loss": 0.90568459, + "learning_rate": 2.663117631608206e-06, + "loss": 0.98308516, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13781738, + "step": 6830, + "time_per_iteration": 2.5749125480651855 + }, + { + "auxiliary_loss_clip": 0.06471005, + "auxiliary_loss_mlp": 0.01271813, + "balance_loss_clip": 0.06296638, + "balance_loss_mlp": 0.01257729, + "epoch": 0.41070193897489854, + "flos": 21653442512640.0, + "grad_norm": 1.8339460976388509, + "language_loss": 0.6606307, + "learning_rate": 2.662750187431268e-06, + "loss": 0.73805887, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14080811, + "step": 6831, + "time_per_iteration": 2.5448153018951416 + }, + { + "auxiliary_loss_clip": 0.06473927, + "auxiliary_loss_mlp": 0.01269964, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256613, + "epoch": 0.4107620622275665, + "flos": 26654924396160.0, + "grad_norm": 2.1106075691496766, + "language_loss": 0.69853723, + "learning_rate": 2.662382718122776e-06, + "loss": 0.77597612, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13360596, + "step": 6832, + "time_per_iteration": 2.61200213432312 + }, + { + "auxiliary_loss_clip": 0.06467804, + "auxiliary_loss_mlp": 0.01274675, + "balance_loss_clip": 0.06296351, + "balance_loss_mlp": 0.01261586, + "epoch": 0.41082218548023447, + "flos": 18740305537920.0, + "grad_norm": 3.2749058883058177, + "language_loss": 0.73955101, + "learning_rate": 2.662015223696666e-06, + "loss": 0.81697583, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13092041, + "step": 6833, + "time_per_iteration": 2.5293643474578857 + }, + { + "auxiliary_loss_clip": 0.06477401, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06301869, + "balance_loss_mlp": 0.01256334, + "epoch": 0.41088230873290243, + "flos": 22900476101760.0, + "grad_norm": 1.6362019789175348, + "language_loss": 0.72870773, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.80619049, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14532471, + "step": 6834, + "time_per_iteration": 2.5534543991088867 + }, + { + "auxiliary_loss_clip": 0.06479818, + "auxiliary_loss_mlp": 0.01271417, + "balance_loss_clip": 0.0630189, + "balance_loss_mlp": 0.01257601, + "epoch": 0.4109424319855704, + "flos": 24283370286720.0, + "grad_norm": 2.482567827780577, + "language_loss": 0.71274042, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7902528, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.13824463, + "step": 6835, + "time_per_iteration": 2.6012609004974365 + }, + { + "auxiliary_loss_clip": 0.06481166, + "auxiliary_loss_mlp": 0.012697, + "balance_loss_clip": 0.06306168, + "balance_loss_mlp": 0.01255318, + "epoch": 0.41100255523823837, + "flos": 12974100566400.0, + "grad_norm": 1.7690004377507398, + "language_loss": 0.87590879, + "learning_rate": 2.660912589851978e-06, + "loss": 0.95341742, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14373779, + "step": 6836, + "time_per_iteration": 2.5210461616516113 + }, + { + "auxiliary_loss_clip": 0.06475058, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 0.06304475, + "balance_loss_mlp": 0.01259937, + "epoch": 0.4110626784909064, + "flos": 23151806023680.0, + "grad_norm": 1.7062413123689164, + "language_loss": 0.69134921, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.76883554, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13641357, + "step": 6837, + "time_per_iteration": 2.58320689201355 + }, + { + "auxiliary_loss_clip": 0.06479225, + "auxiliary_loss_mlp": 0.01273179, + "balance_loss_clip": 0.06301909, + "balance_loss_mlp": 0.01258248, + "epoch": 0.41112280174357435, + "flos": 22754007964800.0, + "grad_norm": 1.9797600155486905, + "language_loss": 0.7565136, + "learning_rate": 2.660177375289599e-06, + "loss": 0.83403766, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1494751, + "step": 6838, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.06478335, + "auxiliary_loss_mlp": 0.01273659, + "balance_loss_clip": 0.06305958, + "balance_loss_mlp": 0.01259318, + "epoch": 0.4111829249962423, + "flos": 21108211994880.0, + "grad_norm": 2.0771476339041635, + "language_loss": 0.82403398, + "learning_rate": 2.659809730450451e-06, + "loss": 0.90155393, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14343262, + "step": 6839, + "time_per_iteration": 2.596498489379883 + }, + { + "auxiliary_loss_clip": 0.06477809, + "auxiliary_loss_mlp": 0.01273131, + "balance_loss_clip": 0.06305793, + "balance_loss_mlp": 0.01259404, + "epoch": 0.4112430482489103, + "flos": 21512005620480.0, + "grad_norm": 1.908617135949294, + "language_loss": 0.8080616, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.885571, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13726807, + "step": 6840, + "time_per_iteration": 2.575131893157959 + }, + { + "auxiliary_loss_clip": 0.06480156, + "auxiliary_loss_mlp": 0.01275329, + "balance_loss_clip": 0.06307412, + "balance_loss_mlp": 0.01262639, + "epoch": 0.41130317150157825, + "flos": 19575579363840.0, + "grad_norm": 1.874526459917051, + "language_loss": 0.67950094, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.75705582, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12701416, + "step": 6841, + "time_per_iteration": 2.5642948150634766 + }, + { + "auxiliary_loss_clip": 0.06386833, + "auxiliary_loss_mlp": 0.01258898, + "balance_loss_clip": 0.06308911, + "balance_loss_mlp": 0.01256092, + "epoch": 0.4113632947542462, + "flos": 62404541498880.0, + "grad_norm": 0.7544179812034518, + "language_loss": 0.59557825, + "learning_rate": 2.65870664586847e-06, + "loss": 0.67203557, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 6842, + "time_per_iteration": 3.2257192134857178 + }, + { + "auxiliary_loss_clip": 0.06472278, + "auxiliary_loss_mlp": 0.01271531, + "balance_loss_clip": 0.06304677, + "balance_loss_mlp": 0.01257977, + "epoch": 0.4114234180069142, + "flos": 13923879396480.0, + "grad_norm": 2.0142050293437803, + "language_loss": 0.70280814, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.78024626, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13562012, + "step": 6843, + "time_per_iteration": 2.565969944000244 + }, + { + "auxiliary_loss_clip": 0.06380486, + "auxiliary_loss_mlp": 0.01256868, + "balance_loss_clip": 0.06302112, + "balance_loss_mlp": 0.01253599, + "epoch": 0.41148354125958214, + "flos": 64948866727680.0, + "grad_norm": 0.7130365683812196, + "language_loss": 0.53645009, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.61282361, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.03274536, + "step": 6844, + "time_per_iteration": 3.16054105758667 + }, + { + "auxiliary_loss_clip": 0.06475421, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06304798, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4115436645122501, + "flos": 18733848773760.0, + "grad_norm": 1.6055019254999645, + "language_loss": 0.66105658, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.73847538, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13317871, + "step": 6845, + "time_per_iteration": 2.5785298347473145 + }, + { + "auxiliary_loss_clip": 0.06478415, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06307876, + "balance_loss_mlp": 0.01254176, + "epoch": 0.41160378776491807, + "flos": 16258439128320.0, + "grad_norm": 2.0979946916750594, + "language_loss": 0.70201457, + "learning_rate": 2.657235516795808e-06, + "loss": 0.77947497, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13446045, + "step": 6846, + "time_per_iteration": 2.510215997695923 + }, + { + "auxiliary_loss_clip": 0.06481081, + "auxiliary_loss_mlp": 0.01271315, + "balance_loss_clip": 0.06309364, + "balance_loss_mlp": 0.01257391, + "epoch": 0.41166391101758604, + "flos": 27978378508800.0, + "grad_norm": 1.4002739744354715, + "language_loss": 0.65459704, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.73212105, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6847, + "time_per_iteration": 4.048614025115967 + }, + { + "auxiliary_loss_clip": 0.06476664, + "auxiliary_loss_mlp": 0.01270454, + "balance_loss_clip": 0.06304531, + "balance_loss_mlp": 0.01256459, + "epoch": 0.411724034270254, + "flos": 34139865916800.0, + "grad_norm": 1.3666484547506623, + "language_loss": 0.7086308, + "learning_rate": 2.656499802669069e-06, + "loss": 0.78610194, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13983154, + "step": 6848, + "time_per_iteration": 4.219269037246704 + }, + { + "auxiliary_loss_clip": 0.06375948, + "auxiliary_loss_mlp": 0.01253417, + "balance_loss_clip": 0.06298448, + "balance_loss_mlp": 0.01250777, + "epoch": 0.41178415752292197, + "flos": 67945090625280.0, + "grad_norm": 0.8791919044020794, + "language_loss": 0.56300032, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.63929397, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02642822, + "step": 6849, + "time_per_iteration": 3.226757287979126 + }, + { + "auxiliary_loss_clip": 0.06472921, + "auxiliary_loss_mlp": 0.0127066, + "balance_loss_clip": 0.06303038, + "balance_loss_mlp": 0.0125707, + "epoch": 0.41184428077558993, + "flos": 34322573744640.0, + "grad_norm": 1.830210581648694, + "language_loss": 0.76533353, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.84276927, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13598633, + "step": 6850, + "time_per_iteration": 2.653665542602539 + }, + { + "auxiliary_loss_clip": 0.06475841, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06303935, + "balance_loss_mlp": 0.0125484, + "epoch": 0.41190440402825795, + "flos": 35452796342400.0, + "grad_norm": 1.6037978840830116, + "language_loss": 0.68379039, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.76123631, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13909912, + "step": 6851, + "time_per_iteration": 2.72273588180542 + }, + { + "auxiliary_loss_clip": 0.06482952, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06306773, + "balance_loss_mlp": 0.01256437, + "epoch": 0.4119645272809259, + "flos": 20856127386240.0, + "grad_norm": 2.4937650031840275, + "language_loss": 0.80344605, + "learning_rate": 2.655028075792743e-06, + "loss": 0.88100129, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.16162109, + "step": 6852, + "time_per_iteration": 2.563422679901123 + }, + { + "auxiliary_loss_clip": 0.06490047, + "auxiliary_loss_mlp": 0.01270823, + "balance_loss_clip": 0.06310906, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4120246505335939, + "flos": 27569218222080.0, + "grad_norm": 2.025784739879877, + "language_loss": 0.77943873, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.8570475, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14538574, + "step": 6853, + "time_per_iteration": 4.108957290649414 + }, + { + "auxiliary_loss_clip": 0.06493531, + "auxiliary_loss_mlp": 0.0127083, + "balance_loss_clip": 0.06310283, + "balance_loss_mlp": 0.01254618, + "epoch": 0.41208477378626185, + "flos": 37824476232960.0, + "grad_norm": 1.7138113243533049, + "language_loss": 0.66213286, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.73977649, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16223145, + "step": 6854, + "time_per_iteration": 2.706514596939087 + }, + { + "auxiliary_loss_clip": 0.06481706, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06308492, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4121448970389298, + "flos": 23447509482240.0, + "grad_norm": 1.8819465084993465, + "language_loss": 0.83935457, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.9168666, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 6855, + "time_per_iteration": 2.6131205558776855 + }, + { + "auxiliary_loss_clip": 0.06478727, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06308559, + "balance_loss_mlp": 0.01258524, + "epoch": 0.4122050202915978, + "flos": 21331813415040.0, + "grad_norm": 1.6556690578140216, + "language_loss": 0.79642534, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.87393928, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14129639, + "step": 6856, + "time_per_iteration": 2.6186776161193848 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06312534, + "balance_loss_mlp": 0.01257383, + "epoch": 0.41226514354426574, + "flos": 17311193026560.0, + "grad_norm": 2.5768867092656516, + "language_loss": 0.80543911, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.88301665, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13739014, + "step": 6857, + "time_per_iteration": 4.0222320556640625 + }, + { + "auxiliary_loss_clip": 0.06484015, + "auxiliary_loss_mlp": 0.01273092, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01259168, + "epoch": 0.4123252667969337, + "flos": 17644519768320.0, + "grad_norm": 1.8891533513627916, + "language_loss": 0.71074593, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.78831697, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13934326, + "step": 6858, + "time_per_iteration": 2.598215341567993 + }, + { + "auxiliary_loss_clip": 0.06484012, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06310833, + "balance_loss_mlp": 0.01256109, + "epoch": 0.4123853900496017, + "flos": 46435070304000.0, + "grad_norm": 1.791293678645808, + "language_loss": 0.59712768, + "learning_rate": 2.652451598005391e-06, + "loss": 0.67467248, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14361572, + "step": 6859, + "time_per_iteration": 2.818535804748535 + }, + { + "auxiliary_loss_clip": 0.0648525, + "auxiliary_loss_mlp": 0.01269281, + "balance_loss_clip": 0.06306802, + "balance_loss_mlp": 0.01255423, + "epoch": 0.41244551330226964, + "flos": 17680801386240.0, + "grad_norm": 3.190643468711074, + "language_loss": 0.73818636, + "learning_rate": 2.652083430674264e-06, + "loss": 0.81573164, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.13861084, + "step": 6860, + "time_per_iteration": 2.559460163116455 + }, + { + "auxiliary_loss_clip": 0.06473921, + "auxiliary_loss_mlp": 0.01270813, + "balance_loss_clip": 0.06301314, + "balance_loss_mlp": 0.01257706, + "epoch": 0.4125056365549376, + "flos": 18699034602240.0, + "grad_norm": 1.5713730110506565, + "language_loss": 0.74087375, + "learning_rate": 2.651715238616068e-06, + "loss": 0.81832111, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13110352, + "step": 6861, + "time_per_iteration": 2.563107967376709 + }, + { + "auxiliary_loss_clip": 0.06476536, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06306636, + "balance_loss_mlp": 0.01253425, + "epoch": 0.41256575980760557, + "flos": 17901174424320.0, + "grad_norm": 2.040837827964215, + "language_loss": 0.8021872, + "learning_rate": 2.651347021844765e-06, + "loss": 0.87962043, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13354492, + "step": 6862, + "time_per_iteration": 2.4968619346618652 + }, + { + "auxiliary_loss_clip": 0.06481781, + "auxiliary_loss_mlp": 0.01269578, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01255881, + "epoch": 0.41262588306027354, + "flos": 21987817430400.0, + "grad_norm": 2.204342418200638, + "language_loss": 0.767263, + "learning_rate": 2.650978780374318e-06, + "loss": 0.84477663, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13708496, + "step": 6863, + "time_per_iteration": 2.5787971019744873 + }, + { + "auxiliary_loss_clip": 0.06377177, + "auxiliary_loss_mlp": 0.01254592, + "balance_loss_clip": 0.06300335, + "balance_loss_mlp": 0.01252135, + "epoch": 0.41268600631294156, + "flos": 53366339243520.0, + "grad_norm": 0.6821216328900507, + "language_loss": 0.52583742, + "learning_rate": 2.650610514218691e-06, + "loss": 0.60215503, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.02455139, + "step": 6864, + "time_per_iteration": 3.1086013317108154 + }, + { + "auxiliary_loss_clip": 0.06480177, + "auxiliary_loss_mlp": 0.01271204, + "balance_loss_clip": 0.06300756, + "balance_loss_mlp": 0.01256714, + "epoch": 0.4127461295656095, + "flos": 24391586234880.0, + "grad_norm": 1.7134572277425464, + "language_loss": 0.72468507, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.80219889, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14489746, + "step": 6865, + "time_per_iteration": 2.6081020832061768 + }, + { + "auxiliary_loss_clip": 0.06375298, + "auxiliary_loss_mlp": 0.01255641, + "balance_loss_clip": 0.06298722, + "balance_loss_mlp": 0.01252579, + "epoch": 0.4128062528182775, + "flos": 71725129142400.0, + "grad_norm": 0.9099190790692077, + "language_loss": 0.66497219, + "learning_rate": 2.649873907907753e-06, + "loss": 0.74128163, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.03059387, + "step": 6866, + "time_per_iteration": 3.0357213020324707 + }, + { + "auxiliary_loss_clip": 0.06476509, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06301893, + "balance_loss_mlp": 0.01255799, + "epoch": 0.41286637607094545, + "flos": 17853362870400.0, + "grad_norm": 2.1198776843792357, + "language_loss": 0.81617618, + "learning_rate": 2.649505567780375e-06, + "loss": 0.89363438, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13500977, + "step": 6867, + "time_per_iteration": 2.6095240116119385 + }, + { + "auxiliary_loss_clip": 0.06482062, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06303717, + "balance_loss_mlp": 0.01256657, + "epoch": 0.4129264993236134, + "flos": 25555407120000.0, + "grad_norm": 2.8405529060711006, + "language_loss": 0.78333044, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.86085904, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14147949, + "step": 6868, + "time_per_iteration": 2.558155059814453 + }, + { + "auxiliary_loss_clip": 0.06374986, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06298015, + "balance_loss_mlp": 0.01251991, + "epoch": 0.4129866225762814, + "flos": 65430730759680.0, + "grad_norm": 0.8212939455862347, + "language_loss": 0.57654673, + "learning_rate": 2.64876881365164e-06, + "loss": 0.65284705, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.03051758, + "step": 6869, + "time_per_iteration": 2.9284112453460693 + }, + { + "auxiliary_loss_clip": 0.06481783, + "auxiliary_loss_mlp": 0.01277222, + "balance_loss_clip": 0.06310707, + "balance_loss_mlp": 0.01263472, + "epoch": 0.41304674582894935, + "flos": 28884622343040.0, + "grad_norm": 2.4401499988028594, + "language_loss": 0.75528967, + "learning_rate": 2.64840039967822e-06, + "loss": 0.83287978, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13763428, + "step": 6870, + "time_per_iteration": 2.6844911575317383 + }, + { + "auxiliary_loss_clip": 0.0647882, + "auxiliary_loss_mlp": 0.01278278, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0126414, + "epoch": 0.4131068690816173, + "flos": 22898379749760.0, + "grad_norm": 1.5575458850844177, + "language_loss": 0.83697838, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.91454935, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14135742, + "step": 6871, + "time_per_iteration": 2.636808156967163 + }, + { + "auxiliary_loss_clip": 0.06479517, + "auxiliary_loss_mlp": 0.0126964, + "balance_loss_clip": 0.06303998, + "balance_loss_mlp": 0.01256033, + "epoch": 0.4131669923342853, + "flos": 26071944814080.0, + "grad_norm": 2.2227773400911732, + "language_loss": 0.69246161, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.76995325, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.1361084, + "step": 6872, + "time_per_iteration": 2.6492373943328857 + }, + { + "auxiliary_loss_clip": 0.06480041, + "auxiliary_loss_mlp": 0.01273197, + "balance_loss_clip": 0.06303592, + "balance_loss_mlp": 0.01259494, + "epoch": 0.41322711558695324, + "flos": 19250554176000.0, + "grad_norm": 1.8563624048188305, + "language_loss": 0.76261687, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.84014916, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13696289, + "step": 6873, + "time_per_iteration": 2.5294342041015625 + }, + { + "auxiliary_loss_clip": 0.06480598, + "auxiliary_loss_mlp": 0.01273623, + "balance_loss_clip": 0.06302338, + "balance_loss_mlp": 0.0125958, + "epoch": 0.4132872388396212, + "flos": 22681067385600.0, + "grad_norm": 1.8281818605346505, + "language_loss": 0.83432305, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.91186529, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14031982, + "step": 6874, + "time_per_iteration": 2.6135475635528564 + }, + { + "auxiliary_loss_clip": 0.06483124, + "auxiliary_loss_mlp": 0.01273525, + "balance_loss_clip": 0.06306563, + "balance_loss_mlp": 0.01258498, + "epoch": 0.4133473620922892, + "flos": 20155246709760.0, + "grad_norm": 1.7886089381127788, + "language_loss": 0.72210878, + "learning_rate": 2.646557961279436e-06, + "loss": 0.79967523, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15020752, + "step": 6875, + "time_per_iteration": 2.535613536834717 + }, + { + "auxiliary_loss_clip": 0.06467389, + "auxiliary_loss_mlp": 0.01270264, + "balance_loss_clip": 0.06301813, + "balance_loss_mlp": 0.01257151, + "epoch": 0.41340748534495714, + "flos": 24249520437120.0, + "grad_norm": 1.4522680677637643, + "language_loss": 0.82662565, + "learning_rate": 2.646189399991154e-06, + "loss": 0.90400219, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13098145, + "step": 6876, + "time_per_iteration": 2.631683111190796 + }, + { + "auxiliary_loss_clip": 0.06476636, + "auxiliary_loss_mlp": 0.0126976, + "balance_loss_clip": 0.06298597, + "balance_loss_mlp": 0.01255198, + "epoch": 0.41346760859762516, + "flos": 14397385219200.0, + "grad_norm": 2.4272621941749044, + "language_loss": 0.65427208, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.73173606, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14556885, + "step": 6877, + "time_per_iteration": 2.5211727619171143 + }, + { + "auxiliary_loss_clip": 0.06477489, + "auxiliary_loss_mlp": 0.01272334, + "balance_loss_clip": 0.06304673, + "balance_loss_mlp": 0.0125853, + "epoch": 0.4135277318502931, + "flos": 22498569192960.0, + "grad_norm": 1.7887587996629348, + "language_loss": 0.77271414, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.85021234, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13800049, + "step": 6878, + "time_per_iteration": 2.591952085494995 + }, + { + "auxiliary_loss_clip": 0.06478719, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06303747, + "balance_loss_mlp": 0.01258525, + "epoch": 0.4135878551029611, + "flos": 22425251270400.0, + "grad_norm": 1.9381355665838014, + "language_loss": 0.8049022, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.88240814, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13354492, + "step": 6879, + "time_per_iteration": 2.565875291824341 + }, + { + "auxiliary_loss_clip": 0.06476135, + "auxiliary_loss_mlp": 0.0127254, + "balance_loss_clip": 0.06301241, + "balance_loss_mlp": 0.01258688, + "epoch": 0.41364797835562905, + "flos": 27060646665600.0, + "grad_norm": 1.8294611042748399, + "language_loss": 0.8543402, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.93182689, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13861084, + "step": 6880, + "time_per_iteration": 2.6438286304473877 + }, + { + "auxiliary_loss_clip": 0.06478438, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256258, + "epoch": 0.413708101608297, + "flos": 22974464856960.0, + "grad_norm": 2.0767525842165413, + "language_loss": 0.70694637, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.78443456, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14129639, + "step": 6881, + "time_per_iteration": 2.57663893699646 + }, + { + "auxiliary_loss_clip": 0.06468567, + "auxiliary_loss_mlp": 0.01269061, + "balance_loss_clip": 0.06300917, + "balance_loss_mlp": 0.01255978, + "epoch": 0.413768224860965, + "flos": 13339013097600.0, + "grad_norm": 1.7206029499163673, + "language_loss": 0.81694102, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.89431733, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6882, + "time_per_iteration": 2.572300672531128 + }, + { + "auxiliary_loss_clip": 0.06484764, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06306723, + "balance_loss_mlp": 0.0125776, + "epoch": 0.41382834811363295, + "flos": 20820306965760.0, + "grad_norm": 2.0204096459019176, + "language_loss": 0.69182575, + "learning_rate": 2.643608785656077e-06, + "loss": 0.76941192, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16088867, + "step": 6883, + "time_per_iteration": 2.5611510276794434 + }, + { + "auxiliary_loss_clip": 0.06472149, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06297622, + "balance_loss_mlp": 0.01255061, + "epoch": 0.4138884713663009, + "flos": 20673293777280.0, + "grad_norm": 2.0786241324697, + "language_loss": 0.75945485, + "learning_rate": 2.643240028730663e-06, + "loss": 0.83685786, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13092041, + "step": 6884, + "time_per_iteration": 2.5788567066192627 + }, + { + "auxiliary_loss_clip": 0.06477202, + "auxiliary_loss_mlp": 0.01273717, + "balance_loss_clip": 0.06298974, + "balance_loss_mlp": 0.01260008, + "epoch": 0.4139485946189689, + "flos": 29063808299520.0, + "grad_norm": 3.0401310083666444, + "language_loss": 0.76198518, + "learning_rate": 2.642871247413523e-06, + "loss": 0.83949435, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13720703, + "step": 6885, + "time_per_iteration": 2.5964529514312744 + }, + { + "auxiliary_loss_clip": 0.06475228, + "auxiliary_loss_mlp": 0.01270635, + "balance_loss_clip": 0.06299268, + "balance_loss_mlp": 0.01256187, + "epoch": 0.41400871787163684, + "flos": 24432605608320.0, + "grad_norm": 1.9051304938208142, + "language_loss": 0.70031226, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14447021, + "step": 6886, + "time_per_iteration": 4.101384878158569 + }, + { + "auxiliary_loss_clip": 0.06475122, + "auxiliary_loss_mlp": 0.01275658, + "balance_loss_clip": 0.06297341, + "balance_loss_mlp": 0.01260423, + "epoch": 0.4140688411243048, + "flos": 19470172527360.0, + "grad_norm": 1.459976196778311, + "language_loss": 0.75538456, + "learning_rate": 2.642133611660002e-06, + "loss": 0.83289236, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15234375, + "step": 6887, + "time_per_iteration": 2.5979294776916504 + }, + { + "auxiliary_loss_clip": 0.06468056, + "auxiliary_loss_mlp": 0.01273257, + "balance_loss_clip": 0.06294202, + "balance_loss_mlp": 0.0125916, + "epoch": 0.4141289643769728, + "flos": 19319008561920.0, + "grad_norm": 2.153365375528394, + "language_loss": 0.70707798, + "learning_rate": 2.641764757251592e-06, + "loss": 0.78449106, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14099121, + "step": 6888, + "time_per_iteration": 4.008386850357056 + }, + { + "auxiliary_loss_clip": 0.06466109, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.0629206, + "balance_loss_mlp": 0.0125863, + "epoch": 0.41418908762964074, + "flos": 16732448075520.0, + "grad_norm": 2.015209624353795, + "language_loss": 0.76631236, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.84370446, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14477539, + "step": 6889, + "time_per_iteration": 2.5270447731018066 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.012722, + "balance_loss_clip": 0.06294381, + "balance_loss_mlp": 0.01258628, + "epoch": 0.41424921088230876, + "flos": 25303112876160.0, + "grad_norm": 1.5878983493356928, + "language_loss": 0.80245477, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.87983751, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13568115, + "step": 6890, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.06465066, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01258592, + "epoch": 0.4143093341349767, + "flos": 20966984737920.0, + "grad_norm": 1.4631338633868025, + "language_loss": 0.74175858, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.81914544, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15014648, + "step": 6891, + "time_per_iteration": 2.5313403606414795 + }, + { + "auxiliary_loss_clip": 0.06475316, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01253283, + "epoch": 0.4143694573876447, + "flos": 22024182902400.0, + "grad_norm": 2.801103384820577, + "language_loss": 0.84378529, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.92123371, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16247559, + "step": 6892, + "time_per_iteration": 3.9777607917785645 + }, + { + "auxiliary_loss_clip": 0.06465086, + "auxiliary_loss_mlp": 0.01270368, + "balance_loss_clip": 0.06295982, + "balance_loss_mlp": 0.01257339, + "epoch": 0.41442958064031266, + "flos": 35705761418880.0, + "grad_norm": 1.735816743811137, + "language_loss": 0.70161885, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7789734, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13037109, + "step": 6893, + "time_per_iteration": 2.6909854412078857 + }, + { + "auxiliary_loss_clip": 0.06467048, + "auxiliary_loss_mlp": 0.01267192, + "balance_loss_clip": 0.0629535, + "balance_loss_mlp": 0.01253799, + "epoch": 0.4144897038929806, + "flos": 28301391198720.0, + "grad_norm": 1.3940088969507989, + "language_loss": 0.73223269, + "learning_rate": 2.639551120239279e-06, + "loss": 0.80957508, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13378906, + "step": 6894, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06476665, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.0125867, + "epoch": 0.4145498271456486, + "flos": 11651568848640.0, + "grad_norm": 2.440609351676066, + "language_loss": 0.62663507, + "learning_rate": 2.63918209577416e-06, + "loss": 0.7041353, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14697266, + "step": 6895, + "time_per_iteration": 2.471320390701294 + }, + { + "auxiliary_loss_clip": 0.0646576, + "auxiliary_loss_mlp": 0.01272394, + "balance_loss_clip": 0.06296334, + "balance_loss_mlp": 0.01258589, + "epoch": 0.41460995039831655, + "flos": 27243061004160.0, + "grad_norm": 3.24758428503537, + "language_loss": 0.70684588, + "learning_rate": 2.638813047071192e-06, + "loss": 0.78422737, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13806152, + "step": 6896, + "time_per_iteration": 2.5871524810791016 + }, + { + "auxiliary_loss_clip": 0.06475289, + "auxiliary_loss_mlp": 0.01275214, + "balance_loss_clip": 0.06299431, + "balance_loss_mlp": 0.01260164, + "epoch": 0.4146700736509845, + "flos": 25929627454080.0, + "grad_norm": 1.8920871134817128, + "language_loss": 0.73144394, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.80894893, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15057373, + "step": 6897, + "time_per_iteration": 4.0778656005859375 + }, + { + "auxiliary_loss_clip": 0.0646714, + "auxiliary_loss_mlp": 0.01271778, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01257371, + "epoch": 0.4147301969036525, + "flos": 26840441335680.0, + "grad_norm": 6.247593775216772, + "language_loss": 0.84715986, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.92454904, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14404297, + "step": 6898, + "time_per_iteration": 2.5603139400482178 + }, + { + "auxiliary_loss_clip": 0.06469397, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06293289, + "balance_loss_mlp": 0.01253678, + "epoch": 0.41479032015632045, + "flos": 20303727344640.0, + "grad_norm": 2.0378276609946098, + "language_loss": 0.74898899, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.82635784, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13812256, + "step": 6899, + "time_per_iteration": 2.53822660446167 + }, + { + "auxiliary_loss_clip": 0.06477535, + "auxiliary_loss_mlp": 0.01273796, + "balance_loss_clip": 0.06297705, + "balance_loss_mlp": 0.01258239, + "epoch": 0.4148504434089884, + "flos": 25272030211200.0, + "grad_norm": 2.0370175779228465, + "language_loss": 0.75786376, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.83537704, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15563965, + "step": 6900, + "time_per_iteration": 2.5547776222229004 + }, + { + "auxiliary_loss_clip": 0.06470095, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06298018, + "balance_loss_mlp": 0.01260057, + "epoch": 0.4149105666616564, + "flos": 12827087377920.0, + "grad_norm": 3.426788101109298, + "language_loss": 0.80153453, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.87899375, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15783691, + "step": 6901, + "time_per_iteration": 2.5724570751190186 + }, + { + "auxiliary_loss_clip": 0.06464257, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06292327, + "balance_loss_mlp": 0.01258791, + "epoch": 0.41497068991432434, + "flos": 16769526307200.0, + "grad_norm": 2.2871359145608507, + "language_loss": 0.70271528, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.78009164, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14593506, + "step": 6902, + "time_per_iteration": 2.518018960952759 + }, + { + "auxiliary_loss_clip": 0.06463319, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01255706, + "epoch": 0.4150308131669923, + "flos": 18006161990400.0, + "grad_norm": 2.0523680752477906, + "language_loss": 0.8405019, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.91784132, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14923096, + "step": 6903, + "time_per_iteration": 2.719252586364746 + }, + { + "auxiliary_loss_clip": 0.06478511, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06298795, + "balance_loss_mlp": 0.01254282, + "epoch": 0.41509093641966033, + "flos": 30052635932160.0, + "grad_norm": 2.3513516306772826, + "language_loss": 0.67960835, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.75710285, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16674805, + "step": 6904, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.06473922, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06295053, + "balance_loss_mlp": 0.01252678, + "epoch": 0.4151510596723283, + "flos": 24286892158080.0, + "grad_norm": 1.8668907258080212, + "language_loss": 0.77697861, + "learning_rate": 2.635490520350643e-06, + "loss": 0.85438967, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14508057, + "step": 6905, + "time_per_iteration": 2.6073246002197266 + }, + { + "auxiliary_loss_clip": 0.06477012, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06300149, + "balance_loss_mlp": 0.01255391, + "epoch": 0.41521118292499626, + "flos": 23482784851200.0, + "grad_norm": 2.106489831039321, + "language_loss": 0.68546331, + "learning_rate": 2.635121230039025e-06, + "loss": 0.76293135, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1439209, + "step": 6906, + "time_per_iteration": 2.5378260612487793 + }, + { + "auxiliary_loss_clip": 0.06470662, + "auxiliary_loss_mlp": 0.01269025, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01254839, + "epoch": 0.4152713061776642, + "flos": 22131728017920.0, + "grad_norm": 2.406599601104124, + "language_loss": 0.68275452, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.76015139, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14196777, + "step": 6907, + "time_per_iteration": 2.548020124435425 + }, + { + "auxiliary_loss_clip": 0.06477083, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.06301615, + "balance_loss_mlp": 0.01256342, + "epoch": 0.4153314294303322, + "flos": 21257740805760.0, + "grad_norm": 2.5393224991434398, + "language_loss": 0.77004838, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.84752274, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14013672, + "step": 6908, + "time_per_iteration": 2.52205753326416 + }, + { + "auxiliary_loss_clip": 0.0635362, + "auxiliary_loss_mlp": 0.01259834, + "balance_loss_clip": 0.06277395, + "balance_loss_mlp": 0.01256612, + "epoch": 0.41539155268300015, + "flos": 57939443527680.0, + "grad_norm": 0.769240592375345, + "language_loss": 0.64804208, + "learning_rate": 2.634013214657026e-06, + "loss": 0.72417659, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.03225708, + "step": 6909, + "time_per_iteration": 3.109095573425293 + }, + { + "auxiliary_loss_clip": 0.06469519, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4154516759356681, + "flos": 21909384408960.0, + "grad_norm": 1.4248669333769037, + "language_loss": 0.87550539, + "learning_rate": 2.633643828093996e-06, + "loss": 0.95291519, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13989258, + "step": 6910, + "time_per_iteration": 2.5253639221191406 + }, + { + "auxiliary_loss_clip": 0.06354217, + "auxiliary_loss_mlp": 0.01257534, + "balance_loss_clip": 0.0627715, + "balance_loss_mlp": 0.01254598, + "epoch": 0.4155117991883361, + "flos": 67852234702080.0, + "grad_norm": 0.8147918233574727, + "language_loss": 0.62098897, + "learning_rate": 2.633274417503128e-06, + "loss": 0.69710648, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02932739, + "step": 6911, + "time_per_iteration": 3.1515297889709473 + }, + { + "auxiliary_loss_clip": 0.06486405, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06302486, + "balance_loss_mlp": 0.01254393, + "epoch": 0.41557192244100405, + "flos": 14287869532800.0, + "grad_norm": 2.853367345352451, + "language_loss": 0.88092077, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.95848417, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15551758, + "step": 6912, + "time_per_iteration": 2.5334529876708984 + }, + { + "auxiliary_loss_clip": 0.06480967, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06303312, + "balance_loss_mlp": 0.01253451, + "epoch": 0.415632045693672, + "flos": 24468803372160.0, + "grad_norm": 2.9756004279328945, + "language_loss": 0.63331664, + "learning_rate": 2.632535524293914e-06, + "loss": 0.71080673, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14581299, + "step": 6913, + "time_per_iteration": 2.547567129135132 + }, + { + "auxiliary_loss_clip": 0.06471419, + "auxiliary_loss_mlp": 0.01270035, + "balance_loss_clip": 0.06297998, + "balance_loss_mlp": 0.01256249, + "epoch": 0.41569216894634, + "flos": 20120600246400.0, + "grad_norm": 1.832366261637427, + "language_loss": 0.75605875, + "learning_rate": 2.632166041703586e-06, + "loss": 0.83347332, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13787842, + "step": 6914, + "time_per_iteration": 2.5624208450317383 + }, + { + "auxiliary_loss_clip": 0.06479953, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.06302451, + "balance_loss_mlp": 0.01257897, + "epoch": 0.41575229219900794, + "flos": 23804497802880.0, + "grad_norm": 2.012818087979969, + "language_loss": 0.87586981, + "learning_rate": 2.631796535141458e-06, + "loss": 0.95340854, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16015625, + "step": 6915, + "time_per_iteration": 2.545825481414795 + }, + { + "auxiliary_loss_clip": 0.06478707, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06302266, + "balance_loss_mlp": 0.01259273, + "epoch": 0.4158124154516759, + "flos": 23114224667520.0, + "grad_norm": 2.419843437778294, + "language_loss": 0.71605122, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.79356909, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13818359, + "step": 6916, + "time_per_iteration": 2.59429669380188 + }, + { + "auxiliary_loss_clip": 0.06477056, + "auxiliary_loss_mlp": 0.01267217, + "balance_loss_clip": 0.06298968, + "balance_loss_mlp": 0.01252208, + "epoch": 0.41587253870434393, + "flos": 24249771999360.0, + "grad_norm": 1.4428572529082921, + "language_loss": 0.71931446, + "learning_rate": 2.631057450157852e-06, + "loss": 0.7967571, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.15002441, + "step": 6917, + "time_per_iteration": 2.56001877784729 + }, + { + "auxiliary_loss_clip": 0.06469631, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06294615, + "balance_loss_mlp": 0.01253089, + "epoch": 0.4159326619570119, + "flos": 23888926391040.0, + "grad_norm": 4.142003179261072, + "language_loss": 0.80924189, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.88661504, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14599609, + "step": 6918, + "time_per_iteration": 2.6182031631469727 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06299014, + "balance_loss_mlp": 0.01255, + "epoch": 0.41599278520967986, + "flos": 40636315221120.0, + "grad_norm": 1.446116397311604, + "language_loss": 0.70620072, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.78370392, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.1583252, + "step": 6919, + "time_per_iteration": 2.7974801063537598 + }, + { + "auxiliary_loss_clip": 0.06470604, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.06293205, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4160529084623478, + "flos": 18228757161600.0, + "grad_norm": 1.8139422387612383, + "language_loss": 0.81669927, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.89411485, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15258789, + "step": 6920, + "time_per_iteration": 2.652277708053589 + }, + { + "auxiliary_loss_clip": 0.06476951, + "auxiliary_loss_mlp": 0.01273828, + "balance_loss_clip": 0.06298292, + "balance_loss_mlp": 0.01258724, + "epoch": 0.4161130317150158, + "flos": 13666973178240.0, + "grad_norm": 2.775667367204969, + "language_loss": 0.65528631, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.73279405, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15100098, + "step": 6921, + "time_per_iteration": 2.543761968612671 + }, + { + "auxiliary_loss_clip": 0.0647813, + "auxiliary_loss_mlp": 0.01273522, + "balance_loss_clip": 0.06301805, + "balance_loss_mlp": 0.01258168, + "epoch": 0.41617315496768376, + "flos": 16183779540480.0, + "grad_norm": 2.038581093377189, + "language_loss": 0.80900288, + "learning_rate": 2.629209319173274e-06, + "loss": 0.88651937, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15368652, + "step": 6922, + "time_per_iteration": 2.5606656074523926 + }, + { + "auxiliary_loss_clip": 0.06480581, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.06301428, + "balance_loss_mlp": 0.01255163, + "epoch": 0.4162332782203517, + "flos": 26220467376000.0, + "grad_norm": 1.63600266107907, + "language_loss": 0.6809119, + "learning_rate": 2.628839621341247e-06, + "loss": 0.7584219, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15258789, + "step": 6923, + "time_per_iteration": 2.5789952278137207 + }, + { + "auxiliary_loss_clip": 0.06474873, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06299335, + "balance_loss_mlp": 0.01254152, + "epoch": 0.4162934014730197, + "flos": 28191540096000.0, + "grad_norm": 1.91165548300248, + "language_loss": 0.76249051, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.83993888, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15795898, + "step": 6924, + "time_per_iteration": 2.6209194660186768 + }, + { + "auxiliary_loss_clip": 0.06473987, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.06295989, + "balance_loss_mlp": 0.01257759, + "epoch": 0.41635352472568765, + "flos": 19871492457600.0, + "grad_norm": 1.5667233765254498, + "language_loss": 0.73101473, + "learning_rate": 2.62810015415423e-06, + "loss": 0.80847669, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14465332, + "step": 6925, + "time_per_iteration": 2.5133748054504395 + }, + { + "auxiliary_loss_clip": 0.0646892, + "auxiliary_loss_mlp": 0.01268263, + "balance_loss_clip": 0.06293461, + "balance_loss_mlp": 0.0125391, + "epoch": 0.4164136479783556, + "flos": 14939974333440.0, + "grad_norm": 2.1337011873068445, + "language_loss": 0.84242827, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.91980004, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14361572, + "step": 6926, + "time_per_iteration": 3.923924446105957 + }, + { + "auxiliary_loss_clip": 0.06465639, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06292935, + "balance_loss_mlp": 0.01254574, + "epoch": 0.4164737712310236, + "flos": 21763251688320.0, + "grad_norm": 1.56658623429888, + "language_loss": 0.86570489, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.94304395, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13696289, + "step": 6927, + "time_per_iteration": 3.9643561840057373 + }, + { + "auxiliary_loss_clip": 0.06468353, + "auxiliary_loss_mlp": 0.01275736, + "balance_loss_clip": 0.06293458, + "balance_loss_mlp": 0.01260287, + "epoch": 0.41653389448369155, + "flos": 20746318210560.0, + "grad_norm": 2.3770101780600976, + "language_loss": 0.72583216, + "learning_rate": 2.626990774776604e-06, + "loss": 0.80327296, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15447998, + "step": 6928, + "time_per_iteration": 2.5111186504364014 + }, + { + "auxiliary_loss_clip": 0.06468435, + "auxiliary_loss_mlp": 0.01272442, + "balance_loss_clip": 0.062929, + "balance_loss_mlp": 0.0125735, + "epoch": 0.4165940177363595, + "flos": 24979848624000.0, + "grad_norm": 1.9381497388164433, + "language_loss": 0.78399348, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.86140227, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15087891, + "step": 6929, + "time_per_iteration": 2.6066014766693115 + }, + { + "auxiliary_loss_clip": 0.0646543, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06291193, + "balance_loss_mlp": 0.01253842, + "epoch": 0.41665414098902753, + "flos": 20527957670400.0, + "grad_norm": 1.8432748306405895, + "language_loss": 0.71154583, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.78888059, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14190674, + "step": 6930, + "time_per_iteration": 2.5052478313446045 + }, + { + "auxiliary_loss_clip": 0.06468388, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06291626, + "balance_loss_mlp": 0.01255067, + "epoch": 0.4167142642416955, + "flos": 19689078119040.0, + "grad_norm": 1.7731266468983917, + "language_loss": 0.81487417, + "learning_rate": 2.625881181419007e-06, + "loss": 0.89225209, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14355469, + "step": 6931, + "time_per_iteration": 2.555651903152466 + }, + { + "auxiliary_loss_clip": 0.0646255, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.06289293, + "balance_loss_mlp": 0.01255233, + "epoch": 0.41677438749436346, + "flos": 23769641704320.0, + "grad_norm": 2.211036345176988, + "language_loss": 0.79310054, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.87043214, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15362549, + "step": 6932, + "time_per_iteration": 4.05314040184021 + }, + { + "auxiliary_loss_clip": 0.06464541, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06289106, + "balance_loss_mlp": 0.01254752, + "epoch": 0.41683451074703143, + "flos": 30418051587840.0, + "grad_norm": 2.244908394273299, + "language_loss": 0.82220912, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.89954913, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14727783, + "step": 6933, + "time_per_iteration": 2.715542793273926 + }, + { + "auxiliary_loss_clip": 0.06467043, + "auxiliary_loss_mlp": 0.01272262, + "balance_loss_clip": 0.06287256, + "balance_loss_mlp": 0.01257963, + "epoch": 0.4168946339996994, + "flos": 21513137650560.0, + "grad_norm": 1.8583396237684835, + "language_loss": 0.76938605, + "learning_rate": 2.624771374460121e-06, + "loss": 0.84677911, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14300537, + "step": 6934, + "time_per_iteration": 2.630192279815674 + }, + { + "auxiliary_loss_clip": 0.06469443, + "auxiliary_loss_mlp": 0.0126919, + "balance_loss_clip": 0.06293288, + "balance_loss_mlp": 0.01254586, + "epoch": 0.41695475725236736, + "flos": 17644310133120.0, + "grad_norm": 2.110423315639561, + "language_loss": 0.67164314, + "learning_rate": 2.624401391405668e-06, + "loss": 0.74902946, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14599609, + "step": 6935, + "time_per_iteration": 2.484464168548584 + }, + { + "auxiliary_loss_clip": 0.0646461, + "auxiliary_loss_mlp": 0.01269491, + "balance_loss_clip": 0.06289718, + "balance_loss_mlp": 0.01254458, + "epoch": 0.4170148805050353, + "flos": 15674285589120.0, + "grad_norm": 2.4566205528754033, + "language_loss": 0.7383365, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.81567752, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15039062, + "step": 6936, + "time_per_iteration": 3.9171254634857178 + }, + { + "auxiliary_loss_clip": 0.06457968, + "auxiliary_loss_mlp": 0.01275405, + "balance_loss_clip": 0.06285361, + "balance_loss_mlp": 0.01262184, + "epoch": 0.4170750037577033, + "flos": 15164623929600.0, + "grad_norm": 4.126334603160969, + "language_loss": 0.74596691, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.8233006, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.13226318, + "step": 6937, + "time_per_iteration": 2.5286996364593506 + }, + { + "auxiliary_loss_clip": 0.06462386, + "auxiliary_loss_mlp": 0.01273752, + "balance_loss_clip": 0.06289354, + "balance_loss_mlp": 0.01259727, + "epoch": 0.41713512701037125, + "flos": 28776029051520.0, + "grad_norm": 1.4497703642581674, + "language_loss": 0.84985441, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.92721575, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14031982, + "step": 6938, + "time_per_iteration": 2.594024419784546 + }, + { + "auxiliary_loss_clip": 0.06468149, + "auxiliary_loss_mlp": 0.01274736, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01259114, + "epoch": 0.4171952502630392, + "flos": 28264564529280.0, + "grad_norm": 1.8332960409763566, + "language_loss": 0.74288213, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.82031095, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15618896, + "step": 6939, + "time_per_iteration": 2.628620147705078 + }, + { + "auxiliary_loss_clip": 0.06462568, + "auxiliary_loss_mlp": 0.01269134, + "balance_loss_clip": 0.06289193, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4172553735157072, + "flos": 24578612547840.0, + "grad_norm": 1.6044361894616455, + "language_loss": 0.75275123, + "learning_rate": 2.622551121253579e-06, + "loss": 0.83006829, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14331055, + "step": 6940, + "time_per_iteration": 2.55566143989563 + }, + { + "auxiliary_loss_clip": 0.06464436, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.0628769, + "balance_loss_mlp": 0.01255338, + "epoch": 0.41731549676837515, + "flos": 27051967768320.0, + "grad_norm": 1.7023568307679129, + "language_loss": 0.71513987, + "learning_rate": 2.622180996345424e-06, + "loss": 0.79247934, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1416626, + "step": 6941, + "time_per_iteration": 2.628779649734497 + }, + { + "auxiliary_loss_clip": 0.06464395, + "auxiliary_loss_mlp": 0.0127035, + "balance_loss_clip": 0.06285797, + "balance_loss_mlp": 0.01255342, + "epoch": 0.4173756200210431, + "flos": 28400173562880.0, + "grad_norm": 3.007655990717308, + "language_loss": 0.73701853, + "learning_rate": 2.621810847844104e-06, + "loss": 0.81436592, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15008545, + "step": 6942, + "time_per_iteration": 2.579085350036621 + }, + { + "auxiliary_loss_clip": 0.06469673, + "auxiliary_loss_mlp": 0.01269256, + "balance_loss_clip": 0.06289446, + "balance_loss_mlp": 0.01254587, + "epoch": 0.41743574327371114, + "flos": 22526968527360.0, + "grad_norm": 2.366625341311562, + "language_loss": 0.73327738, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.81066668, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14672852, + "step": 6943, + "time_per_iteration": 2.5890767574310303 + }, + { + "auxiliary_loss_clip": 0.06466928, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06290001, + "balance_loss_mlp": 0.01252998, + "epoch": 0.4174958665263791, + "flos": 30120587193600.0, + "grad_norm": 2.3204117950268817, + "language_loss": 0.63901597, + "learning_rate": 2.621070480118111e-06, + "loss": 0.71635759, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14245605, + "step": 6944, + "time_per_iteration": 2.586949586868286 + }, + { + "auxiliary_loss_clip": 0.06466375, + "auxiliary_loss_mlp": 0.01271741, + "balance_loss_clip": 0.0628995, + "balance_loss_mlp": 0.0125684, + "epoch": 0.41755598977904707, + "flos": 25270227348480.0, + "grad_norm": 11.202050930016789, + "language_loss": 0.70295048, + "learning_rate": 2.620700260921513e-06, + "loss": 0.78033161, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14898682, + "step": 6945, + "time_per_iteration": 2.6323587894439697 + }, + { + "auxiliary_loss_clip": 0.06460019, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.06285217, + "balance_loss_mlp": 0.01255219, + "epoch": 0.41761611303171503, + "flos": 19834707715200.0, + "grad_norm": 1.6201275470111005, + "language_loss": 0.8079865, + "learning_rate": 2.620330018187899e-06, + "loss": 0.88529164, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.152771, + "step": 6946, + "time_per_iteration": 2.5303776264190674 + }, + { + "auxiliary_loss_clip": 0.064612, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06288694, + "balance_loss_mlp": 0.0125569, + "epoch": 0.417676236284383, + "flos": 15528655992960.0, + "grad_norm": 2.2948583781036027, + "language_loss": 0.77726543, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.85457456, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14038086, + "step": 6947, + "time_per_iteration": 2.5844216346740723 + }, + { + "auxiliary_loss_clip": 0.06465282, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06289726, + "balance_loss_mlp": 0.01252844, + "epoch": 0.41773635953705096, + "flos": 32532531770880.0, + "grad_norm": 1.6041388362904736, + "language_loss": 0.71914941, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.79648077, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15014648, + "step": 6948, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.06460577, + "auxiliary_loss_mlp": 0.01271252, + "balance_loss_clip": 0.06288102, + "balance_loss_mlp": 0.01256303, + "epoch": 0.4177964827897189, + "flos": 23447719117440.0, + "grad_norm": 1.868509756028272, + "language_loss": 0.76914591, + "learning_rate": 2.619219148905362e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14941406, + "step": 6949, + "time_per_iteration": 2.5791566371917725 + }, + { + "auxiliary_loss_clip": 0.06466889, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06288934, + "balance_loss_mlp": 0.01255476, + "epoch": 0.4178566060423869, + "flos": 22755768900480.0, + "grad_norm": 1.6605109484051197, + "language_loss": 0.81921285, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.89658785, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15148926, + "step": 6950, + "time_per_iteration": 2.550705909729004 + }, + { + "auxiliary_loss_clip": 0.06457172, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.062898, + "balance_loss_mlp": 0.01253319, + "epoch": 0.41791672929505486, + "flos": 26040233243520.0, + "grad_norm": 1.3162845057727355, + "language_loss": 0.76396811, + "learning_rate": 2.618478451956007e-06, + "loss": 0.84120584, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13275146, + "step": 6951, + "time_per_iteration": 2.6047768592834473 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271966, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01256988, + "epoch": 0.4179768525477228, + "flos": 19574028063360.0, + "grad_norm": 1.8780871701618023, + "language_loss": 0.72956991, + "learning_rate": 2.61810806829516e-06, + "loss": 0.80701125, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.14978027, + "step": 6952, + "time_per_iteration": 2.498915910720825 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01270698, + "balance_loss_clip": 0.06290505, + "balance_loss_mlp": 0.01256286, + "epoch": 0.4180369758003908, + "flos": 17789352750720.0, + "grad_norm": 3.5208466342014444, + "language_loss": 0.72192442, + "learning_rate": 2.617737661195593e-06, + "loss": 0.79930753, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14428711, + "step": 6953, + "time_per_iteration": 2.5105345249176025 + }, + { + "auxiliary_loss_clip": 0.06460451, + "auxiliary_loss_mlp": 0.01269376, + "balance_loss_clip": 0.0629045, + "balance_loss_mlp": 0.01255143, + "epoch": 0.41809709905305875, + "flos": 20967152446080.0, + "grad_norm": 1.9107321624636409, + "language_loss": 0.76574248, + "learning_rate": 2.617367230671353e-06, + "loss": 0.8430407, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14233398, + "step": 6954, + "time_per_iteration": 2.5424091815948486 + }, + { + "auxiliary_loss_clip": 0.06461184, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.06286837, + "balance_loss_mlp": 0.01255866, + "epoch": 0.4181572223057267, + "flos": 22024099048320.0, + "grad_norm": 2.2757291119189693, + "language_loss": 0.84719867, + "learning_rate": 2.616996776736485e-06, + "loss": 0.92452419, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15490723, + "step": 6955, + "time_per_iteration": 2.5423128604888916 + }, + { + "auxiliary_loss_clip": 0.06460696, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06289047, + "balance_loss_mlp": 0.01255001, + "epoch": 0.4182173455583947, + "flos": 26251969311360.0, + "grad_norm": 1.5480485879739414, + "language_loss": 0.83159053, + "learning_rate": 2.616626299405037e-06, + "loss": 0.90889192, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14453125, + "step": 6956, + "time_per_iteration": 2.5377910137176514 + }, + { + "auxiliary_loss_clip": 0.06470253, + "auxiliary_loss_mlp": 0.01272951, + "balance_loss_clip": 0.06292067, + "balance_loss_mlp": 0.01258163, + "epoch": 0.4182774688110627, + "flos": 14796566870400.0, + "grad_norm": 2.2161530875987205, + "language_loss": 0.72170293, + "learning_rate": 2.616255798691059e-06, + "loss": 0.79913497, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14801025, + "step": 6957, + "time_per_iteration": 2.5512890815734863 + }, + { + "auxiliary_loss_clip": 0.06465964, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01258745, + "epoch": 0.41833759206373067, + "flos": 20418190421760.0, + "grad_norm": 1.9534240722910163, + "language_loss": 0.75827634, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.83566499, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14147949, + "step": 6958, + "time_per_iteration": 2.5025634765625 + }, + { + "auxiliary_loss_clip": 0.06461923, + "auxiliary_loss_mlp": 0.01277567, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01262505, + "epoch": 0.41839771531639863, + "flos": 23662557786240.0, + "grad_norm": 1.62032760192947, + "language_loss": 0.77450699, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.85190189, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15075684, + "step": 6959, + "time_per_iteration": 2.5644967555999756 + }, + { + "auxiliary_loss_clip": 0.06462178, + "auxiliary_loss_mlp": 0.01275343, + "balance_loss_clip": 0.06288128, + "balance_loss_mlp": 0.01259423, + "epoch": 0.4184578385690666, + "flos": 19760006200320.0, + "grad_norm": 1.8483570445524284, + "language_loss": 0.77022827, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.84760344, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15924072, + "step": 6960, + "time_per_iteration": 2.5269885063171387 + }, + { + "auxiliary_loss_clip": 0.06453702, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01255552, + "epoch": 0.41851796182173456, + "flos": 20199578319360.0, + "grad_norm": 2.3993036704472717, + "language_loss": 0.75495946, + "learning_rate": 2.614773562290835e-06, + "loss": 0.83218956, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13769531, + "step": 6961, + "time_per_iteration": 2.571563243865967 + }, + { + "auxiliary_loss_clip": 0.06367883, + "auxiliary_loss_mlp": 0.0126221, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01259577, + "epoch": 0.41857808507440253, + "flos": 59038331898240.0, + "grad_norm": 0.8546546360875583, + "language_loss": 0.54730451, + "learning_rate": 2.61440294487496e-06, + "loss": 0.62360549, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02635193, + "step": 6962, + "time_per_iteration": 3.0928165912628174 + }, + { + "auxiliary_loss_clip": 0.06468143, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01256423, + "epoch": 0.4186382083270705, + "flos": 18484740984960.0, + "grad_norm": 2.146654503648622, + "language_loss": 0.8523612, + "learning_rate": 2.614032304160864e-06, + "loss": 0.92974788, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14093018, + "step": 6963, + "time_per_iteration": 2.4891340732574463 + }, + { + "auxiliary_loss_clip": 0.06465001, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 0.06290912, + "balance_loss_mlp": 0.01256453, + "epoch": 0.41869833157973846, + "flos": 21584988126720.0, + "grad_norm": 1.5636714712462336, + "language_loss": 0.70520425, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.78256667, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14788818, + "step": 6964, + "time_per_iteration": 2.6037514209747314 + }, + { + "auxiliary_loss_clip": 0.06460649, + "auxiliary_loss_mlp": 0.01270666, + "balance_loss_clip": 0.06289357, + "balance_loss_mlp": 0.01257034, + "epoch": 0.4187584548324064, + "flos": 35526156192000.0, + "grad_norm": 2.108688626905877, + "language_loss": 0.71782613, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.79513931, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1362915, + "step": 6965, + "time_per_iteration": 4.077980279922485 + }, + { + "auxiliary_loss_clip": 0.06453691, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06286767, + "balance_loss_mlp": 0.01257173, + "epoch": 0.4188185780850744, + "flos": 18660950121600.0, + "grad_norm": 1.7018758391145836, + "language_loss": 0.72080678, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.79804349, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.12792969, + "step": 6966, + "time_per_iteration": 2.5740551948547363 + }, + { + "auxiliary_loss_clip": 0.06466748, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06288405, + "balance_loss_mlp": 0.0125625, + "epoch": 0.41887870133774235, + "flos": 40342959676800.0, + "grad_norm": 4.506306240026155, + "language_loss": 0.71212667, + "learning_rate": 2.612549508603375e-06, + "loss": 0.78950995, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15338135, + "step": 6967, + "time_per_iteration": 4.179578065872192 + }, + { + "auxiliary_loss_clip": 0.0636977, + "auxiliary_loss_mlp": 0.01256477, + "balance_loss_clip": 0.06291805, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4189388245904103, + "flos": 61388083946880.0, + "grad_norm": 0.6570416522373307, + "language_loss": 0.45988834, + "learning_rate": 2.612178751609011e-06, + "loss": 0.53615081, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02500916, + "step": 6968, + "time_per_iteration": 3.1288843154907227 + }, + { + "auxiliary_loss_clip": 0.06467855, + "auxiliary_loss_mlp": 0.01273397, + "balance_loss_clip": 0.06290668, + "balance_loss_mlp": 0.01257685, + "epoch": 0.4189989478430783, + "flos": 28222371198720.0, + "grad_norm": 1.7081344299750898, + "language_loss": 0.75350499, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.8309176, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15710449, + "step": 6969, + "time_per_iteration": 2.5936050415039062 + }, + { + "auxiliary_loss_clip": 0.06460407, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06287546, + "balance_loss_mlp": 0.01258365, + "epoch": 0.4190590710957463, + "flos": 24571820367360.0, + "grad_norm": 1.8003201263588986, + "language_loss": 0.80904478, + "learning_rate": 2.611437167992705e-06, + "loss": 0.88637358, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14099121, + "step": 6970, + "time_per_iteration": 2.5366463661193848 + }, + { + "auxiliary_loss_clip": 0.06461529, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06291033, + "balance_loss_mlp": 0.01257594, + "epoch": 0.41911919434841427, + "flos": 21732504439680.0, + "grad_norm": 2.0427263912189098, + "language_loss": 0.83781362, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.91514409, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13922119, + "step": 6971, + "time_per_iteration": 4.038029909133911 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01277453, + "balance_loss_clip": 0.06292501, + "balance_loss_mlp": 0.01262766, + "epoch": 0.41917931760108224, + "flos": 17607064193280.0, + "grad_norm": 1.8913036217137231, + "language_loss": 0.74956995, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.82693458, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14685059, + "step": 6972, + "time_per_iteration": 2.5450055599212646 + }, + { + "auxiliary_loss_clip": 0.06463002, + "auxiliary_loss_mlp": 0.01269114, + "balance_loss_clip": 0.06289829, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4192394408537502, + "flos": 37825943679360.0, + "grad_norm": 1.6425528401757075, + "language_loss": 0.73133683, + "learning_rate": 2.610324618710212e-06, + "loss": 0.808658, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13684082, + "step": 6973, + "time_per_iteration": 2.6852450370788574 + }, + { + "auxiliary_loss_clip": 0.06474721, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06293075, + "balance_loss_mlp": 0.01257272, + "epoch": 0.41929956410641817, + "flos": 23113637688960.0, + "grad_norm": 1.8862458299453466, + "language_loss": 0.74830127, + "learning_rate": 2.609953722643489e-06, + "loss": 0.82576567, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14453125, + "step": 6974, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06460831, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0628831, + "balance_loss_mlp": 0.01252744, + "epoch": 0.41935968735908613, + "flos": 22530448471680.0, + "grad_norm": 1.902296645802657, + "language_loss": 0.73513019, + "learning_rate": 2.609582803447259e-06, + "loss": 0.81240016, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13421631, + "step": 6975, + "time_per_iteration": 2.4907052516937256 + }, + { + "auxiliary_loss_clip": 0.06461257, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06293045, + "balance_loss_mlp": 0.01256172, + "epoch": 0.4194198106117541, + "flos": 26877771129600.0, + "grad_norm": 1.432926445179704, + "language_loss": 0.80820251, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.8855176, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14086914, + "step": 6976, + "time_per_iteration": 4.015337705612183 + }, + { + "auxiliary_loss_clip": 0.06465544, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06291896, + "balance_loss_mlp": 0.01252174, + "epoch": 0.41947993386442206, + "flos": 19908696470400.0, + "grad_norm": 6.530638917868016, + "language_loss": 0.67613435, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.75344729, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13592529, + "step": 6977, + "time_per_iteration": 2.5907933712005615 + }, + { + "auxiliary_loss_clip": 0.06466645, + "auxiliary_loss_mlp": 0.012707, + "balance_loss_clip": 0.06291468, + "balance_loss_mlp": 0.01257104, + "epoch": 0.41954005711709, + "flos": 17389584120960.0, + "grad_norm": 2.431968733580352, + "language_loss": 0.8152501, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.89262354, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.13604736, + "step": 6978, + "time_per_iteration": 2.5534939765930176 + }, + { + "auxiliary_loss_clip": 0.06466036, + "auxiliary_loss_mlp": 0.01269917, + "balance_loss_clip": 0.06288658, + "balance_loss_mlp": 0.012561, + "epoch": 0.419600180369758, + "flos": 25009254207360.0, + "grad_norm": 1.7617066668945498, + "language_loss": 0.83044857, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.90780807, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.13824463, + "step": 6979, + "time_per_iteration": 2.5991194248199463 + }, + { + "auxiliary_loss_clip": 0.06464113, + "auxiliary_loss_mlp": 0.01266396, + "balance_loss_clip": 0.0629217, + "balance_loss_mlp": 0.01253313, + "epoch": 0.41966030362242596, + "flos": 17389458339840.0, + "grad_norm": 2.43413237172065, + "language_loss": 0.83727056, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.9145757, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13079834, + "step": 6980, + "time_per_iteration": 2.4868295192718506 + }, + { + "auxiliary_loss_clip": 0.06469644, + "auxiliary_loss_mlp": 0.01274217, + "balance_loss_clip": 0.06293017, + "balance_loss_mlp": 0.01260061, + "epoch": 0.4197204268750939, + "flos": 22161427090560.0, + "grad_norm": 2.953064628504675, + "language_loss": 0.79802233, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.87546098, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14172363, + "step": 6981, + "time_per_iteration": 2.572671890258789 + }, + { + "auxiliary_loss_clip": 0.06461273, + "auxiliary_loss_mlp": 0.01268979, + "balance_loss_clip": 0.06293882, + "balance_loss_mlp": 0.01256152, + "epoch": 0.4197805501277619, + "flos": 22089534687360.0, + "grad_norm": 1.8874441419731374, + "language_loss": 0.84437835, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.92168081, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.12823486, + "step": 6982, + "time_per_iteration": 2.515719413757324 + }, + { + "auxiliary_loss_clip": 0.06468281, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.0125844, + "epoch": 0.4198406733804299, + "flos": 26439372967680.0, + "grad_norm": 2.198770889515785, + "language_loss": 0.57229298, + "learning_rate": 2.606614618903214e-06, + "loss": 0.64970195, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1418457, + "step": 6983, + "time_per_iteration": 2.589905023574829 + }, + { + "auxiliary_loss_clip": 0.06459898, + "auxiliary_loss_mlp": 0.01268511, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01255922, + "epoch": 0.4199007966330979, + "flos": 12535870112640.0, + "grad_norm": 1.9546340544122036, + "language_loss": 0.82430601, + "learning_rate": 2.606243492174471e-06, + "loss": 0.90159011, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1260376, + "step": 6984, + "time_per_iteration": 2.4837801456451416 + }, + { + "auxiliary_loss_clip": 0.06465998, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.06293395, + "balance_loss_mlp": 0.01257698, + "epoch": 0.41996091988576584, + "flos": 21769498817280.0, + "grad_norm": 1.6572496297875159, + "language_loss": 0.79565531, + "learning_rate": 2.605872342456914e-06, + "loss": 0.87302184, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.12963867, + "step": 6985, + "time_per_iteration": 2.558382511138916 + }, + { + "auxiliary_loss_clip": 0.06471538, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06292171, + "balance_loss_mlp": 0.01254425, + "epoch": 0.4200210431384338, + "flos": 26549182143360.0, + "grad_norm": 1.7232010674189546, + "language_loss": 0.78413719, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.86154521, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14831543, + "step": 6986, + "time_per_iteration": 2.557201385498047 + }, + { + "auxiliary_loss_clip": 0.06457713, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06290729, + "balance_loss_mlp": 0.0125171, + "epoch": 0.42008116639110177, + "flos": 26802859979520.0, + "grad_norm": 1.5119871943534449, + "language_loss": 0.72772801, + "learning_rate": 2.605129974111655e-06, + "loss": 0.80494547, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.12310791, + "step": 6987, + "time_per_iteration": 2.590758800506592 + }, + { + "auxiliary_loss_clip": 0.06464639, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06291942, + "balance_loss_mlp": 0.01256994, + "epoch": 0.42014128964376973, + "flos": 32095433347200.0, + "grad_norm": 1.493413355723003, + "language_loss": 0.75077468, + "learning_rate": 2.604758755512104e-06, + "loss": 0.82812625, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13519287, + "step": 6988, + "time_per_iteration": 2.6159229278564453 + }, + { + "auxiliary_loss_clip": 0.064705, + "auxiliary_loss_mlp": 0.01272645, + "balance_loss_clip": 0.06293759, + "balance_loss_mlp": 0.01258256, + "epoch": 0.4202014128964377, + "flos": 26474061358080.0, + "grad_norm": 1.4960604967721163, + "language_loss": 0.7416907, + "learning_rate": 2.60438751398004e-06, + "loss": 0.81912208, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14385986, + "step": 6989, + "time_per_iteration": 2.6082265377044678 + }, + { + "auxiliary_loss_clip": 0.06467222, + "auxiliary_loss_mlp": 0.01268972, + "balance_loss_clip": 0.06291176, + "balance_loss_mlp": 0.0125413, + "epoch": 0.42026153614910566, + "flos": 13405287277440.0, + "grad_norm": 2.240751664581705, + "language_loss": 0.70939904, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.78676105, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14831543, + "step": 6990, + "time_per_iteration": 2.5301413536071777 + }, + { + "auxiliary_loss_clip": 0.06372039, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.06294142, + "balance_loss_mlp": 0.01259734, + "epoch": 0.42032165940177363, + "flos": 60268720452480.0, + "grad_norm": 0.7958876139316734, + "language_loss": 0.6024788, + "learning_rate": 2.603644962174685e-06, + "loss": 0.67882204, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02546692, + "step": 6991, + "time_per_iteration": 3.036398410797119 + }, + { + "auxiliary_loss_clip": 0.06468751, + "auxiliary_loss_mlp": 0.0127226, + "balance_loss_clip": 0.06294238, + "balance_loss_mlp": 0.01257251, + "epoch": 0.4203817826544416, + "flos": 24542121294720.0, + "grad_norm": 1.5524019758451273, + "language_loss": 0.83787376, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.91528386, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15014648, + "step": 6992, + "time_per_iteration": 2.5513317584991455 + }, + { + "auxiliary_loss_clip": 0.06374694, + "auxiliary_loss_mlp": 0.01259872, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.01257284, + "epoch": 0.42044190590710956, + "flos": 58837679297280.0, + "grad_norm": 0.7870388441722128, + "language_loss": 0.65295899, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.72930467, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.02589417, + "step": 6993, + "time_per_iteration": 3.139356851577759 + }, + { + "auxiliary_loss_clip": 0.06475414, + "auxiliary_loss_mlp": 0.01273103, + "balance_loss_clip": 0.06293732, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4205020291597775, + "flos": 16441733934720.0, + "grad_norm": 2.0884817814411307, + "language_loss": 0.83771634, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.91520149, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15576172, + "step": 6994, + "time_per_iteration": 2.5307908058166504 + }, + { + "auxiliary_loss_clip": 0.06461746, + "auxiliary_loss_mlp": 0.01269563, + "balance_loss_clip": 0.06292755, + "balance_loss_mlp": 0.01255544, + "epoch": 0.4205621524124455, + "flos": 18411548843520.0, + "grad_norm": 1.728991128313806, + "language_loss": 0.79243588, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.86974895, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14013672, + "step": 6995, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.06461824, + "auxiliary_loss_mlp": 0.0126885, + "balance_loss_clip": 0.06293637, + "balance_loss_mlp": 0.01255433, + "epoch": 0.4206222756651135, + "flos": 25527133566720.0, + "grad_norm": 1.491511685078805, + "language_loss": 0.80235636, + "learning_rate": 2.60178818232786e-06, + "loss": 0.87966311, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13409424, + "step": 6996, + "time_per_iteration": 2.6613996028900146 + }, + { + "auxiliary_loss_clip": 0.06466329, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06293097, + "balance_loss_mlp": 0.01254466, + "epoch": 0.4206823989177815, + "flos": 15309708474240.0, + "grad_norm": 2.3637588948298998, + "language_loss": 0.76051879, + "learning_rate": 2.601416757842559e-06, + "loss": 0.83786368, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13690186, + "step": 6997, + "time_per_iteration": 2.484876871109009 + }, + { + "auxiliary_loss_clip": 0.06463061, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06288689, + "balance_loss_mlp": 0.01253789, + "epoch": 0.42074252217044944, + "flos": 15558564700800.0, + "grad_norm": 2.0514206793414345, + "language_loss": 0.76478076, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.84209514, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14599609, + "step": 6998, + "time_per_iteration": 2.5640127658843994 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01275488, + "balance_loss_clip": 0.06289443, + "balance_loss_mlp": 0.01260587, + "epoch": 0.4208026454231174, + "flos": 26153941633920.0, + "grad_norm": 1.581279992496262, + "language_loss": 0.76102519, + "learning_rate": 2.60067384046869e-06, + "loss": 0.83844483, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14892578, + "step": 6999, + "time_per_iteration": 2.6406025886535645 + }, + { + "auxiliary_loss_clip": 0.06461642, + "auxiliary_loss_mlp": 0.01267644, + "balance_loss_clip": 0.06291209, + "balance_loss_mlp": 0.01254382, + "epoch": 0.42086276867578537, + "flos": 23556857460480.0, + "grad_norm": 1.988296138175356, + "language_loss": 0.64461291, + "learning_rate": 2.600302347608295e-06, + "loss": 0.72190583, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13244629, + "step": 7000, + "time_per_iteration": 2.6081695556640625 + }, + { + "auxiliary_loss_clip": 0.06469343, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06294516, + "balance_loss_mlp": 0.01256076, + "epoch": 0.42092289192845334, + "flos": 18119199548160.0, + "grad_norm": 1.6363851387704167, + "language_loss": 0.77022576, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.84762329, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14318848, + "step": 7001, + "time_per_iteration": 2.5761475563049316 + }, + { + "auxiliary_loss_clip": 0.06461353, + "auxiliary_loss_mlp": 0.01268364, + "balance_loss_clip": 0.06290751, + "balance_loss_mlp": 0.01254882, + "epoch": 0.4209830151811213, + "flos": 20012006954880.0, + "grad_norm": 1.5030484792833017, + "language_loss": 0.86740428, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.94470143, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13482666, + "step": 7002, + "time_per_iteration": 2.585397958755493 + }, + { + "auxiliary_loss_clip": 0.06461627, + "auxiliary_loss_mlp": 0.01271644, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01258251, + "epoch": 0.42104313843378927, + "flos": 21985050245760.0, + "grad_norm": 2.152971198745627, + "language_loss": 0.68539977, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.76273245, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.1338501, + "step": 7003, + "time_per_iteration": 2.5039963722229004 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01271214, + "balance_loss_clip": 0.06293743, + "balance_loss_mlp": 0.01255747, + "epoch": 0.42110326168645723, + "flos": 25450461480960.0, + "grad_norm": 1.8015075946869743, + "language_loss": 0.77306843, + "learning_rate": 2.598816148672344e-06, + "loss": 0.85047305, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15472412, + "step": 7004, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06462541, + "auxiliary_loss_mlp": 0.01273285, + "balance_loss_clip": 0.06294234, + "balance_loss_mlp": 0.0125873, + "epoch": 0.4211633849391252, + "flos": 17828485407360.0, + "grad_norm": 1.7810886301824922, + "language_loss": 0.68804276, + "learning_rate": 2.59844454213521e-06, + "loss": 0.76540101, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14562988, + "step": 7005, + "time_per_iteration": 3.888760566711426 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01269773, + "balance_loss_clip": 0.0629124, + "balance_loss_mlp": 0.01255593, + "epoch": 0.42122350819179316, + "flos": 16286796535680.0, + "grad_norm": 1.8605985429595449, + "language_loss": 0.72998816, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.80733699, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14178467, + "step": 7006, + "time_per_iteration": 3.991835832595825 + }, + { + "auxiliary_loss_clip": 0.06464688, + "auxiliary_loss_mlp": 0.01266849, + "balance_loss_clip": 0.06289375, + "balance_loss_mlp": 0.01252424, + "epoch": 0.4212836314444611, + "flos": 19651916033280.0, + "grad_norm": 1.623062925912009, + "language_loss": 0.7118417, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.78915709, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14416504, + "step": 7007, + "time_per_iteration": 2.5425753593444824 + }, + { + "auxiliary_loss_clip": 0.06463595, + "auxiliary_loss_mlp": 0.01271642, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01257581, + "epoch": 0.4213437546971291, + "flos": 18374889882240.0, + "grad_norm": 2.097779928402724, + "language_loss": 0.82573175, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.90308416, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 7008, + "time_per_iteration": 2.492260456085205 + }, + { + "auxiliary_loss_clip": 0.0646316, + "auxiliary_loss_mlp": 0.01269434, + "balance_loss_clip": 0.06289843, + "balance_loss_mlp": 0.01255129, + "epoch": 0.42140387794979706, + "flos": 27711116311680.0, + "grad_norm": 1.9580680041192111, + "language_loss": 0.72638381, + "learning_rate": 2.596957889196831e-06, + "loss": 0.80370975, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14318848, + "step": 7009, + "time_per_iteration": 2.6216533184051514 + }, + { + "auxiliary_loss_clip": 0.06466616, + "auxiliary_loss_mlp": 0.0126722, + "balance_loss_clip": 0.06289244, + "balance_loss_mlp": 0.01253338, + "epoch": 0.4214640012024651, + "flos": 28154545718400.0, + "grad_norm": 2.5692415195563543, + "language_loss": 0.66926241, + "learning_rate": 2.596586169335243e-06, + "loss": 0.74660075, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.13873291, + "step": 7010, + "time_per_iteration": 2.606501579284668 + }, + { + "auxiliary_loss_clip": 0.06462754, + "auxiliary_loss_mlp": 0.01271396, + "balance_loss_clip": 0.06290238, + "balance_loss_mlp": 0.01256662, + "epoch": 0.42152412445513304, + "flos": 23002989972480.0, + "grad_norm": 1.6839098151972378, + "language_loss": 0.7266804, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.80402195, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14727783, + "step": 7011, + "time_per_iteration": 4.0488903522491455 + }, + { + "auxiliary_loss_clip": 0.06363396, + "auxiliary_loss_mlp": 0.01255682, + "balance_loss_clip": 0.06285673, + "balance_loss_mlp": 0.01253149, + "epoch": 0.421584247707801, + "flos": 63767855756160.0, + "grad_norm": 0.7737758086067837, + "language_loss": 0.54255652, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.61874723, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.02532959, + "step": 7012, + "time_per_iteration": 3.0473456382751465 + }, + { + "auxiliary_loss_clip": 0.06465481, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06289969, + "balance_loss_mlp": 0.01256656, + "epoch": 0.421644370960469, + "flos": 24321203205120.0, + "grad_norm": 1.3531523641491952, + "language_loss": 0.78821653, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.86559272, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.15472412, + "step": 7013, + "time_per_iteration": 2.5436811447143555 + }, + { + "auxiliary_loss_clip": 0.06463543, + "auxiliary_loss_mlp": 0.0127162, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01256516, + "epoch": 0.42170449421313694, + "flos": 23447425628160.0, + "grad_norm": 1.8634561108800796, + "language_loss": 0.81284738, + "learning_rate": 2.595099063803787e-06, + "loss": 0.89019895, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15100098, + "step": 7014, + "time_per_iteration": 2.6464757919311523 + }, + { + "auxiliary_loss_clip": 0.06460524, + "auxiliary_loss_mlp": 0.01273083, + "balance_loss_clip": 0.06287747, + "balance_loss_mlp": 0.01259225, + "epoch": 0.4217646174658049, + "flos": 23702151640320.0, + "grad_norm": 1.4680948866945018, + "language_loss": 0.77888769, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.85622376, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1385498, + "step": 7015, + "time_per_iteration": 4.043898582458496 + }, + { + "auxiliary_loss_clip": 0.06464352, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06287283, + "balance_loss_mlp": 0.01253394, + "epoch": 0.42182474071847287, + "flos": 24978297323520.0, + "grad_norm": 1.853408702102599, + "language_loss": 0.82096922, + "learning_rate": 2.594355375584368e-06, + "loss": 0.89829755, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15075684, + "step": 7016, + "time_per_iteration": 2.5523900985717773 + }, + { + "auxiliary_loss_clip": 0.06465739, + "auxiliary_loss_mlp": 0.01271643, + "balance_loss_clip": 0.06291386, + "balance_loss_mlp": 0.01256527, + "epoch": 0.42188486397114083, + "flos": 22863230161920.0, + "grad_norm": 2.845700477826224, + "language_loss": 0.6853466, + "learning_rate": 2.593983497660586e-06, + "loss": 0.76272047, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15112305, + "step": 7017, + "time_per_iteration": 2.57027530670166 + }, + { + "auxiliary_loss_clip": 0.0636536, + "auxiliary_loss_mlp": 0.01255401, + "balance_loss_clip": 0.06287346, + "balance_loss_mlp": 0.01252595, + "epoch": 0.4219449872238088, + "flos": 66997072730880.0, + "grad_norm": 0.6666550742113542, + "language_loss": 0.59442866, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.67063624, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 7018, + "time_per_iteration": 3.1860194206237793 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01271161, + "balance_loss_clip": 0.0628873, + "balance_loss_mlp": 0.0125617, + "epoch": 0.42200511047647676, + "flos": 13120400995200.0, + "grad_norm": 1.8819765217055724, + "language_loss": 0.75926054, + "learning_rate": 2.593239674255382e-06, + "loss": 0.83665562, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14990234, + "step": 7019, + "time_per_iteration": 2.542468309402466 + }, + { + "auxiliary_loss_clip": 0.06462015, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.06287961, + "balance_loss_mlp": 0.01257864, + "epoch": 0.42206523372914473, + "flos": 13996400705280.0, + "grad_norm": 1.899626408213008, + "language_loss": 0.69618917, + "learning_rate": 2.592867728802166e-06, + "loss": 0.77354079, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15283203, + "step": 7020, + "time_per_iteration": 2.4884140491485596 + }, + { + "auxiliary_loss_clip": 0.06459437, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.06290746, + "balance_loss_mlp": 0.01258347, + "epoch": 0.4221253569818127, + "flos": 21948391284480.0, + "grad_norm": 1.6760812445081854, + "language_loss": 0.81457055, + "learning_rate": 2.592495760867347e-06, + "loss": 0.89188963, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14135742, + "step": 7021, + "time_per_iteration": 2.60335111618042 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06286098, + "balance_loss_mlp": 0.01253869, + "epoch": 0.42218548023448066, + "flos": 32200001642880.0, + "grad_norm": 1.5750279801473723, + "language_loss": 0.70101392, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.77830255, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14355469, + "step": 7022, + "time_per_iteration": 2.605795383453369 + }, + { + "auxiliary_loss_clip": 0.06450655, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06284072, + "balance_loss_mlp": 0.01258788, + "epoch": 0.4222456034871487, + "flos": 30127043957760.0, + "grad_norm": 1.5974321201389856, + "language_loss": 0.67428911, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.75152111, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13751221, + "step": 7023, + "time_per_iteration": 2.6615898609161377 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06287459, + "balance_loss_mlp": 0.01255508, + "epoch": 0.42230572673981664, + "flos": 22134537129600.0, + "grad_norm": 1.6408413231786074, + "language_loss": 0.69710904, + "learning_rate": 2.591379722314322e-06, + "loss": 0.77437586, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15356445, + "step": 7024, + "time_per_iteration": 2.531874895095825 + }, + { + "auxiliary_loss_clip": 0.06457987, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06283922, + "balance_loss_mlp": 0.01255598, + "epoch": 0.4223658499924846, + "flos": 22061722331520.0, + "grad_norm": 2.1972757713163102, + "language_loss": 0.76880538, + "learning_rate": 2.591007664594147e-06, + "loss": 0.84608328, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14196777, + "step": 7025, + "time_per_iteration": 2.568814754486084 + }, + { + "auxiliary_loss_clip": 0.06457998, + "auxiliary_loss_mlp": 0.01277209, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01263017, + "epoch": 0.4224259732451526, + "flos": 20416681048320.0, + "grad_norm": 1.910881237925828, + "language_loss": 0.80124468, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.87859672, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14208984, + "step": 7026, + "time_per_iteration": 2.4988901615142822 + }, + { + "auxiliary_loss_clip": 0.06353324, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01250106, + "epoch": 0.42248609649782054, + "flos": 62866307750400.0, + "grad_norm": 0.7325438580667073, + "language_loss": 0.62037623, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.69643718, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0266571, + "step": 7027, + "time_per_iteration": 3.230607748031616 + }, + { + "auxiliary_loss_clip": 0.06460012, + "auxiliary_loss_mlp": 0.01272089, + "balance_loss_clip": 0.06290331, + "balance_loss_mlp": 0.01257456, + "epoch": 0.4225462197504885, + "flos": 26257126337280.0, + "grad_norm": 2.572422824646089, + "language_loss": 0.71053827, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.78785932, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14642334, + "step": 7028, + "time_per_iteration": 2.5667781829833984 + }, + { + "auxiliary_loss_clip": 0.06463138, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06289553, + "balance_loss_mlp": 0.01255437, + "epoch": 0.42260634300315647, + "flos": 20528209232640.0, + "grad_norm": 1.948126664005559, + "language_loss": 0.82621461, + "learning_rate": 2.589519209743846e-06, + "loss": 0.90353954, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13928223, + "step": 7029, + "time_per_iteration": 2.5936038494110107 + }, + { + "auxiliary_loss_clip": 0.06468205, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06289516, + "balance_loss_mlp": 0.01258441, + "epoch": 0.42266646625582444, + "flos": 24323676900480.0, + "grad_norm": 1.8377333901506168, + "language_loss": 0.75193119, + "learning_rate": 2.589147040109424e-06, + "loss": 0.82935727, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15966797, + "step": 7030, + "time_per_iteration": 2.6162269115448 + }, + { + "auxiliary_loss_clip": 0.06462294, + "auxiliary_loss_mlp": 0.01267502, + "balance_loss_clip": 0.06287964, + "balance_loss_mlp": 0.01251421, + "epoch": 0.4227265895084924, + "flos": 24210555488640.0, + "grad_norm": 1.9734407814648771, + "language_loss": 0.86909479, + "learning_rate": 2.588774848134486e-06, + "loss": 0.94639277, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1607666, + "step": 7031, + "time_per_iteration": 2.5292763710021973 + }, + { + "auxiliary_loss_clip": 0.06460671, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06286174, + "balance_loss_mlp": 0.01255171, + "epoch": 0.42278671276116037, + "flos": 16915407465600.0, + "grad_norm": 1.893963671956315, + "language_loss": 0.73803562, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.81533462, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.140625, + "step": 7032, + "time_per_iteration": 2.5382707118988037 + }, + { + "auxiliary_loss_clip": 0.06463667, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.06286915, + "balance_loss_mlp": 0.01254874, + "epoch": 0.42284683601382833, + "flos": 25418162931840.0, + "grad_norm": 1.9439146678532522, + "language_loss": 0.70438349, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.78171825, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1496582, + "step": 7033, + "time_per_iteration": 2.5798444747924805 + }, + { + "auxiliary_loss_clip": 0.06464536, + "auxiliary_loss_mlp": 0.01270969, + "balance_loss_clip": 0.06288149, + "balance_loss_mlp": 0.01256282, + "epoch": 0.4229069592664963, + "flos": 23047153873920.0, + "grad_norm": 1.8861418032064503, + "language_loss": 0.90879869, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.98615378, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14685059, + "step": 7034, + "time_per_iteration": 2.5370678901672363 + }, + { + "auxiliary_loss_clip": 0.06455763, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06283915, + "balance_loss_mlp": 0.01256676, + "epoch": 0.42296708251916426, + "flos": 26074586217600.0, + "grad_norm": 1.9962240812191803, + "language_loss": 0.77578306, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.85304844, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14080811, + "step": 7035, + "time_per_iteration": 2.542121648788452 + }, + { + "auxiliary_loss_clip": 0.06464495, + "auxiliary_loss_mlp": 0.01274418, + "balance_loss_clip": 0.06287753, + "balance_loss_mlp": 0.01259863, + "epoch": 0.4230272057718323, + "flos": 19463548055040.0, + "grad_norm": 2.323654021784471, + "language_loss": 0.83016878, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.90755796, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14538574, + "step": 7036, + "time_per_iteration": 2.5446789264678955 + }, + { + "auxiliary_loss_clip": 0.06461224, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06292447, + "balance_loss_mlp": 0.01256859, + "epoch": 0.42308732902450025, + "flos": 22389975901440.0, + "grad_norm": 1.9007003646753964, + "language_loss": 0.70561719, + "learning_rate": 2.58654122792447e-06, + "loss": 0.78293824, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14031982, + "step": 7037, + "time_per_iteration": 2.5331337451934814 + }, + { + "auxiliary_loss_clip": 0.06462964, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06289166, + "balance_loss_mlp": 0.01253923, + "epoch": 0.4231474522771682, + "flos": 21001631201280.0, + "grad_norm": 1.6547666669933128, + "language_loss": 0.77886164, + "learning_rate": 2.586168879961155e-06, + "loss": 0.85618538, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1550293, + "step": 7038, + "time_per_iteration": 2.547067165374756 + }, + { + "auxiliary_loss_clip": 0.06470759, + "auxiliary_loss_mlp": 0.01270751, + "balance_loss_clip": 0.06292742, + "balance_loss_mlp": 0.01255432, + "epoch": 0.4232075755298362, + "flos": 14981161415040.0, + "grad_norm": 2.6561544689274714, + "language_loss": 0.67851424, + "learning_rate": 2.585796509770259e-06, + "loss": 0.75592935, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15301514, + "step": 7039, + "time_per_iteration": 2.5148706436157227 + }, + { + "auxiliary_loss_clip": 0.06471442, + "auxiliary_loss_mlp": 0.01274269, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.01258962, + "epoch": 0.42326769878250414, + "flos": 24539144474880.0, + "grad_norm": 1.5526791387199284, + "language_loss": 0.75859225, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.83604932, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15307617, + "step": 7040, + "time_per_iteration": 2.6170670986175537 + }, + { + "auxiliary_loss_clip": 0.0646336, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06288165, + "balance_loss_mlp": 0.01253199, + "epoch": 0.4233278220351721, + "flos": 26877603421440.0, + "grad_norm": 2.185572961013026, + "language_loss": 0.65619481, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.73350751, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14715576, + "step": 7041, + "time_per_iteration": 2.5701920986175537 + }, + { + "auxiliary_loss_clip": 0.06470653, + "auxiliary_loss_mlp": 0.01271372, + "balance_loss_clip": 0.06294046, + "balance_loss_mlp": 0.01256626, + "epoch": 0.4233879452878401, + "flos": 42824951867520.0, + "grad_norm": 2.182989579985364, + "language_loss": 0.73763824, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.81505847, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14752197, + "step": 7042, + "time_per_iteration": 2.7377729415893555 + }, + { + "auxiliary_loss_clip": 0.06466709, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06294659, + "balance_loss_mlp": 0.01256119, + "epoch": 0.42344806854050804, + "flos": 25236125936640.0, + "grad_norm": 1.357775127981886, + "language_loss": 0.82479644, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.90216863, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14379883, + "step": 7043, + "time_per_iteration": 2.6002635955810547 + }, + { + "auxiliary_loss_clip": 0.06466006, + "auxiliary_loss_mlp": 0.01268509, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01252749, + "epoch": 0.423508191793176, + "flos": 22784587505280.0, + "grad_norm": 2.981661405110402, + "language_loss": 0.65042412, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.1574707, + "step": 7044, + "time_per_iteration": 4.032661437988281 + }, + { + "auxiliary_loss_clip": 0.06473978, + "auxiliary_loss_mlp": 0.01277434, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01261793, + "epoch": 0.42356831504584397, + "flos": 34645376799360.0, + "grad_norm": 1.8091896069955142, + "language_loss": 0.74864423, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.82615834, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15649414, + "step": 7045, + "time_per_iteration": 2.6634554862976074 + }, + { + "auxiliary_loss_clip": 0.06458761, + "auxiliary_loss_mlp": 0.01272071, + "balance_loss_clip": 0.06289783, + "balance_loss_mlp": 0.01258177, + "epoch": 0.42362843829851193, + "flos": 17601487896960.0, + "grad_norm": 2.434331790625752, + "language_loss": 0.8101598, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.88746816, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13885498, + "step": 7046, + "time_per_iteration": 3.8471035957336426 + }, + { + "auxiliary_loss_clip": 0.06470428, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.01255635, + "epoch": 0.4236885615511799, + "flos": 22572390240000.0, + "grad_norm": 1.5654922866483163, + "language_loss": 0.77272886, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.8501339, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14416504, + "step": 7047, + "time_per_iteration": 2.5323123931884766 + }, + { + "auxiliary_loss_clip": 0.06461948, + "auxiliary_loss_mlp": 0.01271728, + "balance_loss_clip": 0.06291857, + "balance_loss_mlp": 0.01258245, + "epoch": 0.42374868480384786, + "flos": 26476493126400.0, + "grad_norm": 1.7230664508561655, + "language_loss": 0.68109751, + "learning_rate": 2.582444180141098e-06, + "loss": 0.75843424, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13482666, + "step": 7048, + "time_per_iteration": 2.5632970333099365 + }, + { + "auxiliary_loss_clip": 0.06464637, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06289657, + "balance_loss_mlp": 0.01253263, + "epoch": 0.4238088080565159, + "flos": 20375493966720.0, + "grad_norm": 1.6594147848364105, + "language_loss": 0.78005636, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.85738766, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.15234375, + "step": 7049, + "time_per_iteration": 2.5366568565368652 + }, + { + "auxiliary_loss_clip": 0.06468852, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06292627, + "balance_loss_mlp": 0.01256067, + "epoch": 0.42386893130918385, + "flos": 21177379140480.0, + "grad_norm": 1.886460992095426, + "language_loss": 0.83185136, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.90924776, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.1472168, + "step": 7050, + "time_per_iteration": 2.5130441188812256 + }, + { + "auxiliary_loss_clip": 0.06460265, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06286017, + "balance_loss_mlp": 0.01255738, + "epoch": 0.4239290545618518, + "flos": 17681346437760.0, + "grad_norm": 2.0965482043088968, + "language_loss": 0.73218369, + "learning_rate": 2.581326338868687e-06, + "loss": 0.80949646, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15283203, + "step": 7051, + "time_per_iteration": 3.92645263671875 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06291503, + "balance_loss_mlp": 0.01254595, + "epoch": 0.4239891778145198, + "flos": 24321077424000.0, + "grad_norm": 1.57175281695923, + "language_loss": 0.86744994, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.94478583, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.1427002, + "step": 7052, + "time_per_iteration": 2.584425210952759 + }, + { + "auxiliary_loss_clip": 0.06467065, + "auxiliary_loss_mlp": 0.01277353, + "balance_loss_clip": 0.06289236, + "balance_loss_mlp": 0.01262559, + "epoch": 0.42404930106718774, + "flos": 20564700485760.0, + "grad_norm": 1.3965954512003949, + "language_loss": 0.72571224, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.80315644, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14794922, + "step": 7053, + "time_per_iteration": 2.5454976558685303 + }, + { + "auxiliary_loss_clip": 0.06462884, + "auxiliary_loss_mlp": 0.01267759, + "balance_loss_clip": 0.06288673, + "balance_loss_mlp": 0.01253251, + "epoch": 0.4241094243198557, + "flos": 22314351991680.0, + "grad_norm": 1.5249079777591508, + "language_loss": 0.82902604, + "learning_rate": 2.580208299200704e-06, + "loss": 0.90633249, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14508057, + "step": 7054, + "time_per_iteration": 4.019419193267822 + }, + { + "auxiliary_loss_clip": 0.06381379, + "auxiliary_loss_mlp": 0.01253973, + "balance_loss_clip": 0.06300146, + "balance_loss_mlp": 0.01250773, + "epoch": 0.4241695475725237, + "flos": 70632445973760.0, + "grad_norm": 0.7904217901105888, + "language_loss": 0.60280955, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.6791631, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03204346, + "step": 7055, + "time_per_iteration": 3.152217388153076 + }, + { + "auxiliary_loss_clip": 0.06467455, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06290264, + "balance_loss_mlp": 0.01252717, + "epoch": 0.42422967082519164, + "flos": 14032640396160.0, + "grad_norm": 2.414100924234879, + "language_loss": 0.77460873, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.85195827, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.14782715, + "step": 7056, + "time_per_iteration": 2.469475746154785 + }, + { + "auxiliary_loss_clip": 0.06476917, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06295634, + "balance_loss_mlp": 0.01259013, + "epoch": 0.4242897940778596, + "flos": 22351975274880.0, + "grad_norm": 2.3823515442172187, + "language_loss": 0.84773225, + "learning_rate": 2.579090061518714e-06, + "loss": 0.92525554, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.1640625, + "step": 7057, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.06472223, + "auxiliary_loss_mlp": 0.01277699, + "balance_loss_clip": 0.06293373, + "balance_loss_mlp": 0.01262202, + "epoch": 0.42434991733052757, + "flos": 22601502334080.0, + "grad_norm": 3.5122040291641583, + "language_loss": 0.83485544, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.91235471, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15490723, + "step": 7058, + "time_per_iteration": 2.4998161792755127 + }, + { + "auxiliary_loss_clip": 0.06459209, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06288499, + "balance_loss_mlp": 0.01256205, + "epoch": 0.42441004058319554, + "flos": 20017667105280.0, + "grad_norm": 2.0122152391379498, + "language_loss": 0.80975556, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.88705409, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14440918, + "step": 7059, + "time_per_iteration": 2.581310987472534 + }, + { + "auxiliary_loss_clip": 0.06467164, + "auxiliary_loss_mlp": 0.0127411, + "balance_loss_clip": 0.06288522, + "balance_loss_mlp": 0.01258053, + "epoch": 0.4244701638358635, + "flos": 11149663691520.0, + "grad_norm": 2.3594129001130963, + "language_loss": 0.70608068, + "learning_rate": 2.57797162620435e-06, + "loss": 0.7834934, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.16064453, + "step": 7060, + "time_per_iteration": 2.485072612762451 + }, + { + "auxiliary_loss_clip": 0.06469266, + "auxiliary_loss_mlp": 0.01274664, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01260317, + "epoch": 0.42453028708853147, + "flos": 23994542862720.0, + "grad_norm": 1.485543893241047, + "language_loss": 0.76297516, + "learning_rate": 2.577598770580562e-06, + "loss": 0.84041446, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14324951, + "step": 7061, + "time_per_iteration": 2.594430685043335 + }, + { + "auxiliary_loss_clip": 0.06469865, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06291063, + "balance_loss_mlp": 0.01256643, + "epoch": 0.42459041034119943, + "flos": 18412345457280.0, + "grad_norm": 1.9822246970542112, + "language_loss": 0.72630441, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.80371881, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14935303, + "step": 7062, + "time_per_iteration": 2.64372181892395 + }, + { + "auxiliary_loss_clip": 0.06460352, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06284757, + "balance_loss_mlp": 0.01262215, + "epoch": 0.42465053359386745, + "flos": 20964049845120.0, + "grad_norm": 2.6818567528078923, + "language_loss": 0.66330427, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.74067968, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.1496582, + "step": 7063, + "time_per_iteration": 2.5413248538970947 + }, + { + "auxiliary_loss_clip": 0.06452604, + "auxiliary_loss_mlp": 0.01267624, + "balance_loss_clip": 0.062814, + "balance_loss_mlp": 0.01254195, + "epoch": 0.4247106568465354, + "flos": 33114001979520.0, + "grad_norm": 1.5147527354116395, + "language_loss": 0.78917265, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.86637491, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13446045, + "step": 7064, + "time_per_iteration": 2.610231876373291 + }, + { + "auxiliary_loss_clip": 0.06469544, + "auxiliary_loss_mlp": 0.01271013, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4247707800992034, + "flos": 20052984401280.0, + "grad_norm": 1.8682780470126852, + "language_loss": 0.75125778, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.82866335, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14733887, + "step": 7065, + "time_per_iteration": 2.583846092224121 + }, + { + "auxiliary_loss_clip": 0.06463289, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06289071, + "balance_loss_mlp": 0.01256971, + "epoch": 0.42483090335187135, + "flos": 22392114180480.0, + "grad_norm": 1.5143179334948575, + "language_loss": 0.72187293, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.79922605, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1505127, + "step": 7066, + "time_per_iteration": 2.5569074153900146 + }, + { + "auxiliary_loss_clip": 0.06467879, + "auxiliary_loss_mlp": 0.01269525, + "balance_loss_clip": 0.06290474, + "balance_loss_mlp": 0.01254231, + "epoch": 0.4248910266045393, + "flos": 21362518736640.0, + "grad_norm": 2.6158792173392484, + "language_loss": 0.79757857, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.87495261, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15289307, + "step": 7067, + "time_per_iteration": 2.5845797061920166 + }, + { + "auxiliary_loss_clip": 0.06384341, + "auxiliary_loss_mlp": 0.01254549, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01250746, + "epoch": 0.4249511498572073, + "flos": 64026942180480.0, + "grad_norm": 1.3506219442036578, + "language_loss": 0.63354319, + "learning_rate": 2.574988168733022e-06, + "loss": 0.70993209, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03796387, + "step": 7068, + "time_per_iteration": 3.082864284515381 + }, + { + "auxiliary_loss_clip": 0.06464778, + "auxiliary_loss_mlp": 0.0127101, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01255155, + "epoch": 0.42501127310987524, + "flos": 19612699522560.0, + "grad_norm": 2.0360912712095875, + "language_loss": 0.72778141, + "learning_rate": 2.574615138284361e-06, + "loss": 0.8051393, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15844727, + "step": 7069, + "time_per_iteration": 2.560899257659912 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.01271316, + "balance_loss_clip": 0.06289013, + "balance_loss_mlp": 0.01255378, + "epoch": 0.4250713963625432, + "flos": 19468160029440.0, + "grad_norm": 2.1627827730841074, + "language_loss": 0.79640651, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.87378043, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15930176, + "step": 7070, + "time_per_iteration": 2.507615327835083 + }, + { + "auxiliary_loss_clip": 0.06461551, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06285524, + "balance_loss_mlp": 0.01255117, + "epoch": 0.4251315196152112, + "flos": 25344719228160.0, + "grad_norm": 1.9437385428250697, + "language_loss": 0.70912981, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7864511, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15454102, + "step": 7071, + "time_per_iteration": 2.5730371475219727 + }, + { + "auxiliary_loss_clip": 0.06465049, + "auxiliary_loss_mlp": 0.01271451, + "balance_loss_clip": 0.06289509, + "balance_loss_mlp": 0.01256896, + "epoch": 0.42519164286787914, + "flos": 26366348534400.0, + "grad_norm": 2.618295142810269, + "language_loss": 0.71212989, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.78949487, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14544678, + "step": 7072, + "time_per_iteration": 2.5560264587402344 + }, + { + "auxiliary_loss_clip": 0.06469329, + "auxiliary_loss_mlp": 0.01270547, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01256182, + "epoch": 0.4252517661205471, + "flos": 26038220745600.0, + "grad_norm": 1.647981639391401, + "language_loss": 0.81448823, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.89188695, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14385986, + "step": 7073, + "time_per_iteration": 2.5955123901367188 + }, + { + "auxiliary_loss_clip": 0.06462769, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06288294, + "balance_loss_mlp": 0.01259204, + "epoch": 0.42531188937321507, + "flos": 12718536013440.0, + "grad_norm": 2.653395632366352, + "language_loss": 0.91860557, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.99596488, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1395874, + "step": 7074, + "time_per_iteration": 2.4894237518310547 + }, + { + "auxiliary_loss_clip": 0.06467288, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06287881, + "balance_loss_mlp": 0.0125827, + "epoch": 0.42537201262588303, + "flos": 22098339365760.0, + "grad_norm": 1.877755960639547, + "language_loss": 0.64814276, + "learning_rate": 2.572376498508805e-06, + "loss": 0.72554648, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14807129, + "step": 7075, + "time_per_iteration": 2.598754644393921 + }, + { + "auxiliary_loss_clip": 0.06455241, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06284718, + "balance_loss_mlp": 0.01255246, + "epoch": 0.42543213587855105, + "flos": 23009824080000.0, + "grad_norm": 2.0883967049140666, + "language_loss": 0.74251705, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.81976461, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1427002, + "step": 7076, + "time_per_iteration": 2.537986993789673 + }, + { + "auxiliary_loss_clip": 0.0646292, + "auxiliary_loss_mlp": 0.01270865, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256334, + "epoch": 0.425492259131219, + "flos": 25089448164480.0, + "grad_norm": 3.3689754116422335, + "language_loss": 0.79212517, + "learning_rate": 2.571630111462766e-06, + "loss": 0.86946297, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14520264, + "step": 7077, + "time_per_iteration": 2.6490280628204346 + }, + { + "auxiliary_loss_clip": 0.06455311, + "auxiliary_loss_mlp": 0.01267846, + "balance_loss_clip": 0.06287791, + "balance_loss_mlp": 0.01254721, + "epoch": 0.425552382383887, + "flos": 22822881621120.0, + "grad_norm": 1.7167135286528112, + "language_loss": 0.7317155, + "learning_rate": 2.571256885418265e-06, + "loss": 0.80894709, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13116455, + "step": 7078, + "time_per_iteration": 2.5729281902313232 + }, + { + "auxiliary_loss_clip": 0.06459501, + "auxiliary_loss_mlp": 0.01269381, + "balance_loss_clip": 0.06290293, + "balance_loss_mlp": 0.01256173, + "epoch": 0.42561250563655495, + "flos": 13558757230080.0, + "grad_norm": 1.6803598980459025, + "language_loss": 0.80183727, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.87912607, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13201904, + "step": 7079, + "time_per_iteration": 2.4937188625335693 + }, + { + "auxiliary_loss_clip": 0.06460771, + "auxiliary_loss_mlp": 0.0127097, + "balance_loss_clip": 0.06287594, + "balance_loss_mlp": 0.01257481, + "epoch": 0.4256726288892229, + "flos": 46989692478720.0, + "grad_norm": 1.4689183555154843, + "language_loss": 0.71987867, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.79719609, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13500977, + "step": 7080, + "time_per_iteration": 2.774247884750366 + }, + { + "auxiliary_loss_clip": 0.06462272, + "auxiliary_loss_mlp": 0.01269683, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01256505, + "epoch": 0.4257327521418909, + "flos": 23593181005440.0, + "grad_norm": 1.9610396393278133, + "language_loss": 0.80520535, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.88252497, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13165283, + "step": 7081, + "time_per_iteration": 2.53387451171875 + }, + { + "auxiliary_loss_clip": 0.06452817, + "auxiliary_loss_mlp": 0.01271536, + "balance_loss_clip": 0.06286353, + "balance_loss_mlp": 0.01257844, + "epoch": 0.42579287539455885, + "flos": 18996079726080.0, + "grad_norm": 1.496926936820616, + "language_loss": 0.81558168, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.89282513, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13702393, + "step": 7082, + "time_per_iteration": 2.50972580909729 + }, + { + "auxiliary_loss_clip": 0.06462308, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06289167, + "balance_loss_mlp": 0.0125745, + "epoch": 0.4258529986472268, + "flos": 25198921923840.0, + "grad_norm": 1.6583429285627758, + "language_loss": 0.70258069, + "learning_rate": 2.569390430547065e-06, + "loss": 0.77992082, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14251709, + "step": 7083, + "time_per_iteration": 2.543390989303589 + }, + { + "auxiliary_loss_clip": 0.06373302, + "auxiliary_loss_mlp": 0.01258345, + "balance_loss_clip": 0.06290752, + "balance_loss_mlp": 0.01254316, + "epoch": 0.4259131218998948, + "flos": 69990277881600.0, + "grad_norm": 0.8555028711944374, + "language_loss": 0.67011017, + "learning_rate": 2.569017074742173e-06, + "loss": 0.74642664, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.0402832, + "step": 7084, + "time_per_iteration": 4.592621803283691 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01259348, + "epoch": 0.42597324515256274, + "flos": 18010899745920.0, + "grad_norm": 6.078178213614668, + "language_loss": 0.78467649, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.86201096, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14135742, + "step": 7085, + "time_per_iteration": 4.053593635559082 + }, + { + "auxiliary_loss_clip": 0.0647409, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.0629435, + "balance_loss_mlp": 0.01262158, + "epoch": 0.4260333684052307, + "flos": 15164204659200.0, + "grad_norm": 2.149155774842141, + "language_loss": 0.7699095, + "learning_rate": 2.568270298414995e-06, + "loss": 0.84742153, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.1496582, + "step": 7086, + "time_per_iteration": 2.480053424835205 + }, + { + "auxiliary_loss_clip": 0.06458418, + "auxiliary_loss_mlp": 0.01275137, + "balance_loss_clip": 0.06286179, + "balance_loss_mlp": 0.01260129, + "epoch": 0.42609349165789867, + "flos": 14944628234880.0, + "grad_norm": 1.8417550415955477, + "language_loss": 0.80286872, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.88020432, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15026855, + "step": 7087, + "time_per_iteration": 2.5487940311431885 + }, + { + "auxiliary_loss_clip": 0.06464538, + "auxiliary_loss_mlp": 0.01271303, + "balance_loss_clip": 0.06291935, + "balance_loss_mlp": 0.01257183, + "epoch": 0.42615361491056664, + "flos": 23738642893440.0, + "grad_norm": 2.1069826106325213, + "language_loss": 0.66537511, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.7427336, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14111328, + "step": 7088, + "time_per_iteration": 2.5807759761810303 + }, + { + "auxiliary_loss_clip": 0.06470972, + "auxiliary_loss_mlp": 0.01274052, + "balance_loss_clip": 0.06293773, + "balance_loss_mlp": 0.01260402, + "epoch": 0.42621373816323466, + "flos": 24943399297920.0, + "grad_norm": 2.133950232933384, + "language_loss": 0.69013214, + "learning_rate": 2.56714997234313e-06, + "loss": 0.76758242, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.13665771, + "step": 7089, + "time_per_iteration": 2.5817432403564453 + }, + { + "auxiliary_loss_clip": 0.06463064, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.0125598, + "epoch": 0.4262738614159026, + "flos": 13558044470400.0, + "grad_norm": 4.212045379455766, + "language_loss": 0.74597216, + "learning_rate": 2.566776487287525e-06, + "loss": 0.82330406, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14141846, + "step": 7090, + "time_per_iteration": 3.9426205158233643 + }, + { + "auxiliary_loss_clip": 0.06464858, + "auxiliary_loss_mlp": 0.01272944, + "balance_loss_clip": 0.06287836, + "balance_loss_mlp": 0.01259211, + "epoch": 0.4263339846685706, + "flos": 29755926224640.0, + "grad_norm": 2.684790824023287, + "language_loss": 0.75386477, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.8312428, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13745117, + "step": 7091, + "time_per_iteration": 2.563892126083374 + }, + { + "auxiliary_loss_clip": 0.0645293, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01257278, + "epoch": 0.42639410792123855, + "flos": 16839406212480.0, + "grad_norm": 1.8445868770478253, + "language_loss": 0.82496071, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.90218395, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.12121582, + "step": 7092, + "time_per_iteration": 2.55583119392395 + }, + { + "auxiliary_loss_clip": 0.06467807, + "auxiliary_loss_mlp": 0.01271484, + "balance_loss_clip": 0.06288138, + "balance_loss_mlp": 0.01257567, + "epoch": 0.4264542311739065, + "flos": 28769991557760.0, + "grad_norm": 1.5226511822280566, + "language_loss": 0.73850381, + "learning_rate": 2.565655903224038e-06, + "loss": 0.81589675, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.13922119, + "step": 7093, + "time_per_iteration": 4.021864414215088 + }, + { + "auxiliary_loss_clip": 0.06460725, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06287876, + "balance_loss_mlp": 0.01254512, + "epoch": 0.4265143544265745, + "flos": 24719881731840.0, + "grad_norm": 2.2430846112789617, + "language_loss": 0.70883787, + "learning_rate": 2.565282332284532e-06, + "loss": 0.78613305, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14300537, + "step": 7094, + "time_per_iteration": 2.5826168060302734 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01268246, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.0125381, + "epoch": 0.42657447767924245, + "flos": 21871467636480.0, + "grad_norm": 1.4959257312535472, + "language_loss": 0.81979394, + "learning_rate": 2.564908739909464e-06, + "loss": 0.89709127, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14428711, + "step": 7095, + "time_per_iteration": 2.5714282989501953 + }, + { + "auxiliary_loss_clip": 0.06464021, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06287175, + "balance_loss_mlp": 0.01255831, + "epoch": 0.4266346009319104, + "flos": 21476604470400.0, + "grad_norm": 2.7630559086257533, + "language_loss": 0.80476701, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.88211161, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1461792, + "step": 7096, + "time_per_iteration": 2.52101731300354 + }, + { + "auxiliary_loss_clip": 0.06471846, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06290311, + "balance_loss_mlp": 0.01253946, + "epoch": 0.4266947241845784, + "flos": 25526295025920.0, + "grad_norm": 2.003429077322888, + "language_loss": 0.65857691, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.73597825, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14331055, + "step": 7097, + "time_per_iteration": 2.6010050773620605 + }, + { + "auxiliary_loss_clip": 0.0645384, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01259601, + "epoch": 0.42675484743724634, + "flos": 26548343602560.0, + "grad_norm": 1.7498935394273216, + "language_loss": 0.75170088, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.82896858, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13317871, + "step": 7098, + "time_per_iteration": 2.5674946308135986 + }, + { + "auxiliary_loss_clip": 0.06458846, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01260033, + "epoch": 0.4268149706899143, + "flos": 23119465547520.0, + "grad_norm": 1.6850998762786562, + "language_loss": 0.75184697, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.82917988, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 7099, + "time_per_iteration": 2.5784735679626465 + }, + { + "auxiliary_loss_clip": 0.06459826, + "auxiliary_loss_mlp": 0.01273278, + "balance_loss_clip": 0.06283994, + "balance_loss_mlp": 0.01259116, + "epoch": 0.4268750939425823, + "flos": 22712401612800.0, + "grad_norm": 2.0765509228592802, + "language_loss": 0.83059096, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.90792197, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14172363, + "step": 7100, + "time_per_iteration": 2.520923614501953 + }, + { + "auxiliary_loss_clip": 0.06459752, + "auxiliary_loss_mlp": 0.01269142, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01255839, + "epoch": 0.42693521719525024, + "flos": 25382007095040.0, + "grad_norm": 1.4351436052366604, + "language_loss": 0.82259512, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8998841, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.13311768, + "step": 7101, + "time_per_iteration": 2.595768451690674 + }, + { + "auxiliary_loss_clip": 0.06466523, + "auxiliary_loss_mlp": 0.01273606, + "balance_loss_clip": 0.06287891, + "balance_loss_mlp": 0.01259099, + "epoch": 0.42699534044791826, + "flos": 18156613196160.0, + "grad_norm": 2.266580923573967, + "language_loss": 0.72800845, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.80540979, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14501953, + "step": 7102, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.06457532, + "auxiliary_loss_mlp": 0.0127168, + "balance_loss_clip": 0.06287985, + "balance_loss_mlp": 0.01257935, + "epoch": 0.4270554637005862, + "flos": 13703422504320.0, + "grad_norm": 2.1781975733094936, + "language_loss": 0.83514953, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.91244167, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13751221, + "step": 7103, + "time_per_iteration": 2.506204128265381 + }, + { + "auxiliary_loss_clip": 0.06465043, + "auxiliary_loss_mlp": 0.01274672, + "balance_loss_clip": 0.0628773, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4271155869532542, + "flos": 17499351369600.0, + "grad_norm": 2.042502996026563, + "language_loss": 0.73773789, + "learning_rate": 2.561545446271294e-06, + "loss": 0.815135, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14916992, + "step": 7104, + "time_per_iteration": 2.5006070137023926 + }, + { + "auxiliary_loss_clip": 0.06459317, + "auxiliary_loss_mlp": 0.01274322, + "balance_loss_clip": 0.0628491, + "balance_loss_mlp": 0.01260494, + "epoch": 0.42717571020592215, + "flos": 32460471659520.0, + "grad_norm": 3.22189729136274, + "language_loss": 0.75052768, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.82786405, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13830566, + "step": 7105, + "time_per_iteration": 2.607759475708008 + }, + { + "auxiliary_loss_clip": 0.06461999, + "auxiliary_loss_mlp": 0.01274519, + "balance_loss_clip": 0.06286199, + "balance_loss_mlp": 0.01261168, + "epoch": 0.4272358334585901, + "flos": 16258606836480.0, + "grad_norm": 17.703344591331568, + "language_loss": 0.77349067, + "learning_rate": 2.560797813088819e-06, + "loss": 0.85085583, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.13354492, + "step": 7106, + "time_per_iteration": 2.4834203720092773 + }, + { + "auxiliary_loss_clip": 0.06461152, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.06287872, + "balance_loss_mlp": 0.01262499, + "epoch": 0.4272959567112581, + "flos": 24205817733120.0, + "grad_norm": 1.9445558892844073, + "language_loss": 0.8013317, + "learning_rate": 2.560423964592229e-06, + "loss": 0.87871039, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14233398, + "step": 7107, + "time_per_iteration": 2.5639657974243164 + }, + { + "auxiliary_loss_clip": 0.06454289, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.06283173, + "balance_loss_mlp": 0.01253424, + "epoch": 0.42735607996392605, + "flos": 27970747787520.0, + "grad_norm": 1.710799907332892, + "language_loss": 0.68469441, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.76191515, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14349365, + "step": 7108, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.06460684, + "auxiliary_loss_mlp": 0.01273244, + "balance_loss_clip": 0.06285615, + "balance_loss_mlp": 0.01258712, + "epoch": 0.427416203216594, + "flos": 20300582816640.0, + "grad_norm": 2.1700047707431342, + "language_loss": 0.72192961, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.79926884, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14520264, + "step": 7109, + "time_per_iteration": 2.5418453216552734 + }, + { + "auxiliary_loss_clip": 0.06462875, + "auxiliary_loss_mlp": 0.01279728, + "balance_loss_clip": 0.06288399, + "balance_loss_mlp": 0.01264159, + "epoch": 0.427476326469262, + "flos": 26951382541440.0, + "grad_norm": 2.7192306397859034, + "language_loss": 0.64651388, + "learning_rate": 2.559302291651174e-06, + "loss": 0.7239399, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15551758, + "step": 7110, + "time_per_iteration": 2.6708264350891113 + }, + { + "auxiliary_loss_clip": 0.06457267, + "auxiliary_loss_mlp": 0.01278945, + "balance_loss_clip": 0.06284395, + "balance_loss_mlp": 0.01264056, + "epoch": 0.42753644972192995, + "flos": 25709967175680.0, + "grad_norm": 2.127603657525877, + "language_loss": 0.76798368, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.84534585, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14880371, + "step": 7111, + "time_per_iteration": 2.678954601287842 + }, + { + "auxiliary_loss_clip": 0.0646024, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01255352, + "epoch": 0.4275965729745979, + "flos": 18772855649280.0, + "grad_norm": 1.9451066993795918, + "language_loss": 0.73479104, + "learning_rate": 2.558554403622845e-06, + "loss": 0.81209064, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.1439209, + "step": 7112, + "time_per_iteration": 2.4913687705993652 + }, + { + "auxiliary_loss_clip": 0.06453889, + "auxiliary_loss_mlp": 0.01274214, + "balance_loss_clip": 0.06283249, + "balance_loss_mlp": 0.01260248, + "epoch": 0.4276566962272659, + "flos": 23770438318080.0, + "grad_norm": 1.6965987454612683, + "language_loss": 0.71646041, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.79374146, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13964844, + "step": 7113, + "time_per_iteration": 2.567722797393799 + }, + { + "auxiliary_loss_clip": 0.06462316, + "auxiliary_loss_mlp": 0.01277106, + "balance_loss_clip": 0.06286302, + "balance_loss_mlp": 0.01262157, + "epoch": 0.42771681947993384, + "flos": 22499156171520.0, + "grad_norm": 1.507728091462329, + "language_loss": 0.61987239, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.69726658, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14953613, + "step": 7114, + "time_per_iteration": 2.5800352096557617 + }, + { + "auxiliary_loss_clip": 0.06466354, + "auxiliary_loss_mlp": 0.01281834, + "balance_loss_clip": 0.06284335, + "balance_loss_mlp": 0.01264895, + "epoch": 0.42777694273260186, + "flos": 25051489464960.0, + "grad_norm": 1.9424022728130763, + "language_loss": 0.64557558, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.72305751, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16943359, + "step": 7115, + "time_per_iteration": 2.625234603881836 + }, + { + "auxiliary_loss_clip": 0.06458592, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01257668, + "epoch": 0.4278370659852698, + "flos": 18667532666880.0, + "grad_norm": 1.4802584121928888, + "language_loss": 0.73841792, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.81572187, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14141846, + "step": 7116, + "time_per_iteration": 2.517512798309326 + }, + { + "auxiliary_loss_clip": 0.06453552, + "auxiliary_loss_mlp": 0.0127651, + "balance_loss_clip": 0.06284202, + "balance_loss_mlp": 0.01262461, + "epoch": 0.4278971892379378, + "flos": 27315666167040.0, + "grad_norm": 1.6819154869474044, + "language_loss": 0.69691694, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.77421755, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14044189, + "step": 7117, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06455907, + "auxiliary_loss_mlp": 0.01274379, + "balance_loss_clip": 0.06285148, + "balance_loss_mlp": 0.0126008, + "epoch": 0.42795731249060576, + "flos": 12892397235840.0, + "grad_norm": 2.190420439429125, + "language_loss": 0.69763142, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.77493429, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14306641, + "step": 7118, + "time_per_iteration": 2.480435609817505 + }, + { + "auxiliary_loss_clip": 0.06457028, + "auxiliary_loss_mlp": 0.01277321, + "balance_loss_clip": 0.06285428, + "balance_loss_mlp": 0.01262109, + "epoch": 0.4280174357432737, + "flos": 33409873146240.0, + "grad_norm": 2.392758427844577, + "language_loss": 0.74691743, + "learning_rate": 2.55593612908444e-06, + "loss": 0.82426095, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.15197754, + "step": 7119, + "time_per_iteration": 2.633418083190918 + }, + { + "auxiliary_loss_clip": 0.06453852, + "auxiliary_loss_mlp": 0.01276265, + "balance_loss_clip": 0.06282485, + "balance_loss_mlp": 0.0126134, + "epoch": 0.4280775589959417, + "flos": 18264871071360.0, + "grad_norm": 2.26485992413173, + "language_loss": 0.75017536, + "learning_rate": 2.555562005426573e-06, + "loss": 0.8274765, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14916992, + "step": 7120, + "time_per_iteration": 2.4857230186462402 + }, + { + "auxiliary_loss_clip": 0.06459665, + "auxiliary_loss_mlp": 0.01279872, + "balance_loss_clip": 0.062869, + "balance_loss_mlp": 0.01265883, + "epoch": 0.42813768224860965, + "flos": 21477820354560.0, + "grad_norm": 1.904077899556691, + "language_loss": 0.77223492, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.8496303, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13989258, + "step": 7121, + "time_per_iteration": 2.547011375427246 + }, + { + "auxiliary_loss_clip": 0.06450777, + "auxiliary_loss_mlp": 0.01281298, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01267255, + "epoch": 0.4281978055012776, + "flos": 15674704859520.0, + "grad_norm": 1.7733631777850345, + "language_loss": 0.85767531, + "learning_rate": 2.554813694924126e-06, + "loss": 0.93499613, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14056396, + "step": 7122, + "time_per_iteration": 2.488633155822754 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01275392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01261022, + "epoch": 0.4282579287539456, + "flos": 17717711909760.0, + "grad_norm": 2.3186837977879886, + "language_loss": 0.8157897, + "learning_rate": 2.554439508107921e-06, + "loss": 0.89309216, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14355469, + "step": 7123, + "time_per_iteration": 3.969069719314575 + }, + { + "auxiliary_loss_clip": 0.06453736, + "auxiliary_loss_mlp": 0.01276304, + "balance_loss_clip": 0.06284729, + "balance_loss_mlp": 0.01262034, + "epoch": 0.42831805200661355, + "flos": 19287171210240.0, + "grad_norm": 1.594767030772038, + "language_loss": 0.80927598, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.88657635, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14257812, + "step": 7124, + "time_per_iteration": 3.901512861251831 + }, + { + "auxiliary_loss_clip": 0.06454194, + "auxiliary_loss_mlp": 0.01273804, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4283781752592815, + "flos": 19798845367680.0, + "grad_norm": 1.7493536594312618, + "language_loss": 0.81056678, + "learning_rate": 2.553691071416498e-06, + "loss": 0.88784677, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.15484619, + "step": 7125, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06453275, + "auxiliary_loss_mlp": 0.0127252, + "balance_loss_clip": 0.06283629, + "balance_loss_mlp": 0.01259467, + "epoch": 0.4284382985119495, + "flos": 16513584410880.0, + "grad_norm": 2.012470201752393, + "language_loss": 0.75256401, + "learning_rate": 2.553316821569659e-06, + "loss": 0.829822, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13037109, + "step": 7126, + "time_per_iteration": 2.550835371017456 + }, + { + "auxiliary_loss_clip": 0.06454661, + "auxiliary_loss_mlp": 0.01269423, + "balance_loss_clip": 0.06280357, + "balance_loss_mlp": 0.01255518, + "epoch": 0.42849842176461744, + "flos": 23337406817280.0, + "grad_norm": 1.7018740006461155, + "language_loss": 0.81619167, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.8934325, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13916016, + "step": 7127, + "time_per_iteration": 2.512833833694458 + }, + { + "auxiliary_loss_clip": 0.06455937, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06282341, + "balance_loss_mlp": 0.01254659, + "epoch": 0.4285585450172854, + "flos": 17280110361600.0, + "grad_norm": 1.7733778395824964, + "language_loss": 0.76877725, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.84603173, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14837646, + "step": 7128, + "time_per_iteration": 2.54837703704834 + }, + { + "auxiliary_loss_clip": 0.06458156, + "auxiliary_loss_mlp": 0.01271641, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01255726, + "epoch": 0.42861866826995343, + "flos": 24286430960640.0, + "grad_norm": 1.8449893243882522, + "language_loss": 0.74647015, + "learning_rate": 2.552193946194937e-06, + "loss": 0.82376814, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15917969, + "step": 7129, + "time_per_iteration": 2.5513017177581787 + }, + { + "auxiliary_loss_clip": 0.06454159, + "auxiliary_loss_mlp": 0.0127295, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.01258949, + "epoch": 0.4286787915226214, + "flos": 24360042372480.0, + "grad_norm": 1.8999084688655365, + "language_loss": 0.7830866, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.86035764, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14007568, + "step": 7130, + "time_per_iteration": 3.9916892051696777 + }, + { + "auxiliary_loss_clip": 0.06456774, + "auxiliary_loss_mlp": 0.01278579, + "balance_loss_clip": 0.06282126, + "balance_loss_mlp": 0.01263618, + "epoch": 0.42873891477528936, + "flos": 15455338070400.0, + "grad_norm": 2.1626861971351263, + "language_loss": 0.73881406, + "learning_rate": 2.551445257891886e-06, + "loss": 0.81616759, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.1496582, + "step": 7131, + "time_per_iteration": 2.504786252975464 + }, + { + "auxiliary_loss_clip": 0.06455156, + "auxiliary_loss_mlp": 0.01273453, + "balance_loss_clip": 0.06282241, + "balance_loss_mlp": 0.01258183, + "epoch": 0.4287990380279573, + "flos": 17645358309120.0, + "grad_norm": 2.0546861067047533, + "language_loss": 0.77884281, + "learning_rate": 2.551070882366973e-06, + "loss": 0.85612893, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15270996, + "step": 7132, + "time_per_iteration": 2.5048811435699463 + }, + { + "auxiliary_loss_clip": 0.06456134, + "auxiliary_loss_mlp": 0.01270516, + "balance_loss_clip": 0.06281912, + "balance_loss_mlp": 0.01254542, + "epoch": 0.4288591612806253, + "flos": 27169701154560.0, + "grad_norm": 1.7726331897563596, + "language_loss": 0.78733218, + "learning_rate": 2.550696485945397e-06, + "loss": 0.86459869, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.1595459, + "step": 7133, + "time_per_iteration": 4.068531036376953 + }, + { + "auxiliary_loss_clip": 0.06450784, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06277733, + "balance_loss_mlp": 0.01254785, + "epoch": 0.42891928453329325, + "flos": 17168540250240.0, + "grad_norm": 1.7118267088696246, + "language_loss": 0.7483775, + "learning_rate": 2.550322068641355e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14068604, + "step": 7134, + "time_per_iteration": 2.504011631011963 + }, + { + "auxiliary_loss_clip": 0.06450233, + "auxiliary_loss_mlp": 0.01272762, + "balance_loss_clip": 0.06279828, + "balance_loss_mlp": 0.0125882, + "epoch": 0.4289794077859612, + "flos": 18192936741120.0, + "grad_norm": 1.9195667435408965, + "language_loss": 0.84458339, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.92181337, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13946533, + "step": 7135, + "time_per_iteration": 2.4924819469451904 + }, + { + "auxiliary_loss_clip": 0.06447092, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279005, + "balance_loss_mlp": 0.01253949, + "epoch": 0.4290395310386292, + "flos": 28264438748160.0, + "grad_norm": 2.116473983113214, + "language_loss": 0.754601, + "learning_rate": 2.549573171442666e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14099121, + "step": 7136, + "time_per_iteration": 2.579450845718384 + }, + { + "auxiliary_loss_clip": 0.06453092, + "auxiliary_loss_mlp": 0.01272367, + "balance_loss_clip": 0.06277236, + "balance_loss_mlp": 0.01257895, + "epoch": 0.42909965429129715, + "flos": 16221528604800.0, + "grad_norm": 1.8728665886520197, + "language_loss": 0.79211873, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.86937326, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14465332, + "step": 7137, + "time_per_iteration": 2.485880136489868 + }, + { + "auxiliary_loss_clip": 0.06452384, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06279657, + "balance_loss_mlp": 0.01257359, + "epoch": 0.4291597775439651, + "flos": 23119633255680.0, + "grad_norm": 1.8713356259191796, + "language_loss": 0.76152903, + "learning_rate": 2.548824190884499e-06, + "loss": 0.83877248, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14605713, + "step": 7138, + "time_per_iteration": 2.5630223751068115 + }, + { + "auxiliary_loss_clip": 0.06367285, + "auxiliary_loss_mlp": 0.01254388, + "balance_loss_clip": 0.06288805, + "balance_loss_mlp": 0.01250711, + "epoch": 0.4292199007966331, + "flos": 67565461703040.0, + "grad_norm": 0.7609122933706777, + "language_loss": 0.5608238, + "learning_rate": 2.548449669381113e-06, + "loss": 0.63704056, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03668213, + "step": 7139, + "time_per_iteration": 3.0345327854156494 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06282055, + "balance_loss_mlp": 0.01256861, + "epoch": 0.42928002404930105, + "flos": 23006008719360.0, + "grad_norm": 1.7405631209015646, + "language_loss": 0.81563902, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.89282477, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13049316, + "step": 7140, + "time_per_iteration": 2.5697882175445557 + }, + { + "auxiliary_loss_clip": 0.06455392, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.01252543, + "epoch": 0.429340147301969, + "flos": 11549432321280.0, + "grad_norm": 1.8011940744465647, + "language_loss": 0.82215559, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.89938176, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14678955, + "step": 7141, + "time_per_iteration": 2.4844813346862793 + }, + { + "auxiliary_loss_clip": 0.0646215, + "auxiliary_loss_mlp": 0.0128237, + "balance_loss_clip": 0.06283965, + "balance_loss_mlp": 0.01266336, + "epoch": 0.42940027055463703, + "flos": 25272030211200.0, + "grad_norm": 2.0081644747821947, + "language_loss": 0.86468136, + "learning_rate": 2.547325980144166e-06, + "loss": 0.94212657, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.16027832, + "step": 7142, + "time_per_iteration": 2.570967674255371 + }, + { + "auxiliary_loss_clip": 0.0645667, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06288485, + "balance_loss_mlp": 0.01255596, + "epoch": 0.429460393807305, + "flos": 23811709253760.0, + "grad_norm": 2.010483035293097, + "language_loss": 0.78394985, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.86120784, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13549805, + "step": 7143, + "time_per_iteration": 2.5245959758758545 + }, + { + "auxiliary_loss_clip": 0.06458203, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257689, + "epoch": 0.42952051705997296, + "flos": 13923502053120.0, + "grad_norm": 1.8646185905931467, + "language_loss": 0.77133417, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.84863412, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14117432, + "step": 7144, + "time_per_iteration": 2.5442261695861816 + }, + { + "auxiliary_loss_clip": 0.0645657, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06283006, + "balance_loss_mlp": 0.0125973, + "epoch": 0.4295806403126409, + "flos": 26767584610560.0, + "grad_norm": 1.5670382727140026, + "language_loss": 0.74293256, + "learning_rate": 2.54620210411532e-06, + "loss": 0.8202396, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14404297, + "step": 7145, + "time_per_iteration": 2.5812947750091553 + }, + { + "auxiliary_loss_clip": 0.06458145, + "auxiliary_loss_mlp": 0.01276391, + "balance_loss_clip": 0.06281675, + "balance_loss_mlp": 0.01261585, + "epoch": 0.4296407635653089, + "flos": 20957760789120.0, + "grad_norm": 2.084760622121642, + "language_loss": 0.79444236, + "learning_rate": 2.545827437329352e-06, + "loss": 0.87178773, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14807129, + "step": 7146, + "time_per_iteration": 2.5411908626556396 + }, + { + "auxiliary_loss_clip": 0.0645076, + "auxiliary_loss_mlp": 0.01276231, + "balance_loss_clip": 0.06280234, + "balance_loss_mlp": 0.01262373, + "epoch": 0.42970088681797686, + "flos": 15857915811840.0, + "grad_norm": 1.9977945232207481, + "language_loss": 0.83012491, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.90739477, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13867188, + "step": 7147, + "time_per_iteration": 2.4752652645111084 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01274227, + "balance_loss_clip": 0.06283284, + "balance_loss_mlp": 0.01258622, + "epoch": 0.4297610100706448, + "flos": 22389179287680.0, + "grad_norm": 1.9494252458685553, + "language_loss": 0.87818855, + "learning_rate": 2.545078041678131e-06, + "loss": 0.95549762, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15612793, + "step": 7148, + "time_per_iteration": 2.5504684448242188 + }, + { + "auxiliary_loss_clip": 0.06459592, + "auxiliary_loss_mlp": 0.0127006, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01255689, + "epoch": 0.4298211333233128, + "flos": 27932705233920.0, + "grad_norm": 1.7901480630114543, + "language_loss": 0.78474885, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.86204541, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14373779, + "step": 7149, + "time_per_iteration": 2.5467026233673096 + }, + { + "auxiliary_loss_clip": 0.06454438, + "auxiliary_loss_mlp": 0.01275691, + "balance_loss_clip": 0.06285315, + "balance_loss_mlp": 0.01261153, + "epoch": 0.42988125657598075, + "flos": 24432479827200.0, + "grad_norm": 1.6909372302648806, + "language_loss": 0.79794931, + "learning_rate": 2.544328563349256e-06, + "loss": 0.87525058, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14550781, + "step": 7150, + "time_per_iteration": 2.5642549991607666 + }, + { + "auxiliary_loss_clip": 0.06463797, + "auxiliary_loss_mlp": 0.01273266, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01256636, + "epoch": 0.4299413798286487, + "flos": 15855400189440.0, + "grad_norm": 1.6104667865383644, + "language_loss": 0.75438166, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16638184, + "step": 7151, + "time_per_iteration": 2.47206711769104 + }, + { + "auxiliary_loss_clip": 0.06463672, + "auxiliary_loss_mlp": 0.01271158, + "balance_loss_clip": 0.06284998, + "balance_loss_mlp": 0.01256179, + "epoch": 0.4300015030813167, + "flos": 22316029073280.0, + "grad_norm": 1.9504143763164294, + "language_loss": 0.70926738, + "learning_rate": 2.543579002456406e-06, + "loss": 0.78661567, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.14984131, + "step": 7152, + "time_per_iteration": 2.541208267211914 + }, + { + "auxiliary_loss_clip": 0.06452823, + "auxiliary_loss_mlp": 0.01271847, + "balance_loss_clip": 0.06279409, + "balance_loss_mlp": 0.01257482, + "epoch": 0.43006162633398465, + "flos": 34906391867520.0, + "grad_norm": 1.81395768481921, + "language_loss": 0.7223562, + "learning_rate": 2.54320419108402e-06, + "loss": 0.79960287, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14361572, + "step": 7153, + "time_per_iteration": 2.6242926120758057 + }, + { + "auxiliary_loss_clip": 0.064519, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06279962, + "balance_loss_mlp": 0.01257018, + "epoch": 0.4301217495866526, + "flos": 15967138008960.0, + "grad_norm": 2.006134184464422, + "language_loss": 0.78977376, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8670066, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14367676, + "step": 7154, + "time_per_iteration": 2.5568442344665527 + }, + { + "auxiliary_loss_clip": 0.06457433, + "auxiliary_loss_mlp": 0.01273105, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01258943, + "epoch": 0.43018187283932063, + "flos": 18776293666560.0, + "grad_norm": 1.5037130128548426, + "language_loss": 0.78947407, + "learning_rate": 2.542454506558389e-06, + "loss": 0.86677945, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14172363, + "step": 7155, + "time_per_iteration": 2.5090463161468506 + }, + { + "auxiliary_loss_clip": 0.06448177, + "auxiliary_loss_mlp": 0.01271989, + "balance_loss_clip": 0.06280203, + "balance_loss_mlp": 0.01258613, + "epoch": 0.4302419960919886, + "flos": 20157007645440.0, + "grad_norm": 4.525310176173048, + "language_loss": 0.89197671, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.96917844, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13397217, + "step": 7156, + "time_per_iteration": 2.5620951652526855 + }, + { + "auxiliary_loss_clip": 0.0645663, + "auxiliary_loss_mlp": 0.01274773, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01259836, + "epoch": 0.43030211934465656, + "flos": 26440001873280.0, + "grad_norm": 2.4796677358200423, + "language_loss": 0.82988536, + "learning_rate": 2.541704739753042e-06, + "loss": 0.90719938, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14929199, + "step": 7157, + "time_per_iteration": 2.5528175830841064 + }, + { + "auxiliary_loss_clip": 0.06457967, + "auxiliary_loss_mlp": 0.01275139, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01258974, + "epoch": 0.43036224259732453, + "flos": 24396114355200.0, + "grad_norm": 1.7333061296854189, + "language_loss": 0.71840358, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.79573464, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16162109, + "step": 7158, + "time_per_iteration": 2.540012836456299 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.01275077, + "balance_loss_clip": 0.06278417, + "balance_loss_mlp": 0.01260355, + "epoch": 0.4304223658499925, + "flos": 17207421344640.0, + "grad_norm": 2.0047997442662684, + "language_loss": 0.82936633, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.9066118, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14709473, + "step": 7159, + "time_per_iteration": 2.550978183746338 + }, + { + "auxiliary_loss_clip": 0.0645431, + "auxiliary_loss_mlp": 0.01270347, + "balance_loss_clip": 0.06281546, + "balance_loss_mlp": 0.01256048, + "epoch": 0.43048248910266046, + "flos": 14908304689920.0, + "grad_norm": 2.57539664943107, + "language_loss": 0.82999021, + "learning_rate": 2.54057993551933e-06, + "loss": 0.90723681, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.1428833, + "step": 7160, + "time_per_iteration": 2.525343894958496 + }, + { + "auxiliary_loss_clip": 0.0645951, + "auxiliary_loss_mlp": 0.01269507, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01252675, + "epoch": 0.4305426123553284, + "flos": 21586245937920.0, + "grad_norm": 3.3699216716451046, + "language_loss": 0.77364504, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.85093522, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16845703, + "step": 7161, + "time_per_iteration": 2.5307719707489014 + }, + { + "auxiliary_loss_clip": 0.06452791, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06280292, + "balance_loss_mlp": 0.01256449, + "epoch": 0.4306027356079964, + "flos": 22607833317120.0, + "grad_norm": 2.044056208596942, + "language_loss": 0.73045391, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.80768597, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13964844, + "step": 7162, + "time_per_iteration": 2.53442645072937 + }, + { + "auxiliary_loss_clip": 0.06358678, + "auxiliary_loss_mlp": 0.01256162, + "balance_loss_clip": 0.06279682, + "balance_loss_mlp": 0.01252738, + "epoch": 0.43066285886066435, + "flos": 70689873548160.0, + "grad_norm": 0.805422068373614, + "language_loss": 0.58694339, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.66309178, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.03433228, + "step": 7163, + "time_per_iteration": 4.420603036880493 + }, + { + "auxiliary_loss_clip": 0.06450315, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06279671, + "balance_loss_mlp": 0.01257298, + "epoch": 0.4307229821133323, + "flos": 26727236069760.0, + "grad_norm": 1.7043821860128514, + "language_loss": 0.79015797, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.86737275, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13842773, + "step": 7164, + "time_per_iteration": 4.077051162719727 + }, + { + "auxiliary_loss_clip": 0.0645581, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06279337, + "balance_loss_mlp": 0.01254222, + "epoch": 0.4307831053660003, + "flos": 26184311539200.0, + "grad_norm": 1.6263476545367235, + "language_loss": 0.68622434, + "learning_rate": 2.538704852009177e-06, + "loss": 0.76347512, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.1505127, + "step": 7165, + "time_per_iteration": 2.5447044372558594 + }, + { + "auxiliary_loss_clip": 0.06454252, + "auxiliary_loss_mlp": 0.01269461, + "balance_loss_clip": 0.06280573, + "balance_loss_mlp": 0.01254733, + "epoch": 0.43084322861866825, + "flos": 18915298790400.0, + "grad_norm": 2.036386887615401, + "language_loss": 0.75601453, + "learning_rate": 2.538329773967034e-06, + "loss": 0.83325171, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14758301, + "step": 7166, + "time_per_iteration": 2.5380423069000244 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06278174, + "balance_loss_mlp": 0.0125401, + "epoch": 0.4309033518713362, + "flos": 26440211508480.0, + "grad_norm": 1.6055464610704053, + "language_loss": 0.72472453, + "learning_rate": 2.537954675511372e-06, + "loss": 0.80187303, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13415527, + "step": 7167, + "time_per_iteration": 2.581911563873291 + }, + { + "auxiliary_loss_clip": 0.06445278, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01253398, + "epoch": 0.43096347512400424, + "flos": 21219362835840.0, + "grad_norm": 1.5535022771303773, + "language_loss": 0.78678393, + "learning_rate": 2.537579556656414e-06, + "loss": 0.86391199, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14135742, + "step": 7168, + "time_per_iteration": 2.5395426750183105 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.0127075, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01257095, + "epoch": 0.4310235983766722, + "flos": 16544918638080.0, + "grad_norm": 2.3704233546720936, + "language_loss": 0.82314277, + "learning_rate": 2.537204417416387e-06, + "loss": 0.90034759, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13647461, + "step": 7169, + "time_per_iteration": 3.8934504985809326 + }, + { + "auxiliary_loss_clip": 0.06353073, + "auxiliary_loss_mlp": 0.01255187, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01251897, + "epoch": 0.43108372162934017, + "flos": 64794893650560.0, + "grad_norm": 0.6586067859139012, + "language_loss": 0.60826671, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6843493, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03295898, + "step": 7170, + "time_per_iteration": 3.303295612335205 + }, + { + "auxiliary_loss_clip": 0.06446448, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01253841, + "epoch": 0.43114384488200813, + "flos": 13449241543680.0, + "grad_norm": 1.7965809828184895, + "language_loss": 0.76463991, + "learning_rate": 2.536454077838021e-06, + "loss": 0.84178072, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13787842, + "step": 7171, + "time_per_iteration": 2.4991650581359863 + }, + { + "auxiliary_loss_clip": 0.06446211, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01253592, + "epoch": 0.4312039681346761, + "flos": 26293911079680.0, + "grad_norm": 1.4736819236139371, + "language_loss": 0.77570975, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.8528471, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13934326, + "step": 7172, + "time_per_iteration": 2.540095567703247 + }, + { + "auxiliary_loss_clip": 0.06448045, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256449, + "epoch": 0.43126409138734406, + "flos": 20383040833920.0, + "grad_norm": 1.8735364024745536, + "language_loss": 0.76837397, + "learning_rate": 2.535703656890086e-06, + "loss": 0.84556675, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14776611, + "step": 7173, + "time_per_iteration": 3.998828887939453 + }, + { + "auxiliary_loss_clip": 0.06449778, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06280752, + "balance_loss_mlp": 0.0125529, + "epoch": 0.431324214640012, + "flos": 22128918906240.0, + "grad_norm": 1.4124937065278635, + "language_loss": 0.76940411, + "learning_rate": 2.5353284159381e-06, + "loss": 0.84659261, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13800049, + "step": 7174, + "time_per_iteration": 2.510742425918579 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 0.06275856, + "balance_loss_mlp": 0.01256477, + "epoch": 0.43138433789268, + "flos": 15236306697600.0, + "grad_norm": 1.9136821796322663, + "language_loss": 0.82178259, + "learning_rate": 2.534953154686407e-06, + "loss": 0.89898002, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.15185547, + "step": 7175, + "time_per_iteration": 2.5317423343658447 + }, + { + "auxiliary_loss_clip": 0.06456869, + "auxiliary_loss_mlp": 0.01274036, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01256935, + "epoch": 0.43144446114534796, + "flos": 18156151998720.0, + "grad_norm": 2.207412358761708, + "language_loss": 0.74869847, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.82600749, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.17095947, + "step": 7176, + "time_per_iteration": 2.4871389865875244 + }, + { + "auxiliary_loss_clip": 0.0645103, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01255565, + "epoch": 0.4315045843980159, + "flos": 22936506157440.0, + "grad_norm": 1.949576719813971, + "language_loss": 0.73992217, + "learning_rate": 2.534202571340819e-06, + "loss": 0.81713092, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14294434, + "step": 7177, + "time_per_iteration": 2.5317373275756836 + }, + { + "auxiliary_loss_clip": 0.06461225, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06277613, + "balance_loss_mlp": 0.01253667, + "epoch": 0.4315647076506839, + "flos": 22133321245440.0, + "grad_norm": 1.7707547745548928, + "language_loss": 0.81576592, + "learning_rate": 2.533827249275387e-06, + "loss": 0.89307833, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16357422, + "step": 7178, + "time_per_iteration": 2.5210797786712646 + }, + { + "auxiliary_loss_clip": 0.06445872, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01257962, + "epoch": 0.43162483090335185, + "flos": 26878567743360.0, + "grad_norm": 1.4959775860860902, + "language_loss": 0.84818423, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.92535609, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13360596, + "step": 7179, + "time_per_iteration": 2.6229355335235596 + }, + { + "auxiliary_loss_clip": 0.06446353, + "auxiliary_loss_mlp": 0.01270616, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01256096, + "epoch": 0.4316849541560198, + "flos": 13917464559360.0, + "grad_norm": 1.6356598233983888, + "language_loss": 0.75595218, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.83312184, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7180, + "time_per_iteration": 2.4882874488830566 + }, + { + "auxiliary_loss_clip": 0.06450133, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01251023, + "epoch": 0.4317450774086878, + "flos": 16440685758720.0, + "grad_norm": 1.8060434620212955, + "language_loss": 0.81820869, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.89537263, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15252686, + "step": 7181, + "time_per_iteration": 2.534747838973999 + }, + { + "auxiliary_loss_clip": 0.0644898, + "auxiliary_loss_mlp": 0.0127112, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256189, + "epoch": 0.4318052006613558, + "flos": 20560675489920.0, + "grad_norm": 1.632078496987146, + "language_loss": 0.88980561, + "learning_rate": 2.532325758728165e-06, + "loss": 0.96700662, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14923096, + "step": 7182, + "time_per_iteration": 2.493427038192749 + }, + { + "auxiliary_loss_clip": 0.06446697, + "auxiliary_loss_mlp": 0.01267064, + "balance_loss_clip": 0.06278539, + "balance_loss_mlp": 0.01254052, + "epoch": 0.43186532391402377, + "flos": 22826613127680.0, + "grad_norm": 1.9212724157627075, + "language_loss": 0.75858486, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.83572245, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13012695, + "step": 7183, + "time_per_iteration": 2.552116870880127 + }, + { + "auxiliary_loss_clip": 0.06451686, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06278371, + "balance_loss_mlp": 0.01253923, + "epoch": 0.43192544716669173, + "flos": 25563624819840.0, + "grad_norm": 1.5103875784905794, + "language_loss": 0.77652711, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.85371935, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13604736, + "step": 7184, + "time_per_iteration": 2.5299277305603027 + }, + { + "auxiliary_loss_clip": 0.06444119, + "auxiliary_loss_mlp": 0.01269203, + "balance_loss_clip": 0.06279948, + "balance_loss_mlp": 0.01255494, + "epoch": 0.4319855704193597, + "flos": 30962317783680.0, + "grad_norm": 1.4924548432613554, + "language_loss": 0.73502755, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.81216079, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.13720703, + "step": 7185, + "time_per_iteration": 2.5939247608184814 + }, + { + "auxiliary_loss_clip": 0.06455707, + "auxiliary_loss_mlp": 0.01271443, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.0125684, + "epoch": 0.43204569367202766, + "flos": 24244824608640.0, + "grad_norm": 2.4112385113933015, + "language_loss": 0.75683951, + "learning_rate": 2.530823945207421e-06, + "loss": 0.83411103, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14611816, + "step": 7186, + "time_per_iteration": 2.543679714202881 + }, + { + "auxiliary_loss_clip": 0.06451818, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06278853, + "balance_loss_mlp": 0.01259068, + "epoch": 0.43210581692469563, + "flos": 18413058216960.0, + "grad_norm": 2.2976206703160065, + "language_loss": 0.76516449, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.84241354, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14038086, + "step": 7187, + "time_per_iteration": 2.530064105987549 + }, + { + "auxiliary_loss_clip": 0.06368419, + "auxiliary_loss_mlp": 0.01252589, + "balance_loss_clip": 0.06291005, + "balance_loss_mlp": 0.01249776, + "epoch": 0.4321659401773636, + "flos": 49851718133760.0, + "grad_norm": 0.8382360401327144, + "language_loss": 0.68072379, + "learning_rate": 2.530072917616714e-06, + "loss": 0.75693387, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02812195, + "step": 7188, + "time_per_iteration": 3.1670610904693604 + }, + { + "auxiliary_loss_clip": 0.06446176, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43222606343003156, + "flos": 17134229203200.0, + "grad_norm": 1.9056972558163987, + "language_loss": 0.7844317, + "learning_rate": 2.529697373663614e-06, + "loss": 0.86159372, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13110352, + "step": 7189, + "time_per_iteration": 2.491743564605713 + }, + { + "auxiliary_loss_clip": 0.06457567, + "auxiliary_loss_mlp": 0.01270927, + "balance_loss_clip": 0.06278813, + "balance_loss_mlp": 0.01255906, + "epoch": 0.4322861866826995, + "flos": 22756984784640.0, + "grad_norm": 1.8601510823080152, + "language_loss": 0.72126836, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.79855329, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15020752, + "step": 7190, + "time_per_iteration": 2.5745973587036133 + }, + { + "auxiliary_loss_clip": 0.06452946, + "auxiliary_loss_mlp": 0.01274284, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.0125992, + "epoch": 0.4323463099353675, + "flos": 27899400435840.0, + "grad_norm": 1.5852812804273753, + "language_loss": 0.79949737, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.87676966, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14355469, + "step": 7191, + "time_per_iteration": 2.5719873905181885 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01255694, + "epoch": 0.43240643318803546, + "flos": 21620892401280.0, + "grad_norm": 3.0880415359088467, + "language_loss": 0.75279927, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.82998139, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14056396, + "step": 7192, + "time_per_iteration": 2.536587715148926 + }, + { + "auxiliary_loss_clip": 0.0644784, + "auxiliary_loss_mlp": 0.01276118, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01260883, + "epoch": 0.4324665564407034, + "flos": 17562774510720.0, + "grad_norm": 2.069328799544239, + "language_loss": 0.79199994, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.86923951, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15216064, + "step": 7193, + "time_per_iteration": 2.483978033065796 + }, + { + "auxiliary_loss_clip": 0.06449077, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.01263212, + "epoch": 0.4325266796933714, + "flos": 18407775409920.0, + "grad_norm": 2.329186427032778, + "language_loss": 0.76053572, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.83780271, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14404297, + "step": 7194, + "time_per_iteration": 2.5057263374328613 + }, + { + "auxiliary_loss_clip": 0.06451394, + "auxiliary_loss_mlp": 0.01275378, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01260847, + "epoch": 0.4325868029460394, + "flos": 22571342064000.0, + "grad_norm": 1.9582306658700896, + "language_loss": 0.60073519, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.67800295, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14532471, + "step": 7195, + "time_per_iteration": 2.5116991996765137 + }, + { + "auxiliary_loss_clip": 0.06458029, + "auxiliary_loss_mlp": 0.01275051, + "balance_loss_clip": 0.06281463, + "balance_loss_mlp": 0.01259989, + "epoch": 0.43264692619870737, + "flos": 14609834046720.0, + "grad_norm": 1.968403141706004, + "language_loss": 0.65685856, + "learning_rate": 2.527068004376515e-06, + "loss": 0.73418939, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1506958, + "step": 7196, + "time_per_iteration": 2.5037827491760254 + }, + { + "auxiliary_loss_clip": 0.06456476, + "auxiliary_loss_mlp": 0.01272338, + "balance_loss_clip": 0.06280259, + "balance_loss_mlp": 0.01257151, + "epoch": 0.43270704945137534, + "flos": 21507184010880.0, + "grad_norm": 2.17558250449299, + "language_loss": 0.72638965, + "learning_rate": 2.526692300132797e-06, + "loss": 0.8036778, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15197754, + "step": 7197, + "time_per_iteration": 2.4931299686431885 + }, + { + "auxiliary_loss_clip": 0.0645181, + "auxiliary_loss_mlp": 0.01280731, + "balance_loss_clip": 0.06284913, + "balance_loss_mlp": 0.01265627, + "epoch": 0.4327671727040433, + "flos": 25162975722240.0, + "grad_norm": 1.6800922175899422, + "language_loss": 0.72821289, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.8055383, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15100098, + "step": 7198, + "time_per_iteration": 2.574894428253174 + }, + { + "auxiliary_loss_clip": 0.06448364, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01254969, + "epoch": 0.43282729595671127, + "flos": 25454192987520.0, + "grad_norm": 1.3407856907116962, + "language_loss": 0.8128798, + "learning_rate": 2.525940831742934e-06, + "loss": 0.89005339, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14013672, + "step": 7199, + "time_per_iteration": 2.5314407348632812 + }, + { + "auxiliary_loss_clip": 0.06450363, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.01255918, + "epoch": 0.43288741920937923, + "flos": 24131661269760.0, + "grad_norm": 2.374744791798318, + "language_loss": 0.68757379, + "learning_rate": 2.525565067625286e-06, + "loss": 0.76477665, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14013672, + "step": 7200, + "time_per_iteration": 2.5569095611572266 + }, + { + "auxiliary_loss_clip": 0.06449814, + "auxiliary_loss_mlp": 0.01269719, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01254925, + "epoch": 0.4329475424620472, + "flos": 19210415270400.0, + "grad_norm": 1.7756006077325563, + "language_loss": 0.87039292, + "learning_rate": 2.525189283578157e-06, + "loss": 0.94758821, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14807129, + "step": 7201, + "time_per_iteration": 2.4946835041046143 + }, + { + "auxiliary_loss_clip": 0.06464264, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06283499, + "balance_loss_mlp": 0.01255016, + "epoch": 0.43300766571471516, + "flos": 22645037329920.0, + "grad_norm": 5.903168179153311, + "language_loss": 0.64564252, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.72300375, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.16845703, + "step": 7202, + "time_per_iteration": 2.5667803287506104 + }, + { + "auxiliary_loss_clip": 0.06448028, + "auxiliary_loss_mlp": 0.01268297, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4330677889673831, + "flos": 22126570992000.0, + "grad_norm": 2.072135817395126, + "language_loss": 0.8230809, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.90024418, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13470459, + "step": 7203, + "time_per_iteration": 5.375681161880493 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01268927, + "balance_loss_clip": 0.06282033, + "balance_loss_mlp": 0.01254169, + "epoch": 0.4331279122200511, + "flos": 23228184620160.0, + "grad_norm": 2.3968905297379024, + "language_loss": 0.81134045, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.88861251, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14764404, + "step": 7204, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06450962, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06281083, + "balance_loss_mlp": 0.0125691, + "epoch": 0.43318803547271906, + "flos": 18265625758080.0, + "grad_norm": 2.088854485199162, + "language_loss": 0.7413221, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.81853694, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.13604736, + "step": 7205, + "time_per_iteration": 2.519554376602173 + }, + { + "auxiliary_loss_clip": 0.0644919, + "auxiliary_loss_mlp": 0.01273515, + "balance_loss_clip": 0.06284859, + "balance_loss_mlp": 0.01259908, + "epoch": 0.433248158725387, + "flos": 27425936540160.0, + "grad_norm": 1.5872196628882773, + "language_loss": 0.75603741, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.83326447, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13598633, + "step": 7206, + "time_per_iteration": 2.5732641220092773 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01254728, + "epoch": 0.433308281978055, + "flos": 23224075770240.0, + "grad_norm": 1.828436296505125, + "language_loss": 0.78923273, + "learning_rate": 2.522934161574342e-06, + "loss": 0.86642796, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1496582, + "step": 7207, + "time_per_iteration": 2.6846628189086914 + }, + { + "auxiliary_loss_clip": 0.06456017, + "auxiliary_loss_mlp": 0.01270448, + "balance_loss_clip": 0.06279423, + "balance_loss_mlp": 0.0125513, + "epoch": 0.433368405230723, + "flos": 15857999665920.0, + "grad_norm": 2.196810095173743, + "language_loss": 0.81095958, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.8882243, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15307617, + "step": 7208, + "time_per_iteration": 2.4724419116973877 + }, + { + "auxiliary_loss_clip": 0.0645436, + "auxiliary_loss_mlp": 0.01269383, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.0125481, + "epoch": 0.433428528483391, + "flos": 19032109781760.0, + "grad_norm": 2.1243132825557107, + "language_loss": 0.71321076, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.79044819, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14581299, + "step": 7209, + "time_per_iteration": 3.9143481254577637 + }, + { + "auxiliary_loss_clip": 0.06450495, + "auxiliary_loss_mlp": 0.01271038, + "balance_loss_clip": 0.06281973, + "balance_loss_mlp": 0.01255517, + "epoch": 0.43348865173605894, + "flos": 24725290320000.0, + "grad_norm": 1.4388803928851785, + "language_loss": 0.8148647, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.89208007, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15515137, + "step": 7210, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06451392, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06281275, + "balance_loss_mlp": 0.01261045, + "epoch": 0.4335487749887269, + "flos": 22097165408640.0, + "grad_norm": 1.8576931130518815, + "language_loss": 0.82474005, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.90199542, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13110352, + "step": 7211, + "time_per_iteration": 2.491514205932617 + }, + { + "auxiliary_loss_clip": 0.06452142, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06280628, + "balance_loss_mlp": 0.01258362, + "epoch": 0.43360889824139487, + "flos": 22389556631040.0, + "grad_norm": 12.106558391415842, + "language_loss": 0.7536357, + "learning_rate": 2.521054347790029e-06, + "loss": 0.83087522, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13452148, + "step": 7212, + "time_per_iteration": 2.551093816757202 + }, + { + "auxiliary_loss_clip": 0.06452519, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06284005, + "balance_loss_mlp": 0.01259517, + "epoch": 0.43366902149406283, + "flos": 17533746270720.0, + "grad_norm": 1.8081714291238689, + "language_loss": 0.77247733, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.84972358, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1260376, + "step": 7213, + "time_per_iteration": 3.8823790550231934 + }, + { + "auxiliary_loss_clip": 0.06452443, + "auxiliary_loss_mlp": 0.01274704, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01261245, + "epoch": 0.4337291447467308, + "flos": 19028126712960.0, + "grad_norm": 1.4293111519880635, + "language_loss": 0.65090191, + "learning_rate": 2.520302283867471e-06, + "loss": 0.72817338, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13446045, + "step": 7214, + "time_per_iteration": 2.512341260910034 + }, + { + "auxiliary_loss_clip": 0.0644484, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01255319, + "epoch": 0.43378926799939876, + "flos": 27241216214400.0, + "grad_norm": 1.6847650033402397, + "language_loss": 0.7180531, + "learning_rate": 2.519926222304191e-06, + "loss": 0.79518223, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.12750244, + "step": 7215, + "time_per_iteration": 2.5413544178009033 + }, + { + "auxiliary_loss_clip": 0.06451561, + "auxiliary_loss_mlp": 0.01271937, + "balance_loss_clip": 0.06284516, + "balance_loss_mlp": 0.01258365, + "epoch": 0.43384939125206673, + "flos": 15966592957440.0, + "grad_norm": 1.7641597528508168, + "language_loss": 0.75291193, + "learning_rate": 2.519550141025255e-06, + "loss": 0.83014691, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13574219, + "step": 7216, + "time_per_iteration": 2.539677143096924 + }, + { + "auxiliary_loss_clip": 0.06459753, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06280532, + "balance_loss_mlp": 0.01254256, + "epoch": 0.4339095145047347, + "flos": 21798736692480.0, + "grad_norm": 2.367070732862923, + "language_loss": 0.7623983, + "learning_rate": 2.519174040044927e-06, + "loss": 0.8396852, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14685059, + "step": 7217, + "time_per_iteration": 2.491522789001465 + }, + { + "auxiliary_loss_clip": 0.06451164, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.0628095, + "balance_loss_mlp": 0.01254389, + "epoch": 0.43396963775740266, + "flos": 14215054734720.0, + "grad_norm": 2.758270274773255, + "language_loss": 0.74231893, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.81950986, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13531494, + "step": 7218, + "time_per_iteration": 2.5123910903930664 + }, + { + "auxiliary_loss_clip": 0.06450492, + "auxiliary_loss_mlp": 0.01270563, + "balance_loss_clip": 0.06277994, + "balance_loss_mlp": 0.01256443, + "epoch": 0.4340297610100706, + "flos": 19725150101760.0, + "grad_norm": 1.5975368135070402, + "language_loss": 0.69353253, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.77074307, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14117432, + "step": 7219, + "time_per_iteration": 2.502150297164917 + }, + { + "auxiliary_loss_clip": 0.06450121, + "auxiliary_loss_mlp": 0.01273865, + "balance_loss_clip": 0.06280973, + "balance_loss_mlp": 0.01259482, + "epoch": 0.4340898842627386, + "flos": 18959588472960.0, + "grad_norm": 2.696483499139917, + "language_loss": 0.77797616, + "learning_rate": 2.518045619038202e-06, + "loss": 0.85521603, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1439209, + "step": 7220, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.06449743, + "auxiliary_loss_mlp": 0.01270897, + "balance_loss_clip": 0.06280366, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4341500075154066, + "flos": 22024895662080.0, + "grad_norm": 2.140213938529436, + "language_loss": 0.69858402, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.77579045, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13562012, + "step": 7221, + "time_per_iteration": 2.556913137435913 + }, + { + "auxiliary_loss_clip": 0.06448823, + "auxiliary_loss_mlp": 0.01267968, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01254188, + "epoch": 0.4342101307680746, + "flos": 23588527104000.0, + "grad_norm": 1.6725579163220456, + "language_loss": 0.65062654, + "learning_rate": 2.51729324012157e-06, + "loss": 0.72779441, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13793945, + "step": 7222, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0644563, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06277044, + "balance_loss_mlp": 0.01254912, + "epoch": 0.43427025402074254, + "flos": 17973821514240.0, + "grad_norm": 2.158287657708821, + "language_loss": 0.73335516, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.81050307, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14257812, + "step": 7223, + "time_per_iteration": 2.5124166011810303 + }, + { + "auxiliary_loss_clip": 0.06448437, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06275682, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4343303772734105, + "flos": 26293575663360.0, + "grad_norm": 1.9810355285503365, + "language_loss": 0.94283241, + "learning_rate": 2.516540782741694e-06, + "loss": 1.02002597, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13458252, + "step": 7224, + "time_per_iteration": 2.5581512451171875 + }, + { + "auxiliary_loss_clip": 0.06445128, + "auxiliary_loss_mlp": 0.01270275, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01257383, + "epoch": 0.43439050052607847, + "flos": 26841279876480.0, + "grad_norm": 2.0217716161026624, + "language_loss": 0.61832798, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.69548196, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.12890625, + "step": 7225, + "time_per_iteration": 2.5797905921936035 + }, + { + "auxiliary_loss_clip": 0.06447432, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01255895, + "epoch": 0.43445062377874644, + "flos": 21404083161600.0, + "grad_norm": 2.452465231522654, + "language_loss": 0.77966076, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.8568306, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13653564, + "step": 7226, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01273195, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01260553, + "epoch": 0.4345107470314144, + "flos": 19908151418880.0, + "grad_norm": 1.6845072318289191, + "language_loss": 0.84942114, + "learning_rate": 2.515411949802964e-06, + "loss": 0.92659688, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12652588, + "step": 7227, + "time_per_iteration": 2.525317430496216 + }, + { + "auxiliary_loss_clip": 0.06449986, + "auxiliary_loss_mlp": 0.01270041, + "balance_loss_clip": 0.06281552, + "balance_loss_mlp": 0.0125601, + "epoch": 0.43457087028408237, + "flos": 26439876092160.0, + "grad_norm": 2.0880007397823714, + "language_loss": 0.77098775, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.84818804, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14025879, + "step": 7228, + "time_per_iteration": 2.5491206645965576 + }, + { + "auxiliary_loss_clip": 0.06447831, + "auxiliary_loss_mlp": 0.01269154, + "balance_loss_clip": 0.06281967, + "balance_loss_mlp": 0.0125486, + "epoch": 0.43463099353675033, + "flos": 31876947025920.0, + "grad_norm": 1.527689344505128, + "language_loss": 0.80533445, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.88250422, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14294434, + "step": 7229, + "time_per_iteration": 2.6139633655548096 + }, + { + "auxiliary_loss_clip": 0.06448658, + "auxiliary_loss_mlp": 0.01272316, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01258208, + "epoch": 0.4346911167894183, + "flos": 24578109423360.0, + "grad_norm": 1.897670481755329, + "language_loss": 0.8187139, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.89592373, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14117432, + "step": 7230, + "time_per_iteration": 2.535597085952759 + }, + { + "auxiliary_loss_clip": 0.06454149, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06280425, + "balance_loss_mlp": 0.01258849, + "epoch": 0.43475124004208626, + "flos": 17096102795520.0, + "grad_norm": 2.6326033188165012, + "language_loss": 0.77091682, + "learning_rate": 2.513906565661973e-06, + "loss": 0.84818828, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14135742, + "step": 7231, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.064488, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01262162, + "epoch": 0.4348113632947542, + "flos": 26111874084480.0, + "grad_norm": 2.1662461953899044, + "language_loss": 0.69288278, + "learning_rate": 2.513530170872575e-06, + "loss": 0.77011836, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1260376, + "step": 7232, + "time_per_iteration": 2.547469139099121 + }, + { + "auxiliary_loss_clip": 0.0645097, + "auxiliary_loss_mlp": 0.01271517, + "balance_loss_clip": 0.06279375, + "balance_loss_mlp": 0.01256431, + "epoch": 0.4348714865474222, + "flos": 34208446083840.0, + "grad_norm": 2.030594980717477, + "language_loss": 0.72046328, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.79768813, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15075684, + "step": 7233, + "time_per_iteration": 2.633953332901001 + }, + { + "auxiliary_loss_clip": 0.06453332, + "auxiliary_loss_mlp": 0.01271348, + "balance_loss_clip": 0.06279553, + "balance_loss_mlp": 0.01257466, + "epoch": 0.43493160980009016, + "flos": 31545045803520.0, + "grad_norm": 1.5667863682634524, + "language_loss": 0.75517476, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.83242154, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.13885498, + "step": 7234, + "time_per_iteration": 2.592467784881592 + }, + { + "auxiliary_loss_clip": 0.06464201, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286918, + "balance_loss_mlp": 0.01258003, + "epoch": 0.4349917330527582, + "flos": 24068238128640.0, + "grad_norm": 2.6345915143615284, + "language_loss": 0.5890404, + "learning_rate": 2.512400869722782e-06, + "loss": 0.6664077, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14520264, + "step": 7235, + "time_per_iteration": 2.5652947425842285 + }, + { + "auxiliary_loss_clip": 0.06449015, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01257754, + "epoch": 0.43505185630542614, + "flos": 30527315712000.0, + "grad_norm": 1.3439257210534017, + "language_loss": 0.77555895, + "learning_rate": 2.512024397126566e-06, + "loss": 0.85276687, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14019775, + "step": 7236, + "time_per_iteration": 2.600897789001465 + }, + { + "auxiliary_loss_clip": 0.06450135, + "auxiliary_loss_mlp": 0.01275561, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01260833, + "epoch": 0.4351119795580941, + "flos": 15739427738880.0, + "grad_norm": 1.5753739577535406, + "language_loss": 0.81058431, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.88784134, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.14733887, + "step": 7237, + "time_per_iteration": 2.515153169631958 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627768, + "balance_loss_mlp": 0.0125607, + "epoch": 0.4351721028107621, + "flos": 18737328718080.0, + "grad_norm": 1.5657016421471992, + "language_loss": 0.63616467, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.71332717, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14129639, + "step": 7238, + "time_per_iteration": 2.4845099449157715 + }, + { + "auxiliary_loss_clip": 0.06448185, + "auxiliary_loss_mlp": 0.01273501, + "balance_loss_clip": 0.06281941, + "balance_loss_mlp": 0.01260162, + "epoch": 0.43523222606343004, + "flos": 25233652241280.0, + "grad_norm": 1.9152472058436172, + "language_loss": 0.85898602, + "learning_rate": 2.510894862898928e-06, + "loss": 0.93620288, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 7239, + "time_per_iteration": 2.579202175140381 + }, + { + "auxiliary_loss_clip": 0.06452584, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06283215, + "balance_loss_mlp": 0.01253987, + "epoch": 0.435292349316098, + "flos": 22715504213760.0, + "grad_norm": 1.439066736410537, + "language_loss": 0.72456282, + "learning_rate": 2.510518312724309e-06, + "loss": 0.80176651, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13793945, + "step": 7240, + "time_per_iteration": 2.5192179679870605 + }, + { + "auxiliary_loss_clip": 0.06454788, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06282151, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43535247256876597, + "flos": 25783033536000.0, + "grad_norm": 2.0220617163145485, + "language_loss": 0.81900156, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.89625818, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1394043, + "step": 7241, + "time_per_iteration": 2.5792059898376465 + }, + { + "auxiliary_loss_clip": 0.06460294, + "auxiliary_loss_mlp": 0.01275581, + "balance_loss_clip": 0.0628238, + "balance_loss_mlp": 0.01260346, + "epoch": 0.43541259582143393, + "flos": 17533578562560.0, + "grad_norm": 2.581589278543144, + "language_loss": 0.79383838, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8711971, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15246582, + "step": 7242, + "time_per_iteration": 3.918156623840332 + }, + { + "auxiliary_loss_clip": 0.06452459, + "auxiliary_loss_mlp": 0.01271144, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01257405, + "epoch": 0.4354727190741019, + "flos": 15200612058240.0, + "grad_norm": 2.430343835688426, + "language_loss": 0.69088292, + "learning_rate": 2.509388546104138e-06, + "loss": 0.76811898, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.13745117, + "step": 7243, + "time_per_iteration": 3.900606632232666 + }, + { + "auxiliary_loss_clip": 0.06444837, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258655, + "epoch": 0.43553284232676986, + "flos": 16654015054080.0, + "grad_norm": 1.5901355562967736, + "language_loss": 0.81475091, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.89191759, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1317749, + "step": 7244, + "time_per_iteration": 2.581033229827881 + }, + { + "auxiliary_loss_clip": 0.06446069, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.01255596, + "epoch": 0.43559296557943783, + "flos": 23407035160320.0, + "grad_norm": 1.5978807757182665, + "language_loss": 0.73241115, + "learning_rate": 2.508635271753234e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.12554932, + "step": 7245, + "time_per_iteration": 2.5589826107025146 + }, + { + "auxiliary_loss_clip": 0.06452223, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06282671, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4356530888321058, + "flos": 22425628613760.0, + "grad_norm": 1.6720109050482812, + "language_loss": 0.77539527, + "learning_rate": 2.508258605639389e-06, + "loss": 0.85261637, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7246, + "time_per_iteration": 2.593538999557495 + }, + { + "auxiliary_loss_clip": 0.06448724, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06280839, + "balance_loss_mlp": 0.01254033, + "epoch": 0.43571321208477376, + "flos": 21622527555840.0, + "grad_norm": 3.3071750834647426, + "language_loss": 0.86156344, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.93872631, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13531494, + "step": 7247, + "time_per_iteration": 2.5369882583618164 + }, + { + "auxiliary_loss_clip": 0.06446265, + "auxiliary_loss_mlp": 0.01269788, + "balance_loss_clip": 0.06277846, + "balance_loss_mlp": 0.01257194, + "epoch": 0.4357733353374418, + "flos": 23994081665280.0, + "grad_norm": 1.7467086672612386, + "language_loss": 0.73132598, + "learning_rate": 2.507505215606333e-06, + "loss": 0.80848658, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.12597656, + "step": 7248, + "time_per_iteration": 3.9830687046051025 + }, + { + "auxiliary_loss_clip": 0.06447548, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06279291, + "balance_loss_mlp": 0.01254022, + "epoch": 0.43583345859010975, + "flos": 25271736721920.0, + "grad_norm": 1.509350817375945, + "language_loss": 0.87227005, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.94941938, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13378906, + "step": 7249, + "time_per_iteration": 2.565516948699951 + }, + { + "auxiliary_loss_clip": 0.06451611, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01254223, + "epoch": 0.4358935818427777, + "flos": 23703115962240.0, + "grad_norm": 1.8925784396827436, + "language_loss": 0.8199448, + "learning_rate": 2.506751748594683e-06, + "loss": 0.89714003, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13690186, + "step": 7250, + "time_per_iteration": 2.5410354137420654 + }, + { + "auxiliary_loss_clip": 0.06454265, + "auxiliary_loss_mlp": 0.01273165, + "balance_loss_clip": 0.06283678, + "balance_loss_mlp": 0.01258901, + "epoch": 0.4359537050954457, + "flos": 29540416723200.0, + "grad_norm": 2.0613712873147723, + "language_loss": 0.85409963, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.93137395, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14251709, + "step": 7251, + "time_per_iteration": 2.5893919467926025 + }, + { + "auxiliary_loss_clip": 0.06448197, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06280132, + "balance_loss_mlp": 0.01257431, + "epoch": 0.43601382834811364, + "flos": 22717935982080.0, + "grad_norm": 1.9454057009257966, + "language_loss": 0.69792974, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.77511865, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13262939, + "step": 7252, + "time_per_iteration": 2.518423080444336 + }, + { + "auxiliary_loss_clip": 0.06442783, + "auxiliary_loss_mlp": 0.01269502, + "balance_loss_clip": 0.06278728, + "balance_loss_mlp": 0.01256336, + "epoch": 0.4360739516007816, + "flos": 19104714944640.0, + "grad_norm": 1.67696041016681, + "language_loss": 0.83826983, + "learning_rate": 2.505621403992348e-06, + "loss": 0.91539264, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13146973, + "step": 7253, + "time_per_iteration": 3.929287910461426 + }, + { + "auxiliary_loss_clip": 0.06446494, + "auxiliary_loss_mlp": 0.01271781, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01257095, + "epoch": 0.43613407485344957, + "flos": 23411185937280.0, + "grad_norm": 1.865330471105, + "language_loss": 0.7061553, + "learning_rate": 2.505244584092757e-06, + "loss": 0.78333807, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14678955, + "step": 7254, + "time_per_iteration": 2.5348615646362305 + }, + { + "auxiliary_loss_clip": 0.06446688, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.0628084, + "balance_loss_mlp": 0.01257249, + "epoch": 0.43619419810611754, + "flos": 22644366497280.0, + "grad_norm": 1.8869772682878516, + "language_loss": 0.81010306, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.88727921, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13671875, + "step": 7255, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.06450298, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06279971, + "balance_loss_mlp": 0.01254772, + "epoch": 0.4362543213587855, + "flos": 20054200285440.0, + "grad_norm": 1.8086691858124306, + "language_loss": 0.78106731, + "learning_rate": 2.504490886831089e-06, + "loss": 0.85824955, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13165283, + "step": 7256, + "time_per_iteration": 2.5364508628845215 + }, + { + "auxiliary_loss_clip": 0.06446915, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01256122, + "epoch": 0.43631444461145347, + "flos": 21367759616640.0, + "grad_norm": 1.5279282177598472, + "language_loss": 0.75952047, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.83668512, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13452148, + "step": 7257, + "time_per_iteration": 2.5156846046447754 + }, + { + "auxiliary_loss_clip": 0.06452259, + "auxiliary_loss_mlp": 0.01269452, + "balance_loss_clip": 0.06281701, + "balance_loss_mlp": 0.01255123, + "epoch": 0.43637456786412143, + "flos": 22424999708160.0, + "grad_norm": 1.7230532534800784, + "language_loss": 0.73248196, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.80969918, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14324951, + "step": 7258, + "time_per_iteration": 2.6132447719573975 + }, + { + "auxiliary_loss_clip": 0.06453618, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06282197, + "balance_loss_mlp": 0.01253725, + "epoch": 0.4364346911167894, + "flos": 28556452627200.0, + "grad_norm": 1.8100021880336497, + "language_loss": 0.77633202, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.85353959, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13415527, + "step": 7259, + "time_per_iteration": 2.589134931564331 + }, + { + "auxiliary_loss_clip": 0.06393245, + "auxiliary_loss_mlp": 0.01278627, + "balance_loss_clip": 0.0631365, + "balance_loss_mlp": 0.01275647, + "epoch": 0.43649481436945736, + "flos": 62678149407360.0, + "grad_norm": 0.7458705100033151, + "language_loss": 0.56939262, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.64611137, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.02978516, + "step": 7260, + "time_per_iteration": 3.11572265625 + }, + { + "auxiliary_loss_clip": 0.06454421, + "auxiliary_loss_mlp": 0.01272288, + "balance_loss_clip": 0.06285764, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4365549376221254, + "flos": 30600088583040.0, + "grad_norm": 1.806363539403124, + "language_loss": 0.71915948, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.79642659, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14794922, + "step": 7261, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.06453972, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06284794, + "balance_loss_mlp": 0.0125836, + "epoch": 0.43661506087479335, + "flos": 17171684778240.0, + "grad_norm": 2.033659544742114, + "language_loss": 0.69274759, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.77000701, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13604736, + "step": 7262, + "time_per_iteration": 2.556318521499634 + }, + { + "auxiliary_loss_clip": 0.0644339, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01253345, + "epoch": 0.4366751841274613, + "flos": 22052875726080.0, + "grad_norm": 1.6437752521732585, + "language_loss": 0.80115777, + "learning_rate": 2.501852344559726e-06, + "loss": 0.87825286, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.12780762, + "step": 7263, + "time_per_iteration": 2.509807825088501 + }, + { + "auxiliary_loss_clip": 0.06448945, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06281485, + "balance_loss_mlp": 0.01254076, + "epoch": 0.4367353073801293, + "flos": 16002748794240.0, + "grad_norm": 1.6772415302555446, + "language_loss": 0.76036841, + "learning_rate": 2.50147533371401e-06, + "loss": 0.83753204, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13354492, + "step": 7264, + "time_per_iteration": 2.523973226547241 + }, + { + "auxiliary_loss_clip": 0.06444526, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01253997, + "epoch": 0.43679543063279724, + "flos": 38226760485120.0, + "grad_norm": 2.1479145935669615, + "language_loss": 0.61845875, + "learning_rate": 2.501098303852298e-06, + "loss": 0.69558173, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13787842, + "step": 7265, + "time_per_iteration": 2.6696202754974365 + }, + { + "auxiliary_loss_clip": 0.06447139, + "auxiliary_loss_mlp": 0.01269097, + "balance_loss_clip": 0.06282498, + "balance_loss_mlp": 0.01256211, + "epoch": 0.4368555538854652, + "flos": 15198306071040.0, + "grad_norm": 1.934873925186605, + "language_loss": 0.73721504, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.81437743, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12896729, + "step": 7266, + "time_per_iteration": 2.5559945106506348 + }, + { + "auxiliary_loss_clip": 0.0644975, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06282988, + "balance_loss_mlp": 0.01260432, + "epoch": 0.4369156771381332, + "flos": 23074630813440.0, + "grad_norm": 2.1253877681457904, + "language_loss": 0.82184762, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.899077, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12762451, + "step": 7267, + "time_per_iteration": 2.534639358520508 + }, + { + "auxiliary_loss_clip": 0.06444408, + "auxiliary_loss_mlp": 0.01269536, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256459, + "epoch": 0.43697580039080114, + "flos": 23447886825600.0, + "grad_norm": 2.09966668439896, + "language_loss": 0.75195235, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.82909179, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13085938, + "step": 7268, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06451406, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125451, + "epoch": 0.4370359236434691, + "flos": 18520519478400.0, + "grad_norm": 3.050341004743464, + "language_loss": 0.79660171, + "learning_rate": 2.499589994531454e-06, + "loss": 0.87380207, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14099121, + "step": 7269, + "time_per_iteration": 2.516211986541748 + }, + { + "auxiliary_loss_clip": 0.06446489, + "auxiliary_loss_mlp": 0.01273185, + "balance_loss_clip": 0.06281964, + "balance_loss_mlp": 0.01260174, + "epoch": 0.43709604689613707, + "flos": 23229316650240.0, + "grad_norm": 1.8886828014681587, + "language_loss": 0.75057715, + "learning_rate": 2.499212869804237e-06, + "loss": 0.82777393, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13024902, + "step": 7270, + "time_per_iteration": 2.5755550861358643 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01268284, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01255064, + "epoch": 0.43715617014880503, + "flos": 23810199880320.0, + "grad_norm": 1.808972971243201, + "language_loss": 0.79453981, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.87169278, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13220215, + "step": 7271, + "time_per_iteration": 2.564471960067749 + }, + { + "auxiliary_loss_clip": 0.06369642, + "auxiliary_loss_mlp": 0.01258814, + "balance_loss_clip": 0.0629034, + "balance_loss_mlp": 0.01255858, + "epoch": 0.437216293401473, + "flos": 61961824851840.0, + "grad_norm": 0.6886560925106296, + "language_loss": 0.54733157, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.62361616, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.02954102, + "step": 7272, + "time_per_iteration": 3.208707332611084 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01270794, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01256757, + "epoch": 0.43727641665414096, + "flos": 21988907533440.0, + "grad_norm": 1.571184799437717, + "language_loss": 0.70994467, + "learning_rate": 2.498081382098581e-06, + "loss": 0.78716844, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14031982, + "step": 7273, + "time_per_iteration": 2.540081262588501 + }, + { + "auxiliary_loss_clip": 0.06448624, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06279367, + "balance_loss_mlp": 0.01255271, + "epoch": 0.437336539906809, + "flos": 39540277889280.0, + "grad_norm": 1.8107596290780341, + "language_loss": 0.7551834, + "learning_rate": 2.497704181736367e-06, + "loss": 0.83236134, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13903809, + "step": 7274, + "time_per_iteration": 2.6836495399475098 + }, + { + "auxiliary_loss_clip": 0.06441884, + "auxiliary_loss_mlp": 0.01265059, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01252703, + "epoch": 0.43739666315947695, + "flos": 17462902043520.0, + "grad_norm": 1.9085211858375455, + "language_loss": 0.80314881, + "learning_rate": 2.49732696250116e-06, + "loss": 0.88021827, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.12353516, + "step": 7275, + "time_per_iteration": 2.5408823490142822 + }, + { + "auxiliary_loss_clip": 0.06450746, + "auxiliary_loss_mlp": 0.01272848, + "balance_loss_clip": 0.06284586, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4374567864121449, + "flos": 16363678256640.0, + "grad_norm": 1.98644372860744, + "language_loss": 0.81298435, + "learning_rate": 2.496949724407266e-06, + "loss": 0.89022022, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13092041, + "step": 7276, + "time_per_iteration": 2.4871010780334473 + }, + { + "auxiliary_loss_clip": 0.06454313, + "auxiliary_loss_mlp": 0.01266955, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01253013, + "epoch": 0.4375169096648129, + "flos": 30594721921920.0, + "grad_norm": 1.9320579241517422, + "language_loss": 0.73048055, + "learning_rate": 2.496572467468988e-06, + "loss": 0.8076933, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1394043, + "step": 7277, + "time_per_iteration": 2.6151673793792725 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01272648, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01258939, + "epoch": 0.43757703291748085, + "flos": 30563555402880.0, + "grad_norm": 1.9557335242574223, + "language_loss": 0.72527206, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.80245006, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13696289, + "step": 7278, + "time_per_iteration": 2.583293914794922 + }, + { + "auxiliary_loss_clip": 0.06440841, + "auxiliary_loss_mlp": 0.01270709, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.01258371, + "epoch": 0.4376371561701488, + "flos": 21403747745280.0, + "grad_norm": 1.4778175335443475, + "language_loss": 0.65870327, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.73581874, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12329102, + "step": 7279, + "time_per_iteration": 2.5419130325317383 + }, + { + "auxiliary_loss_clip": 0.06451775, + "auxiliary_loss_mlp": 0.01270137, + "balance_loss_clip": 0.06279162, + "balance_loss_mlp": 0.01256559, + "epoch": 0.4376972794228168, + "flos": 23411144010240.0, + "grad_norm": 1.7454635588007905, + "language_loss": 0.8264519, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.90367103, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13568115, + "step": 7280, + "time_per_iteration": 2.5270493030548096 + }, + { + "auxiliary_loss_clip": 0.06438784, + "auxiliary_loss_mlp": 0.01272842, + "balance_loss_clip": 0.06277376, + "balance_loss_mlp": 0.01259848, + "epoch": 0.43775740267548474, + "flos": 22899511779840.0, + "grad_norm": 1.6085189920631162, + "language_loss": 0.7756325, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.85274875, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13000488, + "step": 7281, + "time_per_iteration": 2.614102602005005 + }, + { + "auxiliary_loss_clip": 0.0644282, + "auxiliary_loss_mlp": 0.01275956, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01263028, + "epoch": 0.4378175259281527, + "flos": 23301041345280.0, + "grad_norm": 1.8125010794319167, + "language_loss": 0.7622053, + "learning_rate": 2.494685900612569e-06, + "loss": 0.83939308, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12915039, + "step": 7282, + "time_per_iteration": 3.9149930477142334 + }, + { + "auxiliary_loss_clip": 0.06446523, + "auxiliary_loss_mlp": 0.01267087, + "balance_loss_clip": 0.06279582, + "balance_loss_mlp": 0.01254438, + "epoch": 0.43787764918082067, + "flos": 23883433948800.0, + "grad_norm": 2.0076194716834874, + "language_loss": 0.85396934, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.93110549, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12652588, + "step": 7283, + "time_per_iteration": 3.9656553268432617 + }, + { + "auxiliary_loss_clip": 0.0644891, + "auxiliary_loss_mlp": 0.01268213, + "balance_loss_clip": 0.06279234, + "balance_loss_mlp": 0.01254999, + "epoch": 0.43793777243348864, + "flos": 23995004060160.0, + "grad_norm": 1.8602515290448327, + "language_loss": 0.8091675, + "learning_rate": 2.49393114246007e-06, + "loss": 0.88633871, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13214111, + "step": 7284, + "time_per_iteration": 2.566521167755127 + }, + { + "auxiliary_loss_clip": 0.06443676, + "auxiliary_loss_mlp": 0.0127107, + "balance_loss_clip": 0.06278057, + "balance_loss_mlp": 0.01258774, + "epoch": 0.4379978956861566, + "flos": 18629909383680.0, + "grad_norm": 1.7731724137458924, + "language_loss": 0.80635571, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8835032, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.12310791, + "step": 7285, + "time_per_iteration": 2.5004618167877197 + }, + { + "auxiliary_loss_clip": 0.0643899, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06274976, + "balance_loss_mlp": 0.01256642, + "epoch": 0.43805801893882457, + "flos": 21987901284480.0, + "grad_norm": 1.9005617879541583, + "language_loss": 0.75070119, + "learning_rate": 2.493176309387897e-06, + "loss": 0.82778776, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13031006, + "step": 7286, + "time_per_iteration": 2.5617265701293945 + }, + { + "auxiliary_loss_clip": 0.0644343, + "auxiliary_loss_mlp": 0.01269982, + "balance_loss_clip": 0.06274993, + "balance_loss_mlp": 0.01257239, + "epoch": 0.43811814219149253, + "flos": 26400114529920.0, + "grad_norm": 2.124374396883661, + "language_loss": 0.73769003, + "learning_rate": 2.492798864792712e-06, + "loss": 0.81482422, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.12738037, + "step": 7287, + "time_per_iteration": 2.5709421634674072 + }, + { + "auxiliary_loss_clip": 0.06442735, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06276426, + "balance_loss_mlp": 0.01259115, + "epoch": 0.43817826544416055, + "flos": 17499015953280.0, + "grad_norm": 1.6607447345750057, + "language_loss": 0.82538438, + "learning_rate": 2.492421401510545e-06, + "loss": 0.90254092, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13812256, + "step": 7288, + "time_per_iteration": 3.92202091217041 + }, + { + "auxiliary_loss_clip": 0.06447385, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06276591, + "balance_loss_mlp": 0.01254888, + "epoch": 0.4382383886968285, + "flos": 21587629530240.0, + "grad_norm": 1.4460149141548964, + "language_loss": 0.84252048, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.9196828, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1395874, + "step": 7289, + "time_per_iteration": 2.557433843612671 + }, + { + "auxiliary_loss_clip": 0.06446871, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06274465, + "balance_loss_mlp": 0.01254912, + "epoch": 0.4382985119494965, + "flos": 27930441173760.0, + "grad_norm": 2.36337419111835, + "language_loss": 0.78573066, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.86287904, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13067627, + "step": 7290, + "time_per_iteration": 2.5970215797424316 + }, + { + "auxiliary_loss_clip": 0.06439934, + "auxiliary_loss_mlp": 0.01272143, + "balance_loss_clip": 0.06275328, + "balance_loss_mlp": 0.0125903, + "epoch": 0.43835863520216445, + "flos": 24943860495360.0, + "grad_norm": 1.8528017599911322, + "language_loss": 0.7800144, + "learning_rate": 2.491288899685288e-06, + "loss": 0.85713518, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13110352, + "step": 7291, + "time_per_iteration": 2.5944950580596924 + }, + { + "auxiliary_loss_clip": 0.06443708, + "auxiliary_loss_mlp": 0.01274453, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0126106, + "epoch": 0.4384187584548324, + "flos": 33518634145920.0, + "grad_norm": 1.8972630881774872, + "language_loss": 0.64874315, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.72592473, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13391113, + "step": 7292, + "time_per_iteration": 2.628173351287842 + }, + { + "auxiliary_loss_clip": 0.06447129, + "auxiliary_loss_mlp": 0.01269671, + "balance_loss_clip": 0.06278794, + "balance_loss_mlp": 0.01256653, + "epoch": 0.4384788817075004, + "flos": 23957800047360.0, + "grad_norm": 1.5925770854238166, + "language_loss": 0.74671286, + "learning_rate": 2.49053380529597e-06, + "loss": 0.82388091, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13031006, + "step": 7293, + "time_per_iteration": 3.9379074573516846 + }, + { + "auxiliary_loss_clip": 0.06446324, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06279649, + "balance_loss_mlp": 0.0125668, + "epoch": 0.43853900496016834, + "flos": 19104463382400.0, + "grad_norm": 4.9627482836353165, + "language_loss": 0.7920171, + "learning_rate": 2.490156230192516e-06, + "loss": 0.86918819, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14099121, + "step": 7294, + "time_per_iteration": 2.4718902111053467 + }, + { + "auxiliary_loss_clip": 0.06450905, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06283231, + "balance_loss_mlp": 0.01256252, + "epoch": 0.4385991282128363, + "flos": 13230503660160.0, + "grad_norm": 1.631074893492929, + "language_loss": 0.73162925, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.80883634, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13574219, + "step": 7295, + "time_per_iteration": 2.531641721725464 + }, + { + "auxiliary_loss_clip": 0.06452312, + "auxiliary_loss_mlp": 0.01270937, + "balance_loss_clip": 0.06283045, + "balance_loss_mlp": 0.01256298, + "epoch": 0.4386592514655043, + "flos": 14325199326720.0, + "grad_norm": 2.435451861079371, + "language_loss": 0.75030828, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.8275407, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14648438, + "step": 7296, + "time_per_iteration": 2.4799978733062744 + }, + { + "auxiliary_loss_clip": 0.06443385, + "auxiliary_loss_mlp": 0.01270746, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01257598, + "epoch": 0.43871937471817224, + "flos": 22791128123520.0, + "grad_norm": 1.513671798105688, + "language_loss": 0.69379568, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.77093697, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13146973, + "step": 7297, + "time_per_iteration": 2.5378599166870117 + }, + { + "auxiliary_loss_clip": 0.06447895, + "auxiliary_loss_mlp": 0.01268794, + "balance_loss_clip": 0.06281355, + "balance_loss_mlp": 0.01255878, + "epoch": 0.4387794979708402, + "flos": 28079466860160.0, + "grad_norm": 1.3753147611046208, + "language_loss": 0.70496702, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.78213394, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.12902832, + "step": 7298, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06442846, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258023, + "epoch": 0.43883962122350817, + "flos": 26256665139840.0, + "grad_norm": 1.5271246100670304, + "language_loss": 0.72762883, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12976074, + "step": 7299, + "time_per_iteration": 2.567258834838867 + }, + { + "auxiliary_loss_clip": 0.06449576, + "auxiliary_loss_mlp": 0.012749, + "balance_loss_clip": 0.06281091, + "balance_loss_mlp": 0.01260142, + "epoch": 0.43889974447617613, + "flos": 25890662505600.0, + "grad_norm": 1.7549107290593968, + "language_loss": 0.76878119, + "learning_rate": 2.487890389750719e-06, + "loss": 0.84602594, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14758301, + "step": 7300, + "time_per_iteration": 2.541740655899048 + }, + { + "auxiliary_loss_clip": 0.06448291, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254346, + "epoch": 0.43895986772884416, + "flos": 25053711598080.0, + "grad_norm": 2.544712476821277, + "language_loss": 0.71268392, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.78984845, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13824463, + "step": 7301, + "time_per_iteration": 2.547846794128418 + }, + { + "auxiliary_loss_clip": 0.06445279, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06277898, + "balance_loss_mlp": 0.01254434, + "epoch": 0.4390199909815121, + "flos": 26001729492480.0, + "grad_norm": 4.607507625532986, + "language_loss": 0.71274817, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.78989553, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.15026855, + "step": 7302, + "time_per_iteration": 2.531633138656616 + }, + { + "auxiliary_loss_clip": 0.06444067, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06280646, + "balance_loss_mlp": 0.01254618, + "epoch": 0.4390801142341801, + "flos": 29029790741760.0, + "grad_norm": 1.545722029471357, + "language_loss": 0.82388735, + "learning_rate": 2.486757219574983e-06, + "loss": 0.90100312, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12908936, + "step": 7303, + "time_per_iteration": 2.6841824054718018 + }, + { + "auxiliary_loss_clip": 0.06456171, + "auxiliary_loss_mlp": 0.01271253, + "balance_loss_clip": 0.06284264, + "balance_loss_mlp": 0.01256649, + "epoch": 0.43914023748684805, + "flos": 33447077159040.0, + "grad_norm": 2.3091286506484034, + "language_loss": 0.69152826, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.76880252, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.1461792, + "step": 7304, + "time_per_iteration": 2.6893982887268066 + }, + { + "auxiliary_loss_clip": 0.06439492, + "auxiliary_loss_mlp": 0.01269095, + "balance_loss_clip": 0.06278437, + "balance_loss_mlp": 0.01256507, + "epoch": 0.439200360739516, + "flos": 34540347306240.0, + "grad_norm": 1.5007015420493954, + "language_loss": 0.78744507, + "learning_rate": 2.486001680477873e-06, + "loss": 0.86453092, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12573242, + "step": 7305, + "time_per_iteration": 2.6403284072875977 + }, + { + "auxiliary_loss_clip": 0.06446742, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255019, + "epoch": 0.439260483992184, + "flos": 21914247945600.0, + "grad_norm": 1.7423010107893722, + "language_loss": 0.68937683, + "learning_rate": 2.485623883278308e-06, + "loss": 0.76653659, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14221191, + "step": 7306, + "time_per_iteration": 2.5665781497955322 + }, + { + "auxiliary_loss_clip": 0.06446797, + "auxiliary_loss_mlp": 0.01272443, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01258877, + "epoch": 0.43932060724485195, + "flos": 21002805158400.0, + "grad_norm": 1.5749593715316206, + "language_loss": 0.63249755, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.70968997, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13562012, + "step": 7307, + "time_per_iteration": 2.5204410552978516 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.0126805, + "balance_loss_clip": 0.06279462, + "balance_loss_mlp": 0.01254305, + "epoch": 0.4393807304975199, + "flos": 17752526081280.0, + "grad_norm": 1.900088770074622, + "language_loss": 0.72216207, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.79933721, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13745117, + "step": 7308, + "time_per_iteration": 2.4988410472869873 + }, + { + "auxiliary_loss_clip": 0.06445662, + "auxiliary_loss_mlp": 0.01268116, + "balance_loss_clip": 0.06277111, + "balance_loss_mlp": 0.01254669, + "epoch": 0.4394408537501879, + "flos": 22535102373120.0, + "grad_norm": 2.200318468716899, + "language_loss": 0.76911771, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.84625548, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13458252, + "step": 7309, + "time_per_iteration": 2.521385431289673 + }, + { + "auxiliary_loss_clip": 0.06438792, + "auxiliary_loss_mlp": 0.01270246, + "balance_loss_clip": 0.06277418, + "balance_loss_mlp": 0.01257908, + "epoch": 0.43950097700285584, + "flos": 23447383701120.0, + "grad_norm": 3.092354645663241, + "language_loss": 0.71101463, + "learning_rate": 2.484112510474251e-06, + "loss": 0.78810501, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12335205, + "step": 7310, + "time_per_iteration": 2.609769344329834 + }, + { + "auxiliary_loss_clip": 0.06452246, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.0125624, + "epoch": 0.4395611002555238, + "flos": 23186620195200.0, + "grad_norm": 3.6443795998554744, + "language_loss": 0.76179528, + "learning_rate": 2.483734621343429e-06, + "loss": 0.83900821, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.12817383, + "step": 7311, + "time_per_iteration": 2.5347063541412354 + }, + { + "auxiliary_loss_clip": 0.06451476, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01258043, + "epoch": 0.43962122350819177, + "flos": 22133908224000.0, + "grad_norm": 1.9101034753519561, + "language_loss": 0.81546378, + "learning_rate": 2.483356713869341e-06, + "loss": 0.89268786, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.12890625, + "step": 7312, + "time_per_iteration": 2.5744950771331787 + }, + { + "auxiliary_loss_clip": 0.06441756, + "auxiliary_loss_mlp": 0.01268695, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255713, + "epoch": 0.43968134676085974, + "flos": 17426285009280.0, + "grad_norm": 1.9172183853591918, + "language_loss": 0.86001694, + "learning_rate": 2.482978788066318e-06, + "loss": 0.93712139, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.12982178, + "step": 7313, + "time_per_iteration": 2.536870241165161 + }, + { + "auxiliary_loss_clip": 0.06445049, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01258184, + "epoch": 0.43974147001352776, + "flos": 18958582224000.0, + "grad_norm": 6.24702313006486, + "language_loss": 0.679317, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.75647992, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13061523, + "step": 7314, + "time_per_iteration": 2.5457370281219482 + }, + { + "auxiliary_loss_clip": 0.06448518, + "auxiliary_loss_mlp": 0.01271322, + "balance_loss_clip": 0.06279253, + "balance_loss_mlp": 0.01258209, + "epoch": 0.4398015932661957, + "flos": 18959588472960.0, + "grad_norm": 1.6336273312910292, + "language_loss": 0.76986659, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.84706497, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13098145, + "step": 7315, + "time_per_iteration": 2.5225329399108887 + }, + { + "auxiliary_loss_clip": 0.06442133, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06276898, + "balance_loss_mlp": 0.01255447, + "epoch": 0.4398617165188637, + "flos": 24205608097920.0, + "grad_norm": 2.1993234427936637, + "language_loss": 0.74934149, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.8264451, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.12780762, + "step": 7316, + "time_per_iteration": 2.5561742782592773 + }, + { + "auxiliary_loss_clip": 0.06444536, + "auxiliary_loss_mlp": 0.01271979, + "balance_loss_clip": 0.06280385, + "balance_loss_mlp": 0.01259289, + "epoch": 0.43992183977153165, + "flos": 22243214275200.0, + "grad_norm": 2.7598614180807814, + "language_loss": 0.65349543, + "learning_rate": 2.481466901851506e-06, + "loss": 0.73066062, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.12695312, + "step": 7317, + "time_per_iteration": 2.5142266750335693 + }, + { + "auxiliary_loss_clip": 0.06450248, + "auxiliary_loss_mlp": 0.01270442, + "balance_loss_clip": 0.06283192, + "balance_loss_mlp": 0.01256929, + "epoch": 0.4399819630241996, + "flos": 18703395014400.0, + "grad_norm": 1.826408349581849, + "language_loss": 0.80062312, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.87783003, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13519287, + "step": 7318, + "time_per_iteration": 2.519906520843506 + }, + { + "auxiliary_loss_clip": 0.06445621, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01255725, + "epoch": 0.4400420862768676, + "flos": 23886326914560.0, + "grad_norm": 1.6582419144412086, + "language_loss": 0.79880667, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.87595713, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13708496, + "step": 7319, + "time_per_iteration": 2.593442440032959 + }, + { + "auxiliary_loss_clip": 0.06445733, + "auxiliary_loss_mlp": 0.01274619, + "balance_loss_clip": 0.06279506, + "balance_loss_mlp": 0.01260547, + "epoch": 0.44010220952953555, + "flos": 28045071959040.0, + "grad_norm": 2.6685359162637172, + "language_loss": 0.80292428, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.88012779, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14074707, + "step": 7320, + "time_per_iteration": 2.576824188232422 + }, + { + "auxiliary_loss_clip": 0.06443729, + "auxiliary_loss_mlp": 0.01271309, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01258333, + "epoch": 0.4401623327822035, + "flos": 23775763052160.0, + "grad_norm": 3.573791590582856, + "language_loss": 0.69620574, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.77335614, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.12969971, + "step": 7321, + "time_per_iteration": 4.008130311965942 + }, + { + "auxiliary_loss_clip": 0.06352215, + "auxiliary_loss_mlp": 0.01268902, + "balance_loss_clip": 0.06277325, + "balance_loss_mlp": 0.01265612, + "epoch": 0.4402224560348715, + "flos": 70797320081280.0, + "grad_norm": 0.8902034574652531, + "language_loss": 0.56966496, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.64587617, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03295898, + "step": 7322, + "time_per_iteration": 4.591723680496216 + }, + { + "auxiliary_loss_clip": 0.06443685, + "auxiliary_loss_mlp": 0.01271286, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01258787, + "epoch": 0.44028257928753944, + "flos": 22898170114560.0, + "grad_norm": 1.423216656342095, + "language_loss": 0.76491451, + "learning_rate": 2.479198525097822e-06, + "loss": 0.8420642, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12493896, + "step": 7323, + "time_per_iteration": 2.5367372035980225 + }, + { + "auxiliary_loss_clip": 0.06449594, + "auxiliary_loss_mlp": 0.01277882, + "balance_loss_clip": 0.06282798, + "balance_loss_mlp": 0.01265216, + "epoch": 0.4403427025402074, + "flos": 17901719475840.0, + "grad_norm": 1.6412485345287482, + "language_loss": 0.80679965, + "learning_rate": 2.478820398622511e-06, + "loss": 0.88407433, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12670898, + "step": 7324, + "time_per_iteration": 2.496735095977783 + }, + { + "auxiliary_loss_clip": 0.0634661, + "auxiliary_loss_mlp": 0.01259308, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01255979, + "epoch": 0.4404028257928754, + "flos": 69583717071360.0, + "grad_norm": 0.6517122364434149, + "language_loss": 0.54482663, + "learning_rate": 2.478442253990283e-06, + "loss": 0.62088585, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03335571, + "step": 7325, + "time_per_iteration": 3.1927096843719482 + }, + { + "auxiliary_loss_clip": 0.06445315, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281503, + "balance_loss_mlp": 0.01253981, + "epoch": 0.44046294904554334, + "flos": 20930074214400.0, + "grad_norm": 1.5304533021700073, + "language_loss": 0.69945073, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.77656674, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.12298584, + "step": 7326, + "time_per_iteration": 2.5716168880462646 + }, + { + "auxiliary_loss_clip": 0.06441578, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06279023, + "balance_loss_mlp": 0.01254402, + "epoch": 0.44052307229821136, + "flos": 23630301164160.0, + "grad_norm": 1.488040619087652, + "language_loss": 0.76529855, + "learning_rate": 2.477685910312432e-06, + "loss": 0.84238315, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.12481689, + "step": 7327, + "time_per_iteration": 3.997654676437378 + }, + { + "auxiliary_loss_clip": 0.06439877, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277373, + "balance_loss_mlp": 0.01256744, + "epoch": 0.4405831955508793, + "flos": 17602536072960.0, + "grad_norm": 2.6410067735498512, + "language_loss": 0.83833683, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.91543245, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1295166, + "step": 7328, + "time_per_iteration": 2.520899534225464 + }, + { + "auxiliary_loss_clip": 0.06445633, + "auxiliary_loss_mlp": 0.01268864, + "balance_loss_clip": 0.06283547, + "balance_loss_mlp": 0.01255703, + "epoch": 0.4406433188035473, + "flos": 21468596405760.0, + "grad_norm": 3.134642090151518, + "language_loss": 0.77723283, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.85437775, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13165283, + "step": 7329, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01272671, + "balance_loss_clip": 0.06280035, + "balance_loss_mlp": 0.01259176, + "epoch": 0.44070344205621526, + "flos": 22680019209600.0, + "grad_norm": 1.6769566948090702, + "language_loss": 0.74290001, + "learning_rate": 2.476551258977278e-06, + "loss": 0.82010818, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1348877, + "step": 7330, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.06448483, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.06283589, + "balance_loss_mlp": 0.01258127, + "epoch": 0.4407635653088832, + "flos": 23448012606720.0, + "grad_norm": 1.699983061814717, + "language_loss": 0.74538559, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.82257915, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12762451, + "step": 7331, + "time_per_iteration": 2.5442659854888916 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06279509, + "balance_loss_mlp": 0.01256667, + "epoch": 0.4408236885615512, + "flos": 24027596098560.0, + "grad_norm": 1.6889636086213913, + "language_loss": 0.76643395, + "learning_rate": 2.475794734375581e-06, + "loss": 0.84356534, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13659668, + "step": 7332, + "time_per_iteration": 2.5714762210845947 + }, + { + "auxiliary_loss_clip": 0.06442308, + "auxiliary_loss_mlp": 0.01271754, + "balance_loss_clip": 0.06277508, + "balance_loss_mlp": 0.01258272, + "epoch": 0.44088381181421915, + "flos": 12681667416960.0, + "grad_norm": 1.845933322464005, + "language_loss": 0.73768836, + "learning_rate": 2.475416445004285e-06, + "loss": 0.81482899, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1348877, + "step": 7333, + "time_per_iteration": 3.9176201820373535 + }, + { + "auxiliary_loss_clip": 0.06439593, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06280486, + "balance_loss_mlp": 0.01253486, + "epoch": 0.4409439350668871, + "flos": 24576474268800.0, + "grad_norm": 1.6297964144317614, + "language_loss": 0.79249531, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.8695479, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12200928, + "step": 7334, + "time_per_iteration": 2.530762195587158 + }, + { + "auxiliary_loss_clip": 0.06456793, + "auxiliary_loss_mlp": 0.01269696, + "balance_loss_clip": 0.06281539, + "balance_loss_mlp": 0.01254747, + "epoch": 0.4410040583195551, + "flos": 22674191351040.0, + "grad_norm": 7.845487214918662, + "language_loss": 0.7603153, + "learning_rate": 2.47465981219252e-06, + "loss": 0.83758014, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1496582, + "step": 7335, + "time_per_iteration": 2.5146994590759277 + }, + { + "auxiliary_loss_clip": 0.06445056, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01254942, + "epoch": 0.44106418157222305, + "flos": 10857062833920.0, + "grad_norm": 1.9701535584859973, + "language_loss": 0.72720182, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.80434465, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14263916, + "step": 7336, + "time_per_iteration": 2.470501661300659 + }, + { + "auxiliary_loss_clip": 0.06448875, + "auxiliary_loss_mlp": 0.01272884, + "balance_loss_clip": 0.06281201, + "balance_loss_mlp": 0.01259079, + "epoch": 0.441124304824891, + "flos": 21733301053440.0, + "grad_norm": 2.690720747597236, + "language_loss": 0.62764168, + "learning_rate": 2.473903107384165e-06, + "loss": 0.70485932, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13812256, + "step": 7337, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06339368, + "auxiliary_loss_mlp": 0.01255392, + "balance_loss_clip": 0.06265444, + "balance_loss_mlp": 0.01252635, + "epoch": 0.441184428077559, + "flos": 63241702041600.0, + "grad_norm": 0.7296971987367982, + "language_loss": 0.52622962, + "learning_rate": 2.473524728017134e-06, + "loss": 0.60217726, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.02761841, + "step": 7338, + "time_per_iteration": 3.1634135246276855 + }, + { + "auxiliary_loss_clip": 0.06451306, + "auxiliary_loss_mlp": 0.0127376, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01259133, + "epoch": 0.44124455133022694, + "flos": 21184213248000.0, + "grad_norm": 2.888450189779477, + "language_loss": 0.71053195, + "learning_rate": 2.473146330693997e-06, + "loss": 0.78778255, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14611816, + "step": 7339, + "time_per_iteration": 2.526179552078247 + }, + { + "auxiliary_loss_clip": 0.06437125, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.06279349, + "balance_loss_mlp": 0.01252833, + "epoch": 0.4413046745828949, + "flos": 17463740584320.0, + "grad_norm": 1.6365123651784117, + "language_loss": 0.70282859, + "learning_rate": 2.472767915429105e-06, + "loss": 0.77985364, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12554932, + "step": 7340, + "time_per_iteration": 2.4790234565734863 + }, + { + "auxiliary_loss_clip": 0.06342094, + "auxiliary_loss_mlp": 0.01254424, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.01251767, + "epoch": 0.4413647978355629, + "flos": 61602251783040.0, + "grad_norm": 0.8821319445569078, + "language_loss": 0.64009017, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.71605539, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7341, + "time_per_iteration": 2.9593453407287598 + }, + { + "auxiliary_loss_clip": 0.06446001, + "auxiliary_loss_mlp": 0.0127129, + "balance_loss_clip": 0.06280506, + "balance_loss_mlp": 0.01257992, + "epoch": 0.4414249210882309, + "flos": 27534404050560.0, + "grad_norm": 1.9827417031820809, + "language_loss": 0.73812068, + "learning_rate": 2.47201103113145e-06, + "loss": 0.81529361, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13299561, + "step": 7342, + "time_per_iteration": 2.5592381954193115 + }, + { + "auxiliary_loss_clip": 0.06443819, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258497, + "epoch": 0.44148504434089886, + "flos": 23520785477760.0, + "grad_norm": 1.7847903417039304, + "language_loss": 0.80326116, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.88042319, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13885498, + "step": 7343, + "time_per_iteration": 2.567669630050659 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01268371, + "balance_loss_clip": 0.06281629, + "balance_loss_mlp": 0.01254382, + "epoch": 0.4415451675935668, + "flos": 21587126405760.0, + "grad_norm": 1.6274174275387656, + "language_loss": 0.7678231, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.84496725, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14001465, + "step": 7344, + "time_per_iteration": 2.50498628616333 + }, + { + "auxiliary_loss_clip": 0.06331868, + "auxiliary_loss_mlp": 0.01254509, + "balance_loss_clip": 0.06258254, + "balance_loss_mlp": 0.01251979, + "epoch": 0.4416052908462348, + "flos": 59023825142400.0, + "grad_norm": 0.9594048262741005, + "language_loss": 0.63725042, + "learning_rate": 2.470875570480556e-06, + "loss": 0.71311414, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02529907, + "step": 7345, + "time_per_iteration": 2.9305789470672607 + }, + { + "auxiliary_loss_clip": 0.06448534, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06281187, + "balance_loss_mlp": 0.01255386, + "epoch": 0.44166541409890275, + "flos": 26364545671680.0, + "grad_norm": 1.5861169822925434, + "language_loss": 0.86231661, + "learning_rate": 2.470497047866489e-06, + "loss": 0.9394989, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14306641, + "step": 7346, + "time_per_iteration": 2.566326141357422 + }, + { + "auxiliary_loss_clip": 0.06448992, + "auxiliary_loss_mlp": 0.01268131, + "balance_loss_clip": 0.06282933, + "balance_loss_mlp": 0.01253909, + "epoch": 0.4417255373515707, + "flos": 20198739778560.0, + "grad_norm": 1.9006247897038917, + "language_loss": 0.80872411, + "learning_rate": 2.470118507411128e-06, + "loss": 0.88589537, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14221191, + "step": 7347, + "time_per_iteration": 2.4968490600585938 + }, + { + "auxiliary_loss_clip": 0.06445403, + "auxiliary_loss_mlp": 0.01269031, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01254166, + "epoch": 0.4417856606042387, + "flos": 17892537454080.0, + "grad_norm": 1.9280841383218132, + "language_loss": 0.83507645, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.91222078, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14868164, + "step": 7348, + "time_per_iteration": 2.5483500957489014 + }, + { + "auxiliary_loss_clip": 0.06451687, + "auxiliary_loss_mlp": 0.01270301, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.0125571, + "epoch": 0.44184578385690665, + "flos": 27971376693120.0, + "grad_norm": 2.209333058456871, + "language_loss": 0.70229864, + "learning_rate": 2.469361373033938e-06, + "loss": 0.77951854, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14593506, + "step": 7349, + "time_per_iteration": 2.5552031993865967 + }, + { + "auxiliary_loss_clip": 0.06448848, + "auxiliary_loss_mlp": 0.01269717, + "balance_loss_clip": 0.06281149, + "balance_loss_mlp": 0.01254858, + "epoch": 0.4419059071095746, + "flos": 23374652757120.0, + "grad_norm": 1.8931524120790788, + "language_loss": 0.74732667, + "learning_rate": 2.468982779140819e-06, + "loss": 0.82451236, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14855957, + "step": 7350, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06449752, + "auxiliary_loss_mlp": 0.01269052, + "balance_loss_clip": 0.06283528, + "balance_loss_mlp": 0.01254591, + "epoch": 0.4419660303622426, + "flos": 15017443032960.0, + "grad_norm": 2.6211867622298626, + "language_loss": 0.81412131, + "learning_rate": 2.468604167463827e-06, + "loss": 0.89130938, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14453125, + "step": 7351, + "time_per_iteration": 2.5310895442962646 + }, + { + "auxiliary_loss_clip": 0.06439559, + "auxiliary_loss_mlp": 0.01271292, + "balance_loss_clip": 0.06278528, + "balance_loss_mlp": 0.01258537, + "epoch": 0.44202615361491054, + "flos": 25378359442560.0, + "grad_norm": 1.998249332467298, + "language_loss": 0.73669267, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.81380117, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12774658, + "step": 7352, + "time_per_iteration": 2.6823537349700928 + }, + { + "auxiliary_loss_clip": 0.06450884, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06284234, + "balance_loss_mlp": 0.01253584, + "epoch": 0.4420862768675785, + "flos": 24688044380160.0, + "grad_norm": 1.9707834429969424, + "language_loss": 0.87580955, + "learning_rate": 2.467846890815649e-06, + "loss": 0.95299494, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14086914, + "step": 7353, + "time_per_iteration": 2.531208038330078 + }, + { + "auxiliary_loss_clip": 0.06445745, + "auxiliary_loss_mlp": 0.01274404, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01260659, + "epoch": 0.44214640012024653, + "flos": 19533134471040.0, + "grad_norm": 2.5061219192509676, + "language_loss": 0.76425511, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.84145659, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13751221, + "step": 7354, + "time_per_iteration": 2.5208046436309814 + }, + { + "auxiliary_loss_clip": 0.06442366, + "auxiliary_loss_mlp": 0.01269638, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01256894, + "epoch": 0.4422065233729145, + "flos": 47568143940480.0, + "grad_norm": 2.32689870132585, + "language_loss": 0.65273595, + "learning_rate": 2.467089543204268e-06, + "loss": 0.72985595, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12768555, + "step": 7355, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.06452843, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01257225, + "epoch": 0.44226664662558246, + "flos": 19287045429120.0, + "grad_norm": 1.8090120162092156, + "language_loss": 0.78513968, + "learning_rate": 2.466710842823274e-06, + "loss": 0.86239338, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15307617, + "step": 7356, + "time_per_iteration": 2.5535836219787598 + }, + { + "auxiliary_loss_clip": 0.0645135, + "auxiliary_loss_mlp": 0.01270574, + "balance_loss_clip": 0.0628085, + "balance_loss_mlp": 0.01255184, + "epoch": 0.4423267698782504, + "flos": 17827604939520.0, + "grad_norm": 1.5923292427452285, + "language_loss": 0.77331412, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.85053337, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1539917, + "step": 7357, + "time_per_iteration": 2.472616195678711 + }, + { + "auxiliary_loss_clip": 0.06444242, + "auxiliary_loss_mlp": 0.0127409, + "balance_loss_clip": 0.06277513, + "balance_loss_mlp": 0.01259112, + "epoch": 0.4423868931309184, + "flos": 29211953518080.0, + "grad_norm": 1.4316006976636513, + "language_loss": 0.73656726, + "learning_rate": 2.465953388982481e-06, + "loss": 0.81375057, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14971924, + "step": 7358, + "time_per_iteration": 2.596794366836548 + }, + { + "auxiliary_loss_clip": 0.06449263, + "auxiliary_loss_mlp": 0.01268513, + "balance_loss_clip": 0.06281863, + "balance_loss_mlp": 0.01255131, + "epoch": 0.44244701638358636, + "flos": 29720399293440.0, + "grad_norm": 1.5482043588344903, + "language_loss": 0.75746959, + "learning_rate": 2.465574635551405e-06, + "loss": 0.83464736, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13378906, + "step": 7359, + "time_per_iteration": 2.565152168273926 + }, + { + "auxiliary_loss_clip": 0.06449427, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06282771, + "balance_loss_mlp": 0.01258907, + "epoch": 0.4425071396362543, + "flos": 22936715792640.0, + "grad_norm": 1.7006216058888692, + "language_loss": 0.70234901, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.77957749, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14526367, + "step": 7360, + "time_per_iteration": 3.9516735076904297 + }, + { + "auxiliary_loss_clip": 0.06450445, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06282296, + "balance_loss_mlp": 0.01262028, + "epoch": 0.4425672628889223, + "flos": 19798509951360.0, + "grad_norm": 2.334645337647824, + "language_loss": 0.69802427, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.77529514, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14599609, + "step": 7361, + "time_per_iteration": 3.9590420722961426 + }, + { + "auxiliary_loss_clip": 0.06448395, + "auxiliary_loss_mlp": 0.01271063, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01256287, + "epoch": 0.44262738614159025, + "flos": 13667266667520.0, + "grad_norm": 1.9889994262633817, + "language_loss": 0.82882756, + "learning_rate": 2.464438269387809e-06, + "loss": 0.90602213, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14770508, + "step": 7362, + "time_per_iteration": 2.4627645015716553 + }, + { + "auxiliary_loss_clip": 0.06458044, + "auxiliary_loss_mlp": 0.01274491, + "balance_loss_clip": 0.06284538, + "balance_loss_mlp": 0.01258111, + "epoch": 0.4426875093942582, + "flos": 14215474005120.0, + "grad_norm": 1.7592716332344263, + "language_loss": 0.75051332, + "learning_rate": 2.464059445424366e-06, + "loss": 0.82783866, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.16381836, + "step": 7363, + "time_per_iteration": 2.526925802230835 + }, + { + "auxiliary_loss_clip": 0.0633463, + "auxiliary_loss_mlp": 0.01256608, + "balance_loss_clip": 0.06260501, + "balance_loss_mlp": 0.01253844, + "epoch": 0.4427476326469262, + "flos": 70140100181760.0, + "grad_norm": 0.6687771463902197, + "language_loss": 0.55581295, + "learning_rate": 2.463680603863743e-06, + "loss": 0.63172531, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02767944, + "step": 7364, + "time_per_iteration": 3.2234084606170654 + }, + { + "auxiliary_loss_clip": 0.06445954, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06280937, + "balance_loss_mlp": 0.01255479, + "epoch": 0.44280775589959415, + "flos": 25451761219200.0, + "grad_norm": 6.076987981061014, + "language_loss": 0.75066888, + "learning_rate": 2.463301744720305e-06, + "loss": 0.82782239, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13928223, + "step": 7365, + "time_per_iteration": 2.606168746948242 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06282686, + "balance_loss_mlp": 0.01253724, + "epoch": 0.4428678791522621, + "flos": 22863900994560.0, + "grad_norm": 1.5120042705282817, + "language_loss": 0.74655497, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.82372636, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1473999, + "step": 7366, + "time_per_iteration": 2.5269834995269775 + }, + { + "auxiliary_loss_clip": 0.06449491, + "auxiliary_loss_mlp": 0.0127034, + "balance_loss_clip": 0.06283636, + "balance_loss_mlp": 0.01255438, + "epoch": 0.44292800240493013, + "flos": 25819608643200.0, + "grad_norm": 2.3253747528787447, + "language_loss": 0.7339704, + "learning_rate": 2.46254397374245e-06, + "loss": 0.81116873, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14904785, + "step": 7367, + "time_per_iteration": 4.017570495605469 + }, + { + "auxiliary_loss_clip": 0.06453082, + "auxiliary_loss_mlp": 0.01276023, + "balance_loss_clip": 0.06286091, + "balance_loss_mlp": 0.01260979, + "epoch": 0.4429881256575981, + "flos": 32425238217600.0, + "grad_norm": 1.584590811661976, + "language_loss": 0.73953557, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.81682664, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15057373, + "step": 7368, + "time_per_iteration": 2.6219804286956787 + }, + { + "auxiliary_loss_clip": 0.06446074, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06281151, + "balance_loss_mlp": 0.01256007, + "epoch": 0.44304824891026606, + "flos": 22170231768960.0, + "grad_norm": 1.6442785623938219, + "language_loss": 0.79845673, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.8756206, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14306641, + "step": 7369, + "time_per_iteration": 2.5048859119415283 + }, + { + "auxiliary_loss_clip": 0.06445719, + "auxiliary_loss_mlp": 0.01268056, + "balance_loss_clip": 0.0628242, + "balance_loss_mlp": 0.01253524, + "epoch": 0.443108372162934, + "flos": 25345725477120.0, + "grad_norm": 1.8080912741875748, + "language_loss": 0.72226167, + "learning_rate": 2.461407185763737e-06, + "loss": 0.79939938, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.14538574, + "step": 7370, + "time_per_iteration": 2.59167218208313 + }, + { + "auxiliary_loss_clip": 0.06444093, + "auxiliary_loss_mlp": 0.01274154, + "balance_loss_clip": 0.06279977, + "balance_loss_mlp": 0.01259741, + "epoch": 0.443168495415602, + "flos": 23337616452480.0, + "grad_norm": 2.642683672552081, + "language_loss": 0.70957971, + "learning_rate": 2.461028221425126e-06, + "loss": 0.78676224, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14428711, + "step": 7371, + "time_per_iteration": 2.5119266510009766 + }, + { + "auxiliary_loss_clip": 0.0644391, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.01255288, + "epoch": 0.44322861866826996, + "flos": 21877924400640.0, + "grad_norm": 2.5641722247612977, + "language_loss": 0.69211292, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.76923823, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13330078, + "step": 7372, + "time_per_iteration": 2.575803518295288 + }, + { + "auxiliary_loss_clip": 0.06450622, + "auxiliary_loss_mlp": 0.01273627, + "balance_loss_clip": 0.06281562, + "balance_loss_mlp": 0.01257855, + "epoch": 0.4432887419209379, + "flos": 20090649611520.0, + "grad_norm": 1.7339006835744544, + "language_loss": 0.83742619, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.91466868, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15765381, + "step": 7373, + "time_per_iteration": 4.006488084793091 + }, + { + "auxiliary_loss_clip": 0.06340961, + "auxiliary_loss_mlp": 0.01252329, + "balance_loss_clip": 0.06267951, + "balance_loss_mlp": 0.01249765, + "epoch": 0.4433488651736059, + "flos": 70056593988480.0, + "grad_norm": 0.7566866942124226, + "language_loss": 0.55204445, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.62797731, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02565002, + "step": 7374, + "time_per_iteration": 3.1780457496643066 + }, + { + "auxiliary_loss_clip": 0.06443411, + "auxiliary_loss_mlp": 0.01275671, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01260198, + "epoch": 0.44340898842627385, + "flos": 16286838462720.0, + "grad_norm": 2.3260457628480617, + "language_loss": 0.82868445, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.90587527, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.15478516, + "step": 7375, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.0644948, + "auxiliary_loss_mlp": 0.01269753, + "balance_loss_clip": 0.06282064, + "balance_loss_mlp": 0.01255388, + "epoch": 0.4434691116789418, + "flos": 16616601406080.0, + "grad_norm": 2.217281539940859, + "language_loss": 0.83904636, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.91623867, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1439209, + "step": 7376, + "time_per_iteration": 2.4960668087005615 + }, + { + "auxiliary_loss_clip": 0.06447101, + "auxiliary_loss_mlp": 0.01271986, + "balance_loss_clip": 0.06282647, + "balance_loss_mlp": 0.01257573, + "epoch": 0.4435292349316098, + "flos": 19069397648640.0, + "grad_norm": 1.7110647715019258, + "language_loss": 0.77357483, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.85076571, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.14422607, + "step": 7377, + "time_per_iteration": 2.5489466190338135 + }, + { + "auxiliary_loss_clip": 0.064443, + "auxiliary_loss_mlp": 0.01269165, + "balance_loss_clip": 0.06284986, + "balance_loss_mlp": 0.01255396, + "epoch": 0.44358935818427775, + "flos": 21257656951680.0, + "grad_norm": 1.7746716431943175, + "language_loss": 0.75928617, + "learning_rate": 2.458374982357057e-06, + "loss": 0.83642089, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13763428, + "step": 7378, + "time_per_iteration": 2.498782157897949 + }, + { + "auxiliary_loss_clip": 0.06446375, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06281648, + "balance_loss_mlp": 0.01255106, + "epoch": 0.4436494814369457, + "flos": 12500259327360.0, + "grad_norm": 1.8740687903376234, + "language_loss": 0.69627756, + "learning_rate": 2.457995878562982e-06, + "loss": 0.77344066, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.14825439, + "step": 7379, + "time_per_iteration": 2.5212602615356445 + }, + { + "auxiliary_loss_clip": 0.0645185, + "auxiliary_loss_mlp": 0.01266938, + "balance_loss_clip": 0.0628576, + "balance_loss_mlp": 0.01252556, + "epoch": 0.44370960468961373, + "flos": 23666666636160.0, + "grad_norm": 2.508566876625721, + "language_loss": 0.73565447, + "learning_rate": 2.457616757401656e-06, + "loss": 0.81284231, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1439209, + "step": 7380, + "time_per_iteration": 2.500859260559082 + }, + { + "auxiliary_loss_clip": 0.06449685, + "auxiliary_loss_mlp": 0.01268804, + "balance_loss_clip": 0.06285541, + "balance_loss_mlp": 0.01255452, + "epoch": 0.4437697279422817, + "flos": 32425196290560.0, + "grad_norm": 1.7107220322970214, + "language_loss": 0.65104783, + "learning_rate": 2.457237618887458e-06, + "loss": 0.72823262, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13336182, + "step": 7381, + "time_per_iteration": 2.618229627609253 + }, + { + "auxiliary_loss_clip": 0.06454551, + "auxiliary_loss_mlp": 0.01272971, + "balance_loss_clip": 0.06288015, + "balance_loss_mlp": 0.01258773, + "epoch": 0.44382985119494966, + "flos": 18118570642560.0, + "grad_norm": 2.331874867497661, + "language_loss": 0.80543017, + "learning_rate": 2.456858463034763e-06, + "loss": 0.88270545, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14190674, + "step": 7382, + "time_per_iteration": 2.4738404750823975 + }, + { + "auxiliary_loss_clip": 0.06452931, + "auxiliary_loss_mlp": 0.01272481, + "balance_loss_clip": 0.06287742, + "balance_loss_mlp": 0.01258486, + "epoch": 0.44388997444761763, + "flos": 30782083651200.0, + "grad_norm": 1.5922456749371714, + "language_loss": 0.65226638, + "learning_rate": 2.456479289857949e-06, + "loss": 0.72952044, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13983154, + "step": 7383, + "time_per_iteration": 2.614912986755371 + }, + { + "auxiliary_loss_clip": 0.0645685, + "auxiliary_loss_mlp": 0.01272667, + "balance_loss_clip": 0.0628838, + "balance_loss_mlp": 0.01258088, + "epoch": 0.4439500977002856, + "flos": 20345333696640.0, + "grad_norm": 2.064556949518224, + "language_loss": 0.76699257, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.84428775, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14587402, + "step": 7384, + "time_per_iteration": 2.4842731952667236 + }, + { + "auxiliary_loss_clip": 0.06456664, + "auxiliary_loss_mlp": 0.012692, + "balance_loss_clip": 0.06288753, + "balance_loss_mlp": 0.01254442, + "epoch": 0.44401022095295356, + "flos": 20376667923840.0, + "grad_norm": 2.2924078267975605, + "language_loss": 0.80810666, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.88536537, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14758301, + "step": 7385, + "time_per_iteration": 2.5268380641937256 + }, + { + "auxiliary_loss_clip": 0.0645503, + "auxiliary_loss_mlp": 0.01272748, + "balance_loss_clip": 0.06290472, + "balance_loss_mlp": 0.01257013, + "epoch": 0.4440703442056215, + "flos": 20236950040320.0, + "grad_norm": 1.6897241264536553, + "language_loss": 0.82179439, + "learning_rate": 2.455341666526582e-06, + "loss": 0.89907217, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.15734863, + "step": 7386, + "time_per_iteration": 2.497891426086426 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01273049, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4441304674582895, + "flos": 39504163979520.0, + "grad_norm": 2.9557468241194624, + "language_loss": 0.70275033, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.78011411, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15356445, + "step": 7387, + "time_per_iteration": 2.6782705783843994 + }, + { + "auxiliary_loss_clip": 0.06455649, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06289866, + "balance_loss_mlp": 0.01258206, + "epoch": 0.44419059071095746, + "flos": 14834902913280.0, + "grad_norm": 1.9684531060003607, + "language_loss": 0.72165161, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.79893732, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14715576, + "step": 7388, + "time_per_iteration": 2.5119476318359375 + }, + { + "auxiliary_loss_clip": 0.06464041, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06293739, + "balance_loss_mlp": 0.01255113, + "epoch": 0.4442507139636254, + "flos": 22644408424320.0, + "grad_norm": 1.566920019209845, + "language_loss": 0.69646138, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.77380753, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15454102, + "step": 7389, + "time_per_iteration": 2.671290874481201 + }, + { + "auxiliary_loss_clip": 0.06455444, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289597, + "balance_loss_mlp": 0.01255487, + "epoch": 0.4443108372162934, + "flos": 38299994553600.0, + "grad_norm": 1.918848783354648, + "language_loss": 0.74912727, + "learning_rate": 2.453824593752788e-06, + "loss": 0.82637799, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14135742, + "step": 7390, + "time_per_iteration": 2.6656923294067383 + }, + { + "auxiliary_loss_clip": 0.06453501, + "auxiliary_loss_mlp": 0.01269903, + "balance_loss_clip": 0.06290193, + "balance_loss_mlp": 0.0125657, + "epoch": 0.44437096046896135, + "flos": 17754790141440.0, + "grad_norm": 1.7902511429273704, + "language_loss": 0.82203722, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.89927119, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13323975, + "step": 7391, + "time_per_iteration": 2.5425097942352295 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06289234, + "balance_loss_mlp": 0.01254547, + "epoch": 0.4444310837216293, + "flos": 13736936937600.0, + "grad_norm": 1.5949305897923123, + "language_loss": 0.73880637, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.81601214, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14044189, + "step": 7392, + "time_per_iteration": 2.509695053100586 + }, + { + "auxiliary_loss_clip": 0.06450866, + "auxiliary_loss_mlp": 0.01269173, + "balance_loss_clip": 0.06287552, + "balance_loss_mlp": 0.01256424, + "epoch": 0.44449120697429734, + "flos": 25017346126080.0, + "grad_norm": 1.7319744549950544, + "language_loss": 0.79953551, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.87673593, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12744141, + "step": 7393, + "time_per_iteration": 2.6058006286621094 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06291801, + "balance_loss_mlp": 0.01255276, + "epoch": 0.4445513302269653, + "flos": 32680006156800.0, + "grad_norm": 1.76893741086752, + "language_loss": 0.8113097, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.88862437, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15551758, + "step": 7394, + "time_per_iteration": 2.6408586502075195 + }, + { + "auxiliary_loss_clip": 0.06448914, + "auxiliary_loss_mlp": 0.01267892, + "balance_loss_clip": 0.06286056, + "balance_loss_mlp": 0.01254796, + "epoch": 0.44461145347963327, + "flos": 11660583162240.0, + "grad_norm": 2.0227503675909646, + "language_loss": 0.79471397, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.87188208, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13092041, + "step": 7395, + "time_per_iteration": 2.482771158218384 + }, + { + "auxiliary_loss_clip": 0.06457528, + "auxiliary_loss_mlp": 0.01269923, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01255838, + "epoch": 0.44467157673230123, + "flos": 20893079836800.0, + "grad_norm": 1.8465254869377097, + "language_loss": 0.68925393, + "learning_rate": 2.451548468607584e-06, + "loss": 0.76652849, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14074707, + "step": 7396, + "time_per_iteration": 2.526031017303467 + }, + { + "auxiliary_loss_clip": 0.06458125, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06290217, + "balance_loss_mlp": 0.0125299, + "epoch": 0.4447316999849692, + "flos": 18551140945920.0, + "grad_norm": 2.1703937468753964, + "language_loss": 0.80956584, + "learning_rate": 2.451169054403126e-06, + "loss": 0.88681042, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13342285, + "step": 7397, + "time_per_iteration": 2.482004404067993 + }, + { + "auxiliary_loss_clip": 0.06453413, + "auxiliary_loss_mlp": 0.01269867, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01256814, + "epoch": 0.44479182323763716, + "flos": 23775846906240.0, + "grad_norm": 2.7975733901761672, + "language_loss": 0.67842102, + "learning_rate": 2.450789623090293e-06, + "loss": 0.75565386, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13067627, + "step": 7398, + "time_per_iteration": 2.579227924346924 + }, + { + "auxiliary_loss_clip": 0.06451767, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06290427, + "balance_loss_mlp": 0.01256097, + "epoch": 0.44485194649030513, + "flos": 16549237123200.0, + "grad_norm": 1.6886298033370946, + "language_loss": 0.70454216, + "learning_rate": 2.450410174683472e-06, + "loss": 0.78174973, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12896729, + "step": 7399, + "time_per_iteration": 2.491422653198242 + }, + { + "auxiliary_loss_clip": 0.06448349, + "auxiliary_loss_mlp": 0.01267519, + "balance_loss_clip": 0.06287403, + "balance_loss_mlp": 0.01254543, + "epoch": 0.4449120697429731, + "flos": 22607455973760.0, + "grad_norm": 1.7365156462421643, + "language_loss": 0.72588718, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.80304587, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12963867, + "step": 7400, + "time_per_iteration": 3.9914138317108154 + }, + { + "auxiliary_loss_clip": 0.06451382, + "auxiliary_loss_mlp": 0.01270619, + "balance_loss_clip": 0.06288703, + "balance_loss_mlp": 0.0125738, + "epoch": 0.44497219299564106, + "flos": 20009994456960.0, + "grad_norm": 1.5547932465186114, + "language_loss": 0.85223019, + "learning_rate": 2.449651226645422e-06, + "loss": 0.92945021, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13250732, + "step": 7401, + "time_per_iteration": 3.972844123840332 + }, + { + "auxiliary_loss_clip": 0.0644277, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01254099, + "epoch": 0.445032316248309, + "flos": 25601499665280.0, + "grad_norm": 1.7738805367720483, + "language_loss": 0.8345179, + "learning_rate": 2.449271727042973e-06, + "loss": 0.91160637, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.11968994, + "step": 7402, + "time_per_iteration": 2.546557664871216 + }, + { + "auxiliary_loss_clip": 0.06449325, + "auxiliary_loss_mlp": 0.0126916, + "balance_loss_clip": 0.06285563, + "balance_loss_mlp": 0.01255898, + "epoch": 0.445092439500977, + "flos": 21256608775680.0, + "grad_norm": 1.6765614973905527, + "language_loss": 0.77230763, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.84949255, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13275146, + "step": 7403, + "time_per_iteration": 2.540351152420044 + }, + { + "auxiliary_loss_clip": 0.06362203, + "auxiliary_loss_mlp": 0.01255762, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01252394, + "epoch": 0.44515256275364495, + "flos": 57781990506240.0, + "grad_norm": 0.751382178532419, + "language_loss": 0.60078514, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.67696476, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.03375244, + "step": 7404, + "time_per_iteration": 3.1188013553619385 + }, + { + "auxiliary_loss_clip": 0.06455964, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06287853, + "balance_loss_mlp": 0.01258462, + "epoch": 0.4452126860063129, + "flos": 15601386936960.0, + "grad_norm": 1.4877710129276585, + "language_loss": 0.82279229, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.90007967, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14312744, + "step": 7405, + "time_per_iteration": 2.5388095378875732 + }, + { + "auxiliary_loss_clip": 0.06447265, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06283686, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4452728092589809, + "flos": 21623995002240.0, + "grad_norm": 1.5786988713847923, + "language_loss": 0.75529754, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.83244896, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.12744141, + "step": 7406, + "time_per_iteration": 2.5249385833740234 + }, + { + "auxiliary_loss_clip": 0.06440533, + "auxiliary_loss_mlp": 0.01271164, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259094, + "epoch": 0.4453329325116489, + "flos": 29505267135360.0, + "grad_norm": 1.6524917293298949, + "language_loss": 0.65847838, + "learning_rate": 2.447373973772129e-06, + "loss": 0.73559535, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12060547, + "step": 7407, + "time_per_iteration": 3.998326063156128 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06284529, + "balance_loss_mlp": 0.01256777, + "epoch": 0.44539305576431687, + "flos": 21367549981440.0, + "grad_norm": 1.547450204556426, + "language_loss": 0.68216872, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.75936574, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13018799, + "step": 7408, + "time_per_iteration": 2.5295586585998535 + }, + { + "auxiliary_loss_clip": 0.06449315, + "auxiliary_loss_mlp": 0.01269644, + "balance_loss_clip": 0.06285807, + "balance_loss_mlp": 0.01256508, + "epoch": 0.44545317901698483, + "flos": 41437278000000.0, + "grad_norm": 2.0427525389439443, + "language_loss": 0.720608, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.79779756, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13134766, + "step": 7409, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06448312, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06284307, + "balance_loss_mlp": 0.01257045, + "epoch": 0.4455133022696528, + "flos": 22061638477440.0, + "grad_norm": 1.7184461657241017, + "language_loss": 0.65940762, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.73659933, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13818359, + "step": 7410, + "time_per_iteration": 2.5486950874328613 + }, + { + "auxiliary_loss_clip": 0.06453686, + "auxiliary_loss_mlp": 0.01268565, + "balance_loss_clip": 0.06284985, + "balance_loss_mlp": 0.0125522, + "epoch": 0.44557342552232077, + "flos": 23483665319040.0, + "grad_norm": 3.696220183147237, + "language_loss": 0.74690163, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.82412422, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13360596, + "step": 7411, + "time_per_iteration": 2.5290050506591797 + }, + { + "auxiliary_loss_clip": 0.0644176, + "auxiliary_loss_mlp": 0.01268016, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256166, + "epoch": 0.44563354877498873, + "flos": 19140577292160.0, + "grad_norm": 2.065063291172047, + "language_loss": 0.7906481, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.86774588, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11859131, + "step": 7412, + "time_per_iteration": 2.5156190395355225 + }, + { + "auxiliary_loss_clip": 0.0645022, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.06282784, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4456936720276567, + "flos": 13625744169600.0, + "grad_norm": 2.15802472542835, + "language_loss": 0.80199099, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.87918305, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13195801, + "step": 7413, + "time_per_iteration": 3.9694504737854004 + }, + { + "auxiliary_loss_clip": 0.06443125, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0628258, + "balance_loss_mlp": 0.01254037, + "epoch": 0.44575379528032466, + "flos": 14717840359680.0, + "grad_norm": 1.9357576200238034, + "language_loss": 0.76531088, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.8424021, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.11956787, + "step": 7414, + "time_per_iteration": 2.515110731124878 + }, + { + "auxiliary_loss_clip": 0.06447163, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01257177, + "epoch": 0.4458139185329926, + "flos": 24177586106880.0, + "grad_norm": 1.4166090983539044, + "language_loss": 0.84000552, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.91717345, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12457275, + "step": 7415, + "time_per_iteration": 2.528939723968506 + }, + { + "auxiliary_loss_clip": 0.06442896, + "auxiliary_loss_mlp": 0.01267494, + "balance_loss_clip": 0.06282021, + "balance_loss_mlp": 0.01254733, + "epoch": 0.4458740417856606, + "flos": 21768660276480.0, + "grad_norm": 1.9578275078246672, + "language_loss": 0.84485269, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.92195654, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12756348, + "step": 7416, + "time_per_iteration": 2.57027268409729 + }, + { + "auxiliary_loss_clip": 0.06451635, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06285699, + "balance_loss_mlp": 0.01255298, + "epoch": 0.44593416503832856, + "flos": 21075074904960.0, + "grad_norm": 1.7085615846271827, + "language_loss": 0.81362593, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.89082199, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.12670898, + "step": 7417, + "time_per_iteration": 2.547837734222412 + }, + { + "auxiliary_loss_clip": 0.06448114, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06283562, + "balance_loss_mlp": 0.01255601, + "epoch": 0.4459942882909965, + "flos": 22606910922240.0, + "grad_norm": 1.8801354401717048, + "language_loss": 0.81286234, + "learning_rate": 2.443197426237077e-06, + "loss": 0.89001989, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.12042236, + "step": 7418, + "time_per_iteration": 2.5529236793518066 + }, + { + "auxiliary_loss_clip": 0.06449951, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.06284475, + "balance_loss_mlp": 0.01255652, + "epoch": 0.4460544115436645, + "flos": 26512732817280.0, + "grad_norm": 1.8068813549808598, + "language_loss": 0.77866399, + "learning_rate": 2.442817638972991e-06, + "loss": 0.85584641, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.12646484, + "step": 7419, + "time_per_iteration": 2.637568235397339 + }, + { + "auxiliary_loss_clip": 0.06446308, + "auxiliary_loss_mlp": 0.01271146, + "balance_loss_clip": 0.06283416, + "balance_loss_mlp": 0.01258349, + "epoch": 0.4461145347963325, + "flos": 17609957159040.0, + "grad_norm": 3.5469346323262068, + "language_loss": 0.73053217, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.80770659, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12805176, + "step": 7420, + "time_per_iteration": 2.4839932918548584 + }, + { + "auxiliary_loss_clip": 0.06441851, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06283888, + "balance_loss_mlp": 0.01255176, + "epoch": 0.44617465804900047, + "flos": 27274982209920.0, + "grad_norm": 1.4177043979342248, + "language_loss": 0.75314558, + "learning_rate": 2.442058014084156e-06, + "loss": 0.83024418, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12823486, + "step": 7421, + "time_per_iteration": 2.6001040935516357 + }, + { + "auxiliary_loss_clip": 0.06439819, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06281345, + "balance_loss_mlp": 0.01254073, + "epoch": 0.44623478130166844, + "flos": 17792371497600.0, + "grad_norm": 1.9155365450665858, + "language_loss": 0.75864565, + "learning_rate": 2.44167817648821e-06, + "loss": 0.83570993, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12536621, + "step": 7422, + "time_per_iteration": 2.481241226196289 + }, + { + "auxiliary_loss_clip": 0.06447253, + "auxiliary_loss_mlp": 0.01267362, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01254804, + "epoch": 0.4462949045543364, + "flos": 23009698298880.0, + "grad_norm": 1.7347835392128452, + "language_loss": 0.65679651, + "learning_rate": 2.441298322143784e-06, + "loss": 0.73394263, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.12573242, + "step": 7423, + "time_per_iteration": 2.539268732070923 + }, + { + "auxiliary_loss_clip": 0.06440745, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06283564, + "balance_loss_mlp": 0.01256591, + "epoch": 0.44635502780700437, + "flos": 17825592441600.0, + "grad_norm": 1.4381231336851048, + "language_loss": 0.79473054, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.87182289, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.11901855, + "step": 7424, + "time_per_iteration": 2.488111972808838 + }, + { + "auxiliary_loss_clip": 0.06437074, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01256148, + "epoch": 0.44641515105967233, + "flos": 26695314864000.0, + "grad_norm": 1.3471148592694158, + "language_loss": 0.8055563, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.88260639, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.11791992, + "step": 7425, + "time_per_iteration": 2.598731756210327 + }, + { + "auxiliary_loss_clip": 0.06439465, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06279327, + "balance_loss_mlp": 0.01259536, + "epoch": 0.4464752743123403, + "flos": 18918778734720.0, + "grad_norm": 1.4143607287110962, + "language_loss": 0.77488291, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.85199511, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12207031, + "step": 7426, + "time_per_iteration": 2.494330406188965 + }, + { + "auxiliary_loss_clip": 0.06445856, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06281333, + "balance_loss_mlp": 0.01253773, + "epoch": 0.44653539756500826, + "flos": 29578081933440.0, + "grad_norm": 1.9924998088803147, + "language_loss": 0.64776599, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.72489762, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13513184, + "step": 7427, + "time_per_iteration": 2.611482858657837 + }, + { + "auxiliary_loss_clip": 0.06441574, + "auxiliary_loss_mlp": 0.01275968, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.0126372, + "epoch": 0.44659552081767623, + "flos": 21475137024000.0, + "grad_norm": 1.5780428941103348, + "language_loss": 0.75530696, + "learning_rate": 2.439398799698608e-06, + "loss": 0.8324824, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12268066, + "step": 7428, + "time_per_iteration": 2.505094051361084 + }, + { + "auxiliary_loss_clip": 0.06441561, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.0125843, + "epoch": 0.4466556440703442, + "flos": 17937791458560.0, + "grad_norm": 1.912744298925221, + "language_loss": 0.78478271, + "learning_rate": 2.439018845165806e-06, + "loss": 0.86190987, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12731934, + "step": 7429, + "time_per_iteration": 2.5107972621917725 + }, + { + "auxiliary_loss_clip": 0.06447433, + "auxiliary_loss_mlp": 0.0127403, + "balance_loss_clip": 0.06283738, + "balance_loss_mlp": 0.01260667, + "epoch": 0.44671576732301216, + "flos": 21114081780480.0, + "grad_norm": 1.7694096542013318, + "language_loss": 0.91354167, + "learning_rate": 2.438638873985366e-06, + "loss": 0.99075633, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13366699, + "step": 7430, + "time_per_iteration": 2.537428140640259 + }, + { + "auxiliary_loss_clip": 0.06451312, + "auxiliary_loss_mlp": 0.01271269, + "balance_loss_clip": 0.06282946, + "balance_loss_mlp": 0.01257792, + "epoch": 0.4467758905756801, + "flos": 23514873765120.0, + "grad_norm": 1.610238873942938, + "language_loss": 0.80143106, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.87865686, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1348877, + "step": 7431, + "time_per_iteration": 2.5611300468444824 + }, + { + "auxiliary_loss_clip": 0.06447126, + "auxiliary_loss_mlp": 0.01271916, + "balance_loss_clip": 0.06282945, + "balance_loss_mlp": 0.01258374, + "epoch": 0.4468360138283481, + "flos": 18739970121600.0, + "grad_norm": 1.9551980798487134, + "language_loss": 0.80273902, + "learning_rate": 2.437878881739204e-06, + "loss": 0.87992942, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13543701, + "step": 7432, + "time_per_iteration": 2.500554084777832 + }, + { + "auxiliary_loss_clip": 0.06450094, + "auxiliary_loss_mlp": 0.01273992, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.0126073, + "epoch": 0.4468961370810161, + "flos": 23483874954240.0, + "grad_norm": 1.835454334349629, + "language_loss": 0.76644909, + "learning_rate": 2.437498860702301e-06, + "loss": 0.84368992, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13269043, + "step": 7433, + "time_per_iteration": 2.5840916633605957 + }, + { + "auxiliary_loss_clip": 0.06435596, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06279343, + "balance_loss_mlp": 0.01260047, + "epoch": 0.4469562603336841, + "flos": 30081873807360.0, + "grad_norm": 1.6012992804544768, + "language_loss": 0.77581275, + "learning_rate": 2.437118823075398e-06, + "loss": 0.85288417, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1151123, + "step": 7434, + "time_per_iteration": 2.579667329788208 + }, + { + "auxiliary_loss_clip": 0.06443198, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278063, + "balance_loss_mlp": 0.01257439, + "epoch": 0.44701638358635204, + "flos": 22463126115840.0, + "grad_norm": 1.683412458990524, + "language_loss": 0.63887638, + "learning_rate": 2.436738768872905e-06, + "loss": 0.71601021, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.12750244, + "step": 7435, + "time_per_iteration": 2.5773611068725586 + }, + { + "auxiliary_loss_clip": 0.06444404, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01258714, + "epoch": 0.44707650683902, + "flos": 24064171205760.0, + "grad_norm": 1.5617494879233198, + "language_loss": 0.83911443, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.91628319, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13763428, + "step": 7436, + "time_per_iteration": 2.5204451084136963 + }, + { + "auxiliary_loss_clip": 0.0644998, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.0628316, + "balance_loss_mlp": 0.01254226, + "epoch": 0.44713663009168797, + "flos": 23773373210880.0, + "grad_norm": 1.7812959316100008, + "language_loss": 0.79632622, + "learning_rate": 2.435978610798798e-06, + "loss": 0.87351644, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.14819336, + "step": 7437, + "time_per_iteration": 2.564180374145508 + }, + { + "auxiliary_loss_clip": 0.0644551, + "auxiliary_loss_mlp": 0.01269936, + "balance_loss_clip": 0.06279416, + "balance_loss_mlp": 0.01256829, + "epoch": 0.44719675334435594, + "flos": 24506258947200.0, + "grad_norm": 1.814975751419929, + "language_loss": 0.72632974, + "learning_rate": 2.435598506956009e-06, + "loss": 0.8034842, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13116455, + "step": 7438, + "time_per_iteration": 2.601855993270874 + }, + { + "auxiliary_loss_clip": 0.06445266, + "auxiliary_loss_mlp": 0.01270946, + "balance_loss_clip": 0.06279082, + "balance_loss_mlp": 0.01257046, + "epoch": 0.4472568765970239, + "flos": 29788308627840.0, + "grad_norm": 3.3026679320519716, + "language_loss": 0.67660618, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.75376832, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13903809, + "step": 7439, + "time_per_iteration": 2.6503498554229736 + }, + { + "auxiliary_loss_clip": 0.06447087, + "auxiliary_loss_mlp": 0.01272251, + "balance_loss_clip": 0.06280239, + "balance_loss_mlp": 0.01257648, + "epoch": 0.44731699984969187, + "flos": 24649792191360.0, + "grad_norm": 1.6003212894552636, + "language_loss": 0.73896551, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.81615895, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14605713, + "step": 7440, + "time_per_iteration": 4.026291608810425 + }, + { + "auxiliary_loss_clip": 0.06441355, + "auxiliary_loss_mlp": 0.01270172, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.0125722, + "epoch": 0.44737712310235983, + "flos": 29462570680320.0, + "grad_norm": 1.5530123963175664, + "language_loss": 0.74356592, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.82068115, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12963867, + "step": 7441, + "time_per_iteration": 2.5968191623687744 + }, + { + "auxiliary_loss_clip": 0.06443278, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01254983, + "epoch": 0.4474372463550278, + "flos": 24903260392320.0, + "grad_norm": 2.4580446492601014, + "language_loss": 0.75523049, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.83234674, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13378906, + "step": 7442, + "time_per_iteration": 2.6050899028778076 + }, + { + "auxiliary_loss_clip": 0.0645077, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06281515, + "balance_loss_mlp": 0.01262644, + "epoch": 0.44749736960769576, + "flos": 33189835524480.0, + "grad_norm": 1.8304580376547321, + "language_loss": 0.74504036, + "learning_rate": 2.433697740261273e-06, + "loss": 0.82231408, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13952637, + "step": 7443, + "time_per_iteration": 2.590211868286133 + }, + { + "auxiliary_loss_clip": 0.06441949, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06278961, + "balance_loss_mlp": 0.01256605, + "epoch": 0.4475574928603637, + "flos": 21078596776320.0, + "grad_norm": 1.7164366382085705, + "language_loss": 0.78287792, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.86000234, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13891602, + "step": 7444, + "time_per_iteration": 2.554215669631958 + }, + { + "auxiliary_loss_clip": 0.06437638, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06276217, + "balance_loss_mlp": 0.01263664, + "epoch": 0.4476176161130317, + "flos": 21867442640640.0, + "grad_norm": 2.3488437532538735, + "language_loss": 0.85014707, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.9272933, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13317871, + "step": 7445, + "time_per_iteration": 2.463123321533203 + }, + { + "auxiliary_loss_clip": 0.06446601, + "auxiliary_loss_mlp": 0.0127394, + "balance_loss_clip": 0.06279677, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4476777393656997, + "flos": 22535270081280.0, + "grad_norm": 2.2137135091267135, + "language_loss": 0.64567178, + "learning_rate": 2.432557082778765e-06, + "loss": 0.72287714, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15631104, + "step": 7446, + "time_per_iteration": 3.9910571575164795 + }, + { + "auxiliary_loss_clip": 0.06349403, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4477378626183677, + "flos": 49034236101120.0, + "grad_norm": 0.7348354325841562, + "language_loss": 0.49922079, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.57527786, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.0231781, + "step": 7447, + "time_per_iteration": 3.0209667682647705 + }, + { + "auxiliary_loss_clip": 0.06344398, + "auxiliary_loss_mlp": 0.01262514, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01260019, + "epoch": 0.44779798587103564, + "flos": 56562041784960.0, + "grad_norm": 0.8026230684928909, + "language_loss": 0.59334445, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.66941357, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.02493286, + "step": 7448, + "time_per_iteration": 3.2380871772766113 + }, + { + "auxiliary_loss_clip": 0.06443155, + "auxiliary_loss_mlp": 0.01270524, + "balance_loss_clip": 0.06277426, + "balance_loss_mlp": 0.01256994, + "epoch": 0.4478581091237036, + "flos": 46508933278080.0, + "grad_norm": 1.7384627548967189, + "language_loss": 0.59131092, + "learning_rate": 2.431416277672789e-06, + "loss": 0.66844773, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13537598, + "step": 7449, + "time_per_iteration": 2.7783467769622803 + }, + { + "auxiliary_loss_clip": 0.06440828, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06277853, + "balance_loss_mlp": 0.01258868, + "epoch": 0.4479182323763716, + "flos": 20820768163200.0, + "grad_norm": 1.956040680672474, + "language_loss": 0.81008971, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.88721895, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13220215, + "step": 7450, + "time_per_iteration": 2.488323450088501 + }, + { + "auxiliary_loss_clip": 0.06442301, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06277788, + "balance_loss_mlp": 0.01259172, + "epoch": 0.44797835562903954, + "flos": 14251126717440.0, + "grad_norm": 2.5451576111358136, + "language_loss": 0.79348361, + "learning_rate": 2.430655659114697e-06, + "loss": 0.87063718, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13885498, + "step": 7451, + "time_per_iteration": 2.4923946857452393 + }, + { + "auxiliary_loss_clip": 0.06344576, + "auxiliary_loss_mlp": 0.0125349, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.0125126, + "epoch": 0.4480384788817075, + "flos": 63553436357760.0, + "grad_norm": 0.7850742570611701, + "language_loss": 0.62791413, + "learning_rate": 2.430275325332681e-06, + "loss": 0.70389479, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02233887, + "step": 7452, + "time_per_iteration": 3.2259254455566406 + }, + { + "auxiliary_loss_clip": 0.06441975, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06277539, + "balance_loss_mlp": 0.01258874, + "epoch": 0.44809860213437547, + "flos": 21659018808960.0, + "grad_norm": 1.8053672901244522, + "language_loss": 0.62585479, + "learning_rate": 2.429894975234582e-06, + "loss": 0.70299876, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13537598, + "step": 7453, + "time_per_iteration": 3.928234577178955 + }, + { + "auxiliary_loss_clip": 0.06345223, + "auxiliary_loss_mlp": 0.01256622, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254279, + "epoch": 0.44815872538704343, + "flos": 69210586840320.0, + "grad_norm": 0.747363028090033, + "language_loss": 0.5699693, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.64598775, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02339172, + "step": 7454, + "time_per_iteration": 3.0569918155670166 + }, + { + "auxiliary_loss_clip": 0.06447325, + "auxiliary_loss_mlp": 0.01268938, + "balance_loss_clip": 0.06281178, + "balance_loss_mlp": 0.01255705, + "epoch": 0.4482188486397114, + "flos": 12602186219520.0, + "grad_norm": 1.9501180256269237, + "language_loss": 0.75448847, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.83165109, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13220215, + "step": 7455, + "time_per_iteration": 2.4410433769226074 + }, + { + "auxiliary_loss_clip": 0.06442874, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06278916, + "balance_loss_mlp": 0.01254932, + "epoch": 0.44827897189237936, + "flos": 34066715702400.0, + "grad_norm": 1.6532992970231903, + "language_loss": 0.76341856, + "learning_rate": 2.428753827188016e-06, + "loss": 0.84053606, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1394043, + "step": 7456, + "time_per_iteration": 2.6695046424865723 + }, + { + "auxiliary_loss_clip": 0.06443818, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06283055, + "balance_loss_mlp": 0.01261087, + "epoch": 0.44833909514504733, + "flos": 25153080940800.0, + "grad_norm": 1.8332154029673087, + "language_loss": 0.7703625, + "learning_rate": 2.428373411969818e-06, + "loss": 0.84754294, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13122559, + "step": 7457, + "time_per_iteration": 2.4982032775878906 + }, + { + "auxiliary_loss_clip": 0.06449621, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06282188, + "balance_loss_mlp": 0.01253269, + "epoch": 0.4483992183977153, + "flos": 16185498549120.0, + "grad_norm": 2.4281328609676254, + "language_loss": 0.68744391, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.7646122, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1394043, + "step": 7458, + "time_per_iteration": 2.4979610443115234 + }, + { + "auxiliary_loss_clip": 0.06448827, + "auxiliary_loss_mlp": 0.01274875, + "balance_loss_clip": 0.06280437, + "balance_loss_mlp": 0.01259592, + "epoch": 0.44845934165038326, + "flos": 17751352124160.0, + "grad_norm": 1.539492966179865, + "language_loss": 0.71756333, + "learning_rate": 2.427612532815961e-06, + "loss": 0.79480034, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.15283203, + "step": 7459, + "time_per_iteration": 2.482675075531006 + }, + { + "auxiliary_loss_clip": 0.06445904, + "auxiliary_loss_mlp": 0.01268873, + "balance_loss_clip": 0.06281781, + "balance_loss_mlp": 0.01255343, + "epoch": 0.4485194649030513, + "flos": 21842481323520.0, + "grad_norm": 1.7620296739852843, + "language_loss": 0.69945031, + "learning_rate": 2.427232068909154e-06, + "loss": 0.7765981, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13525391, + "step": 7460, + "time_per_iteration": 2.548891067504883 + }, + { + "auxiliary_loss_clip": 0.06446661, + "auxiliary_loss_mlp": 0.01267799, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01253744, + "epoch": 0.44857958815571924, + "flos": 20090775392640.0, + "grad_norm": 2.1567039258492637, + "language_loss": 0.77558124, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.85272586, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14068604, + "step": 7461, + "time_per_iteration": 2.488675832748413 + }, + { + "auxiliary_loss_clip": 0.0644468, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.0627977, + "balance_loss_mlp": 0.01252514, + "epoch": 0.4486397114083872, + "flos": 27060982081920.0, + "grad_norm": 1.6449935173844783, + "language_loss": 0.68081152, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.75792718, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14367676, + "step": 7462, + "time_per_iteration": 2.5873477458953857 + }, + { + "auxiliary_loss_clip": 0.06346884, + "auxiliary_loss_mlp": 0.01259781, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01257521, + "epoch": 0.4486998346610552, + "flos": 67339386587520.0, + "grad_norm": 0.7371865357722727, + "language_loss": 0.54459572, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.62066233, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.0226593, + "step": 7463, + "time_per_iteration": 3.135831594467163 + }, + { + "auxiliary_loss_clip": 0.06446455, + "auxiliary_loss_mlp": 0.01271071, + "balance_loss_clip": 0.06283797, + "balance_loss_mlp": 0.01257344, + "epoch": 0.44875995791372314, + "flos": 27644297080320.0, + "grad_norm": 1.768714620285087, + "language_loss": 0.76698768, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.844163, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13726807, + "step": 7464, + "time_per_iteration": 2.5624353885650635 + }, + { + "auxiliary_loss_clip": 0.06442145, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06281784, + "balance_loss_mlp": 0.01257063, + "epoch": 0.4488200811663911, + "flos": 13010969162880.0, + "grad_norm": 1.8955897931068166, + "language_loss": 0.74468267, + "learning_rate": 2.425329506653441e-06, + "loss": 0.82180536, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.13043213, + "step": 7465, + "time_per_iteration": 2.4702823162078857 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272918, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01257391, + "epoch": 0.44888020441905907, + "flos": 27497283891840.0, + "grad_norm": 2.0464026275546314, + "language_loss": 0.80248308, + "learning_rate": 2.424948945758966e-06, + "loss": 0.87977397, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1552124, + "step": 7466, + "time_per_iteration": 2.542721748352051 + }, + { + "auxiliary_loss_clip": 0.06448439, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01255735, + "epoch": 0.44894032767172704, + "flos": 18265541904000.0, + "grad_norm": 2.2890338528416416, + "language_loss": 0.80875736, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.88593197, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13293457, + "step": 7467, + "time_per_iteration": 2.4503378868103027 + }, + { + "auxiliary_loss_clip": 0.06442044, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.06284908, + "balance_loss_mlp": 0.01256465, + "epoch": 0.449000450924395, + "flos": 21586245937920.0, + "grad_norm": 2.2421166338055762, + "language_loss": 0.75738609, + "learning_rate": 2.424187775642129e-06, + "loss": 0.83449709, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12597656, + "step": 7468, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.06448267, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06286301, + "balance_loss_mlp": 0.01257993, + "epoch": 0.44906057417706297, + "flos": 17973737660160.0, + "grad_norm": 2.1198815882874626, + "language_loss": 0.71292973, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.79011655, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.12414551, + "step": 7469, + "time_per_iteration": 2.4725160598754883 + }, + { + "auxiliary_loss_clip": 0.06450349, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.0125427, + "epoch": 0.44912069742973093, + "flos": 20053487525760.0, + "grad_norm": 1.6969020049584582, + "language_loss": 0.7254343, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.80261958, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13903809, + "step": 7470, + "time_per_iteration": 2.5212604999542236 + }, + { + "auxiliary_loss_clip": 0.06447989, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.0628368, + "balance_loss_mlp": 0.01255951, + "epoch": 0.4491808206823989, + "flos": 21040009171200.0, + "grad_norm": 2.607168963621531, + "language_loss": 0.77266711, + "learning_rate": 2.423045899863634e-06, + "loss": 0.84984034, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13397217, + "step": 7471, + "time_per_iteration": 2.4833462238311768 + }, + { + "auxiliary_loss_clip": 0.0644739, + "auxiliary_loss_mlp": 0.01274961, + "balance_loss_clip": 0.06286953, + "balance_loss_mlp": 0.01261579, + "epoch": 0.44924094393506686, + "flos": 22973919805440.0, + "grad_norm": 1.613716342828386, + "language_loss": 0.69996417, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.77718765, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1338501, + "step": 7472, + "time_per_iteration": 2.5575385093688965 + }, + { + "auxiliary_loss_clip": 0.06348881, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.0627597, + "balance_loss_mlp": 0.01260363, + "epoch": 0.4493010671877349, + "flos": 59252332026240.0, + "grad_norm": 0.7278471165666979, + "language_loss": 0.61657208, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.69269097, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.02650452, + "step": 7473, + "time_per_iteration": 3.1560816764831543 + }, + { + "auxiliary_loss_clip": 0.06448925, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.0628556, + "balance_loss_mlp": 0.01256417, + "epoch": 0.44936119044040285, + "flos": 18010815891840.0, + "grad_norm": 2.7240719920550873, + "language_loss": 0.77420998, + "learning_rate": 2.421903879707657e-06, + "loss": 0.85140175, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13830566, + "step": 7474, + "time_per_iteration": 2.4717578887939453 + }, + { + "auxiliary_loss_clip": 0.06442197, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.0126264, + "epoch": 0.4494213136930708, + "flos": 21258243930240.0, + "grad_norm": 2.650117553560035, + "language_loss": 0.72072601, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.79790819, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7475, + "time_per_iteration": 2.513819456100464 + }, + { + "auxiliary_loss_clip": 0.06442311, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256954, + "epoch": 0.4494814369457388, + "flos": 27426271956480.0, + "grad_norm": 1.759412456892788, + "language_loss": 0.77338856, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.8505106, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1295166, + "step": 7476, + "time_per_iteration": 2.5318853855133057 + }, + { + "auxiliary_loss_clip": 0.06449737, + "auxiliary_loss_mlp": 0.01271172, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01256754, + "epoch": 0.44954156019840674, + "flos": 22860211415040.0, + "grad_norm": 1.712065897066968, + "language_loss": 0.71606135, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.79327047, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.144104, + "step": 7477, + "time_per_iteration": 2.532437324523926 + }, + { + "auxiliary_loss_clip": 0.06452323, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.01257457, + "epoch": 0.4496016834510747, + "flos": 17207253636480.0, + "grad_norm": 8.505711381360525, + "language_loss": 0.68249893, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.75973988, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14331055, + "step": 7478, + "time_per_iteration": 2.4901106357574463 + }, + { + "auxiliary_loss_clip": 0.06438291, + "auxiliary_loss_mlp": 0.01274211, + "balance_loss_clip": 0.06278055, + "balance_loss_mlp": 0.01261676, + "epoch": 0.4496618067037427, + "flos": 18922636022400.0, + "grad_norm": 1.7939017561082606, + "language_loss": 0.89897281, + "learning_rate": 2.420000193000779e-06, + "loss": 0.97609776, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12518311, + "step": 7479, + "time_per_iteration": 3.9324028491973877 + }, + { + "auxiliary_loss_clip": 0.06445809, + "auxiliary_loss_mlp": 0.01275156, + "balance_loss_clip": 0.06282537, + "balance_loss_mlp": 0.01261304, + "epoch": 0.44972192995641064, + "flos": 21037828965120.0, + "grad_norm": 1.5817445570827902, + "language_loss": 0.75620329, + "learning_rate": 2.419619407822302e-06, + "loss": 0.833413, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13861084, + "step": 7480, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.06450936, + "auxiliary_loss_mlp": 0.01270868, + "balance_loss_clip": 0.06283928, + "balance_loss_mlp": 0.01257033, + "epoch": 0.4497820532090786, + "flos": 20783354515200.0, + "grad_norm": 2.4818923045987233, + "language_loss": 0.79794782, + "learning_rate": 2.419238606731815e-06, + "loss": 0.87516582, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.1385498, + "step": 7481, + "time_per_iteration": 2.511104106903076 + }, + { + "auxiliary_loss_clip": 0.06439544, + "auxiliary_loss_mlp": 0.01274879, + "balance_loss_clip": 0.06280965, + "balance_loss_mlp": 0.01261003, + "epoch": 0.44984217646174657, + "flos": 33811067295360.0, + "grad_norm": 1.5325857273153378, + "language_loss": 0.68501163, + "learning_rate": 2.418857789743758e-06, + "loss": 0.76215583, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13873291, + "step": 7482, + "time_per_iteration": 2.6323177814483643 + }, + { + "auxiliary_loss_clip": 0.06449723, + "auxiliary_loss_mlp": 0.01275016, + "balance_loss_clip": 0.06284413, + "balance_loss_mlp": 0.01261236, + "epoch": 0.44990229971441453, + "flos": 15522953915520.0, + "grad_norm": 2.4692742165129347, + "language_loss": 0.85184467, + "learning_rate": 2.418476956872571e-06, + "loss": 0.92909217, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13775635, + "step": 7483, + "time_per_iteration": 2.5510005950927734 + }, + { + "auxiliary_loss_clip": 0.0644832, + "auxiliary_loss_mlp": 0.01272458, + "balance_loss_clip": 0.06278956, + "balance_loss_mlp": 0.01259017, + "epoch": 0.4499624229670825, + "flos": 29869676542080.0, + "grad_norm": 2.2555510336477362, + "language_loss": 0.81026614, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.88747394, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13439941, + "step": 7484, + "time_per_iteration": 2.5549514293670654 + }, + { + "auxiliary_loss_clip": 0.06454043, + "auxiliary_loss_mlp": 0.01271307, + "balance_loss_clip": 0.06282799, + "balance_loss_mlp": 0.01257133, + "epoch": 0.45002254621975046, + "flos": 18519345521280.0, + "grad_norm": 3.0066277785462296, + "language_loss": 0.75523663, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.83249015, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14172363, + "step": 7485, + "time_per_iteration": 2.5260515213012695 + }, + { + "auxiliary_loss_clip": 0.06353837, + "auxiliary_loss_mlp": 0.01254878, + "balance_loss_clip": 0.06280266, + "balance_loss_mlp": 0.01252054, + "epoch": 0.4500826694724185, + "flos": 70438753261440.0, + "grad_norm": 0.7710237062022668, + "language_loss": 0.58055162, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.65663874, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02819824, + "step": 7486, + "time_per_iteration": 4.631975173950195 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.0125523, + "epoch": 0.45014279272508645, + "flos": 15784388254080.0, + "grad_norm": 2.313810641491004, + "language_loss": 0.83291382, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.91005504, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13336182, + "step": 7487, + "time_per_iteration": 2.4474549293518066 + }, + { + "auxiliary_loss_clip": 0.06440553, + "auxiliary_loss_mlp": 0.01274868, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01260879, + "epoch": 0.4502029159777544, + "flos": 21806157778560.0, + "grad_norm": 1.8256144522955593, + "language_loss": 0.77817398, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.8553282, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13983154, + "step": 7488, + "time_per_iteration": 2.5497655868530273 + }, + { + "auxiliary_loss_clip": 0.0645895, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06284817, + "balance_loss_mlp": 0.01257773, + "epoch": 0.4502630392304224, + "flos": 28775651708160.0, + "grad_norm": 2.1057521417086194, + "language_loss": 0.72464138, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.80196273, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15405273, + "step": 7489, + "time_per_iteration": 2.536022186279297 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.01273963, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.012597, + "epoch": 0.45032316248309034, + "flos": 15848398373760.0, + "grad_norm": 2.178444480440472, + "language_loss": 0.70506239, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.78229928, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14263916, + "step": 7490, + "time_per_iteration": 2.5048370361328125 + }, + { + "auxiliary_loss_clip": 0.06351414, + "auxiliary_loss_mlp": 0.01254304, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01251552, + "epoch": 0.4503832857357583, + "flos": 57873337056000.0, + "grad_norm": 0.766905441156629, + "language_loss": 0.56608462, + "learning_rate": 2.415429723843495e-06, + "loss": 0.64214182, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02757263, + "step": 7491, + "time_per_iteration": 3.1021111011505127 + }, + { + "auxiliary_loss_clip": 0.06440033, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01257217, + "epoch": 0.4504434089884263, + "flos": 23884817541120.0, + "grad_norm": 1.940533812141729, + "language_loss": 0.79471588, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.87182283, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13446045, + "step": 7492, + "time_per_iteration": 3.906813144683838 + }, + { + "auxiliary_loss_clip": 0.06454505, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06282404, + "balance_loss_mlp": 0.01257925, + "epoch": 0.45050353224109424, + "flos": 17790820197120.0, + "grad_norm": 2.4926790281130566, + "language_loss": 0.92799652, + "learning_rate": 2.4146677577659573e-06, + "loss": 1.00526834, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14758301, + "step": 7493, + "time_per_iteration": 2.516523838043213 + }, + { + "auxiliary_loss_clip": 0.06351101, + "auxiliary_loss_mlp": 0.01253906, + "balance_loss_clip": 0.06277501, + "balance_loss_mlp": 0.01251232, + "epoch": 0.4505636554937622, + "flos": 65081960138880.0, + "grad_norm": 0.7917943169613642, + "language_loss": 0.62850708, + "learning_rate": 2.4142867511336e-06, + "loss": 0.70455718, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02676392, + "step": 7494, + "time_per_iteration": 3.200533866882324 + }, + { + "auxiliary_loss_clip": 0.06439039, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275568, + "balance_loss_mlp": 0.01255305, + "epoch": 0.45062377874643017, + "flos": 22206597240960.0, + "grad_norm": 1.3576432808579277, + "language_loss": 0.8187722, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.89584428, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12860107, + "step": 7495, + "time_per_iteration": 2.6740329265594482 + }, + { + "auxiliary_loss_clip": 0.06444755, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01253344, + "epoch": 0.45068390199909814, + "flos": 37679433615360.0, + "grad_norm": 3.4533684270887988, + "language_loss": 0.85559022, + "learning_rate": 2.41352469075395e-06, + "loss": 0.93270886, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13775635, + "step": 7496, + "time_per_iteration": 2.6514453887939453 + }, + { + "auxiliary_loss_clip": 0.06445448, + "auxiliary_loss_mlp": 0.01271465, + "balance_loss_clip": 0.06277982, + "balance_loss_mlp": 0.01258042, + "epoch": 0.4507440252517661, + "flos": 22307853300480.0, + "grad_norm": 2.147795774994512, + "language_loss": 0.76396865, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.84113777, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13427734, + "step": 7497, + "time_per_iteration": 2.5248610973358154 + }, + { + "auxiliary_loss_clip": 0.0644587, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01254189, + "epoch": 0.45080414850443407, + "flos": 13193425428480.0, + "grad_norm": 1.9297018893586142, + "language_loss": 0.75253481, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.82967794, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14245605, + "step": 7498, + "time_per_iteration": 2.482625722885132 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.06277958, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4508642717571021, + "flos": 21951451958400.0, + "grad_norm": 1.9463705761270829, + "language_loss": 0.70564914, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.78282535, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14093018, + "step": 7499, + "time_per_iteration": 2.5338642597198486 + }, + { + "auxiliary_loss_clip": 0.06449613, + "auxiliary_loss_mlp": 0.01268145, + "balance_loss_clip": 0.06278396, + "balance_loss_mlp": 0.0125412, + "epoch": 0.45092439500977005, + "flos": 23374149632640.0, + "grad_norm": 2.119825325087625, + "language_loss": 0.77484369, + "learning_rate": 2.412000381939477e-06, + "loss": 0.85202128, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14025879, + "step": 7500, + "time_per_iteration": 2.5290849208831787 + }, + { + "auxiliary_loss_clip": 0.06441833, + "auxiliary_loss_mlp": 0.01275038, + "balance_loss_clip": 0.06276967, + "balance_loss_mlp": 0.01262211, + "epoch": 0.450984518262438, + "flos": 20778532905600.0, + "grad_norm": 2.0513851791377014, + "language_loss": 0.62714708, + "learning_rate": 2.411619265641992e-06, + "loss": 0.70431578, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.12823486, + "step": 7501, + "time_per_iteration": 2.513014316558838 + }, + { + "auxiliary_loss_clip": 0.06447023, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.0627754, + "balance_loss_mlp": 0.01255251, + "epoch": 0.451044641515106, + "flos": 17712303321600.0, + "grad_norm": 1.7676077358786102, + "language_loss": 0.8475225, + "learning_rate": 2.411238133735863e-06, + "loss": 0.92468631, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7502, + "time_per_iteration": 2.502213954925537 + }, + { + "auxiliary_loss_clip": 0.06440664, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01256967, + "epoch": 0.45110476476777395, + "flos": 20600940176640.0, + "grad_norm": 1.2963550821027272, + "language_loss": 0.79440266, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8715173, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13824463, + "step": 7503, + "time_per_iteration": 2.539870023727417 + }, + { + "auxiliary_loss_clip": 0.0643944, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06278714, + "balance_loss_mlp": 0.01257213, + "epoch": 0.4511648880204419, + "flos": 16039533536640.0, + "grad_norm": 2.8864102182872746, + "language_loss": 0.80966014, + "learning_rate": 2.410475823155484e-06, + "loss": 0.88676035, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13354492, + "step": 7504, + "time_per_iteration": 2.4834609031677246 + }, + { + "auxiliary_loss_clip": 0.06439783, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06277721, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4512250112731099, + "flos": 23984103029760.0, + "grad_norm": 1.8935476867238503, + "language_loss": 0.63783783, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.71491182, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1350708, + "step": 7505, + "time_per_iteration": 2.5183863639831543 + }, + { + "auxiliary_loss_clip": 0.06338686, + "auxiliary_loss_mlp": 0.0125649, + "balance_loss_clip": 0.06265638, + "balance_loss_mlp": 0.01253881, + "epoch": 0.45128513452577784, + "flos": 71484239053440.0, + "grad_norm": 0.8179087732062593, + "language_loss": 0.58726048, + "learning_rate": 2.409713450313968e-06, + "loss": 0.66321218, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02610779, + "step": 7506, + "time_per_iteration": 3.2057392597198486 + }, + { + "auxiliary_loss_clip": 0.06438521, + "auxiliary_loss_mlp": 0.01269482, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01255987, + "epoch": 0.4513452577784458, + "flos": 22097375043840.0, + "grad_norm": 1.6199933066680872, + "language_loss": 0.79207951, + "learning_rate": 2.40933224058142e-06, + "loss": 0.86915958, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1348877, + "step": 7507, + "time_per_iteration": 2.485177993774414 + }, + { + "auxiliary_loss_clip": 0.0644455, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06277668, + "balance_loss_mlp": 0.01256543, + "epoch": 0.4514053810311138, + "flos": 24282699454080.0, + "grad_norm": 1.6041025363642085, + "language_loss": 0.74460357, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.82175899, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14440918, + "step": 7508, + "time_per_iteration": 2.5957343578338623 + }, + { + "auxiliary_loss_clip": 0.06439587, + "auxiliary_loss_mlp": 0.01271402, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01258552, + "epoch": 0.45146550428378174, + "flos": 17891237715840.0, + "grad_norm": 2.0541508842975946, + "language_loss": 0.79828942, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.87539923, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12841797, + "step": 7509, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01270525, + "balance_loss_clip": 0.06278946, + "balance_loss_mlp": 0.01257746, + "epoch": 0.4515256275364497, + "flos": 24250317050880.0, + "grad_norm": 1.7065874480024321, + "language_loss": 0.73257631, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.80969501, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12774658, + "step": 7510, + "time_per_iteration": 2.5448224544525146 + }, + { + "auxiliary_loss_clip": 0.06438527, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01255707, + "epoch": 0.45158575078911767, + "flos": 20637263721600.0, + "grad_norm": 1.688618785836195, + "language_loss": 0.77059448, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.8476727, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13598633, + "step": 7511, + "time_per_iteration": 2.48913311958313 + }, + { + "auxiliary_loss_clip": 0.06443627, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06277004, + "balance_loss_mlp": 0.0125543, + "epoch": 0.45164587404178563, + "flos": 23333884945920.0, + "grad_norm": 1.5549799825793658, + "language_loss": 0.79259372, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.86973357, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14929199, + "step": 7512, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06447546, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01252219, + "epoch": 0.45170599729445365, + "flos": 23812841283840.0, + "grad_norm": 2.088368619040166, + "language_loss": 0.87660837, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.95375133, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14538574, + "step": 7513, + "time_per_iteration": 2.50119686126709 + }, + { + "auxiliary_loss_clip": 0.06437154, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01259963, + "epoch": 0.4517661205471216, + "flos": 23519569593600.0, + "grad_norm": 1.9321046654640033, + "language_loss": 0.67692971, + "learning_rate": 2.406663338649419e-06, + "loss": 0.75402474, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1237793, + "step": 7514, + "time_per_iteration": 2.548349618911743 + }, + { + "auxiliary_loss_clip": 0.0644633, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01258017, + "epoch": 0.4518262437997896, + "flos": 23520743550720.0, + "grad_norm": 2.108913826152056, + "language_loss": 0.69738746, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7745769, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14587402, + "step": 7515, + "time_per_iteration": 2.5203166007995605 + }, + { + "auxiliary_loss_clip": 0.06448089, + "auxiliary_loss_mlp": 0.01273292, + "balance_loss_clip": 0.06278358, + "balance_loss_mlp": 0.01258379, + "epoch": 0.45188636705245755, + "flos": 14572210763520.0, + "grad_norm": 2.327142049261069, + "language_loss": 0.81245089, + "learning_rate": 2.405900656236963e-06, + "loss": 0.88966471, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14916992, + "step": 7516, + "time_per_iteration": 2.5070860385894775 + }, + { + "auxiliary_loss_clip": 0.06440821, + "auxiliary_loss_mlp": 0.01272469, + "balance_loss_clip": 0.0627999, + "balance_loss_mlp": 0.01259899, + "epoch": 0.4519464903051255, + "flos": 19907690221440.0, + "grad_norm": 1.8586788547852597, + "language_loss": 0.65825433, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.73538721, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12573242, + "step": 7517, + "time_per_iteration": 2.4824438095092773 + }, + { + "auxiliary_loss_clip": 0.06439231, + "auxiliary_loss_mlp": 0.01270445, + "balance_loss_clip": 0.06279515, + "balance_loss_mlp": 0.01257923, + "epoch": 0.4520066135577935, + "flos": 18850492056960.0, + "grad_norm": 1.7463164288041955, + "language_loss": 0.63218093, + "learning_rate": 2.405137912257333e-06, + "loss": 0.70927775, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12524414, + "step": 7518, + "time_per_iteration": 2.5339365005493164 + }, + { + "auxiliary_loss_clip": 0.0644324, + "auxiliary_loss_mlp": 0.01270416, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.0125713, + "epoch": 0.45206673681046144, + "flos": 48225279985920.0, + "grad_norm": 1.4167266474258036, + "language_loss": 0.59749353, + "learning_rate": 2.404756517215982e-06, + "loss": 0.67463017, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13287354, + "step": 7519, + "time_per_iteration": 4.238602876663208 + }, + { + "auxiliary_loss_clip": 0.06444496, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06278859, + "balance_loss_mlp": 0.0125789, + "epoch": 0.4521268600631294, + "flos": 23848997120640.0, + "grad_norm": 1.307309529899749, + "language_loss": 0.72893107, + "learning_rate": 2.404375106826223e-06, + "loss": 0.80609363, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13848877, + "step": 7520, + "time_per_iteration": 2.5295658111572266 + }, + { + "auxiliary_loss_clip": 0.06438812, + "auxiliary_loss_mlp": 0.01272031, + "balance_loss_clip": 0.062758, + "balance_loss_mlp": 0.01257875, + "epoch": 0.4521869833157974, + "flos": 18849611589120.0, + "grad_norm": 1.9694306251575102, + "language_loss": 0.75821477, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.83532321, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14147949, + "step": 7521, + "time_per_iteration": 2.51493763923645 + }, + { + "auxiliary_loss_clip": 0.06448258, + "auxiliary_loss_mlp": 0.01268765, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01255485, + "epoch": 0.45224710656846534, + "flos": 19793520633600.0, + "grad_norm": 2.0145516283749334, + "language_loss": 0.68112928, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.75829947, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1328125, + "step": 7522, + "time_per_iteration": 2.4986941814422607 + }, + { + "auxiliary_loss_clip": 0.06441501, + "auxiliary_loss_mlp": 0.0127253, + "balance_loss_clip": 0.06278691, + "balance_loss_mlp": 0.01258797, + "epoch": 0.4523072298211333, + "flos": 28263558280320.0, + "grad_norm": 1.4118666030005445, + "language_loss": 0.61165464, + "learning_rate": 2.403230783711134e-06, + "loss": 0.68879497, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13739014, + "step": 7523, + "time_per_iteration": 2.5918800830841064 + }, + { + "auxiliary_loss_clip": 0.06446532, + "auxiliary_loss_mlp": 0.01271231, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01256187, + "epoch": 0.45236735307380127, + "flos": 11185651820160.0, + "grad_norm": 1.7682897571754845, + "language_loss": 0.78361082, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.86078846, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.15057373, + "step": 7524, + "time_per_iteration": 2.4915785789489746 + }, + { + "auxiliary_loss_clip": 0.06441181, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06277032, + "balance_loss_mlp": 0.01259527, + "epoch": 0.45242747632646924, + "flos": 22607959098240.0, + "grad_norm": 1.5918865124670334, + "language_loss": 0.63704681, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.71418512, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13122559, + "step": 7525, + "time_per_iteration": 4.0678441524505615 + }, + { + "auxiliary_loss_clip": 0.06439088, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06279112, + "balance_loss_mlp": 0.01260153, + "epoch": 0.45248759957913726, + "flos": 18261558835200.0, + "grad_norm": 33.97196740045056, + "language_loss": 0.78961569, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8667345, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12664795, + "step": 7526, + "time_per_iteration": 2.4813144207000732 + }, + { + "auxiliary_loss_clip": 0.06437138, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.01257493, + "epoch": 0.4525477228318052, + "flos": 22455746956800.0, + "grad_norm": 1.6415997795559136, + "language_loss": 0.81301343, + "learning_rate": 2.40170480555747e-06, + "loss": 0.89009607, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13641357, + "step": 7527, + "time_per_iteration": 2.5056183338165283 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280501, + "balance_loss_mlp": 0.01258106, + "epoch": 0.4526078460844732, + "flos": 29652909229440.0, + "grad_norm": 1.731340365534577, + "language_loss": 0.65853465, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.73566198, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12866211, + "step": 7528, + "time_per_iteration": 2.6073391437530518 + }, + { + "auxiliary_loss_clip": 0.06439637, + "auxiliary_loss_mlp": 0.0127116, + "balance_loss_clip": 0.06280227, + "balance_loss_mlp": 0.01257296, + "epoch": 0.45266796933714115, + "flos": 23046483041280.0, + "grad_norm": 1.6874802957215247, + "language_loss": 0.75494301, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.83205104, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13867188, + "step": 7529, + "time_per_iteration": 2.5490171909332275 + }, + { + "auxiliary_loss_clip": 0.06443143, + "auxiliary_loss_mlp": 0.01270284, + "balance_loss_clip": 0.06278682, + "balance_loss_mlp": 0.0125614, + "epoch": 0.4527280925898091, + "flos": 14433582983040.0, + "grad_norm": 5.318026120447717, + "language_loss": 0.73199093, + "learning_rate": 2.400560161948384e-06, + "loss": 0.80912519, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7530, + "time_per_iteration": 2.4709434509277344 + }, + { + "auxiliary_loss_clip": 0.06441925, + "auxiliary_loss_mlp": 0.01267178, + "balance_loss_clip": 0.06279813, + "balance_loss_mlp": 0.01253857, + "epoch": 0.4527882158424771, + "flos": 22931432985600.0, + "grad_norm": 1.7055117614079858, + "language_loss": 0.76767921, + "learning_rate": 2.400178583680834e-06, + "loss": 0.84477019, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13336182, + "step": 7531, + "time_per_iteration": 3.9209694862365723 + }, + { + "auxiliary_loss_clip": 0.06439964, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01253018, + "epoch": 0.45284833909514505, + "flos": 25562157373440.0, + "grad_norm": 1.5452453614533965, + "language_loss": 0.67367595, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.75073636, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1305542, + "step": 7532, + "time_per_iteration": 2.5799813270568848 + }, + { + "auxiliary_loss_clip": 0.06441537, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280663, + "balance_loss_mlp": 0.01257206, + "epoch": 0.452908462347813, + "flos": 18155816582400.0, + "grad_norm": 2.362226158293886, + "language_loss": 0.78750062, + "learning_rate": 2.399415381635768e-06, + "loss": 0.86461282, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12481689, + "step": 7533, + "time_per_iteration": 2.4713315963745117 + }, + { + "auxiliary_loss_clip": 0.06451754, + "auxiliary_loss_mlp": 0.01272809, + "balance_loss_clip": 0.06279968, + "balance_loss_mlp": 0.01257849, + "epoch": 0.452968585600481, + "flos": 19068810670080.0, + "grad_norm": 1.7736608700696739, + "language_loss": 0.83544481, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.9126904, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1494751, + "step": 7534, + "time_per_iteration": 2.632647752761841 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01272735, + "balance_loss_clip": 0.06281491, + "balance_loss_mlp": 0.01258597, + "epoch": 0.45302870885314894, + "flos": 22057823116800.0, + "grad_norm": 1.5477368000033016, + "language_loss": 0.77199811, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.84919739, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14129639, + "step": 7535, + "time_per_iteration": 2.504075765609741 + }, + { + "auxiliary_loss_clip": 0.06444988, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01254453, + "epoch": 0.4530888321058169, + "flos": 20382495782400.0, + "grad_norm": 1.553658728431748, + "language_loss": 0.80988163, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.88700247, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12640381, + "step": 7536, + "time_per_iteration": 2.5701963901519775 + }, + { + "auxiliary_loss_clip": 0.06448273, + "auxiliary_loss_mlp": 0.01269034, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01255617, + "epoch": 0.4531489553584849, + "flos": 14835783381120.0, + "grad_norm": 1.8444336957712972, + "language_loss": 0.76206815, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.83924115, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13427734, + "step": 7537, + "time_per_iteration": 2.4535741806030273 + }, + { + "auxiliary_loss_clip": 0.06453362, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06287456, + "balance_loss_mlp": 0.0125526, + "epoch": 0.45320907861115284, + "flos": 21951493885440.0, + "grad_norm": 1.8251133101176713, + "language_loss": 0.75698435, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.83420891, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13830566, + "step": 7538, + "time_per_iteration": 2.5437614917755127 + }, + { + "auxiliary_loss_clip": 0.06342177, + "auxiliary_loss_mlp": 0.01255931, + "balance_loss_clip": 0.06267795, + "balance_loss_mlp": 0.01253302, + "epoch": 0.45326920186382086, + "flos": 66273620578560.0, + "grad_norm": 1.09487044177016, + "language_loss": 0.62420493, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.70018601, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02630615, + "step": 7539, + "time_per_iteration": 3.1658005714416504 + }, + { + "auxiliary_loss_clip": 0.06450586, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.06287818, + "balance_loss_mlp": 0.01256404, + "epoch": 0.4533293251164888, + "flos": 14689524879360.0, + "grad_norm": 1.7102983978579578, + "language_loss": 0.65674543, + "learning_rate": 2.396743698142872e-06, + "loss": 0.73395288, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13757324, + "step": 7540, + "time_per_iteration": 2.5642666816711426 + }, + { + "auxiliary_loss_clip": 0.06454974, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.06285828, + "balance_loss_mlp": 0.01254179, + "epoch": 0.4533894483691568, + "flos": 22607749463040.0, + "grad_norm": 2.019177110810713, + "language_loss": 0.84982491, + "learning_rate": 2.396361968778424e-06, + "loss": 0.92706484, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.1484375, + "step": 7541, + "time_per_iteration": 2.515012741088867 + }, + { + "auxiliary_loss_clip": 0.06444205, + "auxiliary_loss_mlp": 0.01270638, + "balance_loss_clip": 0.06281162, + "balance_loss_mlp": 0.01257853, + "epoch": 0.45344957162182475, + "flos": 34760301073920.0, + "grad_norm": 1.6772641382422697, + "language_loss": 0.77260393, + "learning_rate": 2.395980224383889e-06, + "loss": 0.84975231, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12780762, + "step": 7542, + "time_per_iteration": 2.6276772022247314 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06281827, + "balance_loss_mlp": 0.01252398, + "epoch": 0.4535096948744927, + "flos": 23556983241600.0, + "grad_norm": 1.679511772595701, + "language_loss": 0.80522043, + "learning_rate": 2.395598464973746e-06, + "loss": 0.88235873, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14233398, + "step": 7543, + "time_per_iteration": 2.5102038383483887 + }, + { + "auxiliary_loss_clip": 0.06448692, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01256339, + "epoch": 0.4535698181271607, + "flos": 25564756849920.0, + "grad_norm": 1.5595363191014409, + "language_loss": 0.76234162, + "learning_rate": 2.395216690562469e-06, + "loss": 0.83952641, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13446045, + "step": 7544, + "time_per_iteration": 2.613546371459961 + }, + { + "auxiliary_loss_clip": 0.06450664, + "auxiliary_loss_mlp": 0.0127145, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01257747, + "epoch": 0.45362994137982865, + "flos": 24871171478400.0, + "grad_norm": 1.656067150864753, + "language_loss": 0.75691646, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.83413762, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.137146, + "step": 7545, + "time_per_iteration": 2.5587077140808105 + }, + { + "auxiliary_loss_clip": 0.06444206, + "auxiliary_loss_mlp": 0.01276554, + "balance_loss_clip": 0.06279359, + "balance_loss_mlp": 0.01263161, + "epoch": 0.4536900646324966, + "flos": 30814088711040.0, + "grad_norm": 1.7013764448707542, + "language_loss": 0.72677243, + "learning_rate": 2.394453096794423e-06, + "loss": 0.80397999, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13391113, + "step": 7546, + "time_per_iteration": 2.582507371902466 + }, + { + "auxiliary_loss_clip": 0.06454303, + "auxiliary_loss_mlp": 0.01276587, + "balance_loss_clip": 0.06282242, + "balance_loss_mlp": 0.01261531, + "epoch": 0.4537501878851646, + "flos": 23411060156160.0, + "grad_norm": 1.4140833040204603, + "language_loss": 0.76407051, + "learning_rate": 2.394071277466609e-06, + "loss": 0.8413794, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1505127, + "step": 7547, + "time_per_iteration": 2.5376148223876953 + }, + { + "auxiliary_loss_clip": 0.06452849, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06284454, + "balance_loss_mlp": 0.01258086, + "epoch": 0.45381031113783254, + "flos": 18154978041600.0, + "grad_norm": 1.9572251150113926, + "language_loss": 0.70011902, + "learning_rate": 2.393689443195573e-06, + "loss": 0.777372, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14367676, + "step": 7548, + "time_per_iteration": 2.519615650177002 + }, + { + "auxiliary_loss_clip": 0.0644725, + "auxiliary_loss_mlp": 0.01271972, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01258638, + "epoch": 0.4538704343905005, + "flos": 25343503344000.0, + "grad_norm": 2.0312160927741933, + "language_loss": 0.72993481, + "learning_rate": 2.393307593995794e-06, + "loss": 0.80712706, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13342285, + "step": 7549, + "time_per_iteration": 2.57501482963562 + }, + { + "auxiliary_loss_clip": 0.06446082, + "auxiliary_loss_mlp": 0.01269972, + "balance_loss_clip": 0.06283575, + "balance_loss_mlp": 0.01257312, + "epoch": 0.4539305576431685, + "flos": 28739118528000.0, + "grad_norm": 1.441987244253853, + "language_loss": 0.65387678, + "learning_rate": 2.392925729881751e-06, + "loss": 0.73103732, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.12658691, + "step": 7550, + "time_per_iteration": 2.5835819244384766 + }, + { + "auxiliary_loss_clip": 0.06445216, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01258162, + "epoch": 0.45399068089583644, + "flos": 22499030390400.0, + "grad_norm": 1.5764003430967004, + "language_loss": 0.6906575, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.76782334, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.13189697, + "step": 7551, + "time_per_iteration": 2.562033176422119 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06276844, + "balance_loss_mlp": 0.01259504, + "epoch": 0.45405080414850446, + "flos": 12897889678080.0, + "grad_norm": 1.6874134559177159, + "language_loss": 0.79426885, + "learning_rate": 2.392161956968798e-06, + "loss": 0.87142253, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13409424, + "step": 7552, + "time_per_iteration": 2.4449541568756104 + }, + { + "auxiliary_loss_clip": 0.063404, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06265783, + "balance_loss_mlp": 0.01260128, + "epoch": 0.4541109274011724, + "flos": 59783558912640.0, + "grad_norm": 0.8094629177090237, + "language_loss": 0.57832247, + "learning_rate": 2.39178004819885e-06, + "loss": 0.65435266, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.02496338, + "step": 7553, + "time_per_iteration": 3.089684247970581 + }, + { + "auxiliary_loss_clip": 0.06443945, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01258946, + "epoch": 0.4541710506538404, + "flos": 28519248614400.0, + "grad_norm": 1.8062911390055711, + "language_loss": 0.76727033, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.84443438, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13494873, + "step": 7554, + "time_per_iteration": 2.541727066040039 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.0126986, + "balance_loss_clip": 0.06284112, + "balance_loss_mlp": 0.0125559, + "epoch": 0.45423117390650836, + "flos": 17681304510720.0, + "grad_norm": 3.221825223389834, + "language_loss": 0.76701951, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.84421712, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1427002, + "step": 7555, + "time_per_iteration": 2.5190746784210205 + }, + { + "auxiliary_loss_clip": 0.06447887, + "auxiliary_loss_mlp": 0.01270234, + "balance_loss_clip": 0.06284074, + "balance_loss_mlp": 0.01256292, + "epoch": 0.4542912971591763, + "flos": 28079760349440.0, + "grad_norm": 1.2938327471401587, + "language_loss": 0.7293222, + "learning_rate": 2.390634232808903e-06, + "loss": 0.80650342, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13946533, + "step": 7556, + "time_per_iteration": 2.559330940246582 + }, + { + "auxiliary_loss_clip": 0.06452744, + "auxiliary_loss_mlp": 0.0127062, + "balance_loss_clip": 0.06282438, + "balance_loss_mlp": 0.01256351, + "epoch": 0.4543514204118443, + "flos": 22677922857600.0, + "grad_norm": 1.9930550713200077, + "language_loss": 0.63614035, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.71337396, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7557, + "time_per_iteration": 2.555694580078125 + }, + { + "auxiliary_loss_clip": 0.06341553, + "auxiliary_loss_mlp": 0.01256007, + "balance_loss_clip": 0.06267436, + "balance_loss_mlp": 0.01253351, + "epoch": 0.45441154366451225, + "flos": 58236027454080.0, + "grad_norm": 0.6640379644801875, + "language_loss": 0.57562745, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.65160298, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02658081, + "step": 7558, + "time_per_iteration": 5.871712684631348 + }, + { + "auxiliary_loss_clip": 0.06449831, + "auxiliary_loss_mlp": 0.01270129, + "balance_loss_clip": 0.06282432, + "balance_loss_mlp": 0.01255216, + "epoch": 0.4544716669171802, + "flos": 16769987504640.0, + "grad_norm": 2.2880587940678927, + "language_loss": 0.56438738, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.64158702, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14904785, + "step": 7559, + "time_per_iteration": 2.4660634994506836 + }, + { + "auxiliary_loss_clip": 0.06446083, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06282272, + "balance_loss_mlp": 0.01257728, + "epoch": 0.4545317901698482, + "flos": 15930814464000.0, + "grad_norm": 1.794091833084443, + "language_loss": 0.72316611, + "learning_rate": 2.389106271642792e-06, + "loss": 0.80034077, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13671875, + "step": 7560, + "time_per_iteration": 2.497083902359009 + }, + { + "auxiliary_loss_clip": 0.06455533, + "auxiliary_loss_mlp": 0.01271449, + "balance_loss_clip": 0.0628465, + "balance_loss_mlp": 0.01257096, + "epoch": 0.45459191342251615, + "flos": 17645567944320.0, + "grad_norm": 2.9678955818231167, + "language_loss": 0.69120479, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.76847458, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14355469, + "step": 7561, + "time_per_iteration": 2.463433027267456 + }, + { + "auxiliary_loss_clip": 0.06447616, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06286462, + "balance_loss_mlp": 0.01256161, + "epoch": 0.4546520366751841, + "flos": 16181557407360.0, + "grad_norm": 2.3534128933362277, + "language_loss": 0.85417646, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.93134332, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12908936, + "step": 7562, + "time_per_iteration": 2.5475013256073 + }, + { + "auxiliary_loss_clip": 0.06445649, + "auxiliary_loss_mlp": 0.01271177, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01257504, + "epoch": 0.4547121599278521, + "flos": 19756861672320.0, + "grad_norm": 1.7772924752060992, + "language_loss": 0.89642298, + "learning_rate": 2.38796014579055e-06, + "loss": 0.97359127, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13684082, + "step": 7563, + "time_per_iteration": 2.489121675491333 + }, + { + "auxiliary_loss_clip": 0.06453149, + "auxiliary_loss_mlp": 0.01274815, + "balance_loss_clip": 0.06286659, + "balance_loss_mlp": 0.01260397, + "epoch": 0.45477228318052004, + "flos": 19943510641920.0, + "grad_norm": 1.9263110789996643, + "language_loss": 0.71668887, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.79396844, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14428711, + "step": 7564, + "time_per_iteration": 2.4964044094085693 + }, + { + "auxiliary_loss_clip": 0.06450239, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01253912, + "epoch": 0.454832406433188, + "flos": 21294735183360.0, + "grad_norm": 2.0561067408009994, + "language_loss": 0.68633133, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7635116, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13879395, + "step": 7565, + "time_per_iteration": 4.080512762069702 + }, + { + "auxiliary_loss_clip": 0.06446166, + "auxiliary_loss_mlp": 0.01274343, + "balance_loss_clip": 0.06282604, + "balance_loss_mlp": 0.01260247, + "epoch": 0.45489252968585603, + "flos": 24505630041600.0, + "grad_norm": 2.0436514367854413, + "language_loss": 0.802881, + "learning_rate": 2.386813887534922e-06, + "loss": 0.88008606, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14105225, + "step": 7566, + "time_per_iteration": 2.521056890487671 + }, + { + "auxiliary_loss_clip": 0.06452477, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06286022, + "balance_loss_mlp": 0.01257558, + "epoch": 0.454952652938524, + "flos": 17098199147520.0, + "grad_norm": 2.208842453595512, + "language_loss": 0.74317467, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.82043159, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15661621, + "step": 7567, + "time_per_iteration": 2.515658140182495 + }, + { + "auxiliary_loss_clip": 0.06459296, + "auxiliary_loss_mlp": 0.01271605, + "balance_loss_clip": 0.06291091, + "balance_loss_mlp": 0.0125801, + "epoch": 0.45501277619119196, + "flos": 27636792140160.0, + "grad_norm": 1.5215577708435108, + "language_loss": 0.80959934, + "learning_rate": 2.386049642000249e-06, + "loss": 0.88690829, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13604736, + "step": 7568, + "time_per_iteration": 2.558258533477783 + }, + { + "auxiliary_loss_clip": 0.06466229, + "auxiliary_loss_mlp": 0.01276365, + "balance_loss_clip": 0.06294216, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4550728994438599, + "flos": 19980840435840.0, + "grad_norm": 1.8148678559144198, + "language_loss": 0.80280846, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.88023436, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.16186523, + "step": 7569, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06458277, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.06287743, + "balance_loss_mlp": 0.01254176, + "epoch": 0.4551330226965279, + "flos": 26073915384960.0, + "grad_norm": 1.3474740501928035, + "language_loss": 0.75202894, + "learning_rate": 2.385285337909412e-06, + "loss": 0.82929879, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14538574, + "step": 7570, + "time_per_iteration": 2.543170690536499 + }, + { + "auxiliary_loss_clip": 0.06452256, + "auxiliary_loss_mlp": 0.01273702, + "balance_loss_clip": 0.06289603, + "balance_loss_mlp": 0.01259826, + "epoch": 0.45519314594919585, + "flos": 32789396062080.0, + "grad_norm": 1.7878922954829848, + "language_loss": 0.74832451, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.82558417, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13879395, + "step": 7571, + "time_per_iteration": 4.052931308746338 + }, + { + "auxiliary_loss_clip": 0.06451707, + "auxiliary_loss_mlp": 0.01275937, + "balance_loss_clip": 0.06292738, + "balance_loss_mlp": 0.01261954, + "epoch": 0.4552532692018638, + "flos": 19178829480960.0, + "grad_norm": 1.5879241198756615, + "language_loss": 0.81163442, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.88891089, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13983154, + "step": 7572, + "time_per_iteration": 2.511032819747925 + }, + { + "auxiliary_loss_clip": 0.06461887, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06292465, + "balance_loss_mlp": 0.01254306, + "epoch": 0.4553133924545318, + "flos": 26033650698240.0, + "grad_norm": 2.340526601051543, + "language_loss": 0.72866237, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.80597222, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14788818, + "step": 7573, + "time_per_iteration": 2.5469906330108643 + }, + { + "auxiliary_loss_clip": 0.06470129, + "auxiliary_loss_mlp": 0.0127089, + "balance_loss_clip": 0.06300491, + "balance_loss_mlp": 0.01255094, + "epoch": 0.45537351570719975, + "flos": 30668920312320.0, + "grad_norm": 1.9189620807456311, + "language_loss": 0.74504352, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.82245368, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.15783691, + "step": 7574, + "time_per_iteration": 2.6484622955322266 + }, + { + "auxiliary_loss_clip": 0.06463373, + "auxiliary_loss_mlp": 0.01271034, + "balance_loss_clip": 0.06294367, + "balance_loss_mlp": 0.0125661, + "epoch": 0.4554336389598677, + "flos": 24360377788800.0, + "grad_norm": 1.669597443611077, + "language_loss": 0.71544576, + "learning_rate": 2.383374322259915e-06, + "loss": 0.79278982, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14428711, + "step": 7575, + "time_per_iteration": 2.544975519180298 + }, + { + "auxiliary_loss_clip": 0.06456485, + "auxiliary_loss_mlp": 0.01268004, + "balance_loss_clip": 0.06290726, + "balance_loss_mlp": 0.01253794, + "epoch": 0.4554937622125357, + "flos": 20564113507200.0, + "grad_norm": 1.7578928676474412, + "language_loss": 0.7370066, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.81425148, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14202881, + "step": 7576, + "time_per_iteration": 2.534135580062866 + }, + { + "auxiliary_loss_clip": 0.06453636, + "auxiliary_loss_mlp": 0.0127588, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01261551, + "epoch": 0.45555388546520365, + "flos": 22827451668480.0, + "grad_norm": 2.007695048360481, + "language_loss": 0.66580224, + "learning_rate": 2.382609814135511e-06, + "loss": 0.74309736, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14312744, + "step": 7577, + "time_per_iteration": 2.5095431804656982 + }, + { + "auxiliary_loss_clip": 0.06452672, + "auxiliary_loss_mlp": 0.01272369, + "balance_loss_clip": 0.0628684, + "balance_loss_mlp": 0.01256538, + "epoch": 0.4556140087178716, + "flos": 21732462512640.0, + "grad_norm": 1.904316861437945, + "language_loss": 0.74386835, + "learning_rate": 2.382227538303157e-06, + "loss": 0.82111871, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.15820312, + "step": 7578, + "time_per_iteration": 2.5497546195983887 + }, + { + "auxiliary_loss_clip": 0.06453466, + "auxiliary_loss_mlp": 0.01270181, + "balance_loss_clip": 0.06290053, + "balance_loss_mlp": 0.01256645, + "epoch": 0.45567413197053963, + "flos": 26001645638400.0, + "grad_norm": 1.7724513927111563, + "language_loss": 0.70436674, + "learning_rate": 2.381845247976697e-06, + "loss": 0.78160322, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13531494, + "step": 7579, + "time_per_iteration": 2.5318000316619873 + }, + { + "auxiliary_loss_clip": 0.06449443, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06286655, + "balance_loss_mlp": 0.01257664, + "epoch": 0.4557342552232076, + "flos": 21543046358400.0, + "grad_norm": 1.8462396851301097, + "language_loss": 0.78760922, + "learning_rate": 2.381462943170627e-06, + "loss": 0.86480927, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12902832, + "step": 7580, + "time_per_iteration": 2.5358526706695557 + }, + { + "auxiliary_loss_clip": 0.06450854, + "auxiliary_loss_mlp": 0.0127087, + "balance_loss_clip": 0.06288584, + "balance_loss_mlp": 0.01257822, + "epoch": 0.45579437847587556, + "flos": 40010932673280.0, + "grad_norm": 1.6599136037597217, + "language_loss": 0.68708634, + "learning_rate": 2.381080623899444e-06, + "loss": 0.76430357, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13049316, + "step": 7581, + "time_per_iteration": 2.667543888092041 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06289542, + "balance_loss_mlp": 0.01258678, + "epoch": 0.4558545017285435, + "flos": 31146409203840.0, + "grad_norm": 1.6471906775179725, + "language_loss": 0.7358638, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.81307691, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1383667, + "step": 7582, + "time_per_iteration": 2.6570708751678467 + }, + { + "auxiliary_loss_clip": 0.06455518, + "auxiliary_loss_mlp": 0.01272969, + "balance_loss_clip": 0.06286626, + "balance_loss_mlp": 0.01257818, + "epoch": 0.4559146249812115, + "flos": 21732210950400.0, + "grad_norm": 1.8620959272942483, + "language_loss": 0.73187852, + "learning_rate": 2.380315942019729e-06, + "loss": 0.80916339, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.15148926, + "step": 7583, + "time_per_iteration": 2.510700225830078 + }, + { + "auxiliary_loss_clip": 0.06455322, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06287013, + "balance_loss_mlp": 0.01256202, + "epoch": 0.45597474823387946, + "flos": 23812841283840.0, + "grad_norm": 1.81949303768272, + "language_loss": 0.72839421, + "learning_rate": 2.379933579440195e-06, + "loss": 0.80566895, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.1595459, + "step": 7584, + "time_per_iteration": 2.5747973918914795 + }, + { + "auxiliary_loss_clip": 0.06447833, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.0628446, + "balance_loss_mlp": 0.01255357, + "epoch": 0.4560348714865474, + "flos": 31913857549440.0, + "grad_norm": 1.7864940938501939, + "language_loss": 0.67957801, + "learning_rate": 2.379551202453541e-06, + "loss": 0.75673771, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12792969, + "step": 7585, + "time_per_iteration": 2.6153225898742676 + }, + { + "auxiliary_loss_clip": 0.0645072, + "auxiliary_loss_mlp": 0.01268647, + "balance_loss_clip": 0.06284043, + "balance_loss_mlp": 0.01254449, + "epoch": 0.4560949947392154, + "flos": 22054427026560.0, + "grad_norm": 1.7083540410775564, + "language_loss": 0.76353097, + "learning_rate": 2.379168811074267e-06, + "loss": 0.84072465, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14190674, + "step": 7586, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.06448488, + "auxiliary_loss_mlp": 0.01267379, + "balance_loss_clip": 0.0628647, + "balance_loss_mlp": 0.01254182, + "epoch": 0.45615511799188335, + "flos": 24578738328960.0, + "grad_norm": 1.819670635232321, + "language_loss": 0.78360641, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.86076516, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13189697, + "step": 7587, + "time_per_iteration": 2.5558509826660156 + }, + { + "auxiliary_loss_clip": 0.06459979, + "auxiliary_loss_mlp": 0.01275995, + "balance_loss_clip": 0.06286488, + "balance_loss_mlp": 0.01260152, + "epoch": 0.4562152412445513, + "flos": 18336260350080.0, + "grad_norm": 1.7968748305561377, + "language_loss": 0.69667047, + "learning_rate": 2.378403985195863e-06, + "loss": 0.77403021, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1583252, + "step": 7588, + "time_per_iteration": 2.5365071296691895 + }, + { + "auxiliary_loss_clip": 0.06447656, + "auxiliary_loss_mlp": 0.01274434, + "balance_loss_clip": 0.06286096, + "balance_loss_mlp": 0.01261422, + "epoch": 0.4562753644972193, + "flos": 13521595144320.0, + "grad_norm": 1.6774091429175193, + "language_loss": 0.79575098, + "learning_rate": 2.378021550725735e-06, + "loss": 0.87297189, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13006592, + "step": 7589, + "time_per_iteration": 2.484713315963745 + }, + { + "auxiliary_loss_clip": 0.06452583, + "auxiliary_loss_mlp": 0.01271771, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.0125774, + "epoch": 0.45633548774988725, + "flos": 29646871735680.0, + "grad_norm": 2.003946782113331, + "language_loss": 0.62696528, + "learning_rate": 2.377639101920992e-06, + "loss": 0.70420885, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14044189, + "step": 7590, + "time_per_iteration": 2.609936475753784 + }, + { + "auxiliary_loss_clip": 0.06445528, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01257496, + "epoch": 0.4563956110025552, + "flos": 22239398914560.0, + "grad_norm": 1.8300596662255737, + "language_loss": 0.73085624, + "learning_rate": 2.377256638796135e-06, + "loss": 0.80802, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13330078, + "step": 7591, + "time_per_iteration": 2.47824764251709 + }, + { + "auxiliary_loss_clip": 0.06452768, + "auxiliary_loss_mlp": 0.01273962, + "balance_loss_clip": 0.0628728, + "balance_loss_mlp": 0.01260205, + "epoch": 0.45645573425522323, + "flos": 17097696023040.0, + "grad_norm": 1.9979722051509847, + "language_loss": 0.77518493, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.85245228, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13751221, + "step": 7592, + "time_per_iteration": 2.5239169597625732 + }, + { + "auxiliary_loss_clip": 0.06449406, + "auxiliary_loss_mlp": 0.01273175, + "balance_loss_clip": 0.06284081, + "balance_loss_mlp": 0.01259954, + "epoch": 0.4565158575078912, + "flos": 20337367559040.0, + "grad_norm": 2.421698823443505, + "language_loss": 0.6941641, + "learning_rate": 2.376491669644098e-06, + "loss": 0.77138984, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13232422, + "step": 7593, + "time_per_iteration": 2.5688788890838623 + }, + { + "auxiliary_loss_clip": 0.06437326, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06278698, + "balance_loss_mlp": 0.01256034, + "epoch": 0.45657598076055916, + "flos": 23989008493440.0, + "grad_norm": 2.02887277896486, + "language_loss": 0.8417384, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.91879439, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12237549, + "step": 7594, + "time_per_iteration": 2.5792298316955566 + }, + { + "auxiliary_loss_clip": 0.06341574, + "auxiliary_loss_mlp": 0.01258819, + "balance_loss_clip": 0.06267718, + "balance_loss_mlp": 0.0125595, + "epoch": 0.45663610401322713, + "flos": 69382812908160.0, + "grad_norm": 0.7684087429591354, + "language_loss": 0.52710819, + "learning_rate": 2.375726643385654e-06, + "loss": 0.60311204, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.02864075, + "step": 7595, + "time_per_iteration": 3.150902509689331 + }, + { + "auxiliary_loss_clip": 0.06451569, + "auxiliary_loss_mlp": 0.01268714, + "balance_loss_clip": 0.06282795, + "balance_loss_mlp": 0.0125491, + "epoch": 0.4566962272658951, + "flos": 15152884358400.0, + "grad_norm": 2.304862186673624, + "language_loss": 0.8729161, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.95011896, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13824463, + "step": 7596, + "time_per_iteration": 2.490346908569336 + }, + { + "auxiliary_loss_clip": 0.0644666, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01257324, + "epoch": 0.45675635051856306, + "flos": 18703395014400.0, + "grad_norm": 1.5857620712679525, + "language_loss": 0.77719533, + "learning_rate": 2.374961560136843e-06, + "loss": 0.85436308, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12792969, + "step": 7597, + "time_per_iteration": 2.5043859481811523 + }, + { + "auxiliary_loss_clip": 0.0644691, + "auxiliary_loss_mlp": 0.01271101, + "balance_loss_clip": 0.06280024, + "balance_loss_mlp": 0.01256587, + "epoch": 0.456816473771231, + "flos": 19104211820160.0, + "grad_norm": 1.619707981694153, + "language_loss": 0.78513646, + "learning_rate": 2.374578997177314e-06, + "loss": 0.86231661, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14501953, + "step": 7598, + "time_per_iteration": 3.9724912643432617 + }, + { + "auxiliary_loss_clip": 0.06447135, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06284773, + "balance_loss_mlp": 0.01255508, + "epoch": 0.456876597023899, + "flos": 28957730630400.0, + "grad_norm": 2.2287540067942957, + "language_loss": 0.72171777, + "learning_rate": 2.374196420013712e-06, + "loss": 0.79887861, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13458252, + "step": 7599, + "time_per_iteration": 2.594240188598633 + }, + { + "auxiliary_loss_clip": 0.06445186, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06281814, + "balance_loss_mlp": 0.01256021, + "epoch": 0.45693672027656695, + "flos": 23295297340800.0, + "grad_norm": 1.7934880288039583, + "language_loss": 0.70205128, + "learning_rate": 2.373813828660544e-06, + "loss": 0.77919793, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13439941, + "step": 7600, + "time_per_iteration": 2.5063295364379883 + }, + { + "auxiliary_loss_clip": 0.06449603, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01256571, + "epoch": 0.4569968435292349, + "flos": 20564448923520.0, + "grad_norm": 2.031833923402261, + "language_loss": 0.78985888, + "learning_rate": 2.373431223132319e-06, + "loss": 0.86705881, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13824463, + "step": 7601, + "time_per_iteration": 2.559072494506836 + }, + { + "auxiliary_loss_clip": 0.06449661, + "auxiliary_loss_mlp": 0.0127022, + "balance_loss_clip": 0.06283583, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4570569667819029, + "flos": 41292403090560.0, + "grad_norm": 1.9704151582810323, + "language_loss": 0.71676505, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.79396379, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13134766, + "step": 7602, + "time_per_iteration": 2.6897006034851074 + }, + { + "auxiliary_loss_clip": 0.06446967, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280911, + "balance_loss_mlp": 0.01255843, + "epoch": 0.45711709003457085, + "flos": 26038807724160.0, + "grad_norm": 1.8547506252317059, + "language_loss": 0.73479527, + "learning_rate": 2.372665969608729e-06, + "loss": 0.81197369, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15026855, + "step": 7603, + "time_per_iteration": 2.5908169746398926 + }, + { + "auxiliary_loss_clip": 0.06447335, + "auxiliary_loss_mlp": 0.01269467, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01254077, + "epoch": 0.4571772132872388, + "flos": 22163649223680.0, + "grad_norm": 1.7365999934209901, + "language_loss": 0.83048642, + "learning_rate": 2.372283321642383e-06, + "loss": 0.90765446, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.15374756, + "step": 7604, + "time_per_iteration": 2.462653636932373 + }, + { + "auxiliary_loss_clip": 0.0645724, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06285316, + "balance_loss_mlp": 0.01256456, + "epoch": 0.45723733653990684, + "flos": 23885739936000.0, + "grad_norm": 1.8384947858044167, + "language_loss": 0.86237913, + "learning_rate": 2.371900659559016e-06, + "loss": 0.93966818, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15209961, + "step": 7605, + "time_per_iteration": 3.9711341857910156 + }, + { + "auxiliary_loss_clip": 0.0645397, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01253686, + "epoch": 0.4572974597925748, + "flos": 16877197203840.0, + "grad_norm": 1.5621441730902494, + "language_loss": 0.73368603, + "learning_rate": 2.371517983373138e-06, + "loss": 0.81090587, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14343262, + "step": 7606, + "time_per_iteration": 2.53171968460083 + }, + { + "auxiliary_loss_clip": 0.06450876, + "auxiliary_loss_mlp": 0.01272472, + "balance_loss_clip": 0.06281146, + "balance_loss_mlp": 0.01257118, + "epoch": 0.45735758304524277, + "flos": 13776530791680.0, + "grad_norm": 2.9980100906386324, + "language_loss": 0.80445778, + "learning_rate": 2.371135293099262e-06, + "loss": 0.88169128, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15356445, + "step": 7607, + "time_per_iteration": 2.4730136394500732 + }, + { + "auxiliary_loss_clip": 0.06449468, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252216, + "epoch": 0.45741770629791073, + "flos": 21106283351040.0, + "grad_norm": 1.9890456967063905, + "language_loss": 0.80849135, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.88565969, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15130615, + "step": 7608, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06445852, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01254576, + "epoch": 0.4574778295505787, + "flos": 23119675182720.0, + "grad_norm": 1.6776975313937859, + "language_loss": 0.68550682, + "learning_rate": 2.370369870345559e-06, + "loss": 0.76264954, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1385498, + "step": 7609, + "time_per_iteration": 2.5249829292297363 + }, + { + "auxiliary_loss_clip": 0.06446596, + "auxiliary_loss_mlp": 0.01267793, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01253917, + "epoch": 0.45753795280324666, + "flos": 24359832737280.0, + "grad_norm": 4.839518120228961, + "language_loss": 0.81053591, + "learning_rate": 2.369987137894757e-06, + "loss": 0.88767982, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13879395, + "step": 7610, + "time_per_iteration": 3.9629292488098145 + }, + { + "auxiliary_loss_clip": 0.06456244, + "auxiliary_loss_mlp": 0.01272187, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01258359, + "epoch": 0.4575980760559146, + "flos": 16659297861120.0, + "grad_norm": 2.22162560638367, + "language_loss": 0.82538879, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.90267307, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13848877, + "step": 7611, + "time_per_iteration": 2.483184337615967 + }, + { + "auxiliary_loss_clip": 0.06450104, + "auxiliary_loss_mlp": 0.01268987, + "balance_loss_clip": 0.06284404, + "balance_loss_mlp": 0.01254753, + "epoch": 0.4576581993085826, + "flos": 35919006860160.0, + "grad_norm": 1.7486456420241998, + "language_loss": 0.73840886, + "learning_rate": 2.369221630917819e-06, + "loss": 0.81559974, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14239502, + "step": 7612, + "time_per_iteration": 2.629122734069824 + }, + { + "auxiliary_loss_clip": 0.06446031, + "auxiliary_loss_mlp": 0.0126785, + "balance_loss_clip": 0.06281702, + "balance_loss_mlp": 0.01253711, + "epoch": 0.45771832256125056, + "flos": 20085995710080.0, + "grad_norm": 1.498537690587119, + "language_loss": 0.85104787, + "learning_rate": 2.368838856420711e-06, + "loss": 0.92818671, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7613, + "time_per_iteration": 2.4995853900909424 + }, + { + "auxiliary_loss_clip": 0.06450839, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.062853, + "balance_loss_mlp": 0.01257458, + "epoch": 0.4577784458139185, + "flos": 10749056520960.0, + "grad_norm": 2.317250545042104, + "language_loss": 0.75818133, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.8354038, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13946533, + "step": 7614, + "time_per_iteration": 2.5512688159942627 + }, + { + "auxiliary_loss_clip": 0.06447698, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06284869, + "balance_loss_mlp": 0.01254513, + "epoch": 0.4578385690665865, + "flos": 21913577112960.0, + "grad_norm": 1.7278714332693421, + "language_loss": 0.7495364, + "learning_rate": 2.368073265481791e-06, + "loss": 0.82670438, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14587402, + "step": 7615, + "time_per_iteration": 2.4959964752197266 + }, + { + "auxiliary_loss_clip": 0.06341572, + "auxiliary_loss_mlp": 0.01260056, + "balance_loss_clip": 0.06266811, + "balance_loss_mlp": 0.01256924, + "epoch": 0.45789869231925445, + "flos": 64774559036160.0, + "grad_norm": 0.7564263714074747, + "language_loss": 0.57682395, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.65284026, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.03129578, + "step": 7616, + "time_per_iteration": 3.1225674152374268 + }, + { + "auxiliary_loss_clip": 0.06451499, + "auxiliary_loss_mlp": 0.01269699, + "balance_loss_clip": 0.06287209, + "balance_loss_mlp": 0.01255299, + "epoch": 0.4579588155719224, + "flos": 16149594274560.0, + "grad_norm": 2.222129623674548, + "language_loss": 0.71319497, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.790407, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.144104, + "step": 7617, + "time_per_iteration": 2.535795211791992 + }, + { + "auxiliary_loss_clip": 0.06453606, + "auxiliary_loss_mlp": 0.01272033, + "balance_loss_clip": 0.06288601, + "balance_loss_mlp": 0.0125749, + "epoch": 0.45801893882459044, + "flos": 21401609466240.0, + "grad_norm": 1.7708953304075432, + "language_loss": 0.7611897, + "learning_rate": 2.36692477442939e-06, + "loss": 0.83844614, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14538574, + "step": 7618, + "time_per_iteration": 2.486976146697998 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01269962, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01256778, + "epoch": 0.4580790620772584, + "flos": 19542609982080.0, + "grad_norm": 1.989312042597275, + "language_loss": 0.76642346, + "learning_rate": 2.366541916231585e-06, + "loss": 0.84365678, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13195801, + "step": 7619, + "time_per_iteration": 2.5505213737487793 + }, + { + "auxiliary_loss_clip": 0.06448688, + "auxiliary_loss_mlp": 0.01269236, + "balance_loss_clip": 0.06287201, + "balance_loss_mlp": 0.01256242, + "epoch": 0.45813918532992637, + "flos": 16586608844160.0, + "grad_norm": 1.7634638926548802, + "language_loss": 0.72444797, + "learning_rate": 2.366159044134473e-06, + "loss": 0.80162722, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13018799, + "step": 7620, + "time_per_iteration": 2.5020828247070312 + }, + { + "auxiliary_loss_clip": 0.06448015, + "auxiliary_loss_mlp": 0.0127207, + "balance_loss_clip": 0.06286486, + "balance_loss_mlp": 0.01259243, + "epoch": 0.45819930858259433, + "flos": 42240085568640.0, + "grad_norm": 2.4478513756868168, + "language_loss": 0.77894747, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8561483, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12835693, + "step": 7621, + "time_per_iteration": 2.7115588188171387 + }, + { + "auxiliary_loss_clip": 0.06339111, + "auxiliary_loss_mlp": 0.01257981, + "balance_loss_clip": 0.06264743, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4582594318352623, + "flos": 63733335073920.0, + "grad_norm": 0.7682856550602313, + "language_loss": 0.64809114, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.72406203, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7622, + "time_per_iteration": 3.13112473487854 + }, + { + "auxiliary_loss_clip": 0.06452725, + "auxiliary_loss_mlp": 0.01272617, + "balance_loss_clip": 0.06286744, + "balance_loss_mlp": 0.01258449, + "epoch": 0.45831955508793026, + "flos": 26877226078080.0, + "grad_norm": 1.7433537302254658, + "language_loss": 0.79958743, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.87684089, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1416626, + "step": 7623, + "time_per_iteration": 2.6407015323638916 + }, + { + "auxiliary_loss_clip": 0.0645254, + "auxiliary_loss_mlp": 0.0127269, + "balance_loss_clip": 0.06285348, + "balance_loss_mlp": 0.01258528, + "epoch": 0.45837967834059823, + "flos": 18739886267520.0, + "grad_norm": 2.305548200028626, + "language_loss": 0.71172595, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.78897822, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14160156, + "step": 7624, + "time_per_iteration": 2.4580042362213135 + }, + { + "auxiliary_loss_clip": 0.06451602, + "auxiliary_loss_mlp": 0.01273069, + "balance_loss_clip": 0.06285381, + "balance_loss_mlp": 0.012593, + "epoch": 0.4584398015932662, + "flos": 21184380956160.0, + "grad_norm": 1.776025787081333, + "language_loss": 0.73132861, + "learning_rate": 2.364244475667491e-06, + "loss": 0.80857527, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13763428, + "step": 7625, + "time_per_iteration": 2.5352139472961426 + }, + { + "auxiliary_loss_clip": 0.06452388, + "auxiliary_loss_mlp": 0.01273572, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.01259857, + "epoch": 0.45849992484593416, + "flos": 19795826620800.0, + "grad_norm": 3.130746647878431, + "language_loss": 0.78340298, + "learning_rate": 2.363861520479451e-06, + "loss": 0.86066258, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.137146, + "step": 7626, + "time_per_iteration": 2.4839165210723877 + }, + { + "auxiliary_loss_clip": 0.06454711, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06286182, + "balance_loss_mlp": 0.01257284, + "epoch": 0.4585600480986021, + "flos": 18229134504960.0, + "grad_norm": 1.6201293476115848, + "language_loss": 0.85071468, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.92797422, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1394043, + "step": 7627, + "time_per_iteration": 2.5822484493255615 + }, + { + "auxiliary_loss_clip": 0.06454201, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06285322, + "balance_loss_mlp": 0.01255634, + "epoch": 0.4586201713512701, + "flos": 29029748814720.0, + "grad_norm": 1.6524494424678404, + "language_loss": 0.69812655, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.77537024, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14544678, + "step": 7628, + "time_per_iteration": 2.5642716884613037 + }, + { + "auxiliary_loss_clip": 0.06450283, + "auxiliary_loss_mlp": 0.01272737, + "balance_loss_clip": 0.06287684, + "balance_loss_mlp": 0.01258492, + "epoch": 0.45868029460393805, + "flos": 23411395572480.0, + "grad_norm": 1.512396631295222, + "language_loss": 0.78590345, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.86313355, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.14245605, + "step": 7629, + "time_per_iteration": 2.5380680561065674 + }, + { + "auxiliary_loss_clip": 0.0645413, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06283213, + "balance_loss_mlp": 0.01258372, + "epoch": 0.458740417856606, + "flos": 18227625131520.0, + "grad_norm": 2.58579854057945, + "language_loss": 0.7964831, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.87376225, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1541748, + "step": 7630, + "time_per_iteration": 2.4736902713775635 + }, + { + "auxiliary_loss_clip": 0.0645593, + "auxiliary_loss_mlp": 0.01273082, + "balance_loss_clip": 0.06288286, + "balance_loss_mlp": 0.01258378, + "epoch": 0.458800541109274, + "flos": 34577341683840.0, + "grad_norm": 2.0263904819558243, + "language_loss": 0.72204614, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.79933631, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14715576, + "step": 7631, + "time_per_iteration": 2.8143060207366943 + }, + { + "auxiliary_loss_clip": 0.06451838, + "auxiliary_loss_mlp": 0.01269985, + "balance_loss_clip": 0.06285281, + "balance_loss_mlp": 0.0125565, + "epoch": 0.458860664361942, + "flos": 17717837690880.0, + "grad_norm": 2.417001672331849, + "language_loss": 0.71850061, + "learning_rate": 2.361563500108531e-06, + "loss": 0.79571879, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14324951, + "step": 7632, + "time_per_iteration": 2.616152048110962 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285533, + "balance_loss_mlp": 0.01258055, + "epoch": 0.45892078761460997, + "flos": 18447746607360.0, + "grad_norm": 2.3994338935229784, + "language_loss": 0.69457287, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.7718581, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7633, + "time_per_iteration": 2.544916868209839 + }, + { + "auxiliary_loss_clip": 0.06450637, + "auxiliary_loss_mlp": 0.01269265, + "balance_loss_clip": 0.06284192, + "balance_loss_mlp": 0.01255055, + "epoch": 0.45898091086727794, + "flos": 22679306449920.0, + "grad_norm": 1.6111707393144439, + "language_loss": 0.81188464, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.88908368, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14208984, + "step": 7634, + "time_per_iteration": 2.508498430252075 + }, + { + "auxiliary_loss_clip": 0.06458217, + "auxiliary_loss_mlp": 0.0127198, + "balance_loss_clip": 0.06285305, + "balance_loss_mlp": 0.01256995, + "epoch": 0.4590410341199459, + "flos": 21659396152320.0, + "grad_norm": 1.6788945577423258, + "language_loss": 0.8141619, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.89146382, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15002441, + "step": 7635, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06450347, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06285377, + "balance_loss_mlp": 0.01258095, + "epoch": 0.45910115737261387, + "flos": 36543676648320.0, + "grad_norm": 1.5202825589824251, + "language_loss": 0.65088654, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.72811085, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13995361, + "step": 7636, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.06449063, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06286588, + "balance_loss_mlp": 0.0125376, + "epoch": 0.45916128062528183, + "flos": 24425771500800.0, + "grad_norm": 1.3857173948582018, + "language_loss": 0.80552399, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.88268924, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13702393, + "step": 7637, + "time_per_iteration": 4.1112189292907715 + }, + { + "auxiliary_loss_clip": 0.06456389, + "auxiliary_loss_mlp": 0.0127208, + "balance_loss_clip": 0.06286228, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4592214038779498, + "flos": 23228687744640.0, + "grad_norm": 2.823234077565048, + "language_loss": 0.75517625, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.83246088, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14990234, + "step": 7638, + "time_per_iteration": 3.910426616668701 + }, + { + "auxiliary_loss_clip": 0.06446041, + "auxiliary_loss_mlp": 0.01269213, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01254824, + "epoch": 0.45928152713061776, + "flos": 19178200575360.0, + "grad_norm": 1.717868731304971, + "language_loss": 0.74023581, + "learning_rate": 2.358881852733989e-06, + "loss": 0.81738836, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14373779, + "step": 7639, + "time_per_iteration": 2.566300630569458 + }, + { + "auxiliary_loss_clip": 0.06454983, + "auxiliary_loss_mlp": 0.01270543, + "balance_loss_clip": 0.06286465, + "balance_loss_mlp": 0.01255165, + "epoch": 0.4593416503832857, + "flos": 22420513514880.0, + "grad_norm": 1.8698154023651474, + "language_loss": 0.683029, + "learning_rate": 2.358498705700346e-06, + "loss": 0.76028425, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15380859, + "step": 7640, + "time_per_iteration": 2.5371484756469727 + }, + { + "auxiliary_loss_clip": 0.06455723, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06285085, + "balance_loss_mlp": 0.01256454, + "epoch": 0.4594017736359537, + "flos": 18886228623360.0, + "grad_norm": 1.657871276405927, + "language_loss": 0.76190329, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.83916861, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14367676, + "step": 7641, + "time_per_iteration": 2.633190631866455 + }, + { + "auxiliary_loss_clip": 0.06450865, + "auxiliary_loss_mlp": 0.01271757, + "balance_loss_clip": 0.06281709, + "balance_loss_mlp": 0.01256749, + "epoch": 0.45946189688862166, + "flos": 20524268090880.0, + "grad_norm": 2.1109400166256753, + "language_loss": 0.75088501, + "learning_rate": 2.357732370864668e-06, + "loss": 0.82811123, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15008545, + "step": 7642, + "time_per_iteration": 2.497342824935913 + }, + { + "auxiliary_loss_clip": 0.06325873, + "auxiliary_loss_mlp": 0.01255986, + "balance_loss_clip": 0.06252096, + "balance_loss_mlp": 0.01253583, + "epoch": 0.4595220201412896, + "flos": 61422436920960.0, + "grad_norm": 0.8082143270085457, + "language_loss": 0.58238232, + "learning_rate": 2.357349183091694e-06, + "loss": 0.65820098, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.02400208, + "step": 7643, + "time_per_iteration": 2.9001851081848145 + }, + { + "auxiliary_loss_clip": 0.06454818, + "auxiliary_loss_mlp": 0.01269178, + "balance_loss_clip": 0.06279951, + "balance_loss_mlp": 0.01254467, + "epoch": 0.4595821433939576, + "flos": 23337616452480.0, + "grad_norm": 1.460564072578963, + "language_loss": 0.93123877, + "learning_rate": 2.3569659817680016e-06, + "loss": 1.00847864, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14709473, + "step": 7644, + "time_per_iteration": 3.956286668777466 + }, + { + "auxiliary_loss_clip": 0.06453376, + "auxiliary_loss_mlp": 0.01272616, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.01258591, + "epoch": 0.4596422666466256, + "flos": 14287492189440.0, + "grad_norm": 2.5856018073831954, + "language_loss": 0.82780254, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.90506244, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 7645, + "time_per_iteration": 2.5230045318603516 + }, + { + "auxiliary_loss_clip": 0.0632263, + "auxiliary_loss_mlp": 0.0125685, + "balance_loss_clip": 0.06249407, + "balance_loss_mlp": 0.01254095, + "epoch": 0.4597023898992936, + "flos": 65747188103040.0, + "grad_norm": 0.7461836102968291, + "language_loss": 0.59904981, + "learning_rate": 2.356199538526593e-06, + "loss": 0.67484462, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.02758789, + "step": 7646, + "time_per_iteration": 3.0677428245544434 + }, + { + "auxiliary_loss_clip": 0.06451902, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.01257931, + "epoch": 0.45976251315196154, + "flos": 26914430090880.0, + "grad_norm": 1.5401961064627432, + "language_loss": 0.72954202, + "learning_rate": 2.355816296637939e-06, + "loss": 0.80678499, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14465332, + "step": 7647, + "time_per_iteration": 2.5715911388397217 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01270403, + "balance_loss_clip": 0.06283608, + "balance_loss_mlp": 0.0125586, + "epoch": 0.4598226364046295, + "flos": 26625854229120.0, + "grad_norm": 1.5262276937698116, + "language_loss": 0.66966379, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.74692625, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14526367, + "step": 7648, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06453076, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.06283541, + "balance_loss_mlp": 0.01256562, + "epoch": 0.45988275965729747, + "flos": 24394395346560.0, + "grad_norm": 1.3937992948207578, + "language_loss": 0.78837889, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.86561614, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14093018, + "step": 7649, + "time_per_iteration": 3.961230754852295 + }, + { + "auxiliary_loss_clip": 0.06449774, + "auxiliary_loss_mlp": 0.01273295, + "balance_loss_clip": 0.06282938, + "balance_loss_mlp": 0.01258221, + "epoch": 0.45994288290996543, + "flos": 24542834054400.0, + "grad_norm": 2.427132979105608, + "language_loss": 0.694453, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.77168369, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15087891, + "step": 7650, + "time_per_iteration": 2.5870516300201416 + }, + { + "auxiliary_loss_clip": 0.06454967, + "auxiliary_loss_mlp": 0.01271386, + "balance_loss_clip": 0.06281558, + "balance_loss_mlp": 0.01255876, + "epoch": 0.4600030061626334, + "flos": 14835573745920.0, + "grad_norm": 2.508823744651641, + "language_loss": 0.84580773, + "learning_rate": 2.354283194302761e-06, + "loss": 0.92307127, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15515137, + "step": 7651, + "time_per_iteration": 2.4682910442352295 + }, + { + "auxiliary_loss_clip": 0.06447899, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.06282218, + "balance_loss_mlp": 0.01255567, + "epoch": 0.46006312941530136, + "flos": 18119702672640.0, + "grad_norm": 2.0398588051370536, + "language_loss": 0.75204146, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.82921767, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14160156, + "step": 7652, + "time_per_iteration": 2.533160448074341 + }, + { + "auxiliary_loss_clip": 0.06453463, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06283025, + "balance_loss_mlp": 0.01253803, + "epoch": 0.46012325266796933, + "flos": 21982157280000.0, + "grad_norm": 1.8219910575186118, + "language_loss": 0.76111704, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.83833146, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14154053, + "step": 7653, + "time_per_iteration": 2.607556104660034 + }, + { + "auxiliary_loss_clip": 0.06466014, + "auxiliary_loss_mlp": 0.01279742, + "balance_loss_clip": 0.06286691, + "balance_loss_mlp": 0.01262618, + "epoch": 0.4601833759206373, + "flos": 15273468783360.0, + "grad_norm": 1.9930521100890286, + "language_loss": 0.66339052, + "learning_rate": 2.353133226438741e-06, + "loss": 0.74084806, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.17132568, + "step": 7654, + "time_per_iteration": 2.5845115184783936 + }, + { + "auxiliary_loss_clip": 0.06450775, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01260524, + "epoch": 0.46024349917330526, + "flos": 27096299377920.0, + "grad_norm": 1.834954182024095, + "language_loss": 0.79552221, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.87276679, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1315918, + "step": 7655, + "time_per_iteration": 2.5619075298309326 + }, + { + "auxiliary_loss_clip": 0.06446843, + "auxiliary_loss_mlp": 0.01271784, + "balance_loss_clip": 0.06282479, + "balance_loss_mlp": 0.0125795, + "epoch": 0.4603036224259732, + "flos": 24469935402240.0, + "grad_norm": 1.525008853184554, + "language_loss": 0.68020397, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.7573902, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13824463, + "step": 7656, + "time_per_iteration": 2.534085988998413 + }, + { + "auxiliary_loss_clip": 0.06450829, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254249, + "epoch": 0.4603637456786412, + "flos": 28116545091840.0, + "grad_norm": 1.6883930229899933, + "language_loss": 0.81940675, + "learning_rate": 2.351983138057098e-06, + "loss": 0.89660037, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14300537, + "step": 7657, + "time_per_iteration": 2.6093909740448 + }, + { + "auxiliary_loss_clip": 0.06452166, + "auxiliary_loss_mlp": 0.01272452, + "balance_loss_clip": 0.06283732, + "balance_loss_mlp": 0.01257598, + "epoch": 0.4604238689313092, + "flos": 24355178835840.0, + "grad_norm": 1.9081069655960825, + "language_loss": 0.70684779, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.78409398, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1484375, + "step": 7658, + "time_per_iteration": 2.5257532596588135 + }, + { + "auxiliary_loss_clip": 0.06333129, + "auxiliary_loss_mlp": 0.01254207, + "balance_loss_clip": 0.06259783, + "balance_loss_mlp": 0.01251698, + "epoch": 0.4604839921839772, + "flos": 53622742337280.0, + "grad_norm": 1.3056028191134426, + "language_loss": 0.6180622, + "learning_rate": 2.351216345708928e-06, + "loss": 0.69393557, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02508545, + "step": 7659, + "time_per_iteration": 3.2051191329956055 + }, + { + "auxiliary_loss_clip": 0.06450778, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06284198, + "balance_loss_mlp": 0.01254692, + "epoch": 0.46054411543664514, + "flos": 31256428014720.0, + "grad_norm": 1.6821089703035916, + "language_loss": 0.68614, + "learning_rate": 2.350832929550336e-06, + "loss": 0.76335192, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1572876, + "step": 7660, + "time_per_iteration": 2.5768120288848877 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06285393, + "balance_loss_mlp": 0.01254843, + "epoch": 0.4606042386893131, + "flos": 24098943450240.0, + "grad_norm": 1.8024702284570222, + "language_loss": 0.76982367, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.84707713, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14782715, + "step": 7661, + "time_per_iteration": 2.5556533336639404 + }, + { + "auxiliary_loss_clip": 0.06448123, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06284644, + "balance_loss_mlp": 0.01257511, + "epoch": 0.46066436194198107, + "flos": 26585715323520.0, + "grad_norm": 1.64374674726695, + "language_loss": 0.75330603, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.8304925, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13000488, + "step": 7662, + "time_per_iteration": 2.5430636405944824 + }, + { + "auxiliary_loss_clip": 0.064645, + "auxiliary_loss_mlp": 0.01271435, + "balance_loss_clip": 0.06287506, + "balance_loss_mlp": 0.01255807, + "epoch": 0.46072448519464904, + "flos": 17779751458560.0, + "grad_norm": 2.8997354943734144, + "language_loss": 0.79542935, + "learning_rate": 2.349682601310998e-06, + "loss": 0.87278873, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15625, + "step": 7663, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.06451327, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.0628781, + "balance_loss_mlp": 0.01256344, + "epoch": 0.460784608447317, + "flos": 15091557569280.0, + "grad_norm": 1.9500633364095115, + "language_loss": 0.73664737, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.81386459, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.14050293, + "step": 7664, + "time_per_iteration": 2.5058319568634033 + }, + { + "auxiliary_loss_clip": 0.06454196, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06286658, + "balance_loss_mlp": 0.01255403, + "epoch": 0.46084473169998497, + "flos": 18594214744320.0, + "grad_norm": 1.4541358898310397, + "language_loss": 0.72731769, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.80455625, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14257812, + "step": 7665, + "time_per_iteration": 2.5651309490203857 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.01269476, + "balance_loss_clip": 0.06283794, + "balance_loss_mlp": 0.01255016, + "epoch": 0.46090485495265293, + "flos": 19499955454080.0, + "grad_norm": 1.6858212343920378, + "language_loss": 0.78057897, + "learning_rate": 2.348532153731669e-06, + "loss": 0.85779405, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14459229, + "step": 7666, + "time_per_iteration": 2.4884724617004395 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06288874, + "balance_loss_mlp": 0.01262982, + "epoch": 0.4609649782053209, + "flos": 33373339966080.0, + "grad_norm": 1.3323556356345916, + "language_loss": 0.7438637, + "learning_rate": 2.348148644753088e-06, + "loss": 0.82119334, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15270996, + "step": 7667, + "time_per_iteration": 2.6961426734924316 + }, + { + "auxiliary_loss_clip": 0.06450665, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06283414, + "balance_loss_mlp": 0.01253574, + "epoch": 0.46102510145798886, + "flos": 23775972687360.0, + "grad_norm": 1.463924526715157, + "language_loss": 0.76157856, + "learning_rate": 2.347765122572676e-06, + "loss": 0.83875835, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1373291, + "step": 7668, + "time_per_iteration": 2.517401933670044 + }, + { + "auxiliary_loss_clip": 0.06446877, + "auxiliary_loss_mlp": 0.0126819, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01254982, + "epoch": 0.4610852247106568, + "flos": 23301544469760.0, + "grad_norm": 1.5533292001822034, + "language_loss": 0.78315312, + "learning_rate": 2.347381587204975e-06, + "loss": 0.86030376, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13208008, + "step": 7669, + "time_per_iteration": 2.58445405960083 + }, + { + "auxiliary_loss_clip": 0.06450041, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06282575, + "balance_loss_mlp": 0.01251286, + "epoch": 0.4611453479633248, + "flos": 25454528403840.0, + "grad_norm": 1.739851036429443, + "language_loss": 0.83272684, + "learning_rate": 2.34699803866453e-06, + "loss": 0.90987396, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13391113, + "step": 7670, + "time_per_iteration": 2.5387001037597656 + }, + { + "auxiliary_loss_clip": 0.06451756, + "auxiliary_loss_mlp": 0.01270534, + "balance_loss_clip": 0.06288445, + "balance_loss_mlp": 0.01257129, + "epoch": 0.4612054712159928, + "flos": 21145541788800.0, + "grad_norm": 1.8274954721629995, + "language_loss": 0.63656652, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.7137894, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1340332, + "step": 7671, + "time_per_iteration": 2.5336413383483887 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251787, + "balance_loss_clip": 0.0626289, + "balance_loss_mlp": 0.01249119, + "epoch": 0.4612655944686608, + "flos": 69979754194560.0, + "grad_norm": 0.792480479203595, + "language_loss": 0.55791217, + "learning_rate": 2.346230902123583e-06, + "loss": 0.63378698, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.02670288, + "step": 7672, + "time_per_iteration": 3.2302184104919434 + }, + { + "auxiliary_loss_clip": 0.06453065, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06283592, + "balance_loss_mlp": 0.01253213, + "epoch": 0.46132571772132874, + "flos": 16842844229760.0, + "grad_norm": 2.026723370874256, + "language_loss": 0.71486014, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.79206014, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13720703, + "step": 7673, + "time_per_iteration": 2.5307891368865967 + }, + { + "auxiliary_loss_clip": 0.06444372, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06280223, + "balance_loss_mlp": 0.01254014, + "epoch": 0.4613858409739967, + "flos": 35817666946560.0, + "grad_norm": 1.6118988477871892, + "language_loss": 0.70779812, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7849164, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13446045, + "step": 7674, + "time_per_iteration": 2.67787766456604 + }, + { + "auxiliary_loss_clip": 0.06445141, + "auxiliary_loss_mlp": 0.01269162, + "balance_loss_clip": 0.06278897, + "balance_loss_mlp": 0.01255554, + "epoch": 0.4614459642266647, + "flos": 35276251789440.0, + "grad_norm": 1.4817902433092767, + "language_loss": 0.65456873, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.73171175, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1362915, + "step": 7675, + "time_per_iteration": 2.683043956756592 + }, + { + "auxiliary_loss_clip": 0.06330552, + "auxiliary_loss_mlp": 0.01253837, + "balance_loss_clip": 0.06257802, + "balance_loss_mlp": 0.01251083, + "epoch": 0.46150608747933264, + "flos": 66723311842560.0, + "grad_norm": 0.7159632658119685, + "language_loss": 0.58438665, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.66023052, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02758789, + "step": 7676, + "time_per_iteration": 3.2052080631256104 + }, + { + "auxiliary_loss_clip": 0.06331712, + "auxiliary_loss_mlp": 0.01253621, + "balance_loss_clip": 0.06258753, + "balance_loss_mlp": 0.01250806, + "epoch": 0.4615662107320006, + "flos": 55846780133760.0, + "grad_norm": 0.7666580083801284, + "language_loss": 0.62806678, + "learning_rate": 2.344312831266341e-06, + "loss": 0.70392013, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02810669, + "step": 7677, + "time_per_iteration": 5.753543853759766 + }, + { + "auxiliary_loss_clip": 0.06441256, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06278154, + "balance_loss_mlp": 0.012564, + "epoch": 0.46162633398466857, + "flos": 15488055889920.0, + "grad_norm": 2.0928007642005224, + "language_loss": 0.7694543, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.84655911, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12817383, + "step": 7678, + "time_per_iteration": 2.5979206562042236 + }, + { + "auxiliary_loss_clip": 0.06447493, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06279032, + "balance_loss_mlp": 0.01253672, + "epoch": 0.46168645723733653, + "flos": 20017667105280.0, + "grad_norm": 1.9130482273301792, + "language_loss": 0.66792345, + "learning_rate": 2.343545511426974e-06, + "loss": 0.74506873, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13360596, + "step": 7679, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06445532, + "auxiliary_loss_mlp": 0.0127232, + "balance_loss_clip": 0.06279338, + "balance_loss_mlp": 0.01259409, + "epoch": 0.4617465804900045, + "flos": 20304020833920.0, + "grad_norm": 2.6299917180378203, + "language_loss": 0.702595, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.77977353, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.12921143, + "step": 7680, + "time_per_iteration": 2.475419282913208 + }, + { + "auxiliary_loss_clip": 0.06449848, + "auxiliary_loss_mlp": 0.01274843, + "balance_loss_clip": 0.06279959, + "balance_loss_mlp": 0.01260454, + "epoch": 0.46180670374267246, + "flos": 22352897669760.0, + "grad_norm": 1.6539051623213383, + "language_loss": 0.63903129, + "learning_rate": 2.342778139478487e-06, + "loss": 0.7162782, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14398193, + "step": 7681, + "time_per_iteration": 2.518878698348999 + }, + { + "auxiliary_loss_clip": 0.06438938, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01255566, + "epoch": 0.46186682699534043, + "flos": 19900856113920.0, + "grad_norm": 1.5795449228659066, + "language_loss": 0.67458999, + "learning_rate": 2.342394433999697e-06, + "loss": 0.75165695, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12194824, + "step": 7682, + "time_per_iteration": 2.4734294414520264 + }, + { + "auxiliary_loss_clip": 0.06442823, + "auxiliary_loss_mlp": 0.01267731, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4619269502480084, + "flos": 31511573297280.0, + "grad_norm": 2.0778412213868025, + "language_loss": 0.74573362, + "learning_rate": 2.342010715537275e-06, + "loss": 0.82283914, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1317749, + "step": 7683, + "time_per_iteration": 2.5680744647979736 + }, + { + "auxiliary_loss_clip": 0.0644316, + "auxiliary_loss_mlp": 0.01269615, + "balance_loss_clip": 0.06278165, + "balance_loss_mlp": 0.01255995, + "epoch": 0.46198707350067636, + "flos": 25016465658240.0, + "grad_norm": 2.034673139361796, + "language_loss": 0.77701104, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.85413885, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13604736, + "step": 7684, + "time_per_iteration": 3.9865663051605225 + }, + { + "auxiliary_loss_clip": 0.06455924, + "auxiliary_loss_mlp": 0.01269534, + "balance_loss_clip": 0.06282193, + "balance_loss_mlp": 0.01255074, + "epoch": 0.4620471967533444, + "flos": 18297588890880.0, + "grad_norm": 1.7679070884814239, + "language_loss": 0.79849184, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.87574637, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14471436, + "step": 7685, + "time_per_iteration": 2.4874165058135986 + }, + { + "auxiliary_loss_clip": 0.06442665, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06282581, + "balance_loss_mlp": 0.01254151, + "epoch": 0.46210732000601235, + "flos": 33993607415040.0, + "grad_norm": 2.697729181890728, + "language_loss": 0.66966581, + "learning_rate": 2.340859482393731e-06, + "loss": 0.74678075, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14678955, + "step": 7686, + "time_per_iteration": 2.673029661178589 + }, + { + "auxiliary_loss_clip": 0.06450719, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06281859, + "balance_loss_mlp": 0.01255929, + "epoch": 0.4621674432586803, + "flos": 25016381804160.0, + "grad_norm": 1.8957956969587364, + "language_loss": 0.7416718, + "learning_rate": 2.340475712142296e-06, + "loss": 0.81888342, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 7687, + "time_per_iteration": 2.520526885986328 + }, + { + "auxiliary_loss_clip": 0.06441881, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254943, + "epoch": 0.4622275665113483, + "flos": 22019906344320.0, + "grad_norm": 2.1641165257521098, + "language_loss": 0.75034606, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.82745045, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13623047, + "step": 7688, + "time_per_iteration": 2.6087183952331543 + }, + { + "auxiliary_loss_clip": 0.06442745, + "auxiliary_loss_mlp": 0.01266791, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.0125375, + "epoch": 0.46228768976401624, + "flos": 24065303235840.0, + "grad_norm": 1.76695871159964, + "language_loss": 0.78822517, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.86532056, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13043213, + "step": 7689, + "time_per_iteration": 4.008488416671753 + }, + { + "auxiliary_loss_clip": 0.0644816, + "auxiliary_loss_mlp": 0.01269125, + "balance_loss_clip": 0.06278446, + "balance_loss_mlp": 0.01254116, + "epoch": 0.4623478130166842, + "flos": 26658655902720.0, + "grad_norm": 2.4003711776889936, + "language_loss": 0.56824899, + "learning_rate": 2.339324323980964e-06, + "loss": 0.6454218, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15020752, + "step": 7690, + "time_per_iteration": 2.586726665496826 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01270548, + "balance_loss_clip": 0.06281572, + "balance_loss_mlp": 0.01256421, + "epoch": 0.46240793626935217, + "flos": 20564700485760.0, + "grad_norm": 2.1153050114919387, + "language_loss": 0.83470464, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.91190875, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14135742, + "step": 7691, + "time_per_iteration": 2.5688517093658447 + }, + { + "auxiliary_loss_clip": 0.06446303, + "auxiliary_loss_mlp": 0.01268112, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01254528, + "epoch": 0.46246805952202014, + "flos": 22462706845440.0, + "grad_norm": 1.4394066258336355, + "language_loss": 0.75601387, + "learning_rate": 2.338556667513091e-06, + "loss": 0.83315802, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13604736, + "step": 7692, + "time_per_iteration": 2.537447929382324 + }, + { + "auxiliary_loss_clip": 0.06447245, + "auxiliary_loss_mlp": 0.01269367, + "balance_loss_clip": 0.06279314, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4625281827746881, + "flos": 35049673549440.0, + "grad_norm": 1.4816622996820314, + "language_loss": 0.74488908, + "learning_rate": 2.338172820014723e-06, + "loss": 0.82205522, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14038086, + "step": 7693, + "time_per_iteration": 2.655733823776245 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01269271, + "balance_loss_clip": 0.06283827, + "balance_loss_mlp": 0.01255496, + "epoch": 0.46258830602735607, + "flos": 21074907196800.0, + "grad_norm": 1.4111581138712515, + "language_loss": 0.85637844, + "learning_rate": 2.337788959692808e-06, + "loss": 0.93355894, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13781738, + "step": 7694, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06447286, + "auxiliary_loss_mlp": 0.01268569, + "balance_loss_clip": 0.06280261, + "balance_loss_mlp": 0.01254979, + "epoch": 0.46264842928002403, + "flos": 26184437320320.0, + "grad_norm": 2.8233556574725744, + "language_loss": 0.79577935, + "learning_rate": 2.337405086561902e-06, + "loss": 0.87293792, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13586426, + "step": 7695, + "time_per_iteration": 2.569974660873413 + }, + { + "auxiliary_loss_clip": 0.06442414, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.0628098, + "balance_loss_mlp": 0.01258432, + "epoch": 0.462708552532692, + "flos": 16769903650560.0, + "grad_norm": 1.6398131561505984, + "language_loss": 0.72464627, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.80177617, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12133789, + "step": 7696, + "time_per_iteration": 2.49324369430542 + }, + { + "auxiliary_loss_clip": 0.06448425, + "auxiliary_loss_mlp": 0.01269091, + "balance_loss_clip": 0.06281986, + "balance_loss_mlp": 0.01256139, + "epoch": 0.46276867578535996, + "flos": 15565985786880.0, + "grad_norm": 1.5682310460433448, + "language_loss": 0.69151074, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.76868594, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12945557, + "step": 7697, + "time_per_iteration": 2.5437402725219727 + }, + { + "auxiliary_loss_clip": 0.06445374, + "auxiliary_loss_mlp": 0.01272368, + "balance_loss_clip": 0.06278891, + "balance_loss_mlp": 0.01258903, + "epoch": 0.462828799038028, + "flos": 22421352055680.0, + "grad_norm": 2.477481810490018, + "language_loss": 0.84870285, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.92588031, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13470459, + "step": 7698, + "time_per_iteration": 2.5088558197021484 + }, + { + "auxiliary_loss_clip": 0.06449191, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06284188, + "balance_loss_mlp": 0.01255883, + "epoch": 0.46288892229069595, + "flos": 21075997299840.0, + "grad_norm": 1.5978854439043657, + "language_loss": 0.71711451, + "learning_rate": 2.335869466239502e-06, + "loss": 0.79430336, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13824463, + "step": 7699, + "time_per_iteration": 2.572908639907837 + }, + { + "auxiliary_loss_clip": 0.06453253, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06283245, + "balance_loss_mlp": 0.01253952, + "epoch": 0.4629490455433639, + "flos": 23192448053760.0, + "grad_norm": 3.9296940778908724, + "language_loss": 0.71994227, + "learning_rate": 2.335485529281996e-06, + "loss": 0.79715955, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7700, + "time_per_iteration": 2.5155210494995117 + }, + { + "auxiliary_loss_clip": 0.06446292, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.0628306, + "balance_loss_mlp": 0.01258608, + "epoch": 0.4630091687960319, + "flos": 18840178005120.0, + "grad_norm": 2.0219592023308297, + "language_loss": 0.72735655, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.80453324, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12780762, + "step": 7701, + "time_per_iteration": 2.5208041667938232 + }, + { + "auxiliary_loss_clip": 0.06455772, + "auxiliary_loss_mlp": 0.01272275, + "balance_loss_clip": 0.06285252, + "balance_loss_mlp": 0.01258768, + "epoch": 0.46306929204869984, + "flos": 38915733882240.0, + "grad_norm": 1.8677153728043454, + "language_loss": 0.64857763, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.72585809, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13519287, + "step": 7702, + "time_per_iteration": 2.6274476051330566 + }, + { + "auxiliary_loss_clip": 0.06443912, + "auxiliary_loss_mlp": 0.01267806, + "balance_loss_clip": 0.06281176, + "balance_loss_mlp": 0.01255181, + "epoch": 0.4631294153013678, + "flos": 19649945462400.0, + "grad_norm": 1.8702283374659314, + "language_loss": 0.73327863, + "learning_rate": 2.33433364213785e-06, + "loss": 0.81039578, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12640381, + "step": 7703, + "time_per_iteration": 2.505009651184082 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272434, + "balance_loss_clip": 0.0628607, + "balance_loss_mlp": 0.0125776, + "epoch": 0.4631895385540358, + "flos": 24615187655040.0, + "grad_norm": 1.7291559958554978, + "language_loss": 0.68770319, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.76499313, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14666748, + "step": 7704, + "time_per_iteration": 2.5337138175964355 + }, + { + "auxiliary_loss_clip": 0.06456052, + "auxiliary_loss_mlp": 0.01269056, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255693, + "epoch": 0.46324966180670374, + "flos": 26326838534400.0, + "grad_norm": 2.021774763699282, + "language_loss": 0.81483209, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.89208323, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13378906, + "step": 7705, + "time_per_iteration": 2.612663745880127 + }, + { + "auxiliary_loss_clip": 0.06459744, + "auxiliary_loss_mlp": 0.01269987, + "balance_loss_clip": 0.06288762, + "balance_loss_mlp": 0.01256313, + "epoch": 0.4633097850593717, + "flos": 19245816420480.0, + "grad_norm": 1.7146225700720175, + "language_loss": 0.77885628, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.85615361, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13684082, + "step": 7706, + "time_per_iteration": 2.508925437927246 + }, + { + "auxiliary_loss_clip": 0.06446654, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01256254, + "epoch": 0.46336990831203967, + "flos": 22789660677120.0, + "grad_norm": 1.8229249281456994, + "language_loss": 0.70008546, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.77725136, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13671875, + "step": 7707, + "time_per_iteration": 2.5517148971557617 + }, + { + "auxiliary_loss_clip": 0.06460145, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06290638, + "balance_loss_mlp": 0.01255716, + "epoch": 0.46343003156470763, + "flos": 38218668566400.0, + "grad_norm": 2.701141573629833, + "language_loss": 0.61044616, + "learning_rate": 2.332413576865791e-06, + "loss": 0.68774569, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14093018, + "step": 7708, + "time_per_iteration": 2.6566975116729736 + }, + { + "auxiliary_loss_clip": 0.06457859, + "auxiliary_loss_mlp": 0.01269726, + "balance_loss_clip": 0.06291145, + "balance_loss_mlp": 0.01255946, + "epoch": 0.4634901548173756, + "flos": 31946156098560.0, + "grad_norm": 2.0418964495503125, + "language_loss": 0.77915132, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.85642713, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13781738, + "step": 7709, + "time_per_iteration": 2.6596858501434326 + }, + { + "auxiliary_loss_clip": 0.06459823, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06291819, + "balance_loss_mlp": 0.01256756, + "epoch": 0.46355027807004356, + "flos": 20088469405440.0, + "grad_norm": 1.5745013311626586, + "language_loss": 0.77581245, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.85312593, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.14764404, + "step": 7710, + "time_per_iteration": 2.5101842880249023 + }, + { + "auxiliary_loss_clip": 0.06457606, + "auxiliary_loss_mlp": 0.01274408, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01260151, + "epoch": 0.4636104013227116, + "flos": 24068280055680.0, + "grad_norm": 2.3601088939338086, + "language_loss": 0.73606086, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.81338096, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14257812, + "step": 7711, + "time_per_iteration": 2.590855598449707 + }, + { + "auxiliary_loss_clip": 0.06459524, + "auxiliary_loss_mlp": 0.01272046, + "balance_loss_clip": 0.06293879, + "balance_loss_mlp": 0.01257354, + "epoch": 0.46367052457537955, + "flos": 23921392648320.0, + "grad_norm": 1.4235356855228358, + "language_loss": 0.71632046, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7936362, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14685059, + "step": 7712, + "time_per_iteration": 2.524653434753418 + }, + { + "auxiliary_loss_clip": 0.06464949, + "auxiliary_loss_mlp": 0.01272658, + "balance_loss_clip": 0.06290694, + "balance_loss_mlp": 0.01257709, + "epoch": 0.4637306478280475, + "flos": 26403846036480.0, + "grad_norm": 2.2505033505731493, + "language_loss": 0.73737693, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.81475306, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14941406, + "step": 7713, + "time_per_iteration": 2.5624618530273438 + }, + { + "auxiliary_loss_clip": 0.06466722, + "auxiliary_loss_mlp": 0.01276857, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01261372, + "epoch": 0.4637907710807155, + "flos": 21987104670720.0, + "grad_norm": 1.4954624193011212, + "language_loss": 0.58918363, + "learning_rate": 2.3301090827294e-06, + "loss": 0.66661942, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15466309, + "step": 7714, + "time_per_iteration": 2.510551929473877 + }, + { + "auxiliary_loss_clip": 0.06456332, + "auxiliary_loss_mlp": 0.01271959, + "balance_loss_clip": 0.06290398, + "balance_loss_mlp": 0.01257427, + "epoch": 0.46385089433338345, + "flos": 12427234894080.0, + "grad_norm": 2.7033660685293186, + "language_loss": 0.70470357, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.78198647, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14538574, + "step": 7715, + "time_per_iteration": 2.533158779144287 + }, + { + "auxiliary_loss_clip": 0.06470867, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06294338, + "balance_loss_mlp": 0.01255731, + "epoch": 0.4639110175860514, + "flos": 23922692386560.0, + "grad_norm": 1.7790063066577455, + "language_loss": 0.68472731, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.762137, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14355469, + "step": 7716, + "time_per_iteration": 4.020689249038696 + }, + { + "auxiliary_loss_clip": 0.06459275, + "auxiliary_loss_mlp": 0.01270908, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01255858, + "epoch": 0.4639711408387194, + "flos": 25307263653120.0, + "grad_norm": 1.603260424737227, + "language_loss": 0.81029081, + "learning_rate": 2.328956666474691e-06, + "loss": 0.88759267, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1505127, + "step": 7717, + "time_per_iteration": 3.932593584060669 + }, + { + "auxiliary_loss_clip": 0.06454346, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06284629, + "balance_loss_mlp": 0.01258127, + "epoch": 0.46403126409138734, + "flos": 21217643827200.0, + "grad_norm": 1.6983648240686933, + "language_loss": 0.73560178, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.81287599, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14929199, + "step": 7718, + "time_per_iteration": 2.567814350128174 + }, + { + "auxiliary_loss_clip": 0.06461985, + "auxiliary_loss_mlp": 0.0127191, + "balance_loss_clip": 0.06294554, + "balance_loss_mlp": 0.01257384, + "epoch": 0.4640913873440553, + "flos": 35854325907840.0, + "grad_norm": 1.9528130818693374, + "language_loss": 0.70908272, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.78642172, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14526367, + "step": 7719, + "time_per_iteration": 2.6412456035614014 + }, + { + "auxiliary_loss_clip": 0.06458225, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06287955, + "balance_loss_mlp": 0.01258793, + "epoch": 0.46415151059672327, + "flos": 19171282613760.0, + "grad_norm": 2.2400961683609473, + "language_loss": 0.86823237, + "learning_rate": 2.327804137953357e-06, + "loss": 0.94553995, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13745117, + "step": 7720, + "time_per_iteration": 2.5479180812835693 + }, + { + "auxiliary_loss_clip": 0.06346954, + "auxiliary_loss_mlp": 0.01257869, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.01255387, + "epoch": 0.46421163384939124, + "flos": 58932841207680.0, + "grad_norm": 0.7060507258277461, + "language_loss": 0.54935473, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.62540293, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02481079, + "step": 7721, + "time_per_iteration": 3.185922861099243 + }, + { + "auxiliary_loss_clip": 0.06453753, + "auxiliary_loss_mlp": 0.01271222, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01257227, + "epoch": 0.4642717571020592, + "flos": 20163590190720.0, + "grad_norm": 1.901448408880664, + "language_loss": 0.80108112, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.87833083, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13995361, + "step": 7722, + "time_per_iteration": 2.524707317352295 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06282455, + "balance_loss_mlp": 0.0125627, + "epoch": 0.46433188035472717, + "flos": 25053208473600.0, + "grad_norm": 1.90118065677523, + "language_loss": 0.78278601, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.86003315, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1427002, + "step": 7723, + "time_per_iteration": 3.9820849895477295 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01267351, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01253046, + "epoch": 0.4643920036073952, + "flos": 28083366074880.0, + "grad_norm": 1.6378874340525207, + "language_loss": 0.68861282, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7657671, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14306641, + "step": 7724, + "time_per_iteration": 2.550832748413086 + }, + { + "auxiliary_loss_clip": 0.06449208, + "auxiliary_loss_mlp": 0.01272875, + "balance_loss_clip": 0.06283656, + "balance_loss_mlp": 0.01259297, + "epoch": 0.46445212686006315, + "flos": 18375267225600.0, + "grad_norm": 2.354559005563411, + "language_loss": 0.67722934, + "learning_rate": 2.325883008671415e-06, + "loss": 0.7544502, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13592529, + "step": 7725, + "time_per_iteration": 2.534698009490967 + }, + { + "auxiliary_loss_clip": 0.0644237, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01258108, + "epoch": 0.4645122501127311, + "flos": 31729514567040.0, + "grad_norm": 1.5959059771038482, + "language_loss": 0.65303701, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.73016763, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12585449, + "step": 7726, + "time_per_iteration": 2.6071393489837646 + }, + { + "auxiliary_loss_clip": 0.06452325, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06286149, + "balance_loss_mlp": 0.01255312, + "epoch": 0.4645723733653991, + "flos": 23775553416960.0, + "grad_norm": 2.198219591713496, + "language_loss": 0.75535023, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.83256185, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13525391, + "step": 7727, + "time_per_iteration": 2.5323383808135986 + }, + { + "auxiliary_loss_clip": 0.06449004, + "auxiliary_loss_mlp": 0.01272292, + "balance_loss_clip": 0.06281407, + "balance_loss_mlp": 0.01258166, + "epoch": 0.46463249661806705, + "flos": 33153805468800.0, + "grad_norm": 1.912145195790545, + "language_loss": 0.78694946, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.86416245, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14147949, + "step": 7728, + "time_per_iteration": 3.998812437057495 + }, + { + "auxiliary_loss_clip": 0.06450211, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06282613, + "balance_loss_mlp": 0.0125658, + "epoch": 0.464692619870735, + "flos": 18301865448960.0, + "grad_norm": 2.3670866338465295, + "language_loss": 0.76134968, + "learning_rate": 2.324345882723155e-06, + "loss": 0.83855414, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13659668, + "step": 7729, + "time_per_iteration": 2.459913730621338 + }, + { + "auxiliary_loss_clip": 0.06449223, + "auxiliary_loss_mlp": 0.01270726, + "balance_loss_clip": 0.06283462, + "balance_loss_mlp": 0.01257339, + "epoch": 0.464752743123403, + "flos": 22644659986560.0, + "grad_norm": 1.7402612149106196, + "language_loss": 0.80316758, + "learning_rate": 2.323961570451588e-06, + "loss": 0.88036704, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13378906, + "step": 7730, + "time_per_iteration": 2.5472798347473145 + }, + { + "auxiliary_loss_clip": 0.06447513, + "auxiliary_loss_mlp": 0.01272657, + "balance_loss_clip": 0.06282953, + "balance_loss_mlp": 0.01258924, + "epoch": 0.46481286637607094, + "flos": 20418316202880.0, + "grad_norm": 1.544685409716396, + "language_loss": 0.77440143, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.85160315, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13726807, + "step": 7731, + "time_per_iteration": 2.539971351623535 + }, + { + "auxiliary_loss_clip": 0.06444095, + "auxiliary_loss_mlp": 0.01267001, + "balance_loss_clip": 0.06280014, + "balance_loss_mlp": 0.01253984, + "epoch": 0.4648729896287389, + "flos": 34283692650240.0, + "grad_norm": 1.8393249998070078, + "language_loss": 0.66022158, + "learning_rate": 2.323192909069061e-06, + "loss": 0.73733258, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13006592, + "step": 7732, + "time_per_iteration": 2.6860389709472656 + }, + { + "auxiliary_loss_clip": 0.0645274, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01254474, + "epoch": 0.4649331128814069, + "flos": 21327704565120.0, + "grad_norm": 2.1920635353287157, + "language_loss": 0.73225021, + "learning_rate": 2.32280855998725e-06, + "loss": 0.8094635, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14123535, + "step": 7733, + "time_per_iteration": 2.4875564575195312 + }, + { + "auxiliary_loss_clip": 0.06338679, + "auxiliary_loss_mlp": 0.01252754, + "balance_loss_clip": 0.0626616, + "balance_loss_mlp": 0.0124981, + "epoch": 0.46499323613407484, + "flos": 58325082744960.0, + "grad_norm": 1.3051386869973822, + "language_loss": 0.52022988, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.5961442, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02941895, + "step": 7734, + "time_per_iteration": 3.0869898796081543 + }, + { + "auxiliary_loss_clip": 0.0644846, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06283916, + "balance_loss_mlp": 0.01257856, + "epoch": 0.4650533593867428, + "flos": 10894308773760.0, + "grad_norm": 2.170877243914886, + "language_loss": 0.75776118, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.83495891, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13464355, + "step": 7735, + "time_per_iteration": 2.478837490081787 + }, + { + "auxiliary_loss_clip": 0.06441534, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.0627993, + "balance_loss_mlp": 0.01255009, + "epoch": 0.46511348263941077, + "flos": 19980756581760.0, + "grad_norm": 2.0032469234086507, + "language_loss": 0.6994068, + "learning_rate": 2.321655439354519e-06, + "loss": 0.77650702, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13482666, + "step": 7736, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.06442849, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.0628303, + "balance_loss_mlp": 0.01256237, + "epoch": 0.46517360589207873, + "flos": 19683795312000.0, + "grad_norm": 1.6634794649969447, + "language_loss": 0.72674608, + "learning_rate": 2.321271041396427e-06, + "loss": 0.80385697, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12005615, + "step": 7737, + "time_per_iteration": 2.5038952827453613 + }, + { + "auxiliary_loss_clip": 0.06449911, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01254603, + "epoch": 0.46523372914474675, + "flos": 16878203452800.0, + "grad_norm": 1.9711860161800356, + "language_loss": 0.84095049, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.91813183, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1361084, + "step": 7738, + "time_per_iteration": 2.5216240882873535 + }, + { + "auxiliary_loss_clip": 0.06338458, + "auxiliary_loss_mlp": 0.01253722, + "balance_loss_clip": 0.06265976, + "balance_loss_mlp": 0.01250617, + "epoch": 0.4652938523974147, + "flos": 53458188917760.0, + "grad_norm": 0.7399188166866549, + "language_loss": 0.57646966, + "learning_rate": 2.320502208946932e-06, + "loss": 0.65239149, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.03102112, + "step": 7739, + "time_per_iteration": 3.215662717819214 + }, + { + "auxiliary_loss_clip": 0.06450304, + "auxiliary_loss_mlp": 0.01271295, + "balance_loss_clip": 0.06285876, + "balance_loss_mlp": 0.01257299, + "epoch": 0.4653539756500827, + "flos": 15236642113920.0, + "grad_norm": 1.7449085109148506, + "language_loss": 0.85184145, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.92905748, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14013672, + "step": 7740, + "time_per_iteration": 2.4736168384552 + }, + { + "auxiliary_loss_clip": 0.0644415, + "auxiliary_loss_mlp": 0.01270653, + "balance_loss_clip": 0.06281894, + "balance_loss_mlp": 0.01256706, + "epoch": 0.46541409890275065, + "flos": 23738978309760.0, + "grad_norm": 1.5125636475233326, + "language_loss": 0.76338875, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.84053683, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1394043, + "step": 7741, + "time_per_iteration": 2.56061053276062 + }, + { + "auxiliary_loss_clip": 0.06456167, + "auxiliary_loss_mlp": 0.01268672, + "balance_loss_clip": 0.06284943, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4654742221554186, + "flos": 20853150566400.0, + "grad_norm": 1.6688490987186926, + "language_loss": 0.81291914, + "learning_rate": 2.319348869158064e-06, + "loss": 0.89016759, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13812256, + "step": 7742, + "time_per_iteration": 2.5372226238250732 + }, + { + "auxiliary_loss_clip": 0.06456183, + "auxiliary_loss_mlp": 0.01268485, + "balance_loss_clip": 0.06287557, + "balance_loss_mlp": 0.01254264, + "epoch": 0.4655343454080866, + "flos": 20711210549760.0, + "grad_norm": 1.6329017257985423, + "language_loss": 0.72620338, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.80345011, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14227295, + "step": 7743, + "time_per_iteration": 2.561323404312134 + }, + { + "auxiliary_loss_clip": 0.0644543, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06280947, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46559446866075455, + "flos": 18995912017920.0, + "grad_norm": 1.7294678893011792, + "language_loss": 0.71235406, + "learning_rate": 2.318579915392483e-06, + "loss": 0.78950727, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13842773, + "step": 7744, + "time_per_iteration": 2.491428852081299 + }, + { + "auxiliary_loss_clip": 0.06446386, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01252513, + "epoch": 0.4656545919134225, + "flos": 34505030010240.0, + "grad_norm": 1.6678897715471863, + "language_loss": 0.84893715, + "learning_rate": 2.31819542038153e-06, + "loss": 0.92605066, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12451172, + "step": 7745, + "time_per_iteration": 2.759547233581543 + }, + { + "auxiliary_loss_clip": 0.064444, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01255824, + "epoch": 0.4657147151660905, + "flos": 24316465449600.0, + "grad_norm": 1.3285756054685907, + "language_loss": 0.73465878, + "learning_rate": 2.317810913304574e-06, + "loss": 0.81178808, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.12701416, + "step": 7746, + "time_per_iteration": 2.5268633365631104 + }, + { + "auxiliary_loss_clip": 0.064431, + "auxiliary_loss_mlp": 0.01272209, + "balance_loss_clip": 0.06282558, + "balance_loss_mlp": 0.0125931, + "epoch": 0.46577483841875844, + "flos": 58807743390720.0, + "grad_norm": 1.6027404056917662, + "language_loss": 0.69721079, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.77436388, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12896729, + "step": 7747, + "time_per_iteration": 2.8772974014282227 + }, + { + "auxiliary_loss_clip": 0.06441785, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06279266, + "balance_loss_mlp": 0.01255631, + "epoch": 0.4658349616714264, + "flos": 31330081353600.0, + "grad_norm": 1.8250767057505617, + "language_loss": 0.68153578, + "learning_rate": 2.317041863010978e-06, + "loss": 0.75864553, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13543701, + "step": 7748, + "time_per_iteration": 2.576828956604004 + }, + { + "auxiliary_loss_clip": 0.06449303, + "auxiliary_loss_mlp": 0.01269068, + "balance_loss_clip": 0.06280029, + "balance_loss_mlp": 0.01254768, + "epoch": 0.46589508492409437, + "flos": 14864601985920.0, + "grad_norm": 2.1691376792383554, + "language_loss": 0.64591479, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.72309858, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14306641, + "step": 7749, + "time_per_iteration": 2.5408928394317627 + }, + { + "auxiliary_loss_clip": 0.06452534, + "auxiliary_loss_mlp": 0.01273929, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.01258795, + "epoch": 0.46595520817676234, + "flos": 12900908424960.0, + "grad_norm": 2.0171049134441237, + "language_loss": 0.74442625, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.82169086, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.15142822, + "step": 7750, + "time_per_iteration": 2.4698846340179443 + }, + { + "auxiliary_loss_clip": 0.06444734, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06276895, + "balance_loss_mlp": 0.01255811, + "epoch": 0.46601533142943036, + "flos": 32862504349440.0, + "grad_norm": 1.8980956421649817, + "language_loss": 0.7426213, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.81977308, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14624023, + "step": 7751, + "time_per_iteration": 2.6534221172332764 + }, + { + "auxiliary_loss_clip": 0.06449904, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06280084, + "balance_loss_mlp": 0.01253017, + "epoch": 0.4660754546820983, + "flos": 19972496954880.0, + "grad_norm": 1.7579709538150943, + "language_loss": 0.73910719, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.81627846, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14202881, + "step": 7752, + "time_per_iteration": 2.474492311477661 + }, + { + "auxiliary_loss_clip": 0.06447943, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06279718, + "balance_loss_mlp": 0.01254578, + "epoch": 0.4661355779347663, + "flos": 26695482572160.0, + "grad_norm": 2.190938043745359, + "language_loss": 0.69726032, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7744258, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14038086, + "step": 7753, + "time_per_iteration": 2.604612350463867 + }, + { + "auxiliary_loss_clip": 0.06438763, + "auxiliary_loss_mlp": 0.01269724, + "balance_loss_clip": 0.0627787, + "balance_loss_mlp": 0.01256777, + "epoch": 0.46619570118743425, + "flos": 20965726926720.0, + "grad_norm": 1.7706266197381177, + "language_loss": 0.73293746, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.81002235, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7754, + "time_per_iteration": 2.491225242614746 + }, + { + "auxiliary_loss_clip": 0.06444383, + "auxiliary_loss_mlp": 0.01271714, + "balance_loss_clip": 0.06278208, + "balance_loss_mlp": 0.01256855, + "epoch": 0.4662558244401022, + "flos": 24433024878720.0, + "grad_norm": 1.5728879839910523, + "language_loss": 0.79001075, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.8671717, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14868164, + "step": 7755, + "time_per_iteration": 2.562178134918213 + }, + { + "auxiliary_loss_clip": 0.06436031, + "auxiliary_loss_mlp": 0.01269294, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01256181, + "epoch": 0.4663159476927702, + "flos": 20601820644480.0, + "grad_norm": 1.5633103047544015, + "language_loss": 0.72593671, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.80299002, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13116455, + "step": 7756, + "time_per_iteration": 4.01608943939209 + }, + { + "auxiliary_loss_clip": 0.06436817, + "auxiliary_loss_mlp": 0.01269611, + "balance_loss_clip": 0.06276436, + "balance_loss_mlp": 0.01256897, + "epoch": 0.46637607094543815, + "flos": 25668235042560.0, + "grad_norm": 1.701604485790762, + "language_loss": 0.7836898, + "learning_rate": 2.313580543272274e-06, + "loss": 0.86075413, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12719727, + "step": 7757, + "time_per_iteration": 2.555097818374634 + }, + { + "auxiliary_loss_clip": 0.06441291, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06277295, + "balance_loss_mlp": 0.01261123, + "epoch": 0.4664361941981061, + "flos": 24279722634240.0, + "grad_norm": 1.9711907960618857, + "language_loss": 0.66213286, + "learning_rate": 2.313195892540705e-06, + "loss": 0.73928982, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13275146, + "step": 7758, + "time_per_iteration": 2.569962739944458 + }, + { + "auxiliary_loss_clip": 0.06442615, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.0627957, + "balance_loss_mlp": 0.01260629, + "epoch": 0.4664963174507741, + "flos": 18411800405760.0, + "grad_norm": 1.9738824417509344, + "language_loss": 0.74950838, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.826666, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12518311, + "step": 7759, + "time_per_iteration": 2.47729229927063 + }, + { + "auxiliary_loss_clip": 0.06440781, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06281125, + "balance_loss_mlp": 0.01259827, + "epoch": 0.46655644070344204, + "flos": 22461616742400.0, + "grad_norm": 3.1770723580201103, + "language_loss": 0.77710176, + "learning_rate": 2.312426555462893e-06, + "loss": 0.85423636, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12860107, + "step": 7760, + "time_per_iteration": 2.555143117904663 + }, + { + "auxiliary_loss_clip": 0.06438316, + "auxiliary_loss_mlp": 0.01270754, + "balance_loss_clip": 0.06279285, + "balance_loss_mlp": 0.01256675, + "epoch": 0.46661656395611, + "flos": 13813525169280.0, + "grad_norm": 1.6658245877843647, + "language_loss": 0.7447418, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.82183254, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.14099121, + "step": 7761, + "time_per_iteration": 2.493032217025757 + }, + { + "auxiliary_loss_clip": 0.06446707, + "auxiliary_loss_mlp": 0.01275728, + "balance_loss_clip": 0.06281132, + "balance_loss_mlp": 0.0126094, + "epoch": 0.466676687208778, + "flos": 21658473757440.0, + "grad_norm": 1.6817719059657052, + "language_loss": 0.78770381, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.86492819, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14788818, + "step": 7762, + "time_per_iteration": 2.5613081455230713 + }, + { + "auxiliary_loss_clip": 0.06338885, + "auxiliary_loss_mlp": 0.01268455, + "balance_loss_clip": 0.06268312, + "balance_loss_mlp": 0.01265552, + "epoch": 0.46673681046144594, + "flos": 68554163554560.0, + "grad_norm": 0.7818830178478652, + "language_loss": 0.59643799, + "learning_rate": 2.311272461028297e-06, + "loss": 0.67251134, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.0289917, + "step": 7763, + "time_per_iteration": 4.584456443786621 + }, + { + "auxiliary_loss_clip": 0.06446124, + "auxiliary_loss_mlp": 0.01269966, + "balance_loss_clip": 0.06278878, + "balance_loss_mlp": 0.01255559, + "epoch": 0.46679693371411396, + "flos": 15819789404160.0, + "grad_norm": 1.948864663001373, + "language_loss": 0.79278809, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.86994898, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14398193, + "step": 7764, + "time_per_iteration": 2.465179920196533 + }, + { + "auxiliary_loss_clip": 0.06441632, + "auxiliary_loss_mlp": 0.01267635, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01255035, + "epoch": 0.4668570569667819, + "flos": 18520393697280.0, + "grad_norm": 2.0437394229584123, + "language_loss": 0.72096646, + "learning_rate": 2.310503005696839e-06, + "loss": 0.79805923, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.12597656, + "step": 7765, + "time_per_iteration": 2.5701630115509033 + }, + { + "auxiliary_loss_clip": 0.06443523, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06278671, + "balance_loss_mlp": 0.01258141, + "epoch": 0.4669171802194499, + "flos": 19212385841280.0, + "grad_norm": 2.21059711365052, + "language_loss": 0.77947736, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.85663396, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.14001465, + "step": 7766, + "time_per_iteration": 2.481160879135132 + }, + { + "auxiliary_loss_clip": 0.06441876, + "auxiliary_loss_mlp": 0.01272138, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01258489, + "epoch": 0.46697730347211786, + "flos": 12281018319360.0, + "grad_norm": 2.232432946710323, + "language_loss": 0.65461195, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.73175204, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13653564, + "step": 7767, + "time_per_iteration": 2.5368387699127197 + }, + { + "auxiliary_loss_clip": 0.06442834, + "auxiliary_loss_mlp": 0.01272968, + "balance_loss_clip": 0.06280966, + "balance_loss_mlp": 0.01259307, + "epoch": 0.4670374267247858, + "flos": 23593516421760.0, + "grad_norm": 2.313152144280668, + "language_loss": 0.75071919, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.82787716, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13677979, + "step": 7768, + "time_per_iteration": 3.9271702766418457 + }, + { + "auxiliary_loss_clip": 0.06441817, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06279824, + "balance_loss_mlp": 0.01253697, + "epoch": 0.4670975499774538, + "flos": 15995495416320.0, + "grad_norm": 1.5695198160982793, + "language_loss": 0.71176434, + "learning_rate": 2.308963953858982e-06, + "loss": 0.7888546, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1350708, + "step": 7769, + "time_per_iteration": 2.5253636837005615 + }, + { + "auxiliary_loss_clip": 0.06441696, + "auxiliary_loss_mlp": 0.01271746, + "balance_loss_clip": 0.06279374, + "balance_loss_mlp": 0.01258305, + "epoch": 0.46715767323012175, + "flos": 15383026396800.0, + "grad_norm": 1.8223238330296296, + "language_loss": 0.81503379, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.89216816, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13446045, + "step": 7770, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.06346406, + "auxiliary_loss_mlp": 0.01251242, + "balance_loss_clip": 0.06275694, + "balance_loss_mlp": 0.01249068, + "epoch": 0.4672177964827897, + "flos": 60270774877440.0, + "grad_norm": 0.8490857527823061, + "language_loss": 0.55591935, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.63189584, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.02177429, + "step": 7771, + "time_per_iteration": 3.1719799041748047 + }, + { + "auxiliary_loss_clip": 0.064445, + "auxiliary_loss_mlp": 0.01269252, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01256234, + "epoch": 0.4672779197354577, + "flos": 27643500466560.0, + "grad_norm": 2.2149063838305363, + "language_loss": 0.65989488, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.73703241, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13024902, + "step": 7772, + "time_per_iteration": 2.616668939590454 + }, + { + "auxiliary_loss_clip": 0.06441614, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01257009, + "epoch": 0.46733804298812565, + "flos": 31402267246080.0, + "grad_norm": 2.671628135597842, + "language_loss": 0.64495057, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.72206295, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1260376, + "step": 7773, + "time_per_iteration": 2.5923900604248047 + }, + { + "auxiliary_loss_clip": 0.06442621, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01256457, + "epoch": 0.4673981662407936, + "flos": 19506747634560.0, + "grad_norm": 1.7164237292195044, + "language_loss": 0.80045915, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.87758458, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13464355, + "step": 7774, + "time_per_iteration": 2.577458620071411 + }, + { + "auxiliary_loss_clip": 0.06444994, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01254583, + "epoch": 0.4674582894934616, + "flos": 20528083451520.0, + "grad_norm": 1.5985457295090966, + "language_loss": 0.78042519, + "learning_rate": 2.306655024915726e-06, + "loss": 0.85755515, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13439941, + "step": 7775, + "time_per_iteration": 2.5538787841796875 + }, + { + "auxiliary_loss_clip": 0.06442325, + "auxiliary_loss_mlp": 0.0127297, + "balance_loss_clip": 0.06282222, + "balance_loss_mlp": 0.01259988, + "epoch": 0.46751841274612954, + "flos": 22097500824960.0, + "grad_norm": 1.8860444903676625, + "language_loss": 0.69909471, + "learning_rate": 2.306270162640694e-06, + "loss": 0.77624762, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12963867, + "step": 7776, + "time_per_iteration": 2.561692237854004 + }, + { + "auxiliary_loss_clip": 0.0644502, + "auxiliary_loss_mlp": 0.01270071, + "balance_loss_clip": 0.06284119, + "balance_loss_mlp": 0.01257244, + "epoch": 0.46757853599879756, + "flos": 26987454524160.0, + "grad_norm": 1.3861659298765134, + "language_loss": 0.74096608, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.81811702, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1282959, + "step": 7777, + "time_per_iteration": 2.536015510559082 + }, + { + "auxiliary_loss_clip": 0.06447745, + "auxiliary_loss_mlp": 0.01270612, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256921, + "epoch": 0.4676386592514655, + "flos": 24140927145600.0, + "grad_norm": 1.9470179218555579, + "language_loss": 0.69820189, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.77538544, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13690186, + "step": 7778, + "time_per_iteration": 2.548154354095459 + }, + { + "auxiliary_loss_clip": 0.06447626, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01253513, + "epoch": 0.4676987825041335, + "flos": 25490768094720.0, + "grad_norm": 1.4247023457023664, + "language_loss": 0.73440385, + "learning_rate": 2.305115506191206e-06, + "loss": 0.81155688, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14160156, + "step": 7779, + "time_per_iteration": 2.5291388034820557 + }, + { + "auxiliary_loss_clip": 0.06443821, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06285408, + "balance_loss_mlp": 0.01253379, + "epoch": 0.46775890575680146, + "flos": 21951871228800.0, + "grad_norm": 1.9613896423037807, + "language_loss": 0.72685552, + "learning_rate": 2.304730597548562e-06, + "loss": 0.80395079, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12353516, + "step": 7780, + "time_per_iteration": 2.5508480072021484 + }, + { + "auxiliary_loss_clip": 0.06447856, + "auxiliary_loss_mlp": 0.01269851, + "balance_loss_clip": 0.06280719, + "balance_loss_mlp": 0.01256273, + "epoch": 0.4678190290094694, + "flos": 25235413176960.0, + "grad_norm": 1.8471847442174032, + "language_loss": 0.74638426, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.82356131, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.13586426, + "step": 7781, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.06446712, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.0125528, + "epoch": 0.4678791522621374, + "flos": 32276254458240.0, + "grad_norm": 1.845752858447898, + "language_loss": 0.63050562, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.70766628, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.140625, + "step": 7782, + "time_per_iteration": 2.650505304336548 + }, + { + "auxiliary_loss_clip": 0.06445308, + "auxiliary_loss_mlp": 0.01268795, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.01255306, + "epoch": 0.46793927551480535, + "flos": 27052764382080.0, + "grad_norm": 2.229893941722145, + "language_loss": 0.63585413, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.71299517, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13494873, + "step": 7783, + "time_per_iteration": 2.5537588596343994 + }, + { + "auxiliary_loss_clip": 0.0645118, + "auxiliary_loss_mlp": 0.01271407, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257245, + "epoch": 0.4679993987674733, + "flos": 17463195532800.0, + "grad_norm": 2.4083561383098004, + "language_loss": 0.68662858, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7638545, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1418457, + "step": 7784, + "time_per_iteration": 2.560459613800049 + }, + { + "auxiliary_loss_clip": 0.06438549, + "auxiliary_loss_mlp": 0.01266567, + "balance_loss_clip": 0.06278238, + "balance_loss_mlp": 0.01254003, + "epoch": 0.4680595220201413, + "flos": 17170804310400.0, + "grad_norm": 1.9765250646873525, + "language_loss": 0.84616911, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.92322016, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12567139, + "step": 7785, + "time_per_iteration": 2.5567643642425537 + }, + { + "auxiliary_loss_clip": 0.06444662, + "auxiliary_loss_mlp": 0.01268089, + "balance_loss_clip": 0.06281722, + "balance_loss_mlp": 0.01254225, + "epoch": 0.46811964527280925, + "flos": 11332329592320.0, + "grad_norm": 1.9719414675879272, + "language_loss": 0.77991092, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.85703844, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13867188, + "step": 7786, + "time_per_iteration": 2.507206439971924 + }, + { + "auxiliary_loss_clip": 0.06440122, + "auxiliary_loss_mlp": 0.01265794, + "balance_loss_clip": 0.06281641, + "balance_loss_mlp": 0.01253897, + "epoch": 0.4681797685254772, + "flos": 24285508565760.0, + "grad_norm": 2.2497529795631817, + "language_loss": 0.74387538, + "learning_rate": 2.302035914315856e-06, + "loss": 0.82093459, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.11901855, + "step": 7787, + "time_per_iteration": 2.498021125793457 + }, + { + "auxiliary_loss_clip": 0.06439888, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06278901, + "balance_loss_mlp": 0.01258785, + "epoch": 0.4682398917781452, + "flos": 31658544558720.0, + "grad_norm": 1.7533783368280031, + "language_loss": 0.66132212, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.73844731, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1383667, + "step": 7788, + "time_per_iteration": 2.650092363357544 + }, + { + "auxiliary_loss_clip": 0.06441839, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46830001503081314, + "flos": 28118264100480.0, + "grad_norm": 1.5278727961877703, + "language_loss": 0.64315766, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.72025621, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.11987305, + "step": 7789, + "time_per_iteration": 2.5806198120117188 + }, + { + "auxiliary_loss_clip": 0.06338993, + "auxiliary_loss_mlp": 0.01252338, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01250063, + "epoch": 0.4683601382834811, + "flos": 57900059308800.0, + "grad_norm": 0.6904155708009142, + "language_loss": 0.61868596, + "learning_rate": 2.300880877982825e-06, + "loss": 0.69459921, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.02276611, + "step": 7790, + "time_per_iteration": 3.2271504402160645 + }, + { + "auxiliary_loss_clip": 0.06442016, + "auxiliary_loss_mlp": 0.01269711, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01257111, + "epoch": 0.46842026153614913, + "flos": 21878427525120.0, + "grad_norm": 1.6377280327187325, + "language_loss": 0.79426539, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.87138271, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12597656, + "step": 7791, + "time_per_iteration": 2.490171194076538 + }, + { + "auxiliary_loss_clip": 0.06441824, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01256899, + "epoch": 0.4684803847888171, + "flos": 24907914293760.0, + "grad_norm": 1.496703208223837, + "language_loss": 0.74930024, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.82641351, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.12573242, + "step": 7792, + "time_per_iteration": 2.5588057041168213 + }, + { + "auxiliary_loss_clip": 0.0643919, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06282478, + "balance_loss_mlp": 0.01255972, + "epoch": 0.46854050804148506, + "flos": 26259138835200.0, + "grad_norm": 1.9488467409065784, + "language_loss": 0.68353844, + "learning_rate": 2.299725738964898e-06, + "loss": 0.76060808, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.11804199, + "step": 7793, + "time_per_iteration": 2.543156147003174 + }, + { + "auxiliary_loss_clip": 0.06441274, + "auxiliary_loss_mlp": 0.01273582, + "balance_loss_clip": 0.0628298, + "balance_loss_mlp": 0.01261387, + "epoch": 0.468600631294153, + "flos": 21586204010880.0, + "grad_norm": 1.8535654365133143, + "language_loss": 0.74367434, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.82082289, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.12194824, + "step": 7794, + "time_per_iteration": 2.6082603931427 + }, + { + "auxiliary_loss_clip": 0.06445156, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255343, + "epoch": 0.468660754546821, + "flos": 25892842711680.0, + "grad_norm": 2.128212140250663, + "language_loss": 0.64027059, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.71741104, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13537598, + "step": 7795, + "time_per_iteration": 2.554871082305908 + }, + { + "auxiliary_loss_clip": 0.06440422, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06283793, + "balance_loss_mlp": 0.01253067, + "epoch": 0.46872087779948896, + "flos": 35482746977280.0, + "grad_norm": 1.4934025143707166, + "language_loss": 0.6791029, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7561695, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13171387, + "step": 7796, + "time_per_iteration": 4.070605754852295 + }, + { + "auxiliary_loss_clip": 0.06441301, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06280352, + "balance_loss_mlp": 0.0125435, + "epoch": 0.4687810010521569, + "flos": 26403720255360.0, + "grad_norm": 1.619506492510176, + "language_loss": 0.70710748, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.78419161, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12762451, + "step": 7797, + "time_per_iteration": 2.574291706085205 + }, + { + "auxiliary_loss_clip": 0.06443868, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.0628204, + "balance_loss_mlp": 0.01258472, + "epoch": 0.4688411243048249, + "flos": 19978618302720.0, + "grad_norm": 1.9026226114754317, + "language_loss": 0.67159688, + "learning_rate": 2.297800280150454e-06, + "loss": 0.74876028, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.14007568, + "step": 7798, + "time_per_iteration": 2.4703564643859863 + }, + { + "auxiliary_loss_clip": 0.06331287, + "auxiliary_loss_mlp": 0.01256102, + "balance_loss_clip": 0.06261373, + "balance_loss_mlp": 0.01253898, + "epoch": 0.46890124755749285, + "flos": 63996739983360.0, + "grad_norm": 0.926390069403038, + "language_loss": 0.64518279, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.7210567, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.02207947, + "step": 7799, + "time_per_iteration": 3.3128738403320312 + }, + { + "auxiliary_loss_clip": 0.06441961, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283548, + "balance_loss_mlp": 0.01258429, + "epoch": 0.4689613708101608, + "flos": 23775763052160.0, + "grad_norm": 1.2629628474735628, + "language_loss": 0.72331405, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.80045128, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13330078, + "step": 7800, + "time_per_iteration": 2.5339090824127197 + }, + { + "auxiliary_loss_clip": 0.06436972, + "auxiliary_loss_mlp": 0.01269738, + "balance_loss_clip": 0.06279731, + "balance_loss_mlp": 0.01257406, + "epoch": 0.4690214940628288, + "flos": 24795337933440.0, + "grad_norm": 2.7480307453946726, + "language_loss": 0.72682166, + "learning_rate": 2.296644869233568e-06, + "loss": 0.80388874, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12335205, + "step": 7801, + "time_per_iteration": 2.552154541015625 + }, + { + "auxiliary_loss_clip": 0.06449857, + "auxiliary_loss_mlp": 0.01274232, + "balance_loss_clip": 0.06283514, + "balance_loss_mlp": 0.01260094, + "epoch": 0.46908161731549675, + "flos": 18083169492480.0, + "grad_norm": 1.9453242658612842, + "language_loss": 0.62466741, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.70190829, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14135742, + "step": 7802, + "time_per_iteration": 3.9707396030426025 + }, + { + "auxiliary_loss_clip": 0.06437971, + "auxiliary_loss_mlp": 0.01270017, + "balance_loss_clip": 0.06277081, + "balance_loss_mlp": 0.01257459, + "epoch": 0.4691417405681647, + "flos": 25710554154240.0, + "grad_norm": 1.8844359624083942, + "language_loss": 0.73532665, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.81240654, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12554932, + "step": 7803, + "time_per_iteration": 2.554459810256958 + }, + { + "auxiliary_loss_clip": 0.06438211, + "auxiliary_loss_mlp": 0.01272362, + "balance_loss_clip": 0.06278156, + "balance_loss_mlp": 0.01259338, + "epoch": 0.46920186382083273, + "flos": 17462776262400.0, + "grad_norm": 1.58578754852504, + "language_loss": 0.77327907, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.85038471, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13012695, + "step": 7804, + "time_per_iteration": 2.543470621109009 + }, + { + "auxiliary_loss_clip": 0.06432682, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01254298, + "epoch": 0.4692619870735007, + "flos": 20345669112960.0, + "grad_norm": 1.787683586047485, + "language_loss": 0.77375299, + "learning_rate": 2.295104163929305e-06, + "loss": 0.8507452, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12231445, + "step": 7805, + "time_per_iteration": 2.501739740371704 + }, + { + "auxiliary_loss_clip": 0.0644381, + "auxiliary_loss_mlp": 0.01270681, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01257163, + "epoch": 0.46932211032616866, + "flos": 29504177032320.0, + "grad_norm": 1.522976757050157, + "language_loss": 0.83108258, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.90822744, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13519287, + "step": 7806, + "time_per_iteration": 2.6634225845336914 + }, + { + "auxiliary_loss_clip": 0.06437123, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01253496, + "epoch": 0.4693822335788366, + "flos": 36220202760960.0, + "grad_norm": 1.6923542734381007, + "language_loss": 0.77444482, + "learning_rate": 2.294333744076472e-06, + "loss": 0.8514812, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13006592, + "step": 7807, + "time_per_iteration": 4.0442986488342285 + }, + { + "auxiliary_loss_clip": 0.06438392, + "auxiliary_loss_mlp": 0.01270643, + "balance_loss_clip": 0.06276641, + "balance_loss_mlp": 0.01257024, + "epoch": 0.4694423568315046, + "flos": 20345124061440.0, + "grad_norm": 1.7839407979100135, + "language_loss": 0.51769608, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.59478641, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13635254, + "step": 7808, + "time_per_iteration": 2.4910712242126465 + }, + { + "auxiliary_loss_clip": 0.06328695, + "auxiliary_loss_mlp": 0.01252926, + "balance_loss_clip": 0.06259091, + "balance_loss_mlp": 0.01250451, + "epoch": 0.46950248008417256, + "flos": 64343540033280.0, + "grad_norm": 0.7688077124363479, + "language_loss": 0.57691324, + "learning_rate": 2.293563279578978e-06, + "loss": 0.65272945, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.0247345, + "step": 7809, + "time_per_iteration": 3.055589199066162 + }, + { + "auxiliary_loss_clip": 0.06439595, + "auxiliary_loss_mlp": 0.01268316, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01254845, + "epoch": 0.4695626033368405, + "flos": 19204755120000.0, + "grad_norm": 2.3576337237105425, + "language_loss": 0.71649069, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7935698, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13470459, + "step": 7810, + "time_per_iteration": 2.5001537799835205 + }, + { + "auxiliary_loss_clip": 0.06435918, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01259113, + "epoch": 0.4696227265895085, + "flos": 23009027466240.0, + "grad_norm": 3.6880824309964617, + "language_loss": 0.81146425, + "learning_rate": 2.29279277055369e-06, + "loss": 0.88855195, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.13726807, + "step": 7811, + "time_per_iteration": 2.5971217155456543 + }, + { + "auxiliary_loss_clip": 0.06437828, + "auxiliary_loss_mlp": 0.01267753, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.0125405, + "epoch": 0.46968284984217645, + "flos": 21877169713920.0, + "grad_norm": 1.5426371434141024, + "language_loss": 0.80606401, + "learning_rate": 2.292407499379644e-06, + "loss": 0.88311982, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13708496, + "step": 7812, + "time_per_iteration": 2.5140600204467773 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01271707, + "balance_loss_clip": 0.06277305, + "balance_loss_mlp": 0.01258445, + "epoch": 0.4697429730948444, + "flos": 19981217779200.0, + "grad_norm": 1.702985157553907, + "language_loss": 0.74653876, + "learning_rate": 2.292022217117477e-06, + "loss": 0.82360852, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13256836, + "step": 7813, + "time_per_iteration": 2.530773401260376 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01270357, + "balance_loss_clip": 0.06279637, + "balance_loss_mlp": 0.01256755, + "epoch": 0.4698030963475124, + "flos": 15161185912320.0, + "grad_norm": 2.103167897479233, + "language_loss": 0.84843278, + "learning_rate": 2.291636923781798e-06, + "loss": 0.92552245, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13604736, + "step": 7814, + "time_per_iteration": 2.550631046295166 + }, + { + "auxiliary_loss_clip": 0.06432581, + "auxiliary_loss_mlp": 0.01265742, + "balance_loss_clip": 0.06276342, + "balance_loss_mlp": 0.01252856, + "epoch": 0.46986321960018035, + "flos": 15155316126720.0, + "grad_norm": 2.71974016097947, + "language_loss": 0.82219559, + "learning_rate": 2.291251619387217e-06, + "loss": 0.89917886, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12896729, + "step": 7815, + "time_per_iteration": 2.508582592010498 + }, + { + "auxiliary_loss_clip": 0.06434117, + "auxiliary_loss_mlp": 0.01273411, + "balance_loss_clip": 0.06275953, + "balance_loss_mlp": 0.01259952, + "epoch": 0.4699233428528483, + "flos": 23115021281280.0, + "grad_norm": 2.356408218131492, + "language_loss": 0.77761489, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.85469019, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13452148, + "step": 7816, + "time_per_iteration": 2.505244493484497 + }, + { + "auxiliary_loss_clip": 0.06334539, + "auxiliary_loss_mlp": 0.01254323, + "balance_loss_clip": 0.06264929, + "balance_loss_mlp": 0.01251993, + "epoch": 0.46998346610551633, + "flos": 68126917985280.0, + "grad_norm": 0.8142436419344395, + "language_loss": 0.58616334, + "learning_rate": 2.290480977479796e-06, + "loss": 0.66205192, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02325439, + "step": 7817, + "time_per_iteration": 3.1171398162841797 + }, + { + "auxiliary_loss_clip": 0.0643587, + "auxiliary_loss_mlp": 0.01268626, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01255119, + "epoch": 0.4700435893581843, + "flos": 24135560484480.0, + "grad_norm": 1.6087842481989176, + "language_loss": 0.7922467, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8692916, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13513184, + "step": 7818, + "time_per_iteration": 2.5133657455444336 + }, + { + "auxiliary_loss_clip": 0.06435841, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06278426, + "balance_loss_mlp": 0.01257279, + "epoch": 0.47010371261085226, + "flos": 20155624053120.0, + "grad_norm": 1.9598217577618973, + "language_loss": 0.83629054, + "learning_rate": 2.289710291512104e-06, + "loss": 0.91334999, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12841797, + "step": 7819, + "time_per_iteration": 2.512434482574463 + }, + { + "auxiliary_loss_clip": 0.06440641, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01253519, + "epoch": 0.47016383586352023, + "flos": 15127587624960.0, + "grad_norm": 1.951811924314391, + "language_loss": 0.76718354, + "learning_rate": 2.289324932042186e-06, + "loss": 0.84427238, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1472168, + "step": 7820, + "time_per_iteration": 2.4596121311187744 + }, + { + "auxiliary_loss_clip": 0.06434815, + "auxiliary_loss_mlp": 0.01270743, + "balance_loss_clip": 0.06279559, + "balance_loss_mlp": 0.01257636, + "epoch": 0.4702239591161882, + "flos": 13558044470400.0, + "grad_norm": 1.9648943700675503, + "language_loss": 0.74081844, + "learning_rate": 2.288939561601039e-06, + "loss": 0.81787401, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13116455, + "step": 7821, + "time_per_iteration": 2.4793312549591064 + }, + { + "auxiliary_loss_clip": 0.06431578, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06276228, + "balance_loss_mlp": 0.01256658, + "epoch": 0.47028408236885616, + "flos": 24282825235200.0, + "grad_norm": 1.6413236035832721, + "language_loss": 0.89491117, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.97191548, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12207031, + "step": 7822, + "time_per_iteration": 2.5880398750305176 + }, + { + "auxiliary_loss_clip": 0.06433522, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06277143, + "balance_loss_mlp": 0.01254062, + "epoch": 0.4703442056215241, + "flos": 22863565578240.0, + "grad_norm": 1.438932852866735, + "language_loss": 0.79699898, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.87399733, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12255859, + "step": 7823, + "time_per_iteration": 2.5661919116973877 + }, + { + "auxiliary_loss_clip": 0.06324597, + "auxiliary_loss_mlp": 0.01253174, + "balance_loss_clip": 0.06255165, + "balance_loss_mlp": 0.01250784, + "epoch": 0.4704043288741921, + "flos": 69262381463040.0, + "grad_norm": 0.6854102840454825, + "language_loss": 0.56514406, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.64092177, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02386475, + "step": 7824, + "time_per_iteration": 3.223728656768799 + }, + { + "auxiliary_loss_clip": 0.06442541, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06281068, + "balance_loss_mlp": 0.01255837, + "epoch": 0.47046445212686006, + "flos": 18046971728640.0, + "grad_norm": 1.8116047863427858, + "language_loss": 0.81242847, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.88955039, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13818359, + "step": 7825, + "time_per_iteration": 2.4815890789031982 + }, + { + "auxiliary_loss_clip": 0.06441189, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01257443, + "epoch": 0.470524575379528, + "flos": 23958261244800.0, + "grad_norm": 2.19673184020816, + "language_loss": 0.67126369, + "learning_rate": 2.287012545338324e-06, + "loss": 0.74838167, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1317749, + "step": 7826, + "time_per_iteration": 2.5820834636688232 + }, + { + "auxiliary_loss_clip": 0.06443623, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01254824, + "epoch": 0.470584698632196, + "flos": 18119367256320.0, + "grad_norm": 1.7021383964965269, + "language_loss": 0.8395251, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.91664219, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13250732, + "step": 7827, + "time_per_iteration": 2.4966769218444824 + }, + { + "auxiliary_loss_clip": 0.06333943, + "auxiliary_loss_mlp": 0.01253247, + "balance_loss_clip": 0.06264865, + "balance_loss_mlp": 0.01250913, + "epoch": 0.47064482188486395, + "flos": 57268555413120.0, + "grad_norm": 0.786622619089935, + "language_loss": 0.55656797, + "learning_rate": 2.286241662546122e-06, + "loss": 0.63243991, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02328491, + "step": 7828, + "time_per_iteration": 3.1594009399414062 + }, + { + "auxiliary_loss_clip": 0.06439656, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01254743, + "epoch": 0.4707049451375319, + "flos": 17900922862080.0, + "grad_norm": 1.8377127056601934, + "language_loss": 0.80904895, + "learning_rate": 2.285856204861245e-06, + "loss": 0.88612556, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13256836, + "step": 7829, + "time_per_iteration": 2.485140800476074 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.0126024, + "epoch": 0.47076506839019994, + "flos": 25240402494720.0, + "grad_norm": 1.2696703606336757, + "language_loss": 0.76018727, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.83728784, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.12359619, + "step": 7830, + "time_per_iteration": 2.6114325523376465 + }, + { + "auxiliary_loss_clip": 0.06438384, + "auxiliary_loss_mlp": 0.01269492, + "balance_loss_clip": 0.06283822, + "balance_loss_mlp": 0.01257016, + "epoch": 0.4708251916428679, + "flos": 13484684620800.0, + "grad_norm": 2.037519777934202, + "language_loss": 0.78570348, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.86278224, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12463379, + "step": 7831, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01255365, + "epoch": 0.47088531489553587, + "flos": 30154646678400.0, + "grad_norm": 1.667499960909574, + "language_loss": 0.7574442, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.83460832, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.140625, + "step": 7832, + "time_per_iteration": 2.6298487186431885 + }, + { + "auxiliary_loss_clip": 0.06434175, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.0627791, + "balance_loss_mlp": 0.01256844, + "epoch": 0.47094543814820383, + "flos": 21804648405120.0, + "grad_norm": 1.2855995862723888, + "language_loss": 0.74791807, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.82493854, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1104126, + "step": 7833, + "time_per_iteration": 2.5464203357696533 + }, + { + "auxiliary_loss_clip": 0.06437977, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06281009, + "balance_loss_mlp": 0.01254118, + "epoch": 0.4710055614008718, + "flos": 23009698298880.0, + "grad_norm": 1.569702279619268, + "language_loss": 0.76145566, + "learning_rate": 2.283928754133762e-06, + "loss": 0.83849978, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12329102, + "step": 7834, + "time_per_iteration": 2.6125214099884033 + }, + { + "auxiliary_loss_clip": 0.06433094, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 0.06278115, + "balance_loss_mlp": 0.01254078, + "epoch": 0.47106568465353976, + "flos": 42751256601600.0, + "grad_norm": 1.4292072421609816, + "language_loss": 0.66957295, + "learning_rate": 2.283543231629972e-06, + "loss": 0.74656606, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12158203, + "step": 7835, + "time_per_iteration": 5.518744707107544 + }, + { + "auxiliary_loss_clip": 0.06330478, + "auxiliary_loss_mlp": 0.01256395, + "balance_loss_clip": 0.06261497, + "balance_loss_mlp": 0.01253791, + "epoch": 0.4711258079062077, + "flos": 68571116807040.0, + "grad_norm": 0.853960187866431, + "language_loss": 0.62259066, + "learning_rate": 2.283157698374194e-06, + "loss": 0.69845939, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.02604675, + "step": 7836, + "time_per_iteration": 3.1000564098358154 + }, + { + "auxiliary_loss_clip": 0.06439401, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254006, + "epoch": 0.4711859311588757, + "flos": 25453522154880.0, + "grad_norm": 1.6974399997165228, + "language_loss": 0.69606686, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.7731331, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13238525, + "step": 7837, + "time_per_iteration": 2.5282108783721924 + }, + { + "auxiliary_loss_clip": 0.06437849, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01255061, + "epoch": 0.47124605441154366, + "flos": 21988488263040.0, + "grad_norm": 1.9658270715858404, + "language_loss": 0.66562694, + "learning_rate": 2.282386599665153e-06, + "loss": 0.74268925, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13311768, + "step": 7838, + "time_per_iteration": 2.5846638679504395 + }, + { + "auxiliary_loss_clip": 0.06440166, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255082, + "epoch": 0.4713061776642116, + "flos": 25420049648640.0, + "grad_norm": 5.850528361960432, + "language_loss": 0.77699667, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.85408199, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1328125, + "step": 7839, + "time_per_iteration": 2.5414958000183105 + }, + { + "auxiliary_loss_clip": 0.06429788, + "auxiliary_loss_mlp": 0.01268311, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.0125592, + "epoch": 0.4713663009168796, + "flos": 26549559486720.0, + "grad_norm": 2.242315176037199, + "language_loss": 0.73086643, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.80784744, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12390137, + "step": 7840, + "time_per_iteration": 2.5519280433654785 + }, + { + "auxiliary_loss_clip": 0.06431505, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01251562, + "epoch": 0.47142642416954755, + "flos": 23630426945280.0, + "grad_norm": 1.566587637557085, + "language_loss": 0.75317335, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.83012575, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1217041, + "step": 7841, + "time_per_iteration": 2.552835702896118 + }, + { + "auxiliary_loss_clip": 0.06436779, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06277694, + "balance_loss_mlp": 0.01252947, + "epoch": 0.4714865474222155, + "flos": 22316783760000.0, + "grad_norm": 1.5550986710562988, + "language_loss": 0.70513815, + "learning_rate": 2.280844273866501e-06, + "loss": 0.78216577, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13049316, + "step": 7842, + "time_per_iteration": 3.933955192565918 + }, + { + "auxiliary_loss_clip": 0.06436103, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255891, + "epoch": 0.4715466706748835, + "flos": 17828317699200.0, + "grad_norm": 1.9804632158033957, + "language_loss": 0.79634649, + "learning_rate": 2.280458665756177e-06, + "loss": 0.87339324, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12677002, + "step": 7843, + "time_per_iteration": 2.4907753467559814 + }, + { + "auxiliary_loss_clip": 0.06434722, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.0125301, + "epoch": 0.4716067939275515, + "flos": 23666289292800.0, + "grad_norm": 1.6302002599700955, + "language_loss": 0.74402809, + "learning_rate": 2.280073047010832e-06, + "loss": 0.82102847, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12298584, + "step": 7844, + "time_per_iteration": 2.5746476650238037 + }, + { + "auxiliary_loss_clip": 0.06436022, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.0627865, + "balance_loss_mlp": 0.01257138, + "epoch": 0.47166691718021947, + "flos": 17935778960640.0, + "grad_norm": 2.158450508091108, + "language_loss": 0.78678179, + "learning_rate": 2.279687417645088e-06, + "loss": 0.86384571, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13238525, + "step": 7845, + "time_per_iteration": 2.4827558994293213 + }, + { + "auxiliary_loss_clip": 0.06430048, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06273912, + "balance_loss_mlp": 0.01254991, + "epoch": 0.47172704043288743, + "flos": 26621787306240.0, + "grad_norm": 1.2653259456946966, + "language_loss": 0.73458219, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.81154698, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.11450195, + "step": 7846, + "time_per_iteration": 2.586641550064087 + }, + { + "auxiliary_loss_clip": 0.06430165, + "auxiliary_loss_mlp": 0.01268985, + "balance_loss_clip": 0.06277196, + "balance_loss_mlp": 0.01256754, + "epoch": 0.4717871636855554, + "flos": 27929225289600.0, + "grad_norm": 1.2918573904220954, + "language_loss": 0.74434412, + "learning_rate": 2.2789161271109e-06, + "loss": 0.82133555, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12243652, + "step": 7847, + "time_per_iteration": 3.984661817550659 + }, + { + "auxiliary_loss_clip": 0.06434786, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06276622, + "balance_loss_mlp": 0.01258123, + "epoch": 0.47184728693822336, + "flos": 14507571738240.0, + "grad_norm": 1.68455833448323, + "language_loss": 0.81004, + "learning_rate": 2.278530465971703e-06, + "loss": 0.88708746, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1184082, + "step": 7848, + "time_per_iteration": 2.482759714126587 + }, + { + "auxiliary_loss_clip": 0.06438575, + "auxiliary_loss_mlp": 0.01265775, + "balance_loss_clip": 0.06279046, + "balance_loss_mlp": 0.01252394, + "epoch": 0.47190741019089133, + "flos": 17862041767680.0, + "grad_norm": 1.8089027190058555, + "language_loss": 0.70106918, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.77811265, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1338501, + "step": 7849, + "time_per_iteration": 2.5101277828216553 + }, + { + "auxiliary_loss_clip": 0.06444675, + "auxiliary_loss_mlp": 0.01269385, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.0125539, + "epoch": 0.4719675334435593, + "flos": 17901384059520.0, + "grad_norm": 1.915736246727948, + "language_loss": 0.69964916, + "learning_rate": 2.277759112022224e-06, + "loss": 0.77678978, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.14001465, + "step": 7850, + "time_per_iteration": 2.46455979347229 + }, + { + "auxiliary_loss_clip": 0.06441706, + "auxiliary_loss_mlp": 0.01269243, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01255951, + "epoch": 0.47202765669622726, + "flos": 20710665498240.0, + "grad_norm": 1.953909301983903, + "language_loss": 0.75806379, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.83517331, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13305664, + "step": 7851, + "time_per_iteration": 2.5298452377319336 + }, + { + "auxiliary_loss_clip": 0.06440549, + "auxiliary_loss_mlp": 0.01271731, + "balance_loss_clip": 0.06277989, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4720877799488952, + "flos": 16365439192320.0, + "grad_norm": 1.905541371588542, + "language_loss": 0.76767981, + "learning_rate": 2.276987715942132e-06, + "loss": 0.84480262, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.14196777, + "step": 7852, + "time_per_iteration": 2.473349094390869 + }, + { + "auxiliary_loss_clip": 0.06431545, + "auxiliary_loss_mlp": 0.01270384, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01257742, + "epoch": 0.4721479032015632, + "flos": 20674509661440.0, + "grad_norm": 2.394869083314355, + "language_loss": 0.69452804, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.77154732, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12658691, + "step": 7853, + "time_per_iteration": 2.537550210952759 + }, + { + "auxiliary_loss_clip": 0.06333929, + "auxiliary_loss_mlp": 0.01250651, + "balance_loss_clip": 0.06264801, + "balance_loss_mlp": 0.01248457, + "epoch": 0.47220802645423116, + "flos": 67773367681920.0, + "grad_norm": 0.6896509796832918, + "language_loss": 0.50247812, + "learning_rate": 2.276216277848432e-06, + "loss": 0.57832396, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.02197266, + "step": 7854, + "time_per_iteration": 3.2550642490386963 + }, + { + "auxiliary_loss_clip": 0.06436136, + "auxiliary_loss_mlp": 0.0126914, + "balance_loss_clip": 0.06276229, + "balance_loss_mlp": 0.0125583, + "epoch": 0.4722681497068991, + "flos": 20927474737920.0, + "grad_norm": 1.8228483302344913, + "language_loss": 0.63672256, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.71377528, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13317871, + "step": 7855, + "time_per_iteration": 2.5252599716186523 + }, + { + "auxiliary_loss_clip": 0.06439453, + "auxiliary_loss_mlp": 0.01268333, + "balance_loss_clip": 0.06280654, + "balance_loss_mlp": 0.01255715, + "epoch": 0.4723282729595671, + "flos": 28300594584960.0, + "grad_norm": 1.8174966086465816, + "language_loss": 0.76136196, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.83843982, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1262207, + "step": 7856, + "time_per_iteration": 2.560236692428589 + }, + { + "auxiliary_loss_clip": 0.06436295, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258284, + "epoch": 0.4723883962122351, + "flos": 27132287506560.0, + "grad_norm": 1.7138943667728106, + "language_loss": 0.750875, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.8279379, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 7857, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06436294, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.0628143, + "balance_loss_mlp": 0.01258946, + "epoch": 0.47244851946490307, + "flos": 31544794241280.0, + "grad_norm": 1.4694813046790665, + "language_loss": 0.64839488, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.72546607, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11889648, + "step": 7858, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06431169, + "auxiliary_loss_mlp": 0.01271908, + "balance_loss_clip": 0.06278542, + "balance_loss_mlp": 0.01259719, + "epoch": 0.47250864271757104, + "flos": 20892828274560.0, + "grad_norm": 1.741748713475879, + "language_loss": 0.71104157, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.78807235, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12194824, + "step": 7859, + "time_per_iteration": 2.541404962539673 + }, + { + "auxiliary_loss_clip": 0.06440333, + "auxiliary_loss_mlp": 0.01270209, + "balance_loss_clip": 0.06277637, + "balance_loss_mlp": 0.01257776, + "epoch": 0.472568765970239, + "flos": 20528376940800.0, + "grad_norm": 1.7364161900477437, + "language_loss": 0.62341475, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.70052016, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.12426758, + "step": 7860, + "time_per_iteration": 2.5165910720825195 + }, + { + "auxiliary_loss_clip": 0.06438711, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06280093, + "balance_loss_mlp": 0.01254914, + "epoch": 0.47262888922290697, + "flos": 35813306534400.0, + "grad_norm": 2.092826385669962, + "language_loss": 0.72540921, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.80247205, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12658691, + "step": 7861, + "time_per_iteration": 2.6575915813446045 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.0628088, + "balance_loss_mlp": 0.01254734, + "epoch": 0.47268901247557493, + "flos": 20674006536960.0, + "grad_norm": 2.2960282018232965, + "language_loss": 0.85134012, + "learning_rate": 2.273130107677896e-06, + "loss": 0.92842054, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13555908, + "step": 7862, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.06443156, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.012566, + "epoch": 0.4727491357282429, + "flos": 19579394724480.0, + "grad_norm": 1.7759944267926648, + "language_loss": 0.84885079, + "learning_rate": 2.272744289645927e-06, + "loss": 0.92597324, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12506104, + "step": 7863, + "time_per_iteration": 2.545445442199707 + }, + { + "auxiliary_loss_clip": 0.06435807, + "auxiliary_loss_mlp": 0.01268812, + "balance_loss_clip": 0.06279373, + "balance_loss_mlp": 0.01256873, + "epoch": 0.47280925898091086, + "flos": 18222090762240.0, + "grad_norm": 1.953539417417106, + "language_loss": 0.6582734, + "learning_rate": 2.272358461271467e-06, + "loss": 0.73531955, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11950684, + "step": 7864, + "time_per_iteration": 2.4730403423309326 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06280264, + "balance_loss_mlp": 0.01257619, + "epoch": 0.4728693822335788, + "flos": 17827604939520.0, + "grad_norm": 1.945688521953863, + "language_loss": 0.65635985, + "learning_rate": 2.271972622569147e-06, + "loss": 0.73344177, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.11962891, + "step": 7865, + "time_per_iteration": 2.498135805130005 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.01270111, + "balance_loss_clip": 0.06277367, + "balance_loss_mlp": 0.01257671, + "epoch": 0.4729295054862468, + "flos": 20601359447040.0, + "grad_norm": 2.5713138482446234, + "language_loss": 0.73970878, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.81671345, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12445068, + "step": 7866, + "time_per_iteration": 2.495232582092285 + }, + { + "auxiliary_loss_clip": 0.06437797, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254347, + "epoch": 0.47298962873891476, + "flos": 23374862392320.0, + "grad_norm": 2.8570557032751522, + "language_loss": 0.83387589, + "learning_rate": 2.271200914239451e-06, + "loss": 0.91091311, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.11578369, + "step": 7867, + "time_per_iteration": 2.565706968307495 + }, + { + "auxiliary_loss_clip": 0.06430209, + "auxiliary_loss_mlp": 0.01265413, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.01253391, + "epoch": 0.4730497519915827, + "flos": 22058410095360.0, + "grad_norm": 1.6535025871822049, + "language_loss": 0.79521739, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.87217355, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12036133, + "step": 7868, + "time_per_iteration": 2.549220561981201 + }, + { + "auxiliary_loss_clip": 0.06442262, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01255169, + "epoch": 0.4731098752442507, + "flos": 21076165008000.0, + "grad_norm": 1.8227151972017304, + "language_loss": 0.75178695, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.82889056, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12945557, + "step": 7869, + "time_per_iteration": 2.5188441276550293 + }, + { + "auxiliary_loss_clip": 0.06441551, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06282122, + "balance_loss_mlp": 0.01258014, + "epoch": 0.4731699984969187, + "flos": 22535395862400.0, + "grad_norm": 1.4513841331120019, + "language_loss": 0.73749697, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.81462824, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13555908, + "step": 7870, + "time_per_iteration": 2.520761251449585 + }, + { + "auxiliary_loss_clip": 0.0644481, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06280311, + "balance_loss_mlp": 0.01259231, + "epoch": 0.4732301217495867, + "flos": 24904769765760.0, + "grad_norm": 1.9907019842809281, + "language_loss": 0.81971508, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.89689231, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13684082, + "step": 7871, + "time_per_iteration": 2.7390120029449463 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06278443, + "balance_loss_mlp": 0.01254261, + "epoch": 0.47329024500225464, + "flos": 22791128123520.0, + "grad_norm": 1.7255093919697873, + "language_loss": 0.76232624, + "learning_rate": 2.269271463701879e-06, + "loss": 0.8393662, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13189697, + "step": 7872, + "time_per_iteration": 2.6356093883514404 + }, + { + "auxiliary_loss_clip": 0.06438267, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06279084, + "balance_loss_mlp": 0.01256847, + "epoch": 0.4733503682549226, + "flos": 38705884531200.0, + "grad_norm": 1.877318740282883, + "language_loss": 0.67809367, + "learning_rate": 2.268885542903428e-06, + "loss": 0.75517869, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7873, + "time_per_iteration": 2.7092511653900146 + }, + { + "auxiliary_loss_clip": 0.06434255, + "auxiliary_loss_mlp": 0.01269292, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.0125699, + "epoch": 0.47341049150759057, + "flos": 22973584389120.0, + "grad_norm": 1.442307420398724, + "language_loss": 0.72792107, + "learning_rate": 2.26849961190881e-06, + "loss": 0.80495656, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12298584, + "step": 7874, + "time_per_iteration": 3.9462826251983643 + }, + { + "auxiliary_loss_clip": 0.06440391, + "auxiliary_loss_mlp": 0.01271103, + "balance_loss_clip": 0.06281446, + "balance_loss_mlp": 0.01258431, + "epoch": 0.47347061476025853, + "flos": 14543769502080.0, + "grad_norm": 2.253933500743018, + "language_loss": 0.65938866, + "learning_rate": 2.26811367073266e-06, + "loss": 0.7365036, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12658691, + "step": 7875, + "time_per_iteration": 4.013593435287476 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01267762, + "balance_loss_clip": 0.06284615, + "balance_loss_mlp": 0.01254571, + "epoch": 0.4735307380129265, + "flos": 30271080326400.0, + "grad_norm": 2.373261357507393, + "language_loss": 0.80868709, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.88579601, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13183594, + "step": 7876, + "time_per_iteration": 2.577624797821045 + }, + { + "auxiliary_loss_clip": 0.06439028, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.0125583, + "epoch": 0.47359086126559446, + "flos": 19397148094080.0, + "grad_norm": 1.7113236821341018, + "language_loss": 0.792979, + "learning_rate": 2.267341757894304e-06, + "loss": 0.87005162, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12402344, + "step": 7877, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.06431633, + "auxiliary_loss_mlp": 0.01269276, + "balance_loss_clip": 0.0627646, + "balance_loss_mlp": 0.01256938, + "epoch": 0.47365098451826243, + "flos": 21944995194240.0, + "grad_norm": 1.9478135029908927, + "language_loss": 0.70673579, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.78374487, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12335205, + "step": 7878, + "time_per_iteration": 2.5023298263549805 + }, + { + "auxiliary_loss_clip": 0.06432398, + "auxiliary_loss_mlp": 0.01268548, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01256382, + "epoch": 0.4737111077709304, + "flos": 25851571776000.0, + "grad_norm": 1.6314467446120229, + "language_loss": 0.75137293, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.82838243, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1217041, + "step": 7879, + "time_per_iteration": 2.623811960220337 + }, + { + "auxiliary_loss_clip": 0.06320075, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06251323, + "balance_loss_mlp": 0.01262992, + "epoch": 0.47377123102359836, + "flos": 67779461831040.0, + "grad_norm": 0.7167002771941348, + "language_loss": 0.6131798, + "learning_rate": 2.266183812641164e-06, + "loss": 0.68903732, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02690125, + "step": 7880, + "time_per_iteration": 3.159388303756714 + }, + { + "auxiliary_loss_clip": 0.06434937, + "auxiliary_loss_mlp": 0.01268898, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256035, + "epoch": 0.4738313542762663, + "flos": 24322796432640.0, + "grad_norm": 1.5964233369580554, + "language_loss": 0.68369412, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.76073253, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12866211, + "step": 7881, + "time_per_iteration": 4.010294198989868 + }, + { + "auxiliary_loss_clip": 0.06434233, + "auxiliary_loss_mlp": 0.01267509, + "balance_loss_clip": 0.06279774, + "balance_loss_mlp": 0.01255964, + "epoch": 0.4738914775289343, + "flos": 20711797528320.0, + "grad_norm": 1.8204307046333812, + "language_loss": 0.77692872, + "learning_rate": 2.265411798646092e-06, + "loss": 0.85394609, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11541748, + "step": 7882, + "time_per_iteration": 2.5205814838409424 + }, + { + "auxiliary_loss_clip": 0.06437336, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.01257208, + "epoch": 0.4739516007816023, + "flos": 25453228665600.0, + "grad_norm": 1.3763225621826927, + "language_loss": 0.76357329, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.84064174, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12304688, + "step": 7883, + "time_per_iteration": 2.5500354766845703 + }, + { + "auxiliary_loss_clip": 0.0643235, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06278035, + "balance_loss_mlp": 0.01255101, + "epoch": 0.4740117240342703, + "flos": 19980463092480.0, + "grad_norm": 1.6935272320670107, + "language_loss": 0.72225314, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.79924023, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1126709, + "step": 7884, + "time_per_iteration": 2.5347273349761963 + }, + { + "auxiliary_loss_clip": 0.06443354, + "auxiliary_loss_mlp": 0.01266451, + "balance_loss_clip": 0.06279097, + "balance_loss_mlp": 0.01252944, + "epoch": 0.47407184728693824, + "flos": 15665229348480.0, + "grad_norm": 2.6351569696409314, + "language_loss": 0.82340348, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.90050149, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13513184, + "step": 7885, + "time_per_iteration": 2.482201099395752 + }, + { + "auxiliary_loss_clip": 0.06433931, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.01259262, + "epoch": 0.4741319705396062, + "flos": 18594843649920.0, + "grad_norm": 1.913533031103811, + "language_loss": 0.7349298, + "learning_rate": 2.263867649999751e-06, + "loss": 0.81198001, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.11816406, + "step": 7886, + "time_per_iteration": 3.95589017868042 + }, + { + "auxiliary_loss_clip": 0.06445764, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.0628106, + "balance_loss_mlp": 0.01256655, + "epoch": 0.47419209379227417, + "flos": 13266114445440.0, + "grad_norm": 1.8957247676006206, + "language_loss": 0.74131465, + "learning_rate": 2.263481587786849e-06, + "loss": 0.81846249, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12384033, + "step": 7887, + "time_per_iteration": 2.558175563812256 + }, + { + "auxiliary_loss_clip": 0.06431396, + "auxiliary_loss_mlp": 0.01269479, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01257499, + "epoch": 0.47425221704494214, + "flos": 20049630238080.0, + "grad_norm": 2.0468025330010016, + "language_loss": 0.7742272, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.85123587, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11987305, + "step": 7888, + "time_per_iteration": 2.5532913208007812 + }, + { + "auxiliary_loss_clip": 0.06440586, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06282103, + "balance_loss_mlp": 0.01255978, + "epoch": 0.4743123402976101, + "flos": 27279300695040.0, + "grad_norm": 1.7248476258859713, + "language_loss": 0.72833514, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1227417, + "step": 7889, + "time_per_iteration": 2.635697603225708 + }, + { + "auxiliary_loss_clip": 0.06323753, + "auxiliary_loss_mlp": 0.01252671, + "balance_loss_clip": 0.0625556, + "balance_loss_mlp": 0.01250217, + "epoch": 0.47437246355027807, + "flos": 55410771813120.0, + "grad_norm": 0.6980000025852627, + "language_loss": 0.55692458, + "learning_rate": 2.262323341259214e-06, + "loss": 0.63268882, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.02452087, + "step": 7890, + "time_per_iteration": 3.196005344390869 + }, + { + "auxiliary_loss_clip": 0.06440383, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.06280889, + "balance_loss_mlp": 0.01255929, + "epoch": 0.47443258680294603, + "flos": 23885278738560.0, + "grad_norm": 1.7863596191541609, + "language_loss": 0.65755105, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.73464775, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13366699, + "step": 7891, + "time_per_iteration": 2.5535497665405273 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01270649, + "balance_loss_clip": 0.06284909, + "balance_loss_mlp": 0.01256892, + "epoch": 0.474492710055614, + "flos": 21983666653440.0, + "grad_norm": 2.0785188787991133, + "language_loss": 0.70081401, + "learning_rate": 2.26155112714642e-06, + "loss": 0.77800196, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 7892, + "time_per_iteration": 2.512953519821167 + }, + { + "auxiliary_loss_clip": 0.06322581, + "auxiliary_loss_mlp": 0.01253797, + "balance_loss_clip": 0.06254438, + "balance_loss_mlp": 0.01251454, + "epoch": 0.47455283330828196, + "flos": 62577186837120.0, + "grad_norm": 0.7954751994073583, + "language_loss": 0.58515328, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.66091704, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.02337646, + "step": 7893, + "time_per_iteration": 3.2652807235717773 + }, + { + "auxiliary_loss_clip": 0.06435462, + "auxiliary_loss_mlp": 0.01271377, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01259498, + "epoch": 0.47461295656094993, + "flos": 12098478199680.0, + "grad_norm": 1.6548256161788057, + "language_loss": 0.77515912, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.85222745, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.11871338, + "step": 7894, + "time_per_iteration": 2.4962351322174072 + }, + { + "auxiliary_loss_clip": 0.06436545, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06277076, + "balance_loss_mlp": 0.01254883, + "epoch": 0.4746730798136179, + "flos": 20890522287360.0, + "grad_norm": 1.8932038979458137, + "language_loss": 0.75310624, + "learning_rate": 2.260392731628497e-06, + "loss": 0.83014762, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.1270752, + "step": 7895, + "time_per_iteration": 2.536651611328125 + }, + { + "auxiliary_loss_clip": 0.06438908, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06280944, + "balance_loss_mlp": 0.012559, + "epoch": 0.4747332030662859, + "flos": 19981008144000.0, + "grad_norm": 1.9186877339725528, + "language_loss": 0.824898, + "learning_rate": 2.260006580021429e-06, + "loss": 0.90196961, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12341309, + "step": 7896, + "time_per_iteration": 2.5451180934906006 + }, + { + "auxiliary_loss_clip": 0.06438936, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01254964, + "epoch": 0.4747933263189539, + "flos": 16039701244800.0, + "grad_norm": 4.910262672985542, + "language_loss": 0.76465023, + "learning_rate": 2.259620418554886e-06, + "loss": 0.84171617, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12689209, + "step": 7897, + "time_per_iteration": 2.529157876968384 + }, + { + "auxiliary_loss_clip": 0.06443989, + "auxiliary_loss_mlp": 0.012709, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257376, + "epoch": 0.47485344957162184, + "flos": 13960370649600.0, + "grad_norm": 1.9701771451271233, + "language_loss": 0.64411497, + "learning_rate": 2.25923424724351e-06, + "loss": 0.72126389, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13519287, + "step": 7898, + "time_per_iteration": 2.4861059188842773 + }, + { + "auxiliary_loss_clip": 0.06443477, + "auxiliary_loss_mlp": 0.01269988, + "balance_loss_clip": 0.0628337, + "balance_loss_mlp": 0.01256774, + "epoch": 0.4749135728242898, + "flos": 20455352507520.0, + "grad_norm": 2.55946780946792, + "language_loss": 0.70317411, + "learning_rate": 2.258848066101946e-06, + "loss": 0.78030878, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 7899, + "time_per_iteration": 2.5035181045532227 + }, + { + "auxiliary_loss_clip": 0.06438522, + "auxiliary_loss_mlp": 0.0127023, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4749736960769578, + "flos": 28957604849280.0, + "grad_norm": 1.797290129910965, + "language_loss": 0.68821597, + "learning_rate": 2.258461875144837e-06, + "loss": 0.76530349, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12536621, + "step": 7900, + "time_per_iteration": 2.638021469116211 + }, + { + "auxiliary_loss_clip": 0.06435557, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06277159, + "balance_loss_mlp": 0.01254216, + "epoch": 0.47503381932962574, + "flos": 31946407660800.0, + "grad_norm": 2.027602507157595, + "language_loss": 0.70583236, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.78287518, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.14501953, + "step": 7901, + "time_per_iteration": 2.6210362911224365 + }, + { + "auxiliary_loss_clip": 0.06438562, + "auxiliary_loss_mlp": 0.01269369, + "balance_loss_clip": 0.06280936, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4750939425822937, + "flos": 22133782442880.0, + "grad_norm": 1.48556411263083, + "language_loss": 0.73796129, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.81504059, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12542725, + "step": 7902, + "time_per_iteration": 2.5175282955169678 + }, + { + "auxiliary_loss_clip": 0.06431635, + "auxiliary_loss_mlp": 0.01269606, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.0125747, + "epoch": 0.47515406583496167, + "flos": 20856378948480.0, + "grad_norm": 3.332476837285125, + "language_loss": 0.69285202, + "learning_rate": 2.257303243526688e-06, + "loss": 0.76986444, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.12139893, + "step": 7903, + "time_per_iteration": 2.5292611122131348 + }, + { + "auxiliary_loss_clip": 0.06430157, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06276098, + "balance_loss_mlp": 0.01255015, + "epoch": 0.47521418908762963, + "flos": 17529679347840.0, + "grad_norm": 1.464561850634071, + "language_loss": 0.72526675, + "learning_rate": 2.256917013453848e-06, + "loss": 0.80223215, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1137085, + "step": 7904, + "time_per_iteration": 2.491152286529541 + }, + { + "auxiliary_loss_clip": 0.06430416, + "auxiliary_loss_mlp": 0.01265335, + "balance_loss_clip": 0.06276643, + "balance_loss_mlp": 0.01253706, + "epoch": 0.4752743123402976, + "flos": 20565874442880.0, + "grad_norm": 1.4968424405470007, + "language_loss": 0.86079156, + "learning_rate": 2.25653077363869e-06, + "loss": 0.93774903, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.11633301, + "step": 7905, + "time_per_iteration": 2.5502467155456543 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06274827, + "balance_loss_mlp": 0.01256146, + "epoch": 0.47533443559296557, + "flos": 26368025616000.0, + "grad_norm": 2.2485080153720425, + "language_loss": 0.82345891, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.90039825, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11749268, + "step": 7906, + "time_per_iteration": 2.5368199348449707 + }, + { + "auxiliary_loss_clip": 0.06321883, + "auxiliary_loss_mlp": 0.01254668, + "balance_loss_clip": 0.06254389, + "balance_loss_mlp": 0.01251897, + "epoch": 0.47539455884563353, + "flos": 65970118690560.0, + "grad_norm": 0.659791256047387, + "language_loss": 0.5900293, + "learning_rate": 2.255758264840002e-06, + "loss": 0.66579485, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.02775574, + "step": 7907, + "time_per_iteration": 3.279963254928589 + }, + { + "auxiliary_loss_clip": 0.06431986, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06276301, + "balance_loss_mlp": 0.01256721, + "epoch": 0.4754546820983015, + "flos": 17243828743680.0, + "grad_norm": 1.7704403118247245, + "language_loss": 0.81422615, + "learning_rate": 2.255371995885765e-06, + "loss": 0.89124084, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12756348, + "step": 7908, + "time_per_iteration": 2.5366125106811523 + }, + { + "auxiliary_loss_clip": 0.0643681, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01257258, + "epoch": 0.47551480535096946, + "flos": 19831563187200.0, + "grad_norm": 1.6522879253580633, + "language_loss": 0.74338585, + "learning_rate": 2.254985717247797e-06, + "loss": 0.82045496, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12841797, + "step": 7909, + "time_per_iteration": 2.5318603515625 + }, + { + "auxiliary_loss_clip": 0.06431618, + "auxiliary_loss_mlp": 0.01267166, + "balance_loss_clip": 0.0627422, + "balance_loss_mlp": 0.01255192, + "epoch": 0.4755749286036375, + "flos": 22170525258240.0, + "grad_norm": 1.5977935042114109, + "language_loss": 0.75628603, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.83327389, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11987305, + "step": 7910, + "time_per_iteration": 2.5529162883758545 + }, + { + "auxiliary_loss_clip": 0.0643287, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06276555, + "balance_loss_mlp": 0.01253488, + "epoch": 0.47563505185630545, + "flos": 21653945637120.0, + "grad_norm": 1.8732404582916444, + "language_loss": 0.7930491, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.8700273, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11474609, + "step": 7911, + "time_per_iteration": 2.5172598361968994 + }, + { + "auxiliary_loss_clip": 0.0643772, + "auxiliary_loss_mlp": 0.01268087, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.01253854, + "epoch": 0.4756951751089734, + "flos": 20634622318080.0, + "grad_norm": 1.775078995772379, + "language_loss": 0.76487613, + "learning_rate": 2.253826823377983e-06, + "loss": 0.8419342, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.14239502, + "step": 7912, + "time_per_iteration": 2.5627753734588623 + }, + { + "auxiliary_loss_clip": 0.06432701, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01260797, + "epoch": 0.4757552983616414, + "flos": 25855932188160.0, + "grad_norm": 1.3867905424321492, + "language_loss": 0.74749589, + "learning_rate": 2.253440506151569e-06, + "loss": 0.82455844, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12762451, + "step": 7913, + "time_per_iteration": 2.539555549621582 + }, + { + "auxiliary_loss_clip": 0.06434918, + "auxiliary_loss_mlp": 0.01269661, + "balance_loss_clip": 0.06277134, + "balance_loss_mlp": 0.01257418, + "epoch": 0.47581542161430934, + "flos": 18228841015680.0, + "grad_norm": 1.9858873239790236, + "language_loss": 0.72184181, + "learning_rate": 2.253054179314666e-06, + "loss": 0.79888761, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12249756, + "step": 7914, + "time_per_iteration": 3.9911863803863525 + }, + { + "auxiliary_loss_clip": 0.06440303, + "auxiliary_loss_mlp": 0.01270006, + "balance_loss_clip": 0.06281254, + "balance_loss_mlp": 0.0125737, + "epoch": 0.4758755448669773, + "flos": 21586162083840.0, + "grad_norm": 1.8571830642758371, + "language_loss": 0.65017748, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.72728062, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12628174, + "step": 7915, + "time_per_iteration": 3.94254207611084 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01257027, + "epoch": 0.47593566811964527, + "flos": 15236474405760.0, + "grad_norm": 1.6782618347522322, + "language_loss": 0.77118516, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.84816194, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 7916, + "time_per_iteration": 2.5071310997009277 + }, + { + "auxiliary_loss_clip": 0.0642941, + "auxiliary_loss_mlp": 0.01270125, + "balance_loss_clip": 0.06275692, + "balance_loss_mlp": 0.01258842, + "epoch": 0.47599579137231324, + "flos": 21549628903680.0, + "grad_norm": 2.1020342658546878, + "language_loss": 0.64506871, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.72206402, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.112854, + "step": 7917, + "time_per_iteration": 2.660997152328491 + }, + { + "auxiliary_loss_clip": 0.06322742, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06253887, + "balance_loss_mlp": 0.01264125, + "epoch": 0.4760559146249812, + "flos": 64573388582400.0, + "grad_norm": 0.81764582989578, + "language_loss": 0.65507567, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.73097479, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03042603, + "step": 7918, + "time_per_iteration": 3.185194492340088 + }, + { + "auxiliary_loss_clip": 0.06435688, + "auxiliary_loss_mlp": 0.01270072, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01257781, + "epoch": 0.47611603787764917, + "flos": 22239943966080.0, + "grad_norm": 1.5442115166230013, + "language_loss": 0.69113988, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.76819742, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12286377, + "step": 7919, + "time_per_iteration": 2.5625159740448 + }, + { + "auxiliary_loss_clip": 0.06440815, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06280257, + "balance_loss_mlp": 0.01254966, + "epoch": 0.47617616113031713, + "flos": 22785971097600.0, + "grad_norm": 1.4153562055419862, + "language_loss": 0.75135148, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.82842833, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.11914062, + "step": 7920, + "time_per_iteration": 2.606783866882324 + }, + { + "auxiliary_loss_clip": 0.06442747, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280643, + "balance_loss_mlp": 0.01255391, + "epoch": 0.4762362843829851, + "flos": 24140633656320.0, + "grad_norm": 1.5595930907743143, + "language_loss": 0.77291155, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.85002303, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13000488, + "step": 7921, + "time_per_iteration": 4.0331573486328125 + }, + { + "auxiliary_loss_clip": 0.06441253, + "auxiliary_loss_mlp": 0.01270198, + "balance_loss_clip": 0.06281719, + "balance_loss_mlp": 0.01256859, + "epoch": 0.47629640763565306, + "flos": 22458052944000.0, + "grad_norm": 1.5318798569312555, + "language_loss": 0.78402638, + "learning_rate": 2.249963220399845e-06, + "loss": 0.86114085, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13342285, + "step": 7922, + "time_per_iteration": 2.615656614303589 + }, + { + "auxiliary_loss_clip": 0.06443102, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06280392, + "balance_loss_mlp": 0.01253426, + "epoch": 0.4763565308883211, + "flos": 11186071090560.0, + "grad_norm": 1.9566034639967664, + "language_loss": 0.72915596, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.80625618, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1350708, + "step": 7923, + "time_per_iteration": 2.495023727416992 + }, + { + "auxiliary_loss_clip": 0.06440397, + "auxiliary_loss_mlp": 0.01267365, + "balance_loss_clip": 0.06280472, + "balance_loss_mlp": 0.01255772, + "epoch": 0.47641665414098905, + "flos": 22388634236160.0, + "grad_norm": 2.175648520453788, + "language_loss": 0.82023257, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.8973102, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.11590576, + "step": 7924, + "time_per_iteration": 2.5592448711395264 + }, + { + "auxiliary_loss_clip": 0.06449094, + "auxiliary_loss_mlp": 0.01271258, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01257191, + "epoch": 0.476476777393657, + "flos": 25053166546560.0, + "grad_norm": 1.6497722763363074, + "language_loss": 0.80566549, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.88286906, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14074707, + "step": 7925, + "time_per_iteration": 2.5462217330932617 + }, + { + "auxiliary_loss_clip": 0.06433398, + "auxiliary_loss_mlp": 0.01273204, + "balance_loss_clip": 0.06273591, + "balance_loss_mlp": 0.01259984, + "epoch": 0.476536900646325, + "flos": 27276994707840.0, + "grad_norm": 1.5163925310357687, + "language_loss": 0.72183931, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.79890537, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13214111, + "step": 7926, + "time_per_iteration": 4.022697448730469 + }, + { + "auxiliary_loss_clip": 0.06443252, + "auxiliary_loss_mlp": 0.01270757, + "balance_loss_clip": 0.062805, + "balance_loss_mlp": 0.01257304, + "epoch": 0.47659702389899294, + "flos": 25308437610240.0, + "grad_norm": 2.540030120332383, + "language_loss": 0.69248974, + "learning_rate": 2.248031062546432e-06, + "loss": 0.76962984, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13452148, + "step": 7927, + "time_per_iteration": 2.651005744934082 + }, + { + "auxiliary_loss_clip": 0.06432809, + "auxiliary_loss_mlp": 0.01274998, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01262928, + "epoch": 0.4766571471516609, + "flos": 25999716994560.0, + "grad_norm": 1.8555909912878064, + "language_loss": 0.68153882, + "learning_rate": 2.247644602701045e-06, + "loss": 0.75861686, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12072754, + "step": 7928, + "time_per_iteration": 2.6001169681549072 + }, + { + "auxiliary_loss_clip": 0.06439018, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277569, + "balance_loss_mlp": 0.01254497, + "epoch": 0.4767172704043289, + "flos": 16037395257600.0, + "grad_norm": 2.030081429010121, + "language_loss": 0.79402888, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.87108904, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12506104, + "step": 7929, + "time_per_iteration": 2.4979782104492188 + }, + { + "auxiliary_loss_clip": 0.06434054, + "auxiliary_loss_mlp": 0.0127525, + "balance_loss_clip": 0.06276359, + "balance_loss_mlp": 0.01263496, + "epoch": 0.47677739365699684, + "flos": 39244113233280.0, + "grad_norm": 1.8073767988538123, + "language_loss": 0.67109072, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.74818379, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11749268, + "step": 7930, + "time_per_iteration": 2.64865779876709 + }, + { + "auxiliary_loss_clip": 0.06440657, + "auxiliary_loss_mlp": 0.01272697, + "balance_loss_clip": 0.06280986, + "balance_loss_mlp": 0.01260484, + "epoch": 0.4768375169096648, + "flos": 24724745268480.0, + "grad_norm": 1.7506463735046407, + "language_loss": 0.79864836, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.87578189, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12207031, + "step": 7931, + "time_per_iteration": 2.5824391841888428 + }, + { + "auxiliary_loss_clip": 0.06435428, + "auxiliary_loss_mlp": 0.01273232, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.01260203, + "epoch": 0.47689764016233277, + "flos": 22535270081280.0, + "grad_norm": 2.3707401208689753, + "language_loss": 0.76826382, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13043213, + "step": 7932, + "time_per_iteration": 2.510439157485962 + }, + { + "auxiliary_loss_clip": 0.06434679, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06279778, + "balance_loss_mlp": 0.01266101, + "epoch": 0.47695776341500074, + "flos": 15125742835200.0, + "grad_norm": 3.7494408598150946, + "language_loss": 0.79909194, + "learning_rate": 2.245712162906593e-06, + "loss": 0.87623, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.13012695, + "step": 7933, + "time_per_iteration": 2.5868406295776367 + }, + { + "auxiliary_loss_clip": 0.06440616, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06276172, + "balance_loss_mlp": 0.01256889, + "epoch": 0.4770178866676687, + "flos": 14683319677440.0, + "grad_norm": 1.845903856635024, + "language_loss": 0.74363738, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.8207491, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13677979, + "step": 7934, + "time_per_iteration": 2.467625141143799 + }, + { + "auxiliary_loss_clip": 0.06439498, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278646, + "balance_loss_mlp": 0.01256213, + "epoch": 0.47707800992033667, + "flos": 22572264458880.0, + "grad_norm": 2.1751877197221847, + "language_loss": 0.80426806, + "learning_rate": 2.244939121664211e-06, + "loss": 0.88135481, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7935, + "time_per_iteration": 2.57150936126709 + }, + { + "auxiliary_loss_clip": 0.06443004, + "auxiliary_loss_mlp": 0.01271494, + "balance_loss_clip": 0.06275547, + "balance_loss_mlp": 0.01257249, + "epoch": 0.4771381331730047, + "flos": 30925868457600.0, + "grad_norm": 1.696374515888555, + "language_loss": 0.71442336, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7915684, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14245605, + "step": 7936, + "time_per_iteration": 2.577134609222412 + }, + { + "auxiliary_loss_clip": 0.06440726, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06278887, + "balance_loss_mlp": 0.01254593, + "epoch": 0.47719825642567265, + "flos": 25745955304320.0, + "grad_norm": 1.9394747057802306, + "language_loss": 0.68651855, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.76359951, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.12774658, + "step": 7937, + "time_per_iteration": 2.5523571968078613 + }, + { + "auxiliary_loss_clip": 0.06332788, + "auxiliary_loss_mlp": 0.01255518, + "balance_loss_clip": 0.06264147, + "balance_loss_mlp": 0.01252959, + "epoch": 0.4772583796783406, + "flos": 66376344084480.0, + "grad_norm": 0.7063710164794027, + "language_loss": 0.56256598, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.63844901, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02558899, + "step": 7938, + "time_per_iteration": 3.3101401329040527 + }, + { + "auxiliary_loss_clip": 0.06435397, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01252927, + "epoch": 0.4773185029310086, + "flos": 22057068430080.0, + "grad_norm": 1.5498541545702798, + "language_loss": 0.89232612, + "learning_rate": 2.243392927839317e-06, + "loss": 0.96934634, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.13702393, + "step": 7939, + "time_per_iteration": 2.559797525405884 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06277393, + "balance_loss_mlp": 0.01256239, + "epoch": 0.47737862618367655, + "flos": 16733496251520.0, + "grad_norm": 2.4258721196632456, + "language_loss": 0.77298427, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.85001838, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12249756, + "step": 7940, + "time_per_iteration": 2.5268869400024414 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01269812, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01257373, + "epoch": 0.4774387494363445, + "flos": 19615508634240.0, + "grad_norm": 1.6559533080399789, + "language_loss": 0.85386801, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.930875, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12432861, + "step": 7941, + "time_per_iteration": 2.547070264816284 + }, + { + "auxiliary_loss_clip": 0.06437483, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06277451, + "balance_loss_mlp": 0.01257965, + "epoch": 0.4774988726890125, + "flos": 16659507496320.0, + "grad_norm": 1.9070361015512296, + "language_loss": 0.76308775, + "learning_rate": 2.24223318550976e-06, + "loss": 0.84016657, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12420654, + "step": 7942, + "time_per_iteration": 2.4842329025268555 + }, + { + "auxiliary_loss_clip": 0.06440963, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06282113, + "balance_loss_mlp": 0.01253601, + "epoch": 0.47755899594168044, + "flos": 20491843760640.0, + "grad_norm": 1.6294214929971118, + "language_loss": 0.64313745, + "learning_rate": 2.241846586342682e-06, + "loss": 0.72020721, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12402344, + "step": 7943, + "time_per_iteration": 2.5384066104888916 + }, + { + "auxiliary_loss_clip": 0.06444484, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06280033, + "balance_loss_mlp": 0.01253493, + "epoch": 0.4776191191943484, + "flos": 21659228444160.0, + "grad_norm": 1.6943023581153507, + "language_loss": 0.73866045, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.8157779, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13781738, + "step": 7944, + "time_per_iteration": 2.5201148986816406 + }, + { + "auxiliary_loss_clip": 0.06447009, + "auxiliary_loss_mlp": 0.01271608, + "balance_loss_clip": 0.06287117, + "balance_loss_mlp": 0.01258459, + "epoch": 0.4776792424470164, + "flos": 18776125958400.0, + "grad_norm": 2.2429214657199257, + "language_loss": 0.68437827, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.76156443, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13153076, + "step": 7945, + "time_per_iteration": 2.5126469135284424 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_clip": 0.06280819, + "balance_loss_mlp": 0.01257577, + "epoch": 0.47773936569968434, + "flos": 29723543821440.0, + "grad_norm": 1.8191434389659598, + "language_loss": 0.75203103, + "learning_rate": 2.240686733875009e-06, + "loss": 0.8291347, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12628174, + "step": 7946, + "time_per_iteration": 2.5952818393707275 + }, + { + "auxiliary_loss_clip": 0.06450987, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06288904, + "balance_loss_mlp": 0.0125368, + "epoch": 0.4777994889523523, + "flos": 24798650169600.0, + "grad_norm": 2.1264871549136566, + "language_loss": 0.79598629, + "learning_rate": 2.240300098112506e-06, + "loss": 0.87316352, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.13043213, + "step": 7947, + "time_per_iteration": 2.561429023742676 + }, + { + "auxiliary_loss_clip": 0.06437, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06282562, + "balance_loss_mlp": 0.01255302, + "epoch": 0.47785961220502027, + "flos": 17863928484480.0, + "grad_norm": 1.6733844414372485, + "language_loss": 0.73571151, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.81276667, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13220215, + "step": 7948, + "time_per_iteration": 2.5309975147247314 + }, + { + "auxiliary_loss_clip": 0.06442553, + "auxiliary_loss_mlp": 0.01267736, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01253848, + "epoch": 0.4779197354576883, + "flos": 20272770460800.0, + "grad_norm": 2.2305312131568256, + "language_loss": 0.78282905, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.85993195, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13891602, + "step": 7949, + "time_per_iteration": 2.5135691165924072 + }, + { + "auxiliary_loss_clip": 0.06441014, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06285359, + "balance_loss_mlp": 0.01251849, + "epoch": 0.47797985871035625, + "flos": 17062420654080.0, + "grad_norm": 2.4211239692864686, + "language_loss": 0.75134766, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.82839787, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12164307, + "step": 7950, + "time_per_iteration": 2.5256588459014893 + }, + { + "auxiliary_loss_clip": 0.06439517, + "auxiliary_loss_mlp": 0.01271424, + "balance_loss_clip": 0.0628176, + "balance_loss_mlp": 0.01258668, + "epoch": 0.4780399819630242, + "flos": 31366530679680.0, + "grad_norm": 1.6557560470716002, + "language_loss": 0.744519, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.82162845, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12756348, + "step": 7951, + "time_per_iteration": 2.6257662773132324 + }, + { + "auxiliary_loss_clip": 0.0644564, + "auxiliary_loss_mlp": 0.0126871, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01255925, + "epoch": 0.4781001052156922, + "flos": 24906488774400.0, + "grad_norm": 2.0941094174335, + "language_loss": 0.80880862, + "learning_rate": 2.238366782910174e-06, + "loss": 0.88595212, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12786865, + "step": 7952, + "time_per_iteration": 2.6039650440216064 + }, + { + "auxiliary_loss_clip": 0.06449462, + "auxiliary_loss_mlp": 0.01273751, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01259684, + "epoch": 0.47816022846836015, + "flos": 18703688503680.0, + "grad_norm": 1.7383850677064194, + "language_loss": 0.78965735, + "learning_rate": 2.23798009269438e-06, + "loss": 0.86688948, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14068604, + "step": 7953, + "time_per_iteration": 3.9394986629486084 + }, + { + "auxiliary_loss_clip": 0.0644647, + "auxiliary_loss_mlp": 0.0126971, + "balance_loss_clip": 0.0628321, + "balance_loss_mlp": 0.01256793, + "epoch": 0.4782203517210281, + "flos": 11981289864960.0, + "grad_norm": 2.1105030234958733, + "language_loss": 0.84721971, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.92438149, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.12921143, + "step": 7954, + "time_per_iteration": 3.9196231365203857 + }, + { + "auxiliary_loss_clip": 0.06440185, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06283759, + "balance_loss_mlp": 0.01255282, + "epoch": 0.4782804749736961, + "flos": 20819761914240.0, + "grad_norm": 1.4881886911999394, + "language_loss": 0.70481235, + "learning_rate": 2.237206685204768e-06, + "loss": 0.78189409, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1270752, + "step": 7955, + "time_per_iteration": 2.5434484481811523 + }, + { + "auxiliary_loss_clip": 0.064454, + "auxiliary_loss_mlp": 0.01270242, + "balance_loss_clip": 0.06284527, + "balance_loss_mlp": 0.01257326, + "epoch": 0.47834059822636404, + "flos": 23846816914560.0, + "grad_norm": 1.553979149808007, + "language_loss": 0.823044, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.90020043, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12902832, + "step": 7956, + "time_per_iteration": 2.545602560043335 + }, + { + "auxiliary_loss_clip": 0.06441168, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06284995, + "balance_loss_mlp": 0.01255627, + "epoch": 0.478400721479032, + "flos": 22639670668800.0, + "grad_norm": 1.9591153371347299, + "language_loss": 0.85127819, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.92838925, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.14300537, + "step": 7957, + "time_per_iteration": 2.548643112182617 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01269143, + "balance_loss_clip": 0.06285611, + "balance_loss_mlp": 0.01257001, + "epoch": 0.4784608447317, + "flos": 19361118038400.0, + "grad_norm": 7.050300940807432, + "language_loss": 0.79869133, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.87579882, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12139893, + "step": 7958, + "time_per_iteration": 2.5078237056732178 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06283723, + "balance_loss_mlp": 0.0125534, + "epoch": 0.47852096798436794, + "flos": 24027386463360.0, + "grad_norm": 1.6951891176109464, + "language_loss": 0.82802176, + "learning_rate": 2.235659762404047e-06, + "loss": 0.90512896, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1361084, + "step": 7959, + "time_per_iteration": 2.565302610397339 + }, + { + "auxiliary_loss_clip": 0.06438372, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06285324, + "balance_loss_mlp": 0.01255615, + "epoch": 0.4785810912370359, + "flos": 25673559776640.0, + "grad_norm": 2.330976037710063, + "language_loss": 0.73464501, + "learning_rate": 2.235273009326599e-06, + "loss": 0.81169969, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1149292, + "step": 7960, + "time_per_iteration": 4.027269124984741 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01270036, + "balance_loss_clip": 0.0628148, + "balance_loss_mlp": 0.01258014, + "epoch": 0.47864121448970387, + "flos": 21438226500480.0, + "grad_norm": 3.172971837567245, + "language_loss": 0.77372915, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.85079503, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12036133, + "step": 7961, + "time_per_iteration": 2.5147969722747803 + }, + { + "auxiliary_loss_clip": 0.06435739, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01255269, + "epoch": 0.47870133774237184, + "flos": 16149468493440.0, + "grad_norm": 1.5337652867811775, + "language_loss": 0.78017688, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.85721302, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12597656, + "step": 7962, + "time_per_iteration": 2.513148307800293 + }, + { + "auxiliary_loss_clip": 0.06441396, + "auxiliary_loss_mlp": 0.01270097, + "balance_loss_clip": 0.06281849, + "balance_loss_mlp": 0.01257646, + "epoch": 0.47876146099503986, + "flos": 26914094674560.0, + "grad_norm": 1.8277818369463197, + "language_loss": 0.65211046, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.7292254, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12457275, + "step": 7963, + "time_per_iteration": 2.601811647415161 + }, + { + "auxiliary_loss_clip": 0.06439337, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.06280507, + "balance_loss_mlp": 0.01253621, + "epoch": 0.4788215842477078, + "flos": 45342470989440.0, + "grad_norm": 2.309935013710649, + "language_loss": 0.77810884, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.85516727, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12890625, + "step": 7964, + "time_per_iteration": 2.747879981994629 + }, + { + "auxiliary_loss_clip": 0.06446981, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06283239, + "balance_loss_mlp": 0.01257218, + "epoch": 0.4788817075003758, + "flos": 22243801253760.0, + "grad_norm": 1.6568781202078557, + "language_loss": 0.76541996, + "learning_rate": 2.233339110409044e-06, + "loss": 0.84260774, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14587402, + "step": 7965, + "time_per_iteration": 2.562894344329834 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06281182, + "balance_loss_mlp": 0.01256434, + "epoch": 0.47894183075304375, + "flos": 16476631960320.0, + "grad_norm": 1.6972134667517975, + "language_loss": 0.74819887, + "learning_rate": 2.232952304022137e-06, + "loss": 0.82530153, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12237549, + "step": 7966, + "time_per_iteration": 4.023793697357178 + }, + { + "auxiliary_loss_clip": 0.06437664, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06279117, + "balance_loss_mlp": 0.01253033, + "epoch": 0.4790019540057117, + "flos": 24290036686080.0, + "grad_norm": 1.5237416858661557, + "language_loss": 0.73335361, + "learning_rate": 2.232565488801655e-06, + "loss": 0.81038582, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12518311, + "step": 7967, + "time_per_iteration": 2.586228847503662 + }, + { + "auxiliary_loss_clip": 0.06429637, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.01254825, + "epoch": 0.4790620772583797, + "flos": 25673601703680.0, + "grad_norm": 2.2388113154567058, + "language_loss": 0.79254079, + "learning_rate": 2.232178664762267e-06, + "loss": 0.86951417, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12896729, + "step": 7968, + "time_per_iteration": 2.569835901260376 + }, + { + "auxiliary_loss_clip": 0.06330545, + "auxiliary_loss_mlp": 0.01255481, + "balance_loss_clip": 0.06260878, + "balance_loss_mlp": 0.01252947, + "epoch": 0.47912220051104765, + "flos": 69451168711680.0, + "grad_norm": 0.7701358383106056, + "language_loss": 0.62163401, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.69749427, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02534485, + "step": 7969, + "time_per_iteration": 3.2898826599121094 + }, + { + "auxiliary_loss_clip": 0.06435778, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06281342, + "balance_loss_mlp": 0.012529, + "epoch": 0.4791823237637156, + "flos": 24175531681920.0, + "grad_norm": 1.7909857243287752, + "language_loss": 0.77847564, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.85549259, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13006592, + "step": 7970, + "time_per_iteration": 2.5170607566833496 + }, + { + "auxiliary_loss_clip": 0.06435491, + "auxiliary_loss_mlp": 0.01267513, + "balance_loss_clip": 0.06276551, + "balance_loss_mlp": 0.0125384, + "epoch": 0.4792424470163836, + "flos": 24757966212480.0, + "grad_norm": 1.6160167990193877, + "language_loss": 0.71182537, + "learning_rate": 2.231018139877349e-06, + "loss": 0.78885543, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13665771, + "step": 7971, + "time_per_iteration": 2.572124719619751 + }, + { + "auxiliary_loss_clip": 0.06436221, + "auxiliary_loss_mlp": 0.01271919, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01258836, + "epoch": 0.47930257026905154, + "flos": 23264550092160.0, + "grad_norm": 1.2950674857674533, + "language_loss": 0.80144143, + "learning_rate": 2.230631280709021e-06, + "loss": 0.87852287, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1307373, + "step": 7972, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06281324, + "balance_loss_mlp": 0.01256392, + "epoch": 0.4793626935217195, + "flos": 14069299357440.0, + "grad_norm": 2.062531710859889, + "language_loss": 0.70572007, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7828514, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13934326, + "step": 7973, + "time_per_iteration": 2.5338237285614014 + }, + { + "auxiliary_loss_clip": 0.064371, + "auxiliary_loss_mlp": 0.01270261, + "balance_loss_clip": 0.06283109, + "balance_loss_mlp": 0.0125806, + "epoch": 0.4794228167743875, + "flos": 21805319237760.0, + "grad_norm": 1.7273933233655367, + "language_loss": 0.79198468, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.86905837, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12200928, + "step": 7974, + "time_per_iteration": 2.5069854259490967 + }, + { + "auxiliary_loss_clip": 0.06339005, + "auxiliary_loss_mlp": 0.01258702, + "balance_loss_clip": 0.06269643, + "balance_loss_mlp": 0.01255866, + "epoch": 0.47948294002705544, + "flos": 66989022739200.0, + "grad_norm": 0.7443790840370731, + "language_loss": 0.53920376, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.61518085, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02832031, + "step": 7975, + "time_per_iteration": 3.2263216972351074 + }, + { + "auxiliary_loss_clip": 0.06450166, + "auxiliary_loss_mlp": 0.01269981, + "balance_loss_clip": 0.06283702, + "balance_loss_mlp": 0.0125465, + "epoch": 0.47954306327972346, + "flos": 12427444529280.0, + "grad_norm": 1.9824704830592612, + "language_loss": 0.90397954, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.98118103, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.15313721, + "step": 7976, + "time_per_iteration": 2.5806965827941895 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06284519, + "balance_loss_mlp": 0.01257629, + "epoch": 0.4796031865323914, + "flos": 18366630255360.0, + "grad_norm": 3.7288296944586166, + "language_loss": 0.73905623, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.81627262, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.15209961, + "step": 7977, + "time_per_iteration": 2.5562849044799805 + }, + { + "auxiliary_loss_clip": 0.06437217, + "auxiliary_loss_mlp": 0.01268705, + "balance_loss_clip": 0.06283021, + "balance_loss_mlp": 0.01255741, + "epoch": 0.4796633097850594, + "flos": 21841517001600.0, + "grad_norm": 1.607227573724713, + "language_loss": 0.78873986, + "learning_rate": 2.228309942555734e-06, + "loss": 0.86579907, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12976074, + "step": 7978, + "time_per_iteration": 2.558842420578003 + }, + { + "auxiliary_loss_clip": 0.06440634, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01255214, + "epoch": 0.47972343303772735, + "flos": 23443526413440.0, + "grad_norm": 1.9276236664860738, + "language_loss": 0.89800453, + "learning_rate": 2.22792302247656e-06, + "loss": 0.97510386, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.14099121, + "step": 7979, + "time_per_iteration": 2.5952987670898438 + }, + { + "auxiliary_loss_clip": 0.06446249, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06283665, + "balance_loss_mlp": 0.01256378, + "epoch": 0.4797835562903953, + "flos": 24906698409600.0, + "grad_norm": 1.4562164603157606, + "language_loss": 0.7704469, + "learning_rate": 2.227536093754523e-06, + "loss": 0.8476193, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14605713, + "step": 7980, + "time_per_iteration": 2.5736522674560547 + }, + { + "auxiliary_loss_clip": 0.06447264, + "auxiliary_loss_mlp": 0.01273404, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01258938, + "epoch": 0.4798436795430633, + "flos": 35051644120320.0, + "grad_norm": 1.875578547391537, + "language_loss": 0.71508431, + "learning_rate": 2.227149156404295e-06, + "loss": 0.79229099, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14459229, + "step": 7981, + "time_per_iteration": 2.6367290019989014 + }, + { + "auxiliary_loss_clip": 0.06439552, + "auxiliary_loss_mlp": 0.01273941, + "balance_loss_clip": 0.06281938, + "balance_loss_mlp": 0.01258998, + "epoch": 0.47990380279573125, + "flos": 20595699296640.0, + "grad_norm": 1.7763359166784585, + "language_loss": 0.70155972, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.77869463, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.14935303, + "step": 7982, + "time_per_iteration": 2.5258874893188477 + }, + { + "auxiliary_loss_clip": 0.06432236, + "auxiliary_loss_mlp": 0.0126906, + "balance_loss_clip": 0.06278554, + "balance_loss_mlp": 0.01257079, + "epoch": 0.4799639260483992, + "flos": 26366600096640.0, + "grad_norm": 1.7437778110304778, + "language_loss": 0.71608925, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.79310226, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11975098, + "step": 7983, + "time_per_iteration": 2.568826913833618 + }, + { + "auxiliary_loss_clip": 0.06340544, + "auxiliary_loss_mlp": 0.01252804, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01249972, + "epoch": 0.4800240493010672, + "flos": 70999371002880.0, + "grad_norm": 0.765879442061108, + "language_loss": 0.59357727, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.66951072, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.02828979, + "step": 7984, + "time_per_iteration": 3.1084651947021484 + }, + { + "auxiliary_loss_clip": 0.0643955, + "auxiliary_loss_mlp": 0.01275134, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01262152, + "epoch": 0.48008417255373514, + "flos": 17091406967040.0, + "grad_norm": 1.5773823669430012, + "language_loss": 0.67127079, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.74841756, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12988281, + "step": 7985, + "time_per_iteration": 2.4906041622161865 + }, + { + "auxiliary_loss_clip": 0.06439713, + "auxiliary_loss_mlp": 0.01270507, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256458, + "epoch": 0.4801442958064031, + "flos": 15418762963200.0, + "grad_norm": 1.6902399231491212, + "language_loss": 0.70749509, + "learning_rate": 2.225214340743835e-06, + "loss": 0.78459728, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14056396, + "step": 7986, + "time_per_iteration": 2.52093243598938 + }, + { + "auxiliary_loss_clip": 0.06445119, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06282695, + "balance_loss_mlp": 0.0125972, + "epoch": 0.4802044190590711, + "flos": 11478546167040.0, + "grad_norm": 1.9459651571320913, + "language_loss": 0.79178715, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.86897534, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13983154, + "step": 7987, + "time_per_iteration": 2.498640537261963 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01274239, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01260029, + "epoch": 0.48026454231173904, + "flos": 20955874072320.0, + "grad_norm": 2.568897435463935, + "language_loss": 0.75366008, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.83082712, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.14215088, + "step": 7988, + "time_per_iteration": 2.516512632369995 + }, + { + "auxiliary_loss_clip": 0.0644449, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06284034, + "balance_loss_mlp": 0.01254651, + "epoch": 0.48032466556440706, + "flos": 20454220477440.0, + "grad_norm": 2.121657383550553, + "language_loss": 0.79781222, + "learning_rate": 2.224053348748365e-06, + "loss": 0.87493527, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13140869, + "step": 7989, + "time_per_iteration": 2.5021252632141113 + }, + { + "auxiliary_loss_clip": 0.06450642, + "auxiliary_loss_mlp": 0.01272628, + "balance_loss_clip": 0.0628516, + "balance_loss_mlp": 0.01259277, + "epoch": 0.480384788817075, + "flos": 37129507269120.0, + "grad_norm": 1.6027553338262992, + "language_loss": 0.73628318, + "learning_rate": 2.223666334404724e-06, + "loss": 0.81351584, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13360596, + "step": 7990, + "time_per_iteration": 2.678316593170166 + }, + { + "auxiliary_loss_clip": 0.06340674, + "auxiliary_loss_mlp": 0.01254539, + "balance_loss_clip": 0.06272323, + "balance_loss_mlp": 0.01252124, + "epoch": 0.480444912069743, + "flos": 69572103281280.0, + "grad_norm": 0.7463246314152452, + "language_loss": 0.59028065, + "learning_rate": 2.223279311579633e-06, + "loss": 0.66623276, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02412415, + "step": 7991, + "time_per_iteration": 3.2123708724975586 + }, + { + "auxiliary_loss_clip": 0.06440669, + "auxiliary_loss_mlp": 0.0127166, + "balance_loss_clip": 0.06280738, + "balance_loss_mlp": 0.01258493, + "epoch": 0.48050503532241096, + "flos": 29829453782400.0, + "grad_norm": 1.8077991766436714, + "language_loss": 0.67425305, + "learning_rate": 2.222892280287768e-06, + "loss": 0.75137639, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1317749, + "step": 7992, + "time_per_iteration": 4.022457599639893 + }, + { + "auxiliary_loss_clip": 0.06441684, + "auxiliary_loss_mlp": 0.01270903, + "balance_loss_clip": 0.06280079, + "balance_loss_mlp": 0.01257289, + "epoch": 0.4805651585750789, + "flos": 23954865154560.0, + "grad_norm": 1.520335815005364, + "language_loss": 0.76567221, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.84279805, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13616943, + "step": 7993, + "time_per_iteration": 2.5975513458251953 + }, + { + "auxiliary_loss_clip": 0.0643717, + "auxiliary_loss_mlp": 0.012705, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01257101, + "epoch": 0.4806252818277469, + "flos": 25672385819520.0, + "grad_norm": 1.5304271246014225, + "language_loss": 0.78575444, + "learning_rate": 2.222118192362422e-06, + "loss": 0.86283118, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1340332, + "step": 7994, + "time_per_iteration": 3.9770989418029785 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01268981, + "balance_loss_clip": 0.06282856, + "balance_loss_mlp": 0.01255284, + "epoch": 0.48068540508041485, + "flos": 13157059956480.0, + "grad_norm": 1.7612496141579397, + "language_loss": 0.80023497, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.87733817, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13702393, + "step": 7995, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.06436922, + "auxiliary_loss_mlp": 0.01271915, + "balance_loss_clip": 0.06281693, + "balance_loss_mlp": 0.01259499, + "epoch": 0.4807455283330828, + "flos": 21182787728640.0, + "grad_norm": 1.7014068364920145, + "language_loss": 0.82857656, + "learning_rate": 2.2213440707461e-06, + "loss": 0.90566498, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12408447, + "step": 7996, + "time_per_iteration": 2.5223636627197266 + }, + { + "auxiliary_loss_clip": 0.06437848, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.0126104, + "epoch": 0.4808056515857508, + "flos": 12280850611200.0, + "grad_norm": 2.0553444119055095, + "language_loss": 0.81048906, + "learning_rate": 2.220956997340516e-06, + "loss": 0.88760751, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12957764, + "step": 7997, + "time_per_iteration": 2.5387723445892334 + }, + { + "auxiliary_loss_clip": 0.06439243, + "auxiliary_loss_mlp": 0.01272881, + "balance_loss_clip": 0.06278609, + "balance_loss_mlp": 0.01258886, + "epoch": 0.48086577483841875, + "flos": 24832835435520.0, + "grad_norm": 1.673774189345091, + "language_loss": 0.72584945, + "learning_rate": 2.220569915556221e-06, + "loss": 0.80297071, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.13989258, + "step": 7998, + "time_per_iteration": 2.5332131385803223 + }, + { + "auxiliary_loss_clip": 0.06438513, + "auxiliary_loss_mlp": 0.0127211, + "balance_loss_clip": 0.06282588, + "balance_loss_mlp": 0.01258931, + "epoch": 0.4809258980910867, + "flos": 24472786440960.0, + "grad_norm": 1.7584112558628078, + "language_loss": 0.71207035, + "learning_rate": 2.220182825407892e-06, + "loss": 0.78917658, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1317749, + "step": 7999, + "time_per_iteration": 2.5675172805786133 + }, + { + "auxiliary_loss_clip": 0.06447413, + "auxiliary_loss_mlp": 0.01268559, + "balance_loss_clip": 0.06285158, + "balance_loss_mlp": 0.01254581, + "epoch": 0.4809860213437547, + "flos": 21222465436800.0, + "grad_norm": 1.5803850534596136, + "language_loss": 0.71622467, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.79338437, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13983154, + "step": 8000, + "time_per_iteration": 4.0574305057525635 + }, + { + "auxiliary_loss_clip": 0.06440975, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 0.06282955, + "balance_loss_mlp": 0.01253558, + "epoch": 0.48104614459642264, + "flos": 37640929864320.0, + "grad_norm": 1.3783876991224597, + "language_loss": 0.75060636, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.82768357, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.13189697, + "step": 8001, + "time_per_iteration": 2.6750619411468506 + }, + { + "auxiliary_loss_clip": 0.06444116, + "auxiliary_loss_mlp": 0.01269598, + "balance_loss_clip": 0.06285578, + "balance_loss_mlp": 0.0125667, + "epoch": 0.48110626784909066, + "flos": 18412093895040.0, + "grad_norm": 3.3850625220280066, + "language_loss": 0.81721932, + "learning_rate": 2.219021504925493e-06, + "loss": 0.89435649, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12921143, + "step": 8002, + "time_per_iteration": 2.537611961364746 + }, + { + "auxiliary_loss_clip": 0.06444092, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06282309, + "balance_loss_mlp": 0.0125232, + "epoch": 0.48116639110175863, + "flos": 28447481992320.0, + "grad_norm": 1.6717054522334394, + "language_loss": 0.71586967, + "learning_rate": 2.218634381467819e-06, + "loss": 0.79297119, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13739014, + "step": 8003, + "time_per_iteration": 2.586836576461792 + }, + { + "auxiliary_loss_clip": 0.06435338, + "auxiliary_loss_mlp": 0.01268946, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.01256375, + "epoch": 0.4812265143544266, + "flos": 21731582044800.0, + "grad_norm": 1.5740971137450945, + "language_loss": 0.82286322, + "learning_rate": 2.218247249719507e-06, + "loss": 0.89990604, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12561035, + "step": 8004, + "time_per_iteration": 2.5606155395507812 + }, + { + "auxiliary_loss_clip": 0.06454347, + "auxiliary_loss_mlp": 0.01272857, + "balance_loss_clip": 0.06285338, + "balance_loss_mlp": 0.01258004, + "epoch": 0.48128663760709456, + "flos": 13229707046400.0, + "grad_norm": 2.0390359670143465, + "language_loss": 0.77871376, + "learning_rate": 2.217860109695239e-06, + "loss": 0.85598582, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14837646, + "step": 8005, + "time_per_iteration": 2.47816801071167 + }, + { + "auxiliary_loss_clip": 0.06444031, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01252902, + "epoch": 0.4813467608597625, + "flos": 24250317050880.0, + "grad_norm": 8.997763816911675, + "language_loss": 0.71145892, + "learning_rate": 2.217472961409692e-06, + "loss": 0.78855699, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12866211, + "step": 8006, + "time_per_iteration": 3.998465061187744 + }, + { + "auxiliary_loss_clip": 0.06443979, + "auxiliary_loss_mlp": 0.0126724, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01253502, + "epoch": 0.4814068841124305, + "flos": 27486131299200.0, + "grad_norm": 1.774717747938, + "language_loss": 0.7057631, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.78287524, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13726807, + "step": 8007, + "time_per_iteration": 2.6010959148406982 + }, + { + "auxiliary_loss_clip": 0.06445048, + "auxiliary_loss_mlp": 0.01270091, + "balance_loss_clip": 0.06283326, + "balance_loss_mlp": 0.01256382, + "epoch": 0.48146700736509845, + "flos": 19578933527040.0, + "grad_norm": 1.7543289086675633, + "language_loss": 0.72215438, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.79930574, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.137146, + "step": 8008, + "time_per_iteration": 2.5119597911834717 + }, + { + "auxiliary_loss_clip": 0.064485, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06287649, + "balance_loss_mlp": 0.01256699, + "epoch": 0.4815271306177664, + "flos": 20633448360960.0, + "grad_norm": 2.3493781090087427, + "language_loss": 0.61680824, + "learning_rate": 2.216311467132199e-06, + "loss": 0.6939944, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13421631, + "step": 8009, + "time_per_iteration": 2.531614303588867 + }, + { + "auxiliary_loss_clip": 0.06337314, + "auxiliary_loss_mlp": 0.01256915, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01254566, + "epoch": 0.4815872538704344, + "flos": 67710168904320.0, + "grad_norm": 0.8824544242806498, + "language_loss": 0.61164761, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.68758988, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.0234375, + "step": 8010, + "time_per_iteration": 3.1565909385681152 + }, + { + "auxiliary_loss_clip": 0.06445675, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01256364, + "epoch": 0.48164737712310235, + "flos": 22827451668480.0, + "grad_norm": 1.6746394307020662, + "language_loss": 0.73637664, + "learning_rate": 2.215537096576639e-06, + "loss": 0.81353462, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1373291, + "step": 8011, + "time_per_iteration": 2.6046555042266846 + }, + { + "auxiliary_loss_clip": 0.0643819, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01257887, + "epoch": 0.4817075003757703, + "flos": 23740865026560.0, + "grad_norm": 1.8215201759984196, + "language_loss": 0.79494172, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.87202752, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.125, + "step": 8012, + "time_per_iteration": 2.5538861751556396 + }, + { + "auxiliary_loss_clip": 0.06444636, + "auxiliary_loss_mlp": 0.0127321, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01259501, + "epoch": 0.4817676236284383, + "flos": 28190282284800.0, + "grad_norm": 1.6047815948624113, + "language_loss": 0.73606604, + "learning_rate": 2.214762693328326e-06, + "loss": 0.81324452, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1373291, + "step": 8013, + "time_per_iteration": 2.6944220066070557 + }, + { + "auxiliary_loss_clip": 0.06441531, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06285915, + "balance_loss_mlp": 0.01253094, + "epoch": 0.48182774688110624, + "flos": 17097360606720.0, + "grad_norm": 1.8755216355849496, + "language_loss": 0.91141838, + "learning_rate": 2.214375479481094e-06, + "loss": 0.98848319, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.11859131, + "step": 8014, + "time_per_iteration": 2.501678466796875 + }, + { + "auxiliary_loss_clip": 0.06448989, + "auxiliary_loss_mlp": 0.0126993, + "balance_loss_clip": 0.06285382, + "balance_loss_mlp": 0.01256149, + "epoch": 0.4818878701337742, + "flos": 12572780636160.0, + "grad_norm": 2.068904383285823, + "language_loss": 0.75191212, + "learning_rate": 2.213988257504722e-06, + "loss": 0.82910132, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13775635, + "step": 8015, + "time_per_iteration": 2.574915885925293 + }, + { + "auxiliary_loss_clip": 0.06450102, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06285062, + "balance_loss_mlp": 0.01254942, + "epoch": 0.48194799338644223, + "flos": 24615481144320.0, + "grad_norm": 2.7940595212226693, + "language_loss": 0.80323374, + "learning_rate": 2.213601027413894e-06, + "loss": 0.88042033, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.13604736, + "step": 8016, + "time_per_iteration": 2.545562744140625 + }, + { + "auxiliary_loss_clip": 0.06441234, + "auxiliary_loss_mlp": 0.01268233, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01255996, + "epoch": 0.4820081166391102, + "flos": 21111482304000.0, + "grad_norm": 1.7856263642868424, + "language_loss": 0.77840865, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.85550332, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12237549, + "step": 8017, + "time_per_iteration": 2.548884153366089 + }, + { + "auxiliary_loss_clip": 0.06442289, + "auxiliary_loss_mlp": 0.01274842, + "balance_loss_clip": 0.06287417, + "balance_loss_mlp": 0.01261729, + "epoch": 0.48206823989177816, + "flos": 25271569013760.0, + "grad_norm": 1.8858588216369734, + "language_loss": 0.80356038, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8807317, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13098145, + "step": 8018, + "time_per_iteration": 2.5485877990722656 + }, + { + "auxiliary_loss_clip": 0.06443836, + "auxiliary_loss_mlp": 0.01268171, + "balance_loss_clip": 0.06283845, + "balance_loss_mlp": 0.01255177, + "epoch": 0.4821283631444461, + "flos": 24652056251520.0, + "grad_norm": 1.8013341989070415, + "language_loss": 0.76402384, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.84114391, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12988281, + "step": 8019, + "time_per_iteration": 2.583380937576294 + }, + { + "auxiliary_loss_clip": 0.06444359, + "auxiliary_loss_mlp": 0.01271658, + "balance_loss_clip": 0.06285813, + "balance_loss_mlp": 0.01258826, + "epoch": 0.4821884863971141, + "flos": 23959015931520.0, + "grad_norm": 1.6800720935629156, + "language_loss": 0.79355383, + "learning_rate": 2.212052026199701e-06, + "loss": 0.87071395, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12841797, + "step": 8020, + "time_per_iteration": 2.531282663345337 + }, + { + "auxiliary_loss_clip": 0.06436829, + "auxiliary_loss_mlp": 0.01270595, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01257655, + "epoch": 0.48224860964978206, + "flos": 17165605357440.0, + "grad_norm": 1.8962985695511603, + "language_loss": 0.70203435, + "learning_rate": 2.211664755756855e-06, + "loss": 0.77910858, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12945557, + "step": 8021, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.06448636, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06284462, + "balance_loss_mlp": 0.01253568, + "epoch": 0.48230873290245, + "flos": 23082513096960.0, + "grad_norm": 1.8444275684859448, + "language_loss": 0.63131356, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.70847559, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.14001465, + "step": 8022, + "time_per_iteration": 2.5153286457061768 + }, + { + "auxiliary_loss_clip": 0.06439438, + "auxiliary_loss_mlp": 0.0127221, + "balance_loss_clip": 0.06284659, + "balance_loss_mlp": 0.01259544, + "epoch": 0.482368856155118, + "flos": 19359440956800.0, + "grad_norm": 2.0552590280374625, + "language_loss": 0.67256629, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.74968272, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8023, + "time_per_iteration": 2.5504207611083984 + }, + { + "auxiliary_loss_clip": 0.06441902, + "auxiliary_loss_mlp": 0.01274331, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01261426, + "epoch": 0.48242897940778595, + "flos": 20084318628480.0, + "grad_norm": 1.5610336564699971, + "language_loss": 0.76933229, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.84649462, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12915039, + "step": 8024, + "time_per_iteration": 2.576347589492798 + }, + { + "auxiliary_loss_clip": 0.06441621, + "auxiliary_loss_mlp": 0.01268624, + "balance_loss_clip": 0.06283119, + "balance_loss_mlp": 0.01255553, + "epoch": 0.4824891026604539, + "flos": 23410682812800.0, + "grad_norm": 1.519749434932375, + "language_loss": 0.75555682, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.83265924, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13067627, + "step": 8025, + "time_per_iteration": 2.559722900390625 + }, + { + "auxiliary_loss_clip": 0.06445173, + "auxiliary_loss_mlp": 0.01270078, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01256536, + "epoch": 0.4825492259131219, + "flos": 20373691104000.0, + "grad_norm": 3.210842824131336, + "language_loss": 0.71099132, + "learning_rate": 2.209728283441112e-06, + "loss": 0.78814387, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13543701, + "step": 8026, + "time_per_iteration": 2.512563943862915 + }, + { + "auxiliary_loss_clip": 0.06450065, + "auxiliary_loss_mlp": 0.0127128, + "balance_loss_clip": 0.06287996, + "balance_loss_mlp": 0.01257094, + "epoch": 0.48260934916578985, + "flos": 14324193077760.0, + "grad_norm": 2.0787728376845385, + "language_loss": 0.74646676, + "learning_rate": 2.209340965060465e-06, + "loss": 0.82368022, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.14190674, + "step": 8027, + "time_per_iteration": 2.523252248764038 + }, + { + "auxiliary_loss_clip": 0.06445143, + "auxiliary_loss_mlp": 0.01269951, + "balance_loss_clip": 0.06285772, + "balance_loss_mlp": 0.01257166, + "epoch": 0.4826694724184578, + "flos": 22126654846080.0, + "grad_norm": 1.6924958309049165, + "language_loss": 0.67414463, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.75129557, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12792969, + "step": 8028, + "time_per_iteration": 2.5118508338928223 + }, + { + "auxiliary_loss_clip": 0.06443746, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06285068, + "balance_loss_mlp": 0.01254926, + "epoch": 0.48272959567112583, + "flos": 16186882141440.0, + "grad_norm": 1.4109383431826554, + "language_loss": 0.73031461, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.80743277, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13134766, + "step": 8029, + "time_per_iteration": 2.513986587524414 + }, + { + "auxiliary_loss_clip": 0.06447576, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0628765, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4827897189237938, + "flos": 23186326705920.0, + "grad_norm": 2.2851559020013994, + "language_loss": 0.84759653, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.92474234, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.13256836, + "step": 8030, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.0644383, + "auxiliary_loss_mlp": 0.0126632, + "balance_loss_clip": 0.06286349, + "balance_loss_mlp": 0.01253374, + "epoch": 0.48284984217646176, + "flos": 21659018808960.0, + "grad_norm": 2.6563677126547858, + "language_loss": 0.73703504, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.81413656, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12945557, + "step": 8031, + "time_per_iteration": 2.523465633392334 + }, + { + "auxiliary_loss_clip": 0.06451262, + "auxiliary_loss_mlp": 0.01268996, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01254548, + "epoch": 0.48290996542912973, + "flos": 31475501314560.0, + "grad_norm": 1.5957405541522132, + "language_loss": 0.71345282, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.79065537, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14434814, + "step": 8032, + "time_per_iteration": 4.084775924682617 + }, + { + "auxiliary_loss_clip": 0.06441716, + "auxiliary_loss_mlp": 0.01271696, + "balance_loss_clip": 0.06285156, + "balance_loss_mlp": 0.01259066, + "epoch": 0.4829700886817977, + "flos": 24468803372160.0, + "grad_norm": 1.3669631944631024, + "language_loss": 0.74361598, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.82075012, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12609863, + "step": 8033, + "time_per_iteration": 2.558655023574829 + }, + { + "auxiliary_loss_clip": 0.06455428, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06290704, + "balance_loss_mlp": 0.01255436, + "epoch": 0.48303021193446566, + "flos": 25709170561920.0, + "grad_norm": 1.5251236339326817, + "language_loss": 0.83579373, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.91304129, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13909912, + "step": 8034, + "time_per_iteration": 4.034566402435303 + }, + { + "auxiliary_loss_clip": 0.06441804, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01255162, + "epoch": 0.4830903351871336, + "flos": 20091613933440.0, + "grad_norm": 1.4995747649605073, + "language_loss": 0.80011666, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.87720799, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12176514, + "step": 8035, + "time_per_iteration": 2.560216188430786 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06284694, + "balance_loss_mlp": 0.01257996, + "epoch": 0.4831504584398016, + "flos": 39460670910720.0, + "grad_norm": 2.4180718513556196, + "language_loss": 0.69735384, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.77451038, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.14385986, + "step": 8036, + "time_per_iteration": 2.676248550415039 + }, + { + "auxiliary_loss_clip": 0.06441773, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01254638, + "epoch": 0.48321058169246955, + "flos": 20012006954880.0, + "grad_norm": 1.964916404489229, + "language_loss": 0.7269727, + "learning_rate": 2.205467347074847e-06, + "loss": 0.80406225, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12536621, + "step": 8037, + "time_per_iteration": 2.5361721515655518 + }, + { + "auxiliary_loss_clip": 0.06449978, + "auxiliary_loss_mlp": 0.01267952, + "balance_loss_clip": 0.06284893, + "balance_loss_mlp": 0.01254594, + "epoch": 0.4832707049451375, + "flos": 20747869511040.0, + "grad_norm": 2.294242093364334, + "language_loss": 0.69135344, + "learning_rate": 2.205079942181525e-06, + "loss": 0.76853275, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13366699, + "step": 8038, + "time_per_iteration": 2.5300488471984863 + }, + { + "auxiliary_loss_clip": 0.06441218, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01253161, + "epoch": 0.4833308281978055, + "flos": 33153889322880.0, + "grad_norm": 1.5080177559172256, + "language_loss": 0.79238868, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8694644, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13201904, + "step": 8039, + "time_per_iteration": 4.106697082519531 + }, + { + "auxiliary_loss_clip": 0.06443603, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06284612, + "balance_loss_mlp": 0.01257221, + "epoch": 0.48339095145047345, + "flos": 19105301923200.0, + "grad_norm": 2.5245127885531926, + "language_loss": 0.78196943, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.85910785, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13024902, + "step": 8040, + "time_per_iteration": 2.51356840133667 + }, + { + "auxiliary_loss_clip": 0.06449578, + "auxiliary_loss_mlp": 0.01268689, + "balance_loss_clip": 0.06287356, + "balance_loss_mlp": 0.01254342, + "epoch": 0.4834510747031414, + "flos": 34468035632640.0, + "grad_norm": 1.5686841461958603, + "language_loss": 0.75648201, + "learning_rate": 2.203917680900409e-06, + "loss": 0.83366466, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14337158, + "step": 8041, + "time_per_iteration": 2.6821110248565674 + }, + { + "auxiliary_loss_clip": 0.06444554, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06290209, + "balance_loss_mlp": 0.01261244, + "epoch": 0.48351119795580944, + "flos": 27388187475840.0, + "grad_norm": 1.655786729526556, + "language_loss": 0.66309774, + "learning_rate": 2.203530244988624e-06, + "loss": 0.74028337, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12756348, + "step": 8042, + "time_per_iteration": 2.587979316711426 + }, + { + "auxiliary_loss_clip": 0.0635567, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06287327, + "balance_loss_mlp": 0.012603, + "epoch": 0.4835713212084774, + "flos": 67162967815680.0, + "grad_norm": 0.683297043643475, + "language_loss": 0.58432257, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.66050708, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02485657, + "step": 8043, + "time_per_iteration": 3.240037441253662 + }, + { + "auxiliary_loss_clip": 0.06448962, + "auxiliary_loss_mlp": 0.01270561, + "balance_loss_clip": 0.06288527, + "balance_loss_mlp": 0.01256548, + "epoch": 0.48363144446114537, + "flos": 17973234535680.0, + "grad_norm": 8.666689726695457, + "language_loss": 0.71932065, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.79651588, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.14013672, + "step": 8044, + "time_per_iteration": 2.557222604751587 + }, + { + "auxiliary_loss_clip": 0.06443186, + "auxiliary_loss_mlp": 0.01271215, + "balance_loss_clip": 0.06287612, + "balance_loss_mlp": 0.01257667, + "epoch": 0.48369156771381333, + "flos": 20599556584320.0, + "grad_norm": 1.2792089170093015, + "language_loss": 0.76084363, + "learning_rate": 2.202367891004714e-06, + "loss": 0.83798766, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13549805, + "step": 8045, + "time_per_iteration": 3.9927117824554443 + }, + { + "auxiliary_loss_clip": 0.06452677, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01255274, + "epoch": 0.4837516909664813, + "flos": 22681780145280.0, + "grad_norm": 1.8159113209886955, + "language_loss": 0.69591677, + "learning_rate": 2.201980424309533e-06, + "loss": 0.77312469, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12854004, + "step": 8046, + "time_per_iteration": 2.563061237335205 + }, + { + "auxiliary_loss_clip": 0.06444287, + "auxiliary_loss_mlp": 0.01272531, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.01259674, + "epoch": 0.48381181421914926, + "flos": 25525414558080.0, + "grad_norm": 1.7918831202662233, + "language_loss": 0.83005214, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.90722024, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.12866211, + "step": 8047, + "time_per_iteration": 2.5624239444732666 + }, + { + "auxiliary_loss_clip": 0.06441472, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06286557, + "balance_loss_mlp": 0.01255522, + "epoch": 0.4838719374718172, + "flos": 24214454703360.0, + "grad_norm": 3.8503425220093273, + "language_loss": 0.8051095, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.88220614, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8048, + "time_per_iteration": 2.5535151958465576 + }, + { + "auxiliary_loss_clip": 0.06450336, + "auxiliary_loss_mlp": 0.01269587, + "balance_loss_clip": 0.06287669, + "balance_loss_mlp": 0.01255091, + "epoch": 0.4839320607244852, + "flos": 26731889971200.0, + "grad_norm": 1.601579819484506, + "language_loss": 0.8118276, + "learning_rate": 2.200817978328054e-06, + "loss": 0.88902682, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14477539, + "step": 8049, + "time_per_iteration": 2.576237440109253 + }, + { + "auxiliary_loss_clip": 0.0644124, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.0628837, + "balance_loss_mlp": 0.01254392, + "epoch": 0.48399218397715316, + "flos": 20455142872320.0, + "grad_norm": 1.6782620987313854, + "language_loss": 0.7275942, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.8046689, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.1184082, + "step": 8050, + "time_per_iteration": 2.5001842975616455 + }, + { + "auxiliary_loss_clip": 0.06348944, + "auxiliary_loss_mlp": 0.01254327, + "balance_loss_clip": 0.06280461, + "balance_loss_mlp": 0.01252052, + "epoch": 0.4840523072298211, + "flos": 67199626776960.0, + "grad_norm": 0.6876828937687306, + "language_loss": 0.56319511, + "learning_rate": 2.200042976240723e-06, + "loss": 0.63922787, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02278137, + "step": 8051, + "time_per_iteration": 3.1732234954833984 + }, + { + "auxiliary_loss_clip": 0.06445932, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06285888, + "balance_loss_mlp": 0.01254806, + "epoch": 0.4841124304824891, + "flos": 22416782008320.0, + "grad_norm": 1.9466323687223244, + "language_loss": 0.75329518, + "learning_rate": 2.199655463811236e-06, + "loss": 0.83042824, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12554932, + "step": 8052, + "time_per_iteration": 2.525742769241333 + }, + { + "auxiliary_loss_clip": 0.06445011, + "auxiliary_loss_mlp": 0.01268398, + "balance_loss_clip": 0.0628748, + "balance_loss_mlp": 0.01255797, + "epoch": 0.48417255373515705, + "flos": 13848926319360.0, + "grad_norm": 9.22847684329053, + "language_loss": 0.65932119, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.73645532, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1260376, + "step": 8053, + "time_per_iteration": 2.508634328842163 + }, + { + "auxiliary_loss_clip": 0.06439514, + "auxiliary_loss_mlp": 0.01270848, + "balance_loss_clip": 0.06286003, + "balance_loss_mlp": 0.01258242, + "epoch": 0.484232676987825, + "flos": 31657747944960.0, + "grad_norm": 1.9001102819500506, + "language_loss": 0.69764733, + "learning_rate": 2.198880416254091e-06, + "loss": 0.77475095, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12609863, + "step": 8054, + "time_per_iteration": 2.6046009063720703 + }, + { + "auxiliary_loss_clip": 0.06439343, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.062842, + "balance_loss_mlp": 0.01253578, + "epoch": 0.48429280024049304, + "flos": 24101878343040.0, + "grad_norm": 1.6288967613161636, + "language_loss": 0.69845426, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.77551031, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12683105, + "step": 8055, + "time_per_iteration": 2.5645036697387695 + }, + { + "auxiliary_loss_clip": 0.06441051, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01257621, + "epoch": 0.484352923493161, + "flos": 17535842622720.0, + "grad_norm": 2.1100630556312256, + "language_loss": 0.63363564, + "learning_rate": 2.198105338530685e-06, + "loss": 0.71074814, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12597656, + "step": 8056, + "time_per_iteration": 2.4887776374816895 + }, + { + "auxiliary_loss_clip": 0.06441829, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06283918, + "balance_loss_mlp": 0.0125639, + "epoch": 0.48441304674582897, + "flos": 29174204453760.0, + "grad_norm": 1.7583270452203597, + "language_loss": 0.67791545, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.75502926, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1315918, + "step": 8057, + "time_per_iteration": 2.6147687435150146 + }, + { + "auxiliary_loss_clip": 0.06438136, + "auxiliary_loss_mlp": 0.01270959, + "balance_loss_clip": 0.06284122, + "balance_loss_mlp": 0.0125933, + "epoch": 0.48447316999849693, + "flos": 15891933369600.0, + "grad_norm": 1.7129310149903716, + "language_loss": 0.81615114, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.89324206, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11627197, + "step": 8058, + "time_per_iteration": 2.499464273452759 + }, + { + "auxiliary_loss_clip": 0.06444308, + "auxiliary_loss_mlp": 0.01272607, + "balance_loss_clip": 0.06283933, + "balance_loss_mlp": 0.01259619, + "epoch": 0.4845332932511649, + "flos": 24386974260480.0, + "grad_norm": 1.694669299967896, + "language_loss": 0.79782939, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.87499857, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12988281, + "step": 8059, + "time_per_iteration": 2.5456764698028564 + }, + { + "auxiliary_loss_clip": 0.06445169, + "auxiliary_loss_mlp": 0.0126972, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01256065, + "epoch": 0.48459341650383286, + "flos": 37124434097280.0, + "grad_norm": 2.171534570518566, + "language_loss": 0.67115712, + "learning_rate": 2.196555093055352e-06, + "loss": 0.74830604, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13659668, + "step": 8060, + "time_per_iteration": 2.639552593231201 + }, + { + "auxiliary_loss_clip": 0.06448266, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.01255404, + "epoch": 0.48465353975650083, + "flos": 22973500535040.0, + "grad_norm": 1.9145476252385885, + "language_loss": 0.67691833, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.75407994, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12506104, + "step": 8061, + "time_per_iteration": 2.636291265487671 + }, + { + "auxiliary_loss_clip": 0.06440581, + "auxiliary_loss_mlp": 0.012731, + "balance_loss_clip": 0.06285343, + "balance_loss_mlp": 0.01259581, + "epoch": 0.4847136630091688, + "flos": 17712680664960.0, + "grad_norm": 1.8103717294603696, + "language_loss": 0.83217871, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.90931553, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13531494, + "step": 8062, + "time_per_iteration": 2.5335779190063477 + }, + { + "auxiliary_loss_clip": 0.06441268, + "auxiliary_loss_mlp": 0.01271147, + "balance_loss_clip": 0.06286018, + "balance_loss_mlp": 0.01259077, + "epoch": 0.48477378626183676, + "flos": 22024853735040.0, + "grad_norm": 1.4198166357723545, + "language_loss": 0.74425852, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.82138264, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1206665, + "step": 8063, + "time_per_iteration": 2.575752019882202 + }, + { + "auxiliary_loss_clip": 0.06438752, + "auxiliary_loss_mlp": 0.01268531, + "balance_loss_clip": 0.06282612, + "balance_loss_mlp": 0.01256276, + "epoch": 0.4848339095145047, + "flos": 27970118881920.0, + "grad_norm": 1.5830553745787852, + "language_loss": 0.79034185, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.86741465, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12255859, + "step": 8064, + "time_per_iteration": 2.601557731628418 + }, + { + "auxiliary_loss_clip": 0.06441826, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06290108, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4848940327671727, + "flos": 21695090791680.0, + "grad_norm": 1.71958305783472, + "language_loss": 0.795892, + "learning_rate": 2.194617118620173e-06, + "loss": 0.87297779, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1194458, + "step": 8065, + "time_per_iteration": 2.5325217247009277 + }, + { + "auxiliary_loss_clip": 0.06434904, + "auxiliary_loss_mlp": 0.0126868, + "balance_loss_clip": 0.06285697, + "balance_loss_mlp": 0.01256813, + "epoch": 0.48495415601984065, + "flos": 20637892627200.0, + "grad_norm": 1.7068711802888106, + "language_loss": 0.76162863, + "learning_rate": 2.194229501534644e-06, + "loss": 0.83866447, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11865234, + "step": 8066, + "time_per_iteration": 2.506598949432373 + }, + { + "auxiliary_loss_clip": 0.06438506, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06285724, + "balance_loss_mlp": 0.01257375, + "epoch": 0.4850142792725086, + "flos": 25634972171520.0, + "grad_norm": 1.302389197624331, + "language_loss": 0.72176784, + "learning_rate": 2.193841877083912e-06, + "loss": 0.79884112, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11444092, + "step": 8067, + "time_per_iteration": 2.5921640396118164 + }, + { + "auxiliary_loss_clip": 0.06438944, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06282091, + "balance_loss_mlp": 0.01255986, + "epoch": 0.4850744025251766, + "flos": 13777075843200.0, + "grad_norm": 2.2825284137915975, + "language_loss": 0.79257572, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.86964703, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12219238, + "step": 8068, + "time_per_iteration": 2.5287444591522217 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280828, + "balance_loss_mlp": 0.012565, + "epoch": 0.4851345257778446, + "flos": 20266691040000.0, + "grad_norm": 1.4034205816126453, + "language_loss": 0.84740359, + "learning_rate": 2.193066606145638e-06, + "loss": 0.92444146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.11901855, + "step": 8069, + "time_per_iteration": 2.548593044281006 + }, + { + "auxiliary_loss_clip": 0.06435016, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01254763, + "epoch": 0.48519464903051257, + "flos": 27097095991680.0, + "grad_norm": 1.771109080244907, + "language_loss": 0.78544027, + "learning_rate": 2.192678959687493e-06, + "loss": 0.86245352, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.11553955, + "step": 8070, + "time_per_iteration": 2.581026315689087 + }, + { + "auxiliary_loss_clip": 0.06432221, + "auxiliary_loss_mlp": 0.01268982, + "balance_loss_clip": 0.06279641, + "balance_loss_mlp": 0.01256239, + "epoch": 0.48525477228318054, + "flos": 17132677902720.0, + "grad_norm": 3.597843949572919, + "language_loss": 0.77929389, + "learning_rate": 2.192291305922943e-06, + "loss": 0.85630596, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12756348, + "step": 8071, + "time_per_iteration": 3.963555335998535 + }, + { + "auxiliary_loss_clip": 0.06438918, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282261, + "balance_loss_mlp": 0.01256777, + "epoch": 0.4853148955358485, + "flos": 28187263537920.0, + "grad_norm": 2.115731418126265, + "language_loss": 0.72008896, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7971788, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13299561, + "step": 8072, + "time_per_iteration": 2.6861536502838135 + }, + { + "auxiliary_loss_clip": 0.06439583, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06282715, + "balance_loss_mlp": 0.01253761, + "epoch": 0.48537501878851647, + "flos": 17499015953280.0, + "grad_norm": 1.8999559951356444, + "language_loss": 0.88288134, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.95994508, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8073, + "time_per_iteration": 2.4814834594726562 + }, + { + "auxiliary_loss_clip": 0.06432822, + "auxiliary_loss_mlp": 0.01269151, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01257731, + "epoch": 0.48543514204118443, + "flos": 28592398828800.0, + "grad_norm": 2.458004055687259, + "language_loss": 0.61317194, + "learning_rate": 2.19112830093786e-06, + "loss": 0.69019163, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11413574, + "step": 8074, + "time_per_iteration": 3.984229326248169 + }, + { + "auxiliary_loss_clip": 0.06435922, + "auxiliary_loss_mlp": 0.01265981, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01254024, + "epoch": 0.4854952652938524, + "flos": 20966355832320.0, + "grad_norm": 1.641968552330247, + "language_loss": 0.73514569, + "learning_rate": 2.19074061809469e-06, + "loss": 0.81216466, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.11962891, + "step": 8075, + "time_per_iteration": 2.5479941368103027 + }, + { + "auxiliary_loss_clip": 0.06429431, + "auxiliary_loss_mlp": 0.01268393, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01256704, + "epoch": 0.48555538854652036, + "flos": 66543344000640.0, + "grad_norm": 1.7202852105657789, + "language_loss": 0.81976241, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.89674067, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11676025, + "step": 8076, + "time_per_iteration": 2.9675233364105225 + }, + { + "auxiliary_loss_clip": 0.06435271, + "auxiliary_loss_mlp": 0.01273017, + "balance_loss_clip": 0.06280246, + "balance_loss_mlp": 0.01259242, + "epoch": 0.4856155117991883, + "flos": 15930520974720.0, + "grad_norm": 1.9409864090603182, + "language_loss": 0.86392474, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.94100761, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13781738, + "step": 8077, + "time_per_iteration": 2.5062685012817383 + }, + { + "auxiliary_loss_clip": 0.06325787, + "auxiliary_loss_mlp": 0.01252172, + "balance_loss_clip": 0.062584, + "balance_loss_mlp": 0.0125022, + "epoch": 0.4856756350518563, + "flos": 71066986848000.0, + "grad_norm": 0.9289783803731909, + "language_loss": 0.58378243, + "learning_rate": 2.189577526226564e-06, + "loss": 0.65956199, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01950073, + "step": 8078, + "time_per_iteration": 4.502991199493408 + }, + { + "auxiliary_loss_clip": 0.06440585, + "auxiliary_loss_mlp": 0.01268963, + "balance_loss_clip": 0.06280588, + "balance_loss_mlp": 0.01255886, + "epoch": 0.48573575830452426, + "flos": 29833478778240.0, + "grad_norm": 2.317528327629363, + "language_loss": 0.72874224, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.80583775, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1307373, + "step": 8079, + "time_per_iteration": 2.5839955806732178 + }, + { + "auxiliary_loss_clip": 0.06440279, + "auxiliary_loss_mlp": 0.01268912, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256925, + "epoch": 0.4857958815571922, + "flos": 17645274455040.0, + "grad_norm": 2.8950752184508843, + "language_loss": 0.80285943, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.87995136, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.11999512, + "step": 8080, + "time_per_iteration": 2.542607307434082 + }, + { + "auxiliary_loss_clip": 0.06436758, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06281016, + "balance_loss_mlp": 0.01252754, + "epoch": 0.4858560048098602, + "flos": 21111817720320.0, + "grad_norm": 1.934060586134842, + "language_loss": 0.84237295, + "learning_rate": 2.188414369659251e-06, + "loss": 0.9193939, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12579346, + "step": 8081, + "time_per_iteration": 2.523787021636963 + }, + { + "auxiliary_loss_clip": 0.06433021, + "auxiliary_loss_mlp": 0.01268596, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4859161280625282, + "flos": 22097375043840.0, + "grad_norm": 1.530246142437005, + "language_loss": 0.83824933, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.91526556, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13464355, + "step": 8082, + "time_per_iteration": 2.562739372253418 + }, + { + "auxiliary_loss_clip": 0.0643435, + "auxiliary_loss_mlp": 0.01268115, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01255849, + "epoch": 0.4859762513151962, + "flos": 17499183661440.0, + "grad_norm": 1.9064651850671037, + "language_loss": 0.87366831, + "learning_rate": 2.187638896199746e-06, + "loss": 0.95069289, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12268066, + "step": 8083, + "time_per_iteration": 2.5062954425811768 + }, + { + "auxiliary_loss_clip": 0.064337, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281679, + "balance_loss_mlp": 0.01255356, + "epoch": 0.48603637456786414, + "flos": 18010061205120.0, + "grad_norm": 1.6184381568123027, + "language_loss": 0.81531483, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.89233649, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.13110352, + "step": 8084, + "time_per_iteration": 3.9548635482788086 + }, + { + "auxiliary_loss_clip": 0.06438272, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01256645, + "epoch": 0.4860964978205321, + "flos": 22498611120000.0, + "grad_norm": 1.8856401579659385, + "language_loss": 0.68814772, + "learning_rate": 2.186863394279098e-06, + "loss": 0.76522183, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12475586, + "step": 8085, + "time_per_iteration": 2.525697708129883 + }, + { + "auxiliary_loss_clip": 0.06434157, + "auxiliary_loss_mlp": 0.01270175, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257158, + "epoch": 0.48615662107320007, + "flos": 23380061345280.0, + "grad_norm": 1.4159205206948002, + "language_loss": 0.77895916, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.85600245, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13024902, + "step": 8086, + "time_per_iteration": 2.5914857387542725 + }, + { + "auxiliary_loss_clip": 0.06433852, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06279776, + "balance_loss_mlp": 0.01253292, + "epoch": 0.48621674432586803, + "flos": 34426722769920.0, + "grad_norm": 1.8125320165569008, + "language_loss": 0.69750226, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.7744993, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12573242, + "step": 8087, + "time_per_iteration": 2.611724615097046 + }, + { + "auxiliary_loss_clip": 0.06440983, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06277409, + "balance_loss_mlp": 0.0125254, + "epoch": 0.486276867578536, + "flos": 33115595207040.0, + "grad_norm": 1.9401027694089865, + "language_loss": 0.73050213, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.80757201, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13482666, + "step": 8088, + "time_per_iteration": 2.6235716342926025 + }, + { + "auxiliary_loss_clip": 0.06434947, + "auxiliary_loss_mlp": 0.01270457, + "balance_loss_clip": 0.06279397, + "balance_loss_mlp": 0.01257982, + "epoch": 0.48633699083120396, + "flos": 21477149521920.0, + "grad_norm": 1.5117477196191362, + "language_loss": 0.75765258, + "learning_rate": 2.185312305524892e-06, + "loss": 0.83470654, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12481689, + "step": 8089, + "time_per_iteration": 2.522033214569092 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01266623, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01254702, + "epoch": 0.48639711408387193, + "flos": 20090565757440.0, + "grad_norm": 2.0719257974800307, + "language_loss": 0.84617764, + "learning_rate": 2.184924515731926e-06, + "loss": 0.92317104, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1192627, + "step": 8090, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01256362, + "epoch": 0.4864572373365399, + "flos": 20785450867200.0, + "grad_norm": 1.460241002220635, + "language_loss": 0.76103806, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.8380006, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11578369, + "step": 8091, + "time_per_iteration": 2.534083127975464 + }, + { + "auxiliary_loss_clip": 0.06434517, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.0627959, + "balance_loss_mlp": 0.01252651, + "epoch": 0.48651736058920786, + "flos": 26031554346240.0, + "grad_norm": 1.4698762569471817, + "language_loss": 0.8086524, + "learning_rate": 2.184148915123631e-06, + "loss": 0.88564396, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.11987305, + "step": 8092, + "time_per_iteration": 2.5732295513153076 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01268235, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01254711, + "epoch": 0.4865774838418758, + "flos": 20491885687680.0, + "grad_norm": 1.359461965274961, + "language_loss": 0.71901554, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.79604697, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13537598, + "step": 8093, + "time_per_iteration": 2.5315988063812256 + }, + { + "auxiliary_loss_clip": 0.06430057, + "auxiliary_loss_mlp": 0.01268667, + "balance_loss_clip": 0.06278083, + "balance_loss_mlp": 0.01256424, + "epoch": 0.4866376070945438, + "flos": 23554048348800.0, + "grad_norm": 1.746145283456106, + "language_loss": 0.68340707, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.76039433, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12237549, + "step": 8094, + "time_per_iteration": 2.5621020793914795 + }, + { + "auxiliary_loss_clip": 0.06439431, + "auxiliary_loss_mlp": 0.01276508, + "balance_loss_clip": 0.06280254, + "balance_loss_mlp": 0.01263502, + "epoch": 0.4866977303472118, + "flos": 16696166457600.0, + "grad_norm": 2.187009986392795, + "language_loss": 0.66443598, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.74159545, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.13006592, + "step": 8095, + "time_per_iteration": 2.4823923110961914 + }, + { + "auxiliary_loss_clip": 0.06436304, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01254444, + "epoch": 0.4867578535998798, + "flos": 17902012965120.0, + "grad_norm": 1.919238290363099, + "language_loss": 0.79046065, + "learning_rate": 2.182597630229345e-06, + "loss": 0.86749196, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12384033, + "step": 8096, + "time_per_iteration": 2.507293701171875 + }, + { + "auxiliary_loss_clip": 0.06432957, + "auxiliary_loss_mlp": 0.01269945, + "balance_loss_clip": 0.06279905, + "balance_loss_mlp": 0.01257154, + "epoch": 0.48681797685254774, + "flos": 22644366497280.0, + "grad_norm": 2.003337305767246, + "language_loss": 0.68162191, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.75865096, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12799072, + "step": 8097, + "time_per_iteration": 2.5473361015319824 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01272073, + "balance_loss_clip": 0.06279548, + "balance_loss_mlp": 0.01259944, + "epoch": 0.4868781001052157, + "flos": 20892283223040.0, + "grad_norm": 1.4401604045572658, + "language_loss": 0.71418583, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.79123378, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12127686, + "step": 8098, + "time_per_iteration": 2.5543363094329834 + }, + { + "auxiliary_loss_clip": 0.06441437, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06279659, + "balance_loss_mlp": 0.01255725, + "epoch": 0.48693822335788367, + "flos": 41984688723840.0, + "grad_norm": 1.4376447542768653, + "language_loss": 0.66435724, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.74146235, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13348389, + "step": 8099, + "time_per_iteration": 2.711822032928467 + }, + { + "auxiliary_loss_clip": 0.0643863, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06281494, + "balance_loss_mlp": 0.01259485, + "epoch": 0.48699834661055164, + "flos": 24250149342720.0, + "grad_norm": 1.5852242434455028, + "language_loss": 0.66993374, + "learning_rate": 2.181046234549138e-06, + "loss": 0.74703825, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12329102, + "step": 8100, + "time_per_iteration": 2.5218353271484375 + }, + { + "auxiliary_loss_clip": 0.0643635, + "auxiliary_loss_mlp": 0.0127283, + "balance_loss_clip": 0.06283123, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4870584698632196, + "flos": 25931388389760.0, + "grad_norm": 1.294146562327305, + "language_loss": 0.76505142, + "learning_rate": 2.180658368429088e-06, + "loss": 0.84214324, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12664795, + "step": 8101, + "time_per_iteration": 2.645095109939575 + }, + { + "auxiliary_loss_clip": 0.06345028, + "auxiliary_loss_mlp": 0.01254744, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.01252564, + "epoch": 0.48711859311588757, + "flos": 70232006511360.0, + "grad_norm": 0.6692636412141889, + "language_loss": 0.5212009, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.59719861, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02183533, + "step": 8102, + "time_per_iteration": 3.2782585620880127 + }, + { + "auxiliary_loss_clip": 0.06439511, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01253523, + "epoch": 0.48717871636855553, + "flos": 12346831301760.0, + "grad_norm": 2.023585148758525, + "language_loss": 0.7395249, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.81658924, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13415527, + "step": 8103, + "time_per_iteration": 2.5020487308502197 + }, + { + "auxiliary_loss_clip": 0.06437068, + "auxiliary_loss_mlp": 0.01271054, + "balance_loss_clip": 0.06280553, + "balance_loss_mlp": 0.01257059, + "epoch": 0.4872388396212235, + "flos": 23483874954240.0, + "grad_norm": 1.425095223977108, + "language_loss": 0.6284436, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.70552492, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13989258, + "step": 8104, + "time_per_iteration": 2.5457305908203125 + }, + { + "auxiliary_loss_clip": 0.06436496, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281868, + "balance_loss_mlp": 0.01255759, + "epoch": 0.48729896287389146, + "flos": 31435068919680.0, + "grad_norm": 2.8385892248494575, + "language_loss": 0.69637764, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.77343059, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13043213, + "step": 8105, + "time_per_iteration": 2.6453042030334473 + }, + { + "auxiliary_loss_clip": 0.0643308, + "auxiliary_loss_mlp": 0.01270898, + "balance_loss_clip": 0.06279837, + "balance_loss_mlp": 0.01258464, + "epoch": 0.4873590861265594, + "flos": 19063192446720.0, + "grad_norm": 1.510355754545757, + "language_loss": 0.73659271, + "learning_rate": 2.178718935364259e-06, + "loss": 0.81363249, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12451172, + "step": 8106, + "time_per_iteration": 2.4909706115722656 + }, + { + "auxiliary_loss_clip": 0.0644394, + "auxiliary_loss_mlp": 0.01272973, + "balance_loss_clip": 0.06283985, + "balance_loss_mlp": 0.01258888, + "epoch": 0.4874192093792274, + "flos": 24354424149120.0, + "grad_norm": 1.669305756095907, + "language_loss": 0.77040148, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.84757066, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14080811, + "step": 8107, + "time_per_iteration": 2.5784239768981934 + }, + { + "auxiliary_loss_clip": 0.06432547, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01254971, + "epoch": 0.4874793326318954, + "flos": 23119339766400.0, + "grad_norm": 3.7362093355788857, + "language_loss": 0.75508547, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.83207899, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.1182251, + "step": 8108, + "time_per_iteration": 2.51676607131958 + }, + { + "auxiliary_loss_clip": 0.06434841, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06281565, + "balance_loss_mlp": 0.01257522, + "epoch": 0.4875394558845634, + "flos": 19032193635840.0, + "grad_norm": 1.6826296910838767, + "language_loss": 0.73853874, + "learning_rate": 2.177555194083212e-06, + "loss": 0.81557322, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11090088, + "step": 8109, + "time_per_iteration": 2.594315767288208 + }, + { + "auxiliary_loss_clip": 0.06429494, + "auxiliary_loss_mlp": 0.01265982, + "balance_loss_clip": 0.0628022, + "balance_loss_mlp": 0.01253853, + "epoch": 0.48759957913723134, + "flos": 21439945509120.0, + "grad_norm": 1.7035668673577407, + "language_loss": 0.78900838, + "learning_rate": 2.177167266837428e-06, + "loss": 0.86596316, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12121582, + "step": 8110, + "time_per_iteration": 2.517711639404297 + }, + { + "auxiliary_loss_clip": 0.06435961, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_clip": 0.06281072, + "balance_loss_mlp": 0.01259265, + "epoch": 0.4876597023898993, + "flos": 17754412798080.0, + "grad_norm": 2.2958034596154238, + "language_loss": 0.72586286, + "learning_rate": 2.176779332873444e-06, + "loss": 0.80293739, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12231445, + "step": 8111, + "time_per_iteration": 3.939528465270996 + }, + { + "auxiliary_loss_clip": 0.06434079, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.06283166, + "balance_loss_mlp": 0.01257947, + "epoch": 0.4877198256425673, + "flos": 17025384349440.0, + "grad_norm": 1.699620610729742, + "language_loss": 0.76073879, + "learning_rate": 2.17639139220597e-06, + "loss": 0.83778763, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.128479, + "step": 8112, + "time_per_iteration": 2.614734172821045 + }, + { + "auxiliary_loss_clip": 0.06443445, + "auxiliary_loss_mlp": 0.01270845, + "balance_loss_clip": 0.06281452, + "balance_loss_mlp": 0.01257445, + "epoch": 0.48777994889523524, + "flos": 22390898296320.0, + "grad_norm": 1.829058055025175, + "language_loss": 0.756136, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.83327889, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13397217, + "step": 8113, + "time_per_iteration": 3.978013277053833 + }, + { + "auxiliary_loss_clip": 0.0633374, + "auxiliary_loss_mlp": 0.01252792, + "balance_loss_clip": 0.06267424, + "balance_loss_mlp": 0.0125078, + "epoch": 0.4878400721479032, + "flos": 61261237664640.0, + "grad_norm": 0.785084950627043, + "language_loss": 0.48805469, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.56391996, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02011108, + "step": 8114, + "time_per_iteration": 3.0476014614105225 + }, + { + "auxiliary_loss_clip": 0.06435857, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06280373, + "balance_loss_mlp": 0.01255507, + "epoch": 0.48790019540057117, + "flos": 24543756449280.0, + "grad_norm": 1.6081028897323706, + "language_loss": 0.77215505, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.84920216, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13348389, + "step": 8115, + "time_per_iteration": 2.615709066390991 + }, + { + "auxiliary_loss_clip": 0.06438144, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06279679, + "balance_loss_mlp": 0.01256858, + "epoch": 0.48796031865323913, + "flos": 21840175336320.0, + "grad_norm": 1.938320357328723, + "language_loss": 0.72471654, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.80180264, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13586426, + "step": 8116, + "time_per_iteration": 2.502880573272705 + }, + { + "auxiliary_loss_clip": 0.06428684, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06277922, + "balance_loss_mlp": 0.0125349, + "epoch": 0.4880204419059071, + "flos": 18594969431040.0, + "grad_norm": 1.5984683769851484, + "language_loss": 0.63217908, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.70912814, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12750244, + "step": 8117, + "time_per_iteration": 2.5082454681396484 + }, + { + "auxiliary_loss_clip": 0.06432296, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06278604, + "balance_loss_mlp": 0.0125558, + "epoch": 0.48808056515857506, + "flos": 19178242502400.0, + "grad_norm": 1.8182073979213524, + "language_loss": 0.79733717, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.87434226, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1262207, + "step": 8118, + "time_per_iteration": 3.925899028778076 + }, + { + "auxiliary_loss_clip": 0.06436172, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06280739, + "balance_loss_mlp": 0.01254669, + "epoch": 0.48814068841124303, + "flos": 20126679667200.0, + "grad_norm": 1.6934286727955359, + "language_loss": 0.63701898, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.71405882, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.13134766, + "step": 8119, + "time_per_iteration": 2.575894832611084 + }, + { + "auxiliary_loss_clip": 0.06432833, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01254698, + "epoch": 0.488200811663911, + "flos": 22972116942720.0, + "grad_norm": 1.6464989706708673, + "language_loss": 0.72632396, + "learning_rate": 2.173287627305878e-06, + "loss": 0.80332661, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12744141, + "step": 8120, + "time_per_iteration": 2.5209426879882812 + }, + { + "auxiliary_loss_clip": 0.06438597, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06279586, + "balance_loss_mlp": 0.01255297, + "epoch": 0.48826093491657896, + "flos": 33918947827200.0, + "grad_norm": 1.7374615150704595, + "language_loss": 0.63695973, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.71403223, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13336182, + "step": 8121, + "time_per_iteration": 2.619035005569458 + }, + { + "auxiliary_loss_clip": 0.0644285, + "auxiliary_loss_mlp": 0.01267435, + "balance_loss_clip": 0.06282102, + "balance_loss_mlp": 0.01253643, + "epoch": 0.488321058169247, + "flos": 23076056332800.0, + "grad_norm": 1.857577186148328, + "language_loss": 0.82684505, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.90394789, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.13800049, + "step": 8122, + "time_per_iteration": 2.5246660709381104 + }, + { + "auxiliary_loss_clip": 0.06440943, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06282523, + "balance_loss_mlp": 0.01255397, + "epoch": 0.48838118142191494, + "flos": 19323746317440.0, + "grad_norm": 1.8250600769951077, + "language_loss": 0.85500193, + "learning_rate": 2.172123606640866e-06, + "loss": 0.93209612, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13085938, + "step": 8123, + "time_per_iteration": 2.5317881107330322 + }, + { + "auxiliary_loss_clip": 0.06441107, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.0125934, + "epoch": 0.4884413046745829, + "flos": 25417701734400.0, + "grad_norm": 1.3930130047769251, + "language_loss": 0.85569358, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.93283355, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13549805, + "step": 8124, + "time_per_iteration": 4.062820196151733 + }, + { + "auxiliary_loss_clip": 0.0644336, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06285739, + "balance_loss_mlp": 0.01253769, + "epoch": 0.4885014279272509, + "flos": 20997103080960.0, + "grad_norm": 2.2053414232015363, + "language_loss": 0.80210352, + "learning_rate": 2.171347560204948e-06, + "loss": 0.87920684, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13201904, + "step": 8125, + "time_per_iteration": 2.5117287635803223 + }, + { + "auxiliary_loss_clip": 0.06437683, + "auxiliary_loss_mlp": 0.01269334, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01255976, + "epoch": 0.48856155117991884, + "flos": 13776656572800.0, + "grad_norm": 2.5222320452086016, + "language_loss": 0.72852308, + "learning_rate": 2.170959527233356e-06, + "loss": 0.80559325, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13348389, + "step": 8126, + "time_per_iteration": 2.5177037715911865 + }, + { + "auxiliary_loss_clip": 0.06445107, + "auxiliary_loss_mlp": 0.01269465, + "balance_loss_clip": 0.06285033, + "balance_loss_mlp": 0.01256113, + "epoch": 0.4886216744325868, + "flos": 32095936471680.0, + "grad_norm": 1.5739512034612657, + "language_loss": 0.68640763, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.76355338, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13372803, + "step": 8127, + "time_per_iteration": 2.606557846069336 + }, + { + "auxiliary_loss_clip": 0.06442467, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06283246, + "balance_loss_mlp": 0.01254972, + "epoch": 0.48868179768525477, + "flos": 19616221393920.0, + "grad_norm": 1.6528567440124056, + "language_loss": 0.7688967, + "learning_rate": 2.170183441856481e-06, + "loss": 0.84600174, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13085938, + "step": 8128, + "time_per_iteration": 2.564112901687622 + }, + { + "auxiliary_loss_clip": 0.06448022, + "auxiliary_loss_mlp": 0.01274106, + "balance_loss_clip": 0.06289175, + "balance_loss_mlp": 0.01260653, + "epoch": 0.48874192093792274, + "flos": 21293100028800.0, + "grad_norm": 1.6046032409788031, + "language_loss": 0.76479989, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.84202117, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13452148, + "step": 8129, + "time_per_iteration": 2.5374317169189453 + }, + { + "auxiliary_loss_clip": 0.06444047, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06286857, + "balance_loss_mlp": 0.01252944, + "epoch": 0.4888020441905907, + "flos": 14178647335680.0, + "grad_norm": 2.0974560904884867, + "language_loss": 0.65812773, + "learning_rate": 2.169407330666114e-06, + "loss": 0.735232, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13452148, + "step": 8130, + "time_per_iteration": 2.5409111976623535 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01269301, + "balance_loss_clip": 0.06286357, + "balance_loss_mlp": 0.01256528, + "epoch": 0.48886216744325867, + "flos": 24104813235840.0, + "grad_norm": 1.7915788803825166, + "language_loss": 0.72896582, + "learning_rate": 2.169019265427658e-06, + "loss": 0.80606037, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12768555, + "step": 8131, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.06451105, + "auxiliary_loss_mlp": 0.01270383, + "balance_loss_clip": 0.06289683, + "balance_loss_mlp": 0.01256811, + "epoch": 0.48892229069592663, + "flos": 38439838218240.0, + "grad_norm": 1.2588039875779695, + "language_loss": 0.69597721, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.77319217, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13586426, + "step": 8132, + "time_per_iteration": 2.70053768157959 + }, + { + "auxiliary_loss_clip": 0.06438366, + "auxiliary_loss_mlp": 0.01270585, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01257436, + "epoch": 0.4889824139485946, + "flos": 23850338785920.0, + "grad_norm": 2.3033814193981454, + "language_loss": 0.70031691, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.77740639, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13146973, + "step": 8133, + "time_per_iteration": 2.5559158325195312 + }, + { + "auxiliary_loss_clip": 0.06443258, + "auxiliary_loss_mlp": 0.01270512, + "balance_loss_clip": 0.0629006, + "balance_loss_mlp": 0.01257548, + "epoch": 0.48904253720126256, + "flos": 24432731389440.0, + "grad_norm": 1.67073327790382, + "language_loss": 0.71227533, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.78941303, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12957764, + "step": 8134, + "time_per_iteration": 2.5545125007629395 + }, + { + "auxiliary_loss_clip": 0.06444804, + "auxiliary_loss_mlp": 0.01271014, + "balance_loss_clip": 0.06283658, + "balance_loss_mlp": 0.01257055, + "epoch": 0.4891026604539306, + "flos": 24177586106880.0, + "grad_norm": 1.7998075455300961, + "language_loss": 0.80179673, + "learning_rate": 2.167466940528718e-06, + "loss": 0.87895489, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13977051, + "step": 8135, + "time_per_iteration": 2.54832124710083 + }, + { + "auxiliary_loss_clip": 0.06439205, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.06284894, + "balance_loss_mlp": 0.01255004, + "epoch": 0.48916278370659855, + "flos": 21477443011200.0, + "grad_norm": 1.5753098834035062, + "language_loss": 0.74565232, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.82271659, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12213135, + "step": 8136, + "time_per_iteration": 2.5225162506103516 + }, + { + "auxiliary_loss_clip": 0.06440099, + "auxiliary_loss_mlp": 0.01265964, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01253519, + "epoch": 0.4892229069592665, + "flos": 22316322562560.0, + "grad_norm": 1.5544220345156794, + "language_loss": 0.73698246, + "learning_rate": 2.166690739918204e-06, + "loss": 0.81404305, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12451172, + "step": 8137, + "time_per_iteration": 2.5138792991638184 + }, + { + "auxiliary_loss_clip": 0.06443799, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06287944, + "balance_loss_mlp": 0.01257673, + "epoch": 0.4892830302119345, + "flos": 12791812008960.0, + "grad_norm": 2.1813813764641448, + "language_loss": 0.75360358, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.83074719, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12890625, + "step": 8138, + "time_per_iteration": 2.52406644821167 + }, + { + "auxiliary_loss_clip": 0.06443107, + "auxiliary_loss_mlp": 0.01267703, + "balance_loss_clip": 0.06287149, + "balance_loss_mlp": 0.01255192, + "epoch": 0.48934315346460244, + "flos": 20820223111680.0, + "grad_norm": 1.5609881437350468, + "language_loss": 0.74361938, + "learning_rate": 2.165914514023972e-06, + "loss": 0.82072747, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12518311, + "step": 8139, + "time_per_iteration": 2.5139529705047607 + }, + { + "auxiliary_loss_clip": 0.0643822, + "auxiliary_loss_mlp": 0.01266126, + "balance_loss_clip": 0.06281914, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4894032767172704, + "flos": 19761641354880.0, + "grad_norm": 2.1585110635090388, + "language_loss": 0.62118167, + "learning_rate": 2.165526391632255e-06, + "loss": 0.69822514, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8140, + "time_per_iteration": 2.5321638584136963 + }, + { + "auxiliary_loss_clip": 0.06444136, + "auxiliary_loss_mlp": 0.01271459, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257506, + "epoch": 0.4894633999699384, + "flos": 17824292703360.0, + "grad_norm": 1.8580247423308633, + "language_loss": 0.82388717, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.90104312, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13946533, + "step": 8141, + "time_per_iteration": 2.4724786281585693 + }, + { + "auxiliary_loss_clip": 0.06448226, + "auxiliary_loss_mlp": 0.01272495, + "balance_loss_clip": 0.06290399, + "balance_loss_mlp": 0.01258279, + "epoch": 0.48952352322260634, + "flos": 25530781219200.0, + "grad_norm": 1.6913372633538968, + "language_loss": 0.72726512, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.80447233, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.14208984, + "step": 8142, + "time_per_iteration": 2.5858702659606934 + }, + { + "auxiliary_loss_clip": 0.06437673, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.062835, + "balance_loss_mlp": 0.01257624, + "epoch": 0.4895836464752743, + "flos": 29062508561280.0, + "grad_norm": 1.575435552323968, + "language_loss": 0.6727252, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.74980688, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12860107, + "step": 8143, + "time_per_iteration": 2.576084613800049 + }, + { + "auxiliary_loss_clip": 0.06441937, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06288718, + "balance_loss_mlp": 0.01254678, + "epoch": 0.48964376972794227, + "flos": 33555335034240.0, + "grad_norm": 1.550815752793646, + "language_loss": 0.75150239, + "learning_rate": 2.163973839444793e-06, + "loss": 0.82859099, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12243652, + "step": 8144, + "time_per_iteration": 2.641314744949341 + }, + { + "auxiliary_loss_clip": 0.06442292, + "auxiliary_loss_mlp": 0.01272411, + "balance_loss_clip": 0.06287357, + "balance_loss_mlp": 0.01259089, + "epoch": 0.48970389298061023, + "flos": 22060506447360.0, + "grad_norm": 1.55007225141579, + "language_loss": 0.75850821, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.83565521, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13311768, + "step": 8145, + "time_per_iteration": 2.5283498764038086 + }, + { + "auxiliary_loss_clip": 0.0644419, + "auxiliary_loss_mlp": 0.0126844, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4897640162332782, + "flos": 20090523830400.0, + "grad_norm": 1.8073715924768365, + "language_loss": 0.8057586, + "learning_rate": 2.163197525984761e-06, + "loss": 0.88288498, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13861084, + "step": 8146, + "time_per_iteration": 2.5433614253997803 + }, + { + "auxiliary_loss_clip": 0.06439323, + "auxiliary_loss_mlp": 0.01272664, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01260737, + "epoch": 0.48982413948594616, + "flos": 23813134773120.0, + "grad_norm": 1.5096911604618644, + "language_loss": 0.74847698, + "learning_rate": 2.162809359964687e-06, + "loss": 0.82559681, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11920166, + "step": 8147, + "time_per_iteration": 2.5623743534088135 + }, + { + "auxiliary_loss_clip": 0.06440282, + "auxiliary_loss_mlp": 0.01269967, + "balance_loss_clip": 0.06287088, + "balance_loss_mlp": 0.01256615, + "epoch": 0.4898842627386142, + "flos": 17645442163200.0, + "grad_norm": 1.9926710345073115, + "language_loss": 0.82984591, + "learning_rate": 2.162421187770864e-06, + "loss": 0.90694839, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.13360596, + "step": 8148, + "time_per_iteration": 2.5547962188720703 + }, + { + "auxiliary_loss_clip": 0.0644103, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.0629115, + "balance_loss_mlp": 0.01255363, + "epoch": 0.48994438599128215, + "flos": 16623519367680.0, + "grad_norm": 2.084842951303776, + "language_loss": 0.74672109, + "learning_rate": 2.162033009418015e-06, + "loss": 0.82380313, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11810303, + "step": 8149, + "time_per_iteration": 2.533867120742798 + }, + { + "auxiliary_loss_clip": 0.06448293, + "auxiliary_loss_mlp": 0.01270293, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.01256507, + "epoch": 0.4900045092439501, + "flos": 26622080795520.0, + "grad_norm": 1.692853589800977, + "language_loss": 0.76331913, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.840505, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13787842, + "step": 8150, + "time_per_iteration": 3.964707374572754 + }, + { + "auxiliary_loss_clip": 0.06450059, + "auxiliary_loss_mlp": 0.01271131, + "balance_loss_clip": 0.06294075, + "balance_loss_mlp": 0.01257833, + "epoch": 0.4900646324966181, + "flos": 19908361054080.0, + "grad_norm": 2.244817701974514, + "language_loss": 0.72999722, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.80720913, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13299561, + "step": 8151, + "time_per_iteration": 2.5549871921539307 + }, + { + "auxiliary_loss_clip": 0.06359711, + "auxiliary_loss_mlp": 0.01259283, + "balance_loss_clip": 0.06292651, + "balance_loss_mlp": 0.01257264, + "epoch": 0.49012475574928605, + "flos": 59207245729920.0, + "grad_norm": 0.8143029783085558, + "language_loss": 0.54076481, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.6169548, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02018738, + "step": 8152, + "time_per_iteration": 3.1047332286834717 + }, + { + "auxiliary_loss_clip": 0.06453663, + "auxiliary_loss_mlp": 0.01270304, + "balance_loss_clip": 0.06293964, + "balance_loss_mlp": 0.01257018, + "epoch": 0.490184879001954, + "flos": 45270285096960.0, + "grad_norm": 1.7665437022978014, + "language_loss": 0.6121304, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.68937004, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.13293457, + "step": 8153, + "time_per_iteration": 4.15813422203064 + }, + { + "auxiliary_loss_clip": 0.06445354, + "auxiliary_loss_mlp": 0.01267264, + "balance_loss_clip": 0.06291656, + "balance_loss_mlp": 0.01254074, + "epoch": 0.490245002254622, + "flos": 28009754663040.0, + "grad_norm": 1.583608688205754, + "language_loss": 0.76979434, + "learning_rate": 2.160092025783549e-06, + "loss": 0.84692061, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.13195801, + "step": 8154, + "time_per_iteration": 2.5994982719421387 + }, + { + "auxiliary_loss_clip": 0.06359019, + "auxiliary_loss_mlp": 0.01255517, + "balance_loss_clip": 0.06291451, + "balance_loss_mlp": 0.01253472, + "epoch": 0.49030512550728994, + "flos": 58971764229120.0, + "grad_norm": 1.0610708177187165, + "language_loss": 0.669397, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.74554235, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.02046204, + "step": 8155, + "time_per_iteration": 3.2433578968048096 + }, + { + "auxiliary_loss_clip": 0.06448951, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06294696, + "balance_loss_mlp": 0.0125743, + "epoch": 0.4903652487599579, + "flos": 19797922972800.0, + "grad_norm": 1.7256067083752205, + "language_loss": 0.77014565, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.84733009, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12072754, + "step": 8156, + "time_per_iteration": 2.5398688316345215 + }, + { + "auxiliary_loss_clip": 0.06449247, + "auxiliary_loss_mlp": 0.01273385, + "balance_loss_clip": 0.06294699, + "balance_loss_mlp": 0.01259384, + "epoch": 0.49042537201262587, + "flos": 21768492568320.0, + "grad_norm": 1.9286441434498818, + "language_loss": 0.84019762, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.91742396, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.14007568, + "step": 8157, + "time_per_iteration": 2.5673582553863525 + }, + { + "auxiliary_loss_clip": 0.06449863, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06293592, + "balance_loss_mlp": 0.01253701, + "epoch": 0.49048549526529384, + "flos": 18959043421440.0, + "grad_norm": 1.7147218979138201, + "language_loss": 0.79903084, + "learning_rate": 2.158539129514956e-06, + "loss": 0.87619579, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12927246, + "step": 8158, + "time_per_iteration": 3.982774496078491 + }, + { + "auxiliary_loss_clip": 0.0645184, + "auxiliary_loss_mlp": 0.01273348, + "balance_loss_clip": 0.06292954, + "balance_loss_mlp": 0.01259615, + "epoch": 0.4905456185179618, + "flos": 26913633477120.0, + "grad_norm": 1.6654114756309404, + "language_loss": 0.69551659, + "learning_rate": 2.158150890381454e-06, + "loss": 0.77276844, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1373291, + "step": 8159, + "time_per_iteration": 2.6114954948425293 + }, + { + "auxiliary_loss_clip": 0.06446424, + "auxiliary_loss_mlp": 0.01266602, + "balance_loss_clip": 0.06292199, + "balance_loss_mlp": 0.01253591, + "epoch": 0.49060574177062977, + "flos": 20418567765120.0, + "grad_norm": 1.7624184717579066, + "language_loss": 0.73495585, + "learning_rate": 2.157762645250854e-06, + "loss": 0.81208611, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13006592, + "step": 8160, + "time_per_iteration": 2.5310287475585938 + }, + { + "auxiliary_loss_clip": 0.06446327, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06286773, + "balance_loss_mlp": 0.01254718, + "epoch": 0.4906658650232978, + "flos": 17499477150720.0, + "grad_norm": 1.9303786573731354, + "language_loss": 0.71921647, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.79636657, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13952637, + "step": 8161, + "time_per_iteration": 2.548387050628662 + }, + { + "auxiliary_loss_clip": 0.06438495, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06285487, + "balance_loss_mlp": 0.01257102, + "epoch": 0.49072598827596575, + "flos": 26621619598080.0, + "grad_norm": 1.7423183419157489, + "language_loss": 0.68838918, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.76547247, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12738037, + "step": 8162, + "time_per_iteration": 2.5565345287323 + }, + { + "auxiliary_loss_clip": 0.06445014, + "auxiliary_loss_mlp": 0.01271543, + "balance_loss_clip": 0.06284854, + "balance_loss_mlp": 0.01258048, + "epoch": 0.4907861115286337, + "flos": 20418861254400.0, + "grad_norm": 1.5998221011516633, + "language_loss": 0.6369257, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.7140913, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1350708, + "step": 8163, + "time_per_iteration": 2.545926094055176 + }, + { + "auxiliary_loss_clip": 0.0643242, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01260471, + "epoch": 0.4908462347813017, + "flos": 14069508992640.0, + "grad_norm": 1.9421890992027433, + "language_loss": 0.77104688, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.84810019, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12438965, + "step": 8164, + "time_per_iteration": 3.93280029296875 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01271936, + "balance_loss_clip": 0.06285694, + "balance_loss_mlp": 0.01258382, + "epoch": 0.49090635803396965, + "flos": 18741227932800.0, + "grad_norm": 1.56961735096587, + "language_loss": 0.77229172, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.84944236, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.13562012, + "step": 8165, + "time_per_iteration": 2.493861436843872 + }, + { + "auxiliary_loss_clip": 0.06434909, + "auxiliary_loss_mlp": 0.01271922, + "balance_loss_clip": 0.06283913, + "balance_loss_mlp": 0.01258922, + "epoch": 0.4909664812866376, + "flos": 20564784339840.0, + "grad_norm": 2.2518376482371862, + "language_loss": 0.77749753, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.85456586, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.13006592, + "step": 8166, + "time_per_iteration": 2.578685760498047 + }, + { + "auxiliary_loss_clip": 0.06343444, + "auxiliary_loss_mlp": 0.01254597, + "balance_loss_clip": 0.06276363, + "balance_loss_mlp": 0.01252508, + "epoch": 0.4910266045393056, + "flos": 54704006622720.0, + "grad_norm": 0.7970989298383858, + "language_loss": 0.54202092, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.61800134, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02090454, + "step": 8167, + "time_per_iteration": 3.1805777549743652 + }, + { + "auxiliary_loss_clip": 0.06435132, + "auxiliary_loss_mlp": 0.01271015, + "balance_loss_clip": 0.06282446, + "balance_loss_mlp": 0.01257902, + "epoch": 0.49108672779197354, + "flos": 16250892261120.0, + "grad_norm": 1.7548504171286585, + "language_loss": 0.86375958, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.94082105, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13110352, + "step": 8168, + "time_per_iteration": 2.5346431732177734 + }, + { + "auxiliary_loss_clip": 0.06439523, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06287682, + "balance_loss_mlp": 0.01257667, + "epoch": 0.4911468510446415, + "flos": 19831018135680.0, + "grad_norm": 1.6618595444085258, + "language_loss": 0.73708379, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.81418014, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12438965, + "step": 8169, + "time_per_iteration": 2.519845962524414 + }, + { + "auxiliary_loss_clip": 0.06435073, + "auxiliary_loss_mlp": 0.01267032, + "balance_loss_clip": 0.06282359, + "balance_loss_mlp": 0.01254795, + "epoch": 0.4912069742973095, + "flos": 21218650076160.0, + "grad_norm": 1.7105636772686297, + "language_loss": 0.78364748, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.86066854, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12237549, + "step": 8170, + "time_per_iteration": 2.5751500129699707 + }, + { + "auxiliary_loss_clip": 0.06441889, + "auxiliary_loss_mlp": 0.01268553, + "balance_loss_clip": 0.06285594, + "balance_loss_mlp": 0.01255547, + "epoch": 0.49126709754997744, + "flos": 19543280814720.0, + "grad_norm": 2.6389457816540527, + "language_loss": 0.76311809, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.84022248, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8171, + "time_per_iteration": 2.5004677772521973 + }, + { + "auxiliary_loss_clip": 0.06443939, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06285004, + "balance_loss_mlp": 0.01255947, + "epoch": 0.4913272208026454, + "flos": 12244568993280.0, + "grad_norm": 2.2552468133898684, + "language_loss": 0.81709123, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.89421463, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12457275, + "step": 8172, + "time_per_iteration": 2.5347814559936523 + }, + { + "auxiliary_loss_clip": 0.06338271, + "auxiliary_loss_mlp": 0.01256316, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.0125441, + "epoch": 0.49138734405531337, + "flos": 65484663661440.0, + "grad_norm": 0.6802144154671269, + "language_loss": 0.5333854, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.60933125, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01902771, + "step": 8173, + "time_per_iteration": 3.1376869678497314 + }, + { + "auxiliary_loss_clip": 0.06444144, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.0628697, + "balance_loss_mlp": 0.01253663, + "epoch": 0.4914474673079814, + "flos": 18444434371200.0, + "grad_norm": 1.9185770389222636, + "language_loss": 0.6246022, + "learning_rate": 2.152326591972107e-06, + "loss": 0.70171648, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1361084, + "step": 8174, + "time_per_iteration": 2.5815811157226562 + }, + { + "auxiliary_loss_clip": 0.06439996, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.0628511, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49150759056064935, + "flos": 21690772306560.0, + "grad_norm": 2.0568306898238045, + "language_loss": 0.69594127, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.77307451, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1282959, + "step": 8175, + "time_per_iteration": 2.5219566822052 + }, + { + "auxiliary_loss_clip": 0.06442218, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06288453, + "balance_loss_mlp": 0.01255021, + "epoch": 0.4915677138133173, + "flos": 22388969652480.0, + "grad_norm": 1.5433299767806794, + "language_loss": 0.74403, + "learning_rate": 2.151549919570068e-06, + "loss": 0.82113051, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12805176, + "step": 8176, + "time_per_iteration": 2.5598292350769043 + }, + { + "auxiliary_loss_clip": 0.0643885, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.0628263, + "balance_loss_mlp": 0.01259977, + "epoch": 0.4916278370659853, + "flos": 18408320461440.0, + "grad_norm": 1.8239688366126487, + "language_loss": 0.70529395, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.78241211, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12988281, + "step": 8177, + "time_per_iteration": 2.5329604148864746 + }, + { + "auxiliary_loss_clip": 0.06340313, + "auxiliary_loss_mlp": 0.01256045, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01253889, + "epoch": 0.49168796031865325, + "flos": 66630147701760.0, + "grad_norm": 0.6656640602529083, + "language_loss": 0.46068031, + "learning_rate": 2.150773224180877e-06, + "loss": 0.53664386, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02159119, + "step": 8178, + "time_per_iteration": 3.170982837677002 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01259597, + "epoch": 0.4917480835713212, + "flos": 20965601145600.0, + "grad_norm": 2.2617000627187407, + "language_loss": 0.6597743, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.73695886, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13299561, + "step": 8179, + "time_per_iteration": 2.5594394207000732 + }, + { + "auxiliary_loss_clip": 0.06447062, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.06285466, + "balance_loss_mlp": 0.01254088, + "epoch": 0.4918082068239892, + "flos": 15777386438400.0, + "grad_norm": 2.2633588866978442, + "language_loss": 0.70069337, + "learning_rate": 2.149996505922343e-06, + "loss": 0.77783871, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1338501, + "step": 8180, + "time_per_iteration": 2.489649772644043 + }, + { + "auxiliary_loss_clip": 0.0643749, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.06285596, + "balance_loss_mlp": 0.01254406, + "epoch": 0.49186833007665715, + "flos": 24611162659200.0, + "grad_norm": 1.7052643417851399, + "language_loss": 0.84654552, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.92359537, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.13098145, + "step": 8181, + "time_per_iteration": 2.570831298828125 + }, + { + "auxiliary_loss_clip": 0.06432545, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01259843, + "epoch": 0.4919284533293251, + "flos": 22097039627520.0, + "grad_norm": 1.9771399001803804, + "language_loss": 0.73092818, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.80796945, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8182, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.06435409, + "auxiliary_loss_mlp": 0.01272985, + "balance_loss_clip": 0.06280539, + "balance_loss_mlp": 0.01260826, + "epoch": 0.4919885765819931, + "flos": 23374820465280.0, + "grad_norm": 1.9470010509475855, + "language_loss": 0.73167384, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.80875778, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.1217041, + "step": 8183, + "time_per_iteration": 2.5529325008392334 + }, + { + "auxiliary_loss_clip": 0.06441429, + "auxiliary_loss_mlp": 0.01268017, + "balance_loss_clip": 0.06279727, + "balance_loss_mlp": 0.01254523, + "epoch": 0.49204869983466104, + "flos": 21366795294720.0, + "grad_norm": 2.013163662705091, + "language_loss": 0.77443838, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.85153282, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1348877, + "step": 8184, + "time_per_iteration": 2.508230209350586 + }, + { + "auxiliary_loss_clip": 0.06435518, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06281742, + "balance_loss_mlp": 0.01254523, + "epoch": 0.492108823087329, + "flos": 21149147514240.0, + "grad_norm": 2.3088868689892674, + "language_loss": 0.71377504, + "learning_rate": 2.148054610995789e-06, + "loss": 0.79079902, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12347412, + "step": 8185, + "time_per_iteration": 2.545316219329834 + }, + { + "auxiliary_loss_clip": 0.06437825, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06280625, + "balance_loss_mlp": 0.01253074, + "epoch": 0.49216894633999697, + "flos": 25123214160000.0, + "grad_norm": 1.8318004423040046, + "language_loss": 0.75395268, + "learning_rate": 2.147666215108831e-06, + "loss": 0.8309986, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.13684082, + "step": 8186, + "time_per_iteration": 2.5238165855407715 + }, + { + "auxiliary_loss_clip": 0.06435218, + "auxiliary_loss_mlp": 0.01274022, + "balance_loss_clip": 0.06281888, + "balance_loss_mlp": 0.01261124, + "epoch": 0.49222906959266494, + "flos": 22644534205440.0, + "grad_norm": 2.2257308208746975, + "language_loss": 0.68571508, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.76280749, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12908936, + "step": 8187, + "time_per_iteration": 2.561488151550293 + }, + { + "auxiliary_loss_clip": 0.06434098, + "auxiliary_loss_mlp": 0.01272206, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01259653, + "epoch": 0.49228919284533296, + "flos": 20416471413120.0, + "grad_norm": 1.3887162782350388, + "language_loss": 0.67211652, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.7491796, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12567139, + "step": 8188, + "time_per_iteration": 2.5019164085388184 + }, + { + "auxiliary_loss_clip": 0.06437577, + "auxiliary_loss_mlp": 0.01267268, + "balance_loss_clip": 0.06282844, + "balance_loss_mlp": 0.012549, + "epoch": 0.4923493160980009, + "flos": 27129142978560.0, + "grad_norm": 1.6466242872646388, + "language_loss": 0.74921268, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.8262611, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12365723, + "step": 8189, + "time_per_iteration": 2.6160171031951904 + }, + { + "auxiliary_loss_clip": 0.06432211, + "auxiliary_loss_mlp": 0.01271904, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01259363, + "epoch": 0.4924094393506689, + "flos": 35745522981120.0, + "grad_norm": 1.6094215463667148, + "language_loss": 0.64780444, + "learning_rate": 2.146112575713104e-06, + "loss": 0.72484565, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12542725, + "step": 8190, + "time_per_iteration": 4.0641090869903564 + }, + { + "auxiliary_loss_clip": 0.06438321, + "auxiliary_loss_mlp": 0.01273117, + "balance_loss_clip": 0.06285122, + "balance_loss_mlp": 0.01260486, + "epoch": 0.49246956260333685, + "flos": 20418735473280.0, + "grad_norm": 1.8613448606205585, + "language_loss": 0.71446037, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.79157472, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12628174, + "step": 8191, + "time_per_iteration": 2.5388033390045166 + }, + { + "auxiliary_loss_clip": 0.06437817, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282701, + "balance_loss_mlp": 0.01253193, + "epoch": 0.4925296858560048, + "flos": 38985152590080.0, + "grad_norm": 1.8396866027790106, + "language_loss": 0.72404003, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.80107331, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12335205, + "step": 8192, + "time_per_iteration": 2.696115255355835 + }, + { + "auxiliary_loss_clip": 0.06334923, + "auxiliary_loss_mlp": 0.01254622, + "balance_loss_clip": 0.06267789, + "balance_loss_mlp": 0.01252217, + "epoch": 0.4925898091086728, + "flos": 64300367652480.0, + "grad_norm": 0.7283072322766662, + "language_loss": 0.51975358, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.59564906, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02401733, + "step": 8193, + "time_per_iteration": 4.540759086608887 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01259417, + "epoch": 0.49264993236134075, + "flos": 23042541899520.0, + "grad_norm": 1.3982393371006636, + "language_loss": 0.77103728, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.84810621, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12554932, + "step": 8194, + "time_per_iteration": 2.585632085800171 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257376, + "epoch": 0.4927100556140087, + "flos": 24725248392960.0, + "grad_norm": 2.1551580003064186, + "language_loss": 0.70539922, + "learning_rate": 2.144170401915341e-06, + "loss": 0.78244197, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12329102, + "step": 8195, + "time_per_iteration": 2.5881664752960205 + }, + { + "auxiliary_loss_clip": 0.06438025, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01257687, + "epoch": 0.4927701788666767, + "flos": 23510932623360.0, + "grad_norm": 2.3036054872688765, + "language_loss": 0.81165189, + "learning_rate": 2.143781950696001e-06, + "loss": 0.88872838, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11932373, + "step": 8196, + "time_per_iteration": 2.5550785064697266 + }, + { + "auxiliary_loss_clip": 0.06437081, + "auxiliary_loss_mlp": 0.01270899, + "balance_loss_clip": 0.06279114, + "balance_loss_mlp": 0.01258311, + "epoch": 0.49283030211934464, + "flos": 22935374127360.0, + "grad_norm": 1.9095456135696567, + "language_loss": 0.70909548, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.78617525, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12597656, + "step": 8197, + "time_per_iteration": 4.003530263900757 + }, + { + "auxiliary_loss_clip": 0.06434973, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.0628255, + "balance_loss_mlp": 0.01259699, + "epoch": 0.4928904253720126, + "flos": 16878622723200.0, + "grad_norm": 1.745870627956974, + "language_loss": 0.84271383, + "learning_rate": 2.143005031915374e-06, + "loss": 0.91977608, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.11553955, + "step": 8198, + "time_per_iteration": 2.498107671737671 + }, + { + "auxiliary_loss_clip": 0.06443786, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06287393, + "balance_loss_mlp": 0.01254521, + "epoch": 0.4929505486246806, + "flos": 14871855363840.0, + "grad_norm": 1.7338591596570678, + "language_loss": 0.76126587, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.83838832, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13946533, + "step": 8199, + "time_per_iteration": 2.5254313945770264 + }, + { + "auxiliary_loss_clip": 0.06436033, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06279432, + "balance_loss_mlp": 0.01252808, + "epoch": 0.49301067187734854, + "flos": 23849206755840.0, + "grad_norm": 1.3683337876027823, + "language_loss": 0.60070461, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.67772967, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13647461, + "step": 8200, + "time_per_iteration": 2.54241943359375 + }, + { + "auxiliary_loss_clip": 0.06429607, + "auxiliary_loss_mlp": 0.01273188, + "balance_loss_clip": 0.06281705, + "balance_loss_mlp": 0.01261541, + "epoch": 0.49307079513001656, + "flos": 22497730652160.0, + "grad_norm": 1.4845406915411774, + "language_loss": 0.79454738, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.87157536, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11657715, + "step": 8201, + "time_per_iteration": 2.590289831161499 + }, + { + "auxiliary_loss_clip": 0.0644393, + "auxiliary_loss_mlp": 0.01272695, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01259171, + "epoch": 0.4931309183826845, + "flos": 15930059777280.0, + "grad_norm": 1.9752291134223394, + "language_loss": 0.66993362, + "learning_rate": 2.141451129398785e-06, + "loss": 0.74709988, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13525391, + "step": 8202, + "time_per_iteration": 2.5706307888031006 + }, + { + "auxiliary_loss_clip": 0.06429332, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06277282, + "balance_loss_mlp": 0.01256055, + "epoch": 0.4931910416353525, + "flos": 27316588561920.0, + "grad_norm": 1.8969992308716948, + "language_loss": 0.75337243, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.83034456, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11834717, + "step": 8203, + "time_per_iteration": 4.0727972984313965 + }, + { + "auxiliary_loss_clip": 0.06434371, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01253626, + "epoch": 0.49325116488802045, + "flos": 20811166871040.0, + "grad_norm": 2.0494104605673935, + "language_loss": 0.80605292, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.8830539, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12103271, + "step": 8204, + "time_per_iteration": 2.6136350631713867 + }, + { + "auxiliary_loss_clip": 0.0643463, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06280951, + "balance_loss_mlp": 0.01254664, + "epoch": 0.4933112881406884, + "flos": 19872247144320.0, + "grad_norm": 1.7256783924705517, + "language_loss": 0.65881336, + "learning_rate": 2.140285646139455e-06, + "loss": 0.73583329, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12695312, + "step": 8205, + "time_per_iteration": 2.5172812938690186 + }, + { + "auxiliary_loss_clip": 0.06445079, + "auxiliary_loss_mlp": 0.01273568, + "balance_loss_clip": 0.06283986, + "balance_loss_mlp": 0.0125971, + "epoch": 0.4933714113933564, + "flos": 21833215447680.0, + "grad_norm": 1.6546444342030124, + "language_loss": 0.66620767, + "learning_rate": 2.139897141060744e-06, + "loss": 0.74339426, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13861084, + "step": 8206, + "time_per_iteration": 2.556596040725708 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01253539, + "epoch": 0.49343153464602435, + "flos": 27897304083840.0, + "grad_norm": 1.8364733010130068, + "language_loss": 0.77070463, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.84770155, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.11584473, + "step": 8207, + "time_per_iteration": 2.591074228286743 + }, + { + "auxiliary_loss_clip": 0.06430385, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01256174, + "epoch": 0.4934916578986923, + "flos": 24688002453120.0, + "grad_norm": 2.876199477758729, + "language_loss": 0.60526079, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.68224895, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12261963, + "step": 8208, + "time_per_iteration": 2.5641872882843018 + }, + { + "auxiliary_loss_clip": 0.06432977, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279885, + "balance_loss_mlp": 0.01256548, + "epoch": 0.4935517811513603, + "flos": 23412024478080.0, + "grad_norm": 2.3268226049750025, + "language_loss": 0.79136336, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.86838233, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12365723, + "step": 8209, + "time_per_iteration": 2.5345427989959717 + }, + { + "auxiliary_loss_clip": 0.06431048, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01253917, + "epoch": 0.49361190440402825, + "flos": 21950948833920.0, + "grad_norm": 3.2965997735856423, + "language_loss": 0.79514015, + "learning_rate": 2.138343067844089e-06, + "loss": 0.87211347, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12359619, + "step": 8210, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06438643, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06280634, + "balance_loss_mlp": 0.01256629, + "epoch": 0.4936720276566962, + "flos": 25122124056960.0, + "grad_norm": 2.539502696257949, + "language_loss": 0.81421793, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8912915, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12084961, + "step": 8211, + "time_per_iteration": 2.5667943954467773 + }, + { + "auxiliary_loss_clip": 0.06429391, + "auxiliary_loss_mlp": 0.0126729, + "balance_loss_clip": 0.06274866, + "balance_loss_mlp": 0.01254803, + "epoch": 0.4937321509093642, + "flos": 26366055045120.0, + "grad_norm": 2.1078758653058913, + "language_loss": 0.91783321, + "learning_rate": 2.137565999700933e-06, + "loss": 0.99480009, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12506104, + "step": 8212, + "time_per_iteration": 2.5892627239227295 + }, + { + "auxiliary_loss_clip": 0.06437102, + "auxiliary_loss_mlp": 0.01269581, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01257511, + "epoch": 0.49379227416203214, + "flos": 22967211479040.0, + "grad_norm": 1.9203573298750467, + "language_loss": 0.65474772, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.7318145, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1206665, + "step": 8213, + "time_per_iteration": 2.5766966342926025 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06281263, + "balance_loss_mlp": 0.01254957, + "epoch": 0.49385239741470016, + "flos": 32497340256000.0, + "grad_norm": 5.5178519689557435, + "language_loss": 0.76015925, + "learning_rate": 2.136788910691711e-06, + "loss": 0.83718324, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1217041, + "step": 8214, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.06435767, + "auxiliary_loss_mlp": 0.01267382, + "balance_loss_clip": 0.06282468, + "balance_loss_mlp": 0.0125508, + "epoch": 0.4939125206673681, + "flos": 22499575441920.0, + "grad_norm": 1.6727543381074526, + "language_loss": 0.84167933, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.91871083, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12298584, + "step": 8215, + "time_per_iteration": 2.6213715076446533 + }, + { + "auxiliary_loss_clip": 0.06426814, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06280927, + "balance_loss_mlp": 0.0125696, + "epoch": 0.4939726439200361, + "flos": 31184493684480.0, + "grad_norm": 1.9918722360209278, + "language_loss": 0.83712834, + "learning_rate": 2.136011800934292e-06, + "loss": 0.91407919, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11315918, + "step": 8216, + "time_per_iteration": 2.619922637939453 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06283373, + "balance_loss_mlp": 0.01260614, + "epoch": 0.49403276717270406, + "flos": 22680773896320.0, + "grad_norm": 1.6954468061355052, + "language_loss": 0.75099367, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.82805896, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11791992, + "step": 8217, + "time_per_iteration": 2.5473809242248535 + }, + { + "auxiliary_loss_clip": 0.06434639, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06285703, + "balance_loss_mlp": 0.01258422, + "epoch": 0.494092890425372, + "flos": 20747408313600.0, + "grad_norm": 1.6176152886760666, + "language_loss": 0.78781378, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.86487138, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12689209, + "step": 8218, + "time_per_iteration": 2.542994976043701 + }, + { + "auxiliary_loss_clip": 0.06433167, + "auxiliary_loss_mlp": 0.01265257, + "balance_loss_clip": 0.06283546, + "balance_loss_mlp": 0.01253628, + "epoch": 0.49415301367804, + "flos": 18374889882240.0, + "grad_norm": 2.39829798701753, + "language_loss": 0.77065396, + "learning_rate": 2.134846097653142e-06, + "loss": 0.84763819, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11639404, + "step": 8219, + "time_per_iteration": 2.5450475215911865 + }, + { + "auxiliary_loss_clip": 0.06439486, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.06285974, + "balance_loss_mlp": 0.01258321, + "epoch": 0.49421313693070795, + "flos": 17536471528320.0, + "grad_norm": 2.258549541306087, + "language_loss": 0.62705898, + "learning_rate": 2.134457519646357e-06, + "loss": 0.70415157, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11462402, + "step": 8220, + "time_per_iteration": 2.5296928882598877 + }, + { + "auxiliary_loss_clip": 0.06433114, + "auxiliary_loss_mlp": 0.01270633, + "balance_loss_clip": 0.06280304, + "balance_loss_mlp": 0.01259076, + "epoch": 0.4942732601833759, + "flos": 20818210613760.0, + "grad_norm": 1.8931623619102378, + "language_loss": 0.72802091, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.80505836, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11572266, + "step": 8221, + "time_per_iteration": 2.521430253982544 + }, + { + "auxiliary_loss_clip": 0.06441319, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06292681, + "balance_loss_mlp": 0.01258761, + "epoch": 0.4943333834360439, + "flos": 15054269702400.0, + "grad_norm": 1.6896047494674526, + "language_loss": 0.79253769, + "learning_rate": 2.133680348351595e-06, + "loss": 0.86965781, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11920166, + "step": 8222, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06434612, + "auxiliary_loss_mlp": 0.01272431, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49439350668871185, + "flos": 16075899008640.0, + "grad_norm": 6.490136916654426, + "language_loss": 0.72483402, + "learning_rate": 2.133291755093088e-06, + "loss": 0.80190444, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.1192627, + "step": 8223, + "time_per_iteration": 2.457361936569214 + }, + { + "auxiliary_loss_clip": 0.06444422, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06287469, + "balance_loss_mlp": 0.01257367, + "epoch": 0.4944536299413798, + "flos": 20885281407360.0, + "grad_norm": 1.6318042764148617, + "language_loss": 0.75256205, + "learning_rate": 2.132903156780144e-06, + "loss": 0.82971096, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.13122559, + "step": 8224, + "time_per_iteration": 2.5326499938964844 + }, + { + "auxiliary_loss_clip": 0.06441943, + "auxiliary_loss_mlp": 0.01267954, + "balance_loss_clip": 0.06287307, + "balance_loss_mlp": 0.01255646, + "epoch": 0.4945137531940478, + "flos": 26615162833920.0, + "grad_norm": 2.58625148433793, + "language_loss": 0.64002287, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.71712184, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12322998, + "step": 8225, + "time_per_iteration": 2.555088996887207 + }, + { + "auxiliary_loss_clip": 0.06438252, + "auxiliary_loss_mlp": 0.01269636, + "balance_loss_clip": 0.06283222, + "balance_loss_mlp": 0.01258007, + "epoch": 0.49457387644671574, + "flos": 23995004060160.0, + "grad_norm": 2.0569415863505554, + "language_loss": 0.77084112, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.84792, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11627197, + "step": 8226, + "time_per_iteration": 2.557900905609131 + }, + { + "auxiliary_loss_clip": 0.06436731, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06281079, + "balance_loss_mlp": 0.01256958, + "epoch": 0.49463399969938376, + "flos": 26983387601280.0, + "grad_norm": 1.6446627405679832, + "language_loss": 0.71402973, + "learning_rate": 2.131737331662051e-06, + "loss": 0.79110235, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13568115, + "step": 8227, + "time_per_iteration": 2.533468246459961 + }, + { + "auxiliary_loss_clip": 0.06441461, + "auxiliary_loss_mlp": 0.01270684, + "balance_loss_clip": 0.06282251, + "balance_loss_mlp": 0.01258477, + "epoch": 0.49469412295205173, + "flos": 29689610117760.0, + "grad_norm": 1.6469495440568809, + "language_loss": 0.7179364, + "learning_rate": 2.131348713278718e-06, + "loss": 0.79505783, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12213135, + "step": 8228, + "time_per_iteration": 2.621777296066284 + }, + { + "auxiliary_loss_clip": 0.06432875, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_clip": 0.06283268, + "balance_loss_mlp": 0.01259768, + "epoch": 0.4947542462047197, + "flos": 24138285742080.0, + "grad_norm": 1.3686875437171686, + "language_loss": 0.84044397, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.91748512, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1149292, + "step": 8229, + "time_per_iteration": 2.620849609375 + }, + { + "auxiliary_loss_clip": 0.06443636, + "auxiliary_loss_mlp": 0.01271474, + "balance_loss_clip": 0.0628624, + "balance_loss_mlp": 0.01258134, + "epoch": 0.49481436945738766, + "flos": 20050804195200.0, + "grad_norm": 2.3211713476829656, + "language_loss": 0.75208747, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.82923853, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13342285, + "step": 8230, + "time_per_iteration": 3.9126293659210205 + }, + { + "auxiliary_loss_clip": 0.06439002, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01256432, + "epoch": 0.4948744927100556, + "flos": 15675040275840.0, + "grad_norm": 1.9615207178823395, + "language_loss": 0.80548179, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.88256031, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1239624, + "step": 8231, + "time_per_iteration": 2.525049924850464 + }, + { + "auxiliary_loss_clip": 0.06329959, + "auxiliary_loss_mlp": 0.01257972, + "balance_loss_clip": 0.06262948, + "balance_loss_mlp": 0.0125556, + "epoch": 0.4949346159627236, + "flos": 68893611644160.0, + "grad_norm": 0.7512177245674743, + "language_loss": 0.60052431, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.67640364, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02409363, + "step": 8232, + "time_per_iteration": 4.674450159072876 + }, + { + "auxiliary_loss_clip": 0.06440374, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.0125631, + "epoch": 0.49499473921539155, + "flos": 24797182723200.0, + "grad_norm": 1.782814520641974, + "language_loss": 0.68933427, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.76643485, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13366699, + "step": 8233, + "time_per_iteration": 2.574759006500244 + }, + { + "auxiliary_loss_clip": 0.06426412, + "auxiliary_loss_mlp": 0.01270358, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01258508, + "epoch": 0.4950548624680595, + "flos": 32716161993600.0, + "grad_norm": 2.8586701341507355, + "language_loss": 0.6684472, + "learning_rate": 2.129016898898633e-06, + "loss": 0.74541491, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1184082, + "step": 8234, + "time_per_iteration": 2.653381824493408 + }, + { + "auxiliary_loss_clip": 0.06329186, + "auxiliary_loss_mlp": 0.0125637, + "balance_loss_clip": 0.06261852, + "balance_loss_mlp": 0.01254119, + "epoch": 0.4951149857207275, + "flos": 50100616287360.0, + "grad_norm": 0.7779673724008701, + "language_loss": 0.58149666, + "learning_rate": 2.128628245959482e-06, + "loss": 0.65735215, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02255249, + "step": 8235, + "time_per_iteration": 3.0858991146087646 + }, + { + "auxiliary_loss_clip": 0.06437027, + "auxiliary_loss_mlp": 0.01272544, + "balance_loss_clip": 0.06281243, + "balance_loss_mlp": 0.01259401, + "epoch": 0.49517510897339545, + "flos": 22243340056320.0, + "grad_norm": 1.7279160321905627, + "language_loss": 0.77504063, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.85213637, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13134766, + "step": 8236, + "time_per_iteration": 2.5753977298736572 + }, + { + "auxiliary_loss_clip": 0.06428996, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.06278376, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4952352322260634, + "flos": 25381126627200.0, + "grad_norm": 1.6842676088909172, + "language_loss": 0.72880518, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.80577087, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11529541, + "step": 8237, + "time_per_iteration": 4.036882400512695 + }, + { + "auxiliary_loss_clip": 0.06434725, + "auxiliary_loss_mlp": 0.01270554, + "balance_loss_clip": 0.06283747, + "balance_loss_mlp": 0.0125787, + "epoch": 0.4952953554787314, + "flos": 24615732706560.0, + "grad_norm": 2.2000126991913285, + "language_loss": 0.75703216, + "learning_rate": 2.127462257935406e-06, + "loss": 0.83408493, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12664795, + "step": 8238, + "time_per_iteration": 2.549431085586548 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257081, + "epoch": 0.49535547873139935, + "flos": 17317020885120.0, + "grad_norm": 2.278500195677925, + "language_loss": 0.74391794, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.82096863, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12615967, + "step": 8239, + "time_per_iteration": 2.571343183517456 + }, + { + "auxiliary_loss_clip": 0.06438212, + "auxiliary_loss_mlp": 0.01271609, + "balance_loss_clip": 0.06280148, + "balance_loss_mlp": 0.01257917, + "epoch": 0.4954156019840673, + "flos": 20746527845760.0, + "grad_norm": 2.0000035114581927, + "language_loss": 0.79093564, + "learning_rate": 2.126684908394552e-06, + "loss": 0.86803377, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13684082, + "step": 8240, + "time_per_iteration": 2.531712532043457 + }, + { + "auxiliary_loss_clip": 0.06430051, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06279683, + "balance_loss_mlp": 0.0125594, + "epoch": 0.49547572523673533, + "flos": 12825200661120.0, + "grad_norm": 2.1298693498085592, + "language_loss": 0.86484092, + "learning_rate": 2.126296226410898e-06, + "loss": 0.94181418, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11334229, + "step": 8241, + "time_per_iteration": 2.5414860248565674 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01270719, + "balance_loss_clip": 0.06279866, + "balance_loss_mlp": 0.01260003, + "epoch": 0.4955358484894033, + "flos": 15602602821120.0, + "grad_norm": 1.7100085929309539, + "language_loss": 0.77987742, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.85685694, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10723877, + "step": 8242, + "time_per_iteration": 2.500761032104492 + }, + { + "auxiliary_loss_clip": 0.06436419, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06284536, + "balance_loss_mlp": 0.0125308, + "epoch": 0.49559597174207126, + "flos": 26470832976000.0, + "grad_norm": 1.8102794432235507, + "language_loss": 0.67317849, + "learning_rate": 2.125518848090833e-06, + "loss": 0.75019407, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.1206665, + "step": 8243, + "time_per_iteration": 4.062270641326904 + }, + { + "auxiliary_loss_clip": 0.06430024, + "auxiliary_loss_mlp": 0.01269105, + "balance_loss_clip": 0.06279217, + "balance_loss_mlp": 0.0125722, + "epoch": 0.4956560949947392, + "flos": 23154824770560.0, + "grad_norm": 2.721585758888369, + "language_loss": 0.68786383, + "learning_rate": 2.125130151783901e-06, + "loss": 0.76485521, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11889648, + "step": 8244, + "time_per_iteration": 2.55732798576355 + }, + { + "auxiliary_loss_clip": 0.06434646, + "auxiliary_loss_mlp": 0.01266504, + "balance_loss_clip": 0.06280981, + "balance_loss_mlp": 0.01254541, + "epoch": 0.4957162182474072, + "flos": 20779119884160.0, + "grad_norm": 2.485823072522516, + "language_loss": 0.75575739, + "learning_rate": 2.12474145073202e-06, + "loss": 0.83276892, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.11962891, + "step": 8245, + "time_per_iteration": 2.5086231231689453 + }, + { + "auxiliary_loss_clip": 0.06428742, + "auxiliary_loss_mlp": 0.01268325, + "balance_loss_clip": 0.06280199, + "balance_loss_mlp": 0.01256762, + "epoch": 0.49577634150007516, + "flos": 18740179756800.0, + "grad_norm": 1.8890947976192427, + "language_loss": 0.81602311, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.89299381, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11572266, + "step": 8246, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.06440324, + "auxiliary_loss_mlp": 0.01268715, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01256347, + "epoch": 0.4958364647527431, + "flos": 25560815708160.0, + "grad_norm": 1.7539344008969155, + "language_loss": 0.84379256, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.92088294, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12359619, + "step": 8247, + "time_per_iteration": 2.5563809871673584 + }, + { + "auxiliary_loss_clip": 0.06436694, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06282616, + "balance_loss_mlp": 0.01257798, + "epoch": 0.4958965880054111, + "flos": 24432144410880.0, + "grad_norm": 2.2837128243369658, + "language_loss": 0.84184051, + "learning_rate": 2.123575319254087e-06, + "loss": 0.91890538, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12011719, + "step": 8248, + "time_per_iteration": 2.566392660140991 + }, + { + "auxiliary_loss_clip": 0.0643697, + "auxiliary_loss_mlp": 0.01268541, + "balance_loss_clip": 0.06282248, + "balance_loss_mlp": 0.01256024, + "epoch": 0.49595671125807905, + "flos": 25090622121600.0, + "grad_norm": 1.727142692455913, + "language_loss": 0.73609596, + "learning_rate": 2.123186599369812e-06, + "loss": 0.813151, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12518311, + "step": 8249, + "time_per_iteration": 2.548520088195801 + }, + { + "auxiliary_loss_clip": 0.06441288, + "auxiliary_loss_mlp": 0.01269234, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256365, + "epoch": 0.496016834510747, + "flos": 16441524299520.0, + "grad_norm": 2.7229998624345115, + "language_loss": 0.76506901, + "learning_rate": 2.122797874814289e-06, + "loss": 0.84217423, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12860107, + "step": 8250, + "time_per_iteration": 2.524714231491089 + }, + { + "auxiliary_loss_clip": 0.06438759, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06282068, + "balance_loss_mlp": 0.01256551, + "epoch": 0.496076957763415, + "flos": 23444197246080.0, + "grad_norm": 1.6959600873244032, + "language_loss": 0.7021333, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.77921373, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12738037, + "step": 8251, + "time_per_iteration": 2.531841516494751 + }, + { + "auxiliary_loss_clip": 0.06437311, + "auxiliary_loss_mlp": 0.01271839, + "balance_loss_clip": 0.06285296, + "balance_loss_mlp": 0.01259871, + "epoch": 0.49613708101608295, + "flos": 16915113976320.0, + "grad_norm": 1.8201441219473296, + "language_loss": 0.7993809, + "learning_rate": 2.122020411748461e-06, + "loss": 0.87647241, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11975098, + "step": 8252, + "time_per_iteration": 2.5806944370269775 + }, + { + "auxiliary_loss_clip": 0.06434863, + "auxiliary_loss_mlp": 0.01270348, + "balance_loss_clip": 0.06282027, + "balance_loss_mlp": 0.01255905, + "epoch": 0.4961972042687509, + "flos": 16623729002880.0, + "grad_norm": 1.8109031344325417, + "language_loss": 0.81898755, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.89603961, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.14447021, + "step": 8253, + "time_per_iteration": 2.4936153888702393 + }, + { + "auxiliary_loss_clip": 0.0643016, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06279143, + "balance_loss_mlp": 0.01253139, + "epoch": 0.49625732752141893, + "flos": 28965529059840.0, + "grad_norm": 1.4049535238306547, + "language_loss": 0.67659622, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.7535435, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11437988, + "step": 8254, + "time_per_iteration": 2.681328058242798 + }, + { + "auxiliary_loss_clip": 0.06436362, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.01254729, + "epoch": 0.4963174507740869, + "flos": 23119046277120.0, + "grad_norm": 6.04751780380752, + "language_loss": 0.74611968, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.82315457, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12384033, + "step": 8255, + "time_per_iteration": 2.5586442947387695 + }, + { + "auxiliary_loss_clip": 0.06430424, + "auxiliary_loss_mlp": 0.01268774, + "balance_loss_clip": 0.06278734, + "balance_loss_mlp": 0.01256972, + "epoch": 0.49637757402675486, + "flos": 13922998928640.0, + "grad_norm": 1.9051204382469373, + "language_loss": 0.81712639, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.89411843, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11798096, + "step": 8256, + "time_per_iteration": 2.525191307067871 + }, + { + "auxiliary_loss_clip": 0.06430264, + "auxiliary_loss_mlp": 0.01267515, + "balance_loss_clip": 0.06279526, + "balance_loss_mlp": 0.01256035, + "epoch": 0.49643769727942283, + "flos": 22315442094720.0, + "grad_norm": 1.4246388626256767, + "language_loss": 0.81285727, + "learning_rate": 2.120076673368901e-06, + "loss": 0.889835, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8257, + "time_per_iteration": 2.5366289615631104 + }, + { + "auxiliary_loss_clip": 0.06441522, + "auxiliary_loss_mlp": 0.01265551, + "balance_loss_clip": 0.06281207, + "balance_loss_mlp": 0.01253153, + "epoch": 0.4964978205320908, + "flos": 19506328364160.0, + "grad_norm": 1.7556989119603337, + "language_loss": 0.66651785, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.74358857, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1239624, + "step": 8258, + "time_per_iteration": 2.567802667617798 + }, + { + "auxiliary_loss_clip": 0.06427691, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06277505, + "balance_loss_mlp": 0.0125607, + "epoch": 0.49655794378475876, + "flos": 23442562091520.0, + "grad_norm": 1.5238866764667018, + "language_loss": 0.7778039, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.85474873, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.10723877, + "step": 8259, + "time_per_iteration": 2.5521552562713623 + }, + { + "auxiliary_loss_clip": 0.06430545, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06279439, + "balance_loss_mlp": 0.01254954, + "epoch": 0.4966180670374267, + "flos": 26837967640320.0, + "grad_norm": 1.4589343239403403, + "language_loss": 0.78972054, + "learning_rate": 2.1189103755834e-06, + "loss": 0.86669362, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11816406, + "step": 8260, + "time_per_iteration": 2.6012649536132812 + }, + { + "auxiliary_loss_clip": 0.06434717, + "auxiliary_loss_mlp": 0.01267655, + "balance_loss_clip": 0.06279895, + "balance_loss_mlp": 0.01255055, + "epoch": 0.4966781902900947, + "flos": 22014413902080.0, + "grad_norm": 2.8586716221878206, + "language_loss": 0.76515198, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12591553, + "step": 8261, + "time_per_iteration": 2.4737415313720703 + }, + { + "auxiliary_loss_clip": 0.06427643, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01255772, + "epoch": 0.49673831354276266, + "flos": 26220509303040.0, + "grad_norm": 1.7291004140234418, + "language_loss": 0.89456958, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.97151601, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11230469, + "step": 8262, + "time_per_iteration": 2.613236665725708 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01268648, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4967984367954306, + "flos": 23188464984960.0, + "grad_norm": 1.4347791599980126, + "language_loss": 0.73918176, + "learning_rate": 2.11774403721606e-06, + "loss": 0.81618452, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11834717, + "step": 8263, + "time_per_iteration": 2.595635414123535 + }, + { + "auxiliary_loss_clip": 0.06439725, + "auxiliary_loss_mlp": 0.01274389, + "balance_loss_clip": 0.06283052, + "balance_loss_mlp": 0.01260239, + "epoch": 0.4968585600480986, + "flos": 19287506626560.0, + "grad_norm": 2.258936930728745, + "language_loss": 0.69678748, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.77392858, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.14147949, + "step": 8264, + "time_per_iteration": 2.5913755893707275 + }, + { + "auxiliary_loss_clip": 0.06438377, + "auxiliary_loss_mlp": 0.01267325, + "balance_loss_clip": 0.06281792, + "balance_loss_mlp": 0.01255136, + "epoch": 0.49691868330076655, + "flos": 22535312008320.0, + "grad_norm": 1.388736059607974, + "language_loss": 0.65131235, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.72836947, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12188721, + "step": 8265, + "time_per_iteration": 2.528193473815918 + }, + { + "auxiliary_loss_clip": 0.06333993, + "auxiliary_loss_mlp": 0.01255399, + "balance_loss_clip": 0.06266748, + "balance_loss_mlp": 0.01253268, + "epoch": 0.4969788065534345, + "flos": 66598897328640.0, + "grad_norm": 0.8036364801041208, + "language_loss": 0.53402334, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.60991728, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02133179, + "step": 8266, + "time_per_iteration": 3.1838197708129883 + }, + { + "auxiliary_loss_clip": 0.06428756, + "auxiliary_loss_mlp": 0.01272627, + "balance_loss_clip": 0.06282037, + "balance_loss_mlp": 0.01260592, + "epoch": 0.49703892980610254, + "flos": 24066099849600.0, + "grad_norm": 1.4975664699088878, + "language_loss": 0.79899192, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.87600571, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12036133, + "step": 8267, + "time_per_iteration": 2.556995391845703 + }, + { + "auxiliary_loss_clip": 0.06434017, + "auxiliary_loss_mlp": 0.01269443, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.01256295, + "epoch": 0.4970990530587705, + "flos": 29132807736960.0, + "grad_norm": 3.0454644456900155, + "language_loss": 0.75843596, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.83547056, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.13146973, + "step": 8268, + "time_per_iteration": 2.6049721240997314 + }, + { + "auxiliary_loss_clip": 0.06435575, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01257185, + "epoch": 0.49715917631143847, + "flos": 46036811047680.0, + "grad_norm": 1.4862794016102487, + "language_loss": 0.68007714, + "learning_rate": 2.115411240328073e-06, + "loss": 0.75713372, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12902832, + "step": 8269, + "time_per_iteration": 4.128691911697388 + }, + { + "auxiliary_loss_clip": 0.06433591, + "auxiliary_loss_mlp": 0.01270109, + "balance_loss_clip": 0.06283623, + "balance_loss_mlp": 0.01258444, + "epoch": 0.49721929956410643, + "flos": 20197104624000.0, + "grad_norm": 1.5327488108804688, + "language_loss": 0.85668087, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.93371785, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11669922, + "step": 8270, + "time_per_iteration": 2.518367290496826 + }, + { + "auxiliary_loss_clip": 0.06438391, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.01258443, + "epoch": 0.4972794228167744, + "flos": 21660108912000.0, + "grad_norm": 1.8194061326909323, + "language_loss": 0.71364737, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7907263, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1105957, + "step": 8271, + "time_per_iteration": 2.5573620796203613 + }, + { + "auxiliary_loss_clip": 0.06437098, + "auxiliary_loss_mlp": 0.01269156, + "balance_loss_clip": 0.06284092, + "balance_loss_mlp": 0.0125646, + "epoch": 0.49733954606944236, + "flos": 24286598668800.0, + "grad_norm": 1.3024187792808712, + "language_loss": 0.78511107, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.86217368, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12677002, + "step": 8272, + "time_per_iteration": 4.061326742172241 + }, + { + "auxiliary_loss_clip": 0.06438889, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06285517, + "balance_loss_mlp": 0.01257548, + "epoch": 0.4973996693221103, + "flos": 37861722172800.0, + "grad_norm": 2.25975995369767, + "language_loss": 0.66725254, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.7443465, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12957764, + "step": 8273, + "time_per_iteration": 2.645908832550049 + }, + { + "auxiliary_loss_clip": 0.06436634, + "auxiliary_loss_mlp": 0.01276274, + "balance_loss_clip": 0.06285357, + "balance_loss_mlp": 0.01264109, + "epoch": 0.4974597925747783, + "flos": 21367885397760.0, + "grad_norm": 1.5281958400790516, + "language_loss": 0.78156513, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.8586942, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12182617, + "step": 8274, + "time_per_iteration": 2.535804271697998 + }, + { + "auxiliary_loss_clip": 0.06437881, + "auxiliary_loss_mlp": 0.0127292, + "balance_loss_clip": 0.06281041, + "balance_loss_mlp": 0.01259992, + "epoch": 0.49751991582744626, + "flos": 30746137449600.0, + "grad_norm": 1.6098675264323796, + "language_loss": 0.76012516, + "learning_rate": 2.113078285889493e-06, + "loss": 0.83723313, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12939453, + "step": 8275, + "time_per_iteration": 2.5787549018859863 + }, + { + "auxiliary_loss_clip": 0.06438003, + "auxiliary_loss_mlp": 0.01271635, + "balance_loss_clip": 0.06282246, + "balance_loss_mlp": 0.01257789, + "epoch": 0.4975800390801142, + "flos": 14105748683520.0, + "grad_norm": 1.8196816586022186, + "language_loss": 0.84079218, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.91788852, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1385498, + "step": 8276, + "time_per_iteration": 2.5156893730163574 + }, + { + "auxiliary_loss_clip": 0.06426419, + "auxiliary_loss_mlp": 0.01277009, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01265398, + "epoch": 0.4976401623327822, + "flos": 24214203141120.0, + "grad_norm": 1.3141436658277077, + "language_loss": 0.70087981, + "learning_rate": 2.112300599949172e-06, + "loss": 0.77791417, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.1161499, + "step": 8277, + "time_per_iteration": 3.9860711097717285 + }, + { + "auxiliary_loss_clip": 0.06429198, + "auxiliary_loss_mlp": 0.01270973, + "balance_loss_clip": 0.06280812, + "balance_loss_mlp": 0.01258754, + "epoch": 0.49770028558545015, + "flos": 21142229552640.0, + "grad_norm": 1.8219149953370526, + "language_loss": 0.82141137, + "learning_rate": 2.111911750583964e-06, + "loss": 0.89841306, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12231445, + "step": 8278, + "time_per_iteration": 2.5353100299835205 + }, + { + "auxiliary_loss_clip": 0.06435424, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06279474, + "balance_loss_mlp": 0.01256246, + "epoch": 0.4977604088381181, + "flos": 16769568234240.0, + "grad_norm": 1.8298360040603827, + "language_loss": 0.68205428, + "learning_rate": 2.111522896975052e-06, + "loss": 0.75909793, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12695312, + "step": 8279, + "time_per_iteration": 2.538273334503174 + }, + { + "auxiliary_loss_clip": 0.06430422, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01258129, + "epoch": 0.49782053209078614, + "flos": 15708596636160.0, + "grad_norm": 1.929140490148881, + "language_loss": 0.70948005, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.78650236, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13665771, + "step": 8280, + "time_per_iteration": 2.5344486236572266 + }, + { + "auxiliary_loss_clip": 0.06432884, + "auxiliary_loss_mlp": 0.01270682, + "balance_loss_clip": 0.06279922, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4978806553434541, + "flos": 24760565688960.0, + "grad_norm": 1.4498126802552027, + "language_loss": 0.6468308, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.72386646, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13366699, + "step": 8281, + "time_per_iteration": 2.5905003547668457 + }, + { + "auxiliary_loss_clip": 0.06432123, + "auxiliary_loss_mlp": 0.01269379, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01256141, + "epoch": 0.49794077859612207, + "flos": 13120820265600.0, + "grad_norm": 2.543831826961268, + "language_loss": 0.73404002, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.81105494, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13226318, + "step": 8282, + "time_per_iteration": 2.481513023376465 + }, + { + "auxiliary_loss_clip": 0.06433594, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.062822, + "balance_loss_mlp": 0.01260748, + "epoch": 0.49800090184879003, + "flos": 27532223844480.0, + "grad_norm": 1.4555237952962066, + "language_loss": 0.7312296, + "learning_rate": 2.109967440397263e-06, + "loss": 0.80828691, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.1138916, + "step": 8283, + "time_per_iteration": 4.015530824661255 + }, + { + "auxiliary_loss_clip": 0.06430134, + "auxiliary_loss_mlp": 0.01267653, + "balance_loss_clip": 0.06279625, + "balance_loss_mlp": 0.01254791, + "epoch": 0.498061025101458, + "flos": 19798677659520.0, + "grad_norm": 1.429490370630744, + "language_loss": 0.78535879, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8623367, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12860107, + "step": 8284, + "time_per_iteration": 2.4994332790374756 + }, + { + "auxiliary_loss_clip": 0.06437389, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06278685, + "balance_loss_mlp": 0.01255864, + "epoch": 0.49812114835412596, + "flos": 29900926915200.0, + "grad_norm": 1.711585124439885, + "language_loss": 0.7343573, + "learning_rate": 2.109189687029526e-06, + "loss": 0.81143022, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.14044189, + "step": 8285, + "time_per_iteration": 2.566572904586792 + }, + { + "auxiliary_loss_clip": 0.06430154, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.0627718, + "balance_loss_mlp": 0.01258404, + "epoch": 0.49818127160679393, + "flos": 23153441178240.0, + "grad_norm": 1.4871294259616603, + "language_loss": 0.74281567, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.81982332, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 8286, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.06434155, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06279751, + "balance_loss_mlp": 0.0125358, + "epoch": 0.4982413948594619, + "flos": 21659228444160.0, + "grad_norm": 1.6982664351725185, + "language_loss": 0.85701174, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.93401492, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12579346, + "step": 8287, + "time_per_iteration": 2.518136501312256 + }, + { + "auxiliary_loss_clip": 0.06432185, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06276216, + "balance_loss_mlp": 0.01256801, + "epoch": 0.49830151811212986, + "flos": 32494866560640.0, + "grad_norm": 1.6945408763753198, + "language_loss": 0.72708082, + "learning_rate": 2.108023025961159e-06, + "loss": 0.80410802, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.13745117, + "step": 8288, + "time_per_iteration": 2.590862512588501 + }, + { + "auxiliary_loss_clip": 0.06436619, + "auxiliary_loss_mlp": 0.01272174, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01258972, + "epoch": 0.4983616413647978, + "flos": 18146886122880.0, + "grad_norm": 4.0455531591406855, + "language_loss": 0.81054366, + "learning_rate": 2.10763413072622e-06, + "loss": 0.8876316, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.13201904, + "step": 8289, + "time_per_iteration": 2.504817008972168 + }, + { + "auxiliary_loss_clip": 0.06432903, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06279443, + "balance_loss_mlp": 0.01257074, + "epoch": 0.4984217646174658, + "flos": 19724898539520.0, + "grad_norm": 2.471620750065275, + "language_loss": 0.73847377, + "learning_rate": 2.107245231409784e-06, + "loss": 0.81550646, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13305664, + "step": 8290, + "time_per_iteration": 2.492176055908203 + }, + { + "auxiliary_loss_clip": 0.0643364, + "auxiliary_loss_mlp": 0.01275224, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01261157, + "epoch": 0.49848188787013376, + "flos": 24943525079040.0, + "grad_norm": 1.4456375643187662, + "language_loss": 0.84330356, + "learning_rate": 2.106856328026598e-06, + "loss": 0.92039216, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.140625, + "step": 8291, + "time_per_iteration": 2.5577101707458496 + }, + { + "auxiliary_loss_clip": 0.06438746, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06277075, + "balance_loss_mlp": 0.01257379, + "epoch": 0.4985420111228017, + "flos": 22388969652480.0, + "grad_norm": 1.8626179833436056, + "language_loss": 0.67868197, + "learning_rate": 2.106467420591409e-06, + "loss": 0.75577605, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13275146, + "step": 8292, + "time_per_iteration": 2.5227880477905273 + }, + { + "auxiliary_loss_clip": 0.06428275, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06275518, + "balance_loss_mlp": 0.01256977, + "epoch": 0.4986021343754697, + "flos": 16221989802240.0, + "grad_norm": 1.635019918785358, + "language_loss": 0.67247725, + "learning_rate": 2.106078509118965e-06, + "loss": 0.749448, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11798096, + "step": 8293, + "time_per_iteration": 2.5051913261413574 + }, + { + "auxiliary_loss_clip": 0.0643108, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275735, + "balance_loss_mlp": 0.01258891, + "epoch": 0.4986622576281377, + "flos": 23410221615360.0, + "grad_norm": 1.789605024821123, + "language_loss": 0.82488304, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.90189755, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.11480713, + "step": 8294, + "time_per_iteration": 2.5429139137268066 + }, + { + "auxiliary_loss_clip": 0.06432615, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277893, + "balance_loss_mlp": 0.01260315, + "epoch": 0.49872238088080567, + "flos": 19980714654720.0, + "grad_norm": 2.5766475970916285, + "language_loss": 0.73639232, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.81344408, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12249756, + "step": 8295, + "time_per_iteration": 2.535090923309326 + }, + { + "auxiliary_loss_clip": 0.06427556, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06276329, + "balance_loss_mlp": 0.01259911, + "epoch": 0.49878250413347364, + "flos": 22899595633920.0, + "grad_norm": 1.8257233918976585, + "language_loss": 0.68199098, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.75899148, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12591553, + "step": 8296, + "time_per_iteration": 2.5079848766326904 + }, + { + "auxiliary_loss_clip": 0.06433527, + "auxiliary_loss_mlp": 0.01272036, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.0125878, + "epoch": 0.4988426273861416, + "flos": 32606688234240.0, + "grad_norm": 1.801119189108274, + "language_loss": 0.64925557, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.72631121, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13256836, + "step": 8297, + "time_per_iteration": 2.6275887489318848 + }, + { + "auxiliary_loss_clip": 0.06427586, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06278079, + "balance_loss_mlp": 0.01258845, + "epoch": 0.49890275063880957, + "flos": 20929990360320.0, + "grad_norm": 1.5890674789628483, + "language_loss": 0.69987392, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.77685434, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11621094, + "step": 8298, + "time_per_iteration": 2.527082681655884 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01253668, + "epoch": 0.49896287389147753, + "flos": 18630370581120.0, + "grad_norm": 3.032196085375079, + "language_loss": 0.85047698, + "learning_rate": 2.103744956327814e-06, + "loss": 0.92741591, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11517334, + "step": 8299, + "time_per_iteration": 2.531541585922241 + }, + { + "auxiliary_loss_clip": 0.06429411, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06274673, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4990229971441455, + "flos": 24833422414080.0, + "grad_norm": 2.041795476236588, + "language_loss": 0.69284618, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.76981199, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13061523, + "step": 8300, + "time_per_iteration": 2.562002658843994 + }, + { + "auxiliary_loss_clip": 0.0633271, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06265618, + "balance_loss_mlp": 0.01267531, + "epoch": 0.49908312039681346, + "flos": 71405638323840.0, + "grad_norm": 0.7392878070409407, + "language_loss": 0.51101816, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.58704311, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02253723, + "step": 8301, + "time_per_iteration": 3.3210127353668213 + }, + { + "auxiliary_loss_clip": 0.06423864, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06275457, + "balance_loss_mlp": 0.01258173, + "epoch": 0.4991432436494814, + "flos": 19834791569280.0, + "grad_norm": 2.2486532521822302, + "language_loss": 0.84452468, + "learning_rate": 2.102578126623879e-06, + "loss": 0.921471, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12591553, + "step": 8302, + "time_per_iteration": 2.547562837600708 + }, + { + "auxiliary_loss_clip": 0.06428537, + "auxiliary_loss_mlp": 0.01271397, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.01259607, + "epoch": 0.4992033669021494, + "flos": 15127252208640.0, + "grad_norm": 1.6659174741740037, + "language_loss": 0.69610626, + "learning_rate": 2.102189175590024e-06, + "loss": 0.77310562, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11785889, + "step": 8303, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06429437, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.0627458, + "balance_loss_mlp": 0.01253851, + "epoch": 0.49926349015481736, + "flos": 31215282860160.0, + "grad_norm": 1.7036998151712766, + "language_loss": 0.72999942, + "learning_rate": 2.101800220681144e-06, + "loss": 0.80695617, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.1239624, + "step": 8304, + "time_per_iteration": 2.611502170562744 + }, + { + "auxiliary_loss_clip": 0.0642409, + "auxiliary_loss_mlp": 0.0126995, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4993236134074853, + "flos": 24907201534080.0, + "grad_norm": 2.0593873642803486, + "language_loss": 0.81677687, + "learning_rate": 2.10141126191199e-06, + "loss": 0.89371729, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1227417, + "step": 8305, + "time_per_iteration": 2.57425594329834 + }, + { + "auxiliary_loss_clip": 0.0632831, + "auxiliary_loss_mlp": 0.01255041, + "balance_loss_clip": 0.06261367, + "balance_loss_mlp": 0.01252826, + "epoch": 0.4993837366601533, + "flos": 70438962896640.0, + "grad_norm": 0.7837813432026206, + "language_loss": 0.56909657, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.64493006, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02220154, + "step": 8306, + "time_per_iteration": 3.2806143760681152 + }, + { + "auxiliary_loss_clip": 0.06430675, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06278585, + "balance_loss_mlp": 0.01255422, + "epoch": 0.4994438599128213, + "flos": 15966718738560.0, + "grad_norm": 1.7475082532303507, + "language_loss": 0.83157074, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.90857446, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.1427002, + "step": 8307, + "time_per_iteration": 2.4851419925689697 + }, + { + "auxiliary_loss_clip": 0.06426803, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 0.06277731, + "balance_loss_mlp": 0.01258458, + "epoch": 0.4995039831654893, + "flos": 27935765907840.0, + "grad_norm": 1.9977557260500436, + "language_loss": 0.61003512, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.68701947, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.13189697, + "step": 8308, + "time_per_iteration": 2.5943245887756348 + }, + { + "auxiliary_loss_clip": 0.06426641, + "auxiliary_loss_mlp": 0.01271422, + "balance_loss_clip": 0.06278297, + "balance_loss_mlp": 0.01259948, + "epoch": 0.49956410641815724, + "flos": 24211310175360.0, + "grad_norm": 1.573691211270805, + "language_loss": 0.74911636, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.82609695, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11468506, + "step": 8309, + "time_per_iteration": 3.9743635654449463 + }, + { + "auxiliary_loss_clip": 0.06430435, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.0125578, + "epoch": 0.4996242296708252, + "flos": 16185666257280.0, + "grad_norm": 2.033466484631739, + "language_loss": 0.80080384, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.87779051, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12438965, + "step": 8310, + "time_per_iteration": 2.475815534591675 + }, + { + "auxiliary_loss_clip": 0.06429116, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01254541, + "epoch": 0.49968435292349317, + "flos": 16879209701760.0, + "grad_norm": 1.5486293297173337, + "language_loss": 0.71370041, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.79066527, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12823486, + "step": 8311, + "time_per_iteration": 4.01245641708374 + }, + { + "auxiliary_loss_clip": 0.06428856, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06277557, + "balance_loss_mlp": 0.01254636, + "epoch": 0.49974447617616113, + "flos": 14944837870080.0, + "grad_norm": 1.8003339909908787, + "language_loss": 0.77129757, + "learning_rate": 2.098688443679187e-06, + "loss": 0.8482464, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11401367, + "step": 8312, + "time_per_iteration": 2.4761128425598145 + }, + { + "auxiliary_loss_clip": 0.0643132, + "auxiliary_loss_mlp": 0.01266437, + "balance_loss_clip": 0.06279029, + "balance_loss_mlp": 0.01254206, + "epoch": 0.4998045994288291, + "flos": 26658823610880.0, + "grad_norm": 1.6524127143489034, + "language_loss": 0.84981465, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.9267922, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12231445, + "step": 8313, + "time_per_iteration": 2.6057398319244385 + }, + { + "auxiliary_loss_clip": 0.06431891, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.06279939, + "balance_loss_mlp": 0.01256224, + "epoch": 0.49986472268149706, + "flos": 20959102454400.0, + "grad_norm": 1.6979548607445847, + "language_loss": 0.81193811, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8889358, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11657715, + "step": 8314, + "time_per_iteration": 2.5246880054473877 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01269627, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01256341, + "epoch": 0.49992484593416503, + "flos": 22790499217920.0, + "grad_norm": 1.7217224756504992, + "language_loss": 0.79857439, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.8755725, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.13305664, + "step": 8315, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.0125595, + "epoch": 0.499984969186833, + "flos": 46796838307200.0, + "grad_norm": 1.6656557215916168, + "language_loss": 0.74803257, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.82498288, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11749268, + "step": 8316, + "time_per_iteration": 4.178734540939331 + }, + { + "auxiliary_loss_clip": 0.06424455, + "auxiliary_loss_mlp": 0.01269425, + "balance_loss_clip": 0.0627817, + "balance_loss_mlp": 0.01258083, + "epoch": 0.500045092439501, + "flos": 25564086017280.0, + "grad_norm": 1.744541126829246, + "language_loss": 0.81478661, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.89172542, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11346436, + "step": 8317, + "time_per_iteration": 2.537320613861084 + }, + { + "auxiliary_loss_clip": 0.06427011, + "auxiliary_loss_mlp": 0.01270425, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5001052156921689, + "flos": 20711126695680.0, + "grad_norm": 1.5732702518161361, + "language_loss": 0.83390272, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.91087711, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12835693, + "step": 8318, + "time_per_iteration": 2.534135103225708 + }, + { + "auxiliary_loss_clip": 0.06428336, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06277522, + "balance_loss_mlp": 0.01257109, + "epoch": 0.500165338944837, + "flos": 21257405389440.0, + "grad_norm": 1.6807233025456896, + "language_loss": 0.82012349, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.89709824, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12030029, + "step": 8319, + "time_per_iteration": 2.515835762023926 + }, + { + "auxiliary_loss_clip": 0.06428086, + "auxiliary_loss_mlp": 0.01265652, + "balance_loss_clip": 0.0627624, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5002254621975049, + "flos": 27861693298560.0, + "grad_norm": 1.6360150103182107, + "language_loss": 0.72118968, + "learning_rate": 2.095576427171635e-06, + "loss": 0.79812706, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.10955811, + "step": 8320, + "time_per_iteration": 2.5796635150909424 + }, + { + "auxiliary_loss_clip": 0.06441814, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5002855854501729, + "flos": 15556049078400.0, + "grad_norm": 2.4313263695255696, + "language_loss": 0.76678413, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.84387517, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13439941, + "step": 8321, + "time_per_iteration": 2.4691002368927 + }, + { + "auxiliary_loss_clip": 0.06428922, + "auxiliary_loss_mlp": 0.01268744, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5003457087028408, + "flos": 16112977240320.0, + "grad_norm": 1.7492839336280708, + "language_loss": 0.82910907, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.90608579, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13183594, + "step": 8322, + "time_per_iteration": 2.515460252761841 + }, + { + "auxiliary_loss_clip": 0.06431515, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06279334, + "balance_loss_mlp": 0.01256973, + "epoch": 0.5004058319555088, + "flos": 22717055514240.0, + "grad_norm": 3.787468052495824, + "language_loss": 0.74021679, + "learning_rate": 2.094409360775228e-06, + "loss": 0.81722933, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12774658, + "step": 8323, + "time_per_iteration": 3.9577157497406006 + }, + { + "auxiliary_loss_clip": 0.06425107, + "auxiliary_loss_mlp": 0.01267421, + "balance_loss_clip": 0.06273489, + "balance_loss_mlp": 0.01254761, + "epoch": 0.5004659552081767, + "flos": 30125870000640.0, + "grad_norm": 1.569659839153646, + "language_loss": 0.69694078, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.77386606, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.12670898, + "step": 8324, + "time_per_iteration": 2.5927038192749023 + }, + { + "auxiliary_loss_clip": 0.06426285, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.012554, + "epoch": 0.5005260784608447, + "flos": 18630664070400.0, + "grad_norm": 1.9637621432589805, + "language_loss": 0.72455752, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.80149603, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12164307, + "step": 8325, + "time_per_iteration": 2.5748932361602783 + }, + { + "auxiliary_loss_clip": 0.06431422, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06278826, + "balance_loss_mlp": 0.01253069, + "epoch": 0.5005862017135126, + "flos": 24866349868800.0, + "grad_norm": 1.7160687334315328, + "language_loss": 0.73386943, + "learning_rate": 2.093242262158709e-06, + "loss": 0.8108452, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13085938, + "step": 8326, + "time_per_iteration": 2.5720608234405518 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01267135, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01255763, + "epoch": 0.5006463249661807, + "flos": 18740389392000.0, + "grad_norm": 1.5629486934520718, + "language_loss": 0.78059208, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.85753143, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11364746, + "step": 8327, + "time_per_iteration": 2.5033681392669678 + }, + { + "auxiliary_loss_clip": 0.06429915, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06277432, + "balance_loss_mlp": 0.01254533, + "epoch": 0.5007064482188487, + "flos": 13047124999680.0, + "grad_norm": 2.5584329331081253, + "language_loss": 0.88066995, + "learning_rate": 2.092464178710997e-06, + "loss": 0.95763773, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12329102, + "step": 8328, + "time_per_iteration": 2.469723701477051 + }, + { + "auxiliary_loss_clip": 0.06430298, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01254302, + "epoch": 0.5007665714715166, + "flos": 21295154453760.0, + "grad_norm": 2.120857663767784, + "language_loss": 0.74578768, + "learning_rate": 2.092075131720388e-06, + "loss": 0.82276416, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1305542, + "step": 8329, + "time_per_iteration": 2.527421236038208 + }, + { + "auxiliary_loss_clip": 0.06427623, + "auxiliary_loss_mlp": 0.01269321, + "balance_loss_clip": 0.06278372, + "balance_loss_mlp": 0.01257626, + "epoch": 0.5008266947241846, + "flos": 29762676478080.0, + "grad_norm": 1.5806360237517383, + "language_loss": 0.80007339, + "learning_rate": 2.091686081238281e-06, + "loss": 0.87704277, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11688232, + "step": 8330, + "time_per_iteration": 2.589132785797119 + }, + { + "auxiliary_loss_clip": 0.063256, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06259131, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5008868179768525, + "flos": 63574498460160.0, + "grad_norm": 0.7051231310601146, + "language_loss": 0.56005836, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.63587606, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01960754, + "step": 8331, + "time_per_iteration": 2.9798707962036133 + }, + { + "auxiliary_loss_clip": 0.06425481, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256125, + "epoch": 0.5009469412295205, + "flos": 27382108055040.0, + "grad_norm": 1.8793466545943338, + "language_loss": 0.65444684, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.73137867, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11584473, + "step": 8332, + "time_per_iteration": 2.548846483230591 + }, + { + "auxiliary_loss_clip": 0.06424412, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01253578, + "epoch": 0.5010070644821885, + "flos": 27385839561600.0, + "grad_norm": 1.4154143625456153, + "language_loss": 0.75122535, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.82812029, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1149292, + "step": 8333, + "time_per_iteration": 2.600377082824707 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01268641, + "balance_loss_clip": 0.06276083, + "balance_loss_mlp": 0.01256481, + "epoch": 0.5010671877348565, + "flos": 20668178678400.0, + "grad_norm": 1.9411742898612023, + "language_loss": 0.80806357, + "learning_rate": 2.090129844689929e-06, + "loss": 0.88504034, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12158203, + "step": 8334, + "time_per_iteration": 2.490330457687378 + }, + { + "auxiliary_loss_clip": 0.0633373, + "auxiliary_loss_mlp": 0.01254486, + "balance_loss_clip": 0.06267349, + "balance_loss_mlp": 0.01252466, + "epoch": 0.5011273109875244, + "flos": 59148266855040.0, + "grad_norm": 0.880609822046852, + "language_loss": 0.62818438, + "learning_rate": 2.089740776971626e-06, + "loss": 0.70406651, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02020264, + "step": 8335, + "time_per_iteration": 3.1081318855285645 + }, + { + "auxiliary_loss_clip": 0.06426011, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06278515, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5011874342401924, + "flos": 25343126000640.0, + "grad_norm": 1.3778270209342711, + "language_loss": 0.80092967, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.8778491, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.105896, + "step": 8336, + "time_per_iteration": 2.5390379428863525 + }, + { + "auxiliary_loss_clip": 0.06428748, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06278357, + "balance_loss_mlp": 0.01254923, + "epoch": 0.5012475574928603, + "flos": 20236153426560.0, + "grad_norm": 1.7537768303990948, + "language_loss": 0.81054461, + "learning_rate": 2.088962631340836e-06, + "loss": 0.88749969, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11834717, + "step": 8337, + "time_per_iteration": 2.5480427742004395 + }, + { + "auxiliary_loss_clip": 0.06436703, + "auxiliary_loss_mlp": 0.01267216, + "balance_loss_clip": 0.06279006, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5013076807455283, + "flos": 22716594316800.0, + "grad_norm": 1.7916878418610642, + "language_loss": 0.79506505, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.87210429, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12255859, + "step": 8338, + "time_per_iteration": 2.5164718627929688 + }, + { + "auxiliary_loss_clip": 0.0643065, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01253176, + "epoch": 0.5013678039981962, + "flos": 24252329548800.0, + "grad_norm": 1.5889596080337545, + "language_loss": 0.85034919, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.9273085, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12127686, + "step": 8339, + "time_per_iteration": 2.5785508155822754 + }, + { + "auxiliary_loss_clip": 0.06426719, + "auxiliary_loss_mlp": 0.01269107, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.0125814, + "epoch": 0.5014279272508643, + "flos": 26183808414720.0, + "grad_norm": 1.5165096284579775, + "language_loss": 0.71162677, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.78858501, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10968018, + "step": 8340, + "time_per_iteration": 2.5929582118988037 + }, + { + "auxiliary_loss_clip": 0.06433477, + "auxiliary_loss_mlp": 0.01270076, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.01256867, + "epoch": 0.5014880505035323, + "flos": 21436255929600.0, + "grad_norm": 2.442832877053188, + "language_loss": 0.7829324, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.85996789, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.13208008, + "step": 8341, + "time_per_iteration": 2.5200908184051514 + }, + { + "auxiliary_loss_clip": 0.06435034, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.062792, + "balance_loss_mlp": 0.01255407, + "epoch": 0.5015481737562002, + "flos": 15774870816000.0, + "grad_norm": 2.1824930872588917, + "language_loss": 0.89806843, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.97509372, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12091064, + "step": 8342, + "time_per_iteration": 2.502265691757202 + }, + { + "auxiliary_loss_clip": 0.06427857, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06276843, + "balance_loss_mlp": 0.0125275, + "epoch": 0.5016082970088682, + "flos": 26837590296960.0, + "grad_norm": 1.7003073455140034, + "language_loss": 0.76872855, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.84565264, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11804199, + "step": 8343, + "time_per_iteration": 2.5502099990844727 + }, + { + "auxiliary_loss_clip": 0.06426306, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0627844, + "balance_loss_mlp": 0.01256724, + "epoch": 0.5016684202615361, + "flos": 21477023740800.0, + "grad_norm": 3.7325470711422466, + "language_loss": 0.67772466, + "learning_rate": 2.086239016143293e-06, + "loss": 0.75466394, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10900879, + "step": 8344, + "time_per_iteration": 2.5443081855773926 + }, + { + "auxiliary_loss_clip": 0.06429319, + "auxiliary_loss_mlp": 0.01271563, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01259803, + "epoch": 0.5017285435142042, + "flos": 26253478684800.0, + "grad_norm": 2.15637603402593, + "language_loss": 0.75492197, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.83193076, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11767578, + "step": 8345, + "time_per_iteration": 2.5757455825805664 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01267207, + "balance_loss_clip": 0.06275543, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5017886667668721, + "flos": 20783899566720.0, + "grad_norm": 2.131359070350305, + "language_loss": 0.78573453, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.86266983, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12805176, + "step": 8346, + "time_per_iteration": 2.5463459491729736 + }, + { + "auxiliary_loss_clip": 0.06428749, + "auxiliary_loss_mlp": 0.012678, + "balance_loss_clip": 0.0627691, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5018487900195401, + "flos": 20162500087680.0, + "grad_norm": 1.4665059060371557, + "language_loss": 0.69395542, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.77092093, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11608887, + "step": 8347, + "time_per_iteration": 2.5277669429779053 + }, + { + "auxiliary_loss_clip": 0.06433204, + "auxiliary_loss_mlp": 0.0126827, + "balance_loss_clip": 0.06278361, + "balance_loss_mlp": 0.01256236, + "epoch": 0.501908913272208, + "flos": 18156613196160.0, + "grad_norm": 2.582566868470837, + "language_loss": 0.7215631, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.79857785, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12030029, + "step": 8348, + "time_per_iteration": 3.996784210205078 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06277803, + "balance_loss_mlp": 0.01258166, + "epoch": 0.501969036524876, + "flos": 23118962423040.0, + "grad_norm": 1.4308074213434065, + "language_loss": 0.74796462, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.82490146, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11346436, + "step": 8349, + "time_per_iteration": 2.5489115715026855 + }, + { + "auxiliary_loss_clip": 0.06429881, + "auxiliary_loss_mlp": 0.01269935, + "balance_loss_clip": 0.06276442, + "balance_loss_mlp": 0.01257442, + "epoch": 0.5020291597775439, + "flos": 11367814596480.0, + "grad_norm": 1.898459652208493, + "language_loss": 0.63674343, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.71374166, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12493896, + "step": 8350, + "time_per_iteration": 2.487217426300049 + }, + { + "auxiliary_loss_clip": 0.06323833, + "auxiliary_loss_mlp": 0.01259522, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01257642, + "epoch": 0.5020892830302119, + "flos": 64030422124800.0, + "grad_norm": 0.7586308907420236, + "language_loss": 0.59914774, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6749813, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01876831, + "step": 8351, + "time_per_iteration": 4.69463324546814 + }, + { + "auxiliary_loss_clip": 0.06434566, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06280354, + "balance_loss_mlp": 0.01258029, + "epoch": 0.5021494062828799, + "flos": 23739691069440.0, + "grad_norm": 1.6219034526425078, + "language_loss": 0.75496215, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.83200288, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11474609, + "step": 8352, + "time_per_iteration": 2.5164549350738525 + }, + { + "auxiliary_loss_clip": 0.06428628, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06277371, + "balance_loss_mlp": 0.01254845, + "epoch": 0.5022095295355479, + "flos": 21582640212480.0, + "grad_norm": 1.8174761726271038, + "language_loss": 0.71818656, + "learning_rate": 2.082736990429464e-06, + "loss": 0.795147, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12573242, + "step": 8353, + "time_per_iteration": 2.51479172706604 + }, + { + "auxiliary_loss_clip": 0.06434356, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06281401, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5022696527882159, + "flos": 21403580037120.0, + "grad_norm": 2.9144841273148154, + "language_loss": 0.74235505, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.81938022, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12060547, + "step": 8354, + "time_per_iteration": 2.5085036754608154 + }, + { + "auxiliary_loss_clip": 0.06431521, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06281638, + "balance_loss_mlp": 0.01256216, + "epoch": 0.5023297760408838, + "flos": 27167814437760.0, + "grad_norm": 1.5801517406711547, + "language_loss": 0.7257005, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.80269539, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11755371, + "step": 8355, + "time_per_iteration": 2.559136152267456 + }, + { + "auxiliary_loss_clip": 0.06435544, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06278937, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5023898992935518, + "flos": 26221054354560.0, + "grad_norm": 1.801551244152151, + "language_loss": 0.8142066, + "learning_rate": 2.081569591520548e-06, + "loss": 0.89124179, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1282959, + "step": 8356, + "time_per_iteration": 3.978407144546509 + }, + { + "auxiliary_loss_clip": 0.06435513, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06275411, + "balance_loss_mlp": 0.01255272, + "epoch": 0.5024500225462197, + "flos": 13444839204480.0, + "grad_norm": 2.072167033386685, + "language_loss": 0.7662456, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.84328556, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 8357, + "time_per_iteration": 2.488581657409668 + }, + { + "auxiliary_loss_clip": 0.06431419, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01258758, + "epoch": 0.5025101457988878, + "flos": 21585952448640.0, + "grad_norm": 1.5828459742560037, + "language_loss": 0.76457655, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.84161162, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13342285, + "step": 8358, + "time_per_iteration": 2.62697434425354 + }, + { + "auxiliary_loss_clip": 0.06429468, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276305, + "balance_loss_mlp": 0.01253877, + "epoch": 0.5025702690515557, + "flos": 24652140105600.0, + "grad_norm": 2.247340947262335, + "language_loss": 0.72276986, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.79972816, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12481689, + "step": 8359, + "time_per_iteration": 2.577232599258423 + }, + { + "auxiliary_loss_clip": 0.0642844, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06277584, + "balance_loss_mlp": 0.01255263, + "epoch": 0.5026303923042237, + "flos": 22096578430080.0, + "grad_norm": 1.7221298639434877, + "language_loss": 0.77017021, + "learning_rate": 2.080013016407077e-06, + "loss": 0.84713173, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12451172, + "step": 8360, + "time_per_iteration": 2.5449211597442627 + }, + { + "auxiliary_loss_clip": 0.0642498, + "auxiliary_loss_mlp": 0.01267029, + "balance_loss_clip": 0.06274442, + "balance_loss_mlp": 0.0125571, + "epoch": 0.5026905155568916, + "flos": 23704164138240.0, + "grad_norm": 3.319216273479951, + "language_loss": 0.76811969, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.84503973, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11322021, + "step": 8361, + "time_per_iteration": 2.5360496044158936 + }, + { + "auxiliary_loss_clip": 0.06433755, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06276754, + "balance_loss_mlp": 0.01258641, + "epoch": 0.5027506388095596, + "flos": 25819566716160.0, + "grad_norm": 1.6478894806212292, + "language_loss": 0.85182559, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.92888033, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13067627, + "step": 8362, + "time_per_iteration": 4.023087739944458 + }, + { + "auxiliary_loss_clip": 0.06433062, + "auxiliary_loss_mlp": 0.01266272, + "balance_loss_clip": 0.06277543, + "balance_loss_mlp": 0.01253851, + "epoch": 0.5028107620622275, + "flos": 27533942853120.0, + "grad_norm": 1.6676304720736304, + "language_loss": 0.79210544, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.86909878, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12420654, + "step": 8363, + "time_per_iteration": 2.610635757446289 + }, + { + "auxiliary_loss_clip": 0.0642155, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01255903, + "epoch": 0.5028708853148955, + "flos": 24541031191680.0, + "grad_norm": 2.470464307064636, + "language_loss": 0.76251006, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12493896, + "step": 8364, + "time_per_iteration": 2.510077953338623 + }, + { + "auxiliary_loss_clip": 0.06429755, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627771, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5029310085675635, + "flos": 20819887695360.0, + "grad_norm": 1.5150578704653515, + "language_loss": 0.69785869, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.77482712, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11529541, + "step": 8365, + "time_per_iteration": 2.523810386657715 + }, + { + "auxiliary_loss_clip": 0.064358, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.01254365, + "epoch": 0.5029911318202315, + "flos": 22348411476480.0, + "grad_norm": 1.5746180090110224, + "language_loss": 0.73351806, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.81055391, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.13439941, + "step": 8366, + "time_per_iteration": 2.538522481918335 + }, + { + "auxiliary_loss_clip": 0.06433431, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06282506, + "balance_loss_mlp": 0.01254324, + "epoch": 0.5030512550728995, + "flos": 24359581175040.0, + "grad_norm": 1.43168858878555, + "language_loss": 0.78766662, + "learning_rate": 2.077288893713735e-06, + "loss": 0.86466694, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12268066, + "step": 8367, + "time_per_iteration": 2.58542799949646 + }, + { + "auxiliary_loss_clip": 0.064292, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01255835, + "epoch": 0.5031113783255674, + "flos": 18265835393280.0, + "grad_norm": 1.7642536194953051, + "language_loss": 0.70319581, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.78016406, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11804199, + "step": 8368, + "time_per_iteration": 2.4808216094970703 + }, + { + "auxiliary_loss_clip": 0.06318872, + "auxiliary_loss_mlp": 0.01256661, + "balance_loss_clip": 0.06252527, + "balance_loss_mlp": 0.01254704, + "epoch": 0.5031715015782354, + "flos": 57270022859520.0, + "grad_norm": 0.9058846668072361, + "language_loss": 0.63429594, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.7100513, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01954651, + "step": 8369, + "time_per_iteration": 3.0813984870910645 + }, + { + "auxiliary_loss_clip": 0.06425582, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06275157, + "balance_loss_mlp": 0.01256873, + "epoch": 0.5032316248309033, + "flos": 27534823320960.0, + "grad_norm": 1.9780482072247232, + "language_loss": 0.60450232, + "learning_rate": 2.076121368302263e-06, + "loss": 0.68144017, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11328125, + "step": 8370, + "time_per_iteration": 2.6361827850341797 + }, + { + "auxiliary_loss_clip": 0.06429368, + "auxiliary_loss_mlp": 0.01269199, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01255901, + "epoch": 0.5032917480835714, + "flos": 34504401104640.0, + "grad_norm": 1.6209694165930644, + "language_loss": 0.68475735, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.76174301, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13293457, + "step": 8371, + "time_per_iteration": 2.6757090091705322 + }, + { + "auxiliary_loss_clip": 0.06428707, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01256158, + "epoch": 0.5033518713362393, + "flos": 33665228064000.0, + "grad_norm": 1.992355635042309, + "language_loss": 0.67781597, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.75479841, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13397217, + "step": 8372, + "time_per_iteration": 2.625875234603882 + }, + { + "auxiliary_loss_clip": 0.06429783, + "auxiliary_loss_mlp": 0.0126941, + "balance_loss_clip": 0.06275001, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5034119945889073, + "flos": 28193301031680.0, + "grad_norm": 1.502668832263038, + "language_loss": 0.67200899, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.74900091, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13604736, + "step": 8373, + "time_per_iteration": 2.605649709701538 + }, + { + "auxiliary_loss_clip": 0.06426984, + "auxiliary_loss_mlp": 0.01270724, + "balance_loss_clip": 0.06274835, + "balance_loss_mlp": 0.01258362, + "epoch": 0.5034721178415752, + "flos": 21364698942720.0, + "grad_norm": 1.6635937081301206, + "language_loss": 0.75186062, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.82883763, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12365723, + "step": 8374, + "time_per_iteration": 2.503739595413208 + }, + { + "auxiliary_loss_clip": 0.06431206, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01255945, + "epoch": 0.5035322410942432, + "flos": 22681486656000.0, + "grad_norm": 1.5469346618590563, + "language_loss": 0.68547672, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.76247704, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12878418, + "step": 8375, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.06436669, + "auxiliary_loss_mlp": 0.0127122, + "balance_loss_clip": 0.06277038, + "balance_loss_mlp": 0.01257285, + "epoch": 0.5035923643469111, + "flos": 19834875423360.0, + "grad_norm": 1.6007016499880733, + "language_loss": 0.78976023, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.86683917, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1394043, + "step": 8376, + "time_per_iteration": 2.480931520462036 + }, + { + "auxiliary_loss_clip": 0.06429401, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06272124, + "balance_loss_mlp": 0.01254722, + "epoch": 0.5036524875995791, + "flos": 30521823269760.0, + "grad_norm": 2.1513689232389686, + "language_loss": 0.59716964, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6741339, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.12304688, + "step": 8377, + "time_per_iteration": 2.5793137550354004 + }, + { + "auxiliary_loss_clip": 0.06430321, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5037126108522471, + "flos": 14725848424320.0, + "grad_norm": 1.9178870854351904, + "language_loss": 0.76377517, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.84075749, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.13305664, + "step": 8378, + "time_per_iteration": 2.4622483253479004 + }, + { + "auxiliary_loss_clip": 0.06432158, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06278415, + "balance_loss_mlp": 0.01254815, + "epoch": 0.5037727341049151, + "flos": 25304119125120.0, + "grad_norm": 1.5376418940503571, + "language_loss": 0.746418, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.82341218, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12457275, + "step": 8379, + "time_per_iteration": 2.55764102935791 + }, + { + "auxiliary_loss_clip": 0.06427328, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01260138, + "epoch": 0.5038328573575831, + "flos": 28548193000320.0, + "grad_norm": 1.8355606211356674, + "language_loss": 0.66636741, + "learning_rate": 2.072229431544548e-06, + "loss": 0.74337339, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13146973, + "step": 8380, + "time_per_iteration": 2.566993474960327 + }, + { + "auxiliary_loss_clip": 0.06426656, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01254259, + "epoch": 0.503892980610251, + "flos": 31657957580160.0, + "grad_norm": 1.8901892775526132, + "language_loss": 0.63646573, + "learning_rate": 2.071840222561051e-06, + "loss": 0.71339715, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12213135, + "step": 8381, + "time_per_iteration": 2.5915544033050537 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01268764, + "balance_loss_clip": 0.06275158, + "balance_loss_mlp": 0.01257087, + "epoch": 0.503953103862919, + "flos": 27096718648320.0, + "grad_norm": 1.5372847630358786, + "language_loss": 0.67925096, + "learning_rate": 2.071451010853365e-06, + "loss": 0.756212, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11676025, + "step": 8382, + "time_per_iteration": 2.553654432296753 + }, + { + "auxiliary_loss_clip": 0.06443429, + "auxiliary_loss_mlp": 0.01271028, + "balance_loss_clip": 0.06281322, + "balance_loss_mlp": 0.0125745, + "epoch": 0.5040132271155869, + "flos": 15638423241600.0, + "grad_norm": 1.8104420976136362, + "language_loss": 0.62072217, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.69786668, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13598633, + "step": 8383, + "time_per_iteration": 2.525148630142212 + }, + { + "auxiliary_loss_clip": 0.06426074, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01255609, + "epoch": 0.504073350368255, + "flos": 13595290410240.0, + "grad_norm": 1.7264517386370961, + "language_loss": 0.6736567, + "learning_rate": 2.070672579324465e-06, + "loss": 0.75059223, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11853027, + "step": 8384, + "time_per_iteration": 2.4712305068969727 + }, + { + "auxiliary_loss_clip": 0.064311, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06277114, + "balance_loss_mlp": 0.01255059, + "epoch": 0.5041334736209229, + "flos": 29065611162240.0, + "grad_norm": 1.6378210813415193, + "language_loss": 0.71431983, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.79130751, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12609863, + "step": 8385, + "time_per_iteration": 2.573953151702881 + }, + { + "auxiliary_loss_clip": 0.06426452, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.0627909, + "balance_loss_mlp": 0.01252916, + "epoch": 0.5041935968735909, + "flos": 24615313436160.0, + "grad_norm": 1.6953325653845304, + "language_loss": 0.83098906, + "learning_rate": 2.069894137075919e-06, + "loss": 0.90790039, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 8386, + "time_per_iteration": 2.5524075031280518 + }, + { + "auxiliary_loss_clip": 0.06431791, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256146, + "epoch": 0.5042537201262588, + "flos": 26294204568960.0, + "grad_norm": 1.4563010196783333, + "language_loss": 0.669891, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.74689829, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12780762, + "step": 8387, + "time_per_iteration": 3.9810335636138916 + }, + { + "auxiliary_loss_clip": 0.064284, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254608, + "epoch": 0.5043138433789268, + "flos": 22023805559040.0, + "grad_norm": 3.745410743833339, + "language_loss": 0.80531698, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.882267, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11999512, + "step": 8388, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.06426677, + "auxiliary_loss_mlp": 0.01268377, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256075, + "epoch": 0.5043739666315947, + "flos": 28774645459200.0, + "grad_norm": 1.9801629056940246, + "language_loss": 0.70134413, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.77829468, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12298584, + "step": 8389, + "time_per_iteration": 2.5604100227355957 + }, + { + "auxiliary_loss_clip": 0.06432408, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01256328, + "epoch": 0.5044340898842627, + "flos": 27606548016000.0, + "grad_norm": 1.4709504779743863, + "language_loss": 0.69360697, + "learning_rate": 2.068337220892191e-06, + "loss": 0.77062166, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12750244, + "step": 8390, + "time_per_iteration": 4.074434041976929 + }, + { + "auxiliary_loss_clip": 0.06327184, + "auxiliary_loss_mlp": 0.01253766, + "balance_loss_clip": 0.06261003, + "balance_loss_mlp": 0.01251581, + "epoch": 0.5044942131369307, + "flos": 67474744058880.0, + "grad_norm": 0.7911094819234682, + "language_loss": 0.52874231, + "learning_rate": 2.067947985330974e-06, + "loss": 0.60455179, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.0218811, + "step": 8391, + "time_per_iteration": 2.939533233642578 + }, + { + "auxiliary_loss_clip": 0.06334387, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06267701, + "balance_loss_mlp": 0.01251732, + "epoch": 0.5045543363895987, + "flos": 58646460280320.0, + "grad_norm": 0.8187125498801333, + "language_loss": 0.60630977, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.68219203, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02114868, + "step": 8392, + "time_per_iteration": 2.9839742183685303 + }, + { + "auxiliary_loss_clip": 0.06425072, + "auxiliary_loss_mlp": 0.01265494, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01252631, + "epoch": 0.5046144596422667, + "flos": 22532880240000.0, + "grad_norm": 1.6790063296091327, + "language_loss": 0.85000169, + "learning_rate": 2.067169506493517e-06, + "loss": 0.9269073, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12866211, + "step": 8393, + "time_per_iteration": 2.5764622688293457 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01258869, + "epoch": 0.5046745828949346, + "flos": 27461673106560.0, + "grad_norm": 1.8013259480756436, + "language_loss": 0.5139519, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.590967, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11834717, + "step": 8394, + "time_per_iteration": 2.5577075481414795 + }, + { + "auxiliary_loss_clip": 0.06430504, + "auxiliary_loss_mlp": 0.012693, + "balance_loss_clip": 0.06275499, + "balance_loss_mlp": 0.01256664, + "epoch": 0.5047347061476026, + "flos": 17280236142720.0, + "grad_norm": 1.62433976950566, + "language_loss": 0.75468862, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.83168674, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12628174, + "step": 8395, + "time_per_iteration": 4.00100040435791 + }, + { + "auxiliary_loss_clip": 0.06430663, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06276973, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5047948294002705, + "flos": 16654308543360.0, + "grad_norm": 3.1739634410128446, + "language_loss": 0.68759549, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.76455134, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.1206665, + "step": 8396, + "time_per_iteration": 2.5608737468719482 + }, + { + "auxiliary_loss_clip": 0.0643612, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282924, + "balance_loss_mlp": 0.01253235, + "epoch": 0.5048549526529386, + "flos": 26872236760320.0, + "grad_norm": 1.7251064316936986, + "language_loss": 0.7921707, + "learning_rate": 2.065612518371792e-06, + "loss": 0.869187, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12268066, + "step": 8397, + "time_per_iteration": 2.5829713344573975 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01271123, + "balance_loss_clip": 0.06278492, + "balance_loss_mlp": 0.01258571, + "epoch": 0.5049150759056065, + "flos": 21840175336320.0, + "grad_norm": 1.4916236371554883, + "language_loss": 0.66563869, + "learning_rate": 2.065223265084376e-06, + "loss": 0.7426517, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12554932, + "step": 8398, + "time_per_iteration": 2.5790011882781982 + }, + { + "auxiliary_loss_clip": 0.06432331, + "auxiliary_loss_mlp": 0.01272223, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01259688, + "epoch": 0.5049751991582745, + "flos": 21691652774400.0, + "grad_norm": 1.5799272085735376, + "language_loss": 0.72252852, + "learning_rate": 2.064834009323688e-06, + "loss": 0.79957408, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12524414, + "step": 8399, + "time_per_iteration": 2.5528035163879395 + }, + { + "auxiliary_loss_clip": 0.06433836, + "auxiliary_loss_mlp": 0.01270059, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5050353224109424, + "flos": 21365495556480.0, + "grad_norm": 1.7587629772693838, + "language_loss": 0.81515628, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.89219522, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12792969, + "step": 8400, + "time_per_iteration": 2.550828456878662 + }, + { + "auxiliary_loss_clip": 0.06428652, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01254852, + "epoch": 0.5050954456636104, + "flos": 22826655054720.0, + "grad_norm": 2.5272013560823403, + "language_loss": 0.79016161, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.86711431, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11779785, + "step": 8401, + "time_per_iteration": 2.525132894515991 + }, + { + "auxiliary_loss_clip": 0.06433861, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01252778, + "epoch": 0.5051555689162783, + "flos": 30456513411840.0, + "grad_norm": 1.509144939938127, + "language_loss": 0.70489848, + "learning_rate": 2.063666227349593e-06, + "loss": 0.7818898, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.125, + "step": 8402, + "time_per_iteration": 4.0306360721588135 + }, + { + "auxiliary_loss_clip": 0.06429238, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06274545, + "balance_loss_mlp": 0.01254915, + "epoch": 0.5052156921689464, + "flos": 21294315912960.0, + "grad_norm": 1.5960111955062717, + "language_loss": 0.6935674, + "learning_rate": 2.063276961843422e-06, + "loss": 0.77053005, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12121582, + "step": 8403, + "time_per_iteration": 2.558231830596924 + }, + { + "auxiliary_loss_clip": 0.06433211, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01255799, + "epoch": 0.5052758154216143, + "flos": 25088106499200.0, + "grad_norm": 1.463323664554185, + "language_loss": 0.86018717, + "learning_rate": 2.062887693937781e-06, + "loss": 0.93719262, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11547852, + "step": 8404, + "time_per_iteration": 2.618649959564209 + }, + { + "auxiliary_loss_clip": 0.06428184, + "auxiliary_loss_mlp": 0.01270079, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01258092, + "epoch": 0.5053359386742823, + "flos": 20891612390400.0, + "grad_norm": 1.5475179634828664, + "language_loss": 0.75802314, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.83500576, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11987305, + "step": 8405, + "time_per_iteration": 2.5067524909973145 + }, + { + "auxiliary_loss_clip": 0.0643079, + "auxiliary_loss_mlp": 0.01267126, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01253882, + "epoch": 0.5053960619269503, + "flos": 37752499975680.0, + "grad_norm": 1.6248618607930092, + "language_loss": 0.73678941, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.81376863, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13250732, + "step": 8406, + "time_per_iteration": 2.8841259479522705 + }, + { + "auxiliary_loss_clip": 0.06424634, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01254662, + "epoch": 0.5054561851796182, + "flos": 23520617769600.0, + "grad_norm": 1.7553784713680058, + "language_loss": 0.77329504, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.85021389, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12579346, + "step": 8407, + "time_per_iteration": 2.5749242305755615 + }, + { + "auxiliary_loss_clip": 0.06430455, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.01255434, + "epoch": 0.5055163084322862, + "flos": 30418261223040.0, + "grad_norm": 1.7587183909270583, + "language_loss": 0.63584411, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.71282065, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.11767578, + "step": 8408, + "time_per_iteration": 2.5872433185577393 + }, + { + "auxiliary_loss_clip": 0.06432275, + "auxiliary_loss_mlp": 0.01267048, + "balance_loss_clip": 0.06279387, + "balance_loss_mlp": 0.01253387, + "epoch": 0.5055764316849541, + "flos": 20264720469120.0, + "grad_norm": 2.4280351300793086, + "language_loss": 0.63813823, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.71513146, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.13659668, + "step": 8409, + "time_per_iteration": 2.5165858268737793 + }, + { + "auxiliary_loss_clip": 0.064235, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273322, + "balance_loss_mlp": 0.01254895, + "epoch": 0.5056365549376222, + "flos": 26078611213440.0, + "grad_norm": 1.3852804971458688, + "language_loss": 0.71039546, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.78729057, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11132812, + "step": 8410, + "time_per_iteration": 2.594809055328369 + }, + { + "auxiliary_loss_clip": 0.0643055, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.0627602, + "balance_loss_mlp": 0.01254437, + "epoch": 0.5056966781902901, + "flos": 19284739441920.0, + "grad_norm": 1.6144456520966346, + "language_loss": 0.79591584, + "learning_rate": 2.060162752653113e-06, + "loss": 0.87289482, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12921143, + "step": 8411, + "time_per_iteration": 2.53426194190979 + }, + { + "auxiliary_loss_clip": 0.06433219, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06276312, + "balance_loss_mlp": 0.01254979, + "epoch": 0.5057568014429581, + "flos": 21329507427840.0, + "grad_norm": 1.7389096144894618, + "language_loss": 0.81907368, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.89609325, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13757324, + "step": 8412, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06429601, + "auxiliary_loss_mlp": 0.01270568, + "balance_loss_clip": 0.0627761, + "balance_loss_mlp": 0.01258134, + "epoch": 0.505816924695626, + "flos": 17499351369600.0, + "grad_norm": 1.7713461187517285, + "language_loss": 0.80336094, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.88036257, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12438965, + "step": 8413, + "time_per_iteration": 2.524210214614868 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.01274079, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01260328, + "epoch": 0.505877047948294, + "flos": 21148434754560.0, + "grad_norm": 1.7829708596435327, + "language_loss": 0.80812234, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.885144, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1373291, + "step": 8414, + "time_per_iteration": 2.5200514793395996 + }, + { + "auxiliary_loss_clip": 0.06426316, + "auxiliary_loss_mlp": 0.01270081, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01257468, + "epoch": 0.5059371712009619, + "flos": 36357824292480.0, + "grad_norm": 2.3266509400680935, + "language_loss": 0.62741381, + "learning_rate": 2.058605592832528e-06, + "loss": 0.70437777, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12615967, + "step": 8415, + "time_per_iteration": 2.676204204559326 + }, + { + "auxiliary_loss_clip": 0.06428116, + "auxiliary_loss_mlp": 0.01272149, + "balance_loss_clip": 0.06274984, + "balance_loss_mlp": 0.01259882, + "epoch": 0.50599729445363, + "flos": 22679809574400.0, + "grad_norm": 1.4983327127759412, + "language_loss": 0.82398355, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.90098619, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12261963, + "step": 8416, + "time_per_iteration": 2.540487289428711 + }, + { + "auxiliary_loss_clip": 0.06427394, + "auxiliary_loss_mlp": 0.01269018, + "balance_loss_clip": 0.06278178, + "balance_loss_mlp": 0.01257705, + "epoch": 0.5060574177062979, + "flos": 22754553016320.0, + "grad_norm": 1.8321417063208305, + "language_loss": 0.79700905, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.87397313, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11328125, + "step": 8417, + "time_per_iteration": 2.5462777614593506 + }, + { + "auxiliary_loss_clip": 0.06425334, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06277245, + "balance_loss_mlp": 0.01256875, + "epoch": 0.5061175409589659, + "flos": 21659689641600.0, + "grad_norm": 1.7824010317095476, + "language_loss": 0.63313794, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.71007824, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11816406, + "step": 8418, + "time_per_iteration": 2.5203146934509277 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01270126, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01257877, + "epoch": 0.5061776642116339, + "flos": 21622653336960.0, + "grad_norm": 1.6210660838966935, + "language_loss": 0.77937323, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.85639775, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12249756, + "step": 8419, + "time_per_iteration": 2.549057722091675 + }, + { + "auxiliary_loss_clip": 0.06433055, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01259955, + "epoch": 0.5062377874643018, + "flos": 24433276440960.0, + "grad_norm": 1.7091767496398438, + "language_loss": 0.77142859, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.8484863, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12762451, + "step": 8420, + "time_per_iteration": 2.533263921737671 + }, + { + "auxiliary_loss_clip": 0.06430572, + "auxiliary_loss_mlp": 0.0127647, + "balance_loss_clip": 0.06276705, + "balance_loss_mlp": 0.01264311, + "epoch": 0.5062979107169698, + "flos": 22530322690560.0, + "grad_norm": 1.6514243222666503, + "language_loss": 0.77777469, + "learning_rate": 2.056269786726999e-06, + "loss": 0.85484511, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.121521, + "step": 8421, + "time_per_iteration": 2.535022497177124 + }, + { + "auxiliary_loss_clip": 0.06429385, + "auxiliary_loss_mlp": 0.01273249, + "balance_loss_clip": 0.06276778, + "balance_loss_mlp": 0.01261895, + "epoch": 0.5063580339696377, + "flos": 24578947964160.0, + "grad_norm": 1.4350674480860695, + "language_loss": 0.67189109, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.74891746, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11352539, + "step": 8422, + "time_per_iteration": 2.555051803588867 + }, + { + "auxiliary_loss_clip": 0.064266, + "auxiliary_loss_mlp": 0.01271001, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01259241, + "epoch": 0.5064181572223058, + "flos": 22601837750400.0, + "grad_norm": 1.5827559778751017, + "language_loss": 0.81783563, + "learning_rate": 2.05549116746431e-06, + "loss": 0.89481163, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11755371, + "step": 8423, + "time_per_iteration": 2.606844663619995 + }, + { + "auxiliary_loss_clip": 0.06427386, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.01256411, + "epoch": 0.5064782804749737, + "flos": 26002148762880.0, + "grad_norm": 2.1055931359181086, + "language_loss": 0.74535251, + "learning_rate": 2.055101854669237e-06, + "loss": 0.82231486, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12451172, + "step": 8424, + "time_per_iteration": 2.5353689193725586 + }, + { + "auxiliary_loss_clip": 0.06427233, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.0125268, + "epoch": 0.5065384037276417, + "flos": 28561358090880.0, + "grad_norm": 1.333495130602937, + "language_loss": 0.71332014, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.79024142, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12231445, + "step": 8425, + "time_per_iteration": 2.624431610107422 + }, + { + "auxiliary_loss_clip": 0.06429943, + "auxiliary_loss_mlp": 0.01268875, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01257187, + "epoch": 0.5065985269803096, + "flos": 22972620067200.0, + "grad_norm": 1.8777832339890803, + "language_loss": 0.78901541, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.86600357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11700439, + "step": 8426, + "time_per_iteration": 3.936661958694458 + }, + { + "auxiliary_loss_clip": 0.06432042, + "auxiliary_loss_mlp": 0.0127276, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01260768, + "epoch": 0.5066586502329776, + "flos": 21613680950400.0, + "grad_norm": 2.2511428758914325, + "language_loss": 0.7803759, + "learning_rate": 2.053933903806265e-06, + "loss": 0.85742396, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12005615, + "step": 8427, + "time_per_iteration": 2.5481557846069336 + }, + { + "auxiliary_loss_clip": 0.06424822, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01255268, + "epoch": 0.5067187734856455, + "flos": 20346214164480.0, + "grad_norm": 1.5242931798978783, + "language_loss": 0.719284, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.79620224, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11737061, + "step": 8428, + "time_per_iteration": 2.5370116233825684 + }, + { + "auxiliary_loss_clip": 0.06427782, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5067788967383136, + "flos": 28848801922560.0, + "grad_norm": 1.7598513800416933, + "language_loss": 0.83218622, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.90915114, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.10717773, + "step": 8429, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06435312, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.06276707, + "balance_loss_mlp": 0.01254013, + "epoch": 0.5068390199909815, + "flos": 32457997964160.0, + "grad_norm": 4.868596583088969, + "language_loss": 0.7373606, + "learning_rate": 2.052765934536682e-06, + "loss": 0.8143819, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12805176, + "step": 8430, + "time_per_iteration": 4.062525749206543 + }, + { + "auxiliary_loss_clip": 0.06428299, + "auxiliary_loss_mlp": 0.01270046, + "balance_loss_clip": 0.06275186, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5068991432436495, + "flos": 23152896126720.0, + "grad_norm": 1.801463516744859, + "language_loss": 0.76942408, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.84640753, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1171875, + "step": 8431, + "time_per_iteration": 2.535198211669922 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.0125488, + "epoch": 0.5069592664963174, + "flos": 19941917414400.0, + "grad_norm": 1.5385752235820749, + "language_loss": 0.72917402, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.80610371, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11901855, + "step": 8432, + "time_per_iteration": 2.5343048572540283 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06253257, + "balance_loss_mlp": 0.01250496, + "epoch": 0.5070193897489854, + "flos": 65812539888000.0, + "grad_norm": 0.7543358557352665, + "language_loss": 0.63621199, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.71192724, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.01824951, + "step": 8433, + "time_per_iteration": 3.1825270652770996 + }, + { + "auxiliary_loss_clip": 0.06432432, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06279546, + "balance_loss_mlp": 0.01254414, + "epoch": 0.5070795130016534, + "flos": 17281158537600.0, + "grad_norm": 2.2002665512489505, + "language_loss": 0.77719331, + "learning_rate": 2.051208614233681e-06, + "loss": 0.85418689, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12512207, + "step": 8434, + "time_per_iteration": 2.51298451423645 + }, + { + "auxiliary_loss_clip": 0.06435563, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01253047, + "epoch": 0.5071396362543213, + "flos": 21076416570240.0, + "grad_norm": 1.9257186196996396, + "language_loss": 0.7107513, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.78775942, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12207031, + "step": 8435, + "time_per_iteration": 3.9952967166900635 + }, + { + "auxiliary_loss_clip": 0.06431434, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06278223, + "balance_loss_mlp": 0.01256646, + "epoch": 0.5071997595069894, + "flos": 23150841701760.0, + "grad_norm": 1.974114732671287, + "language_loss": 0.72623628, + "learning_rate": 2.050429942372112e-06, + "loss": 0.80324566, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.128479, + "step": 8436, + "time_per_iteration": 2.5126936435699463 + }, + { + "auxiliary_loss_clip": 0.06431168, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06278354, + "balance_loss_mlp": 0.01253449, + "epoch": 0.5072598827596573, + "flos": 22753756402560.0, + "grad_norm": 2.390958224451536, + "language_loss": 0.84374195, + "learning_rate": 2.050040603565483e-06, + "loss": 0.92071497, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12701416, + "step": 8437, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.06423598, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06273607, + "balance_loss_mlp": 0.01254128, + "epoch": 0.5073200060123253, + "flos": 22573102999680.0, + "grad_norm": 1.4207198809320167, + "language_loss": 0.80947453, + "learning_rate": 2.049651262861309e-06, + "loss": 0.88636929, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11749268, + "step": 8438, + "time_per_iteration": 2.5992414951324463 + }, + { + "auxiliary_loss_clip": 0.06431951, + "auxiliary_loss_mlp": 0.01267455, + "balance_loss_clip": 0.06277303, + "balance_loss_mlp": 0.0125458, + "epoch": 0.5073801292649932, + "flos": 25812481046400.0, + "grad_norm": 1.639362892711676, + "language_loss": 0.7992267, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.87622082, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12872314, + "step": 8439, + "time_per_iteration": 2.5635995864868164 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06272503, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5074402525176612, + "flos": 25380916992000.0, + "grad_norm": 1.6123120964481592, + "language_loss": 0.71044374, + "learning_rate": 2.048872575819383e-06, + "loss": 0.78732479, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11560059, + "step": 8440, + "time_per_iteration": 2.54082989692688 + }, + { + "auxiliary_loss_clip": 0.0642738, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01254278, + "epoch": 0.5075003757703291, + "flos": 26071064346240.0, + "grad_norm": 1.625029424987906, + "language_loss": 0.71058178, + "learning_rate": 2.048483229511158e-06, + "loss": 0.78751576, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11743164, + "step": 8441, + "time_per_iteration": 2.5597851276397705 + }, + { + "auxiliary_loss_clip": 0.06432067, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01255608, + "epoch": 0.5075604990229972, + "flos": 21841936272000.0, + "grad_norm": 1.6251927502787415, + "language_loss": 0.64299369, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.71999681, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12634277, + "step": 8442, + "time_per_iteration": 3.9658992290496826 + }, + { + "auxiliary_loss_clip": 0.06421914, + "auxiliary_loss_mlp": 0.01270692, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01259475, + "epoch": 0.5076206222756651, + "flos": 31986923909760.0, + "grad_norm": 1.4468343781265969, + "language_loss": 0.71796834, + "learning_rate": 2.047704531394006e-06, + "loss": 0.7948944, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 8443, + "time_per_iteration": 2.6133296489715576 + }, + { + "auxiliary_loss_clip": 0.06430129, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5076807455283331, + "flos": 36913033445760.0, + "grad_norm": 1.2663152678698668, + "language_loss": 0.62379253, + "learning_rate": 2.047315179614607e-06, + "loss": 0.70077264, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12390137, + "step": 8444, + "time_per_iteration": 2.670844554901123 + }, + { + "auxiliary_loss_clip": 0.06426448, + "auxiliary_loss_mlp": 0.01266149, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01255158, + "epoch": 0.507740868781001, + "flos": 29870263520640.0, + "grad_norm": 1.5635527032998127, + "language_loss": 0.64163882, + "learning_rate": 2.046925826041012e-06, + "loss": 0.71856481, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.10992432, + "step": 8445, + "time_per_iteration": 2.564972162246704 + }, + { + "auxiliary_loss_clip": 0.06326441, + "auxiliary_loss_mlp": 0.01258393, + "balance_loss_clip": 0.06260093, + "balance_loss_mlp": 0.0125657, + "epoch": 0.507800992033669, + "flos": 61935872014080.0, + "grad_norm": 0.8045039829713045, + "language_loss": 0.61588788, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.69173622, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01817322, + "step": 8446, + "time_per_iteration": 3.1747779846191406 + }, + { + "auxiliary_loss_clip": 0.06424413, + "auxiliary_loss_mlp": 0.01266643, + "balance_loss_clip": 0.06272733, + "balance_loss_mlp": 0.01254394, + "epoch": 0.507861115286337, + "flos": 20706137377920.0, + "grad_norm": 4.618603604158377, + "language_loss": 0.80737472, + "learning_rate": 2.04614711357029e-06, + "loss": 0.88428527, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12243652, + "step": 8447, + "time_per_iteration": 2.510443687438965 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01255775, + "epoch": 0.507921238539005, + "flos": 30854982303360.0, + "grad_norm": 1.2702922663182385, + "language_loss": 0.70493698, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.78183186, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11456299, + "step": 8448, + "time_per_iteration": 2.6021034717559814 + }, + { + "auxiliary_loss_clip": 0.06427675, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.0627776, + "balance_loss_mlp": 0.0125745, + "epoch": 0.507981361791673, + "flos": 35709031728000.0, + "grad_norm": 1.3111664343686333, + "language_loss": 0.72171003, + "learning_rate": 2.045368394099955e-06, + "loss": 0.79867339, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11199951, + "step": 8449, + "time_per_iteration": 2.6752874851226807 + }, + { + "auxiliary_loss_clip": 0.06426987, + "auxiliary_loss_mlp": 0.01268113, + "balance_loss_clip": 0.06274859, + "balance_loss_mlp": 0.0125686, + "epoch": 0.5080414850443409, + "flos": 27168694905600.0, + "grad_norm": 1.3940572087719376, + "language_loss": 0.73039591, + "learning_rate": 2.044979031776844e-06, + "loss": 0.80734688, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11254883, + "step": 8450, + "time_per_iteration": 2.6428375244140625 + }, + { + "auxiliary_loss_clip": 0.06430449, + "auxiliary_loss_mlp": 0.0127298, + "balance_loss_clip": 0.06278583, + "balance_loss_mlp": 0.01261148, + "epoch": 0.5081016082970089, + "flos": 27091855111680.0, + "grad_norm": 1.6054602673211236, + "language_loss": 0.7744205, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.85145479, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1184082, + "step": 8451, + "time_per_iteration": 2.6066558361053467 + }, + { + "auxiliary_loss_clip": 0.06429529, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276423, + "balance_loss_mlp": 0.01254531, + "epoch": 0.5081617315496768, + "flos": 22863104380800.0, + "grad_norm": 1.825930217148951, + "language_loss": 0.85374677, + "learning_rate": 2.044200302028559e-06, + "loss": 0.930709, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12158203, + "step": 8452, + "time_per_iteration": 2.5062003135681152 + }, + { + "auxiliary_loss_clip": 0.06431726, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06276073, + "balance_loss_mlp": 0.01254716, + "epoch": 0.5082218548023448, + "flos": 16286167630080.0, + "grad_norm": 2.3752555926719343, + "language_loss": 0.77806371, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.85505283, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12463379, + "step": 8453, + "time_per_iteration": 2.4981954097747803 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.0125774, + "epoch": 0.5082819780550127, + "flos": 24467419779840.0, + "grad_norm": 1.5957908763151711, + "language_loss": 0.76932752, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.84632009, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1072998, + "step": 8454, + "time_per_iteration": 2.6134133338928223 + }, + { + "auxiliary_loss_clip": 0.06431732, + "auxiliary_loss_mlp": 0.01271277, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01259118, + "epoch": 0.5083421013076808, + "flos": 23409844272000.0, + "grad_norm": 1.4822981638740835, + "language_loss": 0.89621413, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.97324431, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.1217041, + "step": 8455, + "time_per_iteration": 2.6085920333862305 + }, + { + "auxiliary_loss_clip": 0.06434034, + "auxiliary_loss_mlp": 0.01274373, + "balance_loss_clip": 0.06275303, + "balance_loss_mlp": 0.01260831, + "epoch": 0.5084022245603487, + "flos": 23878528485120.0, + "grad_norm": 1.6442671341978696, + "language_loss": 0.62785953, + "learning_rate": 2.042642822537149e-06, + "loss": 0.7049436, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13555908, + "step": 8456, + "time_per_iteration": 2.5377745628356934 + }, + { + "auxiliary_loss_clip": 0.06329988, + "auxiliary_loss_mlp": 0.01255905, + "balance_loss_clip": 0.06263152, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5084623478130167, + "flos": 62891352921600.0, + "grad_norm": 0.8103581861082657, + "language_loss": 0.62548244, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.70134139, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.02194214, + "step": 8457, + "time_per_iteration": 3.0378763675689697 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01271319, + "balance_loss_clip": 0.06280852, + "balance_loss_mlp": 0.01258337, + "epoch": 0.5085224710656846, + "flos": 22352688034560.0, + "grad_norm": 1.5276658426580998, + "language_loss": 0.67559206, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.75267512, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12976074, + "step": 8458, + "time_per_iteration": 2.5329530239105225 + }, + { + "auxiliary_loss_clip": 0.06432781, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0627652, + "balance_loss_mlp": 0.01260015, + "epoch": 0.5085825943183526, + "flos": 26073202625280.0, + "grad_norm": 1.618055128351248, + "language_loss": 0.77449083, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.85154486, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1260376, + "step": 8459, + "time_per_iteration": 2.5590224266052246 + }, + { + "auxiliary_loss_clip": 0.06437792, + "auxiliary_loss_mlp": 0.01271084, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01258132, + "epoch": 0.5086427175710206, + "flos": 17426494644480.0, + "grad_norm": 2.2202109072156664, + "language_loss": 0.81101096, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.88809973, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12945557, + "step": 8460, + "time_per_iteration": 2.4797065258026123 + }, + { + "auxiliary_loss_clip": 0.06432672, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01259968, + "epoch": 0.5087028408236886, + "flos": 20638102262400.0, + "grad_norm": 1.6011145053716882, + "language_loss": 0.69150776, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.76856101, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12677002, + "step": 8461, + "time_per_iteration": 2.5423507690429688 + }, + { + "auxiliary_loss_clip": 0.06423958, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06275716, + "balance_loss_mlp": 0.01258052, + "epoch": 0.5087629640763566, + "flos": 25600996540800.0, + "grad_norm": 1.5704547594862186, + "language_loss": 0.76788783, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.84482986, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12200928, + "step": 8462, + "time_per_iteration": 2.5558974742889404 + }, + { + "auxiliary_loss_clip": 0.06431352, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06279621, + "balance_loss_mlp": 0.01255251, + "epoch": 0.5088230873290245, + "flos": 13266743351040.0, + "grad_norm": 1.98943246577739, + "language_loss": 0.81940925, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.89639473, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11938477, + "step": 8463, + "time_per_iteration": 2.5092854499816895 + }, + { + "auxiliary_loss_clip": 0.06429717, + "auxiliary_loss_mlp": 0.01268295, + "balance_loss_clip": 0.06277439, + "balance_loss_mlp": 0.01255974, + "epoch": 0.5088832105816925, + "flos": 20048959405440.0, + "grad_norm": 4.395577464341562, + "language_loss": 0.76639092, + "learning_rate": 2.039527786882341e-06, + "loss": 0.84337103, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12310791, + "step": 8464, + "time_per_iteration": 2.5100886821746826 + }, + { + "auxiliary_loss_clip": 0.06332754, + "auxiliary_loss_mlp": 0.01251908, + "balance_loss_clip": 0.06266724, + "balance_loss_mlp": 0.01250196, + "epoch": 0.5089433338343604, + "flos": 67445072184960.0, + "grad_norm": 0.674227101372006, + "language_loss": 0.59172922, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.66757584, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.01716614, + "step": 8465, + "time_per_iteration": 3.288703441619873 + }, + { + "auxiliary_loss_clip": 0.06429654, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06277246, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5090034570870284, + "flos": 22716845879040.0, + "grad_norm": 1.7766724873518385, + "language_loss": 0.80341208, + "learning_rate": 2.038749012684354e-06, + "loss": 0.88037896, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12005615, + "step": 8466, + "time_per_iteration": 3.9034652709960938 + }, + { + "auxiliary_loss_clip": 0.06428038, + "auxiliary_loss_mlp": 0.01262494, + "balance_loss_clip": 0.06276771, + "balance_loss_mlp": 0.01250603, + "epoch": 0.5090635803396963, + "flos": 20451537146880.0, + "grad_norm": 1.506058765425311, + "language_loss": 0.78925973, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.86616498, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11895752, + "step": 8467, + "time_per_iteration": 2.483701229095459 + }, + { + "auxiliary_loss_clip": 0.06425558, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01257565, + "epoch": 0.5091237035923644, + "flos": 23775637271040.0, + "grad_norm": 1.593164773968791, + "language_loss": 0.74572229, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.82266819, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11468506, + "step": 8468, + "time_per_iteration": 2.550657033920288 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01264118, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01252108, + "epoch": 0.5091838268450323, + "flos": 18332990040960.0, + "grad_norm": 1.7522760366327397, + "language_loss": 0.78574747, + "learning_rate": 2.03758084040404e-06, + "loss": 0.86264038, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12011719, + "step": 8469, + "time_per_iteration": 2.4776134490966797 + }, + { + "auxiliary_loss_clip": 0.06431125, + "auxiliary_loss_mlp": 0.012685, + "balance_loss_clip": 0.0627888, + "balance_loss_mlp": 0.01256526, + "epoch": 0.5092439500977003, + "flos": 29064982256640.0, + "grad_norm": 1.429622552318455, + "language_loss": 0.6959703, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7729665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11968994, + "step": 8470, + "time_per_iteration": 4.06356954574585 + }, + { + "auxiliary_loss_clip": 0.06432179, + "auxiliary_loss_mlp": 0.01268896, + "balance_loss_clip": 0.06276524, + "balance_loss_mlp": 0.01256278, + "epoch": 0.5093040733503682, + "flos": 13559134573440.0, + "grad_norm": 1.739958995441318, + "language_loss": 0.73736298, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.81437373, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12615967, + "step": 8471, + "time_per_iteration": 2.5252416133880615 + }, + { + "auxiliary_loss_clip": 0.06330768, + "auxiliary_loss_mlp": 0.01255323, + "balance_loss_clip": 0.06264758, + "balance_loss_mlp": 0.01253313, + "epoch": 0.5093641966030362, + "flos": 68927838837120.0, + "grad_norm": 0.738097810584446, + "language_loss": 0.58042324, + "learning_rate": 2.036412655298103e-06, + "loss": 0.65628415, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.02009583, + "step": 8472, + "time_per_iteration": 3.1610372066497803 + }, + { + "auxiliary_loss_clip": 0.06430018, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5094243198557042, + "flos": 21587545676160.0, + "grad_norm": 1.8344067804800992, + "language_loss": 0.69000626, + "learning_rate": 2.03602325748156e-06, + "loss": 0.76696956, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11358643, + "step": 8473, + "time_per_iteration": 2.5834267139434814 + }, + { + "auxiliary_loss_clip": 0.06430315, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.06279565, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5094844431083722, + "flos": 28848382652160.0, + "grad_norm": 2.5664905714857422, + "language_loss": 0.85103536, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.92801011, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12011719, + "step": 8474, + "time_per_iteration": 2.5577685832977295 + }, + { + "auxiliary_loss_clip": 0.06432322, + "auxiliary_loss_mlp": 0.0126557, + "balance_loss_clip": 0.06278027, + "balance_loss_mlp": 0.01253488, + "epoch": 0.5095445663610402, + "flos": 14981454904320.0, + "grad_norm": 1.910358455820602, + "language_loss": 0.64868319, + "learning_rate": 2.035244457765222e-06, + "loss": 0.72566211, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12091064, + "step": 8475, + "time_per_iteration": 3.9494359493255615 + }, + { + "auxiliary_loss_clip": 0.06435733, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5096046896137081, + "flos": 20783354515200.0, + "grad_norm": 2.1677913618760623, + "language_loss": 0.8248105, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.90185243, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12689209, + "step": 8476, + "time_per_iteration": 2.533986806869507 + }, + { + "auxiliary_loss_clip": 0.06432153, + "auxiliary_loss_mlp": 0.01267228, + "balance_loss_clip": 0.06275326, + "balance_loss_mlp": 0.01254628, + "epoch": 0.5096648128663761, + "flos": 23191735294080.0, + "grad_norm": 2.112211155301917, + "language_loss": 0.81339389, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.89038771, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12609863, + "step": 8477, + "time_per_iteration": 2.614363193511963 + }, + { + "auxiliary_loss_clip": 0.06429507, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.0627466, + "balance_loss_mlp": 0.01254741, + "epoch": 0.509724936119044, + "flos": 22315945219200.0, + "grad_norm": 1.7511302636686703, + "language_loss": 0.61918831, + "learning_rate": 2.034076248204082e-06, + "loss": 0.69616115, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13024902, + "step": 8478, + "time_per_iteration": 2.5054080486297607 + }, + { + "auxiliary_loss_clip": 0.06424017, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01256136, + "epoch": 0.509785059371712, + "flos": 26294372277120.0, + "grad_norm": 1.8013233320362476, + "language_loss": 0.66670853, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.74362785, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 8479, + "time_per_iteration": 2.5773558616638184 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01266645, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01254795, + "epoch": 0.50984518262438, + "flos": 22970942985600.0, + "grad_norm": 1.5048945656562989, + "language_loss": 0.69523573, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.77217555, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1184082, + "step": 8480, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06433358, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06277278, + "balance_loss_mlp": 0.01252908, + "epoch": 0.509905305877048, + "flos": 26220551230080.0, + "grad_norm": 1.695627830792001, + "language_loss": 0.79513025, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.87211168, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11865234, + "step": 8481, + "time_per_iteration": 3.9862852096557617 + }, + { + "auxiliary_loss_clip": 0.06423856, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06274414, + "balance_loss_mlp": 0.01255186, + "epoch": 0.5099654291297159, + "flos": 20346381872640.0, + "grad_norm": 1.4463685523965593, + "language_loss": 0.83447778, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.91138661, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1184082, + "step": 8482, + "time_per_iteration": 2.539057970046997 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257682, + "epoch": 0.5100255523823839, + "flos": 29061711947520.0, + "grad_norm": 1.7174746607832896, + "language_loss": 0.85923511, + "learning_rate": 2.032129206622238e-06, + "loss": 0.93625677, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12243652, + "step": 8483, + "time_per_iteration": 2.5567803382873535 + }, + { + "auxiliary_loss_clip": 0.06428108, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256352, + "epoch": 0.5100856756350518, + "flos": 22462539137280.0, + "grad_norm": 3.7192784343186367, + "language_loss": 0.83011222, + "learning_rate": 2.031739794591775e-06, + "loss": 0.90707278, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.11590576, + "step": 8484, + "time_per_iteration": 2.50913143157959 + }, + { + "auxiliary_loss_clip": 0.0642792, + "auxiliary_loss_mlp": 0.0126741, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5101457988877198, + "flos": 19176942764160.0, + "grad_norm": 1.8545423824290383, + "language_loss": 0.81929463, + "learning_rate": 2.031350381357736e-06, + "loss": 0.89624798, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12463379, + "step": 8485, + "time_per_iteration": 2.479165554046631 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266312, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01254522, + "epoch": 0.5102059221403878, + "flos": 14871645728640.0, + "grad_norm": 1.8580884452241668, + "language_loss": 0.73778898, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.81466365, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11791992, + "step": 8486, + "time_per_iteration": 2.502035140991211 + }, + { + "auxiliary_loss_clip": 0.06432486, + "auxiliary_loss_mlp": 0.01268204, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5102660453930558, + "flos": 22966876062720.0, + "grad_norm": 1.455931130318143, + "language_loss": 0.6993084, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.77631527, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13092041, + "step": 8487, + "time_per_iteration": 2.5022764205932617 + }, + { + "auxiliary_loss_clip": 0.06425266, + "auxiliary_loss_mlp": 0.01265042, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01252072, + "epoch": 0.5103261686457238, + "flos": 23156082581760.0, + "grad_norm": 2.025146562514191, + "language_loss": 0.72757244, + "learning_rate": 2.030182134581827e-06, + "loss": 0.80447549, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12963867, + "step": 8488, + "time_per_iteration": 2.5181195735931396 + }, + { + "auxiliary_loss_clip": 0.06435129, + "auxiliary_loss_mlp": 0.01271711, + "balance_loss_clip": 0.06278089, + "balance_loss_mlp": 0.01259861, + "epoch": 0.5103862918983917, + "flos": 14324444640000.0, + "grad_norm": 1.9274143081394266, + "language_loss": 0.69714773, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.77421612, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.11846924, + "step": 8489, + "time_per_iteration": 2.491626739501953 + }, + { + "auxiliary_loss_clip": 0.06427855, + "auxiliary_loss_mlp": 0.01262645, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01251231, + "epoch": 0.5104464151510597, + "flos": 25855638698880.0, + "grad_norm": 1.7641928011440773, + "language_loss": 0.73334658, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.81025159, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.11407471, + "step": 8490, + "time_per_iteration": 2.6192476749420166 + }, + { + "auxiliary_loss_clip": 0.06422485, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01256628, + "epoch": 0.5105065384037276, + "flos": 21659354225280.0, + "grad_norm": 1.995020059533993, + "language_loss": 0.8080864, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.8849923, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8491, + "time_per_iteration": 2.5444910526275635 + }, + { + "auxiliary_loss_clip": 0.0642098, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06274496, + "balance_loss_mlp": 0.01257089, + "epoch": 0.5105666616563956, + "flos": 22498066068480.0, + "grad_norm": 2.247071959069697, + "language_loss": 0.79263282, + "learning_rate": 2.028624456259728e-06, + "loss": 0.86953026, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11676025, + "step": 8492, + "time_per_iteration": 2.656888008117676 + }, + { + "auxiliary_loss_clip": 0.06433547, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01257838, + "epoch": 0.5106267849090635, + "flos": 22462371429120.0, + "grad_norm": 1.9309641209432507, + "language_loss": 0.77830237, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.85534871, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13256836, + "step": 8493, + "time_per_iteration": 2.550326347351074 + }, + { + "auxiliary_loss_clip": 0.06427996, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06275648, + "balance_loss_mlp": 0.01252879, + "epoch": 0.5106869081617316, + "flos": 23553335589120.0, + "grad_norm": 1.7342765336142327, + "language_loss": 0.84044284, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.91737616, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12457275, + "step": 8494, + "time_per_iteration": 2.582463026046753 + }, + { + "auxiliary_loss_clip": 0.06430838, + "auxiliary_loss_mlp": 0.0126671, + "balance_loss_clip": 0.0627555, + "balance_loss_mlp": 0.0125492, + "epoch": 0.5107470314143995, + "flos": 26799547743360.0, + "grad_norm": 2.0062643152671877, + "language_loss": 0.79773927, + "learning_rate": 2.027456186069326e-06, + "loss": 0.87471473, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11798096, + "step": 8495, + "time_per_iteration": 2.5472564697265625 + }, + { + "auxiliary_loss_clip": 0.06425454, + "auxiliary_loss_mlp": 0.01268533, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01256308, + "epoch": 0.5108071546670675, + "flos": 25746877699200.0, + "grad_norm": 1.417654874659872, + "language_loss": 0.78675163, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.86369145, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12231445, + "step": 8496, + "time_per_iteration": 2.5841569900512695 + }, + { + "auxiliary_loss_clip": 0.06422253, + "auxiliary_loss_mlp": 0.01267746, + "balance_loss_clip": 0.06273818, + "balance_loss_mlp": 0.01255998, + "epoch": 0.5108672779197354, + "flos": 18703478868480.0, + "grad_norm": 1.866540646775448, + "language_loss": 0.7912823, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8681823, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11755371, + "step": 8497, + "time_per_iteration": 2.5111966133117676 + }, + { + "auxiliary_loss_clip": 0.06429158, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06277271, + "balance_loss_mlp": 0.01252277, + "epoch": 0.5109274011724034, + "flos": 26695482572160.0, + "grad_norm": 1.6666059931479484, + "language_loss": 0.81941032, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.89634144, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11682129, + "step": 8498, + "time_per_iteration": 2.608631134033203 + }, + { + "auxiliary_loss_clip": 0.06424002, + "auxiliary_loss_mlp": 0.01271992, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01260267, + "epoch": 0.5109875244250714, + "flos": 22790666926080.0, + "grad_norm": 1.6923312462183162, + "language_loss": 0.71301198, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.78997189, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11724854, + "step": 8499, + "time_per_iteration": 2.5150094032287598 + }, + { + "auxiliary_loss_clip": 0.06427284, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255003, + "epoch": 0.5110476476777394, + "flos": 35596958492160.0, + "grad_norm": 1.3954443671639698, + "language_loss": 0.72611153, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.80305135, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11694336, + "step": 8500, + "time_per_iteration": 2.633239269256592 + }, + { + "auxiliary_loss_clip": 0.06435662, + "auxiliary_loss_mlp": 0.01270607, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01256803, + "epoch": 0.5111077709304074, + "flos": 19286751939840.0, + "grad_norm": 2.7349973685574973, + "language_loss": 0.63562721, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.71268988, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13800049, + "step": 8501, + "time_per_iteration": 2.5091230869293213 + }, + { + "auxiliary_loss_clip": 0.06434844, + "auxiliary_loss_mlp": 0.01273353, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01260848, + "epoch": 0.5111678941830753, + "flos": 20674551588480.0, + "grad_norm": 1.8816899756355796, + "language_loss": 0.88057411, + "learning_rate": 2.024730186540907e-06, + "loss": 0.95765609, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12506104, + "step": 8502, + "time_per_iteration": 2.517728090286255 + }, + { + "auxiliary_loss_clip": 0.06425811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01253663, + "epoch": 0.5112280174357433, + "flos": 26295336599040.0, + "grad_norm": 1.4524091598864723, + "language_loss": 0.82627225, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.90318477, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11779785, + "step": 8503, + "time_per_iteration": 2.711451768875122 + }, + { + "auxiliary_loss_clip": 0.06333953, + "auxiliary_loss_mlp": 0.01255603, + "balance_loss_clip": 0.06268184, + "balance_loss_mlp": 0.0125384, + "epoch": 0.5112881406884112, + "flos": 59490706492800.0, + "grad_norm": 0.8512772291593351, + "language_loss": 0.63800937, + "learning_rate": 2.023951320871339e-06, + "loss": 0.71390492, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.65966797, + "router_z_loss_mlp": 0.01766968, + "step": 8504, + "time_per_iteration": 3.1690919399261475 + }, + { + "auxiliary_loss_clip": 0.06425914, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06275845, + "balance_loss_mlp": 0.01253576, + "epoch": 0.5113482639410792, + "flos": 26476073856000.0, + "grad_norm": 1.7986544100736102, + "language_loss": 0.84377933, + "learning_rate": 2.023561886666816e-06, + "loss": 0.92069674, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12261963, + "step": 8505, + "time_per_iteration": 2.5755858421325684 + }, + { + "auxiliary_loss_clip": 0.0643035, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5114083871937471, + "flos": 29903190975360.0, + "grad_norm": 1.7295208629505698, + "language_loss": 0.75707996, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.83404166, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11590576, + "step": 8506, + "time_per_iteration": 3.947927713394165 + }, + { + "auxiliary_loss_clip": 0.0642788, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.06276722, + "balance_loss_mlp": 0.01255303, + "epoch": 0.5114685104464152, + "flos": 24321161278080.0, + "grad_norm": 1.7165713389532073, + "language_loss": 0.58250427, + "learning_rate": 2.022783015592131e-06, + "loss": 0.65946829, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.13214111, + "step": 8507, + "time_per_iteration": 2.5460915565490723 + }, + { + "auxiliary_loss_clip": 0.06432099, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01257023, + "epoch": 0.5115286336990831, + "flos": 17024965079040.0, + "grad_norm": 1.7959155859668763, + "language_loss": 0.8588531, + "learning_rate": 2.022393578751503e-06, + "loss": 0.93586934, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12475586, + "step": 8508, + "time_per_iteration": 2.501931667327881 + }, + { + "auxiliary_loss_clip": 0.06430113, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06279224, + "balance_loss_mlp": 0.012544, + "epoch": 0.5115887569517511, + "flos": 23666121584640.0, + "grad_norm": 1.985741338533524, + "language_loss": 0.72740698, + "learning_rate": 2.022004141061709e-06, + "loss": 0.80437851, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12640381, + "step": 8509, + "time_per_iteration": 3.9570322036743164 + }, + { + "auxiliary_loss_clip": 0.06425552, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06277531, + "balance_loss_mlp": 0.01254476, + "epoch": 0.511648880204419, + "flos": 16112725678080.0, + "grad_norm": 1.6522242028614569, + "language_loss": 0.76532018, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.84222525, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10479736, + "step": 8510, + "time_per_iteration": 2.5000293254852295 + }, + { + "auxiliary_loss_clip": 0.06424148, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.01256402, + "epoch": 0.511709003457087, + "flos": 32643221414400.0, + "grad_norm": 1.8483097722803792, + "language_loss": 0.71295965, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.78987575, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 8511, + "time_per_iteration": 2.5970981121063232 + }, + { + "auxiliary_loss_clip": 0.06426742, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125404, + "epoch": 0.511769126709755, + "flos": 21768492568320.0, + "grad_norm": 1.8966780464465567, + "language_loss": 0.67139721, + "learning_rate": 2.020835823045001e-06, + "loss": 0.74832094, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8512, + "time_per_iteration": 2.5369138717651367 + }, + { + "auxiliary_loss_clip": 0.06426971, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253588, + "epoch": 0.511829249962423, + "flos": 23922231189120.0, + "grad_norm": 1.7695600544803753, + "language_loss": 0.67171764, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.7486496, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12628174, + "step": 8513, + "time_per_iteration": 2.517648220062256 + }, + { + "auxiliary_loss_clip": 0.06423096, + "auxiliary_loss_mlp": 0.01268209, + "balance_loss_clip": 0.06275445, + "balance_loss_mlp": 0.01255948, + "epoch": 0.511889373215091, + "flos": 23732856961920.0, + "grad_norm": 1.8747309224946216, + "language_loss": 0.68931103, + "learning_rate": 2.0200569403921e-06, + "loss": 0.76622409, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1227417, + "step": 8514, + "time_per_iteration": 3.969726085662842 + }, + { + "auxiliary_loss_clip": 0.06422693, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01254357, + "epoch": 0.5119494964677589, + "flos": 28119144568320.0, + "grad_norm": 1.955376754159203, + "language_loss": 0.66104603, + "learning_rate": 2.019667497917424e-06, + "loss": 0.7379272, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11065674, + "step": 8515, + "time_per_iteration": 2.586984872817993 + }, + { + "auxiliary_loss_clip": 0.06415779, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254754, + "epoch": 0.5120096197204269, + "flos": 24980225967360.0, + "grad_norm": 1.8485741123105555, + "language_loss": 0.76016974, + "learning_rate": 2.019278054696955e-06, + "loss": 0.83698404, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10894775, + "step": 8516, + "time_per_iteration": 2.5933895111083984 + }, + { + "auxiliary_loss_clip": 0.06425153, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.0627657, + "balance_loss_mlp": 0.01254136, + "epoch": 0.5120697429730948, + "flos": 17973863441280.0, + "grad_norm": 1.9611042257937292, + "language_loss": 0.78053069, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.85744041, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 8517, + "time_per_iteration": 2.4962363243103027 + }, + { + "auxiliary_loss_clip": 0.06430522, + "auxiliary_loss_mlp": 0.01271394, + "balance_loss_clip": 0.06276728, + "balance_loss_mlp": 0.01259211, + "epoch": 0.5121298662257628, + "flos": 23298651504000.0, + "grad_norm": 1.7759167489555023, + "language_loss": 0.74719632, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.82421547, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12164307, + "step": 8518, + "time_per_iteration": 2.5037240982055664 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256529, + "epoch": 0.5121899894784308, + "flos": 17316769322880.0, + "grad_norm": 1.687169580100827, + "language_loss": 0.78467947, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.86161083, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11706543, + "step": 8519, + "time_per_iteration": 2.524724006652832 + }, + { + "auxiliary_loss_clip": 0.06422982, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253016, + "epoch": 0.5122501127310988, + "flos": 24935978211840.0, + "grad_norm": 1.6239003664198155, + "language_loss": 0.79446238, + "learning_rate": 2.017720274652497e-06, + "loss": 0.87134135, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11907959, + "step": 8520, + "time_per_iteration": 2.522068500518799 + }, + { + "auxiliary_loss_clip": 0.06431363, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06276108, + "balance_loss_mlp": 0.01256151, + "epoch": 0.5123102359837667, + "flos": 18448878637440.0, + "grad_norm": 1.8569595834923718, + "language_loss": 0.81725198, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.89426088, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13366699, + "step": 8521, + "time_per_iteration": 3.956547498703003 + }, + { + "auxiliary_loss_clip": 0.06422685, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.0627308, + "balance_loss_mlp": 0.01253383, + "epoch": 0.5123703592364347, + "flos": 26691625284480.0, + "grad_norm": 3.145804815574879, + "language_loss": 0.68764591, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.7645213, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11462402, + "step": 8522, + "time_per_iteration": 2.53696608543396 + }, + { + "auxiliary_loss_clip": 0.06430639, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06276414, + "balance_loss_mlp": 0.01256039, + "epoch": 0.5124304824891026, + "flos": 28811555982720.0, + "grad_norm": 1.853417160064295, + "language_loss": 0.622962, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.69996071, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13201904, + "step": 8523, + "time_per_iteration": 2.589885950088501 + }, + { + "auxiliary_loss_clip": 0.06424818, + "auxiliary_loss_mlp": 0.01265688, + "balance_loss_clip": 0.06274516, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5124906057417706, + "flos": 21768199079040.0, + "grad_norm": 1.9669486922935226, + "language_loss": 0.77939785, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.85630286, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11004639, + "step": 8524, + "time_per_iteration": 2.506647825241089 + }, + { + "auxiliary_loss_clip": 0.06424855, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06275764, + "balance_loss_mlp": 0.01255344, + "epoch": 0.5125507289944387, + "flos": 18886605966720.0, + "grad_norm": 1.985021925330002, + "language_loss": 0.74904448, + "learning_rate": 2.015773034588706e-06, + "loss": 0.82595634, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10986328, + "step": 8525, + "time_per_iteration": 2.509902000427246 + }, + { + "auxiliary_loss_clip": 0.06429298, + "auxiliary_loss_mlp": 0.01270559, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01258412, + "epoch": 0.5126108522471066, + "flos": 35636761981440.0, + "grad_norm": 1.5788283001431092, + "language_loss": 0.74868685, + "learning_rate": 2.015383584722531e-06, + "loss": 0.82568544, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12127686, + "step": 8526, + "time_per_iteration": 2.640554428100586 + }, + { + "auxiliary_loss_clip": 0.06428048, + "auxiliary_loss_mlp": 0.01267884, + "balance_loss_clip": 0.06275488, + "balance_loss_mlp": 0.01256613, + "epoch": 0.5126709754997746, + "flos": 20196685353600.0, + "grad_norm": 1.5376970768591331, + "language_loss": 0.658445, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.73540437, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.11279297, + "step": 8527, + "time_per_iteration": 2.5079874992370605 + }, + { + "auxiliary_loss_clip": 0.06421998, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01258644, + "epoch": 0.5127310987524425, + "flos": 18594550160640.0, + "grad_norm": 1.4224570841542155, + "language_loss": 0.74258637, + "learning_rate": 2.014604683254908e-06, + "loss": 0.81949556, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10290527, + "step": 8528, + "time_per_iteration": 2.5583620071411133 + }, + { + "auxiliary_loss_clip": 0.06424492, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06275051, + "balance_loss_mlp": 0.01254816, + "epoch": 0.5127912220051105, + "flos": 22461113617920.0, + "grad_norm": 1.747082224822374, + "language_loss": 0.83357608, + "learning_rate": 2.014215231682995e-06, + "loss": 0.91048539, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11621094, + "step": 8529, + "time_per_iteration": 2.5290021896362305 + }, + { + "auxiliary_loss_clip": 0.06427129, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06279376, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5128513452577784, + "flos": 19098845159040.0, + "grad_norm": 1.7753814294124612, + "language_loss": 0.7435441, + "learning_rate": 2.01382577957204e-06, + "loss": 0.82047611, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.10852051, + "step": 8530, + "time_per_iteration": 2.5009660720825195 + }, + { + "auxiliary_loss_clip": 0.06336609, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01263291, + "epoch": 0.5129114685104464, + "flos": 67914553011840.0, + "grad_norm": 0.7560442553547831, + "language_loss": 0.60794806, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.68396354, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.01651001, + "step": 8531, + "time_per_iteration": 3.2641408443450928 + }, + { + "auxiliary_loss_clip": 0.06436025, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.062833, + "balance_loss_mlp": 0.0125722, + "epoch": 0.5129715917631144, + "flos": 20455436361600.0, + "grad_norm": 1.5619116128751078, + "language_loss": 0.76922929, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.84627628, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11444092, + "step": 8532, + "time_per_iteration": 2.54885196685791 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.0627965, + "balance_loss_mlp": 0.0125747, + "epoch": 0.5130317150157824, + "flos": 35124836261760.0, + "grad_norm": 2.143443364581078, + "language_loss": 0.67464834, + "learning_rate": 2.012657420152597e-06, + "loss": 0.75163269, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11706543, + "step": 8533, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.06435291, + "auxiliary_loss_mlp": 0.01270583, + "balance_loss_clip": 0.06282294, + "balance_loss_mlp": 0.01257995, + "epoch": 0.5130918382684503, + "flos": 19797671410560.0, + "grad_norm": 2.0992969405941526, + "language_loss": 0.82022768, + "learning_rate": 2.01226796603315e-06, + "loss": 0.89728636, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12585449, + "step": 8534, + "time_per_iteration": 2.527186632156372 + }, + { + "auxiliary_loss_clip": 0.06432565, + "auxiliary_loss_mlp": 0.01272989, + "balance_loss_clip": 0.06280594, + "balance_loss_mlp": 0.0126077, + "epoch": 0.5131519615211183, + "flos": 26330318478720.0, + "grad_norm": 1.396585887996991, + "language_loss": 0.64072168, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.71777725, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12225342, + "step": 8535, + "time_per_iteration": 2.5608325004577637 + }, + { + "auxiliary_loss_clip": 0.06432404, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06282519, + "balance_loss_mlp": 0.01254036, + "epoch": 0.5132120847737862, + "flos": 19177949013120.0, + "grad_norm": 1.677219086168078, + "language_loss": 0.70047057, + "learning_rate": 2.011489056413418e-06, + "loss": 0.77745175, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11682129, + "step": 8536, + "time_per_iteration": 2.562103509902954 + }, + { + "auxiliary_loss_clip": 0.06443835, + "auxiliary_loss_mlp": 0.01273704, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01260359, + "epoch": 0.5132722080264542, + "flos": 20236698478080.0, + "grad_norm": 2.053357085489985, + "language_loss": 0.71648562, + "learning_rate": 2.011099600942669e-06, + "loss": 0.793661, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13348389, + "step": 8537, + "time_per_iteration": 2.5208451747894287 + }, + { + "auxiliary_loss_clip": 0.06435503, + "auxiliary_loss_mlp": 0.01264426, + "balance_loss_clip": 0.06282058, + "balance_loss_mlp": 0.01252559, + "epoch": 0.5133323312791223, + "flos": 16474619462400.0, + "grad_norm": 2.3096480270315487, + "language_loss": 0.80560482, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.88260412, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.11859131, + "step": 8538, + "time_per_iteration": 2.5136818885803223 + }, + { + "auxiliary_loss_clip": 0.06432489, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5133924545317902, + "flos": 26075340904320.0, + "grad_norm": 1.6767929293826078, + "language_loss": 0.78499532, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.86201918, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1171875, + "step": 8539, + "time_per_iteration": 2.5898549556732178 + }, + { + "auxiliary_loss_clip": 0.06434882, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06283914, + "balance_loss_mlp": 0.01255703, + "epoch": 0.5134525777844582, + "flos": 29138467887360.0, + "grad_norm": 1.6389084641418472, + "language_loss": 0.76422769, + "learning_rate": 2.009931232064105e-06, + "loss": 0.84125227, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8540, + "time_per_iteration": 2.695279359817505 + }, + { + "auxiliary_loss_clip": 0.06437706, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5135127010371261, + "flos": 17460134858880.0, + "grad_norm": 1.735384048528371, + "language_loss": 0.74720204, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.82429993, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.1328125, + "step": 8541, + "time_per_iteration": 2.5028650760650635 + }, + { + "auxiliary_loss_clip": 0.06433722, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01255905, + "epoch": 0.5135728242897941, + "flos": 21951493885440.0, + "grad_norm": 1.7658048645767805, + "language_loss": 0.71345925, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.79048049, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12493896, + "step": 8542, + "time_per_iteration": 2.55663800239563 + }, + { + "auxiliary_loss_clip": 0.06432796, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06282645, + "balance_loss_mlp": 0.01255939, + "epoch": 0.513632947542462, + "flos": 22681528583040.0, + "grad_norm": 1.8429175926110044, + "language_loss": 0.79735661, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.87436259, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11846924, + "step": 8543, + "time_per_iteration": 2.530942440032959 + }, + { + "auxiliary_loss_clip": 0.06431838, + "auxiliary_loss_mlp": 0.01268863, + "balance_loss_clip": 0.06281078, + "balance_loss_mlp": 0.0125693, + "epoch": 0.51369307079513, + "flos": 29464289688960.0, + "grad_norm": 1.9724623685644402, + "language_loss": 0.68434304, + "learning_rate": 2.008373401689299e-06, + "loss": 0.76135004, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11932373, + "step": 8544, + "time_per_iteration": 2.581965684890747 + }, + { + "auxiliary_loss_clip": 0.06435554, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01257314, + "epoch": 0.513753194047798, + "flos": 18995325039360.0, + "grad_norm": 1.9173308249452852, + "language_loss": 0.73101795, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.80806756, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12103271, + "step": 8545, + "time_per_iteration": 3.9112906455993652 + }, + { + "auxiliary_loss_clip": 0.06434133, + "auxiliary_loss_mlp": 0.01273161, + "balance_loss_clip": 0.06280358, + "balance_loss_mlp": 0.01260745, + "epoch": 0.513813317300466, + "flos": 17827646866560.0, + "grad_norm": 2.3149125381427322, + "language_loss": 0.82387555, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.90094852, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12408447, + "step": 8546, + "time_per_iteration": 2.4859204292297363 + }, + { + "auxiliary_loss_clip": 0.06431763, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01255101, + "epoch": 0.5138734405531339, + "flos": 24068070420480.0, + "grad_norm": 1.656069587269211, + "language_loss": 0.73464745, + "learning_rate": 2.007205025522544e-06, + "loss": 0.81163985, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12384033, + "step": 8547, + "time_per_iteration": 2.5682289600372314 + }, + { + "auxiliary_loss_clip": 0.0643255, + "auxiliary_loss_mlp": 0.01266832, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5139335638058019, + "flos": 26103279041280.0, + "grad_norm": 1.7029090715356687, + "language_loss": 0.7379564, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.81495023, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12121582, + "step": 8548, + "time_per_iteration": 2.534795045852661 + }, + { + "auxiliary_loss_clip": 0.06433449, + "auxiliary_loss_mlp": 0.01270968, + "balance_loss_clip": 0.06279913, + "balance_loss_mlp": 0.01259124, + "epoch": 0.5139936870584698, + "flos": 18923181073920.0, + "grad_norm": 1.5199417717256292, + "language_loss": 0.82597619, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.90302038, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11853027, + "step": 8549, + "time_per_iteration": 3.9844579696655273 + }, + { + "auxiliary_loss_clip": 0.06431821, + "auxiliary_loss_mlp": 0.01268578, + "balance_loss_clip": 0.06283253, + "balance_loss_mlp": 0.01256913, + "epoch": 0.5140538103111378, + "flos": 16149594274560.0, + "grad_norm": 1.7893333067818897, + "language_loss": 0.72460294, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.80160695, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11669922, + "step": 8550, + "time_per_iteration": 2.6143221855163574 + }, + { + "auxiliary_loss_clip": 0.06436016, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5141139335638057, + "flos": 22426886424960.0, + "grad_norm": 1.3843612466681816, + "language_loss": 0.7537846, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.83080363, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12536621, + "step": 8551, + "time_per_iteration": 2.563551664352417 + }, + { + "auxiliary_loss_clip": 0.06427439, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.01255233, + "epoch": 0.5141740568164738, + "flos": 27097054064640.0, + "grad_norm": 1.547590229430392, + "language_loss": 0.69192576, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.76886189, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10949707, + "step": 8552, + "time_per_iteration": 2.598309278488159 + }, + { + "auxiliary_loss_clip": 0.06434312, + "auxiliary_loss_mlp": 0.01271227, + "balance_loss_clip": 0.06280888, + "balance_loss_mlp": 0.01258972, + "epoch": 0.5142341800691418, + "flos": 24980267894400.0, + "grad_norm": 1.7162445999633908, + "language_loss": 0.75295067, + "learning_rate": 2.004868266210965e-06, + "loss": 0.830006, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12255859, + "step": 8553, + "time_per_iteration": 2.56817364692688 + }, + { + "auxiliary_loss_clip": 0.06427588, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.06277347, + "balance_loss_mlp": 0.01253642, + "epoch": 0.5142943033218097, + "flos": 20710833206400.0, + "grad_norm": 1.5512777085285745, + "language_loss": 0.68091589, + "learning_rate": 2.004478805593435e-06, + "loss": 0.75785089, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.1227417, + "step": 8554, + "time_per_iteration": 4.041098117828369 + }, + { + "auxiliary_loss_clip": 0.06434806, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.0627867, + "balance_loss_mlp": 0.0125514, + "epoch": 0.5143544265744777, + "flos": 22931391058560.0, + "grad_norm": 1.9544744043919176, + "language_loss": 0.73420155, + "learning_rate": 2.004089344806068e-06, + "loss": 0.81124151, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.14050293, + "step": 8555, + "time_per_iteration": 2.560406446456909 + }, + { + "auxiliary_loss_clip": 0.0643023, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06277946, + "balance_loss_mlp": 0.0125305, + "epoch": 0.5144145498271456, + "flos": 15926328270720.0, + "grad_norm": 3.1721710851325478, + "language_loss": 0.74827576, + "learning_rate": 2.003699883863633e-06, + "loss": 0.82522213, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11346436, + "step": 8556, + "time_per_iteration": 2.510631561279297 + }, + { + "auxiliary_loss_clip": 0.06426013, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01255374, + "epoch": 0.5144746730798136, + "flos": 19687107548160.0, + "grad_norm": 1.7802365486116365, + "language_loss": 0.86600292, + "learning_rate": 2.003310422780898e-06, + "loss": 0.9429279, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 8557, + "time_per_iteration": 2.4897682666778564 + }, + { + "auxiliary_loss_clip": 0.06427194, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06280152, + "balance_loss_mlp": 0.0125372, + "epoch": 0.5145347963324816, + "flos": 23921476502400.0, + "grad_norm": 1.7088292247190593, + "language_loss": 0.89943027, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.97635341, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11407471, + "step": 8558, + "time_per_iteration": 2.552520513534546 + }, + { + "auxiliary_loss_clip": 0.06426296, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06281744, + "balance_loss_mlp": 0.01259337, + "epoch": 0.5145949195851496, + "flos": 18265919247360.0, + "grad_norm": 1.814909546317071, + "language_loss": 0.65665084, + "learning_rate": 2.002531500253602e-06, + "loss": 0.73361778, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1105957, + "step": 8559, + "time_per_iteration": 2.5509958267211914 + }, + { + "auxiliary_loss_clip": 0.06428455, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.0628074, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5146550428378175, + "flos": 26220593157120.0, + "grad_norm": 1.5790337478872891, + "language_loss": 0.63388872, + "learning_rate": 2.002142038838577e-06, + "loss": 0.71083951, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11358643, + "step": 8560, + "time_per_iteration": 2.5824177265167236 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279366, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5147151660904855, + "flos": 22680731969280.0, + "grad_norm": 1.6548160663474087, + "language_loss": 0.70604181, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.78299701, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11975098, + "step": 8561, + "time_per_iteration": 4.051865816116333 + }, + { + "auxiliary_loss_clip": 0.06432293, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01254937, + "epoch": 0.5147752893431534, + "flos": 24979261645440.0, + "grad_norm": 1.5164557892601689, + "language_loss": 0.67091215, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.7478981, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.1137085, + "step": 8562, + "time_per_iteration": 2.587117910385132 + }, + { + "auxiliary_loss_clip": 0.06434688, + "auxiliary_loss_mlp": 0.0126818, + "balance_loss_clip": 0.06283362, + "balance_loss_mlp": 0.01256945, + "epoch": 0.5148354125958214, + "flos": 22750821509760.0, + "grad_norm": 1.6017474228640745, + "language_loss": 0.77982432, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.85685301, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11248779, + "step": 8563, + "time_per_iteration": 2.5995922088623047 + }, + { + "auxiliary_loss_clip": 0.06441233, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06284129, + "balance_loss_mlp": 0.01257235, + "epoch": 0.5148955358484893, + "flos": 23074253470080.0, + "grad_norm": 2.0871441030394426, + "language_loss": 0.83276081, + "learning_rate": 2.0005841925139e-06, + "loss": 0.90987211, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12658691, + "step": 8564, + "time_per_iteration": 2.5510189533233643 + }, + { + "auxiliary_loss_clip": 0.06436282, + "auxiliary_loss_mlp": 0.01266369, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01253918, + "epoch": 0.5149556591011574, + "flos": 20346465726720.0, + "grad_norm": 3.2981963875061915, + "language_loss": 0.73735076, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.81437725, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12451172, + "step": 8565, + "time_per_iteration": 2.565485715866089 + }, + { + "auxiliary_loss_clip": 0.06439919, + "auxiliary_loss_mlp": 0.01271905, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01259478, + "epoch": 0.5150157823538254, + "flos": 22644869621760.0, + "grad_norm": 2.0080537974138424, + "language_loss": 0.6841439, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.76126206, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12438965, + "step": 8566, + "time_per_iteration": 2.540060520172119 + }, + { + "auxiliary_loss_clip": 0.06439756, + "auxiliary_loss_mlp": 0.01270124, + "balance_loss_clip": 0.06282447, + "balance_loss_mlp": 0.0125828, + "epoch": 0.5150759056064933, + "flos": 26074795852800.0, + "grad_norm": 1.7193676063763261, + "language_loss": 0.78763425, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.86473316, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11834717, + "step": 8567, + "time_per_iteration": 2.610316276550293 + }, + { + "auxiliary_loss_clip": 0.06433998, + "auxiliary_loss_mlp": 0.0126364, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5151360288591613, + "flos": 25958865329280.0, + "grad_norm": 1.8031823951648205, + "language_loss": 0.79058564, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.86756206, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12091064, + "step": 8568, + "time_per_iteration": 2.5746078491210938 + }, + { + "auxiliary_loss_clip": 0.06425972, + "auxiliary_loss_mlp": 0.01263804, + "balance_loss_clip": 0.06277977, + "balance_loss_mlp": 0.01252705, + "epoch": 0.5151961521118292, + "flos": 18511840581120.0, + "grad_norm": 2.107330893228774, + "language_loss": 0.90881652, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.98571432, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11096191, + "step": 8569, + "time_per_iteration": 2.5259969234466553 + }, + { + "auxiliary_loss_clip": 0.06436515, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06282495, + "balance_loss_mlp": 0.01261225, + "epoch": 0.5152562753644973, + "flos": 22239734330880.0, + "grad_norm": 1.7160477900396784, + "language_loss": 0.77020866, + "learning_rate": 1.998247422657674e-06, + "loss": 0.84730774, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12188721, + "step": 8570, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.06435493, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01256817, + "epoch": 0.5153163986171652, + "flos": 38445833784960.0, + "grad_norm": 1.5069722692963965, + "language_loss": 0.73508942, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.81214285, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1305542, + "step": 8571, + "time_per_iteration": 2.6566643714904785 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01250073, + "epoch": 0.5153765218698332, + "flos": 66404533783680.0, + "grad_norm": 0.7650204220049751, + "language_loss": 0.52955389, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.60542989, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.64990234, + "router_z_loss_mlp": 0.01826477, + "step": 8572, + "time_per_iteration": 3.231537103652954 + }, + { + "auxiliary_loss_clip": 0.06429811, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06284182, + "balance_loss_mlp": 0.01257622, + "epoch": 0.5154366451225011, + "flos": 24031537240320.0, + "grad_norm": 1.6307698114257092, + "language_loss": 0.76929724, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.84628952, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11791992, + "step": 8573, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.06429262, + "auxiliary_loss_mlp": 0.01267008, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01255099, + "epoch": 0.5154967683751691, + "flos": 23474189808000.0, + "grad_norm": 2.3679054324331967, + "language_loss": 0.77109015, + "learning_rate": 1.996689577219102e-06, + "loss": 0.84805286, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11914062, + "step": 8574, + "time_per_iteration": 2.53300404548645 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01263951, + "balance_loss_clip": 0.06281316, + "balance_loss_mlp": 0.01252691, + "epoch": 0.515556891627837, + "flos": 23812463940480.0, + "grad_norm": 1.7644957150045186, + "language_loss": 0.85785985, + "learning_rate": 1.996300116136367e-06, + "loss": 0.93478966, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11248779, + "step": 8575, + "time_per_iteration": 2.577409029006958 + }, + { + "auxiliary_loss_clip": 0.06435408, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06283233, + "balance_loss_mlp": 0.01253859, + "epoch": 0.515617014880505, + "flos": 19834665788160.0, + "grad_norm": 1.5082721708333224, + "language_loss": 0.76947051, + "learning_rate": 1.995910655193932e-06, + "loss": 0.84648347, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1204834, + "step": 8576, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.06444222, + "auxiliary_loss_mlp": 0.01270832, + "balance_loss_clip": 0.06283684, + "balance_loss_mlp": 0.01258083, + "epoch": 0.515677138133173, + "flos": 14251042863360.0, + "grad_norm": 2.2995750246066406, + "language_loss": 0.75517124, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.83232176, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12762451, + "step": 8577, + "time_per_iteration": 2.518495559692383 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01257037, + "epoch": 0.515737261385841, + "flos": 28296653443200.0, + "grad_norm": 4.0524023742876345, + "language_loss": 0.81602645, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.89309716, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13049316, + "step": 8578, + "time_per_iteration": 2.5854508876800537 + }, + { + "auxiliary_loss_clip": 0.06431551, + "auxiliary_loss_mlp": 0.01266524, + "balance_loss_clip": 0.06281303, + "balance_loss_mlp": 0.01254746, + "epoch": 0.515797384638509, + "flos": 27899400435840.0, + "grad_norm": 1.724028071509101, + "language_loss": 0.7613306, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.83831137, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11785889, + "step": 8579, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.06434369, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01253053, + "epoch": 0.5158575078911769, + "flos": 23046860384640.0, + "grad_norm": 1.6181814769530192, + "language_loss": 0.79290402, + "learning_rate": 1.994352813122559e-06, + "loss": 0.86990273, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12457275, + "step": 8580, + "time_per_iteration": 2.5879290103912354 + }, + { + "auxiliary_loss_clip": 0.0643789, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06283616, + "balance_loss_mlp": 0.01254763, + "epoch": 0.5159176311438449, + "flos": 12646350120960.0, + "grad_norm": 1.9944005001089613, + "language_loss": 0.73488963, + "learning_rate": 1.99396335310315e-06, + "loss": 0.81195444, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1383667, + "step": 8581, + "time_per_iteration": 2.500063180923462 + }, + { + "auxiliary_loss_clip": 0.06434488, + "auxiliary_loss_mlp": 0.01266672, + "balance_loss_clip": 0.06284754, + "balance_loss_mlp": 0.01254781, + "epoch": 0.5159777543965128, + "flos": 15563302456320.0, + "grad_norm": 1.882801773214852, + "language_loss": 0.74207276, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.81908435, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11901855, + "step": 8582, + "time_per_iteration": 2.518564462661743 + }, + { + "auxiliary_loss_clip": 0.06429887, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06280613, + "balance_loss_mlp": 0.01254648, + "epoch": 0.5160378776491809, + "flos": 23228352328320.0, + "grad_norm": 1.8807127189493567, + "language_loss": 0.66238904, + "learning_rate": 1.99318443376583e-06, + "loss": 0.73934591, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11157227, + "step": 8583, + "time_per_iteration": 2.542539119720459 + }, + { + "auxiliary_loss_clip": 0.06437095, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01257404, + "epoch": 0.5160980009018488, + "flos": 21951074615040.0, + "grad_norm": 1.3417837681818925, + "language_loss": 0.760252, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.83732229, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12524414, + "step": 8584, + "time_per_iteration": 2.587082624435425 + }, + { + "auxiliary_loss_clip": 0.06437847, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01253579, + "epoch": 0.5161581241545168, + "flos": 22790708853120.0, + "grad_norm": 1.8159571462416286, + "language_loss": 0.78972226, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.866768, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13153076, + "step": 8585, + "time_per_iteration": 3.918409824371338 + }, + { + "auxiliary_loss_clip": 0.06432407, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06287332, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5162182474071847, + "flos": 19680273440640.0, + "grad_norm": 1.974004410778628, + "language_loss": 0.81013006, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.88714075, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11578369, + "step": 8586, + "time_per_iteration": 2.4944536685943604 + }, + { + "auxiliary_loss_clip": 0.06434685, + "auxiliary_loss_mlp": 0.01270978, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01258473, + "epoch": 0.5162783706598527, + "flos": 20052145860480.0, + "grad_norm": 2.892216813448522, + "language_loss": 0.71914274, + "learning_rate": 1.991626598310701e-06, + "loss": 0.79619938, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.125, + "step": 8587, + "time_per_iteration": 2.500964403152466 + }, + { + "auxiliary_loss_clip": 0.06328937, + "auxiliary_loss_mlp": 0.01260473, + "balance_loss_clip": 0.06264381, + "balance_loss_mlp": 0.01258639, + "epoch": 0.5163384939125206, + "flos": 69980089610880.0, + "grad_norm": 0.7154986672608752, + "language_loss": 0.57844335, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.65433741, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01829529, + "step": 8588, + "time_per_iteration": 4.569206476211548 + }, + { + "auxiliary_loss_clip": 0.06434999, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06281946, + "balance_loss_mlp": 0.01254618, + "epoch": 0.5163986171651886, + "flos": 17422176159360.0, + "grad_norm": 8.344302755834537, + "language_loss": 0.75224382, + "learning_rate": 1.990847682429185e-06, + "loss": 0.82927144, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13134766, + "step": 8589, + "time_per_iteration": 2.551936388015747 + }, + { + "auxiliary_loss_clip": 0.06436837, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0628375, + "balance_loss_mlp": 0.01254607, + "epoch": 0.5164587404178566, + "flos": 21328752741120.0, + "grad_norm": 1.4649655682055334, + "language_loss": 0.67921245, + "learning_rate": 1.990458225001627e-06, + "loss": 0.75623721, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11035156, + "step": 8590, + "time_per_iteration": 2.5104808807373047 + }, + { + "auxiliary_loss_clip": 0.06330067, + "auxiliary_loss_mlp": 0.01255277, + "balance_loss_clip": 0.06265621, + "balance_loss_mlp": 0.01253319, + "epoch": 0.5165188636705246, + "flos": 68076506954880.0, + "grad_norm": 0.7672531816981234, + "language_loss": 0.55843657, + "learning_rate": 1.990068767935895e-06, + "loss": 0.63428998, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.64550781, + "router_z_loss_mlp": 0.01956177, + "step": 8591, + "time_per_iteration": 3.0606987476348877 + }, + { + "auxiliary_loss_clip": 0.06426874, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01253261, + "epoch": 0.5165789869231926, + "flos": 19390859038080.0, + "grad_norm": 1.5432128891960295, + "language_loss": 0.81508362, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.89199233, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10736084, + "step": 8592, + "time_per_iteration": 2.5063397884368896 + }, + { + "auxiliary_loss_clip": 0.0642782, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06281757, + "balance_loss_mlp": 0.01251626, + "epoch": 0.5166391101758605, + "flos": 20966607394560.0, + "grad_norm": 1.7131386706837877, + "language_loss": 0.83462119, + "learning_rate": 1.989289854948979e-06, + "loss": 0.91152561, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11010742, + "step": 8593, + "time_per_iteration": 3.951284170150757 + }, + { + "auxiliary_loss_clip": 0.06431139, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06281991, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5166992334285285, + "flos": 29470411036800.0, + "grad_norm": 1.8647556534792968, + "language_loss": 0.69381714, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.77078462, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11761475, + "step": 8594, + "time_per_iteration": 2.600724220275879 + }, + { + "auxiliary_loss_clip": 0.06431773, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5167593566811964, + "flos": 20310813014400.0, + "grad_norm": 1.4700297891307748, + "language_loss": 0.77611995, + "learning_rate": 1.988510943586582e-06, + "loss": 0.85309899, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1171875, + "step": 8595, + "time_per_iteration": 2.5478954315185547 + }, + { + "auxiliary_loss_clip": 0.06431342, + "auxiliary_loss_mlp": 0.01266673, + "balance_loss_clip": 0.06281155, + "balance_loss_mlp": 0.01255563, + "epoch": 0.5168194799338645, + "flos": 14616668154240.0, + "grad_norm": 1.457832438333805, + "language_loss": 0.65828246, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.73526263, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11114502, + "step": 8596, + "time_per_iteration": 2.5720162391662598 + }, + { + "auxiliary_loss_clip": 0.06432624, + "auxiliary_loss_mlp": 0.01271477, + "balance_loss_clip": 0.06281975, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5168796031865324, + "flos": 25013866181760.0, + "grad_norm": 1.4915456509806782, + "language_loss": 0.75734007, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.8343811, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12988281, + "step": 8597, + "time_per_iteration": 2.5495989322662354 + }, + { + "auxiliary_loss_clip": 0.06427812, + "auxiliary_loss_mlp": 0.01266343, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5169397264392004, + "flos": 26946728640000.0, + "grad_norm": 1.7231987845025152, + "language_loss": 0.8152492, + "learning_rate": 1.987342579847403e-06, + "loss": 0.89219069, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11773682, + "step": 8598, + "time_per_iteration": 2.6746177673339844 + }, + { + "auxiliary_loss_clip": 0.06427282, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06279184, + "balance_loss_mlp": 0.0125523, + "epoch": 0.5169998496918683, + "flos": 25414347571200.0, + "grad_norm": 1.537627068096994, + "language_loss": 0.7597698, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.83671683, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12194824, + "step": 8599, + "time_per_iteration": 2.548478841781616 + }, + { + "auxiliary_loss_clip": 0.06428513, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01253291, + "epoch": 0.5170599729445363, + "flos": 24687667036800.0, + "grad_norm": 4.521028695007152, + "language_loss": 0.72775459, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.80468118, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10852051, + "step": 8600, + "time_per_iteration": 3.977342367172241 + }, + { + "auxiliary_loss_clip": 0.06427286, + "auxiliary_loss_mlp": 0.01268182, + "balance_loss_clip": 0.06278619, + "balance_loss_mlp": 0.01256732, + "epoch": 0.5171200961972042, + "flos": 21000499171200.0, + "grad_norm": 1.369345328324843, + "language_loss": 0.74472946, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.82168412, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11444092, + "step": 8601, + "time_per_iteration": 2.5409762859344482 + }, + { + "auxiliary_loss_clip": 0.06429532, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01258467, + "epoch": 0.5171802194498722, + "flos": 22751953539840.0, + "grad_norm": 1.8713669852223682, + "language_loss": 0.83940291, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.9164089, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12597656, + "step": 8602, + "time_per_iteration": 2.5086002349853516 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01254026, + "epoch": 0.5172403427025402, + "flos": 28183070833920.0, + "grad_norm": 1.835239532551919, + "language_loss": 0.74816436, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.82513469, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1192627, + "step": 8603, + "time_per_iteration": 2.628830909729004 + }, + { + "auxiliary_loss_clip": 0.06434101, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06281082, + "balance_loss_mlp": 0.01255566, + "epoch": 0.5173004659552082, + "flos": 20343782396160.0, + "grad_norm": 2.436721116583926, + "language_loss": 0.73165393, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.80867082, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12017822, + "step": 8604, + "time_per_iteration": 2.521681785583496 + }, + { + "auxiliary_loss_clip": 0.06440152, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06279815, + "balance_loss_mlp": 0.01254469, + "epoch": 0.5173605892078762, + "flos": 19069481502720.0, + "grad_norm": 1.6971244246662016, + "language_loss": 0.85418487, + "learning_rate": 1.984616415277469e-06, + "loss": 0.93127012, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13922119, + "step": 8605, + "time_per_iteration": 2.5182762145996094 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258893, + "epoch": 0.5174207124605441, + "flos": 28001620817280.0, + "grad_norm": 1.308601391892793, + "language_loss": 0.64964187, + "learning_rate": 1.984226965411294e-06, + "loss": 0.72665358, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1138916, + "step": 8606, + "time_per_iteration": 2.5762083530426025 + }, + { + "auxiliary_loss_clip": 0.06431288, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.0125362, + "epoch": 0.5174808357132121, + "flos": 19502135660160.0, + "grad_norm": 1.5729301555613031, + "language_loss": 0.78141046, + "learning_rate": 1.983837516143234e-06, + "loss": 0.85837877, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11914062, + "step": 8607, + "time_per_iteration": 2.5321435928344727 + }, + { + "auxiliary_loss_clip": 0.06431965, + "auxiliary_loss_mlp": 0.01271738, + "balance_loss_clip": 0.06280412, + "balance_loss_mlp": 0.01259049, + "epoch": 0.51754095896588, + "flos": 22790834634240.0, + "grad_norm": 1.7409540075434562, + "language_loss": 0.72313815, + "learning_rate": 1.983448067488057e-06, + "loss": 0.80017519, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 8608, + "time_per_iteration": 2.52758526802063 + }, + { + "auxiliary_loss_clip": 0.06435958, + "auxiliary_loss_mlp": 0.01273384, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01261046, + "epoch": 0.5176010822185481, + "flos": 22674987964800.0, + "grad_norm": 1.7194792439439102, + "language_loss": 0.86816031, + "learning_rate": 1.983058619460531e-06, + "loss": 0.94525373, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12341309, + "step": 8609, + "time_per_iteration": 2.538146495819092 + }, + { + "auxiliary_loss_clip": 0.06431948, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06280786, + "balance_loss_mlp": 0.01252201, + "epoch": 0.517661205471216, + "flos": 23957967755520.0, + "grad_norm": 2.0604849644666943, + "language_loss": 0.73853832, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.81549335, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11352539, + "step": 8610, + "time_per_iteration": 2.5313732624053955 + }, + { + "auxiliary_loss_clip": 0.064363, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06279181, + "balance_loss_mlp": 0.01258051, + "epoch": 0.517721328723884, + "flos": 15601470791040.0, + "grad_norm": 2.184245135297296, + "language_loss": 0.67738098, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.75445139, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12689209, + "step": 8611, + "time_per_iteration": 2.510500431060791 + }, + { + "auxiliary_loss_clip": 0.06427399, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01253153, + "epoch": 0.5177814519765519, + "flos": 20966607394560.0, + "grad_norm": 1.678614110348905, + "language_loss": 0.77387339, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.85080469, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12573242, + "step": 8612, + "time_per_iteration": 2.5206472873687744 + }, + { + "auxiliary_loss_clip": 0.064338, + "auxiliary_loss_mlp": 0.01269204, + "balance_loss_clip": 0.0628019, + "balance_loss_mlp": 0.01257641, + "epoch": 0.5178415752292199, + "flos": 17973653806080.0, + "grad_norm": 1.9437798274552756, + "language_loss": 0.82318223, + "learning_rate": 1.981500833922294e-06, + "loss": 0.90021223, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11560059, + "step": 8613, + "time_per_iteration": 2.4999184608459473 + }, + { + "auxiliary_loss_clip": 0.06431679, + "auxiliary_loss_mlp": 0.01268922, + "balance_loss_clip": 0.062784, + "balance_loss_mlp": 0.01255511, + "epoch": 0.5179016984818878, + "flos": 17827227596160.0, + "grad_norm": 2.2958122780571473, + "language_loss": 0.66944718, + "learning_rate": 1.981111389254541e-06, + "loss": 0.74645323, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.1340332, + "step": 8614, + "time_per_iteration": 2.480762004852295 + }, + { + "auxiliary_loss_clip": 0.06432712, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06278278, + "balance_loss_mlp": 0.01257465, + "epoch": 0.5179618217345558, + "flos": 17826011712000.0, + "grad_norm": 1.8941766649542733, + "language_loss": 0.87114352, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.94817036, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12493896, + "step": 8615, + "time_per_iteration": 2.500279188156128 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01270372, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01258731, + "epoch": 0.5180219449872238, + "flos": 22527639360000.0, + "grad_norm": 1.466896191984659, + "language_loss": 0.80947113, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8864857, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11639404, + "step": 8616, + "time_per_iteration": 2.523977279663086 + }, + { + "auxiliary_loss_clip": 0.06436383, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.0627937, + "balance_loss_mlp": 0.01257554, + "epoch": 0.5180820682398918, + "flos": 23922356970240.0, + "grad_norm": 2.681335053285678, + "language_loss": 0.75563776, + "learning_rate": 1.9799430596079e-06, + "loss": 0.83270454, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12744141, + "step": 8617, + "time_per_iteration": 2.5584635734558105 + }, + { + "auxiliary_loss_clip": 0.0643236, + "auxiliary_loss_mlp": 0.01270738, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01258215, + "epoch": 0.5181421914925598, + "flos": 16985119662720.0, + "grad_norm": 2.384459515549961, + "language_loss": 0.70321333, + "learning_rate": 1.979553617893785e-06, + "loss": 0.78024429, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12518311, + "step": 8618, + "time_per_iteration": 2.4864299297332764 + }, + { + "auxiliary_loss_clip": 0.06326556, + "auxiliary_loss_mlp": 0.01258187, + "balance_loss_clip": 0.0626248, + "balance_loss_mlp": 0.01256348, + "epoch": 0.5182023147452277, + "flos": 66080472917760.0, + "grad_norm": 0.9021946533901657, + "language_loss": 0.6731512, + "learning_rate": 1.979164176954999e-06, + "loss": 0.74899864, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01834106, + "step": 8619, + "time_per_iteration": 3.1113593578338623 + }, + { + "auxiliary_loss_clip": 0.06429242, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06279487, + "balance_loss_mlp": 0.01256235, + "epoch": 0.5182624379978957, + "flos": 18193775281920.0, + "grad_norm": 1.7875432352275369, + "language_loss": 0.79252517, + "learning_rate": 1.97877473680631e-06, + "loss": 0.86949891, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11883545, + "step": 8620, + "time_per_iteration": 2.490337371826172 + }, + { + "auxiliary_loss_clip": 0.06426805, + "auxiliary_loss_mlp": 0.01265045, + "balance_loss_clip": 0.06278054, + "balance_loss_mlp": 0.01253815, + "epoch": 0.5183225612505636, + "flos": 14031759928320.0, + "grad_norm": 2.0424555394318347, + "language_loss": 0.82670712, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.90362567, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11236572, + "step": 8621, + "time_per_iteration": 2.5358636379241943 + }, + { + "auxiliary_loss_clip": 0.06430708, + "auxiliary_loss_mlp": 0.01270453, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01257787, + "epoch": 0.5183826845032317, + "flos": 23666582782080.0, + "grad_norm": 3.572556492630201, + "language_loss": 0.65903664, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.73604816, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12664795, + "step": 8622, + "time_per_iteration": 2.5054616928100586 + }, + { + "auxiliary_loss_clip": 0.06440182, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06282417, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5184428077558996, + "flos": 15894155502720.0, + "grad_norm": 2.003886693767472, + "language_loss": 0.60810971, + "learning_rate": 1.977606421248497e-06, + "loss": 0.68520582, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12011719, + "step": 8623, + "time_per_iteration": 2.517026662826538 + }, + { + "auxiliary_loss_clip": 0.06431899, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278786, + "balance_loss_mlp": 0.01256766, + "epoch": 0.5185029310085676, + "flos": 21036864643200.0, + "grad_norm": 1.709310334319468, + "language_loss": 0.76342779, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.84043157, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11712646, + "step": 8624, + "time_per_iteration": 2.5128896236419678 + }, + { + "auxiliary_loss_clip": 0.0643063, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06277324, + "balance_loss_mlp": 0.01251684, + "epoch": 0.5185630542612355, + "flos": 26550062611200.0, + "grad_norm": 2.453361725716909, + "language_loss": 0.71663254, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.79358423, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12854004, + "step": 8625, + "time_per_iteration": 3.9488492012023926 + }, + { + "auxiliary_loss_clip": 0.06427859, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276631, + "balance_loss_mlp": 0.01255378, + "epoch": 0.5186231775139035, + "flos": 20674803150720.0, + "grad_norm": 1.8867804759418334, + "language_loss": 0.68206352, + "learning_rate": 1.976438113333184e-06, + "loss": 0.75901365, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11785889, + "step": 8626, + "time_per_iteration": 2.5555548667907715 + }, + { + "auxiliary_loss_clip": 0.06429964, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01257459, + "epoch": 0.5186833007665714, + "flos": 20891612390400.0, + "grad_norm": 1.918580922134282, + "language_loss": 0.70565557, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.78265989, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.13006592, + "step": 8627, + "time_per_iteration": 2.481426954269409 + }, + { + "auxiliary_loss_clip": 0.0643362, + "auxiliary_loss_mlp": 0.01266564, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01254399, + "epoch": 0.5187434240192395, + "flos": 20893247544960.0, + "grad_norm": 1.7293286755655957, + "language_loss": 0.73529112, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12164307, + "step": 8628, + "time_per_iteration": 3.9418892860412598 + }, + { + "auxiliary_loss_clip": 0.0642761, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.06276411, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5188035472719074, + "flos": 19865203401600.0, + "grad_norm": 1.86469754984735, + "language_loss": 0.77606678, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.85302424, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.1229248, + "step": 8629, + "time_per_iteration": 2.536813974380493 + }, + { + "auxiliary_loss_clip": 0.06431592, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.01255923, + "epoch": 0.5188636705245754, + "flos": 21144032415360.0, + "grad_norm": 2.295438438275443, + "language_loss": 0.74746907, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.82446957, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12536621, + "step": 8630, + "time_per_iteration": 2.5338122844696045 + }, + { + "auxiliary_loss_clip": 0.06432383, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06276915, + "balance_loss_mlp": 0.01253636, + "epoch": 0.5189237937772434, + "flos": 22426467154560.0, + "grad_norm": 1.6718033524216807, + "language_loss": 0.80433989, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.88134158, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.14141846, + "step": 8631, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.06431842, + "auxiliary_loss_mlp": 0.01268253, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01255187, + "epoch": 0.5189839170299113, + "flos": 25453647936000.0, + "grad_norm": 1.4304618482279687, + "language_loss": 0.74388516, + "learning_rate": 1.974101522024942e-06, + "loss": 0.82088614, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1305542, + "step": 8632, + "time_per_iteration": 2.5850229263305664 + }, + { + "auxiliary_loss_clip": 0.06424779, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06277869, + "balance_loss_mlp": 0.01255865, + "epoch": 0.5190440402825793, + "flos": 18593585838720.0, + "grad_norm": 1.7732237266140687, + "language_loss": 0.79105878, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.86799526, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.13018799, + "step": 8633, + "time_per_iteration": 3.944106340408325 + }, + { + "auxiliary_loss_clip": 0.06433854, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06279819, + "balance_loss_mlp": 0.01253492, + "epoch": 0.5191041635352472, + "flos": 21915170340480.0, + "grad_norm": 1.7747709828095277, + "language_loss": 0.80929339, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.88628888, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12200928, + "step": 8634, + "time_per_iteration": 2.4922289848327637 + }, + { + "auxiliary_loss_clip": 0.0643179, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06280308, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5191642867879153, + "flos": 27535536080640.0, + "grad_norm": 1.4623629686344204, + "language_loss": 0.69064617, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.76765239, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11846924, + "step": 8635, + "time_per_iteration": 2.5806636810302734 + }, + { + "auxiliary_loss_clip": 0.06433641, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06278556, + "balance_loss_mlp": 0.01257356, + "epoch": 0.5192244100405832, + "flos": 15711489601920.0, + "grad_norm": 1.5680222184402974, + "language_loss": 0.77829492, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.85532898, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.12414551, + "step": 8636, + "time_per_iteration": 2.5346691608428955 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01256122, + "epoch": 0.5192845332932512, + "flos": 12061903092480.0, + "grad_norm": 2.0443106284945016, + "language_loss": 0.72005326, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7971167, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.12768555, + "step": 8637, + "time_per_iteration": 2.5669779777526855 + }, + { + "auxiliary_loss_clip": 0.06428012, + "auxiliary_loss_mlp": 0.01270032, + "balance_loss_clip": 0.06279644, + "balance_loss_mlp": 0.01257724, + "epoch": 0.5193446565459191, + "flos": 18959211129600.0, + "grad_norm": 2.0277263511036625, + "language_loss": 0.76600313, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.8429836, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12298584, + "step": 8638, + "time_per_iteration": 2.4836151599884033 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.012673, + "balance_loss_clip": 0.06276545, + "balance_loss_mlp": 0.0125548, + "epoch": 0.5194047797985871, + "flos": 20381028336000.0, + "grad_norm": 1.8081920937255338, + "language_loss": 0.74863744, + "learning_rate": 1.971375543740272e-06, + "loss": 0.82558322, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11828613, + "step": 8639, + "time_per_iteration": 2.508589029312134 + }, + { + "auxiliary_loss_clip": 0.06432048, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06280512, + "balance_loss_mlp": 0.01258045, + "epoch": 0.519464903051255, + "flos": 24359916591360.0, + "grad_norm": 1.679129082437046, + "language_loss": 0.77792585, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.85495287, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12628174, + "step": 8640, + "time_per_iteration": 4.030183553695679 + }, + { + "auxiliary_loss_clip": 0.06430673, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01256482, + "epoch": 0.519525026303923, + "flos": 14066657953920.0, + "grad_norm": 1.8086687453592558, + "language_loss": 0.66518152, + "learning_rate": 1.97059670234927e-06, + "loss": 0.74217027, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11700439, + "step": 8641, + "time_per_iteration": 2.471047878265381 + }, + { + "auxiliary_loss_clip": 0.06427969, + "auxiliary_loss_mlp": 0.01270672, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01259228, + "epoch": 0.519585149556591, + "flos": 28842722501760.0, + "grad_norm": 1.7536948571823123, + "language_loss": 0.76330602, + "learning_rate": 1.97020728331885e-06, + "loss": 0.84029233, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11456299, + "step": 8642, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.06428998, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254374, + "epoch": 0.519645272809259, + "flos": 25379826888960.0, + "grad_norm": 21.827473826572724, + "language_loss": 0.83256245, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.90951395, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11767578, + "step": 8643, + "time_per_iteration": 2.547438621520996 + }, + { + "auxiliary_loss_clip": 0.06436369, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06280598, + "balance_loss_mlp": 0.01255508, + "epoch": 0.519705396061927, + "flos": 25379659180800.0, + "grad_norm": 1.5731350893002956, + "language_loss": 0.70531744, + "learning_rate": 1.969428448662004e-06, + "loss": 0.78236687, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13067627, + "step": 8644, + "time_per_iteration": 2.5876879692077637 + }, + { + "auxiliary_loss_clip": 0.06430183, + "auxiliary_loss_mlp": 0.01266621, + "balance_loss_clip": 0.0627798, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5197655193145949, + "flos": 28483889391360.0, + "grad_norm": 1.5934186274855324, + "language_loss": 0.80385697, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.88082504, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11804199, + "step": 8645, + "time_per_iteration": 2.574620246887207 + }, + { + "auxiliary_loss_clip": 0.06430401, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01258898, + "epoch": 0.5198256425672629, + "flos": 20014983774720.0, + "grad_norm": 1.690489867798711, + "language_loss": 0.78455305, + "learning_rate": 1.968649618642264e-06, + "loss": 0.86156821, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12207031, + "step": 8646, + "time_per_iteration": 2.6401519775390625 + }, + { + "auxiliary_loss_clip": 0.06429573, + "auxiliary_loss_mlp": 0.01268342, + "balance_loss_clip": 0.06279829, + "balance_loss_mlp": 0.01256243, + "epoch": 0.5198857658199308, + "flos": 19835043131520.0, + "grad_norm": 2.3656488760516132, + "language_loss": 0.66367847, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.74065757, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12091064, + "step": 8647, + "time_per_iteration": 2.599353551864624 + }, + { + "auxiliary_loss_clip": 0.06438218, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06282619, + "balance_loss_mlp": 0.0125462, + "epoch": 0.5199458890725989, + "flos": 24468761445120.0, + "grad_norm": 1.778197055342432, + "language_loss": 0.71491444, + "learning_rate": 1.967870793377763e-06, + "loss": 0.79197794, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13513184, + "step": 8648, + "time_per_iteration": 2.572368860244751 + }, + { + "auxiliary_loss_clip": 0.06438164, + "auxiliary_loss_mlp": 0.01268937, + "balance_loss_clip": 0.06285776, + "balance_loss_mlp": 0.01255884, + "epoch": 0.5200060123252668, + "flos": 23411605207680.0, + "grad_norm": 2.1583755088943875, + "language_loss": 0.64699459, + "learning_rate": 1.967481382565642e-06, + "loss": 0.72406554, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13031006, + "step": 8649, + "time_per_iteration": 2.5117433071136475 + }, + { + "auxiliary_loss_clip": 0.06439677, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281672, + "balance_loss_mlp": 0.01260778, + "epoch": 0.5200661355779348, + "flos": 17207002074240.0, + "grad_norm": 5.161359302041442, + "language_loss": 0.70409989, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.78123897, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.13446045, + "step": 8650, + "time_per_iteration": 2.5144400596618652 + }, + { + "auxiliary_loss_clip": 0.06431218, + "auxiliary_loss_mlp": 0.01268732, + "balance_loss_clip": 0.06279574, + "balance_loss_mlp": 0.01256936, + "epoch": 0.5201262588306027, + "flos": 18520980675840.0, + "grad_norm": 1.6145243882323275, + "language_loss": 0.78030795, + "learning_rate": 1.966702564655496e-06, + "loss": 0.85730743, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11791992, + "step": 8651, + "time_per_iteration": 2.467643976211548 + }, + { + "auxiliary_loss_clip": 0.06437017, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06283189, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5201863820832707, + "flos": 18624458868480.0, + "grad_norm": 1.6266187944599841, + "language_loss": 0.79176587, + "learning_rate": 1.966313157587003e-06, + "loss": 0.86880493, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13171387, + "step": 8652, + "time_per_iteration": 2.5569629669189453 + }, + { + "auxiliary_loss_clip": 0.06434878, + "auxiliary_loss_mlp": 0.01268954, + "balance_loss_clip": 0.0628317, + "balance_loss_mlp": 0.01255919, + "epoch": 0.5202465053359386, + "flos": 22863817140480.0, + "grad_norm": 1.9022927985659936, + "language_loss": 0.70460284, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.78164113, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13049316, + "step": 8653, + "time_per_iteration": 2.5013556480407715 + }, + { + "auxiliary_loss_clip": 0.06435711, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06279919, + "balance_loss_mlp": 0.01257124, + "epoch": 0.5203066285886067, + "flos": 21988068992640.0, + "grad_norm": 1.7386916801416297, + "language_loss": 0.78877962, + "learning_rate": 1.965534347297008e-06, + "loss": 0.86584258, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13452148, + "step": 8654, + "time_per_iteration": 2.5205516815185547 + }, + { + "auxiliary_loss_clip": 0.06439671, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06283241, + "balance_loss_mlp": 0.01258763, + "epoch": 0.5203667518412746, + "flos": 20240094568320.0, + "grad_norm": 1.7537160659546802, + "language_loss": 0.84438735, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.92150223, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13043213, + "step": 8655, + "time_per_iteration": 2.523545026779175 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264722, + "balance_loss_clip": 0.06279121, + "balance_loss_mlp": 0.01253027, + "epoch": 0.5204268750939426, + "flos": 15710860696320.0, + "grad_norm": 2.477748600032862, + "language_loss": 0.66631675, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.74324131, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11688232, + "step": 8656, + "time_per_iteration": 2.504314661026001 + }, + { + "auxiliary_loss_clip": 0.06430535, + "auxiliary_loss_mlp": 0.01266767, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01254203, + "epoch": 0.5204869983466105, + "flos": 27456096810240.0, + "grad_norm": 1.7743424381892883, + "language_loss": 0.73250526, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.80947828, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12567139, + "step": 8657, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06431027, + "auxiliary_loss_mlp": 0.01268378, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01255611, + "epoch": 0.5205471215992785, + "flos": 20601820644480.0, + "grad_norm": 1.9136699042437477, + "language_loss": 0.71553123, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.79252529, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12756348, + "step": 8658, + "time_per_iteration": 2.523796796798706 + }, + { + "auxiliary_loss_clip": 0.06426262, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06275812, + "balance_loss_mlp": 0.01255669, + "epoch": 0.5206072448519465, + "flos": 22134537129600.0, + "grad_norm": 1.8507369766537312, + "language_loss": 0.83638287, + "learning_rate": 1.963587344701897e-06, + "loss": 0.91332769, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12554932, + "step": 8659, + "time_per_iteration": 2.5169432163238525 + }, + { + "auxiliary_loss_clip": 0.06437267, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277223, + "balance_loss_mlp": 0.01255587, + "epoch": 0.5206673681046144, + "flos": 18335924933760.0, + "grad_norm": 2.050641453841446, + "language_loss": 0.75738013, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.83444965, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14093018, + "step": 8660, + "time_per_iteration": 2.557415723800659 + }, + { + "auxiliary_loss_clip": 0.06428091, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06277187, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5207274913572825, + "flos": 20236488842880.0, + "grad_norm": 1.6215362458867588, + "language_loss": 0.77692747, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.85389173, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12231445, + "step": 8661, + "time_per_iteration": 2.509428024291992 + }, + { + "auxiliary_loss_clip": 0.06431398, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01254112, + "epoch": 0.5207876146099504, + "flos": 22133530880640.0, + "grad_norm": 1.7321078317719976, + "language_loss": 0.70359308, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.78056741, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1192627, + "step": 8662, + "time_per_iteration": 2.5810325145721436 + }, + { + "auxiliary_loss_clip": 0.0642472, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01257169, + "epoch": 0.5208477378626184, + "flos": 23885781863040.0, + "grad_norm": 1.845579934529664, + "language_loss": 0.70074278, + "learning_rate": 1.962029767391098e-06, + "loss": 0.77769035, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12872314, + "step": 8663, + "time_per_iteration": 2.528122901916504 + }, + { + "auxiliary_loss_clip": 0.06433125, + "auxiliary_loss_mlp": 0.01272195, + "balance_loss_clip": 0.06282328, + "balance_loss_mlp": 0.01259619, + "epoch": 0.5209078611152863, + "flos": 20968158695040.0, + "grad_norm": 1.5162641399491859, + "language_loss": 0.77111858, + "learning_rate": 1.961640376626072e-06, + "loss": 0.84817183, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12591553, + "step": 8664, + "time_per_iteration": 3.9675118923187256 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01274545, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01261641, + "epoch": 0.5209679843679543, + "flos": 20674006536960.0, + "grad_norm": 1.9585914111684504, + "language_loss": 0.76477247, + "learning_rate": 1.961250987315646e-06, + "loss": 0.84178591, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12915039, + "step": 8665, + "time_per_iteration": 2.541412830352783 + }, + { + "auxiliary_loss_clip": 0.06427725, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06278466, + "balance_loss_mlp": 0.01260593, + "epoch": 0.5210281076206222, + "flos": 20233050825600.0, + "grad_norm": 1.6923585849410518, + "language_loss": 0.72734976, + "learning_rate": 1.960861599474586e-06, + "loss": 0.80435228, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11920166, + "step": 8666, + "time_per_iteration": 2.4996509552001953 + }, + { + "auxiliary_loss_clip": 0.06442789, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.0628055, + "balance_loss_mlp": 0.01256199, + "epoch": 0.5210882308732903, + "flos": 16075395884160.0, + "grad_norm": 2.8085912573953093, + "language_loss": 0.69292629, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.77006412, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.14794922, + "step": 8667, + "time_per_iteration": 3.966068744659424 + }, + { + "auxiliary_loss_clip": 0.06427799, + "auxiliary_loss_mlp": 0.0127319, + "balance_loss_clip": 0.06280097, + "balance_loss_mlp": 0.01261793, + "epoch": 0.5211483541259582, + "flos": 24831954967680.0, + "grad_norm": 1.4529640974986662, + "language_loss": 0.8142345, + "learning_rate": 1.960082828259629e-06, + "loss": 0.89124429, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11401367, + "step": 8668, + "time_per_iteration": 2.531757116317749 + }, + { + "auxiliary_loss_clip": 0.06428734, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06277529, + "balance_loss_mlp": 0.01253485, + "epoch": 0.5212084773786262, + "flos": 20375997091200.0, + "grad_norm": 2.3545461183864793, + "language_loss": 0.6399523, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.71689939, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12493896, + "step": 8669, + "time_per_iteration": 2.582458019256592 + }, + { + "auxiliary_loss_clip": 0.06433244, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06281579, + "balance_loss_mlp": 0.01257846, + "epoch": 0.5212686006312941, + "flos": 23151596388480.0, + "grad_norm": 1.5489696479352357, + "language_loss": 0.66586244, + "learning_rate": 1.959304063099325e-06, + "loss": 0.74289578, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12261963, + "step": 8670, + "time_per_iteration": 2.5730559825897217 + }, + { + "auxiliary_loss_clip": 0.0642543, + "auxiliary_loss_mlp": 0.01273699, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01262195, + "epoch": 0.5213287238839621, + "flos": 27780073822080.0, + "grad_norm": 2.549693242202028, + "language_loss": 0.76187384, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.83886516, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11505127, + "step": 8671, + "time_per_iteration": 2.5233168601989746 + }, + { + "auxiliary_loss_clip": 0.064327, + "auxiliary_loss_mlp": 0.01274872, + "balance_loss_clip": 0.06278658, + "balance_loss_mlp": 0.01262534, + "epoch": 0.5213888471366301, + "flos": 19943762204160.0, + "grad_norm": 1.8121341163261586, + "language_loss": 0.78893673, + "learning_rate": 1.958525304111796e-06, + "loss": 0.86601251, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12341309, + "step": 8672, + "time_per_iteration": 3.9492485523223877 + }, + { + "auxiliary_loss_clip": 0.06431769, + "auxiliary_loss_mlp": 0.01269371, + "balance_loss_clip": 0.06282303, + "balance_loss_mlp": 0.01257957, + "epoch": 0.521448970389298, + "flos": 16988389971840.0, + "grad_norm": 2.0794497937850327, + "language_loss": 0.72609621, + "learning_rate": 1.958135926969736e-06, + "loss": 0.80310762, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11425781, + "step": 8673, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.06430827, + "auxiliary_loss_mlp": 0.01267899, + "balance_loss_clip": 0.06280996, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5215090936419661, + "flos": 18995744309760.0, + "grad_norm": 1.6692646430310563, + "language_loss": 0.75224721, + "learning_rate": 1.957746551415166e-06, + "loss": 0.82923448, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11755371, + "step": 8674, + "time_per_iteration": 2.528323173522949 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01271657, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258812, + "epoch": 0.521569216894634, + "flos": 16148923441920.0, + "grad_norm": 2.0098628900715694, + "language_loss": 0.86161578, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.93865955, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.128479, + "step": 8675, + "time_per_iteration": 2.486656665802002 + }, + { + "auxiliary_loss_clip": 0.06328152, + "auxiliary_loss_mlp": 0.0125317, + "balance_loss_clip": 0.06263625, + "balance_loss_mlp": 0.01251218, + "epoch": 0.521629340147302, + "flos": 57596054296320.0, + "grad_norm": 0.8389911483177593, + "language_loss": 0.62711406, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.70292729, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01950073, + "step": 8676, + "time_per_iteration": 3.09920597076416 + }, + { + "auxiliary_loss_clip": 0.06427533, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06277495, + "balance_loss_mlp": 0.01252839, + "epoch": 0.5216894633999699, + "flos": 26804117790720.0, + "grad_norm": 1.458201451867465, + "language_loss": 0.69111204, + "learning_rate": 1.956578434424046e-06, + "loss": 0.7680313, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11572266, + "step": 8677, + "time_per_iteration": 2.5477073192596436 + }, + { + "auxiliary_loss_clip": 0.06427766, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01255127, + "epoch": 0.5217495866526379, + "flos": 26365803482880.0, + "grad_norm": 1.7210863244717929, + "language_loss": 0.65549737, + "learning_rate": 1.956189065367086e-06, + "loss": 0.73244393, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11749268, + "step": 8678, + "time_per_iteration": 2.566591739654541 + }, + { + "auxiliary_loss_clip": 0.06434263, + "auxiliary_loss_mlp": 0.01268698, + "balance_loss_clip": 0.06280728, + "balance_loss_mlp": 0.01255531, + "epoch": 0.5218097099053058, + "flos": 23590329966720.0, + "grad_norm": 2.9370978110790507, + "language_loss": 0.68504936, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.762079, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1317749, + "step": 8679, + "time_per_iteration": 2.510748863220215 + }, + { + "auxiliary_loss_clip": 0.06433919, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.06281881, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5218698331579739, + "flos": 18083253346560.0, + "grad_norm": 1.6397075137651071, + "language_loss": 0.67471087, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.7517339, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12036133, + "step": 8680, + "time_per_iteration": 3.9219276905059814 + }, + { + "auxiliary_loss_clip": 0.06433384, + "auxiliary_loss_mlp": 0.01271487, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01259595, + "epoch": 0.5219299564106418, + "flos": 19287129283200.0, + "grad_norm": 1.8649470617465917, + "language_loss": 0.83311534, + "learning_rate": 1.955020968223156e-06, + "loss": 0.91016412, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11889648, + "step": 8681, + "time_per_iteration": 2.516465663909912 + }, + { + "auxiliary_loss_clip": 0.06426493, + "auxiliary_loss_mlp": 0.0126523, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01253792, + "epoch": 0.5219900796633098, + "flos": 26658613975680.0, + "grad_norm": 1.6454147062415487, + "language_loss": 0.77514279, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.85205996, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11437988, + "step": 8682, + "time_per_iteration": 2.554325819015503 + }, + { + "auxiliary_loss_clip": 0.06427193, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01254949, + "epoch": 0.5220502029159777, + "flos": 34321148225280.0, + "grad_norm": 1.635540508166305, + "language_loss": 0.693317, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.77025378, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11529541, + "step": 8683, + "time_per_iteration": 2.6571457386016846 + }, + { + "auxiliary_loss_clip": 0.06430393, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06278116, + "balance_loss_mlp": 0.01257629, + "epoch": 0.5221103261686457, + "flos": 22161804433920.0, + "grad_norm": 1.5499745188789709, + "language_loss": 0.76029563, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.83729851, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12255859, + "step": 8684, + "time_per_iteration": 2.5611672401428223 + }, + { + "auxiliary_loss_clip": 0.06422482, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06276357, + "balance_loss_mlp": 0.01256123, + "epoch": 0.5221704494213137, + "flos": 19214440266240.0, + "grad_norm": 1.9689133598672337, + "language_loss": 0.75993264, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.83683455, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11590576, + "step": 8685, + "time_per_iteration": 2.592336416244507 + }, + { + "auxiliary_loss_clip": 0.06433201, + "auxiliary_loss_mlp": 0.01267661, + "balance_loss_clip": 0.06280906, + "balance_loss_mlp": 0.01255549, + "epoch": 0.5222305726739817, + "flos": 19360069862400.0, + "grad_norm": 1.8592295664699974, + "language_loss": 0.81054503, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.88755369, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12103271, + "step": 8686, + "time_per_iteration": 2.529801845550537 + }, + { + "auxiliary_loss_clip": 0.06419135, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06276063, + "balance_loss_mlp": 0.01255021, + "epoch": 0.5222906959266497, + "flos": 27821554392960.0, + "grad_norm": 1.7724306724007597, + "language_loss": 0.7060039, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.78286076, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11535645, + "step": 8687, + "time_per_iteration": 2.580845594406128 + }, + { + "auxiliary_loss_clip": 0.06421649, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06277607, + "balance_loss_mlp": 0.01256297, + "epoch": 0.5223508191793176, + "flos": 12717781326720.0, + "grad_norm": 2.573153086937961, + "language_loss": 0.82975262, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.90663946, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10736084, + "step": 8688, + "time_per_iteration": 2.479219436645508 + }, + { + "auxiliary_loss_clip": 0.06427407, + "auxiliary_loss_mlp": 0.01268772, + "balance_loss_clip": 0.06280096, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5224109424319856, + "flos": 15637584700800.0, + "grad_norm": 2.221621058495187, + "language_loss": 0.74186772, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.81882954, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12426758, + "step": 8689, + "time_per_iteration": 2.519578456878662 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01264867, + "balance_loss_clip": 0.06277696, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5224710656846535, + "flos": 15747687365760.0, + "grad_norm": 1.8795858532487468, + "language_loss": 0.8292582, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.90614116, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11022949, + "step": 8690, + "time_per_iteration": 2.4795632362365723 + }, + { + "auxiliary_loss_clip": 0.06425175, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.06276759, + "balance_loss_mlp": 0.0125542, + "epoch": 0.5225311889373215, + "flos": 26038136891520.0, + "grad_norm": 1.8859654188369186, + "language_loss": 0.79290485, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.86983275, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12200928, + "step": 8691, + "time_per_iteration": 2.554316520690918 + }, + { + "auxiliary_loss_clip": 0.06425714, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01255044, + "epoch": 0.5225913121899894, + "flos": 18375183371520.0, + "grad_norm": 2.097465391576973, + "language_loss": 0.76909935, + "learning_rate": 1.950738079725646e-06, + "loss": 0.84603524, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12835693, + "step": 8692, + "time_per_iteration": 2.508985757827759 + }, + { + "auxiliary_loss_clip": 0.06422729, + "auxiliary_loss_mlp": 0.01266471, + "balance_loss_clip": 0.06279368, + "balance_loss_mlp": 0.01254872, + "epoch": 0.5226514354426575, + "flos": 29280407904000.0, + "grad_norm": 1.831817200061648, + "language_loss": 0.73045087, + "learning_rate": 1.950348737138691e-06, + "loss": 0.80734289, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11608887, + "step": 8693, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06430539, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.06276198, + "balance_loss_mlp": 0.01252802, + "epoch": 0.5227115586953254, + "flos": 22859330947200.0, + "grad_norm": 2.034375584307348, + "language_loss": 0.8244431, + "learning_rate": 1.949959396434517e-06, + "loss": 0.90140283, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12640381, + "step": 8694, + "time_per_iteration": 2.511063814163208 + }, + { + "auxiliary_loss_clip": 0.06334698, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06270603, + "balance_loss_mlp": 0.01262187, + "epoch": 0.5227716819479934, + "flos": 57491695635840.0, + "grad_norm": 0.936740482735722, + "language_loss": 0.55577236, + "learning_rate": 1.949570057627888e-06, + "loss": 0.63175929, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01809692, + "step": 8695, + "time_per_iteration": 3.201383113861084 + }, + { + "auxiliary_loss_clip": 0.06426679, + "auxiliary_loss_mlp": 0.01263614, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01252074, + "epoch": 0.5228318052006613, + "flos": 13813357461120.0, + "grad_norm": 1.622631737546212, + "language_loss": 0.73801219, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.81491518, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11547852, + "step": 8696, + "time_per_iteration": 2.542386770248413 + }, + { + "auxiliary_loss_clip": 0.06429457, + "auxiliary_loss_mlp": 0.01266915, + "balance_loss_clip": 0.06279002, + "balance_loss_mlp": 0.01254589, + "epoch": 0.5228919284533293, + "flos": 15601596572160.0, + "grad_norm": 1.5536675741091566, + "language_loss": 0.71410191, + "learning_rate": 1.948791385766319e-06, + "loss": 0.79106563, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12341309, + "step": 8697, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.06423891, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.0627815, + "balance_loss_mlp": 0.0125453, + "epoch": 0.5229520517059973, + "flos": 22497982214400.0, + "grad_norm": 1.650008991843684, + "language_loss": 0.80845451, + "learning_rate": 1.948402052740906e-06, + "loss": 0.88535196, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11328125, + "step": 8698, + "time_per_iteration": 2.5636022090911865 + }, + { + "auxiliary_loss_clip": 0.06426111, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06278659, + "balance_loss_mlp": 0.01254908, + "epoch": 0.5230121749586653, + "flos": 22097416970880.0, + "grad_norm": 3.7708298280456023, + "language_loss": 0.74449289, + "learning_rate": 1.948012721672093e-06, + "loss": 0.82142115, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1182251, + "step": 8699, + "time_per_iteration": 2.531606912612915 + }, + { + "auxiliary_loss_clip": 0.06432469, + "auxiliary_loss_mlp": 0.0126789, + "balance_loss_clip": 0.06277843, + "balance_loss_mlp": 0.01255325, + "epoch": 0.5230722982113333, + "flos": 22133656661760.0, + "grad_norm": 1.5875927962566738, + "language_loss": 0.73680252, + "learning_rate": 1.947623392574642e-06, + "loss": 0.81380606, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12561035, + "step": 8700, + "time_per_iteration": 2.542734146118164 + }, + { + "auxiliary_loss_clip": 0.06429377, + "auxiliary_loss_mlp": 0.01275322, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01263127, + "epoch": 0.5231324214640012, + "flos": 25016214096000.0, + "grad_norm": 1.8967545071734793, + "language_loss": 0.67123276, + "learning_rate": 1.947234065463318e-06, + "loss": 0.74827981, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12207031, + "step": 8701, + "time_per_iteration": 2.543332815170288 + }, + { + "auxiliary_loss_clip": 0.06421816, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06274643, + "balance_loss_mlp": 0.01254696, + "epoch": 0.5231925447166692, + "flos": 25747842021120.0, + "grad_norm": 1.6886589098280236, + "language_loss": 0.66874444, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.74562299, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11340332, + "step": 8702, + "time_per_iteration": 2.5511581897735596 + }, + { + "auxiliary_loss_clip": 0.06426294, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06277906, + "balance_loss_mlp": 0.01255906, + "epoch": 0.5232526679693371, + "flos": 21440322852480.0, + "grad_norm": 3.970152828937024, + "language_loss": 0.76360488, + "learning_rate": 1.946455417258101e-06, + "loss": 0.84055138, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12457275, + "step": 8703, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.06434231, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01255471, + "epoch": 0.5233127912220051, + "flos": 35307082892160.0, + "grad_norm": 2.0695890072195344, + "language_loss": 0.77554905, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.85257214, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1260376, + "step": 8704, + "time_per_iteration": 4.093170642852783 + }, + { + "auxiliary_loss_clip": 0.06425636, + "auxiliary_loss_mlp": 0.01277604, + "balance_loss_clip": 0.06278675, + "balance_loss_mlp": 0.012665, + "epoch": 0.523372914474673, + "flos": 17056257379200.0, + "grad_norm": 1.7488135640398956, + "language_loss": 0.78527272, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.86230516, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11108398, + "step": 8705, + "time_per_iteration": 2.487792730331421 + }, + { + "auxiliary_loss_clip": 0.06433457, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06280416, + "balance_loss_mlp": 0.0125221, + "epoch": 0.5234330377273411, + "flos": 18412303530240.0, + "grad_norm": 1.822089906899261, + "language_loss": 0.69768077, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.77466154, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12408447, + "step": 8706, + "time_per_iteration": 2.52415132522583 + }, + { + "auxiliary_loss_clip": 0.06339821, + "auxiliary_loss_mlp": 0.01262622, + "balance_loss_clip": 0.06275055, + "balance_loss_mlp": 0.01260974, + "epoch": 0.523493160980009, + "flos": 65872426429440.0, + "grad_norm": 0.668265925718786, + "language_loss": 0.52398658, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.60001105, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01651001, + "step": 8707, + "time_per_iteration": 4.596412658691406 + }, + { + "auxiliary_loss_clip": 0.06431062, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06282815, + "balance_loss_mlp": 0.01255829, + "epoch": 0.523553284232677, + "flos": 21878595233280.0, + "grad_norm": 1.763620445487087, + "language_loss": 0.75447237, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.83145583, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11450195, + "step": 8708, + "time_per_iteration": 2.515388011932373 + }, + { + "auxiliary_loss_clip": 0.06425884, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.06279897, + "balance_loss_mlp": 0.01258252, + "epoch": 0.5236134074853449, + "flos": 20854156815360.0, + "grad_norm": 1.5562083670602136, + "language_loss": 0.78041285, + "learning_rate": 1.944119521844849e-06, + "loss": 0.85736358, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.109375, + "step": 8709, + "time_per_iteration": 2.569312810897827 + }, + { + "auxiliary_loss_clip": 0.06434496, + "auxiliary_loss_mlp": 0.01269997, + "balance_loss_clip": 0.062785, + "balance_loss_mlp": 0.01256872, + "epoch": 0.5236735307380129, + "flos": 25527510910080.0, + "grad_norm": 1.8691534112354709, + "language_loss": 0.83896649, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.91601145, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13128662, + "step": 8710, + "time_per_iteration": 2.5364856719970703 + }, + { + "auxiliary_loss_clip": 0.06424439, + "auxiliary_loss_mlp": 0.01271523, + "balance_loss_clip": 0.06278566, + "balance_loss_mlp": 0.01260347, + "epoch": 0.523733653990681, + "flos": 23589281790720.0, + "grad_norm": 1.796806294076298, + "language_loss": 0.69453466, + "learning_rate": 1.943340906834908e-06, + "loss": 0.77149427, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11181641, + "step": 8711, + "time_per_iteration": 2.5488204956054688 + }, + { + "auxiliary_loss_clip": 0.06423855, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06275582, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5237937772433489, + "flos": 21112698188160.0, + "grad_norm": 1.676774757059823, + "language_loss": 0.82997072, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.90688783, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11322021, + "step": 8712, + "time_per_iteration": 4.064100980758667 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06279981, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5238539004960169, + "flos": 19179081043200.0, + "grad_norm": 1.8094880941691576, + "language_loss": 0.6993227, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.77635783, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.121521, + "step": 8713, + "time_per_iteration": 2.544586420059204 + }, + { + "auxiliary_loss_clip": 0.06435391, + "auxiliary_loss_mlp": 0.01268239, + "balance_loss_clip": 0.06280154, + "balance_loss_mlp": 0.01254834, + "epoch": 0.5239140237486848, + "flos": 17892914797440.0, + "grad_norm": 2.8365689324721597, + "language_loss": 0.76947498, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.84651124, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13409424, + "step": 8714, + "time_per_iteration": 2.5225958824157715 + }, + { + "auxiliary_loss_clip": 0.06430446, + "auxiliary_loss_mlp": 0.01267137, + "balance_loss_clip": 0.06279821, + "balance_loss_mlp": 0.01255085, + "epoch": 0.5239741470013528, + "flos": 17936072449920.0, + "grad_norm": 1.8206248729771282, + "language_loss": 0.76218581, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.83916163, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12060547, + "step": 8715, + "time_per_iteration": 2.479482650756836 + }, + { + "auxiliary_loss_clip": 0.06428694, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01253537, + "epoch": 0.5240342702540207, + "flos": 31001408513280.0, + "grad_norm": 1.518077309755953, + "language_loss": 0.71405065, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.79099017, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1171875, + "step": 8716, + "time_per_iteration": 2.6313345432281494 + }, + { + "auxiliary_loss_clip": 0.06429261, + "auxiliary_loss_mlp": 0.01264727, + "balance_loss_clip": 0.06279399, + "balance_loss_mlp": 0.012541, + "epoch": 0.5240943935066887, + "flos": 25011308632320.0, + "grad_norm": 2.053994478361076, + "language_loss": 0.87371016, + "learning_rate": 1.941005113841926e-06, + "loss": 0.95065004, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10626221, + "step": 8717, + "time_per_iteration": 2.5242137908935547 + }, + { + "auxiliary_loss_clip": 0.06427871, + "auxiliary_loss_mlp": 0.01272314, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01260184, + "epoch": 0.5241545167593566, + "flos": 23665786168320.0, + "grad_norm": 1.9379813616750423, + "language_loss": 0.62001824, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.69702005, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12139893, + "step": 8718, + "time_per_iteration": 2.5543830394744873 + }, + { + "auxiliary_loss_clip": 0.06436223, + "auxiliary_loss_mlp": 0.01271154, + "balance_loss_clip": 0.06282552, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5242146400120247, + "flos": 23406490108800.0, + "grad_norm": 1.965252740565909, + "language_loss": 0.72457337, + "learning_rate": 1.940226533916872e-06, + "loss": 0.80164713, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12145996, + "step": 8719, + "time_per_iteration": 3.9948794841766357 + }, + { + "auxiliary_loss_clip": 0.06428128, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.0628122, + "balance_loss_mlp": 0.01256983, + "epoch": 0.5242747632646926, + "flos": 17754873995520.0, + "grad_norm": 2.179080036180393, + "language_loss": 0.73360658, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.81056702, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10919189, + "step": 8720, + "time_per_iteration": 2.561491012573242 + }, + { + "auxiliary_loss_clip": 0.06431387, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06281313, + "balance_loss_mlp": 0.01256227, + "epoch": 0.5243348865173606, + "flos": 32605849693440.0, + "grad_norm": 1.7043415367979953, + "language_loss": 0.70633399, + "learning_rate": 1.939447963058281e-06, + "loss": 0.78333569, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12561035, + "step": 8721, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.06427501, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06277889, + "balance_loss_mlp": 0.01258008, + "epoch": 0.5243950097700285, + "flos": 25491229292160.0, + "grad_norm": 1.669973954204285, + "language_loss": 0.86888224, + "learning_rate": 1.939058681065813e-06, + "loss": 0.94585228, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.1151123, + "step": 8722, + "time_per_iteration": 2.532735586166382 + }, + { + "auxiliary_loss_clip": 0.06423786, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276488, + "balance_loss_mlp": 0.01259041, + "epoch": 0.5244551330226965, + "flos": 15273846126720.0, + "grad_norm": 1.6547564845342364, + "language_loss": 0.80303264, + "learning_rate": 1.938669401384247e-06, + "loss": 0.87997842, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 8723, + "time_per_iteration": 2.519230842590332 + }, + { + "auxiliary_loss_clip": 0.06433833, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256286, + "epoch": 0.5245152562753645, + "flos": 22243717399680.0, + "grad_norm": 1.8110090728616772, + "language_loss": 0.75572187, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.83275086, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12780762, + "step": 8724, + "time_per_iteration": 2.503331422805786 + }, + { + "auxiliary_loss_clip": 0.06439602, + "auxiliary_loss_mlp": 0.01267267, + "balance_loss_clip": 0.06280126, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5245753795280325, + "flos": 29434548689280.0, + "grad_norm": 1.6762764466906133, + "language_loss": 0.70858645, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.78565514, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.12835693, + "step": 8725, + "time_per_iteration": 2.6268577575683594 + }, + { + "auxiliary_loss_clip": 0.06331155, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252628, + "epoch": 0.5246355027807005, + "flos": 58853569645440.0, + "grad_norm": 0.7398874669792804, + "language_loss": 0.55689812, + "learning_rate": 1.937501576352568e-06, + "loss": 0.63275951, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.64111328, + "router_z_loss_mlp": 0.02354431, + "step": 8726, + "time_per_iteration": 3.1253981590270996 + }, + { + "auxiliary_loss_clip": 0.06326637, + "auxiliary_loss_mlp": 0.01254365, + "balance_loss_clip": 0.06262497, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5246956260333684, + "flos": 64546792110720.0, + "grad_norm": 0.7865731844335093, + "language_loss": 0.58442128, + "learning_rate": 1.937112306062219e-06, + "loss": 0.66023123, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.02062988, + "step": 8727, + "time_per_iteration": 3.176279306411743 + }, + { + "auxiliary_loss_clip": 0.06432917, + "auxiliary_loss_mlp": 0.01270503, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.01258118, + "epoch": 0.5247557492860364, + "flos": 24540276504960.0, + "grad_norm": 1.4599497814344178, + "language_loss": 0.70513123, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.78216541, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12390137, + "step": 8728, + "time_per_iteration": 2.635087728500366 + }, + { + "auxiliary_loss_clip": 0.06426623, + "auxiliary_loss_mlp": 0.01271129, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258815, + "epoch": 0.5248158725387043, + "flos": 18811946378880.0, + "grad_norm": 1.5300920869777792, + "language_loss": 0.69649124, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.77346873, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12310791, + "step": 8729, + "time_per_iteration": 2.5286824703216553 + }, + { + "auxiliary_loss_clip": 0.06429707, + "auxiliary_loss_mlp": 0.01272402, + "balance_loss_clip": 0.06276232, + "balance_loss_mlp": 0.01260112, + "epoch": 0.5248759957913723, + "flos": 20961534222720.0, + "grad_norm": 1.931767440888087, + "language_loss": 0.83841878, + "learning_rate": 1.935944509558464e-06, + "loss": 0.91543984, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12280273, + "step": 8730, + "time_per_iteration": 2.50693678855896 + }, + { + "auxiliary_loss_clip": 0.06424531, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01253301, + "epoch": 0.5249361190440403, + "flos": 18666903761280.0, + "grad_norm": 2.7205788659727634, + "language_loss": 0.79795074, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.87484777, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8731, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.06421249, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06275119, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5249962422967083, + "flos": 24870249083520.0, + "grad_norm": 2.282421292997204, + "language_loss": 0.83455729, + "learning_rate": 1.935165990676312e-06, + "loss": 0.91145802, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12182617, + "step": 8732, + "time_per_iteration": 2.5442264080047607 + }, + { + "auxiliary_loss_clip": 0.06426094, + "auxiliary_loss_mlp": 0.01271634, + "balance_loss_clip": 0.06276669, + "balance_loss_mlp": 0.01259654, + "epoch": 0.5250563655493762, + "flos": 15267179727360.0, + "grad_norm": 1.5246135300121169, + "language_loss": 0.77770185, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.85467911, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11975098, + "step": 8733, + "time_per_iteration": 2.5826051235198975 + }, + { + "auxiliary_loss_clip": 0.0643189, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.0627751, + "balance_loss_mlp": 0.01253655, + "epoch": 0.5251164888020442, + "flos": 18631209121920.0, + "grad_norm": 3.9739558224943683, + "language_loss": 0.81671995, + "learning_rate": 1.934387481628208e-06, + "loss": 0.89369977, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12445068, + "step": 8734, + "time_per_iteration": 2.496502637863159 + }, + { + "auxiliary_loss_clip": 0.0642469, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01253041, + "epoch": 0.5251766120547121, + "flos": 29717632108800.0, + "grad_norm": 1.407036688227265, + "language_loss": 0.77114183, + "learning_rate": 1.933998230828826e-06, + "loss": 0.84803545, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11627197, + "step": 8735, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06423082, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01253632, + "epoch": 0.5252367353073801, + "flos": 23446964430720.0, + "grad_norm": 1.5621679512535565, + "language_loss": 0.80604559, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.88292682, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11419678, + "step": 8736, + "time_per_iteration": 2.5257420539855957 + }, + { + "auxiliary_loss_clip": 0.06425665, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06277201, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5252968585600482, + "flos": 30818658758400.0, + "grad_norm": 2.1177707386756697, + "language_loss": 0.70240873, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.77936983, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12097168, + "step": 8737, + "time_per_iteration": 2.5996742248535156 + }, + { + "auxiliary_loss_clip": 0.06423551, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01256564, + "epoch": 0.5253569818127161, + "flos": 20634035339520.0, + "grad_norm": 1.5486622918302246, + "language_loss": 0.7715745, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.84849167, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11608887, + "step": 8738, + "time_per_iteration": 2.5352158546447754 + }, + { + "auxiliary_loss_clip": 0.06323943, + "auxiliary_loss_mlp": 0.01255398, + "balance_loss_clip": 0.06260057, + "balance_loss_mlp": 0.01253626, + "epoch": 0.5254171050653841, + "flos": 63448155302400.0, + "grad_norm": 0.7261228489339219, + "language_loss": 0.54416603, + "learning_rate": 1.932441252806837e-06, + "loss": 0.61995941, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01774597, + "step": 8739, + "time_per_iteration": 3.1277644634246826 + }, + { + "auxiliary_loss_clip": 0.06426128, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276017, + "balance_loss_mlp": 0.01255457, + "epoch": 0.525477228318052, + "flos": 34678136545920.0, + "grad_norm": 1.6647555558701046, + "language_loss": 0.84639645, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.92333221, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11993408, + "step": 8740, + "time_per_iteration": 2.658111572265625 + }, + { + "auxiliary_loss_clip": 0.06423901, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06275214, + "balance_loss_mlp": 0.01251843, + "epoch": 0.52553735157072, + "flos": 17936575574400.0, + "grad_norm": 2.0969213447662156, + "language_loss": 0.69862366, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.77550066, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11938477, + "step": 8741, + "time_per_iteration": 2.4757626056671143 + }, + { + "auxiliary_loss_clip": 0.0642582, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06271701, + "balance_loss_mlp": 0.01254378, + "epoch": 0.5255974748233879, + "flos": 9945326557440.0, + "grad_norm": 2.083494644749303, + "language_loss": 0.66346633, + "learning_rate": 1.931273546137947e-06, + "loss": 0.74039018, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12188721, + "step": 8742, + "time_per_iteration": 2.4912760257720947 + }, + { + "auxiliary_loss_clip": 0.06430671, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254592, + "epoch": 0.5256575980760559, + "flos": 16873256062080.0, + "grad_norm": 2.278792899782439, + "language_loss": 0.62974113, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.7067256, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13195801, + "step": 8743, + "time_per_iteration": 3.8745810985565186 + }, + { + "auxiliary_loss_clip": 0.06328367, + "auxiliary_loss_mlp": 0.01251768, + "balance_loss_clip": 0.06264926, + "balance_loss_mlp": 0.01249956, + "epoch": 0.5257177213287239, + "flos": 62408105297280.0, + "grad_norm": 0.7594186151089873, + "language_loss": 0.54170012, + "learning_rate": 1.930495088031323e-06, + "loss": 0.6175015, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.01808167, + "step": 8744, + "time_per_iteration": 3.2680962085723877 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.0627819, + "balance_loss_mlp": 0.01252635, + "epoch": 0.5257778445813919, + "flos": 20783144880000.0, + "grad_norm": 1.988296485781083, + "language_loss": 0.76358819, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.84060007, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.13653564, + "step": 8745, + "time_per_iteration": 2.5416345596313477 + }, + { + "auxiliary_loss_clip": 0.06422935, + "auxiliary_loss_mlp": 0.01269048, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125733, + "epoch": 0.5258379678340598, + "flos": 17024168465280.0, + "grad_norm": 2.2863222877599703, + "language_loss": 0.81917781, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.8960976, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.1171875, + "step": 8746, + "time_per_iteration": 3.8924081325531006 + }, + { + "auxiliary_loss_clip": 0.06420557, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_clip": 0.06274772, + "balance_loss_mlp": 0.01257011, + "epoch": 0.5258980910867278, + "flos": 21075032977920.0, + "grad_norm": 1.8269554832422097, + "language_loss": 0.76250327, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.83939064, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11157227, + "step": 8747, + "time_per_iteration": 2.5338385105133057 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01266781, + "balance_loss_clip": 0.06273648, + "balance_loss_mlp": 0.01254443, + "epoch": 0.5259582143393957, + "flos": 18010312767360.0, + "grad_norm": 1.781184467493656, + "language_loss": 0.82852685, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.90538716, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12353516, + "step": 8748, + "time_per_iteration": 2.4989612102508545 + }, + { + "auxiliary_loss_clip": 0.06428373, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06276021, + "balance_loss_mlp": 0.01255803, + "epoch": 0.5260183375920637, + "flos": 22790457290880.0, + "grad_norm": 2.0798716741461862, + "language_loss": 0.81033522, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.88730466, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12756348, + "step": 8749, + "time_per_iteration": 2.541492462158203 + }, + { + "auxiliary_loss_clip": 0.06426647, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.06276764, + "balance_loss_mlp": 0.01257857, + "epoch": 0.5260784608447318, + "flos": 27059682343680.0, + "grad_norm": 1.8461671999009361, + "language_loss": 0.72827047, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.80523431, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11877441, + "step": 8750, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06428036, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06278102, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5261385840973997, + "flos": 20668262532480.0, + "grad_norm": 1.3256906405876772, + "language_loss": 0.76755565, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.8444941, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11883545, + "step": 8751, + "time_per_iteration": 3.989189624786377 + }, + { + "auxiliary_loss_clip": 0.06427495, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.0627936, + "balance_loss_mlp": 0.01255286, + "epoch": 0.5261987073500677, + "flos": 23629336842240.0, + "grad_norm": 1.3401050149591014, + "language_loss": 0.76360512, + "learning_rate": 1.927381362210902e-06, + "loss": 0.84054899, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11608887, + "step": 8752, + "time_per_iteration": 2.6008472442626953 + }, + { + "auxiliary_loss_clip": 0.06432231, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06278201, + "balance_loss_mlp": 0.01253487, + "epoch": 0.5262588306027356, + "flos": 27643626247680.0, + "grad_norm": 1.396446170400335, + "language_loss": 0.68317235, + "learning_rate": 1.926992158720058e-06, + "loss": 0.76016164, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13208008, + "step": 8753, + "time_per_iteration": 2.5851571559906006 + }, + { + "auxiliary_loss_clip": 0.06430234, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281005, + "balance_loss_mlp": 0.01257142, + "epoch": 0.5263189538554036, + "flos": 21765725383680.0, + "grad_norm": 1.5666571832863774, + "language_loss": 0.8392294, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.91622722, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12384033, + "step": 8754, + "time_per_iteration": 2.552424907684326 + }, + { + "auxiliary_loss_clip": 0.06431299, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06278868, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5263790771080715, + "flos": 14280490373760.0, + "grad_norm": 9.005791031911038, + "language_loss": 0.87464845, + "learning_rate": 1.926213760058522e-06, + "loss": 0.95163268, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12139893, + "step": 8755, + "time_per_iteration": 2.4848403930664062 + }, + { + "auxiliary_loss_clip": 0.06329039, + "auxiliary_loss_mlp": 0.01251879, + "balance_loss_clip": 0.06265183, + "balance_loss_mlp": 0.01250204, + "epoch": 0.5264392003607395, + "flos": 65827298206080.0, + "grad_norm": 0.7019882104343015, + "language_loss": 0.5870319, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.66284108, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01678467, + "step": 8756, + "time_per_iteration": 3.275596857070923 + }, + { + "auxiliary_loss_clip": 0.06435139, + "auxiliary_loss_mlp": 0.0126978, + "balance_loss_clip": 0.06280214, + "balance_loss_mlp": 0.01257001, + "epoch": 0.5264993236134075, + "flos": 21038709432960.0, + "grad_norm": 1.5391071607522773, + "language_loss": 0.70246553, + "learning_rate": 1.925435372588913e-06, + "loss": 0.77951479, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12762451, + "step": 8757, + "time_per_iteration": 2.5078463554382324 + }, + { + "auxiliary_loss_clip": 0.06425242, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06274789, + "balance_loss_mlp": 0.01259015, + "epoch": 0.5265594468660755, + "flos": 16623854784000.0, + "grad_norm": 1.5949031044885071, + "language_loss": 0.88366896, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.96063495, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12341309, + "step": 8758, + "time_per_iteration": 2.503643751144409 + }, + { + "auxiliary_loss_clip": 0.06431897, + "auxiliary_loss_mlp": 0.01273559, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01260165, + "epoch": 0.5266195701187434, + "flos": 24141010999680.0, + "grad_norm": 1.3529199811462889, + "language_loss": 0.76677716, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.84383172, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13391113, + "step": 8759, + "time_per_iteration": 4.0746564865112305 + }, + { + "auxiliary_loss_clip": 0.06426352, + "auxiliary_loss_mlp": 0.01272091, + "balance_loss_clip": 0.06278519, + "balance_loss_mlp": 0.01258603, + "epoch": 0.5266796933714114, + "flos": 15848314519680.0, + "grad_norm": 1.866695897182309, + "language_loss": 0.72062105, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.79760551, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1348877, + "step": 8760, + "time_per_iteration": 2.4678292274475098 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253152, + "epoch": 0.5267398166240793, + "flos": 20956377196800.0, + "grad_norm": 2.1261739839163263, + "language_loss": 0.76520377, + "learning_rate": 1.923878631697736e-06, + "loss": 0.84220791, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13140869, + "step": 8761, + "time_per_iteration": 2.5250892639160156 + }, + { + "auxiliary_loss_clip": 0.06431311, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06277812, + "balance_loss_mlp": 0.01256696, + "epoch": 0.5267999398767473, + "flos": 21002763231360.0, + "grad_norm": 1.6289028393625449, + "language_loss": 0.7137605, + "learning_rate": 1.923489453654373e-06, + "loss": 0.79075569, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1151123, + "step": 8762, + "time_per_iteration": 2.50102162361145 + }, + { + "auxiliary_loss_clip": 0.06330161, + "auxiliary_loss_mlp": 0.01253956, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5268600631294152, + "flos": 66867935189760.0, + "grad_norm": 0.9166133094312116, + "language_loss": 0.65129638, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.72713745, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01655579, + "step": 8763, + "time_per_iteration": 3.076136827468872 + }, + { + "auxiliary_loss_clip": 0.06428451, + "auxiliary_loss_mlp": 0.01268489, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01255918, + "epoch": 0.5269201863820833, + "flos": 17171307434880.0, + "grad_norm": 1.6120731347351738, + "language_loss": 0.71481144, + "learning_rate": 1.922711106286265e-06, + "loss": 0.79178083, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12579346, + "step": 8764, + "time_per_iteration": 2.5250110626220703 + }, + { + "auxiliary_loss_clip": 0.06431142, + "auxiliary_loss_mlp": 0.01269659, + "balance_loss_clip": 0.06278007, + "balance_loss_mlp": 0.01256141, + "epoch": 0.5269803096347513, + "flos": 20528963919360.0, + "grad_norm": 1.6456726211241999, + "language_loss": 0.74125087, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.81825888, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13531494, + "step": 8765, + "time_per_iteration": 2.552011251449585 + }, + { + "auxiliary_loss_clip": 0.06432463, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01253076, + "epoch": 0.5270404328874192, + "flos": 27237652416000.0, + "grad_norm": 1.4730640837864142, + "language_loss": 0.8564899, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.9334718, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12640381, + "step": 8766, + "time_per_iteration": 2.5471248626708984 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06278689, + "balance_loss_mlp": 0.01257812, + "epoch": 0.5271005561400872, + "flos": 23116866071040.0, + "grad_norm": 1.6309488802468612, + "language_loss": 0.79294145, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8699789, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.13690186, + "step": 8767, + "time_per_iteration": 2.5700509548187256 + }, + { + "auxiliary_loss_clip": 0.06431086, + "auxiliary_loss_mlp": 0.01269174, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01256532, + "epoch": 0.5271606793927551, + "flos": 22571342064000.0, + "grad_norm": 1.7993411408437945, + "language_loss": 0.73931158, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.81631416, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12646484, + "step": 8768, + "time_per_iteration": 2.5251431465148926 + }, + { + "auxiliary_loss_clip": 0.06428067, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01257174, + "epoch": 0.5272208026454231, + "flos": 18769166069760.0, + "grad_norm": 1.6856667564577028, + "language_loss": 0.74105024, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.81802148, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11883545, + "step": 8769, + "time_per_iteration": 2.518446683883667 + }, + { + "auxiliary_loss_clip": 0.06431002, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06279421, + "balance_loss_mlp": 0.01255172, + "epoch": 0.5272809258980911, + "flos": 20418358129920.0, + "grad_norm": 1.672714058447801, + "language_loss": 0.74041271, + "learning_rate": 1.920376134993436e-06, + "loss": 0.81739843, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1239624, + "step": 8770, + "time_per_iteration": 2.5188913345336914 + }, + { + "auxiliary_loss_clip": 0.06428713, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01259085, + "epoch": 0.5273410491507591, + "flos": 28264271040000.0, + "grad_norm": 1.8244918854449486, + "language_loss": 0.68641269, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.76341033, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11987305, + "step": 8771, + "time_per_iteration": 2.5867247581481934 + }, + { + "auxiliary_loss_clip": 0.06424269, + "auxiliary_loss_mlp": 0.01271661, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01259704, + "epoch": 0.527401172403427, + "flos": 22461658669440.0, + "grad_norm": 11.676913645943259, + "language_loss": 0.7669906, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.84394991, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11938477, + "step": 8772, + "time_per_iteration": 2.5199668407440186 + }, + { + "auxiliary_loss_clip": 0.06429616, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.0627689, + "balance_loss_mlp": 0.01255599, + "epoch": 0.527461295656095, + "flos": 21037158132480.0, + "grad_norm": 2.161876297932061, + "language_loss": 0.66294622, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.73992014, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12176514, + "step": 8773, + "time_per_iteration": 2.5476229190826416 + }, + { + "auxiliary_loss_clip": 0.06430208, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01256643, + "epoch": 0.5275214189087629, + "flos": 26329060667520.0, + "grad_norm": 1.7199176113539936, + "language_loss": 0.86321867, + "learning_rate": 1.91881954765502e-06, + "loss": 0.94019973, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11254883, + "step": 8774, + "time_per_iteration": 2.545171022415161 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271648, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01259525, + "epoch": 0.5275815421614309, + "flos": 20053110182400.0, + "grad_norm": 1.6744248524719214, + "language_loss": 0.80195713, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.87894905, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12121582, + "step": 8775, + "time_per_iteration": 2.544409990310669 + }, + { + "auxiliary_loss_clip": 0.06422298, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01257968, + "epoch": 0.5276416654140988, + "flos": 21438310354560.0, + "grad_norm": 1.5933640173688606, + "language_loss": 0.83310181, + "learning_rate": 1.918041272397012e-06, + "loss": 0.91002852, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1239624, + "step": 8776, + "time_per_iteration": 2.5175352096557617 + }, + { + "auxiliary_loss_clip": 0.06428739, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.06277907, + "balance_loss_mlp": 0.0125867, + "epoch": 0.5277017886667669, + "flos": 17170762383360.0, + "grad_norm": 1.5849666431846519, + "language_loss": 0.67932826, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.7563237, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12127686, + "step": 8777, + "time_per_iteration": 2.5778138637542725 + }, + { + "auxiliary_loss_clip": 0.06429909, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06281164, + "balance_loss_mlp": 0.01253935, + "epoch": 0.5277619119194349, + "flos": 20454262404480.0, + "grad_norm": 1.855602906151282, + "language_loss": 0.82547855, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.90243274, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8778, + "time_per_iteration": 2.571700096130371 + }, + { + "auxiliary_loss_clip": 0.06433128, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06280521, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5278220351721028, + "flos": 24067944639360.0, + "grad_norm": 1.9512823836083997, + "language_loss": 0.79944891, + "learning_rate": 1.916873882856013e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.1217041, + "step": 8779, + "time_per_iteration": 2.562757968902588 + }, + { + "auxiliary_loss_clip": 0.06427805, + "auxiliary_loss_mlp": 0.01263718, + "balance_loss_clip": 0.06278832, + "balance_loss_mlp": 0.01252429, + "epoch": 0.5278821584247708, + "flos": 24649540629120.0, + "grad_norm": 2.3350915047762957, + "language_loss": 0.77251387, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.84942913, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11291504, + "step": 8780, + "time_per_iteration": 2.517606258392334 + }, + { + "auxiliary_loss_clip": 0.0643455, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254507, + "epoch": 0.5279422816774387, + "flos": 35417017848960.0, + "grad_norm": 1.6574386864631518, + "language_loss": 0.69489729, + "learning_rate": 1.916095638898174e-06, + "loss": 0.77191794, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13018799, + "step": 8781, + "time_per_iteration": 2.693525791168213 + }, + { + "auxiliary_loss_clip": 0.06421035, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5280024049301068, + "flos": 22973794024320.0, + "grad_norm": 1.4417281394316688, + "language_loss": 0.7270093, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.80392265, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11254883, + "step": 8782, + "time_per_iteration": 2.5421454906463623 + }, + { + "auxiliary_loss_clip": 0.06428084, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06279479, + "balance_loss_mlp": 0.01255314, + "epoch": 0.5280625281827747, + "flos": 21514143899520.0, + "grad_norm": 1.839654531053583, + "language_loss": 0.68914783, + "learning_rate": 1.915317407666982e-06, + "loss": 0.76610112, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1192627, + "step": 8783, + "time_per_iteration": 4.037707328796387 + }, + { + "auxiliary_loss_clip": 0.06440329, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282043, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5281226514354427, + "flos": 31215534422400.0, + "grad_norm": 1.947626233704344, + "language_loss": 0.69763857, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.77474254, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13793945, + "step": 8784, + "time_per_iteration": 2.6415882110595703 + }, + { + "auxiliary_loss_clip": 0.06436743, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06277036, + "balance_loss_mlp": 0.01256393, + "epoch": 0.5281827746881106, + "flos": 25084039576320.0, + "grad_norm": 1.9575438568521135, + "language_loss": 0.75138849, + "learning_rate": 1.91453918928048e-06, + "loss": 0.82845432, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13458252, + "step": 8785, + "time_per_iteration": 2.5360119342803955 + }, + { + "auxiliary_loss_clip": 0.06430692, + "auxiliary_loss_mlp": 0.01270335, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01257806, + "epoch": 0.5282428979407786, + "flos": 20637515283840.0, + "grad_norm": 2.81532856062796, + "language_loss": 0.83379281, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.91080302, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12518311, + "step": 8786, + "time_per_iteration": 3.923038959503174 + }, + { + "auxiliary_loss_clip": 0.06426571, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.0628151, + "balance_loss_mlp": 0.01255248, + "epoch": 0.5283030211934465, + "flos": 22426005957120.0, + "grad_norm": 2.0503071903036134, + "language_loss": 0.82639015, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.90331495, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10650635, + "step": 8787, + "time_per_iteration": 2.549422025680542 + }, + { + "auxiliary_loss_clip": 0.06423321, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01259932, + "epoch": 0.5283631444461145, + "flos": 23620951434240.0, + "grad_norm": 1.6336970157139816, + "language_loss": 0.83324271, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11260986, + "step": 8788, + "time_per_iteration": 2.4937057495117188 + }, + { + "auxiliary_loss_clip": 0.06426245, + "auxiliary_loss_mlp": 0.01271299, + "balance_loss_clip": 0.06279786, + "balance_loss_mlp": 0.0125886, + "epoch": 0.5284232676987825, + "flos": 32680341573120.0, + "grad_norm": 1.675322731323109, + "language_loss": 0.75004017, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.82701558, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12451172, + "step": 8789, + "time_per_iteration": 2.6138312816619873 + }, + { + "auxiliary_loss_clip": 0.06430633, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06280988, + "balance_loss_mlp": 0.01254139, + "epoch": 0.5284833909514505, + "flos": 26768213516160.0, + "grad_norm": 1.5707088647426293, + "language_loss": 0.70574284, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.78270793, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8790, + "time_per_iteration": 2.5883655548095703 + }, + { + "auxiliary_loss_clip": 0.06427436, + "auxiliary_loss_mlp": 0.01266819, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5285435142041185, + "flos": 22097207335680.0, + "grad_norm": 1.512627214826232, + "language_loss": 0.79474425, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.87168682, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11309814, + "step": 8791, + "time_per_iteration": 4.033270835876465 + }, + { + "auxiliary_loss_clip": 0.06429024, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06280901, + "balance_loss_mlp": 0.01255205, + "epoch": 0.5286036374567864, + "flos": 20381615314560.0, + "grad_norm": 2.07521505612664, + "language_loss": 0.65493345, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.73189247, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11676025, + "step": 8792, + "time_per_iteration": 2.521308183670044 + }, + { + "auxiliary_loss_clip": 0.06423797, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01253415, + "epoch": 0.5286637607094544, + "flos": 24358952269440.0, + "grad_norm": 2.076646851589869, + "language_loss": 0.79861224, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.87549216, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10778809, + "step": 8793, + "time_per_iteration": 2.5511038303375244 + }, + { + "auxiliary_loss_clip": 0.06422493, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256168, + "epoch": 0.5287238839621223, + "flos": 17276295000960.0, + "grad_norm": 2.078436862745294, + "language_loss": 0.85337698, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.93028271, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11901855, + "step": 8794, + "time_per_iteration": 2.4898123741149902 + }, + { + "auxiliary_loss_clip": 0.06434184, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01255284, + "epoch": 0.5287840072147904, + "flos": 17572711219200.0, + "grad_norm": 2.1545808018265427, + "language_loss": 0.67890751, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.75593209, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12982178, + "step": 8795, + "time_per_iteration": 2.5213987827301025 + }, + { + "auxiliary_loss_clip": 0.0642955, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.01257714, + "epoch": 0.5288441304674583, + "flos": 18558100834560.0, + "grad_norm": 1.7521680482784363, + "language_loss": 0.80681872, + "learning_rate": 1.910259223028374e-06, + "loss": 0.88381112, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11968994, + "step": 8796, + "time_per_iteration": 2.4875407218933105 + }, + { + "auxiliary_loss_clip": 0.06428242, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06279264, + "balance_loss_mlp": 0.01255656, + "epoch": 0.5289042537201263, + "flos": 20820935871360.0, + "grad_norm": 1.952583587455058, + "language_loss": 0.69353104, + "learning_rate": 1.909870155310071e-06, + "loss": 0.770491, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12097168, + "step": 8797, + "time_per_iteration": 2.5311903953552246 + }, + { + "auxiliary_loss_clip": 0.06424771, + "auxiliary_loss_mlp": 0.01268361, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01256857, + "epoch": 0.5289643769727942, + "flos": 15739553520000.0, + "grad_norm": 1.4672049002002021, + "language_loss": 0.82371795, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.90064925, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11499023, + "step": 8798, + "time_per_iteration": 3.947748899459839 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01268372, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255181, + "epoch": 0.5290245002254622, + "flos": 19543490449920.0, + "grad_norm": 2.0391495748491133, + "language_loss": 0.71206701, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.78905261, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.13201904, + "step": 8799, + "time_per_iteration": 2.5031862258911133 + }, + { + "auxiliary_loss_clip": 0.06420026, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06277078, + "balance_loss_mlp": 0.01256124, + "epoch": 0.5290846234781301, + "flos": 15820586017920.0, + "grad_norm": 1.9322407735459124, + "language_loss": 0.69337815, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.77025622, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11657715, + "step": 8800, + "time_per_iteration": 2.5130701065063477 + }, + { + "auxiliary_loss_clip": 0.06335981, + "auxiliary_loss_mlp": 0.01252268, + "balance_loss_clip": 0.06272759, + "balance_loss_mlp": 0.01250352, + "epoch": 0.5291447467307981, + "flos": 70076272498560.0, + "grad_norm": 0.8722049049478691, + "language_loss": 0.5706265, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.64650893, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01913452, + "step": 8801, + "time_per_iteration": 3.0075480937957764 + }, + { + "auxiliary_loss_clip": 0.06425781, + "auxiliary_loss_mlp": 0.01269363, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257978, + "epoch": 0.529204869983466, + "flos": 28371396885120.0, + "grad_norm": 1.559087936128458, + "language_loss": 0.64462554, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.72157693, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1138916, + "step": 8802, + "time_per_iteration": 2.568263053894043 + }, + { + "auxiliary_loss_clip": 0.06423493, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06277072, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5292649932361341, + "flos": 33766064853120.0, + "grad_norm": 1.9436732858799899, + "language_loss": 0.69115645, + "learning_rate": 1.907535821289003e-06, + "loss": 0.76808089, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.10980225, + "step": 8803, + "time_per_iteration": 2.637096881866455 + }, + { + "auxiliary_loss_clip": 0.06421783, + "auxiliary_loss_mlp": 0.01270558, + "balance_loss_clip": 0.0627604, + "balance_loss_mlp": 0.01258596, + "epoch": 0.5293251164888021, + "flos": 20453717352960.0, + "grad_norm": 1.815171914881367, + "language_loss": 0.75997305, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.83689642, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11962891, + "step": 8804, + "time_per_iteration": 2.5163068771362305 + }, + { + "auxiliary_loss_clip": 0.0632845, + "auxiliary_loss_mlp": 0.01252381, + "balance_loss_clip": 0.06265265, + "balance_loss_mlp": 0.01250461, + "epoch": 0.52938523974147, + "flos": 66567856590720.0, + "grad_norm": 0.7410273965373205, + "language_loss": 0.52945232, + "learning_rate": 1.906757737841291e-06, + "loss": 0.60526061, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01916504, + "step": 8805, + "time_per_iteration": 3.24060320854187 + }, + { + "auxiliary_loss_clip": 0.06328098, + "auxiliary_loss_mlp": 0.01252617, + "balance_loss_clip": 0.06265187, + "balance_loss_mlp": 0.01250968, + "epoch": 0.529445362994138, + "flos": 67172065983360.0, + "grad_norm": 1.018872897712542, + "language_loss": 0.63735455, + "learning_rate": 1.906368701413693e-06, + "loss": 0.71316171, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01652527, + "step": 8806, + "time_per_iteration": 3.1444826126098633 + }, + { + "auxiliary_loss_clip": 0.06429877, + "auxiliary_loss_mlp": 0.01268417, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01256073, + "epoch": 0.5295054862468059, + "flos": 17755167484800.0, + "grad_norm": 1.837636262170248, + "language_loss": 0.7251606, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.80214357, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12335205, + "step": 8807, + "time_per_iteration": 2.513139247894287 + }, + { + "auxiliary_loss_clip": 0.06424799, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06278958, + "balance_loss_mlp": 0.01257241, + "epoch": 0.529565609499474, + "flos": 11401622519040.0, + "grad_norm": 2.5266289150801295, + "language_loss": 0.69956362, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.77648908, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1050415, + "step": 8808, + "time_per_iteration": 2.472822666168213 + }, + { + "auxiliary_loss_clip": 0.06422195, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06274572, + "balance_loss_mlp": 0.01258861, + "epoch": 0.5296257327521419, + "flos": 17201174215680.0, + "grad_norm": 2.036831994826339, + "language_loss": 0.87141514, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.94833171, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10614014, + "step": 8809, + "time_per_iteration": 2.5245158672332764 + }, + { + "auxiliary_loss_clip": 0.06436493, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01257138, + "epoch": 0.5296858560048099, + "flos": 39972806265600.0, + "grad_norm": 1.6505081453472243, + "language_loss": 0.64378583, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.72085232, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8810, + "time_per_iteration": 2.6857082843780518 + }, + { + "auxiliary_loss_clip": 0.06422746, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06277126, + "balance_loss_mlp": 0.01259012, + "epoch": 0.5297459792574778, + "flos": 20968032913920.0, + "grad_norm": 1.5863211204070509, + "language_loss": 0.68117309, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.75810677, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11608887, + "step": 8811, + "time_per_iteration": 2.5947864055633545 + }, + { + "auxiliary_loss_clip": 0.06326769, + "auxiliary_loss_mlp": 0.01252115, + "balance_loss_clip": 0.06264065, + "balance_loss_mlp": 0.0125052, + "epoch": 0.5298061025101458, + "flos": 66542532658560.0, + "grad_norm": 0.6560344299955198, + "language_loss": 0.53324163, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.60903049, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01597595, + "step": 8812, + "time_per_iteration": 3.2503774166107178 + }, + { + "auxiliary_loss_clip": 0.06327102, + "auxiliary_loss_mlp": 0.01252134, + "balance_loss_clip": 0.06264044, + "balance_loss_mlp": 0.01250548, + "epoch": 0.5298662257628137, + "flos": 67683488578560.0, + "grad_norm": 0.7118690065629296, + "language_loss": 0.56452167, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.64031398, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01586151, + "step": 8813, + "time_per_iteration": 3.211704730987549 + }, + { + "auxiliary_loss_clip": 0.06420116, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01252223, + "epoch": 0.5299263490154817, + "flos": 19652544938880.0, + "grad_norm": 1.6476785970765333, + "language_loss": 0.82062042, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.89745033, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10656738, + "step": 8814, + "time_per_iteration": 2.5407004356384277 + }, + { + "auxiliary_loss_clip": 0.06433088, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.062782, + "balance_loss_mlp": 0.01255646, + "epoch": 0.5299864722681497, + "flos": 22061638477440.0, + "grad_norm": 1.5146312250557674, + "language_loss": 0.85424864, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.93124914, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11322021, + "step": 8815, + "time_per_iteration": 2.511718273162842 + }, + { + "auxiliary_loss_clip": 0.06421779, + "auxiliary_loss_mlp": 0.01265999, + "balance_loss_clip": 0.0627707, + "balance_loss_mlp": 0.01254573, + "epoch": 0.5300465955208177, + "flos": 21770379285120.0, + "grad_norm": 2.2057457770846947, + "language_loss": 0.67210793, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.74898565, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11431885, + "step": 8816, + "time_per_iteration": 2.564680576324463 + }, + { + "auxiliary_loss_clip": 0.06425485, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06278205, + "balance_loss_mlp": 0.01258106, + "epoch": 0.5301067187734857, + "flos": 43006401884160.0, + "grad_norm": 1.5302739112082, + "language_loss": 0.72652006, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.80347115, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1151123, + "step": 8817, + "time_per_iteration": 2.719486951828003 + }, + { + "auxiliary_loss_clip": 0.06425378, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01256957, + "epoch": 0.5301668420261536, + "flos": 20559878876160.0, + "grad_norm": 1.5998738611170542, + "language_loss": 0.65166581, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.72860169, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11242676, + "step": 8818, + "time_per_iteration": 2.573202610015869 + }, + { + "auxiliary_loss_clip": 0.06425599, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06275538, + "balance_loss_mlp": 0.0125378, + "epoch": 0.5302269652788216, + "flos": 17491259450880.0, + "grad_norm": 1.7883158874481297, + "language_loss": 0.75112927, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.82804549, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12249756, + "step": 8819, + "time_per_iteration": 2.4882779121398926 + }, + { + "auxiliary_loss_clip": 0.06426901, + "auxiliary_loss_mlp": 0.01268351, + "balance_loss_clip": 0.06273513, + "balance_loss_mlp": 0.01255995, + "epoch": 0.5302870885314895, + "flos": 14579380287360.0, + "grad_norm": 2.7239673645734905, + "language_loss": 0.82232261, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.89927506, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12353516, + "step": 8820, + "time_per_iteration": 2.5082767009735107 + }, + { + "auxiliary_loss_clip": 0.06421572, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01257437, + "epoch": 0.5303472117841576, + "flos": 23444323027200.0, + "grad_norm": 1.7959737859178544, + "language_loss": 0.72743207, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.80432689, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.10479736, + "step": 8821, + "time_per_iteration": 2.5132317543029785 + }, + { + "auxiliary_loss_clip": 0.06418677, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01255643, + "epoch": 0.5304073350368255, + "flos": 22715294578560.0, + "grad_norm": 1.486709371307985, + "language_loss": 0.74618089, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.82303441, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11035156, + "step": 8822, + "time_per_iteration": 2.528388261795044 + }, + { + "auxiliary_loss_clip": 0.06422541, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06275284, + "balance_loss_mlp": 0.01255094, + "epoch": 0.5304674582894935, + "flos": 27936059397120.0, + "grad_norm": 1.8362514047395362, + "language_loss": 0.67618608, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.75307631, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11401367, + "step": 8823, + "time_per_iteration": 3.9042444229125977 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269944, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5305275815421614, + "flos": 21256860337920.0, + "grad_norm": 1.7650443733670647, + "language_loss": 0.69634396, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.77329719, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11968994, + "step": 8824, + "time_per_iteration": 2.5146212577819824 + }, + { + "auxiliary_loss_clip": 0.06418572, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06275523, + "balance_loss_mlp": 0.01256292, + "epoch": 0.5305877047948294, + "flos": 17608867056000.0, + "grad_norm": 1.7570108593506664, + "language_loss": 0.76559019, + "learning_rate": 1.898977700702689e-06, + "loss": 0.84244382, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1050415, + "step": 8825, + "time_per_iteration": 2.4815242290496826 + }, + { + "auxiliary_loss_clip": 0.06420843, + "auxiliary_loss_mlp": 0.01268607, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01257335, + "epoch": 0.5306478280474973, + "flos": 15200947474560.0, + "grad_norm": 2.5706419514423526, + "language_loss": 0.85959315, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.93648767, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11279297, + "step": 8826, + "time_per_iteration": 3.921194076538086 + }, + { + "auxiliary_loss_clip": 0.06417906, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.01253759, + "epoch": 0.5307079513001653, + "flos": 15346660924800.0, + "grad_norm": 1.4506860249913964, + "language_loss": 0.64565361, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.72248203, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11181641, + "step": 8827, + "time_per_iteration": 2.4920613765716553 + }, + { + "auxiliary_loss_clip": 0.06420277, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01256961, + "epoch": 0.5307680745528333, + "flos": 43554567294720.0, + "grad_norm": 1.8307336922940562, + "language_loss": 0.59537661, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6722641, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11499023, + "step": 8828, + "time_per_iteration": 2.7917306423187256 + }, + { + "auxiliary_loss_clip": 0.06423927, + "auxiliary_loss_mlp": 0.012663, + "balance_loss_clip": 0.06272669, + "balance_loss_mlp": 0.01254725, + "epoch": 0.5308281978055013, + "flos": 20055332315520.0, + "grad_norm": 1.5709125682754386, + "language_loss": 0.81926584, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.89616817, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11578369, + "step": 8829, + "time_per_iteration": 2.606851100921631 + }, + { + "auxiliary_loss_clip": 0.06417149, + "auxiliary_loss_mlp": 0.01263824, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01253316, + "epoch": 0.5308883210581693, + "flos": 20710162373760.0, + "grad_norm": 1.3864012566435717, + "language_loss": 0.78353059, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.86034036, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1050415, + "step": 8830, + "time_per_iteration": 3.954951286315918 + }, + { + "auxiliary_loss_clip": 0.06420083, + "auxiliary_loss_mlp": 0.01268446, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.01256924, + "epoch": 0.5309484443108372, + "flos": 14360684330880.0, + "grad_norm": 2.11171769837039, + "language_loss": 0.81423479, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.89112008, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 8831, + "time_per_iteration": 2.469822883605957 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01266871, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5310085675635052, + "flos": 20016577002240.0, + "grad_norm": 1.695592927900533, + "language_loss": 0.73638004, + "learning_rate": 1.896255043672186e-06, + "loss": 0.81320393, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11071777, + "step": 8832, + "time_per_iteration": 2.527545213699341 + }, + { + "auxiliary_loss_clip": 0.06424195, + "auxiliary_loss_mlp": 0.01266175, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5310686908161731, + "flos": 22133824369920.0, + "grad_norm": 1.9494235860340738, + "language_loss": 0.75823116, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.83513486, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12341309, + "step": 8833, + "time_per_iteration": 2.497962236404419 + }, + { + "auxiliary_loss_clip": 0.06426589, + "auxiliary_loss_mlp": 0.01264835, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01252861, + "epoch": 0.5311288140688412, + "flos": 24724871049600.0, + "grad_norm": 1.6156023907192425, + "language_loss": 0.7400462, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.81696039, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11975098, + "step": 8834, + "time_per_iteration": 2.5790417194366455 + }, + { + "auxiliary_loss_clip": 0.06429796, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01254322, + "epoch": 0.5311889373215091, + "flos": 24104603600640.0, + "grad_norm": 1.6077843194652517, + "language_loss": 0.77900589, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.85597509, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12817383, + "step": 8835, + "time_per_iteration": 2.5299718379974365 + }, + { + "auxiliary_loss_clip": 0.06422241, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5312490605741771, + "flos": 22023386288640.0, + "grad_norm": 1.8854276384026003, + "language_loss": 0.72502893, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.80190396, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12115479, + "step": 8836, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06424102, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01255815, + "epoch": 0.531309183826845, + "flos": 19396561115520.0, + "grad_norm": 1.819661501339542, + "language_loss": 0.81157684, + "learning_rate": 1.894310406375987e-06, + "loss": 0.88850057, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12463379, + "step": 8837, + "time_per_iteration": 2.484968662261963 + }, + { + "auxiliary_loss_clip": 0.06418987, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06274254, + "balance_loss_mlp": 0.01255778, + "epoch": 0.531369307079513, + "flos": 20195679104640.0, + "grad_norm": 1.8987589865078431, + "language_loss": 0.86269474, + "learning_rate": 1.893921490881035e-06, + "loss": 0.93956232, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11981201, + "step": 8838, + "time_per_iteration": 3.9265315532684326 + }, + { + "auxiliary_loss_clip": 0.06418579, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253584, + "epoch": 0.5314294303321809, + "flos": 18886144769280.0, + "grad_norm": 1.6029216559450563, + "language_loss": 0.73087633, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.8077088, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11077881, + "step": 8839, + "time_per_iteration": 2.595414876937866 + }, + { + "auxiliary_loss_clip": 0.06421834, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01253551, + "epoch": 0.531489553584849, + "flos": 23046818457600.0, + "grad_norm": 1.6603149015146987, + "language_loss": 0.76847923, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.84535015, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11712646, + "step": 8840, + "time_per_iteration": 2.543708086013794 + }, + { + "auxiliary_loss_clip": 0.06426372, + "auxiliary_loss_mlp": 0.01267236, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01255291, + "epoch": 0.5315496768375169, + "flos": 19796329745280.0, + "grad_norm": 3.0684588696132553, + "language_loss": 0.7743901, + "learning_rate": 1.892754768590216e-06, + "loss": 0.85132617, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11932373, + "step": 8841, + "time_per_iteration": 2.5301966667175293 + }, + { + "auxiliary_loss_clip": 0.0631949, + "auxiliary_loss_mlp": 0.01253613, + "balance_loss_clip": 0.06256352, + "balance_loss_mlp": 0.01251976, + "epoch": 0.5316098000901849, + "flos": 71044876569600.0, + "grad_norm": 0.6765052539549429, + "language_loss": 0.56618965, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.64192069, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0164032, + "step": 8842, + "time_per_iteration": 3.2740724086761475 + }, + { + "auxiliary_loss_clip": 0.06425814, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06272734, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5316699233428529, + "flos": 16441146956160.0, + "grad_norm": 1.7388474755658287, + "language_loss": 0.73801279, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.81493276, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13317871, + "step": 8843, + "time_per_iteration": 2.5188851356506348 + }, + { + "auxiliary_loss_clip": 0.06319, + "auxiliary_loss_mlp": 0.01253092, + "balance_loss_clip": 0.06256077, + "balance_loss_mlp": 0.01251205, + "epoch": 0.5317300465955208, + "flos": 67443478957440.0, + "grad_norm": 0.8484317442594647, + "language_loss": 0.60991502, + "learning_rate": 1.891588082900145e-06, + "loss": 0.68563592, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01882935, + "step": 8844, + "time_per_iteration": 3.1943981647491455 + }, + { + "auxiliary_loss_clip": 0.06316474, + "auxiliary_loss_mlp": 0.01252227, + "balance_loss_clip": 0.06253788, + "balance_loss_mlp": 0.01250519, + "epoch": 0.5317901698481888, + "flos": 59524095144960.0, + "grad_norm": 0.8355266908782794, + "language_loss": 0.62249273, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.69817972, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.62744141, + "router_z_loss_mlp": 0.01712036, + "step": 8845, + "time_per_iteration": 3.149904727935791 + }, + { + "auxiliary_loss_clip": 0.06421602, + "auxiliary_loss_mlp": 0.01271191, + "balance_loss_clip": 0.06273656, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5318502931008567, + "flos": 19134204382080.0, + "grad_norm": 1.8837935046538667, + "language_loss": 0.7569865, + "learning_rate": 1.890810312970474e-06, + "loss": 0.8339144, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12774658, + "step": 8846, + "time_per_iteration": 2.5158872604370117 + }, + { + "auxiliary_loss_clip": 0.0642429, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06273554, + "balance_loss_mlp": 0.01256838, + "epoch": 0.5319104163535248, + "flos": 24687960526080.0, + "grad_norm": 1.6867562646607668, + "language_loss": 0.75546432, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.83238477, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10913086, + "step": 8847, + "time_per_iteration": 2.5634870529174805 + }, + { + "auxiliary_loss_clip": 0.06415805, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269352, + "balance_loss_mlp": 0.01254823, + "epoch": 0.5319705396061927, + "flos": 19390691329920.0, + "grad_norm": 1.5354205561883685, + "language_loss": 0.87653261, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.95335042, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1114502, + "step": 8848, + "time_per_iteration": 2.4771876335144043 + }, + { + "auxiliary_loss_clip": 0.06423473, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01261564, + "epoch": 0.5320306628588607, + "flos": 18265122633600.0, + "grad_norm": 1.744694135662772, + "language_loss": 0.74510658, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.82208717, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13012695, + "step": 8849, + "time_per_iteration": 2.5036580562591553 + }, + { + "auxiliary_loss_clip": 0.06429593, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06274542, + "balance_loss_mlp": 0.01253761, + "epoch": 0.5320907861115286, + "flos": 23739062163840.0, + "grad_norm": 1.9586489533772713, + "language_loss": 0.79968703, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.87663901, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11853027, + "step": 8850, + "time_per_iteration": 2.5143027305603027 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06276459, + "balance_loss_mlp": 0.01254086, + "epoch": 0.5321509093641966, + "flos": 34503730272000.0, + "grad_norm": 1.273724424531188, + "language_loss": 0.55058682, + "learning_rate": 1.888865960862821e-06, + "loss": 0.62749517, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.1071167, + "step": 8851, + "time_per_iteration": 2.6221299171447754 + }, + { + "auxiliary_loss_clip": 0.06426491, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255844, + "epoch": 0.5322110326168645, + "flos": 20017080126720.0, + "grad_norm": 1.7230657412679744, + "language_loss": 0.69354177, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.77048028, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11517334, + "step": 8852, + "time_per_iteration": 2.483614206314087 + }, + { + "auxiliary_loss_clip": 0.06316812, + "auxiliary_loss_mlp": 0.01252104, + "balance_loss_clip": 0.06254005, + "balance_loss_mlp": 0.01250446, + "epoch": 0.5322711558695326, + "flos": 64650563792640.0, + "grad_norm": 0.7839220079179184, + "language_loss": 0.62548178, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.70117098, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01661682, + "step": 8853, + "time_per_iteration": 3.085580587387085 + }, + { + "auxiliary_loss_clip": 0.06429263, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06274428, + "balance_loss_mlp": 0.01256364, + "epoch": 0.5323312791222005, + "flos": 14944628234880.0, + "grad_norm": 2.314845805246822, + "language_loss": 0.79806542, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.87503386, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.11212158, + "step": 8854, + "time_per_iteration": 2.5530436038970947 + }, + { + "auxiliary_loss_clip": 0.06415577, + "auxiliary_loss_mlp": 0.01266542, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.0125663, + "epoch": 0.5323914023748685, + "flos": 23447593336320.0, + "grad_norm": 2.5938972527955038, + "language_loss": 0.74205482, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.81887597, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.09912109, + "step": 8855, + "time_per_iteration": 2.527981996536255 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.01263629, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5324515256275365, + "flos": 26293324101120.0, + "grad_norm": 4.18366969320272, + "language_loss": 0.64945328, + "learning_rate": 1.886921714110507e-06, + "loss": 0.72628403, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.10266113, + "step": 8856, + "time_per_iteration": 2.5942611694335938 + }, + { + "auxiliary_loss_clip": 0.06428003, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274043, + "balance_loss_mlp": 0.01255177, + "epoch": 0.5325116488802044, + "flos": 26878316181120.0, + "grad_norm": 1.8445625051613121, + "language_loss": 0.77944165, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.85639572, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12231445, + "step": 8857, + "time_per_iteration": 2.551980972290039 + }, + { + "auxiliary_loss_clip": 0.06420985, + "auxiliary_loss_mlp": 0.01266182, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5325717721328724, + "flos": 25891794535680.0, + "grad_norm": 1.6903303041385833, + "language_loss": 0.71116436, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.78803611, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11749268, + "step": 8858, + "time_per_iteration": 2.564082384109497 + }, + { + "auxiliary_loss_clip": 0.0642374, + "auxiliary_loss_mlp": 0.01268133, + "balance_loss_clip": 0.06274494, + "balance_loss_mlp": 0.01255968, + "epoch": 0.5326318953855403, + "flos": 21805864289280.0, + "grad_norm": 3.8992078644613217, + "language_loss": 0.69476694, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.77168566, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12158203, + "step": 8859, + "time_per_iteration": 2.5558056831359863 + }, + { + "auxiliary_loss_clip": 0.06418291, + "auxiliary_loss_mlp": 0.01266588, + "balance_loss_clip": 0.06275187, + "balance_loss_mlp": 0.0125624, + "epoch": 0.5326920186382084, + "flos": 20929193746560.0, + "grad_norm": 1.4322040270296341, + "language_loss": 0.69681478, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.77366364, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10339355, + "step": 8860, + "time_per_iteration": 2.5150671005249023 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01266208, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01255259, + "epoch": 0.5327521418908763, + "flos": 21439735873920.0, + "grad_norm": 1.9652920134152139, + "language_loss": 0.77936381, + "learning_rate": 1.884977574556683e-06, + "loss": 0.85622478, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10955811, + "step": 8861, + "time_per_iteration": 2.527064561843872 + }, + { + "auxiliary_loss_clip": 0.06428909, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.012579, + "epoch": 0.5328122651435443, + "flos": 21766354289280.0, + "grad_norm": 1.487259241409864, + "language_loss": 0.8585394, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.93552685, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11938477, + "step": 8862, + "time_per_iteration": 4.031865358352661 + }, + { + "auxiliary_loss_clip": 0.06431703, + "auxiliary_loss_mlp": 0.01269915, + "balance_loss_clip": 0.06279312, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5328723883962122, + "flos": 18302410500480.0, + "grad_norm": 1.6037650471474167, + "language_loss": 0.61557126, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.69258749, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12866211, + "step": 8863, + "time_per_iteration": 2.499657154083252 + }, + { + "auxiliary_loss_clip": 0.06422713, + "auxiliary_loss_mlp": 0.01268054, + "balance_loss_clip": 0.06278422, + "balance_loss_mlp": 0.01257736, + "epoch": 0.5329325116488802, + "flos": 25382049022080.0, + "grad_norm": 1.8448114340212167, + "language_loss": 0.73693913, + "learning_rate": 1.883811143046377e-06, + "loss": 0.81384677, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10314941, + "step": 8864, + "time_per_iteration": 2.549104928970337 + }, + { + "auxiliary_loss_clip": 0.06424475, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06276639, + "balance_loss_mlp": 0.0125636, + "epoch": 0.5329926349015481, + "flos": 25598984042880.0, + "grad_norm": 1.865165386122464, + "language_loss": 0.64464402, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.72156298, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11065674, + "step": 8865, + "time_per_iteration": 4.099254608154297 + }, + { + "auxiliary_loss_clip": 0.0642702, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06277309, + "balance_loss_mlp": 0.01257874, + "epoch": 0.5330527581542162, + "flos": 22895612565120.0, + "grad_norm": 1.6799514905357744, + "language_loss": 0.78778207, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.86474454, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11346436, + "step": 8866, + "time_per_iteration": 2.505974531173706 + }, + { + "auxiliary_loss_clip": 0.06424611, + "auxiliary_loss_mlp": 0.01266962, + "balance_loss_clip": 0.06276287, + "balance_loss_mlp": 0.01255333, + "epoch": 0.5331128814068841, + "flos": 16031022347520.0, + "grad_norm": 1.850684934112151, + "language_loss": 0.74175781, + "learning_rate": 1.882644751189108e-06, + "loss": 0.81867361, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11633301, + "step": 8867, + "time_per_iteration": 2.5437192916870117 + }, + { + "auxiliary_loss_clip": 0.0642608, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06276974, + "balance_loss_mlp": 0.01254204, + "epoch": 0.5331730046595521, + "flos": 39353461211520.0, + "grad_norm": 1.4678278533937592, + "language_loss": 0.72377831, + "learning_rate": 1.88225596278394e-06, + "loss": 0.80070472, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12353516, + "step": 8868, + "time_per_iteration": 2.6680116653442383 + }, + { + "auxiliary_loss_clip": 0.06425264, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01258345, + "epoch": 0.5332331279122201, + "flos": 24031201824000.0, + "grad_norm": 1.7262272651388555, + "language_loss": 0.78884375, + "learning_rate": 1.881867178843637e-06, + "loss": 0.86578989, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11016846, + "step": 8869, + "time_per_iteration": 3.9937024116516113 + }, + { + "auxiliary_loss_clip": 0.06438692, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06282986, + "balance_loss_mlp": 0.01255434, + "epoch": 0.533293251164888, + "flos": 17135109671040.0, + "grad_norm": 2.017265080243192, + "language_loss": 0.7622692, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.83933091, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1204834, + "step": 8870, + "time_per_iteration": 2.520585536956787 + }, + { + "auxiliary_loss_clip": 0.06435512, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06280903, + "balance_loss_mlp": 0.01260366, + "epoch": 0.533353374417556, + "flos": 22132734266880.0, + "grad_norm": 2.1166188019250316, + "language_loss": 0.76185441, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.83894014, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12713623, + "step": 8871, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.06427529, + "auxiliary_loss_mlp": 0.01272588, + "balance_loss_clip": 0.06279083, + "balance_loss_mlp": 0.01261383, + "epoch": 0.533413497670224, + "flos": 15016185221760.0, + "grad_norm": 1.8709318225271354, + "language_loss": 0.72608036, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11206055, + "step": 8872, + "time_per_iteration": 2.486344337463379 + }, + { + "auxiliary_loss_clip": 0.06426945, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258925, + "epoch": 0.533473620922892, + "flos": 19616095612800.0, + "grad_norm": 1.6405410033387824, + "language_loss": 0.65059078, + "learning_rate": 1.880312088025936e-06, + "loss": 0.72756892, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11956787, + "step": 8873, + "time_per_iteration": 2.4989571571350098 + }, + { + "auxiliary_loss_clip": 0.06430013, + "auxiliary_loss_mlp": 0.01270669, + "balance_loss_clip": 0.06281542, + "balance_loss_mlp": 0.01260113, + "epoch": 0.5335337441755599, + "flos": 14287827605760.0, + "grad_norm": 2.154155286859053, + "language_loss": 0.80397201, + "learning_rate": 1.879923326631099e-06, + "loss": 0.88097882, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.10559082, + "step": 8874, + "time_per_iteration": 2.5248029232025146 + }, + { + "auxiliary_loss_clip": 0.06429289, + "auxiliary_loss_mlp": 0.01270488, + "balance_loss_clip": 0.06281012, + "balance_loss_mlp": 0.01259306, + "epoch": 0.5335938674282279, + "flos": 20821313214720.0, + "grad_norm": 1.9252791788754828, + "language_loss": 0.70199001, + "learning_rate": 1.879534569789582e-06, + "loss": 0.77898782, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11181641, + "step": 8875, + "time_per_iteration": 2.514606475830078 + }, + { + "auxiliary_loss_clip": 0.06327371, + "auxiliary_loss_mlp": 0.01252854, + "balance_loss_clip": 0.06264151, + "balance_loss_mlp": 0.01251167, + "epoch": 0.5336539906808958, + "flos": 71419558101120.0, + "grad_norm": 0.7076326652144627, + "language_loss": 0.59621203, + "learning_rate": 1.879145817516126e-06, + "loss": 0.6720143, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01690674, + "step": 8876, + "time_per_iteration": 3.2623958587646484 + }, + { + "auxiliary_loss_clip": 0.06431912, + "auxiliary_loss_mlp": 0.0127027, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01259833, + "epoch": 0.5337141139335638, + "flos": 20158517018880.0, + "grad_norm": 1.761940945107411, + "language_loss": 0.75235462, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.8293764, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10437012, + "step": 8877, + "time_per_iteration": 4.019563674926758 + }, + { + "auxiliary_loss_clip": 0.06329054, + "auxiliary_loss_mlp": 0.01254827, + "balance_loss_clip": 0.06265914, + "balance_loss_mlp": 0.01253019, + "epoch": 0.5337742371862317, + "flos": 67747624479360.0, + "grad_norm": 0.7353643225564799, + "language_loss": 0.57172877, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.64756757, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01803589, + "step": 8878, + "time_per_iteration": 3.0581912994384766 + }, + { + "auxiliary_loss_clip": 0.06440037, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01260573, + "epoch": 0.5338343604388998, + "flos": 25015794825600.0, + "grad_norm": 1.5270572668187339, + "language_loss": 0.7260288, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.80315328, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11834717, + "step": 8879, + "time_per_iteration": 2.594075918197632 + }, + { + "auxiliary_loss_clip": 0.06432897, + "auxiliary_loss_mlp": 0.01271434, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01259644, + "epoch": 0.5338944836915677, + "flos": 17606728776960.0, + "grad_norm": 2.8683921774089445, + "language_loss": 0.84095323, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.91799653, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11785889, + "step": 8880, + "time_per_iteration": 2.4828426837921143 + }, + { + "auxiliary_loss_clip": 0.06424058, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06279065, + "balance_loss_mlp": 0.01262277, + "epoch": 0.5339546069442357, + "flos": 21730282306560.0, + "grad_norm": 1.3465483600758703, + "language_loss": 0.79582727, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.87279797, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1072998, + "step": 8881, + "time_per_iteration": 2.5683958530426025 + }, + { + "auxiliary_loss_clip": 0.06324948, + "auxiliary_loss_mlp": 0.01252734, + "balance_loss_clip": 0.06261811, + "balance_loss_mlp": 0.01251199, + "epoch": 0.5340147301969036, + "flos": 69741226748160.0, + "grad_norm": 0.7871410050477539, + "language_loss": 0.5924378, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.66821468, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01533508, + "step": 8882, + "time_per_iteration": 3.0768346786499023 + }, + { + "auxiliary_loss_clip": 0.06325522, + "auxiliary_loss_mlp": 0.01253695, + "balance_loss_clip": 0.06262392, + "balance_loss_mlp": 0.01252035, + "epoch": 0.5340748534495716, + "flos": 63896504901120.0, + "grad_norm": 0.885852476410532, + "language_loss": 0.63786471, + "learning_rate": 1.876424680745913e-06, + "loss": 0.7136569, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01663208, + "step": 8883, + "time_per_iteration": 2.967287063598633 + }, + { + "auxiliary_loss_clip": 0.06432307, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5341349767022396, + "flos": 28701872588160.0, + "grad_norm": 2.199844959316804, + "language_loss": 0.82043612, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.89743072, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12200928, + "step": 8884, + "time_per_iteration": 2.5675361156463623 + }, + { + "auxiliary_loss_clip": 0.06425676, + "auxiliary_loss_mlp": 0.01268668, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01257873, + "epoch": 0.5341950999549075, + "flos": 16295265797760.0, + "grad_norm": 1.5488539614491517, + "language_loss": 0.72820723, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.80515063, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10784912, + "step": 8885, + "time_per_iteration": 2.5164196491241455 + }, + { + "auxiliary_loss_clip": 0.06432982, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06277923, + "balance_loss_mlp": 0.01254525, + "epoch": 0.5342552232075756, + "flos": 14360852039040.0, + "grad_norm": 1.8494222651114738, + "language_loss": 0.78934276, + "learning_rate": 1.87525854926798e-06, + "loss": 0.86633611, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11834717, + "step": 8886, + "time_per_iteration": 2.524366855621338 + }, + { + "auxiliary_loss_clip": 0.06429981, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06279354, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5343153464602435, + "flos": 30305517154560.0, + "grad_norm": 1.3913460534471052, + "language_loss": 0.75135863, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.82834035, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12579346, + "step": 8887, + "time_per_iteration": 2.6564323902130127 + }, + { + "auxiliary_loss_clip": 0.06427558, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06278411, + "balance_loss_mlp": 0.0125401, + "epoch": 0.5343754697129115, + "flos": 15601722353280.0, + "grad_norm": 2.357980716065106, + "language_loss": 0.69295096, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.76988232, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11560059, + "step": 8888, + "time_per_iteration": 2.4917025566101074 + }, + { + "auxiliary_loss_clip": 0.06442724, + "auxiliary_loss_mlp": 0.01272933, + "balance_loss_clip": 0.06283408, + "balance_loss_mlp": 0.01260935, + "epoch": 0.5344355929655794, + "flos": 16915239757440.0, + "grad_norm": 1.9387999695924976, + "language_loss": 0.78584576, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.8630023, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12005615, + "step": 8889, + "time_per_iteration": 2.5028741359710693 + }, + { + "auxiliary_loss_clip": 0.06424284, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06276136, + "balance_loss_mlp": 0.01256431, + "epoch": 0.5344957162182474, + "flos": 16803460010880.0, + "grad_norm": 1.9089962398127316, + "language_loss": 0.69733131, + "learning_rate": 1.873703773589102e-06, + "loss": 0.7742579, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 8890, + "time_per_iteration": 2.4705469608306885 + }, + { + "auxiliary_loss_clip": 0.06430273, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01255635, + "epoch": 0.5345558394709153, + "flos": 12709144356480.0, + "grad_norm": 3.2953855429591536, + "language_loss": 0.77688992, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.85387087, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12182617, + "step": 8891, + "time_per_iteration": 2.500333547592163 + }, + { + "auxiliary_loss_clip": 0.06428199, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06281698, + "balance_loss_mlp": 0.01257486, + "epoch": 0.5346159627235834, + "flos": 22461532888320.0, + "grad_norm": 1.516620120390114, + "language_loss": 0.74519014, + "learning_rate": 1.872926414425699e-06, + "loss": 0.82215786, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 8892, + "time_per_iteration": 2.4968128204345703 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01253566, + "epoch": 0.5346760859762513, + "flos": 22421771326080.0, + "grad_norm": 1.6631056082688196, + "language_loss": 0.87902844, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.95594442, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.10742188, + "step": 8893, + "time_per_iteration": 2.5580215454101562 + }, + { + "auxiliary_loss_clip": 0.06429157, + "auxiliary_loss_mlp": 0.01263801, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01253155, + "epoch": 0.5347362092289193, + "flos": 22822043080320.0, + "grad_norm": 1.612055893952936, + "language_loss": 0.72799695, + "learning_rate": 1.872149074536869e-06, + "loss": 0.80492651, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10650635, + "step": 8894, + "time_per_iteration": 2.54834246635437 + }, + { + "auxiliary_loss_clip": 0.06422012, + "auxiliary_loss_mlp": 0.01266432, + "balance_loss_clip": 0.06275687, + "balance_loss_mlp": 0.01254571, + "epoch": 0.5347963324815872, + "flos": 23225794778880.0, + "grad_norm": 1.4320398201671862, + "language_loss": 0.75047934, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.82736373, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11865234, + "step": 8895, + "time_per_iteration": 2.5309391021728516 + }, + { + "auxiliary_loss_clip": 0.06432986, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5348564557342552, + "flos": 22607917171200.0, + "grad_norm": 1.7183644079473714, + "language_loss": 0.77449572, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.8514936, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11181641, + "step": 8896, + "time_per_iteration": 2.5175390243530273 + }, + { + "auxiliary_loss_clip": 0.06424737, + "auxiliary_loss_mlp": 0.01267928, + "balance_loss_clip": 0.06278285, + "balance_loss_mlp": 0.01256639, + "epoch": 0.5349165789869232, + "flos": 18007880999040.0, + "grad_norm": 1.7578614055599853, + "language_loss": 0.79043764, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.86736429, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11297607, + "step": 8897, + "time_per_iteration": 2.5068724155426025 + }, + { + "auxiliary_loss_clip": 0.06429999, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253365, + "epoch": 0.5349767022395912, + "flos": 17164557181440.0, + "grad_norm": 1.7104987912832146, + "language_loss": 0.76011693, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.83706623, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11566162, + "step": 8898, + "time_per_iteration": 2.5468573570251465 + }, + { + "auxiliary_loss_clip": 0.06323466, + "auxiliary_loss_mlp": 0.01262304, + "balance_loss_clip": 0.06260733, + "balance_loss_mlp": 0.01260944, + "epoch": 0.5350368254922592, + "flos": 71014590518400.0, + "grad_norm": 0.8026406428525971, + "language_loss": 0.57916105, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.65501881, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01361847, + "step": 8899, + "time_per_iteration": 3.354367256164551 + }, + { + "auxiliary_loss_clip": 0.06428243, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01255857, + "epoch": 0.5350969487449271, + "flos": 27425265707520.0, + "grad_norm": 1.5056303351191316, + "language_loss": 0.70071346, + "learning_rate": 1.869817171696868e-06, + "loss": 0.77766323, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10882568, + "step": 8900, + "time_per_iteration": 2.596675395965576 + }, + { + "auxiliary_loss_clip": 0.0643241, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06280074, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5351570719975951, + "flos": 19321901527680.0, + "grad_norm": 1.5148336766284718, + "language_loss": 0.71324182, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.79025364, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11346436, + "step": 8901, + "time_per_iteration": 2.526811122894287 + }, + { + "auxiliary_loss_clip": 0.06432061, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06280375, + "balance_loss_mlp": 0.01257377, + "epoch": 0.535217195250263, + "flos": 19834707715200.0, + "grad_norm": 1.961594084549487, + "language_loss": 0.77373689, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.85075164, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1204834, + "step": 8902, + "time_per_iteration": 3.931328773498535 + }, + { + "auxiliary_loss_clip": 0.06422594, + "auxiliary_loss_mlp": 0.01261364, + "balance_loss_clip": 0.0627951, + "balance_loss_mlp": 0.01250188, + "epoch": 0.535277318502931, + "flos": 22134495202560.0, + "grad_norm": 1.5214881410098744, + "language_loss": 0.7052539, + "learning_rate": 1.868651286721281e-06, + "loss": 0.78209347, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1116333, + "step": 8903, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06433277, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.01255426, + "epoch": 0.5353374417555989, + "flos": 25052873057280.0, + "grad_norm": 1.5307499252390009, + "language_loss": 0.72374737, + "learning_rate": 1.86826266833795e-06, + "loss": 0.80075729, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12304688, + "step": 8904, + "time_per_iteration": 3.979325294494629 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06280231, + "balance_loss_mlp": 0.01257961, + "epoch": 0.535397565008267, + "flos": 19394422836480.0, + "grad_norm": 1.7887132092295748, + "language_loss": 0.73359382, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.81059849, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.121521, + "step": 8905, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.06426303, + "auxiliary_loss_mlp": 0.01263381, + "balance_loss_clip": 0.06282683, + "balance_loss_mlp": 0.01252402, + "epoch": 0.5354576882609349, + "flos": 21477736500480.0, + "grad_norm": 1.458955847450215, + "language_loss": 0.83904094, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.91593778, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10980225, + "step": 8906, + "time_per_iteration": 2.5199477672576904 + }, + { + "auxiliary_loss_clip": 0.06430362, + "auxiliary_loss_mlp": 0.01270808, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01258416, + "epoch": 0.5355178115136029, + "flos": 20783857639680.0, + "grad_norm": 1.893504710630849, + "language_loss": 0.74486792, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.82187963, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.1237793, + "step": 8907, + "time_per_iteration": 2.5200021266937256 + }, + { + "auxiliary_loss_clip": 0.06428273, + "auxiliary_loss_mlp": 0.01264992, + "balance_loss_clip": 0.06280483, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5355779347662708, + "flos": 23520827404800.0, + "grad_norm": 1.6955230805298804, + "language_loss": 0.76706243, + "learning_rate": 1.866708244906912e-06, + "loss": 0.84399509, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10772705, + "step": 8908, + "time_per_iteration": 4.040110349655151 + }, + { + "auxiliary_loss_clip": 0.06432807, + "auxiliary_loss_mlp": 0.01271179, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01258835, + "epoch": 0.5356380580189388, + "flos": 20309471349120.0, + "grad_norm": 2.626231250487559, + "language_loss": 0.74318033, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.82022017, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12347412, + "step": 8909, + "time_per_iteration": 2.503324031829834 + }, + { + "auxiliary_loss_clip": 0.06428281, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06279926, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5356981812716068, + "flos": 21368136960000.0, + "grad_norm": 2.2429477917403435, + "language_loss": 0.84013373, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.91709697, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10803223, + "step": 8910, + "time_per_iteration": 2.532768726348877 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01267044, + "balance_loss_clip": 0.06278617, + "balance_loss_mlp": 0.01255152, + "epoch": 0.5357583045242748, + "flos": 23117746538880.0, + "grad_norm": 1.5068539432144845, + "language_loss": 0.82170522, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.89866459, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11895752, + "step": 8911, + "time_per_iteration": 2.530242681503296 + }, + { + "auxiliary_loss_clip": 0.06427851, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01256794, + "epoch": 0.5358184277769428, + "flos": 21148057411200.0, + "grad_norm": 1.7566097539058134, + "language_loss": 0.6953544, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.7723152, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11425781, + "step": 8912, + "time_per_iteration": 2.52546763420105 + }, + { + "auxiliary_loss_clip": 0.06429117, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06281352, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5358785510296107, + "flos": 16286754608640.0, + "grad_norm": 1.7988140692342254, + "language_loss": 0.71504682, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.79199886, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10858154, + "step": 8913, + "time_per_iteration": 2.4723551273345947 + }, + { + "auxiliary_loss_clip": 0.06437049, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06283163, + "balance_loss_mlp": 0.01257883, + "epoch": 0.5359386742822787, + "flos": 16981555864320.0, + "grad_norm": 1.6333944745256754, + "language_loss": 0.72038394, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7974509, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11761475, + "step": 8914, + "time_per_iteration": 2.5807461738586426 + }, + { + "auxiliary_loss_clip": 0.06438086, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01259327, + "epoch": 0.5359987975349466, + "flos": 20819091081600.0, + "grad_norm": 1.7157890571158112, + "language_loss": 0.706487, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.7835896, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12841797, + "step": 8915, + "time_per_iteration": 2.542787790298462 + }, + { + "auxiliary_loss_clip": 0.06428587, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06281634, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5360589207876146, + "flos": 22206429532800.0, + "grad_norm": 1.674776865577312, + "language_loss": 0.75600839, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.83298731, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11383057, + "step": 8916, + "time_per_iteration": 2.5621731281280518 + }, + { + "auxiliary_loss_clip": 0.06429151, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01256995, + "epoch": 0.5361190440402825, + "flos": 31402393027200.0, + "grad_norm": 2.5448267428400655, + "language_loss": 0.72810572, + "learning_rate": 1.863211089308289e-06, + "loss": 0.80508238, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.1151123, + "step": 8917, + "time_per_iteration": 4.027824401855469 + }, + { + "auxiliary_loss_clip": 0.06433325, + "auxiliary_loss_mlp": 0.01268717, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01257195, + "epoch": 0.5361791672929506, + "flos": 16075270103040.0, + "grad_norm": 1.844905450054995, + "language_loss": 0.71658254, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.793603, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11529541, + "step": 8918, + "time_per_iteration": 2.5032598972320557 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06282899, + "balance_loss_mlp": 0.01258933, + "epoch": 0.5362392905456185, + "flos": 20747240605440.0, + "grad_norm": 1.4549229797282903, + "language_loss": 0.75235254, + "learning_rate": 1.862434000299067e-06, + "loss": 0.82937205, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11383057, + "step": 8919, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06430984, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06280042, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5362994137982865, + "flos": 17344539751680.0, + "grad_norm": 10.323313850773834, + "language_loss": 0.71843415, + "learning_rate": 1.862045463611864e-06, + "loss": 0.79540908, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11254883, + "step": 8920, + "time_per_iteration": 2.481144666671753 + }, + { + "auxiliary_loss_clip": 0.06425787, + "auxiliary_loss_mlp": 0.0126502, + "balance_loss_clip": 0.06276651, + "balance_loss_mlp": 0.01253659, + "epoch": 0.5363595370509544, + "flos": 42823819837440.0, + "grad_norm": 1.3389140049198536, + "language_loss": 0.68970168, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.76660967, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11352539, + "step": 8921, + "time_per_iteration": 2.7377495765686035 + }, + { + "auxiliary_loss_clip": 0.06429093, + "auxiliary_loss_mlp": 0.01267258, + "balance_loss_clip": 0.06280531, + "balance_loss_mlp": 0.01255575, + "epoch": 0.5364196603036224, + "flos": 19177990940160.0, + "grad_norm": 2.2769865828018516, + "language_loss": 0.81912661, + "learning_rate": 1.86126840594594e-06, + "loss": 0.89609009, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11676025, + "step": 8922, + "time_per_iteration": 2.491041660308838 + }, + { + "auxiliary_loss_clip": 0.06431051, + "auxiliary_loss_mlp": 0.01267721, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5364797835562904, + "flos": 17936827136640.0, + "grad_norm": 1.913279005224502, + "language_loss": 0.76818264, + "learning_rate": 1.860879884996686e-06, + "loss": 0.84517032, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11175537, + "step": 8923, + "time_per_iteration": 2.502797842025757 + }, + { + "auxiliary_loss_clip": 0.06430578, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06277579, + "balance_loss_mlp": 0.01257052, + "epoch": 0.5365399068089584, + "flos": 30236098446720.0, + "grad_norm": 1.4167756526815838, + "language_loss": 0.70506531, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.78205955, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11791992, + "step": 8924, + "time_per_iteration": 2.5783135890960693 + }, + { + "auxiliary_loss_clip": 0.06433783, + "auxiliary_loss_mlp": 0.01269029, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5366000300616264, + "flos": 24897264825600.0, + "grad_norm": 2.5342740284522516, + "language_loss": 0.87064564, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.9476738, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12231445, + "step": 8925, + "time_per_iteration": 2.555947780609131 + }, + { + "auxiliary_loss_clip": 0.0643315, + "auxiliary_loss_mlp": 0.012686, + "balance_loss_clip": 0.06278683, + "balance_loss_mlp": 0.01256911, + "epoch": 0.5366601533142943, + "flos": 29834610808320.0, + "grad_norm": 1.6615305931190325, + "language_loss": 0.78511882, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.86213624, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11694336, + "step": 8926, + "time_per_iteration": 2.575540781021118 + }, + { + "auxiliary_loss_clip": 0.06420288, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06276788, + "balance_loss_mlp": 0.0125437, + "epoch": 0.5367202765669623, + "flos": 27206821313280.0, + "grad_norm": 1.3335091711279083, + "language_loss": 0.66572356, + "learning_rate": 1.85932585410148e-06, + "loss": 0.74258018, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11004639, + "step": 8927, + "time_per_iteration": 2.574263572692871 + }, + { + "auxiliary_loss_clip": 0.06429082, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125309, + "epoch": 0.5367803998196302, + "flos": 20236153426560.0, + "grad_norm": 1.7727091217622297, + "language_loss": 0.73473167, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.81166756, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11413574, + "step": 8928, + "time_per_iteration": 2.4792275428771973 + }, + { + "auxiliary_loss_clip": 0.06429128, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06278329, + "balance_loss_mlp": 0.01254609, + "epoch": 0.5368405230722982, + "flos": 32161791381120.0, + "grad_norm": 1.7479222402462038, + "language_loss": 0.62972343, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.70666999, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10919189, + "step": 8929, + "time_per_iteration": 2.622292995452881 + }, + { + "auxiliary_loss_clip": 0.06432647, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.0628202, + "balance_loss_mlp": 0.01254433, + "epoch": 0.5369006463249661, + "flos": 26254778423040.0, + "grad_norm": 1.591710131173975, + "language_loss": 0.66400939, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.74098849, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10845947, + "step": 8930, + "time_per_iteration": 2.543949604034424 + }, + { + "auxiliary_loss_clip": 0.06424774, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06279226, + "balance_loss_mlp": 0.01253299, + "epoch": 0.5369607695776342, + "flos": 26218119461760.0, + "grad_norm": 1.4676781117198738, + "language_loss": 0.67308921, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.74998057, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1105957, + "step": 8931, + "time_per_iteration": 2.5630295276641846 + }, + { + "auxiliary_loss_clip": 0.06432625, + "auxiliary_loss_mlp": 0.01268662, + "balance_loss_clip": 0.0628577, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5370208928303021, + "flos": 25015920606720.0, + "grad_norm": 1.565512656212007, + "language_loss": 0.76494187, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.84195477, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12072754, + "step": 8932, + "time_per_iteration": 2.5423011779785156 + }, + { + "auxiliary_loss_clip": 0.0642775, + "auxiliary_loss_mlp": 0.01267942, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.01255723, + "epoch": 0.5370810160829701, + "flos": 31799646034560.0, + "grad_norm": 1.681669184165067, + "language_loss": 0.66588402, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.74284095, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12219238, + "step": 8933, + "time_per_iteration": 2.6461243629455566 + }, + { + "auxiliary_loss_clip": 0.0642833, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.01256515, + "epoch": 0.537141139335638, + "flos": 23849500245120.0, + "grad_norm": 1.5934461108199862, + "language_loss": 0.83294082, + "learning_rate": 1.856606505975565e-06, + "loss": 0.90990818, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 8934, + "time_per_iteration": 2.5241549015045166 + }, + { + "auxiliary_loss_clip": 0.06428687, + "auxiliary_loss_mlp": 0.01267543, + "balance_loss_clip": 0.06283442, + "balance_loss_mlp": 0.01256033, + "epoch": 0.537201262588306, + "flos": 18513685370880.0, + "grad_norm": 1.6222709830765285, + "language_loss": 0.7995823, + "learning_rate": 1.856218049303999e-06, + "loss": 0.87654459, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11517334, + "step": 8935, + "time_per_iteration": 2.5692355632781982 + }, + { + "auxiliary_loss_clip": 0.06432107, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01253556, + "epoch": 0.537261385840974, + "flos": 25669492853760.0, + "grad_norm": 4.395420873174801, + "language_loss": 0.83744997, + "learning_rate": 1.855829598084659e-06, + "loss": 0.91442859, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12200928, + "step": 8936, + "time_per_iteration": 2.53723406791687 + }, + { + "auxiliary_loss_clip": 0.06430986, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06284051, + "balance_loss_mlp": 0.01255458, + "epoch": 0.537321509093642, + "flos": 40744656950400.0, + "grad_norm": 1.238966659536207, + "language_loss": 0.73065245, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.8076278, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11096191, + "step": 8937, + "time_per_iteration": 2.7185041904449463 + }, + { + "auxiliary_loss_clip": 0.06432244, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.01257591, + "epoch": 0.53738163234631, + "flos": 17244248014080.0, + "grad_norm": 2.3423795733880506, + "language_loss": 0.82399505, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.90100974, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11645508, + "step": 8938, + "time_per_iteration": 2.497788906097412 + }, + { + "auxiliary_loss_clip": 0.06440363, + "auxiliary_loss_mlp": 0.01269336, + "balance_loss_clip": 0.06284846, + "balance_loss_mlp": 0.01257505, + "epoch": 0.5374417555989779, + "flos": 12826710034560.0, + "grad_norm": 2.237788663184982, + "language_loss": 0.80566859, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.88276565, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.1184082, + "step": 8939, + "time_per_iteration": 2.506603479385376 + }, + { + "auxiliary_loss_clip": 0.06330699, + "auxiliary_loss_mlp": 0.01256495, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5375018788516459, + "flos": 67275502248960.0, + "grad_norm": 0.6889137998662954, + "language_loss": 0.5233649, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.59923685, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01609802, + "step": 8940, + "time_per_iteration": 3.1455881595611572 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06280527, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5375620021043138, + "flos": 18120080016000.0, + "grad_norm": 1.7572331791906293, + "language_loss": 0.71456778, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.7914663, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1083374, + "step": 8941, + "time_per_iteration": 3.9169673919677734 + }, + { + "auxiliary_loss_clip": 0.06423429, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01256554, + "epoch": 0.5376221253569818, + "flos": 23156166435840.0, + "grad_norm": 1.5985240277338788, + "language_loss": 0.79660439, + "learning_rate": 1.853499006090237e-06, + "loss": 0.87350607, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10174561, + "step": 8942, + "time_per_iteration": 2.5441763401031494 + }, + { + "auxiliary_loss_clip": 0.06433077, + "auxiliary_loss_mlp": 0.01269882, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01258229, + "epoch": 0.5376822486096497, + "flos": 29980240404480.0, + "grad_norm": 1.695957968467341, + "language_loss": 0.7061829, + "learning_rate": 1.853110593448911e-06, + "loss": 0.78321248, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11645508, + "step": 8943, + "time_per_iteration": 2.5876903533935547 + }, + { + "auxiliary_loss_clip": 0.06327454, + "auxiliary_loss_mlp": 0.01255314, + "balance_loss_clip": 0.06264913, + "balance_loss_mlp": 0.0125356, + "epoch": 0.5377423718623178, + "flos": 54188139761280.0, + "grad_norm": 0.7834151101556619, + "language_loss": 0.59688759, + "learning_rate": 1.852722186377645e-06, + "loss": 0.67271525, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01757812, + "step": 8944, + "time_per_iteration": 4.5469114780426025 + }, + { + "auxiliary_loss_clip": 0.06439775, + "auxiliary_loss_mlp": 0.01267766, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5378024951149857, + "flos": 23263585770240.0, + "grad_norm": 2.6705245070619754, + "language_loss": 0.776173, + "learning_rate": 1.852333784891169e-06, + "loss": 0.85324842, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 8945, + "time_per_iteration": 2.61606502532959 + }, + { + "auxiliary_loss_clip": 0.06428292, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06278516, + "balance_loss_mlp": 0.01252883, + "epoch": 0.5378626183676537, + "flos": 24030866407680.0, + "grad_norm": 1.7469475045380867, + "language_loss": 0.68958521, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.76650584, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10888672, + "step": 8946, + "time_per_iteration": 2.6660590171813965 + }, + { + "auxiliary_loss_clip": 0.06427687, + "auxiliary_loss_mlp": 0.0126763, + "balance_loss_clip": 0.06282603, + "balance_loss_mlp": 0.01256704, + "epoch": 0.5379227416203216, + "flos": 27169072248960.0, + "grad_norm": 1.5118478086705984, + "language_loss": 0.77489585, + "learning_rate": 1.851556998731498e-06, + "loss": 0.85184896, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10925293, + "step": 8947, + "time_per_iteration": 2.618797779083252 + }, + { + "auxiliary_loss_clip": 0.06429853, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06282403, + "balance_loss_mlp": 0.01257688, + "epoch": 0.5379828648729896, + "flos": 24688631358720.0, + "grad_norm": 1.962883252611848, + "language_loss": 0.60299599, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.6799823, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11090088, + "step": 8948, + "time_per_iteration": 3.99113392829895 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06282011, + "balance_loss_mlp": 0.01254629, + "epoch": 0.5380429881256577, + "flos": 22528981025280.0, + "grad_norm": 1.6036817147437437, + "language_loss": 0.7965849, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.87354112, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.10803223, + "step": 8949, + "time_per_iteration": 2.5306220054626465 + }, + { + "auxiliary_loss_clip": 0.06424635, + "auxiliary_loss_mlp": 0.01267697, + "balance_loss_clip": 0.06281022, + "balance_loss_mlp": 0.01256796, + "epoch": 0.5381031113783256, + "flos": 26986825618560.0, + "grad_norm": 1.5758786571118277, + "language_loss": 0.78447008, + "learning_rate": 1.850391861746111e-06, + "loss": 0.86139345, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10900879, + "step": 8950, + "time_per_iteration": 2.5665290355682373 + }, + { + "auxiliary_loss_clip": 0.0642289, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01258793, + "epoch": 0.5381632346309936, + "flos": 24761026886400.0, + "grad_norm": 1.6449806756094487, + "language_loss": 0.72907847, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.80599785, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10253906, + "step": 8951, + "time_per_iteration": 2.5389561653137207 + }, + { + "auxiliary_loss_clip": 0.0643057, + "auxiliary_loss_mlp": 0.01265397, + "balance_loss_clip": 0.06280816, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5382233578836615, + "flos": 15565524589440.0, + "grad_norm": 1.8886102084278436, + "language_loss": 0.75767493, + "learning_rate": 1.849615132097085e-06, + "loss": 0.83463454, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10760498, + "step": 8952, + "time_per_iteration": 2.5009233951568604 + }, + { + "auxiliary_loss_clip": 0.06423527, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01254384, + "epoch": 0.5382834811363295, + "flos": 25091838005760.0, + "grad_norm": 1.352822721598185, + "language_loss": 0.79742837, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.87432194, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11456299, + "step": 8953, + "time_per_iteration": 2.5382277965545654 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01253193, + "epoch": 0.5383436043889974, + "flos": 13302983041920.0, + "grad_norm": 1.682075048645487, + "language_loss": 0.80507964, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.88193631, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10473633, + "step": 8954, + "time_per_iteration": 2.5006446838378906 + }, + { + "auxiliary_loss_clip": 0.06425533, + "auxiliary_loss_mlp": 0.01268977, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01258123, + "epoch": 0.5384037276416654, + "flos": 23046063770880.0, + "grad_norm": 2.297323300751636, + "language_loss": 0.77060652, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.84755164, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10858154, + "step": 8955, + "time_per_iteration": 2.5469982624053955 + }, + { + "auxiliary_loss_clip": 0.06422862, + "auxiliary_loss_mlp": 0.01268692, + "balance_loss_clip": 0.06278117, + "balance_loss_mlp": 0.01257624, + "epoch": 0.5384638508943334, + "flos": 20637389502720.0, + "grad_norm": 1.4766809485278785, + "language_loss": 0.78634906, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.86326456, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11071777, + "step": 8956, + "time_per_iteration": 3.9486958980560303 + }, + { + "auxiliary_loss_clip": 0.06328554, + "auxiliary_loss_mlp": 0.01254386, + "balance_loss_clip": 0.0626571, + "balance_loss_mlp": 0.01252584, + "epoch": 0.5385239741470014, + "flos": 66755820026880.0, + "grad_norm": 0.8475755828975666, + "language_loss": 0.63483834, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.71066773, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01797485, + "step": 8957, + "time_per_iteration": 3.0589206218719482 + }, + { + "auxiliary_loss_clip": 0.06326501, + "auxiliary_loss_mlp": 0.01256038, + "balance_loss_clip": 0.06263363, + "balance_loss_mlp": 0.01254215, + "epoch": 0.5385840973996693, + "flos": 64737466076160.0, + "grad_norm": 0.6942778211869604, + "language_loss": 0.51190817, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.58773351, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01818848, + "step": 8958, + "time_per_iteration": 3.1954948902130127 + }, + { + "auxiliary_loss_clip": 0.06433147, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06283388, + "balance_loss_mlp": 0.01255189, + "epoch": 0.5386442206523373, + "flos": 26149161951360.0, + "grad_norm": 1.5085241385719446, + "language_loss": 0.77482343, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.85182357, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11669922, + "step": 8959, + "time_per_iteration": 2.595390558242798 + }, + { + "auxiliary_loss_clip": 0.06429408, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06280766, + "balance_loss_mlp": 0.01255269, + "epoch": 0.5387043439050052, + "flos": 18256401809280.0, + "grad_norm": 2.0832623304514373, + "language_loss": 0.84442693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.92138815, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11437988, + "step": 8960, + "time_per_iteration": 2.459411382675171 + }, + { + "auxiliary_loss_clip": 0.0642896, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06281836, + "balance_loss_mlp": 0.01254495, + "epoch": 0.5387644671576732, + "flos": 29795939349120.0, + "grad_norm": 1.5299241540989073, + "language_loss": 0.78738272, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.86432457, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1072998, + "step": 8961, + "time_per_iteration": 2.6379730701446533 + }, + { + "auxiliary_loss_clip": 0.06425574, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06280299, + "balance_loss_mlp": 0.01254106, + "epoch": 0.5388245904103413, + "flos": 22379661849600.0, + "grad_norm": 1.7063822520278231, + "language_loss": 0.85018182, + "learning_rate": 1.845731828364681e-06, + "loss": 0.92708838, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10980225, + "step": 8962, + "time_per_iteration": 2.495314359664917 + }, + { + "auxiliary_loss_clip": 0.06324032, + "auxiliary_loss_mlp": 0.01253937, + "balance_loss_clip": 0.06261306, + "balance_loss_mlp": 0.01252085, + "epoch": 0.5388847136630092, + "flos": 69827332417920.0, + "grad_norm": 0.7252434381461927, + "language_loss": 0.54196495, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.61774462, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.62792969, + "router_z_loss_mlp": 0.01847839, + "step": 8963, + "time_per_iteration": 3.0685930252075195 + }, + { + "auxiliary_loss_clip": 0.06319527, + "auxiliary_loss_mlp": 0.01253383, + "balance_loss_clip": 0.0625699, + "balance_loss_mlp": 0.01251595, + "epoch": 0.5389448369156772, + "flos": 69844270942080.0, + "grad_norm": 0.7817796987422422, + "language_loss": 0.62972116, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.7054503, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01786804, + "step": 8964, + "time_per_iteration": 3.2163538932800293 + }, + { + "auxiliary_loss_clip": 0.0643357, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06280617, + "balance_loss_mlp": 0.01255462, + "epoch": 0.5390049601683451, + "flos": 31730478888960.0, + "grad_norm": 1.575337207693627, + "language_loss": 0.70121396, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.77821916, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11499023, + "step": 8965, + "time_per_iteration": 2.6127662658691406 + }, + { + "auxiliary_loss_clip": 0.06431293, + "auxiliary_loss_mlp": 0.01269597, + "balance_loss_clip": 0.06281815, + "balance_loss_mlp": 0.01258546, + "epoch": 0.5390650834210131, + "flos": 18119283402240.0, + "grad_norm": 2.027850604452939, + "language_loss": 0.82445288, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.90146178, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11047363, + "step": 8966, + "time_per_iteration": 2.472459554672241 + }, + { + "auxiliary_loss_clip": 0.06426321, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.06281838, + "balance_loss_mlp": 0.01256326, + "epoch": 0.539125206673681, + "flos": 17421798816000.0, + "grad_norm": 2.5704499610569282, + "language_loss": 0.72936428, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.80630052, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10980225, + "step": 8967, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.06424848, + "auxiliary_loss_mlp": 0.01264578, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01254493, + "epoch": 0.539185329926349, + "flos": 22205255575680.0, + "grad_norm": 1.5589784366040595, + "language_loss": 0.81895125, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.89584547, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10083008, + "step": 8968, + "time_per_iteration": 2.5401480197906494 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.0628034, + "balance_loss_mlp": 0.01254118, + "epoch": 0.539245453179017, + "flos": 21440867904000.0, + "grad_norm": 1.4575649765742498, + "language_loss": 0.74243855, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.81938505, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11633301, + "step": 8969, + "time_per_iteration": 2.553879976272583 + }, + { + "auxiliary_loss_clip": 0.06430885, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06278199, + "balance_loss_mlp": 0.01254214, + "epoch": 0.539305576431685, + "flos": 20740322643840.0, + "grad_norm": 2.1595830648072347, + "language_loss": 0.827712, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.90467674, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.1137085, + "step": 8970, + "time_per_iteration": 2.478726863861084 + }, + { + "auxiliary_loss_clip": 0.06422678, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06278254, + "balance_loss_mlp": 0.01253185, + "epoch": 0.5393656996843529, + "flos": 30928467934080.0, + "grad_norm": 1.400352356553148, + "language_loss": 0.75607336, + "learning_rate": 1.842237354749146e-06, + "loss": 0.83293688, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1048584, + "step": 8971, + "time_per_iteration": 2.5901689529418945 + }, + { + "auxiliary_loss_clip": 0.06318198, + "auxiliary_loss_mlp": 0.01253533, + "balance_loss_clip": 0.06255443, + "balance_loss_mlp": 0.0125168, + "epoch": 0.5394258229370209, + "flos": 50332953260160.0, + "grad_norm": 0.8588377208931133, + "language_loss": 0.60451257, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.68022978, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.62841797, + "router_z_loss_mlp": 0.01847839, + "step": 8972, + "time_per_iteration": 3.1413605213165283 + }, + { + "auxiliary_loss_clip": 0.06426257, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5394859461896888, + "flos": 25419169180800.0, + "grad_norm": 1.5980875117754325, + "language_loss": 0.787233, + "learning_rate": 1.841460870485045e-06, + "loss": 0.8641873, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.1126709, + "step": 8973, + "time_per_iteration": 2.5336296558380127 + }, + { + "auxiliary_loss_clip": 0.06433228, + "auxiliary_loss_mlp": 0.01267524, + "balance_loss_clip": 0.06279569, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5395460694423568, + "flos": 25484646746880.0, + "grad_norm": 1.7949926655699973, + "language_loss": 0.7381959, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.81520343, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12078857, + "step": 8974, + "time_per_iteration": 2.5483648777008057 + }, + { + "auxiliary_loss_clip": 0.06318444, + "auxiliary_loss_mlp": 0.01253276, + "balance_loss_clip": 0.06255525, + "balance_loss_mlp": 0.01251373, + "epoch": 0.5396061926950249, + "flos": 53267305317120.0, + "grad_norm": 0.7276638901828621, + "language_loss": 0.50946128, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.58517849, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01899719, + "step": 8975, + "time_per_iteration": 3.125056028366089 + }, + { + "auxiliary_loss_clip": 0.06423691, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06277017, + "balance_loss_mlp": 0.01253215, + "epoch": 0.5396663159476928, + "flos": 26732476949760.0, + "grad_norm": 1.546051077066994, + "language_loss": 0.72722358, + "learning_rate": 1.840296189214344e-06, + "loss": 0.80410993, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11724854, + "step": 8976, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06424834, + "auxiliary_loss_mlp": 0.01268763, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01257999, + "epoch": 0.5397264392003608, + "flos": 23259267285120.0, + "grad_norm": 1.9541916066514684, + "language_loss": 0.70649612, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.78343207, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10766602, + "step": 8977, + "time_per_iteration": 2.5443131923675537 + }, + { + "auxiliary_loss_clip": 0.06428454, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256691, + "epoch": 0.5397865624530287, + "flos": 18299727169920.0, + "grad_norm": 1.8457096410810847, + "language_loss": 0.72901827, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.80597985, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11016846, + "step": 8978, + "time_per_iteration": 2.511715888977051 + }, + { + "auxiliary_loss_clip": 0.06434547, + "auxiliary_loss_mlp": 0.01269171, + "balance_loss_clip": 0.0627895, + "balance_loss_mlp": 0.01256821, + "epoch": 0.5398466857056967, + "flos": 15301742336640.0, + "grad_norm": 1.7083695222951265, + "language_loss": 0.74513042, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.82216758, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12347412, + "step": 8979, + "time_per_iteration": 2.4654295444488525 + }, + { + "auxiliary_loss_clip": 0.06435215, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.062815, + "balance_loss_mlp": 0.0125551, + "epoch": 0.5399068089583646, + "flos": 17827521085440.0, + "grad_norm": 2.1729763122828567, + "language_loss": 0.77298462, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.85001791, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12609863, + "step": 8980, + "time_per_iteration": 2.5131070613861084 + }, + { + "auxiliary_loss_clip": 0.06428653, + "auxiliary_loss_mlp": 0.01266817, + "balance_loss_clip": 0.06278711, + "balance_loss_mlp": 0.01256202, + "epoch": 0.5399669322110326, + "flos": 27389109870720.0, + "grad_norm": 1.7146505379249901, + "language_loss": 0.82213032, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.89908504, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10626221, + "step": 8981, + "time_per_iteration": 4.00026273727417 + }, + { + "auxiliary_loss_clip": 0.06430832, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06279931, + "balance_loss_mlp": 0.01255292, + "epoch": 0.5400270554637006, + "flos": 20455394434560.0, + "grad_norm": 1.8197401655909293, + "language_loss": 0.67626458, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.75323975, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11395264, + "step": 8982, + "time_per_iteration": 2.7018609046936035 + }, + { + "auxiliary_loss_clip": 0.06430931, + "auxiliary_loss_mlp": 0.01272335, + "balance_loss_clip": 0.06282471, + "balance_loss_mlp": 0.0126123, + "epoch": 0.5400871787163686, + "flos": 21696055113600.0, + "grad_norm": 1.5105940902505235, + "language_loss": 0.82925522, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.90628791, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11108398, + "step": 8983, + "time_per_iteration": 4.0147035121917725 + }, + { + "auxiliary_loss_clip": 0.06427681, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5401473019690365, + "flos": 19210163708160.0, + "grad_norm": 2.5381589556683752, + "language_loss": 0.70748949, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.78442466, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11608887, + "step": 8984, + "time_per_iteration": 2.485203742980957 + }, + { + "auxiliary_loss_clip": 0.06436664, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.06283301, + "balance_loss_mlp": 0.01258702, + "epoch": 0.5402074252217045, + "flos": 20632987163520.0, + "grad_norm": 1.6283776116809212, + "language_loss": 0.80336136, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.88043296, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11798096, + "step": 8985, + "time_per_iteration": 2.5176138877868652 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06279361, + "balance_loss_mlp": 0.01255497, + "epoch": 0.5402675484743724, + "flos": 24980519456640.0, + "grad_norm": 1.4261046169392377, + "language_loss": 0.79538441, + "learning_rate": 1.83641431418363e-06, + "loss": 0.87226146, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11053467, + "step": 8986, + "time_per_iteration": 2.528057098388672 + }, + { + "auxiliary_loss_clip": 0.06426872, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06277602, + "balance_loss_mlp": 0.01258636, + "epoch": 0.5403276717270404, + "flos": 19464302741760.0, + "grad_norm": 1.7453745991771563, + "language_loss": 0.77310205, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.85006386, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.10681152, + "step": 8987, + "time_per_iteration": 3.9355413913726807 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01265394, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254147, + "epoch": 0.5403877949797083, + "flos": 18448040096640.0, + "grad_norm": 1.594164869128485, + "language_loss": 0.70988709, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.78680897, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11248779, + "step": 8988, + "time_per_iteration": 2.529665470123291 + }, + { + "auxiliary_loss_clip": 0.06432524, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06283048, + "balance_loss_mlp": 0.0125528, + "epoch": 0.5404479182323764, + "flos": 28300343022720.0, + "grad_norm": 2.353153070088846, + "language_loss": 0.68308997, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.76008058, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11260986, + "step": 8989, + "time_per_iteration": 2.541705846786499 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06277242, + "balance_loss_mlp": 0.01255071, + "epoch": 0.5405080414850444, + "flos": 23373981924480.0, + "grad_norm": 1.5774927452360248, + "language_loss": 0.77866185, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.85559022, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12091064, + "step": 8990, + "time_per_iteration": 2.570016384124756 + }, + { + "auxiliary_loss_clip": 0.06423812, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01254517, + "epoch": 0.5405681647377123, + "flos": 21112907823360.0, + "grad_norm": 1.4794826200904196, + "language_loss": 0.69081038, + "learning_rate": 1.834473608367745e-06, + "loss": 0.76769722, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10357666, + "step": 8991, + "time_per_iteration": 2.491284132003784 + }, + { + "auxiliary_loss_clip": 0.06430428, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06280528, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5406282879903803, + "flos": 20455478288640.0, + "grad_norm": 1.6151673604367662, + "language_loss": 0.76260269, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.83958906, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.11621094, + "step": 8992, + "time_per_iteration": 2.506131649017334 + }, + { + "auxiliary_loss_clip": 0.06429817, + "auxiliary_loss_mlp": 0.01266516, + "balance_loss_clip": 0.06278399, + "balance_loss_mlp": 0.01255871, + "epoch": 0.5406884112430482, + "flos": 14214635464320.0, + "grad_norm": 2.867003800231527, + "language_loss": 0.7616564, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.83861977, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.10644531, + "step": 8993, + "time_per_iteration": 2.5104384422302246 + }, + { + "auxiliary_loss_clip": 0.06425033, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278533, + "balance_loss_mlp": 0.01259024, + "epoch": 0.5407485344957162, + "flos": 23881882648320.0, + "grad_norm": 1.5714876378286171, + "language_loss": 0.70600474, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.78295696, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11151123, + "step": 8994, + "time_per_iteration": 2.557224988937378 + }, + { + "auxiliary_loss_clip": 0.06430587, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06279735, + "balance_loss_mlp": 0.01255397, + "epoch": 0.5408086577483842, + "flos": 23155118259840.0, + "grad_norm": 1.7868138082728735, + "language_loss": 0.7559076, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.83288407, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11657715, + "step": 8995, + "time_per_iteration": 4.038757085800171 + }, + { + "auxiliary_loss_clip": 0.06426084, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06281247, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5408687810010522, + "flos": 18777090280320.0, + "grad_norm": 1.7506118703188027, + "language_loss": 0.73407996, + "learning_rate": 1.832533059471282e-06, + "loss": 0.81100416, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.105896, + "step": 8996, + "time_per_iteration": 2.4787185192108154 + }, + { + "auxiliary_loss_clip": 0.06423852, + "auxiliary_loss_mlp": 0.01266299, + "balance_loss_clip": 0.06280176, + "balance_loss_mlp": 0.01254801, + "epoch": 0.5409289042537201, + "flos": 13886717310720.0, + "grad_norm": 1.8157411884483814, + "language_loss": 0.73422438, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.81112587, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11499023, + "step": 8997, + "time_per_iteration": 2.5067830085754395 + }, + { + "auxiliary_loss_clip": 0.0643085, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06281897, + "balance_loss_mlp": 0.01256802, + "epoch": 0.5409890275063881, + "flos": 14470619287680.0, + "grad_norm": 2.2163933004413625, + "language_loss": 0.72107315, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.79805827, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.10870361, + "step": 8998, + "time_per_iteration": 2.499892234802246 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.0627818, + "balance_loss_mlp": 0.01255281, + "epoch": 0.541049150759056, + "flos": 48987906721920.0, + "grad_norm": 1.4223172525448995, + "language_loss": 0.7060768, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.78298652, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11004639, + "step": 8999, + "time_per_iteration": 2.75883412361145 + }, + { + "auxiliary_loss_clip": 0.06424989, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06280144, + "balance_loss_mlp": 0.01255818, + "epoch": 0.541109274011724, + "flos": 18153007470720.0, + "grad_norm": 3.0241903502045884, + "language_loss": 0.8099103, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.88683468, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11639404, + "step": 9000, + "time_per_iteration": 2.4591987133026123 + }, + { + "auxiliary_loss_clip": 0.06425589, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0628029, + "balance_loss_mlp": 0.01256438, + "epoch": 0.541169397264392, + "flos": 20528921992320.0, + "grad_norm": 1.444857324942775, + "language_loss": 0.73542678, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.81235898, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11193848, + "step": 9001, + "time_per_iteration": 2.5392372608184814 + }, + { + "auxiliary_loss_clip": 0.06428811, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.0627747, + "balance_loss_mlp": 0.01256489, + "epoch": 0.54122952051706, + "flos": 20049630238080.0, + "grad_norm": 2.1661909625933675, + "language_loss": 0.85214329, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.92911184, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11560059, + "step": 9002, + "time_per_iteration": 2.4666826725006104 + }, + { + "auxiliary_loss_clip": 0.06425083, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06280569, + "balance_loss_mlp": 0.01253792, + "epoch": 0.541289643769728, + "flos": 19068223691520.0, + "grad_norm": 1.8644067392145132, + "language_loss": 0.78467226, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.86156201, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10101318, + "step": 9003, + "time_per_iteration": 2.536766767501831 + }, + { + "auxiliary_loss_clip": 0.06424496, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06279116, + "balance_loss_mlp": 0.01253005, + "epoch": 0.5413497670223959, + "flos": 22388801944320.0, + "grad_norm": 1.7504010601062234, + "language_loss": 0.69487125, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.77175444, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1081543, + "step": 9004, + "time_per_iteration": 2.522757053375244 + }, + { + "auxiliary_loss_clip": 0.06323519, + "auxiliary_loss_mlp": 0.01256562, + "balance_loss_clip": 0.0626113, + "balance_loss_mlp": 0.01254622, + "epoch": 0.5414098902750639, + "flos": 70052149722240.0, + "grad_norm": 0.9317133774182984, + "language_loss": 0.58728683, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.66308761, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01937866, + "step": 9005, + "time_per_iteration": 3.227922201156616 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06276372, + "balance_loss_mlp": 0.01254477, + "epoch": 0.5414700135277318, + "flos": 21805445018880.0, + "grad_norm": 2.0206216562473416, + "language_loss": 0.78202778, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.85894328, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.10510254, + "step": 9006, + "time_per_iteration": 2.557199001312256 + }, + { + "auxiliary_loss_clip": 0.06423091, + "auxiliary_loss_mlp": 0.01269943, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01259965, + "epoch": 0.5415301367803999, + "flos": 16913269186560.0, + "grad_norm": 3.052189299631263, + "language_loss": 0.8345896, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.91152, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.09979248, + "step": 9007, + "time_per_iteration": 2.5309536457061768 + }, + { + "auxiliary_loss_clip": 0.06427018, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06280112, + "balance_loss_mlp": 0.01254089, + "epoch": 0.5415902600330678, + "flos": 25711518476160.0, + "grad_norm": 1.8242309219870276, + "language_loss": 0.67383778, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.750763, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11401367, + "step": 9008, + "time_per_iteration": 2.5476038455963135 + }, + { + "auxiliary_loss_clip": 0.0643273, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06281075, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5416503832857358, + "flos": 19214146776960.0, + "grad_norm": 1.9758514689639541, + "language_loss": 0.7415235, + "learning_rate": 1.827488379924234e-06, + "loss": 0.81856364, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11846924, + "step": 9009, + "time_per_iteration": 2.519923448562622 + }, + { + "auxiliary_loss_clip": 0.06433536, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.0628282, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5417105065384037, + "flos": 12718619867520.0, + "grad_norm": 2.008927815850951, + "language_loss": 0.88025904, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.95727038, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11907959, + "step": 9010, + "time_per_iteration": 2.4986653327941895 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266313, + "balance_loss_clip": 0.06279215, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5417706297910717, + "flos": 30343727416320.0, + "grad_norm": 1.9869037800658418, + "language_loss": 0.64700162, + "learning_rate": 1.826712372694122e-06, + "loss": 0.72391802, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10693359, + "step": 9011, + "time_per_iteration": 2.639526605606079 + }, + { + "auxiliary_loss_clip": 0.06426919, + "auxiliary_loss_mlp": 0.0126718, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5418307530437396, + "flos": 29028323295360.0, + "grad_norm": 2.488283502034593, + "language_loss": 0.79704046, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.87398142, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1081543, + "step": 9012, + "time_per_iteration": 2.546048641204834 + }, + { + "auxiliary_loss_clip": 0.06429458, + "auxiliary_loss_mlp": 0.01265294, + "balance_loss_clip": 0.06280975, + "balance_loss_mlp": 0.01254464, + "epoch": 0.5418908762964076, + "flos": 16879125847680.0, + "grad_norm": 2.3471098958204712, + "language_loss": 0.74353266, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.82048023, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10839844, + "step": 9013, + "time_per_iteration": 2.544989585876465 + }, + { + "auxiliary_loss_clip": 0.06429175, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01254806, + "epoch": 0.5419509995490756, + "flos": 18955144206720.0, + "grad_norm": 2.592240526053277, + "language_loss": 0.72416294, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.80111116, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.10852051, + "step": 9014, + "time_per_iteration": 2.4757673740386963 + }, + { + "auxiliary_loss_clip": 0.06427553, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06280749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.5420111228017436, + "flos": 18083630689920.0, + "grad_norm": 1.4576837239395228, + "language_loss": 0.80686474, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.88381469, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11010742, + "step": 9015, + "time_per_iteration": 2.50618839263916 + }, + { + "auxiliary_loss_clip": 0.06436689, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06286176, + "balance_loss_mlp": 0.01259061, + "epoch": 0.5420712460544116, + "flos": 19067678640000.0, + "grad_norm": 2.2120132338352105, + "language_loss": 0.81892127, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.8959893, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11053467, + "step": 9016, + "time_per_iteration": 2.475426197052002 + }, + { + "auxiliary_loss_clip": 0.06424853, + "auxiliary_loss_mlp": 0.01269653, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01259258, + "epoch": 0.5421313693070795, + "flos": 18193020595200.0, + "grad_norm": 1.7396358642065415, + "language_loss": 0.81981838, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.89676344, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10388184, + "step": 9017, + "time_per_iteration": 2.4966297149658203 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5421914925597475, + "flos": 13010969162880.0, + "grad_norm": 1.7307795983641447, + "language_loss": 0.77940953, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.85629702, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11193848, + "step": 9018, + "time_per_iteration": 2.4861438274383545 + }, + { + "auxiliary_loss_clip": 0.0643111, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.06279995, + "balance_loss_mlp": 0.01253557, + "epoch": 0.5422516158124154, + "flos": 46769654856960.0, + "grad_norm": 1.436078593305458, + "language_loss": 0.66629684, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.7432512, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.10772705, + "step": 9019, + "time_per_iteration": 2.793942928314209 + }, + { + "auxiliary_loss_clip": 0.06420586, + "auxiliary_loss_mlp": 0.01266098, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01256627, + "epoch": 0.5423117390650835, + "flos": 31766634725760.0, + "grad_norm": 1.5531318778473993, + "language_loss": 0.69972849, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.77659535, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.0947876, + "step": 9020, + "time_per_iteration": 3.977450132369995 + }, + { + "auxiliary_loss_clip": 0.0642193, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627913, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5423718623177514, + "flos": 27209881987200.0, + "grad_norm": 1.41400284004279, + "language_loss": 0.80270976, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.87961137, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10650635, + "step": 9021, + "time_per_iteration": 2.5875015258789062 + }, + { + "auxiliary_loss_clip": 0.06426784, + "auxiliary_loss_mlp": 0.0126779, + "balance_loss_clip": 0.0628023, + "balance_loss_mlp": 0.01257162, + "epoch": 0.5424319855704194, + "flos": 23552580902400.0, + "grad_norm": 2.7424242746142298, + "language_loss": 0.78868818, + "learning_rate": 1.822444805916788e-06, + "loss": 0.86563396, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10626221, + "step": 9022, + "time_per_iteration": 2.6569435596466064 + }, + { + "auxiliary_loss_clip": 0.06421105, + "auxiliary_loss_mlp": 0.01267956, + "balance_loss_clip": 0.06275026, + "balance_loss_mlp": 0.01257132, + "epoch": 0.5424921088230873, + "flos": 26623003190400.0, + "grad_norm": 2.014349133750916, + "language_loss": 0.82876647, + "learning_rate": 1.822056885403915e-06, + "loss": 0.90565705, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10827637, + "step": 9023, + "time_per_iteration": 4.035135746002197 + }, + { + "auxiliary_loss_clip": 0.06427208, + "auxiliary_loss_mlp": 0.01266773, + "balance_loss_clip": 0.06280831, + "balance_loss_mlp": 0.01256718, + "epoch": 0.5425522320757553, + "flos": 23593600275840.0, + "grad_norm": 1.5793438869499181, + "language_loss": 0.71421236, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.79115218, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10058594, + "step": 9024, + "time_per_iteration": 2.540205717086792 + }, + { + "auxiliary_loss_clip": 0.06424701, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5426123553284232, + "flos": 30600256291200.0, + "grad_norm": 1.6177082091395079, + "language_loss": 0.65074164, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.72763383, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.10601807, + "step": 9025, + "time_per_iteration": 2.6120383739471436 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06278306, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5426724785810912, + "flos": 12500049692160.0, + "grad_norm": 9.095866287209772, + "language_loss": 0.73753297, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.81451309, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10681152, + "step": 9026, + "time_per_iteration": 2.47986102104187 + }, + { + "auxiliary_loss_clip": 0.06430142, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06282182, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5427326018337592, + "flos": 26071273981440.0, + "grad_norm": 2.23504413576904, + "language_loss": 0.78765059, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.8646462, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12432861, + "step": 9027, + "time_per_iteration": 3.9859650135040283 + }, + { + "auxiliary_loss_clip": 0.06320234, + "auxiliary_loss_mlp": 0.01252608, + "balance_loss_clip": 0.06257887, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5427927250864272, + "flos": 66004974789120.0, + "grad_norm": 0.7416092139326844, + "language_loss": 0.56562424, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.64135265, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01921082, + "step": 9028, + "time_per_iteration": 3.155468702316284 + }, + { + "auxiliary_loss_clip": 0.06432774, + "auxiliary_loss_mlp": 0.01272049, + "balance_loss_clip": 0.06283672, + "balance_loss_mlp": 0.01260158, + "epoch": 0.5428528483390952, + "flos": 19981678976640.0, + "grad_norm": 2.1493249613849015, + "language_loss": 0.78262091, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.85966909, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11883545, + "step": 9029, + "time_per_iteration": 2.59745192527771 + }, + { + "auxiliary_loss_clip": 0.06422626, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01260108, + "epoch": 0.5429129715917631, + "flos": 21838288619520.0, + "grad_norm": 1.5330300742008836, + "language_loss": 0.83522928, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.9121654, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10870361, + "step": 9030, + "time_per_iteration": 2.579742670059204 + }, + { + "auxiliary_loss_clip": 0.06426223, + "auxiliary_loss_mlp": 0.01263686, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01252903, + "epoch": 0.5429730948444311, + "flos": 27790178238720.0, + "grad_norm": 1.5430505390577234, + "language_loss": 0.75487745, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.8317765, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10784912, + "step": 9031, + "time_per_iteration": 2.5645737648010254 + }, + { + "auxiliary_loss_clip": 0.06421311, + "auxiliary_loss_mlp": 0.01265953, + "balance_loss_clip": 0.0628026, + "balance_loss_mlp": 0.01256226, + "epoch": 0.543033218097099, + "flos": 26767668464640.0, + "grad_norm": 1.6242541501700514, + "language_loss": 0.85659242, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.933465, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.097229, + "step": 9032, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.06434417, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06282632, + "balance_loss_mlp": 0.01260815, + "epoch": 0.5430933413497671, + "flos": 22681989780480.0, + "grad_norm": 1.5840496509982642, + "language_loss": 0.74130201, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.81836969, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11535645, + "step": 9033, + "time_per_iteration": 2.546196937561035 + }, + { + "auxiliary_loss_clip": 0.06426211, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06281157, + "balance_loss_mlp": 0.01256569, + "epoch": 0.543153464602435, + "flos": 24614307187200.0, + "grad_norm": 1.5750334880362715, + "language_loss": 0.76250172, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.83944499, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11535645, + "step": 9034, + "time_per_iteration": 2.5637965202331543 + }, + { + "auxiliary_loss_clip": 0.0642693, + "auxiliary_loss_mlp": 0.0126457, + "balance_loss_clip": 0.06282238, + "balance_loss_mlp": 0.01254002, + "epoch": 0.543213587855103, + "flos": 19031690511360.0, + "grad_norm": 1.6968779523598936, + "language_loss": 0.84307218, + "learning_rate": 1.817402369770655e-06, + "loss": 0.91998708, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10583496, + "step": 9035, + "time_per_iteration": 4.028722524642944 + }, + { + "auxiliary_loss_clip": 0.063224, + "auxiliary_loss_mlp": 0.01251692, + "balance_loss_clip": 0.06260421, + "balance_loss_mlp": 0.01250003, + "epoch": 0.5432737111077709, + "flos": 65705539824000.0, + "grad_norm": 0.6842717349937131, + "language_loss": 0.55272961, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.62847054, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.61816406, + "router_z_loss_mlp": 0.01693726, + "step": 9036, + "time_per_iteration": 3.117825746536255 + }, + { + "auxiliary_loss_clip": 0.06427496, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06278114, + "balance_loss_mlp": 0.0125423, + "epoch": 0.5433338343604389, + "flos": 22098339365760.0, + "grad_norm": 1.6522952339212897, + "language_loss": 0.75599706, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.83293271, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1184082, + "step": 9037, + "time_per_iteration": 2.520371913909912 + }, + { + "auxiliary_loss_clip": 0.06428872, + "auxiliary_loss_mlp": 0.01263373, + "balance_loss_clip": 0.06282881, + "balance_loss_mlp": 0.01252561, + "epoch": 0.5433939576131068, + "flos": 34680316752000.0, + "grad_norm": 1.5920545337485463, + "language_loss": 0.66775727, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.74467969, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1081543, + "step": 9038, + "time_per_iteration": 2.6492366790771484 + }, + { + "auxiliary_loss_clip": 0.06424891, + "auxiliary_loss_mlp": 0.01265017, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254395, + "epoch": 0.5434540808657748, + "flos": 20309639057280.0, + "grad_norm": 2.8075357913922687, + "language_loss": 0.78373635, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.8606354, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10626221, + "step": 9039, + "time_per_iteration": 2.528156280517578 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06281251, + "balance_loss_mlp": 0.01258677, + "epoch": 0.5435142041184428, + "flos": 23119549401600.0, + "grad_norm": 1.7481925172590123, + "language_loss": 0.76885521, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11029053, + "step": 9040, + "time_per_iteration": 2.5517256259918213 + }, + { + "auxiliary_loss_clip": 0.06319717, + "auxiliary_loss_mlp": 0.01257021, + "balance_loss_clip": 0.06257772, + "balance_loss_mlp": 0.01255075, + "epoch": 0.5435743273711108, + "flos": 64032350768640.0, + "grad_norm": 0.6699998863594594, + "language_loss": 0.52323502, + "learning_rate": 1.815075484268074e-06, + "loss": 0.59900236, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.61865234, + "router_z_loss_mlp": 0.0194397, + "step": 9041, + "time_per_iteration": 3.166306972503662 + }, + { + "auxiliary_loss_clip": 0.06428386, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01254687, + "epoch": 0.5436344506237788, + "flos": 25125897490560.0, + "grad_norm": 1.7575616905304456, + "language_loss": 0.762761, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.83969998, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10821533, + "step": 9042, + "time_per_iteration": 2.5450282096862793 + }, + { + "auxiliary_loss_clip": 0.0642225, + "auxiliary_loss_mlp": 0.01265245, + "balance_loss_clip": 0.06278253, + "balance_loss_mlp": 0.01254176, + "epoch": 0.5436945738764467, + "flos": 19579017381120.0, + "grad_norm": 2.3576554691894054, + "language_loss": 0.6770978, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.75397277, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11065674, + "step": 9043, + "time_per_iteration": 2.5310070514678955 + }, + { + "auxiliary_loss_clip": 0.06421092, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01256065, + "epoch": 0.5437546971291147, + "flos": 21148937879040.0, + "grad_norm": 1.5176966924106092, + "language_loss": 0.84091616, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.91779459, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10699463, + "step": 9044, + "time_per_iteration": 2.4937691688537598 + }, + { + "auxiliary_loss_clip": 0.06427783, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01256056, + "epoch": 0.5438148203817826, + "flos": 25125645928320.0, + "grad_norm": 1.559720453478778, + "language_loss": 0.62531364, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.70227116, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11914062, + "step": 9045, + "time_per_iteration": 2.558842182159424 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01267999, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01257312, + "epoch": 0.5438749436344507, + "flos": 23009614444800.0, + "grad_norm": 1.4475609839642107, + "language_loss": 0.70189548, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.77882719, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10687256, + "step": 9046, + "time_per_iteration": 2.546400785446167 + }, + { + "auxiliary_loss_clip": 0.06422587, + "auxiliary_loss_mlp": 0.01263416, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01253087, + "epoch": 0.5439350668871186, + "flos": 15492458229120.0, + "grad_norm": 1.7829079763234368, + "language_loss": 0.77310658, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.84996659, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10333252, + "step": 9047, + "time_per_iteration": 2.5223042964935303 + }, + { + "auxiliary_loss_clip": 0.06424624, + "auxiliary_loss_mlp": 0.01269137, + "balance_loss_clip": 0.06278106, + "balance_loss_mlp": 0.01257598, + "epoch": 0.5439951901397866, + "flos": 17244164160000.0, + "grad_norm": 2.1796692597227363, + "language_loss": 0.73181236, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.80874991, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11547852, + "step": 9048, + "time_per_iteration": 2.4901275634765625 + }, + { + "auxiliary_loss_clip": 0.06419719, + "auxiliary_loss_mlp": 0.01268414, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256773, + "epoch": 0.5440553133924545, + "flos": 18666945688320.0, + "grad_norm": 2.2913555210162535, + "language_loss": 0.93342638, + "learning_rate": 1.8119733635055076e-06, + "loss": 1.01030767, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11639404, + "step": 9049, + "time_per_iteration": 2.5185091495513916 + }, + { + "auxiliary_loss_clip": 0.0641875, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01257155, + "epoch": 0.5441154366451225, + "flos": 27129813811200.0, + "grad_norm": 1.6778604645700708, + "language_loss": 0.74161297, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.81847489, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10284424, + "step": 9050, + "time_per_iteration": 2.551227331161499 + }, + { + "auxiliary_loss_clip": 0.06424956, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01257684, + "epoch": 0.5441755598977904, + "flos": 26000890951680.0, + "grad_norm": 1.7704942450323604, + "language_loss": 0.67003465, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.74696958, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10852051, + "step": 9051, + "time_per_iteration": 2.586360454559326 + }, + { + "auxiliary_loss_clip": 0.06422283, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06277864, + "balance_loss_mlp": 0.01253629, + "epoch": 0.5442356831504584, + "flos": 32388327694080.0, + "grad_norm": 1.6805683860476124, + "language_loss": 0.68003166, + "learning_rate": 1.810810185460011e-06, + "loss": 0.75689662, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10583496, + "step": 9052, + "time_per_iteration": 2.595308303833008 + }, + { + "auxiliary_loss_clip": 0.0642236, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06275343, + "balance_loss_mlp": 0.01255413, + "epoch": 0.5442958064031264, + "flos": 24170123093760.0, + "grad_norm": 1.9713868762163456, + "language_loss": 0.93283188, + "learning_rate": 1.810422473773436e-06, + "loss": 1.0097276, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9053, + "time_per_iteration": 2.5700409412384033 + }, + { + "auxiliary_loss_clip": 0.06427357, + "auxiliary_loss_mlp": 0.0127068, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01258509, + "epoch": 0.5443559296557944, + "flos": 18769669194240.0, + "grad_norm": 1.9808667763978582, + "language_loss": 0.83683395, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.91381431, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1217041, + "step": 9054, + "time_per_iteration": 2.4873886108398438 + }, + { + "auxiliary_loss_clip": 0.06424912, + "auxiliary_loss_mlp": 0.01271948, + "balance_loss_clip": 0.06277627, + "balance_loss_mlp": 0.01260021, + "epoch": 0.5444160529084624, + "flos": 22638245149440.0, + "grad_norm": 1.9496494567304603, + "language_loss": 0.68541598, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.76238453, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11920166, + "step": 9055, + "time_per_iteration": 2.5629093647003174 + }, + { + "auxiliary_loss_clip": 0.06326497, + "auxiliary_loss_mlp": 0.01261063, + "balance_loss_clip": 0.06264114, + "balance_loss_mlp": 0.01259381, + "epoch": 0.5444761761611303, + "flos": 69693106976640.0, + "grad_norm": 0.7193405715621726, + "language_loss": 0.57599837, + "learning_rate": 1.80925938190531e-06, + "loss": 0.65187401, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01686096, + "step": 9056, + "time_per_iteration": 3.1249008178710938 + }, + { + "auxiliary_loss_clip": 0.06428131, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01255676, + "epoch": 0.5445362994137983, + "flos": 14282922142080.0, + "grad_norm": 1.7879789013056906, + "language_loss": 0.69611216, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.77306819, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11798096, + "step": 9057, + "time_per_iteration": 2.498568296432495 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5445964226664662, + "flos": 28993802613120.0, + "grad_norm": 1.9346963255645138, + "language_loss": 0.75279379, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.8297199, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11578369, + "step": 9058, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06324711, + "auxiliary_loss_mlp": 0.01255513, + "balance_loss_clip": 0.06262248, + "balance_loss_mlp": 0.01253708, + "epoch": 0.5446565459191343, + "flos": 68642323649280.0, + "grad_norm": 0.781118187376451, + "language_loss": 0.62576413, + "learning_rate": 1.808096355133312e-06, + "loss": 0.7015664, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01800537, + "step": 9059, + "time_per_iteration": 4.5610737800598145 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06278148, + "balance_loss_mlp": 0.01257993, + "epoch": 0.5447166691718022, + "flos": 16221989802240.0, + "grad_norm": 1.8006783567998876, + "language_loss": 0.79601544, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.87291771, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10961914, + "step": 9060, + "time_per_iteration": 2.511836290359497 + }, + { + "auxiliary_loss_clip": 0.06426552, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.0125454, + "epoch": 0.5447767924244702, + "flos": 25856225677440.0, + "grad_norm": 1.542760917466334, + "language_loss": 0.80138546, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.87831336, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11700439, + "step": 9061, + "time_per_iteration": 2.5398924350738525 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06280909, + "balance_loss_mlp": 0.01255221, + "epoch": 0.5448369156771381, + "flos": 19682998698240.0, + "grad_norm": 1.6196021204279303, + "language_loss": 0.87203825, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.94895482, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10772705, + "step": 9062, + "time_per_iteration": 4.0366997718811035 + }, + { + "auxiliary_loss_clip": 0.06433238, + "auxiliary_loss_mlp": 0.01268748, + "balance_loss_clip": 0.0628314, + "balance_loss_mlp": 0.01256392, + "epoch": 0.5448970389298061, + "flos": 19287925896960.0, + "grad_norm": 1.7163800985020743, + "language_loss": 0.82674021, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.90376008, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12359619, + "step": 9063, + "time_per_iteration": 2.5397801399230957 + }, + { + "auxiliary_loss_clip": 0.06429115, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 0.0628127, + "balance_loss_mlp": 0.01257264, + "epoch": 0.544957162182474, + "flos": 20997270789120.0, + "grad_norm": 1.590898869425655, + "language_loss": 0.63855612, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.71554273, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1227417, + "step": 9064, + "time_per_iteration": 2.511350631713867 + }, + { + "auxiliary_loss_clip": 0.06432661, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06282693, + "balance_loss_mlp": 0.01251863, + "epoch": 0.545017285435142, + "flos": 25381671678720.0, + "grad_norm": 1.596100575558465, + "language_loss": 0.80746907, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.88443542, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12115479, + "step": 9065, + "time_per_iteration": 2.589707136154175 + }, + { + "auxiliary_loss_clip": 0.06425799, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06280494, + "balance_loss_mlp": 0.01255916, + "epoch": 0.54507740868781, + "flos": 19140661146240.0, + "grad_norm": 1.9404249818077939, + "language_loss": 0.78152055, + "learning_rate": 1.805382881379827e-06, + "loss": 0.85844183, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10412598, + "step": 9066, + "time_per_iteration": 2.5037317276000977 + }, + { + "auxiliary_loss_clip": 0.06434928, + "auxiliary_loss_mlp": 0.01268701, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256714, + "epoch": 0.545137531940478, + "flos": 26256958629120.0, + "grad_norm": 1.5302055737642422, + "language_loss": 0.76331961, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.84035593, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11993408, + "step": 9067, + "time_per_iteration": 4.019241571426392 + }, + { + "auxiliary_loss_clip": 0.06438933, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06285474, + "balance_loss_mlp": 0.01255685, + "epoch": 0.545197655193146, + "flos": 37563880435200.0, + "grad_norm": 1.8087199149855477, + "language_loss": 0.62992573, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.70699894, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12701416, + "step": 9068, + "time_per_iteration": 2.6678848266601562 + }, + { + "auxiliary_loss_clip": 0.06424262, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06280495, + "balance_loss_mlp": 0.01256163, + "epoch": 0.5452577784458139, + "flos": 26038430380800.0, + "grad_norm": 1.5391820181686233, + "language_loss": 0.72328687, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.80020058, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10949707, + "step": 9069, + "time_per_iteration": 2.555837631225586 + }, + { + "auxiliary_loss_clip": 0.06424727, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01256723, + "epoch": 0.5453179016984819, + "flos": 17644729403520.0, + "grad_norm": 1.699483734463513, + "language_loss": 0.74651837, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.82343948, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10662842, + "step": 9070, + "time_per_iteration": 2.493806838989258 + }, + { + "auxiliary_loss_clip": 0.06424981, + "auxiliary_loss_mlp": 0.01264741, + "balance_loss_clip": 0.06277809, + "balance_loss_mlp": 0.01253839, + "epoch": 0.5453780249511498, + "flos": 23222524469760.0, + "grad_norm": 1.8987434929949667, + "language_loss": 0.61238426, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.68928152, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10900879, + "step": 9071, + "time_per_iteration": 2.522620677947998 + }, + { + "auxiliary_loss_clip": 0.06331067, + "auxiliary_loss_mlp": 0.01252658, + "balance_loss_clip": 0.06269144, + "balance_loss_mlp": 0.01250867, + "epoch": 0.5454381482038179, + "flos": 68719163443200.0, + "grad_norm": 0.6892933067721945, + "language_loss": 0.57065922, + "learning_rate": 1.80305733435899e-06, + "loss": 0.64649647, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.01786804, + "step": 9072, + "time_per_iteration": 3.235288381576538 + }, + { + "auxiliary_loss_clip": 0.06422395, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06280763, + "balance_loss_mlp": 0.01257424, + "epoch": 0.5454982714564858, + "flos": 13265569393920.0, + "grad_norm": 1.8411374110080903, + "language_loss": 0.69644904, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.77335626, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10906982, + "step": 9073, + "time_per_iteration": 2.476053237915039 + }, + { + "auxiliary_loss_clip": 0.06421326, + "auxiliary_loss_mlp": 0.01272164, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01261477, + "epoch": 0.5455583947091538, + "flos": 21842439396480.0, + "grad_norm": 1.836952800264558, + "language_loss": 0.71413183, + "learning_rate": 1.802282211606627e-06, + "loss": 0.79106677, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10687256, + "step": 9074, + "time_per_iteration": 3.981220006942749 + }, + { + "auxiliary_loss_clip": 0.06424403, + "auxiliary_loss_mlp": 0.01266647, + "balance_loss_clip": 0.06278551, + "balance_loss_mlp": 0.01255364, + "epoch": 0.5456185179618217, + "flos": 17822489840640.0, + "grad_norm": 1.975994190229167, + "language_loss": 0.68697762, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.76388818, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.112854, + "step": 9075, + "time_per_iteration": 2.506155490875244 + }, + { + "auxiliary_loss_clip": 0.06425694, + "auxiliary_loss_mlp": 0.01265713, + "balance_loss_clip": 0.06281726, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5456786412144897, + "flos": 21075787664640.0, + "grad_norm": 1.6135772994791406, + "language_loss": 0.80784404, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.88475811, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10778809, + "step": 9076, + "time_per_iteration": 2.538940906524658 + }, + { + "auxiliary_loss_clip": 0.06430642, + "auxiliary_loss_mlp": 0.01272688, + "balance_loss_clip": 0.06283286, + "balance_loss_mlp": 0.01261005, + "epoch": 0.5457387644671576, + "flos": 23301712177920.0, + "grad_norm": 1.7804219771063188, + "language_loss": 0.80408549, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.88111883, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11682129, + "step": 9077, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.06424201, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06278477, + "balance_loss_mlp": 0.0125698, + "epoch": 0.5457988877198257, + "flos": 21623575731840.0, + "grad_norm": 1.8316897806182997, + "language_loss": 0.67871404, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.75563186, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1060791, + "step": 9078, + "time_per_iteration": 2.5634307861328125 + }, + { + "auxiliary_loss_clip": 0.06428619, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.0628078, + "balance_loss_mlp": 0.01256722, + "epoch": 0.5458590109724936, + "flos": 23768174257920.0, + "grad_norm": 2.0367985655242116, + "language_loss": 0.81582344, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8927964, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9079, + "time_per_iteration": 2.563260078430176 + }, + { + "auxiliary_loss_clip": 0.06434448, + "auxiliary_loss_mlp": 0.01267346, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01255341, + "epoch": 0.5459191342251616, + "flos": 24430928526720.0, + "grad_norm": 1.7111364231373303, + "language_loss": 0.76216662, + "learning_rate": 1.799957023759277e-06, + "loss": 0.83918452, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12011719, + "step": 9080, + "time_per_iteration": 2.538072347640991 + }, + { + "auxiliary_loss_clip": 0.06429628, + "auxiliary_loss_mlp": 0.0126983, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.0125816, + "epoch": 0.5459792574778296, + "flos": 23629756112640.0, + "grad_norm": 1.9762884364861095, + "language_loss": 0.83489871, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.91189325, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11669922, + "step": 9081, + "time_per_iteration": 2.583111047744751 + }, + { + "auxiliary_loss_clip": 0.06430145, + "auxiliary_loss_mlp": 0.01267495, + "balance_loss_clip": 0.0628006, + "balance_loss_mlp": 0.01256552, + "epoch": 0.5460393807304975, + "flos": 19141583541120.0, + "grad_norm": 2.327386206353707, + "language_loss": 0.70079756, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.77777398, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.10943604, + "step": 9082, + "time_per_iteration": 2.5038371086120605 + }, + { + "auxiliary_loss_clip": 0.06421287, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06277952, + "balance_loss_mlp": 0.01253959, + "epoch": 0.5460995039831655, + "flos": 35927308414080.0, + "grad_norm": 1.8952773157154152, + "language_loss": 0.66865891, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.74552357, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11224365, + "step": 9083, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.06418573, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256019, + "epoch": 0.5461596272358334, + "flos": 26766242945280.0, + "grad_norm": 1.5423197483893423, + "language_loss": 0.7895304, + "learning_rate": 1.798407050044766e-06, + "loss": 0.86638033, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10406494, + "step": 9084, + "time_per_iteration": 2.5392911434173584 + }, + { + "auxiliary_loss_clip": 0.06427852, + "auxiliary_loss_mlp": 0.01262899, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01252004, + "epoch": 0.5462197504885015, + "flos": 20892870201600.0, + "grad_norm": 1.8818428979315067, + "language_loss": 0.75159836, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.82850587, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.10900879, + "step": 9085, + "time_per_iteration": 2.5238590240478516 + }, + { + "auxiliary_loss_clip": 0.06428534, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01252995, + "epoch": 0.5462798737411694, + "flos": 25810887818880.0, + "grad_norm": 1.69825848629267, + "language_loss": 0.74606055, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.82299185, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1161499, + "step": 9086, + "time_per_iteration": 2.5416669845581055 + }, + { + "auxiliary_loss_clip": 0.06424639, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06277122, + "balance_loss_mlp": 0.01255834, + "epoch": 0.5463399969938374, + "flos": 25782027287040.0, + "grad_norm": 1.4075791244754594, + "language_loss": 0.76979077, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.84671181, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9087, + "time_per_iteration": 2.5764284133911133 + }, + { + "auxiliary_loss_clip": 0.0642488, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_clip": 0.06278133, + "balance_loss_mlp": 0.01258088, + "epoch": 0.5464001202465053, + "flos": 18849234245760.0, + "grad_norm": 1.6014949266825944, + "language_loss": 0.77368462, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.85064179, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12744141, + "step": 9088, + "time_per_iteration": 2.4971888065338135 + }, + { + "auxiliary_loss_clip": 0.06317829, + "auxiliary_loss_mlp": 0.01258554, + "balance_loss_clip": 0.062563, + "balance_loss_mlp": 0.0125685, + "epoch": 0.5464602434991733, + "flos": 69070281978240.0, + "grad_norm": 0.7120973935253039, + "language_loss": 0.57630938, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.6520732, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01708984, + "step": 9089, + "time_per_iteration": 3.251268148422241 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01270687, + "balance_loss_clip": 0.06279282, + "balance_loss_mlp": 0.01258945, + "epoch": 0.5465203667518412, + "flos": 27566870307840.0, + "grad_norm": 1.7671189132091156, + "language_loss": 0.77121699, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.84822339, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11743164, + "step": 9090, + "time_per_iteration": 2.5513298511505127 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06277205, + "balance_loss_mlp": 0.01257268, + "epoch": 0.5465804900045093, + "flos": 21215757110400.0, + "grad_norm": 1.8390444270451474, + "language_loss": 0.73801088, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.81499445, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12670898, + "step": 9091, + "time_per_iteration": 2.5593018531799316 + }, + { + "auxiliary_loss_clip": 0.06426038, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06278463, + "balance_loss_mlp": 0.01255948, + "epoch": 0.5466406132571772, + "flos": 22495005394560.0, + "grad_norm": 3.020884161734631, + "language_loss": 0.77827132, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.85521269, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12164307, + "step": 9092, + "time_per_iteration": 2.5000102519989014 + }, + { + "auxiliary_loss_clip": 0.06431385, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06280962, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5467007365098452, + "flos": 17681598000000.0, + "grad_norm": 2.033807673433485, + "language_loss": 0.75258666, + "learning_rate": 1.794920057818476e-06, + "loss": 0.82956254, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11627197, + "step": 9093, + "time_per_iteration": 2.5118560791015625 + }, + { + "auxiliary_loss_clip": 0.06426246, + "auxiliary_loss_mlp": 0.01271687, + "balance_loss_clip": 0.06277527, + "balance_loss_mlp": 0.01258634, + "epoch": 0.5467608597625132, + "flos": 15703146120960.0, + "grad_norm": 3.7072671758327993, + "language_loss": 0.69514894, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.77212822, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.13067627, + "step": 9094, + "time_per_iteration": 2.471296787261963 + }, + { + "auxiliary_loss_clip": 0.06427498, + "auxiliary_loss_mlp": 0.01268457, + "balance_loss_clip": 0.06281194, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5468209830151811, + "flos": 24319106853120.0, + "grad_norm": 3.067574771902978, + "language_loss": 0.68405867, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.76101816, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10467529, + "step": 9095, + "time_per_iteration": 2.559969186782837 + }, + { + "auxiliary_loss_clip": 0.06427877, + "auxiliary_loss_mlp": 0.01266121, + "balance_loss_clip": 0.06280283, + "balance_loss_mlp": 0.01255058, + "epoch": 0.5468811062678491, + "flos": 29173575548160.0, + "grad_norm": 1.4017188918581747, + "language_loss": 0.67021394, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.747154, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11065674, + "step": 9096, + "time_per_iteration": 2.5755646228790283 + }, + { + "auxiliary_loss_clip": 0.06321621, + "auxiliary_loss_mlp": 0.01252605, + "balance_loss_clip": 0.06259765, + "balance_loss_mlp": 0.01250808, + "epoch": 0.546941229520517, + "flos": 67885078302720.0, + "grad_norm": 0.7312259601273227, + "language_loss": 0.57564938, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.65139174, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01792908, + "step": 9097, + "time_per_iteration": 3.239208698272705 + }, + { + "auxiliary_loss_clip": 0.06323195, + "auxiliary_loss_mlp": 0.01252523, + "balance_loss_clip": 0.06261444, + "balance_loss_mlp": 0.01250845, + "epoch": 0.5470013527731851, + "flos": 58286578993920.0, + "grad_norm": 0.8922489191245683, + "language_loss": 0.64733016, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.72308731, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01681519, + "step": 9098, + "time_per_iteration": 4.485429763793945 + }, + { + "auxiliary_loss_clip": 0.06427541, + "auxiliary_loss_mlp": 0.01271404, + "balance_loss_clip": 0.06279691, + "balance_loss_mlp": 0.0125937, + "epoch": 0.547061476025853, + "flos": 22972494286080.0, + "grad_norm": 1.4988253633991158, + "language_loss": 0.73256373, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.80955321, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12042236, + "step": 9099, + "time_per_iteration": 2.5771172046661377 + }, + { + "auxiliary_loss_clip": 0.06428638, + "auxiliary_loss_mlp": 0.01265011, + "balance_loss_clip": 0.06282665, + "balance_loss_mlp": 0.01254712, + "epoch": 0.547121599278521, + "flos": 29975502648960.0, + "grad_norm": 1.9003011025398133, + "language_loss": 0.73232269, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.80925912, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10296631, + "step": 9100, + "time_per_iteration": 2.613353967666626 + }, + { + "auxiliary_loss_clip": 0.06426845, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06282172, + "balance_loss_mlp": 0.01256376, + "epoch": 0.5471817225311889, + "flos": 36543760502400.0, + "grad_norm": 3.16405552040578, + "language_loss": 0.68177283, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.75872165, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11663818, + "step": 9101, + "time_per_iteration": 2.645268440246582 + }, + { + "auxiliary_loss_clip": 0.06429439, + "auxiliary_loss_mlp": 0.01267587, + "balance_loss_clip": 0.06282283, + "balance_loss_mlp": 0.01256482, + "epoch": 0.5472418457838569, + "flos": 25782278849280.0, + "grad_norm": 1.6236525701759785, + "language_loss": 0.78028667, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.85725689, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 9102, + "time_per_iteration": 4.018383264541626 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01265935, + "balance_loss_clip": 0.06284064, + "balance_loss_mlp": 0.01255659, + "epoch": 0.5473019690365248, + "flos": 27894453045120.0, + "grad_norm": 1.4050316255430886, + "language_loss": 0.72370696, + "learning_rate": 1.791046361258413e-06, + "loss": 0.80061954, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1027832, + "step": 9103, + "time_per_iteration": 2.613557815551758 + }, + { + "auxiliary_loss_clip": 0.06427938, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06282217, + "balance_loss_mlp": 0.01257237, + "epoch": 0.5473620922891929, + "flos": 57644551411200.0, + "grad_norm": 1.2696818989696173, + "language_loss": 0.65471172, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.73167711, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11352539, + "step": 9104, + "time_per_iteration": 2.8648996353149414 + }, + { + "auxiliary_loss_clip": 0.0643408, + "auxiliary_loss_mlp": 0.01271697, + "balance_loss_clip": 0.06284557, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5474222155418608, + "flos": 19360069862400.0, + "grad_norm": 1.73787664165883, + "language_loss": 0.8214826, + "learning_rate": 1.790271716558888e-06, + "loss": 0.89854038, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12536621, + "step": 9105, + "time_per_iteration": 2.5110819339752197 + }, + { + "auxiliary_loss_clip": 0.06424334, + "auxiliary_loss_mlp": 0.01267412, + "balance_loss_clip": 0.06280238, + "balance_loss_mlp": 0.01256474, + "epoch": 0.5474823387945288, + "flos": 25127700353280.0, + "grad_norm": 1.5738849579324676, + "language_loss": 0.80505264, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.88197005, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10943604, + "step": 9106, + "time_per_iteration": 2.545797824859619 + }, + { + "auxiliary_loss_clip": 0.0642664, + "auxiliary_loss_mlp": 0.01267343, + "balance_loss_clip": 0.06280842, + "balance_loss_mlp": 0.01256334, + "epoch": 0.5475424620471967, + "flos": 18009977351040.0, + "grad_norm": 1.8936776188065845, + "language_loss": 0.69983113, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.77677101, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11010742, + "step": 9107, + "time_per_iteration": 3.930511474609375 + }, + { + "auxiliary_loss_clip": 0.06431143, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06281775, + "balance_loss_mlp": 0.01252438, + "epoch": 0.5476025852998647, + "flos": 22315819438080.0, + "grad_norm": 1.6441057037047366, + "language_loss": 0.63668221, + "learning_rate": 1.789109809193197e-06, + "loss": 0.71363103, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11309814, + "step": 9108, + "time_per_iteration": 2.548469305038452 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06281575, + "balance_loss_mlp": 0.01254632, + "epoch": 0.5476627085525327, + "flos": 20126679667200.0, + "grad_norm": 1.6544017163405356, + "language_loss": 0.75096864, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.82789409, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10174561, + "step": 9109, + "time_per_iteration": 2.505537748336792 + }, + { + "auxiliary_loss_clip": 0.06426554, + "auxiliary_loss_mlp": 0.01271245, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259235, + "epoch": 0.5477228318052006, + "flos": 17718382742400.0, + "grad_norm": 1.7609925306613563, + "language_loss": 0.78101015, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.85798812, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.12005615, + "step": 9110, + "time_per_iteration": 2.5898001194000244 + }, + { + "auxiliary_loss_clip": 0.0642444, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01253948, + "epoch": 0.5477829550578687, + "flos": 25856057969280.0, + "grad_norm": 1.4117567478996924, + "language_loss": 0.71281165, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.78970265, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10699463, + "step": 9111, + "time_per_iteration": 2.5514800548553467 + }, + { + "auxiliary_loss_clip": 0.06428348, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06282744, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5478430783105366, + "flos": 23046399187200.0, + "grad_norm": 1.7318252125729088, + "language_loss": 0.71129775, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.7882387, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1171875, + "step": 9112, + "time_per_iteration": 2.5733911991119385 + }, + { + "auxiliary_loss_clip": 0.06428306, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.0125412, + "epoch": 0.5479032015632046, + "flos": 16076821403520.0, + "grad_norm": 1.865243038866792, + "language_loss": 0.88150853, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.95844346, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1105957, + "step": 9113, + "time_per_iteration": 4.03569483757019 + }, + { + "auxiliary_loss_clip": 0.06427854, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.0628054, + "balance_loss_mlp": 0.01254171, + "epoch": 0.5479633248158725, + "flos": 24285382784640.0, + "grad_norm": 1.9056802782338742, + "language_loss": 0.73404038, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.81097698, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9114, + "time_per_iteration": 2.552778959274292 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5480234480685405, + "flos": 26365216504320.0, + "grad_norm": 1.4540698273743113, + "language_loss": 0.72457099, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.80148405, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10662842, + "step": 9115, + "time_per_iteration": 2.5838403701782227 + }, + { + "auxiliary_loss_clip": 0.06436512, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06284098, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5480835713212084, + "flos": 22061722331520.0, + "grad_norm": 1.7541916767056687, + "language_loss": 0.72373956, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.80078137, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.1161499, + "step": 9116, + "time_per_iteration": 2.5292439460754395 + }, + { + "auxiliary_loss_clip": 0.06426133, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5481436945738765, + "flos": 25308018339840.0, + "grad_norm": 1.941043285146296, + "language_loss": 0.76906073, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.84599322, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9117, + "time_per_iteration": 2.5854122638702393 + }, + { + "auxiliary_loss_clip": 0.06421119, + "auxiliary_loss_mlp": 0.01264207, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01253532, + "epoch": 0.5482038178265444, + "flos": 33588807540480.0, + "grad_norm": 1.613198613591587, + "language_loss": 0.62954283, + "learning_rate": 1.785237306671674e-06, + "loss": 0.7063961, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10675049, + "step": 9118, + "time_per_iteration": 2.61136531829834 + }, + { + "auxiliary_loss_clip": 0.06429429, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.0125436, + "epoch": 0.5482639410792124, + "flos": 19032235562880.0, + "grad_norm": 1.6774564392555322, + "language_loss": 0.79138243, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.86833954, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11920166, + "step": 9119, + "time_per_iteration": 2.5309953689575195 + }, + { + "auxiliary_loss_clip": 0.06425598, + "auxiliary_loss_mlp": 0.01271106, + "balance_loss_clip": 0.06281713, + "balance_loss_mlp": 0.0126033, + "epoch": 0.5483240643318803, + "flos": 25417282464000.0, + "grad_norm": 1.5630724809093546, + "language_loss": 0.82719064, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.9041577, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10772705, + "step": 9120, + "time_per_iteration": 2.551790952682495 + }, + { + "auxiliary_loss_clip": 0.06432922, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01253292, + "epoch": 0.5483841875845483, + "flos": 21472705255680.0, + "grad_norm": 1.7308751336861314, + "language_loss": 0.80248237, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.87946028, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11566162, + "step": 9121, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.06429829, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06280297, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5484443108372163, + "flos": 24753060748800.0, + "grad_norm": 1.8214688446413962, + "language_loss": 0.6171329, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.69410121, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11352539, + "step": 9122, + "time_per_iteration": 2.536548614501953 + }, + { + "auxiliary_loss_clip": 0.06426375, + "auxiliary_loss_mlp": 0.01268013, + "balance_loss_clip": 0.06283108, + "balance_loss_mlp": 0.0125729, + "epoch": 0.5485044340898843, + "flos": 25382594073600.0, + "grad_norm": 1.6758320366866328, + "language_loss": 0.71812153, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7950654, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1071167, + "step": 9123, + "time_per_iteration": 2.563128709793091 + }, + { + "auxiliary_loss_clip": 0.06422795, + "auxiliary_loss_mlp": 0.01264644, + "balance_loss_clip": 0.06277866, + "balance_loss_mlp": 0.01254839, + "epoch": 0.5485645573425523, + "flos": 12646140485760.0, + "grad_norm": 2.0499300220900367, + "language_loss": 0.83466411, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.91153848, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.09802246, + "step": 9124, + "time_per_iteration": 2.4774932861328125 + }, + { + "auxiliary_loss_clip": 0.06423289, + "auxiliary_loss_mlp": 0.01272789, + "balance_loss_clip": 0.06280372, + "balance_loss_mlp": 0.01262054, + "epoch": 0.5486246805952202, + "flos": 28336918129920.0, + "grad_norm": 1.5704023496451165, + "language_loss": 0.80787551, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.88483626, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10736084, + "step": 9125, + "time_per_iteration": 2.6640827655792236 + }, + { + "auxiliary_loss_clip": 0.06429766, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.0125558, + "epoch": 0.5486848038478882, + "flos": 16805598289920.0, + "grad_norm": 1.778522251586277, + "language_loss": 0.74475932, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.82172436, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1114502, + "step": 9126, + "time_per_iteration": 2.4920494556427 + }, + { + "auxiliary_loss_clip": 0.0643461, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06284419, + "balance_loss_mlp": 0.01260383, + "epoch": 0.5487449271005561, + "flos": 17241606610560.0, + "grad_norm": 2.5065680491325217, + "language_loss": 0.66843152, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.74549675, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11535645, + "step": 9127, + "time_per_iteration": 2.498995304107666 + }, + { + "auxiliary_loss_clip": 0.0642729, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06281507, + "balance_loss_mlp": 0.01257072, + "epoch": 0.5488050503532241, + "flos": 17345462146560.0, + "grad_norm": 1.8347258108428224, + "language_loss": 0.83430481, + "learning_rate": 1.781365618532181e-06, + "loss": 0.91127241, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1239624, + "step": 9128, + "time_per_iteration": 2.4851553440093994 + }, + { + "auxiliary_loss_clip": 0.06423862, + "auxiliary_loss_mlp": 0.01267411, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01256032, + "epoch": 0.548865173605892, + "flos": 17245044627840.0, + "grad_norm": 1.9721748285442382, + "language_loss": 0.73992771, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.81684041, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1138916, + "step": 9129, + "time_per_iteration": 2.5088050365448 + }, + { + "auxiliary_loss_clip": 0.06436306, + "auxiliary_loss_mlp": 0.0126816, + "balance_loss_clip": 0.0628598, + "balance_loss_mlp": 0.01256108, + "epoch": 0.5489252968585601, + "flos": 17462398919040.0, + "grad_norm": 2.1982698674747745, + "language_loss": 0.63327444, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.7103191, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12054443, + "step": 9130, + "time_per_iteration": 2.4861414432525635 + }, + { + "auxiliary_loss_clip": 0.06431893, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06281481, + "balance_loss_mlp": 0.01255046, + "epoch": 0.548985420111228, + "flos": 26330653895040.0, + "grad_norm": 1.729948569228587, + "language_loss": 0.63358611, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.71057326, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 9131, + "time_per_iteration": 2.589580535888672 + }, + { + "auxiliary_loss_clip": 0.0643028, + "auxiliary_loss_mlp": 0.01268323, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01255955, + "epoch": 0.549045543363896, + "flos": 18699034602240.0, + "grad_norm": 1.7539544854272515, + "language_loss": 0.75148702, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.82847303, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12353516, + "step": 9132, + "time_per_iteration": 2.461970329284668 + }, + { + "auxiliary_loss_clip": 0.06429279, + "auxiliary_loss_mlp": 0.01266105, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5491056666165639, + "flos": 24724284071040.0, + "grad_norm": 2.6052413777049144, + "language_loss": 0.8162328, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.89318669, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.10870361, + "step": 9133, + "time_per_iteration": 2.5799684524536133 + }, + { + "auxiliary_loss_clip": 0.06426433, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06280407, + "balance_loss_mlp": 0.01259691, + "epoch": 0.5491657898692319, + "flos": 21582849847680.0, + "grad_norm": 1.8788464104374898, + "language_loss": 0.70385146, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.78082585, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11328125, + "step": 9134, + "time_per_iteration": 2.5116565227508545 + }, + { + "auxiliary_loss_clip": 0.06431407, + "auxiliary_loss_mlp": 0.01267106, + "balance_loss_clip": 0.06281983, + "balance_loss_mlp": 0.01256062, + "epoch": 0.5492259131219, + "flos": 50487653825280.0, + "grad_norm": 2.3217483044436955, + "language_loss": 0.61379695, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.69078213, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11035156, + "step": 9135, + "time_per_iteration": 2.8019859790802 + }, + { + "auxiliary_loss_clip": 0.06430922, + "auxiliary_loss_mlp": 0.01266434, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01254591, + "epoch": 0.5492860363745679, + "flos": 25126316760960.0, + "grad_norm": 1.8569102400294533, + "language_loss": 0.72833902, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.80531251, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11846924, + "step": 9136, + "time_per_iteration": 2.5313796997070312 + }, + { + "auxiliary_loss_clip": 0.06434008, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255551, + "epoch": 0.5493461596272359, + "flos": 22639670668800.0, + "grad_norm": 2.4335907064216302, + "language_loss": 0.6873585, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.76437736, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12329102, + "step": 9137, + "time_per_iteration": 2.606400489807129 + }, + { + "auxiliary_loss_clip": 0.06325421, + "auxiliary_loss_mlp": 0.01260391, + "balance_loss_clip": 0.06263588, + "balance_loss_mlp": 0.01258753, + "epoch": 0.5494062828799038, + "flos": 66169486281600.0, + "grad_norm": 0.7309885412732349, + "language_loss": 0.65176189, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.72762001, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.0164032, + "step": 9138, + "time_per_iteration": 4.603189945220947 + }, + { + "auxiliary_loss_clip": 0.06431855, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01253803, + "epoch": 0.5494664061325718, + "flos": 21112362771840.0, + "grad_norm": 1.7352131741027665, + "language_loss": 0.75659418, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.83356863, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11785889, + "step": 9139, + "time_per_iteration": 2.5063250064849854 + }, + { + "auxiliary_loss_clip": 0.06427477, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06281833, + "balance_loss_mlp": 0.01257599, + "epoch": 0.5495265293852397, + "flos": 14397846416640.0, + "grad_norm": 2.090947018102217, + "language_loss": 0.71453607, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.79149961, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11273193, + "step": 9140, + "time_per_iteration": 2.516493558883667 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06279987, + "balance_loss_mlp": 0.01254623, + "epoch": 0.5495866526379077, + "flos": 25554945922560.0, + "grad_norm": 1.591757169874098, + "language_loss": 0.76439172, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.84131408, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.1105957, + "step": 9141, + "time_per_iteration": 4.032621383666992 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06278077, + "balance_loss_mlp": 0.01257648, + "epoch": 0.5496467758905756, + "flos": 21322421758080.0, + "grad_norm": 1.9135284052459163, + "language_loss": 0.75301933, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.82990575, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10577393, + "step": 9142, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.06433351, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01259895, + "epoch": 0.5497068991432437, + "flos": 22239021571200.0, + "grad_norm": 1.7111366793556597, + "language_loss": 0.77014959, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.84720296, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12091064, + "step": 9143, + "time_per_iteration": 2.516505002975464 + }, + { + "auxiliary_loss_clip": 0.06424481, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06278251, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5497670223959116, + "flos": 18485076401280.0, + "grad_norm": 3.356687572137957, + "language_loss": 0.79973668, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.87666219, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11663818, + "step": 9144, + "time_per_iteration": 2.4832475185394287 + }, + { + "auxiliary_loss_clip": 0.0642961, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06281358, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5498271456485796, + "flos": 29212750131840.0, + "grad_norm": 1.7313830940317911, + "language_loss": 0.7154156, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.79239666, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11346436, + "step": 9145, + "time_per_iteration": 2.6261048316955566 + }, + { + "auxiliary_loss_clip": 0.06426725, + "auxiliary_loss_mlp": 0.01264568, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5498872689012475, + "flos": 34833032017920.0, + "grad_norm": 1.5682468167397778, + "language_loss": 0.70529747, + "learning_rate": 1.774398678985076e-06, + "loss": 0.78221035, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10369873, + "step": 9146, + "time_per_iteration": 4.087557315826416 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01264014, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01253923, + "epoch": 0.5499473921539155, + "flos": 25929124329600.0, + "grad_norm": 2.0128119517228305, + "language_loss": 0.64188051, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.71871173, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10095215, + "step": 9147, + "time_per_iteration": 2.5406603813171387 + }, + { + "auxiliary_loss_clip": 0.06424303, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06279408, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5500075154065835, + "flos": 22280334433920.0, + "grad_norm": 1.893989099652022, + "language_loss": 0.81534255, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.89224386, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1071167, + "step": 9148, + "time_per_iteration": 2.5051376819610596 + }, + { + "auxiliary_loss_clip": 0.06424436, + "auxiliary_loss_mlp": 0.01270935, + "balance_loss_clip": 0.06277981, + "balance_loss_mlp": 0.0125992, + "epoch": 0.5500676386592515, + "flos": 28044946177920.0, + "grad_norm": 1.7460739337347344, + "language_loss": 0.7916007, + "learning_rate": 1.773237789559453e-06, + "loss": 0.86855441, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9149, + "time_per_iteration": 2.5586931705474854 + }, + { + "auxiliary_loss_clip": 0.0642364, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01253852, + "epoch": 0.5501277619119195, + "flos": 23921602283520.0, + "grad_norm": 2.0079288501902965, + "language_loss": 0.7263124, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.80319625, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10888672, + "step": 9150, + "time_per_iteration": 2.5097196102142334 + }, + { + "auxiliary_loss_clip": 0.06428004, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06278474, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5501878851645874, + "flos": 20930199995520.0, + "grad_norm": 1.7516173490285718, + "language_loss": 0.74991822, + "learning_rate": 1.772463906245477e-06, + "loss": 0.82685369, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12103271, + "step": 9151, + "time_per_iteration": 2.4953532218933105 + }, + { + "auxiliary_loss_clip": 0.06421181, + "auxiliary_loss_mlp": 0.01264237, + "balance_loss_clip": 0.06275992, + "balance_loss_mlp": 0.01253317, + "epoch": 0.5502480084172554, + "flos": 20671155498240.0, + "grad_norm": 1.7180580365194615, + "language_loss": 0.76128006, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.83813429, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10925293, + "step": 9152, + "time_per_iteration": 2.5041630268096924 + }, + { + "auxiliary_loss_clip": 0.06418908, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06276076, + "balance_loss_mlp": 0.01254336, + "epoch": 0.5503081316699233, + "flos": 26439792238080.0, + "grad_norm": 3.86516963702514, + "language_loss": 0.82636946, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.90320837, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10650635, + "step": 9153, + "time_per_iteration": 4.000823259353638 + }, + { + "auxiliary_loss_clip": 0.06419568, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01254682, + "epoch": 0.5503682549225913, + "flos": 30637208741760.0, + "grad_norm": 1.7185020713354737, + "language_loss": 0.7442615, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.82112032, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11633301, + "step": 9154, + "time_per_iteration": 2.619478225708008 + }, + { + "auxiliary_loss_clip": 0.06431979, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01256273, + "epoch": 0.5504283781752592, + "flos": 22572096750720.0, + "grad_norm": 1.5448619232700234, + "language_loss": 0.73359931, + "learning_rate": 1.770916243273199e-06, + "loss": 0.81059402, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11212158, + "step": 9155, + "time_per_iteration": 2.5512940883636475 + }, + { + "auxiliary_loss_clip": 0.0632084, + "auxiliary_loss_mlp": 0.01252943, + "balance_loss_clip": 0.06258567, + "balance_loss_mlp": 0.01251311, + "epoch": 0.5504885014279273, + "flos": 67918634663040.0, + "grad_norm": 0.7176527357407121, + "language_loss": 0.5550307, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.63076854, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01634216, + "step": 9156, + "time_per_iteration": 3.3401191234588623 + }, + { + "auxiliary_loss_clip": 0.06423487, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06277417, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5505486246805952, + "flos": 22455705029760.0, + "grad_norm": 1.7228062733410818, + "language_loss": 0.82601535, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.90289015, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09881592, + "step": 9157, + "time_per_iteration": 2.5331945419311523 + }, + { + "auxiliary_loss_clip": 0.06433383, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06279938, + "balance_loss_mlp": 0.01255885, + "epoch": 0.5506087479332632, + "flos": 26914220455680.0, + "grad_norm": 2.384583042502796, + "language_loss": 0.7632947, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.84030461, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11743164, + "step": 9158, + "time_per_iteration": 2.5622854232788086 + }, + { + "auxiliary_loss_clip": 0.06422579, + "auxiliary_loss_mlp": 0.01265094, + "balance_loss_clip": 0.06281133, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5506688711859311, + "flos": 22936967354880.0, + "grad_norm": 1.858566635879154, + "language_loss": 0.70421213, + "learning_rate": 1.769368719290979e-06, + "loss": 0.78108883, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.09967041, + "step": 9159, + "time_per_iteration": 2.5299885272979736 + }, + { + "auxiliary_loss_clip": 0.06426555, + "auxiliary_loss_mlp": 0.01265176, + "balance_loss_clip": 0.06279982, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5507289944385991, + "flos": 29614111989120.0, + "grad_norm": 1.5102709537150474, + "language_loss": 0.68438101, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7612983, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11065674, + "step": 9160, + "time_per_iteration": 2.5797348022460938 + }, + { + "auxiliary_loss_clip": 0.06423666, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06278166, + "balance_loss_mlp": 0.01252774, + "epoch": 0.5507891176912671, + "flos": 15338736714240.0, + "grad_norm": 1.8978617290593418, + "language_loss": 0.7231009, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.79998016, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11474609, + "step": 9161, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.06425308, + "auxiliary_loss_mlp": 0.01270177, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01259472, + "epoch": 0.5508492409439351, + "flos": 26585547615360.0, + "grad_norm": 4.143741197260591, + "language_loss": 0.69514179, + "learning_rate": 1.768208168081359e-06, + "loss": 0.77209663, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10717773, + "step": 9162, + "time_per_iteration": 2.601036548614502 + }, + { + "auxiliary_loss_clip": 0.06422161, + "auxiliary_loss_mlp": 0.01271792, + "balance_loss_clip": 0.06278013, + "balance_loss_mlp": 0.01261164, + "epoch": 0.5509093641966031, + "flos": 25449832575360.0, + "grad_norm": 1.6789972101454846, + "language_loss": 0.85959709, + "learning_rate": 1.767821335237733e-06, + "loss": 0.93653667, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10638428, + "step": 9163, + "time_per_iteration": 2.539546489715576 + }, + { + "auxiliary_loss_clip": 0.06425934, + "auxiliary_loss_mlp": 0.0126949, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01258856, + "epoch": 0.550969487449271, + "flos": 18704652825600.0, + "grad_norm": 1.572244133846192, + "language_loss": 0.81101871, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.88797295, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10638428, + "step": 9164, + "time_per_iteration": 2.5266709327697754 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271715, + "balance_loss_clip": 0.06278498, + "balance_loss_mlp": 0.01260026, + "epoch": 0.551029610701939, + "flos": 22714959162240.0, + "grad_norm": 1.8760540237074659, + "language_loss": 0.73664248, + "learning_rate": 1.767047695977863e-06, + "loss": 0.81363511, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11688232, + "step": 9165, + "time_per_iteration": 2.511892318725586 + }, + { + "auxiliary_loss_clip": 0.06419477, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01258479, + "epoch": 0.5510897339546069, + "flos": 12425138542080.0, + "grad_norm": 2.0479120482719084, + "language_loss": 0.79496598, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.87185252, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10687256, + "step": 9166, + "time_per_iteration": 2.5217325687408447 + }, + { + "auxiliary_loss_clip": 0.06426241, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06279847, + "balance_loss_mlp": 0.01257232, + "epoch": 0.5511498572072749, + "flos": 18776545228800.0, + "grad_norm": 2.094065158330193, + "language_loss": 0.77047074, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.84742099, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11560059, + "step": 9167, + "time_per_iteration": 2.5210516452789307 + }, + { + "auxiliary_loss_clip": 0.06422734, + "auxiliary_loss_mlp": 0.01276612, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01264995, + "epoch": 0.5512099804599428, + "flos": 19579436651520.0, + "grad_norm": 1.8110306936777156, + "language_loss": 0.80698925, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.88398266, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11627197, + "step": 9168, + "time_per_iteration": 2.5044801235198975 + }, + { + "auxiliary_loss_clip": 0.06426235, + "auxiliary_loss_mlp": 0.01266078, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5512701037126109, + "flos": 26252053165440.0, + "grad_norm": 1.768039916500128, + "language_loss": 0.6941396, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.77106273, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10876465, + "step": 9169, + "time_per_iteration": 2.5712435245513916 + }, + { + "auxiliary_loss_clip": 0.06426435, + "auxiliary_loss_mlp": 0.01277267, + "balance_loss_clip": 0.06284146, + "balance_loss_mlp": 0.01267092, + "epoch": 0.5513302269652788, + "flos": 21951997009920.0, + "grad_norm": 1.7919633768432253, + "language_loss": 0.85238504, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.92942202, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10174561, + "step": 9170, + "time_per_iteration": 2.6517226696014404 + }, + { + "auxiliary_loss_clip": 0.06339835, + "auxiliary_loss_mlp": 0.01252247, + "balance_loss_clip": 0.06277715, + "balance_loss_mlp": 0.01250597, + "epoch": 0.5513903502179468, + "flos": 68254728589440.0, + "grad_norm": 0.7663699077680228, + "language_loss": 0.59884483, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.67476565, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01652527, + "step": 9171, + "time_per_iteration": 3.190981864929199 + }, + { + "auxiliary_loss_clip": 0.06426144, + "auxiliary_loss_mlp": 0.01271114, + "balance_loss_clip": 0.06280371, + "balance_loss_mlp": 0.01260159, + "epoch": 0.5514504734706147, + "flos": 18740221683840.0, + "grad_norm": 1.5861452481841698, + "language_loss": 0.7047599, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.78173256, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10961914, + "step": 9172, + "time_per_iteration": 2.5032176971435547 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01263218, + "epoch": 0.5515105967232827, + "flos": 22277147978880.0, + "grad_norm": 1.7175476935278873, + "language_loss": 0.76203263, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.8390317, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10498047, + "step": 9173, + "time_per_iteration": 2.577878713607788 + }, + { + "auxiliary_loss_clip": 0.06421756, + "auxiliary_loss_mlp": 0.01264421, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01253359, + "epoch": 0.5515707199759508, + "flos": 22563040510080.0, + "grad_norm": 1.5999460100016771, + "language_loss": 0.75182664, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.82868844, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11077881, + "step": 9174, + "time_per_iteration": 2.520578384399414 + }, + { + "auxiliary_loss_clip": 0.06429856, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06282729, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5516308432286187, + "flos": 28298246670720.0, + "grad_norm": 1.7068220971376928, + "language_loss": 0.72958624, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.80653572, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11206055, + "step": 9175, + "time_per_iteration": 2.5991220474243164 + }, + { + "auxiliary_loss_clip": 0.06423448, + "auxiliary_loss_mlp": 0.01272105, + "balance_loss_clip": 0.06278881, + "balance_loss_mlp": 0.01261192, + "epoch": 0.5516909664812867, + "flos": 18769417632000.0, + "grad_norm": 1.996679187528513, + "language_loss": 0.69295454, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.7699101, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10919189, + "step": 9176, + "time_per_iteration": 2.4903998374938965 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.01270885, + "balance_loss_clip": 0.06278497, + "balance_loss_mlp": 0.01260467, + "epoch": 0.5517510897339546, + "flos": 27746852878080.0, + "grad_norm": 1.714802927656724, + "language_loss": 0.71279752, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.78971648, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10418701, + "step": 9177, + "time_per_iteration": 3.9531290531158447 + }, + { + "auxiliary_loss_clip": 0.06428478, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06282966, + "balance_loss_mlp": 0.0125924, + "epoch": 0.5518112129866226, + "flos": 18410165251200.0, + "grad_norm": 1.801915682479776, + "language_loss": 0.80691963, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.8839004, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10369873, + "step": 9178, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06432515, + "auxiliary_loss_mlp": 0.01265625, + "balance_loss_clip": 0.06282209, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5518713362392905, + "flos": 25089699726720.0, + "grad_norm": 1.5622133019409348, + "language_loss": 0.7545979, + "learning_rate": 1.761633217089826e-06, + "loss": 0.83157933, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11395264, + "step": 9179, + "time_per_iteration": 2.598055124282837 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01259005, + "epoch": 0.5519314594919585, + "flos": 36547911279360.0, + "grad_norm": 1.6999645614086591, + "language_loss": 0.70073718, + "learning_rate": 1.761246535912924e-06, + "loss": 0.77768701, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1060791, + "step": 9180, + "time_per_iteration": 2.6791419982910156 + }, + { + "auxiliary_loss_clip": 0.06424871, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06279478, + "balance_loss_mlp": 0.01257121, + "epoch": 0.5519915827446265, + "flos": 20454807456000.0, + "grad_norm": 1.7661274413355668, + "language_loss": 0.67505682, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.75199056, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11376953, + "step": 9181, + "time_per_iteration": 4.004978656768799 + }, + { + "auxiliary_loss_clip": 0.06431428, + "auxiliary_loss_mlp": 0.01267631, + "balance_loss_clip": 0.06280805, + "balance_loss_mlp": 0.01256682, + "epoch": 0.5520517059972945, + "flos": 23774672949120.0, + "grad_norm": 1.9095811471330626, + "language_loss": 0.79281217, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.86980277, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10949707, + "step": 9182, + "time_per_iteration": 2.537867546081543 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.0126956, + "balance_loss_clip": 0.06281601, + "balance_loss_mlp": 0.01258259, + "epoch": 0.5521118292499624, + "flos": 22202362609920.0, + "grad_norm": 1.7640468757897252, + "language_loss": 0.83230162, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.9092862, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11303711, + "step": 9183, + "time_per_iteration": 2.5279808044433594 + }, + { + "auxiliary_loss_clip": 0.0642349, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.0627853, + "balance_loss_mlp": 0.01259632, + "epoch": 0.5521719525026304, + "flos": 23589491425920.0, + "grad_norm": 1.2800662076099543, + "language_loss": 0.67446053, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.75139618, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10437012, + "step": 9184, + "time_per_iteration": 2.684945821762085 + }, + { + "auxiliary_loss_clip": 0.06425154, + "auxiliary_loss_mlp": 0.01269673, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01258652, + "epoch": 0.5522320757552983, + "flos": 26144298414720.0, + "grad_norm": 1.5606033277911597, + "language_loss": 0.76214409, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.83909237, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11016846, + "step": 9185, + "time_per_iteration": 2.654999017715454 + }, + { + "auxiliary_loss_clip": 0.06428938, + "auxiliary_loss_mlp": 0.01270824, + "balance_loss_clip": 0.06280778, + "balance_loss_mlp": 0.01259661, + "epoch": 0.5522921990079663, + "flos": 24682258448640.0, + "grad_norm": 1.714573937603497, + "language_loss": 0.73903292, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.8160305, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1116333, + "step": 9186, + "time_per_iteration": 4.173564672470093 + }, + { + "auxiliary_loss_clip": 0.06430478, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01260032, + "epoch": 0.5523523222606344, + "flos": 22754888432640.0, + "grad_norm": 1.9890242222634391, + "language_loss": 0.66822404, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.74523699, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10784912, + "step": 9187, + "time_per_iteration": 2.5402488708496094 + }, + { + "auxiliary_loss_clip": 0.06424463, + "auxiliary_loss_mlp": 0.01272464, + "balance_loss_clip": 0.06278258, + "balance_loss_mlp": 0.01261663, + "epoch": 0.5524124455133023, + "flos": 19761976771200.0, + "grad_norm": 1.6249988598177185, + "language_loss": 0.77965587, + "learning_rate": 1.758153413657318e-06, + "loss": 0.85662508, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10803223, + "step": 9188, + "time_per_iteration": 2.4915547370910645 + }, + { + "auxiliary_loss_clip": 0.06426179, + "auxiliary_loss_mlp": 0.01274155, + "balance_loss_clip": 0.06280048, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5524725687659703, + "flos": 23301544469760.0, + "grad_norm": 1.615723789328545, + "language_loss": 0.81586993, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.89287329, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11572266, + "step": 9189, + "time_per_iteration": 2.540083885192871 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06276601, + "balance_loss_mlp": 0.0125776, + "epoch": 0.5525326920186382, + "flos": 24868907418240.0, + "grad_norm": 1.331008644060519, + "language_loss": 0.76847303, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.84535837, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1159668, + "step": 9190, + "time_per_iteration": 2.597717046737671 + }, + { + "auxiliary_loss_clip": 0.0643147, + "auxiliary_loss_mlp": 0.01272383, + "balance_loss_clip": 0.06278718, + "balance_loss_mlp": 0.01260438, + "epoch": 0.5525928152713062, + "flos": 13740710371200.0, + "grad_norm": 2.3910114977567787, + "language_loss": 0.79437977, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.87141836, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 9191, + "time_per_iteration": 2.547445774078369 + }, + { + "auxiliary_loss_clip": 0.06422585, + "auxiliary_loss_mlp": 0.01264097, + "balance_loss_clip": 0.06276913, + "balance_loss_mlp": 0.01253624, + "epoch": 0.5526529385239741, + "flos": 13075398552960.0, + "grad_norm": 2.207227027061606, + "language_loss": 0.6899271, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.76679391, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10473633, + "step": 9192, + "time_per_iteration": 2.4774858951568604 + }, + { + "auxiliary_loss_clip": 0.06421191, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257786, + "epoch": 0.5527130617766421, + "flos": 23154992478720.0, + "grad_norm": 1.5351732563488263, + "language_loss": 0.77348876, + "learning_rate": 1.756220509823588e-06, + "loss": 0.85038239, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10400391, + "step": 9193, + "time_per_iteration": 3.9115588665008545 + }, + { + "auxiliary_loss_clip": 0.06421337, + "auxiliary_loss_mlp": 0.01271193, + "balance_loss_clip": 0.06275223, + "balance_loss_mlp": 0.01260357, + "epoch": 0.55277318502931, + "flos": 21291506801280.0, + "grad_norm": 1.5126002389204065, + "language_loss": 0.79036456, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.8672899, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1083374, + "step": 9194, + "time_per_iteration": 2.5319602489471436 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06274066, + "balance_loss_mlp": 0.01258189, + "epoch": 0.5528333082819781, + "flos": 38333383205760.0, + "grad_norm": 1.8079647356103097, + "language_loss": 0.70506799, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.78203559, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11340332, + "step": 9195, + "time_per_iteration": 2.6384387016296387 + }, + { + "auxiliary_loss_clip": 0.06436112, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281462, + "balance_loss_mlp": 0.01253778, + "epoch": 0.552893431534646, + "flos": 13558799157120.0, + "grad_norm": 2.003941554047622, + "language_loss": 0.74570775, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.82273173, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12506104, + "step": 9196, + "time_per_iteration": 2.5033600330352783 + }, + { + "auxiliary_loss_clip": 0.06429259, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0628302, + "balance_loss_mlp": 0.01252656, + "epoch": 0.552953554787314, + "flos": 21944995194240.0, + "grad_norm": 1.6318385903460113, + "language_loss": 0.77179539, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.8487258, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11126709, + "step": 9197, + "time_per_iteration": 2.500624895095825 + }, + { + "auxiliary_loss_clip": 0.06421226, + "auxiliary_loss_mlp": 0.01269574, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01259316, + "epoch": 0.5530136780399819, + "flos": 43668820736640.0, + "grad_norm": 1.4562548285485233, + "language_loss": 0.76468647, + "learning_rate": 1.754287837093407e-06, + "loss": 0.84159452, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.1026001, + "step": 9198, + "time_per_iteration": 2.7432668209075928 + }, + { + "auxiliary_loss_clip": 0.06427757, + "auxiliary_loss_mlp": 0.0126746, + "balance_loss_clip": 0.06281044, + "balance_loss_mlp": 0.01256994, + "epoch": 0.5530738012926499, + "flos": 25052411859840.0, + "grad_norm": 1.5004430901507595, + "language_loss": 0.79301012, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.86996233, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10461426, + "step": 9199, + "time_per_iteration": 2.547755241394043 + }, + { + "auxiliary_loss_clip": 0.06422742, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06276976, + "balance_loss_mlp": 0.01255962, + "epoch": 0.553133924545318, + "flos": 16477680136320.0, + "grad_norm": 1.9305306774012563, + "language_loss": 0.63492346, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.71181637, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.10577393, + "step": 9200, + "time_per_iteration": 2.5127363204956055 + }, + { + "auxiliary_loss_clip": 0.06431345, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06280623, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5531940477979859, + "flos": 24612797813760.0, + "grad_norm": 1.757338852617271, + "language_loss": 0.66817963, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.74514735, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11560059, + "step": 9201, + "time_per_iteration": 2.5651068687438965 + }, + { + "auxiliary_loss_clip": 0.06425701, + "auxiliary_loss_mlp": 0.01270434, + "balance_loss_clip": 0.06278911, + "balance_loss_mlp": 0.0125871, + "epoch": 0.5532541710506539, + "flos": 22165410159360.0, + "grad_norm": 2.045638683899954, + "language_loss": 0.61266994, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.68963134, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11737061, + "step": 9202, + "time_per_iteration": 2.5841257572174072 + }, + { + "auxiliary_loss_clip": 0.06419975, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01253493, + "epoch": 0.5533142943033218, + "flos": 21403621964160.0, + "grad_norm": 1.6777411475808515, + "language_loss": 0.64766765, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.72451103, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9203, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01269086, + "balance_loss_clip": 0.06279255, + "balance_loss_mlp": 0.01258065, + "epoch": 0.5533744175559898, + "flos": 23557360584960.0, + "grad_norm": 1.630044734052438, + "language_loss": 0.63918829, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.71613109, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11022949, + "step": 9204, + "time_per_iteration": 2.5487308502197266 + }, + { + "auxiliary_loss_clip": 0.0642142, + "auxiliary_loss_mlp": 0.01264869, + "balance_loss_clip": 0.06278381, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5534345408086577, + "flos": 24068447763840.0, + "grad_norm": 1.4496742073495597, + "language_loss": 0.77449042, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.85135335, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10418701, + "step": 9205, + "time_per_iteration": 2.5445451736450195 + }, + { + "auxiliary_loss_clip": 0.06419459, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06277758, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5534946640613257, + "flos": 33781242441600.0, + "grad_norm": 1.38023808830968, + "language_loss": 0.72729224, + "learning_rate": 1.751196045993537e-06, + "loss": 0.80413151, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1036377, + "step": 9206, + "time_per_iteration": 2.7339117527008057 + }, + { + "auxiliary_loss_clip": 0.06421407, + "auxiliary_loss_mlp": 0.01265704, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01255005, + "epoch": 0.5535547873139937, + "flos": 15164707783680.0, + "grad_norm": 1.9977188658051825, + "language_loss": 0.7547437, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.83161485, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10699463, + "step": 9207, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06436527, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06285885, + "balance_loss_mlp": 0.01254493, + "epoch": 0.5536149105666617, + "flos": 16986209765760.0, + "grad_norm": 2.498092208232672, + "language_loss": 0.61888683, + "learning_rate": 1.750423192272189e-06, + "loss": 0.69591099, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.1138916, + "step": 9208, + "time_per_iteration": 2.493628740310669 + }, + { + "auxiliary_loss_clip": 0.06428279, + "auxiliary_loss_mlp": 0.01268207, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01256543, + "epoch": 0.5536750338193296, + "flos": 18155732728320.0, + "grad_norm": 2.094677241914043, + "language_loss": 0.64708155, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.72404641, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11663818, + "step": 9209, + "time_per_iteration": 2.4616804122924805 + }, + { + "auxiliary_loss_clip": 0.06424735, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06279891, + "balance_loss_mlp": 0.01260863, + "epoch": 0.5537351570719976, + "flos": 22754469162240.0, + "grad_norm": 1.8280568303571236, + "language_loss": 0.82967091, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.90663934, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11242676, + "step": 9210, + "time_per_iteration": 2.564713954925537 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.06275869, + "balance_loss_mlp": 0.01255381, + "epoch": 0.5537952803246655, + "flos": 26362658954880.0, + "grad_norm": 1.71176011345987, + "language_loss": 0.72960317, + "learning_rate": 1.74926398270663e-06, + "loss": 0.80644828, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10430908, + "step": 9211, + "time_per_iteration": 2.5312066078186035 + }, + { + "auxiliary_loss_clip": 0.06431179, + "auxiliary_loss_mlp": 0.01267507, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01256045, + "epoch": 0.5538554035773335, + "flos": 18042695170560.0, + "grad_norm": 2.3508559175952803, + "language_loss": 0.67497891, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.75196576, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11462402, + "step": 9212, + "time_per_iteration": 2.5141408443450928 + }, + { + "auxiliary_loss_clip": 0.06429373, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.0125554, + "epoch": 0.5539155268300014, + "flos": 31694323052160.0, + "grad_norm": 1.4365879651928444, + "language_loss": 0.5225575, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.59953463, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12792969, + "step": 9213, + "time_per_iteration": 2.5764448642730713 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06282363, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5539756500826695, + "flos": 15198934976640.0, + "grad_norm": 1.6892906357761146, + "language_loss": 0.85764515, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.93460202, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11303711, + "step": 9214, + "time_per_iteration": 2.5433578491210938 + }, + { + "auxiliary_loss_clip": 0.06422558, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01252333, + "epoch": 0.5540357733353375, + "flos": 26359262864640.0, + "grad_norm": 1.8961662277212366, + "language_loss": 0.70100081, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.77785456, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10491943, + "step": 9215, + "time_per_iteration": 2.548687696456909 + }, + { + "auxiliary_loss_clip": 0.06428155, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06279612, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5540958965880054, + "flos": 21329926698240.0, + "grad_norm": 1.6927060371572338, + "language_loss": 0.73713386, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.81407875, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1151123, + "step": 9216, + "time_per_iteration": 2.541210174560547 + }, + { + "auxiliary_loss_clip": 0.06421469, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01254471, + "epoch": 0.5541560198406734, + "flos": 25674020974080.0, + "grad_norm": 1.768513313341331, + "language_loss": 0.71651757, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.79338706, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11029053, + "step": 9217, + "time_per_iteration": 4.048692226409912 + }, + { + "auxiliary_loss_clip": 0.0642062, + "auxiliary_loss_mlp": 0.01262573, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01251993, + "epoch": 0.5542161430933413, + "flos": 21945246756480.0, + "grad_norm": 1.641855173543887, + "language_loss": 0.78896093, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.86579281, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10577393, + "step": 9218, + "time_per_iteration": 2.5090229511260986 + }, + { + "auxiliary_loss_clip": 0.06429659, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255023, + "epoch": 0.5542762663460093, + "flos": 19577256445440.0, + "grad_norm": 1.9145093316494244, + "language_loss": 0.72342837, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.80039406, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11889648, + "step": 9219, + "time_per_iteration": 2.6097207069396973 + }, + { + "auxiliary_loss_clip": 0.06423312, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06275792, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5543363895986773, + "flos": 19504944771840.0, + "grad_norm": 1.6265573389583097, + "language_loss": 0.7175796, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.79449117, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11035156, + "step": 9220, + "time_per_iteration": 3.953366756439209 + }, + { + "auxiliary_loss_clip": 0.0641966, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06276264, + "balance_loss_mlp": 0.01256154, + "epoch": 0.5543965128513453, + "flos": 22641808947840.0, + "grad_norm": 1.5837082117197903, + "language_loss": 0.79554594, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.8724097, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9221, + "time_per_iteration": 2.6012284755706787 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268367, + "balance_loss_clip": 0.06276818, + "balance_loss_mlp": 0.0125715, + "epoch": 0.5544566361040132, + "flos": 25996320904320.0, + "grad_norm": 1.7031606951897913, + "language_loss": 0.8378005, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.91468251, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11224365, + "step": 9222, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.06426205, + "auxiliary_loss_mlp": 0.01268401, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5545167593566812, + "flos": 28265235361920.0, + "grad_norm": 1.624171595552914, + "language_loss": 0.75644016, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.83338618, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1217041, + "step": 9223, + "time_per_iteration": 2.6189255714416504 + }, + { + "auxiliary_loss_clip": 0.06421085, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06275317, + "balance_loss_mlp": 0.01255168, + "epoch": 0.5545768826093491, + "flos": 28484266734720.0, + "grad_norm": 1.537609394832996, + "language_loss": 0.81879461, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.89567149, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11425781, + "step": 9224, + "time_per_iteration": 2.5794196128845215 + }, + { + "auxiliary_loss_clip": 0.06424309, + "auxiliary_loss_mlp": 0.01271127, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5546370058620171, + "flos": 18483860517120.0, + "grad_norm": 1.6794429489770297, + "language_loss": 0.57241935, + "learning_rate": 1.743855475904141e-06, + "loss": 0.64937371, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11688232, + "step": 9225, + "time_per_iteration": 3.9698383808135986 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.01267893, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01257009, + "epoch": 0.554697129114685, + "flos": 22937260844160.0, + "grad_norm": 1.5804786041677554, + "language_loss": 0.6778791, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.75478059, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10870361, + "step": 9226, + "time_per_iteration": 2.5307633876800537 + }, + { + "auxiliary_loss_clip": 0.06423603, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256002, + "epoch": 0.5547572523673531, + "flos": 21803348666880.0, + "grad_norm": 1.2977635143377364, + "language_loss": 0.74954712, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.82645351, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11047363, + "step": 9227, + "time_per_iteration": 2.5083706378936768 + }, + { + "auxiliary_loss_clip": 0.06423934, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06275739, + "balance_loss_mlp": 0.01254768, + "epoch": 0.5548173756200211, + "flos": 22348830746880.0, + "grad_norm": 1.524887798675916, + "language_loss": 0.73794919, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.81485081, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11444092, + "step": 9228, + "time_per_iteration": 2.555020809173584 + }, + { + "auxiliary_loss_clip": 0.06423147, + "auxiliary_loss_mlp": 0.01263866, + "balance_loss_clip": 0.06276013, + "balance_loss_mlp": 0.01253465, + "epoch": 0.554877498872689, + "flos": 17864599317120.0, + "grad_norm": 1.7043498128680434, + "language_loss": 0.76352561, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.84039581, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10400391, + "step": 9229, + "time_per_iteration": 2.4959444999694824 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01266918, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125464, + "epoch": 0.554937622125357, + "flos": 17244080305920.0, + "grad_norm": 1.4897541866361217, + "language_loss": 0.69068646, + "learning_rate": 1.741924325613172e-06, + "loss": 0.76758856, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12280273, + "step": 9230, + "time_per_iteration": 2.5090713500976562 + }, + { + "auxiliary_loss_clip": 0.06427252, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06276985, + "balance_loss_mlp": 0.01254587, + "epoch": 0.5549977453780249, + "flos": 25374082884480.0, + "grad_norm": 2.3665837136773047, + "language_loss": 0.68808627, + "learning_rate": 1.741538124855163e-06, + "loss": 0.76503003, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12554932, + "step": 9231, + "time_per_iteration": 2.5350747108459473 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01269438, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01256885, + "epoch": 0.555057868630693, + "flos": 25085548949760.0, + "grad_norm": 1.6698826084601515, + "language_loss": 0.78408533, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.86107397, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12542725, + "step": 9232, + "time_per_iteration": 4.055214881896973 + }, + { + "auxiliary_loss_clip": 0.06416719, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01255972, + "epoch": 0.5551179918833609, + "flos": 26111412887040.0, + "grad_norm": 1.627879634610194, + "language_loss": 0.83063745, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.90747154, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10723877, + "step": 9233, + "time_per_iteration": 2.6376969814300537 + }, + { + "auxiliary_loss_clip": 0.06430396, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06277359, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5551781151360289, + "flos": 19389810862080.0, + "grad_norm": 2.483522309942904, + "language_loss": 0.7549684, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.83193588, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11431885, + "step": 9234, + "time_per_iteration": 2.4859883785247803 + }, + { + "auxiliary_loss_clip": 0.06418739, + "auxiliary_loss_mlp": 0.01265554, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255129, + "epoch": 0.5552382383886968, + "flos": 21732420585600.0, + "grad_norm": 1.8065340969909298, + "language_loss": 0.64963275, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.72647566, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.10418701, + "step": 9235, + "time_per_iteration": 2.523128032684326 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06272598, + "balance_loss_mlp": 0.0125519, + "epoch": 0.5552983616413648, + "flos": 14361480944640.0, + "grad_norm": 1.6397834212981734, + "language_loss": 0.68087149, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.75775141, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11932373, + "step": 9236, + "time_per_iteration": 2.506023406982422 + }, + { + "auxiliary_loss_clip": 0.06416081, + "auxiliary_loss_mlp": 0.01266517, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01256068, + "epoch": 0.5553584848940327, + "flos": 25484730600960.0, + "grad_norm": 1.5459271274239896, + "language_loss": 0.86436939, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.94119537, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10449219, + "step": 9237, + "time_per_iteration": 2.580103874206543 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06273238, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5554186081467007, + "flos": 22170399477120.0, + "grad_norm": 1.8042242059193758, + "language_loss": 0.73774469, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.81458282, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11517334, + "step": 9238, + "time_per_iteration": 2.5031590461730957 + }, + { + "auxiliary_loss_clip": 0.0642554, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06275032, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5554787313993687, + "flos": 49757744908800.0, + "grad_norm": 1.5320503148177431, + "language_loss": 0.78384852, + "learning_rate": 1.73844887285358e-06, + "loss": 0.86077076, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10766602, + "step": 9239, + "time_per_iteration": 2.7739756107330322 + }, + { + "auxiliary_loss_clip": 0.06423195, + "auxiliary_loss_mlp": 0.01266863, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5555388546520367, + "flos": 22133908224000.0, + "grad_norm": 1.4777059666754715, + "language_loss": 0.80562818, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.88252878, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11401367, + "step": 9240, + "time_per_iteration": 2.5036380290985107 + }, + { + "auxiliary_loss_clip": 0.06419357, + "auxiliary_loss_mlp": 0.01266651, + "balance_loss_clip": 0.06273453, + "balance_loss_mlp": 0.01255142, + "epoch": 0.5555989779047047, + "flos": 24689218337280.0, + "grad_norm": 1.7126628457644222, + "language_loss": 0.65465248, + "learning_rate": 1.737676658740786e-06, + "loss": 0.73151255, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1151123, + "step": 9241, + "time_per_iteration": 2.5851833820343018 + }, + { + "auxiliary_loss_clip": 0.06422672, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06276439, + "balance_loss_mlp": 0.01252566, + "epoch": 0.5556591011573726, + "flos": 16111929064320.0, + "grad_norm": 1.8766289396676605, + "language_loss": 0.73123193, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.80809897, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11474609, + "step": 9242, + "time_per_iteration": 2.467933416366577 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01253022, + "epoch": 0.5557192244100406, + "flos": 12938825197440.0, + "grad_norm": 6.974019127266796, + "language_loss": 0.64053857, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.71743226, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12365723, + "step": 9243, + "time_per_iteration": 2.528529167175293 + }, + { + "auxiliary_loss_clip": 0.0642553, + "auxiliary_loss_mlp": 0.01269814, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.01258614, + "epoch": 0.5557793476627085, + "flos": 23118291590400.0, + "grad_norm": 3.1703508621435095, + "language_loss": 0.75212169, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.82907516, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11199951, + "step": 9244, + "time_per_iteration": 2.5159640312194824 + }, + { + "auxiliary_loss_clip": 0.06417421, + "auxiliary_loss_mlp": 0.01263368, + "balance_loss_clip": 0.06277108, + "balance_loss_mlp": 0.01252938, + "epoch": 0.5558394709153766, + "flos": 21433446817920.0, + "grad_norm": 2.161992759062338, + "language_loss": 0.74536991, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.82217783, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10437012, + "step": 9245, + "time_per_iteration": 2.5320873260498047 + }, + { + "auxiliary_loss_clip": 0.06425805, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255533, + "epoch": 0.5558995941680445, + "flos": 25084626554880.0, + "grad_norm": 2.1186554191459575, + "language_loss": 0.79345202, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.87039083, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12530518, + "step": 9246, + "time_per_iteration": 2.5617494583129883 + }, + { + "auxiliary_loss_clip": 0.06425521, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276709, + "balance_loss_mlp": 0.01258993, + "epoch": 0.5559597174207125, + "flos": 20017331688960.0, + "grad_norm": 1.8080775090170724, + "language_loss": 0.7423467, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.81930989, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11810303, + "step": 9247, + "time_per_iteration": 2.5472562313079834 + }, + { + "auxiliary_loss_clip": 0.06421669, + "auxiliary_loss_mlp": 0.01265666, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01254007, + "epoch": 0.5560198406733804, + "flos": 16841125221120.0, + "grad_norm": 2.9360607038713127, + "language_loss": 0.75686443, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.83373785, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11645508, + "step": 9248, + "time_per_iteration": 2.4991230964660645 + }, + { + "auxiliary_loss_clip": 0.06332292, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01250564, + "epoch": 0.5560799639260484, + "flos": 70719012840960.0, + "grad_norm": 0.8521249277155936, + "language_loss": 0.5948171, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.67066324, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01763916, + "step": 9249, + "time_per_iteration": 3.2450287342071533 + }, + { + "auxiliary_loss_clip": 0.06424973, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06276406, + "balance_loss_mlp": 0.01253943, + "epoch": 0.5561400871787163, + "flos": 23155244040960.0, + "grad_norm": 2.0335955894649036, + "language_loss": 0.79889202, + "learning_rate": 1.734202189316832e-06, + "loss": 0.87578869, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10748291, + "step": 9250, + "time_per_iteration": 2.5372138023376465 + }, + { + "auxiliary_loss_clip": 0.06427802, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06277002, + "balance_loss_mlp": 0.01257471, + "epoch": 0.5562002104313843, + "flos": 17572166167680.0, + "grad_norm": 3.4851408255327856, + "language_loss": 0.69400316, + "learning_rate": 1.733816187358836e-06, + "loss": 0.77097189, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11584473, + "step": 9251, + "time_per_iteration": 2.554487943649292 + }, + { + "auxiliary_loss_clip": 0.06422772, + "auxiliary_loss_mlp": 0.01265424, + "balance_loss_clip": 0.06275512, + "balance_loss_mlp": 0.01253676, + "epoch": 0.5562603336840523, + "flos": 25052328005760.0, + "grad_norm": 1.4438817767967254, + "language_loss": 0.75297302, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.82985497, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 9252, + "time_per_iteration": 2.554103374481201 + }, + { + "auxiliary_loss_clip": 0.06427599, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5563204569367203, + "flos": 29066617411200.0, + "grad_norm": 1.5076691298158018, + "language_loss": 0.72903025, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.80595708, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10980225, + "step": 9253, + "time_per_iteration": 2.5654473304748535 + }, + { + "auxiliary_loss_clip": 0.06422551, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01259161, + "epoch": 0.5563805801893883, + "flos": 22096913846400.0, + "grad_norm": 1.9717474280435598, + "language_loss": 0.83141911, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.90834075, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10455322, + "step": 9254, + "time_per_iteration": 2.5113630294799805 + }, + { + "auxiliary_loss_clip": 0.06332405, + "auxiliary_loss_mlp": 0.01255231, + "balance_loss_clip": 0.06269685, + "balance_loss_mlp": 0.01253453, + "epoch": 0.5564407034420562, + "flos": 58652623555200.0, + "grad_norm": 0.8548643960281289, + "language_loss": 0.64887053, + "learning_rate": 1.732272280610387e-06, + "loss": 0.72474694, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01777649, + "step": 9255, + "time_per_iteration": 2.980931043624878 + }, + { + "auxiliary_loss_clip": 0.06420524, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275329, + "balance_loss_mlp": 0.01254175, + "epoch": 0.5565008266947242, + "flos": 23119004350080.0, + "grad_norm": 1.731717948076331, + "language_loss": 0.69607276, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.77292871, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10900879, + "step": 9256, + "time_per_iteration": 3.9532642364501953 + }, + { + "auxiliary_loss_clip": 0.06418847, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01254551, + "epoch": 0.5565609499473921, + "flos": 21584568856320.0, + "grad_norm": 1.4749881970234011, + "language_loss": 0.76680368, + "learning_rate": 1.73150038809119e-06, + "loss": 0.84364206, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10443115, + "step": 9257, + "time_per_iteration": 2.4937705993652344 + }, + { + "auxiliary_loss_clip": 0.06425476, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.0627654, + "balance_loss_mlp": 0.01262735, + "epoch": 0.5566210732000602, + "flos": 18375602641920.0, + "grad_norm": 2.7130999997532563, + "language_loss": 0.61334699, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.69033802, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10894775, + "step": 9258, + "time_per_iteration": 2.5560710430145264 + }, + { + "auxiliary_loss_clip": 0.06420255, + "auxiliary_loss_mlp": 0.01266708, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01255431, + "epoch": 0.5566811964527281, + "flos": 25710554154240.0, + "grad_norm": 1.5983859944569927, + "language_loss": 0.79631943, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.87318903, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11279297, + "step": 9259, + "time_per_iteration": 2.582550525665283 + }, + { + "auxiliary_loss_clip": 0.06421982, + "auxiliary_loss_mlp": 0.01267837, + "balance_loss_clip": 0.06275143, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5567413197053961, + "flos": 26951424468480.0, + "grad_norm": 1.7768491917262519, + "language_loss": 0.81632483, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.89322305, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10821533, + "step": 9260, + "time_per_iteration": 3.994185209274292 + }, + { + "auxiliary_loss_clip": 0.0642475, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257598, + "epoch": 0.556801442958064, + "flos": 20856965927040.0, + "grad_norm": 1.6577209620324271, + "language_loss": 0.69569898, + "learning_rate": 1.729956725348256e-06, + "loss": 0.77264518, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1227417, + "step": 9261, + "time_per_iteration": 2.558511734008789 + }, + { + "auxiliary_loss_clip": 0.06317247, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06255186, + "balance_loss_mlp": 0.01252651, + "epoch": 0.556861566210732, + "flos": 70517395918080.0, + "grad_norm": 0.7170849600938061, + "language_loss": 0.61090672, + "learning_rate": 1.729570835226108e-06, + "loss": 0.68662429, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01856995, + "step": 9262, + "time_per_iteration": 3.134216070175171 + }, + { + "auxiliary_loss_clip": 0.06422806, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01259214, + "epoch": 0.5569216894633999, + "flos": 25344216103680.0, + "grad_norm": 1.5027402480240113, + "language_loss": 0.64822662, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.72516024, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11340332, + "step": 9263, + "time_per_iteration": 2.5533127784729004 + }, + { + "auxiliary_loss_clip": 0.06420417, + "auxiliary_loss_mlp": 0.01271706, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01260679, + "epoch": 0.556981812716068, + "flos": 22645456600320.0, + "grad_norm": 1.647856593864945, + "language_loss": 0.73077464, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.80769587, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11035156, + "step": 9264, + "time_per_iteration": 2.5055153369903564 + }, + { + "auxiliary_loss_clip": 0.06421056, + "auxiliary_loss_mlp": 0.01267322, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01255765, + "epoch": 0.5570419359687359, + "flos": 11040567275520.0, + "grad_norm": 1.7723772076526776, + "language_loss": 0.7667138, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.84359753, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11553955, + "step": 9265, + "time_per_iteration": 3.964038372039795 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01273186, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01262368, + "epoch": 0.5571020592214039, + "flos": 22830218853120.0, + "grad_norm": 1.7025735740351078, + "language_loss": 0.71389985, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.79079872, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1081543, + "step": 9266, + "time_per_iteration": 2.5572071075439453 + }, + { + "auxiliary_loss_clip": 0.06418756, + "auxiliary_loss_mlp": 0.01270352, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01259701, + "epoch": 0.5571621824740719, + "flos": 22934074389120.0, + "grad_norm": 1.5846567867344512, + "language_loss": 0.68614411, + "learning_rate": 1.727641538728533e-06, + "loss": 0.76303518, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10656738, + "step": 9267, + "time_per_iteration": 2.4949660301208496 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277707, + "balance_loss_mlp": 0.01255677, + "epoch": 0.5572223057267398, + "flos": 22973416680960.0, + "grad_norm": 2.0664301257613684, + "language_loss": 0.75132561, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.82818741, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11315918, + "step": 9268, + "time_per_iteration": 2.5834717750549316 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01252184, + "epoch": 0.5572824289794078, + "flos": 20966439686400.0, + "grad_norm": 2.076388090189787, + "language_loss": 0.75247812, + "learning_rate": 1.726869892322104e-06, + "loss": 0.8293134, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10803223, + "step": 9269, + "time_per_iteration": 2.6340525150299072 + }, + { + "auxiliary_loss_clip": 0.06420279, + "auxiliary_loss_mlp": 0.01268076, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5573425522320757, + "flos": 25048806134400.0, + "grad_norm": 1.9328220368280318, + "language_loss": 0.82704222, + "learning_rate": 1.726484084647256e-06, + "loss": 0.90392578, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10986328, + "step": 9270, + "time_per_iteration": 2.6455605030059814 + }, + { + "auxiliary_loss_clip": 0.06426194, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01255657, + "epoch": 0.5574026754847438, + "flos": 23666415073920.0, + "grad_norm": 1.8553396052443616, + "language_loss": 0.79884106, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.87577355, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1138916, + "step": 9271, + "time_per_iteration": 4.060855388641357 + }, + { + "auxiliary_loss_clip": 0.0642622, + "auxiliary_loss_mlp": 0.01265728, + "balance_loss_clip": 0.0627868, + "balance_loss_mlp": 0.01254153, + "epoch": 0.5574627987374117, + "flos": 24787791066240.0, + "grad_norm": 1.7644146130703546, + "language_loss": 0.90646034, + "learning_rate": 1.725712500427442e-06, + "loss": 0.9833799, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11572266, + "step": 9272, + "time_per_iteration": 2.534665107727051 + }, + { + "auxiliary_loss_clip": 0.0641982, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.0125446, + "epoch": 0.5575229219900797, + "flos": 21841349293440.0, + "grad_norm": 1.8989818213493146, + "language_loss": 0.84368634, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.92053914, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10992432, + "step": 9273, + "time_per_iteration": 2.5200788974761963 + }, + { + "auxiliary_loss_clip": 0.06423581, + "auxiliary_loss_mlp": 0.01268606, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5575830452427476, + "flos": 27821973663360.0, + "grad_norm": 1.9193499092419828, + "language_loss": 0.75017828, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.82710016, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12548828, + "step": 9274, + "time_per_iteration": 2.548865795135498 + }, + { + "auxiliary_loss_clip": 0.06435296, + "auxiliary_loss_mlp": 0.01273341, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01260806, + "epoch": 0.5576431684954156, + "flos": 17817081252480.0, + "grad_norm": 2.8160029917848397, + "language_loss": 0.78999293, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.86707926, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12530518, + "step": 9275, + "time_per_iteration": 2.503168821334839 + }, + { + "auxiliary_loss_clip": 0.06426495, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06279385, + "balance_loss_mlp": 0.01253372, + "epoch": 0.5577032917480835, + "flos": 15492290520960.0, + "grad_norm": 1.5722489245589244, + "language_loss": 0.75639874, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.83331323, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11584473, + "step": 9276, + "time_per_iteration": 2.466275215148926 + }, + { + "auxiliary_loss_clip": 0.06423229, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06277048, + "balance_loss_mlp": 0.01256379, + "epoch": 0.5577634150007516, + "flos": 21586162083840.0, + "grad_norm": 1.8200099839217898, + "language_loss": 0.75387412, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.83078039, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11022949, + "step": 9277, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.06420221, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01255535, + "epoch": 0.5578235382534195, + "flos": 21145709496960.0, + "grad_norm": 1.5944068660293211, + "language_loss": 0.7198559, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.79672027, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10681152, + "step": 9278, + "time_per_iteration": 2.4954776763916016 + }, + { + "auxiliary_loss_clip": 0.06425839, + "auxiliary_loss_mlp": 0.01267939, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01255166, + "epoch": 0.5578836615060875, + "flos": 26512397400960.0, + "grad_norm": 1.4623548994871365, + "language_loss": 0.75693482, + "learning_rate": 1.723012284057868e-06, + "loss": 0.83387262, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12786865, + "step": 9279, + "time_per_iteration": 2.5537941455841064 + }, + { + "auxiliary_loss_clip": 0.06422286, + "auxiliary_loss_mlp": 0.01267149, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255354, + "epoch": 0.5579437847587555, + "flos": 20159439413760.0, + "grad_norm": 1.637545301877737, + "language_loss": 0.67443848, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.75133282, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11791992, + "step": 9280, + "time_per_iteration": 2.489867925643921 + }, + { + "auxiliary_loss_clip": 0.06426547, + "auxiliary_loss_mlp": 0.01266943, + "balance_loss_clip": 0.06276332, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5580039080114234, + "flos": 26109148826880.0, + "grad_norm": 1.5394249927656036, + "language_loss": 0.7336756, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.81061053, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11090088, + "step": 9281, + "time_per_iteration": 2.693004846572876 + }, + { + "auxiliary_loss_clip": 0.06420805, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5580640312640914, + "flos": 13776740426880.0, + "grad_norm": 2.347269898773066, + "language_loss": 0.75313729, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.83000439, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10662842, + "step": 9282, + "time_per_iteration": 2.472775936126709 + }, + { + "auxiliary_loss_clip": 0.06421494, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01258871, + "epoch": 0.5581241545167593, + "flos": 17681765708160.0, + "grad_norm": 1.6208158464679243, + "language_loss": 0.66451746, + "learning_rate": 1.721469534028297e-06, + "loss": 0.74143445, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11334229, + "step": 9283, + "time_per_iteration": 2.495039224624634 + }, + { + "auxiliary_loss_clip": 0.06423882, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01257489, + "epoch": 0.5581842777694274, + "flos": 19574573114880.0, + "grad_norm": 1.8440828180500004, + "language_loss": 0.83265072, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.90957028, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10583496, + "step": 9284, + "time_per_iteration": 2.479743719100952 + }, + { + "auxiliary_loss_clip": 0.06423684, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01255412, + "epoch": 0.5582444010220953, + "flos": 20601485228160.0, + "grad_norm": 2.4189186360573407, + "language_loss": 0.86142218, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.93832451, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11132812, + "step": 9285, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.06422924, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01255818, + "epoch": 0.5583045242747633, + "flos": 19141541614080.0, + "grad_norm": 2.3862114712175013, + "language_loss": 0.74476177, + "learning_rate": 1.720312582354912e-06, + "loss": 0.82165694, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10772705, + "step": 9286, + "time_per_iteration": 2.502807378768921 + }, + { + "auxiliary_loss_clip": 0.06421416, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276793, + "balance_loss_mlp": 0.01256448, + "epoch": 0.5583646475274312, + "flos": 27462050449920.0, + "grad_norm": 1.681368685974995, + "language_loss": 0.74959427, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.82648808, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11529541, + "step": 9287, + "time_per_iteration": 2.5700645446777344 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06279745, + "balance_loss_mlp": 0.01254601, + "epoch": 0.5584247707800992, + "flos": 23659580966400.0, + "grad_norm": 1.4753035778898818, + "language_loss": 0.75157738, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.82854563, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12036133, + "step": 9288, + "time_per_iteration": 2.529250383377075 + }, + { + "auxiliary_loss_clip": 0.06424332, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01254847, + "epoch": 0.5584848940327671, + "flos": 13703967555840.0, + "grad_norm": 2.2558701039351696, + "language_loss": 0.78180242, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.85871768, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12329102, + "step": 9289, + "time_per_iteration": 2.5093841552734375 + }, + { + "auxiliary_loss_clip": 0.06428449, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01255921, + "epoch": 0.5585450172854352, + "flos": 27023526506880.0, + "grad_norm": 1.7277790144481269, + "language_loss": 0.61688149, + "learning_rate": 1.718770128672817e-06, + "loss": 0.69384426, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11914062, + "step": 9290, + "time_per_iteration": 2.5534214973449707 + }, + { + "auxiliary_loss_clip": 0.0642647, + "auxiliary_loss_mlp": 0.01268365, + "balance_loss_clip": 0.06277582, + "balance_loss_mlp": 0.01256581, + "epoch": 0.5586051405381031, + "flos": 23192406126720.0, + "grad_norm": 2.1760973422208965, + "language_loss": 0.67914414, + "learning_rate": 1.7183845418764e-06, + "loss": 0.75609255, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11767578, + "step": 9291, + "time_per_iteration": 2.5376763343811035 + }, + { + "auxiliary_loss_clip": 0.0642361, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5586652637907711, + "flos": 20781551652480.0, + "grad_norm": 1.760966459417108, + "language_loss": 0.84366935, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.92057884, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11578369, + "step": 9292, + "time_per_iteration": 2.5204405784606934 + }, + { + "auxiliary_loss_clip": 0.06422292, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01257848, + "epoch": 0.5587253870434391, + "flos": 28227360516480.0, + "grad_norm": 1.8754942991534513, + "language_loss": 0.7459076, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.82281709, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10821533, + "step": 9293, + "time_per_iteration": 2.6592154502868652 + }, + { + "auxiliary_loss_clip": 0.06418014, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01254809, + "epoch": 0.558785510296107, + "flos": 26623128971520.0, + "grad_norm": 1.7285534178917525, + "language_loss": 0.72416651, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.80100018, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10546875, + "step": 9294, + "time_per_iteration": 2.538320779800415 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276325, + "balance_loss_mlp": 0.01257208, + "epoch": 0.558845633548775, + "flos": 20162919358080.0, + "grad_norm": 2.7937117268116656, + "language_loss": 0.69210899, + "learning_rate": 1.716842301625806e-06, + "loss": 0.76900959, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.109375, + "step": 9295, + "time_per_iteration": 2.5218520164489746 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.0126519, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01253776, + "epoch": 0.5589057568014429, + "flos": 24357317114880.0, + "grad_norm": 1.5440712557728564, + "language_loss": 0.80893242, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.88577229, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11419678, + "step": 9296, + "time_per_iteration": 3.9467618465423584 + }, + { + "auxiliary_loss_clip": 0.06419219, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01255019, + "epoch": 0.558965880054111, + "flos": 21111440376960.0, + "grad_norm": 1.9869508208087105, + "language_loss": 0.65690488, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.73375666, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9297, + "time_per_iteration": 2.528181791305542 + }, + { + "auxiliary_loss_clip": 0.06424123, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5590260033067789, + "flos": 18440954426880.0, + "grad_norm": 1.490575561372924, + "language_loss": 0.75263643, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.82955098, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12054443, + "step": 9298, + "time_per_iteration": 2.5208308696746826 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01252986, + "balance_loss_clip": 0.06249566, + "balance_loss_mlp": 0.01251184, + "epoch": 0.5590861265594469, + "flos": 70597673729280.0, + "grad_norm": 0.6945904868111653, + "language_loss": 0.52248931, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.59813559, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.62158203, + "router_z_loss_mlp": 0.01797485, + "step": 9299, + "time_per_iteration": 4.702880144119263 + }, + { + "auxiliary_loss_clip": 0.06418106, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06276019, + "balance_loss_mlp": 0.01256905, + "epoch": 0.5591462498121148, + "flos": 30672274475520.0, + "grad_norm": 1.7758709427362191, + "language_loss": 0.68987107, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.76672685, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10571289, + "step": 9300, + "time_per_iteration": 2.6169886589050293 + }, + { + "auxiliary_loss_clip": 0.06428309, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.06278549, + "balance_loss_mlp": 0.01254727, + "epoch": 0.5592063730647828, + "flos": 18156319706880.0, + "grad_norm": 3.029569475440017, + "language_loss": 0.81908011, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.89602816, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11761475, + "step": 9301, + "time_per_iteration": 2.4880383014678955 + }, + { + "auxiliary_loss_clip": 0.06421784, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06274376, + "balance_loss_mlp": 0.01256101, + "epoch": 0.5592664963174507, + "flos": 24067148025600.0, + "grad_norm": 2.0495431587104216, + "language_loss": 0.67981839, + "learning_rate": 1.714143795138756e-06, + "loss": 0.75671041, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11315918, + "step": 9302, + "time_per_iteration": 2.5440263748168945 + }, + { + "auxiliary_loss_clip": 0.06427488, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0627801, + "balance_loss_mlp": 0.01254121, + "epoch": 0.5593266195701188, + "flos": 19833911101440.0, + "grad_norm": 1.543967288464222, + "language_loss": 0.70932961, + "learning_rate": 1.713758337453878e-06, + "loss": 0.78626627, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12042236, + "step": 9303, + "time_per_iteration": 2.52182674407959 + }, + { + "auxiliary_loss_clip": 0.06417537, + "auxiliary_loss_mlp": 0.01265621, + "balance_loss_clip": 0.06276484, + "balance_loss_mlp": 0.01255453, + "epoch": 0.5593867428227867, + "flos": 25307682923520.0, + "grad_norm": 1.5891501411536748, + "language_loss": 0.73189592, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.8087275, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10168457, + "step": 9304, + "time_per_iteration": 3.999878406524658 + }, + { + "auxiliary_loss_clip": 0.06421353, + "auxiliary_loss_mlp": 0.01266821, + "balance_loss_clip": 0.06276563, + "balance_loss_mlp": 0.01255693, + "epoch": 0.5594468660754547, + "flos": 12938028583680.0, + "grad_norm": 2.1417504305353563, + "language_loss": 0.78262866, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.85951042, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11132812, + "step": 9305, + "time_per_iteration": 2.5058751106262207 + }, + { + "auxiliary_loss_clip": 0.06419225, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06278518, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5595069893281227, + "flos": 19068768743040.0, + "grad_norm": 1.6214418695958237, + "language_loss": 0.69748855, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7743212, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10705566, + "step": 9306, + "time_per_iteration": 2.5216495990753174 + }, + { + "auxiliary_loss_clip": 0.06329086, + "auxiliary_loss_mlp": 0.01251264, + "balance_loss_clip": 0.06266434, + "balance_loss_mlp": 0.01249626, + "epoch": 0.5595671125807906, + "flos": 70291530437760.0, + "grad_norm": 0.8883282828550626, + "language_loss": 0.60321748, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.679021, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.0164032, + "step": 9307, + "time_per_iteration": 3.2440812587738037 + }, + { + "auxiliary_loss_clip": 0.06421244, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06278248, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5596272358334586, + "flos": 20671407060480.0, + "grad_norm": 1.5654652346016935, + "language_loss": 0.7418704, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.81875765, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10467529, + "step": 9308, + "time_per_iteration": 2.527722120285034 + }, + { + "auxiliary_loss_clip": 0.06423165, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06275736, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5596873590861265, + "flos": 25047170979840.0, + "grad_norm": 1.7977154981427412, + "language_loss": 0.70390081, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.78078693, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12072754, + "step": 9309, + "time_per_iteration": 2.5592753887176514 + }, + { + "auxiliary_loss_clip": 0.06425751, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06278521, + "balance_loss_mlp": 0.01255889, + "epoch": 0.5597474823387946, + "flos": 25965573655680.0, + "grad_norm": 1.826608872454741, + "language_loss": 0.7546587, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.83160329, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12835693, + "step": 9310, + "time_per_iteration": 2.5775809288024902 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.01266019, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125343, + "epoch": 0.5598076055914625, + "flos": 26184688882560.0, + "grad_norm": 2.287225356977705, + "language_loss": 0.70149207, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.77844125, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12579346, + "step": 9311, + "time_per_iteration": 3.9833383560180664 + }, + { + "auxiliary_loss_clip": 0.06422099, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06277782, + "balance_loss_mlp": 0.01254061, + "epoch": 0.5598677288441305, + "flos": 11660541235200.0, + "grad_norm": 2.2749325214124605, + "language_loss": 0.72917002, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.80604798, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11645508, + "step": 9312, + "time_per_iteration": 2.5323050022125244 + }, + { + "auxiliary_loss_clip": 0.06420854, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06276432, + "balance_loss_mlp": 0.01255772, + "epoch": 0.5599278520967984, + "flos": 22973290899840.0, + "grad_norm": 1.8427769518341257, + "language_loss": 0.89498973, + "learning_rate": 1.709904360003822e-06, + "loss": 0.97187102, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1151123, + "step": 9313, + "time_per_iteration": 2.5141191482543945 + }, + { + "auxiliary_loss_clip": 0.06423395, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5599879753494664, + "flos": 21222004239360.0, + "grad_norm": 1.3323867384007686, + "language_loss": 0.7802453, + "learning_rate": 1.709519022520204e-06, + "loss": 0.85715961, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11242676, + "step": 9314, + "time_per_iteration": 2.587451934814453 + }, + { + "auxiliary_loss_clip": 0.06420899, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06276683, + "balance_loss_mlp": 0.01254109, + "epoch": 0.5600480986021343, + "flos": 31911006510720.0, + "grad_norm": 1.5829567025911722, + "language_loss": 0.70587456, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.78273547, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11083984, + "step": 9315, + "time_per_iteration": 2.585667371749878 + }, + { + "auxiliary_loss_clip": 0.06425041, + "auxiliary_loss_mlp": 0.01268206, + "balance_loss_clip": 0.06275864, + "balance_loss_mlp": 0.01256571, + "epoch": 0.5601082218548024, + "flos": 28483679756160.0, + "grad_norm": 1.7585144874491871, + "language_loss": 0.67066777, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.7476002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11645508, + "step": 9316, + "time_per_iteration": 2.5536792278289795 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.06276462, + "balance_loss_mlp": 0.01253324, + "epoch": 0.5601683451074703, + "flos": 24103974695040.0, + "grad_norm": 1.9270955506174936, + "language_loss": 0.87415564, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.95101541, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11236572, + "step": 9317, + "time_per_iteration": 2.6297550201416016 + }, + { + "auxiliary_loss_clip": 0.06425779, + "auxiliary_loss_mlp": 0.01267741, + "balance_loss_clip": 0.06277692, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5602284683601383, + "flos": 26362868590080.0, + "grad_norm": 1.81541721599753, + "language_loss": 0.77282947, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.84976465, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1260376, + "step": 9318, + "time_per_iteration": 2.558359146118164 + }, + { + "auxiliary_loss_clip": 0.06418364, + "auxiliary_loss_mlp": 0.01266654, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5602885916128063, + "flos": 24502904784000.0, + "grad_norm": 1.570238706906967, + "language_loss": 0.76465648, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.84150666, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10357666, + "step": 9319, + "time_per_iteration": 2.526543617248535 + }, + { + "auxiliary_loss_clip": 0.06418289, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06273629, + "balance_loss_mlp": 0.01253427, + "epoch": 0.5603487148654742, + "flos": 27352450909440.0, + "grad_norm": 1.3333617188310043, + "language_loss": 0.85846102, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.93529117, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11309814, + "step": 9320, + "time_per_iteration": 2.5673651695251465 + }, + { + "auxiliary_loss_clip": 0.06334086, + "auxiliary_loss_mlp": 0.01252081, + "balance_loss_clip": 0.06272272, + "balance_loss_mlp": 0.01250187, + "epoch": 0.5604088381181422, + "flos": 54105555962880.0, + "grad_norm": 0.7541324814402665, + "language_loss": 0.52607638, + "learning_rate": 1.706821969374996e-06, + "loss": 0.60193801, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01890564, + "step": 9321, + "time_per_iteration": 2.977881908416748 + }, + { + "auxiliary_loss_clip": 0.06418586, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06276635, + "balance_loss_mlp": 0.01254208, + "epoch": 0.5604689613708101, + "flos": 22242878858880.0, + "grad_norm": 1.3667787345793438, + "language_loss": 0.7480129, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.82485151, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1105957, + "step": 9322, + "time_per_iteration": 2.532274007797241 + }, + { + "auxiliary_loss_clip": 0.06422681, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06276275, + "balance_loss_mlp": 0.01258842, + "epoch": 0.5605290846234782, + "flos": 35306370132480.0, + "grad_norm": 1.7253794934771503, + "language_loss": 0.73680359, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.81374425, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12542725, + "step": 9323, + "time_per_iteration": 2.6399970054626465 + }, + { + "auxiliary_loss_clip": 0.06425279, + "auxiliary_loss_mlp": 0.01266665, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254505, + "epoch": 0.5605892078761461, + "flos": 20268997027200.0, + "grad_norm": 1.5398366577575928, + "language_loss": 0.62584162, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.70276111, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12164307, + "step": 9324, + "time_per_iteration": 2.5179386138916016 + }, + { + "auxiliary_loss_clip": 0.06420085, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01255055, + "epoch": 0.5606493311288141, + "flos": 17313582867840.0, + "grad_norm": 2.467078298144656, + "language_loss": 0.88032669, + "learning_rate": 1.705281040409226e-06, + "loss": 0.95720887, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.13092041, + "step": 9325, + "time_per_iteration": 2.5009984970092773 + }, + { + "auxiliary_loss_clip": 0.06425651, + "auxiliary_loss_mlp": 0.01271739, + "balance_loss_clip": 0.0627806, + "balance_loss_mlp": 0.01259454, + "epoch": 0.560709454381482, + "flos": 21659438079360.0, + "grad_norm": 1.5802994463075606, + "language_loss": 0.74048662, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.81746054, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1229248, + "step": 9326, + "time_per_iteration": 2.53534197807312 + }, + { + "auxiliary_loss_clip": 0.06427591, + "auxiliary_loss_mlp": 0.0127498, + "balance_loss_clip": 0.06276761, + "balance_loss_mlp": 0.01262648, + "epoch": 0.56076957763415, + "flos": 20309639057280.0, + "grad_norm": 1.7151684776487535, + "language_loss": 0.79090071, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.86792642, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12329102, + "step": 9327, + "time_per_iteration": 2.505734920501709 + }, + { + "auxiliary_loss_clip": 0.06422938, + "auxiliary_loss_mlp": 0.01268373, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5608297008868179, + "flos": 25052873057280.0, + "grad_norm": 1.3540928387883675, + "language_loss": 0.7848016, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.86171472, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.12176514, + "step": 9328, + "time_per_iteration": 2.5479724407196045 + }, + { + "auxiliary_loss_clip": 0.06421052, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06277333, + "balance_loss_mlp": 0.01255023, + "epoch": 0.560889824139486, + "flos": 19873253393280.0, + "grad_norm": 1.4144017329991472, + "language_loss": 0.7383225, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.8151924, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10913086, + "step": 9329, + "time_per_iteration": 2.665193796157837 + }, + { + "auxiliary_loss_clip": 0.06430677, + "auxiliary_loss_mlp": 0.01265446, + "balance_loss_clip": 0.06278004, + "balance_loss_mlp": 0.01253269, + "epoch": 0.5609499473921539, + "flos": 22935961105920.0, + "grad_norm": 1.4811079467360542, + "language_loss": 0.83903289, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.91599417, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12176514, + "step": 9330, + "time_per_iteration": 2.574812650680542 + }, + { + "auxiliary_loss_clip": 0.06343255, + "auxiliary_loss_mlp": 0.01254504, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01252853, + "epoch": 0.5610100706448219, + "flos": 53054479146240.0, + "grad_norm": 0.7010589280292991, + "language_loss": 0.57785869, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.65383625, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01654053, + "step": 9331, + "time_per_iteration": 3.16204833984375 + }, + { + "auxiliary_loss_clip": 0.06429492, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06280065, + "balance_loss_mlp": 0.01254723, + "epoch": 0.5610701938974898, + "flos": 21841349293440.0, + "grad_norm": 1.62115536838187, + "language_loss": 0.81915009, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.89610904, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11682129, + "step": 9332, + "time_per_iteration": 2.503162145614624 + }, + { + "auxiliary_loss_clip": 0.06436246, + "auxiliary_loss_mlp": 0.01268376, + "balance_loss_clip": 0.06285603, + "balance_loss_mlp": 0.01255936, + "epoch": 0.5611303171501578, + "flos": 17462943970560.0, + "grad_norm": 2.4447262023658314, + "language_loss": 0.8238855, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.90093172, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12451172, + "step": 9333, + "time_per_iteration": 2.5434911251068115 + }, + { + "auxiliary_loss_clip": 0.06429712, + "auxiliary_loss_mlp": 0.01266007, + "balance_loss_clip": 0.062811, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5611904404028258, + "flos": 22644366497280.0, + "grad_norm": 1.7517485290647843, + "language_loss": 0.73036361, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.80732077, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11157227, + "step": 9334, + "time_per_iteration": 2.5099892616271973 + }, + { + "auxiliary_loss_clip": 0.06427494, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281948, + "balance_loss_mlp": 0.01259771, + "epoch": 0.5612505636554938, + "flos": 14321048549760.0, + "grad_norm": 1.6258746678295788, + "language_loss": 0.71251893, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.7895056, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11395264, + "step": 9335, + "time_per_iteration": 3.8910462856292725 + }, + { + "auxiliary_loss_clip": 0.06430685, + "auxiliary_loss_mlp": 0.01266094, + "balance_loss_clip": 0.06283418, + "balance_loss_mlp": 0.01254149, + "epoch": 0.5613106869081618, + "flos": 16513835973120.0, + "grad_norm": 1.6562270786725333, + "language_loss": 0.7703501, + "learning_rate": 1.701044410566205e-06, + "loss": 0.84731793, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11932373, + "step": 9336, + "time_per_iteration": 2.5473687648773193 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06282386, + "balance_loss_mlp": 0.0125489, + "epoch": 0.5613708101608297, + "flos": 24065009746560.0, + "grad_norm": 2.1630350478443625, + "language_loss": 0.64571506, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.72262907, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10766602, + "step": 9337, + "time_per_iteration": 2.5193097591400146 + }, + { + "auxiliary_loss_clip": 0.06341661, + "auxiliary_loss_mlp": 0.01252845, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01251057, + "epoch": 0.5614309334134977, + "flos": 64922284984320.0, + "grad_norm": 0.883081868959654, + "language_loss": 0.62614578, + "learning_rate": 1.700274261035102e-06, + "loss": 0.7020908, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.62353516, + "router_z_loss_mlp": 0.01785278, + "step": 9338, + "time_per_iteration": 3.115088939666748 + }, + { + "auxiliary_loss_clip": 0.06430536, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5614910566661656, + "flos": 32926975666560.0, + "grad_norm": 1.7643724476932883, + "language_loss": 0.66069186, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.73765635, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11474609, + "step": 9339, + "time_per_iteration": 4.156280040740967 + }, + { + "auxiliary_loss_clip": 0.06427112, + "auxiliary_loss_mlp": 0.01266835, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01254055, + "epoch": 0.5615511799188336, + "flos": 18594927504000.0, + "grad_norm": 1.6693116386089952, + "language_loss": 0.69893128, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.77587074, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.12774658, + "step": 9340, + "time_per_iteration": 2.4951670169830322 + }, + { + "auxiliary_loss_clip": 0.06425936, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06286716, + "balance_loss_mlp": 0.01257168, + "epoch": 0.5616113031715015, + "flos": 22826571200640.0, + "grad_norm": 1.554264314492227, + "language_loss": 0.77897537, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.85592192, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 9341, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06432091, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06284195, + "balance_loss_mlp": 0.01256776, + "epoch": 0.5616714264241696, + "flos": 22352184910080.0, + "grad_norm": 1.797407374183417, + "language_loss": 0.80132401, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.87833536, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12261963, + "step": 9342, + "time_per_iteration": 2.5441479682922363 + }, + { + "auxiliary_loss_clip": 0.06439396, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06290646, + "balance_loss_mlp": 0.01257325, + "epoch": 0.5617315496768375, + "flos": 18813875022720.0, + "grad_norm": 2.3951377685236346, + "language_loss": 0.75757158, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.83465594, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1171875, + "step": 9343, + "time_per_iteration": 2.552783489227295 + }, + { + "auxiliary_loss_clip": 0.06435137, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06290908, + "balance_loss_mlp": 0.0125656, + "epoch": 0.5617916729295055, + "flos": 18375225298560.0, + "grad_norm": 1.7365132961619254, + "language_loss": 0.69429743, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.77133292, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11853027, + "step": 9344, + "time_per_iteration": 3.940319061279297 + }, + { + "auxiliary_loss_clip": 0.06436205, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06290596, + "balance_loss_mlp": 0.01254048, + "epoch": 0.5618517961821734, + "flos": 28186844267520.0, + "grad_norm": 2.084209166838754, + "language_loss": 0.66667032, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.74368846, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11560059, + "step": 9345, + "time_per_iteration": 2.5695786476135254 + }, + { + "auxiliary_loss_clip": 0.06434141, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.01257683, + "epoch": 0.5619119194348414, + "flos": 15492290520960.0, + "grad_norm": 1.7418235878832828, + "language_loss": 0.88078266, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.9578141, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11328125, + "step": 9346, + "time_per_iteration": 2.470212697982788 + }, + { + "auxiliary_loss_clip": 0.06433322, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06289656, + "balance_loss_mlp": 0.01261257, + "epoch": 0.5619720426875094, + "flos": 29135700702720.0, + "grad_norm": 2.0124429779516335, + "language_loss": 0.5980221, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.67508924, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.12139893, + "step": 9347, + "time_per_iteration": 2.5825982093811035 + }, + { + "auxiliary_loss_clip": 0.06435403, + "auxiliary_loss_mlp": 0.01270938, + "balance_loss_clip": 0.06288013, + "balance_loss_mlp": 0.01258349, + "epoch": 0.5620321659401774, + "flos": 18009474226560.0, + "grad_norm": 2.2126455504112066, + "language_loss": 0.69822383, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.77528727, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12609863, + "step": 9348, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.0644159, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06289469, + "balance_loss_mlp": 0.01254037, + "epoch": 0.5620922891928454, + "flos": 20600730541440.0, + "grad_norm": 3.445873194626742, + "language_loss": 0.79441649, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.87149316, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12036133, + "step": 9349, + "time_per_iteration": 2.5519816875457764 + }, + { + "auxiliary_loss_clip": 0.06431362, + "auxiliary_loss_mlp": 0.01269513, + "balance_loss_clip": 0.06285249, + "balance_loss_mlp": 0.01257014, + "epoch": 0.5621524124455133, + "flos": 26294288423040.0, + "grad_norm": 2.015932955485816, + "language_loss": 0.67743355, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.75444239, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.12493896, + "step": 9350, + "time_per_iteration": 4.01330304145813 + }, + { + "auxiliary_loss_clip": 0.06434298, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5622125356981813, + "flos": 12755236901760.0, + "grad_norm": 2.011118504157059, + "language_loss": 0.78970456, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.86672854, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11834717, + "step": 9351, + "time_per_iteration": 2.502434015274048 + }, + { + "auxiliary_loss_clip": 0.06430681, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01252894, + "epoch": 0.5622726589508492, + "flos": 23812086597120.0, + "grad_norm": 1.4860121982116354, + "language_loss": 0.59339732, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.67035985, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 9352, + "time_per_iteration": 2.5574684143066406 + }, + { + "auxiliary_loss_clip": 0.06420172, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.062802, + "balance_loss_mlp": 0.01255041, + "epoch": 0.5623327822035172, + "flos": 24725248392960.0, + "grad_norm": 2.450009031651053, + "language_loss": 0.72177416, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.7986325, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 9353, + "time_per_iteration": 2.5429112911224365 + }, + { + "auxiliary_loss_clip": 0.06427602, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06280185, + "balance_loss_mlp": 0.01255207, + "epoch": 0.5623929054561851, + "flos": 14023081031040.0, + "grad_norm": 3.091375667054191, + "language_loss": 0.7687071, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.84564734, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 9354, + "time_per_iteration": 2.511843204498291 + }, + { + "auxiliary_loss_clip": 0.0643307, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.0628096, + "balance_loss_mlp": 0.01256672, + "epoch": 0.5624530287088532, + "flos": 20710707425280.0, + "grad_norm": 1.9243574999426976, + "language_loss": 0.72663665, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.80364901, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1149292, + "step": 9355, + "time_per_iteration": 2.5472323894500732 + }, + { + "auxiliary_loss_clip": 0.06422609, + "auxiliary_loss_mlp": 0.01264166, + "balance_loss_clip": 0.06276853, + "balance_loss_mlp": 0.01252638, + "epoch": 0.5625131519615211, + "flos": 21477401084160.0, + "grad_norm": 1.4661709593952188, + "language_loss": 0.73949313, + "learning_rate": 1.693344975084274e-06, + "loss": 0.81636083, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11535645, + "step": 9356, + "time_per_iteration": 2.5417375564575195 + }, + { + "auxiliary_loss_clip": 0.06421204, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.0627971, + "balance_loss_mlp": 0.01254043, + "epoch": 0.5625732752141891, + "flos": 18704023920000.0, + "grad_norm": 1.8811670281572186, + "language_loss": 0.83384252, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.9107098, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11480713, + "step": 9357, + "time_per_iteration": 2.4678521156311035 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06279635, + "balance_loss_mlp": 0.01255705, + "epoch": 0.562633398466857, + "flos": 16222492926720.0, + "grad_norm": 2.0645024289256293, + "language_loss": 0.7263062, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.80322981, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1060791, + "step": 9358, + "time_per_iteration": 2.5186126232147217 + }, + { + "auxiliary_loss_clip": 0.06416523, + "auxiliary_loss_mlp": 0.01266054, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.0125408, + "epoch": 0.562693521719525, + "flos": 22498485338880.0, + "grad_norm": 1.808809546066597, + "language_loss": 0.78313565, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.85996139, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11981201, + "step": 9359, + "time_per_iteration": 2.4950146675109863 + }, + { + "auxiliary_loss_clip": 0.06422278, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01254123, + "epoch": 0.562753644972193, + "flos": 25337088506880.0, + "grad_norm": 1.6393117198147682, + "language_loss": 0.70198202, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.77886516, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11920166, + "step": 9360, + "time_per_iteration": 2.5677337646484375 + }, + { + "auxiliary_loss_clip": 0.06333196, + "auxiliary_loss_mlp": 0.01259618, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01258209, + "epoch": 0.562813768224861, + "flos": 67410566231040.0, + "grad_norm": 0.7608015706194778, + "language_loss": 0.55599511, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.63192326, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.0140686, + "step": 9361, + "time_per_iteration": 3.047746419906616 + }, + { + "auxiliary_loss_clip": 0.06421309, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06278641, + "balance_loss_mlp": 0.01260271, + "epoch": 0.562873891477529, + "flos": 23337868014720.0, + "grad_norm": 1.4415772957289732, + "language_loss": 0.82031697, + "learning_rate": 1.691036046141018e-06, + "loss": 0.89723963, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10687256, + "step": 9362, + "time_per_iteration": 2.5085341930389404 + }, + { + "auxiliary_loss_clip": 0.06425183, + "auxiliary_loss_mlp": 0.01265052, + "balance_loss_clip": 0.06282046, + "balance_loss_mlp": 0.01254067, + "epoch": 0.5629340147301969, + "flos": 38482073475840.0, + "grad_norm": 1.5514506959778531, + "language_loss": 0.74991751, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.8268199, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10992432, + "step": 9363, + "time_per_iteration": 2.6483652591705322 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01257573, + "epoch": 0.5629941379828649, + "flos": 29249744509440.0, + "grad_norm": 1.527132274705304, + "language_loss": 0.82966727, + "learning_rate": 1.690266496731839e-06, + "loss": 0.90664279, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11962891, + "step": 9364, + "time_per_iteration": 2.585028648376465 + }, + { + "auxiliary_loss_clip": 0.06420554, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.01253207, + "epoch": 0.5630542612355328, + "flos": 19425882844800.0, + "grad_norm": 1.9441356766600106, + "language_loss": 0.65449685, + "learning_rate": 1.689881739637642e-06, + "loss": 0.7313447, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11022949, + "step": 9365, + "time_per_iteration": 2.5320210456848145 + }, + { + "auxiliary_loss_clip": 0.06432588, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06279749, + "balance_loss_mlp": 0.0125841, + "epoch": 0.5631143844882008, + "flos": 22271697463680.0, + "grad_norm": 2.4081978900655114, + "language_loss": 0.81779563, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.89482784, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12213135, + "step": 9366, + "time_per_iteration": 2.5602293014526367 + }, + { + "auxiliary_loss_clip": 0.06419416, + "auxiliary_loss_mlp": 0.01263434, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01253033, + "epoch": 0.5631745077408687, + "flos": 22971781526400.0, + "grad_norm": 1.4555155937951827, + "language_loss": 0.73903221, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.81586075, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10400391, + "step": 9367, + "time_per_iteration": 2.5222184658050537 + }, + { + "auxiliary_loss_clip": 0.0633425, + "auxiliary_loss_mlp": 0.01256933, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01255295, + "epoch": 0.5632346309935368, + "flos": 65101917409920.0, + "grad_norm": 0.6175920076853201, + "language_loss": 0.5334087, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.60932058, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.61669922, + "router_z_loss_mlp": 0.0164032, + "step": 9368, + "time_per_iteration": 3.3093104362487793 + }, + { + "auxiliary_loss_clip": 0.06421301, + "auxiliary_loss_mlp": 0.0127307, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01261757, + "epoch": 0.5632947542462047, + "flos": 23009572517760.0, + "grad_norm": 1.6075197920052449, + "language_loss": 0.69183493, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.76877862, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11315918, + "step": 9369, + "time_per_iteration": 2.5406625270843506 + }, + { + "auxiliary_loss_clip": 0.06420332, + "auxiliary_loss_mlp": 0.01269293, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01258105, + "epoch": 0.5633548774988727, + "flos": 30490530969600.0, + "grad_norm": 1.6779781841725052, + "language_loss": 0.76048809, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.83738434, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11175537, + "step": 9370, + "time_per_iteration": 2.591212272644043 + }, + { + "auxiliary_loss_clip": 0.06424968, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5634150007515406, + "flos": 18520938748800.0, + "grad_norm": 1.8374331787518619, + "language_loss": 0.76029092, + "learning_rate": 1.687573444537108e-06, + "loss": 0.83720207, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.12237549, + "step": 9371, + "time_per_iteration": 2.5327818393707275 + }, + { + "auxiliary_loss_clip": 0.06421979, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06277596, + "balance_loss_mlp": 0.01256739, + "epoch": 0.5634751240042086, + "flos": 19250679957120.0, + "grad_norm": 1.7360135917661768, + "language_loss": 0.762514, + "learning_rate": 1.687188770067285e-06, + "loss": 0.83941567, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11456299, + "step": 9372, + "time_per_iteration": 2.519404411315918 + }, + { + "auxiliary_loss_clip": 0.06422761, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255016, + "epoch": 0.5635352472568766, + "flos": 12025453766400.0, + "grad_norm": 1.884768041604824, + "language_loss": 0.71853095, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.79542208, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11334229, + "step": 9373, + "time_per_iteration": 2.5053837299346924 + }, + { + "auxiliary_loss_clip": 0.06422034, + "auxiliary_loss_mlp": 0.01268801, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01256367, + "epoch": 0.5635953705095446, + "flos": 21878092108800.0, + "grad_norm": 1.841933865019323, + "language_loss": 0.83263683, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.90954518, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12438965, + "step": 9374, + "time_per_iteration": 3.904900074005127 + }, + { + "auxiliary_loss_clip": 0.06420377, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06277412, + "balance_loss_mlp": 0.01256131, + "epoch": 0.5636554937622126, + "flos": 27133587244800.0, + "grad_norm": 2.5670866003984583, + "language_loss": 0.66696084, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.74383336, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10754395, + "step": 9375, + "time_per_iteration": 2.581921339035034 + }, + { + "auxiliary_loss_clip": 0.06426428, + "auxiliary_loss_mlp": 0.01265809, + "balance_loss_clip": 0.06279501, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5637156170148805, + "flos": 12930314008320.0, + "grad_norm": 12.279905367602915, + "language_loss": 0.81403673, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.89095908, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11474609, + "step": 9376, + "time_per_iteration": 2.5271008014678955 + }, + { + "auxiliary_loss_clip": 0.06430367, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06280433, + "balance_loss_mlp": 0.01253974, + "epoch": 0.5637757402675485, + "flos": 45561460435200.0, + "grad_norm": 1.3765625381603785, + "language_loss": 0.69569075, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.77264911, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1149292, + "step": 9377, + "time_per_iteration": 2.7878713607788086 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06278635, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5638358635202164, + "flos": 20892241296000.0, + "grad_norm": 1.4815499035204616, + "language_loss": 0.75006419, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.82690734, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10668945, + "step": 9378, + "time_per_iteration": 2.5742552280426025 + }, + { + "auxiliary_loss_clip": 0.06432593, + "auxiliary_loss_mlp": 0.01271419, + "balance_loss_clip": 0.06279133, + "balance_loss_mlp": 0.01258837, + "epoch": 0.5638959867728844, + "flos": 18812449503360.0, + "grad_norm": 2.3058329321149555, + "language_loss": 0.81874716, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.8957873, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12585449, + "step": 9379, + "time_per_iteration": 3.9022350311279297 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.01256933, + "epoch": 0.5639561100255523, + "flos": 27497703162240.0, + "grad_norm": 1.9515300720121755, + "language_loss": 0.71783185, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.79480064, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11859131, + "step": 9380, + "time_per_iteration": 2.6338086128234863 + }, + { + "auxiliary_loss_clip": 0.0642691, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06279925, + "balance_loss_mlp": 0.01254857, + "epoch": 0.5640162332782204, + "flos": 18082289024640.0, + "grad_norm": 2.0751114915079687, + "language_loss": 0.75207865, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.82901412, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11779785, + "step": 9381, + "time_per_iteration": 2.4637959003448486 + }, + { + "auxiliary_loss_clip": 0.06430316, + "auxiliary_loss_mlp": 0.01273879, + "balance_loss_clip": 0.06282466, + "balance_loss_mlp": 0.01262822, + "epoch": 0.5640763565308883, + "flos": 20890857703680.0, + "grad_norm": 2.2840815632275846, + "language_loss": 0.72823429, + "learning_rate": 1.683342680176499e-06, + "loss": 0.80527627, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 9382, + "time_per_iteration": 2.6038217544555664 + }, + { + "auxiliary_loss_clip": 0.0632898, + "auxiliary_loss_mlp": 0.01252773, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01251134, + "epoch": 0.5641364797835563, + "flos": 64467143205120.0, + "grad_norm": 0.7593633930380659, + "language_loss": 0.54457784, + "learning_rate": 1.682958136989022e-06, + "loss": 0.62039542, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01641846, + "step": 9383, + "time_per_iteration": 4.702574253082275 + }, + { + "auxiliary_loss_clip": 0.06430694, + "auxiliary_loss_mlp": 0.01271925, + "balance_loss_clip": 0.06278884, + "balance_loss_mlp": 0.01260129, + "epoch": 0.5641966030362242, + "flos": 18666861834240.0, + "grad_norm": 1.6723183303987958, + "language_loss": 0.71441197, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.79143822, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11798096, + "step": 9384, + "time_per_iteration": 2.4753105640411377 + }, + { + "auxiliary_loss_clip": 0.06421386, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.0627472, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5642567262888922, + "flos": 22498946536320.0, + "grad_norm": 1.9187169203117838, + "language_loss": 0.76415217, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.84103185, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1194458, + "step": 9385, + "time_per_iteration": 2.5245208740234375 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01255028, + "epoch": 0.5643168495415603, + "flos": 13008663175680.0, + "grad_norm": 1.914249541829808, + "language_loss": 0.82386243, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.90069962, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10748291, + "step": 9386, + "time_per_iteration": 2.4669172763824463 + }, + { + "auxiliary_loss_clip": 0.06427868, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277144, + "balance_loss_mlp": 0.01255014, + "epoch": 0.5643769727942282, + "flos": 18594256671360.0, + "grad_norm": 1.9656567849197715, + "language_loss": 0.70471108, + "learning_rate": 1.681420084607516e-06, + "loss": 0.78165275, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.112854, + "step": 9387, + "time_per_iteration": 2.5076122283935547 + }, + { + "auxiliary_loss_clip": 0.0642679, + "auxiliary_loss_mlp": 0.01267525, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01255348, + "epoch": 0.5644370960468962, + "flos": 33815343853440.0, + "grad_norm": 1.4623673546412521, + "language_loss": 0.75064629, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.82758939, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.12176514, + "step": 9388, + "time_per_iteration": 2.651616096496582 + }, + { + "auxiliary_loss_clip": 0.06417996, + "auxiliary_loss_mlp": 0.01267245, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256892, + "epoch": 0.5644972192995641, + "flos": 21221249552640.0, + "grad_norm": 1.4874039445981817, + "language_loss": 0.82212514, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.89897752, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10357666, + "step": 9389, + "time_per_iteration": 2.5609359741210938 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01270818, + "balance_loss_clip": 0.06278206, + "balance_loss_mlp": 0.01258468, + "epoch": 0.5645573425522321, + "flos": 18593585838720.0, + "grad_norm": 2.1560569688057036, + "language_loss": 0.64486635, + "learning_rate": 1.680266672116467e-06, + "loss": 0.72183776, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12329102, + "step": 9390, + "time_per_iteration": 3.8905534744262695 + }, + { + "auxiliary_loss_clip": 0.06417844, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5646174658049, + "flos": 18119660745600.0, + "grad_norm": 1.743379462466535, + "language_loss": 0.92393249, + "learning_rate": 1.6798822255153192e-06, + "loss": 1.00077093, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10662842, + "step": 9391, + "time_per_iteration": 2.4846012592315674 + }, + { + "auxiliary_loss_clip": 0.06426747, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06274952, + "balance_loss_mlp": 0.0125751, + "epoch": 0.564677589057568, + "flos": 28337547035520.0, + "grad_norm": 2.079245602273352, + "language_loss": 0.60616773, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.68313313, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12286377, + "step": 9392, + "time_per_iteration": 2.5709118843078613 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01266956, + "balance_loss_clip": 0.06274032, + "balance_loss_mlp": 0.01255619, + "epoch": 0.564737712310236, + "flos": 22170273696000.0, + "grad_norm": 2.32400153493691, + "language_loss": 0.81762815, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8944844, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11334229, + "step": 9393, + "time_per_iteration": 2.49820613861084 + }, + { + "auxiliary_loss_clip": 0.06420048, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01257434, + "epoch": 0.564797835562904, + "flos": 20965223802240.0, + "grad_norm": 1.8189771095125196, + "language_loss": 0.87738705, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.95427704, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11523438, + "step": 9394, + "time_per_iteration": 2.5385193824768066 + }, + { + "auxiliary_loss_clip": 0.06421189, + "auxiliary_loss_mlp": 0.01271733, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01261135, + "epoch": 0.5648579588155719, + "flos": 17425991520000.0, + "grad_norm": 1.7000053900358165, + "language_loss": 0.84579873, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.92272794, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1060791, + "step": 9395, + "time_per_iteration": 2.470017433166504 + }, + { + "auxiliary_loss_clip": 0.06326792, + "auxiliary_loss_mlp": 0.01253109, + "balance_loss_clip": 0.06265698, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5649180820682399, + "flos": 69951187152000.0, + "grad_norm": 0.7657809500788333, + "language_loss": 0.57918489, + "learning_rate": 1.677960174884597e-06, + "loss": 0.65498388, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.61132812, + "router_z_loss_mlp": 0.01573944, + "step": 9396, + "time_per_iteration": 3.1468727588653564 + }, + { + "auxiliary_loss_clip": 0.06423569, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5649782053209078, + "flos": 24980058259200.0, + "grad_norm": 1.9294071175656426, + "language_loss": 0.70135093, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.77826023, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11157227, + "step": 9397, + "time_per_iteration": 2.5551769733428955 + }, + { + "auxiliary_loss_clip": 0.06421924, + "auxiliary_loss_mlp": 0.01267113, + "balance_loss_clip": 0.06274733, + "balance_loss_mlp": 0.01256277, + "epoch": 0.5650383285735758, + "flos": 21733175272320.0, + "grad_norm": 3.1535749018048094, + "language_loss": 0.67165595, + "learning_rate": 1.67719144001275e-06, + "loss": 0.74854636, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10839844, + "step": 9398, + "time_per_iteration": 2.5690701007843018 + }, + { + "auxiliary_loss_clip": 0.06324084, + "auxiliary_loss_mlp": 0.01251867, + "balance_loss_clip": 0.06263297, + "balance_loss_mlp": 0.01250375, + "epoch": 0.5650984518262439, + "flos": 65923481093760.0, + "grad_norm": 0.7518933539640298, + "language_loss": 0.58143103, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.65719062, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01491547, + "step": 9399, + "time_per_iteration": 3.073493719100952 + }, + { + "auxiliary_loss_clip": 0.06425194, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06275368, + "balance_loss_mlp": 0.01257158, + "epoch": 0.5651585750789118, + "flos": 21038919068160.0, + "grad_norm": 2.9284187471842213, + "language_loss": 0.73483676, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.8117801, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.11987305, + "step": 9400, + "time_per_iteration": 2.5129287242889404 + }, + { + "auxiliary_loss_clip": 0.06431332, + "auxiliary_loss_mlp": 0.01270587, + "balance_loss_clip": 0.06281202, + "balance_loss_mlp": 0.01258267, + "epoch": 0.5652186983315798, + "flos": 18557891199360.0, + "grad_norm": 1.781312568353633, + "language_loss": 0.61062682, + "learning_rate": 1.676038429548412e-06, + "loss": 0.68764603, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12322998, + "step": 9401, + "time_per_iteration": 2.484562397003174 + }, + { + "auxiliary_loss_clip": 0.06419288, + "auxiliary_loss_mlp": 0.01272594, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01261859, + "epoch": 0.5652788215842477, + "flos": 18484573276800.0, + "grad_norm": 1.8682667341725439, + "language_loss": 0.81175613, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.88867497, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10736084, + "step": 9402, + "time_per_iteration": 2.5402467250823975 + }, + { + "auxiliary_loss_clip": 0.0641814, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.0125898, + "epoch": 0.5653389448369157, + "flos": 30051797391360.0, + "grad_norm": 1.3435358668606565, + "language_loss": 0.77710259, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.85398287, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10900879, + "step": 9403, + "time_per_iteration": 2.5728204250335693 + }, + { + "auxiliary_loss_clip": 0.06421928, + "auxiliary_loss_mlp": 0.01268633, + "balance_loss_clip": 0.06276687, + "balance_loss_mlp": 0.01257458, + "epoch": 0.5653990680895836, + "flos": 16733202762240.0, + "grad_norm": 1.6255859835861872, + "language_loss": 0.69364876, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.7705543, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11187744, + "step": 9404, + "time_per_iteration": 2.5076894760131836 + }, + { + "auxiliary_loss_clip": 0.06414986, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01258156, + "epoch": 0.5654591913422516, + "flos": 14543517939840.0, + "grad_norm": 1.937007916536723, + "language_loss": 0.6753332, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.75217164, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 9405, + "time_per_iteration": 2.4678986072540283 + }, + { + "auxiliary_loss_clip": 0.06417301, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.0627932, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5655193145949196, + "flos": 26216484307200.0, + "grad_norm": 1.7078210782531607, + "language_loss": 0.74488431, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.82174826, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10180664, + "step": 9406, + "time_per_iteration": 2.5344419479370117 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06274547, + "balance_loss_mlp": 0.01258101, + "epoch": 0.5655794378475876, + "flos": 25053669671040.0, + "grad_norm": 1.6572482823915473, + "language_loss": 0.80165344, + "learning_rate": 1.673732740698882e-06, + "loss": 0.87858582, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11901855, + "step": 9407, + "time_per_iteration": 2.5318515300750732 + }, + { + "auxiliary_loss_clip": 0.06414818, + "auxiliary_loss_mlp": 0.01281674, + "balance_loss_clip": 0.06276679, + "balance_loss_mlp": 0.01270641, + "epoch": 0.5656395611002555, + "flos": 31041379710720.0, + "grad_norm": 1.3106223538314048, + "language_loss": 0.71445584, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.79142082, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1104126, + "step": 9408, + "time_per_iteration": 2.6315321922302246 + }, + { + "auxiliary_loss_clip": 0.06416275, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06275165, + "balance_loss_mlp": 0.01262151, + "epoch": 0.5656996843529235, + "flos": 20235650302080.0, + "grad_norm": 1.8647463769564316, + "language_loss": 0.81496549, + "learning_rate": 1.672964276570308e-06, + "loss": 0.89185899, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.109375, + "step": 9409, + "time_per_iteration": 2.4874367713928223 + }, + { + "auxiliary_loss_clip": 0.06420213, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01261953, + "epoch": 0.5657598076055914, + "flos": 21002595523200.0, + "grad_norm": 1.5982364261864173, + "language_loss": 0.78488803, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.86182165, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11187744, + "step": 9410, + "time_per_iteration": 2.568018913269043 + }, + { + "auxiliary_loss_clip": 0.06420635, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06277133, + "balance_loss_mlp": 0.01258607, + "epoch": 0.5658199308582594, + "flos": 11550690132480.0, + "grad_norm": 1.9303419986806551, + "language_loss": 0.83679706, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.91369963, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11016846, + "step": 9411, + "time_per_iteration": 2.4616551399230957 + }, + { + "auxiliary_loss_clip": 0.06428169, + "auxiliary_loss_mlp": 0.01269272, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01256725, + "epoch": 0.5658800541109275, + "flos": 14177137962240.0, + "grad_norm": 2.370687982223235, + "language_loss": 0.67829227, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.75526661, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12548828, + "step": 9412, + "time_per_iteration": 2.5216641426086426 + }, + { + "auxiliary_loss_clip": 0.06415425, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06277046, + "balance_loss_mlp": 0.01258488, + "epoch": 0.5659401773635954, + "flos": 27311934660480.0, + "grad_norm": 1.581889394574198, + "language_loss": 0.58742762, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.6642642, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09741211, + "step": 9413, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.06415551, + "auxiliary_loss_mlp": 0.01265095, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01254294, + "epoch": 0.5660003006162634, + "flos": 16733957448960.0, + "grad_norm": 2.47913455673049, + "language_loss": 0.69196904, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.76877546, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10791016, + "step": 9414, + "time_per_iteration": 3.924028158187866 + }, + { + "auxiliary_loss_clip": 0.0641676, + "auxiliary_loss_mlp": 0.01269168, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01258475, + "epoch": 0.5660604238689313, + "flos": 21659983130880.0, + "grad_norm": 1.6269222060357784, + "language_loss": 0.78177273, + "learning_rate": 1.670659182280247e-06, + "loss": 0.85863203, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10693359, + "step": 9415, + "time_per_iteration": 2.5426433086395264 + }, + { + "auxiliary_loss_clip": 0.06321331, + "auxiliary_loss_mlp": 0.01255911, + "balance_loss_clip": 0.06260875, + "balance_loss_mlp": 0.01254426, + "epoch": 0.5661205471215993, + "flos": 68843619884160.0, + "grad_norm": 0.6697066651048145, + "language_loss": 0.48973382, + "learning_rate": 1.670275043523822e-06, + "loss": 0.56550622, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0148468, + "step": 9416, + "time_per_iteration": 3.2625491619110107 + }, + { + "auxiliary_loss_clip": 0.06421995, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06277312, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5661806703742672, + "flos": 28629393206400.0, + "grad_norm": 1.9136616805420137, + "language_loss": 0.63439846, + "learning_rate": 1.6698909172706e-06, + "loss": 0.7112996, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11706543, + "step": 9417, + "time_per_iteration": 2.5860400199890137 + }, + { + "auxiliary_loss_clip": 0.06423697, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5662407936269352, + "flos": 21404418577920.0, + "grad_norm": 2.3766145169256485, + "language_loss": 0.6936692, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.77059871, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.1184082, + "step": 9418, + "time_per_iteration": 3.955557346343994 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06276925, + "balance_loss_mlp": 0.01255261, + "epoch": 0.5663009168796032, + "flos": 25666054836480.0, + "grad_norm": 1.7349550199621107, + "language_loss": 0.65210938, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.72899818, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.12219238, + "step": 9419, + "time_per_iteration": 2.5426688194274902 + }, + { + "auxiliary_loss_clip": 0.06328249, + "auxiliary_loss_mlp": 0.01252694, + "balance_loss_clip": 0.06267616, + "balance_loss_mlp": 0.01251344, + "epoch": 0.5663610401322712, + "flos": 67953014835840.0, + "grad_norm": 0.7058455662611458, + "language_loss": 0.59640646, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.67221588, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01351929, + "step": 9420, + "time_per_iteration": 3.2174880504608154 + }, + { + "auxiliary_loss_clip": 0.064177, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255235, + "epoch": 0.5664211633849391, + "flos": 24616487393280.0, + "grad_norm": 1.6106095517088517, + "language_loss": 0.74370563, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.82053804, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10308838, + "step": 9421, + "time_per_iteration": 2.5415146350860596 + }, + { + "auxiliary_loss_clip": 0.06425875, + "auxiliary_loss_mlp": 0.0127111, + "balance_loss_clip": 0.0627939, + "balance_loss_mlp": 0.0125941, + "epoch": 0.5664812866376071, + "flos": 11652407389440.0, + "grad_norm": 1.8136120935488778, + "language_loss": 0.73536521, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.81233501, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11700439, + "step": 9422, + "time_per_iteration": 2.4822769165039062 + }, + { + "auxiliary_loss_clip": 0.06420115, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06278713, + "balance_loss_mlp": 0.01253355, + "epoch": 0.566541409890275, + "flos": 24650798440320.0, + "grad_norm": 1.7038149529307767, + "language_loss": 0.8178972, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.89473832, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10656738, + "step": 9423, + "time_per_iteration": 4.039041519165039 + }, + { + "auxiliary_loss_clip": 0.06420702, + "auxiliary_loss_mlp": 0.01272474, + "balance_loss_clip": 0.06276573, + "balance_loss_mlp": 0.01260392, + "epoch": 0.566601533142943, + "flos": 22276686781440.0, + "grad_norm": 2.1916345423108092, + "language_loss": 0.81182116, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.88875294, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.12091064, + "step": 9424, + "time_per_iteration": 2.6186363697052 + }, + { + "auxiliary_loss_clip": 0.06424181, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01254788, + "epoch": 0.5666616563956111, + "flos": 29979485717760.0, + "grad_norm": 1.8421028893936136, + "language_loss": 0.79108143, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.86799419, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1229248, + "step": 9425, + "time_per_iteration": 2.6103405952453613 + }, + { + "auxiliary_loss_clip": 0.06423585, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06280398, + "balance_loss_mlp": 0.01254958, + "epoch": 0.566721779648279, + "flos": 17786585566080.0, + "grad_norm": 1.8792171756054583, + "language_loss": 0.59002221, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.66692609, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11853027, + "step": 9426, + "time_per_iteration": 2.5017449855804443 + }, + { + "auxiliary_loss_clip": 0.06425668, + "auxiliary_loss_mlp": 0.01271587, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01259881, + "epoch": 0.566781902900947, + "flos": 21039967244160.0, + "grad_norm": 1.8634987355301997, + "language_loss": 0.82228333, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.89925593, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1171875, + "step": 9427, + "time_per_iteration": 2.565479040145874 + }, + { + "auxiliary_loss_clip": 0.06418218, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257853, + "epoch": 0.5668420261536149, + "flos": 23155244040960.0, + "grad_norm": 1.8170517561621367, + "language_loss": 0.86107284, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.93794018, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10662842, + "step": 9428, + "time_per_iteration": 2.5440726280212402 + }, + { + "auxiliary_loss_clip": 0.06425078, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01254549, + "epoch": 0.5669021494062829, + "flos": 22608210660480.0, + "grad_norm": 1.979218692390264, + "language_loss": 0.74058932, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.81750262, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11700439, + "step": 9429, + "time_per_iteration": 2.5536460876464844 + }, + { + "auxiliary_loss_clip": 0.06425272, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06277645, + "balance_loss_mlp": 0.01252943, + "epoch": 0.5669622726589508, + "flos": 17386481520000.0, + "grad_norm": 1.7940156011993331, + "language_loss": 0.75663137, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.8335309, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11724854, + "step": 9430, + "time_per_iteration": 3.9432384967803955 + }, + { + "auxiliary_loss_clip": 0.06418042, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01254498, + "epoch": 0.5670223959116188, + "flos": 18767992112640.0, + "grad_norm": 1.7725274526585868, + "language_loss": 0.73046589, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.80729836, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10705566, + "step": 9431, + "time_per_iteration": 2.4891881942749023 + }, + { + "auxiliary_loss_clip": 0.06413169, + "auxiliary_loss_mlp": 0.01269495, + "balance_loss_clip": 0.06278919, + "balance_loss_mlp": 0.0125907, + "epoch": 0.5670825191642868, + "flos": 13558463740800.0, + "grad_norm": 1.5232840780961514, + "language_loss": 0.7352109, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.81203753, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10424805, + "step": 9432, + "time_per_iteration": 2.539503812789917 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.0627542, + "balance_loss_mlp": 0.01254914, + "epoch": 0.5671426424169548, + "flos": 22060506447360.0, + "grad_norm": 1.4799006758092328, + "language_loss": 0.78516906, + "learning_rate": 1.663746609539197e-06, + "loss": 0.86202025, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11169434, + "step": 9433, + "time_per_iteration": 2.5004031658172607 + }, + { + "auxiliary_loss_clip": 0.06427075, + "auxiliary_loss_mlp": 0.01270712, + "balance_loss_clip": 0.06279536, + "balance_loss_mlp": 0.01257569, + "epoch": 0.5672027656696227, + "flos": 21330262114560.0, + "grad_norm": 1.7709414309866778, + "language_loss": 0.63719839, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.71417624, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.13134766, + "step": 9434, + "time_per_iteration": 2.5424575805664062 + }, + { + "auxiliary_loss_clip": 0.06413743, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274401, + "balance_loss_mlp": 0.01257188, + "epoch": 0.5672628889222907, + "flos": 23520869331840.0, + "grad_norm": 1.9335938837076005, + "language_loss": 0.66754067, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.74435163, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10162354, + "step": 9435, + "time_per_iteration": 2.5177414417266846 + }, + { + "auxiliary_loss_clip": 0.06416117, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.01257333, + "epoch": 0.5673230121749586, + "flos": 27128639854080.0, + "grad_norm": 1.3319121805553942, + "language_loss": 0.71799958, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.79484463, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11053467, + "step": 9436, + "time_per_iteration": 2.6037702560424805 + }, + { + "auxiliary_loss_clip": 0.06424177, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276658, + "balance_loss_mlp": 0.01254548, + "epoch": 0.5673831354276266, + "flos": 31150476126720.0, + "grad_norm": 1.399584944388347, + "language_loss": 0.7441892, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.82109791, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12145996, + "step": 9437, + "time_per_iteration": 2.5982627868652344 + }, + { + "auxiliary_loss_clip": 0.0642609, + "auxiliary_loss_mlp": 0.01270521, + "balance_loss_clip": 0.06280209, + "balance_loss_mlp": 0.01258719, + "epoch": 0.5674432586802945, + "flos": 27680662552320.0, + "grad_norm": 1.8153515221603815, + "language_loss": 0.61647224, + "learning_rate": 1.661827179985277e-06, + "loss": 0.69343835, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11798096, + "step": 9438, + "time_per_iteration": 2.6188385486602783 + }, + { + "auxiliary_loss_clip": 0.0642384, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01252935, + "epoch": 0.5675033819329626, + "flos": 26622458138880.0, + "grad_norm": 1.4984637138093548, + "language_loss": 0.75628054, + "learning_rate": 1.661443332486909e-06, + "loss": 0.83315879, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11053467, + "step": 9439, + "time_per_iteration": 2.5383174419403076 + }, + { + "auxiliary_loss_clip": 0.06420992, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06280455, + "balance_loss_mlp": 0.0125798, + "epoch": 0.5675635051856306, + "flos": 19104295674240.0, + "grad_norm": 1.7526345830300347, + "language_loss": 0.8402319, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.91714221, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1206665, + "step": 9440, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06425986, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06275898, + "balance_loss_mlp": 0.01255393, + "epoch": 0.5676236284382985, + "flos": 17572040386560.0, + "grad_norm": 2.304829714160468, + "language_loss": 0.75825876, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.83519483, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12231445, + "step": 9441, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.0641818, + "auxiliary_loss_mlp": 0.0126441, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5676837516909665, + "flos": 15958375257600.0, + "grad_norm": 1.9240949658540871, + "language_loss": 0.83086008, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.907686, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9442, + "time_per_iteration": 2.53488826751709 + }, + { + "auxiliary_loss_clip": 0.06416862, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279622, + "balance_loss_mlp": 0.01258543, + "epoch": 0.5677438749436344, + "flos": 18301739667840.0, + "grad_norm": 1.8387898612646743, + "language_loss": 0.74695265, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.82381237, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10571289, + "step": 9443, + "time_per_iteration": 2.4844577312469482 + }, + { + "auxiliary_loss_clip": 0.06418682, + "auxiliary_loss_mlp": 0.01270397, + "balance_loss_clip": 0.06275757, + "balance_loss_mlp": 0.01258947, + "epoch": 0.5678039981963025, + "flos": 17937120625920.0, + "grad_norm": 2.224999400227568, + "language_loss": 0.77901411, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.85590482, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11450195, + "step": 9444, + "time_per_iteration": 2.5525596141815186 + }, + { + "auxiliary_loss_clip": 0.06428226, + "auxiliary_loss_mlp": 0.01266607, + "balance_loss_clip": 0.06281613, + "balance_loss_mlp": 0.01255228, + "epoch": 0.5678641214489704, + "flos": 19322153089920.0, + "grad_norm": 1.7258632756557413, + "language_loss": 0.81218302, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.88913137, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11376953, + "step": 9445, + "time_per_iteration": 2.501241683959961 + }, + { + "auxiliary_loss_clip": 0.06419222, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06275924, + "balance_loss_mlp": 0.01255548, + "epoch": 0.5679242447016384, + "flos": 27759389063040.0, + "grad_norm": 1.2498061463372896, + "language_loss": 0.71243447, + "learning_rate": 1.658756760280259e-06, + "loss": 0.78928661, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10443115, + "step": 9446, + "time_per_iteration": 2.6276121139526367 + }, + { + "auxiliary_loss_clip": 0.06425235, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06276199, + "balance_loss_mlp": 0.01257277, + "epoch": 0.5679843679543063, + "flos": 23775888833280.0, + "grad_norm": 1.7407480451238082, + "language_loss": 0.73674792, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.81369138, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11828613, + "step": 9447, + "time_per_iteration": 2.5189285278320312 + }, + { + "auxiliary_loss_clip": 0.06428251, + "auxiliary_loss_mlp": 0.01269652, + "balance_loss_clip": 0.06280248, + "balance_loss_mlp": 0.01257272, + "epoch": 0.5680444912069743, + "flos": 25598732480640.0, + "grad_norm": 1.8734928972182148, + "language_loss": 0.75381124, + "learning_rate": 1.657989284462725e-06, + "loss": 0.83079028, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1239624, + "step": 9448, + "time_per_iteration": 2.5984859466552734 + }, + { + "auxiliary_loss_clip": 0.06428179, + "auxiliary_loss_mlp": 0.01269794, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01258415, + "epoch": 0.5681046144596422, + "flos": 23702528983680.0, + "grad_norm": 2.0524228921166556, + "language_loss": 0.76618403, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.84316373, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1137085, + "step": 9449, + "time_per_iteration": 2.515456438064575 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5681647377123102, + "flos": 28008161435520.0, + "grad_norm": 1.4260887566171934, + "language_loss": 0.74914038, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.82607877, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11425781, + "step": 9450, + "time_per_iteration": 2.5997612476348877 + }, + { + "auxiliary_loss_clip": 0.06425043, + "auxiliary_loss_mlp": 0.01263493, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01252526, + "epoch": 0.5682248609649782, + "flos": 22754427235200.0, + "grad_norm": 1.6712621343134006, + "language_loss": 0.66650134, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.74338675, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10974121, + "step": 9451, + "time_per_iteration": 2.5041069984436035 + }, + { + "auxiliary_loss_clip": 0.06437647, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06282589, + "balance_loss_mlp": 0.01255126, + "epoch": 0.5682849842176462, + "flos": 21295070599680.0, + "grad_norm": 1.8399857372619135, + "language_loss": 0.72354877, + "learning_rate": 1.656454488573026e-06, + "loss": 0.80061138, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1348877, + "step": 9452, + "time_per_iteration": 2.529772996902466 + }, + { + "auxiliary_loss_clip": 0.06419612, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5683451074703142, + "flos": 21147973557120.0, + "grad_norm": 1.3918203076927713, + "language_loss": 0.70862073, + "learning_rate": 1.656070822132428e-06, + "loss": 0.78546906, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11419678, + "step": 9453, + "time_per_iteration": 3.975252151489258 + }, + { + "auxiliary_loss_clip": 0.06420393, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255001, + "epoch": 0.5684052307229821, + "flos": 22350759390720.0, + "grad_norm": 1.7444047953592532, + "language_loss": 0.70346195, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.78032023, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10443115, + "step": 9454, + "time_per_iteration": 2.530397415161133 + }, + { + "auxiliary_loss_clip": 0.06417777, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06276377, + "balance_loss_mlp": 0.01255572, + "epoch": 0.5684653539756501, + "flos": 21805067675520.0, + "grad_norm": 2.3221034941278256, + "language_loss": 0.6090889, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.68592238, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10003662, + "step": 9455, + "time_per_iteration": 2.5284998416900635 + }, + { + "auxiliary_loss_clip": 0.06432047, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06281373, + "balance_loss_mlp": 0.01259144, + "epoch": 0.568525477228318, + "flos": 23005757157120.0, + "grad_norm": 1.7024948062012655, + "language_loss": 0.73315781, + "learning_rate": 1.6549199011198e-06, + "loss": 0.81018651, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11669922, + "step": 9456, + "time_per_iteration": 2.5266809463500977 + }, + { + "auxiliary_loss_clip": 0.06419168, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_clip": 0.06275652, + "balance_loss_mlp": 0.01254771, + "epoch": 0.568585600480986, + "flos": 21398045667840.0, + "grad_norm": 1.7476092517075434, + "language_loss": 0.77197653, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.84882128, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10534668, + "step": 9457, + "time_per_iteration": 2.6098482608795166 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01255969, + "epoch": 0.568645723733654, + "flos": 30015054576000.0, + "grad_norm": 1.8479320449106564, + "language_loss": 0.6697377, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.74666172, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11749268, + "step": 9458, + "time_per_iteration": 4.003401756286621 + }, + { + "auxiliary_loss_clip": 0.06424286, + "auxiliary_loss_mlp": 0.01264614, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01253295, + "epoch": 0.568705846986322, + "flos": 20418945108480.0, + "grad_norm": 2.1992346625709427, + "language_loss": 0.68311954, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.76000857, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11315918, + "step": 9459, + "time_per_iteration": 2.5213470458984375 + }, + { + "auxiliary_loss_clip": 0.06427266, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01253756, + "epoch": 0.5687659702389899, + "flos": 17462440846080.0, + "grad_norm": 2.588089844490271, + "language_loss": 0.77003014, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.84695148, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11102295, + "step": 9460, + "time_per_iteration": 2.5016860961914062 + }, + { + "auxiliary_loss_clip": 0.06424034, + "auxiliary_loss_mlp": 0.01270464, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258865, + "epoch": 0.5688260934916579, + "flos": 25412335073280.0, + "grad_norm": 1.5686079353810067, + "language_loss": 0.72504562, + "learning_rate": 1.65300196133547e-06, + "loss": 0.80199063, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9461, + "time_per_iteration": 2.652650833129883 + }, + { + "auxiliary_loss_clip": 0.06420281, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06276302, + "balance_loss_mlp": 0.01254707, + "epoch": 0.5688862167443258, + "flos": 21613052044800.0, + "grad_norm": 1.8456676032626356, + "language_loss": 0.73588586, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.81274414, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10839844, + "step": 9462, + "time_per_iteration": 3.9915239810943604 + }, + { + "auxiliary_loss_clip": 0.06414893, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06275715, + "balance_loss_mlp": 0.01254715, + "epoch": 0.5689463399969938, + "flos": 22425544759680.0, + "grad_norm": 2.0067901163228212, + "language_loss": 0.72924364, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.80604076, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10107422, + "step": 9463, + "time_per_iteration": 2.5026743412017822 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01266249, + "balance_loss_clip": 0.06272251, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5690064632496618, + "flos": 18302787843840.0, + "grad_norm": 1.7796234570298675, + "language_loss": 0.7436375, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.82046998, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11126709, + "step": 9464, + "time_per_iteration": 2.5418522357940674 + }, + { + "auxiliary_loss_clip": 0.06420638, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01253169, + "epoch": 0.5690665865023298, + "flos": 21585575105280.0, + "grad_norm": 1.531985348456469, + "language_loss": 0.84518385, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.92204237, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12060547, + "step": 9465, + "time_per_iteration": 2.501640558242798 + }, + { + "auxiliary_loss_clip": 0.06416291, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01251954, + "epoch": 0.5691267097549978, + "flos": 24427616290560.0, + "grad_norm": 1.5399864144711508, + "language_loss": 0.72636294, + "learning_rate": 1.651084350506125e-06, + "loss": 0.80315161, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10638428, + "step": 9466, + "time_per_iteration": 2.5872812271118164 + }, + { + "auxiliary_loss_clip": 0.06322309, + "auxiliary_loss_mlp": 0.01252779, + "balance_loss_clip": 0.06261392, + "balance_loss_mlp": 0.01251253, + "epoch": 0.5691868330076657, + "flos": 61679915389440.0, + "grad_norm": 0.706168287542021, + "language_loss": 0.55225098, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.62800181, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01525879, + "step": 9467, + "time_per_iteration": 3.1809115409851074 + }, + { + "auxiliary_loss_clip": 0.06421535, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06275938, + "balance_loss_mlp": 0.01253471, + "epoch": 0.5692469562603337, + "flos": 21331687633920.0, + "grad_norm": 1.821723086609738, + "language_loss": 0.64103729, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.717906, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11865234, + "step": 9468, + "time_per_iteration": 2.5419483184814453 + }, + { + "auxiliary_loss_clip": 0.06420718, + "auxiliary_loss_mlp": 0.01268612, + "balance_loss_clip": 0.06276828, + "balance_loss_mlp": 0.01257508, + "epoch": 0.5693070795130016, + "flos": 23374652757120.0, + "grad_norm": 2.0216455322076885, + "language_loss": 0.79510915, + "learning_rate": 1.64993394266317e-06, + "loss": 0.87200236, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11102295, + "step": 9469, + "time_per_iteration": 3.974965810775757 + }, + { + "auxiliary_loss_clip": 0.06424933, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275818, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5693672027656697, + "flos": 18703143452160.0, + "grad_norm": 1.8253898689046395, + "language_loss": 0.69934285, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.77626961, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11608887, + "step": 9470, + "time_per_iteration": 2.490144729614258 + }, + { + "auxiliary_loss_clip": 0.06418116, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254391, + "epoch": 0.5694273260183376, + "flos": 20455478288640.0, + "grad_norm": 2.1472118271494574, + "language_loss": 0.75247335, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.82931614, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11785889, + "step": 9471, + "time_per_iteration": 2.5518500804901123 + }, + { + "auxiliary_loss_clip": 0.06417546, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01255616, + "epoch": 0.5694874492710056, + "flos": 17608992837120.0, + "grad_norm": 1.6827496814774499, + "language_loss": 0.57877314, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.65561181, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10705566, + "step": 9472, + "time_per_iteration": 2.535846710205078 + }, + { + "auxiliary_loss_clip": 0.06416848, + "auxiliary_loss_mlp": 0.01268789, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.01257411, + "epoch": 0.5695475725236735, + "flos": 13375923621120.0, + "grad_norm": 1.7815747768820038, + "language_loss": 0.73987466, + "learning_rate": 1.648400251450638e-06, + "loss": 0.81673104, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11376953, + "step": 9473, + "time_per_iteration": 2.4858133792877197 + }, + { + "auxiliary_loss_clip": 0.06327727, + "auxiliary_loss_mlp": 0.01252353, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01250914, + "epoch": 0.5696076957763415, + "flos": 68195078881920.0, + "grad_norm": 0.6484051468543478, + "language_loss": 0.57388628, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.64968711, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01437378, + "step": 9474, + "time_per_iteration": 3.1554436683654785 + }, + { + "auxiliary_loss_clip": 0.06415011, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06274811, + "balance_loss_mlp": 0.01254111, + "epoch": 0.5696678190290094, + "flos": 33846636153600.0, + "grad_norm": 1.6105466561987234, + "language_loss": 0.54358017, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.62037987, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 9475, + "time_per_iteration": 2.6193020343780518 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.01252234, + "epoch": 0.5697279422816774, + "flos": 26363329787520.0, + "grad_norm": 2.008545727860435, + "language_loss": 0.79765999, + "learning_rate": 1.647250122983675e-06, + "loss": 0.87448931, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11383057, + "step": 9476, + "time_per_iteration": 2.543100595474243 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01260209, + "epoch": 0.5697880655343454, + "flos": 22937260844160.0, + "grad_norm": 1.735529425276041, + "language_loss": 0.66121185, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.73820853, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11407471, + "step": 9477, + "time_per_iteration": 2.5366005897521973 + }, + { + "auxiliary_loss_clip": 0.06423311, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06277082, + "balance_loss_mlp": 0.0125553, + "epoch": 0.5698481887870134, + "flos": 26768674713600.0, + "grad_norm": 1.6190739346076362, + "language_loss": 0.71115196, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.78804839, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1081543, + "step": 9478, + "time_per_iteration": 2.5513012409210205 + }, + { + "auxiliary_loss_clip": 0.06415288, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06277218, + "balance_loss_mlp": 0.01255718, + "epoch": 0.5699083120396814, + "flos": 15747729292800.0, + "grad_norm": 1.4794360727515914, + "language_loss": 0.69306439, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.76988363, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10913086, + "step": 9479, + "time_per_iteration": 2.5828471183776855 + }, + { + "auxiliary_loss_clip": 0.06413876, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.0125734, + "epoch": 0.5699684352923493, + "flos": 19543448522880.0, + "grad_norm": 1.5013072139655574, + "language_loss": 0.71621788, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.79303229, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10223389, + "step": 9480, + "time_per_iteration": 2.5247299671173096 + }, + { + "auxiliary_loss_clip": 0.06418922, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01255319, + "epoch": 0.5700285585450173, + "flos": 16258942252800.0, + "grad_norm": 4.885605743124815, + "language_loss": 0.72444856, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.80130869, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11779785, + "step": 9481, + "time_per_iteration": 2.508589506149292 + }, + { + "auxiliary_loss_clip": 0.06421519, + "auxiliary_loss_mlp": 0.01270221, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01258461, + "epoch": 0.5700886817976852, + "flos": 19871115114240.0, + "grad_norm": 1.897422682992244, + "language_loss": 0.78625083, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.86316824, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11761475, + "step": 9482, + "time_per_iteration": 2.5139269828796387 + }, + { + "auxiliary_loss_clip": 0.06417527, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01253242, + "epoch": 0.5701488050503533, + "flos": 23848452069120.0, + "grad_norm": 2.496783055499815, + "language_loss": 0.78338385, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.86019731, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 9483, + "time_per_iteration": 2.547522783279419 + }, + { + "auxiliary_loss_clip": 0.06420138, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06276282, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5702089283030212, + "flos": 23666457000960.0, + "grad_norm": 1.5289248173251733, + "language_loss": 0.81642497, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.89326739, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10424805, + "step": 9484, + "time_per_iteration": 2.546597719192505 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06277504, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5702690515556892, + "flos": 27898519968000.0, + "grad_norm": 1.8682928794178455, + "language_loss": 0.61101806, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.68790221, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11560059, + "step": 9485, + "time_per_iteration": 2.5931575298309326 + }, + { + "auxiliary_loss_clip": 0.06421611, + "auxiliary_loss_mlp": 0.01267401, + "balance_loss_clip": 0.06277725, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5703291748083571, + "flos": 24030698699520.0, + "grad_norm": 1.7282499785723824, + "language_loss": 0.65970731, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.73659742, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11645508, + "step": 9486, + "time_per_iteration": 2.546604871749878 + }, + { + "auxiliary_loss_clip": 0.06330933, + "auxiliary_loss_mlp": 0.01257137, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01255769, + "epoch": 0.5703892980610251, + "flos": 57044478067200.0, + "grad_norm": 0.6556389442355417, + "language_loss": 0.47978726, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.55566794, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01370239, + "step": 9487, + "time_per_iteration": 3.216449499130249 + }, + { + "auxiliary_loss_clip": 0.06419921, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.06275571, + "balance_loss_mlp": 0.01255212, + "epoch": 0.570449421313693, + "flos": 24357610604160.0, + "grad_norm": 1.4009858057112485, + "language_loss": 0.8597424, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.93660462, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11102295, + "step": 9488, + "time_per_iteration": 2.5608506202697754 + }, + { + "auxiliary_loss_clip": 0.06428364, + "auxiliary_loss_mlp": 0.01270308, + "balance_loss_clip": 0.06281118, + "balance_loss_mlp": 0.01259055, + "epoch": 0.570509544566361, + "flos": 24835770328320.0, + "grad_norm": 1.8825828159705935, + "language_loss": 0.79195142, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.86893809, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11260986, + "step": 9489, + "time_per_iteration": 2.553471088409424 + }, + { + "auxiliary_loss_clip": 0.06419341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06276694, + "balance_loss_mlp": 0.01253646, + "epoch": 0.570569667819029, + "flos": 21403663891200.0, + "grad_norm": 1.6360729178743676, + "language_loss": 0.7047472, + "learning_rate": 1.641884454927604e-06, + "loss": 0.78158057, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10345459, + "step": 9490, + "time_per_iteration": 2.5905275344848633 + }, + { + "auxiliary_loss_clip": 0.06421432, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06279342, + "balance_loss_mlp": 0.01257803, + "epoch": 0.570629791071697, + "flos": 23222608323840.0, + "grad_norm": 1.4492809017584538, + "language_loss": 0.76252091, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.83942628, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11291504, + "step": 9491, + "time_per_iteration": 2.523472309112549 + }, + { + "auxiliary_loss_clip": 0.06328943, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06268945, + "balance_loss_mlp": 0.01261694, + "epoch": 0.570689914324365, + "flos": 65303632915200.0, + "grad_norm": 0.7890932915341226, + "language_loss": 0.57371008, + "learning_rate": 1.641118147266011e-06, + "loss": 0.64963466, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01817322, + "step": 9492, + "time_per_iteration": 4.556811571121216 + }, + { + "auxiliary_loss_clip": 0.06420883, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.0627829, + "balance_loss_mlp": 0.01255809, + "epoch": 0.5707500375770329, + "flos": 21148225119360.0, + "grad_norm": 2.4823752626433357, + "language_loss": 0.71714401, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.79402137, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.1104126, + "step": 9493, + "time_per_iteration": 2.5404999256134033 + }, + { + "auxiliary_loss_clip": 0.06425234, + "auxiliary_loss_mlp": 0.01270244, + "balance_loss_clip": 0.06277438, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5708101608297009, + "flos": 20818881446400.0, + "grad_norm": 1.6649189140980358, + "language_loss": 0.77940559, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.85636032, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11431885, + "step": 9494, + "time_per_iteration": 2.5486340522766113 + }, + { + "auxiliary_loss_clip": 0.06427161, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276955, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5708702840823688, + "flos": 25819482862080.0, + "grad_norm": 2.058789415113096, + "language_loss": 0.80377084, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.88071406, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12127686, + "step": 9495, + "time_per_iteration": 2.5960187911987305 + }, + { + "auxiliary_loss_clip": 0.06429706, + "auxiliary_loss_mlp": 0.01275013, + "balance_loss_clip": 0.06277497, + "balance_loss_mlp": 0.01261567, + "epoch": 0.5709304073350369, + "flos": 23657400760320.0, + "grad_norm": 1.9375866549540641, + "language_loss": 0.66475153, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.74179876, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13446045, + "step": 9496, + "time_per_iteration": 2.536844253540039 + }, + { + "auxiliary_loss_clip": 0.06424591, + "auxiliary_loss_mlp": 0.0126837, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256485, + "epoch": 0.5709905305877048, + "flos": 16113144948480.0, + "grad_norm": 2.1097086993227068, + "language_loss": 0.70119512, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.77812475, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11877441, + "step": 9497, + "time_per_iteration": 2.5001566410064697 + }, + { + "auxiliary_loss_clip": 0.06421457, + "auxiliary_loss_mlp": 0.01273203, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01261455, + "epoch": 0.5710506538403728, + "flos": 24757211525760.0, + "grad_norm": 5.203790092819982, + "language_loss": 0.81695306, + "learning_rate": 1.638819551358182e-06, + "loss": 0.89389962, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11755371, + "step": 9498, + "time_per_iteration": 3.979785203933716 + }, + { + "auxiliary_loss_clip": 0.06421061, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.0125707, + "epoch": 0.5711107770930407, + "flos": 21988907533440.0, + "grad_norm": 1.778867640796668, + "language_loss": 0.66763413, + "learning_rate": 1.638436499891469e-06, + "loss": 0.74453306, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 9499, + "time_per_iteration": 2.560131788253784 + }, + { + "auxiliary_loss_clip": 0.06422064, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255432, + "epoch": 0.5711709003457087, + "flos": 19580233265280.0, + "grad_norm": 1.5461706893268885, + "language_loss": 0.71884078, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.79573303, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11743164, + "step": 9500, + "time_per_iteration": 2.51857852935791 + }, + { + "auxiliary_loss_clip": 0.06426705, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06277592, + "balance_loss_mlp": 0.01257893, + "epoch": 0.5712310235983766, + "flos": 24249436583040.0, + "grad_norm": 1.9132916799477426, + "language_loss": 0.76773643, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.8447088, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.12640381, + "step": 9501, + "time_per_iteration": 2.585303544998169 + }, + { + "auxiliary_loss_clip": 0.06424866, + "auxiliary_loss_mlp": 0.01265647, + "balance_loss_clip": 0.06278552, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5712911468510447, + "flos": 21002469742080.0, + "grad_norm": 1.6366629976038132, + "language_loss": 0.75004148, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.82694662, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11419678, + "step": 9502, + "time_per_iteration": 3.9893364906311035 + }, + { + "auxiliary_loss_clip": 0.06420161, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0627653, + "balance_loss_mlp": 0.01256561, + "epoch": 0.5713512701037126, + "flos": 18923055292800.0, + "grad_norm": 1.7156142062685982, + "language_loss": 0.82350051, + "learning_rate": 1.636904431275105e-06, + "loss": 0.90037596, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10827637, + "step": 9503, + "time_per_iteration": 2.5289459228515625 + }, + { + "auxiliary_loss_clip": 0.06420251, + "auxiliary_loss_mlp": 0.01271521, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5714113933563806, + "flos": 17417983455360.0, + "grad_norm": 2.1350982520901827, + "language_loss": 0.86264861, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.93956631, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1114502, + "step": 9504, + "time_per_iteration": 2.5180015563964844 + }, + { + "auxiliary_loss_clip": 0.06417073, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5714715166090486, + "flos": 20199536392320.0, + "grad_norm": 2.0316869593340265, + "language_loss": 0.75480437, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.83164144, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10980225, + "step": 9505, + "time_per_iteration": 2.497009754180908 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256419, + "epoch": 0.5715316398617165, + "flos": 18557597710080.0, + "grad_norm": 1.6474042198541896, + "language_loss": 0.82215714, + "learning_rate": 1.635755524332509e-06, + "loss": 0.89902395, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1151123, + "step": 9506, + "time_per_iteration": 2.5657498836517334 + }, + { + "auxiliary_loss_clip": 0.06418438, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01254568, + "epoch": 0.5715917631143845, + "flos": 18484028225280.0, + "grad_norm": 1.482727560680873, + "language_loss": 0.77285796, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.84969354, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10546875, + "step": 9507, + "time_per_iteration": 2.485496997833252 + }, + { + "auxiliary_loss_clip": 0.06422855, + "auxiliary_loss_mlp": 0.01269089, + "balance_loss_clip": 0.06276034, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5716518863670524, + "flos": 24026128652160.0, + "grad_norm": 1.4323391248104125, + "language_loss": 0.68799454, + "learning_rate": 1.63498965540751e-06, + "loss": 0.76491398, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12005615, + "step": 9508, + "time_per_iteration": 2.5643258094787598 + }, + { + "auxiliary_loss_clip": 0.06422228, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.012529, + "epoch": 0.5717120096197205, + "flos": 17824879681920.0, + "grad_norm": 2.05386002816889, + "language_loss": 0.80054557, + "learning_rate": 1.634606741699593e-06, + "loss": 0.87741685, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11987305, + "step": 9509, + "time_per_iteration": 3.8947436809539795 + }, + { + "auxiliary_loss_clip": 0.06415324, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5717721328723884, + "flos": 21871551490560.0, + "grad_norm": 1.798702817725972, + "language_loss": 0.72265553, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.79946876, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9510, + "time_per_iteration": 2.496246099472046 + }, + { + "auxiliary_loss_clip": 0.06419715, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.0627699, + "balance_loss_mlp": 0.01255946, + "epoch": 0.5718322561250564, + "flos": 28444924442880.0, + "grad_norm": 1.3126461366590796, + "language_loss": 0.69652188, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.77338743, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10906982, + "step": 9511, + "time_per_iteration": 2.5713541507720947 + }, + { + "auxiliary_loss_clip": 0.06420782, + "auxiliary_loss_mlp": 0.01268426, + "balance_loss_clip": 0.06277648, + "balance_loss_mlp": 0.01257136, + "epoch": 0.5718923793777243, + "flos": 13556702805120.0, + "grad_norm": 2.0681515910732715, + "language_loss": 0.61827439, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.69516647, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.112854, + "step": 9512, + "time_per_iteration": 2.49580454826355 + }, + { + "auxiliary_loss_clip": 0.06421502, + "auxiliary_loss_mlp": 0.01268423, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01257498, + "epoch": 0.5719525026303923, + "flos": 17827856501760.0, + "grad_norm": 2.3676523534955685, + "language_loss": 0.76396298, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.84086221, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10919189, + "step": 9513, + "time_per_iteration": 2.500870704650879 + }, + { + "auxiliary_loss_clip": 0.06326592, + "auxiliary_loss_mlp": 0.01253708, + "balance_loss_clip": 0.06266873, + "balance_loss_mlp": 0.01252076, + "epoch": 0.5720126258830602, + "flos": 61314724097280.0, + "grad_norm": 0.891161207726192, + "language_loss": 0.66879886, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.74460191, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01634216, + "step": 9514, + "time_per_iteration": 3.1455137729644775 + }, + { + "auxiliary_loss_clip": 0.06430741, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01258941, + "epoch": 0.5720727491357283, + "flos": 23994878279040.0, + "grad_norm": 2.149685980416527, + "language_loss": 0.81938076, + "learning_rate": 1.63230955093099e-06, + "loss": 0.89639759, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12005615, + "step": 9515, + "time_per_iteration": 2.5996580123901367 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01267881, + "balance_loss_clip": 0.06274894, + "balance_loss_mlp": 0.01257259, + "epoch": 0.5721328723883962, + "flos": 23412359894400.0, + "grad_norm": 1.6126279146943563, + "language_loss": 0.86095083, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.93775928, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10620117, + "step": 9516, + "time_per_iteration": 2.5553810596466064 + }, + { + "auxiliary_loss_clip": 0.06417726, + "auxiliary_loss_mlp": 0.01271814, + "balance_loss_clip": 0.06275768, + "balance_loss_mlp": 0.01260572, + "epoch": 0.5721929956410642, + "flos": 18810520859520.0, + "grad_norm": 2.197571780359881, + "language_loss": 0.87770617, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.95460165, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11242676, + "step": 9517, + "time_per_iteration": 2.5858652591705322 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5722531188937322, + "flos": 27203676785280.0, + "grad_norm": 1.5341934137919409, + "language_loss": 0.85065883, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.92748272, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11273193, + "step": 9518, + "time_per_iteration": 2.5850136280059814 + }, + { + "auxiliary_loss_clip": 0.06417416, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01256044, + "epoch": 0.5723132421464001, + "flos": 15201157109760.0, + "grad_norm": 1.5672659775495308, + "language_loss": 0.78797317, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.86481655, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 9519, + "time_per_iteration": 2.5459818840026855 + }, + { + "auxiliary_loss_clip": 0.06418845, + "auxiliary_loss_mlp": 0.01271535, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01260675, + "epoch": 0.5723733653990681, + "flos": 27606757651200.0, + "grad_norm": 1.4075514987328583, + "language_loss": 0.83134615, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.90824991, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10864258, + "step": 9520, + "time_per_iteration": 2.66892671585083 + }, + { + "auxiliary_loss_clip": 0.06426139, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06279796, + "balance_loss_mlp": 0.01253022, + "epoch": 0.572433488651736, + "flos": 18228673307520.0, + "grad_norm": 1.9996427544433133, + "language_loss": 0.73064411, + "learning_rate": 1.630012862105243e-06, + "loss": 0.80754966, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11383057, + "step": 9521, + "time_per_iteration": 2.5980701446533203 + }, + { + "auxiliary_loss_clip": 0.06419297, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06276461, + "balance_loss_mlp": 0.01259073, + "epoch": 0.5724936119044041, + "flos": 31257224628480.0, + "grad_norm": 1.5867052207792396, + "language_loss": 0.77991247, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.85680634, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11022949, + "step": 9522, + "time_per_iteration": 2.5890755653381348 + }, + { + "auxiliary_loss_clip": 0.06416851, + "auxiliary_loss_mlp": 0.01267889, + "balance_loss_clip": 0.06278282, + "balance_loss_mlp": 0.01257649, + "epoch": 0.572553735157072, + "flos": 19207186888320.0, + "grad_norm": 1.441878230551161, + "language_loss": 0.72110128, + "learning_rate": 1.629247411248102e-06, + "loss": 0.79794878, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10235596, + "step": 9523, + "time_per_iteration": 2.511115789413452 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01257025, + "epoch": 0.57261385840974, + "flos": 21221249552640.0, + "grad_norm": 1.7953059857975224, + "language_loss": 0.70372975, + "learning_rate": 1.628864706900738e-06, + "loss": 0.78058219, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10339355, + "step": 9524, + "time_per_iteration": 2.507387161254883 + }, + { + "auxiliary_loss_clip": 0.0641823, + "auxiliary_loss_mlp": 0.0127028, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01259188, + "epoch": 0.5726739816624079, + "flos": 33992936582400.0, + "grad_norm": 1.3727338087163001, + "language_loss": 0.6519655, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.7288506, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11096191, + "step": 9525, + "time_per_iteration": 2.6264822483062744 + }, + { + "auxiliary_loss_clip": 0.0641274, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06272839, + "balance_loss_mlp": 0.01255842, + "epoch": 0.5727341049150759, + "flos": 24282196329600.0, + "grad_norm": 1.6388418597669483, + "language_loss": 0.72797775, + "learning_rate": 1.628099340440984e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10351562, + "step": 9526, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.06418388, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01257897, + "epoch": 0.5727942281677438, + "flos": 28407762357120.0, + "grad_norm": 1.5546981496666945, + "language_loss": 0.80170763, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.87857693, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10650635, + "step": 9527, + "time_per_iteration": 2.6143245697021484 + }, + { + "auxiliary_loss_clip": 0.06413873, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06275712, + "balance_loss_mlp": 0.01258983, + "epoch": 0.5728543514204119, + "flos": 19542861544320.0, + "grad_norm": 2.5128112924339585, + "language_loss": 0.72641492, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.8032524, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10894775, + "step": 9528, + "time_per_iteration": 2.4896552562713623 + }, + { + "auxiliary_loss_clip": 0.06418886, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.0125577, + "epoch": 0.5729144746730798, + "flos": 21513137650560.0, + "grad_norm": 1.7938485336826149, + "language_loss": 0.85978115, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.93664181, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11407471, + "step": 9529, + "time_per_iteration": 2.539447784423828 + }, + { + "auxiliary_loss_clip": 0.063314, + "auxiliary_loss_mlp": 0.01256121, + "balance_loss_clip": 0.06271826, + "balance_loss_mlp": 0.0125448, + "epoch": 0.5729745979257478, + "flos": 58699638495360.0, + "grad_norm": 0.750499003321047, + "language_loss": 0.55969286, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.63556802, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01643372, + "step": 9530, + "time_per_iteration": 3.007678747177124 + }, + { + "auxiliary_loss_clip": 0.06425051, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01254276, + "epoch": 0.5730347211784158, + "flos": 18558100834560.0, + "grad_norm": 1.9102815745402744, + "language_loss": 0.66843903, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.74534607, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1137085, + "step": 9531, + "time_per_iteration": 3.9059529304504395 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06274498, + "balance_loss_mlp": 0.01257966, + "epoch": 0.5730948444310837, + "flos": 38040069588480.0, + "grad_norm": 1.9862057863273674, + "language_loss": 0.75881588, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.83567762, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11169434, + "step": 9532, + "time_per_iteration": 2.640389919281006 + }, + { + "auxiliary_loss_clip": 0.06421025, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06278558, + "balance_loss_mlp": 0.01255794, + "epoch": 0.5731549676837517, + "flos": 25233861876480.0, + "grad_norm": 1.2592580925122039, + "language_loss": 0.79252976, + "learning_rate": 1.625421002822686e-06, + "loss": 0.86941075, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11291504, + "step": 9533, + "time_per_iteration": 2.559293508529663 + }, + { + "auxiliary_loss_clip": 0.06417587, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278279, + "balance_loss_mlp": 0.01256067, + "epoch": 0.5732150909364196, + "flos": 23375030100480.0, + "grad_norm": 3.634749275276224, + "language_loss": 0.8597486, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.93658984, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10467529, + "step": 9534, + "time_per_iteration": 2.539487838745117 + }, + { + "auxiliary_loss_clip": 0.06421855, + "auxiliary_loss_mlp": 0.01269069, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01257625, + "epoch": 0.5732752141890877, + "flos": 23086621946880.0, + "grad_norm": 1.944302626791885, + "language_loss": 0.75668436, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.83359355, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9535, + "time_per_iteration": 2.5488839149475098 + }, + { + "auxiliary_loss_clip": 0.06425361, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256288, + "epoch": 0.5733353374417556, + "flos": 24359078050560.0, + "grad_norm": 1.5155376410848522, + "language_loss": 0.71395552, + "learning_rate": 1.624273356614346e-06, + "loss": 0.79089081, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 9536, + "time_per_iteration": 2.553239345550537 + }, + { + "auxiliary_loss_clip": 0.06416988, + "auxiliary_loss_mlp": 0.01269432, + "balance_loss_clip": 0.06275923, + "balance_loss_mlp": 0.01258244, + "epoch": 0.5733954606944236, + "flos": 27206234334720.0, + "grad_norm": 1.742372783929404, + "language_loss": 0.70031548, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.77717972, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11187744, + "step": 9537, + "time_per_iteration": 2.5490598678588867 + }, + { + "auxiliary_loss_clip": 0.06419763, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01257317, + "epoch": 0.5734555839470915, + "flos": 28772339472000.0, + "grad_norm": 2.334146865026381, + "language_loss": 0.63052773, + "learning_rate": 1.623508330355902e-06, + "loss": 0.70740581, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10723877, + "step": 9538, + "time_per_iteration": 4.013959169387817 + }, + { + "auxiliary_loss_clip": 0.0641904, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06277157, + "balance_loss_mlp": 0.0126136, + "epoch": 0.5735157071997595, + "flos": 22973542462080.0, + "grad_norm": 1.806157803076428, + "language_loss": 0.82720077, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.90412778, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.12310791, + "step": 9539, + "time_per_iteration": 2.554189682006836 + }, + { + "auxiliary_loss_clip": 0.06422378, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06276339, + "balance_loss_mlp": 0.01253115, + "epoch": 0.5735758304524274, + "flos": 18995450820480.0, + "grad_norm": 2.0055639259958107, + "language_loss": 0.73150325, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.80837095, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11279297, + "step": 9540, + "time_per_iteration": 2.500077486038208 + }, + { + "auxiliary_loss_clip": 0.0641907, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01255039, + "epoch": 0.5736359537050955, + "flos": 28404701683200.0, + "grad_norm": 2.024476848130698, + "language_loss": 0.80249465, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.87934107, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10528564, + "step": 9541, + "time_per_iteration": 4.051165342330933 + }, + { + "auxiliary_loss_clip": 0.06425047, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01253998, + "epoch": 0.5736960769577634, + "flos": 15631714915200.0, + "grad_norm": 2.008860171144918, + "language_loss": 0.64482939, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.72173679, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11700439, + "step": 9542, + "time_per_iteration": 2.5055642127990723 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01254691, + "epoch": 0.5737562002104314, + "flos": 18009767715840.0, + "grad_norm": 2.2598183554381146, + "language_loss": 0.83200055, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.90883142, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10638428, + "step": 9543, + "time_per_iteration": 2.4916088581085205 + }, + { + "auxiliary_loss_clip": 0.06426359, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5738163234630994, + "flos": 20703454047360.0, + "grad_norm": 1.617850922862876, + "language_loss": 0.74024302, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.81716919, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.1184082, + "step": 9544, + "time_per_iteration": 2.536583662033081 + }, + { + "auxiliary_loss_clip": 0.06424204, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278355, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5738764467157673, + "flos": 23156082581760.0, + "grad_norm": 3.1974440280178595, + "language_loss": 0.76412272, + "learning_rate": 1.620831188925733e-06, + "loss": 0.84104949, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11676025, + "step": 9545, + "time_per_iteration": 2.5427141189575195 + }, + { + "auxiliary_loss_clip": 0.06423136, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5739365699684353, + "flos": 29499942401280.0, + "grad_norm": 2.3578945444753447, + "language_loss": 0.56573224, + "learning_rate": 1.620448797546459e-06, + "loss": 0.64263856, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11334229, + "step": 9546, + "time_per_iteration": 2.608128309249878 + }, + { + "auxiliary_loss_clip": 0.06422536, + "auxiliary_loss_mlp": 0.01268737, + "balance_loss_clip": 0.0627693, + "balance_loss_mlp": 0.01257746, + "epoch": 0.5739966932211032, + "flos": 14032388833920.0, + "grad_norm": 2.2022917684402996, + "language_loss": 0.76728261, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.84419537, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10980225, + "step": 9547, + "time_per_iteration": 2.5017452239990234 + }, + { + "auxiliary_loss_clip": 0.06421655, + "auxiliary_loss_mlp": 0.01268546, + "balance_loss_clip": 0.06277436, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5740568164737713, + "flos": 19067972129280.0, + "grad_norm": 1.9505887412268983, + "language_loss": 0.7442795, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.82118154, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11444092, + "step": 9548, + "time_per_iteration": 2.549558639526367 + }, + { + "auxiliary_loss_clip": 0.06418206, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01255064, + "epoch": 0.5741169397264392, + "flos": 22134453275520.0, + "grad_norm": 2.3791642109865228, + "language_loss": 0.69704068, + "learning_rate": 1.619301709822355e-06, + "loss": 0.77388746, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11419678, + "step": 9549, + "time_per_iteration": 3.933781147003174 + }, + { + "auxiliary_loss_clip": 0.06420065, + "auxiliary_loss_mlp": 0.01265483, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.01254611, + "epoch": 0.5741770629791072, + "flos": 24943860495360.0, + "grad_norm": 1.461228472430463, + "language_loss": 0.79521686, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.87207234, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10870361, + "step": 9550, + "time_per_iteration": 2.577768087387085 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01254832, + "epoch": 0.5742371862317751, + "flos": 18806495863680.0, + "grad_norm": 2.119345289493334, + "language_loss": 0.68877375, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.76562458, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10845947, + "step": 9551, + "time_per_iteration": 2.480468273162842 + }, + { + "auxiliary_loss_clip": 0.06424205, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06276421, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5742973094844431, + "flos": 24467293998720.0, + "grad_norm": 1.5487820488887025, + "language_loss": 0.72033125, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.79724622, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11505127, + "step": 9552, + "time_per_iteration": 2.5759360790252686 + }, + { + "auxiliary_loss_clip": 0.06417461, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.0125469, + "epoch": 0.574357432737111, + "flos": 21659186517120.0, + "grad_norm": 3.0495771997900163, + "language_loss": 0.79982221, + "learning_rate": 1.617772461696843e-06, + "loss": 0.87665033, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10662842, + "step": 9553, + "time_per_iteration": 2.49290132522583 + }, + { + "auxiliary_loss_clip": 0.06423397, + "auxiliary_loss_mlp": 0.01264041, + "balance_loss_clip": 0.06275378, + "balance_loss_mlp": 0.0125333, + "epoch": 0.5744175559897791, + "flos": 16550285299200.0, + "grad_norm": 2.1324379432349425, + "language_loss": 0.83817756, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.91505194, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1071167, + "step": 9554, + "time_per_iteration": 2.5118370056152344 + }, + { + "auxiliary_loss_clip": 0.06422277, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.0125575, + "epoch": 0.574477679242447, + "flos": 24214580484480.0, + "grad_norm": 1.3861221814355518, + "language_loss": 0.71406233, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.79095531, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11254883, + "step": 9555, + "time_per_iteration": 2.5466480255126953 + }, + { + "auxiliary_loss_clip": 0.06421511, + "auxiliary_loss_mlp": 0.01268077, + "balance_loss_clip": 0.06277835, + "balance_loss_mlp": 0.01256478, + "epoch": 0.574537802495115, + "flos": 14908304689920.0, + "grad_norm": 2.185347344801511, + "language_loss": 0.73004574, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.80694163, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1159668, + "step": 9556, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.0641879, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01253406, + "epoch": 0.5745979257477829, + "flos": 24941680289280.0, + "grad_norm": 1.5306662340422301, + "language_loss": 0.74479866, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.82163835, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11761475, + "step": 9557, + "time_per_iteration": 2.576296329498291 + }, + { + "auxiliary_loss_clip": 0.06420197, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06277291, + "balance_loss_mlp": 0.01252572, + "epoch": 0.5746580490004509, + "flos": 17241061559040.0, + "grad_norm": 1.5775139248237169, + "language_loss": 0.68007201, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.75691128, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11157227, + "step": 9558, + "time_per_iteration": 2.531812906265259 + }, + { + "auxiliary_loss_clip": 0.06424935, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01253779, + "epoch": 0.5747181722531189, + "flos": 13192838449920.0, + "grad_norm": 2.425506842460266, + "language_loss": 0.71628273, + "learning_rate": 1.615479024621659e-06, + "loss": 0.79320455, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13482666, + "step": 9559, + "time_per_iteration": 2.473419189453125 + }, + { + "auxiliary_loss_clip": 0.06419484, + "auxiliary_loss_mlp": 0.01266983, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256921, + "epoch": 0.5747782955057869, + "flos": 22969098195840.0, + "grad_norm": 1.5670628486073652, + "language_loss": 0.79416776, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.87103242, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10064697, + "step": 9560, + "time_per_iteration": 2.532862663269043 + }, + { + "auxiliary_loss_clip": 0.06421925, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06276737, + "balance_loss_mlp": 0.01255581, + "epoch": 0.5748384187584549, + "flos": 23409802344960.0, + "grad_norm": 1.793006683486937, + "language_loss": 0.64777875, + "learning_rate": 1.614714662090588e-06, + "loss": 0.72466803, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11431885, + "step": 9561, + "time_per_iteration": 2.5111758708953857 + }, + { + "auxiliary_loss_clip": 0.06426983, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01256369, + "epoch": 0.5748985420111228, + "flos": 17791323321600.0, + "grad_norm": 1.4966227163397983, + "language_loss": 0.7114228, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.78837311, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11682129, + "step": 9562, + "time_per_iteration": 2.5162081718444824 + }, + { + "auxiliary_loss_clip": 0.06425486, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06280454, + "balance_loss_mlp": 0.01256081, + "epoch": 0.5749586652637908, + "flos": 19872582560640.0, + "grad_norm": 1.4328664867345224, + "language_loss": 0.84269559, + "learning_rate": 1.613950357999751e-06, + "loss": 0.91962022, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10888672, + "step": 9563, + "time_per_iteration": 2.5183188915252686 + }, + { + "auxiliary_loss_clip": 0.06421089, + "auxiliary_loss_mlp": 0.01268857, + "balance_loss_clip": 0.06273992, + "balance_loss_mlp": 0.01256733, + "epoch": 0.5750187885164587, + "flos": 21293477372160.0, + "grad_norm": 2.089685167133714, + "language_loss": 0.57297182, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.64987123, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.12127686, + "step": 9564, + "time_per_iteration": 2.5219571590423584 + }, + { + "auxiliary_loss_clip": 0.06414357, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06276927, + "balance_loss_mlp": 0.0125389, + "epoch": 0.5750789117691267, + "flos": 18810227370240.0, + "grad_norm": 1.5824685354584669, + "language_loss": 0.76484299, + "learning_rate": 1.613186112465078e-06, + "loss": 0.84163225, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10681152, + "step": 9565, + "time_per_iteration": 2.4752280712127686 + }, + { + "auxiliary_loss_clip": 0.06321105, + "auxiliary_loss_mlp": 0.01250694, + "balance_loss_clip": 0.06260607, + "balance_loss_mlp": 0.01249219, + "epoch": 0.5751390350217946, + "flos": 70685624188800.0, + "grad_norm": 0.721103953507815, + "language_loss": 0.6068033, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.68252128, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01473999, + "step": 9566, + "time_per_iteration": 3.222144603729248 + }, + { + "auxiliary_loss_clip": 0.06420306, + "auxiliary_loss_mlp": 0.01268432, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257673, + "epoch": 0.5751991582744627, + "flos": 14251545987840.0, + "grad_norm": 2.0959328312792467, + "language_loss": 0.75654471, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.83343208, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10760498, + "step": 9567, + "time_per_iteration": 2.4892570972442627 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5752592815271306, + "flos": 18333283530240.0, + "grad_norm": 1.4488652909067903, + "language_loss": 0.75253701, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.82938665, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11322021, + "step": 9568, + "time_per_iteration": 2.473475217819214 + }, + { + "auxiliary_loss_clip": 0.06419896, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.01254349, + "epoch": 0.5753194047797986, + "flos": 20928984111360.0, + "grad_norm": 1.5107907301615, + "language_loss": 0.71293747, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.78978956, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10968018, + "step": 9569, + "time_per_iteration": 2.6541481018066406 + }, + { + "auxiliary_loss_clip": 0.06420765, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06275727, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5753795280324665, + "flos": 19287925896960.0, + "grad_norm": 2.027519323892087, + "language_loss": 0.56120193, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.63808417, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11694336, + "step": 9570, + "time_per_iteration": 2.5568745136260986 + }, + { + "auxiliary_loss_clip": 0.0641574, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06274444, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5754396512851345, + "flos": 21659312298240.0, + "grad_norm": 3.8103947749492355, + "language_loss": 0.64502007, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.72182131, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10675049, + "step": 9571, + "time_per_iteration": 3.9861292839050293 + }, + { + "auxiliary_loss_clip": 0.06417111, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5754997745378025, + "flos": 51032674707840.0, + "grad_norm": 1.44401056534108, + "language_loss": 0.67167187, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.74852264, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10821533, + "step": 9572, + "time_per_iteration": 2.775322198867798 + }, + { + "auxiliary_loss_clip": 0.06417632, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01252378, + "epoch": 0.5755598977904705, + "flos": 22863523651200.0, + "grad_norm": 1.9643261986613603, + "language_loss": 0.72534865, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.80216646, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11767578, + "step": 9573, + "time_per_iteration": 2.504248857498169 + }, + { + "auxiliary_loss_clip": 0.06413124, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.0125495, + "epoch": 0.5756200210431385, + "flos": 38482073475840.0, + "grad_norm": 1.6390607800794645, + "language_loss": 0.76527274, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.84205294, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09954834, + "step": 9574, + "time_per_iteration": 2.675445079803467 + }, + { + "auxiliary_loss_clip": 0.06426176, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01252865, + "epoch": 0.5756801442958064, + "flos": 23915984060160.0, + "grad_norm": 3.486560074307127, + "language_loss": 0.67186499, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.74877405, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11877441, + "step": 9575, + "time_per_iteration": 2.5086028575897217 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01253899, + "epoch": 0.5757402675484744, + "flos": 21111566158080.0, + "grad_norm": 1.4184952738773886, + "language_loss": 0.80574554, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.88252765, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1005249, + "step": 9576, + "time_per_iteration": 2.502372980117798 + }, + { + "auxiliary_loss_clip": 0.06413178, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01256439, + "epoch": 0.5758003908011423, + "flos": 20565497099520.0, + "grad_norm": 1.5791511975506907, + "language_loss": 0.69807208, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.77487338, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10510254, + "step": 9577, + "time_per_iteration": 4.000526428222656 + }, + { + "auxiliary_loss_clip": 0.06420817, + "auxiliary_loss_mlp": 0.0126492, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.012544, + "epoch": 0.5758605140538103, + "flos": 16478770239360.0, + "grad_norm": 1.7483336770936004, + "language_loss": 0.66710907, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.74396646, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10510254, + "step": 9578, + "time_per_iteration": 2.495589256286621 + }, + { + "auxiliary_loss_clip": 0.06417773, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276586, + "balance_loss_mlp": 0.01254274, + "epoch": 0.5759206373064782, + "flos": 21293854715520.0, + "grad_norm": 1.4632151435184575, + "language_loss": 0.72808439, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.80490887, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10388184, + "step": 9579, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.06426738, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06278113, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5759807605591463, + "flos": 26075089342080.0, + "grad_norm": 2.9637416190029597, + "language_loss": 0.64800644, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.72493923, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.12072754, + "step": 9580, + "time_per_iteration": 2.532273292541504 + }, + { + "auxiliary_loss_clip": 0.06420532, + "auxiliary_loss_mlp": 0.01266688, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01255554, + "epoch": 0.5760408838118142, + "flos": 18877885142400.0, + "grad_norm": 1.6521602857434026, + "language_loss": 0.85497582, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.93184799, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11132812, + "step": 9581, + "time_per_iteration": 3.9159321784973145 + }, + { + "auxiliary_loss_clip": 0.06425697, + "auxiliary_loss_mlp": 0.01268939, + "balance_loss_clip": 0.06276281, + "balance_loss_mlp": 0.01257483, + "epoch": 0.5761010070644822, + "flos": 15383655302400.0, + "grad_norm": 2.053627577895993, + "language_loss": 0.67847329, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.75541961, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11450195, + "step": 9582, + "time_per_iteration": 2.468289613723755 + }, + { + "auxiliary_loss_clip": 0.06323063, + "auxiliary_loss_mlp": 0.0125238, + "balance_loss_clip": 0.06262786, + "balance_loss_mlp": 0.01250932, + "epoch": 0.5761611303171501, + "flos": 71495475500160.0, + "grad_norm": 0.6295597289579254, + "language_loss": 0.5722791, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.64803356, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.60253906, + "router_z_loss_mlp": 0.0144577, + "step": 9583, + "time_per_iteration": 3.280832052230835 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01253437, + "epoch": 0.5762212535698181, + "flos": 16250556844800.0, + "grad_norm": 1.895482028357212, + "language_loss": 0.82933408, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.90613544, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10693359, + "step": 9584, + "time_per_iteration": 2.473771333694458 + }, + { + "auxiliary_loss_clip": 0.06325932, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06265227, + "balance_loss_mlp": 0.01250696, + "epoch": 0.5762813768224861, + "flos": 70207254829440.0, + "grad_norm": 0.6148723792494001, + "language_loss": 0.49547607, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.57125711, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.0147171, + "step": 9585, + "time_per_iteration": 3.220283031463623 + }, + { + "auxiliary_loss_clip": 0.06417918, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.01256446, + "epoch": 0.5763415000751541, + "flos": 20523639185280.0, + "grad_norm": 1.396891707955096, + "language_loss": 0.84832788, + "learning_rate": 1.605165098835465e-06, + "loss": 0.92518032, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10876465, + "step": 9586, + "time_per_iteration": 2.5044658184051514 + }, + { + "auxiliary_loss_clip": 0.0641425, + "auxiliary_loss_mlp": 0.01268611, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01257584, + "epoch": 0.5764016233278221, + "flos": 15821047215360.0, + "grad_norm": 1.5476594832750246, + "language_loss": 0.80150878, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.87833744, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11035156, + "step": 9587, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06421454, + "auxiliary_loss_mlp": 0.01267229, + "balance_loss_clip": 0.06277972, + "balance_loss_mlp": 0.01256184, + "epoch": 0.57646174658049, + "flos": 20777778218880.0, + "grad_norm": 1.3785070074858572, + "language_loss": 0.6626485, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.73953533, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11047363, + "step": 9588, + "time_per_iteration": 3.990769863128662 + }, + { + "auxiliary_loss_clip": 0.06420319, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01256491, + "epoch": 0.576521869833158, + "flos": 23556647825280.0, + "grad_norm": 1.8252792275452514, + "language_loss": 0.79050291, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.86739457, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1237793, + "step": 9589, + "time_per_iteration": 2.5151610374450684 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06275681, + "balance_loss_mlp": 0.01254652, + "epoch": 0.5765819930858259, + "flos": 20272812387840.0, + "grad_norm": 1.9044444718181142, + "language_loss": 0.79799986, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.87479138, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10229492, + "step": 9590, + "time_per_iteration": 2.502588987350464 + }, + { + "auxiliary_loss_clip": 0.06424554, + "auxiliary_loss_mlp": 0.01266306, + "balance_loss_clip": 0.06279668, + "balance_loss_mlp": 0.01256096, + "epoch": 0.5766421163384939, + "flos": 23155453676160.0, + "grad_norm": 1.9323149052957644, + "language_loss": 0.63195986, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.7088685, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10217285, + "step": 9591, + "time_per_iteration": 2.5217199325561523 + }, + { + "auxiliary_loss_clip": 0.0641837, + "auxiliary_loss_mlp": 0.0126852, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5767022395911618, + "flos": 25856057969280.0, + "grad_norm": 1.7751118346977903, + "language_loss": 0.78161305, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.85848188, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10473633, + "step": 9592, + "time_per_iteration": 2.586398124694824 + }, + { + "auxiliary_loss_clip": 0.06325077, + "auxiliary_loss_mlp": 0.0125376, + "balance_loss_clip": 0.06264462, + "balance_loss_mlp": 0.01252203, + "epoch": 0.5767623628438299, + "flos": 68315579452800.0, + "grad_norm": 0.723864489522512, + "language_loss": 0.59626555, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.67205393, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.60888672, + "router_z_loss_mlp": 0.01555634, + "step": 9593, + "time_per_iteration": 3.245339870452881 + }, + { + "auxiliary_loss_clip": 0.06419121, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272256, + "balance_loss_mlp": 0.01254432, + "epoch": 0.5768224860964978, + "flos": 30195959541120.0, + "grad_norm": 1.4712512924104606, + "language_loss": 0.70970887, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.78656393, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11956787, + "step": 9594, + "time_per_iteration": 2.575716018676758 + }, + { + "auxiliary_loss_clip": 0.06417293, + "auxiliary_loss_mlp": 0.01269346, + "balance_loss_clip": 0.0627408, + "balance_loss_mlp": 0.01259237, + "epoch": 0.5768826093491658, + "flos": 17900880935040.0, + "grad_norm": 1.6705807126416699, + "language_loss": 0.71305418, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.78992057, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10101318, + "step": 9595, + "time_per_iteration": 2.492614269256592 + }, + { + "auxiliary_loss_clip": 0.06416321, + "auxiliary_loss_mlp": 0.01269009, + "balance_loss_clip": 0.06273369, + "balance_loss_mlp": 0.01257481, + "epoch": 0.5769427326018337, + "flos": 17462943970560.0, + "grad_norm": 1.9433978950195214, + "language_loss": 0.69787997, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.77473325, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11529541, + "step": 9596, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.06425576, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06275462, + "balance_loss_mlp": 0.01254558, + "epoch": 0.5770028558545017, + "flos": 39431181473280.0, + "grad_norm": 1.7020557646527, + "language_loss": 0.67913234, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.75606167, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12792969, + "step": 9597, + "time_per_iteration": 2.6754841804504395 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273974, + "balance_loss_mlp": 0.01255322, + "epoch": 0.5770629791071697, + "flos": 21541620839040.0, + "grad_norm": 1.8412029810529236, + "language_loss": 0.82291842, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.89974791, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.105896, + "step": 9598, + "time_per_iteration": 2.510817527770996 + }, + { + "auxiliary_loss_clip": 0.06420396, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01256511, + "epoch": 0.5771231023598377, + "flos": 20893121763840.0, + "grad_norm": 1.43847663479929, + "language_loss": 0.73386133, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.81074691, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11639404, + "step": 9599, + "time_per_iteration": 2.492751121520996 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.01254772, + "epoch": 0.5771832256125057, + "flos": 18083043711360.0, + "grad_norm": 1.7867114623476337, + "language_loss": 0.78284144, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.85961294, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10369873, + "step": 9600, + "time_per_iteration": 2.4890565872192383 + }, + { + "auxiliary_loss_clip": 0.06422748, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.01256893, + "epoch": 0.5772433488651736, + "flos": 26366222753280.0, + "grad_norm": 1.8856132517408855, + "language_loss": 0.72472572, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.80163646, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11450195, + "step": 9601, + "time_per_iteration": 2.536994218826294 + }, + { + "auxiliary_loss_clip": 0.06415705, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01253506, + "epoch": 0.5773034721178416, + "flos": 19686814058880.0, + "grad_norm": 1.49916876372247, + "language_loss": 0.68989396, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7666986, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11254883, + "step": 9602, + "time_per_iteration": 2.4855434894561768 + }, + { + "auxiliary_loss_clip": 0.06409699, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06272521, + "balance_loss_mlp": 0.01257287, + "epoch": 0.5773635953705095, + "flos": 25089951288960.0, + "grad_norm": 1.4178586949074146, + "language_loss": 0.73199558, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.80876672, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10125732, + "step": 9603, + "time_per_iteration": 2.5496528148651123 + }, + { + "auxiliary_loss_clip": 0.06418322, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06276152, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5774237186231775, + "flos": 21039380265600.0, + "grad_norm": 1.5159674911644692, + "language_loss": 0.76686621, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.84372133, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11035156, + "step": 9604, + "time_per_iteration": 2.522033452987671 + }, + { + "auxiliary_loss_clip": 0.06420808, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06277063, + "balance_loss_mlp": 0.01259373, + "epoch": 0.5774838418758454, + "flos": 15237145238400.0, + "grad_norm": 2.0065352138527808, + "language_loss": 0.83384192, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.91076463, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12097168, + "step": 9605, + "time_per_iteration": 2.4643824100494385 + }, + { + "auxiliary_loss_clip": 0.0643101, + "auxiliary_loss_mlp": 0.01267132, + "balance_loss_clip": 0.06278086, + "balance_loss_mlp": 0.01254913, + "epoch": 0.5775439651285135, + "flos": 23588694812160.0, + "grad_norm": 1.6400067603153077, + "language_loss": 0.78330255, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.86028397, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 9606, + "time_per_iteration": 2.5217928886413574 + }, + { + "auxiliary_loss_clip": 0.06417712, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01252687, + "epoch": 0.5776040883811814, + "flos": 18046300896000.0, + "grad_norm": 1.7192315062710783, + "language_loss": 0.73891246, + "learning_rate": 1.597150687927619e-06, + "loss": 0.81573272, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11633301, + "step": 9607, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.01268528, + "balance_loss_clip": 0.06277244, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5776642116338494, + "flos": 18630580216320.0, + "grad_norm": 1.602339688767026, + "language_loss": 0.69749868, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.121521, + "step": 9608, + "time_per_iteration": 2.5238630771636963 + }, + { + "auxiliary_loss_clip": 0.06419271, + "auxiliary_loss_mlp": 0.01267568, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255814, + "epoch": 0.5777243348865173, + "flos": 28410068344320.0, + "grad_norm": 1.9615645043462706, + "language_loss": 0.76945466, + "learning_rate": 1.596387759940665e-06, + "loss": 0.84632301, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11749268, + "step": 9609, + "time_per_iteration": 2.549933671951294 + }, + { + "auxiliary_loss_clip": 0.0642001, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01255084, + "epoch": 0.5777844581391853, + "flos": 24031579167360.0, + "grad_norm": 1.544459178362984, + "language_loss": 0.77057648, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.84744948, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12200928, + "step": 9610, + "time_per_iteration": 2.5409657955169678 + }, + { + "auxiliary_loss_clip": 0.06419136, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01261104, + "epoch": 0.5778445813918534, + "flos": 17781805883520.0, + "grad_norm": 2.0334076468596463, + "language_loss": 0.69377804, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.77070266, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.12231445, + "step": 9611, + "time_per_iteration": 3.8771145343780518 + }, + { + "auxiliary_loss_clip": 0.06415454, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01254795, + "epoch": 0.5779047046445213, + "flos": 22239147352320.0, + "grad_norm": 1.7756554406320284, + "language_loss": 0.84048247, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.91729373, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10876465, + "step": 9612, + "time_per_iteration": 2.4897758960723877 + }, + { + "auxiliary_loss_clip": 0.06417899, + "auxiliary_loss_mlp": 0.01267936, + "balance_loss_clip": 0.06275887, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5779648278971893, + "flos": 21440825976960.0, + "grad_norm": 1.4853190478070708, + "language_loss": 0.80038643, + "learning_rate": 1.594862087742667e-06, + "loss": 0.87724483, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10894775, + "step": 9613, + "time_per_iteration": 2.512202501296997 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01265916, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01254996, + "epoch": 0.5780249511498572, + "flos": 19032151708800.0, + "grad_norm": 1.6718641196950235, + "language_loss": 0.7774657, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.85430139, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10925293, + "step": 9614, + "time_per_iteration": 2.4882118701934814 + }, + { + "auxiliary_loss_clip": 0.06421545, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01254543, + "epoch": 0.5780850744025252, + "flos": 12128596542720.0, + "grad_norm": 2.0494146854902175, + "language_loss": 0.82224047, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.89911503, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1137085, + "step": 9615, + "time_per_iteration": 2.472621440887451 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06274787, + "balance_loss_mlp": 0.01255552, + "epoch": 0.5781451976551931, + "flos": 25051154048640.0, + "grad_norm": 1.4669220513135932, + "language_loss": 0.67472255, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.75161308, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12097168, + "step": 9616, + "time_per_iteration": 2.534846782684326 + }, + { + "auxiliary_loss_clip": 0.06417294, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01257528, + "epoch": 0.5782053209078611, + "flos": 19251770060160.0, + "grad_norm": 1.8155832257801603, + "language_loss": 0.77963018, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.85649514, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11669922, + "step": 9617, + "time_per_iteration": 4.014554977416992 + }, + { + "auxiliary_loss_clip": 0.064207, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06277206, + "balance_loss_mlp": 0.012578, + "epoch": 0.578265444160529, + "flos": 26000849024640.0, + "grad_norm": 1.3678407791087424, + "language_loss": 0.75333905, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.83024538, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.12139893, + "step": 9618, + "time_per_iteration": 2.5390572547912598 + }, + { + "auxiliary_loss_clip": 0.06416163, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06273779, + "balance_loss_mlp": 0.01254355, + "epoch": 0.5783255674131971, + "flos": 21805025748480.0, + "grad_norm": 1.6109172194310035, + "language_loss": 0.81657064, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.89339048, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11468506, + "step": 9619, + "time_per_iteration": 2.505831718444824 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06275585, + "balance_loss_mlp": 0.01253972, + "epoch": 0.578385690665865, + "flos": 24796553817600.0, + "grad_norm": 1.540190718879446, + "language_loss": 0.72668874, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.80354631, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11248779, + "step": 9620, + "time_per_iteration": 3.9673268795013428 + }, + { + "auxiliary_loss_clip": 0.06423381, + "auxiliary_loss_mlp": 0.01270714, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01258942, + "epoch": 0.578445813918533, + "flos": 21218859711360.0, + "grad_norm": 1.6605075192862409, + "language_loss": 0.77349472, + "learning_rate": 1.591811481689916e-06, + "loss": 0.85043567, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 9621, + "time_per_iteration": 2.5077648162841797 + }, + { + "auxiliary_loss_clip": 0.06420489, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01252477, + "epoch": 0.5785059371712009, + "flos": 25053921233280.0, + "grad_norm": 1.4404835359445094, + "language_loss": 0.7094593, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.78630757, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11859131, + "step": 9622, + "time_per_iteration": 2.5468451976776123 + }, + { + "auxiliary_loss_clip": 0.06311069, + "auxiliary_loss_mlp": 0.01252444, + "balance_loss_clip": 0.06251176, + "balance_loss_mlp": 0.01250508, + "epoch": 0.5785660604238689, + "flos": 70865187488640.0, + "grad_norm": 0.7596176351080388, + "language_loss": 0.55852556, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.6341607, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01933289, + "step": 9623, + "time_per_iteration": 3.153353452682495 + }, + { + "auxiliary_loss_clip": 0.06425077, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06277281, + "balance_loss_mlp": 0.01255233, + "epoch": 0.578626183676537, + "flos": 31658083361280.0, + "grad_norm": 2.2034040135587936, + "language_loss": 0.71319884, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.79012132, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 9624, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06420659, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06275962, + "balance_loss_mlp": 0.01258222, + "epoch": 0.5786863069292049, + "flos": 21870545241600.0, + "grad_norm": 1.7015470008848133, + "language_loss": 0.82409322, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.90100557, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12353516, + "step": 9625, + "time_per_iteration": 2.5166807174682617 + }, + { + "auxiliary_loss_clip": 0.06417123, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01253647, + "epoch": 0.5787464301818729, + "flos": 23371214739840.0, + "grad_norm": 1.4015207824111633, + "language_loss": 0.70712119, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.78395265, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12384033, + "step": 9626, + "time_per_iteration": 2.5232555866241455 + }, + { + "auxiliary_loss_clip": 0.06417292, + "auxiliary_loss_mlp": 0.01266097, + "balance_loss_clip": 0.06275232, + "balance_loss_mlp": 0.01255278, + "epoch": 0.5788065534345408, + "flos": 30011155361280.0, + "grad_norm": 1.650883867076693, + "language_loss": 0.71934295, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.79617685, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10827637, + "step": 9627, + "time_per_iteration": 2.5862505435943604 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01268778, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257643, + "epoch": 0.5788666766872088, + "flos": 24533526251520.0, + "grad_norm": 1.6845581870111699, + "language_loss": 0.84154361, + "learning_rate": 1.589143013764458e-06, + "loss": 0.91842461, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9628, + "time_per_iteration": 4.011742830276489 + }, + { + "auxiliary_loss_clip": 0.06420035, + "auxiliary_loss_mlp": 0.01267996, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255443, + "epoch": 0.5789267999398767, + "flos": 23739649142400.0, + "grad_norm": 1.4211285900013286, + "language_loss": 0.72366357, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12548828, + "step": 9629, + "time_per_iteration": 2.535161018371582 + }, + { + "auxiliary_loss_clip": 0.06419079, + "auxiliary_loss_mlp": 0.01266785, + "balance_loss_clip": 0.06275524, + "balance_loss_mlp": 0.01254894, + "epoch": 0.5789869231925447, + "flos": 21140217054720.0, + "grad_norm": 1.8234862135922645, + "language_loss": 0.74396068, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.82081938, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11883545, + "step": 9630, + "time_per_iteration": 2.4906413555145264 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06270717, + "balance_loss_mlp": 0.0125344, + "epoch": 0.5790470464452127, + "flos": 21215086277760.0, + "grad_norm": 1.5521366007555986, + "language_loss": 0.78864127, + "learning_rate": 1.587999618060523e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11395264, + "step": 9631, + "time_per_iteration": 2.500326633453369 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01264538, + "balance_loss_clip": 0.06272215, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5791071696978807, + "flos": 23411144010240.0, + "grad_norm": 1.6622191818478913, + "language_loss": 0.7546376, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.83146071, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1138916, + "step": 9632, + "time_per_iteration": 2.5060648918151855 + }, + { + "auxiliary_loss_clip": 0.06419455, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06274837, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5791672929505486, + "flos": 24213322673280.0, + "grad_norm": 1.7292582736877316, + "language_loss": 0.79532528, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.8721962, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12011719, + "step": 9633, + "time_per_iteration": 2.516359567642212 + }, + { + "auxiliary_loss_clip": 0.0643272, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_clip": 0.06278707, + "balance_loss_mlp": 0.01256635, + "epoch": 0.5792274162032166, + "flos": 24355094981760.0, + "grad_norm": 1.6340208840931036, + "language_loss": 0.7790345, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.85605538, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1272583, + "step": 9634, + "time_per_iteration": 2.541090488433838 + }, + { + "auxiliary_loss_clip": 0.06422533, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01255406, + "epoch": 0.5792875394558845, + "flos": 20455729850880.0, + "grad_norm": 1.975369322400224, + "language_loss": 0.64063549, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.71754158, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12652588, + "step": 9635, + "time_per_iteration": 2.4916157722473145 + }, + { + "auxiliary_loss_clip": 0.06417014, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01253327, + "epoch": 0.5793476627085525, + "flos": 24067064171520.0, + "grad_norm": 1.4766518541506428, + "language_loss": 0.77494228, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.85176682, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12115479, + "step": 9636, + "time_per_iteration": 2.516622304916382 + }, + { + "auxiliary_loss_clip": 0.06411137, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01258226, + "epoch": 0.5794077859612206, + "flos": 22060799936640.0, + "grad_norm": 1.6556351940576073, + "language_loss": 0.68772542, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.76452249, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10333252, + "step": 9637, + "time_per_iteration": 2.509833812713623 + }, + { + "auxiliary_loss_clip": 0.06421766, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.01256784, + "epoch": 0.5794679092138885, + "flos": 11439245802240.0, + "grad_norm": 2.540580609640148, + "language_loss": 0.72712755, + "learning_rate": 1.585332242234043e-06, + "loss": 0.80403578, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12261963, + "step": 9638, + "time_per_iteration": 2.4528071880340576 + }, + { + "auxiliary_loss_clip": 0.06416277, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.0125521, + "epoch": 0.5795280324665565, + "flos": 18886228623360.0, + "grad_norm": 1.607875789180523, + "language_loss": 0.72792935, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.80475545, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11120605, + "step": 9639, + "time_per_iteration": 2.510347604751587 + }, + { + "auxiliary_loss_clip": 0.06418437, + "auxiliary_loss_mlp": 0.01269692, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.01258332, + "epoch": 0.5795881557192244, + "flos": 13010969162880.0, + "grad_norm": 1.751039086833101, + "language_loss": 0.69813907, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.7750203, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11364746, + "step": 9640, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.06430758, + "auxiliary_loss_mlp": 0.01271889, + "balance_loss_clip": 0.0627775, + "balance_loss_mlp": 0.01259509, + "epoch": 0.5796482789718924, + "flos": 19937598929280.0, + "grad_norm": 2.3188274360648298, + "language_loss": 0.78378308, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8608095, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12371826, + "step": 9641, + "time_per_iteration": 2.487333059310913 + }, + { + "auxiliary_loss_clip": 0.06416615, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.01256685, + "epoch": 0.5797084022245603, + "flos": 21656880529920.0, + "grad_norm": 2.422042135441505, + "language_loss": 0.74201375, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.81886506, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1182251, + "step": 9642, + "time_per_iteration": 2.4917688369750977 + }, + { + "auxiliary_loss_clip": 0.06419542, + "auxiliary_loss_mlp": 0.01264152, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01252582, + "epoch": 0.5797685254772283, + "flos": 26038807724160.0, + "grad_norm": 1.4983613319397562, + "language_loss": 0.73538697, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.81222391, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11566162, + "step": 9643, + "time_per_iteration": 2.5357465744018555 + }, + { + "auxiliary_loss_clip": 0.06417159, + "auxiliary_loss_mlp": 0.01264721, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01253175, + "epoch": 0.5798286487298963, + "flos": 22710808385280.0, + "grad_norm": 1.6774180539317567, + "language_loss": 0.67605746, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.75287628, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11547852, + "step": 9644, + "time_per_iteration": 2.485366106033325 + }, + { + "auxiliary_loss_clip": 0.06425455, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06276956, + "balance_loss_mlp": 0.01256078, + "epoch": 0.5798887719825643, + "flos": 23155705238400.0, + "grad_norm": 2.0120452642465865, + "language_loss": 0.85497642, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.93191713, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12542725, + "step": 9645, + "time_per_iteration": 2.505467414855957 + }, + { + "auxiliary_loss_clip": 0.06418729, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06275458, + "balance_loss_mlp": 0.01255774, + "epoch": 0.5799488952352322, + "flos": 24432982951680.0, + "grad_norm": 1.7616171208033915, + "language_loss": 0.75737381, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.83422971, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11102295, + "step": 9646, + "time_per_iteration": 2.527848958969116 + }, + { + "auxiliary_loss_clip": 0.06425247, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256461, + "epoch": 0.5800090184879002, + "flos": 38404478995200.0, + "grad_norm": 1.7871006843554935, + "language_loss": 0.59099573, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6679371, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12426758, + "step": 9647, + "time_per_iteration": 2.643890142440796 + }, + { + "auxiliary_loss_clip": 0.06425125, + "auxiliary_loss_mlp": 0.01271805, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01259276, + "epoch": 0.5800691417405681, + "flos": 19789747200000.0, + "grad_norm": 1.4917917867847632, + "language_loss": 0.84483784, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.92180717, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12524414, + "step": 9648, + "time_per_iteration": 2.48917818069458 + }, + { + "auxiliary_loss_clip": 0.06311809, + "auxiliary_loss_mlp": 0.01252996, + "balance_loss_clip": 0.06251512, + "balance_loss_mlp": 0.01251245, + "epoch": 0.5801292649932361, + "flos": 70333514133120.0, + "grad_norm": 0.8366168453621474, + "language_loss": 0.63013005, + "learning_rate": 1.581142210256242e-06, + "loss": 0.70577806, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01756287, + "step": 9649, + "time_per_iteration": 3.167630434036255 + }, + { + "auxiliary_loss_clip": 0.064106, + "auxiliary_loss_mlp": 0.01264864, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253903, + "epoch": 0.5801893882459042, + "flos": 18740892516480.0, + "grad_norm": 1.6385207780550837, + "language_loss": 0.82320833, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.89996296, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10968018, + "step": 9650, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.06424958, + "auxiliary_loss_mlp": 0.01267787, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.01256194, + "epoch": 0.5802495114985721, + "flos": 15601973915520.0, + "grad_norm": 2.051158244012986, + "language_loss": 0.77640611, + "learning_rate": 1.580380592177698e-06, + "loss": 0.85333359, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11584473, + "step": 9651, + "time_per_iteration": 3.9003303050994873 + }, + { + "auxiliary_loss_clip": 0.06421195, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01258627, + "epoch": 0.5803096347512401, + "flos": 18260552586240.0, + "grad_norm": 1.678926948492491, + "language_loss": 0.74017727, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.81709743, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12207031, + "step": 9652, + "time_per_iteration": 2.5226869583129883 + }, + { + "auxiliary_loss_clip": 0.0642662, + "auxiliary_loss_mlp": 0.01267654, + "balance_loss_clip": 0.06278314, + "balance_loss_mlp": 0.012559, + "epoch": 0.580369758003908, + "flos": 22899763342080.0, + "grad_norm": 1.9284827518212118, + "language_loss": 0.77118474, + "learning_rate": 1.579619037747193e-06, + "loss": 0.84812748, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11749268, + "step": 9653, + "time_per_iteration": 2.5736207962036133 + }, + { + "auxiliary_loss_clip": 0.06425463, + "auxiliary_loss_mlp": 0.01265074, + "balance_loss_clip": 0.06277624, + "balance_loss_mlp": 0.01252789, + "epoch": 0.580429881256576, + "flos": 18703646576640.0, + "grad_norm": 1.9366371532767657, + "language_loss": 0.75627828, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.83318365, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1229248, + "step": 9654, + "time_per_iteration": 2.667048931121826 + }, + { + "auxiliary_loss_clip": 0.06413651, + "auxiliary_loss_mlp": 0.01265944, + "balance_loss_clip": 0.062739, + "balance_loss_mlp": 0.01254959, + "epoch": 0.5804900045092439, + "flos": 24689050629120.0, + "grad_norm": 1.638178903008904, + "language_loss": 0.70858634, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.78538227, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10992432, + "step": 9655, + "time_per_iteration": 2.5496294498443604 + }, + { + "auxiliary_loss_clip": 0.06424456, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273113, + "balance_loss_mlp": 0.0125378, + "epoch": 0.580550127761912, + "flos": 23119549401600.0, + "grad_norm": 2.0310142592924314, + "language_loss": 0.70043373, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.77733833, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12231445, + "step": 9656, + "time_per_iteration": 4.0007078647613525 + }, + { + "auxiliary_loss_clip": 0.06411725, + "auxiliary_loss_mlp": 0.01265789, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01255203, + "epoch": 0.5806102510145799, + "flos": 18481093332480.0, + "grad_norm": 1.6851014534608593, + "language_loss": 0.71761322, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.79438841, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 9657, + "time_per_iteration": 2.52081298828125 + }, + { + "auxiliary_loss_clip": 0.06426618, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06275696, + "balance_loss_mlp": 0.01252843, + "epoch": 0.5806703742672479, + "flos": 23922566605440.0, + "grad_norm": 1.7911249599131025, + "language_loss": 0.70450497, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.78142452, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12506104, + "step": 9658, + "time_per_iteration": 2.509723424911499 + }, + { + "auxiliary_loss_clip": 0.06307676, + "auxiliary_loss_mlp": 0.01252681, + "balance_loss_clip": 0.06247197, + "balance_loss_mlp": 0.01250939, + "epoch": 0.5807304975199158, + "flos": 66332096328960.0, + "grad_norm": 0.6445385314606554, + "language_loss": 0.53559077, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.61119437, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01747131, + "step": 9659, + "time_per_iteration": 3.164217233657837 + }, + { + "auxiliary_loss_clip": 0.0642177, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01254191, + "epoch": 0.5807906207725838, + "flos": 31730478888960.0, + "grad_norm": 1.678223545722946, + "language_loss": 0.62300181, + "learning_rate": 1.576954100136366e-06, + "loss": 0.69988132, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11987305, + "step": 9660, + "time_per_iteration": 4.055291175842285 + }, + { + "auxiliary_loss_clip": 0.06418584, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06270796, + "balance_loss_mlp": 0.01256443, + "epoch": 0.5808507440252517, + "flos": 23807223060480.0, + "grad_norm": 1.5142376676823694, + "language_loss": 0.65793735, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.73480284, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11541748, + "step": 9661, + "time_per_iteration": 2.50545334815979 + }, + { + "auxiliary_loss_clip": 0.06409734, + "auxiliary_loss_mlp": 0.01265632, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01255565, + "epoch": 0.5809108672779197, + "flos": 13703464431360.0, + "grad_norm": 1.88238902360882, + "language_loss": 0.74297959, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.81973332, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10070801, + "step": 9662, + "time_per_iteration": 2.4924473762512207 + }, + { + "auxiliary_loss_clip": 0.06306686, + "auxiliary_loss_mlp": 0.01251122, + "balance_loss_clip": 0.06246165, + "balance_loss_mlp": 0.0124951, + "epoch": 0.5809709905305876, + "flos": 69157687386240.0, + "grad_norm": 0.8243605057954629, + "language_loss": 0.58189029, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.65746832, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0161438, + "step": 9663, + "time_per_iteration": 3.215336799621582 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265807, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01254392, + "epoch": 0.5810311137832557, + "flos": 19833491831040.0, + "grad_norm": 2.48301510503896, + "language_loss": 0.82404405, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.90084743, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11413574, + "step": 9664, + "time_per_iteration": 2.663583278656006 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01263414, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01252423, + "epoch": 0.5810912370359237, + "flos": 29245635659520.0, + "grad_norm": 1.676690255308112, + "language_loss": 0.81861937, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.89544368, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10992432, + "step": 9665, + "time_per_iteration": 2.5936458110809326 + }, + { + "auxiliary_loss_clip": 0.06425443, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.0627546, + "balance_loss_mlp": 0.01257098, + "epoch": 0.5811513602885916, + "flos": 22792469788800.0, + "grad_norm": 1.7928396623098657, + "language_loss": 0.80963171, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.88657987, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12268066, + "step": 9666, + "time_per_iteration": 2.556262969970703 + }, + { + "auxiliary_loss_clip": 0.06412445, + "auxiliary_loss_mlp": 0.01266794, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5812114835412596, + "flos": 18740347464960.0, + "grad_norm": 1.6774912146747003, + "language_loss": 0.79895651, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.87574893, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.1072998, + "step": 9667, + "time_per_iteration": 3.980412483215332 + }, + { + "auxiliary_loss_clip": 0.06427534, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01252791, + "epoch": 0.5812716067939275, + "flos": 26438324791680.0, + "grad_norm": 1.482922365624984, + "language_loss": 0.79118401, + "learning_rate": 1.573909419957653e-06, + "loss": 0.86811268, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12536621, + "step": 9668, + "time_per_iteration": 2.565986156463623 + }, + { + "auxiliary_loss_clip": 0.06418585, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.0125872, + "epoch": 0.5813317300465956, + "flos": 43407847595520.0, + "grad_norm": 1.832859625901051, + "language_loss": 0.64703673, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.72392619, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11657715, + "step": 9669, + "time_per_iteration": 2.804957151412964 + }, + { + "auxiliary_loss_clip": 0.06415828, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.0627243, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5813918532992635, + "flos": 24791564499840.0, + "grad_norm": 1.4489654033865982, + "language_loss": 0.73791713, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.81473929, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11425781, + "step": 9670, + "time_per_iteration": 2.54849910736084 + }, + { + "auxiliary_loss_clip": 0.0641885, + "auxiliary_loss_mlp": 0.01269355, + "balance_loss_clip": 0.06272031, + "balance_loss_mlp": 0.0125822, + "epoch": 0.5814519765519315, + "flos": 22864068702720.0, + "grad_norm": 1.8471376195746119, + "language_loss": 0.79354227, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.87042427, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11138916, + "step": 9671, + "time_per_iteration": 2.553971529006958 + }, + { + "auxiliary_loss_clip": 0.06426669, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06274676, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5815120998045994, + "flos": 24067651150080.0, + "grad_norm": 2.0867956489424495, + "language_loss": 0.61609662, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6930325, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12219238, + "step": 9672, + "time_per_iteration": 2.5135464668273926 + }, + { + "auxiliary_loss_clip": 0.06413487, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06271096, + "balance_loss_mlp": 0.01256735, + "epoch": 0.5815722230572674, + "flos": 24286305179520.0, + "grad_norm": 2.966012751852424, + "language_loss": 0.81724179, + "learning_rate": 1.572007019492342e-06, + "loss": 0.89405441, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1104126, + "step": 9673, + "time_per_iteration": 2.531637668609619 + }, + { + "auxiliary_loss_clip": 0.06422119, + "auxiliary_loss_mlp": 0.01271004, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01258976, + "epoch": 0.5816323463099353, + "flos": 22206932657280.0, + "grad_norm": 1.7930668974507213, + "language_loss": 0.88784432, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.9647755, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12030029, + "step": 9674, + "time_per_iteration": 2.490135908126831 + }, + { + "auxiliary_loss_clip": 0.06420779, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5816924695626033, + "flos": 24141388343040.0, + "grad_norm": 1.4439307600636533, + "language_loss": 0.78848791, + "learning_rate": 1.571246172811984e-06, + "loss": 0.86534023, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10656738, + "step": 9675, + "time_per_iteration": 2.570401191711426 + }, + { + "auxiliary_loss_clip": 0.06415851, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01252901, + "epoch": 0.5817525928152713, + "flos": 21330555603840.0, + "grad_norm": 2.1244098418378234, + "language_loss": 0.70489943, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.78169978, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11279297, + "step": 9676, + "time_per_iteration": 2.5234405994415283 + }, + { + "auxiliary_loss_clip": 0.06419084, + "auxiliary_loss_mlp": 0.01273498, + "balance_loss_clip": 0.06272397, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5818127160679393, + "flos": 26940355729920.0, + "grad_norm": 2.3696751764318478, + "language_loss": 0.63762164, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.71454746, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9677, + "time_per_iteration": 2.5408287048339844 + }, + { + "auxiliary_loss_clip": 0.06307964, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06247746, + "balance_loss_mlp": 0.01262844, + "epoch": 0.5818728393206073, + "flos": 63940779855360.0, + "grad_norm": 0.7897947317556949, + "language_loss": 0.54107881, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.61680651, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01963806, + "step": 9678, + "time_per_iteration": 3.1962106227874756 + }, + { + "auxiliary_loss_clip": 0.0631143, + "auxiliary_loss_mlp": 0.0126129, + "balance_loss_clip": 0.06251128, + "balance_loss_mlp": 0.01259724, + "epoch": 0.5819329625732752, + "flos": 64972654087680.0, + "grad_norm": 0.717265543619072, + "language_loss": 0.56126428, + "learning_rate": 1.569724674667319e-06, + "loss": 0.6369915, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01565552, + "step": 9679, + "time_per_iteration": 3.0475993156433105 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01260386, + "epoch": 0.5819930858259432, + "flos": 21221668823040.0, + "grad_norm": 1.5334769221386826, + "language_loss": 0.65937847, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.73629761, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11102295, + "step": 9680, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.06418791, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01255909, + "epoch": 0.5820532090786111, + "flos": 19463715763200.0, + "grad_norm": 1.789175734331282, + "language_loss": 0.84067512, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.91752815, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10601807, + "step": 9681, + "time_per_iteration": 2.4850056171417236 + }, + { + "auxiliary_loss_clip": 0.06416699, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272473, + "balance_loss_mlp": 0.01255908, + "epoch": 0.5821133323312792, + "flos": 17718424669440.0, + "grad_norm": 2.261651210831951, + "language_loss": 0.76110494, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.83794284, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11181641, + "step": 9682, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01270705, + "balance_loss_clip": 0.06278756, + "balance_loss_mlp": 0.01258951, + "epoch": 0.5821734555839471, + "flos": 24578738328960.0, + "grad_norm": 2.1342093378293785, + "language_loss": 0.75805819, + "learning_rate": 1.568203437579977e-06, + "loss": 0.83506703, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11749268, + "step": 9683, + "time_per_iteration": 2.5426952838897705 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06278548, + "balance_loss_mlp": 0.0126283, + "epoch": 0.5822335788366151, + "flos": 22388760017280.0, + "grad_norm": 1.6377653311732083, + "language_loss": 0.74168241, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.81873143, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12646484, + "step": 9684, + "time_per_iteration": 2.521773338317871 + }, + { + "auxiliary_loss_clip": 0.06424329, + "auxiliary_loss_mlp": 0.01273987, + "balance_loss_clip": 0.06276318, + "balance_loss_mlp": 0.01262114, + "epoch": 0.582293702089283, + "flos": 26729458202880.0, + "grad_norm": 2.7880175036552446, + "language_loss": 0.78406078, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.86104393, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11871338, + "step": 9685, + "time_per_iteration": 2.53759503364563 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01274993, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0126337, + "epoch": 0.582353825341951, + "flos": 17354560314240.0, + "grad_norm": 1.6209571199936617, + "language_loss": 0.75622851, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.83321142, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11627197, + "step": 9686, + "time_per_iteration": 2.5203354358673096 + }, + { + "auxiliary_loss_clip": 0.06317171, + "auxiliary_loss_mlp": 0.01254478, + "balance_loss_clip": 0.06256813, + "balance_loss_mlp": 0.012529, + "epoch": 0.5824139485946189, + "flos": 55491133478400.0, + "grad_norm": 0.7976004724910164, + "language_loss": 0.57134593, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.64706242, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01578522, + "step": 9687, + "time_per_iteration": 2.9669835567474365 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01255221, + "epoch": 0.582474071847287, + "flos": 20309261713920.0, + "grad_norm": 1.877177452165203, + "language_loss": 0.70002449, + "learning_rate": 1.566302259738727e-06, + "loss": 0.77692491, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.12109375, + "step": 9688, + "time_per_iteration": 2.506741762161255 + }, + { + "auxiliary_loss_clip": 0.06417575, + "auxiliary_loss_mlp": 0.01265264, + "balance_loss_clip": 0.0627282, + "balance_loss_mlp": 0.01254673, + "epoch": 0.5825341950999549, + "flos": 23884733687040.0, + "grad_norm": 2.896352551150335, + "language_loss": 0.65452719, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.73135561, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10595703, + "step": 9689, + "time_per_iteration": 2.506406784057617 + }, + { + "auxiliary_loss_clip": 0.06415856, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06272023, + "balance_loss_mlp": 0.0126126, + "epoch": 0.5825943183526229, + "flos": 23119842890880.0, + "grad_norm": 1.995545981005341, + "language_loss": 0.73637474, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.81326556, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11956787, + "step": 9690, + "time_per_iteration": 3.9373486042022705 + }, + { + "auxiliary_loss_clip": 0.0642629, + "auxiliary_loss_mlp": 0.01267094, + "balance_loss_clip": 0.06275761, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5826544416052909, + "flos": 22864152556800.0, + "grad_norm": 1.6091940048024238, + "language_loss": 0.76358879, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.84052265, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12207031, + "step": 9691, + "time_per_iteration": 2.5036911964416504 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01270283, + "balance_loss_clip": 0.06274154, + "balance_loss_mlp": 0.0125906, + "epoch": 0.5827145648579588, + "flos": 31509560799360.0, + "grad_norm": 1.692225094183595, + "language_loss": 0.80700606, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.88393039, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11218262, + "step": 9692, + "time_per_iteration": 2.588819980621338 + }, + { + "auxiliary_loss_clip": 0.06307849, + "auxiliary_loss_mlp": 0.01251158, + "balance_loss_clip": 0.06247954, + "balance_loss_mlp": 0.01249412, + "epoch": 0.5827746881106268, + "flos": 69832028246400.0, + "grad_norm": 0.7844854120913538, + "language_loss": 0.5681411, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.64373118, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01751709, + "step": 9693, + "time_per_iteration": 3.1347033977508545 + }, + { + "auxiliary_loss_clip": 0.0641888, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06273088, + "balance_loss_mlp": 0.0125815, + "epoch": 0.5828348113632947, + "flos": 23119088204160.0, + "grad_norm": 1.522522739802819, + "language_loss": 0.78923696, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.86611056, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10327148, + "step": 9694, + "time_per_iteration": 2.5068466663360596 + }, + { + "auxiliary_loss_clip": 0.06411383, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06271289, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5828949346159628, + "flos": 21879769190400.0, + "grad_norm": 1.3653324202123376, + "language_loss": 0.76330042, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.84004748, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10021973, + "step": 9695, + "time_per_iteration": 2.556309700012207 + }, + { + "auxiliary_loss_clip": 0.06315481, + "auxiliary_loss_mlp": 0.01251352, + "balance_loss_clip": 0.06255624, + "balance_loss_mlp": 0.01249797, + "epoch": 0.5829550578686307, + "flos": 65985170497920.0, + "grad_norm": 0.7496740614083074, + "language_loss": 0.54866987, + "learning_rate": 1.563261231127095e-06, + "loss": 0.62433827, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01553345, + "step": 9696, + "time_per_iteration": 4.669760704040527 + }, + { + "auxiliary_loss_clip": 0.06418857, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01252893, + "epoch": 0.5830151811212987, + "flos": 16295391578880.0, + "grad_norm": 1.8785254946392194, + "language_loss": 0.76464188, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.84147352, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11413574, + "step": 9697, + "time_per_iteration": 2.5041255950927734 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272788, + "balance_loss_mlp": 0.01256668, + "epoch": 0.5830753043739666, + "flos": 24175447827840.0, + "grad_norm": 1.6024364882265518, + "language_loss": 0.77965522, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.85656625, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12188721, + "step": 9698, + "time_per_iteration": 2.5902624130249023 + }, + { + "auxiliary_loss_clip": 0.06415899, + "auxiliary_loss_mlp": 0.01273709, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01260894, + "epoch": 0.5831354276266346, + "flos": 27067438846080.0, + "grad_norm": 1.5547381527883266, + "language_loss": 0.84016132, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.91705739, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12817383, + "step": 9699, + "time_per_iteration": 2.6469032764434814 + }, + { + "auxiliary_loss_clip": 0.0642215, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06274705, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5831955508793025, + "flos": 23630301164160.0, + "grad_norm": 1.933998465104238, + "language_loss": 0.65971506, + "learning_rate": 1.561741113828305e-06, + "loss": 0.73659378, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1161499, + "step": 9700, + "time_per_iteration": 3.9589943885803223 + }, + { + "auxiliary_loss_clip": 0.06417754, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.0627218, + "balance_loss_mlp": 0.01256086, + "epoch": 0.5832556741319705, + "flos": 24980267894400.0, + "grad_norm": 1.7460823027462598, + "language_loss": 0.71739107, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.79424536, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1159668, + "step": 9701, + "time_per_iteration": 2.591634511947632 + }, + { + "auxiliary_loss_clip": 0.06415233, + "auxiliary_loss_mlp": 0.01264901, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01253278, + "epoch": 0.5833157973846385, + "flos": 23228226547200.0, + "grad_norm": 1.7061750612547373, + "language_loss": 0.85686189, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.93366319, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11627197, + "step": 9702, + "time_per_iteration": 2.552055835723877 + }, + { + "auxiliary_loss_clip": 0.0641585, + "auxiliary_loss_mlp": 0.01263882, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01253511, + "epoch": 0.5833759206373065, + "flos": 21983876288640.0, + "grad_norm": 1.4269240656932136, + "language_loss": 0.78200948, + "learning_rate": 1.560601200301392e-06, + "loss": 0.85880685, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10369873, + "step": 9703, + "time_per_iteration": 2.500241279602051 + }, + { + "auxiliary_loss_clip": 0.06420664, + "auxiliary_loss_mlp": 0.01264639, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01252831, + "epoch": 0.5834360438899745, + "flos": 21768869911680.0, + "grad_norm": 1.5504614474031426, + "language_loss": 0.71309936, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.78995246, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11816406, + "step": 9704, + "time_per_iteration": 2.5374741554260254 + }, + { + "auxiliary_loss_clip": 0.06421441, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01256919, + "epoch": 0.5834961671426424, + "flos": 15997214424960.0, + "grad_norm": 1.6199693671180324, + "language_loss": 0.81965989, + "learning_rate": 1.559841341236335e-06, + "loss": 0.89654684, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10339355, + "step": 9705, + "time_per_iteration": 2.5450189113616943 + }, + { + "auxiliary_loss_clip": 0.06418713, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.01253379, + "epoch": 0.5835562903953104, + "flos": 22824600629760.0, + "grad_norm": 1.6206416307327924, + "language_loss": 0.80445373, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.88128448, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10986328, + "step": 9706, + "time_per_iteration": 2.5352673530578613 + }, + { + "auxiliary_loss_clip": 0.06415439, + "auxiliary_loss_mlp": 0.01273281, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261003, + "epoch": 0.5836164136479783, + "flos": 48478664332800.0, + "grad_norm": 1.6746295019388222, + "language_loss": 0.74755418, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.82444143, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1227417, + "step": 9707, + "time_per_iteration": 4.184760808944702 + }, + { + "auxiliary_loss_clip": 0.06414578, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01253463, + "epoch": 0.5836765369006464, + "flos": 26913172279680.0, + "grad_norm": 1.726633366654796, + "language_loss": 0.81783116, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.89461732, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10571289, + "step": 9708, + "time_per_iteration": 2.5494630336761475 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01267312, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01256404, + "epoch": 0.5837366601533143, + "flos": 20090230341120.0, + "grad_norm": 1.3928808196753693, + "language_loss": 0.78363276, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.86046088, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10906982, + "step": 9709, + "time_per_iteration": 2.54146409034729 + }, + { + "auxiliary_loss_clip": 0.06313366, + "auxiliary_loss_mlp": 0.01252195, + "balance_loss_clip": 0.06253533, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5837967834059823, + "flos": 65383910726400.0, + "grad_norm": 0.7481338178050596, + "language_loss": 0.5665468, + "learning_rate": 1.557941985915844e-06, + "loss": 0.64220238, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.0151062, + "step": 9710, + "time_per_iteration": 3.130523443222046 + }, + { + "auxiliary_loss_clip": 0.06414168, + "auxiliary_loss_mlp": 0.01266687, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01256495, + "epoch": 0.5838569066586502, + "flos": 25345809331200.0, + "grad_norm": 1.5024705126599753, + "language_loss": 0.65656877, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.73337734, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10198975, + "step": 9711, + "time_per_iteration": 2.558560609817505 + }, + { + "auxiliary_loss_clip": 0.06425221, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5839170299113182, + "flos": 22234535377920.0, + "grad_norm": 1.9299970772651502, + "language_loss": 0.79264128, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.86955917, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12640381, + "step": 9712, + "time_per_iteration": 2.571164131164551 + }, + { + "auxiliary_loss_clip": 0.06417041, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5839771531639861, + "flos": 22206513386880.0, + "grad_norm": 1.5054581881557743, + "language_loss": 0.73669749, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.81351602, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10784912, + "step": 9713, + "time_per_iteration": 2.5475780963897705 + }, + { + "auxiliary_loss_clip": 0.06424147, + "auxiliary_loss_mlp": 0.01265979, + "balance_loss_clip": 0.06274505, + "balance_loss_mlp": 0.01252932, + "epoch": 0.5840372764166541, + "flos": 22425964030080.0, + "grad_norm": 1.9255335004661567, + "language_loss": 0.70002109, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.77692235, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13049316, + "step": 9714, + "time_per_iteration": 2.523638963699341 + }, + { + "auxiliary_loss_clip": 0.06419174, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.012543, + "epoch": 0.5840973996693221, + "flos": 19834330371840.0, + "grad_norm": 1.8598920078622099, + "language_loss": 0.80627859, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.88313133, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11798096, + "step": 9715, + "time_per_iteration": 2.5382297039031982 + }, + { + "auxiliary_loss_clip": 0.06417744, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.0627513, + "balance_loss_mlp": 0.01254194, + "epoch": 0.5841575229219901, + "flos": 21149482930560.0, + "grad_norm": 1.9876848107590372, + "language_loss": 0.73826301, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.81509537, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11303711, + "step": 9716, + "time_per_iteration": 2.5080726146698 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01254202, + "epoch": 0.5842176461746581, + "flos": 24646521882240.0, + "grad_norm": 2.3723983049620876, + "language_loss": 0.75045407, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.82723433, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10467529, + "step": 9717, + "time_per_iteration": 2.5569300651550293 + }, + { + "auxiliary_loss_clip": 0.06420394, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06276444, + "balance_loss_mlp": 0.01254759, + "epoch": 0.584277769427326, + "flos": 19136468442240.0, + "grad_norm": 2.2457444336667343, + "language_loss": 0.80242944, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.87929225, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11132812, + "step": 9718, + "time_per_iteration": 2.5623273849487305 + }, + { + "auxiliary_loss_clip": 0.06421262, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.0627823, + "balance_loss_mlp": 0.01253117, + "epoch": 0.584337892679994, + "flos": 22681822072320.0, + "grad_norm": 1.5991831303569484, + "language_loss": 0.67348599, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.75034833, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11853027, + "step": 9719, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0641831, + "auxiliary_loss_mlp": 0.01263454, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01252022, + "epoch": 0.5843980159326619, + "flos": 31291954945920.0, + "grad_norm": 1.728104183061379, + "language_loss": 0.75697351, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.83379114, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11425781, + "step": 9720, + "time_per_iteration": 2.6132402420043945 + }, + { + "auxiliary_loss_clip": 0.06421956, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.06276225, + "balance_loss_mlp": 0.01255799, + "epoch": 0.58445813918533, + "flos": 22754846505600.0, + "grad_norm": 1.447216358863969, + "language_loss": 0.83020425, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.90709275, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11090088, + "step": 9721, + "time_per_iteration": 2.5127675533294678 + }, + { + "auxiliary_loss_clip": 0.06310159, + "auxiliary_loss_mlp": 0.01253726, + "balance_loss_clip": 0.06250554, + "balance_loss_mlp": 0.01252051, + "epoch": 0.5845182624379979, + "flos": 60704602992000.0, + "grad_norm": 0.9150346622366115, + "language_loss": 0.71186364, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.78750253, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01678467, + "step": 9722, + "time_per_iteration": 3.1494555473327637 + }, + { + "auxiliary_loss_clip": 0.06417061, + "auxiliary_loss_mlp": 0.01268389, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.01257255, + "epoch": 0.5845783856906659, + "flos": 16367996741760.0, + "grad_norm": 1.9087918582550145, + "language_loss": 0.8944329, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.97128743, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11132812, + "step": 9723, + "time_per_iteration": 2.4576761722564697 + }, + { + "auxiliary_loss_clip": 0.06417491, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01256922, + "epoch": 0.5846385089433338, + "flos": 20089475654400.0, + "grad_norm": 1.3439404505357262, + "language_loss": 0.68925285, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.76610565, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10870361, + "step": 9724, + "time_per_iteration": 2.5088019371032715 + }, + { + "auxiliary_loss_clip": 0.06417604, + "auxiliary_loss_mlp": 0.01265081, + "balance_loss_clip": 0.06271344, + "balance_loss_mlp": 0.01252922, + "epoch": 0.5846986321960018, + "flos": 17316769322880.0, + "grad_norm": 2.3711774156816188, + "language_loss": 0.86716926, + "learning_rate": 1.552246441587197e-06, + "loss": 0.94399607, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.121521, + "step": 9725, + "time_per_iteration": 2.4511706829071045 + }, + { + "auxiliary_loss_clip": 0.06423703, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06276515, + "balance_loss_mlp": 0.01258995, + "epoch": 0.5847587554486697, + "flos": 17202977078400.0, + "grad_norm": 1.45457124956925, + "language_loss": 0.8335436, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.91048884, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1182251, + "step": 9726, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.06418396, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01252957, + "epoch": 0.5848188787013378, + "flos": 24534993697920.0, + "grad_norm": 1.7434091697787477, + "language_loss": 0.67301726, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10675049, + "step": 9727, + "time_per_iteration": 2.5283849239349365 + }, + { + "auxiliary_loss_clip": 0.06419774, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06275018, + "balance_loss_mlp": 0.0126054, + "epoch": 0.5848790019540057, + "flos": 20634161120640.0, + "grad_norm": 1.6131340234861964, + "language_loss": 0.82272881, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.89965248, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12060547, + "step": 9728, + "time_per_iteration": 2.5226187705993652 + }, + { + "auxiliary_loss_clip": 0.06412318, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01260198, + "epoch": 0.5849391252066737, + "flos": 22425377051520.0, + "grad_norm": 1.6963428440366448, + "language_loss": 0.78290164, + "learning_rate": 1.550728272957027e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10241699, + "step": 9729, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06418414, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272924, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5849992484593417, + "flos": 25417995223680.0, + "grad_norm": 1.7817091958189777, + "language_loss": 0.71144295, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.78828371, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11419678, + "step": 9730, + "time_per_iteration": 2.5403687953948975 + }, + { + "auxiliary_loss_clip": 0.06422406, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01256188, + "epoch": 0.5850593717120096, + "flos": 21070840273920.0, + "grad_norm": 1.6620919701985222, + "language_loss": 0.78394347, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.86084819, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11883545, + "step": 9731, + "time_per_iteration": 2.5166611671447754 + }, + { + "auxiliary_loss_clip": 0.06415913, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.0627268, + "balance_loss_mlp": 0.01256088, + "epoch": 0.5851194949646776, + "flos": 25308605318400.0, + "grad_norm": 2.100344301849282, + "language_loss": 0.70174819, + "learning_rate": 1.549589825316528e-06, + "loss": 0.77858174, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11352539, + "step": 9732, + "time_per_iteration": 2.538188934326172 + }, + { + "auxiliary_loss_clip": 0.06423078, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01256707, + "epoch": 0.5851796182173455, + "flos": 23594103400320.0, + "grad_norm": 2.4062469566098685, + "language_loss": 0.53286588, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.60979199, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12823486, + "step": 9733, + "time_per_iteration": 2.511302947998047 + }, + { + "auxiliary_loss_clip": 0.06417008, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01256657, + "epoch": 0.5852397414700136, + "flos": 24828936220800.0, + "grad_norm": 2.0225140710518184, + "language_loss": 0.87949061, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.95634717, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12005615, + "step": 9734, + "time_per_iteration": 2.538619041442871 + }, + { + "auxiliary_loss_clip": 0.06415038, + "auxiliary_loss_mlp": 0.01266318, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01255667, + "epoch": 0.5852998647226815, + "flos": 19943887985280.0, + "grad_norm": 1.4699537388912873, + "language_loss": 0.72430563, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.80111921, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10656738, + "step": 9735, + "time_per_iteration": 3.9566004276275635 + }, + { + "auxiliary_loss_clip": 0.06418768, + "auxiliary_loss_mlp": 0.01267652, + "balance_loss_clip": 0.06273651, + "balance_loss_mlp": 0.0125563, + "epoch": 0.5853599879753495, + "flos": 16724817354240.0, + "grad_norm": 2.1987965595401135, + "language_loss": 0.7462939, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.82315814, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12017822, + "step": 9736, + "time_per_iteration": 2.4270691871643066 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06274146, + "balance_loss_mlp": 0.0125241, + "epoch": 0.5854201112280174, + "flos": 44466848622720.0, + "grad_norm": 1.4975519288318198, + "language_loss": 0.7076987, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.78450084, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10791016, + "step": 9737, + "time_per_iteration": 2.744206190109253 + }, + { + "auxiliary_loss_clip": 0.06416388, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06274648, + "balance_loss_mlp": 0.01259556, + "epoch": 0.5854802344806854, + "flos": 20345375623680.0, + "grad_norm": 1.6871127807078519, + "language_loss": 0.82840961, + "learning_rate": 1.547313391573169e-06, + "loss": 0.90528059, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11151123, + "step": 9738, + "time_per_iteration": 2.4849019050598145 + }, + { + "auxiliary_loss_clip": 0.06422549, + "auxiliary_loss_mlp": 0.01269287, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01257431, + "epoch": 0.5855403577333533, + "flos": 20927013540480.0, + "grad_norm": 1.6194676695443784, + "language_loss": 0.69157064, + "learning_rate": 1.546934045946082e-06, + "loss": 0.768489, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11846924, + "step": 9739, + "time_per_iteration": 3.941681146621704 + }, + { + "auxiliary_loss_clip": 0.0641816, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272583, + "balance_loss_mlp": 0.01255796, + "epoch": 0.5856004809860214, + "flos": 20454849383040.0, + "grad_norm": 2.1509507460713038, + "language_loss": 0.59265625, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.66951436, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11859131, + "step": 9740, + "time_per_iteration": 2.5459988117218018 + }, + { + "auxiliary_loss_clip": 0.06417701, + "auxiliary_loss_mlp": 0.01265897, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.0125487, + "epoch": 0.5856606042386893, + "flos": 19645962393600.0, + "grad_norm": 1.6784070122461718, + "language_loss": 0.75433791, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.83117396, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11029053, + "step": 9741, + "time_per_iteration": 2.488905668258667 + }, + { + "auxiliary_loss_clip": 0.06418155, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06272431, + "balance_loss_mlp": 0.01251857, + "epoch": 0.5857207274913573, + "flos": 21692072044800.0, + "grad_norm": 1.4885669249171192, + "language_loss": 0.76157856, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.83839613, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11743164, + "step": 9742, + "time_per_iteration": 2.5480451583862305 + }, + { + "auxiliary_loss_clip": 0.06415333, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.01254737, + "epoch": 0.5857808507440253, + "flos": 23188968109440.0, + "grad_norm": 1.7165353954706328, + "language_loss": 0.75240624, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.82922137, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11444092, + "step": 9743, + "time_per_iteration": 2.503702163696289 + }, + { + "auxiliary_loss_clip": 0.0641541, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01254944, + "epoch": 0.5858409739966932, + "flos": 27242683660800.0, + "grad_norm": 1.53753206771929, + "language_loss": 0.81320727, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.8900184, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10760498, + "step": 9744, + "time_per_iteration": 2.5923476219177246 + }, + { + "auxiliary_loss_clip": 0.06429034, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256847, + "epoch": 0.5859010972493612, + "flos": 27862993036800.0, + "grad_norm": 1.7800190043611435, + "language_loss": 0.71494257, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.79192197, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12072754, + "step": 9745, + "time_per_iteration": 2.5417301654815674 + }, + { + "auxiliary_loss_clip": 0.06318981, + "auxiliary_loss_mlp": 0.01251832, + "balance_loss_clip": 0.06258826, + "balance_loss_mlp": 0.01250336, + "epoch": 0.5859612205020291, + "flos": 70029452465280.0, + "grad_norm": 0.7182748841957548, + "language_loss": 0.53236032, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.60806841, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01495361, + "step": 9746, + "time_per_iteration": 4.6102893352508545 + }, + { + "auxiliary_loss_clip": 0.06421819, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01253907, + "epoch": 0.5860213437546972, + "flos": 24062032926720.0, + "grad_norm": 1.805241505686608, + "language_loss": 0.7322374, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.80910903, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11437988, + "step": 9747, + "time_per_iteration": 2.5299086570739746 + }, + { + "auxiliary_loss_clip": 0.06420729, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275554, + "balance_loss_mlp": 0.01253806, + "epoch": 0.5860814670073651, + "flos": 18952670511360.0, + "grad_norm": 1.7528078306488855, + "language_loss": 0.81229597, + "learning_rate": 1.543520710142051e-06, + "loss": 0.88915294, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.1116333, + "step": 9748, + "time_per_iteration": 2.5070362091064453 + }, + { + "auxiliary_loss_clip": 0.06422453, + "auxiliary_loss_mlp": 0.01268094, + "balance_loss_clip": 0.06275974, + "balance_loss_mlp": 0.01256674, + "epoch": 0.5861415902600331, + "flos": 22567904046720.0, + "grad_norm": 2.1315206911445217, + "language_loss": 0.72122687, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.7981323, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11419678, + "step": 9749, + "time_per_iteration": 2.5568935871124268 + }, + { + "auxiliary_loss_clip": 0.06413895, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01254765, + "epoch": 0.586201713512701, + "flos": 14397217511040.0, + "grad_norm": 2.3126679183899608, + "language_loss": 0.75373948, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.8305366, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11053467, + "step": 9750, + "time_per_iteration": 2.456709623336792 + }, + { + "auxiliary_loss_clip": 0.06418054, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06274709, + "balance_loss_mlp": 0.01256091, + "epoch": 0.586261836765369, + "flos": 19504357793280.0, + "grad_norm": 1.5048801591853769, + "language_loss": 0.70914859, + "learning_rate": 1.542383242598344e-06, + "loss": 0.78600496, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11505127, + "step": 9751, + "time_per_iteration": 2.516965389251709 + }, + { + "auxiliary_loss_clip": 0.06427741, + "auxiliary_loss_mlp": 0.01267026, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01254748, + "epoch": 0.5863219600180369, + "flos": 20707688678400.0, + "grad_norm": 2.2695397417566134, + "language_loss": 0.74817115, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.82511884, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12280273, + "step": 9752, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.06419428, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06275827, + "balance_loss_mlp": 0.01255026, + "epoch": 0.586382083270705, + "flos": 19798258389120.0, + "grad_norm": 1.7375633359019997, + "language_loss": 0.77788973, + "learning_rate": 1.541625017642943e-06, + "loss": 0.85475028, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1159668, + "step": 9753, + "time_per_iteration": 2.5376296043395996 + }, + { + "auxiliary_loss_clip": 0.06415142, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06275599, + "balance_loss_mlp": 0.01256478, + "epoch": 0.5864422065233729, + "flos": 16504821659520.0, + "grad_norm": 1.5941521516898884, + "language_loss": 0.71418774, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.79100442, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 9754, + "time_per_iteration": 2.482060670852661 + }, + { + "auxiliary_loss_clip": 0.06418964, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01254706, + "epoch": 0.5865023297760409, + "flos": 20419657868160.0, + "grad_norm": 1.5122611907827943, + "language_loss": 0.72473872, + "learning_rate": 1.540866862214043e-06, + "loss": 0.80158961, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11407471, + "step": 9755, + "time_per_iteration": 2.5370032787323 + }, + { + "auxiliary_loss_clip": 0.06317496, + "auxiliary_loss_mlp": 0.01251101, + "balance_loss_clip": 0.06257688, + "balance_loss_mlp": 0.01249532, + "epoch": 0.5865624530287089, + "flos": 63369386864640.0, + "grad_norm": 0.7287908319651881, + "language_loss": 0.56949997, + "learning_rate": 1.540487810607967e-06, + "loss": 0.64518595, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01570129, + "step": 9756, + "time_per_iteration": 3.10322904586792 + }, + { + "auxiliary_loss_clip": 0.06418074, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5866225762813768, + "flos": 27023610360960.0, + "grad_norm": 1.7386050489235434, + "language_loss": 0.76836097, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.84522557, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10333252, + "step": 9757, + "time_per_iteration": 2.5645911693573 + }, + { + "auxiliary_loss_clip": 0.06316153, + "auxiliary_loss_mlp": 0.01253974, + "balance_loss_clip": 0.06255822, + "balance_loss_mlp": 0.01252219, + "epoch": 0.5866826995340448, + "flos": 73007941224960.0, + "grad_norm": 0.8367731636564993, + "language_loss": 0.60245061, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.67815191, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01760864, + "step": 9758, + "time_per_iteration": 3.129420042037964 + }, + { + "auxiliary_loss_clip": 0.06425761, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.0125824, + "epoch": 0.5867428227867127, + "flos": 21291716436480.0, + "grad_norm": 2.341889353580635, + "language_loss": 0.7231499, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.80010581, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9759, + "time_per_iteration": 2.5044219493865967 + }, + { + "auxiliary_loss_clip": 0.06416983, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274659, + "balance_loss_mlp": 0.01254924, + "epoch": 0.5868029460393808, + "flos": 33476356961280.0, + "grad_norm": 1.459885556596891, + "language_loss": 0.73556709, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.8123973, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11114502, + "step": 9760, + "time_per_iteration": 2.662318229675293 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06273922, + "balance_loss_mlp": 0.01252944, + "epoch": 0.5868630692920487, + "flos": 17894382243840.0, + "grad_norm": 1.6271911446451897, + "language_loss": 0.7251972, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.80200839, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11352539, + "step": 9761, + "time_per_iteration": 2.635671377182007 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265487, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253274, + "epoch": 0.5869231925447167, + "flos": 21041770106880.0, + "grad_norm": 1.8098960680000724, + "language_loss": 0.74938971, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.8262558, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12200928, + "step": 9762, + "time_per_iteration": 2.511338472366333 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01255766, + "epoch": 0.5869833157973846, + "flos": 74753288974080.0, + "grad_norm": 1.2323244190692502, + "language_loss": 0.72678411, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.80359328, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10723877, + "step": 9763, + "time_per_iteration": 2.966012716293335 + }, + { + "auxiliary_loss_clip": 0.06416167, + "auxiliary_loss_mlp": 0.01264221, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01253296, + "epoch": 0.5870434390500526, + "flos": 17644687476480.0, + "grad_norm": 1.6070407244149296, + "language_loss": 0.79883134, + "learning_rate": 1.53745602625755e-06, + "loss": 0.87563521, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10931396, + "step": 9764, + "time_per_iteration": 2.5360097885131836 + }, + { + "auxiliary_loss_clip": 0.06420099, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01258342, + "epoch": 0.5871035623027205, + "flos": 21512424890880.0, + "grad_norm": 2.0596306569779967, + "language_loss": 0.79149717, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.86839771, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1161499, + "step": 9765, + "time_per_iteration": 2.523232936859131 + }, + { + "auxiliary_loss_clip": 0.06413256, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06272542, + "balance_loss_mlp": 0.01254427, + "epoch": 0.5871636855553886, + "flos": 13556744732160.0, + "grad_norm": 1.6377752901078153, + "language_loss": 0.83660257, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.91338348, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10406494, + "step": 9766, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06423902, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01257639, + "epoch": 0.5872238088080565, + "flos": 26220006178560.0, + "grad_norm": 1.5173362705755495, + "language_loss": 0.69876915, + "learning_rate": 1.536319396136257e-06, + "loss": 0.77569771, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11322021, + "step": 9767, + "time_per_iteration": 2.53935170173645 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5872839320607245, + "flos": 30673196870400.0, + "grad_norm": 6.458419959703109, + "language_loss": 0.64030594, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.71715188, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11151123, + "step": 9768, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.06324692, + "auxiliary_loss_mlp": 0.01254391, + "balance_loss_clip": 0.06264571, + "balance_loss_mlp": 0.01252818, + "epoch": 0.5873440553133924, + "flos": 60324623925120.0, + "grad_norm": 0.7185710562845293, + "language_loss": 0.53754711, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.61333793, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01573944, + "step": 9769, + "time_per_iteration": 4.53153133392334 + }, + { + "auxiliary_loss_clip": 0.06416724, + "auxiliary_loss_mlp": 0.01267359, + "balance_loss_clip": 0.0627375, + "balance_loss_mlp": 0.01256409, + "epoch": 0.5874041785660604, + "flos": 21545016929280.0, + "grad_norm": 1.3491952646211745, + "language_loss": 0.70993185, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.78677267, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10949707, + "step": 9770, + "time_per_iteration": 2.5152831077575684 + }, + { + "auxiliary_loss_clip": 0.06416201, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274108, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5874643018187284, + "flos": 24395778938880.0, + "grad_norm": 1.9550841164663295, + "language_loss": 0.67880088, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.75564533, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9771, + "time_per_iteration": 2.518069267272949 + }, + { + "auxiliary_loss_clip": 0.06421787, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.0125531, + "epoch": 0.5875244250713964, + "flos": 28155300405120.0, + "grad_norm": 1.4791048602495522, + "language_loss": 0.66491324, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.74181026, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.1260376, + "step": 9772, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.0642426, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06276119, + "balance_loss_mlp": 0.01258866, + "epoch": 0.5875845483240644, + "flos": 25819566716160.0, + "grad_norm": 1.5545187987766196, + "language_loss": 0.7466417, + "learning_rate": 1.534046611017519e-06, + "loss": 0.82359904, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.12615967, + "step": 9773, + "time_per_iteration": 2.533243179321289 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5876446715767323, + "flos": 26913843112320.0, + "grad_norm": 1.8911636717759477, + "language_loss": 0.54071677, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.61759812, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11138916, + "step": 9774, + "time_per_iteration": 2.5565576553344727 + }, + { + "auxiliary_loss_clip": 0.06419463, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06276506, + "balance_loss_mlp": 0.01253192, + "epoch": 0.5877047948294003, + "flos": 36693750510720.0, + "grad_norm": 2.5652883668591886, + "language_loss": 0.65881801, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.73565692, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11242676, + "step": 9775, + "time_per_iteration": 4.102318525314331 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01267575, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01256459, + "epoch": 0.5877649180820682, + "flos": 26732057679360.0, + "grad_norm": 1.541611587459476, + "language_loss": 0.73877925, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.81564349, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11114502, + "step": 9776, + "time_per_iteration": 2.534105062484741 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01267161, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.0125586, + "epoch": 0.5878250413347362, + "flos": 21038457870720.0, + "grad_norm": 1.5037279013590201, + "language_loss": 0.7431531, + "learning_rate": 1.532531774126821e-06, + "loss": 0.81998503, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11315918, + "step": 9777, + "time_per_iteration": 2.501791000366211 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01267719, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257407, + "epoch": 0.5878851645874041, + "flos": 25491397000320.0, + "grad_norm": 1.389592011343503, + "language_loss": 0.74136406, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.81816691, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10302734, + "step": 9778, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06416066, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261293, + "epoch": 0.5879452878400722, + "flos": 23775930760320.0, + "grad_norm": 1.6684393614308786, + "language_loss": 0.70061487, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.77750337, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11486816, + "step": 9779, + "time_per_iteration": 3.9999070167541504 + }, + { + "auxiliary_loss_clip": 0.06419669, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06273909, + "balance_loss_mlp": 0.0125331, + "epoch": 0.5880054110927401, + "flos": 17830749467520.0, + "grad_norm": 1.9325071243234666, + "language_loss": 0.67414713, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.75099313, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11633301, + "step": 9780, + "time_per_iteration": 2.525421142578125 + }, + { + "auxiliary_loss_clip": 0.06422442, + "auxiliary_loss_mlp": 0.01271374, + "balance_loss_clip": 0.0627559, + "balance_loss_mlp": 0.0125981, + "epoch": 0.5880655343454081, + "flos": 19469417840640.0, + "grad_norm": 1.9086155780635632, + "language_loss": 0.73100537, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.80794352, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11572266, + "step": 9781, + "time_per_iteration": 2.4647257328033447 + }, + { + "auxiliary_loss_clip": 0.06415875, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.06273176, + "balance_loss_mlp": 0.01258731, + "epoch": 0.588125657598076, + "flos": 21403999307520.0, + "grad_norm": 1.283507981192047, + "language_loss": 0.7022016, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.77905786, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11016846, + "step": 9782, + "time_per_iteration": 2.531780481338501 + }, + { + "auxiliary_loss_clip": 0.06420694, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274669, + "balance_loss_mlp": 0.01256314, + "epoch": 0.588185780850744, + "flos": 16040246296320.0, + "grad_norm": 2.020771184042221, + "language_loss": 0.71036118, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.78724945, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11804199, + "step": 9783, + "time_per_iteration": 2.452061176300049 + }, + { + "auxiliary_loss_clip": 0.06426281, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06277394, + "balance_loss_mlp": 0.01255538, + "epoch": 0.588245904103412, + "flos": 23734282481280.0, + "grad_norm": 1.861465214251895, + "language_loss": 0.69312334, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.77006149, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12005615, + "step": 9784, + "time_per_iteration": 2.552767515182495 + }, + { + "auxiliary_loss_clip": 0.06421058, + "auxiliary_loss_mlp": 0.01265879, + "balance_loss_clip": 0.06273105, + "balance_loss_mlp": 0.01254596, + "epoch": 0.58830602735608, + "flos": 33810983441280.0, + "grad_norm": 1.7066395827536198, + "language_loss": 0.69576097, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.77263039, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.112854, + "step": 9785, + "time_per_iteration": 3.9847395420074463 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01255352, + "epoch": 0.588366150608748, + "flos": 17096144722560.0, + "grad_norm": 1.8665479354272698, + "language_loss": 0.78022271, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.85703707, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10369873, + "step": 9786, + "time_per_iteration": 2.4842867851257324 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.01266691, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01255128, + "epoch": 0.5884262738614159, + "flos": 22133698588800.0, + "grad_norm": 1.4734886628165487, + "language_loss": 0.78796208, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.86486876, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11566162, + "step": 9787, + "time_per_iteration": 2.497192144393921 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06275064, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5884863971140839, + "flos": 21038038600320.0, + "grad_norm": 1.5088398107909506, + "language_loss": 0.66488671, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.74172926, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10943604, + "step": 9788, + "time_per_iteration": 2.5208425521850586 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01255212, + "epoch": 0.5885465203667518, + "flos": 23811835034880.0, + "grad_norm": 2.124690797246634, + "language_loss": 0.8100794, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.88691187, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11651611, + "step": 9789, + "time_per_iteration": 2.497751235961914 + }, + { + "auxiliary_loss_clip": 0.06413969, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257432, + "epoch": 0.5886066436194198, + "flos": 18886647893760.0, + "grad_norm": 1.5219157367370164, + "language_loss": 0.69998693, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.77680737, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10638428, + "step": 9790, + "time_per_iteration": 2.5238122940063477 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06275025, + "balance_loss_mlp": 0.01254484, + "epoch": 0.5886667668720877, + "flos": 24797015015040.0, + "grad_norm": 1.9547129753533632, + "language_loss": 0.83327186, + "learning_rate": 1.527232084570895e-06, + "loss": 0.91010225, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11651611, + "step": 9791, + "time_per_iteration": 2.518833637237549 + }, + { + "auxiliary_loss_clip": 0.06420578, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01259297, + "epoch": 0.5887268901247558, + "flos": 21620473130880.0, + "grad_norm": 1.5293641441028467, + "language_loss": 0.76486295, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.84176975, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1081543, + "step": 9792, + "time_per_iteration": 2.5101959705352783 + }, + { + "auxiliary_loss_clip": 0.06421857, + "auxiliary_loss_mlp": 0.01269547, + "balance_loss_clip": 0.06273879, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5887870133774237, + "flos": 20487357567360.0, + "grad_norm": 2.1847202997614477, + "language_loss": 0.69169068, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.76860476, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11114502, + "step": 9793, + "time_per_iteration": 2.4927995204925537 + }, + { + "auxiliary_loss_clip": 0.06418081, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06276278, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5888471366300917, + "flos": 19211966570880.0, + "grad_norm": 1.7416997591947727, + "language_loss": 0.60439771, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.68127453, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11132812, + "step": 9794, + "time_per_iteration": 2.543231248855591 + }, + { + "auxiliary_loss_clip": 0.06420963, + "auxiliary_loss_mlp": 0.01267396, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01256113, + "epoch": 0.5889072598827596, + "flos": 19978786010880.0, + "grad_norm": 1.5723031838894885, + "language_loss": 0.65483499, + "learning_rate": 1.525718531219257e-06, + "loss": 0.73171854, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11291504, + "step": 9795, + "time_per_iteration": 2.502537965774536 + }, + { + "auxiliary_loss_clip": 0.06414207, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06274657, + "balance_loss_mlp": 0.01255197, + "epoch": 0.5889673831354276, + "flos": 20747617948800.0, + "grad_norm": 1.4841948976653832, + "language_loss": 0.74256188, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.81936008, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10418701, + "step": 9796, + "time_per_iteration": 2.496511220932007 + }, + { + "auxiliary_loss_clip": 0.06417978, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01256238, + "epoch": 0.5890275063880956, + "flos": 25307892558720.0, + "grad_norm": 2.3243895650299566, + "language_loss": 0.83142781, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.90828037, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11047363, + "step": 9797, + "time_per_iteration": 2.5991365909576416 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01250806, + "epoch": 0.5890876296407636, + "flos": 11770182702720.0, + "grad_norm": 1.5626242229143896, + "language_loss": 0.79473782, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.87152421, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1083374, + "step": 9798, + "time_per_iteration": 2.5399045944213867 + }, + { + "auxiliary_loss_clip": 0.06414175, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06274281, + "balance_loss_mlp": 0.01254584, + "epoch": 0.5891477528934316, + "flos": 13594535723520.0, + "grad_norm": 2.254418827792415, + "language_loss": 0.75000322, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.82679403, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10321045, + "step": 9799, + "time_per_iteration": 2.4642131328582764 + }, + { + "auxiliary_loss_clip": 0.06420485, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01254798, + "epoch": 0.5892078761460995, + "flos": 15054563191680.0, + "grad_norm": 1.9320779180150096, + "language_loss": 0.76666486, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.84353948, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12182617, + "step": 9800, + "time_per_iteration": 2.5170304775238037 + }, + { + "auxiliary_loss_clip": 0.06421179, + "auxiliary_loss_mlp": 0.01264846, + "balance_loss_clip": 0.06275316, + "balance_loss_mlp": 0.0125361, + "epoch": 0.5892679993987675, + "flos": 15783591640320.0, + "grad_norm": 1.6350760782373632, + "language_loss": 0.79415876, + "learning_rate": 1.523448741022722e-06, + "loss": 0.87101901, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11242676, + "step": 9801, + "time_per_iteration": 2.4804494380950928 + }, + { + "auxiliary_loss_clip": 0.06421967, + "auxiliary_loss_mlp": 0.01265274, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01253467, + "epoch": 0.5893281226514354, + "flos": 25272281773440.0, + "grad_norm": 1.6257193775599612, + "language_loss": 0.6664654, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.74333781, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11804199, + "step": 9802, + "time_per_iteration": 2.536524534225464 + }, + { + "auxiliary_loss_clip": 0.06417859, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01256475, + "epoch": 0.5893882459041034, + "flos": 19463380346880.0, + "grad_norm": 2.7221530495776953, + "language_loss": 0.78339422, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.86024731, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10986328, + "step": 9803, + "time_per_iteration": 2.4658396244049072 + }, + { + "auxiliary_loss_clip": 0.06422158, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5894483691567713, + "flos": 20640785592960.0, + "grad_norm": 1.3509589673333673, + "language_loss": 0.73070806, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.80759096, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10986328, + "step": 9804, + "time_per_iteration": 2.5561769008636475 + }, + { + "auxiliary_loss_clip": 0.06421436, + "auxiliary_loss_mlp": 0.01267021, + "balance_loss_clip": 0.06279321, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5895084924094394, + "flos": 17782812132480.0, + "grad_norm": 4.893575785915148, + "language_loss": 0.74802667, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.82491124, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11254883, + "step": 9805, + "time_per_iteration": 2.4777255058288574 + }, + { + "auxiliary_loss_clip": 0.06430615, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254542, + "epoch": 0.5895686156621073, + "flos": 20127350499840.0, + "grad_norm": 1.9675390106462767, + "language_loss": 0.78339982, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8603704, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11901855, + "step": 9806, + "time_per_iteration": 2.556187868118286 + }, + { + "auxiliary_loss_clip": 0.06426841, + "auxiliary_loss_mlp": 0.01268335, + "balance_loss_clip": 0.06283563, + "balance_loss_mlp": 0.01256813, + "epoch": 0.5896287389147753, + "flos": 20856337021440.0, + "grad_norm": 1.8953677951134942, + "language_loss": 0.77413982, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.85109162, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11523438, + "step": 9807, + "time_per_iteration": 2.519200325012207 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01268029, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.01256341, + "epoch": 0.5896888621674432, + "flos": 14543098669440.0, + "grad_norm": 1.5805632295861456, + "language_loss": 0.75183058, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.82876456, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 9808, + "time_per_iteration": 3.908586025238037 + }, + { + "auxiliary_loss_clip": 0.06422409, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06277257, + "balance_loss_mlp": 0.01253912, + "epoch": 0.5897489854201112, + "flos": 20893079836800.0, + "grad_norm": 1.9290339931200338, + "language_loss": 0.71909666, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.79598099, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12103271, + "step": 9809, + "time_per_iteration": 2.5768144130706787 + }, + { + "auxiliary_loss_clip": 0.06423716, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.0627635, + "balance_loss_mlp": 0.01254272, + "epoch": 0.5898091086727792, + "flos": 20017331688960.0, + "grad_norm": 2.0062119760557473, + "language_loss": 0.82969332, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.90659165, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1184082, + "step": 9810, + "time_per_iteration": 2.5024096965789795 + }, + { + "auxiliary_loss_clip": 0.06418087, + "auxiliary_loss_mlp": 0.01268409, + "balance_loss_clip": 0.06278655, + "balance_loss_mlp": 0.01257394, + "epoch": 0.5898692319254472, + "flos": 16258816471680.0, + "grad_norm": 2.656719323590735, + "language_loss": 0.81247234, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8893373, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11016846, + "step": 9811, + "time_per_iteration": 2.5079774856567383 + }, + { + "auxiliary_loss_clip": 0.06424809, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01254442, + "epoch": 0.5899293551781152, + "flos": 20454723601920.0, + "grad_norm": 1.7175276958807264, + "language_loss": 0.7698791, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.84679055, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11883545, + "step": 9812, + "time_per_iteration": 2.4813108444213867 + }, + { + "auxiliary_loss_clip": 0.06419283, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.0627578, + "balance_loss_mlp": 0.01253992, + "epoch": 0.5899894784307831, + "flos": 13886885018880.0, + "grad_norm": 1.6786934004730485, + "language_loss": 0.71137106, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.78820813, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10437012, + "step": 9813, + "time_per_iteration": 2.5212063789367676 + }, + { + "auxiliary_loss_clip": 0.0641876, + "auxiliary_loss_mlp": 0.01270874, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01259394, + "epoch": 0.5900496016834511, + "flos": 20089936851840.0, + "grad_norm": 1.420675326684763, + "language_loss": 0.7244218, + "learning_rate": 1.518533098148494e-06, + "loss": 0.80131817, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11468506, + "step": 9814, + "time_per_iteration": 2.4773387908935547 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01256768, + "epoch": 0.590109724936119, + "flos": 20264133490560.0, + "grad_norm": 1.7152732807584992, + "language_loss": 0.7885775, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.86546993, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 9815, + "time_per_iteration": 3.939445972442627 + }, + { + "auxiliary_loss_clip": 0.06427211, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01255142, + "epoch": 0.590169848188787, + "flos": 24240548050560.0, + "grad_norm": 1.7218203048390952, + "language_loss": 0.76316988, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.84011579, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12243652, + "step": 9816, + "time_per_iteration": 2.5245048999786377 + }, + { + "auxiliary_loss_clip": 0.06419881, + "auxiliary_loss_mlp": 0.01267479, + "balance_loss_clip": 0.06277047, + "balance_loss_mlp": 0.01255725, + "epoch": 0.590229971441455, + "flos": 17790400926720.0, + "grad_norm": 1.8371364848215923, + "language_loss": 0.81572855, + "learning_rate": 1.517399156051309e-06, + "loss": 0.89260209, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11743164, + "step": 9817, + "time_per_iteration": 2.4621410369873047 + }, + { + "auxiliary_loss_clip": 0.06418833, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01257544, + "epoch": 0.590290094694123, + "flos": 22243465837440.0, + "grad_norm": 1.5541077044812335, + "language_loss": 0.76864719, + "learning_rate": 1.517021211933682e-06, + "loss": 0.84551811, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10717773, + "step": 9818, + "time_per_iteration": 2.5125410556793213 + }, + { + "auxiliary_loss_clip": 0.06416667, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255501, + "epoch": 0.5903502179467909, + "flos": 19104589163520.0, + "grad_norm": 1.8321116335564553, + "language_loss": 0.67227435, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.74909973, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10369873, + "step": 9819, + "time_per_iteration": 4.011074066162109 + }, + { + "auxiliary_loss_clip": 0.06420997, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01255819, + "epoch": 0.5904103411994589, + "flos": 24241051175040.0, + "grad_norm": 1.4923193447304384, + "language_loss": 0.7829935, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.85986888, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10723877, + "step": 9820, + "time_per_iteration": 2.5523388385772705 + }, + { + "auxiliary_loss_clip": 0.06318125, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06258737, + "balance_loss_mlp": 0.01253092, + "epoch": 0.5904704644521268, + "flos": 64894388774400.0, + "grad_norm": 0.9340841048050909, + "language_loss": 0.65183949, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.72756588, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01417542, + "step": 9821, + "time_per_iteration": 3.1619784832000732 + }, + { + "auxiliary_loss_clip": 0.06416959, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01251935, + "epoch": 0.5905305877047948, + "flos": 19616137539840.0, + "grad_norm": 2.101599923194391, + "language_loss": 0.6190716, + "learning_rate": 1.515509618752521e-06, + "loss": 0.69586486, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10412598, + "step": 9822, + "time_per_iteration": 2.519482374191284 + }, + { + "auxiliary_loss_clip": 0.06419894, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06275024, + "balance_loss_mlp": 0.01257365, + "epoch": 0.5905907109574628, + "flos": 18995660455680.0, + "grad_norm": 1.8507285157055846, + "language_loss": 0.82910419, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.90599167, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1149292, + "step": 9823, + "time_per_iteration": 2.5134451389312744 + }, + { + "auxiliary_loss_clip": 0.06417045, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01256546, + "epoch": 0.5906508342101308, + "flos": 22206974584320.0, + "grad_norm": 1.8772651852061113, + "language_loss": 0.73388183, + "learning_rate": 1.514753932336165e-06, + "loss": 0.81072783, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11004639, + "step": 9824, + "time_per_iteration": 3.8841147422790527 + }, + { + "auxiliary_loss_clip": 0.064331, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255013, + "epoch": 0.5907109574627988, + "flos": 20892995982720.0, + "grad_norm": 1.9523854086350827, + "language_loss": 0.82938302, + "learning_rate": 1.514376116721693e-06, + "loss": 0.90639031, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12609863, + "step": 9825, + "time_per_iteration": 2.527808427810669 + }, + { + "auxiliary_loss_clip": 0.06417271, + "auxiliary_loss_mlp": 0.01264281, + "balance_loss_clip": 0.06277614, + "balance_loss_mlp": 0.0125422, + "epoch": 0.5907710807154667, + "flos": 21513011869440.0, + "grad_norm": 1.8272335212588457, + "language_loss": 0.76679188, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.84360743, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10058594, + "step": 9826, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.06416261, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06274769, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5908312039681347, + "flos": 22024979516160.0, + "grad_norm": 1.5050840799955296, + "language_loss": 0.7292102, + "learning_rate": 1.513620540751793e-06, + "loss": 0.80599833, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10131836, + "step": 9827, + "time_per_iteration": 2.5261569023132324 + }, + { + "auxiliary_loss_clip": 0.06419525, + "auxiliary_loss_mlp": 0.01266997, + "balance_loss_clip": 0.0627335, + "balance_loss_mlp": 0.0125588, + "epoch": 0.5908913272208026, + "flos": 18485579525760.0, + "grad_norm": 1.8170415974974599, + "language_loss": 0.80223072, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.87909591, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11120605, + "step": 9828, + "time_per_iteration": 2.4725866317749023 + }, + { + "auxiliary_loss_clip": 0.06421993, + "auxiliary_loss_mlp": 0.01272492, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5909514504734706, + "flos": 12317006448000.0, + "grad_norm": 1.8455350152663679, + "language_loss": 0.88620806, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.96315295, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12133789, + "step": 9829, + "time_per_iteration": 2.4783804416656494 + }, + { + "auxiliary_loss_clip": 0.06324679, + "auxiliary_loss_mlp": 0.01254341, + "balance_loss_clip": 0.06265787, + "balance_loss_mlp": 0.01252693, + "epoch": 0.5910115737261386, + "flos": 70233557811840.0, + "grad_norm": 0.7549892406299625, + "language_loss": 0.57903004, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.6548202, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01651001, + "step": 9830, + "time_per_iteration": 3.0390307903289795 + }, + { + "auxiliary_loss_clip": 0.0643173, + "auxiliary_loss_mlp": 0.01269908, + "balance_loss_clip": 0.06281478, + "balance_loss_mlp": 0.01257308, + "epoch": 0.5910716969788066, + "flos": 22024266756480.0, + "grad_norm": 2.1560619163105965, + "language_loss": 0.75963652, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.83665287, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12591553, + "step": 9831, + "time_per_iteration": 2.5367510318756104 + }, + { + "auxiliary_loss_clip": 0.06409759, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01252124, + "epoch": 0.5911318202314745, + "flos": 21258034295040.0, + "grad_norm": 1.5753423885742641, + "language_loss": 0.77885556, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.85558021, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10583496, + "step": 9832, + "time_per_iteration": 2.504584789276123 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01265662, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01254797, + "epoch": 0.5911919434841425, + "flos": 17827353377280.0, + "grad_norm": 1.6998910709640538, + "language_loss": 0.83265263, + "learning_rate": 1.511354255945847e-06, + "loss": 0.90947747, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9833, + "time_per_iteration": 2.508920192718506 + }, + { + "auxiliary_loss_clip": 0.06420296, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5912520667368104, + "flos": 20380818700800.0, + "grad_norm": 1.4145847544307324, + "language_loss": 0.74488783, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.82178807, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10723877, + "step": 9834, + "time_per_iteration": 2.515340566635132 + }, + { + "auxiliary_loss_clip": 0.06420908, + "auxiliary_loss_mlp": 0.0126652, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01255308, + "epoch": 0.5913121899894784, + "flos": 17936240158080.0, + "grad_norm": 2.2554155860211296, + "language_loss": 0.78118962, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.85806394, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11212158, + "step": 9835, + "time_per_iteration": 2.516449213027954 + }, + { + "auxiliary_loss_clip": 0.06422424, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5913723132421465, + "flos": 22133405099520.0, + "grad_norm": 1.7910918924229287, + "language_loss": 0.74562353, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.82253206, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11187744, + "step": 9836, + "time_per_iteration": 2.4944818019866943 + }, + { + "auxiliary_loss_clip": 0.06421088, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06274953, + "balance_loss_mlp": 0.01252396, + "epoch": 0.5914324364948144, + "flos": 15702056017920.0, + "grad_norm": 1.9466597288818261, + "language_loss": 0.82267582, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.89952636, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11572266, + "step": 9837, + "time_per_iteration": 2.5073657035827637 + }, + { + "auxiliary_loss_clip": 0.06423111, + "auxiliary_loss_mlp": 0.01265723, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01253665, + "epoch": 0.5914925597474824, + "flos": 22753924110720.0, + "grad_norm": 1.6146002375859378, + "language_loss": 0.7983368, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.87522513, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1206665, + "step": 9838, + "time_per_iteration": 2.5024936199188232 + }, + { + "auxiliary_loss_clip": 0.06421801, + "auxiliary_loss_mlp": 0.01267887, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01256503, + "epoch": 0.5915526830001503, + "flos": 18298092015360.0, + "grad_norm": 1.7930328536333848, + "language_loss": 0.70194936, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.77884626, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11383057, + "step": 9839, + "time_per_iteration": 2.5000133514404297 + }, + { + "auxiliary_loss_clip": 0.06421608, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5916128062528183, + "flos": 17024713516800.0, + "grad_norm": 2.2460586823912254, + "language_loss": 0.65840614, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.73527294, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10614014, + "step": 9840, + "time_per_iteration": 2.472325325012207 + }, + { + "auxiliary_loss_clip": 0.06421183, + "auxiliary_loss_mlp": 0.01269035, + "balance_loss_clip": 0.06273993, + "balance_loss_mlp": 0.01257019, + "epoch": 0.5916729295054862, + "flos": 24761194594560.0, + "grad_norm": 7.488465580129743, + "language_loss": 0.82013118, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.89703333, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12023926, + "step": 9841, + "time_per_iteration": 2.539569139480591 + }, + { + "auxiliary_loss_clip": 0.06417108, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01255782, + "epoch": 0.5917330527581542, + "flos": 15963196867200.0, + "grad_norm": 1.7355438933283587, + "language_loss": 0.69817364, + "learning_rate": 1.507956080444291e-06, + "loss": 0.77500588, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10333252, + "step": 9842, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06423896, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01256332, + "epoch": 0.5917931760108222, + "flos": 23806719936000.0, + "grad_norm": 2.0642371985300105, + "language_loss": 0.83243513, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.90935493, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11755371, + "step": 9843, + "time_per_iteration": 2.5579354763031006 + }, + { + "auxiliary_loss_clip": 0.06423706, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_clip": 0.06277691, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5918532992634902, + "flos": 23254864945920.0, + "grad_norm": 2.21208381325965, + "language_loss": 0.81869078, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.89556968, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11773682, + "step": 9844, + "time_per_iteration": 2.4732062816619873 + }, + { + "auxiliary_loss_clip": 0.06423113, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01253496, + "epoch": 0.5919134225161581, + "flos": 19505867166720.0, + "grad_norm": 2.0396261684123966, + "language_loss": 0.74979722, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.8266741, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11077881, + "step": 9845, + "time_per_iteration": 2.5498902797698975 + }, + { + "auxiliary_loss_clip": 0.0642004, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01255215, + "epoch": 0.5919735457688261, + "flos": 38810201264640.0, + "grad_norm": 1.7793580681254029, + "language_loss": 0.64624578, + "learning_rate": 1.506446264718213e-06, + "loss": 0.72311807, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11962891, + "step": 9846, + "time_per_iteration": 2.6562187671661377 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01268591, + "balance_loss_clip": 0.06275991, + "balance_loss_mlp": 0.01258851, + "epoch": 0.592033669021494, + "flos": 22170567185280.0, + "grad_norm": 1.5989871653678733, + "language_loss": 0.76435882, + "learning_rate": 1.506068857539931e-06, + "loss": 0.84116036, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09735107, + "step": 9847, + "time_per_iteration": 2.5877273082733154 + }, + { + "auxiliary_loss_clip": 0.06420001, + "auxiliary_loss_mlp": 0.01267428, + "balance_loss_clip": 0.06274936, + "balance_loss_mlp": 0.01255477, + "epoch": 0.592093792274162, + "flos": 22717600565760.0, + "grad_norm": 1.9085044692476394, + "language_loss": 0.62601185, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.70288616, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11956787, + "step": 9848, + "time_per_iteration": 3.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.06422321, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01252959, + "epoch": 0.59215391552683, + "flos": 22535605497600.0, + "grad_norm": 2.0066393042716855, + "language_loss": 0.76503384, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.84189683, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11022949, + "step": 9849, + "time_per_iteration": 2.5015931129455566 + }, + { + "auxiliary_loss_clip": 0.06421839, + "auxiliary_loss_mlp": 0.01268681, + "balance_loss_clip": 0.06277264, + "balance_loss_mlp": 0.01256671, + "epoch": 0.592214038779498, + "flos": 24505965457920.0, + "grad_norm": 1.745648722955103, + "language_loss": 0.75836027, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.8352654, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12005615, + "step": 9850, + "time_per_iteration": 2.600179672241211 + }, + { + "auxiliary_loss_clip": 0.06417172, + "auxiliary_loss_mlp": 0.01268411, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01257367, + "epoch": 0.592274162032166, + "flos": 21837156589440.0, + "grad_norm": 1.6508975523953922, + "language_loss": 0.75545883, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.83231473, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1105957, + "step": 9851, + "time_per_iteration": 2.4818735122680664 + }, + { + "auxiliary_loss_clip": 0.06419359, + "auxiliary_loss_mlp": 0.01266702, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01254918, + "epoch": 0.5923342852848339, + "flos": 24615061873920.0, + "grad_norm": 1.7463946887344501, + "language_loss": 0.70506394, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.78192449, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11779785, + "step": 9852, + "time_per_iteration": 2.587822675704956 + }, + { + "auxiliary_loss_clip": 0.06423963, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01261043, + "epoch": 0.5923944085375019, + "flos": 19944307255680.0, + "grad_norm": 1.582534152024796, + "language_loss": 0.80272847, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.87970185, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12347412, + "step": 9853, + "time_per_iteration": 2.4834022521972656 + }, + { + "auxiliary_loss_clip": 0.06412584, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01253985, + "epoch": 0.5924545317901698, + "flos": 28666177948800.0, + "grad_norm": 1.4145056961897013, + "language_loss": 0.67743915, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.75421, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1050415, + "step": 9854, + "time_per_iteration": 3.9716901779174805 + }, + { + "auxiliary_loss_clip": 0.06417395, + "auxiliary_loss_mlp": 0.01268291, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01257514, + "epoch": 0.5925146550428378, + "flos": 19870989333120.0, + "grad_norm": 1.7006302713228023, + "language_loss": 0.89085132, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.96770817, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10772705, + "step": 9855, + "time_per_iteration": 2.54018235206604 + }, + { + "auxiliary_loss_clip": 0.06414687, + "auxiliary_loss_mlp": 0.01266215, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.0125585, + "epoch": 0.5925747782955058, + "flos": 15128510019840.0, + "grad_norm": 1.7501100927117066, + "language_loss": 0.86997199, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.94678098, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10369873, + "step": 9856, + "time_per_iteration": 2.5016441345214844 + }, + { + "auxiliary_loss_clip": 0.06422357, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06275797, + "balance_loss_mlp": 0.01254177, + "epoch": 0.5926349015481738, + "flos": 18411297281280.0, + "grad_norm": 1.7487529922228526, + "language_loss": 0.77790916, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.85478473, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9857, + "time_per_iteration": 2.5232088565826416 + }, + { + "auxiliary_loss_clip": 0.06421745, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06278913, + "balance_loss_mlp": 0.01254689, + "epoch": 0.5926950248008417, + "flos": 23117620757760.0, + "grad_norm": 2.3581492349261524, + "language_loss": 0.65045798, + "learning_rate": 1.501918617901419e-06, + "loss": 0.72733665, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11431885, + "step": 9858, + "time_per_iteration": 4.080450773239136 + }, + { + "auxiliary_loss_clip": 0.06418257, + "auxiliary_loss_mlp": 0.01268065, + "balance_loss_clip": 0.06277932, + "balance_loss_mlp": 0.01256662, + "epoch": 0.5927551480535097, + "flos": 28040753473920.0, + "grad_norm": 1.620046821031832, + "language_loss": 0.77013564, + "learning_rate": 1.501541436426501e-06, + "loss": 0.84699887, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 9859, + "time_per_iteration": 2.5496175289154053 + }, + { + "auxiliary_loss_clip": 0.06422819, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.01260217, + "epoch": 0.5928152713061776, + "flos": 21805109602560.0, + "grad_norm": 2.0806402016169914, + "language_loss": 0.75381404, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.8307631, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11865234, + "step": 9860, + "time_per_iteration": 2.4913806915283203 + }, + { + "auxiliary_loss_clip": 0.06419, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06277152, + "balance_loss_mlp": 0.01257557, + "epoch": 0.5928753945588456, + "flos": 24323802681600.0, + "grad_norm": 1.5719426663731493, + "language_loss": 0.7657429, + "learning_rate": 1.500787130195763e-06, + "loss": 0.84261084, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10235596, + "step": 9861, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.06416907, + "auxiliary_loss_mlp": 0.01266144, + "balance_loss_clip": 0.0627644, + "balance_loss_mlp": 0.01255355, + "epoch": 0.5929355178115137, + "flos": 26471126465280.0, + "grad_norm": 1.7884263747312634, + "language_loss": 0.70557332, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.78240383, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10797119, + "step": 9862, + "time_per_iteration": 2.5269577503204346 + }, + { + "auxiliary_loss_clip": 0.06422247, + "auxiliary_loss_mlp": 0.01262904, + "balance_loss_clip": 0.06279124, + "balance_loss_mlp": 0.01252455, + "epoch": 0.5929956410641816, + "flos": 24971798632320.0, + "grad_norm": 1.7042567790148921, + "language_loss": 0.7816, + "learning_rate": 1.500032899685832e-06, + "loss": 0.85845149, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10449219, + "step": 9863, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06423997, + "auxiliary_loss_mlp": 0.01269473, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01258917, + "epoch": 0.5930557643168496, + "flos": 26214639517440.0, + "grad_norm": 1.987432864542063, + "language_loss": 0.71297693, + "learning_rate": 1.499655812861921e-06, + "loss": 0.78991163, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9864, + "time_per_iteration": 4.022796869277954 + }, + { + "auxiliary_loss_clip": 0.0642028, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06276219, + "balance_loss_mlp": 0.01256578, + "epoch": 0.5931158875695175, + "flos": 27862322204160.0, + "grad_norm": 2.045271412380321, + "language_loss": 0.67615211, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11816406, + "step": 9865, + "time_per_iteration": 2.542477607727051 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06278679, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5931760108221855, + "flos": 15419014525440.0, + "grad_norm": 2.0467341556470906, + "language_loss": 0.78422129, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.86111438, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11535645, + "step": 9866, + "time_per_iteration": 2.5601937770843506 + }, + { + "auxiliary_loss_clip": 0.06417245, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06280121, + "balance_loss_mlp": 0.01256114, + "epoch": 0.5932361340748534, + "flos": 30196043395200.0, + "grad_norm": 1.6991427361252174, + "language_loss": 0.72385359, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.80069637, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10919189, + "step": 9867, + "time_per_iteration": 2.582200527191162 + }, + { + "auxiliary_loss_clip": 0.06421208, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06280105, + "balance_loss_mlp": 0.0125589, + "epoch": 0.5932962573275214, + "flos": 20163841752960.0, + "grad_norm": 1.4126147288957658, + "language_loss": 0.6694321, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.74632645, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.12335205, + "step": 9868, + "time_per_iteration": 2.515268087387085 + }, + { + "auxiliary_loss_clip": 0.06420252, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06275701, + "balance_loss_mlp": 0.01255046, + "epoch": 0.5933563805801894, + "flos": 25452725541120.0, + "grad_norm": 1.59033500525529, + "language_loss": 0.75624323, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.83311105, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11474609, + "step": 9869, + "time_per_iteration": 2.5264642238616943 + }, + { + "auxiliary_loss_clip": 0.06425707, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01254779, + "epoch": 0.5934165038328574, + "flos": 60007971674880.0, + "grad_norm": 1.9233451977688907, + "language_loss": 0.74787021, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.82478619, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11114502, + "step": 9870, + "time_per_iteration": 2.8604302406311035 + }, + { + "auxiliary_loss_clip": 0.06422332, + "auxiliary_loss_mlp": 0.01265883, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01254719, + "epoch": 0.5934766270855253, + "flos": 24426568114560.0, + "grad_norm": 2.4352017906666226, + "language_loss": 0.72491121, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.80179334, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11169434, + "step": 9871, + "time_per_iteration": 2.504990577697754 + }, + { + "auxiliary_loss_clip": 0.06424776, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.0627915, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5935367503381933, + "flos": 23519821155840.0, + "grad_norm": 2.2688315988077736, + "language_loss": 0.74858117, + "learning_rate": 1.496639802503271e-06, + "loss": 0.82548994, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11254883, + "step": 9872, + "time_per_iteration": 2.5957329273223877 + }, + { + "auxiliary_loss_clip": 0.06431574, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06283869, + "balance_loss_mlp": 0.01255517, + "epoch": 0.5935968735908612, + "flos": 18953550979200.0, + "grad_norm": 11.679124704717912, + "language_loss": 0.79073173, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.86772209, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9873, + "time_per_iteration": 2.4669687747955322 + }, + { + "auxiliary_loss_clip": 0.064208, + "auxiliary_loss_mlp": 0.01267302, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5936569968435292, + "flos": 25490432678400.0, + "grad_norm": 1.6349451241448802, + "language_loss": 0.85223055, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.9291116, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11853027, + "step": 9874, + "time_per_iteration": 2.5542490482330322 + }, + { + "auxiliary_loss_clip": 0.06322969, + "auxiliary_loss_mlp": 0.01256968, + "balance_loss_clip": 0.0626381, + "balance_loss_mlp": 0.01255485, + "epoch": 0.5937171200961973, + "flos": 66397364259840.0, + "grad_norm": 0.7006393782995821, + "language_loss": 0.59778833, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.67358768, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01482391, + "step": 9875, + "time_per_iteration": 3.2118613719940186 + }, + { + "auxiliary_loss_clip": 0.06429566, + "auxiliary_loss_mlp": 0.01269748, + "balance_loss_clip": 0.06278439, + "balance_loss_mlp": 0.01257302, + "epoch": 0.5937772433488652, + "flos": 14908849741440.0, + "grad_norm": 2.56951836872527, + "language_loss": 0.78072035, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.85771352, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12451172, + "step": 9876, + "time_per_iteration": 2.488849401473999 + }, + { + "auxiliary_loss_clip": 0.06411201, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01253764, + "epoch": 0.5938373666015332, + "flos": 22567484776320.0, + "grad_norm": 1.5512644369371444, + "language_loss": 0.7603606, + "learning_rate": 1.494755415907243e-06, + "loss": 0.83711803, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10772705, + "step": 9877, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06419433, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5938974898542011, + "flos": 18446572650240.0, + "grad_norm": 2.5934425226299243, + "language_loss": 0.81566256, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.8925426, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11779785, + "step": 9878, + "time_per_iteration": 2.498063802719116 + }, + { + "auxiliary_loss_clip": 0.0642112, + "auxiliary_loss_mlp": 0.0126802, + "balance_loss_clip": 0.06274901, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5939576131068691, + "flos": 45597029293440.0, + "grad_norm": 1.6161422600744055, + "language_loss": 0.71359301, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.79048443, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11029053, + "step": 9879, + "time_per_iteration": 2.7588438987731934 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06272938, + "balance_loss_mlp": 0.01254166, + "epoch": 0.594017736359537, + "flos": 23594648451840.0, + "grad_norm": 1.558347600048505, + "language_loss": 0.57834136, + "learning_rate": 1.493625013742401e-06, + "loss": 0.65514064, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11431885, + "step": 9880, + "time_per_iteration": 2.5477280616760254 + }, + { + "auxiliary_loss_clip": 0.0641728, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258751, + "epoch": 0.594077859612205, + "flos": 29464373543040.0, + "grad_norm": 1.9254284711947285, + "language_loss": 0.78115642, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.85803521, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11846924, + "step": 9881, + "time_per_iteration": 2.596902847290039 + }, + { + "auxiliary_loss_clip": 0.06421138, + "auxiliary_loss_mlp": 0.0126373, + "balance_loss_clip": 0.06276222, + "balance_loss_mlp": 0.01252882, + "epoch": 0.594137982864873, + "flos": 16805682144000.0, + "grad_norm": 2.173471904433077, + "language_loss": 0.83138072, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.90822935, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10839844, + "step": 9882, + "time_per_iteration": 2.483264446258545 + }, + { + "auxiliary_loss_clip": 0.06420217, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06276472, + "balance_loss_mlp": 0.01260318, + "epoch": 0.594198106117541, + "flos": 12755194974720.0, + "grad_norm": 2.093124407330454, + "language_loss": 0.79720157, + "learning_rate": 1.492494784393667e-06, + "loss": 0.87411857, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11175537, + "step": 9883, + "time_per_iteration": 2.5007734298706055 + }, + { + "auxiliary_loss_clip": 0.06424005, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01258097, + "epoch": 0.5942582293702089, + "flos": 21002930939520.0, + "grad_norm": 1.7867915832733556, + "language_loss": 0.7479161, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.82485354, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11645508, + "step": 9884, + "time_per_iteration": 2.5044338703155518 + }, + { + "auxiliary_loss_clip": 0.06419083, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01253549, + "epoch": 0.5943183526228769, + "flos": 28298665941120.0, + "grad_norm": 2.661403390475952, + "language_loss": 0.6670655, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.7439115, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11975098, + "step": 9885, + "time_per_iteration": 2.592233180999756 + }, + { + "auxiliary_loss_clip": 0.06417437, + "auxiliary_loss_mlp": 0.01268066, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256884, + "epoch": 0.5943784758755448, + "flos": 26621829233280.0, + "grad_norm": 2.23147400779812, + "language_loss": 0.76914746, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.84600246, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11181641, + "step": 9886, + "time_per_iteration": 2.5211451053619385 + }, + { + "auxiliary_loss_clip": 0.06318811, + "auxiliary_loss_mlp": 0.01252302, + "balance_loss_clip": 0.06259875, + "balance_loss_mlp": 0.01250785, + "epoch": 0.5944385991282128, + "flos": 64209859643520.0, + "grad_norm": 0.8085761446732002, + "language_loss": 0.64425516, + "learning_rate": 1.490988081420423e-06, + "loss": 0.71996629, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01515961, + "step": 9887, + "time_per_iteration": 4.4216148853302 + }, + { + "auxiliary_loss_clip": 0.06419201, + "auxiliary_loss_mlp": 0.01265936, + "balance_loss_clip": 0.06275857, + "balance_loss_mlp": 0.01254307, + "epoch": 0.5944987223808808, + "flos": 19577885351040.0, + "grad_norm": 1.7443994329425772, + "language_loss": 0.691764, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.76861531, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11633301, + "step": 9888, + "time_per_iteration": 2.558119058609009 + }, + { + "auxiliary_loss_clip": 0.06419526, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06276903, + "balance_loss_mlp": 0.01258773, + "epoch": 0.5945588456335488, + "flos": 26184856590720.0, + "grad_norm": 1.5028057851776446, + "language_loss": 0.7952224, + "learning_rate": 1.490234845687366e-06, + "loss": 0.87211674, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9889, + "time_per_iteration": 2.556455612182617 + }, + { + "auxiliary_loss_clip": 0.06416804, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06273508, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5946189688862168, + "flos": 20452333760640.0, + "grad_norm": 1.5171149074997012, + "language_loss": 0.70987219, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.7867161, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1050415, + "step": 9890, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.06420811, + "auxiliary_loss_mlp": 0.01269509, + "balance_loss_clip": 0.06275058, + "balance_loss_mlp": 0.01258041, + "epoch": 0.5946790921388847, + "flos": 13441568895360.0, + "grad_norm": 1.9815921383050485, + "language_loss": 0.697523, + "learning_rate": 1.489481687275691e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11468506, + "step": 9891, + "time_per_iteration": 2.474308729171753 + }, + { + "auxiliary_loss_clip": 0.06419806, + "auxiliary_loss_mlp": 0.01266103, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01255839, + "epoch": 0.5947392153915527, + "flos": 20418483911040.0, + "grad_norm": 1.7485359350265648, + "language_loss": 0.53498697, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.61184609, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10266113, + "step": 9892, + "time_per_iteration": 2.534221649169922 + }, + { + "auxiliary_loss_clip": 0.06313733, + "auxiliary_loss_mlp": 0.01253007, + "balance_loss_clip": 0.06254771, + "balance_loss_mlp": 0.01251455, + "epoch": 0.5947993386442206, + "flos": 65639181790080.0, + "grad_norm": 0.6531062006914405, + "language_loss": 0.54571462, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.621382, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01551056, + "step": 9893, + "time_per_iteration": 3.1853702068328857 + }, + { + "auxiliary_loss_clip": 0.064126, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01254841, + "epoch": 0.5948594618968887, + "flos": 23189429306880.0, + "grad_norm": 1.6806512476713673, + "language_loss": 0.75017619, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.82695538, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10473633, + "step": 9894, + "time_per_iteration": 4.046506643295288 + }, + { + "auxiliary_loss_clip": 0.06415449, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06273435, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5949195851495566, + "flos": 13631991298560.0, + "grad_norm": 1.844376504699444, + "language_loss": 0.77997828, + "learning_rate": 1.487975602873434e-06, + "loss": 0.8567856, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11444092, + "step": 9895, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.06421571, + "auxiliary_loss_mlp": 0.01264682, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252862, + "epoch": 0.5949797084022246, + "flos": 19756358547840.0, + "grad_norm": 2.034072439962686, + "language_loss": 0.79318964, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.8700521, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11816406, + "step": 9896, + "time_per_iteration": 2.496610164642334 + }, + { + "auxiliary_loss_clip": 0.06420637, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5950398316548925, + "flos": 25780685621760.0, + "grad_norm": 1.4418973411464253, + "language_loss": 0.8331461, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.91000593, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11645508, + "step": 9897, + "time_per_iteration": 2.6055963039398193 + }, + { + "auxiliary_loss_clip": 0.06422365, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06278124, + "balance_loss_mlp": 0.01254012, + "epoch": 0.5950999549075605, + "flos": 23045644500480.0, + "grad_norm": 2.157917564883112, + "language_loss": 0.71089602, + "learning_rate": 1.486846243389939e-06, + "loss": 0.78776848, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10882568, + "step": 9898, + "time_per_iteration": 3.95219087600708 + }, + { + "auxiliary_loss_clip": 0.06426959, + "auxiliary_loss_mlp": 0.01267336, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254897, + "epoch": 0.5951600781602284, + "flos": 32453553697920.0, + "grad_norm": 2.106705884146929, + "language_loss": 0.63699448, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.71393746, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12451172, + "step": 9899, + "time_per_iteration": 2.597721576690674 + }, + { + "auxiliary_loss_clip": 0.06419618, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01259999, + "epoch": 0.5952202014128964, + "flos": 23806887644160.0, + "grad_norm": 1.5164228353921223, + "language_loss": 0.72182071, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.79872268, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10571289, + "step": 9900, + "time_per_iteration": 2.5579535961151123 + }, + { + "auxiliary_loss_clip": 0.06414567, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01255484, + "epoch": 0.5952803246655644, + "flos": 22498778828160.0, + "grad_norm": 1.774545476213964, + "language_loss": 0.84691358, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.9237293, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11523438, + "step": 9901, + "time_per_iteration": 2.532650947570801 + }, + { + "auxiliary_loss_clip": 0.06311554, + "auxiliary_loss_mlp": 0.01252152, + "balance_loss_clip": 0.06252782, + "balance_loss_mlp": 0.01250599, + "epoch": 0.5953404479182324, + "flos": 51250810884480.0, + "grad_norm": 0.7741789718205083, + "language_loss": 0.58204901, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.65768605, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01550293, + "step": 9902, + "time_per_iteration": 2.995508909225464 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01268252, + "balance_loss_clip": 0.06274737, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5954005711709004, + "flos": 23119423620480.0, + "grad_norm": 1.8631652775155525, + "language_loss": 0.77643347, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.85333747, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11608887, + "step": 9903, + "time_per_iteration": 2.526265859603882 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01253691, + "epoch": 0.5954606944235683, + "flos": 35963464250880.0, + "grad_norm": 1.7611381352056217, + "language_loss": 0.78137469, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.85821557, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1138916, + "step": 9904, + "time_per_iteration": 4.04362940788269 + }, + { + "auxiliary_loss_clip": 0.0642558, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06275237, + "balance_loss_mlp": 0.01252619, + "epoch": 0.5955208176762363, + "flos": 30451188677760.0, + "grad_norm": 1.2800711014437993, + "language_loss": 0.72963494, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.80653274, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11566162, + "step": 9905, + "time_per_iteration": 2.630237340927124 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01267213, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255942, + "epoch": 0.5955809409289042, + "flos": 17645987214720.0, + "grad_norm": 2.1926975812717524, + "language_loss": 0.70104027, + "learning_rate": 1.483835475336295e-06, + "loss": 0.77788991, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11273193, + "step": 9906, + "time_per_iteration": 2.5136594772338867 + }, + { + "auxiliary_loss_clip": 0.06423035, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.06276789, + "balance_loss_mlp": 0.01254316, + "epoch": 0.5956410641815723, + "flos": 24286766376960.0, + "grad_norm": 1.7055783949352592, + "language_loss": 0.74976909, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.82666361, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12103271, + "step": 9907, + "time_per_iteration": 2.5186941623687744 + }, + { + "auxiliary_loss_clip": 0.06419441, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.5957011874342402, + "flos": 35742713869440.0, + "grad_norm": 1.9121613205115942, + "language_loss": 0.67437243, + "learning_rate": 1.483082978767595e-06, + "loss": 0.75124806, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11505127, + "step": 9908, + "time_per_iteration": 2.641977310180664 + }, + { + "auxiliary_loss_clip": 0.06417987, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.0627388, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5957613106869082, + "flos": 21250277792640.0, + "grad_norm": 1.9262426125407, + "language_loss": 0.7637223, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.84056735, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1105957, + "step": 9909, + "time_per_iteration": 2.4708259105682373 + }, + { + "auxiliary_loss_clip": 0.06309633, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06250934, + "balance_loss_mlp": 0.01251702, + "epoch": 0.5958214339395761, + "flos": 65959972346880.0, + "grad_norm": 0.8925366465224025, + "language_loss": 0.73392916, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.80955869, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01618958, + "step": 9910, + "time_per_iteration": 3.2132058143615723 + }, + { + "auxiliary_loss_clip": 0.06420797, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01253838, + "epoch": 0.5958815571922441, + "flos": 23224872384000.0, + "grad_norm": 1.906132958424511, + "language_loss": 0.69966662, + "learning_rate": 1.481954380961799e-06, + "loss": 0.77653486, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12194824, + "step": 9911, + "time_per_iteration": 2.5891547203063965 + }, + { + "auxiliary_loss_clip": 0.06430559, + "auxiliary_loss_mlp": 0.01269185, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01256471, + "epoch": 0.595941680444912, + "flos": 16543157702400.0, + "grad_norm": 1.8117496085568294, + "language_loss": 0.65995622, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.73695368, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12713623, + "step": 9912, + "time_per_iteration": 2.5106897354125977 + }, + { + "auxiliary_loss_clip": 0.06418723, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01257681, + "epoch": 0.59600180369758, + "flos": 27826334075520.0, + "grad_norm": 1.8937269812557305, + "language_loss": 0.73603946, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.81292516, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12145996, + "step": 9913, + "time_per_iteration": 2.5845842361450195 + }, + { + "auxiliary_loss_clip": 0.06422256, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254316, + "epoch": 0.596061926950248, + "flos": 29498349173760.0, + "grad_norm": 2.1687664822630692, + "language_loss": 0.79983938, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.87672126, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1161499, + "step": 9914, + "time_per_iteration": 2.677943229675293 + }, + { + "auxiliary_loss_clip": 0.06418366, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01256233, + "epoch": 0.596122050202916, + "flos": 16842424959360.0, + "grad_norm": 1.662988077903936, + "language_loss": 0.67750293, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.75436401, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1149292, + "step": 9915, + "time_per_iteration": 2.527804374694824 + }, + { + "auxiliary_loss_clip": 0.06422138, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274668, + "balance_loss_mlp": 0.01254888, + "epoch": 0.596182173455584, + "flos": 21003056720640.0, + "grad_norm": 1.4119869222981658, + "language_loss": 0.7862711, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.86315531, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11395264, + "step": 9916, + "time_per_iteration": 2.5146098136901855 + }, + { + "auxiliary_loss_clip": 0.06422624, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06273377, + "balance_loss_mlp": 0.01254808, + "epoch": 0.5962422967082519, + "flos": 16070364639360.0, + "grad_norm": 1.8279133386942186, + "language_loss": 0.83302379, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.90991473, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11657715, + "step": 9917, + "time_per_iteration": 2.5148332118988037 + }, + { + "auxiliary_loss_clip": 0.06418853, + "auxiliary_loss_mlp": 0.0126709, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5963024199609199, + "flos": 12171879976320.0, + "grad_norm": 1.6879177929284592, + "language_loss": 0.77521312, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.85207248, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10852051, + "step": 9918, + "time_per_iteration": 2.4897613525390625 + }, + { + "auxiliary_loss_clip": 0.06419399, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274245, + "balance_loss_mlp": 0.01256661, + "epoch": 0.5963625432135878, + "flos": 28081772847360.0, + "grad_norm": 1.5296515450402863, + "language_loss": 0.7930398, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10821533, + "step": 9919, + "time_per_iteration": 2.6023364067077637 + }, + { + "auxiliary_loss_clip": 0.06424099, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06277885, + "balance_loss_mlp": 0.01253434, + "epoch": 0.5964226664662559, + "flos": 19865664599040.0, + "grad_norm": 2.0582572283345537, + "language_loss": 0.77598941, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.85288125, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11645508, + "step": 9920, + "time_per_iteration": 2.499610424041748 + }, + { + "auxiliary_loss_clip": 0.06428593, + "auxiliary_loss_mlp": 0.01269926, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01258124, + "epoch": 0.5964827897189238, + "flos": 12937567386240.0, + "grad_norm": 2.9535163377991647, + "language_loss": 0.8317768, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.90876198, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11798096, + "step": 9921, + "time_per_iteration": 2.5134449005126953 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01268083, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01256389, + "epoch": 0.5965429129715918, + "flos": 18156738977280.0, + "grad_norm": 1.8928045831706461, + "language_loss": 0.80601788, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.88286257, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11694336, + "step": 9922, + "time_per_iteration": 2.4813597202301025 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01269772, + "balance_loss_clip": 0.06271716, + "balance_loss_mlp": 0.01258828, + "epoch": 0.5966030362242597, + "flos": 21769834233600.0, + "grad_norm": 3.055273537118157, + "language_loss": 0.7726593, + "learning_rate": 1.477441761580111e-06, + "loss": 0.84952813, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10949707, + "step": 9923, + "time_per_iteration": 2.5638489723205566 + }, + { + "auxiliary_loss_clip": 0.06424043, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06273048, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5966631594769277, + "flos": 18813204190080.0, + "grad_norm": 1.8922524994378742, + "language_loss": 0.76095831, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.83788568, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.13720703, + "step": 9924, + "time_per_iteration": 2.4999732971191406 + }, + { + "auxiliary_loss_clip": 0.06413831, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01256633, + "epoch": 0.5967232827295956, + "flos": 14069383211520.0, + "grad_norm": 1.7112851014893713, + "language_loss": 0.66830564, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.74512935, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11920166, + "step": 9925, + "time_per_iteration": 2.5139551162719727 + }, + { + "auxiliary_loss_clip": 0.06421202, + "auxiliary_loss_mlp": 0.0126999, + "balance_loss_clip": 0.06279947, + "balance_loss_mlp": 0.01258409, + "epoch": 0.5967834059822636, + "flos": 17243954524800.0, + "grad_norm": 1.861204364539265, + "language_loss": 0.72200316, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.79891503, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11584473, + "step": 9926, + "time_per_iteration": 3.9693188667297363 + }, + { + "auxiliary_loss_clip": 0.06422362, + "auxiliary_loss_mlp": 0.01270656, + "balance_loss_clip": 0.06274919, + "balance_loss_mlp": 0.01258556, + "epoch": 0.5968435292349316, + "flos": 42529751533440.0, + "grad_norm": 1.9299553445847866, + "language_loss": 0.70147216, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.77840233, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12103271, + "step": 9927, + "time_per_iteration": 2.7299752235412598 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06277983, + "balance_loss_mlp": 0.0125467, + "epoch": 0.5969036524875996, + "flos": 37639546272000.0, + "grad_norm": 1.5668113041571725, + "language_loss": 0.63611758, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.71307898, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12213135, + "step": 9928, + "time_per_iteration": 2.7166144847869873 + }, + { + "auxiliary_loss_clip": 0.06418041, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06274209, + "balance_loss_mlp": 0.01254454, + "epoch": 0.5969637757402676, + "flos": 23154992478720.0, + "grad_norm": 2.1979213221977596, + "language_loss": 0.69668317, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.77351892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1105957, + "step": 9929, + "time_per_iteration": 2.51379656791687 + }, + { + "auxiliary_loss_clip": 0.0641327, + "auxiliary_loss_mlp": 0.01270831, + "balance_loss_clip": 0.06274718, + "balance_loss_mlp": 0.01259697, + "epoch": 0.5970238989929355, + "flos": 24027176828160.0, + "grad_norm": 1.690473988948275, + "language_loss": 0.7685796, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.8454206, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11138916, + "step": 9930, + "time_per_iteration": 2.590068817138672 + }, + { + "auxiliary_loss_clip": 0.06427103, + "auxiliary_loss_mlp": 0.01271306, + "balance_loss_clip": 0.06277532, + "balance_loss_mlp": 0.01259206, + "epoch": 0.5970840222456035, + "flos": 19432884660480.0, + "grad_norm": 1.4319660868037594, + "language_loss": 0.69073558, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.76771963, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12097168, + "step": 9931, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.06314774, + "auxiliary_loss_mlp": 0.01252398, + "balance_loss_clip": 0.06255934, + "balance_loss_mlp": 0.01250752, + "epoch": 0.5971441454982714, + "flos": 62993615230080.0, + "grad_norm": 0.8560146868595252, + "language_loss": 0.64260876, + "learning_rate": 1.474059168257065e-06, + "loss": 0.71828043, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01649475, + "step": 9932, + "time_per_iteration": 3.0806198120117188 + }, + { + "auxiliary_loss_clip": 0.06415366, + "auxiliary_loss_mlp": 0.01270842, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01259976, + "epoch": 0.5972042687509395, + "flos": 20272393117440.0, + "grad_norm": 1.7768464871728415, + "language_loss": 0.74403048, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.82089257, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10864258, + "step": 9933, + "time_per_iteration": 3.9164891242980957 + }, + { + "auxiliary_loss_clip": 0.06316046, + "auxiliary_loss_mlp": 0.01258623, + "balance_loss_clip": 0.06257492, + "balance_loss_mlp": 0.01256835, + "epoch": 0.5972643920036074, + "flos": 71675625778560.0, + "grad_norm": 0.666650666050939, + "language_loss": 0.51957405, + "learning_rate": 1.473307699867203e-06, + "loss": 0.59532076, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01785278, + "step": 9934, + "time_per_iteration": 3.263599157333374 + }, + { + "auxiliary_loss_clip": 0.06320157, + "auxiliary_loss_mlp": 0.01253316, + "balance_loss_clip": 0.06261201, + "balance_loss_mlp": 0.01251523, + "epoch": 0.5973245152562754, + "flos": 56910225427200.0, + "grad_norm": 0.8129555240105609, + "language_loss": 0.54121673, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.61695147, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.0178833, + "step": 9935, + "time_per_iteration": 3.13610577583313 + }, + { + "auxiliary_loss_clip": 0.0641949, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06273362, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5973846385089433, + "flos": 24170206947840.0, + "grad_norm": 1.6283043946182527, + "language_loss": 0.65934885, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.7362048, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11883545, + "step": 9936, + "time_per_iteration": 2.5317225456237793 + }, + { + "auxiliary_loss_clip": 0.06426519, + "auxiliary_loss_mlp": 0.01266905, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01256027, + "epoch": 0.5974447617616113, + "flos": 17675476652160.0, + "grad_norm": 1.977673103112211, + "language_loss": 0.67786443, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10882568, + "step": 9937, + "time_per_iteration": 2.51056170463562 + }, + { + "auxiliary_loss_clip": 0.0642201, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06274251, + "balance_loss_mlp": 0.01260073, + "epoch": 0.5975048850142792, + "flos": 22899008655360.0, + "grad_norm": 2.0510739773646853, + "language_loss": 0.77639204, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.85333794, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.12512207, + "step": 9938, + "time_per_iteration": 3.988826274871826 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266797, + "balance_loss_clip": 0.06278642, + "balance_loss_mlp": 0.01255145, + "epoch": 0.5975650082669473, + "flos": 24360042372480.0, + "grad_norm": 1.4729050693859964, + "language_loss": 0.76065636, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.83757758, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11645508, + "step": 9939, + "time_per_iteration": 2.556417226791382 + }, + { + "auxiliary_loss_clip": 0.06427339, + "auxiliary_loss_mlp": 0.01268522, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5976251315196152, + "flos": 20929696871040.0, + "grad_norm": 2.2639919876209498, + "language_loss": 0.68839771, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7653563, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13323975, + "step": 9940, + "time_per_iteration": 2.5342793464660645 + }, + { + "auxiliary_loss_clip": 0.06417148, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01259602, + "epoch": 0.5976852547722832, + "flos": 35853193877760.0, + "grad_norm": 1.2345186889810322, + "language_loss": 0.69966424, + "learning_rate": 1.470678190375664e-06, + "loss": 0.77654147, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 9941, + "time_per_iteration": 2.6775453090667725 + }, + { + "auxiliary_loss_clip": 0.06416304, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06272396, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5977453780249512, + "flos": 12860266394880.0, + "grad_norm": 1.7893879951427467, + "language_loss": 0.77519101, + "learning_rate": 1.470302626336386e-06, + "loss": 0.85200953, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11224365, + "step": 9942, + "time_per_iteration": 2.5630502700805664 + }, + { + "auxiliary_loss_clip": 0.06422595, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.0125478, + "epoch": 0.5978055012776191, + "flos": 20965391510400.0, + "grad_norm": 1.999196380936964, + "language_loss": 0.76118851, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.83808935, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12713623, + "step": 9943, + "time_per_iteration": 3.9001221656799316 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01266022, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01255728, + "epoch": 0.5978656245302871, + "flos": 34066506067200.0, + "grad_norm": 1.9908445339246823, + "language_loss": 0.62211335, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.69895315, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10296631, + "step": 9944, + "time_per_iteration": 2.6546871662139893 + }, + { + "auxiliary_loss_clip": 0.06420632, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06276795, + "balance_loss_mlp": 0.01255333, + "epoch": 0.597925747782955, + "flos": 37381508023680.0, + "grad_norm": 1.6358533401507223, + "language_loss": 0.72854936, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11450195, + "step": 9945, + "time_per_iteration": 2.631753444671631 + }, + { + "auxiliary_loss_clip": 0.06419382, + "auxiliary_loss_mlp": 0.01270411, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01258997, + "epoch": 0.5979858710356231, + "flos": 25381923240960.0, + "grad_norm": 1.7624660559370904, + "language_loss": 0.67425656, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.75115454, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9946, + "time_per_iteration": 2.5964295864105225 + }, + { + "auxiliary_loss_clip": 0.06427635, + "auxiliary_loss_mlp": 0.01269885, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01257678, + "epoch": 0.598045994288291, + "flos": 13703422504320.0, + "grad_norm": 1.825350503307894, + "language_loss": 0.88689518, + "learning_rate": 1.468425107717461e-06, + "loss": 0.96387035, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12194824, + "step": 9947, + "time_per_iteration": 2.47194766998291 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01263948, + "balance_loss_clip": 0.06274778, + "balance_loss_mlp": 0.01253409, + "epoch": 0.598106117540959, + "flos": 21987859357440.0, + "grad_norm": 1.5868690486029033, + "language_loss": 0.71892309, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.79568821, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10540771, + "step": 9948, + "time_per_iteration": 2.519465446472168 + }, + { + "auxiliary_loss_clip": 0.06424625, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06277405, + "balance_loss_mlp": 0.01255015, + "epoch": 0.5981662407936269, + "flos": 20565790588800.0, + "grad_norm": 1.9625714193598658, + "language_loss": 0.89521587, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.97213024, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9949, + "time_per_iteration": 2.512617588043213 + }, + { + "auxiliary_loss_clip": 0.0641937, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06276002, + "balance_loss_mlp": 0.0125524, + "epoch": 0.5982263640462949, + "flos": 14069005868160.0, + "grad_norm": 2.2044341220338484, + "language_loss": 0.70866632, + "learning_rate": 1.467298838320673e-06, + "loss": 0.78552365, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11126709, + "step": 9950, + "time_per_iteration": 2.4983901977539062 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276861, + "balance_loss_mlp": 0.01254103, + "epoch": 0.5982864872989628, + "flos": 17712135613440.0, + "grad_norm": 1.7147951868971159, + "language_loss": 0.7865026, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.86338896, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 9951, + "time_per_iteration": 2.5179500579833984 + }, + { + "auxiliary_loss_clip": 0.06422336, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256215, + "epoch": 0.5983466105516309, + "flos": 16770574483200.0, + "grad_norm": 2.724642744329358, + "language_loss": 0.73936313, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.81627548, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12689209, + "step": 9952, + "time_per_iteration": 2.5671274662017822 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01266742, + "balance_loss_clip": 0.06275067, + "balance_loss_mlp": 0.01254243, + "epoch": 0.5984067338042988, + "flos": 20048078937600.0, + "grad_norm": 1.9086154248374307, + "language_loss": 0.79033399, + "learning_rate": 1.466172750724613e-06, + "loss": 0.86721003, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.12512207, + "step": 9953, + "time_per_iteration": 2.5575039386749268 + }, + { + "auxiliary_loss_clip": 0.06419245, + "auxiliary_loss_mlp": 0.01268437, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5984668570569668, + "flos": 26326586972160.0, + "grad_norm": 1.3586799739820394, + "language_loss": 0.69871485, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.77559167, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1116333, + "step": 9954, + "time_per_iteration": 2.5664639472961426 + }, + { + "auxiliary_loss_clip": 0.06421678, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01253953, + "epoch": 0.5985269803096348, + "flos": 20599808146560.0, + "grad_norm": 3.504460387705041, + "language_loss": 0.73099947, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.80786395, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10803223, + "step": 9955, + "time_per_iteration": 2.5450916290283203 + }, + { + "auxiliary_loss_clip": 0.06417805, + "auxiliary_loss_mlp": 0.01264034, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252632, + "epoch": 0.5985871035623027, + "flos": 26871859416960.0, + "grad_norm": 1.7558609344018261, + "language_loss": 0.68993962, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.76675797, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9956, + "time_per_iteration": 2.596081256866455 + }, + { + "auxiliary_loss_clip": 0.06423829, + "auxiliary_loss_mlp": 0.01264045, + "balance_loss_clip": 0.06278121, + "balance_loss_mlp": 0.01253346, + "epoch": 0.5986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.031153762409854, + "language_loss": 0.74002242, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.81690115, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10699463, + "step": 9957, + "time_per_iteration": 2.5518100261688232 + }, + { + "auxiliary_loss_clip": 0.06412163, + "auxiliary_loss_mlp": 0.01266872, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01256071, + "epoch": 0.5987073500676386, + "flos": 21800371847040.0, + "grad_norm": 1.7255020808995434, + "language_loss": 0.84429491, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.92108524, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10803223, + "step": 9958, + "time_per_iteration": 2.5053975582122803 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267847, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256594, + "epoch": 0.5987674733203067, + "flos": 24320909715840.0, + "grad_norm": 1.676255529467866, + "language_loss": 0.66404957, + "learning_rate": 1.463921122471864e-06, + "loss": 0.74093723, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11248779, + "step": 9959, + "time_per_iteration": 2.577558994293213 + }, + { + "auxiliary_loss_clip": 0.06423216, + "auxiliary_loss_mlp": 0.01263705, + "balance_loss_clip": 0.06278974, + "balance_loss_mlp": 0.01253418, + "epoch": 0.5988275965729746, + "flos": 21325859775360.0, + "grad_norm": 1.5343309289681366, + "language_loss": 0.83860743, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.91547662, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10296631, + "step": 9960, + "time_per_iteration": 2.5171096324920654 + }, + { + "auxiliary_loss_clip": 0.06416292, + "auxiliary_loss_mlp": 0.01266192, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5988877198256426, + "flos": 25124891241600.0, + "grad_norm": 1.3977520489587403, + "language_loss": 0.79645187, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.87327671, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11260986, + "step": 9961, + "time_per_iteration": 2.5664830207824707 + }, + { + "auxiliary_loss_clip": 0.06418522, + "auxiliary_loss_mlp": 0.01263845, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01253337, + "epoch": 0.5989478430783105, + "flos": 26435767242240.0, + "grad_norm": 1.8145848373023497, + "language_loss": 0.67511421, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.75193793, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10516357, + "step": 9962, + "time_per_iteration": 2.5658552646636963 + }, + { + "auxiliary_loss_clip": 0.06419411, + "auxiliary_loss_mlp": 0.01269677, + "balance_loss_clip": 0.06275185, + "balance_loss_mlp": 0.01258698, + "epoch": 0.5990079663309785, + "flos": 25786010355840.0, + "grad_norm": 1.2715525883777674, + "language_loss": 0.74696618, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.82385707, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10980225, + "step": 9963, + "time_per_iteration": 2.5959842205047607 + }, + { + "auxiliary_loss_clip": 0.06414087, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273367, + "balance_loss_mlp": 0.01255494, + "epoch": 0.5990680895836464, + "flos": 36840889480320.0, + "grad_norm": 1.7000475586235915, + "language_loss": 0.68318057, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.75998235, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10601807, + "step": 9964, + "time_per_iteration": 2.652066230773926 + }, + { + "auxiliary_loss_clip": 0.06415234, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01256219, + "epoch": 0.5991282128363145, + "flos": 24140340167040.0, + "grad_norm": 1.9446201927807645, + "language_loss": 0.77307773, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.84989786, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10559082, + "step": 9965, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.0641766, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254076, + "epoch": 0.5991883360889824, + "flos": 10308310444800.0, + "grad_norm": 2.43508720605834, + "language_loss": 0.77253437, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.8493613, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10955811, + "step": 9966, + "time_per_iteration": 3.8983960151672363 + }, + { + "auxiliary_loss_clip": 0.06418956, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06277221, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5992484593416504, + "flos": 23957967755520.0, + "grad_norm": 1.382537362814459, + "language_loss": 0.73829538, + "learning_rate": 1.460920090376422e-06, + "loss": 0.81513047, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10095215, + "step": 9967, + "time_per_iteration": 2.55789852142334 + }, + { + "auxiliary_loss_clip": 0.06430869, + "auxiliary_loss_mlp": 0.01269853, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5993085825943184, + "flos": 11948320483200.0, + "grad_norm": 2.02451624384261, + "language_loss": 0.69043863, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.76744592, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11669922, + "step": 9968, + "time_per_iteration": 2.4782519340515137 + }, + { + "auxiliary_loss_clip": 0.06417669, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06270653, + "balance_loss_mlp": 0.01253926, + "epoch": 0.5993687058469863, + "flos": 19032990249600.0, + "grad_norm": 1.5128271497944086, + "language_loss": 0.79284239, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.86967438, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11608887, + "step": 9969, + "time_per_iteration": 2.5151612758636475 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5994288290996543, + "flos": 14288204949120.0, + "grad_norm": 1.5374697799261579, + "language_loss": 0.81015587, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.88697076, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11383057, + "step": 9970, + "time_per_iteration": 2.5037295818328857 + }, + { + "auxiliary_loss_clip": 0.06425726, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06278643, + "balance_loss_mlp": 0.01253136, + "epoch": 0.5994889523523222, + "flos": 19212385841280.0, + "grad_norm": 1.7784771847806544, + "language_loss": 0.6253432, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.70225984, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.12805176, + "step": 9971, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01252121, + "epoch": 0.5995490756049903, + "flos": 28044401126400.0, + "grad_norm": 1.5809560666799003, + "language_loss": 0.79321986, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.86999381, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 9972, + "time_per_iteration": 2.5908236503601074 + }, + { + "auxiliary_loss_clip": 0.06426332, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06275315, + "balance_loss_mlp": 0.01256595, + "epoch": 0.5996091988576582, + "flos": 29059531741440.0, + "grad_norm": 2.0347749890566957, + "language_loss": 0.76122165, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.83816767, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11663818, + "step": 9973, + "time_per_iteration": 4.03744912147522 + }, + { + "auxiliary_loss_clip": 0.06415765, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5996693221103262, + "flos": 20820306965760.0, + "grad_norm": 8.14230844682113, + "language_loss": 0.65456331, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.73141098, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10961914, + "step": 9974, + "time_per_iteration": 2.545727491378784 + }, + { + "auxiliary_loss_clip": 0.06421987, + "auxiliary_loss_mlp": 0.01267073, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0125607, + "epoch": 0.5997294453629941, + "flos": 23775679198080.0, + "grad_norm": 1.6348808694128185, + "language_loss": 0.74560261, + "learning_rate": 1.457920366566428e-06, + "loss": 0.8224932, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11010742, + "step": 9975, + "time_per_iteration": 2.515960931777954 + }, + { + "auxiliary_loss_clip": 0.06416074, + "auxiliary_loss_mlp": 0.01267839, + "balance_loss_clip": 0.06272042, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5997895686156621, + "flos": 20966397759360.0, + "grad_norm": 1.627086760059136, + "language_loss": 0.77381539, + "learning_rate": 1.457545493441611e-06, + "loss": 0.85065448, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10864258, + "step": 9976, + "time_per_iteration": 2.5143842697143555 + }, + { + "auxiliary_loss_clip": 0.06419265, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06276691, + "balance_loss_mlp": 0.01255162, + "epoch": 0.59984969186833, + "flos": 28372864331520.0, + "grad_norm": 2.2336999868815837, + "language_loss": 0.75166976, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.82852209, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 9977, + "time_per_iteration": 2.5434179306030273 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01258358, + "epoch": 0.5999098151209981, + "flos": 22572641802240.0, + "grad_norm": 1.5140714638849335, + "language_loss": 0.69135988, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.76823664, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11499023, + "step": 9978, + "time_per_iteration": 3.9952354431152344 + }, + { + "auxiliary_loss_clip": 0.06421594, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06274537, + "balance_loss_mlp": 0.01254977, + "epoch": 0.599969938373666, + "flos": 18774365022720.0, + "grad_norm": 1.8838130799328623, + "language_loss": 0.81737733, + "learning_rate": 1.456420997543594e-06, + "loss": 0.89425546, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11236572, + "step": 9979, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06274675, + "balance_loss_mlp": 0.01257239, + "epoch": 0.600030061626334, + "flos": 11331910321920.0, + "grad_norm": 1.7106471218945785, + "language_loss": 0.70199746, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.77879798, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10314941, + "step": 9980, + "time_per_iteration": 2.4757728576660156 + }, + { + "auxiliary_loss_clip": 0.06423149, + "auxiliary_loss_mlp": 0.01269991, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01258463, + "epoch": 0.600090184879002, + "flos": 16583799732480.0, + "grad_norm": 2.417469697653489, + "language_loss": 0.690139, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.76707041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11523438, + "step": 9981, + "time_per_iteration": 2.4791438579559326 + }, + { + "auxiliary_loss_clip": 0.0641709, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.0627474, + "balance_loss_mlp": 0.01255342, + "epoch": 0.6001503081316699, + "flos": 23624641013760.0, + "grad_norm": 3.5503488009813275, + "language_loss": 0.78682542, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.86365318, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10345459, + "step": 9982, + "time_per_iteration": 2.517265796661377 + }, + { + "auxiliary_loss_clip": 0.06418465, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06276916, + "balance_loss_mlp": 0.01258852, + "epoch": 0.6002104313843379, + "flos": 20673922682880.0, + "grad_norm": 1.4834511581102687, + "language_loss": 0.72993171, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.80681169, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10681152, + "step": 9983, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06419442, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.0627455, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6002705546370058, + "flos": 22461742523520.0, + "grad_norm": 1.817313812044092, + "language_loss": 0.77973288, + "learning_rate": 1.454547250154447e-06, + "loss": 0.85658008, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10998535, + "step": 9984, + "time_per_iteration": 3.889902353286743 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 215057825, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.386747939086664e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/training_args.bin b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992 diff --git a/sft/revise_Full_smoe_tcmoe/checkpoint-9984/zero_to_fp32.py b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/checkpoint-9984/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/revise_Full_smoe_tcmoe/config.json b/sft/revise_Full_smoe_tcmoe/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8a73f2c80008d5cc51a8db0d9cdd11e47fe6e845 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/config.json @@ -0,0 +1,203 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_tcmoe", + "moe_relu_l1_reg_coeff_multiplier": 1.2, + "mp_pixel_shuffle_factor": 1, + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "std_gate": 0.02, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/revise_Full_smoe_tcmoe/generation_config.json b/sft/revise_Full_smoe_tcmoe/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/revise_Full_smoe_tcmoe/model-00001-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..798443627f1263899c866256fd59ce2fbe0df56e --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfb15cbd71963baf99565fd4d9fd14e98800e52fdf006178b86ce47ea734594 +size 4972489328 diff --git a/sft/revise_Full_smoe_tcmoe/model-00002-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73d57713990bf2eb68dce8f3c9b432b6de492551 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362553433cdf9d93f9a4467259536767eb6e05ac7c09ef282b394d2a64ba8130 +size 4985902928 diff --git a/sft/revise_Full_smoe_tcmoe/model-00003-of-00003.safetensors b/sft/revise_Full_smoe_tcmoe/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e75f96a46fc6b9dddc4cdc76f9b3a70117d39c36 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ca63b9e9d9ded2c4a3a328f34c7a554c0b0ad5d0ef0a874d9fb47cf24df3bd +size 248971200 diff --git a/sft/revise_Full_smoe_tcmoe/model.safetensors.index.json b/sft/revise_Full_smoe_tcmoe/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3197289c4553bb4cba30dd31a8c232b7496a92b5 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10207220352 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/revise_Full_smoe_tcmoe/special_tokens_map.json b/sft/revise_Full_smoe_tcmoe/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/revise_Full_smoe_tcmoe/tokenizer.model b/sft/revise_Full_smoe_tcmoe/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/revise_Full_smoe_tcmoe/tokenizer_config.json b/sft/revise_Full_smoe_tcmoe/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/revise_Full_smoe_tcmoe/trainer_state.json b/sft/revise_Full_smoe_tcmoe/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..545069f086e40ff9237ef93f552cfc1e646a1df4 --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/trainer_state.json @@ -0,0 +1,282787 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.20073968, + "auxiliary_loss_mlp": 1.0941844, + "balance_loss_clip": 0.12873733, + "balance_loss_mlp": 0.03705556, + "epoch": 6.012325266796934e-05, + "flos": 24462952254720.0, + "grad_norm": 941654.8300602314, + "language_loss": 24.32558632, + "learning_rate": 0.0, + "loss": 16.92002487, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 72.03125, + "router_z_loss_mlp": 1058.5, + "step": 1, + "time_per_iteration": 18.343486785888672 + }, + { + "auxiliary_loss_clip": 0.13316599, + "auxiliary_loss_mlp": 0.71558112, + "balance_loss_clip": 0.08576315, + "balance_loss_mlp": 0.02466314, + "epoch": 0.00012024650533593868, + "flos": 20231457598080.0, + "grad_norm": 271164.48776572174, + "language_loss": 15.90828419, + "learning_rate": 4.4628432569317594e-07, + "loss": 16.75703049, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 47.40625, + "router_z_loss_mlp": 691.5, + "step": 2, + "time_per_iteration": 2.4823946952819824 + }, + { + "auxiliary_loss_clip": 0.13345747, + "auxiliary_loss_mlp": 0.73460984, + "balance_loss_clip": 0.08591475, + "balance_loss_mlp": 0.02464893, + "epoch": 0.000180369758003908, + "flos": 22316532197760.0, + "grad_norm": 30890.300344628693, + "language_loss": 15.82156086, + "learning_rate": 7.073439208833112e-07, + "loss": 16.68962669, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 47.46875, + "router_z_loss_mlp": 711.0, + "step": 3, + "time_per_iteration": 2.4773216247558594 + }, + { + "auxiliary_loss_clip": 0.13399127, + "auxiliary_loss_mlp": 0.72687411, + "balance_loss_clip": 0.08587996, + "balance_loss_mlp": 0.02472562, + "epoch": 0.00024049301067187735, + "flos": 22420471587840.0, + "grad_norm": 3825.373736974443, + "language_loss": 15.7262888, + "learning_rate": 8.925686513863519e-07, + "loss": 16.58715439, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 48.15625, + "router_z_loss_mlp": 703.0, + "step": 4, + "time_per_iteration": 2.492133378982544 + }, + { + "auxiliary_loss_clip": 0.13353133, + "auxiliary_loss_mlp": 0.72775936, + "balance_loss_clip": 0.08579096, + "balance_loss_mlp": 0.02463434, + "epoch": 0.0003006162633398467, + "flos": 21403286547840.0, + "grad_norm": 4441.394942298188, + "language_loss": 15.57899952, + "learning_rate": 1.0362401141348472e-06, + "loss": 16.44029045, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 47.65625, + "router_z_loss_mlp": 704.0, + "step": 5, + "time_per_iteration": 2.7607173919677734 + }, + { + "auxiliary_loss_clip": 0.13327441, + "auxiliary_loss_mlp": 0.71557182, + "balance_loss_clip": 0.08570103, + "balance_loss_mlp": 0.02465384, + "epoch": 0.000360739516007816, + "flos": 21658725319680.0, + "grad_norm": 2540.715684092784, + "language_loss": 14.90827179, + "learning_rate": 1.153628246576487e-06, + "loss": 15.75711823, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 47.5625, + "router_z_loss_mlp": 691.5, + "step": 6, + "time_per_iteration": 2.6497979164123535 + }, + { + "auxiliary_loss_clip": 0.13351092, + "auxiliary_loss_mlp": 0.7340821, + "balance_loss_clip": 0.08562777, + "balance_loss_mlp": 0.02460942, + "epoch": 0.0004208627686757854, + "flos": 27166682407680.0, + "grad_norm": 2502.417206046203, + "language_loss": 14.593853, + "learning_rate": 1.2528784983718962e-06, + "loss": 15.46144581, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 47.875, + "router_z_loss_mlp": 710.5, + "step": 7, + "time_per_iteration": 2.7325549125671387 + }, + { + "auxiliary_loss_clip": 0.13360947, + "auxiliary_loss_mlp": 0.73910165, + "balance_loss_clip": 0.08574936, + "balance_loss_mlp": 0.02474618, + "epoch": 0.0004809860213437547, + "flos": 31326727190400.0, + "grad_norm": 4081.02679202092, + "language_loss": 14.47960091, + "learning_rate": 1.338852977079528e-06, + "loss": 15.35231113, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 715.5, + "step": 8, + "time_per_iteration": 2.7674574851989746 + }, + { + "auxiliary_loss_clip": 0.13345738, + "auxiliary_loss_mlp": 0.74048162, + "balance_loss_clip": 0.08564517, + "balance_loss_mlp": 0.02466127, + "epoch": 0.000541109274011724, + "flos": 32168541634560.0, + "grad_norm": 2607.7195165159947, + "language_loss": 13.74505424, + "learning_rate": 1.4146878417666224e-06, + "loss": 14.61899281, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 716.5, + "step": 9, + "time_per_iteration": 2.8135807514190674 + }, + { + "auxiliary_loss_clip": 0.13289651, + "auxiliary_loss_mlp": 0.7478379, + "balance_loss_clip": 0.08548209, + "balance_loss_mlp": 0.02469334, + "epoch": 0.0006012325266796934, + "flos": 18922845657600.0, + "grad_norm": 8226.203152944285, + "language_loss": 12.47718525, + "learning_rate": 1.4825244398280232e-06, + "loss": 13.35791969, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 47.375, + "router_z_loss_mlp": 724.5, + "step": 10, + "time_per_iteration": 2.665703296661377 + }, + { + "auxiliary_loss_clip": 0.1330242, + "auxiliary_loss_mlp": 0.74298382, + "balance_loss_clip": 0.08549603, + "balance_loss_mlp": 0.02472211, + "epoch": 0.0006613557793476627, + "flos": 20780755038720.0, + "grad_norm": 29924.608712817644, + "language_loss": 12.23305321, + "learning_rate": 1.5438901072051983e-06, + "loss": 13.10906219, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 719.0, + "step": 11, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.133246, + "auxiliary_loss_mlp": 0.74782056, + "balance_loss_clip": 0.08560382, + "balance_loss_mlp": 0.02467602, + "epoch": 0.000721479032015632, + "flos": 16587321603840.0, + "grad_norm": 24119.088684995622, + "language_loss": 11.84583473, + "learning_rate": 1.5999125722696629e-06, + "loss": 12.72690105, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 723.5, + "step": 12, + "time_per_iteration": 2.707231044769287 + }, + { + "auxiliary_loss_clip": 0.13276552, + "auxiliary_loss_mlp": 0.74238944, + "balance_loss_clip": 0.08559544, + "balance_loss_mlp": 0.02461605, + "epoch": 0.0007816022846836014, + "flos": 23812254305280.0, + "grad_norm": 118556.26638855682, + "language_loss": 11.36912918, + "learning_rate": 1.6514482443788434e-06, + "loss": 12.24428368, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 718.0, + "step": 13, + "time_per_iteration": 2.696007251739502 + }, + { + "auxiliary_loss_clip": 0.13292459, + "auxiliary_loss_mlp": 0.74095768, + "balance_loss_clip": 0.0856985, + "balance_loss_mlp": 0.02464909, + "epoch": 0.0008417255373515708, + "flos": 19178284429440.0, + "grad_norm": 181106.81391623587, + "language_loss": 10.94849205, + "learning_rate": 1.6991628240650723e-06, + "loss": 11.82237434, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 47.1875, + "router_z_loss_mlp": 716.5, + "step": 14, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.13372461, + "auxiliary_loss_mlp": 0.75321233, + "balance_loss_clip": 0.08592231, + "balance_loss_mlp": 0.02469672, + "epoch": 0.00090184879001954, + "flos": 26402714006400.0, + "grad_norm": 8872.944602873076, + "language_loss": 11.40745831, + "learning_rate": 1.7435840350181584e-06, + "loss": 12.29439545, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 47.78125, + "router_z_loss_mlp": 729.5, + "step": 15, + "time_per_iteration": 2.716722249984741 + }, + { + "auxiliary_loss_clip": 0.13287091, + "auxiliary_loss_mlp": 0.73999238, + "balance_loss_clip": 0.0855229, + "balance_loss_mlp": 0.02466036, + "epoch": 0.0009619720426875094, + "flos": 24686157663360.0, + "grad_norm": 5195.838129438997, + "language_loss": 10.71900749, + "learning_rate": 1.7851373027727038e-06, + "loss": 11.59187126, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 716.5, + "step": 16, + "time_per_iteration": 2.744054079055786 + }, + { + "auxiliary_loss_clip": 0.13309729, + "auxiliary_loss_mlp": 0.76006317, + "balance_loss_clip": 0.08562544, + "balance_loss_mlp": 0.0247116, + "epoch": 0.0010220952953554788, + "flos": 18630454435200.0, + "grad_norm": 4421.362455936007, + "language_loss": 10.42590714, + "learning_rate": 1.8241705979033208e-06, + "loss": 11.319067, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 47.5, + "router_z_loss_mlp": 736.0, + "step": 17, + "time_per_iteration": 4.191499471664429 + }, + { + "auxiliary_loss_clip": 0.13315202, + "auxiliary_loss_mlp": 0.7600373, + "balance_loss_clip": 0.08556177, + "balance_loss_mlp": 0.02468574, + "epoch": 0.001082218548023448, + "flos": 26150042419200.0, + "grad_norm": 7888.125072686045, + "language_loss": 9.94283867, + "learning_rate": 1.860972167459798e-06, + "loss": 10.83602905, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 47.625, + "router_z_loss_mlp": 735.5, + "step": 18, + "time_per_iteration": 2.7808027267456055 + }, + { + "auxiliary_loss_clip": 0.13318592, + "auxiliary_loss_mlp": 0.73953104, + "balance_loss_clip": 0.08563764, + "balance_loss_mlp": 0.02468731, + "epoch": 0.0011423418006914173, + "flos": 19615885977600.0, + "grad_norm": 21999.592558043798, + "language_loss": 8.84625435, + "learning_rate": 1.89578346593066e-06, + "loss": 9.71897125, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 47.53125, + "router_z_loss_mlp": 716.0, + "step": 19, + "time_per_iteration": 4.131728172302246 + }, + { + "auxiliary_loss_clip": 0.13303626, + "auxiliary_loss_mlp": 0.74244332, + "balance_loss_clip": 0.08565694, + "balance_loss_mlp": 0.02466989, + "epoch": 0.0012024650533593868, + "flos": 17901258278400.0, + "grad_norm": 4121.169450537968, + "language_loss": 8.27947521, + "learning_rate": 1.928808765521199e-06, + "loss": 9.15495491, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 47.34375, + "router_z_loss_mlp": 718.5, + "step": 20, + "time_per_iteration": 2.708914279937744 + }, + { + "auxiliary_loss_clip": 0.13338368, + "auxiliary_loss_mlp": 0.76394671, + "balance_loss_clip": 0.08570746, + "balance_loss_mlp": 0.02468888, + "epoch": 0.001262588306027356, + "flos": 21258495492480.0, + "grad_norm": 4514.811048777073, + "language_loss": 8.72282791, + "learning_rate": 1.9602224192552076e-06, + "loss": 9.62015915, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 47.6875, + "router_z_loss_mlp": 740.0, + "step": 21, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.13281943, + "auxiliary_loss_mlp": 0.75118458, + "balance_loss_clip": 0.08552284, + "balance_loss_mlp": 0.02462207, + "epoch": 0.0013227115586953253, + "flos": 26111245178880.0, + "grad_norm": 4471.445911682346, + "language_loss": 8.71503925, + "learning_rate": 1.9901744328983746e-06, + "loss": 9.5990448, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 47.28125, + "router_z_loss_mlp": 727.5, + "step": 22, + "time_per_iteration": 2.734961748123169 + }, + { + "auxiliary_loss_clip": 0.13285899, + "auxiliary_loss_mlp": 0.73805398, + "balance_loss_clip": 0.08560154, + "balance_loss_mlp": 0.02467511, + "epoch": 0.0013828348113632948, + "flos": 23958177390720.0, + "grad_norm": 2111.5818511880134, + "language_loss": 8.18912506, + "learning_rate": 2.018794797290208e-06, + "loss": 9.06003761, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 47.3125, + "router_z_loss_mlp": 714.5, + "step": 23, + "time_per_iteration": 2.756584882736206 + }, + { + "auxiliary_loss_clip": 0.13278747, + "auxiliary_loss_mlp": 0.74887347, + "balance_loss_clip": 0.08537573, + "balance_loss_mlp": 0.0247524, + "epoch": 0.001442958064031264, + "flos": 15965125511040.0, + "grad_norm": 1807.1551511559412, + "language_loss": 8.28752899, + "learning_rate": 2.046196897962839e-06, + "loss": 9.16918945, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 47.4375, + "router_z_loss_mlp": 724.5, + "step": 24, + "time_per_iteration": 2.6928858757019043 + }, + { + "auxiliary_loss_clip": 0.13229564, + "auxiliary_loss_mlp": 0.73557305, + "balance_loss_clip": 0.08544464, + "balance_loss_mlp": 0.02463556, + "epoch": 0.0015030813166992333, + "flos": 18113287835520.0, + "grad_norm": 1186.4376598888527, + "language_loss": 7.80813074, + "learning_rate": 2.0724802282696944e-06, + "loss": 8.67599869, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 712.0, + "step": 25, + "time_per_iteration": 2.7093117237091064 + }, + { + "auxiliary_loss_clip": 0.13238442, + "auxiliary_loss_mlp": 0.7248075, + "balance_loss_clip": 0.085484, + "balance_loss_mlp": 0.02461214, + "epoch": 0.0015632045693672028, + "flos": 22240740579840.0, + "grad_norm": 3090.3782450571143, + "language_loss": 8.51009178, + "learning_rate": 2.0977325700720194e-06, + "loss": 9.36728287, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 46.875, + "router_z_loss_mlp": 701.0, + "step": 26, + "time_per_iteration": 2.7142887115478516 + }, + { + "auxiliary_loss_clip": 0.13264546, + "auxiliary_loss_mlp": 0.74387956, + "balance_loss_clip": 0.085568, + "balance_loss_mlp": 0.02464127, + "epoch": 0.001623327822035172, + "flos": 23999448326400.0, + "grad_norm": 883.8040958014411, + "language_loss": 8.80418682, + "learning_rate": 2.122031762649933e-06, + "loss": 9.68071175, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 47.03125, + "router_z_loss_mlp": 720.5, + "step": 27, + "time_per_iteration": 2.739086389541626 + }, + { + "auxiliary_loss_clip": 0.13261499, + "auxiliary_loss_mlp": 0.74588925, + "balance_loss_clip": 0.08545862, + "balance_loss_mlp": 0.02469785, + "epoch": 0.0016834510747031415, + "flos": 19682914844160.0, + "grad_norm": 778.9563997110462, + "language_loss": 7.52667618, + "learning_rate": 2.1454471497582483e-06, + "loss": 8.40517998, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 47.125, + "router_z_loss_mlp": 722.0, + "step": 28, + "time_per_iteration": 2.684328079223633 + }, + { + "auxiliary_loss_clip": 0.1322532, + "auxiliary_loss_mlp": 0.72868228, + "balance_loss_clip": 0.08545788, + "balance_loss_mlp": 0.02458075, + "epoch": 0.0017435743273711108, + "flos": 20930241922560.0, + "grad_norm": 711.3301469780024, + "language_loss": 7.32490015, + "learning_rate": 2.1680407726407727e-06, + "loss": 8.18583584, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 46.84375, + "router_z_loss_mlp": 705.0, + "step": 29, + "time_per_iteration": 2.6822586059570312 + }, + { + "auxiliary_loss_clip": 0.13197789, + "auxiliary_loss_mlp": 0.72772777, + "balance_loss_clip": 0.08529261, + "balance_loss_mlp": 0.02460276, + "epoch": 0.00180369758003908, + "flos": 19533763376640.0, + "grad_norm": 596.7513494595695, + "language_loss": 7.62213326, + "learning_rate": 2.189868360711334e-06, + "loss": 8.48183823, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.5, + "step": 30, + "time_per_iteration": 2.66929030418396 + }, + { + "auxiliary_loss_clip": 0.13220352, + "auxiliary_loss_mlp": 0.73066145, + "balance_loss_clip": 0.08544487, + "balance_loss_mlp": 0.02460678, + "epoch": 0.0018638208327070496, + "flos": 27460415295360.0, + "grad_norm": 562.9814252823624, + "language_loss": 6.46621895, + "learning_rate": 2.2109801597326265e-06, + "loss": 7.32908344, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 46.78125, + "router_z_loss_mlp": 707.0, + "step": 31, + "time_per_iteration": 2.769524574279785 + }, + { + "auxiliary_loss_clip": 0.13217463, + "auxiliary_loss_mlp": 0.72719908, + "balance_loss_clip": 0.08546316, + "balance_loss_mlp": 0.02456231, + "epoch": 0.0019239440853750188, + "flos": 13594535723520.0, + "grad_norm": 932.7202356227122, + "language_loss": 6.38840246, + "learning_rate": 2.2314216284658796e-06, + "loss": 7.24777603, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 46.65625, + "router_z_loss_mlp": 703.0, + "step": 32, + "time_per_iteration": 2.6535158157348633 + }, + { + "auxiliary_loss_clip": 0.13187753, + "auxiliary_loss_mlp": 0.73303366, + "balance_loss_clip": 0.08555806, + "balance_loss_mlp": 0.02453755, + "epoch": 0.001984067338042988, + "flos": 11258466618240.0, + "grad_norm": 1313.3745045414653, + "language_loss": 6.49637842, + "learning_rate": 2.2512340280885094e-06, + "loss": 7.36128998, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 46.34375, + "router_z_loss_mlp": 709.5, + "step": 33, + "time_per_iteration": 2.7210733890533447 + }, + { + "auxiliary_loss_clip": 0.13162288, + "auxiliary_loss_mlp": 0.73504317, + "balance_loss_clip": 0.08544378, + "balance_loss_mlp": 0.02459392, + "epoch": 0.0020441905907109576, + "flos": 22393413918720.0, + "grad_norm": 826.9088902553285, + "language_loss": 6.77253819, + "learning_rate": 2.270454923596497e-06, + "loss": 7.6392045, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 711.5, + "step": 34, + "time_per_iteration": 2.7001218795776367 + }, + { + "auxiliary_loss_clip": 0.13097668, + "auxiliary_loss_mlp": 0.75116229, + "balance_loss_clip": 0.08524574, + "balance_loss_mlp": 0.02459984, + "epoch": 0.0021043138433789266, + "flos": 49788911427840.0, + "grad_norm": 577.9485802079388, + "language_loss": 6.20400715, + "learning_rate": 2.2891186125067434e-06, + "loss": 7.08614588, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 45.6875, + "router_z_loss_mlp": 727.0, + "step": 35, + "time_per_iteration": 3.031013250350952 + }, + { + "auxiliary_loss_clip": 0.13148203, + "auxiliary_loss_mlp": 0.75109303, + "balance_loss_clip": 0.08537915, + "balance_loss_mlp": 0.02453051, + "epoch": 0.002164437096046896, + "flos": 20564155434240.0, + "grad_norm": 623.9821605724222, + "language_loss": 6.06852198, + "learning_rate": 2.307256493152974e-06, + "loss": 6.95109653, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 46.0625, + "router_z_loss_mlp": 727.0, + "step": 36, + "time_per_iteration": 2.7437260150909424 + }, + { + "auxiliary_loss_clip": 0.13138273, + "auxiliary_loss_mlp": 0.77219343, + "balance_loss_clip": 0.08535384, + "balance_loss_mlp": 0.02463487, + "epoch": 0.0022245603487148656, + "flos": 26549601413760.0, + "grad_norm": 1356.3181729473308, + "language_loss": 6.23619747, + "learning_rate": 2.3248973825097614e-06, + "loss": 7.13977337, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 46.03125, + "router_z_loss_mlp": 747.5, + "step": 37, + "time_per_iteration": 2.761021375656128 + }, + { + "auxiliary_loss_clip": 0.1308586, + "auxiliary_loss_mlp": 0.75746208, + "balance_loss_clip": 0.0852948, + "balance_loss_mlp": 0.02455192, + "epoch": 0.0022846836013828346, + "flos": 20344201666560.0, + "grad_norm": 550.1318567752543, + "language_loss": 6.76989794, + "learning_rate": 2.3420677916238357e-06, + "loss": 7.65821838, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 45.53125, + "router_z_loss_mlp": 733.5, + "step": 38, + "time_per_iteration": 2.797001600265503 + }, + { + "auxiliary_loss_clip": 0.13035053, + "auxiliary_loss_mlp": 0.76824772, + "balance_loss_clip": 0.08534516, + "balance_loss_mlp": 0.02459541, + "epoch": 0.002344806854050804, + "flos": 26254359152640.0, + "grad_norm": 327.614641212253, + "language_loss": 6.69246101, + "learning_rate": 2.358792165262154e-06, + "loss": 7.59105968, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 45.0, + "router_z_loss_mlp": 744.0, + "step": 39, + "time_per_iteration": 2.7852022647857666 + }, + { + "auxiliary_loss_clip": 0.1300399, + "auxiliary_loss_mlp": 0.74368668, + "balance_loss_clip": 0.08536238, + "balance_loss_mlp": 0.0244484, + "epoch": 0.0024049301067187736, + "flos": 11806296612480.0, + "grad_norm": 474.92846081285364, + "language_loss": 5.92113161, + "learning_rate": 2.3750930912143747e-06, + "loss": 6.79485798, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 44.6875, + "router_z_loss_mlp": 720.0, + "step": 40, + "time_per_iteration": 2.679415464401245 + }, + { + "auxiliary_loss_clip": 0.1309007, + "auxiliary_loss_mlp": 0.78535652, + "balance_loss_clip": 0.08556648, + "balance_loss_mlp": 0.02461432, + "epoch": 0.0024650533593867426, + "flos": 20637808773120.0, + "grad_norm": 345.5419638030077, + "language_loss": 6.47731018, + "learning_rate": 2.3909914837471044e-06, + "loss": 7.39356709, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 45.3125, + "router_z_loss_mlp": 760.0, + "step": 41, + "time_per_iteration": 2.835094928741455 + }, + { + "auxiliary_loss_clip": 0.13010421, + "auxiliary_loss_mlp": 0.76229548, + "balance_loss_clip": 0.08534975, + "balance_loss_mlp": 0.02450255, + "epoch": 0.002525176612054712, + "flos": 18412093895040.0, + "grad_norm": 622.6550674421553, + "language_loss": 6.03043365, + "learning_rate": 2.4065067449483835e-06, + "loss": 6.92283392, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 44.75, + "router_z_loss_mlp": 738.0, + "step": 42, + "time_per_iteration": 2.66955828666687 + }, + { + "auxiliary_loss_clip": 0.13026509, + "auxiliary_loss_mlp": 0.76781166, + "balance_loss_clip": 0.08538143, + "balance_loss_mlp": 0.02464763, + "epoch": 0.0025852998647226816, + "flos": 28191582023040.0, + "grad_norm": 8462.035545761653, + "language_loss": 5.972929, + "learning_rate": 2.4216569070848724e-06, + "loss": 6.87100601, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 744.0, + "step": 43, + "time_per_iteration": 2.7703070640563965 + }, + { + "auxiliary_loss_clip": 0.13056265, + "auxiliary_loss_mlp": 0.74383116, + "balance_loss_clip": 0.0856277, + "balance_loss_mlp": 0.02459292, + "epoch": 0.0026454231173906506, + "flos": 14288372657280.0, + "grad_norm": 293.14149660558166, + "language_loss": 5.65497112, + "learning_rate": 2.4364587585915504e-06, + "loss": 6.52936459, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 44.875, + "router_z_loss_mlp": 720.0, + "step": 44, + "time_per_iteration": 2.655585527420044 + }, + { + "auxiliary_loss_clip": 0.13054577, + "auxiliary_loss_mlp": 0.75350422, + "balance_loss_clip": 0.08569255, + "balance_loss_mlp": 0.02450033, + "epoch": 0.00270554637005862, + "flos": 22425796321920.0, + "grad_norm": 174.2843578867089, + "language_loss": 6.01187468, + "learning_rate": 2.450927955901469e-06, + "loss": 6.89592457, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 44.84375, + "router_z_loss_mlp": 730.0, + "step": 45, + "time_per_iteration": 2.705265522003174 + }, + { + "auxiliary_loss_clip": 0.12984964, + "auxiliary_loss_mlp": 0.73199093, + "balance_loss_clip": 0.08560722, + "balance_loss_mlp": 0.02447144, + "epoch": 0.0027656696227265896, + "flos": 23992236875520.0, + "grad_norm": 191.3929439681521, + "language_loss": 6.48347139, + "learning_rate": 2.465079122983384e-06, + "loss": 7.34531212, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 44.1875, + "router_z_loss_mlp": 708.5, + "step": 46, + "time_per_iteration": 2.733833074569702 + }, + { + "auxiliary_loss_clip": 0.12997682, + "auxiliary_loss_mlp": 0.73999059, + "balance_loss_clip": 0.08536641, + "balance_loss_mlp": 0.02465855, + "epoch": 0.0028257928753945586, + "flos": 37678511220480.0, + "grad_norm": 214.21785552289575, + "language_loss": 5.68396425, + "learning_rate": 2.4789259401737868e-06, + "loss": 6.55393171, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 44.5625, + "router_z_loss_mlp": 716.0, + "step": 47, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.1297729, + "auxiliary_loss_mlp": 0.74471426, + "balance_loss_clip": 0.08536708, + "balance_loss_mlp": 0.0244994, + "epoch": 0.002885916128062528, + "flos": 22460945909760.0, + "grad_norm": 449.4004858001912, + "language_loss": 5.75540733, + "learning_rate": 2.492481223656015e-06, + "loss": 6.62989426, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 44.40625, + "router_z_loss_mlp": 721.5, + "step": 48, + "time_per_iteration": 2.7284624576568604 + }, + { + "auxiliary_loss_clip": 0.12959239, + "auxiliary_loss_mlp": 0.73848325, + "balance_loss_clip": 0.08549985, + "balance_loss_mlp": 0.02461606, + "epoch": 0.0029460393807304976, + "flos": 27019543438080.0, + "grad_norm": 230.30029270071188, + "language_loss": 6.70517731, + "learning_rate": 2.5057569967437924e-06, + "loss": 7.57325315, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 44.0625, + "router_z_loss_mlp": 715.0, + "step": 49, + "time_per_iteration": 2.792755603790283 + }, + { + "auxiliary_loss_clip": 0.12996669, + "auxiliary_loss_mlp": 0.71446228, + "balance_loss_clip": 0.08555867, + "balance_loss_mlp": 0.02452083, + "epoch": 0.0030061626333984666, + "flos": 15857328833280.0, + "grad_norm": 311.93786428729913, + "language_loss": 5.55702782, + "learning_rate": 2.51876455396287e-06, + "loss": 6.40145731, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 44.34375, + "router_z_loss_mlp": 690.5, + "step": 50, + "time_per_iteration": 2.689176559448242 + }, + { + "auxiliary_loss_clip": 0.12955803, + "auxiliary_loss_mlp": 0.71350002, + "balance_loss_clip": 0.08553191, + "balance_loss_mlp": 0.02453516, + "epoch": 0.003066285886066436, + "flos": 31834292497920.0, + "grad_norm": 326.0050772098012, + "language_loss": 6.42039013, + "learning_rate": 2.5315145187866316e-06, + "loss": 7.26344872, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 44.0, + "router_z_loss_mlp": 689.5, + "step": 51, + "time_per_iteration": 2.751997232437134 + }, + { + "auxiliary_loss_clip": 0.12936625, + "auxiliary_loss_mlp": 0.71062022, + "balance_loss_clip": 0.08552323, + "balance_loss_mlp": 0.02458507, + "epoch": 0.0031264091387344056, + "flos": 41437110291840.0, + "grad_norm": 467.7969407780881, + "language_loss": 5.78601551, + "learning_rate": 2.5440168957651953e-06, + "loss": 6.62600183, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 43.84375, + "router_z_loss_mlp": 686.5, + "step": 52, + "time_per_iteration": 2.8259687423706055 + }, + { + "auxiliary_loss_clip": 0.12935326, + "auxiliary_loss_mlp": 0.69343221, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02448688, + "epoch": 0.0031865323914023747, + "flos": 23447719117440.0, + "grad_norm": 4084.3297995155954, + "language_loss": 5.79331207, + "learning_rate": 2.5562811176888872e-06, + "loss": 6.61609745, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 43.78125, + "router_z_loss_mlp": 669.0, + "step": 53, + "time_per_iteration": 2.6902496814727783 + }, + { + "auxiliary_loss_clip": 0.12926383, + "auxiliary_loss_mlp": 0.69104648, + "balance_loss_clip": 0.08542258, + "balance_loss_mlp": 0.02454257, + "epoch": 0.003246655644070344, + "flos": 14434505377920.0, + "grad_norm": 247.18448581495338, + "language_loss": 5.53028297, + "learning_rate": 2.5683160883431093e-06, + "loss": 6.35059309, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 43.75, + "router_z_loss_mlp": 666.5, + "step": 54, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.12913677, + "auxiliary_loss_mlp": 0.68966341, + "balance_loss_clip": 0.08543722, + "balance_loss_mlp": 0.02462436, + "epoch": 0.0033067788967383136, + "flos": 35926972997760.0, + "grad_norm": 431.229914559421, + "language_loss": 5.18386555, + "learning_rate": 2.580130221340046e-06, + "loss": 6.00266552, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 43.6875, + "router_z_loss_mlp": 665.0, + "step": 55, + "time_per_iteration": 2.7916810512542725 + }, + { + "auxiliary_loss_clip": 0.12884736, + "auxiliary_loss_mlp": 0.68559694, + "balance_loss_clip": 0.08553176, + "balance_loss_mlp": 0.02446416, + "epoch": 0.003366902149406283, + "flos": 22964108878080.0, + "grad_norm": 559.5224439968259, + "language_loss": 5.74156904, + "learning_rate": 2.5917314754514246e-06, + "loss": 6.55601311, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 661.0, + "step": 56, + "time_per_iteration": 2.638873338699341 + }, + { + "auxiliary_loss_clip": 0.12877631, + "auxiliary_loss_mlp": 0.65916806, + "balance_loss_clip": 0.08553813, + "balance_loss_mlp": 0.02440244, + "epoch": 0.003427025402074252, + "flos": 26590830422400.0, + "grad_norm": 1293.1571760901363, + "language_loss": 6.61670828, + "learning_rate": 2.6031273868139713e-06, + "loss": 7.4046526, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 43.28125, + "router_z_loss_mlp": 634.0, + "step": 57, + "time_per_iteration": 4.246931314468384 + }, + { + "auxiliary_loss_clip": 0.12864697, + "auxiliary_loss_mlp": 0.66109824, + "balance_loss_clip": 0.08544569, + "balance_loss_mlp": 0.02437945, + "epoch": 0.0034871486547422216, + "flos": 23957967755520.0, + "grad_norm": 1581.401693587077, + "language_loss": 6.75815916, + "learning_rate": 2.614325098333948e-06, + "loss": 7.54790401, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 43.25, + "router_z_loss_mlp": 636.0, + "step": 58, + "time_per_iteration": 4.129940986633301 + }, + { + "auxiliary_loss_clip": 0.12923497, + "auxiliary_loss_mlp": 0.64957327, + "balance_loss_clip": 0.08577307, + "balance_loss_mlp": 0.02457325, + "epoch": 0.003547271907410191, + "flos": 21221333406720.0, + "grad_norm": 1242.7465016222895, + "language_loss": 5.84827662, + "learning_rate": 2.625331386578098e-06, + "loss": 6.62708521, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 43.40625, + "router_z_loss_mlp": 624.0, + "step": 59, + "time_per_iteration": 2.81791090965271 + }, + { + "auxiliary_loss_clip": 0.1292145, + "auxiliary_loss_mlp": 0.65939367, + "balance_loss_clip": 0.08575267, + "balance_loss_mlp": 0.02462805, + "epoch": 0.00360739516007816, + "flos": 16509894831360.0, + "grad_norm": 2163.0106173410372, + "language_loss": 6.19513655, + "learning_rate": 2.63615268640451e-06, + "loss": 6.98374462, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 43.4375, + "router_z_loss_mlp": 634.0, + "step": 60, + "time_per_iteration": 2.6462490558624268 + }, + { + "auxiliary_loss_clip": 0.12888563, + "auxiliary_loss_mlp": 0.64225286, + "balance_loss_clip": 0.08565725, + "balance_loss_mlp": 0.0245771, + "epoch": 0.0036675184127461296, + "flos": 19471052995200.0, + "grad_norm": 635.7445513752676, + "language_loss": 5.79569387, + "learning_rate": 2.6467951135575943e-06, + "loss": 6.56683254, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 43.21875, + "router_z_loss_mlp": 617.0, + "step": 61, + "time_per_iteration": 2.681910753250122 + }, + { + "auxiliary_loss_clip": 0.12824672, + "auxiliary_loss_mlp": 0.63430971, + "balance_loss_clip": 0.08548941, + "balance_loss_mlp": 0.02444647, + "epoch": 0.003727641665414099, + "flos": 20963253231360.0, + "grad_norm": 899.0914058712833, + "language_loss": 5.87668133, + "learning_rate": 2.657264485425803e-06, + "loss": 6.63923836, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 42.71875, + "router_z_loss_mlp": 609.0, + "step": 62, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.12823591, + "auxiliary_loss_mlp": 0.6255362, + "balance_loss_clip": 0.08562292, + "balance_loss_mlp": 0.02446202, + "epoch": 0.003787764918082068, + "flos": 18412010040960.0, + "grad_norm": 1285.0325266073119, + "language_loss": 5.71324301, + "learning_rate": 2.6675663401385186e-06, + "loss": 6.46701479, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 42.59375, + "router_z_loss_mlp": 600.0, + "step": 63, + "time_per_iteration": 2.6705985069274902 + }, + { + "auxiliary_loss_clip": 0.12830947, + "auxiliary_loss_mlp": 0.62154531, + "balance_loss_clip": 0.08567161, + "balance_loss_mlp": 0.02437731, + "epoch": 0.0038478881707500376, + "flos": 12464271198720.0, + "grad_norm": 1843.6770385957534, + "language_loss": 5.25008583, + "learning_rate": 2.677705954159056e-06, + "loss": 5.99994087, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 42.6875, + "router_z_loss_mlp": 597.0, + "step": 64, + "time_per_iteration": 2.7688894271850586 + }, + { + "auxiliary_loss_clip": 0.12807481, + "auxiliary_loss_mlp": 0.61575615, + "balance_loss_clip": 0.08564365, + "balance_loss_mlp": 0.02444756, + "epoch": 0.003908011423418007, + "flos": 13558463740800.0, + "grad_norm": 1007.498474071754, + "language_loss": 5.29735851, + "learning_rate": 2.6876883585136904e-06, + "loss": 6.04118919, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 42.40625, + "router_z_loss_mlp": 590.5, + "step": 65, + "time_per_iteration": 2.7044079303741455 + }, + { + "auxiliary_loss_clip": 0.12739113, + "auxiliary_loss_mlp": 0.60150075, + "balance_loss_clip": 0.08550942, + "balance_loss_mlp": 0.02435229, + "epoch": 0.003968134676085976, + "flos": 18339488732160.0, + "grad_norm": 1472.5993340381553, + "language_loss": 5.05529404, + "learning_rate": 2.697518353781685e-06, + "loss": 5.78418589, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 41.90625, + "router_z_loss_mlp": 577.0, + "step": 66, + "time_per_iteration": 2.639763116836548 + }, + { + "auxiliary_loss_clip": 0.12713413, + "auxiliary_loss_mlp": 0.58826029, + "balance_loss_clip": 0.08548602, + "balance_loss_mlp": 0.02429543, + "epoch": 0.004028257928753946, + "flos": 20491466417280.0, + "grad_norm": 2128.447716031984, + "language_loss": 5.57779789, + "learning_rate": 2.7072005239581103e-06, + "loss": 6.29319191, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 41.65625, + "router_z_loss_mlp": 564.0, + "step": 67, + "time_per_iteration": 2.6764183044433594 + }, + { + "auxiliary_loss_clip": 0.12659386, + "auxiliary_loss_mlp": 0.59566367, + "balance_loss_clip": 0.08534892, + "balance_loss_mlp": 0.02437462, + "epoch": 0.004088381181421915, + "flos": 18849863151360.0, + "grad_norm": 1300.1095038466112, + "language_loss": 5.65431881, + "learning_rate": 2.7167392492896727e-06, + "loss": 6.37657642, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 41.21875, + "router_z_loss_mlp": 571.5, + "step": 68, + "time_per_iteration": 2.6499533653259277 + }, + { + "auxiliary_loss_clip": 0.12670201, + "auxiliary_loss_mlp": 0.59023213, + "balance_loss_clip": 0.08528139, + "balance_loss_mlp": 0.02431421, + "epoch": 0.004148504434089885, + "flos": 19433974763520.0, + "grad_norm": 775.8661457915586, + "language_loss": 5.68540192, + "learning_rate": 2.7261387181735195e-06, + "loss": 6.40233564, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 41.375, + "router_z_loss_mlp": 566.0, + "step": 69, + "time_per_iteration": 2.680570363998413 + }, + { + "auxiliary_loss_clip": 0.12638462, + "auxiliary_loss_mlp": 0.5930984, + "balance_loss_clip": 0.08532386, + "balance_loss_mlp": 0.02425073, + "epoch": 0.004208627686757853, + "flos": 20816868948480.0, + "grad_norm": 532.7078221445815, + "language_loss": 6.55753994, + "learning_rate": 2.7354029381999196e-06, + "loss": 7.27702332, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 41.09375, + "router_z_loss_mlp": 570.0, + "step": 70, + "time_per_iteration": 2.6596553325653076 + }, + { + "auxiliary_loss_clip": 0.12589023, + "auxiliary_loss_mlp": 0.57596606, + "balance_loss_clip": 0.08525643, + "balance_loss_mlp": 0.02420826, + "epoch": 0.004268750939425823, + "flos": 19104589163520.0, + "grad_norm": 3523.620393185992, + "language_loss": 4.99572229, + "learning_rate": 2.7445357464116983e-06, + "loss": 5.69757891, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 40.71875, + "router_z_loss_mlp": 552.5, + "step": 71, + "time_per_iteration": 2.6517086029052734 + }, + { + "auxiliary_loss_clip": 0.13345143, + "auxiliary_loss_mlp": 0.53337634, + "balance_loss_clip": 0.08910056, + "balance_loss_mlp": 0.02458726, + "epoch": 0.004328874192093792, + "flos": 52456112340480.0, + "grad_norm": 24.73254947156558, + "language_loss": 0.75920403, + "learning_rate": 2.75354081884615e-06, + "loss": 1.42603183, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 44.375, + "router_z_loss_mlp": 508.25, + "step": 72, + "time_per_iteration": 3.4461121559143066 + }, + { + "auxiliary_loss_clip": 0.13279217, + "auxiliary_loss_mlp": 0.51093936, + "balance_loss_clip": 0.08903308, + "balance_loss_mlp": 0.02436709, + "epoch": 0.004388997444761762, + "flos": 66495922260480.0, + "grad_norm": 24.018429481505308, + "language_loss": 0.70889235, + "learning_rate": 2.7624216794188286e-06, + "loss": 1.35262394, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 43.71875, + "router_z_loss_mlp": 486.25, + "step": 73, + "time_per_iteration": 3.8973076343536377 + }, + { + "auxiliary_loss_clip": 0.12491501, + "auxiliary_loss_mlp": 0.53349555, + "balance_loss_clip": 0.08502775, + "balance_loss_mlp": 0.02397403, + "epoch": 0.004449120697429731, + "flos": 18958959567360.0, + "grad_norm": 3320.4524015503866, + "language_loss": 5.2433157, + "learning_rate": 2.771181708202938e-06, + "loss": 5.90172577, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 39.90625, + "router_z_loss_mlp": 509.5, + "step": 74, + "time_per_iteration": 2.6803529262542725 + }, + { + "auxiliary_loss_clip": 0.12445074, + "auxiliary_loss_mlp": 0.51731253, + "balance_loss_clip": 0.08501716, + "balance_loss_mlp": 0.02390428, + "epoch": 0.004509243950097701, + "flos": 21111817720320.0, + "grad_norm": 2097.466788992517, + "language_loss": 5.57566261, + "learning_rate": 2.779824149153005e-06, + "loss": 6.21742582, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 39.4375, + "router_z_loss_mlp": 493.0, + "step": 75, + "time_per_iteration": 2.687678575515747 + }, + { + "auxiliary_loss_clip": 0.12385009, + "auxiliary_loss_mlp": 0.49917772, + "balance_loss_clip": 0.08505447, + "balance_loss_mlp": 0.0235918, + "epoch": 0.004569367202765669, + "flos": 20704082952960.0, + "grad_norm": 7030.779065512956, + "language_loss": 5.64007378, + "learning_rate": 2.788352117317012e-06, + "loss": 6.26310158, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 38.8125, + "router_z_loss_mlp": 475.25, + "step": 76, + "time_per_iteration": 2.666630744934082 + }, + { + "auxiliary_loss_clip": 0.12336895, + "auxiliary_loss_mlp": 0.48941305, + "balance_loss_clip": 0.08483945, + "balance_loss_mlp": 0.02359273, + "epoch": 0.004629490455433639, + "flos": 28666136021760.0, + "grad_norm": 620.4309602119407, + "language_loss": 5.72052956, + "learning_rate": 2.796768605577095e-06, + "loss": 6.33331108, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 465.5, + "step": 77, + "time_per_iteration": 2.7469568252563477 + }, + { + "auxiliary_loss_clip": 0.12308235, + "auxiliary_loss_mlp": 0.48191378, + "balance_loss_clip": 0.08460534, + "balance_loss_mlp": 0.02366182, + "epoch": 0.004689613708101608, + "flos": 11077142382720.0, + "grad_norm": 1643.3438058920954, + "language_loss": 5.09305811, + "learning_rate": 2.80507649095533e-06, + "loss": 5.69805431, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 38.5, + "router_z_loss_mlp": 458.25, + "step": 78, + "time_per_iteration": 2.6558547019958496 + }, + { + "auxiliary_loss_clip": 0.12249273, + "auxiliary_loss_mlp": 0.46293706, + "balance_loss_clip": 0.08442898, + "balance_loss_mlp": 0.02348393, + "epoch": 0.004749736960769578, + "flos": 21805612727040.0, + "grad_norm": 2200.9167741447113, + "language_loss": 4.90451622, + "learning_rate": 2.813278540517843e-06, + "loss": 5.48994637, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 38.0625, + "router_z_loss_mlp": 439.75, + "step": 79, + "time_per_iteration": 2.7162697315216064 + }, + { + "auxiliary_loss_clip": 0.12262511, + "auxiliary_loss_mlp": 0.46983981, + "balance_loss_clip": 0.08447941, + "balance_loss_mlp": 0.02355075, + "epoch": 0.004809860213437547, + "flos": 19798803440640.0, + "grad_norm": 344.66463824801895, + "language_loss": 5.05523586, + "learning_rate": 2.8213774169075505e-06, + "loss": 5.64770126, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 446.75, + "step": 80, + "time_per_iteration": 2.687460422515869 + }, + { + "auxiliary_loss_clip": 0.12261841, + "auxiliary_loss_mlp": 0.45211679, + "balance_loss_clip": 0.08451226, + "balance_loss_mlp": 0.02364997, + "epoch": 0.004869983466105517, + "flos": 26580893713920.0, + "grad_norm": 1677.7099343970488, + "language_loss": 5.56453705, + "learning_rate": 2.829375683533245e-06, + "loss": 6.13927221, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 428.5, + "step": 81, + "time_per_iteration": 2.7709527015686035 + }, + { + "auxiliary_loss_clip": 0.12245495, + "auxiliary_loss_mlp": 0.44303346, + "balance_loss_clip": 0.08439148, + "balance_loss_mlp": 0.02335574, + "epoch": 0.004930106718773485, + "flos": 12828345189120.0, + "grad_norm": 4679.4395433895315, + "language_loss": 4.60398674, + "learning_rate": 2.8372758094402803e-06, + "loss": 5.16947508, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 38.125, + "router_z_loss_mlp": 419.75, + "step": 82, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.12233329, + "auxiliary_loss_mlp": 0.44903332, + "balance_loss_clip": 0.0843938, + "balance_loss_mlp": 0.0234962, + "epoch": 0.004990229971441455, + "flos": 25781901505920.0, + "grad_norm": 1468.5073951038269, + "language_loss": 5.41148376, + "learning_rate": 2.84508017388607e-06, + "loss": 5.98285007, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 37.96875, + "router_z_loss_mlp": 425.5, + "step": 83, + "time_per_iteration": 2.751582145690918 + }, + { + "auxiliary_loss_clip": 0.12286501, + "auxiliary_loss_mlp": 0.44843888, + "balance_loss_clip": 0.08466095, + "balance_loss_mlp": 0.0236342, + "epoch": 0.005050353224109424, + "flos": 17463027824640.0, + "grad_norm": 333.54187308321605, + "language_loss": 4.89241934, + "learning_rate": 2.852791070641559e-06, + "loss": 5.46372318, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 38.21875, + "router_z_loss_mlp": 425.0, + "step": 84, + "time_per_iteration": 2.6613667011260986 + }, + { + "auxiliary_loss_clip": 0.12715524, + "auxiliary_loss_mlp": 0.33666173, + "balance_loss_clip": 0.08695208, + "balance_loss_mlp": 0.02245275, + "epoch": 0.005110476476777394, + "flos": 69824607160320.0, + "grad_norm": 16.750834021856043, + "language_loss": 0.63998127, + "learning_rate": 2.8604107120381682e-06, + "loss": 1.10379827, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 40.09375, + "router_z_loss_mlp": 313.75, + "step": 85, + "time_per_iteration": 3.4564764499664307 + }, + { + "auxiliary_loss_clip": 0.12209877, + "auxiliary_loss_mlp": 0.42757708, + "balance_loss_clip": 0.08426955, + "balance_loss_mlp": 0.02352437, + "epoch": 0.005170599729445363, + "flos": 24796973088000.0, + "grad_norm": 542.703970895993, + "language_loss": 4.92362881, + "learning_rate": 2.8679412327780482e-06, + "loss": 5.47330475, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 37.90625, + "router_z_loss_mlp": 403.75, + "step": 86, + "time_per_iteration": 2.775689125061035 + }, + { + "auxiliary_loss_clip": 0.12224952, + "auxiliary_loss_mlp": 0.4164477, + "balance_loss_clip": 0.08412233, + "balance_loss_mlp": 0.02362544, + "epoch": 0.005230722982113333, + "flos": 23264717800320.0, + "grad_norm": 4371.207136836947, + "language_loss": 5.4414258, + "learning_rate": 2.8753846935240833e-06, + "loss": 5.98012304, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 38.15625, + "router_z_loss_mlp": 392.25, + "step": 87, + "time_per_iteration": 2.7322311401367188 + }, + { + "auxiliary_loss_clip": 0.12200201, + "auxiliary_loss_mlp": 0.41744971, + "balance_loss_clip": 0.08406796, + "balance_loss_mlp": 0.02365087, + "epoch": 0.005290846234781301, + "flos": 16733622032640.0, + "grad_norm": 2919.861295310318, + "language_loss": 4.86351013, + "learning_rate": 2.8827430842847267e-06, + "loss": 5.40296173, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 393.75, + "step": 88, + "time_per_iteration": 2.7260544300079346 + }, + { + "auxiliary_loss_clip": 0.1219901, + "auxiliary_loss_mlp": 0.40224642, + "balance_loss_clip": 0.08417168, + "balance_loss_mlp": 0.02358433, + "epoch": 0.005350969487449271, + "flos": 20892283223040.0, + "grad_norm": 1645.58162705774, + "language_loss": 5.16751766, + "learning_rate": 2.8900183276075957e-06, + "loss": 5.69175386, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 37.875, + "router_z_loss_mlp": 378.5, + "step": 89, + "time_per_iteration": 2.674370288848877 + }, + { + "auxiliary_loss_clip": 0.12154645, + "auxiliary_loss_mlp": 0.38342261, + "balance_loss_clip": 0.0840472, + "balance_loss_mlp": 0.02331517, + "epoch": 0.00541109274011724, + "flos": 26216568161280.0, + "grad_norm": 1270.091627450628, + "language_loss": 4.37986279, + "learning_rate": 2.8972122815946455e-06, + "loss": 4.88483191, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 37.5, + "router_z_loss_mlp": 360.75, + "step": 90, + "time_per_iteration": 2.7423648834228516 + }, + { + "auxiliary_loss_clip": 0.12150387, + "auxiliary_loss_mlp": 0.38653693, + "balance_loss_clip": 0.08385181, + "balance_loss_mlp": 0.02349981, + "epoch": 0.00547121599278521, + "flos": 21184926007680.0, + "grad_norm": 803.9563265609303, + "language_loss": 5.31085825, + "learning_rate": 2.90432674275074e-06, + "loss": 5.81889915, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 37.6875, + "router_z_loss_mlp": 363.0, + "step": 91, + "time_per_iteration": 2.6603400707244873 + }, + { + "auxiliary_loss_clip": 0.12079477, + "auxiliary_loss_mlp": 0.37034535, + "balance_loss_clip": 0.08381163, + "balance_loss_mlp": 0.02342154, + "epoch": 0.005531339245453179, + "flos": 19724856612480.0, + "grad_norm": 829.7403965041182, + "language_loss": 4.4634366, + "learning_rate": 2.91136344867656e-06, + "loss": 4.95457649, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 37.0, + "router_z_loss_mlp": 347.25, + "step": 92, + "time_per_iteration": 2.6818525791168213 + }, + { + "auxiliary_loss_clip": 0.1209444, + "auxiliary_loss_mlp": 0.35073167, + "balance_loss_clip": 0.08383686, + "balance_loss_mlp": 0.02309498, + "epoch": 0.005591462498121149, + "flos": 17641291386240.0, + "grad_norm": 1625.08326205636, + "language_loss": 4.56070709, + "learning_rate": 2.918324080615938e-06, + "loss": 5.03238297, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 37.125, + "router_z_loss_mlp": 327.5, + "step": 93, + "time_per_iteration": 2.612030029296875 + }, + { + "auxiliary_loss_clip": 0.12023389, + "auxiliary_loss_mlp": 0.34590679, + "balance_loss_clip": 0.08357395, + "balance_loss_mlp": 0.02290875, + "epoch": 0.005651585750789117, + "flos": 20017415543040.0, + "grad_norm": 681.2724931544728, + "language_loss": 4.70847607, + "learning_rate": 2.925210265866963e-06, + "loss": 5.17461681, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 36.625, + "router_z_loss_mlp": 322.75, + "step": 94, + "time_per_iteration": 2.6726646423339844 + }, + { + "auxiliary_loss_clip": 0.12331794, + "auxiliary_loss_mlp": 0.21429604, + "balance_loss_clip": 0.08515669, + "balance_loss_mlp": 0.01873939, + "epoch": 0.005711709003457087, + "flos": 59831202758400.0, + "grad_norm": 11.50707364837694, + "language_loss": 0.68575168, + "learning_rate": 2.932023580065507e-06, + "loss": 1.02336574, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 38.0, + "router_z_loss_mlp": 195.25, + "step": 95, + "time_per_iteration": 3.168633222579956 + }, + { + "auxiliary_loss_clip": 0.11899618, + "auxiliary_loss_mlp": 0.32138801, + "balance_loss_clip": 0.08329217, + "balance_loss_mlp": 0.02231575, + "epoch": 0.005771832256125056, + "flos": 15564979537920.0, + "grad_norm": 1013.3395640383166, + "language_loss": 4.49414778, + "learning_rate": 2.9387655493491906e-06, + "loss": 4.93453217, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 298.5, + "step": 96, + "time_per_iteration": 5.5690062046051025 + }, + { + "auxiliary_loss_clip": 0.11822618, + "auxiliary_loss_mlp": 0.30064785, + "balance_loss_clip": 0.08285143, + "balance_loss_mlp": 0.02220548, + "epoch": 0.005831955508793026, + "flos": 22534934664960.0, + "grad_norm": 2356.5481695677104, + "language_loss": 5.16498899, + "learning_rate": 2.9454376524092147e-06, + "loss": 5.58386326, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 278.375, + "step": 97, + "time_per_iteration": 4.129577159881592 + }, + { + "auxiliary_loss_clip": 0.11772624, + "auxiliary_loss_mlp": 0.27429676, + "balance_loss_clip": 0.08268203, + "balance_loss_mlp": 0.02161121, + "epoch": 0.005892078761460995, + "flos": 22055600983680.0, + "grad_norm": 1442.767046866879, + "language_loss": 4.65611029, + "learning_rate": 2.952041322436969e-06, + "loss": 5.04813337, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 252.75, + "step": 98, + "time_per_iteration": 4.072925567626953 + }, + { + "auxiliary_loss_clip": 0.12124368, + "auxiliary_loss_mlp": 0.12855935, + "balance_loss_clip": 0.08381641, + "balance_loss_mlp": 0.01625466, + "epoch": 0.005952202014128965, + "flos": 68559865632000.0, + "grad_norm": 9.945172746585492, + "language_loss": 0.65681642, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.90661949, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 37.46875, + "router_z_loss_mlp": 112.4375, + "step": 99, + "time_per_iteration": 3.3806052207946777 + }, + { + "auxiliary_loss_clip": 0.11659142, + "auxiliary_loss_mlp": 0.25495899, + "balance_loss_clip": 0.08219896, + "balance_loss_mlp": 0.02095021, + "epoch": 0.006012325266796933, + "flos": 22966624500480.0, + "grad_norm": 5439.355539233552, + "language_loss": 4.89178705, + "learning_rate": 2.9650488796560464e-06, + "loss": 5.26333714, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 34.34375, + "router_z_loss_mlp": 233.875, + "step": 100, + "time_per_iteration": 2.6920084953308105 + }, + { + "auxiliary_loss_clip": 0.11642508, + "auxiliary_loss_mlp": 0.23216301, + "balance_loss_clip": 0.08225508, + "balance_loss_mlp": 0.02037103, + "epoch": 0.006072448519464903, + "flos": 17353721773440.0, + "grad_norm": 71170.85330308754, + "language_loss": 4.95652103, + "learning_rate": 2.971455421902446e-06, + "loss": 5.30510902, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 211.875, + "step": 101, + "time_per_iteration": 2.652926206588745 + }, + { + "auxiliary_loss_clip": 0.11583164, + "auxiliary_loss_mlp": 0.214275, + "balance_loss_clip": 0.08206252, + "balance_loss_mlp": 0.01957287, + "epoch": 0.006132571772132872, + "flos": 24688044380160.0, + "grad_norm": 7482.306451170957, + "language_loss": 5.13341808, + "learning_rate": 2.9777988444798075e-06, + "loss": 5.4635253, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 194.625, + "step": 102, + "time_per_iteration": 2.7020983695983887 + }, + { + "auxiliary_loss_clip": 0.11553724, + "auxiliary_loss_mlp": 0.20282698, + "balance_loss_clip": 0.08193958, + "balance_loss_mlp": 0.01923322, + "epoch": 0.006192695024800842, + "flos": 21471279736320.0, + "grad_norm": 1966.1076689836887, + "language_loss": 4.95062399, + "learning_rate": 2.9840803790210285e-06, + "loss": 5.26898813, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 183.75, + "step": 103, + "time_per_iteration": 2.652406692504883 + }, + { + "auxiliary_loss_clip": 0.11498895, + "auxiliary_loss_mlp": 0.18188542, + "balance_loss_clip": 0.08159411, + "balance_loss_mlp": 0.01855535, + "epoch": 0.006252818277468811, + "flos": 17426117301120.0, + "grad_norm": 4017.94727583705, + "language_loss": 4.81252193, + "learning_rate": 2.990301221458371e-06, + "loss": 5.10939646, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 163.25, + "step": 104, + "time_per_iteration": 2.6669459342956543 + }, + { + "auxiliary_loss_clip": 0.11507185, + "auxiliary_loss_mlp": 0.18210354, + "balance_loss_clip": 0.081876, + "balance_loss_mlp": 0.01852931, + "epoch": 0.006312941530136781, + "flos": 19105679266560.0, + "grad_norm": 5275.119248926157, + "language_loss": 4.54453945, + "learning_rate": 2.9964625333900544e-06, + "loss": 4.84171486, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 163.625, + "step": 105, + "time_per_iteration": 2.6467208862304688 + }, + { + "auxiliary_loss_clip": 0.11489026, + "auxiliary_loss_mlp": 0.17571044, + "balance_loss_clip": 0.08164956, + "balance_loss_mlp": 0.01872801, + "epoch": 0.006373064782804749, + "flos": 24067651150080.0, + "grad_norm": 56669.614766689854, + "language_loss": 4.9280014, + "learning_rate": 3.002565443382063e-06, + "loss": 5.2186017, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 157.0, + "step": 106, + "time_per_iteration": 2.7375807762145996 + }, + { + "auxiliary_loss_clip": 0.11464141, + "auxiliary_loss_mlp": 0.16512999, + "balance_loss_clip": 0.08158538, + "balance_loss_mlp": 0.01815734, + "epoch": 0.006433188035472719, + "flos": 18338272848000.0, + "grad_norm": 94457.61945163306, + "language_loss": 4.08243847, + "learning_rate": 3.008611048208843e-06, + "loss": 4.36221027, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 33.0625, + "router_z_loss_mlp": 146.875, + "step": 107, + "time_per_iteration": 2.6703994274139404 + }, + { + "auxiliary_loss_clip": 0.12281319, + "auxiliary_loss_mlp": 0.04033342, + "balance_loss_clip": 0.08292686, + "balance_loss_mlp": 0.01773516, + "epoch": 0.006493311288140688, + "flos": 62583266257920.0, + "grad_norm": 1.9990534397749096, + "language_loss": 0.6506741, + "learning_rate": 3.014600414036285e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 40.0, + "router_z_loss_mlp": 22.640625, + "step": 108, + "time_per_iteration": 3.3318073749542236 + }, + { + "auxiliary_loss_clip": 0.1146347, + "auxiliary_loss_mlp": 0.17600623, + "balance_loss_clip": 0.08161052, + "balance_loss_mlp": 0.01902381, + "epoch": 0.006553434540808658, + "flos": 19506202583040.0, + "grad_norm": 2213.052526088781, + "language_loss": 5.47699499, + "learning_rate": 3.0205345775501937e-06, + "loss": 5.76763535, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 156.875, + "step": 109, + "time_per_iteration": 2.719162940979004 + }, + { + "auxiliary_loss_clip": 0.11452536, + "auxiliary_loss_mlp": 0.16698027, + "balance_loss_clip": 0.08172794, + "balance_loss_mlp": 0.01903106, + "epoch": 0.006613557793476627, + "flos": 21111398449920.0, + "grad_norm": 8171.333832946622, + "language_loss": 4.33011436, + "learning_rate": 3.0264145470332218e-06, + "loss": 4.61161995, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 147.75, + "step": 110, + "time_per_iteration": 2.7021584510803223 + }, + { + "auxiliary_loss_clip": 0.11498255, + "auxiliary_loss_mlp": 0.16723976, + "balance_loss_clip": 0.08168858, + "balance_loss_mlp": 0.01916846, + "epoch": 0.006673681046144597, + "flos": 26037843402240.0, + "grad_norm": 85243.79091039153, + "language_loss": 5.33909988, + "learning_rate": 3.032241303393073e-06, + "loss": 5.62132263, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 148.0625, + "step": 111, + "time_per_iteration": 2.763227939605713 + }, + { + "auxiliary_loss_clip": 0.11479855, + "auxiliary_loss_mlp": 0.17865081, + "balance_loss_clip": 0.08154993, + "balance_loss_mlp": 0.01983733, + "epoch": 0.006733804298812566, + "flos": 23154279719040.0, + "grad_norm": 75829.31622331966, + "language_loss": 4.96874857, + "learning_rate": 3.0380158011446e-06, + "loss": 5.26219797, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 33.1875, + "router_z_loss_mlp": 158.875, + "step": 112, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.1147141, + "auxiliary_loss_mlp": 0.17070231, + "balance_loss_clip": 0.08172764, + "balance_loss_mlp": 0.01933513, + "epoch": 0.006793927551480535, + "flos": 11769092599680.0, + "grad_norm": 3384.2074822155987, + "language_loss": 4.32218456, + "learning_rate": 3.0437389693482466e-06, + "loss": 4.60760117, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 151.25, + "step": 113, + "time_per_iteration": 2.6669225692749023 + }, + { + "auxiliary_loss_clip": 0.11510996, + "auxiliary_loss_mlp": 0.18198231, + "balance_loss_clip": 0.08184206, + "balance_loss_mlp": 0.019995, + "epoch": 0.006854050804148504, + "flos": 19177990940160.0, + "grad_norm": 1118.9556792976962, + "language_loss": 4.58965397, + "learning_rate": 3.0494117125071475e-06, + "loss": 4.88674641, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 33.28125, + "router_z_loss_mlp": 161.875, + "step": 114, + "time_per_iteration": 2.6245124340057373 + }, + { + "auxiliary_loss_clip": 0.11491105, + "auxiliary_loss_mlp": 0.15876909, + "balance_loss_clip": 0.08183911, + "balance_loss_mlp": 0.01912064, + "epoch": 0.006914174056816474, + "flos": 21988488263040.0, + "grad_norm": 3570.8470324102345, + "language_loss": 4.92026377, + "learning_rate": 3.055034911425055e-06, + "loss": 5.19394398, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 139.625, + "step": 115, + "time_per_iteration": 2.694258689880371 + }, + { + "auxiliary_loss_clip": 0.11497033, + "auxiliary_loss_mlp": 0.17786066, + "balance_loss_clip": 0.08183155, + "balance_loss_mlp": 0.02014583, + "epoch": 0.006974297309484443, + "flos": 16294636892160.0, + "grad_norm": 28497.885490954828, + "language_loss": 4.11111546, + "learning_rate": 3.0606094240271244e-06, + "loss": 4.40394688, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 157.75, + "step": 116, + "time_per_iteration": 2.6153717041015625 + }, + { + "auxiliary_loss_clip": 0.11479296, + "auxiliary_loss_mlp": 0.17568065, + "balance_loss_clip": 0.08183482, + "balance_loss_mlp": 0.02040722, + "epoch": 0.007034420562152413, + "flos": 26111161324800.0, + "grad_norm": 6129.230277666204, + "language_loss": 4.56221914, + "learning_rate": 3.0661360861454656e-06, + "loss": 4.8526926, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 155.25, + "step": 117, + "time_per_iteration": 2.698347568511963 + }, + { + "auxiliary_loss_clip": 0.11602448, + "auxiliary_loss_mlp": 0.18875569, + "balance_loss_clip": 0.08221327, + "balance_loss_mlp": 0.02151936, + "epoch": 0.007094543814820382, + "flos": 14208933386880.0, + "grad_norm": 568.8145863995832, + "language_loss": 4.50002289, + "learning_rate": 3.071615712271274e-06, + "loss": 4.80480337, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 167.375, + "step": 118, + "time_per_iteration": 2.614288091659546 + }, + { + "auxiliary_loss_clip": 0.11586175, + "auxiliary_loss_mlp": 0.17393641, + "balance_loss_clip": 0.08235049, + "balance_loss_mlp": 0.02086024, + "epoch": 0.007154667067488351, + "flos": 14981329123200.0, + "grad_norm": 337.3163881950513, + "language_loss": 4.89806128, + "learning_rate": 3.0770490962752172e-06, + "loss": 5.18785954, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 153.0, + "step": 119, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.11613901, + "auxiliary_loss_mlp": 0.17884746, + "balance_loss_clip": 0.08224175, + "balance_loss_mlp": 0.02088849, + "epoch": 0.00721479032015632, + "flos": 20199452538240.0, + "grad_norm": 4431.2993639449, + "language_loss": 4.39706039, + "learning_rate": 3.082437012097686e-06, + "loss": 4.69204712, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 157.75, + "step": 120, + "time_per_iteration": 2.6733429431915283 + }, + { + "auxiliary_loss_clip": 0.11614023, + "auxiliary_loss_mlp": 0.18062758, + "balance_loss_clip": 0.0821183, + "balance_loss_mlp": 0.02144791, + "epoch": 0.00727491357282429, + "flos": 23153650813440.0, + "grad_norm": 6523.034573603343, + "language_loss": 5.06446743, + "learning_rate": 3.0877802144103967e-06, + "loss": 5.36123562, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.0, + "step": 121, + "time_per_iteration": 2.726327419281006 + }, + { + "auxiliary_loss_clip": 0.11618941, + "auxiliary_loss_mlp": 0.17642631, + "balance_loss_clip": 0.08232379, + "balance_loss_mlp": 0.02127495, + "epoch": 0.007335036825492259, + "flos": 15526811203200.0, + "grad_norm": 1010.4173973733286, + "language_loss": 4.56235886, + "learning_rate": 3.09307943925077e-06, + "loss": 4.85497475, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 155.125, + "step": 122, + "time_per_iteration": 2.640110969543457 + }, + { + "auxiliary_loss_clip": 0.11591011, + "auxiliary_loss_mlp": 0.16755471, + "balance_loss_clip": 0.08221178, + "balance_loss_mlp": 0.02094828, + "epoch": 0.007395160078160229, + "flos": 24250233196800.0, + "grad_norm": 4778.191954305265, + "language_loss": 4.97837877, + "learning_rate": 3.0983354046304154e-06, + "loss": 5.2618432, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.625, + "step": 123, + "time_per_iteration": 2.689462661743164 + }, + { + "auxiliary_loss_clip": 0.11583175, + "auxiliary_loss_mlp": 0.16522312, + "balance_loss_clip": 0.08218054, + "balance_loss_mlp": 0.02069187, + "epoch": 0.007455283330828198, + "flos": 31767976391040.0, + "grad_norm": 918.147653305623, + "language_loss": 4.24658871, + "learning_rate": 3.103548811118979e-06, + "loss": 4.5276432, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 144.625, + "step": 124, + "time_per_iteration": 2.79850172996521 + }, + { + "auxiliary_loss_clip": 0.11631332, + "auxiliary_loss_mlp": 0.17508414, + "balance_loss_clip": 0.08243011, + "balance_loss_mlp": 0.02151969, + "epoch": 0.007515406583496167, + "flos": 26622458138880.0, + "grad_norm": 2521.4972321949017, + "language_loss": 4.22364092, + "learning_rate": 3.108720342404542e-06, + "loss": 4.51503849, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 153.375, + "step": 125, + "time_per_iteration": 2.699488401412964 + }, + { + "auxiliary_loss_clip": 0.11621339, + "auxiliary_loss_mlp": 0.16743667, + "balance_loss_clip": 0.08258513, + "balance_loss_mlp": 0.02131851, + "epoch": 0.007575529836164136, + "flos": 18229637629440.0, + "grad_norm": 2114.724785338214, + "language_loss": 4.42466068, + "learning_rate": 3.1138506658316945e-06, + "loss": 4.70831108, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 33.625, + "router_z_loss_mlp": 146.125, + "step": 126, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.11678092, + "auxiliary_loss_mlp": 0.16983882, + "balance_loss_clip": 0.08243092, + "balance_loss_mlp": 0.02127924, + "epoch": 0.007635653088832106, + "flos": 21586916770560.0, + "grad_norm": 719.841664884419, + "language_loss": 3.98921776, + "learning_rate": 3.1189404329183404e-06, + "loss": 4.2758379, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 148.625, + "step": 127, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.11679719, + "auxiliary_loss_mlp": 0.17065403, + "balance_loss_clip": 0.08245254, + "balance_loss_mlp": 0.02160617, + "epoch": 0.007695776341500075, + "flos": 25382216730240.0, + "grad_norm": 1269.777428310943, + "language_loss": 4.33711529, + "learning_rate": 3.1239902798522317e-06, + "loss": 4.62456656, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 34.3125, + "router_z_loss_mlp": 149.125, + "step": 128, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.11722346, + "auxiliary_loss_mlp": 0.16804715, + "balance_loss_clip": 0.08270991, + "balance_loss_mlp": 0.02131863, + "epoch": 0.007755899594168045, + "flos": 22350088558080.0, + "grad_norm": 1159.6537901720856, + "language_loss": 4.87967634, + "learning_rate": 3.129000827968184e-06, + "loss": 5.16494703, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 146.625, + "step": 129, + "time_per_iteration": 2.6568491458892822 + }, + { + "auxiliary_loss_clip": 0.11725748, + "auxiliary_loss_mlp": 0.17228858, + "balance_loss_clip": 0.08278215, + "balance_loss_mlp": 0.02165382, + "epoch": 0.007816022846836013, + "flos": 22644869621760.0, + "grad_norm": 436.4430863377033, + "language_loss": 5.01482534, + "learning_rate": 3.133972684206866e-06, + "loss": 5.30437136, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 150.5, + "step": 130, + "time_per_iteration": 2.7268729209899902 + }, + { + "auxiliary_loss_clip": 0.11697873, + "auxiliary_loss_mlp": 0.16884172, + "balance_loss_clip": 0.08257942, + "balance_loss_mlp": 0.02162493, + "epoch": 0.007876146099503984, + "flos": 18188115131520.0, + "grad_norm": 1162.2622739405722, + "language_loss": 4.07958698, + "learning_rate": 3.138906441556014e-06, + "loss": 4.36540699, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 147.25, + "step": 131, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.11733647, + "auxiliary_loss_mlp": 0.16117501, + "balance_loss_clip": 0.08280095, + "balance_loss_mlp": 0.02128244, + "epoch": 0.007936269352171952, + "flos": 27125788815360.0, + "grad_norm": 7543.348079431309, + "language_loss": 4.20423412, + "learning_rate": 3.143802679474861e-06, + "loss": 4.48274565, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 140.0, + "step": 132, + "time_per_iteration": 2.717806816101074 + }, + { + "auxiliary_loss_clip": 0.11797122, + "auxiliary_loss_mlp": 0.16945273, + "balance_loss_clip": 0.08290964, + "balance_loss_mlp": 0.0219918, + "epoch": 0.007996392604839923, + "flos": 19032403271040.0, + "grad_norm": 824.1057706186339, + "language_loss": 4.52130318, + "learning_rate": 3.1486619643025565e-06, + "loss": 4.80872679, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 147.375, + "step": 133, + "time_per_iteration": 2.6183056831359863 + }, + { + "auxiliary_loss_clip": 0.11778916, + "auxiliary_loss_mlp": 0.1607928, + "balance_loss_clip": 0.08279899, + "balance_loss_mlp": 0.02163264, + "epoch": 0.008056515857507891, + "flos": 25491271219200.0, + "grad_norm": 23901.09716796145, + "language_loss": 3.33778429, + "learning_rate": 3.153484849651286e-06, + "loss": 3.61636591, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 139.25, + "step": 134, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.11796138, + "auxiliary_loss_mlp": 0.16928384, + "balance_loss_clip": 0.08284588, + "balance_loss_mlp": 0.02206703, + "epoch": 0.00811663911017586, + "flos": 20563694236800.0, + "grad_norm": 532.3002515432323, + "language_loss": 4.31598186, + "learning_rate": 3.1582718767847806e-06, + "loss": 4.60322666, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 35.1875, + "router_z_loss_mlp": 147.25, + "step": 135, + "time_per_iteration": 2.658189296722412 + }, + { + "auxiliary_loss_clip": 0.11834078, + "auxiliary_loss_mlp": 0.17649791, + "balance_loss_clip": 0.08286304, + "balance_loss_mlp": 0.02256724, + "epoch": 0.00817676236284383, + "flos": 18804483365760.0, + "grad_norm": 591.2706889750153, + "language_loss": 4.16468382, + "learning_rate": 3.1630235749828485e-06, + "loss": 4.45952272, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 35.4375, + "router_z_loss_mlp": 153.75, + "step": 136, + "time_per_iteration": 5.634068250656128 + }, + { + "auxiliary_loss_clip": 0.11831227, + "auxiliary_loss_mlp": 0.16616376, + "balance_loss_clip": 0.08291583, + "balance_loss_mlp": 0.02193768, + "epoch": 0.008236885615511799, + "flos": 23879576661120.0, + "grad_norm": 754.59577193491, + "language_loss": 4.28476763, + "learning_rate": 3.1677404618925676e-06, + "loss": 4.56924391, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 144.25, + "step": 137, + "time_per_iteration": 2.6984925270080566 + }, + { + "auxiliary_loss_clip": 0.11840196, + "auxiliary_loss_mlp": 0.16576298, + "balance_loss_clip": 0.08293904, + "balance_loss_mlp": 0.02214726, + "epoch": 0.00829700886817977, + "flos": 24650379169920.0, + "grad_norm": 767.1857414798482, + "language_loss": 4.50048828, + "learning_rate": 3.1724230438666953e-06, + "loss": 4.78465271, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 143.5625, + "step": 138, + "time_per_iteration": 4.106135368347168 + }, + { + "auxiliary_loss_clip": 0.11846266, + "auxiliary_loss_mlp": 0.16453376, + "balance_loss_clip": 0.08313362, + "balance_loss_mlp": 0.02219978, + "epoch": 0.008357132120847738, + "flos": 25268550266880.0, + "grad_norm": 3135.202751990444, + "language_loss": 4.53827906, + "learning_rate": 3.177071816289865e-06, + "loss": 4.82127523, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 142.5, + "step": 139, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.11892673, + "auxiliary_loss_mlp": 0.17064422, + "balance_loss_clip": 0.08314734, + "balance_loss_mlp": 0.02245087, + "epoch": 0.008417255373515706, + "flos": 27352325128320.0, + "grad_norm": 729.9492101747932, + "language_loss": 3.41289186, + "learning_rate": 3.181687263893095e-06, + "loss": 3.70246267, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 148.125, + "step": 140, + "time_per_iteration": 2.6964235305786133 + }, + { + "auxiliary_loss_clip": 0.1186142, + "auxiliary_loss_mlp": 0.16847792, + "balance_loss_clip": 0.08325124, + "balance_loss_mlp": 0.02223768, + "epoch": 0.008477378626183677, + "flos": 17644771330560.0, + "grad_norm": 9248.736899536998, + "language_loss": 3.54738212, + "learning_rate": 3.186269861057098e-06, + "loss": 3.83447456, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 146.125, + "step": 141, + "time_per_iteration": 2.6551992893218994 + }, + { + "auxiliary_loss_clip": 0.11875261, + "auxiliary_loss_mlp": 0.17182453, + "balance_loss_clip": 0.08333448, + "balance_loss_mlp": 0.02241047, + "epoch": 0.008537501878851645, + "flos": 13886465748480.0, + "grad_norm": 1195.8886145818353, + "language_loss": 3.75801992, + "learning_rate": 3.1908200721048745e-06, + "loss": 4.04859734, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 35.46875, + "router_z_loss_mlp": 149.375, + "step": 142, + "time_per_iteration": 2.613173246383667 + }, + { + "auxiliary_loss_clip": 0.11767568, + "auxiliary_loss_mlp": 0.03479403, + "balance_loss_clip": 0.08269441, + "balance_loss_mlp": 0.01324862, + "epoch": 0.008597625131519616, + "flos": 71270783976960.0, + "grad_norm": 1.6897091068609469, + "language_loss": 0.6651473, + "learning_rate": 3.195338351584042e-06, + "loss": 0.81761706, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 21.5625, + "step": 143, + "time_per_iteration": 3.571974754333496 + }, + { + "auxiliary_loss_clip": 0.11831102, + "auxiliary_loss_mlp": 0.18004906, + "balance_loss_clip": 0.08322103, + "balance_loss_mlp": 0.02245629, + "epoch": 0.008657748384187584, + "flos": 17608573566720.0, + "grad_norm": 764.3395719536082, + "language_loss": 4.02781963, + "learning_rate": 3.1998251445393258e-06, + "loss": 4.32617998, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 35.125, + "router_z_loss_mlp": 157.625, + "step": 144, + "time_per_iteration": 2.950308322906494 + }, + { + "auxiliary_loss_clip": 0.11815393, + "auxiliary_loss_mlp": 0.1653876, + "balance_loss_clip": 0.08320558, + "balance_loss_mlp": 0.021955, + "epoch": 0.008717871636855555, + "flos": 19720789689600.0, + "grad_norm": 995.118837229873, + "language_loss": 3.85104275, + "learning_rate": 3.204280886775619e-06, + "loss": 4.13458443, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 143.625, + "step": 145, + "time_per_iteration": 2.704049587249756 + }, + { + "auxiliary_loss_clip": 0.11712223, + "auxiliary_loss_mlp": 0.1568643, + "balance_loss_clip": 0.08270143, + "balance_loss_mlp": 0.02154936, + "epoch": 0.008777994889523523, + "flos": 24724325998080.0, + "grad_norm": 15039.120691806027, + "language_loss": 3.98885298, + "learning_rate": 3.208706005112005e-06, + "loss": 4.26283932, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 135.4375, + "step": 146, + "time_per_iteration": 2.7329108715057373 + }, + { + "auxiliary_loss_clip": 0.11446112, + "auxiliary_loss_mlp": 0.02845502, + "balance_loss_clip": 0.08152023, + "balance_loss_mlp": 0.01408125, + "epoch": 0.008838118142191492, + "flos": 70150974013440.0, + "grad_norm": 1.1651618479175945, + "language_loss": 0.59517723, + "learning_rate": 3.213100917627104e-06, + "loss": 0.73809338, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 14.3671875, + "step": 147, + "time_per_iteration": 3.3949942588806152 + }, + { + "auxiliary_loss_clip": 0.11677637, + "auxiliary_loss_mlp": 0.16713935, + "balance_loss_clip": 0.08274397, + "balance_loss_mlp": 0.02199776, + "epoch": 0.008898241394859462, + "flos": 20050510705920.0, + "grad_norm": 1889.1884601694564, + "language_loss": 4.35780334, + "learning_rate": 3.2174660338961135e-06, + "loss": 4.64171886, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 33.96875, + "router_z_loss_mlp": 145.25, + "step": 148, + "time_per_iteration": 2.7146079540252686 + }, + { + "auxiliary_loss_clip": 0.1159438, + "auxiliary_loss_mlp": 0.16573352, + "balance_loss_clip": 0.08248326, + "balance_loss_mlp": 0.02217881, + "epoch": 0.008958364647527431, + "flos": 10748217980160.0, + "grad_norm": 637.0991660467967, + "language_loss": 4.14174032, + "learning_rate": 3.2218017552198588e-06, + "loss": 4.42341805, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 143.625, + "step": 149, + "time_per_iteration": 2.661672353744507 + }, + { + "auxiliary_loss_clip": 0.11618437, + "auxiliary_loss_mlp": 0.16563556, + "balance_loss_clip": 0.08263792, + "balance_loss_mlp": 0.02201984, + "epoch": 0.009018487900195401, + "flos": 29134317110400.0, + "grad_norm": 1769.3998229499293, + "language_loss": 4.95698929, + "learning_rate": 3.226108474846181e-06, + "loss": 5.23880959, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 33.5625, + "router_z_loss_mlp": 143.6875, + "step": 150, + "time_per_iteration": 2.7311227321624756 + }, + { + "auxiliary_loss_clip": 0.11585926, + "auxiliary_loss_mlp": 0.16123089, + "balance_loss_clip": 0.08249478, + "balance_loss_mlp": 0.02219281, + "epoch": 0.00907861115286337, + "flos": 32972020035840.0, + "grad_norm": 2114.6136002652206, + "language_loss": 3.36094427, + "learning_rate": 3.2303865781839817e-06, + "loss": 3.63803458, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 139.125, + "step": 151, + "time_per_iteration": 2.7520253658294678 + }, + { + "auxiliary_loss_clip": 0.115492, + "auxiliary_loss_mlp": 0.15748456, + "balance_loss_clip": 0.08239767, + "balance_loss_mlp": 0.02198652, + "epoch": 0.009138734405531338, + "flos": 21768911838720.0, + "grad_norm": 3311.474565423633, + "language_loss": 3.73547316, + "learning_rate": 3.234636443010188e-06, + "loss": 4.00844955, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 135.625, + "step": 152, + "time_per_iteration": 2.694563865661621 + }, + { + "auxiliary_loss_clip": 0.1159073, + "auxiliary_loss_mlp": 0.1623821, + "balance_loss_clip": 0.08250044, + "balance_loss_mlp": 0.02248952, + "epoch": 0.009198857658199309, + "flos": 20847532343040.0, + "grad_norm": 1087.0956983151382, + "language_loss": 3.84302998, + "learning_rate": 3.238858439669943e-06, + "loss": 4.12131977, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 33.40625, + "router_z_loss_mlp": 139.875, + "step": 153, + "time_per_iteration": 2.6366450786590576 + }, + { + "auxiliary_loss_clip": 0.11564142, + "auxiliary_loss_mlp": 0.15476364, + "balance_loss_clip": 0.08260261, + "balance_loss_mlp": 0.02207321, + "epoch": 0.009258980910867277, + "flos": 24834386736000.0, + "grad_norm": 8366.148944916698, + "language_loss": 4.13687325, + "learning_rate": 3.2430529312702712e-06, + "loss": 4.40727806, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 132.8125, + "step": 154, + "time_per_iteration": 2.7312138080596924 + }, + { + "auxiliary_loss_clip": 0.11535051, + "auxiliary_loss_mlp": 0.15077396, + "balance_loss_clip": 0.08268774, + "balance_loss_mlp": 0.02198978, + "epoch": 0.009319104163535248, + "flos": 28775442072960.0, + "grad_norm": 662.1258045248602, + "language_loss": 4.14579964, + "learning_rate": 3.2472202738674737e-06, + "loss": 4.41192484, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 128.6875, + "step": 155, + "time_per_iteration": 2.755199909210205 + }, + { + "auxiliary_loss_clip": 0.11566834, + "auxiliary_loss_mlp": 0.15004471, + "balance_loss_clip": 0.08261703, + "balance_loss_mlp": 0.02193191, + "epoch": 0.009379227416203216, + "flos": 16587698947200.0, + "grad_norm": 731.5664855161135, + "language_loss": 3.49704862, + "learning_rate": 3.2513608166485063e-06, + "loss": 3.76276183, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 33.09375, + "router_z_loss_mlp": 128.125, + "step": 156, + "time_per_iteration": 2.7707407474517822 + }, + { + "auxiliary_loss_clip": 0.11568415, + "auxiliary_loss_mlp": 0.15332887, + "balance_loss_clip": 0.08266081, + "balance_loss_mlp": 0.02216432, + "epoch": 0.009439350668871187, + "flos": 18335337955200.0, + "grad_norm": 795.683005311381, + "language_loss": 3.94911337, + "learning_rate": 3.2554749021065498e-06, + "loss": 4.2181263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 32.96875, + "router_z_loss_mlp": 131.25, + "step": 157, + "time_per_iteration": 2.6737098693847656 + }, + { + "auxiliary_loss_clip": 0.11567172, + "auxiliary_loss_mlp": 0.15600383, + "balance_loss_clip": 0.0828969, + "balance_loss_mlp": 0.02264203, + "epoch": 0.009499473921539155, + "flos": 24356310865920.0, + "grad_norm": 748.6515809747107, + "language_loss": 3.9944849, + "learning_rate": 3.2595628662110186e-06, + "loss": 4.26616049, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 32.75, + "router_z_loss_mlp": 133.5625, + "step": 158, + "time_per_iteration": 2.6704254150390625 + }, + { + "auxiliary_loss_clip": 0.11561831, + "auxiliary_loss_mlp": 0.15665153, + "balance_loss_clip": 0.08273103, + "balance_loss_mlp": 0.02231314, + "epoch": 0.009559597174207124, + "flos": 16404949192320.0, + "grad_norm": 1901.311070356518, + "language_loss": 3.80921197, + "learning_rate": 3.2636250385721982e-06, + "loss": 4.08148146, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 32.90625, + "router_z_loss_mlp": 134.4375, + "step": 159, + "time_per_iteration": 2.6218996047973633 + }, + { + "auxiliary_loss_clip": 0.11580203, + "auxiliary_loss_mlp": 0.15643886, + "balance_loss_clip": 0.08278053, + "balance_loss_mlp": 0.02252773, + "epoch": 0.009619720426875094, + "flos": 22863523651200.0, + "grad_norm": 1785.522909187837, + "language_loss": 3.8831954, + "learning_rate": 3.2676617426007263e-06, + "loss": 4.15543652, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 134.0, + "step": 160, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.11567888, + "auxiliary_loss_mlp": 0.15128596, + "balance_loss_clip": 0.08280417, + "balance_loss_mlp": 0.02237971, + "epoch": 0.009679843679543063, + "flos": 19140954635520.0, + "grad_norm": 1894.5705497879367, + "language_loss": 4.38242626, + "learning_rate": 3.2716732956621042e-06, + "loss": 4.6493907, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 32.890625, + "router_z_loss_mlp": 129.0, + "step": 161, + "time_per_iteration": 2.692594289779663 + }, + { + "auxiliary_loss_clip": 0.11596949, + "auxiliary_loss_mlp": 0.15413821, + "balance_loss_clip": 0.08296333, + "balance_loss_mlp": 0.02279055, + "epoch": 0.009739966932211033, + "flos": 20309219786880.0, + "grad_norm": 1092.6315431795774, + "language_loss": 3.67637897, + "learning_rate": 3.2756600092264203e-06, + "loss": 3.94648647, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 33.0, + "router_z_loss_mlp": 131.4375, + "step": 162, + "time_per_iteration": 2.684589147567749 + }, + { + "auxiliary_loss_clip": 0.10812573, + "auxiliary_loss_mlp": 0.02121325, + "balance_loss_clip": 0.08169468, + "balance_loss_mlp": 0.01469775, + "epoch": 0.009800090184879002, + "flos": 67053200567040.0, + "grad_norm": 1.455168404801105, + "language_loss": 0.72263706, + "learning_rate": 3.279622189013474e-06, + "loss": 0.85197604, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 26.484375, + "router_z_loss_mlp": 6.515625, + "step": 163, + "time_per_iteration": 3.2609994411468506 + }, + { + "auxiliary_loss_clip": 0.1158057, + "auxiliary_loss_mlp": 0.15459523, + "balance_loss_clip": 0.08303102, + "balance_loss_mlp": 0.02282033, + "epoch": 0.00986021343754697, + "flos": 17170301185920.0, + "grad_norm": 728.8786194893343, + "language_loss": 3.07243919, + "learning_rate": 3.283560135133457e-06, + "loss": 3.34283996, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 32.765625, + "router_z_loss_mlp": 131.8125, + "step": 164, + "time_per_iteration": 2.6558001041412354 + }, + { + "auxiliary_loss_clip": 0.11589515, + "auxiliary_loss_mlp": 0.15754591, + "balance_loss_clip": 0.08312181, + "balance_loss_mlp": 0.02308546, + "epoch": 0.00992033669021494, + "flos": 17755293265920.0, + "grad_norm": 847.0745501241739, + "language_loss": 3.51890922, + "learning_rate": 3.2874741422233565e-06, + "loss": 3.79235029, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 32.78125, + "router_z_loss_mlp": 134.4375, + "step": 165, + "time_per_iteration": 2.661271095275879 + }, + { + "auxiliary_loss_clip": 0.11568248, + "auxiliary_loss_mlp": 0.15508898, + "balance_loss_clip": 0.08301617, + "balance_loss_mlp": 0.02294787, + "epoch": 0.00998045994288291, + "flos": 25303490219520.0, + "grad_norm": 327.0790624727143, + "language_loss": 3.23893571, + "learning_rate": 3.2913644995792465e-06, + "loss": 3.50970697, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 32.6875, + "router_z_loss_mlp": 132.3125, + "step": 166, + "time_per_iteration": 2.710336923599243 + }, + { + "auxiliary_loss_clip": 0.11574914, + "auxiliary_loss_mlp": 0.14880663, + "balance_loss_clip": 0.08314175, + "balance_loss_mlp": 0.02301317, + "epoch": 0.01004058319555088, + "flos": 32305869676800.0, + "grad_norm": 776.5856268380442, + "language_loss": 4.07326555, + "learning_rate": 3.2952314912845914e-06, + "loss": 4.33782148, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 32.609375, + "router_z_loss_mlp": 125.8125, + "step": 167, + "time_per_iteration": 2.779219150543213 + }, + { + "auxiliary_loss_clip": 0.1150827, + "auxiliary_loss_mlp": 0.15720402, + "balance_loss_clip": 0.083069, + "balance_loss_mlp": 0.02304874, + "epoch": 0.010100706448218848, + "flos": 11323399132800.0, + "grad_norm": 2394.835407434967, + "language_loss": 3.28905821, + "learning_rate": 3.299075396334735e-06, + "loss": 3.5613451, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 32.0, + "router_z_loss_mlp": 134.25, + "step": 168, + "time_per_iteration": 2.6511645317077637 + }, + { + "auxiliary_loss_clip": 0.11477365, + "auxiliary_loss_mlp": 0.1529358, + "balance_loss_clip": 0.08283502, + "balance_loss_mlp": 0.02299196, + "epoch": 0.010160829700886819, + "flos": 29727820379520.0, + "grad_norm": 656.1528496227621, + "language_loss": 3.4663558, + "learning_rate": 3.3028964887576868e-06, + "loss": 3.73406529, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 31.921875, + "router_z_loss_mlp": 130.0, + "step": 169, + "time_per_iteration": 2.744943141937256 + }, + { + "auxiliary_loss_clip": 0.1151928, + "auxiliary_loss_mlp": 0.1559048, + "balance_loss_clip": 0.08316396, + "balance_loss_mlp": 0.02315333, + "epoch": 0.010220952953554787, + "flos": 20418567765120.0, + "grad_norm": 1313.5821328962659, + "language_loss": 3.30928183, + "learning_rate": 3.306695037731344e-06, + "loss": 3.58037925, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 32.03125, + "router_z_loss_mlp": 132.75, + "step": 170, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.11476055, + "auxiliary_loss_mlp": 0.14880618, + "balance_loss_clip": 0.08295664, + "balance_loss_mlp": 0.02301271, + "epoch": 0.010281076206222756, + "flos": 31293170830080.0, + "grad_norm": 1393.3935417181144, + "language_loss": 3.61100364, + "learning_rate": 3.3104713076972827e-06, + "loss": 3.87457037, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 31.84375, + "router_z_loss_mlp": 125.75, + "step": 171, + "time_per_iteration": 2.7253830432891846 + }, + { + "auxiliary_loss_clip": 0.11506656, + "auxiliary_loss_mlp": 0.15002409, + "balance_loss_clip": 0.08299719, + "balance_loss_mlp": 0.02294889, + "epoch": 0.010341199458890726, + "flos": 21988949460480.0, + "grad_norm": 857.6014739419991, + "language_loss": 3.63604832, + "learning_rate": 3.314225558471224e-06, + "loss": 3.90113878, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 32.015625, + "router_z_loss_mlp": 127.1875, + "step": 172, + "time_per_iteration": 2.687918186187744 + }, + { + "auxiliary_loss_clip": 0.11501465, + "auxiliary_loss_mlp": 0.15934135, + "balance_loss_clip": 0.08304699, + "balance_loss_mlp": 0.02359916, + "epoch": 0.010401322711558695, + "flos": 30818449123200.0, + "grad_norm": 2776.6711688344126, + "language_loss": 3.43709183, + "learning_rate": 3.317958045350308e-06, + "loss": 3.71144772, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 31.9375, + "router_z_loss_mlp": 135.6875, + "step": 173, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.11548179, + "auxiliary_loss_mlp": 0.15753293, + "balance_loss_clip": 0.08317138, + "balance_loss_mlp": 0.02337765, + "epoch": 0.010461445964226665, + "flos": 24721642667520.0, + "grad_norm": 1049.1047345334737, + "language_loss": 3.46181607, + "learning_rate": 3.3216690192172596e-06, + "loss": 3.73483086, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 32.28125, + "router_z_loss_mlp": 134.125, + "step": 174, + "time_per_iteration": 2.8112432956695557 + }, + { + "auxiliary_loss_clip": 0.11529493, + "auxiliary_loss_mlp": 0.16248052, + "balance_loss_clip": 0.08304952, + "balance_loss_mlp": 0.02319829, + "epoch": 0.010521569216894634, + "flos": 27717950419200.0, + "grad_norm": 1443.6409322594398, + "language_loss": 3.14877939, + "learning_rate": 3.325358726641591e-06, + "loss": 3.42655468, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 32.265625, + "router_z_loss_mlp": 139.25, + "step": 175, + "time_per_iteration": 5.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.11549105, + "auxiliary_loss_mlp": 0.15645993, + "balance_loss_clip": 0.08317456, + "balance_loss_mlp": 0.02328122, + "epoch": 0.010581692469562603, + "flos": 12463223022720.0, + "grad_norm": 956.7802143525229, + "language_loss": 3.34866667, + "learning_rate": 3.329027409977902e-06, + "loss": 3.62061763, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 133.375, + "step": 176, + "time_per_iteration": 4.057558059692383 + }, + { + "auxiliary_loss_clip": 0.11580729, + "auxiliary_loss_mlp": 0.16905147, + "balance_loss_clip": 0.08321375, + "balance_loss_mlp": 0.02378779, + "epoch": 0.010641815722230573, + "flos": 19433723201280.0, + "grad_norm": 1505.424754847227, + "language_loss": 3.25544405, + "learning_rate": 3.3326753074614087e-06, + "loss": 3.54030275, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 32.5625, + "router_z_loss_mlp": 145.25, + "step": 177, + "time_per_iteration": 4.175410032272339 + }, + { + "auxiliary_loss_clip": 0.11632887, + "auxiliary_loss_mlp": 0.17182559, + "balance_loss_clip": 0.08330977, + "balance_loss_mlp": 0.02387638, + "epoch": 0.010701938974898541, + "flos": 18338440556160.0, + "grad_norm": 1009.0094276513727, + "language_loss": 3.02760315, + "learning_rate": 3.3363026533007716e-06, + "loss": 3.31575751, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 33.046875, + "router_z_loss_mlp": 148.0, + "step": 178, + "time_per_iteration": 2.6476314067840576 + }, + { + "auxiliary_loss_clip": 0.11659138, + "auxiliary_loss_mlp": 0.17559879, + "balance_loss_clip": 0.0834986, + "balance_loss_mlp": 0.02398745, + "epoch": 0.010762062227566512, + "flos": 19209283240320.0, + "grad_norm": 645.2944722680985, + "language_loss": 3.18850112, + "learning_rate": 3.3399096777683303e-06, + "loss": 3.48069143, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 151.5, + "step": 179, + "time_per_iteration": 2.673020601272583 + }, + { + "auxiliary_loss_clip": 0.11646449, + "auxiliary_loss_mlp": 0.17152536, + "balance_loss_clip": 0.0833544, + "balance_loss_mlp": 0.02369822, + "epoch": 0.01082218548023448, + "flos": 31432553297280.0, + "grad_norm": 1138.8337468152163, + "language_loss": 3.61664343, + "learning_rate": 3.3434966072878213e-06, + "loss": 3.90463305, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 33.125, + "router_z_loss_mlp": 147.75, + "step": 180, + "time_per_iteration": 2.7129592895507812 + }, + { + "auxiliary_loss_clip": 0.1163583, + "auxiliary_loss_mlp": 0.17579561, + "balance_loss_clip": 0.08352019, + "balance_loss_mlp": 0.02406223, + "epoch": 0.01088230873290245, + "flos": 25053501962880.0, + "grad_norm": 1023.6426422721124, + "language_loss": 3.16591597, + "learning_rate": 3.3470636645196674e-06, + "loss": 3.45807004, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 32.875, + "router_z_loss_mlp": 151.5, + "step": 181, + "time_per_iteration": 2.7088735103607178 + }, + { + "auxiliary_loss_clip": 0.11667231, + "auxiliary_loss_mlp": 0.17749819, + "balance_loss_clip": 0.08358228, + "balance_loss_mlp": 0.02381167, + "epoch": 0.01094243198557042, + "flos": 22900056831360.0, + "grad_norm": 355.45097956691654, + "language_loss": 3.57462454, + "learning_rate": 3.3506110684439156e-06, + "loss": 3.86879492, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 153.625, + "step": 182, + "time_per_iteration": 2.6655702590942383 + }, + { + "auxiliary_loss_clip": 0.11774068, + "auxiliary_loss_mlp": 0.186405, + "balance_loss_clip": 0.08392486, + "balance_loss_mlp": 0.02429562, + "epoch": 0.011002555238238388, + "flos": 17170720456320.0, + "grad_norm": 544.9308642616941, + "language_loss": 3.01895189, + "learning_rate": 3.3541390344409054e-06, + "loss": 3.32309771, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 162.0, + "step": 183, + "time_per_iteration": 2.672084331512451 + }, + { + "auxiliary_loss_clip": 0.11731043, + "auxiliary_loss_mlp": 0.17741105, + "balance_loss_clip": 0.0838448, + "balance_loss_mlp": 0.02409074, + "epoch": 0.011062678490906358, + "flos": 22316783760000.0, + "grad_norm": 900.0159693716428, + "language_loss": 3.54977012, + "learning_rate": 3.357647774369736e-06, + "loss": 3.84449148, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 153.25, + "step": 184, + "time_per_iteration": 2.664008140563965 + }, + { + "auxiliary_loss_clip": 0.11698474, + "auxiliary_loss_mlp": 0.18400645, + "balance_loss_clip": 0.08363934, + "balance_loss_mlp": 0.02433849, + "epoch": 0.011122801743574327, + "flos": 24395108106240.0, + "grad_norm": 434.928327577731, + "language_loss": 3.09638596, + "learning_rate": 3.3611374966446085e-06, + "loss": 3.39737701, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 159.5, + "step": 185, + "time_per_iteration": 2.726417303085327 + }, + { + "auxiliary_loss_clip": 0.11759127, + "auxiliary_loss_mlp": 0.17777845, + "balance_loss_clip": 0.08374798, + "balance_loss_mlp": 0.02421399, + "epoch": 0.011182924996242297, + "flos": 18156110071680.0, + "grad_norm": 629.7246053366609, + "language_loss": 2.4891119, + "learning_rate": 3.3646084063091142e-06, + "loss": 2.78448153, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 153.5, + "step": 186, + "time_per_iteration": 2.694352865219116 + }, + { + "auxiliary_loss_clip": 0.11730683, + "auxiliary_loss_mlp": 0.17846453, + "balance_loss_clip": 0.08379789, + "balance_loss_mlp": 0.0240456, + "epoch": 0.011243048248910266, + "flos": 15492206666880.0, + "grad_norm": 204.67136476740635, + "language_loss": 3.6299262, + "learning_rate": 3.3680607051085194e-06, + "loss": 3.9256978, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 154.25, + "step": 187, + "time_per_iteration": 2.6440258026123047 + }, + { + "auxiliary_loss_clip": 0.11782947, + "auxiliary_loss_mlp": 0.18885629, + "balance_loss_clip": 0.08391893, + "balance_loss_mlp": 0.02454964, + "epoch": 0.011303171501578235, + "flos": 40926442383360.0, + "grad_norm": 245.45256433797323, + "language_loss": 2.78124428, + "learning_rate": 3.371494591560139e-06, + "loss": 3.0879302, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 164.25, + "step": 188, + "time_per_iteration": 2.8504083156585693 + }, + { + "auxiliary_loss_clip": 0.10094331, + "auxiliary_loss_mlp": 0.0271045, + "balance_loss_clip": 0.08081996, + "balance_loss_mlp": 0.01840699, + "epoch": 0.011363294754246205, + "flos": 66321237225600.0, + "grad_norm": 2.5418158680058287, + "language_loss": 0.5572542, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.68530196, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 20.140625, + "router_z_loss_mlp": 8.71875, + "step": 189, + "time_per_iteration": 3.351346492767334 + }, + { + "auxiliary_loss_clip": 0.11787133, + "auxiliary_loss_mlp": 0.18362574, + "balance_loss_clip": 0.08391854, + "balance_loss_mlp": 0.02444606, + "epoch": 0.011423418006914174, + "flos": 24907285388160.0, + "grad_norm": 1404.1743205968703, + "language_loss": 3.09611416, + "learning_rate": 3.3783079057586833e-06, + "loss": 3.39761114, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 34.0, + "router_z_loss_mlp": 159.125, + "step": 190, + "time_per_iteration": 2.7106430530548096 + }, + { + "auxiliary_loss_clip": 0.11759384, + "auxiliary_loss_mlp": 0.1804318, + "balance_loss_clip": 0.08374631, + "balance_loss_mlp": 0.02442593, + "epoch": 0.011483541259582144, + "flos": 19797964899840.0, + "grad_norm": 958.8286854390585, + "language_loss": 3.06252718, + "learning_rate": 3.3816877150079665e-06, + "loss": 3.36055326, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 156.0, + "step": 191, + "time_per_iteration": 2.6592226028442383 + }, + { + "auxiliary_loss_clip": 0.11741614, + "auxiliary_loss_mlp": 0.17628413, + "balance_loss_clip": 0.08397849, + "balance_loss_mlp": 0.02442867, + "epoch": 0.011543664512250112, + "flos": 26184101904000.0, + "grad_norm": 872.0200851454543, + "language_loss": 3.40287876, + "learning_rate": 3.385049875042367e-06, + "loss": 3.69657874, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 151.625, + "step": 192, + "time_per_iteration": 2.7246127128601074 + }, + { + "auxiliary_loss_clip": 0.11744646, + "auxiliary_loss_mlp": 0.1831618, + "balance_loss_clip": 0.08387344, + "balance_loss_mlp": 0.02459247, + "epoch": 0.011603787764918083, + "flos": 23775763052160.0, + "grad_norm": 255.22859463919886, + "language_loss": 3.03195429, + "learning_rate": 3.3883945692315938e-06, + "loss": 3.33256245, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 33.59375, + "router_z_loss_mlp": 158.375, + "step": 193, + "time_per_iteration": 2.683800220489502 + }, + { + "auxiliary_loss_clip": 0.11792802, + "auxiliary_loss_mlp": 0.18172303, + "balance_loss_clip": 0.08409159, + "balance_loss_mlp": 0.02449647, + "epoch": 0.011663911017586051, + "flos": 25961255170560.0, + "grad_norm": 151.45813274947093, + "language_loss": 3.26517797, + "learning_rate": 3.3917219781023906e-06, + "loss": 3.56482911, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 33.875, + "router_z_loss_mlp": 157.0, + "step": 194, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.11706592, + "auxiliary_loss_mlp": 0.17706957, + "balance_loss_clip": 0.08367997, + "balance_loss_mlp": 0.0244817, + "epoch": 0.01172403427025402, + "flos": 17901006716160.0, + "grad_norm": 341.36308265873936, + "language_loss": 3.21669102, + "learning_rate": 3.3950322793970014e-06, + "loss": 3.51082659, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 33.375, + "router_z_loss_mlp": 152.25, + "step": 195, + "time_per_iteration": 2.6620969772338867 + }, + { + "auxiliary_loss_clip": 0.11741272, + "auxiliary_loss_mlp": 0.18081686, + "balance_loss_clip": 0.08387178, + "balance_loss_mlp": 0.02468893, + "epoch": 0.01178415752292199, + "flos": 17900293956480.0, + "grad_norm": 232.42067340374058, + "language_loss": 3.00283194, + "learning_rate": 3.3983256481301445e-06, + "loss": 3.30106115, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 33.53125, + "router_z_loss_mlp": 156.0, + "step": 196, + "time_per_iteration": 2.608747720718384 + }, + { + "auxiliary_loss_clip": 0.11721249, + "auxiliary_loss_mlp": 0.17373422, + "balance_loss_clip": 0.08370736, + "balance_loss_mlp": 0.02444223, + "epoch": 0.011844280775589959, + "flos": 22900224539520.0, + "grad_norm": 115.37051275011517, + "language_loss": 2.93469787, + "learning_rate": 3.4016022566445335e-06, + "loss": 3.22564435, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 33.5, + "router_z_loss_mlp": 149.0, + "step": 197, + "time_per_iteration": 2.6884865760803223 + }, + { + "auxiliary_loss_clip": 0.11780085, + "auxiliary_loss_mlp": 0.17500654, + "balance_loss_clip": 0.08412851, + "balance_loss_mlp": 0.02486004, + "epoch": 0.01190440402825793, + "flos": 26987748013440.0, + "grad_norm": 594.5655905086047, + "language_loss": 2.93459964, + "learning_rate": 3.4048622746649966e-06, + "loss": 3.22740698, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 33.65625, + "router_z_loss_mlp": 150.25, + "step": 198, + "time_per_iteration": 2.7313427925109863 + }, + { + "auxiliary_loss_clip": 0.11754367, + "auxiliary_loss_mlp": 0.16903168, + "balance_loss_clip": 0.08420561, + "balance_loss_mlp": 0.02462251, + "epoch": 0.011964527280925898, + "flos": 20527789962240.0, + "grad_norm": 145.17481727818333, + "language_loss": 2.84690857, + "learning_rate": 3.4081058693512278e-06, + "loss": 3.13348389, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 33.34375, + "router_z_loss_mlp": 144.5, + "step": 199, + "time_per_iteration": 2.688974618911743 + }, + { + "auxiliary_loss_clip": 0.11798929, + "auxiliary_loss_mlp": 0.17447452, + "balance_loss_clip": 0.08422767, + "balance_loss_mlp": 0.02481632, + "epoch": 0.012024650533593867, + "flos": 27753435423360.0, + "grad_norm": 82.0113766879368, + "language_loss": 2.56142473, + "learning_rate": 3.411333205349222e-06, + "loss": 2.85388851, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 33.75, + "router_z_loss_mlp": 149.5, + "step": 200, + "time_per_iteration": 2.745638608932495 + }, + { + "auxiliary_loss_clip": 0.11760798, + "auxiliary_loss_mlp": 0.1661135, + "balance_loss_clip": 0.08439215, + "balance_loss_mlp": 0.02475607, + "epoch": 0.012084773786261837, + "flos": 10456623371520.0, + "grad_norm": 81.29107841083456, + "language_loss": 2.49306059, + "learning_rate": 3.4145444448414217e-06, + "loss": 2.77678204, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 141.375, + "step": 201, + "time_per_iteration": 2.7527854442596436 + }, + { + "auxiliary_loss_clip": 0.1174719, + "auxiliary_loss_mlp": 0.16602293, + "balance_loss_clip": 0.08432734, + "balance_loss_mlp": 0.02490965, + "epoch": 0.012144897038929806, + "flos": 23111331701760.0, + "grad_norm": 843.8800494285322, + "language_loss": 2.70319819, + "learning_rate": 3.4177397475956223e-06, + "loss": 2.98669291, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 33.21875, + "router_z_loss_mlp": 141.125, + "step": 202, + "time_per_iteration": 2.739138603210449 + }, + { + "auxiliary_loss_clip": 0.11772437, + "auxiliary_loss_mlp": 0.16814882, + "balance_loss_clip": 0.08448092, + "balance_loss_mlp": 0.02483826, + "epoch": 0.012205020291597776, + "flos": 21039631827840.0, + "grad_norm": 111.22984226607618, + "language_loss": 2.69834185, + "learning_rate": 3.4209192710126685e-06, + "loss": 2.98421502, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 33.25, + "router_z_loss_mlp": 143.375, + "step": 203, + "time_per_iteration": 2.6849801540374756 + }, + { + "auxiliary_loss_clip": 0.09996115, + "auxiliary_loss_mlp": 0.01763683, + "balance_loss_clip": 0.08022483, + "balance_loss_mlp": 0.01355129, + "epoch": 0.012265143544265745, + "flos": 68465416481280.0, + "grad_norm": 2.5939001011358327, + "language_loss": 0.60663998, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.72423798, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 4.08984375, + "step": 204, + "time_per_iteration": 3.218200922012329 + }, + { + "auxiliary_loss_clip": 0.11829591, + "auxiliary_loss_mlp": 0.16426852, + "balance_loss_clip": 0.08460154, + "balance_loss_mlp": 0.02486424, + "epoch": 0.012325266796933715, + "flos": 17024923152000.0, + "grad_norm": 175.923318576614, + "language_loss": 2.6947825, + "learning_rate": 3.4272315978819516e-06, + "loss": 2.9773469, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 33.6875, + "router_z_loss_mlp": 139.5, + "step": 205, + "time_per_iteration": 2.6580400466918945 + }, + { + "auxiliary_loss_clip": 0.11821875, + "auxiliary_loss_mlp": 0.15477848, + "balance_loss_clip": 0.0845597, + "balance_loss_mlp": 0.02483464, + "epoch": 0.012385390049601683, + "flos": 20195679104640.0, + "grad_norm": 179.20336452265943, + "language_loss": 2.76609898, + "learning_rate": 3.4303647047142043e-06, + "loss": 3.03909636, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 33.71875, + "router_z_loss_mlp": 130.0625, + "step": 206, + "time_per_iteration": 2.732661724090576 + }, + { + "auxiliary_loss_clip": 0.11876252, + "auxiliary_loss_mlp": 0.15609139, + "balance_loss_clip": 0.0847889, + "balance_loss_mlp": 0.02498787, + "epoch": 0.012445513302269652, + "flos": 16258690690560.0, + "grad_norm": 37.57079461410369, + "language_loss": 2.63663292, + "learning_rate": 3.43348263905683e-06, + "loss": 2.91148686, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 33.9375, + "router_z_loss_mlp": 131.25, + "step": 207, + "time_per_iteration": 2.655898332595825 + }, + { + "auxiliary_loss_clip": 0.11858118, + "auxiliary_loss_mlp": 0.15964949, + "balance_loss_clip": 0.08469288, + "balance_loss_mlp": 0.02500593, + "epoch": 0.012505636554937622, + "flos": 23776224249600.0, + "grad_norm": 80.16610328924297, + "language_loss": 2.31757832, + "learning_rate": 3.436585547151547e-06, + "loss": 2.59580898, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 33.90625, + "router_z_loss_mlp": 134.8125, + "step": 208, + "time_per_iteration": 2.7096707820892334 + }, + { + "auxiliary_loss_clip": 0.11891477, + "auxiliary_loss_mlp": 0.15333374, + "balance_loss_clip": 0.08512411, + "balance_loss_mlp": 0.02509888, + "epoch": 0.012565759807605591, + "flos": 30599417750400.0, + "grad_norm": 94.61742092763181, + "language_loss": 2.89340639, + "learning_rate": 3.4396735731358586e-06, + "loss": 3.16565466, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 33.8125, + "router_z_loss_mlp": 128.3125, + "step": 209, + "time_per_iteration": 2.7260549068450928 + }, + { + "auxiliary_loss_clip": 0.11866176, + "auxiliary_loss_mlp": 0.14843261, + "balance_loss_clip": 0.08489646, + "balance_loss_mlp": 0.02508056, + "epoch": 0.012625883060273561, + "flos": 40122838200960.0, + "grad_norm": 70.02885877178691, + "language_loss": 2.47040462, + "learning_rate": 3.4427468590832302e-06, + "loss": 2.737499, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 123.375, + "step": 210, + "time_per_iteration": 2.8969995975494385 + }, + { + "auxiliary_loss_clip": 0.1188697, + "auxiliary_loss_mlp": 0.14057073, + "balance_loss_clip": 0.08471721, + "balance_loss_mlp": 0.02497014, + "epoch": 0.01268600631294153, + "flos": 27096509013120.0, + "grad_norm": 122.06391807709156, + "language_loss": 2.54189563, + "learning_rate": 3.445805545042314e-06, + "loss": 2.80133629, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 115.625, + "step": 211, + "time_per_iteration": 2.708080768585205 + }, + { + "auxiliary_loss_clip": 0.11883873, + "auxiliary_loss_mlp": 0.13339609, + "balance_loss_clip": 0.08499163, + "balance_loss_mlp": 0.02499764, + "epoch": 0.012746129565609499, + "flos": 16988431898880.0, + "grad_norm": 126.44131700603937, + "language_loss": 2.37998009, + "learning_rate": 3.448849769075239e-06, + "loss": 2.63221502, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 33.84375, + "router_z_loss_mlp": 108.375, + "step": 212, + "time_per_iteration": 2.6480045318603516 + }, + { + "auxiliary_loss_clip": 0.11928719, + "auxiliary_loss_mlp": 0.13044119, + "balance_loss_clip": 0.08510935, + "balance_loss_mlp": 0.02497243, + "epoch": 0.012806252818277469, + "flos": 46543621668480.0, + "grad_norm": 186.42729164055353, + "language_loss": 2.21970725, + "learning_rate": 3.4518796672950093e-06, + "loss": 2.46943569, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 34.15625, + "router_z_loss_mlp": 105.5625, + "step": 213, + "time_per_iteration": 2.871330738067627 + }, + { + "auxiliary_loss_clip": 0.119517, + "auxiliary_loss_mlp": 0.12083894, + "balance_loss_clip": 0.08513753, + "balance_loss_mlp": 0.02489167, + "epoch": 0.012866376070945438, + "flos": 14393234442240.0, + "grad_norm": 59.129237382202305, + "language_loss": 2.15201378, + "learning_rate": 3.4548953739020187e-06, + "loss": 2.39236999, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 34.40625, + "router_z_loss_mlp": 95.9375, + "step": 214, + "time_per_iteration": 2.677279472351074 + }, + { + "auxiliary_loss_clip": 0.11979187, + "auxiliary_loss_mlp": 0.11437444, + "balance_loss_clip": 0.08527225, + "balance_loss_mlp": 0.02483585, + "epoch": 0.012926499323613408, + "flos": 26148029921280.0, + "grad_norm": 82.8472801825022, + "language_loss": 2.01005268, + "learning_rate": 3.4578970212197196e-06, + "loss": 2.24421906, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 34.5, + "router_z_loss_mlp": 89.625, + "step": 215, + "time_per_iteration": 5.505565881729126 + }, + { + "auxiliary_loss_clip": 0.11977073, + "auxiliary_loss_mlp": 0.10736242, + "balance_loss_clip": 0.08518873, + "balance_loss_mlp": 0.02484289, + "epoch": 0.012986622576281377, + "flos": 30124989532800.0, + "grad_norm": 444.29299491343255, + "language_loss": 2.23052669, + "learning_rate": 3.460884739729461e-06, + "loss": 2.45765996, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 82.5, + "step": 216, + "time_per_iteration": 4.0875208377838135 + }, + { + "auxiliary_loss_clip": 0.11978886, + "auxiliary_loss_mlp": 0.10150906, + "balance_loss_clip": 0.0852896, + "balance_loss_mlp": 0.02478787, + "epoch": 0.013046745828949347, + "flos": 13959112838400.0, + "grad_norm": 45.21271501184753, + "language_loss": 2.33321786, + "learning_rate": 3.463858658104523e-06, + "loss": 2.55451584, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 34.46875, + "router_z_loss_mlp": 76.625, + "step": 217, + "time_per_iteration": 4.032313585281372 + }, + { + "auxiliary_loss_clip": 0.11990365, + "auxiliary_loss_mlp": 0.09330522, + "balance_loss_clip": 0.08498306, + "balance_loss_mlp": 0.02482377, + "epoch": 0.013106869081617315, + "flos": 17353595992320.0, + "grad_norm": 48.7496700865691, + "language_loss": 2.077981, + "learning_rate": 3.4668189032433696e-06, + "loss": 2.29119015, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 68.625, + "step": 218, + "time_per_iteration": 2.655488967895508 + }, + { + "auxiliary_loss_clip": 0.12044869, + "auxiliary_loss_mlp": 0.08778962, + "balance_loss_clip": 0.08527655, + "balance_loss_mlp": 0.02477083, + "epoch": 0.013166992334285284, + "flos": 25892004170880.0, + "grad_norm": 58.49845250600888, + "language_loss": 2.1651845, + "learning_rate": 3.46976560030214e-06, + "loss": 2.3734231, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 63.0, + "step": 219, + "time_per_iteration": 2.7416553497314453 + }, + { + "auxiliary_loss_clip": 0.12097923, + "auxiliary_loss_mlp": 0.08351351, + "balance_loss_clip": 0.08555256, + "balance_loss_mlp": 0.0248282, + "epoch": 0.013227115586953254, + "flos": 31184032487040.0, + "grad_norm": 65.30096795058861, + "language_loss": 2.22661948, + "learning_rate": 3.4726988727263976e-06, + "loss": 2.43111229, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 35.40625, + "router_z_loss_mlp": 58.625, + "step": 220, + "time_per_iteration": 2.825364351272583 + }, + { + "auxiliary_loss_clip": 0.12091806, + "auxiliary_loss_mlp": 0.07555279, + "balance_loss_clip": 0.08557573, + "balance_loss_mlp": 0.02477154, + "epoch": 0.013287238839621223, + "flos": 20415213601920.0, + "grad_norm": 85.51848477504389, + "language_loss": 2.08907223, + "learning_rate": 3.475618842282164e-06, + "loss": 2.2855432, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 35.375, + "router_z_loss_mlp": 50.75, + "step": 221, + "time_per_iteration": 2.699341058731079 + }, + { + "auxiliary_loss_clip": 0.12102397, + "auxiliary_loss_mlp": 0.07188272, + "balance_loss_clip": 0.08552121, + "balance_loss_mlp": 0.02482462, + "epoch": 0.013347362092289193, + "flos": 14142365717760.0, + "grad_norm": 45.70301732891132, + "language_loss": 2.16536474, + "learning_rate": 3.4785256290862486e-06, + "loss": 2.3582716, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 35.5, + "router_z_loss_mlp": 47.0, + "step": 222, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.12141806, + "auxiliary_loss_mlp": 0.06919794, + "balance_loss_clip": 0.08555885, + "balance_loss_mlp": 0.0248864, + "epoch": 0.013407485344957162, + "flos": 21803977572480.0, + "grad_norm": 133.93360024755185, + "language_loss": 2.13315558, + "learning_rate": 3.481419351635897e-06, + "loss": 2.32377172, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 35.84375, + "router_z_loss_mlp": 44.375, + "step": 223, + "time_per_iteration": 2.677440881729126 + }, + { + "auxiliary_loss_clip": 0.12133283, + "auxiliary_loss_mlp": 0.06662595, + "balance_loss_clip": 0.08527759, + "balance_loss_mlp": 0.0248779, + "epoch": 0.013467608597625132, + "flos": 18627058344960.0, + "grad_norm": 45.82649386348146, + "language_loss": 2.04508209, + "learning_rate": 3.484300126837776e-06, + "loss": 2.23304057, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 36.0, + "router_z_loss_mlp": 41.71875, + "step": 224, + "time_per_iteration": 2.647221803665161 + }, + { + "auxiliary_loss_clip": 0.12132762, + "auxiliary_loss_mlp": 0.06591167, + "balance_loss_clip": 0.0855926, + "balance_loss_mlp": 0.02489604, + "epoch": 0.013527731850293101, + "flos": 18558352396800.0, + "grad_norm": 35.4602333373948, + "language_loss": 1.96751869, + "learning_rate": 3.487168070036317e-06, + "loss": 2.15475798, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 41.0, + "step": 225, + "time_per_iteration": 2.6572558879852295 + }, + { + "auxiliary_loss_clip": 0.12111218, + "auxiliary_loss_mlp": 0.06338836, + "balance_loss_clip": 0.08540972, + "balance_loss_mlp": 0.02487518, + "epoch": 0.01358785510296107, + "flos": 19170318291840.0, + "grad_norm": 35.010295897234684, + "language_loss": 2.14010954, + "learning_rate": 3.4900232950414224e-06, + "loss": 2.32460999, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 35.6875, + "router_z_loss_mlp": 38.46875, + "step": 226, + "time_per_iteration": 2.6925666332244873 + }, + { + "auxiliary_loss_clip": 0.12106597, + "auxiliary_loss_mlp": 0.06106333, + "balance_loss_clip": 0.08537765, + "balance_loss_mlp": 0.02477793, + "epoch": 0.01364797835562904, + "flos": 23336442495360.0, + "grad_norm": 62.289483146556975, + "language_loss": 1.89336014, + "learning_rate": 3.4928659141555727e-06, + "loss": 2.07548952, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 35.71875, + "router_z_loss_mlp": 36.25, + "step": 227, + "time_per_iteration": 2.662459373474121 + }, + { + "auxiliary_loss_clip": 0.09852038, + "auxiliary_loss_mlp": 0.02028254, + "balance_loss_clip": 0.08093569, + "balance_loss_mlp": 0.01678827, + "epoch": 0.013708101608297009, + "flos": 71016561089280.0, + "grad_norm": 1.118625578373922, + "language_loss": 0.572559, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6913619, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 17.53125, + "router_z_loss_mlp": 3.49804688, + "step": 228, + "time_per_iteration": 3.3785295486450195 + }, + { + "auxiliary_loss_clip": 0.12056112, + "auxiliary_loss_mlp": 0.05858175, + "balance_loss_clip": 0.08522452, + "balance_loss_mlp": 0.02485983, + "epoch": 0.013768224860964979, + "flos": 16330583093760.0, + "grad_norm": 67.20403392826273, + "language_loss": 1.83727443, + "learning_rate": 3.4985137765422354e-06, + "loss": 2.0164175, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 35.34375, + "router_z_loss_mlp": 33.765625, + "step": 229, + "time_per_iteration": 2.6247904300689697 + }, + { + "auxiliary_loss_clip": 0.11999249, + "auxiliary_loss_mlp": 0.05601757, + "balance_loss_clip": 0.08509874, + "balance_loss_mlp": 0.02482861, + "epoch": 0.013828348113632948, + "flos": 20199159048960.0, + "grad_norm": 53.50045183346903, + "language_loss": 1.8795563, + "learning_rate": 3.501319237118231e-06, + "loss": 2.05556631, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 34.9375, + "router_z_loss_mlp": 31.1875, + "step": 230, + "time_per_iteration": 2.7507057189941406 + }, + { + "auxiliary_loss_clip": 0.12064129, + "auxiliary_loss_mlp": 0.05470717, + "balance_loss_clip": 0.08557475, + "balance_loss_mlp": 0.02487624, + "epoch": 0.013888471366300916, + "flos": 20747408313600.0, + "grad_norm": 34.266749882440614, + "language_loss": 1.64469385, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.82004225, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 35.09375, + "router_z_loss_mlp": 29.796875, + "step": 231, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.12051, + "auxiliary_loss_mlp": 0.05321148, + "balance_loss_clip": 0.08549553, + "balance_loss_mlp": 0.02486065, + "epoch": 0.013948594618968886, + "flos": 22097123481600.0, + "grad_norm": 189.27377216215737, + "language_loss": 1.70564377, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.87936521, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 35.0, + "router_z_loss_mlp": 28.34375, + "step": 232, + "time_per_iteration": 2.6656322479248047 + }, + { + "auxiliary_loss_clip": 0.12035383, + "auxiliary_loss_mlp": 0.0510756, + "balance_loss_clip": 0.08542152, + "balance_loss_mlp": 0.02483049, + "epoch": 0.014008717871636855, + "flos": 19069229940480.0, + "grad_norm": 76.31242813901656, + "language_loss": 1.64492762, + "learning_rate": 3.509663010692652e-06, + "loss": 1.81635702, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 34.96875, + "router_z_loss_mlp": 26.25, + "step": 233, + "time_per_iteration": 2.6354150772094727 + }, + { + "auxiliary_loss_clip": 0.12088259, + "auxiliary_loss_mlp": 0.05079982, + "balance_loss_clip": 0.08570465, + "balance_loss_mlp": 0.02490566, + "epoch": 0.014068841124304825, + "flos": 14534839042560.0, + "grad_norm": 50.00852440461159, + "language_loss": 1.75618017, + "learning_rate": 3.512420411838642e-06, + "loss": 1.92786264, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 35.15625, + "router_z_loss_mlp": 25.890625, + "step": 234, + "time_per_iteration": 2.666630983352661 + }, + { + "auxiliary_loss_clip": 0.11989364, + "auxiliary_loss_mlp": 0.05021151, + "balance_loss_clip": 0.08533135, + "balance_loss_mlp": 0.0249277, + "epoch": 0.014128964376972794, + "flos": 18083253346560.0, + "grad_norm": 159.74277839526525, + "language_loss": 1.68861091, + "learning_rate": 3.515166054308634e-06, + "loss": 1.85871601, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 34.625, + "router_z_loss_mlp": 25.28125, + "step": 235, + "time_per_iteration": 2.6749186515808105 + }, + { + "auxiliary_loss_clip": 0.12056133, + "auxiliary_loss_mlp": 0.04976581, + "balance_loss_clip": 0.08549982, + "balance_loss_mlp": 0.02495502, + "epoch": 0.014189087629640764, + "flos": 25340778086400.0, + "grad_norm": 181.61682318003585, + "language_loss": 1.60946572, + "learning_rate": 3.5179000379644498e-06, + "loss": 1.77979279, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 35.03125, + "router_z_loss_mlp": 24.8125, + "step": 236, + "time_per_iteration": 2.744683027267456 + }, + { + "auxiliary_loss_clip": 0.11981137, + "auxiliary_loss_mlp": 0.04688486, + "balance_loss_clip": 0.08556408, + "balance_loss_mlp": 0.02492746, + "epoch": 0.014249210882308733, + "flos": 36148939263360.0, + "grad_norm": 53.559601436427585, + "language_loss": 1.50691867, + "learning_rate": 3.520622461401154e-06, + "loss": 1.67361498, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 34.25, + "router_z_loss_mlp": 21.96875, + "step": 237, + "time_per_iteration": 2.845082998275757 + }, + { + "auxiliary_loss_clip": 0.12020621, + "auxiliary_loss_mlp": 0.04751597, + "balance_loss_clip": 0.08577786, + "balance_loss_mlp": 0.02497874, + "epoch": 0.014309334134976702, + "flos": 12937986656640.0, + "grad_norm": 74.10279300011292, + "language_loss": 1.46138978, + "learning_rate": 3.5233334219683935e-06, + "loss": 1.62911201, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 34.4375, + "router_z_loss_mlp": 22.5625, + "step": 238, + "time_per_iteration": 2.658674716949463 + }, + { + "auxiliary_loss_clip": 0.11937614, + "auxiliary_loss_mlp": 0.04392426, + "balance_loss_clip": 0.08564249, + "balance_loss_mlp": 0.02485077, + "epoch": 0.014369457387644672, + "flos": 20783857639680.0, + "grad_norm": 42.588620022932425, + "language_loss": 1.53544843, + "learning_rate": 3.526033015791284e-06, + "loss": 1.69874883, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 33.78125, + "router_z_loss_mlp": 19.046875, + "step": 239, + "time_per_iteration": 2.700894355773926 + }, + { + "auxiliary_loss_clip": 0.11902035, + "auxiliary_loss_mlp": 0.04253633, + "balance_loss_clip": 0.08564246, + "balance_loss_mlp": 0.02488191, + "epoch": 0.01442958064031264, + "flos": 25855638698880.0, + "grad_norm": 34.671761903295156, + "language_loss": 1.53386331, + "learning_rate": 3.528721337790862e-06, + "loss": 1.69542003, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 33.4375, + "router_z_loss_mlp": 17.671875, + "step": 240, + "time_per_iteration": 2.712979555130005 + }, + { + "auxiliary_loss_clip": 0.11883197, + "auxiliary_loss_mlp": 0.04123231, + "balance_loss_clip": 0.08562298, + "balance_loss_mlp": 0.02487489, + "epoch": 0.014489703892980611, + "flos": 28227150881280.0, + "grad_norm": 79.00201559956153, + "language_loss": 1.47835279, + "learning_rate": 3.531398481704111e-06, + "loss": 1.63841701, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 33.15625, + "router_z_loss_mlp": 16.359375, + "step": 241, + "time_per_iteration": 2.7748684883117676 + }, + { + "auxiliary_loss_clip": 0.11856598, + "auxiliary_loss_mlp": 0.0397551, + "balance_loss_clip": 0.08558369, + "balance_loss_mlp": 0.02488541, + "epoch": 0.01454982714564858, + "flos": 22497311381760.0, + "grad_norm": 26.156771136535646, + "language_loss": 1.46749806, + "learning_rate": 3.534064540103573e-06, + "loss": 1.62581909, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 32.984375, + "router_z_loss_mlp": 14.875, + "step": 242, + "time_per_iteration": 2.69297456741333 + }, + { + "auxiliary_loss_clip": 0.11859537, + "auxiliary_loss_mlp": 0.03845835, + "balance_loss_clip": 0.08550237, + "balance_loss_mlp": 0.0248704, + "epoch": 0.014609950398316548, + "flos": 21659689641600.0, + "grad_norm": 40.62615504318681, + "language_loss": 1.44594622, + "learning_rate": 3.536719604416555e-06, + "loss": 1.60299993, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 33.03125, + "router_z_loss_mlp": 13.5859375, + "step": 243, + "time_per_iteration": 2.7429516315460205 + }, + { + "auxiliary_loss_clip": 0.11778541, + "auxiliary_loss_mlp": 0.03809229, + "balance_loss_clip": 0.08539546, + "balance_loss_mlp": 0.02486292, + "epoch": 0.014670073650984519, + "flos": 21876163464960.0, + "grad_norm": 100.86422067940943, + "language_loss": 1.56203103, + "learning_rate": 3.5393637649439464e-06, + "loss": 1.71790862, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 32.34375, + "router_z_loss_mlp": 13.2265625, + "step": 244, + "time_per_iteration": 2.6750683784484863 + }, + { + "auxiliary_loss_clip": 0.11823894, + "auxiliary_loss_mlp": 0.03778996, + "balance_loss_clip": 0.08550587, + "balance_loss_mlp": 0.02497257, + "epoch": 0.014730196903652487, + "flos": 23190142066560.0, + "grad_norm": 48.52251723310838, + "language_loss": 1.50476313, + "learning_rate": 3.54199711087864e-06, + "loss": 1.66079211, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 32.71875, + "router_z_loss_mlp": 12.8125, + "step": 245, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.11763392, + "auxiliary_loss_mlp": 0.03610927, + "balance_loss_clip": 0.08551488, + "balance_loss_mlp": 0.02484828, + "epoch": 0.014790320156320457, + "flos": 23229442431360.0, + "grad_norm": 98.70024924690004, + "language_loss": 1.52072549, + "learning_rate": 3.5446197303235913e-06, + "loss": 1.67446864, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 32.078125, + "router_z_loss_mlp": 11.265625, + "step": 246, + "time_per_iteration": 2.739284038543701 + }, + { + "auxiliary_loss_clip": 0.11731501, + "auxiliary_loss_mlp": 0.03545591, + "balance_loss_clip": 0.08530955, + "balance_loss_mlp": 0.0246832, + "epoch": 0.014850443408988426, + "flos": 15821005288320.0, + "grad_norm": 33.98035395755878, + "language_loss": 1.40319586, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.55596685, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 31.96875, + "router_z_loss_mlp": 10.7734375, + "step": 247, + "time_per_iteration": 2.7273683547973633 + }, + { + "auxiliary_loss_clip": 0.1172208, + "auxiliary_loss_mlp": 0.03547119, + "balance_loss_clip": 0.08564139, + "balance_loss_mlp": 0.02478241, + "epoch": 0.014910566661656396, + "flos": 22787899741440.0, + "grad_norm": 52.371226674183355, + "language_loss": 1.30089116, + "learning_rate": 3.549833136812155e-06, + "loss": 1.453583, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 31.578125, + "router_z_loss_mlp": 10.6953125, + "step": 248, + "time_per_iteration": 2.7991907596588135 + }, + { + "auxiliary_loss_clip": 0.11678547, + "auxiliary_loss_mlp": 0.03475812, + "balance_loss_clip": 0.08537906, + "balance_loss_mlp": 0.02466443, + "epoch": 0.014970689914324365, + "flos": 26871440146560.0, + "grad_norm": 39.139484540660874, + "language_loss": 1.33625245, + "learning_rate": 3.552424094769381e-06, + "loss": 1.48779607, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 31.390625, + "router_z_loss_mlp": 10.0859375, + "step": 249, + "time_per_iteration": 2.7439961433410645 + }, + { + "auxiliary_loss_clip": 0.11684404, + "auxiliary_loss_mlp": 0.03406032, + "balance_loss_clip": 0.08537483, + "balance_loss_mlp": 0.02458461, + "epoch": 0.015030813166992334, + "flos": 13989943941120.0, + "grad_norm": 151.47532384589994, + "language_loss": 1.465379, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.6162833, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 31.46875, + "router_z_loss_mlp": 9.4765625, + "step": 250, + "time_per_iteration": 2.68412184715271 + }, + { + "auxiliary_loss_clip": 0.11659358, + "auxiliary_loss_mlp": 0.03389172, + "balance_loss_clip": 0.08554412, + "balance_loss_mlp": 0.02466397, + "epoch": 0.015090936419660304, + "flos": 24724787195520.0, + "grad_norm": 46.474949555678066, + "language_loss": 1.48383927, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.63432467, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 31.0625, + "router_z_loss_mlp": 9.22265625, + "step": 251, + "time_per_iteration": 2.7403595447540283 + }, + { + "auxiliary_loss_clip": 0.11684091, + "auxiliary_loss_mlp": 0.0341421, + "balance_loss_clip": 0.08552309, + "balance_loss_mlp": 0.02502498, + "epoch": 0.015151059672328273, + "flos": 25745829523200.0, + "grad_norm": 38.842940432028065, + "language_loss": 1.35644555, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.50742865, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 31.296875, + "router_z_loss_mlp": 9.1171875, + "step": 252, + "time_per_iteration": 2.791579246520996 + }, + { + "auxiliary_loss_clip": 0.11669001, + "auxiliary_loss_mlp": 0.03442915, + "balance_loss_clip": 0.08573347, + "balance_loss_mlp": 0.02537305, + "epoch": 0.015211182924996243, + "flos": 21877588984320.0, + "grad_norm": 62.5379323018988, + "language_loss": 1.55304623, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.70416546, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 30.96875, + "router_z_loss_mlp": 9.0625, + "step": 253, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.09242393, + "auxiliary_loss_mlp": 0.017157, + "balance_loss_clip": 0.07774388, + "balance_loss_mlp": 0.01455537, + "epoch": 0.015271306177664212, + "flos": 66915159765120.0, + "grad_norm": 1.2208472030610649, + "language_loss": 0.55767465, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.66725558, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 14.65625, + "router_z_loss_mlp": 2.6015625, + "step": 254, + "time_per_iteration": 4.672732353210449 + }, + { + "auxiliary_loss_clip": 0.11620437, + "auxiliary_loss_mlp": 0.03323486, + "balance_loss_clip": 0.08537702, + "balance_loss_mlp": 0.02497223, + "epoch": 0.01533142943033218, + "flos": 26841405657600.0, + "grad_norm": 25.800997540380294, + "language_loss": 1.37205672, + "learning_rate": 3.567754632921479e-06, + "loss": 1.52149594, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 30.84375, + "router_z_loss_mlp": 8.265625, + "step": 255, + "time_per_iteration": 5.487545490264893 + }, + { + "auxiliary_loss_clip": 0.11549303, + "auxiliary_loss_mlp": 0.03243715, + "balance_loss_clip": 0.08531242, + "balance_loss_mlp": 0.02464373, + "epoch": 0.01539155268300015, + "flos": 20820055403520.0, + "grad_norm": 51.38147970022548, + "language_loss": 1.3568666, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.50479686, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 30.171875, + "router_z_loss_mlp": 7.7890625, + "step": 256, + "time_per_iteration": 2.7118937969207764 + }, + { + "auxiliary_loss_clip": 0.11515065, + "auxiliary_loss_mlp": 0.0323028, + "balance_loss_clip": 0.08509345, + "balance_loss_mlp": 0.02460093, + "epoch": 0.01545167593566812, + "flos": 15967473425280.0, + "grad_norm": 27.629045104410558, + "language_loss": 1.28094459, + "learning_rate": 3.5727847536897254e-06, + "loss": 1.42839789, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 30.046875, + "router_z_loss_mlp": 7.69921875, + "step": 257, + "time_per_iteration": 4.093847751617432 + }, + { + "auxiliary_loss_clip": 0.11514995, + "auxiliary_loss_mlp": 0.03174197, + "balance_loss_clip": 0.08523524, + "balance_loss_mlp": 0.02457415, + "epoch": 0.01551179918833609, + "flos": 22608378368640.0, + "grad_norm": 22.193359085523966, + "language_loss": 1.37467206, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.52156401, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 29.921875, + "router_z_loss_mlp": 7.171875, + "step": 258, + "time_per_iteration": 2.6789233684539795 + }, + { + "auxiliary_loss_clip": 0.11490995, + "auxiliary_loss_mlp": 0.03125494, + "balance_loss_clip": 0.08525682, + "balance_loss_mlp": 0.02450675, + "epoch": 0.015571922441004058, + "flos": 22822713912960.0, + "grad_norm": 41.08352403819959, + "language_loss": 1.35431111, + "learning_rate": 3.577775880881658e-06, + "loss": 1.50047588, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 29.640625, + "router_z_loss_mlp": 6.75390625, + "step": 259, + "time_per_iteration": 2.716095209121704 + }, + { + "auxiliary_loss_clip": 0.11409761, + "auxiliary_loss_mlp": 0.03065479, + "balance_loss_clip": 0.08500087, + "balance_loss_mlp": 0.02439868, + "epoch": 0.015632045693672027, + "flos": 18952502803200.0, + "grad_norm": 45.41794645804665, + "language_loss": 1.35833013, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.50308251, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 29.109375, + "router_z_loss_mlp": 6.25390625, + "step": 260, + "time_per_iteration": 2.63728666305542 + }, + { + "auxiliary_loss_clip": 0.11363758, + "auxiliary_loss_mlp": 0.03047284, + "balance_loss_clip": 0.0847533, + "balance_loss_mlp": 0.02422818, + "epoch": 0.015692168946339995, + "flos": 29979569571840.0, + "grad_norm": 14.449297272648009, + "language_loss": 1.30485594, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.44896626, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 28.921875, + "router_z_loss_mlp": 6.23828125, + "step": 261, + "time_per_iteration": 2.7847509384155273 + }, + { + "auxiliary_loss_clip": 0.11379428, + "auxiliary_loss_mlp": 0.03054321, + "balance_loss_clip": 0.08507971, + "balance_loss_mlp": 0.02459991, + "epoch": 0.015752292199007967, + "flos": 19398363978240.0, + "grad_norm": 31.701786044094614, + "language_loss": 1.03000259, + "learning_rate": 3.5851907672491904e-06, + "loss": 1.17434001, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 28.71875, + "router_z_loss_mlp": 5.94140625, + "step": 262, + "time_per_iteration": 2.6821658611297607 + }, + { + "auxiliary_loss_clip": 0.11303549, + "auxiliary_loss_mlp": 0.02991728, + "balance_loss_clip": 0.0846238, + "balance_loss_mlp": 0.02461103, + "epoch": 0.015812415451675936, + "flos": 20346088383360.0, + "grad_norm": 21.20591685993131, + "language_loss": 1.06071973, + "learning_rate": 3.587643540438383e-06, + "loss": 1.20367253, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 28.421875, + "router_z_loss_mlp": 5.30859375, + "step": 263, + "time_per_iteration": 2.6878163814544678 + }, + { + "auxiliary_loss_clip": 0.11343089, + "auxiliary_loss_mlp": 0.02942515, + "balance_loss_clip": 0.08484475, + "balance_loss_mlp": 0.0242982, + "epoch": 0.015872538704343905, + "flos": 17530392107520.0, + "grad_norm": 30.142563573193335, + "language_loss": 1.29773152, + "learning_rate": 3.590087005168037e-06, + "loss": 1.44058764, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 28.59375, + "router_z_loss_mlp": 5.125, + "step": 264, + "time_per_iteration": 2.662154197692871 + }, + { + "auxiliary_loss_clip": 0.11317942, + "auxiliary_loss_mlp": 0.02875043, + "balance_loss_clip": 0.08491537, + "balance_loss_mlp": 0.02415754, + "epoch": 0.015932661957011873, + "flos": 15264622177920.0, + "grad_norm": 32.942584170075996, + "language_loss": 1.38455915, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.52648902, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 4.59375, + "step": 265, + "time_per_iteration": 2.6390388011932373 + }, + { + "auxiliary_loss_clip": 0.11291553, + "auxiliary_loss_mlp": 0.02864291, + "balance_loss_clip": 0.08442727, + "balance_loss_mlp": 0.02421405, + "epoch": 0.015992785209679845, + "flos": 20308674735360.0, + "grad_norm": 55.122223701442024, + "language_loss": 1.13817394, + "learning_rate": 3.5949462899957323e-06, + "loss": 1.27973235, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 28.484375, + "router_z_loss_mlp": 4.42773438, + "step": 266, + "time_per_iteration": 2.7511661052703857 + }, + { + "auxiliary_loss_clip": 0.11267024, + "auxiliary_loss_mlp": 0.02842336, + "balance_loss_clip": 0.08455394, + "balance_loss_mlp": 0.02423863, + "epoch": 0.016052908462347814, + "flos": 23368195992960.0, + "grad_norm": 26.951368678186665, + "language_loss": 1.23554707, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.3766408, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 28.140625, + "router_z_loss_mlp": 4.17773438, + "step": 267, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.11252864, + "auxiliary_loss_mlp": 0.02837055, + "balance_loss_clip": 0.0845217, + "balance_loss_mlp": 0.02411335, + "epoch": 0.016113031715015783, + "flos": 21292722685440.0, + "grad_norm": 64.20150221953703, + "language_loss": 1.24742389, + "learning_rate": 3.599769175344462e-06, + "loss": 1.38832319, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 4.2578125, + "step": 268, + "time_per_iteration": 2.72198224067688 + }, + { + "auxiliary_loss_clip": 0.11163211, + "auxiliary_loss_mlp": 0.02866759, + "balance_loss_clip": 0.08415397, + "balance_loss_mlp": 0.0243093, + "epoch": 0.01617315496768375, + "flos": 18920371962240.0, + "grad_norm": 170.41239636292127, + "language_loss": 1.22916961, + "learning_rate": 3.602167137831432e-06, + "loss": 1.3694694, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 27.46875, + "router_z_loss_mlp": 4.36132812, + "step": 269, + "time_per_iteration": 2.6403703689575195 + }, + { + "auxiliary_loss_clip": 0.11217365, + "auxiliary_loss_mlp": 0.02780488, + "balance_loss_clip": 0.08470169, + "balance_loss_mlp": 0.02398446, + "epoch": 0.01623327822035172, + "flos": 16552339724160.0, + "grad_norm": 38.966481299889274, + "language_loss": 1.32494903, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.46492743, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 27.515625, + "router_z_loss_mlp": 3.82226562, + "step": 270, + "time_per_iteration": 2.7300021648406982 + }, + { + "auxiliary_loss_clip": 0.11115253, + "auxiliary_loss_mlp": 0.02879213, + "balance_loss_clip": 0.08416284, + "balance_loss_mlp": 0.02523302, + "epoch": 0.016293401473019692, + "flos": 23520198499200.0, + "grad_norm": 74.8782587112652, + "language_loss": 1.26303077, + "learning_rate": 3.606936435072361e-06, + "loss": 1.40297556, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 26.984375, + "router_z_loss_mlp": 3.55859375, + "step": 271, + "time_per_iteration": 2.7073349952697754 + }, + { + "auxiliary_loss_clip": 0.11099713, + "auxiliary_loss_mlp": 0.02833465, + "balance_loss_clip": 0.08408779, + "balance_loss_mlp": 0.02473739, + "epoch": 0.01635352472568766, + "flos": 29022579290880.0, + "grad_norm": 92.09487601801163, + "language_loss": 1.22523308, + "learning_rate": 3.609307900676025e-06, + "loss": 1.36456478, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 26.921875, + "router_z_loss_mlp": 3.59765625, + "step": 272, + "time_per_iteration": 2.767242670059204 + }, + { + "auxiliary_loss_clip": 0.11100094, + "auxiliary_loss_mlp": 0.02845915, + "balance_loss_clip": 0.08419856, + "balance_loss_mlp": 0.02489432, + "epoch": 0.01641364797835563, + "flos": 13375546277760.0, + "grad_norm": 162.68643260209848, + "language_loss": 1.12912893, + "learning_rate": 3.611670663634051e-06, + "loss": 1.26858902, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 26.828125, + "router_z_loss_mlp": 3.5625, + "step": 273, + "time_per_iteration": 2.6756341457366943 + }, + { + "auxiliary_loss_clip": 0.11082844, + "auxiliary_loss_mlp": 0.02877946, + "balance_loss_clip": 0.08410685, + "balance_loss_mlp": 0.02487702, + "epoch": 0.016473771231023598, + "flos": 18883922636160.0, + "grad_norm": 33.34014800610017, + "language_loss": 1.30194449, + "learning_rate": 3.614024787585744e-06, + "loss": 1.44155228, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 26.734375, + "router_z_loss_mlp": 3.90234375, + "step": 274, + "time_per_iteration": 2.7216930389404297 + }, + { + "auxiliary_loss_clip": 0.11044294, + "auxiliary_loss_mlp": 0.02852219, + "balance_loss_clip": 0.08402658, + "balance_loss_mlp": 0.02501839, + "epoch": 0.016533894483691566, + "flos": 22608252587520.0, + "grad_norm": 44.408233256015265, + "language_loss": 1.22405624, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.36302137, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 26.453125, + "router_z_loss_mlp": 3.50390625, + "step": 275, + "time_per_iteration": 2.6909008026123047 + }, + { + "auxiliary_loss_clip": 0.10985737, + "auxiliary_loss_mlp": 0.02874438, + "balance_loss_clip": 0.08389083, + "balance_loss_mlp": 0.02526728, + "epoch": 0.01659401773635954, + "flos": 21513640775040.0, + "grad_norm": 44.25598676438703, + "language_loss": 1.11958659, + "learning_rate": 3.6187073695598707e-06, + "loss": 1.25818849, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 3.4765625, + "step": 276, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.10974017, + "auxiliary_loss_mlp": 0.02898641, + "balance_loss_clip": 0.08386508, + "balance_loss_mlp": 0.02528615, + "epoch": 0.016654140989027507, + "flos": 32858772842880.0, + "grad_norm": 42.11334181974309, + "language_loss": 1.14762068, + "learning_rate": 3.621035951423551e-06, + "loss": 1.28634739, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.703125, + "step": 277, + "time_per_iteration": 2.8497049808502197 + }, + { + "auxiliary_loss_clip": 0.10973347, + "auxiliary_loss_mlp": 0.02864523, + "balance_loss_clip": 0.08391111, + "balance_loss_mlp": 0.02533217, + "epoch": 0.016714264241695476, + "flos": 12310046559360.0, + "grad_norm": 887.2068563232498, + "language_loss": 1.11253488, + "learning_rate": 3.623356141983041e-06, + "loss": 1.25091362, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.3125, + "step": 278, + "time_per_iteration": 2.6813693046569824 + }, + { + "auxiliary_loss_clip": 0.10953625, + "auxiliary_loss_mlp": 0.02843702, + "balance_loss_clip": 0.08367237, + "balance_loss_mlp": 0.02501333, + "epoch": 0.016774387494363444, + "flos": 27130820060160.0, + "grad_norm": 34.273698880479216, + "language_loss": 1.25525784, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.39323103, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.42382812, + "step": 279, + "time_per_iteration": 2.784980058670044 + }, + { + "auxiliary_loss_clip": 0.10968237, + "auxiliary_loss_mlp": 0.02757426, + "balance_loss_clip": 0.0838433, + "balance_loss_mlp": 0.02447863, + "epoch": 0.016834510747031413, + "flos": 20197356186240.0, + "grad_norm": 53.49395148263472, + "language_loss": 1.29536223, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.43261886, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 25.859375, + "router_z_loss_mlp": 3.09570312, + "step": 280, + "time_per_iteration": 2.681295871734619 + }, + { + "auxiliary_loss_clip": 0.10977297, + "auxiliary_loss_mlp": 0.02731509, + "balance_loss_clip": 0.083787, + "balance_loss_mlp": 0.02426143, + "epoch": 0.016894633999699385, + "flos": 27282067879680.0, + "grad_norm": 34.532536985404526, + "language_loss": 1.04021847, + "learning_rate": 3.6302669652206183e-06, + "loss": 1.17730653, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.0546875, + "step": 281, + "time_per_iteration": 2.760214328765869 + }, + { + "auxiliary_loss_clip": 0.10965681, + "auxiliary_loss_mlp": 0.02675743, + "balance_loss_clip": 0.08379069, + "balance_loss_mlp": 0.02375717, + "epoch": 0.016954757252367354, + "flos": 14908262762880.0, + "grad_norm": 196.2497312811754, + "language_loss": 1.22675765, + "learning_rate": 3.632554186750274e-06, + "loss": 1.36317194, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 25.875, + "router_z_loss_mlp": 2.99609375, + "step": 282, + "time_per_iteration": 2.619256019592285 + }, + { + "auxiliary_loss_clip": 0.10984524, + "auxiliary_loss_mlp": 0.02614953, + "balance_loss_clip": 0.0837212, + "balance_loss_mlp": 0.02316834, + "epoch": 0.017014880505035322, + "flos": 21364824723840.0, + "grad_norm": 113.89697119062544, + "language_loss": 1.1510148, + "learning_rate": 3.6348333119035937e-06, + "loss": 1.28700948, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 26.125, + "router_z_loss_mlp": 2.98046875, + "step": 283, + "time_per_iteration": 2.7038846015930176 + }, + { + "auxiliary_loss_clip": 0.10939686, + "auxiliary_loss_mlp": 0.02615653, + "balance_loss_clip": 0.08368152, + "balance_loss_mlp": 0.02314101, + "epoch": 0.01707500375770329, + "flos": 35341561647360.0, + "grad_norm": 2832.5964725422496, + "language_loss": 1.17971587, + "learning_rate": 3.6371043977980503e-06, + "loss": 1.31526923, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 25.703125, + "router_z_loss_mlp": 3.015625, + "step": 284, + "time_per_iteration": 2.779290199279785 + }, + { + "auxiliary_loss_clip": 0.11009269, + "auxiliary_loss_mlp": 0.02623795, + "balance_loss_clip": 0.08394658, + "balance_loss_mlp": 0.02300118, + "epoch": 0.01713512701037126, + "flos": 23588065906560.0, + "grad_norm": 202.09490986405962, + "language_loss": 1.3942194, + "learning_rate": 3.639367500948819e-06, + "loss": 1.53055, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 26.15625, + "router_z_loss_mlp": 3.23632812, + "step": 285, + "time_per_iteration": 2.708090305328369 + }, + { + "auxiliary_loss_clip": 0.10991548, + "auxiliary_loss_mlp": 0.02635612, + "balance_loss_clip": 0.08366679, + "balance_loss_mlp": 0.02286949, + "epoch": 0.01719525026303923, + "flos": 27641781457920.0, + "grad_norm": 356.15135022069484, + "language_loss": 1.3973043, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.53357589, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 26.265625, + "router_z_loss_mlp": 3.48828125, + "step": 286, + "time_per_iteration": 2.719446897506714 + }, + { + "auxiliary_loss_clip": 0.11012185, + "auxiliary_loss_mlp": 0.02632762, + "balance_loss_clip": 0.08369677, + "balance_loss_mlp": 0.02288295, + "epoch": 0.0172553735157072, + "flos": 26987035253760.0, + "grad_norm": 104.57350843719594, + "language_loss": 1.20868826, + "learning_rate": 3.643869982119001e-06, + "loss": 1.34513772, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 26.4375, + "router_z_loss_mlp": 3.44335938, + "step": 287, + "time_per_iteration": 2.729893207550049 + }, + { + "auxiliary_loss_clip": 0.10980022, + "auxiliary_loss_mlp": 0.02642429, + "balance_loss_clip": 0.08353196, + "balance_loss_mlp": 0.02284801, + "epoch": 0.01731549676837517, + "flos": 14060578533120.0, + "grad_norm": 166.25914626432441, + "language_loss": 1.43957901, + "learning_rate": 3.646109470232502e-06, + "loss": 1.57580352, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 26.21875, + "router_z_loss_mlp": 3.57617188, + "step": 288, + "time_per_iteration": 2.649275779724121 + }, + { + "auxiliary_loss_clip": 0.08934768, + "auxiliary_loss_mlp": 0.02473956, + "balance_loss_clip": 0.07674165, + "balance_loss_mlp": 0.02246409, + "epoch": 0.017375620021043137, + "flos": 66533545543680.0, + "grad_norm": 1.4063062090104488, + "language_loss": 0.6396153, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.75370252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 12.625, + "router_z_loss_mlp": 2.27734375, + "step": 289, + "time_per_iteration": 3.379565954208374 + }, + { + "auxiliary_loss_clip": 0.10942794, + "auxiliary_loss_mlp": 0.0259406, + "balance_loss_clip": 0.08345533, + "balance_loss_mlp": 0.02290982, + "epoch": 0.01743574327371111, + "flos": 15229472590080.0, + "grad_norm": 77.68078787610818, + "language_loss": 1.23036659, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.36573505, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 26.0, + "router_z_loss_mlp": 3.03320312, + "step": 290, + "time_per_iteration": 2.6509203910827637 + }, + { + "auxiliary_loss_clip": 0.10926615, + "auxiliary_loss_mlp": 0.02615048, + "balance_loss_clip": 0.08348773, + "balance_loss_mlp": 0.02310254, + "epoch": 0.017495866526379078, + "flos": 25380833137920.0, + "grad_norm": 27.564120325217353, + "language_loss": 1.14881706, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.28423381, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 25.796875, + "router_z_loss_mlp": 3.046875, + "step": 291, + "time_per_iteration": 2.7178046703338623 + }, + { + "auxiliary_loss_clip": 0.10883434, + "auxiliary_loss_mlp": 0.02591836, + "balance_loss_clip": 0.08332369, + "balance_loss_mlp": 0.02272164, + "epoch": 0.017555989779047047, + "flos": 26366683950720.0, + "grad_norm": 17.764405326344416, + "language_loss": 0.99533927, + "learning_rate": 3.6549903308051806e-06, + "loss": 1.13009202, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 25.53125, + "router_z_loss_mlp": 3.1953125, + "step": 292, + "time_per_iteration": 2.788431406021118 + }, + { + "auxiliary_loss_clip": 0.10899352, + "auxiliary_loss_mlp": 0.02663543, + "balance_loss_clip": 0.08339885, + "balance_loss_mlp": 0.02329948, + "epoch": 0.017616113031715015, + "flos": 22344134918400.0, + "grad_norm": 26.042803645754148, + "language_loss": 1.17510223, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.31073129, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 25.59375, + "router_z_loss_mlp": 3.33398438, + "step": 293, + "time_per_iteration": 2.6952950954437256 + }, + { + "auxiliary_loss_clip": 0.10900117, + "auxiliary_loss_mlp": 0.02710556, + "balance_loss_clip": 0.08334709, + "balance_loss_mlp": 0.02379822, + "epoch": 0.017676236284382984, + "flos": 20163087066240.0, + "grad_norm": 32.066823918561106, + "language_loss": 1.13700342, + "learning_rate": 3.6593852433202797e-06, + "loss": 1.27311015, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 25.640625, + "router_z_loss_mlp": 3.30859375, + "step": 294, + "time_per_iteration": 5.568135976791382 + }, + { + "auxiliary_loss_clip": 0.10885305, + "auxiliary_loss_mlp": 0.02641671, + "balance_loss_clip": 0.08332892, + "balance_loss_mlp": 0.02322953, + "epoch": 0.017736359537050956, + "flos": 25229501464320.0, + "grad_norm": 23.522869629200528, + "language_loss": 1.10671854, + "learning_rate": 3.6615715007129453e-06, + "loss": 1.24198818, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.1875, + "step": 295, + "time_per_iteration": 4.106949090957642 + }, + { + "auxiliary_loss_clip": 0.10915332, + "auxiliary_loss_mlp": 0.02662487, + "balance_loss_clip": 0.08334074, + "balance_loss_mlp": 0.02339572, + "epoch": 0.017796482789718925, + "flos": 20344914426240.0, + "grad_norm": 21.437764161161574, + "language_loss": 1.11617136, + "learning_rate": 3.6637503595892897e-06, + "loss": 1.25194955, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 25.8125, + "router_z_loss_mlp": 3.22851562, + "step": 296, + "time_per_iteration": 2.6804072856903076 + }, + { + "auxiliary_loss_clip": 0.10889067, + "auxiliary_loss_mlp": 0.02644786, + "balance_loss_clip": 0.08324579, + "balance_loss_mlp": 0.02326259, + "epoch": 0.017856606042386893, + "flos": 22385196218880.0, + "grad_norm": 24.793293378850404, + "language_loss": 1.13374424, + "learning_rate": 3.665921869855132e-06, + "loss": 1.26908278, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 25.671875, + "router_z_loss_mlp": 3.18554688, + "step": 297, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.10852176, + "auxiliary_loss_mlp": 0.02688673, + "balance_loss_clip": 0.08303393, + "balance_loss_mlp": 0.02347639, + "epoch": 0.017916729295054862, + "flos": 20236279207680.0, + "grad_norm": 36.45374269731938, + "language_loss": 1.20502043, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.34042883, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 25.515625, + "router_z_loss_mlp": 3.40820312, + "step": 298, + "time_per_iteration": 2.6716575622558594 + }, + { + "auxiliary_loss_clip": 0.10865816, + "auxiliary_loss_mlp": 0.02644256, + "balance_loss_clip": 0.08315772, + "balance_loss_mlp": 0.02343848, + "epoch": 0.01797685254772283, + "flos": 19397064240000.0, + "grad_norm": 34.948505853119244, + "language_loss": 1.10227847, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.23737931, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 25.5, + "router_z_loss_mlp": 3.00390625, + "step": 299, + "time_per_iteration": 2.6678671836853027 + }, + { + "auxiliary_loss_clip": 0.10841461, + "auxiliary_loss_mlp": 0.02622314, + "balance_loss_clip": 0.08293117, + "balance_loss_mlp": 0.02329536, + "epoch": 0.018036975800390802, + "flos": 24432941024640.0, + "grad_norm": 19.38461643101093, + "language_loss": 0.93498641, + "learning_rate": 3.672392800539357e-06, + "loss": 1.06962407, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 25.46875, + "router_z_loss_mlp": 2.92578125, + "step": 300, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.10806506, + "auxiliary_loss_mlp": 0.02621871, + "balance_loss_clip": 0.08281456, + "balance_loss_mlp": 0.02336723, + "epoch": 0.01809709905305877, + "flos": 15784430181120.0, + "grad_norm": 20.696646248156853, + "language_loss": 1.21024799, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.34453177, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.85351562, + "step": 301, + "time_per_iteration": 2.6817290782928467 + }, + { + "auxiliary_loss_clip": 0.0850801, + "auxiliary_loss_mlp": 0.01826254, + "balance_loss_clip": 0.07523113, + "balance_loss_mlp": 0.01690356, + "epoch": 0.01815722230572674, + "flos": 67371125356800.0, + "grad_norm": 1.2503467181890604, + "language_loss": 0.62148851, + "learning_rate": 3.676670903877158e-06, + "loss": 0.72483116, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.36035156, + "step": 302, + "time_per_iteration": 3.424029588699341 + }, + { + "auxiliary_loss_clip": 0.10791934, + "auxiliary_loss_mlp": 0.02578435, + "balance_loss_clip": 0.08265001, + "balance_loss_mlp": 0.02299963, + "epoch": 0.01821734555839471, + "flos": 15490823074560.0, + "grad_norm": 21.711544566316807, + "language_loss": 1.17839396, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.31209755, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 25.265625, + "router_z_loss_mlp": 2.78320312, + "step": 303, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.10728209, + "auxiliary_loss_mlp": 0.02544189, + "balance_loss_clip": 0.08224705, + "balance_loss_mlp": 0.02301288, + "epoch": 0.018277468811062677, + "flos": 24104268184320.0, + "grad_norm": 23.704422815160775, + "language_loss": 1.0746634, + "learning_rate": 3.680920768703364e-06, + "loss": 1.20738745, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 25.03125, + "router_z_loss_mlp": 2.42675781, + "step": 304, + "time_per_iteration": 2.7344958782196045 + }, + { + "auxiliary_loss_clip": 0.1066777, + "auxiliary_loss_mlp": 0.02483555, + "balance_loss_clip": 0.08210013, + "balance_loss_mlp": 0.02260681, + "epoch": 0.01833759206373065, + "flos": 20965601145600.0, + "grad_norm": 30.99837504160223, + "language_loss": 1.03348625, + "learning_rate": 3.6830352276924415e-06, + "loss": 1.16499949, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 24.5625, + "router_z_loss_mlp": 2.22949219, + "step": 305, + "time_per_iteration": 2.7260208129882812 + }, + { + "auxiliary_loss_clip": 0.10687442, + "auxiliary_loss_mlp": 0.0251225, + "balance_loss_clip": 0.08201034, + "balance_loss_mlp": 0.0229529, + "epoch": 0.018397715316398618, + "flos": 19396812677760.0, + "grad_norm": 19.918754118514013, + "language_loss": 1.13116205, + "learning_rate": 3.685142765363119e-06, + "loss": 1.26315892, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 24.828125, + "router_z_loss_mlp": 2.16992188, + "step": 306, + "time_per_iteration": 2.691499948501587 + }, + { + "auxiliary_loss_clip": 0.10669354, + "auxiliary_loss_mlp": 0.02508631, + "balance_loss_clip": 0.08186156, + "balance_loss_mlp": 0.02314558, + "epoch": 0.018457838569066586, + "flos": 29140228823040.0, + "grad_norm": 47.10981354198648, + "language_loss": 1.13449669, + "learning_rate": 3.687243426879095e-06, + "loss": 1.2662766, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 24.859375, + "router_z_loss_mlp": 1.94335938, + "step": 307, + "time_per_iteration": 2.7379393577575684 + }, + { + "auxiliary_loss_clip": 0.10625106, + "auxiliary_loss_mlp": 0.02487612, + "balance_loss_clip": 0.08165652, + "balance_loss_mlp": 0.02317095, + "epoch": 0.018517961821734555, + "flos": 19214733755520.0, + "grad_norm": 42.1678147839251, + "language_loss": 0.98589212, + "learning_rate": 3.6893372569634466e-06, + "loss": 1.11701941, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 24.609375, + "router_z_loss_mlp": 1.70605469, + "step": 308, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.1055109, + "auxiliary_loss_mlp": 0.02395341, + "balance_loss_clip": 0.08134291, + "balance_loss_mlp": 0.02218911, + "epoch": 0.018578085074402523, + "flos": 19868809127040.0, + "grad_norm": 28.65950876073581, + "language_loss": 1.1383698, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.26783419, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 24.171875, + "router_z_loss_mlp": 1.765625, + "step": 309, + "time_per_iteration": 2.6683051586151123 + }, + { + "auxiliary_loss_clip": 0.10586038, + "auxiliary_loss_mlp": 0.02405273, + "balance_loss_clip": 0.08121731, + "balance_loss_mlp": 0.02230465, + "epoch": 0.018638208327070496, + "flos": 29614740894720.0, + "grad_norm": 52.453360042586766, + "language_loss": 1.0296793, + "learning_rate": 3.69350459956065e-06, + "loss": 1.15959239, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 24.625, + "router_z_loss_mlp": 1.74804688, + "step": 310, + "time_per_iteration": 2.775391101837158 + }, + { + "auxiliary_loss_clip": 0.10563378, + "auxiliary_loss_mlp": 0.02371235, + "balance_loss_clip": 0.08112171, + "balance_loss_mlp": 0.02215118, + "epoch": 0.018698331579738464, + "flos": 45741694567680.0, + "grad_norm": 23.410275827875097, + "language_loss": 0.97821265, + "learning_rate": 3.695578199367497e-06, + "loss": 1.10755873, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 24.5, + "router_z_loss_mlp": 1.56054688, + "step": 311, + "time_per_iteration": 2.8839335441589355 + }, + { + "auxiliary_loss_clip": 0.10531655, + "auxiliary_loss_mlp": 0.02336008, + "balance_loss_clip": 0.08109175, + "balance_loss_mlp": 0.02177126, + "epoch": 0.018758454832406433, + "flos": 20489621627520.0, + "grad_norm": 82.59483456267918, + "language_loss": 1.18671477, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.31539142, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 1.58886719, + "step": 312, + "time_per_iteration": 2.770037889480591 + }, + { + "auxiliary_loss_clip": 0.10558081, + "auxiliary_loss_mlp": 0.02280057, + "balance_loss_clip": 0.08105703, + "balance_loss_mlp": 0.02130998, + "epoch": 0.0188185780850744, + "flos": 15783088515840.0, + "grad_norm": 63.63527142809732, + "language_loss": 1.19325101, + "learning_rate": 3.699705471087043e-06, + "loss": 1.32163239, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 24.515625, + "router_z_loss_mlp": 1.49121094, + "step": 313, + "time_per_iteration": 2.6673521995544434 + }, + { + "auxiliary_loss_clip": 0.10532573, + "auxiliary_loss_mlp": 0.02284473, + "balance_loss_clip": 0.08092797, + "balance_loss_mlp": 0.02119774, + "epoch": 0.018878701337742373, + "flos": 22462329502080.0, + "grad_norm": 55.57556601394066, + "language_loss": 1.1492281, + "learning_rate": 3.7017592277997256e-06, + "loss": 1.27739859, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 24.375, + "router_z_loss_mlp": 1.6484375, + "step": 314, + "time_per_iteration": 2.6694388389587402 + }, + { + "auxiliary_loss_clip": 0.10578424, + "auxiliary_loss_mlp": 0.02246847, + "balance_loss_clip": 0.08105191, + "balance_loss_mlp": 0.02083482, + "epoch": 0.018938824590410342, + "flos": 31001576221440.0, + "grad_norm": 45.405049918855795, + "language_loss": 1.21203804, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.34029078, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 24.734375, + "router_z_loss_mlp": 1.6328125, + "step": 315, + "time_per_iteration": 2.7529938220977783 + }, + { + "auxiliary_loss_clip": 0.10473935, + "auxiliary_loss_mlp": 0.02224543, + "balance_loss_clip": 0.08059986, + "balance_loss_mlp": 0.02047731, + "epoch": 0.01899894784307831, + "flos": 23265724049280.0, + "grad_norm": 52.87369135887914, + "language_loss": 1.09085321, + "learning_rate": 3.7058471919041945e-06, + "loss": 1.21783805, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 24.15625, + "router_z_loss_mlp": 1.76855469, + "step": 316, + "time_per_iteration": 2.7019717693328857 + }, + { + "auxiliary_loss_clip": 0.1049989, + "auxiliary_loss_mlp": 0.02224334, + "balance_loss_clip": 0.08073364, + "balance_loss_mlp": 0.02044757, + "epoch": 0.01905907109574628, + "flos": 17463782511360.0, + "grad_norm": 120.61991368810097, + "language_loss": 1.19369888, + "learning_rate": 3.7078814816960605e-06, + "loss": 1.32094109, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 24.234375, + "router_z_loss_mlp": 1.79492188, + "step": 317, + "time_per_iteration": 2.6503257751464844 + }, + { + "auxiliary_loss_clip": 0.10466437, + "auxiliary_loss_mlp": 0.02269676, + "balance_loss_clip": 0.08054706, + "balance_loss_mlp": 0.02081039, + "epoch": 0.019119194348414248, + "flos": 14974578869760.0, + "grad_norm": 61.86297235247138, + "language_loss": 1.22225165, + "learning_rate": 3.709909364265374e-06, + "loss": 1.34961283, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 24.109375, + "router_z_loss_mlp": 1.88769531, + "step": 318, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.1039573, + "auxiliary_loss_mlp": 0.02220381, + "balance_loss_clip": 0.08026896, + "balance_loss_mlp": 0.02036608, + "epoch": 0.01917931760108222, + "flos": 25489719918720.0, + "grad_norm": 79.56078914423522, + "language_loss": 1.24628842, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.3724494, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 23.65625, + "router_z_loss_mlp": 1.83789062, + "step": 319, + "time_per_iteration": 2.723235607147217 + }, + { + "auxiliary_loss_clip": 0.08211939, + "auxiliary_loss_mlp": 0.01803451, + "balance_loss_clip": 0.07311222, + "balance_loss_mlp": 0.01697974, + "epoch": 0.01923944085375019, + "flos": 71576438872320.0, + "grad_norm": 0.9540157623115577, + "language_loss": 0.59494603, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.69509989, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 9.0, + "router_z_loss_mlp": 1.05664062, + "step": 320, + "time_per_iteration": 3.180224895477295 + }, + { + "auxiliary_loss_clip": 0.10427548, + "auxiliary_loss_mlp": 0.02254004, + "balance_loss_clip": 0.0803239, + "balance_loss_mlp": 0.02062601, + "epoch": 0.019299564106418157, + "flos": 19688574994560.0, + "grad_norm": 36.291900925718565, + "language_loss": 1.21542251, + "learning_rate": 3.715954969092154e-06, + "loss": 1.34223795, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 23.921875, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.682126045227051 + }, + { + "auxiliary_loss_clip": 0.10335587, + "auxiliary_loss_mlp": 0.02247301, + "balance_loss_clip": 0.079924, + "balance_loss_mlp": 0.02050463, + "epoch": 0.019359687359086126, + "flos": 24393682586880.0, + "grad_norm": 33.259970226975035, + "language_loss": 1.13044763, + "learning_rate": 3.7179576213552805e-06, + "loss": 1.25627637, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 23.40625, + "router_z_loss_mlp": 1.96972656, + "step": 322, + "time_per_iteration": 2.707108736038208 + }, + { + "auxiliary_loss_clip": 0.10356271, + "auxiliary_loss_mlp": 0.02232923, + "balance_loss_clip": 0.08007558, + "balance_loss_mlp": 0.02039518, + "epoch": 0.019419810611754094, + "flos": 23958177390720.0, + "grad_norm": 36.53278953975959, + "language_loss": 0.99391961, + "learning_rate": 3.719954063833981e-06, + "loss": 1.11981153, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 23.46875, + "router_z_loss_mlp": 1.93554688, + "step": 323, + "time_per_iteration": 2.723851442337036 + }, + { + "auxiliary_loss_clip": 0.10368463, + "auxiliary_loss_mlp": 0.02256046, + "balance_loss_clip": 0.08015804, + "balance_loss_mlp": 0.02064739, + "epoch": 0.019479933864422067, + "flos": 22166164846080.0, + "grad_norm": 31.715264393756637, + "language_loss": 1.15310884, + "learning_rate": 3.721944334919596e-06, + "loss": 1.27935386, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 23.5, + "router_z_loss_mlp": 1.9140625, + "step": 324, + "time_per_iteration": 2.696791887283325 + }, + { + "auxiliary_loss_clip": 0.10296808, + "auxiliary_loss_mlp": 0.02240866, + "balance_loss_clip": 0.08005355, + "balance_loss_mlp": 0.02052992, + "epoch": 0.019540057117090035, + "flos": 22243381983360.0, + "grad_norm": 43.49790109423306, + "language_loss": 0.94611681, + "learning_rate": 3.7239284726485375e-06, + "loss": 1.07149351, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 22.90625, + "router_z_loss_mlp": 1.87890625, + "step": 325, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.10282885, + "auxiliary_loss_mlp": 0.02182889, + "balance_loss_clip": 0.07997272, + "balance_loss_mlp": 0.02001023, + "epoch": 0.019600180369758004, + "flos": 23083603200000.0, + "grad_norm": 27.315965412731057, + "language_loss": 0.98057997, + "learning_rate": 3.72590651470665e-06, + "loss": 1.10523772, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 22.859375, + "router_z_loss_mlp": 1.81835938, + "step": 326, + "time_per_iteration": 2.712902545928955 + }, + { + "auxiliary_loss_clip": 0.10212934, + "auxiliary_loss_mlp": 0.0211514, + "balance_loss_clip": 0.07960281, + "balance_loss_mlp": 0.01952062, + "epoch": 0.019660303622425972, + "flos": 25417911369600.0, + "grad_norm": 35.757935523376304, + "language_loss": 1.00482905, + "learning_rate": 3.727878498433505e-06, + "loss": 1.12810981, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 22.53125, + "router_z_loss_mlp": 1.63085938, + "step": 327, + "time_per_iteration": 2.7241063117980957 + }, + { + "auxiliary_loss_clip": 0.10138492, + "auxiliary_loss_mlp": 0.02035691, + "balance_loss_clip": 0.07947245, + "balance_loss_mlp": 0.01881101, + "epoch": 0.01972042687509394, + "flos": 23663941378560.0, + "grad_norm": 104.32864902308236, + "language_loss": 1.03565025, + "learning_rate": 3.7298444608266328e-06, + "loss": 1.15739202, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 21.9375, + "router_z_loss_mlp": 1.54492188, + "step": 328, + "time_per_iteration": 2.709101438522339 + }, + { + "auxiliary_loss_clip": 0.10164856, + "auxiliary_loss_mlp": 0.01970008, + "balance_loss_clip": 0.0795281, + "balance_loss_mlp": 0.01821044, + "epoch": 0.019780550127761913, + "flos": 18229386067200.0, + "grad_norm": 42.1606706132577, + "language_loss": 1.2875843, + "learning_rate": 3.731804438545683e-06, + "loss": 1.40893316, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 22.125, + "router_z_loss_mlp": 1.49023438, + "step": 329, + "time_per_iteration": 2.6586227416992188 + }, + { + "auxiliary_loss_clip": 0.10175324, + "auxiliary_loss_mlp": 0.0194808, + "balance_loss_clip": 0.07956892, + "balance_loss_mlp": 0.0180417, + "epoch": 0.01984067338042988, + "flos": 22425293197440.0, + "grad_norm": 45.342797810033126, + "language_loss": 1.05014217, + "learning_rate": 3.7337584679165324e-06, + "loss": 1.17137623, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 22.1875, + "router_z_loss_mlp": 1.43847656, + "step": 330, + "time_per_iteration": 2.7214515209198 + }, + { + "auxiliary_loss_clip": 0.10115402, + "auxiliary_loss_mlp": 0.01893459, + "balance_loss_clip": 0.07927606, + "balance_loss_mlp": 0.01745353, + "epoch": 0.01990079663309785, + "flos": 17060785499520.0, + "grad_norm": 59.15314637886723, + "language_loss": 1.25238144, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.37247014, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 21.890625, + "router_z_loss_mlp": 1.48046875, + "step": 331, + "time_per_iteration": 2.657338857650757 + }, + { + "auxiliary_loss_clip": 0.10080996, + "auxiliary_loss_mlp": 0.01847509, + "balance_loss_clip": 0.07917192, + "balance_loss_mlp": 0.01704076, + "epoch": 0.01996091988576582, + "flos": 15967389571200.0, + "grad_norm": 98.01539887897596, + "language_loss": 1.18547392, + "learning_rate": 3.737648825272422e-06, + "loss": 1.30475891, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 21.625, + "router_z_loss_mlp": 1.43457031, + "step": 332, + "time_per_iteration": 2.653959035873413 + }, + { + "auxiliary_loss_clip": 0.10103545, + "auxiliary_loss_mlp": 0.01800932, + "balance_loss_clip": 0.07904914, + "balance_loss_mlp": 0.01663794, + "epoch": 0.02002104313843379, + "flos": 23593181005440.0, + "grad_norm": 35.094478760810134, + "language_loss": 1.10768199, + "learning_rate": 3.739585224276384e-06, + "loss": 1.22672677, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 21.96875, + "router_z_loss_mlp": 1.37207031, + "step": 333, + "time_per_iteration": 4.1371009349823 + }, + { + "auxiliary_loss_clip": 0.10097618, + "auxiliary_loss_mlp": 0.01781343, + "balance_loss_clip": 0.07907948, + "balance_loss_mlp": 0.01654028, + "epoch": 0.02008116639110176, + "flos": 34103458517760.0, + "grad_norm": 136.68327853765982, + "language_loss": 1.06974816, + "learning_rate": 3.7415158169777673e-06, + "loss": 1.18853784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 21.921875, + "router_z_loss_mlp": 1.2734375, + "step": 334, + "time_per_iteration": 4.332135200500488 + }, + { + "auxiliary_loss_clip": 0.10031913, + "auxiliary_loss_mlp": 0.01781208, + "balance_loss_clip": 0.07884848, + "balance_loss_mlp": 0.01645405, + "epoch": 0.020141289643769728, + "flos": 19690000513920.0, + "grad_norm": 127.35413263461035, + "language_loss": 1.06165111, + "learning_rate": 3.7434406380929575e-06, + "loss": 1.17978239, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 21.453125, + "router_z_loss_mlp": 1.35742188, + "step": 335, + "time_per_iteration": 2.6845688819885254 + }, + { + "auxiliary_loss_clip": 0.10012034, + "auxiliary_loss_mlp": 0.01785006, + "balance_loss_clip": 0.07876636, + "balance_loss_mlp": 0.01652064, + "epoch": 0.020201412896437697, + "flos": 20746821335040.0, + "grad_norm": 92.68671579424392, + "language_loss": 1.17325389, + "learning_rate": 3.745359722027911e-06, + "loss": 1.29122424, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.33007812, + "step": 336, + "time_per_iteration": 4.08910059928894 + }, + { + "auxiliary_loss_clip": 0.1002828, + "auxiliary_loss_mlp": 0.01777388, + "balance_loss_clip": 0.07887816, + "balance_loss_mlp": 0.01649119, + "epoch": 0.020261536149105665, + "flos": 20272728533760.0, + "grad_norm": 120.00954497896274, + "language_loss": 1.09627342, + "learning_rate": 3.7472731028818428e-06, + "loss": 1.21433008, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 21.40625, + "router_z_loss_mlp": 1.28222656, + "step": 337, + "time_per_iteration": 2.805793285369873 + }, + { + "auxiliary_loss_clip": 0.09984031, + "auxiliary_loss_mlp": 0.01793779, + "balance_loss_clip": 0.07868993, + "balance_loss_mlp": 0.01666368, + "epoch": 0.020321659401773638, + "flos": 25855890261120.0, + "grad_norm": 28.99860578242643, + "language_loss": 1.06755781, + "learning_rate": 3.7491808144508626e-06, + "loss": 1.18533587, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.2734375, + "step": 338, + "time_per_iteration": 2.731576919555664 + }, + { + "auxiliary_loss_clip": 0.09960704, + "auxiliary_loss_mlp": 0.01799352, + "balance_loss_clip": 0.0785647, + "balance_loss_mlp": 0.01663931, + "epoch": 0.020381782654441606, + "flos": 17501028451200.0, + "grad_norm": 48.687202060804886, + "language_loss": 1.0690763, + "learning_rate": 3.7510828902315576e-06, + "loss": 1.18667698, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 21.03125, + "router_z_loss_mlp": 1.35449219, + "step": 339, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.09979145, + "auxiliary_loss_mlp": 0.01800383, + "balance_loss_clip": 0.07839093, + "balance_loss_mlp": 0.01661433, + "epoch": 0.020441905907109575, + "flos": 24250904029440.0, + "grad_norm": 71.79969186636298, + "language_loss": 1.09025931, + "learning_rate": 3.75297936342452e-06, + "loss": 1.20805454, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 21.4375, + "router_z_loss_mlp": 1.38964844, + "step": 340, + "time_per_iteration": 2.6860833168029785 + }, + { + "auxiliary_loss_clip": 0.09942168, + "auxiliary_loss_mlp": 0.01812594, + "balance_loss_clip": 0.07835533, + "balance_loss_mlp": 0.01670592, + "epoch": 0.020502029159777543, + "flos": 22239273133440.0, + "grad_norm": 33.37713513104353, + "language_loss": 1.09787846, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.21542597, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.41992188, + "step": 341, + "time_per_iteration": 2.6922483444213867 + }, + { + "auxiliary_loss_clip": 0.09939329, + "auxiliary_loss_mlp": 0.01828812, + "balance_loss_clip": 0.07839939, + "balance_loss_mlp": 0.01694249, + "epoch": 0.020562152412445512, + "flos": 23994668643840.0, + "grad_norm": 29.77192234960925, + "language_loss": 1.11667454, + "learning_rate": 3.756755633390458e-06, + "loss": 1.23435605, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.34472656, + "step": 342, + "time_per_iteration": 2.6834869384765625 + }, + { + "auxiliary_loss_clip": 0.09933892, + "auxiliary_loss_mlp": 0.01819402, + "balance_loss_clip": 0.07828948, + "balance_loss_mlp": 0.0168541, + "epoch": 0.020622275665113484, + "flos": 26981878227840.0, + "grad_norm": 22.197931915509507, + "language_loss": 1.07990003, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.19743299, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 21.0625, + "router_z_loss_mlp": 1.34082031, + "step": 343, + "time_per_iteration": 2.749616861343384 + }, + { + "auxiliary_loss_clip": 0.09917849, + "auxiliary_loss_mlp": 0.01848479, + "balance_loss_clip": 0.07828984, + "balance_loss_mlp": 0.01717921, + "epoch": 0.020682398917781453, + "flos": 22607162484480.0, + "grad_norm": 141.8901696404303, + "language_loss": 0.98407257, + "learning_rate": 3.7605098841644e-06, + "loss": 1.10173583, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 20.859375, + "router_z_loss_mlp": 1.30566406, + "step": 344, + "time_per_iteration": 2.675349235534668 + }, + { + "auxiliary_loss_clip": 0.09898005, + "auxiliary_loss_mlp": 0.01869082, + "balance_loss_clip": 0.07812598, + "balance_loss_mlp": 0.01731467, + "epoch": 0.02074252217044942, + "flos": 15019120114560.0, + "grad_norm": 18.785611022256134, + "language_loss": 0.99672723, + "learning_rate": 3.7623788323083666e-06, + "loss": 1.11439812, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 20.84375, + "router_z_loss_mlp": 1.37597656, + "step": 345, + "time_per_iteration": 2.692946434020996 + }, + { + "auxiliary_loss_clip": 0.09874325, + "auxiliary_loss_mlp": 0.01900277, + "balance_loss_clip": 0.07799722, + "balance_loss_mlp": 0.01757512, + "epoch": 0.02080264542311739, + "flos": 25345012717440.0, + "grad_norm": 55.83425603592709, + "language_loss": 1.104882, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.222628, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.42871094, + "step": 346, + "time_per_iteration": 2.6843760013580322 + }, + { + "auxiliary_loss_clip": 0.09857361, + "auxiliary_loss_mlp": 0.01900508, + "balance_loss_clip": 0.07793791, + "balance_loss_mlp": 0.01751067, + "epoch": 0.02086276867578536, + "flos": 24395611230720.0, + "grad_norm": 77.40789728508068, + "language_loss": 1.02947056, + "learning_rate": 3.7661005315929563e-06, + "loss": 1.14704919, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.49511719, + "step": 347, + "time_per_iteration": 2.7445502281188965 + }, + { + "auxiliary_loss_clip": 0.09829693, + "auxiliary_loss_mlp": 0.01850064, + "balance_loss_clip": 0.07772936, + "balance_loss_mlp": 0.01707585, + "epoch": 0.02092289192845333, + "flos": 24469096861440.0, + "grad_norm": 39.57326474220843, + "language_loss": 0.95316571, + "learning_rate": 3.7679533449104354e-06, + "loss": 1.06996334, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 20.546875, + "router_z_loss_mlp": 1.42578125, + "step": 348, + "time_per_iteration": 2.8197853565216064 + }, + { + "auxiliary_loss_clip": 0.09904477, + "auxiliary_loss_mlp": 0.01869566, + "balance_loss_clip": 0.07792602, + "balance_loss_mlp": 0.01723273, + "epoch": 0.0209830151811213, + "flos": 17455942154880.0, + "grad_norm": 162.53223734199824, + "language_loss": 1.06930375, + "learning_rate": 3.7698008416831116e-06, + "loss": 1.18704414, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.46289062, + "step": 349, + "time_per_iteration": 2.752092123031616 + }, + { + "auxiliary_loss_clip": 0.09846102, + "auxiliary_loss_mlp": 0.01921246, + "balance_loss_clip": 0.07772378, + "balance_loss_mlp": 0.01771328, + "epoch": 0.021043138433789268, + "flos": 24581295878400.0, + "grad_norm": 27.656933027979164, + "language_loss": 1.05012357, + "learning_rate": 3.7716430523347664e-06, + "loss": 1.16779709, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 20.71875, + "router_z_loss_mlp": 1.49902344, + "step": 350, + "time_per_iteration": 2.766042947769165 + }, + { + "auxiliary_loss_clip": 0.0987936, + "auxiliary_loss_mlp": 0.01878538, + "balance_loss_clip": 0.07780807, + "balance_loss_mlp": 0.01733103, + "epoch": 0.021103261686457236, + "flos": 24459579423360.0, + "grad_norm": 79.75623451753691, + "language_loss": 0.99250925, + "learning_rate": 3.773480007028776e-06, + "loss": 1.11008823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 21.0, + "router_z_loss_mlp": 1.45507812, + "step": 351, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.09914102, + "auxiliary_loss_mlp": 0.01872584, + "balance_loss_clip": 0.07798491, + "balance_loss_mlp": 0.01732013, + "epoch": 0.021163384939125205, + "flos": 14688183214080.0, + "grad_norm": 45.172979776217204, + "language_loss": 1.05138326, + "learning_rate": 3.775311735671078e-06, + "loss": 1.16925001, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 21.15625, + "router_z_loss_mlp": 1.40527344, + "step": 352, + "time_per_iteration": 2.670952558517456 + }, + { + "auxiliary_loss_clip": 0.09916839, + "auxiliary_loss_mlp": 0.0188162, + "balance_loss_clip": 0.07782572, + "balance_loss_mlp": 0.01727792, + "epoch": 0.021223508191793177, + "flos": 24499173277440.0, + "grad_norm": 32.69809617550279, + "language_loss": 1.02695966, + "learning_rate": 3.7771382679130878e-06, + "loss": 1.14494431, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 21.375, + "router_z_loss_mlp": 1.5390625, + "step": 353, + "time_per_iteration": 2.7037458419799805 + }, + { + "auxiliary_loss_clip": 0.09877251, + "auxiliary_loss_mlp": 0.01866766, + "balance_loss_clip": 0.07783737, + "balance_loss_mlp": 0.01718565, + "epoch": 0.021283631444461146, + "flos": 24132667518720.0, + "grad_norm": 42.14264864151201, + "language_loss": 1.01166749, + "learning_rate": 3.7789596331545845e-06, + "loss": 1.12910759, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 20.921875, + "router_z_loss_mlp": 1.48242188, + "step": 354, + "time_per_iteration": 2.692936658859253 + }, + { + "auxiliary_loss_clip": 0.0993467, + "auxiliary_loss_mlp": 0.0189021, + "balance_loss_clip": 0.07795032, + "balance_loss_mlp": 0.01743726, + "epoch": 0.021343754697129114, + "flos": 25199299267200.0, + "grad_norm": 49.082565254141, + "language_loss": 1.02249849, + "learning_rate": 3.780775860546545e-06, + "loss": 1.14074731, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.46484375, + "step": 355, + "time_per_iteration": 2.703904151916504 + }, + { + "auxiliary_loss_clip": 0.09890301, + "auxiliary_loss_mlp": 0.01933568, + "balance_loss_clip": 0.07771169, + "balance_loss_mlp": 0.01774495, + "epoch": 0.021403877949797083, + "flos": 17279816872320.0, + "grad_norm": 33.424095724347985, + "language_loss": 1.12320316, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.24144173, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 21.21875, + "router_z_loss_mlp": 1.58984375, + "step": 356, + "time_per_iteration": 2.7039332389831543 + }, + { + "auxiliary_loss_clip": 0.09926872, + "auxiliary_loss_mlp": 0.01913321, + "balance_loss_clip": 0.07763862, + "balance_loss_mlp": 0.01768648, + "epoch": 0.021464001202465055, + "flos": 30924946062720.0, + "grad_norm": 28.358403300745604, + "language_loss": 1.00492048, + "learning_rate": 3.784393017158528e-06, + "loss": 1.12332249, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 21.640625, + "router_z_loss_mlp": 1.44628906, + "step": 357, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.09896905, + "auxiliary_loss_mlp": 0.0189471, + "balance_loss_clip": 0.0777001, + "balance_loss_mlp": 0.01751087, + "epoch": 0.021524124455133024, + "flos": 18192182054400.0, + "grad_norm": 311.83490549391024, + "language_loss": 1.00049341, + "learning_rate": 3.786194003461506e-06, + "loss": 1.11840951, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 21.28125, + "router_z_loss_mlp": 1.43652344, + "step": 358, + "time_per_iteration": 2.697567939758301 + }, + { + "auxiliary_loss_clip": 0.09952264, + "auxiliary_loss_mlp": 0.01876113, + "balance_loss_clip": 0.0777906, + "balance_loss_mlp": 0.01737449, + "epoch": 0.021584247707800992, + "flos": 13810464495360.0, + "grad_norm": 74.44924093849752, + "language_loss": 1.11748183, + "learning_rate": 3.787989966086264e-06, + "loss": 1.2357657, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 21.734375, + "router_z_loss_mlp": 1.38671875, + "step": 359, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.09922898, + "auxiliary_loss_mlp": 0.01885242, + "balance_loss_clip": 0.07765573, + "balance_loss_mlp": 0.01746292, + "epoch": 0.02164437096046896, + "flos": 23301418688640.0, + "grad_norm": 64.98362502413198, + "language_loss": 1.06271791, + "learning_rate": 3.789780932980997e-06, + "loss": 1.18079925, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.38867188, + "step": 360, + "time_per_iteration": 2.7144362926483154 + }, + { + "auxiliary_loss_clip": 0.08207352, + "auxiliary_loss_mlp": 0.01776906, + "balance_loss_clip": 0.07236059, + "balance_loss_mlp": 0.01669809, + "epoch": 0.02170449421313693, + "flos": 68919621137280.0, + "grad_norm": 1.0217512577987982, + "language_loss": 0.65141213, + "learning_rate": 3.79156693186132e-06, + "loss": 0.75125468, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 9.734375, + "router_z_loss_mlp": 1.07324219, + "step": 361, + "time_per_iteration": 3.3981525897979736 + }, + { + "auxiliary_loss_clip": 0.09926173, + "auxiliary_loss_mlp": 0.01850484, + "balance_loss_clip": 0.07767443, + "balance_loss_mlp": 0.01710961, + "epoch": 0.0217646174658049, + "flos": 25235580885120.0, + "grad_norm": 46.06075194478587, + "language_loss": 1.07240796, + "learning_rate": 3.7933479902128433e-06, + "loss": 1.19017458, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 21.5625, + "router_z_loss_mlp": 1.39550781, + "step": 362, + "time_per_iteration": 2.7112934589385986 + }, + { + "auxiliary_loss_clip": 0.09902073, + "auxiliary_loss_mlp": 0.01838434, + "balance_loss_clip": 0.07771316, + "balance_loss_mlp": 0.01689852, + "epoch": 0.02182474071847287, + "flos": 22899721415040.0, + "grad_norm": 31.847388073363284, + "language_loss": 1.10624099, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.22364616, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 21.3125, + "router_z_loss_mlp": 1.48632812, + "step": 363, + "time_per_iteration": 2.7391881942749023 + }, + { + "auxiliary_loss_clip": 0.09905075, + "auxiliary_loss_mlp": 0.01804412, + "balance_loss_clip": 0.0776676, + "balance_loss_mlp": 0.01661742, + "epoch": 0.02188486397114084, + "flos": 23665660387200.0, + "grad_norm": 28.541039167709148, + "language_loss": 1.08880925, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.20590401, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 21.359375, + "router_z_loss_mlp": 1.42578125, + "step": 364, + "time_per_iteration": 2.7092103958129883 + }, + { + "auxiliary_loss_clip": 0.09940802, + "auxiliary_loss_mlp": 0.01790674, + "balance_loss_clip": 0.07771328, + "balance_loss_mlp": 0.01644666, + "epoch": 0.021944987223808807, + "flos": 21550090101120.0, + "grad_norm": 29.41270562877638, + "language_loss": 1.01945662, + "learning_rate": 3.798661793553676e-06, + "loss": 1.13677144, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 21.6875, + "router_z_loss_mlp": 1.4609375, + "step": 365, + "time_per_iteration": 2.7039554119110107 + }, + { + "auxiliary_loss_clip": 0.09880184, + "auxiliary_loss_mlp": 0.01787501, + "balance_loss_clip": 0.07767902, + "balance_loss_mlp": 0.01639968, + "epoch": 0.022005110476476776, + "flos": 16076444060160.0, + "grad_norm": 25.357242967570325, + "language_loss": 1.00391948, + "learning_rate": 3.8004233601340808e-06, + "loss": 1.12059641, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 21.125, + "router_z_loss_mlp": 1.47558594, + "step": 366, + "time_per_iteration": 2.6410672664642334 + }, + { + "auxiliary_loss_clip": 0.09886092, + "auxiliary_loss_mlp": 0.01802461, + "balance_loss_clip": 0.07774624, + "balance_loss_mlp": 0.01645009, + "epoch": 0.022065233729144748, + "flos": 21440071290240.0, + "grad_norm": 44.529255844390654, + "language_loss": 1.12988663, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.24677217, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.57421875, + "step": 367, + "time_per_iteration": 2.742794990539551 + }, + { + "auxiliary_loss_clip": 0.09926969, + "auxiliary_loss_mlp": 0.01819149, + "balance_loss_clip": 0.0779452, + "balance_loss_mlp": 0.01654545, + "epoch": 0.022125356981812717, + "flos": 21550173955200.0, + "grad_norm": 31.338184320621753, + "language_loss": 1.07241869, + "learning_rate": 3.803932100062912e-06, + "loss": 1.18987989, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 21.34375, + "router_z_loss_mlp": 1.64648438, + "step": 368, + "time_per_iteration": 2.660156488418579 + }, + { + "auxiliary_loss_clip": 0.09893043, + "auxiliary_loss_mlp": 0.01817736, + "balance_loss_clip": 0.07784697, + "balance_loss_mlp": 0.01649699, + "epoch": 0.022185480234480685, + "flos": 20710413936000.0, + "grad_norm": 81.09585500154182, + "language_loss": 1.0770272, + "learning_rate": 3.8056793255137264e-06, + "loss": 1.19413495, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.6796875, + "step": 369, + "time_per_iteration": 2.6966772079467773 + }, + { + "auxiliary_loss_clip": 0.09905175, + "auxiliary_loss_mlp": 0.01835143, + "balance_loss_clip": 0.07793829, + "balance_loss_mlp": 0.01659667, + "epoch": 0.022245603487148654, + "flos": 25200431297280.0, + "grad_norm": 48.526199326230525, + "language_loss": 1.05259717, + "learning_rate": 3.8074218223377844e-06, + "loss": 1.17000043, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 21.09375, + "router_z_loss_mlp": 1.75585938, + "step": 370, + "time_per_iteration": 2.726882219314575 + }, + { + "auxiliary_loss_clip": 0.09840686, + "auxiliary_loss_mlp": 0.01849254, + "balance_loss_clip": 0.0775683, + "balance_loss_mlp": 0.01677497, + "epoch": 0.022305726739816623, + "flos": 21402070663680.0, + "grad_norm": 32.14486041550045, + "language_loss": 1.00516605, + "learning_rate": 3.8091596160607834e-06, + "loss": 1.12206554, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 20.828125, + "router_z_loss_mlp": 1.71875, + "step": 371, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.09844472, + "auxiliary_loss_mlp": 0.01857578, + "balance_loss_clip": 0.07769165, + "balance_loss_mlp": 0.01683151, + "epoch": 0.022365849992484595, + "flos": 22498736901120.0, + "grad_norm": 33.301604666823, + "language_loss": 1.06231499, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.17933559, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 20.734375, + "router_z_loss_mlp": 1.74511719, + "step": 372, + "time_per_iteration": 2.7052745819091797 + }, + { + "auxiliary_loss_clip": 0.09826015, + "auxiliary_loss_mlp": 0.01853945, + "balance_loss_clip": 0.07764611, + "balance_loss_mlp": 0.01673796, + "epoch": 0.022425973245152563, + "flos": 17862083694720.0, + "grad_norm": 41.636352487556145, + "language_loss": 1.03913403, + "learning_rate": 3.8126211952779548e-06, + "loss": 1.15593362, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 20.640625, + "router_z_loss_mlp": 1.80078125, + "step": 373, + "time_per_iteration": 4.106141090393066 + }, + { + "auxiliary_loss_clip": 0.09845725, + "auxiliary_loss_mlp": 0.01869282, + "balance_loss_clip": 0.07777153, + "balance_loss_mlp": 0.01685128, + "epoch": 0.022486096497820532, + "flos": 15487804327680.0, + "grad_norm": 61.54476347228186, + "language_loss": 1.0650835, + "learning_rate": 3.8143450308016952e-06, + "loss": 1.18223345, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 20.703125, + "router_z_loss_mlp": 1.84277344, + "step": 374, + "time_per_iteration": 4.033753871917725 + }, + { + "auxiliary_loss_clip": 0.09812269, + "auxiliary_loss_mlp": 0.01856399, + "balance_loss_clip": 0.07757415, + "balance_loss_mlp": 0.01667095, + "epoch": 0.0225462197504885, + "flos": 27791897247360.0, + "grad_norm": 56.210759270114224, + "language_loss": 1.03319001, + "learning_rate": 3.8160642632878525e-06, + "loss": 1.14987683, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 20.5625, + "router_z_loss_mlp": 1.89257812, + "step": 375, + "time_per_iteration": 2.7545790672302246 + }, + { + "auxiliary_loss_clip": 0.0981497, + "auxiliary_loss_mlp": 0.01843627, + "balance_loss_clip": 0.07751609, + "balance_loss_mlp": 0.01665767, + "epoch": 0.02260634300315647, + "flos": 19981804757760.0, + "grad_norm": 57.812718044092065, + "language_loss": 1.07001138, + "learning_rate": 3.817778917253314e-06, + "loss": 1.18659735, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 20.625, + "router_z_loss_mlp": 1.77734375, + "step": 376, + "time_per_iteration": 4.076448202133179 + }, + { + "auxiliary_loss_clip": 0.09767978, + "auxiliary_loss_mlp": 0.01843169, + "balance_loss_clip": 0.07741934, + "balance_loss_mlp": 0.01659587, + "epoch": 0.02266646625582444, + "flos": 16032699429120.0, + "grad_norm": 49.61569881920644, + "language_loss": 1.03111744, + "learning_rate": 3.8194890170196155e-06, + "loss": 1.14722896, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 20.265625, + "router_z_loss_mlp": 1.83691406, + "step": 377, + "time_per_iteration": 2.7254374027252197 + }, + { + "auxiliary_loss_clip": 0.09738941, + "auxiliary_loss_mlp": 0.01853994, + "balance_loss_clip": 0.07719769, + "balance_loss_mlp": 0.01670221, + "epoch": 0.02272658950849241, + "flos": 20409553451520.0, + "grad_norm": 48.84797020114705, + "language_loss": 1.2001133, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.31604266, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 20.171875, + "router_z_loss_mlp": 1.83691406, + "step": 378, + "time_per_iteration": 2.648167848587036 + }, + { + "auxiliary_loss_clip": 0.08046754, + "auxiliary_loss_mlp": 0.0138253, + "balance_loss_clip": 0.07155026, + "balance_loss_mlp": 0.01272953, + "epoch": 0.02278671276116038, + "flos": 69867387469440.0, + "grad_norm": 0.9915915427532991, + "language_loss": 0.75403833, + "learning_rate": 3.822895650276492e-06, + "loss": 0.84833115, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 8.90625, + "router_z_loss_mlp": 1.09863281, + "step": 379, + "time_per_iteration": 3.301997661590576 + }, + { + "auxiliary_loss_clip": 0.09709425, + "auxiliary_loss_mlp": 0.01844372, + "balance_loss_clip": 0.07733691, + "balance_loss_mlp": 0.0167643, + "epoch": 0.022846836013828347, + "flos": 38517935823360.0, + "grad_norm": 57.599828595547535, + "language_loss": 1.02933359, + "learning_rate": 3.824592231451859e-06, + "loss": 1.14487147, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 19.75, + "router_z_loss_mlp": 1.6796875, + "step": 380, + "time_per_iteration": 2.817310094833374 + }, + { + "auxiliary_loss_clip": 0.09699684, + "auxiliary_loss_mlp": 0.01850822, + "balance_loss_clip": 0.07715706, + "balance_loss_mlp": 0.01682976, + "epoch": 0.02290695926649632, + "flos": 20965768853760.0, + "grad_norm": 97.98649595332142, + "language_loss": 1.19140625, + "learning_rate": 3.826284353801652e-06, + "loss": 1.30691135, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 19.875, + "router_z_loss_mlp": 1.6796875, + "step": 381, + "time_per_iteration": 2.6415421962738037 + }, + { + "auxiliary_loss_clip": 0.09691618, + "auxiliary_loss_mlp": 0.01878712, + "balance_loss_clip": 0.0772172, + "balance_loss_mlp": 0.01696942, + "epoch": 0.022967082519164288, + "flos": 24028895836800.0, + "grad_norm": 71.67825440631948, + "language_loss": 1.08586979, + "learning_rate": 3.827972040701142e-06, + "loss": 1.20157313, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 19.703125, + "router_z_loss_mlp": 1.81640625, + "step": 382, + "time_per_iteration": 2.688380718231201 + }, + { + "auxiliary_loss_clip": 0.0969088, + "auxiliary_loss_mlp": 0.0187998, + "balance_loss_clip": 0.07735589, + "balance_loss_mlp": 0.01704695, + "epoch": 0.023027205771832256, + "flos": 21003643699200.0, + "grad_norm": 97.39739491884717, + "language_loss": 1.06533158, + "learning_rate": 3.829655315342268e-06, + "loss": 1.18104029, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 19.53125, + "router_z_loss_mlp": 1.75292969, + "step": 383, + "time_per_iteration": 2.697038173675537 + }, + { + "auxiliary_loss_clip": 0.09652471, + "auxiliary_loss_mlp": 0.01917586, + "balance_loss_clip": 0.07717164, + "balance_loss_mlp": 0.017485, + "epoch": 0.023087329024500225, + "flos": 21367172638080.0, + "grad_norm": 19.8768776799836, + "language_loss": 1.04799581, + "learning_rate": 3.831334200735543e-06, + "loss": 1.16369653, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 19.34375, + "router_z_loss_mlp": 1.68945312, + "step": 384, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.09638548, + "auxiliary_loss_mlp": 0.01934173, + "balance_loss_clip": 0.07711613, + "balance_loss_mlp": 0.01771858, + "epoch": 0.023147452277168194, + "flos": 21879014503680.0, + "grad_norm": 73.36535290584087, + "language_loss": 1.05852127, + "learning_rate": 3.8330087197119426e-06, + "loss": 1.17424858, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 19.265625, + "router_z_loss_mlp": 1.62402344, + "step": 385, + "time_per_iteration": 2.6939914226531982 + }, + { + "auxiliary_loss_clip": 0.09652182, + "auxiliary_loss_mlp": 0.01965061, + "balance_loss_clip": 0.07710169, + "balance_loss_mlp": 0.01799503, + "epoch": 0.023207575529836166, + "flos": 18922719876480.0, + "grad_norm": 50.36598663544367, + "language_loss": 0.83061486, + "learning_rate": 3.83467889492477e-06, + "loss": 0.9467873, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 19.390625, + "router_z_loss_mlp": 1.65527344, + "step": 386, + "time_per_iteration": 2.655557870864868 + }, + { + "auxiliary_loss_clip": 0.09622966, + "auxiliary_loss_mlp": 0.01950141, + "balance_loss_clip": 0.07707699, + "balance_loss_mlp": 0.01772281, + "epoch": 0.023267698782504134, + "flos": 25052998838400.0, + "grad_norm": 988.1002722416383, + "language_loss": 1.04901791, + "learning_rate": 3.836344748851495e-06, + "loss": 1.16474891, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 19.171875, + "router_z_loss_mlp": 1.77832031, + "step": 387, + "time_per_iteration": 2.7180447578430176 + }, + { + "auxiliary_loss_clip": 0.09642081, + "auxiliary_loss_mlp": 0.01949741, + "balance_loss_clip": 0.0771786, + "balance_loss_mlp": 0.0177932, + "epoch": 0.023327822035172103, + "flos": 28887221819520.0, + "grad_norm": 25.325317169555962, + "language_loss": 1.03613186, + "learning_rate": 3.838006303795566e-06, + "loss": 1.15205002, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 1.70410156, + "step": 388, + "time_per_iteration": 2.7562358379364014 + }, + { + "auxiliary_loss_clip": 0.09633669, + "auxiliary_loss_mlp": 0.01946229, + "balance_loss_clip": 0.0770783, + "balance_loss_mlp": 0.01764268, + "epoch": 0.02338794528784007, + "flos": 27128178656640.0, + "grad_norm": 20.981666659787948, + "language_loss": 1.1374321, + "learning_rate": 3.839663581888206e-06, + "loss": 1.25323105, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 19.25, + "router_z_loss_mlp": 1.8203125, + "step": 389, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.09556312, + "auxiliary_loss_mlp": 0.01957007, + "balance_loss_clip": 0.07663149, + "balance_loss_mlp": 0.01788016, + "epoch": 0.02344806854050804, + "flos": 21328375397760.0, + "grad_norm": 32.87948782751001, + "language_loss": 1.07566035, + "learning_rate": 3.841316605090178e-06, + "loss": 1.19079351, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 18.921875, + "router_z_loss_mlp": 1.68945312, + "step": 390, + "time_per_iteration": 2.659283399581909 + }, + { + "auxiliary_loss_clip": 0.09492537, + "auxiliary_loss_mlp": 0.01896556, + "balance_loss_clip": 0.07636442, + "balance_loss_mlp": 0.01733001, + "epoch": 0.023508191793176012, + "flos": 24796847306880.0, + "grad_norm": 140.16785757024044, + "language_loss": 1.15910161, + "learning_rate": 3.842965395193529e-06, + "loss": 1.27299261, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 18.546875, + "router_z_loss_mlp": 1.63476562, + "step": 391, + "time_per_iteration": 2.713545799255371 + }, + { + "auxiliary_loss_clip": 0.09538671, + "auxiliary_loss_mlp": 0.0188554, + "balance_loss_clip": 0.0766757, + "balance_loss_mlp": 0.01730473, + "epoch": 0.02356831504584398, + "flos": 26002651887360.0, + "grad_norm": 36.4029876381944, + "language_loss": 1.06844151, + "learning_rate": 3.84460997382332e-06, + "loss": 1.18268371, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 18.6875, + "router_z_loss_mlp": 1.54882812, + "step": 392, + "time_per_iteration": 2.738403081893921 + }, + { + "auxiliary_loss_clip": 0.09424435, + "auxiliary_loss_mlp": 0.01937068, + "balance_loss_clip": 0.07618648, + "balance_loss_mlp": 0.01782287, + "epoch": 0.02362843829851195, + "flos": 19068475253760.0, + "grad_norm": 23.190572901307267, + "language_loss": 1.05277753, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.16639256, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 18.0625, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.730311155319214 + }, + { + "auxiliary_loss_clip": 0.09391345, + "auxiliary_loss_mlp": 0.01894272, + "balance_loss_clip": 0.07595266, + "balance_loss_mlp": 0.01726616, + "epoch": 0.023688561551179918, + "flos": 16076611768320.0, + "grad_norm": 91.86478442531423, + "language_loss": 1.00682688, + "learning_rate": 3.84788658233771e-06, + "loss": 1.11968303, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 17.953125, + "router_z_loss_mlp": 1.67578125, + "step": 394, + "time_per_iteration": 2.705462694168091 + }, + { + "auxiliary_loss_clip": 0.09387165, + "auxiliary_loss_mlp": 0.01881808, + "balance_loss_clip": 0.07597888, + "balance_loss_mlp": 0.01708144, + "epoch": 0.023748684803847887, + "flos": 21730575795840.0, + "grad_norm": 29.466731361634597, + "language_loss": 1.02469492, + "learning_rate": 3.84951865465269e-06, + "loss": 1.13738465, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 17.875, + "router_z_loss_mlp": 1.73632812, + "step": 395, + "time_per_iteration": 2.67728328704834 + }, + { + "auxiliary_loss_clip": 0.07807533, + "auxiliary_loss_mlp": 0.01422272, + "balance_loss_clip": 0.06998962, + "balance_loss_mlp": 0.01324949, + "epoch": 0.02380880805651586, + "flos": 61944299349120.0, + "grad_norm": 0.9675883167947973, + "language_loss": 0.63979137, + "learning_rate": 3.851146600358172e-06, + "loss": 0.7320894, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 8.09375, + "router_z_loss_mlp": 0.97216797, + "step": 396, + "time_per_iteration": 3.085773468017578 + }, + { + "auxiliary_loss_clip": 0.09369384, + "auxiliary_loss_mlp": 0.01878876, + "balance_loss_clip": 0.07592572, + "balance_loss_mlp": 0.01705307, + "epoch": 0.023868931309183827, + "flos": 20272518898560.0, + "grad_norm": 448.6329753345253, + "language_loss": 1.09206522, + "learning_rate": 3.852770440269372e-06, + "loss": 1.20454776, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 17.765625, + "router_z_loss_mlp": 1.73632812, + "step": 397, + "time_per_iteration": 2.645312786102295 + }, + { + "auxiliary_loss_clip": 0.09360366, + "auxiliary_loss_mlp": 0.01887806, + "balance_loss_clip": 0.07592075, + "balance_loss_mlp": 0.01703461, + "epoch": 0.023929054561851796, + "flos": 21144954810240.0, + "grad_norm": 35.15382244199787, + "language_loss": 1.09138823, + "learning_rate": 3.854390195044404e-06, + "loss": 1.20386982, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 17.671875, + "router_z_loss_mlp": 1.84277344, + "step": 398, + "time_per_iteration": 2.7186756134033203 + }, + { + "auxiliary_loss_clip": 0.09363802, + "auxiliary_loss_mlp": 0.01863352, + "balance_loss_clip": 0.07595689, + "balance_loss_mlp": 0.01681963, + "epoch": 0.023989177814519765, + "flos": 13703548285440.0, + "grad_norm": 79.14501576371894, + "language_loss": 1.17455924, + "learning_rate": 3.856005885185868e-06, + "loss": 1.2868309, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 17.6875, + "router_z_loss_mlp": 1.81347656, + "step": 399, + "time_per_iteration": 2.6266868114471436 + }, + { + "auxiliary_loss_clip": 0.09350164, + "auxiliary_loss_mlp": 0.01862402, + "balance_loss_clip": 0.07603092, + "balance_loss_mlp": 0.0168683, + "epoch": 0.024049301067187733, + "flos": 26329060667520.0, + "grad_norm": 31.26445557719831, + "language_loss": 1.02793097, + "learning_rate": 3.857617531042398e-06, + "loss": 1.14005673, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.75585938, + "step": 400, + "time_per_iteration": 2.766996145248413 + }, + { + "auxiliary_loss_clip": 0.09326777, + "auxiliary_loss_mlp": 0.01879183, + "balance_loss_clip": 0.07581857, + "balance_loss_mlp": 0.01707522, + "epoch": 0.024109424319855705, + "flos": 24432270192000.0, + "grad_norm": 165.70452294486532, + "language_loss": 0.98901701, + "learning_rate": 3.8592251528102065e-06, + "loss": 1.1010766, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 1.71679688, + "step": 401, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.09325443, + "auxiliary_loss_mlp": 0.01927273, + "balance_loss_clip": 0.0761469, + "balance_loss_mlp": 0.01736538, + "epoch": 0.024169547572523674, + "flos": 29611764074880.0, + "grad_norm": 158.83382742696674, + "language_loss": 1.04086566, + "learning_rate": 3.8608287705345976e-06, + "loss": 1.15339279, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 17.09375, + "router_z_loss_mlp": 1.90722656, + "step": 402, + "time_per_iteration": 2.7297163009643555 + }, + { + "auxiliary_loss_clip": 0.09320071, + "auxiliary_loss_mlp": 0.01914681, + "balance_loss_clip": 0.07593916, + "balance_loss_mlp": 0.01724327, + "epoch": 0.024229670825191642, + "flos": 22608042952320.0, + "grad_norm": 474.9195361774189, + "language_loss": 1.23886442, + "learning_rate": 3.86242840411147e-06, + "loss": 1.35121191, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 17.265625, + "router_z_loss_mlp": 1.90234375, + "step": 403, + "time_per_iteration": 2.6663832664489746 + }, + { + "auxiliary_loss_clip": 0.09310063, + "auxiliary_loss_mlp": 0.01918458, + "balance_loss_clip": 0.07606195, + "balance_loss_mlp": 0.01729535, + "epoch": 0.02428979407785961, + "flos": 18156110071680.0, + "grad_norm": 557.4725363749534, + "language_loss": 1.23195148, + "learning_rate": 3.864024073288798e-06, + "loss": 1.34423661, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 17.0625, + "router_z_loss_mlp": 1.88867188, + "step": 404, + "time_per_iteration": 2.6930551528930664 + }, + { + "auxiliary_loss_clip": 0.09236102, + "auxiliary_loss_mlp": 0.01972168, + "balance_loss_clip": 0.07543309, + "balance_loss_mlp": 0.01765125, + "epoch": 0.024349917330527583, + "flos": 15310463160960.0, + "grad_norm": 32.91094539461264, + "language_loss": 1.10026622, + "learning_rate": 3.865615797668091e-06, + "loss": 1.21234894, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.0703125, + "step": 405, + "time_per_iteration": 2.7313172817230225 + }, + { + "auxiliary_loss_clip": 0.09182028, + "auxiliary_loss_mlp": 0.01998566, + "balance_loss_clip": 0.0751636, + "balance_loss_mlp": 0.01782559, + "epoch": 0.024410040583195552, + "flos": 20779623008640.0, + "grad_norm": 51.884422925202074, + "language_loss": 1.20401216, + "learning_rate": 3.867203596705844e-06, + "loss": 1.31581819, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 2.16015625, + "step": 406, + "time_per_iteration": 2.687269449234009 + }, + { + "auxiliary_loss_clip": 0.09164648, + "auxiliary_loss_mlp": 0.02058169, + "balance_loss_clip": 0.07528092, + "balance_loss_mlp": 0.01824328, + "epoch": 0.02447016383586352, + "flos": 21805319237760.0, + "grad_norm": 51.34272238318618, + "language_loss": 1.09166133, + "learning_rate": 3.86878748971496e-06, + "loss": 1.20388949, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.33789062, + "step": 407, + "time_per_iteration": 2.7443573474884033 + }, + { + "auxiliary_loss_clip": 0.0913244, + "auxiliary_loss_mlp": 0.02070529, + "balance_loss_clip": 0.07525964, + "balance_loss_mlp": 0.01834208, + "epoch": 0.02453028708853149, + "flos": 33956529183360.0, + "grad_norm": 76.90003006133684, + "language_loss": 0.92362475, + "learning_rate": 3.8703674958661596e-06, + "loss": 1.03565443, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 16.0546875, + "router_z_loss_mlp": 2.36132812, + "step": 408, + "time_per_iteration": 2.78354549407959 + }, + { + "auxiliary_loss_clip": 0.09112523, + "auxiliary_loss_mlp": 0.02060747, + "balance_loss_clip": 0.07508834, + "balance_loss_mlp": 0.01828241, + "epoch": 0.024590410341199458, + "flos": 21798485130240.0, + "grad_norm": 96.45423831363296, + "language_loss": 1.18704772, + "learning_rate": 3.871943634189376e-06, + "loss": 1.29878044, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 16.015625, + "router_z_loss_mlp": 2.32421875, + "step": 409, + "time_per_iteration": 2.7200136184692383 + }, + { + "auxiliary_loss_clip": 0.09154539, + "auxiliary_loss_mlp": 0.02068674, + "balance_loss_clip": 0.07541502, + "balance_loss_mlp": 0.01836741, + "epoch": 0.02465053359386743, + "flos": 35123243034240.0, + "grad_norm": 76.46793311342431, + "language_loss": 1.05106175, + "learning_rate": 3.873515923575128e-06, + "loss": 1.16329384, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 16.1171875, + "router_z_loss_mlp": 2.3203125, + "step": 410, + "time_per_iteration": 2.7935402393341064 + }, + { + "auxiliary_loss_clip": 0.09179245, + "auxiliary_loss_mlp": 0.02052485, + "balance_loss_clip": 0.07555975, + "balance_loss_mlp": 0.01831042, + "epoch": 0.0247106568465354, + "flos": 27458360870400.0, + "grad_norm": 178.4501833385731, + "language_loss": 1.0301317, + "learning_rate": 3.875084382775879e-06, + "loss": 1.14244902, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 16.25, + "router_z_loss_mlp": 2.21679688, + "step": 411, + "time_per_iteration": 2.810314416885376 + }, + { + "auxiliary_loss_clip": 0.09117973, + "auxiliary_loss_mlp": 0.02147569, + "balance_loss_clip": 0.07523946, + "balance_loss_mlp": 0.01899232, + "epoch": 0.024770780099203367, + "flos": 20709994665600.0, + "grad_norm": 31.381834451084366, + "language_loss": 1.07807076, + "learning_rate": 3.87664903040738e-06, + "loss": 1.19072616, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 15.9375, + "router_z_loss_mlp": 2.48242188, + "step": 412, + "time_per_iteration": 4.135298252105713 + }, + { + "auxiliary_loss_clip": 0.0766484, + "auxiliary_loss_mlp": 0.01383218, + "balance_loss_clip": 0.06950212, + "balance_loss_mlp": 0.01289853, + "epoch": 0.024830903351871336, + "flos": 69571264740480.0, + "grad_norm": 0.8458100626859368, + "language_loss": 0.58554661, + "learning_rate": 3.878209884949994e-06, + "loss": 0.67602718, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.93261719, + "step": 413, + "time_per_iteration": 4.813804864883423 + }, + { + "auxiliary_loss_clip": 0.09105721, + "auxiliary_loss_mlp": 0.02060854, + "balance_loss_clip": 0.07511897, + "balance_loss_mlp": 0.01837503, + "epoch": 0.024891026604539304, + "flos": 32278728153600.0, + "grad_norm": 48.89104730966055, + "language_loss": 0.9726972, + "learning_rate": 3.879766964750006e-06, + "loss": 1.08436298, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.234375, + "step": 414, + "time_per_iteration": 2.777872323989868 + }, + { + "auxiliary_loss_clip": 0.0905456, + "auxiliary_loss_mlp": 0.02077859, + "balance_loss_clip": 0.07483284, + "balance_loss_mlp": 0.0185365, + "epoch": 0.024951149857207276, + "flos": 18845712374400.0, + "grad_norm": 208.18956686369972, + "language_loss": 1.01095724, + "learning_rate": 3.881320288020917e-06, + "loss": 1.12228131, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 15.71875, + "router_z_loss_mlp": 2.24023438, + "step": 415, + "time_per_iteration": 4.142550230026245 + }, + { + "auxiliary_loss_clip": 0.09080397, + "auxiliary_loss_mlp": 0.02074643, + "balance_loss_clip": 0.07484584, + "balance_loss_mlp": 0.0184805, + "epoch": 0.025011273109875245, + "flos": 15382565199360.0, + "grad_norm": 178.52142115782007, + "language_loss": 1.28543544, + "learning_rate": 3.882869872844723e-06, + "loss": 1.39698577, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.26757812, + "step": 416, + "time_per_iteration": 2.6912667751312256 + }, + { + "auxiliary_loss_clip": 0.09093624, + "auxiliary_loss_mlp": 0.02048458, + "balance_loss_clip": 0.07498566, + "balance_loss_mlp": 0.01806797, + "epoch": 0.025071396362543213, + "flos": 18921336284160.0, + "grad_norm": 52.83271193802728, + "language_loss": 0.94415307, + "learning_rate": 3.884415737173176e-06, + "loss": 1.05557394, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 15.9609375, + "router_z_loss_mlp": 2.41796875, + "step": 417, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0906695, + "auxiliary_loss_mlp": 0.02050523, + "balance_loss_clip": 0.07510033, + "balance_loss_mlp": 0.01817826, + "epoch": 0.025131519615211182, + "flos": 25345012717440.0, + "grad_norm": 47.28632079324067, + "language_loss": 0.95738804, + "learning_rate": 3.8859578988290344e-06, + "loss": 1.06856275, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 15.5625, + "router_z_loss_mlp": 2.328125, + "step": 418, + "time_per_iteration": 2.7193026542663574 + }, + { + "auxiliary_loss_clip": 0.09048779, + "auxiliary_loss_mlp": 0.02107992, + "balance_loss_clip": 0.07468801, + "balance_loss_mlp": 0.01844969, + "epoch": 0.02519164286787915, + "flos": 18959169202560.0, + "grad_norm": 64.96228222580599, + "language_loss": 1.10502434, + "learning_rate": 3.887496375507294e-06, + "loss": 1.21659207, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 15.7890625, + "router_z_loss_mlp": 2.62890625, + "step": 419, + "time_per_iteration": 2.661895513534546 + }, + { + "auxiliary_loss_clip": 0.09047179, + "auxiliary_loss_mlp": 0.02074314, + "balance_loss_clip": 0.07473344, + "balance_loss_mlp": 0.01826931, + "epoch": 0.025251766120547123, + "flos": 17426913914880.0, + "grad_norm": 60.48178105720379, + "language_loss": 0.91689897, + "learning_rate": 3.8890311847764065e-06, + "loss": 1.02811384, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 15.7265625, + "router_z_loss_mlp": 2.47070312, + "step": 420, + "time_per_iteration": 2.690960168838501 + }, + { + "auxiliary_loss_clip": 0.09091747, + "auxiliary_loss_mlp": 0.02038651, + "balance_loss_clip": 0.07504605, + "balance_loss_mlp": 0.01800423, + "epoch": 0.02531188937321509, + "flos": 25052328005760.0, + "grad_norm": 83.61542449738408, + "language_loss": 0.95396888, + "learning_rate": 3.890562344079484e-06, + "loss": 1.06527293, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 15.875, + "router_z_loss_mlp": 2.38085938, + "step": 421, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.0910122, + "auxiliary_loss_mlp": 0.02078743, + "balance_loss_clip": 0.07504999, + "balance_loss_mlp": 0.0184185, + "epoch": 0.02537201262588306, + "flos": 30600214364160.0, + "grad_norm": 131.53322969932037, + "language_loss": 1.06396794, + "learning_rate": 3.89208987073549e-06, + "loss": 1.17576766, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 15.96875, + "router_z_loss_mlp": 2.36914062, + "step": 422, + "time_per_iteration": 2.779984712600708 + }, + { + "auxiliary_loss_clip": 0.09149099, + "auxiliary_loss_mlp": 0.02005588, + "balance_loss_clip": 0.07524605, + "balance_loss_mlp": 0.01778041, + "epoch": 0.02543213587855103, + "flos": 26072154449280.0, + "grad_norm": 215.69560731113194, + "language_loss": 1.02335918, + "learning_rate": 3.893613781940409e-06, + "loss": 1.13490605, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 16.2265625, + "router_z_loss_mlp": 2.27148438, + "step": 423, + "time_per_iteration": 2.72013783454895 + }, + { + "auxiliary_loss_clip": 0.09173086, + "auxiliary_loss_mlp": 0.0200403, + "balance_loss_clip": 0.07535084, + "balance_loss_mlp": 0.01785067, + "epoch": 0.025492259131218997, + "flos": 36030744679680.0, + "grad_norm": 27.081185373152007, + "language_loss": 0.91272038, + "learning_rate": 3.895134094768415e-06, + "loss": 1.02449155, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 16.375, + "router_z_loss_mlp": 2.18945312, + "step": 424, + "time_per_iteration": 2.8317928314208984 + }, + { + "auxiliary_loss_clip": 0.09242675, + "auxiliary_loss_mlp": 0.01968499, + "balance_loss_clip": 0.07578178, + "balance_loss_mlp": 0.01753446, + "epoch": 0.02555238238388697, + "flos": 18593963182080.0, + "grad_norm": 166.26721899755887, + "language_loss": 1.05789995, + "learning_rate": 3.896650826173015e-06, + "loss": 1.17001164, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 16.625, + "router_z_loss_mlp": 2.15332031, + "step": 425, + "time_per_iteration": 2.660106897354126 + }, + { + "auxiliary_loss_clip": 0.0923897, + "auxiliary_loss_mlp": 0.01943853, + "balance_loss_clip": 0.07566722, + "balance_loss_mlp": 0.01731852, + "epoch": 0.025612505636554938, + "flos": 24250023561600.0, + "grad_norm": 44.6180367993383, + "language_loss": 1.08164155, + "learning_rate": 3.898163992988186e-06, + "loss": 1.19346988, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 16.703125, + "router_z_loss_mlp": 2.12109375, + "step": 426, + "time_per_iteration": 2.713566303253174 + }, + { + "auxiliary_loss_clip": 0.07567823, + "auxiliary_loss_mlp": 0.0137553, + "balance_loss_clip": 0.06925757, + "balance_loss_mlp": 0.01282499, + "epoch": 0.025672628889222907, + "flos": 60606617241600.0, + "grad_norm": 0.882551554014491, + "language_loss": 0.57127881, + "learning_rate": 3.899673611929491e-06, + "loss": 0.66071236, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.92919922, + "step": 427, + "time_per_iteration": 3.3642380237579346 + }, + { + "auxiliary_loss_clip": 0.09344095, + "auxiliary_loss_mlp": 0.01954303, + "balance_loss_clip": 0.0761513, + "balance_loss_mlp": 0.01743541, + "epoch": 0.025732752141890875, + "flos": 19579352797440.0, + "grad_norm": 32.1114157010126, + "language_loss": 1.08901465, + "learning_rate": 3.901179699595194e-06, + "loss": 1.20199859, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.10839844, + "step": 428, + "time_per_iteration": 2.6606802940368652 + }, + { + "auxiliary_loss_clip": 0.09310514, + "auxiliary_loss_mlp": 0.01961632, + "balance_loss_clip": 0.07603246, + "balance_loss_mlp": 0.01752969, + "epoch": 0.025792875394558847, + "flos": 31292164581120.0, + "grad_norm": 36.551830180207176, + "language_loss": 1.00762367, + "learning_rate": 3.902682272467353e-06, + "loss": 1.12034512, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.08984375, + "step": 429, + "time_per_iteration": 2.8459787368774414 + }, + { + "auxiliary_loss_clip": 0.09338318, + "auxiliary_loss_mlp": 0.01955653, + "balance_loss_clip": 0.07623117, + "balance_loss_mlp": 0.01745367, + "epoch": 0.025852998647226816, + "flos": 32387824569600.0, + "grad_norm": 62.5354126598028, + "language_loss": 1.05025983, + "learning_rate": 3.904181346912895e-06, + "loss": 1.16319966, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.10644531, + "step": 430, + "time_per_iteration": 2.8446128368377686 + }, + { + "auxiliary_loss_clip": 0.09278628, + "auxiliary_loss_mlp": 0.01943414, + "balance_loss_clip": 0.07600376, + "balance_loss_mlp": 0.01729219, + "epoch": 0.025913121899894784, + "flos": 20199452538240.0, + "grad_norm": 28.225993864396795, + "language_loss": 1.00378919, + "learning_rate": 3.905676939184698e-06, + "loss": 1.11600959, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 16.78125, + "router_z_loss_mlp": 2.14453125, + "step": 431, + "time_per_iteration": 2.735534906387329 + }, + { + "auxiliary_loss_clip": 0.09339449, + "auxiliary_loss_mlp": 0.01919694, + "balance_loss_clip": 0.07634744, + "balance_loss_mlp": 0.01714844, + "epoch": 0.025973245152562753, + "flos": 14725680716160.0, + "grad_norm": 242.91179280184718, + "language_loss": 1.11488628, + "learning_rate": 3.907169065422638e-06, + "loss": 1.22747779, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 17.046875, + "router_z_loss_mlp": 2.04882812, + "step": 432, + "time_per_iteration": 2.6356372833251953 + }, + { + "auxiliary_loss_clip": 0.09349881, + "auxiliary_loss_mlp": 0.01923388, + "balance_loss_clip": 0.07619249, + "balance_loss_mlp": 0.01717585, + "epoch": 0.02603336840523072, + "flos": 31000947315840.0, + "grad_norm": 39.86728122976192, + "language_loss": 0.95303321, + "learning_rate": 3.908657741654636e-06, + "loss": 1.06576586, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 17.328125, + "router_z_loss_mlp": 2.06054688, + "step": 433, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.09401309, + "auxiliary_loss_mlp": 0.0191169, + "balance_loss_clip": 0.07644869, + "balance_loss_mlp": 0.01712276, + "epoch": 0.026093491657898694, + "flos": 17679753210240.0, + "grad_norm": 1553.0281168066135, + "language_loss": 1.08543563, + "learning_rate": 3.910142983797699e-06, + "loss": 1.19856548, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 17.5625, + "router_z_loss_mlp": 1.99511719, + "step": 434, + "time_per_iteration": 2.668267250061035 + }, + { + "auxiliary_loss_clip": 0.09433939, + "auxiliary_loss_mlp": 0.01869234, + "balance_loss_clip": 0.07651832, + "balance_loss_mlp": 0.01678308, + "epoch": 0.026153614910566662, + "flos": 17863593068160.0, + "grad_norm": 33.64342024905016, + "language_loss": 1.03063393, + "learning_rate": 3.9116248076589305e-06, + "loss": 1.14366555, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.90917969, + "step": 435, + "time_per_iteration": 2.6838159561157227 + }, + { + "auxiliary_loss_clip": 0.09478317, + "auxiliary_loss_mlp": 0.01863685, + "balance_loss_clip": 0.07678007, + "balance_loss_mlp": 0.01671615, + "epoch": 0.02621373816323463, + "flos": 20017289761920.0, + "grad_norm": 41.08687640619308, + "language_loss": 1.07638645, + "learning_rate": 3.913103228936546e-06, + "loss": 1.18980646, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.91992188, + "step": 436, + "time_per_iteration": 2.760547399520874 + }, + { + "auxiliary_loss_clip": 0.09473966, + "auxiliary_loss_mlp": 0.0187601, + "balance_loss_clip": 0.07674257, + "balance_loss_mlp": 0.01688708, + "epoch": 0.0262738614159026, + "flos": 19287213137280.0, + "grad_norm": 53.25711722147742, + "language_loss": 0.98595166, + "learning_rate": 3.914578263220868e-06, + "loss": 1.09945142, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 18.0, + "router_z_loss_mlp": 1.87402344, + "step": 437, + "time_per_iteration": 2.6779754161834717 + }, + { + "auxiliary_loss_clip": 0.0942243, + "auxiliary_loss_mlp": 0.01861842, + "balance_loss_clip": 0.0761686, + "balance_loss_mlp": 0.01679594, + "epoch": 0.026333984668570568, + "flos": 18813204190080.0, + "grad_norm": 25.40915552443808, + "language_loss": 1.10034943, + "learning_rate": 3.916049925995316e-06, + "loss": 1.21319222, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 18.03125, + "router_z_loss_mlp": 1.82421875, + "step": 438, + "time_per_iteration": 2.6451144218444824 + }, + { + "auxiliary_loss_clip": 0.07475804, + "auxiliary_loss_mlp": 0.01367854, + "balance_loss_clip": 0.06865337, + "balance_loss_mlp": 0.01290463, + "epoch": 0.02639410792123854, + "flos": 64593723196800.0, + "grad_norm": 0.9063737016618233, + "language_loss": 0.62703174, + "learning_rate": 3.917518232637377e-06, + "loss": 0.71546829, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.77294922, + "step": 439, + "time_per_iteration": 3.321974992752075 + }, + { + "auxiliary_loss_clip": 0.09522887, + "auxiliary_loss_mlp": 0.0184955, + "balance_loss_clip": 0.07696441, + "balance_loss_mlp": 0.01671499, + "epoch": 0.02645423117390651, + "flos": 28480661009280.0, + "grad_norm": 87.92324241889918, + "language_loss": 0.94047898, + "learning_rate": 3.918983198419573e-06, + "loss": 1.05420327, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 18.25, + "router_z_loss_mlp": 1.78027344, + "step": 440, + "time_per_iteration": 2.7474722862243652 + }, + { + "auxiliary_loss_clip": 0.09507709, + "auxiliary_loss_mlp": 0.01844884, + "balance_loss_clip": 0.07691655, + "balance_loss_mlp": 0.01676846, + "epoch": 0.026514354426574478, + "flos": 18557094585600.0, + "grad_norm": 21.281112340814676, + "language_loss": 1.01854694, + "learning_rate": 3.920444838510415e-06, + "loss": 1.13207293, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 1.68066406, + "step": 441, + "time_per_iteration": 2.6456263065338135 + }, + { + "auxiliary_loss_clip": 0.09501958, + "auxiliary_loss_mlp": 0.01843855, + "balance_loss_clip": 0.07712354, + "balance_loss_mlp": 0.01682208, + "epoch": 0.026574477679242446, + "flos": 20674090391040.0, + "grad_norm": 41.33053095224922, + "language_loss": 0.97709602, + "learning_rate": 3.92190316797534e-06, + "loss": 1.09055424, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 17.890625, + "router_z_loss_mlp": 1.61621094, + "step": 442, + "time_per_iteration": 2.672673463821411 + }, + { + "auxiliary_loss_clip": 0.07433579, + "auxiliary_loss_mlp": 0.01330966, + "balance_loss_clip": 0.06849352, + "balance_loss_mlp": 0.01265354, + "epoch": 0.026634600931910415, + "flos": 57974718896640.0, + "grad_norm": 0.9677279434812149, + "language_loss": 0.64635992, + "learning_rate": 3.92335820177765e-06, + "loss": 0.73400539, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.65625, + "step": 443, + "time_per_iteration": 3.173064947128296 + }, + { + "auxiliary_loss_clip": 0.09527416, + "auxiliary_loss_mlp": 0.01860056, + "balance_loss_clip": 0.07710861, + "balance_loss_mlp": 0.01695928, + "epoch": 0.026694724184578387, + "flos": 15820586017920.0, + "grad_norm": 61.63283491372988, + "language_loss": 1.0548501, + "learning_rate": 3.924809954779425e-06, + "loss": 1.16872489, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 18.15625, + "router_z_loss_mlp": 1.64160156, + "step": 444, + "time_per_iteration": 2.639677047729492 + }, + { + "auxiliary_loss_clip": 0.09502187, + "auxiliary_loss_mlp": 0.01838362, + "balance_loss_clip": 0.07703182, + "balance_loss_mlp": 0.01668608, + "epoch": 0.026754847437246355, + "flos": 23446922503680.0, + "grad_norm": 26.361183363910182, + "language_loss": 1.13923943, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.2526449, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 1.69824219, + "step": 445, + "time_per_iteration": 2.6820874214172363 + }, + { + "auxiliary_loss_clip": 0.09478995, + "auxiliary_loss_mlp": 0.01847369, + "balance_loss_clip": 0.07693952, + "balance_loss_mlp": 0.01688678, + "epoch": 0.026814970689914324, + "flos": 17346552249600.0, + "grad_norm": 24.407324377890284, + "language_loss": 1.13474417, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.24800777, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 17.84375, + "router_z_loss_mlp": 1.5859375, + "step": 446, + "time_per_iteration": 2.6508054733276367 + }, + { + "auxiliary_loss_clip": 0.09462097, + "auxiliary_loss_mlp": 0.01860509, + "balance_loss_clip": 0.07703365, + "balance_loss_mlp": 0.01698385, + "epoch": 0.026875093942582293, + "flos": 17900503591680.0, + "grad_norm": 17.536194577693298, + "language_loss": 0.97970635, + "learning_rate": 3.92914567610317e-06, + "loss": 1.09293234, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 1.62109375, + "step": 447, + "time_per_iteration": 2.6584267616271973 + }, + { + "auxiliary_loss_clip": 0.0948635, + "auxiliary_loss_mlp": 0.01891451, + "balance_loss_clip": 0.0770483, + "balance_loss_mlp": 0.01723413, + "epoch": 0.026935217195250265, + "flos": 21730114598400.0, + "grad_norm": 21.562911901589327, + "language_loss": 1.05652094, + "learning_rate": 3.930584452530952e-06, + "loss": 1.17029905, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 17.8125, + "router_z_loss_mlp": 1.67871094, + "step": 448, + "time_per_iteration": 2.672372341156006 + }, + { + "auxiliary_loss_clip": 0.09413482, + "auxiliary_loss_mlp": 0.01902533, + "balance_loss_clip": 0.07671943, + "balance_loss_mlp": 0.01741266, + "epoch": 0.026995340447918233, + "flos": 23629378769280.0, + "grad_norm": 23.02833788504926, + "language_loss": 1.03788567, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.1510458, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 17.421875, + "router_z_loss_mlp": 1.61328125, + "step": 449, + "time_per_iteration": 2.7325220108032227 + }, + { + "auxiliary_loss_clip": 0.09437311, + "auxiliary_loss_mlp": 0.01924822, + "balance_loss_clip": 0.07667883, + "balance_loss_mlp": 0.0175955, + "epoch": 0.027055463700586202, + "flos": 17937078698880.0, + "grad_norm": 25.829396596685555, + "language_loss": 1.03924859, + "learning_rate": 3.933452395729493e-06, + "loss": 1.15286994, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 17.703125, + "router_z_loss_mlp": 1.65332031, + "step": 450, + "time_per_iteration": 2.7811074256896973 + }, + { + "auxiliary_loss_clip": 0.09359707, + "auxiliary_loss_mlp": 0.01970194, + "balance_loss_clip": 0.0764256, + "balance_loss_mlp": 0.01786802, + "epoch": 0.02711558695325417, + "flos": 25125897490560.0, + "grad_norm": 13.607653987068408, + "language_loss": 0.94443107, + "learning_rate": 3.934881590952304e-06, + "loss": 1.05773008, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 17.171875, + "router_z_loss_mlp": 1.83398438, + "step": 451, + "time_per_iteration": 2.7412643432617188 + }, + { + "auxiliary_loss_clip": 0.09335385, + "auxiliary_loss_mlp": 0.02017307, + "balance_loss_clip": 0.07637483, + "balance_loss_mlp": 0.0183115, + "epoch": 0.02717571020592214, + "flos": 24245788930560.0, + "grad_norm": 37.22783951143226, + "language_loss": 0.88836813, + "learning_rate": 3.936307620734599e-06, + "loss": 1.00189495, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 16.984375, + "router_z_loss_mlp": 1.86132812, + "step": 452, + "time_per_iteration": 4.115676403045654 + }, + { + "auxiliary_loss_clip": 0.09290475, + "auxiliary_loss_mlp": 0.0203207, + "balance_loss_clip": 0.07611442, + "balance_loss_mlp": 0.01843815, + "epoch": 0.02723583345859011, + "flos": 25125939417600.0, + "grad_norm": 26.908598142012707, + "language_loss": 0.85555518, + "learning_rate": 3.937730499067294e-06, + "loss": 0.96878058, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 1.88378906, + "step": 453, + "time_per_iteration": 4.138639211654663 + }, + { + "auxiliary_loss_clip": 0.09325944, + "auxiliary_loss_mlp": 0.02084866, + "balance_loss_clip": 0.07637945, + "balance_loss_mlp": 0.01890889, + "epoch": 0.02729595671125808, + "flos": 42751550090880.0, + "grad_norm": 24.937148454808558, + "language_loss": 1.02160192, + "learning_rate": 3.939150239848748e-06, + "loss": 1.13570988, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 1.94140625, + "step": 454, + "time_per_iteration": 2.851925849914551 + }, + { + "auxiliary_loss_clip": 0.09296365, + "auxiliary_loss_mlp": 0.02123722, + "balance_loss_clip": 0.07621342, + "balance_loss_mlp": 0.01917728, + "epoch": 0.02735607996392605, + "flos": 21436884835200.0, + "grad_norm": 33.11607572615514, + "language_loss": 0.89587128, + "learning_rate": 3.9405668568855866e-06, + "loss": 1.01007211, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.0625, + "step": 455, + "time_per_iteration": 4.109623432159424 + }, + { + "auxiliary_loss_clip": 0.09291606, + "auxiliary_loss_mlp": 0.02163595, + "balance_loss_clip": 0.07605162, + "balance_loss_mlp": 0.01945966, + "epoch": 0.027416203216594017, + "flos": 20857762540800.0, + "grad_norm": 21.694013226548094, + "language_loss": 0.99008209, + "learning_rate": 3.941980363893499e-06, + "loss": 1.10463405, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.17773438, + "step": 456, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.09230845, + "auxiliary_loss_mlp": 0.02187109, + "balance_loss_clip": 0.07574348, + "balance_loss_mlp": 0.01970243, + "epoch": 0.027476326469261986, + "flos": 13229497411200.0, + "grad_norm": 28.08353344684151, + "language_loss": 0.97085631, + "learning_rate": 3.9433907744980384e-06, + "loss": 1.0850358, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.16894531, + "step": 457, + "time_per_iteration": 2.6582846641540527 + }, + { + "auxiliary_loss_clip": 0.09249748, + "auxiliary_loss_mlp": 0.02209668, + "balance_loss_clip": 0.07581042, + "balance_loss_mlp": 0.01978497, + "epoch": 0.027536449721929958, + "flos": 24031369532160.0, + "grad_norm": 45.18041952436337, + "language_loss": 1.10011601, + "learning_rate": 3.944798102235412e-06, + "loss": 1.21471024, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 2.31054688, + "step": 458, + "time_per_iteration": 2.723140239715576 + }, + { + "auxiliary_loss_clip": 0.09220205, + "auxiliary_loss_mlp": 0.02210297, + "balance_loss_clip": 0.07555029, + "balance_loss_mlp": 0.01976265, + "epoch": 0.027596572974597926, + "flos": 13011094944000.0, + "grad_norm": 45.239920259124276, + "language_loss": 1.02681351, + "learning_rate": 3.9462023605532545e-06, + "loss": 1.14111853, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 16.640625, + "router_z_loss_mlp": 2.33984375, + "step": 459, + "time_per_iteration": 2.671720027923584 + }, + { + "auxiliary_loss_clip": 0.09208341, + "auxiliary_loss_mlp": 0.02210187, + "balance_loss_clip": 0.07567435, + "balance_loss_mlp": 0.0198264, + "epoch": 0.027656696227265895, + "flos": 26150671324800.0, + "grad_norm": 19.623434288041715, + "language_loss": 0.97685856, + "learning_rate": 3.947603562811407e-06, + "loss": 1.09104395, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 16.40625, + "router_z_loss_mlp": 2.2734375, + "step": 460, + "time_per_iteration": 2.757227897644043 + }, + { + "auxiliary_loss_clip": 0.07349286, + "auxiliary_loss_mlp": 0.01457289, + "balance_loss_clip": 0.06801966, + "balance_loss_mlp": 0.01381853, + "epoch": 0.027716819479933864, + "flos": 60717055322880.0, + "grad_norm": 1.34871546657126, + "language_loss": 0.73767412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.75292969, + "step": 461, + "time_per_iteration": 3.225203514099121 + }, + { + "auxiliary_loss_clip": 0.09153335, + "auxiliary_loss_mlp": 0.02158036, + "balance_loss_clip": 0.07562718, + "balance_loss_mlp": 0.01941456, + "epoch": 0.027776942732601832, + "flos": 31219936761600.0, + "grad_norm": 25.337070845847826, + "language_loss": 1.02236819, + "learning_rate": 3.950396852153582e-06, + "loss": 1.13548183, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 15.921875, + "router_z_loss_mlp": 2.16503906, + "step": 462, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0917296, + "auxiliary_loss_mlp": 0.02143298, + "balance_loss_clip": 0.07564321, + "balance_loss_mlp": 0.01926432, + "epoch": 0.027837065985269804, + "flos": 22681277020800.0, + "grad_norm": 25.879214952659087, + "language_loss": 1.11945248, + "learning_rate": 3.951788965525118e-06, + "loss": 1.23261511, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 16.09375, + "router_z_loss_mlp": 2.16796875, + "step": 463, + "time_per_iteration": 2.6517393589019775 + }, + { + "auxiliary_loss_clip": 0.07315847, + "auxiliary_loss_mlp": 0.01337025, + "balance_loss_clip": 0.06773283, + "balance_loss_mlp": 0.01272986, + "epoch": 0.027897189237937773, + "flos": 62200786296960.0, + "grad_norm": 0.9076693638551637, + "language_loss": 0.58966231, + "learning_rate": 3.953178075413476e-06, + "loss": 0.67619097, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.64013672, + "step": 464, + "time_per_iteration": 3.2396233081817627 + }, + { + "auxiliary_loss_clip": 0.09172998, + "auxiliary_loss_mlp": 0.02120585, + "balance_loss_clip": 0.07578301, + "balance_loss_mlp": 0.01918502, + "epoch": 0.02795731249060574, + "flos": 24499131350400.0, + "grad_norm": 45.20349334546378, + "language_loss": 1.03495145, + "learning_rate": 3.954564194750784e-06, + "loss": 1.14788723, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 15.953125, + "router_z_loss_mlp": 2.02148438, + "step": 465, + "time_per_iteration": 2.725616931915283 + }, + { + "auxiliary_loss_clip": 0.09135859, + "auxiliary_loss_mlp": 0.0204377, + "balance_loss_clip": 0.07563674, + "balance_loss_mlp": 0.01849125, + "epoch": 0.02801743574327371, + "flos": 23739858777600.0, + "grad_norm": 33.78948466858622, + "language_loss": 0.95100033, + "learning_rate": 3.955947336385828e-06, + "loss": 1.06279659, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 15.703125, + "router_z_loss_mlp": 1.94628906, + "step": 466, + "time_per_iteration": 2.7096307277679443 + }, + { + "auxiliary_loss_clip": 0.09162845, + "auxiliary_loss_mlp": 0.02091556, + "balance_loss_clip": 0.07588789, + "balance_loss_mlp": 0.0189424, + "epoch": 0.02807755899594168, + "flos": 20634999661440.0, + "grad_norm": 17.071922366982022, + "language_loss": 1.01469541, + "learning_rate": 3.957327513084761e-06, + "loss": 1.12723947, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 15.75, + "router_z_loss_mlp": 1.97265625, + "step": 467, + "time_per_iteration": 2.697120189666748 + }, + { + "auxiliary_loss_clip": 0.0908498, + "auxiliary_loss_mlp": 0.02113688, + "balance_loss_clip": 0.07555597, + "balance_loss_mlp": 0.01908934, + "epoch": 0.02813768224860965, + "flos": 19250554176000.0, + "grad_norm": 23.52868546244156, + "language_loss": 1.03801823, + "learning_rate": 3.958704737531818e-06, + "loss": 1.15000498, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 15.2734375, + "router_z_loss_mlp": 2.04882812, + "step": 468, + "time_per_iteration": 2.6348235607147217 + }, + { + "auxiliary_loss_clip": 0.09087479, + "auxiliary_loss_mlp": 0.02120186, + "balance_loss_clip": 0.07563758, + "balance_loss_mlp": 0.01912189, + "epoch": 0.02819780550127762, + "flos": 20820306965760.0, + "grad_norm": 34.78387665912523, + "language_loss": 1.11076498, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.2228415, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 15.2265625, + "router_z_loss_mlp": 2.08300781, + "step": 469, + "time_per_iteration": 2.6886401176452637 + }, + { + "auxiliary_loss_clip": 0.09051213, + "auxiliary_loss_mlp": 0.02126417, + "balance_loss_clip": 0.07552808, + "balance_loss_mlp": 0.01921949, + "epoch": 0.028257928753945588, + "flos": 19980211530240.0, + "grad_norm": 43.4409759227761, + "language_loss": 1.05499089, + "learning_rate": 3.96145038000181e-06, + "loss": 1.16676712, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 15.0078125, + "router_z_loss_mlp": 2.046875, + "step": 470, + "time_per_iteration": 2.649240255355835 + }, + { + "auxiliary_loss_clip": 0.09054536, + "auxiliary_loss_mlp": 0.02164254, + "balance_loss_clip": 0.0753805, + "balance_loss_mlp": 0.0194281, + "epoch": 0.028318052006613557, + "flos": 20490585949440.0, + "grad_norm": 34.229925481391405, + "language_loss": 1.11025834, + "learning_rate": 3.962818822989861e-06, + "loss": 1.2224462, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 15.1796875, + "router_z_loss_mlp": 2.21484375, + "step": 471, + "time_per_iteration": 2.694502592086792 + }, + { + "auxiliary_loss_clip": 0.0901389, + "auxiliary_loss_mlp": 0.02100335, + "balance_loss_clip": 0.07527161, + "balance_loss_mlp": 0.01902638, + "epoch": 0.02837817525928153, + "flos": 28522854339840.0, + "grad_norm": 28.640745518781863, + "language_loss": 0.93263328, + "learning_rate": 3.964184363657625e-06, + "loss": 1.04377556, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 14.859375, + "router_z_loss_mlp": 1.9765625, + "step": 472, + "time_per_iteration": 2.723616123199463 + }, + { + "auxiliary_loss_clip": 0.09058346, + "auxiliary_loss_mlp": 0.02156495, + "balance_loss_clip": 0.07551048, + "balance_loss_mlp": 0.01941347, + "epoch": 0.028438298511949497, + "flos": 18557597710080.0, + "grad_norm": 31.883678895195217, + "language_loss": 1.09761989, + "learning_rate": 3.965547014290071e-06, + "loss": 1.2097683, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 2.15136719, + "step": 473, + "time_per_iteration": 2.678131580352783 + }, + { + "auxiliary_loss_clip": 0.09018995, + "auxiliary_loss_mlp": 0.02143272, + "balance_loss_clip": 0.07526669, + "balance_loss_mlp": 0.01926216, + "epoch": 0.028498421764617466, + "flos": 16915952517120.0, + "grad_norm": 82.06010961294956, + "language_loss": 1.11515367, + "learning_rate": 3.96690678709433e-06, + "loss": 1.22677636, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.171875, + "step": 474, + "time_per_iteration": 2.6410977840423584 + }, + { + "auxiliary_loss_clip": 0.08995185, + "auxiliary_loss_mlp": 0.02205209, + "balance_loss_clip": 0.0752454, + "balance_loss_mlp": 0.01985291, + "epoch": 0.028558545017285435, + "flos": 27785524337280.0, + "grad_norm": 24.826629982331372, + "language_loss": 0.97130352, + "learning_rate": 3.968263694200355e-06, + "loss": 1.0833075, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 14.6953125, + "router_z_loss_mlp": 2.19726562, + "step": 475, + "time_per_iteration": 2.7301735877990723 + }, + { + "auxiliary_loss_clip": 0.07259832, + "auxiliary_loss_mlp": 0.01404773, + "balance_loss_clip": 0.06728013, + "balance_loss_mlp": 0.01346599, + "epoch": 0.028618668269953403, + "flos": 65674205596800.0, + "grad_norm": 0.9437348671950723, + "language_loss": 0.66932654, + "learning_rate": 3.969617747661569e-06, + "loss": 0.75597262, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.58154297, + "step": 476, + "time_per_iteration": 3.247438430786133 + }, + { + "auxiliary_loss_clip": 0.08952022, + "auxiliary_loss_mlp": 0.02252624, + "balance_loss_clip": 0.07508352, + "balance_loss_mlp": 0.02028701, + "epoch": 0.028678791522621375, + "flos": 21942269936640.0, + "grad_norm": 144.43661292546363, + "language_loss": 1.05051386, + "learning_rate": 3.970968959455509e-06, + "loss": 1.16256034, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 14.4296875, + "router_z_loss_mlp": 2.24023438, + "step": 477, + "time_per_iteration": 2.6508686542510986 + }, + { + "auxiliary_loss_clip": 0.08993904, + "auxiliary_loss_mlp": 0.02256823, + "balance_loss_clip": 0.0754967, + "balance_loss_mlp": 0.02029467, + "epoch": 0.028738914775289344, + "flos": 24579115672320.0, + "grad_norm": 33.20185721324117, + "language_loss": 1.03065133, + "learning_rate": 3.97231734148446e-06, + "loss": 1.14315856, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 14.453125, + "router_z_loss_mlp": 2.2734375, + "step": 478, + "time_per_iteration": 2.7467830181121826 + }, + { + "auxiliary_loss_clip": 0.08933547, + "auxiliary_loss_mlp": 0.0224041, + "balance_loss_clip": 0.07500903, + "balance_loss_mlp": 0.02019921, + "epoch": 0.028799038027957313, + "flos": 23264633946240.0, + "grad_norm": 28.885721108677235, + "language_loss": 1.00177026, + "learning_rate": 3.973662905576082e-06, + "loss": 1.11350989, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 14.328125, + "router_z_loss_mlp": 2.20507812, + "step": 479, + "time_per_iteration": 2.7295467853546143 + }, + { + "auxiliary_loss_clip": 0.08948811, + "auxiliary_loss_mlp": 0.02267472, + "balance_loss_clip": 0.07523456, + "balance_loss_mlp": 0.02031152, + "epoch": 0.02885916128062528, + "flos": 22170692966400.0, + "grad_norm": 33.357673755660976, + "language_loss": 0.91625684, + "learning_rate": 3.975005663484038e-06, + "loss": 1.02841961, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 14.25, + "router_z_loss_mlp": 2.36328125, + "step": 480, + "time_per_iteration": 2.766277551651001 + }, + { + "auxiliary_loss_clip": 0.08903027, + "auxiliary_loss_mlp": 0.02291788, + "balance_loss_clip": 0.07483099, + "balance_loss_mlp": 0.02045358, + "epoch": 0.02891928453329325, + "flos": 22939986101760.0, + "grad_norm": 22.287574516605755, + "language_loss": 1.01525128, + "learning_rate": 3.976345626888605e-06, + "loss": 1.12719941, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 14.1875, + "router_z_loss_mlp": 2.4609375, + "step": 481, + "time_per_iteration": 2.692387580871582 + }, + { + "auxiliary_loss_clip": 0.07204929, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06688471, + "balance_loss_mlp": 0.01279295, + "epoch": 0.028979407785961222, + "flos": 57449376524160.0, + "grad_norm": 0.8487290952821426, + "language_loss": 0.65879083, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.74420619, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.57275391, + "step": 482, + "time_per_iteration": 3.019406318664551 + }, + { + "auxiliary_loss_clip": 0.08916203, + "auxiliary_loss_mlp": 0.02251093, + "balance_loss_clip": 0.0748857, + "balance_loss_mlp": 0.02018397, + "epoch": 0.02903953103862919, + "flos": 16727584538880.0, + "grad_norm": 104.5991727322302, + "language_loss": 1.06331348, + "learning_rate": 3.979017216545415e-06, + "loss": 1.17498636, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 14.28125, + "router_z_loss_mlp": 2.32421875, + "step": 483, + "time_per_iteration": 2.609882354736328 + }, + { + "auxiliary_loss_clip": 0.08908117, + "auxiliary_loss_mlp": 0.02236577, + "balance_loss_clip": 0.07510938, + "balance_loss_mlp": 0.02016469, + "epoch": 0.02909965429129716, + "flos": 16769232817920.0, + "grad_norm": 23.083678473769563, + "language_loss": 0.94234419, + "learning_rate": 3.980348865796749e-06, + "loss": 1.05379117, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.20507812, + "step": 484, + "time_per_iteration": 2.6507458686828613 + }, + { + "auxiliary_loss_clip": 0.08915585, + "auxiliary_loss_mlp": 0.02232887, + "balance_loss_clip": 0.07503805, + "balance_loss_mlp": 0.02011253, + "epoch": 0.029159777543965128, + "flos": 19790334178560.0, + "grad_norm": 110.91894314268477, + "language_loss": 1.00352454, + "learning_rate": 3.9816777665440615e-06, + "loss": 1.11500931, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 14.125, + "router_z_loss_mlp": 2.21679688, + "step": 485, + "time_per_iteration": 2.7673757076263428 + }, + { + "auxiliary_loss_clip": 0.08880442, + "auxiliary_loss_mlp": 0.02237809, + "balance_loss_clip": 0.07482816, + "balance_loss_mlp": 0.02005876, + "epoch": 0.029219900796633096, + "flos": 19648184526720.0, + "grad_norm": 27.10228237086094, + "language_loss": 1.06272924, + "learning_rate": 3.983003930109732e-06, + "loss": 1.17391181, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.31835938, + "step": 486, + "time_per_iteration": 2.6508092880249023 + }, + { + "auxiliary_loss_clip": 0.08911004, + "auxiliary_loss_mlp": 0.02193732, + "balance_loss_clip": 0.0752122, + "balance_loss_mlp": 0.01974864, + "epoch": 0.02928002404930107, + "flos": 25892926565760.0, + "grad_norm": 15.693662583850747, + "language_loss": 1.04105806, + "learning_rate": 3.984327367746315e-06, + "loss": 1.15210545, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 13.90625, + "router_z_loss_mlp": 2.19042969, + "step": 487, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.0888624, + "auxiliary_loss_mlp": 0.02210903, + "balance_loss_clip": 0.07486838, + "balance_loss_mlp": 0.02002811, + "epoch": 0.029340147301969037, + "flos": 20665243785600.0, + "grad_norm": 49.61563210000309, + "language_loss": 1.12978697, + "learning_rate": 3.985648090637122e-06, + "loss": 1.24075842, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 13.9921875, + "router_z_loss_mlp": 2.08300781, + "step": 488, + "time_per_iteration": 2.674189567565918 + }, + { + "auxiliary_loss_clip": 0.08953497, + "auxiliary_loss_mlp": 0.02211393, + "balance_loss_clip": 0.07543504, + "balance_loss_mlp": 0.02002347, + "epoch": 0.029400270554637006, + "flos": 24435288938880.0, + "grad_norm": 19.90256121713189, + "language_loss": 1.00477099, + "learning_rate": 3.986966109896785e-06, + "loss": 1.11641979, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 14.1015625, + "router_z_loss_mlp": 2.09277344, + "step": 489, + "time_per_iteration": 2.7639148235321045 + }, + { + "auxiliary_loss_clip": 0.0892607, + "auxiliary_loss_mlp": 0.0220073, + "balance_loss_clip": 0.07529595, + "balance_loss_mlp": 0.01982529, + "epoch": 0.029460393807304974, + "flos": 20127140864640.0, + "grad_norm": 27.578366038116485, + "language_loss": 1.02338409, + "learning_rate": 3.988281436571815e-06, + "loss": 1.13465214, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.18359375, + "step": 490, + "time_per_iteration": 2.6444106101989746 + }, + { + "auxiliary_loss_clip": 0.08913176, + "auxiliary_loss_mlp": 0.02195572, + "balance_loss_clip": 0.07533699, + "balance_loss_mlp": 0.0197432, + "epoch": 0.029520517059972943, + "flos": 17681681854080.0, + "grad_norm": 29.015537112342308, + "language_loss": 1.11532688, + "learning_rate": 3.989594081641164e-06, + "loss": 1.22641444, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 13.7890625, + "router_z_loss_mlp": 2.21289062, + "step": 491, + "time_per_iteration": 5.5153045654296875 + }, + { + "auxiliary_loss_clip": 0.08889591, + "auxiliary_loss_mlp": 0.02207651, + "balance_loss_clip": 0.07520857, + "balance_loss_mlp": 0.0199317, + "epoch": 0.029580640312640915, + "flos": 18959211129600.0, + "grad_norm": 14.57626480214455, + "language_loss": 0.9931764, + "learning_rate": 3.9909040560167675e-06, + "loss": 1.10414886, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 13.6875, + "router_z_loss_mlp": 2.14550781, + "step": 492, + "time_per_iteration": 4.12203049659729 + }, + { + "auxiliary_loss_clip": 0.08912461, + "auxiliary_loss_mlp": 0.02272215, + "balance_loss_clip": 0.07548416, + "balance_loss_mlp": 0.02033606, + "epoch": 0.029640763565308884, + "flos": 18730746172800.0, + "grad_norm": 23.908228280746865, + "language_loss": 1.05753922, + "learning_rate": 3.992211370544093e-06, + "loss": 1.16938591, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 13.625, + "router_z_loss_mlp": 2.3828125, + "step": 493, + "time_per_iteration": 2.6953020095825195 + }, + { + "auxiliary_loss_clip": 0.08946873, + "auxiliary_loss_mlp": 0.02207101, + "balance_loss_clip": 0.07561117, + "balance_loss_mlp": 0.01985753, + "epoch": 0.029700886817976852, + "flos": 20601652936320.0, + "grad_norm": 59.82783301164341, + "language_loss": 1.05118871, + "learning_rate": 3.99351603600268e-06, + "loss": 1.16272855, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 13.8515625, + "router_z_loss_mlp": 2.21386719, + "step": 494, + "time_per_iteration": 2.6631805896759033 + }, + { + "auxiliary_loss_clip": 0.08915924, + "auxiliary_loss_mlp": 0.02239191, + "balance_loss_clip": 0.07543083, + "balance_loss_mlp": 0.0199753, + "epoch": 0.02976101007064482, + "flos": 22243423910400.0, + "grad_norm": 26.318413946561634, + "language_loss": 1.04354262, + "learning_rate": 3.994818063106668e-06, + "loss": 1.15509367, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 13.7265625, + "router_z_loss_mlp": 2.4140625, + "step": 495, + "time_per_iteration": 4.107235908508301 + }, + { + "auxiliary_loss_clip": 0.08888054, + "auxiliary_loss_mlp": 0.02273613, + "balance_loss_clip": 0.07541628, + "balance_loss_mlp": 0.02036148, + "epoch": 0.029821133323312793, + "flos": 23739439507200.0, + "grad_norm": 14.252476342508674, + "language_loss": 0.79374158, + "learning_rate": 3.99611746250533e-06, + "loss": 0.9053582, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 13.4609375, + "router_z_loss_mlp": 2.37304688, + "step": 496, + "time_per_iteration": 2.757887363433838 + }, + { + "auxiliary_loss_clip": 0.08908898, + "auxiliary_loss_mlp": 0.0225322, + "balance_loss_clip": 0.07561936, + "balance_loss_mlp": 0.02023385, + "epoch": 0.02988125657598076, + "flos": 22426131738240.0, + "grad_norm": 48.93797296748546, + "language_loss": 1.05435932, + "learning_rate": 3.997414244783595e-06, + "loss": 1.16598058, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.296875, + "step": 497, + "time_per_iteration": 2.698960781097412 + }, + { + "auxiliary_loss_clip": 0.08959304, + "auxiliary_loss_mlp": 0.0221962, + "balance_loss_clip": 0.07595803, + "balance_loss_mlp": 0.01998176, + "epoch": 0.02994137982864873, + "flos": 13850267984640.0, + "grad_norm": 57.28331954677374, + "language_loss": 1.09360301, + "learning_rate": 3.998708420462557e-06, + "loss": 1.20539236, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 13.640625, + "router_z_loss_mlp": 2.21289062, + "step": 498, + "time_per_iteration": 2.699470281600952 + }, + { + "auxiliary_loss_clip": 0.08942117, + "auxiliary_loss_mlp": 0.02291662, + "balance_loss_clip": 0.07576901, + "balance_loss_mlp": 0.02053434, + "epoch": 0.0300015030813167, + "flos": 23914055416320.0, + "grad_norm": 30.471494656970325, + "language_loss": 1.05517888, + "learning_rate": 4e-06, + "loss": 1.16751671, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 13.65625, + "router_z_loss_mlp": 2.37890625, + "step": 499, + "time_per_iteration": 2.6825146675109863 + }, + { + "auxiliary_loss_clip": 0.08909643, + "auxiliary_loss_mlp": 0.02277073, + "balance_loss_clip": 0.07578171, + "balance_loss_mlp": 0.02052769, + "epoch": 0.030061626333984667, + "flos": 22023134726400.0, + "grad_norm": 15.715356901732157, + "language_loss": 0.96281993, + "learning_rate": 3.9999999620799e-06, + "loss": 1.07468712, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 13.3046875, + "router_z_loss_mlp": 2.24414062, + "step": 500, + "time_per_iteration": 2.7350914478302 + }, + { + "auxiliary_loss_clip": 0.08887713, + "auxiliary_loss_mlp": 0.02297984, + "balance_loss_clip": 0.07557485, + "balance_loss_mlp": 0.02069103, + "epoch": 0.03012174958665264, + "flos": 23046483041280.0, + "grad_norm": 15.325261953037035, + "language_loss": 1.09255648, + "learning_rate": 3.9999998483196e-06, + "loss": 1.20441341, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 13.296875, + "router_z_loss_mlp": 2.2890625, + "step": 501, + "time_per_iteration": 2.6515860557556152 + }, + { + "auxiliary_loss_clip": 0.0895866, + "auxiliary_loss_mlp": 0.02279337, + "balance_loss_clip": 0.07618586, + "balance_loss_mlp": 0.02058275, + "epoch": 0.030181872839320608, + "flos": 18959294983680.0, + "grad_norm": 442.08874740717613, + "language_loss": 1.0616231, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.17400312, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 13.40625, + "router_z_loss_mlp": 2.21289062, + "step": 502, + "time_per_iteration": 2.6650314331054688 + }, + { + "auxiliary_loss_clip": 0.08926746, + "auxiliary_loss_mlp": 0.02313635, + "balance_loss_clip": 0.07593986, + "balance_loss_mlp": 0.02080176, + "epoch": 0.030241996091988577, + "flos": 16733747813760.0, + "grad_norm": 40.11923719359636, + "language_loss": 1.00487685, + "learning_rate": 3.999999393278425e-06, + "loss": 1.11728072, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 13.3125, + "router_z_loss_mlp": 2.3359375, + "step": 503, + "time_per_iteration": 2.6301283836364746 + }, + { + "auxiliary_loss_clip": 0.08950677, + "auxiliary_loss_mlp": 0.02299167, + "balance_loss_clip": 0.07607222, + "balance_loss_mlp": 0.02070094, + "epoch": 0.030302119344656545, + "flos": 28628806227840.0, + "grad_norm": 16.096297116013613, + "language_loss": 1.02800179, + "learning_rate": 3.999999051997567e-06, + "loss": 1.14050031, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 13.4375, + "router_z_loss_mlp": 2.28808594, + "step": 504, + "time_per_iteration": 2.7234466075897217 + }, + { + "auxiliary_loss_clip": 0.08954775, + "auxiliary_loss_mlp": 0.022733, + "balance_loss_clip": 0.07610564, + "balance_loss_mlp": 0.02054241, + "epoch": 0.030362242597324514, + "flos": 15674788713600.0, + "grad_norm": 53.80634610199122, + "language_loss": 0.90572113, + "learning_rate": 3.9999986348765425e-06, + "loss": 1.01800191, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 13.453125, + "router_z_loss_mlp": 2.19042969, + "step": 505, + "time_per_iteration": 2.6355271339416504 + }, + { + "auxiliary_loss_clip": 0.07202613, + "auxiliary_loss_mlp": 0.01385887, + "balance_loss_clip": 0.06702607, + "balance_loss_mlp": 0.01312073, + "epoch": 0.030422365849992486, + "flos": 72149173528320.0, + "grad_norm": 1.0312424009228802, + "language_loss": 0.55707914, + "learning_rate": 3.999998141915371e-06, + "loss": 0.64296412, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.73779297, + "step": 506, + "time_per_iteration": 3.4425716400146484 + }, + { + "auxiliary_loss_clip": 0.08947556, + "auxiliary_loss_mlp": 0.0229462, + "balance_loss_clip": 0.07588895, + "balance_loss_mlp": 0.02080234, + "epoch": 0.030482489102660455, + "flos": 19433974763520.0, + "grad_norm": 15.732874937996321, + "language_loss": 0.96318799, + "learning_rate": 3.999997573114069e-06, + "loss": 1.07560968, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 13.5703125, + "router_z_loss_mlp": 2.14648438, + "step": 507, + "time_per_iteration": 2.6885857582092285 + }, + { + "auxiliary_loss_clip": 0.08928548, + "auxiliary_loss_mlp": 0.02259048, + "balance_loss_clip": 0.07588597, + "balance_loss_mlp": 0.02042945, + "epoch": 0.030542612355328423, + "flos": 20382034584960.0, + "grad_norm": 22.351883402694675, + "language_loss": 1.05944586, + "learning_rate": 3.999996928472659e-06, + "loss": 1.17132187, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 13.3984375, + "router_z_loss_mlp": 2.15722656, + "step": 508, + "time_per_iteration": 2.659903049468994 + }, + { + "auxiliary_loss_clip": 0.08911724, + "auxiliary_loss_mlp": 0.02284852, + "balance_loss_clip": 0.07589735, + "balance_loss_mlp": 0.02067796, + "epoch": 0.030602735607996392, + "flos": 34685809194240.0, + "grad_norm": 36.57726962187856, + "language_loss": 0.84476292, + "learning_rate": 3.999996207991165e-06, + "loss": 0.95672864, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 13.1953125, + "router_z_loss_mlp": 2.17089844, + "step": 509, + "time_per_iteration": 2.8194127082824707 + }, + { + "auxiliary_loss_clip": 0.08892205, + "auxiliary_loss_mlp": 0.02281797, + "balance_loss_clip": 0.07575735, + "balance_loss_mlp": 0.02065503, + "epoch": 0.03066285886066436, + "flos": 23665283043840.0, + "grad_norm": 17.47434487382061, + "language_loss": 0.97325271, + "learning_rate": 3.999995411669614e-06, + "loss": 1.08499277, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 13.15625, + "router_z_loss_mlp": 2.16210938, + "step": 510, + "time_per_iteration": 2.6817235946655273 + }, + { + "auxiliary_loss_clip": 0.08892487, + "auxiliary_loss_mlp": 0.02360194, + "balance_loss_clip": 0.07583004, + "balance_loss_mlp": 0.02123492, + "epoch": 0.030722982113332332, + "flos": 23009656371840.0, + "grad_norm": 18.905046526469672, + "language_loss": 1.01792526, + "learning_rate": 3.999994539508036e-06, + "loss": 1.13045216, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 13.109375, + "router_z_loss_mlp": 2.36328125, + "step": 511, + "time_per_iteration": 2.7218635082244873 + }, + { + "auxiliary_loss_clip": 0.08893925, + "auxiliary_loss_mlp": 0.02289988, + "balance_loss_clip": 0.07569309, + "balance_loss_mlp": 0.02083041, + "epoch": 0.0307831053660003, + "flos": 24757253452800.0, + "grad_norm": 19.668331583944035, + "language_loss": 0.98058987, + "learning_rate": 3.9999935915064655e-06, + "loss": 1.09242892, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 13.25, + "router_z_loss_mlp": 2.07226562, + "step": 512, + "time_per_iteration": 2.6965620517730713 + }, + { + "auxiliary_loss_clip": 0.08852743, + "auxiliary_loss_mlp": 0.02379446, + "balance_loss_clip": 0.0755362, + "balance_loss_mlp": 0.02156858, + "epoch": 0.03084322861866827, + "flos": 26148113775360.0, + "grad_norm": 13.468181826610785, + "language_loss": 1.01916862, + "learning_rate": 3.9999925676649374e-06, + "loss": 1.13149047, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 12.984375, + "router_z_loss_mlp": 2.22460938, + "step": 513, + "time_per_iteration": 2.711587429046631 + }, + { + "auxiliary_loss_clip": 0.08845583, + "auxiliary_loss_mlp": 0.02430958, + "balance_loss_clip": 0.07545915, + "balance_loss_mlp": 0.02204555, + "epoch": 0.03090335187133624, + "flos": 18777383769600.0, + "grad_norm": 6.55607776583441, + "language_loss": 0.95138013, + "learning_rate": 3.999991467983491e-06, + "loss": 1.06414557, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 13.0, + "router_z_loss_mlp": 2.26269531, + "step": 514, + "time_per_iteration": 2.6500775814056396 + }, + { + "auxiliary_loss_clip": 0.08815307, + "auxiliary_loss_mlp": 0.02407072, + "balance_loss_clip": 0.07539771, + "balance_loss_mlp": 0.02187917, + "epoch": 0.030963475124004207, + "flos": 23228603890560.0, + "grad_norm": 18.204719930438795, + "language_loss": 0.97247916, + "learning_rate": 3.999990292462167e-06, + "loss": 1.08470297, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 12.7578125, + "router_z_loss_mlp": 2.19335938, + "step": 515, + "time_per_iteration": 2.7167558670043945 + }, + { + "auxiliary_loss_clip": 0.08806405, + "auxiliary_loss_mlp": 0.02437712, + "balance_loss_clip": 0.0752582, + "balance_loss_mlp": 0.02208258, + "epoch": 0.03102359837667218, + "flos": 42535998662400.0, + "grad_norm": 5.904658856542002, + "language_loss": 1.00314569, + "learning_rate": 3.999989041101011e-06, + "loss": 1.11558676, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.29492188, + "step": 516, + "time_per_iteration": 2.932173013687134 + }, + { + "auxiliary_loss_clip": 0.08796877, + "auxiliary_loss_mlp": 0.02455233, + "balance_loss_clip": 0.07514809, + "balance_loss_mlp": 0.02220629, + "epoch": 0.031083721629340148, + "flos": 21183039290880.0, + "grad_norm": 45.02393900109363, + "language_loss": 0.9180311, + "learning_rate": 3.999987713900071e-06, + "loss": 1.03055215, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 2.34375, + "step": 517, + "time_per_iteration": 2.666154623031616 + }, + { + "auxiliary_loss_clip": 0.08820206, + "auxiliary_loss_mlp": 0.02414127, + "balance_loss_clip": 0.07551458, + "balance_loss_mlp": 0.02194306, + "epoch": 0.031143844882008116, + "flos": 29723963091840.0, + "grad_norm": 7.285252117980509, + "language_loss": 0.99479294, + "learning_rate": 3.999986310859396e-06, + "loss": 1.10713625, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.19824219, + "step": 518, + "time_per_iteration": 2.752505302429199 + }, + { + "auxiliary_loss_clip": 0.08830461, + "auxiliary_loss_mlp": 0.024645, + "balance_loss_clip": 0.07556459, + "balance_loss_mlp": 0.02246586, + "epoch": 0.031203968134676085, + "flos": 23119172058240.0, + "grad_norm": 20.736865355911096, + "language_loss": 1.01917171, + "learning_rate": 3.999984831979039e-06, + "loss": 1.13212132, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 12.734375, + "router_z_loss_mlp": 2.1796875, + "step": 519, + "time_per_iteration": 2.6659457683563232 + }, + { + "auxiliary_loss_clip": 0.08817208, + "auxiliary_loss_mlp": 0.02465606, + "balance_loss_clip": 0.07545176, + "balance_loss_mlp": 0.02241778, + "epoch": 0.03126409138734405, + "flos": 20959815214080.0, + "grad_norm": 7.142122271726701, + "language_loss": 1.00803113, + "learning_rate": 3.999983277259057e-06, + "loss": 1.12085938, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 2.23632812, + "step": 520, + "time_per_iteration": 2.7612173557281494 + }, + { + "auxiliary_loss_clip": 0.08873951, + "auxiliary_loss_mlp": 0.02427922, + "balance_loss_clip": 0.07591425, + "balance_loss_mlp": 0.02219163, + "epoch": 0.031324214640012026, + "flos": 21656083916160.0, + "grad_norm": 5386.394179139514, + "language_loss": 1.03191018, + "learning_rate": 3.999981646699509e-06, + "loss": 1.14492893, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 12.8203125, + "router_z_loss_mlp": 2.08886719, + "step": 521, + "time_per_iteration": 2.6934170722961426 + }, + { + "auxiliary_loss_clip": 0.08889641, + "auxiliary_loss_mlp": 0.02359363, + "balance_loss_clip": 0.07604645, + "balance_loss_mlp": 0.02163669, + "epoch": 0.03138433789267999, + "flos": 23448180314880.0, + "grad_norm": 8.073235529869596, + "language_loss": 0.83005708, + "learning_rate": 3.999979940300456e-06, + "loss": 0.94254714, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.95800781, + "step": 522, + "time_per_iteration": 2.8722758293151855 + }, + { + "auxiliary_loss_clip": 0.08903908, + "auxiliary_loss_mlp": 0.02254118, + "balance_loss_clip": 0.07622182, + "balance_loss_mlp": 0.0208465, + "epoch": 0.03144446114534796, + "flos": 18986939631360.0, + "grad_norm": 12.411483225368043, + "language_loss": 1.05680871, + "learning_rate": 3.999978158061963e-06, + "loss": 1.16838908, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 12.8046875, + "router_z_loss_mlp": 1.6953125, + "step": 523, + "time_per_iteration": 2.650547742843628 + }, + { + "auxiliary_loss_clip": 0.08934012, + "auxiliary_loss_mlp": 0.02230434, + "balance_loss_clip": 0.07644011, + "balance_loss_mlp": 0.0206087, + "epoch": 0.031504584398015935, + "flos": 22644240716160.0, + "grad_norm": 13.96543726868128, + "language_loss": 1.08792841, + "learning_rate": 3.999976299984099e-06, + "loss": 1.1995728, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 1.69628906, + "step": 524, + "time_per_iteration": 2.7135303020477295 + }, + { + "auxiliary_loss_clip": 0.08891568, + "auxiliary_loss_mlp": 0.02091454, + "balance_loss_clip": 0.07603844, + "balance_loss_mlp": 0.0193486, + "epoch": 0.0315647076506839, + "flos": 25303364438400.0, + "grad_norm": 13.325751395918596, + "language_loss": 0.96287918, + "learning_rate": 3.999974366066933e-06, + "loss": 1.07270944, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 12.875, + "router_z_loss_mlp": 1.56542969, + "step": 525, + "time_per_iteration": 2.7008469104766846 + }, + { + "auxiliary_loss_clip": 0.08895689, + "auxiliary_loss_mlp": 0.02060743, + "balance_loss_clip": 0.07611247, + "balance_loss_mlp": 0.01902052, + "epoch": 0.03162483090335187, + "flos": 16988515752960.0, + "grad_norm": 10.865036443132793, + "language_loss": 0.93799376, + "learning_rate": 3.999972356310538e-06, + "loss": 1.04755807, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 12.84375, + "router_z_loss_mlp": 1.58789062, + "step": 526, + "time_per_iteration": 2.6346511840820312 + }, + { + "auxiliary_loss_clip": 0.08917748, + "auxiliary_loss_mlp": 0.01935945, + "balance_loss_clip": 0.07596096, + "balance_loss_mlp": 0.01773629, + "epoch": 0.03168495415601984, + "flos": 18740515173120.0, + "grad_norm": 57.85895101220995, + "language_loss": 0.99752951, + "learning_rate": 3.999970270714991e-06, + "loss": 1.10606647, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 13.2109375, + "router_z_loss_mlp": 1.62402344, + "step": 527, + "time_per_iteration": 2.679004669189453 + }, + { + "auxiliary_loss_clip": 0.08855803, + "auxiliary_loss_mlp": 0.01834989, + "balance_loss_clip": 0.07585346, + "balance_loss_mlp": 0.01673914, + "epoch": 0.03174507740868781, + "flos": 21221207625600.0, + "grad_norm": 46.02909291045389, + "language_loss": 1.11322296, + "learning_rate": 3.999968109280371e-06, + "loss": 1.22013092, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 12.703125, + "router_z_loss_mlp": 1.61035156, + "step": 528, + "time_per_iteration": 2.6590561866760254 + }, + { + "auxiliary_loss_clip": 0.08896849, + "auxiliary_loss_mlp": 0.01846134, + "balance_loss_clip": 0.07587088, + "balance_loss_mlp": 0.01668655, + "epoch": 0.03180520066135578, + "flos": 24794122049280.0, + "grad_norm": 60.37354361545739, + "language_loss": 0.97275496, + "learning_rate": 3.99996587200676e-06, + "loss": 1.08018494, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 13.09375, + "router_z_loss_mlp": 1.77539062, + "step": 529, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.08883977, + "auxiliary_loss_mlp": 0.01771414, + "balance_loss_clip": 0.07582102, + "balance_loss_mlp": 0.01579535, + "epoch": 0.03186532391402375, + "flos": 24871339186560.0, + "grad_norm": 10627.611218983826, + "language_loss": 1.18170238, + "learning_rate": 3.999963558894243e-06, + "loss": 1.28825641, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 13.015625, + "router_z_loss_mlp": 1.91894531, + "step": 530, + "time_per_iteration": 2.7020938396453857 + }, + { + "auxiliary_loss_clip": 0.08833256, + "auxiliary_loss_mlp": 0.01774458, + "balance_loss_clip": 0.07546531, + "balance_loss_mlp": 0.01588683, + "epoch": 0.03192544716669172, + "flos": 21221417260800.0, + "grad_norm": 74.92861353079512, + "language_loss": 0.92192125, + "learning_rate": 3.999961169942907e-06, + "loss": 1.02799833, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 12.8671875, + "router_z_loss_mlp": 1.85644531, + "step": 531, + "time_per_iteration": 5.536854028701782 + }, + { + "auxiliary_loss_clip": 0.08819988, + "auxiliary_loss_mlp": 0.0179185, + "balance_loss_clip": 0.07536054, + "balance_loss_mlp": 0.01611224, + "epoch": 0.03198557041935969, + "flos": 24360168153600.0, + "grad_norm": 15.362611414198588, + "language_loss": 1.04843593, + "learning_rate": 3.999958705152843e-06, + "loss": 1.15455437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 12.8359375, + "router_z_loss_mlp": 1.8046875, + "step": 532, + "time_per_iteration": 4.078269958496094 + }, + { + "auxiliary_loss_clip": 0.07593378, + "auxiliary_loss_mlp": 0.01964501, + "balance_loss_clip": 0.07000267, + "balance_loss_mlp": 0.01595619, + "epoch": 0.032045693672027656, + "flos": 61847235993600.0, + "grad_norm": 0.8955673428440366, + "language_loss": 0.58032346, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.67590225, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 5.9375, + "router_z_loss_mlp": 3.68554688, + "step": 533, + "time_per_iteration": 3.319361925125122 + }, + { + "auxiliary_loss_clip": 0.08788651, + "auxiliary_loss_mlp": 0.01742728, + "balance_loss_clip": 0.07528964, + "balance_loss_mlp": 0.01567061, + "epoch": 0.03210581692469563, + "flos": 28408475116800.0, + "grad_norm": 18.42557842883857, + "language_loss": 0.99417937, + "learning_rate": 3.999953548056907e-06, + "loss": 1.09949315, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 12.5859375, + "router_z_loss_mlp": 1.75585938, + "step": 534, + "time_per_iteration": 4.265074729919434 + }, + { + "auxiliary_loss_clip": 0.08770919, + "auxiliary_loss_mlp": 0.0174947, + "balance_loss_clip": 0.07504185, + "balance_loss_mlp": 0.01577809, + "epoch": 0.03216594017736359, + "flos": 24724661414400.0, + "grad_norm": 508.9639434919875, + "language_loss": 0.94137996, + "learning_rate": 3.999950855751232e-06, + "loss": 1.04658389, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 12.671875, + "router_z_loss_mlp": 1.71777344, + "step": 535, + "time_per_iteration": 2.7245981693267822 + }, + { + "auxiliary_loss_clip": 0.08758718, + "auxiliary_loss_mlp": 0.01725335, + "balance_loss_clip": 0.07518992, + "balance_loss_mlp": 0.01554437, + "epoch": 0.032226063430031565, + "flos": 31183445508480.0, + "grad_norm": 22.532643943929422, + "language_loss": 0.94802475, + "learning_rate": 3.999948087607219e-06, + "loss": 1.05286527, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 12.390625, + "router_z_loss_mlp": 1.70996094, + "step": 536, + "time_per_iteration": 2.7583792209625244 + }, + { + "auxiliary_loss_clip": 0.08705089, + "auxiliary_loss_mlp": 0.01729852, + "balance_loss_clip": 0.07491484, + "balance_loss_mlp": 0.01569253, + "epoch": 0.03228618668269954, + "flos": 32206584188160.0, + "grad_norm": 18.146665662297185, + "language_loss": 0.83908743, + "learning_rate": 3.999945243624975e-06, + "loss": 0.94343686, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 12.1484375, + "router_z_loss_mlp": 1.60546875, + "step": 537, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.08731261, + "auxiliary_loss_mlp": 0.01758368, + "balance_loss_clip": 0.07496089, + "balance_loss_mlp": 0.0159672, + "epoch": 0.0323463099353675, + "flos": 22676036140800.0, + "grad_norm": 12.39933899749453, + "language_loss": 0.95942801, + "learning_rate": 3.999942323804607e-06, + "loss": 1.06432438, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 12.3515625, + "router_z_loss_mlp": 1.6171875, + "step": 538, + "time_per_iteration": 2.7392029762268066 + }, + { + "auxiliary_loss_clip": 0.0875225, + "auxiliary_loss_mlp": 0.01750456, + "balance_loss_clip": 0.07507962, + "balance_loss_mlp": 0.01584802, + "epoch": 0.032406433188035474, + "flos": 26912207957760.0, + "grad_norm": 95.24255955505957, + "language_loss": 0.90228236, + "learning_rate": 3.999939328146225e-06, + "loss": 1.00730944, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 12.4453125, + "router_z_loss_mlp": 1.65625, + "step": 539, + "time_per_iteration": 2.760545253753662 + }, + { + "auxiliary_loss_clip": 0.08700242, + "auxiliary_loss_mlp": 0.01788145, + "balance_loss_clip": 0.07481987, + "balance_loss_mlp": 0.0161162, + "epoch": 0.03246655644070344, + "flos": 31511992567680.0, + "grad_norm": 15.31403595077071, + "language_loss": 0.89398444, + "learning_rate": 3.999936256649943e-06, + "loss": 0.99886829, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 12.1875, + "router_z_loss_mlp": 1.76757812, + "step": 540, + "time_per_iteration": 2.791525363922119 + }, + { + "auxiliary_loss_clip": 0.08740143, + "auxiliary_loss_mlp": 0.01834392, + "balance_loss_clip": 0.07499444, + "balance_loss_mlp": 0.01643276, + "epoch": 0.03252667969337141, + "flos": 23224453113600.0, + "grad_norm": 73.47244628512628, + "language_loss": 0.99572086, + "learning_rate": 3.999933109315878e-06, + "loss": 1.10146618, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 12.40625, + "router_z_loss_mlp": 1.90917969, + "step": 541, + "time_per_iteration": 2.698315143585205 + }, + { + "auxiliary_loss_clip": 0.08765414, + "auxiliary_loss_mlp": 0.01821723, + "balance_loss_clip": 0.07523992, + "balance_loss_mlp": 0.01612201, + "epoch": 0.032586802946039384, + "flos": 14762800874880.0, + "grad_norm": 49.77821697975532, + "language_loss": 1.00654817, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.11241961, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 12.4296875, + "router_z_loss_mlp": 2.09667969, + "step": 542, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.08722232, + "auxiliary_loss_mlp": 0.01879557, + "balance_loss_clip": 0.07465587, + "balance_loss_mlp": 0.01644953, + "epoch": 0.03264692619870735, + "flos": 24287688771840.0, + "grad_norm": 65.19472082730613, + "language_loss": 0.83699101, + "learning_rate": 3.999926587134879e-06, + "loss": 0.9430089, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.34375, + "step": 543, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.0878472, + "auxiliary_loss_mlp": 0.01882603, + "balance_loss_clip": 0.07507792, + "balance_loss_mlp": 0.01631214, + "epoch": 0.03270704945137532, + "flos": 22899763342080.0, + "grad_norm": 1912.553873416959, + "language_loss": 1.09316349, + "learning_rate": 3.999923212288192e-06, + "loss": 1.19983673, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.51367188, + "step": 544, + "time_per_iteration": 2.663267135620117 + }, + { + "auxiliary_loss_clip": 0.0881625, + "auxiliary_loss_mlp": 0.01879222, + "balance_loss_clip": 0.07490219, + "balance_loss_mlp": 0.01537997, + "epoch": 0.032767172704043286, + "flos": 18046887874560.0, + "grad_norm": 1976.6790975556307, + "language_loss": 0.85651809, + "learning_rate": 3.999919761604216e-06, + "loss": 0.96347284, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 13.265625, + "router_z_loss_mlp": 3.41210938, + "step": 545, + "time_per_iteration": 2.6566007137298584 + }, + { + "auxiliary_loss_clip": 0.08881226, + "auxiliary_loss_mlp": 0.01919651, + "balance_loss_clip": 0.07538594, + "balance_loss_mlp": 0.01591969, + "epoch": 0.03282729595671126, + "flos": 22535353935360.0, + "grad_norm": 36635.99630864103, + "language_loss": 1.19350576, + "learning_rate": 3.999916235083083e-06, + "loss": 1.30151451, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 13.421875, + "router_z_loss_mlp": 3.27539062, + "step": 546, + "time_per_iteration": 2.6508443355560303 + }, + { + "auxiliary_loss_clip": 0.0885489, + "auxiliary_loss_mlp": 0.01969573, + "balance_loss_clip": 0.07525921, + "balance_loss_mlp": 0.01650092, + "epoch": 0.03288741920937923, + "flos": 20416555267200.0, + "grad_norm": 175.83782863941582, + "language_loss": 1.0484463, + "learning_rate": 3.999912632724925e-06, + "loss": 1.15669084, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 13.28125, + "router_z_loss_mlp": 3.1953125, + "step": 547, + "time_per_iteration": 2.709317445755005 + }, + { + "auxiliary_loss_clip": 0.08846241, + "auxiliary_loss_mlp": 0.02054837, + "balance_loss_clip": 0.07521404, + "balance_loss_mlp": 0.01724484, + "epoch": 0.032947542462047195, + "flos": 20784402691200.0, + "grad_norm": 1231.4634556281662, + "language_loss": 0.99917918, + "learning_rate": 3.999908954529881e-06, + "loss": 1.10818994, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 13.2578125, + "router_z_loss_mlp": 3.30664062, + "step": 548, + "time_per_iteration": 2.761152744293213 + }, + { + "auxiliary_loss_clip": 0.08837526, + "auxiliary_loss_mlp": 0.02099407, + "balance_loss_clip": 0.07500955, + "balance_loss_mlp": 0.01773059, + "epoch": 0.03300766571471517, + "flos": 19907354805120.0, + "grad_norm": 538.4476306780408, + "language_loss": 0.89559388, + "learning_rate": 3.999905200498087e-06, + "loss": 1.00496316, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 13.3671875, + "router_z_loss_mlp": 3.26367188, + "step": 549, + "time_per_iteration": 2.7063941955566406 + }, + { + "auxiliary_loss_clip": 0.08802217, + "auxiliary_loss_mlp": 0.02104246, + "balance_loss_clip": 0.07490957, + "balance_loss_mlp": 0.0178324, + "epoch": 0.03306778896738313, + "flos": 17973569952000.0, + "grad_norm": 95.24031464069257, + "language_loss": 1.00179911, + "learning_rate": 3.999901370629689e-06, + "loss": 1.1108638, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 13.125, + "router_z_loss_mlp": 3.20703125, + "step": 550, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.08789266, + "auxiliary_loss_mlp": 0.02134598, + "balance_loss_clip": 0.07500902, + "balance_loss_mlp": 0.01818551, + "epoch": 0.033127912220051105, + "flos": 21659899276800.0, + "grad_norm": 52.30662645055097, + "language_loss": 0.93777549, + "learning_rate": 3.99989746492483e-06, + "loss": 1.04701412, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 12.8984375, + "router_z_loss_mlp": 3.16015625, + "step": 551, + "time_per_iteration": 2.7061314582824707 + }, + { + "auxiliary_loss_clip": 0.08738074, + "auxiliary_loss_mlp": 0.02134365, + "balance_loss_clip": 0.07474738, + "balance_loss_mlp": 0.01835484, + "epoch": 0.03318803547271908, + "flos": 30195875687040.0, + "grad_norm": 81.64424293941155, + "language_loss": 1.06586599, + "learning_rate": 3.999893483383658e-06, + "loss": 1.17459035, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 12.6484375, + "router_z_loss_mlp": 2.98828125, + "step": 552, + "time_per_iteration": 2.7557857036590576 + }, + { + "auxiliary_loss_clip": 0.08738689, + "auxiliary_loss_mlp": 0.02132193, + "balance_loss_clip": 0.07474653, + "balance_loss_mlp": 0.01841513, + "epoch": 0.03324815872538704, + "flos": 20382286147200.0, + "grad_norm": 103.46520912531122, + "language_loss": 1.07230687, + "learning_rate": 3.999889426006326e-06, + "loss": 1.18101549, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 12.6328125, + "router_z_loss_mlp": 2.90625, + "step": 553, + "time_per_iteration": 2.6690380573272705 + }, + { + "auxiliary_loss_clip": 0.0876793, + "auxiliary_loss_mlp": 0.02203825, + "balance_loss_clip": 0.07493228, + "balance_loss_mlp": 0.01878431, + "epoch": 0.033308281978055014, + "flos": 24500766504960.0, + "grad_norm": 2577.3704160991106, + "language_loss": 0.91311669, + "learning_rate": 3.999885292792986e-06, + "loss": 1.0228343, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 12.75, + "router_z_loss_mlp": 3.25390625, + "step": 554, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.08781252, + "auxiliary_loss_mlp": 0.02161472, + "balance_loss_clip": 0.0750941, + "balance_loss_mlp": 0.01854961, + "epoch": 0.03336840523072298, + "flos": 23406406254720.0, + "grad_norm": 23.66967902789698, + "language_loss": 0.92365468, + "learning_rate": 3.999881083743795e-06, + "loss": 1.03308201, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 12.7265625, + "router_z_loss_mlp": 3.06445312, + "step": 555, + "time_per_iteration": 2.7009239196777344 + }, + { + "auxiliary_loss_clip": 0.0871176, + "auxiliary_loss_mlp": 0.02191896, + "balance_loss_clip": 0.0746032, + "balance_loss_mlp": 0.01904268, + "epoch": 0.03342852848339095, + "flos": 30557685617280.0, + "grad_norm": 32.47411862244808, + "language_loss": 1.03816569, + "learning_rate": 3.999876798858914e-06, + "loss": 1.14720225, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 12.5234375, + "router_z_loss_mlp": 2.875, + "step": 556, + "time_per_iteration": 2.7751269340515137 + }, + { + "auxiliary_loss_clip": 0.08728363, + "auxiliary_loss_mlp": 0.02208938, + "balance_loss_clip": 0.07497713, + "balance_loss_mlp": 0.01914825, + "epoch": 0.03348865173605892, + "flos": 22899931050240.0, + "grad_norm": 26.350622314910414, + "language_loss": 0.97158062, + "learning_rate": 3.999872438138503e-06, + "loss": 1.0809536, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 2.93945312, + "step": 557, + "time_per_iteration": 2.6803956031799316 + }, + { + "auxiliary_loss_clip": 0.08708371, + "auxiliary_loss_mlp": 0.02154386, + "balance_loss_clip": 0.0748485, + "balance_loss_mlp": 0.01905477, + "epoch": 0.03354877498872689, + "flos": 17681807635200.0, + "grad_norm": 18.772470179547817, + "language_loss": 1.10132766, + "learning_rate": 3.999868001582729e-06, + "loss": 1.20995522, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 12.2265625, + "router_z_loss_mlp": 2.49023438, + "step": 558, + "time_per_iteration": 2.650348663330078 + }, + { + "auxiliary_loss_clip": 0.08667068, + "auxiliary_loss_mlp": 0.02131925, + "balance_loss_clip": 0.07472065, + "balance_loss_mlp": 0.01914487, + "epoch": 0.03360889824139486, + "flos": 21659438079360.0, + "grad_norm": 17.45552884003481, + "language_loss": 0.92322779, + "learning_rate": 3.99986348919176e-06, + "loss": 1.03121769, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.17578125, + "step": 559, + "time_per_iteration": 2.69866681098938 + }, + { + "auxiliary_loss_clip": 0.08715945, + "auxiliary_loss_mlp": 0.02064835, + "balance_loss_clip": 0.07521564, + "balance_loss_mlp": 0.01861607, + "epoch": 0.033669021494062826, + "flos": 21801671585280.0, + "grad_norm": 8.293279297555102, + "language_loss": 0.96911502, + "learning_rate": 3.9998589009657675e-06, + "loss": 1.07692266, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 11.9453125, + "router_z_loss_mlp": 2.03417969, + "step": 560, + "time_per_iteration": 2.7140135765075684 + }, + { + "auxiliary_loss_clip": 0.08642244, + "auxiliary_loss_mlp": 0.01977364, + "balance_loss_clip": 0.07480196, + "balance_loss_mlp": 0.01790062, + "epoch": 0.0337291447467308, + "flos": 21871761125760.0, + "grad_norm": 36.168101096947126, + "language_loss": 0.91244531, + "learning_rate": 3.999854236904925e-06, + "loss": 1.01864135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 11.640625, + "router_z_loss_mlp": 1.875, + "step": 561, + "time_per_iteration": 2.6863293647766113 + }, + { + "auxiliary_loss_clip": 0.08645087, + "auxiliary_loss_mlp": 0.01996294, + "balance_loss_clip": 0.07495341, + "balance_loss_mlp": 0.01809374, + "epoch": 0.03378926799939877, + "flos": 24253251943680.0, + "grad_norm": 9.210066016696686, + "language_loss": 0.90415317, + "learning_rate": 3.999849497009409e-06, + "loss": 1.01056707, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 11.4921875, + "router_z_loss_mlp": 1.86914062, + "step": 562, + "time_per_iteration": 2.724127769470215 + }, + { + "auxiliary_loss_clip": 0.08630846, + "auxiliary_loss_mlp": 0.01896325, + "balance_loss_clip": 0.07475269, + "balance_loss_mlp": 0.0172867, + "epoch": 0.033849391252066735, + "flos": 16513290921600.0, + "grad_norm": 8.70795014369516, + "language_loss": 0.93251538, + "learning_rate": 3.999844681279401e-06, + "loss": 1.03778696, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.67773438, + "step": 563, + "time_per_iteration": 2.653869867324829 + }, + { + "auxiliary_loss_clip": 0.08601731, + "auxiliary_loss_mlp": 0.0185707, + "balance_loss_clip": 0.07466102, + "balance_loss_mlp": 0.01686648, + "epoch": 0.03390951450473471, + "flos": 15674746786560.0, + "grad_norm": 12.715008158349837, + "language_loss": 1.03361213, + "learning_rate": 3.99983978971508e-06, + "loss": 1.13820004, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 1.70507812, + "step": 564, + "time_per_iteration": 2.6272659301757812 + }, + { + "auxiliary_loss_clip": 0.08544251, + "auxiliary_loss_mlp": 0.01761406, + "balance_loss_clip": 0.07418631, + "balance_loss_mlp": 0.01609581, + "epoch": 0.03396963775740267, + "flos": 22681444728960.0, + "grad_norm": 17.830043780961535, + "language_loss": 1.06299067, + "learning_rate": 3.999834822316635e-06, + "loss": 1.1660471, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 11.2578125, + "router_z_loss_mlp": 1.51855469, + "step": 565, + "time_per_iteration": 2.6662397384643555 + }, + { + "auxiliary_loss_clip": 0.07533604, + "auxiliary_loss_mlp": 0.01361189, + "balance_loss_clip": 0.07012594, + "balance_loss_mlp": 0.01291713, + "epoch": 0.034029761010070644, + "flos": 64414872656640.0, + "grad_norm": 1.941550580035849, + "language_loss": 0.56352836, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.65247625, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.6953125, + "step": 566, + "time_per_iteration": 3.3542587757110596 + }, + { + "auxiliary_loss_clip": 0.08492532, + "auxiliary_loss_mlp": 0.0159982, + "balance_loss_clip": 0.07380439, + "balance_loss_mlp": 0.01460488, + "epoch": 0.034089884262738616, + "flos": 25010302383360.0, + "grad_norm": 17.320262523662066, + "language_loss": 0.91644871, + "learning_rate": 3.999824660018126e-06, + "loss": 1.01737225, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.39355469, + "step": 567, + "time_per_iteration": 2.7798964977264404 + }, + { + "auxiliary_loss_clip": 0.08452182, + "auxiliary_loss_mlp": 0.01578824, + "balance_loss_clip": 0.07376789, + "balance_loss_mlp": 0.01451318, + "epoch": 0.03415000751540658, + "flos": 28446643451520.0, + "grad_norm": 16.848598157475653, + "language_loss": 0.91613495, + "learning_rate": 3.999819465118447e-06, + "loss": 1.01644492, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 10.7578125, + "router_z_loss_mlp": 1.27539062, + "step": 568, + "time_per_iteration": 2.7506062984466553 + }, + { + "auxiliary_loss_clip": 0.08471178, + "auxiliary_loss_mlp": 0.01592293, + "balance_loss_clip": 0.07369491, + "balance_loss_mlp": 0.0146307, + "epoch": 0.034210130768074554, + "flos": 21474843534720.0, + "grad_norm": 19.531015605864777, + "language_loss": 0.96641582, + "learning_rate": 3.999814194385413e-06, + "loss": 1.06705046, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 11.0234375, + "router_z_loss_mlp": 1.29199219, + "step": 569, + "time_per_iteration": 2.679094076156616 + }, + { + "auxiliary_loss_clip": 0.08444348, + "auxiliary_loss_mlp": 0.01572924, + "balance_loss_clip": 0.07354259, + "balance_loss_mlp": 0.01444559, + "epoch": 0.03427025402074252, + "flos": 18703436941440.0, + "grad_norm": 10.09748529662486, + "language_loss": 1.03407526, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.13424802, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 10.90625, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 5.62298059463501 + }, + { + "auxiliary_loss_clip": 0.08452979, + "auxiliary_loss_mlp": 0.01597574, + "balance_loss_clip": 0.07344566, + "balance_loss_mlp": 0.01465204, + "epoch": 0.03433037727341049, + "flos": 20856253167360.0, + "grad_norm": 7.817701028438559, + "language_loss": 0.91945982, + "learning_rate": 3.9998034254200846e-06, + "loss": 1.01996529, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.32421875, + "step": 571, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.08401142, + "auxiliary_loss_mlp": 0.01674875, + "balance_loss_clip": 0.073204, + "balance_loss_mlp": 0.01534971, + "epoch": 0.03439050052607846, + "flos": 25417240536960.0, + "grad_norm": 10.131092922686104, + "language_loss": 0.93731064, + "learning_rate": 3.999797927188199e-06, + "loss": 1.0380708, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 10.8046875, + "router_z_loss_mlp": 1.39941406, + "step": 572, + "time_per_iteration": 4.118088483810425 + }, + { + "auxiliary_loss_clip": 0.08396388, + "auxiliary_loss_mlp": 0.01765484, + "balance_loss_clip": 0.07306887, + "balance_loss_mlp": 0.01610417, + "epoch": 0.03445062377874643, + "flos": 17646029141760.0, + "grad_norm": 20.127104681387284, + "language_loss": 0.93513721, + "learning_rate": 3.999792353123774e-06, + "loss": 1.03675592, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.55078125, + "step": 573, + "time_per_iteration": 2.743281841278076 + }, + { + "auxiliary_loss_clip": 0.08402257, + "auxiliary_loss_mlp": 0.01880152, + "balance_loss_clip": 0.07297936, + "balance_loss_mlp": 0.01694757, + "epoch": 0.0345107470314144, + "flos": 16770239066880.0, + "grad_norm": 36.525489937717154, + "language_loss": 0.90410393, + "learning_rate": 3.999786703227023e-06, + "loss": 1.00692797, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.85351562, + "step": 574, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.08410574, + "auxiliary_loss_mlp": 0.01951083, + "balance_loss_clip": 0.0729783, + "balance_loss_mlp": 0.01742514, + "epoch": 0.03457087028408237, + "flos": 14689776441600.0, + "grad_norm": 44.337021824182244, + "language_loss": 0.94332999, + "learning_rate": 3.9997809774981606e-06, + "loss": 1.04694653, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 2.08398438, + "step": 575, + "time_per_iteration": 2.6497297286987305 + }, + { + "auxiliary_loss_clip": 0.0841077, + "auxiliary_loss_mlp": 0.02005797, + "balance_loss_clip": 0.07284614, + "balance_loss_mlp": 0.01780635, + "epoch": 0.03463099353675034, + "flos": 20017499397120.0, + "grad_norm": 29.883353134979416, + "language_loss": 0.90882921, + "learning_rate": 3.9997751759374025e-06, + "loss": 1.01299489, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 2.24804688, + "step": 576, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.08418353, + "auxiliary_loss_mlp": 0.02062659, + "balance_loss_clip": 0.07293572, + "balance_loss_mlp": 0.01817947, + "epoch": 0.03469111678941831, + "flos": 25308144120960.0, + "grad_norm": 230.42461275956111, + "language_loss": 0.94618452, + "learning_rate": 3.99976929854497e-06, + "loss": 1.05099463, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 2.44921875, + "step": 577, + "time_per_iteration": 2.6817197799682617 + }, + { + "auxiliary_loss_clip": 0.08418664, + "auxiliary_loss_mlp": 0.02057238, + "balance_loss_clip": 0.07282382, + "balance_loss_mlp": 0.01803943, + "epoch": 0.034751240042086275, + "flos": 23266311027840.0, + "grad_norm": 40.134119868020754, + "language_loss": 0.81416667, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.9189257, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 11.359375, + "router_z_loss_mlp": 2.53320312, + "step": 578, + "time_per_iteration": 2.6971585750579834 + }, + { + "auxiliary_loss_clip": 0.08457734, + "auxiliary_loss_mlp": 0.0202791, + "balance_loss_clip": 0.07290839, + "balance_loss_mlp": 0.0177881, + "epoch": 0.03481136329475425, + "flos": 23776056541440.0, + "grad_norm": 24.631913893483972, + "language_loss": 0.86342728, + "learning_rate": 3.999757316265973e-06, + "loss": 0.96828371, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 11.6640625, + "router_z_loss_mlp": 2.4921875, + "step": 579, + "time_per_iteration": 2.694719076156616 + }, + { + "auxiliary_loss_clip": 0.08425288, + "auxiliary_loss_mlp": 0.0202294, + "balance_loss_clip": 0.07289667, + "balance_loss_mlp": 0.01773459, + "epoch": 0.03487148654742222, + "flos": 20163799825920.0, + "grad_norm": 24.746236106534205, + "language_loss": 0.94137156, + "learning_rate": 3.999751211379863e-06, + "loss": 1.04585385, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 2.49609375, + "step": 580, + "time_per_iteration": 2.6965222358703613 + }, + { + "auxiliary_loss_clip": 0.08429064, + "auxiliary_loss_mlp": 0.02027245, + "balance_loss_clip": 0.07292753, + "balance_loss_mlp": 0.01790066, + "epoch": 0.034931609800090184, + "flos": 15675082202880.0, + "grad_norm": 72.69729205239823, + "language_loss": 0.92401338, + "learning_rate": 3.999745030662987e-06, + "loss": 1.02857637, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 11.34375, + "router_z_loss_mlp": 2.37011719, + "step": 581, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.08388546, + "auxiliary_loss_mlp": 0.01934185, + "balance_loss_clip": 0.07261664, + "balance_loss_mlp": 0.01722183, + "epoch": 0.034991733052758156, + "flos": 16367912887680.0, + "grad_norm": 7.903206829146829, + "language_loss": 0.86330044, + "learning_rate": 3.99973877411558e-06, + "loss": 0.96652782, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 11.28125, + "router_z_loss_mlp": 2.11914062, + "step": 582, + "time_per_iteration": 2.649725914001465 + }, + { + "auxiliary_loss_clip": 0.08328964, + "auxiliary_loss_mlp": 0.01871683, + "balance_loss_clip": 0.07243238, + "balance_loss_mlp": 0.01678087, + "epoch": 0.03505185630542612, + "flos": 19392787681920.0, + "grad_norm": 16.174360943611433, + "language_loss": 0.95958614, + "learning_rate": 3.999732441737877e-06, + "loss": 1.06159258, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 10.859375, + "router_z_loss_mlp": 1.9375, + "step": 583, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.08363868, + "auxiliary_loss_mlp": 0.01881498, + "balance_loss_clip": 0.07254223, + "balance_loss_mlp": 0.0168199, + "epoch": 0.03511197955809409, + "flos": 21330094406400.0, + "grad_norm": 77.84633741200611, + "language_loss": 0.91128743, + "learning_rate": 3.99972603353012e-06, + "loss": 1.01374114, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 11.09375, + "router_z_loss_mlp": 1.99511719, + "step": 584, + "time_per_iteration": 2.6665167808532715 + }, + { + "auxiliary_loss_clip": 0.08332659, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 0.07228079, + "balance_loss_mlp": 0.01642279, + "epoch": 0.035172102810762065, + "flos": 14141736812160.0, + "grad_norm": 18.638483190058057, + "language_loss": 1.05479646, + "learning_rate": 3.999719549492551e-06, + "loss": 1.15642655, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.88183594, + "step": 585, + "time_per_iteration": 2.6243345737457275 + }, + { + "auxiliary_loss_clip": 0.08346213, + "auxiliary_loss_mlp": 0.01757237, + "balance_loss_clip": 0.07237425, + "balance_loss_mlp": 0.01597305, + "epoch": 0.03523222606343003, + "flos": 20302092190080.0, + "grad_norm": 16.531437097419627, + "language_loss": 0.96612549, + "learning_rate": 3.9997129896254165e-06, + "loss": 1.06716001, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.59960938, + "step": 586, + "time_per_iteration": 2.79085373878479 + }, + { + "auxiliary_loss_clip": 0.08346236, + "auxiliary_loss_mlp": 0.01816744, + "balance_loss_clip": 0.07224018, + "balance_loss_mlp": 0.01643652, + "epoch": 0.035292349316098, + "flos": 20382034584960.0, + "grad_norm": 18.968444028471765, + "language_loss": 0.85692161, + "learning_rate": 3.999706353928965e-06, + "loss": 0.95855141, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.73242188, + "step": 587, + "time_per_iteration": 2.6773126125335693 + }, + { + "auxiliary_loss_clip": 0.08336938, + "auxiliary_loss_mlp": 0.01864921, + "balance_loss_clip": 0.07205997, + "balance_loss_mlp": 0.01679527, + "epoch": 0.03535247256876597, + "flos": 21475011242880.0, + "grad_norm": 15.49018014588467, + "language_loss": 0.87486923, + "learning_rate": 3.999699642403449e-06, + "loss": 0.97688788, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.85546875, + "step": 588, + "time_per_iteration": 2.7011075019836426 + }, + { + "auxiliary_loss_clip": 0.08372419, + "auxiliary_loss_mlp": 0.01837943, + "balance_loss_clip": 0.07240701, + "balance_loss_mlp": 0.01648257, + "epoch": 0.03541259582143394, + "flos": 23629798039680.0, + "grad_norm": 7.372880070726386, + "language_loss": 1.04957795, + "learning_rate": 3.99969285504912e-06, + "loss": 1.15168166, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 11.3203125, + "router_z_loss_mlp": 1.8984375, + "step": 589, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.08381461, + "auxiliary_loss_mlp": 0.01904967, + "balance_loss_clip": 0.07235886, + "balance_loss_mlp": 0.0170708, + "epoch": 0.03547271907410191, + "flos": 33734269428480.0, + "grad_norm": 5.900447642035286, + "language_loss": 0.93457747, + "learning_rate": 3.99968599186624e-06, + "loss": 1.03744173, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 11.4609375, + "router_z_loss_mlp": 1.98046875, + "step": 590, + "time_per_iteration": 2.7626585960388184 + }, + { + "auxiliary_loss_clip": 0.08363292, + "auxiliary_loss_mlp": 0.01913512, + "balance_loss_clip": 0.07212853, + "balance_loss_mlp": 0.01716864, + "epoch": 0.03553284232676988, + "flos": 21149147514240.0, + "grad_norm": 8.056614912073432, + "language_loss": 0.93932045, + "learning_rate": 3.999679052855065e-06, + "loss": 1.04208851, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 1.96484375, + "step": 591, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.08372159, + "auxiliary_loss_mlp": 0.0192709, + "balance_loss_clip": 0.0721619, + "balance_loss_mlp": 0.01729871, + "epoch": 0.03559296557943785, + "flos": 20052607057920.0, + "grad_norm": 11.504016210282687, + "language_loss": 0.90931952, + "learning_rate": 3.999672038015861e-06, + "loss": 1.01231205, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 11.5546875, + "router_z_loss_mlp": 1.97363281, + "step": 592, + "time_per_iteration": 2.682248830795288 + }, + { + "auxiliary_loss_clip": 0.07476875, + "auxiliary_loss_mlp": 0.01418694, + "balance_loss_clip": 0.06931903, + "balance_loss_mlp": 0.01348551, + "epoch": 0.035653088832105814, + "flos": 60354742268160.0, + "grad_norm": 1.7390456768388496, + "language_loss": 0.61271667, + "learning_rate": 3.999664947348893e-06, + "loss": 0.70167232, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.70214844, + "step": 593, + "time_per_iteration": 3.372291088104248 + }, + { + "auxiliary_loss_clip": 0.08396088, + "auxiliary_loss_mlp": 0.01873215, + "balance_loss_clip": 0.07235788, + "balance_loss_mlp": 0.0169402, + "epoch": 0.035713212084773786, + "flos": 20118084624000.0, + "grad_norm": 4.056543882896522, + "language_loss": 0.9366371, + "learning_rate": 3.999657780854429e-06, + "loss": 1.03933024, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.79199219, + "step": 594, + "time_per_iteration": 2.656702756881714 + }, + { + "auxiliary_loss_clip": 0.08370538, + "auxiliary_loss_mlp": 0.01864142, + "balance_loss_clip": 0.07210694, + "balance_loss_mlp": 0.01671786, + "epoch": 0.03577333533744176, + "flos": 26292862903680.0, + "grad_norm": 7.659859705492133, + "language_loss": 0.90299201, + "learning_rate": 3.999650538532742e-06, + "loss": 1.00533891, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 11.609375, + "router_z_loss_mlp": 1.92480469, + "step": 595, + "time_per_iteration": 2.735182285308838 + }, + { + "auxiliary_loss_clip": 0.08357747, + "auxiliary_loss_mlp": 0.01819213, + "balance_loss_clip": 0.07199049, + "balance_loss_mlp": 0.01642402, + "epoch": 0.035833458590109724, + "flos": 10894392627840.0, + "grad_norm": 11.312857601205495, + "language_loss": 1.05936086, + "learning_rate": 3.999643220384106e-06, + "loss": 1.16113043, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.76953125, + "step": 596, + "time_per_iteration": 2.6456210613250732 + }, + { + "auxiliary_loss_clip": 0.08308871, + "auxiliary_loss_mlp": 0.01797355, + "balance_loss_clip": 0.07171883, + "balance_loss_mlp": 0.01627124, + "epoch": 0.035893581842777696, + "flos": 22096620357120.0, + "grad_norm": 9.130935198122538, + "language_loss": 0.90824974, + "learning_rate": 3.999635826408799e-06, + "loss": 1.00931203, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.70117188, + "step": 597, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.08270305, + "auxiliary_loss_mlp": 0.01746721, + "balance_loss_clip": 0.0715827, + "balance_loss_mlp": 0.01584406, + "epoch": 0.03595370509544566, + "flos": 23044847886720.0, + "grad_norm": 9.111056149089638, + "language_loss": 0.87109864, + "learning_rate": 3.999628356607101e-06, + "loss": 0.97126889, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.62402344, + "step": 598, + "time_per_iteration": 2.720789670944214 + }, + { + "auxiliary_loss_clip": 0.08249436, + "auxiliary_loss_mlp": 0.01768458, + "balance_loss_clip": 0.07144348, + "balance_loss_mlp": 0.01596511, + "epoch": 0.03601382834811363, + "flos": 20784109201920.0, + "grad_norm": 3.8408259345244593, + "language_loss": 0.87403977, + "learning_rate": 3.999620810979295e-06, + "loss": 0.97421879, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.71972656, + "step": 599, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.08292407, + "auxiliary_loss_mlp": 0.01772624, + "balance_loss_clip": 0.07133689, + "balance_loss_mlp": 0.01594573, + "epoch": 0.036073951600781605, + "flos": 23958470880000.0, + "grad_norm": 6.448569836830266, + "language_loss": 0.96199447, + "learning_rate": 3.999613189525668e-06, + "loss": 1.06264472, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 1.78027344, + "step": 600, + "time_per_iteration": 2.677182197570801 + }, + { + "auxiliary_loss_clip": 0.08248397, + "auxiliary_loss_mlp": 0.01755802, + "balance_loss_clip": 0.07142025, + "balance_loss_mlp": 0.01582996, + "epoch": 0.03613407485344957, + "flos": 18917562850560.0, + "grad_norm": 6.503034140887701, + "language_loss": 0.8985101, + "learning_rate": 3.999605492246508e-06, + "loss": 0.9985522, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 11.0703125, + "router_z_loss_mlp": 1.72753906, + "step": 601, + "time_per_iteration": 2.6344988346099854 + }, + { + "auxiliary_loss_clip": 0.08262836, + "auxiliary_loss_mlp": 0.01796413, + "balance_loss_clip": 0.07111854, + "balance_loss_mlp": 0.01602054, + "epoch": 0.03619419810611754, + "flos": 23045057521920.0, + "grad_norm": 7.606856937764795, + "language_loss": 0.83811623, + "learning_rate": 3.999597719142107e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 11.5234375, + "router_z_loss_mlp": 1.94335938, + "step": 602, + "time_per_iteration": 2.6544992923736572 + }, + { + "auxiliary_loss_clip": 0.08245073, + "auxiliary_loss_mlp": 0.01805812, + "balance_loss_clip": 0.07111835, + "balance_loss_mlp": 0.01607543, + "epoch": 0.03625432135878551, + "flos": 29465002448640.0, + "grad_norm": 10.358505294515373, + "language_loss": 0.86272752, + "learning_rate": 3.999589870212761e-06, + "loss": 0.96323633, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 11.328125, + "router_z_loss_mlp": 1.984375, + "step": 603, + "time_per_iteration": 2.7074103355407715 + }, + { + "auxiliary_loss_clip": 0.08216999, + "auxiliary_loss_mlp": 0.01791145, + "balance_loss_clip": 0.07080936, + "balance_loss_mlp": 0.01602794, + "epoch": 0.03631444461145348, + "flos": 23514412567680.0, + "grad_norm": 4.761739949728406, + "language_loss": 0.93545526, + "learning_rate": 3.9995819454587664e-06, + "loss": 1.03553677, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 11.3671875, + "router_z_loss_mlp": 1.88574219, + "step": 604, + "time_per_iteration": 2.683458089828491 + }, + { + "auxiliary_loss_clip": 0.08179027, + "auxiliary_loss_mlp": 0.01779272, + "balance_loss_clip": 0.07038404, + "balance_loss_mlp": 0.01587965, + "epoch": 0.03637456786412145, + "flos": 16623770929920.0, + "grad_norm": 10.408229209770424, + "language_loss": 0.89575511, + "learning_rate": 3.999573944880424e-06, + "loss": 0.99533808, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.91308594, + "step": 605, + "time_per_iteration": 2.6058335304260254 + }, + { + "auxiliary_loss_clip": 0.08185698, + "auxiliary_loss_mlp": 0.0179345, + "balance_loss_clip": 0.07041989, + "balance_loss_mlp": 0.01587933, + "epoch": 0.03643469111678942, + "flos": 15857328833280.0, + "grad_norm": 18.44965350869095, + "language_loss": 0.94496262, + "learning_rate": 3.9995658684780375e-06, + "loss": 1.04475403, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.05566406, + "step": 606, + "time_per_iteration": 2.6620774269104004 + }, + { + "auxiliary_loss_clip": 0.0816614, + "auxiliary_loss_mlp": 0.01748117, + "balance_loss_clip": 0.07028672, + "balance_loss_mlp": 0.01549944, + "epoch": 0.03649481436945739, + "flos": 23626695438720.0, + "grad_norm": 22.881578639374155, + "language_loss": 0.89864534, + "learning_rate": 3.999557716251912e-06, + "loss": 0.99778789, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.98144531, + "step": 607, + "time_per_iteration": 2.643644332885742 + }, + { + "auxiliary_loss_clip": 0.08159362, + "auxiliary_loss_mlp": 0.01746593, + "balance_loss_clip": 0.07035235, + "balance_loss_mlp": 0.01550708, + "epoch": 0.036554937622125354, + "flos": 21760903774080.0, + "grad_norm": 5.869564247499357, + "language_loss": 0.89574814, + "learning_rate": 3.999549488202358e-06, + "loss": 0.99480766, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 1.95800781, + "step": 608, + "time_per_iteration": 2.6450629234313965 + }, + { + "auxiliary_loss_clip": 0.08127657, + "auxiliary_loss_mlp": 0.01727103, + "balance_loss_clip": 0.07009961, + "balance_loss_mlp": 0.01525497, + "epoch": 0.036615060874793326, + "flos": 17825215098240.0, + "grad_norm": 10.044459064109706, + "language_loss": 0.90011758, + "learning_rate": 3.999541184329688e-06, + "loss": 0.99866509, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 2.01464844, + "step": 609, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.08147175, + "auxiliary_loss_mlp": 0.01709632, + "balance_loss_clip": 0.07004737, + "balance_loss_mlp": 0.01506309, + "epoch": 0.0366751841274613, + "flos": 26759911962240.0, + "grad_norm": 23.288197653985222, + "language_loss": 0.89072526, + "learning_rate": 3.999532804634215e-06, + "loss": 0.98929334, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 2.03515625, + "step": 610, + "time_per_iteration": 4.13908052444458 + }, + { + "auxiliary_loss_clip": 0.08141156, + "auxiliary_loss_mlp": 0.01701532, + "balance_loss_clip": 0.06999695, + "balance_loss_mlp": 0.01503454, + "epoch": 0.03673530738012926, + "flos": 22202949588480.0, + "grad_norm": 12.716864123026268, + "language_loss": 0.93839324, + "learning_rate": 3.9995243491162575e-06, + "loss": 1.03682017, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 11.421875, + "router_z_loss_mlp": 1.98046875, + "step": 611, + "time_per_iteration": 4.084355354309082 + }, + { + "auxiliary_loss_clip": 0.08129553, + "auxiliary_loss_mlp": 0.01677889, + "balance_loss_clip": 0.07002232, + "balance_loss_mlp": 0.01494783, + "epoch": 0.036795430632797235, + "flos": 24688673285760.0, + "grad_norm": 5.856966427284507, + "language_loss": 0.80289567, + "learning_rate": 3.999515817776136e-06, + "loss": 0.9009701, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 11.296875, + "router_z_loss_mlp": 1.83007812, + "step": 612, + "time_per_iteration": 2.797450065612793 + }, + { + "auxiliary_loss_clip": 0.08124618, + "auxiliary_loss_mlp": 0.01670571, + "balance_loss_clip": 0.06981046, + "balance_loss_mlp": 0.01486607, + "epoch": 0.0368555538854652, + "flos": 17754706287360.0, + "grad_norm": 13.343841316796098, + "language_loss": 0.86962521, + "learning_rate": 3.999507210614175e-06, + "loss": 0.9675771, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 11.4453125, + "router_z_loss_mlp": 1.83984375, + "step": 613, + "time_per_iteration": 4.1074419021606445 + }, + { + "auxiliary_loss_clip": 0.0806347, + "auxiliary_loss_mlp": 0.01642999, + "balance_loss_clip": 0.0695873, + "balance_loss_mlp": 0.01476392, + "epoch": 0.03691567713813317, + "flos": 20600772468480.0, + "grad_norm": 5.522225672422525, + "language_loss": 1.0065136, + "learning_rate": 3.9994985276307e-06, + "loss": 1.10357833, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.66699219, + "step": 614, + "time_per_iteration": 2.645425796508789 + }, + { + "auxiliary_loss_clip": 0.08091287, + "auxiliary_loss_mlp": 0.01664825, + "balance_loss_clip": 0.06965354, + "balance_loss_mlp": 0.01476188, + "epoch": 0.036975800390801145, + "flos": 33657765050880.0, + "grad_norm": 13.032636577175042, + "language_loss": 0.81820416, + "learning_rate": 3.999489768826041e-06, + "loss": 0.91576523, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 11.265625, + "router_z_loss_mlp": 1.88671875, + "step": 615, + "time_per_iteration": 2.781172752380371 + }, + { + "auxiliary_loss_clip": 0.08073606, + "auxiliary_loss_mlp": 0.01648642, + "balance_loss_clip": 0.06957066, + "balance_loss_mlp": 0.01467158, + "epoch": 0.03703592364346911, + "flos": 28301307344640.0, + "grad_norm": 5.888176936290721, + "language_loss": 0.88226712, + "learning_rate": 3.999480934200528e-06, + "loss": 0.97948968, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.81445312, + "step": 616, + "time_per_iteration": 2.712480068206787 + }, + { + "auxiliary_loss_clip": 0.08063665, + "auxiliary_loss_mlp": 0.01595674, + "balance_loss_clip": 0.06951402, + "balance_loss_mlp": 0.01438985, + "epoch": 0.03709604689613708, + "flos": 31512327984000.0, + "grad_norm": 15.942016878304402, + "language_loss": 0.7623843, + "learning_rate": 3.999472023754499e-06, + "loss": 0.85897768, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.56738281, + "step": 617, + "time_per_iteration": 2.738520622253418 + }, + { + "auxiliary_loss_clip": 0.08034836, + "auxiliary_loss_mlp": 0.01559373, + "balance_loss_clip": 0.06941325, + "balance_loss_mlp": 0.01401445, + "epoch": 0.03715617014880505, + "flos": 19615424780160.0, + "grad_norm": 6.714823910826054, + "language_loss": 0.88676983, + "learning_rate": 3.99946303748829e-06, + "loss": 0.98271191, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.57910156, + "step": 618, + "time_per_iteration": 2.6463687419891357 + }, + { + "auxiliary_loss_clip": 0.08035833, + "auxiliary_loss_mlp": 0.0158681, + "balance_loss_clip": 0.06917505, + "balance_loss_mlp": 0.01430789, + "epoch": 0.03721629340147302, + "flos": 15929598579840.0, + "grad_norm": 200.27470015941975, + "language_loss": 0.97611117, + "learning_rate": 3.999453975402242e-06, + "loss": 1.07233763, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 11.171875, + "router_z_loss_mlp": 1.55957031, + "step": 619, + "time_per_iteration": 2.6415488719940186 + }, + { + "auxiliary_loss_clip": 0.08024481, + "auxiliary_loss_mlp": 0.01545146, + "balance_loss_clip": 0.06915386, + "balance_loss_mlp": 0.01399139, + "epoch": 0.03727641665414099, + "flos": 21110182565760.0, + "grad_norm": 5.601090655471351, + "language_loss": 1.00407517, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0997715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.4609375, + "step": 620, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.08002374, + "auxiliary_loss_mlp": 0.01557386, + "balance_loss_clip": 0.06899319, + "balance_loss_mlp": 0.01406705, + "epoch": 0.037336539906808956, + "flos": 24138159960960.0, + "grad_norm": 36.40398806521908, + "language_loss": 0.83474398, + "learning_rate": 3.999435623772008e-06, + "loss": 0.9303416, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.5078125, + "step": 621, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.07971206, + "auxiliary_loss_mlp": 0.01523645, + "balance_loss_clip": 0.06889994, + "balance_loss_mlp": 0.01385266, + "epoch": 0.03739666315947693, + "flos": 22352981523840.0, + "grad_norm": 9.446463642728892, + "language_loss": 0.92411411, + "learning_rate": 3.999426334228518e-06, + "loss": 1.01906252, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 10.828125, + "router_z_loss_mlp": 1.38378906, + "step": 622, + "time_per_iteration": 2.658414363861084 + }, + { + "auxiliary_loss_clip": 0.07994708, + "auxiliary_loss_mlp": 0.01510841, + "balance_loss_clip": 0.06888318, + "balance_loss_mlp": 0.01382, + "epoch": 0.0374567864121449, + "flos": 20455855632000.0, + "grad_norm": 11.361437110202797, + "language_loss": 0.97279346, + "learning_rate": 3.999416968866581e-06, + "loss": 1.06784892, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 11.0546875, + "router_z_loss_mlp": 1.2890625, + "step": 623, + "time_per_iteration": 2.641080617904663 + }, + { + "auxiliary_loss_clip": 0.07990901, + "auxiliary_loss_mlp": 0.01512746, + "balance_loss_clip": 0.06881022, + "balance_loss_mlp": 0.0138009, + "epoch": 0.037516909664812866, + "flos": 19214020995840.0, + "grad_norm": 6.5992711028490865, + "language_loss": 0.9044131, + "learning_rate": 3.999407527686551e-06, + "loss": 0.99944961, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 11.1171875, + "router_z_loss_mlp": 1.32714844, + "step": 624, + "time_per_iteration": 2.6581132411956787 + }, + { + "auxiliary_loss_clip": 0.07970337, + "auxiliary_loss_mlp": 0.0150074, + "balance_loss_clip": 0.06882318, + "balance_loss_mlp": 0.01368561, + "epoch": 0.03757703291748084, + "flos": 35013643493760.0, + "grad_norm": 9.813739409664771, + "language_loss": 0.77213168, + "learning_rate": 3.999398010688788e-06, + "loss": 0.86684251, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32128906, + "step": 625, + "time_per_iteration": 2.741912603378296 + }, + { + "auxiliary_loss_clip": 0.07975402, + "auxiliary_loss_mlp": 0.01499832, + "balance_loss_clip": 0.06869578, + "balance_loss_mlp": 0.01362599, + "epoch": 0.0376371561701488, + "flos": 25490977729920.0, + "grad_norm": 10.795152981420221, + "language_loss": 0.84230971, + "learning_rate": 3.999388417873652e-06, + "loss": 0.93706203, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 11.0625, + "router_z_loss_mlp": 1.37207031, + "step": 626, + "time_per_iteration": 2.7070746421813965 + }, + { + "auxiliary_loss_clip": 0.07968426, + "auxiliary_loss_mlp": 0.01497735, + "balance_loss_clip": 0.06873227, + "balance_loss_mlp": 0.01361264, + "epoch": 0.037697279422816775, + "flos": 18191301586560.0, + "grad_norm": 4.940336590948721, + "language_loss": 0.86271065, + "learning_rate": 3.999378749241506e-06, + "loss": 0.95737231, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 10.953125, + "router_z_loss_mlp": 1.36425781, + "step": 627, + "time_per_iteration": 2.622081756591797 + }, + { + "auxiliary_loss_clip": 0.07952641, + "auxiliary_loss_mlp": 0.01462314, + "balance_loss_clip": 0.06847817, + "balance_loss_mlp": 0.01327273, + "epoch": 0.03775740267548475, + "flos": 24651133856640.0, + "grad_norm": 5.044807916969655, + "language_loss": 0.93558288, + "learning_rate": 3.999369004792719e-06, + "loss": 1.02973247, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 11.046875, + "router_z_loss_mlp": 1.35058594, + "step": 628, + "time_per_iteration": 2.699890375137329 + }, + { + "auxiliary_loss_clip": 0.07954629, + "auxiliary_loss_mlp": 0.01473174, + "balance_loss_clip": 0.06867678, + "balance_loss_mlp": 0.01340232, + "epoch": 0.03781752592815271, + "flos": 21294609402240.0, + "grad_norm": 4.416786805856079, + "language_loss": 0.86205798, + "learning_rate": 3.999359184527658e-06, + "loss": 0.95633596, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 10.8828125, + "router_z_loss_mlp": 1.32910156, + "step": 629, + "time_per_iteration": 2.629606246948242 + }, + { + "auxiliary_loss_clip": 0.07949786, + "auxiliary_loss_mlp": 0.01478041, + "balance_loss_clip": 0.06862906, + "balance_loss_mlp": 0.01348436, + "epoch": 0.037877649180820684, + "flos": 22095949524480.0, + "grad_norm": 11.02025815590499, + "language_loss": 0.82977569, + "learning_rate": 3.999349288446696e-06, + "loss": 0.92405391, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 10.8671875, + "router_z_loss_mlp": 1.29589844, + "step": 630, + "time_per_iteration": 2.6579172611236572 + }, + { + "auxiliary_loss_clip": 0.07989411, + "auxiliary_loss_mlp": 0.01449511, + "balance_loss_clip": 0.06879212, + "balance_loss_mlp": 0.01315711, + "epoch": 0.03793777243348865, + "flos": 14506523562240.0, + "grad_norm": 6.642300097880606, + "language_loss": 0.99746037, + "learning_rate": 3.99933931655021e-06, + "loss": 1.09184957, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 11.1015625, + "router_z_loss_mlp": 1.33789062, + "step": 631, + "time_per_iteration": 2.5856504440307617 + }, + { + "auxiliary_loss_clip": 0.079531, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 0.06880549, + "balance_loss_mlp": 0.01321221, + "epoch": 0.03799789568615662, + "flos": 21914918778240.0, + "grad_norm": 6.504165414948274, + "language_loss": 0.96511495, + "learning_rate": 3.999329268838575e-06, + "loss": 1.05912662, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.26953125, + "step": 632, + "time_per_iteration": 2.6638169288635254 + }, + { + "auxiliary_loss_clip": 0.07980786, + "auxiliary_loss_mlp": 0.01460671, + "balance_loss_clip": 0.06883863, + "balance_loss_mlp": 0.0132668, + "epoch": 0.03805801893882459, + "flos": 24833967465600.0, + "grad_norm": 3.720972995518591, + "language_loss": 0.88515753, + "learning_rate": 3.999319145312175e-06, + "loss": 0.97957206, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 10.984375, + "router_z_loss_mlp": 1.33984375, + "step": 633, + "time_per_iteration": 2.7479147911071777 + }, + { + "auxiliary_loss_clip": 0.07973721, + "auxiliary_loss_mlp": 0.01476512, + "balance_loss_clip": 0.06873562, + "balance_loss_mlp": 0.01335273, + "epoch": 0.03811814219149256, + "flos": 30490950240000.0, + "grad_norm": 5.013866846245917, + "language_loss": 0.74909431, + "learning_rate": 3.999308945971392e-06, + "loss": 0.84359664, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 11.0078125, + "router_z_loss_mlp": 1.4140625, + "step": 634, + "time_per_iteration": 2.7746760845184326 + }, + { + "auxiliary_loss_clip": 0.07892692, + "auxiliary_loss_mlp": 0.01617175, + "balance_loss_clip": 0.0733197, + "balance_loss_mlp": 0.01455336, + "epoch": 0.03817826544416053, + "flos": 67010671820160.0, + "grad_norm": 1.8703584651187424, + "language_loss": 0.63503969, + "learning_rate": 3.999298670816614e-06, + "loss": 0.73013842, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 1.61816406, + "step": 635, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.08014892, + "auxiliary_loss_mlp": 0.01535345, + "balance_loss_clip": 0.06916042, + "balance_loss_mlp": 0.01392198, + "epoch": 0.038238388696828496, + "flos": 20491592198400.0, + "grad_norm": 9.695955755206388, + "language_loss": 0.90505767, + "learning_rate": 3.9992883198482294e-06, + "loss": 1.00056005, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 10.9921875, + "router_z_loss_mlp": 1.43066406, + "step": 636, + "time_per_iteration": 2.6479721069335938 + }, + { + "auxiliary_loss_clip": 0.08042439, + "auxiliary_loss_mlp": 0.01559473, + "balance_loss_clip": 0.06923507, + "balance_loss_mlp": 0.01399637, + "epoch": 0.03829851194949647, + "flos": 17971389745920.0, + "grad_norm": 32.79410112755353, + "language_loss": 0.88142544, + "learning_rate": 3.999277893066632e-06, + "loss": 0.97744453, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 11.1796875, + "router_z_loss_mlp": 1.59667969, + "step": 637, + "time_per_iteration": 2.6563000679016113 + }, + { + "auxiliary_loss_clip": 0.08110388, + "auxiliary_loss_mlp": 0.0159766, + "balance_loss_clip": 0.06951486, + "balance_loss_mlp": 0.0144078, + "epoch": 0.03835863520216444, + "flos": 22463251896960.0, + "grad_norm": 37.67076952511291, + "language_loss": 0.91187263, + "learning_rate": 3.999267390472215e-06, + "loss": 1.00895298, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.56933594, + "step": 638, + "time_per_iteration": 2.6984195709228516 + }, + { + "auxiliary_loss_clip": 0.08094786, + "auxiliary_loss_mlp": 0.01648944, + "balance_loss_clip": 0.0693827, + "balance_loss_mlp": 0.01462406, + "epoch": 0.038418758454832405, + "flos": 22171070309760.0, + "grad_norm": 8.895472090968715, + "language_loss": 0.76717615, + "learning_rate": 3.999256812065381e-06, + "loss": 0.86461353, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 11.5703125, + "router_z_loss_mlp": 1.86621094, + "step": 639, + "time_per_iteration": 2.7338461875915527 + }, + { + "auxiliary_loss_clip": 0.08159171, + "auxiliary_loss_mlp": 0.0166434, + "balance_loss_clip": 0.06976852, + "balance_loss_mlp": 0.01475227, + "epoch": 0.03847888170750038, + "flos": 22754049891840.0, + "grad_norm": 14.750114797034104, + "language_loss": 0.93037415, + "learning_rate": 3.999246157846526e-06, + "loss": 1.02860928, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 11.8203125, + "router_z_loss_mlp": 1.890625, + "step": 640, + "time_per_iteration": 2.6571292877197266 + }, + { + "auxiliary_loss_clip": 0.08171181, + "auxiliary_loss_mlp": 0.01715232, + "balance_loss_clip": 0.06975375, + "balance_loss_mlp": 0.01501704, + "epoch": 0.03853900496016834, + "flos": 22717852128000.0, + "grad_norm": 10.934463540103733, + "language_loss": 0.90094578, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.99980986, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.1328125, + "step": 641, + "time_per_iteration": 2.6885619163513184 + }, + { + "auxiliary_loss_clip": 0.07644878, + "auxiliary_loss_mlp": 0.01447392, + "balance_loss_clip": 0.07120143, + "balance_loss_mlp": 0.01325512, + "epoch": 0.038599128212836314, + "flos": 70420039073280.0, + "grad_norm": 0.9281695288015585, + "language_loss": 0.65025115, + "learning_rate": 3.999224621974381e-06, + "loss": 0.74117386, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 1.21679688, + "step": 642, + "time_per_iteration": 3.2678098678588867 + }, + { + "auxiliary_loss_clip": 0.08201542, + "auxiliary_loss_mlp": 0.01819887, + "balance_loss_clip": 0.07001273, + "balance_loss_mlp": 0.01562014, + "epoch": 0.03865925146550429, + "flos": 23301921813120.0, + "grad_norm": 11.481508748032715, + "language_loss": 0.86633605, + "learning_rate": 3.999213740321906e-06, + "loss": 0.96655035, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 11.9921875, + "router_z_loss_mlp": 2.57617188, + "step": 643, + "time_per_iteration": 2.659075975418091 + }, + { + "auxiliary_loss_clip": 0.08181606, + "auxiliary_loss_mlp": 0.01825318, + "balance_loss_clip": 0.06992409, + "balance_loss_mlp": 0.01547799, + "epoch": 0.03871937471817225, + "flos": 21436255929600.0, + "grad_norm": 51.325604168223556, + "language_loss": 0.89457649, + "learning_rate": 3.999202782859046e-06, + "loss": 0.99464566, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.77539062, + "step": 644, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.08227627, + "auxiliary_loss_mlp": 0.01840427, + "balance_loss_clip": 0.07032949, + "balance_loss_mlp": 0.01557186, + "epoch": 0.038779497970840224, + "flos": 34285914783360.0, + "grad_norm": 72.96819975442757, + "language_loss": 0.90063643, + "learning_rate": 3.9991917495862165e-06, + "loss": 1.00131702, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 11.953125, + "router_z_loss_mlp": 2.83007812, + "step": 645, + "time_per_iteration": 2.732840061187744 + }, + { + "auxiliary_loss_clip": 0.08212948, + "auxiliary_loss_mlp": 0.01875445, + "balance_loss_clip": 0.07012647, + "balance_loss_mlp": 0.01580378, + "epoch": 0.03883962122350819, + "flos": 22754930359680.0, + "grad_norm": 12.262203154186425, + "language_loss": 0.90520537, + "learning_rate": 3.9991806405038345e-06, + "loss": 1.00608933, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 12.0078125, + "router_z_loss_mlp": 2.95117188, + "step": 646, + "time_per_iteration": 2.6865735054016113 + }, + { + "auxiliary_loss_clip": 0.08250429, + "auxiliary_loss_mlp": 0.01894148, + "balance_loss_clip": 0.07030701, + "balance_loss_mlp": 0.01611288, + "epoch": 0.03889974447617616, + "flos": 21952500134400.0, + "grad_norm": 17.1595872898191, + "language_loss": 0.88891035, + "learning_rate": 3.999169455612323e-06, + "loss": 0.99035615, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 12.1953125, + "router_z_loss_mlp": 2.83007812, + "step": 647, + "time_per_iteration": 2.648667097091675 + }, + { + "auxiliary_loss_clip": 0.08277115, + "auxiliary_loss_mlp": 0.01910975, + "balance_loss_clip": 0.0706424, + "balance_loss_mlp": 0.01610376, + "epoch": 0.03895986772884413, + "flos": 31513040743680.0, + "grad_norm": 19.91369953833428, + "language_loss": 0.91710514, + "learning_rate": 3.999158194912106e-06, + "loss": 1.01898599, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 3.00585938, + "step": 648, + "time_per_iteration": 2.7659173011779785 + }, + { + "auxiliary_loss_clip": 0.08252379, + "auxiliary_loss_mlp": 0.0196062, + "balance_loss_clip": 0.0704875, + "balance_loss_mlp": 0.01647243, + "epoch": 0.0390199909815121, + "flos": 19907061315840.0, + "grad_norm": 11.116514995705378, + "language_loss": 0.90245318, + "learning_rate": 3.9991468584036086e-06, + "loss": 1.00458312, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 12.0234375, + "router_z_loss_mlp": 3.1328125, + "step": 649, + "time_per_iteration": 4.126534938812256 + }, + { + "auxiliary_loss_clip": 0.08304022, + "auxiliary_loss_mlp": 0.01986477, + "balance_loss_clip": 0.07056045, + "balance_loss_mlp": 0.01679394, + "epoch": 0.03908011423418007, + "flos": 21618250997760.0, + "grad_norm": 9.336868328216912, + "language_loss": 0.85345471, + "learning_rate": 3.999135446087263e-06, + "loss": 0.95635974, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 12.484375, + "router_z_loss_mlp": 3.07421875, + "step": 650, + "time_per_iteration": 4.1806252002716064 + }, + { + "auxiliary_loss_clip": 0.08239638, + "auxiliary_loss_mlp": 0.01912282, + "balance_loss_clip": 0.0705025, + "balance_loss_mlp": 0.01647351, + "epoch": 0.039140237486848035, + "flos": 18667406885760.0, + "grad_norm": 11.202480244033193, + "language_loss": 0.84588236, + "learning_rate": 3.9991239579635e-06, + "loss": 0.94740158, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 11.890625, + "router_z_loss_mlp": 2.6484375, + "step": 651, + "time_per_iteration": 4.02846360206604 + }, + { + "auxiliary_loss_clip": 0.08228613, + "auxiliary_loss_mlp": 0.01893436, + "balance_loss_clip": 0.07038778, + "balance_loss_mlp": 0.01631557, + "epoch": 0.03920036073951601, + "flos": 18667071469440.0, + "grad_norm": 33.17940308554231, + "language_loss": 0.9516173, + "learning_rate": 3.999112394032757e-06, + "loss": 1.05283785, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 11.90625, + "router_z_loss_mlp": 2.6171875, + "step": 652, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.08188264, + "auxiliary_loss_mlp": 0.01841461, + "balance_loss_clip": 0.07017257, + "balance_loss_mlp": 0.01607716, + "epoch": 0.03926048399218398, + "flos": 31361918705280.0, + "grad_norm": 14.717862862310868, + "language_loss": 0.87065995, + "learning_rate": 3.999100754295471e-06, + "loss": 0.97095722, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 2.33691406, + "step": 653, + "time_per_iteration": 4.161829948425293 + }, + { + "auxiliary_loss_clip": 0.08235107, + "auxiliary_loss_mlp": 0.01869742, + "balance_loss_clip": 0.07023594, + "balance_loss_mlp": 0.01632659, + "epoch": 0.039320607244851945, + "flos": 29610715898880.0, + "grad_norm": 12.720561465838024, + "language_loss": 0.92308909, + "learning_rate": 3.999089038752085e-06, + "loss": 1.0241375, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 12.125, + "router_z_loss_mlp": 2.37304688, + "step": 654, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.07219759, + "auxiliary_loss_mlp": 0.01432266, + "balance_loss_clip": 0.0672446, + "balance_loss_mlp": 0.01342621, + "epoch": 0.03938073049751992, + "flos": 66555362332800.0, + "grad_norm": 4.21609108891928, + "language_loss": 0.5259136, + "learning_rate": 3.999077247403041e-06, + "loss": 0.61243391, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.89599609, + "step": 655, + "time_per_iteration": 3.3539531230926514 + }, + { + "auxiliary_loss_clip": 0.08163472, + "auxiliary_loss_mlp": 0.01789512, + "balance_loss_clip": 0.07021941, + "balance_loss_mlp": 0.01601352, + "epoch": 0.03944085375018788, + "flos": 23374568903040.0, + "grad_norm": 42.09331718280733, + "language_loss": 0.85369515, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.95322502, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 11.4140625, + "router_z_loss_mlp": 1.88183594, + "step": 656, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.08208387, + "auxiliary_loss_mlp": 0.01830457, + "balance_loss_clip": 0.07014482, + "balance_loss_mlp": 0.01624177, + "epoch": 0.039500977002855854, + "flos": 18553656568320.0, + "grad_norm": 12.61442729870119, + "language_loss": 0.83751947, + "learning_rate": 3.999053437289776e-06, + "loss": 0.93790793, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 2.06347656, + "step": 657, + "time_per_iteration": 2.6805458068847656 + }, + { + "auxiliary_loss_clip": 0.08160911, + "auxiliary_loss_mlp": 0.01759172, + "balance_loss_clip": 0.07011348, + "balance_loss_mlp": 0.0155871, + "epoch": 0.039561100255523826, + "flos": 25345264279680.0, + "grad_norm": 59.81491010429953, + "language_loss": 0.86573362, + "learning_rate": 3.999041418526457e-06, + "loss": 0.96493447, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 11.5, + "router_z_loss_mlp": 2.00488281, + "step": 658, + "time_per_iteration": 2.7667956352233887 + }, + { + "auxiliary_loss_clip": 0.08139389, + "auxiliary_loss_mlp": 0.01752558, + "balance_loss_clip": 0.07002386, + "balance_loss_mlp": 0.01577368, + "epoch": 0.03962122350819179, + "flos": 18225193363200.0, + "grad_norm": 13.067415763006752, + "language_loss": 0.97220278, + "learning_rate": 3.999029323959287e-06, + "loss": 1.07112217, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 11.375, + "router_z_loss_mlp": 1.75097656, + "step": 659, + "time_per_iteration": 2.7390072345733643 + }, + { + "auxiliary_loss_clip": 0.08160311, + "auxiliary_loss_mlp": 0.01767653, + "balance_loss_clip": 0.07020363, + "balance_loss_mlp": 0.01584643, + "epoch": 0.03968134676085976, + "flos": 20528544648960.0, + "grad_norm": 6.696604257077815, + "language_loss": 0.85069668, + "learning_rate": 3.999017153588724e-06, + "loss": 0.94997621, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 11.40625, + "router_z_loss_mlp": 1.83203125, + "step": 660, + "time_per_iteration": 2.6942412853240967 + }, + { + "auxiliary_loss_clip": 0.08128712, + "auxiliary_loss_mlp": 0.01673628, + "balance_loss_clip": 0.07018431, + "balance_loss_mlp": 0.01512361, + "epoch": 0.03974147001352773, + "flos": 22429737463680.0, + "grad_norm": 7.3843033134333425, + "language_loss": 0.86255896, + "learning_rate": 3.999004907415231e-06, + "loss": 0.96058238, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 11.109375, + "router_z_loss_mlp": 1.61132812, + "step": 661, + "time_per_iteration": 2.688343048095703 + }, + { + "auxiliary_loss_clip": 0.07200997, + "auxiliary_loss_mlp": 0.01397595, + "balance_loss_clip": 0.06707223, + "balance_loss_mlp": 0.01289354, + "epoch": 0.0398015932661957, + "flos": 71149780281600.0, + "grad_norm": 0.9134370604104062, + "language_loss": 0.69827634, + "learning_rate": 3.998992585439272e-06, + "loss": 0.78426224, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 1.08496094, + "step": 662, + "time_per_iteration": 3.4075381755828857 + }, + { + "auxiliary_loss_clip": 0.08114735, + "auxiliary_loss_mlp": 0.01667295, + "balance_loss_clip": 0.06992006, + "balance_loss_mlp": 0.01495347, + "epoch": 0.03986171651886367, + "flos": 16806688392960.0, + "grad_norm": 88.3041379662575, + "language_loss": 0.8901574, + "learning_rate": 3.998980187661314e-06, + "loss": 0.98797774, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 11.234375, + "router_z_loss_mlp": 1.71875, + "step": 663, + "time_per_iteration": 2.6151316165924072 + }, + { + "auxiliary_loss_clip": 0.08116017, + "auxiliary_loss_mlp": 0.01665745, + "balance_loss_clip": 0.06974875, + "balance_loss_mlp": 0.01491318, + "epoch": 0.03992183977153164, + "flos": 24541953586560.0, + "grad_norm": 13.584726936237926, + "language_loss": 0.92355931, + "learning_rate": 3.998967714081826e-06, + "loss": 1.02137709, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 11.3984375, + "router_z_loss_mlp": 1.74511719, + "step": 664, + "time_per_iteration": 2.7008705139160156 + }, + { + "auxiliary_loss_clip": 0.08040652, + "auxiliary_loss_mlp": 0.01593066, + "balance_loss_clip": 0.06989275, + "balance_loss_mlp": 0.01449252, + "epoch": 0.03998196302419961, + "flos": 15601261155840.0, + "grad_norm": 12.968973833741712, + "language_loss": 0.90573943, + "learning_rate": 3.998955164701281e-06, + "loss": 1.00207651, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 10.5078125, + "router_z_loss_mlp": 1.43847656, + "step": 665, + "time_per_iteration": 2.588078737258911 + }, + { + "auxiliary_loss_clip": 0.0806282, + "auxiliary_loss_mlp": 0.01620663, + "balance_loss_clip": 0.06955597, + "balance_loss_mlp": 0.01454533, + "epoch": 0.04004208627686758, + "flos": 25312714168320.0, + "grad_norm": 13.194143098844163, + "language_loss": 0.86261296, + "learning_rate": 3.998942539520158e-06, + "loss": 0.9594478, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 1.66113281, + "step": 666, + "time_per_iteration": 2.7150063514709473 + }, + { + "auxiliary_loss_clip": 0.08039176, + "auxiliary_loss_mlp": 0.01580059, + "balance_loss_clip": 0.06968041, + "balance_loss_mlp": 0.01428235, + "epoch": 0.04010220952953555, + "flos": 23482365580800.0, + "grad_norm": 143.76139759772911, + "language_loss": 0.91256213, + "learning_rate": 3.998929838538932e-06, + "loss": 1.00875449, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 10.71875, + "router_z_loss_mlp": 1.51855469, + "step": 667, + "time_per_iteration": 2.6658053398132324 + }, + { + "auxiliary_loss_clip": 0.08004649, + "auxiliary_loss_mlp": 0.01530234, + "balance_loss_clip": 0.06972381, + "balance_loss_mlp": 0.01387469, + "epoch": 0.04016233278220352, + "flos": 18621691683840.0, + "grad_norm": 22.359711377029505, + "language_loss": 0.8821072, + "learning_rate": 3.998917061758087e-06, + "loss": 0.97745597, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 10.3046875, + "router_z_loss_mlp": 1.42773438, + "step": 668, + "time_per_iteration": 2.6255545616149902 + }, + { + "auxiliary_loss_clip": 0.07152489, + "auxiliary_loss_mlp": 0.01341531, + "balance_loss_clip": 0.06666718, + "balance_loss_mlp": 0.01260421, + "epoch": 0.040222456034871484, + "flos": 70926556204800.0, + "grad_norm": 1.1799050230194268, + "language_loss": 0.60729092, + "learning_rate": 3.998904209178107e-06, + "loss": 0.69223112, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.81103516, + "step": 669, + "time_per_iteration": 3.3595035076141357 + }, + { + "auxiliary_loss_clip": 0.08017544, + "auxiliary_loss_mlp": 0.01537312, + "balance_loss_clip": 0.06961209, + "balance_loss_mlp": 0.0138749, + "epoch": 0.040282579287539456, + "flos": 23770773734400.0, + "grad_norm": 21.749949136203163, + "language_loss": 0.91578722, + "learning_rate": 3.9988912807994785e-06, + "loss": 1.01133573, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.49707031, + "step": 670, + "time_per_iteration": 2.66859769821167 + }, + { + "auxiliary_loss_clip": 0.08002704, + "auxiliary_loss_mlp": 0.01555976, + "balance_loss_clip": 0.0695509, + "balance_loss_mlp": 0.01413116, + "epoch": 0.04034270254020743, + "flos": 18484405568640.0, + "grad_norm": 9.221564261110139, + "language_loss": 0.80103904, + "learning_rate": 3.998878276622692e-06, + "loss": 0.89662588, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.6671946048736572 + }, + { + "auxiliary_loss_clip": 0.07994901, + "auxiliary_loss_mlp": 0.01548628, + "balance_loss_clip": 0.06957932, + "balance_loss_mlp": 0.01400332, + "epoch": 0.040402825792875394, + "flos": 17207589052800.0, + "grad_norm": 12.445045366932057, + "language_loss": 0.98976898, + "learning_rate": 3.998865196648242e-06, + "loss": 1.08520412, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 10.375, + "router_z_loss_mlp": 1.484375, + "step": 672, + "time_per_iteration": 2.6043524742126465 + }, + { + "auxiliary_loss_clip": 0.08007569, + "auxiliary_loss_mlp": 0.01577526, + "balance_loss_clip": 0.06955793, + "balance_loss_mlp": 0.01428181, + "epoch": 0.040462949045543366, + "flos": 19178242502400.0, + "grad_norm": 16.68355787547426, + "language_loss": 0.95323932, + "learning_rate": 3.998852040876622e-06, + "loss": 1.04909039, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 10.53125, + "router_z_loss_mlp": 1.49316406, + "step": 673, + "time_per_iteration": 2.67228102684021 + }, + { + "auxiliary_loss_clip": 0.07999671, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 0.06955186, + "balance_loss_mlp": 0.01413161, + "epoch": 0.04052307229821133, + "flos": 24025877089920.0, + "grad_norm": 7.385878323717427, + "language_loss": 0.80140877, + "learning_rate": 3.998838809308334e-06, + "loss": 0.89697754, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 10.4375, + "router_z_loss_mlp": 1.43994141, + "step": 674, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.08032155, + "auxiliary_loss_mlp": 0.01590571, + "balance_loss_clip": 0.06966965, + "balance_loss_mlp": 0.01439795, + "epoch": 0.0405831955508793, + "flos": 16442362840320.0, + "grad_norm": 8.615330731484576, + "language_loss": 0.83709693, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.93332422, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 10.6484375, + "router_z_loss_mlp": 1.50683594, + "step": 675, + "time_per_iteration": 2.68145751953125 + }, + { + "auxiliary_loss_clip": 0.07989661, + "auxiliary_loss_mlp": 0.01530552, + "balance_loss_clip": 0.06954966, + "balance_loss_mlp": 0.01384926, + "epoch": 0.040643318803547275, + "flos": 24286808304000.0, + "grad_norm": 7.342047246701879, + "language_loss": 0.80985713, + "learning_rate": 3.998812118783757e-06, + "loss": 0.90505934, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 10.3359375, + "router_z_loss_mlp": 1.45605469, + "step": 676, + "time_per_iteration": 2.6827666759490967 + }, + { + "auxiliary_loss_clip": 0.0800771, + "auxiliary_loss_mlp": 0.01548704, + "balance_loss_clip": 0.06941711, + "balance_loss_mlp": 0.01395925, + "epoch": 0.04070344205621524, + "flos": 17717795763840.0, + "grad_norm": 11.552804849972091, + "language_loss": 0.9000327, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.99559683, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 10.6640625, + "router_z_loss_mlp": 1.52734375, + "step": 677, + "time_per_iteration": 2.647284984588623 + }, + { + "auxiliary_loss_clip": 0.0795664, + "auxiliary_loss_mlp": 0.01525712, + "balance_loss_clip": 0.06946824, + "balance_loss_mlp": 0.01385522, + "epoch": 0.04076356530888321, + "flos": 26184940444800.0, + "grad_norm": 15.722345117009269, + "language_loss": 0.81235254, + "learning_rate": 3.998785125078559e-06, + "loss": 0.90717608, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 10.09375, + "router_z_loss_mlp": 1.40039062, + "step": 678, + "time_per_iteration": 2.713604688644409 + }, + { + "auxiliary_loss_clip": 0.07982595, + "auxiliary_loss_mlp": 0.01542507, + "balance_loss_clip": 0.06946435, + "balance_loss_mlp": 0.01393447, + "epoch": 0.04082368856155118, + "flos": 35782349650560.0, + "grad_norm": 7.406308464158208, + "language_loss": 0.87816763, + "learning_rate": 3.998771514534505e-06, + "loss": 0.97341865, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 10.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 679, + "time_per_iteration": 2.7753264904022217 + }, + { + "auxiliary_loss_clip": 0.07950564, + "auxiliary_loss_mlp": 0.01522729, + "balance_loss_clip": 0.06942166, + "balance_loss_mlp": 0.01383969, + "epoch": 0.04088381181421915, + "flos": 28154042593920.0, + "grad_norm": 7.465466597866811, + "language_loss": 0.8230598, + "learning_rate": 3.998757828196835e-06, + "loss": 0.91779268, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.38671875, + "step": 680, + "time_per_iteration": 2.729719400405884 + }, + { + "auxiliary_loss_clip": 0.07993592, + "auxiliary_loss_mlp": 0.01532905, + "balance_loss_clip": 0.06938143, + "balance_loss_mlp": 0.01378696, + "epoch": 0.04094393506688712, + "flos": 27604703226240.0, + "grad_norm": 9.665492233492547, + "language_loss": 0.8765927, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.97185767, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 10.5703125, + "router_z_loss_mlp": 1.54199219, + "step": 681, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.07989424, + "auxiliary_loss_mlp": 0.01553673, + "balance_loss_clip": 0.0693374, + "balance_loss_mlp": 0.01390118, + "epoch": 0.04100405831955509, + "flos": 23118668933760.0, + "grad_norm": 7.019008438585821, + "language_loss": 0.77474326, + "learning_rate": 3.998730228142726e-06, + "loss": 0.87017429, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 10.5546875, + "router_z_loss_mlp": 1.63476562, + "step": 682, + "time_per_iteration": 2.6727144718170166 + }, + { + "auxiliary_loss_clip": 0.07959605, + "auxiliary_loss_mlp": 0.01503527, + "balance_loss_clip": 0.06938009, + "balance_loss_mlp": 0.01370394, + "epoch": 0.04106418157222306, + "flos": 20162877431040.0, + "grad_norm": 10.358969831785554, + "language_loss": 0.77842575, + "learning_rate": 3.998716314427333e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 10.2109375, + "router_z_loss_mlp": 1.33007812, + "step": 683, + "time_per_iteration": 2.6043591499328613 + }, + { + "auxiliary_loss_clip": 0.07972776, + "auxiliary_loss_mlp": 0.01527418, + "balance_loss_clip": 0.06933653, + "balance_loss_mlp": 0.01377405, + "epoch": 0.041124304824891024, + "flos": 17426452717440.0, + "grad_norm": 41.27076771704703, + "language_loss": 0.86504227, + "learning_rate": 3.998702324920417e-06, + "loss": 0.96004421, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 10.3984375, + "router_z_loss_mlp": 1.5, + "step": 684, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.07935933, + "auxiliary_loss_mlp": 0.01488839, + "balance_loss_clip": 0.06928104, + "balance_loss_mlp": 0.01343976, + "epoch": 0.041184428077558996, + "flos": 25788022853760.0, + "grad_norm": 3.9155930370094065, + "language_loss": 0.94948566, + "learning_rate": 3.9986882596225085e-06, + "loss": 1.04373336, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 10.0859375, + "router_z_loss_mlp": 1.44824219, + "step": 685, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.07948299, + "auxiliary_loss_mlp": 0.0149691, + "balance_loss_clip": 0.06921411, + "balance_loss_mlp": 0.01346992, + "epoch": 0.04124455133022697, + "flos": 22971152620800.0, + "grad_norm": 3.7671102410224577, + "language_loss": 0.94070864, + "learning_rate": 3.998674118534141e-06, + "loss": 1.03516078, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 10.2734375, + "router_z_loss_mlp": 1.5, + "step": 686, + "time_per_iteration": 2.6663894653320312 + }, + { + "auxiliary_loss_clip": 0.0795872, + "auxiliary_loss_mlp": 0.01501087, + "balance_loss_clip": 0.06920497, + "balance_loss_mlp": 0.01356414, + "epoch": 0.04130467458289493, + "flos": 21295615651200.0, + "grad_norm": 39.86585208650635, + "language_loss": 0.77225804, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8668561, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 10.3828125, + "router_z_loss_mlp": 1.44628906, + "step": 687, + "time_per_iteration": 2.6355550289154053 + }, + { + "auxiliary_loss_clip": 0.07898364, + "auxiliary_loss_mlp": 0.01464255, + "balance_loss_clip": 0.06899062, + "balance_loss_mlp": 0.01340564, + "epoch": 0.041364797835562905, + "flos": 19980337311360.0, + "grad_norm": 4.212344971526593, + "language_loss": 0.91093004, + "learning_rate": 3.998645608988177e-06, + "loss": 1.00455618, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 10.0078125, + "router_z_loss_mlp": 1.23730469, + "step": 688, + "time_per_iteration": 4.057282209396362 + }, + { + "auxiliary_loss_clip": 0.07878294, + "auxiliary_loss_mlp": 0.01448978, + "balance_loss_clip": 0.06897704, + "balance_loss_mlp": 0.01329388, + "epoch": 0.04142492108823087, + "flos": 21912361228800.0, + "grad_norm": 22.971814885863903, + "language_loss": 0.88008463, + "learning_rate": 3.998631240531661e-06, + "loss": 0.97335738, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 9.796875, + "router_z_loss_mlp": 1.19628906, + "step": 689, + "time_per_iteration": 4.07433295249939 + }, + { + "auxiliary_loss_clip": 0.07866906, + "auxiliary_loss_mlp": 0.01444557, + "balance_loss_clip": 0.06897521, + "balance_loss_mlp": 0.01326349, + "epoch": 0.04148504434089884, + "flos": 27647567389440.0, + "grad_norm": 6.767605845927541, + "language_loss": 0.72533339, + "learning_rate": 3.998616796286848e-06, + "loss": 0.81844807, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 9.6953125, + "router_z_loss_mlp": 1.18212891, + "step": 690, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.07835479, + "auxiliary_loss_mlp": 0.01439264, + "balance_loss_clip": 0.06874412, + "balance_loss_mlp": 0.01314809, + "epoch": 0.041545167593566815, + "flos": 20524058455680.0, + "grad_norm": 9.225891193910236, + "language_loss": 0.79284167, + "learning_rate": 3.998602276254286e-06, + "loss": 0.88558906, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 9.6171875, + "router_z_loss_mlp": 1.24316406, + "step": 691, + "time_per_iteration": 2.667081594467163 + }, + { + "auxiliary_loss_clip": 0.07827538, + "auxiliary_loss_mlp": 0.01419803, + "balance_loss_clip": 0.06878158, + "balance_loss_mlp": 0.01303931, + "epoch": 0.04160529084623478, + "flos": 11872738500480.0, + "grad_norm": 5.1056325398424125, + "language_loss": 0.88591456, + "learning_rate": 3.998587680434526e-06, + "loss": 0.97838795, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 1.15820312, + "step": 692, + "time_per_iteration": 4.027364015579224 + }, + { + "auxiliary_loss_clip": 0.07869601, + "auxiliary_loss_mlp": 0.01461887, + "balance_loss_clip": 0.0685929, + "balance_loss_mlp": 0.01322936, + "epoch": 0.04166541409890275, + "flos": 14833309685760.0, + "grad_norm": 14.964488884578895, + "language_loss": 0.94025421, + "learning_rate": 3.99857300882812e-06, + "loss": 1.0335691, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 10.1171875, + "router_z_loss_mlp": 1.38867188, + "step": 693, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.07852003, + "auxiliary_loss_mlp": 0.01436954, + "balance_loss_clip": 0.06875066, + "balance_loss_mlp": 0.01312977, + "epoch": 0.04172553735157072, + "flos": 25814577398400.0, + "grad_norm": 10.760604695701561, + "language_loss": 0.88156736, + "learning_rate": 3.998558261435626e-06, + "loss": 0.97445703, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 9.765625, + "router_z_loss_mlp": 1.24023438, + "step": 694, + "time_per_iteration": 2.6794655323028564 + }, + { + "auxiliary_loss_clip": 0.07850839, + "auxiliary_loss_mlp": 0.01460734, + "balance_loss_clip": 0.0686307, + "balance_loss_mlp": 0.01329222, + "epoch": 0.04178566060423869, + "flos": 24286682522880.0, + "grad_norm": 6.107694720201945, + "language_loss": 0.89735746, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.99047321, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 9.890625, + "router_z_loss_mlp": 1.31445312, + "step": 695, + "time_per_iteration": 2.7562625408172607 + }, + { + "auxiliary_loss_clip": 0.07797342, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 0.0684258, + "balance_loss_mlp": 0.01321648, + "epoch": 0.04184578385690666, + "flos": 18227667058560.0, + "grad_norm": 4.8539800399764195, + "language_loss": 0.91097277, + "learning_rate": 3.99852853929461e-06, + "loss": 1.00344253, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 9.5625, + "router_z_loss_mlp": 1.28027344, + "step": 696, + "time_per_iteration": 2.6180830001831055 + }, + { + "auxiliary_loss_clip": 0.07759669, + "auxiliary_loss_mlp": 0.01436884, + "balance_loss_clip": 0.06835265, + "balance_loss_mlp": 0.01318438, + "epoch": 0.041905907109574626, + "flos": 22781694539520.0, + "grad_norm": 8.248305080547661, + "language_loss": 0.97183168, + "learning_rate": 3.998513564547216e-06, + "loss": 1.06379724, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 9.234375, + "router_z_loss_mlp": 1.18359375, + "step": 697, + "time_per_iteration": 2.6976754665374756 + }, + { + "auxiliary_loss_clip": 0.0775051, + "auxiliary_loss_mlp": 0.0142093, + "balance_loss_clip": 0.06823087, + "balance_loss_mlp": 0.01301005, + "epoch": 0.0419660303622426, + "flos": 20163128993280.0, + "grad_norm": 6.669627081417543, + "language_loss": 0.90090138, + "learning_rate": 3.998498514015987e-06, + "loss": 0.99261582, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.20068359, + "step": 698, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.07798302, + "auxiliary_loss_mlp": 0.01439823, + "balance_loss_clip": 0.06844427, + "balance_loss_mlp": 0.01318039, + "epoch": 0.042026153614910564, + "flos": 23083142002560.0, + "grad_norm": 12.169844049295248, + "language_loss": 0.96140921, + "learning_rate": 3.998483387701495e-06, + "loss": 1.05379045, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 9.546875, + "router_z_loss_mlp": 1.21728516, + "step": 699, + "time_per_iteration": 2.700636625289917 + }, + { + "auxiliary_loss_clip": 0.0715683, + "auxiliary_loss_mlp": 0.01383088, + "balance_loss_clip": 0.06685513, + "balance_loss_mlp": 0.01307272, + "epoch": 0.042086276867578536, + "flos": 64516296424320.0, + "grad_norm": 2.8955425132907755, + "language_loss": 0.7356112, + "learning_rate": 3.998468185604312e-06, + "loss": 0.82101035, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.75683594, + "step": 700, + "time_per_iteration": 3.2564964294433594 + }, + { + "auxiliary_loss_clip": 0.07741027, + "auxiliary_loss_mlp": 0.01429077, + "balance_loss_clip": 0.0681721, + "balance_loss_mlp": 0.01313587, + "epoch": 0.04214640012024651, + "flos": 15492458229120.0, + "grad_norm": 9.391497638208355, + "language_loss": 0.93962044, + "learning_rate": 3.998452907725016e-06, + "loss": 1.03132153, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.15527344, + "step": 701, + "time_per_iteration": 2.66644024848938 + }, + { + "auxiliary_loss_clip": 0.07737128, + "auxiliary_loss_mlp": 0.01419929, + "balance_loss_clip": 0.06809002, + "balance_loss_mlp": 0.01302341, + "epoch": 0.04220652337291447, + "flos": 23883601656960.0, + "grad_norm": 33.27176662769112, + "language_loss": 0.71847737, + "learning_rate": 3.998437554064184e-06, + "loss": 0.81004792, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 9.2890625, + "router_z_loss_mlp": 1.17529297, + "step": 702, + "time_per_iteration": 2.7162067890167236 + }, + { + "auxiliary_loss_clip": 0.07125677, + "auxiliary_loss_mlp": 0.01365095, + "balance_loss_clip": 0.06657615, + "balance_loss_mlp": 0.01297575, + "epoch": 0.042266646625582445, + "flos": 63815289966720.0, + "grad_norm": 0.8674304256332159, + "language_loss": 0.6110186, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.69592631, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.67578125, + "step": 703, + "time_per_iteration": 3.3240442276000977 + }, + { + "auxiliary_loss_clip": 0.0710092, + "auxiliary_loss_mlp": 0.01355985, + "balance_loss_clip": 0.06631917, + "balance_loss_mlp": 0.01291803, + "epoch": 0.04232676987825041, + "flos": 50038912154880.0, + "grad_norm": 1.041495616235658, + "language_loss": 0.58151424, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.66608322, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.64160156, + "step": 704, + "time_per_iteration": 3.174765110015869 + }, + { + "auxiliary_loss_clip": 0.07745479, + "auxiliary_loss_mlp": 0.01449155, + "balance_loss_clip": 0.06810448, + "balance_loss_mlp": 0.01329278, + "epoch": 0.04238689313091838, + "flos": 21622485628800.0, + "grad_norm": 12.557351496220864, + "language_loss": 0.93966371, + "learning_rate": 3.998391038398319e-06, + "loss": 1.03161013, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 9.3515625, + "router_z_loss_mlp": 1.19775391, + "step": 705, + "time_per_iteration": 2.6435232162475586 + }, + { + "auxiliary_loss_clip": 0.07677379, + "auxiliary_loss_mlp": 0.01427121, + "balance_loss_clip": 0.06791299, + "balance_loss_mlp": 0.01325698, + "epoch": 0.042447016383586354, + "flos": 19141080416640.0, + "grad_norm": 3.7381942579388303, + "language_loss": 0.75889277, + "learning_rate": 3.998375381617201e-06, + "loss": 0.8499378, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 8.8515625, + "router_z_loss_mlp": 1.01269531, + "step": 706, + "time_per_iteration": 2.671828508377075 + }, + { + "auxiliary_loss_clip": 0.07719514, + "auxiliary_loss_mlp": 0.01450054, + "balance_loss_clip": 0.06807585, + "balance_loss_mlp": 0.01336471, + "epoch": 0.04250713963625432, + "flos": 24432941024640.0, + "grad_norm": 29.794541170575812, + "language_loss": 0.97812521, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.06982088, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 9.1171875, + "router_z_loss_mlp": 1.13427734, + "step": 707, + "time_per_iteration": 2.6550920009613037 + }, + { + "auxiliary_loss_clip": 0.07717137, + "auxiliary_loss_mlp": 0.01443639, + "balance_loss_clip": 0.06809401, + "balance_loss_mlp": 0.01333776, + "epoch": 0.04256726288892229, + "flos": 30374348883840.0, + "grad_norm": 14.849267761051758, + "language_loss": 0.85616708, + "learning_rate": 3.998343840719776e-06, + "loss": 0.94777477, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 9.09375, + "router_z_loss_mlp": 1.09863281, + "step": 708, + "time_per_iteration": 2.7447280883789062 + }, + { + "auxiliary_loss_clip": 0.07730591, + "auxiliary_loss_mlp": 0.01453146, + "balance_loss_clip": 0.06808455, + "balance_loss_mlp": 0.01341232, + "epoch": 0.04262738614159026, + "flos": 16368248304000.0, + "grad_norm": 3.836638557890093, + "language_loss": 0.88926339, + "learning_rate": 3.998327956604666e-06, + "loss": 0.98110074, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 9.21875, + "router_z_loss_mlp": 1.11914062, + "step": 709, + "time_per_iteration": 2.632735252380371 + }, + { + "auxiliary_loss_clip": 0.07711782, + "auxiliary_loss_mlp": 0.01472, + "balance_loss_clip": 0.06786519, + "balance_loss_mlp": 0.01342396, + "epoch": 0.04268750939425823, + "flos": 20418609692160.0, + "grad_norm": 7.682824070104421, + "language_loss": 0.92841685, + "learning_rate": 3.99831199671276e-06, + "loss": 1.02025461, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 9.2421875, + "router_z_loss_mlp": 1.296875, + "step": 710, + "time_per_iteration": 2.6799728870391846 + }, + { + "auxiliary_loss_clip": 0.07731062, + "auxiliary_loss_mlp": 0.01465957, + "balance_loss_clip": 0.06815341, + "balance_loss_mlp": 0.01351993, + "epoch": 0.0427476326469262, + "flos": 20309177859840.0, + "grad_norm": 5.073822997040578, + "language_loss": 0.89081585, + "learning_rate": 3.998295961044662e-06, + "loss": 0.98278606, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 9.1484375, + "router_z_loss_mlp": 1.13867188, + "step": 711, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.07695919, + "auxiliary_loss_mlp": 0.01446717, + "balance_loss_clip": 0.06801347, + "balance_loss_mlp": 0.01336377, + "epoch": 0.042807755899594166, + "flos": 21656880529920.0, + "grad_norm": 4.571300727713509, + "language_loss": 0.91390419, + "learning_rate": 3.9982798496009804e-06, + "loss": 1.00533056, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 8.9453125, + "router_z_loss_mlp": 1.10302734, + "step": 712, + "time_per_iteration": 2.6158323287963867 + }, + { + "auxiliary_loss_clip": 0.07722442, + "auxiliary_loss_mlp": 0.01473663, + "balance_loss_clip": 0.06794881, + "balance_loss_mlp": 0.01356647, + "epoch": 0.04286787915226214, + "flos": 21441580663680.0, + "grad_norm": 10.343893565695913, + "language_loss": 0.96509683, + "learning_rate": 3.998263662382328e-06, + "loss": 1.05705786, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 9.265625, + "router_z_loss_mlp": 1.17041016, + "step": 713, + "time_per_iteration": 2.668109655380249 + }, + { + "auxiliary_loss_clip": 0.07025006, + "auxiliary_loss_mlp": 0.01310492, + "balance_loss_clip": 0.06573053, + "balance_loss_mlp": 0.01250029, + "epoch": 0.04292800240493011, + "flos": 66420256423680.0, + "grad_norm": 1.0671347208063184, + "language_loss": 0.65522671, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.73858166, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60351562, + "step": 714, + "time_per_iteration": 3.317920207977295 + }, + { + "auxiliary_loss_clip": 0.07647526, + "auxiliary_loss_mlp": 0.01441108, + "balance_loss_clip": 0.0677468, + "balance_loss_mlp": 0.01326476, + "epoch": 0.042988125657598075, + "flos": 31658418777600.0, + "grad_norm": 3.6319248406792983, + "language_loss": 0.79793668, + "learning_rate": 3.998231060622563e-06, + "loss": 0.88882303, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 8.73046875, + "router_z_loss_mlp": 1.14550781, + "step": 715, + "time_per_iteration": 2.717393398284912 + }, + { + "auxiliary_loss_clip": 0.07645463, + "auxiliary_loss_mlp": 0.01445614, + "balance_loss_clip": 0.06767702, + "balance_loss_mlp": 0.01331984, + "epoch": 0.04304824891026605, + "flos": 33255690433920.0, + "grad_norm": 29.540799393093693, + "language_loss": 0.77394652, + "learning_rate": 3.998214646082688e-06, + "loss": 0.86485732, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 8.7890625, + "router_z_loss_mlp": 1.13623047, + "step": 716, + "time_per_iteration": 2.7298099994659424 + }, + { + "auxiliary_loss_clip": 0.07019071, + "auxiliary_loss_mlp": 0.01306888, + "balance_loss_clip": 0.06569381, + "balance_loss_mlp": 0.01252815, + "epoch": 0.04310837216293401, + "flos": 64086996430080.0, + "grad_norm": 0.9619131870502678, + "language_loss": 0.6602453, + "learning_rate": 3.998198155770314e-06, + "loss": 0.74350488, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.54199219, + "step": 717, + "time_per_iteration": 3.2711920738220215 + }, + { + "auxiliary_loss_clip": 0.06998679, + "auxiliary_loss_mlp": 0.01302753, + "balance_loss_clip": 0.06550965, + "balance_loss_mlp": 0.01248918, + "epoch": 0.043168495415601985, + "flos": 61361990599680.0, + "grad_norm": 0.9806748941419274, + "language_loss": 0.58663344, + "learning_rate": 3.998181589686065e-06, + "loss": 0.66964775, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.53955078, + "step": 718, + "time_per_iteration": 3.083362579345703 + }, + { + "auxiliary_loss_clip": 0.07634784, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 0.06757121, + "balance_loss_mlp": 0.01309135, + "epoch": 0.04322861866826996, + "flos": 20710539717120.0, + "grad_norm": 8.670927241625472, + "language_loss": 0.97469372, + "learning_rate": 3.99816494783057e-06, + "loss": 1.06512523, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 8.78125, + "router_z_loss_mlp": 0.99316406, + "step": 719, + "time_per_iteration": 2.620244264602661 + }, + { + "auxiliary_loss_clip": 0.07617359, + "auxiliary_loss_mlp": 0.01437239, + "balance_loss_clip": 0.06746139, + "balance_loss_mlp": 0.01327042, + "epoch": 0.04328874192093792, + "flos": 30381308772480.0, + "grad_norm": 7.103043460272315, + "language_loss": 0.71241379, + "learning_rate": 3.99814823020446e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 1.10253906, + "step": 720, + "time_per_iteration": 2.7137084007263184 + }, + { + "auxiliary_loss_clip": 0.07571768, + "auxiliary_loss_mlp": 0.01420566, + "balance_loss_clip": 0.06721878, + "balance_loss_mlp": 0.01314518, + "epoch": 0.043348865173605894, + "flos": 21951284250240.0, + "grad_norm": 7.242521234745598, + "language_loss": 0.82826072, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.91818404, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.06152344, + "step": 721, + "time_per_iteration": 2.6496849060058594 + }, + { + "auxiliary_loss_clip": 0.07618188, + "auxiliary_loss_mlp": 0.01421571, + "balance_loss_clip": 0.06749155, + "balance_loss_mlp": 0.01323009, + "epoch": 0.04340898842627386, + "flos": 15268982590080.0, + "grad_norm": 11.950148766430376, + "language_loss": 0.94630802, + "learning_rate": 3.998114567642933e-06, + "loss": 1.03670549, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 8.6953125, + "router_z_loss_mlp": 0.98486328, + "step": 722, + "time_per_iteration": 2.665302038192749 + }, + { + "auxiliary_loss_clip": 0.07582939, + "auxiliary_loss_mlp": 0.01410079, + "balance_loss_clip": 0.06720737, + "balance_loss_mlp": 0.01309896, + "epoch": 0.04346911167894183, + "flos": 27973011847680.0, + "grad_norm": 7.626593725821058, + "language_loss": 0.90292984, + "learning_rate": 3.998097622708792e-06, + "loss": 0.99286008, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 8.625, + "router_z_loss_mlp": 1.00195312, + "step": 723, + "time_per_iteration": 2.6893301010131836 + }, + { + "auxiliary_loss_clip": 0.0756183, + "auxiliary_loss_mlp": 0.01404071, + "balance_loss_clip": 0.06712201, + "balance_loss_mlp": 0.01307798, + "epoch": 0.0435292349316098, + "flos": 29249954144640.0, + "grad_norm": 5.654199567369001, + "language_loss": 0.8762064, + "learning_rate": 3.99808060200659e-06, + "loss": 0.96586531, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 8.5, + "router_z_loss_mlp": 0.96337891, + "step": 724, + "time_per_iteration": 2.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.07522231, + "auxiliary_loss_mlp": 0.01408898, + "balance_loss_clip": 0.06700347, + "balance_loss_mlp": 0.01310479, + "epoch": 0.04358935818427777, + "flos": 20564616631680.0, + "grad_norm": 17.469159252810304, + "language_loss": 0.84563124, + "learning_rate": 3.998063505536971e-06, + "loss": 0.93494248, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 8.2109375, + "router_z_loss_mlp": 0.98339844, + "step": 725, + "time_per_iteration": 2.6348090171813965 + }, + { + "auxiliary_loss_clip": 0.07563804, + "auxiliary_loss_mlp": 0.01414464, + "balance_loss_clip": 0.06708695, + "balance_loss_mlp": 0.01317428, + "epoch": 0.04364948143694574, + "flos": 14470116163200.0, + "grad_norm": 13.275228581754149, + "language_loss": 0.94372833, + "learning_rate": 3.998046333300584e-06, + "loss": 1.03351104, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 0.96972656, + "step": 726, + "time_per_iteration": 2.6198081970214844 + }, + { + "auxiliary_loss_clip": 0.06976914, + "auxiliary_loss_mlp": 0.01364793, + "balance_loss_clip": 0.0652867, + "balance_loss_mlp": 0.01297797, + "epoch": 0.043709604689613706, + "flos": 50083216565760.0, + "grad_norm": 0.973992689315138, + "language_loss": 0.56151426, + "learning_rate": 3.998029085298079e-06, + "loss": 0.64493132, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.67041016, + "step": 727, + "time_per_iteration": 3.331416368484497 + }, + { + "auxiliary_loss_clip": 0.07546923, + "auxiliary_loss_mlp": 0.01412171, + "balance_loss_clip": 0.06696635, + "balance_loss_mlp": 0.01320475, + "epoch": 0.04376972794228168, + "flos": 13996861902720.0, + "grad_norm": 5.257747667032763, + "language_loss": 0.87717295, + "learning_rate": 3.998011761530112e-06, + "loss": 0.96676385, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 8.51953125, + "router_z_loss_mlp": 0.91699219, + "step": 728, + "time_per_iteration": 3.989957571029663 + }, + { + "auxiliary_loss_clip": 0.07508835, + "auxiliary_loss_mlp": 0.01424416, + "balance_loss_clip": 0.06694756, + "balance_loss_mlp": 0.0133787, + "epoch": 0.04382985119494965, + "flos": 22015084734720.0, + "grad_norm": 7.636957371182376, + "language_loss": 0.80325305, + "learning_rate": 3.997994361997338e-06, + "loss": 0.89258564, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 0.86572266, + "step": 729, + "time_per_iteration": 4.069265365600586 + }, + { + "auxiliary_loss_clip": 0.07515953, + "auxiliary_loss_mlp": 0.01429781, + "balance_loss_clip": 0.06682766, + "balance_loss_mlp": 0.01337561, + "epoch": 0.043889974447617615, + "flos": 24213322673280.0, + "grad_norm": 4.547809577279536, + "language_loss": 1.00979817, + "learning_rate": 3.997976886700417e-06, + "loss": 1.09925556, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 8.33203125, + "router_z_loss_mlp": 0.92285156, + "step": 730, + "time_per_iteration": 4.043174982070923 + }, + { + "auxiliary_loss_clip": 0.07549515, + "auxiliary_loss_mlp": 0.01462607, + "balance_loss_clip": 0.06684491, + "balance_loss_mlp": 0.0135055, + "epoch": 0.04395009770028559, + "flos": 17280236142720.0, + "grad_norm": 42.34250232752857, + "language_loss": 0.93866402, + "learning_rate": 3.997959335640013e-06, + "loss": 1.02878523, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 8.6640625, + "router_z_loss_mlp": 1.12011719, + "step": 731, + "time_per_iteration": 2.6158339977264404 + }, + { + "auxiliary_loss_clip": 0.07507139, + "auxiliary_loss_mlp": 0.01450773, + "balance_loss_clip": 0.06690555, + "balance_loss_mlp": 0.0135059, + "epoch": 0.04401022095295355, + "flos": 12314784314880.0, + "grad_norm": 29.143956092822908, + "language_loss": 0.9731133, + "learning_rate": 3.997941708816791e-06, + "loss": 1.0626924, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 8.1640625, + "router_z_loss_mlp": 1.00146484, + "step": 732, + "time_per_iteration": 4.100733995437622 + }, + { + "auxiliary_loss_clip": 0.07525843, + "auxiliary_loss_mlp": 0.01458711, + "balance_loss_clip": 0.06679834, + "balance_loss_mlp": 0.01353854, + "epoch": 0.044070344205621524, + "flos": 20965978488960.0, + "grad_norm": 13.482370943505323, + "language_loss": 0.90961432, + "learning_rate": 3.997924006231419e-06, + "loss": 0.9994598, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 8.46875, + "router_z_loss_mlp": 1.04785156, + "step": 733, + "time_per_iteration": 2.6597700119018555 + }, + { + "auxiliary_loss_clip": 0.07518548, + "auxiliary_loss_mlp": 0.01469977, + "balance_loss_clip": 0.06685109, + "balance_loss_mlp": 0.01364262, + "epoch": 0.044130467458289496, + "flos": 13850477619840.0, + "grad_norm": 7.4867822080691235, + "language_loss": 0.95689577, + "learning_rate": 3.9979062278845685e-06, + "loss": 1.04678106, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 8.34375, + "router_z_loss_mlp": 1.05664062, + "step": 734, + "time_per_iteration": 2.5865581035614014 + }, + { + "auxiliary_loss_clip": 0.0748552, + "auxiliary_loss_mlp": 0.01451415, + "balance_loss_clip": 0.06673294, + "balance_loss_mlp": 0.01355809, + "epoch": 0.04419059071095746, + "flos": 28662152952960.0, + "grad_norm": 3.9560769382385237, + "language_loss": 0.82954776, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.91891712, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 8.12890625, + "router_z_loss_mlp": 0.95605469, + "step": 735, + "time_per_iteration": 2.7034595012664795 + }, + { + "auxiliary_loss_clip": 0.07501128, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 0.06663659, + "balance_loss_mlp": 0.01360931, + "epoch": 0.04425071396362543, + "flos": 28190743482240.0, + "grad_norm": 5.551572813958511, + "language_loss": 0.95522362, + "learning_rate": 3.9978704439091305e-06, + "loss": 1.04495478, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 8.375, + "router_z_loss_mlp": 1.11132812, + "step": 736, + "time_per_iteration": 2.6946370601654053 + }, + { + "auxiliary_loss_clip": 0.07478474, + "auxiliary_loss_mlp": 0.01445427, + "balance_loss_clip": 0.06672784, + "balance_loss_mlp": 0.01338806, + "epoch": 0.0443108372162934, + "flos": 23665031481600.0, + "grad_norm": 16.744954570362566, + "language_loss": 0.88981938, + "learning_rate": 3.997852438281901e-06, + "loss": 0.97905844, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.06640625, + "step": 737, + "time_per_iteration": 2.715646266937256 + }, + { + "auxiliary_loss_clip": 0.07480585, + "auxiliary_loss_mlp": 0.01439926, + "balance_loss_clip": 0.0667211, + "balance_loss_mlp": 0.01326964, + "epoch": 0.04437096046896137, + "flos": 33987486067200.0, + "grad_norm": 222.55096495156016, + "language_loss": 0.89570022, + "learning_rate": 3.997834356895906e-06, + "loss": 0.98490536, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 8.0859375, + "router_z_loss_mlp": 1.12988281, + "step": 738, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.06961473, + "auxiliary_loss_mlp": 0.01305245, + "balance_loss_clip": 0.06532852, + "balance_loss_mlp": 0.01250504, + "epoch": 0.04443108372162934, + "flos": 67416268308480.0, + "grad_norm": 0.9420923573397554, + "language_loss": 0.59376323, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.67643034, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.54882812, + "step": 739, + "time_per_iteration": 3.1967270374298096 + }, + { + "auxiliary_loss_clip": 0.07502826, + "auxiliary_loss_mlp": 0.01427717, + "balance_loss_clip": 0.06669345, + "balance_loss_mlp": 0.01320858, + "epoch": 0.04449120697429731, + "flos": 29760454344960.0, + "grad_norm": 6.6049127408313915, + "language_loss": 0.9770751, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0663805, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 1.06933594, + "step": 740, + "time_per_iteration": 2.768758535385132 + }, + { + "auxiliary_loss_clip": 0.07489674, + "auxiliary_loss_mlp": 0.0143368, + "balance_loss_clip": 0.06660549, + "balance_loss_mlp": 0.01330111, + "epoch": 0.04455133022696528, + "flos": 36510958828800.0, + "grad_norm": 21.062626098117025, + "language_loss": 0.76799577, + "learning_rate": 3.997779658192205e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 8.3046875, + "router_z_loss_mlp": 1.03515625, + "step": 741, + "time_per_iteration": 2.755948543548584 + }, + { + "auxiliary_loss_clip": 0.0744606, + "auxiliary_loss_mlp": 0.01441267, + "balance_loss_clip": 0.06655986, + "balance_loss_mlp": 0.01339128, + "epoch": 0.044611453479633245, + "flos": 28811220566400.0, + "grad_norm": 10.341428331493303, + "language_loss": 0.9204191, + "learning_rate": 3.997761273778037e-06, + "loss": 1.00929236, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.02148438, + "step": 742, + "time_per_iteration": 2.6964497566223145 + }, + { + "auxiliary_loss_clip": 0.07461847, + "auxiliary_loss_mlp": 0.01424939, + "balance_loss_clip": 0.06654513, + "balance_loss_mlp": 0.01322085, + "epoch": 0.04467157673230122, + "flos": 20017122053760.0, + "grad_norm": 7.31366885778202, + "language_loss": 0.89204007, + "learning_rate": 3.997742813608561e-06, + "loss": 0.98090798, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.02880859, + "step": 743, + "time_per_iteration": 2.6080615520477295 + }, + { + "auxiliary_loss_clip": 0.07439004, + "auxiliary_loss_mlp": 0.01432385, + "balance_loss_clip": 0.06638713, + "balance_loss_mlp": 0.01329913, + "epoch": 0.04473169998496919, + "flos": 18010899745920.0, + "grad_norm": 13.675273731760388, + "language_loss": 0.85338962, + "learning_rate": 3.997724277684479e-06, + "loss": 0.94210356, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.02490234, + "step": 744, + "time_per_iteration": 2.697763204574585 + }, + { + "auxiliary_loss_clip": 0.07427198, + "auxiliary_loss_mlp": 0.01407828, + "balance_loss_clip": 0.06637768, + "balance_loss_mlp": 0.01313938, + "epoch": 0.044791823237637154, + "flos": 20638060335360.0, + "grad_norm": 8.258556171326942, + "language_loss": 0.89771521, + "learning_rate": 3.99770566600649e-06, + "loss": 0.98606539, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 0.93896484, + "step": 745, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.07450528, + "auxiliary_loss_mlp": 0.01413412, + "balance_loss_clip": 0.06646559, + "balance_loss_mlp": 0.01313371, + "epoch": 0.04485194649030513, + "flos": 31184284049280.0, + "grad_norm": 12.351211228960139, + "language_loss": 0.73676586, + "learning_rate": 3.997686978575302e-06, + "loss": 0.82540524, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.0, + "step": 746, + "time_per_iteration": 2.8217551708221436 + }, + { + "auxiliary_loss_clip": 0.07421336, + "auxiliary_loss_mlp": 0.01411005, + "balance_loss_clip": 0.06631814, + "balance_loss_mlp": 0.01308485, + "epoch": 0.04491206974297309, + "flos": 26150922887040.0, + "grad_norm": 4.52399420645529, + "language_loss": 0.7370531, + "learning_rate": 3.997668215391625e-06, + "loss": 0.82537645, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 7.89453125, + "router_z_loss_mlp": 1.02587891, + "step": 747, + "time_per_iteration": 2.724240303039551 + }, + { + "auxiliary_loss_clip": 0.0741486, + "auxiliary_loss_mlp": 0.01407706, + "balance_loss_clip": 0.06629101, + "balance_loss_mlp": 0.0131005, + "epoch": 0.044972192995641064, + "flos": 20673922682880.0, + "grad_norm": 4.695342378066542, + "language_loss": 0.7142753, + "learning_rate": 3.997649376456168e-06, + "loss": 0.80250096, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 7.859375, + "router_z_loss_mlp": 0.97607422, + "step": 748, + "time_per_iteration": 2.6020255088806152 + }, + { + "auxiliary_loss_clip": 0.0743566, + "auxiliary_loss_mlp": 0.01385894, + "balance_loss_clip": 0.06626688, + "balance_loss_mlp": 0.01281753, + "epoch": 0.045032316248309036, + "flos": 16112306407680.0, + "grad_norm": 6.462262226814603, + "language_loss": 0.81646264, + "learning_rate": 3.997630461769647e-06, + "loss": 0.90467817, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 8.08984375, + "router_z_loss_mlp": 1.04199219, + "step": 749, + "time_per_iteration": 2.715440273284912 + }, + { + "auxiliary_loss_clip": 0.07424041, + "auxiliary_loss_mlp": 0.01391269, + "balance_loss_clip": 0.06627008, + "balance_loss_mlp": 0.01284601, + "epoch": 0.045092439500977, + "flos": 17864725098240.0, + "grad_norm": 4.760324696153287, + "language_loss": 0.94018352, + "learning_rate": 3.997611471332778e-06, + "loss": 1.02833652, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 7.96484375, + "router_z_loss_mlp": 1.06542969, + "step": 750, + "time_per_iteration": 2.603782892227173 + }, + { + "auxiliary_loss_clip": 0.07430436, + "auxiliary_loss_mlp": 0.01400307, + "balance_loss_clip": 0.06634089, + "balance_loss_mlp": 0.01284579, + "epoch": 0.04515256275364497, + "flos": 24469809621120.0, + "grad_norm": 8.436133500985974, + "language_loss": 0.79776669, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.88607413, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 7.97265625, + "router_z_loss_mlp": 1.15673828, + "step": 751, + "time_per_iteration": 2.6831071376800537 + }, + { + "auxiliary_loss_clip": 0.07439418, + "auxiliary_loss_mlp": 0.01393415, + "balance_loss_clip": 0.06633066, + "balance_loss_mlp": 0.01282932, + "epoch": 0.04521268600631294, + "flos": 20921563025280.0, + "grad_norm": 6.241833654243461, + "language_loss": 0.75070345, + "learning_rate": 3.997573263210883e-06, + "loss": 0.83903182, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 8.05859375, + "router_z_loss_mlp": 1.10351562, + "step": 752, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.07437599, + "auxiliary_loss_mlp": 0.01387858, + "balance_loss_clip": 0.06631324, + "balance_loss_mlp": 0.01275515, + "epoch": 0.04527280925898091, + "flos": 13376552526720.0, + "grad_norm": 9.915844804632899, + "language_loss": 0.97712451, + "learning_rate": 3.997554045527305e-06, + "loss": 1.06537914, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 8.0703125, + "router_z_loss_mlp": 1.125, + "step": 753, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.07467066, + "auxiliary_loss_mlp": 0.0138957, + "balance_loss_clip": 0.06645191, + "balance_loss_mlp": 0.01278133, + "epoch": 0.04533293251164888, + "flos": 23260650877440.0, + "grad_norm": 4.960920268809469, + "language_loss": 0.95308006, + "learning_rate": 3.997534752096277e-06, + "loss": 1.04164636, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 8.23046875, + "router_z_loss_mlp": 1.11376953, + "step": 754, + "time_per_iteration": 2.6214957237243652 + }, + { + "auxiliary_loss_clip": 0.07402018, + "auxiliary_loss_mlp": 0.01373244, + "balance_loss_clip": 0.06614807, + "balance_loss_mlp": 0.01264812, + "epoch": 0.04539305576431685, + "flos": 12426899477760.0, + "grad_norm": 4.312204742226669, + "language_loss": 0.84473336, + "learning_rate": 3.997515382918531e-06, + "loss": 0.93248594, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 7.87890625, + "router_z_loss_mlp": 1.08544922, + "step": 755, + "time_per_iteration": 2.659515857696533 + }, + { + "auxiliary_loss_clip": 0.07425568, + "auxiliary_loss_mlp": 0.01385083, + "balance_loss_clip": 0.06618007, + "balance_loss_mlp": 0.01261582, + "epoch": 0.04545317901698482, + "flos": 16076569841280.0, + "grad_norm": 4.663949688306233, + "language_loss": 0.85189492, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.94000149, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.23632812, + "step": 756, + "time_per_iteration": 2.5948095321655273 + }, + { + "auxiliary_loss_clip": 0.0692629, + "auxiliary_loss_mlp": 0.01345145, + "balance_loss_clip": 0.06492035, + "balance_loss_mlp": 0.01295292, + "epoch": 0.045513302269652785, + "flos": 66418118144640.0, + "grad_norm": 0.7901603277703675, + "language_loss": 0.62960637, + "learning_rate": 3.997476417325827e-06, + "loss": 0.71232069, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49829102, + "step": 757, + "time_per_iteration": 3.255581855773926 + }, + { + "auxiliary_loss_clip": 0.07416959, + "auxiliary_loss_mlp": 0.01380818, + "balance_loss_clip": 0.06624802, + "balance_loss_mlp": 0.01258747, + "epoch": 0.04557342552232076, + "flos": 21477694573440.0, + "grad_norm": 3.09506424046452, + "language_loss": 0.87773216, + "learning_rate": 3.997456820912346e-06, + "loss": 0.96570992, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 7.921875, + "router_z_loss_mlp": 1.22070312, + "step": 758, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.0740035, + "auxiliary_loss_mlp": 0.01375063, + "balance_loss_clip": 0.06621221, + "balance_loss_mlp": 0.01257952, + "epoch": 0.04563354877498873, + "flos": 23739481434240.0, + "grad_norm": 2.638413914831674, + "language_loss": 0.92492557, + "learning_rate": 3.997437148755101e-06, + "loss": 1.0126797, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.17089844, + "step": 759, + "time_per_iteration": 2.668470859527588 + }, + { + "auxiliary_loss_clip": 0.07430892, + "auxiliary_loss_mlp": 0.01383461, + "balance_loss_clip": 0.06623936, + "balance_loss_mlp": 0.01266541, + "epoch": 0.045693672027656694, + "flos": 25742265724800.0, + "grad_norm": 3.8629420904701237, + "language_loss": 0.79697698, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.88512051, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 8.07421875, + "router_z_loss_mlp": 1.16992188, + "step": 760, + "time_per_iteration": 2.716425895690918 + }, + { + "auxiliary_loss_clip": 0.07406907, + "auxiliary_loss_mlp": 0.01369419, + "balance_loss_clip": 0.06620169, + "balance_loss_mlp": 0.01267519, + "epoch": 0.045753795280324666, + "flos": 19725108174720.0, + "grad_norm": 2.8686759977967458, + "language_loss": 0.87246794, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.96023118, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 7.87109375, + "router_z_loss_mlp": 1.01855469, + "step": 761, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.07379207, + "auxiliary_loss_mlp": 0.01371916, + "balance_loss_clip": 0.06607988, + "balance_loss_mlp": 0.01259764, + "epoch": 0.04581391853299264, + "flos": 23262076396800.0, + "grad_norm": 2.7268346941502273, + "language_loss": 0.83904314, + "learning_rate": 3.997377677828266e-06, + "loss": 0.92655438, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 7.71875, + "router_z_loss_mlp": 1.12304688, + "step": 762, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.06917945, + "auxiliary_loss_mlp": 0.01342542, + "balance_loss_clip": 0.06491472, + "balance_loss_mlp": 0.01301057, + "epoch": 0.0458740417856606, + "flos": 64250711308800.0, + "grad_norm": 0.9293980504879501, + "language_loss": 0.59131134, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.67391622, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.41503906, + "step": 763, + "time_per_iteration": 3.262456178665161 + }, + { + "auxiliary_loss_clip": 0.07421511, + "auxiliary_loss_mlp": 0.01399391, + "balance_loss_clip": 0.0662367, + "balance_loss_mlp": 0.01283425, + "epoch": 0.045934165038328575, + "flos": 20775220669440.0, + "grad_norm": 3.4758610459340535, + "language_loss": 0.92935646, + "learning_rate": 3.9973376518386475e-06, + "loss": 1.01756549, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 7.98046875, + "router_z_loss_mlp": 1.15869141, + "step": 764, + "time_per_iteration": 2.66152024269104 + }, + { + "auxiliary_loss_clip": 0.07451791, + "auxiliary_loss_mlp": 0.01391333, + "balance_loss_clip": 0.06637829, + "balance_loss_mlp": 0.01274556, + "epoch": 0.04599428829099654, + "flos": 30270661056000.0, + "grad_norm": 3.768496915542153, + "language_loss": 0.90699267, + "learning_rate": 3.997317525234592e-06, + "loss": 0.99542397, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 8.14453125, + "router_z_loss_mlp": 1.16845703, + "step": 765, + "time_per_iteration": 2.6835410594940186 + }, + { + "auxiliary_loss_clip": 0.07426902, + "auxiliary_loss_mlp": 0.01398616, + "balance_loss_clip": 0.0662117, + "balance_loss_mlp": 0.01278883, + "epoch": 0.04605441154366451, + "flos": 23045518719360.0, + "grad_norm": 7.076643019058991, + "language_loss": 0.94406933, + "learning_rate": 3.997297322892056e-06, + "loss": 1.03232455, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 8.0625, + "router_z_loss_mlp": 1.19580078, + "step": 766, + "time_per_iteration": 2.6382553577423096 + }, + { + "auxiliary_loss_clip": 0.07415807, + "auxiliary_loss_mlp": 0.01393781, + "balance_loss_clip": 0.06614047, + "balance_loss_mlp": 0.01284967, + "epoch": 0.046114534796332485, + "flos": 22023847486080.0, + "grad_norm": 4.776611740874826, + "language_loss": 0.89285934, + "learning_rate": 3.997277044811806e-06, + "loss": 0.98095518, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 8.01953125, + "router_z_loss_mlp": 1.08789062, + "step": 767, + "time_per_iteration": 4.195739984512329 + }, + { + "auxiliary_loss_clip": 0.07392205, + "auxiliary_loss_mlp": 0.01374375, + "balance_loss_clip": 0.0661349, + "balance_loss_mlp": 0.01267278, + "epoch": 0.04617465804900045, + "flos": 29870221593600.0, + "grad_norm": 7.642963435689524, + "language_loss": 0.92056656, + "learning_rate": 3.99725669099461e-06, + "loss": 1.00823236, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 7.7890625, + "router_z_loss_mlp": 1.0703125, + "step": 768, + "time_per_iteration": 4.208758354187012 + }, + { + "auxiliary_loss_clip": 0.07427865, + "auxiliary_loss_mlp": 0.01386956, + "balance_loss_clip": 0.06619686, + "balance_loss_mlp": 0.01278571, + "epoch": 0.04623478130166842, + "flos": 25637194304640.0, + "grad_norm": 3.542997425401238, + "language_loss": 0.79400444, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.88215268, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 8.078125, + "router_z_loss_mlp": 1.08447266, + "step": 769, + "time_per_iteration": 4.17974328994751 + }, + { + "auxiliary_loss_clip": 0.07375413, + "auxiliary_loss_mlp": 0.01385881, + "balance_loss_clip": 0.06606276, + "balance_loss_mlp": 0.01275923, + "epoch": 0.04629490455433639, + "flos": 20455352507520.0, + "grad_norm": 2.7800745603564185, + "language_loss": 0.89842647, + "learning_rate": 3.997215756152471e-06, + "loss": 0.9860394, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 7.69140625, + "router_z_loss_mlp": 1.10009766, + "step": 770, + "time_per_iteration": 2.656651735305786 + }, + { + "auxiliary_loss_clip": 0.07423855, + "auxiliary_loss_mlp": 0.01400348, + "balance_loss_clip": 0.06619771, + "balance_loss_mlp": 0.01292678, + "epoch": 0.04635502780700436, + "flos": 23155411749120.0, + "grad_norm": 4.755062709171144, + "language_loss": 0.92055309, + "learning_rate": 3.99719517512908e-06, + "loss": 1.00879514, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 8.04296875, + "router_z_loss_mlp": 1.07714844, + "step": 771, + "time_per_iteration": 4.008092403411865 + }, + { + "auxiliary_loss_clip": 0.07446887, + "auxiliary_loss_mlp": 0.0141094, + "balance_loss_clip": 0.06623209, + "balance_loss_mlp": 0.01295641, + "epoch": 0.04641515105967233, + "flos": 23298274160640.0, + "grad_norm": 7.281609081858744, + "language_loss": 0.88918245, + "learning_rate": 3.997174518371848e-06, + "loss": 0.97776067, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 8.2265625, + "router_z_loss_mlp": 1.15380859, + "step": 772, + "time_per_iteration": 2.6240971088409424 + }, + { + "auxiliary_loss_clip": 0.07388498, + "auxiliary_loss_mlp": 0.01396403, + "balance_loss_clip": 0.06612748, + "balance_loss_mlp": 0.01294503, + "epoch": 0.046475274312340296, + "flos": 25121579005440.0, + "grad_norm": 3.47084722704317, + "language_loss": 0.78166652, + "learning_rate": 3.997153785881557e-06, + "loss": 0.86951548, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 7.765625, + "router_z_loss_mlp": 1.01904297, + "step": 773, + "time_per_iteration": 2.6761457920074463 + }, + { + "auxiliary_loss_clip": 0.07362784, + "auxiliary_loss_mlp": 0.01412458, + "balance_loss_clip": 0.06602354, + "balance_loss_mlp": 0.0130703, + "epoch": 0.04653539756500827, + "flos": 25271946357120.0, + "grad_norm": 3.68531082302782, + "language_loss": 0.82003927, + "learning_rate": 3.997132977658996e-06, + "loss": 0.90779173, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 7.609375, + "router_z_loss_mlp": 1.05419922, + "step": 774, + "time_per_iteration": 2.6333625316619873 + }, + { + "auxiliary_loss_clip": 0.0737831, + "auxiliary_loss_mlp": 0.01410602, + "balance_loss_clip": 0.06605712, + "balance_loss_mlp": 0.0129783, + "epoch": 0.046595520817676234, + "flos": 35412238166400.0, + "grad_norm": 3.362442863286837, + "language_loss": 0.78172398, + "learning_rate": 3.997112093704952e-06, + "loss": 0.86961305, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 7.73046875, + "router_z_loss_mlp": 1.12792969, + "step": 775, + "time_per_iteration": 2.7341220378875732 + }, + { + "auxiliary_loss_clip": 0.07397586, + "auxiliary_loss_mlp": 0.01408088, + "balance_loss_clip": 0.0662451, + "balance_loss_mlp": 0.01303994, + "epoch": 0.046655644070344206, + "flos": 18118151372160.0, + "grad_norm": 4.938605745427105, + "language_loss": 0.81674814, + "learning_rate": 3.997091134020217e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 7.734375, + "router_z_loss_mlp": 1.04052734, + "step": 776, + "time_per_iteration": 2.631185293197632 + }, + { + "auxiliary_loss_clip": 0.07349464, + "auxiliary_loss_mlp": 0.01382372, + "balance_loss_clip": 0.06605366, + "balance_loss_mlp": 0.01283905, + "epoch": 0.04671576732301218, + "flos": 29212959767040.0, + "grad_norm": 3.9530223985438724, + "language_loss": 0.76411474, + "learning_rate": 3.997070098605585e-06, + "loss": 0.85143304, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.98535156, + "step": 777, + "time_per_iteration": 2.6883299350738525 + }, + { + "auxiliary_loss_clip": 0.07356873, + "auxiliary_loss_mlp": 0.01403802, + "balance_loss_clip": 0.06604887, + "balance_loss_mlp": 0.0129618, + "epoch": 0.04677589057568014, + "flos": 30485541651840.0, + "grad_norm": 5.886017158674543, + "language_loss": 0.8144322, + "learning_rate": 3.997048987461856e-06, + "loss": 0.90203899, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.07568359, + "step": 778, + "time_per_iteration": 2.685317277908325 + }, + { + "auxiliary_loss_clip": 0.07353938, + "auxiliary_loss_mlp": 0.01397494, + "balance_loss_clip": 0.06609853, + "balance_loss_mlp": 0.01301697, + "epoch": 0.046836013828348115, + "flos": 20563820017920.0, + "grad_norm": 3.1633004103469644, + "language_loss": 0.83870596, + "learning_rate": 3.997027800589829e-06, + "loss": 0.92622018, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 7.4375, + "router_z_loss_mlp": 0.95849609, + "step": 779, + "time_per_iteration": 2.737780809402466 + }, + { + "auxiliary_loss_clip": 0.07349363, + "auxiliary_loss_mlp": 0.01400206, + "balance_loss_clip": 0.06610721, + "balance_loss_mlp": 0.01301119, + "epoch": 0.04689613708101608, + "flos": 25454444549760.0, + "grad_norm": 5.859193350473668, + "language_loss": 0.80411738, + "learning_rate": 3.997006537990308e-06, + "loss": 0.89161313, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 7.38671875, + "router_z_loss_mlp": 0.99023438, + "step": 780, + "time_per_iteration": 2.7168006896972656 + }, + { + "auxiliary_loss_clip": 0.07343157, + "auxiliary_loss_mlp": 0.0140195, + "balance_loss_clip": 0.06612131, + "balance_loss_mlp": 0.01309253, + "epoch": 0.04695626033368405, + "flos": 23007811582080.0, + "grad_norm": 3.4762604948204707, + "language_loss": 0.80410504, + "learning_rate": 3.996985199664099e-06, + "loss": 0.89155614, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 7.3125, + "router_z_loss_mlp": 0.92724609, + "step": 781, + "time_per_iteration": 2.6267943382263184 + }, + { + "auxiliary_loss_clip": 0.07401444, + "auxiliary_loss_mlp": 0.01433849, + "balance_loss_clip": 0.06619258, + "balance_loss_mlp": 0.01321363, + "epoch": 0.047016383586352024, + "flos": 29141193144960.0, + "grad_norm": 4.331089591937386, + "language_loss": 0.79331714, + "learning_rate": 3.99696378561201e-06, + "loss": 0.88167012, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 7.83984375, + "router_z_loss_mlp": 1.12451172, + "step": 782, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.07364355, + "auxiliary_loss_mlp": 0.01439388, + "balance_loss_clip": 0.06623092, + "balance_loss_mlp": 0.01338251, + "epoch": 0.04707650683901999, + "flos": 14981706466560.0, + "grad_norm": 6.433414878185146, + "language_loss": 0.85460365, + "learning_rate": 3.996942295834855e-06, + "loss": 0.94264108, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 7.421875, + "router_z_loss_mlp": 1.01269531, + "step": 783, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.07354224, + "auxiliary_loss_mlp": 0.01436959, + "balance_loss_clip": 0.06629962, + "balance_loss_mlp": 0.01332722, + "epoch": 0.04713663009168796, + "flos": 21657257873280.0, + "grad_norm": 5.367904788236997, + "language_loss": 0.87574267, + "learning_rate": 3.996920730333448e-06, + "loss": 0.96365452, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 7.234375, + "router_z_loss_mlp": 1.04150391, + "step": 784, + "time_per_iteration": 2.649948835372925 + }, + { + "auxiliary_loss_clip": 0.07386977, + "auxiliary_loss_mlp": 0.01467498, + "balance_loss_clip": 0.06641141, + "balance_loss_mlp": 0.01344665, + "epoch": 0.04719675334435593, + "flos": 21331939196160.0, + "grad_norm": 33.75407076232228, + "language_loss": 0.85470867, + "learning_rate": 3.996899089108607e-06, + "loss": 0.9432534, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 7.453125, + "router_z_loss_mlp": 1.22753906, + "step": 785, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.07399641, + "auxiliary_loss_mlp": 0.01481075, + "balance_loss_clip": 0.06649202, + "balance_loss_mlp": 0.01357002, + "epoch": 0.0472568765970239, + "flos": 17937204480000.0, + "grad_norm": 4.826067054081543, + "language_loss": 0.94969213, + "learning_rate": 3.996877372161152e-06, + "loss": 1.03849936, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 7.51953125, + "router_z_loss_mlp": 1.24023438, + "step": 786, + "time_per_iteration": 2.6160340309143066 + }, + { + "auxiliary_loss_clip": 0.07465263, + "auxiliary_loss_mlp": 0.01521969, + "balance_loss_clip": 0.06653383, + "balance_loss_mlp": 0.01371384, + "epoch": 0.04731699984969187, + "flos": 18083169492480.0, + "grad_norm": 10.690384669742231, + "language_loss": 0.84019518, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.93006748, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 8.1328125, + "router_z_loss_mlp": 1.50488281, + "step": 787, + "time_per_iteration": 2.5864908695220947 + }, + { + "auxiliary_loss_clip": 0.07389308, + "auxiliary_loss_mlp": 0.01468371, + "balance_loss_clip": 0.06647876, + "balance_loss_mlp": 0.01332663, + "epoch": 0.047377123102359836, + "flos": 23191735294080.0, + "grad_norm": 8.892570877156906, + "language_loss": 0.85964632, + "learning_rate": 3.996833711101698e-06, + "loss": 0.94822311, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 7.41796875, + "router_z_loss_mlp": 1.35839844, + "step": 788, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.07401264, + "auxiliary_loss_mlp": 0.01469979, + "balance_loss_clip": 0.06672339, + "balance_loss_mlp": 0.01334367, + "epoch": 0.04743724635502781, + "flos": 22754469162240.0, + "grad_norm": 17.026258111429804, + "language_loss": 0.89192903, + "learning_rate": 3.996811766991355e-06, + "loss": 0.98064142, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 7.29296875, + "router_z_loss_mlp": 1.35449219, + "step": 789, + "time_per_iteration": 2.6131770610809326 + }, + { + "auxiliary_loss_clip": 0.07421435, + "auxiliary_loss_mlp": 0.01479761, + "balance_loss_clip": 0.06683871, + "balance_loss_mlp": 0.01339475, + "epoch": 0.04749736960769577, + "flos": 17244499576320.0, + "grad_norm": 30.32315054606697, + "language_loss": 0.88307178, + "learning_rate": 3.996789747161709e-06, + "loss": 0.97208381, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 7.37890625, + "router_z_loss_mlp": 1.40136719, + "step": 790, + "time_per_iteration": 2.618745803833008 + }, + { + "auxiliary_loss_clip": 0.07412322, + "auxiliary_loss_mlp": 0.01470303, + "balance_loss_clip": 0.06664298, + "balance_loss_mlp": 0.01331687, + "epoch": 0.047557492860363745, + "flos": 40488798908160.0, + "grad_norm": 154.88106341207603, + "language_loss": 0.94037831, + "learning_rate": 3.996767651613597e-06, + "loss": 1.02920461, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 7.48046875, + "router_z_loss_mlp": 1.38623047, + "step": 791, + "time_per_iteration": 2.7700016498565674 + }, + { + "auxiliary_loss_clip": 0.07422841, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 0.06681914, + "balance_loss_mlp": 0.01322198, + "epoch": 0.04761761611303172, + "flos": 18704023920000.0, + "grad_norm": 23.33805920811653, + "language_loss": 0.9476828, + "learning_rate": 3.996745480347854e-06, + "loss": 1.03653598, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.40332031, + "step": 792, + "time_per_iteration": 2.605254888534546 + }, + { + "auxiliary_loss_clip": 0.07424683, + "auxiliary_loss_mlp": 0.01473205, + "balance_loss_clip": 0.0668014, + "balance_loss_mlp": 0.01333396, + "epoch": 0.04767773936569968, + "flos": 20928103643520.0, + "grad_norm": 9.340139883580587, + "language_loss": 0.78320849, + "learning_rate": 3.996723233365324e-06, + "loss": 0.87218744, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 7.44921875, + "router_z_loss_mlp": 1.39697266, + "step": 793, + "time_per_iteration": 2.589350938796997 + }, + { + "auxiliary_loss_clip": 0.07421647, + "auxiliary_loss_mlp": 0.01474475, + "balance_loss_clip": 0.06679038, + "balance_loss_mlp": 0.01333379, + "epoch": 0.047737862618367655, + "flos": 23739481434240.0, + "grad_norm": 17.45910394468578, + "language_loss": 0.91955769, + "learning_rate": 3.996700910666847e-06, + "loss": 1.00851893, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 7.4296875, + "router_z_loss_mlp": 1.41064453, + "step": 794, + "time_per_iteration": 2.65012264251709 + }, + { + "auxiliary_loss_clip": 0.07410855, + "auxiliary_loss_mlp": 0.01451088, + "balance_loss_clip": 0.06674555, + "balance_loss_mlp": 0.01322247, + "epoch": 0.04779798587103562, + "flos": 23702487056640.0, + "grad_norm": 25.87656480685072, + "language_loss": 0.77586949, + "learning_rate": 3.996678512253272e-06, + "loss": 0.8644889, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 7.3671875, + "router_z_loss_mlp": 1.28808594, + "step": 795, + "time_per_iteration": 2.6948788166046143 + }, + { + "auxiliary_loss_clip": 0.07379565, + "auxiliary_loss_mlp": 0.01431544, + "balance_loss_clip": 0.06667496, + "balance_loss_mlp": 0.01302989, + "epoch": 0.04785810912370359, + "flos": 23190058212480.0, + "grad_norm": 8.675826434601191, + "language_loss": 0.85312498, + "learning_rate": 3.996656038125449e-06, + "loss": 0.94123614, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 7.12109375, + "router_z_loss_mlp": 1.28466797, + "step": 796, + "time_per_iteration": 2.7435877323150635 + }, + { + "auxiliary_loss_clip": 0.07385565, + "auxiliary_loss_mlp": 0.0140352, + "balance_loss_clip": 0.06662786, + "balance_loss_mlp": 0.01285074, + "epoch": 0.047918232376371564, + "flos": 18046426677120.0, + "grad_norm": 54.926272560680225, + "language_loss": 0.8855834, + "learning_rate": 3.996633488284228e-06, + "loss": 0.97347426, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 7.23046875, + "router_z_loss_mlp": 1.18359375, + "step": 797, + "time_per_iteration": 2.6623764038085938 + }, + { + "auxiliary_loss_clip": 0.07094701, + "auxiliary_loss_mlp": 0.01316158, + "balance_loss_clip": 0.0666967, + "balance_loss_mlp": 0.01274649, + "epoch": 0.04797835562903953, + "flos": 62461717511040.0, + "grad_norm": 0.9155106497251145, + "language_loss": 0.64821255, + "learning_rate": 3.996610862730465e-06, + "loss": 0.73232114, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4152832, + "step": 798, + "time_per_iteration": 3.148404121398926 + }, + { + "auxiliary_loss_clip": 0.07427999, + "auxiliary_loss_mlp": 0.01422996, + "balance_loss_clip": 0.06684162, + "balance_loss_mlp": 0.01303215, + "epoch": 0.0480384788817075, + "flos": 21513766556160.0, + "grad_norm": 16.018908533164023, + "language_loss": 0.96157068, + "learning_rate": 3.996588161465018e-06, + "loss": 1.05008054, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 7.4453125, + "router_z_loss_mlp": 1.19775391, + "step": 799, + "time_per_iteration": 2.6639058589935303 + }, + { + "auxiliary_loss_clip": 0.07364519, + "auxiliary_loss_mlp": 0.01407648, + "balance_loss_clip": 0.06657426, + "balance_loss_mlp": 0.01297594, + "epoch": 0.048098602134375466, + "flos": 21733301053440.0, + "grad_norm": 22.047266878511874, + "language_loss": 0.92366803, + "learning_rate": 3.996565384488748e-06, + "loss": 1.01138973, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 7.07421875, + "router_z_loss_mlp": 1.10253906, + "step": 800, + "time_per_iteration": 2.646414041519165 + }, + { + "auxiliary_loss_clip": 0.07370388, + "auxiliary_loss_mlp": 0.01385117, + "balance_loss_clip": 0.06655432, + "balance_loss_mlp": 0.01282549, + "epoch": 0.04815872538704344, + "flos": 22937931676800.0, + "grad_norm": 10.357052219396058, + "language_loss": 0.89344579, + "learning_rate": 3.996542531802518e-06, + "loss": 0.98100084, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 1.02636719, + "step": 801, + "time_per_iteration": 2.6882050037384033 + }, + { + "auxiliary_loss_clip": 0.07345966, + "auxiliary_loss_mlp": 0.01362249, + "balance_loss_clip": 0.06635958, + "balance_loss_mlp": 0.01265022, + "epoch": 0.04821884863971141, + "flos": 43183952686080.0, + "grad_norm": 6.136831614794949, + "language_loss": 0.85035717, + "learning_rate": 3.996519603407196e-06, + "loss": 0.93743926, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 0.97216797, + "step": 802, + "time_per_iteration": 2.79622220993042 + }, + { + "auxiliary_loss_clip": 0.07318079, + "auxiliary_loss_mlp": 0.01347073, + "balance_loss_clip": 0.06636789, + "balance_loss_mlp": 0.01265057, + "epoch": 0.048278971892379376, + "flos": 18625171628160.0, + "grad_norm": 43.20373329941697, + "language_loss": 0.91245079, + "learning_rate": 3.996496599303649e-06, + "loss": 0.99910235, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.81982422, + "step": 803, + "time_per_iteration": 2.624542236328125 + }, + { + "auxiliary_loss_clip": 0.07327777, + "auxiliary_loss_mlp": 0.01365974, + "balance_loss_clip": 0.06626104, + "balance_loss_mlp": 0.01271798, + "epoch": 0.04833909514504735, + "flos": 20236279207680.0, + "grad_norm": 95.48194102470296, + "language_loss": 0.905747, + "learning_rate": 3.996473519492753e-06, + "loss": 0.99268442, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.94238281, + "step": 804, + "time_per_iteration": 2.597118854522705 + }, + { + "auxiliary_loss_clip": 0.07322634, + "auxiliary_loss_mlp": 0.01340955, + "balance_loss_clip": 0.0662351, + "balance_loss_mlp": 0.01259273, + "epoch": 0.04839921839771532, + "flos": 24652182032640.0, + "grad_norm": 4.3863417773594096, + "language_loss": 0.91238397, + "learning_rate": 3.99645036397538e-06, + "loss": 0.99901986, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 6.9921875, + "router_z_loss_mlp": 0.81689453, + "step": 805, + "time_per_iteration": 2.6999049186706543 + }, + { + "auxiliary_loss_clip": 0.07332969, + "auxiliary_loss_mlp": 0.01347421, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01263783, + "epoch": 0.048459341650383285, + "flos": 24834470590080.0, + "grad_norm": 14.417666191465669, + "language_loss": 0.71703786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.80384171, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 7.046875, + "router_z_loss_mlp": 0.8359375, + "step": 806, + "time_per_iteration": 4.025094985961914 + }, + { + "auxiliary_loss_clip": 0.07307116, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 0.06628814, + "balance_loss_mlp": 0.01262844, + "epoch": 0.04851946490305126, + "flos": 22169644790400.0, + "grad_norm": 6.037392612651371, + "language_loss": 0.81120235, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.89770353, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 0.80126953, + "step": 807, + "time_per_iteration": 4.06866717338562 + }, + { + "auxiliary_loss_clip": 0.07289852, + "auxiliary_loss_mlp": 0.01348053, + "balance_loss_clip": 0.06616738, + "balance_loss_mlp": 0.01266228, + "epoch": 0.04857958815571922, + "flos": 19798132608000.0, + "grad_norm": 11.228648532877324, + "language_loss": 0.92036742, + "learning_rate": 3.9963804431932005e-06, + "loss": 1.00674641, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.81738281, + "step": 808, + "time_per_iteration": 3.9916791915893555 + }, + { + "auxiliary_loss_clip": 0.07360442, + "auxiliary_loss_mlp": 0.01352716, + "balance_loss_clip": 0.06635769, + "balance_loss_mlp": 0.01261115, + "epoch": 0.048639711408387194, + "flos": 18703981992960.0, + "grad_norm": 6.742572767322423, + "language_loss": 0.95677304, + "learning_rate": 3.996356984858732e-06, + "loss": 1.04390454, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 7.2421875, + "router_z_loss_mlp": 0.91699219, + "step": 809, + "time_per_iteration": 2.6680333614349365 + }, + { + "auxiliary_loss_clip": 0.07315584, + "auxiliary_loss_mlp": 0.01344649, + "balance_loss_clip": 0.06624336, + "balance_loss_mlp": 0.01256863, + "epoch": 0.048699834661055166, + "flos": 24870458718720.0, + "grad_norm": 4.628704942448529, + "language_loss": 0.90077579, + "learning_rate": 3.996333450822208e-06, + "loss": 0.98737824, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.87841797, + "step": 810, + "time_per_iteration": 2.6677091121673584 + }, + { + "auxiliary_loss_clip": 0.07363133, + "auxiliary_loss_mlp": 0.01339196, + "balance_loss_clip": 0.06638221, + "balance_loss_mlp": 0.0126109, + "epoch": 0.04875995791372313, + "flos": 20710246227840.0, + "grad_norm": 31.095133807277897, + "language_loss": 0.84460914, + "learning_rate": 3.99630984108452e-06, + "loss": 0.9316324, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.78125, + "step": 811, + "time_per_iteration": 4.020594358444214 + }, + { + "auxiliary_loss_clip": 0.07316839, + "auxiliary_loss_mlp": 0.01338146, + "balance_loss_clip": 0.06624701, + "balance_loss_mlp": 0.01256941, + "epoch": 0.048820081166391104, + "flos": 18594256671360.0, + "grad_norm": 4.82975857058881, + "language_loss": 0.78335881, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 6.92578125, + "router_z_loss_mlp": 0.81152344, + "step": 812, + "time_per_iteration": 2.614077091217041 + }, + { + "auxiliary_loss_clip": 0.0728099, + "auxiliary_loss_mlp": 0.01351533, + "balance_loss_clip": 0.06610497, + "balance_loss_mlp": 0.0127009, + "epoch": 0.04888020441905907, + "flos": 22713324007680.0, + "grad_norm": 17.655616040127313, + "language_loss": 0.94109142, + "learning_rate": 3.996262394509233e-06, + "loss": 1.02741659, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 6.703125, + "router_z_loss_mlp": 0.81494141, + "step": 813, + "time_per_iteration": 2.5956995487213135 + }, + { + "auxiliary_loss_clip": 0.07318511, + "auxiliary_loss_mlp": 0.01349544, + "balance_loss_clip": 0.0662335, + "balance_loss_mlp": 0.01262807, + "epoch": 0.04894032767172704, + "flos": 22791044269440.0, + "grad_norm": 7.289252550466507, + "language_loss": 0.78803051, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.87471104, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.8671875, + "step": 814, + "time_per_iteration": 2.625399351119995 + }, + { + "auxiliary_loss_clip": 0.07335538, + "auxiliary_loss_mlp": 0.01355257, + "balance_loss_clip": 0.06626598, + "balance_loss_mlp": 0.01267948, + "epoch": 0.04900045092439501, + "flos": 25522521592320.0, + "grad_norm": 46.975949242566905, + "language_loss": 0.87790531, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.96481323, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 7.1015625, + "router_z_loss_mlp": 0.87451172, + "step": 815, + "time_per_iteration": 2.6799027919769287 + }, + { + "auxiliary_loss_clip": 0.0734727, + "auxiliary_loss_mlp": 0.0137345, + "balance_loss_clip": 0.06619896, + "balance_loss_mlp": 0.01271788, + "epoch": 0.04906057417706298, + "flos": 25965280166400.0, + "grad_norm": 11.89199068240792, + "language_loss": 0.95818853, + "learning_rate": 3.996190656910043e-06, + "loss": 1.04539561, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 7.28125, + "router_z_loss_mlp": 1.01757812, + "step": 816, + "time_per_iteration": 2.668058395385742 + }, + { + "auxiliary_loss_clip": 0.07340101, + "auxiliary_loss_mlp": 0.01360138, + "balance_loss_clip": 0.066241, + "balance_loss_mlp": 0.01271828, + "epoch": 0.04912069742973095, + "flos": 18630580216320.0, + "grad_norm": 8.092720893633917, + "language_loss": 0.84299397, + "learning_rate": 3.996166592984268e-06, + "loss": 0.92999631, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.88330078, + "step": 817, + "time_per_iteration": 2.5901565551757812 + }, + { + "auxiliary_loss_clip": 0.07312281, + "auxiliary_loss_mlp": 0.01371477, + "balance_loss_clip": 0.06618914, + "balance_loss_mlp": 0.01282404, + "epoch": 0.049180820682398915, + "flos": 23707182885120.0, + "grad_norm": 5.174214831161968, + "language_loss": 0.88566625, + "learning_rate": 3.996142453363656e-06, + "loss": 0.97250384, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 0.89013672, + "step": 818, + "time_per_iteration": 2.6751646995544434 + }, + { + "auxiliary_loss_clip": 0.07361554, + "auxiliary_loss_mlp": 0.01384487, + "balance_loss_clip": 0.06625406, + "balance_loss_mlp": 0.01290598, + "epoch": 0.04924094393506689, + "flos": 22427179914240.0, + "grad_norm": 6.808629946314654, + "language_loss": 0.81731856, + "learning_rate": 3.996118238049124e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 7.36328125, + "router_z_loss_mlp": 0.93798828, + "step": 819, + "time_per_iteration": 2.638293504714966 + }, + { + "auxiliary_loss_clip": 0.07319279, + "auxiliary_loss_mlp": 0.01377789, + "balance_loss_clip": 0.06608901, + "balance_loss_mlp": 0.01285903, + "epoch": 0.04930106718773486, + "flos": 15743033464320.0, + "grad_norm": 10.609665501519604, + "language_loss": 0.88234192, + "learning_rate": 3.996093947041586e-06, + "loss": 0.96931261, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.91845703, + "step": 820, + "time_per_iteration": 2.6076858043670654 + }, + { + "auxiliary_loss_clip": 0.07310833, + "auxiliary_loss_mlp": 0.01372579, + "balance_loss_clip": 0.06604609, + "balance_loss_mlp": 0.01282171, + "epoch": 0.049361190440402825, + "flos": 26257922951040.0, + "grad_norm": 5.648893665912937, + "language_loss": 0.94581264, + "learning_rate": 3.996069580341966e-06, + "loss": 1.03264678, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 7.0703125, + "router_z_loss_mlp": 0.90380859, + "step": 821, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.07296955, + "auxiliary_loss_mlp": 0.01366561, + "balance_loss_clip": 0.0660333, + "balance_loss_mlp": 0.01277488, + "epoch": 0.0494213136930708, + "flos": 21258872835840.0, + "grad_norm": 13.842694995476421, + "language_loss": 0.93458569, + "learning_rate": 3.996045137951188e-06, + "loss": 1.02122092, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 6.9453125, + "router_z_loss_mlp": 0.890625, + "step": 822, + "time_per_iteration": 2.6453444957733154 + }, + { + "auxiliary_loss_clip": 0.07319045, + "auxiliary_loss_mlp": 0.01374655, + "balance_loss_clip": 0.06613644, + "balance_loss_mlp": 0.0128048, + "epoch": 0.04948143694573876, + "flos": 27973095701760.0, + "grad_norm": 7.088849816783062, + "language_loss": 0.7121917, + "learning_rate": 3.996020619870178e-06, + "loss": 0.79912865, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.94238281, + "step": 823, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.06953795, + "auxiliary_loss_mlp": 0.01404355, + "balance_loss_clip": 0.06535611, + "balance_loss_mlp": 0.01345371, + "epoch": 0.049541560198406734, + "flos": 66197466345600.0, + "grad_norm": 1.28356919167216, + "language_loss": 0.63197851, + "learning_rate": 3.995996026099866e-06, + "loss": 0.71555996, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.58837891, + "step": 824, + "time_per_iteration": 3.3058674335479736 + }, + { + "auxiliary_loss_clip": 0.07323784, + "auxiliary_loss_mlp": 0.01374745, + "balance_loss_clip": 0.06612824, + "balance_loss_mlp": 0.01280998, + "epoch": 0.049601683451074706, + "flos": 22899218290560.0, + "grad_norm": 5.8210235967171435, + "language_loss": 0.9564544, + "learning_rate": 3.995971356641185e-06, + "loss": 1.04343963, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 7.11328125, + "router_z_loss_mlp": 0.9375, + "step": 825, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.07281419, + "auxiliary_loss_mlp": 0.01365594, + "balance_loss_clip": 0.06597939, + "balance_loss_mlp": 0.0127695, + "epoch": 0.04966180670374267, + "flos": 21439987436160.0, + "grad_norm": 7.03533776815666, + "language_loss": 0.71345061, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.7999208, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 6.83984375, + "router_z_loss_mlp": 0.88671875, + "step": 826, + "time_per_iteration": 2.607252359390259 + }, + { + "auxiliary_loss_clip": 0.07308409, + "auxiliary_loss_mlp": 0.01368352, + "balance_loss_clip": 0.06603594, + "balance_loss_mlp": 0.0127885, + "epoch": 0.04972192995641064, + "flos": 23113218418560.0, + "grad_norm": 6.719033594417253, + "language_loss": 0.82099521, + "learning_rate": 3.995921790662459e-06, + "loss": 0.90776283, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 7.05078125, + "router_z_loss_mlp": 0.89550781, + "step": 827, + "time_per_iteration": 2.6468021869659424 + }, + { + "auxiliary_loss_clip": 0.07312737, + "auxiliary_loss_mlp": 0.01384514, + "balance_loss_clip": 0.06605525, + "balance_loss_mlp": 0.01293009, + "epoch": 0.04978205320907861, + "flos": 40415648693760.0, + "grad_norm": 3.6071356819257336, + "language_loss": 0.83064795, + "learning_rate": 3.995896894144294e-06, + "loss": 0.91762054, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 7.05859375, + "router_z_loss_mlp": 0.91455078, + "step": 828, + "time_per_iteration": 2.7598366737365723 + }, + { + "auxiliary_loss_clip": 0.07248655, + "auxiliary_loss_mlp": 0.01357422, + "balance_loss_clip": 0.06587116, + "balance_loss_mlp": 0.01271687, + "epoch": 0.04984217646174658, + "flos": 25235580885120.0, + "grad_norm": 7.916023460171269, + "language_loss": 0.88066685, + "learning_rate": 3.995871921941519e-06, + "loss": 0.96672761, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.85791016, + "step": 829, + "time_per_iteration": 2.664443016052246 + }, + { + "auxiliary_loss_clip": 0.07290308, + "auxiliary_loss_mlp": 0.01371956, + "balance_loss_clip": 0.06599583, + "balance_loss_mlp": 0.01282025, + "epoch": 0.04990229971441455, + "flos": 15964873948800.0, + "grad_norm": 30.23399077612731, + "language_loss": 0.79482603, + "learning_rate": 3.99584687405508e-06, + "loss": 0.88144869, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.90039062, + "step": 830, + "time_per_iteration": 2.5562844276428223 + }, + { + "auxiliary_loss_clip": 0.07284638, + "auxiliary_loss_mlp": 0.01358745, + "balance_loss_clip": 0.06602956, + "balance_loss_mlp": 0.01273677, + "epoch": 0.04996242296708252, + "flos": 18410919937920.0, + "grad_norm": 6.720833612775693, + "language_loss": 0.82703733, + "learning_rate": 3.995821750485929e-06, + "loss": 0.91347122, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 6.81640625, + "router_z_loss_mlp": 0.85058594, + "step": 831, + "time_per_iteration": 2.6576318740844727 + }, + { + "auxiliary_loss_clip": 0.07282449, + "auxiliary_loss_mlp": 0.01350763, + "balance_loss_clip": 0.06587234, + "balance_loss_mlp": 0.01262882, + "epoch": 0.05002254621975049, + "flos": 17863802703360.0, + "grad_norm": 5.424543563535015, + "language_loss": 0.97343409, + "learning_rate": 3.995796551235016e-06, + "loss": 1.05976629, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.87939453, + "step": 832, + "time_per_iteration": 2.5859360694885254 + }, + { + "auxiliary_loss_clip": 0.07242593, + "auxiliary_loss_mlp": 0.01355446, + "balance_loss_clip": 0.06576244, + "balance_loss_mlp": 0.01268804, + "epoch": 0.050082669472418455, + "flos": 45670682632320.0, + "grad_norm": 14.668918539875873, + "language_loss": 0.86283791, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.94881833, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.86621094, + "step": 833, + "time_per_iteration": 2.8055691719055176 + }, + { + "auxiliary_loss_clip": 0.07249285, + "auxiliary_loss_mlp": 0.01350346, + "balance_loss_clip": 0.06584433, + "balance_loss_mlp": 0.01262561, + "epoch": 0.05014279272508643, + "flos": 37971237859200.0, + "grad_norm": 3.800888643683855, + "language_loss": 0.8636179, + "learning_rate": 3.995745925691733e-06, + "loss": 0.94961417, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 6.64453125, + "router_z_loss_mlp": 0.87695312, + "step": 834, + "time_per_iteration": 2.757873296737671 + }, + { + "auxiliary_loss_clip": 0.07281981, + "auxiliary_loss_mlp": 0.01348084, + "balance_loss_clip": 0.0659239, + "balance_loss_mlp": 0.01265353, + "epoch": 0.0502029159777544, + "flos": 21002511669120.0, + "grad_norm": 6.832202768967494, + "language_loss": 0.96576416, + "learning_rate": 3.995720499401282e-06, + "loss": 1.0520649, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 6.890625, + "router_z_loss_mlp": 0.82666016, + "step": 835, + "time_per_iteration": 2.5905637741088867 + }, + { + "auxiliary_loss_clip": 0.07274499, + "auxiliary_loss_mlp": 0.01349147, + "balance_loss_clip": 0.06586967, + "balance_loss_mlp": 0.01266273, + "epoch": 0.050263039230422364, + "flos": 15893526597120.0, + "grad_norm": 5.723886418395804, + "language_loss": 0.82083344, + "learning_rate": 3.995694997432911e-06, + "loss": 0.90706992, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.82861328, + "step": 836, + "time_per_iteration": 2.6167397499084473 + }, + { + "auxiliary_loss_clip": 0.0721738, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 0.06569374, + "balance_loss_mlp": 0.01261065, + "epoch": 0.050323162483090336, + "flos": 23739565288320.0, + "grad_norm": 23.66781297023958, + "language_loss": 0.88235295, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.96791613, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 6.48046875, + "router_z_loss_mlp": 0.77832031, + "step": 837, + "time_per_iteration": 2.614959955215454 + }, + { + "auxiliary_loss_clip": 0.07221343, + "auxiliary_loss_mlp": 0.01354096, + "balance_loss_clip": 0.06550418, + "balance_loss_mlp": 0.01265261, + "epoch": 0.0503832857357583, + "flos": 20272393117440.0, + "grad_norm": 6.0443181189796995, + "language_loss": 0.76965159, + "learning_rate": 3.995643766466275e-06, + "loss": 0.85540605, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.88769531, + "step": 838, + "time_per_iteration": 2.622648239135742 + }, + { + "auxiliary_loss_clip": 0.0724083, + "auxiliary_loss_mlp": 0.01341893, + "balance_loss_clip": 0.06561115, + "balance_loss_mlp": 0.01259353, + "epoch": 0.05044340898842627, + "flos": 17790736343040.0, + "grad_norm": 4.747797763129113, + "language_loss": 0.86986995, + "learning_rate": 3.995618037469953e-06, + "loss": 0.95569718, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 6.796875, + "router_z_loss_mlp": 0.82519531, + "step": 839, + "time_per_iteration": 2.5999207496643066 + }, + { + "auxiliary_loss_clip": 0.07210248, + "auxiliary_loss_mlp": 0.01342514, + "balance_loss_clip": 0.06558718, + "balance_loss_mlp": 0.01262024, + "epoch": 0.050503532241094246, + "flos": 22973207045760.0, + "grad_norm": 3.66950577076863, + "language_loss": 0.88844591, + "learning_rate": 3.995592232799595e-06, + "loss": 0.97397357, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 6.51953125, + "router_z_loss_mlp": 0.80517578, + "step": 840, + "time_per_iteration": 2.688936948776245 + }, + { + "auxiliary_loss_clip": 0.07223296, + "auxiliary_loss_mlp": 0.01348235, + "balance_loss_clip": 0.06565775, + "balance_loss_mlp": 0.01264264, + "epoch": 0.05056365549376221, + "flos": 22782449226240.0, + "grad_norm": 5.237976654716359, + "language_loss": 0.98182797, + "learning_rate": 3.99556635245618e-06, + "loss": 1.06754327, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 6.57421875, + "router_z_loss_mlp": 0.84033203, + "step": 841, + "time_per_iteration": 2.626171588897705 + }, + { + "auxiliary_loss_clip": 0.07216457, + "auxiliary_loss_mlp": 0.01346197, + "balance_loss_clip": 0.06556017, + "balance_loss_mlp": 0.01263227, + "epoch": 0.05062377874643018, + "flos": 30924401011200.0, + "grad_norm": 3.922284831716734, + "language_loss": 0.81540143, + "learning_rate": 3.995540396440688e-06, + "loss": 0.90102798, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 6.609375, + "router_z_loss_mlp": 0.82958984, + "step": 842, + "time_per_iteration": 2.707146167755127 + }, + { + "auxiliary_loss_clip": 0.07236033, + "auxiliary_loss_mlp": 0.01355891, + "balance_loss_clip": 0.06555693, + "balance_loss_mlp": 0.0126391, + "epoch": 0.05068390199909815, + "flos": 19653425406720.0, + "grad_norm": 6.4717382946502635, + "language_loss": 0.81965601, + "learning_rate": 3.995514364754105e-06, + "loss": 0.90557522, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 6.80078125, + "router_z_loss_mlp": 0.91943359, + "step": 843, + "time_per_iteration": 2.672064781188965 + }, + { + "auxiliary_loss_clip": 0.07235807, + "auxiliary_loss_mlp": 0.01361352, + "balance_loss_clip": 0.06552228, + "balance_loss_mlp": 0.01271992, + "epoch": 0.05074402525176612, + "flos": 37971279786240.0, + "grad_norm": 2.407141650516338, + "language_loss": 0.87016606, + "learning_rate": 3.995488257397417e-06, + "loss": 0.95613766, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 6.83203125, + "router_z_loss_mlp": 0.89404297, + "step": 844, + "time_per_iteration": 2.7541916370391846 + }, + { + "auxiliary_loss_clip": 0.07238596, + "auxiliary_loss_mlp": 0.01357268, + "balance_loss_clip": 0.06561587, + "balance_loss_mlp": 0.01275109, + "epoch": 0.05080414850443409, + "flos": 22061177280000.0, + "grad_norm": 5.7438919546505876, + "language_loss": 0.80192208, + "learning_rate": 3.995462074371614e-06, + "loss": 0.8878808, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 6.76953125, + "router_z_loss_mlp": 0.82226562, + "step": 845, + "time_per_iteration": 2.5944912433624268 + }, + { + "auxiliary_loss_clip": 0.07213366, + "auxiliary_loss_mlp": 0.01353915, + "balance_loss_clip": 0.06554674, + "balance_loss_mlp": 0.01268561, + "epoch": 0.05086427175710206, + "flos": 20231289889920.0, + "grad_norm": 4.0486216034950475, + "language_loss": 0.91612351, + "learning_rate": 3.99543581567769e-06, + "loss": 1.00179636, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.85400391, + "step": 846, + "time_per_iteration": 4.029407739639282 + }, + { + "auxiliary_loss_clip": 0.07198675, + "auxiliary_loss_mlp": 0.01353444, + "balance_loss_clip": 0.06555093, + "balance_loss_mlp": 0.01271094, + "epoch": 0.05092439500977003, + "flos": 15164707783680.0, + "grad_norm": 2.8334464640278307, + "language_loss": 0.91321969, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.99874079, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.82324219, + "step": 847, + "time_per_iteration": 4.004042863845825 + }, + { + "auxiliary_loss_clip": 0.07199422, + "auxiliary_loss_mlp": 0.01355266, + "balance_loss_clip": 0.0654697, + "balance_loss_mlp": 0.01273202, + "epoch": 0.050984518262437994, + "flos": 22061806185600.0, + "grad_norm": 3.421485941815423, + "language_loss": 0.86160553, + "learning_rate": 3.995383071289462e-06, + "loss": 0.94715238, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.82080078, + "step": 848, + "time_per_iteration": 4.033248662948608 + }, + { + "auxiliary_loss_clip": 0.07196971, + "auxiliary_loss_mlp": 0.01345708, + "balance_loss_clip": 0.06533228, + "balance_loss_mlp": 0.01262166, + "epoch": 0.05104464151510597, + "flos": 30232911991680.0, + "grad_norm": 3.7966495356829357, + "language_loss": 0.90386808, + "learning_rate": 3.995356585597158e-06, + "loss": 0.98929483, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 6.640625, + "router_z_loss_mlp": 0.83544922, + "step": 849, + "time_per_iteration": 2.6612625122070312 + }, + { + "auxiliary_loss_clip": 0.07179346, + "auxiliary_loss_mlp": 0.01359214, + "balance_loss_clip": 0.06533284, + "balance_loss_mlp": 0.01279106, + "epoch": 0.05110476476777394, + "flos": 18338817899520.0, + "grad_norm": 8.277424439503498, + "language_loss": 0.88001835, + "learning_rate": 3.995330024240732e-06, + "loss": 0.96540397, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 6.45703125, + "router_z_loss_mlp": 0.80126953, + "step": 850, + "time_per_iteration": 2.591169834136963 + }, + { + "auxiliary_loss_clip": 0.07213688, + "auxiliary_loss_mlp": 0.01358343, + "balance_loss_clip": 0.06542021, + "balance_loss_mlp": 0.01272131, + "epoch": 0.051164888020441904, + "flos": 38007938747520.0, + "grad_norm": 2.8793275004055894, + "language_loss": 0.702048, + "learning_rate": 3.995303387221192e-06, + "loss": 0.78776836, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 6.72265625, + "router_z_loss_mlp": 0.86328125, + "step": 851, + "time_per_iteration": 4.218145132064819 + }, + { + "auxiliary_loss_clip": 0.07192284, + "auxiliary_loss_mlp": 0.0136467, + "balance_loss_clip": 0.06527439, + "balance_loss_mlp": 0.01276741, + "epoch": 0.051225011273109876, + "flos": 23045183303040.0, + "grad_norm": 3.6723766751173894, + "language_loss": 0.87184155, + "learning_rate": 3.995276674539547e-06, + "loss": 0.95741105, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 6.66015625, + "router_z_loss_mlp": 0.87939453, + "step": 852, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.07206973, + "auxiliary_loss_mlp": 0.01354841, + "balance_loss_clip": 0.06534127, + "balance_loss_mlp": 0.01269678, + "epoch": 0.05128513452577785, + "flos": 18265709612160.0, + "grad_norm": 3.821037496712823, + "language_loss": 0.8378402, + "learning_rate": 3.995249886196811e-06, + "loss": 0.92345834, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 6.73046875, + "router_z_loss_mlp": 0.8515625, + "step": 853, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.07211602, + "auxiliary_loss_mlp": 0.01339797, + "balance_loss_clip": 0.06537303, + "balance_loss_mlp": 0.01257733, + "epoch": 0.05134525777844581, + "flos": 27206360115840.0, + "grad_norm": 3.182696022693741, + "language_loss": 0.80133533, + "learning_rate": 3.995223022193999e-06, + "loss": 0.88684934, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 6.7421875, + "router_z_loss_mlp": 0.82080078, + "step": 854, + "time_per_iteration": 2.6477131843566895 + }, + { + "auxiliary_loss_clip": 0.07215541, + "auxiliary_loss_mlp": 0.01344733, + "balance_loss_clip": 0.0654063, + "balance_loss_mlp": 0.01263146, + "epoch": 0.051405381031113785, + "flos": 28369132824960.0, + "grad_norm": 35.99472555736179, + "language_loss": 0.85045469, + "learning_rate": 3.99519608253213e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 6.74609375, + "router_z_loss_mlp": 0.81542969, + "step": 855, + "time_per_iteration": 2.6279296875 + }, + { + "auxiliary_loss_clip": 0.06909335, + "auxiliary_loss_mlp": 0.01436301, + "balance_loss_clip": 0.0650633, + "balance_loss_mlp": 0.01398083, + "epoch": 0.05146550428378175, + "flos": 65638049760000.0, + "grad_norm": 0.9716530477482218, + "language_loss": 0.65818644, + "learning_rate": 3.995169067212227e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.3815918, + "step": 856, + "time_per_iteration": 3.1742889881134033 + }, + { + "auxiliary_loss_clip": 0.0715993, + "auxiliary_loss_mlp": 0.01330963, + "balance_loss_clip": 0.06518224, + "balance_loss_mlp": 0.01252571, + "epoch": 0.05152562753644972, + "flos": 22061470769280.0, + "grad_norm": 29.089515075725927, + "language_loss": 0.80351281, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.88842171, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.78417969, + "step": 857, + "time_per_iteration": 2.6136977672576904 + }, + { + "auxiliary_loss_clip": 0.07196955, + "auxiliary_loss_mlp": 0.01347875, + "balance_loss_clip": 0.06528607, + "balance_loss_mlp": 0.01259422, + "epoch": 0.051585750789117694, + "flos": 18514523911680.0, + "grad_norm": 4.501526487205694, + "language_loss": 0.9266271, + "learning_rate": 3.995114809602412e-06, + "loss": 1.01207542, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 6.6875, + "router_z_loss_mlp": 0.88427734, + "step": 858, + "time_per_iteration": 2.606518268585205 + }, + { + "auxiliary_loss_clip": 0.07190363, + "auxiliary_loss_mlp": 0.0134683, + "balance_loss_clip": 0.06527077, + "balance_loss_mlp": 0.01261381, + "epoch": 0.05164587404178566, + "flos": 23736630395520.0, + "grad_norm": 4.049462391518637, + "language_loss": 0.80811787, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.89348972, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 6.6328125, + "router_z_loss_mlp": 0.85400391, + "step": 859, + "time_per_iteration": 2.624462604522705 + }, + { + "auxiliary_loss_clip": 0.07202329, + "auxiliary_loss_mlp": 0.01352935, + "balance_loss_clip": 0.06525081, + "balance_loss_mlp": 0.01264196, + "epoch": 0.05170599729445363, + "flos": 16258397201280.0, + "grad_norm": 12.806303000100046, + "language_loss": 0.95290452, + "learning_rate": 3.995060249372788e-06, + "loss": 1.03845716, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 6.78125, + "router_z_loss_mlp": 0.88769531, + "step": 860, + "time_per_iteration": 2.6383068561553955 + }, + { + "auxiliary_loss_clip": 0.07167631, + "auxiliary_loss_mlp": 0.01344788, + "balance_loss_clip": 0.06524719, + "balance_loss_mlp": 0.01262868, + "epoch": 0.0517661205471216, + "flos": 23992404583680.0, + "grad_norm": 3.0591302489664116, + "language_loss": 0.86028093, + "learning_rate": 3.99503285577813e-06, + "loss": 0.94540519, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.81884766, + "step": 861, + "time_per_iteration": 2.6825718879699707 + }, + { + "auxiliary_loss_clip": 0.07179172, + "auxiliary_loss_mlp": 0.01338271, + "balance_loss_clip": 0.06521305, + "balance_loss_mlp": 0.01256732, + "epoch": 0.05182624379978957, + "flos": 29285313367680.0, + "grad_norm": 3.256695777108904, + "language_loss": 0.8236177, + "learning_rate": 3.995005386531627e-06, + "loss": 0.90879214, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 6.578125, + "router_z_loss_mlp": 0.81542969, + "step": 862, + "time_per_iteration": 2.723032236099243 + }, + { + "auxiliary_loss_clip": 0.07146881, + "auxiliary_loss_mlp": 0.01338015, + "balance_loss_clip": 0.06502384, + "balance_loss_mlp": 0.01256428, + "epoch": 0.05188636705245754, + "flos": 24177753815040.0, + "grad_norm": 4.080001789672534, + "language_loss": 0.92516744, + "learning_rate": 3.9949778416343195e-06, + "loss": 1.01001632, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 6.44140625, + "router_z_loss_mlp": 0.81591797, + "step": 863, + "time_per_iteration": 2.624147653579712 + }, + { + "auxiliary_loss_clip": 0.07156427, + "auxiliary_loss_mlp": 0.0133763, + "balance_loss_clip": 0.06515339, + "balance_loss_mlp": 0.01253897, + "epoch": 0.051946490305125506, + "flos": 26767961953920.0, + "grad_norm": 5.3541817649382875, + "language_loss": 0.7963919, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.88133246, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 0.83789062, + "step": 864, + "time_per_iteration": 2.6928389072418213 + }, + { + "auxiliary_loss_clip": 0.07167269, + "auxiliary_loss_mlp": 0.01333883, + "balance_loss_clip": 0.0651238, + "balance_loss_mlp": 0.01252963, + "epoch": 0.05200661355779348, + "flos": 21508190259840.0, + "grad_norm": 2.900845784392114, + "language_loss": 0.83983421, + "learning_rate": 3.994922524891474e-06, + "loss": 0.9248457, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 6.546875, + "router_z_loss_mlp": 0.80908203, + "step": 865, + "time_per_iteration": 2.6349294185638428 + }, + { + "auxiliary_loss_clip": 0.07157271, + "auxiliary_loss_mlp": 0.01343197, + "balance_loss_clip": 0.06511506, + "balance_loss_mlp": 0.01259417, + "epoch": 0.05206673681046144, + "flos": 18120457359360.0, + "grad_norm": 4.23578044185309, + "language_loss": 0.89868104, + "learning_rate": 3.994894753048032e-06, + "loss": 0.98368573, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.83789062, + "step": 866, + "time_per_iteration": 2.605546236038208 + }, + { + "auxiliary_loss_clip": 0.07133412, + "auxiliary_loss_mlp": 0.01337077, + "balance_loss_clip": 0.06502427, + "balance_loss_mlp": 0.01258494, + "epoch": 0.052126860063129415, + "flos": 17528966588160.0, + "grad_norm": 5.089693219930068, + "language_loss": 0.91889334, + "learning_rate": 3.9948669055579815e-06, + "loss": 1.00359821, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 6.30859375, + "router_z_loss_mlp": 0.78564453, + "step": 867, + "time_per_iteration": 2.5601866245269775 + }, + { + "auxiliary_loss_clip": 0.07109866, + "auxiliary_loss_mlp": 0.01340108, + "balance_loss_clip": 0.06500173, + "balance_loss_mlp": 0.0126019, + "epoch": 0.05218698331579739, + "flos": 32606227036800.0, + "grad_norm": 2.1025104258361558, + "language_loss": 0.66466248, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.7491622, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 6.09765625, + "router_z_loss_mlp": 0.79882812, + "step": 868, + "time_per_iteration": 2.6942384243011475 + }, + { + "auxiliary_loss_clip": 0.0714476, + "auxiliary_loss_mlp": 0.01358483, + "balance_loss_clip": 0.06494892, + "balance_loss_mlp": 0.01263545, + "epoch": 0.05224710656846535, + "flos": 22133824369920.0, + "grad_norm": 2.980657220865539, + "language_loss": 0.87344658, + "learning_rate": 3.994810983642281e-06, + "loss": 0.95847905, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.94921875, + "step": 869, + "time_per_iteration": 2.5877575874328613 + }, + { + "auxiliary_loss_clip": 0.07143813, + "auxiliary_loss_mlp": 0.01349092, + "balance_loss_clip": 0.06488257, + "balance_loss_mlp": 0.01260353, + "epoch": 0.052307229821133325, + "flos": 11149789472640.0, + "grad_norm": 7.7840171376663285, + "language_loss": 0.91889322, + "learning_rate": 3.994782909218751e-06, + "loss": 1.00382233, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.88720703, + "step": 870, + "time_per_iteration": 2.608442783355713 + }, + { + "auxiliary_loss_clip": 0.07122661, + "auxiliary_loss_mlp": 0.01356358, + "balance_loss_clip": 0.064864, + "balance_loss_mlp": 0.01265759, + "epoch": 0.05236735307380129, + "flos": 19132862716800.0, + "grad_norm": 2.918328667759454, + "language_loss": 0.843858, + "learning_rate": 3.994754759152854e-06, + "loss": 0.92864817, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.90722656, + "step": 871, + "time_per_iteration": 2.5879244804382324 + }, + { + "auxiliary_loss_clip": 0.07078928, + "auxiliary_loss_mlp": 0.01364934, + "balance_loss_clip": 0.06478463, + "balance_loss_mlp": 0.01281488, + "epoch": 0.05242747632646926, + "flos": 20967152446080.0, + "grad_norm": 2.587533245039743, + "language_loss": 0.8462553, + "learning_rate": 3.994726533445656e-06, + "loss": 0.93069392, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 6.0078125, + "router_z_loss_mlp": 0.83496094, + "step": 872, + "time_per_iteration": 2.6208133697509766 + }, + { + "auxiliary_loss_clip": 0.06844061, + "auxiliary_loss_mlp": 0.01482571, + "balance_loss_clip": 0.06436051, + "balance_loss_mlp": 0.0141405, + "epoch": 0.052487599579137234, + "flos": 65038005872640.0, + "grad_norm": 0.8977590463147395, + "language_loss": 0.61953008, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.70279646, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.68603516, + "step": 873, + "time_per_iteration": 3.134603500366211 + }, + { + "auxiliary_loss_clip": 0.07129098, + "auxiliary_loss_mlp": 0.01340569, + "balance_loss_clip": 0.06492221, + "balance_loss_mlp": 0.01259269, + "epoch": 0.0525477228318052, + "flos": 23294584581120.0, + "grad_norm": 2.232892718211453, + "language_loss": 0.92670178, + "learning_rate": 3.994669855111643e-06, + "loss": 1.01139832, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 6.37109375, + "router_z_loss_mlp": 0.81298828, + "step": 874, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.07136606, + "auxiliary_loss_mlp": 0.01342837, + "balance_loss_clip": 0.0649495, + "balance_loss_mlp": 0.01262681, + "epoch": 0.05260784608447317, + "flos": 32237834561280.0, + "grad_norm": 3.6657665933203796, + "language_loss": 0.78140688, + "learning_rate": 3.994641402486977e-06, + "loss": 0.86620128, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 6.41796875, + "router_z_loss_mlp": 0.80175781, + "step": 875, + "time_per_iteration": 2.72760272026062 + }, + { + "auxiliary_loss_clip": 0.07132401, + "auxiliary_loss_mlp": 0.01330422, + "balance_loss_clip": 0.06503764, + "balance_loss_mlp": 0.01255511, + "epoch": 0.052667969337141136, + "flos": 24470270818560.0, + "grad_norm": 2.6184423818700684, + "language_loss": 0.96137547, + "learning_rate": 3.99461287422531e-06, + "loss": 1.04600358, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 0.74902344, + "step": 876, + "time_per_iteration": 2.627152681350708 + }, + { + "auxiliary_loss_clip": 0.06850941, + "auxiliary_loss_mlp": 0.01378053, + "balance_loss_clip": 0.06451087, + "balance_loss_mlp": 0.01329487, + "epoch": 0.05272809258980911, + "flos": 57804673034880.0, + "grad_norm": 0.7984915998280667, + "language_loss": 0.63229537, + "learning_rate": 3.994584270327722e-06, + "loss": 0.7145853, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.48510742, + "step": 877, + "time_per_iteration": 3.2541913986206055 + }, + { + "auxiliary_loss_clip": 0.0712804, + "auxiliary_loss_mlp": 0.01326088, + "balance_loss_clip": 0.06496318, + "balance_loss_mlp": 0.01255087, + "epoch": 0.05278821584247708, + "flos": 17426578498560.0, + "grad_norm": 2.7186428977077624, + "language_loss": 0.89685273, + "learning_rate": 3.994555590795299e-06, + "loss": 0.98139405, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.71044922, + "step": 878, + "time_per_iteration": 2.5782718658447266 + }, + { + "auxiliary_loss_clip": 0.07154611, + "auxiliary_loss_mlp": 0.0135536, + "balance_loss_clip": 0.06498797, + "balance_loss_mlp": 0.01272485, + "epoch": 0.052848339095145046, + "flos": 26143879144320.0, + "grad_norm": 3.677878171007489, + "language_loss": 0.873586, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.9586857, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.82910156, + "step": 879, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.07119917, + "auxiliary_loss_mlp": 0.01353348, + "balance_loss_clip": 0.06497534, + "balance_loss_mlp": 0.01274622, + "epoch": 0.05290846234781302, + "flos": 16477680136320.0, + "grad_norm": 3.320308324601447, + "language_loss": 0.88939857, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.97413123, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78710938, + "step": 880, + "time_per_iteration": 2.578577756881714 + }, + { + "auxiliary_loss_clip": 0.07141528, + "auxiliary_loss_mlp": 0.01362108, + "balance_loss_clip": 0.06505635, + "balance_loss_mlp": 0.0127971, + "epoch": 0.05296858560048098, + "flos": 19871324749440.0, + "grad_norm": 13.59148063097553, + "language_loss": 0.93088204, + "learning_rate": 3.994469098399906e-06, + "loss": 1.01591837, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 6.3671875, + "router_z_loss_mlp": 0.82421875, + "step": 881, + "time_per_iteration": 2.5984764099121094 + }, + { + "auxiliary_loss_clip": 0.07145406, + "auxiliary_loss_mlp": 0.01363259, + "balance_loss_clip": 0.06503064, + "balance_loss_mlp": 0.01280146, + "epoch": 0.053028708853148955, + "flos": 24395359668480.0, + "grad_norm": 2.511110361208876, + "language_loss": 0.91561359, + "learning_rate": 3.994440116339046e-06, + "loss": 1.00070024, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.83203125, + "step": 882, + "time_per_iteration": 2.6321942806243896 + }, + { + "auxiliary_loss_clip": 0.07153618, + "auxiliary_loss_mlp": 0.01379213, + "balance_loss_clip": 0.06501983, + "balance_loss_mlp": 0.0129343, + "epoch": 0.05308883210581693, + "flos": 36402072048000.0, + "grad_norm": 3.8602802151834035, + "language_loss": 0.74549603, + "learning_rate": 3.994411058648816e-06, + "loss": 0.83082438, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.85839844, + "step": 883, + "time_per_iteration": 2.758694648742676 + }, + { + "auxiliary_loss_clip": 0.07123835, + "auxiliary_loss_mlp": 0.01365604, + "balance_loss_clip": 0.06493074, + "balance_loss_mlp": 0.01279965, + "epoch": 0.05314895535848489, + "flos": 22861427299200.0, + "grad_norm": 3.506018870992282, + "language_loss": 0.79542196, + "learning_rate": 3.994381925330319e-06, + "loss": 0.88031638, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 0.85644531, + "step": 884, + "time_per_iteration": 2.638016700744629 + }, + { + "auxiliary_loss_clip": 0.07094033, + "auxiliary_loss_mlp": 0.01359391, + "balance_loss_clip": 0.06489642, + "balance_loss_mlp": 0.01288057, + "epoch": 0.053209078611152864, + "flos": 12865381493760.0, + "grad_norm": 6.565904312623652, + "language_loss": 0.90469623, + "learning_rate": 3.994352716384659e-06, + "loss": 0.98923051, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.71289062, + "step": 885, + "time_per_iteration": 2.5900588035583496 + }, + { + "auxiliary_loss_clip": 0.07139361, + "auxiliary_loss_mlp": 0.01377795, + "balance_loss_clip": 0.06508732, + "balance_loss_mlp": 0.0129225, + "epoch": 0.05326920186382083, + "flos": 12169112791680.0, + "grad_norm": 9.079017579739912, + "language_loss": 0.91530603, + "learning_rate": 3.994323431812945e-06, + "loss": 1.00047755, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 6.3046875, + "router_z_loss_mlp": 0.85595703, + "step": 886, + "time_per_iteration": 4.099337339401245 + }, + { + "auxiliary_loss_clip": 0.07124092, + "auxiliary_loss_mlp": 0.01379295, + "balance_loss_clip": 0.06500152, + "balance_loss_mlp": 0.01295754, + "epoch": 0.0533293251164888, + "flos": 22710011771520.0, + "grad_norm": 3.9905004918105202, + "language_loss": 0.93810099, + "learning_rate": 3.994294071616286e-06, + "loss": 1.02313483, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 6.23828125, + "router_z_loss_mlp": 0.83447266, + "step": 887, + "time_per_iteration": 2.5987393856048584 + }, + { + "auxiliary_loss_clip": 0.0714867, + "auxiliary_loss_mlp": 0.01405803, + "balance_loss_clip": 0.06507815, + "balance_loss_mlp": 0.01314536, + "epoch": 0.053389448369156774, + "flos": 26947860670080.0, + "grad_norm": 3.06900720752712, + "language_loss": 0.79354906, + "learning_rate": 3.994264635795796e-06, + "loss": 0.87909377, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 6.40234375, + "router_z_loss_mlp": 0.91259766, + "step": 888, + "time_per_iteration": 4.025885820388794 + }, + { + "auxiliary_loss_clip": 0.07115386, + "auxiliary_loss_mlp": 0.01373999, + "balance_loss_clip": 0.06494455, + "balance_loss_mlp": 0.01293223, + "epoch": 0.05344957162182474, + "flos": 25563331330560.0, + "grad_norm": 6.088733603359691, + "language_loss": 0.92500973, + "learning_rate": 3.994235124352592e-06, + "loss": 1.00990355, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 6.21484375, + "router_z_loss_mlp": 0.80761719, + "step": 889, + "time_per_iteration": 2.7182345390319824 + }, + { + "auxiliary_loss_clip": 0.07091353, + "auxiliary_loss_mlp": 0.01359755, + "balance_loss_clip": 0.06492079, + "balance_loss_mlp": 0.01289135, + "epoch": 0.05350969487449271, + "flos": 19725779007360.0, + "grad_norm": 3.9732892090836818, + "language_loss": 0.92642856, + "learning_rate": 3.994205537287791e-06, + "loss": 1.0109396, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.70654297, + "step": 890, + "time_per_iteration": 4.055738925933838 + }, + { + "auxiliary_loss_clip": 0.071067, + "auxiliary_loss_mlp": 0.01356348, + "balance_loss_clip": 0.06478938, + "balance_loss_mlp": 0.01276573, + "epoch": 0.053569818127160676, + "flos": 27023694215040.0, + "grad_norm": 3.5767216506214523, + "language_loss": 0.98853362, + "learning_rate": 3.994175874602517e-06, + "loss": 1.07316399, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 6.27734375, + "router_z_loss_mlp": 0.79785156, + "step": 891, + "time_per_iteration": 2.651681661605835 + }, + { + "auxiliary_loss_clip": 0.07084872, + "auxiliary_loss_mlp": 0.01351507, + "balance_loss_clip": 0.06476413, + "balance_loss_mlp": 0.01277788, + "epoch": 0.05362994137982865, + "flos": 13193383501440.0, + "grad_norm": 5.794831179079165, + "language_loss": 0.75768781, + "learning_rate": 3.994146136297893e-06, + "loss": 0.84205151, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.73779297, + "step": 892, + "time_per_iteration": 2.5933892726898193 + }, + { + "auxiliary_loss_clip": 0.07096062, + "auxiliary_loss_mlp": 0.01350672, + "balance_loss_clip": 0.0647971, + "balance_loss_mlp": 0.01278002, + "epoch": 0.05369006463249662, + "flos": 28665590970240.0, + "grad_norm": 4.507397126758742, + "language_loss": 0.85958588, + "learning_rate": 3.994116322375049e-06, + "loss": 0.94405323, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 6.16796875, + "router_z_loss_mlp": 0.7265625, + "step": 893, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.07101032, + "auxiliary_loss_mlp": 0.01336529, + "balance_loss_clip": 0.06474701, + "balance_loss_mlp": 0.01265099, + "epoch": 0.053750187885164585, + "flos": 28920736252800.0, + "grad_norm": 9.639579848612797, + "language_loss": 0.85423577, + "learning_rate": 3.994086432835114e-06, + "loss": 0.93861139, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 6.265625, + "router_z_loss_mlp": 0.71484375, + "step": 894, + "time_per_iteration": 2.649336099624634 + }, + { + "auxiliary_loss_clip": 0.07051332, + "auxiliary_loss_mlp": 0.0132645, + "balance_loss_clip": 0.06452148, + "balance_loss_mlp": 0.01260742, + "epoch": 0.05381031113783256, + "flos": 15164246586240.0, + "grad_norm": 3.2292453008689215, + "language_loss": 0.79914492, + "learning_rate": 3.994056467679221e-06, + "loss": 0.88292277, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.65722656, + "step": 895, + "time_per_iteration": 2.5825929641723633 + }, + { + "auxiliary_loss_clip": 0.07075687, + "auxiliary_loss_mlp": 0.01335812, + "balance_loss_clip": 0.06453281, + "balance_loss_mlp": 0.01257229, + "epoch": 0.05387043439050053, + "flos": 21841684709760.0, + "grad_norm": 4.836504932030544, + "language_loss": 0.91227436, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.99638927, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.78564453, + "step": 896, + "time_per_iteration": 2.657710313796997 + }, + { + "auxiliary_loss_clip": 0.07047559, + "auxiliary_loss_mlp": 0.0133946, + "balance_loss_clip": 0.06444345, + "balance_loss_mlp": 0.01266504, + "epoch": 0.053930557643168495, + "flos": 17315888855040.0, + "grad_norm": 5.716166538264852, + "language_loss": 0.91855001, + "learning_rate": 3.9939963105241115e-06, + "loss": 1.00242019, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.72998047, + "step": 897, + "time_per_iteration": 2.5864884853363037 + }, + { + "auxiliary_loss_clip": 0.06997538, + "auxiliary_loss_mlp": 0.013383, + "balance_loss_clip": 0.06422779, + "balance_loss_mlp": 0.0126625, + "epoch": 0.05399068089583647, + "flos": 17354350679040.0, + "grad_norm": 28.355738836577903, + "language_loss": 0.93759477, + "learning_rate": 3.993966118527175e-06, + "loss": 1.02095306, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.72070312, + "step": 898, + "time_per_iteration": 2.6132631301879883 + }, + { + "auxiliary_loss_clip": 0.07036521, + "auxiliary_loss_mlp": 0.01343105, + "balance_loss_clip": 0.06425488, + "balance_loss_mlp": 0.01264809, + "epoch": 0.05405080414850443, + "flos": 17491594867200.0, + "grad_norm": 4.630068897804509, + "language_loss": 0.97064686, + "learning_rate": 3.993935850918845e-06, + "loss": 1.05444312, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.10546875, + "router_z_loss_mlp": 0.78320312, + "step": 899, + "time_per_iteration": 2.5816986560821533 + }, + { + "auxiliary_loss_clip": 0.07002847, + "auxiliary_loss_mlp": 0.01337851, + "balance_loss_clip": 0.06429946, + "balance_loss_mlp": 0.01263131, + "epoch": 0.054110927401172404, + "flos": 24503365981440.0, + "grad_norm": 5.469084454178289, + "language_loss": 0.79532343, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.87873036, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 5.73046875, + "router_z_loss_mlp": 0.74755859, + "step": 900, + "time_per_iteration": 2.6616973876953125 + }, + { + "auxiliary_loss_clip": 0.07026203, + "auxiliary_loss_mlp": 0.01335204, + "balance_loss_clip": 0.06429055, + "balance_loss_mlp": 0.01261628, + "epoch": 0.054171050653840376, + "flos": 22936715792640.0, + "grad_norm": 9.114074112173778, + "language_loss": 0.79687816, + "learning_rate": 3.993875088872592e-06, + "loss": 0.88049221, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.73681641, + "step": 901, + "time_per_iteration": 2.6217994689941406 + }, + { + "auxiliary_loss_clip": 0.06969521, + "auxiliary_loss_mlp": 0.01353187, + "balance_loss_clip": 0.06413257, + "balance_loss_mlp": 0.01276941, + "epoch": 0.05423117390650834, + "flos": 12938238218880.0, + "grad_norm": 4.5794905652094675, + "language_loss": 0.8858788, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.96910584, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76220703, + "step": 902, + "time_per_iteration": 2.600041151046753 + }, + { + "auxiliary_loss_clip": 0.07010742, + "auxiliary_loss_mlp": 0.01348168, + "balance_loss_clip": 0.0642361, + "balance_loss_mlp": 0.01272208, + "epoch": 0.05429129715917631, + "flos": 19907438659200.0, + "grad_norm": 3.5235627900978987, + "language_loss": 0.90038717, + "learning_rate": 3.993814024394569e-06, + "loss": 0.98397624, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75927734, + "step": 903, + "time_per_iteration": 2.654343843460083 + }, + { + "auxiliary_loss_clip": 0.07027672, + "auxiliary_loss_mlp": 0.01351984, + "balance_loss_clip": 0.06429485, + "balance_loss_mlp": 0.01276739, + "epoch": 0.05435142041184428, + "flos": 16914065800320.0, + "grad_norm": 3.6682943607818808, + "language_loss": 0.79433787, + "learning_rate": 3.993783378746537e-06, + "loss": 0.87813443, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.75292969, + "step": 904, + "time_per_iteration": 2.5959675312042236 + }, + { + "auxiliary_loss_clip": 0.07042356, + "auxiliary_loss_mlp": 0.01361745, + "balance_loss_clip": 0.06427713, + "balance_loss_mlp": 0.01279062, + "epoch": 0.05441154366451225, + "flos": 23954613592320.0, + "grad_norm": 4.579053653377249, + "language_loss": 0.88901699, + "learning_rate": 3.993752657494039e-06, + "loss": 0.97305799, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.82714844, + "step": 905, + "time_per_iteration": 2.6219427585601807 + }, + { + "auxiliary_loss_clip": 0.06998053, + "auxiliary_loss_mlp": 0.01347731, + "balance_loss_clip": 0.06429392, + "balance_loss_mlp": 0.01274727, + "epoch": 0.05447166691718022, + "flos": 19981678976640.0, + "grad_norm": 3.7765145633999624, + "language_loss": 0.78233027, + "learning_rate": 3.993721860638241e-06, + "loss": 0.8657881, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.73046875, + "step": 906, + "time_per_iteration": 2.6213393211364746 + }, + { + "auxiliary_loss_clip": 0.07034522, + "auxiliary_loss_mlp": 0.01354415, + "balance_loss_clip": 0.06439427, + "balance_loss_mlp": 0.01281221, + "epoch": 0.05453179016984819, + "flos": 24943483152000.0, + "grad_norm": 3.1487164244038546, + "language_loss": 0.91526973, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9991591, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.73242188, + "step": 907, + "time_per_iteration": 2.6804075241088867 + }, + { + "auxiliary_loss_clip": 0.07033581, + "auxiliary_loss_mlp": 0.01357567, + "balance_loss_clip": 0.06437694, + "balance_loss_mlp": 0.01279461, + "epoch": 0.05459191342251616, + "flos": 18121170119040.0, + "grad_norm": 6.406912601020187, + "language_loss": 0.90540731, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98931873, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.78076172, + "step": 908, + "time_per_iteration": 2.645015001296997 + }, + { + "auxiliary_loss_clip": 0.07043326, + "auxiliary_loss_mlp": 0.01345219, + "balance_loss_clip": 0.06445918, + "balance_loss_mlp": 0.01274695, + "epoch": 0.054652036675184125, + "flos": 19214314485120.0, + "grad_norm": 7.110019645600745, + "language_loss": 0.94541007, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.02929544, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.70507812, + "step": 909, + "time_per_iteration": 2.6648013591766357 + }, + { + "auxiliary_loss_clip": 0.07070212, + "auxiliary_loss_mlp": 0.01367531, + "balance_loss_clip": 0.06454301, + "balance_loss_mlp": 0.01287184, + "epoch": 0.0547121599278521, + "flos": 16331253926400.0, + "grad_norm": 4.130588011927331, + "language_loss": 0.76068008, + "learning_rate": 3.99359791720544e-06, + "loss": 0.84505749, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 6.15625, + "router_z_loss_mlp": 0.80273438, + "step": 910, + "time_per_iteration": 2.588240146636963 + }, + { + "auxiliary_loss_clip": 0.07039558, + "auxiliary_loss_mlp": 0.0135407, + "balance_loss_clip": 0.06453503, + "balance_loss_mlp": 0.01281829, + "epoch": 0.05477228318052007, + "flos": 20345165988480.0, + "grad_norm": 30.49086914574189, + "language_loss": 0.86822844, + "learning_rate": 3.993566742350714e-06, + "loss": 0.95216471, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.72265625, + "step": 911, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.07064489, + "auxiliary_loss_mlp": 0.01358074, + "balance_loss_clip": 0.06459624, + "balance_loss_mlp": 0.01280207, + "epoch": 0.054832406433188034, + "flos": 21978216138240.0, + "grad_norm": 33.1555590789585, + "language_loss": 0.80294693, + "learning_rate": 3.993535491899736e-06, + "loss": 0.88717258, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77880859, + "step": 912, + "time_per_iteration": 2.590373992919922 + }, + { + "auxiliary_loss_clip": 0.0703726, + "auxiliary_loss_mlp": 0.01353834, + "balance_loss_clip": 0.06456903, + "balance_loss_mlp": 0.01284979, + "epoch": 0.054892529685856006, + "flos": 16404487994880.0, + "grad_norm": 20.678206909589232, + "language_loss": 0.87077272, + "learning_rate": 3.993504165853694e-06, + "loss": 0.9546836, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 5.8046875, + "router_z_loss_mlp": 0.68896484, + "step": 913, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.07058232, + "auxiliary_loss_mlp": 0.01355937, + "balance_loss_clip": 0.06467378, + "balance_loss_mlp": 0.01279214, + "epoch": 0.05495265293852397, + "flos": 23918709317760.0, + "grad_norm": 2.929829982992902, + "language_loss": 0.86646307, + "learning_rate": 3.993472764213772e-06, + "loss": 0.9506048, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76708984, + "step": 914, + "time_per_iteration": 2.653738260269165 + }, + { + "auxiliary_loss_clip": 0.07080867, + "auxiliary_loss_mlp": 0.01347963, + "balance_loss_clip": 0.06487378, + "balance_loss_mlp": 0.01278583, + "epoch": 0.055012776191191944, + "flos": 23593767984000.0, + "grad_norm": 5.681880132712419, + "language_loss": 0.94313538, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.02742374, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.69433594, + "step": 915, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.07055361, + "auxiliary_loss_mlp": 0.01345822, + "balance_loss_clip": 0.06473369, + "balance_loss_mlp": 0.01276442, + "epoch": 0.055072899443859916, + "flos": 17533997832960.0, + "grad_norm": 9.383060565186796, + "language_loss": 0.9327727, + "learning_rate": 3.993409734157064e-06, + "loss": 1.01678455, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.69384766, + "step": 916, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.0710435, + "auxiliary_loss_mlp": 0.01382873, + "balance_loss_clip": 0.06478155, + "balance_loss_mlp": 0.01299808, + "epoch": 0.05513302269652788, + "flos": 21693246001920.0, + "grad_norm": 9.219504726961107, + "language_loss": 0.83272588, + "learning_rate": 3.993378105742666e-06, + "loss": 0.91759813, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 917, + "time_per_iteration": 2.620739221572876 + }, + { + "auxiliary_loss_clip": 0.07102817, + "auxiliary_loss_mlp": 0.01375299, + "balance_loss_clip": 0.06484253, + "balance_loss_mlp": 0.01293473, + "epoch": 0.05519314594919585, + "flos": 21619257246720.0, + "grad_norm": 3.775060612193374, + "language_loss": 0.84478474, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.92956591, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.81787109, + "step": 918, + "time_per_iteration": 2.594416379928589 + }, + { + "auxiliary_loss_clip": 0.07101964, + "auxiliary_loss_mlp": 0.01367305, + "balance_loss_clip": 0.06485492, + "balance_loss_mlp": 0.01289151, + "epoch": 0.05525326920186382, + "flos": 21804983821440.0, + "grad_norm": 30.311763596206674, + "language_loss": 0.92698455, + "learning_rate": 3.99331462214778e-06, + "loss": 1.01167727, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.78125, + "step": 919, + "time_per_iteration": 2.652820587158203 + }, + { + "auxiliary_loss_clip": 0.07067424, + "auxiliary_loss_mlp": 0.01355052, + "balance_loss_clip": 0.06469625, + "balance_loss_mlp": 0.01279807, + "epoch": 0.05531339245453179, + "flos": 28447272357120.0, + "grad_norm": 10.071293586926402, + "language_loss": 0.91352344, + "learning_rate": 3.993282766969699e-06, + "loss": 0.99774826, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.75244141, + "step": 920, + "time_per_iteration": 2.676198720932007 + }, + { + "auxiliary_loss_clip": 0.0705073, + "auxiliary_loss_mlp": 0.01349539, + "balance_loss_clip": 0.06465692, + "balance_loss_mlp": 0.01277489, + "epoch": 0.05537351570719976, + "flos": 37383688229760.0, + "grad_norm": 4.912310342767309, + "language_loss": 0.69610375, + "learning_rate": 3.993250836206136e-06, + "loss": 0.78010643, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.72021484, + "step": 921, + "time_per_iteration": 2.729602098464966 + }, + { + "auxiliary_loss_clip": 0.07080688, + "auxiliary_loss_mlp": 0.01369369, + "balance_loss_clip": 0.06465121, + "balance_loss_mlp": 0.01287687, + "epoch": 0.05543363895986773, + "flos": 20090733465600.0, + "grad_norm": 4.2535446135467785, + "language_loss": 0.76117694, + "learning_rate": 3.993218829858301e-06, + "loss": 0.8456775, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 0.81689453, + "step": 922, + "time_per_iteration": 2.5846810340881348 + }, + { + "auxiliary_loss_clip": 0.07077445, + "auxiliary_loss_mlp": 0.01375095, + "balance_loss_clip": 0.06466563, + "balance_loss_mlp": 0.01293842, + "epoch": 0.0554937622125357, + "flos": 24539773380480.0, + "grad_norm": 5.782149663492731, + "language_loss": 0.86474669, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9492721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.81298828, + "step": 923, + "time_per_iteration": 2.6038758754730225 + }, + { + "auxiliary_loss_clip": 0.07066977, + "auxiliary_loss_mlp": 0.01365852, + "balance_loss_clip": 0.06460079, + "balance_loss_mlp": 0.01286125, + "epoch": 0.055553885465203665, + "flos": 14325408961920.0, + "grad_norm": 4.5524709486596695, + "language_loss": 0.82890737, + "learning_rate": 3.993154590414675e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 6.0703125, + "router_z_loss_mlp": 0.79736328, + "step": 924, + "time_per_iteration": 2.563229560852051 + }, + { + "auxiliary_loss_clip": 0.07049644, + "auxiliary_loss_mlp": 0.01383238, + "balance_loss_clip": 0.06458092, + "balance_loss_mlp": 0.01303654, + "epoch": 0.05561400871787164, + "flos": 27388522892160.0, + "grad_norm": 5.4957057534226115, + "language_loss": 1.05798936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.14231825, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.79492188, + "step": 925, + "time_per_iteration": 4.167480230331421 + }, + { + "auxiliary_loss_clip": 0.07051321, + "auxiliary_loss_mlp": 0.01368022, + "balance_loss_clip": 0.06456822, + "balance_loss_mlp": 0.01291585, + "epoch": 0.05567413197053961, + "flos": 23227681495680.0, + "grad_norm": 4.150968516842117, + "language_loss": 0.85383534, + "learning_rate": 3.993090048648564e-06, + "loss": 0.93802875, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 0.76367188, + "step": 926, + "time_per_iteration": 4.156589031219482 + }, + { + "auxiliary_loss_clip": 0.07111964, + "auxiliary_loss_mlp": 0.01390888, + "balance_loss_clip": 0.06470172, + "balance_loss_mlp": 0.0130129, + "epoch": 0.055734255223207574, + "flos": 25271988284160.0, + "grad_norm": 8.095313947782397, + "language_loss": 0.79582185, + "learning_rate": 3.993057664397634e-06, + "loss": 0.88085037, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 6.42578125, + "router_z_loss_mlp": 0.89550781, + "step": 927, + "time_per_iteration": 2.6851751804351807 + }, + { + "auxiliary_loss_clip": 0.06860578, + "auxiliary_loss_mlp": 0.01306525, + "balance_loss_clip": 0.06486383, + "balance_loss_mlp": 0.01261607, + "epoch": 0.055794378475875546, + "flos": 66524698938240.0, + "grad_norm": 0.7865808163657396, + "language_loss": 0.59965324, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.68132424, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.44921875, + "step": 928, + "time_per_iteration": 4.694532632827759 + }, + { + "auxiliary_loss_clip": 0.0702403, + "auxiliary_loss_mlp": 0.01398439, + "balance_loss_clip": 0.06437568, + "balance_loss_mlp": 0.01313991, + "epoch": 0.05585450172854351, + "flos": 25344635374080.0, + "grad_norm": 5.300738051002958, + "language_loss": 0.99270105, + "learning_rate": 3.992992669166168e-06, + "loss": 1.07692575, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 5.8671875, + "router_z_loss_mlp": 0.84472656, + "step": 929, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.07033007, + "auxiliary_loss_mlp": 0.01402576, + "balance_loss_clip": 0.06441823, + "balance_loss_mlp": 0.01318938, + "epoch": 0.05591462498121148, + "flos": 33920163711360.0, + "grad_norm": 20.10669872289237, + "language_loss": 0.7473861, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83174193, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.83691406, + "step": 930, + "time_per_iteration": 4.218009948730469 + }, + { + "auxiliary_loss_clip": 0.0703931, + "auxiliary_loss_mlp": 0.01397804, + "balance_loss_clip": 0.06446733, + "balance_loss_mlp": 0.01313929, + "epoch": 0.055974748233879455, + "flos": 17936617501440.0, + "grad_norm": 4.521391546474749, + "language_loss": 0.88519967, + "learning_rate": 3.992927371636776e-06, + "loss": 0.96957082, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 5.91796875, + "router_z_loss_mlp": 0.83886719, + "step": 931, + "time_per_iteration": 2.5678892135620117 + }, + { + "auxiliary_loss_clip": 0.07037735, + "auxiliary_loss_mlp": 0.01413156, + "balance_loss_clip": 0.06439222, + "balance_loss_mlp": 0.01325466, + "epoch": 0.05603487148654742, + "flos": 24028392712320.0, + "grad_norm": 3.3508446860260355, + "language_loss": 0.86982858, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.95433742, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.87695312, + "step": 932, + "time_per_iteration": 2.6454596519470215 + }, + { + "auxiliary_loss_clip": 0.07046005, + "auxiliary_loss_mlp": 0.01409303, + "balance_loss_clip": 0.06444195, + "balance_loss_mlp": 0.01322901, + "epoch": 0.05609499473921539, + "flos": 17312912035200.0, + "grad_norm": 4.63721211876497, + "language_loss": 0.79083282, + "learning_rate": 3.992861771819365e-06, + "loss": 0.87538588, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 0.86328125, + "step": 933, + "time_per_iteration": 2.5537846088409424 + }, + { + "auxiliary_loss_clip": 0.07023589, + "auxiliary_loss_mlp": 0.01416541, + "balance_loss_clip": 0.06434061, + "balance_loss_mlp": 0.01334287, + "epoch": 0.05615511799188336, + "flos": 21000834587520.0, + "grad_norm": 6.948998666256607, + "language_loss": 0.90410703, + "learning_rate": 3.99282885855576e-06, + "loss": 0.98850828, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.82275391, + "step": 934, + "time_per_iteration": 2.5762336254119873 + }, + { + "auxiliary_loss_clip": 0.06990926, + "auxiliary_loss_mlp": 0.01429171, + "balance_loss_clip": 0.06438624, + "balance_loss_mlp": 0.01345153, + "epoch": 0.05621524124455133, + "flos": 17279062185600.0, + "grad_norm": 7.5646674228018265, + "language_loss": 0.84164441, + "learning_rate": 3.992795869723885e-06, + "loss": 0.92584538, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.83984375, + "step": 935, + "time_per_iteration": 2.6203958988189697 + }, + { + "auxiliary_loss_clip": 0.06841761, + "auxiliary_loss_mlp": 0.01418196, + "balance_loss_clip": 0.06462182, + "balance_loss_mlp": 0.01359927, + "epoch": 0.0562753644972193, + "flos": 58737597194880.0, + "grad_norm": 0.8140808506826857, + "language_loss": 0.69178045, + "learning_rate": 3.99276280532499e-06, + "loss": 0.77438003, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.58105469, + "step": 936, + "time_per_iteration": 3.1629393100738525 + }, + { + "auxiliary_loss_clip": 0.070338, + "auxiliary_loss_mlp": 0.01416227, + "balance_loss_clip": 0.06443301, + "balance_loss_mlp": 0.0133178, + "epoch": 0.05633548774988727, + "flos": 17462776262400.0, + "grad_norm": 4.591481841632389, + "language_loss": 0.81027842, + "learning_rate": 3.992729665360331e-06, + "loss": 0.89477861, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.84472656, + "step": 937, + "time_per_iteration": 2.650186538696289 + }, + { + "auxiliary_loss_clip": 0.0684337, + "auxiliary_loss_mlp": 0.01393468, + "balance_loss_clip": 0.06467308, + "balance_loss_mlp": 0.01340683, + "epoch": 0.05639561100255524, + "flos": 70675939042560.0, + "grad_norm": 0.8752420339339617, + "language_loss": 0.64563346, + "learning_rate": 3.992696449831162e-06, + "loss": 0.72800183, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.52880859, + "step": 938, + "time_per_iteration": 3.200669050216675 + }, + { + "auxiliary_loss_clip": 0.07073379, + "auxiliary_loss_mlp": 0.01391777, + "balance_loss_clip": 0.06460777, + "balance_loss_mlp": 0.01309332, + "epoch": 0.056455734255223204, + "flos": 20492346885120.0, + "grad_norm": 5.43214954330628, + "language_loss": 0.84251928, + "learning_rate": 3.992663158738745e-06, + "loss": 0.92717087, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 6.125, + "router_z_loss_mlp": 0.82373047, + "step": 939, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.07029171, + "auxiliary_loss_mlp": 0.01403853, + "balance_loss_clip": 0.06452838, + "balance_loss_mlp": 0.01326081, + "epoch": 0.056515857507891176, + "flos": 22059961395840.0, + "grad_norm": 5.005416621507547, + "language_loss": 0.76388282, + "learning_rate": 3.992629792084341e-06, + "loss": 0.84821308, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.77734375, + "step": 940, + "time_per_iteration": 2.6560001373291016 + }, + { + "auxiliary_loss_clip": 0.07005631, + "auxiliary_loss_mlp": 0.01389365, + "balance_loss_clip": 0.06443679, + "balance_loss_mlp": 0.01314073, + "epoch": 0.05657598076055915, + "flos": 24032291927040.0, + "grad_norm": 11.024308816683174, + "language_loss": 0.7415117, + "learning_rate": 3.992596349869216e-06, + "loss": 0.82546163, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.75341797, + "step": 941, + "time_per_iteration": 2.691328525543213 + }, + { + "auxiliary_loss_clip": 0.07028662, + "auxiliary_loss_mlp": 0.01392256, + "balance_loss_clip": 0.06448376, + "balance_loss_mlp": 0.0131496, + "epoch": 0.05663610401322711, + "flos": 20486057829120.0, + "grad_norm": 6.757951792278694, + "language_loss": 0.8311438, + "learning_rate": 3.992562832094637e-06, + "loss": 0.91535294, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.77246094, + "step": 942, + "time_per_iteration": 2.5987863540649414 + }, + { + "auxiliary_loss_clip": 0.07036945, + "auxiliary_loss_mlp": 0.01378378, + "balance_loss_clip": 0.06460088, + "balance_loss_mlp": 0.01303896, + "epoch": 0.056696227265895086, + "flos": 21075368394240.0, + "grad_norm": 21.600438823460475, + "language_loss": 0.92831737, + "learning_rate": 3.9925292387618755e-06, + "loss": 1.01247072, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 5.765625, + "router_z_loss_mlp": 0.74462891, + "step": 943, + "time_per_iteration": 2.62147855758667 + }, + { + "auxiliary_loss_clip": 0.07040788, + "auxiliary_loss_mlp": 0.01386269, + "balance_loss_clip": 0.06462353, + "balance_loss_mlp": 0.01313027, + "epoch": 0.05675635051856306, + "flos": 17827017960960.0, + "grad_norm": 6.279897483523164, + "language_loss": 0.7991842, + "learning_rate": 3.992495569872206e-06, + "loss": 0.8834548, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 0.73242188, + "step": 944, + "time_per_iteration": 2.5755181312561035 + }, + { + "auxiliary_loss_clip": 0.0704762, + "auxiliary_loss_mlp": 0.01372731, + "balance_loss_clip": 0.06471305, + "balance_loss_mlp": 0.01300109, + "epoch": 0.05681647377123102, + "flos": 23122065024000.0, + "grad_norm": 11.186502162192404, + "language_loss": 0.82437181, + "learning_rate": 3.992461825426906e-06, + "loss": 0.90857524, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.7265625, + "step": 945, + "time_per_iteration": 2.646212339401245 + }, + { + "auxiliary_loss_clip": 0.07062095, + "auxiliary_loss_mlp": 0.01352146, + "balance_loss_clip": 0.06473356, + "balance_loss_mlp": 0.01276854, + "epoch": 0.056876597023898995, + "flos": 16076024789760.0, + "grad_norm": 6.503065924665904, + "language_loss": 0.86640823, + "learning_rate": 3.992428005427252e-06, + "loss": 0.95055068, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75195312, + "step": 946, + "time_per_iteration": 2.5955421924591064 + }, + { + "auxiliary_loss_clip": 0.07105307, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 0.06487983, + "balance_loss_mlp": 0.01268975, + "epoch": 0.05693672027656696, + "flos": 16841083294080.0, + "grad_norm": 30.160109907470417, + "language_loss": 0.83428961, + "learning_rate": 3.992394109874529e-06, + "loss": 0.91883731, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 6.171875, + "router_z_loss_mlp": 0.80517578, + "step": 947, + "time_per_iteration": 2.578885078430176 + }, + { + "auxiliary_loss_clip": 0.07120173, + "auxiliary_loss_mlp": 0.01346427, + "balance_loss_clip": 0.06479014, + "balance_loss_mlp": 0.01264888, + "epoch": 0.05699684352923493, + "flos": 21394104526080.0, + "grad_norm": 7.760122513642949, + "language_loss": 0.89679337, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.98145938, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 0.81542969, + "step": 948, + "time_per_iteration": 2.6047542095184326 + }, + { + "auxiliary_loss_clip": 0.07067588, + "auxiliary_loss_mlp": 0.01342886, + "balance_loss_clip": 0.06478094, + "balance_loss_mlp": 0.01268786, + "epoch": 0.057056966781902904, + "flos": 15565818078720.0, + "grad_norm": 4.718676024566818, + "language_loss": 0.91130018, + "learning_rate": 3.992326092115019e-06, + "loss": 0.99540496, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.74121094, + "step": 949, + "time_per_iteration": 2.59798526763916 + }, + { + "auxiliary_loss_clip": 0.07052803, + "auxiliary_loss_mlp": 0.01334514, + "balance_loss_clip": 0.06479354, + "balance_loss_mlp": 0.01265897, + "epoch": 0.05711709003457087, + "flos": 19943971839360.0, + "grad_norm": 5.50050902669799, + "language_loss": 0.81973231, + "learning_rate": 3.992291969910811e-06, + "loss": 0.90360546, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.73828125, + "router_z_loss_mlp": 0.68603516, + "step": 950, + "time_per_iteration": 2.6259987354278564 + }, + { + "auxiliary_loss_clip": 0.07096414, + "auxiliary_loss_mlp": 0.01341844, + "balance_loss_clip": 0.06496268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.05717721328723884, + "flos": 30339953982720.0, + "grad_norm": 5.942643661235501, + "language_loss": 0.85793424, + "learning_rate": 3.992257772158691e-06, + "loss": 0.94231689, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.99609375, + "router_z_loss_mlp": 0.76464844, + "step": 951, + "time_per_iteration": 2.6625497341156006 + }, + { + "auxiliary_loss_clip": 0.07096014, + "auxiliary_loss_mlp": 0.01337385, + "balance_loss_clip": 0.06490001, + "balance_loss_mlp": 0.0125494, + "epoch": 0.05723733653990681, + "flos": 23660251799040.0, + "grad_norm": 12.14793274648965, + "language_loss": 0.90794688, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9922809, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 6.06640625, + "router_z_loss_mlp": 0.82373047, + "step": 952, + "time_per_iteration": 2.6754026412963867 + }, + { + "auxiliary_loss_clip": 0.07150276, + "auxiliary_loss_mlp": 0.01358536, + "balance_loss_clip": 0.06509267, + "balance_loss_mlp": 0.01266268, + "epoch": 0.05729745979257478, + "flos": 22062518945280.0, + "grad_norm": 4.876026783534778, + "language_loss": 0.83819556, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.92328364, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 6.4140625, + "router_z_loss_mlp": 0.92333984, + "step": 953, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.07094061, + "auxiliary_loss_mlp": 0.01342327, + "balance_loss_clip": 0.06495301, + "balance_loss_mlp": 0.01262409, + "epoch": 0.05735758304524275, + "flos": 19609554994560.0, + "grad_norm": 6.9064094964387, + "language_loss": 0.9058758, + "learning_rate": 3.992154725627848e-06, + "loss": 0.99023962, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 0.79931641, + "step": 954, + "time_per_iteration": 2.6270759105682373 + }, + { + "auxiliary_loss_clip": 0.07104363, + "auxiliary_loss_mlp": 0.01340099, + "balance_loss_clip": 0.06505129, + "balance_loss_mlp": 0.01262661, + "epoch": 0.057417706297910716, + "flos": 19105050360960.0, + "grad_norm": 6.439393268367411, + "language_loss": 0.9193548, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00379944, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.77490234, + "step": 955, + "time_per_iteration": 2.5784714221954346 + }, + { + "auxiliary_loss_clip": 0.07088242, + "auxiliary_loss_mlp": 0.01339912, + "balance_loss_clip": 0.06500716, + "balance_loss_mlp": 0.01263379, + "epoch": 0.05747782955057869, + "flos": 16660136401920.0, + "grad_norm": 130.9595542139282, + "language_loss": 0.93622941, + "learning_rate": 3.992085650224914e-06, + "loss": 1.02051091, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.765625, + "step": 956, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.07069805, + "auxiliary_loss_mlp": 0.01336322, + "balance_loss_clip": 0.06513655, + "balance_loss_mlp": 0.01263795, + "epoch": 0.05753795280324665, + "flos": 14507362103040.0, + "grad_norm": 7.35623901329006, + "language_loss": 0.79601187, + "learning_rate": 3.99205099921266e-06, + "loss": 0.88007313, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.72509766, + "step": 957, + "time_per_iteration": 2.5814363956451416 + }, + { + "auxiliary_loss_clip": 0.07102334, + "auxiliary_loss_mlp": 0.013347, + "balance_loss_clip": 0.06516448, + "balance_loss_mlp": 0.01260171, + "epoch": 0.057598076055914625, + "flos": 18081995535360.0, + "grad_norm": 9.445676211161578, + "language_loss": 0.8370564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.92142671, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 5.859375, + "router_z_loss_mlp": 0.74511719, + "step": 958, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.0710094, + "auxiliary_loss_mlp": 0.01346675, + "balance_loss_clip": 0.06526074, + "balance_loss_mlp": 0.01272241, + "epoch": 0.0576581993085826, + "flos": 22130679841920.0, + "grad_norm": 4.908180525960309, + "language_loss": 0.91401774, + "learning_rate": 3.99198147057315e-06, + "loss": 0.99849397, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.74365234, + "step": 959, + "time_per_iteration": 2.5950703620910645 + }, + { + "auxiliary_loss_clip": 0.07097997, + "auxiliary_loss_mlp": 0.01349298, + "balance_loss_clip": 0.06514278, + "balance_loss_mlp": 0.01272431, + "epoch": 0.05771832256125056, + "flos": 33190003232640.0, + "grad_norm": 5.502917231642364, + "language_loss": 0.82885253, + "learning_rate": 3.991946592948529e-06, + "loss": 0.91332549, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.76904297, + "step": 960, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.07121219, + "auxiliary_loss_mlp": 0.0136329, + "balance_loss_clip": 0.06516127, + "balance_loss_mlp": 0.01276888, + "epoch": 0.057778445813918534, + "flos": 24176957201280.0, + "grad_norm": 10.105803552355386, + "language_loss": 0.96418011, + "learning_rate": 3.991911639789094e-06, + "loss": 1.0490253, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.86425781, + "step": 961, + "time_per_iteration": 2.621075391769409 + }, + { + "auxiliary_loss_clip": 0.07137178, + "auxiliary_loss_mlp": 0.0136525, + "balance_loss_clip": 0.06529568, + "balance_loss_mlp": 0.01280421, + "epoch": 0.0578385690665865, + "flos": 29650770950400.0, + "grad_norm": 15.740079848034652, + "language_loss": 0.72144246, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80646676, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 6.08203125, + "router_z_loss_mlp": 0.84863281, + "step": 962, + "time_per_iteration": 2.662982702255249 + }, + { + "auxiliary_loss_clip": 0.07124397, + "auxiliary_loss_mlp": 0.01385383, + "balance_loss_clip": 0.06529254, + "balance_loss_mlp": 0.01300888, + "epoch": 0.05789869231925447, + "flos": 20891528536320.0, + "grad_norm": 6.9214750574770765, + "language_loss": 0.92274594, + "learning_rate": 3.991841506871084e-06, + "loss": 1.00784373, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 5.953125, + "router_z_loss_mlp": 0.84521484, + "step": 963, + "time_per_iteration": 2.6076695919036865 + }, + { + "auxiliary_loss_clip": 0.07119042, + "auxiliary_loss_mlp": 0.01381304, + "balance_loss_clip": 0.06523143, + "balance_loss_mlp": 0.01297953, + "epoch": 0.057958815571922444, + "flos": 26038262672640.0, + "grad_norm": 11.895031253661099, + "language_loss": 0.8968147, + "learning_rate": 3.99180632711517e-06, + "loss": 0.98181814, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 5.96484375, + "router_z_loss_mlp": 0.83300781, + "step": 964, + "time_per_iteration": 2.686906337738037 + }, + { + "auxiliary_loss_clip": 0.07105853, + "auxiliary_loss_mlp": 0.01387507, + "balance_loss_clip": 0.06517063, + "balance_loss_mlp": 0.01305252, + "epoch": 0.05801893882459041, + "flos": 18083588762880.0, + "grad_norm": 5.536598394443464, + "language_loss": 0.80100715, + "learning_rate": 3.99177107182976e-06, + "loss": 0.88594079, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.82275391, + "step": 965, + "time_per_iteration": 4.090426921844482 + }, + { + "auxiliary_loss_clip": 0.07108907, + "auxiliary_loss_mlp": 0.01388674, + "balance_loss_clip": 0.0653006, + "balance_loss_mlp": 0.01307803, + "epoch": 0.05807906207725838, + "flos": 17754664360320.0, + "grad_norm": 8.638909024191255, + "language_loss": 0.85803884, + "learning_rate": 3.99173574101619e-06, + "loss": 0.94301462, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.80859375, + "step": 966, + "time_per_iteration": 2.593015670776367 + }, + { + "auxiliary_loss_clip": 0.07081844, + "auxiliary_loss_mlp": 0.01385278, + "balance_loss_clip": 0.06515825, + "balance_loss_mlp": 0.01308507, + "epoch": 0.058139185329926346, + "flos": 18046133187840.0, + "grad_norm": 11.004143242377477, + "language_loss": 0.80350578, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.88817692, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 5.671875, + "router_z_loss_mlp": 0.76855469, + "step": 967, + "time_per_iteration": 4.057944297790527 + }, + { + "auxiliary_loss_clip": 0.06839906, + "auxiliary_loss_mlp": 0.01357839, + "balance_loss_clip": 0.06483683, + "balance_loss_mlp": 0.01313065, + "epoch": 0.05819930858259432, + "flos": 62381355845760.0, + "grad_norm": 0.8360355245003168, + "language_loss": 0.57554376, + "learning_rate": 3.991664852809939e-06, + "loss": 0.65752125, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.44799805, + "step": 968, + "time_per_iteration": 3.167989730834961 + }, + { + "auxiliary_loss_clip": 0.07096039, + "auxiliary_loss_mlp": 0.01391053, + "balance_loss_clip": 0.06529184, + "balance_loss_mlp": 0.01317, + "epoch": 0.05825943183526229, + "flos": 19141373905920.0, + "grad_norm": 7.005112994692607, + "language_loss": 0.84630275, + "learning_rate": 3.991629295419945e-06, + "loss": 0.93117368, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.74072266, + "step": 969, + "time_per_iteration": 4.074899911880493 + }, + { + "auxiliary_loss_clip": 0.07116528, + "auxiliary_loss_mlp": 0.0138256, + "balance_loss_clip": 0.06523499, + "balance_loss_mlp": 0.01301068, + "epoch": 0.058319555087930255, + "flos": 29030042304000.0, + "grad_norm": 8.083926871251307, + "language_loss": 0.82668531, + "learning_rate": 3.991593662507167e-06, + "loss": 0.91167617, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 5.9296875, + "router_z_loss_mlp": 0.81542969, + "step": 970, + "time_per_iteration": 2.659989833831787 + }, + { + "auxiliary_loss_clip": 0.07099806, + "auxiliary_loss_mlp": 0.01400005, + "balance_loss_clip": 0.06510817, + "balance_loss_mlp": 0.01317321, + "epoch": 0.05837967834059823, + "flos": 18885977061120.0, + "grad_norm": 16.518563352615757, + "language_loss": 0.96487081, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04986882, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.82714844, + "step": 971, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.07087609, + "auxiliary_loss_mlp": 0.01388607, + "balance_loss_clip": 0.06502773, + "balance_loss_mlp": 0.01310834, + "epoch": 0.05843980159326619, + "flos": 25710218737920.0, + "grad_norm": 16.27135895590574, + "language_loss": 0.89295512, + "learning_rate": 3.991522170118673e-06, + "loss": 0.97771728, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 5.84765625, + "router_z_loss_mlp": 0.77832031, + "step": 972, + "time_per_iteration": 2.655470848083496 + }, + { + "auxiliary_loss_clip": 0.07066658, + "auxiliary_loss_mlp": 0.01374677, + "balance_loss_clip": 0.0650342, + "balance_loss_mlp": 0.01301482, + "epoch": 0.058499924845934165, + "flos": 25558425866880.0, + "grad_norm": 4.193788183762945, + "language_loss": 0.90456176, + "learning_rate": 3.991486310645667e-06, + "loss": 0.98897511, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 5.62890625, + "router_z_loss_mlp": 0.73144531, + "step": 973, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.0705331, + "auxiliary_loss_mlp": 0.01383547, + "balance_loss_clip": 0.06485617, + "balance_loss_mlp": 0.01307635, + "epoch": 0.05856004809860214, + "flos": 16441859715840.0, + "grad_norm": 11.262132273646074, + "language_loss": 0.77443254, + "learning_rate": 3.991450375655301e-06, + "loss": 0.85880107, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.75878906, + "step": 974, + "time_per_iteration": 2.57619047164917 + }, + { + "auxiliary_loss_clip": 0.07050242, + "auxiliary_loss_mlp": 0.01379524, + "balance_loss_clip": 0.06485987, + "balance_loss_mlp": 0.01304852, + "epoch": 0.0586201713512701, + "flos": 39468385486080.0, + "grad_norm": 6.566272929573762, + "language_loss": 0.79448825, + "learning_rate": 3.991414365148936e-06, + "loss": 0.87878591, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 5.640625, + "router_z_loss_mlp": 0.74707031, + "step": 975, + "time_per_iteration": 2.79398250579834 + }, + { + "auxiliary_loss_clip": 0.07056309, + "auxiliary_loss_mlp": 0.0138878, + "balance_loss_clip": 0.06472544, + "balance_loss_mlp": 0.01304809, + "epoch": 0.058680294603938074, + "flos": 23371466302080.0, + "grad_norm": 4.828568059250088, + "language_loss": 0.79758298, + "learning_rate": 3.99137827912794e-06, + "loss": 0.88203388, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.83984375, + "step": 976, + "time_per_iteration": 2.6214101314544678 + }, + { + "auxiliary_loss_clip": 0.07040592, + "auxiliary_loss_mlp": 0.01371791, + "balance_loss_clip": 0.06474636, + "balance_loss_mlp": 0.01299216, + "epoch": 0.05874041785660604, + "flos": 32239930913280.0, + "grad_norm": 7.236872171762386, + "language_loss": 0.89953148, + "learning_rate": 3.991342117593679e-06, + "loss": 0.98365533, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.72607422, + "step": 977, + "time_per_iteration": 2.681955099105835 + }, + { + "auxiliary_loss_clip": 0.07041348, + "auxiliary_loss_mlp": 0.01373201, + "balance_loss_clip": 0.06467118, + "balance_loss_mlp": 0.01295619, + "epoch": 0.05880054110927401, + "flos": 22316657978880.0, + "grad_norm": 7.280318669233247, + "language_loss": 0.82238227, + "learning_rate": 3.991305880547527e-06, + "loss": 0.90652776, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.77587891, + "step": 978, + "time_per_iteration": 2.614290952682495 + }, + { + "auxiliary_loss_clip": 0.0707294, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 0.06484175, + "balance_loss_mlp": 0.01280184, + "epoch": 0.05886066436194198, + "flos": 27387726278400.0, + "grad_norm": 155.96057049304315, + "language_loss": 0.83328485, + "learning_rate": 3.991269567990855e-06, + "loss": 0.91761768, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.80175781, + "step": 979, + "time_per_iteration": 2.635091543197632 + }, + { + "auxiliary_loss_clip": 0.0672864, + "auxiliary_loss_mlp": 0.01304756, + "balance_loss_clip": 0.06376771, + "balance_loss_mlp": 0.01257311, + "epoch": 0.05892078761460995, + "flos": 59601102647040.0, + "grad_norm": 0.9093094214807238, + "language_loss": 0.59396595, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.67429984, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47387695, + "step": 980, + "time_per_iteration": 3.1261343955993652 + }, + { + "auxiliary_loss_clip": 0.07034945, + "auxiliary_loss_mlp": 0.01348733, + "balance_loss_clip": 0.06472749, + "balance_loss_mlp": 0.01274394, + "epoch": 0.05898091086727792, + "flos": 15419517649920.0, + "grad_norm": 3.186788863209633, + "language_loss": 0.90080172, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.98463851, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74267578, + "step": 981, + "time_per_iteration": 2.5808515548706055 + }, + { + "auxiliary_loss_clip": 0.0705516, + "auxiliary_loss_mlp": 0.01348366, + "balance_loss_clip": 0.06484837, + "balance_loss_mlp": 0.0127746, + "epoch": 0.059041034119945886, + "flos": 23661383829120.0, + "grad_norm": 5.662656134717616, + "language_loss": 0.82531273, + "learning_rate": 3.991160177271513e-06, + "loss": 0.90934801, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 0.70898438, + "step": 982, + "time_per_iteration": 2.7105038166046143 + }, + { + "auxiliary_loss_clip": 0.07084571, + "auxiliary_loss_mlp": 0.01361032, + "balance_loss_clip": 0.06488383, + "balance_loss_mlp": 0.01281162, + "epoch": 0.05910115737261386, + "flos": 24761026886400.0, + "grad_norm": 3.604575523078559, + "language_loss": 0.87251258, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95696855, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.79882812, + "step": 983, + "time_per_iteration": 2.744180917739868 + }, + { + "auxiliary_loss_clip": 0.07044654, + "auxiliary_loss_mlp": 0.01351466, + "balance_loss_clip": 0.06470264, + "balance_loss_mlp": 0.01274648, + "epoch": 0.05916128062528183, + "flos": 11733523741440.0, + "grad_norm": 4.930042751750388, + "language_loss": 0.87498015, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.95894134, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 5.734375, + "router_z_loss_mlp": 0.76806641, + "step": 984, + "time_per_iteration": 2.651169538497925 + }, + { + "auxiliary_loss_clip": 0.0704496, + "auxiliary_loss_mlp": 0.01342068, + "balance_loss_clip": 0.06470487, + "balance_loss_mlp": 0.01264582, + "epoch": 0.059221403877949795, + "flos": 21908587795200.0, + "grad_norm": 5.844491017467261, + "language_loss": 0.80473924, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.88860953, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.77490234, + "step": 985, + "time_per_iteration": 2.6289291381835938 + }, + { + "auxiliary_loss_clip": 0.07058708, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 0.06475725, + "balance_loss_mlp": 0.01284213, + "epoch": 0.05928152713061777, + "flos": 20519614189440.0, + "grad_norm": 6.301686711015131, + "language_loss": 0.93571031, + "learning_rate": 3.991013265915661e-06, + "loss": 1.01991737, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 0.77783203, + "step": 986, + "time_per_iteration": 2.655438184738159 + }, + { + "auxiliary_loss_clip": 0.0708475, + "auxiliary_loss_mlp": 0.01349267, + "balance_loss_clip": 0.06479746, + "balance_loss_mlp": 0.01270303, + "epoch": 0.05934165038328574, + "flos": 24501437337600.0, + "grad_norm": 4.15562600287031, + "language_loss": 0.79382873, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.87816888, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 0.79003906, + "step": 987, + "time_per_iteration": 2.635974168777466 + }, + { + "auxiliary_loss_clip": 0.07112011, + "auxiliary_loss_mlp": 0.01375395, + "balance_loss_clip": 0.06492966, + "balance_loss_mlp": 0.0128861, + "epoch": 0.059401773635953704, + "flos": 38737302612480.0, + "grad_norm": 3.024721532830348, + "language_loss": 0.74664164, + "learning_rate": 3.990939357235621e-06, + "loss": 0.83151573, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 6.19140625, + "router_z_loss_mlp": 0.8671875, + "step": 988, + "time_per_iteration": 2.8440210819244385 + }, + { + "auxiliary_loss_clip": 0.06738614, + "auxiliary_loss_mlp": 0.01302441, + "balance_loss_clip": 0.06389277, + "balance_loss_mlp": 0.01254757, + "epoch": 0.059461896888621676, + "flos": 58041244638720.0, + "grad_norm": 0.9346440677006217, + "language_loss": 0.71295583, + "learning_rate": 3.99090228964997e-06, + "loss": 0.79336637, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.4765625, + "step": 989, + "time_per_iteration": 3.0397932529449463 + }, + { + "auxiliary_loss_clip": 0.07105568, + "auxiliary_loss_mlp": 0.01373719, + "balance_loss_clip": 0.06490866, + "balance_loss_mlp": 0.01288604, + "epoch": 0.05952202014128964, + "flos": 22134369421440.0, + "grad_norm": 3.813782873152628, + "language_loss": 0.81950057, + "learning_rate": 3.990865146569105e-06, + "loss": 0.90429342, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 6.1484375, + "router_z_loss_mlp": 0.85107422, + "step": 990, + "time_per_iteration": 2.679490804672241 + }, + { + "auxiliary_loss_clip": 0.07070604, + "auxiliary_loss_mlp": 0.0136635, + "balance_loss_clip": 0.0648191, + "balance_loss_mlp": 0.01286957, + "epoch": 0.059582143393957614, + "flos": 20451495219840.0, + "grad_norm": 3.1821025671437786, + "language_loss": 0.88952839, + "learning_rate": 3.990827927994434e-06, + "loss": 0.97389793, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.79443359, + "step": 991, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.07097097, + "auxiliary_loss_mlp": 0.01373652, + "balance_loss_clip": 0.06486384, + "balance_loss_mlp": 0.012893, + "epoch": 0.059642266646625586, + "flos": 20601149811840.0, + "grad_norm": 4.7552664277712475, + "language_loss": 0.80401003, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.88871753, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.84375, + "step": 992, + "time_per_iteration": 2.6194934844970703 + }, + { + "auxiliary_loss_clip": 0.07081859, + "auxiliary_loss_mlp": 0.01371261, + "balance_loss_clip": 0.06485239, + "balance_loss_mlp": 0.01292869, + "epoch": 0.05970238989929355, + "flos": 19358434707840.0, + "grad_norm": 7.615023287218043, + "language_loss": 0.78822339, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.87275457, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.78417969, + "step": 993, + "time_per_iteration": 2.5962717533111572 + }, + { + "auxiliary_loss_clip": 0.07073358, + "auxiliary_loss_mlp": 0.01364747, + "balance_loss_clip": 0.06486119, + "balance_loss_mlp": 0.01289073, + "epoch": 0.05976251315196152, + "flos": 30272002721280.0, + "grad_norm": 5.1352604598244, + "language_loss": 0.83427668, + "learning_rate": 3.990715819321712e-06, + "loss": 0.91865766, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.75634766, + "step": 994, + "time_per_iteration": 2.677586317062378 + }, + { + "auxiliary_loss_clip": 0.07096842, + "auxiliary_loss_mlp": 0.01391454, + "balance_loss_clip": 0.06492864, + "balance_loss_mlp": 0.01313491, + "epoch": 0.05982263640462949, + "flos": 23191819148160.0, + "grad_norm": 4.423928105923456, + "language_loss": 0.83424294, + "learning_rate": 3.99067829878596e-06, + "loss": 0.91912591, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 0.77978516, + "step": 995, + "time_per_iteration": 2.62821364402771 + }, + { + "auxiliary_loss_clip": 0.07109222, + "auxiliary_loss_mlp": 0.01389117, + "balance_loss_clip": 0.06503183, + "balance_loss_mlp": 0.01309247, + "epoch": 0.05988275965729746, + "flos": 27857584448640.0, + "grad_norm": 3.07551937102457, + "language_loss": 0.89631027, + "learning_rate": 3.990640702763487e-06, + "loss": 0.98129368, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.79785156, + "step": 996, + "time_per_iteration": 2.6472525596618652 + }, + { + "auxiliary_loss_clip": 0.0709434, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 0.06487706, + "balance_loss_mlp": 0.01292461, + "epoch": 0.05994288290996543, + "flos": 24686744641920.0, + "grad_norm": 3.8490454271878023, + "language_loss": 0.91812748, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00281239, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.81689453, + "step": 997, + "time_per_iteration": 2.6353485584259033 + }, + { + "auxiliary_loss_clip": 0.06747872, + "auxiliary_loss_mlp": 0.0129538, + "balance_loss_clip": 0.06402076, + "balance_loss_mlp": 0.01256113, + "epoch": 0.0600030061626334, + "flos": 69951187152000.0, + "grad_norm": 1.0138660307708214, + "language_loss": 0.75495923, + "learning_rate": 3.990565284264083e-06, + "loss": 0.83539176, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39233398, + "step": 998, + "time_per_iteration": 3.2664620876312256 + }, + { + "auxiliary_loss_clip": 0.07050692, + "auxiliary_loss_mlp": 0.01361564, + "balance_loss_clip": 0.06468829, + "balance_loss_mlp": 0.01286844, + "epoch": 0.06006312941530137, + "flos": 26547085791360.0, + "grad_norm": 6.665102912139699, + "language_loss": 0.78679419, + "learning_rate": 3.990527461790013e-06, + "loss": 0.87091672, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.74707031, + "step": 999, + "time_per_iteration": 2.6708481311798096 + }, + { + "auxiliary_loss_clip": 0.07090119, + "auxiliary_loss_mlp": 0.01381378, + "balance_loss_clip": 0.06486722, + "balance_loss_mlp": 0.01301603, + "epoch": 0.060123252667969335, + "flos": 27351276952320.0, + "grad_norm": 3.7400701542168013, + "language_loss": 0.85150427, + "learning_rate": 3.990489563834943e-06, + "loss": 0.93621922, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 6.03515625, + "router_z_loss_mlp": 0.79833984, + "step": 1000, + "time_per_iteration": 2.643961191177368 + }, + { + "auxiliary_loss_clip": 0.07061431, + "auxiliary_loss_mlp": 0.01377664, + "balance_loss_clip": 0.06471995, + "balance_loss_mlp": 0.01297555, + "epoch": 0.06018337592063731, + "flos": 27024113485440.0, + "grad_norm": 4.060867986193189, + "language_loss": 0.88738573, + "learning_rate": 3.990451590400309e-06, + "loss": 0.97177666, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.80126953, + "step": 1001, + "time_per_iteration": 2.629136323928833 + }, + { + "auxiliary_loss_clip": 0.07032709, + "auxiliary_loss_mlp": 0.01355395, + "balance_loss_clip": 0.06470643, + "balance_loss_mlp": 0.01289306, + "epoch": 0.06024349917330528, + "flos": 25599990291840.0, + "grad_norm": 3.249124655019378, + "language_loss": 0.76097226, + "learning_rate": 3.990413541487551e-06, + "loss": 0.84485334, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.66162109, + "step": 1002, + "time_per_iteration": 2.6258249282836914 + }, + { + "auxiliary_loss_clip": 0.07068716, + "auxiliary_loss_mlp": 0.01374313, + "balance_loss_clip": 0.06480874, + "balance_loss_mlp": 0.01298067, + "epoch": 0.060303622425973244, + "flos": 26139225242880.0, + "grad_norm": 4.8561241229026075, + "language_loss": 0.78990388, + "learning_rate": 3.990375417098112e-06, + "loss": 0.87433422, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 0.76220703, + "step": 1003, + "time_per_iteration": 2.7662932872772217 + }, + { + "auxiliary_loss_clip": 0.0707517, + "auxiliary_loss_mlp": 0.01365139, + "balance_loss_clip": 0.0647432, + "balance_loss_mlp": 0.01284077, + "epoch": 0.060363745678641216, + "flos": 20383627812480.0, + "grad_norm": 4.219450714846169, + "language_loss": 0.73012471, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81452775, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 6.015625, + "router_z_loss_mlp": 0.81005859, + "step": 1004, + "time_per_iteration": 5.472697734832764 + }, + { + "auxiliary_loss_clip": 0.07068998, + "auxiliary_loss_mlp": 0.01370949, + "balance_loss_clip": 0.06471765, + "balance_loss_mlp": 0.0129313, + "epoch": 0.06042386893130918, + "flos": 17754999776640.0, + "grad_norm": 3.350107422381743, + "language_loss": 0.86839885, + "learning_rate": 3.990298941894976e-06, + "loss": 0.95279837, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.77832031, + "step": 1005, + "time_per_iteration": 2.628612518310547 + }, + { + "auxiliary_loss_clip": 0.06732726, + "auxiliary_loss_mlp": 0.01300149, + "balance_loss_clip": 0.06388327, + "balance_loss_mlp": 0.01255518, + "epoch": 0.06048399218397715, + "flos": 68559110945280.0, + "grad_norm": 0.8658661250215584, + "language_loss": 0.59003174, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.67036045, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.4465332, + "step": 1006, + "time_per_iteration": 3.2709102630615234 + }, + { + "auxiliary_loss_clip": 0.07070711, + "auxiliary_loss_mlp": 0.01360281, + "balance_loss_clip": 0.06464767, + "balance_loss_mlp": 0.01278123, + "epoch": 0.060544115436645125, + "flos": 23265262851840.0, + "grad_norm": 3.0418653981095973, + "language_loss": 0.77645856, + "learning_rate": 3.990222164802503e-06, + "loss": 0.8607685, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.82128906, + "step": 1007, + "time_per_iteration": 4.056382894515991 + }, + { + "auxiliary_loss_clip": 0.07091306, + "auxiliary_loss_mlp": 0.01370917, + "balance_loss_clip": 0.06486145, + "balance_loss_mlp": 0.01290475, + "epoch": 0.06060423868931309, + "flos": 23885236811520.0, + "grad_norm": 3.189900491688776, + "language_loss": 0.83630216, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.92092443, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 0.8046875, + "step": 1008, + "time_per_iteration": 2.6701247692108154 + }, + { + "auxiliary_loss_clip": 0.07042849, + "auxiliary_loss_mlp": 0.01344814, + "balance_loss_clip": 0.06474254, + "balance_loss_mlp": 0.01273718, + "epoch": 0.06066436194198106, + "flos": 18733010232960.0, + "grad_norm": 8.677434751337552, + "language_loss": 0.80948377, + "learning_rate": 3.990145085832335e-06, + "loss": 0.89336038, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 5.6875, + "router_z_loss_mlp": 0.71142578, + "step": 1009, + "time_per_iteration": 4.013457536697388 + }, + { + "auxiliary_loss_clip": 0.07022181, + "auxiliary_loss_mlp": 0.01332483, + "balance_loss_clip": 0.06467105, + "balance_loss_mlp": 0.01266345, + "epoch": 0.06072448519464903, + "flos": 24646689590400.0, + "grad_norm": 3.258884654543471, + "language_loss": 0.95985019, + "learning_rate": 3.990106433146769e-06, + "loss": 1.04339683, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.66162109, + "step": 1010, + "time_per_iteration": 2.631512403488159 + }, + { + "auxiliary_loss_clip": 0.07117961, + "auxiliary_loss_mlp": 0.01383111, + "balance_loss_clip": 0.06489638, + "balance_loss_mlp": 0.01291845, + "epoch": 0.060784608447317, + "flos": 17383672408320.0, + "grad_norm": 3.3823449890168145, + "language_loss": 0.75409305, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.83910382, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.91259766, + "step": 1011, + "time_per_iteration": 2.5896708965301514 + }, + { + "auxiliary_loss_clip": 0.07033786, + "auxiliary_loss_mlp": 0.01345512, + "balance_loss_clip": 0.06462559, + "balance_loss_mlp": 0.0126526, + "epoch": 0.06084473169998497, + "flos": 23698336279680.0, + "grad_norm": 3.246815093008435, + "language_loss": 0.89853048, + "learning_rate": 3.990028901381999e-06, + "loss": 0.98232347, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.80273438, + "step": 1012, + "time_per_iteration": 2.637019157409668 + }, + { + "auxiliary_loss_clip": 0.07040339, + "auxiliary_loss_mlp": 0.01338129, + "balance_loss_clip": 0.06458548, + "balance_loss_mlp": 0.01258211, + "epoch": 0.06090485495265294, + "flos": 23552455121280.0, + "grad_norm": 2.5392970439405116, + "language_loss": 0.79602826, + "learning_rate": 3.989990022305734e-06, + "loss": 0.8798129, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 0.79980469, + "step": 1013, + "time_per_iteration": 2.658986806869507 + }, + { + "auxiliary_loss_clip": 0.0703081, + "auxiliary_loss_mlp": 0.01334151, + "balance_loss_clip": 0.06449694, + "balance_loss_mlp": 0.01255664, + "epoch": 0.06096497820532091, + "flos": 20345501404800.0, + "grad_norm": 3.5799775107607585, + "language_loss": 0.88768977, + "learning_rate": 3.98995106776885e-06, + "loss": 0.97133934, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 5.8203125, + "router_z_loss_mlp": 0.78515625, + "step": 1014, + "time_per_iteration": 2.6026017665863037 + }, + { + "auxiliary_loss_clip": 0.07069368, + "auxiliary_loss_mlp": 0.01344703, + "balance_loss_clip": 0.06459542, + "balance_loss_mlp": 0.01260589, + "epoch": 0.061025101457988874, + "flos": 26945638536960.0, + "grad_norm": 5.148864357756937, + "language_loss": 0.77818727, + "learning_rate": 3.98991203777282e-06, + "loss": 0.86232805, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 0.84082031, + "step": 1015, + "time_per_iteration": 2.6645917892456055 + }, + { + "auxiliary_loss_clip": 0.07000691, + "auxiliary_loss_mlp": 0.01326184, + "balance_loss_clip": 0.06455131, + "balance_loss_mlp": 0.01257949, + "epoch": 0.061085224710656846, + "flos": 25382216730240.0, + "grad_norm": 2.4567185281472868, + "language_loss": 0.82061088, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.90387964, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68359375, + "step": 1016, + "time_per_iteration": 2.631394863128662 + }, + { + "auxiliary_loss_clip": 0.07014458, + "auxiliary_loss_mlp": 0.01339398, + "balance_loss_clip": 0.06457797, + "balance_loss_mlp": 0.01263962, + "epoch": 0.06114534796332482, + "flos": 24831326062080.0, + "grad_norm": 2.2885034058804363, + "language_loss": 0.78705657, + "learning_rate": 3.989833751409254e-06, + "loss": 0.8705951, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.75390625, + "step": 1017, + "time_per_iteration": 2.657306432723999 + }, + { + "auxiliary_loss_clip": 0.07054974, + "auxiliary_loss_mlp": 0.0134134, + "balance_loss_clip": 0.06458369, + "balance_loss_mlp": 0.01256225, + "epoch": 0.061205471215992784, + "flos": 20637724919040.0, + "grad_norm": 9.632952296777574, + "language_loss": 0.88575757, + "learning_rate": 3.989794495044685e-06, + "loss": 0.96972066, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.85107422, + "step": 1018, + "time_per_iteration": 2.5989861488342285 + }, + { + "auxiliary_loss_clip": 0.07009743, + "auxiliary_loss_mlp": 0.01334982, + "balance_loss_clip": 0.06455217, + "balance_loss_mlp": 0.01259165, + "epoch": 0.061265594468660756, + "flos": 16513919827200.0, + "grad_norm": 8.927182809216816, + "language_loss": 0.8225174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.90596467, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.75878906, + "step": 1019, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.07013386, + "auxiliary_loss_mlp": 0.01335228, + "balance_loss_clip": 0.06456258, + "balance_loss_mlp": 0.01263417, + "epoch": 0.06132571772132872, + "flos": 26252765925120.0, + "grad_norm": 3.333827515378615, + "language_loss": 0.86933666, + "learning_rate": 3.989715755957418e-06, + "loss": 0.9528228, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.71826172, + "step": 1020, + "time_per_iteration": 2.6224961280822754 + }, + { + "auxiliary_loss_clip": 0.06996658, + "auxiliary_loss_mlp": 0.01346945, + "balance_loss_clip": 0.06447957, + "balance_loss_mlp": 0.01273989, + "epoch": 0.06138584097399669, + "flos": 37423869062400.0, + "grad_norm": 2.8232559173096914, + "language_loss": 0.81487918, + "learning_rate": 3.989676273237705e-06, + "loss": 0.89831525, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.72949219, + "step": 1021, + "time_per_iteration": 2.771052598953247 + }, + { + "auxiliary_loss_clip": 0.06976922, + "auxiliary_loss_mlp": 0.0136383, + "balance_loss_clip": 0.06428508, + "balance_loss_mlp": 0.01285295, + "epoch": 0.061445964226664665, + "flos": 17426410790400.0, + "grad_norm": 7.734725170769636, + "language_loss": 0.9093855, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.99279296, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.78466797, + "step": 1022, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.0697528, + "auxiliary_loss_mlp": 0.01365327, + "balance_loss_clip": 0.06437931, + "balance_loss_mlp": 0.01295518, + "epoch": 0.06150608747933263, + "flos": 22606365870720.0, + "grad_norm": 3.61040283013288, + "language_loss": 0.84977013, + "learning_rate": 3.989597081453611e-06, + "loss": 0.93317622, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69824219, + "step": 1023, + "time_per_iteration": 2.6407079696655273 + }, + { + "auxiliary_loss_clip": 0.0673309, + "auxiliary_loss_mlp": 0.01419946, + "balance_loss_clip": 0.06385664, + "balance_loss_mlp": 0.0137119, + "epoch": 0.0615662107320006, + "flos": 56758097139840.0, + "grad_norm": 0.9164460168563352, + "language_loss": 0.64884549, + "learning_rate": 3.989557372392231e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.48706055, + "step": 1024, + "time_per_iteration": 3.240457534790039 + }, + { + "auxiliary_loss_clip": 0.06995942, + "auxiliary_loss_mlp": 0.01352799, + "balance_loss_clip": 0.06434722, + "balance_loss_mlp": 0.01272356, + "epoch": 0.06162633398466857, + "flos": 22571342064000.0, + "grad_norm": 2.66796346315112, + "language_loss": 0.91765183, + "learning_rate": 3.989517587886636e-06, + "loss": 1.00113928, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 5.61328125, + "router_z_loss_mlp": 0.80371094, + "step": 1025, + "time_per_iteration": 2.6372737884521484 + }, + { + "auxiliary_loss_clip": 0.06986167, + "auxiliary_loss_mlp": 0.01374261, + "balance_loss_clip": 0.06435852, + "balance_loss_mlp": 0.01300828, + "epoch": 0.06168645723733654, + "flos": 25600158000000.0, + "grad_norm": 2.4272602971827535, + "language_loss": 0.871768, + "learning_rate": 3.989477727938335e-06, + "loss": 0.95537233, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73486328, + "step": 1026, + "time_per_iteration": 2.6508452892303467 + }, + { + "auxiliary_loss_clip": 0.06989977, + "auxiliary_loss_mlp": 0.01363012, + "balance_loss_clip": 0.06439693, + "balance_loss_mlp": 0.01286622, + "epoch": 0.06174658049000451, + "flos": 16003461553920.0, + "grad_norm": 3.495791258705881, + "language_loss": 0.8437736, + "learning_rate": 3.989437792548839e-06, + "loss": 0.92730343, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.76416016, + "step": 1027, + "time_per_iteration": 2.613172769546509 + }, + { + "auxiliary_loss_clip": 0.06973707, + "auxiliary_loss_mlp": 0.01359003, + "balance_loss_clip": 0.0641673, + "balance_loss_mlp": 0.01281422, + "epoch": 0.06180670374267248, + "flos": 11289842772480.0, + "grad_norm": 3.8173647671524793, + "language_loss": 0.87086433, + "learning_rate": 3.989397781719663e-06, + "loss": 0.95419139, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.77539062, + "step": 1028, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.06704632, + "auxiliary_loss_mlp": 0.01372349, + "balance_loss_clip": 0.06357226, + "balance_loss_mlp": 0.01321519, + "epoch": 0.06186682699534045, + "flos": 65147647340160.0, + "grad_norm": 0.9176628937357996, + "language_loss": 0.60490429, + "learning_rate": 3.989357695452323e-06, + "loss": 0.68567419, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.50830078, + "step": 1029, + "time_per_iteration": 3.218085289001465 + }, + { + "auxiliary_loss_clip": 0.07009555, + "auxiliary_loss_mlp": 0.01372678, + "balance_loss_clip": 0.06434123, + "balance_loss_mlp": 0.01287372, + "epoch": 0.061926950248008414, + "flos": 21112111209600.0, + "grad_norm": 3.737194986722716, + "language_loss": 0.85668898, + "learning_rate": 3.98931753374834e-06, + "loss": 0.94051135, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.85253906, + "step": 1030, + "time_per_iteration": 2.7052202224731445 + }, + { + "auxiliary_loss_clip": 0.06989674, + "auxiliary_loss_mlp": 0.01357455, + "balance_loss_clip": 0.06431329, + "balance_loss_mlp": 0.01280446, + "epoch": 0.061987073500676386, + "flos": 17754161235840.0, + "grad_norm": 3.4423452178420013, + "language_loss": 0.83235556, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91582686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.77050781, + "step": 1031, + "time_per_iteration": 2.588575839996338 + }, + { + "auxiliary_loss_clip": 0.06983647, + "auxiliary_loss_mlp": 0.01355074, + "balance_loss_clip": 0.06433594, + "balance_loss_mlp": 0.01283453, + "epoch": 0.06204719675334436, + "flos": 21842858666880.0, + "grad_norm": 14.220096224086527, + "language_loss": 0.80345309, + "learning_rate": 3.98923698403654e-06, + "loss": 0.88684022, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.71582031, + "step": 1032, + "time_per_iteration": 2.6636962890625 + }, + { + "auxiliary_loss_clip": 0.06996015, + "auxiliary_loss_mlp": 0.01349932, + "balance_loss_clip": 0.064355, + "balance_loss_mlp": 0.01272828, + "epoch": 0.06210732000601232, + "flos": 19359650592000.0, + "grad_norm": 3.724079257252284, + "language_loss": 0.9305315, + "learning_rate": 3.989196596031776e-06, + "loss": 1.01399088, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.77197266, + "step": 1033, + "time_per_iteration": 2.5974748134613037 + }, + { + "auxiliary_loss_clip": 0.06988779, + "auxiliary_loss_mlp": 0.01347157, + "balance_loss_clip": 0.06438898, + "balance_loss_mlp": 0.0127525, + "epoch": 0.062167443258680295, + "flos": 24755534444160.0, + "grad_norm": 3.649174890809254, + "language_loss": 0.87141907, + "learning_rate": 3.989156132596479e-06, + "loss": 0.95477843, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.71875, + "step": 1034, + "time_per_iteration": 2.6747853755950928 + }, + { + "auxiliary_loss_clip": 0.06962503, + "auxiliary_loss_mlp": 0.01360042, + "balance_loss_clip": 0.06434912, + "balance_loss_mlp": 0.01290854, + "epoch": 0.06222756651134827, + "flos": 34466903602560.0, + "grad_norm": 3.3762373845942313, + "language_loss": 0.84657645, + "learning_rate": 3.989115593732182e-06, + "loss": 0.92980194, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69189453, + "step": 1035, + "time_per_iteration": 2.690265655517578 + }, + { + "auxiliary_loss_clip": 0.06995995, + "auxiliary_loss_mlp": 0.01348638, + "balance_loss_clip": 0.06441504, + "balance_loss_mlp": 0.01275015, + "epoch": 0.06228768976401623, + "flos": 25673601703680.0, + "grad_norm": 4.464615872821339, + "language_loss": 0.81925672, + "learning_rate": 3.989074979440421e-06, + "loss": 0.90270305, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 5.5390625, + "router_z_loss_mlp": 0.73583984, + "step": 1036, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.07003354, + "auxiliary_loss_mlp": 0.01370226, + "balance_loss_clip": 0.064463, + "balance_loss_mlp": 0.01293693, + "epoch": 0.062347813016684205, + "flos": 25301687356800.0, + "grad_norm": 3.754285367283167, + "language_loss": 0.89123344, + "learning_rate": 3.989034289722739e-06, + "loss": 0.97496927, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.76513672, + "step": 1037, + "time_per_iteration": 2.609894037246704 + }, + { + "auxiliary_loss_clip": 0.07008456, + "auxiliary_loss_mlp": 0.01342836, + "balance_loss_clip": 0.06453587, + "balance_loss_mlp": 0.01269641, + "epoch": 0.06240793626935217, + "flos": 26914388163840.0, + "grad_norm": 15.327798453817612, + "language_loss": 0.8346867, + "learning_rate": 3.988993524580676e-06, + "loss": 0.91819966, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 5.54296875, + "router_z_loss_mlp": 0.73095703, + "step": 1038, + "time_per_iteration": 2.6626057624816895 + }, + { + "auxiliary_loss_clip": 0.06993866, + "auxiliary_loss_mlp": 0.01340149, + "balance_loss_clip": 0.0645204, + "balance_loss_mlp": 0.01267956, + "epoch": 0.06246805952202014, + "flos": 21622108285440.0, + "grad_norm": 3.08050473605758, + "language_loss": 0.88628823, + "learning_rate": 3.98895268401578e-06, + "loss": 0.96962833, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 5.41796875, + "router_z_loss_mlp": 0.72167969, + "step": 1039, + "time_per_iteration": 2.6248486042022705 + }, + { + "auxiliary_loss_clip": 0.0701851, + "auxiliary_loss_mlp": 0.01340836, + "balance_loss_clip": 0.06453219, + "balance_loss_mlp": 0.01264352, + "epoch": 0.0625281827746881, + "flos": 19316954136960.0, + "grad_norm": 4.220230384937809, + "language_loss": 0.85023952, + "learning_rate": 3.9889117680296e-06, + "loss": 0.933833, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 5.6484375, + "router_z_loss_mlp": 0.76513672, + "step": 1040, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.07036482, + "auxiliary_loss_mlp": 0.01364298, + "balance_loss_clip": 0.06464302, + "balance_loss_mlp": 0.01274987, + "epoch": 0.06258830602735609, + "flos": 27753183861120.0, + "grad_norm": 4.590358257909823, + "language_loss": 0.72318321, + "learning_rate": 3.988870776623685e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.89306641, + "step": 1041, + "time_per_iteration": 2.6730599403381348 + }, + { + "auxiliary_loss_clip": 0.07040736, + "auxiliary_loss_mlp": 0.01378227, + "balance_loss_clip": 0.06470466, + "balance_loss_mlp": 0.01298548, + "epoch": 0.06264842928002405, + "flos": 23229442431360.0, + "grad_norm": 2.706616424442574, + "language_loss": 0.84952104, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.93371069, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.796875, + "step": 1042, + "time_per_iteration": 2.6521389484405518 + }, + { + "auxiliary_loss_clip": 0.0703849, + "auxiliary_loss_mlp": 0.0134851, + "balance_loss_clip": 0.06476429, + "balance_loss_mlp": 0.01272598, + "epoch": 0.06270855253269202, + "flos": 38408671699200.0, + "grad_norm": 3.072391396873047, + "language_loss": 0.79772788, + "learning_rate": 3.988788567558874e-06, + "loss": 0.88159788, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 5.62109375, + "router_z_loss_mlp": 0.75927734, + "step": 1043, + "time_per_iteration": 4.132354021072388 + }, + { + "auxiliary_loss_clip": 0.07023476, + "auxiliary_loss_mlp": 0.01365807, + "balance_loss_clip": 0.06473523, + "balance_loss_mlp": 0.01289656, + "epoch": 0.06276867578535998, + "flos": 22459771952640.0, + "grad_norm": 8.578696431093903, + "language_loss": 0.95484012, + "learning_rate": 3.988747349903097e-06, + "loss": 1.03873289, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.76123047, + "step": 1044, + "time_per_iteration": 4.0872087478637695 + }, + { + "auxiliary_loss_clip": 0.0702454, + "auxiliary_loss_mlp": 0.0136404, + "balance_loss_clip": 0.06474113, + "balance_loss_mlp": 0.0129156, + "epoch": 0.06282879903802796, + "flos": 22937176990080.0, + "grad_norm": 5.298315501835511, + "language_loss": 0.88737643, + "learning_rate": 3.988706056833821e-06, + "loss": 0.97126228, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.72412109, + "step": 1045, + "time_per_iteration": 2.6359164714813232 + }, + { + "auxiliary_loss_clip": 0.07016507, + "auxiliary_loss_mlp": 0.01377248, + "balance_loss_clip": 0.06467608, + "balance_loss_mlp": 0.01300334, + "epoch": 0.06288892229069593, + "flos": 34827036451200.0, + "grad_norm": 2.8748954821383803, + "language_loss": 0.81643683, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.90037435, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.76855469, + "step": 1046, + "time_per_iteration": 4.205566883087158 + }, + { + "auxiliary_loss_clip": 0.07049687, + "auxiliary_loss_mlp": 0.01383919, + "balance_loss_clip": 0.0647831, + "balance_loss_mlp": 0.01309628, + "epoch": 0.06294904554336389, + "flos": 19433178149760.0, + "grad_norm": 3.049904917466256, + "language_loss": 0.8054778, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.74267578, + "step": 1047, + "time_per_iteration": 2.628453493118286 + }, + { + "auxiliary_loss_clip": 0.07082113, + "auxiliary_loss_mlp": 0.01418593, + "balance_loss_clip": 0.06488797, + "balance_loss_mlp": 0.01332237, + "epoch": 0.06300916879603187, + "flos": 40671464808960.0, + "grad_norm": 5.477739593856775, + "language_loss": 0.80062962, + "learning_rate": 3.988581725160672e-06, + "loss": 0.88563669, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 5.921875, + "router_z_loss_mlp": 0.86279297, + "step": 1048, + "time_per_iteration": 4.191184997558594 + }, + { + "auxiliary_loss_clip": 0.07059699, + "auxiliary_loss_mlp": 0.01409495, + "balance_loss_clip": 0.06479897, + "balance_loss_mlp": 0.01322902, + "epoch": 0.06306929204869983, + "flos": 23810703004800.0, + "grad_norm": 4.634968800445174, + "language_loss": 0.81291783, + "learning_rate": 3.988540130453087e-06, + "loss": 0.89760983, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 5.796875, + "router_z_loss_mlp": 0.86669922, + "step": 1049, + "time_per_iteration": 2.650202989578247 + }, + { + "auxiliary_loss_clip": 0.07039324, + "auxiliary_loss_mlp": 0.01395065, + "balance_loss_clip": 0.06466646, + "balance_loss_mlp": 0.01316435, + "epoch": 0.0631294153013678, + "flos": 18921671700480.0, + "grad_norm": 5.321703459602036, + "language_loss": 0.85613585, + "learning_rate": 3.988498460339862e-06, + "loss": 0.9404797, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.78662109, + "step": 1050, + "time_per_iteration": 2.6393301486968994 + }, + { + "auxiliary_loss_clip": 0.07003346, + "auxiliary_loss_mlp": 0.01381224, + "balance_loss_clip": 0.06475418, + "balance_loss_mlp": 0.01309221, + "epoch": 0.06318953855403578, + "flos": 24287101793280.0, + "grad_norm": 2.921652621723748, + "language_loss": 0.80915332, + "learning_rate": 3.988456714822575e-06, + "loss": 0.89299899, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.71972656, + "step": 1051, + "time_per_iteration": 2.6563098430633545 + }, + { + "auxiliary_loss_clip": 0.07019964, + "auxiliary_loss_mlp": 0.01395256, + "balance_loss_clip": 0.06461668, + "balance_loss_mlp": 0.01314957, + "epoch": 0.06324966180670374, + "flos": 22535563570560.0, + "grad_norm": 3.4102512673670256, + "language_loss": 0.84142733, + "learning_rate": 3.98841489390281e-06, + "loss": 0.92557955, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 5.57421875, + "router_z_loss_mlp": 0.80224609, + "step": 1052, + "time_per_iteration": 2.6776039600372314 + }, + { + "auxiliary_loss_clip": 0.07036786, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 0.06459802, + "balance_loss_mlp": 0.01299411, + "epoch": 0.06330978505937171, + "flos": 15783465859200.0, + "grad_norm": 2.8507947153873663, + "language_loss": 0.80809307, + "learning_rate": 3.988372997582155e-06, + "loss": 0.89225614, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 5.76953125, + "router_z_loss_mlp": 0.80175781, + "step": 1053, + "time_per_iteration": 2.6043174266815186 + }, + { + "auxiliary_loss_clip": 0.06984901, + "auxiliary_loss_mlp": 0.01368181, + "balance_loss_clip": 0.06446727, + "balance_loss_mlp": 0.0129532, + "epoch": 0.06336990831203967, + "flos": 21477610719360.0, + "grad_norm": 4.159955078588776, + "language_loss": 0.88012934, + "learning_rate": 3.988331025862195e-06, + "loss": 0.96366018, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.72802734, + "step": 1054, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.06987712, + "auxiliary_loss_mlp": 0.01370375, + "balance_loss_clip": 0.06445334, + "balance_loss_mlp": 0.01301568, + "epoch": 0.06343003156470765, + "flos": 18484824839040.0, + "grad_norm": 2.8104304693341837, + "language_loss": 0.89331806, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.97689891, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.68798828, + "step": 1055, + "time_per_iteration": 2.5695717334747314 + }, + { + "auxiliary_loss_clip": 0.07031021, + "auxiliary_loss_mlp": 0.01393239, + "balance_loss_clip": 0.06440826, + "balance_loss_mlp": 0.01302354, + "epoch": 0.06349015481737562, + "flos": 25161801765120.0, + "grad_norm": 4.1133835551619224, + "language_loss": 0.85196388, + "learning_rate": 3.988246856230734e-06, + "loss": 0.93620646, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.90820312, + "step": 1056, + "time_per_iteration": 2.685821056365967 + }, + { + "auxiliary_loss_clip": 0.07029925, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 0.06446205, + "balance_loss_mlp": 0.01319377, + "epoch": 0.06355027807004358, + "flos": 26879322430080.0, + "grad_norm": 5.02877545894497, + "language_loss": 0.84474576, + "learning_rate": 3.988204658322426e-06, + "loss": 0.92913282, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 5.8359375, + "router_z_loss_mlp": 0.89501953, + "step": 1057, + "time_per_iteration": 2.6688387393951416 + }, + { + "auxiliary_loss_clip": 0.06953399, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 0.06428042, + "balance_loss_mlp": 0.01278987, + "epoch": 0.06361040132271156, + "flos": 21402951131520.0, + "grad_norm": 3.9641222811805337, + "language_loss": 0.85986251, + "learning_rate": 3.988162385021196e-06, + "loss": 0.94285542, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 0.66845703, + "step": 1058, + "time_per_iteration": 2.6371591091156006 + }, + { + "auxiliary_loss_clip": 0.0698344, + "auxiliary_loss_mlp": 0.01353949, + "balance_loss_clip": 0.06427366, + "balance_loss_mlp": 0.01275796, + "epoch": 0.06367052457537953, + "flos": 25739959737600.0, + "grad_norm": 3.2277693096185125, + "language_loss": 0.90202904, + "learning_rate": 3.988120036328651e-06, + "loss": 0.98540288, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.78173828, + "step": 1059, + "time_per_iteration": 2.6188669204711914 + }, + { + "auxiliary_loss_clip": 0.06969759, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 0.06422018, + "balance_loss_mlp": 0.01267218, + "epoch": 0.0637306478280475, + "flos": 17635840871040.0, + "grad_norm": 3.450468160359764, + "language_loss": 0.94701946, + "learning_rate": 3.988077612246394e-06, + "loss": 1.0301559, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.76708984, + "step": 1060, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.06957703, + "auxiliary_loss_mlp": 0.0133292, + "balance_loss_clip": 0.06419823, + "balance_loss_mlp": 0.01262396, + "epoch": 0.06379077108071547, + "flos": 13667727864960.0, + "grad_norm": 3.5269486179455622, + "language_loss": 0.91039562, + "learning_rate": 3.988035112776035e-06, + "loss": 0.99330181, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.70483398, + "step": 1061, + "time_per_iteration": 2.595237970352173 + }, + { + "auxiliary_loss_clip": 0.07004992, + "auxiliary_loss_mlp": 0.0134989, + "balance_loss_clip": 0.06433421, + "balance_loss_mlp": 0.01272071, + "epoch": 0.06385089433338344, + "flos": 28486950065280.0, + "grad_norm": 26.387846770017223, + "language_loss": 0.80432439, + "learning_rate": 3.987992537919185e-06, + "loss": 0.88787317, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.77832031, + "step": 1062, + "time_per_iteration": 2.69326114654541 + }, + { + "auxiliary_loss_clip": 0.06971388, + "auxiliary_loss_mlp": 0.01333448, + "balance_loss_clip": 0.06420203, + "balance_loss_mlp": 0.01260349, + "epoch": 0.0639110175860514, + "flos": 24317052428160.0, + "grad_norm": 14.259145516712906, + "language_loss": 0.90426183, + "learning_rate": 3.987949887677459e-06, + "loss": 0.98731029, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 0.73095703, + "step": 1063, + "time_per_iteration": 2.642476797103882 + }, + { + "auxiliary_loss_clip": 0.06974378, + "auxiliary_loss_mlp": 0.01332583, + "balance_loss_clip": 0.06425211, + "balance_loss_mlp": 0.01259436, + "epoch": 0.06397114083871938, + "flos": 22097291189760.0, + "grad_norm": 2.9601227778370176, + "language_loss": 0.82562792, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.90869761, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 5.48828125, + "router_z_loss_mlp": 0.73144531, + "step": 1064, + "time_per_iteration": 2.661435604095459 + }, + { + "auxiliary_loss_clip": 0.06941259, + "auxiliary_loss_mlp": 0.01342729, + "balance_loss_clip": 0.06412596, + "balance_loss_mlp": 0.01271298, + "epoch": 0.06403126409138735, + "flos": 19578849672960.0, + "grad_norm": 3.2505919469988727, + "language_loss": 0.86995006, + "learning_rate": 3.987864361045851e-06, + "loss": 0.95278984, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 5.28515625, + "router_z_loss_mlp": 0.71386719, + "step": 1065, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.06963679, + "auxiliary_loss_mlp": 0.01340247, + "balance_loss_clip": 0.06401139, + "balance_loss_mlp": 0.01265669, + "epoch": 0.06409138734405531, + "flos": 40816968624000.0, + "grad_norm": 2.0842805851080395, + "language_loss": 0.71325147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.79629076, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 5.625, + "router_z_loss_mlp": 0.74609375, + "step": 1066, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.06944396, + "auxiliary_loss_mlp": 0.0133661, + "balance_loss_clip": 0.06404863, + "balance_loss_mlp": 0.01266419, + "epoch": 0.06415151059672328, + "flos": 20446631683200.0, + "grad_norm": 3.9323967107233093, + "language_loss": 0.93839109, + "learning_rate": 3.987778532894181e-06, + "loss": 1.02120125, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70166016, + "step": 1067, + "time_per_iteration": 2.6115174293518066 + }, + { + "auxiliary_loss_clip": 0.06956208, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 0.06410809, + "balance_loss_mlp": 0.01270954, + "epoch": 0.06421163384939126, + "flos": 18077006217600.0, + "grad_norm": 2.3907527813163947, + "language_loss": 0.86262715, + "learning_rate": 3.987735505752391e-06, + "loss": 0.94558918, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.68994141, + "step": 1068, + "time_per_iteration": 2.6069822311401367 + }, + { + "auxiliary_loss_clip": 0.06937677, + "auxiliary_loss_mlp": 0.01339596, + "balance_loss_clip": 0.0640877, + "balance_loss_mlp": 0.01269787, + "epoch": 0.06427175710205922, + "flos": 25126526396160.0, + "grad_norm": 3.0644651013361175, + "language_loss": 0.92719203, + "learning_rate": 3.987692403235471e-06, + "loss": 1.0099647, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.69775391, + "step": 1069, + "time_per_iteration": 2.6751255989074707 + }, + { + "auxiliary_loss_clip": 0.06952519, + "auxiliary_loss_mlp": 0.01331878, + "balance_loss_clip": 0.06402327, + "balance_loss_mlp": 0.01256777, + "epoch": 0.06433188035472719, + "flos": 17385684906240.0, + "grad_norm": 4.001862380962301, + "language_loss": 0.98985177, + "learning_rate": 3.987649225345056e-06, + "loss": 1.07269573, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.75048828, + "step": 1070, + "time_per_iteration": 2.5646464824676514 + }, + { + "auxiliary_loss_clip": 0.06933151, + "auxiliary_loss_mlp": 0.01337757, + "balance_loss_clip": 0.0639724, + "balance_loss_mlp": 0.01267042, + "epoch": 0.06439200360739517, + "flos": 23552371267200.0, + "grad_norm": 2.5082910657712474, + "language_loss": 0.90418053, + "learning_rate": 3.987605972082782e-06, + "loss": 0.98688966, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.70703125, + "step": 1071, + "time_per_iteration": 2.6427106857299805 + }, + { + "auxiliary_loss_clip": 0.06918223, + "auxiliary_loss_mlp": 0.01334321, + "balance_loss_clip": 0.06398708, + "balance_loss_mlp": 0.01262414, + "epoch": 0.06445212686006313, + "flos": 21986014567680.0, + "grad_norm": 1.871300371090536, + "language_loss": 0.79228568, + "learning_rate": 3.987562643450292e-06, + "loss": 0.87481117, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.71923828, + "step": 1072, + "time_per_iteration": 2.647038698196411 + }, + { + "auxiliary_loss_clip": 0.06937171, + "auxiliary_loss_mlp": 0.01329872, + "balance_loss_clip": 0.06401432, + "balance_loss_mlp": 0.01259205, + "epoch": 0.0645122501127311, + "flos": 25928369642880.0, + "grad_norm": 2.655186985808554, + "language_loss": 0.84775895, + "learning_rate": 3.987519239449226e-06, + "loss": 0.9304294, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.70800781, + "step": 1073, + "time_per_iteration": 2.658341646194458 + }, + { + "auxiliary_loss_clip": 0.06906792, + "auxiliary_loss_mlp": 0.01330074, + "balance_loss_clip": 0.06396446, + "balance_loss_mlp": 0.01263412, + "epoch": 0.06457237336539907, + "flos": 25632498476160.0, + "grad_norm": 1.923481252052909, + "language_loss": 0.82366061, + "learning_rate": 3.987475760081233e-06, + "loss": 0.90602928, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66650391, + "step": 1074, + "time_per_iteration": 2.6500589847564697 + }, + { + "auxiliary_loss_clip": 0.06911084, + "auxiliary_loss_mlp": 0.01341632, + "balance_loss_clip": 0.0638795, + "balance_loss_mlp": 0.01268152, + "epoch": 0.06463249661806704, + "flos": 19470088673280.0, + "grad_norm": 4.283359791903129, + "language_loss": 0.82960403, + "learning_rate": 3.987432205347958e-06, + "loss": 0.91213125, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.73486328, + "step": 1075, + "time_per_iteration": 2.620055675506592 + }, + { + "auxiliary_loss_clip": 0.06919183, + "auxiliary_loss_mlp": 0.01329908, + "balance_loss_clip": 0.06393343, + "balance_loss_mlp": 0.01260528, + "epoch": 0.064692619870735, + "flos": 24504833427840.0, + "grad_norm": 4.7074268898703, + "language_loss": 0.90130782, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98379874, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.69335938, + "step": 1076, + "time_per_iteration": 2.6410202980041504 + }, + { + "auxiliary_loss_clip": 0.06917243, + "auxiliary_loss_mlp": 0.01324517, + "balance_loss_clip": 0.06391963, + "balance_loss_mlp": 0.01256901, + "epoch": 0.06475274312340297, + "flos": 17024252319360.0, + "grad_norm": 4.89859871786138, + "language_loss": 0.84430212, + "learning_rate": 3.98734486979218e-06, + "loss": 0.92671967, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 5.25390625, + "router_z_loss_mlp": 0.67578125, + "step": 1077, + "time_per_iteration": 2.6577157974243164 + }, + { + "auxiliary_loss_clip": 0.06961326, + "auxiliary_loss_mlp": 0.0134572, + "balance_loss_clip": 0.06399816, + "balance_loss_mlp": 0.01265659, + "epoch": 0.06481286637607095, + "flos": 24579409161600.0, + "grad_norm": 2.525164880783881, + "language_loss": 0.95071888, + "learning_rate": 3.987301088972986e-06, + "loss": 1.03378928, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.80078125, + "step": 1078, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.0696152, + "auxiliary_loss_mlp": 0.01348441, + "balance_loss_clip": 0.0639492, + "balance_loss_mlp": 0.01266616, + "epoch": 0.06487298962873891, + "flos": 21111985428480.0, + "grad_norm": 2.577127703708103, + "language_loss": 0.81118071, + "learning_rate": 3.987257232795137e-06, + "loss": 0.89428037, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.81835938, + "step": 1079, + "time_per_iteration": 2.6317968368530273 + }, + { + "auxiliary_loss_clip": 0.06928547, + "auxiliary_loss_mlp": 0.01328554, + "balance_loss_clip": 0.06390582, + "balance_loss_mlp": 0.01256837, + "epoch": 0.06493311288140688, + "flos": 24615103800960.0, + "grad_norm": 2.4676521714353865, + "language_loss": 0.72843546, + "learning_rate": 3.987213301260294e-06, + "loss": 0.81100643, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.71728516, + "step": 1080, + "time_per_iteration": 2.6215646266937256 + }, + { + "auxiliary_loss_clip": 0.06919578, + "auxiliary_loss_mlp": 0.01334283, + "balance_loss_clip": 0.06385017, + "balance_loss_mlp": 0.01258323, + "epoch": 0.06499323613407486, + "flos": 25345054644480.0, + "grad_norm": 2.8195024652173233, + "language_loss": 0.76152724, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8440659, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.75927734, + "step": 1081, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.06903991, + "auxiliary_loss_mlp": 0.01330699, + "balance_loss_clip": 0.06382824, + "balance_loss_mlp": 0.01260985, + "epoch": 0.06505335938674282, + "flos": 20381908803840.0, + "grad_norm": 3.8302016885059436, + "language_loss": 0.87991226, + "learning_rate": 3.987125212126294e-06, + "loss": 0.96225917, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.69726562, + "step": 1082, + "time_per_iteration": 3.9682254791259766 + }, + { + "auxiliary_loss_clip": 0.06965172, + "auxiliary_loss_mlp": 0.01343743, + "balance_loss_clip": 0.06394538, + "balance_loss_mlp": 0.01265304, + "epoch": 0.06511348263941079, + "flos": 25344970790400.0, + "grad_norm": 3.078052560557278, + "language_loss": 0.85807657, + "learning_rate": 3.987081054530478e-06, + "loss": 0.94116569, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.78417969, + "step": 1083, + "time_per_iteration": 4.172176361083984 + }, + { + "auxiliary_loss_clip": 0.06918654, + "auxiliary_loss_mlp": 0.01347933, + "balance_loss_clip": 0.06379002, + "balance_loss_mlp": 0.01269684, + "epoch": 0.06517360589207877, + "flos": 20337912610560.0, + "grad_norm": 5.768369350853526, + "language_loss": 0.82737648, + "learning_rate": 3.987036821584348e-06, + "loss": 0.91004241, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.78173828, + "step": 1084, + "time_per_iteration": 2.5647377967834473 + }, + { + "auxiliary_loss_clip": 0.06925946, + "auxiliary_loss_mlp": 0.01344614, + "balance_loss_clip": 0.06381474, + "balance_loss_mlp": 0.0126379, + "epoch": 0.06523372914474673, + "flos": 31688956391040.0, + "grad_norm": 2.8637661589946664, + "language_loss": 0.69041795, + "learning_rate": 3.986992513289584e-06, + "loss": 0.7731235, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 5.44921875, + "router_z_loss_mlp": 0.80908203, + "step": 1085, + "time_per_iteration": 2.6726510524749756 + }, + { + "auxiliary_loss_clip": 0.06912835, + "auxiliary_loss_mlp": 0.01346265, + "balance_loss_clip": 0.06394207, + "balance_loss_mlp": 0.01271496, + "epoch": 0.0652938523974147, + "flos": 20784612326400.0, + "grad_norm": 3.652482458321433, + "language_loss": 0.80282378, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.88541472, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.74707031, + "step": 1086, + "time_per_iteration": 4.0445778369903564 + }, + { + "auxiliary_loss_clip": 0.06903446, + "auxiliary_loss_mlp": 0.01343539, + "balance_loss_clip": 0.06383859, + "balance_loss_mlp": 0.01271489, + "epoch": 0.06535397565008266, + "flos": 16696627655040.0, + "grad_norm": 2.983342921031512, + "language_loss": 0.88718885, + "learning_rate": 3.986903670660872e-06, + "loss": 0.96965867, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.72021484, + "step": 1087, + "time_per_iteration": 2.612272024154663 + }, + { + "auxiliary_loss_clip": 0.06922436, + "auxiliary_loss_mlp": 0.01359561, + "balance_loss_clip": 0.06381297, + "balance_loss_mlp": 0.01282457, + "epoch": 0.06541409890275064, + "flos": 26875171653120.0, + "grad_norm": 4.165814553604834, + "language_loss": 0.81038088, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.89320087, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.77099609, + "step": 1088, + "time_per_iteration": 4.128512620925903 + }, + { + "auxiliary_loss_clip": 0.06905861, + "auxiliary_loss_mlp": 0.01369914, + "balance_loss_clip": 0.0637981, + "balance_loss_mlp": 0.01292333, + "epoch": 0.06547422215541861, + "flos": 20527831889280.0, + "grad_norm": 2.3905965673188043, + "language_loss": 0.73899305, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.82175082, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.77587891, + "step": 1089, + "time_per_iteration": 2.5846424102783203 + }, + { + "auxiliary_loss_clip": 0.06903853, + "auxiliary_loss_mlp": 0.01367809, + "balance_loss_clip": 0.06390744, + "balance_loss_mlp": 0.01297094, + "epoch": 0.06553434540808657, + "flos": 22022925091200.0, + "grad_norm": 2.5933459275490005, + "language_loss": 0.88925481, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.97197139, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.70751953, + "step": 1090, + "time_per_iteration": 2.632730722427368 + }, + { + "auxiliary_loss_clip": 0.06923388, + "auxiliary_loss_mlp": 0.01379562, + "balance_loss_clip": 0.06394897, + "balance_loss_mlp": 0.01304031, + "epoch": 0.06559446866075455, + "flos": 24615648852480.0, + "grad_norm": 5.07637209675267, + "language_loss": 0.7519111, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.83494061, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.75634766, + "step": 1091, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06920849, + "auxiliary_loss_mlp": 0.01367283, + "balance_loss_clip": 0.06403629, + "balance_loss_mlp": 0.01298141, + "epoch": 0.06565459191342252, + "flos": 24280686956160.0, + "grad_norm": 3.183278775232349, + "language_loss": 0.85751635, + "learning_rate": 3.986680245605936e-06, + "loss": 0.94039762, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.69091797, + "step": 1092, + "time_per_iteration": 2.605273962020874 + }, + { + "auxiliary_loss_clip": 0.06938382, + "auxiliary_loss_mlp": 0.01382517, + "balance_loss_clip": 0.06414036, + "balance_loss_mlp": 0.0131123, + "epoch": 0.06571471516609048, + "flos": 24793493143680.0, + "grad_norm": 3.590473362105347, + "language_loss": 0.74473059, + "learning_rate": 3.986635334582814e-06, + "loss": 0.82793957, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.71337891, + "step": 1093, + "time_per_iteration": 2.638237237930298 + }, + { + "auxiliary_loss_clip": 0.06921268, + "auxiliary_loss_mlp": 0.01380472, + "balance_loss_clip": 0.06396792, + "balance_loss_mlp": 0.01303797, + "epoch": 0.06577483841875846, + "flos": 26221347843840.0, + "grad_norm": 88.21387149104662, + "language_loss": 0.90390575, + "learning_rate": 3.986590348226282e-06, + "loss": 0.98692322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 5.2421875, + "router_z_loss_mlp": 0.76660156, + "step": 1094, + "time_per_iteration": 2.6458590030670166 + }, + { + "auxiliary_loss_clip": 0.06927408, + "auxiliary_loss_mlp": 0.01386993, + "balance_loss_clip": 0.06403756, + "balance_loss_mlp": 0.01310603, + "epoch": 0.06583496167142643, + "flos": 25087519520640.0, + "grad_norm": 2.736930049066649, + "language_loss": 0.83897924, + "learning_rate": 3.986545286538044e-06, + "loss": 0.92212319, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.76416016, + "step": 1095, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06935441, + "auxiliary_loss_mlp": 0.01385344, + "balance_loss_clip": 0.06404546, + "balance_loss_mlp": 0.01317443, + "epoch": 0.06589508492409439, + "flos": 25636900815360.0, + "grad_norm": 5.395614329655057, + "language_loss": 0.73154068, + "learning_rate": 3.986500149519811e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.67871094, + "step": 1096, + "time_per_iteration": 2.6446287631988525 + }, + { + "auxiliary_loss_clip": 0.06917029, + "auxiliary_loss_mlp": 0.01365132, + "balance_loss_clip": 0.06399326, + "balance_loss_mlp": 0.01297755, + "epoch": 0.06595520817676236, + "flos": 23627701687680.0, + "grad_norm": 3.583666651431395, + "language_loss": 0.80129099, + "learning_rate": 3.986454937173292e-06, + "loss": 0.8841126, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.67285156, + "step": 1097, + "time_per_iteration": 2.610381603240967 + }, + { + "auxiliary_loss_clip": 0.06948523, + "auxiliary_loss_mlp": 0.01368674, + "balance_loss_clip": 0.0639759, + "balance_loss_mlp": 0.01295384, + "epoch": 0.06601533142943034, + "flos": 33810019119360.0, + "grad_norm": 2.548144949478092, + "language_loss": 0.80388427, + "learning_rate": 3.986409649500203e-06, + "loss": 0.88705623, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 0.73339844, + "step": 1098, + "time_per_iteration": 2.720482110977173 + }, + { + "auxiliary_loss_clip": 0.06938128, + "auxiliary_loss_mlp": 0.01366931, + "balance_loss_clip": 0.06409903, + "balance_loss_mlp": 0.01293498, + "epoch": 0.0660754546820983, + "flos": 20264175417600.0, + "grad_norm": 10.171489722923557, + "language_loss": 0.84726501, + "learning_rate": 3.986364286502261e-06, + "loss": 0.93031561, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.73486328, + "step": 1099, + "time_per_iteration": 2.598655939102173 + }, + { + "auxiliary_loss_clip": 0.06904539, + "auxiliary_loss_mlp": 0.01375441, + "balance_loss_clip": 0.0639468, + "balance_loss_mlp": 0.01307397, + "epoch": 0.06613557793476627, + "flos": 19360195643520.0, + "grad_norm": 3.568327868722517, + "language_loss": 0.8664155, + "learning_rate": 3.986318848181186e-06, + "loss": 0.94921529, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.68066406, + "step": 1100, + "time_per_iteration": 2.577528238296509 + }, + { + "auxiliary_loss_clip": 0.06927315, + "auxiliary_loss_mlp": 0.01369622, + "balance_loss_clip": 0.06391686, + "balance_loss_mlp": 0.01299861, + "epoch": 0.06619570118743424, + "flos": 13777788602880.0, + "grad_norm": 2.758398197018795, + "language_loss": 0.76281518, + "learning_rate": 3.986273334538702e-06, + "loss": 0.84578454, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.69775391, + "step": 1101, + "time_per_iteration": 2.6156139373779297 + }, + { + "auxiliary_loss_clip": 0.06904308, + "auxiliary_loss_mlp": 0.01359683, + "balance_loss_clip": 0.06387865, + "balance_loss_mlp": 0.01295215, + "epoch": 0.06625582444010221, + "flos": 17863593068160.0, + "grad_norm": 4.389912717391851, + "language_loss": 0.89471924, + "learning_rate": 3.986227745576533e-06, + "loss": 0.97735918, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.64501953, + "step": 1102, + "time_per_iteration": 2.569350242614746 + }, + { + "auxiliary_loss_clip": 0.0692213, + "auxiliary_loss_mlp": 0.01377442, + "balance_loss_clip": 0.06385392, + "balance_loss_mlp": 0.01306584, + "epoch": 0.06631594769277017, + "flos": 11843584479360.0, + "grad_norm": 3.5425773042581055, + "language_loss": 0.86216784, + "learning_rate": 3.98618208129641e-06, + "loss": 0.94516355, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.70898438, + "step": 1103, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.06886483, + "auxiliary_loss_mlp": 0.01371541, + "balance_loss_clip": 0.06376658, + "balance_loss_mlp": 0.01305547, + "epoch": 0.06637607094543815, + "flos": 19799683908480.0, + "grad_norm": 2.4626452299406383, + "language_loss": 0.8457936, + "learning_rate": 3.986136341700063e-06, + "loss": 0.92837381, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.66015625, + "step": 1104, + "time_per_iteration": 2.5836308002471924 + }, + { + "auxiliary_loss_clip": 0.06882686, + "auxiliary_loss_mlp": 0.01367781, + "balance_loss_clip": 0.0637526, + "balance_loss_mlp": 0.01303408, + "epoch": 0.06643619419810612, + "flos": 25493032154880.0, + "grad_norm": 1.7655477747418094, + "language_loss": 0.83173895, + "learning_rate": 3.986090526789227e-06, + "loss": 0.91424364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1105, + "time_per_iteration": 2.662261486053467 + }, + { + "auxiliary_loss_clip": 0.06873615, + "auxiliary_loss_mlp": 0.01369586, + "balance_loss_clip": 0.06380346, + "balance_loss_mlp": 0.01308694, + "epoch": 0.06649631745077408, + "flos": 16952234135040.0, + "grad_norm": 2.812403865753697, + "language_loss": 0.99235487, + "learning_rate": 3.986044636565639e-06, + "loss": 1.0747869, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.60839844, + "step": 1106, + "time_per_iteration": 2.55377459526062 + }, + { + "auxiliary_loss_clip": 0.0691068, + "auxiliary_loss_mlp": 0.01368117, + "balance_loss_clip": 0.06380811, + "balance_loss_mlp": 0.01299977, + "epoch": 0.06655644070344206, + "flos": 17864431608960.0, + "grad_norm": 9.796712570365342, + "language_loss": 0.85572082, + "learning_rate": 3.985998671031039e-06, + "loss": 0.93850881, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.68115234, + "step": 1107, + "time_per_iteration": 2.607999324798584 + }, + { + "auxiliary_loss_clip": 0.06769384, + "auxiliary_loss_mlp": 0.01408352, + "balance_loss_clip": 0.06440101, + "balance_loss_mlp": 0.01358189, + "epoch": 0.06661656395611003, + "flos": 61438033779840.0, + "grad_norm": 0.835907980773472, + "language_loss": 0.57139766, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.653175, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.50195312, + "step": 1108, + "time_per_iteration": 3.1505634784698486 + }, + { + "auxiliary_loss_clip": 0.06919513, + "auxiliary_loss_mlp": 0.01358617, + "balance_loss_clip": 0.06388947, + "balance_loss_mlp": 0.01289285, + "epoch": 0.066676687208778, + "flos": 20668304459520.0, + "grad_norm": 4.7813305453067985, + "language_loss": 0.74593651, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.82871783, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69384766, + "step": 1109, + "time_per_iteration": 2.5951621532440186 + }, + { + "auxiliary_loss_clip": 0.06901313, + "auxiliary_loss_mlp": 0.01359309, + "balance_loss_clip": 0.06382284, + "balance_loss_mlp": 0.01292219, + "epoch": 0.06673681046144596, + "flos": 20929613016960.0, + "grad_norm": 2.4423466539648686, + "language_loss": 0.81162918, + "learning_rate": 3.985860322578614e-06, + "loss": 0.89423537, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.66992188, + "step": 1110, + "time_per_iteration": 2.5594658851623535 + }, + { + "auxiliary_loss_clip": 0.06916048, + "auxiliary_loss_mlp": 0.01350686, + "balance_loss_clip": 0.06385787, + "balance_loss_mlp": 0.01283261, + "epoch": 0.06679693371411394, + "flos": 31073762113920.0, + "grad_norm": 3.192640550751645, + "language_loss": 0.74339402, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82606131, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.67431641, + "step": 1111, + "time_per_iteration": 2.6675732135772705 + }, + { + "auxiliary_loss_clip": 0.0692247, + "auxiliary_loss_mlp": 0.01336011, + "balance_loss_clip": 0.0638883, + "balance_loss_mlp": 0.01269492, + "epoch": 0.0668570569667819, + "flos": 21732630220800.0, + "grad_norm": 3.09844838926034, + "language_loss": 0.81051421, + "learning_rate": 3.985767713753971e-06, + "loss": 0.89309895, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.66455078, + "step": 1112, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.06900664, + "auxiliary_loss_mlp": 0.01347702, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01282185, + "epoch": 0.06691718021944987, + "flos": 22753840256640.0, + "grad_norm": 2.9756537070092466, + "language_loss": 0.82400674, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90649039, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.65576172, + "step": 1113, + "time_per_iteration": 2.6159799098968506 + }, + { + "auxiliary_loss_clip": 0.06872059, + "auxiliary_loss_mlp": 0.01337269, + "balance_loss_clip": 0.06376456, + "balance_loss_mlp": 0.01280382, + "epoch": 0.06697730347211785, + "flos": 16551333475200.0, + "grad_norm": 3.049422068587495, + "language_loss": 0.85146165, + "learning_rate": 3.985674803727289e-06, + "loss": 0.93355489, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.56884766, + "step": 1114, + "time_per_iteration": 2.5442495346069336 + }, + { + "auxiliary_loss_clip": 0.06720632, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 0.06393555, + "balance_loss_mlp": 0.01264675, + "epoch": 0.06703742672478581, + "flos": 59801545612800.0, + "grad_norm": 0.814822871226623, + "language_loss": 0.58299243, + "learning_rate": 3.985628235767584e-06, + "loss": 0.66331041, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46435547, + "step": 1115, + "time_per_iteration": 3.1831469535827637 + }, + { + "auxiliary_loss_clip": 0.06912658, + "auxiliary_loss_mlp": 0.01326736, + "balance_loss_clip": 0.06393988, + "balance_loss_mlp": 0.01261314, + "epoch": 0.06709754997745378, + "flos": 16805807925120.0, + "grad_norm": 5.78180725653176, + "language_loss": 0.94695258, + "learning_rate": 3.985581592512658e-06, + "loss": 1.02934647, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.65332031, + "step": 1116, + "time_per_iteration": 2.6025443077087402 + }, + { + "auxiliary_loss_clip": 0.06950381, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 0.06407215, + "balance_loss_mlp": 0.01283105, + "epoch": 0.06715767323012176, + "flos": 22129883228160.0, + "grad_norm": 3.297350824619057, + "language_loss": 0.90161335, + "learning_rate": 3.985534873964279e-06, + "loss": 0.98464012, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 5.42578125, + "router_z_loss_mlp": 0.69189453, + "step": 1117, + "time_per_iteration": 2.640014410018921 + }, + { + "auxiliary_loss_clip": 0.06703123, + "auxiliary_loss_mlp": 0.01296382, + "balance_loss_clip": 0.06378835, + "balance_loss_mlp": 0.01254898, + "epoch": 0.06721779648278972, + "flos": 66634522842240.0, + "grad_norm": 0.828477744144983, + "language_loss": 0.59793437, + "learning_rate": 3.985488080124218e-06, + "loss": 0.67792934, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.41503906, + "step": 1118, + "time_per_iteration": 3.1895816326141357 + }, + { + "auxiliary_loss_clip": 0.0694533, + "auxiliary_loss_mlp": 0.0134688, + "balance_loss_clip": 0.06400572, + "balance_loss_mlp": 0.0127092, + "epoch": 0.06727791973545769, + "flos": 22389011579520.0, + "grad_norm": 4.072656467009049, + "language_loss": 0.87426257, + "learning_rate": 3.985441210994251e-06, + "loss": 0.95718467, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.76025391, + "step": 1119, + "time_per_iteration": 2.588590621948242 + }, + { + "auxiliary_loss_clip": 0.0690966, + "auxiliary_loss_mlp": 0.01331486, + "balance_loss_clip": 0.06396869, + "balance_loss_mlp": 0.01269116, + "epoch": 0.06733804298812565, + "flos": 24287143720320.0, + "grad_norm": 3.964620176038611, + "language_loss": 0.88010037, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9625119, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.62451172, + "step": 1120, + "time_per_iteration": 2.6959142684936523 + }, + { + "auxiliary_loss_clip": 0.06922112, + "auxiliary_loss_mlp": 0.01340271, + "balance_loss_clip": 0.06406626, + "balance_loss_mlp": 0.01275421, + "epoch": 0.06739816624079363, + "flos": 15922638691200.0, + "grad_norm": 2.824028723834481, + "language_loss": 0.81958008, + "learning_rate": 3.985347246871708e-06, + "loss": 0.90220392, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.6484375, + "step": 1121, + "time_per_iteration": 2.5337889194488525 + }, + { + "auxiliary_loss_clip": 0.0669936, + "auxiliary_loss_mlp": 0.01328619, + "balance_loss_clip": 0.0637704, + "balance_loss_mlp": 0.01291044, + "epoch": 0.0674582894934616, + "flos": 71422031796480.0, + "grad_norm": 0.7591545371637793, + "language_loss": 0.58392835, + "learning_rate": 3.985300151882694e-06, + "loss": 0.66420811, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.375, + "step": 1122, + "time_per_iteration": 4.871971130371094 + }, + { + "auxiliary_loss_clip": 0.06934178, + "auxiliary_loss_mlp": 0.01339594, + "balance_loss_clip": 0.06410946, + "balance_loss_mlp": 0.01275269, + "epoch": 0.06751841274612956, + "flos": 25271988284160.0, + "grad_norm": 2.7004693252579286, + "language_loss": 0.75033748, + "learning_rate": 3.985252981610901e-06, + "loss": 0.83307523, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.64355469, + "step": 1123, + "time_per_iteration": 4.122293472290039 + }, + { + "auxiliary_loss_clip": 0.06974602, + "auxiliary_loss_mlp": 0.0135696, + "balance_loss_clip": 0.06425263, + "balance_loss_mlp": 0.01278282, + "epoch": 0.06757853599879754, + "flos": 23809067850240.0, + "grad_norm": 9.643312426369809, + "language_loss": 0.82052922, + "learning_rate": 3.985205736058114e-06, + "loss": 0.90384483, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 5.49609375, + "router_z_loss_mlp": 0.78637695, + "step": 1124, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.06911455, + "auxiliary_loss_mlp": 0.01341629, + "balance_loss_clip": 0.06401114, + "balance_loss_mlp": 0.01274705, + "epoch": 0.0676386592514655, + "flos": 21040260733440.0, + "grad_norm": 3.063274936287039, + "language_loss": 0.74925935, + "learning_rate": 3.985158415226128e-06, + "loss": 0.83179009, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.66870117, + "step": 1125, + "time_per_iteration": 3.984415292739868 + }, + { + "auxiliary_loss_clip": 0.0694951, + "auxiliary_loss_mlp": 0.01360506, + "balance_loss_clip": 0.06422167, + "balance_loss_mlp": 0.01290745, + "epoch": 0.06769878250413347, + "flos": 25563331330560.0, + "grad_norm": 3.6371795971434935, + "language_loss": 0.84025776, + "learning_rate": 3.985111019116736e-06, + "loss": 0.92335784, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.69726562, + "step": 1126, + "time_per_iteration": 2.6536872386932373 + }, + { + "auxiliary_loss_clip": 0.06684255, + "auxiliary_loss_mlp": 0.01367323, + "balance_loss_clip": 0.06366412, + "balance_loss_mlp": 0.01329891, + "epoch": 0.06775890575680145, + "flos": 70676316385920.0, + "grad_norm": 0.9685337357274917, + "language_loss": 0.60214978, + "learning_rate": 3.985063547731735e-06, + "loss": 0.68266553, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.37353516, + "step": 1127, + "time_per_iteration": 3.2334144115448 + }, + { + "auxiliary_loss_clip": 0.06927685, + "auxiliary_loss_mlp": 0.01345826, + "balance_loss_clip": 0.0640737, + "balance_loss_mlp": 0.01276304, + "epoch": 0.06781902900946941, + "flos": 24241051175040.0, + "grad_norm": 3.0319163993738307, + "language_loss": 0.83925569, + "learning_rate": 3.985016001072925e-06, + "loss": 0.92199081, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 1128, + "time_per_iteration": 4.002989053726196 + }, + { + "auxiliary_loss_clip": 0.06986301, + "auxiliary_loss_mlp": 0.01369711, + "balance_loss_clip": 0.06426411, + "balance_loss_mlp": 0.01288792, + "epoch": 0.06787915226213738, + "flos": 22423825751040.0, + "grad_norm": 5.128906887201041, + "language_loss": 0.79490405, + "learning_rate": 3.984968379142109e-06, + "loss": 0.87846416, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.80908203, + "step": 1129, + "time_per_iteration": 2.6091246604919434 + }, + { + "auxiliary_loss_clip": 0.06950344, + "auxiliary_loss_mlp": 0.0134506, + "balance_loss_clip": 0.06413193, + "balance_loss_mlp": 0.01275251, + "epoch": 0.06793927551480534, + "flos": 37716092576640.0, + "grad_norm": 7.724208809946286, + "language_loss": 0.75193048, + "learning_rate": 3.984920681941094e-06, + "loss": 0.83488452, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 5.37109375, + "router_z_loss_mlp": 0.69873047, + "step": 1130, + "time_per_iteration": 2.747319221496582 + }, + { + "auxiliary_loss_clip": 0.06924557, + "auxiliary_loss_mlp": 0.01342805, + "balance_loss_clip": 0.06402417, + "balance_loss_mlp": 0.01275428, + "epoch": 0.06799939876747332, + "flos": 20637682992000.0, + "grad_norm": 3.4742611596039583, + "language_loss": 0.83601421, + "learning_rate": 3.984872909471688e-06, + "loss": 0.91868782, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.67333984, + "step": 1131, + "time_per_iteration": 2.619173765182495 + }, + { + "auxiliary_loss_clip": 0.06889838, + "auxiliary_loss_mlp": 0.01323899, + "balance_loss_clip": 0.06390625, + "balance_loss_mlp": 0.01266011, + "epoch": 0.06805952202014129, + "flos": 14869759011840.0, + "grad_norm": 6.452833361572522, + "language_loss": 0.83523953, + "learning_rate": 3.984825061735701e-06, + "loss": 0.91737688, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 4.99609375, + "router_z_loss_mlp": 0.57958984, + "step": 1132, + "time_per_iteration": 2.5897791385650635 + }, + { + "auxiliary_loss_clip": 0.06909724, + "auxiliary_loss_mlp": 0.01329094, + "balance_loss_clip": 0.06400912, + "balance_loss_mlp": 0.0126813, + "epoch": 0.06811964527280925, + "flos": 48920710147200.0, + "grad_norm": 2.2815724812180056, + "language_loss": 0.66480637, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.74719459, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.61035156, + "step": 1133, + "time_per_iteration": 2.830873966217041 + }, + { + "auxiliary_loss_clip": 0.06951424, + "auxiliary_loss_mlp": 0.01351356, + "balance_loss_clip": 0.06402567, + "balance_loss_mlp": 0.0127573, + "epoch": 0.06817976852547723, + "flos": 15382649053440.0, + "grad_norm": 2.526233551435035, + "language_loss": 0.78033423, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86336207, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.75634766, + "step": 1134, + "time_per_iteration": 2.5770034790039062 + }, + { + "auxiliary_loss_clip": 0.06920115, + "auxiliary_loss_mlp": 0.0133773, + "balance_loss_clip": 0.06399941, + "balance_loss_mlp": 0.01275216, + "epoch": 0.0682398917781452, + "flos": 20161661546880.0, + "grad_norm": 3.170480536995333, + "language_loss": 0.89855266, + "learning_rate": 3.984681066946423e-06, + "loss": 0.98113102, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.625, + "step": 1135, + "time_per_iteration": 2.574153423309326 + }, + { + "auxiliary_loss_clip": 0.06912802, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 0.06390901, + "balance_loss_mlp": 0.01268723, + "epoch": 0.06830001503081316, + "flos": 23447341774080.0, + "grad_norm": 4.323885929511343, + "language_loss": 0.81566894, + "learning_rate": 3.984632918162291e-06, + "loss": 0.89819562, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 5.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 1136, + "time_per_iteration": 2.632093906402588 + }, + { + "auxiliary_loss_clip": 0.0691568, + "auxiliary_loss_mlp": 0.01339988, + "balance_loss_clip": 0.06395651, + "balance_loss_mlp": 0.01271133, + "epoch": 0.06836013828348114, + "flos": 34358352238080.0, + "grad_norm": 3.452027949613855, + "language_loss": 0.86628962, + "learning_rate": 3.984584694120679e-06, + "loss": 0.94884622, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.68798828, + "step": 1137, + "time_per_iteration": 2.7281885147094727 + }, + { + "auxiliary_loss_clip": 0.0688309, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 0.06381994, + "balance_loss_mlp": 0.01269736, + "epoch": 0.06842026153614911, + "flos": 23155537530240.0, + "grad_norm": 8.291551749105667, + "language_loss": 0.81329322, + "learning_rate": 3.984536394823418e-06, + "loss": 0.89544761, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.62646484, + "step": 1138, + "time_per_iteration": 2.605118989944458 + }, + { + "auxiliary_loss_clip": 0.06915967, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.06396595, + "balance_loss_mlp": 0.01263808, + "epoch": 0.06848038478881707, + "flos": 24616026195840.0, + "grad_norm": 3.6376188064113704, + "language_loss": 0.88301587, + "learning_rate": 3.984488020272336e-06, + "loss": 0.96548682, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.67382812, + "step": 1139, + "time_per_iteration": 2.5919554233551025 + }, + { + "auxiliary_loss_clip": 0.06913859, + "auxiliary_loss_mlp": 0.01335261, + "balance_loss_clip": 0.0640454, + "balance_loss_mlp": 0.01272175, + "epoch": 0.06854050804148504, + "flos": 40890663889920.0, + "grad_norm": 3.4360954602414515, + "language_loss": 0.78086925, + "learning_rate": 3.984439570469271e-06, + "loss": 0.8633604, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.6315918, + "step": 1140, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.06922249, + "auxiliary_loss_mlp": 0.01343333, + "balance_loss_clip": 0.06401816, + "balance_loss_mlp": 0.01273191, + "epoch": 0.06860063129415302, + "flos": 31694448833280.0, + "grad_norm": 3.650068739701382, + "language_loss": 0.7214306, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.80408645, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 5.1953125, + "router_z_loss_mlp": 0.70166016, + "step": 1141, + "time_per_iteration": 2.661224603652954 + }, + { + "auxiliary_loss_clip": 0.06967719, + "auxiliary_loss_mlp": 0.0134803, + "balance_loss_clip": 0.06416196, + "balance_loss_mlp": 0.01274931, + "epoch": 0.06866075454682098, + "flos": 26549265997440.0, + "grad_norm": 3.4867433558806664, + "language_loss": 0.81973946, + "learning_rate": 3.984342445114538e-06, + "loss": 0.902897, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.73095703, + "step": 1142, + "time_per_iteration": 2.6615188121795654 + }, + { + "auxiliary_loss_clip": 0.06894746, + "auxiliary_loss_mlp": 0.01330861, + "balance_loss_clip": 0.06396586, + "balance_loss_mlp": 0.01266488, + "epoch": 0.06872087779948895, + "flos": 29797658357760.0, + "grad_norm": 2.7600235318020157, + "language_loss": 0.71011055, + "learning_rate": 3.984293769566553e-06, + "loss": 0.79236662, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.64404297, + "step": 1143, + "time_per_iteration": 2.6366419792175293 + }, + { + "auxiliary_loss_clip": 0.06881121, + "auxiliary_loss_mlp": 0.01324263, + "balance_loss_clip": 0.06384973, + "balance_loss_mlp": 0.01260987, + "epoch": 0.06878100105215693, + "flos": 26948070305280.0, + "grad_norm": 2.948232373137099, + "language_loss": 0.77426863, + "learning_rate": 3.98424501877395e-06, + "loss": 0.85632247, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.63232422, + "step": 1144, + "time_per_iteration": 2.6423499584198 + }, + { + "auxiliary_loss_clip": 0.06941762, + "auxiliary_loss_mlp": 0.01342145, + "balance_loss_clip": 0.0640377, + "balance_loss_mlp": 0.01268617, + "epoch": 0.06884112430482489, + "flos": 10675361255040.0, + "grad_norm": 11.35172742857112, + "language_loss": 0.95204943, + "learning_rate": 3.984196192738577e-06, + "loss": 1.03488851, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 5.37890625, + "router_z_loss_mlp": 0.73486328, + "step": 1145, + "time_per_iteration": 2.5397605895996094 + }, + { + "auxiliary_loss_clip": 0.06956828, + "auxiliary_loss_mlp": 0.01350992, + "balance_loss_clip": 0.06409793, + "balance_loss_mlp": 0.01275032, + "epoch": 0.06890124755749286, + "flos": 20199871808640.0, + "grad_norm": 2.888200090327115, + "language_loss": 0.85492933, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93800759, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.76025391, + "step": 1146, + "time_per_iteration": 2.594526529312134 + }, + { + "auxiliary_loss_clip": 0.06872599, + "auxiliary_loss_mlp": 0.01322623, + "balance_loss_clip": 0.06383249, + "balance_loss_mlp": 0.01261373, + "epoch": 0.06896137081016084, + "flos": 20455520215680.0, + "grad_norm": 3.1845992476426472, + "language_loss": 0.87540007, + "learning_rate": 3.98409831494693e-06, + "loss": 0.95735222, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.61303711, + "step": 1147, + "time_per_iteration": 2.583275556564331 + }, + { + "auxiliary_loss_clip": 0.06904457, + "auxiliary_loss_mlp": 0.01331833, + "balance_loss_clip": 0.06408815, + "balance_loss_mlp": 0.01268628, + "epoch": 0.0690214940628288, + "flos": 18374512538880.0, + "grad_norm": 2.487655094523106, + "language_loss": 0.88253343, + "learning_rate": 3.984049263194367e-06, + "loss": 0.96489632, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.63232422, + "step": 1148, + "time_per_iteration": 2.6046411991119385 + }, + { + "auxiliary_loss_clip": 0.06914362, + "auxiliary_loss_mlp": 0.01331137, + "balance_loss_clip": 0.0640358, + "balance_loss_mlp": 0.01259516, + "epoch": 0.06908161731549677, + "flos": 20564239288320.0, + "grad_norm": 4.03707404203517, + "language_loss": 0.7250514, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.80750638, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.71606445, + "step": 1149, + "time_per_iteration": 2.598886489868164 + }, + { + "auxiliary_loss_clip": 0.06921704, + "auxiliary_loss_mlp": 0.01339506, + "balance_loss_clip": 0.06409335, + "balance_loss_mlp": 0.01271891, + "epoch": 0.06914174056816474, + "flos": 27571104938880.0, + "grad_norm": 5.60622478722484, + "language_loss": 0.87750047, + "learning_rate": 3.983950933985064e-06, + "loss": 0.96011257, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.67626953, + "step": 1150, + "time_per_iteration": 2.618924379348755 + }, + { + "auxiliary_loss_clip": 0.06931552, + "auxiliary_loss_mlp": 0.01344517, + "balance_loss_clip": 0.06421608, + "balance_loss_mlp": 0.01277283, + "epoch": 0.06920186382083271, + "flos": 15309331130880.0, + "grad_norm": 4.140310732721626, + "language_loss": 0.85321879, + "learning_rate": 3.983901656532052e-06, + "loss": 0.93597955, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.671875, + "step": 1151, + "time_per_iteration": 2.561635971069336 + }, + { + "auxiliary_loss_clip": 0.06954889, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 0.06432007, + "balance_loss_mlp": 0.01262987, + "epoch": 0.06926198707350067, + "flos": 25198125310080.0, + "grad_norm": 6.641784633133515, + "language_loss": 0.8773886, + "learning_rate": 3.983852303849291e-06, + "loss": 0.96024776, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.68066406, + "step": 1152, + "time_per_iteration": 2.610301971435547 + }, + { + "auxiliary_loss_clip": 0.06939621, + "auxiliary_loss_mlp": 0.01350234, + "balance_loss_clip": 0.06435804, + "balance_loss_mlp": 0.01282142, + "epoch": 0.06932211032616864, + "flos": 13260328513920.0, + "grad_norm": 2.8280818960049046, + "language_loss": 0.93534935, + "learning_rate": 3.983802875938651e-06, + "loss": 1.01824796, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.68066406, + "step": 1153, + "time_per_iteration": 2.595799207687378 + }, + { + "auxiliary_loss_clip": 0.06937614, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 0.06424908, + "balance_loss_mlp": 0.01280687, + "epoch": 0.06938223357883662, + "flos": 24834386736000.0, + "grad_norm": 3.275555077522592, + "language_loss": 0.83502865, + "learning_rate": 3.983753372802008e-06, + "loss": 0.91786921, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.65771484, + "step": 1154, + "time_per_iteration": 2.615935802459717 + }, + { + "auxiliary_loss_clip": 0.06924553, + "auxiliary_loss_mlp": 0.01343071, + "balance_loss_clip": 0.06417688, + "balance_loss_mlp": 0.01275837, + "epoch": 0.06944235683150458, + "flos": 27274730647680.0, + "grad_norm": 2.790851822686811, + "language_loss": 0.77858025, + "learning_rate": 3.983703794441237e-06, + "loss": 0.86125654, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.67285156, + "step": 1155, + "time_per_iteration": 2.6646928787231445 + }, + { + "auxiliary_loss_clip": 0.06934217, + "auxiliary_loss_mlp": 0.01349275, + "balance_loss_clip": 0.06429212, + "balance_loss_mlp": 0.01284616, + "epoch": 0.06950248008417255, + "flos": 25814493544320.0, + "grad_norm": 4.449978036613599, + "language_loss": 0.73122412, + "learning_rate": 3.98365414085822e-06, + "loss": 0.81405902, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 5.05078125, + "router_z_loss_mlp": 0.64697266, + "step": 1156, + "time_per_iteration": 2.6129708290100098 + }, + { + "auxiliary_loss_clip": 0.06933945, + "auxiliary_loss_mlp": 0.0134792, + "balance_loss_clip": 0.06418756, + "balance_loss_mlp": 0.01275202, + "epoch": 0.06956260333684053, + "flos": 22277818811520.0, + "grad_norm": 6.490327446037073, + "language_loss": 0.77343124, + "learning_rate": 3.98360441205484e-06, + "loss": 0.85624993, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.7265625, + "step": 1157, + "time_per_iteration": 2.617549419403076 + }, + { + "auxiliary_loss_clip": 0.06920086, + "auxiliary_loss_mlp": 0.01334116, + "balance_loss_clip": 0.06410048, + "balance_loss_mlp": 0.01268265, + "epoch": 0.0696227265895085, + "flos": 29689442409600.0, + "grad_norm": 3.2808507481159785, + "language_loss": 0.7421459, + "learning_rate": 3.983554608032982e-06, + "loss": 0.8246879, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.65869141, + "step": 1158, + "time_per_iteration": 2.649886131286621 + }, + { + "auxiliary_loss_clip": 0.0693851, + "auxiliary_loss_mlp": 0.01343202, + "balance_loss_clip": 0.06428596, + "balance_loss_mlp": 0.01279401, + "epoch": 0.06968284984217646, + "flos": 25531158562560.0, + "grad_norm": 2.8574838231568687, + "language_loss": 0.82572293, + "learning_rate": 3.983504728794533e-06, + "loss": 0.90854007, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 5.09765625, + "router_z_loss_mlp": 0.63818359, + "step": 1159, + "time_per_iteration": 2.657604694366455 + }, + { + "auxiliary_loss_clip": 0.06916194, + "auxiliary_loss_mlp": 0.01333029, + "balance_loss_clip": 0.06403087, + "balance_loss_mlp": 0.01260598, + "epoch": 0.06974297309484444, + "flos": 20703454047360.0, + "grad_norm": 4.319041132998911, + "language_loss": 0.83704364, + "learning_rate": 3.983454774341387e-06, + "loss": 0.91953588, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.72460938, + "step": 1160, + "time_per_iteration": 2.5699267387390137 + }, + { + "auxiliary_loss_clip": 0.06909285, + "auxiliary_loss_mlp": 0.01331612, + "balance_loss_clip": 0.06406631, + "balance_loss_mlp": 0.01266857, + "epoch": 0.0698030963475124, + "flos": 26512397400960.0, + "grad_norm": 2.5893552087800598, + "language_loss": 0.78334123, + "learning_rate": 3.983404744675437e-06, + "loss": 0.86575019, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.64794922, + "step": 1161, + "time_per_iteration": 4.190939664840698 + }, + { + "auxiliary_loss_clip": 0.06900249, + "auxiliary_loss_mlp": 0.0132851, + "balance_loss_clip": 0.06396457, + "balance_loss_mlp": 0.01263899, + "epoch": 0.06986321960018037, + "flos": 23047279655040.0, + "grad_norm": 6.695162889354259, + "language_loss": 0.8492136, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.93150115, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 5.0390625, + "router_z_loss_mlp": 0.64575195, + "step": 1162, + "time_per_iteration": 2.639911413192749 + }, + { + "auxiliary_loss_clip": 0.06873773, + "auxiliary_loss_mlp": 0.01325161, + "balance_loss_clip": 0.06388026, + "balance_loss_mlp": 0.01266557, + "epoch": 0.06992334285284833, + "flos": 28592356901760.0, + "grad_norm": 3.1892890701678778, + "language_loss": 0.82525402, + "learning_rate": 3.983304459712716e-06, + "loss": 0.90724337, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.58642578, + "step": 1163, + "time_per_iteration": 4.1009368896484375 + }, + { + "auxiliary_loss_clip": 0.06902477, + "auxiliary_loss_mlp": 0.0132859, + "balance_loss_clip": 0.06390633, + "balance_loss_mlp": 0.01260832, + "epoch": 0.06998346610551631, + "flos": 20601694863360.0, + "grad_norm": 2.8425577951758956, + "language_loss": 0.8088491, + "learning_rate": 3.983254204419749e-06, + "loss": 0.89115977, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.67773438, + "step": 1164, + "time_per_iteration": 2.6123766899108887 + }, + { + "auxiliary_loss_clip": 0.06897761, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06385773, + "balance_loss_mlp": 0.012589, + "epoch": 0.07004358935818428, + "flos": 22535437789440.0, + "grad_norm": 2.2246598791524903, + "language_loss": 0.75642318, + "learning_rate": 3.983203873921583e-06, + "loss": 0.83864021, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.64941406, + "step": 1165, + "time_per_iteration": 4.041048288345337 + }, + { + "auxiliary_loss_clip": 0.06871405, + "auxiliary_loss_mlp": 0.01319453, + "balance_loss_clip": 0.06375992, + "balance_loss_mlp": 0.01258847, + "epoch": 0.07010371261085224, + "flos": 28957646776320.0, + "grad_norm": 2.442665636555923, + "language_loss": 0.83451885, + "learning_rate": 3.983153468220128e-06, + "loss": 0.91642749, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.60668945, + "step": 1166, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.06883232, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 0.06374976, + "balance_loss_mlp": 0.01257599, + "epoch": 0.07016383586352022, + "flos": 23665870022400.0, + "grad_norm": 2.9279177018628393, + "language_loss": 0.87250483, + "learning_rate": 3.983102987317295e-06, + "loss": 0.95452112, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.60791016, + "step": 1167, + "time_per_iteration": 3.997807502746582 + }, + { + "auxiliary_loss_clip": 0.06869654, + "auxiliary_loss_mlp": 0.01315759, + "balance_loss_clip": 0.0637234, + "balance_loss_mlp": 0.01256608, + "epoch": 0.07022395911618819, + "flos": 19798258389120.0, + "grad_norm": 3.2057139816430826, + "language_loss": 0.9293927, + "learning_rate": 3.983052431214997e-06, + "loss": 1.01124692, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.59106445, + "step": 1168, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.06893629, + "auxiliary_loss_mlp": 0.01330714, + "balance_loss_clip": 0.06368282, + "balance_loss_mlp": 0.01258331, + "epoch": 0.07028408236885615, + "flos": 21695551989120.0, + "grad_norm": 11.495675802169094, + "language_loss": 0.91365838, + "learning_rate": 3.983001799915153e-06, + "loss": 0.99590182, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 5.24609375, + "router_z_loss_mlp": 0.72363281, + "step": 1169, + "time_per_iteration": 2.647975444793701 + }, + { + "auxiliary_loss_clip": 0.06888205, + "auxiliary_loss_mlp": 0.01328046, + "balance_loss_clip": 0.06373216, + "balance_loss_mlp": 0.01262696, + "epoch": 0.07034420562152413, + "flos": 25637445866880.0, + "grad_norm": 2.8251979605986515, + "language_loss": 0.87019682, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95235932, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 5.14453125, + "router_z_loss_mlp": 0.65356445, + "step": 1170, + "time_per_iteration": 2.6168391704559326 + }, + { + "auxiliary_loss_clip": 0.06855451, + "auxiliary_loss_mlp": 0.01322256, + "balance_loss_clip": 0.06370235, + "balance_loss_mlp": 0.01265703, + "epoch": 0.0704043288741921, + "flos": 20816198115840.0, + "grad_norm": 5.8134102676021175, + "language_loss": 0.77777052, + "learning_rate": 3.982900311730506e-06, + "loss": 0.85954762, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.56542969, + "step": 1171, + "time_per_iteration": 2.5752956867218018 + }, + { + "auxiliary_loss_clip": 0.06854077, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 0.06365283, + "balance_loss_mlp": 0.01268191, + "epoch": 0.07046445212686006, + "flos": 25600241854080.0, + "grad_norm": 2.1487650465547463, + "language_loss": 0.92066246, + "learning_rate": 3.9828494548495514e-06, + "loss": 1.00245833, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.57373047, + "step": 1172, + "time_per_iteration": 2.6476805210113525 + }, + { + "auxiliary_loss_clip": 0.06885421, + "auxiliary_loss_mlp": 0.01324663, + "balance_loss_clip": 0.06371161, + "balance_loss_mlp": 0.01262006, + "epoch": 0.07052457537952803, + "flos": 25564086017280.0, + "grad_norm": 2.603738764291359, + "language_loss": 0.84748065, + "learning_rate": 3.982798522778748e-06, + "loss": 0.92958152, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 5.140625, + "router_z_loss_mlp": 0.62695312, + "step": 1173, + "time_per_iteration": 2.6071321964263916 + }, + { + "auxiliary_loss_clip": 0.06857952, + "auxiliary_loss_mlp": 0.01331109, + "balance_loss_clip": 0.06368312, + "balance_loss_mlp": 0.01273054, + "epoch": 0.070584698632196, + "flos": 17974450419840.0, + "grad_norm": 3.5775835502164868, + "language_loss": 0.85116845, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.9330591, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58129883, + "step": 1174, + "time_per_iteration": 2.57753324508667 + }, + { + "auxiliary_loss_clip": 0.06853965, + "auxiliary_loss_mlp": 0.01334878, + "balance_loss_clip": 0.06364483, + "balance_loss_mlp": 0.01276847, + "epoch": 0.07064482188486397, + "flos": 25377353193600.0, + "grad_norm": 2.5795508468108053, + "language_loss": 0.87789464, + "learning_rate": 3.982696433075317e-06, + "loss": 0.95978308, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.58056641, + "step": 1175, + "time_per_iteration": 2.610611915588379 + }, + { + "auxiliary_loss_clip": 0.06871554, + "auxiliary_loss_mlp": 0.01331862, + "balance_loss_clip": 0.06373453, + "balance_loss_mlp": 0.0127116, + "epoch": 0.07070494513753194, + "flos": 24906782263680.0, + "grad_norm": 2.676154874226604, + "language_loss": 0.87147272, + "learning_rate": 3.982645275446563e-06, + "loss": 0.95350683, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.60644531, + "step": 1176, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.06855497, + "auxiliary_loss_mlp": 0.01331059, + "balance_loss_clip": 0.06369121, + "balance_loss_mlp": 0.01272075, + "epoch": 0.07076506839019991, + "flos": 22343715648000.0, + "grad_norm": 7.137695949749425, + "language_loss": 0.76855987, + "learning_rate": 3.982594042635701e-06, + "loss": 0.85042542, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.58984375, + "step": 1177, + "time_per_iteration": 2.57594895362854 + }, + { + "auxiliary_loss_clip": 0.06883623, + "auxiliary_loss_mlp": 0.0132835, + "balance_loss_clip": 0.06377017, + "balance_loss_mlp": 0.01265599, + "epoch": 0.07082519164286788, + "flos": 18666694126080.0, + "grad_norm": 2.8035814441303164, + "language_loss": 0.8769573, + "learning_rate": 3.982542734644673e-06, + "loss": 0.959077, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.62695312, + "step": 1178, + "time_per_iteration": 2.6013543605804443 + }, + { + "auxiliary_loss_clip": 0.06703987, + "auxiliary_loss_mlp": 0.0134181, + "balance_loss_clip": 0.06385635, + "balance_loss_mlp": 0.01304808, + "epoch": 0.07088531489553584, + "flos": 63674691615360.0, + "grad_norm": 0.8655968349167181, + "language_loss": 0.63642812, + "learning_rate": 3.982491351475427e-06, + "loss": 0.71688616, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36938477, + "step": 1179, + "time_per_iteration": 3.3081142902374268 + }, + { + "auxiliary_loss_clip": 0.06890059, + "auxiliary_loss_mlp": 0.01335612, + "balance_loss_clip": 0.06383069, + "balance_loss_mlp": 0.01270047, + "epoch": 0.07094543814820382, + "flos": 21577902456960.0, + "grad_norm": 4.088495173814758, + "language_loss": 0.87769747, + "learning_rate": 3.98243989312991e-06, + "loss": 0.9599542, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.65625, + "step": 1180, + "time_per_iteration": 2.559685707092285 + }, + { + "auxiliary_loss_clip": 0.06872466, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 0.06370541, + "balance_loss_mlp": 0.01274754, + "epoch": 0.07100556140087179, + "flos": 22096326867840.0, + "grad_norm": 6.479686279022214, + "language_loss": 0.90814912, + "learning_rate": 3.982388359610074e-06, + "loss": 0.99026984, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.6484375, + "step": 1181, + "time_per_iteration": 2.616978883743286 + }, + { + "auxiliary_loss_clip": 0.06848356, + "auxiliary_loss_mlp": 0.01339504, + "balance_loss_clip": 0.06372169, + "balance_loss_mlp": 0.01279351, + "epoch": 0.07106568465353975, + "flos": 47933056471680.0, + "grad_norm": 6.025910143763993, + "language_loss": 0.86037725, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.94225585, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.60131836, + "step": 1182, + "time_per_iteration": 2.7946407794952393 + }, + { + "auxiliary_loss_clip": 0.06876318, + "auxiliary_loss_mlp": 0.0134218, + "balance_loss_clip": 0.06371553, + "balance_loss_mlp": 0.01276806, + "epoch": 0.07112580790620772, + "flos": 23447551409280.0, + "grad_norm": 3.676638851024929, + "language_loss": 0.82862288, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.65332031, + "step": 1183, + "time_per_iteration": 2.60546612739563 + }, + { + "auxiliary_loss_clip": 0.06882935, + "auxiliary_loss_mlp": 0.01336855, + "balance_loss_clip": 0.0637991, + "balance_loss_mlp": 0.01272101, + "epoch": 0.0711859311588757, + "flos": 31877030880000.0, + "grad_norm": 4.3786669508725335, + "language_loss": 0.81657791, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8987757, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.64794922, + "step": 1184, + "time_per_iteration": 2.651372194290161 + }, + { + "auxiliary_loss_clip": 0.06854693, + "auxiliary_loss_mlp": 0.013301, + "balance_loss_clip": 0.06374621, + "balance_loss_mlp": 0.01271926, + "epoch": 0.07124605441154366, + "flos": 19616514883200.0, + "grad_norm": 2.502972307695957, + "language_loss": 0.79704922, + "learning_rate": 3.98218147382666e-06, + "loss": 0.87889707, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.58178711, + "step": 1185, + "time_per_iteration": 2.591947555541992 + }, + { + "auxiliary_loss_clip": 0.06869413, + "auxiliary_loss_mlp": 0.01332248, + "balance_loss_clip": 0.06377724, + "balance_loss_mlp": 0.0127169, + "epoch": 0.07130617766421163, + "flos": 14689776441600.0, + "grad_norm": 8.952451247795917, + "language_loss": 0.68110502, + "learning_rate": 3.982129564464596e-06, + "loss": 0.7631216, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60546875, + "step": 1186, + "time_per_iteration": 2.52742862701416 + }, + { + "auxiliary_loss_clip": 0.06856332, + "auxiliary_loss_mlp": 0.01335213, + "balance_loss_clip": 0.06375858, + "balance_loss_mlp": 0.01277587, + "epoch": 0.07136630091687961, + "flos": 26075131269120.0, + "grad_norm": 3.0050123348369984, + "language_loss": 0.72187626, + "learning_rate": 3.98207757993998e-06, + "loss": 0.8037917, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.57641602, + "step": 1187, + "time_per_iteration": 2.6516740322113037 + }, + { + "auxiliary_loss_clip": 0.06852362, + "auxiliary_loss_mlp": 0.01318955, + "balance_loss_clip": 0.06373794, + "balance_loss_mlp": 0.01261901, + "epoch": 0.07142642416954757, + "flos": 15674621005440.0, + "grad_norm": 8.213543534109728, + "language_loss": 0.81159407, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89330727, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.57006836, + "step": 1188, + "time_per_iteration": 2.535729169845581 + }, + { + "auxiliary_loss_clip": 0.06864372, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 0.06379133, + "balance_loss_mlp": 0.01275216, + "epoch": 0.07148654742221554, + "flos": 19761389792640.0, + "grad_norm": 3.9335979273681794, + "language_loss": 0.87605166, + "learning_rate": 3.981973385410981e-06, + "loss": 0.95807171, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.62402344, + "step": 1189, + "time_per_iteration": 2.6562387943267822 + }, + { + "auxiliary_loss_clip": 0.06861293, + "auxiliary_loss_mlp": 0.01342124, + "balance_loss_clip": 0.06382903, + "balance_loss_mlp": 0.01281685, + "epoch": 0.07154667067488352, + "flos": 23477669752320.0, + "grad_norm": 2.556740892092056, + "language_loss": 0.79916418, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.88119841, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.60473633, + "step": 1190, + "time_per_iteration": 2.5854697227478027 + }, + { + "auxiliary_loss_clip": 0.06877136, + "auxiliary_loss_mlp": 0.01341277, + "balance_loss_clip": 0.06381981, + "balance_loss_mlp": 0.01274925, + "epoch": 0.07160679392755148, + "flos": 18338859826560.0, + "grad_norm": 3.405692469784563, + "language_loss": 0.78708088, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86926508, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.6628418, + "step": 1191, + "time_per_iteration": 2.638591766357422 + }, + { + "auxiliary_loss_clip": 0.06881537, + "auxiliary_loss_mlp": 0.01331932, + "balance_loss_clip": 0.06384552, + "balance_loss_mlp": 0.01271493, + "epoch": 0.07166691718021945, + "flos": 17752484154240.0, + "grad_norm": 4.470338815774188, + "language_loss": 0.76098609, + "learning_rate": 3.981816529947719e-06, + "loss": 0.84312069, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.60424805, + "step": 1192, + "time_per_iteration": 2.5505447387695312 + }, + { + "auxiliary_loss_clip": 0.06871057, + "auxiliary_loss_mlp": 0.01335615, + "balance_loss_clip": 0.06381638, + "balance_loss_mlp": 0.01275009, + "epoch": 0.07172704043288743, + "flos": 22457885235840.0, + "grad_norm": 6.182703134969588, + "language_loss": 0.8089788, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.89104557, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60644531, + "step": 1193, + "time_per_iteration": 2.633073329925537 + }, + { + "auxiliary_loss_clip": 0.06859954, + "auxiliary_loss_mlp": 0.01339771, + "balance_loss_clip": 0.06379488, + "balance_loss_mlp": 0.0127733, + "epoch": 0.07178716368555539, + "flos": 23228981233920.0, + "grad_norm": 5.198460731675794, + "language_loss": 0.88664103, + "learning_rate": 3.981711583882166e-06, + "loss": 0.96863824, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.62426758, + "step": 1194, + "time_per_iteration": 2.5827341079711914 + }, + { + "auxiliary_loss_clip": 0.06866181, + "auxiliary_loss_mlp": 0.01325528, + "balance_loss_clip": 0.06383646, + "balance_loss_mlp": 0.01270096, + "epoch": 0.07184728693822336, + "flos": 25157064009600.0, + "grad_norm": 6.369260359442203, + "language_loss": 0.83872163, + "learning_rate": 3.981658998128341e-06, + "loss": 0.92063868, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.55444336, + "step": 1195, + "time_per_iteration": 2.6193504333496094 + }, + { + "auxiliary_loss_clip": 0.06856936, + "auxiliary_loss_mlp": 0.01324202, + "balance_loss_clip": 0.06375654, + "balance_loss_mlp": 0.01265241, + "epoch": 0.07190741019089132, + "flos": 22717894055040.0, + "grad_norm": 2.883346879050408, + "language_loss": 0.81836474, + "learning_rate": 3.981606337229808e-06, + "loss": 0.90017617, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.58984375, + "step": 1196, + "time_per_iteration": 2.586151123046875 + }, + { + "auxiliary_loss_clip": 0.06870347, + "auxiliary_loss_mlp": 0.0135034, + "balance_loss_clip": 0.06381004, + "balance_loss_mlp": 0.0128828, + "epoch": 0.0719675334435593, + "flos": 29357247697920.0, + "grad_norm": 3.757214572000768, + "language_loss": 0.74150658, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.82371342, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.62109375, + "step": 1197, + "time_per_iteration": 2.653139114379883 + }, + { + "auxiliary_loss_clip": 0.06849834, + "auxiliary_loss_mlp": 0.01333514, + "balance_loss_clip": 0.0637273, + "balance_loss_mlp": 0.01277867, + "epoch": 0.07202765669622727, + "flos": 17645609871360.0, + "grad_norm": 7.565571046606514, + "language_loss": 0.88836908, + "learning_rate": 3.98150079000661e-06, + "loss": 0.97020251, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.55664062, + "step": 1198, + "time_per_iteration": 2.558506727218628 + }, + { + "auxiliary_loss_clip": 0.06868395, + "auxiliary_loss_mlp": 0.01336115, + "balance_loss_clip": 0.06385568, + "balance_loss_mlp": 0.01278942, + "epoch": 0.07208777994889523, + "flos": 21440448633600.0, + "grad_norm": 9.650241915118821, + "language_loss": 0.86308157, + "learning_rate": 3.981447903685947e-06, + "loss": 0.94512665, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.57202148, + "step": 1199, + "time_per_iteration": 2.593768835067749 + }, + { + "auxiliary_loss_clip": 0.06879794, + "auxiliary_loss_mlp": 0.01340676, + "balance_loss_clip": 0.06389172, + "balance_loss_mlp": 0.01281167, + "epoch": 0.07214790320156321, + "flos": 26947776816000.0, + "grad_norm": 2.5713335496183136, + "language_loss": 0.78793061, + "learning_rate": 3.981394942228581e-06, + "loss": 0.87013531, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.59521484, + "step": 1200, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.06889373, + "auxiliary_loss_mlp": 0.01341905, + "balance_loss_clip": 0.06398184, + "balance_loss_mlp": 0.01281109, + "epoch": 0.07220802645423118, + "flos": 23886997747200.0, + "grad_norm": 3.3919476714664185, + "language_loss": 0.84325218, + "learning_rate": 3.98134190563652e-06, + "loss": 0.925565, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.60839844, + "step": 1201, + "time_per_iteration": 3.9977235794067383 + }, + { + "auxiliary_loss_clip": 0.06908435, + "auxiliary_loss_mlp": 0.01338574, + "balance_loss_clip": 0.06397285, + "balance_loss_mlp": 0.0127382, + "epoch": 0.07226814970689914, + "flos": 19249464072960.0, + "grad_norm": 2.7243272317134624, + "language_loss": 0.71221054, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7946806, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 5.109375, + "router_z_loss_mlp": 0.6472168, + "step": 1202, + "time_per_iteration": 4.006861925125122 + }, + { + "auxiliary_loss_clip": 0.06890082, + "auxiliary_loss_mlp": 0.01341886, + "balance_loss_clip": 0.06389347, + "balance_loss_mlp": 0.01278705, + "epoch": 0.07232827295956712, + "flos": 19178074794240.0, + "grad_norm": 3.218171076661328, + "language_loss": 0.89525115, + "learning_rate": 3.98123560705636e-06, + "loss": 0.97757077, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.63232422, + "step": 1203, + "time_per_iteration": 2.6098897457122803 + }, + { + "auxiliary_loss_clip": 0.069024, + "auxiliary_loss_mlp": 0.01349525, + "balance_loss_clip": 0.06393193, + "balance_loss_mlp": 0.01279335, + "epoch": 0.07238839621223508, + "flos": 17645567944320.0, + "grad_norm": 3.0614329982122266, + "language_loss": 0.81485641, + "learning_rate": 3.981182345072293e-06, + "loss": 0.89737558, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.70214844, + "step": 1204, + "time_per_iteration": 3.999619960784912 + }, + { + "auxiliary_loss_clip": 0.06911701, + "auxiliary_loss_mlp": 0.01333494, + "balance_loss_clip": 0.06413823, + "balance_loss_mlp": 0.01269693, + "epoch": 0.07244851946490305, + "flos": 28299797971200.0, + "grad_norm": 3.782046298297649, + "language_loss": 0.84954846, + "learning_rate": 3.981129007961593e-06, + "loss": 0.9320004, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 4.97265625, + "router_z_loss_mlp": 0.63818359, + "step": 1205, + "time_per_iteration": 2.658663272857666 + }, + { + "auxiliary_loss_clip": 0.06914138, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 0.06405394, + "balance_loss_mlp": 0.01278021, + "epoch": 0.07250864271757101, + "flos": 22571383991040.0, + "grad_norm": 9.50364615421703, + "language_loss": 0.78291214, + "learning_rate": 3.981075595726283e-06, + "loss": 0.86552876, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 5.078125, + "router_z_loss_mlp": 0.69458008, + "step": 1206, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.06879818, + "auxiliary_loss_mlp": 0.01347642, + "balance_loss_clip": 0.06386471, + "balance_loss_mlp": 0.0128594, + "epoch": 0.072568765970239, + "flos": 21768869911680.0, + "grad_norm": 3.061800504881848, + "language_loss": 0.79528189, + "learning_rate": 3.981022108368387e-06, + "loss": 0.87755644, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.61767578, + "step": 1207, + "time_per_iteration": 4.111234903335571 + }, + { + "auxiliary_loss_clip": 0.06890166, + "auxiliary_loss_mlp": 0.0133734, + "balance_loss_clip": 0.06392397, + "balance_loss_mlp": 0.01278618, + "epoch": 0.07262888922290696, + "flos": 25526672369280.0, + "grad_norm": 2.516808639831756, + "language_loss": 0.82780725, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.91008234, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.58789062, + "step": 1208, + "time_per_iteration": 2.65267276763916 + }, + { + "auxiliary_loss_clip": 0.06873606, + "auxiliary_loss_mlp": 0.01329274, + "balance_loss_clip": 0.06393886, + "balance_loss_mlp": 0.01270813, + "epoch": 0.07268901247557492, + "flos": 21252080655360.0, + "grad_norm": 3.726862788271486, + "language_loss": 0.80825698, + "learning_rate": 3.980914908292955e-06, + "loss": 0.89028573, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.58496094, + "step": 1209, + "time_per_iteration": 2.5653858184814453 + }, + { + "auxiliary_loss_clip": 0.06887256, + "auxiliary_loss_mlp": 0.01333341, + "balance_loss_clip": 0.06401981, + "balance_loss_mlp": 0.012714, + "epoch": 0.0727491357282429, + "flos": 25485611068800.0, + "grad_norm": 85.1554110577333, + "language_loss": 0.83058631, + "learning_rate": 3.980861195579486e-06, + "loss": 0.91279227, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.61962891, + "step": 1210, + "time_per_iteration": 2.6290841102600098 + }, + { + "auxiliary_loss_clip": 0.06912959, + "auxiliary_loss_mlp": 0.01335995, + "balance_loss_clip": 0.064188, + "balance_loss_mlp": 0.01275437, + "epoch": 0.07280925898091087, + "flos": 24469054934400.0, + "grad_norm": 2.3690681332483092, + "language_loss": 0.87872899, + "learning_rate": 3.98080740775156e-06, + "loss": 0.96121848, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.60571289, + "step": 1211, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.06907704, + "auxiliary_loss_mlp": 0.01325307, + "balance_loss_clip": 0.06408024, + "balance_loss_mlp": 0.01262221, + "epoch": 0.07286938223357883, + "flos": 18292725354240.0, + "grad_norm": 12.676001298421971, + "language_loss": 0.94102865, + "learning_rate": 3.98075354481122e-06, + "loss": 1.0233587, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.63134766, + "step": 1212, + "time_per_iteration": 2.583038806915283 + }, + { + "auxiliary_loss_clip": 0.06906819, + "auxiliary_loss_mlp": 0.0132597, + "balance_loss_clip": 0.06410546, + "balance_loss_mlp": 0.01265579, + "epoch": 0.07292950548624681, + "flos": 21221123771520.0, + "grad_norm": 2.174057870864043, + "language_loss": 0.74973536, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.8320632, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60449219, + "step": 1213, + "time_per_iteration": 2.58750319480896 + }, + { + "auxiliary_loss_clip": 0.06919889, + "auxiliary_loss_mlp": 0.01335737, + "balance_loss_clip": 0.06414144, + "balance_loss_mlp": 0.01270815, + "epoch": 0.07298962873891478, + "flos": 24648492453120.0, + "grad_norm": 3.5327448066046547, + "language_loss": 0.86681479, + "learning_rate": 3.980645593601465e-06, + "loss": 0.9493711, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.64868164, + "step": 1214, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.0691122, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 0.06415356, + "balance_loss_mlp": 0.01268855, + "epoch": 0.07304975199158274, + "flos": 27060101614080.0, + "grad_norm": 2.7007963802747197, + "language_loss": 0.87098217, + "learning_rate": 3.980591505336144e-06, + "loss": 0.95338178, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.59863281, + "step": 1215, + "time_per_iteration": 2.6591246128082275 + }, + { + "auxiliary_loss_clip": 0.06944987, + "auxiliary_loss_mlp": 0.01336211, + "balance_loss_clip": 0.06434523, + "balance_loss_mlp": 0.01269025, + "epoch": 0.07310987524425071, + "flos": 33558353781120.0, + "grad_norm": 3.0486240121539385, + "language_loss": 0.83975989, + "learning_rate": 3.980537341966595e-06, + "loss": 0.9225719, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 5.1015625, + "router_z_loss_mlp": 0.67138672, + "step": 1216, + "time_per_iteration": 2.7674107551574707 + }, + { + "auxiliary_loss_clip": 0.06944714, + "auxiliary_loss_mlp": 0.01339054, + "balance_loss_clip": 0.06429577, + "balance_loss_mlp": 0.01274585, + "epoch": 0.07316999849691869, + "flos": 28118473735680.0, + "grad_norm": 3.328421621220486, + "language_loss": 0.78921533, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87205303, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.64550781, + "step": 1217, + "time_per_iteration": 2.672692060470581 + }, + { + "auxiliary_loss_clip": 0.06904574, + "auxiliary_loss_mlp": 0.01321216, + "balance_loss_clip": 0.06406265, + "balance_loss_mlp": 0.01263614, + "epoch": 0.07323012174958665, + "flos": 14397888343680.0, + "grad_norm": 2.4648840381938752, + "language_loss": 0.88704532, + "learning_rate": 3.98042878992303e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.57592773, + "step": 1218, + "time_per_iteration": 2.6067652702331543 + }, + { + "auxiliary_loss_clip": 0.06908453, + "auxiliary_loss_mlp": 0.01339024, + "balance_loss_clip": 0.06418494, + "balance_loss_mlp": 0.01277607, + "epoch": 0.07329024500225462, + "flos": 21622862972160.0, + "grad_norm": 2.509726295852636, + "language_loss": 0.89056909, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9730438, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.61376953, + "step": 1219, + "time_per_iteration": 2.644948959350586 + }, + { + "auxiliary_loss_clip": 0.0689719, + "auxiliary_loss_mlp": 0.01336847, + "balance_loss_clip": 0.06407624, + "balance_loss_mlp": 0.01275287, + "epoch": 0.0733503682549226, + "flos": 13229078140800.0, + "grad_norm": 3.459180464583836, + "language_loss": 0.87265766, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95499802, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61621094, + "step": 1220, + "time_per_iteration": 2.575570583343506 + }, + { + "auxiliary_loss_clip": 0.06925908, + "auxiliary_loss_mlp": 0.01352206, + "balance_loss_clip": 0.06422862, + "balance_loss_mlp": 0.0128974, + "epoch": 0.07341049150759056, + "flos": 20893331399040.0, + "grad_norm": 4.615259324948809, + "language_loss": 0.79933828, + "learning_rate": 3.98026539862741e-06, + "loss": 0.88211942, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.62451172, + "step": 1221, + "time_per_iteration": 2.6174440383911133 + }, + { + "auxiliary_loss_clip": 0.06900848, + "auxiliary_loss_mlp": 0.01351796, + "balance_loss_clip": 0.06404451, + "balance_loss_mlp": 0.01290761, + "epoch": 0.07347061476025853, + "flos": 15418972598400.0, + "grad_norm": 2.5998624424358106, + "language_loss": 0.95159388, + "learning_rate": 3.980210784675722e-06, + "loss": 1.03412032, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.61035156, + "step": 1222, + "time_per_iteration": 2.5956273078918457 + }, + { + "auxiliary_loss_clip": 0.06908462, + "auxiliary_loss_mlp": 0.01358079, + "balance_loss_clip": 0.06414389, + "balance_loss_mlp": 0.01303147, + "epoch": 0.0735307380129265, + "flos": 11113591708800.0, + "grad_norm": 14.551194351183868, + "language_loss": 0.93725538, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01992083, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.54907227, + "step": 1223, + "time_per_iteration": 2.5886712074279785 + }, + { + "auxiliary_loss_clip": 0.06916398, + "auxiliary_loss_mlp": 0.01394841, + "balance_loss_clip": 0.06417241, + "balance_loss_mlp": 0.01330874, + "epoch": 0.07359086126559447, + "flos": 23739146017920.0, + "grad_norm": 2.48832330955176, + "language_loss": 0.84952593, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93263835, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.63989258, + "step": 1224, + "time_per_iteration": 2.600796937942505 + }, + { + "auxiliary_loss_clip": 0.06916806, + "auxiliary_loss_mlp": 0.01413444, + "balance_loss_clip": 0.06410658, + "balance_loss_mlp": 0.0134354, + "epoch": 0.07365098451826244, + "flos": 20999115578880.0, + "grad_norm": 3.5000549679052932, + "language_loss": 0.86487269, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.94817519, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69921875, + "step": 1225, + "time_per_iteration": 2.6348657608032227 + }, + { + "auxiliary_loss_clip": 0.06893472, + "auxiliary_loss_mlp": 0.01405003, + "balance_loss_clip": 0.06406252, + "balance_loss_mlp": 0.01345017, + "epoch": 0.0737111077709304, + "flos": 19938982521600.0, + "grad_norm": 2.4160640893773544, + "language_loss": 0.93043572, + "learning_rate": 3.979991577991808e-06, + "loss": 1.01342046, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.59960938, + "step": 1226, + "time_per_iteration": 2.5814220905303955 + }, + { + "auxiliary_loss_clip": 0.06951886, + "auxiliary_loss_mlp": 0.01454874, + "balance_loss_clip": 0.06431323, + "balance_loss_mlp": 0.01382633, + "epoch": 0.07377123102359838, + "flos": 16587153895680.0, + "grad_norm": 17.71044350544229, + "language_loss": 0.81177175, + "learning_rate": 3.97993658861193e-06, + "loss": 0.89583939, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.72216797, + "step": 1227, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.06910308, + "auxiliary_loss_mlp": 0.0141995, + "balance_loss_clip": 0.06419577, + "balance_loss_mlp": 0.01357318, + "epoch": 0.07383135427626634, + "flos": 28335911880960.0, + "grad_norm": 2.0840618907227113, + "language_loss": 0.88551241, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.96881503, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.6262207, + "step": 1228, + "time_per_iteration": 2.6383354663848877 + }, + { + "auxiliary_loss_clip": 0.06927899, + "auxiliary_loss_mlp": 0.01421335, + "balance_loss_clip": 0.06420749, + "balance_loss_mlp": 0.01352098, + "epoch": 0.07389147752893431, + "flos": 20053277890560.0, + "grad_norm": 2.9618119227327493, + "language_loss": 0.82374752, + "learning_rate": 3.97982638461608e-06, + "loss": 0.90723979, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.69238281, + "step": 1229, + "time_per_iteration": 2.572110414505005 + }, + { + "auxiliary_loss_clip": 0.06918953, + "auxiliary_loss_mlp": 0.01426217, + "balance_loss_clip": 0.06413613, + "balance_loss_mlp": 0.01351926, + "epoch": 0.07395160078160229, + "flos": 18120038088960.0, + "grad_norm": 2.8764105468999697, + "language_loss": 0.81244183, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89589357, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 5.046875, + "router_z_loss_mlp": 0.74267578, + "step": 1230, + "time_per_iteration": 2.580080270767212 + }, + { + "auxiliary_loss_clip": 0.06901585, + "auxiliary_loss_mlp": 0.01391553, + "balance_loss_clip": 0.06406316, + "balance_loss_mlp": 0.01325273, + "epoch": 0.07401172403427025, + "flos": 23593726056960.0, + "grad_norm": 2.3354922031953547, + "language_loss": 0.83756942, + "learning_rate": 3.979715880319372e-06, + "loss": 0.92050081, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.66210938, + "step": 1231, + "time_per_iteration": 2.6182961463928223 + }, + { + "auxiliary_loss_clip": 0.06916339, + "auxiliary_loss_mlp": 0.01398184, + "balance_loss_clip": 0.06416178, + "balance_loss_mlp": 0.01340868, + "epoch": 0.07407184728693822, + "flos": 26367187075200.0, + "grad_norm": 2.448759958115063, + "language_loss": 0.97958755, + "learning_rate": 3.979660515563434e-06, + "loss": 1.0627327, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.57373047, + "step": 1232, + "time_per_iteration": 2.6219074726104736 + }, + { + "auxiliary_loss_clip": 0.06881506, + "auxiliary_loss_mlp": 0.01383375, + "balance_loss_clip": 0.06404279, + "balance_loss_mlp": 0.01327991, + "epoch": 0.0741319705396062, + "flos": 22207016511360.0, + "grad_norm": 2.790382340569057, + "language_loss": 0.83657277, + "learning_rate": 3.979605075738569e-06, + "loss": 0.91922164, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.55395508, + "step": 1233, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.06909496, + "auxiliary_loss_mlp": 0.0136395, + "balance_loss_clip": 0.06408279, + "balance_loss_mlp": 0.01302462, + "epoch": 0.07419209379227416, + "flos": 39209508696960.0, + "grad_norm": 3.1172656995673393, + "language_loss": 0.73086953, + "learning_rate": 3.979549560846883e-06, + "loss": 0.813604, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.61450195, + "step": 1234, + "time_per_iteration": 2.750397205352783 + }, + { + "auxiliary_loss_clip": 0.0689207, + "auxiliary_loss_mlp": 0.01355226, + "balance_loss_clip": 0.06398024, + "balance_loss_mlp": 0.01294786, + "epoch": 0.07425221704494213, + "flos": 22787899741440.0, + "grad_norm": 2.355636628350322, + "language_loss": 0.789891, + "learning_rate": 3.979493970890478e-06, + "loss": 0.87236392, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60473633, + "step": 1235, + "time_per_iteration": 2.5847980976104736 + }, + { + "auxiliary_loss_clip": 0.06876536, + "auxiliary_loss_mlp": 0.0134157, + "balance_loss_clip": 0.0640441, + "balance_loss_mlp": 0.01286972, + "epoch": 0.0743123402976101, + "flos": 22279495893120.0, + "grad_norm": 4.38662001374288, + "language_loss": 0.84938204, + "learning_rate": 3.979438305871464e-06, + "loss": 0.93156314, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.54589844, + "step": 1236, + "time_per_iteration": 2.6517555713653564 + }, + { + "auxiliary_loss_clip": 0.06904443, + "auxiliary_loss_mlp": 0.013457, + "balance_loss_clip": 0.06407445, + "balance_loss_mlp": 0.01288479, + "epoch": 0.07437246355027807, + "flos": 29322768942720.0, + "grad_norm": 2.2405587930301705, + "language_loss": 0.78282797, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86532938, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.57275391, + "step": 1237, + "time_per_iteration": 2.729818105697632 + }, + { + "auxiliary_loss_clip": 0.06881858, + "auxiliary_loss_mlp": 0.01325868, + "balance_loss_clip": 0.06397796, + "balance_loss_mlp": 0.01274488, + "epoch": 0.07443258680294604, + "flos": 31953367549440.0, + "grad_norm": 2.5947803667316123, + "language_loss": 0.79746008, + "learning_rate": 3.979326750654053e-06, + "loss": 0.87953734, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.51391602, + "step": 1238, + "time_per_iteration": 2.7127678394317627 + }, + { + "auxiliary_loss_clip": 0.06888152, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 0.06387939, + "balance_loss_mlp": 0.01285982, + "epoch": 0.074492710055614, + "flos": 22682031707520.0, + "grad_norm": 6.17193517167714, + "language_loss": 0.88359845, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.96598047, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.64038086, + "step": 1239, + "time_per_iteration": 2.5982487201690674 + }, + { + "auxiliary_loss_clip": 0.06867203, + "auxiliary_loss_mlp": 0.01339139, + "balance_loss_clip": 0.06376298, + "balance_loss_mlp": 0.01279201, + "epoch": 0.07455283330828198, + "flos": 21290752114560.0, + "grad_norm": 4.728508562946579, + "language_loss": 0.9183414, + "learning_rate": 3.979214895211569e-06, + "loss": 1.00040483, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 4.90625, + "router_z_loss_mlp": 0.59960938, + "step": 1240, + "time_per_iteration": 3.982212781906128 + }, + { + "auxiliary_loss_clip": 0.0687404, + "auxiliary_loss_mlp": 0.01344277, + "balance_loss_clip": 0.06383809, + "balance_loss_mlp": 0.01287676, + "epoch": 0.07461295656094995, + "flos": 24395150033280.0, + "grad_norm": 2.7209561023558506, + "language_loss": 0.903265, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98544812, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.56616211, + "step": 1241, + "time_per_iteration": 2.622676372528076 + }, + { + "auxiliary_loss_clip": 0.06764787, + "auxiliary_loss_mlp": 0.01319561, + "balance_loss_clip": 0.06452408, + "balance_loss_mlp": 0.01283775, + "epoch": 0.07467307981361791, + "flos": 62127971498880.0, + "grad_norm": 0.8806411506129102, + "language_loss": 0.63242501, + "learning_rate": 3.979102739560979e-06, + "loss": 0.71326846, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.35864258, + "step": 1242, + "time_per_iteration": 4.608001947402954 + }, + { + "auxiliary_loss_clip": 0.06884564, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 0.06376857, + "balance_loss_mlp": 0.01288319, + "epoch": 0.07473320306628589, + "flos": 24870039448320.0, + "grad_norm": 20.01115775481137, + "language_loss": 0.65988898, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.74224126, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.6237793, + "step": 1243, + "time_per_iteration": 2.686720371246338 + }, + { + "auxiliary_loss_clip": 0.068617, + "auxiliary_loss_mlp": 0.01347661, + "balance_loss_clip": 0.06381305, + "balance_loss_mlp": 0.01292491, + "epoch": 0.07479332631895386, + "flos": 24903973152000.0, + "grad_norm": 3.6813184842747346, + "language_loss": 0.78008217, + "learning_rate": 3.978990283719296e-06, + "loss": 0.86217576, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.55175781, + "step": 1244, + "time_per_iteration": 4.040115833282471 + }, + { + "auxiliary_loss_clip": 0.06851211, + "auxiliary_loss_mlp": 0.01348909, + "balance_loss_clip": 0.06370524, + "balance_loss_mlp": 0.01292833, + "epoch": 0.07485344957162182, + "flos": 17819932291200.0, + "grad_norm": 21.86650929914808, + "language_loss": 0.72362238, + "learning_rate": 3.978933943232123e-06, + "loss": 0.80562365, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.56103516, + "step": 1245, + "time_per_iteration": 2.524477481842041 + }, + { + "auxiliary_loss_clip": 0.06865877, + "auxiliary_loss_mlp": 0.01375645, + "balance_loss_clip": 0.06379819, + "balance_loss_mlp": 0.01317042, + "epoch": 0.0749135728242898, + "flos": 25017304199040.0, + "grad_norm": 2.436107230077969, + "language_loss": 0.90751457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.98992985, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.58642578, + "step": 1246, + "time_per_iteration": 4.0361082553863525 + }, + { + "auxiliary_loss_clip": 0.06889592, + "auxiliary_loss_mlp": 0.01353914, + "balance_loss_clip": 0.06373734, + "balance_loss_mlp": 0.0128978, + "epoch": 0.07497369607695777, + "flos": 17827898428800.0, + "grad_norm": 3.630435288529284, + "language_loss": 0.91536689, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.99780184, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.64111328, + "step": 1247, + "time_per_iteration": 2.558710813522339 + }, + { + "auxiliary_loss_clip": 0.06850724, + "auxiliary_loss_mlp": 0.01373111, + "balance_loss_clip": 0.06373762, + "balance_loss_mlp": 0.01312124, + "epoch": 0.07503381932962573, + "flos": 15126287886720.0, + "grad_norm": 2.9459859952497336, + "language_loss": 0.67146099, + "learning_rate": 3.978764471530921e-06, + "loss": 0.7536993, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.60986328, + "step": 1248, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06826814, + "auxiliary_loss_mlp": 0.0138466, + "balance_loss_clip": 0.06362367, + "balance_loss_mlp": 0.01326009, + "epoch": 0.0750939425822937, + "flos": 12820588686720.0, + "grad_norm": 4.865871965779137, + "language_loss": 0.76126468, + "learning_rate": 3.978707830891102e-06, + "loss": 0.84337938, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 1249, + "time_per_iteration": 2.547814130783081 + }, + { + "auxiliary_loss_clip": 0.06878477, + "auxiliary_loss_mlp": 0.01356674, + "balance_loss_clip": 0.06384575, + "balance_loss_mlp": 0.01291156, + "epoch": 0.07515406583496168, + "flos": 24213700016640.0, + "grad_norm": 3.3650478618726805, + "language_loss": 0.84855753, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93090904, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 4.9296875, + "router_z_loss_mlp": 0.65429688, + "step": 1250, + "time_per_iteration": 2.6201655864715576 + }, + { + "auxiliary_loss_clip": 0.0685844, + "auxiliary_loss_mlp": 0.01372833, + "balance_loss_clip": 0.06383228, + "balance_loss_mlp": 0.01312036, + "epoch": 0.07521418908762964, + "flos": 26695482572160.0, + "grad_norm": 2.950747307093222, + "language_loss": 0.7010417, + "learning_rate": 3.978594324515215e-06, + "loss": 0.7833544, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.60742188, + "step": 1251, + "time_per_iteration": 2.6431658267974854 + }, + { + "auxiliary_loss_clip": 0.06735167, + "auxiliary_loss_mlp": 0.01321971, + "balance_loss_clip": 0.06424966, + "balance_loss_mlp": 0.0128411, + "epoch": 0.0752743123402976, + "flos": 59115255546240.0, + "grad_norm": 0.864981950603712, + "language_loss": 0.69976699, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.78033841, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.37792969, + "step": 1252, + "time_per_iteration": 3.2185781002044678 + }, + { + "auxiliary_loss_clip": 0.06854245, + "auxiliary_loss_mlp": 0.01348889, + "balance_loss_clip": 0.06374305, + "balance_loss_mlp": 0.01288426, + "epoch": 0.07533443559296558, + "flos": 23483749173120.0, + "grad_norm": 3.3162526589419876, + "language_loss": 0.82824075, + "learning_rate": 3.97848051802535e-06, + "loss": 0.91027212, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.60498047, + "step": 1253, + "time_per_iteration": 2.6227848529815674 + }, + { + "auxiliary_loss_clip": 0.06867173, + "auxiliary_loss_mlp": 0.01358456, + "balance_loss_clip": 0.06365065, + "balance_loss_mlp": 0.01293749, + "epoch": 0.07539455884563355, + "flos": 20884149377280.0, + "grad_norm": 6.3858164660002625, + "language_loss": 0.96525204, + "learning_rate": 3.978423502243069e-06, + "loss": 1.04750824, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.015625, + "router_z_loss_mlp": 0.64697266, + "step": 1254, + "time_per_iteration": 2.5511484146118164 + }, + { + "auxiliary_loss_clip": 0.06840456, + "auxiliary_loss_mlp": 0.0135521, + "balance_loss_clip": 0.06368542, + "balance_loss_mlp": 0.012916, + "epoch": 0.07545468209830151, + "flos": 27680327136000.0, + "grad_norm": 2.4514498349060307, + "language_loss": 0.9076122, + "learning_rate": 3.97836641143877e-06, + "loss": 0.98956883, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.71875, + "router_z_loss_mlp": 0.63525391, + "step": 1255, + "time_per_iteration": 2.6308302879333496 + }, + { + "auxiliary_loss_clip": 0.06840869, + "auxiliary_loss_mlp": 0.01347194, + "balance_loss_clip": 0.06364559, + "balance_loss_mlp": 0.01285968, + "epoch": 0.0755148053509695, + "flos": 14142198009600.0, + "grad_norm": 2.7245497332904325, + "language_loss": 0.81970763, + "learning_rate": 3.978309245614618e-06, + "loss": 0.90158832, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61230469, + "step": 1256, + "time_per_iteration": 2.552151679992676 + }, + { + "auxiliary_loss_clip": 0.06681269, + "auxiliary_loss_mlp": 0.01315431, + "balance_loss_clip": 0.06378952, + "balance_loss_mlp": 0.01282076, + "epoch": 0.07557492860363746, + "flos": 58251764822400.0, + "grad_norm": 0.7695886437006154, + "language_loss": 0.58049726, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.66046429, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33374023, + "step": 1257, + "time_per_iteration": 3.304816246032715 + }, + { + "auxiliary_loss_clip": 0.06853162, + "auxiliary_loss_mlp": 0.0135189, + "balance_loss_clip": 0.0636155, + "balance_loss_mlp": 0.01284012, + "epoch": 0.07563505185630542, + "flos": 24651259637760.0, + "grad_norm": 2.373470459060695, + "language_loss": 0.93104446, + "learning_rate": 3.978194688915432e-06, + "loss": 1.0130949, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.6784668, + "step": 1258, + "time_per_iteration": 2.6907479763031006 + }, + { + "auxiliary_loss_clip": 0.06829782, + "auxiliary_loss_mlp": 0.01330684, + "balance_loss_clip": 0.06361564, + "balance_loss_mlp": 0.01273559, + "epoch": 0.07569517510897339, + "flos": 15528362503680.0, + "grad_norm": 3.094615329702446, + "language_loss": 0.84079689, + "learning_rate": 3.978137298044741e-06, + "loss": 0.92240155, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.57128906, + "step": 1259, + "time_per_iteration": 2.5581536293029785 + }, + { + "auxiliary_loss_clip": 0.06848526, + "auxiliary_loss_mlp": 0.0132832, + "balance_loss_clip": 0.06371632, + "balance_loss_mlp": 0.01271052, + "epoch": 0.07575529836164137, + "flos": 22934954856960.0, + "grad_norm": 3.148240250348832, + "language_loss": 0.77577376, + "learning_rate": 3.978079832162885e-06, + "loss": 0.85754222, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.57275391, + "step": 1260, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.06837059, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 0.06359653, + "balance_loss_mlp": 0.01268421, + "epoch": 0.07581542161430933, + "flos": 19506537999360.0, + "grad_norm": 2.0302273693268535, + "language_loss": 0.87771595, + "learning_rate": 3.978022291272044e-06, + "loss": 0.95938396, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.77734375, + "router_z_loss_mlp": 0.61328125, + "step": 1261, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.06841564, + "auxiliary_loss_mlp": 0.01315914, + "balance_loss_clip": 0.06369701, + "balance_loss_mlp": 0.01256547, + "epoch": 0.0758755448669773, + "flos": 24980519456640.0, + "grad_norm": 2.7189086354386407, + "language_loss": 0.84886664, + "learning_rate": 3.977964675374399e-06, + "loss": 0.93044144, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.59423828, + "step": 1262, + "time_per_iteration": 2.642197370529175 + }, + { + "auxiliary_loss_clip": 0.06848589, + "auxiliary_loss_mlp": 0.01328257, + "balance_loss_clip": 0.06354951, + "balance_loss_mlp": 0.01263312, + "epoch": 0.07593566811964528, + "flos": 22754678797440.0, + "grad_norm": 3.7332355829542183, + "language_loss": 0.84859836, + "learning_rate": 3.977906984472136e-06, + "loss": 0.93036681, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64941406, + "step": 1263, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.06852871, + "auxiliary_loss_mlp": 0.01316465, + "balance_loss_clip": 0.06365145, + "balance_loss_mlp": 0.0126039, + "epoch": 0.07599579137231324, + "flos": 23119088204160.0, + "grad_norm": 2.8380907470503036, + "language_loss": 0.78429461, + "learning_rate": 3.977849218567442e-06, + "loss": 0.86598796, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.56103516, + "step": 1264, + "time_per_iteration": 2.7333550453186035 + }, + { + "auxiliary_loss_clip": 0.06862055, + "auxiliary_loss_mlp": 0.01331538, + "balance_loss_clip": 0.06363812, + "balance_loss_mlp": 0.01272362, + "epoch": 0.07605591462498121, + "flos": 14507362103040.0, + "grad_norm": 3.0292139687816455, + "language_loss": 0.84203875, + "learning_rate": 3.977791377662507e-06, + "loss": 0.92397463, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.59179688, + "step": 1265, + "time_per_iteration": 2.587218761444092 + }, + { + "auxiliary_loss_clip": 0.06855778, + "auxiliary_loss_mlp": 0.01328532, + "balance_loss_clip": 0.0636021, + "balance_loss_mlp": 0.01264779, + "epoch": 0.07611603787764919, + "flos": 23521037040000.0, + "grad_norm": 3.3546410086249976, + "language_loss": 0.67662913, + "learning_rate": 3.977733461759524e-06, + "loss": 0.7584722, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.63720703, + "step": 1266, + "time_per_iteration": 2.6307120323181152 + }, + { + "auxiliary_loss_clip": 0.06869242, + "auxiliary_loss_mlp": 0.01332957, + "balance_loss_clip": 0.06363578, + "balance_loss_mlp": 0.01267201, + "epoch": 0.07617616113031715, + "flos": 21513640775040.0, + "grad_norm": 2.4484297039949894, + "language_loss": 0.81777161, + "learning_rate": 3.977675470860691e-06, + "loss": 0.89979357, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.65673828, + "step": 1267, + "time_per_iteration": 2.5816946029663086 + }, + { + "auxiliary_loss_clip": 0.06859374, + "auxiliary_loss_mlp": 0.01329793, + "balance_loss_clip": 0.06364329, + "balance_loss_mlp": 0.01269354, + "epoch": 0.07623628438298512, + "flos": 14578164403200.0, + "grad_norm": 3.901991680203772, + "language_loss": 0.74711108, + "learning_rate": 3.977617404968205e-06, + "loss": 0.82900274, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.60498047, + "step": 1268, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.06849901, + "auxiliary_loss_mlp": 0.01321442, + "balance_loss_clip": 0.06367739, + "balance_loss_mlp": 0.01263959, + "epoch": 0.07629640763565308, + "flos": 14725638789120.0, + "grad_norm": 7.47291205592579, + "language_loss": 0.85124403, + "learning_rate": 3.977559264084269e-06, + "loss": 0.93295747, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.57421875, + "step": 1269, + "time_per_iteration": 2.5311200618743896 + }, + { + "auxiliary_loss_clip": 0.06839523, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 0.0637067, + "balance_loss_mlp": 0.01264126, + "epoch": 0.07635653088832106, + "flos": 14908220835840.0, + "grad_norm": 2.6697300314393355, + "language_loss": 0.91628265, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99788159, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.5625, + "step": 1270, + "time_per_iteration": 2.590938091278076 + }, + { + "auxiliary_loss_clip": 0.06847905, + "auxiliary_loss_mlp": 0.01334774, + "balance_loss_clip": 0.06368862, + "balance_loss_mlp": 0.01272309, + "epoch": 0.07641665414098903, + "flos": 26658865537920.0, + "grad_norm": 4.240829447117421, + "language_loss": 0.73391259, + "learning_rate": 3.977442757350869e-06, + "loss": 0.81573939, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.625, + "step": 1271, + "time_per_iteration": 2.5961694717407227 + }, + { + "auxiliary_loss_clip": 0.06838269, + "auxiliary_loss_mlp": 0.01329276, + "balance_loss_clip": 0.06381856, + "balance_loss_mlp": 0.01278445, + "epoch": 0.07647677739365699, + "flos": 25199970099840.0, + "grad_norm": 3.136617280050721, + "language_loss": 0.8526597, + "learning_rate": 3.977384391505823e-06, + "loss": 0.93433517, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 4.55859375, + "router_z_loss_mlp": 0.50878906, + "step": 1272, + "time_per_iteration": 2.6091222763061523 + }, + { + "auxiliary_loss_clip": 0.06845278, + "auxiliary_loss_mlp": 0.01336295, + "balance_loss_clip": 0.06370107, + "balance_loss_mlp": 0.01279599, + "epoch": 0.07653690064632497, + "flos": 20564365069440.0, + "grad_norm": 3.1222866186562674, + "language_loss": 0.82570672, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90752244, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.56713867, + "step": 1273, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.06864737, + "auxiliary_loss_mlp": 0.01336748, + "balance_loss_clip": 0.06374316, + "balance_loss_mlp": 0.01277787, + "epoch": 0.07659702389899294, + "flos": 22275219335040.0, + "grad_norm": 2.5887634532412123, + "language_loss": 0.83504725, + "learning_rate": 3.977267434870103e-06, + "loss": 0.91706204, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 1274, + "time_per_iteration": 2.594106912612915 + }, + { + "auxiliary_loss_clip": 0.06835781, + "auxiliary_loss_mlp": 0.01338776, + "balance_loss_clip": 0.06372908, + "balance_loss_mlp": 0.01281961, + "epoch": 0.0766571471516609, + "flos": 32644563079680.0, + "grad_norm": 2.657989216371077, + "language_loss": 0.75383544, + "learning_rate": 3.977208844083865e-06, + "loss": 0.835581, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56835938, + "step": 1275, + "time_per_iteration": 2.6635921001434326 + }, + { + "auxiliary_loss_clip": 0.06867371, + "auxiliary_loss_mlp": 0.01354656, + "balance_loss_clip": 0.06370118, + "balance_loss_mlp": 0.01289377, + "epoch": 0.07671727040432888, + "flos": 15272672169600.0, + "grad_norm": 3.4268385774262637, + "language_loss": 0.82329005, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90551031, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.65234375, + "step": 1276, + "time_per_iteration": 2.5468428134918213 + }, + { + "auxiliary_loss_clip": 0.06860888, + "auxiliary_loss_mlp": 0.01344496, + "balance_loss_clip": 0.06380928, + "balance_loss_mlp": 0.01285964, + "epoch": 0.07677739365699685, + "flos": 28191665877120.0, + "grad_norm": 8.54617583390301, + "language_loss": 0.61651218, + "learning_rate": 3.97709143758574e-06, + "loss": 0.69856602, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.58544922, + "step": 1277, + "time_per_iteration": 2.6240146160125732 + }, + { + "auxiliary_loss_clip": 0.06864151, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 0.06375778, + "balance_loss_mlp": 0.01298471, + "epoch": 0.07683751690966481, + "flos": 18301991230080.0, + "grad_norm": 2.6958136098916565, + "language_loss": 0.76683849, + "learning_rate": 3.977032621878305e-06, + "loss": 0.84906554, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.60058594, + "step": 1278, + "time_per_iteration": 2.595947742462158 + }, + { + "auxiliary_loss_clip": 0.06835216, + "auxiliary_loss_mlp": 0.01346069, + "balance_loss_clip": 0.06372848, + "balance_loss_mlp": 0.01289683, + "epoch": 0.07689764016233278, + "flos": 21987565868160.0, + "grad_norm": 3.428980152963994, + "language_loss": 0.90527773, + "learning_rate": 3.976973731201596e-06, + "loss": 0.98709059, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.56420898, + "step": 1279, + "time_per_iteration": 3.962568521499634 + }, + { + "auxiliary_loss_clip": 0.06834365, + "auxiliary_loss_mlp": 0.01339419, + "balance_loss_clip": 0.06362047, + "balance_loss_mlp": 0.01287301, + "epoch": 0.07695776341500075, + "flos": 22242417661440.0, + "grad_norm": 3.3495960477632685, + "language_loss": 0.85256732, + "learning_rate": 3.976914765557845e-06, + "loss": 0.93430507, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.52148438, + "step": 1280, + "time_per_iteration": 2.5692243576049805 + }, + { + "auxiliary_loss_clip": 0.06832324, + "auxiliary_loss_mlp": 0.01339262, + "balance_loss_clip": 0.06368576, + "balance_loss_mlp": 0.01283662, + "epoch": 0.07701788666766872, + "flos": 16149300785280.0, + "grad_norm": 2.5153075146211274, + "language_loss": 0.78576446, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.8674804, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55541992, + "step": 1281, + "time_per_iteration": 4.005364894866943 + }, + { + "auxiliary_loss_clip": 0.06866302, + "auxiliary_loss_mlp": 0.01356763, + "balance_loss_clip": 0.06371205, + "balance_loss_mlp": 0.01291317, + "epoch": 0.07707800992033668, + "flos": 19468998570240.0, + "grad_norm": 5.650134420498799, + "language_loss": 0.77910447, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.8613351, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.65429688, + "step": 1282, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.06843832, + "auxiliary_loss_mlp": 0.01354603, + "balance_loss_clip": 0.06370867, + "balance_loss_mlp": 0.01298647, + "epoch": 0.07713813317300466, + "flos": 18996415142400.0, + "grad_norm": 3.5179830835441974, + "language_loss": 0.86225599, + "learning_rate": 3.976737418846713e-06, + "loss": 0.94424033, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 4.72265625, + "router_z_loss_mlp": 0.55932617, + "step": 1283, + "time_per_iteration": 2.605346202850342 + }, + { + "auxiliary_loss_clip": 0.06835528, + "auxiliary_loss_mlp": 0.01347471, + "balance_loss_clip": 0.06358841, + "balance_loss_mlp": 0.01292039, + "epoch": 0.07719825642567263, + "flos": 18119828453760.0, + "grad_norm": 2.430743235056626, + "language_loss": 0.77539676, + "learning_rate": 3.976678153357181e-06, + "loss": 0.85722673, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.55444336, + "step": 1284, + "time_per_iteration": 3.990124225616455 + }, + { + "auxiliary_loss_clip": 0.06827543, + "auxiliary_loss_mlp": 0.01355487, + "balance_loss_clip": 0.06358978, + "balance_loss_mlp": 0.01300294, + "epoch": 0.0772583796783406, + "flos": 42204307075200.0, + "grad_norm": 2.435341154952095, + "language_loss": 0.78285027, + "learning_rate": 3.976618812911817e-06, + "loss": 0.86468053, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.55200195, + "step": 1285, + "time_per_iteration": 2.7569363117218018 + }, + { + "auxiliary_loss_clip": 0.06851525, + "auxiliary_loss_mlp": 0.01337351, + "balance_loss_clip": 0.06371935, + "balance_loss_mlp": 0.01278081, + "epoch": 0.07731850293100857, + "flos": 24760565688960.0, + "grad_norm": 2.195462031898389, + "language_loss": 0.86501926, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.946908, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.59277344, + "step": 1286, + "time_per_iteration": 4.058920383453369 + }, + { + "auxiliary_loss_clip": 0.06876462, + "auxiliary_loss_mlp": 0.01367501, + "balance_loss_clip": 0.0637191, + "balance_loss_mlp": 0.01299314, + "epoch": 0.07737862618367654, + "flos": 17571537262080.0, + "grad_norm": 2.773879522110049, + "language_loss": 0.79808044, + "learning_rate": 3.97649990716259e-06, + "loss": 0.88052011, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.68212891, + "step": 1287, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06845251, + "auxiliary_loss_mlp": 0.01340112, + "balance_loss_clip": 0.06370382, + "balance_loss_mlp": 0.01288136, + "epoch": 0.0774387494363445, + "flos": 25633798214400.0, + "grad_norm": 2.3847373218246983, + "language_loss": 0.8715058, + "learning_rate": 3.976440341863237e-06, + "loss": 0.95335943, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.51953125, + "step": 1288, + "time_per_iteration": 2.600308656692505 + }, + { + "auxiliary_loss_clip": 0.0688329, + "auxiliary_loss_mlp": 0.01364865, + "balance_loss_clip": 0.06375885, + "balance_loss_mlp": 0.01300611, + "epoch": 0.07749887268901248, + "flos": 12244778628480.0, + "grad_norm": 3.451146773235629, + "language_loss": 0.8824665, + "learning_rate": 3.976380701617068e-06, + "loss": 0.96494806, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.64306641, + "step": 1289, + "time_per_iteration": 2.6120755672454834 + }, + { + "auxiliary_loss_clip": 0.06845821, + "auxiliary_loss_mlp": 0.01332003, + "balance_loss_clip": 0.06365949, + "balance_loss_mlp": 0.0127781, + "epoch": 0.07755899594168045, + "flos": 25088609623680.0, + "grad_norm": 3.9721153981819377, + "language_loss": 0.87731397, + "learning_rate": 3.976320986426344e-06, + "loss": 0.95909214, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.54150391, + "step": 1290, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.06849636, + "auxiliary_loss_mlp": 0.0134794, + "balance_loss_clip": 0.0637328, + "balance_loss_mlp": 0.01286833, + "epoch": 0.07761911919434841, + "flos": 14251629841920.0, + "grad_norm": 2.80389948255575, + "language_loss": 0.9359982, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.0179739, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.61157227, + "step": 1291, + "time_per_iteration": 2.620960235595703 + }, + { + "auxiliary_loss_clip": 0.06740145, + "auxiliary_loss_mlp": 0.01502792, + "balance_loss_clip": 0.06432445, + "balance_loss_mlp": 0.01475422, + "epoch": 0.07767924244701638, + "flos": 67259639099520.0, + "grad_norm": 0.9524065323514693, + "language_loss": 0.65448344, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.73691273, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.27416992, + "step": 1292, + "time_per_iteration": 3.3147408962249756 + }, + { + "auxiliary_loss_clip": 0.06863274, + "auxiliary_loss_mlp": 0.01339428, + "balance_loss_clip": 0.06369414, + "balance_loss_mlp": 0.01279203, + "epoch": 0.07773936569968436, + "flos": 28558548979200.0, + "grad_norm": 5.92776916982661, + "language_loss": 0.89760518, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.97963214, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.60229492, + "step": 1293, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.06850281, + "auxiliary_loss_mlp": 0.0134015, + "balance_loss_clip": 0.06365186, + "balance_loss_mlp": 0.01280689, + "epoch": 0.07779948895235232, + "flos": 27497619308160.0, + "grad_norm": 4.7786851588669315, + "language_loss": 0.88117272, + "learning_rate": 3.976081376263239e-06, + "loss": 0.96307707, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.59521484, + "step": 1294, + "time_per_iteration": 2.7246196269989014 + }, + { + "auxiliary_loss_clip": 0.06872948, + "auxiliary_loss_mlp": 0.01341599, + "balance_loss_clip": 0.06369777, + "balance_loss_mlp": 0.01276034, + "epoch": 0.07785961220502029, + "flos": 18229176432000.0, + "grad_norm": 2.917147299599652, + "language_loss": 0.82283127, + "learning_rate": 3.976021286383768e-06, + "loss": 0.90497679, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.65576172, + "step": 1295, + "time_per_iteration": 2.565981149673462 + }, + { + "auxiliary_loss_clip": 0.06823503, + "auxiliary_loss_mlp": 0.0131494, + "balance_loss_clip": 0.06354046, + "balance_loss_mlp": 0.01258459, + "epoch": 0.07791973545768827, + "flos": 24615145728000.0, + "grad_norm": 2.406299450212834, + "language_loss": 0.90690672, + "learning_rate": 3.975961121573371e-06, + "loss": 0.9882912, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.56494141, + "step": 1296, + "time_per_iteration": 2.6269545555114746 + }, + { + "auxiliary_loss_clip": 0.06845632, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 0.06355733, + "balance_loss_mlp": 0.01267058, + "epoch": 0.07797985871035623, + "flos": 14287156773120.0, + "grad_norm": 2.6954148658412636, + "language_loss": 0.98733974, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06907678, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.61010742, + "step": 1297, + "time_per_iteration": 2.550185441970825 + }, + { + "auxiliary_loss_clip": 0.06845116, + "auxiliary_loss_mlp": 0.01318807, + "balance_loss_clip": 0.06359702, + "balance_loss_mlp": 0.01258606, + "epoch": 0.0780399819630242, + "flos": 26616965696640.0, + "grad_norm": 2.8603722020093287, + "language_loss": 0.7874198, + "learning_rate": 3.97584056716893e-06, + "loss": 0.86905909, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.60229492, + "step": 1298, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.06826787, + "auxiliary_loss_mlp": 0.01312488, + "balance_loss_clip": 0.06351642, + "balance_loss_mlp": 0.01258558, + "epoch": 0.07810010521569218, + "flos": 21840846168960.0, + "grad_norm": 2.2381109850938077, + "language_loss": 0.83600903, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.91740179, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.53979492, + "step": 1299, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0681142, + "auxiliary_loss_mlp": 0.01314166, + "balance_loss_clip": 0.06352274, + "balance_loss_mlp": 0.01260713, + "epoch": 0.07816022846836014, + "flos": 25088022645120.0, + "grad_norm": 2.404074331576357, + "language_loss": 0.89199561, + "learning_rate": 3.975719713068202e-06, + "loss": 0.97325152, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.53442383, + "step": 1300, + "time_per_iteration": 2.633734941482544 + }, + { + "auxiliary_loss_clip": 0.06848504, + "auxiliary_loss_mlp": 0.01319579, + "balance_loss_clip": 0.0636059, + "balance_loss_mlp": 0.0125964, + "epoch": 0.0782203517210281, + "flos": 40927197070080.0, + "grad_norm": 2.022718991796153, + "language_loss": 0.7445091, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82618994, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 4.87890625, + "router_z_loss_mlp": 0.59936523, + "step": 1301, + "time_per_iteration": 2.7330377101898193 + }, + { + "auxiliary_loss_clip": 0.06825704, + "auxiliary_loss_mlp": 0.01316028, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01261335, + "epoch": 0.07828047497369607, + "flos": 41181587665920.0, + "grad_norm": 2.1366155853756275, + "language_loss": 0.73607302, + "learning_rate": 3.97559855928952e-06, + "loss": 0.81749034, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.54736328, + "step": 1302, + "time_per_iteration": 2.781339168548584 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01324174, + "balance_loss_clip": 0.06356553, + "balance_loss_mlp": 0.01270124, + "epoch": 0.07834059822636405, + "flos": 23513951370240.0, + "grad_norm": 3.2246124193670433, + "language_loss": 0.84486687, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92631173, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54101562, + "step": 1303, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06814861, + "auxiliary_loss_mlp": 0.01309278, + "balance_loss_clip": 0.06343949, + "balance_loss_mlp": 0.01254919, + "epoch": 0.07840072147903202, + "flos": 20200165297920.0, + "grad_norm": 2.085099882897468, + "language_loss": 0.77159727, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.85283864, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.54394531, + "step": 1304, + "time_per_iteration": 2.5800909996032715 + }, + { + "auxiliary_loss_clip": 0.06828763, + "auxiliary_loss_mlp": 0.01313707, + "balance_loss_clip": 0.06349462, + "balance_loss_mlp": 0.01258799, + "epoch": 0.07846084473169998, + "flos": 21367172638080.0, + "grad_norm": 2.1177139553290734, + "language_loss": 0.7841258, + "learning_rate": 3.975416266765542e-06, + "loss": 0.86555046, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.54882812, + "step": 1305, + "time_per_iteration": 2.569558620452881 + }, + { + "auxiliary_loss_clip": 0.06855056, + "auxiliary_loss_mlp": 0.01321096, + "balance_loss_clip": 0.06367438, + "balance_loss_mlp": 0.01261348, + "epoch": 0.07852096798436796, + "flos": 25418037150720.0, + "grad_norm": 3.9004874062794057, + "language_loss": 0.88314414, + "learning_rate": 3.975355352771841e-06, + "loss": 0.96490562, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 4.87109375, + "router_z_loss_mlp": 0.59765625, + "step": 1306, + "time_per_iteration": 2.6575305461883545 + }, + { + "auxiliary_loss_clip": 0.06810681, + "auxiliary_loss_mlp": 0.01315273, + "balance_loss_clip": 0.06347391, + "balance_loss_mlp": 0.01263608, + "epoch": 0.07858109123703592, + "flos": 24578360985600.0, + "grad_norm": 4.395850337278793, + "language_loss": 0.93214571, + "learning_rate": 3.975294363872468e-06, + "loss": 1.01340532, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.51660156, + "step": 1307, + "time_per_iteration": 2.592435359954834 + }, + { + "auxiliary_loss_clip": 0.0682511, + "auxiliary_loss_mlp": 0.0131993, + "balance_loss_clip": 0.06345625, + "balance_loss_mlp": 0.01262566, + "epoch": 0.07864121448970389, + "flos": 20704250661120.0, + "grad_norm": 3.2307026300408683, + "language_loss": 0.8507998, + "learning_rate": 3.975233300069735e-06, + "loss": 0.93225014, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.57373047, + "step": 1308, + "time_per_iteration": 2.597881555557251 + }, + { + "auxiliary_loss_clip": 0.06792136, + "auxiliary_loss_mlp": 0.01314144, + "balance_loss_clip": 0.06338251, + "balance_loss_mlp": 0.01262598, + "epoch": 0.07870133774237187, + "flos": 22973207045760.0, + "grad_norm": 1.9389316858499817, + "language_loss": 0.79464692, + "learning_rate": 3.975172161365958e-06, + "loss": 0.87570971, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.515625, + "step": 1309, + "time_per_iteration": 2.599799871444702 + }, + { + "auxiliary_loss_clip": 0.06823064, + "auxiliary_loss_mlp": 0.01328854, + "balance_loss_clip": 0.06347175, + "balance_loss_mlp": 0.01272683, + "epoch": 0.07876146099503983, + "flos": 18848689194240.0, + "grad_norm": 2.5866734138361345, + "language_loss": 0.83378398, + "learning_rate": 3.975110947763453e-06, + "loss": 0.91530323, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.56176758, + "step": 1310, + "time_per_iteration": 2.5724973678588867 + }, + { + "auxiliary_loss_clip": 0.0678651, + "auxiliary_loss_mlp": 0.01315999, + "balance_loss_clip": 0.06338531, + "balance_loss_mlp": 0.01264811, + "epoch": 0.0788215842477078, + "flos": 23812631648640.0, + "grad_norm": 2.2765510373912683, + "language_loss": 0.76230896, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.84333402, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.51123047, + "step": 1311, + "time_per_iteration": 2.632310628890991 + }, + { + "auxiliary_loss_clip": 0.0680154, + "auxiliary_loss_mlp": 0.01319845, + "balance_loss_clip": 0.06336971, + "balance_loss_mlp": 0.01265009, + "epoch": 0.07888170750037576, + "flos": 21586329792000.0, + "grad_norm": 3.554782909684318, + "language_loss": 0.88360095, + "learning_rate": 3.974988295871553e-06, + "loss": 0.96481478, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.54882812, + "step": 1312, + "time_per_iteration": 2.7384519577026367 + }, + { + "auxiliary_loss_clip": 0.06786558, + "auxiliary_loss_mlp": 0.01318936, + "balance_loss_clip": 0.06334423, + "balance_loss_mlp": 0.01270561, + "epoch": 0.07894183075304374, + "flos": 19870947406080.0, + "grad_norm": 2.1624292410526773, + "language_loss": 0.84578681, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.92684174, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.48388672, + "step": 1313, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.06836893, + "auxiliary_loss_mlp": 0.01334789, + "balance_loss_clip": 0.06342322, + "balance_loss_mlp": 0.01270368, + "epoch": 0.07900195400571171, + "flos": 16148965368960.0, + "grad_norm": 3.8741474948490717, + "language_loss": 0.75254732, + "learning_rate": 3.97486534441264e-06, + "loss": 0.83426416, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.64404297, + "step": 1314, + "time_per_iteration": 2.532270669937134 + }, + { + "auxiliary_loss_clip": 0.06814209, + "auxiliary_loss_mlp": 0.01316459, + "balance_loss_clip": 0.06346349, + "balance_loss_mlp": 0.01263363, + "epoch": 0.07906207725837967, + "flos": 23736840030720.0, + "grad_norm": 2.0058439737114826, + "language_loss": 0.8208642, + "learning_rate": 3.974803756351379e-06, + "loss": 0.9021709, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.53125, + "step": 1315, + "time_per_iteration": 2.6085028648376465 + }, + { + "auxiliary_loss_clip": 0.06824351, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 0.06345295, + "balance_loss_mlp": 0.01265914, + "epoch": 0.07912220051104765, + "flos": 24322712578560.0, + "grad_norm": 1.9106769346900934, + "language_loss": 0.76054502, + "learning_rate": 3.974742093405362e-06, + "loss": 0.84204924, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 4.79296875, + "router_z_loss_mlp": 0.60083008, + "step": 1316, + "time_per_iteration": 2.586472749710083 + }, + { + "auxiliary_loss_clip": 0.0684765, + "auxiliary_loss_mlp": 0.01325754, + "balance_loss_clip": 0.06349534, + "balance_loss_mlp": 0.01266244, + "epoch": 0.07918232376371562, + "flos": 18886018988160.0, + "grad_norm": 4.4995832003619, + "language_loss": 0.68677568, + "learning_rate": 3.974680355576927e-06, + "loss": 0.76850969, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.59472656, + "step": 1317, + "time_per_iteration": 2.5489861965179443 + }, + { + "auxiliary_loss_clip": 0.06869859, + "auxiliary_loss_mlp": 0.01349552, + "balance_loss_clip": 0.06357804, + "balance_loss_mlp": 0.01281912, + "epoch": 0.07924244701638358, + "flos": 27382862741760.0, + "grad_norm": 3.047310758275923, + "language_loss": 0.75324464, + "learning_rate": 3.974618542868415e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.67700195, + "step": 1318, + "time_per_iteration": 2.5918128490448 + }, + { + "auxiliary_loss_clip": 0.06830844, + "auxiliary_loss_mlp": 0.01322573, + "balance_loss_clip": 0.06359029, + "balance_loss_mlp": 0.01269692, + "epoch": 0.07930257026905156, + "flos": 25127574572160.0, + "grad_norm": 1.9442087070115428, + "language_loss": 0.92534363, + "learning_rate": 3.97455665528217e-06, + "loss": 1.0068779, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.52929688, + "step": 1319, + "time_per_iteration": 3.993619203567505 + }, + { + "auxiliary_loss_clip": 0.06832193, + "auxiliary_loss_mlp": 0.0132254, + "balance_loss_clip": 0.06361841, + "balance_loss_mlp": 0.01272902, + "epoch": 0.07936269352171953, + "flos": 21840804241920.0, + "grad_norm": 2.144433650708689, + "language_loss": 0.81964207, + "learning_rate": 3.974494692820539e-06, + "loss": 0.90118945, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.49633789, + "step": 1320, + "time_per_iteration": 3.991323232650757 + }, + { + "auxiliary_loss_clip": 0.06858893, + "auxiliary_loss_mlp": 0.01331954, + "balance_loss_clip": 0.06361651, + "balance_loss_mlp": 0.01271801, + "epoch": 0.07942281677438749, + "flos": 16944477632640.0, + "grad_norm": 2.2380017082009576, + "language_loss": 0.71816266, + "learning_rate": 3.974432655485872e-06, + "loss": 0.80007118, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.60205078, + "step": 1321, + "time_per_iteration": 2.5437092781066895 + }, + { + "auxiliary_loss_clip": 0.06835557, + "auxiliary_loss_mlp": 0.01340758, + "balance_loss_clip": 0.06363731, + "balance_loss_mlp": 0.01282297, + "epoch": 0.07948294002705546, + "flos": 18992515927680.0, + "grad_norm": 2.7756488817332943, + "language_loss": 0.86391938, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94568253, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 4.7109375, + "router_z_loss_mlp": 0.5847168, + "step": 1322, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06837995, + "auxiliary_loss_mlp": 0.01339731, + "balance_loss_clip": 0.0636203, + "balance_loss_mlp": 0.01284681, + "epoch": 0.07954306327972344, + "flos": 21659983130880.0, + "grad_norm": 2.3668510426442144, + "language_loss": 0.92888951, + "learning_rate": 3.974308356206838e-06, + "loss": 1.01066673, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55053711, + "step": 1323, + "time_per_iteration": 3.9885079860687256 + }, + { + "auxiliary_loss_clip": 0.06820317, + "auxiliary_loss_mlp": 0.01320075, + "balance_loss_clip": 0.06361794, + "balance_loss_mlp": 0.01267504, + "epoch": 0.0796031865323914, + "flos": 23226717173760.0, + "grad_norm": 4.577989929254941, + "language_loss": 0.84617591, + "learning_rate": 3.974246094267187e-06, + "loss": 0.92757982, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.52661133, + "step": 1324, + "time_per_iteration": 2.575162410736084 + }, + { + "auxiliary_loss_clip": 0.0682738, + "auxiliary_loss_mlp": 0.01317412, + "balance_loss_clip": 0.06365715, + "balance_loss_mlp": 0.0126372, + "epoch": 0.07966330978505937, + "flos": 23301209053440.0, + "grad_norm": 4.146924168553952, + "language_loss": 0.81619465, + "learning_rate": 3.974183757463925e-06, + "loss": 0.89764249, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.53710938, + "step": 1325, + "time_per_iteration": 3.9960508346557617 + }, + { + "auxiliary_loss_clip": 0.06838783, + "auxiliary_loss_mlp": 0.01317663, + "balance_loss_clip": 0.06375229, + "balance_loss_mlp": 0.01262112, + "epoch": 0.07972343303772735, + "flos": 18368768534400.0, + "grad_norm": 3.482553532723253, + "language_loss": 0.90544963, + "learning_rate": 3.974121345799418e-06, + "loss": 0.98701411, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 4.63671875, + "router_z_loss_mlp": 0.55493164, + "step": 1326, + "time_per_iteration": 2.5401828289031982 + }, + { + "auxiliary_loss_clip": 0.0682137, + "auxiliary_loss_mlp": 0.01316322, + "balance_loss_clip": 0.06366737, + "balance_loss_mlp": 0.01263488, + "epoch": 0.07978355629039531, + "flos": 21768995692800.0, + "grad_norm": 2.4962093100336085, + "language_loss": 0.85295928, + "learning_rate": 3.974058859276032e-06, + "loss": 0.93433619, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.52856445, + "step": 1327, + "time_per_iteration": 2.6081485748291016 + }, + { + "auxiliary_loss_clip": 0.0686523, + "auxiliary_loss_mlp": 0.01320845, + "balance_loss_clip": 0.06376741, + "balance_loss_mlp": 0.01260119, + "epoch": 0.07984367954306328, + "flos": 18557178439680.0, + "grad_norm": 3.6856767873413077, + "language_loss": 0.82425529, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.90611601, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 4.88671875, + "router_z_loss_mlp": 0.60742188, + "step": 1328, + "time_per_iteration": 2.5963807106018066 + }, + { + "auxiliary_loss_clip": 0.06855517, + "auxiliary_loss_mlp": 0.01323941, + "balance_loss_clip": 0.06378672, + "balance_loss_mlp": 0.01266315, + "epoch": 0.07990380279573125, + "flos": 16908741066240.0, + "grad_norm": 2.810501054411486, + "language_loss": 0.77465802, + "learning_rate": 3.973933661662101e-06, + "loss": 0.85645258, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 4.76953125, + "router_z_loss_mlp": 0.57666016, + "step": 1329, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.06870389, + "auxiliary_loss_mlp": 0.01332359, + "balance_loss_clip": 0.06403654, + "balance_loss_mlp": 0.01277785, + "epoch": 0.07996392604839922, + "flos": 24105358287360.0, + "grad_norm": 3.2158550447724354, + "language_loss": 0.83423603, + "learning_rate": 3.973870950576305e-06, + "loss": 0.91626346, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.5456543, + "step": 1330, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.06871998, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 0.06395264, + "balance_loss_mlp": 0.01271893, + "epoch": 0.08002404930106718, + "flos": 14283257558400.0, + "grad_norm": 2.3593668670474375, + "language_loss": 0.91363919, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.99563241, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.5534668, + "step": 1331, + "time_per_iteration": 2.535022735595703 + }, + { + "auxiliary_loss_clip": 0.06886654, + "auxiliary_loss_mlp": 0.01331981, + "balance_loss_clip": 0.0639886, + "balance_loss_mlp": 0.01274283, + "epoch": 0.08008417255373516, + "flos": 40415732547840.0, + "grad_norm": 8.382777264974079, + "language_loss": 0.75984204, + "learning_rate": 3.973745303858942e-06, + "loss": 0.84202838, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.57666016, + "step": 1332, + "time_per_iteration": 2.798543691635132 + }, + { + "auxiliary_loss_clip": 0.06853566, + "auxiliary_loss_mlp": 0.01322273, + "balance_loss_clip": 0.06399575, + "balance_loss_mlp": 0.01270894, + "epoch": 0.08014429580640313, + "flos": 18484866766080.0, + "grad_norm": 3.077187306300229, + "language_loss": 0.84502465, + "learning_rate": 3.973682368232138e-06, + "loss": 0.92678297, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51318359, + "step": 1333, + "time_per_iteration": 2.55322003364563 + }, + { + "auxiliary_loss_clip": 0.06860092, + "auxiliary_loss_mlp": 0.01337998, + "balance_loss_clip": 0.06402323, + "balance_loss_mlp": 0.01283972, + "epoch": 0.0802044190590711, + "flos": 22059835614720.0, + "grad_norm": 5.409358557797253, + "language_loss": 0.77425432, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85623527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.54052734, + "step": 1334, + "time_per_iteration": 2.6176130771636963 + }, + { + "auxiliary_loss_clip": 0.06866166, + "auxiliary_loss_mlp": 0.01339925, + "balance_loss_clip": 0.06404187, + "balance_loss_mlp": 0.01288045, + "epoch": 0.08026454231173906, + "flos": 24579115672320.0, + "grad_norm": 2.171957673256717, + "language_loss": 0.82094586, + "learning_rate": 3.973556272454221e-06, + "loss": 0.90300679, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.51855469, + "step": 1335, + "time_per_iteration": 2.5995283126831055 + }, + { + "auxiliary_loss_clip": 0.0666078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06361455, + "balance_loss_mlp": 0.01275747, + "epoch": 0.08032466556440704, + "flos": 52597716940800.0, + "grad_norm": 0.7171954407460774, + "language_loss": 0.56264853, + "learning_rate": 3.973493112307889e-06, + "loss": 0.64227772, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.2644043, + "step": 1336, + "time_per_iteration": 3.246748447418213 + }, + { + "auxiliary_loss_clip": 0.06839207, + "auxiliary_loss_mlp": 0.01326336, + "balance_loss_clip": 0.06379974, + "balance_loss_mlp": 0.01274528, + "epoch": 0.080384788817075, + "flos": 23849500245120.0, + "grad_norm": 4.030100704660237, + "language_loss": 0.70582694, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78748238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.51757812, + "step": 1337, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.06838794, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 0.06387126, + "balance_loss_mlp": 0.01282751, + "epoch": 0.08044491206974297, + "flos": 25307640996480.0, + "grad_norm": 2.123866739454124, + "language_loss": 0.89543176, + "learning_rate": 3.973366567512453e-06, + "loss": 0.97716004, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.51245117, + "step": 1338, + "time_per_iteration": 2.657308340072632 + }, + { + "auxiliary_loss_clip": 0.0684766, + "auxiliary_loss_mlp": 0.01327669, + "balance_loss_clip": 0.06375088, + "balance_loss_mlp": 0.01275956, + "epoch": 0.08050503532241095, + "flos": 22382093617920.0, + "grad_norm": 3.2141596734882705, + "language_loss": 0.89268589, + "learning_rate": 3.973303182868147e-06, + "loss": 0.97443926, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.51708984, + "step": 1339, + "time_per_iteration": 2.592478036880493 + }, + { + "auxiliary_loss_clip": 0.06819817, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 0.06381136, + "balance_loss_mlp": 0.01272391, + "epoch": 0.08056515857507891, + "flos": 18375351079680.0, + "grad_norm": 3.0627135326619093, + "language_loss": 0.91607487, + "learning_rate": 3.973239723395988e-06, + "loss": 0.99744761, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.45019531, + "step": 1340, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.06633395, + "auxiliary_loss_mlp": 0.01308679, + "balance_loss_clip": 0.06341641, + "balance_loss_mlp": 0.01279282, + "epoch": 0.08062528182774688, + "flos": 51364938545280.0, + "grad_norm": 0.8608858843500025, + "language_loss": 0.65432441, + "learning_rate": 3.97317618909838e-06, + "loss": 0.73374522, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.29321289, + "step": 1341, + "time_per_iteration": 3.1589889526367188 + }, + { + "auxiliary_loss_clip": 0.06851779, + "auxiliary_loss_mlp": 0.01330947, + "balance_loss_clip": 0.06375904, + "balance_loss_mlp": 0.01274966, + "epoch": 0.08068540508041486, + "flos": 17604925914240.0, + "grad_norm": 3.057229978757205, + "language_loss": 0.9131434, + "learning_rate": 3.973112579977733e-06, + "loss": 0.99497068, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 4.7578125, + "router_z_loss_mlp": 0.55932617, + "step": 1342, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.06830276, + "auxiliary_loss_mlp": 0.01334079, + "balance_loss_clip": 0.06376267, + "balance_loss_mlp": 0.01283748, + "epoch": 0.08074552833308282, + "flos": 10565761714560.0, + "grad_norm": 4.354152160697022, + "language_loss": 0.78571475, + "learning_rate": 3.973048896036459e-06, + "loss": 0.86735827, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.50268555, + "step": 1343, + "time_per_iteration": 2.5960419178009033 + }, + { + "auxiliary_loss_clip": 0.06624237, + "auxiliary_loss_mlp": 0.01296199, + "balance_loss_clip": 0.06332739, + "balance_loss_mlp": 0.0127157, + "epoch": 0.08080565158575079, + "flos": 60859624245120.0, + "grad_norm": 0.7713053801929547, + "language_loss": 0.57751364, + "learning_rate": 3.972985137276974e-06, + "loss": 0.65671802, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24609375, + "step": 1344, + "time_per_iteration": 3.101456880569458 + }, + { + "auxiliary_loss_clip": 0.06825489, + "auxiliary_loss_mlp": 0.01321695, + "balance_loss_clip": 0.06367917, + "balance_loss_mlp": 0.01271937, + "epoch": 0.08086577483841875, + "flos": 18338188993920.0, + "grad_norm": 5.096262211204216, + "language_loss": 0.90334368, + "learning_rate": 3.972921303701695e-06, + "loss": 0.98481554, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.49780273, + "step": 1345, + "time_per_iteration": 2.586388349533081 + }, + { + "auxiliary_loss_clip": 0.0679345, + "auxiliary_loss_mlp": 0.013189, + "balance_loss_clip": 0.06356402, + "balance_loss_mlp": 0.01272527, + "epoch": 0.08092589809108673, + "flos": 21550048174080.0, + "grad_norm": 2.3072860000969437, + "language_loss": 0.89656544, + "learning_rate": 3.972857395313042e-06, + "loss": 0.97768891, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.46386719, + "step": 1346, + "time_per_iteration": 2.582712411880493 + }, + { + "auxiliary_loss_clip": 0.06790248, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 0.06353667, + "balance_loss_mlp": 0.0126734, + "epoch": 0.0809860213437547, + "flos": 22134662910720.0, + "grad_norm": 2.14729633171376, + "language_loss": 0.94647479, + "learning_rate": 3.972793412113439e-06, + "loss": 1.0275209, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 4.36328125, + "router_z_loss_mlp": 0.47021484, + "step": 1347, + "time_per_iteration": 2.625967025756836 + }, + { + "auxiliary_loss_clip": 0.06793564, + "auxiliary_loss_mlp": 0.01318721, + "balance_loss_clip": 0.06355867, + "balance_loss_mlp": 0.01268487, + "epoch": 0.08104614459642266, + "flos": 21731875534080.0, + "grad_norm": 1.9969105850097444, + "language_loss": 0.91454613, + "learning_rate": 3.972729354105312e-06, + "loss": 0.99566901, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 1348, + "time_per_iteration": 2.5634779930114746 + }, + { + "auxiliary_loss_clip": 0.06800284, + "auxiliary_loss_mlp": 0.01324319, + "balance_loss_clip": 0.06360676, + "balance_loss_mlp": 0.01274585, + "epoch": 0.08110626784909064, + "flos": 23958764369280.0, + "grad_norm": 1.9721965286660104, + "language_loss": 0.78618681, + "learning_rate": 3.97266522129109e-06, + "loss": 0.86743283, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.49731445, + "step": 1349, + "time_per_iteration": 2.6185498237609863 + }, + { + "auxiliary_loss_clip": 0.06800876, + "auxiliary_loss_mlp": 0.01313559, + "balance_loss_clip": 0.06350809, + "balance_loss_mlp": 0.01260082, + "epoch": 0.0811663911017586, + "flos": 19031648584320.0, + "grad_norm": 2.1691769325426407, + "language_loss": 0.90292668, + "learning_rate": 3.972601013673205e-06, + "loss": 0.98407102, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53491211, + "step": 1350, + "time_per_iteration": 2.5529837608337402 + }, + { + "auxiliary_loss_clip": 0.06778225, + "auxiliary_loss_mlp": 0.01313184, + "balance_loss_clip": 0.06345821, + "balance_loss_mlp": 0.01263522, + "epoch": 0.08122651435442657, + "flos": 15346744778880.0, + "grad_norm": 2.4256402439075524, + "language_loss": 0.84302771, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92394179, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49633789, + "step": 1351, + "time_per_iteration": 2.574605941772461 + }, + { + "auxiliary_loss_clip": 0.06780043, + "auxiliary_loss_mlp": 0.01313675, + "balance_loss_clip": 0.06340061, + "balance_loss_mlp": 0.01260365, + "epoch": 0.08128663760709455, + "flos": 23228226547200.0, + "grad_norm": 2.4241077577089296, + "language_loss": 0.77524561, + "learning_rate": 3.972472374036189e-06, + "loss": 0.85618269, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.53393555, + "step": 1352, + "time_per_iteration": 2.5638983249664307 + }, + { + "auxiliary_loss_clip": 0.06784214, + "auxiliary_loss_mlp": 0.01317971, + "balance_loss_clip": 0.06339107, + "balance_loss_mlp": 0.01263802, + "epoch": 0.08134676085976252, + "flos": 22972158869760.0, + "grad_norm": 2.0098905052691154, + "language_loss": 0.84226817, + "learning_rate": 3.972407942021935e-06, + "loss": 0.92329001, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.54223633, + "step": 1353, + "time_per_iteration": 2.64945125579834 + }, + { + "auxiliary_loss_clip": 0.06608218, + "auxiliary_loss_mlp": 0.01309213, + "balance_loss_clip": 0.06325812, + "balance_loss_mlp": 0.01278219, + "epoch": 0.08140688411243048, + "flos": 64338592642560.0, + "grad_norm": 0.8262871142057754, + "language_loss": 0.5983628, + "learning_rate": 3.972343435213775e-06, + "loss": 0.67753708, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.30957031, + "step": 1354, + "time_per_iteration": 3.1732943058013916 + }, + { + "auxiliary_loss_clip": 0.06774879, + "auxiliary_loss_mlp": 0.0130121, + "balance_loss_clip": 0.0634238, + "balance_loss_mlp": 0.01251332, + "epoch": 0.08146700736509845, + "flos": 22498401484800.0, + "grad_norm": 1.9500881523267093, + "language_loss": 0.84588456, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92664552, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.49853516, + "step": 1355, + "time_per_iteration": 2.6024701595306396 + }, + { + "auxiliary_loss_clip": 0.06776839, + "auxiliary_loss_mlp": 0.01312133, + "balance_loss_clip": 0.06341404, + "balance_loss_mlp": 0.01258727, + "epoch": 0.08152713061776642, + "flos": 20453885061120.0, + "grad_norm": 2.065670918937768, + "language_loss": 0.73062277, + "learning_rate": 3.972214197225521e-06, + "loss": 0.81151247, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.53393555, + "step": 1356, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.06800745, + "auxiliary_loss_mlp": 0.01315187, + "balance_loss_clip": 0.06343117, + "balance_loss_mlp": 0.01261305, + "epoch": 0.08158725387043439, + "flos": 23556983241600.0, + "grad_norm": 2.136910900826005, + "language_loss": 0.72079623, + "learning_rate": 3.972149466050329e-06, + "loss": 0.80195546, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.5390625, + "step": 1357, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.06792152, + "auxiliary_loss_mlp": 0.01312262, + "balance_loss_clip": 0.06345978, + "balance_loss_mlp": 0.01258093, + "epoch": 0.08164737712310235, + "flos": 22023763632000.0, + "grad_norm": 3.905031036394957, + "language_loss": 0.86688, + "learning_rate": 3.97208466009103e-06, + "loss": 0.94792414, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.54150391, + "step": 1358, + "time_per_iteration": 4.091388940811157 + }, + { + "auxiliary_loss_clip": 0.0678063, + "auxiliary_loss_mlp": 0.01322843, + "balance_loss_clip": 0.06336431, + "balance_loss_mlp": 0.01268985, + "epoch": 0.08170750037577033, + "flos": 23374568903040.0, + "grad_norm": 2.183092150408785, + "language_loss": 1.0464294, + "learning_rate": 3.972019779350084e-06, + "loss": 1.12746406, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.53857422, + "step": 1359, + "time_per_iteration": 2.638028860092163 + }, + { + "auxiliary_loss_clip": 0.06798591, + "auxiliary_loss_mlp": 0.01334932, + "balance_loss_clip": 0.06339104, + "balance_loss_mlp": 0.01274732, + "epoch": 0.0817676236284383, + "flos": 28404743610240.0, + "grad_norm": 2.2550025008974335, + "language_loss": 0.86049831, + "learning_rate": 3.971954823829951e-06, + "loss": 0.9418335, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.60229492, + "step": 1360, + "time_per_iteration": 4.079089164733887 + }, + { + "auxiliary_loss_clip": 0.06791367, + "auxiliary_loss_mlp": 0.01327265, + "balance_loss_clip": 0.06338443, + "balance_loss_mlp": 0.01274146, + "epoch": 0.08182774688110626, + "flos": 19215027244800.0, + "grad_norm": 8.376592298607987, + "language_loss": 0.74940681, + "learning_rate": 3.971889793533093e-06, + "loss": 0.83059311, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.53125, + "step": 1361, + "time_per_iteration": 2.6070094108581543 + }, + { + "auxiliary_loss_clip": 0.06780887, + "auxiliary_loss_mlp": 0.01320749, + "balance_loss_clip": 0.06343664, + "balance_loss_mlp": 0.01270443, + "epoch": 0.08188787013377424, + "flos": 22790750780160.0, + "grad_norm": 2.8909747766913574, + "language_loss": 0.79067749, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87169385, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.50244141, + "step": 1362, + "time_per_iteration": 2.575406074523926 + }, + { + "auxiliary_loss_clip": 0.06776625, + "auxiliary_loss_mlp": 0.01317112, + "balance_loss_clip": 0.06338399, + "balance_loss_mlp": 0.01266543, + "epoch": 0.08194799338644221, + "flos": 16473026234880.0, + "grad_norm": 2.5840358465526787, + "language_loss": 0.74518561, + "learning_rate": 3.971759508619069e-06, + "loss": 0.826123, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50537109, + "step": 1363, + "time_per_iteration": 3.9524402618408203 + }, + { + "auxiliary_loss_clip": 0.06785508, + "auxiliary_loss_mlp": 0.01321755, + "balance_loss_clip": 0.06342393, + "balance_loss_mlp": 0.01265846, + "epoch": 0.08200811663911017, + "flos": 23920218691200.0, + "grad_norm": 2.478943630227512, + "language_loss": 0.79175317, + "learning_rate": 3.971694254006844e-06, + "loss": 0.87282574, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.55859375, + "step": 1364, + "time_per_iteration": 2.607170343399048 + }, + { + "auxiliary_loss_clip": 0.06783722, + "auxiliary_loss_mlp": 0.01316868, + "balance_loss_clip": 0.06340142, + "balance_loss_mlp": 0.01262867, + "epoch": 0.08206823989177814, + "flos": 17902641870720.0, + "grad_norm": 2.8411268969790275, + "language_loss": 0.83563399, + "learning_rate": 3.971628924627776e-06, + "loss": 0.91663992, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.54003906, + "step": 1365, + "time_per_iteration": 4.020315647125244 + }, + { + "auxiliary_loss_clip": 0.06767645, + "auxiliary_loss_mlp": 0.01324198, + "balance_loss_clip": 0.06336691, + "balance_loss_mlp": 0.01274917, + "epoch": 0.08212836314444612, + "flos": 22094272442880.0, + "grad_norm": 1.9744562731627089, + "language_loss": 0.83576512, + "learning_rate": 3.97156352048434e-06, + "loss": 0.91668355, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.49243164, + "step": 1366, + "time_per_iteration": 2.5904746055603027 + }, + { + "auxiliary_loss_clip": 0.06785953, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 0.06344087, + "balance_loss_mlp": 0.01269963, + "epoch": 0.08218848639711408, + "flos": 17602326437760.0, + "grad_norm": 2.595099293602591, + "language_loss": 0.84101415, + "learning_rate": 3.97149804157902e-06, + "loss": 0.92208421, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.51074219, + "step": 1367, + "time_per_iteration": 2.547091007232666 + }, + { + "auxiliary_loss_clip": 0.06812844, + "auxiliary_loss_mlp": 0.01336623, + "balance_loss_clip": 0.06357861, + "balance_loss_mlp": 0.01283504, + "epoch": 0.08224860964978205, + "flos": 17863551141120.0, + "grad_norm": 3.794710967606561, + "language_loss": 0.85955203, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.94104671, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.53100586, + "step": 1368, + "time_per_iteration": 2.6025125980377197 + }, + { + "auxiliary_loss_clip": 0.06754048, + "auxiliary_loss_mlp": 0.01305347, + "balance_loss_clip": 0.06340475, + "balance_loss_mlp": 0.01259881, + "epoch": 0.08230873290245003, + "flos": 25234406928000.0, + "grad_norm": 1.7485210372757418, + "language_loss": 0.82751203, + "learning_rate": 3.971366859492653e-06, + "loss": 0.90810603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45458984, + "step": 1369, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.06772825, + "auxiliary_loss_mlp": 0.01314688, + "balance_loss_clip": 0.06341462, + "balance_loss_mlp": 0.01264811, + "epoch": 0.08236885615511799, + "flos": 31768144099200.0, + "grad_norm": 4.8921113569353425, + "language_loss": 0.77775633, + "learning_rate": 3.971301156316582e-06, + "loss": 0.85863149, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.49902344, + "step": 1370, + "time_per_iteration": 2.685317039489746 + }, + { + "auxiliary_loss_clip": 0.06783543, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 0.06345622, + "balance_loss_mlp": 0.01265153, + "epoch": 0.08242897940778596, + "flos": 23192615761920.0, + "grad_norm": 2.053394395942029, + "language_loss": 0.76803637, + "learning_rate": 3.971235378388573e-06, + "loss": 0.84904444, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.52124023, + "step": 1371, + "time_per_iteration": 2.6406354904174805 + }, + { + "auxiliary_loss_clip": 0.06769266, + "auxiliary_loss_mlp": 0.01317025, + "balance_loss_clip": 0.06335683, + "balance_loss_mlp": 0.01267625, + "epoch": 0.08248910266045394, + "flos": 34499327932800.0, + "grad_norm": 3.0324747361967557, + "language_loss": 0.72827047, + "learning_rate": 3.971169525711122e-06, + "loss": 0.80913335, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.49438477, + "step": 1372, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.06798708, + "auxiliary_loss_mlp": 0.01317216, + "balance_loss_clip": 0.06345405, + "balance_loss_mlp": 0.01260854, + "epoch": 0.0825492259131219, + "flos": 13440059521920.0, + "grad_norm": 3.0329353190283075, + "language_loss": 0.9010855, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.98224467, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.56420898, + "step": 1373, + "time_per_iteration": 2.5570318698883057 + }, + { + "auxiliary_loss_clip": 0.06774755, + "auxiliary_loss_mlp": 0.01317124, + "balance_loss_clip": 0.0634156, + "balance_loss_mlp": 0.01267056, + "epoch": 0.08260934916578987, + "flos": 25819608643200.0, + "grad_norm": 3.0603308178325657, + "language_loss": 0.84582615, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92674494, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.50024414, + "step": 1374, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.06626149, + "auxiliary_loss_mlp": 0.0129603, + "balance_loss_clip": 0.06341976, + "balance_loss_mlp": 0.01265918, + "epoch": 0.08266947241845783, + "flos": 63478609061760.0, + "grad_norm": 0.8009341803089134, + "language_loss": 0.60659707, + "learning_rate": 3.970971519207095e-06, + "loss": 0.68581879, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30053711, + "step": 1375, + "time_per_iteration": 3.177459716796875 + }, + { + "auxiliary_loss_clip": 0.06618689, + "auxiliary_loss_mlp": 0.01286424, + "balance_loss_clip": 0.06334813, + "balance_loss_mlp": 0.01256718, + "epoch": 0.08272959567112581, + "flos": 70013855606400.0, + "grad_norm": 0.886054791003263, + "language_loss": 0.62275791, + "learning_rate": 3.970905367556871e-06, + "loss": 0.70180905, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29638672, + "step": 1376, + "time_per_iteration": 3.1206676959991455 + }, + { + "auxiliary_loss_clip": 0.06771185, + "auxiliary_loss_mlp": 0.01316915, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01268611, + "epoch": 0.08278971892379378, + "flos": 20419574014080.0, + "grad_norm": 2.5198182509144735, + "language_loss": 0.84768277, + "learning_rate": 3.970839141169718e-06, + "loss": 0.92856377, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.48266602, + "step": 1377, + "time_per_iteration": 2.6820216178894043 + }, + { + "auxiliary_loss_clip": 0.06764729, + "auxiliary_loss_mlp": 0.01308146, + "balance_loss_clip": 0.06342821, + "balance_loss_mlp": 0.0126144, + "epoch": 0.08284984217646174, + "flos": 26257461753600.0, + "grad_norm": 2.286420184169047, + "language_loss": 0.86602247, + "learning_rate": 3.970772840048147e-06, + "loss": 0.94675124, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.46728516, + "step": 1378, + "time_per_iteration": 2.5983967781066895 + }, + { + "auxiliary_loss_clip": 0.06779523, + "auxiliary_loss_mlp": 0.01324128, + "balance_loss_clip": 0.06348801, + "balance_loss_mlp": 0.01275396, + "epoch": 0.08290996542912972, + "flos": 27201370798080.0, + "grad_norm": 4.155383498543994, + "language_loss": 0.9020921, + "learning_rate": 3.970706464194672e-06, + "loss": 0.98312867, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.48779297, + "step": 1379, + "time_per_iteration": 2.6558284759521484 + }, + { + "auxiliary_loss_clip": 0.06771149, + "auxiliary_loss_mlp": 0.01307486, + "balance_loss_clip": 0.06347619, + "balance_loss_mlp": 0.01261972, + "epoch": 0.08297008868179769, + "flos": 38627367655680.0, + "grad_norm": 2.766384510146163, + "language_loss": 0.80964148, + "learning_rate": 3.970640013611812e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45483398, + "step": 1380, + "time_per_iteration": 2.7228140830993652 + }, + { + "auxiliary_loss_clip": 0.06759404, + "auxiliary_loss_mlp": 0.01314619, + "balance_loss_clip": 0.06340429, + "balance_loss_mlp": 0.01265576, + "epoch": 0.08303021193446565, + "flos": 19980924289920.0, + "grad_norm": 2.7915027065661593, + "language_loss": 0.88561881, + "learning_rate": 3.970573488302083e-06, + "loss": 0.96635896, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.49023438, + "step": 1381, + "time_per_iteration": 2.6598143577575684 + }, + { + "auxiliary_loss_clip": 0.06800985, + "auxiliary_loss_mlp": 0.0131809, + "balance_loss_clip": 0.06359053, + "balance_loss_mlp": 0.01265972, + "epoch": 0.08309033518713363, + "flos": 13667769792000.0, + "grad_norm": 3.693105114641136, + "language_loss": 0.91473186, + "learning_rate": 3.970506888268011e-06, + "loss": 0.99592257, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.52148438, + "step": 1382, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.06790116, + "auxiliary_loss_mlp": 0.01312438, + "balance_loss_clip": 0.06361018, + "balance_loss_mlp": 0.01263229, + "epoch": 0.0831504584398016, + "flos": 17974492346880.0, + "grad_norm": 2.495217268396043, + "language_loss": 0.78734231, + "learning_rate": 3.970440213512121e-06, + "loss": 0.86836791, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49243164, + "step": 1383, + "time_per_iteration": 2.625793695449829 + }, + { + "auxiliary_loss_clip": 0.06786636, + "auxiliary_loss_mlp": 0.01320002, + "balance_loss_clip": 0.06359254, + "balance_loss_mlp": 0.01273797, + "epoch": 0.08321058169246956, + "flos": 22607959098240.0, + "grad_norm": 2.963836437118746, + "language_loss": 0.85324878, + "learning_rate": 3.97037346403694e-06, + "loss": 0.93431515, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.46240234, + "step": 1384, + "time_per_iteration": 2.6376733779907227 + }, + { + "auxiliary_loss_clip": 0.06818897, + "auxiliary_loss_mlp": 0.01334638, + "balance_loss_clip": 0.06359202, + "balance_loss_mlp": 0.01276106, + "epoch": 0.08327070494513754, + "flos": 22855976784000.0, + "grad_norm": 3.1601990232642225, + "language_loss": 0.86789215, + "learning_rate": 3.970306639845e-06, + "loss": 0.94942749, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.58569336, + "step": 1385, + "time_per_iteration": 2.568554639816284 + }, + { + "auxiliary_loss_clip": 0.06798602, + "auxiliary_loss_mlp": 0.0132055, + "balance_loss_clip": 0.06352767, + "balance_loss_mlp": 0.01267978, + "epoch": 0.0833308281978055, + "flos": 22789451041920.0, + "grad_norm": 2.43217008586481, + "language_loss": 0.71394652, + "learning_rate": 3.970239740938835e-06, + "loss": 0.795138, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.52563477, + "step": 1386, + "time_per_iteration": 2.6096982955932617 + }, + { + "auxiliary_loss_clip": 0.06791467, + "auxiliary_loss_mlp": 0.01322523, + "balance_loss_clip": 0.06356902, + "balance_loss_mlp": 0.01273099, + "epoch": 0.08339095145047347, + "flos": 20818713738240.0, + "grad_norm": 2.3900622326762133, + "language_loss": 0.84172809, + "learning_rate": 3.97017276732098e-06, + "loss": 0.92286795, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.49389648, + "step": 1387, + "time_per_iteration": 2.575343132019043 + }, + { + "auxiliary_loss_clip": 0.06797379, + "auxiliary_loss_mlp": 0.01318956, + "balance_loss_clip": 0.06353064, + "balance_loss_mlp": 0.01265598, + "epoch": 0.08345107470314143, + "flos": 18521274165120.0, + "grad_norm": 5.434584550719809, + "language_loss": 0.79640985, + "learning_rate": 3.970105718993978e-06, + "loss": 0.87757325, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.53369141, + "step": 1388, + "time_per_iteration": 2.567218780517578 + }, + { + "auxiliary_loss_clip": 0.06780161, + "auxiliary_loss_mlp": 0.01317075, + "balance_loss_clip": 0.06354657, + "balance_loss_mlp": 0.0126932, + "epoch": 0.08351119795580941, + "flos": 18813623460480.0, + "grad_norm": 2.631761877844796, + "language_loss": 0.82141799, + "learning_rate": 3.970038595960369e-06, + "loss": 0.90239036, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.47827148, + "step": 1389, + "time_per_iteration": 2.5653841495513916 + }, + { + "auxiliary_loss_clip": 0.06804, + "auxiliary_loss_mlp": 0.01321664, + "balance_loss_clip": 0.06357203, + "balance_loss_mlp": 0.01264014, + "epoch": 0.08357132120847738, + "flos": 18447662753280.0, + "grad_norm": 4.4672809610096005, + "language_loss": 0.89901805, + "learning_rate": 3.969971398222699e-06, + "loss": 0.9802748, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.57666016, + "step": 1390, + "time_per_iteration": 2.5599520206451416 + }, + { + "auxiliary_loss_clip": 0.06784607, + "auxiliary_loss_mlp": 0.01318322, + "balance_loss_clip": 0.06351756, + "balance_loss_mlp": 0.01268469, + "epoch": 0.08363144446114534, + "flos": 25929585527040.0, + "grad_norm": 2.0099549817565, + "language_loss": 0.88354278, + "learning_rate": 3.969904125783517e-06, + "loss": 0.96457207, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.49853516, + "step": 1391, + "time_per_iteration": 2.611985921859741 + }, + { + "auxiliary_loss_clip": 0.06815389, + "auxiliary_loss_mlp": 0.01329624, + "balance_loss_clip": 0.06354406, + "balance_loss_mlp": 0.01268851, + "epoch": 0.08369156771381332, + "flos": 18047223290880.0, + "grad_norm": 3.4660821416963805, + "language_loss": 0.90262675, + "learning_rate": 3.969836778645371e-06, + "loss": 0.98407698, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.60791016, + "step": 1392, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.06784143, + "auxiliary_loss_mlp": 0.01319854, + "balance_loss_clip": 0.06346482, + "balance_loss_mlp": 0.01270025, + "epoch": 0.08375169096648129, + "flos": 22681822072320.0, + "grad_norm": 4.398591622405809, + "language_loss": 0.82388842, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90492845, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.4987793, + "step": 1393, + "time_per_iteration": 2.596484899520874 + }, + { + "auxiliary_loss_clip": 0.06777762, + "auxiliary_loss_mlp": 0.01325984, + "balance_loss_clip": 0.06353533, + "balance_loss_mlp": 0.01276679, + "epoch": 0.08381181421914925, + "flos": 26110238929920.0, + "grad_norm": 2.2804276198164386, + "language_loss": 0.86896241, + "learning_rate": 3.969701860282415e-06, + "loss": 0.94999981, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.49291992, + "step": 1394, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.06795013, + "auxiliary_loss_mlp": 0.01318108, + "balance_loss_clip": 0.06360835, + "balance_loss_mlp": 0.01267063, + "epoch": 0.08387193747181723, + "flos": 20635796275200.0, + "grad_norm": 2.9482675367733306, + "language_loss": 0.84974355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.93087476, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.51098633, + "step": 1395, + "time_per_iteration": 2.579622745513916 + }, + { + "auxiliary_loss_clip": 0.06798401, + "auxiliary_loss_mlp": 0.01311309, + "balance_loss_clip": 0.06349191, + "balance_loss_mlp": 0.01256282, + "epoch": 0.0839320607244852, + "flos": 13448193367680.0, + "grad_norm": 3.513957453818194, + "language_loss": 0.85002828, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93112534, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.55078125, + "step": 1396, + "time_per_iteration": 2.5521080493927 + }, + { + "auxiliary_loss_clip": 0.06784061, + "auxiliary_loss_mlp": 0.0131232, + "balance_loss_clip": 0.06356047, + "balance_loss_mlp": 0.0126261, + "epoch": 0.08399218397715316, + "flos": 23484000735360.0, + "grad_norm": 4.145800578493811, + "language_loss": 0.79030329, + "learning_rate": 3.969498922559703e-06, + "loss": 0.87126708, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49682617, + "step": 1397, + "time_per_iteration": 4.026551961898804 + }, + { + "auxiliary_loss_clip": 0.06777123, + "auxiliary_loss_mlp": 0.01309701, + "balance_loss_clip": 0.06349255, + "balance_loss_mlp": 0.01258655, + "epoch": 0.08405230722982113, + "flos": 25927698810240.0, + "grad_norm": 3.1837358420566173, + "language_loss": 0.79802477, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87889296, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.51123047, + "step": 1398, + "time_per_iteration": 2.6027841567993164 + }, + { + "auxiliary_loss_clip": 0.06793746, + "auxiliary_loss_mlp": 0.01312625, + "balance_loss_clip": 0.06375143, + "balance_loss_mlp": 0.01265299, + "epoch": 0.0841124304824891, + "flos": 17973192608640.0, + "grad_norm": 3.0716222673767404, + "language_loss": 0.96745825, + "learning_rate": 3.969363257322304e-06, + "loss": 1.048522, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.47290039, + "step": 1399, + "time_per_iteration": 3.9915521144866943 + }, + { + "auxiliary_loss_clip": 0.06813341, + "auxiliary_loss_mlp": 0.01316281, + "balance_loss_clip": 0.06352973, + "balance_loss_mlp": 0.01258012, + "epoch": 0.08417255373515707, + "flos": 25636733107200.0, + "grad_norm": 6.6751707009018055, + "language_loss": 0.83959824, + "learning_rate": 3.96929531268464e-06, + "loss": 0.92089444, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.58300781, + "step": 1400, + "time_per_iteration": 2.6097705364227295 + }, + { + "auxiliary_loss_clip": 0.06801295, + "auxiliary_loss_mlp": 0.01317439, + "balance_loss_clip": 0.06362335, + "balance_loss_mlp": 0.01264868, + "epoch": 0.08423267698782504, + "flos": 26256874775040.0, + "grad_norm": 2.3612401801911487, + "language_loss": 0.8841815, + "learning_rate": 3.969227293371099e-06, + "loss": 0.96536887, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.52539062, + "step": 1401, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.06806403, + "auxiliary_loss_mlp": 0.01316426, + "balance_loss_clip": 0.0637629, + "balance_loss_mlp": 0.01264594, + "epoch": 0.08429280024049302, + "flos": 20125757272320.0, + "grad_norm": 2.1446358728684753, + "language_loss": 0.90116793, + "learning_rate": 3.969159199384263e-06, + "loss": 0.98239625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.51733398, + "step": 1402, + "time_per_iteration": 4.018750905990601 + }, + { + "auxiliary_loss_clip": 0.067935, + "auxiliary_loss_mlp": 0.01308153, + "balance_loss_clip": 0.06370865, + "balance_loss_mlp": 0.01261519, + "epoch": 0.08435292349316098, + "flos": 42934593335040.0, + "grad_norm": 3.3097945414979324, + "language_loss": 0.91613716, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.99715364, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.46655273, + "step": 1403, + "time_per_iteration": 2.75314998626709 + }, + { + "auxiliary_loss_clip": 0.06802634, + "auxiliary_loss_mlp": 0.01312918, + "balance_loss_clip": 0.0636553, + "balance_loss_mlp": 0.01259679, + "epoch": 0.08441304674582895, + "flos": 22863984848640.0, + "grad_norm": 2.1842752098613696, + "language_loss": 0.8341198, + "learning_rate": 3.969022787401033e-06, + "loss": 0.91527522, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5324707, + "step": 1404, + "time_per_iteration": 4.128188371658325 + }, + { + "auxiliary_loss_clip": 0.06814778, + "auxiliary_loss_mlp": 0.01317505, + "balance_loss_clip": 0.06364593, + "balance_loss_mlp": 0.01263884, + "epoch": 0.08447316999849692, + "flos": 18703436941440.0, + "grad_norm": 2.408821192970914, + "language_loss": 0.85791099, + "learning_rate": 3.968954469409811e-06, + "loss": 0.93923384, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.53588867, + "step": 1405, + "time_per_iteration": 2.6186141967773438 + }, + { + "auxiliary_loss_clip": 0.06785356, + "auxiliary_loss_mlp": 0.01307288, + "balance_loss_clip": 0.06358731, + "balance_loss_mlp": 0.01261488, + "epoch": 0.08453329325116489, + "flos": 25491061584000.0, + "grad_norm": 2.376275583502495, + "language_loss": 0.82456648, + "learning_rate": 3.968886076755639e-06, + "loss": 0.9054929, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.45825195, + "step": 1406, + "time_per_iteration": 2.620391845703125 + }, + { + "auxiliary_loss_clip": 0.06791453, + "auxiliary_loss_mlp": 0.01321291, + "balance_loss_clip": 0.06356591, + "balance_loss_mlp": 0.01271461, + "epoch": 0.08459341650383286, + "flos": 20925839583360.0, + "grad_norm": 2.994077443847897, + "language_loss": 0.81261843, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8937459, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.49853516, + "step": 1407, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.06790854, + "auxiliary_loss_mlp": 0.01321715, + "balance_loss_clip": 0.06351606, + "balance_loss_mlp": 0.01269525, + "epoch": 0.08465353975650082, + "flos": 13048215102720.0, + "grad_norm": 4.665844838977458, + "language_loss": 0.93093699, + "learning_rate": 3.968749067468819e-06, + "loss": 1.01206267, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.52197266, + "step": 1408, + "time_per_iteration": 2.5401058197021484 + }, + { + "auxiliary_loss_clip": 0.06614841, + "auxiliary_loss_mlp": 0.0131788, + "balance_loss_clip": 0.06340891, + "balance_loss_mlp": 0.01289985, + "epoch": 0.0847136630091688, + "flos": 60896912112000.0, + "grad_norm": 0.8563868358173309, + "language_loss": 0.62132567, + "learning_rate": 3.968680450841368e-06, + "loss": 0.7006529, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27954102, + "step": 1409, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.06755531, + "auxiliary_loss_mlp": 0.01311791, + "balance_loss_clip": 0.06338526, + "balance_loss_mlp": 0.01266802, + "epoch": 0.08477378626183676, + "flos": 22051743696000.0, + "grad_norm": 2.2146573769232916, + "language_loss": 0.88621575, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96688896, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44995117, + "step": 1410, + "time_per_iteration": 2.5771710872650146 + }, + { + "auxiliary_loss_clip": 0.06769306, + "auxiliary_loss_mlp": 0.01318797, + "balance_loss_clip": 0.06336072, + "balance_loss_mlp": 0.01268253, + "epoch": 0.08483390951450473, + "flos": 16695537552000.0, + "grad_norm": 2.3714211979189987, + "language_loss": 0.76187658, + "learning_rate": 3.968542993631388e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50585938, + "step": 1411, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06605848, + "auxiliary_loss_mlp": 0.01302084, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01268491, + "epoch": 0.08489403276717271, + "flos": 51604430313600.0, + "grad_norm": 0.8982882759913209, + "language_loss": 0.57100856, + "learning_rate": 3.968474153054073e-06, + "loss": 0.65008789, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33618164, + "step": 1412, + "time_per_iteration": 3.1449196338653564 + }, + { + "auxiliary_loss_clip": 0.06776647, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 0.06348051, + "balance_loss_mlp": 0.01261393, + "epoch": 0.08495415601984067, + "flos": 17098031439360.0, + "grad_norm": 4.4528738806487, + "language_loss": 0.91184032, + "learning_rate": 3.96840523783202e-06, + "loss": 0.99271137, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.49145508, + "step": 1413, + "time_per_iteration": 2.5736677646636963 + }, + { + "auxiliary_loss_clip": 0.06762269, + "auxiliary_loss_mlp": 0.01310346, + "balance_loss_clip": 0.06341726, + "balance_loss_mlp": 0.01261685, + "epoch": 0.08501427927250864, + "flos": 23155034405760.0, + "grad_norm": 2.1658829941413997, + "language_loss": 0.9017415, + "learning_rate": 3.968336247967844e-06, + "loss": 0.98246765, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48706055, + "step": 1414, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.06782193, + "auxiliary_loss_mlp": 0.01303484, + "balance_loss_clip": 0.06352735, + "balance_loss_mlp": 0.01258423, + "epoch": 0.08507440252517662, + "flos": 19069649210880.0, + "grad_norm": 2.082765030572706, + "language_loss": 0.79920703, + "learning_rate": 3.96826718346416e-06, + "loss": 0.88006377, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.45068359, + "step": 1415, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.06759159, + "auxiliary_loss_mlp": 0.01306699, + "balance_loss_clip": 0.06336564, + "balance_loss_mlp": 0.01259492, + "epoch": 0.08513452577784458, + "flos": 60195249550080.0, + "grad_norm": 8.264598666401978, + "language_loss": 0.72300386, + "learning_rate": 3.968198044323587e-06, + "loss": 0.80366242, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.47216797, + "step": 1416, + "time_per_iteration": 2.9444239139556885 + }, + { + "auxiliary_loss_clip": 0.06803774, + "auxiliary_loss_mlp": 0.01317561, + "balance_loss_clip": 0.0635466, + "balance_loss_mlp": 0.01264608, + "epoch": 0.08519464903051255, + "flos": 27315917729280.0, + "grad_norm": 2.5149113887395407, + "language_loss": 0.77021283, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85142624, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.5300293, + "step": 1417, + "time_per_iteration": 2.619328260421753 + }, + { + "auxiliary_loss_clip": 0.06779526, + "auxiliary_loss_mlp": 0.01310101, + "balance_loss_clip": 0.06341187, + "balance_loss_mlp": 0.01259341, + "epoch": 0.08525477228318051, + "flos": 20272644679680.0, + "grad_norm": 2.930615198621333, + "language_loss": 0.84423447, + "learning_rate": 3.968059542142265e-06, + "loss": 0.92513078, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 1418, + "time_per_iteration": 2.5782899856567383 + }, + { + "auxiliary_loss_clip": 0.06606524, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06333332, + "balance_loss_mlp": 0.01249931, + "epoch": 0.08531489553584849, + "flos": 67633580672640.0, + "grad_norm": 0.9458512268838744, + "language_loss": 0.5659793, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.64478552, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24157715, + "step": 1419, + "time_per_iteration": 3.1296868324279785 + }, + { + "auxiliary_loss_clip": 0.06790996, + "auxiliary_loss_mlp": 0.01306783, + "balance_loss_clip": 0.06354627, + "balance_loss_mlp": 0.01259004, + "epoch": 0.08537501878851646, + "flos": 27534362123520.0, + "grad_norm": 2.6126551890980076, + "language_loss": 0.72536588, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80634367, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4777832, + "step": 1420, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.06772007, + "auxiliary_loss_mlp": 0.01307483, + "balance_loss_clip": 0.06343359, + "balance_loss_mlp": 0.01257272, + "epoch": 0.08543514204118442, + "flos": 22790918488320.0, + "grad_norm": 2.3388359886837917, + "language_loss": 0.89903885, + "learning_rate": 3.967851229159252e-06, + "loss": 0.97983378, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.27929688, + "router_z_loss_mlp": 0.50244141, + "step": 1421, + "time_per_iteration": 2.5863590240478516 + }, + { + "auxiliary_loss_clip": 0.06597036, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06325173, + "balance_loss_mlp": 0.01249919, + "epoch": 0.0854952652938524, + "flos": 61010872064640.0, + "grad_norm": 0.7745811005373293, + "language_loss": 0.63692141, + "learning_rate": 3.967781642252502e-06, + "loss": 0.71565151, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26098633, + "step": 1422, + "time_per_iteration": 3.19461989402771 + }, + { + "auxiliary_loss_clip": 0.06765623, + "auxiliary_loss_mlp": 0.01311314, + "balance_loss_clip": 0.06344545, + "balance_loss_mlp": 0.01266444, + "epoch": 0.08555538854652037, + "flos": 28045575083520.0, + "grad_norm": 3.3087422543747205, + "language_loss": 0.84878761, + "learning_rate": 3.967711980727276e-06, + "loss": 0.92955703, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.44873047, + "step": 1423, + "time_per_iteration": 2.6554226875305176 + }, + { + "auxiliary_loss_clip": 0.06776007, + "auxiliary_loss_mlp": 0.01303967, + "balance_loss_clip": 0.06351057, + "balance_loss_mlp": 0.01261314, + "epoch": 0.08561551179918833, + "flos": 23515293035520.0, + "grad_norm": 2.569087931646671, + "language_loss": 0.7765131, + "learning_rate": 3.967642244586213e-06, + "loss": 0.85731286, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42602539, + "step": 1424, + "time_per_iteration": 2.7058026790618896 + }, + { + "auxiliary_loss_clip": 0.06765693, + "auxiliary_loss_mlp": 0.01310667, + "balance_loss_clip": 0.06343248, + "balance_loss_mlp": 0.01265988, + "epoch": 0.08567563505185631, + "flos": 17932005527040.0, + "grad_norm": 1.9981101747379681, + "language_loss": 0.78279495, + "learning_rate": 3.96757243383196e-06, + "loss": 0.86355859, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.44677734, + "step": 1425, + "time_per_iteration": 2.575941801071167 + }, + { + "auxiliary_loss_clip": 0.06768522, + "auxiliary_loss_mlp": 0.01310756, + "balance_loss_clip": 0.06347974, + "balance_loss_mlp": 0.01264074, + "epoch": 0.08573575830452428, + "flos": 19725695153280.0, + "grad_norm": 2.337358950389625, + "language_loss": 0.95636088, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.03715372, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 4.20507812, + "router_z_loss_mlp": 0.46679688, + "step": 1426, + "time_per_iteration": 2.5706772804260254 + }, + { + "auxiliary_loss_clip": 0.06791019, + "auxiliary_loss_mlp": 0.01318941, + "balance_loss_clip": 0.06355577, + "balance_loss_mlp": 0.01267776, + "epoch": 0.08579588155719224, + "flos": 17937414115200.0, + "grad_norm": 3.6077969135085945, + "language_loss": 0.78100324, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86210281, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.51196289, + "step": 1427, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.06773555, + "auxiliary_loss_mlp": 0.01322231, + "balance_loss_clip": 0.06351949, + "balance_loss_mlp": 0.01272831, + "epoch": 0.08585600480986022, + "flos": 16038694995840.0, + "grad_norm": 4.670417341284444, + "language_loss": 0.84344131, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92439914, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 4.21679688, + "router_z_loss_mlp": 0.49414062, + "step": 1428, + "time_per_iteration": 2.5323448181152344 + }, + { + "auxiliary_loss_clip": 0.06797348, + "auxiliary_loss_mlp": 0.01327926, + "balance_loss_clip": 0.06359121, + "balance_loss_mlp": 0.01274211, + "epoch": 0.08591612806252819, + "flos": 28664920137600.0, + "grad_norm": 3.8563401660428136, + "language_loss": 0.82438064, + "learning_rate": 3.967292444736023e-06, + "loss": 0.90563333, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.53710938, + "step": 1429, + "time_per_iteration": 2.6729156970977783 + }, + { + "auxiliary_loss_clip": 0.06787296, + "auxiliary_loss_mlp": 0.01320421, + "balance_loss_clip": 0.06368907, + "balance_loss_mlp": 0.0127586, + "epoch": 0.08597625131519615, + "flos": 20965349583360.0, + "grad_norm": 2.123464733030403, + "language_loss": 0.90146309, + "learning_rate": 3.967222260955578e-06, + "loss": 0.98254025, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.06773631, + "auxiliary_loss_mlp": 0.01318779, + "balance_loss_clip": 0.06357691, + "balance_loss_mlp": 0.01274552, + "epoch": 0.08603637456786412, + "flos": 23262747229440.0, + "grad_norm": 2.0722520617005924, + "language_loss": 0.84170914, + "learning_rate": 3.96715200257787e-06, + "loss": 0.92263317, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44213867, + "step": 1431, + "time_per_iteration": 2.5954349040985107 + }, + { + "auxiliary_loss_clip": 0.06773046, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 0.06352717, + "balance_loss_mlp": 0.01270858, + "epoch": 0.0860964978205321, + "flos": 28701704880000.0, + "grad_norm": 5.769747909175534, + "language_loss": 0.79544812, + "learning_rate": 3.967081669605559e-06, + "loss": 0.87635088, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.46362305, + "step": 1432, + "time_per_iteration": 2.6024515628814697 + }, + { + "auxiliary_loss_clip": 0.06771973, + "auxiliary_loss_mlp": 0.01314171, + "balance_loss_clip": 0.06355675, + "balance_loss_mlp": 0.01269325, + "epoch": 0.08615662107320006, + "flos": 19324542931200.0, + "grad_norm": 3.3903634053002336, + "language_loss": 0.75487757, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83573902, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44848633, + "step": 1433, + "time_per_iteration": 2.5895845890045166 + }, + { + "auxiliary_loss_clip": 0.06795658, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 0.0636312, + "balance_loss_mlp": 0.01272313, + "epoch": 0.08621674432586802, + "flos": 15857161125120.0, + "grad_norm": 4.641351982999466, + "language_loss": 0.88055921, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.96174198, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.50268555, + "step": 1434, + "time_per_iteration": 2.5355098247528076 + }, + { + "auxiliary_loss_clip": 0.06779063, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 0.06353655, + "balance_loss_mlp": 0.01263803, + "epoch": 0.086276867578536, + "flos": 14105874464640.0, + "grad_norm": 4.793331202343017, + "language_loss": 0.80184627, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88275254, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 4.25195312, + "router_z_loss_mlp": 0.4777832, + "step": 1435, + "time_per_iteration": 2.57381272315979 + }, + { + "auxiliary_loss_clip": 0.06627634, + "auxiliary_loss_mlp": 0.01282391, + "balance_loss_clip": 0.06350996, + "balance_loss_mlp": 0.01255616, + "epoch": 0.08633699083120397, + "flos": 70206500142720.0, + "grad_norm": 0.941958531658993, + "language_loss": 0.58419931, + "learning_rate": 3.96679959182369e-06, + "loss": 0.66329956, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.26831055, + "step": 1436, + "time_per_iteration": 3.282787561416626 + }, + { + "auxiliary_loss_clip": 0.06781173, + "auxiliary_loss_mlp": 0.01309156, + "balance_loss_clip": 0.06351152, + "balance_loss_mlp": 0.01261949, + "epoch": 0.08639711408387193, + "flos": 30306565330560.0, + "grad_norm": 3.136203943019662, + "language_loss": 0.71995145, + "learning_rate": 3.966728885918437e-06, + "loss": 0.80085474, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.47167969, + "step": 1437, + "time_per_iteration": 4.062320232391357 + }, + { + "auxiliary_loss_clip": 0.06771993, + "auxiliary_loss_mlp": 0.01311453, + "balance_loss_clip": 0.06345055, + "balance_loss_mlp": 0.01262553, + "epoch": 0.08645723733653991, + "flos": 20303014584960.0, + "grad_norm": 2.1552544434513154, + "language_loss": 0.74663305, + "learning_rate": 3.966658105434627e-06, + "loss": 0.82746744, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.48925781, + "step": 1438, + "time_per_iteration": 2.5902743339538574 + }, + { + "auxiliary_loss_clip": 0.06752677, + "auxiliary_loss_mlp": 0.01311557, + "balance_loss_clip": 0.06331892, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08651736058920788, + "flos": 32898911748480.0, + "grad_norm": 2.1102638652127093, + "language_loss": 0.6610049, + "learning_rate": 3.966587250374945e-06, + "loss": 0.7416473, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.48071289, + "step": 1439, + "time_per_iteration": 4.177356719970703 + }, + { + "auxiliary_loss_clip": 0.06767576, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_clip": 0.06342776, + "balance_loss_mlp": 0.01270934, + "epoch": 0.08657748384187584, + "flos": 22643863372800.0, + "grad_norm": 6.195931442958794, + "language_loss": 0.89298683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.97385472, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4831543, + "step": 1440, + "time_per_iteration": 2.5557472705841064 + }, + { + "auxiliary_loss_clip": 0.06781097, + "auxiliary_loss_mlp": 0.01307911, + "balance_loss_clip": 0.06338568, + "balance_loss_mlp": 0.01254028, + "epoch": 0.08663760709454381, + "flos": 23664947627520.0, + "grad_norm": 2.369224573412665, + "language_loss": 0.86471045, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.94560057, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53833008, + "step": 1441, + "time_per_iteration": 2.65085768699646 + }, + { + "auxiliary_loss_clip": 0.06611373, + "auxiliary_loss_mlp": 0.01295436, + "balance_loss_clip": 0.06333591, + "balance_loss_mlp": 0.01268138, + "epoch": 0.08669773034721179, + "flos": 62703823484160.0, + "grad_norm": 0.803695610307685, + "language_loss": 0.60671109, + "learning_rate": 3.966374237767545e-06, + "loss": 0.68577921, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.27368164, + "step": 1442, + "time_per_iteration": 4.761855125427246 + }, + { + "auxiliary_loss_clip": 0.0676527, + "auxiliary_loss_mlp": 0.0130763, + "balance_loss_clip": 0.06333362, + "balance_loss_mlp": 0.0125885, + "epoch": 0.08675785359987975, + "flos": 20673713047680.0, + "grad_norm": 2.753695330350272, + "language_loss": 0.81546146, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8961904, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.48803711, + "step": 1443, + "time_per_iteration": 2.581735134124756 + }, + { + "auxiliary_loss_clip": 0.06751874, + "auxiliary_loss_mlp": 0.01309584, + "balance_loss_clip": 0.06329648, + "balance_loss_mlp": 0.01264404, + "epoch": 0.08681797685254772, + "flos": 26948070305280.0, + "grad_norm": 2.052695672066824, + "language_loss": 0.83898687, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91960144, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.45166016, + "step": 1444, + "time_per_iteration": 4.03491473197937 + }, + { + "auxiliary_loss_clip": 0.06771353, + "auxiliary_loss_mlp": 0.01313762, + "balance_loss_clip": 0.063327, + "balance_loss_mlp": 0.01263408, + "epoch": 0.0868781001052157, + "flos": 17718676231680.0, + "grad_norm": 2.3029002758170236, + "language_loss": 0.89515543, + "learning_rate": 3.966160554074189e-06, + "loss": 0.97600663, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.50341797, + "step": 1445, + "time_per_iteration": 2.53659987449646 + }, + { + "auxiliary_loss_clip": 0.06757164, + "auxiliary_loss_mlp": 0.01319102, + "balance_loss_clip": 0.0633342, + "balance_loss_mlp": 0.01269916, + "epoch": 0.08693822335788366, + "flos": 19901820435840.0, + "grad_norm": 2.912516601595955, + "language_loss": 0.84297967, + "learning_rate": 3.96608917705879e-06, + "loss": 0.92374229, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.49169922, + "step": 1446, + "time_per_iteration": 2.5991437435150146 + }, + { + "auxiliary_loss_clip": 0.06602339, + "auxiliary_loss_mlp": 0.01278086, + "balance_loss_clip": 0.06327674, + "balance_loss_mlp": 0.01252623, + "epoch": 0.08699834661055163, + "flos": 67040957871360.0, + "grad_norm": 0.7332106315857324, + "language_loss": 0.54912937, + "learning_rate": 3.966017725489091e-06, + "loss": 0.62793368, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25488281, + "step": 1447, + "time_per_iteration": 3.2708306312561035 + }, + { + "auxiliary_loss_clip": 0.06739033, + "auxiliary_loss_mlp": 0.01328667, + "balance_loss_clip": 0.06324905, + "balance_loss_mlp": 0.01282223, + "epoch": 0.0870584698632196, + "flos": 13485648942720.0, + "grad_norm": 3.073032874929238, + "language_loss": 0.86241722, + "learning_rate": 3.965946199367804e-06, + "loss": 0.94309419, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.46508789, + "step": 1448, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.067637, + "auxiliary_loss_mlp": 0.01323636, + "balance_loss_clip": 0.06333195, + "balance_loss_mlp": 0.01275666, + "epoch": 0.08711859311588757, + "flos": 16112516042880.0, + "grad_norm": 5.523495984670142, + "language_loss": 0.81949937, + "learning_rate": 3.965874598697638e-06, + "loss": 0.90037274, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.47949219, + "step": 1449, + "time_per_iteration": 2.57389760017395 + }, + { + "auxiliary_loss_clip": 0.06749628, + "auxiliary_loss_mlp": 0.01305238, + "balance_loss_clip": 0.06335508, + "balance_loss_mlp": 0.01262227, + "epoch": 0.08717871636855554, + "flos": 38481528424320.0, + "grad_norm": 2.3810554922577354, + "language_loss": 0.73064238, + "learning_rate": 3.965802923481313e-06, + "loss": 0.81119096, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43017578, + "step": 1450, + "time_per_iteration": 2.7252304553985596 + }, + { + "auxiliary_loss_clip": 0.06761701, + "auxiliary_loss_mlp": 0.01323911, + "balance_loss_clip": 0.06337759, + "balance_loss_mlp": 0.01275416, + "epoch": 0.0872388396212235, + "flos": 17605932163200.0, + "grad_norm": 2.1112425767796474, + "language_loss": 0.85553432, + "learning_rate": 3.965731173721542e-06, + "loss": 0.9363904, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 4.24414062, + "router_z_loss_mlp": 0.48486328, + "step": 1451, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.06751224, + "auxiliary_loss_mlp": 0.01307951, + "balance_loss_clip": 0.06344092, + "balance_loss_mlp": 0.01266395, + "epoch": 0.08729896287389148, + "flos": 25265489592960.0, + "grad_norm": 2.067410826923288, + "language_loss": 0.76721281, + "learning_rate": 3.965659349421049e-06, + "loss": 0.84780455, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.41577148, + "step": 1452, + "time_per_iteration": 2.5980234146118164 + }, + { + "auxiliary_loss_clip": 0.06767467, + "auxiliary_loss_mlp": 0.01321022, + "balance_loss_clip": 0.06343699, + "balance_loss_mlp": 0.01272623, + "epoch": 0.08735908612655945, + "flos": 15637836263040.0, + "grad_norm": 4.836985480100509, + "language_loss": 0.8246457, + "learning_rate": 3.965587450582556e-06, + "loss": 0.90553057, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.48364258, + "step": 1453, + "time_per_iteration": 2.5459630489349365 + }, + { + "auxiliary_loss_clip": 0.06754768, + "auxiliary_loss_mlp": 0.0129928, + "balance_loss_clip": 0.06342497, + "balance_loss_mlp": 0.0125646, + "epoch": 0.08741920937922741, + "flos": 20345920675200.0, + "grad_norm": 3.0656217118084, + "language_loss": 0.72998244, + "learning_rate": 3.96551547720879e-06, + "loss": 0.81052291, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.42822266, + "step": 1454, + "time_per_iteration": 2.551548957824707 + }, + { + "auxiliary_loss_clip": 0.0662789, + "auxiliary_loss_mlp": 0.01303999, + "balance_loss_clip": 0.06353966, + "balance_loss_mlp": 0.01280789, + "epoch": 0.08747933263189539, + "flos": 62841052944000.0, + "grad_norm": 0.7529223255178736, + "language_loss": 0.58298737, + "learning_rate": 3.96544342930248e-06, + "loss": 0.66230631, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.23181152, + "step": 1455, + "time_per_iteration": 3.2130184173583984 + }, + { + "auxiliary_loss_clip": 0.06774339, + "auxiliary_loss_mlp": 0.01313917, + "balance_loss_clip": 0.06350334, + "balance_loss_mlp": 0.01265303, + "epoch": 0.08753945588456336, + "flos": 33044122074240.0, + "grad_norm": 1.7776650768799964, + "language_loss": 0.79278296, + "learning_rate": 3.965371306866359e-06, + "loss": 0.87366557, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 4.23632812, + "router_z_loss_mlp": 0.4855957, + "step": 1456, + "time_per_iteration": 2.6745898723602295 + }, + { + "auxiliary_loss_clip": 0.06785175, + "auxiliary_loss_mlp": 0.01319613, + "balance_loss_clip": 0.06356893, + "balance_loss_mlp": 0.01271881, + "epoch": 0.08759957913723132, + "flos": 35554807088640.0, + "grad_norm": 2.255439619282858, + "language_loss": 0.74143755, + "learning_rate": 3.96529910990316e-06, + "loss": 0.82248545, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.47753906, + "step": 1457, + "time_per_iteration": 2.6837821006774902 + }, + { + "auxiliary_loss_clip": 0.06763137, + "auxiliary_loss_mlp": 0.01308035, + "balance_loss_clip": 0.06348729, + "balance_loss_mlp": 0.01264738, + "epoch": 0.0876597023898993, + "flos": 23917283798400.0, + "grad_norm": 1.7808177247023305, + "language_loss": 0.88680792, + "learning_rate": 3.965226838415622e-06, + "loss": 0.96751964, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.43261719, + "step": 1458, + "time_per_iteration": 2.5912857055664062 + }, + { + "auxiliary_loss_clip": 0.0677645, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06355318, + "balance_loss_mlp": 0.01268151, + "epoch": 0.08771982564256726, + "flos": 18119912307840.0, + "grad_norm": 3.1042726617035297, + "language_loss": 0.82429975, + "learning_rate": 3.965154492406486e-06, + "loss": 0.90519613, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.45043945, + "step": 1459, + "time_per_iteration": 2.5870959758758545 + }, + { + "auxiliary_loss_clip": 0.0679104, + "auxiliary_loss_mlp": 0.01327895, + "balance_loss_clip": 0.06355593, + "balance_loss_mlp": 0.01275062, + "epoch": 0.08777994889523523, + "flos": 17717711909760.0, + "grad_norm": 7.236455309064537, + "language_loss": 0.8621763, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.94336569, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.52856445, + "step": 1460, + "time_per_iteration": 2.574669361114502 + }, + { + "auxiliary_loss_clip": 0.06771254, + "auxiliary_loss_mlp": 0.01315799, + "balance_loss_clip": 0.06352662, + "balance_loss_mlp": 0.01271215, + "epoch": 0.0878400721479032, + "flos": 12824320193280.0, + "grad_norm": 3.2811276479841847, + "language_loss": 0.83160508, + "learning_rate": 3.965009576834394e-06, + "loss": 0.91247559, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.44580078, + "step": 1461, + "time_per_iteration": 2.575343608856201 + }, + { + "auxiliary_loss_clip": 0.06765963, + "auxiliary_loss_mlp": 0.01303985, + "balance_loss_clip": 0.06350134, + "balance_loss_mlp": 0.01261094, + "epoch": 0.08790019540057117, + "flos": 26399359843200.0, + "grad_norm": 3.960130795636661, + "language_loss": 0.77723432, + "learning_rate": 3.964937007276932e-06, + "loss": 0.85793376, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.42895508, + "step": 1462, + "time_per_iteration": 2.6177735328674316 + }, + { + "auxiliary_loss_clip": 0.06788168, + "auxiliary_loss_mlp": 0.01309058, + "balance_loss_clip": 0.06352487, + "balance_loss_mlp": 0.01258371, + "epoch": 0.08796031865323914, + "flos": 19139822605440.0, + "grad_norm": 5.369695457360621, + "language_loss": 0.76475191, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.84572417, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.50732422, + "step": 1463, + "time_per_iteration": 2.532130241394043 + }, + { + "auxiliary_loss_clip": 0.06770946, + "auxiliary_loss_mlp": 0.01316317, + "balance_loss_clip": 0.06331752, + "balance_loss_mlp": 0.01261218, + "epoch": 0.0880204419059071, + "flos": 26070896638080.0, + "grad_norm": 3.6430076592813427, + "language_loss": 0.85532415, + "learning_rate": 3.964791644632941e-06, + "loss": 0.9361968, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.55126953, + "step": 1464, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.06766248, + "auxiliary_loss_mlp": 0.01314801, + "balance_loss_clip": 0.06340823, + "balance_loss_mlp": 0.01264948, + "epoch": 0.08808056515857508, + "flos": 22383602991360.0, + "grad_norm": 2.6056498019463774, + "language_loss": 0.80711126, + "learning_rate": 3.964718851551923e-06, + "loss": 0.88792181, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4987793, + "step": 1465, + "time_per_iteration": 2.555612325668335 + }, + { + "auxiliary_loss_clip": 0.06765096, + "auxiliary_loss_mlp": 0.0132391, + "balance_loss_clip": 0.06346563, + "balance_loss_mlp": 0.01275654, + "epoch": 0.08814068841124305, + "flos": 23191986856320.0, + "grad_norm": 5.208613872763048, + "language_loss": 0.8713969, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.95228696, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.48266602, + "step": 1466, + "time_per_iteration": 2.5865933895111084 + }, + { + "auxiliary_loss_clip": 0.067513, + "auxiliary_loss_mlp": 0.01319742, + "balance_loss_clip": 0.06332761, + "balance_loss_mlp": 0.01270842, + "epoch": 0.08820081166391101, + "flos": 25162262962560.0, + "grad_norm": 2.171865464101356, + "language_loss": 0.85806906, + "learning_rate": 3.964573041885641e-06, + "loss": 0.93877947, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.48852539, + "step": 1467, + "time_per_iteration": 2.5861306190490723 + }, + { + "auxiliary_loss_clip": 0.06751268, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 0.06337693, + "balance_loss_mlp": 0.01262654, + "epoch": 0.08826093491657899, + "flos": 22237386416640.0, + "grad_norm": 2.29409858909566, + "language_loss": 0.78131318, + "learning_rate": 3.964500025305907e-06, + "loss": 0.86192572, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 1468, + "time_per_iteration": 2.5800206661224365 + }, + { + "auxiliary_loss_clip": 0.06742708, + "auxiliary_loss_mlp": 0.01311969, + "balance_loss_clip": 0.06332668, + "balance_loss_mlp": 0.01265501, + "epoch": 0.08832105816924696, + "flos": 22133279318400.0, + "grad_norm": 1.8356690071746322, + "language_loss": 0.82406783, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.90461457, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.46459961, + "step": 1469, + "time_per_iteration": 2.5584611892700195 + }, + { + "auxiliary_loss_clip": 0.06744162, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 0.06327502, + "balance_loss_mlp": 0.01264739, + "epoch": 0.08838118142191492, + "flos": 17572250021760.0, + "grad_norm": 2.2192924058432615, + "language_loss": 0.79711461, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.877689, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.48535156, + "step": 1470, + "time_per_iteration": 2.5447630882263184 + }, + { + "auxiliary_loss_clip": 0.06739189, + "auxiliary_loss_mlp": 0.01312164, + "balance_loss_clip": 0.06326798, + "balance_loss_mlp": 0.0126274, + "epoch": 0.0884413046745829, + "flos": 20783480296320.0, + "grad_norm": 2.030528760335608, + "language_loss": 0.86272311, + "learning_rate": 3.964280528613569e-06, + "loss": 0.94323671, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.49511719, + "step": 1471, + "time_per_iteration": 2.7219297885894775 + }, + { + "auxiliary_loss_clip": 0.06719133, + "auxiliary_loss_mlp": 0.01304039, + "balance_loss_clip": 0.06321308, + "balance_loss_mlp": 0.01263222, + "epoch": 0.08850142792725087, + "flos": 22131686090880.0, + "grad_norm": 5.945068157557599, + "language_loss": 0.85369575, + "learning_rate": 3.964207214074324e-06, + "loss": 0.93392742, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40820312, + "step": 1472, + "time_per_iteration": 2.6007394790649414 + }, + { + "auxiliary_loss_clip": 0.06741676, + "auxiliary_loss_mlp": 0.01307162, + "balance_loss_clip": 0.06323978, + "balance_loss_mlp": 0.01258811, + "epoch": 0.08856155117991883, + "flos": 22425251270400.0, + "grad_norm": 4.024487815181785, + "language_loss": 0.85227764, + "learning_rate": 3.964133825052146e-06, + "loss": 0.93276608, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.48388672, + "step": 1473, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.06745915, + "auxiliary_loss_mlp": 0.01303107, + "balance_loss_clip": 0.0632661, + "balance_loss_mlp": 0.01257998, + "epoch": 0.0886216744325868, + "flos": 29945132743680.0, + "grad_norm": 1.5926466073589443, + "language_loss": 0.80301654, + "learning_rate": 3.964060361549816e-06, + "loss": 0.88350677, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.45092773, + "step": 1474, + "time_per_iteration": 2.74392032623291 + }, + { + "auxiliary_loss_clip": 0.0673038, + "auxiliary_loss_mlp": 0.01308218, + "balance_loss_clip": 0.06324204, + "balance_loss_mlp": 0.01263062, + "epoch": 0.08868179768525478, + "flos": 23988798858240.0, + "grad_norm": 2.028999420252469, + "language_loss": 0.80928683, + "learning_rate": 3.963986823570121e-06, + "loss": 0.88967282, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.45166016, + "step": 1475, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.06742392, + "auxiliary_loss_mlp": 0.01303332, + "balance_loss_clip": 0.06327485, + "balance_loss_mlp": 0.01256387, + "epoch": 0.08874192093792274, + "flos": 43187264922240.0, + "grad_norm": 1.8785525854248355, + "language_loss": 0.76261604, + "learning_rate": 3.963913211115848e-06, + "loss": 0.84307337, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.46972656, + "step": 1476, + "time_per_iteration": 4.163857460021973 + }, + { + "auxiliary_loss_clip": 0.06743093, + "auxiliary_loss_mlp": 0.01308468, + "balance_loss_clip": 0.06333718, + "balance_loss_mlp": 0.01262405, + "epoch": 0.0888020441905907, + "flos": 32860491851520.0, + "grad_norm": 1.6890231836232912, + "language_loss": 0.76270819, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.84322381, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.46069336, + "step": 1477, + "time_per_iteration": 2.6772334575653076 + }, + { + "auxiliary_loss_clip": 0.06751049, + "auxiliary_loss_mlp": 0.01308123, + "balance_loss_clip": 0.06334269, + "balance_loss_mlp": 0.01263468, + "epoch": 0.08886216744325869, + "flos": 23156124508800.0, + "grad_norm": 2.600680931100332, + "language_loss": 0.88817739, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96876919, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 4.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 1478, + "time_per_iteration": 4.08270525932312 + }, + { + "auxiliary_loss_clip": 0.0675, + "auxiliary_loss_mlp": 0.01309174, + "balance_loss_clip": 0.06336476, + "balance_loss_mlp": 0.01263803, + "epoch": 0.08892229069592665, + "flos": 23338371139200.0, + "grad_norm": 1.8272738608530537, + "language_loss": 0.79003656, + "learning_rate": 3.963691926933495e-06, + "loss": 0.87062836, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.45361328, + "step": 1479, + "time_per_iteration": 2.5917623043060303 + }, + { + "auxiliary_loss_clip": 0.06747445, + "auxiliary_loss_mlp": 0.01303872, + "balance_loss_clip": 0.06333964, + "balance_loss_mlp": 0.01256665, + "epoch": 0.08898241394859462, + "flos": 26221012427520.0, + "grad_norm": 4.931621721483509, + "language_loss": 0.80906087, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.88957405, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 1480, + "time_per_iteration": 2.6102962493896484 + }, + { + "auxiliary_loss_clip": 0.06771734, + "auxiliary_loss_mlp": 0.01331796, + "balance_loss_clip": 0.06338413, + "balance_loss_mlp": 0.01278986, + "epoch": 0.0890425372012626, + "flos": 23557444439040.0, + "grad_norm": 2.1143063599710135, + "language_loss": 0.68804622, + "learning_rate": 3.963544031823624e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.52807617, + "step": 1481, + "time_per_iteration": 4.085212707519531 + }, + { + "auxiliary_loss_clip": 0.06743339, + "auxiliary_loss_mlp": 0.01307322, + "balance_loss_clip": 0.06335256, + "balance_loss_mlp": 0.01264358, + "epoch": 0.08910266045393056, + "flos": 23009446736640.0, + "grad_norm": 2.5169726563525234, + "language_loss": 0.99559236, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.07609892, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.42993164, + "step": 1482, + "time_per_iteration": 2.564034938812256 + }, + { + "auxiliary_loss_clip": 0.06760907, + "auxiliary_loss_mlp": 0.0131259, + "balance_loss_clip": 0.06338564, + "balance_loss_mlp": 0.01264024, + "epoch": 0.08916278370659853, + "flos": 31943766257280.0, + "grad_norm": 3.2036096398767993, + "language_loss": 0.81227845, + "learning_rate": 3.96339583888261e-06, + "loss": 0.89301342, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 4.22460938, + "router_z_loss_mlp": 0.48535156, + "step": 1483, + "time_per_iteration": 4.063607215881348 + }, + { + "auxiliary_loss_clip": 0.06743906, + "auxiliary_loss_mlp": 0.01316489, + "balance_loss_clip": 0.06329283, + "balance_loss_mlp": 0.01268519, + "epoch": 0.08922290695926649, + "flos": 17536219966080.0, + "grad_norm": 10.926297293099243, + "language_loss": 0.87554848, + "learning_rate": 3.963321630732448e-06, + "loss": 0.95615244, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.47998047, + "step": 1484, + "time_per_iteration": 2.5457398891448975 + }, + { + "auxiliary_loss_clip": 0.06757183, + "auxiliary_loss_mlp": 0.01321525, + "balance_loss_clip": 0.06330685, + "balance_loss_mlp": 0.01272315, + "epoch": 0.08928303021193447, + "flos": 32133392046720.0, + "grad_norm": 2.337720635500538, + "language_loss": 0.82324612, + "learning_rate": 3.963247348132932e-06, + "loss": 0.90403324, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.49267578, + "step": 1485, + "time_per_iteration": 2.6794724464416504 + }, + { + "auxiliary_loss_clip": 0.06736165, + "auxiliary_loss_mlp": 0.01302402, + "balance_loss_clip": 0.06326707, + "balance_loss_mlp": 0.01256125, + "epoch": 0.08934315346460243, + "flos": 22131392601600.0, + "grad_norm": 3.158284640334893, + "language_loss": 0.84766626, + "learning_rate": 3.96317299108688e-06, + "loss": 0.92805195, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.46264648, + "step": 1486, + "time_per_iteration": 2.5732409954071045 + }, + { + "auxiliary_loss_clip": 0.06736217, + "auxiliary_loss_mlp": 0.0130934, + "balance_loss_clip": 0.06328043, + "balance_loss_mlp": 0.01267569, + "epoch": 0.0894032767172704, + "flos": 22572264458880.0, + "grad_norm": 1.7672180345851645, + "language_loss": 0.78605509, + "learning_rate": 3.963098559597111e-06, + "loss": 0.86651075, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.41748047, + "step": 1487, + "time_per_iteration": 2.5952718257904053 + }, + { + "auxiliary_loss_clip": 0.06736919, + "auxiliary_loss_mlp": 0.01308401, + "balance_loss_clip": 0.06326038, + "balance_loss_mlp": 0.0126353, + "epoch": 0.08946339996993838, + "flos": 20199578319360.0, + "grad_norm": 4.25204894574284, + "language_loss": 0.85387635, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93432951, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.44873047, + "step": 1488, + "time_per_iteration": 2.5534958839416504 + }, + { + "auxiliary_loss_clip": 0.06725559, + "auxiliary_loss_mlp": 0.01303445, + "balance_loss_clip": 0.06320536, + "balance_loss_mlp": 0.01259838, + "epoch": 0.08952352322260634, + "flos": 48371035363200.0, + "grad_norm": 2.4620081078023173, + "language_loss": 0.74370039, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82399046, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.43554688, + "step": 1489, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.06736162, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06324734, + "balance_loss_mlp": 0.01264092, + "epoch": 0.08958364647527431, + "flos": 31800736137600.0, + "grad_norm": 2.6258968543660584, + "language_loss": 0.91654348, + "learning_rate": 3.962874818493745e-06, + "loss": 0.99698538, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.43945312, + "step": 1490, + "time_per_iteration": 2.619051456451416 + }, + { + "auxiliary_loss_clip": 0.06748827, + "auxiliary_loss_mlp": 0.01303631, + "balance_loss_clip": 0.06332797, + "balance_loss_mlp": 0.01258737, + "epoch": 0.08964376972794229, + "flos": 23374988173440.0, + "grad_norm": 2.6637397886572076, + "language_loss": 0.76370478, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.84422934, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.44897461, + "step": 1491, + "time_per_iteration": 2.590679407119751 + }, + { + "auxiliary_loss_clip": 0.06728335, + "auxiliary_loss_mlp": 0.01302455, + "balance_loss_clip": 0.06325481, + "balance_loss_mlp": 0.01261804, + "epoch": 0.08970389298061025, + "flos": 23301502542720.0, + "grad_norm": 1.853626118240874, + "language_loss": 0.78431886, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86462677, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.40673828, + "step": 1492, + "time_per_iteration": 2.5715339183807373 + }, + { + "auxiliary_loss_clip": 0.06729841, + "auxiliary_loss_mlp": 0.01304764, + "balance_loss_clip": 0.06324601, + "balance_loss_mlp": 0.01260298, + "epoch": 0.08976401623327822, + "flos": 33769419016320.0, + "grad_norm": 3.870321699477457, + "language_loss": 0.73167109, + "learning_rate": 3.962650407498707e-06, + "loss": 0.81201714, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.44433594, + "step": 1493, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.0673489, + "auxiliary_loss_mlp": 0.01306407, + "balance_loss_clip": 0.06327641, + "balance_loss_mlp": 0.01259987, + "epoch": 0.08982413948594618, + "flos": 23917535360640.0, + "grad_norm": 1.970514386565943, + "language_loss": 0.88832223, + "learning_rate": 3.962575454982109e-06, + "loss": 0.96873516, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 4.07617188, + "router_z_loss_mlp": 0.46435547, + "step": 1494, + "time_per_iteration": 2.58363676071167 + }, + { + "auxiliary_loss_clip": 0.06728575, + "auxiliary_loss_mlp": 0.01309753, + "balance_loss_clip": 0.06328882, + "balance_loss_mlp": 0.01267792, + "epoch": 0.08988426273861416, + "flos": 16843305427200.0, + "grad_norm": 4.2307100076147774, + "language_loss": 0.84796005, + "learning_rate": 3.962500428044454e-06, + "loss": 0.92834336, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.41967773, + "step": 1495, + "time_per_iteration": 2.5592563152313232 + }, + { + "auxiliary_loss_clip": 0.06737964, + "auxiliary_loss_mlp": 0.01307798, + "balance_loss_clip": 0.06329042, + "balance_loss_mlp": 0.01263476, + "epoch": 0.08994438599128213, + "flos": 14798621295360.0, + "grad_norm": 2.6872032858380885, + "language_loss": 0.72458923, + "learning_rate": 3.962425326688585e-06, + "loss": 0.80504692, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.44287109, + "step": 1496, + "time_per_iteration": 2.527702569961548 + }, + { + "auxiliary_loss_clip": 0.06731858, + "auxiliary_loss_mlp": 0.01301643, + "balance_loss_clip": 0.06328158, + "balance_loss_mlp": 0.01259038, + "epoch": 0.09000450924395009, + "flos": 17390087245440.0, + "grad_norm": 1.9873412980644265, + "language_loss": 0.82173735, + "learning_rate": 3.962350150917351e-06, + "loss": 0.90207237, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.42578125, + "step": 1497, + "time_per_iteration": 2.5877413749694824 + }, + { + "auxiliary_loss_clip": 0.06743819, + "auxiliary_loss_mlp": 0.01303103, + "balance_loss_clip": 0.06327296, + "balance_loss_mlp": 0.01257064, + "epoch": 0.09006463249661807, + "flos": 24287269501440.0, + "grad_norm": 4.64905554567639, + "language_loss": 0.85617393, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.93664312, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.4609375, + "step": 1498, + "time_per_iteration": 2.5904557704925537 + }, + { + "auxiliary_loss_clip": 0.06749868, + "auxiliary_loss_mlp": 0.01309538, + "balance_loss_clip": 0.06334974, + "balance_loss_mlp": 0.01263666, + "epoch": 0.09012475574928604, + "flos": 13666931251200.0, + "grad_norm": 3.85109419291821, + "language_loss": 0.81540704, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89600116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.45849609, + "step": 1499, + "time_per_iteration": 2.5302114486694336 + }, + { + "auxiliary_loss_clip": 0.06728019, + "auxiliary_loss_mlp": 0.01300863, + "balance_loss_clip": 0.06331602, + "balance_loss_mlp": 0.01261142, + "epoch": 0.090184879001954, + "flos": 23333884945920.0, + "grad_norm": 2.0381377997897636, + "language_loss": 0.94349372, + "learning_rate": 3.962124177139981e-06, + "loss": 1.02378249, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.3972168, + "step": 1500, + "time_per_iteration": 2.5795865058898926 + }, + { + "auxiliary_loss_clip": 0.0677222, + "auxiliary_loss_mlp": 0.01314156, + "balance_loss_clip": 0.06350215, + "balance_loss_mlp": 0.01263539, + "epoch": 0.09024500225462198, + "flos": 23009320955520.0, + "grad_norm": 3.436423392701186, + "language_loss": 0.77039468, + "learning_rate": 3.962048703735822e-06, + "loss": 0.8512584, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.50634766, + "step": 1501, + "time_per_iteration": 2.5764503479003906 + }, + { + "auxiliary_loss_clip": 0.06607839, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 0.06328217, + "balance_loss_mlp": 0.01261165, + "epoch": 0.09030512550728995, + "flos": 62208626653440.0, + "grad_norm": 0.7031155649326037, + "language_loss": 0.58089769, + "learning_rate": 3.96197315593058e-06, + "loss": 0.659814, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.22619629, + "step": 1502, + "time_per_iteration": 3.1644375324249268 + }, + { + "auxiliary_loss_clip": 0.06763642, + "auxiliary_loss_mlp": 0.01313188, + "balance_loss_clip": 0.06354539, + "balance_loss_mlp": 0.01269653, + "epoch": 0.09036524875995791, + "flos": 38809907775360.0, + "grad_norm": 3.4086152145479427, + "language_loss": 0.72101718, + "learning_rate": 3.961897533727119e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.43579102, + "step": 1503, + "time_per_iteration": 2.724386215209961 + }, + { + "auxiliary_loss_clip": 0.06781425, + "auxiliary_loss_mlp": 0.01307874, + "balance_loss_clip": 0.06363953, + "balance_loss_mlp": 0.01263075, + "epoch": 0.09042537201262588, + "flos": 21696642092160.0, + "grad_norm": 2.1842796361034793, + "language_loss": 0.881266, + "learning_rate": 3.961821837128306e-06, + "loss": 0.96215898, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 1504, + "time_per_iteration": 2.5873734951019287 + }, + { + "auxiliary_loss_clip": 0.06790902, + "auxiliary_loss_mlp": 0.01331983, + "balance_loss_clip": 0.06361797, + "balance_loss_mlp": 0.01280795, + "epoch": 0.09048549526529386, + "flos": 22272536004480.0, + "grad_norm": 3.0474410186464427, + "language_loss": 0.75017542, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83140427, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.51171875, + "step": 1505, + "time_per_iteration": 2.542175054550171 + }, + { + "auxiliary_loss_clip": 0.06765792, + "auxiliary_loss_mlp": 0.0131069, + "balance_loss_clip": 0.06354111, + "balance_loss_mlp": 0.01263936, + "epoch": 0.09054561851796182, + "flos": 14616165029760.0, + "grad_norm": 3.6481054719455166, + "language_loss": 0.83357459, + "learning_rate": 3.961670220756114e-06, + "loss": 0.91433942, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.46777344, + "step": 1506, + "time_per_iteration": 2.5811927318573 + }, + { + "auxiliary_loss_clip": 0.06768796, + "auxiliary_loss_mlp": 0.01305475, + "balance_loss_clip": 0.06366544, + "balance_loss_mlp": 0.01262584, + "epoch": 0.09060574177062979, + "flos": 27643542393600.0, + "grad_norm": 2.7002549048976388, + "language_loss": 0.78016138, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8609041, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42871094, + "step": 1507, + "time_per_iteration": 2.6117966175079346 + }, + { + "auxiliary_loss_clip": 0.06588461, + "auxiliary_loss_mlp": 0.01287299, + "balance_loss_clip": 0.06317182, + "balance_loss_mlp": 0.01264351, + "epoch": 0.09066586502329776, + "flos": 66104637621120.0, + "grad_norm": 0.7149959192610794, + "language_loss": 0.57417059, + "learning_rate": 3.961518306836998e-06, + "loss": 0.65292823, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.22924805, + "step": 1508, + "time_per_iteration": 3.055577516555786 + }, + { + "auxiliary_loss_clip": 0.06765939, + "auxiliary_loss_mlp": 0.01315934, + "balance_loss_clip": 0.06356797, + "balance_loss_mlp": 0.01271135, + "epoch": 0.09072598827596573, + "flos": 18922426387200.0, + "grad_norm": 2.757411639882116, + "language_loss": 0.87097013, + "learning_rate": 3.961442238304543e-06, + "loss": 0.95178884, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.44775391, + "step": 1509, + "time_per_iteration": 2.5325253009796143 + }, + { + "auxiliary_loss_clip": 0.06796411, + "auxiliary_loss_mlp": 0.01325092, + "balance_loss_clip": 0.06366567, + "balance_loss_mlp": 0.01275358, + "epoch": 0.0907861115286337, + "flos": 24827804190720.0, + "grad_norm": 3.0354649762753896, + "language_loss": 0.86899114, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95020616, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.29492188, + "router_z_loss_mlp": 0.49707031, + "step": 1510, + "time_per_iteration": 2.608421564102173 + }, + { + "auxiliary_loss_clip": 0.06775412, + "auxiliary_loss_mlp": 0.01304282, + "balance_loss_clip": 0.06358128, + "balance_loss_mlp": 0.01260127, + "epoch": 0.09084623478130167, + "flos": 21659270371200.0, + "grad_norm": 2.4633218193770103, + "language_loss": 0.89968181, + "learning_rate": 3.961289878108262e-06, + "loss": 0.98047876, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.44140625, + "step": 1511, + "time_per_iteration": 2.566403388977051 + }, + { + "auxiliary_loss_clip": 0.0674355, + "auxiliary_loss_mlp": 0.01315251, + "balance_loss_clip": 0.06338912, + "balance_loss_mlp": 0.01272121, + "epoch": 0.09090635803396964, + "flos": 27647148119040.0, + "grad_norm": 2.09202487509347, + "language_loss": 0.86417758, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.94476557, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.43164062, + "step": 1512, + "time_per_iteration": 2.665790319442749 + }, + { + "auxiliary_loss_clip": 0.06752454, + "auxiliary_loss_mlp": 0.0130495, + "balance_loss_clip": 0.06350584, + "balance_loss_mlp": 0.01262726, + "epoch": 0.0909664812866376, + "flos": 17673757643520.0, + "grad_norm": 2.5146334197942926, + "language_loss": 0.88217908, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96275318, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42211914, + "step": 1513, + "time_per_iteration": 2.531816244125366 + }, + { + "auxiliary_loss_clip": 0.06760095, + "auxiliary_loss_mlp": 0.01314183, + "balance_loss_clip": 0.06354512, + "balance_loss_mlp": 0.01272078, + "epoch": 0.09102660453930557, + "flos": 23958261244800.0, + "grad_norm": 5.873122305201123, + "language_loss": 0.88520277, + "learning_rate": 3.961060780028764e-06, + "loss": 0.9659456, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.42138672, + "step": 1514, + "time_per_iteration": 2.609802722930908 + }, + { + "auxiliary_loss_clip": 0.06748682, + "auxiliary_loss_mlp": 0.01305229, + "balance_loss_clip": 0.06345841, + "balance_loss_mlp": 0.01266104, + "epoch": 0.09108672779197355, + "flos": 25820195621760.0, + "grad_norm": 1.9733366853077507, + "language_loss": 0.91259241, + "learning_rate": 3.960984265271159e-06, + "loss": 0.99313152, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.39111328, + "step": 1515, + "time_per_iteration": 2.626183271408081 + }, + { + "auxiliary_loss_clip": 0.06753635, + "auxiliary_loss_mlp": 0.01307479, + "balance_loss_clip": 0.06346089, + "balance_loss_mlp": 0.01264754, + "epoch": 0.09114685104464151, + "flos": 29646620173440.0, + "grad_norm": 2.1883056599674195, + "language_loss": 0.87669599, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.9573071, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.42700195, + "step": 1516, + "time_per_iteration": 4.0171709060668945 + }, + { + "auxiliary_loss_clip": 0.06753673, + "auxiliary_loss_mlp": 0.01309986, + "balance_loss_clip": 0.06344739, + "balance_loss_mlp": 0.01267643, + "epoch": 0.09120697429730948, + "flos": 33738084789120.0, + "grad_norm": 1.96049698042547, + "language_loss": 0.82941747, + "learning_rate": 3.960831012676692e-06, + "loss": 0.91005409, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.42285156, + "step": 1517, + "time_per_iteration": 4.134803056716919 + }, + { + "auxiliary_loss_clip": 0.06748644, + "auxiliary_loss_mlp": 0.01313239, + "balance_loss_clip": 0.06338718, + "balance_loss_mlp": 0.0127061, + "epoch": 0.09126709754997746, + "flos": 18406559525760.0, + "grad_norm": 1.9085933618955446, + "language_loss": 0.79150838, + "learning_rate": 3.960754274845642e-06, + "loss": 0.87212718, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.42626953, + "step": 1518, + "time_per_iteration": 2.609239101409912 + }, + { + "auxiliary_loss_clip": 0.06742416, + "auxiliary_loss_mlp": 0.01311508, + "balance_loss_clip": 0.0633543, + "balance_loss_mlp": 0.01267853, + "epoch": 0.09132722080264542, + "flos": 22098674782080.0, + "grad_norm": 1.8265694387954685, + "language_loss": 0.88381147, + "learning_rate": 3.960677462662594e-06, + "loss": 0.9643507, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.43676758, + "step": 1519, + "time_per_iteration": 2.559178590774536 + }, + { + "auxiliary_loss_clip": 0.06749827, + "auxiliary_loss_mlp": 0.01303758, + "balance_loss_clip": 0.06334724, + "balance_loss_mlp": 0.01259507, + "epoch": 0.09138734405531339, + "flos": 21039547973760.0, + "grad_norm": 3.1504469624820497, + "language_loss": 0.75833631, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83887213, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.44238281, + "step": 1520, + "time_per_iteration": 2.5994057655334473 + }, + { + "auxiliary_loss_clip": 0.06753822, + "auxiliary_loss_mlp": 0.0130995, + "balance_loss_clip": 0.06342606, + "balance_loss_mlp": 0.01263912, + "epoch": 0.09144746730798137, + "flos": 20090104560000.0, + "grad_norm": 3.4850769207863648, + "language_loss": 0.8813951, + "learning_rate": 3.960523615252156e-06, + "loss": 0.96203285, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.45996094, + "step": 1521, + "time_per_iteration": 3.9595701694488525 + }, + { + "auxiliary_loss_clip": 0.06768003, + "auxiliary_loss_mlp": 0.0131471, + "balance_loss_clip": 0.06346045, + "balance_loss_mlp": 0.01269864, + "epoch": 0.09150759056064933, + "flos": 22783874745600.0, + "grad_norm": 2.490873911959668, + "language_loss": 0.85374022, + "learning_rate": 3.960446580030599e-06, + "loss": 0.93456733, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.44824219, + "step": 1522, + "time_per_iteration": 4.0201475620269775 + }, + { + "auxiliary_loss_clip": 0.06745256, + "auxiliary_loss_mlp": 0.01307893, + "balance_loss_clip": 0.06349748, + "balance_loss_mlp": 0.01265359, + "epoch": 0.0915677138133173, + "flos": 27571733844480.0, + "grad_norm": 3.0013683058651974, + "language_loss": 0.82841086, + "learning_rate": 3.960369470468711e-06, + "loss": 0.90894234, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42504883, + "step": 1523, + "time_per_iteration": 2.6468050479888916 + }, + { + "auxiliary_loss_clip": 0.0678298, + "auxiliary_loss_mlp": 0.01311185, + "balance_loss_clip": 0.06364655, + "balance_loss_mlp": 0.01265838, + "epoch": 0.09162783706598528, + "flos": 17680340188800.0, + "grad_norm": 4.7132272646544395, + "language_loss": 0.75685203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.83779365, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.45361328, + "step": 1524, + "time_per_iteration": 2.521636962890625 + }, + { + "auxiliary_loss_clip": 0.06770191, + "auxiliary_loss_mlp": 0.01303707, + "balance_loss_clip": 0.06361801, + "balance_loss_mlp": 0.01259814, + "epoch": 0.09168796031865324, + "flos": 18484028225280.0, + "grad_norm": 2.538080589714564, + "language_loss": 0.88912833, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96986729, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43920898, + "step": 1525, + "time_per_iteration": 2.523988962173462 + }, + { + "auxiliary_loss_clip": 0.06788673, + "auxiliary_loss_mlp": 0.01309343, + "balance_loss_clip": 0.06375777, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0917480835713212, + "flos": 29395290251520.0, + "grad_norm": 2.947838768384084, + "language_loss": 0.76479626, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84577644, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 1526, + "time_per_iteration": 2.622180700302124 + }, + { + "auxiliary_loss_clip": 0.06764297, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06361825, + "balance_loss_mlp": 0.01267212, + "epoch": 0.09180820682398917, + "flos": 19835504328960.0, + "grad_norm": 3.217414250452265, + "language_loss": 0.78915322, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86988777, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.41967773, + "step": 1527, + "time_per_iteration": 2.574036121368408 + }, + { + "auxiliary_loss_clip": 0.0678985, + "auxiliary_loss_mlp": 0.0131218, + "balance_loss_clip": 0.0637854, + "balance_loss_mlp": 0.01269146, + "epoch": 0.09186833007665715, + "flos": 23848619777280.0, + "grad_norm": 2.3845621342237284, + "language_loss": 0.81092995, + "learning_rate": 3.959982807656753e-06, + "loss": 0.89195025, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.42993164, + "step": 1528, + "time_per_iteration": 2.55942440032959 + }, + { + "auxiliary_loss_clip": 0.067963, + "auxiliary_loss_mlp": 0.01308536, + "balance_loss_clip": 0.06370017, + "balance_loss_mlp": 0.01259708, + "epoch": 0.09192845332932512, + "flos": 12937693167360.0, + "grad_norm": 3.969055249882827, + "language_loss": 0.79179597, + "learning_rate": 3.959905252114384e-06, + "loss": 0.87284434, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.48828125, + "step": 1529, + "time_per_iteration": 2.559513807296753 + }, + { + "auxiliary_loss_clip": 0.06793401, + "auxiliary_loss_mlp": 0.01313121, + "balance_loss_clip": 0.06376834, + "balance_loss_mlp": 0.01266081, + "epoch": 0.09198857658199308, + "flos": 24574503697920.0, + "grad_norm": 2.3851695624911433, + "language_loss": 0.84393311, + "learning_rate": 3.959827622252211e-06, + "loss": 0.92499834, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.47021484, + "step": 1530, + "time_per_iteration": 2.586825132369995 + }, + { + "auxiliary_loss_clip": 0.06782777, + "auxiliary_loss_mlp": 0.01307988, + "balance_loss_clip": 0.0637871, + "balance_loss_mlp": 0.01264596, + "epoch": 0.09204869983466106, + "flos": 20273231658240.0, + "grad_norm": 2.9699033759595728, + "language_loss": 0.85435712, + "learning_rate": 3.959749918073179e-06, + "loss": 0.93526471, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.43383789, + "step": 1531, + "time_per_iteration": 2.592822313308716 + }, + { + "auxiliary_loss_clip": 0.06784501, + "auxiliary_loss_mlp": 0.01306885, + "balance_loss_clip": 0.06371005, + "balance_loss_mlp": 0.01261967, + "epoch": 0.09210882308732903, + "flos": 20891780098560.0, + "grad_norm": 2.1537883780568907, + "language_loss": 0.82955891, + "learning_rate": 3.959672139580233e-06, + "loss": 0.91047275, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.44897461, + "step": 1532, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.06776289, + "auxiliary_loss_mlp": 0.01303592, + "balance_loss_clip": 0.06368969, + "balance_loss_mlp": 0.01262059, + "epoch": 0.09216894633999699, + "flos": 30964246427520.0, + "grad_norm": 3.2208618489711593, + "language_loss": 0.85266644, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.93346524, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.41552734, + "step": 1533, + "time_per_iteration": 2.640906810760498 + }, + { + "auxiliary_loss_clip": 0.06779255, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 0.06369043, + "balance_loss_mlp": 0.01263369, + "epoch": 0.09222906959266497, + "flos": 13156556832000.0, + "grad_norm": 2.5924628709665987, + "language_loss": 0.91772735, + "learning_rate": 3.959516359664402e-06, + "loss": 0.99859047, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.43652344, + "step": 1534, + "time_per_iteration": 2.5586555004119873 + }, + { + "auxiliary_loss_clip": 0.06771498, + "auxiliary_loss_mlp": 0.01306705, + "balance_loss_clip": 0.06357232, + "balance_loss_mlp": 0.01260142, + "epoch": 0.09228919284533293, + "flos": 26001603711360.0, + "grad_norm": 3.0123317324125694, + "language_loss": 0.77440608, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85518813, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.46557617, + "step": 1535, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.06759383, + "auxiliary_loss_mlp": 0.0131007, + "balance_loss_clip": 0.06362146, + "balance_loss_mlp": 0.012688, + "epoch": 0.0923493160980009, + "flos": 18666694126080.0, + "grad_norm": 2.0947698011843707, + "language_loss": 0.83399653, + "learning_rate": 3.959360282528346e-06, + "loss": 0.91469115, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.41235352, + "step": 1536, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.06743568, + "auxiliary_loss_mlp": 0.01297679, + "balance_loss_clip": 0.06350097, + "balance_loss_mlp": 0.01257767, + "epoch": 0.09240943935066886, + "flos": 21146673818880.0, + "grad_norm": 2.077431495660488, + "language_loss": 0.91567117, + "learning_rate": 3.959282132510131e-06, + "loss": 0.99608374, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39916992, + "step": 1537, + "time_per_iteration": 2.5669217109680176 + }, + { + "auxiliary_loss_clip": 0.06758659, + "auxiliary_loss_mlp": 0.01302061, + "balance_loss_clip": 0.06354217, + "balance_loss_mlp": 0.01258288, + "epoch": 0.09246956260333684, + "flos": 20598298773120.0, + "grad_norm": 2.764633424079652, + "language_loss": 0.82388502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.9044922, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.43774414, + "step": 1538, + "time_per_iteration": 2.5693938732147217 + }, + { + "auxiliary_loss_clip": 0.06616426, + "auxiliary_loss_mlp": 0.01331188, + "balance_loss_clip": 0.06353034, + "balance_loss_mlp": 0.01300217, + "epoch": 0.09252968585600481, + "flos": 67580052312960.0, + "grad_norm": 0.7302597602699774, + "language_loss": 0.57435596, + "learning_rate": 3.959125609588142e-06, + "loss": 0.65383208, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30932617, + "step": 1539, + "time_per_iteration": 3.310535430908203 + }, + { + "auxiliary_loss_clip": 0.06755982, + "auxiliary_loss_mlp": 0.01299614, + "balance_loss_clip": 0.06351999, + "balance_loss_mlp": 0.01256174, + "epoch": 0.09258980910867277, + "flos": 17389542193920.0, + "grad_norm": 3.846304679224495, + "language_loss": 0.7084049, + "learning_rate": 3.959047236690304e-06, + "loss": 0.78896087, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.43457031, + "step": 1540, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.06744132, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.0634924, + "balance_loss_mlp": 0.0125824, + "epoch": 0.09264993236134075, + "flos": 19872205217280.0, + "grad_norm": 1.8486482297190108, + "language_loss": 0.8567428, + "learning_rate": 3.958968789505198e-06, + "loss": 0.93718112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.41455078, + "step": 1541, + "time_per_iteration": 2.5332911014556885 + }, + { + "auxiliary_loss_clip": 0.06613824, + "auxiliary_loss_mlp": 0.01296188, + "balance_loss_clip": 0.06351398, + "balance_loss_mlp": 0.01268222, + "epoch": 0.09271005561400872, + "flos": 62301455377920.0, + "grad_norm": 0.8853632542817719, + "language_loss": 0.62370431, + "learning_rate": 3.9588902680358e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.28027344, + "step": 1542, + "time_per_iteration": 3.234708309173584 + }, + { + "auxiliary_loss_clip": 0.06759306, + "auxiliary_loss_mlp": 0.01304245, + "balance_loss_clip": 0.06356558, + "balance_loss_mlp": 0.01259923, + "epoch": 0.09277017886667668, + "flos": 23336358641280.0, + "grad_norm": 2.3970894213309, + "language_loss": 0.84548283, + "learning_rate": 3.958811672285086e-06, + "loss": 0.92611837, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.44360352, + "step": 1543, + "time_per_iteration": 2.5636215209960938 + }, + { + "auxiliary_loss_clip": 0.06747155, + "auxiliary_loss_mlp": 0.01303454, + "balance_loss_clip": 0.06351274, + "balance_loss_mlp": 0.01258178, + "epoch": 0.09283030211934466, + "flos": 54757088513280.0, + "grad_norm": 2.335606951107943, + "language_loss": 0.73961073, + "learning_rate": 3.958733002256038e-06, + "loss": 0.82011688, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.45288086, + "step": 1544, + "time_per_iteration": 2.8664584159851074 + }, + { + "auxiliary_loss_clip": 0.06775358, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06364222, + "balance_loss_mlp": 0.01260082, + "epoch": 0.09289042537201263, + "flos": 30342385751040.0, + "grad_norm": 2.3360980643139673, + "language_loss": 0.78971326, + "learning_rate": 3.958654257951637e-06, + "loss": 0.87051487, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.44750977, + "step": 1545, + "time_per_iteration": 2.6384429931640625 + }, + { + "auxiliary_loss_clip": 0.0674521, + "auxiliary_loss_mlp": 0.01308675, + "balance_loss_clip": 0.06349306, + "balance_loss_mlp": 0.01266499, + "epoch": 0.09295054862468059, + "flos": 17752274519040.0, + "grad_norm": 3.8854693427637796, + "language_loss": 0.77781618, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.85835493, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42163086, + "step": 1546, + "time_per_iteration": 2.5352087020874023 + }, + { + "auxiliary_loss_clip": 0.06760454, + "auxiliary_loss_mlp": 0.01300982, + "balance_loss_clip": 0.06357808, + "balance_loss_mlp": 0.01258066, + "epoch": 0.09301067187734856, + "flos": 23664528357120.0, + "grad_norm": 2.488248885797729, + "language_loss": 0.85732055, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.93793488, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.42919922, + "step": 1547, + "time_per_iteration": 2.6185734272003174 + }, + { + "auxiliary_loss_clip": 0.0676943, + "auxiliary_loss_mlp": 0.01302462, + "balance_loss_clip": 0.06361516, + "balance_loss_mlp": 0.01256733, + "epoch": 0.09307079513001654, + "flos": 27535242591360.0, + "grad_norm": 10.105633046635301, + "language_loss": 0.69631422, + "learning_rate": 3.958417579416199e-06, + "loss": 0.77703309, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.45703125, + "step": 1548, + "time_per_iteration": 2.590592861175537 + }, + { + "auxiliary_loss_clip": 0.06756231, + "auxiliary_loss_mlp": 0.01308751, + "balance_loss_clip": 0.06351212, + "balance_loss_mlp": 0.01262164, + "epoch": 0.0931309183826845, + "flos": 20632945236480.0, + "grad_norm": 2.778765119974638, + "language_loss": 0.85783607, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.93848586, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.46582031, + "step": 1549, + "time_per_iteration": 2.5733652114868164 + }, + { + "auxiliary_loss_clip": 0.0674461, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 0.06348558, + "balance_loss_mlp": 0.01260515, + "epoch": 0.09319104163535247, + "flos": 29028239441280.0, + "grad_norm": 2.291130376172184, + "language_loss": 0.78293371, + "learning_rate": 3.958259422403966e-06, + "loss": 0.86345315, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.46777344, + "step": 1550, + "time_per_iteration": 2.675468683242798 + }, + { + "auxiliary_loss_clip": 0.06764482, + "auxiliary_loss_mlp": 0.01307112, + "balance_loss_clip": 0.06363475, + "balance_loss_mlp": 0.01261932, + "epoch": 0.09325116488802045, + "flos": 25308605318400.0, + "grad_norm": 3.8025580487165827, + "language_loss": 0.85284662, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93356252, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.4519043, + "step": 1551, + "time_per_iteration": 2.6053500175476074 + }, + { + "auxiliary_loss_clip": 0.06596169, + "auxiliary_loss_mlp": 0.0130535, + "balance_loss_clip": 0.0633968, + "balance_loss_mlp": 0.01277837, + "epoch": 0.09331128814068841, + "flos": 61556144509440.0, + "grad_norm": 0.7233822491319317, + "language_loss": 0.61895663, + "learning_rate": 3.958100968362163e-06, + "loss": 0.69797182, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.27587891, + "step": 1552, + "time_per_iteration": 3.3384416103363037 + }, + { + "auxiliary_loss_clip": 0.06590016, + "auxiliary_loss_mlp": 0.01301581, + "balance_loss_clip": 0.06333126, + "balance_loss_mlp": 0.012734, + "epoch": 0.09337141139335638, + "flos": 53312810883840.0, + "grad_norm": 0.7946952857616146, + "language_loss": 0.59040678, + "learning_rate": 3.958021629962681e-06, + "loss": 0.66932273, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.28222656, + "step": 1553, + "time_per_iteration": 3.328634262084961 + }, + { + "auxiliary_loss_clip": 0.06762205, + "auxiliary_loss_mlp": 0.01305187, + "balance_loss_clip": 0.06356394, + "balance_loss_mlp": 0.01259005, + "epoch": 0.09343153464602436, + "flos": 23483539537920.0, + "grad_norm": 2.4998209031659853, + "language_loss": 0.888143, + "learning_rate": 3.957942217314823e-06, + "loss": 0.96881694, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.46142578, + "step": 1554, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.06741555, + "auxiliary_loss_mlp": 0.01307833, + "balance_loss_clip": 0.06351957, + "balance_loss_mlp": 0.01266014, + "epoch": 0.09349165789869232, + "flos": 19359399029760.0, + "grad_norm": 2.344370035353047, + "language_loss": 0.83131635, + "learning_rate": 3.957862730421599e-06, + "loss": 0.91181016, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.41772461, + "step": 1555, + "time_per_iteration": 2.5902695655822754 + }, + { + "auxiliary_loss_clip": 0.06587426, + "auxiliary_loss_mlp": 0.01289293, + "balance_loss_clip": 0.06331394, + "balance_loss_mlp": 0.01264736, + "epoch": 0.09355178115136029, + "flos": 67520626968960.0, + "grad_norm": 0.861973728001382, + "language_loss": 0.59963852, + "learning_rate": 3.957783169286024e-06, + "loss": 0.67840576, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.2454834, + "step": 1556, + "time_per_iteration": 4.633097410202026 + }, + { + "auxiliary_loss_clip": 0.06743869, + "auxiliary_loss_mlp": 0.01306461, + "balance_loss_clip": 0.06350282, + "balance_loss_mlp": 0.01262378, + "epoch": 0.09361190440402825, + "flos": 37350676920960.0, + "grad_norm": 4.324378965941339, + "language_loss": 0.86094332, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.94144666, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44091797, + "step": 1557, + "time_per_iteration": 4.159425258636475 + }, + { + "auxiliary_loss_clip": 0.06735416, + "auxiliary_loss_mlp": 0.01305568, + "balance_loss_clip": 0.0634184, + "balance_loss_mlp": 0.01261961, + "epoch": 0.09367202765669623, + "flos": 24906614555520.0, + "grad_norm": 1.8416864834979163, + "language_loss": 0.79618692, + "learning_rate": 3.957623824299893e-06, + "loss": 0.87659669, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.4362793, + "step": 1558, + "time_per_iteration": 2.592564105987549 + }, + { + "auxiliary_loss_clip": 0.0675108, + "auxiliary_loss_mlp": 0.01310633, + "balance_loss_clip": 0.06350247, + "balance_loss_mlp": 0.0126562, + "epoch": 0.0937321509093642, + "flos": 15710986477440.0, + "grad_norm": 2.1774663365636555, + "language_loss": 0.81722063, + "learning_rate": 3.957544040455379e-06, + "loss": 0.89783776, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.44995117, + "step": 1559, + "time_per_iteration": 2.6032233238220215 + }, + { + "auxiliary_loss_clip": 0.06735763, + "auxiliary_loss_mlp": 0.01315647, + "balance_loss_clip": 0.06339972, + "balance_loss_mlp": 0.0126844, + "epoch": 0.09379227416203216, + "flos": 20489663554560.0, + "grad_norm": 4.6744208078316785, + "language_loss": 0.77938354, + "learning_rate": 3.957464182380599e-06, + "loss": 0.85989761, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.47216797, + "step": 1560, + "time_per_iteration": 4.077486753463745 + }, + { + "auxiliary_loss_clip": 0.06748343, + "auxiliary_loss_mlp": 0.01308417, + "balance_loss_clip": 0.06347422, + "balance_loss_mlp": 0.01262736, + "epoch": 0.09385239741470014, + "flos": 24359329612800.0, + "grad_norm": 2.0394992370655975, + "language_loss": 0.82801652, + "learning_rate": 3.95738425007858e-06, + "loss": 0.90858412, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.45678711, + "step": 1561, + "time_per_iteration": 2.596116781234741 + }, + { + "auxiliary_loss_clip": 0.06752103, + "auxiliary_loss_mlp": 0.01323602, + "balance_loss_clip": 0.06347683, + "balance_loss_mlp": 0.01280186, + "epoch": 0.0939125206673681, + "flos": 33299812408320.0, + "grad_norm": 7.4214047506541085, + "language_loss": 0.63655907, + "learning_rate": 3.957304243552354e-06, + "loss": 0.71731609, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 4.075207710266113 + }, + { + "auxiliary_loss_clip": 0.06726522, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 0.06341539, + "balance_loss_mlp": 0.012796, + "epoch": 0.09397264392003607, + "flos": 19250973446400.0, + "grad_norm": 3.0209063418471516, + "language_loss": 0.87167883, + "learning_rate": 3.957224162804956e-06, + "loss": 0.95219523, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.45556641, + "step": 1563, + "time_per_iteration": 2.5672974586486816 + }, + { + "auxiliary_loss_clip": 0.06731268, + "auxiliary_loss_mlp": 0.01318973, + "balance_loss_clip": 0.06341776, + "balance_loss_mlp": 0.01275843, + "epoch": 0.09403276717270405, + "flos": 19323997879680.0, + "grad_norm": 4.036825223775372, + "language_loss": 0.77853692, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.85903931, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.43139648, + "step": 1564, + "time_per_iteration": 2.586803913116455 + }, + { + "auxiliary_loss_clip": 0.06734219, + "auxiliary_loss_mlp": 0.0132655, + "balance_loss_clip": 0.06344242, + "balance_loss_mlp": 0.01285876, + "epoch": 0.09409289042537201, + "flos": 23589701061120.0, + "grad_norm": 2.2846066488683725, + "language_loss": 0.81194431, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.89255196, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.40649414, + "step": 1565, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.06753047, + "auxiliary_loss_mlp": 0.01322466, + "balance_loss_clip": 0.06351732, + "balance_loss_mlp": 0.01275616, + "epoch": 0.09415301367803998, + "flos": 20083689722880.0, + "grad_norm": 2.6435222335860984, + "language_loss": 0.77859378, + "learning_rate": 3.956983475266103e-06, + "loss": 0.85934889, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.46850586, + "step": 1566, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.06732298, + "auxiliary_loss_mlp": 0.01317656, + "balance_loss_clip": 0.06341095, + "balance_loss_mlp": 0.01273048, + "epoch": 0.09421313693070796, + "flos": 21067234548480.0, + "grad_norm": 2.512043511854747, + "language_loss": 0.79885954, + "learning_rate": 3.956903097664407e-06, + "loss": 0.87935913, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.44555664, + "step": 1567, + "time_per_iteration": 2.6127569675445557 + }, + { + "auxiliary_loss_clip": 0.06736939, + "auxiliary_loss_mlp": 0.01312026, + "balance_loss_clip": 0.06345257, + "balance_loss_mlp": 0.01268467, + "epoch": 0.09427326018337592, + "flos": 24323006067840.0, + "grad_norm": 2.023408518632979, + "language_loss": 0.8442241, + "learning_rate": 3.956822645856749e-06, + "loss": 0.92471373, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43505859, + "step": 1568, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.06755883, + "auxiliary_loss_mlp": 0.01306618, + "balance_loss_clip": 0.06353641, + "balance_loss_mlp": 0.01263583, + "epoch": 0.09433338343604389, + "flos": 20269667859840.0, + "grad_norm": 2.477497103121254, + "language_loss": 0.77784359, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.85846859, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.43041992, + "step": 1569, + "time_per_iteration": 2.573776960372925 + }, + { + "auxiliary_loss_clip": 0.06750233, + "auxiliary_loss_mlp": 0.01322236, + "balance_loss_clip": 0.06360742, + "balance_loss_mlp": 0.01281443, + "epoch": 0.09439350668871185, + "flos": 12746683785600.0, + "grad_norm": 3.1104432371221495, + "language_loss": 0.87103617, + "learning_rate": 3.956661519635756e-06, + "loss": 0.95176083, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.40795898, + "step": 1570, + "time_per_iteration": 2.5129590034484863 + }, + { + "auxiliary_loss_clip": 0.06749961, + "auxiliary_loss_mlp": 0.01311255, + "balance_loss_clip": 0.06350505, + "balance_loss_mlp": 0.01269007, + "epoch": 0.09445362994137983, + "flos": 25970101776000.0, + "grad_norm": 2.3671248077954297, + "language_loss": 0.7803812, + "learning_rate": 3.95658084522853e-06, + "loss": 0.86099339, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.42236328, + "step": 1571, + "time_per_iteration": 2.7541556358337402 + }, + { + "auxiliary_loss_clip": 0.0672407, + "auxiliary_loss_mlp": 0.01308455, + "balance_loss_clip": 0.06346194, + "balance_loss_mlp": 0.01269807, + "epoch": 0.0945137531940478, + "flos": 19720831616640.0, + "grad_norm": 2.4306247586771934, + "language_loss": 0.81068146, + "learning_rate": 3.956500096627561e-06, + "loss": 0.89100671, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38623047, + "step": 1572, + "time_per_iteration": 2.5679988861083984 + }, + { + "auxiliary_loss_clip": 0.06744019, + "auxiliary_loss_mlp": 0.01308416, + "balance_loss_clip": 0.06344286, + "balance_loss_mlp": 0.01265691, + "epoch": 0.09457387644671576, + "flos": 23622796224000.0, + "grad_norm": 3.3370924728894185, + "language_loss": 0.8915112, + "learning_rate": 3.956419273835913e-06, + "loss": 0.97203565, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42700195, + "step": 1573, + "time_per_iteration": 2.607600688934326 + }, + { + "auxiliary_loss_clip": 0.06757497, + "auxiliary_loss_mlp": 0.01304776, + "balance_loss_clip": 0.0635422, + "balance_loss_mlp": 0.0125919, + "epoch": 0.09463399969938374, + "flos": 26914681653120.0, + "grad_norm": 3.5983977458342764, + "language_loss": 0.83351094, + "learning_rate": 3.95633837685665e-06, + "loss": 0.91413361, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.45605469, + "step": 1574, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.06738517, + "auxiliary_loss_mlp": 0.01306377, + "balance_loss_clip": 0.06343692, + "balance_loss_mlp": 0.01264463, + "epoch": 0.0946941229520517, + "flos": 23666331219840.0, + "grad_norm": 2.307572986084867, + "language_loss": 0.82900977, + "learning_rate": 3.95625740569284e-06, + "loss": 0.9094587, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.41918945, + "step": 1575, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.06738277, + "auxiliary_loss_mlp": 0.013099, + "balance_loss_clip": 0.06341611, + "balance_loss_mlp": 0.01265912, + "epoch": 0.09475424620471967, + "flos": 24140927145600.0, + "grad_norm": 3.091827797586119, + "language_loss": 0.88420904, + "learning_rate": 3.956176360347553e-06, + "loss": 0.9646908, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.43969727, + "step": 1576, + "time_per_iteration": 2.579481840133667 + }, + { + "auxiliary_loss_clip": 0.06599005, + "auxiliary_loss_mlp": 0.01293963, + "balance_loss_clip": 0.06343846, + "balance_loss_mlp": 0.01269894, + "epoch": 0.09481436945738765, + "flos": 68446283022720.0, + "grad_norm": 0.9736372426009887, + "language_loss": 0.66026628, + "learning_rate": 3.956095240823862e-06, + "loss": 0.73919594, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.24060059, + "step": 1577, + "time_per_iteration": 3.1515533924102783 + }, + { + "auxiliary_loss_clip": 0.06730399, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 0.06338648, + "balance_loss_mlp": 0.01260373, + "epoch": 0.09487449271005562, + "flos": 16659633277440.0, + "grad_norm": 8.095983487206498, + "language_loss": 0.81352609, + "learning_rate": 3.956014047124844e-06, + "loss": 0.89383912, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.40551758, + "step": 1578, + "time_per_iteration": 2.5477943420410156 + }, + { + "auxiliary_loss_clip": 0.06728384, + "auxiliary_loss_mlp": 0.01305272, + "balance_loss_clip": 0.06339101, + "balance_loss_mlp": 0.01262261, + "epoch": 0.09493461596272358, + "flos": 24281860913280.0, + "grad_norm": 2.2398618164761674, + "language_loss": 0.79482144, + "learning_rate": 3.955932779253578e-06, + "loss": 0.87515795, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.43017578, + "step": 1579, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.06732477, + "auxiliary_loss_mlp": 0.01300696, + "balance_loss_clip": 0.06336749, + "balance_loss_mlp": 0.012579, + "epoch": 0.09499473921539155, + "flos": 21876373100160.0, + "grad_norm": 2.5076146880491406, + "language_loss": 0.75397295, + "learning_rate": 3.955851437213144e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.42822266, + "step": 1580, + "time_per_iteration": 2.570138931274414 + }, + { + "auxiliary_loss_clip": 0.06724589, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 0.06333821, + "balance_loss_mlp": 0.01268666, + "epoch": 0.09505486246805953, + "flos": 33555544669440.0, + "grad_norm": 5.064476993970354, + "language_loss": 0.78532892, + "learning_rate": 3.955770021006627e-06, + "loss": 0.86568391, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42236328, + "step": 1581, + "time_per_iteration": 2.6650803089141846 + }, + { + "auxiliary_loss_clip": 0.06722299, + "auxiliary_loss_mlp": 0.01301656, + "balance_loss_clip": 0.06332248, + "balance_loss_mlp": 0.01261006, + "epoch": 0.09511498572072749, + "flos": 21221752677120.0, + "grad_norm": 5.1362606458817925, + "language_loss": 0.89191097, + "learning_rate": 3.955688530637116e-06, + "loss": 0.97215056, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.40698242, + "step": 1582, + "time_per_iteration": 2.5564815998077393 + }, + { + "auxiliary_loss_clip": 0.06727481, + "auxiliary_loss_mlp": 0.01303544, + "balance_loss_clip": 0.06332925, + "balance_loss_mlp": 0.01261773, + "epoch": 0.09517510897339546, + "flos": 14616542373120.0, + "grad_norm": 2.3229781210723393, + "language_loss": 0.68368226, + "learning_rate": 3.955606966107699e-06, + "loss": 0.76399243, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.41772461, + "step": 1583, + "time_per_iteration": 2.6164753437042236 + }, + { + "auxiliary_loss_clip": 0.06727771, + "auxiliary_loss_mlp": 0.01304751, + "balance_loss_clip": 0.06331809, + "balance_loss_mlp": 0.01261048, + "epoch": 0.09523523222606343, + "flos": 27824531212800.0, + "grad_norm": 3.115442275670272, + "language_loss": 0.72724044, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80756557, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43725586, + "step": 1584, + "time_per_iteration": 2.604071855545044 + }, + { + "auxiliary_loss_clip": 0.06722259, + "auxiliary_loss_mlp": 0.01304961, + "balance_loss_clip": 0.06331295, + "balance_loss_mlp": 0.01265431, + "epoch": 0.0952953554787314, + "flos": 20712887631360.0, + "grad_norm": 1.6075041233622491, + "language_loss": 0.82572448, + "learning_rate": 3.955443614581525e-06, + "loss": 0.90599668, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39550781, + "step": 1585, + "time_per_iteration": 2.586507797241211 + }, + { + "auxiliary_loss_clip": 0.0673333, + "auxiliary_loss_mlp": 0.01317767, + "balance_loss_clip": 0.06331026, + "balance_loss_mlp": 0.01272039, + "epoch": 0.09535547873139937, + "flos": 24794080122240.0, + "grad_norm": 2.5515489551775854, + "language_loss": 0.74444079, + "learning_rate": 3.955361827590961e-06, + "loss": 0.82495177, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.45727539, + "step": 1586, + "time_per_iteration": 2.629486083984375 + }, + { + "auxiliary_loss_clip": 0.06581648, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06328419, + "balance_loss_mlp": 0.01258128, + "epoch": 0.09541560198406734, + "flos": 71930114956800.0, + "grad_norm": 0.7905774049307454, + "language_loss": 0.55110765, + "learning_rate": 3.955279966452883e-06, + "loss": 0.62974548, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23974609, + "step": 1587, + "time_per_iteration": 2.9765305519104004 + }, + { + "auxiliary_loss_clip": 0.06737173, + "auxiliary_loss_mlp": 0.01308566, + "balance_loss_clip": 0.06336194, + "balance_loss_mlp": 0.01264316, + "epoch": 0.09547572523673531, + "flos": 28989609909120.0, + "grad_norm": 3.1625529132554835, + "language_loss": 0.82650244, + "learning_rate": 3.955198031170391e-06, + "loss": 0.90695989, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44213867, + "step": 1588, + "time_per_iteration": 2.6358370780944824 + }, + { + "auxiliary_loss_clip": 0.06726347, + "auxiliary_loss_mlp": 0.01313798, + "balance_loss_clip": 0.06331095, + "balance_loss_mlp": 0.01270716, + "epoch": 0.09553584848940327, + "flos": 24140759437440.0, + "grad_norm": 5.541794796195464, + "language_loss": 0.83084911, + "learning_rate": 3.955116021746594e-06, + "loss": 0.91125059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.43066406, + "step": 1589, + "time_per_iteration": 2.609682559967041 + }, + { + "auxiliary_loss_clip": 0.06720543, + "auxiliary_loss_mlp": 0.01306342, + "balance_loss_clip": 0.06330015, + "balance_loss_mlp": 0.01265263, + "epoch": 0.09559597174207124, + "flos": 42861401193600.0, + "grad_norm": 2.659540476465126, + "language_loss": 0.66428804, + "learning_rate": 3.955033938184601e-06, + "loss": 0.7445569, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.41113281, + "step": 1590, + "time_per_iteration": 2.7904412746429443 + }, + { + "auxiliary_loss_clip": 0.06727439, + "auxiliary_loss_mlp": 0.01307692, + "balance_loss_clip": 0.06336293, + "balance_loss_mlp": 0.01267947, + "epoch": 0.09565609499473922, + "flos": 32678999907840.0, + "grad_norm": 1.976054240399588, + "language_loss": 0.84640449, + "learning_rate": 3.954951780487526e-06, + "loss": 0.92675579, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.39746094, + "step": 1591, + "time_per_iteration": 2.677856683731079 + }, + { + "auxiliary_loss_clip": 0.0673625, + "auxiliary_loss_mlp": 0.01301164, + "balance_loss_clip": 0.06335758, + "balance_loss_mlp": 0.01259751, + "epoch": 0.09571621824740718, + "flos": 18484279787520.0, + "grad_norm": 3.2019409014799245, + "language_loss": 0.76485634, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.84523046, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.41381836, + "step": 1592, + "time_per_iteration": 2.5469346046447754 + }, + { + "auxiliary_loss_clip": 0.06718349, + "auxiliary_loss_mlp": 0.01308454, + "balance_loss_clip": 0.06327368, + "balance_loss_mlp": 0.01266444, + "epoch": 0.09577634150007515, + "flos": 29395164470400.0, + "grad_norm": 2.5830614134690757, + "language_loss": 0.75440031, + "learning_rate": 3.954787242700592e-06, + "loss": 0.8346684, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.42041016, + "step": 1593, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.06715257, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 0.06327495, + "balance_loss_mlp": 0.01269863, + "epoch": 0.09583646475274313, + "flos": 22754511089280.0, + "grad_norm": 3.098780608368182, + "language_loss": 0.70938909, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78967637, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.4362793, + "step": 1594, + "time_per_iteration": 2.6091833114624023 + }, + { + "auxiliary_loss_clip": 0.06719844, + "auxiliary_loss_mlp": 0.01312184, + "balance_loss_clip": 0.06326512, + "balance_loss_mlp": 0.01271247, + "epoch": 0.0958965880054111, + "flos": 23224495040640.0, + "grad_norm": 3.065197690061672, + "language_loss": 0.83355862, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91387886, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 3.978273630142212 + }, + { + "auxiliary_loss_clip": 0.06729501, + "auxiliary_loss_mlp": 0.01321195, + "balance_loss_clip": 0.06329941, + "balance_loss_mlp": 0.01278638, + "epoch": 0.09595671125807906, + "flos": 21330807166080.0, + "grad_norm": 2.8509518249201866, + "language_loss": 0.87066317, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95117009, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.42529297, + "step": 1596, + "time_per_iteration": 4.032626390457153 + }, + { + "auxiliary_loss_clip": 0.06723377, + "auxiliary_loss_mlp": 0.01316069, + "balance_loss_clip": 0.06335501, + "balance_loss_mlp": 0.01273273, + "epoch": 0.09601683451074704, + "flos": 39612841125120.0, + "grad_norm": 3.1423731979310587, + "language_loss": 0.70766866, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.78806317, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42773438, + "step": 1597, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.06742129, + "auxiliary_loss_mlp": 0.01306146, + "balance_loss_clip": 0.06339651, + "balance_loss_mlp": 0.01265687, + "epoch": 0.096076957763415, + "flos": 23739523361280.0, + "grad_norm": 3.050895337571829, + "language_loss": 0.77272135, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85320413, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.40429688, + "step": 1598, + "time_per_iteration": 2.5799829959869385 + }, + { + "auxiliary_loss_clip": 0.06737213, + "auxiliary_loss_mlp": 0.01319114, + "balance_loss_clip": 0.06339812, + "balance_loss_mlp": 0.01276103, + "epoch": 0.09613708101608297, + "flos": 34686689662080.0, + "grad_norm": 4.982256482437043, + "language_loss": 0.70875788, + "learning_rate": 3.954291850422382e-06, + "loss": 0.78932118, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.43041992, + "step": 1599, + "time_per_iteration": 4.165144443511963 + }, + { + "auxiliary_loss_clip": 0.0672265, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 0.06336158, + "balance_loss_mlp": 0.01275029, + "epoch": 0.09619720426875093, + "flos": 20746192429440.0, + "grad_norm": 2.7563705555600655, + "language_loss": 0.85738063, + "learning_rate": 3.954209025650093e-06, + "loss": 0.93775773, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.40014648, + "step": 1600, + "time_per_iteration": 2.583336591720581 + }, + { + "auxiliary_loss_clip": 0.06737998, + "auxiliary_loss_mlp": 0.01310218, + "balance_loss_clip": 0.06341977, + "balance_loss_mlp": 0.01270641, + "epoch": 0.09625732752141891, + "flos": 13047795832320.0, + "grad_norm": 2.909698328635622, + "language_loss": 0.82446879, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90495098, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.39599609, + "step": 1601, + "time_per_iteration": 3.9834721088409424 + }, + { + "auxiliary_loss_clip": 0.06743482, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 0.06337628, + "balance_loss_mlp": 0.01262368, + "epoch": 0.09631745077408688, + "flos": 22280250579840.0, + "grad_norm": 5.887605287140624, + "language_loss": 0.84592891, + "learning_rate": 3.954043153797251e-06, + "loss": 0.92639828, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.41088867, + "step": 1602, + "time_per_iteration": 2.5633962154388428 + }, + { + "auxiliary_loss_clip": 0.06747036, + "auxiliary_loss_mlp": 0.01307728, + "balance_loss_clip": 0.06349348, + "balance_loss_mlp": 0.012661, + "epoch": 0.09637757402675484, + "flos": 24761236521600.0, + "grad_norm": 2.955003508709107, + "language_loss": 0.65285349, + "learning_rate": 3.953960106722989e-06, + "loss": 0.73340118, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.41625977, + "step": 1603, + "time_per_iteration": 2.6790709495544434 + }, + { + "auxiliary_loss_clip": 0.06770037, + "auxiliary_loss_mlp": 0.01301761, + "balance_loss_clip": 0.06360609, + "balance_loss_mlp": 0.01258321, + "epoch": 0.09643769727942282, + "flos": 22531873991040.0, + "grad_norm": 5.353230367509213, + "language_loss": 0.72867018, + "learning_rate": 3.953876985554364e-06, + "loss": 0.80938816, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.43505859, + "step": 1604, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.06740201, + "auxiliary_loss_mlp": 0.01291258, + "balance_loss_clip": 0.06351058, + "balance_loss_mlp": 0.01254327, + "epoch": 0.09649782053209079, + "flos": 30929138766720.0, + "grad_norm": 4.793252253869783, + "language_loss": 0.80923069, + "learning_rate": 3.953793790294527e-06, + "loss": 0.88954532, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.36938477, + "step": 1605, + "time_per_iteration": 2.6763031482696533 + }, + { + "auxiliary_loss_clip": 0.06759577, + "auxiliary_loss_mlp": 0.01298287, + "balance_loss_clip": 0.06351094, + "balance_loss_mlp": 0.01258805, + "epoch": 0.09655794378475875, + "flos": 25344635374080.0, + "grad_norm": 2.3859738867756524, + "language_loss": 0.77227855, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85285711, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.39501953, + "step": 1606, + "time_per_iteration": 2.5902390480041504 + }, + { + "auxiliary_loss_clip": 0.0675118, + "auxiliary_loss_mlp": 0.0129606, + "balance_loss_clip": 0.06355944, + "balance_loss_mlp": 0.01258009, + "epoch": 0.09661806703742673, + "flos": 22352604180480.0, + "grad_norm": 2.2398823980048133, + "language_loss": 0.77161521, + "learning_rate": 3.953627177513843e-06, + "loss": 0.85208762, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.953125, + "router_z_loss_mlp": 0.38085938, + "step": 1607, + "time_per_iteration": 2.5747807025909424 + }, + { + "auxiliary_loss_clip": 0.06767638, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 0.06365312, + "balance_loss_mlp": 0.01268597, + "epoch": 0.0966781902900947, + "flos": 17463405168000.0, + "grad_norm": 2.424309477239619, + "language_loss": 0.89527833, + "learning_rate": 3.953543759999312e-06, + "loss": 0.97601831, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.37768555, + "step": 1608, + "time_per_iteration": 2.528881072998047 + }, + { + "auxiliary_loss_clip": 0.06782863, + "auxiliary_loss_mlp": 0.01306552, + "balance_loss_clip": 0.06378618, + "balance_loss_mlp": 0.01264471, + "epoch": 0.09673831354276266, + "flos": 36912991518720.0, + "grad_norm": 7.970472148643012, + "language_loss": 0.74000025, + "learning_rate": 3.953460268406207e-06, + "loss": 0.82089442, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4206543, + "step": 1609, + "time_per_iteration": 2.734060764312744 + }, + { + "auxiliary_loss_clip": 0.06767572, + "auxiliary_loss_mlp": 0.01304591, + "balance_loss_clip": 0.06368488, + "balance_loss_mlp": 0.01264418, + "epoch": 0.09679843679543064, + "flos": 20707185553920.0, + "grad_norm": 3.4585784172758123, + "language_loss": 0.86017323, + "learning_rate": 3.953376702737693e-06, + "loss": 0.94089484, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.40185547, + "step": 1610, + "time_per_iteration": 2.6115059852600098 + }, + { + "auxiliary_loss_clip": 0.06763892, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 0.06364195, + "balance_loss_mlp": 0.01263877, + "epoch": 0.0968585600480986, + "flos": 23521288602240.0, + "grad_norm": 2.270672864322457, + "language_loss": 0.68734491, + "learning_rate": 3.953293062996939e-06, + "loss": 0.76803291, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.41040039, + "step": 1611, + "time_per_iteration": 2.614010810852051 + }, + { + "auxiliary_loss_clip": 0.06775121, + "auxiliary_loss_mlp": 0.01302817, + "balance_loss_clip": 0.06373329, + "balance_loss_mlp": 0.01263239, + "epoch": 0.09691868330076657, + "flos": 20127350499840.0, + "grad_norm": 2.139701940573329, + "language_loss": 0.82997268, + "learning_rate": 3.953209349187115e-06, + "loss": 0.91075206, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.39599609, + "step": 1612, + "time_per_iteration": 2.5493521690368652 + }, + { + "auxiliary_loss_clip": 0.06771481, + "auxiliary_loss_mlp": 0.01301111, + "balance_loss_clip": 0.06373016, + "balance_loss_mlp": 0.01260509, + "epoch": 0.09697880655343454, + "flos": 16550243372160.0, + "grad_norm": 8.083682244788854, + "language_loss": 0.82256299, + "learning_rate": 3.953125561311398e-06, + "loss": 0.90328896, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.40600586, + "step": 1613, + "time_per_iteration": 2.597912311553955 + }, + { + "auxiliary_loss_clip": 0.06750716, + "auxiliary_loss_mlp": 0.01299993, + "balance_loss_clip": 0.06359349, + "balance_loss_mlp": 0.01259724, + "epoch": 0.09703892980610251, + "flos": 26111370960000.0, + "grad_norm": 2.0260319330855654, + "language_loss": 0.86653531, + "learning_rate": 3.953041699372964e-06, + "loss": 0.94704247, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.40258789, + "step": 1614, + "time_per_iteration": 2.6904046535491943 + }, + { + "auxiliary_loss_clip": 0.06673412, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 0.06412064, + "balance_loss_mlp": 0.01308611, + "epoch": 0.09709905305877048, + "flos": 60463712903040.0, + "grad_norm": 0.7036996820791193, + "language_loss": 0.54819673, + "learning_rate": 3.952957763374992e-06, + "loss": 0.6282385, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.22180176, + "step": 1615, + "time_per_iteration": 3.235962152481079 + }, + { + "auxiliary_loss_clip": 0.06658442, + "auxiliary_loss_mlp": 0.01303789, + "balance_loss_clip": 0.06397749, + "balance_loss_mlp": 0.01282129, + "epoch": 0.09715917631143844, + "flos": 57660510885120.0, + "grad_norm": 0.7526049722603284, + "language_loss": 0.58190084, + "learning_rate": 3.952873753320666e-06, + "loss": 0.66152322, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.21691895, + "step": 1616, + "time_per_iteration": 3.387523889541626 + }, + { + "auxiliary_loss_clip": 0.06757308, + "auxiliary_loss_mlp": 0.01307733, + "balance_loss_clip": 0.06359798, + "balance_loss_mlp": 0.01265652, + "epoch": 0.09721929956410642, + "flos": 20564448923520.0, + "grad_norm": 2.209089082853045, + "language_loss": 0.70192569, + "learning_rate": 3.952789669213172e-06, + "loss": 0.78257608, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.42041016, + "step": 1617, + "time_per_iteration": 2.5756118297576904 + }, + { + "auxiliary_loss_clip": 0.06757677, + "auxiliary_loss_mlp": 0.0131002, + "balance_loss_clip": 0.06358766, + "balance_loss_mlp": 0.01269298, + "epoch": 0.09727942281677439, + "flos": 27351696222720.0, + "grad_norm": 2.235248973511229, + "language_loss": 0.81849337, + "learning_rate": 3.952705511055698e-06, + "loss": 0.89917034, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.40722656, + "step": 1618, + "time_per_iteration": 2.6768393516540527 + }, + { + "auxiliary_loss_clip": 0.0674091, + "auxiliary_loss_mlp": 0.01309795, + "balance_loss_clip": 0.06356256, + "balance_loss_mlp": 0.01273293, + "epoch": 0.09733954606944235, + "flos": 24906991898880.0, + "grad_norm": 1.9369475823390685, + "language_loss": 0.94461536, + "learning_rate": 3.952621278851435e-06, + "loss": 1.0251224, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.36474609, + "step": 1619, + "time_per_iteration": 2.6324799060821533 + }, + { + "auxiliary_loss_clip": 0.06749003, + "auxiliary_loss_mlp": 0.01319848, + "balance_loss_clip": 0.06356695, + "balance_loss_mlp": 0.01280556, + "epoch": 0.09739966932211033, + "flos": 31511992567680.0, + "grad_norm": 2.8077555075872183, + "language_loss": 0.90160304, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.98229158, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.39257812, + "step": 1620, + "time_per_iteration": 2.658043146133423 + }, + { + "auxiliary_loss_clip": 0.06742691, + "auxiliary_loss_mlp": 0.01310778, + "balance_loss_clip": 0.06352507, + "balance_loss_mlp": 0.01268602, + "epoch": 0.0974597925747783, + "flos": 23885614154880.0, + "grad_norm": 11.754534189846764, + "language_loss": 0.78833234, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86886704, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.421875, + "step": 1621, + "time_per_iteration": 2.575810432434082 + }, + { + "auxiliary_loss_clip": 0.06744215, + "auxiliary_loss_mlp": 0.01311535, + "balance_loss_clip": 0.06357577, + "balance_loss_mlp": 0.01271863, + "epoch": 0.09751991582744626, + "flos": 17025300495360.0, + "grad_norm": 3.321884403192612, + "language_loss": 0.7956326, + "learning_rate": 3.952368137989871e-06, + "loss": 0.87619019, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.39648438, + "step": 1622, + "time_per_iteration": 2.5544931888580322 + }, + { + "auxiliary_loss_clip": 0.06764823, + "auxiliary_loss_mlp": 0.01312235, + "balance_loss_clip": 0.06359966, + "balance_loss_mlp": 0.0127199, + "epoch": 0.09758003908011423, + "flos": 28410403760640.0, + "grad_norm": 4.629544309513281, + "language_loss": 0.86985308, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.95062363, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40209961, + "step": 1623, + "time_per_iteration": 2.612455129623413 + }, + { + "auxiliary_loss_clip": 0.06759211, + "auxiliary_loss_mlp": 0.01313929, + "balance_loss_clip": 0.06368798, + "balance_loss_mlp": 0.01275353, + "epoch": 0.09764016233278221, + "flos": 18149150183040.0, + "grad_norm": 2.3724260177997, + "language_loss": 0.82168519, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90241659, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38598633, + "step": 1624, + "time_per_iteration": 2.572327136993408 + }, + { + "auxiliary_loss_clip": 0.06750062, + "auxiliary_loss_mlp": 0.01321107, + "balance_loss_clip": 0.06362263, + "balance_loss_mlp": 0.01284462, + "epoch": 0.09770028558545017, + "flos": 15270869306880.0, + "grad_norm": 2.8002590375685195, + "language_loss": 0.87639892, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95711064, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.36645508, + "step": 1625, + "time_per_iteration": 2.5327792167663574 + }, + { + "auxiliary_loss_clip": 0.06781108, + "auxiliary_loss_mlp": 0.01314743, + "balance_loss_clip": 0.06374431, + "balance_loss_mlp": 0.01273353, + "epoch": 0.09776040883811814, + "flos": 23478382512000.0, + "grad_norm": 2.111707696763749, + "language_loss": 0.8695811, + "learning_rate": 3.952029580380172e-06, + "loss": 0.95053965, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.4140625, + "step": 1626, + "time_per_iteration": 2.631251096725464 + }, + { + "auxiliary_loss_clip": 0.067652, + "auxiliary_loss_mlp": 0.01306731, + "balance_loss_clip": 0.06367379, + "balance_loss_mlp": 0.01267177, + "epoch": 0.09782053209078612, + "flos": 24506510509440.0, + "grad_norm": 2.38090987978409, + "language_loss": 0.84928203, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.93000138, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.39550781, + "step": 1627, + "time_per_iteration": 2.6171953678131104 + }, + { + "auxiliary_loss_clip": 0.06749414, + "auxiliary_loss_mlp": 0.01311575, + "balance_loss_clip": 0.06362557, + "balance_loss_mlp": 0.01274334, + "epoch": 0.09788065534345408, + "flos": 21586623281280.0, + "grad_norm": 2.0465991602511107, + "language_loss": 0.86433482, + "learning_rate": 3.951859857435534e-06, + "loss": 0.94494474, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.37255859, + "step": 1628, + "time_per_iteration": 2.5730161666870117 + }, + { + "auxiliary_loss_clip": 0.06751154, + "auxiliary_loss_mlp": 0.013221, + "balance_loss_clip": 0.06365977, + "balance_loss_mlp": 0.0128362, + "epoch": 0.09794077859612205, + "flos": 23849332536960.0, + "grad_norm": 2.074450963540643, + "language_loss": 0.76707101, + "learning_rate": 3.951774884939523e-06, + "loss": 0.84780353, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38452148, + "step": 1629, + "time_per_iteration": 2.615643262863159 + }, + { + "auxiliary_loss_clip": 0.06753751, + "auxiliary_loss_mlp": 0.01312675, + "balance_loss_clip": 0.06363355, + "balance_loss_mlp": 0.01273288, + "epoch": 0.09800090184879003, + "flos": 23666708563200.0, + "grad_norm": 2.0658158581699806, + "language_loss": 0.79474878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.87541306, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39379883, + "step": 1630, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.06751612, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 0.06359278, + "balance_loss_mlp": 0.01266335, + "epoch": 0.09806102510145799, + "flos": 17061456332160.0, + "grad_norm": 3.092577982684634, + "language_loss": 0.88391125, + "learning_rate": 3.951604717916228e-06, + "loss": 0.96448845, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.39770508, + "step": 1631, + "time_per_iteration": 2.545468807220459 + }, + { + "auxiliary_loss_clip": 0.06742664, + "auxiliary_loss_mlp": 0.01296447, + "balance_loss_clip": 0.06359032, + "balance_loss_mlp": 0.01259039, + "epoch": 0.09812114835412596, + "flos": 23885278738560.0, + "grad_norm": 2.2303411170681566, + "language_loss": 0.8421644, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.92255551, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.37426758, + "step": 1632, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.06746343, + "auxiliary_loss_mlp": 0.01300275, + "balance_loss_clip": 0.0636283, + "balance_loss_mlp": 0.01262557, + "epoch": 0.09818127160679392, + "flos": 20601862571520.0, + "grad_norm": 2.054168262723839, + "language_loss": 0.80421484, + "learning_rate": 3.951434254872751e-06, + "loss": 0.88468099, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.37744141, + "step": 1633, + "time_per_iteration": 2.5900163650512695 + }, + { + "auxiliary_loss_clip": 0.06752759, + "auxiliary_loss_mlp": 0.01296054, + "balance_loss_clip": 0.06366011, + "balance_loss_mlp": 0.01257931, + "epoch": 0.0982413948594619, + "flos": 15492835572480.0, + "grad_norm": 3.0165255601535743, + "language_loss": 0.74936914, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82985729, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.38134766, + "step": 1634, + "time_per_iteration": 3.9524917602539062 + }, + { + "auxiliary_loss_clip": 0.06754396, + "auxiliary_loss_mlp": 0.01296894, + "balance_loss_clip": 0.06358244, + "balance_loss_mlp": 0.01258485, + "epoch": 0.09830151811212987, + "flos": 24214999754880.0, + "grad_norm": 4.629396807552869, + "language_loss": 0.75166363, + "learning_rate": 3.951263495834947e-06, + "loss": 0.83217651, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.3840332, + "step": 1635, + "time_per_iteration": 2.619173049926758 + }, + { + "auxiliary_loss_clip": 0.06750873, + "auxiliary_loss_mlp": 0.01303971, + "balance_loss_clip": 0.0635405, + "balance_loss_mlp": 0.01262486, + "epoch": 0.09836164136479783, + "flos": 20600814395520.0, + "grad_norm": 5.1262872331137945, + "language_loss": 0.79884511, + "learning_rate": 3.951178005326264e-06, + "loss": 0.87939358, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.41455078, + "step": 1636, + "time_per_iteration": 4.063632965087891 + }, + { + "auxiliary_loss_clip": 0.06755228, + "auxiliary_loss_mlp": 0.0130259, + "balance_loss_clip": 0.06357834, + "balance_loss_mlp": 0.01260486, + "epoch": 0.09842176461746581, + "flos": 19939653354240.0, + "grad_norm": 2.182253503011162, + "language_loss": 0.72318256, + "learning_rate": 3.951092440828715e-06, + "loss": 0.80376077, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.42163086, + "step": 1637, + "time_per_iteration": 2.573108196258545 + }, + { + "auxiliary_loss_clip": 0.0673624, + "auxiliary_loss_mlp": 0.01302289, + "balance_loss_clip": 0.06349343, + "balance_loss_mlp": 0.01263045, + "epoch": 0.09848188787013377, + "flos": 21220956063360.0, + "grad_norm": 2.9423896219595016, + "language_loss": 0.79459947, + "learning_rate": 3.951006802345545e-06, + "loss": 0.87498474, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.39257812, + "step": 1638, + "time_per_iteration": 2.620058536529541 + }, + { + "auxiliary_loss_clip": 0.06725241, + "auxiliary_loss_mlp": 0.01294434, + "balance_loss_clip": 0.06345727, + "balance_loss_mlp": 0.01258027, + "epoch": 0.09854201112280174, + "flos": 30162109691520.0, + "grad_norm": 1.743966069044169, + "language_loss": 0.7446866, + "learning_rate": 3.950921089880003e-06, + "loss": 0.82488334, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.36401367, + "step": 1639, + "time_per_iteration": 4.186578750610352 + }, + { + "auxiliary_loss_clip": 0.06740695, + "auxiliary_loss_mlp": 0.01301032, + "balance_loss_clip": 0.06346842, + "balance_loss_mlp": 0.01260025, + "epoch": 0.09860213437546972, + "flos": 21801671585280.0, + "grad_norm": 2.1837560711862114, + "language_loss": 0.90050477, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9809221, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.41040039, + "step": 1640, + "time_per_iteration": 2.571072816848755 + }, + { + "auxiliary_loss_clip": 0.06734361, + "auxiliary_loss_mlp": 0.01304387, + "balance_loss_clip": 0.06346233, + "balance_loss_mlp": 0.01265548, + "epoch": 0.09866225762813768, + "flos": 21842062053120.0, + "grad_norm": 2.730520486163119, + "language_loss": 0.82726961, + "learning_rate": 3.950749443014801e-06, + "loss": 0.90765709, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.38818359, + "step": 1641, + "time_per_iteration": 3.9849867820739746 + }, + { + "auxiliary_loss_clip": 0.06739942, + "auxiliary_loss_mlp": 0.01313392, + "balance_loss_clip": 0.06347778, + "balance_loss_mlp": 0.01271692, + "epoch": 0.09872238088080565, + "flos": 17605093622400.0, + "grad_norm": 3.096093902434135, + "language_loss": 0.88531339, + "learning_rate": 3.95066350862165e-06, + "loss": 0.96584678, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.41699219, + "step": 1642, + "time_per_iteration": 2.516415596008301 + }, + { + "auxiliary_loss_clip": 0.06737699, + "auxiliary_loss_mlp": 0.01318919, + "balance_loss_clip": 0.06353228, + "balance_loss_mlp": 0.01281606, + "epoch": 0.09878250413347361, + "flos": 27643500466560.0, + "grad_norm": 2.0791034906225883, + "language_loss": 0.82263941, + "learning_rate": 3.950577500259144e-06, + "loss": 0.90320563, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.37304688, + "step": 1643, + "time_per_iteration": 2.647494077682495 + }, + { + "auxiliary_loss_clip": 0.06734201, + "auxiliary_loss_mlp": 0.01331721, + "balance_loss_clip": 0.06346507, + "balance_loss_mlp": 0.01293407, + "epoch": 0.0988426273861416, + "flos": 16550285299200.0, + "grad_norm": 2.4456553195112574, + "language_loss": 0.84032261, + "learning_rate": 3.950491417930543e-06, + "loss": 0.92098182, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.3828125, + "step": 1644, + "time_per_iteration": 2.532773733139038 + }, + { + "auxiliary_loss_clip": 0.06725995, + "auxiliary_loss_mlp": 0.01324281, + "balance_loss_clip": 0.06350633, + "balance_loss_mlp": 0.0128499, + "epoch": 0.09890275063880956, + "flos": 21221668823040.0, + "grad_norm": 2.0467133061416956, + "language_loss": 0.70372713, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.39282227, + "step": 1645, + "time_per_iteration": 2.622675657272339 + }, + { + "auxiliary_loss_clip": 0.06615774, + "auxiliary_loss_mlp": 0.01318713, + "balance_loss_clip": 0.06367776, + "balance_loss_mlp": 0.01297721, + "epoch": 0.09896287389147752, + "flos": 59398255111680.0, + "grad_norm": 0.866313536392572, + "language_loss": 0.6076256, + "learning_rate": 3.950319031388119e-06, + "loss": 0.68697047, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.21008301, + "step": 1646, + "time_per_iteration": 3.1056430339813232 + }, + { + "auxiliary_loss_clip": 0.06736847, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 0.06343894, + "balance_loss_mlp": 0.01288517, + "epoch": 0.0990229971441455, + "flos": 29650351680000.0, + "grad_norm": 13.669187568501263, + "language_loss": 0.74906254, + "learning_rate": 3.950232727180833e-06, + "loss": 0.82974058, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42456055, + "step": 1647, + "time_per_iteration": 2.6270813941955566 + }, + { + "auxiliary_loss_clip": 0.06742343, + "auxiliary_loss_mlp": 0.01344997, + "balance_loss_clip": 0.0635362, + "balance_loss_mlp": 0.01305277, + "epoch": 0.09908312039681347, + "flos": 21841265439360.0, + "grad_norm": 3.219880040136517, + "language_loss": 0.86054468, + "learning_rate": 3.950146349020525e-06, + "loss": 0.94141805, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.3972168, + "step": 1648, + "time_per_iteration": 2.6192800998687744 + }, + { + "auxiliary_loss_clip": 0.06595583, + "auxiliary_loss_mlp": 0.01312987, + "balance_loss_clip": 0.06350748, + "balance_loss_mlp": 0.01292542, + "epoch": 0.09914324364948143, + "flos": 57584425777920.0, + "grad_norm": 0.7273762983113155, + "language_loss": 0.5560773, + "learning_rate": 3.950059896910473e-06, + "loss": 0.63516295, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1649, + "time_per_iteration": 3.1318249702453613 + }, + { + "auxiliary_loss_clip": 0.06736004, + "auxiliary_loss_mlp": 0.01331784, + "balance_loss_clip": 0.06347787, + "balance_loss_mlp": 0.01293232, + "epoch": 0.09920336690214941, + "flos": 34131270873600.0, + "grad_norm": 3.80404299498915, + "language_loss": 0.92154968, + "learning_rate": 3.949973370853954e-06, + "loss": 1.00222754, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38574219, + "step": 1650, + "time_per_iteration": 2.640519142150879 + }, + { + "auxiliary_loss_clip": 0.06583999, + "auxiliary_loss_mlp": 0.012899, + "balance_loss_clip": 0.06337862, + "balance_loss_mlp": 0.012688, + "epoch": 0.09926349015481738, + "flos": 71239910947200.0, + "grad_norm": 0.7750953568391499, + "language_loss": 0.63578606, + "learning_rate": 3.94988677085425e-06, + "loss": 0.71452504, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.21118164, + "step": 1651, + "time_per_iteration": 3.380758047103882 + }, + { + "auxiliary_loss_clip": 0.06739324, + "auxiliary_loss_mlp": 0.01313359, + "balance_loss_clip": 0.06352896, + "balance_loss_mlp": 0.01275236, + "epoch": 0.09932361340748534, + "flos": 23155369822080.0, + "grad_norm": 3.694899481712973, + "language_loss": 0.89802289, + "learning_rate": 3.949800096914643e-06, + "loss": 0.97854972, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.38110352, + "step": 1652, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.06737585, + "auxiliary_loss_mlp": 0.01305643, + "balance_loss_clip": 0.06349514, + "balance_loss_mlp": 0.01267735, + "epoch": 0.09938373666015332, + "flos": 19834791569280.0, + "grad_norm": 2.586330184077195, + "language_loss": 0.8401894, + "learning_rate": 3.949713349038422e-06, + "loss": 0.92062169, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.37890625, + "step": 1653, + "time_per_iteration": 2.5631346702575684 + }, + { + "auxiliary_loss_clip": 0.0674301, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 0.06348432, + "balance_loss_mlp": 0.01266428, + "epoch": 0.09944385991282129, + "flos": 22097165408640.0, + "grad_norm": 3.5179958225358914, + "language_loss": 0.81669748, + "learning_rate": 3.949626527228875e-06, + "loss": 0.89719361, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.40136719, + "step": 1654, + "time_per_iteration": 2.602562427520752 + }, + { + "auxiliary_loss_clip": 0.06716993, + "auxiliary_loss_mlp": 0.01303058, + "balance_loss_clip": 0.0634619, + "balance_loss_mlp": 0.01268178, + "epoch": 0.09950398316548925, + "flos": 19835043131520.0, + "grad_norm": 8.671208784933132, + "language_loss": 0.83012509, + "learning_rate": 3.949539631489295e-06, + "loss": 0.91032565, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.34863281, + "step": 1655, + "time_per_iteration": 2.5673985481262207 + }, + { + "auxiliary_loss_clip": 0.06726938, + "auxiliary_loss_mlp": 0.01297279, + "balance_loss_clip": 0.06340201, + "balance_loss_mlp": 0.01259799, + "epoch": 0.09956410641815722, + "flos": 25009715404800.0, + "grad_norm": 2.461628043042503, + "language_loss": 0.82767576, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.37475586, + "step": 1656, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.06710893, + "auxiliary_loss_mlp": 0.01307317, + "balance_loss_clip": 0.06336491, + "balance_loss_mlp": 0.01268812, + "epoch": 0.0996242296708252, + "flos": 19323746317440.0, + "grad_norm": 1.719286888169867, + "language_loss": 0.90283895, + "learning_rate": 3.949365618233217e-06, + "loss": 0.98302102, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38525391, + "step": 1657, + "time_per_iteration": 2.57688045501709 + }, + { + "auxiliary_loss_clip": 0.06739774, + "auxiliary_loss_mlp": 0.01311666, + "balance_loss_clip": 0.06340782, + "balance_loss_mlp": 0.01267869, + "epoch": 0.09968435292349316, + "flos": 21878050181760.0, + "grad_norm": 2.9029706728478533, + "language_loss": 0.87311482, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.95362926, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.43823242, + "step": 1658, + "time_per_iteration": 2.628093719482422 + }, + { + "auxiliary_loss_clip": 0.06571998, + "auxiliary_loss_mlp": 0.01376397, + "balance_loss_clip": 0.06328425, + "balance_loss_mlp": 0.01349933, + "epoch": 0.09974447617616113, + "flos": 65401912154880.0, + "grad_norm": 0.9037243571562794, + "language_loss": 0.60433233, + "learning_rate": 3.949191309296585e-06, + "loss": 0.68381631, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26513672, + "step": 1659, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.06713426, + "auxiliary_loss_mlp": 0.01317119, + "balance_loss_clip": 0.06331229, + "balance_loss_mlp": 0.0127735, + "epoch": 0.0998045994288291, + "flos": 23666624709120.0, + "grad_norm": 2.0571407511312865, + "language_loss": 0.87086773, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95117325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.39746094, + "step": 1660, + "time_per_iteration": 2.5779190063476562 + }, + { + "auxiliary_loss_clip": 0.0670151, + "auxiliary_loss_mlp": 0.01332109, + "balance_loss_clip": 0.06323117, + "balance_loss_mlp": 0.01290529, + "epoch": 0.09986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.4762315311071315, + "language_loss": 0.80644435, + "learning_rate": 3.949016704705836e-06, + "loss": 0.88678062, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.41552734, + "step": 1661, + "time_per_iteration": 2.691804885864258 + }, + { + "auxiliary_loss_clip": 0.06725313, + "auxiliary_loss_mlp": 0.0132162, + "balance_loss_clip": 0.0632514, + "balance_loss_mlp": 0.01278443, + "epoch": 0.09992484593416504, + "flos": 26220467376000.0, + "grad_norm": 2.2620896744149412, + "language_loss": 0.8613416, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94181097, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.43164062, + "step": 1662, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.06704119, + "auxiliary_loss_mlp": 0.0133037, + "balance_loss_clip": 0.06321694, + "balance_loss_mlp": 0.0128941, + "epoch": 0.09998496918683301, + "flos": 17499393296640.0, + "grad_norm": 2.3672212997838993, + "language_loss": 0.90448183, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98482674, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40966797, + "step": 1663, + "time_per_iteration": 2.6671247482299805 + }, + { + "auxiliary_loss_clip": 0.06712753, + "auxiliary_loss_mlp": 0.01334758, + "balance_loss_clip": 0.06319161, + "balance_loss_mlp": 0.01292105, + "epoch": 0.10004509243950098, + "flos": 22791715102080.0, + "grad_norm": 2.952995005402735, + "language_loss": 0.72149938, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80197442, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.42651367, + "step": 1664, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.06713652, + "auxiliary_loss_mlp": 0.01325429, + "balance_loss_clip": 0.06323303, + "balance_loss_mlp": 0.01284159, + "epoch": 0.10010521569216894, + "flos": 16258984179840.0, + "grad_norm": 39.90990553234195, + "language_loss": 0.80576968, + "learning_rate": 3.94866660866797e-06, + "loss": 0.88616049, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.41235352, + "step": 1665, + "time_per_iteration": 2.605639934539795 + }, + { + "auxiliary_loss_clip": 0.06714154, + "auxiliary_loss_mlp": 0.01316999, + "balance_loss_clip": 0.06327689, + "balance_loss_mlp": 0.01278017, + "epoch": 0.10016533894483691, + "flos": 23409047658240.0, + "grad_norm": 2.1899546372821566, + "language_loss": 0.71735048, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.79766202, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38964844, + "step": 1666, + "time_per_iteration": 2.565112352371216 + }, + { + "auxiliary_loss_clip": 0.06721501, + "auxiliary_loss_mlp": 0.01334152, + "balance_loss_clip": 0.06329556, + "balance_loss_mlp": 0.01286492, + "epoch": 0.10022546219750489, + "flos": 19360195643520.0, + "grad_norm": 2.4453770076419055, + "language_loss": 0.80451995, + "learning_rate": 3.948491117273956e-06, + "loss": 0.88507646, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.47680664, + "step": 1667, + "time_per_iteration": 2.5686376094818115 + }, + { + "auxiliary_loss_clip": 0.06714002, + "auxiliary_loss_mlp": 0.01313023, + "balance_loss_clip": 0.06328776, + "balance_loss_mlp": 0.01272492, + "epoch": 0.10028558545017285, + "flos": 27092525944320.0, + "grad_norm": 3.3659339438704357, + "language_loss": 0.79832667, + "learning_rate": 3.948403260744817e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40551758, + "step": 1668, + "time_per_iteration": 2.5726866722106934 + }, + { + "auxiliary_loss_clip": 0.0670673, + "auxiliary_loss_mlp": 0.013093, + "balance_loss_clip": 0.06318925, + "balance_loss_mlp": 0.01268101, + "epoch": 0.10034570870284082, + "flos": 25854003544320.0, + "grad_norm": 2.568927800509246, + "language_loss": 0.79338908, + "learning_rate": 3.948315330332031e-06, + "loss": 0.87354934, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.41235352, + "step": 1669, + "time_per_iteration": 2.6188042163848877 + }, + { + "auxiliary_loss_clip": 0.06725293, + "auxiliary_loss_mlp": 0.0130808, + "balance_loss_clip": 0.06329028, + "balance_loss_mlp": 0.01264497, + "epoch": 0.1004058319555088, + "flos": 26256707066880.0, + "grad_norm": 15.895164476932296, + "language_loss": 0.87389982, + "learning_rate": 3.948227326038933e-06, + "loss": 0.95423353, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.43579102, + "step": 1670, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.06691795, + "auxiliary_loss_mlp": 0.01298769, + "balance_loss_clip": 0.06322314, + "balance_loss_mlp": 0.0126098, + "epoch": 0.10046595520817676, + "flos": 25381545897600.0, + "grad_norm": 1.8967452212827218, + "language_loss": 0.7865597, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.86646533, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.37817383, + "step": 1671, + "time_per_iteration": 2.6737799644470215 + }, + { + "auxiliary_loss_clip": 0.06549042, + "auxiliary_loss_mlp": 0.01335852, + "balance_loss_clip": 0.06305933, + "balance_loss_mlp": 0.01310293, + "epoch": 0.10052607846084473, + "flos": 67479146398080.0, + "grad_norm": 0.7871321089675286, + "language_loss": 0.60865933, + "learning_rate": 3.948051095825149e-06, + "loss": 0.68750823, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25585938, + "step": 1672, + "time_per_iteration": 3.1528263092041016 + }, + { + "auxiliary_loss_clip": 0.06706591, + "auxiliary_loss_mlp": 0.01299319, + "balance_loss_clip": 0.06322384, + "balance_loss_mlp": 0.01258406, + "epoch": 0.10058620171351271, + "flos": 21366795294720.0, + "grad_norm": 25.353895208902486, + "language_loss": 0.78260916, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86266828, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.40917969, + "step": 1673, + "time_per_iteration": 2.548840045928955 + }, + { + "auxiliary_loss_clip": 0.06713213, + "auxiliary_loss_mlp": 0.01301927, + "balance_loss_clip": 0.06326719, + "balance_loss_mlp": 0.01261419, + "epoch": 0.10064632496618067, + "flos": 16805724071040.0, + "grad_norm": 3.2623460746575867, + "language_loss": 0.75444734, + "learning_rate": 3.947874570130197e-06, + "loss": 0.83459872, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.4050293, + "step": 1674, + "time_per_iteration": 3.9417338371276855 + }, + { + "auxiliary_loss_clip": 0.06701215, + "auxiliary_loss_mlp": 0.01303034, + "balance_loss_clip": 0.0631593, + "balance_loss_mlp": 0.01264124, + "epoch": 0.10070644821884864, + "flos": 23631433194240.0, + "grad_norm": 2.3845334341515905, + "language_loss": 0.80716002, + "learning_rate": 3.947786196485649e-06, + "loss": 0.88720256, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.38891602, + "step": 1675, + "time_per_iteration": 2.6035287380218506 + }, + { + "auxiliary_loss_clip": 0.06711227, + "auxiliary_loss_mlp": 0.01308342, + "balance_loss_clip": 0.06320765, + "balance_loss_mlp": 0.01266404, + "epoch": 0.1007665714715166, + "flos": 24469516131840.0, + "grad_norm": 3.2401043480386122, + "language_loss": 0.82723379, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90742946, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.41943359, + "step": 1676, + "time_per_iteration": 4.029613256454468 + }, + { + "auxiliary_loss_clip": 0.06714617, + "auxiliary_loss_mlp": 0.01315911, + "balance_loss_clip": 0.0632771, + "balance_loss_mlp": 0.0127476, + "epoch": 0.10082669472418458, + "flos": 16804550113920.0, + "grad_norm": 2.3128991920650295, + "language_loss": 0.87477523, + "learning_rate": 3.947609227619163e-06, + "loss": 0.95508051, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.41113281, + "step": 1677, + "time_per_iteration": 2.593122720718384 + }, + { + "auxiliary_loss_clip": 0.06712872, + "auxiliary_loss_mlp": 0.01323048, + "balance_loss_clip": 0.06321359, + "balance_loss_mlp": 0.01280586, + "epoch": 0.10088681797685255, + "flos": 13558673376000.0, + "grad_norm": 2.3885344519990017, + "language_loss": 0.87886804, + "learning_rate": 3.947520632403936e-06, + "loss": 0.9592272, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.42480469, + "step": 1678, + "time_per_iteration": 4.02148962020874 + }, + { + "auxiliary_loss_clip": 0.06711318, + "auxiliary_loss_mlp": 0.01321227, + "balance_loss_clip": 0.06328011, + "balance_loss_mlp": 0.01282985, + "epoch": 0.10094694122952051, + "flos": 25272868752000.0, + "grad_norm": 13.556620814946344, + "language_loss": 0.91124773, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99157315, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.38256836, + "step": 1679, + "time_per_iteration": 2.593204975128174 + }, + { + "auxiliary_loss_clip": 0.06551328, + "auxiliary_loss_mlp": 0.01270219, + "balance_loss_clip": 0.06307815, + "balance_loss_mlp": 0.01249143, + "epoch": 0.10100706448218849, + "flos": 69875521315200.0, + "grad_norm": 0.8658555731993547, + "language_loss": 0.53157437, + "learning_rate": 3.947343220426312e-06, + "loss": 0.60978985, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2109375, + "step": 1680, + "time_per_iteration": 4.680401802062988 + }, + { + "auxiliary_loss_clip": 0.06706315, + "auxiliary_loss_mlp": 0.01330393, + "balance_loss_clip": 0.06326837, + "balance_loss_mlp": 0.0129103, + "epoch": 0.10106718773485646, + "flos": 20012677787520.0, + "grad_norm": 2.2086252291478403, + "language_loss": 0.78363287, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86399996, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.39331055, + "step": 1681, + "time_per_iteration": 2.5842180252075195 + }, + { + "auxiliary_loss_clip": 0.06727763, + "auxiliary_loss_mlp": 0.0133733, + "balance_loss_clip": 0.06334171, + "balance_loss_mlp": 0.01293271, + "epoch": 0.10112731098752442, + "flos": 13484852328960.0, + "grad_norm": 2.7825426019965707, + "language_loss": 0.9580273, + "learning_rate": 3.947165513074889e-06, + "loss": 1.03867817, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.44067383, + "step": 1682, + "time_per_iteration": 2.5091476440429688 + }, + { + "auxiliary_loss_clip": 0.06722884, + "auxiliary_loss_mlp": 0.01333979, + "balance_loss_clip": 0.06334428, + "balance_loss_mlp": 0.01291803, + "epoch": 0.1011874342401924, + "flos": 18521944997760.0, + "grad_norm": 4.013093374062749, + "language_loss": 0.88974559, + "learning_rate": 3.947076548642425e-06, + "loss": 0.97031426, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.421875, + "step": 1683, + "time_per_iteration": 2.583263635635376 + }, + { + "auxiliary_loss_clip": 0.0671032, + "auxiliary_loss_mlp": 0.01319793, + "balance_loss_clip": 0.06327897, + "balance_loss_mlp": 0.0128074, + "epoch": 0.10124755749286037, + "flos": 20708904562560.0, + "grad_norm": 3.51695946667963, + "language_loss": 0.76482016, + "learning_rate": 3.946987510376624e-06, + "loss": 0.84512126, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.390625, + "step": 1684, + "time_per_iteration": 2.5566201210021973 + }, + { + "auxiliary_loss_clip": 0.06545618, + "auxiliary_loss_mlp": 0.01270157, + "balance_loss_clip": 0.06304231, + "balance_loss_mlp": 0.01252085, + "epoch": 0.10130768074552833, + "flos": 56130100387200.0, + "grad_norm": 0.7359306974182547, + "language_loss": 0.6108619, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.68901968, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.1809082, + "step": 1685, + "time_per_iteration": 3.2871286869049072 + }, + { + "auxiliary_loss_clip": 0.06715102, + "auxiliary_loss_mlp": 0.01314643, + "balance_loss_clip": 0.06328554, + "balance_loss_mlp": 0.01273612, + "epoch": 0.1013678039981963, + "flos": 33410921322240.0, + "grad_norm": 2.782312478618552, + "language_loss": 0.61882973, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6991272, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.41064453, + "step": 1686, + "time_per_iteration": 2.6534583568573 + }, + { + "auxiliary_loss_clip": 0.0670934, + "auxiliary_loss_mlp": 0.01311437, + "balance_loss_clip": 0.0633449, + "balance_loss_mlp": 0.01272622, + "epoch": 0.10142792725086427, + "flos": 31913480206080.0, + "grad_norm": 4.585581221965215, + "language_loss": 0.8288697, + "learning_rate": 3.946719952612972e-06, + "loss": 0.90907753, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.38793945, + "step": 1687, + "time_per_iteration": 2.6766278743743896 + }, + { + "auxiliary_loss_clip": 0.06718412, + "auxiliary_loss_mlp": 0.0131249, + "balance_loss_clip": 0.06331126, + "balance_loss_mlp": 0.01271601, + "epoch": 0.10148805050353224, + "flos": 28483512048000.0, + "grad_norm": 2.9352499009147386, + "language_loss": 0.73686063, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81716961, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.40917969, + "step": 1688, + "time_per_iteration": 2.625084400177002 + }, + { + "auxiliary_loss_clip": 0.06704164, + "auxiliary_loss_mlp": 0.01310415, + "balance_loss_clip": 0.06328401, + "balance_loss_mlp": 0.01267905, + "epoch": 0.1015481737562002, + "flos": 25154799949440.0, + "grad_norm": 2.7691275113498293, + "language_loss": 0.88195848, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9621042, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.42480469, + "step": 1689, + "time_per_iteration": 2.595492362976074 + }, + { + "auxiliary_loss_clip": 0.06716056, + "auxiliary_loss_mlp": 0.01310716, + "balance_loss_clip": 0.06332745, + "balance_loss_mlp": 0.01270328, + "epoch": 0.10160829700886818, + "flos": 30890593088640.0, + "grad_norm": 2.202394662859946, + "language_loss": 0.89776945, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97803724, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.40429688, + "step": 1690, + "time_per_iteration": 2.6406383514404297 + }, + { + "auxiliary_loss_clip": 0.06720668, + "auxiliary_loss_mlp": 0.01309465, + "balance_loss_clip": 0.06337205, + "balance_loss_mlp": 0.01267932, + "epoch": 0.10166842026153615, + "flos": 20418190421760.0, + "grad_norm": 2.5850789066585595, + "language_loss": 0.85274917, + "learning_rate": 3.946362175466521e-06, + "loss": 0.93305051, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4152832, + "step": 1691, + "time_per_iteration": 2.6336474418640137 + }, + { + "auxiliary_loss_clip": 0.06720576, + "auxiliary_loss_mlp": 0.01308382, + "balance_loss_clip": 0.06329723, + "balance_loss_mlp": 0.01266039, + "epoch": 0.10172854351420411, + "flos": 33485832472320.0, + "grad_norm": 1.9210168222319979, + "language_loss": 0.67985535, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76014495, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.4230957, + "step": 1692, + "time_per_iteration": 2.7298569679260254 + }, + { + "auxiliary_loss_clip": 0.0670909, + "auxiliary_loss_mlp": 0.01313275, + "balance_loss_clip": 0.06329532, + "balance_loss_mlp": 0.01271933, + "epoch": 0.1017886667668721, + "flos": 23557109022720.0, + "grad_norm": 2.364359015626866, + "language_loss": 0.77791357, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41333008, + "step": 1693, + "time_per_iteration": 2.5772159099578857 + }, + { + "auxiliary_loss_clip": 0.06718149, + "auxiliary_loss_mlp": 0.01308582, + "balance_loss_clip": 0.06332842, + "balance_loss_mlp": 0.01267813, + "epoch": 0.10184879001954006, + "flos": 23303011916160.0, + "grad_norm": 1.7868831519316952, + "language_loss": 0.88559091, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.96585822, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.4074707, + "step": 1694, + "time_per_iteration": 2.6128172874450684 + }, + { + "auxiliary_loss_clip": 0.06728393, + "auxiliary_loss_mlp": 0.01308189, + "balance_loss_clip": 0.06335086, + "balance_loss_mlp": 0.01266681, + "epoch": 0.10190891327220802, + "flos": 18339069461760.0, + "grad_norm": 12.701803193315635, + "language_loss": 0.81483626, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.41503906, + "step": 1695, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.06719907, + "auxiliary_loss_mlp": 0.01309327, + "balance_loss_clip": 0.06335149, + "balance_loss_mlp": 0.01268152, + "epoch": 0.10196903652487599, + "flos": 26472006933120.0, + "grad_norm": 2.5208321376903173, + "language_loss": 0.87899506, + "learning_rate": 3.945913293418447e-06, + "loss": 0.95928741, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41186523, + "step": 1696, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.067072, + "auxiliary_loss_mlp": 0.01308456, + "balance_loss_clip": 0.06329801, + "balance_loss_mlp": 0.01268545, + "epoch": 0.10202915977754397, + "flos": 21875618413440.0, + "grad_norm": 1.9807901580601361, + "language_loss": 0.83342528, + "learning_rate": 3.945823295627519e-06, + "loss": 0.91358191, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.39916992, + "step": 1697, + "time_per_iteration": 2.5826144218444824 + }, + { + "auxiliary_loss_clip": 0.06717139, + "auxiliary_loss_mlp": 0.01309728, + "balance_loss_clip": 0.06333424, + "balance_loss_mlp": 0.01268339, + "epoch": 0.10208928303021193, + "flos": 22316322562560.0, + "grad_norm": 4.080073154744023, + "language_loss": 0.82607067, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.90633935, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.4140625, + "step": 1698, + "time_per_iteration": 2.6105751991271973 + }, + { + "auxiliary_loss_clip": 0.06711876, + "auxiliary_loss_mlp": 0.01312643, + "balance_loss_clip": 0.06331024, + "balance_loss_mlp": 0.01272541, + "epoch": 0.1021494062828799, + "flos": 22131811872000.0, + "grad_norm": 3.7730678992984594, + "language_loss": 0.78052682, + "learning_rate": 3.945643078691637e-06, + "loss": 0.86077201, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.40112305, + "step": 1699, + "time_per_iteration": 2.554769515991211 + }, + { + "auxiliary_loss_clip": 0.06706256, + "auxiliary_loss_mlp": 0.01310666, + "balance_loss_clip": 0.06325917, + "balance_loss_mlp": 0.01269253, + "epoch": 0.10220952953554788, + "flos": 19652922282240.0, + "grad_norm": 2.595218153740113, + "language_loss": 0.81135154, + "learning_rate": 3.945552859553516e-06, + "loss": 0.89152074, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.41430664, + "step": 1700, + "time_per_iteration": 2.6276824474334717 + }, + { + "auxiliary_loss_clip": 0.06713387, + "auxiliary_loss_mlp": 0.01308957, + "balance_loss_clip": 0.06330973, + "balance_loss_mlp": 0.01269284, + "epoch": 0.10226965278821584, + "flos": 29794765392000.0, + "grad_norm": 1.915620858004171, + "language_loss": 0.78195202, + "learning_rate": 3.945462566639836e-06, + "loss": 0.86217546, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.39697266, + "step": 1701, + "time_per_iteration": 2.6159350872039795 + }, + { + "auxiliary_loss_clip": 0.06729369, + "auxiliary_loss_mlp": 0.01324821, + "balance_loss_clip": 0.06331599, + "balance_loss_mlp": 0.01279617, + "epoch": 0.10232977604088381, + "flos": 27024239266560.0, + "grad_norm": 2.5261274720011473, + "language_loss": 0.79135132, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87189317, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.4519043, + "step": 1702, + "time_per_iteration": 2.629913806915283 + }, + { + "auxiliary_loss_clip": 0.06706569, + "auxiliary_loss_mlp": 0.01317465, + "balance_loss_clip": 0.06326532, + "balance_loss_mlp": 0.01277983, + "epoch": 0.10238989929355179, + "flos": 20783857639680.0, + "grad_norm": 2.3222724065629494, + "language_loss": 0.95639896, + "learning_rate": 3.945281759499494e-06, + "loss": 1.03663921, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.39501953, + "step": 1703, + "time_per_iteration": 2.601848840713501 + }, + { + "auxiliary_loss_clip": 0.06547229, + "auxiliary_loss_mlp": 0.01318477, + "balance_loss_clip": 0.06308849, + "balance_loss_mlp": 0.01299118, + "epoch": 0.10245002254621975, + "flos": 57716471013120.0, + "grad_norm": 0.8331319138238726, + "language_loss": 0.55242068, + "learning_rate": 3.94519124527969e-06, + "loss": 0.63107777, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.19335938, + "step": 1704, + "time_per_iteration": 3.1248717308044434 + }, + { + "auxiliary_loss_clip": 0.06706051, + "auxiliary_loss_mlp": 0.01308758, + "balance_loss_clip": 0.06321411, + "balance_loss_mlp": 0.0126775, + "epoch": 0.10251014579888772, + "flos": 16805724071040.0, + "grad_norm": 2.30707717904525, + "language_loss": 0.8659755, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94612348, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.41015625, + "step": 1705, + "time_per_iteration": 2.5850555896759033 + }, + { + "auxiliary_loss_clip": 0.06541149, + "auxiliary_loss_mlp": 0.01304681, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01286478, + "epoch": 0.1025702690515557, + "flos": 68584533459840.0, + "grad_norm": 0.7436655566620352, + "language_loss": 0.60505682, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.68351519, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.18212891, + "step": 1706, + "time_per_iteration": 3.239501953125 + }, + { + "auxiliary_loss_clip": 0.06703549, + "auxiliary_loss_mlp": 0.01305907, + "balance_loss_clip": 0.0632052, + "balance_loss_mlp": 0.01262729, + "epoch": 0.10263039230422366, + "flos": 14871939217920.0, + "grad_norm": 2.8485004441458637, + "language_loss": 0.88280994, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96290451, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.43188477, + "step": 1707, + "time_per_iteration": 2.6399993896484375 + }, + { + "auxiliary_loss_clip": 0.0669533, + "auxiliary_loss_mlp": 0.01302799, + "balance_loss_clip": 0.06323209, + "balance_loss_mlp": 0.01262887, + "epoch": 0.10269051555689163, + "flos": 25344593447040.0, + "grad_norm": 2.5980108077369604, + "language_loss": 0.74784869, + "learning_rate": 3.944828450816369e-06, + "loss": 0.82783002, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.39892578, + "step": 1708, + "time_per_iteration": 2.654852867126465 + }, + { + "auxiliary_loss_clip": 0.06703041, + "auxiliary_loss_mlp": 0.01305178, + "balance_loss_clip": 0.06323138, + "balance_loss_mlp": 0.01263049, + "epoch": 0.10275063880955959, + "flos": 21075116832000.0, + "grad_norm": 2.060667127210552, + "language_loss": 0.92398179, + "learning_rate": 3.944737567821709e-06, + "loss": 1.00406396, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.42114258, + "step": 1709, + "time_per_iteration": 2.573854446411133 + }, + { + "auxiliary_loss_clip": 0.06702737, + "auxiliary_loss_mlp": 0.01298282, + "balance_loss_clip": 0.06322797, + "balance_loss_mlp": 0.01257703, + "epoch": 0.10281076206222757, + "flos": 30373636124160.0, + "grad_norm": 12.814317235362356, + "language_loss": 0.90276158, + "learning_rate": 3.944646611082406e-06, + "loss": 0.98277175, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.40551758, + "step": 1710, + "time_per_iteration": 2.6228139400482178 + }, + { + "auxiliary_loss_clip": 0.06701953, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 0.06325494, + "balance_loss_mlp": 0.01263096, + "epoch": 0.10287088531489554, + "flos": 22424748145920.0, + "grad_norm": 2.0240875797159554, + "language_loss": 0.80754149, + "learning_rate": 3.944555580601908e-06, + "loss": 0.88761282, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.42089844, + "step": 1711, + "time_per_iteration": 2.583343982696533 + }, + { + "auxiliary_loss_clip": 0.06708579, + "auxiliary_loss_mlp": 0.01306816, + "balance_loss_clip": 0.06325286, + "balance_loss_mlp": 0.01263447, + "epoch": 0.1029310085675635, + "flos": 25122501400320.0, + "grad_norm": 2.3794944473216684, + "language_loss": 0.74649823, + "learning_rate": 3.944464476383668e-06, + "loss": 0.82665217, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1712, + "time_per_iteration": 2.571152687072754 + }, + { + "auxiliary_loss_clip": 0.06692443, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.0632696, + "balance_loss_mlp": 0.01265911, + "epoch": 0.10299113182023148, + "flos": 19871869800960.0, + "grad_norm": 3.881117444097493, + "language_loss": 0.88232982, + "learning_rate": 3.94437329843114e-06, + "loss": 0.96230507, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.3918457, + "step": 1713, + "time_per_iteration": 4.005250453948975 + }, + { + "auxiliary_loss_clip": 0.06698017, + "auxiliary_loss_mlp": 0.01309494, + "balance_loss_clip": 0.06326848, + "balance_loss_mlp": 0.0126789, + "epoch": 0.10305125507289944, + "flos": 20453633498880.0, + "grad_norm": 1.7755930908575366, + "language_loss": 0.74034607, + "learning_rate": 3.944282046747782e-06, + "loss": 0.82042122, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.41601562, + "step": 1714, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.06718543, + "auxiliary_loss_mlp": 0.01323459, + "balance_loss_clip": 0.06333546, + "balance_loss_mlp": 0.01278446, + "epoch": 0.10311137832556741, + "flos": 26258090659200.0, + "grad_norm": 2.9350503756017425, + "language_loss": 0.92344153, + "learning_rate": 3.944190721337053e-06, + "loss": 1.00386155, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.45043945, + "step": 1715, + "time_per_iteration": 4.0185253620147705 + }, + { + "auxiliary_loss_clip": 0.06704861, + "auxiliary_loss_mlp": 0.01311537, + "balance_loss_clip": 0.06330159, + "balance_loss_mlp": 0.01269957, + "epoch": 0.10317150157823539, + "flos": 35307711797760.0, + "grad_norm": 2.2230189858401834, + "language_loss": 0.77534348, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85550749, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.41577148, + "step": 1716, + "time_per_iteration": 2.6924543380737305 + }, + { + "auxiliary_loss_clip": 0.06704281, + "auxiliary_loss_mlp": 0.01322549, + "balance_loss_clip": 0.06326932, + "balance_loss_mlp": 0.01278037, + "epoch": 0.10323162483090335, + "flos": 25747171188480.0, + "grad_norm": 4.647251493858166, + "language_loss": 0.87329108, + "learning_rate": 3.944007849347342e-06, + "loss": 0.9535594, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.44506836, + "step": 1717, + "time_per_iteration": 2.5771939754486084 + }, + { + "auxiliary_loss_clip": 0.06709914, + "auxiliary_loss_mlp": 0.01337871, + "balance_loss_clip": 0.06322803, + "balance_loss_mlp": 0.0129393, + "epoch": 0.10329174808357132, + "flos": 16295475432960.0, + "grad_norm": 2.5245058321168297, + "language_loss": 0.84142077, + "learning_rate": 3.943916302775292e-06, + "loss": 0.9218986, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43945312, + "step": 1718, + "time_per_iteration": 3.9576940536499023 + }, + { + "auxiliary_loss_clip": 0.06693481, + "auxiliary_loss_mlp": 0.01328919, + "balance_loss_clip": 0.06322589, + "balance_loss_mlp": 0.01288626, + "epoch": 0.10335187133623928, + "flos": 36696475768320.0, + "grad_norm": 4.723677538171457, + "language_loss": 0.75181365, + "learning_rate": 3.943824682489742e-06, + "loss": 0.83203769, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.40283203, + "step": 1719, + "time_per_iteration": 4.132940769195557 + }, + { + "auxiliary_loss_clip": 0.06689329, + "auxiliary_loss_mlp": 0.01317642, + "balance_loss_clip": 0.06317558, + "balance_loss_mlp": 0.01278064, + "epoch": 0.10341199458890726, + "flos": 14980909852800.0, + "grad_norm": 1.9928809485399477, + "language_loss": 0.94301736, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.02308702, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.39575195, + "step": 1720, + "time_per_iteration": 2.53070068359375 + }, + { + "auxiliary_loss_clip": 0.06693915, + "auxiliary_loss_mlp": 0.01322313, + "balance_loss_clip": 0.06316631, + "balance_loss_mlp": 0.0127811, + "epoch": 0.10347211784157523, + "flos": 21037745111040.0, + "grad_norm": 2.2577738133608944, + "language_loss": 0.80850732, + "learning_rate": 3.943641220792039e-06, + "loss": 0.88866961, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44213867, + "step": 1721, + "time_per_iteration": 2.6165122985839844 + }, + { + "auxiliary_loss_clip": 0.06711201, + "auxiliary_loss_mlp": 0.01332384, + "balance_loss_clip": 0.06324577, + "balance_loss_mlp": 0.01286345, + "epoch": 0.1035322410942432, + "flos": 19798216462080.0, + "grad_norm": 2.2916288774806137, + "language_loss": 0.81885946, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.89929533, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.46044922, + "step": 1722, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 0.06313527, + "balance_loss_mlp": 0.01272635, + "epoch": 0.10359236434691117, + "flos": 52716037305600.0, + "grad_norm": 0.9610809671594381, + "language_loss": 0.66722119, + "learning_rate": 3.943457464282059e-06, + "loss": 0.74559999, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.17883301, + "step": 1723, + "time_per_iteration": 2.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.0669903, + "auxiliary_loss_mlp": 0.01310212, + "balance_loss_clip": 0.06318312, + "balance_loss_mlp": 0.01267582, + "epoch": 0.10365248759957914, + "flos": 18411255354240.0, + "grad_norm": 3.390195963482514, + "language_loss": 0.78785694, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.86794937, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.42651367, + "step": 1724, + "time_per_iteration": 2.587998151779175 + }, + { + "auxiliary_loss_clip": 0.06701188, + "auxiliary_loss_mlp": 0.01310671, + "balance_loss_clip": 0.06321733, + "balance_loss_mlp": 0.01269663, + "epoch": 0.1037126108522471, + "flos": 47563615820160.0, + "grad_norm": 2.288753840195378, + "language_loss": 0.76223904, + "learning_rate": 3.943273412987676e-06, + "loss": 0.84235764, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.41015625, + "step": 1725, + "time_per_iteration": 2.7683663368225098 + }, + { + "auxiliary_loss_clip": 0.06675334, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 0.06309348, + "balance_loss_mlp": 0.01258041, + "epoch": 0.10377273410491508, + "flos": 22822671985920.0, + "grad_norm": 2.2764288322332265, + "language_loss": 0.76062018, + "learning_rate": 3.943181276805054e-06, + "loss": 0.84036231, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40869141, + "step": 1726, + "time_per_iteration": 2.587892770767212 + }, + { + "auxiliary_loss_clip": 0.06701919, + "auxiliary_loss_mlp": 0.01307243, + "balance_loss_clip": 0.0631658, + "balance_loss_mlp": 0.0126316, + "epoch": 0.10383285735758305, + "flos": 26145556225920.0, + "grad_norm": 2.697441848061202, + "language_loss": 0.76235563, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84244722, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.44042969, + "step": 1727, + "time_per_iteration": 2.6308248043060303 + }, + { + "auxiliary_loss_clip": 0.06691539, + "auxiliary_loss_mlp": 0.0130793, + "balance_loss_clip": 0.0631765, + "balance_loss_mlp": 0.01265277, + "epoch": 0.10389298061025101, + "flos": 17097402533760.0, + "grad_norm": 2.4502843901442315, + "language_loss": 0.86415958, + "learning_rate": 3.942996783386422e-06, + "loss": 0.94415426, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.42675781, + "step": 1728, + "time_per_iteration": 2.5618197917938232 + }, + { + "auxiliary_loss_clip": 0.06685561, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 0.06312057, + "balance_loss_mlp": 0.01259484, + "epoch": 0.10395310386291898, + "flos": 20782683682560.0, + "grad_norm": 2.0546311064170726, + "language_loss": 0.71406788, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79394507, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42675781, + "step": 1729, + "time_per_iteration": 2.5618793964385986 + }, + { + "auxiliary_loss_clip": 0.06693864, + "auxiliary_loss_mlp": 0.01305753, + "balance_loss_clip": 0.06314608, + "balance_loss_mlp": 0.01260954, + "epoch": 0.10401322711558696, + "flos": 12825032952960.0, + "grad_norm": 2.8841772006205617, + "language_loss": 0.83575559, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.91575181, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44775391, + "step": 1730, + "time_per_iteration": 2.623878002166748 + }, + { + "auxiliary_loss_clip": 0.06680113, + "auxiliary_loss_mlp": 0.01302214, + "balance_loss_clip": 0.06313114, + "balance_loss_mlp": 0.01260681, + "epoch": 0.10407335036825492, + "flos": 23191274096640.0, + "grad_norm": 1.835927341089653, + "language_loss": 0.77408624, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85390949, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4152832, + "step": 1731, + "time_per_iteration": 2.5633392333984375 + }, + { + "auxiliary_loss_clip": 0.0668644, + "auxiliary_loss_mlp": 0.01313118, + "balance_loss_clip": 0.0632073, + "balance_loss_mlp": 0.01273159, + "epoch": 0.10413347362092289, + "flos": 26111370960000.0, + "grad_norm": 1.90471773366097, + "language_loss": 0.84198594, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92198151, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39941406, + "step": 1732, + "time_per_iteration": 2.6176345348358154 + }, + { + "auxiliary_loss_clip": 0.06683554, + "auxiliary_loss_mlp": 0.01314534, + "balance_loss_clip": 0.06312263, + "balance_loss_mlp": 0.01271905, + "epoch": 0.10419359687359087, + "flos": 12646014704640.0, + "grad_norm": 2.549467420686237, + "language_loss": 0.8515988, + "learning_rate": 3.942534260525104e-06, + "loss": 0.93157971, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.42626953, + "step": 1733, + "time_per_iteration": 2.529829978942871 + }, + { + "auxiliary_loss_clip": 0.06699164, + "auxiliary_loss_mlp": 0.01313294, + "balance_loss_clip": 0.06323372, + "balance_loss_mlp": 0.01269139, + "epoch": 0.10425372012625883, + "flos": 12129099667200.0, + "grad_norm": 4.348408719624472, + "language_loss": 0.78445566, + "learning_rate": 3.942441534955514e-06, + "loss": 0.86458015, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44165039, + "step": 1734, + "time_per_iteration": 2.5436649322509766 + }, + { + "auxiliary_loss_clip": 0.06683113, + "auxiliary_loss_mlp": 0.01310658, + "balance_loss_clip": 0.06320634, + "balance_loss_mlp": 0.01270937, + "epoch": 0.1043138433789268, + "flos": 25344551520000.0, + "grad_norm": 1.8276863047745044, + "language_loss": 0.76546466, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.84540236, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3972168, + "step": 1735, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.06697765, + "auxiliary_loss_mlp": 0.01313856, + "balance_loss_clip": 0.06318491, + "balance_loss_mlp": 0.01269438, + "epoch": 0.10437396663159478, + "flos": 29174539870080.0, + "grad_norm": 2.0479038136948735, + "language_loss": 0.80253965, + "learning_rate": 3.94225586284712e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.44360352, + "step": 1736, + "time_per_iteration": 2.6438446044921875 + }, + { + "auxiliary_loss_clip": 0.06694648, + "auxiliary_loss_mlp": 0.01312039, + "balance_loss_clip": 0.06322388, + "balance_loss_mlp": 0.01269267, + "epoch": 0.10443408988426274, + "flos": 25087687228800.0, + "grad_norm": 4.638523885209388, + "language_loss": 0.71961701, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79968387, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.42773438, + "step": 1737, + "time_per_iteration": 2.5947039127349854 + }, + { + "auxiliary_loss_clip": 0.06704547, + "auxiliary_loss_mlp": 0.01309535, + "balance_loss_clip": 0.06322168, + "balance_loss_mlp": 0.01263305, + "epoch": 0.1044942131369307, + "flos": 26766746069760.0, + "grad_norm": 2.5677527060209715, + "language_loss": 0.83228981, + "learning_rate": 3.942069896136581e-06, + "loss": 0.91243058, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.46191406, + "step": 1738, + "time_per_iteration": 2.615252733230591 + }, + { + "auxiliary_loss_clip": 0.06695886, + "auxiliary_loss_mlp": 0.01310975, + "balance_loss_clip": 0.06315427, + "balance_loss_mlp": 0.01265747, + "epoch": 0.10455433638959867, + "flos": 18448543221120.0, + "grad_norm": 2.179337588406841, + "language_loss": 0.76366144, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.84373009, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.45239258, + "step": 1739, + "time_per_iteration": 2.5386781692504883 + }, + { + "auxiliary_loss_clip": 0.06684839, + "auxiliary_loss_mlp": 0.01316183, + "balance_loss_clip": 0.06310752, + "balance_loss_mlp": 0.01271456, + "epoch": 0.10461445964226665, + "flos": 23225207800320.0, + "grad_norm": 1.9549702888486553, + "language_loss": 0.7847473, + "learning_rate": 3.941883634852104e-06, + "loss": 0.86475754, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.44775391, + "step": 1740, + "time_per_iteration": 2.6215531826019287 + }, + { + "auxiliary_loss_clip": 0.06687017, + "auxiliary_loss_mlp": 0.01315844, + "balance_loss_clip": 0.06320937, + "balance_loss_mlp": 0.01273953, + "epoch": 0.10467458289493461, + "flos": 24350860350720.0, + "grad_norm": 2.5281783737696246, + "language_loss": 0.86859214, + "learning_rate": 3.941790393753467e-06, + "loss": 0.94862068, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41894531, + "step": 1741, + "time_per_iteration": 2.5947859287261963 + }, + { + "auxiliary_loss_clip": 0.06689818, + "auxiliary_loss_mlp": 0.01306432, + "balance_loss_clip": 0.06307445, + "balance_loss_mlp": 0.01259201, + "epoch": 0.10473470614760258, + "flos": 21294315912960.0, + "grad_norm": 3.2114625668667367, + "language_loss": 0.76732343, + "learning_rate": 3.941697079021942e-06, + "loss": 0.84728593, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47265625, + "step": 1742, + "time_per_iteration": 2.5832579135894775 + }, + { + "auxiliary_loss_clip": 0.06678567, + "auxiliary_loss_mlp": 0.01303781, + "balance_loss_clip": 0.06306475, + "balance_loss_mlp": 0.01260628, + "epoch": 0.10479482940027056, + "flos": 21693287928960.0, + "grad_norm": 9.553870000179, + "language_loss": 0.89069176, + "learning_rate": 3.94160369066107e-06, + "loss": 0.97051525, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43164062, + "step": 1743, + "time_per_iteration": 2.5764474868774414 + }, + { + "auxiliary_loss_clip": 0.06671779, + "auxiliary_loss_mlp": 0.01307955, + "balance_loss_clip": 0.06307401, + "balance_loss_mlp": 0.01264801, + "epoch": 0.10485495265293852, + "flos": 21579076414080.0, + "grad_norm": 2.2332748103162907, + "language_loss": 0.77711093, + "learning_rate": 3.941510228674391e-06, + "loss": 0.8569082, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43164062, + "step": 1744, + "time_per_iteration": 2.5712687969207764 + }, + { + "auxiliary_loss_clip": 0.06674588, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 0.06307609, + "balance_loss_mlp": 0.01270685, + "epoch": 0.10491507590560649, + "flos": 37971070151040.0, + "grad_norm": 4.071178521090377, + "language_loss": 0.81752264, + "learning_rate": 3.941416693065451e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40332031, + "step": 1745, + "time_per_iteration": 2.7351014614105225 + }, + { + "auxiliary_loss_clip": 0.06685829, + "auxiliary_loss_mlp": 0.01305127, + "balance_loss_clip": 0.0631006, + "balance_loss_mlp": 0.01260472, + "epoch": 0.10497519915827447, + "flos": 26403552547200.0, + "grad_norm": 2.408878958176613, + "language_loss": 0.84535897, + "learning_rate": 3.941323083837794e-06, + "loss": 0.92526853, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.44628906, + "step": 1746, + "time_per_iteration": 2.6103639602661133 + }, + { + "auxiliary_loss_clip": 0.06678679, + "auxiliary_loss_mlp": 0.01312181, + "balance_loss_clip": 0.06308784, + "balance_loss_mlp": 0.01272174, + "epoch": 0.10503532241094243, + "flos": 40671842152320.0, + "grad_norm": 2.4792988701606444, + "language_loss": 0.72187877, + "learning_rate": 3.941229400994971e-06, + "loss": 0.80178738, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.40014648, + "step": 1747, + "time_per_iteration": 2.7907614707946777 + }, + { + "auxiliary_loss_clip": 0.06697921, + "auxiliary_loss_mlp": 0.01310121, + "balance_loss_clip": 0.06312211, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1050954456636104, + "flos": 29797239087360.0, + "grad_norm": 4.268942313212568, + "language_loss": 0.86334866, + "learning_rate": 3.941135644540535e-06, + "loss": 0.94342911, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.45825195, + "step": 1748, + "time_per_iteration": 2.6081960201263428 + }, + { + "auxiliary_loss_clip": 0.06687598, + "auxiliary_loss_mlp": 0.01305718, + "balance_loss_clip": 0.06311792, + "balance_loss_mlp": 0.0126409, + "epoch": 0.10515556891627838, + "flos": 23955116716800.0, + "grad_norm": 1.9464829787737532, + "language_loss": 0.73449892, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.41625977, + "step": 1749, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.06669957, + "auxiliary_loss_mlp": 0.01310674, + "balance_loss_clip": 0.0630856, + "balance_loss_mlp": 0.01270882, + "epoch": 0.10521569216894634, + "flos": 18265458049920.0, + "grad_norm": 3.456638635747079, + "language_loss": 0.84465253, + "learning_rate": 3.940947910811047e-06, + "loss": 0.92445886, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39794922, + "step": 1750, + "time_per_iteration": 2.537736177444458 + }, + { + "auxiliary_loss_clip": 0.06687038, + "auxiliary_loss_mlp": 0.01306152, + "balance_loss_clip": 0.06307652, + "balance_loss_mlp": 0.01264238, + "epoch": 0.10527581542161431, + "flos": 15636033400320.0, + "grad_norm": 3.4228490231822364, + "language_loss": 0.94313812, + "learning_rate": 3.940853933543114e-06, + "loss": 1.0230701, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.41918945, + "step": 1751, + "time_per_iteration": 2.525054931640625 + }, + { + "auxiliary_loss_clip": 0.06674927, + "auxiliary_loss_mlp": 0.01302904, + "balance_loss_clip": 0.06309814, + "balance_loss_mlp": 0.01265686, + "epoch": 0.10533593867428227, + "flos": 18302494354560.0, + "grad_norm": 3.1318677329631757, + "language_loss": 0.8055681, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88534641, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.37207031, + "step": 1752, + "time_per_iteration": 2.61299467086792 + }, + { + "auxiliary_loss_clip": 0.06668897, + "auxiliary_loss_mlp": 0.01309257, + "balance_loss_clip": 0.06304127, + "balance_loss_mlp": 0.01268869, + "epoch": 0.10539606192695025, + "flos": 29030922771840.0, + "grad_norm": 1.9587092194109417, + "language_loss": 0.77260768, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85238922, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.40356445, + "step": 1753, + "time_per_iteration": 3.9985692501068115 + }, + { + "auxiliary_loss_clip": 0.06682716, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 0.06304091, + "balance_loss_mlp": 0.01267, + "epoch": 0.10545618517961822, + "flos": 19974593306880.0, + "grad_norm": 2.3568862676270244, + "language_loss": 0.85363507, + "learning_rate": 3.940571560169328e-06, + "loss": 0.93357253, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.44067383, + "step": 1754, + "time_per_iteration": 2.5938985347747803 + }, + { + "auxiliary_loss_clip": 0.06682456, + "auxiliary_loss_mlp": 0.01316264, + "balance_loss_clip": 0.06304919, + "balance_loss_mlp": 0.012723, + "epoch": 0.10551630843228618, + "flos": 16148923441920.0, + "grad_norm": 4.265882829931168, + "language_loss": 0.71315837, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7931456, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43969727, + "step": 1755, + "time_per_iteration": 3.9860999584198 + }, + { + "auxiliary_loss_clip": 0.06684709, + "auxiliary_loss_mlp": 0.01318348, + "balance_loss_clip": 0.06302933, + "balance_loss_mlp": 0.01273025, + "epoch": 0.10557643168495416, + "flos": 23446754795520.0, + "grad_norm": 2.7157076999837364, + "language_loss": 0.78681093, + "learning_rate": 3.940382943314182e-06, + "loss": 0.86684155, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.453125, + "step": 1756, + "time_per_iteration": 2.616227149963379 + }, + { + "auxiliary_loss_clip": 0.06683522, + "auxiliary_loss_mlp": 0.01310683, + "balance_loss_clip": 0.06306458, + "balance_loss_mlp": 0.0126927, + "epoch": 0.10563655493762213, + "flos": 21805528872960.0, + "grad_norm": 1.8370818155350874, + "language_loss": 0.81619543, + "learning_rate": 3.940288524515547e-06, + "loss": 0.89613748, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.41381836, + "step": 1757, + "time_per_iteration": 2.5410592555999756 + }, + { + "auxiliary_loss_clip": 0.06685489, + "auxiliary_loss_mlp": 0.01318192, + "balance_loss_clip": 0.06307954, + "balance_loss_mlp": 0.01272177, + "epoch": 0.10569667819029009, + "flos": 53813347176960.0, + "grad_norm": 2.270274116106966, + "language_loss": 0.800345, + "learning_rate": 3.940194032140976e-06, + "loss": 0.88038182, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.46020508, + "step": 1758, + "time_per_iteration": 4.229799032211304 + }, + { + "auxiliary_loss_clip": 0.06687906, + "auxiliary_loss_mlp": 0.01314474, + "balance_loss_clip": 0.06312382, + "balance_loss_mlp": 0.01272537, + "epoch": 0.10575680144295807, + "flos": 22931432985600.0, + "grad_norm": 1.92460183667747, + "language_loss": 0.93262696, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01265085, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.41967773, + "step": 1759, + "time_per_iteration": 4.090106248855591 + }, + { + "auxiliary_loss_clip": 0.066918, + "auxiliary_loss_mlp": 0.01305635, + "balance_loss_clip": 0.06315835, + "balance_loss_mlp": 0.01262219, + "epoch": 0.10581692469562604, + "flos": 14141820666240.0, + "grad_norm": 3.0343588084928204, + "language_loss": 0.78992438, + "learning_rate": 3.940004826678365e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.43383789, + "step": 1760, + "time_per_iteration": 2.5582082271575928 + }, + { + "auxiliary_loss_clip": 0.06697676, + "auxiliary_loss_mlp": 0.0131432, + "balance_loss_clip": 0.06312977, + "balance_loss_mlp": 0.01266588, + "epoch": 0.105877047948294, + "flos": 25965909072000.0, + "grad_norm": 2.31808263898244, + "language_loss": 0.91032952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.99044949, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.47729492, + "step": 1761, + "time_per_iteration": 2.5757992267608643 + }, + { + "auxiliary_loss_clip": 0.06676473, + "auxiliary_loss_mlp": 0.01306238, + "balance_loss_clip": 0.06308871, + "balance_loss_mlp": 0.01264229, + "epoch": 0.10593717120096197, + "flos": 30672693745920.0, + "grad_norm": 2.4539135080814862, + "language_loss": 0.79606199, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.42041016, + "step": 1762, + "time_per_iteration": 2.6716315746307373 + }, + { + "auxiliary_loss_clip": 0.06617578, + "auxiliary_loss_mlp": 0.01351391, + "balance_loss_clip": 0.06387473, + "balance_loss_mlp": 0.01331745, + "epoch": 0.10599729445362994, + "flos": 66459347153280.0, + "grad_norm": 0.7549006377741803, + "language_loss": 0.60690284, + "learning_rate": 3.939720466754602e-06, + "loss": 0.68659246, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.19628906, + "step": 1763, + "time_per_iteration": 3.3268401622772217 + }, + { + "auxiliary_loss_clip": 0.06678826, + "auxiliary_loss_mlp": 0.01304205, + "balance_loss_clip": 0.06307326, + "balance_loss_mlp": 0.01263221, + "epoch": 0.10605741770629791, + "flos": 23954445884160.0, + "grad_norm": 2.5468873407149744, + "language_loss": 0.81550586, + "learning_rate": 3.939625532999763e-06, + "loss": 0.89533615, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40991211, + "step": 1764, + "time_per_iteration": 2.6332688331604004 + }, + { + "auxiliary_loss_clip": 0.06680285, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 0.06314, + "balance_loss_mlp": 0.0126359, + "epoch": 0.10611754095896588, + "flos": 19393039244160.0, + "grad_norm": 2.1888720223736384, + "language_loss": 0.81130767, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.89116579, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41943359, + "step": 1765, + "time_per_iteration": 2.5613298416137695 + }, + { + "auxiliary_loss_clip": 0.0667872, + "auxiliary_loss_mlp": 0.01306506, + "balance_loss_clip": 0.06306241, + "balance_loss_mlp": 0.01263328, + "epoch": 0.10617766421163385, + "flos": 22244472086400.0, + "grad_norm": 2.2657345433152853, + "language_loss": 0.78213799, + "learning_rate": 3.939435444841306e-06, + "loss": 0.86199021, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1766, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0668143, + "auxiliary_loss_mlp": 0.01312404, + "balance_loss_clip": 0.06318849, + "balance_loss_mlp": 0.01270705, + "epoch": 0.10623778746430182, + "flos": 28412248550400.0, + "grad_norm": 1.8379569457301719, + "language_loss": 0.78568375, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8656221, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.41674805, + "step": 1767, + "time_per_iteration": 2.6066575050354004 + }, + { + "auxiliary_loss_clip": 0.06566842, + "auxiliary_loss_mlp": 0.01278755, + "balance_loss_clip": 0.06337046, + "balance_loss_mlp": 0.01260039, + "epoch": 0.10629791071696978, + "flos": 64254778231680.0, + "grad_norm": 0.6896173149576642, + "language_loss": 0.57757622, + "learning_rate": 3.939245062508506e-06, + "loss": 0.6560322, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.18688965, + "step": 1768, + "time_per_iteration": 3.3073205947875977 + }, + { + "auxiliary_loss_clip": 0.06681848, + "auxiliary_loss_mlp": 0.01302238, + "balance_loss_clip": 0.06313933, + "balance_loss_mlp": 0.01260634, + "epoch": 0.10635803396963776, + "flos": 22754217600000.0, + "grad_norm": 1.7735238866189138, + "language_loss": 0.88016206, + "learning_rate": 3.939149761035749e-06, + "loss": 0.9600029, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 1769, + "time_per_iteration": 2.59757924079895 + }, + { + "auxiliary_loss_clip": 0.06688489, + "auxiliary_loss_mlp": 0.01307377, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01266035, + "epoch": 0.10641815722230573, + "flos": 31403818546560.0, + "grad_norm": 1.8774824554466385, + "language_loss": 0.62396371, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70392233, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41357422, + "step": 1770, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.06544405, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 0.06314689, + "balance_loss_mlp": 0.01260136, + "epoch": 0.1064782804749737, + "flos": 58567230645120.0, + "grad_norm": 0.8566843095142983, + "language_loss": 0.57127362, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.64949167, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.17285156, + "step": 1771, + "time_per_iteration": 3.075225353240967 + }, + { + "auxiliary_loss_clip": 0.06680871, + "auxiliary_loss_mlp": 0.01316894, + "balance_loss_clip": 0.06314114, + "balance_loss_mlp": 0.01274432, + "epoch": 0.10653840372764166, + "flos": 23994626716800.0, + "grad_norm": 1.9413884947034454, + "language_loss": 0.90273499, + "learning_rate": 3.938863415435429e-06, + "loss": 0.98271263, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.42431641, + "step": 1772, + "time_per_iteration": 2.5640146732330322 + }, + { + "auxiliary_loss_clip": 0.06695, + "auxiliary_loss_mlp": 0.01317722, + "balance_loss_clip": 0.0631227, + "balance_loss_mlp": 0.01272828, + "epoch": 0.10659852698030964, + "flos": 18300272221440.0, + "grad_norm": 4.259637608820723, + "language_loss": 0.78636491, + "learning_rate": 3.93876781985337e-06, + "loss": 0.86649209, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.44824219, + "step": 1773, + "time_per_iteration": 2.528411626815796 + }, + { + "auxiliary_loss_clip": 0.06679896, + "auxiliary_loss_mlp": 0.01313366, + "balance_loss_clip": 0.06312554, + "balance_loss_mlp": 0.01272024, + "epoch": 0.1066586502329776, + "flos": 32168751269760.0, + "grad_norm": 2.123173958110219, + "language_loss": 0.84472597, + "learning_rate": 3.938672150753041e-06, + "loss": 0.92465854, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.41333008, + "step": 1774, + "time_per_iteration": 2.6232900619506836 + }, + { + "auxiliary_loss_clip": 0.06689709, + "auxiliary_loss_mlp": 0.01315484, + "balance_loss_clip": 0.06314571, + "balance_loss_mlp": 0.0127245, + "epoch": 0.10671877348564557, + "flos": 17790904051200.0, + "grad_norm": 3.7633279602301326, + "language_loss": 0.78288794, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.86293983, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 0.43066406, + "step": 1775, + "time_per_iteration": 2.5444161891937256 + }, + { + "auxiliary_loss_clip": 0.06541309, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06314777, + "balance_loss_mlp": 0.0126594, + "epoch": 0.10677889673831355, + "flos": 63531074517120.0, + "grad_norm": 0.8449773894494127, + "language_loss": 0.57561356, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.65384996, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.16394043, + "step": 1776, + "time_per_iteration": 3.194715976715088 + }, + { + "auxiliary_loss_clip": 0.06668387, + "auxiliary_loss_mlp": 0.01308478, + "balance_loss_clip": 0.063052, + "balance_loss_mlp": 0.01266421, + "epoch": 0.10683901999098151, + "flos": 22024182902400.0, + "grad_norm": 4.182030492494299, + "language_loss": 0.84917277, + "learning_rate": 3.938384702378727e-06, + "loss": 0.92894137, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.42041016, + "step": 1777, + "time_per_iteration": 2.595827102661133 + }, + { + "auxiliary_loss_clip": 0.06665277, + "auxiliary_loss_mlp": 0.01305083, + "balance_loss_clip": 0.06308808, + "balance_loss_mlp": 0.01265076, + "epoch": 0.10689914324364948, + "flos": 25049435040000.0, + "grad_norm": 3.105295988575609, + "language_loss": 0.89778632, + "learning_rate": 3.938288739241625e-06, + "loss": 0.97748995, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40014648, + "step": 1778, + "time_per_iteration": 2.5659501552581787 + }, + { + "auxiliary_loss_clip": 0.06673209, + "auxiliary_loss_mlp": 0.0130986, + "balance_loss_clip": 0.06311059, + "balance_loss_mlp": 0.01270068, + "epoch": 0.10695926649631746, + "flos": 16440643831680.0, + "grad_norm": 2.394911901784639, + "language_loss": 0.85383832, + "learning_rate": 3.938192702604417e-06, + "loss": 0.93366897, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.39794922, + "step": 1779, + "time_per_iteration": 2.593081474304199 + }, + { + "auxiliary_loss_clip": 0.06673639, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 0.06310658, + "balance_loss_mlp": 0.01266255, + "epoch": 0.10701938974898542, + "flos": 16984281121920.0, + "grad_norm": 6.263456292034634, + "language_loss": 0.689089, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.76889586, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40844727, + "step": 1780, + "time_per_iteration": 2.5288658142089844 + }, + { + "auxiliary_loss_clip": 0.06670965, + "auxiliary_loss_mlp": 0.01308635, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01267675, + "epoch": 0.10707951300165339, + "flos": 15893568524160.0, + "grad_norm": 2.7813039840033116, + "language_loss": 0.94183797, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02163386, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.40942383, + "step": 1781, + "time_per_iteration": 2.5472099781036377 + }, + { + "auxiliary_loss_clip": 0.06674273, + "auxiliary_loss_mlp": 0.01309874, + "balance_loss_clip": 0.06307364, + "balance_loss_mlp": 0.01267793, + "epoch": 0.10713963625432135, + "flos": 14252510309760.0, + "grad_norm": 2.902551508287184, + "language_loss": 0.80661923, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88646066, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.4206543, + "step": 1782, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.06686161, + "auxiliary_loss_mlp": 0.01310662, + "balance_loss_clip": 0.06313431, + "balance_loss_mlp": 0.01267341, + "epoch": 0.10719975950698933, + "flos": 16761224753280.0, + "grad_norm": 2.870404925374148, + "language_loss": 0.80170923, + "learning_rate": 3.937807821127436e-06, + "loss": 0.88167745, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.43334961, + "step": 1783, + "time_per_iteration": 2.5342109203338623 + }, + { + "auxiliary_loss_clip": 0.06683534, + "auxiliary_loss_mlp": 0.01311834, + "balance_loss_clip": 0.063077, + "balance_loss_mlp": 0.0126818, + "epoch": 0.1072598827596573, + "flos": 22717181295360.0, + "grad_norm": 2.882000106412139, + "language_loss": 0.88123596, + "learning_rate": 3.937711417044395e-06, + "loss": 0.96118969, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.4362793, + "step": 1784, + "time_per_iteration": 2.5347747802734375 + }, + { + "auxiliary_loss_clip": 0.0667218, + "auxiliary_loss_mlp": 0.0129997, + "balance_loss_clip": 0.06303082, + "balance_loss_mlp": 0.01257484, + "epoch": 0.10732000601232526, + "flos": 23264969362560.0, + "grad_norm": 3.307544320202646, + "language_loss": 1.02124667, + "learning_rate": 3.937614939483143e-06, + "loss": 1.10096812, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42480469, + "step": 1785, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.06653184, + "auxiliary_loss_mlp": 0.01298346, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01260676, + "epoch": 0.10738012926499324, + "flos": 24213951578880.0, + "grad_norm": 1.5126040850021356, + "language_loss": 0.86291718, + "learning_rate": 3.937518388447339e-06, + "loss": 0.94243246, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.37670898, + "step": 1786, + "time_per_iteration": 2.583588123321533 + }, + { + "auxiliary_loss_clip": 0.06674268, + "auxiliary_loss_mlp": 0.01305446, + "balance_loss_clip": 0.06299917, + "balance_loss_mlp": 0.01260337, + "epoch": 0.1074402525176612, + "flos": 20929361454720.0, + "grad_norm": 2.204457856509681, + "language_loss": 0.80718577, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45092773, + "step": 1787, + "time_per_iteration": 2.5648107528686523 + }, + { + "auxiliary_loss_clip": 0.06675328, + "auxiliary_loss_mlp": 0.01304706, + "balance_loss_clip": 0.06304328, + "balance_loss_mlp": 0.01262769, + "epoch": 0.10750037577032917, + "flos": 16952695332480.0, + "grad_norm": 2.64327450986053, + "language_loss": 0.8385697, + "learning_rate": 3.937325065966719e-06, + "loss": 0.91837001, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.41943359, + "step": 1788, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.06668989, + "auxiliary_loss_mlp": 0.01316653, + "balance_loss_clip": 0.0630315, + "balance_loss_mlp": 0.01276384, + "epoch": 0.10756049902299715, + "flos": 20272770460800.0, + "grad_norm": 2.8631598958886135, + "language_loss": 0.79821587, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.87807226, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.40258789, + "step": 1789, + "time_per_iteration": 2.5255203247070312 + }, + { + "auxiliary_loss_clip": 0.06671752, + "auxiliary_loss_mlp": 0.01304626, + "balance_loss_clip": 0.06304207, + "balance_loss_mlp": 0.01261019, + "epoch": 0.10762062227566511, + "flos": 23593264859520.0, + "grad_norm": 3.1602441142249584, + "language_loss": 0.75890934, + "learning_rate": 3.937131449631859e-06, + "loss": 0.83867311, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.43603516, + "step": 1790, + "time_per_iteration": 2.6021804809570312 + }, + { + "auxiliary_loss_clip": 0.06681746, + "auxiliary_loss_mlp": 0.01304108, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01261741, + "epoch": 0.10768074552833308, + "flos": 24316549303680.0, + "grad_norm": 2.153087509424505, + "language_loss": 0.80275488, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.88261342, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42333984, + "step": 1791, + "time_per_iteration": 2.546696662902832 + }, + { + "auxiliary_loss_clip": 0.06660049, + "auxiliary_loss_mlp": 0.01311951, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01273255, + "epoch": 0.10774086878100106, + "flos": 25306760528640.0, + "grad_norm": 1.9333309848647533, + "language_loss": 0.72259545, + "learning_rate": 3.936937539472126e-06, + "loss": 0.80231547, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38647461, + "step": 1792, + "time_per_iteration": 3.9801604747772217 + }, + { + "auxiliary_loss_clip": 0.06673245, + "auxiliary_loss_mlp": 0.01302989, + "balance_loss_clip": 0.06307209, + "balance_loss_mlp": 0.01260813, + "epoch": 0.10780099203366902, + "flos": 22060506447360.0, + "grad_norm": 2.562098500680419, + "language_loss": 0.78115147, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86091387, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42163086, + "step": 1793, + "time_per_iteration": 2.5435540676116943 + }, + { + "auxiliary_loss_clip": 0.06668183, + "auxiliary_loss_mlp": 0.01304414, + "balance_loss_clip": 0.06312631, + "balance_loss_mlp": 0.01268151, + "epoch": 0.10786111528633699, + "flos": 22754091818880.0, + "grad_norm": 1.5894120102976992, + "language_loss": 0.86093199, + "learning_rate": 3.936743335516936e-06, + "loss": 0.94065803, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36279297, + "step": 1794, + "time_per_iteration": 4.001549482345581 + }, + { + "auxiliary_loss_clip": 0.0669271, + "auxiliary_loss_mlp": 0.01312602, + "balance_loss_clip": 0.06319374, + "balance_loss_mlp": 0.01269472, + "epoch": 0.10792123853900495, + "flos": 20857510978560.0, + "grad_norm": 2.1590787324009257, + "language_loss": 0.77325815, + "learning_rate": 3.936646123375246e-06, + "loss": 0.8533113, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.43115234, + "step": 1795, + "time_per_iteration": 2.601548910140991 + }, + { + "auxiliary_loss_clip": 0.06686068, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06317562, + "balance_loss_mlp": 0.01262212, + "epoch": 0.10798136179167293, + "flos": 17754454725120.0, + "grad_norm": 3.0035183040345306, + "language_loss": 0.83787191, + "learning_rate": 3.936548837795741e-06, + "loss": 0.91774577, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39086914, + "step": 1796, + "time_per_iteration": 2.506821870803833 + }, + { + "auxiliary_loss_clip": 0.06692545, + "auxiliary_loss_mlp": 0.01329164, + "balance_loss_clip": 0.06318776, + "balance_loss_mlp": 0.01285318, + "epoch": 0.1080414850443409, + "flos": 13594745358720.0, + "grad_norm": 2.560788533662373, + "language_loss": 0.7551347, + "learning_rate": 3.936451478782111e-06, + "loss": 0.83535177, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.43847656, + "step": 1797, + "time_per_iteration": 3.9367597103118896 + }, + { + "auxiliary_loss_clip": 0.06662647, + "auxiliary_loss_mlp": 0.01300606, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.0126265, + "epoch": 0.10810160829700886, + "flos": 16259026106880.0, + "grad_norm": 2.354924251941542, + "language_loss": 0.83353364, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91316622, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37939453, + "step": 1798, + "time_per_iteration": 4.009509086608887 + }, + { + "auxiliary_loss_clip": 0.06672391, + "auxiliary_loss_mlp": 0.01305094, + "balance_loss_clip": 0.06315865, + "balance_loss_mlp": 0.01265635, + "epoch": 0.10816173154967684, + "flos": 15163282264320.0, + "grad_norm": 3.5539012768628786, + "language_loss": 0.87248892, + "learning_rate": 3.936256540467242e-06, + "loss": 0.95226371, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.39477539, + "step": 1799, + "time_per_iteration": 2.5058934688568115 + }, + { + "auxiliary_loss_clip": 0.06677136, + "auxiliary_loss_mlp": 0.01305557, + "balance_loss_clip": 0.06318786, + "balance_loss_mlp": 0.01268459, + "epoch": 0.10822185480234481, + "flos": 17791113686400.0, + "grad_norm": 2.263102555339672, + "language_loss": 0.78951424, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.86934125, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.37084961, + "step": 1800, + "time_per_iteration": 2.546147584915161 + }, + { + "auxiliary_loss_clip": 0.06672224, + "auxiliary_loss_mlp": 0.01299, + "balance_loss_clip": 0.06316296, + "balance_loss_mlp": 0.01262546, + "epoch": 0.10828197805501277, + "flos": 25563708673920.0, + "grad_norm": 5.510395821762047, + "language_loss": 0.74356997, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.82328218, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.36425781, + "step": 1801, + "time_per_iteration": 2.6982262134552 + }, + { + "auxiliary_loss_clip": 0.06691626, + "auxiliary_loss_mlp": 0.01309625, + "balance_loss_clip": 0.06324095, + "balance_loss_mlp": 0.01272813, + "epoch": 0.10834210130768075, + "flos": 28991748188160.0, + "grad_norm": 2.1562213268616355, + "language_loss": 0.67963791, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75965041, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3684082, + "step": 1802, + "time_per_iteration": 2.633770704269409 + }, + { + "auxiliary_loss_clip": 0.06676073, + "auxiliary_loss_mlp": 0.01309023, + "balance_loss_clip": 0.0632169, + "balance_loss_mlp": 0.01273379, + "epoch": 0.10840222456034872, + "flos": 20270045203200.0, + "grad_norm": 4.600711865085207, + "language_loss": 0.83367407, + "learning_rate": 3.935865782790621e-06, + "loss": 0.9135251, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35668945, + "step": 1803, + "time_per_iteration": 2.5231714248657227 + }, + { + "auxiliary_loss_clip": 0.06688153, + "auxiliary_loss_mlp": 0.01302267, + "balance_loss_clip": 0.06328186, + "balance_loss_mlp": 0.01263286, + "epoch": 0.10846234781301668, + "flos": 19868851054080.0, + "grad_norm": 2.166179009667806, + "language_loss": 0.92279881, + "learning_rate": 3.9357679098416365e-06, + "loss": 1.00270307, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39013672, + "step": 1804, + "time_per_iteration": 2.5790512561798096 + }, + { + "auxiliary_loss_clip": 0.06684472, + "auxiliary_loss_mlp": 0.01313096, + "balance_loss_clip": 0.06322414, + "balance_loss_mlp": 0.01273327, + "epoch": 0.10852247106568465, + "flos": 26476283491200.0, + "grad_norm": 2.1541825231451384, + "language_loss": 0.7834245, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39794922, + "step": 1805, + "time_per_iteration": 2.579225778579712 + }, + { + "auxiliary_loss_clip": 0.06686831, + "auxiliary_loss_mlp": 0.01314489, + "balance_loss_clip": 0.06327775, + "balance_loss_mlp": 0.01276938, + "epoch": 0.10858259431835263, + "flos": 30089420674560.0, + "grad_norm": 1.8150777160293243, + "language_loss": 0.87391019, + "learning_rate": 3.935571943733843e-06, + "loss": 0.95392346, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37548828, + "step": 1806, + "time_per_iteration": 2.6113767623901367 + }, + { + "auxiliary_loss_clip": 0.06674515, + "auxiliary_loss_mlp": 0.01306373, + "balance_loss_clip": 0.06320654, + "balance_loss_mlp": 0.01270038, + "epoch": 0.10864271757102059, + "flos": 19069313794560.0, + "grad_norm": 2.587857349139583, + "language_loss": 0.81862879, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.89843768, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.36328125, + "step": 1807, + "time_per_iteration": 2.5133659839630127 + }, + { + "auxiliary_loss_clip": 0.06671922, + "auxiliary_loss_mlp": 0.01298096, + "balance_loss_clip": 0.06316403, + "balance_loss_mlp": 0.01264193, + "epoch": 0.10870284082368856, + "flos": 24721558813440.0, + "grad_norm": 5.872677105154593, + "language_loss": 0.80080831, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.88050854, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.33911133, + "step": 1808, + "time_per_iteration": 2.615813732147217 + }, + { + "auxiliary_loss_clip": 0.06679243, + "auxiliary_loss_mlp": 0.01305785, + "balance_loss_clip": 0.06317936, + "balance_loss_mlp": 0.0126926, + "epoch": 0.10876296407635654, + "flos": 20633322579840.0, + "grad_norm": 1.9478579539752536, + "language_loss": 0.80837792, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88822818, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36523438, + "step": 1809, + "time_per_iteration": 2.5448191165924072 + }, + { + "auxiliary_loss_clip": 0.0666375, + "auxiliary_loss_mlp": 0.01303981, + "balance_loss_clip": 0.06309726, + "balance_loss_mlp": 0.01265119, + "epoch": 0.1088230873290245, + "flos": 21586245937920.0, + "grad_norm": 2.4636813373380213, + "language_loss": 0.86466354, + "learning_rate": 3.935179130783046e-06, + "loss": 0.94434083, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.38891602, + "step": 1810, + "time_per_iteration": 2.603607654571533 + }, + { + "auxiliary_loss_clip": 0.06689243, + "auxiliary_loss_mlp": 0.01306323, + "balance_loss_clip": 0.06319645, + "balance_loss_mlp": 0.01268367, + "epoch": 0.10888321058169247, + "flos": 26476283491200.0, + "grad_norm": 1.9747664396184277, + "language_loss": 0.65524805, + "learning_rate": 3.935080744080564e-06, + "loss": 0.73520374, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.37939453, + "step": 1811, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.0667599, + "auxiliary_loss_mlp": 0.01304861, + "balance_loss_clip": 0.06313843, + "balance_loss_mlp": 0.01266166, + "epoch": 0.10894333383436045, + "flos": 25855722552960.0, + "grad_norm": 2.675746043218001, + "language_loss": 0.75747859, + "learning_rate": 3.934982283999626e-06, + "loss": 0.83728707, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38671875, + "step": 1812, + "time_per_iteration": 2.6015379428863525 + }, + { + "auxiliary_loss_clip": 0.06657378, + "auxiliary_loss_mlp": 0.01303294, + "balance_loss_clip": 0.06303936, + "balance_loss_mlp": 0.01265219, + "epoch": 0.10900345708702841, + "flos": 19543238887680.0, + "grad_norm": 2.31852988369708, + "language_loss": 0.74425399, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82386076, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38085938, + "step": 1813, + "time_per_iteration": 2.5689308643341064 + }, + { + "auxiliary_loss_clip": 0.06659622, + "auxiliary_loss_mlp": 0.01293341, + "balance_loss_clip": 0.06308373, + "balance_loss_mlp": 0.01258556, + "epoch": 0.10906358033969638, + "flos": 23630091528960.0, + "grad_norm": 1.8365155089256564, + "language_loss": 0.84168994, + "learning_rate": 3.93478514371732e-06, + "loss": 0.92121959, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.34790039, + "step": 1814, + "time_per_iteration": 2.5616791248321533 + }, + { + "auxiliary_loss_clip": 0.06670845, + "auxiliary_loss_mlp": 0.01300399, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.01261036, + "epoch": 0.10912370359236434, + "flos": 21221039917440.0, + "grad_norm": 3.301230683958358, + "language_loss": 0.85154849, + "learning_rate": 3.934686463523429e-06, + "loss": 0.93126094, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.39355469, + "step": 1815, + "time_per_iteration": 2.57688307762146 + }, + { + "auxiliary_loss_clip": 0.06661555, + "auxiliary_loss_mlp": 0.01302183, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01263726, + "epoch": 0.10918382684503232, + "flos": 13558296032640.0, + "grad_norm": 2.7300514950641714, + "language_loss": 0.73428917, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.81392652, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.38476562, + "step": 1816, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06674603, + "auxiliary_loss_mlp": 0.01310351, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01269105, + "epoch": 0.10924395009770028, + "flos": 27971712109440.0, + "grad_norm": 2.9873916021139078, + "language_loss": 0.74010128, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.81995082, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41235352, + "step": 1817, + "time_per_iteration": 2.636141300201416 + }, + { + "auxiliary_loss_clip": 0.06667508, + "auxiliary_loss_mlp": 0.01306282, + "balance_loss_clip": 0.06316356, + "balance_loss_mlp": 0.01268659, + "epoch": 0.10930407335036825, + "flos": 25600912686720.0, + "grad_norm": 1.8767258076281454, + "language_loss": 0.68811858, + "learning_rate": 3.934389982775706e-06, + "loss": 0.76785648, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.37597656, + "step": 1818, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.06675036, + "auxiliary_loss_mlp": 0.01306463, + "balance_loss_clip": 0.06313543, + "balance_loss_mlp": 0.01266575, + "epoch": 0.10936419660303623, + "flos": 18412177749120.0, + "grad_norm": 2.168064712705315, + "language_loss": 0.74997962, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82979459, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.39892578, + "step": 1819, + "time_per_iteration": 2.5780999660491943 + }, + { + "auxiliary_loss_clip": 0.0666959, + "auxiliary_loss_mlp": 0.01302484, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01264123, + "epoch": 0.1094243198557042, + "flos": 23846523425280.0, + "grad_norm": 2.805852177899608, + "language_loss": 0.75565147, + "learning_rate": 3.934191962176335e-06, + "loss": 0.83537227, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38354492, + "step": 1820, + "time_per_iteration": 2.55102801322937 + }, + { + "auxiliary_loss_clip": 0.06670672, + "auxiliary_loss_mlp": 0.01301119, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01261065, + "epoch": 0.10948444310837216, + "flos": 14648589360000.0, + "grad_norm": 3.185311290283081, + "language_loss": 0.84421206, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92392999, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.40039062, + "step": 1821, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.06666994, + "auxiliary_loss_mlp": 0.01310986, + "balance_loss_clip": 0.06314231, + "balance_loss_mlp": 0.01271409, + "epoch": 0.10954456636104014, + "flos": 27826250221440.0, + "grad_norm": 3.7637860321271117, + "language_loss": 0.78284943, + "learning_rate": 3.933993648197955e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39575195, + "step": 1822, + "time_per_iteration": 2.607753038406372 + }, + { + "auxiliary_loss_clip": 0.06665225, + "auxiliary_loss_mlp": 0.01305751, + "balance_loss_clip": 0.06311564, + "balance_loss_mlp": 0.01267271, + "epoch": 0.1096046896137081, + "flos": 33629491497600.0, + "grad_norm": 2.4721955378281133, + "language_loss": 0.81345534, + "learning_rate": 3.933894381201034e-06, + "loss": 0.89316511, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.38525391, + "step": 1823, + "time_per_iteration": 2.7046356201171875 + }, + { + "auxiliary_loss_clip": 0.06663416, + "auxiliary_loss_mlp": 0.01297526, + "balance_loss_clip": 0.06311031, + "balance_loss_mlp": 0.01260643, + "epoch": 0.10966481286637607, + "flos": 26987370670080.0, + "grad_norm": 1.5405254615008266, + "language_loss": 0.8184576, + "learning_rate": 3.933795040870645e-06, + "loss": 0.898067, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36889648, + "step": 1824, + "time_per_iteration": 2.6020491123199463 + }, + { + "auxiliary_loss_clip": 0.06675697, + "auxiliary_loss_mlp": 0.01302612, + "balance_loss_clip": 0.06317075, + "balance_loss_mlp": 0.01262796, + "epoch": 0.10972493611904403, + "flos": 23042751534720.0, + "grad_norm": 2.030784567379419, + "language_loss": 0.88740194, + "learning_rate": 3.933695627210554e-06, + "loss": 0.96718502, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.3984375, + "step": 1825, + "time_per_iteration": 2.6143786907196045 + }, + { + "auxiliary_loss_clip": 0.06672946, + "auxiliary_loss_mlp": 0.01304094, + "balance_loss_clip": 0.06315491, + "balance_loss_mlp": 0.01265113, + "epoch": 0.10978505937171201, + "flos": 38113261729920.0, + "grad_norm": 4.39958169553056, + "language_loss": 0.77133435, + "learning_rate": 3.933596140224532e-06, + "loss": 0.85110474, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.39013672, + "step": 1826, + "time_per_iteration": 2.6767754554748535 + }, + { + "auxiliary_loss_clip": 0.06562361, + "auxiliary_loss_mlp": 0.01306115, + "balance_loss_clip": 0.06342762, + "balance_loss_mlp": 0.01289641, + "epoch": 0.10984518262437998, + "flos": 59867987500800.0, + "grad_norm": 0.8265503512589908, + "language_loss": 0.55217832, + "learning_rate": 3.93349657991635e-06, + "loss": 0.63086313, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.16479492, + "step": 1827, + "time_per_iteration": 3.2042500972747803 + }, + { + "auxiliary_loss_clip": 0.06558152, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06338888, + "balance_loss_mlp": 0.01267704, + "epoch": 0.10990530587704794, + "flos": 66741088907520.0, + "grad_norm": 0.7202592314019287, + "language_loss": 0.55369592, + "learning_rate": 3.933396946289784e-06, + "loss": 0.63212597, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.17175293, + "step": 1828, + "time_per_iteration": 3.2514500617980957 + }, + { + "auxiliary_loss_clip": 0.06692256, + "auxiliary_loss_mlp": 0.01311884, + "balance_loss_clip": 0.06327218, + "balance_loss_mlp": 0.01270018, + "epoch": 0.10996542912971592, + "flos": 25454234914560.0, + "grad_norm": 6.114677648786519, + "language_loss": 0.86263084, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94267225, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.41918945, + "step": 1829, + "time_per_iteration": 2.586923360824585 + }, + { + "auxiliary_loss_clip": 0.06682983, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06320649, + "balance_loss_mlp": 0.01279207, + "epoch": 0.11002555238238389, + "flos": 44028282752640.0, + "grad_norm": 2.5270889660052025, + "language_loss": 0.90112162, + "learning_rate": 3.933197459096614e-06, + "loss": 0.98114288, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3996582, + "step": 1830, + "time_per_iteration": 2.8102030754089355 + }, + { + "auxiliary_loss_clip": 0.06544227, + "auxiliary_loss_mlp": 0.01284934, + "balance_loss_clip": 0.06324031, + "balance_loss_mlp": 0.01268376, + "epoch": 0.11008567563505185, + "flos": 54085248547200.0, + "grad_norm": 0.6738836054555057, + "language_loss": 0.55525172, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.63354337, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.16564941, + "step": 1831, + "time_per_iteration": 4.652044057846069 + }, + { + "auxiliary_loss_clip": 0.06700309, + "auxiliary_loss_mlp": 0.01328613, + "balance_loss_clip": 0.06332322, + "balance_loss_mlp": 0.01284744, + "epoch": 0.11014579888771983, + "flos": 24249981634560.0, + "grad_norm": 4.072580491450979, + "language_loss": 0.92313743, + "learning_rate": 3.932997678675282e-06, + "loss": 1.00342667, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.43823242, + "step": 1832, + "time_per_iteration": 2.6010701656341553 + }, + { + "auxiliary_loss_clip": 0.06543858, + "auxiliary_loss_mlp": 0.01268849, + "balance_loss_clip": 0.06322708, + "balance_loss_mlp": 0.0125247, + "epoch": 0.1102059221403878, + "flos": 57763653661440.0, + "grad_norm": 0.681716215184674, + "language_loss": 0.59753174, + "learning_rate": 3.932897678513523e-06, + "loss": 0.67565876, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.16381836, + "step": 1833, + "time_per_iteration": 3.3245253562927246 + }, + { + "auxiliary_loss_clip": 0.0668912, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 0.06319445, + "balance_loss_mlp": 0.01278773, + "epoch": 0.11026604539305576, + "flos": 16800818607360.0, + "grad_norm": 5.311308312768562, + "language_loss": 0.81575066, + "learning_rate": 3.93279760505609e-06, + "loss": 0.89585972, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42993164, + "step": 1834, + "time_per_iteration": 4.020633697509766 + }, + { + "auxiliary_loss_clip": 0.0668771, + "auxiliary_loss_mlp": 0.01323505, + "balance_loss_clip": 0.0632341, + "balance_loss_mlp": 0.01282997, + "epoch": 0.11032616864572373, + "flos": 23994920206080.0, + "grad_norm": 4.522465656610911, + "language_loss": 0.91756475, + "learning_rate": 3.932697458306779e-06, + "loss": 0.99767691, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40478516, + "step": 1835, + "time_per_iteration": 2.5956919193267822 + }, + { + "auxiliary_loss_clip": 0.06685364, + "auxiliary_loss_mlp": 0.01321402, + "balance_loss_clip": 0.06324954, + "balance_loss_mlp": 0.01281729, + "epoch": 0.1103862918983917, + "flos": 19689329681280.0, + "grad_norm": 3.000861759629478, + "language_loss": 0.66412532, + "learning_rate": 3.932597238269386e-06, + "loss": 0.74419296, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39648438, + "step": 1836, + "time_per_iteration": 2.5927958488464355 + }, + { + "auxiliary_loss_clip": 0.06670263, + "auxiliary_loss_mlp": 0.01319261, + "balance_loss_clip": 0.06317647, + "balance_loss_mlp": 0.01279541, + "epoch": 0.11044641515105967, + "flos": 32169086686080.0, + "grad_norm": 2.1343283023714865, + "language_loss": 0.74546272, + "learning_rate": 3.932496944947711e-06, + "loss": 0.82535791, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.3972168, + "step": 1837, + "time_per_iteration": 5.453325033187866 + }, + { + "auxiliary_loss_clip": 0.06688204, + "auxiliary_loss_mlp": 0.01319143, + "balance_loss_clip": 0.06321806, + "balance_loss_mlp": 0.01281496, + "epoch": 0.11050653840372764, + "flos": 16694573230080.0, + "grad_norm": 2.107729732197389, + "language_loss": 0.79967713, + "learning_rate": 3.93239657834556e-06, + "loss": 0.87975061, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.3762207, + "step": 1838, + "time_per_iteration": 2.5330708026885986 + }, + { + "auxiliary_loss_clip": 0.06681567, + "auxiliary_loss_mlp": 0.01310209, + "balance_loss_clip": 0.06323014, + "balance_loss_mlp": 0.01271013, + "epoch": 0.11056666165639562, + "flos": 21214205809920.0, + "grad_norm": 1.83916180844076, + "language_loss": 0.72651547, + "learning_rate": 3.932296138466736e-06, + "loss": 0.8064332, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39160156, + "step": 1839, + "time_per_iteration": 2.5494542121887207 + }, + { + "auxiliary_loss_clip": 0.06685573, + "auxiliary_loss_mlp": 0.01308897, + "balance_loss_clip": 0.06317459, + "balance_loss_mlp": 0.0126777, + "epoch": 0.11062678490906358, + "flos": 19170444072960.0, + "grad_norm": 2.2710606045718835, + "language_loss": 0.80620813, + "learning_rate": 3.93219562531505e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.41137695, + "step": 1840, + "time_per_iteration": 2.525967836380005 + }, + { + "auxiliary_loss_clip": 0.0666925, + "auxiliary_loss_mlp": 0.01306907, + "balance_loss_clip": 0.06314851, + "balance_loss_mlp": 0.01271287, + "epoch": 0.11068690816173155, + "flos": 24901457529600.0, + "grad_norm": 1.7471100044619239, + "language_loss": 0.89207804, + "learning_rate": 3.932095038894311e-06, + "loss": 0.97183955, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.35620117, + "step": 1841, + "time_per_iteration": 2.6120924949645996 + }, + { + "auxiliary_loss_clip": 0.06674149, + "auxiliary_loss_mlp": 0.01316221, + "balance_loss_clip": 0.06318908, + "balance_loss_mlp": 0.01276739, + "epoch": 0.11074703141439952, + "flos": 16478015552640.0, + "grad_norm": 2.1111741847875822, + "language_loss": 0.92148924, + "learning_rate": 3.931994379208334e-06, + "loss": 1.00139296, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39477539, + "step": 1842, + "time_per_iteration": 2.5187559127807617 + }, + { + "auxiliary_loss_clip": 0.06674332, + "auxiliary_loss_mlp": 0.01308171, + "balance_loss_clip": 0.06317849, + "balance_loss_mlp": 0.01269166, + "epoch": 0.11080715466706749, + "flos": 19178535991680.0, + "grad_norm": 2.023955120097268, + "language_loss": 0.87531722, + "learning_rate": 3.931893646260937e-06, + "loss": 0.95514226, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.39038086, + "step": 1843, + "time_per_iteration": 2.6090967655181885 + }, + { + "auxiliary_loss_clip": 0.06693342, + "auxiliary_loss_mlp": 0.01302224, + "balance_loss_clip": 0.0632928, + "balance_loss_mlp": 0.01261073, + "epoch": 0.11086727791973545, + "flos": 27711325946880.0, + "grad_norm": 2.219830309112563, + "language_loss": 0.75884986, + "learning_rate": 3.931792840055941e-06, + "loss": 0.8388055, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.41137695, + "step": 1844, + "time_per_iteration": 2.6051831245422363 + }, + { + "auxiliary_loss_clip": 0.06685966, + "auxiliary_loss_mlp": 0.01305534, + "balance_loss_clip": 0.06324236, + "balance_loss_mlp": 0.01264311, + "epoch": 0.11092740117240343, + "flos": 18520854894720.0, + "grad_norm": 2.695467374521673, + "language_loss": 0.77040052, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.41235352, + "step": 1845, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.06677614, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 0.06324686, + "balance_loss_mlp": 0.01264681, + "epoch": 0.1109875244250714, + "flos": 20528796211200.0, + "grad_norm": 2.004922205839187, + "language_loss": 0.77657044, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.85636371, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 1846, + "time_per_iteration": 2.5549449920654297 + }, + { + "auxiliary_loss_clip": 0.06701723, + "auxiliary_loss_mlp": 0.01300229, + "balance_loss_clip": 0.0633509, + "balance_loss_mlp": 0.01259627, + "epoch": 0.11104764767773936, + "flos": 14103484623360.0, + "grad_norm": 2.935889161115543, + "language_loss": 0.88190699, + "learning_rate": 3.931489981933584e-06, + "loss": 0.96192646, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.40600586, + "step": 1847, + "time_per_iteration": 2.544952869415283 + }, + { + "auxiliary_loss_clip": 0.06695546, + "auxiliary_loss_mlp": 0.01304809, + "balance_loss_clip": 0.06331737, + "balance_loss_mlp": 0.01263944, + "epoch": 0.11110777093040733, + "flos": 20600730541440.0, + "grad_norm": 2.320230631722476, + "language_loss": 0.79106438, + "learning_rate": 3.931388882736438e-06, + "loss": 0.87106788, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40893555, + "step": 1848, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0668249, + "auxiliary_loss_mlp": 0.01302322, + "balance_loss_clip": 0.06330639, + "balance_loss_mlp": 0.01266702, + "epoch": 0.11116789418307531, + "flos": 21876247319040.0, + "grad_norm": 2.02298107620041, + "language_loss": 0.79027736, + "learning_rate": 3.931287710300832e-06, + "loss": 0.87012547, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 1849, + "time_per_iteration": 2.630244255065918 + }, + { + "auxiliary_loss_clip": 0.0669456, + "auxiliary_loss_mlp": 0.01300991, + "balance_loss_clip": 0.06327619, + "balance_loss_mlp": 0.01259363, + "epoch": 0.11122801743574327, + "flos": 15528488284800.0, + "grad_norm": 3.153012159345978, + "language_loss": 0.73516262, + "learning_rate": 3.931186464630601e-06, + "loss": 0.81511813, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.41625977, + "step": 1850, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.06693517, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 0.06331346, + "balance_loss_mlp": 0.01265952, + "epoch": 0.11128814068841124, + "flos": 14397511000320.0, + "grad_norm": 2.7195587095410594, + "language_loss": 0.83262205, + "learning_rate": 3.931085145729588e-06, + "loss": 0.91260827, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39135742, + "step": 1851, + "time_per_iteration": 2.5094821453094482 + }, + { + "auxiliary_loss_clip": 0.06681279, + "auxiliary_loss_mlp": 0.01301356, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266285, + "epoch": 0.11134826394107922, + "flos": 16659465569280.0, + "grad_norm": 3.1935743698172874, + "language_loss": 0.90682918, + "learning_rate": 3.930983753601631e-06, + "loss": 0.98665553, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.35083008, + "step": 1852, + "time_per_iteration": 2.5097947120666504 + }, + { + "auxiliary_loss_clip": 0.06688742, + "auxiliary_loss_mlp": 0.0130004, + "balance_loss_clip": 0.06332849, + "balance_loss_mlp": 0.01261392, + "epoch": 0.11140838719374718, + "flos": 16696627655040.0, + "grad_norm": 2.055655946127079, + "language_loss": 0.73742187, + "learning_rate": 3.930882288250578e-06, + "loss": 0.81730974, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.38647461, + "step": 1853, + "time_per_iteration": 2.5568370819091797 + }, + { + "auxiliary_loss_clip": 0.06563053, + "auxiliary_loss_mlp": 0.01299008, + "balance_loss_clip": 0.06346013, + "balance_loss_mlp": 0.01281771, + "epoch": 0.11146851044641515, + "flos": 60994101248640.0, + "grad_norm": 0.7599812832333546, + "language_loss": 0.53835392, + "learning_rate": 3.930780749680273e-06, + "loss": 0.61697447, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.17260742, + "step": 1854, + "time_per_iteration": 3.1410884857177734 + }, + { + "auxiliary_loss_clip": 0.06710939, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 0.06327829, + "balance_loss_mlp": 0.01258336, + "epoch": 0.11152863369908313, + "flos": 22199301936000.0, + "grad_norm": 2.170007206040738, + "language_loss": 0.86019069, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94031739, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43383789, + "step": 1855, + "time_per_iteration": 2.5451245307922363 + }, + { + "auxiliary_loss_clip": 0.06687084, + "auxiliary_loss_mlp": 0.01297488, + "balance_loss_clip": 0.0632429, + "balance_loss_mlp": 0.01258745, + "epoch": 0.11158875695175109, + "flos": 19543742012160.0, + "grad_norm": 2.6985711919434054, + "language_loss": 0.83108622, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.91093194, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.38720703, + "step": 1856, + "time_per_iteration": 2.578641653060913 + }, + { + "auxiliary_loss_clip": 0.06667097, + "auxiliary_loss_mlp": 0.01293205, + "balance_loss_clip": 0.06315985, + "balance_loss_mlp": 0.01257824, + "epoch": 0.11164888020441906, + "flos": 25448994034560.0, + "grad_norm": 1.90457681551641, + "language_loss": 0.84520233, + "learning_rate": 3.93047569469238e-06, + "loss": 0.92480534, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35375977, + "step": 1857, + "time_per_iteration": 2.581700086593628 + }, + { + "auxiliary_loss_clip": 0.06686676, + "auxiliary_loss_mlp": 0.01304106, + "balance_loss_clip": 0.06318156, + "balance_loss_mlp": 0.01263289, + "epoch": 0.11170900345708702, + "flos": 15638171679360.0, + "grad_norm": 2.609725880853407, + "language_loss": 0.85109961, + "learning_rate": 3.930373863283608e-06, + "loss": 0.9310075, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.40795898, + "step": 1858, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0668328, + "auxiliary_loss_mlp": 0.01297406, + "balance_loss_clip": 0.06323688, + "balance_loss_mlp": 0.01259569, + "epoch": 0.111769126709755, + "flos": 23046105697920.0, + "grad_norm": 2.4700078024873102, + "language_loss": 0.92790282, + "learning_rate": 3.930271958674866e-06, + "loss": 1.00770962, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.37841797, + "step": 1859, + "time_per_iteration": 2.541881799697876 + }, + { + "auxiliary_loss_clip": 0.06691643, + "auxiliary_loss_mlp": 0.01299678, + "balance_loss_clip": 0.06318307, + "balance_loss_mlp": 0.0125774, + "epoch": 0.11182924996242297, + "flos": 20857091708160.0, + "grad_norm": 2.367815973832506, + "language_loss": 0.8396585, + "learning_rate": 3.930169980870018e-06, + "loss": 0.9195717, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.41943359, + "step": 1860, + "time_per_iteration": 2.565051555633545 + }, + { + "auxiliary_loss_clip": 0.06669357, + "auxiliary_loss_mlp": 0.01300378, + "balance_loss_clip": 0.06315688, + "balance_loss_mlp": 0.01263065, + "epoch": 0.11188937321509093, + "flos": 17460763764480.0, + "grad_norm": 2.7908462123762026, + "language_loss": 0.7628203, + "learning_rate": 3.930067929872931e-06, + "loss": 0.84251761, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.37304688, + "step": 1861, + "time_per_iteration": 2.5033557415008545 + }, + { + "auxiliary_loss_clip": 0.06670874, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.0631748, + "balance_loss_mlp": 0.01266635, + "epoch": 0.11194949646775891, + "flos": 24102507248640.0, + "grad_norm": 2.306450242478339, + "language_loss": 0.90480924, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9845506, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.3659668, + "step": 1862, + "time_per_iteration": 2.582846164703369 + }, + { + "auxiliary_loss_clip": 0.06675294, + "auxiliary_loss_mlp": 0.01301536, + "balance_loss_clip": 0.0632014, + "balance_loss_mlp": 0.01265273, + "epoch": 0.11200961972042688, + "flos": 25160627808000.0, + "grad_norm": 2.402216402179579, + "language_loss": 0.88216799, + "learning_rate": 3.92986360831752e-06, + "loss": 0.9619363, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.36279297, + "step": 1863, + "time_per_iteration": 2.548849105834961 + }, + { + "auxiliary_loss_clip": 0.06661677, + "auxiliary_loss_mlp": 0.01299701, + "balance_loss_clip": 0.06311835, + "balance_loss_mlp": 0.01259933, + "epoch": 0.11206974297309484, + "flos": 21294735183360.0, + "grad_norm": 3.3365899426908574, + "language_loss": 0.65844059, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73805434, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.39770508, + "step": 1864, + "time_per_iteration": 2.5405185222625732 + }, + { + "auxiliary_loss_clip": 0.06660779, + "auxiliary_loss_mlp": 0.01305926, + "balance_loss_clip": 0.06303211, + "balance_loss_mlp": 0.01270211, + "epoch": 0.11212986622576282, + "flos": 18921881335680.0, + "grad_norm": 2.2819326265061717, + "language_loss": 0.75939113, + "learning_rate": 3.929658994039627e-06, + "loss": 0.83905816, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.35693359, + "step": 1865, + "time_per_iteration": 2.518132209777832 + }, + { + "auxiliary_loss_clip": 0.06676203, + "auxiliary_loss_mlp": 0.01303479, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01262066, + "epoch": 0.11218998947843078, + "flos": 22061344988160.0, + "grad_norm": 2.4630430297676087, + "language_loss": 0.86701274, + "learning_rate": 3.929556577139446e-06, + "loss": 0.94680953, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.4140625, + "step": 1866, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.06668604, + "auxiliary_loss_mlp": 0.0129946, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01259405, + "epoch": 0.11225011273109875, + "flos": 24578612547840.0, + "grad_norm": 1.6697676286935108, + "language_loss": 0.82806516, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90774584, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.40014648, + "step": 1867, + "time_per_iteration": 2.6024861335754395 + }, + { + "auxiliary_loss_clip": 0.06666633, + "auxiliary_loss_mlp": 0.01303841, + "balance_loss_clip": 0.06308746, + "balance_loss_mlp": 0.01266099, + "epoch": 0.11231023598376672, + "flos": 28446140327040.0, + "grad_norm": 2.646357828465267, + "language_loss": 0.88275552, + "learning_rate": 3.929351523836035e-06, + "loss": 0.96246034, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.37744141, + "step": 1868, + "time_per_iteration": 2.6040542125701904 + }, + { + "auxiliary_loss_clip": 0.06659871, + "auxiliary_loss_mlp": 0.01297203, + "balance_loss_clip": 0.06306987, + "balance_loss_mlp": 0.01259866, + "epoch": 0.1123703592364347, + "flos": 14431318922880.0, + "grad_norm": 2.6026187077821796, + "language_loss": 0.69696379, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77653456, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.3737793, + "step": 1869, + "time_per_iteration": 2.562173843383789 + }, + { + "auxiliary_loss_clip": 0.06669002, + "auxiliary_loss_mlp": 0.01308207, + "balance_loss_clip": 0.06307223, + "balance_loss_mlp": 0.01267629, + "epoch": 0.11243048248910266, + "flos": 22242753077760.0, + "grad_norm": 2.004713314117072, + "language_loss": 0.78550231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.86527443, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40600586, + "step": 1870, + "time_per_iteration": 2.5912842750549316 + }, + { + "auxiliary_loss_clip": 0.06677727, + "auxiliary_loss_mlp": 0.01300065, + "balance_loss_clip": 0.06308755, + "balance_loss_mlp": 0.01259462, + "epoch": 0.11249060574177062, + "flos": 18589435061760.0, + "grad_norm": 2.325375460191994, + "language_loss": 0.77409399, + "learning_rate": 3.929043395181631e-06, + "loss": 0.85387194, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.40625, + "step": 1871, + "time_per_iteration": 3.970134735107422 + }, + { + "auxiliary_loss_clip": 0.06669156, + "auxiliary_loss_mlp": 0.01304929, + "balance_loss_clip": 0.06304972, + "balance_loss_mlp": 0.01264803, + "epoch": 0.1125507289944386, + "flos": 22863146307840.0, + "grad_norm": 2.5010943819542395, + "language_loss": 0.83236814, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91210902, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40112305, + "step": 1872, + "time_per_iteration": 2.53498911857605 + }, + { + "auxiliary_loss_clip": 0.0666475, + "auxiliary_loss_mlp": 0.0132478, + "balance_loss_clip": 0.06302819, + "balance_loss_mlp": 0.01284344, + "epoch": 0.11261085224710657, + "flos": 19681447397760.0, + "grad_norm": 2.9026103981965963, + "language_loss": 0.84496641, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.92486167, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.40454102, + "step": 1873, + "time_per_iteration": 3.988614082336426 + }, + { + "auxiliary_loss_clip": 0.06668855, + "auxiliary_loss_mlp": 0.01305813, + "balance_loss_clip": 0.06300959, + "balance_loss_mlp": 0.01266664, + "epoch": 0.11267097549977453, + "flos": 26069680753920.0, + "grad_norm": 2.0146094287088454, + "language_loss": 0.92890203, + "learning_rate": 3.928734608181575e-06, + "loss": 1.00864863, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.3918457, + "step": 1874, + "time_per_iteration": 2.594095230102539 + }, + { + "auxiliary_loss_clip": 0.06647091, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.01272589, + "epoch": 0.11273109875244251, + "flos": 21074194437120.0, + "grad_norm": 2.447545582518425, + "language_loss": 0.7598331, + "learning_rate": 3.928631532900729e-06, + "loss": 0.8394202, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39038086, + "step": 1875, + "time_per_iteration": 2.5846669673919678 + }, + { + "auxiliary_loss_clip": 0.06650866, + "auxiliary_loss_mlp": 0.01305089, + "balance_loss_clip": 0.06300622, + "balance_loss_mlp": 0.01270042, + "epoch": 0.11279122200511048, + "flos": 27096299377920.0, + "grad_norm": 2.1373581639008603, + "language_loss": 0.73336905, + "learning_rate": 3.928528384485984e-06, + "loss": 0.81292862, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3503418, + "step": 1876, + "time_per_iteration": 3.9819693565368652 + }, + { + "auxiliary_loss_clip": 0.06655607, + "auxiliary_loss_mlp": 0.01304943, + "balance_loss_clip": 0.06303705, + "balance_loss_mlp": 0.01268489, + "epoch": 0.11285134525777844, + "flos": 20193163482240.0, + "grad_norm": 1.9863695087931013, + "language_loss": 0.78284073, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.86244625, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36474609, + "step": 1877, + "time_per_iteration": 4.03458046913147 + }, + { + "auxiliary_loss_clip": 0.06652889, + "auxiliary_loss_mlp": 0.01306338, + "balance_loss_clip": 0.06294097, + "balance_loss_mlp": 0.01265139, + "epoch": 0.11291146851044641, + "flos": 12463348803840.0, + "grad_norm": 2.614643448765401, + "language_loss": 0.8943826, + "learning_rate": 3.928321868270436e-06, + "loss": 0.97397494, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41186523, + "step": 1878, + "time_per_iteration": 2.5039942264556885 + }, + { + "auxiliary_loss_clip": 0.06650617, + "auxiliary_loss_mlp": 0.01298934, + "balance_loss_clip": 0.0629722, + "balance_loss_mlp": 0.01262981, + "epoch": 0.11297159176311439, + "flos": 23849164828800.0, + "grad_norm": 2.5452203644148748, + "language_loss": 0.83347368, + "learning_rate": 3.928218500477466e-06, + "loss": 0.91296917, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35961914, + "step": 1879, + "time_per_iteration": 2.597705125808716 + }, + { + "auxiliary_loss_clip": 0.06658179, + "auxiliary_loss_mlp": 0.01304624, + "balance_loss_clip": 0.06296952, + "balance_loss_mlp": 0.01265333, + "epoch": 0.11303171501578235, + "flos": 29937585876480.0, + "grad_norm": 2.2031468075921765, + "language_loss": 0.71889591, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.39306641, + "step": 1880, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.06640352, + "auxiliary_loss_mlp": 0.01299738, + "balance_loss_clip": 0.06297569, + "balance_loss_mlp": 0.01262163, + "epoch": 0.11309183826845032, + "flos": 16186169381760.0, + "grad_norm": 2.477930763311184, + "language_loss": 0.74137151, + "learning_rate": 3.928011545540734e-06, + "loss": 0.82077241, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37573242, + "step": 1881, + "time_per_iteration": 2.5628225803375244 + }, + { + "auxiliary_loss_clip": 0.06661209, + "auxiliary_loss_mlp": 0.01303844, + "balance_loss_clip": 0.06301182, + "balance_loss_mlp": 0.01264767, + "epoch": 0.1131519615211183, + "flos": 12025537620480.0, + "grad_norm": 2.71671437451568, + "language_loss": 0.75070721, + "learning_rate": 3.927907958404819e-06, + "loss": 0.83035773, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39111328, + "step": 1882, + "time_per_iteration": 2.5252811908721924 + }, + { + "auxiliary_loss_clip": 0.06659748, + "auxiliary_loss_mlp": 0.01301896, + "balance_loss_clip": 0.06302463, + "balance_loss_mlp": 0.0126363, + "epoch": 0.11321208477378626, + "flos": 26257335972480.0, + "grad_norm": 2.360500107686341, + "language_loss": 0.81115943, + "learning_rate": 3.92780429816244e-06, + "loss": 0.89077592, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.3828125, + "step": 1883, + "time_per_iteration": 2.6215126514434814 + }, + { + "auxiliary_loss_clip": 0.06662337, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01264076, + "epoch": 0.11327220802645423, + "flos": 13631530101120.0, + "grad_norm": 4.398339236734383, + "language_loss": 0.78793007, + "learning_rate": 3.927700564817529e-06, + "loss": 0.86757141, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.37719727, + "step": 1884, + "time_per_iteration": 2.5176398754119873 + }, + { + "auxiliary_loss_clip": 0.06509344, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.0127789, + "epoch": 0.1133323312791222, + "flos": 57210582787200.0, + "grad_norm": 0.8090343621743066, + "language_loss": 0.55328304, + "learning_rate": 3.927596758374019e-06, + "loss": 0.63130367, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.14794922, + "step": 1885, + "time_per_iteration": 3.0971505641937256 + }, + { + "auxiliary_loss_clip": 0.06646755, + "auxiliary_loss_mlp": 0.01313183, + "balance_loss_clip": 0.06301701, + "balance_loss_mlp": 0.01277062, + "epoch": 0.11339245453179017, + "flos": 24358407217920.0, + "grad_norm": 2.1975512476365444, + "language_loss": 0.917539, + "learning_rate": 3.927492878835848e-06, + "loss": 0.99713838, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.36132812, + "step": 1886, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.06661782, + "auxiliary_loss_mlp": 0.01305618, + "balance_loss_clip": 0.06311518, + "balance_loss_mlp": 0.01271882, + "epoch": 0.11345257778445814, + "flos": 22676665046400.0, + "grad_norm": 2.7768273002598427, + "language_loss": 0.86747134, + "learning_rate": 3.927388926206953e-06, + "loss": 0.94714534, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 1887, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.06653242, + "auxiliary_loss_mlp": 0.01304972, + "balance_loss_clip": 0.06302808, + "balance_loss_mlp": 0.01268279, + "epoch": 0.11351270103712612, + "flos": 20993245793280.0, + "grad_norm": 4.850859640376328, + "language_loss": 0.7868247, + "learning_rate": 3.927284900491277e-06, + "loss": 0.86640686, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.36694336, + "step": 1888, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.06662205, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01271366, + "epoch": 0.11357282428979408, + "flos": 37358014152960.0, + "grad_norm": 2.243152205453325, + "language_loss": 0.69439191, + "learning_rate": 3.927180801692764e-06, + "loss": 0.77412695, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39916992, + "step": 1889, + "time_per_iteration": 2.7570948600769043 + }, + { + "auxiliary_loss_clip": 0.06658383, + "auxiliary_loss_mlp": 0.01303074, + "balance_loss_clip": 0.06306529, + "balance_loss_mlp": 0.01266811, + "epoch": 0.11363294754246205, + "flos": 21762580855680.0, + "grad_norm": 2.3560992330068, + "language_loss": 0.85365129, + "learning_rate": 3.927076629815362e-06, + "loss": 0.93326581, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.36279297, + "step": 1890, + "time_per_iteration": 2.539299964904785 + }, + { + "auxiliary_loss_clip": 0.06646931, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 0.06299055, + "balance_loss_mlp": 0.01272887, + "epoch": 0.11369307079513001, + "flos": 22608252587520.0, + "grad_norm": 3.2867804654433734, + "language_loss": 0.66679269, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74635661, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.36572266, + "step": 1891, + "time_per_iteration": 2.5804758071899414 + }, + { + "auxiliary_loss_clip": 0.06662975, + "auxiliary_loss_mlp": 0.01306025, + "balance_loss_clip": 0.06305033, + "balance_loss_mlp": 0.01268188, + "epoch": 0.11375319404779799, + "flos": 21950655344640.0, + "grad_norm": 2.3010503008358887, + "language_loss": 0.89755237, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.97724235, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.37817383, + "step": 1892, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.06664805, + "auxiliary_loss_mlp": 0.01310273, + "balance_loss_clip": 0.06304479, + "balance_loss_mlp": 0.01271149, + "epoch": 0.11381331730046595, + "flos": 26402588225280.0, + "grad_norm": 2.9760722646413966, + "language_loss": 0.75163257, + "learning_rate": 3.926763675749339e-06, + "loss": 0.83138341, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.39111328, + "step": 1893, + "time_per_iteration": 2.6722171306610107 + }, + { + "auxiliary_loss_clip": 0.06657124, + "auxiliary_loss_mlp": 0.0130867, + "balance_loss_clip": 0.06306865, + "balance_loss_mlp": 0.01271405, + "epoch": 0.11387344055313392, + "flos": 23811373837440.0, + "grad_norm": 2.1739305302665417, + "language_loss": 0.81218535, + "learning_rate": 3.92665921159591e-06, + "loss": 0.89184326, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.37255859, + "step": 1894, + "time_per_iteration": 2.5737743377685547 + }, + { + "auxiliary_loss_clip": 0.06661002, + "auxiliary_loss_mlp": 0.01313123, + "balance_loss_clip": 0.06302214, + "balance_loss_mlp": 0.01272187, + "epoch": 0.1139335638058019, + "flos": 34529865546240.0, + "grad_norm": 3.0499673553250317, + "language_loss": 0.81167793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.89141917, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.40991211, + "step": 1895, + "time_per_iteration": 2.6510303020477295 + }, + { + "auxiliary_loss_clip": 0.06495596, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06284232, + "balance_loss_mlp": 0.01256026, + "epoch": 0.11399368705846986, + "flos": 70609790643840.0, + "grad_norm": 0.7664991761837657, + "language_loss": 0.63306981, + "learning_rate": 3.926450064115686e-06, + "loss": 0.71072453, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.13891602, + "step": 1896, + "time_per_iteration": 3.2715020179748535 + }, + { + "auxiliary_loss_clip": 0.06653456, + "auxiliary_loss_mlp": 0.01306088, + "balance_loss_clip": 0.06306494, + "balance_loss_mlp": 0.01266224, + "epoch": 0.11405381031113783, + "flos": 21330597530880.0, + "grad_norm": 2.7976416245645988, + "language_loss": 0.86136234, + "learning_rate": 3.926345380796821e-06, + "loss": 0.94095778, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3984375, + "step": 1897, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.06656732, + "auxiliary_loss_mlp": 0.01307974, + "balance_loss_clip": 0.06304093, + "balance_loss_mlp": 0.01270041, + "epoch": 0.11411393356380581, + "flos": 19725820934400.0, + "grad_norm": 2.6374143353220068, + "language_loss": 0.80644619, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88609326, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37915039, + "step": 1898, + "time_per_iteration": 2.5834596157073975 + }, + { + "auxiliary_loss_clip": 0.06665078, + "auxiliary_loss_mlp": 0.0130214, + "balance_loss_clip": 0.06307302, + "balance_loss_mlp": 0.01261823, + "epoch": 0.11417405681647377, + "flos": 17536261893120.0, + "grad_norm": 3.558801225381502, + "language_loss": 0.74948764, + "learning_rate": 3.926135795021435e-06, + "loss": 0.82915986, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.40283203, + "step": 1899, + "time_per_iteration": 2.5195093154907227 + }, + { + "auxiliary_loss_clip": 0.06484325, + "auxiliary_loss_mlp": 0.01277698, + "balance_loss_clip": 0.06276824, + "balance_loss_mlp": 0.01262463, + "epoch": 0.11423418006914174, + "flos": 59694168205440.0, + "grad_norm": 0.8563849035990295, + "language_loss": 0.63607001, + "learning_rate": 3.92603089257286e-06, + "loss": 0.71369016, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.15209961, + "step": 1900, + "time_per_iteration": 3.140596389770508 + }, + { + "auxiliary_loss_clip": 0.06654657, + "auxiliary_loss_mlp": 0.01295658, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01260706, + "epoch": 0.1142943033218097, + "flos": 22969223976960.0, + "grad_norm": 2.413799712437086, + "language_loss": 0.7948848, + "learning_rate": 3.925925917089001e-06, + "loss": 0.87438798, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.34960938, + "step": 1901, + "time_per_iteration": 2.5521771907806396 + }, + { + "auxiliary_loss_clip": 0.06657314, + "auxiliary_loss_mlp": 0.01303255, + "balance_loss_clip": 0.06311248, + "balance_loss_mlp": 0.01264011, + "epoch": 0.11435442657447768, + "flos": 18261558835200.0, + "grad_norm": 2.3832212906881862, + "language_loss": 0.8530966, + "learning_rate": 3.925820868573839e-06, + "loss": 0.93270218, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.39257812, + "step": 1902, + "time_per_iteration": 2.538130521774292 + }, + { + "auxiliary_loss_clip": 0.06657556, + "auxiliary_loss_mlp": 0.01298528, + "balance_loss_clip": 0.06305373, + "balance_loss_mlp": 0.01259737, + "epoch": 0.11441454982714565, + "flos": 24068070420480.0, + "grad_norm": 1.6413453356185448, + "language_loss": 0.79046285, + "learning_rate": 3.925715747031356e-06, + "loss": 0.87002361, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.38793945, + "step": 1903, + "time_per_iteration": 2.5491714477539062 + }, + { + "auxiliary_loss_clip": 0.0665084, + "auxiliary_loss_mlp": 0.01296782, + "balance_loss_clip": 0.06302907, + "balance_loss_mlp": 0.01262021, + "epoch": 0.11447467307981361, + "flos": 25344719228160.0, + "grad_norm": 2.444047148927425, + "language_loss": 0.7716713, + "learning_rate": 3.925610552465539e-06, + "loss": 0.85114753, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.34765625, + "step": 1904, + "time_per_iteration": 2.581732749938965 + }, + { + "auxiliary_loss_clip": 0.0665014, + "auxiliary_loss_mlp": 0.01305214, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01263967, + "epoch": 0.11453479633248159, + "flos": 21732546366720.0, + "grad_norm": 2.531757155305884, + "language_loss": 0.9328481, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.01240158, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41259766, + "step": 1905, + "time_per_iteration": 2.5455148220062256 + }, + { + "auxiliary_loss_clip": 0.06677254, + "auxiliary_loss_mlp": 0.01302143, + "balance_loss_clip": 0.06310458, + "balance_loss_mlp": 0.0126185, + "epoch": 0.11459491958514956, + "flos": 12974771399040.0, + "grad_norm": 15.201644676234393, + "language_loss": 0.79179782, + "learning_rate": 3.925399944279861e-06, + "loss": 0.87159181, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.40258789, + "step": 1906, + "time_per_iteration": 2.557220220565796 + }, + { + "auxiliary_loss_clip": 0.06651148, + "auxiliary_loss_mlp": 0.01309487, + "balance_loss_clip": 0.06300925, + "balance_loss_mlp": 0.0127022, + "epoch": 0.11465504283781752, + "flos": 22717935982080.0, + "grad_norm": 2.7916231383135903, + "language_loss": 0.84417903, + "learning_rate": 3.925294530667986e-06, + "loss": 0.92378545, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39257812, + "step": 1907, + "time_per_iteration": 2.538357734680176 + }, + { + "auxiliary_loss_clip": 0.06659371, + "auxiliary_loss_mlp": 0.01305713, + "balance_loss_clip": 0.06306633, + "balance_loss_mlp": 0.01266064, + "epoch": 0.1147151660904855, + "flos": 23404142194560.0, + "grad_norm": 5.983288386648609, + "language_loss": 0.85784996, + "learning_rate": 3.92518904404875e-06, + "loss": 0.93750072, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.39648438, + "step": 1908, + "time_per_iteration": 2.566323757171631 + }, + { + "auxiliary_loss_clip": 0.06483665, + "auxiliary_loss_mlp": 0.01269821, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01254252, + "epoch": 0.11477528934315347, + "flos": 63028639036800.0, + "grad_norm": 0.8722245963969955, + "language_loss": 0.60927975, + "learning_rate": 3.925083484426153e-06, + "loss": 0.68681461, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.15551758, + "step": 1909, + "time_per_iteration": 2.9047083854675293 + }, + { + "auxiliary_loss_clip": 0.06651932, + "auxiliary_loss_mlp": 0.01304657, + "balance_loss_clip": 0.06305454, + "balance_loss_mlp": 0.01265223, + "epoch": 0.11483541259582143, + "flos": 16331086218240.0, + "grad_norm": 2.669666495614271, + "language_loss": 0.8074221, + "learning_rate": 3.924977851804197e-06, + "loss": 0.88698798, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.39404297, + "step": 1910, + "time_per_iteration": 2.5531835556030273 + }, + { + "auxiliary_loss_clip": 0.06656756, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06303862, + "balance_loss_mlp": 0.01258516, + "epoch": 0.1148955358484894, + "flos": 21586916770560.0, + "grad_norm": 2.9098941838716046, + "language_loss": 0.78589714, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.86544329, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.39331055, + "step": 1911, + "time_per_iteration": 3.928828477859497 + }, + { + "auxiliary_loss_clip": 0.06639488, + "auxiliary_loss_mlp": 0.01303362, + "balance_loss_clip": 0.06301475, + "balance_loss_mlp": 0.01266931, + "epoch": 0.11495565910115738, + "flos": 27681249530880.0, + "grad_norm": 2.02553210679246, + "language_loss": 0.80990648, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.88933504, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 1912, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.06649567, + "auxiliary_loss_mlp": 0.01304436, + "balance_loss_clip": 0.06303079, + "balance_loss_mlp": 0.01266575, + "epoch": 0.11501578235382534, + "flos": 20638815022080.0, + "grad_norm": 2.0778571754475124, + "language_loss": 0.79150605, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87104607, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37866211, + "step": 1913, + "time_per_iteration": 3.9840147495269775 + }, + { + "auxiliary_loss_clip": 0.06649221, + "auxiliary_loss_mlp": 0.01302596, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01266214, + "epoch": 0.1150759056064933, + "flos": 19835252766720.0, + "grad_norm": 2.174223201073213, + "language_loss": 0.71977127, + "learning_rate": 3.924554591402939e-06, + "loss": 0.79928941, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36352539, + "step": 1914, + "time_per_iteration": 2.564162492752075 + }, + { + "auxiliary_loss_clip": 0.06490675, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 0.06283194, + "balance_loss_mlp": 0.01257139, + "epoch": 0.11513602885916129, + "flos": 70068543194880.0, + "grad_norm": 0.7330745369663106, + "language_loss": 0.61048496, + "learning_rate": 3.92444859384433e-06, + "loss": 0.68810666, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.14343262, + "step": 1915, + "time_per_iteration": 4.616885662078857 + }, + { + "auxiliary_loss_clip": 0.06646329, + "auxiliary_loss_mlp": 0.01309796, + "balance_loss_clip": 0.06301694, + "balance_loss_mlp": 0.01271697, + "epoch": 0.11519615211182925, + "flos": 15747100387200.0, + "grad_norm": 2.8536727053056077, + "language_loss": 0.94662005, + "learning_rate": 3.924342523310436e-06, + "loss": 1.02618122, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38085938, + "step": 1916, + "time_per_iteration": 2.544074058532715 + }, + { + "auxiliary_loss_clip": 0.06649305, + "auxiliary_loss_mlp": 0.01297855, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01258945, + "epoch": 0.11525627536449722, + "flos": 20673880755840.0, + "grad_norm": 1.9176091228095486, + "language_loss": 0.73714519, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81661683, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.3894043, + "step": 1917, + "time_per_iteration": 3.988520383834839 + }, + { + "auxiliary_loss_clip": 0.06637132, + "auxiliary_loss_mlp": 0.01303977, + "balance_loss_clip": 0.06296226, + "balance_loss_mlp": 0.01264876, + "epoch": 0.1153163986171652, + "flos": 20309555203200.0, + "grad_norm": 2.2006178662795546, + "language_loss": 0.7638135, + "learning_rate": 3.92413016333289e-06, + "loss": 0.84322459, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39135742, + "step": 1918, + "time_per_iteration": 2.531501531600952 + }, + { + "auxiliary_loss_clip": 0.06653848, + "auxiliary_loss_mlp": 0.01302011, + "balance_loss_clip": 0.06300295, + "balance_loss_mlp": 0.01263983, + "epoch": 0.11537652186983316, + "flos": 17645064819840.0, + "grad_norm": 6.624924967769877, + "language_loss": 0.87652063, + "learning_rate": 3.92402387389729e-06, + "loss": 0.95607924, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.38037109, + "step": 1919, + "time_per_iteration": 2.5388336181640625 + }, + { + "auxiliary_loss_clip": 0.06642918, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01265872, + "epoch": 0.11543664512250112, + "flos": 21075787664640.0, + "grad_norm": 2.5165855021660697, + "language_loss": 0.87737721, + "learning_rate": 3.923917511502512e-06, + "loss": 0.95683897, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37402344, + "step": 1920, + "time_per_iteration": 2.536255121231079 + }, + { + "auxiliary_loss_clip": 0.0663945, + "auxiliary_loss_mlp": 0.01300031, + "balance_loss_clip": 0.06300904, + "balance_loss_mlp": 0.01262671, + "epoch": 0.11549676837516909, + "flos": 22754175672960.0, + "grad_norm": 2.0755692503441696, + "language_loss": 0.81216776, + "learning_rate": 3.923811076152589e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3737793, + "step": 1921, + "time_per_iteration": 2.5809693336486816 + }, + { + "auxiliary_loss_clip": 0.06661837, + "auxiliary_loss_mlp": 0.01301821, + "balance_loss_clip": 0.06303193, + "balance_loss_mlp": 0.0126036, + "epoch": 0.11555689162783707, + "flos": 19174510995840.0, + "grad_norm": 2.11935003712056, + "language_loss": 0.79765266, + "learning_rate": 3.923704567851557e-06, + "loss": 0.87728924, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41455078, + "step": 1922, + "time_per_iteration": 2.521562099456787 + }, + { + "auxiliary_loss_clip": 0.06651014, + "auxiliary_loss_mlp": 0.01303966, + "balance_loss_clip": 0.06302896, + "balance_loss_mlp": 0.01265939, + "epoch": 0.11561701488050503, + "flos": 24579031818240.0, + "grad_norm": 1.9630494189649508, + "language_loss": 0.85855269, + "learning_rate": 3.923597986603456e-06, + "loss": 0.93810248, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38037109, + "step": 1923, + "time_per_iteration": 2.6439831256866455 + }, + { + "auxiliary_loss_clip": 0.06647194, + "auxiliary_loss_mlp": 0.01294133, + "balance_loss_clip": 0.0630134, + "balance_loss_mlp": 0.01258465, + "epoch": 0.115677138133173, + "flos": 17098283001600.0, + "grad_norm": 2.06344411433486, + "language_loss": 0.8208636, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.90027684, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.35668945, + "step": 1924, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.06494077, + "auxiliary_loss_mlp": 0.01268349, + "balance_loss_clip": 0.06289093, + "balance_loss_mlp": 0.01252459, + "epoch": 0.11573726138584098, + "flos": 62724032317440.0, + "grad_norm": 0.8075731701213882, + "language_loss": 0.60936594, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6869902, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.15881348, + "step": 1925, + "time_per_iteration": 3.2047207355499268 + }, + { + "auxiliary_loss_clip": 0.06648477, + "auxiliary_loss_mlp": 0.01300045, + "balance_loss_clip": 0.06303966, + "balance_loss_mlp": 0.01261016, + "epoch": 0.11579738463850894, + "flos": 22607665608960.0, + "grad_norm": 2.013389480073572, + "language_loss": 0.76518846, + "learning_rate": 3.923277805217161e-06, + "loss": 0.84467369, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.39038086, + "step": 1926, + "time_per_iteration": 2.55283784866333 + }, + { + "auxiliary_loss_clip": 0.06666763, + "auxiliary_loss_mlp": 0.01299238, + "balance_loss_clip": 0.06301835, + "balance_loss_mlp": 0.01255583, + "epoch": 0.11585750789117691, + "flos": 21732630220800.0, + "grad_norm": 5.887246019394102, + "language_loss": 0.7431767, + "learning_rate": 3.923170932221222e-06, + "loss": 0.82283664, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.43652344, + "step": 1927, + "time_per_iteration": 2.560518503189087 + }, + { + "auxiliary_loss_clip": 0.06652652, + "auxiliary_loss_mlp": 0.01306042, + "balance_loss_clip": 0.0630243, + "balance_loss_mlp": 0.01264986, + "epoch": 0.11591763114384489, + "flos": 26294917328640.0, + "grad_norm": 2.5509114333241873, + "language_loss": 0.88765574, + "learning_rate": 3.92306398629845e-06, + "loss": 0.96724266, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.41064453, + "step": 1928, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.06657438, + "auxiliary_loss_mlp": 0.01301093, + "balance_loss_clip": 0.06300268, + "balance_loss_mlp": 0.01261468, + "epoch": 0.11597775439651285, + "flos": 23006721479040.0, + "grad_norm": 2.0893495121762844, + "language_loss": 0.7806766, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86026198, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1929, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.06650299, + "auxiliary_loss_mlp": 0.01295794, + "balance_loss_clip": 0.06304935, + "balance_loss_mlp": 0.01259626, + "epoch": 0.11603787764918082, + "flos": 31949845678080.0, + "grad_norm": 1.6257603780251215, + "language_loss": 0.78351086, + "learning_rate": 3.922849875688626e-06, + "loss": 0.86297178, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36181641, + "step": 1930, + "time_per_iteration": 2.6880123615264893 + }, + { + "auxiliary_loss_clip": 0.06647912, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01257438, + "epoch": 0.1160980009018488, + "flos": 22277944592640.0, + "grad_norm": 1.7868265367767153, + "language_loss": 0.73173678, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81117314, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.3828125, + "step": 1931, + "time_per_iteration": 2.5717685222625732 + }, + { + "auxiliary_loss_clip": 0.06652078, + "auxiliary_loss_mlp": 0.01303044, + "balance_loss_clip": 0.06304099, + "balance_loss_mlp": 0.01264539, + "epoch": 0.11615812415451676, + "flos": 22790205728640.0, + "grad_norm": 1.6665760080165584, + "language_loss": 0.8340829, + "learning_rate": 3.922635473420164e-06, + "loss": 0.91363412, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38500977, + "step": 1932, + "time_per_iteration": 2.601752519607544 + }, + { + "auxiliary_loss_clip": 0.0648433, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01250242, + "epoch": 0.11621824740718473, + "flos": 67165483438080.0, + "grad_norm": 0.7530575515980809, + "language_loss": 0.61312342, + "learning_rate": 3.922528162924105e-06, + "loss": 0.69062018, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.15075684, + "step": 1933, + "time_per_iteration": 3.078101873397827 + }, + { + "auxiliary_loss_clip": 0.06656399, + "auxiliary_loss_mlp": 0.01297791, + "balance_loss_clip": 0.06303177, + "balance_loss_mlp": 0.01259239, + "epoch": 0.11627837065985269, + "flos": 20382160366080.0, + "grad_norm": 2.5724054750959446, + "language_loss": 0.8773917, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95693362, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38574219, + "step": 1934, + "time_per_iteration": 2.5999112129211426 + }, + { + "auxiliary_loss_clip": 0.06669597, + "auxiliary_loss_mlp": 0.01303802, + "balance_loss_clip": 0.0630424, + "balance_loss_mlp": 0.01260386, + "epoch": 0.11633849391252067, + "flos": 21732252877440.0, + "grad_norm": 3.12484100633917, + "language_loss": 0.67964768, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.75938165, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.43408203, + "step": 1935, + "time_per_iteration": 2.5801587104797363 + }, + { + "auxiliary_loss_clip": 0.06657647, + "auxiliary_loss_mlp": 0.01296559, + "balance_loss_clip": 0.06305058, + "balance_loss_mlp": 0.01259485, + "epoch": 0.11639861716518864, + "flos": 18811023984000.0, + "grad_norm": 1.935927362539055, + "language_loss": 0.77021551, + "learning_rate": 3.922205794037456e-06, + "loss": 0.84975761, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.37084961, + "step": 1936, + "time_per_iteration": 2.5624840259552 + }, + { + "auxiliary_loss_clip": 0.06655373, + "auxiliary_loss_mlp": 0.01299017, + "balance_loss_clip": 0.06303351, + "balance_loss_mlp": 0.01259678, + "epoch": 0.1164587404178566, + "flos": 21221333406720.0, + "grad_norm": 1.9207342779057202, + "language_loss": 0.85928023, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93882406, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.39355469, + "step": 1937, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.06649198, + "auxiliary_loss_mlp": 0.01298206, + "balance_loss_clip": 0.06305847, + "balance_loss_mlp": 0.01261561, + "epoch": 0.11651886367052458, + "flos": 27826040586240.0, + "grad_norm": 2.6065443485594613, + "language_loss": 0.78032261, + "learning_rate": 3.921990516988384e-06, + "loss": 0.85979664, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36645508, + "step": 1938, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.06663075, + "auxiliary_loss_mlp": 0.01303768, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.01266098, + "epoch": 0.11657898692319255, + "flos": 22895570638080.0, + "grad_norm": 1.931552039208485, + "language_loss": 0.80530608, + "learning_rate": 3.921882769138696e-06, + "loss": 0.88497448, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37670898, + "step": 1939, + "time_per_iteration": 2.5451977252960205 + }, + { + "auxiliary_loss_clip": 0.06656967, + "auxiliary_loss_mlp": 0.01296552, + "balance_loss_clip": 0.06312265, + "balance_loss_mlp": 0.01261409, + "epoch": 0.11663911017586051, + "flos": 24322712578560.0, + "grad_norm": 2.6690615994939795, + "language_loss": 0.88347197, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.96300709, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.3515625, + "step": 1940, + "time_per_iteration": 2.572737216949463 + }, + { + "auxiliary_loss_clip": 0.06642211, + "auxiliary_loss_mlp": 0.01298321, + "balance_loss_clip": 0.06303503, + "balance_loss_mlp": 0.01262987, + "epoch": 0.11669923342852849, + "flos": 42350020525440.0, + "grad_norm": 1.538525373225641, + "language_loss": 0.7696858, + "learning_rate": 3.921667054809449e-06, + "loss": 0.84909111, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35327148, + "step": 1941, + "time_per_iteration": 2.72994065284729 + }, + { + "auxiliary_loss_clip": 0.06658466, + "auxiliary_loss_mlp": 0.01294978, + "balance_loss_clip": 0.06313083, + "balance_loss_mlp": 0.01259525, + "epoch": 0.11675935668119646, + "flos": 14646660716160.0, + "grad_norm": 2.147321627209633, + "language_loss": 0.9028796, + "learning_rate": 3.921559088338068e-06, + "loss": 0.98241401, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.35449219, + "step": 1942, + "time_per_iteration": 2.550832986831665 + }, + { + "auxiliary_loss_clip": 0.06645136, + "auxiliary_loss_mlp": 0.0129601, + "balance_loss_clip": 0.06305736, + "balance_loss_mlp": 0.01262154, + "epoch": 0.11681947993386442, + "flos": 35125213605120.0, + "grad_norm": 1.8932460092328547, + "language_loss": 0.69414169, + "learning_rate": 3.921451049000975e-06, + "loss": 0.77355313, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.33813477, + "step": 1943, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.06646268, + "auxiliary_loss_mlp": 0.01301771, + "balance_loss_clip": 0.06305961, + "balance_loss_mlp": 0.01264721, + "epoch": 0.11687960318653239, + "flos": 38992531749120.0, + "grad_norm": 3.030291623904481, + "language_loss": 0.71275461, + "learning_rate": 3.921342936802265e-06, + "loss": 0.79223496, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37060547, + "step": 1944, + "time_per_iteration": 2.8050050735473633 + }, + { + "auxiliary_loss_clip": 0.06641431, + "auxiliary_loss_mlp": 0.01296797, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01261606, + "epoch": 0.11693972643920036, + "flos": 26002190689920.0, + "grad_norm": 1.654338946560172, + "language_loss": 0.83736217, + "learning_rate": 3.921234751746038e-06, + "loss": 0.91674441, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35205078, + "step": 1945, + "time_per_iteration": 2.6361136436462402 + }, + { + "auxiliary_loss_clip": 0.06650846, + "auxiliary_loss_mlp": 0.01293506, + "balance_loss_clip": 0.06312834, + "balance_loss_mlp": 0.01259579, + "epoch": 0.11699984969186833, + "flos": 27279552257280.0, + "grad_norm": 2.078454883436641, + "language_loss": 0.78074771, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.86019123, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.33935547, + "step": 1946, + "time_per_iteration": 2.6417500972747803 + }, + { + "auxiliary_loss_clip": 0.06645864, + "auxiliary_loss_mlp": 0.01291798, + "balance_loss_clip": 0.06307344, + "balance_loss_mlp": 0.01256083, + "epoch": 0.1170599729445363, + "flos": 15273217221120.0, + "grad_norm": 2.310732730392425, + "language_loss": 0.70257539, + "learning_rate": 3.921018163077448e-06, + "loss": 0.78195202, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.35717773, + "step": 1947, + "time_per_iteration": 2.536513090133667 + }, + { + "auxiliary_loss_clip": 0.0665355, + "auxiliary_loss_mlp": 0.01301689, + "balance_loss_clip": 0.0630812, + "balance_loss_mlp": 0.01263113, + "epoch": 0.11712009619720427, + "flos": 17170007696640.0, + "grad_norm": 1.8188768357243443, + "language_loss": 0.86507225, + "learning_rate": 3.920909759473295e-06, + "loss": 0.94462466, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.38574219, + "step": 1948, + "time_per_iteration": 2.515779495239258 + }, + { + "auxiliary_loss_clip": 0.06494473, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06290484, + "balance_loss_mlp": 0.01249031, + "epoch": 0.11718021944987224, + "flos": 70961076887040.0, + "grad_norm": 2.567078438362061, + "language_loss": 0.65165019, + "learning_rate": 3.920801283028054e-06, + "loss": 0.72925287, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16772461, + "step": 1949, + "time_per_iteration": 3.177534341812134 + }, + { + "auxiliary_loss_clip": 0.06637877, + "auxiliary_loss_mlp": 0.0129446, + "balance_loss_clip": 0.06306669, + "balance_loss_mlp": 0.01261344, + "epoch": 0.1172403427025402, + "flos": 27460750711680.0, + "grad_norm": 1.6361907196052987, + "language_loss": 0.73358595, + "learning_rate": 3.920692733745835e-06, + "loss": 0.81290931, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33129883, + "step": 1950, + "time_per_iteration": 4.022751808166504 + }, + { + "auxiliary_loss_clip": 0.06660106, + "auxiliary_loss_mlp": 0.01302647, + "balance_loss_clip": 0.063132, + "balance_loss_mlp": 0.01265382, + "epoch": 0.11730046595520818, + "flos": 15674075953920.0, + "grad_norm": 2.7331916034067363, + "language_loss": 0.77657926, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.37280273, + "step": 1951, + "time_per_iteration": 2.5281777381896973 + }, + { + "auxiliary_loss_clip": 0.06648034, + "auxiliary_loss_mlp": 0.01294944, + "balance_loss_clip": 0.06303104, + "balance_loss_mlp": 0.01259801, + "epoch": 0.11736058920787615, + "flos": 25637320085760.0, + "grad_norm": 1.948975435069226, + "language_loss": 0.77674389, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.85617363, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.35131836, + "step": 1952, + "time_per_iteration": 4.001826286315918 + }, + { + "auxiliary_loss_clip": 0.06657356, + "auxiliary_loss_mlp": 0.01307688, + "balance_loss_clip": 0.06309209, + "balance_loss_mlp": 0.01270828, + "epoch": 0.11742071246054411, + "flos": 21440742122880.0, + "grad_norm": 9.62552088472932, + "language_loss": 0.73713255, + "learning_rate": 3.920366648918491e-06, + "loss": 0.81678301, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.3684082, + "step": 1953, + "time_per_iteration": 2.5549252033233643 + }, + { + "auxiliary_loss_clip": 0.06670918, + "auxiliary_loss_mlp": 0.0130466, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.01266203, + "epoch": 0.11748083571321208, + "flos": 16003377699840.0, + "grad_norm": 2.536716983337743, + "language_loss": 0.80894691, + "learning_rate": 3.920257808329552e-06, + "loss": 0.88870263, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.38452148, + "step": 1954, + "time_per_iteration": 2.5963521003723145 + }, + { + "auxiliary_loss_clip": 0.06659664, + "auxiliary_loss_mlp": 0.01298566, + "balance_loss_clip": 0.06309056, + "balance_loss_mlp": 0.01260037, + "epoch": 0.11754095896588006, + "flos": 16185582403200.0, + "grad_norm": 1.9904438509588216, + "language_loss": 0.86966431, + "learning_rate": 3.920148894924246e-06, + "loss": 0.94924664, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38500977, + "step": 1955, + "time_per_iteration": 3.9597103595733643 + }, + { + "auxiliary_loss_clip": 0.06656501, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06311554, + "balance_loss_mlp": 0.01262962, + "epoch": 0.11760108221854802, + "flos": 13266701424000.0, + "grad_norm": 2.228472811519511, + "language_loss": 0.79745102, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8769868, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.34130859, + "step": 1956, + "time_per_iteration": 3.990912437438965 + }, + { + "auxiliary_loss_clip": 0.0665153, + "auxiliary_loss_mlp": 0.01299416, + "balance_loss_clip": 0.06313992, + "balance_loss_mlp": 0.01266014, + "epoch": 0.11766120547121599, + "flos": 24505294625280.0, + "grad_norm": 2.0751916947238755, + "language_loss": 0.81691504, + "learning_rate": 3.91993084968105e-06, + "loss": 0.89642453, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.33398438, + "step": 1957, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.06660254, + "auxiliary_loss_mlp": 0.01296947, + "balance_loss_clip": 0.06313962, + "balance_loss_mlp": 0.01261757, + "epoch": 0.11772132872388397, + "flos": 17789562385920.0, + "grad_norm": 3.000987002447453, + "language_loss": 0.80231309, + "learning_rate": 3.919821717851428e-06, + "loss": 0.88188511, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35180664, + "step": 1958, + "time_per_iteration": 2.5531046390533447 + }, + { + "auxiliary_loss_clip": 0.06667449, + "auxiliary_loss_mlp": 0.01302997, + "balance_loss_clip": 0.06316346, + "balance_loss_mlp": 0.01263968, + "epoch": 0.11778145197655193, + "flos": 13220776586880.0, + "grad_norm": 3.2848276198767725, + "language_loss": 0.78886813, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86857259, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.39038086, + "step": 1959, + "time_per_iteration": 2.57987642288208 + }, + { + "auxiliary_loss_clip": 0.06661299, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06313363, + "balance_loss_mlp": 0.0125656, + "epoch": 0.1178415752292199, + "flos": 20236446915840.0, + "grad_norm": 2.2069161558777033, + "language_loss": 0.72216022, + "learning_rate": 3.919603235796832e-06, + "loss": 0.80167806, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.33911133, + "step": 1960, + "time_per_iteration": 2.568760633468628 + }, + { + "auxiliary_loss_clip": 0.06675136, + "auxiliary_loss_mlp": 0.0129754, + "balance_loss_clip": 0.0632275, + "balance_loss_mlp": 0.01260156, + "epoch": 0.11790169848188788, + "flos": 13044777085440.0, + "grad_norm": 2.729190408722114, + "language_loss": 0.83173323, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91146004, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.3737793, + "step": 1961, + "time_per_iteration": 2.5375704765319824 + }, + { + "auxiliary_loss_clip": 0.06648357, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 0.06310797, + "balance_loss_mlp": 0.01261671, + "epoch": 0.11796182173455584, + "flos": 22271026631040.0, + "grad_norm": 1.7537121481691995, + "language_loss": 0.93383837, + "learning_rate": 3.919384462576049e-06, + "loss": 1.01327002, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33105469, + "step": 1962, + "time_per_iteration": 2.5976755619049072 + }, + { + "auxiliary_loss_clip": 0.06656337, + "auxiliary_loss_mlp": 0.01295869, + "balance_loss_clip": 0.06308894, + "balance_loss_mlp": 0.0125994, + "epoch": 0.1180219449872238, + "flos": 10639750469760.0, + "grad_norm": 2.255465148131723, + "language_loss": 0.89418864, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97371072, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35961914, + "step": 1963, + "time_per_iteration": 2.543811321258545 + }, + { + "auxiliary_loss_clip": 0.06669922, + "auxiliary_loss_mlp": 0.01296273, + "balance_loss_clip": 0.0631619, + "balance_loss_mlp": 0.01260963, + "epoch": 0.11808206823989177, + "flos": 20929906506240.0, + "grad_norm": 1.978622705265592, + "language_loss": 0.85645056, + "learning_rate": 3.919165398222265e-06, + "loss": 0.93611252, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35327148, + "step": 1964, + "time_per_iteration": 2.623378276824951 + }, + { + "auxiliary_loss_clip": 0.06654269, + "auxiliary_loss_mlp": 0.01293841, + "balance_loss_clip": 0.06309862, + "balance_loss_mlp": 0.01258722, + "epoch": 0.11814219149255975, + "flos": 20784151128960.0, + "grad_norm": 2.5088973707394833, + "language_loss": 0.84141672, + "learning_rate": 3.919055756880879e-06, + "loss": 0.92089784, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.35107422, + "step": 1965, + "time_per_iteration": 2.5660836696624756 + }, + { + "auxiliary_loss_clip": 0.0666364, + "auxiliary_loss_mlp": 0.01301878, + "balance_loss_clip": 0.06310593, + "balance_loss_mlp": 0.01261681, + "epoch": 0.11820231474522772, + "flos": 48770594357760.0, + "grad_norm": 7.622964926374016, + "language_loss": 0.75756431, + "learning_rate": 3.918946042768707e-06, + "loss": 0.83721948, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 1966, + "time_per_iteration": 2.82966947555542 + }, + { + "auxiliary_loss_clip": 0.06671088, + "auxiliary_loss_mlp": 0.01309316, + "balance_loss_clip": 0.06322029, + "balance_loss_mlp": 0.01273887, + "epoch": 0.11826243799789568, + "flos": 16696166457600.0, + "grad_norm": 4.386609320764267, + "language_loss": 0.74750423, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8273083, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.35449219, + "step": 1967, + "time_per_iteration": 2.5282158851623535 + }, + { + "auxiliary_loss_clip": 0.06658092, + "auxiliary_loss_mlp": 0.01304409, + "balance_loss_clip": 0.06307551, + "balance_loss_mlp": 0.01268003, + "epoch": 0.11832256125056366, + "flos": 16915533246720.0, + "grad_norm": 2.9401944207789934, + "language_loss": 0.90244436, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.98206937, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36401367, + "step": 1968, + "time_per_iteration": 2.573209285736084 + }, + { + "auxiliary_loss_clip": 0.06659393, + "auxiliary_loss_mlp": 0.01300215, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01266264, + "epoch": 0.11838268450323162, + "flos": 22827032398080.0, + "grad_norm": 2.909458687960279, + "language_loss": 0.68506658, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76466268, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33935547, + "step": 1969, + "time_per_iteration": 2.574584484100342 + }, + { + "auxiliary_loss_clip": 0.06652254, + "auxiliary_loss_mlp": 0.01317322, + "balance_loss_clip": 0.06307729, + "balance_loss_mlp": 0.01281034, + "epoch": 0.11844280775589959, + "flos": 33554035296000.0, + "grad_norm": 1.9192483322460232, + "language_loss": 0.81922328, + "learning_rate": 3.918506458695399e-06, + "loss": 0.89891899, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36303711, + "step": 1970, + "time_per_iteration": 2.688477039337158 + }, + { + "auxiliary_loss_clip": 0.06493312, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06287479, + "balance_loss_mlp": 0.01257163, + "epoch": 0.11850293100856757, + "flos": 66371522474880.0, + "grad_norm": 0.7778041955901001, + "language_loss": 0.66349763, + "learning_rate": 3.918396380791754e-06, + "loss": 0.74115324, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1505127, + "step": 1971, + "time_per_iteration": 3.1715264320373535 + }, + { + "auxiliary_loss_clip": 0.06664559, + "auxiliary_loss_mlp": 0.01309662, + "balance_loss_clip": 0.06317366, + "balance_loss_mlp": 0.01274996, + "epoch": 0.11856305426123553, + "flos": 24687960526080.0, + "grad_norm": 2.78038897761295, + "language_loss": 0.81843936, + "learning_rate": 3.918286230142327e-06, + "loss": 0.89818156, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.34643555, + "step": 1972, + "time_per_iteration": 2.6285483837127686 + }, + { + "auxiliary_loss_clip": 0.06645221, + "auxiliary_loss_mlp": 0.01320916, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01286179, + "epoch": 0.1186231775139035, + "flos": 24287017939200.0, + "grad_norm": 2.7493832888964116, + "language_loss": 0.746387, + "learning_rate": 3.918176006751292e-06, + "loss": 0.82604837, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.34716797, + "step": 1973, + "time_per_iteration": 2.607680082321167 + }, + { + "auxiliary_loss_clip": 0.06639803, + "auxiliary_loss_mlp": 0.0131421, + "balance_loss_clip": 0.06300108, + "balance_loss_mlp": 0.01277851, + "epoch": 0.11868330076657148, + "flos": 21763042053120.0, + "grad_norm": 1.6365219196166583, + "language_loss": 0.73750299, + "learning_rate": 3.918065710622832e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.36352539, + "step": 1974, + "time_per_iteration": 2.603078603744507 + }, + { + "auxiliary_loss_clip": 0.06653641, + "auxiliary_loss_mlp": 0.01323127, + "balance_loss_clip": 0.06305285, + "balance_loss_mlp": 0.01286196, + "epoch": 0.11874342401923944, + "flos": 17197568490240.0, + "grad_norm": 3.7102130607090893, + "language_loss": 0.79475862, + "learning_rate": 3.917955341761128e-06, + "loss": 0.87452626, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.36914062, + "step": 1975, + "time_per_iteration": 2.529472827911377 + }, + { + "auxiliary_loss_clip": 0.06637481, + "auxiliary_loss_mlp": 0.01318957, + "balance_loss_clip": 0.06305119, + "balance_loss_mlp": 0.01286246, + "epoch": 0.11880354727190741, + "flos": 15234629616000.0, + "grad_norm": 3.277775960681522, + "language_loss": 0.77101427, + "learning_rate": 3.917844900170364e-06, + "loss": 0.85057861, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32714844, + "step": 1976, + "time_per_iteration": 2.5576260089874268 + }, + { + "auxiliary_loss_clip": 0.06648317, + "auxiliary_loss_mlp": 0.01301156, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.0126537, + "epoch": 0.11886367052457537, + "flos": 27317343248640.0, + "grad_norm": 1.6788870618385208, + "language_loss": 0.76201534, + "learning_rate": 3.91773438585473e-06, + "loss": 0.84151006, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.35791016, + "step": 1977, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.06654633, + "auxiliary_loss_mlp": 0.01297753, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01261346, + "epoch": 0.11892379377724335, + "flos": 21804648405120.0, + "grad_norm": 2.329560685386949, + "language_loss": 0.75601208, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.835536, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.36401367, + "step": 1978, + "time_per_iteration": 2.556502103805542 + }, + { + "auxiliary_loss_clip": 0.06647499, + "auxiliary_loss_mlp": 0.01294249, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.0126068, + "epoch": 0.11898391702991132, + "flos": 13996191070080.0, + "grad_norm": 1.8023230195278173, + "language_loss": 0.74423146, + "learning_rate": 3.917513139065616e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33569336, + "step": 1979, + "time_per_iteration": 2.595372200012207 + }, + { + "auxiliary_loss_clip": 0.0664144, + "auxiliary_loss_mlp": 0.01296465, + "balance_loss_clip": 0.06302245, + "balance_loss_mlp": 0.01261965, + "epoch": 0.11904404028257928, + "flos": 32242907733120.0, + "grad_norm": 1.646895354500375, + "language_loss": 0.99974936, + "learning_rate": 3.917402406600525e-06, + "loss": 1.07912838, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.34521484, + "step": 1980, + "time_per_iteration": 2.6381077766418457 + }, + { + "auxiliary_loss_clip": 0.06647406, + "auxiliary_loss_mlp": 0.01292706, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256299, + "epoch": 0.11910416353524726, + "flos": 23592971370240.0, + "grad_norm": 2.6857595325388095, + "language_loss": 0.87083352, + "learning_rate": 3.917291601427342e-06, + "loss": 0.95023465, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.36401367, + "step": 1981, + "time_per_iteration": 2.5953710079193115 + }, + { + "auxiliary_loss_clip": 0.0664432, + "auxiliary_loss_mlp": 0.01298025, + "balance_loss_clip": 0.06305191, + "balance_loss_mlp": 0.01263287, + "epoch": 0.11916428678791523, + "flos": 25339268712960.0, + "grad_norm": 1.936683956575477, + "language_loss": 0.86578631, + "learning_rate": 3.91718072355027e-06, + "loss": 0.94520986, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.34765625, + "step": 1982, + "time_per_iteration": 2.5845234394073486 + }, + { + "auxiliary_loss_clip": 0.06636401, + "auxiliary_loss_mlp": 0.01296498, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126095, + "epoch": 0.11922441004058319, + "flos": 19793939904000.0, + "grad_norm": 2.0505681107153273, + "language_loss": 0.86230731, + "learning_rate": 3.917069772973513e-06, + "loss": 0.94163632, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1983, + "time_per_iteration": 2.554844379425049 + }, + { + "auxiliary_loss_clip": 0.06654783, + "auxiliary_loss_mlp": 0.01292763, + "balance_loss_clip": 0.06302382, + "balance_loss_mlp": 0.01256858, + "epoch": 0.11928453329325117, + "flos": 21541578912000.0, + "grad_norm": 3.6464912777756373, + "language_loss": 0.78593659, + "learning_rate": 3.916958749701277e-06, + "loss": 0.86541206, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.35913086, + "step": 1984, + "time_per_iteration": 2.5320324897766113 + }, + { + "auxiliary_loss_clip": 0.06647135, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 0.0630364, + "balance_loss_mlp": 0.0125574, + "epoch": 0.11934465654591914, + "flos": 20821522849920.0, + "grad_norm": 1.8707303629344072, + "language_loss": 0.84522444, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.92462277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36962891, + "step": 1985, + "time_per_iteration": 2.6096858978271484 + }, + { + "auxiliary_loss_clip": 0.06641059, + "auxiliary_loss_mlp": 0.01296367, + "balance_loss_clip": 0.06304613, + "balance_loss_mlp": 0.01263346, + "epoch": 0.1194047797985871, + "flos": 19066169266560.0, + "grad_norm": 3.6983230286651945, + "language_loss": 0.75468755, + "learning_rate": 3.916736485087216e-06, + "loss": 0.83406186, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.33007812, + "step": 1986, + "time_per_iteration": 2.497166633605957 + }, + { + "auxiliary_loss_clip": 0.06650525, + "auxiliary_loss_mlp": 0.01300056, + "balance_loss_clip": 0.06311469, + "balance_loss_mlp": 0.01265771, + "epoch": 0.11946490305125507, + "flos": 27196842677760.0, + "grad_norm": 2.5090300356015227, + "language_loss": 0.73365855, + "learning_rate": 3.916625243753819e-06, + "loss": 0.81316435, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.34301758, + "step": 1987, + "time_per_iteration": 2.6316466331481934 + }, + { + "auxiliary_loss_clip": 0.06659403, + "auxiliary_loss_mlp": 0.01313937, + "balance_loss_clip": 0.06313819, + "balance_loss_mlp": 0.01275886, + "epoch": 0.11952502630392305, + "flos": 21146925381120.0, + "grad_norm": 1.9895182313514284, + "language_loss": 0.73564172, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81537521, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38012695, + "step": 1988, + "time_per_iteration": 2.538780450820923 + }, + { + "auxiliary_loss_clip": 0.06646325, + "auxiliary_loss_mlp": 0.01300531, + "balance_loss_clip": 0.06309503, + "balance_loss_mlp": 0.01265817, + "epoch": 0.11958514955659101, + "flos": 22130260571520.0, + "grad_norm": 2.1843811344265434, + "language_loss": 0.82602763, + "learning_rate": 3.91640254305538e-06, + "loss": 0.90549618, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34716797, + "step": 1989, + "time_per_iteration": 2.6741979122161865 + }, + { + "auxiliary_loss_clip": 0.06651568, + "auxiliary_loss_mlp": 0.01303723, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01266482, + "epoch": 0.11964527280925898, + "flos": 17427333185280.0, + "grad_norm": 3.1495832164614828, + "language_loss": 0.77526391, + "learning_rate": 3.916291083698784e-06, + "loss": 0.85481679, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.37255859, + "step": 1990, + "time_per_iteration": 3.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.06541168, + "auxiliary_loss_mlp": 0.0131986, + "balance_loss_clip": 0.06337936, + "balance_loss_mlp": 0.01304852, + "epoch": 0.11970539606192696, + "flos": 70698804007680.0, + "grad_norm": 0.8660684283454352, + "language_loss": 0.55407226, + "learning_rate": 3.916179551676238e-06, + "loss": 0.63268256, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14978027, + "step": 1991, + "time_per_iteration": 4.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.06638116, + "auxiliary_loss_mlp": 0.01295675, + "balance_loss_clip": 0.06307568, + "balance_loss_mlp": 0.01263345, + "epoch": 0.11976551931459492, + "flos": 21221375333760.0, + "grad_norm": 2.476959921909238, + "language_loss": 0.79074007, + "learning_rate": 3.916067946991971e-06, + "loss": 0.87007797, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32348633, + "step": 1992, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.06650865, + "auxiliary_loss_mlp": 0.01302479, + "balance_loss_clip": 0.06309184, + "balance_loss_mlp": 0.01267647, + "epoch": 0.11982564256726289, + "flos": 25995566217600.0, + "grad_norm": 2.0953190944700215, + "language_loss": 0.800017, + "learning_rate": 3.915956269650216e-06, + "loss": 0.87955046, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.34838867, + "step": 1993, + "time_per_iteration": 2.5923471450805664 + }, + { + "auxiliary_loss_clip": 0.06641386, + "auxiliary_loss_mlp": 0.0130103, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01266793, + "epoch": 0.11988576581993086, + "flos": 21656964384000.0, + "grad_norm": 1.8929635889117382, + "language_loss": 0.83093858, + "learning_rate": 3.915844519655208e-06, + "loss": 0.91036278, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.3425293, + "step": 1994, + "time_per_iteration": 2.58314847946167 + }, + { + "auxiliary_loss_clip": 0.06638885, + "auxiliary_loss_mlp": 0.01299925, + "balance_loss_clip": 0.06306463, + "balance_loss_mlp": 0.01265617, + "epoch": 0.11994588907259883, + "flos": 17863048016640.0, + "grad_norm": 2.42141016996774, + "language_loss": 0.90494514, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98433328, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34301758, + "step": 1995, + "time_per_iteration": 5.38932991027832 + }, + { + "auxiliary_loss_clip": 0.06647271, + "auxiliary_loss_mlp": 0.01300085, + "balance_loss_clip": 0.06306107, + "balance_loss_mlp": 0.01263583, + "epoch": 0.1200060123252668, + "flos": 24469725767040.0, + "grad_norm": 3.463827549229225, + "language_loss": 0.75938386, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83885741, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36523438, + "step": 1996, + "time_per_iteration": 2.630936861038208 + }, + { + "auxiliary_loss_clip": 0.06633951, + "auxiliary_loss_mlp": 0.01306595, + "balance_loss_clip": 0.06300932, + "balance_loss_mlp": 0.01273097, + "epoch": 0.12006613557793476, + "flos": 18737831842560.0, + "grad_norm": 2.002664476767551, + "language_loss": 0.88733006, + "learning_rate": 3.915508833793048e-06, + "loss": 0.96673548, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.33496094, + "step": 1997, + "time_per_iteration": 2.542490243911743 + }, + { + "auxiliary_loss_clip": 0.06639601, + "auxiliary_loss_mlp": 0.01299934, + "balance_loss_clip": 0.06303362, + "balance_loss_mlp": 0.01265864, + "epoch": 0.12012625883060274, + "flos": 22273374545280.0, + "grad_norm": 2.268718132008626, + "language_loss": 0.8047471, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88414252, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34033203, + "step": 1998, + "time_per_iteration": 2.6070334911346436 + }, + { + "auxiliary_loss_clip": 0.06640439, + "auxiliary_loss_mlp": 0.01306471, + "balance_loss_clip": 0.06312488, + "balance_loss_mlp": 0.01272401, + "epoch": 0.1201863820832707, + "flos": 21764761061760.0, + "grad_norm": 2.100057893204002, + "language_loss": 0.73916173, + "learning_rate": 3.915284680029769e-06, + "loss": 0.81863081, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.34033203, + "step": 1999, + "time_per_iteration": 2.5563113689422607 + }, + { + "auxiliary_loss_clip": 0.0664693, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 0.06304446, + "balance_loss_mlp": 0.01263763, + "epoch": 0.12024650533593867, + "flos": 21914415653760.0, + "grad_norm": 2.961282874650153, + "language_loss": 0.76137137, + "learning_rate": 3.915172494204323e-06, + "loss": 0.84082401, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.34545898, + "step": 2000, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.0664265, + "auxiliary_loss_mlp": 0.0131017, + "balance_loss_clip": 0.06307586, + "balance_loss_mlp": 0.012756, + "epoch": 0.12030662858860665, + "flos": 21695635843200.0, + "grad_norm": 1.7187756113932227, + "language_loss": 0.86554497, + "learning_rate": 3.915060235755344e-06, + "loss": 0.94507325, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34545898, + "step": 2001, + "time_per_iteration": 2.575740098953247 + }, + { + "auxiliary_loss_clip": 0.06635608, + "auxiliary_loss_mlp": 0.01303825, + "balance_loss_clip": 0.06303231, + "balance_loss_mlp": 0.01270232, + "epoch": 0.12036675184127461, + "flos": 12938280145920.0, + "grad_norm": 3.0530773908117297, + "language_loss": 0.75370091, + "learning_rate": 3.91494790468709e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33618164, + "step": 2002, + "time_per_iteration": 2.5708627700805664 + }, + { + "auxiliary_loss_clip": 0.06653483, + "auxiliary_loss_mlp": 0.01301657, + "balance_loss_clip": 0.06308778, + "balance_loss_mlp": 0.01265322, + "epoch": 0.12042687509394258, + "flos": 20857469051520.0, + "grad_norm": 3.724600785525669, + "language_loss": 0.79714429, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87669575, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36352539, + "step": 2003, + "time_per_iteration": 2.5530362129211426 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01310661, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01276638, + "epoch": 0.12048699834661056, + "flos": 23885320665600.0, + "grad_norm": 3.082354768272036, + "language_loss": 0.72748882, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80699164, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34008789, + "step": 2004, + "time_per_iteration": 2.583922863006592 + }, + { + "auxiliary_loss_clip": 0.06642192, + "auxiliary_loss_mlp": 0.01300449, + "balance_loss_clip": 0.06302966, + "balance_loss_mlp": 0.01263899, + "epoch": 0.12054712159927852, + "flos": 19762605676800.0, + "grad_norm": 1.8151207739831152, + "language_loss": 0.79435182, + "learning_rate": 3.914610475809279e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36547852, + "step": 2005, + "time_per_iteration": 2.5544016361236572 + }, + { + "auxiliary_loss_clip": 0.06498255, + "auxiliary_loss_mlp": 0.01304889, + "balance_loss_clip": 0.06296292, + "balance_loss_mlp": 0.01289821, + "epoch": 0.12060724485194649, + "flos": 51688999411200.0, + "grad_norm": 0.895152271859771, + "language_loss": 0.5819217, + "learning_rate": 3.914497854306543e-06, + "loss": 0.65995312, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.15039062, + "step": 2006, + "time_per_iteration": 2.9925737380981445 + }, + { + "auxiliary_loss_clip": 0.06637617, + "auxiliary_loss_mlp": 0.01298518, + "balance_loss_clip": 0.06307045, + "balance_loss_mlp": 0.01264042, + "epoch": 0.12066736810461445, + "flos": 18996582850560.0, + "grad_norm": 2.2145885601274653, + "language_loss": 0.77570707, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.85506845, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34472656, + "step": 2007, + "time_per_iteration": 2.5426108837127686 + }, + { + "auxiliary_loss_clip": 0.0663473, + "auxiliary_loss_mlp": 0.01296019, + "balance_loss_clip": 0.06301288, + "balance_loss_mlp": 0.01260352, + "epoch": 0.12072749135728243, + "flos": 16477554355200.0, + "grad_norm": 3.5055454300142346, + "language_loss": 0.8601926, + "learning_rate": 3.914272393511494e-06, + "loss": 0.93950009, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35668945, + "step": 2008, + "time_per_iteration": 2.5499417781829834 + }, + { + "auxiliary_loss_clip": 0.06641807, + "auxiliary_loss_mlp": 0.01291488, + "balance_loss_clip": 0.06305657, + "balance_loss_mlp": 0.0125768, + "epoch": 0.1207876146099504, + "flos": 18082917930240.0, + "grad_norm": 2.14462830622821, + "language_loss": 0.84945571, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92878866, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.33813477, + "step": 2009, + "time_per_iteration": 2.5377557277679443 + }, + { + "auxiliary_loss_clip": 0.06634751, + "auxiliary_loss_mlp": 0.01300176, + "balance_loss_clip": 0.06306206, + "balance_loss_mlp": 0.01266225, + "epoch": 0.12084773786261836, + "flos": 21878008254720.0, + "grad_norm": 2.1676887329617336, + "language_loss": 0.85496145, + "learning_rate": 3.914046642358844e-06, + "loss": 0.93431073, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33959961, + "step": 2010, + "time_per_iteration": 2.577526330947876 + }, + { + "auxiliary_loss_clip": 0.06654292, + "auxiliary_loss_mlp": 0.0131443, + "balance_loss_clip": 0.06313477, + "balance_loss_mlp": 0.01277666, + "epoch": 0.12090786111528634, + "flos": 18338985607680.0, + "grad_norm": 2.943319840268963, + "language_loss": 0.85397738, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.93366468, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36767578, + "step": 2011, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.06651285, + "auxiliary_loss_mlp": 0.01306451, + "balance_loss_clip": 0.06310041, + "balance_loss_mlp": 0.01270975, + "epoch": 0.1209679843679543, + "flos": 21112236990720.0, + "grad_norm": 2.078534673475464, + "language_loss": 0.97477353, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05435085, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35498047, + "step": 2012, + "time_per_iteration": 2.607473611831665 + }, + { + "auxiliary_loss_clip": 0.06639741, + "auxiliary_loss_mlp": 0.01302196, + "balance_loss_clip": 0.06309405, + "balance_loss_mlp": 0.01268865, + "epoch": 0.12102810762062227, + "flos": 29248612479360.0, + "grad_norm": 1.9848767494674133, + "language_loss": 0.81610048, + "learning_rate": 3.913707471284283e-06, + "loss": 0.89551985, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2013, + "time_per_iteration": 2.616990566253662 + }, + { + "auxiliary_loss_clip": 0.06652003, + "auxiliary_loss_mlp": 0.01311561, + "balance_loss_clip": 0.06309032, + "balance_loss_mlp": 0.0127525, + "epoch": 0.12108823087329025, + "flos": 17936407866240.0, + "grad_norm": 5.4278493881784415, + "language_loss": 0.78293782, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.8625735, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36328125, + "step": 2014, + "time_per_iteration": 2.651820421218872 + }, + { + "auxiliary_loss_clip": 0.06640598, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 0.0630708, + "balance_loss_mlp": 0.01286344, + "epoch": 0.12114835412595822, + "flos": 22098549000960.0, + "grad_norm": 2.982829144387911, + "language_loss": 0.88284999, + "learning_rate": 3.913480994387535e-06, + "loss": 0.96245968, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34008789, + "step": 2015, + "time_per_iteration": 2.5447444915771484 + }, + { + "auxiliary_loss_clip": 0.06640744, + "auxiliary_loss_mlp": 0.01318151, + "balance_loss_clip": 0.06308715, + "balance_loss_mlp": 0.01284534, + "epoch": 0.12120847737862618, + "flos": 20418567765120.0, + "grad_norm": 2.096885211944344, + "language_loss": 0.70457768, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78416657, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3359375, + "step": 2016, + "time_per_iteration": 2.596148729324341 + }, + { + "auxiliary_loss_clip": 0.06646016, + "auxiliary_loss_mlp": 0.01314653, + "balance_loss_clip": 0.06304827, + "balance_loss_mlp": 0.01276792, + "epoch": 0.12126860063129415, + "flos": 22315484021760.0, + "grad_norm": 2.9748504234470214, + "language_loss": 0.80719239, + "learning_rate": 3.913254227253225e-06, + "loss": 0.8867991, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37890625, + "step": 2017, + "time_per_iteration": 2.531651020050049 + }, + { + "auxiliary_loss_clip": 0.06646961, + "auxiliary_loss_mlp": 0.01325201, + "balance_loss_clip": 0.06301364, + "balance_loss_mlp": 0.01289128, + "epoch": 0.12132872388396213, + "flos": 13704428753280.0, + "grad_norm": 11.74399096976628, + "language_loss": 0.70780957, + "learning_rate": 3.913140734857731e-06, + "loss": 0.78753114, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3605957, + "step": 2018, + "time_per_iteration": 2.555253267288208 + }, + { + "auxiliary_loss_clip": 0.06636061, + "auxiliary_loss_mlp": 0.01298517, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01264828, + "epoch": 0.12138884713663009, + "flos": 26473851722880.0, + "grad_norm": 2.8042762769346714, + "language_loss": 0.73802805, + "learning_rate": 3.91302716991575e-06, + "loss": 0.81737387, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33691406, + "step": 2019, + "time_per_iteration": 2.6203458309173584 + }, + { + "auxiliary_loss_clip": 0.06639916, + "auxiliary_loss_mlp": 0.01311356, + "balance_loss_clip": 0.06299765, + "balance_loss_mlp": 0.01277238, + "epoch": 0.12144897038929806, + "flos": 26148952316160.0, + "grad_norm": 1.829808829925435, + "language_loss": 0.93501657, + "learning_rate": 3.912913532431586e-06, + "loss": 1.01452923, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.34130859, + "step": 2020, + "time_per_iteration": 2.5888445377349854 + }, + { + "auxiliary_loss_clip": 0.06633772, + "auxiliary_loss_mlp": 0.01299116, + "balance_loss_clip": 0.06297548, + "balance_loss_mlp": 0.01263568, + "epoch": 0.12150909364196603, + "flos": 24724451779200.0, + "grad_norm": 2.526616616661372, + "language_loss": 0.78976464, + "learning_rate": 3.912799822409549e-06, + "loss": 0.86909354, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.35546875, + "step": 2021, + "time_per_iteration": 2.6022841930389404 + }, + { + "auxiliary_loss_clip": 0.0663517, + "auxiliary_loss_mlp": 0.01299013, + "balance_loss_clip": 0.06302813, + "balance_loss_mlp": 0.01266898, + "epoch": 0.121569216894634, + "flos": 25193177919360.0, + "grad_norm": 2.2515588789305645, + "language_loss": 0.8175382, + "learning_rate": 3.912686039853952e-06, + "loss": 0.89688003, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32128906, + "step": 2022, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0664625, + "auxiliary_loss_mlp": 0.01295093, + "balance_loss_clip": 0.06304103, + "balance_loss_mlp": 0.0125964, + "epoch": 0.12162934014730196, + "flos": 13449241543680.0, + "grad_norm": 2.226180845904462, + "language_loss": 0.8644762, + "learning_rate": 3.912572184769108e-06, + "loss": 0.94388956, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.35424805, + "step": 2023, + "time_per_iteration": 2.541822671890259 + }, + { + "auxiliary_loss_clip": 0.06652313, + "auxiliary_loss_mlp": 0.01299326, + "balance_loss_clip": 0.06306356, + "balance_loss_mlp": 0.01261394, + "epoch": 0.12168946339996994, + "flos": 16951772937600.0, + "grad_norm": 3.6496728157667477, + "language_loss": 0.87528783, + "learning_rate": 3.912458257159335e-06, + "loss": 0.95480424, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37963867, + "step": 2024, + "time_per_iteration": 2.510047674179077 + }, + { + "auxiliary_loss_clip": 0.06637174, + "auxiliary_loss_mlp": 0.01298516, + "balance_loss_clip": 0.06299831, + "balance_loss_mlp": 0.01262872, + "epoch": 0.12174958665263791, + "flos": 29828699095680.0, + "grad_norm": 2.180683853985422, + "language_loss": 0.73548269, + "learning_rate": 3.912344257028954e-06, + "loss": 0.8148396, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.35620117, + "step": 2025, + "time_per_iteration": 2.612072229385376 + }, + { + "auxiliary_loss_clip": 0.06640136, + "auxiliary_loss_mlp": 0.01296236, + "balance_loss_clip": 0.06301836, + "balance_loss_mlp": 0.01260425, + "epoch": 0.12180970990530587, + "flos": 24648366672000.0, + "grad_norm": 1.6158057232252747, + "language_loss": 0.77162802, + "learning_rate": 3.912230184382286e-06, + "loss": 0.85099173, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.35766602, + "step": 2026, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.06645372, + "auxiliary_loss_mlp": 0.01300506, + "balance_loss_clip": 0.06307228, + "balance_loss_mlp": 0.01264219, + "epoch": 0.12186983315797385, + "flos": 20527915743360.0, + "grad_norm": 2.387338120412035, + "language_loss": 0.90280318, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9822619, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.36254883, + "step": 2027, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.06634748, + "auxiliary_loss_mlp": 0.0129945, + "balance_loss_clip": 0.06304284, + "balance_loss_mlp": 0.01266905, + "epoch": 0.12192995641064182, + "flos": 27825705169920.0, + "grad_norm": 2.1781707070906644, + "language_loss": 0.76798415, + "learning_rate": 3.912001821557399e-06, + "loss": 0.84732616, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32543945, + "step": 2028, + "time_per_iteration": 2.578725576400757 + }, + { + "auxiliary_loss_clip": 0.0664517, + "auxiliary_loss_mlp": 0.01295232, + "balance_loss_clip": 0.06306128, + "balance_loss_mlp": 0.012614, + "epoch": 0.12199007966330978, + "flos": 22023512069760.0, + "grad_norm": 2.4518178731886318, + "language_loss": 0.78897178, + "learning_rate": 3.911887531387839e-06, + "loss": 0.86837584, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.33813477, + "step": 2029, + "time_per_iteration": 2.5508341789245605 + }, + { + "auxiliary_loss_clip": 0.06643746, + "auxiliary_loss_mlp": 0.01296807, + "balance_loss_clip": 0.06307071, + "balance_loss_mlp": 0.01262475, + "epoch": 0.12205020291597775, + "flos": 23302005667200.0, + "grad_norm": 2.091887383256169, + "language_loss": 0.80821085, + "learning_rate": 3.911773168719313e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34326172, + "step": 2030, + "time_per_iteration": 3.9340591430664062 + }, + { + "auxiliary_loss_clip": 0.06641008, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 0.06307271, + "balance_loss_mlp": 0.01263097, + "epoch": 0.12211032616864573, + "flos": 26038849651200.0, + "grad_norm": 4.123821558530392, + "language_loss": 0.75410855, + "learning_rate": 3.911658733556155e-06, + "loss": 0.83348382, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33398438, + "step": 2031, + "time_per_iteration": 4.0164101123809814 + }, + { + "auxiliary_loss_clip": 0.06642319, + "auxiliary_loss_mlp": 0.01298968, + "balance_loss_clip": 0.06307532, + "balance_loss_mlp": 0.01265947, + "epoch": 0.12217044942131369, + "flos": 20416932610560.0, + "grad_norm": 1.945082071582731, + "language_loss": 0.76790285, + "learning_rate": 3.911544225902707e-06, + "loss": 0.84731567, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33032227, + "step": 2032, + "time_per_iteration": 2.5583930015563965 + }, + { + "auxiliary_loss_clip": 0.0663031, + "auxiliary_loss_mlp": 0.01300948, + "balance_loss_clip": 0.06305249, + "balance_loss_mlp": 0.01266901, + "epoch": 0.12223057267398166, + "flos": 22863817140480.0, + "grad_norm": 1.7389762148633483, + "language_loss": 0.89850545, + "learning_rate": 3.911429645763311e-06, + "loss": 0.97781807, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34057617, + "step": 2033, + "time_per_iteration": 2.5717952251434326 + }, + { + "auxiliary_loss_clip": 0.06656118, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 0.06305313, + "balance_loss_mlp": 0.01260063, + "epoch": 0.12229069592664964, + "flos": 20053739088000.0, + "grad_norm": 2.329108980084039, + "language_loss": 0.67293733, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75245726, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.3581543, + "step": 2034, + "time_per_iteration": 5.42257833480835 + }, + { + "auxiliary_loss_clip": 0.06636314, + "auxiliary_loss_mlp": 0.01296044, + "balance_loss_clip": 0.06304356, + "balance_loss_mlp": 0.0126164, + "epoch": 0.1223508191793176, + "flos": 22280963339520.0, + "grad_norm": 1.830897331176389, + "language_loss": 0.77330279, + "learning_rate": 3.911200268044055e-06, + "loss": 0.85262644, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34375, + "step": 2035, + "time_per_iteration": 2.636413097381592 + }, + { + "auxiliary_loss_clip": 0.06651293, + "auxiliary_loss_mlp": 0.01293249, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01258893, + "epoch": 0.12241094243198557, + "flos": 21292009925760.0, + "grad_norm": 2.7740017238095187, + "language_loss": 0.73084652, + "learning_rate": 3.911085470472892e-06, + "loss": 0.81029195, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.34350586, + "step": 2036, + "time_per_iteration": 2.528167724609375 + }, + { + "auxiliary_loss_clip": 0.06639268, + "auxiliary_loss_mlp": 0.01290851, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01256185, + "epoch": 0.12247106568465355, + "flos": 17387823185280.0, + "grad_norm": 1.824605307650974, + "language_loss": 0.84228837, + "learning_rate": 3.910970600433178e-06, + "loss": 0.92158961, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.34692383, + "step": 2037, + "time_per_iteration": 2.554356575012207 + }, + { + "auxiliary_loss_clip": 0.06640968, + "auxiliary_loss_mlp": 0.0129909, + "balance_loss_clip": 0.06304546, + "balance_loss_mlp": 0.01265043, + "epoch": 0.12253118893732151, + "flos": 27051548497920.0, + "grad_norm": 3.231665500772768, + "language_loss": 0.81365263, + "learning_rate": 3.910855657929267e-06, + "loss": 0.89305323, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34057617, + "step": 2038, + "time_per_iteration": 2.5666050910949707 + }, + { + "auxiliary_loss_clip": 0.0649721, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06293084, + "balance_loss_mlp": 0.01256113, + "epoch": 0.12259131218998948, + "flos": 53878055328000.0, + "grad_norm": 0.7896182211698063, + "language_loss": 0.58607936, + "learning_rate": 3.910740642965518e-06, + "loss": 0.66374058, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.12817383, + "step": 2039, + "time_per_iteration": 3.1232099533081055 + }, + { + "auxiliary_loss_clip": 0.06641525, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06306375, + "balance_loss_mlp": 0.01261053, + "epoch": 0.12265143544265744, + "flos": 17897233282560.0, + "grad_norm": 3.4610063472864065, + "language_loss": 0.82137585, + "learning_rate": 3.910625555546292e-06, + "loss": 0.90073782, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.33569336, + "step": 2040, + "time_per_iteration": 2.5443432331085205 + }, + { + "auxiliary_loss_clip": 0.06629258, + "auxiliary_loss_mlp": 0.01288004, + "balance_loss_clip": 0.06301395, + "balance_loss_mlp": 0.01255031, + "epoch": 0.12271155869532542, + "flos": 21806577048960.0, + "grad_norm": 2.3749836007198546, + "language_loss": 0.84196723, + "learning_rate": 3.910510395675953e-06, + "loss": 0.92113984, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32983398, + "step": 2041, + "time_per_iteration": 2.5387189388275146 + }, + { + "auxiliary_loss_clip": 0.06646631, + "auxiliary_loss_mlp": 0.01292367, + "balance_loss_clip": 0.06301489, + "balance_loss_mlp": 0.0125627, + "epoch": 0.12277168194799339, + "flos": 19834917350400.0, + "grad_norm": 2.032940304960421, + "language_loss": 0.68564701, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.76503706, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.36083984, + "step": 2042, + "time_per_iteration": 2.5871469974517822 + }, + { + "auxiliary_loss_clip": 0.06626363, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 0.06293724, + "balance_loss_mlp": 0.01258957, + "epoch": 0.12283180520066135, + "flos": 23227597641600.0, + "grad_norm": 4.507885061874762, + "language_loss": 0.82501084, + "learning_rate": 3.910279858599409e-06, + "loss": 0.90418965, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32568359, + "step": 2043, + "time_per_iteration": 2.5436289310455322 + }, + { + "auxiliary_loss_clip": 0.06642601, + "auxiliary_loss_mlp": 0.01293474, + "balance_loss_clip": 0.06301275, + "balance_loss_mlp": 0.01260501, + "epoch": 0.12289192845332933, + "flos": 18594466306560.0, + "grad_norm": 1.8262165625903515, + "language_loss": 0.8169322, + "learning_rate": 3.910164481401946e-06, + "loss": 0.89629292, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.32983398, + "step": 2044, + "time_per_iteration": 2.5594139099121094 + }, + { + "auxiliary_loss_clip": 0.06635186, + "auxiliary_loss_mlp": 0.0128851, + "balance_loss_clip": 0.06299295, + "balance_loss_mlp": 0.01254416, + "epoch": 0.1229520517059973, + "flos": 25775612449920.0, + "grad_norm": 1.8452303970598702, + "language_loss": 0.79028547, + "learning_rate": 3.910049031770853e-06, + "loss": 0.86952239, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.34082031, + "step": 2045, + "time_per_iteration": 2.5465781688690186 + }, + { + "auxiliary_loss_clip": 0.06636953, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 0.06298777, + "balance_loss_mlp": 0.01262408, + "epoch": 0.12301217495866526, + "flos": 20893541034240.0, + "grad_norm": 1.9769865564806426, + "language_loss": 0.69156218, + "learning_rate": 3.90993350971051e-06, + "loss": 0.77088338, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.32763672, + "step": 2046, + "time_per_iteration": 2.5848565101623535 + }, + { + "auxiliary_loss_clip": 0.06628656, + "auxiliary_loss_mlp": 0.01290131, + "balance_loss_clip": 0.06297234, + "balance_loss_mlp": 0.01257277, + "epoch": 0.12307229821133324, + "flos": 22384735021440.0, + "grad_norm": 2.0992511324886713, + "language_loss": 0.73182803, + "learning_rate": 3.909817915225297e-06, + "loss": 0.8110159, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32861328, + "step": 2047, + "time_per_iteration": 2.5309009552001953 + }, + { + "auxiliary_loss_clip": 0.06630135, + "auxiliary_loss_mlp": 0.0129866, + "balance_loss_clip": 0.06297912, + "balance_loss_mlp": 0.01263732, + "epoch": 0.1231324214640012, + "flos": 23374065778560.0, + "grad_norm": 2.486188262823441, + "language_loss": 0.77457881, + "learning_rate": 3.909702248319597e-06, + "loss": 0.85386682, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.34912109, + "step": 2048, + "time_per_iteration": 2.6273012161254883 + }, + { + "auxiliary_loss_clip": 0.06627734, + "auxiliary_loss_mlp": 0.01290224, + "balance_loss_clip": 0.06297483, + "balance_loss_mlp": 0.01258514, + "epoch": 0.12319254471666917, + "flos": 23773624773120.0, + "grad_norm": 1.9256853930308273, + "language_loss": 0.8659687, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94514829, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31665039, + "step": 2049, + "time_per_iteration": 2.559253692626953 + }, + { + "auxiliary_loss_clip": 0.06639866, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 0.06300847, + "balance_loss_mlp": 0.01257751, + "epoch": 0.12325266796933713, + "flos": 23556899387520.0, + "grad_norm": 2.574663902354124, + "language_loss": 0.76814753, + "learning_rate": 3.909470697264285e-06, + "loss": 0.84746033, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.33691406, + "step": 2050, + "time_per_iteration": 2.6138648986816406 + }, + { + "auxiliary_loss_clip": 0.06634495, + "auxiliary_loss_mlp": 0.0128935, + "balance_loss_clip": 0.06301371, + "balance_loss_mlp": 0.01256353, + "epoch": 0.12331279122200511, + "flos": 24430593110400.0, + "grad_norm": 2.4676515957678826, + "language_loss": 0.82809746, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90733588, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.32983398, + "step": 2051, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.06631288, + "auxiliary_loss_mlp": 0.01288335, + "balance_loss_clip": 0.06299216, + "balance_loss_mlp": 0.01256625, + "epoch": 0.12337291447467308, + "flos": 25491438927360.0, + "grad_norm": 2.0266783151609666, + "language_loss": 0.81273621, + "learning_rate": 3.909238856579693e-06, + "loss": 0.89193243, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.3170166, + "step": 2052, + "time_per_iteration": 2.5801045894622803 + }, + { + "auxiliary_loss_clip": 0.06643972, + "auxiliary_loss_mlp": 0.012894, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.0125533, + "epoch": 0.12343303772734104, + "flos": 23556731679360.0, + "grad_norm": 2.520879144307052, + "language_loss": 0.75331706, + "learning_rate": 3.909122827637406e-06, + "loss": 0.83265078, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.34082031, + "step": 2053, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.06645267, + "auxiliary_loss_mlp": 0.01289892, + "balance_loss_clip": 0.06306874, + "balance_loss_mlp": 0.01256919, + "epoch": 0.12349316098000902, + "flos": 47567724670080.0, + "grad_norm": 1.6252086945457442, + "language_loss": 0.75631851, + "learning_rate": 3.909006726300991e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.32983398, + "step": 2054, + "time_per_iteration": 2.7952961921691895 + }, + { + "auxiliary_loss_clip": 0.06634779, + "auxiliary_loss_mlp": 0.01287596, + "balance_loss_clip": 0.06307411, + "balance_loss_mlp": 0.0125715, + "epoch": 0.12355328423267699, + "flos": 25052956911360.0, + "grad_norm": 1.7485213657356729, + "language_loss": 0.86270738, + "learning_rate": 3.908890552574849e-06, + "loss": 0.94193119, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30419922, + "step": 2055, + "time_per_iteration": 2.553056001663208 + }, + { + "auxiliary_loss_clip": 0.06643809, + "auxiliary_loss_mlp": 0.01295066, + "balance_loss_clip": 0.06311696, + "balance_loss_mlp": 0.0126226, + "epoch": 0.12361340748534495, + "flos": 27716524899840.0, + "grad_norm": 2.053117172443155, + "language_loss": 0.78908336, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.328125, + "step": 2056, + "time_per_iteration": 2.632049322128296 + }, + { + "auxiliary_loss_clip": 0.06652766, + "auxiliary_loss_mlp": 0.01294236, + "balance_loss_clip": 0.06316112, + "balance_loss_mlp": 0.01262002, + "epoch": 0.12367353073801293, + "flos": 26147778359040.0, + "grad_norm": 2.0516910638510835, + "language_loss": 0.84512216, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92459214, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.32226562, + "step": 2057, + "time_per_iteration": 2.5529589653015137 + }, + { + "auxiliary_loss_clip": 0.06650747, + "auxiliary_loss_mlp": 0.0129436, + "balance_loss_clip": 0.06317189, + "balance_loss_mlp": 0.01261553, + "epoch": 0.1237336539906809, + "flos": 25163143430400.0, + "grad_norm": 1.8863431007110945, + "language_loss": 0.7932052, + "learning_rate": 3.90854159710213e-06, + "loss": 0.87265623, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.328125, + "step": 2058, + "time_per_iteration": 2.636936902999878 + }, + { + "auxiliary_loss_clip": 0.06652544, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 0.06313539, + "balance_loss_mlp": 0.01259782, + "epoch": 0.12379377724334886, + "flos": 15310001963520.0, + "grad_norm": 2.1631103181071865, + "language_loss": 0.84899569, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92846489, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34619141, + "step": 2059, + "time_per_iteration": 2.534330129623413 + }, + { + "auxiliary_loss_clip": 0.06649262, + "auxiliary_loss_mlp": 0.01290616, + "balance_loss_clip": 0.06311791, + "balance_loss_mlp": 0.01258405, + "epoch": 0.12385390049601683, + "flos": 21321792852480.0, + "grad_norm": 2.425291985469593, + "language_loss": 0.82626045, + "learning_rate": 3.908308598252523e-06, + "loss": 0.90565926, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.32177734, + "step": 2060, + "time_per_iteration": 2.6014535427093506 + }, + { + "auxiliary_loss_clip": 0.06642138, + "auxiliary_loss_mlp": 0.01290673, + "balance_loss_clip": 0.06310271, + "balance_loss_mlp": 0.01256579, + "epoch": 0.1239140237486848, + "flos": 15120711590400.0, + "grad_norm": 2.0800945388405734, + "language_loss": 0.87935984, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.95868802, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34082031, + "step": 2061, + "time_per_iteration": 2.494584321975708 + }, + { + "auxiliary_loss_clip": 0.0663335, + "auxiliary_loss_mlp": 0.01291205, + "balance_loss_clip": 0.06306711, + "balance_loss_mlp": 0.01260259, + "epoch": 0.12397414700135277, + "flos": 21982534623360.0, + "grad_norm": 1.9753177189275368, + "language_loss": 0.85858583, + "learning_rate": 3.908075309949906e-06, + "loss": 0.9378314, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30932617, + "step": 2062, + "time_per_iteration": 2.5650103092193604 + }, + { + "auxiliary_loss_clip": 0.06642005, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01256549, + "epoch": 0.12403427025402074, + "flos": 13404909934080.0, + "grad_norm": 1.7604795458830171, + "language_loss": 0.80305374, + "learning_rate": 3.907958557264774e-06, + "loss": 0.88236302, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32373047, + "step": 2063, + "time_per_iteration": 2.5019121170043945 + }, + { + "auxiliary_loss_clip": 0.06644779, + "auxiliary_loss_mlp": 0.0129093, + "balance_loss_clip": 0.06312533, + "balance_loss_mlp": 0.01257146, + "epoch": 0.12409439350668872, + "flos": 15309750401280.0, + "grad_norm": 2.5047408324670832, + "language_loss": 0.80646086, + "learning_rate": 3.907841732229663e-06, + "loss": 0.885818, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33789062, + "step": 2064, + "time_per_iteration": 2.5915873050689697 + }, + { + "auxiliary_loss_clip": 0.06642206, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 0.06310631, + "balance_loss_mlp": 0.01263583, + "epoch": 0.12415451675935668, + "flos": 25016339877120.0, + "grad_norm": 2.4114555321806677, + "language_loss": 0.93642998, + "learning_rate": 3.907724834849002e-06, + "loss": 1.0158031, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31542969, + "step": 2065, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.06650305, + "auxiliary_loss_mlp": 0.01289676, + "balance_loss_clip": 0.06313996, + "balance_loss_mlp": 0.01256845, + "epoch": 0.12421464001202465, + "flos": 23666457000960.0, + "grad_norm": 2.189266948105698, + "language_loss": 0.81909287, + "learning_rate": 3.907607865127225e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.32836914, + "step": 2066, + "time_per_iteration": 2.593202590942383 + }, + { + "auxiliary_loss_clip": 0.06490391, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01251599, + "epoch": 0.12427476326469263, + "flos": 65753686794240.0, + "grad_norm": 0.8319051039342746, + "language_loss": 0.63633674, + "learning_rate": 3.907490823068766e-06, + "loss": 0.71387255, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.11578369, + "step": 2067, + "time_per_iteration": 3.1761627197265625 + }, + { + "auxiliary_loss_clip": 0.06645706, + "auxiliary_loss_mlp": 0.01298846, + "balance_loss_clip": 0.0631035, + "balance_loss_mlp": 0.01263441, + "epoch": 0.12433488651736059, + "flos": 24542372856960.0, + "grad_norm": 1.826307317776044, + "language_loss": 0.94409752, + "learning_rate": 3.907373708678063e-06, + "loss": 1.023543, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35375977, + "step": 2068, + "time_per_iteration": 2.548051357269287 + }, + { + "auxiliary_loss_clip": 0.06634392, + "auxiliary_loss_mlp": 0.01295819, + "balance_loss_clip": 0.06307046, + "balance_loss_mlp": 0.01265087, + "epoch": 0.12439500977002856, + "flos": 21037828965120.0, + "grad_norm": 2.192174211914145, + "language_loss": 0.82850045, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.90780252, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30712891, + "step": 2069, + "time_per_iteration": 3.9771463871002197 + }, + { + "auxiliary_loss_clip": 0.0664653, + "auxiliary_loss_mlp": 0.01287176, + "balance_loss_clip": 0.06312294, + "balance_loss_mlp": 0.01255276, + "epoch": 0.12445513302269653, + "flos": 26837380661760.0, + "grad_norm": 2.140489528942806, + "language_loss": 0.78554291, + "learning_rate": 3.907139262917696e-06, + "loss": 0.86487997, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31884766, + "step": 2070, + "time_per_iteration": 2.5697221755981445 + }, + { + "auxiliary_loss_clip": 0.06645045, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 0.06311486, + "balance_loss_mlp": 0.01258469, + "epoch": 0.1245152562753645, + "flos": 18374764101120.0, + "grad_norm": 2.28424874253062, + "language_loss": 0.81667042, + "learning_rate": 3.907021931556922e-06, + "loss": 0.89601028, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3046875, + "step": 2071, + "time_per_iteration": 3.9356284141540527 + }, + { + "auxiliary_loss_clip": 0.06624742, + "auxiliary_loss_mlp": 0.01289094, + "balance_loss_clip": 0.06303577, + "balance_loss_mlp": 0.01256407, + "epoch": 0.12457537952803246, + "flos": 33116098331520.0, + "grad_norm": 2.0527550980706626, + "language_loss": 0.79415953, + "learning_rate": 3.906904527881684e-06, + "loss": 0.87329787, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32666016, + "step": 2072, + "time_per_iteration": 2.659824848175049 + }, + { + "auxiliary_loss_clip": 0.06639021, + "auxiliary_loss_mlp": 0.01293554, + "balance_loss_clip": 0.06306598, + "balance_loss_mlp": 0.01260819, + "epoch": 0.12463550278070043, + "flos": 22276267511040.0, + "grad_norm": 2.0170209718237144, + "language_loss": 0.76458508, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.84391081, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32739258, + "step": 2073, + "time_per_iteration": 4.0372233390808105 + }, + { + "auxiliary_loss_clip": 0.06627664, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06303963, + "balance_loss_mlp": 0.01255491, + "epoch": 0.12469562603336841, + "flos": 14683445458560.0, + "grad_norm": 1.9751185197934578, + "language_loss": 0.9136548, + "learning_rate": 3.906669503605631e-06, + "loss": 0.99279845, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3125, + "step": 2074, + "time_per_iteration": 3.880718946456909 + }, + { + "auxiliary_loss_clip": 0.06644025, + "auxiliary_loss_mlp": 0.01296508, + "balance_loss_clip": 0.06306964, + "balance_loss_mlp": 0.0126065, + "epoch": 0.12475574928603637, + "flos": 24651720835200.0, + "grad_norm": 2.411338932827457, + "language_loss": 0.85379255, + "learning_rate": 3.906551883013728e-06, + "loss": 0.93319792, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.35839844, + "step": 2075, + "time_per_iteration": 2.593402147293091 + }, + { + "auxiliary_loss_clip": 0.06632458, + "auxiliary_loss_mlp": 0.01300353, + "balance_loss_clip": 0.06302904, + "balance_loss_mlp": 0.01267166, + "epoch": 0.12481587253870434, + "flos": 21769540744320.0, + "grad_norm": 1.9904013424210072, + "language_loss": 0.73795271, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.81728083, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33227539, + "step": 2076, + "time_per_iteration": 2.5252525806427 + }, + { + "auxiliary_loss_clip": 0.06619625, + "auxiliary_loss_mlp": 0.01296003, + "balance_loss_clip": 0.06298469, + "balance_loss_mlp": 0.0126632, + "epoch": 0.12487599579137232, + "flos": 21438687697920.0, + "grad_norm": 2.119852671968812, + "language_loss": 0.76853049, + "learning_rate": 3.906316424944469e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.29663086, + "step": 2077, + "time_per_iteration": 2.5812795162200928 + }, + { + "auxiliary_loss_clip": 0.06627834, + "auxiliary_loss_mlp": 0.01294428, + "balance_loss_clip": 0.06298409, + "balance_loss_mlp": 0.01261503, + "epoch": 0.12493611904404028, + "flos": 16113228802560.0, + "grad_norm": 2.6079444778137906, + "language_loss": 0.83980322, + "learning_rate": 3.906198587476043e-06, + "loss": 0.9190259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.3293457, + "step": 2078, + "time_per_iteration": 2.5144779682159424 + }, + { + "auxiliary_loss_clip": 0.06633472, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 0.06301548, + "balance_loss_mlp": 0.01265337, + "epoch": 0.12499624229670825, + "flos": 21586749062400.0, + "grad_norm": 2.088353376240652, + "language_loss": 0.7681694, + "learning_rate": 3.906080677724374e-06, + "loss": 0.84748387, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32617188, + "step": 2079, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.06640807, + "auxiliary_loss_mlp": 0.01295919, + "balance_loss_clip": 0.06307015, + "balance_loss_mlp": 0.01263351, + "epoch": 0.1250563655493762, + "flos": 25705522909440.0, + "grad_norm": 2.3726479932939064, + "language_loss": 0.85245967, + "learning_rate": 3.905962695693935e-06, + "loss": 0.93182695, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32592773, + "step": 2080, + "time_per_iteration": 2.5898683071136475 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.0130361, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01269993, + "epoch": 0.12511648880204418, + "flos": 16915113976320.0, + "grad_norm": 2.1047824756143263, + "language_loss": 0.86146665, + "learning_rate": 3.9058446413892e-06, + "loss": 0.94068468, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 2081, + "time_per_iteration": 2.5291430950164795 + }, + { + "auxiliary_loss_clip": 0.06628423, + "auxiliary_loss_mlp": 0.01299212, + "balance_loss_clip": 0.06304745, + "balance_loss_mlp": 0.01268289, + "epoch": 0.12517661205471217, + "flos": 17573423978880.0, + "grad_norm": 1.9525319716543403, + "language_loss": 0.77591729, + "learning_rate": 3.905726514814646e-06, + "loss": 0.85519361, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30908203, + "step": 2082, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.06645833, + "auxiliary_loss_mlp": 0.01295307, + "balance_loss_clip": 0.06304055, + "balance_loss_mlp": 0.01261118, + "epoch": 0.12523673530738014, + "flos": 16039240047360.0, + "grad_norm": 3.06086551706414, + "language_loss": 0.80167735, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.88108873, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.34179688, + "step": 2083, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.06632711, + "auxiliary_loss_mlp": 0.01297422, + "balance_loss_clip": 0.06298797, + "balance_loss_mlp": 0.0126297, + "epoch": 0.1252968585600481, + "flos": 18813833095680.0, + "grad_norm": 3.451384720222282, + "language_loss": 0.92214763, + "learning_rate": 3.9054900448739966e-06, + "loss": 1.00144899, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34472656, + "step": 2084, + "time_per_iteration": 2.501530647277832 + }, + { + "auxiliary_loss_clip": 0.0662484, + "auxiliary_loss_mlp": 0.01295191, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01263171, + "epoch": 0.12535698181271607, + "flos": 27278923351680.0, + "grad_norm": 1.9702751102582312, + "language_loss": 0.81308639, + "learning_rate": 3.905371701516869e-06, + "loss": 0.89228666, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.32006836, + "step": 2085, + "time_per_iteration": 2.5993080139160156 + }, + { + "auxiliary_loss_clip": 0.06621981, + "auxiliary_loss_mlp": 0.01314133, + "balance_loss_clip": 0.06297316, + "balance_loss_mlp": 0.01281469, + "epoch": 0.12541710506538403, + "flos": 22060590301440.0, + "grad_norm": 2.513443994409739, + "language_loss": 0.89793539, + "learning_rate": 3.905253285907856e-06, + "loss": 0.97729653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.32641602, + "step": 2086, + "time_per_iteration": 2.526017427444458 + }, + { + "auxiliary_loss_clip": 0.0661508, + "auxiliary_loss_mlp": 0.01297904, + "balance_loss_clip": 0.06298057, + "balance_loss_mlp": 0.01269651, + "epoch": 0.125477228318052, + "flos": 12607888296960.0, + "grad_norm": 2.458580206146656, + "language_loss": 0.88740981, + "learning_rate": 3.905134798051447e-06, + "loss": 0.96653962, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28271484, + "step": 2087, + "time_per_iteration": 2.6768429279327393 + }, + { + "auxiliary_loss_clip": 0.06626555, + "auxiliary_loss_mlp": 0.0130267, + "balance_loss_clip": 0.06301963, + "balance_loss_mlp": 0.0127077, + "epoch": 0.12553735157071996, + "flos": 23885362592640.0, + "grad_norm": 1.907782132807464, + "language_loss": 0.74902099, + "learning_rate": 3.905016237952136e-06, + "loss": 0.82831323, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3190918, + "step": 2088, + "time_per_iteration": 2.584322690963745 + }, + { + "auxiliary_loss_clip": 0.06515329, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06318291, + "balance_loss_mlp": 0.01264752, + "epoch": 0.12559747482338796, + "flos": 69940998881280.0, + "grad_norm": 0.7370797813517723, + "language_loss": 0.61766195, + "learning_rate": 3.904897605614418e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.15002441, + "step": 2089, + "time_per_iteration": 3.1401424407958984 + }, + { + "auxiliary_loss_clip": 0.06624255, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 0.06302167, + "balance_loss_mlp": 0.01262707, + "epoch": 0.12565759807605592, + "flos": 24286389033600.0, + "grad_norm": 1.9922861494736146, + "language_loss": 0.80224949, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88142806, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30883789, + "step": 2090, + "time_per_iteration": 2.6044373512268066 + }, + { + "auxiliary_loss_clip": 0.0651547, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06318653, + "balance_loss_mlp": 0.01254635, + "epoch": 0.12571772132872389, + "flos": 56468011904640.0, + "grad_norm": 0.7384472353065198, + "language_loss": 0.58865118, + "learning_rate": 3.90466012424176e-06, + "loss": 0.66647875, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12646484, + "step": 2091, + "time_per_iteration": 3.1160824298858643 + }, + { + "auxiliary_loss_clip": 0.06630008, + "auxiliary_loss_mlp": 0.01289162, + "balance_loss_clip": 0.06302688, + "balance_loss_mlp": 0.0125781, + "epoch": 0.12577784458139185, + "flos": 41255576421120.0, + "grad_norm": 1.8290499485408422, + "language_loss": 0.65244853, + "learning_rate": 3.904541275215825e-06, + "loss": 0.73164022, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31347656, + "step": 2092, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.06640761, + "auxiliary_loss_mlp": 0.01299851, + "balance_loss_clip": 0.06305548, + "balance_loss_mlp": 0.01265542, + "epoch": 0.12583796783405982, + "flos": 19761599427840.0, + "grad_norm": 2.082922063254684, + "language_loss": 0.82319552, + "learning_rate": 3.904422353969493e-06, + "loss": 0.9026016, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.34277344, + "step": 2093, + "time_per_iteration": 2.5252139568328857 + }, + { + "auxiliary_loss_clip": 0.06622188, + "auxiliary_loss_mlp": 0.01291379, + "balance_loss_clip": 0.06303331, + "balance_loss_mlp": 0.01260766, + "epoch": 0.12589809108672778, + "flos": 22608797639040.0, + "grad_norm": 2.0047110075262635, + "language_loss": 0.76888406, + "learning_rate": 3.904303360507276e-06, + "loss": 0.84801972, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30639648, + "step": 2094, + "time_per_iteration": 2.5590462684631348 + }, + { + "auxiliary_loss_clip": 0.06619669, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06299751, + "balance_loss_mlp": 0.01266309, + "epoch": 0.12595821433939577, + "flos": 45233248792320.0, + "grad_norm": 1.7774170004570267, + "language_loss": 0.78170305, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.8608799, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.31689453, + "step": 2095, + "time_per_iteration": 2.7437078952789307 + }, + { + "auxiliary_loss_clip": 0.06632219, + "auxiliary_loss_mlp": 0.01294772, + "balance_loss_clip": 0.06299502, + "balance_loss_mlp": 0.01263492, + "epoch": 0.12601833759206374, + "flos": 14325115472640.0, + "grad_norm": 2.871933509106217, + "language_loss": 0.84611917, + "learning_rate": 3.904065156953232e-06, + "loss": 0.92538905, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.31274414, + "step": 2096, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06630743, + "auxiliary_loss_mlp": 0.01306013, + "balance_loss_clip": 0.06297809, + "balance_loss_mlp": 0.01272038, + "epoch": 0.1260784608447317, + "flos": 21294651329280.0, + "grad_norm": 2.3649533335504365, + "language_loss": 0.7677502, + "learning_rate": 3.903945946870439e-06, + "loss": 0.84711778, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33984375, + "step": 2097, + "time_per_iteration": 2.5258843898773193 + }, + { + "auxiliary_loss_clip": 0.06624204, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_clip": 0.06299201, + "balance_loss_mlp": 0.0127025, + "epoch": 0.12613858409739967, + "flos": 26258719564800.0, + "grad_norm": 2.151256625756143, + "language_loss": 0.88275403, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.96199965, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.30102539, + "step": 2098, + "time_per_iteration": 2.5916357040405273 + }, + { + "auxiliary_loss_clip": 0.0664238, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 0.06306277, + "balance_loss_mlp": 0.0127149, + "epoch": 0.12619870735006763, + "flos": 21586413646080.0, + "grad_norm": 1.8808679634119545, + "language_loss": 0.71169508, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79119694, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36303711, + "step": 2099, + "time_per_iteration": 2.525548219680786 + }, + { + "auxiliary_loss_clip": 0.06636767, + "auxiliary_loss_mlp": 0.01301654, + "balance_loss_clip": 0.06306287, + "balance_loss_mlp": 0.0126756, + "epoch": 0.1262588306027356, + "flos": 23373646508160.0, + "grad_norm": 3.191355313927065, + "language_loss": 0.83154678, + "learning_rate": 3.903587883453228e-06, + "loss": 0.91093099, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34106445, + "step": 2100, + "time_per_iteration": 2.581777572631836 + }, + { + "auxiliary_loss_clip": 0.06632592, + "auxiliary_loss_mlp": 0.01304584, + "balance_loss_clip": 0.06304123, + "balance_loss_mlp": 0.01271325, + "epoch": 0.12631895385540357, + "flos": 23955619841280.0, + "grad_norm": 1.9586534535799036, + "language_loss": 0.81579792, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89516962, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33227539, + "step": 2101, + "time_per_iteration": 2.579571008682251 + }, + { + "auxiliary_loss_clip": 0.06508025, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06310984, + "balance_loss_mlp": 0.0125033, + "epoch": 0.12637907710807156, + "flos": 70301760635520.0, + "grad_norm": 0.6797956524806741, + "language_loss": 0.57154572, + "learning_rate": 3.903348813579662e-06, + "loss": 0.6492359, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10681152, + "step": 2102, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.06635006, + "auxiliary_loss_mlp": 0.0129624, + "balance_loss_clip": 0.06302785, + "balance_loss_mlp": 0.01264888, + "epoch": 0.12643920036073952, + "flos": 18920833159680.0, + "grad_norm": 2.1103424848105177, + "language_loss": 0.95015359, + "learning_rate": 3.903229170377845e-06, + "loss": 1.02946603, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.31396484, + "step": 2103, + "time_per_iteration": 2.554858684539795 + }, + { + "auxiliary_loss_clip": 0.06615217, + "auxiliary_loss_mlp": 0.01290733, + "balance_loss_clip": 0.0629935, + "balance_loss_mlp": 0.0126099, + "epoch": 0.1264993236134075, + "flos": 27789926676480.0, + "grad_norm": 1.8409874759375768, + "language_loss": 0.79467118, + "learning_rate": 3.903109455005387e-06, + "loss": 0.8737306, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29711914, + "step": 2104, + "time_per_iteration": 2.6194100379943848 + }, + { + "auxiliary_loss_clip": 0.06630556, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06301397, + "balance_loss_mlp": 0.0126256, + "epoch": 0.12655944686607545, + "flos": 24761739646080.0, + "grad_norm": 2.4857210053550625, + "language_loss": 0.82356828, + "learning_rate": 3.902989667466828e-06, + "loss": 0.90281653, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.31713867, + "step": 2105, + "time_per_iteration": 2.6011011600494385 + }, + { + "auxiliary_loss_clip": 0.06645899, + "auxiliary_loss_mlp": 0.01301591, + "balance_loss_clip": 0.0630343, + "balance_loss_mlp": 0.01263587, + "epoch": 0.12661957011874342, + "flos": 24139753188480.0, + "grad_norm": 2.6380144602222653, + "language_loss": 0.84079802, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37963867, + "step": 2106, + "time_per_iteration": 2.572817087173462 + }, + { + "auxiliary_loss_clip": 0.0662559, + "auxiliary_loss_mlp": 0.012898, + "balance_loss_clip": 0.06298016, + "balance_loss_mlp": 0.01256422, + "epoch": 0.12667969337141138, + "flos": 24576180779520.0, + "grad_norm": 1.9395738781277843, + "language_loss": 0.74407184, + "learning_rate": 3.902749875909578e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2107, + "time_per_iteration": 2.6193723678588867 + }, + { + "auxiliary_loss_clip": 0.06622959, + "auxiliary_loss_mlp": 0.01290393, + "balance_loss_clip": 0.06299001, + "balance_loss_mlp": 0.01259017, + "epoch": 0.12673981662407935, + "flos": 22967546895360.0, + "grad_norm": 2.0472212441306175, + "language_loss": 0.80444276, + "learning_rate": 3.90262987189998e-06, + "loss": 0.88357627, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.31396484, + "step": 2108, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.06627882, + "auxiliary_loss_mlp": 0.01288653, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01256562, + "epoch": 0.12679993987674734, + "flos": 17280613486080.0, + "grad_norm": 2.14760795310841, + "language_loss": 0.77326792, + "learning_rate": 3.902509795742467e-06, + "loss": 0.85243326, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.32080078, + "step": 2109, + "time_per_iteration": 3.9535577297210693 + }, + { + "auxiliary_loss_clip": 0.06619301, + "auxiliary_loss_mlp": 0.01294051, + "balance_loss_clip": 0.0629691, + "balance_loss_mlp": 0.01260672, + "epoch": 0.1268600631294153, + "flos": 17280865048320.0, + "grad_norm": 1.6861552096477337, + "language_loss": 0.83234507, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91147858, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33374023, + "step": 2110, + "time_per_iteration": 3.975102424621582 + }, + { + "auxiliary_loss_clip": 0.06634356, + "auxiliary_loss_mlp": 0.01289468, + "balance_loss_clip": 0.06303843, + "balance_loss_mlp": 0.01256661, + "epoch": 0.12692018638208327, + "flos": 24067902712320.0, + "grad_norm": 1.6854035382994426, + "language_loss": 0.79946983, + "learning_rate": 3.90226942700191e-06, + "loss": 0.878708, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.32788086, + "step": 2111, + "time_per_iteration": 2.549649953842163 + }, + { + "auxiliary_loss_clip": 0.06640926, + "auxiliary_loss_mlp": 0.0129832, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01261199, + "epoch": 0.12698030963475124, + "flos": 31839952648320.0, + "grad_norm": 2.9365318295255984, + "language_loss": 0.78364569, + "learning_rate": 3.902149134427982e-06, + "loss": 0.86303812, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.37109375, + "step": 2112, + "time_per_iteration": 2.641850233078003 + }, + { + "auxiliary_loss_clip": 0.06616612, + "auxiliary_loss_mlp": 0.01293574, + "balance_loss_clip": 0.062942, + "balance_loss_mlp": 0.01262342, + "epoch": 0.1270404328874192, + "flos": 25194058387200.0, + "grad_norm": 2.0317084660262688, + "language_loss": 0.86970478, + "learning_rate": 3.902028769724367e-06, + "loss": 0.94880664, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31225586, + "step": 2113, + "time_per_iteration": 5.534189224243164 + }, + { + "auxiliary_loss_clip": 0.06626937, + "auxiliary_loss_mlp": 0.01298292, + "balance_loss_clip": 0.06295247, + "balance_loss_mlp": 0.01265462, + "epoch": 0.12710055614008717, + "flos": 16002790721280.0, + "grad_norm": 2.427248740860799, + "language_loss": 0.75266403, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.83191633, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.32788086, + "step": 2114, + "time_per_iteration": 2.491520643234253 + }, + { + "auxiliary_loss_clip": 0.06621046, + "auxiliary_loss_mlp": 0.01302494, + "balance_loss_clip": 0.06295703, + "balance_loss_mlp": 0.01270975, + "epoch": 0.12716067939275516, + "flos": 15091012517760.0, + "grad_norm": 2.3252793600318125, + "language_loss": 0.85064435, + "learning_rate": 3.901787823946341e-06, + "loss": 0.92987972, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.31518555, + "step": 2115, + "time_per_iteration": 2.5152101516723633 + }, + { + "auxiliary_loss_clip": 0.06622103, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 0.06295006, + "balance_loss_mlp": 0.01260787, + "epoch": 0.12722080264542313, + "flos": 28374373704960.0, + "grad_norm": 1.6080767966631377, + "language_loss": 0.88167703, + "learning_rate": 3.901667242881065e-06, + "loss": 0.96081877, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.3125, + "step": 2116, + "time_per_iteration": 2.61238169670105 + }, + { + "auxiliary_loss_clip": 0.06614063, + "auxiliary_loss_mlp": 0.01310146, + "balance_loss_clip": 0.06294715, + "balance_loss_mlp": 0.0127877, + "epoch": 0.1272809258980911, + "flos": 32388159985920.0, + "grad_norm": 4.443941469464488, + "language_loss": 0.72083235, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.8000744, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3137207, + "step": 2117, + "time_per_iteration": 2.6185410022735596 + }, + { + "auxiliary_loss_clip": 0.06630652, + "auxiliary_loss_mlp": 0.01300593, + "balance_loss_clip": 0.06301345, + "balance_loss_mlp": 0.0126781, + "epoch": 0.12734104915075906, + "flos": 16039952807040.0, + "grad_norm": 1.9850917523754936, + "language_loss": 0.87703407, + "learning_rate": 3.901425864420852e-06, + "loss": 0.95634645, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32739258, + "step": 2118, + "time_per_iteration": 2.503112316131592 + }, + { + "auxiliary_loss_clip": 0.06623712, + "auxiliary_loss_mlp": 0.01308307, + "balance_loss_clip": 0.06299254, + "balance_loss_mlp": 0.01276359, + "epoch": 0.12740117240342702, + "flos": 18266296590720.0, + "grad_norm": 1.8669738886398666, + "language_loss": 0.88737518, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96669531, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31945801, + "step": 2119, + "time_per_iteration": 2.541663885116577 + }, + { + "auxiliary_loss_clip": 0.06633841, + "auxiliary_loss_mlp": 0.01294245, + "balance_loss_clip": 0.06305236, + "balance_loss_mlp": 0.01260652, + "epoch": 0.127461295656095, + "flos": 12125242379520.0, + "grad_norm": 2.4570566612421154, + "language_loss": 0.88616729, + "learning_rate": 3.901184197551605e-06, + "loss": 0.96544814, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33569336, + "step": 2120, + "time_per_iteration": 2.481060743331909 + }, + { + "auxiliary_loss_clip": 0.06631807, + "auxiliary_loss_mlp": 0.01302004, + "balance_loss_clip": 0.06303513, + "balance_loss_mlp": 0.01269079, + "epoch": 0.12752141890876295, + "flos": 23155831019520.0, + "grad_norm": 1.9663880058350043, + "language_loss": 0.7779758, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85731387, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3293457, + "step": 2121, + "time_per_iteration": 2.5578267574310303 + }, + { + "auxiliary_loss_clip": 0.06632394, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 0.06304775, + "balance_loss_mlp": 0.01258949, + "epoch": 0.12758154216143094, + "flos": 21622359847680.0, + "grad_norm": 2.5772818076611976, + "language_loss": 0.84019601, + "learning_rate": 3.900942242309978e-06, + "loss": 0.91945064, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.34106445, + "step": 2122, + "time_per_iteration": 2.5861244201660156 + }, + { + "auxiliary_loss_clip": 0.06629082, + "auxiliary_loss_mlp": 0.01293636, + "balance_loss_clip": 0.06302215, + "balance_loss_mlp": 0.01260162, + "epoch": 0.1276416654140989, + "flos": 15930395193600.0, + "grad_norm": 1.9995911681983476, + "language_loss": 0.80520052, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88442767, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33496094, + "step": 2123, + "time_per_iteration": 2.543966770172119 + }, + { + "auxiliary_loss_clip": 0.06636834, + "auxiliary_loss_mlp": 0.01289825, + "balance_loss_clip": 0.06312384, + "balance_loss_mlp": 0.01257687, + "epoch": 0.12770178866676687, + "flos": 22389263141760.0, + "grad_norm": 1.6312979029769639, + "language_loss": 0.80678988, + "learning_rate": 3.900699998732673e-06, + "loss": 0.88605642, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.3215332, + "step": 2124, + "time_per_iteration": 2.590118169784546 + }, + { + "auxiliary_loss_clip": 0.06636873, + "auxiliary_loss_mlp": 0.01291865, + "balance_loss_clip": 0.06307361, + "balance_loss_mlp": 0.01261228, + "epoch": 0.12776191191943484, + "flos": 21658851100800.0, + "grad_norm": 2.2926076774548765, + "language_loss": 0.76290202, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84218943, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2125, + "time_per_iteration": 2.5684149265289307 + }, + { + "auxiliary_loss_clip": 0.06631321, + "auxiliary_loss_mlp": 0.01289055, + "balance_loss_clip": 0.0630435, + "balance_loss_mlp": 0.01257011, + "epoch": 0.1278220351721028, + "flos": 25742056089600.0, + "grad_norm": 2.526811883204058, + "language_loss": 0.79172325, + "learning_rate": 3.900457466856434e-06, + "loss": 0.87092698, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.3203125, + "step": 2126, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.06645348, + "auxiliary_loss_mlp": 0.01292083, + "balance_loss_clip": 0.06316036, + "balance_loss_mlp": 0.01259563, + "epoch": 0.12788215842477077, + "flos": 41252515747200.0, + "grad_norm": 1.559600581864003, + "language_loss": 0.70510435, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7844786, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2127, + "time_per_iteration": 2.7501988410949707 + }, + { + "auxiliary_loss_clip": 0.06512339, + "auxiliary_loss_mlp": 0.01271557, + "balance_loss_clip": 0.06312746, + "balance_loss_mlp": 0.01259123, + "epoch": 0.12794228167743876, + "flos": 70899079265280.0, + "grad_norm": 0.8027421200972868, + "language_loss": 0.6268698, + "learning_rate": 3.900214646718047e-06, + "loss": 0.70470876, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12438965, + "step": 2128, + "time_per_iteration": 3.2327187061309814 + }, + { + "auxiliary_loss_clip": 0.06647713, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06314018, + "balance_loss_mlp": 0.01255987, + "epoch": 0.12800240493010673, + "flos": 16295307724800.0, + "grad_norm": 3.2224372102485757, + "language_loss": 0.78878236, + "learning_rate": 3.900093128562056e-06, + "loss": 0.86816764, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.34790039, + "step": 2129, + "time_per_iteration": 2.513296365737915 + }, + { + "auxiliary_loss_clip": 0.06653494, + "auxiliary_loss_mlp": 0.01302761, + "balance_loss_clip": 0.06312658, + "balance_loss_mlp": 0.012649, + "epoch": 0.1280625281827747, + "flos": 20637850700160.0, + "grad_norm": 2.4415165367574394, + "language_loss": 0.80974901, + "learning_rate": 3.899971538354343e-06, + "loss": 0.88931155, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37866211, + "step": 2130, + "time_per_iteration": 2.551335573196411 + }, + { + "auxiliary_loss_clip": 0.06635942, + "auxiliary_loss_mlp": 0.01301168, + "balance_loss_clip": 0.06304602, + "balance_loss_mlp": 0.01268457, + "epoch": 0.12812265143544266, + "flos": 22644869621760.0, + "grad_norm": 1.8063453022697407, + "language_loss": 0.73535526, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3269043, + "step": 2131, + "time_per_iteration": 2.591715097427368 + }, + { + "auxiliary_loss_clip": 0.06649061, + "auxiliary_loss_mlp": 0.01307481, + "balance_loss_clip": 0.06316839, + "balance_loss_mlp": 0.01274961, + "epoch": 0.12818277468811062, + "flos": 34723306696320.0, + "grad_norm": 2.4480572994081213, + "language_loss": 0.74477613, + "learning_rate": 3.899728141802197e-06, + "loss": 0.8243416, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32519531, + "step": 2132, + "time_per_iteration": 2.644005060195923 + }, + { + "auxiliary_loss_clip": 0.06630264, + "auxiliary_loss_mlp": 0.01301188, + "balance_loss_clip": 0.06311467, + "balance_loss_mlp": 0.01268573, + "epoch": 0.1282428979407786, + "flos": 23118752787840.0, + "grad_norm": 2.134664592917613, + "language_loss": 0.83662349, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.91593802, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32617188, + "step": 2133, + "time_per_iteration": 2.526437520980835 + }, + { + "auxiliary_loss_clip": 0.06657492, + "auxiliary_loss_mlp": 0.01312656, + "balance_loss_clip": 0.06318928, + "balance_loss_mlp": 0.01277823, + "epoch": 0.12830302119344655, + "flos": 20892786347520.0, + "grad_norm": 3.0593036297338223, + "language_loss": 0.82609046, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90579188, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.34814453, + "step": 2134, + "time_per_iteration": 2.57069993019104 + }, + { + "auxiliary_loss_clip": 0.06644946, + "auxiliary_loss_mlp": 0.01299694, + "balance_loss_clip": 0.0631265, + "balance_loss_mlp": 0.01266363, + "epoch": 0.12836314444611455, + "flos": 21404208942720.0, + "grad_norm": 1.8809028559826366, + "language_loss": 0.84531921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.92476559, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33325195, + "step": 2135, + "time_per_iteration": 2.5816993713378906 + }, + { + "auxiliary_loss_clip": 0.06641332, + "auxiliary_loss_mlp": 0.01305378, + "balance_loss_clip": 0.06312244, + "balance_loss_mlp": 0.01272142, + "epoch": 0.1284232676987825, + "flos": 13667560156800.0, + "grad_norm": 3.0323333945799176, + "language_loss": 0.78892457, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86839169, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.33227539, + "step": 2136, + "time_per_iteration": 2.529231548309326 + }, + { + "auxiliary_loss_clip": 0.06499572, + "auxiliary_loss_mlp": 0.01289102, + "balance_loss_clip": 0.06299701, + "balance_loss_mlp": 0.01276156, + "epoch": 0.12848339095145048, + "flos": 60012904337280.0, + "grad_norm": 0.8797489168749767, + "language_loss": 0.5947628, + "learning_rate": 3.899118389839785e-06, + "loss": 0.67264956, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12957764, + "step": 2137, + "time_per_iteration": 3.308232545852661 + }, + { + "auxiliary_loss_clip": 0.06652065, + "auxiliary_loss_mlp": 0.01307251, + "balance_loss_clip": 0.06317523, + "balance_loss_mlp": 0.01273515, + "epoch": 0.12854351420411844, + "flos": 13886507675520.0, + "grad_norm": 2.603073013301421, + "language_loss": 0.84481782, + "learning_rate": 3.898996223384512e-06, + "loss": 0.924411, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.3371582, + "step": 2138, + "time_per_iteration": 2.5150487422943115 + }, + { + "auxiliary_loss_clip": 0.0665133, + "auxiliary_loss_mlp": 0.01300544, + "balance_loss_clip": 0.06310506, + "balance_loss_mlp": 0.01263136, + "epoch": 0.1286036374567864, + "flos": 22644534205440.0, + "grad_norm": 2.3721539245571237, + "language_loss": 0.79668736, + "learning_rate": 3.898873984919113e-06, + "loss": 0.87620616, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37402344, + "step": 2139, + "time_per_iteration": 2.5760304927825928 + }, + { + "auxiliary_loss_clip": 0.06645858, + "auxiliary_loss_mlp": 0.01289965, + "balance_loss_clip": 0.06314536, + "balance_loss_mlp": 0.0125754, + "epoch": 0.12866376070945437, + "flos": 16330121896320.0, + "grad_norm": 1.944874099387006, + "language_loss": 0.86374593, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.94310415, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32421875, + "step": 2140, + "time_per_iteration": 2.5656511783599854 + }, + { + "auxiliary_loss_clip": 0.06634524, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 0.06308289, + "balance_loss_mlp": 0.01254496, + "epoch": 0.12872388396212234, + "flos": 11879321045760.0, + "grad_norm": 2.00800168780761, + "language_loss": 0.87046349, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94965667, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30322266, + "step": 2141, + "time_per_iteration": 2.589749336242676 + }, + { + "auxiliary_loss_clip": 0.06646, + "auxiliary_loss_mlp": 0.01294177, + "balance_loss_clip": 0.06311622, + "balance_loss_mlp": 0.01261037, + "epoch": 0.12878400721479033, + "flos": 28374331777920.0, + "grad_norm": 2.3143248810569563, + "language_loss": 0.69344199, + "learning_rate": 3.898506837508518e-06, + "loss": 0.77284372, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.33154297, + "step": 2142, + "time_per_iteration": 2.631613254547119 + }, + { + "auxiliary_loss_clip": 0.06645877, + "auxiliary_loss_mlp": 0.01292532, + "balance_loss_clip": 0.06308207, + "balance_loss_mlp": 0.01257723, + "epoch": 0.1288441304674583, + "flos": 25892842711680.0, + "grad_norm": 1.8471793604151003, + "language_loss": 0.84538341, + "learning_rate": 3.89838431104899e-06, + "loss": 0.92476749, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34814453, + "step": 2143, + "time_per_iteration": 2.62510085105896 + }, + { + "auxiliary_loss_clip": 0.06646847, + "auxiliary_loss_mlp": 0.01296075, + "balance_loss_clip": 0.06309757, + "balance_loss_mlp": 0.01261194, + "epoch": 0.12890425372012626, + "flos": 20820097330560.0, + "grad_norm": 2.9481033880232284, + "language_loss": 0.82936227, + "learning_rate": 3.898261712602539e-06, + "loss": 0.90879142, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34912109, + "step": 2144, + "time_per_iteration": 2.562148332595825 + }, + { + "auxiliary_loss_clip": 0.06632444, + "auxiliary_loss_mlp": 0.01299578, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01263196, + "epoch": 0.12896437697279423, + "flos": 22572599875200.0, + "grad_norm": 2.2245116542983046, + "language_loss": 0.80073792, + "learning_rate": 3.898139042173813e-06, + "loss": 0.88005811, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.5510518550872803 + }, + { + "auxiliary_loss_clip": 0.06636346, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_clip": 0.06306225, + "balance_loss_mlp": 0.01269877, + "epoch": 0.1290245002254622, + "flos": 17499561004800.0, + "grad_norm": 2.1761731102138686, + "language_loss": 0.83456767, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91399992, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36987305, + "step": 2146, + "time_per_iteration": 2.5113868713378906 + }, + { + "auxiliary_loss_clip": 0.06626259, + "auxiliary_loss_mlp": 0.01301495, + "balance_loss_clip": 0.06300884, + "balance_loss_mlp": 0.01266042, + "epoch": 0.12908462347813016, + "flos": 36324142151040.0, + "grad_norm": 4.395125583857354, + "language_loss": 0.72594023, + "learning_rate": 3.897893485388149e-06, + "loss": 0.8052178, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.35449219, + "step": 2147, + "time_per_iteration": 2.7282183170318604 + }, + { + "auxiliary_loss_clip": 0.06638759, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 0.0630547, + "balance_loss_mlp": 0.0127685, + "epoch": 0.12914474673079815, + "flos": 22535312008320.0, + "grad_norm": 2.709676387149746, + "language_loss": 0.73026669, + "learning_rate": 3.897770599040521e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34326172, + "step": 2148, + "time_per_iteration": 2.5520236492156982 + }, + { + "auxiliary_loss_clip": 0.0663462, + "auxiliary_loss_mlp": 0.01329577, + "balance_loss_clip": 0.06310473, + "balance_loss_mlp": 0.01295626, + "epoch": 0.12920486998346611, + "flos": 21478533114240.0, + "grad_norm": 1.8799370652963014, + "language_loss": 0.80598587, + "learning_rate": 3.897647640729242e-06, + "loss": 0.88562787, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33959961, + "step": 2149, + "time_per_iteration": 3.9808621406555176 + }, + { + "auxiliary_loss_clip": 0.06633235, + "auxiliary_loss_mlp": 0.01311577, + "balance_loss_clip": 0.06304948, + "balance_loss_mlp": 0.01273907, + "epoch": 0.12926499323613408, + "flos": 27316001583360.0, + "grad_norm": 1.9848043356035314, + "language_loss": 0.77766216, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85711026, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37646484, + "step": 2150, + "time_per_iteration": 4.050567388534546 + }, + { + "auxiliary_loss_clip": 0.06637069, + "auxiliary_loss_mlp": 0.01309125, + "balance_loss_clip": 0.06305329, + "balance_loss_mlp": 0.01273791, + "epoch": 0.12932511648880204, + "flos": 22097710460160.0, + "grad_norm": 2.600129389398131, + "language_loss": 0.71828127, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79774326, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.35351562, + "step": 2151, + "time_per_iteration": 2.539199113845825 + }, + { + "auxiliary_loss_clip": 0.06638855, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 0.06308948, + "balance_loss_mlp": 0.01280716, + "epoch": 0.12938523974147, + "flos": 20308968224640.0, + "grad_norm": 2.09152011854814, + "language_loss": 0.85415232, + "learning_rate": 3.897278334060137e-06, + "loss": 0.93370569, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35766602, + "step": 2152, + "time_per_iteration": 4.064931631088257 + }, + { + "auxiliary_loss_clip": 0.06626976, + "auxiliary_loss_mlp": 0.0130895, + "balance_loss_clip": 0.06301983, + "balance_loss_mlp": 0.01275118, + "epoch": 0.12944536299413797, + "flos": 19505992947840.0, + "grad_norm": 2.0734690645371865, + "language_loss": 0.79983026, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87918949, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33837891, + "step": 2153, + "time_per_iteration": 3.9787750244140625 + }, + { + "auxiliary_loss_clip": 0.06634978, + "auxiliary_loss_mlp": 0.01296438, + "balance_loss_clip": 0.06309275, + "balance_loss_mlp": 0.01262845, + "epoch": 0.12950548624680594, + "flos": 27715099380480.0, + "grad_norm": 1.6134334939452253, + "language_loss": 0.81228089, + "learning_rate": 3.897031769881364e-06, + "loss": 0.89159513, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.3359375, + "step": 2154, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.06634305, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 0.06307935, + "balance_loss_mlp": 0.01267756, + "epoch": 0.12956560949947393, + "flos": 17571369553920.0, + "grad_norm": 5.013009585067341, + "language_loss": 0.84744835, + "learning_rate": 3.896908379886188e-06, + "loss": 0.92680323, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33422852, + "step": 2155, + "time_per_iteration": 2.512476921081543 + }, + { + "auxiliary_loss_clip": 0.06635429, + "auxiliary_loss_mlp": 0.01300286, + "balance_loss_clip": 0.06301479, + "balance_loss_mlp": 0.01265668, + "epoch": 0.1296257327521419, + "flos": 20746989043200.0, + "grad_norm": 7.629659850029062, + "language_loss": 0.77301121, + "learning_rate": 3.896784917960055e-06, + "loss": 0.85236835, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.34619141, + "step": 2156, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.06627367, + "auxiliary_loss_mlp": 0.01301012, + "balance_loss_clip": 0.06305566, + "balance_loss_mlp": 0.01268063, + "epoch": 0.12968585600480986, + "flos": 16400756488320.0, + "grad_norm": 2.322189413476167, + "language_loss": 0.88143146, + "learning_rate": 3.896661384107648e-06, + "loss": 0.96071517, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32910156, + "step": 2157, + "time_per_iteration": 2.571720838546753 + }, + { + "auxiliary_loss_clip": 0.06642087, + "auxiliary_loss_mlp": 0.0129196, + "balance_loss_clip": 0.06308718, + "balance_loss_mlp": 0.01257699, + "epoch": 0.12974597925747783, + "flos": 28337043911040.0, + "grad_norm": 2.3553612027238753, + "language_loss": 0.82135451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.90069497, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.34277344, + "step": 2158, + "time_per_iteration": 2.5973830223083496 + }, + { + "auxiliary_loss_clip": 0.06639753, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 0.06306097, + "balance_loss_mlp": 0.01263467, + "epoch": 0.1298061025101458, + "flos": 9687036746880.0, + "grad_norm": 2.577133138726625, + "language_loss": 0.76591945, + "learning_rate": 3.896414100642752e-06, + "loss": 0.84528571, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.33422852, + "step": 2159, + "time_per_iteration": 2.4932103157043457 + }, + { + "auxiliary_loss_clip": 0.06634657, + "auxiliary_loss_mlp": 0.01294131, + "balance_loss_clip": 0.06308954, + "balance_loss_mlp": 0.01261086, + "epoch": 0.12986622576281376, + "flos": 27716986097280.0, + "grad_norm": 2.475517406269625, + "language_loss": 0.83553314, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91482103, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33056641, + "step": 2160, + "time_per_iteration": 2.603818655014038 + }, + { + "auxiliary_loss_clip": 0.06627609, + "auxiliary_loss_mlp": 0.01293116, + "balance_loss_clip": 0.06306535, + "balance_loss_mlp": 0.01259118, + "epoch": 0.12992634901548175, + "flos": 18807963310080.0, + "grad_norm": 1.593154120113757, + "language_loss": 0.83271182, + "learning_rate": 3.896166529529008e-06, + "loss": 0.91191912, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 2161, + "time_per_iteration": 2.5266897678375244 + }, + { + "auxiliary_loss_clip": 0.06639621, + "auxiliary_loss_mlp": 0.01302779, + "balance_loss_clip": 0.06313581, + "balance_loss_mlp": 0.01268423, + "epoch": 0.12998647226814972, + "flos": 29134442891520.0, + "grad_norm": 2.3185391348432254, + "language_loss": 0.83230841, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91173244, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.34375, + "step": 2162, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.06644595, + "auxiliary_loss_mlp": 0.0130915, + "balance_loss_clip": 0.06308532, + "balance_loss_mlp": 0.01275485, + "epoch": 0.13004659552081768, + "flos": 19579855921920.0, + "grad_norm": 2.844531827385147, + "language_loss": 0.74537766, + "learning_rate": 3.895918670803968e-06, + "loss": 0.82491517, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33666992, + "step": 2163, + "time_per_iteration": 2.54642653465271 + }, + { + "auxiliary_loss_clip": 0.06640218, + "auxiliary_loss_mlp": 0.0130695, + "balance_loss_clip": 0.06307475, + "balance_loss_mlp": 0.01271259, + "epoch": 0.13010671877348565, + "flos": 22497059819520.0, + "grad_norm": 2.8300840640024605, + "language_loss": 0.82687104, + "learning_rate": 3.895794633598958e-06, + "loss": 0.90634274, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35668945, + "step": 2164, + "time_per_iteration": 2.5606889724731445 + }, + { + "auxiliary_loss_clip": 0.06643611, + "auxiliary_loss_mlp": 0.01308241, + "balance_loss_clip": 0.0631078, + "balance_loss_mlp": 0.0127317, + "epoch": 0.1301668420261536, + "flos": 23884985249280.0, + "grad_norm": 2.1372618334431004, + "language_loss": 0.72789967, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.80741817, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35058594, + "step": 2165, + "time_per_iteration": 2.5799126625061035 + }, + { + "auxiliary_loss_clip": 0.06653779, + "auxiliary_loss_mlp": 0.01315345, + "balance_loss_clip": 0.06317334, + "balance_loss_mlp": 0.0127932, + "epoch": 0.13022696527882158, + "flos": 23156963049600.0, + "grad_norm": 2.4025078023781563, + "language_loss": 0.76332915, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84302044, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.35986328, + "step": 2166, + "time_per_iteration": 2.6160640716552734 + }, + { + "auxiliary_loss_clip": 0.06650659, + "auxiliary_loss_mlp": 0.01325427, + "balance_loss_clip": 0.06318434, + "balance_loss_mlp": 0.01292144, + "epoch": 0.13028708853148954, + "flos": 26916149099520.0, + "grad_norm": 2.7267776489226945, + "language_loss": 0.84227574, + "learning_rate": 3.895422090670421e-06, + "loss": 0.92203659, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.33276367, + "step": 2167, + "time_per_iteration": 2.6118650436401367 + }, + { + "auxiliary_loss_clip": 0.0665281, + "auxiliary_loss_mlp": 0.01322266, + "balance_loss_clip": 0.06323615, + "balance_loss_mlp": 0.01284524, + "epoch": 0.13034721178415754, + "flos": 21257824659840.0, + "grad_norm": 1.882236850474067, + "language_loss": 0.84621233, + "learning_rate": 3.89529776593877e-06, + "loss": 0.9259631, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.37719727, + "step": 2168, + "time_per_iteration": 2.599341869354248 + }, + { + "auxiliary_loss_clip": 0.06651181, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 0.0631827, + "balance_loss_mlp": 0.01296166, + "epoch": 0.1304073350368255, + "flos": 18772646014080.0, + "grad_norm": 2.6769280516725495, + "language_loss": 0.81258374, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.89239597, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.33886719, + "step": 2169, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.06645042, + "auxiliary_loss_mlp": 0.01325755, + "balance_loss_clip": 0.06314517, + "balance_loss_mlp": 0.01290898, + "epoch": 0.13046745828949347, + "flos": 28371941936640.0, + "grad_norm": 2.6264294111585285, + "language_loss": 0.6902529, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.76996082, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.34838867, + "step": 2170, + "time_per_iteration": 2.636103868484497 + }, + { + "auxiliary_loss_clip": 0.06639146, + "auxiliary_loss_mlp": 0.01323013, + "balance_loss_clip": 0.063104, + "balance_loss_mlp": 0.01289826, + "epoch": 0.13052758154216143, + "flos": 29612518761600.0, + "grad_norm": 2.576487358768087, + "language_loss": 0.68392706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.76354867, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.33178711, + "step": 2171, + "time_per_iteration": 2.6055140495300293 + }, + { + "auxiliary_loss_clip": 0.06645554, + "auxiliary_loss_mlp": 0.01327149, + "balance_loss_clip": 0.06309786, + "balance_loss_mlp": 0.0129215, + "epoch": 0.1305877047948294, + "flos": 19396938458880.0, + "grad_norm": 3.1003670458212973, + "language_loss": 0.73706764, + "learning_rate": 3.894799748360537e-06, + "loss": 0.81679469, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35009766, + "step": 2172, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06633269, + "auxiliary_loss_mlp": 0.01311381, + "balance_loss_clip": 0.06310625, + "balance_loss_mlp": 0.01278884, + "epoch": 0.13064782804749736, + "flos": 16879209701760.0, + "grad_norm": 2.044770569718403, + "language_loss": 0.7695576, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84900403, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32470703, + "step": 2173, + "time_per_iteration": 2.5094704627990723 + }, + { + "auxiliary_loss_clip": 0.06648449, + "auxiliary_loss_mlp": 0.0132515, + "balance_loss_clip": 0.06310691, + "balance_loss_mlp": 0.01289125, + "epoch": 0.13070795130016533, + "flos": 24506049312000.0, + "grad_norm": 2.8505370909687575, + "language_loss": 0.725703, + "learning_rate": 3.894550308446551e-06, + "loss": 0.805439, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36035156, + "step": 2174, + "time_per_iteration": 2.5734338760375977 + }, + { + "auxiliary_loss_clip": 0.06505907, + "auxiliary_loss_mlp": 0.01291883, + "balance_loss_clip": 0.0631025, + "balance_loss_mlp": 0.0128004, + "epoch": 0.13076807455283332, + "flos": 71075288401920.0, + "grad_norm": 0.7747015133023086, + "language_loss": 0.58868217, + "learning_rate": 3.894425480724886e-06, + "loss": 0.66666007, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11834717, + "step": 2175, + "time_per_iteration": 3.2926440238952637 + }, + { + "auxiliary_loss_clip": 0.0663542, + "auxiliary_loss_mlp": 0.01313196, + "balance_loss_clip": 0.06304372, + "balance_loss_mlp": 0.01276337, + "epoch": 0.13082819780550128, + "flos": 20270380619520.0, + "grad_norm": 2.4663196598164543, + "language_loss": 0.8129558, + "learning_rate": 3.894300581166417e-06, + "loss": 0.89244199, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.36865234, + "step": 2176, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.06636009, + "auxiliary_loss_mlp": 0.01308249, + "balance_loss_clip": 0.06307728, + "balance_loss_mlp": 0.01275204, + "epoch": 0.13088832105816925, + "flos": 34211884101120.0, + "grad_norm": 2.555490160200695, + "language_loss": 0.75945169, + "learning_rate": 3.894175609775881e-06, + "loss": 0.83889425, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33056641, + "step": 2177, + "time_per_iteration": 2.666957378387451 + }, + { + "auxiliary_loss_clip": 0.06632685, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 0.0630488, + "balance_loss_mlp": 0.01266378, + "epoch": 0.13094844431083721, + "flos": 17900797080960.0, + "grad_norm": 1.8104390236362107, + "language_loss": 0.8256914, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90505755, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37548828, + "step": 2178, + "time_per_iteration": 2.5337579250335693 + }, + { + "auxiliary_loss_clip": 0.06635031, + "auxiliary_loss_mlp": 0.01298768, + "balance_loss_clip": 0.06305701, + "balance_loss_mlp": 0.01263625, + "epoch": 0.13100856756350518, + "flos": 17317062812160.0, + "grad_norm": 2.2347658227591327, + "language_loss": 0.76173234, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84107035, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 2179, + "time_per_iteration": 2.606982469558716 + }, + { + "auxiliary_loss_clip": 0.06624588, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 0.0630476, + "balance_loss_mlp": 0.01256281, + "epoch": 0.13106869081617314, + "flos": 22207142292480.0, + "grad_norm": 2.1299268574103074, + "language_loss": 0.85375142, + "learning_rate": 3.893800264659266e-06, + "loss": 0.93289196, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33154297, + "step": 2180, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.06632008, + "auxiliary_loss_mlp": 0.01298661, + "balance_loss_clip": 0.06304625, + "balance_loss_mlp": 0.01265282, + "epoch": 0.13112881406884114, + "flos": 21769708452480.0, + "grad_norm": 1.7694842435775522, + "language_loss": 0.9062323, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.98553902, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33374023, + "step": 2181, + "time_per_iteration": 2.5587892532348633 + }, + { + "auxiliary_loss_clip": 0.06634288, + "auxiliary_loss_mlp": 0.01294395, + "balance_loss_clip": 0.06307417, + "balance_loss_mlp": 0.01259776, + "epoch": 0.1311889373215091, + "flos": 23337784160640.0, + "grad_norm": 2.2247782487696557, + "language_loss": 0.70639372, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78568053, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34594727, + "step": 2182, + "time_per_iteration": 2.5555248260498047 + }, + { + "auxiliary_loss_clip": 0.06638541, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 0.06305085, + "balance_loss_mlp": 0.01272799, + "epoch": 0.13124906057417707, + "flos": 21473250307200.0, + "grad_norm": 2.348832160211932, + "language_loss": 0.79619586, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8756724, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.36303711, + "step": 2183, + "time_per_iteration": 2.6583075523376465 + }, + { + "auxiliary_loss_clip": 0.06622553, + "auxiliary_loss_mlp": 0.01296715, + "balance_loss_clip": 0.06301284, + "balance_loss_mlp": 0.0126379, + "epoch": 0.13130918382684503, + "flos": 23261531345280.0, + "grad_norm": 1.7633024883927577, + "language_loss": 0.86310816, + "learning_rate": 3.893298799142636e-06, + "loss": 0.94230086, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32910156, + "step": 2184, + "time_per_iteration": 2.565059185028076 + }, + { + "auxiliary_loss_clip": 0.06636564, + "auxiliary_loss_mlp": 0.01289356, + "balance_loss_clip": 0.06310757, + "balance_loss_mlp": 0.0125593, + "epoch": 0.131369307079513, + "flos": 20856588583680.0, + "grad_norm": 2.0374007595813106, + "language_loss": 0.83394486, + "learning_rate": 3.893173253266387e-06, + "loss": 0.91320401, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.33447266, + "step": 2185, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.06633392, + "auxiliary_loss_mlp": 0.01301523, + "balance_loss_clip": 0.063053, + "balance_loss_mlp": 0.012675, + "epoch": 0.13142943033218096, + "flos": 17864138119680.0, + "grad_norm": 2.061355049120503, + "language_loss": 0.7394222, + "learning_rate": 3.893047635600818e-06, + "loss": 0.8187713, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33984375, + "step": 2186, + "time_per_iteration": 2.5314900875091553 + }, + { + "auxiliary_loss_clip": 0.06633774, + "auxiliary_loss_mlp": 0.01305006, + "balance_loss_clip": 0.06309012, + "balance_loss_mlp": 0.01268337, + "epoch": 0.13148955358484893, + "flos": 21002343960960.0, + "grad_norm": 2.3237992911957748, + "language_loss": 0.8187871, + "learning_rate": 3.892921946150693e-06, + "loss": 0.89817482, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36669922, + "step": 2187, + "time_per_iteration": 2.575146198272705 + }, + { + "auxiliary_loss_clip": 0.0650041, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 0.06306808, + "balance_loss_mlp": 0.01287998, + "epoch": 0.13154967683751692, + "flos": 70192035313920.0, + "grad_norm": 0.8229480574179819, + "language_loss": 0.58883667, + "learning_rate": 3.892796184920778e-06, + "loss": 0.66687155, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.1505127, + "step": 2188, + "time_per_iteration": 4.631601572036743 + }, + { + "auxiliary_loss_clip": 0.06627252, + "auxiliary_loss_mlp": 0.01301964, + "balance_loss_clip": 0.06307825, + "balance_loss_mlp": 0.01268609, + "epoch": 0.1316098000901849, + "flos": 20382411928320.0, + "grad_norm": 1.8739878728488704, + "language_loss": 0.75486964, + "learning_rate": 3.892670351915842e-06, + "loss": 0.83416182, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.33300781, + "step": 2189, + "time_per_iteration": 4.007068395614624 + }, + { + "auxiliary_loss_clip": 0.06638934, + "auxiliary_loss_mlp": 0.01302262, + "balance_loss_clip": 0.06312171, + "balance_loss_mlp": 0.01267691, + "epoch": 0.13166992334285285, + "flos": 23227723422720.0, + "grad_norm": 2.019862807668573, + "language_loss": 0.73193908, + "learning_rate": 3.892544447140657e-06, + "loss": 0.81135106, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34570312, + "step": 2190, + "time_per_iteration": 2.5776755809783936 + }, + { + "auxiliary_loss_clip": 0.06636755, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06315562, + "balance_loss_mlp": 0.01266828, + "epoch": 0.13173004659552082, + "flos": 23337616452480.0, + "grad_norm": 1.8457361126651268, + "language_loss": 0.75608957, + "learning_rate": 3.892418470599996e-06, + "loss": 0.83545464, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32958984, + "step": 2191, + "time_per_iteration": 2.580988645553589 + }, + { + "auxiliary_loss_clip": 0.06637161, + "auxiliary_loss_mlp": 0.01295844, + "balance_loss_clip": 0.06311083, + "balance_loss_mlp": 0.01258699, + "epoch": 0.13179016984818878, + "flos": 21257866586880.0, + "grad_norm": 2.0212941585210613, + "language_loss": 0.80481809, + "learning_rate": 3.892292422298637e-06, + "loss": 0.88414812, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.37133789, + "step": 2192, + "time_per_iteration": 5.4770941734313965 + }, + { + "auxiliary_loss_clip": 0.06644538, + "auxiliary_loss_mlp": 0.01301425, + "balance_loss_clip": 0.06318243, + "balance_loss_mlp": 0.01265758, + "epoch": 0.13185029310085675, + "flos": 17783357184000.0, + "grad_norm": 2.540381366914011, + "language_loss": 0.86697793, + "learning_rate": 3.892166302241361e-06, + "loss": 0.94643748, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35693359, + "step": 2193, + "time_per_iteration": 2.5420453548431396 + }, + { + "auxiliary_loss_clip": 0.06500036, + "auxiliary_loss_mlp": 0.01269775, + "balance_loss_clip": 0.06307782, + "balance_loss_mlp": 0.01257103, + "epoch": 0.1319104163535247, + "flos": 69872586422400.0, + "grad_norm": 0.721919772393688, + "language_loss": 0.54093373, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.61863184, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12683105, + "step": 2194, + "time_per_iteration": 3.1521217823028564 + }, + { + "auxiliary_loss_clip": 0.06633582, + "auxiliary_loss_mlp": 0.01294441, + "balance_loss_clip": 0.06310762, + "balance_loss_mlp": 0.01261277, + "epoch": 0.1319705396061927, + "flos": 25200305516160.0, + "grad_norm": 1.726437316735012, + "language_loss": 0.7434622, + "learning_rate": 3.891913846878185e-06, + "loss": 0.82274246, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33154297, + "step": 2195, + "time_per_iteration": 2.593909740447998 + }, + { + "auxiliary_loss_clip": 0.06639563, + "auxiliary_loss_mlp": 0.01299138, + "balance_loss_clip": 0.0630713, + "balance_loss_mlp": 0.01264305, + "epoch": 0.13203066285886067, + "flos": 20746695553920.0, + "grad_norm": 1.9416785711103928, + "language_loss": 0.79390305, + "learning_rate": 3.891787511581859e-06, + "loss": 0.87329006, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34838867, + "step": 2196, + "time_per_iteration": 2.5824716091156006 + }, + { + "auxiliary_loss_clip": 0.06635743, + "auxiliary_loss_mlp": 0.01302288, + "balance_loss_clip": 0.06304654, + "balance_loss_mlp": 0.01269148, + "epoch": 0.13209078611152864, + "flos": 22060925717760.0, + "grad_norm": 8.075867999821003, + "language_loss": 0.76482284, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84420311, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33105469, + "step": 2197, + "time_per_iteration": 2.5501832962036133 + }, + { + "auxiliary_loss_clip": 0.06635305, + "auxiliary_loss_mlp": 0.01300777, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01266063, + "epoch": 0.1321509093641966, + "flos": 16289731428480.0, + "grad_norm": 2.9293196732039126, + "language_loss": 0.81022984, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88959062, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.34716797, + "step": 2198, + "time_per_iteration": 2.570861577987671 + }, + { + "auxiliary_loss_clip": 0.06631541, + "auxiliary_loss_mlp": 0.01313296, + "balance_loss_clip": 0.06305937, + "balance_loss_mlp": 0.01279513, + "epoch": 0.13221103261686457, + "flos": 16988725388160.0, + "grad_norm": 2.4451285716665914, + "language_loss": 0.83851683, + "learning_rate": 3.891408075291425e-06, + "loss": 0.91796517, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33764648, + "step": 2199, + "time_per_iteration": 2.521033525466919 + }, + { + "auxiliary_loss_clip": 0.06631772, + "auxiliary_loss_mlp": 0.01306909, + "balance_loss_clip": 0.06307507, + "balance_loss_mlp": 0.01272887, + "epoch": 0.13227115586953253, + "flos": 34240996195200.0, + "grad_norm": 1.9425616182298255, + "language_loss": 0.71189994, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.79128671, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34033203, + "step": 2200, + "time_per_iteration": 2.670046806335449 + }, + { + "auxiliary_loss_clip": 0.06617988, + "auxiliary_loss_mlp": 0.01304715, + "balance_loss_clip": 0.06300868, + "balance_loss_mlp": 0.01274341, + "epoch": 0.13233127912220052, + "flos": 20711000914560.0, + "grad_norm": 2.1724926946699754, + "language_loss": 0.86090875, + "learning_rate": 3.891154759144557e-06, + "loss": 0.94013584, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.30371094, + "step": 2201, + "time_per_iteration": 2.570223569869995 + }, + { + "auxiliary_loss_clip": 0.06631213, + "auxiliary_loss_mlp": 0.01297349, + "balance_loss_clip": 0.06304044, + "balance_loss_mlp": 0.01263828, + "epoch": 0.1323914023748685, + "flos": 25810971672960.0, + "grad_norm": 1.9172071001088793, + "language_loss": 0.87768662, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95697218, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33496094, + "step": 2202, + "time_per_iteration": 2.6102631092071533 + }, + { + "auxiliary_loss_clip": 0.06636258, + "auxiliary_loss_mlp": 0.012969, + "balance_loss_clip": 0.06311007, + "balance_loss_mlp": 0.01264427, + "epoch": 0.13245152562753645, + "flos": 21257908513920.0, + "grad_norm": 2.5432278039111202, + "language_loss": 0.73953617, + "learning_rate": 3.89090115614658e-06, + "loss": 0.81886774, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32470703, + "step": 2203, + "time_per_iteration": 2.582125425338745 + }, + { + "auxiliary_loss_clip": 0.0663885, + "auxiliary_loss_mlp": 0.01297802, + "balance_loss_clip": 0.06312627, + "balance_loss_mlp": 0.01266879, + "epoch": 0.13251164888020442, + "flos": 26617552675200.0, + "grad_norm": 2.0999892579623918, + "language_loss": 0.74886954, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82823604, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30883789, + "step": 2204, + "time_per_iteration": 2.634873867034912 + }, + { + "auxiliary_loss_clip": 0.06637383, + "auxiliary_loss_mlp": 0.01309474, + "balance_loss_clip": 0.06314126, + "balance_loss_mlp": 0.01276119, + "epoch": 0.13257177213287238, + "flos": 29834485027200.0, + "grad_norm": 2.4895096645832235, + "language_loss": 0.79621047, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87567902, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33349609, + "step": 2205, + "time_per_iteration": 2.619999647140503 + }, + { + "auxiliary_loss_clip": 0.06630976, + "auxiliary_loss_mlp": 0.01295213, + "balance_loss_clip": 0.06307817, + "balance_loss_mlp": 0.01261548, + "epoch": 0.13263189538554035, + "flos": 21294902891520.0, + "grad_norm": 2.228894402461185, + "language_loss": 0.80627573, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88553762, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.33666992, + "step": 2206, + "time_per_iteration": 2.5711123943328857 + }, + { + "auxiliary_loss_clip": 0.06638241, + "auxiliary_loss_mlp": 0.01297492, + "balance_loss_clip": 0.06313571, + "balance_loss_mlp": 0.0126676, + "epoch": 0.13269201863820831, + "flos": 16879880534400.0, + "grad_norm": 2.2771237083056297, + "language_loss": 0.76153713, + "learning_rate": 3.890393089751208e-06, + "loss": 0.84089446, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30688477, + "step": 2207, + "time_per_iteration": 2.5054686069488525 + }, + { + "auxiliary_loss_clip": 0.06632576, + "auxiliary_loss_mlp": 0.01289317, + "balance_loss_clip": 0.06313936, + "balance_loss_mlp": 0.01259014, + "epoch": 0.1327521418908763, + "flos": 23775679198080.0, + "grad_norm": 2.287917678450009, + "language_loss": 0.85195792, + "learning_rate": 3.890265893930578e-06, + "loss": 0.9311769, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30322266, + "step": 2208, + "time_per_iteration": 2.609978675842285 + }, + { + "auxiliary_loss_clip": 0.0661916, + "auxiliary_loss_mlp": 0.0129287, + "balance_loss_clip": 0.06309634, + "balance_loss_mlp": 0.01263712, + "epoch": 0.13281226514354427, + "flos": 26512858598400.0, + "grad_norm": 2.1774657992842923, + "language_loss": 0.86578667, + "learning_rate": 3.890138626430876e-06, + "loss": 0.94490695, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29174805, + "step": 2209, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.06630558, + "auxiliary_loss_mlp": 0.01296527, + "balance_loss_clip": 0.06307525, + "balance_loss_mlp": 0.01264817, + "epoch": 0.13287238839621224, + "flos": 24505671968640.0, + "grad_norm": 2.0974790857001255, + "language_loss": 0.83324587, + "learning_rate": 3.890011287256929e-06, + "loss": 0.91251671, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31689453, + "step": 2210, + "time_per_iteration": 2.605640172958374 + }, + { + "auxiliary_loss_clip": 0.06520031, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06330763, + "balance_loss_mlp": 0.01256634, + "epoch": 0.1329325116488802, + "flos": 67713984264960.0, + "grad_norm": 0.7321997743468096, + "language_loss": 0.57977009, + "learning_rate": 3.889883876413563e-06, + "loss": 0.65765154, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11456299, + "step": 2211, + "time_per_iteration": 3.2822937965393066 + }, + { + "auxiliary_loss_clip": 0.06521661, + "auxiliary_loss_mlp": 0.01258942, + "balance_loss_clip": 0.0633207, + "balance_loss_mlp": 0.01247897, + "epoch": 0.13299263490154817, + "flos": 72283440896640.0, + "grad_norm": 0.7669964089142771, + "language_loss": 0.54991639, + "learning_rate": 3.889756393905611e-06, + "loss": 0.62772238, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.1105957, + "step": 2212, + "time_per_iteration": 3.2838916778564453 + }, + { + "auxiliary_loss_clip": 0.0664072, + "auxiliary_loss_mlp": 0.01298095, + "balance_loss_clip": 0.06314459, + "balance_loss_mlp": 0.012661, + "epoch": 0.13305275815421613, + "flos": 17937078698880.0, + "grad_norm": 3.2445802523020144, + "language_loss": 0.75483733, + "learning_rate": 3.889628839737908e-06, + "loss": 0.83422554, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31982422, + "step": 2213, + "time_per_iteration": 2.599457025527954 + }, + { + "auxiliary_loss_clip": 0.06623878, + "auxiliary_loss_mlp": 0.01290528, + "balance_loss_clip": 0.06308766, + "balance_loss_mlp": 0.01260917, + "epoch": 0.13311288140688413, + "flos": 22346566686720.0, + "grad_norm": 1.7850496574832224, + "language_loss": 0.80468798, + "learning_rate": 3.889501213915291e-06, + "loss": 0.88383198, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29614258, + "step": 2214, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.06633762, + "auxiliary_loss_mlp": 0.01291249, + "balance_loss_clip": 0.06310902, + "balance_loss_mlp": 0.01259992, + "epoch": 0.1331730046595521, + "flos": 31877030880000.0, + "grad_norm": 1.879682062967662, + "language_loss": 0.71106076, + "learning_rate": 3.889373516442597e-06, + "loss": 0.79031086, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.3125, + "step": 2215, + "time_per_iteration": 2.6289784908294678 + }, + { + "auxiliary_loss_clip": 0.06635362, + "auxiliary_loss_mlp": 0.01297639, + "balance_loss_clip": 0.06308068, + "balance_loss_mlp": 0.01264762, + "epoch": 0.13323312791222006, + "flos": 22573438416000.0, + "grad_norm": 2.1877299894623063, + "language_loss": 0.81866241, + "learning_rate": 3.889245747324671e-06, + "loss": 0.89799237, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32861328, + "step": 2216, + "time_per_iteration": 2.5978689193725586 + }, + { + "auxiliary_loss_clip": 0.06628902, + "auxiliary_loss_mlp": 0.01291342, + "balance_loss_clip": 0.06306753, + "balance_loss_mlp": 0.01260229, + "epoch": 0.13329325116488802, + "flos": 15090635174400.0, + "grad_norm": 1.945076656101512, + "language_loss": 0.8810879, + "learning_rate": 3.889117906566356e-06, + "loss": 0.96029037, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3112793, + "step": 2217, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.0662536, + "auxiliary_loss_mlp": 0.0129587, + "balance_loss_clip": 0.06307805, + "balance_loss_mlp": 0.01262563, + "epoch": 0.133353374417556, + "flos": 27461002273920.0, + "grad_norm": 2.771116888328456, + "language_loss": 0.75384659, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33349609, + "step": 2218, + "time_per_iteration": 2.5716331005096436 + }, + { + "auxiliary_loss_clip": 0.06631406, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 0.06307958, + "balance_loss_mlp": 0.01259875, + "epoch": 0.13341349767022395, + "flos": 24101081729280.0, + "grad_norm": 1.6852729372488615, + "language_loss": 0.88550645, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.96475214, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33300781, + "step": 2219, + "time_per_iteration": 2.6070170402526855 + }, + { + "auxiliary_loss_clip": 0.06621003, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.06303806, + "balance_loss_mlp": 0.01257381, + "epoch": 0.13347362092289192, + "flos": 24140088604800.0, + "grad_norm": 2.0906842838932556, + "language_loss": 0.7815029, + "learning_rate": 3.888733954497574e-06, + "loss": 0.86059141, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.3046875, + "step": 2220, + "time_per_iteration": 2.5560426712036133 + }, + { + "auxiliary_loss_clip": 0.06625573, + "auxiliary_loss_mlp": 0.01294385, + "balance_loss_clip": 0.06307516, + "balance_loss_mlp": 0.0126432, + "epoch": 0.1335337441755599, + "flos": 18441499478400.0, + "grad_norm": 3.5848326197945974, + "language_loss": 0.80259734, + "learning_rate": 3.888605827226212e-06, + "loss": 0.88179696, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 2221, + "time_per_iteration": 2.554230213165283 + }, + { + "auxiliary_loss_clip": 0.06500886, + "auxiliary_loss_mlp": 0.01279151, + "balance_loss_clip": 0.06314573, + "balance_loss_mlp": 0.01265382, + "epoch": 0.13359386742822787, + "flos": 50627608542720.0, + "grad_norm": 0.9620548374199929, + "language_loss": 0.69134498, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.76914537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.13806152, + "step": 2222, + "time_per_iteration": 3.0396814346313477 + }, + { + "auxiliary_loss_clip": 0.0662626, + "auxiliary_loss_mlp": 0.01285858, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01257987, + "epoch": 0.13365399068089584, + "flos": 22784294016000.0, + "grad_norm": 6.993006748631453, + "language_loss": 0.68394774, + "learning_rate": 3.888349357839982e-06, + "loss": 0.76306891, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.27856445, + "step": 2223, + "time_per_iteration": 2.6058313846588135 + }, + { + "auxiliary_loss_clip": 0.06624826, + "auxiliary_loss_mlp": 0.01288517, + "balance_loss_clip": 0.06304329, + "balance_loss_mlp": 0.01257296, + "epoch": 0.1337141139335638, + "flos": 12536540945280.0, + "grad_norm": 2.4608215865303937, + "language_loss": 0.8412739, + "learning_rate": 3.88822101573484e-06, + "loss": 0.9204073, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.31213379, + "step": 2224, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.066294, + "auxiliary_loss_mlp": 0.01287352, + "balance_loss_clip": 0.06301981, + "balance_loss_mlp": 0.01255499, + "epoch": 0.13377423718623177, + "flos": 23045560646400.0, + "grad_norm": 2.2168840240666294, + "language_loss": 0.67877412, + "learning_rate": 3.888092602028167e-06, + "loss": 0.7579416, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.31835938, + "step": 2225, + "time_per_iteration": 2.567253589630127 + }, + { + "auxiliary_loss_clip": 0.06627665, + "auxiliary_loss_mlp": 0.01285599, + "balance_loss_clip": 0.06307095, + "balance_loss_mlp": 0.01257406, + "epoch": 0.13383436043889974, + "flos": 16221905948160.0, + "grad_norm": 2.1695875347778184, + "language_loss": 0.90785301, + "learning_rate": 3.887964116724835e-06, + "loss": 0.98698568, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.28186035, + "step": 2226, + "time_per_iteration": 2.6064305305480957 + }, + { + "auxiliary_loss_clip": 0.06623043, + "auxiliary_loss_mlp": 0.0129267, + "balance_loss_clip": 0.06300287, + "balance_loss_mlp": 0.01261771, + "epoch": 0.1338944836915677, + "flos": 24286514814720.0, + "grad_norm": 2.574481606503262, + "language_loss": 0.75021911, + "learning_rate": 3.887835559829712e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30883789, + "step": 2227, + "time_per_iteration": 4.016468286514282 + }, + { + "auxiliary_loss_clip": 0.06618345, + "auxiliary_loss_mlp": 0.01292665, + "balance_loss_clip": 0.0629885, + "balance_loss_mlp": 0.01261265, + "epoch": 0.1339546069442357, + "flos": 17603793884160.0, + "grad_norm": 2.0025343623105214, + "language_loss": 0.8591758, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.93828595, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31396484, + "step": 2228, + "time_per_iteration": 2.55798077583313 + }, + { + "auxiliary_loss_clip": 0.06615113, + "auxiliary_loss_mlp": 0.01284588, + "balance_loss_clip": 0.06298958, + "balance_loss_mlp": 0.01255548, + "epoch": 0.13401473019690366, + "flos": 18996163580160.0, + "grad_norm": 1.8879365390563052, + "language_loss": 0.82201439, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.90101147, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.29052734, + "step": 2229, + "time_per_iteration": 4.120098829269409 + }, + { + "auxiliary_loss_clip": 0.06619616, + "auxiliary_loss_mlp": 0.01290736, + "balance_loss_clip": 0.06300908, + "balance_loss_mlp": 0.01259849, + "epoch": 0.13407485344957162, + "flos": 26951214833280.0, + "grad_norm": 2.2979177943800386, + "language_loss": 0.7564404, + "learning_rate": 3.887449459642378e-06, + "loss": 0.83554387, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30871582, + "step": 2230, + "time_per_iteration": 2.6150131225585938 + }, + { + "auxiliary_loss_clip": 0.06620437, + "auxiliary_loss_mlp": 0.01289621, + "balance_loss_clip": 0.06302108, + "balance_loss_mlp": 0.01261059, + "epoch": 0.1341349767022396, + "flos": 20345585258880.0, + "grad_norm": 1.8496833611889134, + "language_loss": 0.81113201, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.89023262, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.28564453, + "step": 2231, + "time_per_iteration": 2.5791971683502197 + }, + { + "auxiliary_loss_clip": 0.06629717, + "auxiliary_loss_mlp": 0.01304097, + "balance_loss_clip": 0.0629984, + "balance_loss_mlp": 0.01268811, + "epoch": 0.13419509995490755, + "flos": 29869802323200.0, + "grad_norm": 3.0058197712179218, + "language_loss": 0.73244405, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81178224, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3527832, + "step": 2232, + "time_per_iteration": 4.126416444778442 + }, + { + "auxiliary_loss_clip": 0.06625827, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 0.06298069, + "balance_loss_mlp": 0.01260052, + "epoch": 0.13425522320757552, + "flos": 26950250511360.0, + "grad_norm": 2.8502119867979823, + "language_loss": 0.67005944, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.74924242, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.32421875, + "step": 2233, + "time_per_iteration": 2.57535457611084 + }, + { + "auxiliary_loss_clip": 0.0661561, + "auxiliary_loss_mlp": 0.01292122, + "balance_loss_clip": 0.0629602, + "balance_loss_mlp": 0.0126096, + "epoch": 0.1343153464602435, + "flos": 15782501537280.0, + "grad_norm": 2.818232021038303, + "language_loss": 0.82633889, + "learning_rate": 3.886933657403615e-06, + "loss": 0.90541625, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.31176758, + "step": 2234, + "time_per_iteration": 2.5729787349700928 + }, + { + "auxiliary_loss_clip": 0.06617501, + "auxiliary_loss_mlp": 0.01296303, + "balance_loss_clip": 0.06299153, + "balance_loss_mlp": 0.01266668, + "epoch": 0.13437546971291148, + "flos": 24321370913280.0, + "grad_norm": 2.028590274897441, + "language_loss": 0.82841778, + "learning_rate": 3.886804527949909e-06, + "loss": 0.90755594, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.29638672, + "step": 2235, + "time_per_iteration": 2.593050241470337 + }, + { + "auxiliary_loss_clip": 0.06612507, + "auxiliary_loss_mlp": 0.01293723, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01261989, + "epoch": 0.13443559296557944, + "flos": 26657817361920.0, + "grad_norm": 1.9716678370354759, + "language_loss": 0.87708902, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.95615125, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31738281, + "step": 2236, + "time_per_iteration": 2.5910720825195312 + }, + { + "auxiliary_loss_clip": 0.06621092, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 0.06297852, + "balance_loss_mlp": 0.012627, + "epoch": 0.1344957162182474, + "flos": 21802216636800.0, + "grad_norm": 1.7646832896946034, + "language_loss": 0.78455186, + "learning_rate": 3.886546054403946e-06, + "loss": 0.86371022, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32080078, + "step": 2237, + "time_per_iteration": 2.5423593521118164 + }, + { + "auxiliary_loss_clip": 0.06621015, + "auxiliary_loss_mlp": 0.01296744, + "balance_loss_clip": 0.06297819, + "balance_loss_mlp": 0.01263746, + "epoch": 0.13455583947091537, + "flos": 19871785946880.0, + "grad_norm": 2.139876962287315, + "language_loss": 0.80559266, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88477021, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33007812, + "step": 2238, + "time_per_iteration": 2.547511100769043 + }, + { + "auxiliary_loss_clip": 0.0662026, + "auxiliary_loss_mlp": 0.01290468, + "balance_loss_clip": 0.06300892, + "balance_loss_mlp": 0.0125945, + "epoch": 0.13461596272358334, + "flos": 30854730741120.0, + "grad_norm": 2.2946937997388983, + "language_loss": 0.69019175, + "learning_rate": 3.886287294705924e-06, + "loss": 0.76929903, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.31005859, + "step": 2239, + "time_per_iteration": 2.6161396503448486 + }, + { + "auxiliary_loss_clip": 0.06626255, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 0.06302193, + "balance_loss_mlp": 0.01262609, + "epoch": 0.1346760859762513, + "flos": 12499253078400.0, + "grad_norm": 2.740092234793679, + "language_loss": 0.83294439, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.91215271, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31958008, + "step": 2240, + "time_per_iteration": 2.531810998916626 + }, + { + "auxiliary_loss_clip": 0.06621873, + "auxiliary_loss_mlp": 0.01289824, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01256278, + "epoch": 0.1347362092289193, + "flos": 21842607104640.0, + "grad_norm": 1.6487000610588447, + "language_loss": 0.78665066, + "learning_rate": 3.886028248895093e-06, + "loss": 0.86576766, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33569336, + "step": 2241, + "time_per_iteration": 2.5346198081970215 + }, + { + "auxiliary_loss_clip": 0.06618196, + "auxiliary_loss_mlp": 0.01285675, + "balance_loss_clip": 0.06305367, + "balance_loss_mlp": 0.01256636, + "epoch": 0.13479633248158726, + "flos": 23515502670720.0, + "grad_norm": 1.8184249012274396, + "language_loss": 0.84641361, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92545235, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.29052734, + "step": 2242, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0662721, + "auxiliary_loss_mlp": 0.01293952, + "balance_loss_clip": 0.06304164, + "balance_loss_mlp": 0.01261098, + "epoch": 0.13485645573425523, + "flos": 29350874787840.0, + "grad_norm": 2.677815565759994, + "language_loss": 0.66332561, + "learning_rate": 3.885768917010744e-06, + "loss": 0.74253726, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32836914, + "step": 2243, + "time_per_iteration": 2.599304437637329 + }, + { + "auxiliary_loss_clip": 0.06611082, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06295401, + "balance_loss_mlp": 0.01256042, + "epoch": 0.1349165789869232, + "flos": 28044484980480.0, + "grad_norm": 1.4756823100545766, + "language_loss": 0.73444742, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.81340563, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.28662109, + "step": 2244, + "time_per_iteration": 2.640366554260254 + }, + { + "auxiliary_loss_clip": 0.06614108, + "auxiliary_loss_mlp": 0.01291938, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01260133, + "epoch": 0.13497670223959116, + "flos": 22859834071680.0, + "grad_norm": 7.9965666613423, + "language_loss": 0.87522435, + "learning_rate": 3.88550929909221e-06, + "loss": 0.95428485, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.31787109, + "step": 2245, + "time_per_iteration": 2.537259340286255 + }, + { + "auxiliary_loss_clip": 0.06609753, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.0126119, + "epoch": 0.13503682549225912, + "flos": 16509517488000.0, + "grad_norm": 1.6351770671547161, + "language_loss": 0.80275553, + "learning_rate": 3.88537938288243e-06, + "loss": 0.88176376, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29858398, + "step": 2246, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.06503996, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01256631, + "epoch": 0.1350969487449271, + "flos": 70775979217920.0, + "grad_norm": 0.7288766997222871, + "language_loss": 0.60674834, + "learning_rate": 3.885249395178874e-06, + "loss": 0.68447095, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.11621094, + "step": 2247, + "time_per_iteration": 3.295891046524048 + }, + { + "auxiliary_loss_clip": 0.06638567, + "auxiliary_loss_mlp": 0.01298182, + "balance_loss_clip": 0.06305797, + "balance_loss_mlp": 0.01262229, + "epoch": 0.13515707199759508, + "flos": 23082680805120.0, + "grad_norm": 2.7104639981136662, + "language_loss": 0.82279253, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90216005, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.359375, + "step": 2248, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.06606994, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 0.0629556, + "balance_loss_mlp": 0.01255013, + "epoch": 0.13521719525026304, + "flos": 23193244667520.0, + "grad_norm": 1.8435286673705464, + "language_loss": 0.7853781, + "learning_rate": 3.884989205310157e-06, + "loss": 0.86428809, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.2902832, + "step": 2249, + "time_per_iteration": 2.5745737552642822 + }, + { + "auxiliary_loss_clip": 0.06615513, + "auxiliary_loss_mlp": 0.01290474, + "balance_loss_clip": 0.06300813, + "balance_loss_mlp": 0.01262293, + "epoch": 0.135277318502931, + "flos": 24797937409920.0, + "grad_norm": 1.7186486055988894, + "language_loss": 0.86064833, + "learning_rate": 3.884859003154862e-06, + "loss": 0.93970823, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28210449, + "step": 2250, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.06621417, + "auxiliary_loss_mlp": 0.01303153, + "balance_loss_clip": 0.06298415, + "balance_loss_mlp": 0.01270108, + "epoch": 0.13533744175559898, + "flos": 21915044559360.0, + "grad_norm": 3.4195422131585564, + "language_loss": 0.83116192, + "learning_rate": 3.884728729525524e-06, + "loss": 0.91040766, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.33032227, + "step": 2251, + "time_per_iteration": 2.5615222454071045 + }, + { + "auxiliary_loss_clip": 0.066163, + "auxiliary_loss_mlp": 0.01290158, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01258579, + "epoch": 0.13539756500826694, + "flos": 21217434192000.0, + "grad_norm": 1.7358628614083547, + "language_loss": 0.86943758, + "learning_rate": 3.884598384427084e-06, + "loss": 0.94850212, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.31555176, + "step": 2252, + "time_per_iteration": 2.5325772762298584 + }, + { + "auxiliary_loss_clip": 0.06482528, + "auxiliary_loss_mlp": 0.01279879, + "balance_loss_clip": 0.06294215, + "balance_loss_mlp": 0.01267404, + "epoch": 0.1354576882609349, + "flos": 63260835500160.0, + "grad_norm": 0.7528010548037618, + "language_loss": 0.61151105, + "learning_rate": 3.884467967864485e-06, + "loss": 0.68913507, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.12481689, + "step": 2253, + "time_per_iteration": 3.2731101512908936 + }, + { + "auxiliary_loss_clip": 0.06617865, + "auxiliary_loss_mlp": 0.01297527, + "balance_loss_clip": 0.06298327, + "balance_loss_mlp": 0.01266961, + "epoch": 0.1355178115136029, + "flos": 25489971480960.0, + "grad_norm": 1.734180018549956, + "language_loss": 0.90171039, + "learning_rate": 3.884337479842671e-06, + "loss": 0.98086423, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.30517578, + "step": 2254, + "time_per_iteration": 2.5830373764038086 + }, + { + "auxiliary_loss_clip": 0.06624171, + "auxiliary_loss_mlp": 0.01291824, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01259709, + "epoch": 0.13557793476627086, + "flos": 21623491877760.0, + "grad_norm": 2.5405517045767865, + "language_loss": 0.85834336, + "learning_rate": 3.884206920366591e-06, + "loss": 0.93750322, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.32104492, + "step": 2255, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.06615041, + "auxiliary_loss_mlp": 0.01294235, + "balance_loss_clip": 0.06296261, + "balance_loss_mlp": 0.01264862, + "epoch": 0.13563805801893883, + "flos": 24933839932800.0, + "grad_norm": 2.4937460094050534, + "language_loss": 0.7602762, + "learning_rate": 3.884076289441196e-06, + "loss": 0.83936894, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.29370117, + "step": 2256, + "time_per_iteration": 2.5914275646209717 + }, + { + "auxiliary_loss_clip": 0.06621285, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06294358, + "balance_loss_mlp": 0.01257563, + "epoch": 0.1356981812716068, + "flos": 14754415466880.0, + "grad_norm": 2.129121942862091, + "language_loss": 0.84234703, + "learning_rate": 3.88394558707144e-06, + "loss": 0.92144954, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.31420898, + "step": 2257, + "time_per_iteration": 2.5664286613464355 + }, + { + "auxiliary_loss_clip": 0.06630847, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06299773, + "balance_loss_mlp": 0.01259658, + "epoch": 0.13575830452427476, + "flos": 11113256292480.0, + "grad_norm": 1.9364367185101232, + "language_loss": 0.83362973, + "learning_rate": 3.883814813262277e-06, + "loss": 0.91286373, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.32910156, + "step": 2258, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.06621088, + "auxiliary_loss_mlp": 0.01297355, + "balance_loss_clip": 0.0629478, + "balance_loss_mlp": 0.01264858, + "epoch": 0.13581842777694272, + "flos": 17964849127680.0, + "grad_norm": 2.721301656824917, + "language_loss": 0.83752787, + "learning_rate": 3.883683968018669e-06, + "loss": 0.91671234, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.32519531, + "step": 2259, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.0660786, + "auxiliary_loss_mlp": 0.01289157, + "balance_loss_clip": 0.06291058, + "balance_loss_mlp": 0.01260952, + "epoch": 0.1358785510296107, + "flos": 22863817140480.0, + "grad_norm": 2.0214358343175927, + "language_loss": 0.74903429, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.82800448, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28198242, + "step": 2260, + "time_per_iteration": 2.5302374362945557 + }, + { + "auxiliary_loss_clip": 0.0660997, + "auxiliary_loss_mlp": 0.0129096, + "balance_loss_clip": 0.06293269, + "balance_loss_mlp": 0.01260859, + "epoch": 0.13593867428227868, + "flos": 25746542282880.0, + "grad_norm": 2.2338901691781925, + "language_loss": 0.76686287, + "learning_rate": 3.883422063247961e-06, + "loss": 0.84587216, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30114746, + "step": 2261, + "time_per_iteration": 2.5939574241638184 + }, + { + "auxiliary_loss_clip": 0.06616522, + "auxiliary_loss_mlp": 0.01291008, + "balance_loss_clip": 0.0629552, + "balance_loss_mlp": 0.01259132, + "epoch": 0.13599879753494665, + "flos": 31257350409600.0, + "grad_norm": 2.2895573692407547, + "language_loss": 0.6521523, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73122764, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.31884766, + "step": 2262, + "time_per_iteration": 2.615324020385742 + }, + { + "auxiliary_loss_clip": 0.0662135, + "auxiliary_loss_mlp": 0.01300411, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01269584, + "epoch": 0.1360589207876146, + "flos": 23921853845760.0, + "grad_norm": 2.421989013841254, + "language_loss": 0.84175652, + "learning_rate": 3.883159872799043e-06, + "loss": 0.92097414, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.30859375, + "step": 2263, + "time_per_iteration": 2.5566399097442627 + }, + { + "auxiliary_loss_clip": 0.06629188, + "auxiliary_loss_mlp": 0.01291754, + "balance_loss_clip": 0.06304573, + "balance_loss_mlp": 0.0125859, + "epoch": 0.13611904404028258, + "flos": 19980295384320.0, + "grad_norm": 2.5264058207475215, + "language_loss": 0.89336157, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.97257102, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33178711, + "step": 2264, + "time_per_iteration": 2.5305962562561035 + }, + { + "auxiliary_loss_clip": 0.06637362, + "auxiliary_loss_mlp": 0.0129781, + "balance_loss_clip": 0.06308438, + "balance_loss_mlp": 0.01265195, + "epoch": 0.13617916729295054, + "flos": 15345990092160.0, + "grad_norm": 2.7927094576438716, + "language_loss": 0.71764517, + "learning_rate": 3.882897396711683e-06, + "loss": 0.79699689, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.32617188, + "step": 2265, + "time_per_iteration": 2.561797857284546 + }, + { + "auxiliary_loss_clip": 0.06615983, + "auxiliary_loss_mlp": 0.01290453, + "balance_loss_clip": 0.06299248, + "balance_loss_mlp": 0.01262034, + "epoch": 0.1362392905456185, + "flos": 27458402797440.0, + "grad_norm": 2.5604448311617825, + "language_loss": 0.67458075, + "learning_rate": 3.882766051566027e-06, + "loss": 0.75364506, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.28381348, + "step": 2266, + "time_per_iteration": 2.5694286823272705 + }, + { + "auxiliary_loss_clip": 0.06624304, + "auxiliary_loss_mlp": 0.01294932, + "balance_loss_clip": 0.06304609, + "balance_loss_mlp": 0.01263711, + "epoch": 0.1362994137982865, + "flos": 25015920606720.0, + "grad_norm": 2.0527906242943983, + "language_loss": 0.77445233, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85364473, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31237793, + "step": 2267, + "time_per_iteration": 4.004362106323242 + }, + { + "auxiliary_loss_clip": 0.06632047, + "auxiliary_loss_mlp": 0.01290209, + "balance_loss_clip": 0.0631062, + "balance_loss_mlp": 0.01259882, + "epoch": 0.13635953705095447, + "flos": 20309261713920.0, + "grad_norm": 1.8370610095313742, + "language_loss": 0.836191, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91541362, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30322266, + "step": 2268, + "time_per_iteration": 3.9506208896636963 + }, + { + "auxiliary_loss_clip": 0.06630498, + "auxiliary_loss_mlp": 0.01294319, + "balance_loss_clip": 0.06311751, + "balance_loss_mlp": 0.01262013, + "epoch": 0.13641966030362243, + "flos": 31366530679680.0, + "grad_norm": 1.9828007462930386, + "language_loss": 0.7747438, + "learning_rate": 3.882371587780931e-06, + "loss": 0.85399193, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.32299805, + "step": 2269, + "time_per_iteration": 2.653453826904297 + }, + { + "auxiliary_loss_clip": 0.06638865, + "auxiliary_loss_mlp": 0.01296587, + "balance_loss_clip": 0.06316057, + "balance_loss_mlp": 0.0126545, + "epoch": 0.1364797835562904, + "flos": 20483122936320.0, + "grad_norm": 2.359526754249971, + "language_loss": 0.8236903, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90304482, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31152344, + "step": 2270, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.06635441, + "auxiliary_loss_mlp": 0.01293131, + "balance_loss_clip": 0.06311204, + "balance_loss_mlp": 0.01261254, + "epoch": 0.13653990680895836, + "flos": 13083280836480.0, + "grad_norm": 2.670574241660613, + "language_loss": 0.77002323, + "learning_rate": 3.882108255017295e-06, + "loss": 0.84930891, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.31884766, + "step": 2271, + "time_per_iteration": 3.976745367050171 + }, + { + "auxiliary_loss_clip": 0.06636623, + "auxiliary_loss_mlp": 0.01296686, + "balance_loss_clip": 0.06313315, + "balance_loss_mlp": 0.0126419, + "epoch": 0.13660003006162633, + "flos": 16952443770240.0, + "grad_norm": 2.320627701174975, + "language_loss": 0.81754398, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89687717, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32495117, + "step": 2272, + "time_per_iteration": 4.03596043586731 + }, + { + "auxiliary_loss_clip": 0.0650102, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 0.06312356, + "balance_loss_mlp": 0.01255327, + "epoch": 0.1366601533142943, + "flos": 68703105386880.0, + "grad_norm": 0.6745755938751765, + "language_loss": 0.60570937, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.68338782, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.11480713, + "step": 2273, + "time_per_iteration": 3.287332534790039 + }, + { + "auxiliary_loss_clip": 0.06625689, + "auxiliary_loss_mlp": 0.01290706, + "balance_loss_clip": 0.06308322, + "balance_loss_mlp": 0.01259831, + "epoch": 0.13672027656696228, + "flos": 19250176832640.0, + "grad_norm": 1.730825672757131, + "language_loss": 0.79225731, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87142122, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.30883789, + "step": 2274, + "time_per_iteration": 2.562556743621826 + }, + { + "auxiliary_loss_clip": 0.06626303, + "auxiliary_loss_mlp": 0.01302977, + "balance_loss_clip": 0.06308225, + "balance_loss_mlp": 0.01270457, + "epoch": 0.13678039981963025, + "flos": 24541785878400.0, + "grad_norm": 2.937872524874316, + "language_loss": 0.79763901, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87693179, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.32519531, + "step": 2275, + "time_per_iteration": 2.560577630996704 + }, + { + "auxiliary_loss_clip": 0.06630076, + "auxiliary_loss_mlp": 0.01293627, + "balance_loss_clip": 0.06306267, + "balance_loss_mlp": 0.01259914, + "epoch": 0.13684052307229821, + "flos": 15674788713600.0, + "grad_norm": 2.8834689051693196, + "language_loss": 0.82202291, + "learning_rate": 3.881448674225356e-06, + "loss": 0.9012599, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.33691406, + "step": 2276, + "time_per_iteration": 2.6382758617401123 + }, + { + "auxiliary_loss_clip": 0.06636757, + "auxiliary_loss_mlp": 0.01296316, + "balance_loss_clip": 0.06304651, + "balance_loss_mlp": 0.01260839, + "epoch": 0.13690064632496618, + "flos": 28371983863680.0, + "grad_norm": 2.682466270477189, + "language_loss": 0.71951526, + "learning_rate": 3.881316544012779e-06, + "loss": 0.79884601, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.35473633, + "step": 2277, + "time_per_iteration": 2.59140944480896 + }, + { + "auxiliary_loss_clip": 0.06638919, + "auxiliary_loss_mlp": 0.01298071, + "balance_loss_clip": 0.06309501, + "balance_loss_mlp": 0.01265312, + "epoch": 0.13696076957763414, + "flos": 23411605207680.0, + "grad_norm": 2.2485386037649144, + "language_loss": 0.82153767, + "learning_rate": 3.88118434246049e-06, + "loss": 0.90090752, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32739258, + "step": 2278, + "time_per_iteration": 2.5540530681610107 + }, + { + "auxiliary_loss_clip": 0.06627095, + "auxiliary_loss_mlp": 0.01287889, + "balance_loss_clip": 0.06304022, + "balance_loss_mlp": 0.01256358, + "epoch": 0.1370208928303021, + "flos": 37205760084480.0, + "grad_norm": 2.776511982198055, + "language_loss": 0.76353186, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84268171, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31506348, + "step": 2279, + "time_per_iteration": 2.659834623336792 + }, + { + "auxiliary_loss_clip": 0.06632279, + "auxiliary_loss_mlp": 0.01290702, + "balance_loss_clip": 0.06309781, + "balance_loss_mlp": 0.01260041, + "epoch": 0.13708101608297008, + "flos": 26983052184960.0, + "grad_norm": 1.8236300001025265, + "language_loss": 0.78161544, + "learning_rate": 3.880919725356831e-06, + "loss": 0.86084521, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30639648, + "step": 2280, + "time_per_iteration": 2.5933265686035156 + }, + { + "auxiliary_loss_clip": 0.06616117, + "auxiliary_loss_mlp": 0.01291386, + "balance_loss_clip": 0.06299774, + "balance_loss_mlp": 0.01259009, + "epoch": 0.13714113933563807, + "flos": 32564243341440.0, + "grad_norm": 2.0971089694494003, + "language_loss": 0.80573678, + "learning_rate": 3.880787309815496e-06, + "loss": 0.88481188, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.32373047, + "step": 2281, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.06637304, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06310696, + "balance_loss_mlp": 0.01260601, + "epoch": 0.13720126258830603, + "flos": 16105807716480.0, + "grad_norm": 1.9438647514298306, + "language_loss": 0.84104228, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92035675, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.33544922, + "step": 2282, + "time_per_iteration": 2.6252219676971436 + }, + { + "auxiliary_loss_clip": 0.06621532, + "auxiliary_loss_mlp": 0.01288566, + "balance_loss_clip": 0.06310192, + "balance_loss_mlp": 0.01258716, + "epoch": 0.137261385840974, + "flos": 18959630400000.0, + "grad_norm": 1.6598116001029841, + "language_loss": 0.74414694, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.82324791, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.29858398, + "step": 2283, + "time_per_iteration": 2.510495185852051 + }, + { + "auxiliary_loss_clip": 0.06626984, + "auxiliary_loss_mlp": 0.01293133, + "balance_loss_clip": 0.06314456, + "balance_loss_mlp": 0.01261686, + "epoch": 0.13732150909364196, + "flos": 23302173375360.0, + "grad_norm": 4.31542841231349, + "language_loss": 0.85737264, + "learning_rate": 3.880389635293729e-06, + "loss": 0.93657386, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.31445312, + "step": 2284, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.06637374, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 0.06309589, + "balance_loss_mlp": 0.01263702, + "epoch": 0.13738163234630993, + "flos": 29358966706560.0, + "grad_norm": 2.3287060101811643, + "language_loss": 0.76374751, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84308201, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.32348633, + "step": 2285, + "time_per_iteration": 2.618502140045166 + }, + { + "auxiliary_loss_clip": 0.06630811, + "auxiliary_loss_mlp": 0.01295468, + "balance_loss_clip": 0.06312186, + "balance_loss_mlp": 0.0126619, + "epoch": 0.1374417555989779, + "flos": 26658572048640.0, + "grad_norm": 1.8592668297074675, + "language_loss": 0.76012349, + "learning_rate": 3.880124162414689e-06, + "loss": 0.83938622, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.29296875, + "step": 2286, + "time_per_iteration": 2.7475874423980713 + }, + { + "auxiliary_loss_clip": 0.06634222, + "auxiliary_loss_mlp": 0.01290764, + "balance_loss_clip": 0.06310531, + "balance_loss_mlp": 0.01258029, + "epoch": 0.1375018788516459, + "flos": 28411074593280.0, + "grad_norm": 5.375995383381602, + "language_loss": 0.87619269, + "learning_rate": 3.879991319030908e-06, + "loss": 0.95544249, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32763672, + "step": 2287, + "time_per_iteration": 2.7319629192352295 + }, + { + "auxiliary_loss_clip": 0.06638976, + "auxiliary_loss_mlp": 0.01305844, + "balance_loss_clip": 0.06320731, + "balance_loss_mlp": 0.01274683, + "epoch": 0.13756200210431385, + "flos": 37422695105280.0, + "grad_norm": 2.4551568049715486, + "language_loss": 0.70291626, + "learning_rate": 3.879858404357666e-06, + "loss": 0.78236449, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3112793, + "step": 2288, + "time_per_iteration": 2.6788651943206787 + }, + { + "auxiliary_loss_clip": 0.06632806, + "auxiliary_loss_mlp": 0.01293292, + "balance_loss_clip": 0.06312902, + "balance_loss_mlp": 0.01262667, + "epoch": 0.13762212535698182, + "flos": 22717642492800.0, + "grad_norm": 3.117032975681255, + "language_loss": 0.88826561, + "learning_rate": 3.879725418400005e-06, + "loss": 0.96752661, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.30651855, + "step": 2289, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.06632558, + "auxiliary_loss_mlp": 0.01293233, + "balance_loss_clip": 0.06320693, + "balance_loss_mlp": 0.01263181, + "epoch": 0.13768224860964978, + "flos": 23959057858560.0, + "grad_norm": 1.9772525840465298, + "language_loss": 0.75630605, + "learning_rate": 3.879592361162969e-06, + "loss": 0.8355639, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30065918, + "step": 2290, + "time_per_iteration": 2.5592398643493652 + }, + { + "auxiliary_loss_clip": 0.06540786, + "auxiliary_loss_mlp": 0.01268874, + "balance_loss_clip": 0.06353199, + "balance_loss_mlp": 0.01257585, + "epoch": 0.13774237186231775, + "flos": 63612568212480.0, + "grad_norm": 0.6705422790130379, + "language_loss": 0.51642907, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.59452564, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.112854, + "step": 2291, + "time_per_iteration": 3.2724592685699463 + }, + { + "auxiliary_loss_clip": 0.06630601, + "auxiliary_loss_mlp": 0.01294866, + "balance_loss_clip": 0.0631279, + "balance_loss_mlp": 0.01263657, + "epoch": 0.1378024951149857, + "flos": 24286263252480.0, + "grad_norm": 2.140362896023876, + "language_loss": 0.72877645, + "learning_rate": 3.879326032870952e-06, + "loss": 0.80803108, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31201172, + "step": 2292, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.0663756, + "auxiliary_loss_mlp": 0.01294271, + "balance_loss_clip": 0.06317808, + "balance_loss_mlp": 0.01261179, + "epoch": 0.13786261836765368, + "flos": 14025722434560.0, + "grad_norm": 2.9525020540096842, + "language_loss": 0.81376028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.89307863, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33056641, + "step": 2293, + "time_per_iteration": 2.520320177078247 + }, + { + "auxiliary_loss_clip": 0.06629369, + "auxiliary_loss_mlp": 0.01294538, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01262065, + "epoch": 0.13792274162032167, + "flos": 28886592913920.0, + "grad_norm": 15.103956304175181, + "language_loss": 0.79534554, + "learning_rate": 3.879059419522011e-06, + "loss": 0.87458467, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.32470703, + "step": 2294, + "time_per_iteration": 2.5958240032196045 + }, + { + "auxiliary_loss_clip": 0.06628333, + "auxiliary_loss_mlp": 0.01293802, + "balance_loss_clip": 0.06314936, + "balance_loss_mlp": 0.01264739, + "epoch": 0.13798286487298964, + "flos": 21147344651520.0, + "grad_norm": 2.1249265647314575, + "language_loss": 0.82119411, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90041548, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29040527, + "step": 2295, + "time_per_iteration": 2.5259695053100586 + }, + { + "auxiliary_loss_clip": 0.06624444, + "auxiliary_loss_mlp": 0.0128892, + "balance_loss_clip": 0.06304439, + "balance_loss_mlp": 0.01258569, + "epoch": 0.1380429881256576, + "flos": 22493286385920.0, + "grad_norm": 1.9411162070190993, + "language_loss": 0.79297817, + "learning_rate": 3.878792521156588e-06, + "loss": 0.8721118, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3034668, + "step": 2296, + "time_per_iteration": 2.5404605865478516 + }, + { + "auxiliary_loss_clip": 0.06623581, + "auxiliary_loss_mlp": 0.01292011, + "balance_loss_clip": 0.06309658, + "balance_loss_mlp": 0.01261755, + "epoch": 0.13810311137832557, + "flos": 21399429260160.0, + "grad_norm": 1.8193304302063846, + "language_loss": 0.79101717, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.87017298, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.30249023, + "step": 2297, + "time_per_iteration": 2.544902801513672 + }, + { + "auxiliary_loss_clip": 0.06617336, + "auxiliary_loss_mlp": 0.01292431, + "balance_loss_clip": 0.06304273, + "balance_loss_mlp": 0.01261162, + "epoch": 0.13816323463099353, + "flos": 25996195123200.0, + "grad_norm": 2.1649336589446113, + "language_loss": 0.70034248, + "learning_rate": 3.878525337815164e-06, + "loss": 0.77944016, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31286621, + "step": 2298, + "time_per_iteration": 2.7027747631073 + }, + { + "auxiliary_loss_clip": 0.06625488, + "auxiliary_loss_mlp": 0.01293838, + "balance_loss_clip": 0.06304887, + "balance_loss_mlp": 0.01263511, + "epoch": 0.1382233578836615, + "flos": 19250260686720.0, + "grad_norm": 1.8032659924791181, + "language_loss": 0.87816125, + "learning_rate": 3.878391639291116e-06, + "loss": 0.95735455, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30310059, + "step": 2299, + "time_per_iteration": 2.5216784477233887 + }, + { + "auxiliary_loss_clip": 0.06619459, + "auxiliary_loss_mlp": 0.01291843, + "balance_loss_clip": 0.06297824, + "balance_loss_mlp": 0.01258965, + "epoch": 0.1382834811363295, + "flos": 25673392068480.0, + "grad_norm": 1.8041271752460513, + "language_loss": 0.77313578, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85224879, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32910156, + "step": 2300, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.06615824, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06301995, + "balance_loss_mlp": 0.01263219, + "epoch": 0.13834360438899745, + "flos": 19788992513280.0, + "grad_norm": 2.607101946436598, + "language_loss": 0.84398985, + "learning_rate": 3.878124028561692e-06, + "loss": 0.92307633, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.29589844, + "step": 2301, + "time_per_iteration": 2.5100109577178955 + }, + { + "auxiliary_loss_clip": 0.06616862, + "auxiliary_loss_mlp": 0.01292457, + "balance_loss_clip": 0.06302989, + "balance_loss_mlp": 0.01262631, + "epoch": 0.13840372764166542, + "flos": 26659200954240.0, + "grad_norm": 1.960897603887865, + "language_loss": 0.87807304, + "learning_rate": 3.877990116366466e-06, + "loss": 0.95716619, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.2980957, + "step": 2302, + "time_per_iteration": 2.5661840438842773 + }, + { + "auxiliary_loss_clip": 0.0648245, + "auxiliary_loss_mlp": 0.01256791, + "balance_loss_clip": 0.06296428, + "balance_loss_mlp": 0.01245943, + "epoch": 0.13846385089433338, + "flos": 70532321944320.0, + "grad_norm": 0.7317106160807376, + "language_loss": 0.65412122, + "learning_rate": 3.877856132957667e-06, + "loss": 0.73151362, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.10864258, + "step": 2303, + "time_per_iteration": 3.325839042663574 + }, + { + "auxiliary_loss_clip": 0.06609396, + "auxiliary_loss_mlp": 0.01287851, + "balance_loss_clip": 0.0630075, + "balance_loss_mlp": 0.01258263, + "epoch": 0.13852397414700135, + "flos": 17354644168320.0, + "grad_norm": 2.0774651772022885, + "language_loss": 0.79740053, + "learning_rate": 3.877722078340374e-06, + "loss": 0.87637299, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.29589844, + "step": 2304, + "time_per_iteration": 2.543011426925659 + }, + { + "auxiliary_loss_clip": 0.06619786, + "auxiliary_loss_mlp": 0.01290997, + "balance_loss_clip": 0.06300867, + "balance_loss_mlp": 0.01261147, + "epoch": 0.13858409739966931, + "flos": 21550257809280.0, + "grad_norm": 3.5409811557707527, + "language_loss": 0.78727001, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86637783, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.2980957, + "step": 2305, + "time_per_iteration": 2.546365261077881 + }, + { + "auxiliary_loss_clip": 0.06604174, + "auxiliary_loss_mlp": 0.01290068, + "balance_loss_clip": 0.06297874, + "balance_loss_mlp": 0.01261624, + "epoch": 0.13864422065233728, + "flos": 21586329792000.0, + "grad_norm": 1.8829847036148735, + "language_loss": 0.89061654, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96955895, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28442383, + "step": 2306, + "time_per_iteration": 2.564483165740967 + }, + { + "auxiliary_loss_clip": 0.06468673, + "auxiliary_loss_mlp": 0.0125835, + "balance_loss_clip": 0.0628318, + "balance_loss_mlp": 0.01247258, + "epoch": 0.13870434390500527, + "flos": 53384927650560.0, + "grad_norm": 0.8396257339497795, + "language_loss": 0.58554721, + "learning_rate": 3.877319487288387e-06, + "loss": 0.66281742, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11108398, + "step": 2307, + "time_per_iteration": 4.632705450057983 + }, + { + "auxiliary_loss_clip": 0.0661881, + "auxiliary_loss_mlp": 0.01288588, + "balance_loss_clip": 0.06295981, + "balance_loss_mlp": 0.01258022, + "epoch": 0.13876446715767324, + "flos": 22572641802240.0, + "grad_norm": 1.7746642333134461, + "language_loss": 0.80762124, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88669527, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30566406, + "step": 2308, + "time_per_iteration": 3.985261917114258 + }, + { + "auxiliary_loss_clip": 0.06612652, + "auxiliary_loss_mlp": 0.0129232, + "balance_loss_clip": 0.06302111, + "balance_loss_mlp": 0.01262208, + "epoch": 0.1388245904103412, + "flos": 20711671747200.0, + "grad_norm": 2.3070434354932425, + "language_loss": 0.7942912, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8733409, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30102539, + "step": 2309, + "time_per_iteration": 2.5814623832702637 + }, + { + "auxiliary_loss_clip": 0.06621584, + "auxiliary_loss_mlp": 0.01295268, + "balance_loss_clip": 0.06297516, + "balance_loss_mlp": 0.0126444, + "epoch": 0.13888471366300917, + "flos": 20560382000640.0, + "grad_norm": 2.2863258472271437, + "language_loss": 0.6975733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.77674186, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30786133, + "step": 2310, + "time_per_iteration": 2.5402469635009766 + }, + { + "auxiliary_loss_clip": 0.06612189, + "auxiliary_loss_mlp": 0.01299127, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01268967, + "epoch": 0.13894483691567713, + "flos": 13842008357760.0, + "grad_norm": 1.8909078278877924, + "language_loss": 0.85131961, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.9304328, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.30126953, + "step": 2311, + "time_per_iteration": 5.377658128738403 + }, + { + "auxiliary_loss_clip": 0.06618226, + "auxiliary_loss_mlp": 0.01294733, + "balance_loss_clip": 0.06296492, + "balance_loss_mlp": 0.01264358, + "epoch": 0.1390049601683451, + "flos": 28037567018880.0, + "grad_norm": 2.5894979273704783, + "language_loss": 0.83215213, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9112817, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30395508, + "step": 2312, + "time_per_iteration": 2.6039178371429443 + }, + { + "auxiliary_loss_clip": 0.06618522, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 0.06296252, + "balance_loss_mlp": 0.01259964, + "epoch": 0.13906508342101306, + "flos": 26763475760640.0, + "grad_norm": 1.7282329609081795, + "language_loss": 0.87823701, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95733058, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30883789, + "step": 2313, + "time_per_iteration": 2.6105740070343018 + }, + { + "auxiliary_loss_clip": 0.06614069, + "auxiliary_loss_mlp": 0.01289702, + "balance_loss_clip": 0.06295129, + "balance_loss_mlp": 0.01259185, + "epoch": 0.13912520667368106, + "flos": 24541995513600.0, + "grad_norm": 1.8286826676096326, + "language_loss": 0.81090409, + "learning_rate": 3.876377616820024e-06, + "loss": 0.88994175, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.30541992, + "step": 2314, + "time_per_iteration": 2.581137180328369 + }, + { + "auxiliary_loss_clip": 0.06609131, + "auxiliary_loss_mlp": 0.0129379, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01263678, + "epoch": 0.13918532992634902, + "flos": 19388007999360.0, + "grad_norm": 4.757536248820732, + "language_loss": 0.86588097, + "learning_rate": 3.876242779245409e-06, + "loss": 0.94491017, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30126953, + "step": 2315, + "time_per_iteration": 2.5262932777404785 + }, + { + "auxiliary_loss_clip": 0.06611065, + "auxiliary_loss_mlp": 0.01285772, + "balance_loss_clip": 0.06296186, + "balance_loss_mlp": 0.01255159, + "epoch": 0.139245453179017, + "flos": 21330010552320.0, + "grad_norm": 2.405797075318415, + "language_loss": 0.78922898, + "learning_rate": 3.876107870523477e-06, + "loss": 0.86819738, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30615234, + "step": 2316, + "time_per_iteration": 2.529972553253174 + }, + { + "auxiliary_loss_clip": 0.06613404, + "auxiliary_loss_mlp": 0.01292141, + "balance_loss_clip": 0.06296711, + "balance_loss_mlp": 0.01260026, + "epoch": 0.13930557643168495, + "flos": 19506747634560.0, + "grad_norm": 1.7528689753979556, + "language_loss": 0.77613419, + "learning_rate": 3.875972890659349e-06, + "loss": 0.85518968, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.32116699, + "step": 2317, + "time_per_iteration": 2.5425355434417725 + }, + { + "auxiliary_loss_clip": 0.06624125, + "auxiliary_loss_mlp": 0.01286591, + "balance_loss_clip": 0.0630217, + "balance_loss_mlp": 0.01257027, + "epoch": 0.13936569968435292, + "flos": 25417869442560.0, + "grad_norm": 1.999588880264202, + "language_loss": 0.81447107, + "learning_rate": 3.875837839658139e-06, + "loss": 0.89357817, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.2956543, + "step": 2318, + "time_per_iteration": 2.577786922454834 + }, + { + "auxiliary_loss_clip": 0.06479447, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01257373, + "epoch": 0.13942582293702088, + "flos": 70793211231360.0, + "grad_norm": 0.8224169172372592, + "language_loss": 0.59232461, + "learning_rate": 3.87570271752497e-06, + "loss": 0.66980362, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.11090088, + "step": 2319, + "time_per_iteration": 3.204317092895508 + }, + { + "auxiliary_loss_clip": 0.06613657, + "auxiliary_loss_mlp": 0.01294413, + "balance_loss_clip": 0.06293797, + "balance_loss_mlp": 0.01263514, + "epoch": 0.13948594618968888, + "flos": 35599725676800.0, + "grad_norm": 2.1444622790100762, + "language_loss": 0.66576529, + "learning_rate": 3.875567524264967e-06, + "loss": 0.74484605, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.30957031, + "step": 2320, + "time_per_iteration": 2.677716016769409 + }, + { + "auxiliary_loss_clip": 0.06604615, + "auxiliary_loss_mlp": 0.01292225, + "balance_loss_clip": 0.062957, + "balance_loss_mlp": 0.01263245, + "epoch": 0.13954606944235684, + "flos": 21111482304000.0, + "grad_norm": 1.7128433163135388, + "language_loss": 0.7132194, + "learning_rate": 3.875432259883256e-06, + "loss": 0.79218775, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.28967285, + "step": 2321, + "time_per_iteration": 2.5557823181152344 + }, + { + "auxiliary_loss_clip": 0.06610114, + "auxiliary_loss_mlp": 0.01289737, + "balance_loss_clip": 0.06294077, + "balance_loss_mlp": 0.01258158, + "epoch": 0.1396061926950248, + "flos": 25051154048640.0, + "grad_norm": 2.1088337541486215, + "language_loss": 0.87096989, + "learning_rate": 3.875296924384965e-06, + "loss": 0.9499684, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.3157959, + "step": 2322, + "time_per_iteration": 2.563751459121704 + }, + { + "auxiliary_loss_clip": 0.06602737, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06298044, + "balance_loss_mlp": 0.01258718, + "epoch": 0.13966631594769277, + "flos": 37643193924480.0, + "grad_norm": 1.6181543517844332, + "language_loss": 0.68045509, + "learning_rate": 3.875161517775226e-06, + "loss": 0.75935674, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28710938, + "step": 2323, + "time_per_iteration": 2.8503611087799072 + }, + { + "auxiliary_loss_clip": 0.06623898, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06301014, + "balance_loss_mlp": 0.01257393, + "epoch": 0.13972643920036074, + "flos": 16696627655040.0, + "grad_norm": 2.142170673512178, + "language_loss": 0.90579832, + "learning_rate": 3.875026040059175e-06, + "loss": 0.98490262, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.29150391, + "step": 2324, + "time_per_iteration": 2.5540571212768555 + }, + { + "auxiliary_loss_clip": 0.06618317, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01256659, + "epoch": 0.1397865624530287, + "flos": 23337742233600.0, + "grad_norm": 4.139742528061125, + "language_loss": 0.72620469, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.80525297, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29821777, + "step": 2325, + "time_per_iteration": 2.5619618892669678 + }, + { + "auxiliary_loss_clip": 0.0662512, + "auxiliary_loss_mlp": 0.01293129, + "balance_loss_clip": 0.06308709, + "balance_loss_mlp": 0.01264591, + "epoch": 0.13984668570569667, + "flos": 22784000526720.0, + "grad_norm": 2.1958407614138, + "language_loss": 0.83206451, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91124701, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28503418, + "step": 2326, + "time_per_iteration": 2.544154167175293 + }, + { + "auxiliary_loss_clip": 0.06607386, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06303836, + "balance_loss_mlp": 0.01256764, + "epoch": 0.13990680895836466, + "flos": 19470759505920.0, + "grad_norm": 1.8381162719470834, + "language_loss": 0.90198052, + "learning_rate": 3.874619180324534e-06, + "loss": 0.98091662, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.2947998, + "step": 2327, + "time_per_iteration": 2.544022798538208 + }, + { + "auxiliary_loss_clip": 0.06612301, + "auxiliary_loss_mlp": 0.01294926, + "balance_loss_clip": 0.06299497, + "balance_loss_mlp": 0.01263479, + "epoch": 0.13996693221103262, + "flos": 20309555203200.0, + "grad_norm": 2.1153988454525927, + "language_loss": 0.86492193, + "learning_rate": 3.874483418234632e-06, + "loss": 0.9439941, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31433105, + "step": 2328, + "time_per_iteration": 2.498436212539673 + }, + { + "auxiliary_loss_clip": 0.06619829, + "auxiliary_loss_mlp": 0.01290779, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01261239, + "epoch": 0.1400270554637006, + "flos": 26625434958720.0, + "grad_norm": 2.232478376897894, + "language_loss": 0.74862719, + "learning_rate": 3.874347585064131e-06, + "loss": 0.82773322, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29541016, + "step": 2329, + "time_per_iteration": 2.625213146209717 + }, + { + "auxiliary_loss_clip": 0.06613478, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01261912, + "epoch": 0.14008717871636855, + "flos": 19397651218560.0, + "grad_norm": 2.9962397362189797, + "language_loss": 0.79502976, + "learning_rate": 3.874211680818183e-06, + "loss": 0.87407863, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29516602, + "step": 2330, + "time_per_iteration": 2.526705265045166 + }, + { + "auxiliary_loss_clip": 0.06610473, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01265187, + "epoch": 0.14014730196903652, + "flos": 15309624620160.0, + "grad_norm": 3.126642482841082, + "language_loss": 0.73399383, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81302822, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27783203, + "step": 2331, + "time_per_iteration": 2.5545501708984375 + }, + { + "auxiliary_loss_clip": 0.06595145, + "auxiliary_loss_mlp": 0.01295524, + "balance_loss_clip": 0.06296061, + "balance_loss_mlp": 0.01267176, + "epoch": 0.14020742522170448, + "flos": 14945047505280.0, + "grad_norm": 1.5446780905805184, + "language_loss": 0.73888373, + "learning_rate": 3.873939659120557e-06, + "loss": 0.81779039, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28344727, + "step": 2332, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.06469279, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.0628898, + "balance_loss_mlp": 0.01254947, + "epoch": 0.14026754847437245, + "flos": 48839956410240.0, + "grad_norm": 0.7856293848414069, + "language_loss": 0.55978549, + "learning_rate": 3.873803541679196e-06, + "loss": 0.63713545, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.10784912, + "step": 2333, + "time_per_iteration": 3.0545504093170166 + }, + { + "auxiliary_loss_clip": 0.06614032, + "auxiliary_loss_mlp": 0.01304219, + "balance_loss_clip": 0.06302805, + "balance_loss_mlp": 0.01274512, + "epoch": 0.14032767172704044, + "flos": 25779972862080.0, + "grad_norm": 1.7607916686559548, + "language_loss": 0.83699584, + "learning_rate": 3.873667353183016e-06, + "loss": 0.91617835, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29699707, + "step": 2334, + "time_per_iteration": 2.6067097187042236 + }, + { + "auxiliary_loss_clip": 0.06611067, + "auxiliary_loss_mlp": 0.01296359, + "balance_loss_clip": 0.06295306, + "balance_loss_mlp": 0.01268023, + "epoch": 0.1403877949797084, + "flos": 21222884707200.0, + "grad_norm": 3.2536049566200846, + "language_loss": 0.81910211, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.89817637, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.28356934, + "step": 2335, + "time_per_iteration": 2.5793120861053467 + }, + { + "auxiliary_loss_clip": 0.06618994, + "auxiliary_loss_mlp": 0.0129466, + "balance_loss_clip": 0.06299357, + "balance_loss_mlp": 0.01262044, + "epoch": 0.14044791823237637, + "flos": 22754678797440.0, + "grad_norm": 1.8425920337650705, + "language_loss": 0.83025301, + "learning_rate": 3.873394763046862e-06, + "loss": 0.9093895, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 2336, + "time_per_iteration": 2.5754895210266113 + }, + { + "auxiliary_loss_clip": 0.0660933, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01261516, + "epoch": 0.14050804148504434, + "flos": 22970775277440.0, + "grad_norm": 1.9428001111866895, + "language_loss": 0.81449389, + "learning_rate": 3.873258361417225e-06, + "loss": 0.89350611, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.06620462, + "auxiliary_loss_mlp": 0.01292117, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01262493, + "epoch": 0.1405681647377123, + "flos": 22206890730240.0, + "grad_norm": 2.099495755823345, + "language_loss": 0.80428421, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88341004, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.29626465, + "step": 2338, + "time_per_iteration": 2.5587832927703857 + }, + { + "auxiliary_loss_clip": 0.06618391, + "auxiliary_loss_mlp": 0.01291133, + "balance_loss_clip": 0.06299177, + "balance_loss_mlp": 0.01259447, + "epoch": 0.14062828799038027, + "flos": 23739607215360.0, + "grad_norm": 2.563407914599119, + "language_loss": 0.81585765, + "learning_rate": 3.87298534506069e-06, + "loss": 0.89495289, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31689453, + "step": 2339, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.06608106, + "auxiliary_loss_mlp": 0.01284227, + "balance_loss_clip": 0.06301871, + "balance_loss_mlp": 0.01254735, + "epoch": 0.14068841124304826, + "flos": 39211856611200.0, + "grad_norm": 1.7427009821835167, + "language_loss": 0.66622555, + "learning_rate": 3.872848730344146e-06, + "loss": 0.7451489, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29492188, + "step": 2340, + "time_per_iteration": 2.7599191665649414 + }, + { + "auxiliary_loss_clip": 0.06615461, + "auxiliary_loss_mlp": 0.01296967, + "balance_loss_clip": 0.06309174, + "balance_loss_mlp": 0.01267952, + "epoch": 0.14074853449571623, + "flos": 20198278581120.0, + "grad_norm": 2.455789479029152, + "language_loss": 0.80003643, + "learning_rate": 3.87271204460899e-06, + "loss": 0.87916064, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2902832, + "step": 2341, + "time_per_iteration": 2.5097782611846924 + }, + { + "auxiliary_loss_clip": 0.06617275, + "auxiliary_loss_mlp": 0.01290109, + "balance_loss_clip": 0.06306843, + "balance_loss_mlp": 0.01261118, + "epoch": 0.1408086577483842, + "flos": 18411800405760.0, + "grad_norm": 1.7920815266740484, + "language_loss": 0.81707942, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89615333, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.29003906, + "step": 2342, + "time_per_iteration": 2.5234599113464355 + }, + { + "auxiliary_loss_clip": 0.06617711, + "auxiliary_loss_mlp": 0.01285014, + "balance_loss_clip": 0.06315217, + "balance_loss_mlp": 0.01257858, + "epoch": 0.14086878100105216, + "flos": 25271569013760.0, + "grad_norm": 1.8907393143090194, + "language_loss": 0.79096431, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8699916, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2343, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01280273, + "balance_loss_clip": 0.06304723, + "balance_loss_mlp": 0.01268566, + "epoch": 0.14092890425372012, + "flos": 65997553703040.0, + "grad_norm": 0.8105470614930316, + "language_loss": 0.61667693, + "learning_rate": 3.872301561343699e-06, + "loss": 0.69431293, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.11694336, + "step": 2344, + "time_per_iteration": 3.107311964035034 + }, + { + "auxiliary_loss_clip": 0.06612515, + "auxiliary_loss_mlp": 0.01296816, + "balance_loss_clip": 0.06307824, + "balance_loss_mlp": 0.01267514, + "epoch": 0.1409890275063881, + "flos": 23701564661760.0, + "grad_norm": 1.4479662088391603, + "language_loss": 0.66076458, + "learning_rate": 3.872164591585956e-06, + "loss": 0.73985791, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.29321289, + "step": 2345, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.06630909, + "auxiliary_loss_mlp": 0.0129167, + "balance_loss_clip": 0.06307563, + "balance_loss_mlp": 0.01260676, + "epoch": 0.14104915075905605, + "flos": 23629923820800.0, + "grad_norm": 2.297389176264822, + "language_loss": 0.7525146, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.83174026, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31005859, + "step": 2346, + "time_per_iteration": 3.9794979095458984 + }, + { + "auxiliary_loss_clip": 0.06626198, + "auxiliary_loss_mlp": 0.01293091, + "balance_loss_clip": 0.06312405, + "balance_loss_mlp": 0.0126162, + "epoch": 0.14110927401172405, + "flos": 20601485228160.0, + "grad_norm": 2.0524474508447876, + "language_loss": 0.7827574, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.86195028, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.31445312, + "step": 2347, + "time_per_iteration": 3.98130202293396 + }, + { + "auxiliary_loss_clip": 0.06624688, + "auxiliary_loss_mlp": 0.01292693, + "balance_loss_clip": 0.06315368, + "balance_loss_mlp": 0.01263725, + "epoch": 0.141169397264392, + "flos": 28555530232320.0, + "grad_norm": 2.266106813963602, + "language_loss": 0.77906024, + "learning_rate": 3.8717532563775e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28955078, + "step": 2348, + "time_per_iteration": 2.594891309738159 + }, + { + "auxiliary_loss_clip": 0.06614843, + "auxiliary_loss_mlp": 0.01295406, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01267558, + "epoch": 0.14122952051705998, + "flos": 17097947585280.0, + "grad_norm": 2.2615839491571097, + "language_loss": 0.88040984, + "learning_rate": 3.871616002680272e-06, + "loss": 0.95951235, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27856445, + "step": 2349, + "time_per_iteration": 2.547189712524414 + }, + { + "auxiliary_loss_clip": 0.06613597, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 0.06307055, + "balance_loss_mlp": 0.01260754, + "epoch": 0.14128964376972794, + "flos": 28953915269760.0, + "grad_norm": 1.755772853620136, + "language_loss": 0.89833802, + "learning_rate": 3.871478678011177e-06, + "loss": 0.97738338, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.30200195, + "step": 2350, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.06614771, + "auxiliary_loss_mlp": 0.01295884, + "balance_loss_clip": 0.06303953, + "balance_loss_mlp": 0.0126626, + "epoch": 0.1413497670223959, + "flos": 18995828163840.0, + "grad_norm": 2.169076392434691, + "language_loss": 0.81670076, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89580733, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29638672, + "step": 2351, + "time_per_iteration": 4.039130687713623 + }, + { + "auxiliary_loss_clip": 0.06617273, + "auxiliary_loss_mlp": 0.012885, + "balance_loss_clip": 0.06303668, + "balance_loss_mlp": 0.01259246, + "epoch": 0.14140989027506387, + "flos": 29870053885440.0, + "grad_norm": 2.711725731055931, + "language_loss": 0.85320342, + "learning_rate": 3.871203815778219e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29223633, + "step": 2352, + "time_per_iteration": 2.6179373264312744 + }, + { + "auxiliary_loss_clip": 0.06476805, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01267614, + "epoch": 0.14147001352773186, + "flos": 62098901331840.0, + "grad_norm": 0.8822482530682503, + "language_loss": 0.61915213, + "learning_rate": 3.87106627822478e-06, + "loss": 0.69671774, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.12139893, + "step": 2353, + "time_per_iteration": 3.087498188018799 + }, + { + "auxiliary_loss_clip": 0.06606863, + "auxiliary_loss_mlp": 0.01289785, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.01259458, + "epoch": 0.14153013678039983, + "flos": 22023973267200.0, + "grad_norm": 1.6072508509392793, + "language_loss": 0.88457793, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.96354443, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30297852, + "step": 2354, + "time_per_iteration": 2.5465357303619385 + }, + { + "auxiliary_loss_clip": 0.06612588, + "auxiliary_loss_mlp": 0.01286583, + "balance_loss_clip": 0.0630111, + "balance_loss_mlp": 0.01255231, + "epoch": 0.1415902600330678, + "flos": 19726365985920.0, + "grad_norm": 1.842515646240357, + "language_loss": 0.75627196, + "learning_rate": 3.870790990270057e-06, + "loss": 0.83526361, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31347656, + "step": 2355, + "time_per_iteration": 2.5172102451324463 + }, + { + "auxiliary_loss_clip": 0.0647012, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06293327, + "balance_loss_mlp": 0.01258312, + "epoch": 0.14165038328573576, + "flos": 65919330316800.0, + "grad_norm": 0.6582247032564781, + "language_loss": 0.51791292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.59531033, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11303711, + "step": 2356, + "time_per_iteration": 3.150625228881836 + }, + { + "auxiliary_loss_clip": 0.06615196, + "auxiliary_loss_mlp": 0.01292015, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01263262, + "epoch": 0.14171050653840372, + "flos": 12135011379840.0, + "grad_norm": 2.2420127528599973, + "language_loss": 0.71637189, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.79544401, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.28759766, + "step": 2357, + "time_per_iteration": 2.552600383758545 + }, + { + "auxiliary_loss_clip": 0.06616427, + "auxiliary_loss_mlp": 0.01288449, + "balance_loss_clip": 0.06301764, + "balance_loss_mlp": 0.01259624, + "epoch": 0.1417706297910717, + "flos": 20418735473280.0, + "grad_norm": 1.865810969860464, + "language_loss": 0.83125997, + "learning_rate": 3.870377526296674e-06, + "loss": 0.91030866, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.28833008, + "step": 2358, + "time_per_iteration": 2.5359318256378174 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01304325, + "balance_loss_clip": 0.06307626, + "balance_loss_mlp": 0.01270685, + "epoch": 0.14183075304373965, + "flos": 22386831373440.0, + "grad_norm": 2.098054947183796, + "language_loss": 0.72660583, + "learning_rate": 3.870239563115436e-06, + "loss": 0.8059237, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33642578, + "step": 2359, + "time_per_iteration": 2.5888121128082275 + }, + { + "auxiliary_loss_clip": 0.06615248, + "auxiliary_loss_mlp": 0.01292517, + "balance_loss_clip": 0.06299685, + "balance_loss_mlp": 0.0126126, + "epoch": 0.14189087629640765, + "flos": 21587503749120.0, + "grad_norm": 2.25647767982073, + "language_loss": 0.77278101, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85185868, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31225586, + "step": 2360, + "time_per_iteration": 2.579084634780884 + }, + { + "auxiliary_loss_clip": 0.06601179, + "auxiliary_loss_mlp": 0.01289048, + "balance_loss_clip": 0.06295604, + "balance_loss_mlp": 0.01258936, + "epoch": 0.1419509995490756, + "flos": 20014312942080.0, + "grad_norm": 2.059957260866831, + "language_loss": 0.83125579, + "learning_rate": 3.869963423999178e-06, + "loss": 0.91015804, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30102539, + "step": 2361, + "time_per_iteration": 2.5846474170684814 + }, + { + "auxiliary_loss_clip": 0.06605215, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01261745, + "epoch": 0.14201112280174358, + "flos": 31949552188800.0, + "grad_norm": 1.940007653055607, + "language_loss": 0.75587547, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83484137, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29663086, + "step": 2362, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.06603248, + "auxiliary_loss_mlp": 0.01291653, + "balance_loss_clip": 0.06299227, + "balance_loss_mlp": 0.01262948, + "epoch": 0.14207124605441154, + "flos": 41473811180160.0, + "grad_norm": 1.7220107932789903, + "language_loss": 0.74775076, + "learning_rate": 3.869687001246122e-06, + "loss": 0.82669979, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28686523, + "step": 2363, + "time_per_iteration": 2.7700705528259277 + }, + { + "auxiliary_loss_clip": 0.0660228, + "auxiliary_loss_mlp": 0.01297174, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01268051, + "epoch": 0.1421313693070795, + "flos": 31913186716800.0, + "grad_norm": 1.995738601500514, + "language_loss": 0.74229443, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.82128894, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.2911377, + "step": 2364, + "time_per_iteration": 2.636725664138794 + }, + { + "auxiliary_loss_clip": 0.06596863, + "auxiliary_loss_mlp": 0.01292827, + "balance_loss_clip": 0.06297632, + "balance_loss_mlp": 0.01264741, + "epoch": 0.14219149255974747, + "flos": 26878609670400.0, + "grad_norm": 3.4348232103303853, + "language_loss": 0.91282582, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9917227, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28100586, + "step": 2365, + "time_per_iteration": 2.6131789684295654 + }, + { + "auxiliary_loss_clip": 0.06604894, + "auxiliary_loss_mlp": 0.01286963, + "balance_loss_clip": 0.06295748, + "balance_loss_mlp": 0.01257613, + "epoch": 0.14225161581241544, + "flos": 27461882741760.0, + "grad_norm": 1.7987446671320764, + "language_loss": 0.67002726, + "learning_rate": 3.869271835389268e-06, + "loss": 0.74894583, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.29345703, + "step": 2366, + "time_per_iteration": 2.5887913703918457 + }, + { + "auxiliary_loss_clip": 0.06604536, + "auxiliary_loss_mlp": 0.01294035, + "balance_loss_clip": 0.06302322, + "balance_loss_mlp": 0.01266069, + "epoch": 0.14231173906508343, + "flos": 10566055203840.0, + "grad_norm": 1.9092553080536903, + "language_loss": 0.81985664, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89884233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27978516, + "step": 2367, + "time_per_iteration": 2.5478296279907227 + }, + { + "auxiliary_loss_clip": 0.06620896, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06312472, + "balance_loss_mlp": 0.01257244, + "epoch": 0.1423718623177514, + "flos": 28367539597440.0, + "grad_norm": 1.7968709236925184, + "language_loss": 0.83861458, + "learning_rate": 3.868994703727742e-06, + "loss": 0.91769814, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30224609, + "step": 2368, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.06607647, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 0.06299834, + "balance_loss_mlp": 0.01262558, + "epoch": 0.14243198557041936, + "flos": 19360279497600.0, + "grad_norm": 2.15297979683556, + "language_loss": 0.8844623, + "learning_rate": 3.868856031585652e-06, + "loss": 0.96345925, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.29516602, + "step": 2369, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.06609218, + "auxiliary_loss_mlp": 0.01286988, + "balance_loss_clip": 0.06298466, + "balance_loss_mlp": 0.01257067, + "epoch": 0.14249210882308733, + "flos": 28814952072960.0, + "grad_norm": 1.4943626605358518, + "language_loss": 0.76837498, + "learning_rate": 3.868717288576354e-06, + "loss": 0.84733701, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29931641, + "step": 2370, + "time_per_iteration": 2.6086556911468506 + }, + { + "auxiliary_loss_clip": 0.06600792, + "auxiliary_loss_mlp": 0.01298284, + "balance_loss_clip": 0.06298122, + "balance_loss_mlp": 0.01270198, + "epoch": 0.1425522320757553, + "flos": 21841433147520.0, + "grad_norm": 1.5553091357309907, + "language_loss": 0.83888042, + "learning_rate": 3.868578474705109e-06, + "loss": 0.91787124, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.28076172, + "step": 2371, + "time_per_iteration": 2.5464093685150146 + }, + { + "auxiliary_loss_clip": 0.06608661, + "auxiliary_loss_mlp": 0.01298037, + "balance_loss_clip": 0.06299958, + "balance_loss_mlp": 0.01267448, + "epoch": 0.14261235532842326, + "flos": 17317230520320.0, + "grad_norm": 1.80299500179396, + "language_loss": 0.84039259, + "learning_rate": 3.868439589977181e-06, + "loss": 0.91945958, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.30615234, + "step": 2372, + "time_per_iteration": 2.6340725421905518 + }, + { + "auxiliary_loss_clip": 0.0660327, + "auxiliary_loss_mlp": 0.01297499, + "balance_loss_clip": 0.06296232, + "balance_loss_mlp": 0.01267149, + "epoch": 0.14267247858109125, + "flos": 18812659138560.0, + "grad_norm": 1.948811934487197, + "language_loss": 0.8570497, + "learning_rate": 3.868300634397836e-06, + "loss": 0.93605745, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.30322266, + "step": 2373, + "time_per_iteration": 2.580719232559204 + }, + { + "auxiliary_loss_clip": 0.06601362, + "auxiliary_loss_mlp": 0.01295253, + "balance_loss_clip": 0.06296989, + "balance_loss_mlp": 0.01266547, + "epoch": 0.14273260183375922, + "flos": 11362783351680.0, + "grad_norm": 1.9518464435556906, + "language_loss": 0.87130672, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.95027292, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28710938, + "step": 2374, + "time_per_iteration": 2.499939441680908 + }, + { + "auxiliary_loss_clip": 0.0660402, + "auxiliary_loss_mlp": 0.01294805, + "balance_loss_clip": 0.06292336, + "balance_loss_mlp": 0.01264526, + "epoch": 0.14279272508642718, + "flos": 27575800767360.0, + "grad_norm": 1.5586534981326832, + "language_loss": 0.79946959, + "learning_rate": 3.868022510705977e-06, + "loss": 0.87845778, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30273438, + "step": 2375, + "time_per_iteration": 2.610959768295288 + }, + { + "auxiliary_loss_clip": 0.06608847, + "auxiliary_loss_mlp": 0.01308792, + "balance_loss_clip": 0.06302035, + "balance_loss_mlp": 0.01278454, + "epoch": 0.14285284833909515, + "flos": 16258019857920.0, + "grad_norm": 4.976375068021591, + "language_loss": 0.77988309, + "learning_rate": 3.867883342604009e-06, + "loss": 0.85905945, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30310059, + "step": 2376, + "time_per_iteration": 2.5109288692474365 + }, + { + "auxiliary_loss_clip": 0.06606634, + "auxiliary_loss_mlp": 0.01292138, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01263742, + "epoch": 0.1429129715917631, + "flos": 19761725208960.0, + "grad_norm": 1.9346292161061796, + "language_loss": 0.94255036, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02153814, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.28393555, + "step": 2377, + "time_per_iteration": 2.5885112285614014 + }, + { + "auxiliary_loss_clip": 0.06608409, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01267524, + "epoch": 0.14297309484443108, + "flos": 21142606896000.0, + "grad_norm": 1.9262255620531108, + "language_loss": 0.92638403, + "learning_rate": 3.867604793914382e-06, + "loss": 1.00544381, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30029297, + "step": 2378, + "time_per_iteration": 2.5396018028259277 + }, + { + "auxiliary_loss_clip": 0.06602019, + "auxiliary_loss_mlp": 0.01288289, + "balance_loss_clip": 0.06294227, + "balance_loss_mlp": 0.01259667, + "epoch": 0.14303321809709904, + "flos": 23593432567680.0, + "grad_norm": 1.925396398414909, + "language_loss": 0.7506215, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82952458, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.28649902, + "step": 2379, + "time_per_iteration": 2.5452654361724854 + }, + { + "auxiliary_loss_clip": 0.06604548, + "auxiliary_loss_mlp": 0.01289072, + "balance_loss_clip": 0.06300471, + "balance_loss_mlp": 0.01259342, + "epoch": 0.14309334134976703, + "flos": 15893778159360.0, + "grad_norm": 2.089306422098332, + "language_loss": 0.80051982, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87945604, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.29736328, + "step": 2380, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.06614038, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 0.06305015, + "balance_loss_mlp": 0.01263348, + "epoch": 0.143153464602435, + "flos": 16331086218240.0, + "grad_norm": 2.094305551914021, + "language_loss": 0.88833153, + "learning_rate": 3.867186439744955e-06, + "loss": 0.96741092, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.30578613, + "step": 2381, + "time_per_iteration": 2.5728068351745605 + }, + { + "auxiliary_loss_clip": 0.06602444, + "auxiliary_loss_mlp": 0.0128486, + "balance_loss_clip": 0.06299065, + "balance_loss_mlp": 0.01256226, + "epoch": 0.14321358785510296, + "flos": 17097737950080.0, + "grad_norm": 2.316632685614806, + "language_loss": 0.77740443, + "learning_rate": 3.867046846740299e-06, + "loss": 0.85627747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28625488, + "step": 2382, + "time_per_iteration": 2.5297727584838867 + }, + { + "auxiliary_loss_clip": 0.06601999, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.01257904, + "epoch": 0.14327371110777093, + "flos": 26330108843520.0, + "grad_norm": 2.004241684907444, + "language_loss": 0.78048921, + "learning_rate": 3.866907182937039e-06, + "loss": 0.85937786, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28955078, + "step": 2383, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.06614614, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01261513, + "epoch": 0.1433338343604389, + "flos": 18082163243520.0, + "grad_norm": 3.628436675924041, + "language_loss": 0.88476908, + "learning_rate": 3.866767448340471e-06, + "loss": 0.96383882, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.30834961, + "step": 2384, + "time_per_iteration": 2.5066895484924316 + }, + { + "auxiliary_loss_clip": 0.06611983, + "auxiliary_loss_mlp": 0.01297446, + "balance_loss_clip": 0.06300933, + "balance_loss_mlp": 0.0126719, + "epoch": 0.14339395761310686, + "flos": 15528110941440.0, + "grad_norm": 5.651210237348795, + "language_loss": 0.81964046, + "learning_rate": 3.866627642955895e-06, + "loss": 0.89873475, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.30273438, + "step": 2385, + "time_per_iteration": 3.9016833305358887 + }, + { + "auxiliary_loss_clip": 0.06612079, + "auxiliary_loss_mlp": 0.01294874, + "balance_loss_clip": 0.06302845, + "balance_loss_mlp": 0.01266406, + "epoch": 0.14345408086577485, + "flos": 28556368773120.0, + "grad_norm": 2.028141972046204, + "language_loss": 0.76766604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8467356, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.28479004, + "step": 2386, + "time_per_iteration": 4.032405376434326 + }, + { + "auxiliary_loss_clip": 0.06616995, + "auxiliary_loss_mlp": 0.01287556, + "balance_loss_clip": 0.06312285, + "balance_loss_mlp": 0.01258958, + "epoch": 0.14351420411844282, + "flos": 20236279207680.0, + "grad_norm": 2.123480501578919, + "language_loss": 0.79237044, + "learning_rate": 3.866347819843925e-06, + "loss": 0.87141591, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28601074, + "step": 2387, + "time_per_iteration": 2.5608971118927 + }, + { + "auxiliary_loss_clip": 0.06612308, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 0.06306893, + "balance_loss_mlp": 0.01263023, + "epoch": 0.14357432737111078, + "flos": 19871157041280.0, + "grad_norm": 2.5788985385847396, + "language_loss": 0.83602524, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91508037, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30200195, + "step": 2388, + "time_per_iteration": 2.5413224697113037 + }, + { + "auxiliary_loss_clip": 0.06619543, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 0.06312172, + "balance_loss_mlp": 0.0126006, + "epoch": 0.14363445062377875, + "flos": 28264354894080.0, + "grad_norm": 2.5598639084548176, + "language_loss": 0.83343434, + "learning_rate": 3.866067713643573e-06, + "loss": 0.91250646, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.27648926, + "step": 2389, + "time_per_iteration": 2.6027376651763916 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01286457, + "balance_loss_clip": 0.06301727, + "balance_loss_mlp": 0.01257013, + "epoch": 0.1436945738764467, + "flos": 18192517470720.0, + "grad_norm": 2.036228542153499, + "language_loss": 0.84029567, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.91928208, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.29467773, + "step": 2390, + "time_per_iteration": 5.428901672363281 + }, + { + "auxiliary_loss_clip": 0.06612678, + "auxiliary_loss_mlp": 0.01293631, + "balance_loss_clip": 0.06306715, + "balance_loss_mlp": 0.01264282, + "epoch": 0.14375469712911468, + "flos": 27315246896640.0, + "grad_norm": 2.34202135113637, + "language_loss": 0.75496042, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83402348, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29345703, + "step": 2391, + "time_per_iteration": 2.599823236465454 + }, + { + "auxiliary_loss_clip": 0.06462222, + "auxiliary_loss_mlp": 0.01318708, + "balance_loss_clip": 0.06290679, + "balance_loss_mlp": 0.01307848, + "epoch": 0.14381482038178264, + "flos": 56908757980800.0, + "grad_norm": 0.847659725006037, + "language_loss": 0.61820173, + "learning_rate": 3.865647023645277e-06, + "loss": 0.69601095, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10876465, + "step": 2392, + "time_per_iteration": 3.007570266723633 + }, + { + "auxiliary_loss_clip": 0.06623066, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 0.06308551, + "balance_loss_mlp": 0.01267105, + "epoch": 0.14387494363445064, + "flos": 14287282554240.0, + "grad_norm": 6.716541515366395, + "language_loss": 0.77778554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.85699052, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30322266, + "step": 2393, + "time_per_iteration": 2.5064942836761475 + }, + { + "auxiliary_loss_clip": 0.06614703, + "auxiliary_loss_mlp": 0.01296275, + "balance_loss_clip": 0.06308223, + "balance_loss_mlp": 0.01266687, + "epoch": 0.1439350668871186, + "flos": 26768884348800.0, + "grad_norm": 2.0037821703408287, + "language_loss": 0.78038269, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85949242, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2956543, + "step": 2394, + "time_per_iteration": 2.6112003326416016 + }, + { + "auxiliary_loss_clip": 0.06611894, + "auxiliary_loss_mlp": 0.01285238, + "balance_loss_clip": 0.06308618, + "balance_loss_mlp": 0.01256866, + "epoch": 0.14399519013978657, + "flos": 40709926632960.0, + "grad_norm": 2.2776605014778, + "language_loss": 0.87247694, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.95144826, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28381348, + "step": 2395, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.06606728, + "auxiliary_loss_mlp": 0.0129272, + "balance_loss_clip": 0.06306736, + "balance_loss_mlp": 0.01262846, + "epoch": 0.14405531339245453, + "flos": 20563652309760.0, + "grad_norm": 1.5258430726739798, + "language_loss": 0.83690441, + "learning_rate": 3.865085113235113e-06, + "loss": 0.91589892, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.29882812, + "step": 2396, + "time_per_iteration": 2.554426431655884 + }, + { + "auxiliary_loss_clip": 0.06608565, + "auxiliary_loss_mlp": 0.01286347, + "balance_loss_clip": 0.06309813, + "balance_loss_mlp": 0.0125664, + "epoch": 0.1441154366451225, + "flos": 19578975454080.0, + "grad_norm": 3.4820488024482787, + "language_loss": 0.83915055, + "learning_rate": 3.864944458808712e-06, + "loss": 0.9180997, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29711914, + "step": 2397, + "time_per_iteration": 2.504763603210449 + }, + { + "auxiliary_loss_clip": 0.0661477, + "auxiliary_loss_mlp": 0.01289633, + "balance_loss_clip": 0.0631109, + "balance_loss_mlp": 0.01261452, + "epoch": 0.14417555989779046, + "flos": 18521735362560.0, + "grad_norm": 2.264494400552882, + "language_loss": 0.81188649, + "learning_rate": 3.86480373366343e-06, + "loss": 0.89093053, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.28186035, + "step": 2398, + "time_per_iteration": 2.5385115146636963 + }, + { + "auxiliary_loss_clip": 0.0661198, + "auxiliary_loss_mlp": 0.01292634, + "balance_loss_clip": 0.06310214, + "balance_loss_mlp": 0.01263535, + "epoch": 0.14423568315045843, + "flos": 26038933505280.0, + "grad_norm": 2.0391001830721014, + "language_loss": 0.65964776, + "learning_rate": 3.864662937804603e-06, + "loss": 0.73869389, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2911377, + "step": 2399, + "time_per_iteration": 2.5843687057495117 + }, + { + "auxiliary_loss_clip": 0.06611193, + "auxiliary_loss_mlp": 0.01283302, + "balance_loss_clip": 0.06308104, + "balance_loss_mlp": 0.01253953, + "epoch": 0.14429580640312642, + "flos": 21295238307840.0, + "grad_norm": 1.6766317515480094, + "language_loss": 0.83645046, + "learning_rate": 3.864522071237571e-06, + "loss": 0.91539544, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.29321289, + "step": 2400, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.06611119, + "auxiliary_loss_mlp": 0.01295227, + "balance_loss_clip": 0.06304638, + "balance_loss_mlp": 0.01263494, + "epoch": 0.14435592965579438, + "flos": 25634636755200.0, + "grad_norm": 1.4775307939223221, + "language_loss": 0.75889075, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83795416, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31738281, + "step": 2401, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.06599294, + "auxiliary_loss_mlp": 0.01290815, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01262991, + "epoch": 0.14441605290846235, + "flos": 22971488037120.0, + "grad_norm": 3.551603969288966, + "language_loss": 0.81723303, + "learning_rate": 3.86424012600026e-06, + "loss": 0.89613414, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27832031, + "step": 2402, + "time_per_iteration": 2.586766242980957 + }, + { + "auxiliary_loss_clip": 0.06609451, + "auxiliary_loss_mlp": 0.0129576, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01267246, + "epoch": 0.14447617616113032, + "flos": 17353386357120.0, + "grad_norm": 2.060017923221776, + "language_loss": 0.8556419, + "learning_rate": 3.864099047340673e-06, + "loss": 0.93469405, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28491211, + "step": 2403, + "time_per_iteration": 2.607682943344116 + }, + { + "auxiliary_loss_clip": 0.06604473, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06304755, + "balance_loss_mlp": 0.01267644, + "epoch": 0.14453629941379828, + "flos": 24066896463360.0, + "grad_norm": 1.6573993279871784, + "language_loss": 0.71218109, + "learning_rate": 3.863957897994262e-06, + "loss": 0.79119051, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.28833008, + "step": 2404, + "time_per_iteration": 2.5632174015045166 + }, + { + "auxiliary_loss_clip": 0.06603173, + "auxiliary_loss_mlp": 0.0129217, + "balance_loss_clip": 0.06303019, + "balance_loss_mlp": 0.0126282, + "epoch": 0.14459642266646625, + "flos": 14434924648320.0, + "grad_norm": 2.334574719230043, + "language_loss": 0.74209595, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82104933, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.29345703, + "step": 2405, + "time_per_iteration": 2.520474910736084 + }, + { + "auxiliary_loss_clip": 0.06599967, + "auxiliary_loss_mlp": 0.01307828, + "balance_loss_clip": 0.06301095, + "balance_loss_mlp": 0.01279647, + "epoch": 0.14465654591913424, + "flos": 9871337802240.0, + "grad_norm": 2.8694662985653245, + "language_loss": 0.74507034, + "learning_rate": 3.863675387262386e-06, + "loss": 0.8241483, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28173828, + "step": 2406, + "time_per_iteration": 2.5204012393951416 + }, + { + "auxiliary_loss_clip": 0.0660891, + "auxiliary_loss_mlp": 0.01299289, + "balance_loss_clip": 0.06308217, + "balance_loss_mlp": 0.01270584, + "epoch": 0.1447166691718022, + "flos": 24979890551040.0, + "grad_norm": 2.4466515535741027, + "language_loss": 0.77524543, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85432744, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28686523, + "step": 2407, + "time_per_iteration": 2.5871012210845947 + }, + { + "auxiliary_loss_clip": 0.06596132, + "auxiliary_loss_mlp": 0.01309759, + "balance_loss_clip": 0.06298497, + "balance_loss_mlp": 0.01281459, + "epoch": 0.14477679242447017, + "flos": 21914457580800.0, + "grad_norm": 2.4005439664015156, + "language_loss": 0.80167431, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.88073325, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28320312, + "step": 2408, + "time_per_iteration": 2.5400643348693848 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01300861, + "balance_loss_clip": 0.06305376, + "balance_loss_mlp": 0.0126939, + "epoch": 0.14483691567713813, + "flos": 20747030970240.0, + "grad_norm": 2.230633188895553, + "language_loss": 0.83653724, + "learning_rate": 3.863251091147299e-06, + "loss": 0.9156301, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31445312, + "step": 2409, + "time_per_iteration": 2.5423808097839355 + }, + { + "auxiliary_loss_clip": 0.06608373, + "auxiliary_loss_mlp": 0.0129938, + "balance_loss_clip": 0.06298821, + "balance_loss_mlp": 0.0126978, + "epoch": 0.1448970389298061, + "flos": 35416388943360.0, + "grad_norm": 2.041474654068305, + "language_loss": 0.76231539, + "learning_rate": 3.863109517792446e-06, + "loss": 0.84139293, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29602051, + "step": 2410, + "time_per_iteration": 2.6380317211151123 + }, + { + "auxiliary_loss_clip": 0.0660304, + "auxiliary_loss_mlp": 0.01294458, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265491, + "epoch": 0.14495716218247406, + "flos": 15419853066240.0, + "grad_norm": 1.847852108753089, + "language_loss": 0.8233192, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.90229416, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.28942871, + "step": 2411, + "time_per_iteration": 2.5439260005950928 + }, + { + "auxiliary_loss_clip": 0.06610366, + "auxiliary_loss_mlp": 0.0129153, + "balance_loss_clip": 0.06308557, + "balance_loss_mlp": 0.01262514, + "epoch": 0.14501728543514203, + "flos": 33701677390080.0, + "grad_norm": 2.23940850930143, + "language_loss": 0.71979284, + "learning_rate": 3.862826159140214e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29003906, + "step": 2412, + "time_per_iteration": 2.654892921447754 + }, + { + "auxiliary_loss_clip": 0.06603752, + "auxiliary_loss_mlp": 0.01292883, + "balance_loss_clip": 0.06306557, + "balance_loss_mlp": 0.01265465, + "epoch": 0.14507740868781002, + "flos": 15601512718080.0, + "grad_norm": 1.90667529133839, + "language_loss": 0.78426313, + "learning_rate": 3.862684373853579e-06, + "loss": 0.86322957, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27441406, + "step": 2413, + "time_per_iteration": 2.5105841159820557 + }, + { + "auxiliary_loss_clip": 0.06474504, + "auxiliary_loss_mlp": 0.01256457, + "balance_loss_clip": 0.06298508, + "balance_loss_mlp": 0.01246152, + "epoch": 0.145137531940478, + "flos": 66695247924480.0, + "grad_norm": 0.8850823768955927, + "language_loss": 0.58774322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.66505289, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.10308838, + "step": 2414, + "time_per_iteration": 3.0886166095733643 + }, + { + "auxiliary_loss_clip": 0.06466582, + "auxiliary_loss_mlp": 0.01255839, + "balance_loss_clip": 0.06291236, + "balance_loss_mlp": 0.01245486, + "epoch": 0.14519765519314595, + "flos": 67542806373120.0, + "grad_norm": 0.8215511806181923, + "language_loss": 0.61917955, + "learning_rate": 3.862400591386154e-06, + "loss": 0.69640374, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10357666, + "step": 2415, + "time_per_iteration": 3.1800529956817627 + }, + { + "auxiliary_loss_clip": 0.06605236, + "auxiliary_loss_mlp": 0.0128974, + "balance_loss_clip": 0.06304489, + "balance_loss_mlp": 0.01261226, + "epoch": 0.14525777844581392, + "flos": 17204151035520.0, + "grad_norm": 1.9287382315286696, + "language_loss": 0.72791839, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.80686808, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.28540039, + "step": 2416, + "time_per_iteration": 2.5888171195983887 + }, + { + "auxiliary_loss_clip": 0.06466876, + "auxiliary_loss_mlp": 0.01256349, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.01246574, + "epoch": 0.14531790169848188, + "flos": 65425349370240.0, + "grad_norm": 0.6779730680906524, + "language_loss": 0.60441911, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.68165135, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09765625, + "step": 2417, + "time_per_iteration": 3.256091356277466 + }, + { + "auxiliary_loss_clip": 0.06611343, + "auxiliary_loss_mlp": 0.01295709, + "balance_loss_clip": 0.06300741, + "balance_loss_mlp": 0.0126543, + "epoch": 0.14537802495114985, + "flos": 32570783959680.0, + "grad_norm": 9.327498524911116, + "language_loss": 0.80428064, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88335121, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.30297852, + "step": 2418, + "time_per_iteration": 2.6627931594848633 + }, + { + "auxiliary_loss_clip": 0.06597205, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01265952, + "epoch": 0.1454381482038178, + "flos": 20232338065920.0, + "grad_norm": 1.7107019560934957, + "language_loss": 0.72557437, + "learning_rate": 3.861832179025394e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.27746582, + "step": 2419, + "time_per_iteration": 2.55110764503479 + }, + { + "auxiliary_loss_clip": 0.06605242, + "auxiliary_loss_mlp": 0.01287615, + "balance_loss_clip": 0.06300443, + "balance_loss_mlp": 0.01258563, + "epoch": 0.1454982714564858, + "flos": 22899721415040.0, + "grad_norm": 2.764675065682222, + "language_loss": 0.91167969, + "learning_rate": 3.861689899419569e-06, + "loss": 0.99060822, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.29064941, + "step": 2420, + "time_per_iteration": 2.554682731628418 + }, + { + "auxiliary_loss_clip": 0.06610379, + "auxiliary_loss_mlp": 0.01289829, + "balance_loss_clip": 0.06309067, + "balance_loss_mlp": 0.01262757, + "epoch": 0.14555839470915377, + "flos": 20236027645440.0, + "grad_norm": 2.2697741355192034, + "language_loss": 0.83967364, + "learning_rate": 3.861547549218276e-06, + "loss": 0.91867572, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27050781, + "step": 2421, + "time_per_iteration": 2.5464484691619873 + }, + { + "auxiliary_loss_clip": 0.06610221, + "auxiliary_loss_mlp": 0.01287397, + "balance_loss_clip": 0.0630337, + "balance_loss_mlp": 0.01259216, + "epoch": 0.14561851796182174, + "flos": 22242753077760.0, + "grad_norm": 1.9618808249376125, + "language_loss": 0.82542074, + "learning_rate": 3.861405128426914e-06, + "loss": 0.90439695, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28173828, + "step": 2422, + "time_per_iteration": 2.5524632930755615 + }, + { + "auxiliary_loss_clip": 0.06461698, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.06287467, + "balance_loss_mlp": 0.01252607, + "epoch": 0.1456786412144897, + "flos": 52655758692480.0, + "grad_norm": 0.899920685315801, + "language_loss": 0.63252938, + "learning_rate": 3.861262637050883e-06, + "loss": 0.70976901, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.09649658, + "step": 2423, + "time_per_iteration": 3.186488151550293 + }, + { + "auxiliary_loss_clip": 0.06612016, + "auxiliary_loss_mlp": 0.01288368, + "balance_loss_clip": 0.06311088, + "balance_loss_mlp": 0.01261402, + "epoch": 0.14573876446715767, + "flos": 23228352328320.0, + "grad_norm": 1.6675722488639018, + "language_loss": 0.82883829, + "learning_rate": 3.861120075095585e-06, + "loss": 0.90784216, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.26928711, + "step": 2424, + "time_per_iteration": 2.6136088371276855 + }, + { + "auxiliary_loss_clip": 0.0660837, + "auxiliary_loss_mlp": 0.01282475, + "balance_loss_clip": 0.06310098, + "balance_loss_mlp": 0.01254246, + "epoch": 0.14579888771982563, + "flos": 18120331578240.0, + "grad_norm": 3.5994104334935733, + "language_loss": 0.79757202, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87648046, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28271484, + "step": 2425, + "time_per_iteration": 4.07472825050354 + }, + { + "auxiliary_loss_clip": 0.06616544, + "auxiliary_loss_mlp": 0.01291448, + "balance_loss_clip": 0.06312044, + "balance_loss_mlp": 0.01263577, + "epoch": 0.14585901097249362, + "flos": 23007476165760.0, + "grad_norm": 3.905152777460985, + "language_loss": 0.84682351, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92590338, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.27893066, + "step": 2426, + "time_per_iteration": 3.9595530033111572 + }, + { + "auxiliary_loss_clip": 0.066182, + "auxiliary_loss_mlp": 0.01297578, + "balance_loss_clip": 0.06312812, + "balance_loss_mlp": 0.0126904, + "epoch": 0.1459191342251616, + "flos": 21915212267520.0, + "grad_norm": 3.268887858496738, + "language_loss": 0.87538207, + "learning_rate": 3.860691965808173e-06, + "loss": 0.95453984, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28564453, + "step": 2427, + "time_per_iteration": 2.5644760131835938 + }, + { + "auxiliary_loss_clip": 0.0661422, + "auxiliary_loss_mlp": 0.01289371, + "balance_loss_clip": 0.06305077, + "balance_loss_mlp": 0.01258805, + "epoch": 0.14597925747782955, + "flos": 14980742144640.0, + "grad_norm": 1.9191014162631195, + "language_loss": 0.67673224, + "learning_rate": 3.8605491215899e-06, + "loss": 0.75576818, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.3059082, + "step": 2428, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.06609876, + "auxiliary_loss_mlp": 0.01290631, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01261807, + "epoch": 0.14603938073049752, + "flos": 21075200686080.0, + "grad_norm": 1.7530902442774277, + "language_loss": 0.84668899, + "learning_rate": 3.860406206819417e-06, + "loss": 0.92569411, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28833008, + "step": 2429, + "time_per_iteration": 2.5743284225463867 + }, + { + "auxiliary_loss_clip": 0.06606025, + "auxiliary_loss_mlp": 0.01297985, + "balance_loss_clip": 0.06307633, + "balance_loss_mlp": 0.01269661, + "epoch": 0.14609950398316549, + "flos": 19870863552000.0, + "grad_norm": 1.787324656259552, + "language_loss": 0.80119967, + "learning_rate": 3.860263221502145e-06, + "loss": 0.88023973, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28308105, + "step": 2430, + "time_per_iteration": 3.9587552547454834 + }, + { + "auxiliary_loss_clip": 0.06618911, + "auxiliary_loss_mlp": 0.01299566, + "balance_loss_clip": 0.06312407, + "balance_loss_mlp": 0.01271552, + "epoch": 0.14615962723583345, + "flos": 22425377051520.0, + "grad_norm": 2.031204881913862, + "language_loss": 0.84236491, + "learning_rate": 3.860120165643504e-06, + "loss": 0.92154968, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28051758, + "step": 2431, + "time_per_iteration": 2.5258126258850098 + }, + { + "auxiliary_loss_clip": 0.06622316, + "auxiliary_loss_mlp": 0.01304388, + "balance_loss_clip": 0.06307245, + "balance_loss_mlp": 0.01273823, + "epoch": 0.14621975048850142, + "flos": 22352813815680.0, + "grad_norm": 2.3067012157334976, + "language_loss": 0.79905456, + "learning_rate": 3.859977039248921e-06, + "loss": 0.87832165, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30566406, + "step": 2432, + "time_per_iteration": 2.5560994148254395 + }, + { + "auxiliary_loss_clip": 0.06613283, + "auxiliary_loss_mlp": 0.01299078, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01268894, + "epoch": 0.1462798737411694, + "flos": 24396030501120.0, + "grad_norm": 3.9772219479987796, + "language_loss": 0.8163479, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89547151, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30175781, + "step": 2433, + "time_per_iteration": 2.5528087615966797 + }, + { + "auxiliary_loss_clip": 0.06603821, + "auxiliary_loss_mlp": 0.01308033, + "balance_loss_clip": 0.06304027, + "balance_loss_mlp": 0.0128052, + "epoch": 0.14633999699383737, + "flos": 19250679957120.0, + "grad_norm": 5.860215383122996, + "language_loss": 0.79175711, + "learning_rate": 3.859690574873638e-06, + "loss": 0.87087572, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.27526855, + "step": 2434, + "time_per_iteration": 2.5396053791046143 + }, + { + "auxiliary_loss_clip": 0.0649661, + "auxiliary_loss_mlp": 0.01339476, + "balance_loss_clip": 0.0632303, + "balance_loss_mlp": 0.01328705, + "epoch": 0.14640012024650534, + "flos": 62679658780800.0, + "grad_norm": 0.822335797554765, + "language_loss": 0.58256161, + "learning_rate": 3.8595472369038e-06, + "loss": 0.66092247, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10784912, + "step": 2435, + "time_per_iteration": 3.147134304046631 + }, + { + "auxiliary_loss_clip": 0.06602708, + "auxiliary_loss_mlp": 0.0130236, + "balance_loss_clip": 0.06305322, + "balance_loss_mlp": 0.01274036, + "epoch": 0.1464602434991733, + "flos": 12281144100480.0, + "grad_norm": 2.2533392469478453, + "language_loss": 0.89637053, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97542119, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.28320312, + "step": 2436, + "time_per_iteration": 2.5397794246673584 + }, + { + "auxiliary_loss_clip": 0.06608147, + "auxiliary_loss_mlp": 0.01302382, + "balance_loss_clip": 0.06305888, + "balance_loss_mlp": 0.01274391, + "epoch": 0.14652036675184127, + "flos": 20928480986880.0, + "grad_norm": 2.9920720004583194, + "language_loss": 0.75810778, + "learning_rate": 3.85926034942691e-06, + "loss": 0.83721304, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2800293, + "step": 2437, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06610391, + "auxiliary_loss_mlp": 0.01306019, + "balance_loss_clip": 0.06306973, + "balance_loss_mlp": 0.01277123, + "epoch": 0.14658049000450923, + "flos": 27710151989760.0, + "grad_norm": 2.606428121821339, + "language_loss": 0.7401824, + "learning_rate": 3.859116799930736e-06, + "loss": 0.81934643, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28857422, + "step": 2438, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.06605977, + "auxiliary_loss_mlp": 0.01303285, + "balance_loss_clip": 0.06305391, + "balance_loss_mlp": 0.01274865, + "epoch": 0.14664061325717723, + "flos": 24943483152000.0, + "grad_norm": 2.0459162456522595, + "language_loss": 0.7577256, + "learning_rate": 3.858973179936668e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28442383, + "step": 2439, + "time_per_iteration": 2.5789241790771484 + }, + { + "auxiliary_loss_clip": 0.06618818, + "auxiliary_loss_mlp": 0.01305858, + "balance_loss_clip": 0.06318325, + "balance_loss_mlp": 0.01278261, + "epoch": 0.1467007365098452, + "flos": 40307306964480.0, + "grad_norm": 4.636382420589035, + "language_loss": 0.74925351, + "learning_rate": 3.85882948945015e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.27624512, + "step": 2440, + "time_per_iteration": 2.7299485206604004 + }, + { + "auxiliary_loss_clip": 0.06605764, + "auxiliary_loss_mlp": 0.01314168, + "balance_loss_clip": 0.06310172, + "balance_loss_mlp": 0.01287667, + "epoch": 0.14676085976251316, + "flos": 26548175894400.0, + "grad_norm": 2.8544116905201755, + "language_loss": 0.84429544, + "learning_rate": 3.85868572847663e-06, + "loss": 0.92349476, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.26513672, + "step": 2441, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0662398, + "auxiliary_loss_mlp": 0.01301683, + "balance_loss_clip": 0.06313129, + "balance_loss_mlp": 0.0127188, + "epoch": 0.14682098301518112, + "flos": 23556857460480.0, + "grad_norm": 2.3203183858424175, + "language_loss": 0.73868263, + "learning_rate": 3.858541897021563e-06, + "loss": 0.81793922, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.29785156, + "step": 2442, + "time_per_iteration": 2.549813747406006 + }, + { + "auxiliary_loss_clip": 0.06618661, + "auxiliary_loss_mlp": 0.01300103, + "balance_loss_clip": 0.06309915, + "balance_loss_mlp": 0.01271934, + "epoch": 0.1468811062678491, + "flos": 11655048792960.0, + "grad_norm": 3.9053582460255756, + "language_loss": 0.82657981, + "learning_rate": 3.8583979950904e-06, + "loss": 0.90576744, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28161621, + "step": 2443, + "time_per_iteration": 2.5171542167663574 + }, + { + "auxiliary_loss_clip": 0.06611481, + "auxiliary_loss_mlp": 0.01308471, + "balance_loss_clip": 0.06310362, + "balance_loss_mlp": 0.0128184, + "epoch": 0.14694122952051705, + "flos": 23009237101440.0, + "grad_norm": 2.0286604977239477, + "language_loss": 0.84266245, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92186195, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.26635742, + "step": 2444, + "time_per_iteration": 2.5373833179473877 + }, + { + "auxiliary_loss_clip": 0.06614003, + "auxiliary_loss_mlp": 0.01304434, + "balance_loss_clip": 0.0631294, + "balance_loss_mlp": 0.0127692, + "epoch": 0.14700135277318502, + "flos": 26509797924480.0, + "grad_norm": 1.800920496835182, + "language_loss": 0.72034383, + "learning_rate": 3.85810997982162e-06, + "loss": 0.79952818, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.27539062, + "step": 2445, + "time_per_iteration": 2.6035430431365967 + }, + { + "auxiliary_loss_clip": 0.0652153, + "auxiliary_loss_mlp": 0.01258872, + "balance_loss_clip": 0.06346728, + "balance_loss_mlp": 0.01251392, + "epoch": 0.147061476025853, + "flos": 59467841527680.0, + "grad_norm": 0.7965915579325233, + "language_loss": 0.62555134, + "learning_rate": 3.857965866494923e-06, + "loss": 0.70335531, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.074646, + "step": 2446, + "time_per_iteration": 3.0864346027374268 + }, + { + "auxiliary_loss_clip": 0.06631434, + "auxiliary_loss_mlp": 0.01305294, + "balance_loss_clip": 0.06324492, + "balance_loss_mlp": 0.01278603, + "epoch": 0.14712159927852098, + "flos": 28338637138560.0, + "grad_norm": 5.819879904445231, + "language_loss": 0.75890815, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83827543, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.26708984, + "step": 2447, + "time_per_iteration": 2.6405458450317383 + }, + { + "auxiliary_loss_clip": 0.0662236, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_clip": 0.06319176, + "balance_loss_mlp": 0.01267097, + "epoch": 0.14718172253118894, + "flos": 27097263699840.0, + "grad_norm": 3.1585594254982094, + "language_loss": 0.86766493, + "learning_rate": 3.857677428484242e-06, + "loss": 0.94684267, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.28344727, + "step": 2448, + "time_per_iteration": 2.588178873062134 + }, + { + "auxiliary_loss_clip": 0.06500641, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.0632707, + "balance_loss_mlp": 0.01254792, + "epoch": 0.1472418457838569, + "flos": 66725827464960.0, + "grad_norm": 0.7311302410121435, + "language_loss": 0.56820273, + "learning_rate": 3.857533103811195e-06, + "loss": 0.64583808, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08105469, + "step": 2449, + "time_per_iteration": 3.1432383060455322 + }, + { + "auxiliary_loss_clip": 0.06619844, + "auxiliary_loss_mlp": 0.01304126, + "balance_loss_clip": 0.06319091, + "balance_loss_mlp": 0.01278663, + "epoch": 0.14730196903652487, + "flos": 19579730140800.0, + "grad_norm": 2.3714801519715185, + "language_loss": 0.86300421, + "learning_rate": 3.857388708700307e-06, + "loss": 0.94224387, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.2545166, + "step": 2450, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.06624465, + "auxiliary_loss_mlp": 0.01292799, + "balance_loss_clip": 0.06318057, + "balance_loss_mlp": 0.01265774, + "epoch": 0.14736209228919284, + "flos": 16076611768320.0, + "grad_norm": 3.0293103266492336, + "language_loss": 0.76407862, + "learning_rate": 3.857244243157052e-06, + "loss": 0.84325123, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2701416, + "step": 2451, + "time_per_iteration": 2.562429428100586 + }, + { + "auxiliary_loss_clip": 0.06606978, + "auxiliary_loss_mlp": 0.0129124, + "balance_loss_clip": 0.0631422, + "balance_loss_mlp": 0.01263881, + "epoch": 0.1474222155418608, + "flos": 23046147624960.0, + "grad_norm": 2.189425489790517, + "language_loss": 0.82725209, + "learning_rate": 3.85709970718691e-06, + "loss": 0.90623426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27355957, + "step": 2452, + "time_per_iteration": 2.5850419998168945 + }, + { + "auxiliary_loss_clip": 0.06614233, + "auxiliary_loss_mlp": 0.01290168, + "balance_loss_clip": 0.06316262, + "balance_loss_mlp": 0.01264562, + "epoch": 0.1474823387945288, + "flos": 17024210392320.0, + "grad_norm": 1.704036472783103, + "language_loss": 0.7534892, + "learning_rate": 3.856955100795361e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.2565918, + "step": 2453, + "time_per_iteration": 2.56315016746521 + }, + { + "auxiliary_loss_clip": 0.06629206, + "auxiliary_loss_mlp": 0.01291559, + "balance_loss_clip": 0.06321974, + "balance_loss_mlp": 0.01263521, + "epoch": 0.14754246204719676, + "flos": 17900880935040.0, + "grad_norm": 2.0859032314961836, + "language_loss": 0.7740314, + "learning_rate": 3.856810423987889e-06, + "loss": 0.853239, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.28076172, + "step": 2454, + "time_per_iteration": 2.512051582336426 + }, + { + "auxiliary_loss_clip": 0.06621231, + "auxiliary_loss_mlp": 0.01296513, + "balance_loss_clip": 0.06321682, + "balance_loss_mlp": 0.01269392, + "epoch": 0.14760258529986472, + "flos": 13084161304320.0, + "grad_norm": 2.060710477094934, + "language_loss": 0.84565163, + "learning_rate": 3.856665676769979e-06, + "loss": 0.92482901, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.2713623, + "step": 2455, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.06633241, + "auxiliary_loss_mlp": 0.01283691, + "balance_loss_clip": 0.06325488, + "balance_loss_mlp": 0.01257393, + "epoch": 0.1476627085525327, + "flos": 30813627513600.0, + "grad_norm": 5.872574686414898, + "language_loss": 0.85135001, + "learning_rate": 3.85652085914712e-06, + "loss": 0.93051934, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.26281738, + "step": 2456, + "time_per_iteration": 2.638485908508301 + }, + { + "auxiliary_loss_clip": 0.0661984, + "auxiliary_loss_mlp": 0.01288462, + "balance_loss_clip": 0.06324227, + "balance_loss_mlp": 0.01261926, + "epoch": 0.14772283180520066, + "flos": 21695887405440.0, + "grad_norm": 3.5788318870076674, + "language_loss": 0.85374033, + "learning_rate": 3.856375971124805e-06, + "loss": 0.93282336, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26550293, + "step": 2457, + "time_per_iteration": 2.5397539138793945 + }, + { + "auxiliary_loss_clip": 0.06612187, + "auxiliary_loss_mlp": 0.01285174, + "balance_loss_clip": 0.06322154, + "balance_loss_mlp": 0.01258817, + "epoch": 0.14778295505786862, + "flos": 18776335593600.0, + "grad_norm": 2.2072082990650896, + "language_loss": 0.76667166, + "learning_rate": 3.856231012708527e-06, + "loss": 0.84564531, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26379395, + "step": 2458, + "time_per_iteration": 2.5479953289031982 + }, + { + "auxiliary_loss_clip": 0.0664083, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 0.06331704, + "balance_loss_mlp": 0.01262992, + "epoch": 0.1478430783105366, + "flos": 22900224539520.0, + "grad_norm": 2.4431680555354185, + "language_loss": 0.84230208, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92162013, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.28027344, + "step": 2459, + "time_per_iteration": 2.555878162384033 + }, + { + "auxiliary_loss_clip": 0.06625295, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 0.06332543, + "balance_loss_mlp": 0.01257983, + "epoch": 0.14790320156320458, + "flos": 15090635174400.0, + "grad_norm": 2.440333441232677, + "language_loss": 0.76468259, + "learning_rate": 3.855940884716071e-06, + "loss": 0.84376764, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.2520752, + "step": 2460, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06624737, + "auxiliary_loss_mlp": 0.01287086, + "balance_loss_clip": 0.06318681, + "balance_loss_mlp": 0.0125912, + "epoch": 0.14796332481587254, + "flos": 26511894276480.0, + "grad_norm": 1.7434250987621476, + "language_loss": 0.82039559, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89951384, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27941895, + "step": 2461, + "time_per_iteration": 2.609023332595825 + }, + { + "auxiliary_loss_clip": 0.06627606, + "auxiliary_loss_mlp": 0.0129144, + "balance_loss_clip": 0.06326235, + "balance_loss_mlp": 0.01263497, + "epoch": 0.1480234480685405, + "flos": 17568392734080.0, + "grad_norm": 4.638743932579621, + "language_loss": 0.6665929, + "learning_rate": 3.855650475213761e-06, + "loss": 0.74578333, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.27954102, + "step": 2462, + "time_per_iteration": 2.5234897136688232 + }, + { + "auxiliary_loss_clip": 0.06619708, + "auxiliary_loss_mlp": 0.01287497, + "balance_loss_clip": 0.06320504, + "balance_loss_mlp": 0.01260925, + "epoch": 0.14808357132120847, + "flos": 53594693147520.0, + "grad_norm": 12.154278546197556, + "language_loss": 0.68225503, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.76132703, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.26574707, + "step": 2463, + "time_per_iteration": 2.847352981567383 + }, + { + "auxiliary_loss_clip": 0.06631631, + "auxiliary_loss_mlp": 0.01292564, + "balance_loss_clip": 0.06328086, + "balance_loss_mlp": 0.01264788, + "epoch": 0.14814369457387644, + "flos": 19835420474880.0, + "grad_norm": 2.5558663587768917, + "language_loss": 0.77389717, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85313916, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.27783203, + "step": 2464, + "time_per_iteration": 3.9868950843811035 + }, + { + "auxiliary_loss_clip": 0.0661262, + "auxiliary_loss_mlp": 0.01291855, + "balance_loss_clip": 0.06322042, + "balance_loss_mlp": 0.01266356, + "epoch": 0.1482038178265444, + "flos": 23921769991680.0, + "grad_norm": 1.9637026483751652, + "language_loss": 0.80667269, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88571739, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.25500488, + "step": 2465, + "time_per_iteration": 4.024165630340576 + }, + { + "auxiliary_loss_clip": 0.06628035, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 0.06321928, + "balance_loss_mlp": 0.01265976, + "epoch": 0.1482639410792124, + "flos": 24177376471680.0, + "grad_norm": 3.100026638907138, + "language_loss": 0.77266049, + "learning_rate": 3.855068811855817e-06, + "loss": 0.85189527, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.29467773, + "step": 2466, + "time_per_iteration": 2.583932638168335 + }, + { + "auxiliary_loss_clip": 0.06510445, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06339325, + "balance_loss_mlp": 0.012657, + "epoch": 0.14832406433188036, + "flos": 66209205916800.0, + "grad_norm": 0.9642098795906485, + "language_loss": 0.60506117, + "learning_rate": 3.854923220141551e-06, + "loss": 0.68291849, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.09570312, + "step": 2467, + "time_per_iteration": 3.206559419631958 + }, + { + "auxiliary_loss_clip": 0.06627056, + "auxiliary_loss_mlp": 0.0129155, + "balance_loss_clip": 0.06326642, + "balance_loss_mlp": 0.01264573, + "epoch": 0.14838418758454833, + "flos": 25418372567040.0, + "grad_norm": 2.1383686818257877, + "language_loss": 0.88646448, + "learning_rate": 3.85477755808841e-06, + "loss": 0.96565056, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.26965332, + "step": 2468, + "time_per_iteration": 2.586428642272949 + }, + { + "auxiliary_loss_clip": 0.06632069, + "auxiliary_loss_mlp": 0.01295941, + "balance_loss_clip": 0.0632536, + "balance_loss_mlp": 0.01267236, + "epoch": 0.1484443108372163, + "flos": 23295800465280.0, + "grad_norm": 2.089009169061615, + "language_loss": 0.76661634, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84589648, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28686523, + "step": 2469, + "time_per_iteration": 5.45016884803772 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01291477, + "balance_loss_clip": 0.06328478, + "balance_loss_mlp": 0.01264131, + "epoch": 0.14850443408988426, + "flos": 14652949772160.0, + "grad_norm": 3.485678754962802, + "language_loss": 0.76790643, + "learning_rate": 3.854486022987603e-06, + "loss": 0.84712803, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2734375, + "step": 2470, + "time_per_iteration": 2.514772653579712 + }, + { + "auxiliary_loss_clip": 0.06622952, + "auxiliary_loss_mlp": 0.01299835, + "balance_loss_clip": 0.06329592, + "balance_loss_mlp": 0.0127324, + "epoch": 0.14856455734255222, + "flos": 23554761108480.0, + "grad_norm": 3.1357945603829576, + "language_loss": 0.73019731, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.80942523, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26623535, + "step": 2471, + "time_per_iteration": 2.5867044925689697 + }, + { + "auxiliary_loss_clip": 0.06632146, + "auxiliary_loss_mlp": 0.01309567, + "balance_loss_clip": 0.06325525, + "balance_loss_mlp": 0.01281862, + "epoch": 0.1486246805952202, + "flos": 18083127565440.0, + "grad_norm": 2.6270207816723894, + "language_loss": 0.90878981, + "learning_rate": 3.854194206597615e-06, + "loss": 0.98820698, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.27709961, + "step": 2472, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01314043, + "balance_loss_clip": 0.06322667, + "balance_loss_mlp": 0.01286136, + "epoch": 0.14868480384788818, + "flos": 19359566737920.0, + "grad_norm": 2.5877207728101332, + "language_loss": 0.81794894, + "learning_rate": 3.854048192933008e-06, + "loss": 0.89729816, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.2791748, + "step": 2473, + "time_per_iteration": 2.551769256591797 + }, + { + "auxiliary_loss_clip": 0.06630681, + "auxiliary_loss_mlp": 0.01339003, + "balance_loss_clip": 0.06328606, + "balance_loss_mlp": 0.01311346, + "epoch": 0.14874492710055615, + "flos": 22206723022080.0, + "grad_norm": 2.4925002468384423, + "language_loss": 0.79495537, + "learning_rate": 3.853902108962709e-06, + "loss": 0.87465227, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27624512, + "step": 2474, + "time_per_iteration": 2.55029034614563 + }, + { + "auxiliary_loss_clip": 0.06643772, + "auxiliary_loss_mlp": 0.01336817, + "balance_loss_clip": 0.06335679, + "balance_loss_mlp": 0.01309256, + "epoch": 0.1488050503532241, + "flos": 21109427879040.0, + "grad_norm": 2.598618910298095, + "language_loss": 0.8324194, + "learning_rate": 3.853755954692255e-06, + "loss": 0.91222525, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27575684, + "step": 2475, + "time_per_iteration": 2.557748794555664 + }, + { + "auxiliary_loss_clip": 0.06641456, + "auxiliary_loss_mlp": 0.01357893, + "balance_loss_clip": 0.06342697, + "balance_loss_mlp": 0.01329998, + "epoch": 0.14886517360589208, + "flos": 12791476592640.0, + "grad_norm": 3.118918756982401, + "language_loss": 0.81896377, + "learning_rate": 3.85360973012719e-06, + "loss": 0.89895725, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27929688, + "step": 2476, + "time_per_iteration": 2.5228424072265625 + }, + { + "auxiliary_loss_clip": 0.06643493, + "auxiliary_loss_mlp": 0.01381513, + "balance_loss_clip": 0.06351461, + "balance_loss_mlp": 0.01354202, + "epoch": 0.14892529685856004, + "flos": 29030503501440.0, + "grad_norm": 5.933104141951435, + "language_loss": 0.78306687, + "learning_rate": 3.853463435273058e-06, + "loss": 0.86331695, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27307129, + "step": 2477, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.06518018, + "auxiliary_loss_mlp": 0.01346882, + "balance_loss_clip": 0.06346889, + "balance_loss_mlp": 0.01337793, + "epoch": 0.148985420111228, + "flos": 61944215495040.0, + "grad_norm": 0.7948106415234558, + "language_loss": 0.60108519, + "learning_rate": 3.853317070135407e-06, + "loss": 0.67973411, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.09100342, + "step": 2478, + "time_per_iteration": 3.2091856002807617 + }, + { + "auxiliary_loss_clip": 0.06656381, + "auxiliary_loss_mlp": 0.01381988, + "balance_loss_clip": 0.06356013, + "balance_loss_mlp": 0.01354606, + "epoch": 0.149045543363896, + "flos": 23921937699840.0, + "grad_norm": 3.933079411076695, + "language_loss": 0.71247137, + "learning_rate": 3.853170634719787e-06, + "loss": 0.79285508, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.27392578, + "step": 2479, + "time_per_iteration": 2.613901376724243 + }, + { + "auxiliary_loss_clip": 0.06657803, + "auxiliary_loss_mlp": 0.01383638, + "balance_loss_clip": 0.06357619, + "balance_loss_mlp": 0.01356411, + "epoch": 0.14910566661656396, + "flos": 23660293726080.0, + "grad_norm": 3.520474403550157, + "language_loss": 0.82057166, + "learning_rate": 3.853024129031751e-06, + "loss": 0.90098608, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.27246094, + "step": 2480, + "time_per_iteration": 2.6175220012664795 + }, + { + "auxiliary_loss_clip": 0.06659204, + "auxiliary_loss_mlp": 0.01416958, + "balance_loss_clip": 0.06354087, + "balance_loss_mlp": 0.01387727, + "epoch": 0.14916578986923193, + "flos": 20520452730240.0, + "grad_norm": 2.2296604280919805, + "language_loss": 0.85048115, + "learning_rate": 3.852877553076854e-06, + "loss": 0.9312427, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.29248047, + "step": 2481, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.06647365, + "auxiliary_loss_mlp": 0.01423314, + "balance_loss_clip": 0.06347671, + "balance_loss_mlp": 0.01393416, + "epoch": 0.1492259131218999, + "flos": 22498359557760.0, + "grad_norm": 1.912212150867571, + "language_loss": 0.78788674, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86859351, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.29882812, + "step": 2482, + "time_per_iteration": 2.5733768939971924 + }, + { + "auxiliary_loss_clip": 0.06663539, + "auxiliary_loss_mlp": 0.0143468, + "balance_loss_clip": 0.06351975, + "balance_loss_mlp": 0.01405808, + "epoch": 0.14928603637456786, + "flos": 23192657688960.0, + "grad_norm": 2.2991604479376777, + "language_loss": 0.80652654, + "learning_rate": 3.852584190388713e-06, + "loss": 0.88750875, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.28857422, + "step": 2483, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.06641878, + "auxiliary_loss_mlp": 0.01472083, + "balance_loss_clip": 0.06352127, + "balance_loss_mlp": 0.01442948, + "epoch": 0.14934615962723582, + "flos": 21659731568640.0, + "grad_norm": 2.0225233992765728, + "language_loss": 0.71627355, + "learning_rate": 3.852437403666595e-06, + "loss": 0.79741317, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.2911377, + "step": 2484, + "time_per_iteration": 2.5717227458953857 + }, + { + "auxiliary_loss_clip": 0.06650308, + "auxiliary_loss_mlp": 0.01467216, + "balance_loss_clip": 0.06347484, + "balance_loss_mlp": 0.01435006, + "epoch": 0.1494062828799038, + "flos": 27016356983040.0, + "grad_norm": 2.0068383034806154, + "language_loss": 0.85284823, + "learning_rate": 3.852290546699863e-06, + "loss": 0.9340235, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.32226562, + "step": 2485, + "time_per_iteration": 2.7037456035614014 + }, + { + "auxiliary_loss_clip": 0.0664534, + "auxiliary_loss_mlp": 0.01441016, + "balance_loss_clip": 0.06342804, + "balance_loss_mlp": 0.01410952, + "epoch": 0.14946640613257178, + "flos": 21221291479680.0, + "grad_norm": 2.0879118929126133, + "language_loss": 0.85614496, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.93700856, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30053711, + "step": 2486, + "time_per_iteration": 2.5492942333221436 + }, + { + "auxiliary_loss_clip": 0.06628142, + "auxiliary_loss_mlp": 0.01484598, + "balance_loss_clip": 0.06337839, + "balance_loss_mlp": 0.01454963, + "epoch": 0.14952652938523975, + "flos": 13375965548160.0, + "grad_norm": 2.5864541617313805, + "language_loss": 0.75625527, + "learning_rate": 3.851996622054842e-06, + "loss": 0.83738261, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.29638672, + "step": 2487, + "time_per_iteration": 2.6050243377685547 + }, + { + "auxiliary_loss_clip": 0.06636909, + "auxiliary_loss_mlp": 0.01458272, + "balance_loss_clip": 0.06336737, + "balance_loss_mlp": 0.01427635, + "epoch": 0.1495866526379077, + "flos": 35526491608320.0, + "grad_norm": 2.6345212857914415, + "language_loss": 0.72756326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80851501, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30639648, + "step": 2488, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.06629623, + "auxiliary_loss_mlp": 0.01463441, + "balance_loss_clip": 0.06324254, + "balance_loss_mlp": 0.01431421, + "epoch": 0.14964677589057568, + "flos": 17637392171520.0, + "grad_norm": 3.2533111651102633, + "language_loss": 0.71329439, + "learning_rate": 3.851702416498235e-06, + "loss": 0.79422504, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.3203125, + "step": 2489, + "time_per_iteration": 2.6397132873535156 + }, + { + "auxiliary_loss_clip": 0.06627091, + "auxiliary_loss_mlp": 0.01445303, + "balance_loss_clip": 0.06321006, + "balance_loss_mlp": 0.01412807, + "epoch": 0.14970689914324364, + "flos": 20190102808320.0, + "grad_norm": 15.387963507460157, + "language_loss": 0.82698536, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.90770924, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32446289, + "step": 2490, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01421627, + "balance_loss_clip": 0.06318316, + "balance_loss_mlp": 0.013913, + "epoch": 0.1497670223959116, + "flos": 37237136238720.0, + "grad_norm": 2.555318554574921, + "language_loss": 0.81524169, + "learning_rate": 3.851407930074666e-06, + "loss": 0.8956933, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30322266, + "step": 2491, + "time_per_iteration": 2.7191121578216553 + }, + { + "auxiliary_loss_clip": 0.06628857, + "auxiliary_loss_mlp": 0.01437567, + "balance_loss_clip": 0.06323408, + "balance_loss_mlp": 0.01406072, + "epoch": 0.1498271456485796, + "flos": 24461675775360.0, + "grad_norm": 2.0859620961652032, + "language_loss": 0.91616488, + "learning_rate": 3.851260581551727e-06, + "loss": 0.99682909, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31469727, + "step": 2492, + "time_per_iteration": 2.5775644779205322 + }, + { + "auxiliary_loss_clip": 0.06620014, + "auxiliary_loss_mlp": 0.01407656, + "balance_loss_clip": 0.06319647, + "balance_loss_mlp": 0.01375589, + "epoch": 0.14988726890124757, + "flos": 16259235742080.0, + "grad_norm": 4.194340578044498, + "language_loss": 0.80698526, + "learning_rate": 3.851113162828802e-06, + "loss": 0.88726199, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.3203125, + "step": 2493, + "time_per_iteration": 2.522217273712158 + }, + { + "auxiliary_loss_clip": 0.06625558, + "auxiliary_loss_mlp": 0.01423964, + "balance_loss_clip": 0.06320652, + "balance_loss_mlp": 0.01391014, + "epoch": 0.14994739215391553, + "flos": 20672622944640.0, + "grad_norm": 1.92476481647275, + "language_loss": 0.81586623, + "learning_rate": 3.85096567391148e-06, + "loss": 0.89636147, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32958984, + "step": 2494, + "time_per_iteration": 2.5768370628356934 + }, + { + "auxiliary_loss_clip": 0.06620924, + "auxiliary_loss_mlp": 0.01381746, + "balance_loss_clip": 0.06323613, + "balance_loss_mlp": 0.01351562, + "epoch": 0.1500075154065835, + "flos": 70666855603200.0, + "grad_norm": 1.9921469546830013, + "language_loss": 0.67712897, + "learning_rate": 3.850818114805354e-06, + "loss": 0.75715572, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.30187988, + "step": 2495, + "time_per_iteration": 2.9661571979522705 + }, + { + "auxiliary_loss_clip": 0.06548879, + "auxiliary_loss_mlp": 0.01321563, + "balance_loss_clip": 0.06377496, + "balance_loss_mlp": 0.01310876, + "epoch": 0.15006763865925146, + "flos": 68029827431040.0, + "grad_norm": 0.8769612772619841, + "language_loss": 0.5954529, + "learning_rate": 3.850670485516019e-06, + "loss": 0.67415726, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10699463, + "step": 2496, + "time_per_iteration": 3.202047109603882 + }, + { + "auxiliary_loss_clip": 0.06631249, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 0.06323538, + "balance_loss_mlp": 0.01304254, + "epoch": 0.15012776191191943, + "flos": 18922216752000.0, + "grad_norm": 2.34505525234942, + "language_loss": 0.66916072, + "learning_rate": 3.850522786049075e-06, + "loss": 0.74882078, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.30505371, + "step": 2497, + "time_per_iteration": 2.5355312824249268 + }, + { + "auxiliary_loss_clip": 0.06621728, + "auxiliary_loss_mlp": 0.01327478, + "balance_loss_clip": 0.06319709, + "balance_loss_mlp": 0.01299762, + "epoch": 0.1501878851645874, + "flos": 23708985747840.0, + "grad_norm": 1.6926191632820315, + "language_loss": 0.76545727, + "learning_rate": 3.850375016410121e-06, + "loss": 0.84494931, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27746582, + "step": 2498, + "time_per_iteration": 2.6315629482269287 + }, + { + "auxiliary_loss_clip": 0.06625126, + "auxiliary_loss_mlp": 0.0132033, + "balance_loss_clip": 0.06315958, + "balance_loss_mlp": 0.01288454, + "epoch": 0.15024800841725539, + "flos": 20418777400320.0, + "grad_norm": 2.3031515729251377, + "language_loss": 0.72851908, + "learning_rate": 3.850227176604761e-06, + "loss": 0.80797374, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.3190918, + "step": 2499, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.06615321, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 0.06312654, + "balance_loss_mlp": 0.01270002, + "epoch": 0.15030813166992335, + "flos": 31838904472320.0, + "grad_norm": 2.1036429780105204, + "language_loss": 0.72527623, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80442744, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29760742, + "step": 2500, + "time_per_iteration": 2.66140079498291 + }, + { + "auxiliary_loss_clip": 0.06611083, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 0.06309603, + "balance_loss_mlp": 0.0126765, + "epoch": 0.15036825492259132, + "flos": 35665664440320.0, + "grad_norm": 2.1651988912264697, + "language_loss": 0.6639303, + "learning_rate": 3.849931286517249e-06, + "loss": 0.74300444, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.28686523, + "step": 2501, + "time_per_iteration": 2.6920387744903564 + }, + { + "auxiliary_loss_clip": 0.06617519, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06313312, + "balance_loss_mlp": 0.01262283, + "epoch": 0.15042837817525928, + "flos": 18843238679040.0, + "grad_norm": 2.189390095106363, + "language_loss": 0.84965289, + "learning_rate": 3.849783236246318e-06, + "loss": 0.92876226, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31152344, + "step": 2502, + "time_per_iteration": 2.5896334648132324 + }, + { + "auxiliary_loss_clip": 0.06611362, + "auxiliary_loss_mlp": 0.01289243, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.0126142, + "epoch": 0.15048850142792725, + "flos": 19541436024960.0, + "grad_norm": 2.1165990533687746, + "language_loss": 0.78282011, + "learning_rate": 3.849635115831421e-06, + "loss": 0.86182618, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2779541, + "step": 2503, + "time_per_iteration": 3.9853694438934326 + }, + { + "auxiliary_loss_clip": 0.06603716, + "auxiliary_loss_mlp": 0.01289674, + "balance_loss_clip": 0.06307186, + "balance_loss_mlp": 0.01263102, + "epoch": 0.1505486246805952, + "flos": 22024015194240.0, + "grad_norm": 1.9675013040349558, + "language_loss": 0.8635025, + "learning_rate": 3.849486925278176e-06, + "loss": 0.94243646, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26586914, + "step": 2504, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.06603047, + "auxiliary_loss_mlp": 0.0129183, + "balance_loss_clip": 0.06305411, + "balance_loss_mlp": 0.01264794, + "epoch": 0.15060874793326318, + "flos": 20749840081920.0, + "grad_norm": 2.8187796049403127, + "language_loss": 0.83803535, + "learning_rate": 3.8493386645922e-06, + "loss": 0.91698414, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27050781, + "step": 2505, + "time_per_iteration": 3.988954544067383 + }, + { + "auxiliary_loss_clip": 0.06600159, + "auxiliary_loss_mlp": 0.01291215, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01263249, + "epoch": 0.15066887118593117, + "flos": 16477470501120.0, + "grad_norm": 1.903749804745976, + "language_loss": 0.77148849, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85040224, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27978516, + "step": 2506, + "time_per_iteration": 2.548551559448242 + }, + { + "auxiliary_loss_clip": 0.06619012, + "auxiliary_loss_mlp": 0.01287214, + "balance_loss_clip": 0.06307869, + "balance_loss_mlp": 0.01257722, + "epoch": 0.15072899443859913, + "flos": 19864490641920.0, + "grad_norm": 4.281401041045214, + "language_loss": 0.78119665, + "learning_rate": 3.849041932844552e-06, + "loss": 0.86025894, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.29467773, + "step": 2507, + "time_per_iteration": 2.494123697280884 + }, + { + "auxiliary_loss_clip": 0.06598042, + "auxiliary_loss_mlp": 0.01289211, + "balance_loss_clip": 0.06304646, + "balance_loss_mlp": 0.01262532, + "epoch": 0.1507891176912671, + "flos": 20782348266240.0, + "grad_norm": 1.9743385281698682, + "language_loss": 0.69510758, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77398014, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26647949, + "step": 2508, + "time_per_iteration": 2.53487491607666 + }, + { + "auxiliary_loss_clip": 0.06608425, + "auxiliary_loss_mlp": 0.01288258, + "balance_loss_clip": 0.06303222, + "balance_loss_mlp": 0.01259946, + "epoch": 0.15084924094393506, + "flos": 23593390640640.0, + "grad_norm": 1.8413842263271991, + "language_loss": 0.78278601, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.86175287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.28320312, + "step": 2509, + "time_per_iteration": 5.512920141220093 + }, + { + "auxiliary_loss_clip": 0.06619874, + "auxiliary_loss_mlp": 0.01301611, + "balance_loss_clip": 0.06305903, + "balance_loss_mlp": 0.01270879, + "epoch": 0.15090936419660303, + "flos": 18916430820480.0, + "grad_norm": 3.8878243194331756, + "language_loss": 0.82607746, + "learning_rate": 3.848596309368246e-06, + "loss": 0.90529227, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30688477, + "step": 2510, + "time_per_iteration": 2.4956603050231934 + }, + { + "auxiliary_loss_clip": 0.0661021, + "auxiliary_loss_mlp": 0.01290438, + "balance_loss_clip": 0.06301613, + "balance_loss_mlp": 0.01258919, + "epoch": 0.150969487449271, + "flos": 17933514900480.0, + "grad_norm": 2.455863983709149, + "language_loss": 0.74876237, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.82776886, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.31518555, + "step": 2511, + "time_per_iteration": 2.551175832748413 + }, + { + "auxiliary_loss_clip": 0.06603982, + "auxiliary_loss_mlp": 0.0129301, + "balance_loss_clip": 0.06306278, + "balance_loss_mlp": 0.012649, + "epoch": 0.151029610701939, + "flos": 24249897780480.0, + "grad_norm": 3.2919067663681854, + "language_loss": 0.6990515, + "learning_rate": 3.848298876546534e-06, + "loss": 0.77802145, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28100586, + "step": 2512, + "time_per_iteration": 2.592564344406128 + }, + { + "auxiliary_loss_clip": 0.06602003, + "auxiliary_loss_mlp": 0.01290201, + "balance_loss_clip": 0.06302576, + "balance_loss_mlp": 0.01260136, + "epoch": 0.15108973395460695, + "flos": 30270199858560.0, + "grad_norm": 3.311694411348407, + "language_loss": 0.75370401, + "learning_rate": 3.84815005500134e-06, + "loss": 0.8326261, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 2513, + "time_per_iteration": 2.675105571746826 + }, + { + "auxiliary_loss_clip": 0.06516539, + "auxiliary_loss_mlp": 0.01341982, + "balance_loss_clip": 0.06344443, + "balance_loss_mlp": 0.01333804, + "epoch": 0.15114985720727492, + "flos": 60456711087360.0, + "grad_norm": 0.8564181084280313, + "language_loss": 0.64582717, + "learning_rate": 3.84800116337411e-06, + "loss": 0.72441238, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08178711, + "step": 2514, + "time_per_iteration": 3.1119604110717773 + }, + { + "auxiliary_loss_clip": 0.06602134, + "auxiliary_loss_mlp": 0.01300136, + "balance_loss_clip": 0.06303127, + "balance_loss_mlp": 0.01271299, + "epoch": 0.15120998045994288, + "flos": 20527915743360.0, + "grad_norm": 2.3848506685629487, + "language_loss": 0.74193883, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.82096153, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.28869629, + "step": 2515, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.06601816, + "auxiliary_loss_mlp": 0.01297055, + "balance_loss_clip": 0.06304994, + "balance_loss_mlp": 0.01269577, + "epoch": 0.15127010371261085, + "flos": 21185303351040.0, + "grad_norm": 1.9231590772251361, + "language_loss": 0.78707075, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86605948, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.27490234, + "step": 2516, + "time_per_iteration": 2.5447309017181396 + }, + { + "auxiliary_loss_clip": 0.06496674, + "auxiliary_loss_mlp": 0.01300995, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01292406, + "epoch": 0.1513302269652788, + "flos": 65339537189760.0, + "grad_norm": 0.7164418146378366, + "language_loss": 0.54901356, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.62699026, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08599854, + "step": 2517, + "time_per_iteration": 3.1926348209381104 + }, + { + "auxiliary_loss_clip": 0.06606746, + "auxiliary_loss_mlp": 0.01299298, + "balance_loss_clip": 0.06308446, + "balance_loss_mlp": 0.01269257, + "epoch": 0.15139035021794678, + "flos": 19141918957440.0, + "grad_norm": 1.8480469380115683, + "language_loss": 0.79359663, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87265706, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30078125, + "step": 2518, + "time_per_iteration": 2.563261032104492 + }, + { + "auxiliary_loss_clip": 0.06615496, + "auxiliary_loss_mlp": 0.01294147, + "balance_loss_clip": 0.06308527, + "balance_loss_mlp": 0.01264154, + "epoch": 0.15145047347061477, + "flos": 26585841104640.0, + "grad_norm": 2.595059574569343, + "language_loss": 0.71604168, + "learning_rate": 3.847255654205137e-06, + "loss": 0.79513812, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29980469, + "step": 2519, + "time_per_iteration": 2.5810017585754395 + }, + { + "auxiliary_loss_clip": 0.06607082, + "auxiliary_loss_mlp": 0.01285902, + "balance_loss_clip": 0.06307598, + "balance_loss_mlp": 0.01257483, + "epoch": 0.15151059672328274, + "flos": 20309177859840.0, + "grad_norm": 2.5486902935962368, + "language_loss": 0.80309343, + "learning_rate": 3.847106342204354e-06, + "loss": 0.88202327, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28393555, + "step": 2520, + "time_per_iteration": 2.5701065063476562 + }, + { + "auxiliary_loss_clip": 0.06607689, + "auxiliary_loss_mlp": 0.01293848, + "balance_loss_clip": 0.06306153, + "balance_loss_mlp": 0.01262853, + "epoch": 0.1515707199759507, + "flos": 27234591742080.0, + "grad_norm": 2.513682116437687, + "language_loss": 0.7522434, + "learning_rate": 3.846956960161114e-06, + "loss": 0.83125877, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31005859, + "step": 2521, + "time_per_iteration": 2.6066393852233887 + }, + { + "auxiliary_loss_clip": 0.06609409, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 0.06305401, + "balance_loss_mlp": 0.012643, + "epoch": 0.15163084322861867, + "flos": 23594229181440.0, + "grad_norm": 3.360256579964136, + "language_loss": 0.82804251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.9070757, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.29614258, + "step": 2522, + "time_per_iteration": 2.6204471588134766 + }, + { + "auxiliary_loss_clip": 0.06466869, + "auxiliary_loss_mlp": 0.01261371, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01253491, + "epoch": 0.15169096648128663, + "flos": 66908786855040.0, + "grad_norm": 0.8016115215940587, + "language_loss": 0.58029842, + "learning_rate": 3.846657985969922e-06, + "loss": 0.65758073, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07873535, + "step": 2523, + "time_per_iteration": 3.1140880584716797 + }, + { + "auxiliary_loss_clip": 0.06599564, + "auxiliary_loss_mlp": 0.0128657, + "balance_loss_clip": 0.0630584, + "balance_loss_mlp": 0.0125821, + "epoch": 0.1517510897339546, + "flos": 29103024810240.0, + "grad_norm": 3.3848907238065324, + "language_loss": 0.7552231, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83408445, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2833252, + "step": 2524, + "time_per_iteration": 2.6701698303222656 + }, + { + "auxiliary_loss_clip": 0.066016, + "auxiliary_loss_mlp": 0.01289357, + "balance_loss_clip": 0.0629995, + "balance_loss_mlp": 0.01259889, + "epoch": 0.1518112129866226, + "flos": 18412597019520.0, + "grad_norm": 1.915224291313093, + "language_loss": 0.75580716, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.8347168, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.29443359, + "step": 2525, + "time_per_iteration": 2.5224146842956543 + }, + { + "auxiliary_loss_clip": 0.06610245, + "auxiliary_loss_mlp": 0.01284071, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.01254436, + "epoch": 0.15187133623929056, + "flos": 19431165651840.0, + "grad_norm": 1.8765466933559616, + "language_loss": 0.80763042, + "learning_rate": 3.846208999506402e-06, + "loss": 0.88657361, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.29638672, + "step": 2526, + "time_per_iteration": 2.6248834133148193 + }, + { + "auxiliary_loss_clip": 0.06594585, + "auxiliary_loss_mlp": 0.01286752, + "balance_loss_clip": 0.06300339, + "balance_loss_mlp": 0.01258869, + "epoch": 0.15193145949195852, + "flos": 17571914605440.0, + "grad_norm": 1.7842428302313325, + "language_loss": 0.8627159, + "learning_rate": 3.846059197327466e-06, + "loss": 0.94152921, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27893066, + "step": 2527, + "time_per_iteration": 2.5703248977661133 + }, + { + "auxiliary_loss_clip": 0.06595106, + "auxiliary_loss_mlp": 0.01287139, + "balance_loss_clip": 0.06298759, + "balance_loss_mlp": 0.01258386, + "epoch": 0.15199158274462649, + "flos": 36185472443520.0, + "grad_norm": 2.5277358880769034, + "language_loss": 0.69832277, + "learning_rate": 3.845909325145779e-06, + "loss": 0.77714521, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.28710938, + "step": 2528, + "time_per_iteration": 2.6980392932891846 + }, + { + "auxiliary_loss_clip": 0.06594975, + "auxiliary_loss_mlp": 0.01296705, + "balance_loss_clip": 0.06302442, + "balance_loss_mlp": 0.01268142, + "epoch": 0.15205170599729445, + "flos": 23080416744960.0, + "grad_norm": 1.7045403282780136, + "language_loss": 0.87845027, + "learning_rate": 3.845759382967026e-06, + "loss": 0.95736718, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.28588867, + "step": 2529, + "time_per_iteration": 2.557424545288086 + }, + { + "auxiliary_loss_clip": 0.06594887, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06300049, + "balance_loss_mlp": 0.01254446, + "epoch": 0.15211182924996242, + "flos": 21914876851200.0, + "grad_norm": 2.4637975770903227, + "language_loss": 0.84209996, + "learning_rate": 3.845609370796893e-06, + "loss": 0.92087275, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27929688, + "step": 2530, + "time_per_iteration": 2.567228317260742 + }, + { + "auxiliary_loss_clip": 0.06598973, + "auxiliary_loss_mlp": 0.01283946, + "balance_loss_clip": 0.06302072, + "balance_loss_mlp": 0.01255336, + "epoch": 0.15217195250263038, + "flos": 13886675383680.0, + "grad_norm": 2.4321779104905312, + "language_loss": 0.82142234, + "learning_rate": 3.845459288641066e-06, + "loss": 0.90025157, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28637695, + "step": 2531, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.06592906, + "auxiliary_loss_mlp": 0.01285145, + "balance_loss_clip": 0.06298403, + "balance_loss_mlp": 0.01258085, + "epoch": 0.15223207575529837, + "flos": 24542247075840.0, + "grad_norm": 1.9096136580750296, + "language_loss": 0.79480046, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.87358099, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27050781, + "step": 2532, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.06598103, + "auxiliary_loss_mlp": 0.01292588, + "balance_loss_clip": 0.06306568, + "balance_loss_mlp": 0.01264038, + "epoch": 0.15229219900796634, + "flos": 25563876382080.0, + "grad_norm": 2.360683407186041, + "language_loss": 0.88639164, + "learning_rate": 3.845158914395105e-06, + "loss": 0.96529853, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.28552246, + "step": 2533, + "time_per_iteration": 2.5762295722961426 + }, + { + "auxiliary_loss_clip": 0.06594107, + "auxiliary_loss_mlp": 0.01284606, + "balance_loss_clip": 0.06298208, + "balance_loss_mlp": 0.01254935, + "epoch": 0.1523523222606343, + "flos": 18222761594880.0, + "grad_norm": 2.499608410280873, + "language_loss": 0.79898536, + "learning_rate": 3.84500862231636e-06, + "loss": 0.87777245, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29650879, + "step": 2534, + "time_per_iteration": 2.5181829929351807 + }, + { + "auxiliary_loss_clip": 0.06609488, + "auxiliary_loss_mlp": 0.01289006, + "balance_loss_clip": 0.0630374, + "balance_loss_mlp": 0.01258965, + "epoch": 0.15241244551330227, + "flos": 13264940488320.0, + "grad_norm": 3.191609676619316, + "language_loss": 0.77956164, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8585465, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.30029297, + "step": 2535, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06608094, + "auxiliary_loss_mlp": 0.01284526, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01254271, + "epoch": 0.15247256876597023, + "flos": 19721083178880.0, + "grad_norm": 3.2947050027003066, + "language_loss": 0.79165435, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87058055, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.30249023, + "step": 2536, + "time_per_iteration": 2.5530476570129395 + }, + { + "auxiliary_loss_clip": 0.06598002, + "auxiliary_loss_mlp": 0.0128534, + "balance_loss_clip": 0.06305596, + "balance_loss_mlp": 0.01255537, + "epoch": 0.1525326920186382, + "flos": 20382076512000.0, + "grad_norm": 2.2639852442912174, + "language_loss": 0.76164496, + "learning_rate": 3.844557326325461e-06, + "loss": 0.84047836, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.29785156, + "step": 2537, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.06616107, + "auxiliary_loss_mlp": 0.01291403, + "balance_loss_clip": 0.06314284, + "balance_loss_mlp": 0.0126017, + "epoch": 0.15259281527130616, + "flos": 13595122702080.0, + "grad_norm": 2.083719097909717, + "language_loss": 0.78846097, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.86753607, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.31225586, + "step": 2538, + "time_per_iteration": 2.525216579437256 + }, + { + "auxiliary_loss_clip": 0.0661103, + "auxiliary_loss_mlp": 0.01284923, + "balance_loss_clip": 0.06318808, + "balance_loss_mlp": 0.0125735, + "epoch": 0.15265293852397416, + "flos": 22867590574080.0, + "grad_norm": 1.595971485409624, + "language_loss": 0.90629852, + "learning_rate": 3.844256112593029e-06, + "loss": 0.98525798, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.27600098, + "step": 2539, + "time_per_iteration": 2.5915887355804443 + }, + { + "auxiliary_loss_clip": 0.06619261, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 0.06323005, + "balance_loss_mlp": 0.01258056, + "epoch": 0.15271306177664212, + "flos": 29245174462080.0, + "grad_norm": 1.9545185046664433, + "language_loss": 0.94507146, + "learning_rate": 3.844105400822391e-06, + "loss": 1.02411401, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.26953125, + "step": 2540, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06626961, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 0.06334557, + "balance_loss_mlp": 0.01266021, + "epoch": 0.1527731850293101, + "flos": 31253912392320.0, + "grad_norm": 1.8583637495379903, + "language_loss": 0.76235664, + "learning_rate": 3.843954619123092e-06, + "loss": 0.84156162, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.27490234, + "step": 2541, + "time_per_iteration": 2.6641690731048584 + }, + { + "auxiliary_loss_clip": 0.06626125, + "auxiliary_loss_mlp": 0.01288118, + "balance_loss_clip": 0.06332077, + "balance_loss_mlp": 0.01259139, + "epoch": 0.15283330828197805, + "flos": 22388550382080.0, + "grad_norm": 1.961487412354616, + "language_loss": 0.82183802, + "learning_rate": 3.84380376750085e-06, + "loss": 0.90098047, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.28991699, + "step": 2542, + "time_per_iteration": 2.5667076110839844 + }, + { + "auxiliary_loss_clip": 0.06644198, + "auxiliary_loss_mlp": 0.01293823, + "balance_loss_clip": 0.0634245, + "balance_loss_mlp": 0.01263568, + "epoch": 0.15289343153464602, + "flos": 25527175493760.0, + "grad_norm": 2.1541705335190597, + "language_loss": 0.78364998, + "learning_rate": 3.843652845961383e-06, + "loss": 0.8630302, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.3026123, + "step": 2543, + "time_per_iteration": 3.986154556274414 + }, + { + "auxiliary_loss_clip": 0.06638096, + "auxiliary_loss_mlp": 0.01299522, + "balance_loss_clip": 0.06343587, + "balance_loss_mlp": 0.01271616, + "epoch": 0.15295355478731398, + "flos": 22716468535680.0, + "grad_norm": 3.1436155023596886, + "language_loss": 0.88072753, + "learning_rate": 3.843501854510416e-06, + "loss": 0.96010375, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.27905273, + "step": 2544, + "time_per_iteration": 3.9873733520507812 + }, + { + "auxiliary_loss_clip": 0.06648069, + "auxiliary_loss_mlp": 0.01297216, + "balance_loss_clip": 0.06342938, + "balance_loss_mlp": 0.01266937, + "epoch": 0.15301367803998198, + "flos": 23257548276480.0, + "grad_norm": 3.867712661232465, + "language_loss": 0.83686781, + "learning_rate": 3.843350793153673e-06, + "loss": 0.91632062, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.30273438, + "step": 2545, + "time_per_iteration": 2.5443849563598633 + }, + { + "auxiliary_loss_clip": 0.06650628, + "auxiliary_loss_mlp": 0.01286742, + "balance_loss_clip": 0.06356554, + "balance_loss_mlp": 0.01259086, + "epoch": 0.15307380129264994, + "flos": 25893597398400.0, + "grad_norm": 2.572032347282614, + "language_loss": 0.71873057, + "learning_rate": 3.843199661896884e-06, + "loss": 0.79810423, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27661133, + "step": 2546, + "time_per_iteration": 2.650826930999756 + }, + { + "auxiliary_loss_clip": 0.06637084, + "auxiliary_loss_mlp": 0.0129342, + "balance_loss_clip": 0.06340081, + "balance_loss_mlp": 0.01263164, + "epoch": 0.1531339245453179, + "flos": 46983780766080.0, + "grad_norm": 1.694960648035813, + "language_loss": 0.78831929, + "learning_rate": 3.843048460745779e-06, + "loss": 0.86762434, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30249023, + "step": 2547, + "time_per_iteration": 2.7530312538146973 + }, + { + "auxiliary_loss_clip": 0.06643492, + "auxiliary_loss_mlp": 0.01284901, + "balance_loss_clip": 0.06342105, + "balance_loss_mlp": 0.0125579, + "epoch": 0.15319404779798587, + "flos": 35890817160960.0, + "grad_norm": 3.38346990001629, + "language_loss": 0.75178528, + "learning_rate": 3.842897189706092e-06, + "loss": 0.83106923, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29138184, + "step": 2548, + "time_per_iteration": 4.090601682662964 + }, + { + "auxiliary_loss_clip": 0.06638174, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 0.06343598, + "balance_loss_mlp": 0.01255757, + "epoch": 0.15325417105065384, + "flos": 25671463424640.0, + "grad_norm": 1.8173203040893826, + "language_loss": 0.82054353, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89975703, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.27416992, + "step": 2549, + "time_per_iteration": 4.0024590492248535 + }, + { + "auxiliary_loss_clip": 0.06642953, + "auxiliary_loss_mlp": 0.01284523, + "balance_loss_clip": 0.06343073, + "balance_loss_mlp": 0.01256366, + "epoch": 0.1533142943033218, + "flos": 18776838718080.0, + "grad_norm": 1.6738213226373704, + "language_loss": 0.76089072, + "learning_rate": 3.842594437983917e-06, + "loss": 0.84016538, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.28137207, + "step": 2550, + "time_per_iteration": 2.5584487915039062 + }, + { + "auxiliary_loss_clip": 0.06640078, + "auxiliary_loss_mlp": 0.01284284, + "balance_loss_clip": 0.063375, + "balance_loss_mlp": 0.01257093, + "epoch": 0.15337441755598977, + "flos": 23113218418560.0, + "grad_norm": 2.77223179347166, + "language_loss": 0.78078097, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.86002457, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.27172852, + "step": 2551, + "time_per_iteration": 2.5581319332122803 + }, + { + "auxiliary_loss_clip": 0.06594751, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.0641477, + "balance_loss_mlp": 0.01255657, + "epoch": 0.15343454080865776, + "flos": 59881278372480.0, + "grad_norm": 0.9086682427744472, + "language_loss": 0.56718183, + "learning_rate": 3.842291406776283e-06, + "loss": 0.6457777, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09179688, + "step": 2552, + "time_per_iteration": 3.099020004272461 + }, + { + "auxiliary_loss_clip": 0.06649399, + "auxiliary_loss_mlp": 0.01294284, + "balance_loss_clip": 0.06343735, + "balance_loss_mlp": 0.01263695, + "epoch": 0.15349466406132573, + "flos": 11915644590720.0, + "grad_norm": 7.1683362370520625, + "language_loss": 0.89047897, + "learning_rate": 3.84213978637978e-06, + "loss": 0.96991581, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30615234, + "step": 2553, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.06633511, + "auxiliary_loss_mlp": 0.01288342, + "balance_loss_clip": 0.0633003, + "balance_loss_mlp": 0.01258575, + "epoch": 0.1535547873139937, + "flos": 24103681205760.0, + "grad_norm": 2.37345039804312, + "language_loss": 0.79193908, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87115765, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.29748535, + "step": 2554, + "time_per_iteration": 2.5949606895446777 + }, + { + "auxiliary_loss_clip": 0.06630482, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06329404, + "balance_loss_mlp": 0.01256278, + "epoch": 0.15361491056666166, + "flos": 17572208094720.0, + "grad_norm": 5.650486163134607, + "language_loss": 0.79014289, + "learning_rate": 3.841836336030151e-06, + "loss": 0.86931169, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.3013916, + "step": 2555, + "time_per_iteration": 2.5340495109558105 + }, + { + "auxiliary_loss_clip": 0.0662353, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 0.06330266, + "balance_loss_mlp": 0.01260671, + "epoch": 0.15367503381932962, + "flos": 25053040765440.0, + "grad_norm": 1.6796179562313394, + "language_loss": 0.78025055, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85936922, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.2767334, + "step": 2556, + "time_per_iteration": 2.623685121536255 + }, + { + "auxiliary_loss_clip": 0.06620497, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 0.0633128, + "balance_loss_mlp": 0.01260086, + "epoch": 0.15373515707199759, + "flos": 21513808483200.0, + "grad_norm": 2.256114728182097, + "language_loss": 0.91304088, + "learning_rate": 3.84153260631005e-06, + "loss": 0.99212819, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.28161621, + "step": 2557, + "time_per_iteration": 2.6546642780303955 + }, + { + "auxiliary_loss_clip": 0.06632135, + "auxiliary_loss_mlp": 0.01294079, + "balance_loss_clip": 0.0633366, + "balance_loss_mlp": 0.0126411, + "epoch": 0.15379528032466555, + "flos": 26001897200640.0, + "grad_norm": 2.0796567985016656, + "language_loss": 0.71532625, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79458839, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.29980469, + "step": 2558, + "time_per_iteration": 2.604158401489258 + }, + { + "auxiliary_loss_clip": 0.06622511, + "auxiliary_loss_mlp": 0.01287721, + "balance_loss_clip": 0.06324002, + "balance_loss_mlp": 0.01258336, + "epoch": 0.15385540357733354, + "flos": 19282685016960.0, + "grad_norm": 2.0921223854633166, + "language_loss": 0.93401122, + "learning_rate": 3.841228597265548e-06, + "loss": 1.0131135, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.29382324, + "step": 2559, + "time_per_iteration": 2.546621799468994 + }, + { + "auxiliary_loss_clip": 0.06626738, + "auxiliary_loss_mlp": 0.01291924, + "balance_loss_clip": 0.06328855, + "balance_loss_mlp": 0.01262289, + "epoch": 0.1539155268300015, + "flos": 28556788043520.0, + "grad_norm": 2.7498914144184994, + "language_loss": 0.65563196, + "learning_rate": 3.841076488011055e-06, + "loss": 0.73481858, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.29638672, + "step": 2560, + "time_per_iteration": 2.633558511734009 + }, + { + "auxiliary_loss_clip": 0.06620878, + "auxiliary_loss_mlp": 0.01293003, + "balance_loss_clip": 0.06320217, + "balance_loss_mlp": 0.01262927, + "epoch": 0.15397565008266947, + "flos": 23554257984000.0, + "grad_norm": 1.9722034302545564, + "language_loss": 0.89109504, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9702338, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.30065918, + "step": 2561, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.06618848, + "auxiliary_loss_mlp": 0.01287729, + "balance_loss_clip": 0.06331521, + "balance_loss_mlp": 0.01260811, + "epoch": 0.15403577333533744, + "flos": 17135696649600.0, + "grad_norm": 2.292455015225775, + "language_loss": 0.83781528, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91688108, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.26928711, + "step": 2562, + "time_per_iteration": 2.5630288124084473 + }, + { + "auxiliary_loss_clip": 0.06628443, + "auxiliary_loss_mlp": 0.01297123, + "balance_loss_clip": 0.06321231, + "balance_loss_mlp": 0.01265175, + "epoch": 0.1540958965880054, + "flos": 17900252029440.0, + "grad_norm": 3.685635027542056, + "language_loss": 0.75855017, + "learning_rate": 3.840619741387832e-06, + "loss": 0.83780587, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31958008, + "step": 2563, + "time_per_iteration": 2.5140066146850586 + }, + { + "auxiliary_loss_clip": 0.06627464, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 0.06320702, + "balance_loss_mlp": 0.01258481, + "epoch": 0.15415601984067337, + "flos": 32169296321280.0, + "grad_norm": 2.478610974211426, + "language_loss": 0.77803361, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.85721207, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3190918, + "step": 2564, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.06615369, + "auxiliary_loss_mlp": 0.0129364, + "balance_loss_clip": 0.06320594, + "balance_loss_mlp": 0.01264267, + "epoch": 0.15421614309334136, + "flos": 24031243751040.0, + "grad_norm": 1.9916685694635767, + "language_loss": 0.71840364, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7974937, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.29321289, + "step": 2565, + "time_per_iteration": 2.553128480911255 + }, + { + "auxiliary_loss_clip": 0.06614129, + "auxiliary_loss_mlp": 0.01296634, + "balance_loss_clip": 0.06317951, + "balance_loss_mlp": 0.01266212, + "epoch": 0.15427626634600933, + "flos": 24392676337920.0, + "grad_norm": 2.5526224211901676, + "language_loss": 0.72527832, + "learning_rate": 3.840162366596259e-06, + "loss": 0.8043859, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30419922, + "step": 2566, + "time_per_iteration": 2.6016533374786377 + }, + { + "auxiliary_loss_clip": 0.06605071, + "auxiliary_loss_mlp": 0.01292884, + "balance_loss_clip": 0.06314062, + "balance_loss_mlp": 0.01265263, + "epoch": 0.1543363895986773, + "flos": 23338287285120.0, + "grad_norm": 2.301564838599309, + "language_loss": 0.86417472, + "learning_rate": 3.840009768766408e-06, + "loss": 0.94315434, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.27612305, + "step": 2567, + "time_per_iteration": 2.5882625579833984 + }, + { + "auxiliary_loss_clip": 0.06608227, + "auxiliary_loss_mlp": 0.01293398, + "balance_loss_clip": 0.06315389, + "balance_loss_mlp": 0.01265348, + "epoch": 0.15439651285134526, + "flos": 24280225758720.0, + "grad_norm": 2.3922484360691576, + "language_loss": 0.79661417, + "learning_rate": 3.839857101163202e-06, + "loss": 0.87563044, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28039551, + "step": 2568, + "time_per_iteration": 2.6128549575805664 + }, + { + "auxiliary_loss_clip": 0.06604031, + "auxiliary_loss_mlp": 0.01296391, + "balance_loss_clip": 0.06313319, + "balance_loss_mlp": 0.01268103, + "epoch": 0.15445663610401322, + "flos": 22462832626560.0, + "grad_norm": 2.2987457723616482, + "language_loss": 0.71156412, + "learning_rate": 3.83970436379243e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28295898, + "step": 2569, + "time_per_iteration": 2.555661916732788 + }, + { + "auxiliary_loss_clip": 0.06609643, + "auxiliary_loss_mlp": 0.0129108, + "balance_loss_clip": 0.06317194, + "balance_loss_mlp": 0.0126197, + "epoch": 0.1545167593566812, + "flos": 22055223640320.0, + "grad_norm": 2.1871959478456433, + "language_loss": 0.7775144, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85652161, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29150391, + "step": 2570, + "time_per_iteration": 2.5834736824035645 + }, + { + "auxiliary_loss_clip": 0.06598657, + "auxiliary_loss_mlp": 0.01290077, + "balance_loss_clip": 0.06308745, + "balance_loss_mlp": 0.01260513, + "epoch": 0.15457688260934915, + "flos": 19324375223040.0, + "grad_norm": 2.749201239461968, + "language_loss": 0.7861867, + "learning_rate": 3.839398679771359e-06, + "loss": 0.86507404, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.29541016, + "step": 2571, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.06606804, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 0.06313352, + "balance_loss_mlp": 0.01265785, + "epoch": 0.15463700586201715, + "flos": 24140843291520.0, + "grad_norm": 1.901838675989398, + "language_loss": 0.83756542, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91658223, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29101562, + "step": 2572, + "time_per_iteration": 2.597111463546753 + }, + { + "auxiliary_loss_clip": 0.06611877, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 0.06316563, + "balance_loss_mlp": 0.01266393, + "epoch": 0.1546971291146851, + "flos": 22427808819840.0, + "grad_norm": 2.3334374955274466, + "language_loss": 0.91633451, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9954139, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.29699707, + "step": 2573, + "time_per_iteration": 2.553586721420288 + }, + { + "auxiliary_loss_clip": 0.06606219, + "auxiliary_loss_mlp": 0.01288918, + "balance_loss_clip": 0.06312492, + "balance_loss_mlp": 0.01258639, + "epoch": 0.15475725236735308, + "flos": 17536010330880.0, + "grad_norm": 1.5970575826599196, + "language_loss": 0.71088636, + "learning_rate": 3.838939630627893e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30249023, + "step": 2574, + "time_per_iteration": 2.5485129356384277 + }, + { + "auxiliary_loss_clip": 0.06606239, + "auxiliary_loss_mlp": 0.01287836, + "balance_loss_clip": 0.06312916, + "balance_loss_mlp": 0.01258439, + "epoch": 0.15481737562002104, + "flos": 22567778265600.0, + "grad_norm": 2.064736624590997, + "language_loss": 0.83194166, + "learning_rate": 3.838786474773448e-06, + "loss": 0.91088241, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29394531, + "step": 2575, + "time_per_iteration": 2.5202696323394775 + }, + { + "auxiliary_loss_clip": 0.06611623, + "auxiliary_loss_mlp": 0.01295032, + "balance_loss_clip": 0.06317705, + "balance_loss_mlp": 0.01267137, + "epoch": 0.154877498872689, + "flos": 24907620804480.0, + "grad_norm": 1.9923268704643078, + "language_loss": 0.8600359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.93910241, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.27929688, + "step": 2576, + "time_per_iteration": 2.5677525997161865 + }, + { + "auxiliary_loss_clip": 0.06609543, + "auxiliary_loss_mlp": 0.01301269, + "balance_loss_clip": 0.06318229, + "balance_loss_mlp": 0.01275126, + "epoch": 0.15493762212535697, + "flos": 28155048842880.0, + "grad_norm": 2.065090565667539, + "language_loss": 0.82887769, + "learning_rate": 3.838479953889465e-06, + "loss": 0.90798575, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26147461, + "step": 2577, + "time_per_iteration": 2.5728230476379395 + }, + { + "auxiliary_loss_clip": 0.06618612, + "auxiliary_loss_mlp": 0.01306082, + "balance_loss_clip": 0.06324668, + "balance_loss_mlp": 0.01276852, + "epoch": 0.15499774537802496, + "flos": 25418162931840.0, + "grad_norm": 2.85112064725787, + "language_loss": 0.77597427, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.85522127, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.29199219, + "step": 2578, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.06630063, + "auxiliary_loss_mlp": 0.01289241, + "balance_loss_clip": 0.06328662, + "balance_loss_mlp": 0.01259224, + "epoch": 0.15505786863069293, + "flos": 22098213584640.0, + "grad_norm": 1.7655677053725216, + "language_loss": 0.8325448, + "learning_rate": 3.83817315414411e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.30004883, + "step": 2579, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.06624122, + "auxiliary_loss_mlp": 0.01293638, + "balance_loss_clip": 0.06327586, + "balance_loss_mlp": 0.01264074, + "epoch": 0.1551179918833609, + "flos": 18923223000960.0, + "grad_norm": 3.703462791860066, + "language_loss": 0.81290895, + "learning_rate": 3.838019649712958e-06, + "loss": 0.89208651, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.2956543, + "step": 2580, + "time_per_iteration": 2.547076940536499 + }, + { + "auxiliary_loss_clip": 0.06553604, + "auxiliary_loss_mlp": 0.01296097, + "balance_loss_clip": 0.06379167, + "balance_loss_mlp": 0.01287341, + "epoch": 0.15517811513602886, + "flos": 66259281530880.0, + "grad_norm": 0.8290210768149422, + "language_loss": 0.59028411, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6687811, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08770752, + "step": 2581, + "time_per_iteration": 4.748734712600708 + }, + { + "auxiliary_loss_clip": 0.06615421, + "auxiliary_loss_mlp": 0.01287932, + "balance_loss_clip": 0.06319774, + "balance_loss_mlp": 0.01259286, + "epoch": 0.15523823838869683, + "flos": 24027344536320.0, + "grad_norm": 2.048194408824491, + "language_loss": 0.86481762, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.94385123, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28625488, + "step": 2582, + "time_per_iteration": 2.5417592525482178 + }, + { + "auxiliary_loss_clip": 0.0661144, + "auxiliary_loss_mlp": 0.01292493, + "balance_loss_clip": 0.06316175, + "balance_loss_mlp": 0.01262262, + "epoch": 0.1552983616413648, + "flos": 20491256782080.0, + "grad_norm": 2.196568898095916, + "language_loss": 0.79934382, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87838316, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.30236816, + "step": 2583, + "time_per_iteration": 4.1261961460113525 + }, + { + "auxiliary_loss_clip": 0.06610835, + "auxiliary_loss_mlp": 0.01301507, + "balance_loss_clip": 0.06316249, + "balance_loss_mlp": 0.01272956, + "epoch": 0.15535848489403276, + "flos": 32131798819200.0, + "grad_norm": 2.2182475294075643, + "language_loss": 0.77203268, + "learning_rate": 3.837404935067705e-06, + "loss": 0.85115612, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28515625, + "step": 2584, + "time_per_iteration": 2.71648907661438 + }, + { + "auxiliary_loss_clip": 0.06603897, + "auxiliary_loss_mlp": 0.01292119, + "balance_loss_clip": 0.06309253, + "balance_loss_mlp": 0.01263676, + "epoch": 0.15541860814670075, + "flos": 19104379528320.0, + "grad_norm": 2.0708341386331157, + "language_loss": 0.76718783, + "learning_rate": 3.837251082205368e-06, + "loss": 0.84614801, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.28442383, + "step": 2585, + "time_per_iteration": 2.548250198364258 + }, + { + "auxiliary_loss_clip": 0.06590863, + "auxiliary_loss_mlp": 0.01288896, + "balance_loss_clip": 0.06303678, + "balance_loss_mlp": 0.01260607, + "epoch": 0.1554787313993687, + "flos": 19178158648320.0, + "grad_norm": 2.0117198745869134, + "language_loss": 0.6235339, + "learning_rate": 3.837097159674286e-06, + "loss": 0.70233154, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.28283691, + "step": 2586, + "time_per_iteration": 2.5397160053253174 + }, + { + "auxiliary_loss_clip": 0.06596754, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 0.0630295, + "balance_loss_mlp": 0.0126023, + "epoch": 0.15553885465203668, + "flos": 16149384639360.0, + "grad_norm": 2.0060039427442065, + "language_loss": 0.82540935, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90427202, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29321289, + "step": 2587, + "time_per_iteration": 2.5246498584747314 + }, + { + "auxiliary_loss_clip": 0.06596097, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 0.06299823, + "balance_loss_mlp": 0.01257152, + "epoch": 0.15559897790470464, + "flos": 25344803082240.0, + "grad_norm": 1.8823875807099288, + "language_loss": 0.8996799, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97851378, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30126953, + "step": 2588, + "time_per_iteration": 4.054608345031738 + }, + { + "auxiliary_loss_clip": 0.06588855, + "auxiliary_loss_mlp": 0.01285264, + "balance_loss_clip": 0.06298578, + "balance_loss_mlp": 0.01255628, + "epoch": 0.1556591011573726, + "flos": 23155453676160.0, + "grad_norm": 2.3276735592444253, + "language_loss": 0.65979421, + "learning_rate": 3.83663497412695e-06, + "loss": 0.7385354, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.29614258, + "step": 2589, + "time_per_iteration": 2.5870378017425537 + }, + { + "auxiliary_loss_clip": 0.06587367, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01254554, + "epoch": 0.15571922441004057, + "flos": 25377353193600.0, + "grad_norm": 1.8444510343536653, + "language_loss": 0.83209628, + "learning_rate": 3.836480772979281e-06, + "loss": 0.91080129, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.2857666, + "step": 2590, + "time_per_iteration": 2.567789316177368 + }, + { + "auxiliary_loss_clip": 0.06586926, + "auxiliary_loss_mlp": 0.01284797, + "balance_loss_clip": 0.06295232, + "balance_loss_mlp": 0.0125819, + "epoch": 0.15577934766270854, + "flos": 14506565489280.0, + "grad_norm": 2.5394168350381956, + "language_loss": 0.80645335, + "learning_rate": 3.836326502192077e-06, + "loss": 0.88517064, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26635742, + "step": 2591, + "time_per_iteration": 2.552945852279663 + }, + { + "auxiliary_loss_clip": 0.06583126, + "auxiliary_loss_mlp": 0.0128094, + "balance_loss_clip": 0.06296018, + "balance_loss_mlp": 0.01255953, + "epoch": 0.15583947091537653, + "flos": 37423575573120.0, + "grad_norm": 4.213698124732034, + "language_loss": 0.6586749, + "learning_rate": 3.836172161771189e-06, + "loss": 0.73731554, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.25024414, + "step": 2592, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.06601857, + "auxiliary_loss_mlp": 0.01282978, + "balance_loss_clip": 0.06306329, + "balance_loss_mlp": 0.01254547, + "epoch": 0.1558995941680445, + "flos": 21841097731200.0, + "grad_norm": 2.3724666239354804, + "language_loss": 0.83576721, + "learning_rate": 3.836017751722467e-06, + "loss": 0.91461557, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28442383, + "step": 2593, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.06586924, + "auxiliary_loss_mlp": 0.01289301, + "balance_loss_clip": 0.06303876, + "balance_loss_mlp": 0.01261526, + "epoch": 0.15595971742071246, + "flos": 19798845367680.0, + "grad_norm": 2.2297480783075847, + "language_loss": 0.74099863, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.8197608, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27819824, + "step": 2594, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.06601368, + "auxiliary_loss_mlp": 0.01282916, + "balance_loss_clip": 0.06319516, + "balance_loss_mlp": 0.01257346, + "epoch": 0.15601984067338043, + "flos": 26729038932480.0, + "grad_norm": 2.826820029132309, + "language_loss": 0.82562411, + "learning_rate": 3.835708722764952e-06, + "loss": 0.90446699, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.2557373, + "step": 2595, + "time_per_iteration": 2.640240430831909 + }, + { + "auxiliary_loss_clip": 0.06626514, + "auxiliary_loss_mlp": 0.01281437, + "balance_loss_clip": 0.06334631, + "balance_loss_mlp": 0.01254936, + "epoch": 0.1560799639260484, + "flos": 18375183371520.0, + "grad_norm": 9.37489887619581, + "language_loss": 0.87632233, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95540184, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.26538086, + "step": 2596, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.06606492, + "auxiliary_loss_mlp": 0.01287289, + "balance_loss_clip": 0.06323552, + "balance_loss_mlp": 0.01261015, + "epoch": 0.15614008717871636, + "flos": 22605149986560.0, + "grad_norm": 2.807545322610708, + "language_loss": 0.69688505, + "learning_rate": 3.835399415366404e-06, + "loss": 0.77582288, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.26306152, + "step": 2597, + "time_per_iteration": 2.5685815811157227 + }, + { + "auxiliary_loss_clip": 0.0662894, + "auxiliary_loss_mlp": 0.01280666, + "balance_loss_clip": 0.06348241, + "balance_loss_mlp": 0.01256455, + "epoch": 0.15620021043138435, + "flos": 22753379059200.0, + "grad_norm": 2.0232351113841514, + "language_loss": 0.80914307, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.88823915, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 2598, + "time_per_iteration": 2.554202079772949 + }, + { + "auxiliary_loss_clip": 0.0662708, + "auxiliary_loss_mlp": 0.01284312, + "balance_loss_clip": 0.06344105, + "balance_loss_mlp": 0.01257895, + "epoch": 0.15626033368405232, + "flos": 13119897870720.0, + "grad_norm": 2.0408523791990016, + "language_loss": 0.83276039, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.91187429, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26391602, + "step": 2599, + "time_per_iteration": 2.66353702545166 + }, + { + "auxiliary_loss_clip": 0.06639346, + "auxiliary_loss_mlp": 0.01292644, + "balance_loss_clip": 0.06344323, + "balance_loss_mlp": 0.0126469, + "epoch": 0.15632045693672028, + "flos": 16477931698560.0, + "grad_norm": 2.3045518919772046, + "language_loss": 0.82379115, + "learning_rate": 3.834934932294287e-06, + "loss": 0.9031111, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.27941895, + "step": 2600, + "time_per_iteration": 2.50607967376709 + }, + { + "auxiliary_loss_clip": 0.06646761, + "auxiliary_loss_mlp": 0.01287391, + "balance_loss_clip": 0.0635706, + "balance_loss_mlp": 0.01259305, + "epoch": 0.15638058018938825, + "flos": 20856672437760.0, + "grad_norm": 2.020166421544308, + "language_loss": 0.88839436, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96773589, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28076172, + "step": 2601, + "time_per_iteration": 2.574437141418457 + }, + { + "auxiliary_loss_clip": 0.06648471, + "auxiliary_loss_mlp": 0.01294906, + "balance_loss_clip": 0.06352241, + "balance_loss_mlp": 0.01267989, + "epoch": 0.1564407034420562, + "flos": 21878762941440.0, + "grad_norm": 2.51177361833528, + "language_loss": 0.79510248, + "learning_rate": 3.834624928998508e-06, + "loss": 0.87453628, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.26940918, + "step": 2602, + "time_per_iteration": 2.5957844257354736 + }, + { + "auxiliary_loss_clip": 0.06633168, + "auxiliary_loss_mlp": 0.01292264, + "balance_loss_clip": 0.06345348, + "balance_loss_mlp": 0.01265979, + "epoch": 0.15650082669472418, + "flos": 21840888096000.0, + "grad_norm": 1.9170738392352888, + "language_loss": 0.7431488, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82240313, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26293945, + "step": 2603, + "time_per_iteration": 2.5696704387664795 + }, + { + "auxiliary_loss_clip": 0.06625052, + "auxiliary_loss_mlp": 0.01287753, + "balance_loss_clip": 0.06337333, + "balance_loss_mlp": 0.01261217, + "epoch": 0.15656094994739214, + "flos": 13804343147520.0, + "grad_norm": 2.480258971716289, + "language_loss": 0.88529468, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9644227, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.26538086, + "step": 2604, + "time_per_iteration": 2.5110373497009277 + }, + { + "auxiliary_loss_clip": 0.06634312, + "auxiliary_loss_mlp": 0.01291425, + "balance_loss_clip": 0.06341597, + "balance_loss_mlp": 0.01266558, + "epoch": 0.15662107320006013, + "flos": 27315582312960.0, + "grad_norm": 2.192350516429204, + "language_loss": 0.85880566, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93806314, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.2487793, + "step": 2605, + "time_per_iteration": 2.6109507083892822 + }, + { + "auxiliary_loss_clip": 0.06649123, + "auxiliary_loss_mlp": 0.01294389, + "balance_loss_clip": 0.06348212, + "balance_loss_mlp": 0.01265802, + "epoch": 0.1566811964527281, + "flos": 26691876846720.0, + "grad_norm": 1.9127965853266395, + "language_loss": 0.73996091, + "learning_rate": 3.834004087624087e-06, + "loss": 0.81939602, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.28564453, + "step": 2606, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.06621392, + "auxiliary_loss_mlp": 0.01286091, + "balance_loss_clip": 0.06334884, + "balance_loss_mlp": 0.01260246, + "epoch": 0.15674131970539606, + "flos": 16108323338880.0, + "grad_norm": 2.273122789948623, + "language_loss": 0.77297181, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85204661, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25842285, + "step": 2607, + "time_per_iteration": 2.571983575820923 + }, + { + "auxiliary_loss_clip": 0.06614074, + "auxiliary_loss_mlp": 0.01286338, + "balance_loss_clip": 0.06327923, + "balance_loss_mlp": 0.01260791, + "epoch": 0.15680144295806403, + "flos": 19175349536640.0, + "grad_norm": 1.917731361959034, + "language_loss": 0.8328836, + "learning_rate": 3.833693249639615e-06, + "loss": 0.91188771, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.25598145, + "step": 2608, + "time_per_iteration": 2.5823540687561035 + }, + { + "auxiliary_loss_clip": 0.06622173, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 0.06326167, + "balance_loss_mlp": 0.01264901, + "epoch": 0.156861566210732, + "flos": 20819678060160.0, + "grad_norm": 2.1481617307418017, + "language_loss": 0.73101258, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81018502, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.30163574, + "step": 2609, + "time_per_iteration": 2.572356700897217 + }, + { + "auxiliary_loss_clip": 0.06605803, + "auxiliary_loss_mlp": 0.01286832, + "balance_loss_clip": 0.06311236, + "balance_loss_mlp": 0.01260928, + "epoch": 0.15692168946339996, + "flos": 20054158358400.0, + "grad_norm": 2.0130429141277446, + "language_loss": 0.73445058, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8133769, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.2590332, + "step": 2610, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.06606032, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06310159, + "balance_loss_mlp": 0.01258873, + "epoch": 0.15698181271606793, + "flos": 21404502432000.0, + "grad_norm": 1.9787082052238874, + "language_loss": 0.73279381, + "learning_rate": 3.833226471173919e-06, + "loss": 0.81174374, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.30065918, + "step": 2611, + "time_per_iteration": 2.582390308380127 + }, + { + "auxiliary_loss_clip": 0.06594902, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06304685, + "balance_loss_mlp": 0.01259172, + "epoch": 0.15704193596873592, + "flos": 20851347703680.0, + "grad_norm": 2.098501694873674, + "language_loss": 0.71879792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.79758954, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.25097656, + "step": 2612, + "time_per_iteration": 2.577627658843994 + }, + { + "auxiliary_loss_clip": 0.0659887, + "auxiliary_loss_mlp": 0.01283795, + "balance_loss_clip": 0.06308534, + "balance_loss_mlp": 0.0125832, + "epoch": 0.15710205922140388, + "flos": 21769456890240.0, + "grad_norm": 2.359608918603851, + "language_loss": 0.77193695, + "learning_rate": 3.83291493793963e-06, + "loss": 0.85076362, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.2545166, + "step": 2613, + "time_per_iteration": 2.5632479190826416 + }, + { + "auxiliary_loss_clip": 0.06608421, + "auxiliary_loss_mlp": 0.01292559, + "balance_loss_clip": 0.06315231, + "balance_loss_mlp": 0.01266106, + "epoch": 0.15716218247407185, + "flos": 25014453160320.0, + "grad_norm": 1.6622650675423762, + "language_loss": 0.66684031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.74585009, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.26428223, + "step": 2614, + "time_per_iteration": 2.684286117553711 + }, + { + "auxiliary_loss_clip": 0.0661184, + "auxiliary_loss_mlp": 0.01292567, + "balance_loss_clip": 0.06314493, + "balance_loss_mlp": 0.01264255, + "epoch": 0.1572223057267398, + "flos": 20197691602560.0, + "grad_norm": 3.2869095787841576, + "language_loss": 0.76402575, + "learning_rate": 3.832603126688072e-06, + "loss": 0.84306979, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.28308105, + "step": 2615, + "time_per_iteration": 2.551769971847534 + }, + { + "auxiliary_loss_clip": 0.06589202, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.06304425, + "balance_loss_mlp": 0.01260587, + "epoch": 0.15728242897940778, + "flos": 20965810780800.0, + "grad_norm": 1.7986527043954237, + "language_loss": 0.74040192, + "learning_rate": 3.832447116820594e-06, + "loss": 0.81915236, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.25256348, + "step": 2616, + "time_per_iteration": 2.5935630798339844 + }, + { + "auxiliary_loss_clip": 0.06601542, + "auxiliary_loss_mlp": 0.01283526, + "balance_loss_clip": 0.06305884, + "balance_loss_mlp": 0.01256966, + "epoch": 0.15734255223207574, + "flos": 23044764032640.0, + "grad_norm": 2.1005464521191426, + "language_loss": 0.73305666, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81190741, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.265625, + "step": 2617, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.06593003, + "auxiliary_loss_mlp": 0.01287239, + "balance_loss_clip": 0.06306564, + "balance_loss_mlp": 0.012605, + "epoch": 0.15740267548474374, + "flos": 20556357004800.0, + "grad_norm": 2.1735503953171813, + "language_loss": 0.75337285, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.83217525, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.26745605, + "step": 2618, + "time_per_iteration": 2.558271884918213 + }, + { + "auxiliary_loss_clip": 0.06606486, + "auxiliary_loss_mlp": 0.01288019, + "balance_loss_clip": 0.06305802, + "balance_loss_mlp": 0.01260052, + "epoch": 0.1574627987374117, + "flos": 22672262707200.0, + "grad_norm": 2.4653942739702277, + "language_loss": 0.79897004, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.87791508, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.2800293, + "step": 2619, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.06592336, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 0.06304029, + "balance_loss_mlp": 0.01263304, + "epoch": 0.15752292199007967, + "flos": 16806352976640.0, + "grad_norm": 1.8956550238632917, + "language_loss": 0.77960408, + "learning_rate": 3.831822382544101e-06, + "loss": 0.85842085, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.26013184, + "step": 2620, + "time_per_iteration": 2.556342363357544 + }, + { + "auxiliary_loss_clip": 0.06608844, + "auxiliary_loss_mlp": 0.01287118, + "balance_loss_clip": 0.06316274, + "balance_loss_mlp": 0.01259843, + "epoch": 0.15758304524274763, + "flos": 29833856121600.0, + "grad_norm": 1.8795614053933318, + "language_loss": 0.72243416, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80139381, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.27282715, + "step": 2621, + "time_per_iteration": 4.014448881149292 + }, + { + "auxiliary_loss_clip": 0.06605494, + "auxiliary_loss_mlp": 0.01287754, + "balance_loss_clip": 0.06309334, + "balance_loss_mlp": 0.01260813, + "epoch": 0.1576431684954156, + "flos": 53589116851200.0, + "grad_norm": 5.362699165833927, + "language_loss": 0.73428345, + "learning_rate": 3.831509598604828e-06, + "loss": 0.81321585, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.26940918, + "step": 2622, + "time_per_iteration": 2.9332852363586426 + }, + { + "auxiliary_loss_clip": 0.06587812, + "auxiliary_loss_mlp": 0.01287353, + "balance_loss_clip": 0.06302886, + "balance_loss_mlp": 0.01262284, + "epoch": 0.15770329174808356, + "flos": 20819887695360.0, + "grad_norm": 1.8034719431418926, + "language_loss": 0.88731241, + "learning_rate": 3.831353102455684e-06, + "loss": 0.96606404, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25085449, + "step": 2623, + "time_per_iteration": 3.993907928466797 + }, + { + "auxiliary_loss_clip": 0.06595732, + "auxiliary_loss_mlp": 0.01282154, + "balance_loss_clip": 0.0630911, + "balance_loss_mlp": 0.01255594, + "epoch": 0.15776341500075153, + "flos": 24981148362240.0, + "grad_norm": 2.539905380031208, + "language_loss": 0.82629728, + "learning_rate": 3.831196536861448e-06, + "loss": 0.90507615, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.265625, + "step": 2624, + "time_per_iteration": 2.5706846714019775 + }, + { + "auxiliary_loss_clip": 0.06606949, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 0.06309812, + "balance_loss_mlp": 0.01266093, + "epoch": 0.15782353825341952, + "flos": 21914331799680.0, + "grad_norm": 3.0693090763099815, + "language_loss": 0.81940538, + "learning_rate": 3.831039901828054e-06, + "loss": 0.89840019, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.26452637, + "step": 2625, + "time_per_iteration": 2.569840669631958 + }, + { + "auxiliary_loss_clip": 0.06593765, + "auxiliary_loss_mlp": 0.01293944, + "balance_loss_clip": 0.06303135, + "balance_loss_mlp": 0.01268064, + "epoch": 0.15788366150608749, + "flos": 26184395393280.0, + "grad_norm": 2.523517901800404, + "language_loss": 0.81776226, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89663935, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.25891113, + "step": 2626, + "time_per_iteration": 2.561379909515381 + }, + { + "auxiliary_loss_clip": 0.06594853, + "auxiliary_loss_mlp": 0.01294161, + "balance_loss_clip": 0.06304863, + "balance_loss_mlp": 0.01267434, + "epoch": 0.15794378475875545, + "flos": 27717321513600.0, + "grad_norm": 1.6929688421529916, + "language_loss": 0.7457962, + "learning_rate": 3.830726423467561e-06, + "loss": 0.82468635, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26708984, + "step": 2627, + "time_per_iteration": 2.596707344055176 + }, + { + "auxiliary_loss_clip": 0.06587663, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 0.06296949, + "balance_loss_mlp": 0.01267007, + "epoch": 0.15800390801142342, + "flos": 12135011379840.0, + "grad_norm": 2.3877400099999413, + "language_loss": 0.87097675, + "learning_rate": 3.830569580152348e-06, + "loss": 0.94979477, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.27172852, + "step": 2628, + "time_per_iteration": 5.372643709182739 + }, + { + "auxiliary_loss_clip": 0.06588875, + "auxiliary_loss_mlp": 0.01280598, + "balance_loss_clip": 0.06300817, + "balance_loss_mlp": 0.0125548, + "epoch": 0.15806403126409138, + "flos": 20711084768640.0, + "grad_norm": 2.1789511738163236, + "language_loss": 0.77439439, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85308909, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.25097656, + "step": 2629, + "time_per_iteration": 2.571425199508667 + }, + { + "auxiliary_loss_clip": 0.06593206, + "auxiliary_loss_mlp": 0.0128531, + "balance_loss_clip": 0.06298864, + "balance_loss_mlp": 0.01257117, + "epoch": 0.15812415451675935, + "flos": 17827479158400.0, + "grad_norm": 2.6284348264521853, + "language_loss": 0.74838495, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.82717013, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.28186035, + "step": 2630, + "time_per_iteration": 2.538496971130371 + }, + { + "auxiliary_loss_clip": 0.06592915, + "auxiliary_loss_mlp": 0.01286291, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.0125904, + "epoch": 0.15818427776942734, + "flos": 20090230341120.0, + "grad_norm": 3.888480122572148, + "language_loss": 0.84692156, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.9257136, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.27270508, + "step": 2631, + "time_per_iteration": 2.6821517944335938 + }, + { + "auxiliary_loss_clip": 0.06584532, + "auxiliary_loss_mlp": 0.01280599, + "balance_loss_clip": 0.06294788, + "balance_loss_mlp": 0.01253563, + "epoch": 0.1582444010220953, + "flos": 21221249552640.0, + "grad_norm": 8.851391146614638, + "language_loss": 0.79768324, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.87633461, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27050781, + "step": 2632, + "time_per_iteration": 2.5977976322174072 + }, + { + "auxiliary_loss_clip": 0.06588165, + "auxiliary_loss_mlp": 0.01281414, + "balance_loss_clip": 0.06294183, + "balance_loss_mlp": 0.01255414, + "epoch": 0.15830452427476327, + "flos": 17864138119680.0, + "grad_norm": 1.985726901466477, + "language_loss": 0.83594966, + "learning_rate": 3.829784322464594e-06, + "loss": 0.91464543, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.2598877, + "step": 2633, + "time_per_iteration": 2.569474220275879 + }, + { + "auxiliary_loss_clip": 0.0658908, + "auxiliary_loss_mlp": 0.0128242, + "balance_loss_clip": 0.0629508, + "balance_loss_mlp": 0.01256265, + "epoch": 0.15836464752743123, + "flos": 24541827805440.0, + "grad_norm": 1.6688248008006443, + "language_loss": 0.78379452, + "learning_rate": 3.829627062746394e-06, + "loss": 0.86250955, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.26196289, + "step": 2634, + "time_per_iteration": 2.5919923782348633 + }, + { + "auxiliary_loss_clip": 0.06593279, + "auxiliary_loss_mlp": 0.01291316, + "balance_loss_clip": 0.06295943, + "balance_loss_mlp": 0.01263337, + "epoch": 0.1584247707800992, + "flos": 20127057010560.0, + "grad_norm": 2.0830753641117306, + "language_loss": 0.89997375, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97881973, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27966309, + "step": 2635, + "time_per_iteration": 2.5786406993865967 + }, + { + "auxiliary_loss_clip": 0.06588058, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 0.06292774, + "balance_loss_mlp": 0.01260218, + "epoch": 0.15848489403276717, + "flos": 20382202293120.0, + "grad_norm": 2.014850044069841, + "language_loss": 0.7709136, + "learning_rate": 3.829312335177034e-06, + "loss": 0.8496753, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.27868652, + "step": 2636, + "time_per_iteration": 2.6201331615448 + }, + { + "auxiliary_loss_clip": 0.06586573, + "auxiliary_loss_mlp": 0.0128751, + "balance_loss_clip": 0.06290652, + "balance_loss_mlp": 0.0126101, + "epoch": 0.15854501728543513, + "flos": 39356018760960.0, + "grad_norm": 2.044553358008507, + "language_loss": 0.73238122, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81112206, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.26489258, + "step": 2637, + "time_per_iteration": 2.742854595184326 + }, + { + "auxiliary_loss_clip": 0.06583421, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.01262468, + "epoch": 0.15860514053810312, + "flos": 24871297259520.0, + "grad_norm": 1.8074381255816763, + "language_loss": 0.79285657, + "learning_rate": 3.82899733013685e-06, + "loss": 0.87158012, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.26489258, + "step": 2638, + "time_per_iteration": 2.5642874240875244 + }, + { + "auxiliary_loss_clip": 0.06588158, + "auxiliary_loss_mlp": 0.01287351, + "balance_loss_clip": 0.06294204, + "balance_loss_mlp": 0.01258908, + "epoch": 0.1586652637907711, + "flos": 26184982371840.0, + "grad_norm": 2.3471549301232844, + "language_loss": 0.76132977, + "learning_rate": 3.828839723580128e-06, + "loss": 0.84008479, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.28491211, + "step": 2639, + "time_per_iteration": 2.615779399871826 + }, + { + "auxiliary_loss_clip": 0.06586854, + "auxiliary_loss_mlp": 0.01295396, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01267299, + "epoch": 0.15872538704343905, + "flos": 19798174535040.0, + "grad_norm": 1.8583301329388602, + "language_loss": 0.82681525, + "learning_rate": 3.82868204767362e-06, + "loss": 0.90563774, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.28076172, + "step": 2640, + "time_per_iteration": 2.5406789779663086 + }, + { + "auxiliary_loss_clip": 0.06583565, + "auxiliary_loss_mlp": 0.0129063, + "balance_loss_clip": 0.06294291, + "balance_loss_mlp": 0.01262342, + "epoch": 0.15878551029610702, + "flos": 28482883142400.0, + "grad_norm": 1.847395702831907, + "language_loss": 0.67676318, + "learning_rate": 3.828524302423306e-06, + "loss": 0.75550508, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.28308105, + "step": 2641, + "time_per_iteration": 2.6107757091522217 + }, + { + "auxiliary_loss_clip": 0.06593709, + "auxiliary_loss_mlp": 0.01287834, + "balance_loss_clip": 0.06291051, + "balance_loss_mlp": 0.01259199, + "epoch": 0.15884563354877498, + "flos": 24213532308480.0, + "grad_norm": 2.4455482341546366, + "language_loss": 0.77487421, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85368967, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.28674316, + "step": 2642, + "time_per_iteration": 2.549790382385254 + }, + { + "auxiliary_loss_clip": 0.06588584, + "auxiliary_loss_mlp": 0.01290508, + "balance_loss_clip": 0.06297128, + "balance_loss_mlp": 0.0126303, + "epoch": 0.15890575680144295, + "flos": 23956332600960.0, + "grad_norm": 2.206510162678276, + "language_loss": 0.71574652, + "learning_rate": 3.828208603915186e-06, + "loss": 0.79453743, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.27478027, + "step": 2643, + "time_per_iteration": 2.5622386932373047 + }, + { + "auxiliary_loss_clip": 0.06581764, + "auxiliary_loss_mlp": 0.01292278, + "balance_loss_clip": 0.06295977, + "balance_loss_mlp": 0.01265432, + "epoch": 0.15896588005411091, + "flos": 21221375333760.0, + "grad_norm": 1.9554363630175624, + "language_loss": 0.78877175, + "learning_rate": 3.828050650669353e-06, + "loss": 0.86751211, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.26867676, + "step": 2644, + "time_per_iteration": 2.519049644470215 + }, + { + "auxiliary_loss_clip": 0.06584983, + "auxiliary_loss_mlp": 0.01285638, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257588, + "epoch": 0.1590260033067789, + "flos": 24359203831680.0, + "grad_norm": 1.8306681743440225, + "language_loss": 0.83401352, + "learning_rate": 3.827892628103657e-06, + "loss": 0.91271967, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28039551, + "step": 2645, + "time_per_iteration": 2.5938899517059326 + }, + { + "auxiliary_loss_clip": 0.06594808, + "auxiliary_loss_mlp": 0.01293395, + "balance_loss_clip": 0.063001, + "balance_loss_mlp": 0.01263914, + "epoch": 0.15908612655944687, + "flos": 32056719960960.0, + "grad_norm": 2.510422612834076, + "language_loss": 0.70788723, + "learning_rate": 3.827734536224087e-06, + "loss": 0.78676921, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.2947998, + "step": 2646, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.06588359, + "auxiliary_loss_mlp": 0.01289443, + "balance_loss_clip": 0.06303679, + "balance_loss_mlp": 0.01262728, + "epoch": 0.15914624981211484, + "flos": 17791155613440.0, + "grad_norm": 1.930709185953096, + "language_loss": 0.63532102, + "learning_rate": 3.827576375036642e-06, + "loss": 0.71409905, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.26696777, + "step": 2647, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.06584711, + "auxiliary_loss_mlp": 0.01288467, + "balance_loss_clip": 0.06297973, + "balance_loss_mlp": 0.0126174, + "epoch": 0.1592063730647828, + "flos": 17718298888320.0, + "grad_norm": 2.1247786745604818, + "language_loss": 0.90530396, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98403573, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.26757812, + "step": 2648, + "time_per_iteration": 2.5112242698669434 + }, + { + "auxiliary_loss_clip": 0.06582057, + "auxiliary_loss_mlp": 0.01285915, + "balance_loss_clip": 0.06301906, + "balance_loss_mlp": 0.01259915, + "epoch": 0.15926649631745077, + "flos": 18808927632000.0, + "grad_norm": 2.0063837423825044, + "language_loss": 0.92929685, + "learning_rate": 3.827259844762114e-06, + "loss": 1.00797653, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26013184, + "step": 2649, + "time_per_iteration": 2.5400166511535645 + }, + { + "auxiliary_loss_clip": 0.06614827, + "auxiliary_loss_mlp": 0.01289461, + "balance_loss_clip": 0.0630791, + "balance_loss_mlp": 0.01258156, + "epoch": 0.15932661957011873, + "flos": 17571956532480.0, + "grad_norm": 3.5338623134858924, + "language_loss": 0.73033249, + "learning_rate": 3.827101475687033e-06, + "loss": 0.80937541, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.31311035, + "step": 2650, + "time_per_iteration": 2.499260187149048 + }, + { + "auxiliary_loss_clip": 0.06585062, + "auxiliary_loss_mlp": 0.01286624, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01259837, + "epoch": 0.15938674282278673, + "flos": 13339432368000.0, + "grad_norm": 2.105429239138805, + "language_loss": 0.72751939, + "learning_rate": 3.826943037328082e-06, + "loss": 0.80623615, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.2677002, + "step": 2651, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.06597096, + "auxiliary_loss_mlp": 0.01284795, + "balance_loss_clip": 0.06307643, + "balance_loss_mlp": 0.01257925, + "epoch": 0.1594468660754547, + "flos": 22494879613440.0, + "grad_norm": 1.8417049105495777, + "language_loss": 0.80598879, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.88480765, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.26855469, + "step": 2652, + "time_per_iteration": 2.562206745147705 + }, + { + "auxiliary_loss_clip": 0.06582868, + "auxiliary_loss_mlp": 0.01288009, + "balance_loss_clip": 0.06299073, + "balance_loss_mlp": 0.01260745, + "epoch": 0.15950698932812266, + "flos": 15011782882560.0, + "grad_norm": 3.0665030726784233, + "language_loss": 0.71219099, + "learning_rate": 3.826625952782601e-06, + "loss": 0.79089975, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27258301, + "step": 2653, + "time_per_iteration": 2.5217130184173584 + }, + { + "auxiliary_loss_clip": 0.06588405, + "auxiliary_loss_mlp": 0.01286539, + "balance_loss_clip": 0.06299819, + "balance_loss_mlp": 0.01261064, + "epoch": 0.15956711258079062, + "flos": 30163074013440.0, + "grad_norm": 3.2964270915620655, + "language_loss": 0.78400207, + "learning_rate": 3.826467306608095e-06, + "loss": 0.86275154, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25488281, + "step": 2654, + "time_per_iteration": 2.68938946723938 + }, + { + "auxiliary_loss_clip": 0.06585521, + "auxiliary_loss_mlp": 0.01284621, + "balance_loss_clip": 0.06301536, + "balance_loss_mlp": 0.01259265, + "epoch": 0.1596272358334586, + "flos": 21039044849280.0, + "grad_norm": 1.8634603693624054, + "language_loss": 0.82786137, + "learning_rate": 3.826308591173765e-06, + "loss": 0.90656281, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25341797, + "step": 2655, + "time_per_iteration": 2.5611259937286377 + }, + { + "auxiliary_loss_clip": 0.06585874, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06296754, + "balance_loss_mlp": 0.01259937, + "epoch": 0.15968735908612655, + "flos": 15273426856320.0, + "grad_norm": 1.9406686852412747, + "language_loss": 0.74707991, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82579041, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.25244141, + "step": 2656, + "time_per_iteration": 2.510824680328369 + }, + { + "auxiliary_loss_clip": 0.06577112, + "auxiliary_loss_mlp": 0.0129381, + "balance_loss_clip": 0.06299932, + "balance_loss_mlp": 0.01268705, + "epoch": 0.15974748233879452, + "flos": 52677338647680.0, + "grad_norm": 1.8958398061879393, + "language_loss": 0.78470719, + "learning_rate": 3.825990952549713e-06, + "loss": 0.86341643, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25109863, + "step": 2657, + "time_per_iteration": 2.8164706230163574 + }, + { + "auxiliary_loss_clip": 0.06582649, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 0.062974, + "balance_loss_mlp": 0.01260514, + "epoch": 0.1598076055914625, + "flos": 18739047726720.0, + "grad_norm": 1.7078792593137306, + "language_loss": 0.75124943, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82994181, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.26098633, + "step": 2658, + "time_per_iteration": 2.539357900619507 + }, + { + "auxiliary_loss_clip": 0.06584077, + "auxiliary_loss_mlp": 0.01290613, + "balance_loss_clip": 0.06297718, + "balance_loss_mlp": 0.0126354, + "epoch": 0.15986772884413047, + "flos": 34357681405440.0, + "grad_norm": 1.7106510421340806, + "language_loss": 0.76173538, + "learning_rate": 3.825673036958624e-06, + "loss": 0.84048235, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27087402, + "step": 2659, + "time_per_iteration": 2.7063279151916504 + }, + { + "auxiliary_loss_clip": 0.06590043, + "auxiliary_loss_mlp": 0.01292057, + "balance_loss_clip": 0.06300306, + "balance_loss_mlp": 0.01265164, + "epoch": 0.15992785209679844, + "flos": 22061596550400.0, + "grad_norm": 2.109703300615196, + "language_loss": 0.91436422, + "learning_rate": 3.825513975315508e-06, + "loss": 0.99318516, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.26855469, + "step": 2660, + "time_per_iteration": 3.960657835006714 + }, + { + "auxiliary_loss_clip": 0.06587565, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 0.06297715, + "balance_loss_mlp": 0.01257018, + "epoch": 0.1599879753494664, + "flos": 33073946928000.0, + "grad_norm": 2.772952590222661, + "language_loss": 0.79090029, + "learning_rate": 3.82535484444872e-06, + "loss": 0.86961293, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.26647949, + "step": 2661, + "time_per_iteration": 2.64117693901062 + }, + { + "auxiliary_loss_clip": 0.0657732, + "auxiliary_loss_mlp": 0.01287922, + "balance_loss_clip": 0.06293119, + "balance_loss_mlp": 0.01262495, + "epoch": 0.16004809860213437, + "flos": 28045533156480.0, + "grad_norm": 1.8363743510340895, + "language_loss": 0.74837106, + "learning_rate": 3.825195644364292e-06, + "loss": 0.82702351, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.25390625, + "step": 2662, + "time_per_iteration": 4.100783586502075 + }, + { + "auxiliary_loss_clip": 0.06590086, + "auxiliary_loss_mlp": 0.01285907, + "balance_loss_clip": 0.06299042, + "balance_loss_mlp": 0.01259967, + "epoch": 0.16010822185480234, + "flos": 22786096878720.0, + "grad_norm": 1.8771670502098623, + "language_loss": 0.82632995, + "learning_rate": 3.825036375068263e-06, + "loss": 0.90508991, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.25964355, + "step": 2663, + "time_per_iteration": 2.5558366775512695 + }, + { + "auxiliary_loss_clip": 0.06586467, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 0.06297847, + "balance_loss_mlp": 0.01260011, + "epoch": 0.16016834510747033, + "flos": 20090188414080.0, + "grad_norm": 3.3923647685745344, + "language_loss": 0.81316251, + "learning_rate": 3.824877036566672e-06, + "loss": 0.89187992, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.25268555, + "step": 2664, + "time_per_iteration": 2.5118319988250732 + }, + { + "auxiliary_loss_clip": 0.06584498, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 0.06298545, + "balance_loss_mlp": 0.01259038, + "epoch": 0.1602284683601383, + "flos": 21179391638400.0, + "grad_norm": 1.6927431664351194, + "language_loss": 0.94832575, + "learning_rate": 3.824717628865561e-06, + "loss": 1.02702665, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.26550293, + "step": 2665, + "time_per_iteration": 2.54654860496521 + }, + { + "auxiliary_loss_clip": 0.06588221, + "auxiliary_loss_mlp": 0.0128992, + "balance_loss_clip": 0.06298642, + "balance_loss_mlp": 0.01263051, + "epoch": 0.16028859161280626, + "flos": 14652823991040.0, + "grad_norm": 2.069431022104881, + "language_loss": 0.85796285, + "learning_rate": 3.824558151970974e-06, + "loss": 0.93674427, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.26879883, + "step": 2666, + "time_per_iteration": 2.483457088470459 + }, + { + "auxiliary_loss_clip": 0.06582008, + "auxiliary_loss_mlp": 0.01292714, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01268645, + "epoch": 0.16034871486547422, + "flos": 20995677561600.0, + "grad_norm": 1.9110296287370478, + "language_loss": 0.82042331, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89917052, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.24072266, + "step": 2667, + "time_per_iteration": 3.9772729873657227 + }, + { + "auxiliary_loss_clip": 0.06585021, + "auxiliary_loss_mlp": 0.01299108, + "balance_loss_clip": 0.06302348, + "balance_loss_mlp": 0.01272608, + "epoch": 0.1604088381181422, + "flos": 21404167015680.0, + "grad_norm": 2.2548046072843664, + "language_loss": 0.74520987, + "learning_rate": 3.824238990625567e-06, + "loss": 0.82405114, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26513672, + "step": 2668, + "time_per_iteration": 2.5379245281219482 + }, + { + "auxiliary_loss_clip": 0.06581191, + "auxiliary_loss_mlp": 0.01286404, + "balance_loss_clip": 0.06295477, + "balance_loss_mlp": 0.01259296, + "epoch": 0.16046896137081015, + "flos": 23883601656960.0, + "grad_norm": 1.6904761581724046, + "language_loss": 0.78225315, + "learning_rate": 3.824079306186848e-06, + "loss": 0.86092913, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.27124023, + "step": 2669, + "time_per_iteration": 2.5322623252868652 + }, + { + "auxiliary_loss_clip": 0.06461855, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.01253518, + "epoch": 0.16052908462347812, + "flos": 59823907453440.0, + "grad_norm": 0.8025105121256505, + "language_loss": 0.55497211, + "learning_rate": 3.823919552578861e-06, + "loss": 0.63221133, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08551025, + "step": 2670, + "time_per_iteration": 3.0635480880737305 + }, + { + "auxiliary_loss_clip": 0.06584324, + "auxiliary_loss_mlp": 0.01300694, + "balance_loss_clip": 0.06294604, + "balance_loss_mlp": 0.01273097, + "epoch": 0.1605892078761461, + "flos": 18302494354560.0, + "grad_norm": 1.9278903563018932, + "language_loss": 0.79113603, + "learning_rate": 3.82375972980766e-06, + "loss": 0.86998624, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.27587891, + "step": 2671, + "time_per_iteration": 2.5478527545928955 + }, + { + "auxiliary_loss_clip": 0.06586512, + "auxiliary_loss_mlp": 0.01285282, + "balance_loss_clip": 0.06298812, + "balance_loss_mlp": 0.01259914, + "epoch": 0.16064933112881408, + "flos": 32168918977920.0, + "grad_norm": 2.1901870356390964, + "language_loss": 0.65440154, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.73311949, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.25378418, + "step": 2672, + "time_per_iteration": 2.659353494644165 + }, + { + "auxiliary_loss_clip": 0.06589735, + "auxiliary_loss_mlp": 0.01293218, + "balance_loss_clip": 0.06296135, + "balance_loss_mlp": 0.01263916, + "epoch": 0.16070945438148204, + "flos": 19834959277440.0, + "grad_norm": 2.1290275432047037, + "language_loss": 0.86193001, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.94075954, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.29296875, + "step": 2673, + "time_per_iteration": 2.5288193225860596 + }, + { + "auxiliary_loss_clip": 0.06583102, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.0126572, + "epoch": 0.16076957763415, + "flos": 18918569099520.0, + "grad_norm": 2.3065631305512473, + "language_loss": 0.73982865, + "learning_rate": 3.823279846575403e-06, + "loss": 0.81856978, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.25305176, + "step": 2674, + "time_per_iteration": 2.524121046066284 + }, + { + "auxiliary_loss_clip": 0.06576435, + "auxiliary_loss_mlp": 0.0128192, + "balance_loss_clip": 0.06293078, + "balance_loss_mlp": 0.01255086, + "epoch": 0.16082970088681797, + "flos": 16770071358720.0, + "grad_norm": 3.691225614104051, + "language_loss": 0.85411537, + "learning_rate": 3.823119747211986e-06, + "loss": 0.93269891, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26806641, + "step": 2675, + "time_per_iteration": 2.4984703063964844 + }, + { + "auxiliary_loss_clip": 0.06581541, + "auxiliary_loss_mlp": 0.01285801, + "balance_loss_clip": 0.06293826, + "balance_loss_mlp": 0.01259468, + "epoch": 0.16088982413948594, + "flos": 35158560330240.0, + "grad_norm": 1.8394721735800996, + "language_loss": 0.83251232, + "learning_rate": 3.822959578715685e-06, + "loss": 0.91118574, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.26306152, + "step": 2676, + "time_per_iteration": 2.6714260578155518 + }, + { + "auxiliary_loss_clip": 0.06567734, + "auxiliary_loss_mlp": 0.01280714, + "balance_loss_clip": 0.06290022, + "balance_loss_mlp": 0.01257456, + "epoch": 0.1609499473921539, + "flos": 18631125267840.0, + "grad_norm": 4.8459600996760805, + "language_loss": 0.74951547, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82799989, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.23266602, + "step": 2677, + "time_per_iteration": 2.5061256885528564 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01283067, + "balance_loss_clip": 0.06292509, + "balance_loss_mlp": 0.01258164, + "epoch": 0.1610100706448219, + "flos": 33154057031040.0, + "grad_norm": 1.8038433202406936, + "language_loss": 0.77285242, + "learning_rate": 3.822639034348728e-06, + "loss": 0.85145557, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.24926758, + "step": 2678, + "time_per_iteration": 2.6886472702026367 + }, + { + "auxiliary_loss_clip": 0.06581186, + "auxiliary_loss_mlp": 0.01287879, + "balance_loss_clip": 0.06295253, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16107019389748986, + "flos": 34685054507520.0, + "grad_norm": 1.8476006870379242, + "language_loss": 0.71465111, + "learning_rate": 3.822478658490228e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.26611328, + "step": 2679, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.06453654, + "auxiliary_loss_mlp": 0.01258662, + "balance_loss_clip": 0.06285442, + "balance_loss_mlp": 0.01250973, + "epoch": 0.16113031715015783, + "flos": 65730920411520.0, + "grad_norm": 0.7655469055577169, + "language_loss": 0.51874888, + "learning_rate": 3.822318213523154e-06, + "loss": 0.59587204, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.07678223, + "step": 2680, + "time_per_iteration": 3.3470637798309326 + }, + { + "auxiliary_loss_clip": 0.06584955, + "auxiliary_loss_mlp": 0.01288163, + "balance_loss_clip": 0.06295321, + "balance_loss_mlp": 0.01259363, + "epoch": 0.1611904404028258, + "flos": 20816156188800.0, + "grad_norm": 2.2126972690115476, + "language_loss": 0.81079412, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.88952529, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.28808594, + "step": 2681, + "time_per_iteration": 2.5526723861694336 + }, + { + "auxiliary_loss_clip": 0.06577247, + "auxiliary_loss_mlp": 0.01287934, + "balance_loss_clip": 0.06295492, + "balance_loss_mlp": 0.01262029, + "epoch": 0.16125056365549376, + "flos": 27020172343680.0, + "grad_norm": 2.1176985882953647, + "language_loss": 0.70093226, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77958405, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25891113, + "step": 2682, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.0657934, + "auxiliary_loss_mlp": 0.01288185, + "balance_loss_clip": 0.06295457, + "balance_loss_mlp": 0.01261708, + "epoch": 0.16131068690816172, + "flos": 19281762622080.0, + "grad_norm": 2.105414566897303, + "language_loss": 0.88063419, + "learning_rate": 3.821836464031348e-06, + "loss": 0.9593094, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26464844, + "step": 2683, + "time_per_iteration": 2.528503656387329 + }, + { + "auxiliary_loss_clip": 0.06581098, + "auxiliary_loss_mlp": 0.01286491, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01260718, + "epoch": 0.16137081016082971, + "flos": 35347137943680.0, + "grad_norm": 2.6304159370219447, + "language_loss": 0.75242329, + "learning_rate": 3.821675742690849e-06, + "loss": 0.83109927, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.25744629, + "step": 2684, + "time_per_iteration": 2.6683855056762695 + }, + { + "auxiliary_loss_clip": 0.06584509, + "auxiliary_loss_mlp": 0.01281022, + "balance_loss_clip": 0.0629454, + "balance_loss_mlp": 0.01253831, + "epoch": 0.16143093341349768, + "flos": 34242924839040.0, + "grad_norm": 3.4255618739056395, + "language_loss": 0.70703149, + "learning_rate": 3.821514952272223e-06, + "loss": 0.78568679, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27185059, + "step": 2685, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.06573574, + "auxiliary_loss_mlp": 0.01295712, + "balance_loss_clip": 0.06295055, + "balance_loss_mlp": 0.01269724, + "epoch": 0.16149105666616564, + "flos": 28006400499840.0, + "grad_norm": 2.7207808014988495, + "language_loss": 0.72642833, + "learning_rate": 3.821354092781567e-06, + "loss": 0.80512118, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.26000977, + "step": 2686, + "time_per_iteration": 2.5685417652130127 + }, + { + "auxiliary_loss_clip": 0.06583634, + "auxiliary_loss_mlp": 0.01298345, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01269628, + "epoch": 0.1615511799188336, + "flos": 19427434145280.0, + "grad_norm": 2.058545535595822, + "language_loss": 0.82461345, + "learning_rate": 3.821193164224981e-06, + "loss": 0.90343326, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.2869873, + "step": 2687, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.06594162, + "auxiliary_loss_mlp": 0.01299687, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01269109, + "epoch": 0.16161130317150157, + "flos": 22861217664000.0, + "grad_norm": 2.6401237934402575, + "language_loss": 0.72416258, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80310106, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.30578613, + "step": 2688, + "time_per_iteration": 2.5157902240753174 + }, + { + "auxiliary_loss_clip": 0.06589709, + "auxiliary_loss_mlp": 0.01309231, + "balance_loss_clip": 0.06303161, + "balance_loss_mlp": 0.0128161, + "epoch": 0.16167142642416954, + "flos": 26118833973120.0, + "grad_norm": 1.7781492277957918, + "language_loss": 0.76426512, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.84325451, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.27636719, + "step": 2689, + "time_per_iteration": 2.61681866645813 + }, + { + "auxiliary_loss_clip": 0.06586435, + "auxiliary_loss_mlp": 0.01313647, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01286182, + "epoch": 0.1617315496768375, + "flos": 22785551827200.0, + "grad_norm": 2.168912849024457, + "language_loss": 0.883026, + "learning_rate": 3.820709964220683e-06, + "loss": 0.96202683, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27478027, + "step": 2690, + "time_per_iteration": 2.542171001434326 + }, + { + "auxiliary_loss_clip": 0.06581193, + "auxiliary_loss_mlp": 0.01303059, + "balance_loss_clip": 0.06297438, + "balance_loss_mlp": 0.01277, + "epoch": 0.1617916729295055, + "flos": 22023721704960.0, + "grad_norm": 1.681429316785462, + "language_loss": 0.88894439, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.96778685, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.26049805, + "step": 2691, + "time_per_iteration": 2.5444743633270264 + }, + { + "auxiliary_loss_clip": 0.06592601, + "auxiliary_loss_mlp": 0.01300554, + "balance_loss_clip": 0.06297764, + "balance_loss_mlp": 0.01270108, + "epoch": 0.16185179618217346, + "flos": 23444574589440.0, + "grad_norm": 5.894128293889176, + "language_loss": 0.8353231, + "learning_rate": 3.820387485666784e-06, + "loss": 0.91425461, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30456543, + "step": 2692, + "time_per_iteration": 2.5367183685302734 + }, + { + "auxiliary_loss_clip": 0.06601407, + "auxiliary_loss_mlp": 0.01299753, + "balance_loss_clip": 0.06306131, + "balance_loss_mlp": 0.01270404, + "epoch": 0.16191191943484143, + "flos": 25673182433280.0, + "grad_norm": 2.87727514771051, + "language_loss": 0.82700074, + "learning_rate": 3.820226142842862e-06, + "loss": 0.9060123, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.29333496, + "step": 2693, + "time_per_iteration": 2.6187057495117188 + }, + { + "auxiliary_loss_clip": 0.06582904, + "auxiliary_loss_mlp": 0.01312533, + "balance_loss_clip": 0.06302174, + "balance_loss_mlp": 0.01286724, + "epoch": 0.1619720426875094, + "flos": 23484126516480.0, + "grad_norm": 1.4528149346161843, + "language_loss": 0.85022998, + "learning_rate": 3.820064730995783e-06, + "loss": 0.92918432, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.25793457, + "step": 2694, + "time_per_iteration": 2.5672922134399414 + }, + { + "auxiliary_loss_clip": 0.06594259, + "auxiliary_loss_mlp": 0.01304563, + "balance_loss_clip": 0.0630251, + "balance_loss_mlp": 0.0127612, + "epoch": 0.16203216594017736, + "flos": 24140465948160.0, + "grad_norm": 2.1096932177369654, + "language_loss": 0.70739377, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78638196, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.28442383, + "step": 2695, + "time_per_iteration": 2.5555880069732666 + }, + { + "auxiliary_loss_clip": 0.0659132, + "auxiliary_loss_mlp": 0.01297552, + "balance_loss_clip": 0.0630125, + "balance_loss_mlp": 0.01269943, + "epoch": 0.16209228919284532, + "flos": 22346566686720.0, + "grad_norm": 2.7194545314545153, + "language_loss": 0.83673584, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91562462, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27600098, + "step": 2696, + "time_per_iteration": 2.520920753479004 + }, + { + "auxiliary_loss_clip": 0.06605247, + "auxiliary_loss_mlp": 0.01295053, + "balance_loss_clip": 0.06302903, + "balance_loss_mlp": 0.01263773, + "epoch": 0.1621524124455133, + "flos": 15820586017920.0, + "grad_norm": 2.3129442406301766, + "language_loss": 0.89183378, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.97083676, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.31274414, + "step": 2697, + "time_per_iteration": 2.5259652137756348 + }, + { + "auxiliary_loss_clip": 0.0658388, + "auxiliary_loss_mlp": 0.01292599, + "balance_loss_clip": 0.06303512, + "balance_loss_mlp": 0.01267004, + "epoch": 0.16221253569818128, + "flos": 30193905116160.0, + "grad_norm": 1.495271767432462, + "language_loss": 0.81588805, + "learning_rate": 3.819418393498343e-06, + "loss": 0.89465284, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.25598145, + "step": 2698, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.06588376, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06309167, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16227265895084925, + "flos": 24612546251520.0, + "grad_norm": 1.6873939512975982, + "language_loss": 0.78418016, + "learning_rate": 3.819256636627339e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.26965332, + "step": 2699, + "time_per_iteration": 2.5874006748199463 + }, + { + "auxiliary_loss_clip": 0.06599343, + "auxiliary_loss_mlp": 0.01283682, + "balance_loss_clip": 0.06313124, + "balance_loss_mlp": 0.0125754, + "epoch": 0.1623327822035172, + "flos": 19579436651520.0, + "grad_norm": 5.305505294911747, + "language_loss": 0.86966538, + "learning_rate": 3.81909481076994e-06, + "loss": 0.94849563, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.2611084, + "step": 2700, + "time_per_iteration": 4.029258966445923 + }, + { + "auxiliary_loss_clip": 0.06593184, + "auxiliary_loss_mlp": 0.01283437, + "balance_loss_clip": 0.06310724, + "balance_loss_mlp": 0.01256042, + "epoch": 0.16239290545618518, + "flos": 26475612658560.0, + "grad_norm": 1.7724025685719413, + "language_loss": 0.80958557, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8883518, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.27404785, + "step": 2701, + "time_per_iteration": 2.5998921394348145 + }, + { + "auxiliary_loss_clip": 0.06590648, + "auxiliary_loss_mlp": 0.01284929, + "balance_loss_clip": 0.06304645, + "balance_loss_mlp": 0.01256271, + "epoch": 0.16245302870885314, + "flos": 15857454614400.0, + "grad_norm": 1.7204107394325303, + "language_loss": 0.74345064, + "learning_rate": 3.818770952120511e-06, + "loss": 0.8222065, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.28649902, + "step": 2702, + "time_per_iteration": 3.937354803085327 + }, + { + "auxiliary_loss_clip": 0.06603839, + "auxiliary_loss_mlp": 0.0128822, + "balance_loss_clip": 0.06313589, + "balance_loss_mlp": 0.01259252, + "epoch": 0.1625131519615211, + "flos": 14761710771840.0, + "grad_norm": 9.119129404803312, + "language_loss": 0.7369948, + "learning_rate": 3.81860891934076e-06, + "loss": 0.81591535, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28955078, + "step": 2703, + "time_per_iteration": 2.5070807933807373 + }, + { + "auxiliary_loss_clip": 0.066023, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.0631163, + "balance_loss_mlp": 0.01255033, + "epoch": 0.1625732752141891, + "flos": 28228073276160.0, + "grad_norm": 2.112253840465368, + "language_loss": 0.70914233, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78799713, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.28112793, + "step": 2704, + "time_per_iteration": 2.6071994304656982 + }, + { + "auxiliary_loss_clip": 0.06486984, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06323022, + "balance_loss_mlp": 0.01264725, + "epoch": 0.16263339846685707, + "flos": 67347268871040.0, + "grad_norm": 0.7781332743607355, + "language_loss": 0.53379726, + "learning_rate": 3.818284646901907e-06, + "loss": 0.61138183, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.06756592, + "step": 2705, + "time_per_iteration": 3.1592283248901367 + }, + { + "auxiliary_loss_clip": 0.06599878, + "auxiliary_loss_mlp": 0.01288619, + "balance_loss_clip": 0.06308411, + "balance_loss_mlp": 0.01259854, + "epoch": 0.16269352171952503, + "flos": 14324360785920.0, + "grad_norm": 2.6444300047772575, + "language_loss": 0.76420808, + "learning_rate": 3.818122407255102e-06, + "loss": 0.84309304, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.2878418, + "step": 2706, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06595413, + "auxiliary_loss_mlp": 0.01288657, + "balance_loss_clip": 0.06307741, + "balance_loss_mlp": 0.01263015, + "epoch": 0.162753644972193, + "flos": 28367916940800.0, + "grad_norm": 2.0996317585826727, + "language_loss": 0.73324966, + "learning_rate": 3.817960098664914e-06, + "loss": 0.8120904, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.25646973, + "step": 2707, + "time_per_iteration": 5.361986875534058 + }, + { + "auxiliary_loss_clip": 0.06597963, + "auxiliary_loss_mlp": 0.01297936, + "balance_loss_clip": 0.06310263, + "balance_loss_mlp": 0.01270721, + "epoch": 0.16281376822486096, + "flos": 19943971839360.0, + "grad_norm": 3.72169556400114, + "language_loss": 0.83658004, + "learning_rate": 3.817797721137495e-06, + "loss": 0.91553903, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.27233887, + "step": 2708, + "time_per_iteration": 2.528703451156616 + }, + { + "auxiliary_loss_clip": 0.0659356, + "auxiliary_loss_mlp": 0.01292098, + "balance_loss_clip": 0.06302815, + "balance_loss_mlp": 0.01262701, + "epoch": 0.16287389147752893, + "flos": 21258118149120.0, + "grad_norm": 2.208557612842335, + "language_loss": 0.86945301, + "learning_rate": 3.817635274679006e-06, + "loss": 0.94830966, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29394531, + "step": 2709, + "time_per_iteration": 2.5158472061157227 + }, + { + "auxiliary_loss_clip": 0.06590779, + "auxiliary_loss_mlp": 0.01297599, + "balance_loss_clip": 0.06302857, + "balance_loss_mlp": 0.0127123, + "epoch": 0.1629340147301969, + "flos": 19250679957120.0, + "grad_norm": 2.0845626973210942, + "language_loss": 0.926085, + "learning_rate": 3.817472759295605e-06, + "loss": 1.00496876, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26367188, + "step": 2710, + "time_per_iteration": 2.566678762435913 + }, + { + "auxiliary_loss_clip": 0.06590527, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 0.06304915, + "balance_loss_mlp": 0.01271691, + "epoch": 0.16299413798286488, + "flos": 21255896016000.0, + "grad_norm": 2.354283395736919, + "language_loss": 0.82405818, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90295976, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.27954102, + "step": 2711, + "time_per_iteration": 2.5129330158233643 + }, + { + "auxiliary_loss_clip": 0.06600536, + "auxiliary_loss_mlp": 0.01290666, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.0126115, + "epoch": 0.16305426123553285, + "flos": 18776545228800.0, + "grad_norm": 3.9666408475565462, + "language_loss": 0.82468587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.29516602, + "step": 2712, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.06597727, + "auxiliary_loss_mlp": 0.01290483, + "balance_loss_clip": 0.06302102, + "balance_loss_mlp": 0.01261563, + "epoch": 0.16311438448820081, + "flos": 22093643537280.0, + "grad_norm": 1.9569381877955756, + "language_loss": 0.78029472, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85917681, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28942871, + "step": 2713, + "time_per_iteration": 2.5238146781921387 + }, + { + "auxiliary_loss_clip": 0.06594867, + "auxiliary_loss_mlp": 0.0130017, + "balance_loss_clip": 0.06315845, + "balance_loss_mlp": 0.01271799, + "epoch": 0.16317450774086878, + "flos": 16472565037440.0, + "grad_norm": 2.250248562702171, + "language_loss": 0.80385303, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.88280344, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.28369141, + "step": 2714, + "time_per_iteration": 2.5166831016540527 + }, + { + "auxiliary_loss_clip": 0.06589634, + "auxiliary_loss_mlp": 0.01294838, + "balance_loss_clip": 0.06306746, + "balance_loss_mlp": 0.01269899, + "epoch": 0.16323463099353674, + "flos": 24359832737280.0, + "grad_norm": 1.8056327126335605, + "language_loss": 0.78403461, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8628794, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24951172, + "step": 2715, + "time_per_iteration": 2.5939090251922607 + }, + { + "auxiliary_loss_clip": 0.06588797, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 0.06304932, + "balance_loss_mlp": 0.01261952, + "epoch": 0.1632947542462047, + "flos": 24907872366720.0, + "grad_norm": 2.046246244819102, + "language_loss": 0.82485706, + "learning_rate": 3.816496219917336e-06, + "loss": 0.90362608, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.26147461, + "step": 2716, + "time_per_iteration": 2.593174457550049 + }, + { + "auxiliary_loss_clip": 0.06597836, + "auxiliary_loss_mlp": 0.01294616, + "balance_loss_clip": 0.06307962, + "balance_loss_mlp": 0.01266017, + "epoch": 0.1633548774988727, + "flos": 24907285388160.0, + "grad_norm": 1.9895193792693864, + "language_loss": 0.87446529, + "learning_rate": 3.816333222232251e-06, + "loss": 0.95338982, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28613281, + "step": 2717, + "time_per_iteration": 2.55460262298584 + }, + { + "auxiliary_loss_clip": 0.0659758, + "auxiliary_loss_mlp": 0.01288078, + "balance_loss_clip": 0.06314965, + "balance_loss_mlp": 0.01262413, + "epoch": 0.16341500075154067, + "flos": 30449008471680.0, + "grad_norm": 1.9093048334188691, + "language_loss": 0.77648151, + "learning_rate": 3.816170155671629e-06, + "loss": 0.8553381, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25671387, + "step": 2718, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.06597009, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.0631033, + "balance_loss_mlp": 0.01259783, + "epoch": 0.16347512400420863, + "flos": 22791253904640.0, + "grad_norm": 2.222005290704418, + "language_loss": 0.74954313, + "learning_rate": 3.816007020241652e-06, + "loss": 0.82837009, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.25866699, + "step": 2719, + "time_per_iteration": 2.551116704940796 + }, + { + "auxiliary_loss_clip": 0.0659292, + "auxiliary_loss_mlp": 0.01283628, + "balance_loss_clip": 0.0630803, + "balance_loss_mlp": 0.01257831, + "epoch": 0.1635352472568766, + "flos": 22639083690240.0, + "grad_norm": 1.7533438569003168, + "language_loss": 0.73446441, + "learning_rate": 3.815843815948507e-06, + "loss": 0.81322992, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.25805664, + "step": 2720, + "time_per_iteration": 2.5771543979644775 + }, + { + "auxiliary_loss_clip": 0.06588636, + "auxiliary_loss_mlp": 0.01282225, + "balance_loss_clip": 0.0630826, + "balance_loss_mlp": 0.01254949, + "epoch": 0.16359537050954456, + "flos": 15528362503680.0, + "grad_norm": 2.643329433322918, + "language_loss": 0.7707237, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.84943235, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27294922, + "step": 2721, + "time_per_iteration": 2.4961769580841064 + }, + { + "auxiliary_loss_clip": 0.06596414, + "auxiliary_loss_mlp": 0.0128382, + "balance_loss_clip": 0.0630523, + "balance_loss_mlp": 0.01256175, + "epoch": 0.16365549376221253, + "flos": 22096578430080.0, + "grad_norm": 2.1311655694461917, + "language_loss": 0.79885328, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.87765563, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.27648926, + "step": 2722, + "time_per_iteration": 2.614875078201294 + }, + { + "auxiliary_loss_clip": 0.06605944, + "auxiliary_loss_mlp": 0.01289108, + "balance_loss_clip": 0.06310583, + "balance_loss_mlp": 0.01258602, + "epoch": 0.1637156170148805, + "flos": 24067148025600.0, + "grad_norm": 1.9382892216015752, + "language_loss": 0.85628319, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.93523371, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.30493164, + "step": 2723, + "time_per_iteration": 2.531521797180176 + }, + { + "auxiliary_loss_clip": 0.0658607, + "auxiliary_loss_mlp": 0.01286244, + "balance_loss_clip": 0.06307479, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1637757402675485, + "flos": 26692212263040.0, + "grad_norm": 4.459915510598608, + "language_loss": 0.71697843, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7957015, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26477051, + "step": 2724, + "time_per_iteration": 2.577958822250366 + }, + { + "auxiliary_loss_clip": 0.06581962, + "auxiliary_loss_mlp": 0.01288602, + "balance_loss_clip": 0.06304826, + "balance_loss_mlp": 0.01263521, + "epoch": 0.16383586352021645, + "flos": 16112432188800.0, + "grad_norm": 1.9457979219444324, + "language_loss": 0.71286237, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79156804, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.25085449, + "step": 2725, + "time_per_iteration": 2.497311592102051 + }, + { + "auxiliary_loss_clip": 0.06590257, + "auxiliary_loss_mlp": 0.01285785, + "balance_loss_clip": 0.06310654, + "balance_loss_mlp": 0.01259761, + "epoch": 0.16389598677288442, + "flos": 19171031051520.0, + "grad_norm": 2.1904929355188325, + "language_loss": 0.89010125, + "learning_rate": 3.814863144409855e-06, + "loss": 0.96886164, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26013184, + "step": 2726, + "time_per_iteration": 2.5101511478424072 + }, + { + "auxiliary_loss_clip": 0.06595127, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_clip": 0.06307214, + "balance_loss_mlp": 0.01257732, + "epoch": 0.16395611002555238, + "flos": 21513431139840.0, + "grad_norm": 1.9675738265317178, + "language_loss": 0.75618744, + "learning_rate": 3.814699458247963e-06, + "loss": 0.83498907, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.27331543, + "step": 2727, + "time_per_iteration": 2.5322039127349854 + }, + { + "auxiliary_loss_clip": 0.06578872, + "auxiliary_loss_mlp": 0.012812, + "balance_loss_clip": 0.06301126, + "balance_loss_mlp": 0.01257298, + "epoch": 0.16401623327822035, + "flos": 21477401084160.0, + "grad_norm": 2.357425852181157, + "language_loss": 0.82921708, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90781784, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23913574, + "step": 2728, + "time_per_iteration": 2.538081407546997 + }, + { + "auxiliary_loss_clip": 0.06590319, + "auxiliary_loss_mlp": 0.01282423, + "balance_loss_clip": 0.0630119, + "balance_loss_mlp": 0.01255685, + "epoch": 0.1640763565308883, + "flos": 13631362392960.0, + "grad_norm": 3.359167938327165, + "language_loss": 0.85634404, + "learning_rate": 3.814371879489633e-06, + "loss": 0.93507141, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.26745605, + "step": 2729, + "time_per_iteration": 2.555157423019409 + }, + { + "auxiliary_loss_clip": 0.06590364, + "auxiliary_loss_mlp": 0.01282244, + "balance_loss_clip": 0.06303068, + "balance_loss_mlp": 0.01255732, + "epoch": 0.16413647978355628, + "flos": 15457057079040.0, + "grad_norm": 2.0375012641424193, + "language_loss": 0.73386455, + "learning_rate": 3.814207986905616e-06, + "loss": 0.81259066, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.26477051, + "step": 2730, + "time_per_iteration": 2.5347042083740234 + }, + { + "auxiliary_loss_clip": 0.06593673, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 0.06303447, + "balance_loss_mlp": 0.01261967, + "epoch": 0.16419660303622427, + "flos": 45889043172480.0, + "grad_norm": 1.5633038653846945, + "language_loss": 0.75101161, + "learning_rate": 3.814044025526651e-06, + "loss": 0.82984555, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.27734375, + "step": 2731, + "time_per_iteration": 2.7257211208343506 + }, + { + "auxiliary_loss_clip": 0.06592289, + "auxiliary_loss_mlp": 0.012866, + "balance_loss_clip": 0.06302358, + "balance_loss_mlp": 0.01258967, + "epoch": 0.16425672628889224, + "flos": 18958791859200.0, + "grad_norm": 2.3112437011786238, + "language_loss": 0.79966319, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87845206, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27648926, + "step": 2732, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.06590726, + "auxiliary_loss_mlp": 0.01293299, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01263854, + "epoch": 0.1643168495415602, + "flos": 24319316488320.0, + "grad_norm": 2.024679597680736, + "language_loss": 0.69993633, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.77877665, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.29467773, + "step": 2733, + "time_per_iteration": 2.53328537940979 + }, + { + "auxiliary_loss_clip": 0.06586764, + "auxiliary_loss_mlp": 0.0128512, + "balance_loss_clip": 0.06300272, + "balance_loss_mlp": 0.01256426, + "epoch": 0.16437697279422817, + "flos": 26434970628480.0, + "grad_norm": 2.0387940274909537, + "language_loss": 0.81552017, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.89423895, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.28674316, + "step": 2734, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.0658897, + "auxiliary_loss_mlp": 0.01289023, + "balance_loss_clip": 0.06299339, + "balance_loss_mlp": 0.01261271, + "epoch": 0.16443709604689613, + "flos": 34540808503680.0, + "grad_norm": 4.048112349799869, + "language_loss": 0.82907999, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.90785992, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.27758789, + "step": 2735, + "time_per_iteration": 2.63996958732605 + }, + { + "auxiliary_loss_clip": 0.06579679, + "auxiliary_loss_mlp": 0.01279603, + "balance_loss_clip": 0.06297098, + "balance_loss_mlp": 0.01254783, + "epoch": 0.1644972192995641, + "flos": 23264717800320.0, + "grad_norm": 2.4207218830736417, + "language_loss": 0.80072814, + "learning_rate": 3.813223186925296e-06, + "loss": 0.87932098, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.24816895, + "step": 2736, + "time_per_iteration": 2.546694755554199 + }, + { + "auxiliary_loss_clip": 0.0658504, + "auxiliary_loss_mlp": 0.0128325, + "balance_loss_clip": 0.06300261, + "balance_loss_mlp": 0.01256499, + "epoch": 0.1645573425522321, + "flos": 26986825618560.0, + "grad_norm": 1.6682039059194231, + "language_loss": 0.82238322, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.90106606, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.2677002, + "step": 2737, + "time_per_iteration": 2.5593652725219727 + }, + { + "auxiliary_loss_clip": 0.06591076, + "auxiliary_loss_mlp": 0.0128149, + "balance_loss_clip": 0.06302774, + "balance_loss_mlp": 0.01256087, + "epoch": 0.16461746580490005, + "flos": 28739495871360.0, + "grad_norm": 1.7184215818783282, + "language_loss": 0.88135791, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.96008366, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.25402832, + "step": 2738, + "time_per_iteration": 2.6650192737579346 + }, + { + "auxiliary_loss_clip": 0.06589583, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 0.06299618, + "balance_loss_mlp": 0.01259446, + "epoch": 0.16467758905756802, + "flos": 24936062065920.0, + "grad_norm": 2.428798415539057, + "language_loss": 0.72705042, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.80582702, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.28649902, + "step": 2739, + "time_per_iteration": 4.007360935211182 + }, + { + "auxiliary_loss_clip": 0.06576341, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 0.06294868, + "balance_loss_mlp": 0.01261435, + "epoch": 0.16473771231023598, + "flos": 24833380487040.0, + "grad_norm": 2.4914045636792133, + "language_loss": 0.82377362, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90242416, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.27270508, + "step": 2740, + "time_per_iteration": 2.5806076526641846 + }, + { + "auxiliary_loss_clip": 0.06593102, + "auxiliary_loss_mlp": 0.01294674, + "balance_loss_clip": 0.0629887, + "balance_loss_mlp": 0.01265218, + "epoch": 0.16479783556290395, + "flos": 39905609690880.0, + "grad_norm": 2.0874742304604785, + "language_loss": 0.6960665, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.77494431, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.29431152, + "step": 2741, + "time_per_iteration": 2.67899489402771 + }, + { + "auxiliary_loss_clip": 0.06583216, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06296665, + "balance_loss_mlp": 0.0126295, + "epoch": 0.16485795881557191, + "flos": 19902449341440.0, + "grad_norm": 1.99300527848014, + "language_loss": 0.80380434, + "learning_rate": 3.812235911671472e-06, + "loss": 0.88253653, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27062988, + "step": 2742, + "time_per_iteration": 4.01186203956604 + }, + { + "auxiliary_loss_clip": 0.06583486, + "auxiliary_loss_mlp": 0.0128544, + "balance_loss_clip": 0.06299208, + "balance_loss_mlp": 0.01258034, + "epoch": 0.16491808206823988, + "flos": 20562017155200.0, + "grad_norm": 1.859989576393153, + "language_loss": 0.85480952, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.9334988, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.27392578, + "step": 2743, + "time_per_iteration": 2.531813859939575 + }, + { + "auxiliary_loss_clip": 0.06583907, + "auxiliary_loss_mlp": 0.01288972, + "balance_loss_clip": 0.06300064, + "balance_loss_mlp": 0.01261018, + "epoch": 0.16497820532090787, + "flos": 23806803790080.0, + "grad_norm": 1.9796677960929725, + "language_loss": 0.87141418, + "learning_rate": 3.811906270092265e-06, + "loss": 0.95014304, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.27966309, + "step": 2744, + "time_per_iteration": 2.5968780517578125 + }, + { + "auxiliary_loss_clip": 0.06573457, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 0.0629618, + "balance_loss_mlp": 0.01258847, + "epoch": 0.16503832857357584, + "flos": 25489510283520.0, + "grad_norm": 2.535956000825199, + "language_loss": 0.83221614, + "learning_rate": 3.811741346238036e-06, + "loss": 0.91078633, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24743652, + "step": 2745, + "time_per_iteration": 2.5640015602111816 + }, + { + "auxiliary_loss_clip": 0.06588263, + "auxiliary_loss_mlp": 0.01287637, + "balance_loss_clip": 0.06305014, + "balance_loss_mlp": 0.01261196, + "epoch": 0.1650984518262438, + "flos": 17681849562240.0, + "grad_norm": 2.0373309792274883, + "language_loss": 0.7743578, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.85311675, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.26452637, + "step": 2746, + "time_per_iteration": 5.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.06589019, + "auxiliary_loss_mlp": 0.01289439, + "balance_loss_clip": 0.06303473, + "balance_loss_mlp": 0.01261723, + "epoch": 0.16515857507891177, + "flos": 18704401263360.0, + "grad_norm": 1.60188965958096, + "language_loss": 0.81673479, + "learning_rate": 3.811411292431592e-06, + "loss": 0.89551938, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.27685547, + "step": 2747, + "time_per_iteration": 2.5460550785064697 + }, + { + "auxiliary_loss_clip": 0.06594047, + "auxiliary_loss_mlp": 0.0128679, + "balance_loss_clip": 0.06307407, + "balance_loss_mlp": 0.01260707, + "epoch": 0.16521869833157973, + "flos": 15015472462080.0, + "grad_norm": 2.468884923074517, + "language_loss": 0.71168172, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.79049003, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.493168592453003 + }, + { + "auxiliary_loss_clip": 0.06589203, + "auxiliary_loss_mlp": 0.01284146, + "balance_loss_clip": 0.06305005, + "balance_loss_mlp": 0.01259732, + "epoch": 0.1652788215842477, + "flos": 22126654846080.0, + "grad_norm": 5.244624397631241, + "language_loss": 0.8897143, + "learning_rate": 3.811080963869561e-06, + "loss": 0.9684478, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.24401855, + "step": 2749, + "time_per_iteration": 2.6453802585601807 + }, + { + "auxiliary_loss_clip": 0.0659653, + "auxiliary_loss_mlp": 0.01290094, + "balance_loss_clip": 0.06307155, + "balance_loss_mlp": 0.01261913, + "epoch": 0.16533894483691566, + "flos": 18339027534720.0, + "grad_norm": 3.9658549336517446, + "language_loss": 0.79764348, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.87650967, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.28210449, + "step": 2750, + "time_per_iteration": 2.5099878311157227 + }, + { + "auxiliary_loss_clip": 0.06587892, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06303497, + "balance_loss_mlp": 0.01257673, + "epoch": 0.16539906808958366, + "flos": 22388592309120.0, + "grad_norm": 1.8681239023451541, + "language_loss": 0.95973986, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03847575, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.2800293, + "step": 2751, + "time_per_iteration": 2.580857753753662 + }, + { + "auxiliary_loss_clip": 0.06591333, + "auxiliary_loss_mlp": 0.01293333, + "balance_loss_clip": 0.06311293, + "balance_loss_mlp": 0.01266344, + "epoch": 0.16545919134225162, + "flos": 22717726346880.0, + "grad_norm": 2.017884310231, + "language_loss": 0.71926272, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79810935, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.26965332, + "step": 2752, + "time_per_iteration": 2.5533626079559326 + }, + { + "auxiliary_loss_clip": 0.06474683, + "auxiliary_loss_mlp": 0.01280412, + "balance_loss_clip": 0.06313415, + "balance_loss_mlp": 0.01272663, + "epoch": 0.1655193145949196, + "flos": 67822493702400.0, + "grad_norm": 0.7367497765392101, + "language_loss": 0.5395115, + "learning_rate": 3.810419482679192e-06, + "loss": 0.61706245, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07739258, + "step": 2753, + "time_per_iteration": 3.283729314804077 + }, + { + "auxiliary_loss_clip": 0.06593385, + "auxiliary_loss_mlp": 0.01285286, + "balance_loss_clip": 0.06311026, + "balance_loss_mlp": 0.01258547, + "epoch": 0.16557943784758755, + "flos": 24287353355520.0, + "grad_norm": 1.793852310261697, + "language_loss": 0.75999093, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.8387776, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.26757812, + "step": 2754, + "time_per_iteration": 2.608365297317505 + }, + { + "auxiliary_loss_clip": 0.06608296, + "auxiliary_loss_mlp": 0.01288183, + "balance_loss_clip": 0.06315503, + "balance_loss_mlp": 0.01260575, + "epoch": 0.16563956110025552, + "flos": 20089727216640.0, + "grad_norm": 2.367713266740868, + "language_loss": 0.87993264, + "learning_rate": 3.810088330151188e-06, + "loss": 0.95889747, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.27600098, + "step": 2755, + "time_per_iteration": 2.5239596366882324 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01279054, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01253877, + "epoch": 0.16569968435292348, + "flos": 28041382379520.0, + "grad_norm": 1.6563009546595795, + "language_loss": 0.7383014, + "learning_rate": 3.80992265092595e-06, + "loss": 0.81694186, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.25195312, + "step": 2756, + "time_per_iteration": 2.6032936573028564 + }, + { + "auxiliary_loss_clip": 0.06582732, + "auxiliary_loss_mlp": 0.01284003, + "balance_loss_clip": 0.06305105, + "balance_loss_mlp": 0.0125817, + "epoch": 0.16575980760559147, + "flos": 26257461753600.0, + "grad_norm": 1.6426190009356174, + "language_loss": 0.75875264, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83741999, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25817871, + "step": 2757, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06586438, + "auxiliary_loss_mlp": 0.01285191, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01258822, + "epoch": 0.16581993085825944, + "flos": 26951382541440.0, + "grad_norm": 1.7077128151850376, + "language_loss": 0.85793787, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.93665409, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.26391602, + "step": 2758, + "time_per_iteration": 2.6094768047332764 + }, + { + "auxiliary_loss_clip": 0.06582282, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 0.06301229, + "balance_loss_mlp": 0.01255981, + "epoch": 0.1658800541109274, + "flos": 21660192766080.0, + "grad_norm": 2.0058299268215602, + "language_loss": 0.79821748, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87685645, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.25646973, + "step": 2759, + "time_per_iteration": 2.5326881408691406 + }, + { + "auxiliary_loss_clip": 0.06584738, + "auxiliary_loss_mlp": 0.01287284, + "balance_loss_clip": 0.06296851, + "balance_loss_mlp": 0.01258721, + "epoch": 0.16594017736359537, + "flos": 16441063102080.0, + "grad_norm": 2.640523985370613, + "language_loss": 0.76520288, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.84392309, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.28588867, + "step": 2760, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06596339, + "auxiliary_loss_mlp": 0.01287081, + "balance_loss_clip": 0.06307873, + "balance_loss_mlp": 0.01260986, + "epoch": 0.16600030061626334, + "flos": 22643779518720.0, + "grad_norm": 1.8139140163731928, + "language_loss": 0.74449325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82332754, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.26086426, + "step": 2761, + "time_per_iteration": 2.5551891326904297 + }, + { + "auxiliary_loss_clip": 0.06586796, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.0630264, + "balance_loss_mlp": 0.01263922, + "epoch": 0.1660604238689313, + "flos": 26403887963520.0, + "grad_norm": 1.8147235749558717, + "language_loss": 0.89404368, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.97283, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.27905273, + "step": 2762, + "time_per_iteration": 2.587952136993408 + }, + { + "auxiliary_loss_clip": 0.0659417, + "auxiliary_loss_mlp": 0.01281866, + "balance_loss_clip": 0.0630425, + "balance_loss_mlp": 0.01255282, + "epoch": 0.16612054712159927, + "flos": 23046776530560.0, + "grad_norm": 1.779645358746394, + "language_loss": 0.8912673, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97002763, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.26611328, + "step": 2763, + "time_per_iteration": 2.5509772300720215 + }, + { + "auxiliary_loss_clip": 0.06462647, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.01259697, + "epoch": 0.16618067037426726, + "flos": 59261388266880.0, + "grad_norm": 0.7675418877188291, + "language_loss": 0.59855133, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.67584455, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06988525, + "step": 2764, + "time_per_iteration": 3.221308708190918 + }, + { + "auxiliary_loss_clip": 0.06595036, + "auxiliary_loss_mlp": 0.0129625, + "balance_loss_clip": 0.06311496, + "balance_loss_mlp": 0.01269607, + "epoch": 0.16624079362693522, + "flos": 27206192407680.0, + "grad_norm": 22.231303672766604, + "language_loss": 0.8298772, + "learning_rate": 3.808428450193401e-06, + "loss": 0.90879005, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.26635742, + "step": 2765, + "time_per_iteration": 2.5886435508728027 + }, + { + "auxiliary_loss_clip": 0.06603917, + "auxiliary_loss_mlp": 0.0129703, + "balance_loss_clip": 0.06306268, + "balance_loss_mlp": 0.01269099, + "epoch": 0.1663009168796032, + "flos": 10929542215680.0, + "grad_norm": 2.384069935097126, + "language_loss": 0.7120772, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.79108667, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.27941895, + "step": 2766, + "time_per_iteration": 2.526913642883301 + }, + { + "auxiliary_loss_clip": 0.06591118, + "auxiliary_loss_mlp": 0.0128837, + "balance_loss_clip": 0.06309089, + "balance_loss_mlp": 0.01262526, + "epoch": 0.16636104013227115, + "flos": 17900168175360.0, + "grad_norm": 2.2120517261374593, + "language_loss": 0.89624047, + "learning_rate": 3.808095651090769e-06, + "loss": 0.97503531, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.25830078, + "step": 2767, + "time_per_iteration": 2.4989144802093506 + }, + { + "auxiliary_loss_clip": 0.06446301, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 0.0628543, + "balance_loss_mlp": 0.01253307, + "epoch": 0.16642116338493912, + "flos": 66748342285440.0, + "grad_norm": 0.6237778354152628, + "language_loss": 0.52864301, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.60571223, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07293701, + "step": 2768, + "time_per_iteration": 3.263981580734253 + }, + { + "auxiliary_loss_clip": 0.06597716, + "auxiliary_loss_mlp": 0.01287278, + "balance_loss_clip": 0.06305783, + "balance_loss_mlp": 0.0126048, + "epoch": 0.16648128663760708, + "flos": 19032067854720.0, + "grad_norm": 2.5043941820877524, + "language_loss": 0.85743988, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.93628991, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.26782227, + "step": 2769, + "time_per_iteration": 2.5169060230255127 + }, + { + "auxiliary_loss_clip": 0.06441471, + "auxiliary_loss_mlp": 0.01258691, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01251771, + "epoch": 0.16654140989027508, + "flos": 70154370103680.0, + "grad_norm": 0.7855037683883999, + "language_loss": 0.57378197, + "learning_rate": 3.80759593822885e-06, + "loss": 0.65078354, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.06939697, + "step": 2770, + "time_per_iteration": 3.0450947284698486 + }, + { + "auxiliary_loss_clip": 0.0643771, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06278233, + "balance_loss_mlp": 0.01254959, + "epoch": 0.16660153314294304, + "flos": 70290398407680.0, + "grad_norm": 0.8814976481921372, + "language_loss": 0.5630703, + "learning_rate": 3.807429230178015e-06, + "loss": 0.64006579, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06890869, + "step": 2771, + "time_per_iteration": 3.0379133224487305 + }, + { + "auxiliary_loss_clip": 0.06582694, + "auxiliary_loss_mlp": 0.01286148, + "balance_loss_clip": 0.06303653, + "balance_loss_mlp": 0.01260756, + "epoch": 0.166661656395611, + "flos": 23081590702080.0, + "grad_norm": 2.5291823890046534, + "language_loss": 0.71466291, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.79335129, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25378418, + "step": 2772, + "time_per_iteration": 2.551870584487915 + }, + { + "auxiliary_loss_clip": 0.06576528, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 0.06298962, + "balance_loss_mlp": 0.01255082, + "epoch": 0.16672177964827897, + "flos": 28373912507520.0, + "grad_norm": 1.9791838329774285, + "language_loss": 0.87486583, + "learning_rate": 3.807095608468975e-06, + "loss": 0.95344645, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.26452637, + "step": 2773, + "time_per_iteration": 2.613593339920044 + }, + { + "auxiliary_loss_clip": 0.06585228, + "auxiliary_loss_mlp": 0.01284542, + "balance_loss_clip": 0.06305268, + "balance_loss_mlp": 0.01259532, + "epoch": 0.16678190290094694, + "flos": 19095700631040.0, + "grad_norm": 2.4658170667158545, + "language_loss": 0.8279835, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.90668118, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25012207, + "step": 2774, + "time_per_iteration": 2.5196969509124756 + }, + { + "auxiliary_loss_clip": 0.06592362, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.01258871, + "epoch": 0.1668420261536149, + "flos": 21805612727040.0, + "grad_norm": 2.7739422626660053, + "language_loss": 0.84618509, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92495108, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.25354004, + "step": 2775, + "time_per_iteration": 2.5799014568328857 + }, + { + "auxiliary_loss_clip": 0.06591405, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06311037, + "balance_loss_mlp": 0.01260702, + "epoch": 0.16690214940628287, + "flos": 19068559107840.0, + "grad_norm": 2.4582225386756793, + "language_loss": 0.81805599, + "learning_rate": 3.806594661981897e-06, + "loss": 0.89682293, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.24584961, + "step": 2776, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.06574798, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.01257188, + "epoch": 0.16696227265895086, + "flos": 18594550160640.0, + "grad_norm": 2.127036404214793, + "language_loss": 0.80698764, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.88554621, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2388916, + "step": 2777, + "time_per_iteration": 2.701383352279663 + }, + { + "auxiliary_loss_clip": 0.06586365, + "auxiliary_loss_mlp": 0.0128362, + "balance_loss_clip": 0.06303923, + "balance_loss_mlp": 0.01258323, + "epoch": 0.16702239591161883, + "flos": 23300747856000.0, + "grad_norm": 1.7658630551266277, + "language_loss": 0.85838449, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93708432, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.25305176, + "step": 2778, + "time_per_iteration": 4.054275989532471 + }, + { + "auxiliary_loss_clip": 0.06594409, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06310806, + "balance_loss_mlp": 0.01260908, + "epoch": 0.1670825191642868, + "flos": 24432521754240.0, + "grad_norm": 2.130533626904146, + "language_loss": 0.75036883, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.82918215, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.26013184, + "step": 2779, + "time_per_iteration": 2.5570623874664307 + }, + { + "auxiliary_loss_clip": 0.06586824, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 0.06304757, + "balance_loss_mlp": 0.01263237, + "epoch": 0.16714264241695476, + "flos": 26804830550400.0, + "grad_norm": 2.754931380433817, + "language_loss": 0.66534865, + "learning_rate": 3.805925774274554e-06, + "loss": 0.74411094, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26147461, + "step": 2780, + "time_per_iteration": 2.5990118980407715 + }, + { + "auxiliary_loss_clip": 0.06585376, + "auxiliary_loss_mlp": 0.01289397, + "balance_loss_clip": 0.06306757, + "balance_loss_mlp": 0.01263075, + "epoch": 0.16720276566962272, + "flos": 21841768563840.0, + "grad_norm": 3.156228906236902, + "language_loss": 0.80115324, + "learning_rate": 3.805758381129643e-06, + "loss": 0.87990093, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26318359, + "step": 2781, + "time_per_iteration": 3.9395251274108887 + }, + { + "auxiliary_loss_clip": 0.06586023, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06303258, + "balance_loss_mlp": 0.01258791, + "epoch": 0.1672628889222907, + "flos": 21476814105600.0, + "grad_norm": 1.4411022993090745, + "language_loss": 0.75756633, + "learning_rate": 3.805590919510193e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.26171875, + "step": 2782, + "time_per_iteration": 2.6298012733459473 + }, + { + "auxiliary_loss_clip": 0.06600203, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 0.06305742, + "balance_loss_mlp": 0.0126242, + "epoch": 0.16732301217495865, + "flos": 30781915943040.0, + "grad_norm": 2.647632172572772, + "language_loss": 0.6861552, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.76504719, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.26550293, + "step": 2783, + "time_per_iteration": 2.5996532440185547 + }, + { + "auxiliary_loss_clip": 0.06581019, + "auxiliary_loss_mlp": 0.01284416, + "balance_loss_clip": 0.06301262, + "balance_loss_mlp": 0.0125931, + "epoch": 0.16738313542762664, + "flos": 23480940061440.0, + "grad_norm": 1.7043112393392166, + "language_loss": 0.70624614, + "learning_rate": 3.805255790873081e-06, + "loss": 0.78490055, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25109863, + "step": 2784, + "time_per_iteration": 2.5658257007598877 + }, + { + "auxiliary_loss_clip": 0.06592201, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 0.06306473, + "balance_loss_mlp": 0.01263041, + "epoch": 0.1674432586802946, + "flos": 29796861744000.0, + "grad_norm": 2.259998214947441, + "language_loss": 0.61717749, + "learning_rate": 3.805088123868126e-06, + "loss": 0.69599444, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.2644043, + "step": 2785, + "time_per_iteration": 4.003845691680908 + }, + { + "auxiliary_loss_clip": 0.064503, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06288917, + "balance_loss_mlp": 0.01255161, + "epoch": 0.16750338193296258, + "flos": 66157228857600.0, + "grad_norm": 0.7834191651915974, + "language_loss": 0.58330011, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.66042489, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07037354, + "step": 2786, + "time_per_iteration": 4.598146200180054 + }, + { + "auxiliary_loss_clip": 0.06587794, + "auxiliary_loss_mlp": 0.01289611, + "balance_loss_clip": 0.06301168, + "balance_loss_mlp": 0.0126298, + "epoch": 0.16756350518563054, + "flos": 25702881505920.0, + "grad_norm": 2.328984985341375, + "language_loss": 0.76757109, + "learning_rate": 3.80475258451721e-06, + "loss": 0.84634513, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.26623535, + "step": 2787, + "time_per_iteration": 2.5801339149475098 + }, + { + "auxiliary_loss_clip": 0.06585419, + "auxiliary_loss_mlp": 0.01283974, + "balance_loss_clip": 0.06301223, + "balance_loss_mlp": 0.01257891, + "epoch": 0.1676236284382985, + "flos": 23841911450880.0, + "grad_norm": 1.9360315934234018, + "language_loss": 0.78495795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.86365187, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.26086426, + "step": 2788, + "time_per_iteration": 2.5693655014038086 + }, + { + "auxiliary_loss_clip": 0.06435917, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06275532, + "balance_loss_mlp": 0.01257765, + "epoch": 0.16768375169096647, + "flos": 59891313663360.0, + "grad_norm": 0.8394736884379908, + "language_loss": 0.59391403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.67092663, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.07562256, + "step": 2789, + "time_per_iteration": 3.006455659866333 + }, + { + "auxiliary_loss_clip": 0.06580187, + "auxiliary_loss_mlp": 0.01282981, + "balance_loss_clip": 0.06298364, + "balance_loss_mlp": 0.01257566, + "epoch": 0.16774387494363446, + "flos": 38444785608960.0, + "grad_norm": 1.7149926461558054, + "language_loss": 0.71297312, + "learning_rate": 3.804248762233765e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25427246, + "step": 2790, + "time_per_iteration": 2.6886403560638428 + }, + { + "auxiliary_loss_clip": 0.065869, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 0.06305605, + "balance_loss_mlp": 0.01260142, + "epoch": 0.16780399819630243, + "flos": 22644156862080.0, + "grad_norm": 1.6857838889349592, + "language_loss": 0.7969588, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8756963, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.26696777, + "step": 2791, + "time_per_iteration": 2.542351484298706 + }, + { + "auxiliary_loss_clip": 0.06585324, + "auxiliary_loss_mlp": 0.01283873, + "balance_loss_clip": 0.06304726, + "balance_loss_mlp": 0.01256502, + "epoch": 0.1678641214489704, + "flos": 32900001851520.0, + "grad_norm": 1.6260668766519037, + "language_loss": 0.72283256, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.80152452, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27355957, + "step": 2792, + "time_per_iteration": 2.681652784347534 + }, + { + "auxiliary_loss_clip": 0.06588314, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 0.06305955, + "balance_loss_mlp": 0.01256223, + "epoch": 0.16792424470163836, + "flos": 19981133925120.0, + "grad_norm": 2.7315250216088756, + "language_loss": 0.7262826, + "learning_rate": 3.803744324194691e-06, + "loss": 0.80497831, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25036621, + "step": 2793, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06583093, + "auxiliary_loss_mlp": 0.01283488, + "balance_loss_clip": 0.06301598, + "balance_loss_mlp": 0.01257333, + "epoch": 0.16798436795430632, + "flos": 19726114423680.0, + "grad_norm": 2.037397007218884, + "language_loss": 0.78064799, + "learning_rate": 3.803576041376831e-06, + "loss": 0.85931379, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26135254, + "step": 2794, + "time_per_iteration": 2.5393919944763184 + }, + { + "auxiliary_loss_clip": 0.06580402, + "auxiliary_loss_mlp": 0.01288563, + "balance_loss_clip": 0.06298761, + "balance_loss_mlp": 0.01262206, + "epoch": 0.1680444912069743, + "flos": 28111346138880.0, + "grad_norm": 2.312644294934493, + "language_loss": 0.72345173, + "learning_rate": 3.803407690167187e-06, + "loss": 0.80214143, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.26379395, + "step": 2795, + "time_per_iteration": 2.565215587615967 + }, + { + "auxiliary_loss_clip": 0.06578698, + "auxiliary_loss_mlp": 0.01278302, + "balance_loss_clip": 0.06297935, + "balance_loss_mlp": 0.01254329, + "epoch": 0.16810461445964225, + "flos": 18080695797120.0, + "grad_norm": 1.8533332907405589, + "language_loss": 0.85181081, + "learning_rate": 3.803239270572142e-06, + "loss": 0.93038082, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.23986816, + "step": 2796, + "time_per_iteration": 2.627962112426758 + }, + { + "auxiliary_loss_clip": 0.06595714, + "auxiliary_loss_mlp": 0.01283274, + "balance_loss_clip": 0.0630767, + "balance_loss_mlp": 0.01256571, + "epoch": 0.16816473771231025, + "flos": 23885488373760.0, + "grad_norm": 2.13286065055067, + "language_loss": 0.82093614, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.89972603, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.26696777, + "step": 2797, + "time_per_iteration": 2.5887176990509033 + }, + { + "auxiliary_loss_clip": 0.06571205, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06298848, + "balance_loss_mlp": 0.01257922, + "epoch": 0.1682248609649782, + "flos": 22790163801600.0, + "grad_norm": 1.6719709230048432, + "language_loss": 0.75814915, + "learning_rate": 3.802902226251401e-06, + "loss": 0.83667111, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23071289, + "step": 2798, + "time_per_iteration": 2.5682647228240967 + }, + { + "auxiliary_loss_clip": 0.06575698, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 0.06297997, + "balance_loss_mlp": 0.01261483, + "epoch": 0.16828498421764618, + "flos": 20711545966080.0, + "grad_norm": 1.6493106854951614, + "language_loss": 0.8051939, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.23669434, + "step": 2799, + "time_per_iteration": 2.5808820724487305 + }, + { + "auxiliary_loss_clip": 0.06588444, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06306663, + "balance_loss_mlp": 0.01264951, + "epoch": 0.16834510747031414, + "flos": 29427714581760.0, + "grad_norm": 2.08568782894778, + "language_loss": 0.71203279, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79082221, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.25561523, + "step": 2800, + "time_per_iteration": 2.6072590351104736 + }, + { + "auxiliary_loss_clip": 0.06577089, + "auxiliary_loss_mlp": 0.01284192, + "balance_loss_clip": 0.06299706, + "balance_loss_mlp": 0.01258705, + "epoch": 0.1684052307229821, + "flos": 18150407994240.0, + "grad_norm": 2.3689825925758647, + "language_loss": 0.84516144, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.9237743, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.25488281, + "step": 2801, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.06582664, + "auxiliary_loss_mlp": 0.01284981, + "balance_loss_clip": 0.06302365, + "balance_loss_mlp": 0.01258612, + "epoch": 0.16846535397565007, + "flos": 16579439320320.0, + "grad_norm": 3.0795087290353744, + "language_loss": 0.84073383, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.91941023, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.26391602, + "step": 2802, + "time_per_iteration": 2.493727684020996 + }, + { + "auxiliary_loss_clip": 0.06582403, + "auxiliary_loss_mlp": 0.01282997, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01256831, + "epoch": 0.16852547722831807, + "flos": 30416667995520.0, + "grad_norm": 4.967511006144659, + "language_loss": 0.81234676, + "learning_rate": 3.802058419152413e-06, + "loss": 0.89100075, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.26147461, + "step": 2803, + "time_per_iteration": 2.6188409328460693 + }, + { + "auxiliary_loss_clip": 0.06578018, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06301461, + "balance_loss_mlp": 0.01256157, + "epoch": 0.16858560048098603, + "flos": 33515279982720.0, + "grad_norm": 2.6560543874068205, + "language_loss": 0.77301621, + "learning_rate": 3.801889452704297e-06, + "loss": 0.85160041, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.24230957, + "step": 2804, + "time_per_iteration": 2.6222236156463623 + }, + { + "auxiliary_loss_clip": 0.06456417, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06296105, + "balance_loss_mlp": 0.0125524, + "epoch": 0.168645723733654, + "flos": 67390845793920.0, + "grad_norm": 0.7985418659660302, + "language_loss": 0.55433214, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.63151628, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.06774902, + "step": 2805, + "time_per_iteration": 3.1424005031585693 + }, + { + "auxiliary_loss_clip": 0.06571464, + "auxiliary_loss_mlp": 0.01283981, + "balance_loss_clip": 0.06301463, + "balance_loss_mlp": 0.01260723, + "epoch": 0.16870584698632196, + "flos": 21331016801280.0, + "grad_norm": 1.8814500249786532, + "language_loss": 0.74235076, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.82090515, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23254395, + "step": 2806, + "time_per_iteration": 2.5448226928710938 + }, + { + "auxiliary_loss_clip": 0.06569488, + "auxiliary_loss_mlp": 0.01288633, + "balance_loss_clip": 0.06295753, + "balance_loss_mlp": 0.01264123, + "epoch": 0.16876597023898993, + "flos": 20747030970240.0, + "grad_norm": 2.4625186255791407, + "language_loss": 0.70848989, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24523926, + "step": 2807, + "time_per_iteration": 2.5282390117645264 + }, + { + "auxiliary_loss_clip": 0.06578949, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.0126383, + "epoch": 0.1688260934916579, + "flos": 20309806765440.0, + "grad_norm": 3.7758907272624715, + "language_loss": 0.71724349, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.79592896, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.25744629, + "step": 2808, + "time_per_iteration": 2.5146172046661377 + }, + { + "auxiliary_loss_clip": 0.06592815, + "auxiliary_loss_mlp": 0.01288179, + "balance_loss_clip": 0.06306504, + "balance_loss_mlp": 0.01261119, + "epoch": 0.16888621674432586, + "flos": 20347136559360.0, + "grad_norm": 2.150924717168134, + "language_loss": 0.80452245, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.88333237, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.27075195, + "step": 2809, + "time_per_iteration": 2.590801477432251 + }, + { + "auxiliary_loss_clip": 0.06586212, + "auxiliary_loss_mlp": 0.01286252, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01258965, + "epoch": 0.16894633999699385, + "flos": 16248963617280.0, + "grad_norm": 2.023624064417177, + "language_loss": 0.8897475, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.96847212, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.27294922, + "step": 2810, + "time_per_iteration": 2.553370714187622 + }, + { + "auxiliary_loss_clip": 0.0659079, + "auxiliary_loss_mlp": 0.01302127, + "balance_loss_clip": 0.06304274, + "balance_loss_mlp": 0.01273994, + "epoch": 0.16900646324966181, + "flos": 19616347175040.0, + "grad_norm": 1.906856377822649, + "language_loss": 0.93345243, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01238155, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.28137207, + "step": 2811, + "time_per_iteration": 2.5584306716918945 + }, + { + "auxiliary_loss_clip": 0.06579725, + "auxiliary_loss_mlp": 0.01293368, + "balance_loss_clip": 0.0629798, + "balance_loss_mlp": 0.01266534, + "epoch": 0.16906658650232978, + "flos": 22024644099840.0, + "grad_norm": 1.777677884933971, + "language_loss": 0.80087781, + "learning_rate": 3.800535261856291e-06, + "loss": 0.87960875, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.26818848, + "step": 2812, + "time_per_iteration": 2.5193934440612793 + }, + { + "auxiliary_loss_clip": 0.06578699, + "auxiliary_loss_mlp": 0.01288816, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01262983, + "epoch": 0.16912670975499774, + "flos": 11768212131840.0, + "grad_norm": 2.3060118484148586, + "language_loss": 0.76260078, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.84127587, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.25830078, + "step": 2813, + "time_per_iteration": 2.5597875118255615 + }, + { + "auxiliary_loss_clip": 0.06583597, + "auxiliary_loss_mlp": 0.01290749, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01265083, + "epoch": 0.1691868330076657, + "flos": 17166443898240.0, + "grad_norm": 2.6968588943339444, + "language_loss": 0.70284265, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.78158611, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.2565918, + "step": 2814, + "time_per_iteration": 2.4971132278442383 + }, + { + "auxiliary_loss_clip": 0.06581523, + "auxiliary_loss_mlp": 0.01299068, + "balance_loss_clip": 0.06302465, + "balance_loss_mlp": 0.01272174, + "epoch": 0.16924695626033368, + "flos": 22422693720960.0, + "grad_norm": 1.782997034372258, + "language_loss": 0.63103068, + "learning_rate": 3.800026313549776e-06, + "loss": 0.7098366, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.2689209, + "step": 2815, + "time_per_iteration": 2.583073377609253 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01301206, + "balance_loss_clip": 0.06305208, + "balance_loss_mlp": 0.01275195, + "epoch": 0.16930707951300164, + "flos": 25746835772160.0, + "grad_norm": 1.6235196600742487, + "language_loss": 0.82652867, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90533793, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26037598, + "step": 2816, + "time_per_iteration": 2.567267894744873 + }, + { + "auxiliary_loss_clip": 0.06582578, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 0.06305215, + "balance_loss_mlp": 0.01257404, + "epoch": 0.16936720276566963, + "flos": 22753588694400.0, + "grad_norm": 2.305113279035628, + "language_loss": 0.88275278, + "learning_rate": 3.799686673382153e-06, + "loss": 0.96141362, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.26074219, + "step": 2817, + "time_per_iteration": 2.55474853515625 + }, + { + "auxiliary_loss_clip": 0.06582828, + "auxiliary_loss_mlp": 0.0128986, + "balance_loss_clip": 0.06307572, + "balance_loss_mlp": 0.01264326, + "epoch": 0.1694273260183376, + "flos": 19580191338240.0, + "grad_norm": 1.9827332941616407, + "language_loss": 0.82882643, + "learning_rate": 3.799516750928672e-06, + "loss": 0.90755332, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.2557373, + "step": 2818, + "time_per_iteration": 4.006748676300049 + }, + { + "auxiliary_loss_clip": 0.06584448, + "auxiliary_loss_mlp": 0.01293023, + "balance_loss_clip": 0.06306577, + "balance_loss_mlp": 0.01267905, + "epoch": 0.16948744927100556, + "flos": 12462636044160.0, + "grad_norm": 2.7889091010227367, + "language_loss": 0.81285071, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2512207, + "step": 2819, + "time_per_iteration": 2.513493537902832 + }, + { + "auxiliary_loss_clip": 0.06486231, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06326687, + "balance_loss_mlp": 0.01257299, + "epoch": 0.16954757252367353, + "flos": 71309470164480.0, + "grad_norm": 0.8945207214981431, + "language_loss": 0.6004045, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.67791533, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07537842, + "step": 2820, + "time_per_iteration": 3.0841901302337646 + }, + { + "auxiliary_loss_clip": 0.06583934, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06305862, + "balance_loss_mlp": 0.01258656, + "epoch": 0.1696076957763415, + "flos": 29614237770240.0, + "grad_norm": 2.2684361224992315, + "language_loss": 0.79040307, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.86907649, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24755859, + "step": 2821, + "time_per_iteration": 4.0664753913879395 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01287682, + "balance_loss_clip": 0.06301302, + "balance_loss_mlp": 0.01260884, + "epoch": 0.16966781902900946, + "flos": 24395359668480.0, + "grad_norm": 4.427680473234215, + "language_loss": 0.79946303, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.87814403, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.26782227, + "step": 2822, + "time_per_iteration": 2.591439962387085 + }, + { + "auxiliary_loss_clip": 0.06573688, + "auxiliary_loss_mlp": 0.01292623, + "balance_loss_clip": 0.06300368, + "balance_loss_mlp": 0.0126834, + "epoch": 0.16972794228167745, + "flos": 23045392938240.0, + "grad_norm": 1.79403732378333, + "language_loss": 0.75404185, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83270496, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24291992, + "step": 2823, + "time_per_iteration": 2.607241153717041 + }, + { + "auxiliary_loss_clip": 0.06584911, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06309374, + "balance_loss_mlp": 0.01270704, + "epoch": 0.16978806553434542, + "flos": 35237915746560.0, + "grad_norm": 1.9541945473914888, + "language_loss": 0.60637134, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.68518329, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.2557373, + "step": 2824, + "time_per_iteration": 4.110937595367432 + }, + { + "auxiliary_loss_clip": 0.06588213, + "auxiliary_loss_mlp": 0.01295922, + "balance_loss_clip": 0.06311615, + "balance_loss_mlp": 0.01271114, + "epoch": 0.16984818878701338, + "flos": 32022366986880.0, + "grad_norm": 1.641592491230249, + "language_loss": 0.73562557, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.81446695, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24816895, + "step": 2825, + "time_per_iteration": 2.634206533432007 + }, + { + "auxiliary_loss_clip": 0.06593174, + "auxiliary_loss_mlp": 0.01295449, + "balance_loss_clip": 0.06306911, + "balance_loss_mlp": 0.01267411, + "epoch": 0.16990831203968135, + "flos": 22824936046080.0, + "grad_norm": 2.0964880275629465, + "language_loss": 0.86494017, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.94382638, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.28051758, + "step": 2826, + "time_per_iteration": 4.0616254806518555 + }, + { + "auxiliary_loss_clip": 0.0658946, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 0.0630484, + "balance_loss_mlp": 0.01260122, + "epoch": 0.1699684352923493, + "flos": 23046315333120.0, + "grad_norm": 1.7026807922554432, + "language_loss": 0.83019429, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.90896332, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.27307129, + "step": 2827, + "time_per_iteration": 2.5943539142608643 + }, + { + "auxiliary_loss_clip": 0.0658665, + "auxiliary_loss_mlp": 0.01291922, + "balance_loss_clip": 0.06303778, + "balance_loss_mlp": 0.01264206, + "epoch": 0.17002855854501728, + "flos": 21440532487680.0, + "grad_norm": 1.9993521816112911, + "language_loss": 0.75042886, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82921457, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.27722168, + "step": 2828, + "time_per_iteration": 2.5574147701263428 + }, + { + "auxiliary_loss_clip": 0.06457284, + "auxiliary_loss_mlp": 0.01264115, + "balance_loss_clip": 0.06297607, + "balance_loss_mlp": 0.01257433, + "epoch": 0.17008868179768524, + "flos": 71473966928640.0, + "grad_norm": 0.7544805989931621, + "language_loss": 0.56274545, + "learning_rate": 3.797643101661336e-06, + "loss": 0.63995945, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06695557, + "step": 2829, + "time_per_iteration": 3.2194459438323975 + }, + { + "auxiliary_loss_clip": 0.06582125, + "auxiliary_loss_mlp": 0.01292929, + "balance_loss_clip": 0.06305368, + "balance_loss_mlp": 0.01267168, + "epoch": 0.17014880505035324, + "flos": 24907327315200.0, + "grad_norm": 1.8200636755843338, + "language_loss": 0.84280431, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.9215548, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.25769043, + "step": 2830, + "time_per_iteration": 2.5831046104431152 + }, + { + "auxiliary_loss_clip": 0.0658033, + "auxiliary_loss_mlp": 0.01286886, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.0125985, + "epoch": 0.1702089283030212, + "flos": 29870263520640.0, + "grad_norm": 2.350653052094916, + "language_loss": 0.78878641, + "learning_rate": 3.797301551737529e-06, + "loss": 0.86745858, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.2701416, + "step": 2831, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.06581105, + "auxiliary_loss_mlp": 0.01292582, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01266975, + "epoch": 0.17026905155568917, + "flos": 17749171918080.0, + "grad_norm": 2.0319157009696327, + "language_loss": 0.80466926, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.88340604, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.25610352, + "step": 2832, + "time_per_iteration": 2.5211668014526367 + }, + { + "auxiliary_loss_clip": 0.06573536, + "auxiliary_loss_mlp": 0.01286888, + "balance_loss_clip": 0.06297776, + "balance_loss_mlp": 0.0126133, + "epoch": 0.17032917480835713, + "flos": 23155327895040.0, + "grad_norm": 1.986078489446087, + "language_loss": 0.89480335, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.97340751, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.25561523, + "step": 2833, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06575279, + "auxiliary_loss_mlp": 0.01285966, + "balance_loss_clip": 0.06302077, + "balance_loss_mlp": 0.01261123, + "epoch": 0.1703892980610251, + "flos": 39211940465280.0, + "grad_norm": 2.220027390834487, + "language_loss": 0.73524815, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.81386054, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24829102, + "step": 2834, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.06581013, + "auxiliary_loss_mlp": 0.01285804, + "balance_loss_clip": 0.06300581, + "balance_loss_mlp": 0.01260245, + "epoch": 0.17044942131369306, + "flos": 23045728354560.0, + "grad_norm": 1.8327084439605401, + "language_loss": 0.87308288, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.2557373, + "step": 2835, + "time_per_iteration": 2.656421661376953 + }, + { + "auxiliary_loss_clip": 0.06579748, + "auxiliary_loss_mlp": 0.01283404, + "balance_loss_clip": 0.06297451, + "balance_loss_mlp": 0.01256451, + "epoch": 0.17050954456636103, + "flos": 17060533937280.0, + "grad_norm": 2.3811755619363058, + "language_loss": 0.75235045, + "learning_rate": 3.796446484348989e-06, + "loss": 0.83098197, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.26940918, + "step": 2836, + "time_per_iteration": 2.4939451217651367 + }, + { + "auxiliary_loss_clip": 0.06577778, + "auxiliary_loss_mlp": 0.01283432, + "balance_loss_clip": 0.06295718, + "balance_loss_mlp": 0.01256955, + "epoch": 0.17056966781902902, + "flos": 16842634594560.0, + "grad_norm": 2.2113478912931606, + "language_loss": 0.81597924, + "learning_rate": 3.796275266481036e-06, + "loss": 0.89459133, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.26501465, + "step": 2837, + "time_per_iteration": 2.5308785438537598 + }, + { + "auxiliary_loss_clip": 0.06567004, + "auxiliary_loss_mlp": 0.01296468, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01272149, + "epoch": 0.17062979107169698, + "flos": 17718340815360.0, + "grad_norm": 2.307982469607828, + "language_loss": 0.84291762, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92155236, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.24328613, + "step": 2838, + "time_per_iteration": 2.509929895401001 + }, + { + "auxiliary_loss_clip": 0.06570365, + "auxiliary_loss_mlp": 0.01284738, + "balance_loss_clip": 0.06295732, + "balance_loss_mlp": 0.01260264, + "epoch": 0.17068991432436495, + "flos": 22531035450240.0, + "grad_norm": 1.8555127422179185, + "language_loss": 0.94406807, + "learning_rate": 3.795932626406812e-06, + "loss": 1.02261913, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.24450684, + "step": 2839, + "time_per_iteration": 2.588021755218506 + }, + { + "auxiliary_loss_clip": 0.06569307, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06293422, + "balance_loss_mlp": 0.01256808, + "epoch": 0.17075003757703291, + "flos": 25889698183680.0, + "grad_norm": 2.1000046554588394, + "language_loss": 0.84480917, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.28100586, + "step": 2840, + "time_per_iteration": 2.5653579235076904 + }, + { + "auxiliary_loss_clip": 0.06573716, + "auxiliary_loss_mlp": 0.01290397, + "balance_loss_clip": 0.06298221, + "balance_loss_mlp": 0.01263503, + "epoch": 0.17081016082970088, + "flos": 20126931229440.0, + "grad_norm": 1.871912800472889, + "language_loss": 0.76954079, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8481819, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26879883, + "step": 2841, + "time_per_iteration": 2.5525524616241455 + }, + { + "auxiliary_loss_clip": 0.06573537, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06297247, + "balance_loss_mlp": 0.01257026, + "epoch": 0.17087028408236885, + "flos": 24943441224960.0, + "grad_norm": 1.6793065618865832, + "language_loss": 0.77364486, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.85220695, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2565918, + "step": 2842, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.06561892, + "auxiliary_loss_mlp": 0.01282368, + "balance_loss_clip": 0.06295875, + "balance_loss_mlp": 0.01257489, + "epoch": 0.17093040733503684, + "flos": 19063108592640.0, + "grad_norm": 1.967223672886595, + "language_loss": 0.87176019, + "learning_rate": 3.795246529087043e-06, + "loss": 0.95020282, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.24853516, + "step": 2843, + "time_per_iteration": 2.546586036682129 + }, + { + "auxiliary_loss_clip": 0.06571361, + "auxiliary_loss_mlp": 0.01285811, + "balance_loss_clip": 0.06299275, + "balance_loss_mlp": 0.01262339, + "epoch": 0.1709905305877048, + "flos": 13083993596160.0, + "grad_norm": 1.8800221555677419, + "language_loss": 0.69446707, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7730388, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.23461914, + "step": 2844, + "time_per_iteration": 2.5857818126678467 + }, + { + "auxiliary_loss_clip": 0.06575634, + "auxiliary_loss_mlp": 0.01288208, + "balance_loss_clip": 0.06299984, + "balance_loss_mlp": 0.0126346, + "epoch": 0.17105065384037277, + "flos": 19215530369280.0, + "grad_norm": 1.7660184935388845, + "language_loss": 0.79213876, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.87077713, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.24780273, + "step": 2845, + "time_per_iteration": 2.5564208030700684 + }, + { + "auxiliary_loss_clip": 0.06577709, + "auxiliary_loss_mlp": 0.01293667, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01268586, + "epoch": 0.17111077709304073, + "flos": 18521106456960.0, + "grad_norm": 2.255753625544696, + "language_loss": 0.79110825, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.86982203, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25085449, + "step": 2846, + "time_per_iteration": 2.513607978820801 + }, + { + "auxiliary_loss_clip": 0.06568472, + "auxiliary_loss_mlp": 0.01290569, + "balance_loss_clip": 0.06298524, + "balance_loss_mlp": 0.01266727, + "epoch": 0.1711709003457087, + "flos": 25089699726720.0, + "grad_norm": 1.7214534237870849, + "language_loss": 0.80675447, + "learning_rate": 3.794559342552472e-06, + "loss": 0.88534492, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23852539, + "step": 2847, + "time_per_iteration": 2.618793249130249 + }, + { + "auxiliary_loss_clip": 0.06569728, + "auxiliary_loss_mlp": 0.01293508, + "balance_loss_clip": 0.0629475, + "balance_loss_mlp": 0.01268796, + "epoch": 0.17123102359837666, + "flos": 17572124240640.0, + "grad_norm": 2.2846174525506973, + "language_loss": 0.88074541, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.95937777, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.24719238, + "step": 2848, + "time_per_iteration": 2.487272024154663 + }, + { + "auxiliary_loss_clip": 0.06569223, + "auxiliary_loss_mlp": 0.01309638, + "balance_loss_clip": 0.06294799, + "balance_loss_mlp": 0.01284688, + "epoch": 0.17129114685104463, + "flos": 26180244616320.0, + "grad_norm": 1.906108969463994, + "language_loss": 0.76101243, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83980107, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24938965, + "step": 2849, + "time_per_iteration": 2.620347738265991 + }, + { + "auxiliary_loss_clip": 0.06449599, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01257077, + "epoch": 0.17135127010371262, + "flos": 69290696943360.0, + "grad_norm": 0.770033327211451, + "language_loss": 0.57434958, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.65149075, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 2850, + "time_per_iteration": 3.1464109420776367 + }, + { + "auxiliary_loss_clip": 0.0656237, + "auxiliary_loss_mlp": 0.01301725, + "balance_loss_clip": 0.06296088, + "balance_loss_mlp": 0.01277966, + "epoch": 0.1714113933563806, + "flos": 23556857460480.0, + "grad_norm": 2.479535747356738, + "language_loss": 0.81586778, + "learning_rate": 3.793871067220031e-06, + "loss": 0.89450872, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.23742676, + "step": 2851, + "time_per_iteration": 2.558507204055786 + }, + { + "auxiliary_loss_clip": 0.06565535, + "auxiliary_loss_mlp": 0.01289531, + "balance_loss_clip": 0.06298645, + "balance_loss_mlp": 0.01267119, + "epoch": 0.17147151660904855, + "flos": 21148854024960.0, + "grad_norm": 2.2154108843285107, + "language_loss": 0.94662631, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.025177, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22412109, + "step": 2852, + "time_per_iteration": 2.518974542617798 + }, + { + "auxiliary_loss_clip": 0.0657506, + "auxiliary_loss_mlp": 0.01290477, + "balance_loss_clip": 0.06300224, + "balance_loss_mlp": 0.01264299, + "epoch": 0.17153163986171652, + "flos": 18630873705600.0, + "grad_norm": 1.8056831581423547, + "language_loss": 0.70245004, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7811054, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.26184082, + "step": 2853, + "time_per_iteration": 2.552562952041626 + }, + { + "auxiliary_loss_clip": 0.06576742, + "auxiliary_loss_mlp": 0.01296459, + "balance_loss_clip": 0.06299934, + "balance_loss_mlp": 0.01271663, + "epoch": 0.17159176311438448, + "flos": 18229134504960.0, + "grad_norm": 2.1946039611354418, + "language_loss": 0.67477524, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.75350726, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.2479248, + "step": 2854, + "time_per_iteration": 2.5350561141967773 + }, + { + "auxiliary_loss_clip": 0.06572944, + "auxiliary_loss_mlp": 0.0128611, + "balance_loss_clip": 0.06305773, + "balance_loss_mlp": 0.01263973, + "epoch": 0.17165188636705245, + "flos": 20744976545280.0, + "grad_norm": 1.5291061865624715, + "language_loss": 0.89537871, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.97396928, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.22143555, + "step": 2855, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.06575546, + "auxiliary_loss_mlp": 0.01295321, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01271145, + "epoch": 0.17171200961972044, + "flos": 24906824190720.0, + "grad_norm": 2.4271457535299654, + "language_loss": 0.84835625, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.9270649, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24206543, + "step": 2856, + "time_per_iteration": 2.551483392715454 + }, + { + "auxiliary_loss_clip": 0.06575087, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 0.0630254, + "balance_loss_mlp": 0.01267528, + "epoch": 0.1717721328723884, + "flos": 20163464409600.0, + "grad_norm": 7.491722293090189, + "language_loss": 0.87615776, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95481372, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23010254, + "step": 2857, + "time_per_iteration": 4.012267112731934 + }, + { + "auxiliary_loss_clip": 0.06572698, + "auxiliary_loss_mlp": 0.01287955, + "balance_loss_clip": 0.06301427, + "balance_loss_mlp": 0.01262385, + "epoch": 0.17183225612505637, + "flos": 23367357452160.0, + "grad_norm": 2.309816452702101, + "language_loss": 0.78393459, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86254114, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25585938, + "step": 2858, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.06589144, + "auxiliary_loss_mlp": 0.0128985, + "balance_loss_clip": 0.06303509, + "balance_loss_mlp": 0.0126453, + "epoch": 0.17189237937772434, + "flos": 18120163870080.0, + "grad_norm": 2.664171996061716, + "language_loss": 0.77798349, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85677344, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.25317383, + "step": 2859, + "time_per_iteration": 2.5043106079101562 + }, + { + "auxiliary_loss_clip": 0.06572397, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 0.06301641, + "balance_loss_mlp": 0.01258939, + "epoch": 0.1719525026303923, + "flos": 23265137070720.0, + "grad_norm": 5.679736885155129, + "language_loss": 0.77697283, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.85551292, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22692871, + "step": 2860, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.06574808, + "auxiliary_loss_mlp": 0.01292828, + "balance_loss_clip": 0.06301817, + "balance_loss_mlp": 0.01270583, + "epoch": 0.17201262588306027, + "flos": 20816156188800.0, + "grad_norm": 2.1792765136561036, + "language_loss": 0.82509398, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90377033, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 2861, + "time_per_iteration": 3.9328150749206543 + }, + { + "auxiliary_loss_clip": 0.06577721, + "auxiliary_loss_mlp": 0.01292683, + "balance_loss_clip": 0.06305138, + "balance_loss_mlp": 0.0126896, + "epoch": 0.17207274913572823, + "flos": 20382076512000.0, + "grad_norm": 2.450020121503541, + "language_loss": 0.8692534, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.9479574, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23718262, + "step": 2862, + "time_per_iteration": 2.533240795135498 + }, + { + "auxiliary_loss_clip": 0.06570788, + "auxiliary_loss_mlp": 0.01286464, + "balance_loss_clip": 0.06306001, + "balance_loss_mlp": 0.01264387, + "epoch": 0.17213287238839622, + "flos": 26805082112640.0, + "grad_norm": 1.8452916722599864, + "language_loss": 0.78642774, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.86500025, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22045898, + "step": 2863, + "time_per_iteration": 2.5886759757995605 + }, + { + "auxiliary_loss_clip": 0.06585991, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06318994, + "balance_loss_mlp": 0.01270336, + "epoch": 0.1721929956410642, + "flos": 26037927256320.0, + "grad_norm": 1.9522517065159992, + "language_loss": 0.73622, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.81500947, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.22631836, + "step": 2864, + "time_per_iteration": 4.05191445350647 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01292894, + "balance_loss_clip": 0.06309051, + "balance_loss_mlp": 0.01269362, + "epoch": 0.17225311889373215, + "flos": 22279621674240.0, + "grad_norm": 1.6774687827131978, + "language_loss": 0.73856592, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.81732178, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.23535156, + "step": 2865, + "time_per_iteration": 3.9612531661987305 + }, + { + "auxiliary_loss_clip": 0.06584621, + "auxiliary_loss_mlp": 0.01305521, + "balance_loss_clip": 0.06313194, + "balance_loss_mlp": 0.01281405, + "epoch": 0.17231324214640012, + "flos": 21294106277760.0, + "grad_norm": 2.4869534197111385, + "language_loss": 0.79160404, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.87050545, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.24121094, + "step": 2866, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.06586975, + "auxiliary_loss_mlp": 0.01295234, + "balance_loss_clip": 0.0631168, + "balance_loss_mlp": 0.01269688, + "epoch": 0.17237336539906808, + "flos": 19686520569600.0, + "grad_norm": 2.39942640082668, + "language_loss": 0.80413449, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.8829565, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.25549316, + "step": 2867, + "time_per_iteration": 2.524634599685669 + }, + { + "auxiliary_loss_clip": 0.06577912, + "auxiliary_loss_mlp": 0.01291096, + "balance_loss_clip": 0.063054, + "balance_loss_mlp": 0.01267123, + "epoch": 0.17243348865173605, + "flos": 17535339498240.0, + "grad_norm": 1.6440546002054504, + "language_loss": 0.80347586, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.88216591, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.23974609, + "step": 2868, + "time_per_iteration": 2.5237460136413574 + }, + { + "auxiliary_loss_clip": 0.06586674, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 0.06310418, + "balance_loss_mlp": 0.01270241, + "epoch": 0.17249361190440402, + "flos": 18265751539200.0, + "grad_norm": 1.9212015042396675, + "language_loss": 0.84995282, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.92876148, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23937988, + "step": 2869, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.06574747, + "auxiliary_loss_mlp": 0.01290391, + "balance_loss_clip": 0.06302473, + "balance_loss_mlp": 0.01265393, + "epoch": 0.172553735157072, + "flos": 21180020544000.0, + "grad_norm": 2.372251531694949, + "language_loss": 0.78318757, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.861839, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.25, + "step": 2870, + "time_per_iteration": 2.6494200229644775 + }, + { + "auxiliary_loss_clip": 0.06572236, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 0.06308384, + "balance_loss_mlp": 0.01266083, + "epoch": 0.17261385840973997, + "flos": 22279831309440.0, + "grad_norm": 1.8100610801094352, + "language_loss": 0.77937269, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.85796487, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.20910645, + "step": 2871, + "time_per_iteration": 2.6145200729370117 + }, + { + "auxiliary_loss_clip": 0.06580749, + "auxiliary_loss_mlp": 0.01284391, + "balance_loss_clip": 0.06306709, + "balance_loss_mlp": 0.01260263, + "epoch": 0.17267398166240794, + "flos": 27928680238080.0, + "grad_norm": 2.361348336036686, + "language_loss": 0.75478256, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.83343399, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24157715, + "step": 2872, + "time_per_iteration": 2.598762035369873 + }, + { + "auxiliary_loss_clip": 0.06570577, + "auxiliary_loss_mlp": 0.01297063, + "balance_loss_clip": 0.06302171, + "balance_loss_mlp": 0.01274067, + "epoch": 0.1727341049150759, + "flos": 21951661593600.0, + "grad_norm": 1.9699566193216007, + "language_loss": 0.83421481, + "learning_rate": 3.790066109323988e-06, + "loss": 0.91289121, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23010254, + "step": 2873, + "time_per_iteration": 2.5375001430511475 + }, + { + "auxiliary_loss_clip": 0.06575856, + "auxiliary_loss_mlp": 0.01290457, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01266198, + "epoch": 0.17279422816774387, + "flos": 18112742784000.0, + "grad_norm": 2.023952379864123, + "language_loss": 0.75553465, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83419782, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24243164, + "step": 2874, + "time_per_iteration": 2.6628403663635254 + }, + { + "auxiliary_loss_clip": 0.06583337, + "auxiliary_loss_mlp": 0.01288686, + "balance_loss_clip": 0.06308968, + "balance_loss_mlp": 0.01261959, + "epoch": 0.17285435142041183, + "flos": 21841936272000.0, + "grad_norm": 2.156422611189301, + "language_loss": 0.81707162, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89579183, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.26733398, + "step": 2875, + "time_per_iteration": 2.5195512771606445 + }, + { + "auxiliary_loss_clip": 0.06576921, + "auxiliary_loss_mlp": 0.0129142, + "balance_loss_clip": 0.06303119, + "balance_loss_mlp": 0.01265122, + "epoch": 0.17291447467307983, + "flos": 18374219049600.0, + "grad_norm": 2.297860169925143, + "language_loss": 0.89334786, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9720313, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.26281738, + "step": 2876, + "time_per_iteration": 2.5156540870666504 + }, + { + "auxiliary_loss_clip": 0.06576936, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01262793, + "epoch": 0.1729745979257478, + "flos": 18630580216320.0, + "grad_norm": 2.037856806425618, + "language_loss": 0.85539293, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93403006, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23986816, + "step": 2877, + "time_per_iteration": 2.4874324798583984 + }, + { + "auxiliary_loss_clip": 0.06576495, + "auxiliary_loss_mlp": 0.01284602, + "balance_loss_clip": 0.06305559, + "balance_loss_mlp": 0.01260593, + "epoch": 0.17303472117841576, + "flos": 23004122002560.0, + "grad_norm": 1.956584823379214, + "language_loss": 0.79972547, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.87833643, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23986816, + "step": 2878, + "time_per_iteration": 2.5546791553497314 + }, + { + "auxiliary_loss_clip": 0.06571983, + "auxiliary_loss_mlp": 0.01289115, + "balance_loss_clip": 0.06302349, + "balance_loss_mlp": 0.01264558, + "epoch": 0.17309484443108372, + "flos": 25671169935360.0, + "grad_norm": 1.824315336901638, + "language_loss": 0.72073978, + "learning_rate": 3.78902268871344e-06, + "loss": 0.79935074, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24584961, + "step": 2879, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06575425, + "auxiliary_loss_mlp": 0.01284736, + "balance_loss_clip": 0.06301329, + "balance_loss_mlp": 0.01260048, + "epoch": 0.1731549676837517, + "flos": 13557960616320.0, + "grad_norm": 1.9540483547981324, + "language_loss": 0.8431474, + "learning_rate": 3.78884854780014e-06, + "loss": 0.921749, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24682617, + "step": 2880, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.06579134, + "auxiliary_loss_mlp": 0.01281408, + "balance_loss_clip": 0.06303075, + "balance_loss_mlp": 0.01256565, + "epoch": 0.17321509093641965, + "flos": 22863733286400.0, + "grad_norm": 3.3854797576129525, + "language_loss": 0.82168967, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.90029514, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.2487793, + "step": 2881, + "time_per_iteration": 2.5265071392059326 + }, + { + "auxiliary_loss_clip": 0.06575799, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.0125904, + "epoch": 0.17327521418908762, + "flos": 24359665029120.0, + "grad_norm": 1.8504646386399068, + "language_loss": 0.77975154, + "learning_rate": 3.788500062480197e-06, + "loss": 0.85834455, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.24450684, + "step": 2882, + "time_per_iteration": 2.56476092338562 + }, + { + "auxiliary_loss_clip": 0.0657361, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.01260495, + "epoch": 0.1733353374417556, + "flos": 33113373073920.0, + "grad_norm": 2.021690524452963, + "language_loss": 0.77161384, + "learning_rate": 3.788325718086769e-06, + "loss": 0.85016787, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 2883, + "time_per_iteration": 2.6154749393463135 + }, + { + "auxiliary_loss_clip": 0.06569435, + "auxiliary_loss_mlp": 0.01278991, + "balance_loss_clip": 0.06301424, + "balance_loss_mlp": 0.01256365, + "epoch": 0.17339546069442358, + "flos": 24395778938880.0, + "grad_norm": 4.943843215515709, + "language_loss": 0.86164784, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.94013214, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 2884, + "time_per_iteration": 2.5598208904266357 + }, + { + "auxiliary_loss_clip": 0.06577636, + "auxiliary_loss_mlp": 0.01280409, + "balance_loss_clip": 0.06308297, + "balance_loss_mlp": 0.01256878, + "epoch": 0.17345558394709154, + "flos": 27461589252480.0, + "grad_norm": 1.714045228397976, + "language_loss": 0.75027329, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82885373, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.23535156, + "step": 2885, + "time_per_iteration": 2.584550619125366 + }, + { + "auxiliary_loss_clip": 0.06567928, + "auxiliary_loss_mlp": 0.01282091, + "balance_loss_clip": 0.06304367, + "balance_loss_mlp": 0.01259954, + "epoch": 0.1735157071997595, + "flos": 24689260264320.0, + "grad_norm": 1.6836608181022428, + "language_loss": 0.71760321, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.79610336, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22131348, + "step": 2886, + "time_per_iteration": 2.5990986824035645 + }, + { + "auxiliary_loss_clip": 0.06574686, + "auxiliary_loss_mlp": 0.01280319, + "balance_loss_clip": 0.06304233, + "balance_loss_mlp": 0.01257275, + "epoch": 0.17357583045242747, + "flos": 21695300426880.0, + "grad_norm": 2.252280410203818, + "language_loss": 0.70329314, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.23071289, + "step": 2887, + "time_per_iteration": 2.528995990753174 + }, + { + "auxiliary_loss_clip": 0.0657585, + "auxiliary_loss_mlp": 0.0127978, + "balance_loss_clip": 0.06305341, + "balance_loss_mlp": 0.01258155, + "epoch": 0.17363595370509544, + "flos": 15380846190720.0, + "grad_norm": 1.8987045627788157, + "language_loss": 0.85982835, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93838477, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.21618652, + "step": 2888, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.06585068, + "auxiliary_loss_mlp": 0.0128524, + "balance_loss_clip": 0.06313335, + "balance_loss_mlp": 0.01262077, + "epoch": 0.1736960769577634, + "flos": 23447719117440.0, + "grad_norm": 1.9850534312792847, + "language_loss": 0.79895031, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.87765336, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23193359, + "step": 2889, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.06572761, + "auxiliary_loss_mlp": 0.01291973, + "balance_loss_clip": 0.06309643, + "balance_loss_mlp": 0.01268966, + "epoch": 0.1737562002104314, + "flos": 18593711619840.0, + "grad_norm": 2.1673011596526743, + "language_loss": 0.85773498, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93638229, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23010254, + "step": 2890, + "time_per_iteration": 2.5268702507019043 + }, + { + "auxiliary_loss_clip": 0.06571183, + "auxiliary_loss_mlp": 0.0127752, + "balance_loss_clip": 0.06302673, + "balance_loss_mlp": 0.0125493, + "epoch": 0.17381632346309936, + "flos": 16003629262080.0, + "grad_norm": 2.262236435886973, + "language_loss": 0.8327142, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91120124, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22595215, + "step": 2891, + "time_per_iteration": 2.4975481033325195 + }, + { + "auxiliary_loss_clip": 0.065819, + "auxiliary_loss_mlp": 0.01286901, + "balance_loss_clip": 0.06304774, + "balance_loss_mlp": 0.01263512, + "epoch": 0.17387644671576732, + "flos": 13374749664000.0, + "grad_norm": 2.593478250918492, + "language_loss": 0.82133532, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.9000234, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.23388672, + "step": 2892, + "time_per_iteration": 2.488811492919922 + }, + { + "auxiliary_loss_clip": 0.06588026, + "auxiliary_loss_mlp": 0.0128266, + "balance_loss_clip": 0.06313482, + "balance_loss_mlp": 0.0125759, + "epoch": 0.1739365699684353, + "flos": 26622877409280.0, + "grad_norm": 1.869199176824797, + "language_loss": 0.7570942, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83580112, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25061035, + "step": 2893, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.06578243, + "auxiliary_loss_mlp": 0.01282281, + "balance_loss_clip": 0.06306182, + "balance_loss_mlp": 0.01257903, + "epoch": 0.17399669322110325, + "flos": 23374736611200.0, + "grad_norm": 1.8950837051329763, + "language_loss": 0.82900345, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.90760863, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24377441, + "step": 2894, + "time_per_iteration": 2.5567498207092285 + }, + { + "auxiliary_loss_clip": 0.06582697, + "auxiliary_loss_mlp": 0.01287491, + "balance_loss_clip": 0.06309928, + "balance_loss_mlp": 0.01263232, + "epoch": 0.17405681647377122, + "flos": 22060590301440.0, + "grad_norm": 2.244882299044818, + "language_loss": 0.74999332, + "learning_rate": 3.786228297806741e-06, + "loss": 0.82869518, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.24279785, + "step": 2895, + "time_per_iteration": 2.535771369934082 + }, + { + "auxiliary_loss_clip": 0.06500985, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06341717, + "balance_loss_mlp": 0.01244449, + "epoch": 0.1741169397264392, + "flos": 61476537530880.0, + "grad_norm": 0.8158755233881254, + "language_loss": 0.62716168, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.7046932, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.0770874, + "step": 2896, + "time_per_iteration": 3.260303497314453 + }, + { + "auxiliary_loss_clip": 0.06578183, + "auxiliary_loss_mlp": 0.01278967, + "balance_loss_clip": 0.06304477, + "balance_loss_mlp": 0.01254791, + "epoch": 0.17417706297910718, + "flos": 27025245515520.0, + "grad_norm": 1.768440838457988, + "language_loss": 0.76261735, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84118891, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.24157715, + "step": 2897, + "time_per_iteration": 3.9564483165740967 + }, + { + "auxiliary_loss_clip": 0.06567717, + "auxiliary_loss_mlp": 0.01283821, + "balance_loss_clip": 0.06302972, + "balance_loss_mlp": 0.01260325, + "epoch": 0.17423718623177514, + "flos": 33516957064320.0, + "grad_norm": 2.1770598890745694, + "language_loss": 0.7037769, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.78229225, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23486328, + "step": 2898, + "time_per_iteration": 2.6747710704803467 + }, + { + "auxiliary_loss_clip": 0.06586026, + "auxiliary_loss_mlp": 0.01283538, + "balance_loss_clip": 0.0630955, + "balance_loss_mlp": 0.01261008, + "epoch": 0.1742973094844431, + "flos": 27205982772480.0, + "grad_norm": 2.322018652940294, + "language_loss": 0.77535176, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.85404742, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.22509766, + "step": 2899, + "time_per_iteration": 2.5824503898620605 + }, + { + "auxiliary_loss_clip": 0.06566149, + "auxiliary_loss_mlp": 0.01285927, + "balance_loss_clip": 0.06301811, + "balance_loss_mlp": 0.01263611, + "epoch": 0.17435743273711107, + "flos": 22717307076480.0, + "grad_norm": 1.8730005414784603, + "language_loss": 0.7345652, + "learning_rate": 3.785351493339121e-06, + "loss": 0.81308603, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.22302246, + "step": 2900, + "time_per_iteration": 3.9656574726104736 + }, + { + "auxiliary_loss_clip": 0.06572049, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06301104, + "balance_loss_mlp": 0.01259311, + "epoch": 0.17441755598977904, + "flos": 41656141664640.0, + "grad_norm": 1.6285149505686385, + "language_loss": 0.70661789, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7851662, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.23474121, + "step": 2901, + "time_per_iteration": 2.6915066242218018 + }, + { + "auxiliary_loss_clip": 0.06578797, + "auxiliary_loss_mlp": 0.01281619, + "balance_loss_clip": 0.06304422, + "balance_loss_mlp": 0.0125885, + "epoch": 0.174477679242447, + "flos": 26294372277120.0, + "grad_norm": 4.182093359181909, + "language_loss": 0.76958787, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.84819204, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.2277832, + "step": 2902, + "time_per_iteration": 2.58911395072937 + }, + { + "auxiliary_loss_clip": 0.06574767, + "auxiliary_loss_mlp": 0.0128676, + "balance_loss_clip": 0.06303128, + "balance_loss_mlp": 0.01265076, + "epoch": 0.174537802495115, + "flos": 17864221973760.0, + "grad_norm": 2.5386707468858942, + "language_loss": 0.82260907, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.90122437, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.21679688, + "step": 2903, + "time_per_iteration": 3.919084072113037 + }, + { + "auxiliary_loss_clip": 0.06573024, + "auxiliary_loss_mlp": 0.01291861, + "balance_loss_clip": 0.06307561, + "balance_loss_mlp": 0.01269139, + "epoch": 0.17459792574778296, + "flos": 16945441954560.0, + "grad_norm": 1.7914306748896518, + "language_loss": 0.7447511, + "learning_rate": 3.784648831112429e-06, + "loss": 0.82340002, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.22717285, + "step": 2904, + "time_per_iteration": 2.578841209411621 + }, + { + "auxiliary_loss_clip": 0.06575242, + "auxiliary_loss_mlp": 0.01290708, + "balance_loss_clip": 0.0630535, + "balance_loss_mlp": 0.01266592, + "epoch": 0.17465804900045093, + "flos": 25527049712640.0, + "grad_norm": 2.1432197986147004, + "language_loss": 0.65256733, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73122686, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 2905, + "time_per_iteration": 3.9871487617492676 + }, + { + "auxiliary_loss_clip": 0.06593791, + "auxiliary_loss_mlp": 0.0129467, + "balance_loss_clip": 0.06312381, + "balance_loss_mlp": 0.01270137, + "epoch": 0.1747181722531189, + "flos": 24135853973760.0, + "grad_norm": 2.2797831517729046, + "language_loss": 0.80441433, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.88329899, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.24511719, + "step": 2906, + "time_per_iteration": 2.556459903717041 + }, + { + "auxiliary_loss_clip": 0.065907, + "auxiliary_loss_mlp": 0.01299352, + "balance_loss_clip": 0.0631306, + "balance_loss_mlp": 0.01274306, + "epoch": 0.17477829550578686, + "flos": 17754580506240.0, + "grad_norm": 7.784703467250062, + "language_loss": 0.81983393, + "learning_rate": 3.784121123841449e-06, + "loss": 0.89873445, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.25024414, + "step": 2907, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.06586979, + "auxiliary_loss_mlp": 0.01293929, + "balance_loss_clip": 0.06311269, + "balance_loss_mlp": 0.01269777, + "epoch": 0.17483841875845482, + "flos": 15382732907520.0, + "grad_norm": 1.9551973542338994, + "language_loss": 0.82190001, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90070903, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.24133301, + "step": 2908, + "time_per_iteration": 2.5280957221984863 + }, + { + "auxiliary_loss_clip": 0.0658935, + "auxiliary_loss_mlp": 0.01308706, + "balance_loss_clip": 0.06314441, + "balance_loss_mlp": 0.01283279, + "epoch": 0.17489854201112282, + "flos": 17168624104320.0, + "grad_norm": 3.0308502496460243, + "language_loss": 0.8151319, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.89411247, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.25427246, + "step": 2909, + "time_per_iteration": 2.501805543899536 + }, + { + "auxiliary_loss_clip": 0.06591058, + "auxiliary_loss_mlp": 0.01307034, + "balance_loss_clip": 0.06313848, + "balance_loss_mlp": 0.01280235, + "epoch": 0.17495866526379078, + "flos": 19761347865600.0, + "grad_norm": 2.106593508541441, + "language_loss": 0.77213359, + "learning_rate": 3.783592807684017e-06, + "loss": 0.85111451, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.26818848, + "step": 2910, + "time_per_iteration": 2.5401246547698975 + }, + { + "auxiliary_loss_clip": 0.065902, + "auxiliary_loss_mlp": 0.01309875, + "balance_loss_clip": 0.06316847, + "balance_loss_mlp": 0.01282147, + "epoch": 0.17501878851645875, + "flos": 28518535854720.0, + "grad_norm": 6.625386462851426, + "language_loss": 0.8799597, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.27770996, + "step": 2911, + "time_per_iteration": 2.60190486907959 + }, + { + "auxiliary_loss_clip": 0.06591105, + "auxiliary_loss_mlp": 0.01300463, + "balance_loss_clip": 0.06318109, + "balance_loss_mlp": 0.0127537, + "epoch": 0.1750789117691267, + "flos": 17936994844800.0, + "grad_norm": 2.1857421016012832, + "language_loss": 0.90469962, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.98361528, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.2512207, + "step": 2912, + "time_per_iteration": 2.5914218425750732 + }, + { + "auxiliary_loss_clip": 0.06588344, + "auxiliary_loss_mlp": 0.01304507, + "balance_loss_clip": 0.06308792, + "balance_loss_mlp": 0.01277041, + "epoch": 0.17513903502179468, + "flos": 18265248414720.0, + "grad_norm": 2.129743219312126, + "language_loss": 0.74037218, + "learning_rate": 3.783063882820439e-06, + "loss": 0.81930077, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27453613, + "step": 2913, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06580269, + "auxiliary_loss_mlp": 0.01314219, + "balance_loss_clip": 0.06308483, + "balance_loss_mlp": 0.01289781, + "epoch": 0.17519915827446264, + "flos": 20711084768640.0, + "grad_norm": 1.8784732947097995, + "language_loss": 0.70240569, + "learning_rate": 3.782887439295741e-06, + "loss": 0.78135055, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.24450684, + "step": 2914, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.06575729, + "auxiliary_loss_mlp": 0.0130416, + "balance_loss_clip": 0.06304997, + "balance_loss_mlp": 0.01278935, + "epoch": 0.1752592815271306, + "flos": 20529928241280.0, + "grad_norm": 1.7233134110017265, + "language_loss": 0.94360971, + "learning_rate": 3.782710928163772e-06, + "loss": 1.0224086, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.25231934, + "step": 2915, + "time_per_iteration": 2.5500216484069824 + }, + { + "auxiliary_loss_clip": 0.06576817, + "auxiliary_loss_mlp": 0.01301313, + "balance_loss_clip": 0.06306335, + "balance_loss_mlp": 0.01277269, + "epoch": 0.1753194047797986, + "flos": 21805696581120.0, + "grad_norm": 1.6995224084103926, + "language_loss": 0.81995428, + "learning_rate": 3.782534349431226e-06, + "loss": 0.89873564, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.24060059, + "step": 2916, + "time_per_iteration": 2.6210248470306396 + }, + { + "auxiliary_loss_clip": 0.06578801, + "auxiliary_loss_mlp": 0.01308944, + "balance_loss_clip": 0.06305841, + "balance_loss_mlp": 0.01282694, + "epoch": 0.17537952803246656, + "flos": 20674719296640.0, + "grad_norm": 7.015160336993527, + "language_loss": 0.74587643, + "learning_rate": 3.782357703104799e-06, + "loss": 0.82475388, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.26245117, + "step": 2917, + "time_per_iteration": 2.5568697452545166 + }, + { + "auxiliary_loss_clip": 0.06575756, + "auxiliary_loss_mlp": 0.01293408, + "balance_loss_clip": 0.06306349, + "balance_loss_mlp": 0.01269018, + "epoch": 0.17543965128513453, + "flos": 23301837959040.0, + "grad_norm": 1.9034970134752385, + "language_loss": 0.77783519, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85652685, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.24414062, + "step": 2918, + "time_per_iteration": 2.592294692993164 + }, + { + "auxiliary_loss_clip": 0.06589542, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 0.06310425, + "balance_loss_mlp": 0.01271549, + "epoch": 0.1754997745378025, + "flos": 29103234445440.0, + "grad_norm": 2.152727236459042, + "language_loss": 0.75315654, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83200431, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.23693848, + "step": 2919, + "time_per_iteration": 2.67553973197937 + }, + { + "auxiliary_loss_clip": 0.06596158, + "auxiliary_loss_mlp": 0.01303514, + "balance_loss_clip": 0.06314485, + "balance_loss_mlp": 0.01279601, + "epoch": 0.17555989779047046, + "flos": 30379547836800.0, + "grad_norm": 1.8096477139902465, + "language_loss": 0.74872279, + "learning_rate": 3.781827358629228e-06, + "loss": 0.82771957, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.23925781, + "step": 2920, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.06577891, + "auxiliary_loss_mlp": 0.01294192, + "balance_loss_clip": 0.06307238, + "balance_loss_mlp": 0.01270982, + "epoch": 0.17562002104313842, + "flos": 23293284842880.0, + "grad_norm": 2.5308626608738423, + "language_loss": 0.80572176, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88444257, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.23217773, + "step": 2921, + "time_per_iteration": 2.51985502243042 + }, + { + "auxiliary_loss_clip": 0.06590457, + "auxiliary_loss_mlp": 0.01284789, + "balance_loss_clip": 0.06311172, + "balance_loss_mlp": 0.01260971, + "epoch": 0.1756801442958064, + "flos": 24797434285440.0, + "grad_norm": 1.5780045761030037, + "language_loss": 0.88755381, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.96630621, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.23815918, + "step": 2922, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.06584172, + "auxiliary_loss_mlp": 0.01290113, + "balance_loss_clip": 0.06306588, + "balance_loss_mlp": 0.01265211, + "epoch": 0.17574026754847438, + "flos": 25778086145280.0, + "grad_norm": 2.2356333874414043, + "language_loss": 0.63389397, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.71263683, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24890137, + "step": 2923, + "time_per_iteration": 2.56712007522583 + }, + { + "auxiliary_loss_clip": 0.06590886, + "auxiliary_loss_mlp": 0.01293522, + "balance_loss_clip": 0.06313786, + "balance_loss_mlp": 0.01269394, + "epoch": 0.17580039080114235, + "flos": 17462273137920.0, + "grad_norm": 2.8211803221017617, + "language_loss": 0.81614435, + "learning_rate": 3.78111928675413e-06, + "loss": 0.89498842, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.24145508, + "step": 2924, + "time_per_iteration": 2.5396065711975098 + }, + { + "auxiliary_loss_clip": 0.06586142, + "auxiliary_loss_mlp": 0.01294774, + "balance_loss_clip": 0.06306558, + "balance_loss_mlp": 0.01269108, + "epoch": 0.1758605140538103, + "flos": 14869633230720.0, + "grad_norm": 2.6608767055753244, + "language_loss": 0.71953624, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.79834545, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.25671387, + "step": 2925, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.0657725, + "auxiliary_loss_mlp": 0.01284494, + "balance_loss_clip": 0.06310555, + "balance_loss_mlp": 0.01261546, + "epoch": 0.17592063730647828, + "flos": 23011165745280.0, + "grad_norm": 1.6593164954495325, + "language_loss": 0.72342992, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.80204731, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22961426, + "step": 2926, + "time_per_iteration": 2.592061758041382 + }, + { + "auxiliary_loss_clip": 0.06592301, + "auxiliary_loss_mlp": 0.0128622, + "balance_loss_clip": 0.06310115, + "balance_loss_mlp": 0.01260911, + "epoch": 0.17598076055914624, + "flos": 20747911438080.0, + "grad_norm": 1.7750261498089963, + "language_loss": 0.85897779, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.93776292, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.25317383, + "step": 2927, + "time_per_iteration": 2.546537160873413 + }, + { + "auxiliary_loss_clip": 0.06583759, + "auxiliary_loss_mlp": 0.01277616, + "balance_loss_clip": 0.06312352, + "balance_loss_mlp": 0.01255431, + "epoch": 0.1760408838118142, + "flos": 34100607479040.0, + "grad_norm": 1.9484214610767971, + "language_loss": 0.72539592, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.80400968, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.22167969, + "step": 2928, + "time_per_iteration": 2.674516201019287 + }, + { + "auxiliary_loss_clip": 0.06577812, + "auxiliary_loss_mlp": 0.01278822, + "balance_loss_clip": 0.06308608, + "balance_loss_mlp": 0.01256292, + "epoch": 0.1761010070644822, + "flos": 24174902776320.0, + "grad_norm": 1.786019104625144, + "language_loss": 0.83572811, + "learning_rate": 3.780232677305744e-06, + "loss": 0.91429448, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22521973, + "step": 2929, + "time_per_iteration": 2.5528249740600586 + }, + { + "auxiliary_loss_clip": 0.06584716, + "auxiliary_loss_mlp": 0.01284422, + "balance_loss_clip": 0.06311291, + "balance_loss_mlp": 0.01261439, + "epoch": 0.17616113031715017, + "flos": 26583660898560.0, + "grad_norm": 1.8454669041222298, + "language_loss": 0.80018413, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.87887549, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2298584, + "step": 2930, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.06579742, + "auxiliary_loss_mlp": 0.01287089, + "balance_loss_clip": 0.06306133, + "balance_loss_mlp": 0.01261935, + "epoch": 0.17622125356981813, + "flos": 25673853265920.0, + "grad_norm": 2.4724081113031677, + "language_loss": 0.77905595, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.85772425, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.25195312, + "step": 2931, + "time_per_iteration": 2.580275774002075 + }, + { + "auxiliary_loss_clip": 0.0657528, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.063051, + "balance_loss_mlp": 0.01256988, + "epoch": 0.1762813768224861, + "flos": 16514129462400.0, + "grad_norm": 2.8370907048277973, + "language_loss": 0.75863802, + "learning_rate": 3.779699901503696e-06, + "loss": 0.83718544, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.22473145, + "step": 2932, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.06587049, + "auxiliary_loss_mlp": 0.0128414, + "balance_loss_clip": 0.06307124, + "balance_loss_mlp": 0.01258975, + "epoch": 0.17634150007515406, + "flos": 11215518600960.0, + "grad_norm": 2.570844699660862, + "language_loss": 0.90240741, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98111933, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.25146484, + "step": 2933, + "time_per_iteration": 2.5120935440063477 + }, + { + "auxiliary_loss_clip": 0.06578325, + "auxiliary_loss_mlp": 0.01278816, + "balance_loss_clip": 0.06313163, + "balance_loss_mlp": 0.01256893, + "epoch": 0.17640162332782203, + "flos": 23666750490240.0, + "grad_norm": 2.3821255620265376, + "language_loss": 0.89272201, + "learning_rate": 3.779344380192448e-06, + "loss": 0.97129339, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.21936035, + "step": 2934, + "time_per_iteration": 2.5753555297851562 + }, + { + "auxiliary_loss_clip": 0.06578338, + "auxiliary_loss_mlp": 0.0128005, + "balance_loss_clip": 0.0630947, + "balance_loss_mlp": 0.0125709, + "epoch": 0.17646174658049, + "flos": 53808819056640.0, + "grad_norm": 1.971590125699774, + "language_loss": 0.71700215, + "learning_rate": 3.779166518324077e-06, + "loss": 0.79558611, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2298584, + "step": 2935, + "time_per_iteration": 2.8537397384643555 + }, + { + "auxiliary_loss_clip": 0.06584434, + "auxiliary_loss_mlp": 0.01288458, + "balance_loss_clip": 0.06307955, + "balance_loss_mlp": 0.01264401, + "epoch": 0.17652186983315798, + "flos": 24250820175360.0, + "grad_norm": 8.554775287736033, + "language_loss": 0.71186781, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.79059678, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.24047852, + "step": 2936, + "time_per_iteration": 4.091250896453857 + }, + { + "auxiliary_loss_clip": 0.06580865, + "auxiliary_loss_mlp": 0.01286216, + "balance_loss_clip": 0.06309694, + "balance_loss_mlp": 0.01263745, + "epoch": 0.17658199308582595, + "flos": 27461715033600.0, + "grad_norm": 1.9442195602404513, + "language_loss": 0.72206265, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80073345, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.22473145, + "step": 2937, + "time_per_iteration": 2.5836215019226074 + }, + { + "auxiliary_loss_clip": 0.06581761, + "auxiliary_loss_mlp": 0.0128249, + "balance_loss_clip": 0.06303879, + "balance_loss_mlp": 0.01258088, + "epoch": 0.17664211633849392, + "flos": 22425167416320.0, + "grad_norm": 2.618384752485795, + "language_loss": 0.76896954, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.84761202, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.24389648, + "step": 2938, + "time_per_iteration": 2.5426154136657715 + }, + { + "auxiliary_loss_clip": 0.06581972, + "auxiliary_loss_mlp": 0.0129211, + "balance_loss_clip": 0.06306289, + "balance_loss_mlp": 0.01268638, + "epoch": 0.17670223959116188, + "flos": 24721642667520.0, + "grad_norm": 2.0224209621562803, + "language_loss": 0.72049117, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.79923201, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 2939, + "time_per_iteration": 4.034467935562134 + }, + { + "auxiliary_loss_clip": 0.06574269, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.06305616, + "balance_loss_mlp": 0.01258668, + "epoch": 0.17676236284382985, + "flos": 22533383364480.0, + "grad_norm": 2.2379803860691667, + "language_loss": 0.75736713, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.83592695, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.23034668, + "step": 2940, + "time_per_iteration": 2.6091058254241943 + }, + { + "auxiliary_loss_clip": 0.06579125, + "auxiliary_loss_mlp": 0.01284811, + "balance_loss_clip": 0.06305407, + "balance_loss_mlp": 0.01261494, + "epoch": 0.1768224860964978, + "flos": 12389988954240.0, + "grad_norm": 2.2625025035762443, + "language_loss": 0.86326134, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.94190073, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.2331543, + "step": 2941, + "time_per_iteration": 2.529346227645874 + }, + { + "auxiliary_loss_clip": 0.06590004, + "auxiliary_loss_mlp": 0.01292545, + "balance_loss_clip": 0.06314506, + "balance_loss_mlp": 0.01269073, + "epoch": 0.1768826093491658, + "flos": 24360335861760.0, + "grad_norm": 2.5150262997144806, + "language_loss": 0.78079373, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.8596192, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.23498535, + "step": 2942, + "time_per_iteration": 2.5893354415893555 + }, + { + "auxiliary_loss_clip": 0.06590073, + "auxiliary_loss_mlp": 0.01285718, + "balance_loss_clip": 0.06313878, + "balance_loss_mlp": 0.01261745, + "epoch": 0.17694273260183377, + "flos": 23593893765120.0, + "grad_norm": 1.793399089669822, + "language_loss": 0.81007993, + "learning_rate": 3.77774119516197e-06, + "loss": 0.88883781, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23962402, + "step": 2943, + "time_per_iteration": 4.085087537765503 + }, + { + "auxiliary_loss_clip": 0.065895, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 0.06311318, + "balance_loss_mlp": 0.01260266, + "epoch": 0.17700285585450173, + "flos": 26768297370240.0, + "grad_norm": 2.7078535987609524, + "language_loss": 0.81690747, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89564693, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.24194336, + "step": 2944, + "time_per_iteration": 4.037370204925537 + }, + { + "auxiliary_loss_clip": 0.06577846, + "auxiliary_loss_mlp": 0.01285687, + "balance_loss_clip": 0.06307179, + "balance_loss_mlp": 0.01262, + "epoch": 0.1770629791071697, + "flos": 42785986919040.0, + "grad_norm": 3.287704950657118, + "language_loss": 0.74187398, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.82050931, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.23693848, + "step": 2945, + "time_per_iteration": 2.726703405380249 + }, + { + "auxiliary_loss_clip": 0.06568955, + "auxiliary_loss_mlp": 0.01286818, + "balance_loss_clip": 0.06300092, + "balance_loss_mlp": 0.01263596, + "epoch": 0.17712310235983766, + "flos": 17350954588800.0, + "grad_norm": 3.5781735305150013, + "language_loss": 0.78848231, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.86704004, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23217773, + "step": 2946, + "time_per_iteration": 2.6050639152526855 + }, + { + "auxiliary_loss_clip": 0.06568858, + "auxiliary_loss_mlp": 0.01284865, + "balance_loss_clip": 0.06300168, + "balance_loss_mlp": 0.01262156, + "epoch": 0.17718322561250563, + "flos": 23885278738560.0, + "grad_norm": 1.9584306466242212, + "language_loss": 0.77679253, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85532975, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 2947, + "time_per_iteration": 2.562394857406616 + }, + { + "auxiliary_loss_clip": 0.06579228, + "auxiliary_loss_mlp": 0.01286605, + "balance_loss_clip": 0.06305858, + "balance_loss_mlp": 0.01262573, + "epoch": 0.1772433488651736, + "flos": 36475306116480.0, + "grad_norm": 3.3061595908349193, + "language_loss": 0.7337119, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.81237024, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.24023438, + "step": 2948, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06568594, + "auxiliary_loss_mlp": 0.01285694, + "balance_loss_clip": 0.06305531, + "balance_loss_mlp": 0.01263915, + "epoch": 0.1773034721178416, + "flos": 26691457576320.0, + "grad_norm": 2.3861566912178915, + "language_loss": 0.82720947, + "learning_rate": 3.776669371292171e-06, + "loss": 0.90575236, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.21777344, + "step": 2949, + "time_per_iteration": 2.6339261531829834 + }, + { + "auxiliary_loss_clip": 0.06558515, + "auxiliary_loss_mlp": 0.0129088, + "balance_loss_clip": 0.06397671, + "balance_loss_mlp": 0.01282136, + "epoch": 0.17736359537050955, + "flos": 57136007053440.0, + "grad_norm": 0.7127406603181583, + "language_loss": 0.65079832, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.72929227, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08758545, + "step": 2950, + "time_per_iteration": 3.2668871879577637 + }, + { + "auxiliary_loss_clip": 0.06572378, + "auxiliary_loss_mlp": 0.01284106, + "balance_loss_clip": 0.06306554, + "balance_loss_mlp": 0.01260896, + "epoch": 0.17742371862317752, + "flos": 27205479648000.0, + "grad_norm": 1.9196695606626306, + "language_loss": 0.84746122, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.92602605, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2322998, + "step": 2951, + "time_per_iteration": 2.585566520690918 + }, + { + "auxiliary_loss_clip": 0.06574618, + "auxiliary_loss_mlp": 0.01283229, + "balance_loss_clip": 0.06301534, + "balance_loss_mlp": 0.01260556, + "epoch": 0.17748384187584548, + "flos": 20966020416000.0, + "grad_norm": 2.232427680766164, + "language_loss": 0.82122993, + "learning_rate": 3.776132549750806e-06, + "loss": 0.89980847, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.22680664, + "step": 2952, + "time_per_iteration": 2.55747652053833 + }, + { + "auxiliary_loss_clip": 0.06570595, + "auxiliary_loss_mlp": 0.01296069, + "balance_loss_clip": 0.06303248, + "balance_loss_mlp": 0.01272251, + "epoch": 0.17754396512851345, + "flos": 25017052636800.0, + "grad_norm": 5.629810818318968, + "language_loss": 0.8066265, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.88529313, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.23840332, + "step": 2953, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.06576403, + "auxiliary_loss_mlp": 0.01299444, + "balance_loss_clip": 0.06307617, + "balance_loss_mlp": 0.01275877, + "epoch": 0.1776040883811814, + "flos": 32059780634880.0, + "grad_norm": 1.9568540134603198, + "language_loss": 0.89472413, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.97348255, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2355957, + "step": 2954, + "time_per_iteration": 2.64989972114563 + }, + { + "auxiliary_loss_clip": 0.06576417, + "auxiliary_loss_mlp": 0.01304463, + "balance_loss_clip": 0.06308817, + "balance_loss_mlp": 0.01280788, + "epoch": 0.17766421163384938, + "flos": 21579579538560.0, + "grad_norm": 2.0844074095191423, + "language_loss": 0.85445726, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.93326604, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23693848, + "step": 2955, + "time_per_iteration": 2.5314552783966064 + }, + { + "auxiliary_loss_clip": 0.06566998, + "auxiliary_loss_mlp": 0.01298177, + "balance_loss_clip": 0.06301849, + "balance_loss_mlp": 0.01274287, + "epoch": 0.17772433488651737, + "flos": 22425922103040.0, + "grad_norm": 1.629233918934169, + "language_loss": 0.7198323, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.79848409, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.2388916, + "step": 2956, + "time_per_iteration": 2.5686161518096924 + }, + { + "auxiliary_loss_clip": 0.06565966, + "auxiliary_loss_mlp": 0.01302663, + "balance_loss_clip": 0.06303196, + "balance_loss_mlp": 0.01279632, + "epoch": 0.17778445813918534, + "flos": 25636481544960.0, + "grad_norm": 1.8690466813220736, + "language_loss": 0.8383618, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.9170481, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23034668, + "step": 2957, + "time_per_iteration": 2.5693180561065674 + }, + { + "auxiliary_loss_clip": 0.06574687, + "auxiliary_loss_mlp": 0.0129708, + "balance_loss_clip": 0.06307757, + "balance_loss_mlp": 0.01274323, + "epoch": 0.1778445813918533, + "flos": 25635810712320.0, + "grad_norm": 1.5960329991483622, + "language_loss": 0.75535214, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83406979, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.22753906, + "step": 2958, + "time_per_iteration": 2.6068832874298096 + }, + { + "auxiliary_loss_clip": 0.06572513, + "auxiliary_loss_mlp": 0.01295837, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01273533, + "epoch": 0.17790470464452127, + "flos": 22351975274880.0, + "grad_norm": 2.4916809347301867, + "language_loss": 0.8152473, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89393079, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.22302246, + "step": 2959, + "time_per_iteration": 2.532893419265747 + }, + { + "auxiliary_loss_clip": 0.06580231, + "auxiliary_loss_mlp": 0.01291039, + "balance_loss_clip": 0.06308466, + "balance_loss_mlp": 0.01267293, + "epoch": 0.17796482789718923, + "flos": 18771052786560.0, + "grad_norm": 1.971364332808954, + "language_loss": 0.52699149, + "learning_rate": 3.774698062689362e-06, + "loss": 0.60570425, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.23754883, + "step": 2960, + "time_per_iteration": 2.5427799224853516 + }, + { + "auxiliary_loss_clip": 0.06575893, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_clip": 0.06308038, + "balance_loss_mlp": 0.01267726, + "epoch": 0.1780249511498572, + "flos": 23447719117440.0, + "grad_norm": 1.7972451693934908, + "language_loss": 0.90068716, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.97935379, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.23083496, + "step": 2961, + "time_per_iteration": 2.5641977787017822 + }, + { + "auxiliary_loss_clip": 0.06579147, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 0.06309063, + "balance_loss_mlp": 0.0126075, + "epoch": 0.1780850744025252, + "flos": 23374149632640.0, + "grad_norm": 3.006724243875413, + "language_loss": 0.79600328, + "learning_rate": 3.774338767820631e-06, + "loss": 0.87464768, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2454834, + "step": 2962, + "time_per_iteration": 2.605395555496216 + }, + { + "auxiliary_loss_clip": 0.06579778, + "auxiliary_loss_mlp": 0.01288142, + "balance_loss_clip": 0.06310856, + "balance_loss_mlp": 0.01262977, + "epoch": 0.17814519765519315, + "flos": 13777117770240.0, + "grad_norm": 1.8585534107816564, + "language_loss": 0.75987798, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83855718, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.25146484, + "step": 2963, + "time_per_iteration": 2.4989051818847656 + }, + { + "auxiliary_loss_clip": 0.06582604, + "auxiliary_loss_mlp": 0.01280238, + "balance_loss_clip": 0.06308165, + "balance_loss_mlp": 0.01255573, + "epoch": 0.17820532090786112, + "flos": 21982073425920.0, + "grad_norm": 2.394373782804808, + "language_loss": 0.79892176, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.87755024, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.24682617, + "step": 2964, + "time_per_iteration": 2.6040844917297363 + }, + { + "auxiliary_loss_clip": 0.06584992, + "auxiliary_loss_mlp": 0.01284037, + "balance_loss_clip": 0.06315298, + "balance_loss_mlp": 0.01259289, + "epoch": 0.17826544416052909, + "flos": 24797727774720.0, + "grad_norm": 4.1010799155066, + "language_loss": 0.8221398, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.90083003, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.24755859, + "step": 2965, + "time_per_iteration": 2.5539731979370117 + }, + { + "auxiliary_loss_clip": 0.06570912, + "auxiliary_loss_mlp": 0.01281116, + "balance_loss_clip": 0.06306428, + "balance_loss_mlp": 0.01258788, + "epoch": 0.17832556741319705, + "flos": 13884411323520.0, + "grad_norm": 2.4679554184574974, + "language_loss": 0.96086347, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.03938377, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.22338867, + "step": 2966, + "time_per_iteration": 2.527735948562622 + }, + { + "auxiliary_loss_clip": 0.06579631, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06311509, + "balance_loss_mlp": 0.0125688, + "epoch": 0.17838569066586502, + "flos": 36649502755200.0, + "grad_norm": 2.0843689120837965, + "language_loss": 0.73698831, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81559336, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24023438, + "step": 2967, + "time_per_iteration": 2.7015600204467773 + }, + { + "auxiliary_loss_clip": 0.06577688, + "auxiliary_loss_mlp": 0.01283294, + "balance_loss_clip": 0.06315881, + "balance_loss_mlp": 0.01260192, + "epoch": 0.17844581391853298, + "flos": 18732087838080.0, + "grad_norm": 3.4272342033369956, + "language_loss": 0.77622253, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85483229, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.2310791, + "step": 2968, + "time_per_iteration": 2.5782222747802734 + }, + { + "auxiliary_loss_clip": 0.06574235, + "auxiliary_loss_mlp": 0.01280569, + "balance_loss_clip": 0.06309816, + "balance_loss_mlp": 0.01257716, + "epoch": 0.17850593717120097, + "flos": 27385168728960.0, + "grad_norm": 2.732998701382931, + "language_loss": 0.76891911, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.84746712, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2286377, + "step": 2969, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.06469887, + "auxiliary_loss_mlp": 0.01257031, + "balance_loss_clip": 0.06316882, + "balance_loss_mlp": 0.01250105, + "epoch": 0.17856606042386894, + "flos": 67014696816000.0, + "grad_norm": 0.8163537423270849, + "language_loss": 0.69127434, + "learning_rate": 3.772898897567171e-06, + "loss": 0.76854354, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06939697, + "step": 2970, + "time_per_iteration": 3.239208221435547 + }, + { + "auxiliary_loss_clip": 0.06585611, + "auxiliary_loss_mlp": 0.01285467, + "balance_loss_clip": 0.06311353, + "balance_loss_mlp": 0.01261936, + "epoch": 0.1786261836765369, + "flos": 36986015952000.0, + "grad_norm": 1.9165060952178286, + "language_loss": 0.67737955, + "learning_rate": 3.772718611185505e-06, + "loss": 0.75609034, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.23522949, + "step": 2971, + "time_per_iteration": 2.6962218284606934 + }, + { + "auxiliary_loss_clip": 0.06573113, + "auxiliary_loss_mlp": 0.01289649, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01265164, + "epoch": 0.17868630692920487, + "flos": 24832122675840.0, + "grad_norm": 2.3195878790033992, + "language_loss": 0.90615618, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.98478377, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24475098, + "step": 2972, + "time_per_iteration": 2.5959432125091553 + }, + { + "auxiliary_loss_clip": 0.06576589, + "auxiliary_loss_mlp": 0.01296839, + "balance_loss_clip": 0.06306117, + "balance_loss_mlp": 0.01272747, + "epoch": 0.17874643018187283, + "flos": 16987509504000.0, + "grad_norm": 2.140735852517547, + "language_loss": 0.89032125, + "learning_rate": 3.77235783676401e-06, + "loss": 0.96905553, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24084473, + "step": 2973, + "time_per_iteration": 2.5378026962280273 + }, + { + "auxiliary_loss_clip": 0.06586085, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06315553, + "balance_loss_mlp": 0.01263459, + "epoch": 0.1788065534345408, + "flos": 21038499797760.0, + "grad_norm": 2.0743135363702097, + "language_loss": 0.77368832, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8524279, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.2442627, + "step": 2974, + "time_per_iteration": 2.53279972076416 + }, + { + "auxiliary_loss_clip": 0.06580098, + "auxiliary_loss_mlp": 0.01294024, + "balance_loss_clip": 0.06311634, + "balance_loss_mlp": 0.01269825, + "epoch": 0.17886667668720876, + "flos": 23994500935680.0, + "grad_norm": 2.8964956916015323, + "language_loss": 0.75456583, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.83330709, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24182129, + "step": 2975, + "time_per_iteration": 2.5941531658172607 + }, + { + "auxiliary_loss_clip": 0.06574937, + "auxiliary_loss_mlp": 0.01296496, + "balance_loss_clip": 0.06309143, + "balance_loss_mlp": 0.0127443, + "epoch": 0.17892679993987676, + "flos": 25746626136960.0, + "grad_norm": 1.5983536265516811, + "language_loss": 0.73931366, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.81802797, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.22070312, + "step": 2976, + "time_per_iteration": 3.9981672763824463 + }, + { + "auxiliary_loss_clip": 0.06569345, + "auxiliary_loss_mlp": 0.01289522, + "balance_loss_clip": 0.06309073, + "balance_loss_mlp": 0.01268697, + "epoch": 0.17898692319254472, + "flos": 25706277596160.0, + "grad_norm": 1.568582717127115, + "language_loss": 0.7779026, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.85649121, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.20837402, + "step": 2977, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.06579779, + "auxiliary_loss_mlp": 0.01290892, + "balance_loss_clip": 0.06314169, + "balance_loss_mlp": 0.01267538, + "epoch": 0.1790470464452127, + "flos": 19323830171520.0, + "grad_norm": 2.1998049901746395, + "language_loss": 0.80421352, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.88292015, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.23339844, + "step": 2978, + "time_per_iteration": 4.010040044784546 + }, + { + "auxiliary_loss_clip": 0.06576563, + "auxiliary_loss_mlp": 0.01293687, + "balance_loss_clip": 0.06306942, + "balance_loss_mlp": 0.01267556, + "epoch": 0.17910716969788065, + "flos": 30052048953600.0, + "grad_norm": 1.73318348994846, + "language_loss": 0.77042997, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.84913242, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2611084, + "step": 2979, + "time_per_iteration": 2.608980655670166 + }, + { + "auxiliary_loss_clip": 0.06560802, + "auxiliary_loss_mlp": 0.01281236, + "balance_loss_clip": 0.06300105, + "balance_loss_mlp": 0.01258264, + "epoch": 0.17916729295054862, + "flos": 19433848982400.0, + "grad_norm": 2.44165935104879, + "language_loss": 0.69755781, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.77597821, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.22961426, + "step": 2980, + "time_per_iteration": 2.5433084964752197 + }, + { + "auxiliary_loss_clip": 0.06568111, + "auxiliary_loss_mlp": 0.01287625, + "balance_loss_clip": 0.06298865, + "balance_loss_mlp": 0.01262627, + "epoch": 0.17922741620321658, + "flos": 14616877789440.0, + "grad_norm": 2.147684280368508, + "language_loss": 0.7145257, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79308307, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25, + "step": 2981, + "time_per_iteration": 2.500054359436035 + }, + { + "auxiliary_loss_clip": 0.06576173, + "auxiliary_loss_mlp": 0.01291804, + "balance_loss_clip": 0.06304301, + "balance_loss_mlp": 0.01267676, + "epoch": 0.17928753945588458, + "flos": 17171013945600.0, + "grad_norm": 2.0884907581744514, + "language_loss": 0.82620054, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.90488029, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.24145508, + "step": 2982, + "time_per_iteration": 2.5748655796051025 + }, + { + "auxiliary_loss_clip": 0.06564468, + "auxiliary_loss_mlp": 0.01285766, + "balance_loss_clip": 0.06298885, + "balance_loss_mlp": 0.01263212, + "epoch": 0.17934766270855254, + "flos": 31403860473600.0, + "grad_norm": 1.5724638299649338, + "language_loss": 0.83894312, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.91744542, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.2253418, + "step": 2983, + "time_per_iteration": 5.515043497085571 + }, + { + "auxiliary_loss_clip": 0.0656914, + "auxiliary_loss_mlp": 0.01282068, + "balance_loss_clip": 0.06294827, + "balance_loss_mlp": 0.01257571, + "epoch": 0.1794077859612205, + "flos": 20820558528000.0, + "grad_norm": 2.232182880378402, + "language_loss": 0.86948806, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.94800013, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.24523926, + "step": 2984, + "time_per_iteration": 2.51488995552063 + }, + { + "auxiliary_loss_clip": 0.0657285, + "auxiliary_loss_mlp": 0.0128885, + "balance_loss_clip": 0.06300434, + "balance_loss_mlp": 0.01263315, + "epoch": 0.17946790921388847, + "flos": 28994096102400.0, + "grad_norm": 1.3770556187482685, + "language_loss": 0.90024149, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97885847, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.25537109, + "step": 2985, + "time_per_iteration": 2.6063013076782227 + }, + { + "auxiliary_loss_clip": 0.06556329, + "auxiliary_loss_mlp": 0.01283368, + "balance_loss_clip": 0.06297163, + "balance_loss_mlp": 0.01261088, + "epoch": 0.17952803246655644, + "flos": 20743131755520.0, + "grad_norm": 1.9976249367728316, + "language_loss": 0.71013325, + "learning_rate": 3.770006252694922e-06, + "loss": 0.78853023, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22277832, + "step": 2986, + "time_per_iteration": 2.519601345062256 + }, + { + "auxiliary_loss_clip": 0.0656532, + "auxiliary_loss_mlp": 0.01291064, + "balance_loss_clip": 0.06300499, + "balance_loss_mlp": 0.01266805, + "epoch": 0.1795881557192244, + "flos": 28263390572160.0, + "grad_norm": 2.1489314529360994, + "language_loss": 0.78320301, + "learning_rate": 3.769824891588688e-06, + "loss": 0.86176682, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24243164, + "step": 2987, + "time_per_iteration": 2.6449100971221924 + }, + { + "auxiliary_loss_clip": 0.06569126, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 0.06297948, + "balance_loss_mlp": 0.01263589, + "epoch": 0.17964827897189237, + "flos": 18558016980480.0, + "grad_norm": 1.9340316390641499, + "language_loss": 0.78628373, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86485958, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.24890137, + "step": 2988, + "time_per_iteration": 2.53200101852417 + }, + { + "auxiliary_loss_clip": 0.06451814, + "auxiliary_loss_mlp": 0.01275074, + "balance_loss_clip": 0.06303016, + "balance_loss_mlp": 0.01267408, + "epoch": 0.17970840222456036, + "flos": 58182052625280.0, + "grad_norm": 0.7360596365876024, + "language_loss": 0.62615538, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.70342427, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07653809, + "step": 2989, + "time_per_iteration": 3.076199769973755 + }, + { + "auxiliary_loss_clip": 0.06567107, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 0.06300405, + "balance_loss_mlp": 0.0126808, + "epoch": 0.17976852547722832, + "flos": 20306662237440.0, + "grad_norm": 2.2696852334697035, + "language_loss": 0.71750367, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.79609722, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24157715, + "step": 2990, + "time_per_iteration": 2.5519793033599854 + }, + { + "auxiliary_loss_clip": 0.06572431, + "auxiliary_loss_mlp": 0.01293466, + "balance_loss_clip": 0.0629989, + "balance_loss_mlp": 0.0126873, + "epoch": 0.1798286487298963, + "flos": 39677564004480.0, + "grad_norm": 1.9736942492438545, + "language_loss": 0.69419956, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77285856, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.24743652, + "step": 2991, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.06566148, + "auxiliary_loss_mlp": 0.01286066, + "balance_loss_clip": 0.0629756, + "balance_loss_mlp": 0.012617, + "epoch": 0.17988877198256426, + "flos": 25527385128960.0, + "grad_norm": 1.696800264728132, + "language_loss": 0.83554435, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.91406649, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.24365234, + "step": 2992, + "time_per_iteration": 2.5905981063842773 + }, + { + "auxiliary_loss_clip": 0.06555136, + "auxiliary_loss_mlp": 0.01287452, + "balance_loss_clip": 0.06296399, + "balance_loss_mlp": 0.01264087, + "epoch": 0.17994889523523222, + "flos": 18813539606400.0, + "grad_norm": 1.8489809189150626, + "language_loss": 0.83113515, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90956104, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.23352051, + "step": 2993, + "time_per_iteration": 2.52469801902771 + }, + { + "auxiliary_loss_clip": 0.06567293, + "auxiliary_loss_mlp": 0.01295673, + "balance_loss_clip": 0.06297931, + "balance_loss_mlp": 0.01270532, + "epoch": 0.18000901848790019, + "flos": 21110601836160.0, + "grad_norm": 1.6727087173341013, + "language_loss": 0.79138827, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.87001795, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.25134277, + "step": 2994, + "time_per_iteration": 2.6068711280822754 + }, + { + "auxiliary_loss_clip": 0.06570512, + "auxiliary_loss_mlp": 0.01299664, + "balance_loss_clip": 0.06303661, + "balance_loss_mlp": 0.01275656, + "epoch": 0.18006914174056818, + "flos": 19652586865920.0, + "grad_norm": 2.057688194559839, + "language_loss": 0.81263554, + "learning_rate": 3.768371587287296e-06, + "loss": 0.89133728, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24023438, + "step": 2995, + "time_per_iteration": 2.55191707611084 + }, + { + "auxiliary_loss_clip": 0.06569074, + "auxiliary_loss_mlp": 0.0128305, + "balance_loss_clip": 0.06302823, + "balance_loss_mlp": 0.012599, + "epoch": 0.18012926499323614, + "flos": 19505909093760.0, + "grad_norm": 1.5669289310044971, + "language_loss": 0.84560204, + "learning_rate": 3.768189622421512e-06, + "loss": 0.92412329, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23156738, + "step": 2996, + "time_per_iteration": 2.5438597202301025 + }, + { + "auxiliary_loss_clip": 0.06562654, + "auxiliary_loss_mlp": 0.012845, + "balance_loss_clip": 0.06302606, + "balance_loss_mlp": 0.01261124, + "epoch": 0.1801893882459041, + "flos": 19470759505920.0, + "grad_norm": 1.7191902249906965, + "language_loss": 0.88438457, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.96285611, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23352051, + "step": 2997, + "time_per_iteration": 2.5537290573120117 + }, + { + "auxiliary_loss_clip": 0.06589026, + "auxiliary_loss_mlp": 0.01294218, + "balance_loss_clip": 0.06317096, + "balance_loss_mlp": 0.01268731, + "epoch": 0.18024951149857207, + "flos": 26877938837760.0, + "grad_norm": 1.8629134602199495, + "language_loss": 0.86106455, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.939897, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.25500488, + "step": 2998, + "time_per_iteration": 2.6256613731384277 + }, + { + "auxiliary_loss_clip": 0.06576181, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 0.06311405, + "balance_loss_mlp": 0.01269573, + "epoch": 0.18030963475124004, + "flos": 30234421365120.0, + "grad_norm": 1.8712207411963018, + "language_loss": 0.84650278, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.92519647, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23608398, + "step": 2999, + "time_per_iteration": 2.6169869899749756 + }, + { + "auxiliary_loss_clip": 0.06576863, + "auxiliary_loss_mlp": 0.01287758, + "balance_loss_clip": 0.06310622, + "balance_loss_mlp": 0.01263905, + "epoch": 0.180369758003908, + "flos": 22313681159040.0, + "grad_norm": 2.163703762887268, + "language_loss": 0.75604963, + "learning_rate": 3.76746109252814e-06, + "loss": 0.83469582, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.23852539, + "step": 3000, + "time_per_iteration": 2.6028895378112793 + }, + { + "auxiliary_loss_clip": 0.06574081, + "auxiliary_loss_mlp": 0.01292075, + "balance_loss_clip": 0.06310557, + "balance_loss_mlp": 0.01270034, + "epoch": 0.18042988125657597, + "flos": 23738726747520.0, + "grad_norm": 2.5967993482221114, + "language_loss": 0.72796941, + "learning_rate": 3.76727879248177e-06, + "loss": 0.80663097, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3001, + "time_per_iteration": 2.5506463050842285 + }, + { + "auxiliary_loss_clip": 0.06583872, + "auxiliary_loss_mlp": 0.01288133, + "balance_loss_clip": 0.06311986, + "balance_loss_mlp": 0.01262336, + "epoch": 0.18049000450924396, + "flos": 24099781991040.0, + "grad_norm": 2.0612506576335488, + "language_loss": 0.88948703, + "learning_rate": 3.767096425420011e-06, + "loss": 0.96820712, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25793457, + "step": 3002, + "time_per_iteration": 2.606262683868408 + }, + { + "auxiliary_loss_clip": 0.06584583, + "auxiliary_loss_mlp": 0.01297298, + "balance_loss_clip": 0.06316328, + "balance_loss_mlp": 0.01274613, + "epoch": 0.18055012776191193, + "flos": 22169602863360.0, + "grad_norm": 1.9471434915323604, + "language_loss": 0.82044661, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.89926547, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 3003, + "time_per_iteration": 2.519054889678955 + }, + { + "auxiliary_loss_clip": 0.06584047, + "auxiliary_loss_mlp": 0.01304701, + "balance_loss_clip": 0.0631455, + "balance_loss_mlp": 0.01281098, + "epoch": 0.1806102510145799, + "flos": 28921155523200.0, + "grad_norm": 1.9671809983045359, + "language_loss": 0.67718011, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.75606757, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23608398, + "step": 3004, + "time_per_iteration": 2.576216459274292 + }, + { + "auxiliary_loss_clip": 0.06581833, + "auxiliary_loss_mlp": 0.01290497, + "balance_loss_clip": 0.06313001, + "balance_loss_mlp": 0.01265976, + "epoch": 0.18067037426724786, + "flos": 19031648584320.0, + "grad_norm": 1.7292261015630317, + "language_loss": 0.86117315, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.93989646, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.2454834, + "step": 3005, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.06579112, + "auxiliary_loss_mlp": 0.01292933, + "balance_loss_clip": 0.0631589, + "balance_loss_mlp": 0.01270247, + "epoch": 0.18073049751991582, + "flos": 27460960346880.0, + "grad_norm": 1.9900110027616933, + "language_loss": 0.84054905, + "learning_rate": 3.766366287157432e-06, + "loss": 0.9192695, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.22692871, + "step": 3006, + "time_per_iteration": 2.6471307277679443 + }, + { + "auxiliary_loss_clip": 0.06573892, + "auxiliary_loss_mlp": 0.01293776, + "balance_loss_clip": 0.06311665, + "balance_loss_mlp": 0.01270399, + "epoch": 0.1807906207725838, + "flos": 28736309416320.0, + "grad_norm": 1.8980852178108305, + "language_loss": 0.77909601, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.85777271, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23376465, + "step": 3007, + "time_per_iteration": 2.596728801727295 + }, + { + "auxiliary_loss_clip": 0.06488212, + "auxiliary_loss_mlp": 0.01341948, + "balance_loss_clip": 0.06340114, + "balance_loss_mlp": 0.01332817, + "epoch": 0.18085074402525175, + "flos": 64488861411840.0, + "grad_norm": 0.8091646786767962, + "language_loss": 0.57128072, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.64958233, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09136963, + "step": 3008, + "time_per_iteration": 3.2818551063537598 + }, + { + "auxiliary_loss_clip": 0.06575561, + "auxiliary_loss_mlp": 0.0128936, + "balance_loss_clip": 0.06307852, + "balance_loss_mlp": 0.0126528, + "epoch": 0.18091086727791975, + "flos": 23483665319040.0, + "grad_norm": 2.791287786369512, + "language_loss": 0.68172324, + "learning_rate": 3.765817980138021e-06, + "loss": 0.76037246, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24072266, + "step": 3009, + "time_per_iteration": 2.612866163253784 + }, + { + "auxiliary_loss_clip": 0.06566571, + "auxiliary_loss_mlp": 0.01283544, + "balance_loss_clip": 0.06299911, + "balance_loss_mlp": 0.01261228, + "epoch": 0.1809709905305877, + "flos": 24177334544640.0, + "grad_norm": 2.2065616524174745, + "language_loss": 0.76732111, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.84582222, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22314453, + "step": 3010, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0656049, + "auxiliary_loss_mlp": 0.01277678, + "balance_loss_clip": 0.06301664, + "balance_loss_mlp": 0.01255028, + "epoch": 0.18103111378325568, + "flos": 21657006311040.0, + "grad_norm": 1.5802962280270132, + "language_loss": 0.68172359, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.76010525, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.22644043, + "step": 3011, + "time_per_iteration": 2.5724563598632812 + }, + { + "auxiliary_loss_clip": 0.0656517, + "auxiliary_loss_mlp": 0.01279328, + "balance_loss_clip": 0.06304309, + "balance_loss_mlp": 0.01256989, + "epoch": 0.18109123703592364, + "flos": 53698632537600.0, + "grad_norm": 1.5833259733478497, + "language_loss": 0.71816081, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.79660583, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.22351074, + "step": 3012, + "time_per_iteration": 2.810831069946289 + }, + { + "auxiliary_loss_clip": 0.06566492, + "auxiliary_loss_mlp": 0.01285528, + "balance_loss_clip": 0.06309225, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1811513602885916, + "flos": 35854325907840.0, + "grad_norm": 2.597528045864961, + "language_loss": 0.63496852, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7134887, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.24169922, + "step": 3013, + "time_per_iteration": 2.728149175643921 + }, + { + "auxiliary_loss_clip": 0.0656557, + "auxiliary_loss_mlp": 0.01286402, + "balance_loss_clip": 0.06302488, + "balance_loss_mlp": 0.01262405, + "epoch": 0.18121148354125957, + "flos": 23739355653120.0, + "grad_norm": 1.5758176693533255, + "language_loss": 0.76564461, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.23986816, + "step": 3014, + "time_per_iteration": 2.547717332839966 + }, + { + "auxiliary_loss_clip": 0.06584823, + "auxiliary_loss_mlp": 0.01295776, + "balance_loss_clip": 0.06314109, + "balance_loss_mlp": 0.01270336, + "epoch": 0.18127160679392756, + "flos": 28735470875520.0, + "grad_norm": 2.560866552798296, + "language_loss": 0.66988617, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.2545166, + "step": 3015, + "time_per_iteration": 2.69026780128479 + }, + { + "auxiliary_loss_clip": 0.06569196, + "auxiliary_loss_mlp": 0.01280146, + "balance_loss_clip": 0.06306805, + "balance_loss_mlp": 0.0125696, + "epoch": 0.18133173004659553, + "flos": 20491256782080.0, + "grad_norm": 2.469275114619788, + "language_loss": 0.78958207, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86807549, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23168945, + "step": 3016, + "time_per_iteration": 3.9831480979919434 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01288204, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01262967, + "epoch": 0.1813918532992635, + "flos": 22857905427840.0, + "grad_norm": 1.6723213639278358, + "language_loss": 0.84196192, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.92060661, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3017, + "time_per_iteration": 2.5418076515197754 + }, + { + "auxiliary_loss_clip": 0.06562062, + "auxiliary_loss_mlp": 0.01287085, + "balance_loss_clip": 0.063041, + "balance_loss_mlp": 0.01264197, + "epoch": 0.18145197655193146, + "flos": 36074028113280.0, + "grad_norm": 1.9391079186566258, + "language_loss": 0.68509835, + "learning_rate": 3.764169443989697e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.22900391, + "step": 3018, + "time_per_iteration": 4.119429111480713 + }, + { + "auxiliary_loss_clip": 0.06567694, + "auxiliary_loss_mlp": 0.01285506, + "balance_loss_clip": 0.06301513, + "balance_loss_mlp": 0.01262296, + "epoch": 0.18151209980459942, + "flos": 24030698699520.0, + "grad_norm": 1.811235496294486, + "language_loss": 0.76789671, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.84642869, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.23205566, + "step": 3019, + "time_per_iteration": 2.5501174926757812 + }, + { + "auxiliary_loss_clip": 0.06571496, + "auxiliary_loss_mlp": 0.01294569, + "balance_loss_clip": 0.0630317, + "balance_loss_mlp": 0.01267544, + "epoch": 0.1815722230572674, + "flos": 23958470880000.0, + "grad_norm": 3.3265475746221305, + "language_loss": 0.82225502, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90091568, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26989746, + "step": 3020, + "time_per_iteration": 2.5695080757141113 + }, + { + "auxiliary_loss_clip": 0.06568192, + "auxiliary_loss_mlp": 0.01285845, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01262433, + "epoch": 0.18163234630993536, + "flos": 24392885973120.0, + "grad_norm": 1.8328180932997555, + "language_loss": 0.78643721, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8649776, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.234375, + "step": 3021, + "time_per_iteration": 2.551942825317383 + }, + { + "auxiliary_loss_clip": 0.06560968, + "auxiliary_loss_mlp": 0.01283899, + "balance_loss_clip": 0.06301476, + "balance_loss_mlp": 0.01261034, + "epoch": 0.18169246956260335, + "flos": 24688295942400.0, + "grad_norm": 2.040482316083418, + "language_loss": 0.85882831, + "learning_rate": 3.763435021621422e-06, + "loss": 0.93727696, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22851562, + "step": 3022, + "time_per_iteration": 5.58092737197876 + }, + { + "auxiliary_loss_clip": 0.06578015, + "auxiliary_loss_mlp": 0.01285165, + "balance_loss_clip": 0.06310268, + "balance_loss_mlp": 0.0126031, + "epoch": 0.1817525928152713, + "flos": 24250149342720.0, + "grad_norm": 1.8455534069636814, + "language_loss": 0.7011804, + "learning_rate": 3.763251248837859e-06, + "loss": 0.77981222, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24853516, + "step": 3023, + "time_per_iteration": 2.5510292053222656 + }, + { + "auxiliary_loss_clip": 0.06576993, + "auxiliary_loss_mlp": 0.01285425, + "balance_loss_clip": 0.06311849, + "balance_loss_mlp": 0.01262382, + "epoch": 0.18181271606793928, + "flos": 16477680136320.0, + "grad_norm": 3.5802196750479753, + "language_loss": 0.7475239, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.82614803, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23034668, + "step": 3024, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.0657917, + "auxiliary_loss_mlp": 0.01281973, + "balance_loss_clip": 0.06315119, + "balance_loss_mlp": 0.01258239, + "epoch": 0.18187283932060724, + "flos": 18585787409280.0, + "grad_norm": 2.5283577302616593, + "language_loss": 0.89396572, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.97257715, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23742676, + "step": 3025, + "time_per_iteration": 2.503992795944214 + }, + { + "auxiliary_loss_clip": 0.0657706, + "auxiliary_loss_mlp": 0.01284845, + "balance_loss_clip": 0.06313155, + "balance_loss_mlp": 0.01260049, + "epoch": 0.1819329625732752, + "flos": 20273105877120.0, + "grad_norm": 1.766887401432974, + "language_loss": 0.80214149, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.88076055, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.24804688, + "step": 3026, + "time_per_iteration": 2.5226128101348877 + }, + { + "auxiliary_loss_clip": 0.06583989, + "auxiliary_loss_mlp": 0.01292049, + "balance_loss_clip": 0.06316754, + "balance_loss_mlp": 0.01267695, + "epoch": 0.18199308582594317, + "flos": 25921242046080.0, + "grad_norm": 3.8781285127645924, + "language_loss": 0.76237446, + "learning_rate": 3.762515489146692e-06, + "loss": 0.84113485, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.2434082, + "step": 3027, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.06592765, + "auxiliary_loss_mlp": 0.01296803, + "balance_loss_clip": 0.06322083, + "balance_loss_mlp": 0.01271328, + "epoch": 0.18205320907861114, + "flos": 15382942542720.0, + "grad_norm": 3.274226659229475, + "language_loss": 0.86130804, + "learning_rate": 3.762331382119546e-06, + "loss": 0.94020373, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25476074, + "step": 3028, + "time_per_iteration": 2.5201306343078613 + }, + { + "auxiliary_loss_clip": 0.06585124, + "auxiliary_loss_mlp": 0.01291016, + "balance_loss_clip": 0.06319305, + "balance_loss_mlp": 0.01263896, + "epoch": 0.18211333233127913, + "flos": 25630485978240.0, + "grad_norm": 1.8702692274079507, + "language_loss": 0.83509612, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91385752, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.27111816, + "step": 3029, + "time_per_iteration": 2.562183380126953 + }, + { + "auxiliary_loss_clip": 0.06592625, + "auxiliary_loss_mlp": 0.01296678, + "balance_loss_clip": 0.06326656, + "balance_loss_mlp": 0.01269153, + "epoch": 0.1821734555839471, + "flos": 14981329123200.0, + "grad_norm": 1.9791177396807749, + "language_loss": 0.78960443, + "learning_rate": 3.761962967588891e-06, + "loss": 0.86849743, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27514648, + "step": 3030, + "time_per_iteration": 2.5145437717437744 + }, + { + "auxiliary_loss_clip": 0.06592657, + "auxiliary_loss_mlp": 0.01296331, + "balance_loss_clip": 0.06325006, + "balance_loss_mlp": 0.01269748, + "epoch": 0.18223357883661506, + "flos": 20200291079040.0, + "grad_norm": 1.9881761765350903, + "language_loss": 0.86102521, + "learning_rate": 3.761778660099352e-06, + "loss": 0.93991506, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.26623535, + "step": 3031, + "time_per_iteration": 2.5260634422302246 + }, + { + "auxiliary_loss_clip": 0.06592748, + "auxiliary_loss_mlp": 0.01294791, + "balance_loss_clip": 0.06325988, + "balance_loss_mlp": 0.01270473, + "epoch": 0.18229370208928303, + "flos": 15237438727680.0, + "grad_norm": 2.0909174524979033, + "language_loss": 0.8092168, + "learning_rate": 3.76159428580299e-06, + "loss": 0.88809216, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24316406, + "step": 3032, + "time_per_iteration": 2.5710113048553467 + }, + { + "auxiliary_loss_clip": 0.06594816, + "auxiliary_loss_mlp": 0.01293656, + "balance_loss_clip": 0.06321192, + "balance_loss_mlp": 0.0126718, + "epoch": 0.182353825341951, + "flos": 23847026549760.0, + "grad_norm": 1.952875580311909, + "language_loss": 0.81854784, + "learning_rate": 3.761409844706795e-06, + "loss": 0.89743257, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.26501465, + "step": 3033, + "time_per_iteration": 2.5495798587799072 + }, + { + "auxiliary_loss_clip": 0.06484132, + "auxiliary_loss_mlp": 0.01303963, + "balance_loss_clip": 0.06340252, + "balance_loss_mlp": 0.01294378, + "epoch": 0.18241394859461896, + "flos": 61208017522560.0, + "grad_norm": 0.8447557433525825, + "language_loss": 0.63402653, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.71190745, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09570312, + "step": 3034, + "time_per_iteration": 3.0660452842712402 + }, + { + "auxiliary_loss_clip": 0.0658728, + "auxiliary_loss_mlp": 0.01296965, + "balance_loss_clip": 0.0632379, + "balance_loss_mlp": 0.01271896, + "epoch": 0.18247407184728695, + "flos": 18476439431040.0, + "grad_norm": 2.061097584564917, + "language_loss": 0.80526477, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.88410723, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.25073242, + "step": 3035, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.06580287, + "auxiliary_loss_mlp": 0.01288285, + "balance_loss_clip": 0.06319961, + "balance_loss_mlp": 0.01264181, + "epoch": 0.18253419509995492, + "flos": 21801042679680.0, + "grad_norm": 1.6140632959859456, + "language_loss": 0.85371202, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.93239772, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24108887, + "step": 3036, + "time_per_iteration": 2.6029741764068604 + }, + { + "auxiliary_loss_clip": 0.06580038, + "auxiliary_loss_mlp": 0.01290184, + "balance_loss_clip": 0.0632468, + "balance_loss_mlp": 0.01266843, + "epoch": 0.18259431835262288, + "flos": 20154743585280.0, + "grad_norm": 2.265799944133398, + "language_loss": 0.80322921, + "learning_rate": 3.760671412463617e-06, + "loss": 0.88193142, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.23327637, + "step": 3037, + "time_per_iteration": 2.519632577896118 + }, + { + "auxiliary_loss_clip": 0.06593587, + "auxiliary_loss_mlp": 0.01295693, + "balance_loss_clip": 0.063269, + "balance_loss_mlp": 0.01270373, + "epoch": 0.18265444160529085, + "flos": 16987132160640.0, + "grad_norm": 4.978587383263401, + "language_loss": 0.80596817, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.88486093, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.25341797, + "step": 3038, + "time_per_iteration": 2.549565315246582 + }, + { + "auxiliary_loss_clip": 0.06577064, + "auxiliary_loss_mlp": 0.01293219, + "balance_loss_clip": 0.06316892, + "balance_loss_mlp": 0.01268221, + "epoch": 0.1827145648579588, + "flos": 34431879795840.0, + "grad_norm": 3.0715308969073907, + "language_loss": 0.6822418, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.76094472, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.24987793, + "step": 3039, + "time_per_iteration": 2.664839267730713 + }, + { + "auxiliary_loss_clip": 0.06579359, + "auxiliary_loss_mlp": 0.01283138, + "balance_loss_clip": 0.06312781, + "balance_loss_mlp": 0.0125783, + "epoch": 0.18277468811062678, + "flos": 53298905834880.0, + "grad_norm": 2.0617529505454866, + "language_loss": 0.74242914, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82105416, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.25305176, + "step": 3040, + "time_per_iteration": 2.8341598510742188 + }, + { + "auxiliary_loss_clip": 0.06576048, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06314505, + "balance_loss_mlp": 0.01261997, + "epoch": 0.18283481136329474, + "flos": 31658879975040.0, + "grad_norm": 2.270513376553218, + "language_loss": 0.61012894, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.68876237, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25305176, + "step": 3041, + "time_per_iteration": 2.6312432289123535 + }, + { + "auxiliary_loss_clip": 0.065763, + "auxiliary_loss_mlp": 0.01280171, + "balance_loss_clip": 0.06311682, + "balance_loss_mlp": 0.01254779, + "epoch": 0.18289493461596273, + "flos": 53148957753600.0, + "grad_norm": 1.9789856473501881, + "language_loss": 0.60569113, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68425584, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.25366211, + "step": 3042, + "time_per_iteration": 2.8294289112091064 + }, + { + "auxiliary_loss_clip": 0.06571855, + "auxiliary_loss_mlp": 0.01284933, + "balance_loss_clip": 0.06311391, + "balance_loss_mlp": 0.01261818, + "epoch": 0.1829550578686307, + "flos": 25595797587840.0, + "grad_norm": 2.1969947776781593, + "language_loss": 0.87948751, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.95805538, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.2310791, + "step": 3043, + "time_per_iteration": 2.5895864963531494 + }, + { + "auxiliary_loss_clip": 0.06576079, + "auxiliary_loss_mlp": 0.01280472, + "balance_loss_clip": 0.06308874, + "balance_loss_mlp": 0.01255737, + "epoch": 0.18301518112129866, + "flos": 22608001025280.0, + "grad_norm": 2.7546688504112633, + "language_loss": 0.71556103, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.79412657, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24731445, + "step": 3044, + "time_per_iteration": 2.524653196334839 + }, + { + "auxiliary_loss_clip": 0.06580091, + "auxiliary_loss_mlp": 0.0128018, + "balance_loss_clip": 0.06309704, + "balance_loss_mlp": 0.01255277, + "epoch": 0.18307530437396663, + "flos": 34029176273280.0, + "grad_norm": 2.5838478211487406, + "language_loss": 0.65133858, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72994125, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.24914551, + "step": 3045, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.06567913, + "auxiliary_loss_mlp": 0.01279381, + "balance_loss_clip": 0.06306372, + "balance_loss_mlp": 0.01256898, + "epoch": 0.1831354276266346, + "flos": 21284756547840.0, + "grad_norm": 3.147408680423339, + "language_loss": 0.803563, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.88203591, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22497559, + "step": 3046, + "time_per_iteration": 2.503777503967285 + }, + { + "auxiliary_loss_clip": 0.06581149, + "auxiliary_loss_mlp": 0.01284573, + "balance_loss_clip": 0.06310049, + "balance_loss_mlp": 0.01259217, + "epoch": 0.18319555087930256, + "flos": 21039338338560.0, + "grad_norm": 2.4200593706157627, + "language_loss": 0.79505324, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.87371051, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.25354004, + "step": 3047, + "time_per_iteration": 2.5604546070098877 + }, + { + "auxiliary_loss_clip": 0.06579873, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 0.06320655, + "balance_loss_mlp": 0.01258243, + "epoch": 0.18325567413197055, + "flos": 34390944276480.0, + "grad_norm": 1.4781726378987778, + "language_loss": 0.81601483, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.89463389, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23779297, + "step": 3048, + "time_per_iteration": 2.6359665393829346 + }, + { + "auxiliary_loss_clip": 0.06575403, + "auxiliary_loss_mlp": 0.01285089, + "balance_loss_clip": 0.0631268, + "balance_loss_mlp": 0.01260472, + "epoch": 0.18331579738463852, + "flos": 20564742412800.0, + "grad_norm": 2.1940168845136045, + "language_loss": 0.87414008, + "learning_rate": 3.758449708105424e-06, + "loss": 0.95274496, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.24633789, + "step": 3049, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.06592787, + "auxiliary_loss_mlp": 0.01283738, + "balance_loss_clip": 0.0632069, + "balance_loss_mlp": 0.01259086, + "epoch": 0.18337592063730648, + "flos": 19613663844480.0, + "grad_norm": 3.2022638976819486, + "language_loss": 0.78845787, + "learning_rate": 3.75826413248424e-06, + "loss": 0.86722308, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.24694824, + "step": 3050, + "time_per_iteration": 2.5530426502227783 + }, + { + "auxiliary_loss_clip": 0.06580114, + "auxiliary_loss_mlp": 0.01276938, + "balance_loss_clip": 0.06318066, + "balance_loss_mlp": 0.01253466, + "epoch": 0.18343604388997445, + "flos": 20857301343360.0, + "grad_norm": 2.3642096483096764, + "language_loss": 1.00007951, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.07865, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23474121, + "step": 3051, + "time_per_iteration": 2.53879714012146 + }, + { + "auxiliary_loss_clip": 0.06576733, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 0.06316614, + "balance_loss_mlp": 0.01255025, + "epoch": 0.1834961671426424, + "flos": 24402109921920.0, + "grad_norm": 1.6089937167063422, + "language_loss": 0.87510651, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.95366418, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23999023, + "step": 3052, + "time_per_iteration": 2.616711378097534 + }, + { + "auxiliary_loss_clip": 0.06578867, + "auxiliary_loss_mlp": 0.01277944, + "balance_loss_clip": 0.06319693, + "balance_loss_mlp": 0.01255485, + "epoch": 0.18355629039531038, + "flos": 21257992368000.0, + "grad_norm": 1.906783267886923, + "language_loss": 0.73879737, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.81736547, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.22473145, + "step": 3053, + "time_per_iteration": 2.5624823570251465 + }, + { + "auxiliary_loss_clip": 0.06577893, + "auxiliary_loss_mlp": 0.01281464, + "balance_loss_clip": 0.06309894, + "balance_loss_mlp": 0.01257264, + "epoch": 0.18361641364797834, + "flos": 28663830034560.0, + "grad_norm": 2.5767200648108233, + "language_loss": 0.6330536, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.71164715, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.24194336, + "step": 3054, + "time_per_iteration": 2.6126291751861572 + }, + { + "auxiliary_loss_clip": 0.06580043, + "auxiliary_loss_mlp": 0.01278803, + "balance_loss_clip": 0.0631642, + "balance_loss_mlp": 0.0125539, + "epoch": 0.18367653690064634, + "flos": 20924414064000.0, + "grad_norm": 2.0083810279560192, + "language_loss": 0.79178774, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87037629, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.23400879, + "step": 3055, + "time_per_iteration": 3.9858450889587402 + }, + { + "auxiliary_loss_clip": 0.06567059, + "auxiliary_loss_mlp": 0.01278609, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255971, + "epoch": 0.1837366601533143, + "flos": 28772884523520.0, + "grad_norm": 1.844309785332071, + "language_loss": 0.71021843, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.2265625, + "step": 3056, + "time_per_iteration": 2.623892068862915 + }, + { + "auxiliary_loss_clip": 0.06573971, + "auxiliary_loss_mlp": 0.01282679, + "balance_loss_clip": 0.06309162, + "balance_loss_mlp": 0.0125954, + "epoch": 0.18379678340598227, + "flos": 21257782732800.0, + "grad_norm": 1.9202402240588465, + "language_loss": 0.81177384, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.89034033, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23144531, + "step": 3057, + "time_per_iteration": 3.994014263153076 + }, + { + "auxiliary_loss_clip": 0.06576763, + "auxiliary_loss_mlp": 0.01288527, + "balance_loss_clip": 0.06303927, + "balance_loss_mlp": 0.01263029, + "epoch": 0.18385690665865023, + "flos": 20455981413120.0, + "grad_norm": 5.209505310648867, + "language_loss": 0.83562195, + "learning_rate": 3.756777127858533e-06, + "loss": 0.91427481, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.25500488, + "step": 3058, + "time_per_iteration": 2.559356689453125 + }, + { + "auxiliary_loss_clip": 0.0658073, + "auxiliary_loss_mlp": 0.01283954, + "balance_loss_clip": 0.06315949, + "balance_loss_mlp": 0.01259278, + "epoch": 0.1839170299113182, + "flos": 26147736432000.0, + "grad_norm": 2.1347539719525552, + "language_loss": 0.86113238, + "learning_rate": 3.756590952429017e-06, + "loss": 0.93977928, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.2467041, + "step": 3059, + "time_per_iteration": 2.5702602863311768 + }, + { + "auxiliary_loss_clip": 0.0656752, + "auxiliary_loss_mlp": 0.01279577, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01255997, + "epoch": 0.18397715316398616, + "flos": 31765921966080.0, + "grad_norm": 1.5595075663945241, + "language_loss": 0.73269093, + "learning_rate": 3.756404710389396e-06, + "loss": 0.81116188, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23583984, + "step": 3060, + "time_per_iteration": 2.6496734619140625 + }, + { + "auxiliary_loss_clip": 0.06572919, + "auxiliary_loss_mlp": 0.01280202, + "balance_loss_clip": 0.06306632, + "balance_loss_mlp": 0.01254715, + "epoch": 0.18403727641665413, + "flos": 24619548067200.0, + "grad_norm": 1.685629450787069, + "language_loss": 0.73033082, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80886197, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25512695, + "step": 3061, + "time_per_iteration": 2.611788034439087 + }, + { + "auxiliary_loss_clip": 0.06574027, + "auxiliary_loss_mlp": 0.01285757, + "balance_loss_clip": 0.06309725, + "balance_loss_mlp": 0.01262666, + "epoch": 0.18409739966932212, + "flos": 23446503233280.0, + "grad_norm": 3.8650330009727893, + "language_loss": 0.81972837, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89832628, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23095703, + "step": 3062, + "time_per_iteration": 5.428592920303345 + }, + { + "auxiliary_loss_clip": 0.06579094, + "auxiliary_loss_mlp": 0.01285398, + "balance_loss_clip": 0.06309452, + "balance_loss_mlp": 0.01260806, + "epoch": 0.18415752292199009, + "flos": 21878637160320.0, + "grad_norm": 1.977008299285237, + "language_loss": 0.74067175, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.81931663, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.24572754, + "step": 3063, + "time_per_iteration": 2.53143572807312 + }, + { + "auxiliary_loss_clip": 0.06568366, + "auxiliary_loss_mlp": 0.0128141, + "balance_loss_clip": 0.06305687, + "balance_loss_mlp": 0.01257556, + "epoch": 0.18421764617465805, + "flos": 25417701734400.0, + "grad_norm": 1.7280289049146156, + "language_loss": 0.66864884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.74714661, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23864746, + "step": 3064, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06569844, + "auxiliary_loss_mlp": 0.0127972, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18427776942732602, + "flos": 27205395793920.0, + "grad_norm": 1.7817654183541871, + "language_loss": 0.69580668, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.77430236, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.22937012, + "step": 3065, + "time_per_iteration": 2.5717501640319824 + }, + { + "auxiliary_loss_clip": 0.06574196, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 0.06306924, + "balance_loss_mlp": 0.01258168, + "epoch": 0.18433789267999398, + "flos": 27859303457280.0, + "grad_norm": 2.294674560085645, + "language_loss": 0.73328084, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.81186187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25720215, + "step": 3066, + "time_per_iteration": 2.5840933322906494 + }, + { + "auxiliary_loss_clip": 0.06567979, + "auxiliary_loss_mlp": 0.01283252, + "balance_loss_clip": 0.06303403, + "balance_loss_mlp": 0.01259458, + "epoch": 0.18439801593266195, + "flos": 17862502965120.0, + "grad_norm": 1.9426241343058523, + "language_loss": 0.8287726, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.90728498, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23791504, + "step": 3067, + "time_per_iteration": 2.510010004043579 + }, + { + "auxiliary_loss_clip": 0.06482083, + "auxiliary_loss_mlp": 0.01256206, + "balance_loss_clip": 0.06330505, + "balance_loss_mlp": 0.01248302, + "epoch": 0.18445813918532994, + "flos": 56408236416000.0, + "grad_norm": 0.8014843936748705, + "language_loss": 0.59808761, + "learning_rate": 3.754912376956657e-06, + "loss": 0.67547047, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07897949, + "step": 3068, + "time_per_iteration": 3.036146879196167 + }, + { + "auxiliary_loss_clip": 0.06564388, + "auxiliary_loss_mlp": 0.01280505, + "balance_loss_clip": 0.06303549, + "balance_loss_mlp": 0.01256687, + "epoch": 0.1845182624379979, + "flos": 20963085523200.0, + "grad_norm": 1.8439912741449518, + "language_loss": 0.77266169, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23840332, + "step": 3069, + "time_per_iteration": 2.5499565601348877 + }, + { + "auxiliary_loss_clip": 0.06570058, + "auxiliary_loss_mlp": 0.01283287, + "balance_loss_clip": 0.06303704, + "balance_loss_mlp": 0.01258038, + "epoch": 0.18457838569066587, + "flos": 20491382563200.0, + "grad_norm": 2.2630610204441655, + "language_loss": 0.86447155, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.94300503, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.25280762, + "step": 3070, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.06575848, + "auxiliary_loss_mlp": 0.0128984, + "balance_loss_clip": 0.06307413, + "balance_loss_mlp": 0.01265545, + "epoch": 0.18463850894333383, + "flos": 25017094563840.0, + "grad_norm": 2.0459920671080725, + "language_loss": 0.78778827, + "learning_rate": 3.754351653708265e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24279785, + "step": 3071, + "time_per_iteration": 2.6498963832855225 + }, + { + "auxiliary_loss_clip": 0.06567957, + "auxiliary_loss_mlp": 0.01281558, + "balance_loss_clip": 0.06301579, + "balance_loss_mlp": 0.01256142, + "epoch": 0.1846986321960018, + "flos": 16806311049600.0, + "grad_norm": 2.346095649750701, + "language_loss": 0.77759838, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.85609353, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25427246, + "step": 3072, + "time_per_iteration": 2.5731780529022217 + }, + { + "auxiliary_loss_clip": 0.06569058, + "auxiliary_loss_mlp": 0.01286345, + "balance_loss_clip": 0.06299037, + "balance_loss_mlp": 0.01261096, + "epoch": 0.18475875544866976, + "flos": 20820726236160.0, + "grad_norm": 1.9004070702769575, + "language_loss": 0.87276495, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95131898, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25231934, + "step": 3073, + "time_per_iteration": 2.5327014923095703 + }, + { + "auxiliary_loss_clip": 0.06571067, + "auxiliary_loss_mlp": 0.01285925, + "balance_loss_clip": 0.06302057, + "balance_loss_mlp": 0.01261523, + "epoch": 0.18481887870133773, + "flos": 22608001025280.0, + "grad_norm": 2.4702398063651314, + "language_loss": 0.9204939, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.99906385, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.24401855, + "step": 3074, + "time_per_iteration": 2.6219372749328613 + }, + { + "auxiliary_loss_clip": 0.06566601, + "auxiliary_loss_mlp": 0.01284131, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257583, + "epoch": 0.18487900195400572, + "flos": 29466218332800.0, + "grad_norm": 2.295087571563985, + "language_loss": 0.64970315, + "learning_rate": 3.75360309139087e-06, + "loss": 0.72821045, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26550293, + "step": 3075, + "time_per_iteration": 2.6108217239379883 + }, + { + "auxiliary_loss_clip": 0.06563977, + "auxiliary_loss_mlp": 0.0128829, + "balance_loss_clip": 0.06303947, + "balance_loss_mlp": 0.01264519, + "epoch": 0.1849391252066737, + "flos": 20634622318080.0, + "grad_norm": 2.1580493004205943, + "language_loss": 0.7321173, + "learning_rate": 3.753415784551761e-06, + "loss": 0.81063998, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23742676, + "step": 3076, + "time_per_iteration": 2.552551746368408 + }, + { + "auxiliary_loss_clip": 0.06574243, + "auxiliary_loss_mlp": 0.01280151, + "balance_loss_clip": 0.06304738, + "balance_loss_mlp": 0.01256309, + "epoch": 0.18499924845934165, + "flos": 14433750691200.0, + "grad_norm": 2.459416187119703, + "language_loss": 0.82324487, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.90178883, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.23864746, + "step": 3077, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.06560019, + "auxiliary_loss_mlp": 0.01280161, + "balance_loss_clip": 0.06302261, + "balance_loss_mlp": 0.01256748, + "epoch": 0.18505937171200962, + "flos": 23733611648640.0, + "grad_norm": 1.8347096473751274, + "language_loss": 0.79534197, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.87374371, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.23425293, + "step": 3078, + "time_per_iteration": 2.5838091373443604 + }, + { + "auxiliary_loss_clip": 0.0657796, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 0.06314268, + "balance_loss_mlp": 0.0126536, + "epoch": 0.18511949496467758, + "flos": 25964525479680.0, + "grad_norm": 2.3879568543100174, + "language_loss": 0.78543603, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86409843, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.22937012, + "step": 3079, + "time_per_iteration": 2.5836563110351562 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06311746, + "balance_loss_mlp": 0.01266921, + "epoch": 0.18517961821734555, + "flos": 42423506156160.0, + "grad_norm": 2.6792059094445393, + "language_loss": 0.82738018, + "learning_rate": 3.752665892369369e-06, + "loss": 0.90603304, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.23669434, + "step": 3080, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.06581488, + "auxiliary_loss_mlp": 0.01283912, + "balance_loss_clip": 0.06312552, + "balance_loss_mlp": 0.01258306, + "epoch": 0.18523974147001354, + "flos": 24104435892480.0, + "grad_norm": 2.0136248585759815, + "language_loss": 0.75280142, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.83145541, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.25622559, + "step": 3081, + "time_per_iteration": 2.558880567550659 + }, + { + "auxiliary_loss_clip": 0.06580579, + "auxiliary_loss_mlp": 0.01294641, + "balance_loss_clip": 0.06314941, + "balance_loss_mlp": 0.01267354, + "epoch": 0.1852998647226815, + "flos": 27381688784640.0, + "grad_norm": 2.2228183561660533, + "language_loss": 0.72592467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.80467689, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.27282715, + "step": 3082, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.06586821, + "auxiliary_loss_mlp": 0.01289587, + "balance_loss_clip": 0.06314754, + "balance_loss_mlp": 0.01263409, + "epoch": 0.18535998797534947, + "flos": 18338650191360.0, + "grad_norm": 1.9336985276158285, + "language_loss": 0.70667702, + "learning_rate": 3.752102775364407e-06, + "loss": 0.78544116, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26184082, + "step": 3083, + "time_per_iteration": 2.630099296569824 + }, + { + "auxiliary_loss_clip": 0.06573243, + "auxiliary_loss_mlp": 0.01286773, + "balance_loss_clip": 0.06312741, + "balance_loss_mlp": 0.01261548, + "epoch": 0.18542011122801744, + "flos": 37853881816320.0, + "grad_norm": 1.8745280868212635, + "language_loss": 0.69687432, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77547449, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25244141, + "step": 3084, + "time_per_iteration": 2.7246148586273193 + }, + { + "auxiliary_loss_clip": 0.06577612, + "auxiliary_loss_mlp": 0.01284469, + "balance_loss_clip": 0.06314437, + "balance_loss_mlp": 0.01261402, + "epoch": 0.1854802344806854, + "flos": 25192171670400.0, + "grad_norm": 1.5329506051970134, + "language_loss": 0.78209639, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.86071718, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.23071289, + "step": 3085, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.06579587, + "auxiliary_loss_mlp": 0.01287952, + "balance_loss_clip": 0.06314654, + "balance_loss_mlp": 0.01261964, + "epoch": 0.18554035773335337, + "flos": 26691541430400.0, + "grad_norm": 1.8306415954747441, + "language_loss": 0.74554545, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82422084, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.2598877, + "step": 3086, + "time_per_iteration": 2.5668296813964844 + }, + { + "auxiliary_loss_clip": 0.06581503, + "auxiliary_loss_mlp": 0.0129843, + "balance_loss_clip": 0.06316213, + "balance_loss_mlp": 0.01272026, + "epoch": 0.18560048098602133, + "flos": 22353568502400.0, + "grad_norm": 2.451797107788235, + "language_loss": 0.70597452, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.78477389, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.26391602, + "step": 3087, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.06584737, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 0.06317757, + "balance_loss_mlp": 0.01264543, + "epoch": 0.18566060423868933, + "flos": 17754245089920.0, + "grad_norm": 1.9281487675228464, + "language_loss": 0.73915106, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.81792164, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.27783203, + "step": 3088, + "time_per_iteration": 2.536055326461792 + }, + { + "auxiliary_loss_clip": 0.06578237, + "auxiliary_loss_mlp": 0.01288694, + "balance_loss_clip": 0.06316703, + "balance_loss_mlp": 0.0126571, + "epoch": 0.1857207274913573, + "flos": 24683558186880.0, + "grad_norm": 1.798814131108877, + "language_loss": 0.92793214, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.00660145, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2298584, + "step": 3089, + "time_per_iteration": 2.591520071029663 + }, + { + "auxiliary_loss_clip": 0.06581305, + "auxiliary_loss_mlp": 0.01284125, + "balance_loss_clip": 0.06316443, + "balance_loss_mlp": 0.01260772, + "epoch": 0.18578085074402526, + "flos": 28155426186240.0, + "grad_norm": 2.9732427277308724, + "language_loss": 0.59245396, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67110825, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.23352051, + "step": 3090, + "time_per_iteration": 2.587693452835083 + }, + { + "auxiliary_loss_clip": 0.06569171, + "auxiliary_loss_mlp": 0.01294048, + "balance_loss_clip": 0.06308332, + "balance_loss_mlp": 0.01269372, + "epoch": 0.18584097399669322, + "flos": 23958764369280.0, + "grad_norm": 1.6455413495288673, + "language_loss": 0.825216, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.90384817, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.24694824, + "step": 3091, + "time_per_iteration": 2.564748525619507 + }, + { + "auxiliary_loss_clip": 0.06580666, + "auxiliary_loss_mlp": 0.01293234, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01266758, + "epoch": 0.18590109724936119, + "flos": 17207379417600.0, + "grad_norm": 2.4797040605264904, + "language_loss": 0.8537268, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2644043, + "step": 3092, + "time_per_iteration": 2.514536142349243 + }, + { + "auxiliary_loss_clip": 0.06587748, + "auxiliary_loss_mlp": 0.01293739, + "balance_loss_clip": 0.06317791, + "balance_loss_mlp": 0.012665, + "epoch": 0.18596122050202915, + "flos": 17239761820800.0, + "grad_norm": 2.2590627268781316, + "language_loss": 0.93402261, + "learning_rate": 3.750221401168038e-06, + "loss": 1.01283741, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.27246094, + "step": 3093, + "time_per_iteration": 2.5037660598754883 + }, + { + "auxiliary_loss_clip": 0.06575991, + "auxiliary_loss_mlp": 0.01284238, + "balance_loss_clip": 0.06309767, + "balance_loss_mlp": 0.01258477, + "epoch": 0.18602134375469712, + "flos": 19025862652800.0, + "grad_norm": 1.8616717248352448, + "language_loss": 0.77931499, + "learning_rate": 3.750032898603443e-06, + "loss": 0.85791731, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.25744629, + "step": 3094, + "time_per_iteration": 2.529491662979126 + }, + { + "auxiliary_loss_clip": 0.06576168, + "auxiliary_loss_mlp": 0.0128492, + "balance_loss_clip": 0.06311647, + "balance_loss_mlp": 0.01260637, + "epoch": 0.1860814670073651, + "flos": 50961285429120.0, + "grad_norm": 1.6485050019084173, + "language_loss": 0.70511484, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7837258, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24291992, + "step": 3095, + "time_per_iteration": 4.124077558517456 + }, + { + "auxiliary_loss_clip": 0.0658177, + "auxiliary_loss_mlp": 0.01296881, + "balance_loss_clip": 0.06310082, + "balance_loss_mlp": 0.01268819, + "epoch": 0.18614159026003307, + "flos": 19397064240000.0, + "grad_norm": 1.9264485804072164, + "language_loss": 0.81302798, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89181447, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.28051758, + "step": 3096, + "time_per_iteration": 2.5277867317199707 + }, + { + "auxiliary_loss_clip": 0.06581111, + "auxiliary_loss_mlp": 0.01285017, + "balance_loss_clip": 0.06310429, + "balance_loss_mlp": 0.01259173, + "epoch": 0.18620171351270104, + "flos": 21805235383680.0, + "grad_norm": 1.9931413029080365, + "language_loss": 0.76143897, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84010023, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.25842285, + "step": 3097, + "time_per_iteration": 3.982475996017456 + }, + { + "auxiliary_loss_clip": 0.06569855, + "auxiliary_loss_mlp": 0.01288887, + "balance_loss_clip": 0.06308468, + "balance_loss_mlp": 0.01263877, + "epoch": 0.186261836765369, + "flos": 16368499866240.0, + "grad_norm": 2.207337076402474, + "language_loss": 0.67101508, + "learning_rate": 3.749278224802352e-06, + "loss": 0.74960256, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.25061035, + "step": 3098, + "time_per_iteration": 2.5570473670959473 + }, + { + "auxiliary_loss_clip": 0.06578363, + "auxiliary_loss_mlp": 0.01287977, + "balance_loss_clip": 0.06308189, + "balance_loss_mlp": 0.0126044, + "epoch": 0.18632196001803697, + "flos": 23377168379520.0, + "grad_norm": 1.559550653919394, + "language_loss": 0.70188725, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7805506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.2755127, + "step": 3099, + "time_per_iteration": 2.5704476833343506 + }, + { + "auxiliary_loss_clip": 0.0657559, + "auxiliary_loss_mlp": 0.01292152, + "balance_loss_clip": 0.06309687, + "balance_loss_mlp": 0.01266689, + "epoch": 0.18638208327070493, + "flos": 22498569192960.0, + "grad_norm": 1.5145032946618349, + "language_loss": 0.72489583, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.25463867, + "step": 3100, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.06585407, + "auxiliary_loss_mlp": 0.01287458, + "balance_loss_clip": 0.06314865, + "balance_loss_mlp": 0.01261971, + "epoch": 0.18644220652337293, + "flos": 29172317736960.0, + "grad_norm": 1.7314771672192502, + "language_loss": 0.80930734, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.88803601, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.25524902, + "step": 3101, + "time_per_iteration": 4.063347578048706 + }, + { + "auxiliary_loss_clip": 0.0657436, + "auxiliary_loss_mlp": 0.01283038, + "balance_loss_clip": 0.06309733, + "balance_loss_mlp": 0.01259494, + "epoch": 0.1865023297760409, + "flos": 24250736321280.0, + "grad_norm": 2.4348094857493834, + "language_loss": 0.77630436, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.85487837, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23535156, + "step": 3102, + "time_per_iteration": 3.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.06580452, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.0631346, + "balance_loss_mlp": 0.01259504, + "epoch": 0.18656245302870886, + "flos": 19133617403520.0, + "grad_norm": 4.261808326107292, + "language_loss": 0.77043533, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.8490808, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.24584961, + "step": 3103, + "time_per_iteration": 2.5497515201568604 + }, + { + "auxiliary_loss_clip": 0.06580411, + "auxiliary_loss_mlp": 0.01279736, + "balance_loss_clip": 0.06311087, + "balance_loss_mlp": 0.01255596, + "epoch": 0.18662257628137682, + "flos": 17791994154240.0, + "grad_norm": 1.8534126866214053, + "language_loss": 0.80155015, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.88015163, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24157715, + "step": 3104, + "time_per_iteration": 2.5436315536499023 + }, + { + "auxiliary_loss_clip": 0.06576735, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 0.06310537, + "balance_loss_mlp": 0.01262884, + "epoch": 0.1866826995340448, + "flos": 24031201824000.0, + "grad_norm": 1.9078675803700618, + "language_loss": 0.86523151, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94386911, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.24169922, + "step": 3105, + "time_per_iteration": 2.5862667560577393 + }, + { + "auxiliary_loss_clip": 0.06594124, + "auxiliary_loss_mlp": 0.01282565, + "balance_loss_clip": 0.06317551, + "balance_loss_mlp": 0.01257853, + "epoch": 0.18674282278671275, + "flos": 26148533045760.0, + "grad_norm": 3.6817594399013203, + "language_loss": 0.87727821, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.95604515, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.24719238, + "step": 3106, + "time_per_iteration": 2.6158018112182617 + }, + { + "auxiliary_loss_clip": 0.06587484, + "auxiliary_loss_mlp": 0.01282217, + "balance_loss_clip": 0.06311296, + "balance_loss_mlp": 0.01259078, + "epoch": 0.18680294603938072, + "flos": 19206893399040.0, + "grad_norm": 1.800292289422269, + "language_loss": 0.78916037, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86785746, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.23132324, + "step": 3107, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.06579127, + "auxiliary_loss_mlp": 0.01290711, + "balance_loss_clip": 0.06304596, + "balance_loss_mlp": 0.01264997, + "epoch": 0.1868630692920487, + "flos": 28551840652800.0, + "grad_norm": 3.3283393961991345, + "language_loss": 0.75120842, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.82990676, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.25720215, + "step": 3108, + "time_per_iteration": 2.5681068897247314 + }, + { + "auxiliary_loss_clip": 0.06588297, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 0.06313515, + "balance_loss_mlp": 0.01262451, + "epoch": 0.18692319254471668, + "flos": 17243702962560.0, + "grad_norm": 1.5585462553143232, + "language_loss": 0.7488178, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82757336, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.24816895, + "step": 3109, + "time_per_iteration": 2.5719470977783203 + }, + { + "auxiliary_loss_clip": 0.06585538, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06316088, + "balance_loss_mlp": 0.01256113, + "epoch": 0.18698331579738464, + "flos": 23191861075200.0, + "grad_norm": 1.4817620217833272, + "language_loss": 0.85173523, + "learning_rate": 3.747007837284772e-06, + "loss": 0.93041396, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.26220703, + "step": 3110, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.06572624, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305574, + "balance_loss_mlp": 0.01260142, + "epoch": 0.1870434390500526, + "flos": 25523192424960.0, + "grad_norm": 2.402854340329271, + "language_loss": 0.85246378, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.93103909, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.24755859, + "step": 3111, + "time_per_iteration": 2.58076810836792 + }, + { + "auxiliary_loss_clip": 0.06578258, + "auxiliary_loss_mlp": 0.0128217, + "balance_loss_clip": 0.06306738, + "balance_loss_mlp": 0.0125778, + "epoch": 0.18710356230272057, + "flos": 19506999196800.0, + "grad_norm": 1.9642208489694009, + "language_loss": 0.77830005, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85690439, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.24365234, + "step": 3112, + "time_per_iteration": 2.5625264644622803 + }, + { + "auxiliary_loss_clip": 0.06577107, + "auxiliary_loss_mlp": 0.01281729, + "balance_loss_clip": 0.06307282, + "balance_loss_mlp": 0.01258113, + "epoch": 0.18716368555538854, + "flos": 26768129662080.0, + "grad_norm": 2.238016316213089, + "language_loss": 0.65778387, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.73637217, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.23620605, + "step": 3113, + "time_per_iteration": 2.6080710887908936 + }, + { + "auxiliary_loss_clip": 0.06588607, + "auxiliary_loss_mlp": 0.01279317, + "balance_loss_clip": 0.06312529, + "balance_loss_mlp": 0.01254962, + "epoch": 0.1872238088080565, + "flos": 25196490155520.0, + "grad_norm": 2.335075222112074, + "language_loss": 0.82613724, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90481651, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2434082, + "step": 3114, + "time_per_iteration": 2.5988082885742188 + }, + { + "auxiliary_loss_clip": 0.06587939, + "auxiliary_loss_mlp": 0.01289131, + "balance_loss_clip": 0.06312289, + "balance_loss_mlp": 0.01262655, + "epoch": 0.1872839320607245, + "flos": 24141220634880.0, + "grad_norm": 2.589653310619875, + "language_loss": 0.58319235, + "learning_rate": 3.74605902628851e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.26464844, + "step": 3115, + "time_per_iteration": 2.597001552581787 + }, + { + "auxiliary_loss_clip": 0.06578196, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 0.06308471, + "balance_loss_mlp": 0.01261676, + "epoch": 0.18734405531339246, + "flos": 21179349711360.0, + "grad_norm": 2.089321408475999, + "language_loss": 0.7264486, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80507892, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.23168945, + "step": 3116, + "time_per_iteration": 2.559483051300049 + }, + { + "auxiliary_loss_clip": 0.06573902, + "auxiliary_loss_mlp": 0.01278215, + "balance_loss_clip": 0.06309307, + "balance_loss_mlp": 0.01256292, + "epoch": 0.18740417856606043, + "flos": 17243325619200.0, + "grad_norm": 2.0473943382883184, + "language_loss": 0.79514784, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.87366909, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.21936035, + "step": 3117, + "time_per_iteration": 2.5308892726898193 + }, + { + "auxiliary_loss_clip": 0.06575021, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 0.06310903, + "balance_loss_mlp": 0.01262426, + "epoch": 0.1874643018187284, + "flos": 32565626933760.0, + "grad_norm": 1.6927935343473184, + "language_loss": 0.84475845, + "learning_rate": 3.745488945104381e-06, + "loss": 0.92336977, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.23669434, + "step": 3118, + "time_per_iteration": 2.645819902420044 + }, + { + "auxiliary_loss_clip": 0.06577513, + "auxiliary_loss_mlp": 0.01281432, + "balance_loss_clip": 0.06306227, + "balance_loss_mlp": 0.01256184, + "epoch": 0.18752442507139636, + "flos": 23264843581440.0, + "grad_norm": 1.8564508885039195, + "language_loss": 0.77631271, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.85490215, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.25280762, + "step": 3119, + "time_per_iteration": 2.5282692909240723 + }, + { + "auxiliary_loss_clip": 0.06577515, + "auxiliary_loss_mlp": 0.01280917, + "balance_loss_clip": 0.06308109, + "balance_loss_mlp": 0.01257934, + "epoch": 0.18758454832406432, + "flos": 21767150903040.0, + "grad_norm": 1.872231122069903, + "language_loss": 0.83286214, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.91144645, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.22973633, + "step": 3120, + "time_per_iteration": 2.5557563304901123 + }, + { + "auxiliary_loss_clip": 0.06574757, + "auxiliary_loss_mlp": 0.01283184, + "balance_loss_clip": 0.06308539, + "balance_loss_mlp": 0.01260606, + "epoch": 0.1876446715767323, + "flos": 29577956152320.0, + "grad_norm": 1.9256466590755805, + "language_loss": 0.85764915, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93622863, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22595215, + "step": 3121, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.06579052, + "auxiliary_loss_mlp": 0.01278188, + "balance_loss_clip": 0.06313133, + "balance_loss_mlp": 0.01255037, + "epoch": 0.18770479482940028, + "flos": 30348465171840.0, + "grad_norm": 1.7101492266675271, + "language_loss": 0.71341884, + "learning_rate": 3.744727910244937e-06, + "loss": 0.79199123, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23156738, + "step": 3122, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.06583723, + "auxiliary_loss_mlp": 0.01279754, + "balance_loss_clip": 0.06317301, + "balance_loss_mlp": 0.01255602, + "epoch": 0.18776491808206824, + "flos": 14470619287680.0, + "grad_norm": 1.9121070999681127, + "language_loss": 0.71984768, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.79848242, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.24157715, + "step": 3123, + "time_per_iteration": 2.50598406791687 + }, + { + "auxiliary_loss_clip": 0.06582906, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.06318765, + "balance_loss_mlp": 0.01258251, + "epoch": 0.1878250413347362, + "flos": 24505420406400.0, + "grad_norm": 1.8100549345620827, + "language_loss": 0.74830985, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.8269366, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.21520996, + "step": 3124, + "time_per_iteration": 2.588963031768799 + }, + { + "auxiliary_loss_clip": 0.06579177, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 0.06309149, + "balance_loss_mlp": 0.01260177, + "epoch": 0.18788516458740417, + "flos": 39795632807040.0, + "grad_norm": 2.0156197395212225, + "language_loss": 0.81827998, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89691496, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.24133301, + "step": 3125, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.06689048, + "auxiliary_loss_mlp": 0.01323199, + "balance_loss_clip": 0.06516109, + "balance_loss_mlp": 0.01312268, + "epoch": 0.18794528784007214, + "flos": 64717844221440.0, + "grad_norm": 0.9517259918121469, + "language_loss": 0.63560247, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.715725, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.10949707, + "step": 3126, + "time_per_iteration": 3.246349811553955 + }, + { + "auxiliary_loss_clip": 0.06580469, + "auxiliary_loss_mlp": 0.01289138, + "balance_loss_clip": 0.06317941, + "balance_loss_mlp": 0.01265118, + "epoch": 0.1880054110927401, + "flos": 28629728622720.0, + "grad_norm": 1.7132867879725662, + "language_loss": 0.81907004, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.89776611, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.24035645, + "step": 3127, + "time_per_iteration": 2.6359355449676514 + }, + { + "auxiliary_loss_clip": 0.06571439, + "auxiliary_loss_mlp": 0.01288176, + "balance_loss_clip": 0.06401625, + "balance_loss_mlp": 0.01277983, + "epoch": 0.1880655343454081, + "flos": 64508959192320.0, + "grad_norm": 0.7555261261025208, + "language_loss": 0.61928779, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.69788396, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.10192871, + "step": 3128, + "time_per_iteration": 3.3078746795654297 + }, + { + "auxiliary_loss_clip": 0.06593472, + "auxiliary_loss_mlp": 0.01285866, + "balance_loss_clip": 0.06323253, + "balance_loss_mlp": 0.0126243, + "epoch": 0.18812565759807606, + "flos": 32132679287040.0, + "grad_norm": 2.3201362692378806, + "language_loss": 0.72451007, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.80330348, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.234375, + "step": 3129, + "time_per_iteration": 2.684316396713257 + }, + { + "auxiliary_loss_clip": 0.06599562, + "auxiliary_loss_mlp": 0.01283183, + "balance_loss_clip": 0.06331511, + "balance_loss_mlp": 0.01259294, + "epoch": 0.18818578085074403, + "flos": 20629674927360.0, + "grad_norm": 2.0063290669545024, + "language_loss": 0.85961545, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.93844295, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.23876953, + "step": 3130, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.0659353, + "auxiliary_loss_mlp": 0.0128556, + "balance_loss_clip": 0.06323448, + "balance_loss_mlp": 0.01262564, + "epoch": 0.188245904103412, + "flos": 28848131089920.0, + "grad_norm": 1.7743332045981155, + "language_loss": 0.77165318, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.85044408, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23010254, + "step": 3131, + "time_per_iteration": 2.6456139087677 + }, + { + "auxiliary_loss_clip": 0.06594209, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 0.06329745, + "balance_loss_mlp": 0.01266891, + "epoch": 0.18830602735607996, + "flos": 29427379165440.0, + "grad_norm": 1.8335043044334671, + "language_loss": 0.8226279, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90149951, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.26049805, + "step": 3132, + "time_per_iteration": 2.604219436645508 + }, + { + "auxiliary_loss_clip": 0.06612615, + "auxiliary_loss_mlp": 0.01296327, + "balance_loss_clip": 0.06340778, + "balance_loss_mlp": 0.01269886, + "epoch": 0.18836615060874792, + "flos": 28879884587520.0, + "grad_norm": 2.5361304129104476, + "language_loss": 0.80964118, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88873059, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.26452637, + "step": 3133, + "time_per_iteration": 2.6110780239105225 + }, + { + "auxiliary_loss_clip": 0.06596034, + "auxiliary_loss_mlp": 0.01290384, + "balance_loss_clip": 0.06326675, + "balance_loss_mlp": 0.01266709, + "epoch": 0.18842627386141592, + "flos": 22608294514560.0, + "grad_norm": 1.918700832470348, + "language_loss": 0.83331311, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.91217732, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.23669434, + "step": 3134, + "time_per_iteration": 3.9871177673339844 + }, + { + "auxiliary_loss_clip": 0.06585519, + "auxiliary_loss_mlp": 0.01303727, + "balance_loss_clip": 0.06320879, + "balance_loss_mlp": 0.01279873, + "epoch": 0.18848639711408388, + "flos": 24580834680960.0, + "grad_norm": 1.5688225209098985, + "language_loss": 0.83794045, + "learning_rate": 3.742247238639684e-06, + "loss": 0.91683292, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.23852539, + "step": 3135, + "time_per_iteration": 2.576728343963623 + }, + { + "auxiliary_loss_clip": 0.06580248, + "auxiliary_loss_mlp": 0.01300724, + "balance_loss_clip": 0.06314597, + "balance_loss_mlp": 0.01277049, + "epoch": 0.18854652036675185, + "flos": 34175350920960.0, + "grad_norm": 2.0171444284890674, + "language_loss": 0.79025453, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86906427, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.23681641, + "step": 3136, + "time_per_iteration": 4.059029817581177 + }, + { + "auxiliary_loss_clip": 0.06580447, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 0.06314041, + "balance_loss_mlp": 0.01272348, + "epoch": 0.1886066436194198, + "flos": 24205985441280.0, + "grad_norm": 1.848748774649379, + "language_loss": 0.82736617, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90613353, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23925781, + "step": 3137, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06589224, + "auxiliary_loss_mlp": 0.01291304, + "balance_loss_clip": 0.0632188, + "balance_loss_mlp": 0.0126745, + "epoch": 0.18866676687208778, + "flos": 21257405389440.0, + "grad_norm": 1.7037003999682347, + "language_loss": 0.81716311, + "learning_rate": 3.741673189793504e-06, + "loss": 0.89596832, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.23864746, + "step": 3138, + "time_per_iteration": 2.5536084175109863 + }, + { + "auxiliary_loss_clip": 0.06589679, + "auxiliary_loss_mlp": 0.01290101, + "balance_loss_clip": 0.06319093, + "balance_loss_mlp": 0.01265985, + "epoch": 0.18872689012475574, + "flos": 37318294517760.0, + "grad_norm": 2.1585183145570723, + "language_loss": 0.64404404, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72284186, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24133301, + "step": 3139, + "time_per_iteration": 2.7355217933654785 + }, + { + "auxiliary_loss_clip": 0.06586127, + "auxiliary_loss_mlp": 0.01305421, + "balance_loss_clip": 0.06318149, + "balance_loss_mlp": 0.01280554, + "epoch": 0.1887870133774237, + "flos": 21658641465600.0, + "grad_norm": 2.033663323673097, + "language_loss": 0.72120833, + "learning_rate": 3.741290160328514e-06, + "loss": 0.80012381, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.24865723, + "step": 3140, + "time_per_iteration": 2.556196928024292 + }, + { + "auxiliary_loss_clip": 0.06585391, + "auxiliary_loss_mlp": 0.01291018, + "balance_loss_clip": 0.06316558, + "balance_loss_mlp": 0.01264935, + "epoch": 0.1888471366300917, + "flos": 15930143631360.0, + "grad_norm": 2.3984250647338254, + "language_loss": 0.88684165, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.9656058, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26086426, + "step": 3141, + "time_per_iteration": 5.341757774353027 + }, + { + "auxiliary_loss_clip": 0.06587377, + "auxiliary_loss_mlp": 0.01281785, + "balance_loss_clip": 0.06315634, + "balance_loss_mlp": 0.01256358, + "epoch": 0.18890725988275966, + "flos": 18557933126400.0, + "grad_norm": 1.8324612256611552, + "language_loss": 0.7775296, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.85622126, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.25390625, + "step": 3142, + "time_per_iteration": 2.5836708545684814 + }, + { + "auxiliary_loss_clip": 0.06576081, + "auxiliary_loss_mlp": 0.01283372, + "balance_loss_clip": 0.06312332, + "balance_loss_mlp": 0.01261413, + "epoch": 0.18896738313542763, + "flos": 28848550360320.0, + "grad_norm": 1.9913316615923113, + "language_loss": 0.79816502, + "learning_rate": 3.740715120924971e-06, + "loss": 0.87675953, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.21972656, + "step": 3143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.06581955, + "auxiliary_loss_mlp": 0.01290595, + "balance_loss_clip": 0.0631283, + "balance_loss_mlp": 0.01266146, + "epoch": 0.1890275063880956, + "flos": 22418249454720.0, + "grad_norm": 2.17929571565749, + "language_loss": 0.72435296, + "learning_rate": 3.740523309097912e-06, + "loss": 0.80307841, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.24475098, + "step": 3144, + "time_per_iteration": 2.565488338470459 + }, + { + "auxiliary_loss_clip": 0.06576345, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 0.0630596, + "balance_loss_mlp": 0.012602, + "epoch": 0.18908762964076356, + "flos": 24250862102400.0, + "grad_norm": 2.4312750691575253, + "language_loss": 0.74294418, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.82155418, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.24438477, + "step": 3145, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0656594, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 0.063042, + "balance_loss_mlp": 0.01258385, + "epoch": 0.18914775289343153, + "flos": 16988599607040.0, + "grad_norm": 2.264042873648611, + "language_loss": 0.77487111, + "learning_rate": 3.740139487448616e-06, + "loss": 0.85334063, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.22631836, + "step": 3146, + "time_per_iteration": 2.5446579456329346 + }, + { + "auxiliary_loss_clip": 0.06567892, + "auxiliary_loss_mlp": 0.01282874, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01259342, + "epoch": 0.1892078761460995, + "flos": 21550257809280.0, + "grad_norm": 2.367888350934947, + "language_loss": 0.79622674, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87473428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.23535156, + "step": 3147, + "time_per_iteration": 2.5432369709014893 + }, + { + "auxiliary_loss_clip": 0.06564464, + "auxiliary_loss_mlp": 0.0128295, + "balance_loss_clip": 0.06297393, + "balance_loss_mlp": 0.01259096, + "epoch": 0.18926799939876748, + "flos": 23007979290240.0, + "grad_norm": 3.3066597325179443, + "language_loss": 0.67790151, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75637561, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.23864746, + "step": 3148, + "time_per_iteration": 2.5936107635498047 + }, + { + "auxiliary_loss_clip": 0.06566582, + "auxiliary_loss_mlp": 0.01281142, + "balance_loss_clip": 0.06297165, + "balance_loss_mlp": 0.01256693, + "epoch": 0.18932812265143545, + "flos": 22279537820160.0, + "grad_norm": 2.2349625482761843, + "language_loss": 0.76378185, + "learning_rate": 3.739563260095902e-06, + "loss": 0.84225905, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.24450684, + "step": 3149, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0656079, + "auxiliary_loss_mlp": 0.01279685, + "balance_loss_clip": 0.06300658, + "balance_loss_mlp": 0.01256785, + "epoch": 0.1893882459041034, + "flos": 18630328654080.0, + "grad_norm": 2.2856364952022687, + "language_loss": 0.81782246, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.89622724, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22912598, + "step": 3150, + "time_per_iteration": 2.568166494369507 + }, + { + "auxiliary_loss_clip": 0.06565347, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06297709, + "balance_loss_mlp": 0.01262617, + "epoch": 0.18944836915677138, + "flos": 22899553706880.0, + "grad_norm": 2.23925150788406, + "language_loss": 0.86091208, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93942523, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.2331543, + "step": 3151, + "time_per_iteration": 2.520254373550415 + }, + { + "auxiliary_loss_clip": 0.06570399, + "auxiliary_loss_mlp": 0.01289995, + "balance_loss_clip": 0.06303516, + "balance_loss_mlp": 0.01266297, + "epoch": 0.18950849240943934, + "flos": 26803698520320.0, + "grad_norm": 1.7542668261130185, + "language_loss": 0.75358492, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.83218884, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23706055, + "step": 3152, + "time_per_iteration": 2.612248182296753 + }, + { + "auxiliary_loss_clip": 0.06565326, + "auxiliary_loss_mlp": 0.01283167, + "balance_loss_clip": 0.06301029, + "balance_loss_mlp": 0.01258431, + "epoch": 0.1895686156621073, + "flos": 24977919980160.0, + "grad_norm": 1.8204901028243692, + "language_loss": 0.76455373, + "learning_rate": 3.738794033491209e-06, + "loss": 0.84303862, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.24755859, + "step": 3153, + "time_per_iteration": 2.5559494495391846 + }, + { + "auxiliary_loss_clip": 0.06567015, + "auxiliary_loss_mlp": 0.01280834, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01256599, + "epoch": 0.1896287389147753, + "flos": 21950990760960.0, + "grad_norm": 1.7894410743269322, + "language_loss": 0.80290896, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.88138747, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24255371, + "step": 3154, + "time_per_iteration": 2.554861545562744 + }, + { + "auxiliary_loss_clip": 0.06572987, + "auxiliary_loss_mlp": 0.01294065, + "balance_loss_clip": 0.06302256, + "balance_loss_mlp": 0.01267612, + "epoch": 0.18968886216744327, + "flos": 18183628938240.0, + "grad_norm": 2.9256856308256447, + "language_loss": 0.74259496, + "learning_rate": 3.738409024548223e-06, + "loss": 0.82126546, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.26452637, + "step": 3155, + "time_per_iteration": 2.473719358444214 + }, + { + "auxiliary_loss_clip": 0.06557501, + "auxiliary_loss_mlp": 0.01284077, + "balance_loss_clip": 0.06296935, + "balance_loss_mlp": 0.01260247, + "epoch": 0.18974898542011123, + "flos": 20418735473280.0, + "grad_norm": 2.585248701074102, + "language_loss": 0.74503541, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.82345116, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.23815918, + "step": 3156, + "time_per_iteration": 2.5825979709625244 + }, + { + "auxiliary_loss_clip": 0.06561351, + "auxiliary_loss_mlp": 0.01283032, + "balance_loss_clip": 0.06294506, + "balance_loss_mlp": 0.01259786, + "epoch": 0.1898091086727792, + "flos": 23991356407680.0, + "grad_norm": 1.7654819302184697, + "language_loss": 0.68914878, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.76759267, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.23266602, + "step": 3157, + "time_per_iteration": 2.614276170730591 + }, + { + "auxiliary_loss_clip": 0.06562739, + "auxiliary_loss_mlp": 0.01287461, + "balance_loss_clip": 0.06299365, + "balance_loss_mlp": 0.01263822, + "epoch": 0.18986923192544716, + "flos": 27644590569600.0, + "grad_norm": 1.6841569236878713, + "language_loss": 0.80553401, + "learning_rate": 3.737831016747176e-06, + "loss": 0.88403606, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23657227, + "step": 3158, + "time_per_iteration": 2.6667590141296387 + }, + { + "auxiliary_loss_clip": 0.06570458, + "auxiliary_loss_mlp": 0.01285173, + "balance_loss_clip": 0.06298561, + "balance_loss_mlp": 0.01260509, + "epoch": 0.18992935517811513, + "flos": 25491271219200.0, + "grad_norm": 2.1165299373469755, + "language_loss": 0.72984976, + "learning_rate": 3.737638215672964e-06, + "loss": 0.808406, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.2467041, + "step": 3159, + "time_per_iteration": 2.5685224533081055 + }, + { + "auxiliary_loss_clip": 0.06567825, + "auxiliary_loss_mlp": 0.01281428, + "balance_loss_clip": 0.06301159, + "balance_loss_mlp": 0.01257014, + "epoch": 0.1899894784307831, + "flos": 17426578498560.0, + "grad_norm": 1.8951112773112917, + "language_loss": 0.86019123, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.93868375, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.24438477, + "step": 3160, + "time_per_iteration": 2.533764362335205 + }, + { + "auxiliary_loss_clip": 0.06553883, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 0.06294671, + "balance_loss_mlp": 0.01264154, + "epoch": 0.19004960168345109, + "flos": 27499925295360.0, + "grad_norm": 1.7631570201415632, + "language_loss": 0.74244189, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.22570801, + "step": 3161, + "time_per_iteration": 2.590913772583008 + }, + { + "auxiliary_loss_clip": 0.06558438, + "auxiliary_loss_mlp": 0.01279623, + "balance_loss_clip": 0.06296802, + "balance_loss_mlp": 0.01255817, + "epoch": 0.19010972493611905, + "flos": 38663858908800.0, + "grad_norm": 1.9041337161295762, + "language_loss": 0.81525451, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.89363515, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23803711, + "step": 3162, + "time_per_iteration": 2.7009496688842773 + }, + { + "auxiliary_loss_clip": 0.06556226, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06291863, + "balance_loss_mlp": 0.012573, + "epoch": 0.19016984818878702, + "flos": 19250763811200.0, + "grad_norm": 2.198798501736265, + "language_loss": 0.77194953, + "learning_rate": 3.73686635253511e-06, + "loss": 0.8503288, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.2442627, + "step": 3163, + "time_per_iteration": 2.5443172454833984 + }, + { + "auxiliary_loss_clip": 0.06551848, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.06291605, + "balance_loss_mlp": 0.01256291, + "epoch": 0.19022997144145498, + "flos": 37605947984640.0, + "grad_norm": 1.6741633946121544, + "language_loss": 0.75098169, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82930118, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23815918, + "step": 3164, + "time_per_iteration": 2.6625473499298096 + }, + { + "auxiliary_loss_clip": 0.06555005, + "auxiliary_loss_mlp": 0.01280136, + "balance_loss_clip": 0.06294911, + "balance_loss_mlp": 0.01256759, + "epoch": 0.19029009469412295, + "flos": 61543874615040.0, + "grad_norm": 2.119573778415358, + "language_loss": 0.67527556, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.75362694, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23364258, + "step": 3165, + "time_per_iteration": 2.8877623081207275 + }, + { + "auxiliary_loss_clip": 0.06552027, + "auxiliary_loss_mlp": 0.01278943, + "balance_loss_clip": 0.06292567, + "balance_loss_mlp": 0.01254433, + "epoch": 0.1903502179467909, + "flos": 13960077160320.0, + "grad_norm": 2.3966036589645916, + "language_loss": 0.75069398, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.82900369, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.24523926, + "step": 3166, + "time_per_iteration": 2.505680799484253 + }, + { + "auxiliary_loss_clip": 0.06499279, + "auxiliary_loss_mlp": 0.0131955, + "balance_loss_clip": 0.06350935, + "balance_loss_mlp": 0.01311236, + "epoch": 0.1904103411994589, + "flos": 66920484499200.0, + "grad_norm": 0.8228799096925371, + "language_loss": 0.50405741, + "learning_rate": 3.736093435602968e-06, + "loss": 0.58224571, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08319092, + "step": 3167, + "time_per_iteration": 3.1767730712890625 + }, + { + "auxiliary_loss_clip": 0.06551085, + "auxiliary_loss_mlp": 0.0128493, + "balance_loss_clip": 0.06295685, + "balance_loss_mlp": 0.0126141, + "epoch": 0.19047046445212687, + "flos": 21915296121600.0, + "grad_norm": 1.8666443369688703, + "language_loss": 0.75258517, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.83094531, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23522949, + "step": 3168, + "time_per_iteration": 2.539647102355957 + }, + { + "auxiliary_loss_clip": 0.06476398, + "auxiliary_loss_mlp": 0.01306941, + "balance_loss_clip": 0.06328493, + "balance_loss_mlp": 0.01299204, + "epoch": 0.19053058770479483, + "flos": 59271549338880.0, + "grad_norm": 0.8502356895352512, + "language_loss": 0.60174263, + "learning_rate": 3.73570658211056e-06, + "loss": 0.67957604, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07727051, + "step": 3169, + "time_per_iteration": 3.0786385536193848 + }, + { + "auxiliary_loss_clip": 0.06569149, + "auxiliary_loss_mlp": 0.01284984, + "balance_loss_clip": 0.06301555, + "balance_loss_mlp": 0.01260057, + "epoch": 0.1905907109574628, + "flos": 23958093536640.0, + "grad_norm": 1.6203962411975037, + "language_loss": 0.79296863, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87151003, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.24926758, + "step": 3170, + "time_per_iteration": 2.5439629554748535 + }, + { + "auxiliary_loss_clip": 0.06568529, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06308423, + "balance_loss_mlp": 0.01258636, + "epoch": 0.19065083421013077, + "flos": 20818378321920.0, + "grad_norm": 3.266788836182488, + "language_loss": 0.78913432, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.86764443, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.23840332, + "step": 3171, + "time_per_iteration": 2.5944604873657227 + }, + { + "auxiliary_loss_clip": 0.06584235, + "auxiliary_loss_mlp": 0.01294559, + "balance_loss_clip": 0.06313154, + "balance_loss_mlp": 0.01269323, + "epoch": 0.19071095746279873, + "flos": 31293003121920.0, + "grad_norm": 1.9362395671252917, + "language_loss": 0.79769027, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.8764782, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.25256348, + "step": 3172, + "time_per_iteration": 2.6039323806762695 + }, + { + "auxiliary_loss_clip": 0.06578603, + "auxiliary_loss_mlp": 0.01291257, + "balance_loss_clip": 0.06316808, + "balance_loss_mlp": 0.0126738, + "epoch": 0.1907710807154667, + "flos": 14361397090560.0, + "grad_norm": 1.549568453685288, + "language_loss": 0.81519973, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.89389837, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2388916, + "step": 3173, + "time_per_iteration": 2.566249132156372 + }, + { + "auxiliary_loss_clip": 0.06577085, + "auxiliary_loss_mlp": 0.01291087, + "balance_loss_clip": 0.06311868, + "balance_loss_mlp": 0.01266077, + "epoch": 0.1908312039681347, + "flos": 26914388163840.0, + "grad_norm": 1.4831321875737526, + "language_loss": 0.79620194, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87488365, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.25012207, + "step": 3174, + "time_per_iteration": 4.032260179519653 + }, + { + "auxiliary_loss_clip": 0.06571774, + "auxiliary_loss_mlp": 0.01295417, + "balance_loss_clip": 0.06307514, + "balance_loss_mlp": 0.01271563, + "epoch": 0.19089132722080265, + "flos": 14498767059840.0, + "grad_norm": 1.9289574693520037, + "language_loss": 0.82161433, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.9002862, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.23864746, + "step": 3175, + "time_per_iteration": 3.92791748046875 + }, + { + "auxiliary_loss_clip": 0.06570717, + "auxiliary_loss_mlp": 0.01290773, + "balance_loss_clip": 0.06306395, + "balance_loss_mlp": 0.01265393, + "epoch": 0.19095145047347062, + "flos": 13957771173120.0, + "grad_norm": 2.497584127695701, + "language_loss": 0.86521202, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.94382691, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25390625, + "step": 3176, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.06573781, + "auxiliary_loss_mlp": 0.01293305, + "balance_loss_clip": 0.06304888, + "balance_loss_mlp": 0.01265148, + "epoch": 0.19101157372613858, + "flos": 25308940734720.0, + "grad_norm": 2.21127293150792, + "language_loss": 0.82911885, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.90778971, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 3177, + "time_per_iteration": 2.5615227222442627 + }, + { + "auxiliary_loss_clip": 0.06560818, + "auxiliary_loss_mlp": 0.01287183, + "balance_loss_clip": 0.06300267, + "balance_loss_mlp": 0.01263985, + "epoch": 0.19107169697880655, + "flos": 20564448923520.0, + "grad_norm": 2.02770964818788, + "language_loss": 0.75787783, + "learning_rate": 3.73396248424356e-06, + "loss": 0.83635783, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.23205566, + "step": 3178, + "time_per_iteration": 2.6215403079986572 + }, + { + "auxiliary_loss_clip": 0.06568342, + "auxiliary_loss_mlp": 0.01282871, + "balance_loss_clip": 0.06301986, + "balance_loss_mlp": 0.01260233, + "epoch": 0.19113182023147451, + "flos": 22169644790400.0, + "grad_norm": 1.6828125352275214, + "language_loss": 0.82549155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.90400362, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.22644043, + "step": 3179, + "time_per_iteration": 2.5675652027130127 + }, + { + "auxiliary_loss_clip": 0.06569887, + "auxiliary_loss_mlp": 0.01296491, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.012734, + "epoch": 0.19119194348414248, + "flos": 18586667877120.0, + "grad_norm": 2.5330173520749124, + "language_loss": 0.80732077, + "learning_rate": 3.733574183478691e-06, + "loss": 0.88598454, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.23095703, + "step": 3180, + "time_per_iteration": 3.945387601852417 + }, + { + "auxiliary_loss_clip": 0.06563538, + "auxiliary_loss_mlp": 0.01290582, + "balance_loss_clip": 0.06302621, + "balance_loss_mlp": 0.01266883, + "epoch": 0.19125206673681047, + "flos": 19032738687360.0, + "grad_norm": 2.1003445268953373, + "language_loss": 0.79773259, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87627381, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.23706055, + "step": 3181, + "time_per_iteration": 3.9274189472198486 + }, + { + "auxiliary_loss_clip": 0.06568001, + "auxiliary_loss_mlp": 0.01288302, + "balance_loss_clip": 0.06304715, + "balance_loss_mlp": 0.0126477, + "epoch": 0.19131218998947844, + "flos": 21696725946240.0, + "grad_norm": 2.2417902838655888, + "language_loss": 0.74386561, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82242858, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.23547363, + "step": 3182, + "time_per_iteration": 2.550570487976074 + }, + { + "auxiliary_loss_clip": 0.06570706, + "auxiliary_loss_mlp": 0.0129189, + "balance_loss_clip": 0.06306151, + "balance_loss_mlp": 0.01267166, + "epoch": 0.1913723132421464, + "flos": 18448459367040.0, + "grad_norm": 1.7754326163332461, + "language_loss": 0.66467738, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.7433033, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.24719238, + "step": 3183, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.06578184, + "auxiliary_loss_mlp": 0.01296721, + "balance_loss_clip": 0.06308434, + "balance_loss_mlp": 0.01268659, + "epoch": 0.19143243649481437, + "flos": 27167101678080.0, + "grad_norm": 1.7849918331200134, + "language_loss": 0.73866975, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.81741881, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.28076172, + "step": 3184, + "time_per_iteration": 2.7020864486694336 + }, + { + "auxiliary_loss_clip": 0.06571424, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 0.06304838, + "balance_loss_mlp": 0.01264049, + "epoch": 0.19149255974748233, + "flos": 21724244812800.0, + "grad_norm": 1.9651356872089878, + "language_loss": 0.89339554, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97201049, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26037598, + "step": 3185, + "time_per_iteration": 2.512737512588501 + }, + { + "auxiliary_loss_clip": 0.06568564, + "auxiliary_loss_mlp": 0.01286821, + "balance_loss_clip": 0.06304171, + "balance_loss_mlp": 0.01261429, + "epoch": 0.1915526830001503, + "flos": 22969433612160.0, + "grad_norm": 2.041503418641191, + "language_loss": 0.74291968, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.82147354, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.25390625, + "step": 3186, + "time_per_iteration": 2.577359676361084 + }, + { + "auxiliary_loss_clip": 0.06579722, + "auxiliary_loss_mlp": 0.01289876, + "balance_loss_clip": 0.06312623, + "balance_loss_mlp": 0.01264675, + "epoch": 0.1916128062528183, + "flos": 26147946067200.0, + "grad_norm": 1.9086459802632982, + "language_loss": 0.84205973, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.92075574, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.25231934, + "step": 3187, + "time_per_iteration": 2.575345039367676 + }, + { + "auxiliary_loss_clip": 0.06462009, + "auxiliary_loss_mlp": 0.01273815, + "balance_loss_clip": 0.06313258, + "balance_loss_mlp": 0.01266967, + "epoch": 0.19167292950548626, + "flos": 54943513119360.0, + "grad_norm": 0.8344019653061644, + "language_loss": 0.56017417, + "learning_rate": 3.732018351516544e-06, + "loss": 0.63753241, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.06866455, + "step": 3188, + "time_per_iteration": 3.186802387237549 + }, + { + "auxiliary_loss_clip": 0.06575608, + "auxiliary_loss_mlp": 0.01301201, + "balance_loss_clip": 0.06310253, + "balance_loss_mlp": 0.01276942, + "epoch": 0.19173305275815422, + "flos": 29943497589120.0, + "grad_norm": 2.242687399889932, + "language_loss": 0.70996517, + "learning_rate": 3.731823576891397e-06, + "loss": 0.78873324, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.24267578, + "step": 3189, + "time_per_iteration": 2.5879886150360107 + }, + { + "auxiliary_loss_clip": 0.0656148, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 0.06303851, + "balance_loss_mlp": 0.01263994, + "epoch": 0.1917931760108222, + "flos": 24759140169600.0, + "grad_norm": 2.034629185065424, + "language_loss": 0.74848962, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.82696253, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.21813965, + "step": 3190, + "time_per_iteration": 2.618912696838379 + }, + { + "auxiliary_loss_clip": 0.06566381, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 0.06306858, + "balance_loss_mlp": 0.01268614, + "epoch": 0.19185329926349015, + "flos": 18849527735040.0, + "grad_norm": 1.9370060266864375, + "language_loss": 0.84794742, + "learning_rate": 3.73143383063572e-06, + "loss": 0.92653841, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.24108887, + "step": 3191, + "time_per_iteration": 2.5354197025299072 + }, + { + "auxiliary_loss_clip": 0.06560425, + "auxiliary_loss_mlp": 0.01288793, + "balance_loss_clip": 0.06303156, + "balance_loss_mlp": 0.01265595, + "epoch": 0.19191342251615812, + "flos": 22092721142400.0, + "grad_norm": 1.810553957384375, + "language_loss": 0.90797645, + "learning_rate": 3.73123885901997e-06, + "loss": 0.98646855, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.23193359, + "step": 3192, + "time_per_iteration": 2.594034433364868 + }, + { + "auxiliary_loss_clip": 0.06575879, + "auxiliary_loss_mlp": 0.01297652, + "balance_loss_clip": 0.06307722, + "balance_loss_mlp": 0.01273727, + "epoch": 0.19197354576882608, + "flos": 22205465210880.0, + "grad_norm": 3.128458316309985, + "language_loss": 0.76021564, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.83895093, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.23925781, + "step": 3193, + "time_per_iteration": 2.5328986644744873 + }, + { + "auxiliary_loss_clip": 0.06572805, + "auxiliary_loss_mlp": 0.01303133, + "balance_loss_clip": 0.06305176, + "balance_loss_mlp": 0.01278504, + "epoch": 0.19203366902149407, + "flos": 24902505705600.0, + "grad_norm": 1.8726296466629722, + "language_loss": 0.75837868, + "learning_rate": 3.730848718849612e-06, + "loss": 0.83713806, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.24633789, + "step": 3194, + "time_per_iteration": 2.594693660736084 + }, + { + "auxiliary_loss_clip": 0.06443634, + "auxiliary_loss_mlp": 0.01272062, + "balance_loss_clip": 0.06298726, + "balance_loss_mlp": 0.01264749, + "epoch": 0.19209379227416204, + "flos": 68435256211200.0, + "grad_norm": 0.738426265798758, + "language_loss": 0.68323666, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.76039362, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07293701, + "step": 3195, + "time_per_iteration": 3.082646369934082 + }, + { + "auxiliary_loss_clip": 0.0656238, + "auxiliary_loss_mlp": 0.0129433, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01270488, + "epoch": 0.19215391552683, + "flos": 22061848112640.0, + "grad_norm": 2.817360442151248, + "language_loss": 0.74132156, + "learning_rate": 3.730458316143429e-06, + "loss": 0.81988871, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.23852539, + "step": 3196, + "time_per_iteration": 2.5596578121185303 + }, + { + "auxiliary_loss_clip": 0.0656443, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_clip": 0.06303307, + "balance_loss_mlp": 0.01272939, + "epoch": 0.19221403877949797, + "flos": 20309177859840.0, + "grad_norm": 2.156505210347581, + "language_loss": 0.84144557, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.92005599, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.23657227, + "step": 3197, + "time_per_iteration": 2.505884885787964 + }, + { + "auxiliary_loss_clip": 0.06563333, + "auxiliary_loss_mlp": 0.01294057, + "balance_loss_clip": 0.06297445, + "balance_loss_mlp": 0.0126894, + "epoch": 0.19227416203216594, + "flos": 23192028783360.0, + "grad_norm": 2.1973705189643042, + "language_loss": 0.8105517, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.88912559, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.25109863, + "step": 3198, + "time_per_iteration": 2.5759875774383545 + }, + { + "auxiliary_loss_clip": 0.06570526, + "auxiliary_loss_mlp": 0.01303751, + "balance_loss_clip": 0.06301676, + "balance_loss_mlp": 0.01279194, + "epoch": 0.1923342852848339, + "flos": 25783872076800.0, + "grad_norm": 2.3405078734196274, + "language_loss": 0.79434526, + "learning_rate": 3.729872219959029e-06, + "loss": 0.873088, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.24536133, + "step": 3199, + "time_per_iteration": 2.57918643951416 + }, + { + "auxiliary_loss_clip": 0.06561789, + "auxiliary_loss_mlp": 0.01291155, + "balance_loss_clip": 0.06299184, + "balance_loss_mlp": 0.01267694, + "epoch": 0.19239440853750187, + "flos": 17133977640960.0, + "grad_norm": 1.9996812909650197, + "language_loss": 0.84443569, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.92296517, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.23449707, + "step": 3200, + "time_per_iteration": 2.5089356899261475 + }, + { + "auxiliary_loss_clip": 0.06560853, + "auxiliary_loss_mlp": 0.01287978, + "balance_loss_clip": 0.06299884, + "balance_loss_mlp": 0.01265185, + "epoch": 0.19245453179016986, + "flos": 16440601904640.0, + "grad_norm": 1.9071909055640763, + "language_loss": 0.79753184, + "learning_rate": 3.729481161172443e-06, + "loss": 0.87602013, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.22790527, + "step": 3201, + "time_per_iteration": 2.5428295135498047 + }, + { + "auxiliary_loss_clip": 0.06563856, + "auxiliary_loss_mlp": 0.01287849, + "balance_loss_clip": 0.06298736, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19251465504283782, + "flos": 20236530769920.0, + "grad_norm": 3.4105372180153273, + "language_loss": 0.70024735, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.77876443, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 3202, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.06559525, + "auxiliary_loss_mlp": 0.01288531, + "balance_loss_clip": 0.06303041, + "balance_loss_mlp": 0.01265965, + "epoch": 0.1925747782955058, + "flos": 19470549870720.0, + "grad_norm": 1.8972638993856672, + "language_loss": 0.9187758, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9972564, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22570801, + "step": 3203, + "time_per_iteration": 2.52083420753479 + }, + { + "auxiliary_loss_clip": 0.06569508, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 0.06305829, + "balance_loss_mlp": 0.01263193, + "epoch": 0.19263490154817375, + "flos": 17791407175680.0, + "grad_norm": 2.3309919698880637, + "language_loss": 0.82672936, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.9053064, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.25012207, + "step": 3204, + "time_per_iteration": 2.552898645401001 + }, + { + "auxiliary_loss_clip": 0.06554051, + "auxiliary_loss_mlp": 0.01280623, + "balance_loss_clip": 0.06297573, + "balance_loss_mlp": 0.01257437, + "epoch": 0.19269502480084172, + "flos": 17462818189440.0, + "grad_norm": 2.4686415170818927, + "language_loss": 0.76927221, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.84761888, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23181152, + "step": 3205, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.06570686, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06303753, + "balance_loss_mlp": 0.01259913, + "epoch": 0.19275514805350968, + "flos": 21513305358720.0, + "grad_norm": 2.6796703276560034, + "language_loss": 0.84088528, + "learning_rate": 3.728502366649107e-06, + "loss": 0.91941041, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.21911621, + "step": 3206, + "time_per_iteration": 2.5875258445739746 + }, + { + "auxiliary_loss_clip": 0.06462742, + "auxiliary_loss_mlp": 0.01299031, + "balance_loss_clip": 0.06320498, + "balance_loss_mlp": 0.01291426, + "epoch": 0.19281527130617768, + "flos": 47711578602240.0, + "grad_norm": 0.8155276906071137, + "language_loss": 0.60688889, + "learning_rate": 3.728306411079786e-06, + "loss": 0.68450665, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07592773, + "step": 3207, + "time_per_iteration": 2.98170804977417 + }, + { + "auxiliary_loss_clip": 0.06570975, + "auxiliary_loss_mlp": 0.01284779, + "balance_loss_clip": 0.06306583, + "balance_loss_mlp": 0.01261426, + "epoch": 0.19287539455884564, + "flos": 11805961196160.0, + "grad_norm": 2.350100512422909, + "language_loss": 0.76272619, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.8412838, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 3208, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.06570548, + "auxiliary_loss_mlp": 0.01287656, + "balance_loss_clip": 0.06303693, + "balance_loss_mlp": 0.01263253, + "epoch": 0.1929355178115136, + "flos": 20637724919040.0, + "grad_norm": 2.572131519169912, + "language_loss": 0.61787575, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.69645774, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.24389648, + "step": 3209, + "time_per_iteration": 2.5720291137695312 + }, + { + "auxiliary_loss_clip": 0.06569174, + "auxiliary_loss_mlp": 0.0128696, + "balance_loss_clip": 0.06303342, + "balance_loss_mlp": 0.01262832, + "epoch": 0.19299564106418157, + "flos": 40817555602560.0, + "grad_norm": 2.1926342764258773, + "language_loss": 0.80817664, + "learning_rate": 3.727718151176243e-06, + "loss": 0.88673794, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.24133301, + "step": 3210, + "time_per_iteration": 2.6967084407806396 + }, + { + "auxiliary_loss_clip": 0.06562287, + "auxiliary_loss_mlp": 0.01281086, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01258913, + "epoch": 0.19305576431684954, + "flos": 11365718244480.0, + "grad_norm": 4.335018711819376, + "language_loss": 0.83798629, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.9164201, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.22167969, + "step": 3211, + "time_per_iteration": 2.522151470184326 + }, + { + "auxiliary_loss_clip": 0.06460443, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06318722, + "balance_loss_mlp": 0.01254787, + "epoch": 0.1931158875695175, + "flos": 54527476798080.0, + "grad_norm": 0.9401062048905866, + "language_loss": 0.63522434, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.71244872, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.07196045, + "step": 3212, + "time_per_iteration": 3.0072474479675293 + }, + { + "auxiliary_loss_clip": 0.06559554, + "auxiliary_loss_mlp": 0.01284587, + "balance_loss_clip": 0.06301133, + "balance_loss_mlp": 0.01260936, + "epoch": 0.19317601082218547, + "flos": 19834540007040.0, + "grad_norm": 1.629103353649286, + "language_loss": 0.7732501, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.85169148, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.23669434, + "step": 3213, + "time_per_iteration": 3.972214698791504 + }, + { + "auxiliary_loss_clip": 0.06571522, + "auxiliary_loss_mlp": 0.01282458, + "balance_loss_clip": 0.06306578, + "balance_loss_mlp": 0.01259749, + "epoch": 0.19323613407485346, + "flos": 13157143810560.0, + "grad_norm": 2.0451873974907864, + "language_loss": 0.71339387, + "learning_rate": 3.726932887459503e-06, + "loss": 0.79193366, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.22729492, + "step": 3214, + "time_per_iteration": 2.542698383331299 + }, + { + "auxiliary_loss_clip": 0.06565271, + "auxiliary_loss_mlp": 0.01287539, + "balance_loss_clip": 0.06303567, + "balance_loss_mlp": 0.01264365, + "epoch": 0.19329625732752143, + "flos": 14032388833920.0, + "grad_norm": 2.534528672768976, + "language_loss": 0.75987494, + "learning_rate": 3.72673640779803e-06, + "loss": 0.83840305, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.23205566, + "step": 3215, + "time_per_iteration": 3.8739888668060303 + }, + { + "auxiliary_loss_clip": 0.06557035, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06302097, + "balance_loss_mlp": 0.01257615, + "epoch": 0.1933563805801894, + "flos": 23448641512320.0, + "grad_norm": 2.010602658012729, + "language_loss": 0.88668227, + "learning_rate": 3.72653986265854e-06, + "loss": 0.96505201, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22338867, + "step": 3216, + "time_per_iteration": 2.5690455436706543 + }, + { + "auxiliary_loss_clip": 0.06557489, + "auxiliary_loss_mlp": 0.01281443, + "balance_loss_clip": 0.06301452, + "balance_loss_mlp": 0.01259019, + "epoch": 0.19341650383285736, + "flos": 20491550271360.0, + "grad_norm": 2.1677144094151823, + "language_loss": 0.80915409, + "learning_rate": 3.726343252048485e-06, + "loss": 0.88754338, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.2244873, + "step": 3217, + "time_per_iteration": 2.522089958190918 + }, + { + "auxiliary_loss_clip": 0.06573136, + "auxiliary_loss_mlp": 0.01282755, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01257709, + "epoch": 0.19347662708552532, + "flos": 17864305827840.0, + "grad_norm": 3.8111547770960907, + "language_loss": 0.63612419, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.71468312, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.25048828, + "step": 3218, + "time_per_iteration": 2.511009693145752 + }, + { + "auxiliary_loss_clip": 0.06568655, + "auxiliary_loss_mlp": 0.01286799, + "balance_loss_clip": 0.06304532, + "balance_loss_mlp": 0.01262945, + "epoch": 0.1935367503381933, + "flos": 18193188303360.0, + "grad_norm": 1.6615722636986479, + "language_loss": 0.80769217, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.88624674, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.23852539, + "step": 3219, + "time_per_iteration": 2.49652099609375 + }, + { + "auxiliary_loss_clip": 0.06560229, + "auxiliary_loss_mlp": 0.01283688, + "balance_loss_clip": 0.06305727, + "balance_loss_mlp": 0.01262183, + "epoch": 0.19359687359086128, + "flos": 15961939056000.0, + "grad_norm": 2.4004031272371096, + "language_loss": 0.87055713, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.94899631, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.21520996, + "step": 3220, + "time_per_iteration": 3.9898974895477295 + }, + { + "auxiliary_loss_clip": 0.06557765, + "auxiliary_loss_mlp": 0.01279498, + "balance_loss_clip": 0.06308522, + "balance_loss_mlp": 0.0125829, + "epoch": 0.19365699684352924, + "flos": 21221584968960.0, + "grad_norm": 2.3273733740868296, + "language_loss": 0.84724689, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92561948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.21191406, + "step": 3221, + "time_per_iteration": 2.546876907348633 + }, + { + "auxiliary_loss_clip": 0.06557351, + "auxiliary_loss_mlp": 0.01282697, + "balance_loss_clip": 0.06305219, + "balance_loss_mlp": 0.01260333, + "epoch": 0.1937171200961972, + "flos": 17316811249920.0, + "grad_norm": 2.1420374809622507, + "language_loss": 0.8628484, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.94124895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22351074, + "step": 3222, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.06565784, + "auxiliary_loss_mlp": 0.0127706, + "balance_loss_clip": 0.06304947, + "balance_loss_mlp": 0.01255114, + "epoch": 0.19377724334886517, + "flos": 22642228218240.0, + "grad_norm": 2.292443034833117, + "language_loss": 0.7909472, + "learning_rate": 3.72516221392398e-06, + "loss": 0.86937559, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.21948242, + "step": 3223, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.06563858, + "auxiliary_loss_mlp": 0.01278148, + "balance_loss_clip": 0.06308811, + "balance_loss_mlp": 0.01256452, + "epoch": 0.19383736660153314, + "flos": 15081872423040.0, + "grad_norm": 2.2027436227921977, + "language_loss": 0.76066363, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.83908367, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 3224, + "time_per_iteration": 2.4926822185516357 + }, + { + "auxiliary_loss_clip": 0.06569614, + "auxiliary_loss_mlp": 0.01280842, + "balance_loss_clip": 0.06311695, + "balance_loss_mlp": 0.01257155, + "epoch": 0.1938974898542011, + "flos": 47130626246400.0, + "grad_norm": 2.47304361876348, + "language_loss": 0.71419585, + "learning_rate": 3.7247680111229e-06, + "loss": 0.79270041, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.23681641, + "step": 3225, + "time_per_iteration": 2.8417437076568604 + }, + { + "auxiliary_loss_clip": 0.0656653, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06306545, + "balance_loss_mlp": 0.01255076, + "epoch": 0.19395761310686907, + "flos": 25819734424320.0, + "grad_norm": 2.3579945849430235, + "language_loss": 0.6987173, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.77715963, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.22619629, + "step": 3226, + "time_per_iteration": 2.5816895961761475 + }, + { + "auxiliary_loss_clip": 0.06556038, + "auxiliary_loss_mlp": 0.01279426, + "balance_loss_clip": 0.06305292, + "balance_loss_mlp": 0.01255608, + "epoch": 0.19401773635953706, + "flos": 23046315333120.0, + "grad_norm": 1.6993594132957168, + "language_loss": 0.76826584, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.84662044, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23815918, + "step": 3227, + "time_per_iteration": 2.5873494148254395 + }, + { + "auxiliary_loss_clip": 0.06565821, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 0.063063, + "balance_loss_mlp": 0.01257187, + "epoch": 0.19407785961220503, + "flos": 15925615511040.0, + "grad_norm": 1.984580707337323, + "language_loss": 0.70403302, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78248763, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.22460938, + "step": 3228, + "time_per_iteration": 2.5275485515594482 + }, + { + "auxiliary_loss_clip": 0.06563079, + "auxiliary_loss_mlp": 0.01279835, + "balance_loss_clip": 0.06306829, + "balance_loss_mlp": 0.01257662, + "epoch": 0.194137982864873, + "flos": 25928872767360.0, + "grad_norm": 1.8334459249779138, + "language_loss": 0.74913502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82756412, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.2220459, + "step": 3229, + "time_per_iteration": 2.588756561279297 + }, + { + "auxiliary_loss_clip": 0.06561101, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 0.06304256, + "balance_loss_mlp": 0.01260126, + "epoch": 0.19419810611754096, + "flos": 13266407934720.0, + "grad_norm": 2.0512138922716034, + "language_loss": 0.66050041, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.73894763, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23486328, + "step": 3230, + "time_per_iteration": 2.51173996925354 + }, + { + "auxiliary_loss_clip": 0.06559683, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06306173, + "balance_loss_mlp": 0.01259444, + "epoch": 0.19425822937020892, + "flos": 15710986477440.0, + "grad_norm": 1.9323382078744304, + "language_loss": 0.82361978, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90203679, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.22583008, + "step": 3231, + "time_per_iteration": 2.5331170558929443 + }, + { + "auxiliary_loss_clip": 0.06565376, + "auxiliary_loss_mlp": 0.01284277, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01260793, + "epoch": 0.1943183526228769, + "flos": 23110912431360.0, + "grad_norm": 1.7851653331870696, + "language_loss": 0.8806898, + "learning_rate": 3.72338624150555e-06, + "loss": 0.95918632, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.23474121, + "step": 3232, + "time_per_iteration": 2.556128740310669 + }, + { + "auxiliary_loss_clip": 0.06561054, + "auxiliary_loss_mlp": 0.01288213, + "balance_loss_clip": 0.06308518, + "balance_loss_mlp": 0.01265718, + "epoch": 0.19437847587554485, + "flos": 24718707774720.0, + "grad_norm": 1.9425002506843316, + "language_loss": 0.8592729, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93776554, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22497559, + "step": 3233, + "time_per_iteration": 2.5888071060180664 + }, + { + "auxiliary_loss_clip": 0.06570844, + "auxiliary_loss_mlp": 0.01287681, + "balance_loss_clip": 0.06309654, + "balance_loss_mlp": 0.01263195, + "epoch": 0.19443859912821285, + "flos": 23123448616320.0, + "grad_norm": 2.322933236090491, + "language_loss": 0.8952834, + "learning_rate": 3.722990861915158e-06, + "loss": 0.97386861, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.24499512, + "step": 3234, + "time_per_iteration": 2.598424196243286 + }, + { + "auxiliary_loss_clip": 0.0656711, + "auxiliary_loss_mlp": 0.01279524, + "balance_loss_clip": 0.06307149, + "balance_loss_mlp": 0.01256243, + "epoch": 0.1944987223808808, + "flos": 15089545071360.0, + "grad_norm": 2.0762312051619993, + "language_loss": 0.7883603, + "learning_rate": 3.722793074112234e-06, + "loss": 0.86682659, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.23291016, + "step": 3235, + "time_per_iteration": 2.518150806427002 + }, + { + "auxiliary_loss_clip": 0.06562902, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06309078, + "balance_loss_mlp": 0.01257253, + "epoch": 0.19455884563354878, + "flos": 17132258632320.0, + "grad_norm": 2.012702835830896, + "language_loss": 0.79693586, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.87535232, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.21520996, + "step": 3236, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.06562862, + "auxiliary_loss_mlp": 0.01279358, + "balance_loss_clip": 0.06309117, + "balance_loss_mlp": 0.0125635, + "epoch": 0.19461896888621674, + "flos": 20199578319360.0, + "grad_norm": 1.7644130728207734, + "language_loss": 0.76505381, + "learning_rate": 3.72239730252843e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23010254, + "step": 3237, + "time_per_iteration": 2.545138359069824 + }, + { + "auxiliary_loss_clip": 0.06572011, + "auxiliary_loss_mlp": 0.01287724, + "balance_loss_clip": 0.06309787, + "balance_loss_mlp": 0.01264455, + "epoch": 0.1946790921388847, + "flos": 25308395683200.0, + "grad_norm": 3.0171180207385855, + "language_loss": 0.75939953, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.8379969, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.23291016, + "step": 3238, + "time_per_iteration": 2.6292033195495605 + }, + { + "auxiliary_loss_clip": 0.06564013, + "auxiliary_loss_mlp": 0.01283016, + "balance_loss_clip": 0.0631004, + "balance_loss_mlp": 0.0126033, + "epoch": 0.19473921539155267, + "flos": 20199578319360.0, + "grad_norm": 5.2039179549819, + "language_loss": 0.740753, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.81922328, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.22692871, + "step": 3239, + "time_per_iteration": 2.5251026153564453 + }, + { + "auxiliary_loss_clip": 0.06561047, + "auxiliary_loss_mlp": 0.01279887, + "balance_loss_clip": 0.06308049, + "balance_loss_mlp": 0.01257464, + "epoch": 0.19479933864422067, + "flos": 20894002231680.0, + "grad_norm": 2.589752485587752, + "language_loss": 0.74076676, + "learning_rate": 3.721803155320412e-06, + "loss": 0.8191762, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.22424316, + "step": 3240, + "time_per_iteration": 2.5630886554718018 + }, + { + "auxiliary_loss_clip": 0.06569096, + "auxiliary_loss_mlp": 0.01285658, + "balance_loss_clip": 0.06312588, + "balance_loss_mlp": 0.01262758, + "epoch": 0.19485946189688863, + "flos": 23301837959040.0, + "grad_norm": 2.269188581778515, + "language_loss": 0.67009896, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.7486465, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.22888184, + "step": 3241, + "time_per_iteration": 2.5366311073303223 + }, + { + "auxiliary_loss_clip": 0.0657091, + "auxiliary_loss_mlp": 0.01284859, + "balance_loss_clip": 0.06315701, + "balance_loss_mlp": 0.01261017, + "epoch": 0.1949195851495566, + "flos": 23301796032000.0, + "grad_norm": 1.7252715969085026, + "language_loss": 0.8313868, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90994453, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23828125, + "step": 3242, + "time_per_iteration": 2.5582659244537354 + }, + { + "auxiliary_loss_clip": 0.06462191, + "auxiliary_loss_mlp": 0.01271622, + "balance_loss_clip": 0.06317475, + "balance_loss_mlp": 0.01264684, + "epoch": 0.19497970840222456, + "flos": 64982884285440.0, + "grad_norm": 0.8039225971535554, + "language_loss": 0.57435864, + "learning_rate": 3.721208420493875e-06, + "loss": 0.6516968, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.06951904, + "step": 3243, + "time_per_iteration": 3.1517677307128906 + }, + { + "auxiliary_loss_clip": 0.06582105, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 0.06324299, + "balance_loss_mlp": 0.01264619, + "epoch": 0.19503983165489253, + "flos": 19650574368000.0, + "grad_norm": 1.7327160710810887, + "language_loss": 0.83662367, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.91534185, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25085449, + "step": 3244, + "time_per_iteration": 2.580615282058716 + }, + { + "auxiliary_loss_clip": 0.06580628, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 0.06321178, + "balance_loss_mlp": 0.01264206, + "epoch": 0.1950999549075605, + "flos": 21148308973440.0, + "grad_norm": 1.8443508562563502, + "language_loss": 0.77383208, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.85251331, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23278809, + "step": 3245, + "time_per_iteration": 2.562547206878662 + }, + { + "auxiliary_loss_clip": 0.06574707, + "auxiliary_loss_mlp": 0.01284069, + "balance_loss_clip": 0.06316134, + "balance_loss_mlp": 0.01260168, + "epoch": 0.19516007816022846, + "flos": 20890815776640.0, + "grad_norm": 1.9180190042930891, + "language_loss": 0.84645605, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.92504388, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.2388916, + "step": 3246, + "time_per_iteration": 2.5781290531158447 + }, + { + "auxiliary_loss_clip": 0.06585012, + "auxiliary_loss_mlp": 0.01283635, + "balance_loss_clip": 0.0632351, + "balance_loss_mlp": 0.012612, + "epoch": 0.19522020141289645, + "flos": 16916287933440.0, + "grad_norm": 2.4019655481348177, + "language_loss": 0.77056623, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.8492527, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.22436523, + "step": 3247, + "time_per_iteration": 2.5042033195495605 + }, + { + "auxiliary_loss_clip": 0.06582692, + "auxiliary_loss_mlp": 0.01287787, + "balance_loss_clip": 0.06325091, + "balance_loss_mlp": 0.01264136, + "epoch": 0.19528032466556441, + "flos": 26732183460480.0, + "grad_norm": 1.5912411640106108, + "language_loss": 0.75763261, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83633739, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.23669434, + "step": 3248, + "time_per_iteration": 2.629751205444336 + }, + { + "auxiliary_loss_clip": 0.0657216, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 0.06312956, + "balance_loss_mlp": 0.01263994, + "epoch": 0.19534044791823238, + "flos": 21039170630400.0, + "grad_norm": 2.0257715109614822, + "language_loss": 0.79102194, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.86960697, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.22375488, + "step": 3249, + "time_per_iteration": 2.5774686336517334 + }, + { + "auxiliary_loss_clip": 0.06565905, + "auxiliary_loss_mlp": 0.01285899, + "balance_loss_clip": 0.06309386, + "balance_loss_mlp": 0.01263035, + "epoch": 0.19540057117090034, + "flos": 22350256266240.0, + "grad_norm": 1.6645797480066, + "language_loss": 0.73634374, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.81486177, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.2286377, + "step": 3250, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.06557436, + "auxiliary_loss_mlp": 0.01284202, + "balance_loss_clip": 0.06304777, + "balance_loss_mlp": 0.01261791, + "epoch": 0.1954606944235683, + "flos": 20307626559360.0, + "grad_norm": 5.203824713813235, + "language_loss": 0.80619103, + "learning_rate": 3.719619589699017e-06, + "loss": 0.88460743, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.22399902, + "step": 3251, + "time_per_iteration": 2.5159976482391357 + }, + { + "auxiliary_loss_clip": 0.06569009, + "auxiliary_loss_mlp": 0.0128766, + "balance_loss_clip": 0.06309755, + "balance_loss_mlp": 0.01264593, + "epoch": 0.19552081767623627, + "flos": 17352463962240.0, + "grad_norm": 2.6280610562746882, + "language_loss": 0.84652966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.92509639, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.23071289, + "step": 3252, + "time_per_iteration": 2.584712505340576 + }, + { + "auxiliary_loss_clip": 0.0657175, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06305347, + "balance_loss_mlp": 0.01258559, + "epoch": 0.19558094092890424, + "flos": 31985666098560.0, + "grad_norm": 1.8259798075239808, + "language_loss": 0.74205744, + "learning_rate": 3.719221729768117e-06, + "loss": 0.82062161, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.26098633, + "step": 3253, + "time_per_iteration": 4.126874685287476 + }, + { + "auxiliary_loss_clip": 0.06567718, + "auxiliary_loss_mlp": 0.01281159, + "balance_loss_clip": 0.06301166, + "balance_loss_mlp": 0.0125721, + "epoch": 0.19564106418157223, + "flos": 22274716210560.0, + "grad_norm": 1.973936337746025, + "language_loss": 0.77398765, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.85247642, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.23962402, + "step": 3254, + "time_per_iteration": 2.6537773609161377 + }, + { + "auxiliary_loss_clip": 0.06437294, + "auxiliary_loss_mlp": 0.01260118, + "balance_loss_clip": 0.06291844, + "balance_loss_mlp": 0.01253204, + "epoch": 0.1957011874342402, + "flos": 54379876631040.0, + "grad_norm": 0.7412950515810539, + "language_loss": 0.55013955, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.62711358, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06933594, + "step": 3255, + "time_per_iteration": 4.54949426651001 + }, + { + "auxiliary_loss_clip": 0.06563026, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 0.06301506, + "balance_loss_mlp": 0.01265407, + "epoch": 0.19576131068690816, + "flos": 16511991183360.0, + "grad_norm": 2.710710922193229, + "language_loss": 0.71672189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.79524636, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2401123, + "step": 3256, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06557887, + "auxiliary_loss_mlp": 0.01283051, + "balance_loss_clip": 0.06298412, + "balance_loss_mlp": 0.01259591, + "epoch": 0.19582143393957613, + "flos": 14724800248320.0, + "grad_norm": 2.2116868908222176, + "language_loss": 0.8133806, + "learning_rate": 3.718425227649987e-06, + "loss": 0.89178997, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23461914, + "step": 3257, + "time_per_iteration": 2.546842336654663 + }, + { + "auxiliary_loss_clip": 0.06568147, + "auxiliary_loss_mlp": 0.01289159, + "balance_loss_clip": 0.06309533, + "balance_loss_mlp": 0.01264554, + "epoch": 0.1958815571922441, + "flos": 24432354046080.0, + "grad_norm": 4.3707104143190785, + "language_loss": 0.76246595, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.841039, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24609375, + "step": 3258, + "time_per_iteration": 2.596585273742676 + }, + { + "auxiliary_loss_clip": 0.06562606, + "auxiliary_loss_mlp": 0.01282027, + "balance_loss_clip": 0.06300102, + "balance_loss_mlp": 0.01257828, + "epoch": 0.19594168044491206, + "flos": 24907285388160.0, + "grad_norm": 1.9490064747675282, + "language_loss": 0.74507892, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.82352525, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.24230957, + "step": 3259, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.06562422, + "auxiliary_loss_mlp": 0.01289683, + "balance_loss_clip": 0.06298189, + "balance_loss_mlp": 0.01263302, + "epoch": 0.19600180369758005, + "flos": 12061819238400.0, + "grad_norm": 2.2810085679716106, + "language_loss": 0.7772423, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.85576332, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.26379395, + "step": 3260, + "time_per_iteration": 5.330974340438843 + }, + { + "auxiliary_loss_clip": 0.06562512, + "auxiliary_loss_mlp": 0.01290293, + "balance_loss_clip": 0.06300309, + "balance_loss_mlp": 0.01266046, + "epoch": 0.19606192695024802, + "flos": 20856504729600.0, + "grad_norm": 2.085882514659535, + "language_loss": 0.83190846, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.91043651, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.24279785, + "step": 3261, + "time_per_iteration": 2.5832743644714355 + }, + { + "auxiliary_loss_clip": 0.06565593, + "auxiliary_loss_mlp": 0.01288067, + "balance_loss_clip": 0.06304751, + "balance_loss_mlp": 0.01263296, + "epoch": 0.19612205020291598, + "flos": 28483050850560.0, + "grad_norm": 1.7951789750723233, + "language_loss": 0.77451867, + "learning_rate": 3.717428133894807e-06, + "loss": 0.85305524, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.24768066, + "step": 3262, + "time_per_iteration": 2.5895204544067383 + }, + { + "auxiliary_loss_clip": 0.06560683, + "auxiliary_loss_mlp": 0.01286928, + "balance_loss_clip": 0.06303811, + "balance_loss_mlp": 0.01264004, + "epoch": 0.19618217345558395, + "flos": 25563666746880.0, + "grad_norm": 1.6758780497522678, + "language_loss": 0.87025416, + "learning_rate": 3.71722851973837e-06, + "loss": 0.94873023, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.22937012, + "step": 3263, + "time_per_iteration": 2.5864033699035645 + }, + { + "auxiliary_loss_clip": 0.0656628, + "auxiliary_loss_mlp": 0.01296773, + "balance_loss_clip": 0.06306224, + "balance_loss_mlp": 0.0127137, + "epoch": 0.1962422967082519, + "flos": 25271359378560.0, + "grad_norm": 1.67172611639437, + "language_loss": 0.74829996, + "learning_rate": 3.717028840464455e-06, + "loss": 0.82693052, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25390625, + "step": 3264, + "time_per_iteration": 2.5601091384887695 + }, + { + "auxiliary_loss_clip": 0.06569743, + "auxiliary_loss_mlp": 0.01288835, + "balance_loss_clip": 0.0631538, + "balance_loss_mlp": 0.01264337, + "epoch": 0.19630241996091988, + "flos": 18813371898240.0, + "grad_norm": 2.189524829184907, + "language_loss": 0.7983582, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.87694395, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.24511719, + "step": 3265, + "time_per_iteration": 2.540691614151001 + }, + { + "auxiliary_loss_clip": 0.06455089, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.06313262, + "balance_loss_mlp": 0.01257317, + "epoch": 0.19636254321358784, + "flos": 62338240120320.0, + "grad_norm": 0.7691014679533006, + "language_loss": 0.53069305, + "learning_rate": 3.716629286594483e-06, + "loss": 0.60790235, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08526611, + "step": 3266, + "time_per_iteration": 3.1712465286254883 + }, + { + "auxiliary_loss_clip": 0.06579427, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 0.06317084, + "balance_loss_mlp": 0.01276138, + "epoch": 0.19642266646625584, + "flos": 21075703810560.0, + "grad_norm": 2.1807082930425548, + "language_loss": 0.8080219, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.88682246, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.24487305, + "step": 3267, + "time_per_iteration": 2.551907539367676 + }, + { + "auxiliary_loss_clip": 0.06564153, + "auxiliary_loss_mlp": 0.0128147, + "balance_loss_clip": 0.06308893, + "balance_loss_mlp": 0.01257366, + "epoch": 0.1964827897189238, + "flos": 14543979137280.0, + "grad_norm": 2.1592598522148694, + "language_loss": 0.8731035, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.95155978, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24108887, + "step": 3268, + "time_per_iteration": 2.520824909210205 + }, + { + "auxiliary_loss_clip": 0.06570253, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 0.0631839, + "balance_loss_mlp": 0.01268858, + "epoch": 0.19654291297159177, + "flos": 19250638030080.0, + "grad_norm": 2.3684809338902215, + "language_loss": 0.70127171, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.77988833, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.22546387, + "step": 3269, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.06579614, + "auxiliary_loss_mlp": 0.01289007, + "balance_loss_clip": 0.06318989, + "balance_loss_mlp": 0.01263008, + "epoch": 0.19660303622425973, + "flos": 25782823900800.0, + "grad_norm": 3.1056086534351324, + "language_loss": 0.80997849, + "learning_rate": 3.715829397778135e-06, + "loss": 0.88866472, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25976562, + "step": 3270, + "time_per_iteration": 2.5732779502868652 + }, + { + "auxiliary_loss_clip": 0.0656828, + "auxiliary_loss_mlp": 0.0128367, + "balance_loss_clip": 0.06310552, + "balance_loss_mlp": 0.01257468, + "epoch": 0.1966631594769277, + "flos": 20601401374080.0, + "grad_norm": 4.117702501056874, + "language_loss": 0.84620351, + "learning_rate": 3.715629262894028e-06, + "loss": 0.92472303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26220703, + "step": 3271, + "time_per_iteration": 2.54874587059021 + }, + { + "auxiliary_loss_clip": 0.06565209, + "auxiliary_loss_mlp": 0.01287963, + "balance_loss_clip": 0.06316341, + "balance_loss_mlp": 0.01263311, + "epoch": 0.19672328272959566, + "flos": 23629965747840.0, + "grad_norm": 1.9724475535226151, + "language_loss": 0.8064115, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88494325, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.2467041, + "step": 3272, + "time_per_iteration": 2.5446958541870117 + }, + { + "auxiliary_loss_clip": 0.06582461, + "auxiliary_loss_mlp": 0.01289002, + "balance_loss_clip": 0.06322335, + "balance_loss_mlp": 0.0126218, + "epoch": 0.19678340598226365, + "flos": 23117369195520.0, + "grad_norm": 1.7276133269560208, + "language_loss": 0.81592834, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.89464301, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.26831055, + "step": 3273, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.06569564, + "auxiliary_loss_mlp": 0.01284595, + "balance_loss_clip": 0.06313652, + "balance_loss_mlp": 0.0126142, + "epoch": 0.19684352923493162, + "flos": 24541702024320.0, + "grad_norm": 1.8603958272733907, + "language_loss": 0.78998351, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.86852515, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.23181152, + "step": 3274, + "time_per_iteration": 2.6299047470092773 + }, + { + "auxiliary_loss_clip": 0.06566115, + "auxiliary_loss_mlp": 0.01283599, + "balance_loss_clip": 0.0630929, + "balance_loss_mlp": 0.01259722, + "epoch": 0.19690365248759958, + "flos": 21802510126080.0, + "grad_norm": 2.495100495270235, + "language_loss": 0.82370663, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.90220374, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23864746, + "step": 3275, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06571324, + "auxiliary_loss_mlp": 0.0128437, + "balance_loss_clip": 0.06313166, + "balance_loss_mlp": 0.01259134, + "epoch": 0.19696377574026755, + "flos": 19061683073280.0, + "grad_norm": 2.1007591714873968, + "language_loss": 0.81547761, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25244141, + "step": 3276, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.06571773, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0631392, + "balance_loss_mlp": 0.01256135, + "epoch": 0.19702389899293551, + "flos": 22827325887360.0, + "grad_norm": 2.204561669505926, + "language_loss": 0.89893198, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.97745186, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.24084473, + "step": 3277, + "time_per_iteration": 2.5781216621398926 + }, + { + "auxiliary_loss_clip": 0.0657627, + "auxiliary_loss_mlp": 0.01285494, + "balance_loss_clip": 0.06313394, + "balance_loss_mlp": 0.01258922, + "epoch": 0.19708402224560348, + "flos": 22901021153280.0, + "grad_norm": 2.1685116517567273, + "language_loss": 0.63218272, + "learning_rate": 3.714226497539239e-06, + "loss": 0.71080041, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.26574707, + "step": 3278, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.06573428, + "auxiliary_loss_mlp": 0.01286907, + "balance_loss_clip": 0.0631459, + "balance_loss_mlp": 0.01261515, + "epoch": 0.19714414549827144, + "flos": 25668989729280.0, + "grad_norm": 2.1172991336759983, + "language_loss": 0.75555933, + "learning_rate": 3.714025842413166e-06, + "loss": 0.83416271, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25378418, + "step": 3279, + "time_per_iteration": 2.598710775375366 + }, + { + "auxiliary_loss_clip": 0.06574699, + "auxiliary_loss_mlp": 0.0128012, + "balance_loss_clip": 0.06317799, + "balance_loss_mlp": 0.01256671, + "epoch": 0.19720426875093944, + "flos": 23922776240640.0, + "grad_norm": 1.6530428540457747, + "language_loss": 0.82974696, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90829515, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23449707, + "step": 3280, + "time_per_iteration": 2.618016481399536 + }, + { + "auxiliary_loss_clip": 0.06568167, + "auxiliary_loss_mlp": 0.01283165, + "balance_loss_clip": 0.0630914, + "balance_loss_mlp": 0.01259085, + "epoch": 0.1972643920036074, + "flos": 13887178508160.0, + "grad_norm": 2.6497469055747036, + "language_loss": 0.78509879, + "learning_rate": 3.713624337180536e-06, + "loss": 0.86361206, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.24084473, + "step": 3281, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.06561945, + "auxiliary_loss_mlp": 0.01286304, + "balance_loss_clip": 0.06312899, + "balance_loss_mlp": 0.01263952, + "epoch": 0.19732451525627537, + "flos": 19869479959680.0, + "grad_norm": 1.7725817592402109, + "language_loss": 0.80340242, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88188481, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.22351074, + "step": 3282, + "time_per_iteration": 2.6235008239746094 + }, + { + "auxiliary_loss_clip": 0.06573974, + "auxiliary_loss_mlp": 0.01283963, + "balance_loss_clip": 0.06315407, + "balance_loss_mlp": 0.01259668, + "epoch": 0.19738463850894333, + "flos": 24980477529600.0, + "grad_norm": 1.861487958506938, + "language_loss": 0.72318685, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.80176622, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.24304199, + "step": 3283, + "time_per_iteration": 2.5938494205474854 + }, + { + "auxiliary_loss_clip": 0.06574511, + "auxiliary_loss_mlp": 0.01281543, + "balance_loss_clip": 0.06317373, + "balance_loss_mlp": 0.01256247, + "epoch": 0.1974447617616113, + "flos": 18374722174080.0, + "grad_norm": 1.6759301931344739, + "language_loss": 0.79791147, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.87647206, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.25292969, + "step": 3284, + "time_per_iteration": 2.530935049057007 + }, + { + "auxiliary_loss_clip": 0.06572407, + "auxiliary_loss_mlp": 0.01285612, + "balance_loss_clip": 0.06312867, + "balance_loss_mlp": 0.01260554, + "epoch": 0.19750488501427926, + "flos": 22899511779840.0, + "grad_norm": 1.8637255752391477, + "language_loss": 0.87043929, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.94901949, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.25061035, + "step": 3285, + "time_per_iteration": 2.5539395809173584 + }, + { + "auxiliary_loss_clip": 0.06561802, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06307627, + "balance_loss_mlp": 0.01258012, + "epoch": 0.19756500826694723, + "flos": 21877924400640.0, + "grad_norm": 2.4795216745498956, + "language_loss": 0.88948774, + "learning_rate": 3.712619437068174e-06, + "loss": 0.96792841, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24243164, + "step": 3286, + "time_per_iteration": 2.5367021560668945 + }, + { + "auxiliary_loss_clip": 0.06569161, + "auxiliary_loss_mlp": 0.01280864, + "balance_loss_clip": 0.06308903, + "balance_loss_mlp": 0.01256641, + "epoch": 0.19762513151961522, + "flos": 15164414294400.0, + "grad_norm": 2.1735993607640904, + "language_loss": 0.79236507, + "learning_rate": 3.712418262187102e-06, + "loss": 0.87086535, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24230957, + "step": 3287, + "time_per_iteration": 2.4954702854156494 + }, + { + "auxiliary_loss_clip": 0.0656468, + "auxiliary_loss_mlp": 0.01280142, + "balance_loss_clip": 0.0630395, + "balance_loss_mlp": 0.01256824, + "epoch": 0.1976852547722832, + "flos": 16984239194880.0, + "grad_norm": 4.513328663516958, + "language_loss": 0.81957221, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.89802045, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.23303223, + "step": 3288, + "time_per_iteration": 2.504995584487915 + }, + { + "auxiliary_loss_clip": 0.0655796, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06307058, + "balance_loss_mlp": 0.01260526, + "epoch": 0.19774537802495115, + "flos": 20309135932800.0, + "grad_norm": 2.127297919409227, + "language_loss": 0.73378497, + "learning_rate": 3.712015717627374e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24157715, + "step": 3289, + "time_per_iteration": 2.5189085006713867 + }, + { + "auxiliary_loss_clip": 0.06562441, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 0.06308928, + "balance_loss_mlp": 0.0125718, + "epoch": 0.19780550127761912, + "flos": 27242893296000.0, + "grad_norm": 3.229663808517491, + "language_loss": 0.79990375, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.87833309, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.2331543, + "step": 3290, + "time_per_iteration": 2.615630626678467 + }, + { + "auxiliary_loss_clip": 0.06446102, + "auxiliary_loss_mlp": 0.01262954, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01256308, + "epoch": 0.19786562453028708, + "flos": 63572597015040.0, + "grad_norm": 0.871535655745335, + "language_loss": 0.60331321, + "learning_rate": 3.711612913388418e-06, + "loss": 0.68040371, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06658936, + "step": 3291, + "time_per_iteration": 3.1708285808563232 + }, + { + "auxiliary_loss_clip": 0.06578626, + "auxiliary_loss_mlp": 0.01283318, + "balance_loss_clip": 0.06312629, + "balance_loss_mlp": 0.0125621, + "epoch": 0.19792574778295505, + "flos": 26293869152640.0, + "grad_norm": 1.6662005392394712, + "language_loss": 0.82490212, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.90352154, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.2713623, + "step": 3292, + "time_per_iteration": 4.009428024291992 + }, + { + "auxiliary_loss_clip": 0.06559315, + "auxiliary_loss_mlp": 0.01281718, + "balance_loss_clip": 0.06308785, + "balance_loss_mlp": 0.01259641, + "epoch": 0.19798587103562304, + "flos": 19944265328640.0, + "grad_norm": 2.398610043576172, + "language_loss": 0.82271063, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.9011209, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2208252, + "step": 3293, + "time_per_iteration": 2.5567917823791504 + }, + { + "auxiliary_loss_clip": 0.06584712, + "auxiliary_loss_mlp": 0.0128547, + "balance_loss_clip": 0.06316388, + "balance_loss_mlp": 0.01259351, + "epoch": 0.198045994288291, + "flos": 20126428104960.0, + "grad_norm": 22.121432113432896, + "language_loss": 0.62642097, + "learning_rate": 3.711008220265093e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.26135254, + "step": 3294, + "time_per_iteration": 4.055817365646362 + }, + { + "auxiliary_loss_clip": 0.06568369, + "auxiliary_loss_mlp": 0.01283249, + "balance_loss_clip": 0.06312987, + "balance_loss_mlp": 0.01259849, + "epoch": 0.19810611754095897, + "flos": 17973444170880.0, + "grad_norm": 2.078666367863598, + "language_loss": 0.88182533, + "learning_rate": 3.710806526117251e-06, + "loss": 0.96034157, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23413086, + "step": 3295, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.06566019, + "auxiliary_loss_mlp": 0.01286636, + "balance_loss_clip": 0.06313851, + "balance_loss_mlp": 0.01265298, + "epoch": 0.19816624079362694, + "flos": 15090257831040.0, + "grad_norm": 2.9890739239636575, + "language_loss": 0.82427287, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90279943, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.21337891, + "step": 3296, + "time_per_iteration": 2.642479658126831 + }, + { + "auxiliary_loss_clip": 0.06579386, + "auxiliary_loss_mlp": 0.01281841, + "balance_loss_clip": 0.06320241, + "balance_loss_mlp": 0.01256593, + "epoch": 0.1982263640462949, + "flos": 24907327315200.0, + "grad_norm": 2.6461649791490522, + "language_loss": 0.69111884, + "learning_rate": 3.710402943207354e-06, + "loss": 0.76973104, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25268555, + "step": 3297, + "time_per_iteration": 2.5983548164367676 + }, + { + "auxiliary_loss_clip": 0.06568186, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 0.06316572, + "balance_loss_mlp": 0.01272125, + "epoch": 0.19828648729896287, + "flos": 20382453855360.0, + "grad_norm": 1.615710211373745, + "language_loss": 0.8249923, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.90361714, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.22167969, + "step": 3298, + "time_per_iteration": 2.548333168029785 + }, + { + "auxiliary_loss_clip": 0.0657866, + "auxiliary_loss_mlp": 0.01298019, + "balance_loss_clip": 0.06318102, + "balance_loss_mlp": 0.01272592, + "epoch": 0.19834661055163083, + "flos": 18886018988160.0, + "grad_norm": 1.9534827487794544, + "language_loss": 0.86188138, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.94064808, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.25402832, + "step": 3299, + "time_per_iteration": 3.944326400756836 + }, + { + "auxiliary_loss_clip": 0.06449087, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 0.06307668, + "balance_loss_mlp": 0.01262615, + "epoch": 0.19840673380429882, + "flos": 60278908723200.0, + "grad_norm": 0.7519898728992364, + "language_loss": 0.53224742, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.60944057, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.07598877, + "step": 3300, + "time_per_iteration": 4.6055073738098145 + }, + { + "auxiliary_loss_clip": 0.06570522, + "auxiliary_loss_mlp": 0.01292871, + "balance_loss_clip": 0.06315967, + "balance_loss_mlp": 0.01267706, + "epoch": 0.1984668570569668, + "flos": 19908235272960.0, + "grad_norm": 2.2853574973511472, + "language_loss": 0.73847342, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.81710732, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.25183105, + "step": 3301, + "time_per_iteration": 2.6006925106048584 + }, + { + "auxiliary_loss_clip": 0.06563142, + "auxiliary_loss_mlp": 0.01290092, + "balance_loss_clip": 0.0631086, + "balance_loss_mlp": 0.01267239, + "epoch": 0.19852698030963475, + "flos": 15635865692160.0, + "grad_norm": 3.8656690955217976, + "language_loss": 0.8953101, + "learning_rate": 3.709392851040235e-06, + "loss": 0.9738425, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.22851562, + "step": 3302, + "time_per_iteration": 2.487173080444336 + }, + { + "auxiliary_loss_clip": 0.06567049, + "auxiliary_loss_mlp": 0.0128658, + "balance_loss_clip": 0.06310292, + "balance_loss_mlp": 0.01263013, + "epoch": 0.19858710356230272, + "flos": 43153037729280.0, + "grad_norm": 2.6127475741484347, + "language_loss": 0.74595749, + "learning_rate": 3.709190638115111e-06, + "loss": 0.82449377, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.23596191, + "step": 3303, + "time_per_iteration": 2.733031749725342 + }, + { + "auxiliary_loss_clip": 0.06567588, + "auxiliary_loss_mlp": 0.0129499, + "balance_loss_clip": 0.06313773, + "balance_loss_mlp": 0.01270373, + "epoch": 0.19864722681497068, + "flos": 35151348879360.0, + "grad_norm": 2.3312818962460686, + "language_loss": 0.75973707, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.83836287, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24597168, + "step": 3304, + "time_per_iteration": 2.627612829208374 + }, + { + "auxiliary_loss_clip": 0.06565879, + "auxiliary_loss_mlp": 0.01301567, + "balance_loss_clip": 0.06315561, + "balance_loss_mlp": 0.01279156, + "epoch": 0.19870735006763865, + "flos": 19432088046720.0, + "grad_norm": 2.2073504264205277, + "language_loss": 0.86939341, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.9480679, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22399902, + "step": 3305, + "time_per_iteration": 2.5243277549743652 + }, + { + "auxiliary_loss_clip": 0.06573498, + "auxiliary_loss_mlp": 0.01293424, + "balance_loss_clip": 0.06314258, + "balance_loss_mlp": 0.01270035, + "epoch": 0.19876747332030664, + "flos": 23553671005440.0, + "grad_norm": 1.7277126311559312, + "language_loss": 0.69397068, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.23388672, + "step": 3306, + "time_per_iteration": 2.6042323112487793 + }, + { + "auxiliary_loss_clip": 0.06570327, + "auxiliary_loss_mlp": 0.01299594, + "balance_loss_clip": 0.06314942, + "balance_loss_mlp": 0.0127723, + "epoch": 0.1988275965729746, + "flos": 19835672037120.0, + "grad_norm": 3.1120189325389735, + "language_loss": 0.77373499, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85243422, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.22375488, + "step": 3307, + "time_per_iteration": 2.6128084659576416 + }, + { + "auxiliary_loss_clip": 0.06569448, + "auxiliary_loss_mlp": 0.01292327, + "balance_loss_clip": 0.06316574, + "balance_loss_mlp": 0.01270452, + "epoch": 0.19888771982564257, + "flos": 23520366207360.0, + "grad_norm": 3.545114094394172, + "language_loss": 0.7662878, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84490561, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.21875, + "step": 3308, + "time_per_iteration": 2.5699222087860107 + }, + { + "auxiliary_loss_clip": 0.06565186, + "auxiliary_loss_mlp": 0.01291629, + "balance_loss_clip": 0.0631263, + "balance_loss_mlp": 0.0126799, + "epoch": 0.19894784307831054, + "flos": 18156403560960.0, + "grad_norm": 1.7056349525902872, + "language_loss": 0.76261461, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.84118271, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.23657227, + "step": 3309, + "time_per_iteration": 2.5804028511047363 + }, + { + "auxiliary_loss_clip": 0.06557433, + "auxiliary_loss_mlp": 0.01287248, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.0126287, + "epoch": 0.1990079663309785, + "flos": 24282280183680.0, + "grad_norm": 1.5893437900436935, + "language_loss": 0.8845197, + "learning_rate": 3.707773333313917e-06, + "loss": 0.96296644, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.24377441, + "step": 3310, + "time_per_iteration": 2.540788412094116 + }, + { + "auxiliary_loss_clip": 0.06554775, + "auxiliary_loss_mlp": 0.01280476, + "balance_loss_clip": 0.06304908, + "balance_loss_mlp": 0.01256575, + "epoch": 0.19906808958364647, + "flos": 34906391867520.0, + "grad_norm": 2.4688423193302347, + "language_loss": 0.64663219, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72498477, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23925781, + "step": 3311, + "time_per_iteration": 2.6754508018493652 + }, + { + "auxiliary_loss_clip": 0.06567319, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 0.06310549, + "balance_loss_mlp": 0.01261351, + "epoch": 0.19912821283631443, + "flos": 24031788802560.0, + "grad_norm": 3.0857408174701186, + "language_loss": 0.75624847, + "learning_rate": 3.707367806139355e-06, + "loss": 0.83477283, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23779297, + "step": 3312, + "time_per_iteration": 2.5815083980560303 + }, + { + "auxiliary_loss_clip": 0.06553487, + "auxiliary_loss_mlp": 0.01286524, + "balance_loss_clip": 0.06300232, + "balance_loss_mlp": 0.01262611, + "epoch": 0.19918833608898243, + "flos": 19864155225600.0, + "grad_norm": 2.0583715987658264, + "language_loss": 0.84526402, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92366409, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23937988, + "step": 3313, + "time_per_iteration": 2.5260941982269287 + }, + { + "auxiliary_loss_clip": 0.06547163, + "auxiliary_loss_mlp": 0.01284622, + "balance_loss_clip": 0.06294618, + "balance_loss_mlp": 0.01261376, + "epoch": 0.1992484593416504, + "flos": 29103444080640.0, + "grad_norm": 1.8813056340492245, + "language_loss": 0.81481469, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89313251, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2322998, + "step": 3314, + "time_per_iteration": 2.618865966796875 + }, + { + "auxiliary_loss_clip": 0.06544838, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 0.06300788, + "balance_loss_mlp": 0.01257924, + "epoch": 0.19930858259431836, + "flos": 23301754104960.0, + "grad_norm": 1.60969518187187, + "language_loss": 0.88063407, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.95886856, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.20690918, + "step": 3315, + "time_per_iteration": 2.5732057094573975 + }, + { + "auxiliary_loss_clip": 0.06550217, + "auxiliary_loss_mlp": 0.01280633, + "balance_loss_clip": 0.06298293, + "balance_loss_mlp": 0.0125728, + "epoch": 0.19936870584698632, + "flos": 25386619069440.0, + "grad_norm": 1.6023919835075873, + "language_loss": 0.71362162, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.79193014, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23352051, + "step": 3316, + "time_per_iteration": 2.6071085929870605 + }, + { + "auxiliary_loss_clip": 0.06427301, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06290084, + "balance_loss_mlp": 0.01263975, + "epoch": 0.1994288290996543, + "flos": 62190038246400.0, + "grad_norm": 0.8251623423654184, + "language_loss": 0.6634506, + "learning_rate": 3.706352855325342e-06, + "loss": 0.74042213, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05880737, + "step": 3317, + "time_per_iteration": 3.216862201690674 + }, + { + "auxiliary_loss_clip": 0.06558052, + "auxiliary_loss_mlp": 0.01286476, + "balance_loss_clip": 0.06302503, + "balance_loss_mlp": 0.01262813, + "epoch": 0.19948895235232225, + "flos": 19031816292480.0, + "grad_norm": 2.159914212237722, + "language_loss": 0.74519444, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.82363975, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.23669434, + "step": 3318, + "time_per_iteration": 2.5432114601135254 + }, + { + "auxiliary_loss_clip": 0.06544004, + "auxiliary_loss_mlp": 0.01278248, + "balance_loss_clip": 0.06298326, + "balance_loss_mlp": 0.01256266, + "epoch": 0.19954907560499022, + "flos": 37824895503360.0, + "grad_norm": 2.0763327087054604, + "language_loss": 0.79865813, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87688065, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21984863, + "step": 3319, + "time_per_iteration": 2.6703901290893555 + }, + { + "auxiliary_loss_clip": 0.06551617, + "auxiliary_loss_mlp": 0.01282829, + "balance_loss_clip": 0.06300303, + "balance_loss_mlp": 0.01259631, + "epoch": 0.1996091988576582, + "flos": 49576420673280.0, + "grad_norm": 2.869788826425785, + "language_loss": 0.763668, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.84201247, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.2322998, + "step": 3320, + "time_per_iteration": 2.817199945449829 + }, + { + "auxiliary_loss_clip": 0.06552573, + "auxiliary_loss_mlp": 0.01291841, + "balance_loss_clip": 0.06302333, + "balance_loss_mlp": 0.01269608, + "epoch": 0.19966932211032618, + "flos": 22642018583040.0, + "grad_norm": 1.4988243809721686, + "language_loss": 0.81033528, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8887794, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.22229004, + "step": 3321, + "time_per_iteration": 2.6688761711120605 + }, + { + "auxiliary_loss_clip": 0.06416404, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281121, + "balance_loss_mlp": 0.01258195, + "epoch": 0.19972944536299414, + "flos": 54098973417600.0, + "grad_norm": 0.8569411614728654, + "language_loss": 0.65245974, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.72927874, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07275391, + "step": 3322, + "time_per_iteration": 3.000269651412964 + }, + { + "auxiliary_loss_clip": 0.06410387, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 0.06274698, + "balance_loss_mlp": 0.01254372, + "epoch": 0.1997895686156621, + "flos": 69371995731840.0, + "grad_norm": 0.7694165297899808, + "language_loss": 0.56849998, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.64522249, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07476807, + "step": 3323, + "time_per_iteration": 3.330606698989868 + }, + { + "auxiliary_loss_clip": 0.06562012, + "auxiliary_loss_mlp": 0.01292664, + "balance_loss_clip": 0.06316413, + "balance_loss_mlp": 0.01268596, + "epoch": 0.19984969186833007, + "flos": 18558058907520.0, + "grad_norm": 1.8232624283894519, + "language_loss": 0.81610429, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.89465106, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.24084473, + "step": 3324, + "time_per_iteration": 2.5314769744873047 + }, + { + "auxiliary_loss_clip": 0.06558169, + "auxiliary_loss_mlp": 0.01292911, + "balance_loss_clip": 0.06310347, + "balance_loss_mlp": 0.01268318, + "epoch": 0.19990981512099804, + "flos": 26436438074880.0, + "grad_norm": 1.6515442637335616, + "language_loss": 0.54047406, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.61898488, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.24609375, + "step": 3325, + "time_per_iteration": 2.6192479133605957 + }, + { + "auxiliary_loss_clip": 0.06565623, + "auxiliary_loss_mlp": 0.01288281, + "balance_loss_clip": 0.063146, + "balance_loss_mlp": 0.01265572, + "epoch": 0.19996993837366603, + "flos": 16331547415680.0, + "grad_norm": 1.9371709062145088, + "language_loss": 0.8658272, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94436622, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.22729492, + "step": 3326, + "time_per_iteration": 2.5111629962921143 + }, + { + "auxiliary_loss_clip": 0.06551019, + "auxiliary_loss_mlp": 0.01289033, + "balance_loss_clip": 0.06305069, + "balance_loss_mlp": 0.01266705, + "epoch": 0.200030061626334, + "flos": 20849460986880.0, + "grad_norm": 6.809877440219623, + "language_loss": 0.7272824, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.8056829, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22314453, + "step": 3327, + "time_per_iteration": 2.5571372509002686 + }, + { + "auxiliary_loss_clip": 0.06566358, + "auxiliary_loss_mlp": 0.01287053, + "balance_loss_clip": 0.06313111, + "balance_loss_mlp": 0.01261756, + "epoch": 0.20009018487900196, + "flos": 23768341966080.0, + "grad_norm": 1.841950801645188, + "language_loss": 0.77914047, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.8576746, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.25317383, + "step": 3328, + "time_per_iteration": 2.5489912033081055 + }, + { + "auxiliary_loss_clip": 0.06559211, + "auxiliary_loss_mlp": 0.01288822, + "balance_loss_clip": 0.06314486, + "balance_loss_mlp": 0.01265338, + "epoch": 0.20015030813166992, + "flos": 28119186495360.0, + "grad_norm": 1.7739956363125764, + "language_loss": 0.6938678, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.77234817, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23474121, + "step": 3329, + "time_per_iteration": 2.790318489074707 + }, + { + "auxiliary_loss_clip": 0.06562928, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 0.06310034, + "balance_loss_mlp": 0.01263396, + "epoch": 0.2002104313843379, + "flos": 26074250801280.0, + "grad_norm": 1.6222638892170962, + "language_loss": 0.81793886, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.896456, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.25415039, + "step": 3330, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.06561245, + "auxiliary_loss_mlp": 0.01293061, + "balance_loss_clip": 0.06310615, + "balance_loss_mlp": 0.01268874, + "epoch": 0.20027055463700585, + "flos": 22973332826880.0, + "grad_norm": 3.6220429921180877, + "language_loss": 0.7808395, + "learning_rate": 3.703502390349417e-06, + "loss": 0.85938263, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.24194336, + "step": 3331, + "time_per_iteration": 4.07051157951355 + }, + { + "auxiliary_loss_clip": 0.06564473, + "auxiliary_loss_mlp": 0.01290798, + "balance_loss_clip": 0.06310149, + "balance_loss_mlp": 0.01266014, + "epoch": 0.20033067788967382, + "flos": 17171433216000.0, + "grad_norm": 1.7477664730796658, + "language_loss": 0.79863441, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24780273, + "step": 3332, + "time_per_iteration": 2.5321452617645264 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01253551, + "epoch": 0.2003908011423418, + "flos": 60842476085760.0, + "grad_norm": 0.9021189232739572, + "language_loss": 0.61913729, + "learning_rate": 3.703094147020776e-06, + "loss": 0.69584543, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08105469, + "step": 3333, + "time_per_iteration": 4.713933706283569 + }, + { + "auxiliary_loss_clip": 0.06552575, + "auxiliary_loss_mlp": 0.0128469, + "balance_loss_clip": 0.06299093, + "balance_loss_mlp": 0.0126123, + "epoch": 0.20045092439500978, + "flos": 24212987256960.0, + "grad_norm": 1.8847951547254278, + "language_loss": 0.82181144, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.90018404, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.23461914, + "step": 3334, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.0654801, + "auxiliary_loss_mlp": 0.01282898, + "balance_loss_clip": 0.06293298, + "balance_loss_mlp": 0.01256874, + "epoch": 0.20051104764767774, + "flos": 29395290251520.0, + "grad_norm": 2.256626356817437, + "language_loss": 0.7536357, + "learning_rate": 3.702685645366134e-06, + "loss": 0.83194482, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26049805, + "step": 3335, + "time_per_iteration": 2.5860390663146973 + }, + { + "auxiliary_loss_clip": 0.06552432, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06300009, + "balance_loss_mlp": 0.0125632, + "epoch": 0.2005711709003457, + "flos": 23520575842560.0, + "grad_norm": 6.047041669068293, + "language_loss": 0.80452931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.88285786, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.24108887, + "step": 3336, + "time_per_iteration": 2.662705898284912 + }, + { + "auxiliary_loss_clip": 0.06555694, + "auxiliary_loss_mlp": 0.01283807, + "balance_loss_clip": 0.06297083, + "balance_loss_mlp": 0.01258045, + "epoch": 0.20063129415301367, + "flos": 22529106806400.0, + "grad_norm": 1.88296777376126, + "language_loss": 0.78839928, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.86679429, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.25756836, + "step": 3337, + "time_per_iteration": 2.541239023208618 + }, + { + "auxiliary_loss_clip": 0.06548997, + "auxiliary_loss_mlp": 0.01282446, + "balance_loss_clip": 0.06296889, + "balance_loss_mlp": 0.01258389, + "epoch": 0.20069141740568164, + "flos": 25965405947520.0, + "grad_norm": 2.093788516709133, + "language_loss": 0.69608915, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77440357, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.24072266, + "step": 3338, + "time_per_iteration": 4.011674165725708 + }, + { + "auxiliary_loss_clip": 0.06553162, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06298589, + "balance_loss_mlp": 0.01261703, + "epoch": 0.2007515406583496, + "flos": 24797560066560.0, + "grad_norm": 2.5614555335728375, + "language_loss": 0.70278549, + "learning_rate": 3.701867867326735e-06, + "loss": 0.78117526, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3339, + "time_per_iteration": 4.021097183227539 + }, + { + "auxiliary_loss_clip": 0.06558233, + "auxiliary_loss_mlp": 0.01288707, + "balance_loss_clip": 0.06300814, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2008116639110176, + "flos": 37934746606080.0, + "grad_norm": 2.4782874615073265, + "language_loss": 0.67773008, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.75619948, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.24291992, + "step": 3340, + "time_per_iteration": 2.741156816482544 + }, + { + "auxiliary_loss_clip": 0.06555235, + "auxiliary_loss_mlp": 0.01284766, + "balance_loss_clip": 0.06297287, + "balance_loss_mlp": 0.01258122, + "epoch": 0.20087178716368556, + "flos": 20746779408000.0, + "grad_norm": 2.067820693237163, + "language_loss": 0.74698186, + "learning_rate": 3.701458591066019e-06, + "loss": 0.82538182, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26623535, + "step": 3341, + "time_per_iteration": 2.564480781555176 + }, + { + "auxiliary_loss_clip": 0.06547385, + "auxiliary_loss_mlp": 0.01280207, + "balance_loss_clip": 0.06298249, + "balance_loss_mlp": 0.01256532, + "epoch": 0.20093191041635353, + "flos": 23849122901760.0, + "grad_norm": 1.820842392943319, + "language_loss": 0.7265389, + "learning_rate": 3.70125385615256e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.23657227, + "step": 3342, + "time_per_iteration": 2.5828449726104736 + }, + { + "auxiliary_loss_clip": 0.065575, + "auxiliary_loss_mlp": 0.01288338, + "balance_loss_clip": 0.06302083, + "balance_loss_mlp": 0.01264174, + "epoch": 0.2009920336690215, + "flos": 21797395027200.0, + "grad_norm": 1.987813203177408, + "language_loss": 0.73357129, + "learning_rate": 3.701049056727384e-06, + "loss": 0.81202972, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.24169922, + "step": 3343, + "time_per_iteration": 2.547868490219116 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.012954, + "balance_loss_clip": 0.06301528, + "balance_loss_mlp": 0.01269865, + "epoch": 0.20105215692168946, + "flos": 26366390461440.0, + "grad_norm": 2.115251797604865, + "language_loss": 0.81433517, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.89283836, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25524902, + "step": 3344, + "time_per_iteration": 2.6067302227020264 + }, + { + "auxiliary_loss_clip": 0.06556335, + "auxiliary_loss_mlp": 0.01281302, + "balance_loss_clip": 0.06301118, + "balance_loss_mlp": 0.01258426, + "epoch": 0.20111228017435742, + "flos": 18813288044160.0, + "grad_norm": 4.0042293338609385, + "language_loss": 0.84618676, + "learning_rate": 3.700639264372948e-06, + "loss": 0.92456311, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.2286377, + "step": 3345, + "time_per_iteration": 2.554713726043701 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01295407, + "balance_loss_clip": 0.0629687, + "balance_loss_mlp": 0.01272697, + "epoch": 0.20117240342702541, + "flos": 19981301633280.0, + "grad_norm": 2.1108086187654025, + "language_loss": 0.68437809, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.76276147, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.22705078, + "step": 3346, + "time_per_iteration": 2.5748066902160645 + }, + { + "auxiliary_loss_clip": 0.06553109, + "auxiliary_loss_mlp": 0.01283392, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01258739, + "epoch": 0.20123252667969338, + "flos": 23148368006400.0, + "grad_norm": 1.9426154174848713, + "language_loss": 0.73952061, + "learning_rate": 3.70022921406487e-06, + "loss": 0.81788564, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24682617, + "step": 3347, + "time_per_iteration": 2.5353236198425293 + }, + { + "auxiliary_loss_clip": 0.06546339, + "auxiliary_loss_mlp": 0.01287781, + "balance_loss_clip": 0.0629671, + "balance_loss_mlp": 0.01263487, + "epoch": 0.20129264993236134, + "flos": 23228352328320.0, + "grad_norm": 1.557023243146552, + "language_loss": 0.87284029, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95118147, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.24316406, + "step": 3348, + "time_per_iteration": 2.5943105220794678 + }, + { + "auxiliary_loss_clip": 0.06550047, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01269034, + "epoch": 0.2013527731850293, + "flos": 21877882473600.0, + "grad_norm": 1.6966939322149492, + "language_loss": 0.71502012, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7934612, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25061035, + "step": 3349, + "time_per_iteration": 2.5671966075897217 + }, + { + "auxiliary_loss_clip": 0.06552055, + "auxiliary_loss_mlp": 0.01290022, + "balance_loss_clip": 0.06301533, + "balance_loss_mlp": 0.01263486, + "epoch": 0.20141289643769728, + "flos": 18046636312320.0, + "grad_norm": 1.7460886195435679, + "language_loss": 0.72473693, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80315775, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26501465, + "step": 3350, + "time_per_iteration": 2.558486223220825 + }, + { + "auxiliary_loss_clip": 0.06561922, + "auxiliary_loss_mlp": 0.01282894, + "balance_loss_clip": 0.0630732, + "balance_loss_mlp": 0.01256728, + "epoch": 0.20147301969036524, + "flos": 23958219317760.0, + "grad_norm": 2.4285458765514623, + "language_loss": 0.76773715, + "learning_rate": 3.69940833983661e-06, + "loss": 0.84618533, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.26135254, + "step": 3351, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.0657143, + "auxiliary_loss_mlp": 0.01289916, + "balance_loss_clip": 0.06311074, + "balance_loss_mlp": 0.01260638, + "epoch": 0.2015331429430332, + "flos": 25594749411840.0, + "grad_norm": 1.6280311670130643, + "language_loss": 0.81367022, + "learning_rate": 3.699202960155748e-06, + "loss": 0.89228368, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.29248047, + "step": 3352, + "time_per_iteration": 2.603740692138672 + }, + { + "auxiliary_loss_clip": 0.06557955, + "auxiliary_loss_mlp": 0.01286544, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01258458, + "epoch": 0.2015932661957012, + "flos": 26732351168640.0, + "grad_norm": 2.001275007108419, + "language_loss": 0.81670761, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.89515263, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28063965, + "step": 3353, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.06555627, + "auxiliary_loss_mlp": 0.01278407, + "balance_loss_clip": 0.0630668, + "balance_loss_mlp": 0.01253206, + "epoch": 0.20165338944836916, + "flos": 15638632876800.0, + "grad_norm": 1.8574199324884482, + "language_loss": 0.9049592, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.98329961, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.2520752, + "step": 3354, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.06439115, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.06305242, + "balance_loss_mlp": 0.01268129, + "epoch": 0.20171351270103713, + "flos": 57929926089600.0, + "grad_norm": 0.8202677442032412, + "language_loss": 0.55840385, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.63554633, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07012939, + "step": 3355, + "time_per_iteration": 3.118603229522705 + }, + { + "auxiliary_loss_clip": 0.06557105, + "auxiliary_loss_mlp": 0.01281149, + "balance_loss_clip": 0.06309459, + "balance_loss_mlp": 0.01257474, + "epoch": 0.2017736359537051, + "flos": 20820768163200.0, + "grad_norm": 1.5861142309185163, + "language_loss": 0.84845644, + "learning_rate": 3.698380797170751e-06, + "loss": 0.92683893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.23669434, + "step": 3356, + "time_per_iteration": 2.5407068729400635 + }, + { + "auxiliary_loss_clip": 0.06578876, + "auxiliary_loss_mlp": 0.01283859, + "balance_loss_clip": 0.06314196, + "balance_loss_mlp": 0.01255344, + "epoch": 0.20183375920637306, + "flos": 17097696023040.0, + "grad_norm": 3.7689574240726147, + "language_loss": 0.71072245, + "learning_rate": 3.698175095398085e-06, + "loss": 0.78934979, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.28515625, + "step": 3357, + "time_per_iteration": 2.4921233654022217 + }, + { + "auxiliary_loss_clip": 0.065685, + "auxiliary_loss_mlp": 0.01288812, + "balance_loss_clip": 0.0631017, + "balance_loss_mlp": 0.01263206, + "epoch": 0.20189388245904102, + "flos": 18667323031680.0, + "grad_norm": 2.064581487792546, + "language_loss": 0.72707927, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.80565238, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.25585938, + "step": 3358, + "time_per_iteration": 2.531280040740967 + }, + { + "auxiliary_loss_clip": 0.06550319, + "auxiliary_loss_mlp": 0.0128707, + "balance_loss_clip": 0.06304348, + "balance_loss_mlp": 0.01263633, + "epoch": 0.20195400571170902, + "flos": 16802705324160.0, + "grad_norm": 1.761827203655194, + "language_loss": 0.83542818, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91380209, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.234375, + "step": 3359, + "time_per_iteration": 2.5004122257232666 + }, + { + "auxiliary_loss_clip": 0.06415485, + "auxiliary_loss_mlp": 0.01275385, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01269109, + "epoch": 0.20201412896437698, + "flos": 67192792669440.0, + "grad_norm": 0.7763137973079639, + "language_loss": 0.58718604, + "learning_rate": 3.697557603741482e-06, + "loss": 0.66409475, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.06274414, + "step": 3360, + "time_per_iteration": 3.202280282974243 + }, + { + "auxiliary_loss_clip": 0.06567518, + "auxiliary_loss_mlp": 0.01281863, + "balance_loss_clip": 0.06312253, + "balance_loss_mlp": 0.01257055, + "epoch": 0.20207425221704495, + "flos": 21331477998720.0, + "grad_norm": 2.7701451368403767, + "language_loss": 0.63371557, + "learning_rate": 3.697351644435763e-06, + "loss": 0.71220934, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.24841309, + "step": 3361, + "time_per_iteration": 2.591505527496338 + }, + { + "auxiliary_loss_clip": 0.06556661, + "auxiliary_loss_mlp": 0.01280295, + "balance_loss_clip": 0.06304803, + "balance_loss_mlp": 0.01257049, + "epoch": 0.2021343754697129, + "flos": 22533509145600.0, + "grad_norm": 1.837331842396403, + "language_loss": 0.76495373, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.23254395, + "step": 3362, + "time_per_iteration": 2.5748798847198486 + }, + { + "auxiliary_loss_clip": 0.06552652, + "auxiliary_loss_mlp": 0.01281781, + "balance_loss_clip": 0.06300291, + "balance_loss_mlp": 0.01257379, + "epoch": 0.20219449872238088, + "flos": 19068852597120.0, + "grad_norm": 1.6506097934595576, + "language_loss": 0.77716577, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85551012, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.24365234, + "step": 3363, + "time_per_iteration": 2.5682361125946045 + }, + { + "auxiliary_loss_clip": 0.06556462, + "auxiliary_loss_mlp": 0.01285372, + "balance_loss_clip": 0.06303493, + "balance_loss_mlp": 0.01262198, + "epoch": 0.20225462197504884, + "flos": 24723864800640.0, + "grad_norm": 1.5662342973814338, + "language_loss": 0.75767177, + "learning_rate": 3.696733380367391e-06, + "loss": 0.83609009, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23181152, + "step": 3364, + "time_per_iteration": 2.620352029800415 + }, + { + "auxiliary_loss_clip": 0.06564072, + "auxiliary_loss_mlp": 0.01282858, + "balance_loss_clip": 0.06306748, + "balance_loss_mlp": 0.01259374, + "epoch": 0.2023147452277168, + "flos": 22024895662080.0, + "grad_norm": 2.684464985384485, + "language_loss": 0.72232616, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.80079544, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.23474121, + "step": 3365, + "time_per_iteration": 2.6884727478027344 + }, + { + "auxiliary_loss_clip": 0.06551654, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.0629961, + "balance_loss_mlp": 0.01256336, + "epoch": 0.2023748684803848, + "flos": 17750555510400.0, + "grad_norm": 1.8865204005259733, + "language_loss": 0.86329257, + "learning_rate": 3.696320882607286e-06, + "loss": 0.94160658, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.23425293, + "step": 3366, + "time_per_iteration": 2.541398525238037 + }, + { + "auxiliary_loss_clip": 0.06552443, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01254698, + "epoch": 0.20243499173305277, + "flos": 31146912328320.0, + "grad_norm": 1.6069123477498997, + "language_loss": 0.69763649, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77593338, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.22558594, + "step": 3367, + "time_per_iteration": 2.674370527267456 + }, + { + "auxiliary_loss_clip": 0.06562914, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 0.06300482, + "balance_loss_mlp": 0.01257777, + "epoch": 0.20249511498572073, + "flos": 33847726256640.0, + "grad_norm": 1.76028679400595, + "language_loss": 0.69152057, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.27819824, + "step": 3368, + "time_per_iteration": 2.6662635803222656 + }, + { + "auxiliary_loss_clip": 0.06551345, + "auxiliary_loss_mlp": 0.0128738, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263657, + "epoch": 0.2025552382383887, + "flos": 21222088093440.0, + "grad_norm": 1.819755421756695, + "language_loss": 0.78064144, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.8590287, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23718262, + "step": 3369, + "time_per_iteration": 2.5846660137176514 + }, + { + "auxiliary_loss_clip": 0.06560668, + "auxiliary_loss_mlp": 0.01282514, + "balance_loss_clip": 0.06299458, + "balance_loss_mlp": 0.01257492, + "epoch": 0.20261536149105666, + "flos": 14652614355840.0, + "grad_norm": 3.2010156823618687, + "language_loss": 0.66533637, + "learning_rate": 3.695495115253795e-06, + "loss": 0.74376816, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25024414, + "step": 3370, + "time_per_iteration": 3.953664541244507 + }, + { + "auxiliary_loss_clip": 0.06420556, + "auxiliary_loss_mlp": 0.01256354, + "balance_loss_clip": 0.06284036, + "balance_loss_mlp": 0.01249797, + "epoch": 0.20267548474372463, + "flos": 66803380018560.0, + "grad_norm": 0.6606134365812599, + "language_loss": 0.58273321, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.65950233, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.06567383, + "step": 3371, + "time_per_iteration": 3.2517025470733643 + }, + { + "auxiliary_loss_clip": 0.06555597, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 0.06300298, + "balance_loss_mlp": 0.01257944, + "epoch": 0.2027356079963926, + "flos": 24687667036800.0, + "grad_norm": 1.6416079718190109, + "language_loss": 0.92020303, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99859619, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.25769043, + "step": 3372, + "time_per_iteration": 4.108370065689087 + }, + { + "auxiliary_loss_clip": 0.06555616, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 0.06298956, + "balance_loss_mlp": 0.01258672, + "epoch": 0.20279573124906058, + "flos": 26399443697280.0, + "grad_norm": 1.769817073167301, + "language_loss": 0.79293168, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87131846, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.24414062, + "step": 3373, + "time_per_iteration": 2.6076717376708984 + }, + { + "auxiliary_loss_clip": 0.06543471, + "auxiliary_loss_mlp": 0.01280674, + "balance_loss_clip": 0.06296648, + "balance_loss_mlp": 0.01256343, + "epoch": 0.20285585450172855, + "flos": 33808006621440.0, + "grad_norm": 3.4143342380796255, + "language_loss": 0.72364163, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.8018831, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.24328613, + "step": 3374, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.06419748, + "auxiliary_loss_mlp": 0.01258876, + "balance_loss_clip": 0.06284177, + "balance_loss_mlp": 0.01252266, + "epoch": 0.20291597775439651, + "flos": 71185768410240.0, + "grad_norm": 1.0120800133799934, + "language_loss": 0.62520474, + "learning_rate": 3.694461459520516e-06, + "loss": 0.70199096, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06622314, + "step": 3375, + "time_per_iteration": 3.159513473510742 + }, + { + "auxiliary_loss_clip": 0.06548455, + "auxiliary_loss_mlp": 0.01283408, + "balance_loss_clip": 0.06294296, + "balance_loss_mlp": 0.0125891, + "epoch": 0.20297610100706448, + "flos": 19499368475520.0, + "grad_norm": 1.6178559610323104, + "language_loss": 0.82908762, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.90740621, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.24499512, + "step": 3376, + "time_per_iteration": 2.5366275310516357 + }, + { + "auxiliary_loss_clip": 0.06553418, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.0125854, + "epoch": 0.20303622425973245, + "flos": 25050944413440.0, + "grad_norm": 2.015544075965587, + "language_loss": 0.82464767, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90302449, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25720215, + "step": 3377, + "time_per_iteration": 2.579468250274658 + }, + { + "auxiliary_loss_clip": 0.06554671, + "auxiliary_loss_mlp": 0.01287763, + "balance_loss_clip": 0.06300091, + "balance_loss_mlp": 0.01261453, + "epoch": 0.2030963475124004, + "flos": 21986266129920.0, + "grad_norm": 1.7361857812490578, + "language_loss": 0.7745406, + "learning_rate": 3.69384049496805e-06, + "loss": 0.85296494, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.26306152, + "step": 3378, + "time_per_iteration": 3.999164342880249 + }, + { + "auxiliary_loss_clip": 0.06557525, + "auxiliary_loss_mlp": 0.01285912, + "balance_loss_clip": 0.06298093, + "balance_loss_mlp": 0.01259423, + "epoch": 0.2031564707650684, + "flos": 19506496072320.0, + "grad_norm": 1.7814270376711854, + "language_loss": 0.80552137, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.88395572, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.26525879, + "step": 3379, + "time_per_iteration": 3.94376277923584 + }, + { + "auxiliary_loss_clip": 0.06547987, + "auxiliary_loss_mlp": 0.01283987, + "balance_loss_clip": 0.06298195, + "balance_loss_mlp": 0.01259799, + "epoch": 0.20321659401773637, + "flos": 22753630621440.0, + "grad_norm": 1.8399421212903948, + "language_loss": 0.87578034, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.95410013, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24206543, + "step": 3380, + "time_per_iteration": 2.5826356410980225 + }, + { + "auxiliary_loss_clip": 0.06554954, + "auxiliary_loss_mlp": 0.01300173, + "balance_loss_clip": 0.06299303, + "balance_loss_mlp": 0.01274817, + "epoch": 0.20327671727040433, + "flos": 22462455283200.0, + "grad_norm": 2.147675917051705, + "language_loss": 0.75801265, + "learning_rate": 3.693218952340186e-06, + "loss": 0.83656389, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.2532959, + "step": 3381, + "time_per_iteration": 2.580035924911499 + }, + { + "auxiliary_loss_clip": 0.06559204, + "auxiliary_loss_mlp": 0.0128659, + "balance_loss_clip": 0.06297147, + "balance_loss_mlp": 0.01260198, + "epoch": 0.2033368405230723, + "flos": 19540807119360.0, + "grad_norm": 1.8225171591496117, + "language_loss": 0.79701936, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.87547731, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.26391602, + "step": 3382, + "time_per_iteration": 2.743842601776123 + }, + { + "auxiliary_loss_clip": 0.06551235, + "auxiliary_loss_mlp": 0.01283934, + "balance_loss_clip": 0.06293041, + "balance_loss_mlp": 0.01258745, + "epoch": 0.20339696377574026, + "flos": 13814489491200.0, + "grad_norm": 1.712325191768153, + "language_loss": 0.80308962, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8814413, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.25195312, + "step": 3383, + "time_per_iteration": 2.6428067684173584 + }, + { + "auxiliary_loss_clip": 0.06548008, + "auxiliary_loss_mlp": 0.01285433, + "balance_loss_clip": 0.06295451, + "balance_loss_mlp": 0.01259541, + "epoch": 0.20345708702840823, + "flos": 20345627185920.0, + "grad_norm": 1.7809184522678074, + "language_loss": 0.75199848, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.83033288, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.25891113, + "step": 3384, + "time_per_iteration": 2.5601112842559814 + }, + { + "auxiliary_loss_clip": 0.06573269, + "auxiliary_loss_mlp": 0.01282943, + "balance_loss_clip": 0.06306025, + "balance_loss_mlp": 0.01256229, + "epoch": 0.2035172102810762, + "flos": 20339254275840.0, + "grad_norm": 2.5841350087074852, + "language_loss": 0.77226508, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.85082722, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.26745605, + "step": 3385, + "time_per_iteration": 2.527583122253418 + }, + { + "auxiliary_loss_clip": 0.06553946, + "auxiliary_loss_mlp": 0.01288968, + "balance_loss_clip": 0.06300423, + "balance_loss_mlp": 0.01263934, + "epoch": 0.2035773335337442, + "flos": 23337658379520.0, + "grad_norm": 1.6683994830989402, + "language_loss": 0.70000219, + "learning_rate": 3.692181763924639e-06, + "loss": 0.7784313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25048828, + "step": 3386, + "time_per_iteration": 2.583940029144287 + }, + { + "auxiliary_loss_clip": 0.06550556, + "auxiliary_loss_mlp": 0.01289862, + "balance_loss_clip": 0.0629431, + "balance_loss_mlp": 0.01265495, + "epoch": 0.20363745678641215, + "flos": 28337924378880.0, + "grad_norm": 1.2744067098921972, + "language_loss": 0.81998229, + "learning_rate": 3.691974133706947e-06, + "loss": 0.89838648, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.24365234, + "step": 3387, + "time_per_iteration": 2.624765634536743 + }, + { + "auxiliary_loss_clip": 0.06543861, + "auxiliary_loss_mlp": 0.01285642, + "balance_loss_clip": 0.06297304, + "balance_loss_mlp": 0.01261705, + "epoch": 0.20369758003908012, + "flos": 18921503992320.0, + "grad_norm": 2.338231566069276, + "language_loss": 0.80333674, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.88163185, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23925781, + "step": 3388, + "time_per_iteration": 2.565795421600342 + }, + { + "auxiliary_loss_clip": 0.06553982, + "auxiliary_loss_mlp": 0.01281213, + "balance_loss_clip": 0.06297579, + "balance_loss_mlp": 0.0125693, + "epoch": 0.20375770329174808, + "flos": 19212218133120.0, + "grad_norm": 1.8814817968190891, + "language_loss": 0.72894287, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.80729485, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.24279785, + "step": 3389, + "time_per_iteration": 2.5263590812683105 + }, + { + "auxiliary_loss_clip": 0.06544612, + "auxiliary_loss_mlp": 0.01286594, + "balance_loss_clip": 0.06296231, + "balance_loss_mlp": 0.01262204, + "epoch": 0.20381782654441605, + "flos": 19397106167040.0, + "grad_norm": 2.5524619095037626, + "language_loss": 0.88214552, + "learning_rate": 3.691350858126404e-06, + "loss": 0.96045768, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3390, + "time_per_iteration": 2.5450997352600098 + }, + { + "auxiliary_loss_clip": 0.06546676, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06297011, + "balance_loss_mlp": 0.01260683, + "epoch": 0.203877949797084, + "flos": 24834764079360.0, + "grad_norm": 2.430374095532116, + "language_loss": 0.71690643, + "learning_rate": 3.691142971316662e-06, + "loss": 0.79521036, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 3391, + "time_per_iteration": 2.5983424186706543 + }, + { + "auxiliary_loss_clip": 0.06548478, + "auxiliary_loss_mlp": 0.01287319, + "balance_loss_clip": 0.06300271, + "balance_loss_mlp": 0.01263799, + "epoch": 0.20393807304975198, + "flos": 18009432299520.0, + "grad_norm": 3.271459971820983, + "language_loss": 0.87029123, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.94864917, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2355957, + "step": 3392, + "time_per_iteration": 2.5094432830810547 + }, + { + "auxiliary_loss_clip": 0.06555735, + "auxiliary_loss_mlp": 0.01288889, + "balance_loss_clip": 0.06302007, + "balance_loss_mlp": 0.0126432, + "epoch": 0.20399819630241997, + "flos": 24213867724800.0, + "grad_norm": 1.4298747009925739, + "language_loss": 0.8143822, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8928284, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 3393, + "time_per_iteration": 2.674898147583008 + }, + { + "auxiliary_loss_clip": 0.06555712, + "auxiliary_loss_mlp": 0.01283361, + "balance_loss_clip": 0.0630876, + "balance_loss_mlp": 0.01260747, + "epoch": 0.20405831955508794, + "flos": 20783396442240.0, + "grad_norm": 2.2973425083766377, + "language_loss": 0.87181509, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.9502058, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.22619629, + "step": 3394, + "time_per_iteration": 2.5489470958709717 + }, + { + "auxiliary_loss_clip": 0.06548424, + "auxiliary_loss_mlp": 0.0128548, + "balance_loss_clip": 0.06299029, + "balance_loss_mlp": 0.01262448, + "epoch": 0.2041184428077559, + "flos": 15492332448000.0, + "grad_norm": 2.1306464149991027, + "language_loss": 0.8456347, + "learning_rate": 3.69031078287345e-06, + "loss": 0.92397374, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23034668, + "step": 3395, + "time_per_iteration": 2.5297558307647705 + }, + { + "auxiliary_loss_clip": 0.06554371, + "auxiliary_loss_mlp": 0.01288203, + "balance_loss_clip": 0.06299008, + "balance_loss_mlp": 0.0126448, + "epoch": 0.20417856606042387, + "flos": 15592582258560.0, + "grad_norm": 1.9297262637725432, + "language_loss": 0.84104818, + "learning_rate": 3.690102575501033e-06, + "loss": 0.91947389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.23730469, + "step": 3396, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0654766, + "auxiliary_loss_mlp": 0.01296047, + "balance_loss_clip": 0.06301443, + "balance_loss_mlp": 0.01272706, + "epoch": 0.20423868931309183, + "flos": 24286137471360.0, + "grad_norm": 2.084884773893835, + "language_loss": 0.7751056, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.85354269, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2331543, + "step": 3397, + "time_per_iteration": 2.5621836185455322 + }, + { + "auxiliary_loss_clip": 0.06547033, + "auxiliary_loss_mlp": 0.01291146, + "balance_loss_clip": 0.06300367, + "balance_loss_mlp": 0.01268067, + "epoch": 0.2042988125657598, + "flos": 18619176061440.0, + "grad_norm": 3.401004534017878, + "language_loss": 0.88746947, + "learning_rate": 3.689685968497518e-06, + "loss": 0.96585131, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23083496, + "step": 3398, + "time_per_iteration": 2.4821889400482178 + }, + { + "auxiliary_loss_clip": 0.06555858, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 0.06305312, + "balance_loss_mlp": 0.01263361, + "epoch": 0.2043589358184278, + "flos": 17855836565760.0, + "grad_norm": 2.044777021305177, + "language_loss": 0.79053116, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8689605, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.23706055, + "step": 3399, + "time_per_iteration": 2.5007028579711914 + }, + { + "auxiliary_loss_clip": 0.06554085, + "auxiliary_loss_mlp": 0.01288353, + "balance_loss_clip": 0.06300685, + "balance_loss_mlp": 0.01264678, + "epoch": 0.20441905907109575, + "flos": 21441832225920.0, + "grad_norm": 3.4484144890832327, + "language_loss": 0.77263522, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85105962, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.23669434, + "step": 3400, + "time_per_iteration": 2.524930715560913 + }, + { + "auxiliary_loss_clip": 0.06546277, + "auxiliary_loss_mlp": 0.0128369, + "balance_loss_clip": 0.0630067, + "balance_loss_mlp": 0.01262423, + "epoch": 0.20447918232376372, + "flos": 27714847818240.0, + "grad_norm": 1.566944783994086, + "language_loss": 0.7976017, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87590134, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21264648, + "step": 3401, + "time_per_iteration": 2.5868172645568848 + }, + { + "auxiliary_loss_clip": 0.06547564, + "auxiliary_loss_mlp": 0.01287222, + "balance_loss_clip": 0.06297088, + "balance_loss_mlp": 0.01263833, + "epoch": 0.20453930557643168, + "flos": 30533017789440.0, + "grad_norm": 1.6743436404675067, + "language_loss": 0.69998658, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7783345, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.23400879, + "step": 3402, + "time_per_iteration": 2.664961099624634 + }, + { + "auxiliary_loss_clip": 0.06561718, + "auxiliary_loss_mlp": 0.01282309, + "balance_loss_clip": 0.06309628, + "balance_loss_mlp": 0.01259981, + "epoch": 0.20459942882909965, + "flos": 18993480249600.0, + "grad_norm": 2.0207590642868736, + "language_loss": 0.82498461, + "learning_rate": 3.688643329848496e-06, + "loss": 0.90342486, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2232666, + "step": 3403, + "time_per_iteration": 2.527240514755249 + }, + { + "auxiliary_loss_clip": 0.0655287, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 0.06304024, + "balance_loss_mlp": 0.01260256, + "epoch": 0.20465955208176762, + "flos": 20345207915520.0, + "grad_norm": 1.870475930372837, + "language_loss": 0.83792305, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.91628289, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.22900391, + "step": 3404, + "time_per_iteration": 2.5108580589294434 + }, + { + "auxiliary_loss_clip": 0.06555478, + "auxiliary_loss_mlp": 0.01280254, + "balance_loss_clip": 0.06302839, + "balance_loss_mlp": 0.0125671, + "epoch": 0.20471967533443558, + "flos": 21257615024640.0, + "grad_norm": 1.9668153962924477, + "language_loss": 0.86568373, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.94404107, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.2355957, + "step": 3405, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.06551084, + "auxiliary_loss_mlp": 0.0128024, + "balance_loss_clip": 0.06302287, + "balance_loss_mlp": 0.01257256, + "epoch": 0.20477979858710357, + "flos": 14506775124480.0, + "grad_norm": 2.695451734790842, + "language_loss": 0.85318458, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93149781, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.22973633, + "step": 3406, + "time_per_iteration": 2.490360975265503 + }, + { + "auxiliary_loss_clip": 0.06551544, + "auxiliary_loss_mlp": 0.01279954, + "balance_loss_clip": 0.06302837, + "balance_loss_mlp": 0.01256768, + "epoch": 0.20483992183977154, + "flos": 11405018609280.0, + "grad_norm": 8.923539759508978, + "language_loss": 0.69000643, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.76832145, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23193359, + "step": 3407, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06549555, + "auxiliary_loss_mlp": 0.01280964, + "balance_loss_clip": 0.06303824, + "balance_loss_mlp": 0.01258374, + "epoch": 0.2049000450924395, + "flos": 19065917704320.0, + "grad_norm": 2.112423962078429, + "language_loss": 0.85367447, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93197966, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.22583008, + "step": 3408, + "time_per_iteration": 2.5491087436676025 + }, + { + "auxiliary_loss_clip": 0.06564584, + "auxiliary_loss_mlp": 0.0128728, + "balance_loss_clip": 0.06310433, + "balance_loss_mlp": 0.0126314, + "epoch": 0.20496016834510747, + "flos": 14579799557760.0, + "grad_norm": 2.4221013711544876, + "language_loss": 0.65169537, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.730214, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24121094, + "step": 3409, + "time_per_iteration": 2.5570828914642334 + }, + { + "auxiliary_loss_clip": 0.06553619, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06302843, + "balance_loss_mlp": 0.01259029, + "epoch": 0.20502029159777543, + "flos": 22133069683200.0, + "grad_norm": 1.5677004994493864, + "language_loss": 0.81331646, + "learning_rate": 3.687180946553745e-06, + "loss": 0.89167136, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.22851562, + "step": 3410, + "time_per_iteration": 3.9941341876983643 + }, + { + "auxiliary_loss_clip": 0.06562116, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 0.06316169, + "balance_loss_mlp": 0.01256252, + "epoch": 0.2050804148504434, + "flos": 25373873249280.0, + "grad_norm": 2.231323409005704, + "language_loss": 0.76898587, + "learning_rate": 3.686971778678803e-06, + "loss": 0.84738749, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21801758, + "step": 3411, + "time_per_iteration": 2.557502031326294 + }, + { + "auxiliary_loss_clip": 0.06566584, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 0.06318649, + "balance_loss_mlp": 0.01260567, + "epoch": 0.2051405381031114, + "flos": 23626443876480.0, + "grad_norm": 1.9814328821552187, + "language_loss": 0.73997778, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81847459, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.22521973, + "step": 3412, + "time_per_iteration": 4.038960695266724 + }, + { + "auxiliary_loss_clip": 0.06568237, + "auxiliary_loss_mlp": 0.01280941, + "balance_loss_clip": 0.06316938, + "balance_loss_mlp": 0.01257183, + "epoch": 0.20520066135577936, + "flos": 19570338483840.0, + "grad_norm": 2.4438525241528963, + "language_loss": 0.79063112, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.86912292, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23754883, + "step": 3413, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.0655475, + "auxiliary_loss_mlp": 0.0128187, + "balance_loss_clip": 0.06315412, + "balance_loss_mlp": 0.01259423, + "epoch": 0.20526078460844732, + "flos": 17682184978560.0, + "grad_norm": 1.8594099787920526, + "language_loss": 0.85324407, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.93161035, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2244873, + "step": 3414, + "time_per_iteration": 2.51891827583313 + }, + { + "auxiliary_loss_clip": 0.06556672, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 0.0631127, + "balance_loss_mlp": 0.01261451, + "epoch": 0.2053209078611153, + "flos": 21505632710400.0, + "grad_norm": 1.8989416463636506, + "language_loss": 0.8139196, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.89232612, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22521973, + "step": 3415, + "time_per_iteration": 2.534064769744873 + }, + { + "auxiliary_loss_clip": 0.06545444, + "auxiliary_loss_mlp": 0.01280017, + "balance_loss_clip": 0.06300274, + "balance_loss_mlp": 0.01259048, + "epoch": 0.20538103111378325, + "flos": 25670163686400.0, + "grad_norm": 1.9272907146050138, + "language_loss": 0.73450923, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.81276381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.20959473, + "step": 3416, + "time_per_iteration": 2.5862622261047363 + }, + { + "auxiliary_loss_clip": 0.06555279, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06309061, + "balance_loss_mlp": 0.01256342, + "epoch": 0.20544115436645122, + "flos": 23155663311360.0, + "grad_norm": 3.21470343355828, + "language_loss": 0.79731691, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87565553, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.22253418, + "step": 3417, + "time_per_iteration": 2.5488288402557373 + }, + { + "auxiliary_loss_clip": 0.06553051, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06304517, + "balance_loss_mlp": 0.01258248, + "epoch": 0.20550127761911918, + "flos": 19396435334400.0, + "grad_norm": 3.2012221600430744, + "language_loss": 0.88593423, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96428442, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23681641, + "step": 3418, + "time_per_iteration": 5.385840177536011 + }, + { + "auxiliary_loss_clip": 0.06553373, + "auxiliary_loss_mlp": 0.01284895, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.0125998, + "epoch": 0.20556140087178718, + "flos": 22899721415040.0, + "grad_norm": 2.325256215928591, + "language_loss": 0.63040721, + "learning_rate": 3.685296133421035e-06, + "loss": 0.70878994, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.24926758, + "step": 3419, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.06563735, + "auxiliary_loss_mlp": 0.01291649, + "balance_loss_clip": 0.06310479, + "balance_loss_mlp": 0.01265554, + "epoch": 0.20562152412445514, + "flos": 19795365423360.0, + "grad_norm": 1.7732270709951168, + "language_loss": 0.86988509, + "learning_rate": 3.685086390100674e-06, + "loss": 0.948439, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26098633, + "step": 3420, + "time_per_iteration": 2.5364928245544434 + }, + { + "auxiliary_loss_clip": 0.06546585, + "auxiliary_loss_mlp": 0.01284653, + "balance_loss_clip": 0.0630153, + "balance_loss_mlp": 0.01261109, + "epoch": 0.2056816473771231, + "flos": 31509728507520.0, + "grad_norm": 10.333340616962191, + "language_loss": 0.71886712, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79717946, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.2355957, + "step": 3421, + "time_per_iteration": 2.6350786685943604 + }, + { + "auxiliary_loss_clip": 0.06544094, + "auxiliary_loss_mlp": 0.01288814, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0126564, + "epoch": 0.20574177062979107, + "flos": 23265095143680.0, + "grad_norm": 2.122387036588777, + "language_loss": 0.72175372, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.8000828, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23168945, + "step": 3422, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.06409879, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.01263078, + "epoch": 0.20580189388245904, + "flos": 70331124291840.0, + "grad_norm": 0.7131964126658911, + "language_loss": 0.551377, + "learning_rate": 3.684456776779548e-06, + "loss": 0.62817442, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06799316, + "step": 3423, + "time_per_iteration": 3.2106337547302246 + }, + { + "auxiliary_loss_clip": 0.06548166, + "auxiliary_loss_mlp": 0.0128448, + "balance_loss_clip": 0.06301543, + "balance_loss_mlp": 0.01261091, + "epoch": 0.205862017135127, + "flos": 30745802033280.0, + "grad_norm": 1.8660135712145316, + "language_loss": 0.72238076, + "learning_rate": 3.684246777912353e-06, + "loss": 0.80070728, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.23400879, + "step": 3424, + "time_per_iteration": 2.614389181137085 + }, + { + "auxiliary_loss_clip": 0.06544662, + "auxiliary_loss_mlp": 0.01287262, + "balance_loss_clip": 0.06303795, + "balance_loss_mlp": 0.01263229, + "epoch": 0.20592214038779497, + "flos": 21330932947200.0, + "grad_norm": 1.6926765615616197, + "language_loss": 0.75646138, + "learning_rate": 3.684036715178351e-06, + "loss": 0.83478063, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.24023438, + "step": 3425, + "time_per_iteration": 2.5351436138153076 + }, + { + "auxiliary_loss_clip": 0.06546403, + "auxiliary_loss_mlp": 0.01289796, + "balance_loss_clip": 0.06304145, + "balance_loss_mlp": 0.01266813, + "epoch": 0.20598226364046296, + "flos": 22898002406400.0, + "grad_norm": 1.848184132977354, + "language_loss": 0.88618112, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9645431, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22998047, + "step": 3426, + "time_per_iteration": 2.604752779006958 + }, + { + "auxiliary_loss_clip": 0.06551787, + "auxiliary_loss_mlp": 0.01284615, + "balance_loss_clip": 0.06311674, + "balance_loss_mlp": 0.01261226, + "epoch": 0.20604238689313092, + "flos": 23885362592640.0, + "grad_norm": 1.5517486951437824, + "language_loss": 0.77144063, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8498047, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.23376465, + "step": 3427, + "time_per_iteration": 2.5526115894317627 + }, + { + "auxiliary_loss_clip": 0.06556956, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 0.06309945, + "balance_loss_mlp": 0.01264661, + "epoch": 0.2061025101457989, + "flos": 22498024141440.0, + "grad_norm": 1.8896972045039995, + "language_loss": 0.74443614, + "learning_rate": 3.683406143855174e-06, + "loss": 0.822878, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3428, + "time_per_iteration": 2.5644474029541016 + }, + { + "auxiliary_loss_clip": 0.06552382, + "auxiliary_loss_mlp": 0.01283805, + "balance_loss_clip": 0.06304047, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20616263339846685, + "flos": 22784713286400.0, + "grad_norm": 1.96097325322206, + "language_loss": 0.74164659, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.82000846, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24377441, + "step": 3429, + "time_per_iteration": 2.5337913036346436 + }, + { + "auxiliary_loss_clip": 0.06551956, + "auxiliary_loss_mlp": 0.01286455, + "balance_loss_clip": 0.06304303, + "balance_loss_mlp": 0.01263126, + "epoch": 0.20622275665113482, + "flos": 20887755102720.0, + "grad_norm": 2.9642283368918863, + "language_loss": 0.86220586, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.94058996, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.23327637, + "step": 3430, + "time_per_iteration": 2.5939443111419678 + }, + { + "auxiliary_loss_clip": 0.06546243, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06300765, + "balance_loss_mlp": 0.01257607, + "epoch": 0.20628287990380278, + "flos": 19360489132800.0, + "grad_norm": 1.6588894263331828, + "language_loss": 0.70011377, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77838504, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.23278809, + "step": 3431, + "time_per_iteration": 2.565840482711792 + }, + { + "auxiliary_loss_clip": 0.06410907, + "auxiliary_loss_mlp": 0.0126731, + "balance_loss_clip": 0.06280327, + "balance_loss_mlp": 0.01261215, + "epoch": 0.20634300315647078, + "flos": 71536970799360.0, + "grad_norm": 0.791675242165557, + "language_loss": 0.60400987, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.68079197, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0609436, + "step": 3432, + "time_per_iteration": 3.305082082748413 + }, + { + "auxiliary_loss_clip": 0.06552991, + "auxiliary_loss_mlp": 0.01280414, + "balance_loss_clip": 0.06308176, + "balance_loss_mlp": 0.01257561, + "epoch": 0.20640312640913874, + "flos": 21730072671360.0, + "grad_norm": 1.5897016059046762, + "language_loss": 0.72477019, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80310422, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.22875977, + "step": 3433, + "time_per_iteration": 2.564393997192383 + }, + { + "auxiliary_loss_clip": 0.06561184, + "auxiliary_loss_mlp": 0.01281531, + "balance_loss_clip": 0.06312474, + "balance_loss_mlp": 0.01258512, + "epoch": 0.2064632496618067, + "flos": 20560256219520.0, + "grad_norm": 1.7877531320590552, + "language_loss": 0.87141019, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.94983733, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.23010254, + "step": 3434, + "time_per_iteration": 2.5466108322143555 + }, + { + "auxiliary_loss_clip": 0.06556005, + "auxiliary_loss_mlp": 0.01283316, + "balance_loss_clip": 0.06305495, + "balance_loss_mlp": 0.01259427, + "epoch": 0.20652337291447467, + "flos": 29830669666560.0, + "grad_norm": 1.6526860814470912, + "language_loss": 0.6970489, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77544212, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2388916, + "step": 3435, + "time_per_iteration": 2.613896369934082 + }, + { + "auxiliary_loss_clip": 0.06545977, + "auxiliary_loss_mlp": 0.01289312, + "balance_loss_clip": 0.0630382, + "balance_loss_mlp": 0.01264325, + "epoch": 0.20658349616714264, + "flos": 26220844719360.0, + "grad_norm": 1.7674379542335852, + "language_loss": 0.89957321, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97792608, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.24975586, + "step": 3436, + "time_per_iteration": 2.590360641479492 + }, + { + "auxiliary_loss_clip": 0.06548543, + "auxiliary_loss_mlp": 0.01277538, + "balance_loss_clip": 0.06303848, + "balance_loss_mlp": 0.01254209, + "epoch": 0.2066436194198106, + "flos": 26001477930240.0, + "grad_norm": 1.7140409089026185, + "language_loss": 0.77244872, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.8507095, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.23339844, + "step": 3437, + "time_per_iteration": 2.6068568229675293 + }, + { + "auxiliary_loss_clip": 0.06548648, + "auxiliary_loss_mlp": 0.01280201, + "balance_loss_clip": 0.06300757, + "balance_loss_mlp": 0.01257682, + "epoch": 0.20670374267247857, + "flos": 21367466127360.0, + "grad_norm": 2.0146667208247355, + "language_loss": 0.78725338, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86554188, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.22521973, + "step": 3438, + "time_per_iteration": 2.567963123321533 + }, + { + "auxiliary_loss_clip": 0.06407821, + "auxiliary_loss_mlp": 0.01263014, + "balance_loss_clip": 0.06278364, + "balance_loss_mlp": 0.01257164, + "epoch": 0.20676386592514656, + "flos": 66403108264320.0, + "grad_norm": 0.8029327028802032, + "language_loss": 0.66817588, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.74488425, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05844116, + "step": 3439, + "time_per_iteration": 3.1231849193573 + }, + { + "auxiliary_loss_clip": 0.06557775, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 0.06302103, + "balance_loss_mlp": 0.01260423, + "epoch": 0.20682398917781453, + "flos": 17280278069760.0, + "grad_norm": 1.9287299109512155, + "language_loss": 0.8404541, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.91886795, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23168945, + "step": 3440, + "time_per_iteration": 2.496563196182251 + }, + { + "auxiliary_loss_clip": 0.06545421, + "auxiliary_loss_mlp": 0.01282262, + "balance_loss_clip": 0.06298509, + "balance_loss_mlp": 0.0126028, + "epoch": 0.2068841124304825, + "flos": 18083127565440.0, + "grad_norm": 3.100665935871663, + "language_loss": 0.85299611, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93127292, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2199707, + "step": 3441, + "time_per_iteration": 2.528823137283325 + }, + { + "auxiliary_loss_clip": 0.06546343, + "auxiliary_loss_mlp": 0.01282668, + "balance_loss_clip": 0.06300771, + "balance_loss_mlp": 0.01258958, + "epoch": 0.20694423568315046, + "flos": 27354798823680.0, + "grad_norm": 1.6487564578537555, + "language_loss": 0.86298448, + "learning_rate": 3.680455884806959e-06, + "loss": 0.94127464, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.23693848, + "step": 3442, + "time_per_iteration": 2.5904433727264404 + }, + { + "auxiliary_loss_clip": 0.06553168, + "auxiliary_loss_mlp": 0.0128107, + "balance_loss_clip": 0.06302296, + "balance_loss_mlp": 0.01256298, + "epoch": 0.20700435893581842, + "flos": 20236027645440.0, + "grad_norm": 1.991917549605425, + "language_loss": 0.74110967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81945205, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.24755859, + "step": 3443, + "time_per_iteration": 2.546297311782837 + }, + { + "auxiliary_loss_clip": 0.06540793, + "auxiliary_loss_mlp": 0.01282, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2070644821884864, + "flos": 20637347575680.0, + "grad_norm": 5.522598582225395, + "language_loss": 0.86263227, + "learning_rate": 3.680033399147797e-06, + "loss": 0.94086015, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22814941, + "step": 3444, + "time_per_iteration": 2.5644776821136475 + }, + { + "auxiliary_loss_clip": 0.06396829, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06267206, + "balance_loss_mlp": 0.01264399, + "epoch": 0.20712460544115438, + "flos": 65960098128000.0, + "grad_norm": 0.6752802627643808, + "language_loss": 0.56895542, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.64562953, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.06185913, + "step": 3445, + "time_per_iteration": 3.133159637451172 + }, + { + "auxiliary_loss_clip": 0.06550106, + "auxiliary_loss_mlp": 0.0128273, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01259412, + "epoch": 0.20718472869382235, + "flos": 19431542995200.0, + "grad_norm": 1.845349461285762, + "language_loss": 0.78388685, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.86221522, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.23327637, + "step": 3446, + "time_per_iteration": 2.5563149452209473 + }, + { + "auxiliary_loss_clip": 0.06562304, + "auxiliary_loss_mlp": 0.0128875, + "balance_loss_clip": 0.06302087, + "balance_loss_mlp": 0.01263215, + "epoch": 0.2072448519464903, + "flos": 24506007384960.0, + "grad_norm": 2.528724295630225, + "language_loss": 0.63215572, + "learning_rate": 3.679399192876334e-06, + "loss": 0.7106663, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 3447, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.06550243, + "auxiliary_loss_mlp": 0.01285454, + "balance_loss_clip": 0.06302016, + "balance_loss_mlp": 0.01261624, + "epoch": 0.20730497519915828, + "flos": 23082345388800.0, + "grad_norm": 1.7246458475869415, + "language_loss": 0.87330115, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95165813, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.23840332, + "step": 3448, + "time_per_iteration": 2.5367424488067627 + }, + { + "auxiliary_loss_clip": 0.06547908, + "auxiliary_loss_mlp": 0.01287375, + "balance_loss_clip": 0.06301224, + "balance_loss_mlp": 0.0126407, + "epoch": 0.20736509845182624, + "flos": 21075368394240.0, + "grad_norm": 2.238353970842136, + "language_loss": 0.75934261, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.83769548, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23291016, + "step": 3449, + "time_per_iteration": 3.94480562210083 + }, + { + "auxiliary_loss_clip": 0.06557415, + "auxiliary_loss_mlp": 0.01291462, + "balance_loss_clip": 0.06305711, + "balance_loss_mlp": 0.01267262, + "epoch": 0.2074252217044942, + "flos": 17638021077120.0, + "grad_norm": 1.9890451191355467, + "language_loss": 0.77508813, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.8535769, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.24243164, + "step": 3450, + "time_per_iteration": 2.545430898666382 + }, + { + "auxiliary_loss_clip": 0.06561074, + "auxiliary_loss_mlp": 0.01294493, + "balance_loss_clip": 0.06309673, + "balance_loss_mlp": 0.01270579, + "epoch": 0.20748534495716217, + "flos": 23553209808000.0, + "grad_norm": 2.274256725147599, + "language_loss": 0.823879, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90243471, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.23913574, + "step": 3451, + "time_per_iteration": 4.003388404846191 + }, + { + "auxiliary_loss_clip": 0.0640305, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 0.06273949, + "balance_loss_mlp": 0.01248494, + "epoch": 0.20754546820983016, + "flos": 52268666757120.0, + "grad_norm": 0.7675919354914552, + "language_loss": 0.56549037, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.64206523, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.05941772, + "step": 3452, + "time_per_iteration": 3.0660083293914795 + }, + { + "auxiliary_loss_clip": 0.06557937, + "auxiliary_loss_mlp": 0.01287582, + "balance_loss_clip": 0.06309802, + "balance_loss_mlp": 0.01264956, + "epoch": 0.20760559146249813, + "flos": 20418609692160.0, + "grad_norm": 1.8872949255610445, + "language_loss": 0.88967919, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9681344, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.22619629, + "step": 3453, + "time_per_iteration": 2.581430673599243 + }, + { + "auxiliary_loss_clip": 0.06554953, + "auxiliary_loss_mlp": 0.01287205, + "balance_loss_clip": 0.06307904, + "balance_loss_mlp": 0.01263256, + "epoch": 0.2076657147151661, + "flos": 23192825397120.0, + "grad_norm": 1.4776896143180385, + "language_loss": 0.80720532, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88562691, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23962402, + "step": 3454, + "time_per_iteration": 2.5793018341064453 + }, + { + "auxiliary_loss_clip": 0.06549348, + "auxiliary_loss_mlp": 0.01286388, + "balance_loss_clip": 0.06301847, + "balance_loss_mlp": 0.01263476, + "epoch": 0.20772583796783406, + "flos": 18298595139840.0, + "grad_norm": 4.241833159654324, + "language_loss": 0.78446364, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.86282104, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.22912598, + "step": 3455, + "time_per_iteration": 2.5377535820007324 + }, + { + "auxiliary_loss_clip": 0.0654678, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06301546, + "balance_loss_mlp": 0.01256547, + "epoch": 0.20778596122050202, + "flos": 17608531639680.0, + "grad_norm": 1.6321737814924744, + "language_loss": 0.81251496, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.89077407, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22595215, + "step": 3456, + "time_per_iteration": 2.5125768184661865 + }, + { + "auxiliary_loss_clip": 0.06554688, + "auxiliary_loss_mlp": 0.01282924, + "balance_loss_clip": 0.06304802, + "balance_loss_mlp": 0.01259893, + "epoch": 0.20784608447317, + "flos": 23812380086400.0, + "grad_norm": 2.3276439316102695, + "language_loss": 0.79071975, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.86909586, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.23022461, + "step": 3457, + "time_per_iteration": 5.41590428352356 + }, + { + "auxiliary_loss_clip": 0.06553855, + "auxiliary_loss_mlp": 0.01279092, + "balance_loss_clip": 0.0630386, + "balance_loss_mlp": 0.01255739, + "epoch": 0.20790620772583795, + "flos": 17645022892800.0, + "grad_norm": 1.9963286729709264, + "language_loss": 0.84664595, + "learning_rate": 3.677068867939333e-06, + "loss": 0.9249754, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.23364258, + "step": 3458, + "time_per_iteration": 2.610107183456421 + }, + { + "auxiliary_loss_clip": 0.06541788, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06299603, + "balance_loss_mlp": 0.01254289, + "epoch": 0.20796633097850595, + "flos": 27680997968640.0, + "grad_norm": 1.7522329071194311, + "language_loss": 0.76853168, + "learning_rate": 3.676856638489272e-06, + "loss": 0.8467201, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.2277832, + "step": 3459, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.06543219, + "auxiliary_loss_mlp": 0.01279579, + "balance_loss_clip": 0.06299554, + "balance_loss_mlp": 0.01257024, + "epoch": 0.2080264542311739, + "flos": 19251770060160.0, + "grad_norm": 1.8057193688460893, + "language_loss": 0.77803749, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.85626543, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22570801, + "step": 3460, + "time_per_iteration": 2.5500359535217285 + }, + { + "auxiliary_loss_clip": 0.06544735, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06297737, + "balance_loss_mlp": 0.01255315, + "epoch": 0.20808657748384188, + "flos": 27533146239360.0, + "grad_norm": 1.865214089074118, + "language_loss": 0.76152873, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.8397454, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21618652, + "step": 3461, + "time_per_iteration": 2.575975179672241 + }, + { + "auxiliary_loss_clip": 0.06554922, + "auxiliary_loss_mlp": 0.01279751, + "balance_loss_clip": 0.06301013, + "balance_loss_mlp": 0.01256183, + "epoch": 0.20814670073650984, + "flos": 26914262382720.0, + "grad_norm": 2.229402903272821, + "language_loss": 0.89438462, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97273135, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.23571777, + "step": 3462, + "time_per_iteration": 2.5732173919677734 + }, + { + "auxiliary_loss_clip": 0.06402825, + "auxiliary_loss_mlp": 0.01283843, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01278395, + "epoch": 0.2082068239891778, + "flos": 70195850674560.0, + "grad_norm": 0.9150130859854356, + "language_loss": 0.59001637, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.66688299, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.05456543, + "step": 3463, + "time_per_iteration": 3.269202709197998 + }, + { + "auxiliary_loss_clip": 0.06550549, + "auxiliary_loss_mlp": 0.01282784, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01257929, + "epoch": 0.20826694724184577, + "flos": 24614978019840.0, + "grad_norm": 2.6522237220698663, + "language_loss": 0.66949397, + "learning_rate": 3.675794537601429e-06, + "loss": 0.74782729, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.2487793, + "step": 3464, + "time_per_iteration": 2.5638158321380615 + }, + { + "auxiliary_loss_clip": 0.06556059, + "auxiliary_loss_mlp": 0.01287892, + "balance_loss_clip": 0.06307128, + "balance_loss_mlp": 0.01263299, + "epoch": 0.20832707049451377, + "flos": 12897218845440.0, + "grad_norm": 2.2476817474527913, + "language_loss": 0.84321886, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.9216584, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.24609375, + "step": 3465, + "time_per_iteration": 2.5794646739959717 + }, + { + "auxiliary_loss_clip": 0.06542073, + "auxiliary_loss_mlp": 0.01282156, + "balance_loss_clip": 0.06295872, + "balance_loss_mlp": 0.01258326, + "epoch": 0.20838719374718173, + "flos": 22205129794560.0, + "grad_norm": 3.281235222185926, + "language_loss": 0.82741451, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.90565681, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.23828125, + "step": 3466, + "time_per_iteration": 2.540011405944824 + }, + { + "auxiliary_loss_clip": 0.06540319, + "auxiliary_loss_mlp": 0.01287937, + "balance_loss_clip": 0.06300111, + "balance_loss_mlp": 0.01267243, + "epoch": 0.2084473169998497, + "flos": 15164036951040.0, + "grad_norm": 2.490655035944783, + "language_loss": 0.82892549, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90720803, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.20690918, + "step": 3467, + "time_per_iteration": 2.54622745513916 + }, + { + "auxiliary_loss_clip": 0.06540733, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 0.06303266, + "balance_loss_mlp": 0.01268167, + "epoch": 0.20850744025251766, + "flos": 17462482773120.0, + "grad_norm": 1.8114532422505003, + "language_loss": 0.82299387, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90129268, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2097168, + "step": 3468, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06553383, + "auxiliary_loss_mlp": 0.01280357, + "balance_loss_clip": 0.06302625, + "balance_loss_mlp": 0.01257158, + "epoch": 0.20856756350518563, + "flos": 25705439055360.0, + "grad_norm": 1.667306072143411, + "language_loss": 0.9042781, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98261553, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.23217773, + "step": 3469, + "time_per_iteration": 2.6107866764068604 + }, + { + "auxiliary_loss_clip": 0.0655106, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 0.06308927, + "balance_loss_mlp": 0.01259872, + "epoch": 0.2086276867578536, + "flos": 37898213425920.0, + "grad_norm": 1.9476878714472061, + "language_loss": 0.77294397, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85127008, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21679688, + "step": 3470, + "time_per_iteration": 2.7083425521850586 + }, + { + "auxiliary_loss_clip": 0.06547298, + "auxiliary_loss_mlp": 0.01289218, + "balance_loss_clip": 0.06307482, + "balance_loss_mlp": 0.01266283, + "epoch": 0.20868781001052156, + "flos": 25564169871360.0, + "grad_norm": 1.8036684586339249, + "language_loss": 0.76289082, + "learning_rate": 3.674304927640011e-06, + "loss": 0.84125602, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.22937012, + "step": 3471, + "time_per_iteration": 2.589884042739868 + }, + { + "auxiliary_loss_clip": 0.06554438, + "auxiliary_loss_mlp": 0.01280867, + "balance_loss_clip": 0.06303854, + "balance_loss_mlp": 0.01259028, + "epoch": 0.20874793326318955, + "flos": 27536961600000.0, + "grad_norm": 1.6381609540737498, + "language_loss": 0.76341867, + "learning_rate": 3.67409187219312e-06, + "loss": 0.84177172, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.21813965, + "step": 3472, + "time_per_iteration": 2.610260009765625 + }, + { + "auxiliary_loss_clip": 0.06544036, + "auxiliary_loss_mlp": 0.01279562, + "balance_loss_clip": 0.06302247, + "balance_loss_mlp": 0.01259022, + "epoch": 0.20880805651585752, + "flos": 18554243546880.0, + "grad_norm": 2.073955911698539, + "language_loss": 0.85418117, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93241715, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.20532227, + "step": 3473, + "time_per_iteration": 2.5741372108459473 + }, + { + "auxiliary_loss_clip": 0.06431094, + "auxiliary_loss_mlp": 0.01255526, + "balance_loss_clip": 0.06305239, + "balance_loss_mlp": 0.01250132, + "epoch": 0.20886817976852548, + "flos": 65966596819200.0, + "grad_norm": 0.8661888314681573, + "language_loss": 0.63746876, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.71433502, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.05401611, + "step": 3474, + "time_per_iteration": 3.061617612838745 + }, + { + "auxiliary_loss_clip": 0.06545534, + "auxiliary_loss_mlp": 0.01278543, + "balance_loss_clip": 0.06299987, + "balance_loss_mlp": 0.01255751, + "epoch": 0.20892830302119345, + "flos": 36548120914560.0, + "grad_norm": 1.9594452651536962, + "language_loss": 0.70746702, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.78570777, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.22790527, + "step": 3475, + "time_per_iteration": 2.7295854091644287 + }, + { + "auxiliary_loss_clip": 0.06544538, + "auxiliary_loss_mlp": 0.01277403, + "balance_loss_clip": 0.06299123, + "balance_loss_mlp": 0.01255754, + "epoch": 0.2089884262738614, + "flos": 20962582398720.0, + "grad_norm": 1.6086426160627472, + "language_loss": 0.70801485, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.21643066, + "step": 3476, + "time_per_iteration": 2.6065874099731445 + }, + { + "auxiliary_loss_clip": 0.06538086, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06299278, + "balance_loss_mlp": 0.0125523, + "epoch": 0.20904854952652938, + "flos": 22790666926080.0, + "grad_norm": 1.9785394209574967, + "language_loss": 0.90003526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9781692, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.20080566, + "step": 3477, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.06542666, + "auxiliary_loss_mlp": 0.01278801, + "balance_loss_clip": 0.06302647, + "balance_loss_mlp": 0.01257594, + "epoch": 0.20910867277919734, + "flos": 27309838308480.0, + "grad_norm": 2.554960999675803, + "language_loss": 0.69433093, + "learning_rate": 3.672812206678344e-06, + "loss": 0.77254558, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.21203613, + "step": 3478, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.0654031, + "auxiliary_loss_mlp": 0.01282288, + "balance_loss_clip": 0.06298592, + "balance_loss_mlp": 0.01260461, + "epoch": 0.20916879603186533, + "flos": 14324444640000.0, + "grad_norm": 1.9959140715838508, + "language_loss": 0.85550553, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93373156, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21813965, + "step": 3479, + "time_per_iteration": 2.5808637142181396 + }, + { + "auxiliary_loss_clip": 0.06542581, + "auxiliary_loss_mlp": 0.01279649, + "balance_loss_clip": 0.06299447, + "balance_loss_mlp": 0.01258072, + "epoch": 0.2092289192845333, + "flos": 22279537820160.0, + "grad_norm": 2.3833241848820372, + "language_loss": 0.75129831, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.82952058, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21569824, + "step": 3480, + "time_per_iteration": 2.519789218902588 + }, + { + "auxiliary_loss_clip": 0.06546038, + "auxiliary_loss_mlp": 0.01278892, + "balance_loss_clip": 0.06306421, + "balance_loss_mlp": 0.01258495, + "epoch": 0.20928904253720126, + "flos": 14836118797440.0, + "grad_norm": 2.1621149118450163, + "language_loss": 0.7689389, + "learning_rate": 3.67217151746346e-06, + "loss": 0.84718817, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20410156, + "step": 3481, + "time_per_iteration": 2.541019916534424 + }, + { + "auxiliary_loss_clip": 0.06542054, + "auxiliary_loss_mlp": 0.01279748, + "balance_loss_clip": 0.06299154, + "balance_loss_mlp": 0.01257718, + "epoch": 0.20934916578986923, + "flos": 23266017538560.0, + "grad_norm": 1.9029543431357738, + "language_loss": 0.85756385, + "learning_rate": 3.671957827563209e-06, + "loss": 0.93578184, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.22021484, + "step": 3482, + "time_per_iteration": 2.57550048828125 + }, + { + "auxiliary_loss_clip": 0.06538534, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 0.0629866, + "balance_loss_mlp": 0.01260237, + "epoch": 0.2094092890425372, + "flos": 32022492768000.0, + "grad_norm": 2.0122422455266076, + "language_loss": 0.71876764, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.79696846, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.21325684, + "step": 3483, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.06543796, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125567, + "epoch": 0.20946941229520516, + "flos": 20016744710400.0, + "grad_norm": 1.623254768822543, + "language_loss": 0.75620067, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.83441281, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.21728516, + "step": 3484, + "time_per_iteration": 2.537745714187622 + }, + { + "auxiliary_loss_clip": 0.06537648, + "auxiliary_loss_mlp": 0.01274667, + "balance_loss_clip": 0.0629506, + "balance_loss_mlp": 0.01252733, + "epoch": 0.20952953554787315, + "flos": 30748401509760.0, + "grad_norm": 1.6710062021876058, + "language_loss": 0.71473777, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.79286093, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21936035, + "step": 3485, + "time_per_iteration": 2.6310439109802246 + }, + { + "auxiliary_loss_clip": 0.0654947, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06304678, + "balance_loss_mlp": 0.01258517, + "epoch": 0.20958965880054112, + "flos": 27055950837120.0, + "grad_norm": 1.7793136829828902, + "language_loss": 0.83105123, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90936482, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.23376465, + "step": 3486, + "time_per_iteration": 2.5819222927093506 + }, + { + "auxiliary_loss_clip": 0.06539689, + "auxiliary_loss_mlp": 0.01279221, + "balance_loss_clip": 0.06297638, + "balance_loss_mlp": 0.01257978, + "epoch": 0.20964978205320908, + "flos": 34212680714880.0, + "grad_norm": 2.582218695391969, + "language_loss": 0.87821579, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.95640486, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21240234, + "step": 3487, + "time_per_iteration": 2.639369487762451 + }, + { + "auxiliary_loss_clip": 0.06538714, + "auxiliary_loss_mlp": 0.01279661, + "balance_loss_clip": 0.06298582, + "balance_loss_mlp": 0.01258227, + "epoch": 0.20970990530587705, + "flos": 23484168443520.0, + "grad_norm": 2.287931950731532, + "language_loss": 0.72719586, + "learning_rate": 3.670674357028504e-06, + "loss": 0.80537963, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21411133, + "step": 3488, + "time_per_iteration": 3.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.06540683, + "auxiliary_loss_mlp": 0.01275293, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01255123, + "epoch": 0.209770028558545, + "flos": 18557346147840.0, + "grad_norm": 2.67396224290917, + "language_loss": 0.81189376, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89005351, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20178223, + "step": 3489, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.0654545, + "auxiliary_loss_mlp": 0.01278304, + "balance_loss_clip": 0.06303608, + "balance_loss_mlp": 0.0125724, + "epoch": 0.20983015181121298, + "flos": 21623533804800.0, + "grad_norm": 2.0567102060198743, + "language_loss": 0.73407692, + "learning_rate": 3.670246026613266e-06, + "loss": 0.81231445, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21057129, + "step": 3490, + "time_per_iteration": 2.5622947216033936 + }, + { + "auxiliary_loss_clip": 0.06534347, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 0.06300151, + "balance_loss_mlp": 0.01260128, + "epoch": 0.20989027506388094, + "flos": 16619787861120.0, + "grad_norm": 1.7677892351641744, + "language_loss": 0.71503973, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.7931931, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20849609, + "step": 3491, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.06542461, + "auxiliary_loss_mlp": 0.01283797, + "balance_loss_clip": 0.0629908, + "balance_loss_mlp": 0.01260957, + "epoch": 0.20995039831654894, + "flos": 23222692177920.0, + "grad_norm": 2.702657778988086, + "language_loss": 0.80329478, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88155735, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22839355, + "step": 3492, + "time_per_iteration": 2.5376975536346436 + }, + { + "auxiliary_loss_clip": 0.06546506, + "auxiliary_loss_mlp": 0.01283519, + "balance_loss_clip": 0.06307527, + "balance_loss_mlp": 0.01262741, + "epoch": 0.2100105215692169, + "flos": 18152881689600.0, + "grad_norm": 1.9319737068083613, + "language_loss": 0.87613726, + "learning_rate": 3.669603055991502e-06, + "loss": 0.95443749, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20800781, + "step": 3493, + "time_per_iteration": 2.5462660789489746 + }, + { + "auxiliary_loss_clip": 0.06538918, + "auxiliary_loss_mlp": 0.01283808, + "balance_loss_clip": 0.06303683, + "balance_loss_mlp": 0.01262673, + "epoch": 0.21007064482188487, + "flos": 15967179936000.0, + "grad_norm": 1.7380368048158776, + "language_loss": 0.69753766, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.77576494, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.21130371, + "step": 3494, + "time_per_iteration": 2.523575782775879 + }, + { + "auxiliary_loss_clip": 0.0654956, + "auxiliary_loss_mlp": 0.0128408, + "balance_loss_clip": 0.06306064, + "balance_loss_mlp": 0.01262598, + "epoch": 0.21013076807455283, + "flos": 32242614243840.0, + "grad_norm": 1.6795437076377473, + "language_loss": 0.79639518, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.87473154, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21472168, + "step": 3495, + "time_per_iteration": 2.679564952850342 + }, + { + "auxiliary_loss_clip": 0.06543255, + "auxiliary_loss_mlp": 0.01280683, + "balance_loss_clip": 0.06300748, + "balance_loss_mlp": 0.01258832, + "epoch": 0.2101908913272208, + "flos": 23703493305600.0, + "grad_norm": 2.110842443067005, + "language_loss": 0.77733672, + "learning_rate": 3.668959515566116e-06, + "loss": 0.85557616, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21862793, + "step": 3496, + "time_per_iteration": 2.5728261470794678 + }, + { + "auxiliary_loss_clip": 0.06546371, + "auxiliary_loss_mlp": 0.01280297, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257993, + "epoch": 0.21025101457988876, + "flos": 20381992657920.0, + "grad_norm": 2.1840810602746643, + "language_loss": 0.82214069, + "learning_rate": 3.668744875505915e-06, + "loss": 0.90040743, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22302246, + "step": 3497, + "time_per_iteration": 5.435751438140869 + }, + { + "auxiliary_loss_clip": 0.06554863, + "auxiliary_loss_mlp": 0.01281759, + "balance_loss_clip": 0.06307989, + "balance_loss_mlp": 0.01259205, + "epoch": 0.21031113783255675, + "flos": 25782740046720.0, + "grad_norm": 1.9653925911520136, + "language_loss": 0.68009126, + "learning_rate": 3.668530172166741e-06, + "loss": 0.75845742, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22558594, + "step": 3498, + "time_per_iteration": 2.6047511100769043 + }, + { + "auxiliary_loss_clip": 0.06550896, + "auxiliary_loss_mlp": 0.01291723, + "balance_loss_clip": 0.06304521, + "balance_loss_mlp": 0.01269789, + "epoch": 0.21037126108522472, + "flos": 22024769880960.0, + "grad_norm": 1.5964372308761317, + "language_loss": 0.81248403, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.89091027, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21948242, + "step": 3499, + "time_per_iteration": 2.5279107093811035 + }, + { + "auxiliary_loss_clip": 0.06537838, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 0.06300277, + "balance_loss_mlp": 0.01257911, + "epoch": 0.21043138433789269, + "flos": 25340861940480.0, + "grad_norm": 2.3111316875342274, + "language_loss": 0.78733355, + "learning_rate": 3.668100575684043e-06, + "loss": 0.86549306, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20214844, + "step": 3500, + "time_per_iteration": 2.5789358615875244 + }, + { + "auxiliary_loss_clip": 0.06548081, + "auxiliary_loss_mlp": 0.01281815, + "balance_loss_clip": 0.06307902, + "balance_loss_mlp": 0.01259809, + "epoch": 0.21049150759056065, + "flos": 25563708673920.0, + "grad_norm": 1.5222387073827752, + "language_loss": 0.74519855, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82349753, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.22021484, + "step": 3501, + "time_per_iteration": 2.5740344524383545 + }, + { + "auxiliary_loss_clip": 0.06532234, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06293183, + "balance_loss_mlp": 0.01258521, + "epoch": 0.21055163084322862, + "flos": 24501982389120.0, + "grad_norm": 1.5726278305934103, + "language_loss": 0.75732303, + "learning_rate": 3.667670726183183e-06, + "loss": 0.83544195, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.21142578, + "step": 3502, + "time_per_iteration": 2.564650535583496 + }, + { + "auxiliary_loss_clip": 0.06532737, + "auxiliary_loss_mlp": 0.01282141, + "balance_loss_clip": 0.06294994, + "balance_loss_mlp": 0.01260731, + "epoch": 0.21061175409589658, + "flos": 25746123012480.0, + "grad_norm": 2.0578640076956165, + "language_loss": 0.78642297, + "learning_rate": 3.667455706571316e-06, + "loss": 0.86457181, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.21411133, + "step": 3503, + "time_per_iteration": 2.5651087760925293 + }, + { + "auxiliary_loss_clip": 0.06548393, + "auxiliary_loss_mlp": 0.01287579, + "balance_loss_clip": 0.06300595, + "balance_loss_mlp": 0.01262426, + "epoch": 0.21067187734856455, + "flos": 18995115404160.0, + "grad_norm": 2.3829290271278363, + "language_loss": 0.79109055, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.86945021, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.25134277, + "step": 3504, + "time_per_iteration": 2.5907576084136963 + }, + { + "auxiliary_loss_clip": 0.06540846, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 0.06295908, + "balance_loss_mlp": 0.012561, + "epoch": 0.21073200060123254, + "flos": 24688337869440.0, + "grad_norm": 2.6276986020802386, + "language_loss": 0.77414715, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.85233212, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.21557617, + "step": 3505, + "time_per_iteration": 2.564504861831665 + }, + { + "auxiliary_loss_clip": 0.06529057, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06294015, + "balance_loss_mlp": 0.01257186, + "epoch": 0.2107921238539005, + "flos": 28557039605760.0, + "grad_norm": 2.0513581673642434, + "language_loss": 0.64351165, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.721578, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.20397949, + "step": 3506, + "time_per_iteration": 2.641390323638916 + }, + { + "auxiliary_loss_clip": 0.06535215, + "auxiliary_loss_mlp": 0.01278768, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01257656, + "epoch": 0.21085224710656847, + "flos": 25893094273920.0, + "grad_norm": 2.3889311598286436, + "language_loss": 0.82716179, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90530163, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21105957, + "step": 3507, + "time_per_iteration": 2.5718142986297607 + }, + { + "auxiliary_loss_clip": 0.06534198, + "auxiliary_loss_mlp": 0.01280018, + "balance_loss_clip": 0.06294642, + "balance_loss_mlp": 0.0125769, + "epoch": 0.21091237035923643, + "flos": 14981664539520.0, + "grad_norm": 1.9856074738329712, + "language_loss": 0.76547742, + "learning_rate": 3.666379660223824e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22338867, + "step": 3508, + "time_per_iteration": 2.5104117393493652 + }, + { + "auxiliary_loss_clip": 0.06543706, + "auxiliary_loss_mlp": 0.01282498, + "balance_loss_clip": 0.06299506, + "balance_loss_mlp": 0.01261159, + "epoch": 0.2109724936119044, + "flos": 16368080595840.0, + "grad_norm": 2.529935640705384, + "language_loss": 0.86242574, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.94068778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.21325684, + "step": 3509, + "time_per_iteration": 2.508370876312256 + }, + { + "auxiliary_loss_clip": 0.06541994, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 0.06295836, + "balance_loss_mlp": 0.01258679, + "epoch": 0.21103261686457236, + "flos": 31510315486080.0, + "grad_norm": 1.7053981088389916, + "language_loss": 0.68853724, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.76676404, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22009277, + "step": 3510, + "time_per_iteration": 2.6452746391296387 + }, + { + "auxiliary_loss_clip": 0.06542882, + "auxiliary_loss_mlp": 0.01284418, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01263259, + "epoch": 0.21109274011724033, + "flos": 27351360806400.0, + "grad_norm": 1.7932280077203222, + "language_loss": 0.7352736, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.8135466, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.21154785, + "step": 3511, + "time_per_iteration": 2.6538095474243164 + }, + { + "auxiliary_loss_clip": 0.06553793, + "auxiliary_loss_mlp": 0.01288613, + "balance_loss_clip": 0.06308056, + "balance_loss_mlp": 0.01265546, + "epoch": 0.21115286336990832, + "flos": 17825927857920.0, + "grad_norm": 2.4490749473958577, + "language_loss": 0.70309734, + "learning_rate": 3.665517685689794e-06, + "loss": 0.78152132, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.23071289, + "step": 3512, + "time_per_iteration": 2.5178020000457764 + }, + { + "auxiliary_loss_clip": 0.06542063, + "auxiliary_loss_mlp": 0.01280138, + "balance_loss_clip": 0.06299283, + "balance_loss_mlp": 0.01257739, + "epoch": 0.2112129866225763, + "flos": 27205228085760.0, + "grad_norm": 1.580176351931222, + "language_loss": 0.73930323, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81752527, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22412109, + "step": 3513, + "time_per_iteration": 2.62662410736084 + }, + { + "auxiliary_loss_clip": 0.06537203, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 0.06301522, + "balance_loss_mlp": 0.01260303, + "epoch": 0.21127310987524425, + "flos": 23737846279680.0, + "grad_norm": 1.7494748899805272, + "language_loss": 0.75353736, + "learning_rate": 3.665086319450502e-06, + "loss": 0.8317222, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20983887, + "step": 3514, + "time_per_iteration": 2.584502696990967 + }, + { + "auxiliary_loss_clip": 0.06546184, + "auxiliary_loss_mlp": 0.01281455, + "balance_loss_clip": 0.06301809, + "balance_loss_mlp": 0.01261309, + "epoch": 0.21133323312791222, + "flos": 18338356702080.0, + "grad_norm": 1.6761924057980855, + "language_loss": 0.77322358, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.85149997, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20141602, + "step": 3515, + "time_per_iteration": 2.552231550216675 + }, + { + "auxiliary_loss_clip": 0.06544478, + "auxiliary_loss_mlp": 0.0128088, + "balance_loss_clip": 0.06304052, + "balance_loss_mlp": 0.01260865, + "epoch": 0.21139335638058018, + "flos": 17936994844800.0, + "grad_norm": 2.0687526262765212, + "language_loss": 0.69083852, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.76909214, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19995117, + "step": 3516, + "time_per_iteration": 2.535282611846924 + }, + { + "auxiliary_loss_clip": 0.0654862, + "auxiliary_loss_mlp": 0.01279905, + "balance_loss_clip": 0.0630609, + "balance_loss_mlp": 0.01257756, + "epoch": 0.21145347963324815, + "flos": 24579073745280.0, + "grad_norm": 1.818548989117399, + "language_loss": 0.85523438, + "learning_rate": 3.664438796560225e-06, + "loss": 0.93351966, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.22155762, + "step": 3517, + "time_per_iteration": 2.5862202644348145 + }, + { + "auxiliary_loss_clip": 0.06554718, + "auxiliary_loss_mlp": 0.01280908, + "balance_loss_clip": 0.06311698, + "balance_loss_mlp": 0.01260368, + "epoch": 0.21151360288591614, + "flos": 35854787105280.0, + "grad_norm": 2.178791897783965, + "language_loss": 0.6333189, + "learning_rate": 3.664222829354512e-06, + "loss": 0.71167523, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.20532227, + "step": 3518, + "time_per_iteration": 2.6618587970733643 + }, + { + "auxiliary_loss_clip": 0.0654604, + "auxiliary_loss_mlp": 0.0129195, + "balance_loss_clip": 0.06306089, + "balance_loss_mlp": 0.01271625, + "epoch": 0.2115737261385841, + "flos": 24647989328640.0, + "grad_norm": 1.8588369306942552, + "language_loss": 0.90024757, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97862744, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20336914, + "step": 3519, + "time_per_iteration": 2.5962281227111816 + }, + { + "auxiliary_loss_clip": 0.06553498, + "auxiliary_loss_mlp": 0.0129082, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01268945, + "epoch": 0.21163384939125207, + "flos": 25233652241280.0, + "grad_norm": 1.74321759448714, + "language_loss": 0.81933582, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.89777905, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.21862793, + "step": 3520, + "time_per_iteration": 2.6036746501922607 + }, + { + "auxiliary_loss_clip": 0.06544603, + "auxiliary_loss_mlp": 0.0127827, + "balance_loss_clip": 0.0630887, + "balance_loss_mlp": 0.01257576, + "epoch": 0.21169397264392004, + "flos": 26074670071680.0, + "grad_norm": 1.5989262406015683, + "language_loss": 0.76731956, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.84554833, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20690918, + "step": 3521, + "time_per_iteration": 2.613945960998535 + }, + { + "auxiliary_loss_clip": 0.06548078, + "auxiliary_loss_mlp": 0.01281462, + "balance_loss_clip": 0.06310651, + "balance_loss_mlp": 0.01261364, + "epoch": 0.211754095896588, + "flos": 23114266594560.0, + "grad_norm": 2.104686387571933, + "language_loss": 0.75886559, + "learning_rate": 3.663358329538626e-06, + "loss": 0.83716094, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.20092773, + "step": 3522, + "time_per_iteration": 2.530388355255127 + }, + { + "auxiliary_loss_clip": 0.06550008, + "auxiliary_loss_mlp": 0.01276271, + "balance_loss_clip": 0.06309568, + "balance_loss_mlp": 0.01255994, + "epoch": 0.21181421914925597, + "flos": 27928806019200.0, + "grad_norm": 2.55069435165465, + "language_loss": 0.71218652, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79044926, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.20288086, + "step": 3523, + "time_per_iteration": 2.6448264122009277 + }, + { + "auxiliary_loss_clip": 0.06544726, + "auxiliary_loss_mlp": 0.01276969, + "balance_loss_clip": 0.06308427, + "balance_loss_mlp": 0.01256191, + "epoch": 0.21187434240192393, + "flos": 17134313057280.0, + "grad_norm": 2.0846198886990566, + "language_loss": 0.77930927, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8575263, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20788574, + "step": 3524, + "time_per_iteration": 2.527096748352051 + }, + { + "auxiliary_loss_clip": 0.06557429, + "auxiliary_loss_mlp": 0.01277075, + "balance_loss_clip": 0.0631334, + "balance_loss_mlp": 0.01255045, + "epoch": 0.21193446565459192, + "flos": 22354071626880.0, + "grad_norm": 2.138137470282545, + "language_loss": 0.82111794, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89946306, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22033691, + "step": 3525, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.06547971, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 0.06308704, + "balance_loss_mlp": 0.01254519, + "epoch": 0.2119945889072599, + "flos": 27206779386240.0, + "grad_norm": 1.7514877674009408, + "language_loss": 0.75671291, + "learning_rate": 3.662492820527356e-06, + "loss": 0.83494115, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20324707, + "step": 3526, + "time_per_iteration": 2.56286883354187 + }, + { + "auxiliary_loss_clip": 0.06556675, + "auxiliary_loss_mlp": 0.01279028, + "balance_loss_clip": 0.0631361, + "balance_loss_mlp": 0.01258107, + "epoch": 0.21205471215992786, + "flos": 20997480424320.0, + "grad_norm": 1.9989732630407808, + "language_loss": 0.77276337, + "learning_rate": 3.662276285649284e-06, + "loss": 0.85112035, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.20910645, + "step": 3527, + "time_per_iteration": 2.7162973880767822 + }, + { + "auxiliary_loss_clip": 0.06551696, + "auxiliary_loss_mlp": 0.01279873, + "balance_loss_clip": 0.06314081, + "balance_loss_mlp": 0.01258224, + "epoch": 0.21211483541259582, + "flos": 20784025347840.0, + "grad_norm": 2.0427089539116783, + "language_loss": 0.78184944, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86016512, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21643066, + "step": 3528, + "time_per_iteration": 3.990530490875244 + }, + { + "auxiliary_loss_clip": 0.06551792, + "auxiliary_loss_mlp": 0.01277875, + "balance_loss_clip": 0.06313196, + "balance_loss_mlp": 0.01257025, + "epoch": 0.21217495866526379, + "flos": 18996079726080.0, + "grad_norm": 1.942993331862389, + "language_loss": 0.82054245, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89883912, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20861816, + "step": 3529, + "time_per_iteration": 2.564383029937744 + }, + { + "auxiliary_loss_clip": 0.06555474, + "auxiliary_loss_mlp": 0.01278138, + "balance_loss_clip": 0.06313926, + "balance_loss_mlp": 0.01257134, + "epoch": 0.21223508191793175, + "flos": 20673503412480.0, + "grad_norm": 2.2777790477523236, + "language_loss": 0.77694297, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.85527909, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21008301, + "step": 3530, + "time_per_iteration": 2.576662540435791 + }, + { + "auxiliary_loss_clip": 0.06550869, + "auxiliary_loss_mlp": 0.01274157, + "balance_loss_clip": 0.06314521, + "balance_loss_mlp": 0.01254106, + "epoch": 0.21229520517059972, + "flos": 21622904899200.0, + "grad_norm": 2.3150689342230644, + "language_loss": 0.83926791, + "learning_rate": 3.661409515882308e-06, + "loss": 0.91751814, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.20043945, + "step": 3531, + "time_per_iteration": 4.092180252075195 + }, + { + "auxiliary_loss_clip": 0.06553733, + "auxiliary_loss_mlp": 0.01280648, + "balance_loss_clip": 0.06313696, + "balance_loss_mlp": 0.0125888, + "epoch": 0.2123553284232677, + "flos": 13996232997120.0, + "grad_norm": 2.2553338764718145, + "language_loss": 0.74256229, + "learning_rate": 3.661192665917977e-06, + "loss": 0.82090604, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21777344, + "step": 3532, + "time_per_iteration": 2.5215070247650146 + }, + { + "auxiliary_loss_clip": 0.06549011, + "auxiliary_loss_mlp": 0.01276957, + "balance_loss_clip": 0.06309506, + "balance_loss_mlp": 0.01255714, + "epoch": 0.21241545167593567, + "flos": 18302745916800.0, + "grad_norm": 1.8963653738624293, + "language_loss": 0.74378759, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82204729, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21252441, + "step": 3533, + "time_per_iteration": 2.5286645889282227 + }, + { + "auxiliary_loss_clip": 0.06554842, + "auxiliary_loss_mlp": 0.01279741, + "balance_loss_clip": 0.06312128, + "balance_loss_mlp": 0.01257341, + "epoch": 0.21247557492860364, + "flos": 34721461906560.0, + "grad_norm": 1.8118406193913599, + "language_loss": 0.71620667, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79455251, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22399902, + "step": 3534, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.06548804, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.0630706, + "balance_loss_mlp": 0.01262586, + "epoch": 0.2125356981812716, + "flos": 22060254885120.0, + "grad_norm": 2.3502862502903046, + "language_loss": 0.72866982, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.80699402, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21032715, + "step": 3535, + "time_per_iteration": 2.5843448638916016 + }, + { + "auxiliary_loss_clip": 0.06546953, + "auxiliary_loss_mlp": 0.01279722, + "balance_loss_clip": 0.06307133, + "balance_loss_mlp": 0.01257621, + "epoch": 0.21259582143393957, + "flos": 28555865648640.0, + "grad_norm": 2.199655139190772, + "language_loss": 0.70759106, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7858578, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22106934, + "step": 3536, + "time_per_iteration": 4.056318998336792 + }, + { + "auxiliary_loss_clip": 0.06557733, + "auxiliary_loss_mlp": 0.01286072, + "balance_loss_clip": 0.06310252, + "balance_loss_mlp": 0.0126415, + "epoch": 0.21265594468660753, + "flos": 20127140864640.0, + "grad_norm": 2.2134041941920897, + "language_loss": 0.8820163, + "learning_rate": 3.660107471371981e-06, + "loss": 0.96045434, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.21911621, + "step": 3537, + "time_per_iteration": 2.6233468055725098 + }, + { + "auxiliary_loss_clip": 0.06541121, + "auxiliary_loss_mlp": 0.01278147, + "balance_loss_clip": 0.06304413, + "balance_loss_mlp": 0.01256094, + "epoch": 0.21271606793927553, + "flos": 23082890440320.0, + "grad_norm": 1.7848498720134809, + "language_loss": 0.81086004, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88905263, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22058105, + "step": 3538, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.06545715, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 0.06305592, + "balance_loss_mlp": 0.01263981, + "epoch": 0.2127761911919435, + "flos": 26394118963200.0, + "grad_norm": 2.023826748108625, + "language_loss": 0.87817419, + "learning_rate": 3.659672952835863e-06, + "loss": 0.95646858, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.19763184, + "step": 3539, + "time_per_iteration": 2.6115527153015137 + }, + { + "auxiliary_loss_clip": 0.06554011, + "auxiliary_loss_mlp": 0.01284638, + "balance_loss_clip": 0.06309317, + "balance_loss_mlp": 0.01264277, + "epoch": 0.21283631444461146, + "flos": 20234182855680.0, + "grad_norm": 3.1687626880856667, + "language_loss": 0.59144789, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66983438, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.20361328, + "step": 3540, + "time_per_iteration": 2.525139570236206 + }, + { + "auxiliary_loss_clip": 0.06543202, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.0630211, + "balance_loss_mlp": 0.01256557, + "epoch": 0.21289643769727942, + "flos": 13522140195840.0, + "grad_norm": 1.940296770056649, + "language_loss": 0.7721082, + "learning_rate": 3.659238182559888e-06, + "loss": 0.85032547, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21972656, + "step": 3541, + "time_per_iteration": 2.563164234161377 + }, + { + "auxiliary_loss_clip": 0.06542824, + "auxiliary_loss_mlp": 0.01283205, + "balance_loss_clip": 0.06305471, + "balance_loss_mlp": 0.01262486, + "epoch": 0.2129565609499474, + "flos": 24833967465600.0, + "grad_norm": 1.7979798329536472, + "language_loss": 0.69596064, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.77422094, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20703125, + "step": 3542, + "time_per_iteration": 2.6213386058807373 + }, + { + "auxiliary_loss_clip": 0.06542216, + "auxiliary_loss_mlp": 0.01284362, + "balance_loss_clip": 0.0630642, + "balance_loss_mlp": 0.01264692, + "epoch": 0.21301668420261535, + "flos": 23665953876480.0, + "grad_norm": 1.8238030340304547, + "language_loss": 0.77012485, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84839058, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19677734, + "step": 3543, + "time_per_iteration": 2.5654232501983643 + }, + { + "auxiliary_loss_clip": 0.0654586, + "auxiliary_loss_mlp": 0.01282767, + "balance_loss_clip": 0.0630815, + "balance_loss_mlp": 0.01261488, + "epoch": 0.21307680745528332, + "flos": 16368416012160.0, + "grad_norm": 2.0315626098903468, + "language_loss": 0.67305464, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75134087, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2130127, + "step": 3544, + "time_per_iteration": 2.513288736343384 + }, + { + "auxiliary_loss_clip": 0.06542834, + "auxiliary_loss_mlp": 0.01284, + "balance_loss_clip": 0.06304078, + "balance_loss_mlp": 0.01264223, + "epoch": 0.2131369307079513, + "flos": 19105092288000.0, + "grad_norm": 1.7034786511890583, + "language_loss": 0.71322483, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.19763184, + "step": 3545, + "time_per_iteration": 2.5347442626953125 + }, + { + "auxiliary_loss_clip": 0.06549121, + "auxiliary_loss_mlp": 0.01288311, + "balance_loss_clip": 0.06306408, + "balance_loss_mlp": 0.01268224, + "epoch": 0.21319705396061928, + "flos": 30380050961280.0, + "grad_norm": 2.304335172733059, + "language_loss": 0.73178399, + "learning_rate": 3.658150155940946e-06, + "loss": 0.81015837, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.20092773, + "step": 3546, + "time_per_iteration": 2.6647720336914062 + }, + { + "auxiliary_loss_clip": 0.0655164, + "auxiliary_loss_mlp": 0.01278696, + "balance_loss_clip": 0.06310475, + "balance_loss_mlp": 0.01258609, + "epoch": 0.21325717721328724, + "flos": 21761616533760.0, + "grad_norm": 1.9338253687785023, + "language_loss": 0.81206107, + "learning_rate": 3.657932361952479e-06, + "loss": 0.89036447, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20092773, + "step": 3547, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06547703, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 0.06302875, + "balance_loss_mlp": 0.01259127, + "epoch": 0.2133173004659552, + "flos": 28738447695360.0, + "grad_norm": 3.206018032759459, + "language_loss": 0.74960929, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22521973, + "step": 3548, + "time_per_iteration": 2.605151414871216 + }, + { + "auxiliary_loss_clip": 0.06554648, + "auxiliary_loss_mlp": 0.01281207, + "balance_loss_clip": 0.06309359, + "balance_loss_mlp": 0.01259236, + "epoch": 0.21337742371862317, + "flos": 16842760375680.0, + "grad_norm": 2.056331081084102, + "language_loss": 0.74889886, + "learning_rate": 3.657496585376922e-06, + "loss": 0.82725745, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21972656, + "step": 3549, + "time_per_iteration": 2.518305540084839 + }, + { + "auxiliary_loss_clip": 0.06547625, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06306933, + "balance_loss_mlp": 0.01261278, + "epoch": 0.21343754697129114, + "flos": 24431683213440.0, + "grad_norm": 1.7052192349692608, + "language_loss": 0.8095907, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88787764, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19787598, + "step": 3550, + "time_per_iteration": 2.621840715408325 + }, + { + "auxiliary_loss_clip": 0.06544942, + "auxiliary_loss_mlp": 0.01278049, + "balance_loss_clip": 0.06309815, + "balance_loss_mlp": 0.01258653, + "epoch": 0.21349767022395913, + "flos": 19283271995520.0, + "grad_norm": 1.8011583081598594, + "language_loss": 0.88582718, + "learning_rate": 3.657060557391621e-06, + "loss": 0.96405709, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.19384766, + "step": 3551, + "time_per_iteration": 2.5354909896850586 + }, + { + "auxiliary_loss_clip": 0.06541884, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06304973, + "balance_loss_mlp": 0.01256635, + "epoch": 0.2135577934766271, + "flos": 17353260576000.0, + "grad_norm": 1.8291964059748265, + "language_loss": 0.83669794, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91488564, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20275879, + "step": 3552, + "time_per_iteration": 2.5428099632263184 + }, + { + "auxiliary_loss_clip": 0.06543534, + "auxiliary_loss_mlp": 0.01282655, + "balance_loss_clip": 0.06305505, + "balance_loss_mlp": 0.01261329, + "epoch": 0.21361791672929506, + "flos": 24063416519040.0, + "grad_norm": 1.71251087169846, + "language_loss": 0.77181637, + "learning_rate": 3.656624278062713e-06, + "loss": 0.85007823, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.21325684, + "step": 3553, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.06546006, + "auxiliary_loss_mlp": 0.01280965, + "balance_loss_clip": 0.06308904, + "balance_loss_mlp": 0.01260556, + "epoch": 0.21367803998196302, + "flos": 22168596614400.0, + "grad_norm": 1.6386548216082337, + "language_loss": 0.72918522, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.80745488, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20397949, + "step": 3554, + "time_per_iteration": 2.610447883605957 + }, + { + "auxiliary_loss_clip": 0.06543835, + "auxiliary_loss_mlp": 0.01296522, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01276483, + "epoch": 0.213738163234631, + "flos": 20893205617920.0, + "grad_norm": 2.167468133085416, + "language_loss": 0.6838634, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.76226699, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20043945, + "step": 3555, + "time_per_iteration": 2.6348068714141846 + }, + { + "auxiliary_loss_clip": 0.06544648, + "auxiliary_loss_mlp": 0.01283651, + "balance_loss_clip": 0.06303324, + "balance_loss_mlp": 0.01262861, + "epoch": 0.21379828648729896, + "flos": 28410739176960.0, + "grad_norm": 1.8068010568670265, + "language_loss": 0.6581043, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73638725, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.20800781, + "step": 3556, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.06542179, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 0.06305043, + "balance_loss_mlp": 0.01269905, + "epoch": 0.21385840973996692, + "flos": 25486030339200.0, + "grad_norm": 1.6965425102308196, + "language_loss": 0.73263884, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20617676, + "step": 3557, + "time_per_iteration": 2.5850143432617188 + }, + { + "auxiliary_loss_clip": 0.06555384, + "auxiliary_loss_mlp": 0.01282331, + "balance_loss_clip": 0.06310774, + "balance_loss_mlp": 0.01260814, + "epoch": 0.2139185329926349, + "flos": 28081772847360.0, + "grad_norm": 1.6861756161591135, + "language_loss": 0.67894918, + "learning_rate": 3.655532480546528e-06, + "loss": 0.75732636, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.21508789, + "step": 3558, + "time_per_iteration": 2.6937482357025146 + }, + { + "auxiliary_loss_clip": 0.06554736, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06306359, + "balance_loss_mlp": 0.0125905, + "epoch": 0.21397865624530288, + "flos": 19614628166400.0, + "grad_norm": 2.1418574307637575, + "language_loss": 0.81358159, + "learning_rate": 3.655313932676286e-06, + "loss": 0.89191854, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.19909668, + "step": 3559, + "time_per_iteration": 2.5145814418792725 + }, + { + "auxiliary_loss_clip": 0.06551723, + "auxiliary_loss_mlp": 0.01281472, + "balance_loss_clip": 0.06314635, + "balance_loss_mlp": 0.01262899, + "epoch": 0.21403877949797084, + "flos": 24688463650560.0, + "grad_norm": 1.6715073288493136, + "language_loss": 0.68710625, + "learning_rate": 3.655095322036373e-06, + "loss": 0.7654382, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.18554688, + "step": 3560, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06554615, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 0.0631121, + "balance_loss_mlp": 0.01259313, + "epoch": 0.2140989027506388, + "flos": 19866628920960.0, + "grad_norm": 1.9885830979576231, + "language_loss": 0.73618603, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81452787, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.20263672, + "step": 3561, + "time_per_iteration": 2.5286123752593994 + }, + { + "auxiliary_loss_clip": 0.06553814, + "auxiliary_loss_mlp": 0.01282143, + "balance_loss_clip": 0.06311779, + "balance_loss_mlp": 0.01262402, + "epoch": 0.21415902600330677, + "flos": 19141331978880.0, + "grad_norm": 2.350872095274855, + "language_loss": 0.78756285, + "learning_rate": 3.654657912480698e-06, + "loss": 0.86592233, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.19763184, + "step": 3562, + "time_per_iteration": 2.608041286468506 + }, + { + "auxiliary_loss_clip": 0.06546983, + "auxiliary_loss_mlp": 0.01281911, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01261788, + "epoch": 0.21421914925597474, + "flos": 22279076622720.0, + "grad_norm": 1.5018972458321598, + "language_loss": 0.85257983, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.93086874, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20117188, + "step": 3563, + "time_per_iteration": 2.5593912601470947 + }, + { + "auxiliary_loss_clip": 0.06548097, + "auxiliary_loss_mlp": 0.01281509, + "balance_loss_clip": 0.06308593, + "balance_loss_mlp": 0.01262531, + "epoch": 0.2142792725086427, + "flos": 33883504750080.0, + "grad_norm": 1.9248219523503745, + "language_loss": 0.76925778, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.84755385, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.18981934, + "step": 3564, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.06542072, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305549, + "balance_loss_mlp": 0.01261181, + "epoch": 0.2143393957613107, + "flos": 19865538817920.0, + "grad_norm": 1.690691453330226, + "language_loss": 0.89139843, + "learning_rate": 3.654001327581981e-06, + "loss": 0.9696207, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.18969727, + "step": 3565, + "time_per_iteration": 2.660306215286255 + }, + { + "auxiliary_loss_clip": 0.06436334, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 0.06303974, + "balance_loss_mlp": 0.01279924, + "epoch": 0.21439951901397866, + "flos": 68549300017920.0, + "grad_norm": 0.8225285981700966, + "language_loss": 0.52211988, + "learning_rate": 3.653782340498215e-06, + "loss": 0.59934968, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.06738281, + "step": 3566, + "time_per_iteration": 3.0845720767974854 + }, + { + "auxiliary_loss_clip": 0.06539588, + "auxiliary_loss_mlp": 0.01284533, + "balance_loss_clip": 0.06306818, + "balance_loss_mlp": 0.0126478, + "epoch": 0.21445964226664663, + "flos": 19689161973120.0, + "grad_norm": 1.8060006281631265, + "language_loss": 0.68295264, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.76119387, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19775391, + "step": 3567, + "time_per_iteration": 2.5250415802001953 + }, + { + "auxiliary_loss_clip": 0.06543978, + "auxiliary_loss_mlp": 0.01283364, + "balance_loss_clip": 0.06310168, + "balance_loss_mlp": 0.012641, + "epoch": 0.2145197655193146, + "flos": 31116039298560.0, + "grad_norm": 2.0548954423707753, + "language_loss": 0.75150776, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.82978123, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19250488, + "step": 3568, + "time_per_iteration": 4.018412113189697 + }, + { + "auxiliary_loss_clip": 0.06538366, + "auxiliary_loss_mlp": 0.01282205, + "balance_loss_clip": 0.063043, + "balance_loss_mlp": 0.01261773, + "epoch": 0.21457988877198256, + "flos": 20127015083520.0, + "grad_norm": 2.3975687399079284, + "language_loss": 0.78487438, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.86308008, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20446777, + "step": 3569, + "time_per_iteration": 2.6051042079925537 + }, + { + "auxiliary_loss_clip": 0.06554128, + "auxiliary_loss_mlp": 0.01283223, + "balance_loss_clip": 0.06309038, + "balance_loss_mlp": 0.01262183, + "epoch": 0.21464001202465052, + "flos": 18593963182080.0, + "grad_norm": 2.5916710851503173, + "language_loss": 0.7048617, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.78323519, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.21032715, + "step": 3570, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.06548594, + "auxiliary_loss_mlp": 0.01293921, + "balance_loss_clip": 0.06305287, + "balance_loss_mlp": 0.01274621, + "epoch": 0.21470013527731852, + "flos": 21841600855680.0, + "grad_norm": 3.519297534980699, + "language_loss": 0.79412138, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.87254649, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.19311523, + "step": 3571, + "time_per_iteration": 3.984830141067505 + }, + { + "auxiliary_loss_clip": 0.06547887, + "auxiliary_loss_mlp": 0.01283536, + "balance_loss_clip": 0.06306981, + "balance_loss_mlp": 0.01263413, + "epoch": 0.21476025852998648, + "flos": 17608992837120.0, + "grad_norm": 2.1137138833129114, + "language_loss": 0.83417559, + "learning_rate": 3.652467101342991e-06, + "loss": 0.91248989, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.20117188, + "step": 3572, + "time_per_iteration": 2.550900459289551 + }, + { + "auxiliary_loss_clip": 0.06544446, + "auxiliary_loss_mlp": 0.01290796, + "balance_loss_clip": 0.06300403, + "balance_loss_mlp": 0.01271114, + "epoch": 0.21482038178265445, + "flos": 24835267203840.0, + "grad_norm": 5.91831897424108, + "language_loss": 0.6534397, + "learning_rate": 3.652247675452598e-06, + "loss": 0.73179209, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.19677734, + "step": 3573, + "time_per_iteration": 2.574037551879883 + }, + { + "auxiliary_loss_clip": 0.06536618, + "auxiliary_loss_mlp": 0.01287357, + "balance_loss_clip": 0.06305118, + "balance_loss_mlp": 0.0126814, + "epoch": 0.2148805050353224, + "flos": 23264927435520.0, + "grad_norm": 1.8228372560216166, + "language_loss": 0.76129293, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83953267, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.1920166, + "step": 3574, + "time_per_iteration": 2.610541343688965 + }, + { + "auxiliary_loss_clip": 0.06537417, + "auxiliary_loss_mlp": 0.01280783, + "balance_loss_clip": 0.06298707, + "balance_loss_mlp": 0.0126066, + "epoch": 0.21494062828799038, + "flos": 21326907951360.0, + "grad_norm": 2.0935140233911644, + "language_loss": 0.72909325, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.8072753, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.20117188, + "step": 3575, + "time_per_iteration": 2.581932306289673 + }, + { + "auxiliary_loss_clip": 0.06537387, + "auxiliary_loss_mlp": 0.01288909, + "balance_loss_clip": 0.06302074, + "balance_loss_mlp": 0.01269657, + "epoch": 0.21500075154065834, + "flos": 18849276172800.0, + "grad_norm": 2.2103119968131986, + "language_loss": 0.6923548, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.77061772, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.19262695, + "step": 3576, + "time_per_iteration": 5.394233703613281 + }, + { + "auxiliary_loss_clip": 0.06547244, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 0.06304461, + "balance_loss_mlp": 0.0126069, + "epoch": 0.2150608747933263, + "flos": 18447872388480.0, + "grad_norm": 1.9274083971527407, + "language_loss": 0.89371777, + "learning_rate": 3.651369345440292e-06, + "loss": 0.97201031, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.21337891, + "step": 3577, + "time_per_iteration": 2.5629777908325195 + }, + { + "auxiliary_loss_clip": 0.06425267, + "auxiliary_loss_mlp": 0.01303124, + "balance_loss_clip": 0.06298774, + "balance_loss_mlp": 0.01297548, + "epoch": 0.2151209980459943, + "flos": 66617443808640.0, + "grad_norm": 0.7978427219987446, + "language_loss": 0.56304139, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.64032531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.05581665, + "step": 3578, + "time_per_iteration": 3.0982370376586914 + }, + { + "auxiliary_loss_clip": 0.06546376, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 0.06309081, + "balance_loss_mlp": 0.0126729, + "epoch": 0.21518112129866226, + "flos": 21581633963520.0, + "grad_norm": 1.7619248126111737, + "language_loss": 0.89097106, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.96930325, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.19555664, + "step": 3579, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.06544919, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06304899, + "balance_loss_mlp": 0.01260498, + "epoch": 0.21524124455133023, + "flos": 20053822942080.0, + "grad_norm": 1.8548300822509616, + "language_loss": 0.78671825, + "learning_rate": 3.650709940390972e-06, + "loss": 0.86497748, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.20507812, + "step": 3580, + "time_per_iteration": 2.538740634918213 + }, + { + "auxiliary_loss_clip": 0.06547832, + "auxiliary_loss_mlp": 0.01284221, + "balance_loss_clip": 0.06311843, + "balance_loss_mlp": 0.01265279, + "epoch": 0.2153013678039982, + "flos": 23958680515200.0, + "grad_norm": 2.0040984242528905, + "language_loss": 0.73520374, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.81352425, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.18933105, + "step": 3581, + "time_per_iteration": 2.5783493518829346 + }, + { + "auxiliary_loss_clip": 0.06544261, + "auxiliary_loss_mlp": 0.01283002, + "balance_loss_clip": 0.06307264, + "balance_loss_mlp": 0.01262438, + "epoch": 0.21536149105666616, + "flos": 20601107884800.0, + "grad_norm": 2.9043222851567574, + "language_loss": 0.71477044, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.79304302, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20556641, + "step": 3582, + "time_per_iteration": 2.5253281593322754 + }, + { + "auxiliary_loss_clip": 0.06553562, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06315581, + "balance_loss_mlp": 0.01262209, + "epoch": 0.21542161430933413, + "flos": 12865046077440.0, + "grad_norm": 2.5916269023447795, + "language_loss": 0.85900396, + "learning_rate": 3.650049971985889e-06, + "loss": 0.93736756, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.20568848, + "step": 3583, + "time_per_iteration": 2.580411434173584 + }, + { + "auxiliary_loss_clip": 0.0655268, + "auxiliary_loss_mlp": 0.01295505, + "balance_loss_clip": 0.06312086, + "balance_loss_mlp": 0.01275561, + "epoch": 0.21548173756200212, + "flos": 26111077470720.0, + "grad_norm": 2.720923149453336, + "language_loss": 0.83510441, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.91358626, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.19934082, + "step": 3584, + "time_per_iteration": 2.587843179702759 + }, + { + "auxiliary_loss_clip": 0.06549002, + "auxiliary_loss_mlp": 0.01288111, + "balance_loss_clip": 0.06314336, + "balance_loss_mlp": 0.01267667, + "epoch": 0.21554186081467008, + "flos": 22170315623040.0, + "grad_norm": 2.7712372256622357, + "language_loss": 0.91010725, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9884783, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20446777, + "step": 3585, + "time_per_iteration": 2.5638017654418945 + }, + { + "auxiliary_loss_clip": 0.06548285, + "auxiliary_loss_mlp": 0.0129374, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.012745, + "epoch": 0.21560198406733805, + "flos": 22973458608000.0, + "grad_norm": 2.0799258962001548, + "language_loss": 0.75285476, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83127499, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.19238281, + "step": 3586, + "time_per_iteration": 2.5816385746002197 + }, + { + "auxiliary_loss_clip": 0.06560329, + "auxiliary_loss_mlp": 0.01301548, + "balance_loss_clip": 0.06317623, + "balance_loss_mlp": 0.012817, + "epoch": 0.215662107320006, + "flos": 22790708853120.0, + "grad_norm": 1.7819627104594034, + "language_loss": 0.83628035, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.91489911, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.19848633, + "step": 3587, + "time_per_iteration": 2.5768468379974365 + }, + { + "auxiliary_loss_clip": 0.06549525, + "auxiliary_loss_mlp": 0.01284104, + "balance_loss_clip": 0.06311873, + "balance_loss_mlp": 0.01265114, + "epoch": 0.21572223057267398, + "flos": 30891850899840.0, + "grad_norm": 2.819752743062096, + "language_loss": 0.764575, + "learning_rate": 3.648948773354224e-06, + "loss": 0.8429113, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.18981934, + "step": 3588, + "time_per_iteration": 2.6578357219696045 + }, + { + "auxiliary_loss_clip": 0.06557232, + "auxiliary_loss_mlp": 0.01294163, + "balance_loss_clip": 0.06316121, + "balance_loss_mlp": 0.01274494, + "epoch": 0.21578235382534194, + "flos": 26918413159680.0, + "grad_norm": 3.674353356251158, + "language_loss": 0.8181411, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89665502, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.19689941, + "step": 3589, + "time_per_iteration": 2.6730964183807373 + }, + { + "auxiliary_loss_clip": 0.06560542, + "auxiliary_loss_mlp": 0.01287343, + "balance_loss_clip": 0.06321919, + "balance_loss_mlp": 0.01267959, + "epoch": 0.2158424770780099, + "flos": 24432605608320.0, + "grad_norm": 2.119721317496626, + "language_loss": 0.73323047, + "learning_rate": 3.648507856144961e-06, + "loss": 0.81170928, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.19384766, + "step": 3590, + "time_per_iteration": 2.5885848999023438 + }, + { + "auxiliary_loss_clip": 0.06554762, + "auxiliary_loss_mlp": 0.0128494, + "balance_loss_clip": 0.06310897, + "balance_loss_mlp": 0.01264401, + "epoch": 0.2159026003306779, + "flos": 23956542236160.0, + "grad_norm": 2.0666561712978813, + "language_loss": 0.84929311, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92769015, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.20544434, + "step": 3591, + "time_per_iteration": 2.5598154067993164 + }, + { + "auxiliary_loss_clip": 0.0656037, + "auxiliary_loss_mlp": 0.01294269, + "balance_loss_clip": 0.06315921, + "balance_loss_mlp": 0.01272776, + "epoch": 0.21596272358334587, + "flos": 30048191665920.0, + "grad_norm": 1.8943006547331833, + "language_loss": 0.69118065, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.76972699, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.21496582, + "step": 3592, + "time_per_iteration": 2.623124599456787 + }, + { + "auxiliary_loss_clip": 0.06558264, + "auxiliary_loss_mlp": 0.01284651, + "balance_loss_clip": 0.06314576, + "balance_loss_mlp": 0.01264218, + "epoch": 0.21602284683601383, + "flos": 20382495782400.0, + "grad_norm": 3.2836833125469753, + "language_loss": 0.84947151, + "learning_rate": 3.647846011515108e-06, + "loss": 0.92790061, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2043457, + "step": 3593, + "time_per_iteration": 2.5159051418304443 + }, + { + "auxiliary_loss_clip": 0.06551524, + "auxiliary_loss_mlp": 0.01289729, + "balance_loss_clip": 0.06309479, + "balance_loss_mlp": 0.01267615, + "epoch": 0.2160829700886818, + "flos": 20783648004480.0, + "grad_norm": 2.6962087820066567, + "language_loss": 0.76424301, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.84265554, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.22119141, + "step": 3594, + "time_per_iteration": 2.530874490737915 + }, + { + "auxiliary_loss_clip": 0.06549954, + "auxiliary_loss_mlp": 0.01280574, + "balance_loss_clip": 0.06313863, + "balance_loss_mlp": 0.01260189, + "epoch": 0.21614309334134976, + "flos": 22316322562560.0, + "grad_norm": 1.5622924015328905, + "language_loss": 0.80828846, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.88659382, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20385742, + "step": 3595, + "time_per_iteration": 2.5720436573028564 + }, + { + "auxiliary_loss_clip": 0.0655812, + "auxiliary_loss_mlp": 0.01282788, + "balance_loss_clip": 0.06310599, + "balance_loss_mlp": 0.01261962, + "epoch": 0.21620321659401773, + "flos": 19615592488320.0, + "grad_norm": 2.071968351759389, + "language_loss": 0.79120421, + "learning_rate": 3.647183604506897e-06, + "loss": 0.86961329, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.20825195, + "step": 3596, + "time_per_iteration": 2.529978036880493 + }, + { + "auxiliary_loss_clip": 0.06547653, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 0.06309111, + "balance_loss_mlp": 0.01258615, + "epoch": 0.2162633398466857, + "flos": 18850701692160.0, + "grad_norm": 1.8098333997433065, + "language_loss": 0.83728772, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.91556245, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.2121582, + "step": 3597, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.06559294, + "auxiliary_loss_mlp": 0.01284022, + "balance_loss_clip": 0.06315802, + "balance_loss_mlp": 0.01262421, + "epoch": 0.21632346309935369, + "flos": 18774490803840.0, + "grad_norm": 2.0845397374343655, + "language_loss": 0.81213892, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.89057213, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.21606445, + "step": 3598, + "time_per_iteration": 2.517596960067749 + }, + { + "auxiliary_loss_clip": 0.06554621, + "auxiliary_loss_mlp": 0.01287936, + "balance_loss_clip": 0.06312433, + "balance_loss_mlp": 0.01265072, + "epoch": 0.21638358635202165, + "flos": 26331576289920.0, + "grad_norm": 1.6266226591192001, + "language_loss": 0.82318664, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.90161228, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22851562, + "step": 3599, + "time_per_iteration": 2.567528486251831 + }, + { + "auxiliary_loss_clip": 0.06553015, + "auxiliary_loss_mlp": 0.01284743, + "balance_loss_clip": 0.06314674, + "balance_loss_mlp": 0.01263107, + "epoch": 0.21644370960468962, + "flos": 20747156751360.0, + "grad_norm": 2.0891036476830585, + "language_loss": 0.76652539, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.84490293, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21618652, + "step": 3600, + "time_per_iteration": 2.5642178058624268 + }, + { + "auxiliary_loss_clip": 0.06555548, + "auxiliary_loss_mlp": 0.01287253, + "balance_loss_clip": 0.06316924, + "balance_loss_mlp": 0.01267512, + "epoch": 0.21650383285735758, + "flos": 23959183639680.0, + "grad_norm": 1.8375873098897355, + "language_loss": 0.80812716, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.88655519, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.19726562, + "step": 3601, + "time_per_iteration": 2.536790132522583 + }, + { + "auxiliary_loss_clip": 0.06558496, + "auxiliary_loss_mlp": 0.01286287, + "balance_loss_clip": 0.06317312, + "balance_loss_mlp": 0.01265783, + "epoch": 0.21656395611002555, + "flos": 23702864400000.0, + "grad_norm": 1.8593805820505158, + "language_loss": 0.84205902, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2052002, + "step": 3602, + "time_per_iteration": 2.5919816493988037 + }, + { + "auxiliary_loss_clip": 0.06553967, + "auxiliary_loss_mlp": 0.01285958, + "balance_loss_clip": 0.06313825, + "balance_loss_mlp": 0.01265371, + "epoch": 0.2166240793626935, + "flos": 20672035966080.0, + "grad_norm": 1.6537912100509087, + "language_loss": 0.75107038, + "learning_rate": 3.645635802397693e-06, + "loss": 0.82946962, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.20581055, + "step": 3603, + "time_per_iteration": 2.5602827072143555 + }, + { + "auxiliary_loss_clip": 0.06545025, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06314509, + "balance_loss_mlp": 0.0125996, + "epoch": 0.2166842026153615, + "flos": 21586916770560.0, + "grad_norm": 1.9607230977514314, + "language_loss": 0.75016356, + "learning_rate": 3.645414438132855e-06, + "loss": 0.82841063, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.1973877, + "step": 3604, + "time_per_iteration": 2.7099287509918213 + }, + { + "auxiliary_loss_clip": 0.06550605, + "auxiliary_loss_mlp": 0.01283396, + "balance_loss_clip": 0.06315283, + "balance_loss_mlp": 0.01263881, + "epoch": 0.21674432586802947, + "flos": 25637068523520.0, + "grad_norm": 1.5948705207891358, + "language_loss": 0.80732697, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.88566697, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19506836, + "step": 3605, + "time_per_iteration": 2.601269483566284 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01314575, + "balance_loss_clip": 0.0633797, + "balance_loss_mlp": 0.01307596, + "epoch": 0.21680444912069743, + "flos": 56435126376960.0, + "grad_norm": 0.68181157035555, + "language_loss": 0.58316016, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.66095698, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.06994629, + "step": 3606, + "time_per_iteration": 3.2531886100769043 + }, + { + "auxiliary_loss_clip": 0.06547002, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 0.06303971, + "balance_loss_mlp": 0.01264502, + "epoch": 0.2168645723733654, + "flos": 23885823790080.0, + "grad_norm": 1.8693102201830953, + "language_loss": 0.73682618, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22155762, + "step": 3607, + "time_per_iteration": 4.0285868644714355 + }, + { + "auxiliary_loss_clip": 0.06548688, + "auxiliary_loss_mlp": 0.01281672, + "balance_loss_clip": 0.06306184, + "balance_loss_mlp": 0.01259595, + "epoch": 0.21692469562603336, + "flos": 16951814864640.0, + "grad_norm": 1.845726065350227, + "language_loss": 0.78116572, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85946935, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22070312, + "step": 3608, + "time_per_iteration": 2.4997665882110596 + }, + { + "auxiliary_loss_clip": 0.06549841, + "auxiliary_loss_mlp": 0.01279583, + "balance_loss_clip": 0.06307275, + "balance_loss_mlp": 0.01260248, + "epoch": 0.21698481887870133, + "flos": 25126065198720.0, + "grad_norm": 2.052249511327834, + "language_loss": 0.74638152, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.82467568, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.19335938, + "step": 3609, + "time_per_iteration": 2.5834193229675293 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 0.06301089, + "balance_loss_mlp": 0.01258221, + "epoch": 0.2170449421313693, + "flos": 17900461664640.0, + "grad_norm": 2.066668805909691, + "language_loss": 0.8888129, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96701467, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21374512, + "step": 3610, + "time_per_iteration": 3.978980302810669 + }, + { + "auxiliary_loss_clip": 0.06540407, + "auxiliary_loss_mlp": 0.01284961, + "balance_loss_clip": 0.06302356, + "balance_loss_mlp": 0.01264457, + "epoch": 0.2171050653840373, + "flos": 22645121184000.0, + "grad_norm": 2.4524698956279978, + "language_loss": 0.78034103, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.85859472, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20507812, + "step": 3611, + "time_per_iteration": 2.537783622741699 + }, + { + "auxiliary_loss_clip": 0.06539893, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06301216, + "balance_loss_mlp": 0.0125619, + "epoch": 0.21716518863670525, + "flos": 19506034874880.0, + "grad_norm": 1.9372172398113192, + "language_loss": 0.63866782, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71684164, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21313477, + "step": 3612, + "time_per_iteration": 2.5200283527374268 + }, + { + "auxiliary_loss_clip": 0.06543254, + "auxiliary_loss_mlp": 0.01280194, + "balance_loss_clip": 0.06303414, + "balance_loss_mlp": 0.01259761, + "epoch": 0.21722531188937322, + "flos": 19798132608000.0, + "grad_norm": 1.7866878621114652, + "language_loss": 0.76463711, + "learning_rate": 3.643419353014776e-06, + "loss": 0.84287155, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2043457, + "step": 3613, + "time_per_iteration": 2.536395311355591 + }, + { + "auxiliary_loss_clip": 0.06540725, + "auxiliary_loss_mlp": 0.01277778, + "balance_loss_clip": 0.06303174, + "balance_loss_mlp": 0.01256165, + "epoch": 0.21728543514204118, + "flos": 13339474295040.0, + "grad_norm": 1.8023674067133515, + "language_loss": 0.72213733, + "learning_rate": 3.643197365185261e-06, + "loss": 0.80032235, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.21582031, + "step": 3614, + "time_per_iteration": 2.5000360012054443 + }, + { + "auxiliary_loss_clip": 0.06542929, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 0.06304483, + "balance_loss_mlp": 0.01256973, + "epoch": 0.21734555839470915, + "flos": 15237312946560.0, + "grad_norm": 2.7303590898197463, + "language_loss": 0.73928845, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81749594, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.20849609, + "step": 3615, + "time_per_iteration": 3.924616813659668 + }, + { + "auxiliary_loss_clip": 0.0654763, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_clip": 0.06301322, + "balance_loss_mlp": 0.0125694, + "epoch": 0.2174056816473771, + "flos": 19980043822080.0, + "grad_norm": 2.1391350951981467, + "language_loss": 0.913239, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.99150848, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.22387695, + "step": 3616, + "time_per_iteration": 3.9379403591156006 + }, + { + "auxiliary_loss_clip": 0.06540038, + "auxiliary_loss_mlp": 0.01284656, + "balance_loss_clip": 0.06298746, + "balance_loss_mlp": 0.01263163, + "epoch": 0.21746580490004508, + "flos": 16692309169920.0, + "grad_norm": 2.057861674488091, + "language_loss": 0.81572813, + "learning_rate": 3.642531027869148e-06, + "loss": 0.89397502, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21496582, + "step": 3617, + "time_per_iteration": 2.5517330169677734 + }, + { + "auxiliary_loss_clip": 0.06543958, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06300673, + "balance_loss_mlp": 0.01258881, + "epoch": 0.21752592815271307, + "flos": 25778840832000.0, + "grad_norm": 1.7475820668036919, + "language_loss": 0.76030993, + "learning_rate": 3.642308790849329e-06, + "loss": 0.83855915, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2208252, + "step": 3618, + "time_per_iteration": 2.5874650478363037 + }, + { + "auxiliary_loss_clip": 0.06542084, + "auxiliary_loss_mlp": 0.01277743, + "balance_loss_clip": 0.06299525, + "balance_loss_mlp": 0.01255928, + "epoch": 0.21758605140538104, + "flos": 11259430940160.0, + "grad_norm": 1.9309868599682727, + "language_loss": 0.69592559, + "learning_rate": 3.642086491552996e-06, + "loss": 0.77412391, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.21826172, + "step": 3619, + "time_per_iteration": 2.5259079933166504 + }, + { + "auxiliary_loss_clip": 0.06549741, + "auxiliary_loss_mlp": 0.01287424, + "balance_loss_clip": 0.06307657, + "balance_loss_mlp": 0.01264906, + "epoch": 0.217646174658049, + "flos": 19248290115840.0, + "grad_norm": 1.6696593228851853, + "language_loss": 0.78744078, + "learning_rate": 3.641864129988579e-06, + "loss": 0.86581242, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.22521973, + "step": 3620, + "time_per_iteration": 2.5225844383239746 + }, + { + "auxiliary_loss_clip": 0.06542689, + "auxiliary_loss_mlp": 0.01283495, + "balance_loss_clip": 0.06306273, + "balance_loss_mlp": 0.01263349, + "epoch": 0.21770629791071697, + "flos": 21951619666560.0, + "grad_norm": 1.6751510482296663, + "language_loss": 0.80184436, + "learning_rate": 3.641641706164509e-06, + "loss": 0.88010621, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20141602, + "step": 3621, + "time_per_iteration": 2.5528457164764404 + }, + { + "auxiliary_loss_clip": 0.0654473, + "auxiliary_loss_mlp": 0.01278712, + "balance_loss_clip": 0.06305254, + "balance_loss_mlp": 0.012594, + "epoch": 0.21776642116338493, + "flos": 24943776641280.0, + "grad_norm": 1.5217586163816694, + "language_loss": 0.87951142, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95774585, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.19299316, + "step": 3622, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.06559718, + "auxiliary_loss_mlp": 0.01277107, + "balance_loss_clip": 0.06313318, + "balance_loss_mlp": 0.01254445, + "epoch": 0.2178265444160529, + "flos": 17827017960960.0, + "grad_norm": 3.34018590012949, + "language_loss": 0.77879506, + "learning_rate": 3.641196671771152e-06, + "loss": 0.85716331, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22668457, + "step": 3623, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.0655373, + "auxiliary_loss_mlp": 0.01283267, + "balance_loss_clip": 0.06310436, + "balance_loss_mlp": 0.0126132, + "epoch": 0.2178866676687209, + "flos": 17718760085760.0, + "grad_norm": 2.118806527220675, + "language_loss": 0.85078007, + "learning_rate": 3.640974061218741e-06, + "loss": 0.92914999, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.21936035, + "step": 3624, + "time_per_iteration": 2.4991443157196045 + }, + { + "auxiliary_loss_clip": 0.06544428, + "auxiliary_loss_mlp": 0.01281962, + "balance_loss_clip": 0.06301346, + "balance_loss_mlp": 0.01259014, + "epoch": 0.21794679092138886, + "flos": 16951437521280.0, + "grad_norm": 2.3785715622769357, + "language_loss": 0.7814458, + "learning_rate": 3.640751388440429e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.22961426, + "step": 3625, + "time_per_iteration": 2.5113301277160645 + }, + { + "auxiliary_loss_clip": 0.06435797, + "auxiliary_loss_mlp": 0.01281105, + "balance_loss_clip": 0.0630773, + "balance_loss_mlp": 0.01275631, + "epoch": 0.21800691417405682, + "flos": 63737737413120.0, + "grad_norm": 0.7732492376258139, + "language_loss": 0.60674119, + "learning_rate": 3.64052865344466e-06, + "loss": 0.68391013, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.05477905, + "step": 3626, + "time_per_iteration": 3.230576992034912 + }, + { + "auxiliary_loss_clip": 0.06551459, + "auxiliary_loss_mlp": 0.01275255, + "balance_loss_clip": 0.06306285, + "balance_loss_mlp": 0.01252271, + "epoch": 0.21806703742672479, + "flos": 21622821045120.0, + "grad_norm": 2.0426080259896664, + "language_loss": 0.91217983, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.99044704, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.22961426, + "step": 3627, + "time_per_iteration": 2.571704149246216 + }, + { + "auxiliary_loss_clip": 0.06549745, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06307864, + "balance_loss_mlp": 0.01257313, + "epoch": 0.21812716067939275, + "flos": 19361034184320.0, + "grad_norm": 1.8240036323551578, + "language_loss": 0.74830574, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82659948, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.2232666, + "step": 3628, + "time_per_iteration": 2.5547990798950195 + }, + { + "auxiliary_loss_clip": 0.06543273, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06304347, + "balance_loss_mlp": 0.01257039, + "epoch": 0.21818728393206072, + "flos": 23554467619200.0, + "grad_norm": 1.7805187473711719, + "language_loss": 0.77940357, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.85763204, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2253418, + "step": 3629, + "time_per_iteration": 2.5777294635772705 + }, + { + "auxiliary_loss_clip": 0.06540327, + "auxiliary_loss_mlp": 0.01278528, + "balance_loss_clip": 0.06302765, + "balance_loss_mlp": 0.01257822, + "epoch": 0.21824740718472868, + "flos": 30233289335040.0, + "grad_norm": 1.6105707802077895, + "language_loss": 0.72294879, + "learning_rate": 3.63963709145597e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20703125, + "step": 3630, + "time_per_iteration": 2.6015560626983643 + }, + { + "auxiliary_loss_clip": 0.06535304, + "auxiliary_loss_mlp": 0.01279689, + "balance_loss_clip": 0.06303381, + "balance_loss_mlp": 0.01259364, + "epoch": 0.21830753043739667, + "flos": 26140860397440.0, + "grad_norm": 1.9295675894773927, + "language_loss": 0.77031553, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8484655, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.203125, + "step": 3631, + "time_per_iteration": 2.5712599754333496 + }, + { + "auxiliary_loss_clip": 0.06546577, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 0.06304416, + "balance_loss_mlp": 0.01265274, + "epoch": 0.21836765369006464, + "flos": 21726299237760.0, + "grad_norm": 24.58992261392957, + "language_loss": 0.76358086, + "learning_rate": 3.639190937376594e-06, + "loss": 0.84191024, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.21081543, + "step": 3632, + "time_per_iteration": 2.5312108993530273 + }, + { + "auxiliary_loss_clip": 0.06541382, + "auxiliary_loss_mlp": 0.01277975, + "balance_loss_clip": 0.06306228, + "balance_loss_mlp": 0.01258008, + "epoch": 0.2184277769427326, + "flos": 19943678350080.0, + "grad_norm": 2.014902514553352, + "language_loss": 0.8455261, + "learning_rate": 3.638967767095249e-06, + "loss": 0.9237197, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.19958496, + "step": 3633, + "time_per_iteration": 2.5392541885375977 + }, + { + "auxiliary_loss_clip": 0.06536385, + "auxiliary_loss_mlp": 0.01279679, + "balance_loss_clip": 0.06300621, + "balance_loss_mlp": 0.0125821, + "epoch": 0.21848790019540057, + "flos": 20346591507840.0, + "grad_norm": 2.269088705731375, + "language_loss": 0.82069844, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.89885902, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.21484375, + "step": 3634, + "time_per_iteration": 2.5536303520202637 + }, + { + "auxiliary_loss_clip": 0.06544928, + "auxiliary_loss_mlp": 0.01275115, + "balance_loss_clip": 0.063034, + "balance_loss_mlp": 0.01254063, + "epoch": 0.21854802344806853, + "flos": 15456302392320.0, + "grad_norm": 2.1744892406337133, + "language_loss": 0.75276726, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83096772, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.21044922, + "step": 3635, + "time_per_iteration": 2.5158851146698 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 0.06301719, + "balance_loss_mlp": 0.01259018, + "epoch": 0.2186081467007365, + "flos": 16325384140800.0, + "grad_norm": 1.9753193728837781, + "language_loss": 0.88470638, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.96285218, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.19836426, + "step": 3636, + "time_per_iteration": 2.5056772232055664 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01285934, + "balance_loss_clip": 0.06300446, + "balance_loss_mlp": 0.01264798, + "epoch": 0.2186682699534045, + "flos": 21695677770240.0, + "grad_norm": 1.933426681732421, + "language_loss": 0.76219505, + "learning_rate": 3.638074464556311e-06, + "loss": 0.84042412, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21118164, + "step": 3637, + "time_per_iteration": 2.5159406661987305 + }, + { + "auxiliary_loss_clip": 0.06547473, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.06303671, + "balance_loss_mlp": 0.0125726, + "epoch": 0.21872839320607246, + "flos": 17743427913600.0, + "grad_norm": 3.0066644559057867, + "language_loss": 0.90341294, + "learning_rate": 3.63785098361053e-06, + "loss": 0.98168921, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.22888184, + "step": 3638, + "time_per_iteration": 2.475271224975586 + }, + { + "auxiliary_loss_clip": 0.06535378, + "auxiliary_loss_mlp": 0.01286586, + "balance_loss_clip": 0.06297417, + "balance_loss_mlp": 0.01264318, + "epoch": 0.21878851645874042, + "flos": 18656757417600.0, + "grad_norm": 3.417327747399998, + "language_loss": 0.90034223, + "learning_rate": 3.637627440557275e-06, + "loss": 0.97856188, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22265625, + "step": 3639, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.06531254, + "auxiliary_loss_mlp": 0.01281993, + "balance_loss_clip": 0.06296734, + "balance_loss_mlp": 0.01262264, + "epoch": 0.2188486397114084, + "flos": 25564463360640.0, + "grad_norm": 1.6695470201966474, + "language_loss": 0.7997371, + "learning_rate": 3.637403835405024e-06, + "loss": 0.87786961, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.19726562, + "step": 3640, + "time_per_iteration": 2.5905494689941406 + }, + { + "auxiliary_loss_clip": 0.06541579, + "auxiliary_loss_mlp": 0.01284166, + "balance_loss_clip": 0.06302525, + "balance_loss_mlp": 0.01260074, + "epoch": 0.21890876296407635, + "flos": 17897400990720.0, + "grad_norm": 8.732271245188107, + "language_loss": 0.72940969, + "learning_rate": 3.637180168162255e-06, + "loss": 0.80766714, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.24084473, + "step": 3641, + "time_per_iteration": 2.5452075004577637 + }, + { + "auxiliary_loss_clip": 0.06541288, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.06304857, + "balance_loss_mlp": 0.01259619, + "epoch": 0.21896888621674432, + "flos": 17754915922560.0, + "grad_norm": 1.8801395061290727, + "language_loss": 0.81693721, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89515489, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20874023, + "step": 3642, + "time_per_iteration": 2.5234179496765137 + }, + { + "auxiliary_loss_clip": 0.06550857, + "auxiliary_loss_mlp": 0.01284985, + "balance_loss_clip": 0.06311135, + "balance_loss_mlp": 0.01262204, + "epoch": 0.21902900946941228, + "flos": 23082890440320.0, + "grad_norm": 1.5963488152753738, + "language_loss": 0.71952182, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.79788017, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2277832, + "step": 3643, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.06535246, + "auxiliary_loss_mlp": 0.01285725, + "balance_loss_clip": 0.06298445, + "balance_loss_mlp": 0.01264506, + "epoch": 0.21908913272208028, + "flos": 48189501492480.0, + "grad_norm": 1.9271022520918928, + "language_loss": 0.69055694, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.76876664, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.21228027, + "step": 3644, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.06548485, + "auxiliary_loss_mlp": 0.01283418, + "balance_loss_clip": 0.06302129, + "balance_loss_mlp": 0.01261531, + "epoch": 0.21914925597474824, + "flos": 22243298129280.0, + "grad_norm": 2.4423330778710937, + "language_loss": 0.78728521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.86560422, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.21911621, + "step": 3645, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06531754, + "auxiliary_loss_mlp": 0.01275201, + "balance_loss_clip": 0.06300971, + "balance_loss_mlp": 0.01254936, + "epoch": 0.2192093792274162, + "flos": 22131853799040.0, + "grad_norm": 1.5020846701532837, + "language_loss": 0.82847381, + "learning_rate": 3.636060900887582e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20263672, + "step": 3646, + "time_per_iteration": 2.569216012954712 + }, + { + "auxiliary_loss_clip": 0.06536786, + "auxiliary_loss_mlp": 0.01283667, + "balance_loss_clip": 0.06302559, + "balance_loss_mlp": 0.01263449, + "epoch": 0.21926950248008417, + "flos": 15674914494720.0, + "grad_norm": 1.6949719683005162, + "language_loss": 0.83080441, + "learning_rate": 3.635836861279901e-06, + "loss": 0.90900892, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20227051, + "step": 3647, + "time_per_iteration": 3.9349160194396973 + }, + { + "auxiliary_loss_clip": 0.06534994, + "auxiliary_loss_mlp": 0.01281644, + "balance_loss_clip": 0.06301765, + "balance_loss_mlp": 0.01261105, + "epoch": 0.21932962573275214, + "flos": 30270199858560.0, + "grad_norm": 1.587891801710132, + "language_loss": 0.7257458, + "learning_rate": 3.635612759641123e-06, + "loss": 0.80391216, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20532227, + "step": 3648, + "time_per_iteration": 2.6465656757354736 + }, + { + "auxiliary_loss_clip": 0.06545104, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 0.06304809, + "balance_loss_mlp": 0.01263434, + "epoch": 0.2193897489854201, + "flos": 10784751160320.0, + "grad_norm": 3.088861131276654, + "language_loss": 0.74724281, + "learning_rate": 3.635388595979745e-06, + "loss": 0.8255477, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.21960449, + "step": 3649, + "time_per_iteration": 2.510040283203125 + }, + { + "auxiliary_loss_clip": 0.06531087, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 0.06299826, + "balance_loss_mlp": 0.01274752, + "epoch": 0.21944987223808807, + "flos": 19138984064640.0, + "grad_norm": 4.303407628828735, + "language_loss": 0.86915123, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94741207, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20251465, + "step": 3650, + "time_per_iteration": 3.93752384185791 + }, + { + "auxiliary_loss_clip": 0.06543732, + "auxiliary_loss_mlp": 0.01294843, + "balance_loss_clip": 0.06307691, + "balance_loss_mlp": 0.01273422, + "epoch": 0.21950999549075606, + "flos": 22717726346880.0, + "grad_norm": 2.457938069648898, + "language_loss": 0.8456791, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.92406487, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2142334, + "step": 3651, + "time_per_iteration": 2.7058322429656982 + }, + { + "auxiliary_loss_clip": 0.06539044, + "auxiliary_loss_mlp": 0.01290725, + "balance_loss_clip": 0.06304742, + "balance_loss_mlp": 0.01270257, + "epoch": 0.21957011874342403, + "flos": 10565929422720.0, + "grad_norm": 1.8310150193660448, + "language_loss": 0.74885792, + "learning_rate": 3.634715732945027e-06, + "loss": 0.82715559, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20458984, + "step": 3652, + "time_per_iteration": 2.512620210647583 + }, + { + "auxiliary_loss_clip": 0.06458014, + "auxiliary_loss_mlp": 0.01487979, + "balance_loss_clip": 0.06335165, + "balance_loss_mlp": 0.01477775, + "epoch": 0.219630241996092, + "flos": 65765105677440.0, + "grad_norm": 0.8085744951241601, + "language_loss": 0.51588702, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.59534693, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.10205078, + "step": 3653, + "time_per_iteration": 3.156705617904663 + }, + { + "auxiliary_loss_clip": 0.06532414, + "auxiliary_loss_mlp": 0.01292976, + "balance_loss_clip": 0.06300488, + "balance_loss_mlp": 0.01271685, + "epoch": 0.21969036524875996, + "flos": 23703367524480.0, + "grad_norm": 2.2498105533123467, + "language_loss": 0.7598449, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.83809876, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21289062, + "step": 3654, + "time_per_iteration": 2.5549349784851074 + }, + { + "auxiliary_loss_clip": 0.06539033, + "auxiliary_loss_mlp": 0.01287688, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265277, + "epoch": 0.21975048850142792, + "flos": 19646130101760.0, + "grad_norm": 1.856190016757107, + "language_loss": 0.72937429, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80764157, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.22412109, + "step": 3655, + "time_per_iteration": 5.397899866104126 + }, + { + "auxiliary_loss_clip": 0.06537225, + "auxiliary_loss_mlp": 0.01285968, + "balance_loss_clip": 0.06301227, + "balance_loss_mlp": 0.01265667, + "epoch": 0.21981061175409589, + "flos": 22453944094080.0, + "grad_norm": 1.6446350088012902, + "language_loss": 0.81351042, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.89174235, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20300293, + "step": 3656, + "time_per_iteration": 2.53308367729187 + }, + { + "auxiliary_loss_clip": 0.06536204, + "auxiliary_loss_mlp": 0.01286139, + "balance_loss_clip": 0.06302683, + "balance_loss_mlp": 0.01265027, + "epoch": 0.21987073500676388, + "flos": 18157032466560.0, + "grad_norm": 2.081609460517537, + "language_loss": 0.86280632, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94102979, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21105957, + "step": 3657, + "time_per_iteration": 2.5165464878082275 + }, + { + "auxiliary_loss_clip": 0.06534712, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 0.0629717, + "balance_loss_mlp": 0.01263439, + "epoch": 0.21993085825943184, + "flos": 25632666184320.0, + "grad_norm": 1.606816904846988, + "language_loss": 0.80728716, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.88547069, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.20202637, + "step": 3658, + "time_per_iteration": 2.5528533458709717 + }, + { + "auxiliary_loss_clip": 0.06407537, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.0628604, + "balance_loss_mlp": 0.01250839, + "epoch": 0.2199909815120998, + "flos": 70946429621760.0, + "grad_norm": 0.7593962827668586, + "language_loss": 0.58126092, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06103516, + "step": 3659, + "time_per_iteration": 3.237276077270508 + }, + { + "auxiliary_loss_clip": 0.06524363, + "auxiliary_loss_mlp": 0.01284023, + "balance_loss_clip": 0.06293888, + "balance_loss_mlp": 0.01264091, + "epoch": 0.22005110476476777, + "flos": 21549964320000.0, + "grad_norm": 2.05919214646248, + "language_loss": 0.75117528, + "learning_rate": 3.632918704645772e-06, + "loss": 0.82925916, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19946289, + "step": 3660, + "time_per_iteration": 2.5259556770324707 + }, + { + "auxiliary_loss_clip": 0.06528022, + "auxiliary_loss_mlp": 0.01287991, + "balance_loss_clip": 0.06292684, + "balance_loss_mlp": 0.01267976, + "epoch": 0.22011122801743574, + "flos": 22061051498880.0, + "grad_norm": 2.4805712407940645, + "language_loss": 0.81579179, + "learning_rate": 3.632693797376893e-06, + "loss": 0.89395189, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.20019531, + "step": 3661, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.06527096, + "auxiliary_loss_mlp": 0.01283614, + "balance_loss_clip": 0.06295218, + "balance_loss_mlp": 0.01264039, + "epoch": 0.2201713512701037, + "flos": 26694811739520.0, + "grad_norm": 2.4209612671003993, + "language_loss": 0.73935246, + "learning_rate": 3.632468828196102e-06, + "loss": 0.81745958, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.19567871, + "step": 3662, + "time_per_iteration": 2.594336986541748 + }, + { + "auxiliary_loss_clip": 0.06524752, + "auxiliary_loss_mlp": 0.01286026, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01266976, + "epoch": 0.22023147452277167, + "flos": 22168470833280.0, + "grad_norm": 1.5979135918213576, + "language_loss": 0.79490995, + "learning_rate": 3.632243797111929e-06, + "loss": 0.87301779, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.19042969, + "step": 3663, + "time_per_iteration": 2.6437172889709473 + }, + { + "auxiliary_loss_clip": 0.06536885, + "auxiliary_loss_mlp": 0.01285417, + "balance_loss_clip": 0.06298422, + "balance_loss_mlp": 0.01264627, + "epoch": 0.22029159777543966, + "flos": 22528981025280.0, + "grad_norm": 1.9228872111745317, + "language_loss": 0.81154871, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8897717, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.20800781, + "step": 3664, + "time_per_iteration": 2.551218271255493 + }, + { + "auxiliary_loss_clip": 0.06543128, + "auxiliary_loss_mlp": 0.01279618, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257457, + "epoch": 0.22035172102810763, + "flos": 13047502343040.0, + "grad_norm": 2.388837963421245, + "language_loss": 0.77563322, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.22167969, + "step": 3665, + "time_per_iteration": 2.5317838191986084 + }, + { + "auxiliary_loss_clip": 0.06533245, + "auxiliary_loss_mlp": 0.0128412, + "balance_loss_clip": 0.06298798, + "balance_loss_mlp": 0.01263616, + "epoch": 0.2204118442807756, + "flos": 12170538311040.0, + "grad_norm": 5.328131395204355, + "language_loss": 0.98459631, + "learning_rate": 3.631568332524466e-06, + "loss": 1.06277001, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.20507812, + "step": 3666, + "time_per_iteration": 2.500293254852295 + }, + { + "auxiliary_loss_clip": 0.06531642, + "auxiliary_loss_mlp": 0.01281342, + "balance_loss_clip": 0.06297208, + "balance_loss_mlp": 0.01260767, + "epoch": 0.22047196753344356, + "flos": 40117345758720.0, + "grad_norm": 2.0087807452217143, + "language_loss": 0.81544572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20568848, + "step": 3667, + "time_per_iteration": 2.7539899349212646 + }, + { + "auxiliary_loss_clip": 0.06542197, + "auxiliary_loss_mlp": 0.0128155, + "balance_loss_clip": 0.06300189, + "balance_loss_mlp": 0.01258363, + "epoch": 0.22053209078611152, + "flos": 20706892064640.0, + "grad_norm": 2.631241235852179, + "language_loss": 0.77648765, + "learning_rate": 3.631117713439087e-06, + "loss": 0.85472512, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.23168945, + "step": 3668, + "time_per_iteration": 2.524740695953369 + }, + { + "auxiliary_loss_clip": 0.06534266, + "auxiliary_loss_mlp": 0.01279226, + "balance_loss_clip": 0.06300663, + "balance_loss_mlp": 0.01258758, + "epoch": 0.2205922140387795, + "flos": 24723026259840.0, + "grad_norm": 2.1996350177899386, + "language_loss": 0.72024125, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7983762, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.20471191, + "step": 3669, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.06540591, + "auxiliary_loss_mlp": 0.01281842, + "balance_loss_clip": 0.06304247, + "balance_loss_mlp": 0.01261398, + "epoch": 0.22065233729144745, + "flos": 23484000735360.0, + "grad_norm": 1.708018932230371, + "language_loss": 0.85830641, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.93653071, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20422363, + "step": 3670, + "time_per_iteration": 2.6102726459503174 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01279884, + "balance_loss_clip": 0.06300244, + "balance_loss_mlp": 0.01259678, + "epoch": 0.22071246054411545, + "flos": 35234268094080.0, + "grad_norm": 1.8596418583208814, + "language_loss": 0.77398729, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85218084, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.20202637, + "step": 3671, + "time_per_iteration": 2.6463472843170166 + }, + { + "auxiliary_loss_clip": 0.06536315, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06302021, + "balance_loss_mlp": 0.01256934, + "epoch": 0.2207725837967834, + "flos": 18156151998720.0, + "grad_norm": 3.3605951725525807, + "language_loss": 0.81071377, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.88883519, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18896484, + "step": 3672, + "time_per_iteration": 2.522409200668335 + }, + { + "auxiliary_loss_clip": 0.06541845, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 0.06304064, + "balance_loss_mlp": 0.01262086, + "epoch": 0.22083270704945138, + "flos": 20484967726080.0, + "grad_norm": 2.0276751679318905, + "language_loss": 0.74039209, + "learning_rate": 3.629990083462682e-06, + "loss": 0.8186394, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20800781, + "step": 3673, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.06537451, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258154, + "epoch": 0.22089283030211934, + "flos": 34133451079680.0, + "grad_norm": 2.1113123853963223, + "language_loss": 0.77576697, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.85393184, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2088623, + "step": 3674, + "time_per_iteration": 2.6212525367736816 + }, + { + "auxiliary_loss_clip": 0.06539989, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06303889, + "balance_loss_mlp": 0.01255349, + "epoch": 0.2209529535547873, + "flos": 18083043711360.0, + "grad_norm": 2.9913121905850213, + "language_loss": 0.7632584, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.84143209, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22033691, + "step": 3675, + "time_per_iteration": 2.529346466064453 + }, + { + "auxiliary_loss_clip": 0.06540923, + "auxiliary_loss_mlp": 0.01279311, + "balance_loss_clip": 0.06303286, + "balance_loss_mlp": 0.01258592, + "epoch": 0.22101307680745527, + "flos": 27242725587840.0, + "grad_norm": 1.8493496269427605, + "language_loss": 0.8074736, + "learning_rate": 3.629312763695772e-06, + "loss": 0.88567591, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.20727539, + "step": 3676, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.06539683, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06299066, + "balance_loss_mlp": 0.01260637, + "epoch": 0.22107320006012326, + "flos": 16548566290560.0, + "grad_norm": 2.695197102889201, + "language_loss": 0.76204234, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.84025168, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2064209, + "step": 3677, + "time_per_iteration": 2.5165653228759766 + }, + { + "auxiliary_loss_clip": 0.0653778, + "auxiliary_loss_mlp": 0.01277642, + "balance_loss_clip": 0.06301221, + "balance_loss_mlp": 0.01257889, + "epoch": 0.22113332331279123, + "flos": 22061009571840.0, + "grad_norm": 1.9269573452829223, + "language_loss": 0.84673274, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92488694, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.19750977, + "step": 3678, + "time_per_iteration": 2.5460638999938965 + }, + { + "auxiliary_loss_clip": 0.06537814, + "auxiliary_loss_mlp": 0.01282989, + "balance_loss_clip": 0.06304095, + "balance_loss_mlp": 0.01262354, + "epoch": 0.2211934465654592, + "flos": 26619690954240.0, + "grad_norm": 2.1729831488916327, + "language_loss": 0.89362311, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.9718312, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20629883, + "step": 3679, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.06542142, + "auxiliary_loss_mlp": 0.01291632, + "balance_loss_clip": 0.06301068, + "balance_loss_mlp": 0.01269817, + "epoch": 0.22125356981812716, + "flos": 16365564973440.0, + "grad_norm": 3.197923457760992, + "language_loss": 0.87311327, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.95145106, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.21801758, + "step": 3680, + "time_per_iteration": 2.507798433303833 + }, + { + "auxiliary_loss_clip": 0.06534758, + "auxiliary_loss_mlp": 0.01279239, + "balance_loss_clip": 0.06302372, + "balance_loss_mlp": 0.01258211, + "epoch": 0.22131369307079513, + "flos": 21657257873280.0, + "grad_norm": 1.8058433539562604, + "language_loss": 0.81643963, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89457959, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.21032715, + "step": 3681, + "time_per_iteration": 2.536559820175171 + }, + { + "auxiliary_loss_clip": 0.06530598, + "auxiliary_loss_mlp": 0.01283453, + "balance_loss_clip": 0.06302136, + "balance_loss_mlp": 0.01264344, + "epoch": 0.2213738163234631, + "flos": 19615592488320.0, + "grad_norm": 3.0843961282743138, + "language_loss": 0.80613208, + "learning_rate": 3.62795645623335e-06, + "loss": 0.88427258, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.19116211, + "step": 3682, + "time_per_iteration": 2.5523715019226074 + }, + { + "auxiliary_loss_clip": 0.06540116, + "auxiliary_loss_mlp": 0.01284666, + "balance_loss_clip": 0.06302039, + "balance_loss_mlp": 0.01261933, + "epoch": 0.22143393957613106, + "flos": 23630217310080.0, + "grad_norm": 1.560467578099588, + "language_loss": 0.78323001, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86147785, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.22729492, + "step": 3683, + "time_per_iteration": 2.563915491104126 + }, + { + "auxiliary_loss_clip": 0.06546305, + "auxiliary_loss_mlp": 0.01292128, + "balance_loss_clip": 0.06304266, + "balance_loss_mlp": 0.01270801, + "epoch": 0.22149406282879905, + "flos": 26185108152960.0, + "grad_norm": 2.3659446396904276, + "language_loss": 0.73827177, + "learning_rate": 3.627503859796234e-06, + "loss": 0.81665611, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21337891, + "step": 3684, + "time_per_iteration": 2.5829403400421143 + }, + { + "auxiliary_loss_clip": 0.06539842, + "auxiliary_loss_mlp": 0.01288295, + "balance_loss_clip": 0.06303138, + "balance_loss_mlp": 0.01266396, + "epoch": 0.221554186081467, + "flos": 14544104918400.0, + "grad_norm": 1.9346272357304948, + "language_loss": 0.81055164, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.88883299, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21899414, + "step": 3685, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.06531791, + "auxiliary_loss_mlp": 0.0128599, + "balance_loss_clip": 0.06302623, + "balance_loss_mlp": 0.01266607, + "epoch": 0.22161430933413498, + "flos": 22245059064960.0, + "grad_norm": 1.5947500054188823, + "language_loss": 0.87523818, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.95341599, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 3686, + "time_per_iteration": 4.0018064975738525 + }, + { + "auxiliary_loss_clip": 0.06530964, + "auxiliary_loss_mlp": 0.01294037, + "balance_loss_clip": 0.06297237, + "balance_loss_mlp": 0.01272198, + "epoch": 0.22167443258680294, + "flos": 23483162194560.0, + "grad_norm": 2.0272053301197186, + "language_loss": 0.78420949, + "learning_rate": 3.626824502298707e-06, + "loss": 0.86245942, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21826172, + "step": 3687, + "time_per_iteration": 2.543321132659912 + }, + { + "auxiliary_loss_clip": 0.06551681, + "auxiliary_loss_mlp": 0.01283958, + "balance_loss_clip": 0.0630649, + "balance_loss_mlp": 0.01261177, + "epoch": 0.2217345558394709, + "flos": 23227723422720.0, + "grad_norm": 1.7957197826329643, + "language_loss": 0.85492283, + "learning_rate": 3.626597926409383e-06, + "loss": 0.93327922, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.2277832, + "step": 3688, + "time_per_iteration": 2.5456702709198 + }, + { + "auxiliary_loss_clip": 0.06557921, + "auxiliary_loss_mlp": 0.01283081, + "balance_loss_clip": 0.0631456, + "balance_loss_mlp": 0.01260812, + "epoch": 0.22179467909213887, + "flos": 20017247834880.0, + "grad_norm": 1.8193279444648072, + "language_loss": 0.81821239, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89662236, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.22265625, + "step": 3689, + "time_per_iteration": 4.073091506958008 + }, + { + "auxiliary_loss_clip": 0.06540284, + "auxiliary_loss_mlp": 0.0128456, + "balance_loss_clip": 0.06304172, + "balance_loss_mlp": 0.01263269, + "epoch": 0.22185480234480687, + "flos": 19689203900160.0, + "grad_norm": 2.302195520769192, + "language_loss": 0.70934272, + "learning_rate": 3.626144589597061e-06, + "loss": 0.7875911, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2130127, + "step": 3690, + "time_per_iteration": 2.5177161693573 + }, + { + "auxiliary_loss_clip": 0.06548303, + "auxiliary_loss_mlp": 0.01286756, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01264416, + "epoch": 0.22191492559747483, + "flos": 21987817430400.0, + "grad_norm": 2.3084892961245576, + "language_loss": 0.7285862, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.80693686, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.22338867, + "step": 3691, + "time_per_iteration": 2.545271873474121 + }, + { + "auxiliary_loss_clip": 0.0654895, + "auxiliary_loss_mlp": 0.01283693, + "balance_loss_clip": 0.06313456, + "balance_loss_mlp": 0.01261771, + "epoch": 0.2219750488501428, + "flos": 23228813525760.0, + "grad_norm": 2.0680633952732195, + "language_loss": 0.71962094, + "learning_rate": 3.625691006130477e-06, + "loss": 0.79794735, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21936035, + "step": 3692, + "time_per_iteration": 2.543306350708008 + }, + { + "auxiliary_loss_clip": 0.06558576, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_clip": 0.06317012, + "balance_loss_mlp": 0.01258394, + "epoch": 0.22203517210281076, + "flos": 22459939660800.0, + "grad_norm": 1.9780142392305156, + "language_loss": 0.87528688, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95367974, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.22338867, + "step": 3693, + "time_per_iteration": 2.571045398712158 + }, + { + "auxiliary_loss_clip": 0.06534213, + "auxiliary_loss_mlp": 0.01282043, + "balance_loss_clip": 0.06303744, + "balance_loss_mlp": 0.01261122, + "epoch": 0.22209529535547873, + "flos": 17569985961600.0, + "grad_norm": 2.4004359049860824, + "language_loss": 0.86418116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.94234371, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20922852, + "step": 3694, + "time_per_iteration": 4.03299617767334 + }, + { + "auxiliary_loss_clip": 0.06554222, + "auxiliary_loss_mlp": 0.0127962, + "balance_loss_clip": 0.06307386, + "balance_loss_mlp": 0.0125815, + "epoch": 0.2221554186081467, + "flos": 21475178951040.0, + "grad_norm": 1.7692850214061204, + "language_loss": 0.69924927, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.77758765, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21472168, + "step": 3695, + "time_per_iteration": 3.989173412322998 + }, + { + "auxiliary_loss_clip": 0.06536973, + "auxiliary_loss_mlp": 0.01283487, + "balance_loss_clip": 0.0630603, + "balance_loss_mlp": 0.01262781, + "epoch": 0.22221554186081466, + "flos": 27680956041600.0, + "grad_norm": 1.7088419756312998, + "language_loss": 0.72215462, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.80035925, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.20715332, + "step": 3696, + "time_per_iteration": 2.6339590549468994 + }, + { + "auxiliary_loss_clip": 0.06543445, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 0.06307454, + "balance_loss_mlp": 0.01257825, + "epoch": 0.22227566511348265, + "flos": 25966202561280.0, + "grad_norm": 1.8417969407055101, + "language_loss": 0.88068652, + "learning_rate": 3.624555968803217e-06, + "loss": 0.95891678, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.21740723, + "step": 3697, + "time_per_iteration": 2.5599191188812256 + }, + { + "auxiliary_loss_clip": 0.06533489, + "auxiliary_loss_mlp": 0.01284902, + "balance_loss_clip": 0.06305174, + "balance_loss_mlp": 0.01265042, + "epoch": 0.22233578836615062, + "flos": 39213240203520.0, + "grad_norm": 2.5935528152985867, + "language_loss": 0.6687606, + "learning_rate": 3.624328776493346e-06, + "loss": 0.74694455, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.1986084, + "step": 3698, + "time_per_iteration": 2.812140703201294 + }, + { + "auxiliary_loss_clip": 0.06546268, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 0.06307642, + "balance_loss_mlp": 0.01260216, + "epoch": 0.22239591161881858, + "flos": 36292682142720.0, + "grad_norm": 1.853195446284453, + "language_loss": 0.82990527, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.90819019, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22009277, + "step": 3699, + "time_per_iteration": 2.667423725128174 + }, + { + "auxiliary_loss_clip": 0.06537004, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 0.06302205, + "balance_loss_mlp": 0.01260014, + "epoch": 0.22245603487148655, + "flos": 19725779007360.0, + "grad_norm": 1.45021308141165, + "language_loss": 0.80335897, + "learning_rate": 3.62387420709809e-06, + "loss": 0.88154227, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21313477, + "step": 3700, + "time_per_iteration": 2.5526716709136963 + }, + { + "auxiliary_loss_clip": 0.06548695, + "auxiliary_loss_mlp": 0.01279557, + "balance_loss_clip": 0.06306358, + "balance_loss_mlp": 0.01257885, + "epoch": 0.2225161581241545, + "flos": 46290950081280.0, + "grad_norm": 3.047641549556173, + "language_loss": 0.73186177, + "learning_rate": 3.623646830029943e-06, + "loss": 0.81014431, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.21655273, + "step": 3701, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.06535295, + "auxiliary_loss_mlp": 0.01280405, + "balance_loss_clip": 0.06300849, + "balance_loss_mlp": 0.01259734, + "epoch": 0.22257628137682248, + "flos": 23702990181120.0, + "grad_norm": 4.404280219854046, + "language_loss": 0.80455184, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.88270885, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20666504, + "step": 3702, + "time_per_iteration": 2.5657999515533447 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01274253, + "balance_loss_clip": 0.06297488, + "balance_loss_mlp": 0.01253331, + "epoch": 0.22263640462949044, + "flos": 19359986008320.0, + "grad_norm": 3.4101413472023405, + "language_loss": 0.78629804, + "learning_rate": 3.623191891195723e-06, + "loss": 0.86428618, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20910645, + "step": 3703, + "time_per_iteration": 2.550189971923828 + }, + { + "auxiliary_loss_clip": 0.06541737, + "auxiliary_loss_mlp": 0.01279602, + "balance_loss_clip": 0.06300878, + "balance_loss_mlp": 0.01257084, + "epoch": 0.22269652788215843, + "flos": 20782138631040.0, + "grad_norm": 2.0986231414271828, + "language_loss": 0.75210625, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.83031964, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.22509766, + "step": 3704, + "time_per_iteration": 2.5540754795074463 + }, + { + "auxiliary_loss_clip": 0.06527826, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06299336, + "balance_loss_mlp": 0.01268682, + "epoch": 0.2227566511348264, + "flos": 47969631578880.0, + "grad_norm": 1.891044771341396, + "language_loss": 0.65108556, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.72925317, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20239258, + "step": 3705, + "time_per_iteration": 2.8109097480773926 + }, + { + "auxiliary_loss_clip": 0.06438605, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.0631493, + "balance_loss_mlp": 0.012611, + "epoch": 0.22281677438749437, + "flos": 66235676607360.0, + "grad_norm": 1.322453387614222, + "language_loss": 0.65218806, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.72923827, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.05322266, + "step": 3706, + "time_per_iteration": 3.059636354446411 + }, + { + "auxiliary_loss_clip": 0.06534128, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 0.06297205, + "balance_loss_mlp": 0.01266274, + "epoch": 0.22287689764016233, + "flos": 21878050181760.0, + "grad_norm": 2.374246987916323, + "language_loss": 0.80905002, + "learning_rate": 3.622281274977141e-06, + "loss": 0.88725626, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20202637, + "step": 3707, + "time_per_iteration": 2.5891129970550537 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01280313, + "balance_loss_clip": 0.06298505, + "balance_loss_mlp": 0.01257854, + "epoch": 0.2229370208928303, + "flos": 27679824011520.0, + "grad_norm": 1.802742500055583, + "language_loss": 0.79219007, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.87031698, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2244873, + "step": 3708, + "time_per_iteration": 2.5907180309295654 + }, + { + "auxiliary_loss_clip": 0.06539932, + "auxiliary_loss_mlp": 0.01293698, + "balance_loss_clip": 0.06300655, + "balance_loss_mlp": 0.01271525, + "epoch": 0.22299714414549826, + "flos": 30162612816000.0, + "grad_norm": 1.9019649120082793, + "language_loss": 0.81583631, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.89417267, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.22167969, + "step": 3709, + "time_per_iteration": 2.658768892288208 + }, + { + "auxiliary_loss_clip": 0.06540084, + "auxiliary_loss_mlp": 0.01295766, + "balance_loss_clip": 0.0630019, + "balance_loss_mlp": 0.01274464, + "epoch": 0.22305726739816625, + "flos": 23148871130880.0, + "grad_norm": 2.9556041497723236, + "language_loss": 0.69413233, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.77249086, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 3710, + "time_per_iteration": 2.603476047515869 + }, + { + "auxiliary_loss_clip": 0.06536471, + "auxiliary_loss_mlp": 0.01286054, + "balance_loss_clip": 0.06297636, + "balance_loss_mlp": 0.01264429, + "epoch": 0.22311739065083422, + "flos": 19178116721280.0, + "grad_norm": 2.184897161331363, + "language_loss": 0.91282266, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.99104792, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.21606445, + "step": 3711, + "time_per_iteration": 2.6093854904174805 + }, + { + "auxiliary_loss_clip": 0.06539471, + "auxiliary_loss_mlp": 0.01298084, + "balance_loss_clip": 0.06302293, + "balance_loss_mlp": 0.01275911, + "epoch": 0.22317751390350218, + "flos": 13621467611520.0, + "grad_norm": 2.3638705243519142, + "language_loss": 0.89271343, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.97108901, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.22192383, + "step": 3712, + "time_per_iteration": 2.5170199871063232 + }, + { + "auxiliary_loss_clip": 0.06530519, + "auxiliary_loss_mlp": 0.01292247, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01271481, + "epoch": 0.22323763715617015, + "flos": 11032643064960.0, + "grad_norm": 2.927785991832361, + "language_loss": 0.74880064, + "learning_rate": 3.620913505310117e-06, + "loss": 0.82702827, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2076416, + "step": 3713, + "time_per_iteration": 2.521813154220581 + }, + { + "auxiliary_loss_clip": 0.06534518, + "auxiliary_loss_mlp": 0.0130023, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.01277556, + "epoch": 0.22329776040883811, + "flos": 41360647841280.0, + "grad_norm": 2.458794372685298, + "language_loss": 0.62675929, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.70510674, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22668457, + "step": 3714, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.06529912, + "auxiliary_loss_mlp": 0.01289936, + "balance_loss_clip": 0.06295826, + "balance_loss_mlp": 0.01267906, + "epoch": 0.22335788366150608, + "flos": 25126568323200.0, + "grad_norm": 1.757427072944695, + "language_loss": 0.79499549, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.87319398, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22009277, + "step": 3715, + "time_per_iteration": 2.571711301803589 + }, + { + "auxiliary_loss_clip": 0.06527971, + "auxiliary_loss_mlp": 0.01294287, + "balance_loss_clip": 0.06293058, + "balance_loss_mlp": 0.0127302, + "epoch": 0.22341800691417404, + "flos": 16989144658560.0, + "grad_norm": 1.5961840175356918, + "language_loss": 0.77329421, + "learning_rate": 3.620228790579645e-06, + "loss": 0.85151684, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21276855, + "step": 3716, + "time_per_iteration": 2.502037286758423 + }, + { + "auxiliary_loss_clip": 0.06529684, + "auxiliary_loss_mlp": 0.0129404, + "balance_loss_clip": 0.06297298, + "balance_loss_mlp": 0.01273977, + "epoch": 0.22347813016684204, + "flos": 14141904520320.0, + "grad_norm": 2.4369226344025665, + "language_loss": 0.80004126, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.87827849, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20068359, + "step": 3717, + "time_per_iteration": 2.5208563804626465 + }, + { + "auxiliary_loss_clip": 0.065373, + "auxiliary_loss_mlp": 0.01297317, + "balance_loss_clip": 0.06298472, + "balance_loss_mlp": 0.01275215, + "epoch": 0.22353825341951, + "flos": 23589323717760.0, + "grad_norm": 2.564573329936102, + "language_loss": 0.68781847, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.76616466, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22106934, + "step": 3718, + "time_per_iteration": 2.6491305828094482 + }, + { + "auxiliary_loss_clip": 0.06536659, + "auxiliary_loss_mlp": 0.01296292, + "balance_loss_clip": 0.06298986, + "balance_loss_mlp": 0.01271187, + "epoch": 0.22359837667217797, + "flos": 29831759769600.0, + "grad_norm": 1.515297493499622, + "language_loss": 0.80957985, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88790929, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25085449, + "step": 3719, + "time_per_iteration": 2.6334550380706787 + }, + { + "auxiliary_loss_clip": 0.06540611, + "auxiliary_loss_mlp": 0.01300766, + "balance_loss_clip": 0.06299402, + "balance_loss_mlp": 0.01276793, + "epoch": 0.22365849992484593, + "flos": 17608867056000.0, + "grad_norm": 2.352033480486632, + "language_loss": 0.87360144, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.95201522, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.23962402, + "step": 3720, + "time_per_iteration": 2.5415003299713135 + }, + { + "auxiliary_loss_clip": 0.06526608, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 0.06295964, + "balance_loss_mlp": 0.01271672, + "epoch": 0.2237186231775139, + "flos": 22717558638720.0, + "grad_norm": 1.8478771577440833, + "language_loss": 0.75151736, + "learning_rate": 3.619086370692945e-06, + "loss": 0.8297134, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21325684, + "step": 3721, + "time_per_iteration": 2.548450469970703 + }, + { + "auxiliary_loss_clip": 0.06540586, + "auxiliary_loss_mlp": 0.0129148, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01269105, + "epoch": 0.22377874643018186, + "flos": 13376720234880.0, + "grad_norm": 2.2094798322640736, + "language_loss": 0.79352558, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.87184626, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.22375488, + "step": 3722, + "time_per_iteration": 2.519277572631836 + }, + { + "auxiliary_loss_clip": 0.06531984, + "auxiliary_loss_mlp": 0.01288897, + "balance_loss_clip": 0.06299505, + "balance_loss_mlp": 0.01267964, + "epoch": 0.22383886968284986, + "flos": 17900797080960.0, + "grad_norm": 2.2930078409484196, + "language_loss": 0.83410442, + "learning_rate": 3.618628972906178e-06, + "loss": 0.91231328, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20922852, + "step": 3723, + "time_per_iteration": 2.5086076259613037 + }, + { + "auxiliary_loss_clip": 0.06544059, + "auxiliary_loss_mlp": 0.01285781, + "balance_loss_clip": 0.06305651, + "balance_loss_mlp": 0.01263834, + "epoch": 0.22389899293551782, + "flos": 23886033425280.0, + "grad_norm": 4.429276920778782, + "language_loss": 0.84606177, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.92436016, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.21960449, + "step": 3724, + "time_per_iteration": 2.574178695678711 + }, + { + "auxiliary_loss_clip": 0.06534179, + "auxiliary_loss_mlp": 0.01287846, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01267211, + "epoch": 0.2239591161881858, + "flos": 27279929600640.0, + "grad_norm": 1.978846940821608, + "language_loss": 0.79885381, + "learning_rate": 3.618171329605121e-06, + "loss": 0.87707412, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.20617676, + "step": 3725, + "time_per_iteration": 2.589184522628784 + }, + { + "auxiliary_loss_clip": 0.06541407, + "auxiliary_loss_mlp": 0.01289084, + "balance_loss_clip": 0.06307919, + "balance_loss_mlp": 0.01267197, + "epoch": 0.22401923944085375, + "flos": 22243423910400.0, + "grad_norm": 1.7178260071510263, + "language_loss": 0.78001326, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.85831815, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21875, + "step": 3726, + "time_per_iteration": 3.980494976043701 + }, + { + "auxiliary_loss_clip": 0.06552388, + "auxiliary_loss_mlp": 0.01297244, + "balance_loss_clip": 0.06307887, + "balance_loss_mlp": 0.01272175, + "epoch": 0.22407936269352172, + "flos": 12057920023680.0, + "grad_norm": 3.478702992871699, + "language_loss": 0.73437679, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.81287301, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25097656, + "step": 3727, + "time_per_iteration": 2.4799015522003174 + }, + { + "auxiliary_loss_clip": 0.06549139, + "auxiliary_loss_mlp": 0.01296668, + "balance_loss_clip": 0.06309944, + "balance_loss_mlp": 0.0127341, + "epoch": 0.22413948594618968, + "flos": 19359482883840.0, + "grad_norm": 2.179866459674304, + "language_loss": 0.8799302, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.95838827, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23254395, + "step": 3728, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.0653842, + "auxiliary_loss_mlp": 0.01294185, + "balance_loss_clip": 0.06303863, + "balance_loss_mlp": 0.0126989, + "epoch": 0.22419960919885765, + "flos": 24176789493120.0, + "grad_norm": 1.9160734665449493, + "language_loss": 0.80446088, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.88278687, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24328613, + "step": 3729, + "time_per_iteration": 4.021615266799927 + }, + { + "auxiliary_loss_clip": 0.06533324, + "auxiliary_loss_mlp": 0.01292111, + "balance_loss_clip": 0.06302898, + "balance_loss_mlp": 0.01271237, + "epoch": 0.22425973245152564, + "flos": 27386007269760.0, + "grad_norm": 1.6841051152750983, + "language_loss": 0.87170112, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.94995546, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2088623, + "step": 3730, + "time_per_iteration": 2.598576307296753 + }, + { + "auxiliary_loss_clip": 0.0653019, + "auxiliary_loss_mlp": 0.01298076, + "balance_loss_clip": 0.06300467, + "balance_loss_mlp": 0.01276403, + "epoch": 0.2243198557041936, + "flos": 13740794225280.0, + "grad_norm": 2.088554635044429, + "language_loss": 0.73449922, + "learning_rate": 3.616796927310559e-06, + "loss": 0.81278187, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21655273, + "step": 3731, + "time_per_iteration": 2.5361716747283936 + }, + { + "auxiliary_loss_clip": 0.06541456, + "auxiliary_loss_mlp": 0.01292681, + "balance_loss_clip": 0.06301124, + "balance_loss_mlp": 0.01267933, + "epoch": 0.22437997895686157, + "flos": 19535775874560.0, + "grad_norm": 5.172507402775724, + "language_loss": 0.75803339, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.83637482, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.24755859, + "step": 3732, + "time_per_iteration": 2.5423076152801514 + }, + { + "auxiliary_loss_clip": 0.06533462, + "auxiliary_loss_mlp": 0.01296517, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01273664, + "epoch": 0.22444010220952954, + "flos": 23703032108160.0, + "grad_norm": 1.6752991374876018, + "language_loss": 0.89338291, + "learning_rate": 3.616338302646873e-06, + "loss": 0.97168273, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2286377, + "step": 3733, + "time_per_iteration": 4.021088123321533 + }, + { + "auxiliary_loss_clip": 0.065323, + "auxiliary_loss_mlp": 0.01294952, + "balance_loss_clip": 0.06298727, + "balance_loss_mlp": 0.01270193, + "epoch": 0.2245002254621975, + "flos": 22389514704000.0, + "grad_norm": 1.4651206016819107, + "language_loss": 0.85422146, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.93249398, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.24780273, + "step": 3734, + "time_per_iteration": 2.5562949180603027 + }, + { + "auxiliary_loss_clip": 0.06539299, + "auxiliary_loss_mlp": 0.01283537, + "balance_loss_clip": 0.06303868, + "balance_loss_mlp": 0.01261113, + "epoch": 0.22456034871486547, + "flos": 26949453897600.0, + "grad_norm": 1.579737554219585, + "language_loss": 0.77332962, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.85155803, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22436523, + "step": 3735, + "time_per_iteration": 4.016703367233276 + }, + { + "auxiliary_loss_clip": 0.06526705, + "auxiliary_loss_mlp": 0.01290552, + "balance_loss_clip": 0.06298478, + "balance_loss_mlp": 0.01269559, + "epoch": 0.22462047196753343, + "flos": 28990700012160.0, + "grad_norm": 1.885472064442235, + "language_loss": 0.84907603, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92724866, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.21008301, + "step": 3736, + "time_per_iteration": 2.6118290424346924 + }, + { + "auxiliary_loss_clip": 0.06536424, + "auxiliary_loss_mlp": 0.01285836, + "balance_loss_clip": 0.0630133, + "balance_loss_mlp": 0.01261922, + "epoch": 0.22468059522020142, + "flos": 20017541324160.0, + "grad_norm": 1.5290746464045628, + "language_loss": 0.87103891, + "learning_rate": 3.615420317888586e-06, + "loss": 0.94926155, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23913574, + "step": 3737, + "time_per_iteration": 2.5211808681488037 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288351, + "balance_loss_clip": 0.06294889, + "balance_loss_mlp": 0.01263949, + "epoch": 0.2247407184728694, + "flos": 29321846547840.0, + "grad_norm": 1.8581473098744326, + "language_loss": 0.80131769, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87954295, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24389648, + "step": 3738, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.06530435, + "auxiliary_loss_mlp": 0.01285051, + "balance_loss_clip": 0.06293893, + "balance_loss_mlp": 0.01263295, + "epoch": 0.22480084172553735, + "flos": 22317035322240.0, + "grad_norm": 1.7432458267253939, + "language_loss": 0.77190316, + "learning_rate": 3.614960957933224e-06, + "loss": 0.85005802, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.21728516, + "step": 3739, + "time_per_iteration": 2.540266275405884 + }, + { + "auxiliary_loss_clip": 0.06531328, + "auxiliary_loss_mlp": 0.01283134, + "balance_loss_clip": 0.06295189, + "balance_loss_mlp": 0.01260091, + "epoch": 0.22486096497820532, + "flos": 25598019720960.0, + "grad_norm": 4.441094103460663, + "language_loss": 0.74799633, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.82614094, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.23022461, + "step": 3740, + "time_per_iteration": 2.640592575073242 + }, + { + "auxiliary_loss_clip": 0.06520827, + "auxiliary_loss_mlp": 0.01278747, + "balance_loss_clip": 0.06289122, + "balance_loss_mlp": 0.01256681, + "epoch": 0.22492108823087328, + "flos": 17645651798400.0, + "grad_norm": 2.0040821388775285, + "language_loss": 0.75983584, + "learning_rate": 3.614501353019939e-06, + "loss": 0.83783156, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.22070312, + "step": 3741, + "time_per_iteration": 2.513965129852295 + }, + { + "auxiliary_loss_clip": 0.06526901, + "auxiliary_loss_mlp": 0.01283674, + "balance_loss_clip": 0.06296658, + "balance_loss_mlp": 0.0126224, + "epoch": 0.22498121148354125, + "flos": 16040246296320.0, + "grad_norm": 1.702368757801579, + "language_loss": 0.87747514, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.95558089, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21435547, + "step": 3742, + "time_per_iteration": 2.5164167881011963 + }, + { + "auxiliary_loss_clip": 0.0652426, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06294844, + "balance_loss_mlp": 0.01259389, + "epoch": 0.22504133473620924, + "flos": 24030489064320.0, + "grad_norm": 1.7109022824395175, + "language_loss": 0.82010657, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.22473145, + "step": 3743, + "time_per_iteration": 2.5486276149749756 + }, + { + "auxiliary_loss_clip": 0.06524298, + "auxiliary_loss_mlp": 0.0127565, + "balance_loss_clip": 0.06291372, + "balance_loss_mlp": 0.01254562, + "epoch": 0.2251014579888772, + "flos": 16769610161280.0, + "grad_norm": 2.126207867209009, + "language_loss": 0.64185399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7198534, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2109375, + "step": 3744, + "time_per_iteration": 2.535020351409912 + }, + { + "auxiliary_loss_clip": 0.06527244, + "auxiliary_loss_mlp": 0.01277496, + "balance_loss_clip": 0.06293654, + "balance_loss_mlp": 0.01256372, + "epoch": 0.22516158124154517, + "flos": 13996191070080.0, + "grad_norm": 3.1643825534304684, + "language_loss": 0.76886272, + "learning_rate": 3.613581408598489e-06, + "loss": 0.84691012, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.21105957, + "step": 3745, + "time_per_iteration": 2.5233495235443115 + }, + { + "auxiliary_loss_clip": 0.06522205, + "auxiliary_loss_mlp": 0.01281406, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01260675, + "epoch": 0.22522170449421314, + "flos": 14394869596800.0, + "grad_norm": 1.6969236990578618, + "language_loss": 0.80721819, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88525426, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20739746, + "step": 3746, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.06533524, + "auxiliary_loss_mlp": 0.01280566, + "balance_loss_clip": 0.06296681, + "balance_loss_mlp": 0.0125881, + "epoch": 0.2252818277468811, + "flos": 23812338159360.0, + "grad_norm": 2.077776202364112, + "language_loss": 0.86226261, + "learning_rate": 3.613121069229862e-06, + "loss": 0.94040346, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.21765137, + "step": 3747, + "time_per_iteration": 2.5834550857543945 + }, + { + "auxiliary_loss_clip": 0.06530412, + "auxiliary_loss_mlp": 0.01275087, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01255095, + "epoch": 0.22534195099954907, + "flos": 24725038757760.0, + "grad_norm": 1.8595393434505574, + "language_loss": 0.76982796, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.84788299, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.1998291, + "step": 3748, + "time_per_iteration": 2.5877788066864014 + }, + { + "auxiliary_loss_clip": 0.0652978, + "auxiliary_loss_mlp": 0.0128313, + "balance_loss_clip": 0.06296694, + "balance_loss_mlp": 0.01261768, + "epoch": 0.22540207425221703, + "flos": 21038625578880.0, + "grad_norm": 1.5282192474331018, + "language_loss": 0.80547005, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.88359916, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.21374512, + "step": 3749, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06526259, + "auxiliary_loss_mlp": 0.01273546, + "balance_loss_clip": 0.06298405, + "balance_loss_mlp": 0.01253698, + "epoch": 0.22546219750488503, + "flos": 19396351480320.0, + "grad_norm": 1.5225090015602234, + "language_loss": 0.80070651, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.87870455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19848633, + "step": 3750, + "time_per_iteration": 2.524614095687866 + }, + { + "auxiliary_loss_clip": 0.06532078, + "auxiliary_loss_mlp": 0.01279372, + "balance_loss_clip": 0.06297495, + "balance_loss_mlp": 0.01258117, + "epoch": 0.225522320757553, + "flos": 25199760464640.0, + "grad_norm": 5.336084937176506, + "language_loss": 0.8300491, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.90816361, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21264648, + "step": 3751, + "time_per_iteration": 2.5638771057128906 + }, + { + "auxiliary_loss_clip": 0.06527963, + "auxiliary_loss_mlp": 0.01280546, + "balance_loss_clip": 0.06296829, + "balance_loss_mlp": 0.01260149, + "epoch": 0.22558244401022096, + "flos": 17168456396160.0, + "grad_norm": 1.7246902184661286, + "language_loss": 0.8427825, + "learning_rate": 3.611969150491165e-06, + "loss": 0.92086762, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20385742, + "step": 3752, + "time_per_iteration": 2.5650362968444824 + }, + { + "auxiliary_loss_clip": 0.06527157, + "auxiliary_loss_mlp": 0.01275092, + "balance_loss_clip": 0.06298538, + "balance_loss_mlp": 0.01254839, + "epoch": 0.22564256726288892, + "flos": 15236306697600.0, + "grad_norm": 1.7312534305272433, + "language_loss": 0.78620666, + "learning_rate": 3.611738583330375e-06, + "loss": 0.8642292, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20251465, + "step": 3753, + "time_per_iteration": 2.510344982147217 + }, + { + "auxiliary_loss_clip": 0.06525348, + "auxiliary_loss_mlp": 0.01279816, + "balance_loss_clip": 0.06296748, + "balance_loss_mlp": 0.01257869, + "epoch": 0.2257026905155569, + "flos": 34577215902720.0, + "grad_norm": 1.9706921359503449, + "language_loss": 0.79448152, + "learning_rate": 3.611507955052295e-06, + "loss": 0.8725332, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21948242, + "step": 3754, + "time_per_iteration": 2.6429665088653564 + }, + { + "auxiliary_loss_clip": 0.06526577, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06299241, + "balance_loss_mlp": 0.01259835, + "epoch": 0.22576281376822485, + "flos": 19944642672000.0, + "grad_norm": 1.7667035857085684, + "language_loss": 0.70640147, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.78447914, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.21374512, + "step": 3755, + "time_per_iteration": 2.5482447147369385 + }, + { + "auxiliary_loss_clip": 0.06530152, + "auxiliary_loss_mlp": 0.01282078, + "balance_loss_clip": 0.06295566, + "balance_loss_mlp": 0.01261085, + "epoch": 0.22582293702089282, + "flos": 24607892350080.0, + "grad_norm": 2.6955819116528588, + "language_loss": 0.77899122, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.85711348, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21008301, + "step": 3756, + "time_per_iteration": 2.573639392852783 + }, + { + "auxiliary_loss_clip": 0.06536651, + "auxiliary_loss_mlp": 0.01278842, + "balance_loss_clip": 0.0629873, + "balance_loss_mlp": 0.01255394, + "epoch": 0.2258830602735608, + "flos": 23041451796480.0, + "grad_norm": 2.9460656412940405, + "language_loss": 0.82867002, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.90682495, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23461914, + "step": 3757, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06538612, + "auxiliary_loss_mlp": 0.01279229, + "balance_loss_clip": 0.06302969, + "balance_loss_mlp": 0.01257164, + "epoch": 0.22594318352622877, + "flos": 22164068494080.0, + "grad_norm": 3.099441845199118, + "language_loss": 0.73941171, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.81759018, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2208252, + "step": 3758, + "time_per_iteration": 2.506148099899292 + }, + { + "auxiliary_loss_clip": 0.06531477, + "auxiliary_loss_mlp": 0.01288595, + "balance_loss_clip": 0.06296086, + "balance_loss_mlp": 0.01266816, + "epoch": 0.22600330677889674, + "flos": 20600478979200.0, + "grad_norm": 2.4125098710516117, + "language_loss": 0.77881908, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.85701978, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21777344, + "step": 3759, + "time_per_iteration": 2.5171775817871094 + }, + { + "auxiliary_loss_clip": 0.06534176, + "auxiliary_loss_mlp": 0.01288917, + "balance_loss_clip": 0.06296586, + "balance_loss_mlp": 0.01266267, + "epoch": 0.2260634300315647, + "flos": 35667970427520.0, + "grad_norm": 1.6851914496917324, + "language_loss": 0.7921207, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.87035167, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.22644043, + "step": 3760, + "time_per_iteration": 2.6410677433013916 + }, + { + "auxiliary_loss_clip": 0.06433272, + "auxiliary_loss_mlp": 0.01258557, + "balance_loss_clip": 0.06311189, + "balance_loss_mlp": 0.01252156, + "epoch": 0.22612355328423267, + "flos": 72107707685760.0, + "grad_norm": 0.875668320300708, + "language_loss": 0.60230321, + "learning_rate": 3.609891846556569e-06, + "loss": 0.67922151, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.06408691, + "step": 3761, + "time_per_iteration": 3.1083786487579346 + }, + { + "auxiliary_loss_clip": 0.06545433, + "auxiliary_loss_mlp": 0.01288291, + "balance_loss_clip": 0.06303856, + "balance_loss_mlp": 0.01267012, + "epoch": 0.22618367653690064, + "flos": 22790373436800.0, + "grad_norm": 3.0022983434583783, + "language_loss": 0.77876461, + "learning_rate": 3.609660729655211e-06, + "loss": 0.8571018, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.21289062, + "step": 3762, + "time_per_iteration": 2.5256128311157227 + }, + { + "auxiliary_loss_clip": 0.06531228, + "auxiliary_loss_mlp": 0.01280361, + "balance_loss_clip": 0.06294668, + "balance_loss_mlp": 0.01258343, + "epoch": 0.22624379978956863, + "flos": 20454388185600.0, + "grad_norm": 1.959767281760525, + "language_loss": 0.79828411, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.87639999, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.22033691, + "step": 3763, + "time_per_iteration": 2.528965950012207 + }, + { + "auxiliary_loss_clip": 0.06540731, + "auxiliary_loss_mlp": 0.01291635, + "balance_loss_clip": 0.06300753, + "balance_loss_mlp": 0.01268949, + "epoch": 0.2263039230422366, + "flos": 17500189910400.0, + "grad_norm": 1.5800574189561347, + "language_loss": 0.91907668, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99740022, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.22705078, + "step": 3764, + "time_per_iteration": 2.5012450218200684 + }, + { + "auxiliary_loss_clip": 0.06527007, + "auxiliary_loss_mlp": 0.01291683, + "balance_loss_clip": 0.06295396, + "balance_loss_mlp": 0.01271001, + "epoch": 0.22636404629490456, + "flos": 28337295473280.0, + "grad_norm": 3.379650672619254, + "language_loss": 0.75542498, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.83361191, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20690918, + "step": 3765, + "time_per_iteration": 2.6149775981903076 + }, + { + "auxiliary_loss_clip": 0.06519896, + "auxiliary_loss_mlp": 0.01290584, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01268256, + "epoch": 0.22642416954757252, + "flos": 17494152416640.0, + "grad_norm": 2.1325205607667526, + "language_loss": 0.90732884, + "learning_rate": 3.608735651752494e-06, + "loss": 0.98543364, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22338867, + "step": 3766, + "time_per_iteration": 3.925321340560913 + }, + { + "auxiliary_loss_clip": 0.06520344, + "auxiliary_loss_mlp": 0.01279841, + "balance_loss_clip": 0.0629393, + "balance_loss_mlp": 0.0125756, + "epoch": 0.2264842928002405, + "flos": 24390621912960.0, + "grad_norm": 1.5335844294501488, + "language_loss": 0.74866152, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22290039, + "step": 3767, + "time_per_iteration": 2.585827589035034 + }, + { + "auxiliary_loss_clip": 0.06526411, + "auxiliary_loss_mlp": 0.01285323, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01262816, + "epoch": 0.22654441605290845, + "flos": 19836971775360.0, + "grad_norm": 1.5156609478299474, + "language_loss": 0.72064531, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.79876268, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.22521973, + "step": 3768, + "time_per_iteration": 3.9932377338409424 + }, + { + "auxiliary_loss_clip": 0.06525982, + "auxiliary_loss_mlp": 0.01291355, + "balance_loss_clip": 0.06294759, + "balance_loss_mlp": 0.01268347, + "epoch": 0.22660453930557642, + "flos": 27462050449920.0, + "grad_norm": 1.8227506475765343, + "language_loss": 0.78781188, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.86598527, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.22998047, + "step": 3769, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.06531481, + "auxiliary_loss_mlp": 0.01287446, + "balance_loss_clip": 0.06292526, + "balance_loss_mlp": 0.01265428, + "epoch": 0.2266646625582444, + "flos": 23995004060160.0, + "grad_norm": 2.604534401291856, + "language_loss": 0.69374454, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.22021484, + "step": 3770, + "time_per_iteration": 2.6160407066345215 + }, + { + "auxiliary_loss_clip": 0.065291, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 0.06292273, + "balance_loss_mlp": 0.01269077, + "epoch": 0.22672478581091238, + "flos": 26034698874240.0, + "grad_norm": 1.4830972618629188, + "language_loss": 0.8083868, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88657784, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.20922852, + "step": 3771, + "time_per_iteration": 2.576948642730713 + }, + { + "auxiliary_loss_clip": 0.06521479, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012613, + "epoch": 0.22678490906358034, + "flos": 23848577850240.0, + "grad_norm": 1.5694676435300003, + "language_loss": 0.79189658, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86994874, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22436523, + "step": 3772, + "time_per_iteration": 4.012827396392822 + }, + { + "auxiliary_loss_clip": 0.06410234, + "auxiliary_loss_mlp": 0.01286376, + "balance_loss_clip": 0.06287075, + "balance_loss_mlp": 0.01280571, + "epoch": 0.2268450323162483, + "flos": 65070163912320.0, + "grad_norm": 0.6415690360853892, + "language_loss": 0.53899318, + "learning_rate": 3.607114417129261e-06, + "loss": 0.61595929, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.0579834, + "step": 3773, + "time_per_iteration": 3.249551773071289 + }, + { + "auxiliary_loss_clip": 0.06526346, + "auxiliary_loss_mlp": 0.01287624, + "balance_loss_clip": 0.06294057, + "balance_loss_mlp": 0.01266238, + "epoch": 0.22690515556891627, + "flos": 22532251334400.0, + "grad_norm": 1.8359701531623327, + "language_loss": 0.70997107, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.78811073, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21386719, + "step": 3774, + "time_per_iteration": 2.558279275894165 + }, + { + "auxiliary_loss_clip": 0.06521672, + "auxiliary_loss_mlp": 0.01287873, + "balance_loss_clip": 0.06291246, + "balance_loss_mlp": 0.01266857, + "epoch": 0.22696527882158424, + "flos": 18229344140160.0, + "grad_norm": 2.047907778931267, + "language_loss": 0.75449002, + "learning_rate": 3.606650658627658e-06, + "loss": 0.83258545, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21008301, + "step": 3775, + "time_per_iteration": 3.928666353225708 + }, + { + "auxiliary_loss_clip": 0.06524701, + "auxiliary_loss_mlp": 0.01286732, + "balance_loss_clip": 0.06292307, + "balance_loss_mlp": 0.01266168, + "epoch": 0.22702540207425223, + "flos": 17024923152000.0, + "grad_norm": 2.031895062113734, + "language_loss": 0.82818532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.90629965, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20581055, + "step": 3776, + "time_per_iteration": 2.5941483974456787 + }, + { + "auxiliary_loss_clip": 0.06528914, + "auxiliary_loss_mlp": 0.01279846, + "balance_loss_clip": 0.06293055, + "balance_loss_mlp": 0.01259222, + "epoch": 0.2270855253269202, + "flos": 21332316539520.0, + "grad_norm": 1.645158938946052, + "language_loss": 0.83362442, + "learning_rate": 3.606186656428641e-06, + "loss": 0.91171205, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.20617676, + "step": 3777, + "time_per_iteration": 2.5177228450775146 + }, + { + "auxiliary_loss_clip": 0.06532624, + "auxiliary_loss_mlp": 0.01278936, + "balance_loss_clip": 0.06296799, + "balance_loss_mlp": 0.01257002, + "epoch": 0.22714564857958816, + "flos": 23557276730880.0, + "grad_norm": 1.8837878269403912, + "language_loss": 0.73246169, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.81057739, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21948242, + "step": 3778, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.06530988, + "auxiliary_loss_mlp": 0.01275867, + "balance_loss_clip": 0.06293572, + "balance_loss_mlp": 0.01255673, + "epoch": 0.22720577183225613, + "flos": 25996237050240.0, + "grad_norm": 2.9659284448048555, + "language_loss": 0.65779513, + "learning_rate": 3.605722410602591e-06, + "loss": 0.73586369, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.20178223, + "step": 3779, + "time_per_iteration": 2.543818950653076 + }, + { + "auxiliary_loss_clip": 0.06525169, + "auxiliary_loss_mlp": 0.01276701, + "balance_loss_clip": 0.06295511, + "balance_loss_mlp": 0.01255982, + "epoch": 0.2272658950849241, + "flos": 20820432746880.0, + "grad_norm": 1.7825989229768946, + "language_loss": 0.70823693, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7862556, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20703125, + "step": 3780, + "time_per_iteration": 2.558850049972534 + }, + { + "auxiliary_loss_clip": 0.06528573, + "auxiliary_loss_mlp": 0.01280577, + "balance_loss_clip": 0.06296494, + "balance_loss_mlp": 0.01257927, + "epoch": 0.22732601833759206, + "flos": 23915187446400.0, + "grad_norm": 1.6463040629853982, + "language_loss": 0.89639765, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97448915, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2265625, + "step": 3781, + "time_per_iteration": 2.527230739593506 + }, + { + "auxiliary_loss_clip": 0.06532317, + "auxiliary_loss_mlp": 0.01280346, + "balance_loss_clip": 0.06296034, + "balance_loss_mlp": 0.01257672, + "epoch": 0.22738614159026002, + "flos": 15929850142080.0, + "grad_norm": 2.4692396393453016, + "language_loss": 0.75309098, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.83121765, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2265625, + "step": 3782, + "time_per_iteration": 2.4901020526885986 + }, + { + "auxiliary_loss_clip": 0.06532567, + "auxiliary_loss_mlp": 0.01278379, + "balance_loss_clip": 0.06300219, + "balance_loss_mlp": 0.01257959, + "epoch": 0.22744626484292801, + "flos": 24212148716160.0, + "grad_norm": 1.7681967435875452, + "language_loss": 0.8314634, + "learning_rate": 3.604793188351095e-06, + "loss": 0.90957284, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.20422363, + "step": 3783, + "time_per_iteration": 2.559361696243286 + }, + { + "auxiliary_loss_clip": 0.06539755, + "auxiliary_loss_mlp": 0.0128451, + "balance_loss_clip": 0.06305835, + "balance_loss_mlp": 0.01262266, + "epoch": 0.22750638809559598, + "flos": 24798734023680.0, + "grad_norm": 1.794476113807414, + "language_loss": 0.76757884, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8458215, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22229004, + "step": 3784, + "time_per_iteration": 2.6693339347839355 + }, + { + "auxiliary_loss_clip": 0.06533188, + "auxiliary_loss_mlp": 0.012806, + "balance_loss_clip": 0.06299379, + "balance_loss_mlp": 0.01257998, + "epoch": 0.22756651134826394, + "flos": 22243004640000.0, + "grad_norm": 1.5985438146538498, + "language_loss": 0.71667248, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79481035, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.22583008, + "step": 3785, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.06421004, + "auxiliary_loss_mlp": 0.0127133, + "balance_loss_clip": 0.0629871, + "balance_loss_mlp": 0.01265915, + "epoch": 0.2276266346009319, + "flos": 62728225021440.0, + "grad_norm": 1.545506426452605, + "language_loss": 0.63058448, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.70750785, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05422974, + "step": 3786, + "time_per_iteration": 3.1247661113739014 + }, + { + "auxiliary_loss_clip": 0.06538717, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06302891, + "balance_loss_mlp": 0.01254299, + "epoch": 0.22768675785359987, + "flos": 18618085958400.0, + "grad_norm": 2.466113986800572, + "language_loss": 0.8751514, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.95331335, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23156738, + "step": 3787, + "time_per_iteration": 2.488539457321167 + }, + { + "auxiliary_loss_clip": 0.06537791, + "auxiliary_loss_mlp": 0.01280159, + "balance_loss_clip": 0.06305036, + "balance_loss_mlp": 0.01259488, + "epoch": 0.22774688110626784, + "flos": 26877477640320.0, + "grad_norm": 2.053207704033697, + "language_loss": 0.73054254, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.80872202, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20678711, + "step": 3788, + "time_per_iteration": 2.5763657093048096 + }, + { + "auxiliary_loss_clip": 0.06534025, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 0.06303776, + "balance_loss_mlp": 0.01260971, + "epoch": 0.2278070043589358, + "flos": 15557977722240.0, + "grad_norm": 4.57361945380841, + "language_loss": 0.68007839, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.75824702, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21850586, + "step": 3789, + "time_per_iteration": 2.4907443523406982 + }, + { + "auxiliary_loss_clip": 0.0653897, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 0.06308074, + "balance_loss_mlp": 0.0126115, + "epoch": 0.2278671276116038, + "flos": 22422987210240.0, + "grad_norm": 2.4388022002275243, + "language_loss": 0.76775718, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.84598166, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22338867, + "step": 3790, + "time_per_iteration": 2.5787651538848877 + }, + { + "auxiliary_loss_clip": 0.06540109, + "auxiliary_loss_mlp": 0.01282259, + "balance_loss_clip": 0.06309578, + "balance_loss_mlp": 0.01259252, + "epoch": 0.22792725086427176, + "flos": 20637641064960.0, + "grad_norm": 1.9300771626575046, + "language_loss": 0.91910696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.99733061, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.23010254, + "step": 3791, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.06538808, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 0.06302848, + "balance_loss_mlp": 0.01256893, + "epoch": 0.22798737411693973, + "flos": 31436662147200.0, + "grad_norm": 1.9637481556258098, + "language_loss": 0.83064067, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.9088037, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.20617676, + "step": 3792, + "time_per_iteration": 2.6190388202667236 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 0.06289717, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2280474973696077, + "flos": 52412074220160.0, + "grad_norm": 1.1033671526650368, + "language_loss": 0.65792358, + "learning_rate": 3.602465874182981e-06, + "loss": 0.73471832, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05432129, + "step": 3793, + "time_per_iteration": 2.9110665321350098 + }, + { + "auxiliary_loss_clip": 0.0654863, + "auxiliary_loss_mlp": 0.01287304, + "balance_loss_clip": 0.06306019, + "balance_loss_mlp": 0.01261889, + "epoch": 0.22810762062227566, + "flos": 26403300984960.0, + "grad_norm": 1.9908643306499119, + "language_loss": 0.78207439, + "learning_rate": 3.602232808409293e-06, + "loss": 0.8604337, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.25415039, + "step": 3794, + "time_per_iteration": 2.5911734104156494 + }, + { + "auxiliary_loss_clip": 0.06544799, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 0.06310074, + "balance_loss_mlp": 0.01262412, + "epoch": 0.22816774387494362, + "flos": 25637445866880.0, + "grad_norm": 3.443157636284035, + "language_loss": 0.81285226, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89115357, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22912598, + "step": 3795, + "time_per_iteration": 2.6825528144836426 + }, + { + "auxiliary_loss_clip": 0.06536914, + "auxiliary_loss_mlp": 0.0128896, + "balance_loss_clip": 0.06306744, + "balance_loss_mlp": 0.01267586, + "epoch": 0.22822786712761162, + "flos": 22457507892480.0, + "grad_norm": 1.703568435651106, + "language_loss": 0.77948368, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.85774243, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21362305, + "step": 3796, + "time_per_iteration": 2.5418922901153564 + }, + { + "auxiliary_loss_clip": 0.06535624, + "auxiliary_loss_mlp": 0.01278994, + "balance_loss_clip": 0.06302401, + "balance_loss_mlp": 0.01258692, + "epoch": 0.22828799038027958, + "flos": 12207323053440.0, + "grad_norm": 2.5041816771456076, + "language_loss": 0.96305406, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.04120016, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20324707, + "step": 3797, + "time_per_iteration": 2.5794107913970947 + }, + { + "auxiliary_loss_clip": 0.06537494, + "auxiliary_loss_mlp": 0.01281478, + "balance_loss_clip": 0.06304877, + "balance_loss_mlp": 0.01260057, + "epoch": 0.22834811363294755, + "flos": 22091379477120.0, + "grad_norm": 1.517581709018558, + "language_loss": 0.82277977, + "learning_rate": 3.601299937834666e-06, + "loss": 0.90096951, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2142334, + "step": 3798, + "time_per_iteration": 2.618784189224243 + }, + { + "auxiliary_loss_clip": 0.06536907, + "auxiliary_loss_mlp": 0.01279844, + "balance_loss_clip": 0.06300005, + "balance_loss_mlp": 0.01257146, + "epoch": 0.2284082368856155, + "flos": 24867104555520.0, + "grad_norm": 1.8603662335211264, + "language_loss": 0.79381669, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.87198418, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.22705078, + "step": 3799, + "time_per_iteration": 2.591053009033203 + }, + { + "auxiliary_loss_clip": 0.06534393, + "auxiliary_loss_mlp": 0.0128126, + "balance_loss_clip": 0.06300979, + "balance_loss_mlp": 0.01258646, + "epoch": 0.22846836013828348, + "flos": 23299280409600.0, + "grad_norm": 1.5152328596048934, + "language_loss": 0.75782096, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.83597749, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22619629, + "step": 3800, + "time_per_iteration": 2.5370395183563232 + }, + { + "auxiliary_loss_clip": 0.06535068, + "auxiliary_loss_mlp": 0.01279113, + "balance_loss_clip": 0.06302812, + "balance_loss_mlp": 0.01258001, + "epoch": 0.22852848339095144, + "flos": 27423462844800.0, + "grad_norm": 1.9420817073182375, + "language_loss": 0.64685607, + "learning_rate": 3.600599647297484e-06, + "loss": 0.72499788, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21105957, + "step": 3801, + "time_per_iteration": 2.6190593242645264 + }, + { + "auxiliary_loss_clip": 0.06524718, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06296816, + "balance_loss_mlp": 0.01257835, + "epoch": 0.2285886066436194, + "flos": 26328054418560.0, + "grad_norm": 1.6808395254049295, + "language_loss": 0.81957126, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89760411, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20727539, + "step": 3802, + "time_per_iteration": 2.554079055786133 + }, + { + "auxiliary_loss_clip": 0.06534229, + "auxiliary_loss_mlp": 0.01286931, + "balance_loss_clip": 0.06299631, + "balance_loss_mlp": 0.0126415, + "epoch": 0.2286487298962874, + "flos": 29724298508160.0, + "grad_norm": 1.6760491170738747, + "language_loss": 0.79838073, + "learning_rate": 3.600132483450114e-06, + "loss": 0.87659228, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.22790527, + "step": 3803, + "time_per_iteration": 2.6287641525268555 + }, + { + "auxiliary_loss_clip": 0.0653572, + "auxiliary_loss_mlp": 0.01279074, + "balance_loss_clip": 0.06296768, + "balance_loss_mlp": 0.012559, + "epoch": 0.22870885314895537, + "flos": 21293435445120.0, + "grad_norm": 1.7238152987334623, + "language_loss": 0.86273003, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94087803, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.23168945, + "step": 3804, + "time_per_iteration": 2.511462450027466 + }, + { + "auxiliary_loss_clip": 0.06539486, + "auxiliary_loss_mlp": 0.01279472, + "balance_loss_clip": 0.06301028, + "balance_loss_mlp": 0.01257537, + "epoch": 0.22876897640162333, + "flos": 14944754016000.0, + "grad_norm": 1.89266353651555, + "language_loss": 0.76854289, + "learning_rate": 3.59966507689401e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21923828, + "step": 3805, + "time_per_iteration": 3.929358959197998 + }, + { + "auxiliary_loss_clip": 0.0654166, + "auxiliary_loss_mlp": 0.01280204, + "balance_loss_clip": 0.06298529, + "balance_loss_mlp": 0.01257542, + "epoch": 0.2288290996542913, + "flos": 18119786526720.0, + "grad_norm": 2.0123502787071073, + "language_loss": 0.79403114, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.87224978, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.22680664, + "step": 3806, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.06540429, + "auxiliary_loss_mlp": 0.01282432, + "balance_loss_clip": 0.06303287, + "balance_loss_mlp": 0.01259878, + "epoch": 0.22888922290695926, + "flos": 39864296828160.0, + "grad_norm": 1.8839046523975558, + "language_loss": 0.70310783, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.78133643, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22546387, + "step": 3807, + "time_per_iteration": 4.134840488433838 + }, + { + "auxiliary_loss_clip": 0.06550615, + "auxiliary_loss_mlp": 0.01290274, + "balance_loss_clip": 0.06307966, + "balance_loss_mlp": 0.01265121, + "epoch": 0.22894934615962723, + "flos": 23410431250560.0, + "grad_norm": 2.1946772997431103, + "language_loss": 0.65960705, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.73801601, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25183105, + "step": 3808, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.06539108, + "auxiliary_loss_mlp": 0.01281064, + "balance_loss_clip": 0.06300798, + "balance_loss_mlp": 0.01259154, + "epoch": 0.22900946941229522, + "flos": 18848898829440.0, + "grad_norm": 1.7761632941249064, + "language_loss": 0.75198555, + "learning_rate": 3.598729535939222e-06, + "loss": 0.83018732, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.21899414, + "step": 3809, + "time_per_iteration": 2.490895986557007 + }, + { + "auxiliary_loss_clip": 0.06533305, + "auxiliary_loss_mlp": 0.0127892, + "balance_loss_clip": 0.06299955, + "balance_loss_mlp": 0.01257331, + "epoch": 0.22906959266496318, + "flos": 22935961105920.0, + "grad_norm": 1.4656596651362013, + "language_loss": 0.82576305, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.90388525, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21606445, + "step": 3810, + "time_per_iteration": 2.5684924125671387 + }, + { + "auxiliary_loss_clip": 0.06535805, + "auxiliary_loss_mlp": 0.01278794, + "balance_loss_clip": 0.06303711, + "balance_loss_mlp": 0.01259041, + "epoch": 0.22912971591763115, + "flos": 19360614913920.0, + "grad_norm": 1.8664104481323773, + "language_loss": 0.79914212, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8772881, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19750977, + "step": 3811, + "time_per_iteration": 3.9766526222229004 + }, + { + "auxiliary_loss_clip": 0.0653518, + "auxiliary_loss_mlp": 0.01280553, + "balance_loss_clip": 0.06300636, + "balance_loss_mlp": 0.01258976, + "epoch": 0.22918983917029911, + "flos": 19938940594560.0, + "grad_norm": 1.7476175457386653, + "language_loss": 0.83391893, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.91207623, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.21569824, + "step": 3812, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.0655017, + "auxiliary_loss_mlp": 0.01288002, + "balance_loss_clip": 0.06306149, + "balance_loss_mlp": 0.01264673, + "epoch": 0.22924996242296708, + "flos": 16696501873920.0, + "grad_norm": 2.3839142545709886, + "language_loss": 0.8400377, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.91841948, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2331543, + "step": 3813, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06538843, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301966, + "balance_loss_mlp": 0.01255456, + "epoch": 0.22931008567563504, + "flos": 33044457490560.0, + "grad_norm": 1.6858267943586043, + "language_loss": 0.70580167, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78395313, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.20861816, + "step": 3814, + "time_per_iteration": 2.6764509677886963 + }, + { + "auxiliary_loss_clip": 0.06536946, + "auxiliary_loss_mlp": 0.01276372, + "balance_loss_clip": 0.06305344, + "balance_loss_mlp": 0.01256786, + "epoch": 0.229370208928303, + "flos": 23337322963200.0, + "grad_norm": 2.8831118113675114, + "language_loss": 0.67954975, + "learning_rate": 3.597324405965139e-06, + "loss": 0.75768292, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.19604492, + "step": 3815, + "time_per_iteration": 3.9759562015533447 + }, + { + "auxiliary_loss_clip": 0.06547147, + "auxiliary_loss_mlp": 0.01282792, + "balance_loss_clip": 0.06311129, + "balance_loss_mlp": 0.01259952, + "epoch": 0.229430332180971, + "flos": 28624068472320.0, + "grad_norm": 1.7261339214380451, + "language_loss": 0.83511633, + "learning_rate": 3.597090005586848e-06, + "loss": 0.91341567, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.22839355, + "step": 3816, + "time_per_iteration": 2.6059420108795166 + }, + { + "auxiliary_loss_clip": 0.06539545, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06303526, + "balance_loss_mlp": 0.01253302, + "epoch": 0.22949045543363897, + "flos": 17243912597760.0, + "grad_norm": 2.759151157832335, + "language_loss": 0.87850988, + "learning_rate": 3.596855544646742e-06, + "loss": 0.95666116, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22290039, + "step": 3817, + "time_per_iteration": 2.4830808639526367 + }, + { + "auxiliary_loss_clip": 0.06543944, + "auxiliary_loss_mlp": 0.01278311, + "balance_loss_clip": 0.06306894, + "balance_loss_mlp": 0.01256412, + "epoch": 0.22955057868630693, + "flos": 27496654986240.0, + "grad_norm": 1.6534336608142677, + "language_loss": 0.75343978, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.83166242, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.21899414, + "step": 3818, + "time_per_iteration": 2.634387969970703 + }, + { + "auxiliary_loss_clip": 0.06541272, + "auxiliary_loss_mlp": 0.01278617, + "balance_loss_clip": 0.06305389, + "balance_loss_mlp": 0.0125524, + "epoch": 0.2296107019389749, + "flos": 23483036413440.0, + "grad_norm": 1.7338201278327374, + "language_loss": 0.75486314, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83306205, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.23376465, + "step": 3819, + "time_per_iteration": 2.593780279159546 + }, + { + "auxiliary_loss_clip": 0.06542156, + "auxiliary_loss_mlp": 0.01283095, + "balance_loss_clip": 0.06305272, + "balance_loss_mlp": 0.01263009, + "epoch": 0.22967082519164286, + "flos": 31293212757120.0, + "grad_norm": 1.753994919034331, + "language_loss": 0.8208195, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.89907205, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20092773, + "step": 3820, + "time_per_iteration": 2.6047699451446533 + }, + { + "auxiliary_loss_clip": 0.06548945, + "auxiliary_loss_mlp": 0.0128207, + "balance_loss_clip": 0.06306617, + "balance_loss_mlp": 0.0125892, + "epoch": 0.22973094844431083, + "flos": 14647415402880.0, + "grad_norm": 4.329935521611207, + "language_loss": 0.70069146, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77900159, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23156738, + "step": 3821, + "time_per_iteration": 2.479454517364502 + }, + { + "auxiliary_loss_clip": 0.06540461, + "auxiliary_loss_mlp": 0.01284444, + "balance_loss_clip": 0.06305948, + "balance_loss_mlp": 0.0126177, + "epoch": 0.2297910716969788, + "flos": 22831057393920.0, + "grad_norm": 2.1026243527938897, + "language_loss": 0.83607674, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.91432583, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22668457, + "step": 3822, + "time_per_iteration": 2.6070644855499268 + }, + { + "auxiliary_loss_clip": 0.06532617, + "auxiliary_loss_mlp": 0.01279894, + "balance_loss_clip": 0.06300794, + "balance_loss_mlp": 0.01256637, + "epoch": 0.2298511949496468, + "flos": 23045644500480.0, + "grad_norm": 1.4679532921797136, + "language_loss": 0.66860032, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74672538, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23266602, + "step": 3823, + "time_per_iteration": 2.5421886444091797 + }, + { + "auxiliary_loss_clip": 0.06414426, + "auxiliary_loss_mlp": 0.01282472, + "balance_loss_clip": 0.062925, + "balance_loss_mlp": 0.01277524, + "epoch": 0.22991131820231475, + "flos": 66910296228480.0, + "grad_norm": 0.7674542175482253, + "language_loss": 0.56982124, + "learning_rate": 3.595212623082357e-06, + "loss": 0.64679027, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.04943848, + "step": 3824, + "time_per_iteration": 3.2466728687286377 + }, + { + "auxiliary_loss_clip": 0.06530097, + "auxiliary_loss_mlp": 0.0127961, + "balance_loss_clip": 0.06299412, + "balance_loss_mlp": 0.01258975, + "epoch": 0.22997144145498272, + "flos": 17891782767360.0, + "grad_norm": 2.0818696062092643, + "language_loss": 0.73658061, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81467766, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2064209, + "step": 3825, + "time_per_iteration": 2.4705512523651123 + }, + { + "auxiliary_loss_clip": 0.06534772, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 0.06299614, + "balance_loss_mlp": 0.01257432, + "epoch": 0.23003156470765068, + "flos": 24683055062400.0, + "grad_norm": 2.356013632504241, + "language_loss": 0.88289648, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96104205, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.22351074, + "step": 3826, + "time_per_iteration": 2.5636119842529297 + }, + { + "auxiliary_loss_clip": 0.06540347, + "auxiliary_loss_mlp": 0.01282145, + "balance_loss_clip": 0.0629928, + "balance_loss_mlp": 0.0125897, + "epoch": 0.23009168796031865, + "flos": 15819412060800.0, + "grad_norm": 2.476820030154751, + "language_loss": 0.81866372, + "learning_rate": 3.594507606303083e-06, + "loss": 0.89688861, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.23181152, + "step": 3827, + "time_per_iteration": 2.4817094802856445 + }, + { + "auxiliary_loss_clip": 0.06527712, + "auxiliary_loss_mlp": 0.01278643, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2301518112129866, + "flos": 16217755171200.0, + "grad_norm": 1.7308897820243296, + "language_loss": 0.87303799, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95110154, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21716309, + "step": 3828, + "time_per_iteration": 2.517916202545166 + }, + { + "auxiliary_loss_clip": 0.06537049, + "auxiliary_loss_mlp": 0.01281894, + "balance_loss_clip": 0.06300969, + "balance_loss_mlp": 0.01260686, + "epoch": 0.2302119344656546, + "flos": 20601820644480.0, + "grad_norm": 2.1621841127041668, + "language_loss": 0.71223086, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79042029, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.21191406, + "step": 3829, + "time_per_iteration": 2.5232293605804443 + }, + { + "auxiliary_loss_clip": 0.06527743, + "auxiliary_loss_mlp": 0.01278561, + "balance_loss_clip": 0.06299868, + "balance_loss_mlp": 0.01258629, + "epoch": 0.23027205771832257, + "flos": 26804117790720.0, + "grad_norm": 1.5730479724984117, + "language_loss": 0.84944689, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19934082, + "step": 3830, + "time_per_iteration": 2.6153595447540283 + }, + { + "auxiliary_loss_clip": 0.0653088, + "auxiliary_loss_mlp": 0.01278488, + "balance_loss_clip": 0.06299009, + "balance_loss_mlp": 0.01256863, + "epoch": 0.23033218097099054, + "flos": 43883365916160.0, + "grad_norm": 2.1076872960056834, + "language_loss": 0.67121679, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.74931049, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21630859, + "step": 3831, + "time_per_iteration": 2.7302401065826416 + }, + { + "auxiliary_loss_clip": 0.06528492, + "auxiliary_loss_mlp": 0.0127826, + "balance_loss_clip": 0.06295311, + "balance_loss_mlp": 0.01255944, + "epoch": 0.2303923042236585, + "flos": 26074837779840.0, + "grad_norm": 2.0679638399971525, + "language_loss": 0.7580992, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.83616674, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2232666, + "step": 3832, + "time_per_iteration": 2.5789363384246826 + }, + { + "auxiliary_loss_clip": 0.06538022, + "auxiliary_loss_mlp": 0.01277154, + "balance_loss_clip": 0.06301656, + "balance_loss_mlp": 0.01254731, + "epoch": 0.23045242747632647, + "flos": 18302284719360.0, + "grad_norm": 1.9809188001289737, + "language_loss": 0.88229948, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96045125, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 3833, + "time_per_iteration": 2.4890406131744385 + }, + { + "auxiliary_loss_clip": 0.06526786, + "auxiliary_loss_mlp": 0.01275622, + "balance_loss_clip": 0.06291149, + "balance_loss_mlp": 0.01253295, + "epoch": 0.23051255072899443, + "flos": 25527636691200.0, + "grad_norm": 1.751792699614105, + "language_loss": 0.75447762, + "learning_rate": 3.592860451331624e-06, + "loss": 0.83250165, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2232666, + "step": 3834, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.06528607, + "auxiliary_loss_mlp": 0.0128462, + "balance_loss_clip": 0.06295913, + "balance_loss_mlp": 0.01262089, + "epoch": 0.2305726739816624, + "flos": 21221584968960.0, + "grad_norm": 2.065687600185831, + "language_loss": 0.86859775, + "learning_rate": 3.592624901801432e-06, + "loss": 0.94673002, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2253418, + "step": 3835, + "time_per_iteration": 2.5243782997131348 + }, + { + "auxiliary_loss_clip": 0.06531255, + "auxiliary_loss_mlp": 0.01277066, + "balance_loss_clip": 0.06292518, + "balance_loss_mlp": 0.01255489, + "epoch": 0.2306327972343304, + "flos": 23337826087680.0, + "grad_norm": 2.699164056519065, + "language_loss": 0.8346436, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.91272676, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.21594238, + "step": 3836, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06530184, + "auxiliary_loss_mlp": 0.01278505, + "balance_loss_clip": 0.0629724, + "balance_loss_mlp": 0.01257918, + "epoch": 0.23069292048699835, + "flos": 20672832579840.0, + "grad_norm": 1.5308621387149557, + "language_loss": 0.80123997, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.87932694, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20581055, + "step": 3837, + "time_per_iteration": 2.5265891551971436 + }, + { + "auxiliary_loss_clip": 0.06398934, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06276935, + "balance_loss_mlp": 0.01257871, + "epoch": 0.23075304373966632, + "flos": 70472854673280.0, + "grad_norm": 0.8661269137999401, + "language_loss": 0.65425092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.73087507, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.05606079, + "step": 3838, + "time_per_iteration": 3.0690691471099854 + }, + { + "auxiliary_loss_clip": 0.06529964, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 0.0629662, + "balance_loss_mlp": 0.01260592, + "epoch": 0.23081316699233428, + "flos": 16623603221760.0, + "grad_norm": 1.9712307402798914, + "language_loss": 0.76919234, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84731126, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21337891, + "step": 3839, + "time_per_iteration": 2.507899522781372 + }, + { + "auxiliary_loss_clip": 0.06539556, + "auxiliary_loss_mlp": 0.01283771, + "balance_loss_clip": 0.06303147, + "balance_loss_mlp": 0.01261873, + "epoch": 0.23087329024500225, + "flos": 13303192677120.0, + "grad_norm": 1.9535711626830803, + "language_loss": 0.6973604, + "learning_rate": 3.591446248441752e-06, + "loss": 0.77559364, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21899414, + "step": 3840, + "time_per_iteration": 2.507403612136841 + }, + { + "auxiliary_loss_clip": 0.06524121, + "auxiliary_loss_mlp": 0.01283726, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01261994, + "epoch": 0.23093341349767021, + "flos": 17791574883840.0, + "grad_norm": 2.1010490795203967, + "language_loss": 0.79679501, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87487352, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21740723, + "step": 3841, + "time_per_iteration": 2.542506456375122 + }, + { + "auxiliary_loss_clip": 0.06525128, + "auxiliary_loss_mlp": 0.0128577, + "balance_loss_clip": 0.06292316, + "balance_loss_mlp": 0.0126591, + "epoch": 0.23099353675033818, + "flos": 23994920206080.0, + "grad_norm": 2.202794692504719, + "language_loss": 0.83472121, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91283023, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.19873047, + "step": 3842, + "time_per_iteration": 2.5885045528411865 + }, + { + "auxiliary_loss_clip": 0.06525495, + "auxiliary_loss_mlp": 0.01277864, + "balance_loss_clip": 0.06294134, + "balance_loss_mlp": 0.01256251, + "epoch": 0.23105366000300617, + "flos": 36004567478400.0, + "grad_norm": 1.5198018897685672, + "language_loss": 0.66582537, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.74385899, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.21630859, + "step": 3843, + "time_per_iteration": 2.7418570518493652 + }, + { + "auxiliary_loss_clip": 0.06517389, + "auxiliary_loss_mlp": 0.01282302, + "balance_loss_clip": 0.06289946, + "balance_loss_mlp": 0.01261667, + "epoch": 0.23111378325567414, + "flos": 31252822289280.0, + "grad_norm": 2.0273673860648613, + "language_loss": 0.77953953, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85753644, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2064209, + "step": 3844, + "time_per_iteration": 2.697105884552002 + }, + { + "auxiliary_loss_clip": 0.0652685, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.0629425, + "balance_loss_mlp": 0.01258618, + "epoch": 0.2311739065083421, + "flos": 19214230631040.0, + "grad_norm": 1.5733936305181, + "language_loss": 0.78526026, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86331779, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20275879, + "step": 3845, + "time_per_iteration": 3.9081645011901855 + }, + { + "auxiliary_loss_clip": 0.06512116, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06288872, + "balance_loss_mlp": 0.01256323, + "epoch": 0.23123402976101007, + "flos": 23365638443520.0, + "grad_norm": 2.144369954512039, + "language_loss": 0.7696318, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84750825, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.1920166, + "step": 3846, + "time_per_iteration": 2.5204334259033203 + }, + { + "auxiliary_loss_clip": 0.06530652, + "auxiliary_loss_mlp": 0.01280785, + "balance_loss_clip": 0.06296441, + "balance_loss_mlp": 0.01258946, + "epoch": 0.23129415301367803, + "flos": 13740458808960.0, + "grad_norm": 2.058546116129278, + "language_loss": 0.70736533, + "learning_rate": 3.589793599381304e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.21862793, + "step": 3847, + "time_per_iteration": 3.955061197280884 + }, + { + "auxiliary_loss_clip": 0.06395237, + "auxiliary_loss_mlp": 0.01270099, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01264553, + "epoch": 0.231354276266346, + "flos": 69756907461120.0, + "grad_norm": 0.7764718422559022, + "language_loss": 0.60909712, + "learning_rate": 3.589557265446198e-06, + "loss": 0.68575048, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.05557251, + "step": 3848, + "time_per_iteration": 3.0406246185302734 + }, + { + "auxiliary_loss_clip": 0.0652846, + "auxiliary_loss_mlp": 0.0128118, + "balance_loss_clip": 0.06295802, + "balance_loss_mlp": 0.01259925, + "epoch": 0.231414399519014, + "flos": 18840597275520.0, + "grad_norm": 2.051565204924659, + "language_loss": 0.79345453, + "learning_rate": 3.589320871234923e-06, + "loss": 0.87155092, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21252441, + "step": 3849, + "time_per_iteration": 2.508357048034668 + }, + { + "auxiliary_loss_clip": 0.06525768, + "auxiliary_loss_mlp": 0.01279584, + "balance_loss_clip": 0.06294318, + "balance_loss_mlp": 0.01257995, + "epoch": 0.23147452277168196, + "flos": 36143949945600.0, + "grad_norm": 1.9799304996672493, + "language_loss": 0.72033536, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7983889, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.21594238, + "step": 3850, + "time_per_iteration": 2.6283209323883057 + }, + { + "auxiliary_loss_clip": 0.06522007, + "auxiliary_loss_mlp": 0.012814, + "balance_loss_clip": 0.06293751, + "balance_loss_mlp": 0.01260562, + "epoch": 0.23153464602434992, + "flos": 20819091081600.0, + "grad_norm": 2.1585980033328216, + "language_loss": 0.76770389, + "learning_rate": 3.588847902019718e-06, + "loss": 0.84573799, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20825195, + "step": 3851, + "time_per_iteration": 3.9542527198791504 + }, + { + "auxiliary_loss_clip": 0.06522575, + "auxiliary_loss_mlp": 0.01285563, + "balance_loss_clip": 0.06294242, + "balance_loss_mlp": 0.01264367, + "epoch": 0.2315947692770179, + "flos": 19945606993920.0, + "grad_norm": 4.396515099862161, + "language_loss": 0.70780337, + "learning_rate": 3.588611327033723e-06, + "loss": 0.78588474, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21191406, + "step": 3852, + "time_per_iteration": 2.5292365550994873 + }, + { + "auxiliary_loss_clip": 0.06530476, + "auxiliary_loss_mlp": 0.01287483, + "balance_loss_clip": 0.0629744, + "balance_loss_mlp": 0.01267027, + "epoch": 0.23165489252968585, + "flos": 12859805197440.0, + "grad_norm": 2.0519661349019906, + "language_loss": 0.68142366, + "learning_rate": 3.588374691807428e-06, + "loss": 0.75960326, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.20471191, + "step": 3853, + "time_per_iteration": 2.524214267730713 + }, + { + "auxiliary_loss_clip": 0.06532255, + "auxiliary_loss_mlp": 0.0127975, + "balance_loss_clip": 0.06299816, + "balance_loss_mlp": 0.01258579, + "epoch": 0.23171501578235382, + "flos": 30636202492800.0, + "grad_norm": 2.067759569090495, + "language_loss": 0.80620718, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88432729, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.21166992, + "step": 3854, + "time_per_iteration": 3.9913628101348877 + }, + { + "auxiliary_loss_clip": 0.06540599, + "auxiliary_loss_mlp": 0.0128392, + "balance_loss_clip": 0.06299743, + "balance_loss_mlp": 0.0126201, + "epoch": 0.23177513903502178, + "flos": 23849709880320.0, + "grad_norm": 1.9679065377847755, + "language_loss": 0.66096866, + "learning_rate": 3.587901240669831e-06, + "loss": 0.73921382, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.21899414, + "step": 3855, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.06526054, + "auxiliary_loss_mlp": 0.0129156, + "balance_loss_clip": 0.06295231, + "balance_loss_mlp": 0.0126972, + "epoch": 0.23183526228768978, + "flos": 29578040006400.0, + "grad_norm": 1.903884891832667, + "language_loss": 0.71179903, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.78997517, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21838379, + "step": 3856, + "time_per_iteration": 2.602130174636841 + }, + { + "auxiliary_loss_clip": 0.06526691, + "auxiliary_loss_mlp": 0.01281572, + "balance_loss_clip": 0.06295416, + "balance_loss_mlp": 0.01261032, + "epoch": 0.23189538554035774, + "flos": 34467155164800.0, + "grad_norm": 1.5724941960823864, + "language_loss": 0.77830631, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85638893, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20532227, + "step": 3857, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.06534412, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06299518, + "balance_loss_mlp": 0.01259813, + "epoch": 0.2319555087930257, + "flos": 18009558080640.0, + "grad_norm": 2.2572913357008804, + "language_loss": 0.91563249, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99379921, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2244873, + "step": 3858, + "time_per_iteration": 2.532270908355713 + }, + { + "auxiliary_loss_clip": 0.06524485, + "auxiliary_loss_mlp": 0.01281992, + "balance_loss_clip": 0.06299204, + "balance_loss_mlp": 0.01261833, + "epoch": 0.23201563204569367, + "flos": 23149709671680.0, + "grad_norm": 2.204043049012761, + "language_loss": 0.77328205, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85134679, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20153809, + "step": 3859, + "time_per_iteration": 2.539982318878174 + }, + { + "auxiliary_loss_clip": 0.06526206, + "auxiliary_loss_mlp": 0.01282174, + "balance_loss_clip": 0.0629694, + "balance_loss_mlp": 0.01261098, + "epoch": 0.23207575529836164, + "flos": 20674300026240.0, + "grad_norm": 1.845949683873727, + "language_loss": 0.84980345, + "learning_rate": 3.58671655924898e-06, + "loss": 0.9278872, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21057129, + "step": 3860, + "time_per_iteration": 2.5464277267456055 + }, + { + "auxiliary_loss_clip": 0.06522566, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 0.06296555, + "balance_loss_mlp": 0.01254927, + "epoch": 0.2321358785510296, + "flos": 16477805917440.0, + "grad_norm": 2.2860023761203423, + "language_loss": 0.83316106, + "learning_rate": 3.586479442423508e-06, + "loss": 0.91114187, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.20581055, + "step": 3861, + "time_per_iteration": 2.611527681350708 + }, + { + "auxiliary_loss_clip": 0.06526297, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 0.06296666, + "balance_loss_mlp": 0.01261198, + "epoch": 0.2321960018036976, + "flos": 21622737191040.0, + "grad_norm": 1.932164160561112, + "language_loss": 0.86100018, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93908012, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2052002, + "step": 3862, + "time_per_iteration": 2.599078893661499 + }, + { + "auxiliary_loss_clip": 0.06517789, + "auxiliary_loss_mlp": 0.01277863, + "balance_loss_clip": 0.0629621, + "balance_loss_mlp": 0.0125898, + "epoch": 0.23225612505636556, + "flos": 22277734957440.0, + "grad_norm": 1.8279700206037066, + "language_loss": 0.75524014, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.83319664, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18884277, + "step": 3863, + "time_per_iteration": 2.5592801570892334 + }, + { + "auxiliary_loss_clip": 0.06518993, + "auxiliary_loss_mlp": 0.01279608, + "balance_loss_clip": 0.06295245, + "balance_loss_mlp": 0.01260237, + "epoch": 0.23231624830903352, + "flos": 17057431336320.0, + "grad_norm": 1.8656538002376628, + "language_loss": 0.7504397, + "learning_rate": 3.58576773102631e-06, + "loss": 0.82842577, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.19372559, + "step": 3864, + "time_per_iteration": 2.549480438232422 + }, + { + "auxiliary_loss_clip": 0.06521947, + "auxiliary_loss_mlp": 0.01276148, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255572, + "epoch": 0.2323763715617015, + "flos": 34648353619200.0, + "grad_norm": 2.1960138476201023, + "language_loss": 0.70505309, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.78303403, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20568848, + "step": 3865, + "time_per_iteration": 2.6358752250671387 + }, + { + "auxiliary_loss_clip": 0.06539118, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06299968, + "balance_loss_mlp": 0.01256464, + "epoch": 0.23243649481436945, + "flos": 25557922742400.0, + "grad_norm": 1.8533317501805489, + "language_loss": 0.95648015, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.03467083, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.23510742, + "step": 3866, + "time_per_iteration": 2.5805771350860596 + }, + { + "auxiliary_loss_clip": 0.06523386, + "auxiliary_loss_mlp": 0.0128215, + "balance_loss_clip": 0.06294955, + "balance_loss_mlp": 0.01260561, + "epoch": 0.23249661806703742, + "flos": 20489411992320.0, + "grad_norm": 3.3036871554572285, + "language_loss": 0.74161094, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.81966627, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21569824, + "step": 3867, + "time_per_iteration": 2.485872268676758 + }, + { + "auxiliary_loss_clip": 0.06527717, + "auxiliary_loss_mlp": 0.01278812, + "balance_loss_clip": 0.06298171, + "balance_loss_mlp": 0.01257271, + "epoch": 0.23255674131970538, + "flos": 20382956979840.0, + "grad_norm": 1.7596317335066716, + "language_loss": 0.82912898, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90719432, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2154541, + "step": 3868, + "time_per_iteration": 2.5404841899871826 + }, + { + "auxiliary_loss_clip": 0.06518516, + "auxiliary_loss_mlp": 0.01279395, + "balance_loss_clip": 0.0629604, + "balance_loss_mlp": 0.01260321, + "epoch": 0.23261686457237338, + "flos": 17061833675520.0, + "grad_norm": 1.6597028261056146, + "language_loss": 0.73686016, + "learning_rate": 3.58458034283495e-06, + "loss": 0.81483924, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.1907959, + "step": 3869, + "time_per_iteration": 2.4850685596466064 + }, + { + "auxiliary_loss_clip": 0.06524374, + "auxiliary_loss_mlp": 0.01289937, + "balance_loss_clip": 0.06296247, + "balance_loss_mlp": 0.01268241, + "epoch": 0.23267698782504134, + "flos": 29177726325120.0, + "grad_norm": 1.8030595092782438, + "language_loss": 0.8079325, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.88607562, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21716309, + "step": 3870, + "time_per_iteration": 2.5915870666503906 + }, + { + "auxiliary_loss_clip": 0.06532744, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.0126178, + "epoch": 0.2327371110777093, + "flos": 21180355960320.0, + "grad_norm": 1.9640097574691695, + "language_loss": 0.71693742, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.79509664, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21411133, + "step": 3871, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.065286, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 0.06295659, + "balance_loss_mlp": 0.01260034, + "epoch": 0.23279723433037727, + "flos": 24869997521280.0, + "grad_norm": 2.5352867939179484, + "language_loss": 0.69289309, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.77098656, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20715332, + "step": 3872, + "time_per_iteration": 2.5636072158813477 + }, + { + "auxiliary_loss_clip": 0.06535204, + "auxiliary_loss_mlp": 0.01285984, + "balance_loss_clip": 0.06299452, + "balance_loss_mlp": 0.01263894, + "epoch": 0.23285735758304524, + "flos": 38809823921280.0, + "grad_norm": 2.0709139139802497, + "language_loss": 0.78303361, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.86124545, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.22094727, + "step": 3873, + "time_per_iteration": 2.671551465988159 + }, + { + "auxiliary_loss_clip": 0.06419215, + "auxiliary_loss_mlp": 0.01286246, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01280601, + "epoch": 0.2329174808357132, + "flos": 53962274280960.0, + "grad_norm": 0.8377063316545934, + "language_loss": 0.60286367, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.67991829, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05636597, + "step": 3874, + "time_per_iteration": 3.087822675704956 + }, + { + "auxiliary_loss_clip": 0.06525364, + "auxiliary_loss_mlp": 0.01281697, + "balance_loss_clip": 0.06298245, + "balance_loss_mlp": 0.012608, + "epoch": 0.23297760408838117, + "flos": 21222549290880.0, + "grad_norm": 2.3064833177652773, + "language_loss": 0.81324208, + "learning_rate": 3.583153494218927e-06, + "loss": 0.89131272, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.20898438, + "step": 3875, + "time_per_iteration": 2.560511589050293 + }, + { + "auxiliary_loss_clip": 0.06520373, + "auxiliary_loss_mlp": 0.01275593, + "balance_loss_clip": 0.06294609, + "balance_loss_mlp": 0.01255983, + "epoch": 0.23303772734104916, + "flos": 28410613395840.0, + "grad_norm": 2.285945976693144, + "language_loss": 0.62077069, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.69873035, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19628906, + "step": 3876, + "time_per_iteration": 2.63901948928833 + }, + { + "auxiliary_loss_clip": 0.06525883, + "auxiliary_loss_mlp": 0.01277799, + "balance_loss_clip": 0.06296121, + "balance_loss_mlp": 0.01258034, + "epoch": 0.23309785059371713, + "flos": 24321328986240.0, + "grad_norm": 1.9984006432494335, + "language_loss": 0.71087664, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.78891349, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19787598, + "step": 3877, + "time_per_iteration": 2.533858299255371 + }, + { + "auxiliary_loss_clip": 0.06524412, + "auxiliary_loss_mlp": 0.01274037, + "balance_loss_clip": 0.06297307, + "balance_loss_mlp": 0.01253485, + "epoch": 0.2331579738463851, + "flos": 15997633695360.0, + "grad_norm": 2.4085120625047143, + "language_loss": 0.81286502, + "learning_rate": 3.582439259339073e-06, + "loss": 0.89084947, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20556641, + "step": 3878, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06534204, + "auxiliary_loss_mlp": 0.01280932, + "balance_loss_clip": 0.06299698, + "balance_loss_mlp": 0.0126013, + "epoch": 0.23321809709905306, + "flos": 36435418773120.0, + "grad_norm": 2.3738521781051207, + "language_loss": 0.75046253, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82861388, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.20788574, + "step": 3879, + "time_per_iteration": 2.6389944553375244 + }, + { + "auxiliary_loss_clip": 0.06528227, + "auxiliary_loss_mlp": 0.01279465, + "balance_loss_clip": 0.06299725, + "balance_loss_mlp": 0.01257972, + "epoch": 0.23327822035172102, + "flos": 21331184509440.0, + "grad_norm": 4.081669167605711, + "language_loss": 0.90526301, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.98333991, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.21496582, + "step": 3880, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.06530303, + "auxiliary_loss_mlp": 0.01278258, + "balance_loss_clip": 0.06297769, + "balance_loss_mlp": 0.0125841, + "epoch": 0.233338343604389, + "flos": 19177907086080.0, + "grad_norm": 1.8856968798779488, + "language_loss": 0.72716117, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80524671, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.19848633, + "step": 3881, + "time_per_iteration": 2.528083324432373 + }, + { + "auxiliary_loss_clip": 0.0653114, + "auxiliary_loss_mlp": 0.01278184, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0125805, + "epoch": 0.23339846685705698, + "flos": 26915939464320.0, + "grad_norm": 1.6578041146422486, + "language_loss": 0.68699455, + "learning_rate": 3.581486106120537e-06, + "loss": 0.76508778, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.20129395, + "step": 3882, + "time_per_iteration": 2.575275182723999 + }, + { + "auxiliary_loss_clip": 0.06529698, + "auxiliary_loss_mlp": 0.0127867, + "balance_loss_clip": 0.0629693, + "balance_loss_mlp": 0.01258226, + "epoch": 0.23345859010972494, + "flos": 32351375243520.0, + "grad_norm": 2.0584115637368767, + "language_loss": 0.77458596, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.8526696, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 3883, + "time_per_iteration": 2.626533269882202 + }, + { + "auxiliary_loss_clip": 0.06405331, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01257663, + "epoch": 0.2335187133623929, + "flos": 58505805273600.0, + "grad_norm": 0.7704933603606158, + "language_loss": 0.59193355, + "learning_rate": 3.58100916965445e-06, + "loss": 0.66861278, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.04925537, + "step": 3884, + "time_per_iteration": 4.6365087032318115 + }, + { + "auxiliary_loss_clip": 0.06533933, + "auxiliary_loss_mlp": 0.01280044, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01260017, + "epoch": 0.23357883661506088, + "flos": 24509822745600.0, + "grad_norm": 1.6610169782824564, + "language_loss": 0.80755335, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.88569313, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.20031738, + "step": 3885, + "time_per_iteration": 2.6180286407470703 + }, + { + "auxiliary_loss_clip": 0.06523974, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 0.06296945, + "balance_loss_mlp": 0.01265687, + "epoch": 0.23363895986772884, + "flos": 18953760614400.0, + "grad_norm": 2.3207575064623613, + "language_loss": 0.88500953, + "learning_rate": 3.580531993380261e-06, + "loss": 0.96311754, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21130371, + "step": 3886, + "time_per_iteration": 2.5116477012634277 + }, + { + "auxiliary_loss_clip": 0.06532702, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 0.06302926, + "balance_loss_mlp": 0.01262518, + "epoch": 0.2336990831203968, + "flos": 31694993884800.0, + "grad_norm": 1.8877154320423692, + "language_loss": 0.74203557, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.82019114, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20336914, + "step": 3887, + "time_per_iteration": 4.024793863296509 + }, + { + "auxiliary_loss_clip": 0.06531121, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 0.06301375, + "balance_loss_mlp": 0.01261206, + "epoch": 0.23375920637306477, + "flos": 27717237659520.0, + "grad_norm": 1.8176198265631485, + "language_loss": 0.84478307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.92290735, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20092773, + "step": 3888, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.06524558, + "auxiliary_loss_mlp": 0.01283639, + "balance_loss_clip": 0.06298919, + "balance_loss_mlp": 0.01263934, + "epoch": 0.23381932962573276, + "flos": 17681346437760.0, + "grad_norm": 2.056965631559896, + "language_loss": 0.88319886, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.96128076, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19689941, + "step": 3889, + "time_per_iteration": 2.524937152862549 + }, + { + "auxiliary_loss_clip": 0.06524722, + "auxiliary_loss_mlp": 0.01282198, + "balance_loss_clip": 0.06299812, + "balance_loss_mlp": 0.01262708, + "epoch": 0.23387945287840073, + "flos": 14395833918720.0, + "grad_norm": 2.5361674913720487, + "language_loss": 0.7777229, + "learning_rate": 3.579576921697125e-06, + "loss": 0.85579211, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19470215, + "step": 3890, + "time_per_iteration": 4.02982497215271 + }, + { + "auxiliary_loss_clip": 0.06526545, + "auxiliary_loss_mlp": 0.01284178, + "balance_loss_clip": 0.06297928, + "balance_loss_mlp": 0.01264008, + "epoch": 0.2339395761310687, + "flos": 46108451888640.0, + "grad_norm": 1.897831891943022, + "language_loss": 0.74213481, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82024205, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20166016, + "step": 3891, + "time_per_iteration": 2.7951042652130127 + }, + { + "auxiliary_loss_clip": 0.06524959, + "auxiliary_loss_mlp": 0.01281513, + "balance_loss_clip": 0.06301059, + "balance_loss_mlp": 0.01262821, + "epoch": 0.23399969938373666, + "flos": 22388508455040.0, + "grad_norm": 1.6273389699862264, + "language_loss": 0.82863498, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.90669972, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18688965, + "step": 3892, + "time_per_iteration": 2.530782461166382 + }, + { + "auxiliary_loss_clip": 0.06531358, + "auxiliary_loss_mlp": 0.01281181, + "balance_loss_clip": 0.06301633, + "balance_loss_mlp": 0.01260951, + "epoch": 0.23405982263640462, + "flos": 43518746874240.0, + "grad_norm": 1.4575042253356143, + "language_loss": 0.65593249, + "learning_rate": 3.578859988977082e-06, + "loss": 0.7340579, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20227051, + "step": 3893, + "time_per_iteration": 4.212572813034058 + }, + { + "auxiliary_loss_clip": 0.06519544, + "auxiliary_loss_mlp": 0.01283369, + "balance_loss_clip": 0.06297972, + "balance_loss_mlp": 0.01263259, + "epoch": 0.2341199458890726, + "flos": 22571216282880.0, + "grad_norm": 2.0084649252152564, + "language_loss": 0.79620147, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.87423062, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.20117188, + "step": 3894, + "time_per_iteration": 2.580109119415283 + }, + { + "auxiliary_loss_clip": 0.06524212, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 0.06300013, + "balance_loss_mlp": 0.01257763, + "epoch": 0.23418006914174055, + "flos": 25641764352000.0, + "grad_norm": 1.5130292757453454, + "language_loss": 0.82681906, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.90482563, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18664551, + "step": 3895, + "time_per_iteration": 2.583759069442749 + }, + { + "auxiliary_loss_clip": 0.06520028, + "auxiliary_loss_mlp": 0.01278233, + "balance_loss_clip": 0.06295593, + "balance_loss_mlp": 0.0125885, + "epoch": 0.23424019239440855, + "flos": 13549826770560.0, + "grad_norm": 2.4592405022159496, + "language_loss": 0.81334293, + "learning_rate": 3.578142517422292e-06, + "loss": 0.89132559, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.19372559, + "step": 3896, + "time_per_iteration": 2.536252021789551 + }, + { + "auxiliary_loss_clip": 0.06530771, + "auxiliary_loss_mlp": 0.012867, + "balance_loss_clip": 0.06299435, + "balance_loss_mlp": 0.01264253, + "epoch": 0.2343003156470765, + "flos": 22426131738240.0, + "grad_norm": 3.0940729647414598, + "language_loss": 0.83988011, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91805482, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 3897, + "time_per_iteration": 2.572230577468872 + }, + { + "auxiliary_loss_clip": 0.06528857, + "auxiliary_loss_mlp": 0.01279177, + "balance_loss_clip": 0.06296414, + "balance_loss_mlp": 0.01258626, + "epoch": 0.23436043889974448, + "flos": 14795644475520.0, + "grad_norm": 2.317273344502078, + "language_loss": 0.79819012, + "learning_rate": 3.577663903820705e-06, + "loss": 0.87627041, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20544434, + "step": 3898, + "time_per_iteration": 2.5207583904266357 + }, + { + "auxiliary_loss_clip": 0.0651897, + "auxiliary_loss_mlp": 0.01278878, + "balance_loss_clip": 0.06297988, + "balance_loss_mlp": 0.0126021, + "epoch": 0.23442056215241244, + "flos": 22972242723840.0, + "grad_norm": 1.88849810547605, + "language_loss": 0.7476474, + "learning_rate": 3.577424507277614e-06, + "loss": 0.82562584, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18676758, + "step": 3899, + "time_per_iteration": 2.535256862640381 + }, + { + "auxiliary_loss_clip": 0.06525272, + "auxiliary_loss_mlp": 0.01280019, + "balance_loss_clip": 0.06296974, + "balance_loss_mlp": 0.01259515, + "epoch": 0.2344806854050804, + "flos": 23077901122560.0, + "grad_norm": 1.7218865416029, + "language_loss": 0.75599915, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.83405209, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20507812, + "step": 3900, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.06524841, + "auxiliary_loss_mlp": 0.01281356, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01260959, + "epoch": 0.23454080865774837, + "flos": 16332805226880.0, + "grad_norm": 2.155964713283421, + "language_loss": 0.67468774, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75274968, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20410156, + "step": 3901, + "time_per_iteration": 2.536736249923706 + }, + { + "auxiliary_loss_clip": 0.06415819, + "auxiliary_loss_mlp": 0.01256149, + "balance_loss_clip": 0.06299057, + "balance_loss_mlp": 0.01251181, + "epoch": 0.23460093191041637, + "flos": 67779545685120.0, + "grad_norm": 0.7514179301091559, + "language_loss": 0.58278525, + "learning_rate": 3.576705958788091e-06, + "loss": 0.65950489, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.0496521, + "step": 3902, + "time_per_iteration": 3.134718894958496 + }, + { + "auxiliary_loss_clip": 0.06519462, + "auxiliary_loss_mlp": 0.01278211, + "balance_loss_clip": 0.06292997, + "balance_loss_mlp": 0.01258375, + "epoch": 0.23466105516308433, + "flos": 20082725400960.0, + "grad_norm": 4.781089560028637, + "language_loss": 0.80931306, + "learning_rate": 3.576466323035108e-06, + "loss": 0.88728976, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19836426, + "step": 3903, + "time_per_iteration": 2.525059938430786 + }, + { + "auxiliary_loss_clip": 0.06522641, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06295069, + "balance_loss_mlp": 0.01258955, + "epoch": 0.2347211784157523, + "flos": 24542708273280.0, + "grad_norm": 1.8578223556950417, + "language_loss": 0.82988703, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90790236, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.19909668, + "step": 3904, + "time_per_iteration": 2.5903875827789307 + }, + { + "auxiliary_loss_clip": 0.0652332, + "auxiliary_loss_mlp": 0.01285911, + "balance_loss_clip": 0.06295672, + "balance_loss_mlp": 0.01265562, + "epoch": 0.23478130166842026, + "flos": 23811751180800.0, + "grad_norm": 1.985666710181995, + "language_loss": 0.7223646, + "learning_rate": 3.57598687219895e-06, + "loss": 0.80045688, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20361328, + "step": 3905, + "time_per_iteration": 2.5441884994506836 + }, + { + "auxiliary_loss_clip": 0.06517074, + "auxiliary_loss_mlp": 0.01274876, + "balance_loss_clip": 0.06294023, + "balance_loss_mlp": 0.01255564, + "epoch": 0.23484142492108823, + "flos": 24099823918080.0, + "grad_norm": 2.433861192511871, + "language_loss": 0.71703601, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.19311523, + "step": 3906, + "time_per_iteration": 2.698309898376465 + }, + { + "auxiliary_loss_clip": 0.06533175, + "auxiliary_loss_mlp": 0.01285298, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01264341, + "epoch": 0.2349015481737562, + "flos": 29103486007680.0, + "grad_norm": 2.7858195598302014, + "language_loss": 0.74089986, + "learning_rate": 3.575507182316473e-06, + "loss": 0.81908458, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.20959473, + "step": 3907, + "time_per_iteration": 2.578900098800659 + }, + { + "auxiliary_loss_clip": 0.06524273, + "auxiliary_loss_mlp": 0.01280946, + "balance_loss_clip": 0.06294693, + "balance_loss_mlp": 0.01260418, + "epoch": 0.23496167142642416, + "flos": 18922258679040.0, + "grad_norm": 2.1308722973133385, + "language_loss": 0.73705935, + "learning_rate": 3.575267247755601e-06, + "loss": 0.81511152, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2052002, + "step": 3908, + "time_per_iteration": 2.599888801574707 + }, + { + "auxiliary_loss_clip": 0.06415461, + "auxiliary_loss_mlp": 0.01265268, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01259901, + "epoch": 0.23502179467909215, + "flos": 55884906541440.0, + "grad_norm": 1.2475277524680826, + "language_loss": 0.73364127, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.81044865, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.05374146, + "step": 3909, + "time_per_iteration": 2.9221227169036865 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 0.06297419, + "balance_loss_mlp": 0.01265013, + "epoch": 0.23508191793176011, + "flos": 23408083336320.0, + "grad_norm": 1.6005271399570604, + "language_loss": 0.88581395, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9639076, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20288086, + "step": 3910, + "time_per_iteration": 2.571974277496338 + }, + { + "auxiliary_loss_clip": 0.06520193, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01258658, + "epoch": 0.23514204118442808, + "flos": 20053864869120.0, + "grad_norm": 1.9643755437340527, + "language_loss": 0.76589572, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.84388608, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2019043, + "step": 3911, + "time_per_iteration": 2.5159506797790527 + }, + { + "auxiliary_loss_clip": 0.06514487, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 0.06293596, + "balance_loss_mlp": 0.01272568, + "epoch": 0.23520216443709605, + "flos": 21587126405760.0, + "grad_norm": 1.5390832092388007, + "language_loss": 0.82200038, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.90005672, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 3912, + "time_per_iteration": 2.53330135345459 + }, + { + "auxiliary_loss_clip": 0.06515642, + "auxiliary_loss_mlp": 0.01288785, + "balance_loss_clip": 0.06294793, + "balance_loss_mlp": 0.01269604, + "epoch": 0.235262287689764, + "flos": 23192573834880.0, + "grad_norm": 1.8330232089961167, + "language_loss": 0.72023201, + "learning_rate": 3.574066679118909e-06, + "loss": 0.79827625, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19177246, + "step": 3913, + "time_per_iteration": 2.5643818378448486 + }, + { + "auxiliary_loss_clip": 0.06528541, + "auxiliary_loss_mlp": 0.01277731, + "balance_loss_clip": 0.0629672, + "balance_loss_mlp": 0.01257238, + "epoch": 0.23532241094243198, + "flos": 23191903002240.0, + "grad_norm": 1.784539383466316, + "language_loss": 0.76976919, + "learning_rate": 3.57382638628884e-06, + "loss": 0.84783185, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20483398, + "step": 3914, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.06525879, + "auxiliary_loss_mlp": 0.01279953, + "balance_loss_clip": 0.06294835, + "balance_loss_mlp": 0.01259759, + "epoch": 0.23538253419509997, + "flos": 17025007006080.0, + "grad_norm": 2.4875564397369745, + "language_loss": 0.90170735, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.97976559, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2019043, + "step": 3915, + "time_per_iteration": 2.563430070877075 + }, + { + "auxiliary_loss_clip": 0.06418007, + "auxiliary_loss_mlp": 0.01258116, + "balance_loss_clip": 0.06301998, + "balance_loss_mlp": 0.0125336, + "epoch": 0.23544265744776793, + "flos": 63465276263040.0, + "grad_norm": 0.7933859009920101, + "language_loss": 0.59378946, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6705507, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04748535, + "step": 3916, + "time_per_iteration": 3.0965490341186523 + }, + { + "auxiliary_loss_clip": 0.06410776, + "auxiliary_loss_mlp": 0.01260488, + "balance_loss_clip": 0.06295535, + "balance_loss_mlp": 0.01255756, + "epoch": 0.2355027807004359, + "flos": 70537395116160.0, + "grad_norm": 0.7426668339088592, + "language_loss": 0.49443412, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.57114673, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04724121, + "step": 3917, + "time_per_iteration": 3.180136203765869 + }, + { + "auxiliary_loss_clip": 0.06525698, + "auxiliary_loss_mlp": 0.01279416, + "balance_loss_clip": 0.06297344, + "balance_loss_mlp": 0.01259687, + "epoch": 0.23556290395310386, + "flos": 21440742122880.0, + "grad_norm": 2.189382839240281, + "language_loss": 0.77017808, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.84822929, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19714355, + "step": 3918, + "time_per_iteration": 2.546833038330078 + }, + { + "auxiliary_loss_clip": 0.0652653, + "auxiliary_loss_mlp": 0.01274201, + "balance_loss_clip": 0.06294574, + "balance_loss_mlp": 0.01254353, + "epoch": 0.23562302720577183, + "flos": 18192223981440.0, + "grad_norm": 2.402769767514051, + "language_loss": 0.70165813, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.77966547, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.1986084, + "step": 3919, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.06516096, + "auxiliary_loss_mlp": 0.01279326, + "balance_loss_clip": 0.06294449, + "balance_loss_mlp": 0.0125999, + "epoch": 0.2356831504584398, + "flos": 33739091038080.0, + "grad_norm": 1.6359966895302622, + "language_loss": 0.71094656, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.78890085, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19335938, + "step": 3920, + "time_per_iteration": 2.672703504562378 + }, + { + "auxiliary_loss_clip": 0.065192, + "auxiliary_loss_mlp": 0.0127625, + "balance_loss_clip": 0.06295229, + "balance_loss_mlp": 0.0125707, + "epoch": 0.23574327371110776, + "flos": 24939122739840.0, + "grad_norm": 1.9300596293530992, + "language_loss": 0.77833009, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85628462, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.19189453, + "step": 3921, + "time_per_iteration": 2.5823934078216553 + }, + { + "auxiliary_loss_clip": 0.06519832, + "auxiliary_loss_mlp": 0.01273471, + "balance_loss_clip": 0.06293498, + "balance_loss_mlp": 0.01254898, + "epoch": 0.23580339696377575, + "flos": 17827940355840.0, + "grad_norm": 2.282195745019935, + "language_loss": 0.76750088, + "learning_rate": 3.571901895946612e-06, + "loss": 0.84543383, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18566895, + "step": 3922, + "time_per_iteration": 2.5005834102630615 + }, + { + "auxiliary_loss_clip": 0.06518443, + "auxiliary_loss_mlp": 0.01276376, + "balance_loss_clip": 0.06292558, + "balance_loss_mlp": 0.01257255, + "epoch": 0.23586352021644372, + "flos": 26293827225600.0, + "grad_norm": 2.0102031772622277, + "language_loss": 0.80626559, + "learning_rate": 3.571661066327956e-06, + "loss": 0.88421381, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19128418, + "step": 3923, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0652013, + "auxiliary_loss_mlp": 0.01275781, + "balance_loss_clip": 0.06296518, + "balance_loss_mlp": 0.01256326, + "epoch": 0.23592364346911168, + "flos": 14251965258240.0, + "grad_norm": 1.780788070615976, + "language_loss": 0.7507394, + "learning_rate": 3.571420177111754e-06, + "loss": 0.82869852, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3924, + "time_per_iteration": 3.9297289848327637 + }, + { + "auxiliary_loss_clip": 0.06516001, + "auxiliary_loss_mlp": 0.01276934, + "balance_loss_clip": 0.06293369, + "balance_loss_mlp": 0.01258039, + "epoch": 0.23598376672177965, + "flos": 18593837400960.0, + "grad_norm": 1.7528516859224217, + "language_loss": 0.83231425, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.91024363, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 3925, + "time_per_iteration": 2.5267770290374756 + }, + { + "auxiliary_loss_clip": 0.06520985, + "auxiliary_loss_mlp": 0.01279855, + "balance_loss_clip": 0.06293195, + "balance_loss_mlp": 0.01259673, + "epoch": 0.2360438899744476, + "flos": 22682325196800.0, + "grad_norm": 1.753261892654821, + "language_loss": 0.60038519, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20178223, + "step": 3926, + "time_per_iteration": 4.023118257522583 + }, + { + "auxiliary_loss_clip": 0.06514051, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06293727, + "balance_loss_mlp": 0.01257735, + "epoch": 0.23610401322711558, + "flos": 29577872298240.0, + "grad_norm": 1.9607796947198142, + "language_loss": 0.72402066, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80192792, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1895752, + "step": 3927, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.06515504, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01256373, + "epoch": 0.23616413647978354, + "flos": 17864347754880.0, + "grad_norm": 2.08357001670468, + "language_loss": 0.75570691, + "learning_rate": 3.570456024454221e-06, + "loss": 0.83361489, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18920898, + "step": 3928, + "time_per_iteration": 2.601884365081787 + }, + { + "auxiliary_loss_clip": 0.06522287, + "auxiliary_loss_mlp": 0.01280424, + "balance_loss_clip": 0.06293722, + "balance_loss_mlp": 0.01260338, + "epoch": 0.23622425973245154, + "flos": 11039393318400.0, + "grad_norm": 3.3378461006384788, + "language_loss": 0.82518888, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.903216, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20080566, + "step": 3929, + "time_per_iteration": 3.9035136699676514 + }, + { + "auxiliary_loss_clip": 0.0652993, + "auxiliary_loss_mlp": 0.01281554, + "balance_loss_clip": 0.06295136, + "balance_loss_mlp": 0.01261228, + "epoch": 0.2362843829851195, + "flos": 23410766666880.0, + "grad_norm": 2.0127268398029607, + "language_loss": 0.7229315, + "learning_rate": 3.569973590777789e-06, + "loss": 0.80104637, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.203125, + "step": 3930, + "time_per_iteration": 2.5537455081939697 + }, + { + "auxiliary_loss_clip": 0.06516138, + "auxiliary_loss_mlp": 0.01275778, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01257312, + "epoch": 0.23634450623778747, + "flos": 39539103932160.0, + "grad_norm": 1.8975533795335693, + "language_loss": 0.74476141, + "learning_rate": 3.569732284634665e-06, + "loss": 0.82268059, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.18444824, + "step": 3931, + "time_per_iteration": 2.6975677013397217 + }, + { + "auxiliary_loss_clip": 0.06517775, + "auxiliary_loss_mlp": 0.01279269, + "balance_loss_clip": 0.06291172, + "balance_loss_mlp": 0.01260208, + "epoch": 0.23640462949045543, + "flos": 24214077360000.0, + "grad_norm": 2.102820580807434, + "language_loss": 0.8105433, + "learning_rate": 3.569490918967136e-06, + "loss": 0.88851368, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19055176, + "step": 3932, + "time_per_iteration": 2.539280652999878 + }, + { + "auxiliary_loss_clip": 0.06510118, + "auxiliary_loss_mlp": 0.01272436, + "balance_loss_clip": 0.06289183, + "balance_loss_mlp": 0.01254949, + "epoch": 0.2364647527431234, + "flos": 26184898517760.0, + "grad_norm": 1.6370407311570319, + "language_loss": 0.85819322, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.93601882, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.17480469, + "step": 3933, + "time_per_iteration": 4.0140979290008545 + }, + { + "auxiliary_loss_clip": 0.06528582, + "auxiliary_loss_mlp": 0.01277532, + "balance_loss_clip": 0.06296912, + "balance_loss_mlp": 0.01257314, + "epoch": 0.23652487599579136, + "flos": 22643444102400.0, + "grad_norm": 3.233125821654351, + "language_loss": 0.83709848, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.91515964, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.20214844, + "step": 3934, + "time_per_iteration": 2.542692184448242 + }, + { + "auxiliary_loss_clip": 0.06519171, + "auxiliary_loss_mlp": 0.01281493, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01262896, + "epoch": 0.23658499924845935, + "flos": 21768702203520.0, + "grad_norm": 1.7174434370199074, + "language_loss": 0.7898351, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.86784172, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.18615723, + "step": 3935, + "time_per_iteration": 2.5311288833618164 + }, + { + "auxiliary_loss_clip": 0.0651848, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06296465, + "balance_loss_mlp": 0.01258533, + "epoch": 0.23664512250112732, + "flos": 21805486945920.0, + "grad_norm": 1.7511193987533888, + "language_loss": 0.80239666, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.88034987, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1829834, + "step": 3936, + "time_per_iteration": 2.5497477054595947 + }, + { + "auxiliary_loss_clip": 0.06513149, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06288509, + "balance_loss_mlp": 0.01256593, + "epoch": 0.23670524575379528, + "flos": 22644450351360.0, + "grad_norm": 1.4782770271817958, + "language_loss": 0.79820013, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8760916, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19396973, + "step": 3937, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.06515164, + "auxiliary_loss_mlp": 0.0127913, + "balance_loss_clip": 0.06294726, + "balance_loss_mlp": 0.01261487, + "epoch": 0.23676536900646325, + "flos": 16730225942400.0, + "grad_norm": 2.2850190898814686, + "language_loss": 0.85810506, + "learning_rate": 3.568041475462147e-06, + "loss": 0.93604803, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.1763916, + "step": 3938, + "time_per_iteration": 2.568195343017578 + }, + { + "auxiliary_loss_clip": 0.06509314, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06288411, + "balance_loss_mlp": 0.01259393, + "epoch": 0.23682549225913122, + "flos": 11138720734080.0, + "grad_norm": 3.1023600205020876, + "language_loss": 0.94564033, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.02351999, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19287109, + "step": 3939, + "time_per_iteration": 2.4615180492401123 + }, + { + "auxiliary_loss_clip": 0.0652378, + "auxiliary_loss_mlp": 0.01277473, + "balance_loss_clip": 0.06294175, + "balance_loss_mlp": 0.0125803, + "epoch": 0.23688561551179918, + "flos": 22564843372800.0, + "grad_norm": 5.475058210638743, + "language_loss": 0.82803464, + "learning_rate": 3.567557851847088e-06, + "loss": 0.90604717, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19445801, + "step": 3940, + "time_per_iteration": 2.573552131652832 + }, + { + "auxiliary_loss_clip": 0.06531326, + "auxiliary_loss_mlp": 0.01276996, + "balance_loss_clip": 0.06295921, + "balance_loss_mlp": 0.0125679, + "epoch": 0.23694573876446715, + "flos": 18520771040640.0, + "grad_norm": 2.098492916494123, + "language_loss": 0.8946867, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.97276992, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2019043, + "step": 3941, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.06529268, + "auxiliary_loss_mlp": 0.01286958, + "balance_loss_clip": 0.06297106, + "balance_loss_mlp": 0.01267503, + "epoch": 0.23700586201713514, + "flos": 15340246087680.0, + "grad_norm": 1.8886698836060631, + "language_loss": 0.84989077, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.92805308, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.19458008, + "step": 3942, + "time_per_iteration": 2.56052827835083 + }, + { + "auxiliary_loss_clip": 0.06538361, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 0.06307331, + "balance_loss_mlp": 0.01265492, + "epoch": 0.2370659852698031, + "flos": 23953775051520.0, + "grad_norm": 2.0845511028002197, + "language_loss": 0.81156456, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.88980681, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.20373535, + "step": 3943, + "time_per_iteration": 2.539264678955078 + }, + { + "auxiliary_loss_clip": 0.06543057, + "auxiliary_loss_mlp": 0.01292355, + "balance_loss_clip": 0.06306483, + "balance_loss_mlp": 0.01271494, + "epoch": 0.23712610852247107, + "flos": 15336514581120.0, + "grad_norm": 2.5863771047568926, + "language_loss": 0.682428, + "learning_rate": 3.566589891386959e-06, + "loss": 0.76078212, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.20861816, + "step": 3944, + "time_per_iteration": 2.520453929901123 + }, + { + "auxiliary_loss_clip": 0.06529288, + "auxiliary_loss_mlp": 0.01297026, + "balance_loss_clip": 0.06299931, + "balance_loss_mlp": 0.01276963, + "epoch": 0.23718623177513903, + "flos": 19688658848640.0, + "grad_norm": 1.6926271274644824, + "language_loss": 0.76068223, + "learning_rate": 3.566347752735866e-06, + "loss": 0.83894539, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.20043945, + "step": 3945, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.06535566, + "auxiliary_loss_mlp": 0.01288141, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.0126859, + "epoch": 0.237246355027807, + "flos": 24980351748480.0, + "grad_norm": 1.7408538946114391, + "language_loss": 0.63962567, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.71786278, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.19555664, + "step": 3946, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.06535441, + "auxiliary_loss_mlp": 0.01289697, + "balance_loss_clip": 0.06306995, + "balance_loss_mlp": 0.01269324, + "epoch": 0.23730647828047496, + "flos": 15382816761600.0, + "grad_norm": 3.1254224655104252, + "language_loss": 0.77114201, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.84939343, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20385742, + "step": 3947, + "time_per_iteration": 2.495837926864624 + }, + { + "auxiliary_loss_clip": 0.06540522, + "auxiliary_loss_mlp": 0.01290208, + "balance_loss_clip": 0.06311937, + "balance_loss_mlp": 0.01270431, + "epoch": 0.23736660153314296, + "flos": 28158738422400.0, + "grad_norm": 1.595292591120463, + "language_loss": 0.80941439, + "learning_rate": 3.565620980442944e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19775391, + "step": 3948, + "time_per_iteration": 2.6460211277008057 + }, + { + "auxiliary_loss_clip": 0.06542704, + "auxiliary_loss_mlp": 0.01297731, + "balance_loss_clip": 0.06312679, + "balance_loss_mlp": 0.01277025, + "epoch": 0.23742672478581092, + "flos": 22092385726080.0, + "grad_norm": 1.753357741589714, + "language_loss": 0.80419362, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.88259804, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.20715332, + "step": 3949, + "time_per_iteration": 2.5428664684295654 + }, + { + "auxiliary_loss_clip": 0.06549721, + "auxiliary_loss_mlp": 0.01294419, + "balance_loss_clip": 0.06317213, + "balance_loss_mlp": 0.012732, + "epoch": 0.2374868480384789, + "flos": 19543238887680.0, + "grad_norm": 1.6923054699564082, + "language_loss": 0.73375976, + "learning_rate": 3.565136168723163e-06, + "loss": 0.81220114, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2121582, + "step": 3950, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.06527583, + "auxiliary_loss_mlp": 0.01288007, + "balance_loss_clip": 0.06302388, + "balance_loss_mlp": 0.01268957, + "epoch": 0.23754697129114685, + "flos": 19427769561600.0, + "grad_norm": 1.893051910973559, + "language_loss": 0.73254943, + "learning_rate": 3.564893673833495e-06, + "loss": 0.8107053, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.1907959, + "step": 3951, + "time_per_iteration": 2.501091957092285 + }, + { + "auxiliary_loss_clip": 0.06543966, + "auxiliary_loss_mlp": 0.01301622, + "balance_loss_clip": 0.06315006, + "balance_loss_mlp": 0.01280332, + "epoch": 0.23760709454381482, + "flos": 19507208832000.0, + "grad_norm": 1.727887568846887, + "language_loss": 0.7427932, + "learning_rate": 3.564651119602903e-06, + "loss": 0.82124901, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2130127, + "step": 3952, + "time_per_iteration": 2.5467019081115723 + }, + { + "auxiliary_loss_clip": 0.06536686, + "auxiliary_loss_mlp": 0.01292988, + "balance_loss_clip": 0.0630881, + "balance_loss_mlp": 0.01273379, + "epoch": 0.23766721779648278, + "flos": 27644045518080.0, + "grad_norm": 3.105577179216311, + "language_loss": 0.71633041, + "learning_rate": 3.564408506040583e-06, + "loss": 0.79462719, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.19604492, + "step": 3953, + "time_per_iteration": 2.599946975708008 + }, + { + "auxiliary_loss_clip": 0.06537458, + "auxiliary_loss_mlp": 0.01292831, + "balance_loss_clip": 0.06305911, + "balance_loss_mlp": 0.01272673, + "epoch": 0.23772734104915075, + "flos": 23411102083200.0, + "grad_norm": 6.547469437533346, + "language_loss": 0.82534778, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.90365064, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20166016, + "step": 3954, + "time_per_iteration": 2.595163583755493 + }, + { + "auxiliary_loss_clip": 0.06538694, + "auxiliary_loss_mlp": 0.01291334, + "balance_loss_clip": 0.0630859, + "balance_loss_mlp": 0.01271486, + "epoch": 0.23778746430181874, + "flos": 15710902623360.0, + "grad_norm": 2.2065720754909606, + "language_loss": 0.66202033, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.74032056, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.19848633, + "step": 3955, + "time_per_iteration": 2.5345511436462402 + }, + { + "auxiliary_loss_clip": 0.06527859, + "auxiliary_loss_mlp": 0.01285762, + "balance_loss_clip": 0.06301668, + "balance_loss_mlp": 0.01266081, + "epoch": 0.2378475875544867, + "flos": 19432381536000.0, + "grad_norm": 1.4478942147045952, + "language_loss": 0.84203303, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.92016923, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19689941, + "step": 3956, + "time_per_iteration": 2.5458483695983887 + }, + { + "auxiliary_loss_clip": 0.06526335, + "auxiliary_loss_mlp": 0.01287929, + "balance_loss_clip": 0.06303546, + "balance_loss_mlp": 0.01268438, + "epoch": 0.23790771080715467, + "flos": 22274338867200.0, + "grad_norm": 2.194064451149358, + "language_loss": 0.8561964, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.93433905, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.19494629, + "step": 3957, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.06532466, + "auxiliary_loss_mlp": 0.01283677, + "balance_loss_clip": 0.0630599, + "balance_loss_mlp": 0.01264008, + "epoch": 0.23796783405982264, + "flos": 20053445598720.0, + "grad_norm": 2.4454692262909856, + "language_loss": 0.7073434, + "learning_rate": 3.563194548575151e-06, + "loss": 0.78550482, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.19665527, + "step": 3958, + "time_per_iteration": 2.556201219558716 + }, + { + "auxiliary_loss_clip": 0.06533751, + "auxiliary_loss_mlp": 0.01277914, + "balance_loss_clip": 0.06301822, + "balance_loss_mlp": 0.01257303, + "epoch": 0.2380279573124906, + "flos": 14251084790400.0, + "grad_norm": 4.548053192599961, + "language_loss": 0.66760004, + "learning_rate": 3.562951579215745e-06, + "loss": 0.74571669, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.20617676, + "step": 3959, + "time_per_iteration": 2.491999626159668 + }, + { + "auxiliary_loss_clip": 0.06529753, + "auxiliary_loss_mlp": 0.01278003, + "balance_loss_clip": 0.06303047, + "balance_loss_mlp": 0.01259228, + "epoch": 0.23808808056515857, + "flos": 21185638767360.0, + "grad_norm": 1.7806564555446132, + "language_loss": 0.72341377, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.80149138, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18774414, + "step": 3960, + "time_per_iteration": 2.523761034011841 + }, + { + "auxiliary_loss_clip": 0.0652384, + "auxiliary_loss_mlp": 0.0127522, + "balance_loss_clip": 0.06296217, + "balance_loss_mlp": 0.01255169, + "epoch": 0.23814820381782653, + "flos": 22534850810880.0, + "grad_norm": 1.610971251516654, + "language_loss": 0.7476449, + "learning_rate": 3.562465462704307e-06, + "loss": 0.82563543, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20031738, + "step": 3961, + "time_per_iteration": 2.5350120067596436 + }, + { + "auxiliary_loss_clip": 0.06528293, + "auxiliary_loss_mlp": 0.01283237, + "balance_loss_clip": 0.06297825, + "balance_loss_mlp": 0.01261505, + "epoch": 0.23820832707049452, + "flos": 22309991579520.0, + "grad_norm": 2.008938617955162, + "language_loss": 0.66267157, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.74078679, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.21728516, + "step": 3962, + "time_per_iteration": 2.554936170578003 + }, + { + "auxiliary_loss_clip": 0.06522447, + "auxiliary_loss_mlp": 0.01279056, + "balance_loss_clip": 0.0629696, + "balance_loss_mlp": 0.0126009, + "epoch": 0.2382684503231625, + "flos": 24871297259520.0, + "grad_norm": 1.868964177707197, + "language_loss": 0.75134146, + "learning_rate": 3.561979109197483e-06, + "loss": 0.82935649, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18969727, + "step": 3963, + "time_per_iteration": 3.9841935634613037 + }, + { + "auxiliary_loss_clip": 0.0652955, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.06298651, + "balance_loss_mlp": 0.01257428, + "epoch": 0.23832857357583045, + "flos": 21878050181760.0, + "grad_norm": 2.083636930734351, + "language_loss": 0.77508426, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.85316432, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.21032715, + "step": 3964, + "time_per_iteration": 2.546093463897705 + }, + { + "auxiliary_loss_clip": 0.06513681, + "auxiliary_loss_mlp": 0.01275741, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01256275, + "epoch": 0.23838869682849842, + "flos": 21294441694080.0, + "grad_norm": 2.0070777911568207, + "language_loss": 0.72507781, + "learning_rate": 3.561492518769045e-06, + "loss": 0.80297208, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19458008, + "step": 3965, + "time_per_iteration": 2.605717182159424 + }, + { + "auxiliary_loss_clip": 0.06518564, + "auxiliary_loss_mlp": 0.012776, + "balance_loss_clip": 0.06293208, + "balance_loss_mlp": 0.01258181, + "epoch": 0.23844882008116638, + "flos": 16186211308800.0, + "grad_norm": 2.069567415104782, + "language_loss": 0.79030257, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8682642, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.19396973, + "step": 3966, + "time_per_iteration": 3.980722427368164 + }, + { + "auxiliary_loss_clip": 0.06517511, + "auxiliary_loss_mlp": 0.01283232, + "balance_loss_clip": 0.06290257, + "balance_loss_mlp": 0.01264647, + "epoch": 0.23850894333383435, + "flos": 21076165008000.0, + "grad_norm": 3.0015774693629433, + "language_loss": 0.69417417, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77218163, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.18579102, + "step": 3967, + "time_per_iteration": 2.542595386505127 + }, + { + "auxiliary_loss_clip": 0.06523537, + "auxiliary_loss_mlp": 0.01278611, + "balance_loss_clip": 0.0629587, + "balance_loss_mlp": 0.01257821, + "epoch": 0.23856906658650234, + "flos": 17207295563520.0, + "grad_norm": 1.9959497275253817, + "language_loss": 0.68410718, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.76212859, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.20800781, + "step": 3968, + "time_per_iteration": 2.5275728702545166 + }, + { + "auxiliary_loss_clip": 0.06526159, + "auxiliary_loss_mlp": 0.01279655, + "balance_loss_clip": 0.0629804, + "balance_loss_mlp": 0.01261392, + "epoch": 0.2386291898391703, + "flos": 29501451774720.0, + "grad_norm": 2.0078802263631994, + "language_loss": 0.77147222, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.84953034, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.18261719, + "step": 3969, + "time_per_iteration": 4.006864547729492 + }, + { + "auxiliary_loss_clip": 0.06514208, + "auxiliary_loss_mlp": 0.01292793, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01274602, + "epoch": 0.23868931309183827, + "flos": 21148854024960.0, + "grad_norm": 1.9717404660495825, + "language_loss": 0.76892555, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.84699559, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.18188477, + "step": 3970, + "time_per_iteration": 2.558915615081787 + }, + { + "auxiliary_loss_clip": 0.06523073, + "auxiliary_loss_mlp": 0.0128602, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01265969, + "epoch": 0.23874943634450624, + "flos": 25665342076800.0, + "grad_norm": 2.212795121423013, + "language_loss": 0.85452002, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.93261099, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.20043945, + "step": 3971, + "time_per_iteration": 2.5621652603149414 + }, + { + "auxiliary_loss_clip": 0.06391954, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06279661, + "balance_loss_mlp": 0.01251122, + "epoch": 0.2388095595971742, + "flos": 59006871889920.0, + "grad_norm": 0.7183517633018239, + "language_loss": 0.62744105, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.70391893, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04696655, + "step": 3972, + "time_per_iteration": 4.643376350402832 + }, + { + "auxiliary_loss_clip": 0.06515118, + "auxiliary_loss_mlp": 0.01277926, + "balance_loss_clip": 0.06290536, + "balance_loss_mlp": 0.01258399, + "epoch": 0.23886968284984217, + "flos": 16805975633280.0, + "grad_norm": 3.0192177240020976, + "language_loss": 0.81866533, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.89659578, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19543457, + "step": 3973, + "time_per_iteration": 2.5597283840179443 + }, + { + "auxiliary_loss_clip": 0.06517763, + "auxiliary_loss_mlp": 0.01283675, + "balance_loss_clip": 0.06291795, + "balance_loss_mlp": 0.01265162, + "epoch": 0.23892980610251013, + "flos": 22389221214720.0, + "grad_norm": 1.829209898292947, + "language_loss": 0.79696077, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8749752, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.18518066, + "step": 3974, + "time_per_iteration": 2.5331227779388428 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01291591, + "balance_loss_clip": 0.06296244, + "balance_loss_mlp": 0.01272279, + "epoch": 0.23898992935517813, + "flos": 12828135553920.0, + "grad_norm": 6.773745042238101, + "language_loss": 0.85156423, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.92972875, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.19311523, + "step": 3975, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.06513388, + "auxiliary_loss_mlp": 0.01278416, + "balance_loss_clip": 0.06290747, + "balance_loss_mlp": 0.01260117, + "epoch": 0.2390500526078461, + "flos": 22352142983040.0, + "grad_norm": 3.375355565005516, + "language_loss": 0.84191501, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.91983294, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1829834, + "step": 3976, + "time_per_iteration": 2.5339527130126953 + }, + { + "auxiliary_loss_clip": 0.06511909, + "auxiliary_loss_mlp": 0.01282136, + "balance_loss_clip": 0.06290296, + "balance_loss_mlp": 0.01264111, + "epoch": 0.23911017586051406, + "flos": 22641263896320.0, + "grad_norm": 3.0704844059493497, + "language_loss": 0.74960983, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.82755029, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18029785, + "step": 3977, + "time_per_iteration": 2.5528597831726074 + }, + { + "auxiliary_loss_clip": 0.06524444, + "auxiliary_loss_mlp": 0.01281803, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01261478, + "epoch": 0.23917029911318202, + "flos": 23658993987840.0, + "grad_norm": 3.246082679368102, + "language_loss": 0.7235828, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80164528, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.203125, + "step": 3978, + "time_per_iteration": 2.548459768295288 + }, + { + "auxiliary_loss_clip": 0.06536747, + "auxiliary_loss_mlp": 0.01279264, + "balance_loss_clip": 0.06306014, + "balance_loss_mlp": 0.0125994, + "epoch": 0.23923042236585, + "flos": 22790163801600.0, + "grad_norm": 2.3394422136849875, + "language_loss": 0.79264927, + "learning_rate": 3.558079758168997e-06, + "loss": 0.87080932, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.1932373, + "step": 3979, + "time_per_iteration": 2.5696120262145996 + }, + { + "auxiliary_loss_clip": 0.06521225, + "auxiliary_loss_mlp": 0.01282521, + "balance_loss_clip": 0.06295727, + "balance_loss_mlp": 0.01263185, + "epoch": 0.23929054561851795, + "flos": 28155300405120.0, + "grad_norm": 1.7900268576070866, + "language_loss": 0.81971824, + "learning_rate": 3.557835546134977e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.1932373, + "step": 3980, + "time_per_iteration": 2.587286949157715 + }, + { + "auxiliary_loss_clip": 0.06519361, + "auxiliary_loss_mlp": 0.01281001, + "balance_loss_clip": 0.06296664, + "balance_loss_mlp": 0.01261891, + "epoch": 0.23935066887118592, + "flos": 21692491315200.0, + "grad_norm": 1.7930077111492302, + "language_loss": 0.84270984, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92071348, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19091797, + "step": 3981, + "time_per_iteration": 2.550725221633911 + }, + { + "auxiliary_loss_clip": 0.06535558, + "auxiliary_loss_mlp": 0.01280601, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01260669, + "epoch": 0.2394107921238539, + "flos": 32130121737600.0, + "grad_norm": 2.0248039039910393, + "language_loss": 0.77712274, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.85528433, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.19934082, + "step": 3982, + "time_per_iteration": 2.594698667526245 + }, + { + "auxiliary_loss_clip": 0.06530322, + "auxiliary_loss_mlp": 0.01280321, + "balance_loss_clip": 0.06304529, + "balance_loss_mlp": 0.01261307, + "epoch": 0.23947091537652188, + "flos": 17024839297920.0, + "grad_norm": 1.9623565914246572, + "language_loss": 0.7809152, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.85902166, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19006348, + "step": 3983, + "time_per_iteration": 2.537132740020752 + }, + { + "auxiliary_loss_clip": 0.06527262, + "auxiliary_loss_mlp": 0.01280803, + "balance_loss_clip": 0.0630171, + "balance_loss_mlp": 0.01261956, + "epoch": 0.23953103862918984, + "flos": 20599640438400.0, + "grad_norm": 2.137172968887566, + "language_loss": 0.73945713, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81753772, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.18835449, + "step": 3984, + "time_per_iteration": 2.538221836090088 + }, + { + "auxiliary_loss_clip": 0.06531888, + "auxiliary_loss_mlp": 0.01281613, + "balance_loss_clip": 0.06302323, + "balance_loss_mlp": 0.01262587, + "epoch": 0.2395911618818578, + "flos": 20710707425280.0, + "grad_norm": 1.9765684717262704, + "language_loss": 0.7965889, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.87472391, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.19030762, + "step": 3985, + "time_per_iteration": 2.551649570465088 + }, + { + "auxiliary_loss_clip": 0.06532246, + "auxiliary_loss_mlp": 0.0127953, + "balance_loss_clip": 0.06304285, + "balance_loss_mlp": 0.01259265, + "epoch": 0.23965128513452577, + "flos": 27060982081920.0, + "grad_norm": 1.916737509209056, + "language_loss": 0.73610401, + "learning_rate": 3.556369033716254e-06, + "loss": 0.8142218, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20263672, + "step": 3986, + "time_per_iteration": 2.710397481918335 + }, + { + "auxiliary_loss_clip": 0.06540911, + "auxiliary_loss_mlp": 0.01281338, + "balance_loss_clip": 0.0630495, + "balance_loss_mlp": 0.01261, + "epoch": 0.23971140838719374, + "flos": 23150254723200.0, + "grad_norm": 1.785192597796332, + "language_loss": 0.88325328, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96147585, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.20336914, + "step": 3987, + "time_per_iteration": 2.6331911087036133 + }, + { + "auxiliary_loss_clip": 0.06529854, + "auxiliary_loss_mlp": 0.01278502, + "balance_loss_clip": 0.06312454, + "balance_loss_mlp": 0.0126043, + "epoch": 0.23977153163986173, + "flos": 18039341007360.0, + "grad_norm": 2.2552133940915224, + "language_loss": 0.84056735, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.91865093, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18078613, + "step": 3988, + "time_per_iteration": 2.5413994789123535 + }, + { + "auxiliary_loss_clip": 0.06533512, + "auxiliary_loss_mlp": 0.01288032, + "balance_loss_clip": 0.06306052, + "balance_loss_mlp": 0.01267052, + "epoch": 0.2398316548925297, + "flos": 18119157621120.0, + "grad_norm": 1.6232739060807335, + "language_loss": 0.85473406, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.93294942, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2097168, + "step": 3989, + "time_per_iteration": 2.528348207473755 + }, + { + "auxiliary_loss_clip": 0.06527147, + "auxiliary_loss_mlp": 0.01286562, + "balance_loss_clip": 0.06305796, + "balance_loss_mlp": 0.01266642, + "epoch": 0.23989177814519766, + "flos": 12572612928000.0, + "grad_norm": 2.695913709141839, + "language_loss": 0.8517406, + "learning_rate": 3.555390178293477e-06, + "loss": 0.92987764, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19909668, + "step": 3990, + "time_per_iteration": 2.52915358543396 + }, + { + "auxiliary_loss_clip": 0.06527729, + "auxiliary_loss_mlp": 0.01283435, + "balance_loss_clip": 0.06302518, + "balance_loss_mlp": 0.01264064, + "epoch": 0.23995190139786562, + "flos": 25271569013760.0, + "grad_norm": 1.4267230320219149, + "language_loss": 0.76345301, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.84156466, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.19372559, + "step": 3991, + "time_per_iteration": 2.556820869445801 + }, + { + "auxiliary_loss_clip": 0.06413993, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06298733, + "balance_loss_mlp": 0.01262789, + "epoch": 0.2400120246505336, + "flos": 61978107271680.0, + "grad_norm": 0.8724678757997124, + "language_loss": 0.6358996, + "learning_rate": 3.554900396661656e-06, + "loss": 0.71272099, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.05368042, + "step": 3992, + "time_per_iteration": 3.0817418098449707 + }, + { + "auxiliary_loss_clip": 0.06411353, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 0.06297012, + "balance_loss_mlp": 0.01259121, + "epoch": 0.24007214790320155, + "flos": 66727923816960.0, + "grad_norm": 0.7394753945990321, + "language_loss": 0.62864375, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.70539963, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.05117798, + "step": 3993, + "time_per_iteration": 3.2552971839904785 + }, + { + "auxiliary_loss_clip": 0.0652933, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 0.062997, + "balance_loss_mlp": 0.0125886, + "epoch": 0.24013227115586952, + "flos": 25815667501440.0, + "grad_norm": 1.8775036450716396, + "language_loss": 0.77610862, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.85420227, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.21154785, + "step": 3994, + "time_per_iteration": 2.6225738525390625 + }, + { + "auxiliary_loss_clip": 0.06526788, + "auxiliary_loss_mlp": 0.01288387, + "balance_loss_clip": 0.06296962, + "balance_loss_mlp": 0.01266822, + "epoch": 0.2401923944085375, + "flos": 25564672995840.0, + "grad_norm": 1.626402048760673, + "language_loss": 0.78733414, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.86548591, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.21557617, + "step": 3995, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.06395802, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01260685, + "epoch": 0.24025251766120548, + "flos": 54961457892480.0, + "grad_norm": 0.8928130340410044, + "language_loss": 0.63566971, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.71228325, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.04858398, + "step": 3996, + "time_per_iteration": 3.232227087020874 + }, + { + "auxiliary_loss_clip": 0.06522241, + "auxiliary_loss_mlp": 0.01283128, + "balance_loss_clip": 0.06290409, + "balance_loss_mlp": 0.0126328, + "epoch": 0.24031264091387344, + "flos": 20637305648640.0, + "grad_norm": 2.8724335092069864, + "language_loss": 0.71121502, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.78926873, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.19848633, + "step": 3997, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.06510898, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 0.06285729, + "balance_loss_mlp": 0.01265473, + "epoch": 0.2403727641665414, + "flos": 20892492858240.0, + "grad_norm": 1.7909711234465908, + "language_loss": 0.87516266, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.9531287, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20227051, + "step": 3998, + "time_per_iteration": 2.563215970993042 + }, + { + "auxiliary_loss_clip": 0.06526193, + "auxiliary_loss_mlp": 0.01279159, + "balance_loss_clip": 0.06292593, + "balance_loss_mlp": 0.01258762, + "epoch": 0.24043288741920937, + "flos": 22826613127680.0, + "grad_norm": 1.593528116777893, + "language_loss": 0.76414531, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.84219879, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.20410156, + "step": 3999, + "time_per_iteration": 2.5577592849731445 + }, + { + "auxiliary_loss_clip": 0.06511137, + "auxiliary_loss_mlp": 0.01275527, + "balance_loss_clip": 0.0628795, + "balance_loss_mlp": 0.01256716, + "epoch": 0.24049301067187734, + "flos": 27966261594240.0, + "grad_norm": 2.3407253335254086, + "language_loss": 0.73292184, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81078851, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.18823242, + "step": 4000, + "time_per_iteration": 2.583524703979492 + }, + { + "auxiliary_loss_clip": 0.06528921, + "auxiliary_loss_mlp": 0.01283655, + "balance_loss_clip": 0.06293923, + "balance_loss_mlp": 0.01261935, + "epoch": 0.24055313392454533, + "flos": 27458360870400.0, + "grad_norm": 2.671051655318694, + "language_loss": 0.67159665, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74972242, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21728516, + "step": 4001, + "time_per_iteration": 2.6188552379608154 + }, + { + "auxiliary_loss_clip": 0.06522354, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 0.06293849, + "balance_loss_mlp": 0.01257703, + "epoch": 0.2406132571772133, + "flos": 25563666746880.0, + "grad_norm": 5.034242823707272, + "language_loss": 0.83152658, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.90954471, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.21740723, + "step": 4002, + "time_per_iteration": 3.9769785404205322 + }, + { + "auxiliary_loss_clip": 0.06519094, + "auxiliary_loss_mlp": 0.01282536, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01262032, + "epoch": 0.24067338042988126, + "flos": 24798482461440.0, + "grad_norm": 2.0463487498067323, + "language_loss": 0.83599687, + "learning_rate": 3.552202383898897e-06, + "loss": 0.91401321, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20483398, + "step": 4003, + "time_per_iteration": 2.581669569015503 + }, + { + "auxiliary_loss_clip": 0.06526292, + "auxiliary_loss_mlp": 0.01281725, + "balance_loss_clip": 0.06295015, + "balance_loss_mlp": 0.01261412, + "epoch": 0.24073350368254923, + "flos": 21184171320960.0, + "grad_norm": 2.0670244348036646, + "language_loss": 0.87907362, + "learning_rate": 3.551956756667215e-06, + "loss": 0.9571538, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20300293, + "step": 4004, + "time_per_iteration": 2.514268636703491 + }, + { + "auxiliary_loss_clip": 0.06526911, + "auxiliary_loss_mlp": 0.01282868, + "balance_loss_clip": 0.06294513, + "balance_loss_mlp": 0.01261815, + "epoch": 0.2407936269352172, + "flos": 22501252523520.0, + "grad_norm": 3.538522770409821, + "language_loss": 0.78168321, + "learning_rate": 3.551711070585177e-06, + "loss": 0.85978097, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21057129, + "step": 4005, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.0651572, + "auxiliary_loss_mlp": 0.01283457, + "balance_loss_clip": 0.06293365, + "balance_loss_mlp": 0.01263084, + "epoch": 0.24085375018788516, + "flos": 18556968804480.0, + "grad_norm": 2.371719422478697, + "language_loss": 0.79360878, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.87160051, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.20373535, + "step": 4006, + "time_per_iteration": 4.034858465194702 + }, + { + "auxiliary_loss_clip": 0.0653493, + "auxiliary_loss_mlp": 0.01283621, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01260709, + "epoch": 0.24091387344055312, + "flos": 24177418398720.0, + "grad_norm": 1.8737477168573817, + "language_loss": 0.71813238, + "learning_rate": 3.551219521907302e-06, + "loss": 0.79631788, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.22912598, + "step": 4007, + "time_per_iteration": 2.5730202198028564 + }, + { + "auxiliary_loss_clip": 0.06518448, + "auxiliary_loss_mlp": 0.01300708, + "balance_loss_clip": 0.06295364, + "balance_loss_mlp": 0.01278773, + "epoch": 0.24097399669322112, + "flos": 11041112327040.0, + "grad_norm": 6.473369852788927, + "language_loss": 0.76978099, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84797251, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21936035, + "step": 4008, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.06518552, + "auxiliary_loss_mlp": 0.01286303, + "balance_loss_clip": 0.062894, + "balance_loss_mlp": 0.01264928, + "epoch": 0.24103411994588908, + "flos": 17170762383360.0, + "grad_norm": 2.1979472110907556, + "language_loss": 0.75080305, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.82885164, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21398926, + "step": 4009, + "time_per_iteration": 3.957920551300049 + }, + { + "auxiliary_loss_clip": 0.06521554, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 0.06293823, + "balance_loss_mlp": 0.01279869, + "epoch": 0.24109424319855705, + "flos": 20674258099200.0, + "grad_norm": 1.5898496231384156, + "language_loss": 0.80111217, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8793391, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21264648, + "step": 4010, + "time_per_iteration": 2.5475916862487793 + }, + { + "auxiliary_loss_clip": 0.06527252, + "auxiliary_loss_mlp": 0.01291864, + "balance_loss_clip": 0.06297424, + "balance_loss_mlp": 0.01268964, + "epoch": 0.241154366451225, + "flos": 28188982546560.0, + "grad_norm": 2.0856120841249366, + "language_loss": 0.70933908, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78753024, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.22912598, + "step": 4011, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.06528456, + "auxiliary_loss_mlp": 0.0128714, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01265766, + "epoch": 0.24121448970389298, + "flos": 21696222821760.0, + "grad_norm": 1.7418824634594252, + "language_loss": 0.694484, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.77263987, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.21362305, + "step": 4012, + "time_per_iteration": 3.988281726837158 + }, + { + "auxiliary_loss_clip": 0.06528036, + "auxiliary_loss_mlp": 0.01287792, + "balance_loss_clip": 0.06296879, + "balance_loss_mlp": 0.01264391, + "epoch": 0.24127461295656094, + "flos": 39685530142080.0, + "grad_norm": 1.5971840931497265, + "language_loss": 0.74512959, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23388672, + "step": 4013, + "time_per_iteration": 2.7159719467163086 + }, + { + "auxiliary_loss_clip": 0.06531674, + "auxiliary_loss_mlp": 0.01283711, + "balance_loss_clip": 0.0630402, + "balance_loss_mlp": 0.01263231, + "epoch": 0.2413347362092289, + "flos": 19141960884480.0, + "grad_norm": 1.667652232266074, + "language_loss": 0.89031768, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.96847153, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.20483398, + "step": 4014, + "time_per_iteration": 2.5638303756713867 + }, + { + "auxiliary_loss_clip": 0.06538786, + "auxiliary_loss_mlp": 0.01289681, + "balance_loss_clip": 0.06304225, + "balance_loss_mlp": 0.01268831, + "epoch": 0.2413948594618969, + "flos": 26946099734400.0, + "grad_norm": 1.9521080560444544, + "language_loss": 0.95043075, + "learning_rate": 3.549250975045952e-06, + "loss": 1.02871537, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.20849609, + "step": 4015, + "time_per_iteration": 2.5697052478790283 + }, + { + "auxiliary_loss_clip": 0.0653477, + "auxiliary_loss_mlp": 0.01278309, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01257781, + "epoch": 0.24145498271456486, + "flos": 25235077760640.0, + "grad_norm": 1.8045004389175856, + "language_loss": 0.83243644, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.91056728, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2052002, + "step": 4016, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.06532364, + "auxiliary_loss_mlp": 0.01285254, + "balance_loss_clip": 0.06311545, + "balance_loss_mlp": 0.0126463, + "epoch": 0.24151510596723283, + "flos": 40671339027840.0, + "grad_norm": 2.079467312298135, + "language_loss": 0.69439638, + "learning_rate": 3.54875825066639e-06, + "loss": 0.77257252, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20617676, + "step": 4017, + "time_per_iteration": 2.6893186569213867 + }, + { + "auxiliary_loss_clip": 0.06536807, + "auxiliary_loss_mlp": 0.01288936, + "balance_loss_clip": 0.06305309, + "balance_loss_mlp": 0.01266286, + "epoch": 0.2415752292199008, + "flos": 18151917367680.0, + "grad_norm": 1.6840714927615923, + "language_loss": 0.84970623, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.92796361, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2265625, + "step": 4018, + "time_per_iteration": 2.521129608154297 + }, + { + "auxiliary_loss_clip": 0.06448493, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 0.06334345, + "balance_loss_mlp": 0.01253335, + "epoch": 0.24163535247256876, + "flos": 67307213819520.0, + "grad_norm": 1.2396896293086193, + "language_loss": 0.6054306, + "learning_rate": 3.548265291370558e-06, + "loss": 0.68249303, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04425049, + "step": 4019, + "time_per_iteration": 3.2191333770751953 + }, + { + "auxiliary_loss_clip": 0.06539527, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06310145, + "balance_loss_mlp": 0.01253983, + "epoch": 0.24169547572523672, + "flos": 24935810503680.0, + "grad_norm": 1.839335570686334, + "language_loss": 0.73635018, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81447685, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.19140625, + "step": 4020, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.06547633, + "auxiliary_loss_mlp": 0.01279706, + "balance_loss_clip": 0.06321433, + "balance_loss_mlp": 0.01259094, + "epoch": 0.24175559897790472, + "flos": 18733303722240.0, + "grad_norm": 1.757855043925666, + "language_loss": 0.81927264, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.89754599, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.20617676, + "step": 4021, + "time_per_iteration": 2.516295909881592 + }, + { + "auxiliary_loss_clip": 0.06542306, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 0.06314138, + "balance_loss_mlp": 0.01255201, + "epoch": 0.24181572223057268, + "flos": 23045937989760.0, + "grad_norm": 1.9677245364232816, + "language_loss": 0.76831293, + "learning_rate": 3.547525412122378e-06, + "loss": 0.84652191, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23388672, + "step": 4022, + "time_per_iteration": 2.560833692550659 + }, + { + "auxiliary_loss_clip": 0.0655847, + "auxiliary_loss_mlp": 0.01279281, + "balance_loss_clip": 0.06321847, + "balance_loss_mlp": 0.01257477, + "epoch": 0.24187584548324065, + "flos": 20382411928320.0, + "grad_norm": 1.7589452517035808, + "language_loss": 0.75334597, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.83172357, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.21789551, + "step": 4023, + "time_per_iteration": 2.5414137840270996 + }, + { + "auxiliary_loss_clip": 0.06554291, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06325305, + "balance_loss_mlp": 0.01258466, + "epoch": 0.2419359687359086, + "flos": 21403915453440.0, + "grad_norm": 1.837159559636974, + "language_loss": 0.82581335, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.90414816, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.20751953, + "step": 4024, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.06544912, + "auxiliary_loss_mlp": 0.01281053, + "balance_loss_clip": 0.06319256, + "balance_loss_mlp": 0.01260394, + "epoch": 0.24199609198857658, + "flos": 18375309152640.0, + "grad_norm": 1.8763334718563411, + "language_loss": 0.86724782, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.94550753, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20654297, + "step": 4025, + "time_per_iteration": 2.507725715637207 + }, + { + "auxiliary_loss_clip": 0.0654591, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06312732, + "balance_loss_mlp": 0.01261905, + "epoch": 0.24205621524124454, + "flos": 19469962892160.0, + "grad_norm": 2.105058685916829, + "language_loss": 0.72386706, + "learning_rate": 3.546538084949365e-06, + "loss": 0.80214572, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.20068359, + "step": 4026, + "time_per_iteration": 2.573822498321533 + }, + { + "auxiliary_loss_clip": 0.06536272, + "auxiliary_loss_mlp": 0.01278576, + "balance_loss_clip": 0.06314979, + "balance_loss_mlp": 0.01258191, + "epoch": 0.2421163384939125, + "flos": 14981706466560.0, + "grad_norm": 5.331027510747572, + "language_loss": 0.64474452, + "learning_rate": 3.546291106520509e-06, + "loss": 0.722893, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20397949, + "step": 4027, + "time_per_iteration": 2.5038652420043945 + }, + { + "auxiliary_loss_clip": 0.06553975, + "auxiliary_loss_mlp": 0.01291382, + "balance_loss_clip": 0.063242, + "balance_loss_mlp": 0.01271069, + "epoch": 0.2421764617465805, + "flos": 18668161572480.0, + "grad_norm": 2.149571528027882, + "language_loss": 0.70816404, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.78661758, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.203125, + "step": 4028, + "time_per_iteration": 2.5707366466522217 + }, + { + "auxiliary_loss_clip": 0.06448589, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 0.06335288, + "balance_loss_mlp": 0.01254865, + "epoch": 0.24223658499924847, + "flos": 64368025424640.0, + "grad_norm": 0.8397041896242922, + "language_loss": 0.55315495, + "learning_rate": 3.545796973765623e-06, + "loss": 0.63025129, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.06170654, + "step": 4029, + "time_per_iteration": 3.149601936340332 + }, + { + "auxiliary_loss_clip": 0.06557409, + "auxiliary_loss_mlp": 0.01307587, + "balance_loss_clip": 0.06331506, + "balance_loss_mlp": 0.01284615, + "epoch": 0.24229670825191643, + "flos": 25782278849280.0, + "grad_norm": 2.2612571716693664, + "language_loss": 0.75111073, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82976073, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.22998047, + "step": 4030, + "time_per_iteration": 2.5939297676086426 + }, + { + "auxiliary_loss_clip": 0.0654521, + "auxiliary_loss_mlp": 0.0130894, + "balance_loss_clip": 0.06321512, + "balance_loss_mlp": 0.01287733, + "epoch": 0.2423568315045844, + "flos": 20673251850240.0, + "grad_norm": 1.8607136485921192, + "language_loss": 0.77126729, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.84980875, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2121582, + "step": 4031, + "time_per_iteration": 2.5886638164520264 + }, + { + "auxiliary_loss_clip": 0.06556953, + "auxiliary_loss_mlp": 0.01312472, + "balance_loss_clip": 0.06323709, + "balance_loss_mlp": 0.01290252, + "epoch": 0.24241695475725236, + "flos": 22422987210240.0, + "grad_norm": 1.956173023936914, + "language_loss": 0.66108859, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.22216797, + "step": 4032, + "time_per_iteration": 2.5665037631988525 + }, + { + "auxiliary_loss_clip": 0.06539695, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 0.06316876, + "balance_loss_mlp": 0.0128751, + "epoch": 0.24247707800992033, + "flos": 17134732327680.0, + "grad_norm": 3.4494454498841725, + "language_loss": 0.81464761, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.89313877, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.21911621, + "step": 4033, + "time_per_iteration": 2.5237317085266113 + }, + { + "auxiliary_loss_clip": 0.06538171, + "auxiliary_loss_mlp": 0.01328283, + "balance_loss_clip": 0.06318024, + "balance_loss_mlp": 0.01305359, + "epoch": 0.2425372012625883, + "flos": 31621885597440.0, + "grad_norm": 1.909836856098088, + "language_loss": 0.69935066, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7780152, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22900391, + "step": 4034, + "time_per_iteration": 2.713991641998291 + }, + { + "auxiliary_loss_clip": 0.06546839, + "auxiliary_loss_mlp": 0.01319063, + "balance_loss_clip": 0.06324256, + "balance_loss_mlp": 0.01298273, + "epoch": 0.24259732451525629, + "flos": 16331589342720.0, + "grad_norm": 2.1729941621503532, + "language_loss": 0.96340013, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04205918, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.20776367, + "step": 4035, + "time_per_iteration": 2.532848596572876 + }, + { + "auxiliary_loss_clip": 0.06537193, + "auxiliary_loss_mlp": 0.01327475, + "balance_loss_clip": 0.06319901, + "balance_loss_mlp": 0.01307447, + "epoch": 0.24265744776792425, + "flos": 22863230161920.0, + "grad_norm": 1.6992215283488847, + "language_loss": 0.78653824, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20019531, + "step": 4036, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.06539825, + "auxiliary_loss_mlp": 0.01304693, + "balance_loss_clip": 0.06315397, + "balance_loss_mlp": 0.01282806, + "epoch": 0.24271757102059222, + "flos": 21878008254720.0, + "grad_norm": 1.624872867937933, + "language_loss": 0.74970233, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82814753, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.21887207, + "step": 4037, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06526245, + "auxiliary_loss_mlp": 0.01308805, + "balance_loss_clip": 0.06302498, + "balance_loss_mlp": 0.01287539, + "epoch": 0.24277769427326018, + "flos": 19214649901440.0, + "grad_norm": 4.15075765155633, + "language_loss": 0.76952362, + "learning_rate": 3.543570475921171e-06, + "loss": 0.84787416, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.21276855, + "step": 4038, + "time_per_iteration": 2.514899492263794 + }, + { + "auxiliary_loss_clip": 0.06539176, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 0.06314565, + "balance_loss_mlp": 0.01272992, + "epoch": 0.24283781752592815, + "flos": 19505909093760.0, + "grad_norm": 2.116114626089979, + "language_loss": 0.72802031, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.22167969, + "step": 4039, + "time_per_iteration": 2.603787422180176 + }, + { + "auxiliary_loss_clip": 0.06537706, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 0.06312682, + "balance_loss_mlp": 0.01269372, + "epoch": 0.2428979407785961, + "flos": 19908444908160.0, + "grad_norm": 1.7691638050154863, + "language_loss": 0.78818536, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86647218, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.21606445, + "step": 4040, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.06530759, + "auxiliary_loss_mlp": 0.01283615, + "balance_loss_clip": 0.06313588, + "balance_loss_mlp": 0.01265162, + "epoch": 0.2429580640312641, + "flos": 24722523135360.0, + "grad_norm": 1.6907745152184719, + "language_loss": 0.81039703, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8885408, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18444824, + "step": 4041, + "time_per_iteration": 2.5693795680999756 + }, + { + "auxiliary_loss_clip": 0.06532191, + "auxiliary_loss_mlp": 0.01286793, + "balance_loss_clip": 0.06311769, + "balance_loss_mlp": 0.01267529, + "epoch": 0.24301818728393207, + "flos": 25637529720960.0, + "grad_norm": 3.2457124561568, + "language_loss": 0.77433085, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8525207, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19262695, + "step": 4042, + "time_per_iteration": 3.9626972675323486 + }, + { + "auxiliary_loss_clip": 0.0653407, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06316316, + "balance_loss_mlp": 0.01260652, + "epoch": 0.24307831053660003, + "flos": 26148700753920.0, + "grad_norm": 1.8532279658121147, + "language_loss": 0.82188201, + "learning_rate": 3.542331483604246e-06, + "loss": 0.90002131, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19226074, + "step": 4043, + "time_per_iteration": 2.598202705383301 + }, + { + "auxiliary_loss_clip": 0.06538229, + "auxiliary_loss_mlp": 0.0127841, + "balance_loss_clip": 0.06309159, + "balance_loss_mlp": 0.01256594, + "epoch": 0.243138433789268, + "flos": 14977136419200.0, + "grad_norm": 2.775508644952731, + "language_loss": 0.73897892, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.81714529, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.21801758, + "step": 4044, + "time_per_iteration": 2.483752489089966 + }, + { + "auxiliary_loss_clip": 0.06534028, + "auxiliary_loss_mlp": 0.01284645, + "balance_loss_clip": 0.0631184, + "balance_loss_mlp": 0.01263629, + "epoch": 0.24319855704193596, + "flos": 25198670361600.0, + "grad_norm": 2.3685654829247227, + "language_loss": 0.83778739, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.91597402, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.21020508, + "step": 4045, + "time_per_iteration": 2.60435152053833 + }, + { + "auxiliary_loss_clip": 0.06529962, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 0.06307946, + "balance_loss_mlp": 0.01260323, + "epoch": 0.24325868029460393, + "flos": 22133740515840.0, + "grad_norm": 1.834350653864789, + "language_loss": 0.87040859, + "learning_rate": 3.541587386314541e-06, + "loss": 0.94850671, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19519043, + "step": 4046, + "time_per_iteration": 3.990011692047119 + }, + { + "auxiliary_loss_clip": 0.0652798, + "auxiliary_loss_mlp": 0.01281438, + "balance_loss_clip": 0.06311028, + "balance_loss_mlp": 0.01260922, + "epoch": 0.2433188035472719, + "flos": 23588107833600.0, + "grad_norm": 2.274532821816236, + "language_loss": 0.72945291, + "learning_rate": 3.5413392369578e-06, + "loss": 0.80754709, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.20495605, + "step": 4047, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.06530058, + "auxiliary_loss_mlp": 0.01284969, + "balance_loss_clip": 0.06306041, + "balance_loss_mlp": 0.01263666, + "epoch": 0.2433789267999399, + "flos": 24469809621120.0, + "grad_norm": 3.993347012147321, + "language_loss": 0.74453223, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.21325684, + "step": 4048, + "time_per_iteration": 4.027734279632568 + }, + { + "auxiliary_loss_clip": 0.06529407, + "auxiliary_loss_mlp": 0.01275879, + "balance_loss_clip": 0.06309648, + "balance_loss_mlp": 0.0125671, + "epoch": 0.24343905005260785, + "flos": 16733622032640.0, + "grad_norm": 2.185429514920852, + "language_loss": 0.73832756, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.81638038, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19165039, + "step": 4049, + "time_per_iteration": 2.5527403354644775 + }, + { + "auxiliary_loss_clip": 0.06525055, + "auxiliary_loss_mlp": 0.01275563, + "balance_loss_clip": 0.06306046, + "balance_loss_mlp": 0.01256084, + "epoch": 0.24349917330527582, + "flos": 20049294821760.0, + "grad_norm": 1.6558681415401064, + "language_loss": 0.74824917, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82625538, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19482422, + "step": 4050, + "time_per_iteration": 2.517671585083008 + }, + { + "auxiliary_loss_clip": 0.06520879, + "auxiliary_loss_mlp": 0.0127856, + "balance_loss_clip": 0.06303313, + "balance_loss_mlp": 0.01258187, + "epoch": 0.24355929655794378, + "flos": 17426285009280.0, + "grad_norm": 2.447710360159803, + "language_loss": 0.75780261, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.83579695, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20361328, + "step": 4051, + "time_per_iteration": 3.961841583251953 + }, + { + "auxiliary_loss_clip": 0.06532377, + "auxiliary_loss_mlp": 0.01277824, + "balance_loss_clip": 0.06310124, + "balance_loss_mlp": 0.01257343, + "epoch": 0.24361941981061175, + "flos": 25417995223680.0, + "grad_norm": 2.289221862828171, + "language_loss": 0.71344352, + "learning_rate": 3.540097613646296e-06, + "loss": 0.79154545, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20471191, + "step": 4052, + "time_per_iteration": 2.5851869583129883 + }, + { + "auxiliary_loss_clip": 0.06524909, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.06306259, + "balance_loss_mlp": 0.01258583, + "epoch": 0.2436795430632797, + "flos": 22827493595520.0, + "grad_norm": 1.7731467261886882, + "language_loss": 0.82073057, + "learning_rate": 3.539849113744351e-06, + "loss": 0.89876068, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.6217734813690186 + }, + { + "auxiliary_loss_clip": 0.06533736, + "auxiliary_loss_mlp": 0.01278722, + "balance_loss_clip": 0.06309207, + "balance_loss_mlp": 0.01260126, + "epoch": 0.2437396663159477, + "flos": 15163030702080.0, + "grad_norm": 1.5690390746940162, + "language_loss": 0.78588867, + "learning_rate": 3.539600555451172e-06, + "loss": 0.86401325, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.18615723, + "step": 4054, + "time_per_iteration": 2.513720750808716 + }, + { + "auxiliary_loss_clip": 0.06529565, + "auxiliary_loss_mlp": 0.0128197, + "balance_loss_clip": 0.06307493, + "balance_loss_mlp": 0.01263111, + "epoch": 0.24379978956861567, + "flos": 22097710460160.0, + "grad_norm": 1.7039269278884617, + "language_loss": 0.84417951, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92229491, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1887207, + "step": 4055, + "time_per_iteration": 2.557584524154663 + }, + { + "auxiliary_loss_clip": 0.06542832, + "auxiliary_loss_mlp": 0.01280691, + "balance_loss_clip": 0.06312343, + "balance_loss_mlp": 0.01259508, + "epoch": 0.24385991282128364, + "flos": 31475878657920.0, + "grad_norm": 2.786051029634521, + "language_loss": 0.56684959, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.6450848, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.21179199, + "step": 4056, + "time_per_iteration": 2.6548893451690674 + }, + { + "auxiliary_loss_clip": 0.06533613, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 0.06307291, + "balance_loss_mlp": 0.01262321, + "epoch": 0.2439200360739516, + "flos": 23845055978880.0, + "grad_norm": 2.215401064957846, + "language_loss": 0.80586845, + "learning_rate": 3.538854530318506e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.21520996, + "step": 4057, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.06533922, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 0.06311886, + "balance_loss_mlp": 0.01261009, + "epoch": 0.24398015932661957, + "flos": 19175684952960.0, + "grad_norm": 1.7331406857586058, + "language_loss": 0.79934907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87748623, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18786621, + "step": 4058, + "time_per_iteration": 2.5552098751068115 + }, + { + "auxiliary_loss_clip": 0.06541391, + "auxiliary_loss_mlp": 0.01280168, + "balance_loss_clip": 0.06312001, + "balance_loss_mlp": 0.01259772, + "epoch": 0.24404028257928753, + "flos": 25269095318400.0, + "grad_norm": 1.7324044437804977, + "language_loss": 0.86104828, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93926388, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20410156, + "step": 4059, + "time_per_iteration": 2.575345754623413 + }, + { + "auxiliary_loss_clip": 0.06538763, + "auxiliary_loss_mlp": 0.01274337, + "balance_loss_clip": 0.06318676, + "balance_loss_mlp": 0.01255621, + "epoch": 0.2441004058319555, + "flos": 26474606409600.0, + "grad_norm": 1.5285193147278118, + "language_loss": 0.74698234, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.8251133, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18713379, + "step": 4060, + "time_per_iteration": 2.6277999877929688 + }, + { + "auxiliary_loss_clip": 0.06560756, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 0.06327853, + "balance_loss_mlp": 0.01259469, + "epoch": 0.2441605290846235, + "flos": 26767752318720.0, + "grad_norm": 1.6858410849727605, + "language_loss": 0.73894358, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.81735957, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.21374512, + "step": 4061, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.06538899, + "auxiliary_loss_mlp": 0.01273593, + "balance_loss_clip": 0.0631846, + "balance_loss_mlp": 0.01254103, + "epoch": 0.24422065233729146, + "flos": 21112236990720.0, + "grad_norm": 1.7809128746808311, + "language_loss": 0.76782405, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84594905, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19494629, + "step": 4062, + "time_per_iteration": 2.5655109882354736 + }, + { + "auxiliary_loss_clip": 0.06538436, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 0.06319936, + "balance_loss_mlp": 0.01258019, + "epoch": 0.24428077558995942, + "flos": 25269891932160.0, + "grad_norm": 1.624722619478305, + "language_loss": 0.84975201, + "learning_rate": 3.537360904763011e-06, + "loss": 0.92791933, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.20263672, + "step": 4063, + "time_per_iteration": 2.569420576095581 + }, + { + "auxiliary_loss_clip": 0.06559969, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06327148, + "balance_loss_mlp": 0.01254459, + "epoch": 0.24434089884262739, + "flos": 20491508344320.0, + "grad_norm": 2.099790248638241, + "language_loss": 0.68837494, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.76673138, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2121582, + "step": 4064, + "time_per_iteration": 2.560065984725952 + }, + { + "auxiliary_loss_clip": 0.06547809, + "auxiliary_loss_mlp": 0.01276127, + "balance_loss_clip": 0.06317605, + "balance_loss_mlp": 0.01255349, + "epoch": 0.24440102209529535, + "flos": 23628456374400.0, + "grad_norm": 1.7607893449036869, + "language_loss": 0.70700729, + "learning_rate": 3.536862563102088e-06, + "loss": 0.78524667, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.20788574, + "step": 4065, + "time_per_iteration": 2.5619614124298096 + }, + { + "auxiliary_loss_clip": 0.06554856, + "auxiliary_loss_mlp": 0.0127847, + "balance_loss_clip": 0.06322616, + "balance_loss_mlp": 0.01256726, + "epoch": 0.24446114534796332, + "flos": 20560382000640.0, + "grad_norm": 2.0639555504298372, + "language_loss": 0.84639663, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.92472994, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.21765137, + "step": 4066, + "time_per_iteration": 2.5640382766723633 + }, + { + "auxiliary_loss_clip": 0.0647334, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06356817, + "balance_loss_mlp": 0.01260456, + "epoch": 0.24452126860063128, + "flos": 60406719327360.0, + "grad_norm": 0.7224646734980834, + "language_loss": 0.52123713, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.59863508, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.05990601, + "step": 4067, + "time_per_iteration": 3.067857503890991 + }, + { + "auxiliary_loss_clip": 0.06549152, + "auxiliary_loss_mlp": 0.01275932, + "balance_loss_clip": 0.063198, + "balance_loss_mlp": 0.01255106, + "epoch": 0.24458139185329927, + "flos": 15126958719360.0, + "grad_norm": 4.582785635832698, + "language_loss": 0.72625411, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.80450499, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.20825195, + "step": 4068, + "time_per_iteration": 2.5490705966949463 + }, + { + "auxiliary_loss_clip": 0.06542531, + "auxiliary_loss_mlp": 0.0127677, + "balance_loss_clip": 0.06318012, + "balance_loss_mlp": 0.01256111, + "epoch": 0.24464151510596724, + "flos": 28005771594240.0, + "grad_norm": 1.4744908303961997, + "language_loss": 0.7839663, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86215931, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.20654297, + "step": 4069, + "time_per_iteration": 2.6064302921295166 + }, + { + "auxiliary_loss_clip": 0.06535528, + "auxiliary_loss_mlp": 0.01277448, + "balance_loss_clip": 0.06312935, + "balance_loss_mlp": 0.01257493, + "epoch": 0.2447016383586352, + "flos": 19799138856960.0, + "grad_norm": 1.9167348410225946, + "language_loss": 0.80741036, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88554007, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19946289, + "step": 4070, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.06543916, + "auxiliary_loss_mlp": 0.0127809, + "balance_loss_clip": 0.06317008, + "balance_loss_mlp": 0.01258825, + "epoch": 0.24476176161130317, + "flos": 26074460436480.0, + "grad_norm": 1.476613235331205, + "language_loss": 0.8444066, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.92262667, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19262695, + "step": 4071, + "time_per_iteration": 2.6165285110473633 + }, + { + "auxiliary_loss_clip": 0.06545337, + "auxiliary_loss_mlp": 0.01275719, + "balance_loss_clip": 0.06310376, + "balance_loss_mlp": 0.01254679, + "epoch": 0.24482188486397113, + "flos": 18849527735040.0, + "grad_norm": 2.1913275656577857, + "language_loss": 0.8027429, + "learning_rate": 3.535116532028798e-06, + "loss": 0.88095343, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.21032715, + "step": 4072, + "time_per_iteration": 2.580077648162842 + }, + { + "auxiliary_loss_clip": 0.06531823, + "auxiliary_loss_mlp": 0.01275557, + "balance_loss_clip": 0.06311209, + "balance_loss_mlp": 0.01257031, + "epoch": 0.2448820081166391, + "flos": 21258202003200.0, + "grad_norm": 1.4781582217057618, + "language_loss": 0.7076053, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18505859, + "step": 4073, + "time_per_iteration": 2.5430707931518555 + }, + { + "auxiliary_loss_clip": 0.06525481, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 0.06303517, + "balance_loss_mlp": 0.01260921, + "epoch": 0.2449421313693071, + "flos": 23957254995840.0, + "grad_norm": 2.412576467354098, + "language_loss": 0.67577648, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.75382745, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.18676758, + "step": 4074, + "time_per_iteration": 2.5616037845611572 + }, + { + "auxiliary_loss_clip": 0.06435025, + "auxiliary_loss_mlp": 0.01257107, + "balance_loss_clip": 0.06320108, + "balance_loss_mlp": 0.01251907, + "epoch": 0.24500225462197506, + "flos": 60705902730240.0, + "grad_norm": 0.8764237694402175, + "language_loss": 0.68656927, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.76349056, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.05203247, + "step": 4075, + "time_per_iteration": 3.2623581886291504 + }, + { + "auxiliary_loss_clip": 0.06527948, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06305515, + "balance_loss_mlp": 0.01257414, + "epoch": 0.24506237787464302, + "flos": 26291018113920.0, + "grad_norm": 2.301278269127432, + "language_loss": 0.79781568, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.87586164, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19226074, + "step": 4076, + "time_per_iteration": 2.6342012882232666 + }, + { + "auxiliary_loss_clip": 0.06535772, + "auxiliary_loss_mlp": 0.01280909, + "balance_loss_clip": 0.06304428, + "balance_loss_mlp": 0.01258462, + "epoch": 0.245122501127311, + "flos": 20557530961920.0, + "grad_norm": 1.9232761502629154, + "language_loss": 0.82461953, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90278631, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2244873, + "step": 4077, + "time_per_iteration": 2.5863101482391357 + }, + { + "auxiliary_loss_clip": 0.06532669, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06305817, + "balance_loss_mlp": 0.01257774, + "epoch": 0.24518262437997895, + "flos": 29140312677120.0, + "grad_norm": 2.8377644839815357, + "language_loss": 0.63268852, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71080685, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21398926, + "step": 4078, + "time_per_iteration": 2.6045711040496826 + }, + { + "auxiliary_loss_clip": 0.06522519, + "auxiliary_loss_mlp": 0.01277179, + "balance_loss_clip": 0.06301752, + "balance_loss_mlp": 0.01258249, + "epoch": 0.24524274763264692, + "flos": 23483623392000.0, + "grad_norm": 1.4700896000405594, + "language_loss": 0.75762683, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.8356238, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.18933105, + "step": 4079, + "time_per_iteration": 2.6327531337738037 + }, + { + "auxiliary_loss_clip": 0.06521107, + "auxiliary_loss_mlp": 0.01276139, + "balance_loss_clip": 0.06297373, + "balance_loss_mlp": 0.01256171, + "epoch": 0.24530287088531488, + "flos": 17206792439040.0, + "grad_norm": 1.743597814486786, + "language_loss": 0.75652814, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.1998291, + "step": 4080, + "time_per_iteration": 2.5027806758880615 + }, + { + "auxiliary_loss_clip": 0.06517033, + "auxiliary_loss_mlp": 0.01282693, + "balance_loss_clip": 0.06296979, + "balance_loss_mlp": 0.01262129, + "epoch": 0.24536299413798288, + "flos": 14872903539840.0, + "grad_norm": 1.7999885027482954, + "language_loss": 0.83532149, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91331875, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20544434, + "step": 4081, + "time_per_iteration": 3.9672679901123047 + }, + { + "auxiliary_loss_clip": 0.06524678, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06300613, + "balance_loss_mlp": 0.01257458, + "epoch": 0.24542311739065084, + "flos": 35270759347200.0, + "grad_norm": 2.0934334924975797, + "language_loss": 0.7376107, + "learning_rate": 3.532617254729267e-06, + "loss": 0.81562507, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19311523, + "step": 4082, + "time_per_iteration": 2.687596559524536 + }, + { + "auxiliary_loss_clip": 0.06520141, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06301866, + "balance_loss_mlp": 0.01254334, + "epoch": 0.2454832406433188, + "flos": 21508903019520.0, + "grad_norm": 4.081398895882933, + "language_loss": 0.72681344, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.8047362, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.17810059, + "step": 4083, + "time_per_iteration": 2.5715560913085938 + }, + { + "auxiliary_loss_clip": 0.06531677, + "auxiliary_loss_mlp": 0.01285124, + "balance_loss_clip": 0.06304878, + "balance_loss_mlp": 0.01263404, + "epoch": 0.24554336389598677, + "flos": 14761878480000.0, + "grad_norm": 2.078496591548884, + "language_loss": 0.75461411, + "learning_rate": 3.532116701561919e-06, + "loss": 0.83278215, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.21704102, + "step": 4084, + "time_per_iteration": 2.527059316635132 + }, + { + "auxiliary_loss_clip": 0.06521569, + "auxiliary_loss_mlp": 0.01278312, + "balance_loss_clip": 0.06299873, + "balance_loss_mlp": 0.01259238, + "epoch": 0.24560348714865474, + "flos": 14981790320640.0, + "grad_norm": 1.9240939687866982, + "language_loss": 0.85311353, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93111229, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19055176, + "step": 4085, + "time_per_iteration": 4.107008695602417 + }, + { + "auxiliary_loss_clip": 0.06523392, + "auxiliary_loss_mlp": 0.01277742, + "balance_loss_clip": 0.06299591, + "balance_loss_mlp": 0.0125725, + "epoch": 0.2456636104013227, + "flos": 22682073634560.0, + "grad_norm": 1.671481131781836, + "language_loss": 0.79073685, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.86874819, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20495605, + "step": 4086, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.06519614, + "auxiliary_loss_mlp": 0.01278477, + "balance_loss_clip": 0.06300113, + "balance_loss_mlp": 0.01260107, + "epoch": 0.2457237336539907, + "flos": 27425307634560.0, + "grad_norm": 1.6115503736345718, + "language_loss": 0.75352013, + "learning_rate": 3.531365436099496e-06, + "loss": 0.83150113, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18359375, + "step": 4087, + "time_per_iteration": 4.046957015991211 + }, + { + "auxiliary_loss_clip": 0.06525059, + "auxiliary_loss_mlp": 0.01276774, + "balance_loss_clip": 0.06299827, + "balance_loss_mlp": 0.0125633, + "epoch": 0.24578385690665866, + "flos": 20418609692160.0, + "grad_norm": 2.7081304915573914, + "language_loss": 0.79987848, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.87789685, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20458984, + "step": 4088, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.06519316, + "auxiliary_loss_mlp": 0.01276403, + "balance_loss_clip": 0.06303117, + "balance_loss_mlp": 0.01258152, + "epoch": 0.24584398015932662, + "flos": 23922273116160.0, + "grad_norm": 2.802199957042034, + "language_loss": 0.77758735, + "learning_rate": 3.5308643020944e-06, + "loss": 0.85554451, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18249512, + "step": 4089, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.06525148, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 0.0630155, + "balance_loss_mlp": 0.01261021, + "epoch": 0.2459041034119946, + "flos": 41505313115520.0, + "grad_norm": 1.8031915906993192, + "language_loss": 0.81701422, + "learning_rate": 3.530613648011309e-06, + "loss": 0.89507812, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20214844, + "step": 4090, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.065328, + "auxiliary_loss_mlp": 0.01279305, + "balance_loss_clip": 0.06309135, + "balance_loss_mlp": 0.01258861, + "epoch": 0.24596422666466256, + "flos": 19942755955200.0, + "grad_norm": 2.438516046551743, + "language_loss": 0.73629344, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.8144145, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.20446777, + "step": 4091, + "time_per_iteration": 3.961276054382324 + }, + { + "auxiliary_loss_clip": 0.06539448, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 0.06316313, + "balance_loss_mlp": 0.0126148, + "epoch": 0.24602434991733052, + "flos": 21550970568960.0, + "grad_norm": 2.2480658521871897, + "language_loss": 0.77723873, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85543197, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18408203, + "step": 4092, + "time_per_iteration": 2.5494375228881836 + }, + { + "auxiliary_loss_clip": 0.06537454, + "auxiliary_loss_mlp": 0.01278374, + "balance_loss_clip": 0.06307742, + "balance_loss_mlp": 0.01258907, + "epoch": 0.24608447316999849, + "flos": 23191735294080.0, + "grad_norm": 2.380112015735871, + "language_loss": 0.82381165, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.90196991, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.19470215, + "step": 4093, + "time_per_iteration": 2.5551040172576904 + }, + { + "auxiliary_loss_clip": 0.06532703, + "auxiliary_loss_mlp": 0.01285, + "balance_loss_clip": 0.06305315, + "balance_loss_mlp": 0.01264412, + "epoch": 0.24614459642266648, + "flos": 19647345985920.0, + "grad_norm": 21.11973952887688, + "language_loss": 0.87671578, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95489287, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.20581055, + "step": 4094, + "time_per_iteration": 2.534127712249756 + }, + { + "auxiliary_loss_clip": 0.06404499, + "auxiliary_loss_mlp": 0.01293713, + "balance_loss_clip": 0.06291573, + "balance_loss_mlp": 0.01289332, + "epoch": 0.24620471967533444, + "flos": 61757231109120.0, + "grad_norm": 0.7533459551406883, + "language_loss": 0.57023478, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.64721692, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.04388428, + "step": 4095, + "time_per_iteration": 3.238482713699341 + }, + { + "auxiliary_loss_clip": 0.06404348, + "auxiliary_loss_mlp": 0.01286038, + "balance_loss_clip": 0.06290346, + "balance_loss_mlp": 0.01281767, + "epoch": 0.2462648429280024, + "flos": 69174431003520.0, + "grad_norm": 0.6365745764429788, + "language_loss": 0.56240451, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.63930833, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.04275513, + "step": 4096, + "time_per_iteration": 3.3192596435546875 + }, + { + "auxiliary_loss_clip": 0.06545975, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 0.06318395, + "balance_loss_mlp": 0.01262143, + "epoch": 0.24632496618067037, + "flos": 29467140727680.0, + "grad_norm": 1.505356285132213, + "language_loss": 0.78075927, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85903859, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.19812012, + "step": 4097, + "time_per_iteration": 2.617108106613159 + }, + { + "auxiliary_loss_clip": 0.06547391, + "auxiliary_loss_mlp": 0.01279842, + "balance_loss_clip": 0.06315026, + "balance_loss_mlp": 0.01259993, + "epoch": 0.24638508943333834, + "flos": 24323341484160.0, + "grad_norm": 2.0372573834811267, + "language_loss": 0.77321315, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.85148549, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.1986084, + "step": 4098, + "time_per_iteration": 2.6069419384002686 + }, + { + "auxiliary_loss_clip": 0.06542017, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.0631687, + "balance_loss_mlp": 0.01257341, + "epoch": 0.2464452126860063, + "flos": 26620236005760.0, + "grad_norm": 2.17921698337753, + "language_loss": 0.69183016, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.1817627, + "step": 4099, + "time_per_iteration": 2.655956506729126 + }, + { + "auxiliary_loss_clip": 0.06525709, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06309929, + "balance_loss_mlp": 0.01256062, + "epoch": 0.24650533593867427, + "flos": 31220481813120.0, + "grad_norm": 2.2743270797915076, + "language_loss": 0.67268491, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.75068748, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18493652, + "step": 4100, + "time_per_iteration": 2.6497559547424316 + }, + { + "auxiliary_loss_clip": 0.0641202, + "auxiliary_loss_mlp": 0.01258309, + "balance_loss_clip": 0.06296985, + "balance_loss_mlp": 0.01253758, + "epoch": 0.24656545919134226, + "flos": 68513269962240.0, + "grad_norm": 0.6889590379062642, + "language_loss": 0.61607081, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.69277412, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.04559326, + "step": 4101, + "time_per_iteration": 3.2961082458496094 + }, + { + "auxiliary_loss_clip": 0.06538613, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06317261, + "balance_loss_mlp": 0.01259, + "epoch": 0.24662558244401023, + "flos": 20090398049280.0, + "grad_norm": 1.6193028382456236, + "language_loss": 0.73591036, + "learning_rate": 3.527601274535012e-06, + "loss": 0.81407589, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18945312, + "step": 4102, + "time_per_iteration": 2.542275905609131 + }, + { + "auxiliary_loss_clip": 0.0654332, + "auxiliary_loss_mlp": 0.01273749, + "balance_loss_clip": 0.06317908, + "balance_loss_mlp": 0.01255152, + "epoch": 0.2466857056966782, + "flos": 30709310780160.0, + "grad_norm": 2.0137613654817854, + "language_loss": 0.76325667, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.84142733, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18603516, + "step": 4103, + "time_per_iteration": 2.6544189453125 + }, + { + "auxiliary_loss_clip": 0.06542745, + "auxiliary_loss_mlp": 0.01273413, + "balance_loss_clip": 0.06315098, + "balance_loss_mlp": 0.01253159, + "epoch": 0.24674582894934616, + "flos": 22535102373120.0, + "grad_norm": 2.0816413841430697, + "language_loss": 0.79265451, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87081611, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.20275879, + "step": 4104, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.06525403, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06306183, + "balance_loss_mlp": 0.01257251, + "epoch": 0.24680595220201412, + "flos": 20710581644160.0, + "grad_norm": 1.7450607123984514, + "language_loss": 0.83681756, + "learning_rate": 3.526846877170133e-06, + "loss": 0.9148404, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19641113, + "step": 4105, + "time_per_iteration": 2.553579330444336 + }, + { + "auxiliary_loss_clip": 0.06533727, + "auxiliary_loss_mlp": 0.01273598, + "balance_loss_clip": 0.06309752, + "balance_loss_mlp": 0.01255371, + "epoch": 0.2468660754546821, + "flos": 21836946954240.0, + "grad_norm": 1.9208859898797113, + "language_loss": 0.77469373, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85276699, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18212891, + "step": 4106, + "time_per_iteration": 2.5389256477355957 + }, + { + "auxiliary_loss_clip": 0.06534247, + "auxiliary_loss_mlp": 0.01276275, + "balance_loss_clip": 0.06310344, + "balance_loss_mlp": 0.01257463, + "epoch": 0.24692619870735008, + "flos": 15273049512960.0, + "grad_norm": 2.4615103155960485, + "language_loss": 0.73436344, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.81246865, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18798828, + "step": 4107, + "time_per_iteration": 2.5545566082000732 + }, + { + "auxiliary_loss_clip": 0.06538644, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06314194, + "balance_loss_mlp": 0.01256745, + "epoch": 0.24698632196001805, + "flos": 29687933036160.0, + "grad_norm": 2.1377324014009504, + "language_loss": 0.66432422, + "learning_rate": 3.526091958721587e-06, + "loss": 0.7424612, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.18322754, + "step": 4108, + "time_per_iteration": 2.6196486949920654 + }, + { + "auxiliary_loss_clip": 0.06540007, + "auxiliary_loss_mlp": 0.01277779, + "balance_loss_clip": 0.06313555, + "balance_loss_mlp": 0.01259623, + "epoch": 0.247046445212686, + "flos": 39174736452480.0, + "grad_norm": 2.010829594577025, + "language_loss": 0.73608756, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.18151855, + "step": 4109, + "time_per_iteration": 2.764406442642212 + }, + { + "auxiliary_loss_clip": 0.06534623, + "auxiliary_loss_mlp": 0.01277352, + "balance_loss_clip": 0.06311052, + "balance_loss_mlp": 0.01259077, + "epoch": 0.24710656846535398, + "flos": 23004834762240.0, + "grad_norm": 1.68605601916547, + "language_loss": 0.79419786, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.87231761, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.18249512, + "step": 4110, + "time_per_iteration": 2.5460774898529053 + }, + { + "auxiliary_loss_clip": 0.06540776, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 0.06313831, + "balance_loss_mlp": 0.01257032, + "epoch": 0.24716669171802194, + "flos": 26440085727360.0, + "grad_norm": 2.6454329848736604, + "language_loss": 0.81789577, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.89607012, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.19628906, + "step": 4111, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.06537174, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06311068, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2472268149706899, + "flos": 23336358641280.0, + "grad_norm": 1.983709335436533, + "language_loss": 0.75390071, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.83201408, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18115234, + "step": 4112, + "time_per_iteration": 2.5546083450317383 + }, + { + "auxiliary_loss_clip": 0.06533875, + "auxiliary_loss_mlp": 0.01274467, + "balance_loss_clip": 0.06308994, + "balance_loss_mlp": 0.01255548, + "epoch": 0.24728693822335787, + "flos": 23775469562880.0, + "grad_norm": 2.380234182887367, + "language_loss": 0.83472633, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91280973, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.18920898, + "step": 4113, + "time_per_iteration": 2.6223254203796387 + }, + { + "auxiliary_loss_clip": 0.06540644, + "auxiliary_loss_mlp": 0.01276865, + "balance_loss_clip": 0.06315883, + "balance_loss_mlp": 0.01257279, + "epoch": 0.24734706147602586, + "flos": 19323494755200.0, + "grad_norm": 2.0367731486494636, + "language_loss": 0.87924093, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.95741606, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19580078, + "step": 4114, + "time_per_iteration": 2.5495545864105225 + }, + { + "auxiliary_loss_clip": 0.06532501, + "auxiliary_loss_mlp": 0.01273212, + "balance_loss_clip": 0.06308883, + "balance_loss_mlp": 0.01255474, + "epoch": 0.24740718472869383, + "flos": 28044275345280.0, + "grad_norm": 1.9170399047542779, + "language_loss": 0.75640035, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.17736816, + "step": 4115, + "time_per_iteration": 2.6333982944488525 + }, + { + "auxiliary_loss_clip": 0.0642873, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06315603, + "balance_loss_mlp": 0.01259151, + "epoch": 0.2474673079813618, + "flos": 68129265899520.0, + "grad_norm": 0.63897767002188, + "language_loss": 0.58004332, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.65697974, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.05752563, + "step": 4116, + "time_per_iteration": 3.251235246658325 + }, + { + "auxiliary_loss_clip": 0.06532618, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 0.063094, + "balance_loss_mlp": 0.01257022, + "epoch": 0.24752743123402976, + "flos": 29470075620480.0, + "grad_norm": 1.407143363910891, + "language_loss": 0.8425988, + "learning_rate": 3.523824079451235e-06, + "loss": 0.92068678, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19152832, + "step": 4117, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.06425081, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0631275, + "balance_loss_mlp": 0.01262089, + "epoch": 0.24758755448669773, + "flos": 58367946908160.0, + "grad_norm": 0.8764773034828885, + "language_loss": 0.63508207, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.71200383, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.05001831, + "step": 4118, + "time_per_iteration": 3.052507162094116 + }, + { + "auxiliary_loss_clip": 0.0652981, + "auxiliary_loss_mlp": 0.01277419, + "balance_loss_clip": 0.06307684, + "balance_loss_mlp": 0.01258203, + "epoch": 0.2476476777393657, + "flos": 20490502095360.0, + "grad_norm": 1.7262960547494681, + "language_loss": 0.80051601, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87858826, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.19238281, + "step": 4119, + "time_per_iteration": 2.554318428039551 + }, + { + "auxiliary_loss_clip": 0.06530587, + "auxiliary_loss_mlp": 0.01282865, + "balance_loss_clip": 0.06310613, + "balance_loss_mlp": 0.01265198, + "epoch": 0.24770780099203366, + "flos": 20492179176960.0, + "grad_norm": 2.4192345138137386, + "language_loss": 0.74556476, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.8236993, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.17663574, + "step": 4120, + "time_per_iteration": 3.996234655380249 + }, + { + "auxiliary_loss_clip": 0.06531808, + "auxiliary_loss_mlp": 0.01276043, + "balance_loss_clip": 0.06307146, + "balance_loss_mlp": 0.01256362, + "epoch": 0.24776792424470165, + "flos": 15157915603200.0, + "grad_norm": 2.13486110959629, + "language_loss": 0.89734054, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97541904, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.19689941, + "step": 4121, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.06540959, + "auxiliary_loss_mlp": 0.01278306, + "balance_loss_clip": 0.06314932, + "balance_loss_mlp": 0.01258744, + "epoch": 0.2478280474973696, + "flos": 21731833607040.0, + "grad_norm": 2.0829104418917646, + "language_loss": 0.69792116, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.77611381, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19580078, + "step": 4122, + "time_per_iteration": 2.5857455730438232 + }, + { + "auxiliary_loss_clip": 0.06535036, + "auxiliary_loss_mlp": 0.01273779, + "balance_loss_clip": 0.0630946, + "balance_loss_mlp": 0.01254729, + "epoch": 0.24788817075003758, + "flos": 20418400056960.0, + "grad_norm": 2.5894895086667264, + "language_loss": 0.80832231, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88641047, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.533696174621582 + }, + { + "auxiliary_loss_clip": 0.06528741, + "auxiliary_loss_mlp": 0.01276684, + "balance_loss_clip": 0.06306656, + "balance_loss_mlp": 0.01259625, + "epoch": 0.24794829400270554, + "flos": 22599867179520.0, + "grad_norm": 2.45373622595604, + "language_loss": 0.75091624, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.82897043, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.1706543, + "step": 4124, + "time_per_iteration": 2.5478947162628174 + }, + { + "auxiliary_loss_clip": 0.06523614, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.06306844, + "balance_loss_mlp": 0.01258708, + "epoch": 0.2480084172553735, + "flos": 39685362433920.0, + "grad_norm": 1.4066224864196382, + "language_loss": 0.74510413, + "learning_rate": 3.521804257268357e-06, + "loss": 0.82310236, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.17504883, + "step": 4125, + "time_per_iteration": 4.164500951766968 + }, + { + "auxiliary_loss_clip": 0.06546921, + "auxiliary_loss_mlp": 0.01279637, + "balance_loss_clip": 0.06313127, + "balance_loss_mlp": 0.01260599, + "epoch": 0.24806854050804147, + "flos": 22060129104000.0, + "grad_norm": 1.9518521214536066, + "language_loss": 0.69807184, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.77633739, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.19030762, + "step": 4126, + "time_per_iteration": 2.520550489425659 + }, + { + "auxiliary_loss_clip": 0.06526291, + "auxiliary_loss_mlp": 0.0127589, + "balance_loss_clip": 0.06304894, + "balance_loss_mlp": 0.01257281, + "epoch": 0.24812866376070947, + "flos": 15492164739840.0, + "grad_norm": 2.6036079521490834, + "language_loss": 0.81805199, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.89607382, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.18615723, + "step": 4127, + "time_per_iteration": 4.052755832672119 + }, + { + "auxiliary_loss_clip": 0.06533966, + "auxiliary_loss_mlp": 0.012739, + "balance_loss_clip": 0.06306454, + "balance_loss_mlp": 0.01255494, + "epoch": 0.24818878701337743, + "flos": 14762758947840.0, + "grad_norm": 2.4130643839940746, + "language_loss": 0.85122234, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.92930102, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.18395996, + "step": 4128, + "time_per_iteration": 2.5801029205322266 + }, + { + "auxiliary_loss_clip": 0.06541854, + "auxiliary_loss_mlp": 0.01278965, + "balance_loss_clip": 0.06316209, + "balance_loss_mlp": 0.01260821, + "epoch": 0.2482489102660454, + "flos": 27096886356480.0, + "grad_norm": 2.0112959815575713, + "language_loss": 0.66149813, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.73970628, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.18151855, + "step": 4129, + "time_per_iteration": 2.5865726470947266 + }, + { + "auxiliary_loss_clip": 0.06528358, + "auxiliary_loss_mlp": 0.01276243, + "balance_loss_clip": 0.06307153, + "balance_loss_mlp": 0.01257444, + "epoch": 0.24830903351871336, + "flos": 26474522555520.0, + "grad_norm": 1.7021812681223303, + "language_loss": 0.75761282, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83565885, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18811035, + "step": 4130, + "time_per_iteration": 2.6659512519836426 + }, + { + "auxiliary_loss_clip": 0.06526491, + "auxiliary_loss_mlp": 0.01274514, + "balance_loss_clip": 0.06302534, + "balance_loss_mlp": 0.01255, + "epoch": 0.24836915677138133, + "flos": 10232225337600.0, + "grad_norm": 2.0871707802719004, + "language_loss": 0.77625716, + "learning_rate": 3.520286966670535e-06, + "loss": 0.85426718, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.19519043, + "step": 4131, + "time_per_iteration": 3.906522274017334 + }, + { + "auxiliary_loss_clip": 0.06519566, + "auxiliary_loss_mlp": 0.01270892, + "balance_loss_clip": 0.0630278, + "balance_loss_mlp": 0.01253582, + "epoch": 0.2484292800240493, + "flos": 30088162863360.0, + "grad_norm": 1.7622390062278706, + "language_loss": 0.84475207, + "learning_rate": 3.520033883075255e-06, + "loss": 0.92265671, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.17297363, + "step": 4132, + "time_per_iteration": 2.6436057090759277 + }, + { + "auxiliary_loss_clip": 0.06525066, + "auxiliary_loss_mlp": 0.01275924, + "balance_loss_clip": 0.06302708, + "balance_loss_mlp": 0.01256779, + "epoch": 0.24848940327671726, + "flos": 13447899878400.0, + "grad_norm": 1.545647189211169, + "language_loss": 0.71393758, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19152832, + "step": 4133, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.06542444, + "auxiliary_loss_mlp": 0.01275489, + "balance_loss_clip": 0.06309851, + "balance_loss_mlp": 0.01255116, + "epoch": 0.24854952652938525, + "flos": 19975683409920.0, + "grad_norm": 2.3352452144714513, + "language_loss": 0.6286931, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.70687246, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.20373535, + "step": 4134, + "time_per_iteration": 2.571525812149048 + }, + { + "auxiliary_loss_clip": 0.06524864, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06301688, + "balance_loss_mlp": 0.01258883, + "epoch": 0.24860964978205322, + "flos": 18156026217600.0, + "grad_norm": 1.960513817978903, + "language_loss": 0.79140246, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86942399, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18383789, + "step": 4135, + "time_per_iteration": 2.588916301727295 + }, + { + "auxiliary_loss_clip": 0.06524552, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06303368, + "balance_loss_mlp": 0.01256294, + "epoch": 0.24866977303472118, + "flos": 11733397960320.0, + "grad_norm": 2.2852251503119234, + "language_loss": 0.8410641, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.9190594, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18676758, + "step": 4136, + "time_per_iteration": 2.497654676437378 + }, + { + "auxiliary_loss_clip": 0.06524116, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06297501, + "balance_loss_mlp": 0.01254521, + "epoch": 0.24872989628738915, + "flos": 34832109623040.0, + "grad_norm": 1.7046352309858128, + "language_loss": 0.71601558, + "learning_rate": 3.518767600693314e-06, + "loss": 0.79399109, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.18908691, + "step": 4137, + "time_per_iteration": 2.732480764389038 + }, + { + "auxiliary_loss_clip": 0.06525281, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 0.06299166, + "balance_loss_mlp": 0.01255549, + "epoch": 0.2487900195400571, + "flos": 13704512607360.0, + "grad_norm": 2.5230361612400296, + "language_loss": 0.67583597, + "learning_rate": 3.518514171403042e-06, + "loss": 0.7538265, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.18212891, + "step": 4138, + "time_per_iteration": 2.531855583190918 + }, + { + "auxiliary_loss_clip": 0.06519014, + "auxiliary_loss_mlp": 0.01272692, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.01254501, + "epoch": 0.24885014279272508, + "flos": 25344845009280.0, + "grad_norm": 1.9341473695701388, + "language_loss": 0.83479851, + "learning_rate": 3.51826068453056e-06, + "loss": 0.91271555, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.18188477, + "step": 4139, + "time_per_iteration": 2.6051557064056396 + }, + { + "auxiliary_loss_clip": 0.06528804, + "auxiliary_loss_mlp": 0.01275882, + "balance_loss_clip": 0.06300579, + "balance_loss_mlp": 0.01255711, + "epoch": 0.24891026604539307, + "flos": 20637724919040.0, + "grad_norm": 1.6977646822397727, + "language_loss": 0.79297662, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20178223, + "step": 4140, + "time_per_iteration": 2.5448291301727295 + }, + { + "auxiliary_loss_clip": 0.0641291, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06303305, + "balance_loss_mlp": 0.0126555, + "epoch": 0.24897038929806103, + "flos": 66979086030720.0, + "grad_norm": 0.8107945435966392, + "language_loss": 0.60717231, + "learning_rate": 3.51775353807742e-06, + "loss": 0.68400407, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.04705811, + "step": 4141, + "time_per_iteration": 3.2685940265655518 + }, + { + "auxiliary_loss_clip": 0.06525983, + "auxiliary_loss_mlp": 0.01275717, + "balance_loss_clip": 0.06301422, + "balance_loss_mlp": 0.01256894, + "epoch": 0.249030512550729, + "flos": 36401359288320.0, + "grad_norm": 1.7802793710753735, + "language_loss": 0.72871864, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.80673563, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.18823242, + "step": 4142, + "time_per_iteration": 2.6564056873321533 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01276725, + "balance_loss_clip": 0.06302793, + "balance_loss_mlp": 0.0125789, + "epoch": 0.24909063580339696, + "flos": 20160361808640.0, + "grad_norm": 1.9535741137498925, + "language_loss": 0.81280798, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8908, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.18823242, + "step": 4143, + "time_per_iteration": 2.5795881748199463 + }, + { + "auxiliary_loss_clip": 0.06522508, + "auxiliary_loss_mlp": 0.01275624, + "balance_loss_clip": 0.06301625, + "balance_loss_mlp": 0.01257039, + "epoch": 0.24915075905606493, + "flos": 26403887963520.0, + "grad_norm": 1.964912825826696, + "language_loss": 0.59448719, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.67246854, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.18579102, + "step": 4144, + "time_per_iteration": 2.5888898372650146 + }, + { + "auxiliary_loss_clip": 0.06520054, + "auxiliary_loss_mlp": 0.01279478, + "balance_loss_clip": 0.06300642, + "balance_loss_mlp": 0.01260608, + "epoch": 0.2492108823087329, + "flos": 27534655612800.0, + "grad_norm": 2.2926576094039253, + "language_loss": 0.79198605, + "learning_rate": 3.516738554607708e-06, + "loss": 0.86998141, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18859863, + "step": 4145, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.06539698, + "auxiliary_loss_mlp": 0.01282889, + "balance_loss_clip": 0.06307465, + "balance_loss_mlp": 0.01262587, + "epoch": 0.24927100556140086, + "flos": 16697088852480.0, + "grad_norm": 2.388513156986414, + "language_loss": 0.65914291, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.73736882, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.20300293, + "step": 4146, + "time_per_iteration": 2.550225019454956 + }, + { + "auxiliary_loss_clip": 0.06418058, + "auxiliary_loss_mlp": 0.01257626, + "balance_loss_clip": 0.06307501, + "balance_loss_mlp": 0.01252389, + "epoch": 0.24933112881406885, + "flos": 62791899724800.0, + "grad_norm": 0.9255702942051489, + "language_loss": 0.67495543, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.75171226, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05239868, + "step": 4147, + "time_per_iteration": 3.2676596641540527 + }, + { + "auxiliary_loss_clip": 0.06525366, + "auxiliary_loss_mlp": 0.01281982, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01261764, + "epoch": 0.24939125206673682, + "flos": 26659242881280.0, + "grad_norm": 1.678024692441642, + "language_loss": 0.89250457, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.97057807, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.20214844, + "step": 4148, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06529681, + "auxiliary_loss_mlp": 0.01281757, + "balance_loss_clip": 0.06300169, + "balance_loss_mlp": 0.0125968, + "epoch": 0.24945137531940478, + "flos": 20710623571200.0, + "grad_norm": 1.8952521518004763, + "language_loss": 0.68350649, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.76162088, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.22070312, + "step": 4149, + "time_per_iteration": 2.52567720413208 + }, + { + "auxiliary_loss_clip": 0.06528307, + "auxiliary_loss_mlp": 0.0128627, + "balance_loss_clip": 0.06306647, + "balance_loss_mlp": 0.01266398, + "epoch": 0.24951149857207275, + "flos": 23775385708800.0, + "grad_norm": 1.639238516163445, + "language_loss": 0.71759897, + "learning_rate": 3.515468531258095e-06, + "loss": 0.79574472, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1986084, + "step": 4150, + "time_per_iteration": 2.580000877380371 + }, + { + "auxiliary_loss_clip": 0.06529218, + "auxiliary_loss_mlp": 0.01284871, + "balance_loss_clip": 0.06303831, + "balance_loss_mlp": 0.01264129, + "epoch": 0.2495716218247407, + "flos": 15669589760640.0, + "grad_norm": 1.939767404293352, + "language_loss": 0.73002028, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80816114, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.20739746, + "step": 4151, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.06534886, + "auxiliary_loss_mlp": 0.01281273, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01261055, + "epoch": 0.24963174507740868, + "flos": 24057924076800.0, + "grad_norm": 4.265592628376469, + "language_loss": 0.64070994, + "learning_rate": 3.514960119583781e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.20227051, + "step": 4152, + "time_per_iteration": 2.5687365531921387 + }, + { + "auxiliary_loss_clip": 0.06516105, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 0.06296911, + "balance_loss_mlp": 0.01259979, + "epoch": 0.24969186833007664, + "flos": 21806073924480.0, + "grad_norm": 2.335025994250793, + "language_loss": 0.7798419, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85780108, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19812012, + "step": 4153, + "time_per_iteration": 2.5565860271453857 + }, + { + "auxiliary_loss_clip": 0.06523906, + "auxiliary_loss_mlp": 0.01276939, + "balance_loss_clip": 0.06304043, + "balance_loss_mlp": 0.01257806, + "epoch": 0.24975199158274464, + "flos": 19944307255680.0, + "grad_norm": 2.3946475317027978, + "language_loss": 0.77287221, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19152832, + "step": 4154, + "time_per_iteration": 2.5327064990997314 + }, + { + "auxiliary_loss_clip": 0.06533594, + "auxiliary_loss_mlp": 0.0128089, + "balance_loss_clip": 0.06299926, + "balance_loss_mlp": 0.01258145, + "epoch": 0.2498121148354126, + "flos": 25345515841920.0, + "grad_norm": 1.7912237432514402, + "language_loss": 0.71052945, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78867429, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22766113, + "step": 4155, + "time_per_iteration": 2.566044330596924 + }, + { + "auxiliary_loss_clip": 0.06528749, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06300025, + "balance_loss_mlp": 0.01257809, + "epoch": 0.24987223808808057, + "flos": 20565119756160.0, + "grad_norm": 1.6974291352944781, + "language_loss": 0.75592315, + "learning_rate": 3.513942606943036e-06, + "loss": 0.83399028, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.20141602, + "step": 4156, + "time_per_iteration": 2.5388355255126953 + }, + { + "auxiliary_loss_clip": 0.06524897, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.0125842, + "epoch": 0.24993236134074853, + "flos": 19754052560640.0, + "grad_norm": 3.125892113983293, + "language_loss": 0.77757698, + "learning_rate": 3.513688085236591e-06, + "loss": 0.85561097, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.20068359, + "step": 4157, + "time_per_iteration": 2.5327329635620117 + }, + { + "auxiliary_loss_clip": 0.06527505, + "auxiliary_loss_mlp": 0.012775, + "balance_loss_clip": 0.06301083, + "balance_loss_mlp": 0.01257068, + "epoch": 0.2499924845934165, + "flos": 18776209812480.0, + "grad_norm": 1.8891569690037928, + "language_loss": 0.82203197, + "learning_rate": 3.513433506130942e-06, + "loss": 0.90008199, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20422363, + "step": 4158, + "time_per_iteration": 2.5894827842712402 + }, + { + "auxiliary_loss_clip": 0.06518973, + "auxiliary_loss_mlp": 0.01272913, + "balance_loss_clip": 0.06295922, + "balance_loss_mlp": 0.012544, + "epoch": 0.25005260784608446, + "flos": 16877658401280.0, + "grad_norm": 2.206587551308884, + "language_loss": 0.75718945, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.83510834, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18505859, + "step": 4159, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.06529576, + "auxiliary_loss_mlp": 0.01278956, + "balance_loss_clip": 0.06300279, + "balance_loss_mlp": 0.01258142, + "epoch": 0.2501127310987524, + "flos": 22131057185280.0, + "grad_norm": 2.1699031495969354, + "language_loss": 0.71598893, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7940743, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.20812988, + "step": 4160, + "time_per_iteration": 3.9746532440185547 + }, + { + "auxiliary_loss_clip": 0.06424317, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06313459, + "balance_loss_mlp": 0.01263326, + "epoch": 0.2501728543514204, + "flos": 69480071170560.0, + "grad_norm": 0.7438462037708533, + "language_loss": 0.56844532, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.64536446, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.04278564, + "step": 4161, + "time_per_iteration": 3.233760356903076 + }, + { + "auxiliary_loss_clip": 0.06530809, + "auxiliary_loss_mlp": 0.01282686, + "balance_loss_clip": 0.06298731, + "balance_loss_mlp": 0.01261848, + "epoch": 0.25023297760408836, + "flos": 16295601214080.0, + "grad_norm": 2.49700797922569, + "language_loss": 0.8179751, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.89611006, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.20849609, + "step": 4162, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.0652239, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 0.06294353, + "balance_loss_mlp": 0.01260358, + "epoch": 0.2502931008567563, + "flos": 12242598422400.0, + "grad_norm": 2.2503072324763616, + "language_loss": 0.88019562, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.95822597, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.203125, + "step": 4163, + "time_per_iteration": 2.531467914581299 + }, + { + "auxiliary_loss_clip": 0.06520548, + "auxiliary_loss_mlp": 0.01277405, + "balance_loss_clip": 0.06293885, + "balance_loss_mlp": 0.01257092, + "epoch": 0.25035322410942434, + "flos": 23188003787520.0, + "grad_norm": 1.6365124228332002, + "language_loss": 0.83867121, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91665077, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.20300293, + "step": 4164, + "time_per_iteration": 4.068189382553101 + }, + { + "auxiliary_loss_clip": 0.06509531, + "auxiliary_loss_mlp": 0.01280667, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01262106, + "epoch": 0.2504133473620923, + "flos": 20922904690560.0, + "grad_norm": 1.788160941639295, + "language_loss": 0.7460506, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.18579102, + "step": 4165, + "time_per_iteration": 2.568701982498169 + }, + { + "auxiliary_loss_clip": 0.06526586, + "auxiliary_loss_mlp": 0.01278077, + "balance_loss_clip": 0.06293961, + "balance_loss_mlp": 0.01257883, + "epoch": 0.2504734706147603, + "flos": 20782725609600.0, + "grad_norm": 1.8100288551258081, + "language_loss": 0.74429101, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.82233763, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2019043, + "step": 4166, + "time_per_iteration": 3.989368438720703 + }, + { + "auxiliary_loss_clip": 0.065147, + "auxiliary_loss_mlp": 0.01277163, + "balance_loss_clip": 0.06293219, + "balance_loss_mlp": 0.0125778, + "epoch": 0.25053359386742824, + "flos": 24355681960320.0, + "grad_norm": 1.5960764456675967, + "language_loss": 0.82469785, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.90261644, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19384766, + "step": 4167, + "time_per_iteration": 2.554733991622925 + }, + { + "auxiliary_loss_clip": 0.06513357, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 0.06292276, + "balance_loss_mlp": 0.01260614, + "epoch": 0.2505937171200962, + "flos": 21220578720000.0, + "grad_norm": 1.9887592956808484, + "language_loss": 0.80394876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.88188636, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19799805, + "step": 4168, + "time_per_iteration": 2.576969623565674 + }, + { + "auxiliary_loss_clip": 0.06531397, + "auxiliary_loss_mlp": 0.01277594, + "balance_loss_clip": 0.06300385, + "balance_loss_mlp": 0.01256196, + "epoch": 0.25065384037276417, + "flos": 41436816802560.0, + "grad_norm": 4.930314721126017, + "language_loss": 0.69985271, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7779426, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21386719, + "step": 4169, + "time_per_iteration": 2.709149122238159 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01277868, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.0125827, + "epoch": 0.25071396362543213, + "flos": 26109274608000.0, + "grad_norm": 1.904216953279787, + "language_loss": 0.77927327, + "learning_rate": 3.510374083241361e-06, + "loss": 0.85716957, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.19592285, + "step": 4170, + "time_per_iteration": 4.016170024871826 + }, + { + "auxiliary_loss_clip": 0.0651409, + "auxiliary_loss_mlp": 0.01278168, + "balance_loss_clip": 0.06291165, + "balance_loss_mlp": 0.01258975, + "epoch": 0.2507740868781001, + "flos": 19105008433920.0, + "grad_norm": 2.5077494433812966, + "language_loss": 0.76900339, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.1920166, + "step": 4171, + "time_per_iteration": 2.5651609897613525 + }, + { + "auxiliary_loss_clip": 0.06406491, + "auxiliary_loss_mlp": 0.01262132, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01257083, + "epoch": 0.25083421013076806, + "flos": 64361652514560.0, + "grad_norm": 0.8214086964760371, + "language_loss": 0.6006844, + "learning_rate": 3.509863377145458e-06, + "loss": 0.67737067, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05047607, + "step": 4172, + "time_per_iteration": 3.1837103366851807 + }, + { + "auxiliary_loss_clip": 0.06520402, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.012603, + "epoch": 0.25089433338343603, + "flos": 24286430960640.0, + "grad_norm": 1.3489665028935822, + "language_loss": 0.79424238, + "learning_rate": 3.509607938211409e-06, + "loss": 0.87225777, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.20849609, + "step": 4173, + "time_per_iteration": 2.6214826107025146 + }, + { + "auxiliary_loss_clip": 0.06513289, + "auxiliary_loss_mlp": 0.01273745, + "balance_loss_clip": 0.06291197, + "balance_loss_mlp": 0.01254398, + "epoch": 0.250954456636104, + "flos": 14726896600320.0, + "grad_norm": 1.8312177549547823, + "language_loss": 0.83930022, + "learning_rate": 3.509352442032875e-06, + "loss": 0.91717052, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.19360352, + "step": 4174, + "time_per_iteration": 2.5973377227783203 + }, + { + "auxiliary_loss_clip": 0.06519122, + "auxiliary_loss_mlp": 0.0127901, + "balance_loss_clip": 0.0629285, + "balance_loss_mlp": 0.01259341, + "epoch": 0.25101457988877196, + "flos": 22280208652800.0, + "grad_norm": 2.088546315652338, + "language_loss": 0.71558678, + "learning_rate": 3.509096888619545e-06, + "loss": 0.79356813, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.19665527, + "step": 4175, + "time_per_iteration": 2.6718719005584717 + }, + { + "auxiliary_loss_clip": 0.06522886, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06295571, + "balance_loss_mlp": 0.01256502, + "epoch": 0.2510747031414399, + "flos": 25195441979520.0, + "grad_norm": 1.9595604726907228, + "language_loss": 0.81335604, + "learning_rate": 3.50884127798111e-06, + "loss": 0.891361, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2109375, + "step": 4176, + "time_per_iteration": 2.5455691814422607 + }, + { + "auxiliary_loss_clip": 0.06515132, + "auxiliary_loss_mlp": 0.01279504, + "balance_loss_clip": 0.06292217, + "balance_loss_mlp": 0.01257319, + "epoch": 0.25113482639410795, + "flos": 20710455863040.0, + "grad_norm": 1.8805810902271358, + "language_loss": 0.83346581, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.91141224, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.22167969, + "step": 4177, + "time_per_iteration": 2.5471949577331543 + }, + { + "auxiliary_loss_clip": 0.06520942, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 0.06300486, + "balance_loss_mlp": 0.01256375, + "epoch": 0.2511949496467759, + "flos": 21513347285760.0, + "grad_norm": 2.081094632338002, + "language_loss": 0.83410418, + "learning_rate": 3.508329885067698e-06, + "loss": 0.91207987, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20251465, + "step": 4178, + "time_per_iteration": 2.5352370738983154 + }, + { + "auxiliary_loss_clip": 0.06514454, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06294617, + "balance_loss_mlp": 0.01255949, + "epoch": 0.2512550728994439, + "flos": 20707898313600.0, + "grad_norm": 2.160080340734635, + "language_loss": 0.75744665, + "learning_rate": 3.508074102812112e-06, + "loss": 0.83533603, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.18554688, + "step": 4179, + "time_per_iteration": 2.560995578765869 + }, + { + "auxiliary_loss_clip": 0.0652363, + "auxiliary_loss_mlp": 0.0128226, + "balance_loss_clip": 0.06298499, + "balance_loss_mlp": 0.01261053, + "epoch": 0.25131519615211184, + "flos": 18484531349760.0, + "grad_norm": 2.0850842878171347, + "language_loss": 0.70515448, + "learning_rate": 3.507818263370206e-06, + "loss": 0.78321338, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2121582, + "step": 4180, + "time_per_iteration": 2.510233163833618 + }, + { + "auxiliary_loss_clip": 0.06511761, + "auxiliary_loss_mlp": 0.01275296, + "balance_loss_clip": 0.06292045, + "balance_loss_mlp": 0.0125565, + "epoch": 0.2513753194047798, + "flos": 20491131000960.0, + "grad_norm": 1.8144815234901748, + "language_loss": 0.86591852, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.94378912, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19628906, + "step": 4181, + "time_per_iteration": 2.546736240386963 + }, + { + "auxiliary_loss_clip": 0.06519435, + "auxiliary_loss_mlp": 0.01276165, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01256555, + "epoch": 0.25143544265744777, + "flos": 37679182053120.0, + "grad_norm": 1.8572714108551465, + "language_loss": 0.68626046, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76421642, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19616699, + "step": 4182, + "time_per_iteration": 2.6632721424102783 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.0630056, + "balance_loss_mlp": 0.012679, + "epoch": 0.25149556591011574, + "flos": 69386502487680.0, + "grad_norm": 0.837431587640593, + "language_loss": 0.70118701, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.77799207, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.03881836, + "step": 4183, + "time_per_iteration": 3.194293737411499 + }, + { + "auxiliary_loss_clip": 0.0651418, + "auxiliary_loss_mlp": 0.01278526, + "balance_loss_clip": 0.06292195, + "balance_loss_mlp": 0.01258725, + "epoch": 0.2515556891627837, + "flos": 13995478310400.0, + "grad_norm": 2.4106350957321805, + "language_loss": 0.74627292, + "learning_rate": 3.506794333933431e-06, + "loss": 0.82419991, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.19799805, + "step": 4184, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.0652144, + "auxiliary_loss_mlp": 0.01279322, + "balance_loss_clip": 0.06299628, + "balance_loss_mlp": 0.01258496, + "epoch": 0.25161581241545167, + "flos": 22170022133760.0, + "grad_norm": 2.9216799071507964, + "language_loss": 0.83484751, + "learning_rate": 3.506538208705484e-06, + "loss": 0.91285515, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.20837402, + "step": 4185, + "time_per_iteration": 2.5535552501678467 + }, + { + "auxiliary_loss_clip": 0.06393237, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06284703, + "balance_loss_mlp": 0.01258632, + "epoch": 0.25167593566811963, + "flos": 69375936873600.0, + "grad_norm": 0.7619629684954553, + "language_loss": 0.61517715, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.69173163, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.03567505, + "step": 4186, + "time_per_iteration": 3.0749270915985107 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06296861, + "balance_loss_mlp": 0.01256946, + "epoch": 0.2517360589207876, + "flos": 13266533715840.0, + "grad_norm": 1.9855339768496567, + "language_loss": 0.79795682, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.87589443, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.1940918, + "step": 4187, + "time_per_iteration": 2.507354974746704 + }, + { + "auxiliary_loss_clip": 0.06517795, + "auxiliary_loss_mlp": 0.01276527, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01257001, + "epoch": 0.25179618217345556, + "flos": 20383208542080.0, + "grad_norm": 1.642205422551737, + "language_loss": 0.80147833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87942159, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19519043, + "step": 4188, + "time_per_iteration": 2.5763680934906006 + }, + { + "auxiliary_loss_clip": 0.06512115, + "auxiliary_loss_mlp": 0.01281194, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261405, + "epoch": 0.25185630542612353, + "flos": 27670767770880.0, + "grad_norm": 1.9118309511671905, + "language_loss": 0.75198257, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.8299157, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 4189, + "time_per_iteration": 2.5764901638031006 + }, + { + "auxiliary_loss_clip": 0.06511948, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06296545, + "balance_loss_mlp": 0.01255253, + "epoch": 0.25191642867879155, + "flos": 21002805158400.0, + "grad_norm": 1.9652552730181423, + "language_loss": 0.84938216, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92722976, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17565918, + "step": 4190, + "time_per_iteration": 2.592289447784424 + }, + { + "auxiliary_loss_clip": 0.06519347, + "auxiliary_loss_mlp": 0.01277887, + "balance_loss_clip": 0.0629743, + "balance_loss_mlp": 0.01256513, + "epoch": 0.2519765519314595, + "flos": 21112027355520.0, + "grad_norm": 3.618444667756858, + "language_loss": 0.7581113, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.83608365, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21386719, + "step": 4191, + "time_per_iteration": 2.526263952255249 + }, + { + "auxiliary_loss_clip": 0.06391463, + "auxiliary_loss_mlp": 0.01256383, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252372, + "epoch": 0.2520366751841275, + "flos": 62765932158720.0, + "grad_norm": 0.7119135795788611, + "language_loss": 0.56952, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.64599848, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0401001, + "step": 4192, + "time_per_iteration": 3.271810531616211 + }, + { + "auxiliary_loss_clip": 0.06513695, + "auxiliary_loss_mlp": 0.01277171, + "balance_loss_clip": 0.06298056, + "balance_loss_mlp": 0.01257835, + "epoch": 0.25209679843679544, + "flos": 22236254386560.0, + "grad_norm": 1.9003966807864532, + "language_loss": 0.77017993, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84808856, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19335938, + "step": 4193, + "time_per_iteration": 2.57377028465271 + }, + { + "auxiliary_loss_clip": 0.06516427, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06290127, + "balance_loss_mlp": 0.01254573, + "epoch": 0.2521569216894634, + "flos": 12171502632960.0, + "grad_norm": 10.029516736128722, + "language_loss": 0.84954166, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.92744958, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.19787598, + "step": 4194, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.06517825, + "auxiliary_loss_mlp": 0.01277837, + "balance_loss_clip": 0.06293463, + "balance_loss_mlp": 0.01258668, + "epoch": 0.2522170449421314, + "flos": 23707182885120.0, + "grad_norm": 1.454284137617771, + "language_loss": 0.88584, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.96379662, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.19165039, + "step": 4195, + "time_per_iteration": 2.576735734939575 + }, + { + "auxiliary_loss_clip": 0.06516481, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01258258, + "epoch": 0.25227716819479934, + "flos": 20961073025280.0, + "grad_norm": 2.023401186655312, + "language_loss": 0.86073804, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93870831, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.22290039, + "step": 4196, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.06519768, + "auxiliary_loss_mlp": 0.01277786, + "balance_loss_clip": 0.06297043, + "balance_loss_mlp": 0.01258486, + "epoch": 0.2523372914474673, + "flos": 23338077649920.0, + "grad_norm": 1.7735111095668046, + "language_loss": 0.8382597, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.91623521, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19299316, + "step": 4197, + "time_per_iteration": 2.606966018676758 + }, + { + "auxiliary_loss_clip": 0.06523669, + "auxiliary_loss_mlp": 0.01282022, + "balance_loss_clip": 0.06298, + "balance_loss_mlp": 0.01260898, + "epoch": 0.25239741470013527, + "flos": 36978217522560.0, + "grad_norm": 2.239450775339409, + "language_loss": 0.72922301, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.80727994, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.21130371, + "step": 4198, + "time_per_iteration": 2.6708526611328125 + }, + { + "auxiliary_loss_clip": 0.06527208, + "auxiliary_loss_mlp": 0.012804, + "balance_loss_clip": 0.06297485, + "balance_loss_mlp": 0.01258967, + "epoch": 0.25245753795280323, + "flos": 18521777289600.0, + "grad_norm": 2.0891954597653055, + "language_loss": 0.77475321, + "learning_rate": 3.50294646148888e-06, + "loss": 0.85282922, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.21447754, + "step": 4199, + "time_per_iteration": 3.9535269737243652 + }, + { + "auxiliary_loss_clip": 0.06522667, + "auxiliary_loss_mlp": 0.01277202, + "balance_loss_clip": 0.06296766, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2525176612054712, + "flos": 32353387741440.0, + "grad_norm": 1.7804914051128766, + "language_loss": 0.74169135, + "learning_rate": 3.502689480360739e-06, + "loss": 0.81969011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.19714355, + "step": 4200, + "time_per_iteration": 2.637592315673828 + }, + { + "auxiliary_loss_clip": 0.06517747, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06294595, + "balance_loss_mlp": 0.01255602, + "epoch": 0.25257778445813917, + "flos": 45268440307200.0, + "grad_norm": 1.5897560976370495, + "language_loss": 0.82704282, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.90497398, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19775391, + "step": 4201, + "time_per_iteration": 2.740555763244629 + }, + { + "auxiliary_loss_clip": 0.06520839, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.06295383, + "balance_loss_mlp": 0.01259048, + "epoch": 0.25263790771080713, + "flos": 23374526976000.0, + "grad_norm": 1.712909977397354, + "language_loss": 0.75193971, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20361328, + "step": 4202, + "time_per_iteration": 2.55350661277771 + }, + { + "auxiliary_loss_clip": 0.06512797, + "auxiliary_loss_mlp": 0.01277812, + "balance_loss_clip": 0.06294158, + "balance_loss_mlp": 0.01258226, + "epoch": 0.25269803096347515, + "flos": 18520938748800.0, + "grad_norm": 3.10045167794265, + "language_loss": 0.73924601, + "learning_rate": 3.501918195122491e-06, + "loss": 0.81715208, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19592285, + "step": 4203, + "time_per_iteration": 2.539475917816162 + }, + { + "auxiliary_loss_clip": 0.06523657, + "auxiliary_loss_mlp": 0.01272979, + "balance_loss_clip": 0.0629805, + "balance_loss_mlp": 0.01252964, + "epoch": 0.2527581542161431, + "flos": 24617870985600.0, + "grad_norm": 1.4931409888350198, + "language_loss": 0.78306639, + "learning_rate": 3.501660986124297e-06, + "loss": 0.86103272, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.20007324, + "step": 4204, + "time_per_iteration": 4.058368682861328 + }, + { + "auxiliary_loss_clip": 0.0651952, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06294288, + "balance_loss_mlp": 0.01258427, + "epoch": 0.2528182774688111, + "flos": 12646266266880.0, + "grad_norm": 2.5678524165435928, + "language_loss": 0.72629768, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.80427349, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.19616699, + "step": 4205, + "time_per_iteration": 2.503054618835449 + }, + { + "auxiliary_loss_clip": 0.06508891, + "auxiliary_loss_mlp": 0.01281235, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01264164, + "epoch": 0.25287840072147905, + "flos": 46947331440000.0, + "grad_norm": 1.3326329418173375, + "language_loss": 0.76355231, + "learning_rate": 3.50114639730826e-06, + "loss": 0.84145361, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.17077637, + "step": 4206, + "time_per_iteration": 4.097341537475586 + }, + { + "auxiliary_loss_clip": 0.06516857, + "auxiliary_loss_mlp": 0.01278993, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.0126042, + "epoch": 0.252938523974147, + "flos": 18885641644800.0, + "grad_norm": 1.8849973173990275, + "language_loss": 0.79775047, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.875709, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.18579102, + "step": 4207, + "time_per_iteration": 2.545203447341919 + }, + { + "auxiliary_loss_clip": 0.06511112, + "auxiliary_loss_mlp": 0.01280475, + "balance_loss_clip": 0.06293532, + "balance_loss_mlp": 0.01261628, + "epoch": 0.252998647226815, + "flos": 21441245247360.0, + "grad_norm": 1.449056492648579, + "language_loss": 0.76862776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84654361, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18859863, + "step": 4208, + "time_per_iteration": 2.540531873703003 + }, + { + "auxiliary_loss_clip": 0.06512551, + "auxiliary_loss_mlp": 0.01282266, + "balance_loss_clip": 0.06295963, + "balance_loss_mlp": 0.01264098, + "epoch": 0.25305877047948294, + "flos": 25448365128960.0, + "grad_norm": 1.8025422596027827, + "language_loss": 0.70108622, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77903438, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.1817627, + "step": 4209, + "time_per_iteration": 2.586179256439209 + }, + { + "auxiliary_loss_clip": 0.06401253, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06294125, + "balance_loss_mlp": 0.01251663, + "epoch": 0.2531188937321509, + "flos": 60205213457280.0, + "grad_norm": 0.7328516672129679, + "language_loss": 0.55096745, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.62754166, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.0451355, + "step": 4210, + "time_per_iteration": 4.676252841949463 + }, + { + "auxiliary_loss_clip": 0.06515378, + "auxiliary_loss_mlp": 0.01285614, + "balance_loss_clip": 0.06294395, + "balance_loss_mlp": 0.01265861, + "epoch": 0.25317901698481887, + "flos": 19688449213440.0, + "grad_norm": 2.0935195986224837, + "language_loss": 0.81166065, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.88967055, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.19763184, + "step": 4211, + "time_per_iteration": 2.5251474380493164 + }, + { + "auxiliary_loss_clip": 0.06513076, + "auxiliary_loss_mlp": 0.01275756, + "balance_loss_clip": 0.06299528, + "balance_loss_mlp": 0.01258554, + "epoch": 0.25323914023748684, + "flos": 24431012380800.0, + "grad_norm": 1.7184165713115493, + "language_loss": 0.78543985, + "learning_rate": 3.499601265005622e-06, + "loss": 0.86332822, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4212, + "time_per_iteration": 2.609750986099243 + }, + { + "auxiliary_loss_clip": 0.06514729, + "auxiliary_loss_mlp": 0.01278491, + "balance_loss_clip": 0.06293602, + "balance_loss_mlp": 0.0125912, + "epoch": 0.2532992634901548, + "flos": 25454528403840.0, + "grad_norm": 1.862422609084939, + "language_loss": 0.53407073, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.61200291, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19384766, + "step": 4213, + "time_per_iteration": 2.5825159549713135 + }, + { + "auxiliary_loss_clip": 0.06517738, + "auxiliary_loss_mlp": 0.01278881, + "balance_loss_clip": 0.06296406, + "balance_loss_mlp": 0.01259832, + "epoch": 0.25335938674282277, + "flos": 18886605966720.0, + "grad_norm": 2.428420926128805, + "language_loss": 0.65041012, + "learning_rate": 3.499085765880308e-06, + "loss": 0.72837627, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19030762, + "step": 4214, + "time_per_iteration": 2.567539930343628 + }, + { + "auxiliary_loss_clip": 0.06391697, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01253702, + "epoch": 0.25341950999549073, + "flos": 53079692025600.0, + "grad_norm": 0.8253897319773601, + "language_loss": 0.57886475, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.65535849, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.03970337, + "step": 4215, + "time_per_iteration": 2.941021680831909 + }, + { + "auxiliary_loss_clip": 0.06512114, + "auxiliary_loss_mlp": 0.01274398, + "balance_loss_clip": 0.0629489, + "balance_loss_mlp": 0.0125604, + "epoch": 0.2534796332481587, + "flos": 39029609980800.0, + "grad_norm": 1.6071125602920209, + "language_loss": 0.84078032, + "learning_rate": 3.498570039373066e-06, + "loss": 0.9186455, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18359375, + "step": 4216, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.06509562, + "auxiliary_loss_mlp": 0.0127764, + "balance_loss_clip": 0.06290903, + "balance_loss_mlp": 0.01259294, + "epoch": 0.2535397565008267, + "flos": 23593809911040.0, + "grad_norm": 1.7865601815504963, + "language_loss": 0.81036615, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88823819, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.18371582, + "step": 4217, + "time_per_iteration": 2.5606398582458496 + }, + { + "auxiliary_loss_clip": 0.06514265, + "auxiliary_loss_mlp": 0.01279769, + "balance_loss_clip": 0.06294704, + "balance_loss_mlp": 0.01260255, + "epoch": 0.2535998797534947, + "flos": 19287422772480.0, + "grad_norm": 2.529157470409933, + "language_loss": 0.761132, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.83907235, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19519043, + "step": 4218, + "time_per_iteration": 2.623429298400879 + }, + { + "auxiliary_loss_clip": 0.06516235, + "auxiliary_loss_mlp": 0.01282224, + "balance_loss_clip": 0.06296211, + "balance_loss_mlp": 0.01262757, + "epoch": 0.25366000300616265, + "flos": 24031201824000.0, + "grad_norm": 1.721807278316132, + "language_loss": 0.75063616, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.82862079, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19482422, + "step": 4219, + "time_per_iteration": 2.564220428466797 + }, + { + "auxiliary_loss_clip": 0.06520407, + "auxiliary_loss_mlp": 0.0127968, + "balance_loss_clip": 0.06298073, + "balance_loss_mlp": 0.01259713, + "epoch": 0.2537201262588306, + "flos": 16294888454400.0, + "grad_norm": 1.6804083546431516, + "language_loss": 0.81834626, + "learning_rate": 3.497537904525736e-06, + "loss": 0.89634717, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19970703, + "step": 4220, + "time_per_iteration": 2.576335906982422 + }, + { + "auxiliary_loss_clip": 0.0652357, + "auxiliary_loss_mlp": 0.01275521, + "balance_loss_clip": 0.06301299, + "balance_loss_mlp": 0.01256936, + "epoch": 0.2537802495114986, + "flos": 23301376761600.0, + "grad_norm": 2.4535775533256796, + "language_loss": 0.71752739, + "learning_rate": 3.497279728822468e-06, + "loss": 0.79551834, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.18579102, + "step": 4221, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.06528511, + "auxiliary_loss_mlp": 0.01279389, + "balance_loss_clip": 0.0630452, + "balance_loss_mlp": 0.01259148, + "epoch": 0.25384037276416654, + "flos": 17644855184640.0, + "grad_norm": 1.5017476973585115, + "language_loss": 0.62507772, + "learning_rate": 3.497021496342202e-06, + "loss": 0.70315671, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.20239258, + "step": 4222, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.06520825, + "auxiliary_loss_mlp": 0.01278393, + "balance_loss_clip": 0.06297866, + "balance_loss_mlp": 0.0125864, + "epoch": 0.2539004960168345, + "flos": 21513473066880.0, + "grad_norm": 1.6064438591236823, + "language_loss": 0.75066334, + "learning_rate": 3.496763207094731e-06, + "loss": 0.82865554, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.19763184, + "step": 4223, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.06514867, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06297616, + "balance_loss_mlp": 0.01260101, + "epoch": 0.2539606192695025, + "flos": 23957632339200.0, + "grad_norm": 1.753259760034452, + "language_loss": 0.80341679, + "learning_rate": 3.49650486108985e-06, + "loss": 0.88134897, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.18261719, + "step": 4224, + "time_per_iteration": 2.6002583503723145 + }, + { + "auxiliary_loss_clip": 0.06515887, + "auxiliary_loss_mlp": 0.01281311, + "balance_loss_clip": 0.0629767, + "balance_loss_mlp": 0.01261999, + "epoch": 0.25402074252217044, + "flos": 24176537930880.0, + "grad_norm": 1.4707313275482783, + "language_loss": 0.78211224, + "learning_rate": 3.496246458337354e-06, + "loss": 0.8600843, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19299316, + "step": 4225, + "time_per_iteration": 2.5527138710021973 + }, + { + "auxiliary_loss_clip": 0.06521728, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 0.06302264, + "balance_loss_mlp": 0.01263013, + "epoch": 0.2540808657748384, + "flos": 22309320746880.0, + "grad_norm": 1.6188569007516582, + "language_loss": 0.85543132, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.93347526, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.1965332, + "step": 4226, + "time_per_iteration": 2.5676872730255127 + }, + { + "auxiliary_loss_clip": 0.06515788, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 0.06296097, + "balance_loss_mlp": 0.01258883, + "epoch": 0.25414098902750637, + "flos": 27606883432320.0, + "grad_norm": 1.6805883261517605, + "language_loss": 0.71414381, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79207766, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18713379, + "step": 4227, + "time_per_iteration": 2.5918691158294678 + }, + { + "auxiliary_loss_clip": 0.06387169, + "auxiliary_loss_mlp": 0.01261576, + "balance_loss_clip": 0.06279954, + "balance_loss_mlp": 0.01257166, + "epoch": 0.25420111228017434, + "flos": 58188760951680.0, + "grad_norm": 0.9697801274632529, + "language_loss": 0.61857057, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.69505799, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04418945, + "step": 4228, + "time_per_iteration": 3.01169490814209 + }, + { + "auxiliary_loss_clip": 0.06514917, + "auxiliary_loss_mlp": 0.01279347, + "balance_loss_clip": 0.0629469, + "balance_loss_mlp": 0.01258235, + "epoch": 0.2542612355328423, + "flos": 11467645136640.0, + "grad_norm": 2.3876652287650577, + "language_loss": 0.8721081, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.95005071, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21130371, + "step": 4229, + "time_per_iteration": 2.5960769653320312 + }, + { + "auxiliary_loss_clip": 0.06519967, + "auxiliary_loss_mlp": 0.01277589, + "balance_loss_clip": 0.06299049, + "balance_loss_mlp": 0.01257836, + "epoch": 0.2543213587855103, + "flos": 22972452359040.0, + "grad_norm": 2.100172466954555, + "language_loss": 0.78119314, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85916877, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.19775391, + "step": 4230, + "time_per_iteration": 2.5483899116516113 + }, + { + "auxiliary_loss_clip": 0.06511904, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 0.06292608, + "balance_loss_mlp": 0.01257622, + "epoch": 0.2543814820381783, + "flos": 18257953109760.0, + "grad_norm": 2.00545114565419, + "language_loss": 0.75687885, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.83477509, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4231, + "time_per_iteration": 2.566326379776001 + }, + { + "auxiliary_loss_clip": 0.06520282, + "auxiliary_loss_mlp": 0.01278584, + "balance_loss_clip": 0.06300422, + "balance_loss_mlp": 0.01259761, + "epoch": 0.25444160529084625, + "flos": 15638129752320.0, + "grad_norm": 1.7887257039808522, + "language_loss": 0.74637282, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82436144, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18823242, + "step": 4232, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0652221, + "auxiliary_loss_mlp": 0.01293975, + "balance_loss_clip": 0.06303085, + "balance_loss_mlp": 0.01272947, + "epoch": 0.2545017285435142, + "flos": 24607431152640.0, + "grad_norm": 1.8617746927090988, + "language_loss": 0.87183899, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.95000088, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21032715, + "step": 4233, + "time_per_iteration": 2.6281485557556152 + }, + { + "auxiliary_loss_clip": 0.06505871, + "auxiliary_loss_mlp": 0.01278753, + "balance_loss_clip": 0.06294682, + "balance_loss_mlp": 0.01260442, + "epoch": 0.2545618517961822, + "flos": 24685654538880.0, + "grad_norm": 1.601433299567329, + "language_loss": 0.75604707, + "learning_rate": 3.493918281539737e-06, + "loss": 0.8338933, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18322754, + "step": 4234, + "time_per_iteration": 2.596642017364502 + }, + { + "auxiliary_loss_clip": 0.06514844, + "auxiliary_loss_mlp": 0.01287463, + "balance_loss_clip": 0.06292339, + "balance_loss_mlp": 0.01268938, + "epoch": 0.25462197504885015, + "flos": 23921937699840.0, + "grad_norm": 1.4560099290474922, + "language_loss": 0.75372213, + "learning_rate": 3.493659311850379e-06, + "loss": 0.83174521, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.18518066, + "step": 4235, + "time_per_iteration": 2.592942953109741 + }, + { + "auxiliary_loss_clip": 0.06532556, + "auxiliary_loss_mlp": 0.01283911, + "balance_loss_clip": 0.06299181, + "balance_loss_mlp": 0.01261797, + "epoch": 0.2546820983015181, + "flos": 24796134547200.0, + "grad_norm": 1.9414760170646592, + "language_loss": 0.65519691, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.73336154, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.22131348, + "step": 4236, + "time_per_iteration": 2.5583407878875732 + }, + { + "auxiliary_loss_clip": 0.06512251, + "auxiliary_loss_mlp": 0.01281938, + "balance_loss_clip": 0.06294776, + "balance_loss_mlp": 0.01262984, + "epoch": 0.2547422215541861, + "flos": 18740095902720.0, + "grad_norm": 1.5016735811799797, + "language_loss": 0.678509, + "learning_rate": 3.493141202562354e-06, + "loss": 0.75645095, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18945312, + "step": 4237, + "time_per_iteration": 2.5650389194488525 + }, + { + "auxiliary_loss_clip": 0.0651492, + "auxiliary_loss_mlp": 0.01282053, + "balance_loss_clip": 0.06293051, + "balance_loss_mlp": 0.01261394, + "epoch": 0.25480234480685404, + "flos": 21038751360000.0, + "grad_norm": 2.061881611294133, + "language_loss": 0.75628269, + "learning_rate": 3.492882062983333e-06, + "loss": 0.83425242, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20654297, + "step": 4238, + "time_per_iteration": 2.529883861541748 + }, + { + "auxiliary_loss_clip": 0.06513957, + "auxiliary_loss_mlp": 0.0127785, + "balance_loss_clip": 0.06292559, + "balance_loss_mlp": 0.01258287, + "epoch": 0.254862468059522, + "flos": 25089112748160.0, + "grad_norm": 1.8905919191970875, + "language_loss": 0.81253731, + "learning_rate": 3.492622866794074e-06, + "loss": 0.89045537, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.19555664, + "step": 4239, + "time_per_iteration": 4.02100944519043 + }, + { + "auxiliary_loss_clip": 0.06508629, + "auxiliary_loss_mlp": 0.01294237, + "balance_loss_clip": 0.06291452, + "balance_loss_mlp": 0.01273471, + "epoch": 0.25492259131219, + "flos": 20564658558720.0, + "grad_norm": 1.7183169382614727, + "language_loss": 0.7800405, + "learning_rate": 3.492363614004407e-06, + "loss": 0.85806918, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2076416, + "step": 4240, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.06515411, + "auxiliary_loss_mlp": 0.01282684, + "balance_loss_clip": 0.06290809, + "balance_loss_mlp": 0.01262037, + "epoch": 0.25498271456485794, + "flos": 25048889988480.0, + "grad_norm": 1.7684080721058644, + "language_loss": 0.83764112, + "learning_rate": 3.492104304624162e-06, + "loss": 0.915622, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.20629883, + "step": 4241, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.06511963, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 0.06292334, + "balance_loss_mlp": 0.01262676, + "epoch": 0.2550428378175259, + "flos": 26185820912640.0, + "grad_norm": 1.7847215082139707, + "language_loss": 0.73873413, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.81667781, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4242, + "time_per_iteration": 2.6289515495300293 + }, + { + "auxiliary_loss_clip": 0.06517772, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 0.06296564, + "balance_loss_mlp": 0.01260398, + "epoch": 0.2551029610701939, + "flos": 15272420607360.0, + "grad_norm": 2.4567533637161896, + "language_loss": 0.72771823, + "learning_rate": 3.491585516131273e-06, + "loss": 0.80569565, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19567871, + "step": 4243, + "time_per_iteration": 3.9432499408721924 + }, + { + "auxiliary_loss_clip": 0.06515735, + "auxiliary_loss_mlp": 0.0127996, + "balance_loss_clip": 0.06295779, + "balance_loss_mlp": 0.01260195, + "epoch": 0.2551630843228619, + "flos": 18117774028800.0, + "grad_norm": 1.7474968125895491, + "language_loss": 0.82239074, + "learning_rate": 3.491326037038301e-06, + "loss": 0.90034771, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.19750977, + "step": 4244, + "time_per_iteration": 2.6024672985076904 + }, + { + "auxiliary_loss_clip": 0.06397872, + "auxiliary_loss_mlp": 0.01258297, + "balance_loss_clip": 0.06291912, + "balance_loss_mlp": 0.01253388, + "epoch": 0.25522320757552985, + "flos": 70543055266560.0, + "grad_norm": 0.6771353060664416, + "language_loss": 0.57579219, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.65235388, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04904175, + "step": 4245, + "time_per_iteration": 4.687421083450317 + }, + { + "auxiliary_loss_clip": 0.06516664, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 0.06290803, + "balance_loss_mlp": 0.01256628, + "epoch": 0.2552833308281978, + "flos": 22899679488000.0, + "grad_norm": 2.827648139992037, + "language_loss": 0.65781415, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20593262, + "step": 4246, + "time_per_iteration": 2.542945384979248 + }, + { + "auxiliary_loss_clip": 0.06504452, + "auxiliary_loss_mlp": 0.01278422, + "balance_loss_clip": 0.06290503, + "balance_loss_mlp": 0.01258455, + "epoch": 0.2553434540808658, + "flos": 22060003322880.0, + "grad_norm": 2.2137811054544003, + "language_loss": 0.82470047, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.90252924, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19970703, + "step": 4247, + "time_per_iteration": 2.5786685943603516 + }, + { + "auxiliary_loss_clip": 0.06521233, + "auxiliary_loss_mlp": 0.01279993, + "balance_loss_clip": 0.062906, + "balance_loss_mlp": 0.01257271, + "epoch": 0.25540357733353375, + "flos": 16549656393600.0, + "grad_norm": 2.135954108256579, + "language_loss": 0.83991635, + "learning_rate": 3.490287555252514e-06, + "loss": 0.91792852, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.22729492, + "step": 4248, + "time_per_iteration": 2.5408127307891846 + }, + { + "auxiliary_loss_clip": 0.06511332, + "auxiliary_loss_mlp": 0.01273979, + "balance_loss_clip": 0.062884, + "balance_loss_mlp": 0.01253773, + "epoch": 0.2554637005862017, + "flos": 17570531013120.0, + "grad_norm": 2.3193810219262585, + "language_loss": 0.84631854, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92417163, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.20202637, + "step": 4249, + "time_per_iteration": 4.003984212875366 + }, + { + "auxiliary_loss_clip": 0.06380495, + "auxiliary_loss_mlp": 0.01253384, + "balance_loss_clip": 0.06276014, + "balance_loss_mlp": 0.01249388, + "epoch": 0.2555238238388697, + "flos": 72263441698560.0, + "grad_norm": 0.7365466774710785, + "language_loss": 0.56168175, + "learning_rate": 3.489767975249115e-06, + "loss": 0.63802058, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03994751, + "step": 4250, + "time_per_iteration": 3.169614553451538 + }, + { + "auxiliary_loss_clip": 0.06511974, + "auxiliary_loss_mlp": 0.01277356, + "balance_loss_clip": 0.06289789, + "balance_loss_mlp": 0.01255433, + "epoch": 0.25558394709153764, + "flos": 24396323990400.0, + "grad_norm": 2.4378887831258527, + "language_loss": 0.81129342, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.88918668, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.21936035, + "step": 4251, + "time_per_iteration": 2.576631784439087 + }, + { + "auxiliary_loss_clip": 0.06382731, + "auxiliary_loss_mlp": 0.01258719, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01254794, + "epoch": 0.2556440703442056, + "flos": 69251857776000.0, + "grad_norm": 0.7756464213587903, + "language_loss": 0.66132653, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.73774105, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.03921509, + "step": 4252, + "time_per_iteration": 3.2080140113830566 + }, + { + "auxiliary_loss_clip": 0.06505658, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 0.06288829, + "balance_loss_mlp": 0.01255922, + "epoch": 0.2557041935968736, + "flos": 24870919916160.0, + "grad_norm": 1.8769862610793295, + "language_loss": 0.74028432, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.81808746, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18737793, + "step": 4253, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.06509089, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.01261746, + "epoch": 0.25576431684954154, + "flos": 22498694974080.0, + "grad_norm": 4.507455095580577, + "language_loss": 0.742535, + "learning_rate": 3.488728137415357e-06, + "loss": 0.82045132, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20800781, + "step": 4254, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.0651402, + "auxiliary_loss_mlp": 0.0127796, + "balance_loss_clip": 0.06292839, + "balance_loss_mlp": 0.01257253, + "epoch": 0.2558244401022095, + "flos": 19832569436160.0, + "grad_norm": 1.7853658258569405, + "language_loss": 0.81599152, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20703125, + "step": 4255, + "time_per_iteration": 2.5198400020599365 + }, + { + "auxiliary_loss_clip": 0.06507239, + "auxiliary_loss_mlp": 0.01282593, + "balance_loss_clip": 0.06290218, + "balance_loss_mlp": 0.01262304, + "epoch": 0.2558845633548775, + "flos": 23226968736000.0, + "grad_norm": 1.3889535500711463, + "language_loss": 0.85781598, + "learning_rate": 3.488207879742721e-06, + "loss": 0.93571424, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20275879, + "step": 4256, + "time_per_iteration": 2.6466193199157715 + }, + { + "auxiliary_loss_clip": 0.06518268, + "auxiliary_loss_mlp": 0.01279996, + "balance_loss_clip": 0.06292354, + "balance_loss_mlp": 0.01259432, + "epoch": 0.2559446866075455, + "flos": 16843682770560.0, + "grad_norm": 2.0395659723156814, + "language_loss": 0.75505483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.83303738, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20556641, + "step": 4257, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.06380453, + "auxiliary_loss_mlp": 0.01254162, + "balance_loss_clip": 0.06277193, + "balance_loss_mlp": 0.01249772, + "epoch": 0.25600480986021346, + "flos": 57612741258240.0, + "grad_norm": 0.7838298602570629, + "language_loss": 0.65205377, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.72839993, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04397583, + "step": 4258, + "time_per_iteration": 3.1310055255889893 + }, + { + "auxiliary_loss_clip": 0.06504042, + "auxiliary_loss_mlp": 0.01278745, + "balance_loss_clip": 0.06291071, + "balance_loss_mlp": 0.01257192, + "epoch": 0.2560649331128814, + "flos": 27827088762240.0, + "grad_norm": 1.6413095395992356, + "language_loss": 0.76769841, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2154541, + "step": 4259, + "time_per_iteration": 2.6200387477874756 + }, + { + "auxiliary_loss_clip": 0.06386054, + "auxiliary_loss_mlp": 0.01255029, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01250824, + "epoch": 0.2561250563655494, + "flos": 70972187552640.0, + "grad_norm": 0.7732791072218576, + "language_loss": 0.58378285, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.66019368, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04208374, + "step": 4260, + "time_per_iteration": 3.2671031951904297 + }, + { + "auxiliary_loss_clip": 0.06510498, + "auxiliary_loss_mlp": 0.01277826, + "balance_loss_clip": 0.06290598, + "balance_loss_mlp": 0.0125824, + "epoch": 0.25618517961821735, + "flos": 27018998386560.0, + "grad_norm": 1.6762593333812295, + "language_loss": 0.77063274, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84851599, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19580078, + "step": 4261, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.06510883, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 0.06293076, + "balance_loss_mlp": 0.01261534, + "epoch": 0.2562453028708853, + "flos": 23073708418560.0, + "grad_norm": 1.5026397479094624, + "language_loss": 0.83196223, + "learning_rate": 3.486645752648842e-06, + "loss": 0.90988725, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20080566, + "step": 4262, + "time_per_iteration": 2.606386661529541 + }, + { + "auxiliary_loss_clip": 0.06520962, + "auxiliary_loss_mlp": 0.01278022, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.0125778, + "epoch": 0.2563054261235533, + "flos": 15126120178560.0, + "grad_norm": 2.976746783245639, + "language_loss": 0.7460134, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.82400322, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.20239258, + "step": 4263, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.06511976, + "auxiliary_loss_mlp": 0.01275308, + "balance_loss_clip": 0.0629802, + "balance_loss_mlp": 0.01256008, + "epoch": 0.25636554937622125, + "flos": 27862238350080.0, + "grad_norm": 1.7189236473805392, + "language_loss": 0.83209884, + "learning_rate": 3.486124592522163e-06, + "loss": 0.90997171, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19299316, + "step": 4264, + "time_per_iteration": 2.5768978595733643 + }, + { + "auxiliary_loss_clip": 0.06522107, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06300539, + "balance_loss_mlp": 0.01255403, + "epoch": 0.2564256726288892, + "flos": 28912979750400.0, + "grad_norm": 2.7518222985569247, + "language_loss": 0.75264466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.20202637, + "step": 4265, + "time_per_iteration": 2.6022770404815674 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01276084, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01256701, + "epoch": 0.2564857958815572, + "flos": 18520812967680.0, + "grad_norm": 2.7205564726060754, + "language_loss": 0.82059085, + "learning_rate": 3.485603206979513e-06, + "loss": 0.89849925, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19396973, + "step": 4266, + "time_per_iteration": 2.5768039226531982 + }, + { + "auxiliary_loss_clip": 0.06513181, + "auxiliary_loss_mlp": 0.01282165, + "balance_loss_clip": 0.06295994, + "balance_loss_mlp": 0.01263199, + "epoch": 0.25654591913422514, + "flos": 25814745106560.0, + "grad_norm": 2.256505464235654, + "language_loss": 0.79590619, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8738597, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.1895752, + "step": 4267, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.06512932, + "auxiliary_loss_mlp": 0.01282882, + "balance_loss_clip": 0.06302384, + "balance_loss_mlp": 0.01263439, + "epoch": 0.2566060423868931, + "flos": 19105805047680.0, + "grad_norm": 1.7450924080459818, + "language_loss": 0.79543281, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.87339091, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19421387, + "step": 4268, + "time_per_iteration": 2.532245635986328 + }, + { + "auxiliary_loss_clip": 0.06515032, + "auxiliary_loss_mlp": 0.01281336, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01261166, + "epoch": 0.25666616563956113, + "flos": 23849584099200.0, + "grad_norm": 1.6329297187056233, + "language_loss": 0.69106698, + "learning_rate": 3.484820706183595e-06, + "loss": 0.76903057, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.20153809, + "step": 4269, + "time_per_iteration": 2.7064032554626465 + }, + { + "auxiliary_loss_clip": 0.06520134, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 0.06299803, + "balance_loss_mlp": 0.01259016, + "epoch": 0.2567262888922291, + "flos": 14608366600320.0, + "grad_norm": 2.976489070793836, + "language_loss": 0.79361498, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8716023, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19580078, + "step": 4270, + "time_per_iteration": 2.5247366428375244 + }, + { + "auxiliary_loss_clip": 0.06528008, + "auxiliary_loss_mlp": 0.01281711, + "balance_loss_clip": 0.0630113, + "balance_loss_mlp": 0.0125899, + "epoch": 0.25678641214489706, + "flos": 32930791027200.0, + "grad_norm": 2.0785991894062104, + "language_loss": 0.68438745, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.76248461, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.22717285, + "step": 4271, + "time_per_iteration": 2.6327364444732666 + }, + { + "auxiliary_loss_clip": 0.06521121, + "auxiliary_loss_mlp": 0.01277495, + "balance_loss_clip": 0.06297284, + "balance_loss_mlp": 0.01256395, + "epoch": 0.256846535397565, + "flos": 24106029120000.0, + "grad_norm": 1.3298745054932861, + "language_loss": 0.87827712, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.9562633, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2109375, + "step": 4272, + "time_per_iteration": 2.5886576175689697 + }, + { + "auxiliary_loss_clip": 0.06520741, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 0.06299604, + "balance_loss_mlp": 0.01256204, + "epoch": 0.256906658650233, + "flos": 19724437342080.0, + "grad_norm": 1.6471317846086577, + "language_loss": 0.8228811, + "learning_rate": 3.483776583571541e-06, + "loss": 0.90087312, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.22253418, + "step": 4273, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.06513067, + "auxiliary_loss_mlp": 0.0127658, + "balance_loss_clip": 0.06299708, + "balance_loss_mlp": 0.01257638, + "epoch": 0.25696678190290095, + "flos": 22932019964160.0, + "grad_norm": 1.4706338186359442, + "language_loss": 0.77439249, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.85228896, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18933105, + "step": 4274, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06508841, + "auxiliary_loss_mlp": 0.01274973, + "balance_loss_clip": 0.06295496, + "balance_loss_mlp": 0.0125435, + "epoch": 0.2570269051555689, + "flos": 27315163042560.0, + "grad_norm": 1.5809391622925344, + "language_loss": 0.84101403, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91885215, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20629883, + "step": 4275, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.0652002, + "auxiliary_loss_mlp": 0.01273541, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01252965, + "epoch": 0.2570870284082369, + "flos": 27570811449600.0, + "grad_norm": 2.3295240533415016, + "language_loss": 0.78590673, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.86384231, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4276, + "time_per_iteration": 2.631866216659546 + }, + { + "auxiliary_loss_clip": 0.06515533, + "auxiliary_loss_mlp": 0.01279943, + "balance_loss_clip": 0.06298599, + "balance_loss_mlp": 0.01260237, + "epoch": 0.25714715166090485, + "flos": 28738405768320.0, + "grad_norm": 1.6396366021430353, + "language_loss": 0.79803967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8759945, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19714355, + "step": 4277, + "time_per_iteration": 2.5990161895751953 + }, + { + "auxiliary_loss_clip": 0.06513472, + "auxiliary_loss_mlp": 0.01272259, + "balance_loss_clip": 0.06296529, + "balance_loss_mlp": 0.01254377, + "epoch": 0.2572072749135728, + "flos": 20121606495360.0, + "grad_norm": 1.9596681746733369, + "language_loss": 0.78998482, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17883301, + "step": 4278, + "time_per_iteration": 4.02304744720459 + }, + { + "auxiliary_loss_clip": 0.06522302, + "auxiliary_loss_mlp": 0.01278536, + "balance_loss_clip": 0.06301469, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2572673981662408, + "flos": 26037969183360.0, + "grad_norm": 2.3063853220673067, + "language_loss": 0.75400203, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83201039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21618652, + "step": 4279, + "time_per_iteration": 2.5523123741149902 + }, + { + "auxiliary_loss_clip": 0.06516609, + "auxiliary_loss_mlp": 0.0128394, + "balance_loss_clip": 0.06297271, + "balance_loss_mlp": 0.01262721, + "epoch": 0.25732752141890874, + "flos": 16112054845440.0, + "grad_norm": 3.423283610494841, + "language_loss": 0.85997081, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.9379763, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2121582, + "step": 4280, + "time_per_iteration": 2.5104546546936035 + }, + { + "auxiliary_loss_clip": 0.06517641, + "auxiliary_loss_mlp": 0.01282108, + "balance_loss_clip": 0.06295675, + "balance_loss_mlp": 0.0126133, + "epoch": 0.2573876446715767, + "flos": 22530322690560.0, + "grad_norm": 2.5830483171875955, + "language_loss": 0.78735828, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.86535579, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20788574, + "step": 4281, + "time_per_iteration": 2.511723279953003 + }, + { + "auxiliary_loss_clip": 0.06512952, + "auxiliary_loss_mlp": 0.01285256, + "balance_loss_clip": 0.06294534, + "balance_loss_mlp": 0.01264048, + "epoch": 0.2574477679242447, + "flos": 23957548485120.0, + "grad_norm": 1.8266556980022217, + "language_loss": 0.87782013, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.9558022, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.21203613, + "step": 4282, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06509817, + "auxiliary_loss_mlp": 0.0128236, + "balance_loss_clip": 0.06291438, + "balance_loss_mlp": 0.01262905, + "epoch": 0.2575078911769127, + "flos": 21988278627840.0, + "grad_norm": 1.3881538001933933, + "language_loss": 0.71042287, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.78834462, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.19470215, + "step": 4283, + "time_per_iteration": 3.9826109409332275 + }, + { + "auxiliary_loss_clip": 0.06500088, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 0.06290558, + "balance_loss_mlp": 0.01271051, + "epoch": 0.25756801442958066, + "flos": 21951997009920.0, + "grad_norm": 1.9398744879334104, + "language_loss": 0.80991805, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.88781703, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18762207, + "step": 4284, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.06508928, + "auxiliary_loss_mlp": 0.01294414, + "balance_loss_clip": 0.06291771, + "balance_loss_mlp": 0.01274923, + "epoch": 0.2576281376822486, + "flos": 35270675493120.0, + "grad_norm": 2.158245566426343, + "language_loss": 0.70814562, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.78617907, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19494629, + "step": 4285, + "time_per_iteration": 4.088344097137451 + }, + { + "auxiliary_loss_clip": 0.06504595, + "auxiliary_loss_mlp": 0.0128171, + "balance_loss_clip": 0.06288387, + "balance_loss_mlp": 0.01262505, + "epoch": 0.2576882609349166, + "flos": 14136705567360.0, + "grad_norm": 1.771877130646751, + "language_loss": 0.58818436, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.66604745, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.1920166, + "step": 4286, + "time_per_iteration": 2.5344176292419434 + }, + { + "auxiliary_loss_clip": 0.0650837, + "auxiliary_loss_mlp": 0.01278621, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01259118, + "epoch": 0.25774838418758456, + "flos": 23265053216640.0, + "grad_norm": 2.057811055203196, + "language_loss": 0.6464054, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72427529, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.19494629, + "step": 4287, + "time_per_iteration": 2.5958328247070312 + }, + { + "auxiliary_loss_clip": 0.0650748, + "auxiliary_loss_mlp": 0.01286721, + "balance_loss_clip": 0.06287187, + "balance_loss_mlp": 0.01265824, + "epoch": 0.2578085074402525, + "flos": 22608378368640.0, + "grad_norm": 1.9946373780944937, + "language_loss": 0.7222265, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.80016851, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2088623, + "step": 4288, + "time_per_iteration": 2.5767109394073486 + }, + { + "auxiliary_loss_clip": 0.06504134, + "auxiliary_loss_mlp": 0.01288175, + "balance_loss_clip": 0.06288374, + "balance_loss_mlp": 0.01268851, + "epoch": 0.2578686306929205, + "flos": 24578780256000.0, + "grad_norm": 1.4737569046844996, + "language_loss": 0.77657092, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.85449398, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1932373, + "step": 4289, + "time_per_iteration": 3.9734480381011963 + }, + { + "auxiliary_loss_clip": 0.0651005, + "auxiliary_loss_mlp": 0.01285951, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.012641, + "epoch": 0.25792875394558845, + "flos": 18119828453760.0, + "grad_norm": 2.192134211179858, + "language_loss": 0.8580482, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.93600821, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.21838379, + "step": 4290, + "time_per_iteration": 2.5564229488372803 + }, + { + "auxiliary_loss_clip": 0.0651224, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 0.06293762, + "balance_loss_mlp": 0.01263573, + "epoch": 0.2579888771982564, + "flos": 17718760085760.0, + "grad_norm": 2.0247866667145344, + "language_loss": 0.73390263, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81186378, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.20300293, + "step": 4291, + "time_per_iteration": 2.497671365737915 + }, + { + "auxiliary_loss_clip": 0.06508101, + "auxiliary_loss_mlp": 0.01275245, + "balance_loss_clip": 0.06287237, + "balance_loss_mlp": 0.01255647, + "epoch": 0.2580490004509244, + "flos": 16440350342400.0, + "grad_norm": 2.23272675200871, + "language_loss": 0.82139969, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.8992331, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.19604492, + "step": 4292, + "time_per_iteration": 2.5467498302459717 + }, + { + "auxiliary_loss_clip": 0.06505652, + "auxiliary_loss_mlp": 0.01282583, + "balance_loss_clip": 0.06289525, + "balance_loss_mlp": 0.01262532, + "epoch": 0.25810912370359235, + "flos": 33842946574080.0, + "grad_norm": 1.9023591833174374, + "language_loss": 0.67644775, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.7543301, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20043945, + "step": 4293, + "time_per_iteration": 2.626880168914795 + }, + { + "auxiliary_loss_clip": 0.06507371, + "auxiliary_loss_mlp": 0.01275889, + "balance_loss_clip": 0.06295517, + "balance_loss_mlp": 0.01257244, + "epoch": 0.2581692469562603, + "flos": 25199257340160.0, + "grad_norm": 2.9603548878770387, + "language_loss": 0.76158464, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83941722, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18640137, + "step": 4294, + "time_per_iteration": 2.5711581707000732 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277058, + "balance_loss_clip": 0.06293358, + "balance_loss_mlp": 0.01257866, + "epoch": 0.2582293702089283, + "flos": 34940619060480.0, + "grad_norm": 2.382767918587226, + "language_loss": 0.81769538, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8955487, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1920166, + "step": 4295, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.06519823, + "auxiliary_loss_mlp": 0.01276702, + "balance_loss_clip": 0.06295969, + "balance_loss_mlp": 0.01256496, + "epoch": 0.2582894934615963, + "flos": 26841028314240.0, + "grad_norm": 1.964012337767824, + "language_loss": 0.72949934, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80746454, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.20214844, + "step": 4296, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06514452, + "auxiliary_loss_mlp": 0.01277621, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.0125732, + "epoch": 0.25834961671426426, + "flos": 23522252924160.0, + "grad_norm": 1.7245670135783875, + "language_loss": 0.87440747, + "learning_rate": 3.477492965085067e-06, + "loss": 0.95232815, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20288086, + "step": 4297, + "time_per_iteration": 2.5871896743774414 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01260558, + "epoch": 0.25840973996693223, + "flos": 22456837059840.0, + "grad_norm": 2.9037965134923076, + "language_loss": 0.84894854, + "learning_rate": 3.477230446361943e-06, + "loss": 0.9268465, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.18469238, + "step": 4298, + "time_per_iteration": 2.5290613174438477 + }, + { + "auxiliary_loss_clip": 0.06510766, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 0.06292143, + "balance_loss_mlp": 0.01256158, + "epoch": 0.2584698632196002, + "flos": 11295544849920.0, + "grad_norm": 2.12928453409433, + "language_loss": 0.83727312, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.91514087, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.1986084, + "step": 4299, + "time_per_iteration": 2.5314571857452393 + }, + { + "auxiliary_loss_clip": 0.06506392, + "auxiliary_loss_mlp": 0.01272204, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01253214, + "epoch": 0.25852998647226816, + "flos": 17935569325440.0, + "grad_norm": 2.08690605682093, + "language_loss": 0.83303946, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91082543, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18981934, + "step": 4300, + "time_per_iteration": 2.494170904159546 + }, + { + "auxiliary_loss_clip": 0.06507458, + "auxiliary_loss_mlp": 0.01272704, + "balance_loss_clip": 0.06287713, + "balance_loss_mlp": 0.01254012, + "epoch": 0.2585901097249361, + "flos": 33264620893440.0, + "grad_norm": 3.3706811216639307, + "language_loss": 0.67941749, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.75721914, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18688965, + "step": 4301, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.06512292, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 0.06289004, + "balance_loss_mlp": 0.01257009, + "epoch": 0.2586502329776041, + "flos": 18447033847680.0, + "grad_norm": 2.7819934823512282, + "language_loss": 0.83073664, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.90861952, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.18994141, + "step": 4302, + "time_per_iteration": 2.5102365016937256 + }, + { + "auxiliary_loss_clip": 0.06508462, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 0.06292115, + "balance_loss_mlp": 0.01257358, + "epoch": 0.25871035623027205, + "flos": 17973989222400.0, + "grad_norm": 1.7107484291097332, + "language_loss": 0.91874599, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99659652, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.19238281, + "step": 4303, + "time_per_iteration": 2.5386602878570557 + }, + { + "auxiliary_loss_clip": 0.06508803, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.0629281, + "balance_loss_mlp": 0.01258569, + "epoch": 0.25877047948294, + "flos": 27784392307200.0, + "grad_norm": 1.7938003883067368, + "language_loss": 0.67601281, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75387681, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19018555, + "step": 4304, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06507856, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 0.06286401, + "balance_loss_mlp": 0.01257477, + "epoch": 0.258830602735608, + "flos": 27133209901440.0, + "grad_norm": 2.1929382614593242, + "language_loss": 0.73436916, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.1940918, + "step": 4305, + "time_per_iteration": 2.5877888202667236 + }, + { + "auxiliary_loss_clip": 0.06515621, + "auxiliary_loss_mlp": 0.01276889, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01255992, + "epoch": 0.25889072598827595, + "flos": 17896730158080.0, + "grad_norm": 1.8662067033328453, + "language_loss": 0.76418924, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.84211433, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20898438, + "step": 4306, + "time_per_iteration": 2.482933282852173 + }, + { + "auxiliary_loss_clip": 0.06403579, + "auxiliary_loss_mlp": 0.01258203, + "balance_loss_clip": 0.06296976, + "balance_loss_mlp": 0.01253566, + "epoch": 0.2589508492409439, + "flos": 53951582885760.0, + "grad_norm": 0.8023409981232837, + "language_loss": 0.56592381, + "learning_rate": 3.474865258296403e-06, + "loss": 0.64254159, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.04629517, + "step": 4307, + "time_per_iteration": 3.1265084743499756 + }, + { + "auxiliary_loss_clip": 0.06500413, + "auxiliary_loss_mlp": 0.0127407, + "balance_loss_clip": 0.06289256, + "balance_loss_mlp": 0.01256105, + "epoch": 0.2590109724936119, + "flos": 22132063434240.0, + "grad_norm": 1.735104377472534, + "language_loss": 0.71851504, + "learning_rate": 3.474602179854327e-06, + "loss": 0.79625988, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17956543, + "step": 4308, + "time_per_iteration": 2.5442304611206055 + }, + { + "auxiliary_loss_clip": 0.06513858, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01258993, + "epoch": 0.2590710957462799, + "flos": 13478395564800.0, + "grad_norm": 2.8033587428294657, + "language_loss": 0.84278727, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.92071199, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.19628906, + "step": 4309, + "time_per_iteration": 2.546034336090088 + }, + { + "auxiliary_loss_clip": 0.06504438, + "auxiliary_loss_mlp": 0.01276588, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.01258814, + "epoch": 0.25913121899894787, + "flos": 22313219961600.0, + "grad_norm": 1.5400127324827177, + "language_loss": 0.84972912, + "learning_rate": 3.474075855228966e-06, + "loss": 0.92753935, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.17785645, + "step": 4310, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.06511362, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.06293052, + "balance_loss_mlp": 0.01254533, + "epoch": 0.25919134225161583, + "flos": 25818770102400.0, + "grad_norm": 1.8118221315599161, + "language_loss": 0.78088975, + "learning_rate": 3.473812609065639e-06, + "loss": 0.85874081, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19213867, + "step": 4311, + "time_per_iteration": 2.6044604778289795 + }, + { + "auxiliary_loss_clip": 0.06511068, + "auxiliary_loss_mlp": 0.01275144, + "balance_loss_clip": 0.06293963, + "balance_loss_mlp": 0.01256666, + "epoch": 0.2592514655042838, + "flos": 31220314104960.0, + "grad_norm": 4.381167674093932, + "language_loss": 0.73062587, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.80848801, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18469238, + "step": 4312, + "time_per_iteration": 2.587942600250244 + }, + { + "auxiliary_loss_clip": 0.06508243, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 0.06291987, + "balance_loss_mlp": 0.012569, + "epoch": 0.25931158875695176, + "flos": 18480296718720.0, + "grad_norm": 1.7543304647253515, + "language_loss": 0.70305753, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78089976, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.19091797, + "step": 4313, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.06508952, + "auxiliary_loss_mlp": 0.01278616, + "balance_loss_clip": 0.06293979, + "balance_loss_mlp": 0.0125971, + "epoch": 0.2593717120096197, + "flos": 19213895214720.0, + "grad_norm": 1.751562510714179, + "language_loss": 0.81158572, + "learning_rate": 3.473022535292867e-06, + "loss": 0.8894614, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.18908691, + "step": 4314, + "time_per_iteration": 2.5584335327148438 + }, + { + "auxiliary_loss_clip": 0.06515148, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 0.06292658, + "balance_loss_mlp": 0.01257359, + "epoch": 0.2594318352622877, + "flos": 31256050671360.0, + "grad_norm": 1.9178095473181331, + "language_loss": 0.67283171, + "learning_rate": 3.472759065640968e-06, + "loss": 0.7507664, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.20959473, + "step": 4315, + "time_per_iteration": 2.6295278072357178 + }, + { + "auxiliary_loss_clip": 0.06506292, + "auxiliary_loss_mlp": 0.01277654, + "balance_loss_clip": 0.06292329, + "balance_loss_mlp": 0.01259463, + "epoch": 0.25949195851495566, + "flos": 22243759326720.0, + "grad_norm": 1.412764147956583, + "language_loss": 0.80242419, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.88026369, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18212891, + "step": 4316, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.06510989, + "auxiliary_loss_mlp": 0.01277583, + "balance_loss_clip": 0.06290686, + "balance_loss_mlp": 0.01256781, + "epoch": 0.2595520817676236, + "flos": 28083449928960.0, + "grad_norm": 1.6660208675023864, + "language_loss": 0.78127223, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.85915792, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20812988, + "step": 4317, + "time_per_iteration": 2.6210665702819824 + }, + { + "auxiliary_loss_clip": 0.06507257, + "auxiliary_loss_mlp": 0.01281581, + "balance_loss_clip": 0.06291957, + "balance_loss_mlp": 0.01262054, + "epoch": 0.2596122050202916, + "flos": 20196727280640.0, + "grad_norm": 2.4040812102587377, + "language_loss": 0.78420109, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.86208946, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19519043, + "step": 4318, + "time_per_iteration": 3.9600155353546143 + }, + { + "auxiliary_loss_clip": 0.06505568, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 0.06290057, + "balance_loss_mlp": 0.01256637, + "epoch": 0.25967232827295955, + "flos": 22534431540480.0, + "grad_norm": 2.66294558684285, + "language_loss": 0.77022719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.84805143, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20227051, + "step": 4319, + "time_per_iteration": 2.544752836227417 + }, + { + "auxiliary_loss_clip": 0.0650554, + "auxiliary_loss_mlp": 0.01280509, + "balance_loss_clip": 0.06290743, + "balance_loss_mlp": 0.01261555, + "epoch": 0.2597324515256275, + "flos": 21074445999360.0, + "grad_norm": 1.7925219732685136, + "language_loss": 0.77426791, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.85212845, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.18945312, + "step": 4320, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.06508496, + "auxiliary_loss_mlp": 0.01273671, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01254634, + "epoch": 0.2597925747782955, + "flos": 22055810618880.0, + "grad_norm": 1.593385908573569, + "language_loss": 0.71533716, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79315877, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19042969, + "step": 4321, + "time_per_iteration": 2.5314829349517822 + }, + { + "auxiliary_loss_clip": 0.0650996, + "auxiliary_loss_mlp": 0.01274348, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01254011, + "epoch": 0.2598526980309635, + "flos": 19543071179520.0, + "grad_norm": 2.282331155451991, + "language_loss": 0.75262189, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.83046496, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.20336914, + "step": 4322, + "time_per_iteration": 2.525724411010742 + }, + { + "auxiliary_loss_clip": 0.06509394, + "auxiliary_loss_mlp": 0.01275417, + "balance_loss_clip": 0.06289983, + "balance_loss_mlp": 0.0125533, + "epoch": 0.25991282128363147, + "flos": 24501521191680.0, + "grad_norm": 2.623736611083137, + "language_loss": 0.7442928, + "learning_rate": 3.470649298767278e-06, + "loss": 0.82214087, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.20092773, + "step": 4323, + "time_per_iteration": 3.957674026489258 + }, + { + "auxiliary_loss_clip": 0.06515582, + "auxiliary_loss_mlp": 0.01279409, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01258893, + "epoch": 0.25997294453629943, + "flos": 24207410960640.0, + "grad_norm": 1.7976461796423409, + "language_loss": 0.68052149, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.75847143, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.20495605, + "step": 4324, + "time_per_iteration": 4.001135349273682 + }, + { + "auxiliary_loss_clip": 0.06505544, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01254608, + "epoch": 0.2600330677889674, + "flos": 31439597040000.0, + "grad_norm": 1.7946989584541546, + "language_loss": 0.71402133, + "learning_rate": 3.470121299177082e-06, + "loss": 0.79180264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1796875, + "step": 4325, + "time_per_iteration": 2.6213603019714355 + }, + { + "auxiliary_loss_clip": 0.06501837, + "auxiliary_loss_mlp": 0.01274613, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01255004, + "epoch": 0.26009319104163536, + "flos": 32274116179200.0, + "grad_norm": 1.826124228611905, + "language_loss": 0.73262805, + "learning_rate": 3.469857215756257e-06, + "loss": 0.81039256, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4326, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.06500994, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06288173, + "balance_loss_mlp": 0.01258051, + "epoch": 0.26015331429430333, + "flos": 26293994933760.0, + "grad_norm": 1.858424121782002, + "language_loss": 0.8722446, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.95002341, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18835449, + "step": 4327, + "time_per_iteration": 2.5950510501861572 + }, + { + "auxiliary_loss_clip": 0.06508228, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06287643, + "balance_loss_mlp": 0.01254271, + "epoch": 0.2602134375469713, + "flos": 21148728243840.0, + "grad_norm": 1.765295937421399, + "language_loss": 0.8100785, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.88790172, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.19824219, + "step": 4328, + "time_per_iteration": 3.923682928085327 + }, + { + "auxiliary_loss_clip": 0.06502862, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01258704, + "epoch": 0.26027356079963926, + "flos": 25928411569920.0, + "grad_norm": 1.3948699622732248, + "language_loss": 0.88172936, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.95952845, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.18347168, + "step": 4329, + "time_per_iteration": 2.5685267448425293 + }, + { + "auxiliary_loss_clip": 0.06502585, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.0125327, + "epoch": 0.2603336840523072, + "flos": 26366390461440.0, + "grad_norm": 1.8811175805050973, + "language_loss": 0.77705932, + "learning_rate": 3.468800324801802e-06, + "loss": 0.85479975, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18188477, + "step": 4330, + "time_per_iteration": 2.6185224056243896 + }, + { + "auxiliary_loss_clip": 0.06508863, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 0.06289242, + "balance_loss_mlp": 0.0125826, + "epoch": 0.2603938073049752, + "flos": 23520408134400.0, + "grad_norm": 1.5596482888270802, + "language_loss": 0.76200908, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.8398701, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18981934, + "step": 4331, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.06507871, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06292268, + "balance_loss_mlp": 0.01254527, + "epoch": 0.26045393055764315, + "flos": 25381336262400.0, + "grad_norm": 1.426884348550376, + "language_loss": 0.69540298, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.77320385, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.17700195, + "step": 4332, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.06511752, + "auxiliary_loss_mlp": 0.01275479, + "balance_loss_clip": 0.0629351, + "balance_loss_mlp": 0.0125693, + "epoch": 0.2605140538103111, + "flos": 27642494217600.0, + "grad_norm": 1.8844860211449586, + "language_loss": 0.79951644, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.87738872, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.1854248, + "step": 4333, + "time_per_iteration": 2.5523998737335205 + }, + { + "auxiliary_loss_clip": 0.06501235, + "auxiliary_loss_mlp": 0.01272154, + "balance_loss_clip": 0.06290703, + "balance_loss_mlp": 0.01254714, + "epoch": 0.2605741770629791, + "flos": 13774602147840.0, + "grad_norm": 1.6726919145500945, + "language_loss": 0.81128466, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8890186, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.17443848, + "step": 4334, + "time_per_iteration": 2.522210121154785 + }, + { + "auxiliary_loss_clip": 0.06510483, + "auxiliary_loss_mlp": 0.01278802, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01259859, + "epoch": 0.26063430031564705, + "flos": 26038933505280.0, + "grad_norm": 1.7438742011205015, + "language_loss": 0.80170292, + "learning_rate": 3.46747795800024e-06, + "loss": 0.87959582, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18945312, + "step": 4335, + "time_per_iteration": 2.582817792892456 + }, + { + "auxiliary_loss_clip": 0.06403506, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 0.06297, + "balance_loss_mlp": 0.01252544, + "epoch": 0.26069442356831507, + "flos": 62463143030400.0, + "grad_norm": 0.8284851894367303, + "language_loss": 0.60816151, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6847688, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04672241, + "step": 4336, + "time_per_iteration": 3.2036406993865967 + }, + { + "auxiliary_loss_clip": 0.0651319, + "auxiliary_loss_mlp": 0.0127574, + "balance_loss_clip": 0.06294517, + "balance_loss_mlp": 0.01257405, + "epoch": 0.26075454682098304, + "flos": 13631530101120.0, + "grad_norm": 1.8662385080657846, + "language_loss": 0.78028893, + "learning_rate": 3.46694862168102e-06, + "loss": 0.85817826, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18322754, + "step": 4337, + "time_per_iteration": 2.4899747371673584 + }, + { + "auxiliary_loss_clip": 0.06515083, + "auxiliary_loss_mlp": 0.01276173, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01256289, + "epoch": 0.260814670073651, + "flos": 12130776748800.0, + "grad_norm": 2.165940638299647, + "language_loss": 0.74851859, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.82643116, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.19897461, + "step": 4338, + "time_per_iteration": 2.5323259830474854 + }, + { + "auxiliary_loss_clip": 0.06522977, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 0.0629933, + "balance_loss_mlp": 0.01255039, + "epoch": 0.26087479332631897, + "flos": 15127964968320.0, + "grad_norm": 2.9662822483112388, + "language_loss": 0.81419933, + "learning_rate": 3.466419062854447e-06, + "loss": 0.89217252, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.19287109, + "step": 4339, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.06514673, + "auxiliary_loss_mlp": 0.0127648, + "balance_loss_clip": 0.06300991, + "balance_loss_mlp": 0.01259278, + "epoch": 0.26093491657898693, + "flos": 24687834744960.0, + "grad_norm": 1.5467473582016638, + "language_loss": 0.77106607, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.84897768, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17199707, + "step": 4340, + "time_per_iteration": 2.570777416229248 + }, + { + "auxiliary_loss_clip": 0.06513949, + "auxiliary_loss_mlp": 0.01274956, + "balance_loss_clip": 0.062961, + "balance_loss_mlp": 0.01255788, + "epoch": 0.2609950398316549, + "flos": 25122669108480.0, + "grad_norm": 1.4533527138525517, + "language_loss": 0.82740015, + "learning_rate": 3.465889281600845e-06, + "loss": 0.90528917, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19177246, + "step": 4341, + "time_per_iteration": 2.5946342945098877 + }, + { + "auxiliary_loss_clip": 0.06519589, + "auxiliary_loss_mlp": 0.01282035, + "balance_loss_clip": 0.06303687, + "balance_loss_mlp": 0.01261794, + "epoch": 0.26105516308432286, + "flos": 28556159137920.0, + "grad_norm": 1.7858700463590271, + "language_loss": 0.77163744, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84965372, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20251465, + "step": 4342, + "time_per_iteration": 2.5742342472076416 + }, + { + "auxiliary_loss_clip": 0.06521034, + "auxiliary_loss_mlp": 0.01277248, + "balance_loss_clip": 0.06303718, + "balance_loss_mlp": 0.01258115, + "epoch": 0.2611152863369908, + "flos": 39539984400000.0, + "grad_norm": 1.7100835603344944, + "language_loss": 0.66681403, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.74479687, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19140625, + "step": 4343, + "time_per_iteration": 2.662271738052368 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01280109, + "balance_loss_clip": 0.0630408, + "balance_loss_mlp": 0.01261917, + "epoch": 0.2611754095896588, + "flos": 13740416881920.0, + "grad_norm": 1.8127929734390111, + "language_loss": 0.74220115, + "learning_rate": 3.465094192845553e-06, + "loss": 0.82024956, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.18200684, + "step": 4344, + "time_per_iteration": 2.5201361179351807 + }, + { + "auxiliary_loss_clip": 0.06524797, + "auxiliary_loss_mlp": 0.01284917, + "balance_loss_clip": 0.06307752, + "balance_loss_mlp": 0.01264484, + "epoch": 0.26123553284232676, + "flos": 21513011869440.0, + "grad_norm": 2.1854473316742338, + "language_loss": 0.8696478, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94774491, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20422363, + "step": 4345, + "time_per_iteration": 2.510000228881836 + }, + { + "auxiliary_loss_clip": 0.06521724, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 0.06307776, + "balance_loss_mlp": 0.01258293, + "epoch": 0.2612956560949947, + "flos": 21145751424000.0, + "grad_norm": 2.0739898036059095, + "language_loss": 0.76897335, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.18371582, + "step": 4346, + "time_per_iteration": 2.5322000980377197 + }, + { + "auxiliary_loss_clip": 0.06522055, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06305227, + "balance_loss_mlp": 0.01256911, + "epoch": 0.2613557793476627, + "flos": 25126023271680.0, + "grad_norm": 1.5562871556893731, + "language_loss": 0.76140273, + "learning_rate": 3.464298604081606e-06, + "loss": 0.83937496, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.18249512, + "step": 4347, + "time_per_iteration": 2.557077169418335 + }, + { + "auxiliary_loss_clip": 0.06522661, + "auxiliary_loss_mlp": 0.01286127, + "balance_loss_clip": 0.06307539, + "balance_loss_mlp": 0.01267208, + "epoch": 0.26141590260033065, + "flos": 26074879706880.0, + "grad_norm": 1.3369896368920637, + "language_loss": 0.7377249, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81581283, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.18920898, + "step": 4348, + "time_per_iteration": 2.5915603637695312 + }, + { + "auxiliary_loss_clip": 0.06527912, + "auxiliary_loss_mlp": 0.01280562, + "balance_loss_clip": 0.06309946, + "balance_loss_mlp": 0.01260881, + "epoch": 0.2614760258529987, + "flos": 25708415875200.0, + "grad_norm": 1.876318754691465, + "language_loss": 0.9123491, + "learning_rate": 3.463767933923799e-06, + "loss": 0.99043381, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19689941, + "step": 4349, + "time_per_iteration": 2.594332218170166 + }, + { + "auxiliary_loss_clip": 0.06524529, + "auxiliary_loss_mlp": 0.01276126, + "balance_loss_clip": 0.0631379, + "balance_loss_mlp": 0.01256695, + "epoch": 0.26153614910566664, + "flos": 17462902043520.0, + "grad_norm": 1.601755901803269, + "language_loss": 0.80459869, + "learning_rate": 3.463502515580524e-06, + "loss": 0.8826052, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.19433594, + "step": 4350, + "time_per_iteration": 2.509274482727051 + }, + { + "auxiliary_loss_clip": 0.06520928, + "auxiliary_loss_mlp": 0.01277683, + "balance_loss_clip": 0.0631097, + "balance_loss_mlp": 0.01259063, + "epoch": 0.2615962723583346, + "flos": 17718676231680.0, + "grad_norm": 1.8928977658247819, + "language_loss": 0.62482548, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.7028116, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18615723, + "step": 4351, + "time_per_iteration": 2.522862672805786 + }, + { + "auxiliary_loss_clip": 0.06526107, + "auxiliary_loss_mlp": 0.01278827, + "balance_loss_clip": 0.06308405, + "balance_loss_mlp": 0.01259396, + "epoch": 0.26165639561100257, + "flos": 23264340456960.0, + "grad_norm": 2.4783042039829546, + "language_loss": 0.84264326, + "learning_rate": 3.462971512415555e-06, + "loss": 0.92069256, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19445801, + "step": 4352, + "time_per_iteration": 2.5326311588287354 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01261209, + "balance_loss_clip": 0.06294002, + "balance_loss_mlp": 0.01256817, + "epoch": 0.26171651886367053, + "flos": 66756155443200.0, + "grad_norm": 0.7669563885543124, + "language_loss": 0.7057451, + "learning_rate": 3.462705927613996e-06, + "loss": 0.78234154, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04397583, + "step": 4353, + "time_per_iteration": 3.093543529510498 + }, + { + "auxiliary_loss_clip": 0.06517833, + "auxiliary_loss_mlp": 0.01279039, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01259619, + "epoch": 0.2617766421163385, + "flos": 22356713030400.0, + "grad_norm": 1.943198757771125, + "language_loss": 0.77770078, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8556695, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19433594, + "step": 4354, + "time_per_iteration": 2.5782573223114014 + }, + { + "auxiliary_loss_clip": 0.06522856, + "auxiliary_loss_mlp": 0.01279183, + "balance_loss_clip": 0.06303968, + "balance_loss_mlp": 0.01259907, + "epoch": 0.26183676536900646, + "flos": 26074208874240.0, + "grad_norm": 2.16382169558429, + "language_loss": 0.68941987, + "learning_rate": 3.462174591623085e-06, + "loss": 0.7674402, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.19274902, + "step": 4355, + "time_per_iteration": 2.608482599258423 + }, + { + "auxiliary_loss_clip": 0.06517249, + "auxiliary_loss_mlp": 0.01282478, + "balance_loss_clip": 0.06301509, + "balance_loss_mlp": 0.01260889, + "epoch": 0.26189688862167443, + "flos": 21002847085440.0, + "grad_norm": 2.1598133279644554, + "language_loss": 0.68533909, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.76333642, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.21594238, + "step": 4356, + "time_per_iteration": 2.526376247406006 + }, + { + "auxiliary_loss_clip": 0.06398848, + "auxiliary_loss_mlp": 0.01254107, + "balance_loss_clip": 0.06295048, + "balance_loss_mlp": 0.01249723, + "epoch": 0.2619570118743424, + "flos": 65817780768000.0, + "grad_norm": 0.6753767209108164, + "language_loss": 0.5316326, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.60816211, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04391479, + "step": 4357, + "time_per_iteration": 4.58653450012207 + }, + { + "auxiliary_loss_clip": 0.065238, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 0.06302594, + "balance_loss_mlp": 0.01261183, + "epoch": 0.26201713512701036, + "flos": 28774310042880.0, + "grad_norm": 1.9589657113609436, + "language_loss": 0.85308599, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.93112528, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.18933105, + "step": 4358, + "time_per_iteration": 2.65427303314209 + }, + { + "auxiliary_loss_clip": 0.0652793, + "auxiliary_loss_mlp": 0.0127535, + "balance_loss_clip": 0.06300082, + "balance_loss_mlp": 0.01254917, + "epoch": 0.2620772583796783, + "flos": 26439750311040.0, + "grad_norm": 2.2013035586341663, + "language_loss": 0.68206531, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7600981, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.20422363, + "step": 4359, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.06517753, + "auxiliary_loss_mlp": 0.01278599, + "balance_loss_clip": 0.06299832, + "balance_loss_mlp": 0.01258763, + "epoch": 0.2621373816323463, + "flos": 20162667795840.0, + "grad_norm": 1.9413360196767273, + "language_loss": 0.7857362, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.86369967, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.19848633, + "step": 4360, + "time_per_iteration": 2.5442395210266113 + }, + { + "auxiliary_loss_clip": 0.06513859, + "auxiliary_loss_mlp": 0.01282389, + "balance_loss_clip": 0.06305451, + "balance_loss_mlp": 0.01262839, + "epoch": 0.26219750488501425, + "flos": 28628764300800.0, + "grad_norm": 1.9016418571028826, + "language_loss": 0.68632245, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.76428491, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.19519043, + "step": 4361, + "time_per_iteration": 2.5506739616394043 + }, + { + "auxiliary_loss_clip": 0.0652248, + "auxiliary_loss_mlp": 0.01277506, + "balance_loss_clip": 0.06302515, + "balance_loss_mlp": 0.01256298, + "epoch": 0.2622576281376823, + "flos": 15046806689280.0, + "grad_norm": 1.72568625675014, + "language_loss": 0.84433615, + "learning_rate": 3.46031316964119e-06, + "loss": 0.92233592, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.21179199, + "step": 4362, + "time_per_iteration": 3.9455041885375977 + }, + { + "auxiliary_loss_clip": 0.06516212, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06303745, + "balance_loss_mlp": 0.01254914, + "epoch": 0.26231775139035024, + "flos": 26403426766080.0, + "grad_norm": 1.7310155723144771, + "language_loss": 0.65182602, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.72972858, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19140625, + "step": 4363, + "time_per_iteration": 2.5710229873657227 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01270336, + "balance_loss_clip": 0.06313097, + "balance_loss_mlp": 0.01263804, + "epoch": 0.2623778746430182, + "flos": 65430380615040.0, + "grad_norm": 0.9022976396731897, + "language_loss": 0.61189461, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.68877506, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06542969, + "step": 4364, + "time_per_iteration": 4.728578805923462 + }, + { + "auxiliary_loss_clip": 0.06528256, + "auxiliary_loss_mlp": 0.01280703, + "balance_loss_clip": 0.06308191, + "balance_loss_mlp": 0.01260402, + "epoch": 0.26243799789568617, + "flos": 12609104181120.0, + "grad_norm": 2.531531320883944, + "language_loss": 0.72247571, + "learning_rate": 3.459514586533184e-06, + "loss": 0.80056524, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20300293, + "step": 4365, + "time_per_iteration": 2.5567469596862793 + }, + { + "auxiliary_loss_clip": 0.06519997, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06307054, + "balance_loss_mlp": 0.01257146, + "epoch": 0.26249812114835414, + "flos": 28631783047680.0, + "grad_norm": 1.7351756990107399, + "language_loss": 0.78023124, + "learning_rate": 3.459248281460509e-06, + "loss": 0.85819209, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18945312, + "step": 4366, + "time_per_iteration": 2.6212668418884277 + }, + { + "auxiliary_loss_clip": 0.06522524, + "auxiliary_loss_mlp": 0.01276459, + "balance_loss_clip": 0.06305946, + "balance_loss_mlp": 0.01258351, + "epoch": 0.2625582444010221, + "flos": 14470661214720.0, + "grad_norm": 1.579355851615032, + "language_loss": 0.77007079, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.84806067, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18103027, + "step": 4367, + "time_per_iteration": 2.602072238922119 + }, + { + "auxiliary_loss_clip": 0.06517363, + "auxiliary_loss_mlp": 0.01271186, + "balance_loss_clip": 0.06304537, + "balance_loss_mlp": 0.01253471, + "epoch": 0.26261836765369007, + "flos": 16617984998400.0, + "grad_norm": 1.5269013949985815, + "language_loss": 0.70157337, + "learning_rate": 3.458715505320736e-06, + "loss": 0.77945888, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.17700195, + "step": 4368, + "time_per_iteration": 4.012764930725098 + }, + { + "auxiliary_loss_clip": 0.06516206, + "auxiliary_loss_mlp": 0.01278713, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01256635, + "epoch": 0.26267849090635803, + "flos": 20525861318400.0, + "grad_norm": 1.916794033771568, + "language_loss": 0.79240829, + "learning_rate": 3.458449034273841e-06, + "loss": 0.87035751, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22070312, + "step": 4369, + "time_per_iteration": 2.51906418800354 + }, + { + "auxiliary_loss_clip": 0.06514631, + "auxiliary_loss_mlp": 0.01276005, + "balance_loss_clip": 0.06301987, + "balance_loss_mlp": 0.01256883, + "epoch": 0.262738614159026, + "flos": 21330220187520.0, + "grad_norm": 3.2285566965587873, + "language_loss": 0.83905816, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.91696453, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19116211, + "step": 4370, + "time_per_iteration": 2.562302589416504 + }, + { + "auxiliary_loss_clip": 0.06520583, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06299531, + "balance_loss_mlp": 0.01253972, + "epoch": 0.26279873741169396, + "flos": 17609454034560.0, + "grad_norm": 1.7096089610285066, + "language_loss": 0.71678042, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.79473758, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.21179199, + "step": 4371, + "time_per_iteration": 2.4965152740478516 + }, + { + "auxiliary_loss_clip": 0.06398421, + "auxiliary_loss_mlp": 0.01256739, + "balance_loss_clip": 0.0629326, + "balance_loss_mlp": 0.01252516, + "epoch": 0.2628588606643619, + "flos": 60969139931520.0, + "grad_norm": 0.666639264120038, + "language_loss": 0.56056166, + "learning_rate": 3.457649289346384e-06, + "loss": 0.63711321, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04226685, + "step": 4372, + "time_per_iteration": 3.2867443561553955 + }, + { + "auxiliary_loss_clip": 0.06512036, + "auxiliary_loss_mlp": 0.01277679, + "balance_loss_clip": 0.06298684, + "balance_loss_mlp": 0.01259178, + "epoch": 0.2629189839170299, + "flos": 27023652288000.0, + "grad_norm": 1.5439358769508327, + "language_loss": 0.78190762, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85980475, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18505859, + "step": 4373, + "time_per_iteration": 2.577479362487793 + }, + { + "auxiliary_loss_clip": 0.06510606, + "auxiliary_loss_mlp": 0.01278833, + "balance_loss_clip": 0.06297645, + "balance_loss_mlp": 0.01260427, + "epoch": 0.26297910716969786, + "flos": 17025635911680.0, + "grad_norm": 2.1443132622279664, + "language_loss": 0.723768, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.80166239, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18383789, + "step": 4374, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.06517059, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.0630156, + "balance_loss_mlp": 0.01258505, + "epoch": 0.2630392304223659, + "flos": 24903889297920.0, + "grad_norm": 2.1190930293084933, + "language_loss": 0.81199759, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.88995719, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20373535, + "step": 4375, + "time_per_iteration": 2.591381311416626 + }, + { + "auxiliary_loss_clip": 0.0651055, + "auxiliary_loss_mlp": 0.01275326, + "balance_loss_clip": 0.0629838, + "balance_loss_mlp": 0.01257289, + "epoch": 0.26309935367503384, + "flos": 32862336641280.0, + "grad_norm": 1.9139045559413268, + "language_loss": 0.66626596, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74412477, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18041992, + "step": 4376, + "time_per_iteration": 2.643944025039673 + }, + { + "auxiliary_loss_clip": 0.06515232, + "auxiliary_loss_mlp": 0.01276237, + "balance_loss_clip": 0.06297503, + "balance_loss_mlp": 0.01257485, + "epoch": 0.2631594769277018, + "flos": 15893400816000.0, + "grad_norm": 1.6251454157029055, + "language_loss": 0.70145154, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77936625, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.1875, + "step": 4377, + "time_per_iteration": 2.5593788623809814 + }, + { + "auxiliary_loss_clip": 0.06513406, + "auxiliary_loss_mlp": 0.01274994, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255408, + "epoch": 0.2632196001803698, + "flos": 50816242811520.0, + "grad_norm": 1.6666327452584295, + "language_loss": 0.80235565, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.88023967, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.19592285, + "step": 4378, + "time_per_iteration": 2.794290065765381 + }, + { + "auxiliary_loss_clip": 0.0651051, + "auxiliary_loss_mlp": 0.01272396, + "balance_loss_clip": 0.06297652, + "balance_loss_mlp": 0.0125492, + "epoch": 0.26327972343303774, + "flos": 13737733551360.0, + "grad_norm": 2.7188396998417548, + "language_loss": 0.77230549, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17480469, + "step": 4379, + "time_per_iteration": 2.542442560195923 + }, + { + "auxiliary_loss_clip": 0.06519607, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06299821, + "balance_loss_mlp": 0.01255084, + "epoch": 0.2633398466857057, + "flos": 23775846906240.0, + "grad_norm": 1.9724368576120554, + "language_loss": 0.78418016, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.86212587, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.19897461, + "step": 4380, + "time_per_iteration": 2.529573440551758 + }, + { + "auxiliary_loss_clip": 0.06516172, + "auxiliary_loss_mlp": 0.012759, + "balance_loss_clip": 0.06296928, + "balance_loss_mlp": 0.01257518, + "epoch": 0.26339996993837367, + "flos": 27607680046080.0, + "grad_norm": 1.9046534185934374, + "language_loss": 0.6460917, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.72401243, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.18383789, + "step": 4381, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.06511073, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06295128, + "balance_loss_mlp": 0.01255394, + "epoch": 0.26346009319104163, + "flos": 16951982572800.0, + "grad_norm": 1.8115834165165374, + "language_loss": 0.8293367, + "learning_rate": 3.454979881632595e-06, + "loss": 0.90718591, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.18444824, + "step": 4382, + "time_per_iteration": 2.503119945526123 + }, + { + "auxiliary_loss_clip": 0.06526808, + "auxiliary_loss_mlp": 0.01282548, + "balance_loss_clip": 0.06304507, + "balance_loss_mlp": 0.0126196, + "epoch": 0.2635202164437096, + "flos": 37241245088640.0, + "grad_norm": 2.8611377763647363, + "language_loss": 0.70728219, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78537577, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.20581055, + "step": 4383, + "time_per_iteration": 2.7256851196289062 + }, + { + "auxiliary_loss_clip": 0.06511825, + "auxiliary_loss_mlp": 0.01278143, + "balance_loss_clip": 0.0629648, + "balance_loss_mlp": 0.01260214, + "epoch": 0.26358033969637756, + "flos": 21002721304320.0, + "grad_norm": 1.8636489890531567, + "language_loss": 0.69725919, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.77515888, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 4384, + "time_per_iteration": 2.526306629180908 + }, + { + "auxiliary_loss_clip": 0.06514609, + "auxiliary_loss_mlp": 0.01274952, + "balance_loss_clip": 0.06301568, + "balance_loss_mlp": 0.01256355, + "epoch": 0.26364046294904553, + "flos": 27753561204480.0, + "grad_norm": 2.704228439938978, + "language_loss": 0.70769042, + "learning_rate": 3.45417798298451e-06, + "loss": 0.785586, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 4385, + "time_per_iteration": 2.6091294288635254 + }, + { + "auxiliary_loss_clip": 0.06510788, + "auxiliary_loss_mlp": 0.01275036, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01255903, + "epoch": 0.2637005862017135, + "flos": 22899679488000.0, + "grad_norm": 1.8400483569046413, + "language_loss": 0.85200071, + "learning_rate": 3.453910573136482e-06, + "loss": 0.92985892, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19116211, + "step": 4386, + "time_per_iteration": 2.5284476280212402 + }, + { + "auxiliary_loss_clip": 0.06516191, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 0.06302508, + "balance_loss_mlp": 0.01255759, + "epoch": 0.26376070945438146, + "flos": 15054143921280.0, + "grad_norm": 1.9881194524454247, + "language_loss": 0.77597183, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.85388696, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19567871, + "step": 4387, + "time_per_iteration": 2.522135019302368 + }, + { + "auxiliary_loss_clip": 0.0651316, + "auxiliary_loss_mlp": 0.01278261, + "balance_loss_clip": 0.06301039, + "balance_loss_mlp": 0.01259378, + "epoch": 0.2638208327070494, + "flos": 21148141265280.0, + "grad_norm": 2.1303107819849316, + "language_loss": 0.76193964, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83985388, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1887207, + "step": 4388, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.06516623, + "auxiliary_loss_mlp": 0.01271478, + "balance_loss_clip": 0.06302176, + "balance_loss_mlp": 0.01253681, + "epoch": 0.26388095595971744, + "flos": 21732001315200.0, + "grad_norm": 2.125202232596161, + "language_loss": 0.86967361, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94755471, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.17785645, + "step": 4389, + "time_per_iteration": 2.570643901824951 + }, + { + "auxiliary_loss_clip": 0.06416489, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.0630957, + "balance_loss_mlp": 0.01263464, + "epoch": 0.2639410792123854, + "flos": 65536542138240.0, + "grad_norm": 0.8199197454978128, + "language_loss": 0.60138249, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6782288, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04666138, + "step": 4390, + "time_per_iteration": 3.174226999282837 + }, + { + "auxiliary_loss_clip": 0.06524064, + "auxiliary_loss_mlp": 0.01274153, + "balance_loss_clip": 0.06302064, + "balance_loss_mlp": 0.01255008, + "epoch": 0.2640012024650534, + "flos": 23954907081600.0, + "grad_norm": 1.739207981028, + "language_loss": 0.77995527, + "learning_rate": 3.4525726971127e-06, + "loss": 0.85793746, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.19152832, + "step": 4391, + "time_per_iteration": 2.5869362354278564 + }, + { + "auxiliary_loss_clip": 0.06415629, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06309642, + "balance_loss_mlp": 0.0126082, + "epoch": 0.26406132571772134, + "flos": 56462420880000.0, + "grad_norm": 0.8885893091984226, + "language_loss": 0.58835375, + "learning_rate": 3.45230495662224e-06, + "loss": 0.66516447, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04620361, + "step": 4392, + "time_per_iteration": 3.1856343746185303 + }, + { + "auxiliary_loss_clip": 0.0652501, + "auxiliary_loss_mlp": 0.0127481, + "balance_loss_clip": 0.06303259, + "balance_loss_mlp": 0.01256631, + "epoch": 0.2641214489703893, + "flos": 22097039627520.0, + "grad_norm": 1.7095674260711007, + "language_loss": 0.69284153, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77083969, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.1817627, + "step": 4393, + "time_per_iteration": 2.5519895553588867 + }, + { + "auxiliary_loss_clip": 0.06526117, + "auxiliary_loss_mlp": 0.01277548, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01255959, + "epoch": 0.26418157222305727, + "flos": 16550327226240.0, + "grad_norm": 2.304177456685855, + "language_loss": 0.84805501, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.92609167, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.21594238, + "step": 4394, + "time_per_iteration": 2.5253031253814697 + }, + { + "auxiliary_loss_clip": 0.06528334, + "auxiliary_loss_mlp": 0.01280976, + "balance_loss_clip": 0.06304967, + "balance_loss_mlp": 0.01260066, + "epoch": 0.26424169547572524, + "flos": 18008006780160.0, + "grad_norm": 1.9555526734650441, + "language_loss": 0.70342916, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.78152227, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.20910645, + "step": 4395, + "time_per_iteration": 2.5117664337158203 + }, + { + "auxiliary_loss_clip": 0.06512758, + "auxiliary_loss_mlp": 0.01272399, + "balance_loss_clip": 0.06300145, + "balance_loss_mlp": 0.01253171, + "epoch": 0.2643018187283932, + "flos": 16988893096320.0, + "grad_norm": 1.791387622967983, + "language_loss": 0.87312353, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19238281, + "step": 4396, + "time_per_iteration": 2.566774368286133 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01257464, + "balance_loss_clip": 0.06300922, + "balance_loss_mlp": 0.01252997, + "epoch": 0.26436194198106117, + "flos": 59682135144960.0, + "grad_norm": 0.7723405564107855, + "language_loss": 0.54990101, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.62652469, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04473877, + "step": 4397, + "time_per_iteration": 4.373678684234619 + }, + { + "auxiliary_loss_clip": 0.06510547, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 0.06297219, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26442206523372913, + "flos": 32928694675200.0, + "grad_norm": 2.4292177107300224, + "language_loss": 0.78606653, + "learning_rate": 3.450697357532435e-06, + "loss": 0.86391467, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1862793, + "step": 4398, + "time_per_iteration": 2.6890292167663574 + }, + { + "auxiliary_loss_clip": 0.06511252, + "auxiliary_loss_mlp": 0.01279415, + "balance_loss_clip": 0.06294377, + "balance_loss_mlp": 0.01259244, + "epoch": 0.2644821884863971, + "flos": 21037409694720.0, + "grad_norm": 1.6698754866149341, + "language_loss": 0.67733896, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.75524557, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20178223, + "step": 4399, + "time_per_iteration": 2.5403761863708496 + }, + { + "auxiliary_loss_clip": 0.06507229, + "auxiliary_loss_mlp": 0.01274507, + "balance_loss_clip": 0.06301808, + "balance_loss_mlp": 0.01256841, + "epoch": 0.26454231173906506, + "flos": 20783019098880.0, + "grad_norm": 1.5093240378212085, + "language_loss": 0.8695311, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.94734848, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.17675781, + "step": 4400, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.06514899, + "auxiliary_loss_mlp": 0.01275157, + "balance_loss_clip": 0.06298938, + "balance_loss_mlp": 0.01255404, + "epoch": 0.264602434991733, + "flos": 16624399835520.0, + "grad_norm": 2.9592381962347076, + "language_loss": 0.77008456, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.84798515, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19750977, + "step": 4401, + "time_per_iteration": 4.000045537948608 + }, + { + "auxiliary_loss_clip": 0.06515318, + "auxiliary_loss_mlp": 0.01277892, + "balance_loss_clip": 0.0629567, + "balance_loss_mlp": 0.01257149, + "epoch": 0.26466255824440105, + "flos": 19068726816000.0, + "grad_norm": 1.7667226788610035, + "language_loss": 0.88791883, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96585095, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.20739746, + "step": 4402, + "time_per_iteration": 2.504951000213623 + }, + { + "auxiliary_loss_clip": 0.06514971, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 0.06299384, + "balance_loss_mlp": 0.01261203, + "epoch": 0.264722681497069, + "flos": 22645246965120.0, + "grad_norm": 2.1016866817380944, + "language_loss": 0.78604829, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.86399865, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18884277, + "step": 4403, + "time_per_iteration": 3.9830996990203857 + }, + { + "auxiliary_loss_clip": 0.06513863, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 0.0629956, + "balance_loss_mlp": 0.01254322, + "epoch": 0.264782804749737, + "flos": 22498862682240.0, + "grad_norm": 2.2718142403423887, + "language_loss": 0.88776851, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.96563816, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18774414, + "step": 4404, + "time_per_iteration": 2.5655670166015625 + }, + { + "auxiliary_loss_clip": 0.06512003, + "auxiliary_loss_mlp": 0.01272083, + "balance_loss_clip": 0.06294957, + "balance_loss_mlp": 0.01253666, + "epoch": 0.26484292800240494, + "flos": 16805891779200.0, + "grad_norm": 1.6853243703943699, + "language_loss": 0.77144921, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84929001, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.18408203, + "step": 4405, + "time_per_iteration": 2.5151660442352295 + }, + { + "auxiliary_loss_clip": 0.06518488, + "auxiliary_loss_mlp": 0.01280263, + "balance_loss_clip": 0.06303011, + "balance_loss_mlp": 0.0126113, + "epoch": 0.2649030512550729, + "flos": 20455939486080.0, + "grad_norm": 1.6552463254663874, + "language_loss": 0.70570582, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78369337, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19152832, + "step": 4406, + "time_per_iteration": 2.5817081928253174 + }, + { + "auxiliary_loss_clip": 0.06515051, + "auxiliary_loss_mlp": 0.01282775, + "balance_loss_clip": 0.06304015, + "balance_loss_mlp": 0.01264071, + "epoch": 0.2649631745077409, + "flos": 22422190596480.0, + "grad_norm": 1.6043271976664373, + "language_loss": 0.84213567, + "learning_rate": 3.448282246369912e-06, + "loss": 0.92011392, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18701172, + "step": 4407, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.06506669, + "auxiliary_loss_mlp": 0.01274017, + "balance_loss_clip": 0.06294346, + "balance_loss_mlp": 0.01255384, + "epoch": 0.26502329776040884, + "flos": 35124794334720.0, + "grad_norm": 1.8863485028384246, + "language_loss": 0.76080608, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83861291, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18615723, + "step": 4408, + "time_per_iteration": 4.144388675689697 + }, + { + "auxiliary_loss_clip": 0.06504838, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 0.06293095, + "balance_loss_mlp": 0.01259765, + "epoch": 0.2650834210130768, + "flos": 38696073603840.0, + "grad_norm": 1.6572856868324277, + "language_loss": 0.71237993, + "learning_rate": 3.447744950630084e-06, + "loss": 0.79021394, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18786621, + "step": 4409, + "time_per_iteration": 2.6830790042877197 + }, + { + "auxiliary_loss_clip": 0.06513892, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06296389, + "balance_loss_mlp": 0.01258513, + "epoch": 0.26514354426574477, + "flos": 24723655165440.0, + "grad_norm": 1.9985850932403133, + "language_loss": 0.74335337, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.82127184, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.19445801, + "step": 4410, + "time_per_iteration": 2.5640783309936523 + }, + { + "auxiliary_loss_clip": 0.06510055, + "auxiliary_loss_mlp": 0.01275315, + "balance_loss_clip": 0.06293881, + "balance_loss_mlp": 0.01256873, + "epoch": 0.26520366751841273, + "flos": 20346381872640.0, + "grad_norm": 1.7362440314024254, + "language_loss": 0.74604267, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.82389635, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18457031, + "step": 4411, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.06503807, + "auxiliary_loss_mlp": 0.0127974, + "balance_loss_clip": 0.06292095, + "balance_loss_mlp": 0.01260941, + "epoch": 0.2652637907710807, + "flos": 22350046631040.0, + "grad_norm": 1.9068391403977176, + "language_loss": 0.83043784, + "learning_rate": 3.446938595306071e-06, + "loss": 0.90827328, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18798828, + "step": 4412, + "time_per_iteration": 2.570462942123413 + }, + { + "auxiliary_loss_clip": 0.06509882, + "auxiliary_loss_mlp": 0.01280008, + "balance_loss_clip": 0.0629638, + "balance_loss_mlp": 0.01260327, + "epoch": 0.26532391402374866, + "flos": 19360279497600.0, + "grad_norm": 1.6015505507863077, + "language_loss": 0.75010121, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.82800013, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19677734, + "step": 4413, + "time_per_iteration": 2.5575060844421387 + }, + { + "auxiliary_loss_clip": 0.06392879, + "auxiliary_loss_mlp": 0.01259819, + "balance_loss_clip": 0.06288524, + "balance_loss_mlp": 0.01255307, + "epoch": 0.26538403727641663, + "flos": 44804479121280.0, + "grad_norm": 0.9088609657061584, + "language_loss": 0.57055008, + "learning_rate": 3.446400750732793e-06, + "loss": 0.64707708, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04522705, + "step": 4414, + "time_per_iteration": 3.090242624282837 + }, + { + "auxiliary_loss_clip": 0.06501576, + "auxiliary_loss_mlp": 0.01278206, + "balance_loss_clip": 0.06294522, + "balance_loss_mlp": 0.01260587, + "epoch": 0.26544416052908465, + "flos": 28189359889920.0, + "grad_norm": 1.5322949912702364, + "language_loss": 0.74997067, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.82776845, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17626953, + "step": 4415, + "time_per_iteration": 2.6143665313720703 + }, + { + "auxiliary_loss_clip": 0.06505995, + "auxiliary_loss_mlp": 0.01278176, + "balance_loss_clip": 0.06289595, + "balance_loss_mlp": 0.0125791, + "epoch": 0.2655042837817526, + "flos": 17570824502400.0, + "grad_norm": 4.108925661978825, + "language_loss": 0.87716872, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95501041, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20263672, + "step": 4416, + "time_per_iteration": 2.4974279403686523 + }, + { + "auxiliary_loss_clip": 0.06510112, + "auxiliary_loss_mlp": 0.01280216, + "balance_loss_clip": 0.0629703, + "balance_loss_mlp": 0.0126094, + "epoch": 0.2655644070344206, + "flos": 23411437499520.0, + "grad_norm": 1.4955026126411677, + "language_loss": 0.77089638, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84879971, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19274902, + "step": 4417, + "time_per_iteration": 2.576826572418213 + }, + { + "auxiliary_loss_clip": 0.0650158, + "auxiliary_loss_mlp": 0.01274734, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.01255946, + "epoch": 0.26562453028708854, + "flos": 26475612658560.0, + "grad_norm": 1.3751463134954343, + "language_loss": 0.80062425, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87838733, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 4418, + "time_per_iteration": 2.573490619659424 + }, + { + "auxiliary_loss_clip": 0.06510676, + "auxiliary_loss_mlp": 0.01274316, + "balance_loss_clip": 0.06295326, + "balance_loss_mlp": 0.01254945, + "epoch": 0.2656846535397565, + "flos": 19213475944320.0, + "grad_norm": 2.092556142181657, + "language_loss": 0.67613918, + "learning_rate": 3.445055179644071e-06, + "loss": 0.7539891, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19372559, + "step": 4419, + "time_per_iteration": 2.5705552101135254 + }, + { + "auxiliary_loss_clip": 0.06507199, + "auxiliary_loss_mlp": 0.01281966, + "balance_loss_clip": 0.06293494, + "balance_loss_mlp": 0.01262153, + "epoch": 0.2657447767924245, + "flos": 30558566085120.0, + "grad_norm": 1.8356097714997412, + "language_loss": 0.79905182, + "learning_rate": 3.444785900995585e-06, + "loss": 0.87694353, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19799805, + "step": 4420, + "time_per_iteration": 2.5966663360595703 + }, + { + "auxiliary_loss_clip": 0.06514539, + "auxiliary_loss_mlp": 0.01276693, + "balance_loss_clip": 0.06294198, + "balance_loss_mlp": 0.01256367, + "epoch": 0.26580490004509244, + "flos": 20928984111360.0, + "grad_norm": 2.015825119850129, + "language_loss": 0.81966692, + "learning_rate": 3.444516567560673e-06, + "loss": 0.89757919, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.20324707, + "step": 4421, + "time_per_iteration": 2.5285565853118896 + }, + { + "auxiliary_loss_clip": 0.06503608, + "auxiliary_loss_mlp": 0.01277509, + "balance_loss_clip": 0.06293386, + "balance_loss_mlp": 0.01259341, + "epoch": 0.2658650232977604, + "flos": 43955845297920.0, + "grad_norm": 1.6494646012937118, + "language_loss": 0.66448712, + "learning_rate": 3.444247179349548e-06, + "loss": 0.74229831, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1817627, + "step": 4422, + "time_per_iteration": 2.715272903442383 + }, + { + "auxiliary_loss_clip": 0.0650918, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 0.06296968, + "balance_loss_mlp": 0.01257011, + "epoch": 0.26592514655042837, + "flos": 29724256581120.0, + "grad_norm": 6.571308072686312, + "language_loss": 0.75332773, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18029785, + "step": 4423, + "time_per_iteration": 2.5891942977905273 + }, + { + "auxiliary_loss_clip": 0.06514621, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 0.06297594, + "balance_loss_mlp": 0.01257619, + "epoch": 0.26598526980309634, + "flos": 46687616110080.0, + "grad_norm": 1.5716819541281883, + "language_loss": 0.78054529, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85846502, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19726562, + "step": 4424, + "time_per_iteration": 2.731308698654175 + }, + { + "auxiliary_loss_clip": 0.06513417, + "auxiliary_loss_mlp": 0.01282972, + "balance_loss_clip": 0.06298374, + "balance_loss_mlp": 0.01263147, + "epoch": 0.2660453930557643, + "flos": 11514115025280.0, + "grad_norm": 1.8953438163908696, + "language_loss": 0.7980895, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.87605333, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19824219, + "step": 4425, + "time_per_iteration": 2.536639928817749 + }, + { + "auxiliary_loss_clip": 0.0650531, + "auxiliary_loss_mlp": 0.01275945, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01257837, + "epoch": 0.26610551630843227, + "flos": 24798692096640.0, + "grad_norm": 1.624984400061838, + "language_loss": 0.81150436, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.88931698, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4426, + "time_per_iteration": 2.55570912361145 + }, + { + "auxiliary_loss_clip": 0.06512492, + "auxiliary_loss_mlp": 0.01281328, + "balance_loss_clip": 0.06298596, + "balance_loss_mlp": 0.01262302, + "epoch": 0.26616563956110023, + "flos": 27643793955840.0, + "grad_norm": 1.6446869519549492, + "language_loss": 0.77695107, + "learning_rate": 3.442899417008333e-06, + "loss": 0.85488927, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19042969, + "step": 4427, + "time_per_iteration": 2.609236001968384 + }, + { + "auxiliary_loss_clip": 0.06512281, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06306126, + "balance_loss_mlp": 0.01257588, + "epoch": 0.26622576281376825, + "flos": 28369887511680.0, + "grad_norm": 1.5754757805335664, + "language_loss": 0.77615106, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.85402417, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17443848, + "step": 4428, + "time_per_iteration": 2.5886542797088623 + }, + { + "auxiliary_loss_clip": 0.06507164, + "auxiliary_loss_mlp": 0.01273818, + "balance_loss_clip": 0.06292614, + "balance_loss_mlp": 0.0125627, + "epoch": 0.2662858860664362, + "flos": 18047265217920.0, + "grad_norm": 1.9210496781424948, + "language_loss": 0.83184117, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.90965092, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.17541504, + "step": 4429, + "time_per_iteration": 2.5387768745422363 + }, + { + "auxiliary_loss_clip": 0.06512052, + "auxiliary_loss_mlp": 0.01276801, + "balance_loss_clip": 0.06301999, + "balance_loss_mlp": 0.01256762, + "epoch": 0.2663460093191042, + "flos": 22752163175040.0, + "grad_norm": 1.799497911690532, + "language_loss": 0.73120302, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80909157, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.20043945, + "step": 4430, + "time_per_iteration": 2.6026084423065186 + }, + { + "auxiliary_loss_clip": 0.06508531, + "auxiliary_loss_mlp": 0.0127429, + "balance_loss_clip": 0.06296858, + "balance_loss_mlp": 0.012548, + "epoch": 0.26640613257177215, + "flos": 16514422951680.0, + "grad_norm": 2.040164300856009, + "language_loss": 0.83262235, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91045058, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19482422, + "step": 4431, + "time_per_iteration": 2.5464959144592285 + }, + { + "auxiliary_loss_clip": 0.0651544, + "auxiliary_loss_mlp": 0.01281122, + "balance_loss_clip": 0.06296271, + "balance_loss_mlp": 0.01261488, + "epoch": 0.2664662558244401, + "flos": 23082638878080.0, + "grad_norm": 2.4012085548553537, + "language_loss": 0.76319212, + "learning_rate": 3.44155028679496e-06, + "loss": 0.84115773, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19641113, + "step": 4432, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.06513382, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 0.0629918, + "balance_loss_mlp": 0.01259011, + "epoch": 0.2665263790771081, + "flos": 23776098468480.0, + "grad_norm": 1.7645797084145118, + "language_loss": 0.8352288, + "learning_rate": 3.441280296720154e-06, + "loss": 0.91315603, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.20324707, + "step": 4433, + "time_per_iteration": 2.5431323051452637 + }, + { + "auxiliary_loss_clip": 0.06506403, + "auxiliary_loss_mlp": 0.01279917, + "balance_loss_clip": 0.06294529, + "balance_loss_mlp": 0.01260248, + "epoch": 0.26658650232977604, + "flos": 28008748414080.0, + "grad_norm": 2.0130085710694097, + "language_loss": 0.77006185, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84792507, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.19677734, + "step": 4434, + "time_per_iteration": 2.626286268234253 + }, + { + "auxiliary_loss_clip": 0.06505096, + "auxiliary_loss_mlp": 0.01274565, + "balance_loss_clip": 0.06296869, + "balance_loss_mlp": 0.01255563, + "epoch": 0.266646625582444, + "flos": 22170147914880.0, + "grad_norm": 1.9216331890087734, + "language_loss": 0.82643783, + "learning_rate": 3.440740152620301e-06, + "loss": 0.90423441, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.18994141, + "step": 4435, + "time_per_iteration": 2.519731283187866 + }, + { + "auxiliary_loss_clip": 0.06515168, + "auxiliary_loss_mlp": 0.01287569, + "balance_loss_clip": 0.06296054, + "balance_loss_mlp": 0.01267065, + "epoch": 0.266706748835112, + "flos": 27860687049600.0, + "grad_norm": 2.5550616111147257, + "language_loss": 0.88173652, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95976388, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2052002, + "step": 4436, + "time_per_iteration": 2.5790481567382812 + }, + { + "auxiliary_loss_clip": 0.0650726, + "auxiliary_loss_mlp": 0.01276794, + "balance_loss_clip": 0.0629128, + "balance_loss_mlp": 0.01258507, + "epoch": 0.26676687208777994, + "flos": 25819231299840.0, + "grad_norm": 5.920609689832761, + "language_loss": 0.79025435, + "learning_rate": 3.440199789988407e-06, + "loss": 0.86809486, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1829834, + "step": 4437, + "time_per_iteration": 3.9761762619018555 + }, + { + "auxiliary_loss_clip": 0.06508271, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06295269, + "balance_loss_mlp": 0.01256065, + "epoch": 0.2668269953404479, + "flos": 36073399207680.0, + "grad_norm": 3.5501154130665333, + "language_loss": 0.64866304, + "learning_rate": 3.439929526748556e-06, + "loss": 0.72648954, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18322754, + "step": 4438, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.0650841, + "auxiliary_loss_mlp": 0.01282243, + "balance_loss_clip": 0.0629243, + "balance_loss_mlp": 0.01263015, + "epoch": 0.26688711859311587, + "flos": 26576994499200.0, + "grad_norm": 1.9779853569110368, + "language_loss": 0.76120412, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83911061, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.1920166, + "step": 4439, + "time_per_iteration": 2.5468099117279053 + }, + { + "auxiliary_loss_clip": 0.06509372, + "auxiliary_loss_mlp": 0.01279302, + "balance_loss_clip": 0.06293344, + "balance_loss_mlp": 0.01259156, + "epoch": 0.26694724184578383, + "flos": 26768968202880.0, + "grad_norm": 1.7452542153948158, + "language_loss": 0.71747917, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.79536593, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20141602, + "step": 4440, + "time_per_iteration": 2.5845727920532227 + }, + { + "auxiliary_loss_clip": 0.06513558, + "auxiliary_loss_mlp": 0.01278841, + "balance_loss_clip": 0.06297302, + "balance_loss_mlp": 0.01258003, + "epoch": 0.2670073650984518, + "flos": 20965894634880.0, + "grad_norm": 2.018310090260772, + "language_loss": 0.67180222, + "learning_rate": 3.439118409456376e-06, + "loss": 0.74972624, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.20837402, + "step": 4441, + "time_per_iteration": 4.018662691116333 + }, + { + "auxiliary_loss_clip": 0.06511593, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06295494, + "balance_loss_mlp": 0.01260692, + "epoch": 0.2670674883511198, + "flos": 28373577091200.0, + "grad_norm": 1.7028334543675463, + "language_loss": 0.77360296, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.8515327, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20690918, + "step": 4442, + "time_per_iteration": 2.613529682159424 + }, + { + "auxiliary_loss_clip": 0.06397913, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06295023, + "balance_loss_mlp": 0.01259818, + "epoch": 0.2671276116037878, + "flos": 58989010970880.0, + "grad_norm": 0.9159689493293411, + "language_loss": 0.61561328, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.6922372, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04653931, + "step": 4443, + "time_per_iteration": 4.460381031036377 + }, + { + "auxiliary_loss_clip": 0.06510781, + "auxiliary_loss_mlp": 0.01278926, + "balance_loss_clip": 0.06294855, + "balance_loss_mlp": 0.0126021, + "epoch": 0.26718773485645575, + "flos": 43955132538240.0, + "grad_norm": 8.593795125602613, + "language_loss": 0.76795793, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.845855, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.18725586, + "step": 4444, + "time_per_iteration": 2.7442104816436768 + }, + { + "auxiliary_loss_clip": 0.06512623, + "auxiliary_loss_mlp": 0.0127732, + "balance_loss_clip": 0.06297334, + "balance_loss_mlp": 0.01256255, + "epoch": 0.2672478581091237, + "flos": 25235329322880.0, + "grad_norm": 2.0392997213265867, + "language_loss": 0.81111336, + "learning_rate": 3.438036155780158e-06, + "loss": 0.88901269, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21057129, + "step": 4445, + "time_per_iteration": 2.5493359565734863 + }, + { + "auxiliary_loss_clip": 0.06511448, + "auxiliary_loss_mlp": 0.01275318, + "balance_loss_clip": 0.0629541, + "balance_loss_mlp": 0.01256054, + "epoch": 0.2673079813617917, + "flos": 15273594564480.0, + "grad_norm": 1.8279407549944744, + "language_loss": 0.89906365, + "learning_rate": 3.43776545600926e-06, + "loss": 0.97693127, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19262695, + "step": 4446, + "time_per_iteration": 2.536916971206665 + }, + { + "auxiliary_loss_clip": 0.06512347, + "auxiliary_loss_mlp": 0.01275408, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256894, + "epoch": 0.26736810461445965, + "flos": 25819944059520.0, + "grad_norm": 1.8969857257431861, + "language_loss": 0.68977708, + "learning_rate": 3.437494701718153e-06, + "loss": 0.76765466, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18518066, + "step": 4447, + "time_per_iteration": 4.071701526641846 + }, + { + "auxiliary_loss_clip": 0.06511723, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 0.06295793, + "balance_loss_mlp": 0.01259116, + "epoch": 0.2674282278671276, + "flos": 24318981072000.0, + "grad_norm": 1.8615578685879888, + "language_loss": 0.83522677, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.91313618, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2010498, + "step": 4448, + "time_per_iteration": 2.581207036972046 + }, + { + "auxiliary_loss_clip": 0.06506026, + "auxiliary_loss_mlp": 0.0127612, + "balance_loss_clip": 0.06295379, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2674883511197956, + "flos": 22821330320640.0, + "grad_norm": 1.5806903023960923, + "language_loss": 0.84385109, + "learning_rate": 3.436953029616378e-06, + "loss": 0.92167258, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19262695, + "step": 4449, + "time_per_iteration": 2.556368827819824 + }, + { + "auxiliary_loss_clip": 0.06523807, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 0.06298804, + "balance_loss_mlp": 0.01256679, + "epoch": 0.26754847437246354, + "flos": 25376514652800.0, + "grad_norm": 2.5106466446094275, + "language_loss": 0.84170121, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.91972435, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 4450, + "time_per_iteration": 2.540792465209961 + }, + { + "auxiliary_loss_clip": 0.06503032, + "auxiliary_loss_mlp": 0.01274274, + "balance_loss_clip": 0.06293193, + "balance_loss_mlp": 0.01255248, + "epoch": 0.2676085976251315, + "flos": 20236698478080.0, + "grad_norm": 1.7838817445044992, + "language_loss": 0.81239712, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8901701, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19042969, + "step": 4451, + "time_per_iteration": 2.552764892578125 + }, + { + "auxiliary_loss_clip": 0.06515267, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06304526, + "balance_loss_mlp": 0.01257324, + "epoch": 0.26766872087779947, + "flos": 28045784718720.0, + "grad_norm": 1.859886698365648, + "language_loss": 0.87156057, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94947314, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 4452, + "time_per_iteration": 2.580838918685913 + }, + { + "auxiliary_loss_clip": 0.06515863, + "auxiliary_loss_mlp": 0.01278142, + "balance_loss_clip": 0.06301846, + "balance_loss_mlp": 0.01258377, + "epoch": 0.26772884413046744, + "flos": 18329803585920.0, + "grad_norm": 2.0572254627861577, + "language_loss": 0.84003425, + "learning_rate": 3.435869031622194e-06, + "loss": 0.91797435, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.19775391, + "step": 4453, + "time_per_iteration": 2.5120368003845215 + }, + { + "auxiliary_loss_clip": 0.06513035, + "auxiliary_loss_mlp": 0.01281566, + "balance_loss_clip": 0.06298169, + "balance_loss_mlp": 0.01261992, + "epoch": 0.2677889673831354, + "flos": 22134075932160.0, + "grad_norm": 1.66096029715733, + "language_loss": 0.79950684, + "learning_rate": 3.435597895977208e-06, + "loss": 0.87745285, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19580078, + "step": 4454, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.06518991, + "auxiliary_loss_mlp": 0.0127963, + "balance_loss_clip": 0.0630191, + "balance_loss_mlp": 0.01259949, + "epoch": 0.2678490906358034, + "flos": 23736001489920.0, + "grad_norm": 1.4726826789128313, + "language_loss": 0.72626883, + "learning_rate": 3.435326705894206e-06, + "loss": 0.80425501, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.19689941, + "step": 4455, + "time_per_iteration": 2.600341558456421 + }, + { + "auxiliary_loss_clip": 0.0650526, + "auxiliary_loss_mlp": 0.01280807, + "balance_loss_clip": 0.06295176, + "balance_loss_mlp": 0.01262675, + "epoch": 0.2679092138884714, + "flos": 21769414963200.0, + "grad_norm": 1.6724393178855028, + "language_loss": 0.74066579, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81852639, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18139648, + "step": 4456, + "time_per_iteration": 2.5469894409179688 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01278452, + "balance_loss_clip": 0.06300029, + "balance_loss_mlp": 0.01258127, + "epoch": 0.26796933714113935, + "flos": 19866670848000.0, + "grad_norm": 2.417277333537857, + "language_loss": 0.71260488, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.79059041, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.20324707, + "step": 4457, + "time_per_iteration": 2.592397451400757 + }, + { + "auxiliary_loss_clip": 0.06517951, + "auxiliary_loss_mlp": 0.01279854, + "balance_loss_clip": 0.06301091, + "balance_loss_mlp": 0.01259183, + "epoch": 0.2680294603938073, + "flos": 20054116431360.0, + "grad_norm": 2.0107664890053143, + "language_loss": 0.79466271, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87264079, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20666504, + "step": 4458, + "time_per_iteration": 2.5134661197662354 + }, + { + "auxiliary_loss_clip": 0.06383923, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06281242, + "balance_loss_mlp": 0.01258718, + "epoch": 0.2680895836464753, + "flos": 72134918334720.0, + "grad_norm": 0.8734266993254428, + "language_loss": 0.5870322, + "learning_rate": 3.434241401387739e-06, + "loss": 0.66350281, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.04437256, + "step": 4459, + "time_per_iteration": 3.2277050018310547 + }, + { + "auxiliary_loss_clip": 0.06506394, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 0.06292672, + "balance_loss_mlp": 0.01255633, + "epoch": 0.26814970689914325, + "flos": 20455310580480.0, + "grad_norm": 1.8403982609946155, + "language_loss": 0.85477257, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.93258202, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.18920898, + "step": 4460, + "time_per_iteration": 2.513317346572876 + }, + { + "auxiliary_loss_clip": 0.06504844, + "auxiliary_loss_mlp": 0.01281285, + "balance_loss_clip": 0.06292892, + "balance_loss_mlp": 0.01261866, + "epoch": 0.2682098301518112, + "flos": 17572459656960.0, + "grad_norm": 1.8133404743184358, + "language_loss": 0.69389015, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 4461, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.06506921, + "auxiliary_loss_mlp": 0.01281085, + "balance_loss_clip": 0.06293105, + "balance_loss_mlp": 0.01260152, + "epoch": 0.2682699534044792, + "flos": 18339237169920.0, + "grad_norm": 1.6584506269914416, + "language_loss": 0.67031932, + "learning_rate": 3.43342685191282e-06, + "loss": 0.74819934, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.20935059, + "step": 4462, + "time_per_iteration": 2.5427775382995605 + }, + { + "auxiliary_loss_clip": 0.06508102, + "auxiliary_loss_mlp": 0.01282385, + "balance_loss_clip": 0.0629629, + "balance_loss_mlp": 0.01263287, + "epoch": 0.26833007665714714, + "flos": 25308311829120.0, + "grad_norm": 1.7808644454945033, + "language_loss": 0.69747704, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.77538192, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19116211, + "step": 4463, + "time_per_iteration": 2.6194493770599365 + }, + { + "auxiliary_loss_clip": 0.06508362, + "auxiliary_loss_mlp": 0.01280959, + "balance_loss_clip": 0.06291216, + "balance_loss_mlp": 0.0126092, + "epoch": 0.2683901999098151, + "flos": 16104046780800.0, + "grad_norm": 2.9245690778148465, + "language_loss": 0.78600121, + "learning_rate": 3.432883547133931e-06, + "loss": 0.86389446, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.20056152, + "step": 4464, + "time_per_iteration": 2.463418483734131 + }, + { + "auxiliary_loss_clip": 0.06508331, + "auxiliary_loss_mlp": 0.01281824, + "balance_loss_clip": 0.06294504, + "balance_loss_mlp": 0.01262154, + "epoch": 0.2684503231624831, + "flos": 27315414604800.0, + "grad_norm": 1.7531136867378412, + "language_loss": 0.71091688, + "learning_rate": 3.432611813236704e-06, + "loss": 0.78881842, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19665527, + "step": 4465, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.06379254, + "auxiliary_loss_mlp": 0.01259677, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01255094, + "epoch": 0.26851044641515104, + "flos": 71879060292480.0, + "grad_norm": 0.6551429372657154, + "language_loss": 0.52683848, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.60322779, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.04577637, + "step": 4466, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.06507096, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_clip": 0.06291512, + "balance_loss_mlp": 0.01263105, + "epoch": 0.268570569667819, + "flos": 18739676632320.0, + "grad_norm": 10.994589827837663, + "language_loss": 0.74195564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81986099, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.20324707, + "step": 4467, + "time_per_iteration": 2.4971463680267334 + }, + { + "auxiliary_loss_clip": 0.06517448, + "auxiliary_loss_mlp": 0.01283031, + "balance_loss_clip": 0.06297839, + "balance_loss_mlp": 0.01264005, + "epoch": 0.268630692920487, + "flos": 18182832324480.0, + "grad_norm": 2.2391086352503504, + "language_loss": 0.81577581, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.89378059, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.19042969, + "step": 4468, + "time_per_iteration": 2.547626256942749 + }, + { + "auxiliary_loss_clip": 0.06377872, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06275174, + "balance_loss_mlp": 0.01259552, + "epoch": 0.268690816173155, + "flos": 68754229176960.0, + "grad_norm": 0.8279608156690638, + "language_loss": 0.59413958, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.67056012, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.0461731, + "step": 4469, + "time_per_iteration": 3.2565419673919678 + }, + { + "auxiliary_loss_clip": 0.06507242, + "auxiliary_loss_mlp": 0.01284548, + "balance_loss_clip": 0.06292132, + "balance_loss_mlp": 0.01263304, + "epoch": 0.26875093942582295, + "flos": 23300160877440.0, + "grad_norm": 1.9707129205098373, + "language_loss": 0.8163017, + "learning_rate": 3.431252329084972e-06, + "loss": 0.89421958, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21240234, + "step": 4470, + "time_per_iteration": 2.542893171310425 + }, + { + "auxiliary_loss_clip": 0.06497125, + "auxiliary_loss_mlp": 0.0128145, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.012619, + "epoch": 0.2688110626784909, + "flos": 21549880465920.0, + "grad_norm": 1.5945085425671264, + "language_loss": 0.83326346, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.91104919, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19555664, + "step": 4471, + "time_per_iteration": 2.5213489532470703 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01284463, + "balance_loss_clip": 0.06289607, + "balance_loss_mlp": 0.01264365, + "epoch": 0.2688711859311589, + "flos": 28407804284160.0, + "grad_norm": 1.9607526414443455, + "language_loss": 0.70046443, + "learning_rate": 3.43070815543947e-06, + "loss": 0.77828562, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.20092773, + "step": 4472, + "time_per_iteration": 2.6251678466796875 + }, + { + "auxiliary_loss_clip": 0.06504884, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 0.06293008, + "balance_loss_mlp": 0.01263112, + "epoch": 0.26893130918382685, + "flos": 26002148762880.0, + "grad_norm": 1.9293915951077794, + "language_loss": 0.68364072, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.76151299, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.19213867, + "step": 4473, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.06499921, + "auxiliary_loss_mlp": 0.01278958, + "balance_loss_clip": 0.06292213, + "balance_loss_mlp": 0.01259467, + "epoch": 0.2689914324364948, + "flos": 20345878748160.0, + "grad_norm": 1.608174101079712, + "language_loss": 0.83682281, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91461158, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.19470215, + "step": 4474, + "time_per_iteration": 2.554151773452759 + }, + { + "auxiliary_loss_clip": 0.06502855, + "auxiliary_loss_mlp": 0.01275806, + "balance_loss_clip": 0.06296148, + "balance_loss_mlp": 0.01256482, + "epoch": 0.2690515556891628, + "flos": 19470759505920.0, + "grad_norm": 1.847749203594977, + "language_loss": 0.70725596, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78504252, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.19348145, + "step": 4475, + "time_per_iteration": 2.5116677284240723 + }, + { + "auxiliary_loss_clip": 0.06503256, + "auxiliary_loss_mlp": 0.01277275, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01257188, + "epoch": 0.26911167894183075, + "flos": 18151875440640.0, + "grad_norm": 2.2814450019498236, + "language_loss": 0.73125452, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.80905986, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20092773, + "step": 4476, + "time_per_iteration": 3.923501968383789 + }, + { + "auxiliary_loss_clip": 0.0650249, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06291398, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2691718021944987, + "flos": 19981385487360.0, + "grad_norm": 1.4862356596427981, + "language_loss": 0.80676347, + "learning_rate": 3.429346772085922e-06, + "loss": 0.88453096, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18762207, + "step": 4477, + "time_per_iteration": 2.562681198120117 + }, + { + "auxiliary_loss_clip": 0.06506729, + "auxiliary_loss_mlp": 0.01275723, + "balance_loss_clip": 0.06289821, + "balance_loss_mlp": 0.01254873, + "epoch": 0.2692319254471667, + "flos": 37455622560000.0, + "grad_norm": 1.8507584096301994, + "language_loss": 0.65612036, + "learning_rate": 3.429074332770984e-06, + "loss": 0.73394483, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20861816, + "step": 4478, + "time_per_iteration": 2.6743321418762207 + }, + { + "auxiliary_loss_clip": 0.06505084, + "auxiliary_loss_mlp": 0.01278495, + "balance_loss_clip": 0.06291381, + "balance_loss_mlp": 0.01259242, + "epoch": 0.26929204869983464, + "flos": 22134411348480.0, + "grad_norm": 2.2415663972983864, + "language_loss": 0.81841063, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89624637, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.19250488, + "step": 4479, + "time_per_iteration": 2.563365936279297 + }, + { + "auxiliary_loss_clip": 0.06510025, + "auxiliary_loss_mlp": 0.01277354, + "balance_loss_clip": 0.06295313, + "balance_loss_mlp": 0.01258305, + "epoch": 0.2693521719525026, + "flos": 19799055002880.0, + "grad_norm": 1.97047433874797, + "language_loss": 0.81362212, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.89149588, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.19055176, + "step": 4480, + "time_per_iteration": 2.505098342895508 + }, + { + "auxiliary_loss_clip": 0.06504171, + "auxiliary_loss_mlp": 0.01276381, + "balance_loss_clip": 0.06296593, + "balance_loss_mlp": 0.01257677, + "epoch": 0.2694122952051706, + "flos": 21000415317120.0, + "grad_norm": 1.6210366032838512, + "language_loss": 0.7826978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.86050338, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18713379, + "step": 4481, + "time_per_iteration": 4.100890874862671 + }, + { + "auxiliary_loss_clip": 0.06511036, + "auxiliary_loss_mlp": 0.01275006, + "balance_loss_clip": 0.06298155, + "balance_loss_mlp": 0.01254192, + "epoch": 0.2694724184578386, + "flos": 25856519166720.0, + "grad_norm": 1.8924674974759383, + "language_loss": 0.74293458, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.820795, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.20788574, + "step": 4482, + "time_per_iteration": 4.145740747451782 + }, + { + "auxiliary_loss_clip": 0.06511661, + "auxiliary_loss_mlp": 0.01276613, + "balance_loss_clip": 0.06299306, + "balance_loss_mlp": 0.01256836, + "epoch": 0.26953254171050656, + "flos": 21733594542720.0, + "grad_norm": 2.48131981073459, + "language_loss": 0.72700799, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.80489069, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19763184, + "step": 4483, + "time_per_iteration": 2.5375680923461914 + }, + { + "auxiliary_loss_clip": 0.06523035, + "auxiliary_loss_mlp": 0.01278438, + "balance_loss_clip": 0.0630566, + "balance_loss_mlp": 0.01257994, + "epoch": 0.2695926649631745, + "flos": 19689078119040.0, + "grad_norm": 2.054691934345778, + "language_loss": 0.87485874, + "learning_rate": 3.427438559239605e-06, + "loss": 0.95287347, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.20446777, + "step": 4484, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.06515766, + "auxiliary_loss_mlp": 0.01278738, + "balance_loss_clip": 0.06300886, + "balance_loss_mlp": 0.01259474, + "epoch": 0.2696527882158425, + "flos": 32894257847040.0, + "grad_norm": 2.0183728032076966, + "language_loss": 0.66971946, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74766451, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19262695, + "step": 4485, + "time_per_iteration": 2.623896598815918 + }, + { + "auxiliary_loss_clip": 0.06514997, + "auxiliary_loss_mlp": 0.01282999, + "balance_loss_clip": 0.06301111, + "balance_loss_mlp": 0.01262877, + "epoch": 0.26971291146851045, + "flos": 12128806177920.0, + "grad_norm": 3.3281733059389498, + "language_loss": 0.74281263, + "learning_rate": 3.426892868256604e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2010498, + "step": 4486, + "time_per_iteration": 2.525820016860962 + }, + { + "auxiliary_loss_clip": 0.06519947, + "auxiliary_loss_mlp": 0.01289409, + "balance_loss_clip": 0.06302445, + "balance_loss_mlp": 0.01268846, + "epoch": 0.2697730347211784, + "flos": 22640467282560.0, + "grad_norm": 2.8316541967285183, + "language_loss": 0.84592897, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.92402256, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.20556641, + "step": 4487, + "time_per_iteration": 3.936244249343872 + }, + { + "auxiliary_loss_clip": 0.06520635, + "auxiliary_loss_mlp": 0.01285695, + "balance_loss_clip": 0.06303369, + "balance_loss_mlp": 0.01264845, + "epoch": 0.2698331579738464, + "flos": 23519695374720.0, + "grad_norm": 2.431656191901387, + "language_loss": 0.73194599, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.81000936, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.20861816, + "step": 4488, + "time_per_iteration": 2.522861957550049 + }, + { + "auxiliary_loss_clip": 0.06516892, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 0.06303044, + "balance_loss_mlp": 0.01258681, + "epoch": 0.26989328122651435, + "flos": 24647360423040.0, + "grad_norm": 1.6427618857215789, + "language_loss": 0.84162384, + "learning_rate": 3.426073925998578e-06, + "loss": 0.91957808, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.1986084, + "step": 4489, + "time_per_iteration": 2.558133602142334 + }, + { + "auxiliary_loss_clip": 0.06523076, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 0.0630331, + "balance_loss_mlp": 0.01265821, + "epoch": 0.2699534044791823, + "flos": 10775904554880.0, + "grad_norm": 2.0847356564254014, + "language_loss": 0.90199494, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.98009604, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.21228027, + "step": 4490, + "time_per_iteration": 2.461840867996216 + }, + { + "auxiliary_loss_clip": 0.06505966, + "auxiliary_loss_mlp": 0.01275421, + "balance_loss_clip": 0.06297465, + "balance_loss_mlp": 0.01256288, + "epoch": 0.2700135277318503, + "flos": 36180021928320.0, + "grad_norm": 2.13129158363681, + "language_loss": 0.73836827, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.81618214, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.19128418, + "step": 4491, + "time_per_iteration": 2.6479640007019043 + }, + { + "auxiliary_loss_clip": 0.06516409, + "auxiliary_loss_mlp": 0.01284517, + "balance_loss_clip": 0.06303698, + "balance_loss_mlp": 0.01264788, + "epoch": 0.27007365098451824, + "flos": 17424020949120.0, + "grad_norm": 2.8438546283757793, + "language_loss": 0.74296927, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.82097852, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 4492, + "time_per_iteration": 2.462226629257202 + }, + { + "auxiliary_loss_clip": 0.06510016, + "auxiliary_loss_mlp": 0.01279369, + "balance_loss_clip": 0.06300159, + "balance_loss_mlp": 0.01259926, + "epoch": 0.2701337742371862, + "flos": 23192448053760.0, + "grad_norm": 1.7359009481863723, + "language_loss": 0.88954818, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.96744204, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19445801, + "step": 4493, + "time_per_iteration": 2.5385639667510986 + }, + { + "auxiliary_loss_clip": 0.06509903, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 0.06296834, + "balance_loss_mlp": 0.01265201, + "epoch": 0.2701938974898542, + "flos": 24396365917440.0, + "grad_norm": 1.3961943163888275, + "language_loss": 0.71571529, + "learning_rate": 3.424707940835998e-06, + "loss": 0.79365045, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1842041, + "step": 4494, + "time_per_iteration": 2.542644500732422 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01282381, + "balance_loss_clip": 0.0629191, + "balance_loss_mlp": 0.01263713, + "epoch": 0.2702540207425222, + "flos": 26221641333120.0, + "grad_norm": 2.6689304552375366, + "language_loss": 0.8697859, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94760156, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.18652344, + "step": 4495, + "time_per_iteration": 2.6052844524383545 + }, + { + "auxiliary_loss_clip": 0.06507061, + "auxiliary_loss_mlp": 0.01284126, + "balance_loss_clip": 0.06293719, + "balance_loss_mlp": 0.01263944, + "epoch": 0.27031414399519016, + "flos": 22932439234560.0, + "grad_norm": 1.7866659337876034, + "language_loss": 0.76608586, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84399772, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.20166016, + "step": 4496, + "time_per_iteration": 2.5191855430603027 + }, + { + "auxiliary_loss_clip": 0.06445029, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06340651, + "balance_loss_mlp": 0.01257498, + "epoch": 0.2703742672478581, + "flos": 63037904912640.0, + "grad_norm": 0.6591771406427821, + "language_loss": 0.49976462, + "learning_rate": 3.423887701354754e-06, + "loss": 0.57683551, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.0456543, + "step": 4497, + "time_per_iteration": 3.2403736114501953 + }, + { + "auxiliary_loss_clip": 0.06506558, + "auxiliary_loss_mlp": 0.01283587, + "balance_loss_clip": 0.06295481, + "balance_loss_mlp": 0.01266039, + "epoch": 0.2704343905005261, + "flos": 18846341280000.0, + "grad_norm": 2.8639988273107657, + "language_loss": 0.72431815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80221957, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.17553711, + "step": 4498, + "time_per_iteration": 2.509298086166382 + }, + { + "auxiliary_loss_clip": 0.06432115, + "auxiliary_loss_mlp": 0.01259251, + "balance_loss_clip": 0.06327531, + "balance_loss_mlp": 0.01254679, + "epoch": 0.27049451375319405, + "flos": 71253635817600.0, + "grad_norm": 0.9422572009255263, + "language_loss": 0.5900467, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.66696036, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04577637, + "step": 4499, + "time_per_iteration": 3.2116270065307617 + }, + { + "auxiliary_loss_clip": 0.06502165, + "auxiliary_loss_mlp": 0.01281307, + "balance_loss_clip": 0.06292122, + "balance_loss_mlp": 0.01261422, + "epoch": 0.270554637005862, + "flos": 24285257003520.0, + "grad_norm": 2.589715304320551, + "language_loss": 0.73975158, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19897461, + "step": 4500, + "time_per_iteration": 2.537710189819336 + }, + { + "auxiliary_loss_clip": 0.06501484, + "auxiliary_loss_mlp": 0.01276741, + "balance_loss_clip": 0.06289591, + "balance_loss_mlp": 0.01257965, + "epoch": 0.27061476025853, + "flos": 17636889047040.0, + "grad_norm": 2.788947169536346, + "language_loss": 0.81470346, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.89248574, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18774414, + "step": 4501, + "time_per_iteration": 2.5423648357391357 + }, + { + "auxiliary_loss_clip": 0.06510358, + "auxiliary_loss_mlp": 0.01287368, + "balance_loss_clip": 0.06294559, + "balance_loss_mlp": 0.01267579, + "epoch": 0.27067488351119795, + "flos": 22716594316800.0, + "grad_norm": 1.5278818221734496, + "language_loss": 0.7303015, + "learning_rate": 3.422519555811735e-06, + "loss": 0.8082788, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.19775391, + "step": 4502, + "time_per_iteration": 2.5804011821746826 + }, + { + "auxiliary_loss_clip": 0.06507368, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06289332, + "balance_loss_mlp": 0.01258576, + "epoch": 0.2707350067638659, + "flos": 41729333806080.0, + "grad_norm": 1.6949775973694576, + "language_loss": 0.69090897, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.76876605, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19763184, + "step": 4503, + "time_per_iteration": 2.740292549133301 + }, + { + "auxiliary_loss_clip": 0.06502387, + "auxiliary_loss_mlp": 0.0128307, + "balance_loss_clip": 0.06290283, + "balance_loss_mlp": 0.01263746, + "epoch": 0.2707951300165339, + "flos": 20199159048960.0, + "grad_norm": 1.9752400870870641, + "language_loss": 0.69172543, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.76958001, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.1932373, + "step": 4504, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.06502561, + "auxiliary_loss_mlp": 0.0128216, + "balance_loss_clip": 0.06291538, + "balance_loss_mlp": 0.01263492, + "epoch": 0.27085525326920185, + "flos": 21440364779520.0, + "grad_norm": 2.9855030089462993, + "language_loss": 0.76122642, + "learning_rate": 3.421698021097902e-06, + "loss": 0.8390736, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18652344, + "step": 4505, + "time_per_iteration": 2.527165651321411 + }, + { + "auxiliary_loss_clip": 0.06505956, + "auxiliary_loss_mlp": 0.0128432, + "balance_loss_clip": 0.06289993, + "balance_loss_mlp": 0.01264459, + "epoch": 0.2709153765218698, + "flos": 17680885240320.0, + "grad_norm": 2.0693026918396487, + "language_loss": 0.73959178, + "learning_rate": 3.42142406835758e-06, + "loss": 0.81749451, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1986084, + "step": 4506, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.0650361, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 0.06290495, + "balance_loss_mlp": 0.01258595, + "epoch": 0.2709754997745378, + "flos": 24462136972800.0, + "grad_norm": 1.8128724600792683, + "language_loss": 0.81647539, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89429414, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1965332, + "step": 4507, + "time_per_iteration": 2.684535503387451 + }, + { + "auxiliary_loss_clip": 0.06395597, + "auxiliary_loss_mlp": 0.01254395, + "balance_loss_clip": 0.0629042, + "balance_loss_mlp": 0.01250205, + "epoch": 0.2710356230272058, + "flos": 65229602232960.0, + "grad_norm": 0.712447813073055, + "language_loss": 0.50718415, + "learning_rate": 3.420876001185698e-06, + "loss": 0.58368409, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04193115, + "step": 4508, + "time_per_iteration": 3.111752986907959 + }, + { + "auxiliary_loss_clip": 0.0649793, + "auxiliary_loss_mlp": 0.01272465, + "balance_loss_clip": 0.06289998, + "balance_loss_mlp": 0.01255263, + "epoch": 0.27109574627987376, + "flos": 25491606635520.0, + "grad_norm": 2.0258218163980213, + "language_loss": 0.75015354, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.82785749, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.171875, + "step": 4509, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.06495094, + "auxiliary_loss_mlp": 0.01275639, + "balance_loss_clip": 0.06289092, + "balance_loss_mlp": 0.01256947, + "epoch": 0.2711558695325417, + "flos": 19688910410880.0, + "grad_norm": 2.3712253737099767, + "language_loss": 0.71864915, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.79635644, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18688965, + "step": 4510, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06499062, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.0629103, + "balance_loss_mlp": 0.012608, + "epoch": 0.2712159927852097, + "flos": 18593627765760.0, + "grad_norm": 2.5496745820614515, + "language_loss": 0.71357799, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.791363, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.1862793, + "step": 4511, + "time_per_iteration": 2.483739137649536 + }, + { + "auxiliary_loss_clip": 0.06505338, + "auxiliary_loss_mlp": 0.01274141, + "balance_loss_clip": 0.06292383, + "balance_loss_mlp": 0.01254817, + "epoch": 0.27127611603787766, + "flos": 25637403939840.0, + "grad_norm": 1.9202075405224084, + "language_loss": 0.81604505, + "learning_rate": 3.419779220367979e-06, + "loss": 0.89383984, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1932373, + "step": 4512, + "time_per_iteration": 2.593388795852661 + }, + { + "auxiliary_loss_clip": 0.06503928, + "auxiliary_loss_mlp": 0.01273233, + "balance_loss_clip": 0.06296667, + "balance_loss_mlp": 0.01255554, + "epoch": 0.2713362392905456, + "flos": 23155663311360.0, + "grad_norm": 1.8072498717910284, + "language_loss": 0.809147, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88691866, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.17663574, + "step": 4513, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.06501831, + "auxiliary_loss_mlp": 0.01278947, + "balance_loss_clip": 0.0628939, + "balance_loss_mlp": 0.01261018, + "epoch": 0.2713963625432136, + "flos": 18371409937920.0, + "grad_norm": 3.81368034370299, + "language_loss": 0.88867396, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.96648169, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17932129, + "step": 4514, + "time_per_iteration": 2.54484224319458 + }, + { + "auxiliary_loss_clip": 0.06502509, + "auxiliary_loss_mlp": 0.01277056, + "balance_loss_clip": 0.06292502, + "balance_loss_mlp": 0.01258709, + "epoch": 0.27145648579588155, + "flos": 22498275703680.0, + "grad_norm": 1.610354502574947, + "language_loss": 0.92402363, + "learning_rate": 3.418956069417517e-06, + "loss": 1.00181937, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.18347168, + "step": 4515, + "time_per_iteration": 2.5121350288391113 + }, + { + "auxiliary_loss_clip": 0.06511631, + "auxiliary_loss_mlp": 0.01281138, + "balance_loss_clip": 0.06296228, + "balance_loss_mlp": 0.01259669, + "epoch": 0.2715166090485495, + "flos": 19244265120000.0, + "grad_norm": 2.423654901761582, + "language_loss": 0.73979908, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.81772685, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.21435547, + "step": 4516, + "time_per_iteration": 3.917318344116211 + }, + { + "auxiliary_loss_clip": 0.06498563, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289151, + "balance_loss_mlp": 0.01253627, + "epoch": 0.2715767323012175, + "flos": 17714902798080.0, + "grad_norm": 1.854313921742246, + "language_loss": 0.76927733, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84699214, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19287109, + "step": 4517, + "time_per_iteration": 2.576723098754883 + }, + { + "auxiliary_loss_clip": 0.06500702, + "auxiliary_loss_mlp": 0.01276287, + "balance_loss_clip": 0.06291518, + "balance_loss_mlp": 0.01256701, + "epoch": 0.27163685555388545, + "flos": 22389430849920.0, + "grad_norm": 2.0334929641517956, + "language_loss": 0.7833634, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.86113334, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19592285, + "step": 4518, + "time_per_iteration": 2.5335004329681396 + }, + { + "auxiliary_loss_clip": 0.06502728, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06292961, + "balance_loss_mlp": 0.0125925, + "epoch": 0.2716969788065534, + "flos": 22353358867200.0, + "grad_norm": 1.6261203259974584, + "language_loss": 0.68873644, + "learning_rate": 3.41785778156811e-06, + "loss": 0.76653063, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17443848, + "step": 4519, + "time_per_iteration": 2.60939359664917 + }, + { + "auxiliary_loss_clip": 0.06500532, + "auxiliary_loss_mlp": 0.0127723, + "balance_loss_clip": 0.06291862, + "balance_loss_mlp": 0.01260302, + "epoch": 0.2717571020592214, + "flos": 25235497031040.0, + "grad_norm": 1.9620818548787327, + "language_loss": 0.75925875, + "learning_rate": 3.417583075166451e-06, + "loss": 0.83703637, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16931152, + "step": 4520, + "time_per_iteration": 3.988518238067627 + }, + { + "auxiliary_loss_clip": 0.06503896, + "auxiliary_loss_mlp": 0.012736, + "balance_loss_clip": 0.06291716, + "balance_loss_mlp": 0.01253942, + "epoch": 0.2718172253118894, + "flos": 20195343688320.0, + "grad_norm": 3.05783023991908, + "language_loss": 0.76690799, + "learning_rate": 3.4173083150099e-06, + "loss": 0.84468293, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1965332, + "step": 4521, + "time_per_iteration": 3.9463987350463867 + }, + { + "auxiliary_loss_clip": 0.0650706, + "auxiliary_loss_mlp": 0.0127528, + "balance_loss_clip": 0.06291709, + "balance_loss_mlp": 0.01255432, + "epoch": 0.27187734856455736, + "flos": 14324318858880.0, + "grad_norm": 2.0792585055499435, + "language_loss": 0.74927616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.82709956, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.19824219, + "step": 4522, + "time_per_iteration": 2.576792001724243 + }, + { + "auxiliary_loss_clip": 0.06503602, + "auxiliary_loss_mlp": 0.01276885, + "balance_loss_clip": 0.06291734, + "balance_loss_mlp": 0.01258884, + "epoch": 0.27193747181722533, + "flos": 21114375269760.0, + "grad_norm": 1.7974712998396492, + "language_loss": 0.73055947, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80836433, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17993164, + "step": 4523, + "time_per_iteration": 2.5116758346557617 + }, + { + "auxiliary_loss_clip": 0.06493908, + "auxiliary_loss_mlp": 0.01278011, + "balance_loss_clip": 0.06286807, + "balance_loss_mlp": 0.01259665, + "epoch": 0.2719975950698933, + "flos": 19688910410880.0, + "grad_norm": 1.3231652709358832, + "language_loss": 0.74779463, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82551384, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.18334961, + "step": 4524, + "time_per_iteration": 2.5318901538848877 + }, + { + "auxiliary_loss_clip": 0.06503987, + "auxiliary_loss_mlp": 0.01277059, + "balance_loss_clip": 0.06291917, + "balance_loss_mlp": 0.01258248, + "epoch": 0.27205771832256126, + "flos": 24761488083840.0, + "grad_norm": 2.222226091972884, + "language_loss": 0.76783192, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.84564239, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.18811035, + "step": 4525, + "time_per_iteration": 2.594209909439087 + }, + { + "auxiliary_loss_clip": 0.06492639, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.0628486, + "balance_loss_mlp": 0.01254712, + "epoch": 0.2721178415752292, + "flos": 21760903774080.0, + "grad_norm": 1.8877793172534498, + "language_loss": 0.82166058, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.89930463, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17041016, + "step": 4526, + "time_per_iteration": 3.9739785194396973 + }, + { + "auxiliary_loss_clip": 0.06510428, + "auxiliary_loss_mlp": 0.01273954, + "balance_loss_clip": 0.06292043, + "balance_loss_mlp": 0.01254189, + "epoch": 0.2721779648278972, + "flos": 12681667416960.0, + "grad_norm": 2.608637418907724, + "language_loss": 0.77407986, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.8519237, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.19763184, + "step": 4527, + "time_per_iteration": 2.5017969608306885 + }, + { + "auxiliary_loss_clip": 0.06502572, + "auxiliary_loss_mlp": 0.01278457, + "balance_loss_clip": 0.0629287, + "balance_loss_mlp": 0.01260194, + "epoch": 0.27223808808056515, + "flos": 16258774544640.0, + "grad_norm": 2.1231016049423608, + "language_loss": 0.82676923, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90457952, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18249512, + "step": 4528, + "time_per_iteration": 2.5011186599731445 + }, + { + "auxiliary_loss_clip": 0.06500327, + "auxiliary_loss_mlp": 0.012781, + "balance_loss_clip": 0.06293638, + "balance_loss_mlp": 0.01260064, + "epoch": 0.2722982113332331, + "flos": 27753225788160.0, + "grad_norm": 1.6573852241711216, + "language_loss": 0.77553773, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85332191, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18041992, + "step": 4529, + "time_per_iteration": 2.5810396671295166 + }, + { + "auxiliary_loss_clip": 0.06499013, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01259055, + "epoch": 0.2723583345859011, + "flos": 21732756001920.0, + "grad_norm": 2.1115027178358354, + "language_loss": 0.82665265, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.90441489, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18164062, + "step": 4530, + "time_per_iteration": 2.586454391479492 + }, + { + "auxiliary_loss_clip": 0.06502904, + "auxiliary_loss_mlp": 0.01282339, + "balance_loss_clip": 0.06295159, + "balance_loss_mlp": 0.0126379, + "epoch": 0.27241845783856905, + "flos": 17352925159680.0, + "grad_norm": 2.154635693147181, + "language_loss": 0.92694783, + "learning_rate": 3.4145577592184838e-06, + "loss": 1.0048002, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18530273, + "step": 4531, + "time_per_iteration": 2.5160703659057617 + }, + { + "auxiliary_loss_clip": 0.06501545, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01257928, + "epoch": 0.272478581091237, + "flos": 24761278448640.0, + "grad_norm": 1.903467624841223, + "language_loss": 0.76781744, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84559143, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.17919922, + "step": 4532, + "time_per_iteration": 2.568319082260132 + }, + { + "auxiliary_loss_clip": 0.06500092, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.0125448, + "epoch": 0.272538704343905, + "flos": 17895723909120.0, + "grad_norm": 2.5230523304945685, + "language_loss": 0.89717656, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97489792, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17565918, + "step": 4533, + "time_per_iteration": 2.538637399673462 + }, + { + "auxiliary_loss_clip": 0.06497633, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06294405, + "balance_loss_mlp": 0.01255559, + "epoch": 0.272598827596573, + "flos": 22939021779840.0, + "grad_norm": 1.9282389689502992, + "language_loss": 0.72213519, + "learning_rate": 3.413731546022929e-06, + "loss": 0.79983306, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16589355, + "step": 4534, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.06500763, + "auxiliary_loss_mlp": 0.01275564, + "balance_loss_clip": 0.06290451, + "balance_loss_mlp": 0.01255847, + "epoch": 0.27265895084924097, + "flos": 24244447265280.0, + "grad_norm": 1.8514773269853142, + "language_loss": 0.91784394, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99560714, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.19702148, + "step": 4535, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.06506651, + "auxiliary_loss_mlp": 0.01276542, + "balance_loss_clip": 0.06297188, + "balance_loss_mlp": 0.01258768, + "epoch": 0.27271907410190893, + "flos": 27019962708480.0, + "grad_norm": 1.7799258806344853, + "language_loss": 0.73195565, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80978757, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.17773438, + "step": 4536, + "time_per_iteration": 2.5590782165527344 + }, + { + "auxiliary_loss_clip": 0.06502935, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.0629502, + "balance_loss_mlp": 0.01257351, + "epoch": 0.2727791973545769, + "flos": 34460027568000.0, + "grad_norm": 1.8462150885541477, + "language_loss": 0.72167033, + "learning_rate": 3.41290485034781e-06, + "loss": 0.79945225, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17907715, + "step": 4537, + "time_per_iteration": 2.680515766143799 + }, + { + "auxiliary_loss_clip": 0.06501988, + "auxiliary_loss_mlp": 0.01276469, + "balance_loss_clip": 0.06293489, + "balance_loss_mlp": 0.0125829, + "epoch": 0.27283932060724486, + "flos": 15045842367360.0, + "grad_norm": 2.3888098238231503, + "language_loss": 0.78421736, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8620019, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.1817627, + "step": 4538, + "time_per_iteration": 2.4626059532165527 + }, + { + "auxiliary_loss_clip": 0.06506806, + "auxiliary_loss_mlp": 0.01275863, + "balance_loss_clip": 0.06298484, + "balance_loss_mlp": 0.01258566, + "epoch": 0.2728994438599128, + "flos": 21658767246720.0, + "grad_norm": 1.6357140094020364, + "language_loss": 0.90640903, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9842357, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17297363, + "step": 4539, + "time_per_iteration": 2.5629584789276123 + }, + { + "auxiliary_loss_clip": 0.06501281, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06294584, + "balance_loss_mlp": 0.01253778, + "epoch": 0.2729595671125808, + "flos": 17493313875840.0, + "grad_norm": 1.7229738452441967, + "language_loss": 0.88610893, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.96385098, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.19140625, + "step": 4540, + "time_per_iteration": 2.4959304332733154 + }, + { + "auxiliary_loss_clip": 0.06504017, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06294081, + "balance_loss_mlp": 0.0125744, + "epoch": 0.27301969036524876, + "flos": 19324249441920.0, + "grad_norm": 2.2191409784662, + "language_loss": 0.8242712, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.17053223, + "step": 4541, + "time_per_iteration": 2.550239086151123 + }, + { + "auxiliary_loss_clip": 0.06500127, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06291916, + "balance_loss_mlp": 0.01260431, + "epoch": 0.2730798136179167, + "flos": 21071427252480.0, + "grad_norm": 2.3060281935178795, + "language_loss": 0.80131608, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18664551, + "step": 4542, + "time_per_iteration": 2.519717216491699 + }, + { + "auxiliary_loss_clip": 0.06509651, + "auxiliary_loss_mlp": 0.01276731, + "balance_loss_clip": 0.06301565, + "balance_loss_mlp": 0.01258599, + "epoch": 0.2731399368705847, + "flos": 19177739377920.0, + "grad_norm": 1.9524817452008785, + "language_loss": 0.89606124, + "learning_rate": 3.411250012687582e-06, + "loss": 0.97392499, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18139648, + "step": 4543, + "time_per_iteration": 2.5182156562805176 + }, + { + "auxiliary_loss_clip": 0.06509942, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06297313, + "balance_loss_mlp": 0.012604, + "epoch": 0.27320006012325265, + "flos": 18294989414400.0, + "grad_norm": 2.101118642115193, + "language_loss": 0.64112943, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7190212, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.18823242, + "step": 4544, + "time_per_iteration": 2.482348918914795 + }, + { + "auxiliary_loss_clip": 0.06504791, + "auxiliary_loss_mlp": 0.01282982, + "balance_loss_clip": 0.06296986, + "balance_loss_mlp": 0.01264231, + "epoch": 0.2732601833759206, + "flos": 34869607125120.0, + "grad_norm": 1.6845842729353224, + "language_loss": 0.70290005, + "learning_rate": 3.410697971904651e-06, + "loss": 0.78077781, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.1875, + "step": 4545, + "time_per_iteration": 2.6779940128326416 + }, + { + "auxiliary_loss_clip": 0.06375119, + "auxiliary_loss_mlp": 0.01256033, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01252296, + "epoch": 0.2733203066285886, + "flos": 53929514534400.0, + "grad_norm": 0.7176798913576009, + "language_loss": 0.61676908, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6930806, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03729248, + "step": 4546, + "time_per_iteration": 3.1508243083953857 + }, + { + "auxiliary_loss_clip": 0.06510071, + "auxiliary_loss_mlp": 0.01277702, + "balance_loss_clip": 0.06301852, + "balance_loss_mlp": 0.01258843, + "epoch": 0.2733804298812566, + "flos": 20665411493760.0, + "grad_norm": 1.9095347334938924, + "language_loss": 0.65170372, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72958136, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.1887207, + "step": 4547, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.06498976, + "auxiliary_loss_mlp": 0.0127425, + "balance_loss_clip": 0.06296893, + "balance_loss_mlp": 0.01257799, + "epoch": 0.27344055313392457, + "flos": 25891333338240.0, + "grad_norm": 2.438857151480637, + "language_loss": 0.78365928, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.86139154, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.16455078, + "step": 4548, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.0650417, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06295689, + "balance_loss_mlp": 0.01259785, + "epoch": 0.27350067638659253, + "flos": 22936380376320.0, + "grad_norm": 2.3129649243249157, + "language_loss": 0.83350241, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91131258, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17053223, + "step": 4549, + "time_per_iteration": 2.560349941253662 + }, + { + "auxiliary_loss_clip": 0.06503863, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06292209, + "balance_loss_mlp": 0.0125707, + "epoch": 0.2735607996392605, + "flos": 16579313539200.0, + "grad_norm": 2.1355332193902568, + "language_loss": 0.71687186, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.79468852, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.20727539, + "step": 4550, + "time_per_iteration": 2.4829771518707275 + }, + { + "auxiliary_loss_clip": 0.06503724, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06298332, + "balance_loss_mlp": 0.01253435, + "epoch": 0.27362092289192846, + "flos": 19651245200640.0, + "grad_norm": 2.4590448673698546, + "language_loss": 0.79561722, + "learning_rate": 3.409040566039563e-06, + "loss": 0.87337267, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.18371582, + "step": 4551, + "time_per_iteration": 2.5074269771575928 + }, + { + "auxiliary_loss_clip": 0.06500211, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 0.06290769, + "balance_loss_mlp": 0.01263565, + "epoch": 0.27368104614459643, + "flos": 17644855184640.0, + "grad_norm": 2.2858009613836465, + "language_loss": 0.71362597, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.79144663, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.18286133, + "step": 4552, + "time_per_iteration": 2.478208541870117 + }, + { + "auxiliary_loss_clip": 0.0650662, + "auxiliary_loss_mlp": 0.01277463, + "balance_loss_clip": 0.06295393, + "balance_loss_mlp": 0.01258759, + "epoch": 0.2737411693972644, + "flos": 21586455573120.0, + "grad_norm": 1.8660820035104149, + "language_loss": 0.71756262, + "learning_rate": 3.408487669858431e-06, + "loss": 0.79540348, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18701172, + "step": 4553, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.0650337, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06293483, + "balance_loss_mlp": 0.01255738, + "epoch": 0.27380129264993236, + "flos": 25491145438080.0, + "grad_norm": 1.7561499880950933, + "language_loss": 0.60065031, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.67843306, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.19177246, + "step": 4554, + "time_per_iteration": 2.5836522579193115 + }, + { + "auxiliary_loss_clip": 0.06509934, + "auxiliary_loss_mlp": 0.01281174, + "balance_loss_clip": 0.06291255, + "balance_loss_mlp": 0.01261838, + "epoch": 0.2738614159026003, + "flos": 18667155323520.0, + "grad_norm": 1.5632450212680145, + "language_loss": 0.74850649, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.82641757, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.1932373, + "step": 4555, + "time_per_iteration": 3.9590039253234863 + }, + { + "auxiliary_loss_clip": 0.06511028, + "auxiliary_loss_mlp": 0.01279514, + "balance_loss_clip": 0.0629926, + "balance_loss_mlp": 0.0125982, + "epoch": 0.2739215391552683, + "flos": 23483874954240.0, + "grad_norm": 6.994475758797384, + "language_loss": 0.7822473, + "learning_rate": 3.407657925038002e-06, + "loss": 0.86015272, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19677734, + "step": 4556, + "time_per_iteration": 2.5688674449920654 + }, + { + "auxiliary_loss_clip": 0.06517123, + "auxiliary_loss_mlp": 0.01280796, + "balance_loss_clip": 0.06293104, + "balance_loss_mlp": 0.01260125, + "epoch": 0.27398166240793626, + "flos": 17134313057280.0, + "grad_norm": 1.8677949115203087, + "language_loss": 0.83077759, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.90875673, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.20690918, + "step": 4557, + "time_per_iteration": 2.490562915802002 + }, + { + "auxiliary_loss_clip": 0.06504503, + "auxiliary_loss_mlp": 0.01276773, + "balance_loss_clip": 0.06292793, + "balance_loss_mlp": 0.01256292, + "epoch": 0.2740417856606042, + "flos": 23411563280640.0, + "grad_norm": 1.9738441909854203, + "language_loss": 0.73066616, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.80847895, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.20483398, + "step": 4558, + "time_per_iteration": 2.5761232376098633 + }, + { + "auxiliary_loss_clip": 0.06504066, + "auxiliary_loss_mlp": 0.01276845, + "balance_loss_clip": 0.06292865, + "balance_loss_mlp": 0.01256651, + "epoch": 0.2741019089132722, + "flos": 12784307068800.0, + "grad_norm": 2.149984670873407, + "language_loss": 0.68751299, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76532209, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.2019043, + "step": 4559, + "time_per_iteration": 2.4976439476013184 + }, + { + "auxiliary_loss_clip": 0.06501673, + "auxiliary_loss_mlp": 0.01278249, + "balance_loss_clip": 0.0629222, + "balance_loss_mlp": 0.01259676, + "epoch": 0.27416203216594015, + "flos": 20637850700160.0, + "grad_norm": 1.7403202614473876, + "language_loss": 0.72741163, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.80521083, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18566895, + "step": 4560, + "time_per_iteration": 4.005557537078857 + }, + { + "auxiliary_loss_clip": 0.06501405, + "auxiliary_loss_mlp": 0.01278052, + "balance_loss_clip": 0.06289977, + "balance_loss_mlp": 0.01259718, + "epoch": 0.27422215541860817, + "flos": 26548762872960.0, + "grad_norm": 1.7791790627265829, + "language_loss": 0.82245278, + "learning_rate": 3.406273949573303e-06, + "loss": 0.90024734, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18334961, + "step": 4561, + "time_per_iteration": 4.059048652648926 + }, + { + "auxiliary_loss_clip": 0.06510133, + "auxiliary_loss_mlp": 0.01276094, + "balance_loss_clip": 0.06296331, + "balance_loss_mlp": 0.012564, + "epoch": 0.27428227867127614, + "flos": 23337868014720.0, + "grad_norm": 1.9098162884662422, + "language_loss": 0.75760031, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83546257, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19702148, + "step": 4562, + "time_per_iteration": 2.558397054672241 + }, + { + "auxiliary_loss_clip": 0.06506505, + "auxiliary_loss_mlp": 0.01277189, + "balance_loss_clip": 0.06293164, + "balance_loss_mlp": 0.01258092, + "epoch": 0.2743424019239441, + "flos": 23041074453120.0, + "grad_norm": 1.577834756327151, + "language_loss": 0.75198597, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.8298229, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19091797, + "step": 4563, + "time_per_iteration": 2.5698354244232178 + }, + { + "auxiliary_loss_clip": 0.06524341, + "auxiliary_loss_mlp": 0.01283879, + "balance_loss_clip": 0.06305183, + "balance_loss_mlp": 0.01262302, + "epoch": 0.27440252517661207, + "flos": 21987565868160.0, + "grad_norm": 2.0193615345580085, + "language_loss": 0.6348893, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.71297145, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.21569824, + "step": 4564, + "time_per_iteration": 2.545741558074951 + }, + { + "auxiliary_loss_clip": 0.06513885, + "auxiliary_loss_mlp": 0.01280066, + "balance_loss_clip": 0.06299828, + "balance_loss_mlp": 0.01260647, + "epoch": 0.27446264842928003, + "flos": 40196952737280.0, + "grad_norm": 2.2005709679787153, + "language_loss": 0.7878077, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.86574721, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 4565, + "time_per_iteration": 2.7061169147491455 + }, + { + "auxiliary_loss_clip": 0.0650921, + "auxiliary_loss_mlp": 0.01277346, + "balance_loss_clip": 0.06296623, + "balance_loss_mlp": 0.01257903, + "epoch": 0.274522771681948, + "flos": 13484684620800.0, + "grad_norm": 1.9604173340299715, + "language_loss": 0.69729757, + "learning_rate": 3.404888640957477e-06, + "loss": 0.77516317, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19458008, + "step": 4566, + "time_per_iteration": 3.9156126976013184 + }, + { + "auxiliary_loss_clip": 0.06511474, + "auxiliary_loss_mlp": 0.0128161, + "balance_loss_clip": 0.06300822, + "balance_loss_mlp": 0.0126318, + "epoch": 0.27458289493461596, + "flos": 28629812476800.0, + "grad_norm": 1.605297231279352, + "language_loss": 0.61699307, + "learning_rate": 3.404611419371723e-06, + "loss": 0.69492388, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18432617, + "step": 4567, + "time_per_iteration": 2.5721306800842285 + }, + { + "auxiliary_loss_clip": 0.06514515, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06299441, + "balance_loss_mlp": 0.01255511, + "epoch": 0.2746430181872839, + "flos": 20125883053440.0, + "grad_norm": 1.9422441687055725, + "language_loss": 0.83055782, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.90845764, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.19970703, + "step": 4568, + "time_per_iteration": 2.5616700649261475 + }, + { + "auxiliary_loss_clip": 0.06521738, + "auxiliary_loss_mlp": 0.01275653, + "balance_loss_clip": 0.06304733, + "balance_loss_mlp": 0.01255709, + "epoch": 0.2747031414399519, + "flos": 20199662173440.0, + "grad_norm": 2.1285143693034367, + "language_loss": 0.6896143, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.76758814, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.19934082, + "step": 4569, + "time_per_iteration": 2.531096935272217 + }, + { + "auxiliary_loss_clip": 0.06517979, + "auxiliary_loss_mlp": 0.01281496, + "balance_loss_clip": 0.06303072, + "balance_loss_mlp": 0.0126216, + "epoch": 0.27476326469261986, + "flos": 13521385509120.0, + "grad_norm": 2.4613635331126926, + "language_loss": 0.71897286, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.79696763, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.19360352, + "step": 4570, + "time_per_iteration": 2.5235774517059326 + }, + { + "auxiliary_loss_clip": 0.06414898, + "auxiliary_loss_mlp": 0.01257276, + "balance_loss_clip": 0.06312878, + "balance_loss_mlp": 0.01253897, + "epoch": 0.2748233879452878, + "flos": 65955486153600.0, + "grad_norm": 0.6977768363268191, + "language_loss": 0.5577414, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.63446319, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.03387451, + "step": 4571, + "time_per_iteration": 3.234433889389038 + }, + { + "auxiliary_loss_clip": 0.06526154, + "auxiliary_loss_mlp": 0.01279423, + "balance_loss_clip": 0.06308736, + "balance_loss_mlp": 0.01260326, + "epoch": 0.2748835111979558, + "flos": 17389961464320.0, + "grad_norm": 2.165338105639142, + "language_loss": 0.78105313, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.85910892, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.19104004, + "step": 4572, + "time_per_iteration": 2.562450647354126 + }, + { + "auxiliary_loss_clip": 0.06506811, + "auxiliary_loss_mlp": 0.01278507, + "balance_loss_clip": 0.06298923, + "balance_loss_mlp": 0.01261711, + "epoch": 0.27494363445062375, + "flos": 23594480743680.0, + "grad_norm": 2.0912194071895014, + "language_loss": 0.81855798, + "learning_rate": 3.402946971702147e-06, + "loss": 0.89641118, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.16809082, + "step": 4573, + "time_per_iteration": 2.575467824935913 + }, + { + "auxiliary_loss_clip": 0.06512269, + "auxiliary_loss_mlp": 0.01277933, + "balance_loss_clip": 0.06303579, + "balance_loss_mlp": 0.01258585, + "epoch": 0.2750037577032918, + "flos": 17170175404800.0, + "grad_norm": 1.5550185346959569, + "language_loss": 0.79688454, + "learning_rate": 3.402669377496223e-06, + "loss": 0.87478662, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19360352, + "step": 4574, + "time_per_iteration": 2.522381067276001 + }, + { + "auxiliary_loss_clip": 0.06514049, + "auxiliary_loss_mlp": 0.012813, + "balance_loss_clip": 0.06300252, + "balance_loss_mlp": 0.01263383, + "epoch": 0.27506388095595974, + "flos": 24497663904000.0, + "grad_norm": 1.9638366231768782, + "language_loss": 0.75217533, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83012879, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.17907715, + "step": 4575, + "time_per_iteration": 2.564023971557617 + }, + { + "auxiliary_loss_clip": 0.06513455, + "auxiliary_loss_mlp": 0.01285217, + "balance_loss_clip": 0.06304657, + "balance_loss_mlp": 0.01267562, + "epoch": 0.2751240042086277, + "flos": 38774003500800.0, + "grad_norm": 1.5894976166299741, + "language_loss": 0.71788073, + "learning_rate": 3.402114029526814e-06, + "loss": 0.79586744, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17663574, + "step": 4576, + "time_per_iteration": 2.6856141090393066 + }, + { + "auxiliary_loss_clip": 0.06515673, + "auxiliary_loss_mlp": 0.01294199, + "balance_loss_clip": 0.06304252, + "balance_loss_mlp": 0.0127447, + "epoch": 0.27518412746129567, + "flos": 26914388163840.0, + "grad_norm": 1.693116107866749, + "language_loss": 0.73358452, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81168324, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19726562, + "step": 4577, + "time_per_iteration": 2.5795719623565674 + }, + { + "auxiliary_loss_clip": 0.06517484, + "auxiliary_loss_mlp": 0.01279945, + "balance_loss_clip": 0.0630409, + "balance_loss_mlp": 0.01260514, + "epoch": 0.27524425071396363, + "flos": 24907578877440.0, + "grad_norm": 1.9498672791378742, + "language_loss": 0.76234132, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84031564, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19433594, + "step": 4578, + "time_per_iteration": 2.5547378063201904 + }, + { + "auxiliary_loss_clip": 0.06518476, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01265255, + "epoch": 0.2753043739666316, + "flos": 26295504307200.0, + "grad_norm": 1.3718100748583155, + "language_loss": 0.66504484, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.74309289, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21069336, + "step": 4579, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.06516613, + "auxiliary_loss_mlp": 0.01291851, + "balance_loss_clip": 0.06301446, + "balance_loss_mlp": 0.01271753, + "epoch": 0.27536449721929956, + "flos": 24213616162560.0, + "grad_norm": 3.1986582184359853, + "language_loss": 0.80722374, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88530838, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2010498, + "step": 4580, + "time_per_iteration": 2.571364164352417 + }, + { + "auxiliary_loss_clip": 0.06513728, + "auxiliary_loss_mlp": 0.01285107, + "balance_loss_clip": 0.06304168, + "balance_loss_mlp": 0.01264305, + "epoch": 0.27542462047196753, + "flos": 19543448522880.0, + "grad_norm": 1.580662182314359, + "language_loss": 0.68234229, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76033062, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.20788574, + "step": 4581, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.06515522, + "auxiliary_loss_mlp": 0.01276377, + "balance_loss_clip": 0.06298342, + "balance_loss_mlp": 0.01258448, + "epoch": 0.2754847437246355, + "flos": 14324360785920.0, + "grad_norm": 1.5474830525473977, + "language_loss": 0.78408682, + "learning_rate": 3.400446709916392e-06, + "loss": 0.86200583, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.17919922, + "step": 4582, + "time_per_iteration": 2.511134624481201 + }, + { + "auxiliary_loss_clip": 0.06505451, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06298563, + "balance_loss_mlp": 0.01266605, + "epoch": 0.27554486697730346, + "flos": 18843951438720.0, + "grad_norm": 1.627014419094476, + "language_loss": 0.84829235, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92618936, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17663574, + "step": 4583, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.065156, + "auxiliary_loss_mlp": 0.01295136, + "balance_loss_clip": 0.0629985, + "balance_loss_mlp": 0.01274799, + "epoch": 0.2756049902299714, + "flos": 22388801944320.0, + "grad_norm": 2.5216327683147104, + "language_loss": 0.67592049, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.75402784, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.20349121, + "step": 4584, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.06508277, + "auxiliary_loss_mlp": 0.01286302, + "balance_loss_clip": 0.06299593, + "balance_loss_mlp": 0.01268385, + "epoch": 0.2756651134826394, + "flos": 19580107484160.0, + "grad_norm": 1.7056038485870715, + "language_loss": 0.77640843, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8543542, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17919922, + "step": 4585, + "time_per_iteration": 2.5581910610198975 + }, + { + "auxiliary_loss_clip": 0.06520131, + "auxiliary_loss_mlp": 0.01290999, + "balance_loss_clip": 0.06302814, + "balance_loss_mlp": 0.01271151, + "epoch": 0.27572523673530736, + "flos": 23593306786560.0, + "grad_norm": 1.6012607614221503, + "language_loss": 0.72652835, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8046397, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.1986084, + "step": 4586, + "time_per_iteration": 2.5581955909729004 + }, + { + "auxiliary_loss_clip": 0.06512299, + "auxiliary_loss_mlp": 0.01283131, + "balance_loss_clip": 0.06300563, + "balance_loss_mlp": 0.01264475, + "epoch": 0.2757853599879754, + "flos": 22826696981760.0, + "grad_norm": 1.4211606049909042, + "language_loss": 0.8102116, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88816595, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.18664551, + "step": 4587, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.0651072, + "auxiliary_loss_mlp": 0.01292397, + "balance_loss_clip": 0.06300361, + "balance_loss_mlp": 0.01273037, + "epoch": 0.27584548324064334, + "flos": 18557639637120.0, + "grad_norm": 2.3677019636161716, + "language_loss": 0.83699477, + "learning_rate": 3.398777478523316e-06, + "loss": 0.91502589, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.19348145, + "step": 4588, + "time_per_iteration": 2.5100526809692383 + }, + { + "auxiliary_loss_clip": 0.06502403, + "auxiliary_loss_mlp": 0.01287014, + "balance_loss_clip": 0.06294176, + "balance_loss_mlp": 0.0126856, + "epoch": 0.2759056064933113, + "flos": 23776811228160.0, + "grad_norm": 1.8520309888563375, + "language_loss": 0.76066566, + "learning_rate": 3.398499087583342e-06, + "loss": 0.83855987, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.18457031, + "step": 4589, + "time_per_iteration": 2.5906028747558594 + }, + { + "auxiliary_loss_clip": 0.06503198, + "auxiliary_loss_mlp": 0.01281135, + "balance_loss_clip": 0.06293473, + "balance_loss_mlp": 0.01261703, + "epoch": 0.27596572974597927, + "flos": 24289114291200.0, + "grad_norm": 1.7619688929899446, + "language_loss": 0.88857687, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96642017, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19421387, + "step": 4590, + "time_per_iteration": 2.5526933670043945 + }, + { + "auxiliary_loss_clip": 0.0650104, + "auxiliary_loss_mlp": 0.01279948, + "balance_loss_clip": 0.06291595, + "balance_loss_mlp": 0.01261041, + "epoch": 0.27602585299864724, + "flos": 35049296206080.0, + "grad_norm": 1.573202994920717, + "language_loss": 0.71835011, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79615998, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.18908691, + "step": 4591, + "time_per_iteration": 2.659573554992676 + }, + { + "auxiliary_loss_clip": 0.06502488, + "auxiliary_loss_mlp": 0.01277501, + "balance_loss_clip": 0.06290874, + "balance_loss_mlp": 0.01258964, + "epoch": 0.2760859762513152, + "flos": 24315123784320.0, + "grad_norm": 2.0980893762293866, + "language_loss": 0.80327255, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.8810724, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.18530273, + "step": 4592, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06302959, + "balance_loss_mlp": 0.0126841, + "epoch": 0.27614609950398317, + "flos": 71279435675520.0, + "grad_norm": 0.6848268802880488, + "language_loss": 0.6162945, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.69306767, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03717041, + "step": 4593, + "time_per_iteration": 3.127192735671997 + }, + { + "auxiliary_loss_clip": 0.06506699, + "auxiliary_loss_mlp": 0.01276217, + "balance_loss_clip": 0.0629646, + "balance_loss_mlp": 0.01256881, + "epoch": 0.27620622275665113, + "flos": 29681811688320.0, + "grad_norm": 2.6081053554454363, + "language_loss": 0.77380788, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.85163713, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.1932373, + "step": 4594, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06503148, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06295307, + "balance_loss_mlp": 0.01255138, + "epoch": 0.2762663460093191, + "flos": 15383571448320.0, + "grad_norm": 1.4453472339612206, + "language_loss": 0.9229176, + "learning_rate": 3.3968276286573866e-06, + "loss": 1.00068069, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18029785, + "step": 4595, + "time_per_iteration": 3.9466536045074463 + }, + { + "auxiliary_loss_clip": 0.06509015, + "auxiliary_loss_mlp": 0.01281786, + "balance_loss_clip": 0.06294905, + "balance_loss_mlp": 0.01261592, + "epoch": 0.27632646926198706, + "flos": 20710330081920.0, + "grad_norm": 1.8151181533722092, + "language_loss": 0.69491673, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.77282476, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2019043, + "step": 4596, + "time_per_iteration": 2.552893877029419 + }, + { + "auxiliary_loss_clip": 0.06517404, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.0629788, + "balance_loss_mlp": 0.0125382, + "epoch": 0.276386592514655, + "flos": 32820981851520.0, + "grad_norm": 1.6734752779014743, + "language_loss": 0.64091378, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.71881258, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.18652344, + "step": 4597, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.06500123, + "auxiliary_loss_mlp": 0.01279427, + "balance_loss_clip": 0.0629456, + "balance_loss_mlp": 0.01260616, + "epoch": 0.276446715767323, + "flos": 18557639637120.0, + "grad_norm": 1.8925825739150304, + "language_loss": 0.86690855, + "learning_rate": 3.395991183985887e-06, + "loss": 0.94470406, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18835449, + "step": 4598, + "time_per_iteration": 2.5411598682403564 + }, + { + "auxiliary_loss_clip": 0.0650408, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.06291056, + "balance_loss_mlp": 0.01256554, + "epoch": 0.27650683901999096, + "flos": 22826110003200.0, + "grad_norm": 2.378506410601605, + "language_loss": 0.79588032, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8736738, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18725586, + "step": 4599, + "time_per_iteration": 2.515411138534546 + }, + { + "auxiliary_loss_clip": 0.06518425, + "auxiliary_loss_mlp": 0.01279235, + "balance_loss_clip": 0.06301137, + "balance_loss_mlp": 0.01259756, + "epoch": 0.276566962272659, + "flos": 21368011178880.0, + "grad_norm": 2.1602669865212487, + "language_loss": 0.80043805, + "learning_rate": 3.395433289506639e-06, + "loss": 0.87841463, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.19482422, + "step": 4600, + "time_per_iteration": 5.317862033843994 + }, + { + "auxiliary_loss_clip": 0.06511359, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06296661, + "balance_loss_mlp": 0.01258843, + "epoch": 0.27662708552532694, + "flos": 17716076755200.0, + "grad_norm": 12.932121146702709, + "language_loss": 0.73461431, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.81249541, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.17907715, + "step": 4601, + "time_per_iteration": 2.5192854404449463 + }, + { + "auxiliary_loss_clip": 0.0650773, + "auxiliary_loss_mlp": 0.01282643, + "balance_loss_clip": 0.06297003, + "balance_loss_mlp": 0.01263676, + "epoch": 0.2766872087779949, + "flos": 21259292106240.0, + "grad_norm": 1.833059055741047, + "language_loss": 0.8051585, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.88306224, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18981934, + "step": 4602, + "time_per_iteration": 2.635265350341797 + }, + { + "auxiliary_loss_clip": 0.06517955, + "auxiliary_loss_mlp": 0.01279657, + "balance_loss_clip": 0.06297721, + "balance_loss_mlp": 0.01259749, + "epoch": 0.2767473320306629, + "flos": 12936728845440.0, + "grad_norm": 2.082735068257359, + "language_loss": 0.7691201, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.8470962, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.19921875, + "step": 4603, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.06506386, + "auxiliary_loss_mlp": 0.01276601, + "balance_loss_clip": 0.06300791, + "balance_loss_mlp": 0.01259017, + "epoch": 0.27680745528333084, + "flos": 15018239646720.0, + "grad_norm": 1.5173997695974415, + "language_loss": 0.81704807, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89487797, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17578125, + "step": 4604, + "time_per_iteration": 2.5022366046905518 + }, + { + "auxiliary_loss_clip": 0.06510165, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 0.06295862, + "balance_loss_mlp": 0.01261367, + "epoch": 0.2768675785359988, + "flos": 22644408424320.0, + "grad_norm": 1.8407701121062605, + "language_loss": 0.70736969, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.78526795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.18310547, + "step": 4605, + "time_per_iteration": 4.068409442901611 + }, + { + "auxiliary_loss_clip": 0.06402105, + "auxiliary_loss_mlp": 0.01269906, + "balance_loss_clip": 0.0629937, + "balance_loss_mlp": 0.01266097, + "epoch": 0.27692770178866677, + "flos": 66150772093440.0, + "grad_norm": 0.7075303746126435, + "language_loss": 0.57218695, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.64890707, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.0380249, + "step": 4606, + "time_per_iteration": 3.269275426864624 + }, + { + "auxiliary_loss_clip": 0.06516754, + "auxiliary_loss_mlp": 0.01286288, + "balance_loss_clip": 0.06299627, + "balance_loss_mlp": 0.01266118, + "epoch": 0.27698782504133473, + "flos": 26471545735680.0, + "grad_norm": 1.9632725808751148, + "language_loss": 0.69427574, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77230614, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.20153809, + "step": 4607, + "time_per_iteration": 2.566908836364746 + }, + { + "auxiliary_loss_clip": 0.06512889, + "auxiliary_loss_mlp": 0.01276778, + "balance_loss_clip": 0.06304939, + "balance_loss_mlp": 0.01258849, + "epoch": 0.2770479482940027, + "flos": 25891878389760.0, + "grad_norm": 1.6636880421304368, + "language_loss": 0.70338356, + "learning_rate": 3.393199595837555e-06, + "loss": 0.78128028, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17919922, + "step": 4608, + "time_per_iteration": 2.709989309310913 + }, + { + "auxiliary_loss_clip": 0.06514756, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 0.06298438, + "balance_loss_mlp": 0.01260781, + "epoch": 0.27710807154667066, + "flos": 22863942921600.0, + "grad_norm": 1.8326330841759049, + "language_loss": 0.73323762, + "learning_rate": 3.392920146281499e-06, + "loss": 0.81118023, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.18725586, + "step": 4609, + "time_per_iteration": 2.530625581741333 + }, + { + "auxiliary_loss_clip": 0.06522895, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.063067, + "balance_loss_mlp": 0.0125749, + "epoch": 0.27716819479933863, + "flos": 17716621806720.0, + "grad_norm": 2.1915868475112714, + "language_loss": 0.84688777, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92488557, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19396973, + "step": 4610, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.06521606, + "auxiliary_loss_mlp": 0.01280928, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260054, + "epoch": 0.2772283180520066, + "flos": 19652125668480.0, + "grad_norm": 1.9738462991775114, + "language_loss": 0.69718874, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.77521408, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.20874023, + "step": 4611, + "time_per_iteration": 2.5499660968780518 + }, + { + "auxiliary_loss_clip": 0.0651576, + "auxiliary_loss_mlp": 0.0127314, + "balance_loss_clip": 0.06309414, + "balance_loss_mlp": 0.01254997, + "epoch": 0.27728844130467456, + "flos": 21038960995200.0, + "grad_norm": 1.8677227151172762, + "language_loss": 0.74507141, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82296044, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18151855, + "step": 4612, + "time_per_iteration": 2.567218065261841 + }, + { + "auxiliary_loss_clip": 0.06522087, + "auxiliary_loss_mlp": 0.01282319, + "balance_loss_clip": 0.06306847, + "balance_loss_mlp": 0.01263067, + "epoch": 0.2773485645573425, + "flos": 18995157331200.0, + "grad_norm": 2.3882423035535063, + "language_loss": 0.67084455, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.19250488, + "step": 4613, + "time_per_iteration": 2.5458126068115234 + }, + { + "auxiliary_loss_clip": 0.06515062, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06304698, + "balance_loss_mlp": 0.0125577, + "epoch": 0.27740868781001055, + "flos": 21474508118400.0, + "grad_norm": 1.6100748666203144, + "language_loss": 0.79936564, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87727129, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19750977, + "step": 4614, + "time_per_iteration": 2.5586962699890137 + }, + { + "auxiliary_loss_clip": 0.06520429, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 0.06305806, + "balance_loss_mlp": 0.01257884, + "epoch": 0.2774688110626785, + "flos": 19833827247360.0, + "grad_norm": 2.249482091575283, + "language_loss": 0.81082475, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.88881981, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21179199, + "step": 4615, + "time_per_iteration": 2.5192136764526367 + }, + { + "auxiliary_loss_clip": 0.0652476, + "auxiliary_loss_mlp": 0.0127518, + "balance_loss_clip": 0.06306368, + "balance_loss_mlp": 0.01256655, + "epoch": 0.2775289343153465, + "flos": 18220916805120.0, + "grad_norm": 2.6879454427381715, + "language_loss": 0.64382082, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.72182024, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.18518066, + "step": 4616, + "time_per_iteration": 2.528766393661499 + }, + { + "auxiliary_loss_clip": 0.06523173, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 0.06308753, + "balance_loss_mlp": 0.0126377, + "epoch": 0.27758905756801444, + "flos": 16478141333760.0, + "grad_norm": 2.0768832102625296, + "language_loss": 0.82857239, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.90664852, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.20678711, + "step": 4617, + "time_per_iteration": 2.5130555629730225 + }, + { + "auxiliary_loss_clip": 0.06522305, + "auxiliary_loss_mlp": 0.01278739, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01260059, + "epoch": 0.2776491808206824, + "flos": 18733219868160.0, + "grad_norm": 2.583119020836192, + "language_loss": 0.77338278, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85139322, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.18676758, + "step": 4618, + "time_per_iteration": 2.5491156578063965 + }, + { + "auxiliary_loss_clip": 0.06524394, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 0.06309742, + "balance_loss_mlp": 0.01260191, + "epoch": 0.27770930407335037, + "flos": 28045742791680.0, + "grad_norm": 1.764934716544716, + "language_loss": 0.85733759, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93535626, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.17297363, + "step": 4619, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.06514929, + "auxiliary_loss_mlp": 0.01285121, + "balance_loss_clip": 0.06308962, + "balance_loss_mlp": 0.01266798, + "epoch": 0.27776942732601834, + "flos": 23556522044160.0, + "grad_norm": 1.4813387132666624, + "language_loss": 0.77092409, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84892452, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18322754, + "step": 4620, + "time_per_iteration": 2.690934658050537 + }, + { + "auxiliary_loss_clip": 0.0651743, + "auxiliary_loss_mlp": 0.01277569, + "balance_loss_clip": 0.06309397, + "balance_loss_mlp": 0.0125821, + "epoch": 0.2778295505786863, + "flos": 23914474686720.0, + "grad_norm": 1.8907472710416175, + "language_loss": 0.78585863, + "learning_rate": 3.389562634707122e-06, + "loss": 0.86380863, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.19360352, + "step": 4621, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.06522836, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.0630835, + "balance_loss_mlp": 0.01259701, + "epoch": 0.27788967383135427, + "flos": 25561276905600.0, + "grad_norm": 2.170367430288875, + "language_loss": 0.88217753, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96019584, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.1932373, + "step": 4622, + "time_per_iteration": 2.6036407947540283 + }, + { + "auxiliary_loss_clip": 0.06512653, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06299745, + "balance_loss_mlp": 0.01254919, + "epoch": 0.27794979708402223, + "flos": 16258103712000.0, + "grad_norm": 2.5896700244630018, + "language_loss": 0.81515396, + "learning_rate": 3.389002311256369e-06, + "loss": 0.89301395, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18432617, + "step": 4623, + "time_per_iteration": 2.539655923843384 + }, + { + "auxiliary_loss_clip": 0.06518189, + "auxiliary_loss_mlp": 0.01278229, + "balance_loss_clip": 0.06306686, + "balance_loss_mlp": 0.01258941, + "epoch": 0.2780099203366902, + "flos": 20673880755840.0, + "grad_norm": 1.9609752985345037, + "language_loss": 0.82099682, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.89896095, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.19274902, + "step": 4624, + "time_per_iteration": 2.5662107467651367 + }, + { + "auxiliary_loss_clip": 0.06512089, + "auxiliary_loss_mlp": 0.01276338, + "balance_loss_clip": 0.06303106, + "balance_loss_mlp": 0.01258004, + "epoch": 0.27807004358935816, + "flos": 17743805256960.0, + "grad_norm": 3.013190567677447, + "language_loss": 0.77269506, + "learning_rate": 3.388441777121191e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.18322754, + "step": 4625, + "time_per_iteration": 2.5685927867889404 + }, + { + "auxiliary_loss_clip": 0.06507699, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06299223, + "balance_loss_mlp": 0.01253658, + "epoch": 0.2781301668420261, + "flos": 16732699637760.0, + "grad_norm": 1.9769276375727096, + "language_loss": 0.70884871, + "learning_rate": 3.388161431073511e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17883301, + "step": 4626, + "time_per_iteration": 2.527975559234619 + }, + { + "auxiliary_loss_clip": 0.06520554, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06304689, + "balance_loss_mlp": 0.01254798, + "epoch": 0.27819029009469415, + "flos": 13849848714240.0, + "grad_norm": 2.4481240639566013, + "language_loss": 0.93016249, + "learning_rate": 3.38788103238661e-06, + "loss": 1.00810015, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.18432617, + "step": 4627, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.06514014, + "auxiliary_loss_mlp": 0.01276758, + "balance_loss_clip": 0.06298277, + "balance_loss_mlp": 0.01258364, + "epoch": 0.2782504133473621, + "flos": 27096634794240.0, + "grad_norm": 1.6603793888564844, + "language_loss": 0.85558021, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93348801, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.1842041, + "step": 4628, + "time_per_iteration": 2.56680965423584 + }, + { + "auxiliary_loss_clip": 0.06511193, + "auxiliary_loss_mlp": 0.01275379, + "balance_loss_clip": 0.06301076, + "balance_loss_mlp": 0.01257569, + "epoch": 0.2783105366000301, + "flos": 21075116832000.0, + "grad_norm": 1.7183700627805243, + "language_loss": 0.79370463, + "learning_rate": 3.387320077137679e-06, + "loss": 0.87157035, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17810059, + "step": 4629, + "time_per_iteration": 2.579024076461792 + }, + { + "auxiliary_loss_clip": 0.06504764, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 0.06300465, + "balance_loss_mlp": 0.01259699, + "epoch": 0.27837065985269804, + "flos": 26508456259200.0, + "grad_norm": 2.4632649346037856, + "language_loss": 0.84664094, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.92446071, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17529297, + "step": 4630, + "time_per_iteration": 2.568190336227417 + }, + { + "auxiliary_loss_clip": 0.06516108, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 0.06302783, + "balance_loss_mlp": 0.01253395, + "epoch": 0.278430783105366, + "flos": 20228271143040.0, + "grad_norm": 1.8872458968592738, + "language_loss": 0.80858278, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8864556, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.17773438, + "step": 4631, + "time_per_iteration": 2.5658912658691406 + }, + { + "auxiliary_loss_clip": 0.06512441, + "auxiliary_loss_mlp": 0.01275522, + "balance_loss_clip": 0.06299636, + "balance_loss_mlp": 0.01256866, + "epoch": 0.278490906358034, + "flos": 25599906437760.0, + "grad_norm": 2.407277572133289, + "language_loss": 0.715128, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.79300761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18652344, + "step": 4632, + "time_per_iteration": 2.620729446411133 + }, + { + "auxiliary_loss_clip": 0.06502309, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296511, + "balance_loss_mlp": 0.01253502, + "epoch": 0.27855102961070194, + "flos": 16175645694720.0, + "grad_norm": 1.8302171024684264, + "language_loss": 0.82394838, + "learning_rate": 3.386197535437145e-06, + "loss": 0.9016794, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17297363, + "step": 4633, + "time_per_iteration": 2.513705015182495 + }, + { + "auxiliary_loss_clip": 0.06511516, + "auxiliary_loss_mlp": 0.01278904, + "balance_loss_clip": 0.06299913, + "balance_loss_mlp": 0.012597, + "epoch": 0.2786111528633699, + "flos": 22933864753920.0, + "grad_norm": 1.5843012688553681, + "language_loss": 0.8872478, + "learning_rate": 3.385916768573529e-06, + "loss": 0.96515197, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.19213867, + "step": 4634, + "time_per_iteration": 2.5471088886260986 + }, + { + "auxiliary_loss_clip": 0.06514788, + "auxiliary_loss_mlp": 0.01276007, + "balance_loss_clip": 0.06301814, + "balance_loss_mlp": 0.01256588, + "epoch": 0.27867127611603787, + "flos": 23410934375040.0, + "grad_norm": 1.5369483246730489, + "language_loss": 0.77466059, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85256851, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.19433594, + "step": 4635, + "time_per_iteration": 3.9016311168670654 + }, + { + "auxiliary_loss_clip": 0.06508552, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06295648, + "balance_loss_mlp": 0.01254859, + "epoch": 0.27873139936870583, + "flos": 19835210839680.0, + "grad_norm": 1.7801998538005617, + "language_loss": 0.66571766, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74353385, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.18188477, + "step": 4636, + "time_per_iteration": 2.5264599323272705 + }, + { + "auxiliary_loss_clip": 0.06519878, + "auxiliary_loss_mlp": 0.01275894, + "balance_loss_clip": 0.06302889, + "balance_loss_mlp": 0.01256392, + "epoch": 0.2787915226213738, + "flos": 17712638737920.0, + "grad_norm": 2.933733922484583, + "language_loss": 0.83255613, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.91051382, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.19506836, + "step": 4637, + "time_per_iteration": 2.5344014167785645 + }, + { + "auxiliary_loss_clip": 0.06505676, + "auxiliary_loss_mlp": 0.01276787, + "balance_loss_clip": 0.06297021, + "balance_loss_mlp": 0.01258918, + "epoch": 0.27885164587404176, + "flos": 22097039627520.0, + "grad_norm": 1.4932909871395708, + "language_loss": 0.76038569, + "learning_rate": 3.384793175684533e-06, + "loss": 0.83821034, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17871094, + "step": 4638, + "time_per_iteration": 2.544187068939209 + }, + { + "auxiliary_loss_clip": 0.06510019, + "auxiliary_loss_mlp": 0.01280274, + "balance_loss_clip": 0.06297282, + "balance_loss_mlp": 0.01262511, + "epoch": 0.27891176912670973, + "flos": 19213601725440.0, + "grad_norm": 2.235877812045319, + "language_loss": 0.72492748, + "learning_rate": 3.38451214615691e-06, + "loss": 0.8028304, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17749023, + "step": 4639, + "time_per_iteration": 4.002680063247681 + }, + { + "auxiliary_loss_clip": 0.06515414, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.01254813, + "epoch": 0.27897189237937775, + "flos": 27607428483840.0, + "grad_norm": 1.8877142592522154, + "language_loss": 0.66217673, + "learning_rate": 3.384231064128447e-06, + "loss": 0.74006808, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.18896484, + "step": 4640, + "time_per_iteration": 4.054874420166016 + }, + { + "auxiliary_loss_clip": 0.0651349, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06301108, + "balance_loss_mlp": 0.01254654, + "epoch": 0.2790320156320457, + "flos": 21184506737280.0, + "grad_norm": 2.077527470737851, + "language_loss": 0.72818768, + "learning_rate": 3.383949929609804e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.1796875, + "step": 4641, + "time_per_iteration": 2.566758155822754 + }, + { + "auxiliary_loss_clip": 0.06517549, + "auxiliary_loss_mlp": 0.01276062, + "balance_loss_clip": 0.06298883, + "balance_loss_mlp": 0.01256488, + "epoch": 0.2790921388847137, + "flos": 22790541144960.0, + "grad_norm": 1.8548696214163785, + "language_loss": 0.75277239, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8307085, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.19567871, + "step": 4642, + "time_per_iteration": 2.5531389713287354 + }, + { + "auxiliary_loss_clip": 0.0651103, + "auxiliary_loss_mlp": 0.01281312, + "balance_loss_clip": 0.06296819, + "balance_loss_mlp": 0.01261631, + "epoch": 0.27915226213738165, + "flos": 23406783598080.0, + "grad_norm": 1.8301300365045747, + "language_loss": 0.85787475, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93579817, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19689941, + "step": 4643, + "time_per_iteration": 2.561692714691162 + }, + { + "auxiliary_loss_clip": 0.06505755, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 0.06292956, + "balance_loss_mlp": 0.01262572, + "epoch": 0.2792123853900496, + "flos": 22754469162240.0, + "grad_norm": 2.128449816262669, + "language_loss": 0.83027583, + "learning_rate": 3.383106211219407e-06, + "loss": 0.9081434, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.1842041, + "step": 4644, + "time_per_iteration": 2.5298962593078613 + }, + { + "auxiliary_loss_clip": 0.06505448, + "auxiliary_loss_mlp": 0.01273805, + "balance_loss_clip": 0.0629155, + "balance_loss_mlp": 0.01256174, + "epoch": 0.2792725086427176, + "flos": 15054772826880.0, + "grad_norm": 1.7497246062339578, + "language_loss": 0.79546082, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.87325335, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.17626953, + "step": 4645, + "time_per_iteration": 3.9172677993774414 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01254208, + "balance_loss_clip": 0.0631457, + "balance_loss_mlp": 0.0125017, + "epoch": 0.27933263189538554, + "flos": 62562805862400.0, + "grad_norm": 0.7707831229317741, + "language_loss": 0.62136066, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.6980933, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04037476, + "step": 4646, + "time_per_iteration": 3.1527390480041504 + }, + { + "auxiliary_loss_clip": 0.06500821, + "auxiliary_loss_mlp": 0.01275319, + "balance_loss_clip": 0.0629313, + "balance_loss_mlp": 0.01257581, + "epoch": 0.2793927551480535, + "flos": 25125268584960.0, + "grad_norm": 1.6018723981737446, + "language_loss": 0.89582062, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.97358203, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17736816, + "step": 4647, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06509704, + "auxiliary_loss_mlp": 0.01277108, + "balance_loss_clip": 0.06292088, + "balance_loss_mlp": 0.01258142, + "epoch": 0.27945287840072147, + "flos": 21330974874240.0, + "grad_norm": 1.6381839497334347, + "language_loss": 0.87525821, + "learning_rate": 3.381980519149988e-06, + "loss": 0.95312631, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.1895752, + "step": 4648, + "time_per_iteration": 2.5516953468322754 + }, + { + "auxiliary_loss_clip": 0.06507549, + "auxiliary_loss_mlp": 0.01274847, + "balance_loss_clip": 0.06291072, + "balance_loss_mlp": 0.01256643, + "epoch": 0.27951300165338944, + "flos": 27457354621440.0, + "grad_norm": 2.652634800411286, + "language_loss": 0.73020303, + "learning_rate": 3.38169896509385e-06, + "loss": 0.80802703, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.18212891, + "step": 4649, + "time_per_iteration": 2.5767719745635986 + }, + { + "auxiliary_loss_clip": 0.06508242, + "auxiliary_loss_mlp": 0.01277361, + "balance_loss_clip": 0.0629622, + "balance_loss_mlp": 0.01259003, + "epoch": 0.2795731249060574, + "flos": 15164456221440.0, + "grad_norm": 2.110277953429804, + "language_loss": 0.81314564, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8910017, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18347168, + "step": 4650, + "time_per_iteration": 2.663588285446167 + }, + { + "auxiliary_loss_clip": 0.06406052, + "auxiliary_loss_mlp": 0.01252705, + "balance_loss_clip": 0.06303374, + "balance_loss_mlp": 0.01248944, + "epoch": 0.27963324815872537, + "flos": 60140951775360.0, + "grad_norm": 0.800089640521837, + "language_loss": 0.5874877, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.66407531, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.03753662, + "step": 4651, + "time_per_iteration": 3.205563545227051 + }, + { + "auxiliary_loss_clip": 0.06513405, + "auxiliary_loss_mlp": 0.01276159, + "balance_loss_clip": 0.06293929, + "balance_loss_mlp": 0.01257205, + "epoch": 0.27969337141139333, + "flos": 21773020688640.0, + "grad_norm": 1.70848848544609, + "language_loss": 0.74928713, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82718277, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.18945312, + "step": 4652, + "time_per_iteration": 2.620284080505371 + }, + { + "auxiliary_loss_clip": 0.06513481, + "auxiliary_loss_mlp": 0.01277362, + "balance_loss_clip": 0.06297033, + "balance_loss_mlp": 0.01259517, + "epoch": 0.27975349466406135, + "flos": 39859559072640.0, + "grad_norm": 2.257859492249039, + "language_loss": 0.81193566, + "learning_rate": 3.380572225034461e-06, + "loss": 0.88984406, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.17834473, + "step": 4653, + "time_per_iteration": 2.6902103424072266 + }, + { + "auxiliary_loss_clip": 0.06505801, + "auxiliary_loss_mlp": 0.01275903, + "balance_loss_clip": 0.06293398, + "balance_loss_mlp": 0.01257939, + "epoch": 0.2798136179167293, + "flos": 21586204010880.0, + "grad_norm": 2.2005279612587647, + "language_loss": 0.78939915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.86721623, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17956543, + "step": 4654, + "time_per_iteration": 2.5862321853637695 + }, + { + "auxiliary_loss_clip": 0.06514826, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06294681, + "balance_loss_mlp": 0.01256457, + "epoch": 0.2798737411693973, + "flos": 21543130212480.0, + "grad_norm": 2.786817882874951, + "language_loss": 0.81491858, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.89283288, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.20153809, + "step": 4655, + "time_per_iteration": 2.5335962772369385 + }, + { + "auxiliary_loss_clip": 0.06503223, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06287771, + "balance_loss_mlp": 0.0125778, + "epoch": 0.27993386442206525, + "flos": 26988586554240.0, + "grad_norm": 1.7572759264995625, + "language_loss": 0.82015479, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.89795309, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4656, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0650457, + "auxiliary_loss_mlp": 0.01280726, + "balance_loss_clip": 0.06291523, + "balance_loss_mlp": 0.01261319, + "epoch": 0.2799939876747332, + "flos": 24356268938880.0, + "grad_norm": 1.602501989097996, + "language_loss": 0.83292782, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.91078079, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.19396973, + "step": 4657, + "time_per_iteration": 2.546698808670044 + }, + { + "auxiliary_loss_clip": 0.06501682, + "auxiliary_loss_mlp": 0.01283943, + "balance_loss_clip": 0.06287715, + "balance_loss_mlp": 0.01265847, + "epoch": 0.2800541109274012, + "flos": 33665479626240.0, + "grad_norm": 2.056920585114217, + "language_loss": 0.64474404, + "learning_rate": 3.379162622133105e-06, + "loss": 0.72260022, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18103027, + "step": 4658, + "time_per_iteration": 2.633352041244507 + }, + { + "auxiliary_loss_clip": 0.0650496, + "auxiliary_loss_mlp": 0.01278289, + "balance_loss_clip": 0.06292152, + "balance_loss_mlp": 0.01258298, + "epoch": 0.28011423418006914, + "flos": 21620515057920.0, + "grad_norm": 1.9139831777919125, + "language_loss": 0.78200769, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.85984015, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.19995117, + "step": 4659, + "time_per_iteration": 2.5146000385284424 + }, + { + "auxiliary_loss_clip": 0.06512548, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 0.06298335, + "balance_loss_mlp": 0.01260582, + "epoch": 0.2801743574327371, + "flos": 23119130131200.0, + "grad_norm": 1.8180566150817747, + "language_loss": 0.79711032, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.87503254, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.1907959, + "step": 4660, + "time_per_iteration": 2.5558273792266846 + }, + { + "auxiliary_loss_clip": 0.06502102, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06293646, + "balance_loss_mlp": 0.01257732, + "epoch": 0.2802344806854051, + "flos": 12646433975040.0, + "grad_norm": 2.0195446081970685, + "language_loss": 0.8127892, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.89057004, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18237305, + "step": 4661, + "time_per_iteration": 2.475562572479248 + }, + { + "auxiliary_loss_clip": 0.06508808, + "auxiliary_loss_mlp": 0.01277709, + "balance_loss_clip": 0.06296618, + "balance_loss_mlp": 0.01258898, + "epoch": 0.28029460393807304, + "flos": 37276772019840.0, + "grad_norm": 2.0240330571158904, + "language_loss": 0.79226935, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87013447, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.18823242, + "step": 4662, + "time_per_iteration": 2.6644277572631836 + }, + { + "auxiliary_loss_clip": 0.06515819, + "auxiliary_loss_mlp": 0.01277387, + "balance_loss_clip": 0.06296565, + "balance_loss_mlp": 0.01258349, + "epoch": 0.280354727190741, + "flos": 20747450240640.0, + "grad_norm": 1.722651872041065, + "language_loss": 0.70744783, + "learning_rate": 3.377751711782227e-06, + "loss": 0.78537989, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.19042969, + "step": 4663, + "time_per_iteration": 2.5365068912506104 + }, + { + "auxiliary_loss_clip": 0.06510712, + "auxiliary_loss_mlp": 0.01280818, + "balance_loss_clip": 0.06293653, + "balance_loss_mlp": 0.01259312, + "epoch": 0.28041485044340897, + "flos": 21477526865280.0, + "grad_norm": 1.8007469711633386, + "language_loss": 0.77919745, + "learning_rate": 3.377469372935791e-06, + "loss": 0.85711277, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.21520996, + "step": 4664, + "time_per_iteration": 2.578552484512329 + }, + { + "auxiliary_loss_clip": 0.06500383, + "auxiliary_loss_mlp": 0.01277041, + "balance_loss_clip": 0.06293675, + "balance_loss_mlp": 0.01259374, + "epoch": 0.28047497369607693, + "flos": 14799669471360.0, + "grad_norm": 1.9758280924180103, + "language_loss": 0.80386382, + "learning_rate": 3.377186981855578e-06, + "loss": 0.88163805, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17675781, + "step": 4665, + "time_per_iteration": 2.5088212490081787 + }, + { + "auxiliary_loss_clip": 0.06506059, + "auxiliary_loss_mlp": 0.01274647, + "balance_loss_clip": 0.06294893, + "balance_loss_mlp": 0.01257397, + "epoch": 0.2805350969487449, + "flos": 23076559457280.0, + "grad_norm": 2.052054159073397, + "language_loss": 0.81109238, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.88889945, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17236328, + "step": 4666, + "time_per_iteration": 2.5765438079833984 + }, + { + "auxiliary_loss_clip": 0.06505027, + "auxiliary_loss_mlp": 0.01282246, + "balance_loss_clip": 0.0629367, + "balance_loss_mlp": 0.01263149, + "epoch": 0.2805952202014129, + "flos": 20485177361280.0, + "grad_norm": 2.1346617464039395, + "language_loss": 0.84940714, + "learning_rate": 3.376622043036658e-06, + "loss": 0.92727995, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19104004, + "step": 4667, + "time_per_iteration": 2.536466360092163 + }, + { + "auxiliary_loss_clip": 0.06510031, + "auxiliary_loss_mlp": 0.01284991, + "balance_loss_clip": 0.0629562, + "balance_loss_mlp": 0.0126581, + "epoch": 0.2806553434540809, + "flos": 27424678728960.0, + "grad_norm": 1.8168022919289022, + "language_loss": 0.80077279, + "learning_rate": 3.376339495319373e-06, + "loss": 0.87872303, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.19177246, + "step": 4668, + "time_per_iteration": 2.620793581008911 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01279574, + "balance_loss_clip": 0.06290744, + "balance_loss_mlp": 0.0126124, + "epoch": 0.28071546670674885, + "flos": 26512187765760.0, + "grad_norm": 1.3575587104794173, + "language_loss": 0.76748574, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.84536183, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.18334961, + "step": 4669, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01281258, + "balance_loss_clip": 0.06298456, + "balance_loss_mlp": 0.01263376, + "epoch": 0.2807755899594168, + "flos": 20564993975040.0, + "grad_norm": 1.8976620486576934, + "language_loss": 0.79953671, + "learning_rate": 3.375774243322725e-06, + "loss": 0.87746012, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17883301, + "step": 4670, + "time_per_iteration": 2.630960702896118 + }, + { + "auxiliary_loss_clip": 0.06512859, + "auxiliary_loss_mlp": 0.0128758, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.0126859, + "epoch": 0.2808357132120848, + "flos": 24319693831680.0, + "grad_norm": 2.1242803821214915, + "language_loss": 0.79548872, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.87349308, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 4671, + "time_per_iteration": 2.5943963527679443 + }, + { + "auxiliary_loss_clip": 0.06499608, + "auxiliary_loss_mlp": 0.01282791, + "balance_loss_clip": 0.06293108, + "balance_loss_mlp": 0.01265124, + "epoch": 0.28089583646475275, + "flos": 26439624529920.0, + "grad_norm": 1.773606658736433, + "language_loss": 0.75789028, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83571434, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17663574, + "step": 4672, + "time_per_iteration": 2.5819919109344482 + }, + { + "auxiliary_loss_clip": 0.06515782, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06299746, + "balance_loss_mlp": 0.01260412, + "epoch": 0.2809559597174207, + "flos": 23118417371520.0, + "grad_norm": 2.723902952009536, + "language_loss": 0.76012361, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.83808959, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20410156, + "step": 4673, + "time_per_iteration": 2.579460859298706 + }, + { + "auxiliary_loss_clip": 0.06510463, + "auxiliary_loss_mlp": 0.01285315, + "balance_loss_clip": 0.06297876, + "balance_loss_mlp": 0.0126704, + "epoch": 0.2810160829700887, + "flos": 20929864579200.0, + "grad_norm": 1.8153863613356214, + "language_loss": 0.72824192, + "learning_rate": 3.374643113381237e-06, + "loss": 0.80619967, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.18261719, + "step": 4674, + "time_per_iteration": 4.0586278438568115 + }, + { + "auxiliary_loss_clip": 0.06522093, + "auxiliary_loss_mlp": 0.01283708, + "balance_loss_clip": 0.06307152, + "balance_loss_mlp": 0.0126405, + "epoch": 0.28107620622275664, + "flos": 14361145528320.0, + "grad_norm": 1.8954321480679195, + "language_loss": 0.77875817, + "learning_rate": 3.374360200552541e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.1965332, + "step": 4675, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.06512761, + "auxiliary_loss_mlp": 0.01288962, + "balance_loss_clip": 0.06296991, + "balance_loss_mlp": 0.01269531, + "epoch": 0.2811363294754246, + "flos": 20924707553280.0, + "grad_norm": 3.9789590396078784, + "language_loss": 0.70705891, + "learning_rate": 3.374077235607968e-06, + "loss": 0.78507614, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.19433594, + "step": 4676, + "time_per_iteration": 2.519028425216675 + }, + { + "auxiliary_loss_clip": 0.06504105, + "auxiliary_loss_mlp": 0.01278874, + "balance_loss_clip": 0.0629884, + "balance_loss_mlp": 0.01260611, + "epoch": 0.28119645272809257, + "flos": 20601107884800.0, + "grad_norm": 1.5779309471284284, + "language_loss": 0.70529211, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.78312188, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18286133, + "step": 4677, + "time_per_iteration": 2.5834195613861084 + }, + { + "auxiliary_loss_clip": 0.06516379, + "auxiliary_loss_mlp": 0.01281791, + "balance_loss_clip": 0.06302937, + "balance_loss_mlp": 0.0126193, + "epoch": 0.28125657598076054, + "flos": 25344383811840.0, + "grad_norm": 1.5021857900224345, + "language_loss": 0.64105308, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.71903479, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1986084, + "step": 4678, + "time_per_iteration": 2.618948221206665 + }, + { + "auxiliary_loss_clip": 0.06517099, + "auxiliary_loss_mlp": 0.01278079, + "balance_loss_clip": 0.06306246, + "balance_loss_mlp": 0.01259947, + "epoch": 0.2813166992334285, + "flos": 24834051319680.0, + "grad_norm": 1.437486997447774, + "language_loss": 0.71167207, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.18139648, + "step": 4679, + "time_per_iteration": 5.466668128967285 + }, + { + "auxiliary_loss_clip": 0.06520079, + "auxiliary_loss_mlp": 0.0127734, + "balance_loss_clip": 0.06306013, + "balance_loss_mlp": 0.01257491, + "epoch": 0.2813768224860965, + "flos": 21766941267840.0, + "grad_norm": 1.8819388160659554, + "language_loss": 0.75122017, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.82919437, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.19848633, + "step": 4680, + "time_per_iteration": 2.5146636962890625 + }, + { + "auxiliary_loss_clip": 0.06519224, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 0.06307293, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2814369457387645, + "flos": 24323760754560.0, + "grad_norm": 2.4475033368931984, + "language_loss": 0.77670574, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.8546586, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18103027, + "step": 4681, + "time_per_iteration": 2.576263189315796 + }, + { + "auxiliary_loss_clip": 0.06524731, + "auxiliary_loss_mlp": 0.01277602, + "balance_loss_clip": 0.06309941, + "balance_loss_mlp": 0.01259208, + "epoch": 0.28149706899143245, + "flos": 18521274165120.0, + "grad_norm": 2.513172937911882, + "language_loss": 0.7420646, + "learning_rate": 3.372378352108146e-06, + "loss": 0.82008791, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18383789, + "step": 4682, + "time_per_iteration": 2.5019047260284424 + }, + { + "auxiliary_loss_clip": 0.06516165, + "auxiliary_loss_mlp": 0.01280522, + "balance_loss_clip": 0.06307921, + "balance_loss_mlp": 0.01262879, + "epoch": 0.2815571922441004, + "flos": 24870165229440.0, + "grad_norm": 1.4634735151261165, + "language_loss": 0.81619561, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89416242, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17626953, + "step": 4683, + "time_per_iteration": 2.6108040809631348 + }, + { + "auxiliary_loss_clip": 0.06511167, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 0.06297079, + "balance_loss_mlp": 0.01258771, + "epoch": 0.2816173154967684, + "flos": 19907774075520.0, + "grad_norm": 1.6126473409715323, + "language_loss": 0.76514447, + "learning_rate": 3.371811641167852e-06, + "loss": 0.8430298, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.18579102, + "step": 4684, + "time_per_iteration": 3.9593515396118164 + }, + { + "auxiliary_loss_clip": 0.06509569, + "auxiliary_loss_mlp": 0.0127644, + "balance_loss_clip": 0.06298888, + "balance_loss_mlp": 0.01257474, + "epoch": 0.28167743874943635, + "flos": 17496709966080.0, + "grad_norm": 1.741664239740996, + "language_loss": 0.76634955, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.84420967, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4685, + "time_per_iteration": 2.533033847808838 + }, + { + "auxiliary_loss_clip": 0.06512235, + "auxiliary_loss_mlp": 0.01277016, + "balance_loss_clip": 0.06303049, + "balance_loss_mlp": 0.01258002, + "epoch": 0.2817375620021043, + "flos": 25309276151040.0, + "grad_norm": 1.5379443905684582, + "language_loss": 0.76075816, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.8386507, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19006348, + "step": 4686, + "time_per_iteration": 2.5632452964782715 + }, + { + "auxiliary_loss_clip": 0.0651376, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 0.06298173, + "balance_loss_mlp": 0.01262705, + "epoch": 0.2817976852547723, + "flos": 18698447623680.0, + "grad_norm": 3.4763910689128945, + "language_loss": 0.63974833, + "learning_rate": 3.370961184640025e-06, + "loss": 0.71771336, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.20043945, + "step": 4687, + "time_per_iteration": 2.5520877838134766 + }, + { + "auxiliary_loss_clip": 0.0651626, + "auxiliary_loss_mlp": 0.01278308, + "balance_loss_clip": 0.06302825, + "balance_loss_mlp": 0.01258889, + "epoch": 0.28185780850744024, + "flos": 22748012398080.0, + "grad_norm": 2.5451270798344208, + "language_loss": 0.76514482, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.84309042, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.1940918, + "step": 4688, + "time_per_iteration": 2.5427582263946533 + }, + { + "auxiliary_loss_clip": 0.06506021, + "auxiliary_loss_mlp": 0.01276039, + "balance_loss_clip": 0.06297493, + "balance_loss_mlp": 0.01258622, + "epoch": 0.2819179317601082, + "flos": 14938297251840.0, + "grad_norm": 2.0673048339937394, + "language_loss": 0.79160047, + "learning_rate": 3.37039395366863e-06, + "loss": 0.86942106, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17407227, + "step": 4689, + "time_per_iteration": 2.514857769012451 + }, + { + "auxiliary_loss_clip": 0.06505655, + "auxiliary_loss_mlp": 0.01279731, + "balance_loss_clip": 0.06295724, + "balance_loss_mlp": 0.0126098, + "epoch": 0.2819780550127762, + "flos": 23151428680320.0, + "grad_norm": 2.0480677905828664, + "language_loss": 0.78403682, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86189067, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.18762207, + "step": 4690, + "time_per_iteration": 2.5567362308502197 + }, + { + "auxiliary_loss_clip": 0.06514366, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06301816, + "balance_loss_mlp": 0.01256981, + "epoch": 0.28203817826544414, + "flos": 21622779118080.0, + "grad_norm": 2.5530247222146976, + "language_loss": 0.87619591, + "learning_rate": 3.369826514835332e-06, + "loss": 0.95409369, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18432617, + "step": 4691, + "time_per_iteration": 2.5987935066223145 + }, + { + "auxiliary_loss_clip": 0.0651565, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.0629878, + "balance_loss_mlp": 0.01258787, + "epoch": 0.2820983015181121, + "flos": 24034010935680.0, + "grad_norm": 1.7719901211447804, + "language_loss": 0.82443225, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.90235984, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.18322754, + "step": 4692, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.06515577, + "auxiliary_loss_mlp": 0.01278887, + "balance_loss_clip": 0.06304249, + "balance_loss_mlp": 0.01259921, + "epoch": 0.2821584247707801, + "flos": 30015725408640.0, + "grad_norm": 1.5203777397001885, + "language_loss": 0.74437934, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82232404, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.1895752, + "step": 4693, + "time_per_iteration": 2.6104559898376465 + }, + { + "auxiliary_loss_clip": 0.06512225, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06298921, + "balance_loss_mlp": 0.01255593, + "epoch": 0.2822185480234481, + "flos": 21403034985600.0, + "grad_norm": 1.7641787467317929, + "language_loss": 0.77641487, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.85428035, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 4694, + "time_per_iteration": 2.5619184970855713 + }, + { + "auxiliary_loss_clip": 0.06513312, + "auxiliary_loss_mlp": 0.01274888, + "balance_loss_clip": 0.0630666, + "balance_loss_mlp": 0.01255898, + "epoch": 0.28227867127611606, + "flos": 27459996024960.0, + "grad_norm": 2.064814820064932, + "language_loss": 0.67270994, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.75059193, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.18969727, + "step": 4695, + "time_per_iteration": 2.5849459171295166 + }, + { + "auxiliary_loss_clip": 0.06524754, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06312457, + "balance_loss_mlp": 0.01255914, + "epoch": 0.282338794528784, + "flos": 22599028638720.0, + "grad_norm": 2.3022925444863747, + "language_loss": 0.75992346, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.83794391, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.21362305, + "step": 4696, + "time_per_iteration": 2.5599312782287598 + }, + { + "auxiliary_loss_clip": 0.06528555, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06319815, + "balance_loss_mlp": 0.01257915, + "epoch": 0.282398917781452, + "flos": 42020592998400.0, + "grad_norm": 1.6923608864022255, + "language_loss": 0.62607121, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70412022, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.1842041, + "step": 4697, + "time_per_iteration": 2.719783067703247 + }, + { + "auxiliary_loss_clip": 0.0651894, + "auxiliary_loss_mlp": 0.01278397, + "balance_loss_clip": 0.0631054, + "balance_loss_mlp": 0.01260564, + "epoch": 0.28245904103411995, + "flos": 23231916126720.0, + "grad_norm": 1.330125700327103, + "language_loss": 0.73835146, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.81632483, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.17834473, + "step": 4698, + "time_per_iteration": 2.671154260635376 + }, + { + "auxiliary_loss_clip": 0.06520095, + "auxiliary_loss_mlp": 0.01274177, + "balance_loss_clip": 0.06314629, + "balance_loss_mlp": 0.01255699, + "epoch": 0.2825191642867879, + "flos": 25381713605760.0, + "grad_norm": 1.8806904568543696, + "language_loss": 0.75498992, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.83293265, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.18481445, + "step": 4699, + "time_per_iteration": 2.749073028564453 + }, + { + "auxiliary_loss_clip": 0.06532586, + "auxiliary_loss_mlp": 0.0127858, + "balance_loss_clip": 0.06318063, + "balance_loss_mlp": 0.01257969, + "epoch": 0.2825792875394559, + "flos": 17242277443200.0, + "grad_norm": 2.5468251061801697, + "language_loss": 0.80103695, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.87914866, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.20617676, + "step": 4700, + "time_per_iteration": 2.539794683456421 + }, + { + "auxiliary_loss_clip": 0.06516679, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06314512, + "balance_loss_mlp": 0.01257006, + "epoch": 0.28263941079212385, + "flos": 26731177211520.0, + "grad_norm": 2.1068022199140213, + "language_loss": 0.8243857, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.90229392, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17114258, + "step": 4701, + "time_per_iteration": 2.5763485431671143 + }, + { + "auxiliary_loss_clip": 0.06520683, + "auxiliary_loss_mlp": 0.01274057, + "balance_loss_clip": 0.06312392, + "balance_loss_mlp": 0.01256116, + "epoch": 0.2826995340447918, + "flos": 25928411569920.0, + "grad_norm": 2.2990609650841276, + "language_loss": 0.73153478, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.80948216, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.17956543, + "step": 4702, + "time_per_iteration": 2.5968289375305176 + }, + { + "auxiliary_loss_clip": 0.06520355, + "auxiliary_loss_mlp": 0.01275823, + "balance_loss_clip": 0.06316096, + "balance_loss_mlp": 0.01258848, + "epoch": 0.2827596572974598, + "flos": 22385783197440.0, + "grad_norm": 1.6603391807745085, + "language_loss": 0.78883457, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86679637, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1697998, + "step": 4703, + "time_per_iteration": 2.56088924407959 + }, + { + "auxiliary_loss_clip": 0.06518066, + "auxiliary_loss_mlp": 0.01281519, + "balance_loss_clip": 0.06307587, + "balance_loss_mlp": 0.01261885, + "epoch": 0.28281978055012774, + "flos": 33555544669440.0, + "grad_norm": 1.530922589206002, + "language_loss": 0.69937778, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.19628906, + "step": 4704, + "time_per_iteration": 2.725234031677246 + }, + { + "auxiliary_loss_clip": 0.0652602, + "auxiliary_loss_mlp": 0.01283133, + "balance_loss_clip": 0.06319317, + "balance_loss_mlp": 0.01264119, + "epoch": 0.2828799038027957, + "flos": 23447635263360.0, + "grad_norm": 1.9265232828394878, + "language_loss": 0.70927215, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.78736377, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.19006348, + "step": 4705, + "time_per_iteration": 2.5391383171081543 + }, + { + "auxiliary_loss_clip": 0.06482799, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06378852, + "balance_loss_mlp": 0.01263947, + "epoch": 0.2829400270554637, + "flos": 69892055297280.0, + "grad_norm": 0.9159756060868983, + "language_loss": 0.59201139, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.66952819, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04928589, + "step": 4706, + "time_per_iteration": 3.219618797302246 + }, + { + "auxiliary_loss_clip": 0.06512764, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 0.06312177, + "balance_loss_mlp": 0.01260547, + "epoch": 0.2830001503081317, + "flos": 24795715276800.0, + "grad_norm": 1.373077415158703, + "language_loss": 0.82380199, + "learning_rate": 3.365279531475407e-06, + "loss": 0.90170658, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.17138672, + "step": 4707, + "time_per_iteration": 2.5680840015411377 + }, + { + "auxiliary_loss_clip": 0.06518079, + "auxiliary_loss_mlp": 0.01276357, + "balance_loss_clip": 0.06304221, + "balance_loss_mlp": 0.01257391, + "epoch": 0.28306027356079966, + "flos": 27676218286080.0, + "grad_norm": 1.5569970524845527, + "language_loss": 0.81077999, + "learning_rate": 3.36499490449902e-06, + "loss": 0.88872433, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.18969727, + "step": 4708, + "time_per_iteration": 2.643389940261841 + }, + { + "auxiliary_loss_clip": 0.06443536, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 0.06339511, + "balance_loss_mlp": 0.01264025, + "epoch": 0.2831203968134676, + "flos": 60543837734400.0, + "grad_norm": 0.8586282544888121, + "language_loss": 0.62812036, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.7052421, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.04608154, + "step": 4709, + "time_per_iteration": 3.0554397106170654 + }, + { + "auxiliary_loss_clip": 0.06507774, + "auxiliary_loss_mlp": 0.01270408, + "balance_loss_clip": 0.06301016, + "balance_loss_mlp": 0.01253015, + "epoch": 0.2831805200661356, + "flos": 22061386915200.0, + "grad_norm": 1.4201642822404892, + "language_loss": 0.74412584, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.82190764, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1739502, + "step": 4710, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.06514937, + "auxiliary_loss_mlp": 0.01275331, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01256627, + "epoch": 0.28324064331880355, + "flos": 22607120557440.0, + "grad_norm": 1.9767009095982746, + "language_loss": 0.8018595, + "learning_rate": 3.364140713048579e-06, + "loss": 0.87976217, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18713379, + "step": 4711, + "time_per_iteration": 2.610027313232422 + }, + { + "auxiliary_loss_clip": 0.06509729, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06300638, + "balance_loss_mlp": 0.01260385, + "epoch": 0.2833007665714715, + "flos": 30411133626240.0, + "grad_norm": 1.982526263820073, + "language_loss": 0.70604694, + "learning_rate": 3.363855879093996e-06, + "loss": 0.78392917, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.18103027, + "step": 4712, + "time_per_iteration": 2.602795124053955 + }, + { + "auxiliary_loss_clip": 0.06508194, + "auxiliary_loss_mlp": 0.01282495, + "balance_loss_clip": 0.06299947, + "balance_loss_mlp": 0.01262992, + "epoch": 0.2833608898241395, + "flos": 23556144700800.0, + "grad_norm": 1.7823239687069516, + "language_loss": 0.8193841, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.89729095, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19494629, + "step": 4713, + "time_per_iteration": 2.6088523864746094 + }, + { + "auxiliary_loss_clip": 0.06512519, + "auxiliary_loss_mlp": 0.01275048, + "balance_loss_clip": 0.06304006, + "balance_loss_mlp": 0.01255236, + "epoch": 0.28342101307680745, + "flos": 20272980096000.0, + "grad_norm": 2.6212370689858493, + "language_loss": 0.75431275, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.83218849, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19799805, + "step": 4714, + "time_per_iteration": 3.986696243286133 + }, + { + "auxiliary_loss_clip": 0.06505996, + "auxiliary_loss_mlp": 0.01276776, + "balance_loss_clip": 0.06297115, + "balance_loss_mlp": 0.01259324, + "epoch": 0.2834811363294754, + "flos": 30854982303360.0, + "grad_norm": 1.3268888753773178, + "language_loss": 0.78198218, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.85980994, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.17468262, + "step": 4715, + "time_per_iteration": 2.652470111846924 + }, + { + "auxiliary_loss_clip": 0.06506517, + "auxiliary_loss_mlp": 0.01277278, + "balance_loss_clip": 0.06300199, + "balance_loss_mlp": 0.01260088, + "epoch": 0.2835412595821434, + "flos": 22717642492800.0, + "grad_norm": 1.6173599581374518, + "language_loss": 0.74551272, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.82335067, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17175293, + "step": 4716, + "time_per_iteration": 2.597083806991577 + }, + { + "auxiliary_loss_clip": 0.06516325, + "auxiliary_loss_mlp": 0.01281584, + "balance_loss_clip": 0.06298752, + "balance_loss_mlp": 0.0126189, + "epoch": 0.28360138283481134, + "flos": 18083630689920.0, + "grad_norm": 2.1150039301458112, + "language_loss": 0.75477433, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.83275348, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.19702148, + "step": 4717, + "time_per_iteration": 2.5648136138916016 + }, + { + "auxiliary_loss_clip": 0.06514253, + "auxiliary_loss_mlp": 0.01277656, + "balance_loss_clip": 0.06302426, + "balance_loss_mlp": 0.01258606, + "epoch": 0.2836615060874793, + "flos": 17859987342720.0, + "grad_norm": 1.540618458402471, + "language_loss": 0.67445159, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75237072, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19042969, + "step": 4718, + "time_per_iteration": 3.962265968322754 + }, + { + "auxiliary_loss_clip": 0.06507722, + "auxiliary_loss_mlp": 0.01278787, + "balance_loss_clip": 0.06295013, + "balance_loss_mlp": 0.01258772, + "epoch": 0.2837216293401473, + "flos": 25747590458880.0, + "grad_norm": 1.8038295919740834, + "language_loss": 0.73164374, + "learning_rate": 3.361860593925566e-06, + "loss": 0.8095088, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20007324, + "step": 4719, + "time_per_iteration": 4.095008134841919 + }, + { + "auxiliary_loss_clip": 0.0650832, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 0.06301163, + "balance_loss_mlp": 0.01259386, + "epoch": 0.2837817525928153, + "flos": 20929906506240.0, + "grad_norm": 1.8981156672354917, + "language_loss": 0.80600828, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.88386989, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18444824, + "step": 4720, + "time_per_iteration": 2.53869366645813 + }, + { + "auxiliary_loss_clip": 0.06515027, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06304276, + "balance_loss_mlp": 0.01261687, + "epoch": 0.28384187584548326, + "flos": 18922719876480.0, + "grad_norm": 1.7940545446838874, + "language_loss": 0.7966662, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.87462288, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18945312, + "step": 4721, + "time_per_iteration": 2.5736734867095947 + }, + { + "auxiliary_loss_clip": 0.06507237, + "auxiliary_loss_mlp": 0.01272866, + "balance_loss_clip": 0.06298702, + "balance_loss_mlp": 0.01254996, + "epoch": 0.2839019990981512, + "flos": 27351235025280.0, + "grad_norm": 1.8504915753410351, + "language_loss": 0.83238685, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.17871094, + "step": 4722, + "time_per_iteration": 2.5798823833465576 + }, + { + "auxiliary_loss_clip": 0.06511718, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06302544, + "balance_loss_mlp": 0.01255547, + "epoch": 0.2839621223508192, + "flos": 18120247724160.0, + "grad_norm": 1.9056364243243222, + "language_loss": 0.71157932, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18225098, + "step": 4723, + "time_per_iteration": 2.5472381114959717 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01279225, + "balance_loss_clip": 0.06299602, + "balance_loss_mlp": 0.01259937, + "epoch": 0.28402224560348716, + "flos": 26365384212480.0, + "grad_norm": 1.5487216964387416, + "language_loss": 0.7882036, + "learning_rate": 3.360433840760998e-06, + "loss": 0.86608005, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.19299316, + "step": 4724, + "time_per_iteration": 4.039300203323364 + }, + { + "auxiliary_loss_clip": 0.0650482, + "auxiliary_loss_mlp": 0.01275588, + "balance_loss_clip": 0.06294143, + "balance_loss_mlp": 0.0125754, + "epoch": 0.2840823688561551, + "flos": 24067609223040.0, + "grad_norm": 1.5786087270385247, + "language_loss": 0.92781484, + "learning_rate": 3.36014833532143e-06, + "loss": 1.00561893, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18066406, + "step": 4725, + "time_per_iteration": 2.5839502811431885 + }, + { + "auxiliary_loss_clip": 0.06504668, + "auxiliary_loss_mlp": 0.01283756, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01263097, + "epoch": 0.2841424921088231, + "flos": 29467392289920.0, + "grad_norm": 1.5513315701194426, + "language_loss": 0.89446843, + "learning_rate": 3.3598627783049e-06, + "loss": 0.97235262, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20666504, + "step": 4726, + "time_per_iteration": 2.617002010345459 + }, + { + "auxiliary_loss_clip": 0.06507252, + "auxiliary_loss_mlp": 0.01284138, + "balance_loss_clip": 0.0629679, + "balance_loss_mlp": 0.01264409, + "epoch": 0.28420261536149105, + "flos": 48110439565440.0, + "grad_norm": 2.259876030173266, + "language_loss": 0.79337573, + "learning_rate": 3.359577169722238e-06, + "loss": 0.87128961, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19763184, + "step": 4727, + "time_per_iteration": 2.774508476257324 + }, + { + "auxiliary_loss_clip": 0.06499238, + "auxiliary_loss_mlp": 0.01275292, + "balance_loss_clip": 0.06294493, + "balance_loss_mlp": 0.01257483, + "epoch": 0.284262738614159, + "flos": 25673224360320.0, + "grad_norm": 2.051338722061539, + "language_loss": 0.67073631, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.74848163, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17810059, + "step": 4728, + "time_per_iteration": 2.614614725112915 + }, + { + "auxiliary_loss_clip": 0.06494898, + "auxiliary_loss_mlp": 0.01274263, + "balance_loss_clip": 0.06287634, + "balance_loss_mlp": 0.01255702, + "epoch": 0.284322861866827, + "flos": 19725066247680.0, + "grad_norm": 2.0236031999203132, + "language_loss": 0.76682353, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84451514, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.18554688, + "step": 4729, + "time_per_iteration": 2.542400360107422 + }, + { + "auxiliary_loss_clip": 0.06505589, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06292985, + "balance_loss_mlp": 0.0125414, + "epoch": 0.28438298511949495, + "flos": 23922105408000.0, + "grad_norm": 1.7626205541686495, + "language_loss": 0.67443657, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.75222254, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.1887207, + "step": 4730, + "time_per_iteration": 2.6005139350891113 + }, + { + "auxiliary_loss_clip": 0.06503962, + "auxiliary_loss_mlp": 0.01275972, + "balance_loss_clip": 0.06292562, + "balance_loss_mlp": 0.01256219, + "epoch": 0.2844431083721629, + "flos": 26074460436480.0, + "grad_norm": 1.9951841893982447, + "language_loss": 0.74777246, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.82557184, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.1973877, + "step": 4731, + "time_per_iteration": 2.571259021759033 + }, + { + "auxiliary_loss_clip": 0.06501718, + "auxiliary_loss_mlp": 0.01275528, + "balance_loss_clip": 0.06291741, + "balance_loss_mlp": 0.01257384, + "epoch": 0.2845032316248309, + "flos": 25817260728960.0, + "grad_norm": 1.5216025808612688, + "language_loss": 0.8435545, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.92132688, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18139648, + "step": 4732, + "time_per_iteration": 2.604717254638672 + }, + { + "auxiliary_loss_clip": 0.06508272, + "auxiliary_loss_mlp": 0.01277146, + "balance_loss_clip": 0.06295733, + "balance_loss_mlp": 0.01256082, + "epoch": 0.2845633548774989, + "flos": 19828418659200.0, + "grad_norm": 1.722472955192697, + "language_loss": 0.79522747, + "learning_rate": 3.357862435944109e-06, + "loss": 0.87308168, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.21069336, + "step": 4733, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06511072, + "auxiliary_loss_mlp": 0.01275761, + "balance_loss_clip": 0.06296709, + "balance_loss_mlp": 0.01256878, + "epoch": 0.28462347813016686, + "flos": 23189093890560.0, + "grad_norm": 2.336729990473161, + "language_loss": 0.72093451, + "learning_rate": 3.357576466701875e-06, + "loss": 0.79880273, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.1887207, + "step": 4734, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.06501292, + "auxiliary_loss_mlp": 0.01274129, + "balance_loss_clip": 0.06292972, + "balance_loss_mlp": 0.01256283, + "epoch": 0.2846836013828348, + "flos": 18666316782720.0, + "grad_norm": 1.7839237241912007, + "language_loss": 0.74739748, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.1784668, + "step": 4735, + "time_per_iteration": 2.5192623138427734 + }, + { + "auxiliary_loss_clip": 0.06500865, + "auxiliary_loss_mlp": 0.01274478, + "balance_loss_clip": 0.06291883, + "balance_loss_mlp": 0.01256096, + "epoch": 0.2847437246355028, + "flos": 14178731189760.0, + "grad_norm": 1.8549790130823454, + "language_loss": 0.81047934, + "learning_rate": 3.357004373789946e-06, + "loss": 0.88823277, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18383789, + "step": 4736, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.06503595, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 0.06293313, + "balance_loss_mlp": 0.01256285, + "epoch": 0.28480384788817076, + "flos": 29286068054400.0, + "grad_norm": 3.1700593253391895, + "language_loss": 0.60580242, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.68358433, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18310547, + "step": 4737, + "time_per_iteration": 2.591672897338867 + }, + { + "auxiliary_loss_clip": 0.06501776, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06295541, + "balance_loss_mlp": 0.01256855, + "epoch": 0.2848639711408387, + "flos": 22607875244160.0, + "grad_norm": 1.8212806326874897, + "language_loss": 0.86685491, + "learning_rate": 3.356432075047052e-06, + "loss": 0.94461757, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.1763916, + "step": 4738, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.06504256, + "auxiliary_loss_mlp": 0.01280924, + "balance_loss_clip": 0.06291994, + "balance_loss_mlp": 0.01260575, + "epoch": 0.2849240943935067, + "flos": 17604632424960.0, + "grad_norm": 2.187311269731562, + "language_loss": 0.90640962, + "learning_rate": 3.356145848516118e-06, + "loss": 0.98426139, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.20336914, + "step": 4739, + "time_per_iteration": 2.491391897201538 + }, + { + "auxiliary_loss_clip": 0.06502014, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06294325, + "balance_loss_mlp": 0.01254363, + "epoch": 0.28498421764617465, + "flos": 24869368615680.0, + "grad_norm": 1.2838984451042732, + "language_loss": 0.72652215, + "learning_rate": 3.355859570559998e-06, + "loss": 0.80426115, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.17529297, + "step": 4740, + "time_per_iteration": 2.628420352935791 + }, + { + "auxiliary_loss_clip": 0.06497836, + "auxiliary_loss_mlp": 0.01273023, + "balance_loss_clip": 0.06293581, + "balance_loss_mlp": 0.01254069, + "epoch": 0.2850443408988426, + "flos": 22788947917440.0, + "grad_norm": 1.7372555552312992, + "language_loss": 0.77982342, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.85753202, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1895752, + "step": 4741, + "time_per_iteration": 2.5205776691436768 + }, + { + "auxiliary_loss_clip": 0.06505083, + "auxiliary_loss_mlp": 0.01279172, + "balance_loss_clip": 0.06290049, + "balance_loss_mlp": 0.01260278, + "epoch": 0.2851044641515106, + "flos": 18850114713600.0, + "grad_norm": 2.3624012556043246, + "language_loss": 0.7702412, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.84808373, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18896484, + "step": 4742, + "time_per_iteration": 2.5852768421173096 + }, + { + "auxiliary_loss_clip": 0.06507465, + "auxiliary_loss_mlp": 0.01281198, + "balance_loss_clip": 0.06292667, + "balance_loss_mlp": 0.01260252, + "epoch": 0.28516458740417855, + "flos": 18886564039680.0, + "grad_norm": 2.066213096861692, + "language_loss": 0.57976151, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65764809, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.20959473, + "step": 4743, + "time_per_iteration": 2.562298059463501 + }, + { + "auxiliary_loss_clip": 0.06507643, + "auxiliary_loss_mlp": 0.01278324, + "balance_loss_clip": 0.06297275, + "balance_loss_mlp": 0.01259787, + "epoch": 0.2852247106568465, + "flos": 25306592820480.0, + "grad_norm": 1.602300087654556, + "language_loss": 0.75013685, + "learning_rate": 3.354713944700797e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1854248, + "step": 4744, + "time_per_iteration": 2.610302209854126 + }, + { + "auxiliary_loss_clip": 0.06500175, + "auxiliary_loss_mlp": 0.01276557, + "balance_loss_clip": 0.06292172, + "balance_loss_mlp": 0.01258794, + "epoch": 0.2852848339095145, + "flos": 11660080037760.0, + "grad_norm": 2.2644691376510844, + "language_loss": 0.78515136, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86291873, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.17749023, + "step": 4745, + "time_per_iteration": 2.5170419216156006 + }, + { + "auxiliary_loss_clip": 0.06491117, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 0.06290857, + "balance_loss_mlp": 0.01254836, + "epoch": 0.2853449571621825, + "flos": 12938280145920.0, + "grad_norm": 1.7221704990089022, + "language_loss": 0.83220983, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.9098506, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18127441, + "step": 4746, + "time_per_iteration": 2.6257071495056152 + }, + { + "auxiliary_loss_clip": 0.06514393, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.06295399, + "balance_loss_mlp": 0.01257943, + "epoch": 0.28540508041485046, + "flos": 20016660856320.0, + "grad_norm": 1.8084134515670756, + "language_loss": 0.80507863, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.88300824, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.20617676, + "step": 4747, + "time_per_iteration": 2.5699074268341064 + }, + { + "auxiliary_loss_clip": 0.06375369, + "auxiliary_loss_mlp": 0.0127529, + "balance_loss_clip": 0.0627491, + "balance_loss_mlp": 0.01269043, + "epoch": 0.28546520366751843, + "flos": 68160264710400.0, + "grad_norm": 0.7514031277524565, + "language_loss": 0.60153103, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.67803764, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.06237793, + "step": 4748, + "time_per_iteration": 3.1155877113342285 + }, + { + "auxiliary_loss_clip": 0.06492989, + "auxiliary_loss_mlp": 0.01272874, + "balance_loss_clip": 0.06285426, + "balance_loss_mlp": 0.01255791, + "epoch": 0.2855253269201864, + "flos": 13254961852800.0, + "grad_norm": 2.1744647780903352, + "language_loss": 0.80643219, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.88409078, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17089844, + "step": 4749, + "time_per_iteration": 2.5422439575195312 + }, + { + "auxiliary_loss_clip": 0.06506198, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 0.06295547, + "balance_loss_mlp": 0.0126011, + "epoch": 0.28558545017285436, + "flos": 28628345030400.0, + "grad_norm": 1.9900791940744995, + "language_loss": 0.70889151, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78674042, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.18579102, + "step": 4750, + "time_per_iteration": 2.6223177909851074 + }, + { + "auxiliary_loss_clip": 0.06498066, + "auxiliary_loss_mlp": 0.01278692, + "balance_loss_clip": 0.06294224, + "balance_loss_mlp": 0.01261562, + "epoch": 0.2856455734255223, + "flos": 34138901594880.0, + "grad_norm": 1.523200352045364, + "language_loss": 0.82438904, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.90215659, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17138672, + "step": 4751, + "time_per_iteration": 2.710822582244873 + }, + { + "auxiliary_loss_clip": 0.06498431, + "auxiliary_loss_mlp": 0.01276615, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01258424, + "epoch": 0.2857056966781903, + "flos": 39795590880000.0, + "grad_norm": 1.6833478059847915, + "language_loss": 0.80598158, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88373208, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.1817627, + "step": 4752, + "time_per_iteration": 2.685669422149658 + }, + { + "auxiliary_loss_clip": 0.0649987, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06292621, + "balance_loss_mlp": 0.01254223, + "epoch": 0.28576581993085826, + "flos": 21878846795520.0, + "grad_norm": 1.793038640961372, + "language_loss": 0.79062063, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86834359, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.18200684, + "step": 4753, + "time_per_iteration": 2.612639904022217 + }, + { + "auxiliary_loss_clip": 0.06511062, + "auxiliary_loss_mlp": 0.01278051, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01257523, + "epoch": 0.2858259431835262, + "flos": 19096455317760.0, + "grad_norm": 2.5775982542053963, + "language_loss": 0.89774185, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.97563303, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.20532227, + "step": 4754, + "time_per_iteration": 3.914802312850952 + }, + { + "auxiliary_loss_clip": 0.06494384, + "auxiliary_loss_mlp": 0.01278048, + "balance_loss_clip": 0.06293342, + "balance_loss_mlp": 0.01259988, + "epoch": 0.2858860664361942, + "flos": 20339673546240.0, + "grad_norm": 1.9874166310668562, + "language_loss": 0.82672411, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.90444839, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18066406, + "step": 4755, + "time_per_iteration": 2.673158884048462 + }, + { + "auxiliary_loss_clip": 0.06498866, + "auxiliary_loss_mlp": 0.01274185, + "balance_loss_clip": 0.06291682, + "balance_loss_mlp": 0.0125721, + "epoch": 0.28594618968886215, + "flos": 24468551809920.0, + "grad_norm": 1.6562500913369433, + "language_loss": 0.83843541, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91616589, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.16992188, + "step": 4756, + "time_per_iteration": 2.6029391288757324 + }, + { + "auxiliary_loss_clip": 0.06377822, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01256002, + "epoch": 0.2860063129415301, + "flos": 71676170830080.0, + "grad_norm": 1.4612509113917642, + "language_loss": 0.6086607, + "learning_rate": 3.350984987779142e-06, + "loss": 0.68506116, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.06207275, + "step": 4757, + "time_per_iteration": 3.326833963394165 + }, + { + "auxiliary_loss_clip": 0.0650306, + "auxiliary_loss_mlp": 0.01277184, + "balance_loss_clip": 0.06298901, + "balance_loss_mlp": 0.01260459, + "epoch": 0.2860664361941981, + "flos": 20564993975040.0, + "grad_norm": 2.5468639815388996, + "language_loss": 0.66759324, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.74539566, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1673584, + "step": 4758, + "time_per_iteration": 5.454218626022339 + }, + { + "auxiliary_loss_clip": 0.06503905, + "auxiliary_loss_mlp": 0.01277556, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01258817, + "epoch": 0.2861265594468661, + "flos": 36005992997760.0, + "grad_norm": 1.4420872105733484, + "language_loss": 0.63405287, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.71186751, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.1875, + "step": 4759, + "time_per_iteration": 2.745704174041748 + }, + { + "auxiliary_loss_clip": 0.06510226, + "auxiliary_loss_mlp": 0.01276918, + "balance_loss_clip": 0.06302258, + "balance_loss_mlp": 0.01257892, + "epoch": 0.28618668269953407, + "flos": 20053571379840.0, + "grad_norm": 2.14199936751817, + "language_loss": 0.74684435, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82471573, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19030762, + "step": 4760, + "time_per_iteration": 2.541759490966797 + }, + { + "auxiliary_loss_clip": 0.06496474, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06297328, + "balance_loss_mlp": 0.01260482, + "epoch": 0.28624680595220203, + "flos": 24978632739840.0, + "grad_norm": 1.8333731861449165, + "language_loss": 0.72652757, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80425525, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.15795898, + "step": 4761, + "time_per_iteration": 2.57940673828125 + }, + { + "auxiliary_loss_clip": 0.06509258, + "auxiliary_loss_mlp": 0.01273154, + "balance_loss_clip": 0.06299996, + "balance_loss_mlp": 0.01256095, + "epoch": 0.28630692920487, + "flos": 22498862682240.0, + "grad_norm": 1.9183655494362113, + "language_loss": 0.74669504, + "learning_rate": 3.349548466945793e-06, + "loss": 0.82451922, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1706543, + "step": 4762, + "time_per_iteration": 2.5321590900421143 + }, + { + "auxiliary_loss_clip": 0.06505883, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 0.06301434, + "balance_loss_mlp": 0.0125694, + "epoch": 0.28636705245753796, + "flos": 21255979870080.0, + "grad_norm": 2.6303759088840413, + "language_loss": 0.76297629, + "learning_rate": 3.349261009210496e-06, + "loss": 0.84077883, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17443848, + "step": 4763, + "time_per_iteration": 3.979782819747925 + }, + { + "auxiliary_loss_clip": 0.06506684, + "auxiliary_loss_mlp": 0.01275654, + "balance_loss_clip": 0.06298703, + "balance_loss_mlp": 0.012572, + "epoch": 0.28642717571020593, + "flos": 24102339540480.0, + "grad_norm": 1.7484925103151405, + "language_loss": 0.77499843, + "learning_rate": 3.348973500311086e-06, + "loss": 0.85282177, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.18444824, + "step": 4764, + "time_per_iteration": 2.6036336421966553 + }, + { + "auxiliary_loss_clip": 0.0651267, + "auxiliary_loss_mlp": 0.01277486, + "balance_loss_clip": 0.06302905, + "balance_loss_mlp": 0.01257829, + "epoch": 0.2864872989628739, + "flos": 22607959098240.0, + "grad_norm": 5.154577786286556, + "language_loss": 0.71671587, + "learning_rate": 3.348685940258466e-06, + "loss": 0.79461741, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1965332, + "step": 4765, + "time_per_iteration": 2.5488131046295166 + }, + { + "auxiliary_loss_clip": 0.0651048, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 0.06304644, + "balance_loss_mlp": 0.01255684, + "epoch": 0.28654742221554186, + "flos": 32753449860480.0, + "grad_norm": 1.504395922922802, + "language_loss": 0.7630865, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.84091872, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17053223, + "step": 4766, + "time_per_iteration": 2.659499406814575 + }, + { + "auxiliary_loss_clip": 0.0650377, + "auxiliary_loss_mlp": 0.01271145, + "balance_loss_clip": 0.0630042, + "balance_loss_mlp": 0.01254277, + "epoch": 0.2866075454682098, + "flos": 26989257386880.0, + "grad_norm": 2.0841406955827075, + "language_loss": 0.78443938, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86218858, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.16870117, + "step": 4767, + "time_per_iteration": 2.5891125202178955 + }, + { + "auxiliary_loss_clip": 0.06511022, + "auxiliary_loss_mlp": 0.01279425, + "balance_loss_clip": 0.06305116, + "balance_loss_mlp": 0.01261746, + "epoch": 0.2866676687208778, + "flos": 23259812336640.0, + "grad_norm": 2.0448044221544737, + "language_loss": 0.65430236, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.73220682, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17675781, + "step": 4768, + "time_per_iteration": 2.572230815887451 + }, + { + "auxiliary_loss_clip": 0.0651636, + "auxiliary_loss_mlp": 0.01271508, + "balance_loss_clip": 0.06302489, + "balance_loss_mlp": 0.01253782, + "epoch": 0.28672779197354575, + "flos": 21586120156800.0, + "grad_norm": 1.6016626643500549, + "language_loss": 0.71173406, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.78961271, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.17724609, + "step": 4769, + "time_per_iteration": 2.5180304050445557 + }, + { + "auxiliary_loss_clip": 0.06513099, + "auxiliary_loss_mlp": 0.01273812, + "balance_loss_clip": 0.06304821, + "balance_loss_mlp": 0.01256562, + "epoch": 0.2867879152262137, + "flos": 19871785946880.0, + "grad_norm": 1.7128041826885096, + "language_loss": 0.75347042, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83133948, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.17248535, + "step": 4770, + "time_per_iteration": 2.575993537902832 + }, + { + "auxiliary_loss_clip": 0.06514675, + "auxiliary_loss_mlp": 0.01275884, + "balance_loss_clip": 0.06304142, + "balance_loss_mlp": 0.01257967, + "epoch": 0.2868480384788817, + "flos": 28219687868160.0, + "grad_norm": 4.606069071133779, + "language_loss": 0.68064034, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17907715, + "step": 4771, + "time_per_iteration": 2.5533907413482666 + }, + { + "auxiliary_loss_clip": 0.06411134, + "auxiliary_loss_mlp": 0.0125763, + "balance_loss_clip": 0.06311508, + "balance_loss_mlp": 0.01253345, + "epoch": 0.2869081617315497, + "flos": 65442218768640.0, + "grad_norm": 0.7478629548239109, + "language_loss": 0.56696546, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.64365304, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.04290771, + "step": 4772, + "time_per_iteration": 3.1295437812805176 + }, + { + "auxiliary_loss_clip": 0.06515288, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06305212, + "balance_loss_mlp": 0.01256165, + "epoch": 0.28696828498421767, + "flos": 18666610272000.0, + "grad_norm": 3.729070810615603, + "language_loss": 0.84013474, + "learning_rate": 3.346383619630856e-06, + "loss": 0.91803479, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.1854248, + "step": 4773, + "time_per_iteration": 2.5181708335876465 + }, + { + "auxiliary_loss_clip": 0.06518447, + "auxiliary_loss_mlp": 0.01274166, + "balance_loss_clip": 0.06306095, + "balance_loss_mlp": 0.01254985, + "epoch": 0.28702840823688563, + "flos": 23666540855040.0, + "grad_norm": 2.856350636496585, + "language_loss": 0.78241181, + "learning_rate": 3.34609559969027e-06, + "loss": 0.86033797, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.19177246, + "step": 4774, + "time_per_iteration": 2.5831918716430664 + }, + { + "auxiliary_loss_clip": 0.06519175, + "auxiliary_loss_mlp": 0.01275468, + "balance_loss_clip": 0.06307949, + "balance_loss_mlp": 0.01255703, + "epoch": 0.2870885314895536, + "flos": 13809248611200.0, + "grad_norm": 1.8762920881530476, + "language_loss": 0.74056339, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.81850982, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.19763184, + "step": 4775, + "time_per_iteration": 2.505293369293213 + }, + { + "auxiliary_loss_clip": 0.06520346, + "auxiliary_loss_mlp": 0.01275844, + "balance_loss_clip": 0.06309157, + "balance_loss_mlp": 0.01258142, + "epoch": 0.28714865474222157, + "flos": 17792790768000.0, + "grad_norm": 1.8823617406689648, + "language_loss": 0.88338864, + "learning_rate": 3.34551940668778e-06, + "loss": 0.96135056, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.17687988, + "step": 4776, + "time_per_iteration": 2.5638997554779053 + }, + { + "auxiliary_loss_clip": 0.06511634, + "auxiliary_loss_mlp": 0.01275769, + "balance_loss_clip": 0.06302971, + "balance_loss_mlp": 0.01258269, + "epoch": 0.28720877799488953, + "flos": 16002958429440.0, + "grad_norm": 2.648093963017482, + "language_loss": 0.74451852, + "learning_rate": 3.345231233647726e-06, + "loss": 0.82239252, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17492676, + "step": 4777, + "time_per_iteration": 2.5142223834991455 + }, + { + "auxiliary_loss_clip": 0.06527238, + "auxiliary_loss_mlp": 0.01280106, + "balance_loss_clip": 0.06311023, + "balance_loss_mlp": 0.01259924, + "epoch": 0.2872689012475575, + "flos": 20929445308800.0, + "grad_norm": 2.200879096052639, + "language_loss": 0.80539143, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.88346487, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.20202637, + "step": 4778, + "time_per_iteration": 2.563994884490967 + }, + { + "auxiliary_loss_clip": 0.06511427, + "auxiliary_loss_mlp": 0.01281129, + "balance_loss_clip": 0.06304548, + "balance_loss_mlp": 0.01263223, + "epoch": 0.28732902450022546, + "flos": 21331603779840.0, + "grad_norm": 1.7996465112645923, + "language_loss": 0.73886508, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.8167907, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17895508, + "step": 4779, + "time_per_iteration": 2.5394158363342285 + }, + { + "auxiliary_loss_clip": 0.06518923, + "auxiliary_loss_mlp": 0.01275383, + "balance_loss_clip": 0.06307982, + "balance_loss_mlp": 0.01255379, + "epoch": 0.2873891477528934, + "flos": 20856714364800.0, + "grad_norm": 1.509851280453794, + "language_loss": 0.76844704, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.84639007, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19995117, + "step": 4780, + "time_per_iteration": 2.5928425788879395 + }, + { + "auxiliary_loss_clip": 0.06507713, + "auxiliary_loss_mlp": 0.01271777, + "balance_loss_clip": 0.06302975, + "balance_loss_mlp": 0.01254014, + "epoch": 0.2874492710055614, + "flos": 17425698030720.0, + "grad_norm": 1.6471362454858889, + "language_loss": 0.81874287, + "learning_rate": 3.344078031483784e-06, + "loss": 0.89653778, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17773438, + "step": 4781, + "time_per_iteration": 2.6121537685394287 + }, + { + "auxiliary_loss_clip": 0.06521222, + "auxiliary_loss_mlp": 0.0127902, + "balance_loss_clip": 0.06306514, + "balance_loss_mlp": 0.01257002, + "epoch": 0.28750939425822936, + "flos": 13411827895680.0, + "grad_norm": 2.0671181517724966, + "language_loss": 0.86987036, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.94787276, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22009277, + "step": 4782, + "time_per_iteration": 2.554326057434082 + }, + { + "auxiliary_loss_clip": 0.06525762, + "auxiliary_loss_mlp": 0.01282396, + "balance_loss_clip": 0.06310341, + "balance_loss_mlp": 0.01262238, + "epoch": 0.2875695175108973, + "flos": 21876205392000.0, + "grad_norm": 1.4282255381090248, + "language_loss": 0.71525908, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.20153809, + "step": 4783, + "time_per_iteration": 2.5632100105285645 + }, + { + "auxiliary_loss_clip": 0.06514136, + "auxiliary_loss_mlp": 0.01279499, + "balance_loss_clip": 0.06305264, + "balance_loss_mlp": 0.01259186, + "epoch": 0.2876296407635653, + "flos": 26251885457280.0, + "grad_norm": 1.5568964680804804, + "language_loss": 0.77152872, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84946513, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.203125, + "step": 4784, + "time_per_iteration": 2.589073657989502 + }, + { + "auxiliary_loss_clip": 0.06506136, + "auxiliary_loss_mlp": 0.01278073, + "balance_loss_clip": 0.06301259, + "balance_loss_mlp": 0.01257914, + "epoch": 0.28768976401623325, + "flos": 25380581575680.0, + "grad_norm": 1.5725877671574655, + "language_loss": 0.76106405, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.83890617, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.20153809, + "step": 4785, + "time_per_iteration": 2.6051061153411865 + }, + { + "auxiliary_loss_clip": 0.06513079, + "auxiliary_loss_mlp": 0.0127873, + "balance_loss_clip": 0.06302914, + "balance_loss_mlp": 0.01259394, + "epoch": 0.28774988726890127, + "flos": 30672232548480.0, + "grad_norm": 2.246179731229797, + "language_loss": 0.83339965, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91131771, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.19348145, + "step": 4786, + "time_per_iteration": 2.6064071655273438 + }, + { + "auxiliary_loss_clip": 0.06512371, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06303188, + "balance_loss_mlp": 0.01258934, + "epoch": 0.28781001052156924, + "flos": 20601820644480.0, + "grad_norm": 2.4876341958211037, + "language_loss": 0.80607671, + "learning_rate": 3.342346699429516e-06, + "loss": 0.88398409, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.19433594, + "step": 4787, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.06516974, + "auxiliary_loss_mlp": 0.01280481, + "balance_loss_clip": 0.0630367, + "balance_loss_mlp": 0.01260191, + "epoch": 0.2878701337742372, + "flos": 26549643340800.0, + "grad_norm": 1.713934654291453, + "language_loss": 0.84188497, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.91985947, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.20288086, + "step": 4788, + "time_per_iteration": 2.610520362854004 + }, + { + "auxiliary_loss_clip": 0.06528202, + "auxiliary_loss_mlp": 0.01278372, + "balance_loss_clip": 0.06311956, + "balance_loss_mlp": 0.01257594, + "epoch": 0.28793025702690517, + "flos": 28154294156160.0, + "grad_norm": 1.8819133496848792, + "language_loss": 0.73887986, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.81694555, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2076416, + "step": 4789, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.06504419, + "auxiliary_loss_mlp": 0.0127649, + "balance_loss_clip": 0.06300576, + "balance_loss_mlp": 0.01259014, + "epoch": 0.28799038027957313, + "flos": 23812254305280.0, + "grad_norm": 1.6484379512289788, + "language_loss": 0.84411776, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92192692, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17492676, + "step": 4790, + "time_per_iteration": 2.5587222576141357 + }, + { + "auxiliary_loss_clip": 0.06518544, + "auxiliary_loss_mlp": 0.01278217, + "balance_loss_clip": 0.06308021, + "balance_loss_mlp": 0.01259728, + "epoch": 0.2880505035322411, + "flos": 22350340120320.0, + "grad_norm": 1.9872780385985664, + "language_loss": 0.78222489, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.86019248, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18481445, + "step": 4791, + "time_per_iteration": 2.624457359313965 + }, + { + "auxiliary_loss_clip": 0.06518695, + "auxiliary_loss_mlp": 0.01277015, + "balance_loss_clip": 0.06302316, + "balance_loss_mlp": 0.01257286, + "epoch": 0.28811062678490906, + "flos": 18010061205120.0, + "grad_norm": 3.7561845310327002, + "language_loss": 0.71278274, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.79073977, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.19726562, + "step": 4792, + "time_per_iteration": 2.5208675861358643 + }, + { + "auxiliary_loss_clip": 0.06512474, + "auxiliary_loss_mlp": 0.01276677, + "balance_loss_clip": 0.06301394, + "balance_loss_mlp": 0.01258391, + "epoch": 0.28817075003757703, + "flos": 22097416970880.0, + "grad_norm": 1.8001054572072859, + "language_loss": 0.80413318, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.88202471, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.18286133, + "step": 4793, + "time_per_iteration": 4.170284271240234 + }, + { + "auxiliary_loss_clip": 0.06499149, + "auxiliary_loss_mlp": 0.01283104, + "balance_loss_clip": 0.06297339, + "balance_loss_mlp": 0.01264484, + "epoch": 0.288230873290245, + "flos": 41692842552960.0, + "grad_norm": 1.6709200510021447, + "language_loss": 0.78107667, + "learning_rate": 3.340324496161797e-06, + "loss": 0.85889918, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.18603516, + "step": 4794, + "time_per_iteration": 2.8557510375976562 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 0.06298079, + "balance_loss_mlp": 0.01260882, + "epoch": 0.28829099654291296, + "flos": 18630328654080.0, + "grad_norm": 2.1208293695579608, + "language_loss": 0.83245766, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91035557, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18652344, + "step": 4795, + "time_per_iteration": 2.535163164138794 + }, + { + "auxiliary_loss_clip": 0.06498718, + "auxiliary_loss_mlp": 0.0128311, + "balance_loss_clip": 0.06297053, + "balance_loss_mlp": 0.01266099, + "epoch": 0.2883511197955809, + "flos": 24680707148160.0, + "grad_norm": 2.078774389913416, + "language_loss": 0.75219119, + "learning_rate": 3.339746266208074e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17004395, + "step": 4796, + "time_per_iteration": 2.567488670349121 + }, + { + "auxiliary_loss_clip": 0.06509424, + "auxiliary_loss_mlp": 0.01276979, + "balance_loss_clip": 0.06296358, + "balance_loss_mlp": 0.01257798, + "epoch": 0.2884112430482489, + "flos": 23118794714880.0, + "grad_norm": 2.1968759883463513, + "language_loss": 0.73290622, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.81077027, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.19189453, + "step": 4797, + "time_per_iteration": 3.975389242172241 + }, + { + "auxiliary_loss_clip": 0.06507025, + "auxiliary_loss_mlp": 0.01273799, + "balance_loss_clip": 0.0629791, + "balance_loss_mlp": 0.0125556, + "epoch": 0.28847136630091685, + "flos": 16879000066560.0, + "grad_norm": 2.2937655739300373, + "language_loss": 0.74862409, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.82643229, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.18212891, + "step": 4798, + "time_per_iteration": 3.9849729537963867 + }, + { + "auxiliary_loss_clip": 0.06517179, + "auxiliary_loss_mlp": 0.01285883, + "balance_loss_clip": 0.06306559, + "balance_loss_mlp": 0.01266381, + "epoch": 0.2885314895535849, + "flos": 25663161870720.0, + "grad_norm": 2.626807334731923, + "language_loss": 0.65891635, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.736947, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19494629, + "step": 4799, + "time_per_iteration": 2.6063008308410645 + }, + { + "auxiliary_loss_clip": 0.06513311, + "auxiliary_loss_mlp": 0.01278669, + "balance_loss_clip": 0.06300591, + "balance_loss_mlp": 0.01260013, + "epoch": 0.28859161280625284, + "flos": 21113872145280.0, + "grad_norm": 1.5942901452973643, + "language_loss": 0.82659006, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.9045099, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.18664551, + "step": 4800, + "time_per_iteration": 2.5522704124450684 + }, + { + "auxiliary_loss_clip": 0.06498213, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 0.06294428, + "balance_loss_mlp": 0.01260609, + "epoch": 0.2886517360589208, + "flos": 26476870469760.0, + "grad_norm": 1.7957021177556654, + "language_loss": 0.91005886, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98781872, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17175293, + "step": 4801, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.06509861, + "auxiliary_loss_mlp": 0.01278615, + "balance_loss_clip": 0.06299478, + "balance_loss_mlp": 0.01260722, + "epoch": 0.28871185931158877, + "flos": 25272365627520.0, + "grad_norm": 1.8432796050129874, + "language_loss": 0.74294543, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82083023, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.17895508, + "step": 4802, + "time_per_iteration": 2.5519795417785645 + }, + { + "auxiliary_loss_clip": 0.0639186, + "auxiliary_loss_mlp": 0.01290861, + "balance_loss_clip": 0.06293292, + "balance_loss_mlp": 0.01286456, + "epoch": 0.28877198256425674, + "flos": 66683676061440.0, + "grad_norm": 0.7742675136744124, + "language_loss": 0.62925327, + "learning_rate": 3.337720861641558e-06, + "loss": 0.70608056, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04412842, + "step": 4803, + "time_per_iteration": 4.557742595672607 + }, + { + "auxiliary_loss_clip": 0.06504417, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 0.06297504, + "balance_loss_mlp": 0.01256721, + "epoch": 0.2888321058169247, + "flos": 20309261713920.0, + "grad_norm": 2.312081796144873, + "language_loss": 0.71418971, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.79197359, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17248535, + "step": 4804, + "time_per_iteration": 2.5679221153259277 + }, + { + "auxiliary_loss_clip": 0.06511839, + "auxiliary_loss_mlp": 0.01276786, + "balance_loss_clip": 0.06299883, + "balance_loss_mlp": 0.01258892, + "epoch": 0.28889222906959267, + "flos": 25523192424960.0, + "grad_norm": 2.035708939634364, + "language_loss": 0.68254268, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76042891, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17907715, + "step": 4805, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06510667, + "auxiliary_loss_mlp": 0.01276264, + "balance_loss_clip": 0.06300112, + "balance_loss_mlp": 0.01258955, + "epoch": 0.28895235232226063, + "flos": 32679544959360.0, + "grad_norm": 1.67836402891337, + "language_loss": 0.69622278, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.77409214, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.1730957, + "step": 4806, + "time_per_iteration": 2.6661036014556885 + }, + { + "auxiliary_loss_clip": 0.06499489, + "auxiliary_loss_mlp": 0.01273073, + "balance_loss_clip": 0.06297253, + "balance_loss_mlp": 0.01256133, + "epoch": 0.2890124755749286, + "flos": 29722202156160.0, + "grad_norm": 1.5048672267596763, + "language_loss": 0.71718901, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7949146, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16931152, + "step": 4807, + "time_per_iteration": 2.6082210540771484 + }, + { + "auxiliary_loss_clip": 0.06506096, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06298453, + "balance_loss_mlp": 0.01255769, + "epoch": 0.28907259882759656, + "flos": 22681067385600.0, + "grad_norm": 1.6103433555287536, + "language_loss": 0.8189373, + "learning_rate": 3.336272622079382e-06, + "loss": 0.89672995, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17407227, + "step": 4808, + "time_per_iteration": 2.575005292892456 + }, + { + "auxiliary_loss_clip": 0.0649471, + "auxiliary_loss_mlp": 0.01279377, + "balance_loss_clip": 0.06293811, + "balance_loss_mlp": 0.01261543, + "epoch": 0.2891327220802645, + "flos": 22572809510400.0, + "grad_norm": 1.6658984409983257, + "language_loss": 0.79128641, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86902726, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17834473, + "step": 4809, + "time_per_iteration": 2.563202142715454 + }, + { + "auxiliary_loss_clip": 0.06509645, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.06294866, + "balance_loss_mlp": 0.01256411, + "epoch": 0.2891928453329325, + "flos": 21659228444160.0, + "grad_norm": 1.9154470794900575, + "language_loss": 0.79370517, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8715474, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.18151855, + "step": 4810, + "time_per_iteration": 2.555290460586548 + }, + { + "auxiliary_loss_clip": 0.06499892, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06295595, + "balance_loss_mlp": 0.01259259, + "epoch": 0.28925296858560046, + "flos": 23228855452800.0, + "grad_norm": 1.5886971021791327, + "language_loss": 0.77595514, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.85371131, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.16467285, + "step": 4811, + "time_per_iteration": 2.5522642135620117 + }, + { + "auxiliary_loss_clip": 0.06509165, + "auxiliary_loss_mlp": 0.01277164, + "balance_loss_clip": 0.06302579, + "balance_loss_mlp": 0.01259497, + "epoch": 0.2893130918382685, + "flos": 28629267425280.0, + "grad_norm": 1.704164513062304, + "language_loss": 0.78002596, + "learning_rate": 3.335113118275117e-06, + "loss": 0.85788929, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.17675781, + "step": 4812, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.06384769, + "auxiliary_loss_mlp": 0.01270413, + "balance_loss_clip": 0.06288065, + "balance_loss_mlp": 0.01266965, + "epoch": 0.28937321509093644, + "flos": 72323328240000.0, + "grad_norm": 0.7614773045430072, + "language_loss": 0.60086656, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.67741829, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03457642, + "step": 4813, + "time_per_iteration": 3.3377795219421387 + }, + { + "auxiliary_loss_clip": 0.06503347, + "auxiliary_loss_mlp": 0.01279669, + "balance_loss_clip": 0.0629978, + "balance_loss_mlp": 0.01262253, + "epoch": 0.2894333383436044, + "flos": 16221905948160.0, + "grad_norm": 2.095142654160917, + "language_loss": 0.83059847, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.90842861, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.17407227, + "step": 4814, + "time_per_iteration": 2.519822120666504 + }, + { + "auxiliary_loss_clip": 0.06510264, + "auxiliary_loss_mlp": 0.0128276, + "balance_loss_clip": 0.06297985, + "balance_loss_mlp": 0.01264389, + "epoch": 0.2894934615962724, + "flos": 24835434912000.0, + "grad_norm": 1.4921373382431753, + "language_loss": 0.72583377, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.80376399, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.18371582, + "step": 4815, + "time_per_iteration": 2.613424301147461 + }, + { + "auxiliary_loss_clip": 0.06496876, + "auxiliary_loss_mlp": 0.01270189, + "balance_loss_clip": 0.06299625, + "balance_loss_mlp": 0.01253858, + "epoch": 0.28955358484894034, + "flos": 20456400683520.0, + "grad_norm": 1.478095248571898, + "language_loss": 0.71455014, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79222083, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16345215, + "step": 4816, + "time_per_iteration": 2.523789644241333 + }, + { + "auxiliary_loss_clip": 0.0651416, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06301913, + "balance_loss_mlp": 0.01256007, + "epoch": 0.2896137081016083, + "flos": 22571803261440.0, + "grad_norm": 2.1886400582799643, + "language_loss": 0.75928313, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.83716327, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.17834473, + "step": 4817, + "time_per_iteration": 2.5829625129699707 + }, + { + "auxiliary_loss_clip": 0.0650699, + "auxiliary_loss_mlp": 0.0127444, + "balance_loss_clip": 0.06299114, + "balance_loss_mlp": 0.01255486, + "epoch": 0.28967383135427627, + "flos": 26695231009920.0, + "grad_norm": 2.009148210409016, + "language_loss": 0.77384543, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85165972, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18933105, + "step": 4818, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.06511898, + "auxiliary_loss_mlp": 0.01274642, + "balance_loss_clip": 0.063049, + "balance_loss_mlp": 0.01257833, + "epoch": 0.28973395460694423, + "flos": 15563428237440.0, + "grad_norm": 1.8180363278883531, + "language_loss": 0.80166686, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.87953222, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16833496, + "step": 4819, + "time_per_iteration": 2.58598256111145 + }, + { + "auxiliary_loss_clip": 0.06512412, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 0.06301294, + "balance_loss_mlp": 0.01256543, + "epoch": 0.2897940778596122, + "flos": 18703395014400.0, + "grad_norm": 1.8889731698350438, + "language_loss": 0.79784238, + "learning_rate": 3.332791681244776e-06, + "loss": 0.87571859, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18664551, + "step": 4820, + "time_per_iteration": 2.514738082885742 + }, + { + "auxiliary_loss_clip": 0.06519003, + "auxiliary_loss_mlp": 0.01272112, + "balance_loss_clip": 0.06309246, + "balance_loss_mlp": 0.01254612, + "epoch": 0.28985420111228016, + "flos": 18776209812480.0, + "grad_norm": 1.948801074603747, + "language_loss": 0.73537958, + "learning_rate": 3.332501274072231e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.17492676, + "step": 4821, + "time_per_iteration": 2.6552352905273438 + }, + { + "auxiliary_loss_clip": 0.06509826, + "auxiliary_loss_mlp": 0.01279091, + "balance_loss_clip": 0.06303322, + "balance_loss_mlp": 0.01260733, + "epoch": 0.28991432436494813, + "flos": 23075511281280.0, + "grad_norm": 1.9415887628712303, + "language_loss": 0.7256397, + "learning_rate": 3.332210816371104e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.18347168, + "step": 4822, + "time_per_iteration": 2.5311806201934814 + }, + { + "auxiliary_loss_clip": 0.06508678, + "auxiliary_loss_mlp": 0.0127532, + "balance_loss_clip": 0.06304502, + "balance_loss_mlp": 0.01258237, + "epoch": 0.2899744476176161, + "flos": 17608992837120.0, + "grad_norm": 1.6868082855094653, + "language_loss": 0.66498971, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17077637, + "step": 4823, + "time_per_iteration": 2.5582497119903564 + }, + { + "auxiliary_loss_clip": 0.06507877, + "auxiliary_loss_mlp": 0.0127093, + "balance_loss_clip": 0.06303018, + "balance_loss_mlp": 0.01253728, + "epoch": 0.29003457087028406, + "flos": 22315861365120.0, + "grad_norm": 2.007628710478466, + "language_loss": 0.81589168, + "learning_rate": 3.331629749427164e-06, + "loss": 0.89367974, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.171875, + "step": 4824, + "time_per_iteration": 2.5258595943450928 + }, + { + "auxiliary_loss_clip": 0.06510833, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06301483, + "balance_loss_mlp": 0.01258376, + "epoch": 0.2900946941229521, + "flos": 21951493885440.0, + "grad_norm": 1.837693758429887, + "language_loss": 0.73192668, + "learning_rate": 3.331339140206385e-06, + "loss": 0.80979806, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.17932129, + "step": 4825, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.0651435, + "auxiliary_loss_mlp": 0.01275324, + "balance_loss_clip": 0.06305824, + "balance_loss_mlp": 0.01257049, + "epoch": 0.29015481737562004, + "flos": 17938126874880.0, + "grad_norm": 2.202818652908599, + "language_loss": 0.7426061, + "learning_rate": 3.331048480501092e-06, + "loss": 0.82050288, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18273926, + "step": 4826, + "time_per_iteration": 2.497711420059204 + }, + { + "auxiliary_loss_clip": 0.06516986, + "auxiliary_loss_mlp": 0.01278902, + "balance_loss_clip": 0.06309567, + "balance_loss_mlp": 0.01262141, + "epoch": 0.290214940628288, + "flos": 22790079947520.0, + "grad_norm": 1.934932602801083, + "language_loss": 0.69077051, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.76872945, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.16748047, + "step": 4827, + "time_per_iteration": 2.5729641914367676 + }, + { + "auxiliary_loss_clip": 0.06517433, + "auxiliary_loss_mlp": 0.0127379, + "balance_loss_clip": 0.06311382, + "balance_loss_mlp": 0.01255646, + "epoch": 0.290275063880956, + "flos": 20011881173760.0, + "grad_norm": 1.8047855406998587, + "language_loss": 0.80766201, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88557422, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.18151855, + "step": 4828, + "time_per_iteration": 2.5190348625183105 + }, + { + "auxiliary_loss_clip": 0.0651058, + "auxiliary_loss_mlp": 0.01278642, + "balance_loss_clip": 0.06308287, + "balance_loss_mlp": 0.01260809, + "epoch": 0.29033518713362394, + "flos": 22060003322880.0, + "grad_norm": 1.646725141321262, + "language_loss": 0.80908686, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8869791, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17822266, + "step": 4829, + "time_per_iteration": 2.564837694168091 + }, + { + "auxiliary_loss_clip": 0.06503877, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06304269, + "balance_loss_mlp": 0.0126059, + "epoch": 0.2903953103862919, + "flos": 25637194304640.0, + "grad_norm": 1.4271698228137566, + "language_loss": 0.82616186, + "learning_rate": 3.329885337055249e-06, + "loss": 0.90397674, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 4830, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.0652103, + "auxiliary_loss_mlp": 0.01280335, + "balance_loss_clip": 0.06313583, + "balance_loss_mlp": 0.01262036, + "epoch": 0.29045543363895987, + "flos": 16951437521280.0, + "grad_norm": 2.247105417787089, + "language_loss": 0.79901475, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.87702841, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.18310547, + "step": 4831, + "time_per_iteration": 2.5306637287139893 + }, + { + "auxiliary_loss_clip": 0.06507042, + "auxiliary_loss_mlp": 0.01277723, + "balance_loss_clip": 0.06307022, + "balance_loss_mlp": 0.01261392, + "epoch": 0.29051555689162784, + "flos": 26402630152320.0, + "grad_norm": 2.3059080747570775, + "language_loss": 0.75331926, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.83116686, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16333008, + "step": 4832, + "time_per_iteration": 2.5603439807891846 + }, + { + "auxiliary_loss_clip": 0.06503655, + "auxiliary_loss_mlp": 0.01283448, + "balance_loss_clip": 0.06302731, + "balance_loss_mlp": 0.01267271, + "epoch": 0.2905756801442958, + "flos": 21109931003520.0, + "grad_norm": 1.626645949157208, + "language_loss": 0.76312864, + "learning_rate": 3.329012449923736e-06, + "loss": 0.8409996, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16162109, + "step": 4833, + "time_per_iteration": 4.029958963394165 + }, + { + "auxiliary_loss_clip": 0.06504881, + "auxiliary_loss_mlp": 0.01280243, + "balance_loss_clip": 0.06303954, + "balance_loss_mlp": 0.01263363, + "epoch": 0.29063580339696377, + "flos": 15711573456000.0, + "grad_norm": 1.645904053352059, + "language_loss": 0.65383506, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.73168635, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.16882324, + "step": 4834, + "time_per_iteration": 2.5233187675476074 + }, + { + "auxiliary_loss_clip": 0.06499655, + "auxiliary_loss_mlp": 0.01274915, + "balance_loss_clip": 0.06299647, + "balance_loss_mlp": 0.01258893, + "epoch": 0.29069592664963173, + "flos": 24651972397440.0, + "grad_norm": 1.808411103531711, + "language_loss": 0.71914709, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.79689276, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16027832, + "step": 4835, + "time_per_iteration": 2.555670738220215 + }, + { + "auxiliary_loss_clip": 0.06500543, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06299368, + "balance_loss_mlp": 0.01259536, + "epoch": 0.2907560499022997, + "flos": 24980854872960.0, + "grad_norm": 1.750724607078226, + "language_loss": 0.80319953, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88096082, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16052246, + "step": 4836, + "time_per_iteration": 3.9953579902648926 + }, + { + "auxiliary_loss_clip": 0.0650623, + "auxiliary_loss_mlp": 0.01276306, + "balance_loss_clip": 0.06305872, + "balance_loss_mlp": 0.01260236, + "epoch": 0.29081617315496766, + "flos": 18662836838400.0, + "grad_norm": 1.8282626295265978, + "language_loss": 0.81337535, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.89120078, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16064453, + "step": 4837, + "time_per_iteration": 3.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.06508449, + "auxiliary_loss_mlp": 0.01275256, + "balance_loss_clip": 0.06305645, + "balance_loss_mlp": 0.01257362, + "epoch": 0.2908762964076356, + "flos": 35339087952000.0, + "grad_norm": 1.819350457328488, + "language_loss": 0.67809796, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75593495, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17895508, + "step": 4838, + "time_per_iteration": 2.6575772762298584 + }, + { + "auxiliary_loss_clip": 0.06511781, + "auxiliary_loss_mlp": 0.01274117, + "balance_loss_clip": 0.06305051, + "balance_loss_mlp": 0.01256688, + "epoch": 0.29093641966030365, + "flos": 23083058148480.0, + "grad_norm": 2.3112745331966185, + "language_loss": 0.71775508, + "learning_rate": 3.327265315259095e-06, + "loss": 0.79561406, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17419434, + "step": 4839, + "time_per_iteration": 2.6057844161987305 + }, + { + "auxiliary_loss_clip": 0.06504601, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06301045, + "balance_loss_mlp": 0.01258071, + "epoch": 0.2909965429129716, + "flos": 35964260864640.0, + "grad_norm": 1.8988017352340443, + "language_loss": 0.75792682, + "learning_rate": 3.326973949928776e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16503906, + "step": 4840, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.06503059, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06299757, + "balance_loss_mlp": 0.01255417, + "epoch": 0.2910566661656396, + "flos": 30887616268800.0, + "grad_norm": 1.8129671702232821, + "language_loss": 0.60949063, + "learning_rate": 3.326682534279471e-06, + "loss": 0.68724, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16479492, + "step": 4841, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.06506652, + "auxiliary_loss_mlp": 0.01272342, + "balance_loss_clip": 0.06303366, + "balance_loss_mlp": 0.01255021, + "epoch": 0.29111678941830754, + "flos": 30018366812160.0, + "grad_norm": 1.3487344136639734, + "language_loss": 0.71762401, + "learning_rate": 3.326391068322232e-06, + "loss": 0.79541385, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17333984, + "step": 4842, + "time_per_iteration": 4.036385774612427 + }, + { + "auxiliary_loss_clip": 0.06507391, + "auxiliary_loss_mlp": 0.01271836, + "balance_loss_clip": 0.06304808, + "balance_loss_mlp": 0.01256423, + "epoch": 0.2911769126709755, + "flos": 22864110629760.0, + "grad_norm": 1.4808705717301018, + "language_loss": 0.74052906, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.81832135, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.1541748, + "step": 4843, + "time_per_iteration": 2.565093755722046 + }, + { + "auxiliary_loss_clip": 0.06510359, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.06305443, + "balance_loss_mlp": 0.01256742, + "epoch": 0.2912370359236435, + "flos": 21656545113600.0, + "grad_norm": 3.6041214714298806, + "language_loss": 0.5879783, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.66580796, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.15856934, + "step": 4844, + "time_per_iteration": 2.636667490005493 + }, + { + "auxiliary_loss_clip": 0.06518383, + "auxiliary_loss_mlp": 0.01278792, + "balance_loss_clip": 0.06309091, + "balance_loss_mlp": 0.01261566, + "epoch": 0.29129715917631144, + "flos": 22899972977280.0, + "grad_norm": 1.9195914149996331, + "language_loss": 0.86846137, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.94643313, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.17224121, + "step": 4845, + "time_per_iteration": 2.549297571182251 + }, + { + "auxiliary_loss_clip": 0.06508736, + "auxiliary_loss_mlp": 0.01273322, + "balance_loss_clip": 0.06304652, + "balance_loss_mlp": 0.01256144, + "epoch": 0.2913572824289794, + "flos": 22681067385600.0, + "grad_norm": 1.8711717874469986, + "language_loss": 0.67698014, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75480074, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17175293, + "step": 4846, + "time_per_iteration": 2.607025146484375 + }, + { + "auxiliary_loss_clip": 0.06502484, + "auxiliary_loss_mlp": 0.01275425, + "balance_loss_clip": 0.06301165, + "balance_loss_mlp": 0.01258771, + "epoch": 0.29141740568164737, + "flos": 23113260345600.0, + "grad_norm": 4.990917175371688, + "language_loss": 0.708718, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.78649712, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16674805, + "step": 4847, + "time_per_iteration": 2.5293991565704346 + }, + { + "auxiliary_loss_clip": 0.06504785, + "auxiliary_loss_mlp": 0.01278673, + "balance_loss_clip": 0.06301495, + "balance_loss_mlp": 0.01261877, + "epoch": 0.29147752893431533, + "flos": 23593851838080.0, + "grad_norm": 1.4565796817402286, + "language_loss": 0.74258435, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82041889, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16796875, + "step": 4848, + "time_per_iteration": 2.585296630859375 + }, + { + "auxiliary_loss_clip": 0.06502895, + "auxiliary_loss_mlp": 0.01276049, + "balance_loss_clip": 0.06298006, + "balance_loss_mlp": 0.01259729, + "epoch": 0.2915376521869833, + "flos": 20597753721600.0, + "grad_norm": 2.1223800155182624, + "language_loss": 0.77561575, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.85340518, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.16333008, + "step": 4849, + "time_per_iteration": 2.4936819076538086 + }, + { + "auxiliary_loss_clip": 0.06514408, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06305997, + "balance_loss_mlp": 0.01257723, + "epoch": 0.29159777543965126, + "flos": 20817414000000.0, + "grad_norm": 1.652469266745217, + "language_loss": 0.79415965, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87204546, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.16442871, + "step": 4850, + "time_per_iteration": 2.55340313911438 + }, + { + "auxiliary_loss_clip": 0.06494947, + "auxiliary_loss_mlp": 0.0127524, + "balance_loss_clip": 0.06295137, + "balance_loss_mlp": 0.01258479, + "epoch": 0.29165789869231923, + "flos": 24251155591680.0, + "grad_norm": 1.7747423674847125, + "language_loss": 0.76365012, + "learning_rate": 3.323765612674296e-06, + "loss": 0.84135199, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16748047, + "step": 4851, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.06499958, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06300404, + "balance_loss_mlp": 0.01256929, + "epoch": 0.29171802194498725, + "flos": 28957562922240.0, + "grad_norm": 1.3481127708223366, + "language_loss": 0.7781775, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.85590267, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.15612793, + "step": 4852, + "time_per_iteration": 2.6266329288482666 + }, + { + "auxiliary_loss_clip": 0.06501517, + "auxiliary_loss_mlp": 0.0127959, + "balance_loss_clip": 0.06297216, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2917781451976552, + "flos": 22604269518720.0, + "grad_norm": 1.5006442804531215, + "language_loss": 0.78676021, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.86457133, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17883301, + "step": 4853, + "time_per_iteration": 2.5417568683624268 + }, + { + "auxiliary_loss_clip": 0.06501997, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06296347, + "balance_loss_mlp": 0.01253818, + "epoch": 0.2918382684503232, + "flos": 21579956881920.0, + "grad_norm": 4.190137743849971, + "language_loss": 0.88580358, + "learning_rate": 3.322889556841445e-06, + "loss": 0.96353114, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.16943359, + "step": 4854, + "time_per_iteration": 2.537247896194458 + }, + { + "auxiliary_loss_clip": 0.06492339, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06290923, + "balance_loss_mlp": 0.01255517, + "epoch": 0.29189839170299114, + "flos": 24360503569920.0, + "grad_norm": 1.79615422427109, + "language_loss": 0.86863208, + "learning_rate": 3.322597437887519e-06, + "loss": 0.94629866, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18798828, + "step": 4855, + "time_per_iteration": 2.5408217906951904 + }, + { + "auxiliary_loss_clip": 0.06394155, + "auxiliary_loss_mlp": 0.01254999, + "balance_loss_clip": 0.0629582, + "balance_loss_mlp": 0.01250765, + "epoch": 0.2919585149556591, + "flos": 71338693311360.0, + "grad_norm": 0.8469602753394808, + "language_loss": 0.60232264, + "learning_rate": 3.322305268780566e-06, + "loss": 0.67881417, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.04238892, + "step": 4856, + "time_per_iteration": 3.245720863342285 + }, + { + "auxiliary_loss_clip": 0.06496054, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 0.06293447, + "balance_loss_mlp": 0.01254966, + "epoch": 0.2920186382083271, + "flos": 15638716730880.0, + "grad_norm": 1.9340338412348166, + "language_loss": 0.69134986, + "learning_rate": 3.322013049531664e-06, + "loss": 0.76902497, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.16479492, + "step": 4857, + "time_per_iteration": 2.492515802383423 + }, + { + "auxiliary_loss_clip": 0.0649875, + "auxiliary_loss_mlp": 0.01275648, + "balance_loss_clip": 0.06298544, + "balance_loss_mlp": 0.01258863, + "epoch": 0.29207876146099504, + "flos": 28373535164160.0, + "grad_norm": 2.0544380804392346, + "language_loss": 0.84425288, + "learning_rate": 3.321720780151895e-06, + "loss": 0.92199689, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16772461, + "step": 4858, + "time_per_iteration": 2.596036434173584 + }, + { + "auxiliary_loss_clip": 0.06500848, + "auxiliary_loss_mlp": 0.01274974, + "balance_loss_clip": 0.06300872, + "balance_loss_mlp": 0.01257879, + "epoch": 0.292138884713663, + "flos": 21877295495040.0, + "grad_norm": 1.6880642207641439, + "language_loss": 0.781169, + "learning_rate": 3.321428460652342e-06, + "loss": 0.85892725, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17102051, + "step": 4859, + "time_per_iteration": 2.5885818004608154 + }, + { + "auxiliary_loss_clip": 0.06508546, + "auxiliary_loss_mlp": 0.01274065, + "balance_loss_clip": 0.06301034, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29219900796633097, + "flos": 20998277038080.0, + "grad_norm": 2.276956308498861, + "language_loss": 0.68823123, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.76605731, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17224121, + "step": 4860, + "time_per_iteration": 2.6006133556365967 + }, + { + "auxiliary_loss_clip": 0.06497137, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06296673, + "balance_loss_mlp": 0.01256743, + "epoch": 0.29225913121899894, + "flos": 35012930734080.0, + "grad_norm": 1.9621079535677741, + "language_loss": 0.75927335, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83697826, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16625977, + "step": 4861, + "time_per_iteration": 2.6738815307617188 + }, + { + "auxiliary_loss_clip": 0.06498605, + "auxiliary_loss_mlp": 0.01278705, + "balance_loss_clip": 0.06298269, + "balance_loss_mlp": 0.0126229, + "epoch": 0.2923192544716669, + "flos": 13520588895360.0, + "grad_norm": 2.4944662876521027, + "language_loss": 0.91953582, + "learning_rate": 3.320551201545832e-06, + "loss": 0.99730897, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.16418457, + "step": 4862, + "time_per_iteration": 2.523393392562866 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06296849, + "balance_loss_mlp": 0.01258325, + "epoch": 0.29237937772433487, + "flos": 19469543621760.0, + "grad_norm": 2.367835349845546, + "language_loss": 0.74302417, + "learning_rate": 3.320258681678008e-06, + "loss": 0.82076436, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16809082, + "step": 4863, + "time_per_iteration": 2.5615665912628174 + }, + { + "auxiliary_loss_clip": 0.06495367, + "auxiliary_loss_mlp": 0.01274458, + "balance_loss_clip": 0.06298485, + "balance_loss_mlp": 0.01257041, + "epoch": 0.29243950097700283, + "flos": 20856965927040.0, + "grad_norm": 1.6096808438714836, + "language_loss": 0.78180861, + "learning_rate": 3.319966111745842e-06, + "loss": 0.85950685, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.17419434, + "step": 4864, + "time_per_iteration": 2.543239116668701 + }, + { + "auxiliary_loss_clip": 0.06506015, + "auxiliary_loss_mlp": 0.01278091, + "balance_loss_clip": 0.06299396, + "balance_loss_mlp": 0.01260127, + "epoch": 0.29249962422967085, + "flos": 23590581528960.0, + "grad_norm": 1.7200803595236853, + "language_loss": 0.82166076, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8995018, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1796875, + "step": 4865, + "time_per_iteration": 2.6162562370300293 + }, + { + "auxiliary_loss_clip": 0.06504746, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06300808, + "balance_loss_mlp": 0.01258783, + "epoch": 0.2925597474823388, + "flos": 22279915163520.0, + "grad_norm": 1.8207973709117147, + "language_loss": 0.85861242, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.93643779, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18994141, + "step": 4866, + "time_per_iteration": 2.5991125106811523 + }, + { + "auxiliary_loss_clip": 0.06498669, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06298468, + "balance_loss_mlp": 0.0126005, + "epoch": 0.2926198707350068, + "flos": 34464136417920.0, + "grad_norm": 1.677629799943763, + "language_loss": 0.76065934, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17578125, + "step": 4867, + "time_per_iteration": 2.652083396911621 + }, + { + "auxiliary_loss_clip": 0.06508122, + "auxiliary_loss_mlp": 0.01277995, + "balance_loss_clip": 0.06302974, + "balance_loss_mlp": 0.01260483, + "epoch": 0.29267999398767475, + "flos": 20710413936000.0, + "grad_norm": 2.5581846543962197, + "language_loss": 0.73412025, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81198144, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.1751709, + "step": 4868, + "time_per_iteration": 2.5104074478149414 + }, + { + "auxiliary_loss_clip": 0.06504919, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 0.06304781, + "balance_loss_mlp": 0.01256558, + "epoch": 0.2927401172403427, + "flos": 18374470611840.0, + "grad_norm": 1.376823387605754, + "language_loss": 0.74768585, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.17260742, + "step": 4869, + "time_per_iteration": 2.517545461654663 + }, + { + "auxiliary_loss_clip": 0.06509744, + "auxiliary_loss_mlp": 0.01275578, + "balance_loss_clip": 0.06308037, + "balance_loss_mlp": 0.01258627, + "epoch": 0.2928002404930107, + "flos": 26111203251840.0, + "grad_norm": 1.453461002371515, + "language_loss": 0.76538026, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84323347, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.16931152, + "step": 4870, + "time_per_iteration": 2.571554183959961 + }, + { + "auxiliary_loss_clip": 0.06512202, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06304315, + "balance_loss_mlp": 0.01255967, + "epoch": 0.29286036374567864, + "flos": 21331142582400.0, + "grad_norm": 3.1299518178223726, + "language_loss": 0.67793286, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.75579637, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.18188477, + "step": 4871, + "time_per_iteration": 2.5867390632629395 + }, + { + "auxiliary_loss_clip": 0.06504084, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 0.0630291, + "balance_loss_mlp": 0.01256973, + "epoch": 0.2929204869983466, + "flos": 29577117611520.0, + "grad_norm": 1.7840080197301964, + "language_loss": 0.78071094, + "learning_rate": 3.317623751303933e-06, + "loss": 0.85847723, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.15588379, + "step": 4872, + "time_per_iteration": 2.598357915878296 + }, + { + "auxiliary_loss_clip": 0.06511893, + "auxiliary_loss_mlp": 0.01279899, + "balance_loss_clip": 0.06305112, + "balance_loss_mlp": 0.01260313, + "epoch": 0.2929806102510146, + "flos": 19063569790080.0, + "grad_norm": 1.7763964443019538, + "language_loss": 0.72879624, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.19580078, + "step": 4873, + "time_per_iteration": 3.9404540061950684 + }, + { + "auxiliary_loss_clip": 0.06511085, + "auxiliary_loss_mlp": 0.01274077, + "balance_loss_clip": 0.06303495, + "balance_loss_mlp": 0.01256386, + "epoch": 0.29304073350368254, + "flos": 21950613417600.0, + "grad_norm": 1.85182595241139, + "language_loss": 0.79023468, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86808634, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.17675781, + "step": 4874, + "time_per_iteration": 2.523942470550537 + }, + { + "auxiliary_loss_clip": 0.06517696, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 0.06305568, + "balance_loss_mlp": 0.01255315, + "epoch": 0.2931008567563505, + "flos": 15456302392320.0, + "grad_norm": 2.3441988108556377, + "language_loss": 0.7791701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.85707539, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.17504883, + "step": 4875, + "time_per_iteration": 2.4990556240081787 + }, + { + "auxiliary_loss_clip": 0.06506883, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 0.06301031, + "balance_loss_mlp": 0.01263252, + "epoch": 0.29316098000901847, + "flos": 16988893096320.0, + "grad_norm": 1.859745338516673, + "language_loss": 0.70031023, + "learning_rate": 3.316451371581431e-06, + "loss": 0.77818477, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17321777, + "step": 4876, + "time_per_iteration": 5.4681243896484375 + }, + { + "auxiliary_loss_clip": 0.06504045, + "auxiliary_loss_mlp": 0.01275518, + "balance_loss_clip": 0.06302452, + "balance_loss_mlp": 0.01259174, + "epoch": 0.29322110326168643, + "flos": 16362462372480.0, + "grad_norm": 1.8247622937841679, + "language_loss": 0.82480925, + "learning_rate": 3.316158151823096e-06, + "loss": 0.90260488, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16345215, + "step": 4877, + "time_per_iteration": 2.5517635345458984 + }, + { + "auxiliary_loss_clip": 0.06509132, + "auxiliary_loss_mlp": 0.01278665, + "balance_loss_clip": 0.06299806, + "balance_loss_mlp": 0.0126064, + "epoch": 0.29328122651435445, + "flos": 13996023361920.0, + "grad_norm": 2.6416558700601334, + "language_loss": 0.6810987, + "learning_rate": 3.315864882155911e-06, + "loss": 0.75897658, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18017578, + "step": 4878, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.0649902, + "auxiliary_loss_mlp": 0.01275226, + "balance_loss_clip": 0.06298085, + "balance_loss_mlp": 0.01257697, + "epoch": 0.2933413497670224, + "flos": 25271569013760.0, + "grad_norm": 1.8820124674491874, + "language_loss": 0.74030542, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.81804794, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17510986, + "step": 4879, + "time_per_iteration": 2.6044318675994873 + }, + { + "auxiliary_loss_clip": 0.06501681, + "auxiliary_loss_mlp": 0.01277426, + "balance_loss_clip": 0.0629803, + "balance_loss_mlp": 0.01259187, + "epoch": 0.2934014730196904, + "flos": 32131840746240.0, + "grad_norm": 2.9151820016542183, + "language_loss": 0.67178017, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.7495712, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18237305, + "step": 4880, + "time_per_iteration": 2.603761672973633 + }, + { + "auxiliary_loss_clip": 0.06503071, + "auxiliary_loss_mlp": 0.01271949, + "balance_loss_clip": 0.0629775, + "balance_loss_mlp": 0.01255367, + "epoch": 0.29346159627235835, + "flos": 24359329612800.0, + "grad_norm": 2.6105900749093633, + "language_loss": 0.71260536, + "learning_rate": 3.314984773812481e-06, + "loss": 0.79035556, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.16577148, + "step": 4881, + "time_per_iteration": 2.593226432800293 + }, + { + "auxiliary_loss_clip": 0.06502824, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298223, + "balance_loss_mlp": 0.01256603, + "epoch": 0.2935217195250263, + "flos": 22753253278080.0, + "grad_norm": 1.6618295774620153, + "language_loss": 0.83893931, + "learning_rate": 3.314691304621127e-06, + "loss": 0.91672039, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18688965, + "step": 4882, + "time_per_iteration": 3.9488399028778076 + }, + { + "auxiliary_loss_clip": 0.06502259, + "auxiliary_loss_mlp": 0.01273532, + "balance_loss_clip": 0.06293593, + "balance_loss_mlp": 0.01255961, + "epoch": 0.2935818427776943, + "flos": 21731959388160.0, + "grad_norm": 4.210124979545191, + "language_loss": 0.72920972, + "learning_rate": 3.314397785576548e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.17565918, + "step": 4883, + "time_per_iteration": 2.557283878326416 + }, + { + "auxiliary_loss_clip": 0.06496279, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 0.06292833, + "balance_loss_mlp": 0.01257103, + "epoch": 0.29364196603036224, + "flos": 23811667326720.0, + "grad_norm": 2.0649535872154217, + "language_loss": 0.93051624, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.00822163, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.17150879, + "step": 4884, + "time_per_iteration": 2.5359458923339844 + }, + { + "auxiliary_loss_clip": 0.06506841, + "auxiliary_loss_mlp": 0.01273123, + "balance_loss_clip": 0.06302871, + "balance_loss_mlp": 0.01255409, + "epoch": 0.2937020892830302, + "flos": 23475615327360.0, + "grad_norm": 2.6201562161688017, + "language_loss": 0.73813069, + "learning_rate": 3.313810597972234e-06, + "loss": 0.81593031, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17712402, + "step": 4885, + "time_per_iteration": 2.547731637954712 + }, + { + "auxiliary_loss_clip": 0.06506574, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 0.06302118, + "balance_loss_mlp": 0.01253936, + "epoch": 0.2937622125356982, + "flos": 24278422896000.0, + "grad_norm": 2.0067568315745907, + "language_loss": 0.8568837, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93466175, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.1730957, + "step": 4886, + "time_per_iteration": 2.5345749855041504 + }, + { + "auxiliary_loss_clip": 0.06516494, + "auxiliary_loss_mlp": 0.01282352, + "balance_loss_clip": 0.06309356, + "balance_loss_mlp": 0.01266223, + "epoch": 0.29382233578836614, + "flos": 20667843262080.0, + "grad_norm": 2.2972144011917863, + "language_loss": 0.7819618, + "learning_rate": 3.313223211088603e-06, + "loss": 0.85995024, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.16137695, + "step": 4887, + "time_per_iteration": 2.5718464851379395 + }, + { + "auxiliary_loss_clip": 0.06508423, + "auxiliary_loss_mlp": 0.01281343, + "balance_loss_clip": 0.06301117, + "balance_loss_mlp": 0.01263962, + "epoch": 0.2938824590410341, + "flos": 16550662642560.0, + "grad_norm": 2.5346543108244366, + "language_loss": 0.80135798, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.87925565, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.1739502, + "step": 4888, + "time_per_iteration": 2.5823678970336914 + }, + { + "auxiliary_loss_clip": 0.06512221, + "auxiliary_loss_mlp": 0.01274662, + "balance_loss_clip": 0.06308408, + "balance_loss_mlp": 0.01257878, + "epoch": 0.29394258229370207, + "flos": 37934620824960.0, + "grad_norm": 1.521834171262281, + "language_loss": 0.55984998, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63771886, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16784668, + "step": 4889, + "time_per_iteration": 2.6925320625305176 + }, + { + "auxiliary_loss_clip": 0.06519246, + "auxiliary_loss_mlp": 0.01278013, + "balance_loss_clip": 0.06313413, + "balance_loss_mlp": 0.0126056, + "epoch": 0.29400270554637004, + "flos": 20050384924800.0, + "grad_norm": 1.7589662768394465, + "language_loss": 0.85257453, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93054712, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17443848, + "step": 4890, + "time_per_iteration": 2.546391010284424 + }, + { + "auxiliary_loss_clip": 0.06513973, + "auxiliary_loss_mlp": 0.01284253, + "balance_loss_clip": 0.06307942, + "balance_loss_mlp": 0.01266288, + "epoch": 0.294062828799038, + "flos": 15271498212480.0, + "grad_norm": 1.9077501912209676, + "language_loss": 0.73679662, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.81477886, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.17956543, + "step": 4891, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.06519526, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_clip": 0.06312989, + "balance_loss_mlp": 0.01267468, + "epoch": 0.294122952051706, + "flos": 22753714475520.0, + "grad_norm": 1.802215562222595, + "language_loss": 0.77636111, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.85441071, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17974854, + "step": 4892, + "time_per_iteration": 2.556626796722412 + }, + { + "auxiliary_loss_clip": 0.06508264, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06305899, + "balance_loss_mlp": 0.01257096, + "epoch": 0.294183075304374, + "flos": 24979848624000.0, + "grad_norm": 1.857019535889917, + "language_loss": 0.78546309, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.86329335, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.17675781, + "step": 4893, + "time_per_iteration": 2.5583088397979736 + }, + { + "auxiliary_loss_clip": 0.06512541, + "auxiliary_loss_mlp": 0.01279131, + "balance_loss_clip": 0.06308632, + "balance_loss_mlp": 0.01262764, + "epoch": 0.29424319855704195, + "flos": 30960347212800.0, + "grad_norm": 7.778949224672863, + "language_loss": 0.85594332, + "learning_rate": 3.311165788957864e-06, + "loss": 0.93386006, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16357422, + "step": 4894, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.06515005, + "auxiliary_loss_mlp": 0.01277674, + "balance_loss_clip": 0.06308285, + "balance_loss_mlp": 0.01260639, + "epoch": 0.2943033218097099, + "flos": 15236977530240.0, + "grad_norm": 2.7328127009682617, + "language_loss": 0.91485763, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99278444, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.17028809, + "step": 4895, + "time_per_iteration": 2.499884605407715 + }, + { + "auxiliary_loss_clip": 0.06521617, + "auxiliary_loss_mlp": 0.01275591, + "balance_loss_clip": 0.06309959, + "balance_loss_mlp": 0.01257519, + "epoch": 0.2943634450623779, + "flos": 21732336731520.0, + "grad_norm": 1.9156960384195119, + "language_loss": 0.86768568, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.94565773, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.18078613, + "step": 4896, + "time_per_iteration": 2.5482704639434814 + }, + { + "auxiliary_loss_clip": 0.06512056, + "auxiliary_loss_mlp": 0.01275376, + "balance_loss_clip": 0.06306215, + "balance_loss_mlp": 0.01257996, + "epoch": 0.29442356831504585, + "flos": 22608797639040.0, + "grad_norm": 2.0283086901116354, + "language_loss": 0.73915696, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.81703126, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.17382812, + "step": 4897, + "time_per_iteration": 2.5434658527374268 + }, + { + "auxiliary_loss_clip": 0.0652054, + "auxiliary_loss_mlp": 0.01280641, + "balance_loss_clip": 0.06307404, + "balance_loss_mlp": 0.01262378, + "epoch": 0.2944836915677138, + "flos": 20017625178240.0, + "grad_norm": 1.9321922101744466, + "language_loss": 0.74697995, + "learning_rate": 3.309989025093813e-06, + "loss": 0.82499176, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.18261719, + "step": 4898, + "time_per_iteration": 2.5770161151885986 + }, + { + "auxiliary_loss_clip": 0.06516017, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 0.06305353, + "balance_loss_mlp": 0.01259586, + "epoch": 0.2945438148203818, + "flos": 20051768517120.0, + "grad_norm": 2.462097706840479, + "language_loss": 0.71617198, + "learning_rate": 3.309694709912618e-06, + "loss": 0.79411781, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18969727, + "step": 4899, + "time_per_iteration": 2.5297536849975586 + }, + { + "auxiliary_loss_clip": 0.06510775, + "auxiliary_loss_mlp": 0.01278061, + "balance_loss_clip": 0.06304912, + "balance_loss_mlp": 0.01259727, + "epoch": 0.29460393807304974, + "flos": 23740487683200.0, + "grad_norm": 9.70716698994663, + "language_loss": 0.79828262, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.87617099, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.18322754, + "step": 4900, + "time_per_iteration": 2.589350461959839 + }, + { + "auxiliary_loss_clip": 0.06501958, + "auxiliary_loss_mlp": 0.01277561, + "balance_loss_clip": 0.06297968, + "balance_loss_mlp": 0.01260025, + "epoch": 0.2946640613257177, + "flos": 14981412977280.0, + "grad_norm": 1.6788003410312407, + "language_loss": 0.81419849, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89199364, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.1751709, + "step": 4901, + "time_per_iteration": 2.4958457946777344 + }, + { + "auxiliary_loss_clip": 0.06498285, + "auxiliary_loss_mlp": 0.01276891, + "balance_loss_clip": 0.0630265, + "balance_loss_mlp": 0.01261095, + "epoch": 0.2947241845783857, + "flos": 24250862102400.0, + "grad_norm": 2.051988062923015, + "language_loss": 0.58211619, + "learning_rate": 3.308811466431157e-06, + "loss": 0.659868, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.15783691, + "step": 4902, + "time_per_iteration": 2.5867393016815186 + }, + { + "auxiliary_loss_clip": 0.06509895, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 0.06304582, + "balance_loss_mlp": 0.01261825, + "epoch": 0.29478430783105364, + "flos": 19944600744960.0, + "grad_norm": 1.670035021285574, + "language_loss": 0.75883406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.83671534, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.16418457, + "step": 4903, + "time_per_iteration": 2.5120930671691895 + }, + { + "auxiliary_loss_clip": 0.06499215, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06295954, + "balance_loss_mlp": 0.01255612, + "epoch": 0.2948444310837216, + "flos": 27388774454400.0, + "grad_norm": 1.8166217426315454, + "language_loss": 0.6305517, + "learning_rate": 3.3082223892736e-06, + "loss": 0.7082777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17773438, + "step": 4904, + "time_per_iteration": 2.610600709915161 + }, + { + "auxiliary_loss_clip": 0.06509106, + "auxiliary_loss_mlp": 0.01272684, + "balance_loss_clip": 0.06301488, + "balance_loss_mlp": 0.01255983, + "epoch": 0.2949045543363896, + "flos": 23412401821440.0, + "grad_norm": 1.721115639485294, + "language_loss": 0.73724848, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.8150664, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.16711426, + "step": 4905, + "time_per_iteration": 2.5330429077148438 + }, + { + "auxiliary_loss_clip": 0.06501255, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06297939, + "balance_loss_mlp": 0.01252508, + "epoch": 0.2949646775890576, + "flos": 23958303171840.0, + "grad_norm": 1.607284793713989, + "language_loss": 0.81930244, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.89701641, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.17614746, + "step": 4906, + "time_per_iteration": 2.5717568397521973 + }, + { + "auxiliary_loss_clip": 0.06499709, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06300811, + "balance_loss_mlp": 0.01254051, + "epoch": 0.29502480084172555, + "flos": 22791002342400.0, + "grad_norm": 1.8767623479937394, + "language_loss": 0.88041449, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95811397, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 4907, + "time_per_iteration": 2.532233238220215 + }, + { + "auxiliary_loss_clip": 0.06504819, + "auxiliary_loss_mlp": 0.01277393, + "balance_loss_clip": 0.06294614, + "balance_loss_mlp": 0.01257592, + "epoch": 0.2950849240943935, + "flos": 19652838428160.0, + "grad_norm": 2.2863974346720837, + "language_loss": 0.82530308, + "learning_rate": 3.307043639752782e-06, + "loss": 0.90312517, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.19812012, + "step": 4908, + "time_per_iteration": 2.6338536739349365 + }, + { + "auxiliary_loss_clip": 0.06393203, + "auxiliary_loss_mlp": 0.01256311, + "balance_loss_clip": 0.06296152, + "balance_loss_mlp": 0.01251251, + "epoch": 0.2951450473470615, + "flos": 71021062010880.0, + "grad_norm": 0.749349843123412, + "language_loss": 0.57384133, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.65033644, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.05059814, + "step": 4909, + "time_per_iteration": 3.0084846019744873 + }, + { + "auxiliary_loss_clip": 0.06500423, + "auxiliary_loss_mlp": 0.01279147, + "balance_loss_clip": 0.06298146, + "balance_loss_mlp": 0.0126278, + "epoch": 0.29520517059972945, + "flos": 22972955483520.0, + "grad_norm": 1.5167904233162786, + "language_loss": 0.87274551, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.9505412, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16381836, + "step": 4910, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.06494174, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06294993, + "balance_loss_mlp": 0.01255017, + "epoch": 0.2952652938523974, + "flos": 20491969541760.0, + "grad_norm": 1.9871602841434197, + "language_loss": 0.72998595, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.80764621, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.16821289, + "step": 4911, + "time_per_iteration": 2.5274527072906494 + }, + { + "auxiliary_loss_clip": 0.06493053, + "auxiliary_loss_mlp": 0.01276167, + "balance_loss_clip": 0.06295265, + "balance_loss_mlp": 0.01260122, + "epoch": 0.2953254171050654, + "flos": 19652754574080.0, + "grad_norm": 1.8153147203758204, + "language_loss": 0.90350848, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.98120075, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16040039, + "step": 4912, + "time_per_iteration": 4.015045881271362 + }, + { + "auxiliary_loss_clip": 0.06500725, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06298609, + "balance_loss_mlp": 0.01256474, + "epoch": 0.29538554035773334, + "flos": 22754678797440.0, + "grad_norm": 1.456675217678442, + "language_loss": 0.83491737, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.91266304, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17370605, + "step": 4913, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.06499185, + "auxiliary_loss_mlp": 0.01271149, + "balance_loss_clip": 0.06297807, + "balance_loss_mlp": 0.01255163, + "epoch": 0.2954456636104013, + "flos": 21878343671040.0, + "grad_norm": 1.7751266266229593, + "language_loss": 0.77296054, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85066384, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.15991211, + "step": 4914, + "time_per_iteration": 2.5379679203033447 + }, + { + "auxiliary_loss_clip": 0.06494316, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06294423, + "balance_loss_mlp": 0.01259563, + "epoch": 0.2955057868630693, + "flos": 40452056092800.0, + "grad_norm": 1.8412710776020966, + "language_loss": 0.81848276, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89618844, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16674805, + "step": 4915, + "time_per_iteration": 4.123507261276245 + }, + { + "auxiliary_loss_clip": 0.06504083, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.06299824, + "balance_loss_mlp": 0.01260707, + "epoch": 0.29556591011573724, + "flos": 22571006647680.0, + "grad_norm": 1.7265680083109098, + "language_loss": 0.85337454, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.93119645, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.1739502, + "step": 4916, + "time_per_iteration": 3.964902400970459 + }, + { + "auxiliary_loss_clip": 0.06496175, + "auxiliary_loss_mlp": 0.01273483, + "balance_loss_clip": 0.06292706, + "balance_loss_mlp": 0.01257187, + "epoch": 0.2956260333684052, + "flos": 22095572181120.0, + "grad_norm": 2.6877460244099254, + "language_loss": 0.71410239, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.79179895, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16296387, + "step": 4917, + "time_per_iteration": 2.510061502456665 + }, + { + "auxiliary_loss_clip": 0.06495264, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01255771, + "epoch": 0.2956861566210732, + "flos": 16441063102080.0, + "grad_norm": 1.9904514264943383, + "language_loss": 0.9154985, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.99318182, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.1730957, + "step": 4918, + "time_per_iteration": 2.5177812576293945 + }, + { + "auxiliary_loss_clip": 0.06500694, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.0629639, + "balance_loss_mlp": 0.01252887, + "epoch": 0.2957462798737412, + "flos": 25819189372800.0, + "grad_norm": 2.9632565132584587, + "language_loss": 0.73171133, + "learning_rate": 3.303797991757425e-06, + "loss": 0.80942631, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.17919922, + "step": 4919, + "time_per_iteration": 2.548271656036377 + }, + { + "auxiliary_loss_clip": 0.06494663, + "auxiliary_loss_mlp": 0.01276246, + "balance_loss_clip": 0.062939, + "balance_loss_mlp": 0.01259104, + "epoch": 0.29580640312640916, + "flos": 16696459946880.0, + "grad_norm": 2.067015346809242, + "language_loss": 0.76653767, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84424675, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.17138672, + "step": 4920, + "time_per_iteration": 2.5283315181732178 + }, + { + "auxiliary_loss_clip": 0.06505087, + "auxiliary_loss_mlp": 0.01280613, + "balance_loss_clip": 0.06298134, + "balance_loss_mlp": 0.01262886, + "epoch": 0.2958665263790771, + "flos": 23951427137280.0, + "grad_norm": 2.1683803944953786, + "language_loss": 0.69314063, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.77099764, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17724609, + "step": 4921, + "time_per_iteration": 3.9904286861419678 + }, + { + "auxiliary_loss_clip": 0.06507339, + "auxiliary_loss_mlp": 0.01279047, + "balance_loss_clip": 0.06297763, + "balance_loss_mlp": 0.01261023, + "epoch": 0.2959266496317451, + "flos": 18484279787520.0, + "grad_norm": 1.8551497184563221, + "language_loss": 0.75478184, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.83264565, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18017578, + "step": 4922, + "time_per_iteration": 2.5025644302368164 + }, + { + "auxiliary_loss_clip": 0.06508595, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06298192, + "balance_loss_mlp": 0.01258051, + "epoch": 0.29598677288441305, + "flos": 25964525479680.0, + "grad_norm": 1.7877276864194063, + "language_loss": 0.77317607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85103309, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.19067383, + "step": 4923, + "time_per_iteration": 2.57328462600708 + }, + { + "auxiliary_loss_clip": 0.06498858, + "auxiliary_loss_mlp": 0.01279587, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01262016, + "epoch": 0.296046896137081, + "flos": 25163101503360.0, + "grad_norm": 2.2992847921393174, + "language_loss": 0.8687042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94648862, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17565918, + "step": 4924, + "time_per_iteration": 2.569819450378418 + }, + { + "auxiliary_loss_clip": 0.06495638, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 0.06293976, + "balance_loss_mlp": 0.01256891, + "epoch": 0.296107019389749, + "flos": 21767402465280.0, + "grad_norm": 1.4490170840920502, + "language_loss": 0.823627, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.90132689, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17468262, + "step": 4925, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.06496158, + "auxiliary_loss_mlp": 0.01278426, + "balance_loss_clip": 0.06294197, + "balance_loss_mlp": 0.01261415, + "epoch": 0.29616714264241695, + "flos": 17964555638400.0, + "grad_norm": 3.115838377994743, + "language_loss": 0.87332439, + "learning_rate": 3.301729463727452e-06, + "loss": 0.95107025, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17016602, + "step": 4926, + "time_per_iteration": 2.480851411819458 + }, + { + "auxiliary_loss_clip": 0.06502646, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06295682, + "balance_loss_mlp": 0.0125995, + "epoch": 0.2962272658950849, + "flos": 15018155792640.0, + "grad_norm": 2.5897634799766296, + "language_loss": 0.86097062, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.93876898, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.17236328, + "step": 4927, + "time_per_iteration": 2.524277687072754 + }, + { + "auxiliary_loss_clip": 0.06496821, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06295302, + "balance_loss_mlp": 0.01256545, + "epoch": 0.2962873891477529, + "flos": 14726183840640.0, + "grad_norm": 1.628327768422068, + "language_loss": 0.80864251, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.88634396, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16772461, + "step": 4928, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06510531, + "auxiliary_loss_mlp": 0.01280378, + "balance_loss_clip": 0.0629655, + "balance_loss_mlp": 0.012609, + "epoch": 0.29634751240042084, + "flos": 26730967576320.0, + "grad_norm": 3.186979474193142, + "language_loss": 0.72557974, + "learning_rate": 3.300842211064773e-06, + "loss": 0.80348885, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.19482422, + "step": 4929, + "time_per_iteration": 2.5845630168914795 + }, + { + "auxiliary_loss_clip": 0.06503193, + "auxiliary_loss_mlp": 0.01287506, + "balance_loss_clip": 0.06293295, + "balance_loss_mlp": 0.01268456, + "epoch": 0.2964076356530888, + "flos": 14575984197120.0, + "grad_norm": 2.811052251549286, + "language_loss": 0.73200721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.80991417, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.19042969, + "step": 4930, + "time_per_iteration": 2.488785982131958 + }, + { + "auxiliary_loss_clip": 0.06387739, + "auxiliary_loss_mlp": 0.01269345, + "balance_loss_clip": 0.06290003, + "balance_loss_mlp": 0.0126519, + "epoch": 0.29646775890575683, + "flos": 63124387925760.0, + "grad_norm": 0.773484435694784, + "language_loss": 0.60626972, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.68284053, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.04156494, + "step": 4931, + "time_per_iteration": 3.1399567127227783 + }, + { + "auxiliary_loss_clip": 0.06390411, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06293079, + "balance_loss_mlp": 0.0126054, + "epoch": 0.2965278821584248, + "flos": 63087728964480.0, + "grad_norm": 0.7260178151779769, + "language_loss": 0.52335358, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.59990156, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.03839111, + "step": 4932, + "time_per_iteration": 3.0242393016815186 + }, + { + "auxiliary_loss_clip": 0.06496995, + "auxiliary_loss_mlp": 0.01277379, + "balance_loss_clip": 0.06294326, + "balance_loss_mlp": 0.01260368, + "epoch": 0.29658800541109276, + "flos": 23775469562880.0, + "grad_norm": 1.6744964780290639, + "language_loss": 0.82042706, + "learning_rate": 3.299658516973972e-06, + "loss": 0.89817077, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17028809, + "step": 4933, + "time_per_iteration": 2.5955240726470947 + }, + { + "auxiliary_loss_clip": 0.06493178, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06293809, + "balance_loss_mlp": 0.01256377, + "epoch": 0.2966481286637607, + "flos": 23995465257600.0, + "grad_norm": 1.8381459517159284, + "language_loss": 0.75639498, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83405566, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.16503906, + "step": 4934, + "time_per_iteration": 2.5714681148529053 + }, + { + "auxiliary_loss_clip": 0.06508597, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06299804, + "balance_loss_mlp": 0.01261697, + "epoch": 0.2967082519164287, + "flos": 17170846237440.0, + "grad_norm": 1.723450067314057, + "language_loss": 0.63127494, + "learning_rate": 3.299066374184594e-06, + "loss": 0.70916504, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.18713379, + "step": 4935, + "time_per_iteration": 2.513557195663452 + }, + { + "auxiliary_loss_clip": 0.06500618, + "auxiliary_loss_mlp": 0.01281806, + "balance_loss_clip": 0.06298316, + "balance_loss_mlp": 0.01263424, + "epoch": 0.29676837516909665, + "flos": 29395416032640.0, + "grad_norm": 1.6887254989691298, + "language_loss": 0.80239189, + "learning_rate": 3.2987702288932e-06, + "loss": 0.88021612, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.18383789, + "step": 4936, + "time_per_iteration": 2.6222426891326904 + }, + { + "auxiliary_loss_clip": 0.06510909, + "auxiliary_loss_mlp": 0.0128109, + "balance_loss_clip": 0.06301413, + "balance_loss_mlp": 0.01261444, + "epoch": 0.2968284984217646, + "flos": 34759839876480.0, + "grad_norm": 1.4826285887608224, + "language_loss": 0.74831104, + "learning_rate": 3.298474034352309e-06, + "loss": 0.826231, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19665527, + "step": 4937, + "time_per_iteration": 2.7231242656707764 + }, + { + "auxiliary_loss_clip": 0.06501779, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06297591, + "balance_loss_mlp": 0.01256768, + "epoch": 0.2968886216744326, + "flos": 21550635152640.0, + "grad_norm": 1.507706154697653, + "language_loss": 0.78372371, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.86148536, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.17614746, + "step": 4938, + "time_per_iteration": 2.564958095550537 + }, + { + "auxiliary_loss_clip": 0.06506119, + "auxiliary_loss_mlp": 0.01279001, + "balance_loss_clip": 0.06296918, + "balance_loss_mlp": 0.01260643, + "epoch": 0.29694874492710055, + "flos": 12792357060480.0, + "grad_norm": 3.019574533594622, + "language_loss": 0.76788878, + "learning_rate": 3.297881497566964e-06, + "loss": 0.84574002, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18359375, + "step": 4939, + "time_per_iteration": 2.514143943786621 + }, + { + "auxiliary_loss_clip": 0.06509334, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06296703, + "balance_loss_mlp": 0.01254259, + "epoch": 0.2970088681797685, + "flos": 24576600049920.0, + "grad_norm": 1.687046897883716, + "language_loss": 0.78335512, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86116844, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17736816, + "step": 4940, + "time_per_iteration": 2.570279359817505 + }, + { + "auxiliary_loss_clip": 0.06508817, + "auxiliary_loss_mlp": 0.01275865, + "balance_loss_clip": 0.06300067, + "balance_loss_mlp": 0.01257113, + "epoch": 0.2970689914324365, + "flos": 23665870022400.0, + "grad_norm": 1.5281741947741105, + "language_loss": 0.75415564, + "learning_rate": 3.297288763918435e-06, + "loss": 0.8320024, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.1875, + "step": 4941, + "time_per_iteration": 2.549976348876953 + }, + { + "auxiliary_loss_clip": 0.06509985, + "auxiliary_loss_mlp": 0.01274098, + "balance_loss_clip": 0.06298217, + "balance_loss_mlp": 0.01254667, + "epoch": 0.29712911468510445, + "flos": 39678654107520.0, + "grad_norm": 2.245999939669129, + "language_loss": 0.74959898, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.82743979, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.19445801, + "step": 4942, + "time_per_iteration": 2.7199416160583496 + }, + { + "auxiliary_loss_clip": 0.0651295, + "auxiliary_loss_mlp": 0.01282177, + "balance_loss_clip": 0.06299168, + "balance_loss_mlp": 0.01261744, + "epoch": 0.2971892379377724, + "flos": 26402420517120.0, + "grad_norm": 1.727137408051059, + "language_loss": 0.70931113, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.78726244, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2043457, + "step": 4943, + "time_per_iteration": 2.5410006046295166 + }, + { + "auxiliary_loss_clip": 0.06508674, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06296329, + "balance_loss_mlp": 0.01255599, + "epoch": 0.2972493611904404, + "flos": 17608992837120.0, + "grad_norm": 2.280832061666768, + "language_loss": 0.8012532, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.87908292, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.18725586, + "step": 4944, + "time_per_iteration": 2.5628697872161865 + }, + { + "auxiliary_loss_clip": 0.06495067, + "auxiliary_loss_mlp": 0.01272551, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01255194, + "epoch": 0.2973094844431084, + "flos": 20419070889600.0, + "grad_norm": 2.0196449856406704, + "language_loss": 0.83490258, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.91257876, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17346191, + "step": 4945, + "time_per_iteration": 2.5184381008148193 + }, + { + "auxiliary_loss_clip": 0.06494735, + "auxiliary_loss_mlp": 0.01274271, + "balance_loss_clip": 0.0629338, + "balance_loss_mlp": 0.01257081, + "epoch": 0.29736960769577636, + "flos": 17499225588480.0, + "grad_norm": 1.8481246337269472, + "language_loss": 0.67665654, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.75434661, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.171875, + "step": 4946, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.06500807, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06294695, + "balance_loss_mlp": 0.01255462, + "epoch": 0.2974297309484443, + "flos": 26111119397760.0, + "grad_norm": 1.9041348906467674, + "language_loss": 0.74493206, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82266927, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.17443848, + "step": 4947, + "time_per_iteration": 2.55096435546875 + }, + { + "auxiliary_loss_clip": 0.06508033, + "auxiliary_loss_mlp": 0.01274303, + "balance_loss_clip": 0.06299601, + "balance_loss_mlp": 0.01255396, + "epoch": 0.2974898542011123, + "flos": 25673559776640.0, + "grad_norm": 5.5840313105791894, + "language_loss": 0.73332673, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.81115007, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.18896484, + "step": 4948, + "time_per_iteration": 2.604213237762451 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.01275305, + "balance_loss_clip": 0.06292598, + "balance_loss_mlp": 0.01258687, + "epoch": 0.29754997745378026, + "flos": 18667323031680.0, + "grad_norm": 1.916403484704169, + "language_loss": 0.84057009, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91826856, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.1661377, + "step": 4949, + "time_per_iteration": 2.4725756645202637 + }, + { + "auxiliary_loss_clip": 0.06495193, + "auxiliary_loss_mlp": 0.01276752, + "balance_loss_clip": 0.06291104, + "balance_loss_mlp": 0.01258692, + "epoch": 0.2976101007064482, + "flos": 22281382609920.0, + "grad_norm": 2.0864257908602464, + "language_loss": 0.71227181, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.78999126, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18078613, + "step": 4950, + "time_per_iteration": 2.5644164085388184 + }, + { + "auxiliary_loss_clip": 0.06486266, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06290439, + "balance_loss_mlp": 0.01256308, + "epoch": 0.2976702239591162, + "flos": 21952290499200.0, + "grad_norm": 2.1576156011429597, + "language_loss": 0.83112931, + "learning_rate": 3.294322145875789e-06, + "loss": 0.9087199, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.16467285, + "step": 4951, + "time_per_iteration": 2.5149009227752686 + }, + { + "auxiliary_loss_clip": 0.06493516, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 0.06287138, + "balance_loss_mlp": 0.01257248, + "epoch": 0.29773034721178415, + "flos": 24642874229760.0, + "grad_norm": 2.538162384222029, + "language_loss": 0.73777694, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.81545866, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.17407227, + "step": 4952, + "time_per_iteration": 3.9977774620056152 + }, + { + "auxiliary_loss_clip": 0.06494328, + "auxiliary_loss_mlp": 0.01279914, + "balance_loss_clip": 0.06291338, + "balance_loss_mlp": 0.01261472, + "epoch": 0.2977904704644521, + "flos": 20563694236800.0, + "grad_norm": 1.830993802630573, + "language_loss": 0.8420608, + "learning_rate": 3.293728232937228e-06, + "loss": 0.91980314, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.18444824, + "step": 4953, + "time_per_iteration": 2.556278944015503 + }, + { + "auxiliary_loss_clip": 0.0649702, + "auxiliary_loss_mlp": 0.01271138, + "balance_loss_clip": 0.06289494, + "balance_loss_mlp": 0.01254246, + "epoch": 0.2978505937171201, + "flos": 18922426387200.0, + "grad_norm": 2.0824874332629113, + "language_loss": 0.74276727, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.82044888, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.16894531, + "step": 4954, + "time_per_iteration": 3.9108667373657227 + }, + { + "auxiliary_loss_clip": 0.06489201, + "auxiliary_loss_mlp": 0.01275174, + "balance_loss_clip": 0.06286507, + "balance_loss_mlp": 0.01259164, + "epoch": 0.29791071696978805, + "flos": 19323788244480.0, + "grad_norm": 1.865430683209025, + "language_loss": 0.75582623, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83346999, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.16003418, + "step": 4955, + "time_per_iteration": 4.034101724624634 + }, + { + "auxiliary_loss_clip": 0.06493168, + "auxiliary_loss_mlp": 0.01273359, + "balance_loss_clip": 0.06285557, + "balance_loss_mlp": 0.0125593, + "epoch": 0.297970840222456, + "flos": 18812742992640.0, + "grad_norm": 1.8893942834003292, + "language_loss": 0.72569048, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80335575, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.17419434, + "step": 4956, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.06498902, + "auxiliary_loss_mlp": 0.01272155, + "balance_loss_clip": 0.06287451, + "balance_loss_mlp": 0.01253141, + "epoch": 0.298030963475124, + "flos": 22858702041600.0, + "grad_norm": 1.7093127439145954, + "language_loss": 0.79588521, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.87359571, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.19006348, + "step": 4957, + "time_per_iteration": 2.5350780487060547 + }, + { + "auxiliary_loss_clip": 0.0648672, + "auxiliary_loss_mlp": 0.01278155, + "balance_loss_clip": 0.06281397, + "balance_loss_mlp": 0.01261084, + "epoch": 0.298091086727792, + "flos": 21874402529280.0, + "grad_norm": 1.5033412482034976, + "language_loss": 0.70601791, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.78366661, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.17077637, + "step": 4958, + "time_per_iteration": 2.52998948097229 + }, + { + "auxiliary_loss_clip": 0.06484255, + "auxiliary_loss_mlp": 0.01275467, + "balance_loss_clip": 0.06283475, + "balance_loss_mlp": 0.01256954, + "epoch": 0.29815120998045996, + "flos": 21180775230720.0, + "grad_norm": 1.4471916983062794, + "language_loss": 0.78955591, + "learning_rate": 3.291945317082743e-06, + "loss": 0.86715317, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18505859, + "step": 4959, + "time_per_iteration": 2.5247116088867188 + }, + { + "auxiliary_loss_clip": 0.06484501, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_clip": 0.06281502, + "balance_loss_mlp": 0.01258183, + "epoch": 0.29821133323312793, + "flos": 19901526946560.0, + "grad_norm": 1.8097637226237389, + "language_loss": 0.79637736, + "learning_rate": 3.291647992907147e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.17578125, + "step": 4960, + "time_per_iteration": 2.544517755508423 + }, + { + "auxiliary_loss_clip": 0.06493803, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06284714, + "balance_loss_mlp": 0.01254483, + "epoch": 0.2982714564857959, + "flos": 12755781953280.0, + "grad_norm": 2.226713674353186, + "language_loss": 0.74493575, + "learning_rate": 3.291350619752129e-06, + "loss": 0.82260078, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.18225098, + "step": 4961, + "time_per_iteration": 3.9662065505981445 + }, + { + "auxiliary_loss_clip": 0.06486452, + "auxiliary_loss_mlp": 0.01274578, + "balance_loss_clip": 0.062804, + "balance_loss_mlp": 0.01256756, + "epoch": 0.29833157973846386, + "flos": 22278238081920.0, + "grad_norm": 2.8000667311611167, + "language_loss": 0.62968349, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70729387, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.17810059, + "step": 4962, + "time_per_iteration": 2.533984661102295 + }, + { + "auxiliary_loss_clip": 0.06485053, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 0.06281514, + "balance_loss_mlp": 0.01259596, + "epoch": 0.2983917029911318, + "flos": 15377659735680.0, + "grad_norm": 1.6706058401186525, + "language_loss": 0.83686638, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.91448379, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.17102051, + "step": 4963, + "time_per_iteration": 2.524486780166626 + }, + { + "auxiliary_loss_clip": 0.0648464, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 0.06283776, + "balance_loss_mlp": 0.01259572, + "epoch": 0.2984518262437998, + "flos": 15383068323840.0, + "grad_norm": 2.213795741630968, + "language_loss": 0.66932309, + "learning_rate": 3.290458206523322e-06, + "loss": 0.74693739, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17224121, + "step": 4964, + "time_per_iteration": 2.5100491046905518 + }, + { + "auxiliary_loss_clip": 0.06485043, + "auxiliary_loss_mlp": 0.01273472, + "balance_loss_clip": 0.06283367, + "balance_loss_mlp": 0.01257701, + "epoch": 0.29851194949646775, + "flos": 18113413616640.0, + "grad_norm": 1.8232440195867097, + "language_loss": 0.72163451, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79921961, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15771484, + "step": 4965, + "time_per_iteration": 2.5180373191833496 + }, + { + "auxiliary_loss_clip": 0.06490128, + "auxiliary_loss_mlp": 0.01278877, + "balance_loss_clip": 0.06284484, + "balance_loss_mlp": 0.01261139, + "epoch": 0.2985720727491357, + "flos": 22024811808000.0, + "grad_norm": 1.7919900337102326, + "language_loss": 0.66928089, + "learning_rate": 3.289863019680461e-06, + "loss": 0.74697095, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17724609, + "step": 4966, + "time_per_iteration": 2.5509839057922363 + }, + { + "auxiliary_loss_clip": 0.06492805, + "auxiliary_loss_mlp": 0.01279859, + "balance_loss_clip": 0.06288783, + "balance_loss_mlp": 0.01262026, + "epoch": 0.2986321960018037, + "flos": 13046202604800.0, + "grad_norm": 2.9983208236286862, + "language_loss": 0.74761832, + "learning_rate": 3.289565352885785e-06, + "loss": 0.82534492, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.17822266, + "step": 4967, + "time_per_iteration": 2.5119001865386963 + }, + { + "auxiliary_loss_clip": 0.06492577, + "auxiliary_loss_mlp": 0.01276602, + "balance_loss_clip": 0.06288804, + "balance_loss_mlp": 0.01260294, + "epoch": 0.29869231925447165, + "flos": 14470241944320.0, + "grad_norm": 1.9901449284839132, + "language_loss": 0.72232509, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80001682, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16308594, + "step": 4968, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.06497695, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 0.06290321, + "balance_loss_mlp": 0.01261007, + "epoch": 0.2987524425071396, + "flos": 31658376850560.0, + "grad_norm": 1.780098836704026, + "language_loss": 0.76775402, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84551913, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.17810059, + "step": 4969, + "time_per_iteration": 2.677133321762085 + }, + { + "auxiliary_loss_clip": 0.0649517, + "auxiliary_loss_mlp": 0.01279823, + "balance_loss_clip": 0.06290856, + "balance_loss_mlp": 0.0126355, + "epoch": 0.2988125657598076, + "flos": 21439735873920.0, + "grad_norm": 1.6530964666677603, + "language_loss": 0.702811, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.78056097, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.16271973, + "step": 4970, + "time_per_iteration": 2.542041301727295 + }, + { + "auxiliary_loss_clip": 0.06501894, + "auxiliary_loss_mlp": 0.01279087, + "balance_loss_clip": 0.06289935, + "balance_loss_mlp": 0.01260336, + "epoch": 0.2988726890124756, + "flos": 18082750222080.0, + "grad_norm": 2.836679638175962, + "language_loss": 0.84790057, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.92571044, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.18737793, + "step": 4971, + "time_per_iteration": 2.5460052490234375 + }, + { + "auxiliary_loss_clip": 0.06490934, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06292243, + "balance_loss_mlp": 0.01257691, + "epoch": 0.29893281226514357, + "flos": 21760987628160.0, + "grad_norm": 1.7104631490326472, + "language_loss": 0.79530191, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16314697, + "step": 4972, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.0650093, + "auxiliary_loss_mlp": 0.01282709, + "balance_loss_clip": 0.06297094, + "balance_loss_mlp": 0.01266234, + "epoch": 0.29899293551781153, + "flos": 16842341105280.0, + "grad_norm": 1.7682293865220609, + "language_loss": 0.85643351, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93426991, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16467285, + "step": 4973, + "time_per_iteration": 2.546552896499634 + }, + { + "auxiliary_loss_clip": 0.06486042, + "auxiliary_loss_mlp": 0.01280538, + "balance_loss_clip": 0.06291717, + "balance_loss_mlp": 0.01263539, + "epoch": 0.2990530587704795, + "flos": 11734068792960.0, + "grad_norm": 1.5403026658154284, + "language_loss": 0.78163445, + "learning_rate": 3.287480316742863e-06, + "loss": 0.85930026, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17004395, + "step": 4974, + "time_per_iteration": 2.519416093826294 + }, + { + "auxiliary_loss_clip": 0.06492939, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06288281, + "balance_loss_mlp": 0.01257001, + "epoch": 0.29911318202314746, + "flos": 28047713362560.0, + "grad_norm": 1.767842246111843, + "language_loss": 0.73036933, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80804002, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17126465, + "step": 4975, + "time_per_iteration": 2.6099252700805664 + }, + { + "auxiliary_loss_clip": 0.0649198, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 0.06288506, + "balance_loss_mlp": 0.0126163, + "epoch": 0.2991733052758154, + "flos": 18739425070080.0, + "grad_norm": 3.7568061887968374, + "language_loss": 0.76564699, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84335011, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16711426, + "step": 4976, + "time_per_iteration": 2.4865057468414307 + }, + { + "auxiliary_loss_clip": 0.0649081, + "auxiliary_loss_mlp": 0.01274025, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257574, + "epoch": 0.2992334285284834, + "flos": 15564476413440.0, + "grad_norm": 2.0027584051633256, + "language_loss": 0.86547983, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.94312823, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 4977, + "time_per_iteration": 2.5564377307891846 + }, + { + "auxiliary_loss_clip": 0.06492308, + "auxiliary_loss_mlp": 0.01273791, + "balance_loss_clip": 0.06289831, + "balance_loss_mlp": 0.0125684, + "epoch": 0.29929355178115136, + "flos": 21803809864320.0, + "grad_norm": 1.498415139231663, + "language_loss": 0.69035208, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.76801312, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.16943359, + "step": 4978, + "time_per_iteration": 2.519927978515625 + }, + { + "auxiliary_loss_clip": 0.06498158, + "auxiliary_loss_mlp": 0.01273756, + "balance_loss_clip": 0.06295491, + "balance_loss_mlp": 0.0125634, + "epoch": 0.2993536750338193, + "flos": 21184884080640.0, + "grad_norm": 2.2981139003330924, + "language_loss": 0.76821494, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.84593409, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17407227, + "step": 4979, + "time_per_iteration": 2.5783658027648926 + }, + { + "auxiliary_loss_clip": 0.06495501, + "auxiliary_loss_mlp": 0.01275001, + "balance_loss_clip": 0.06288472, + "balance_loss_mlp": 0.0125762, + "epoch": 0.2994137982864873, + "flos": 32129954029440.0, + "grad_norm": 1.9038495469030372, + "language_loss": 0.69286489, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77056986, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17382812, + "step": 4980, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.06490306, + "auxiliary_loss_mlp": 0.01274236, + "balance_loss_clip": 0.06288646, + "balance_loss_mlp": 0.01257177, + "epoch": 0.29947392153915525, + "flos": 21111733866240.0, + "grad_norm": 1.7308746684442236, + "language_loss": 0.74001658, + "learning_rate": 3.285392888352555e-06, + "loss": 0.817662, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.17053223, + "step": 4981, + "time_per_iteration": 2.580580711364746 + }, + { + "auxiliary_loss_clip": 0.06490904, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.0125635, + "epoch": 0.2995340447918232, + "flos": 21548916144000.0, + "grad_norm": 1.9422940804684126, + "language_loss": 0.86877131, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.94642013, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17626953, + "step": 4982, + "time_per_iteration": 2.4962990283966064 + }, + { + "auxiliary_loss_clip": 0.06497963, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06287588, + "balance_loss_mlp": 0.01257241, + "epoch": 0.2995941680444912, + "flos": 16730393650560.0, + "grad_norm": 2.5640920256819886, + "language_loss": 0.87797368, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95569938, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.17382812, + "step": 4983, + "time_per_iteration": 2.5295448303222656 + }, + { + "auxiliary_loss_clip": 0.0649021, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 0.06287163, + "balance_loss_mlp": 0.012569, + "epoch": 0.2996542912971592, + "flos": 20929864579200.0, + "grad_norm": 2.1931631477553943, + "language_loss": 0.78985476, + "learning_rate": 3.284497544825668e-06, + "loss": 0.86749053, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.16467285, + "step": 4984, + "time_per_iteration": 2.510861873626709 + }, + { + "auxiliary_loss_clip": 0.06490169, + "auxiliary_loss_mlp": 0.01276988, + "balance_loss_clip": 0.06284384, + "balance_loss_mlp": 0.01259702, + "epoch": 0.29971441454982717, + "flos": 25086429417600.0, + "grad_norm": 1.6549542244227224, + "language_loss": 0.78558743, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86325896, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17285156, + "step": 4985, + "time_per_iteration": 2.6011219024658203 + }, + { + "auxiliary_loss_clip": 0.06501257, + "auxiliary_loss_mlp": 0.01278562, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.0125968, + "epoch": 0.29977453780249513, + "flos": 52567445617920.0, + "grad_norm": 2.1128232330624757, + "language_loss": 0.71929544, + "learning_rate": 3.283900405580837e-06, + "loss": 0.79709363, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.1887207, + "step": 4986, + "time_per_iteration": 2.8261890411376953 + }, + { + "auxiliary_loss_clip": 0.06496918, + "auxiliary_loss_mlp": 0.01277715, + "balance_loss_clip": 0.06288348, + "balance_loss_mlp": 0.0125981, + "epoch": 0.2998346610551631, + "flos": 22243759326720.0, + "grad_norm": 2.0495005677193703, + "language_loss": 0.73353851, + "learning_rate": 3.283601762924312e-06, + "loss": 0.81128478, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.17907715, + "step": 4987, + "time_per_iteration": 2.5969009399414062 + }, + { + "auxiliary_loss_clip": 0.06487568, + "auxiliary_loss_mlp": 0.01277048, + "balance_loss_clip": 0.06283796, + "balance_loss_mlp": 0.01260561, + "epoch": 0.29989478430783106, + "flos": 16878832358400.0, + "grad_norm": 1.677350703029162, + "language_loss": 0.80982405, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88747025, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16479492, + "step": 4988, + "time_per_iteration": 2.4802756309509277 + }, + { + "auxiliary_loss_clip": 0.06489251, + "auxiliary_loss_mlp": 0.0127416, + "balance_loss_clip": 0.06285515, + "balance_loss_mlp": 0.0125759, + "epoch": 0.29995490756049903, + "flos": 23775637271040.0, + "grad_norm": 1.830625198484136, + "language_loss": 0.7097913, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7874254, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16577148, + "step": 4989, + "time_per_iteration": 2.5968902111053467 + }, + { + "auxiliary_loss_clip": 0.06498987, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 0.0628901, + "balance_loss_mlp": 0.01264948, + "epoch": 0.300015030813167, + "flos": 14470577360640.0, + "grad_norm": 2.8004651200920576, + "language_loss": 0.85787904, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93570256, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.18408203, + "step": 4990, + "time_per_iteration": 2.4837355613708496 + }, + { + "auxiliary_loss_clip": 0.06499861, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06287368, + "balance_loss_mlp": 0.01260204, + "epoch": 0.30007515406583496, + "flos": 25199005777920.0, + "grad_norm": 1.6608247288012334, + "language_loss": 0.67339301, + "learning_rate": 3.28240670566841e-06, + "loss": 0.75117278, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.17919922, + "step": 4991, + "time_per_iteration": 4.060553312301636 + }, + { + "auxiliary_loss_clip": 0.0649571, + "auxiliary_loss_mlp": 0.01277369, + "balance_loss_clip": 0.06284688, + "balance_loss_mlp": 0.01259022, + "epoch": 0.3001352773185029, + "flos": 19397315802240.0, + "grad_norm": 1.7545259775845383, + "language_loss": 0.79479051, + "learning_rate": 3.28210781975363e-06, + "loss": 0.87252128, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.18347168, + "step": 4992, + "time_per_iteration": 2.5394246578216553 + }, + { + "auxiliary_loss_clip": 0.06496455, + "auxiliary_loss_mlp": 0.01272727, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01255061, + "epoch": 0.3001954005711709, + "flos": 21550341663360.0, + "grad_norm": 1.8174225064451806, + "language_loss": 0.83191693, + "learning_rate": 3.281808885221193e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17675781, + "step": 4993, + "time_per_iteration": 2.536900520324707 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.0127659, + "balance_loss_clip": 0.06290129, + "balance_loss_mlp": 0.01257051, + "epoch": 0.30025552382383885, + "flos": 17390087245440.0, + "grad_norm": 2.3964724385856955, + "language_loss": 0.8713994, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.94919133, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.1953125, + "step": 4994, + "time_per_iteration": 5.451568603515625 + }, + { + "auxiliary_loss_clip": 0.06500117, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06293428, + "balance_loss_mlp": 0.01255696, + "epoch": 0.3003156470765068, + "flos": 29541003701760.0, + "grad_norm": 1.492375768993242, + "language_loss": 0.81277597, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.89050424, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17016602, + "step": 4995, + "time_per_iteration": 2.6498701572418213 + }, + { + "auxiliary_loss_clip": 0.06495272, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01257818, + "epoch": 0.3003757703291748, + "flos": 43655278302720.0, + "grad_norm": 1.561088997277918, + "language_loss": 0.67591625, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.75363255, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.18530273, + "step": 4996, + "time_per_iteration": 2.6940386295318604 + }, + { + "auxiliary_loss_clip": 0.06490915, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 0.06287466, + "balance_loss_mlp": 0.0125985, + "epoch": 0.30043589358184275, + "flos": 22534934664960.0, + "grad_norm": 1.8202769971321224, + "language_loss": 0.76585484, + "learning_rate": 3.280612661141615e-06, + "loss": 0.84354323, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.18054199, + "step": 4997, + "time_per_iteration": 2.551025629043579 + }, + { + "auxiliary_loss_clip": 0.06488951, + "auxiliary_loss_mlp": 0.01282226, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01264785, + "epoch": 0.30049601683451077, + "flos": 21002176252800.0, + "grad_norm": 1.7136041248753544, + "language_loss": 0.78929758, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.86700928, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17443848, + "step": 4998, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.06495959, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 0.06296599, + "balance_loss_mlp": 0.0126104, + "epoch": 0.30055614008717874, + "flos": 23922985875840.0, + "grad_norm": 1.6408959445510187, + "language_loss": 0.73985869, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81759465, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.16589355, + "step": 4999, + "time_per_iteration": 2.565272331237793 + }, + { + "auxiliary_loss_clip": 0.06497648, + "auxiliary_loss_mlp": 0.01276599, + "balance_loss_clip": 0.06290608, + "balance_loss_mlp": 0.01258837, + "epoch": 0.3006162633398467, + "flos": 19175475317760.0, + "grad_norm": 1.6585129963537202, + "language_loss": 0.76246512, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84020758, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.1776123, + "step": 5000, + "time_per_iteration": 3.978001117706299 + }, + { + "auxiliary_loss_clip": 0.06488875, + "auxiliary_loss_mlp": 0.01280464, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.0126244, + "epoch": 0.30067638659251467, + "flos": 14683697020800.0, + "grad_norm": 1.838860389970219, + "language_loss": 0.81972182, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.89741528, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.18041992, + "step": 5001, + "time_per_iteration": 2.4995031356811523 + }, + { + "auxiliary_loss_clip": 0.06495227, + "auxiliary_loss_mlp": 0.01279132, + "balance_loss_clip": 0.06291329, + "balance_loss_mlp": 0.01261322, + "epoch": 0.30073650984518263, + "flos": 23374778538240.0, + "grad_norm": 1.6002838962292127, + "language_loss": 0.81160742, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.88935101, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.17797852, + "step": 5002, + "time_per_iteration": 2.549882650375366 + }, + { + "auxiliary_loss_clip": 0.06502556, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06290467, + "balance_loss_mlp": 0.01255728, + "epoch": 0.3007966330978506, + "flos": 22973332826880.0, + "grad_norm": 1.7018817575326768, + "language_loss": 0.71524274, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.79300046, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.17504883, + "step": 5003, + "time_per_iteration": 2.537760019302368 + }, + { + "auxiliary_loss_clip": 0.06502316, + "auxiliary_loss_mlp": 0.01275597, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01257441, + "epoch": 0.30085675635051856, + "flos": 27825830951040.0, + "grad_norm": 1.9954765529899763, + "language_loss": 0.706792, + "learning_rate": 3.27851739984233e-06, + "loss": 0.78457117, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.18151855, + "step": 5004, + "time_per_iteration": 2.6357674598693848 + }, + { + "auxiliary_loss_clip": 0.06504735, + "auxiliary_loss_mlp": 0.01282861, + "balance_loss_clip": 0.06296123, + "balance_loss_mlp": 0.01263513, + "epoch": 0.3009168796031865, + "flos": 10886216855040.0, + "grad_norm": 2.7451882694975662, + "language_loss": 0.81914413, + "learning_rate": 3.278217882782715e-06, + "loss": 0.89702016, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.19335938, + "step": 5005, + "time_per_iteration": 2.4386463165283203 + }, + { + "auxiliary_loss_clip": 0.06497307, + "auxiliary_loss_mlp": 0.01278667, + "balance_loss_clip": 0.06293161, + "balance_loss_mlp": 0.01261179, + "epoch": 0.3009770028558545, + "flos": 23812170451200.0, + "grad_norm": 3.689468326241579, + "language_loss": 0.74513727, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.82289702, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.17492676, + "step": 5006, + "time_per_iteration": 2.6309902667999268 + }, + { + "auxiliary_loss_clip": 0.06490835, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06288077, + "balance_loss_mlp": 0.01255247, + "epoch": 0.30103712610852246, + "flos": 26475319169280.0, + "grad_norm": 1.9837745378518294, + "language_loss": 0.71514297, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.79279143, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.18762207, + "step": 5007, + "time_per_iteration": 2.5425140857696533 + }, + { + "auxiliary_loss_clip": 0.06499007, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 0.062922, + "balance_loss_mlp": 0.01258961, + "epoch": 0.3010972493611904, + "flos": 22863020526720.0, + "grad_norm": 2.135948160193648, + "language_loss": 0.76715112, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.84491682, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18579102, + "step": 5008, + "time_per_iteration": 2.560136556625366 + }, + { + "auxiliary_loss_clip": 0.06498778, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.06291865, + "balance_loss_mlp": 0.01258959, + "epoch": 0.3011573726138584, + "flos": 24059307669120.0, + "grad_norm": 1.8647165617813573, + "language_loss": 0.85181898, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.92957842, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.18200684, + "step": 5009, + "time_per_iteration": 2.5235841274261475 + }, + { + "auxiliary_loss_clip": 0.06506295, + "auxiliary_loss_mlp": 0.01281474, + "balance_loss_clip": 0.06291408, + "balance_loss_mlp": 0.0126041, + "epoch": 0.30121749586652635, + "flos": 20264762396160.0, + "grad_norm": 1.8315766872525614, + "language_loss": 0.84202898, + "learning_rate": 3.276719570659604e-06, + "loss": 0.91990662, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21069336, + "step": 5010, + "time_per_iteration": 2.5768747329711914 + }, + { + "auxiliary_loss_clip": 0.06499103, + "auxiliary_loss_mlp": 0.01276454, + "balance_loss_clip": 0.06292678, + "balance_loss_mlp": 0.01258728, + "epoch": 0.3012776191191944, + "flos": 26950334365440.0, + "grad_norm": 2.3479091749479593, + "language_loss": 0.85299456, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.93075019, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.17724609, + "step": 5011, + "time_per_iteration": 2.5496773719787598 + }, + { + "auxiliary_loss_clip": 0.06498772, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06287067, + "balance_loss_mlp": 0.01258472, + "epoch": 0.30133774237186234, + "flos": 20418525838080.0, + "grad_norm": 2.2969937551574615, + "language_loss": 0.73043567, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.18017578, + "step": 5012, + "time_per_iteration": 2.5352632999420166 + }, + { + "auxiliary_loss_clip": 0.06502604, + "auxiliary_loss_mlp": 0.01275987, + "balance_loss_clip": 0.06294451, + "balance_loss_mlp": 0.01257581, + "epoch": 0.3013978656245303, + "flos": 19798635732480.0, + "grad_norm": 2.0714365992737247, + "language_loss": 0.88282806, + "learning_rate": 3.275820002334819e-06, + "loss": 0.96061397, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.1842041, + "step": 5013, + "time_per_iteration": 2.5217273235321045 + }, + { + "auxiliary_loss_clip": 0.06510235, + "auxiliary_loss_mlp": 0.01281959, + "balance_loss_clip": 0.06297365, + "balance_loss_mlp": 0.01261956, + "epoch": 0.30145798887719827, + "flos": 16254623767680.0, + "grad_norm": 2.0397198762739253, + "language_loss": 0.8413021, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.91922402, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 5014, + "time_per_iteration": 2.543929100036621 + }, + { + "auxiliary_loss_clip": 0.06496109, + "auxiliary_loss_mlp": 0.01278136, + "balance_loss_clip": 0.06295025, + "balance_loss_mlp": 0.01260934, + "epoch": 0.30151811212986623, + "flos": 24578654474880.0, + "grad_norm": 1.6793816963153507, + "language_loss": 0.68929201, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76703441, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17199707, + "step": 5015, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.06498226, + "auxiliary_loss_mlp": 0.01282599, + "balance_loss_clip": 0.06293575, + "balance_loss_mlp": 0.01262989, + "epoch": 0.3015782353825342, + "flos": 21878595233280.0, + "grad_norm": 2.19954780338382, + "language_loss": 0.75070626, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.82851446, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.19604492, + "step": 5016, + "time_per_iteration": 2.6430094242095947 + }, + { + "auxiliary_loss_clip": 0.06498955, + "auxiliary_loss_mlp": 0.01278069, + "balance_loss_clip": 0.06290609, + "balance_loss_mlp": 0.01260009, + "epoch": 0.30163835863520216, + "flos": 28777244935680.0, + "grad_norm": 1.487936670829871, + "language_loss": 0.657938, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.73570824, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.18041992, + "step": 5017, + "time_per_iteration": 2.62882661819458 + }, + { + "auxiliary_loss_clip": 0.06504996, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06297189, + "balance_loss_mlp": 0.01258019, + "epoch": 0.30169848188787013, + "flos": 22972829702400.0, + "grad_norm": 1.7163502989136974, + "language_loss": 0.68538272, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76318979, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17675781, + "step": 5018, + "time_per_iteration": 2.5743629932403564 + }, + { + "auxiliary_loss_clip": 0.06490742, + "auxiliary_loss_mlp": 0.01280876, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01263102, + "epoch": 0.3017586051405381, + "flos": 21841726636800.0, + "grad_norm": 1.8632302123292983, + "language_loss": 0.79424834, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.87196445, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 5019, + "time_per_iteration": 2.490190029144287 + }, + { + "auxiliary_loss_clip": 0.06497257, + "auxiliary_loss_mlp": 0.01272585, + "balance_loss_clip": 0.06291286, + "balance_loss_mlp": 0.01255932, + "epoch": 0.30181872839320606, + "flos": 22166374481280.0, + "grad_norm": 1.9171916392208899, + "language_loss": 0.70839167, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78609014, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.16650391, + "step": 5020, + "time_per_iteration": 2.5635480880737305 + }, + { + "auxiliary_loss_clip": 0.06504546, + "auxiliary_loss_mlp": 0.01281398, + "balance_loss_clip": 0.06293903, + "balance_loss_mlp": 0.01263063, + "epoch": 0.301878851645874, + "flos": 18120080016000.0, + "grad_norm": 1.792157390717078, + "language_loss": 0.78276378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86062324, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.18347168, + "step": 5021, + "time_per_iteration": 2.4956390857696533 + }, + { + "auxiliary_loss_clip": 0.06497782, + "auxiliary_loss_mlp": 0.01276425, + "balance_loss_clip": 0.06289995, + "balance_loss_mlp": 0.01258758, + "epoch": 0.301938974898542, + "flos": 17607860807040.0, + "grad_norm": 2.1405998927344774, + "language_loss": 0.77019519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.84793723, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.17663574, + "step": 5022, + "time_per_iteration": 2.5157957077026367 + }, + { + "auxiliary_loss_clip": 0.06495966, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.0628897, + "balance_loss_mlp": 0.01258766, + "epoch": 0.30199909815120995, + "flos": 11185861455360.0, + "grad_norm": 1.768248661027107, + "language_loss": 0.70051187, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.77823544, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17626953, + "step": 5023, + "time_per_iteration": 2.466554641723633 + }, + { + "auxiliary_loss_clip": 0.06500031, + "auxiliary_loss_mlp": 0.01272609, + "balance_loss_clip": 0.0628899, + "balance_loss_mlp": 0.0125586, + "epoch": 0.302059221403878, + "flos": 21914247945600.0, + "grad_norm": 1.9915350532209553, + "language_loss": 0.72159773, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7993241, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.16748047, + "step": 5024, + "time_per_iteration": 2.550529956817627 + }, + { + "auxiliary_loss_clip": 0.06490807, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 0.06288145, + "balance_loss_mlp": 0.01259068, + "epoch": 0.30211934465654594, + "flos": 26403678328320.0, + "grad_norm": 1.894121412902458, + "language_loss": 0.74805325, + "learning_rate": 3.272217377978061e-06, + "loss": 0.8257302, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.17822266, + "step": 5025, + "time_per_iteration": 2.566805124282837 + }, + { + "auxiliary_loss_clip": 0.06489006, + "auxiliary_loss_mlp": 0.01277493, + "balance_loss_clip": 0.06288895, + "balance_loss_mlp": 0.01260649, + "epoch": 0.3021794679092139, + "flos": 23406573962880.0, + "grad_norm": 1.5421556017832176, + "language_loss": 0.67708206, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.75474703, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16845703, + "step": 5026, + "time_per_iteration": 2.5388495922088623 + }, + { + "auxiliary_loss_clip": 0.06496219, + "auxiliary_loss_mlp": 0.01276315, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.0125829, + "epoch": 0.30223959116188187, + "flos": 20266271769600.0, + "grad_norm": 1.7822947119811494, + "language_loss": 0.851165, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.92889023, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.18017578, + "step": 5027, + "time_per_iteration": 2.4944281578063965 + }, + { + "auxiliary_loss_clip": 0.06486274, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06286463, + "balance_loss_mlp": 0.012555, + "epoch": 0.30229971441454984, + "flos": 26695105228800.0, + "grad_norm": 1.4959542036115716, + "language_loss": 0.79103637, + "learning_rate": 3.271315635661351e-06, + "loss": 0.86862409, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17004395, + "step": 5028, + "time_per_iteration": 2.559110403060913 + }, + { + "auxiliary_loss_clip": 0.06488896, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06286621, + "balance_loss_mlp": 0.01255114, + "epoch": 0.3023598376672178, + "flos": 34353111358080.0, + "grad_norm": 2.034560710438702, + "language_loss": 0.777421, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.8550368, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.17553711, + "step": 5029, + "time_per_iteration": 2.616746187210083 + }, + { + "auxiliary_loss_clip": 0.06491397, + "auxiliary_loss_mlp": 0.012793, + "balance_loss_clip": 0.06285096, + "balance_loss_mlp": 0.0126112, + "epoch": 0.30241996091988577, + "flos": 23118794714880.0, + "grad_norm": 1.8709670039612754, + "language_loss": 0.83096594, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.90867293, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.1817627, + "step": 5030, + "time_per_iteration": 2.56754994392395 + }, + { + "auxiliary_loss_clip": 0.06496526, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06289787, + "balance_loss_mlp": 0.01252817, + "epoch": 0.30248008417255373, + "flos": 19395932209920.0, + "grad_norm": 1.6009792224367259, + "language_loss": 0.70107001, + "learning_rate": 3.270413459468905e-06, + "loss": 0.77873379, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17028809, + "step": 5031, + "time_per_iteration": 3.9598355293273926 + }, + { + "auxiliary_loss_clip": 0.06489968, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06286315, + "balance_loss_mlp": 0.01254843, + "epoch": 0.3025402074252217, + "flos": 23776601592960.0, + "grad_norm": 1.6577801639127376, + "language_loss": 0.83241403, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.91004276, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.18066406, + "step": 5032, + "time_per_iteration": 2.5589263439178467 + }, + { + "auxiliary_loss_clip": 0.064991, + "auxiliary_loss_mlp": 0.01275787, + "balance_loss_clip": 0.06290475, + "balance_loss_mlp": 0.01257846, + "epoch": 0.30260033067788966, + "flos": 26001184440960.0, + "grad_norm": 2.284722647008976, + "language_loss": 0.73521686, + "learning_rate": 3.269811767783906e-06, + "loss": 0.81296575, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.17956543, + "step": 5033, + "time_per_iteration": 4.029735088348389 + }, + { + "auxiliary_loss_clip": 0.06487451, + "auxiliary_loss_mlp": 0.01273985, + "balance_loss_clip": 0.06287168, + "balance_loss_mlp": 0.01257201, + "epoch": 0.3026604539305576, + "flos": 25381629751680.0, + "grad_norm": 1.972268943863271, + "language_loss": 0.74434245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16784668, + "step": 5034, + "time_per_iteration": 4.0717785358428955 + }, + { + "auxiliary_loss_clip": 0.06489293, + "auxiliary_loss_mlp": 0.01272883, + "balance_loss_clip": 0.06285236, + "balance_loss_mlp": 0.01253785, + "epoch": 0.3027205771832256, + "flos": 25819944059520.0, + "grad_norm": 2.1341895685230434, + "language_loss": 0.72872615, + "learning_rate": 3.269209883493352e-06, + "loss": 0.80634785, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.19104004, + "step": 5035, + "time_per_iteration": 2.552910804748535 + }, + { + "auxiliary_loss_clip": 0.06487517, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06287874, + "balance_loss_mlp": 0.01255545, + "epoch": 0.30278070043589356, + "flos": 27351905857920.0, + "grad_norm": 2.3429469920607384, + "language_loss": 0.87837774, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95597875, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17041016, + "step": 5036, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.06487815, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06288295, + "balance_loss_mlp": 0.0125574, + "epoch": 0.3028408236885616, + "flos": 24792444967680.0, + "grad_norm": 1.4626052772561229, + "language_loss": 0.77969307, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85730845, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.17980957, + "step": 5037, + "time_per_iteration": 2.556859016418457 + }, + { + "auxiliary_loss_clip": 0.06492691, + "auxiliary_loss_mlp": 0.01276846, + "balance_loss_clip": 0.06287664, + "balance_loss_mlp": 0.01258381, + "epoch": 0.30290094694122954, + "flos": 12937399678080.0, + "grad_norm": 2.1717737457337236, + "language_loss": 0.78095227, + "learning_rate": 3.268306696121816e-06, + "loss": 0.85864764, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18469238, + "step": 5038, + "time_per_iteration": 2.534095525741577 + }, + { + "auxiliary_loss_clip": 0.06487858, + "auxiliary_loss_mlp": 0.01274285, + "balance_loss_clip": 0.06289861, + "balance_loss_mlp": 0.01257631, + "epoch": 0.3029610701938975, + "flos": 25922709492480.0, + "grad_norm": 1.6864855803341283, + "language_loss": 0.74257523, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82019669, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16650391, + "step": 5039, + "time_per_iteration": 3.9620656967163086 + }, + { + "auxiliary_loss_clip": 0.06482661, + "auxiliary_loss_mlp": 0.01275025, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.0125923, + "epoch": 0.3030211934465655, + "flos": 21987440087040.0, + "grad_norm": 1.8054159725903498, + "language_loss": 0.80141723, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87899411, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.15795898, + "step": 5040, + "time_per_iteration": 2.5038623809814453 + }, + { + "auxiliary_loss_clip": 0.06493679, + "auxiliary_loss_mlp": 0.01273287, + "balance_loss_clip": 0.06295684, + "balance_loss_mlp": 0.01256705, + "epoch": 0.30308131669923344, + "flos": 20997606205440.0, + "grad_norm": 1.5545793881611087, + "language_loss": 0.82498085, + "learning_rate": 3.267403075901438e-06, + "loss": 0.90265048, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 5041, + "time_per_iteration": 2.5619800090789795 + }, + { + "auxiliary_loss_clip": 0.06388037, + "auxiliary_loss_mlp": 0.01273694, + "balance_loss_clip": 0.062912, + "balance_loss_mlp": 0.012703, + "epoch": 0.3031414399519014, + "flos": 60568281198720.0, + "grad_norm": 0.7609258494567089, + "language_loss": 0.59132683, + "learning_rate": 3.267101773025978e-06, + "loss": 0.66794419, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.0340271, + "step": 5042, + "time_per_iteration": 3.2389016151428223 + }, + { + "auxiliary_loss_clip": 0.06493344, + "auxiliary_loss_mlp": 0.01274817, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.0125808, + "epoch": 0.30320156320456937, + "flos": 21914038310400.0, + "grad_norm": 1.8743682054895758, + "language_loss": 0.71638298, + "learning_rate": 3.266800422101892e-06, + "loss": 0.79406464, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.1673584, + "step": 5043, + "time_per_iteration": 2.5684726238250732 + }, + { + "auxiliary_loss_clip": 0.06492111, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06289819, + "balance_loss_mlp": 0.01258121, + "epoch": 0.30326168645723733, + "flos": 21659186517120.0, + "grad_norm": 1.7052050019212173, + "language_loss": 0.70087332, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7785424, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.16699219, + "step": 5044, + "time_per_iteration": 2.517548084259033 + }, + { + "auxiliary_loss_clip": 0.06487354, + "auxiliary_loss_mlp": 0.01273722, + "balance_loss_clip": 0.06289065, + "balance_loss_mlp": 0.01257641, + "epoch": 0.3033218097099053, + "flos": 21877672838400.0, + "grad_norm": 1.4072868323237386, + "language_loss": 0.77798641, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.85559714, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.16088867, + "step": 5045, + "time_per_iteration": 2.5525407791137695 + }, + { + "auxiliary_loss_clip": 0.06487602, + "auxiliary_loss_mlp": 0.01277286, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01260096, + "epoch": 0.30338193296257326, + "flos": 27097137918720.0, + "grad_norm": 1.6677605508610576, + "language_loss": 0.72664404, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80429292, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.5747427940368652 + }, + { + "auxiliary_loss_clip": 0.06495762, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125897, + "epoch": 0.30344205621524123, + "flos": 19540052432640.0, + "grad_norm": 1.932306391246397, + "language_loss": 0.81483316, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89255798, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.1776123, + "step": 5047, + "time_per_iteration": 2.5763392448425293 + }, + { + "auxiliary_loss_clip": 0.0648682, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06287121, + "balance_loss_mlp": 0.01255568, + "epoch": 0.3035021794679092, + "flos": 23917116090240.0, + "grad_norm": 1.635585540948891, + "language_loss": 0.72204739, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7996307, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.15942383, + "step": 5048, + "time_per_iteration": 2.5134665966033936 + }, + { + "auxiliary_loss_clip": 0.06488065, + "auxiliary_loss_mlp": 0.01279017, + "balance_loss_clip": 0.0628863, + "balance_loss_mlp": 0.0126296, + "epoch": 0.30356230272057716, + "flos": 16149133077120.0, + "grad_norm": 2.0386560470204804, + "language_loss": 0.75622666, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83389747, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16052246, + "step": 5049, + "time_per_iteration": 2.516463279724121 + }, + { + "auxiliary_loss_clip": 0.06494351, + "auxiliary_loss_mlp": 0.01274287, + "balance_loss_clip": 0.06289351, + "balance_loss_mlp": 0.01257597, + "epoch": 0.3036224259732452, + "flos": 28922539115520.0, + "grad_norm": 1.525083803020086, + "language_loss": 0.82698894, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.90467536, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.16687012, + "step": 5050, + "time_per_iteration": 2.558199405670166 + }, + { + "auxiliary_loss_clip": 0.0649763, + "auxiliary_loss_mlp": 0.01273759, + "balance_loss_clip": 0.06295735, + "balance_loss_mlp": 0.01256617, + "epoch": 0.30368254922591315, + "flos": 21111943501440.0, + "grad_norm": 2.311701267026144, + "language_loss": 0.74346399, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82117784, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.17150879, + "step": 5051, + "time_per_iteration": 2.530457019805908 + }, + { + "auxiliary_loss_clip": 0.06494159, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06292571, + "balance_loss_mlp": 0.01260339, + "epoch": 0.3037426724785811, + "flos": 23008859758080.0, + "grad_norm": 1.7255753861859113, + "language_loss": 0.76444, + "learning_rate": 3.264086103483033e-06, + "loss": 0.84215784, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17297363, + "step": 5052, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.06501957, + "auxiliary_loss_mlp": 0.01280226, + "balance_loss_clip": 0.06295583, + "balance_loss_mlp": 0.01262332, + "epoch": 0.3038027957312491, + "flos": 15638129752320.0, + "grad_norm": 1.9820354931454651, + "language_loss": 0.83096367, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.90878546, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17871094, + "step": 5053, + "time_per_iteration": 2.5384886264801025 + }, + { + "auxiliary_loss_clip": 0.06489826, + "auxiliary_loss_mlp": 0.0127909, + "balance_loss_clip": 0.06288566, + "balance_loss_mlp": 0.01262174, + "epoch": 0.30386291898391704, + "flos": 12718955283840.0, + "grad_norm": 1.6755872357210637, + "language_loss": 0.7197504, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.79743958, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.16906738, + "step": 5054, + "time_per_iteration": 2.4787559509277344 + }, + { + "auxiliary_loss_clip": 0.06500221, + "auxiliary_loss_mlp": 0.01282757, + "balance_loss_clip": 0.06298432, + "balance_loss_mlp": 0.01265805, + "epoch": 0.303923042236585, + "flos": 26366642023680.0, + "grad_norm": 1.8480883425842163, + "language_loss": 0.70137346, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.77920318, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16943359, + "step": 5055, + "time_per_iteration": 2.5929152965545654 + }, + { + "auxiliary_loss_clip": 0.06494389, + "auxiliary_loss_mlp": 0.01279452, + "balance_loss_clip": 0.0629337, + "balance_loss_mlp": 0.01262488, + "epoch": 0.30398316548925297, + "flos": 19725359736960.0, + "grad_norm": 2.1405790356583516, + "language_loss": 0.68347496, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.7612133, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.16955566, + "step": 5056, + "time_per_iteration": 2.531677007675171 + }, + { + "auxiliary_loss_clip": 0.06490116, + "auxiliary_loss_mlp": 0.01281162, + "balance_loss_clip": 0.06292629, + "balance_loss_mlp": 0.01264377, + "epoch": 0.30404328874192094, + "flos": 24246124346880.0, + "grad_norm": 1.6503197514246037, + "language_loss": 0.83083463, + "learning_rate": 3.262576470461507e-06, + "loss": 0.9085474, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16796875, + "step": 5057, + "time_per_iteration": 2.5836069583892822 + }, + { + "auxiliary_loss_clip": 0.06484263, + "auxiliary_loss_mlp": 0.01272995, + "balance_loss_clip": 0.06286788, + "balance_loss_mlp": 0.01256603, + "epoch": 0.3041034119945889, + "flos": 24505881603840.0, + "grad_norm": 1.6860023663091837, + "language_loss": 0.89784855, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.97542113, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 5058, + "time_per_iteration": 2.589932918548584 + }, + { + "auxiliary_loss_clip": 0.06495658, + "auxiliary_loss_mlp": 0.01274369, + "balance_loss_clip": 0.06294262, + "balance_loss_mlp": 0.01256524, + "epoch": 0.30416353524725687, + "flos": 28295689121280.0, + "grad_norm": 2.5117349508823392, + "language_loss": 0.71471179, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.79241204, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17858887, + "step": 5059, + "time_per_iteration": 2.5827505588531494 + }, + { + "auxiliary_loss_clip": 0.06486548, + "auxiliary_loss_mlp": 0.01273567, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01257367, + "epoch": 0.30422365849992483, + "flos": 23667295541760.0, + "grad_norm": 1.868956784724377, + "language_loss": 0.73344606, + "learning_rate": 3.26167011603268e-06, + "loss": 0.8110472, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16174316, + "step": 5060, + "time_per_iteration": 2.624408006668091 + }, + { + "auxiliary_loss_clip": 0.06490071, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06289257, + "balance_loss_mlp": 0.01257451, + "epoch": 0.3042837817525928, + "flos": 23004750908160.0, + "grad_norm": 1.75217091558972, + "language_loss": 0.7751621, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.85279948, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.16210938, + "step": 5061, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.06496524, + "auxiliary_loss_mlp": 0.01274148, + "balance_loss_clip": 0.06292392, + "balance_loss_mlp": 0.01256362, + "epoch": 0.30434390500526076, + "flos": 22087438335360.0, + "grad_norm": 2.647933932315435, + "language_loss": 0.8275395, + "learning_rate": 3.261065640514415e-06, + "loss": 0.90524626, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.17773438, + "step": 5062, + "time_per_iteration": 2.5313212871551514 + }, + { + "auxiliary_loss_clip": 0.06485732, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06286077, + "balance_loss_mlp": 0.01253689, + "epoch": 0.3044040282579287, + "flos": 25490516532480.0, + "grad_norm": 1.803893214603413, + "language_loss": 0.74348861, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.82104707, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16394043, + "step": 5063, + "time_per_iteration": 2.553527355194092 + }, + { + "auxiliary_loss_clip": 0.0649004, + "auxiliary_loss_mlp": 0.01274813, + "balance_loss_clip": 0.06291289, + "balance_loss_mlp": 0.01256753, + "epoch": 0.30446415151059675, + "flos": 21952080864000.0, + "grad_norm": 1.6090072895521823, + "language_loss": 0.84824491, + "learning_rate": 3.26046097371721e-06, + "loss": 0.92589343, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.18066406, + "step": 5064, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.06490266, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 0.06290541, + "balance_loss_mlp": 0.0125644, + "epoch": 0.3045242747632647, + "flos": 16440979248000.0, + "grad_norm": 2.1763674367183965, + "language_loss": 0.76565492, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.17578125, + "step": 5065, + "time_per_iteration": 2.50644588470459 + }, + { + "auxiliary_loss_clip": 0.06490786, + "auxiliary_loss_mlp": 0.01279051, + "balance_loss_clip": 0.06288782, + "balance_loss_mlp": 0.01260586, + "epoch": 0.3045843980159327, + "flos": 31548399966720.0, + "grad_norm": 1.8114152917186497, + "language_loss": 0.62859941, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.70629776, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.18469238, + "step": 5066, + "time_per_iteration": 2.6319751739501953 + }, + { + "auxiliary_loss_clip": 0.06499436, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 0.0629437, + "balance_loss_mlp": 0.01255602, + "epoch": 0.30464452126860064, + "flos": 17858645677440.0, + "grad_norm": 2.0549933694905653, + "language_loss": 0.82941914, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.90714514, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17565918, + "step": 5067, + "time_per_iteration": 2.483863592147827 + }, + { + "auxiliary_loss_clip": 0.06485019, + "auxiliary_loss_mlp": 0.0127176, + "balance_loss_clip": 0.06289113, + "balance_loss_mlp": 0.01255643, + "epoch": 0.3047046445212686, + "flos": 20637682992000.0, + "grad_norm": 1.9234738451458053, + "language_loss": 0.63749218, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71506, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.16113281, + "step": 5068, + "time_per_iteration": 2.5133988857269287 + }, + { + "auxiliary_loss_clip": 0.06487909, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 0.06291264, + "balance_loss_mlp": 0.01256884, + "epoch": 0.3047647677739366, + "flos": 21293896642560.0, + "grad_norm": 1.767828765686575, + "language_loss": 0.75521863, + "learning_rate": 3.258948470480793e-06, + "loss": 0.8328287, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.1619873, + "step": 5069, + "time_per_iteration": 2.5039985179901123 + }, + { + "auxiliary_loss_clip": 0.06492448, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 0.06298955, + "balance_loss_mlp": 0.01255047, + "epoch": 0.30482489102660454, + "flos": 21002218179840.0, + "grad_norm": 2.053197356954631, + "language_loss": 0.76551294, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84314346, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.15551758, + "step": 5070, + "time_per_iteration": 2.56703519821167 + }, + { + "auxiliary_loss_clip": 0.06501058, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 0.06296416, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3048850142792725, + "flos": 26298732689280.0, + "grad_norm": 1.581704774716999, + "language_loss": 0.82567108, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.90344059, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.18139648, + "step": 5071, + "time_per_iteration": 3.9534900188446045 + }, + { + "auxiliary_loss_clip": 0.06502657, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06296133, + "balance_loss_mlp": 0.01253374, + "epoch": 0.30494513753194047, + "flos": 22352813815680.0, + "grad_norm": 1.6603887086526505, + "language_loss": 0.76386344, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.84159869, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.17492676, + "step": 5072, + "time_per_iteration": 3.9736859798431396 + }, + { + "auxiliary_loss_clip": 0.06492919, + "auxiliary_loss_mlp": 0.01277102, + "balance_loss_clip": 0.06293403, + "balance_loss_mlp": 0.01260544, + "epoch": 0.30500526078460843, + "flos": 19543909720320.0, + "grad_norm": 1.870095200943675, + "language_loss": 0.71741343, + "learning_rate": 3.257737608512723e-06, + "loss": 0.79511362, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16564941, + "step": 5073, + "time_per_iteration": 3.961787700653076 + }, + { + "auxiliary_loss_clip": 0.064973, + "auxiliary_loss_mlp": 0.01276358, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259752, + "epoch": 0.3050653840372764, + "flos": 14470577360640.0, + "grad_norm": 2.0196062448027843, + "language_loss": 0.76699424, + "learning_rate": 3.257434773758163e-06, + "loss": 0.84473085, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.16601562, + "step": 5074, + "time_per_iteration": 2.498986005783081 + }, + { + "auxiliary_loss_clip": 0.06498405, + "auxiliary_loss_mlp": 0.01271199, + "balance_loss_clip": 0.06298129, + "balance_loss_mlp": 0.01254534, + "epoch": 0.30512550728994436, + "flos": 24250736321280.0, + "grad_norm": 2.0830863268570496, + "language_loss": 0.75075227, + "learning_rate": 3.25713189132155e-06, + "loss": 0.8284483, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.16662598, + "step": 5075, + "time_per_iteration": 2.586857557296753 + }, + { + "auxiliary_loss_clip": 0.06500411, + "auxiliary_loss_mlp": 0.01274386, + "balance_loss_clip": 0.06294686, + "balance_loss_mlp": 0.01256004, + "epoch": 0.30518563054261233, + "flos": 16365774608640.0, + "grad_norm": 1.8100237719305525, + "language_loss": 0.75655556, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.8343035, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.18371582, + "step": 5076, + "time_per_iteration": 2.4945309162139893 + }, + { + "auxiliary_loss_clip": 0.06496741, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06296699, + "balance_loss_mlp": 0.01252712, + "epoch": 0.30524575379528035, + "flos": 21585952448640.0, + "grad_norm": 4.173383760279569, + "language_loss": 0.79782987, + "learning_rate": 3.25652598344811e-06, + "loss": 0.87550437, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.17993164, + "step": 5077, + "time_per_iteration": 2.534932851791382 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01270916, + "balance_loss_clip": 0.06295882, + "balance_loss_mlp": 0.01254012, + "epoch": 0.3053058770479483, + "flos": 16550872277760.0, + "grad_norm": 2.5701417949840146, + "language_loss": 0.7555238, + "learning_rate": 3.256222958034259e-06, + "loss": 0.83312857, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16894531, + "step": 5078, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.06495726, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 0.06297612, + "balance_loss_mlp": 0.01262487, + "epoch": 0.3053660003006163, + "flos": 12317844988800.0, + "grad_norm": 1.8416681282179364, + "language_loss": 0.67517591, + "learning_rate": 3.255919884984307e-06, + "loss": 0.75292945, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.17126465, + "step": 5079, + "time_per_iteration": 3.8981266021728516 + }, + { + "auxiliary_loss_clip": 0.06496017, + "auxiliary_loss_mlp": 0.01271448, + "balance_loss_clip": 0.06296019, + "balance_loss_mlp": 0.01253757, + "epoch": 0.30542612355328425, + "flos": 23118962423040.0, + "grad_norm": 1.7235884914338329, + "language_loss": 0.8044346, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.88210917, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17687988, + "step": 5080, + "time_per_iteration": 2.562946081161499 + }, + { + "auxiliary_loss_clip": 0.06497588, + "auxiliary_loss_mlp": 0.01276495, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.01259377, + "epoch": 0.3054862468059522, + "flos": 24396365917440.0, + "grad_norm": 2.5665035909877725, + "language_loss": 0.81653202, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89427292, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17114258, + "step": 5081, + "time_per_iteration": 2.6026763916015625 + }, + { + "auxiliary_loss_clip": 0.06490453, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01253962, + "epoch": 0.3055463700586202, + "flos": 29393529315840.0, + "grad_norm": 1.580638075296793, + "language_loss": 0.72516012, + "learning_rate": 3.255010380132783e-06, + "loss": 0.80277044, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.16619873, + "step": 5082, + "time_per_iteration": 2.650310516357422 + }, + { + "auxiliary_loss_clip": 0.06499462, + "auxiliary_loss_mlp": 0.01274957, + "balance_loss_clip": 0.06293429, + "balance_loss_mlp": 0.01257159, + "epoch": 0.30560649331128814, + "flos": 25598606699520.0, + "grad_norm": 2.3807589086926533, + "language_loss": 0.73733467, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.81507885, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.595439910888672 + }, + { + "auxiliary_loss_clip": 0.06488115, + "auxiliary_loss_mlp": 0.01272372, + "balance_loss_clip": 0.0628676, + "balance_loss_mlp": 0.01254729, + "epoch": 0.3056666165639561, + "flos": 19133156206080.0, + "grad_norm": 1.8141392710911106, + "language_loss": 0.71165347, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78925836, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17626953, + "step": 5084, + "time_per_iteration": 2.499873161315918 + }, + { + "auxiliary_loss_clip": 0.06505337, + "auxiliary_loss_mlp": 0.01276239, + "balance_loss_clip": 0.063004, + "balance_loss_mlp": 0.01260194, + "epoch": 0.30572673981662407, + "flos": 15529368752640.0, + "grad_norm": 2.0821129981034567, + "language_loss": 0.79337353, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.87118936, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.16027832, + "step": 5085, + "time_per_iteration": 2.479790449142456 + }, + { + "auxiliary_loss_clip": 0.06486039, + "auxiliary_loss_mlp": 0.01278912, + "balance_loss_clip": 0.06289506, + "balance_loss_mlp": 0.01260602, + "epoch": 0.30578686306929204, + "flos": 21512886088320.0, + "grad_norm": 2.123366644532801, + "language_loss": 0.78524947, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.86289901, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.18310547, + "step": 5086, + "time_per_iteration": 2.5372772216796875 + }, + { + "auxiliary_loss_clip": 0.06487311, + "auxiliary_loss_mlp": 0.01277834, + "balance_loss_clip": 0.06289313, + "balance_loss_mlp": 0.01259797, + "epoch": 0.30584698632196, + "flos": 20959689432960.0, + "grad_norm": 1.7535206397091907, + "language_loss": 0.77160186, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8492533, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18041992, + "step": 5087, + "time_per_iteration": 2.4971578121185303 + }, + { + "auxiliary_loss_clip": 0.06492934, + "auxiliary_loss_mlp": 0.01277252, + "balance_loss_clip": 0.06288779, + "balance_loss_mlp": 0.01258154, + "epoch": 0.30590710957462797, + "flos": 24688044380160.0, + "grad_norm": 1.802467786704899, + "language_loss": 0.7266196, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.80432141, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.19091797, + "step": 5088, + "time_per_iteration": 2.5416259765625 + }, + { + "auxiliary_loss_clip": 0.06501624, + "auxiliary_loss_mlp": 0.0127311, + "balance_loss_clip": 0.06292014, + "balance_loss_mlp": 0.01253893, + "epoch": 0.30596723282729593, + "flos": 17091700456320.0, + "grad_norm": 2.3226252492467037, + "language_loss": 0.79702371, + "learning_rate": 3.252886537028521e-06, + "loss": 0.874771, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.19226074, + "step": 5089, + "time_per_iteration": 2.4745559692382812 + }, + { + "auxiliary_loss_clip": 0.06491631, + "auxiliary_loss_mlp": 0.01275196, + "balance_loss_clip": 0.06291364, + "balance_loss_mlp": 0.01256981, + "epoch": 0.30602735607996395, + "flos": 22863775213440.0, + "grad_norm": 6.857787253608019, + "language_loss": 0.77299303, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.85066134, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 5090, + "time_per_iteration": 2.5330631732940674 + }, + { + "auxiliary_loss_clip": 0.06500913, + "auxiliary_loss_mlp": 0.01279012, + "balance_loss_clip": 0.06295903, + "balance_loss_mlp": 0.01260773, + "epoch": 0.3060874793326319, + "flos": 29869173417600.0, + "grad_norm": 1.854909004407163, + "language_loss": 0.76970392, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.84750324, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.18237305, + "step": 5091, + "time_per_iteration": 2.561894178390503 + }, + { + "auxiliary_loss_clip": 0.06491988, + "auxiliary_loss_mlp": 0.01272552, + "balance_loss_clip": 0.06287533, + "balance_loss_mlp": 0.01254551, + "epoch": 0.3061476025852999, + "flos": 20454765528960.0, + "grad_norm": 1.7300285931862276, + "language_loss": 0.72878456, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.18005371, + "step": 5092, + "time_per_iteration": 2.5661561489105225 + }, + { + "auxiliary_loss_clip": 0.06495406, + "auxiliary_loss_mlp": 0.01276172, + "balance_loss_clip": 0.06294402, + "balance_loss_mlp": 0.01258696, + "epoch": 0.30620772583796785, + "flos": 19397651218560.0, + "grad_norm": 1.8286917674158676, + "language_loss": 0.83293521, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.91065109, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.17468262, + "step": 5093, + "time_per_iteration": 2.49686336517334 + }, + { + "auxiliary_loss_clip": 0.06495437, + "auxiliary_loss_mlp": 0.01277069, + "balance_loss_clip": 0.06295857, + "balance_loss_mlp": 0.01259652, + "epoch": 0.3062678490906358, + "flos": 24031411459200.0, + "grad_norm": 1.7386581048181018, + "language_loss": 0.74963737, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82736242, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17419434, + "step": 5094, + "time_per_iteration": 2.5497004985809326 + }, + { + "auxiliary_loss_clip": 0.06491575, + "auxiliary_loss_mlp": 0.01272234, + "balance_loss_clip": 0.06293601, + "balance_loss_mlp": 0.01255735, + "epoch": 0.3063279723433038, + "flos": 19760593178880.0, + "grad_norm": 1.8971341227661025, + "language_loss": 0.76389223, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84153032, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16503906, + "step": 5095, + "time_per_iteration": 2.493479013442993 + }, + { + "auxiliary_loss_clip": 0.06485657, + "auxiliary_loss_mlp": 0.0128124, + "balance_loss_clip": 0.06288686, + "balance_loss_mlp": 0.01262727, + "epoch": 0.30638809559597174, + "flos": 22455663102720.0, + "grad_norm": 1.6310889817091494, + "language_loss": 0.81246006, + "learning_rate": 3.250760365955042e-06, + "loss": 0.89012897, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.18518066, + "step": 5096, + "time_per_iteration": 2.606100559234619 + }, + { + "auxiliary_loss_clip": 0.06500001, + "auxiliary_loss_mlp": 0.01286183, + "balance_loss_clip": 0.06297529, + "balance_loss_mlp": 0.01269947, + "epoch": 0.3064482188486397, + "flos": 17170846237440.0, + "grad_norm": 2.1701963694762862, + "language_loss": 0.81871414, + "learning_rate": 3.250456437422258e-06, + "loss": 0.89657605, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.16235352, + "step": 5097, + "time_per_iteration": 2.506908893585205 + }, + { + "auxiliary_loss_clip": 0.06498241, + "auxiliary_loss_mlp": 0.01288982, + "balance_loss_clip": 0.06297113, + "balance_loss_mlp": 0.01269647, + "epoch": 0.3065083421013077, + "flos": 23775176073600.0, + "grad_norm": 2.1266024193404385, + "language_loss": 0.7855283, + "learning_rate": 3.250152461472041e-06, + "loss": 0.86340058, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.19335938, + "step": 5098, + "time_per_iteration": 2.546875238418579 + }, + { + "auxiliary_loss_clip": 0.06494713, + "auxiliary_loss_mlp": 0.01291897, + "balance_loss_clip": 0.06296527, + "balance_loss_mlp": 0.0127367, + "epoch": 0.30656846535397564, + "flos": 26438953697280.0, + "grad_norm": 1.8261556885246946, + "language_loss": 0.84430897, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92217511, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.18225098, + "step": 5099, + "time_per_iteration": 2.5726583003997803 + }, + { + "auxiliary_loss_clip": 0.06498358, + "auxiliary_loss_mlp": 0.01287293, + "balance_loss_clip": 0.06295489, + "balance_loss_mlp": 0.01268434, + "epoch": 0.3066285886066436, + "flos": 26659117100160.0, + "grad_norm": 1.588615118025773, + "language_loss": 0.86241573, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.94027227, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.18859863, + "step": 5100, + "time_per_iteration": 2.5711421966552734 + }, + { + "auxiliary_loss_clip": 0.06496789, + "auxiliary_loss_mlp": 0.01283562, + "balance_loss_clip": 0.06296922, + "balance_loss_mlp": 0.01264345, + "epoch": 0.30668871185931157, + "flos": 15055443659520.0, + "grad_norm": 1.7244173580954059, + "language_loss": 0.79369497, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87149858, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.19226074, + "step": 5101, + "time_per_iteration": 2.539132833480835 + }, + { + "auxiliary_loss_clip": 0.0650195, + "auxiliary_loss_mlp": 0.01287055, + "balance_loss_clip": 0.06299084, + "balance_loss_mlp": 0.01268172, + "epoch": 0.30674883511197953, + "flos": 20087966280960.0, + "grad_norm": 1.7739241542858428, + "language_loss": 0.80435872, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.88224876, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.1887207, + "step": 5102, + "time_per_iteration": 2.5558016300201416 + }, + { + "auxiliary_loss_clip": 0.06503183, + "auxiliary_loss_mlp": 0.01284648, + "balance_loss_clip": 0.06301928, + "balance_loss_mlp": 0.01265253, + "epoch": 0.30680895836464755, + "flos": 22900518028800.0, + "grad_norm": 1.6865927559982214, + "language_loss": 0.89335668, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.97123504, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19396973, + "step": 5103, + "time_per_iteration": 2.542555570602417 + }, + { + "auxiliary_loss_clip": 0.06501935, + "auxiliary_loss_mlp": 0.01286618, + "balance_loss_clip": 0.06302223, + "balance_loss_mlp": 0.0126876, + "epoch": 0.3068690816173155, + "flos": 23702948254080.0, + "grad_norm": 2.119732369805114, + "language_loss": 0.74448419, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82236969, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.17871094, + "step": 5104, + "time_per_iteration": 2.560253143310547 + }, + { + "auxiliary_loss_clip": 0.06502049, + "auxiliary_loss_mlp": 0.01274873, + "balance_loss_clip": 0.06295487, + "balance_loss_mlp": 0.01257552, + "epoch": 0.3069292048699835, + "flos": 23557947563520.0, + "grad_norm": 1.7334515387821061, + "language_loss": 0.72909176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80686092, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.17321777, + "step": 5105, + "time_per_iteration": 2.5751454830169678 + }, + { + "auxiliary_loss_clip": 0.06498945, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 0.06297372, + "balance_loss_mlp": 0.01263907, + "epoch": 0.30698932812265145, + "flos": 24537970517760.0, + "grad_norm": 2.0977567017321608, + "language_loss": 0.87578112, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.95359075, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.18103027, + "step": 5106, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.06503764, + "auxiliary_loss_mlp": 0.01279082, + "balance_loss_clip": 0.06296381, + "balance_loss_mlp": 0.01261189, + "epoch": 0.3070494513753194, + "flos": 21002805158400.0, + "grad_norm": 2.310425767564757, + "language_loss": 0.72092319, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.79875165, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.17883301, + "step": 5107, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.06493405, + "auxiliary_loss_mlp": 0.01275594, + "balance_loss_clip": 0.06294269, + "balance_loss_mlp": 0.01256735, + "epoch": 0.3071095746279874, + "flos": 19031942073600.0, + "grad_norm": 1.99593781887154, + "language_loss": 0.72653455, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80422449, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.18847656, + "step": 5108, + "time_per_iteration": 2.497788190841675 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 0.06297708, + "balance_loss_mlp": 0.01259533, + "epoch": 0.30716969788065535, + "flos": 21221962312320.0, + "grad_norm": 1.48656392648579, + "language_loss": 0.86441541, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94217712, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.17578125, + "step": 5109, + "time_per_iteration": 2.563480854034424 + }, + { + "auxiliary_loss_clip": 0.06501789, + "auxiliary_loss_mlp": 0.01278566, + "balance_loss_clip": 0.063005, + "balance_loss_mlp": 0.01260541, + "epoch": 0.3072298211333233, + "flos": 25779385883520.0, + "grad_norm": 1.8235353484155168, + "language_loss": 0.67904091, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.75684446, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.18029785, + "step": 5110, + "time_per_iteration": 3.9785540103912354 + }, + { + "auxiliary_loss_clip": 0.06493396, + "auxiliary_loss_mlp": 0.01273369, + "balance_loss_clip": 0.06295427, + "balance_loss_mlp": 0.01256727, + "epoch": 0.3072899443859913, + "flos": 25856099896320.0, + "grad_norm": 1.4123986071879864, + "language_loss": 0.76984161, + "learning_rate": 3.246196464379919e-06, + "loss": 0.84750926, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.16638184, + "step": 5111, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.06498265, + "auxiliary_loss_mlp": 0.01277301, + "balance_loss_clip": 0.06293567, + "balance_loss_mlp": 0.01258585, + "epoch": 0.30735006763865924, + "flos": 25930130578560.0, + "grad_norm": 2.349951455822933, + "language_loss": 0.67755288, + "learning_rate": 3.245891825796765e-06, + "loss": 0.75530857, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 5112, + "time_per_iteration": 3.963136672973633 + }, + { + "auxiliary_loss_clip": 0.0650286, + "auxiliary_loss_mlp": 0.01277737, + "balance_loss_clip": 0.06295824, + "balance_loss_mlp": 0.01257614, + "epoch": 0.3074101908913272, + "flos": 30924442938240.0, + "grad_norm": 2.270303220058131, + "language_loss": 0.79939896, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.87720484, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.20117188, + "step": 5113, + "time_per_iteration": 4.084795236587524 + }, + { + "auxiliary_loss_clip": 0.06502695, + "auxiliary_loss_mlp": 0.01276516, + "balance_loss_clip": 0.06297943, + "balance_loss_mlp": 0.01258599, + "epoch": 0.30747031414399517, + "flos": 18406182182400.0, + "grad_norm": 2.072714063381377, + "language_loss": 0.77269047, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.85048258, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.17919922, + "step": 5114, + "time_per_iteration": 2.4906773567199707 + }, + { + "auxiliary_loss_clip": 0.06498024, + "auxiliary_loss_mlp": 0.01283612, + "balance_loss_clip": 0.06298083, + "balance_loss_mlp": 0.01265087, + "epoch": 0.30753043739666314, + "flos": 22638957909120.0, + "grad_norm": 1.8131309248321845, + "language_loss": 0.62640405, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70422041, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.18530273, + "step": 5115, + "time_per_iteration": 2.5328574180603027 + }, + { + "auxiliary_loss_clip": 0.06499057, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06295817, + "balance_loss_mlp": 0.0125513, + "epoch": 0.3075905606493311, + "flos": 27351360806400.0, + "grad_norm": 1.7894066300170501, + "language_loss": 0.83589995, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91363406, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.19213867, + "step": 5116, + "time_per_iteration": 2.562014102935791 + }, + { + "auxiliary_loss_clip": 0.06500115, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06298394, + "balance_loss_mlp": 0.0125512, + "epoch": 0.3076506839019991, + "flos": 22097333116800.0, + "grad_norm": 1.8649453582041782, + "language_loss": 0.76016742, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.18322754, + "step": 5117, + "time_per_iteration": 2.5509209632873535 + }, + { + "auxiliary_loss_clip": 0.06498168, + "auxiliary_loss_mlp": 0.01274202, + "balance_loss_clip": 0.0629583, + "balance_loss_mlp": 0.01256142, + "epoch": 0.3077108071546671, + "flos": 21296160702720.0, + "grad_norm": 2.167097847201453, + "language_loss": 0.72108531, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79880905, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.18054199, + "step": 5118, + "time_per_iteration": 2.5190913677215576 + }, + { + "auxiliary_loss_clip": 0.06502286, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 0.06299888, + "balance_loss_mlp": 0.01258198, + "epoch": 0.30777093040733505, + "flos": 21436884835200.0, + "grad_norm": 2.760855389686565, + "language_loss": 0.74956095, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82734126, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17553711, + "step": 5119, + "time_per_iteration": 3.973721981048584 + }, + { + "auxiliary_loss_clip": 0.06494488, + "auxiliary_loss_mlp": 0.01279388, + "balance_loss_clip": 0.06289928, + "balance_loss_mlp": 0.01259814, + "epoch": 0.307831053660003, + "flos": 23156040654720.0, + "grad_norm": 1.7924264386276263, + "language_loss": 0.80264926, + "learning_rate": 3.243453017305926e-06, + "loss": 0.88038802, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 5120, + "time_per_iteration": 2.54705548286438 + }, + { + "auxiliary_loss_clip": 0.06492078, + "auxiliary_loss_mlp": 0.01273208, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01255445, + "epoch": 0.307891176912671, + "flos": 17025510130560.0, + "grad_norm": 1.642273509687288, + "language_loss": 0.80521786, + "learning_rate": 3.24314795393977e-06, + "loss": 0.88287073, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 5121, + "time_per_iteration": 2.515054702758789 + }, + { + "auxiliary_loss_clip": 0.06496292, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06298114, + "balance_loss_mlp": 0.01256875, + "epoch": 0.30795130016533895, + "flos": 27711745217280.0, + "grad_norm": 1.3913461280715187, + "language_loss": 0.82847351, + "learning_rate": 3.242842843433319e-06, + "loss": 0.90618169, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17651367, + "step": 5122, + "time_per_iteration": 2.5832252502441406 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.0632116, + "balance_loss_mlp": 0.01249526, + "epoch": 0.3080114234180069, + "flos": 69080973373440.0, + "grad_norm": 0.7221499072225652, + "language_loss": 0.58650029, + "learning_rate": 3.242537685798143e-06, + "loss": 0.66319263, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.03341675, + "step": 5123, + "time_per_iteration": 3.3316402435302734 + }, + { + "auxiliary_loss_clip": 0.06503562, + "auxiliary_loss_mlp": 0.01279925, + "balance_loss_clip": 0.06296872, + "balance_loss_mlp": 0.01260744, + "epoch": 0.3080715466706749, + "flos": 24066938390400.0, + "grad_norm": 1.6584153298959496, + "language_loss": 0.83586073, + "learning_rate": 3.242232481045813e-06, + "loss": 0.91369557, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.1920166, + "step": 5124, + "time_per_iteration": 2.589906930923462 + }, + { + "auxiliary_loss_clip": 0.06498908, + "auxiliary_loss_mlp": 0.01271737, + "balance_loss_clip": 0.06294107, + "balance_loss_mlp": 0.01253629, + "epoch": 0.30813166992334284, + "flos": 25855806407040.0, + "grad_norm": 2.061271988083176, + "language_loss": 0.79248756, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.87019402, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.1809082, + "step": 5125, + "time_per_iteration": 2.550884485244751 + }, + { + "auxiliary_loss_clip": 0.06501068, + "auxiliary_loss_mlp": 0.012774, + "balance_loss_clip": 0.06292764, + "balance_loss_mlp": 0.01258374, + "epoch": 0.3081917931760108, + "flos": 20455981413120.0, + "grad_norm": 2.085029494567846, + "language_loss": 0.64930958, + "learning_rate": 3.241621930235989e-06, + "loss": 0.72709423, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.19018555, + "step": 5126, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.06490224, + "auxiliary_loss_mlp": 0.01277045, + "balance_loss_clip": 0.06294391, + "balance_loss_mlp": 0.01259533, + "epoch": 0.3082519164286788, + "flos": 22173208588800.0, + "grad_norm": 1.5681866965441809, + "language_loss": 0.87117672, + "learning_rate": 3.241316584201646e-06, + "loss": 0.94884944, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.17504883, + "step": 5127, + "time_per_iteration": 2.567615270614624 + }, + { + "auxiliary_loss_clip": 0.0649047, + "auxiliary_loss_mlp": 0.01273562, + "balance_loss_clip": 0.06291968, + "balance_loss_mlp": 0.0125593, + "epoch": 0.30831203968134674, + "flos": 28921029742080.0, + "grad_norm": 1.4544126326452276, + "language_loss": 0.69282925, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.77046961, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.1763916, + "step": 5128, + "time_per_iteration": 2.6129322052001953 + }, + { + "auxiliary_loss_clip": 0.06499469, + "auxiliary_loss_mlp": 0.01276178, + "balance_loss_clip": 0.06295171, + "balance_loss_mlp": 0.01257843, + "epoch": 0.3083721629340147, + "flos": 25675069150080.0, + "grad_norm": 2.0282558045061396, + "language_loss": 0.7195785, + "learning_rate": 3.240705750931993e-06, + "loss": 0.79733503, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 5129, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.06388761, + "auxiliary_loss_mlp": 0.01275431, + "balance_loss_clip": 0.06292662, + "balance_loss_mlp": 0.01271816, + "epoch": 0.3084322861866827, + "flos": 68233666487040.0, + "grad_norm": 0.8077979927321801, + "language_loss": 0.58935201, + "learning_rate": 3.240400263719846e-06, + "loss": 0.66599393, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03607178, + "step": 5130, + "time_per_iteration": 3.2353098392486572 + }, + { + "auxiliary_loss_clip": 0.06498231, + "auxiliary_loss_mlp": 0.012758, + "balance_loss_clip": 0.0629265, + "balance_loss_mlp": 0.01258443, + "epoch": 0.3084924094393507, + "flos": 20301630992640.0, + "grad_norm": 2.071340626605126, + "language_loss": 0.73298538, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.81072569, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.17370605, + "step": 5131, + "time_per_iteration": 2.523510456085205 + }, + { + "auxiliary_loss_clip": 0.06487547, + "auxiliary_loss_mlp": 0.01274811, + "balance_loss_clip": 0.06290068, + "balance_loss_mlp": 0.01257728, + "epoch": 0.30855253269201866, + "flos": 23956374528000.0, + "grad_norm": 1.6208223340220833, + "language_loss": 0.71358359, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.79120713, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17077637, + "step": 5132, + "time_per_iteration": 2.581470012664795 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01273323, + "balance_loss_clip": 0.06290212, + "balance_loss_mlp": 0.01255262, + "epoch": 0.3086126559446866, + "flos": 19288009751040.0, + "grad_norm": 1.7801590489825803, + "language_loss": 0.90374929, + "learning_rate": 3.239483519913136e-06, + "loss": 0.98135513, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.18066406, + "step": 5133, + "time_per_iteration": 2.5197763442993164 + }, + { + "auxiliary_loss_clip": 0.06499831, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06295495, + "balance_loss_mlp": 0.01257105, + "epoch": 0.3086727791973546, + "flos": 33768328913280.0, + "grad_norm": 1.8524807236065886, + "language_loss": 0.67443442, + "learning_rate": 3.239177844626102e-06, + "loss": 0.75218379, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 5134, + "time_per_iteration": 2.664303779602051 + }, + { + "auxiliary_loss_clip": 0.06498815, + "auxiliary_loss_mlp": 0.01275704, + "balance_loss_clip": 0.06293166, + "balance_loss_mlp": 0.01257167, + "epoch": 0.30873290245002255, + "flos": 16039659317760.0, + "grad_norm": 1.8927812104332384, + "language_loss": 0.83517784, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91292304, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18518066, + "step": 5135, + "time_per_iteration": 2.505138397216797 + }, + { + "auxiliary_loss_clip": 0.06377634, + "auxiliary_loss_mlp": 0.01258895, + "balance_loss_clip": 0.06282344, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3087930257026905, + "flos": 65070415474560.0, + "grad_norm": 0.6863645266912056, + "language_loss": 0.55337238, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.62973773, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.0329895, + "step": 5136, + "time_per_iteration": 3.179166555404663 + }, + { + "auxiliary_loss_clip": 0.06488921, + "auxiliary_loss_mlp": 0.01274465, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.01257085, + "epoch": 0.3088531489553585, + "flos": 74754001733760.0, + "grad_norm": 1.8635236180899502, + "language_loss": 0.76610464, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.8437385, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.1739502, + "step": 5137, + "time_per_iteration": 2.9993999004364014 + }, + { + "auxiliary_loss_clip": 0.06489644, + "auxiliary_loss_mlp": 0.01272707, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01255458, + "epoch": 0.30891327220802645, + "flos": 21148686316800.0, + "grad_norm": 1.7480087539569926, + "language_loss": 0.80450445, + "learning_rate": 3.237954673696424e-06, + "loss": 0.882128, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17248535, + "step": 5138, + "time_per_iteration": 2.531916856765747 + }, + { + "auxiliary_loss_clip": 0.06496161, + "auxiliary_loss_mlp": 0.01276289, + "balance_loss_clip": 0.06294001, + "balance_loss_mlp": 0.01258896, + "epoch": 0.3089733954606944, + "flos": 25671295716480.0, + "grad_norm": 1.629930216805369, + "language_loss": 0.81626344, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.89398789, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.1739502, + "step": 5139, + "time_per_iteration": 2.585380792617798 + }, + { + "auxiliary_loss_clip": 0.06501773, + "auxiliary_loss_mlp": 0.01277306, + "balance_loss_clip": 0.06292425, + "balance_loss_mlp": 0.01258817, + "epoch": 0.3090335187133624, + "flos": 19433429712000.0, + "grad_norm": 2.0033599705043854, + "language_loss": 0.77724934, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.85504013, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.18481445, + "step": 5140, + "time_per_iteration": 2.504387617111206 + }, + { + "auxiliary_loss_clip": 0.06482549, + "auxiliary_loss_mlp": 0.01272919, + "balance_loss_clip": 0.06290817, + "balance_loss_mlp": 0.0125741, + "epoch": 0.30909364196603034, + "flos": 20017541324160.0, + "grad_norm": 1.9132937458234096, + "language_loss": 0.78916645, + "learning_rate": 3.237036802553252e-06, + "loss": 0.86672109, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 5141, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.06494773, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 0.06291379, + "balance_loss_mlp": 0.01260543, + "epoch": 0.3091537652186983, + "flos": 19682830990080.0, + "grad_norm": 2.2087235088394728, + "language_loss": 0.8789897, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.95671201, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.16906738, + "step": 5142, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06498981, + "auxiliary_loss_mlp": 0.01276818, + "balance_loss_clip": 0.06294474, + "balance_loss_mlp": 0.01259438, + "epoch": 0.3092138884713663, + "flos": 17025845546880.0, + "grad_norm": 2.3473661014686984, + "language_loss": 0.7985431, + "learning_rate": 3.23642465389567e-06, + "loss": 0.87630117, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.17382812, + "step": 5143, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.06489455, + "auxiliary_loss_mlp": 0.01277055, + "balance_loss_clip": 0.06291586, + "balance_loss_mlp": 0.01260378, + "epoch": 0.3092740117240343, + "flos": 25017052636800.0, + "grad_norm": 1.6187717199492768, + "language_loss": 0.72479737, + "learning_rate": 3.236118509233055e-06, + "loss": 0.8024624, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16662598, + "step": 5144, + "time_per_iteration": 2.547358989715576 + }, + { + "auxiliary_loss_clip": 0.06496169, + "auxiliary_loss_mlp": 0.01272398, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01256138, + "epoch": 0.30933413497670226, + "flos": 25597013472000.0, + "grad_norm": 2.2714150562550466, + "language_loss": 0.74676621, + "learning_rate": 3.235812317696702e-06, + "loss": 0.82445192, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.16271973, + "step": 5145, + "time_per_iteration": 2.6273365020751953 + }, + { + "auxiliary_loss_clip": 0.06490701, + "auxiliary_loss_mlp": 0.01273039, + "balance_loss_clip": 0.06289125, + "balance_loss_mlp": 0.01256296, + "epoch": 0.3093942582293702, + "flos": 24396617479680.0, + "grad_norm": 1.731689317121935, + "language_loss": 0.76830649, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.84594393, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.16729736, + "step": 5146, + "time_per_iteration": 2.5352702140808105 + }, + { + "auxiliary_loss_clip": 0.06485911, + "auxiliary_loss_mlp": 0.01273533, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.0125707, + "epoch": 0.3094543814820382, + "flos": 19652586865920.0, + "grad_norm": 1.8011449994622988, + "language_loss": 0.66675043, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.74434483, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16467285, + "step": 5147, + "time_per_iteration": 2.545940637588501 + }, + { + "auxiliary_loss_clip": 0.06492072, + "auxiliary_loss_mlp": 0.01271267, + "balance_loss_clip": 0.0628895, + "balance_loss_mlp": 0.01253731, + "epoch": 0.30951450473470615, + "flos": 25670499102720.0, + "grad_norm": 1.8580519203508368, + "language_loss": 0.74971956, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.82735288, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.17529297, + "step": 5148, + "time_per_iteration": 2.5673537254333496 + }, + { + "auxiliary_loss_clip": 0.06501722, + "auxiliary_loss_mlp": 0.01278545, + "balance_loss_clip": 0.06290632, + "balance_loss_mlp": 0.01260342, + "epoch": 0.3095746279873741, + "flos": 12025202204160.0, + "grad_norm": 2.1335435485893166, + "language_loss": 0.73367, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.18212891, + "step": 5149, + "time_per_iteration": 2.682609796524048 + }, + { + "auxiliary_loss_clip": 0.06497431, + "auxiliary_loss_mlp": 0.01277143, + "balance_loss_clip": 0.06292653, + "balance_loss_mlp": 0.01258534, + "epoch": 0.3096347512400421, + "flos": 23629798039680.0, + "grad_norm": 1.913638713978071, + "language_loss": 0.85296845, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.93071413, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.18591309, + "step": 5150, + "time_per_iteration": 3.9813008308410645 + }, + { + "auxiliary_loss_clip": 0.06483387, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 0.06285527, + "balance_loss_mlp": 0.01256815, + "epoch": 0.30969487449271005, + "flos": 22536024768000.0, + "grad_norm": 1.8960829077128427, + "language_loss": 0.79181123, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86938894, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.17565918, + "step": 5151, + "time_per_iteration": 2.5336477756500244 + }, + { + "auxiliary_loss_clip": 0.06493182, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.06291731, + "balance_loss_mlp": 0.01257426, + "epoch": 0.309754997745378, + "flos": 15273301075200.0, + "grad_norm": 2.079664023782487, + "language_loss": 0.67843604, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.75611162, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.16931152, + "step": 5152, + "time_per_iteration": 5.332815647125244 + }, + { + "auxiliary_loss_clip": 0.06492282, + "auxiliary_loss_mlp": 0.01278303, + "balance_loss_clip": 0.06293005, + "balance_loss_mlp": 0.01261888, + "epoch": 0.309815120998046, + "flos": 26986532129280.0, + "grad_norm": 1.9990242894688834, + "language_loss": 0.83170605, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.90941191, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16394043, + "step": 5153, + "time_per_iteration": 2.5944862365722656 + }, + { + "auxiliary_loss_clip": 0.06488585, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06290878, + "balance_loss_mlp": 0.0125709, + "epoch": 0.30987524425071394, + "flos": 21149692565760.0, + "grad_norm": 1.7708804151784365, + "language_loss": 0.74136615, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.81899732, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.529526948928833 + }, + { + "auxiliary_loss_clip": 0.0648791, + "auxiliary_loss_mlp": 0.01284436, + "balance_loss_clip": 0.06292189, + "balance_loss_mlp": 0.01267544, + "epoch": 0.3099353675033819, + "flos": 15273720345600.0, + "grad_norm": 2.7515131151360763, + "language_loss": 0.76419097, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84191442, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16882324, + "step": 5155, + "time_per_iteration": 2.5338993072509766 + }, + { + "auxiliary_loss_clip": 0.06490543, + "auxiliary_loss_mlp": 0.01273122, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01256373, + "epoch": 0.30999549075604993, + "flos": 15419182233600.0, + "grad_norm": 1.684257178792462, + "language_loss": 0.79886794, + "learning_rate": 3.232441120452094e-06, + "loss": 0.87650466, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1673584, + "step": 5156, + "time_per_iteration": 2.5190272331237793 + }, + { + "auxiliary_loss_clip": 0.06493768, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 0.06290715, + "balance_loss_mlp": 0.01264821, + "epoch": 0.3100556140087179, + "flos": 23191106388480.0, + "grad_norm": 2.1803769191775197, + "language_loss": 0.74967813, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82743037, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.16625977, + "step": 5157, + "time_per_iteration": 2.59045147895813 + }, + { + "auxiliary_loss_clip": 0.06486322, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 0.06289537, + "balance_loss_mlp": 0.01258921, + "epoch": 0.31011573726138586, + "flos": 25749770664960.0, + "grad_norm": 2.4337865277632065, + "language_loss": 0.69860423, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7762109, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1541748, + "step": 5158, + "time_per_iteration": 4.041999578475952 + }, + { + "auxiliary_loss_clip": 0.06488799, + "auxiliary_loss_mlp": 0.0127365, + "balance_loss_clip": 0.0629247, + "balance_loss_mlp": 0.0125795, + "epoch": 0.3101758605140538, + "flos": 20017541324160.0, + "grad_norm": 2.0387737109261477, + "language_loss": 0.84883308, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92645758, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.15673828, + "step": 5159, + "time_per_iteration": 2.5081369876861572 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127455, + "balance_loss_clip": 0.06293043, + "balance_loss_mlp": 0.01257002, + "epoch": 0.3102359837667218, + "flos": 19141751249280.0, + "grad_norm": 1.926707434190644, + "language_loss": 0.85498118, + "learning_rate": 3.231213827702462e-06, + "loss": 0.93264508, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17529297, + "step": 5160, + "time_per_iteration": 2.5466468334198 + }, + { + "auxiliary_loss_clip": 0.06486624, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.06291263, + "balance_loss_mlp": 0.01253945, + "epoch": 0.31029610701938976, + "flos": 22270649287680.0, + "grad_norm": 1.6869427612303989, + "language_loss": 0.75787026, + "learning_rate": 3.230906887766584e-06, + "loss": 0.83543712, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.16113281, + "step": 5161, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.06491208, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 0.06289751, + "balance_loss_mlp": 0.01256915, + "epoch": 0.3103562302720577, + "flos": 20810244476160.0, + "grad_norm": 2.463900279304932, + "language_loss": 0.8222912, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.89995265, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.18029785, + "step": 5162, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.06485277, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06289959, + "balance_loss_mlp": 0.01253594, + "epoch": 0.3104163535247257, + "flos": 22350382047360.0, + "grad_norm": 1.4717884967200954, + "language_loss": 0.83087295, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.90841573, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.15423584, + "step": 5163, + "time_per_iteration": 2.542052745819092 + }, + { + "auxiliary_loss_clip": 0.06490193, + "auxiliary_loss_mlp": 0.01271791, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125559, + "epoch": 0.31047647677739365, + "flos": 21695803551360.0, + "grad_norm": 1.756895513371669, + "language_loss": 0.76630449, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84392428, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.16186523, + "step": 5164, + "time_per_iteration": 2.5616652965545654 + }, + { + "auxiliary_loss_clip": 0.06486434, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.0628885, + "balance_loss_mlp": 0.01258331, + "epoch": 0.3105366000300616, + "flos": 18923390709120.0, + "grad_norm": 1.866784827400394, + "language_loss": 0.75307393, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83068419, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16271973, + "step": 5165, + "time_per_iteration": 2.5190699100494385 + }, + { + "auxiliary_loss_clip": 0.06483215, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.062862, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3105967232827296, + "flos": 18266380444800.0, + "grad_norm": 1.5432274368627708, + "language_loss": 0.76476973, + "learning_rate": 3.229371488178348e-06, + "loss": 0.84231985, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.16699219, + "step": 5166, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.06486712, + "auxiliary_loss_mlp": 0.01273485, + "balance_loss_clip": 0.06287863, + "balance_loss_mlp": 0.01256796, + "epoch": 0.31065684653539755, + "flos": 17677279514880.0, + "grad_norm": 2.119255684006569, + "language_loss": 0.74129677, + "learning_rate": 3.229064268360444e-06, + "loss": 0.81889874, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.16687012, + "step": 5167, + "time_per_iteration": 2.5039737224578857 + }, + { + "auxiliary_loss_clip": 0.06378125, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06284033, + "balance_loss_mlp": 0.01258356, + "epoch": 0.3107169697880655, + "flos": 68551522151040.0, + "grad_norm": 0.7172817016896729, + "language_loss": 0.53065968, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.60705864, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.03417969, + "step": 5168, + "time_per_iteration": 3.211498737335205 + }, + { + "auxiliary_loss_clip": 0.06491841, + "auxiliary_loss_mlp": 0.0127061, + "balance_loss_clip": 0.06290184, + "balance_loss_mlp": 0.01254052, + "epoch": 0.3107770930407335, + "flos": 13193844698880.0, + "grad_norm": 1.7226101243088363, + "language_loss": 0.79536855, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87299311, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16552734, + "step": 5169, + "time_per_iteration": 2.526906728744507 + }, + { + "auxiliary_loss_clip": 0.06491011, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06288561, + "balance_loss_mlp": 0.01254328, + "epoch": 0.3108372162934015, + "flos": 31589587048320.0, + "grad_norm": 1.7384868970357352, + "language_loss": 0.6439994, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.17077637, + "step": 5170, + "time_per_iteration": 2.659008264541626 + }, + { + "auxiliary_loss_clip": 0.06488822, + "auxiliary_loss_mlp": 0.01276189, + "balance_loss_clip": 0.06288925, + "balance_loss_mlp": 0.01258927, + "epoch": 0.31089733954606946, + "flos": 28737231811200.0, + "grad_norm": 2.2754975952460086, + "language_loss": 0.77238673, + "learning_rate": 3.22783492314295e-06, + "loss": 0.8500368, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17260742, + "step": 5171, + "time_per_iteration": 2.5726847648620605 + }, + { + "auxiliary_loss_clip": 0.06489364, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 0.06290348, + "balance_loss_mlp": 0.01258294, + "epoch": 0.3109574627987374, + "flos": 19689455462400.0, + "grad_norm": 1.774750718996553, + "language_loss": 0.84023309, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.91787583, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16625977, + "step": 5172, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.06485899, + "auxiliary_loss_mlp": 0.01271683, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3110175860514054, + "flos": 14689231390080.0, + "grad_norm": 2.444929493076507, + "language_loss": 0.8466565, + "learning_rate": 3.227219971129842e-06, + "loss": 0.92423236, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17199707, + "step": 5173, + "time_per_iteration": 2.477851629257202 + }, + { + "auxiliary_loss_clip": 0.06478094, + "auxiliary_loss_mlp": 0.01270979, + "balance_loss_clip": 0.06285643, + "balance_loss_mlp": 0.01255279, + "epoch": 0.31107770930407336, + "flos": 25746835772160.0, + "grad_norm": 1.6684709759498597, + "language_loss": 0.83928138, + "learning_rate": 3.226912425313001e-06, + "loss": 0.91677213, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 5174, + "time_per_iteration": 2.6188318729400635 + }, + { + "auxiliary_loss_clip": 0.06483682, + "auxiliary_loss_mlp": 0.0127308, + "balance_loss_clip": 0.06284115, + "balance_loss_mlp": 0.01256057, + "epoch": 0.3111378325567413, + "flos": 19214272558080.0, + "grad_norm": 2.0188284806938945, + "language_loss": 0.85820258, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.93577021, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17016602, + "step": 5175, + "time_per_iteration": 2.489356756210327 + }, + { + "auxiliary_loss_clip": 0.06477995, + "auxiliary_loss_mlp": 0.01275126, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3111979558094093, + "flos": 23703199816320.0, + "grad_norm": 1.907748003287586, + "language_loss": 0.84357607, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92110729, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17773438, + "step": 5176, + "time_per_iteration": 2.599229574203491 + }, + { + "auxiliary_loss_clip": 0.06476277, + "auxiliary_loss_mlp": 0.01273206, + "balance_loss_clip": 0.06279132, + "balance_loss_mlp": 0.01255468, + "epoch": 0.31125807906207725, + "flos": 21039422192640.0, + "grad_norm": 2.9714078029027977, + "language_loss": 0.80720133, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.88469613, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.17736816, + "step": 5177, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.06482373, + "auxiliary_loss_mlp": 0.01272639, + "balance_loss_clip": 0.06283157, + "balance_loss_mlp": 0.01255353, + "epoch": 0.3113182023147452, + "flos": 23083435491840.0, + "grad_norm": 1.9531801027744504, + "language_loss": 0.81037831, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.88792837, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17285156, + "step": 5178, + "time_per_iteration": 2.6086864471435547 + }, + { + "auxiliary_loss_clip": 0.06483644, + "auxiliary_loss_mlp": 0.01276661, + "balance_loss_clip": 0.06283852, + "balance_loss_mlp": 0.01259316, + "epoch": 0.3113783255674132, + "flos": 11843919895680.0, + "grad_norm": 1.9055325557306373, + "language_loss": 0.81524587, + "learning_rate": 3.225373998592471e-06, + "loss": 0.89284897, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.17346191, + "step": 5179, + "time_per_iteration": 2.4582295417785645 + }, + { + "auxiliary_loss_clip": 0.06482498, + "auxiliary_loss_mlp": 0.01272412, + "balance_loss_clip": 0.06285708, + "balance_loss_mlp": 0.01255926, + "epoch": 0.31143844882008115, + "flos": 16295098089600.0, + "grad_norm": 1.625598326664227, + "language_loss": 0.78714401, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.86469316, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.16491699, + "step": 5180, + "time_per_iteration": 2.4980807304382324 + }, + { + "auxiliary_loss_clip": 0.06486566, + "auxiliary_loss_mlp": 0.01274849, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01257742, + "epoch": 0.3114985720727491, + "flos": 23223824208000.0, + "grad_norm": 4.8505374097148595, + "language_loss": 0.83649975, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91411394, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.17102051, + "step": 5181, + "time_per_iteration": 2.519810438156128 + }, + { + "auxiliary_loss_clip": 0.0648061, + "auxiliary_loss_mlp": 0.01273344, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01258348, + "epoch": 0.3115586953254171, + "flos": 30052468224000.0, + "grad_norm": 1.6592506395593873, + "language_loss": 0.74442661, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.82196611, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.15002441, + "step": 5182, + "time_per_iteration": 2.6227729320526123 + }, + { + "auxiliary_loss_clip": 0.06490366, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06288615, + "balance_loss_mlp": 0.01254362, + "epoch": 0.3116188185780851, + "flos": 25673433995520.0, + "grad_norm": 2.0195817263542852, + "language_loss": 0.70974112, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.78735352, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.16503906, + "step": 5183, + "time_per_iteration": 2.5801775455474854 + }, + { + "auxiliary_loss_clip": 0.06369011, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.0126376, + "epoch": 0.31167894183075306, + "flos": 69528568285440.0, + "grad_norm": 0.9410725627351464, + "language_loss": 0.59133947, + "learning_rate": 3.223834410214408e-06, + "loss": 0.66769892, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.03182983, + "step": 5184, + "time_per_iteration": 3.1446807384490967 + }, + { + "auxiliary_loss_clip": 0.06488199, + "auxiliary_loss_mlp": 0.01277241, + "balance_loss_clip": 0.06288702, + "balance_loss_mlp": 0.01260206, + "epoch": 0.31173906508342103, + "flos": 14945215213440.0, + "grad_norm": 2.5697318046341424, + "language_loss": 0.69689488, + "learning_rate": 3.223526353268311e-06, + "loss": 0.77454925, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17041016, + "step": 5185, + "time_per_iteration": 2.51505446434021 + }, + { + "auxiliary_loss_clip": 0.06492566, + "auxiliary_loss_mlp": 0.01273506, + "balance_loss_clip": 0.06291321, + "balance_loss_mlp": 0.01256507, + "epoch": 0.311799188336089, + "flos": 16180886574720.0, + "grad_norm": 2.500262239817252, + "language_loss": 0.63946617, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.71712691, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.17004395, + "step": 5186, + "time_per_iteration": 2.505030870437622 + }, + { + "auxiliary_loss_clip": 0.06492127, + "auxiliary_loss_mlp": 0.01277284, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01258592, + "epoch": 0.31185931158875696, + "flos": 25016633366400.0, + "grad_norm": 2.1681671670490603, + "language_loss": 0.86641979, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.94411391, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.18688965, + "step": 5187, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.06487665, + "auxiliary_loss_mlp": 0.01281669, + "balance_loss_clip": 0.06287494, + "balance_loss_mlp": 0.01264527, + "epoch": 0.3119194348414249, + "flos": 37242041702400.0, + "grad_norm": 1.4465041932602023, + "language_loss": 0.6305244, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70821768, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 5188, + "time_per_iteration": 2.7036139965057373 + }, + { + "auxiliary_loss_clip": 0.06486794, + "auxiliary_loss_mlp": 0.01278194, + "balance_loss_clip": 0.06286722, + "balance_loss_mlp": 0.01261397, + "epoch": 0.3119795580940929, + "flos": 15018155792640.0, + "grad_norm": 2.1005201528303683, + "language_loss": 0.83722234, + "learning_rate": 3.222293661638346e-06, + "loss": 0.91487223, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 5189, + "time_per_iteration": 3.933061361312866 + }, + { + "auxiliary_loss_clip": 0.06481164, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01262602, + "epoch": 0.31203968134676086, + "flos": 16003755043200.0, + "grad_norm": 2.4405990352060862, + "language_loss": 0.79429829, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87189662, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.16064453, + "step": 5190, + "time_per_iteration": 2.479335308074951 + }, + { + "auxiliary_loss_clip": 0.0648755, + "auxiliary_loss_mlp": 0.01275874, + "balance_loss_clip": 0.06287287, + "balance_loss_mlp": 0.01259292, + "epoch": 0.3120998045994288, + "flos": 23843378897280.0, + "grad_norm": 1.451249914697294, + "language_loss": 0.75502658, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.83266091, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.16589355, + "step": 5191, + "time_per_iteration": 3.997621536254883 + }, + { + "auxiliary_loss_clip": 0.06364973, + "auxiliary_loss_mlp": 0.01267778, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01264178, + "epoch": 0.3121599278520968, + "flos": 69203081900160.0, + "grad_norm": 0.8286054534369729, + "language_loss": 0.63964236, + "learning_rate": 3.221368656205247e-06, + "loss": 0.71596992, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.03594971, + "step": 5192, + "time_per_iteration": 4.631687879562378 + }, + { + "auxiliary_loss_clip": 0.06487048, + "auxiliary_loss_mlp": 0.01274026, + "balance_loss_clip": 0.06284614, + "balance_loss_mlp": 0.01254916, + "epoch": 0.31222005110476475, + "flos": 23813302481280.0, + "grad_norm": 1.6272414578256373, + "language_loss": 0.80280936, + "learning_rate": 3.221060228416446e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.19116211, + "step": 5193, + "time_per_iteration": 2.5469777584075928 + }, + { + "auxiliary_loss_clip": 0.06487141, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06286725, + "balance_loss_mlp": 0.01255244, + "epoch": 0.3122801743574327, + "flos": 25232771773440.0, + "grad_norm": 1.8740192083695482, + "language_loss": 0.72266662, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.80028057, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.19006348, + "step": 5194, + "time_per_iteration": 2.5416929721832275 + }, + { + "auxiliary_loss_clip": 0.06483766, + "auxiliary_loss_mlp": 0.01273792, + "balance_loss_clip": 0.06285778, + "balance_loss_mlp": 0.01257604, + "epoch": 0.3123402976101007, + "flos": 22973165118720.0, + "grad_norm": 1.4810805631902553, + "language_loss": 0.77076054, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.8483361, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16186523, + "step": 5195, + "time_per_iteration": 2.5890305042266846 + }, + { + "auxiliary_loss_clip": 0.06489303, + "auxiliary_loss_mlp": 0.01273064, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256268, + "epoch": 0.3124004208627687, + "flos": 25199131559040.0, + "grad_norm": 1.3828607146804377, + "language_loss": 0.78218812, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85981178, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.16796875, + "step": 5196, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.06360652, + "auxiliary_loss_mlp": 0.0126022, + "balance_loss_clip": 0.06268834, + "balance_loss_mlp": 0.01256831, + "epoch": 0.31246054411543667, + "flos": 67506398974080.0, + "grad_norm": 0.7576873975695796, + "language_loss": 0.54860902, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.62481773, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.03396606, + "step": 5197, + "time_per_iteration": 4.588749170303345 + }, + { + "auxiliary_loss_clip": 0.06482677, + "auxiliary_loss_mlp": 0.0127766, + "balance_loss_clip": 0.06286696, + "balance_loss_mlp": 0.01261424, + "epoch": 0.31252066736810463, + "flos": 17864347754880.0, + "grad_norm": 1.7824095594325715, + "language_loss": 0.67078102, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74838442, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.16247559, + "step": 5198, + "time_per_iteration": 2.5304651260375977 + }, + { + "auxiliary_loss_clip": 0.06490927, + "auxiliary_loss_mlp": 0.01280145, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261596, + "epoch": 0.3125807906207726, + "flos": 18480338645760.0, + "grad_norm": 2.4146329055675264, + "language_loss": 0.70401263, + "learning_rate": 3.219208689735857e-06, + "loss": 0.78172338, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 2.5358517169952393 + }, + { + "auxiliary_loss_clip": 0.06486207, + "auxiliary_loss_mlp": 0.01275953, + "balance_loss_clip": 0.06286721, + "balance_loss_mlp": 0.01258751, + "epoch": 0.31264091387344056, + "flos": 18951454627200.0, + "grad_norm": 1.7917967449154466, + "language_loss": 0.79258394, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.87020558, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.17211914, + "step": 5200, + "time_per_iteration": 2.5519278049468994 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 0.06284697, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3127010371261085, + "flos": 21474591972480.0, + "grad_norm": 1.8808343302197998, + "language_loss": 0.83758473, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.91515636, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.16711426, + "step": 5201, + "time_per_iteration": 2.509331226348877 + }, + { + "auxiliary_loss_clip": 0.06487838, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 0.06288306, + "balance_loss_mlp": 0.01262006, + "epoch": 0.3127611603787765, + "flos": 15340623431040.0, + "grad_norm": 2.173524859167814, + "language_loss": 0.69690537, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.17224121, + "step": 5202, + "time_per_iteration": 2.52652907371521 + }, + { + "auxiliary_loss_clip": 0.06486704, + "auxiliary_loss_mlp": 0.01278352, + "balance_loss_clip": 0.06286184, + "balance_loss_mlp": 0.01261257, + "epoch": 0.31282128363144446, + "flos": 17608741274880.0, + "grad_norm": 2.6038382996561604, + "language_loss": 0.83874559, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.91639626, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.17077637, + "step": 5203, + "time_per_iteration": 2.502721071243286 + }, + { + "auxiliary_loss_clip": 0.06488604, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 0.06287186, + "balance_loss_mlp": 0.01256604, + "epoch": 0.3128814068841124, + "flos": 26763349979520.0, + "grad_norm": 2.412675439541041, + "language_loss": 0.61310971, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69073772, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.17602539, + "step": 5204, + "time_per_iteration": 2.62591814994812 + }, + { + "auxiliary_loss_clip": 0.06482827, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 0.0628654, + "balance_loss_mlp": 0.01259553, + "epoch": 0.3129415301367804, + "flos": 22278783133440.0, + "grad_norm": 1.7324044566720012, + "language_loss": 0.66418731, + "learning_rate": 3.217355486684887e-06, + "loss": 0.74176717, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.15582275, + "step": 5205, + "time_per_iteration": 2.512777328491211 + }, + { + "auxiliary_loss_clip": 0.06487758, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 0.06287788, + "balance_loss_mlp": 0.01260021, + "epoch": 0.31300165338944835, + "flos": 26471461881600.0, + "grad_norm": 1.8344199627772577, + "language_loss": 0.77298087, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.85063475, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17614746, + "step": 5206, + "time_per_iteration": 2.5712244510650635 + }, + { + "auxiliary_loss_clip": 0.06485735, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06288184, + "balance_loss_mlp": 0.01255488, + "epoch": 0.3130617766421163, + "flos": 21951116542080.0, + "grad_norm": 2.0121384013718226, + "language_loss": 0.83184564, + "learning_rate": 3.216737382911672e-06, + "loss": 0.90941995, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.16210938, + "step": 5207, + "time_per_iteration": 2.5004825592041016 + }, + { + "auxiliary_loss_clip": 0.06481713, + "auxiliary_loss_mlp": 0.01271341, + "balance_loss_clip": 0.06286129, + "balance_loss_mlp": 0.0125489, + "epoch": 0.3131218998947843, + "flos": 23299154628480.0, + "grad_norm": 2.0890442442793478, + "language_loss": 0.71795774, + "learning_rate": 3.216428261810999e-06, + "loss": 0.79548824, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 5208, + "time_per_iteration": 2.5763585567474365 + }, + { + "auxiliary_loss_clip": 0.06485837, + "auxiliary_loss_mlp": 0.01275661, + "balance_loss_clip": 0.06287587, + "balance_loss_mlp": 0.0125927, + "epoch": 0.3131820231474523, + "flos": 21145583715840.0, + "grad_norm": 1.890905451265213, + "language_loss": 0.74832964, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82594466, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.1640625, + "step": 5209, + "time_per_iteration": 2.510582685470581 + }, + { + "auxiliary_loss_clip": 0.06483819, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06284019, + "balance_loss_mlp": 0.01255678, + "epoch": 0.31324214640012027, + "flos": 23915816352000.0, + "grad_norm": 1.8368712630160764, + "language_loss": 0.77846575, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.85602105, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.16027832, + "step": 5210, + "time_per_iteration": 2.5457394123077393 + }, + { + "auxiliary_loss_clip": 0.06472643, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01262963, + "epoch": 0.31330226965278823, + "flos": 22243507764480.0, + "grad_norm": 1.7690758446531836, + "language_loss": 0.79563594, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.87314838, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15643311, + "step": 5211, + "time_per_iteration": 2.5383517742156982 + }, + { + "auxiliary_loss_clip": 0.0648172, + "auxiliary_loss_mlp": 0.01270065, + "balance_loss_clip": 0.06285914, + "balance_loss_mlp": 0.01254699, + "epoch": 0.3133623929054562, + "flos": 19759838492160.0, + "grad_norm": 1.6892345584465767, + "language_loss": 0.79993588, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.87745374, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.15368652, + "step": 5212, + "time_per_iteration": 2.5550856590270996 + }, + { + "auxiliary_loss_clip": 0.06489062, + "auxiliary_loss_mlp": 0.01276168, + "balance_loss_clip": 0.06287421, + "balance_loss_mlp": 0.01258919, + "epoch": 0.31342251615812416, + "flos": 27169617300480.0, + "grad_norm": 2.030797991853156, + "language_loss": 0.71651685, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.79416913, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.17248535, + "step": 5213, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.06486979, + "auxiliary_loss_mlp": 0.01275678, + "balance_loss_clip": 0.06285015, + "balance_loss_mlp": 0.01258763, + "epoch": 0.31348263941079213, + "flos": 20235985718400.0, + "grad_norm": 2.164105834219518, + "language_loss": 0.77949297, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.85711956, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.16918945, + "step": 5214, + "time_per_iteration": 2.539149761199951 + }, + { + "auxiliary_loss_clip": 0.06478322, + "auxiliary_loss_mlp": 0.0127674, + "balance_loss_clip": 0.06285194, + "balance_loss_mlp": 0.01261267, + "epoch": 0.3135427626634601, + "flos": 24614474895360.0, + "grad_norm": 1.5354860146289633, + "language_loss": 0.82935429, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90690494, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.15472412, + "step": 5215, + "time_per_iteration": 2.541269302368164 + }, + { + "auxiliary_loss_clip": 0.06486098, + "auxiliary_loss_mlp": 0.01273565, + "balance_loss_clip": 0.06288007, + "balance_loss_mlp": 0.01257186, + "epoch": 0.31360288591612806, + "flos": 20966230051200.0, + "grad_norm": 1.8278899125375987, + "language_loss": 0.79790628, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87550294, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16369629, + "step": 5216, + "time_per_iteration": 2.5465261936187744 + }, + { + "auxiliary_loss_clip": 0.06489767, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.06286536, + "balance_loss_mlp": 0.01258722, + "epoch": 0.313663009168796, + "flos": 26987957648640.0, + "grad_norm": 1.8964979694160957, + "language_loss": 0.68953168, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76720947, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.19299316, + "step": 5217, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.06480299, + "auxiliary_loss_mlp": 0.01275451, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01259298, + "epoch": 0.313723132421464, + "flos": 18046762093440.0, + "grad_norm": 1.6389262097165689, + "language_loss": 0.80772746, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.88528496, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16149902, + "step": 5218, + "time_per_iteration": 2.5255727767944336 + }, + { + "auxiliary_loss_clip": 0.06485314, + "auxiliary_loss_mlp": 0.0127641, + "balance_loss_clip": 0.06285116, + "balance_loss_mlp": 0.01259363, + "epoch": 0.31378325567413196, + "flos": 22494963467520.0, + "grad_norm": 2.253901481236794, + "language_loss": 0.70057523, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.17047119, + "step": 5219, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.06483484, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06284904, + "balance_loss_mlp": 0.01259181, + "epoch": 0.3138433789267999, + "flos": 22425838248960.0, + "grad_norm": 1.9320324134388631, + "language_loss": 0.80156839, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.17736816, + "step": 5220, + "time_per_iteration": 2.5364530086517334 + }, + { + "auxiliary_loss_clip": 0.06484166, + "auxiliary_loss_mlp": 0.01276534, + "balance_loss_clip": 0.06287254, + "balance_loss_mlp": 0.01260751, + "epoch": 0.3139035021794679, + "flos": 13010927235840.0, + "grad_norm": 1.8390249578816682, + "language_loss": 0.73235905, + "learning_rate": 3.212405494206986e-06, + "loss": 0.80996603, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.15771484, + "step": 5221, + "time_per_iteration": 2.477369546890259 + }, + { + "auxiliary_loss_clip": 0.06480553, + "auxiliary_loss_mlp": 0.0127616, + "balance_loss_clip": 0.0628504, + "balance_loss_mlp": 0.0125996, + "epoch": 0.31396362543213585, + "flos": 16951605229440.0, + "grad_norm": 1.9354629264259422, + "language_loss": 0.81906354, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.89663064, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.16223145, + "step": 5222, + "time_per_iteration": 2.5057129859924316 + }, + { + "auxiliary_loss_clip": 0.06490297, + "auxiliary_loss_mlp": 0.01284294, + "balance_loss_clip": 0.06288279, + "balance_loss_mlp": 0.01266555, + "epoch": 0.31402374868480387, + "flos": 20162877431040.0, + "grad_norm": 1.9084075298763516, + "language_loss": 0.70490289, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.78264874, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17749023, + "step": 5223, + "time_per_iteration": 2.4747233390808105 + }, + { + "auxiliary_loss_clip": 0.06484593, + "auxiliary_loss_mlp": 0.01276253, + "balance_loss_clip": 0.06287414, + "balance_loss_mlp": 0.01259718, + "epoch": 0.31408387193747184, + "flos": 21257363462400.0, + "grad_norm": 1.5262001080385015, + "language_loss": 0.80608702, + "learning_rate": 3.211476058893379e-06, + "loss": 0.88369542, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.1652832, + "step": 5224, + "time_per_iteration": 2.576864004135132 + }, + { + "auxiliary_loss_clip": 0.06497495, + "auxiliary_loss_mlp": 0.01279621, + "balance_loss_clip": 0.06291461, + "balance_loss_mlp": 0.01261632, + "epoch": 0.3141439951901398, + "flos": 27490617492480.0, + "grad_norm": 2.962077450034062, + "language_loss": 0.58624607, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66401726, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.17993164, + "step": 5225, + "time_per_iteration": 2.558159828186035 + }, + { + "auxiliary_loss_clip": 0.06482717, + "auxiliary_loss_mlp": 0.0128044, + "balance_loss_clip": 0.06289019, + "balance_loss_mlp": 0.0126505, + "epoch": 0.31420411844280777, + "flos": 17857010522880.0, + "grad_norm": 1.7568792542410607, + "language_loss": 0.81975454, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.89738619, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.15380859, + "step": 5226, + "time_per_iteration": 2.5197925567626953 + }, + { + "auxiliary_loss_clip": 0.06493273, + "auxiliary_loss_mlp": 0.01283534, + "balance_loss_clip": 0.0629416, + "balance_loss_mlp": 0.01265998, + "epoch": 0.31426424169547573, + "flos": 21623491877760.0, + "grad_norm": 1.9094319640845634, + "language_loss": 0.74358761, + "learning_rate": 3.210546210126141e-06, + "loss": 0.8213557, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.17529297, + "step": 5227, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.06493893, + "auxiliary_loss_mlp": 0.01287677, + "balance_loss_clip": 0.0629607, + "balance_loss_mlp": 0.01270392, + "epoch": 0.3143243649481437, + "flos": 30928677569280.0, + "grad_norm": 1.9492252245216757, + "language_loss": 0.68802202, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76583767, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.17297363, + "step": 5228, + "time_per_iteration": 2.724705934524536 + }, + { + "auxiliary_loss_clip": 0.06488988, + "auxiliary_loss_mlp": 0.01281066, + "balance_loss_clip": 0.06292454, + "balance_loss_mlp": 0.01265044, + "epoch": 0.31438448820081166, + "flos": 22828206355200.0, + "grad_norm": 1.7089427628420442, + "language_loss": 0.80276144, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88046199, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16015625, + "step": 5229, + "time_per_iteration": 4.091265678405762 + }, + { + "auxiliary_loss_clip": 0.06481495, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.01259428, + "epoch": 0.3144446114534796, + "flos": 23298399941760.0, + "grad_norm": 1.658320923858175, + "language_loss": 0.70112014, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7786932, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.16381836, + "step": 5230, + "time_per_iteration": 2.5652499198913574 + }, + { + "auxiliary_loss_clip": 0.06489812, + "auxiliary_loss_mlp": 0.01281571, + "balance_loss_clip": 0.06291179, + "balance_loss_mlp": 0.01264572, + "epoch": 0.3145047347061476, + "flos": 31363679640960.0, + "grad_norm": 2.930398163442548, + "language_loss": 0.80236816, + "learning_rate": 3.209305769168239e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.17004395, + "step": 5231, + "time_per_iteration": 5.461926698684692 + }, + { + "auxiliary_loss_clip": 0.06483024, + "auxiliary_loss_mlp": 0.01279077, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01262912, + "epoch": 0.31456485795881556, + "flos": 10894182992640.0, + "grad_norm": 3.377505802107346, + "language_loss": 0.85102671, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92864776, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16149902, + "step": 5232, + "time_per_iteration": 2.549555778503418 + }, + { + "auxiliary_loss_clip": 0.06479923, + "auxiliary_loss_mlp": 0.01286528, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01269779, + "epoch": 0.3146249812114835, + "flos": 17098157220480.0, + "grad_norm": 1.5771176865385883, + "language_loss": 0.80666757, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.88433212, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5233, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.06496342, + "auxiliary_loss_mlp": 0.01276742, + "balance_loss_clip": 0.06294576, + "balance_loss_mlp": 0.01260768, + "epoch": 0.3146851044641515, + "flos": 55303283352960.0, + "grad_norm": 1.6501859452394316, + "language_loss": 0.71124518, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.78897607, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.15966797, + "step": 5234, + "time_per_iteration": 2.8301026821136475 + }, + { + "auxiliary_loss_clip": 0.06491733, + "auxiliary_loss_mlp": 0.01276589, + "balance_loss_clip": 0.06292239, + "balance_loss_mlp": 0.01259566, + "epoch": 0.31474522771681945, + "flos": 27023149163520.0, + "grad_norm": 1.9231261360365097, + "language_loss": 0.73437119, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8120544, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.17004395, + "step": 5235, + "time_per_iteration": 2.543799638748169 + }, + { + "auxiliary_loss_clip": 0.0648193, + "auxiliary_loss_mlp": 0.0127527, + "balance_loss_clip": 0.0628682, + "balance_loss_mlp": 0.01259308, + "epoch": 0.3148053509694875, + "flos": 21258369711360.0, + "grad_norm": 1.9283939280374622, + "language_loss": 0.79554284, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.87311482, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.15942383, + "step": 5236, + "time_per_iteration": 2.5356431007385254 + }, + { + "auxiliary_loss_clip": 0.06493077, + "auxiliary_loss_mlp": 0.01277667, + "balance_loss_clip": 0.06288847, + "balance_loss_mlp": 0.01260942, + "epoch": 0.31486547422215544, + "flos": 31256721504000.0, + "grad_norm": 2.880510555000243, + "language_loss": 0.76337612, + "learning_rate": 3.207443732256881e-06, + "loss": 0.84108353, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.16723633, + "step": 5237, + "time_per_iteration": 4.129598379135132 + }, + { + "auxiliary_loss_clip": 0.0648271, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06291585, + "balance_loss_mlp": 0.01262843, + "epoch": 0.3149255974748234, + "flos": 19834749642240.0, + "grad_norm": 1.6736027402410734, + "language_loss": 0.7951014, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.87270594, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.14916992, + "step": 5238, + "time_per_iteration": 2.504612445831299 + }, + { + "auxiliary_loss_clip": 0.06376656, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06282751, + "balance_loss_mlp": 0.01263604, + "epoch": 0.31498572072749137, + "flos": 67701867350400.0, + "grad_norm": 0.8276402478045692, + "language_loss": 0.68007928, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.75652325, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.04141235, + "step": 5239, + "time_per_iteration": 3.174287796020508 + }, + { + "auxiliary_loss_clip": 0.06498836, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 0.06292844, + "balance_loss_mlp": 0.01256376, + "epoch": 0.31504584398015933, + "flos": 19799432346240.0, + "grad_norm": 2.176171670908613, + "language_loss": 0.82951081, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.9072417, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.17883301, + "step": 5240, + "time_per_iteration": 2.509793996810913 + }, + { + "auxiliary_loss_clip": 0.06485248, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06288239, + "balance_loss_mlp": 0.01262125, + "epoch": 0.3151059672328273, + "flos": 26622751628160.0, + "grad_norm": 1.8077188253124041, + "language_loss": 0.81193888, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88957721, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.16455078, + "step": 5241, + "time_per_iteration": 2.571192502975464 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01277268, + "balance_loss_clip": 0.06291743, + "balance_loss_mlp": 0.01260912, + "epoch": 0.31516609048549526, + "flos": 24210890904960.0, + "grad_norm": 1.4478120037649602, + "language_loss": 0.74484038, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82243454, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16357422, + "step": 5242, + "time_per_iteration": 2.526357650756836 + }, + { + "auxiliary_loss_clip": 0.06487267, + "auxiliary_loss_mlp": 0.01275494, + "balance_loss_clip": 0.06292535, + "balance_loss_mlp": 0.01259163, + "epoch": 0.31522621373816323, + "flos": 25965950999040.0, + "grad_norm": 1.6442244241642663, + "language_loss": 0.73668325, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81431091, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.16320801, + "step": 5243, + "time_per_iteration": 2.606276273727417 + }, + { + "auxiliary_loss_clip": 0.06485401, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.0628818, + "balance_loss_mlp": 0.0125713, + "epoch": 0.3152863369908312, + "flos": 21915379975680.0, + "grad_norm": 1.7357669101009914, + "language_loss": 0.64914608, + "learning_rate": 3.205269272758513e-06, + "loss": 0.72673857, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16711426, + "step": 5244, + "time_per_iteration": 2.5950305461883545 + }, + { + "auxiliary_loss_clip": 0.06492754, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 0.06292984, + "balance_loss_mlp": 0.01257743, + "epoch": 0.31534646024349916, + "flos": 16285203308160.0, + "grad_norm": 2.8540583379791005, + "language_loss": 0.91357732, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.99124765, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.16540527, + "step": 5245, + "time_per_iteration": 2.510085105895996 + }, + { + "auxiliary_loss_clip": 0.06488977, + "auxiliary_loss_mlp": 0.01277309, + "balance_loss_clip": 0.06291293, + "balance_loss_mlp": 0.01260596, + "epoch": 0.3154065834961671, + "flos": 24724116362880.0, + "grad_norm": 1.9445780779956967, + "language_loss": 0.75699973, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.83466256, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.1673584, + "step": 5246, + "time_per_iteration": 2.543600559234619 + }, + { + "auxiliary_loss_clip": 0.06488622, + "auxiliary_loss_mlp": 0.01279725, + "balance_loss_clip": 0.06290317, + "balance_loss_mlp": 0.01262833, + "epoch": 0.3154667067488351, + "flos": 35379813836160.0, + "grad_norm": 1.6152414177037249, + "language_loss": 0.61608225, + "learning_rate": 3.204336675750321e-06, + "loss": 0.69376576, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.16882324, + "step": 5247, + "time_per_iteration": 2.6849827766418457 + }, + { + "auxiliary_loss_clip": 0.06491058, + "auxiliary_loss_mlp": 0.01281873, + "balance_loss_clip": 0.06290263, + "balance_loss_mlp": 0.0126417, + "epoch": 0.31552683000150306, + "flos": 17462105429760.0, + "grad_norm": 2.6938697298202667, + "language_loss": 0.82848823, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.90621758, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.17687988, + "step": 5248, + "time_per_iteration": 2.4956586360931396 + }, + { + "auxiliary_loss_clip": 0.06488842, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06291078, + "balance_loss_mlp": 0.01264121, + "epoch": 0.3155869532541711, + "flos": 18411674624640.0, + "grad_norm": 4.654519722073602, + "language_loss": 0.85721719, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.93492711, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.18029785, + "step": 5249, + "time_per_iteration": 2.568054437637329 + }, + { + "auxiliary_loss_clip": 0.06489561, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 0.06290483, + "balance_loss_mlp": 0.01261198, + "epoch": 0.31564707650683904, + "flos": 21586162083840.0, + "grad_norm": 1.7795262086342007, + "language_loss": 0.86067384, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.93837023, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1887207, + "step": 5250, + "time_per_iteration": 2.508528709411621 + }, + { + "auxiliary_loss_clip": 0.06486481, + "auxiliary_loss_mlp": 0.01279989, + "balance_loss_clip": 0.06289366, + "balance_loss_mlp": 0.01262334, + "epoch": 0.315707199759507, + "flos": 21037032351360.0, + "grad_norm": 2.1261014211455063, + "language_loss": 0.6942147, + "learning_rate": 3.203092573767835e-06, + "loss": 0.77187943, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.1763916, + "step": 5251, + "time_per_iteration": 2.526685953140259 + }, + { + "auxiliary_loss_clip": 0.06487083, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06288725, + "balance_loss_mlp": 0.01255586, + "epoch": 0.31576732301217497, + "flos": 26835326236800.0, + "grad_norm": 2.019211823887184, + "language_loss": 0.78895354, + "learning_rate": 3.202781434189246e-06, + "loss": 0.86655623, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17602539, + "step": 5252, + "time_per_iteration": 2.570160150527954 + }, + { + "auxiliary_loss_clip": 0.06486022, + "auxiliary_loss_mlp": 0.01277329, + "balance_loss_clip": 0.06289184, + "balance_loss_mlp": 0.01261664, + "epoch": 0.31582744626484294, + "flos": 22717810200960.0, + "grad_norm": 1.5436537660689573, + "language_loss": 0.74377203, + "learning_rate": 3.202470249001066e-06, + "loss": 0.82140553, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.15661621, + "step": 5253, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.06489179, + "auxiliary_loss_mlp": 0.01281773, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01264309, + "epoch": 0.3158875695175109, + "flos": 23958806296320.0, + "grad_norm": 1.6773864910066614, + "language_loss": 0.73971915, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 5254, + "time_per_iteration": 2.588543653488159 + }, + { + "auxiliary_loss_clip": 0.06491473, + "auxiliary_loss_mlp": 0.01275265, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01257408, + "epoch": 0.31594769277017887, + "flos": 13267036840320.0, + "grad_norm": 2.7381317978754933, + "language_loss": 0.78115344, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85882092, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.17858887, + "step": 5255, + "time_per_iteration": 2.5159435272216797 + }, + { + "auxiliary_loss_clip": 0.0648552, + "auxiliary_loss_mlp": 0.01275031, + "balance_loss_clip": 0.06288838, + "balance_loss_mlp": 0.01255921, + "epoch": 0.31600781602284683, + "flos": 23375072027520.0, + "grad_norm": 2.9601180138118286, + "language_loss": 0.78838313, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.86598861, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.19104004, + "step": 5256, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.06480406, + "auxiliary_loss_mlp": 0.01272902, + "balance_loss_clip": 0.06291319, + "balance_loss_mlp": 0.01257352, + "epoch": 0.3160679392755148, + "flos": 19834707715200.0, + "grad_norm": 1.443888473305352, + "language_loss": 0.71476674, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.79229981, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15533447, + "step": 5257, + "time_per_iteration": 2.515044927597046 + }, + { + "auxiliary_loss_clip": 0.06490695, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 0.06291541, + "balance_loss_mlp": 0.01257787, + "epoch": 0.31612806252818276, + "flos": 20199368684160.0, + "grad_norm": 3.1125237193001967, + "language_loss": 0.77181315, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.84947205, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.17419434, + "step": 5258, + "time_per_iteration": 2.544926166534424 + }, + { + "auxiliary_loss_clip": 0.06484105, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 0.06286652, + "balance_loss_mlp": 0.01258624, + "epoch": 0.31618818578085073, + "flos": 24241596226560.0, + "grad_norm": 2.554871248122792, + "language_loss": 0.73012489, + "learning_rate": 3.200602180731467e-06, + "loss": 0.80772901, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.17675781, + "step": 5259, + "time_per_iteration": 2.5244109630584717 + }, + { + "auxiliary_loss_clip": 0.06490766, + "auxiliary_loss_mlp": 0.01272581, + "balance_loss_clip": 0.06291697, + "balance_loss_mlp": 0.01256106, + "epoch": 0.3162483090335187, + "flos": 25088735404800.0, + "grad_norm": 2.502439629336286, + "language_loss": 0.66774327, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74537671, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16455078, + "step": 5260, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.06481651, + "auxiliary_loss_mlp": 0.01272837, + "balance_loss_clip": 0.06285223, + "balance_loss_mlp": 0.01256386, + "epoch": 0.31630843228618666, + "flos": 26330653895040.0, + "grad_norm": 2.0766337978972023, + "language_loss": 0.72817439, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80571926, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.16455078, + "step": 5261, + "time_per_iteration": 2.559112548828125 + }, + { + "auxiliary_loss_clip": 0.06366719, + "auxiliary_loss_mlp": 0.01254616, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01250792, + "epoch": 0.3163685555388547, + "flos": 66780053856000.0, + "grad_norm": 0.7132570662369885, + "language_loss": 0.50697625, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.58318961, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03817749, + "step": 5262, + "time_per_iteration": 3.1381468772888184 + }, + { + "auxiliary_loss_clip": 0.06487425, + "auxiliary_loss_mlp": 0.01273056, + "balance_loss_clip": 0.06289163, + "balance_loss_mlp": 0.01256224, + "epoch": 0.31642867879152264, + "flos": 26002987303680.0, + "grad_norm": 1.713052875923359, + "language_loss": 0.85966682, + "learning_rate": 3.19935589118856e-06, + "loss": 0.9372716, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.16833496, + "step": 5263, + "time_per_iteration": 2.5385844707489014 + }, + { + "auxiliary_loss_clip": 0.0647549, + "auxiliary_loss_mlp": 0.01273956, + "balance_loss_clip": 0.06283621, + "balance_loss_mlp": 0.01257695, + "epoch": 0.3164888020441906, + "flos": 25781943432960.0, + "grad_norm": 1.4697461293234868, + "language_loss": 0.82077682, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.89827132, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.16247559, + "step": 5264, + "time_per_iteration": 2.558708429336548 + }, + { + "auxiliary_loss_clip": 0.06488511, + "auxiliary_loss_mlp": 0.01271533, + "balance_loss_clip": 0.06288397, + "balance_loss_mlp": 0.01254117, + "epoch": 0.3165489252968586, + "flos": 19762437968640.0, + "grad_norm": 1.8601211050375244, + "language_loss": 0.80259931, + "learning_rate": 3.19873247349167e-06, + "loss": 0.88019973, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.17419434, + "step": 5265, + "time_per_iteration": 2.492342948913574 + }, + { + "auxiliary_loss_clip": 0.06481829, + "auxiliary_loss_mlp": 0.01275233, + "balance_loss_clip": 0.06283312, + "balance_loss_mlp": 0.01257148, + "epoch": 0.31660904854952654, + "flos": 23190393628800.0, + "grad_norm": 2.032053662698869, + "language_loss": 0.75410831, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.83167893, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.1809082, + "step": 5266, + "time_per_iteration": 2.5563931465148926 + }, + { + "auxiliary_loss_clip": 0.06488708, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 0.06287502, + "balance_loss_mlp": 0.01258308, + "epoch": 0.3166691718021945, + "flos": 20414081571840.0, + "grad_norm": 2.020882594632444, + "language_loss": 0.79489279, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.87254804, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.18518066, + "step": 5267, + "time_per_iteration": 2.509413242340088 + }, + { + "auxiliary_loss_clip": 0.06371635, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01251359, + "epoch": 0.31672929505486247, + "flos": 70165816185600.0, + "grad_norm": 1.145238273522293, + "language_loss": 0.57623893, + "learning_rate": 3.197797006055478e-06, + "loss": 0.65250397, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.03518677, + "step": 5268, + "time_per_iteration": 4.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.06486145, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06287054, + "balance_loss_mlp": 0.01253884, + "epoch": 0.31678941830753043, + "flos": 14360977820160.0, + "grad_norm": 2.2953322915245784, + "language_loss": 0.73492396, + "learning_rate": 3.197485092719815e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.17651367, + "step": 5269, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.06490922, + "auxiliary_loss_mlp": 0.01279355, + "balance_loss_clip": 0.06295022, + "balance_loss_mlp": 0.01261652, + "epoch": 0.3168495415601984, + "flos": 22754385308160.0, + "grad_norm": 1.8930521062253438, + "language_loss": 0.80391312, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.88161588, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.17700195, + "step": 5270, + "time_per_iteration": 4.030852794647217 + }, + { + "auxiliary_loss_clip": 0.0648749, + "auxiliary_loss_mlp": 0.01275027, + "balance_loss_clip": 0.06288311, + "balance_loss_mlp": 0.01257742, + "epoch": 0.31690966481286637, + "flos": 20120558319360.0, + "grad_norm": 2.0275703030815744, + "language_loss": 0.79860884, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87623405, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.17285156, + "step": 5271, + "time_per_iteration": 3.963491201400757 + }, + { + "auxiliary_loss_clip": 0.06485552, + "auxiliary_loss_mlp": 0.01274595, + "balance_loss_clip": 0.06286864, + "balance_loss_mlp": 0.01256344, + "epoch": 0.31696978806553433, + "flos": 21185345278080.0, + "grad_norm": 2.0532864997035616, + "language_loss": 0.7348994, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81250083, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.18237305, + "step": 5272, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.06497657, + "auxiliary_loss_mlp": 0.01275072, + "balance_loss_clip": 0.06294467, + "balance_loss_mlp": 0.01255629, + "epoch": 0.3170299113182023, + "flos": 43007030789760.0, + "grad_norm": 2.3636013379780083, + "language_loss": 0.69916022, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.19458008, + "step": 5273, + "time_per_iteration": 2.8313193321228027 + }, + { + "auxiliary_loss_clip": 0.0648469, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06288255, + "balance_loss_mlp": 0.01255954, + "epoch": 0.31709003457087026, + "flos": 24466707020160.0, + "grad_norm": 3.373298123766896, + "language_loss": 0.68486917, + "learning_rate": 3.195924845146795e-06, + "loss": 0.76244098, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 5274, + "time_per_iteration": 2.5647053718566895 + }, + { + "auxiliary_loss_clip": 0.06486842, + "auxiliary_loss_mlp": 0.01272159, + "balance_loss_clip": 0.06295811, + "balance_loss_mlp": 0.01256114, + "epoch": 0.3171501578235382, + "flos": 24142394592000.0, + "grad_norm": 1.437173314012816, + "language_loss": 0.8105545, + "learning_rate": 3.195612659536081e-06, + "loss": 0.88814449, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.16052246, + "step": 5275, + "time_per_iteration": 2.545689821243286 + }, + { + "auxiliary_loss_clip": 0.06496362, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.0629561, + "balance_loss_mlp": 0.01254296, + "epoch": 0.31721028107620625, + "flos": 18885641644800.0, + "grad_norm": 1.7797970991839078, + "language_loss": 0.73459136, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81228, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.18212891, + "step": 5276, + "time_per_iteration": 3.978994131088257 + }, + { + "auxiliary_loss_clip": 0.06480486, + "auxiliary_loss_mlp": 0.01276369, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01259811, + "epoch": 0.3172704043288742, + "flos": 23154405500160.0, + "grad_norm": 1.4192945576637652, + "language_loss": 0.78409082, + "learning_rate": 3.194988152313236e-06, + "loss": 0.86165935, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.16552734, + "step": 5277, + "time_per_iteration": 2.6181840896606445 + }, + { + "auxiliary_loss_clip": 0.06493685, + "auxiliary_loss_mlp": 0.01273951, + "balance_loss_clip": 0.06294833, + "balance_loss_mlp": 0.01256653, + "epoch": 0.3173305275815422, + "flos": 17864347754880.0, + "grad_norm": 1.9934204528772321, + "language_loss": 0.79709554, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87477195, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.17297363, + "step": 5278, + "time_per_iteration": 2.4955894947052 + }, + { + "auxiliary_loss_clip": 0.06380783, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01260886, + "epoch": 0.31739065083421014, + "flos": 59988083529600.0, + "grad_norm": 0.841903886868049, + "language_loss": 0.62797457, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.7044335, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.04226685, + "step": 5279, + "time_per_iteration": 2.920987367630005 + }, + { + "auxiliary_loss_clip": 0.06489395, + "auxiliary_loss_mlp": 0.01285376, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01265265, + "epoch": 0.3174507740868781, + "flos": 23807013425280.0, + "grad_norm": 2.0709232065681475, + "language_loss": 0.81487882, + "learning_rate": 3.194051051653053e-06, + "loss": 0.89262652, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.2010498, + "step": 5280, + "time_per_iteration": 2.537612199783325 + }, + { + "auxiliary_loss_clip": 0.06483282, + "auxiliary_loss_mlp": 0.01281645, + "balance_loss_clip": 0.06291374, + "balance_loss_mlp": 0.01264276, + "epoch": 0.31751089733954607, + "flos": 27646728848640.0, + "grad_norm": 1.437826441265799, + "language_loss": 0.78464299, + "learning_rate": 3.19373859419346e-06, + "loss": 0.86229229, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.17358398, + "step": 5281, + "time_per_iteration": 2.6482186317443848 + }, + { + "auxiliary_loss_clip": 0.06485789, + "auxiliary_loss_mlp": 0.01283007, + "balance_loss_clip": 0.06290175, + "balance_loss_mlp": 0.01265424, + "epoch": 0.31757102059221404, + "flos": 23776098468480.0, + "grad_norm": 1.5338111796323235, + "language_loss": 0.78882301, + "learning_rate": 3.193426091467179e-06, + "loss": 0.86651099, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17590332, + "step": 5282, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.06494205, + "auxiliary_loss_mlp": 0.01276135, + "balance_loss_clip": 0.0629286, + "balance_loss_mlp": 0.01258373, + "epoch": 0.317631143844882, + "flos": 25271485159680.0, + "grad_norm": 2.0006947857157753, + "language_loss": 0.67952389, + "learning_rate": 3.193113543486061e-06, + "loss": 0.7572273, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.1776123, + "step": 5283, + "time_per_iteration": 2.565925359725952 + }, + { + "auxiliary_loss_clip": 0.06373101, + "auxiliary_loss_mlp": 0.01271528, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01267352, + "epoch": 0.31769126709754997, + "flos": 55841832743040.0, + "grad_norm": 0.7241871595116953, + "language_loss": 0.52631503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.60276127, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04177856, + "step": 5284, + "time_per_iteration": 3.1037213802337646 + }, + { + "auxiliary_loss_clip": 0.0649649, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 0.06291351, + "balance_loss_mlp": 0.01257225, + "epoch": 0.31775139035021793, + "flos": 16696124530560.0, + "grad_norm": 2.2460762000689294, + "language_loss": 0.70842284, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.78613091, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.17102051, + "step": 5285, + "time_per_iteration": 2.5407655239105225 + }, + { + "auxiliary_loss_clip": 0.06366412, + "auxiliary_loss_mlp": 0.01262401, + "balance_loss_clip": 0.06274283, + "balance_loss_mlp": 0.01258384, + "epoch": 0.3178115136028859, + "flos": 64246141261440.0, + "grad_norm": 1.0137073922687154, + "language_loss": 0.60545647, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.68174458, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04016113, + "step": 5286, + "time_per_iteration": 3.1833202838897705 + }, + { + "auxiliary_loss_clip": 0.06498363, + "auxiliary_loss_mlp": 0.01284909, + "balance_loss_clip": 0.06297486, + "balance_loss_mlp": 0.01267051, + "epoch": 0.31787163685555386, + "flos": 18703395014400.0, + "grad_norm": 1.7319286904547555, + "language_loss": 0.72404122, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80187392, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17871094, + "step": 5287, + "time_per_iteration": 2.50571608543396 + }, + { + "auxiliary_loss_clip": 0.06495041, + "auxiliary_loss_mlp": 0.01276683, + "balance_loss_clip": 0.06292516, + "balance_loss_mlp": 0.012578, + "epoch": 0.31793176010822183, + "flos": 21331184509440.0, + "grad_norm": 1.978321388726588, + "language_loss": 0.76231503, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84003228, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.18884277, + "step": 5288, + "time_per_iteration": 2.5568416118621826 + }, + { + "auxiliary_loss_clip": 0.06485806, + "auxiliary_loss_mlp": 0.01283528, + "balance_loss_clip": 0.06293501, + "balance_loss_mlp": 0.01267816, + "epoch": 0.31799188336088985, + "flos": 20964846458880.0, + "grad_norm": 1.7076221862053031, + "language_loss": 0.88265222, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.96034551, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.15710449, + "step": 5289, + "time_per_iteration": 2.5359349250793457 + }, + { + "auxiliary_loss_clip": 0.06488061, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 0.06295781, + "balance_loss_mlp": 0.01269724, + "epoch": 0.3180520066135578, + "flos": 22498485338880.0, + "grad_norm": 1.4069348748047803, + "language_loss": 0.68210149, + "learning_rate": 3.190924441478572e-06, + "loss": 0.75984859, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16906738, + "step": 5290, + "time_per_iteration": 2.5393311977386475 + }, + { + "auxiliary_loss_clip": 0.06494544, + "auxiliary_loss_mlp": 0.0128386, + "balance_loss_clip": 0.06290419, + "balance_loss_mlp": 0.01265788, + "epoch": 0.3181121298662258, + "flos": 27242725587840.0, + "grad_norm": 3.4346413288346, + "language_loss": 0.79944348, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.87722754, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.18066406, + "step": 5291, + "time_per_iteration": 2.564091444015503 + }, + { + "auxiliary_loss_clip": 0.06485635, + "auxiliary_loss_mlp": 0.01278435, + "balance_loss_clip": 0.06287642, + "balance_loss_mlp": 0.01259361, + "epoch": 0.31817225311889374, + "flos": 23185991289600.0, + "grad_norm": 2.0451390273410004, + "language_loss": 0.79931051, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.87695122, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.19067383, + "step": 5292, + "time_per_iteration": 2.743156671524048 + }, + { + "auxiliary_loss_clip": 0.06476898, + "auxiliary_loss_mlp": 0.01275055, + "balance_loss_clip": 0.06287324, + "balance_loss_mlp": 0.01258044, + "epoch": 0.3182323763715617, + "flos": 23265598268160.0, + "grad_norm": 1.819133879513315, + "language_loss": 0.75602406, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8335436, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17004395, + "step": 5293, + "time_per_iteration": 2.523386001586914 + }, + { + "auxiliary_loss_clip": 0.06482453, + "auxiliary_loss_mlp": 0.01276012, + "balance_loss_clip": 0.06290737, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3182924996242297, + "flos": 29023292050560.0, + "grad_norm": 2.0524562129349526, + "language_loss": 0.75145984, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82904446, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.15808105, + "step": 5294, + "time_per_iteration": 2.607849597930908 + }, + { + "auxiliary_loss_clip": 0.06489888, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06293514, + "balance_loss_mlp": 0.01259201, + "epoch": 0.31835262287689764, + "flos": 20455478288640.0, + "grad_norm": 2.029675905915872, + "language_loss": 0.76497674, + "learning_rate": 3.189359442151152e-06, + "loss": 0.84265351, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.18591309, + "step": 5295, + "time_per_iteration": 2.4980461597442627 + }, + { + "auxiliary_loss_clip": 0.06494178, + "auxiliary_loss_mlp": 0.01278535, + "balance_loss_clip": 0.06293284, + "balance_loss_mlp": 0.01261166, + "epoch": 0.3184127461295656, + "flos": 25126568323200.0, + "grad_norm": 2.03182891885516, + "language_loss": 0.70142519, + "learning_rate": 3.189046306936296e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17358398, + "step": 5296, + "time_per_iteration": 2.610671043395996 + }, + { + "auxiliary_loss_clip": 0.06483515, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 0.0628704, + "balance_loss_mlp": 0.01258371, + "epoch": 0.31847286938223357, + "flos": 25557377690880.0, + "grad_norm": 1.5251920176335134, + "language_loss": 0.77957898, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85716307, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 5297, + "time_per_iteration": 2.539649486541748 + }, + { + "auxiliary_loss_clip": 0.06479752, + "auxiliary_loss_mlp": 0.01272766, + "balance_loss_clip": 0.06283344, + "balance_loss_mlp": 0.01255516, + "epoch": 0.31853299263490154, + "flos": 27789926676480.0, + "grad_norm": 1.8177911904554251, + "language_loss": 0.80074358, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.87826872, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17248535, + "step": 5298, + "time_per_iteration": 2.6127634048461914 + }, + { + "auxiliary_loss_clip": 0.06487016, + "auxiliary_loss_mlp": 0.0127216, + "balance_loss_clip": 0.06284906, + "balance_loss_mlp": 0.01254815, + "epoch": 0.3185931158875695, + "flos": 22712653175040.0, + "grad_norm": 1.6158824069779534, + "language_loss": 0.74615932, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.82375109, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.17346191, + "step": 5299, + "time_per_iteration": 2.570178508758545 + }, + { + "auxiliary_loss_clip": 0.06491919, + "auxiliary_loss_mlp": 0.01275355, + "balance_loss_clip": 0.06290901, + "balance_loss_mlp": 0.01258249, + "epoch": 0.31865323914023747, + "flos": 24578402912640.0, + "grad_norm": 1.9760141697724851, + "language_loss": 0.78568625, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86335897, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.17102051, + "step": 5300, + "time_per_iteration": 2.7260777950286865 + }, + { + "auxiliary_loss_clip": 0.06483838, + "auxiliary_loss_mlp": 0.01272854, + "balance_loss_clip": 0.06287212, + "balance_loss_mlp": 0.01254495, + "epoch": 0.31871336239290543, + "flos": 18192391689600.0, + "grad_norm": 2.1538981188283195, + "language_loss": 0.84250915, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92007607, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.18347168, + "step": 5301, + "time_per_iteration": 2.485152244567871 + }, + { + "auxiliary_loss_clip": 0.06484723, + "auxiliary_loss_mlp": 0.01274861, + "balance_loss_clip": 0.06291914, + "balance_loss_mlp": 0.01256777, + "epoch": 0.31877348564557345, + "flos": 21831789928320.0, + "grad_norm": 2.0482094969798696, + "language_loss": 0.7812382, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85883403, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.18066406, + "step": 5302, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.0648333, + "auxiliary_loss_mlp": 0.01275814, + "balance_loss_clip": 0.06290714, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188336088982414, + "flos": 22021331863680.0, + "grad_norm": 1.6144767194600491, + "language_loss": 0.79736584, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8749572, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.17651367, + "step": 5303, + "time_per_iteration": 2.5235095024108887 + }, + { + "auxiliary_loss_clip": 0.06497993, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290174, + "balance_loss_mlp": 0.01258159, + "epoch": 0.3188937321509094, + "flos": 20054116431360.0, + "grad_norm": 1.7320090718032515, + "language_loss": 0.73529422, + "learning_rate": 3.186539603020047e-06, + "loss": 0.81304312, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.18737793, + "step": 5304, + "time_per_iteration": 2.5141329765319824 + }, + { + "auxiliary_loss_clip": 0.06481734, + "auxiliary_loss_mlp": 0.01278154, + "balance_loss_clip": 0.06290816, + "balance_loss_mlp": 0.01260928, + "epoch": 0.31895385540357735, + "flos": 25855135574400.0, + "grad_norm": 1.8091269764667626, + "language_loss": 0.72548914, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80308801, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.17236328, + "step": 5305, + "time_per_iteration": 2.5648975372314453 + }, + { + "auxiliary_loss_clip": 0.06487268, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06292576, + "balance_loss_mlp": 0.01254603, + "epoch": 0.3190139786562453, + "flos": 23484545786880.0, + "grad_norm": 2.116447005947582, + "language_loss": 0.64815247, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.72573221, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.16113281, + "step": 5306, + "time_per_iteration": 2.5745668411254883 + }, + { + "auxiliary_loss_clip": 0.06483987, + "auxiliary_loss_mlp": 0.01282676, + "balance_loss_clip": 0.0628574, + "balance_loss_mlp": 0.01264413, + "epoch": 0.3190741019089133, + "flos": 29103150591360.0, + "grad_norm": 2.0084949709877726, + "language_loss": 0.79260421, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.87027091, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.18273926, + "step": 5307, + "time_per_iteration": 2.557509183883667 + }, + { + "auxiliary_loss_clip": 0.06481419, + "auxiliary_loss_mlp": 0.01278653, + "balance_loss_clip": 0.06289747, + "balance_loss_mlp": 0.01260736, + "epoch": 0.31913422516158124, + "flos": 17135361233280.0, + "grad_norm": 3.9021838038471097, + "language_loss": 0.78660965, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86421037, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17907715, + "step": 5308, + "time_per_iteration": 3.906280994415283 + }, + { + "auxiliary_loss_clip": 0.06493698, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06287338, + "balance_loss_mlp": 0.01259408, + "epoch": 0.3191943484142492, + "flos": 16075228176000.0, + "grad_norm": 3.1945469837170215, + "language_loss": 0.74758154, + "learning_rate": 3.184971450390961e-06, + "loss": 0.82530349, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.19091797, + "step": 5309, + "time_per_iteration": 2.4796438217163086 + }, + { + "auxiliary_loss_clip": 0.06480245, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06283399, + "balance_loss_mlp": 0.01257954, + "epoch": 0.3192544716669172, + "flos": 22972787775360.0, + "grad_norm": 1.6995242114780418, + "language_loss": 0.83242565, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90997577, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.16809082, + "step": 5310, + "time_per_iteration": 5.470219373703003 + }, + { + "auxiliary_loss_clip": 0.06475915, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06281388, + "balance_loss_mlp": 0.01255868, + "epoch": 0.31931459491958514, + "flos": 26877645348480.0, + "grad_norm": 1.407923936832892, + "language_loss": 0.78906345, + "learning_rate": 3.184343874716412e-06, + "loss": 0.86654651, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.1652832, + "step": 5311, + "time_per_iteration": 2.546112298965454 + }, + { + "auxiliary_loss_clip": 0.06477334, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.06282097, + "balance_loss_mlp": 0.01255254, + "epoch": 0.3193747181722531, + "flos": 21843194083200.0, + "grad_norm": 1.8192899238067177, + "language_loss": 0.84889889, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92639416, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.16943359, + "step": 5312, + "time_per_iteration": 2.5534987449645996 + }, + { + "auxiliary_loss_clip": 0.06489489, + "auxiliary_loss_mlp": 0.01274677, + "balance_loss_clip": 0.06284228, + "balance_loss_mlp": 0.012567, + "epoch": 0.31943484142492107, + "flos": 18329593950720.0, + "grad_norm": 3.1557419136729536, + "language_loss": 0.79280984, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.87045145, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.17980957, + "step": 5313, + "time_per_iteration": 2.47098445892334 + }, + { + "auxiliary_loss_clip": 0.06477478, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 0.06281047, + "balance_loss_mlp": 0.01256618, + "epoch": 0.31949496467758903, + "flos": 21622150212480.0, + "grad_norm": 2.7721598847405584, + "language_loss": 0.86245549, + "learning_rate": 3.183402174406057e-06, + "loss": 0.93997484, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17834473, + "step": 5314, + "time_per_iteration": 2.531196117401123 + }, + { + "auxiliary_loss_clip": 0.0647811, + "auxiliary_loss_mlp": 0.0127239, + "balance_loss_clip": 0.06281686, + "balance_loss_mlp": 0.01255188, + "epoch": 0.31955508793025705, + "flos": 21766312362240.0, + "grad_norm": 1.712027342879292, + "language_loss": 0.80238831, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8798933, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.17199707, + "step": 5315, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.06485026, + "auxiliary_loss_mlp": 0.01283831, + "balance_loss_clip": 0.06286455, + "balance_loss_mlp": 0.01265854, + "epoch": 0.319615211182925, + "flos": 17169881915520.0, + "grad_norm": 2.687676993792702, + "language_loss": 0.67569852, + "learning_rate": 3.18277414980567e-06, + "loss": 0.75338709, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17980957, + "step": 5316, + "time_per_iteration": 3.943110942840576 + }, + { + "auxiliary_loss_clip": 0.0648303, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01255566, + "epoch": 0.319675334435593, + "flos": 28120653941760.0, + "grad_norm": 1.5692381446514811, + "language_loss": 0.69637752, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.77392983, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.16650391, + "step": 5317, + "time_per_iteration": 2.642251491546631 + }, + { + "auxiliary_loss_clip": 0.06377298, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 0.06285109, + "balance_loss_mlp": 0.01288716, + "epoch": 0.31973545768826095, + "flos": 69524235072000.0, + "grad_norm": 0.7198160842036254, + "language_loss": 0.5281924, + "learning_rate": 3.182145945801628e-06, + "loss": 0.60489094, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.03839111, + "step": 5318, + "time_per_iteration": 3.2718679904937744 + }, + { + "auxiliary_loss_clip": 0.06479475, + "auxiliary_loss_mlp": 0.01271921, + "balance_loss_clip": 0.0628712, + "balance_loss_mlp": 0.01254969, + "epoch": 0.3197955809409289, + "flos": 13704344899200.0, + "grad_norm": 1.5995609143402318, + "language_loss": 0.84504628, + "learning_rate": 3.181831776553012e-06, + "loss": 0.92256021, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.16955566, + "step": 5319, + "time_per_iteration": 2.5372629165649414 + }, + { + "auxiliary_loss_clip": 0.06480815, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 0.06286162, + "balance_loss_mlp": 0.01261199, + "epoch": 0.3198557041935969, + "flos": 33226368704640.0, + "grad_norm": 1.6136244255626262, + "language_loss": 0.64208525, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71968812, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.18273926, + "step": 5320, + "time_per_iteration": 2.675477981567383 + }, + { + "auxiliary_loss_clip": 0.0648189, + "auxiliary_loss_mlp": 0.01271878, + "balance_loss_clip": 0.06280586, + "balance_loss_mlp": 0.01254402, + "epoch": 0.31991582744626484, + "flos": 23738726747520.0, + "grad_norm": 1.9696222638037655, + "language_loss": 0.71059012, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.78812778, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.17480469, + "step": 5321, + "time_per_iteration": 2.6383230686187744 + }, + { + "auxiliary_loss_clip": 0.06491005, + "auxiliary_loss_mlp": 0.01288903, + "balance_loss_clip": 0.06286187, + "balance_loss_mlp": 0.01270318, + "epoch": 0.3199759506989328, + "flos": 18556633388160.0, + "grad_norm": 2.30981924299517, + "language_loss": 0.86988461, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94768369, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.18591309, + "step": 5322, + "time_per_iteration": 2.4862442016601562 + }, + { + "auxiliary_loss_clip": 0.0648296, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06285054, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3200360739516008, + "flos": 22425418978560.0, + "grad_norm": 1.6041292280722281, + "language_loss": 0.83380175, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.91136217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.16369629, + "step": 5323, + "time_per_iteration": 2.5262420177459717 + }, + { + "auxiliary_loss_clip": 0.06476378, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06284457, + "balance_loss_mlp": 0.01258529, + "epoch": 0.32009619720426874, + "flos": 20601569082240.0, + "grad_norm": 1.775654796490425, + "language_loss": 0.78471839, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.86226195, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.19433594, + "step": 5324, + "time_per_iteration": 2.492380380630493 + }, + { + "auxiliary_loss_clip": 0.06478705, + "auxiliary_loss_mlp": 0.01274174, + "balance_loss_clip": 0.06283212, + "balance_loss_mlp": 0.01256042, + "epoch": 0.3201563204569367, + "flos": 18153049397760.0, + "grad_norm": 1.7224742254360714, + "language_loss": 0.80742848, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.88495719, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.18139648, + "step": 5325, + "time_per_iteration": 2.4962642192840576 + }, + { + "auxiliary_loss_clip": 0.06478769, + "auxiliary_loss_mlp": 0.01277308, + "balance_loss_clip": 0.06280222, + "balance_loss_mlp": 0.01259701, + "epoch": 0.32021644370960467, + "flos": 31691975137920.0, + "grad_norm": 1.8321318923341703, + "language_loss": 0.75898254, + "learning_rate": 3.179631337655037e-06, + "loss": 0.83654332, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.17602539, + "step": 5326, + "time_per_iteration": 2.5752692222595215 + }, + { + "auxiliary_loss_clip": 0.06472234, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 0.06281741, + "balance_loss_mlp": 0.01260918, + "epoch": 0.32027656696227264, + "flos": 26872488322560.0, + "grad_norm": 1.458996564995821, + "language_loss": 0.81400204, + "learning_rate": 3.179316810218701e-06, + "loss": 0.89150548, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.171875, + "step": 5327, + "time_per_iteration": 2.5635383129119873 + }, + { + "auxiliary_loss_clip": 0.06486546, + "auxiliary_loss_mlp": 0.01273421, + "balance_loss_clip": 0.062847, + "balance_loss_mlp": 0.01256207, + "epoch": 0.32033669021494066, + "flos": 24176705639040.0, + "grad_norm": 1.3787000535244864, + "language_loss": 0.77910948, + "learning_rate": 3.179002238062554e-06, + "loss": 0.85670912, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.17211914, + "step": 5328, + "time_per_iteration": 2.514646053314209 + }, + { + "auxiliary_loss_clip": 0.06484267, + "auxiliary_loss_mlp": 0.01278516, + "balance_loss_clip": 0.06287045, + "balance_loss_mlp": 0.0125992, + "epoch": 0.3203968134676086, + "flos": 24467419779840.0, + "grad_norm": 1.5501370939230803, + "language_loss": 0.74267161, + "learning_rate": 3.178687621198524e-06, + "loss": 0.82029939, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.18591309, + "step": 5329, + "time_per_iteration": 2.5436654090881348 + }, + { + "auxiliary_loss_clip": 0.06471072, + "auxiliary_loss_mlp": 0.01278598, + "balance_loss_clip": 0.06282842, + "balance_loss_mlp": 0.01262434, + "epoch": 0.3204569367202766, + "flos": 18010606256640.0, + "grad_norm": 1.7046636031855489, + "language_loss": 0.71222955, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.78972626, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16162109, + "step": 5330, + "time_per_iteration": 2.479647397994995 + }, + { + "auxiliary_loss_clip": 0.06485157, + "auxiliary_loss_mlp": 0.01277162, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.0125791, + "epoch": 0.32051705997294455, + "flos": 30597237544320.0, + "grad_norm": 1.705143811074938, + "language_loss": 0.80496192, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.88258511, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.19250488, + "step": 5331, + "time_per_iteration": 2.5741958618164062 + }, + { + "auxiliary_loss_clip": 0.06384323, + "auxiliary_loss_mlp": 0.0125803, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.01253741, + "epoch": 0.3205771832256125, + "flos": 68436723657600.0, + "grad_norm": 0.7949538218297083, + "language_loss": 0.5776577, + "learning_rate": 3.177743502478447e-06, + "loss": 0.65408123, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04293823, + "step": 5332, + "time_per_iteration": 3.084747314453125 + }, + { + "auxiliary_loss_clip": 0.06488422, + "auxiliary_loss_mlp": 0.01272523, + "balance_loss_clip": 0.06286052, + "balance_loss_mlp": 0.01255154, + "epoch": 0.3206373064782805, + "flos": 30451524094080.0, + "grad_norm": 1.5377704746044631, + "language_loss": 0.73702615, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81463563, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.17358398, + "step": 5333, + "time_per_iteration": 2.6130683422088623 + }, + { + "auxiliary_loss_clip": 0.06480561, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 0.06284031, + "balance_loss_mlp": 0.01256246, + "epoch": 0.32069742973094845, + "flos": 22061051498880.0, + "grad_norm": 1.6882238799892797, + "language_loss": 0.70957875, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.78712052, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17382812, + "step": 5334, + "time_per_iteration": 2.5501654148101807 + }, + { + "auxiliary_loss_clip": 0.06476508, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 0.06281763, + "balance_loss_mlp": 0.01257947, + "epoch": 0.3207575529836164, + "flos": 22060464520320.0, + "grad_norm": 1.723674002448169, + "language_loss": 0.77349097, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.85101908, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.18347168, + "step": 5335, + "time_per_iteration": 2.5194711685180664 + }, + { + "auxiliary_loss_clip": 0.06479798, + "auxiliary_loss_mlp": 0.0127571, + "balance_loss_clip": 0.06285612, + "balance_loss_mlp": 0.0125889, + "epoch": 0.3208176762362844, + "flos": 34065961015680.0, + "grad_norm": 1.52521333905674, + "language_loss": 0.68891776, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.76647282, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.16809082, + "step": 5336, + "time_per_iteration": 2.6550848484039307 + }, + { + "auxiliary_loss_clip": 0.06481949, + "auxiliary_loss_mlp": 0.01285819, + "balance_loss_clip": 0.06286713, + "balance_loss_mlp": 0.01268343, + "epoch": 0.32087779948895234, + "flos": 21805151529600.0, + "grad_norm": 1.6666772631518172, + "language_loss": 0.79367507, + "learning_rate": 3.176169078234487e-06, + "loss": 0.87135273, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.17480469, + "step": 5337, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.06473362, + "auxiliary_loss_mlp": 0.01277197, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01260865, + "epoch": 0.3209379227416203, + "flos": 21440532487680.0, + "grad_norm": 1.6244255970978692, + "language_loss": 0.75145769, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.82896328, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16320801, + "step": 5338, + "time_per_iteration": 2.526841402053833 + }, + { + "auxiliary_loss_clip": 0.06482957, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 0.06285477, + "balance_loss_mlp": 0.01260216, + "epoch": 0.3209980459942883, + "flos": 25856267604480.0, + "grad_norm": 1.7965894601451369, + "language_loss": 0.63241929, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.7100262, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.17504883, + "step": 5339, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.06482022, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06283947, + "balance_loss_mlp": 0.01255151, + "epoch": 0.32105816924695624, + "flos": 19105218069120.0, + "grad_norm": 2.418138513897033, + "language_loss": 0.81912339, + "learning_rate": 3.175223888387192e-06, + "loss": 0.89666009, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.16491699, + "step": 5340, + "time_per_iteration": 2.5764145851135254 + }, + { + "auxiliary_loss_clip": 0.06475554, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06281976, + "balance_loss_mlp": 0.01254774, + "epoch": 0.3211182924996242, + "flos": 16587531239040.0, + "grad_norm": 1.7719401771551753, + "language_loss": 0.76604897, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.84352368, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.17150879, + "step": 5341, + "time_per_iteration": 2.505668878555298 + }, + { + "auxiliary_loss_clip": 0.06474154, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.0628191, + "balance_loss_mlp": 0.01255969, + "epoch": 0.3211784157522922, + "flos": 22678425982080.0, + "grad_norm": 1.4764530250267398, + "language_loss": 0.79422891, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.87169659, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.16662598, + "step": 5342, + "time_per_iteration": 2.5391595363616943 + }, + { + "auxiliary_loss_clip": 0.06483465, + "auxiliary_loss_mlp": 0.01277474, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01260141, + "epoch": 0.3212385390049602, + "flos": 20565119756160.0, + "grad_norm": 2.45787142613039, + "language_loss": 0.75074786, + "learning_rate": 3.174278297458438e-06, + "loss": 0.82835722, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.17321777, + "step": 5343, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.06479985, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01255043, + "epoch": 0.32129866225762815, + "flos": 24798188972160.0, + "grad_norm": 1.5494427093400844, + "language_loss": 0.82596725, + "learning_rate": 3.173963011408748e-06, + "loss": 0.9034878, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5344, + "time_per_iteration": 2.5672519207000732 + }, + { + "auxiliary_loss_clip": 0.06478736, + "auxiliary_loss_mlp": 0.01273821, + "balance_loss_clip": 0.06282513, + "balance_loss_mlp": 0.0125731, + "epoch": 0.3213587855102961, + "flos": 18372374259840.0, + "grad_norm": 1.9111940233558649, + "language_loss": 0.80321491, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8807404, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.16516113, + "step": 5345, + "time_per_iteration": 2.479442834854126 + }, + { + "auxiliary_loss_clip": 0.06478975, + "auxiliary_loss_mlp": 0.01271046, + "balance_loss_clip": 0.06283471, + "balance_loss_mlp": 0.01254321, + "epoch": 0.3214189087629641, + "flos": 27023274944640.0, + "grad_norm": 1.7019036305222461, + "language_loss": 0.83604348, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.9135437, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.16723633, + "step": 5346, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.0648382, + "auxiliary_loss_mlp": 0.01272196, + "balance_loss_clip": 0.06285056, + "balance_loss_mlp": 0.0125528, + "epoch": 0.32147903201563205, + "flos": 23154866697600.0, + "grad_norm": 1.4545038816344273, + "language_loss": 0.81656283, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.89412296, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.16918945, + "step": 5347, + "time_per_iteration": 2.5096054077148438 + }, + { + "auxiliary_loss_clip": 0.06480029, + "auxiliary_loss_mlp": 0.01274054, + "balance_loss_clip": 0.06286772, + "balance_loss_mlp": 0.01256673, + "epoch": 0.3215391552683, + "flos": 16586231500800.0, + "grad_norm": 2.536962878441814, + "language_loss": 0.80386555, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.88140643, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.1739502, + "step": 5348, + "time_per_iteration": 3.9639015197753906 + }, + { + "auxiliary_loss_clip": 0.06474565, + "auxiliary_loss_mlp": 0.01276371, + "balance_loss_clip": 0.06280862, + "balance_loss_mlp": 0.01259431, + "epoch": 0.321599278520968, + "flos": 17827604939520.0, + "grad_norm": 2.026618804026968, + "language_loss": 0.85758352, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93509287, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.16943359, + "step": 5349, + "time_per_iteration": 3.8848202228546143 + }, + { + "auxiliary_loss_clip": 0.06481349, + "auxiliary_loss_mlp": 0.01274724, + "balance_loss_clip": 0.06286412, + "balance_loss_mlp": 0.01257022, + "epoch": 0.32165940177363594, + "flos": 16257097463040.0, + "grad_norm": 1.7607877661370477, + "language_loss": 0.8123306, + "learning_rate": 3.172070360676475e-06, + "loss": 0.88989133, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17700195, + "step": 5350, + "time_per_iteration": 3.9589500427246094 + }, + { + "auxiliary_loss_clip": 0.06471309, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06282239, + "balance_loss_mlp": 0.01255055, + "epoch": 0.3217195250263039, + "flos": 27607302702720.0, + "grad_norm": 1.8529018663543275, + "language_loss": 0.80116528, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.87858802, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.562232732772827 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 0.06284767, + "balance_loss_mlp": 0.01256668, + "epoch": 0.3217796482789719, + "flos": 21477023740800.0, + "grad_norm": 2.0321110975992562, + "language_loss": 0.7641573, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84167361, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.18249512, + "step": 5352, + "time_per_iteration": 2.5320773124694824 + }, + { + "auxiliary_loss_clip": 0.0648407, + "auxiliary_loss_mlp": 0.01278365, + "balance_loss_clip": 0.06291708, + "balance_loss_mlp": 0.0126133, + "epoch": 0.32183977153163984, + "flos": 21222046166400.0, + "grad_norm": 1.9188598206640457, + "language_loss": 0.82159722, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.89922154, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.17028809, + "step": 5353, + "time_per_iteration": 2.5061802864074707 + }, + { + "auxiliary_loss_clip": 0.06480308, + "auxiliary_loss_mlp": 0.01275858, + "balance_loss_clip": 0.0629053, + "balance_loss_mlp": 0.0125837, + "epoch": 0.3218998947843078, + "flos": 24615103800960.0, + "grad_norm": 1.8505936463490174, + "language_loss": 0.74125177, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.81881344, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.17480469, + "step": 5354, + "time_per_iteration": 2.5725185871124268 + }, + { + "auxiliary_loss_clip": 0.06479903, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259951, + "epoch": 0.3219600180369758, + "flos": 22276686781440.0, + "grad_norm": 2.612968571970558, + "language_loss": 0.83769405, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.91526389, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5355, + "time_per_iteration": 3.985846757888794 + }, + { + "auxiliary_loss_clip": 0.0647967, + "auxiliary_loss_mlp": 0.01272253, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01255528, + "epoch": 0.3220201412896438, + "flos": 14944376672640.0, + "grad_norm": 1.8959584470465125, + "language_loss": 0.71344721, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.79096651, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.1673584, + "step": 5356, + "time_per_iteration": 2.5644400119781494 + }, + { + "auxiliary_loss_clip": 0.06494904, + "auxiliary_loss_mlp": 0.01280986, + "balance_loss_clip": 0.0629259, + "balance_loss_mlp": 0.01263367, + "epoch": 0.32208026454231176, + "flos": 22672807758720.0, + "grad_norm": 2.5335154176231525, + "language_loss": 0.67879629, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.7565552, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.546654224395752 + }, + { + "auxiliary_loss_clip": 0.06384487, + "auxiliary_loss_mlp": 0.01261366, + "balance_loss_clip": 0.06294875, + "balance_loss_mlp": 0.01257649, + "epoch": 0.3221403877949797, + "flos": 64626273308160.0, + "grad_norm": 0.6824166316331671, + "language_loss": 0.58314437, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.65960288, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.03707886, + "step": 5358, + "time_per_iteration": 3.2290756702423096 + }, + { + "auxiliary_loss_clip": 0.06481851, + "auxiliary_loss_mlp": 0.01282518, + "balance_loss_clip": 0.06287378, + "balance_loss_mlp": 0.01264839, + "epoch": 0.3222005110476477, + "flos": 20163212847360.0, + "grad_norm": 1.9186908993809755, + "language_loss": 0.84190667, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.91955042, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.17675781, + "step": 5359, + "time_per_iteration": 2.531033754348755 + }, + { + "auxiliary_loss_clip": 0.06480163, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 0.06287846, + "balance_loss_mlp": 0.01260051, + "epoch": 0.32226063430031565, + "flos": 22680731969280.0, + "grad_norm": 1.6695480137557102, + "language_loss": 0.79997146, + "learning_rate": 3.168912388464595e-06, + "loss": 0.87754452, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17077637, + "step": 5360, + "time_per_iteration": 2.544461727142334 + }, + { + "auxiliary_loss_clip": 0.06382456, + "auxiliary_loss_mlp": 0.01256795, + "balance_loss_clip": 0.06292457, + "balance_loss_mlp": 0.01253353, + "epoch": 0.3223207575529836, + "flos": 63847798151040.0, + "grad_norm": 0.6356253914940931, + "language_loss": 0.56731617, + "learning_rate": 3.168596347256737e-06, + "loss": 0.64370871, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03451538, + "step": 5361, + "time_per_iteration": 3.0336568355560303 + }, + { + "auxiliary_loss_clip": 0.06478466, + "auxiliary_loss_mlp": 0.01277797, + "balance_loss_clip": 0.06288562, + "balance_loss_mlp": 0.01261346, + "epoch": 0.3223808808056516, + "flos": 26877393786240.0, + "grad_norm": 2.167930910708006, + "language_loss": 0.71792114, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79548371, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.16442871, + "step": 5362, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.06483887, + "auxiliary_loss_mlp": 0.01279203, + "balance_loss_clip": 0.06293412, + "balance_loss_mlp": 0.01262692, + "epoch": 0.32244100405831955, + "flos": 26768716640640.0, + "grad_norm": 1.5327886568658977, + "language_loss": 0.73854291, + "learning_rate": 3.167964131913135e-06, + "loss": 0.81617379, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.16503906, + "step": 5363, + "time_per_iteration": 2.583064556121826 + }, + { + "auxiliary_loss_clip": 0.06489229, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06291971, + "balance_loss_mlp": 0.01258717, + "epoch": 0.3225011273109875, + "flos": 23809403266560.0, + "grad_norm": 2.354374584633167, + "language_loss": 0.76664144, + "learning_rate": 3.167647957801365e-06, + "loss": 0.84428835, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.16748047, + "step": 5364, + "time_per_iteration": 2.5177268981933594 + }, + { + "auxiliary_loss_clip": 0.06479897, + "auxiliary_loss_mlp": 0.01275674, + "balance_loss_clip": 0.06290577, + "balance_loss_mlp": 0.01259473, + "epoch": 0.3225612505636555, + "flos": 17280194215680.0, + "grad_norm": 2.1891061142162327, + "language_loss": 0.7715044, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.84906018, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.1619873, + "step": 5365, + "time_per_iteration": 2.5122928619384766 + }, + { + "auxiliary_loss_clip": 0.06484331, + "auxiliary_loss_mlp": 0.01277663, + "balance_loss_clip": 0.06292351, + "balance_loss_mlp": 0.01260711, + "epoch": 0.32262137381632344, + "flos": 23372724113280.0, + "grad_norm": 2.314444268247813, + "language_loss": 0.77153468, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.84915465, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.16955566, + "step": 5366, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.06481092, + "auxiliary_loss_mlp": 0.01280366, + "balance_loss_clip": 0.0629226, + "balance_loss_mlp": 0.0126388, + "epoch": 0.3226814970689914, + "flos": 23265598268160.0, + "grad_norm": 1.8642315088319754, + "language_loss": 0.72423649, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80185115, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.16491699, + "step": 5367, + "time_per_iteration": 2.544145345687866 + }, + { + "auxiliary_loss_clip": 0.06480073, + "auxiliary_loss_mlp": 0.01278287, + "balance_loss_clip": 0.06290721, + "balance_loss_mlp": 0.01262248, + "epoch": 0.32274162032165943, + "flos": 16400127582720.0, + "grad_norm": 1.9542840286813894, + "language_loss": 0.74559301, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82317662, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16033936, + "step": 5368, + "time_per_iteration": 2.4653942584991455 + }, + { + "auxiliary_loss_clip": 0.06481207, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01255874, + "epoch": 0.3228017435743274, + "flos": 27862489912320.0, + "grad_norm": 2.016369988637382, + "language_loss": 0.79033995, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86786628, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.15533447, + "step": 5369, + "time_per_iteration": 2.6923141479492188 + }, + { + "auxiliary_loss_clip": 0.06471382, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 0.0628759, + "balance_loss_mlp": 0.01264567, + "epoch": 0.32286186682699536, + "flos": 19614712020480.0, + "grad_norm": 1.8619928029866217, + "language_loss": 0.83607441, + "learning_rate": 3.16574998372661e-06, + "loss": 0.91358972, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15576172, + "step": 5370, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.06481104, + "auxiliary_loss_mlp": 0.01278081, + "balance_loss_clip": 0.062904, + "balance_loss_mlp": 0.01262703, + "epoch": 0.3229219900796633, + "flos": 24140885218560.0, + "grad_norm": 2.7780356443351146, + "language_loss": 0.83346975, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.91106164, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15368652, + "step": 5371, + "time_per_iteration": 2.554034948348999 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 0.0629211, + "balance_loss_mlp": 0.01260434, + "epoch": 0.3229821133323313, + "flos": 17754454725120.0, + "grad_norm": 2.279534384310274, + "language_loss": 0.89153087, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96917808, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.17663574, + "step": 5372, + "time_per_iteration": 2.468693971633911 + }, + { + "auxiliary_loss_clip": 0.06478924, + "auxiliary_loss_mlp": 0.01278448, + "balance_loss_clip": 0.06288313, + "balance_loss_mlp": 0.01261843, + "epoch": 0.32304223658499925, + "flos": 22352562253440.0, + "grad_norm": 1.986067660558338, + "language_loss": 0.730793, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.80836678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.16601562, + "step": 5373, + "time_per_iteration": 2.5757906436920166 + }, + { + "auxiliary_loss_clip": 0.06476311, + "auxiliary_loss_mlp": 0.0127432, + "balance_loss_clip": 0.06293686, + "balance_loss_mlp": 0.01258227, + "epoch": 0.3231023598376672, + "flos": 18484154006400.0, + "grad_norm": 2.1970042176000963, + "language_loss": 0.82592154, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.90342778, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.1607666, + "step": 5374, + "time_per_iteration": 2.4853713512420654 + }, + { + "auxiliary_loss_clip": 0.06474404, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06291121, + "balance_loss_mlp": 0.0125544, + "epoch": 0.3231624830903352, + "flos": 27643710101760.0, + "grad_norm": 1.9120740622639463, + "language_loss": 0.88405079, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.96150708, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15783691, + "step": 5375, + "time_per_iteration": 2.58644700050354 + }, + { + "auxiliary_loss_clip": 0.06483716, + "auxiliary_loss_mlp": 0.01275166, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01258799, + "epoch": 0.32322260634300315, + "flos": 21732965637120.0, + "grad_norm": 2.2884949024183983, + "language_loss": 0.76224899, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.83983773, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.16345215, + "step": 5376, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.0647772, + "auxiliary_loss_mlp": 0.01272254, + "balance_loss_clip": 0.06289793, + "balance_loss_mlp": 0.01256649, + "epoch": 0.3232827295956711, + "flos": 22644198789120.0, + "grad_norm": 1.5259481118475857, + "language_loss": 0.67275858, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.75025833, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.15594482, + "step": 5377, + "time_per_iteration": 2.592737913131714 + }, + { + "auxiliary_loss_clip": 0.06482306, + "auxiliary_loss_mlp": 0.01279693, + "balance_loss_clip": 0.06294581, + "balance_loss_mlp": 0.01262158, + "epoch": 0.3233428528483391, + "flos": 26329731500160.0, + "grad_norm": 1.747214931760967, + "language_loss": 0.73022175, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.80784178, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17541504, + "step": 5378, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.06476232, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 0.06289409, + "balance_loss_mlp": 0.01256598, + "epoch": 0.32340297610100704, + "flos": 28592818099200.0, + "grad_norm": 2.0362074337070832, + "language_loss": 0.82332939, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90081334, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5379, + "time_per_iteration": 2.661787986755371 + }, + { + "auxiliary_loss_clip": 0.06481552, + "auxiliary_loss_mlp": 0.01276474, + "balance_loss_clip": 0.06288823, + "balance_loss_mlp": 0.01260548, + "epoch": 0.323463099353675, + "flos": 30781664380800.0, + "grad_norm": 1.6212615798097256, + "language_loss": 0.78942055, + "learning_rate": 3.162583158454388e-06, + "loss": 0.86700082, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.15942383, + "step": 5380, + "time_per_iteration": 2.593618631362915 + }, + { + "auxiliary_loss_clip": 0.06489569, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 0.06298643, + "balance_loss_mlp": 0.01255368, + "epoch": 0.32352322260634303, + "flos": 25235664739200.0, + "grad_norm": 1.685322069138263, + "language_loss": 0.77853882, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.85615522, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16699219, + "step": 5381, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01269308, + "balance_loss_clip": 0.06292967, + "balance_loss_mlp": 0.01255438, + "epoch": 0.323583345859011, + "flos": 23337071400960.0, + "grad_norm": 1.9004028984655497, + "language_loss": 0.72391021, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80136859, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.13848877, + "step": 5382, + "time_per_iteration": 2.5095293521881104 + }, + { + "auxiliary_loss_clip": 0.06488711, + "auxiliary_loss_mlp": 0.01277606, + "balance_loss_clip": 0.06295708, + "balance_loss_mlp": 0.01262157, + "epoch": 0.32364346911167896, + "flos": 26213675195520.0, + "grad_norm": 2.3447859303702883, + "language_loss": 0.71528596, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.15441895, + "step": 5383, + "time_per_iteration": 2.5806562900543213 + }, + { + "auxiliary_loss_clip": 0.06476977, + "auxiliary_loss_mlp": 0.01276799, + "balance_loss_clip": 0.06292375, + "balance_loss_mlp": 0.01261564, + "epoch": 0.3237035923643469, + "flos": 23702487056640.0, + "grad_norm": 1.948915226701978, + "language_loss": 0.78857487, + "learning_rate": 3.161315193285283e-06, + "loss": 0.86611259, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.15234375, + "step": 5384, + "time_per_iteration": 2.548797369003296 + }, + { + "auxiliary_loss_clip": 0.06481218, + "auxiliary_loss_mlp": 0.01274762, + "balance_loss_clip": 0.06288576, + "balance_loss_mlp": 0.0125793, + "epoch": 0.3237637156170149, + "flos": 14433960326400.0, + "grad_norm": 1.885180362402172, + "language_loss": 0.75034815, + "learning_rate": 3.16099809186998e-06, + "loss": 0.82790792, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16821289, + "step": 5385, + "time_per_iteration": 2.577547073364258 + }, + { + "auxiliary_loss_clip": 0.06486371, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06298091, + "balance_loss_mlp": 0.01255032, + "epoch": 0.32382383886968286, + "flos": 31070449877760.0, + "grad_norm": 1.8174179211363362, + "language_loss": 0.72224641, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.79981083, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.15032959, + "step": 5386, + "time_per_iteration": 2.585822820663452 + }, + { + "auxiliary_loss_clip": 0.06485418, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.06292341, + "balance_loss_mlp": 0.01256803, + "epoch": 0.3238839621223508, + "flos": 23263418062080.0, + "grad_norm": 3.182973165751226, + "language_loss": 0.95573068, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.03331804, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16503906, + "step": 5387, + "time_per_iteration": 4.075104236602783 + }, + { + "auxiliary_loss_clip": 0.06490889, + "auxiliary_loss_mlp": 0.01270509, + "balance_loss_clip": 0.06298059, + "balance_loss_mlp": 0.0125376, + "epoch": 0.3239440853750188, + "flos": 22971026839680.0, + "grad_norm": 2.142304582151843, + "language_loss": 0.78141761, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.85903162, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.16748047, + "step": 5388, + "time_per_iteration": 2.623976707458496 + }, + { + "auxiliary_loss_clip": 0.06478786, + "auxiliary_loss_mlp": 0.01276501, + "balance_loss_clip": 0.06289905, + "balance_loss_mlp": 0.01259704, + "epoch": 0.32400420862768675, + "flos": 36255394275840.0, + "grad_norm": 1.9954909505528162, + "language_loss": 0.71735168, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79490453, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16796875, + "step": 5389, + "time_per_iteration": 4.133269309997559 + }, + { + "auxiliary_loss_clip": 0.06479806, + "auxiliary_loss_mlp": 0.01273464, + "balance_loss_clip": 0.06294239, + "balance_loss_mlp": 0.01257872, + "epoch": 0.3240643318803547, + "flos": 21622946826240.0, + "grad_norm": 1.7464997421167434, + "language_loss": 0.81443554, + "learning_rate": 3.159411924656557e-06, + "loss": 0.89196825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15588379, + "step": 5390, + "time_per_iteration": 3.9378364086151123 + }, + { + "auxiliary_loss_clip": 0.06491944, + "auxiliary_loss_mlp": 0.01278594, + "balance_loss_clip": 0.06301276, + "balance_loss_mlp": 0.01261296, + "epoch": 0.3241244551330227, + "flos": 23302466864640.0, + "grad_norm": 1.9807661160762629, + "language_loss": 0.73182476, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80953014, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.1730957, + "step": 5391, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.06482222, + "auxiliary_loss_mlp": 0.01278908, + "balance_loss_clip": 0.06294864, + "balance_loss_mlp": 0.0126241, + "epoch": 0.32418457838569065, + "flos": 14101891395840.0, + "grad_norm": 1.5457442510257688, + "language_loss": 0.77541089, + "learning_rate": 3.158777149931855e-06, + "loss": 0.85302216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16491699, + "step": 5392, + "time_per_iteration": 2.486161470413208 + }, + { + "auxiliary_loss_clip": 0.06490408, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.0629712, + "balance_loss_mlp": 0.01261411, + "epoch": 0.3242447016383586, + "flos": 29760454344960.0, + "grad_norm": 1.849936210081937, + "language_loss": 0.63213563, + "learning_rate": 3.158459696652067e-06, + "loss": 0.70982158, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.16760254, + "step": 5393, + "time_per_iteration": 2.5853707790374756 + }, + { + "auxiliary_loss_clip": 0.06489256, + "auxiliary_loss_mlp": 0.01282677, + "balance_loss_clip": 0.06301466, + "balance_loss_mlp": 0.01266011, + "epoch": 0.3243048248910266, + "flos": 24357820239360.0, + "grad_norm": 1.7023503315224988, + "language_loss": 0.82889545, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90661478, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16674805, + "step": 5394, + "time_per_iteration": 3.946955680847168 + }, + { + "auxiliary_loss_clip": 0.06480435, + "auxiliary_loss_mlp": 0.01285084, + "balance_loss_clip": 0.06298714, + "balance_loss_mlp": 0.01269825, + "epoch": 0.3243649481436946, + "flos": 24359958518400.0, + "grad_norm": 2.1573093021253333, + "language_loss": 0.82280314, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90045834, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15270996, + "step": 5395, + "time_per_iteration": 2.537313222885132 + }, + { + "auxiliary_loss_clip": 0.06480338, + "auxiliary_loss_mlp": 0.01292267, + "balance_loss_clip": 0.06300412, + "balance_loss_mlp": 0.01276424, + "epoch": 0.32442507139636256, + "flos": 22931097569280.0, + "grad_norm": 1.7302006802896392, + "language_loss": 0.839818, + "learning_rate": 3.157507073287417e-06, + "loss": 0.91754401, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15844727, + "step": 5396, + "time_per_iteration": 2.6440067291259766 + }, + { + "auxiliary_loss_clip": 0.06491997, + "auxiliary_loss_mlp": 0.01291538, + "balance_loss_clip": 0.06299315, + "balance_loss_mlp": 0.01274121, + "epoch": 0.32448519464903053, + "flos": 22206723022080.0, + "grad_norm": 1.8684779143202024, + "language_loss": 0.76113403, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.83896935, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.17419434, + "step": 5397, + "time_per_iteration": 2.506601095199585 + }, + { + "auxiliary_loss_clip": 0.06473789, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 0.06290997, + "balance_loss_mlp": 0.0127387, + "epoch": 0.3245453179016985, + "flos": 18843574095360.0, + "grad_norm": 2.304762567896747, + "language_loss": 0.67975587, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.75739866, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1661377, + "step": 5398, + "time_per_iteration": 2.50168514251709 + }, + { + "auxiliary_loss_clip": 0.06478744, + "auxiliary_loss_mlp": 0.01288926, + "balance_loss_clip": 0.06293125, + "balance_loss_mlp": 0.01272189, + "epoch": 0.32460544115436646, + "flos": 21184716372480.0, + "grad_norm": 1.3685049489713428, + "language_loss": 0.73232323, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16748047, + "step": 5399, + "time_per_iteration": 2.5114216804504395 + }, + { + "auxiliary_loss_clip": 0.0648094, + "auxiliary_loss_mlp": 0.01289931, + "balance_loss_clip": 0.06293677, + "balance_loss_mlp": 0.01273241, + "epoch": 0.3246655644070344, + "flos": 21987607795200.0, + "grad_norm": 2.072173153822147, + "language_loss": 0.71044981, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.78815848, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16687012, + "step": 5400, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.06480449, + "auxiliary_loss_mlp": 0.01279651, + "balance_loss_clip": 0.06289301, + "balance_loss_mlp": 0.01263355, + "epoch": 0.3247256876597024, + "flos": 32167745020800.0, + "grad_norm": 2.104371315429844, + "language_loss": 0.80626661, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88386756, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16296387, + "step": 5401, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.06483636, + "auxiliary_loss_mlp": 0.01281263, + "balance_loss_clip": 0.06294005, + "balance_loss_mlp": 0.01264073, + "epoch": 0.32478581091237035, + "flos": 21004104896640.0, + "grad_norm": 1.4796090680940444, + "language_loss": 0.87935805, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.95700705, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.17175293, + "step": 5402, + "time_per_iteration": 2.5548956394195557 + }, + { + "auxiliary_loss_clip": 0.06474966, + "auxiliary_loss_mlp": 0.0127368, + "balance_loss_clip": 0.06291528, + "balance_loss_mlp": 0.01258767, + "epoch": 0.3248459341650383, + "flos": 17929741466880.0, + "grad_norm": 2.584856005153906, + "language_loss": 0.85243386, + "learning_rate": 3.155282749751332e-06, + "loss": 0.92992032, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14904785, + "step": 5403, + "time_per_iteration": 2.479205369949341 + }, + { + "auxiliary_loss_clip": 0.06468324, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 0.06290223, + "balance_loss_mlp": 0.01262667, + "epoch": 0.3249060574177063, + "flos": 24542582492160.0, + "grad_norm": 2.1052258035485214, + "language_loss": 0.8828373, + "learning_rate": 3.154964813916007e-06, + "loss": 0.96029389, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14672852, + "step": 5404, + "time_per_iteration": 2.5845093727111816 + }, + { + "auxiliary_loss_clip": 0.06473936, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 0.06291413, + "balance_loss_mlp": 0.01259368, + "epoch": 0.32496618067037425, + "flos": 26001939127680.0, + "grad_norm": 1.6833557203411496, + "language_loss": 0.72900558, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80650264, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1640625, + "step": 5405, + "time_per_iteration": 2.542433500289917 + }, + { + "auxiliary_loss_clip": 0.06474283, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06290333, + "balance_loss_mlp": 0.01258264, + "epoch": 0.3250263039230422, + "flos": 19579939776000.0, + "grad_norm": 1.7320098663924197, + "language_loss": 0.83355331, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.91103297, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15435791, + "step": 5406, + "time_per_iteration": 2.591207265853882 + }, + { + "auxiliary_loss_clip": 0.06474167, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06290454, + "balance_loss_mlp": 0.01254949, + "epoch": 0.3250864271757102, + "flos": 16769232817920.0, + "grad_norm": 2.13827452533593, + "language_loss": 0.87879711, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.95623994, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15161133, + "step": 5407, + "time_per_iteration": 2.4856173992156982 + }, + { + "auxiliary_loss_clip": 0.06469748, + "auxiliary_loss_mlp": 0.01276836, + "balance_loss_clip": 0.06284758, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3251465504283782, + "flos": 27827004908160.0, + "grad_norm": 2.430972813034592, + "language_loss": 0.69975567, + "learning_rate": 3.153692632731479e-06, + "loss": 0.77722144, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15649414, + "step": 5408, + "time_per_iteration": 2.5838799476623535 + }, + { + "auxiliary_loss_clip": 0.06481153, + "auxiliary_loss_mlp": 0.01282988, + "balance_loss_clip": 0.06286341, + "balance_loss_mlp": 0.01267396, + "epoch": 0.32520667368104617, + "flos": 19069271867520.0, + "grad_norm": 3.909403651515765, + "language_loss": 0.78053123, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85817266, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.15588379, + "step": 5409, + "time_per_iteration": 2.5178377628326416 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01272582, + "balance_loss_clip": 0.06286227, + "balance_loss_mlp": 0.01256202, + "epoch": 0.32526679693371413, + "flos": 29388917341440.0, + "grad_norm": 1.8050072916987376, + "language_loss": 0.83473468, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.91219985, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16381836, + "step": 5410, + "time_per_iteration": 2.5948092937469482 + }, + { + "auxiliary_loss_clip": 0.06466505, + "auxiliary_loss_mlp": 0.01275621, + "balance_loss_clip": 0.06286819, + "balance_loss_mlp": 0.01261274, + "epoch": 0.3253269201863821, + "flos": 20710833206400.0, + "grad_norm": 1.580323990141508, + "language_loss": 0.72005814, + "learning_rate": 3.152738037445405e-06, + "loss": 0.79747939, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14349365, + "step": 5411, + "time_per_iteration": 2.515542507171631 + }, + { + "auxiliary_loss_clip": 0.06472497, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06287136, + "balance_loss_mlp": 0.01261632, + "epoch": 0.32538704343905006, + "flos": 29101515436800.0, + "grad_norm": 1.470162471805647, + "language_loss": 0.83496881, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.91246504, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15490723, + "step": 5412, + "time_per_iteration": 2.55008602142334 + }, + { + "auxiliary_loss_clip": 0.06476887, + "auxiliary_loss_mlp": 0.01277617, + "balance_loss_clip": 0.06287435, + "balance_loss_mlp": 0.01260904, + "epoch": 0.325447166691718, + "flos": 24682216521600.0, + "grad_norm": 1.5504273053971407, + "language_loss": 0.8129071, + "learning_rate": 3.152101422008203e-06, + "loss": 0.89045215, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16711426, + "step": 5413, + "time_per_iteration": 2.54195499420166 + }, + { + "auxiliary_loss_clip": 0.06477104, + "auxiliary_loss_mlp": 0.0127801, + "balance_loss_clip": 0.0628976, + "balance_loss_mlp": 0.01261643, + "epoch": 0.325507289944386, + "flos": 21549503122560.0, + "grad_norm": 1.5527044192655586, + "language_loss": 0.76985061, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84740174, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16363525, + "step": 5414, + "time_per_iteration": 2.5435919761657715 + }, + { + "auxiliary_loss_clip": 0.063807, + "auxiliary_loss_mlp": 0.01284661, + "balance_loss_clip": 0.06291388, + "balance_loss_mlp": 0.01280793, + "epoch": 0.32556741319705396, + "flos": 71537893194240.0, + "grad_norm": 0.9015335749308697, + "language_loss": 0.64095414, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.71760774, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.03863525, + "step": 5415, + "time_per_iteration": 3.0875957012176514 + }, + { + "auxiliary_loss_clip": 0.0647157, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 0.06285933, + "balance_loss_mlp": 0.01258845, + "epoch": 0.3256275364497219, + "flos": 23739187944960.0, + "grad_norm": 1.4815485577141352, + "language_loss": 0.74123245, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16235352, + "step": 5416, + "time_per_iteration": 2.5792665481567383 + }, + { + "auxiliary_loss_clip": 0.06381539, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 0.06293018, + "balance_loss_mlp": 0.01262769, + "epoch": 0.3256876597023899, + "flos": 67308136214400.0, + "grad_norm": 0.7704887993649999, + "language_loss": 0.57850802, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.65498912, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.03793335, + "step": 5417, + "time_per_iteration": 3.2770884037017822 + }, + { + "auxiliary_loss_clip": 0.06373264, + "auxiliary_loss_mlp": 0.01258837, + "balance_loss_clip": 0.06284805, + "balance_loss_mlp": 0.01254933, + "epoch": 0.32574778295505785, + "flos": 71304633826560.0, + "grad_norm": 0.8775074523137479, + "language_loss": 0.63674986, + "learning_rate": 3.150509119089975e-06, + "loss": 0.71307087, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03900146, + "step": 5418, + "time_per_iteration": 3.315948724746704 + }, + { + "auxiliary_loss_clip": 0.06476019, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 0.06290952, + "balance_loss_mlp": 0.01258111, + "epoch": 0.3258079062077258, + "flos": 20782515974400.0, + "grad_norm": 1.8847025208507953, + "language_loss": 0.6957128, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.77320766, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.15344238, + "step": 5419, + "time_per_iteration": 2.5722780227661133 + }, + { + "auxiliary_loss_clip": 0.06480842, + "auxiliary_loss_mlp": 0.01275789, + "balance_loss_clip": 0.06291591, + "balance_loss_mlp": 0.01260006, + "epoch": 0.3258680294603938, + "flos": 22241788755840.0, + "grad_norm": 2.023173952709465, + "language_loss": 0.77398664, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.85155296, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.15783691, + "step": 5420, + "time_per_iteration": 2.5199873447418213 + }, + { + "auxiliary_loss_clip": 0.06478356, + "auxiliary_loss_mlp": 0.0127343, + "balance_loss_clip": 0.06290038, + "balance_loss_mlp": 0.0125798, + "epoch": 0.3259281527130618, + "flos": 26987328743040.0, + "grad_norm": 1.5124533627457746, + "language_loss": 0.80826706, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.88578492, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15441895, + "step": 5421, + "time_per_iteration": 2.6014363765716553 + }, + { + "auxiliary_loss_clip": 0.06476312, + "auxiliary_loss_mlp": 0.0127337, + "balance_loss_clip": 0.06293876, + "balance_loss_mlp": 0.01258982, + "epoch": 0.32598827596572977, + "flos": 26221557479040.0, + "grad_norm": 1.4846059645471, + "language_loss": 0.76098251, + "learning_rate": 3.149234491389381e-06, + "loss": 0.8384794, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1439209, + "step": 5422, + "time_per_iteration": 2.5738978385925293 + }, + { + "auxiliary_loss_clip": 0.06480287, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06288645, + "balance_loss_mlp": 0.01255095, + "epoch": 0.32604839921839773, + "flos": 17645567944320.0, + "grad_norm": 2.282982793788361, + "language_loss": 0.63826233, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.71577179, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 5423, + "time_per_iteration": 2.5513644218444824 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01273816, + "balance_loss_clip": 0.06290927, + "balance_loss_mlp": 0.01258748, + "epoch": 0.3261085224710657, + "flos": 23629420696320.0, + "grad_norm": 1.6690467832946037, + "language_loss": 0.75170749, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82914186, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1505127, + "step": 5424, + "time_per_iteration": 2.546074151992798 + }, + { + "auxiliary_loss_clip": 0.06470636, + "auxiliary_loss_mlp": 0.01274311, + "balance_loss_clip": 0.06288706, + "balance_loss_mlp": 0.01258945, + "epoch": 0.32616864572373366, + "flos": 23267526912000.0, + "grad_norm": 1.6415169459291201, + "language_loss": 0.7718606, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.84931004, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15368652, + "step": 5425, + "time_per_iteration": 2.5883710384368896 + }, + { + "auxiliary_loss_clip": 0.06476015, + "auxiliary_loss_mlp": 0.01273254, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32622876897640163, + "flos": 25600535343360.0, + "grad_norm": 2.4681515054731924, + "language_loss": 0.78599709, + "learning_rate": 3.147959166423428e-06, + "loss": 0.86348987, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.16650391, + "step": 5426, + "time_per_iteration": 2.569566488265991 + }, + { + "auxiliary_loss_clip": 0.06473041, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06286261, + "balance_loss_mlp": 0.0125749, + "epoch": 0.3262888922290696, + "flos": 22425544759680.0, + "grad_norm": 1.6671872965592953, + "language_loss": 0.74719262, + "learning_rate": 3.147640226324893e-06, + "loss": 0.82465363, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15563965, + "step": 5427, + "time_per_iteration": 3.941770315170288 + }, + { + "auxiliary_loss_clip": 0.06474692, + "auxiliary_loss_mlp": 0.0127251, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256154, + "epoch": 0.32634901548173756, + "flos": 19724982393600.0, + "grad_norm": 2.0508761677602965, + "language_loss": 0.79472262, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.87219465, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16357422, + "step": 5428, + "time_per_iteration": 3.9950850009918213 + }, + { + "auxiliary_loss_clip": 0.06475013, + "auxiliary_loss_mlp": 0.01275116, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01259309, + "epoch": 0.3264091387344055, + "flos": 16148336463360.0, + "grad_norm": 1.5445825374219135, + "language_loss": 0.71770716, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79520845, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15808105, + "step": 5429, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06468233, + "auxiliary_loss_mlp": 0.01269844, + "balance_loss_clip": 0.06283497, + "balance_loss_mlp": 0.01254466, + "epoch": 0.3264692619870735, + "flos": 16404655703040.0, + "grad_norm": 1.5791835311639297, + "language_loss": 0.78689212, + "learning_rate": 3.146683144965881e-06, + "loss": 0.86427283, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5430, + "time_per_iteration": 2.4873790740966797 + }, + { + "auxiliary_loss_clip": 0.06468185, + "auxiliary_loss_mlp": 0.0127668, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01259561, + "epoch": 0.32652938523974145, + "flos": 22388843871360.0, + "grad_norm": 1.9481749952405665, + "language_loss": 0.84556186, + "learning_rate": 3.146364030865399e-06, + "loss": 0.92301053, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17126465, + "step": 5431, + "time_per_iteration": 2.522075653076172 + }, + { + "auxiliary_loss_clip": 0.06468672, + "auxiliary_loss_mlp": 0.01274085, + "balance_loss_clip": 0.06286903, + "balance_loss_mlp": 0.01259327, + "epoch": 0.3265895084924094, + "flos": 21914499507840.0, + "grad_norm": 1.6266920997971765, + "language_loss": 0.71123517, + "learning_rate": 3.146044873294678e-06, + "loss": 0.78866279, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14758301, + "step": 5432, + "time_per_iteration": 2.513209104537964 + }, + { + "auxiliary_loss_clip": 0.06469099, + "auxiliary_loss_mlp": 0.01272277, + "balance_loss_clip": 0.06282821, + "balance_loss_mlp": 0.01257424, + "epoch": 0.3266496317450774, + "flos": 16072083648000.0, + "grad_norm": 1.3982751613904698, + "language_loss": 0.84207368, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91948748, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.14855957, + "step": 5433, + "time_per_iteration": 2.5324172973632812 + }, + { + "auxiliary_loss_clip": 0.06463822, + "auxiliary_loss_mlp": 0.01279207, + "balance_loss_clip": 0.06283711, + "balance_loss_mlp": 0.01264049, + "epoch": 0.3267097549977454, + "flos": 22534766956800.0, + "grad_norm": 1.4562075652627795, + "language_loss": 0.85916972, + "learning_rate": 3.145406427790931e-06, + "loss": 0.93660003, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15155029, + "step": 5434, + "time_per_iteration": 3.9434614181518555 + }, + { + "auxiliary_loss_clip": 0.06468898, + "auxiliary_loss_mlp": 0.01277076, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.0126134, + "epoch": 0.32676987825041337, + "flos": 27277581686400.0, + "grad_norm": 1.6909362765146225, + "language_loss": 0.88470823, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.96216792, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1574707, + "step": 5435, + "time_per_iteration": 2.5430006980895996 + }, + { + "auxiliary_loss_clip": 0.06469613, + "auxiliary_loss_mlp": 0.01271625, + "balance_loss_clip": 0.06283396, + "balance_loss_mlp": 0.01256306, + "epoch": 0.32683000150308134, + "flos": 11512731432960.0, + "grad_norm": 2.3091497119382733, + "language_loss": 0.77129918, + "learning_rate": 3.144767808551479e-06, + "loss": 0.84871155, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 5436, + "time_per_iteration": 2.486003875732422 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01277236, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01261977, + "epoch": 0.3268901247557493, + "flos": 25637362012800.0, + "grad_norm": 1.5303988762112921, + "language_loss": 0.72448635, + "learning_rate": 3.144448433811134e-06, + "loss": 0.80190074, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15270996, + "step": 5437, + "time_per_iteration": 2.545548915863037 + }, + { + "auxiliary_loss_clip": 0.06472606, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06282267, + "balance_loss_mlp": 0.01258253, + "epoch": 0.32695024800841727, + "flos": 24867356117760.0, + "grad_norm": 1.604360978002023, + "language_loss": 0.64194709, + "learning_rate": 3.144129015673189e-06, + "loss": 0.71942323, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16760254, + "step": 5438, + "time_per_iteration": 2.5657694339752197 + }, + { + "auxiliary_loss_clip": 0.06462848, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01257246, + "epoch": 0.32701037126108523, + "flos": 28846663643520.0, + "grad_norm": 1.637174889107761, + "language_loss": 0.74795192, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.82531083, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15795898, + "step": 5439, + "time_per_iteration": 2.5655689239501953 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01257087, + "epoch": 0.3270704945137532, + "flos": 27972592577280.0, + "grad_norm": 1.745503595629167, + "language_loss": 0.74950606, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1541748, + "step": 5440, + "time_per_iteration": 2.601821184158325 + }, + { + "auxiliary_loss_clip": 0.06460315, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06277528, + "balance_loss_mlp": 0.01254947, + "epoch": 0.32713061776642116, + "flos": 23696575344000.0, + "grad_norm": 1.95462638600934, + "language_loss": 0.84695202, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.92425048, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.14599609, + "step": 5441, + "time_per_iteration": 2.5020570755004883 + }, + { + "auxiliary_loss_clip": 0.06466734, + "auxiliary_loss_mlp": 0.01272021, + "balance_loss_clip": 0.06280614, + "balance_loss_mlp": 0.01256798, + "epoch": 0.3271907410190891, + "flos": 22462203720960.0, + "grad_norm": 1.9620532707625304, + "language_loss": 0.86928713, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.9466747, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15222168, + "step": 5442, + "time_per_iteration": 2.5388059616088867 + }, + { + "auxiliary_loss_clip": 0.06470812, + "auxiliary_loss_mlp": 0.0126936, + "balance_loss_clip": 0.06282146, + "balance_loss_mlp": 0.01254399, + "epoch": 0.3272508642717571, + "flos": 22826696981760.0, + "grad_norm": 1.5979656279548642, + "language_loss": 0.77388418, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.85128593, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.1496582, + "step": 5443, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.0646731, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06280384, + "balance_loss_mlp": 0.01255518, + "epoch": 0.32731098752442506, + "flos": 11806086977280.0, + "grad_norm": 2.2200780771744073, + "language_loss": 0.82818562, + "learning_rate": 3.142211596174343e-06, + "loss": 0.90556955, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15551758, + "step": 5444, + "time_per_iteration": 2.5514841079711914 + }, + { + "auxiliary_loss_clip": 0.06468201, + "auxiliary_loss_mlp": 0.01274937, + "balance_loss_clip": 0.06282412, + "balance_loss_mlp": 0.01258295, + "epoch": 0.327371110777093, + "flos": 21033300844800.0, + "grad_norm": 2.365977713323657, + "language_loss": 0.59248179, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66991317, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16638184, + "step": 5445, + "time_per_iteration": 2.5325539112091064 + }, + { + "auxiliary_loss_clip": 0.06469189, + "auxiliary_loss_mlp": 0.01278146, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01261278, + "epoch": 0.327431234029761, + "flos": 19068055983360.0, + "grad_norm": 2.7570820492615886, + "language_loss": 0.89260846, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.97008175, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.16870117, + "step": 5446, + "time_per_iteration": 2.576833724975586 + }, + { + "auxiliary_loss_clip": 0.06476346, + "auxiliary_loss_mlp": 0.01274903, + "balance_loss_clip": 0.06282137, + "balance_loss_mlp": 0.01257403, + "epoch": 0.32749135728242895, + "flos": 25856435312640.0, + "grad_norm": 1.9641165872810087, + "language_loss": 0.79404771, + "learning_rate": 3.141252301538802e-06, + "loss": 0.87156022, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 5447, + "time_per_iteration": 2.5539090633392334 + }, + { + "auxiliary_loss_clip": 0.06462374, + "auxiliary_loss_mlp": 0.01278273, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263277, + "epoch": 0.327551480535097, + "flos": 20126721594240.0, + "grad_norm": 1.953936246680755, + "language_loss": 0.73150277, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80890924, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.14990234, + "step": 5448, + "time_per_iteration": 2.633612871170044 + }, + { + "auxiliary_loss_clip": 0.06464307, + "auxiliary_loss_mlp": 0.01272265, + "balance_loss_clip": 0.0628064, + "balance_loss_mlp": 0.01256291, + "epoch": 0.32761160378776494, + "flos": 28811094785280.0, + "grad_norm": 1.3623614976773524, + "language_loss": 0.67002481, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.74739063, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15966797, + "step": 5449, + "time_per_iteration": 2.5777859687805176 + }, + { + "auxiliary_loss_clip": 0.0647198, + "auxiliary_loss_mlp": 0.01270062, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01254493, + "epoch": 0.3276717270404329, + "flos": 26944171090560.0, + "grad_norm": 1.378619651715801, + "language_loss": 0.65736711, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.73478758, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15576172, + "step": 5450, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.06468028, + "auxiliary_loss_mlp": 0.01275162, + "balance_loss_clip": 0.06280884, + "balance_loss_mlp": 0.01258509, + "epoch": 0.32773185029310087, + "flos": 25345557768960.0, + "grad_norm": 7.041147023955008, + "language_loss": 0.77832162, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.85575354, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.16662598, + "step": 5451, + "time_per_iteration": 2.572112560272217 + }, + { + "auxiliary_loss_clip": 0.06472664, + "auxiliary_loss_mlp": 0.01278588, + "balance_loss_clip": 0.06283467, + "balance_loss_mlp": 0.01262042, + "epoch": 0.32779197354576883, + "flos": 26398227813120.0, + "grad_norm": 1.9495025825112327, + "language_loss": 0.70696288, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78447533, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16540527, + "step": 5452, + "time_per_iteration": 2.6081676483154297 + }, + { + "auxiliary_loss_clip": 0.0646618, + "auxiliary_loss_mlp": 0.01272924, + "balance_loss_clip": 0.06283787, + "balance_loss_mlp": 0.01256938, + "epoch": 0.3278520967984368, + "flos": 24906237212160.0, + "grad_norm": 1.6132254933408041, + "language_loss": 0.7924304, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.86982143, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15979004, + "step": 5453, + "time_per_iteration": 2.5893869400024414 + }, + { + "auxiliary_loss_clip": 0.06469721, + "auxiliary_loss_mlp": 0.01274795, + "balance_loss_clip": 0.06282013, + "balance_loss_mlp": 0.01259309, + "epoch": 0.32791222005110476, + "flos": 29760831688320.0, + "grad_norm": 2.0442879632543476, + "language_loss": 0.758448, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.83589315, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.15478516, + "step": 5454, + "time_per_iteration": 2.590080499649048 + }, + { + "auxiliary_loss_clip": 0.06461332, + "auxiliary_loss_mlp": 0.01271865, + "balance_loss_clip": 0.06280516, + "balance_loss_mlp": 0.01257536, + "epoch": 0.32797234330377273, + "flos": 16513584410880.0, + "grad_norm": 2.183253633037468, + "language_loss": 0.77119774, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.8485297, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14318848, + "step": 5455, + "time_per_iteration": 2.4873318672180176 + }, + { + "auxiliary_loss_clip": 0.06482153, + "auxiliary_loss_mlp": 0.01285817, + "balance_loss_clip": 0.06290287, + "balance_loss_mlp": 0.01268377, + "epoch": 0.3280324665564407, + "flos": 26585086417920.0, + "grad_norm": 1.6915080932551223, + "language_loss": 0.74407738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.82175708, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.17443848, + "step": 5456, + "time_per_iteration": 2.593258857727051 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01277637, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.01261306, + "epoch": 0.32809258980910866, + "flos": 22936631938560.0, + "grad_norm": 1.4862092693082851, + "language_loss": 0.78666067, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.8641206, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.16345215, + "step": 5457, + "time_per_iteration": 2.523540496826172 + }, + { + "auxiliary_loss_clip": 0.06473868, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06281006, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3281527130617766, + "flos": 22790457290880.0, + "grad_norm": 2.0769759307730644, + "language_loss": 0.78958774, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.86707151, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.15795898, + "step": 5458, + "time_per_iteration": 2.552680015563965 + }, + { + "auxiliary_loss_clip": 0.06469774, + "auxiliary_loss_mlp": 0.01274499, + "balance_loss_clip": 0.06284518, + "balance_loss_mlp": 0.01258215, + "epoch": 0.3282128363144446, + "flos": 21256902264960.0, + "grad_norm": 1.5512978296749391, + "language_loss": 0.73655844, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.8140012, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.1628418, + "step": 5459, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.0647283, + "auxiliary_loss_mlp": 0.01274033, + "balance_loss_clip": 0.0628351, + "balance_loss_mlp": 0.01257761, + "epoch": 0.32827295956711255, + "flos": 30850328401920.0, + "grad_norm": 2.2277675097031993, + "language_loss": 0.84476066, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.92222929, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.16271973, + "step": 5460, + "time_per_iteration": 2.6067721843719482 + }, + { + "auxiliary_loss_clip": 0.06469227, + "auxiliary_loss_mlp": 0.01276293, + "balance_loss_clip": 0.06282166, + "balance_loss_mlp": 0.01260319, + "epoch": 0.3283330828197806, + "flos": 25921032410880.0, + "grad_norm": 2.3722751928185297, + "language_loss": 0.78114808, + "learning_rate": 3.136770448642288e-06, + "loss": 0.8586033, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15991211, + "step": 5461, + "time_per_iteration": 2.550417184829712 + }, + { + "auxiliary_loss_clip": 0.06469681, + "auxiliary_loss_mlp": 0.01279493, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01261361, + "epoch": 0.32839320607244854, + "flos": 38591295672960.0, + "grad_norm": 1.5965953358146812, + "language_loss": 0.62925887, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.70675063, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.18115234, + "step": 5462, + "time_per_iteration": 2.7004194259643555 + }, + { + "auxiliary_loss_clip": 0.06467308, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 0.06284478, + "balance_loss_mlp": 0.01265077, + "epoch": 0.3284533293251165, + "flos": 26658068924160.0, + "grad_norm": 1.3126719376538145, + "language_loss": 0.78502059, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.86250222, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15783691, + "step": 5463, + "time_per_iteration": 2.6072070598602295 + }, + { + "auxiliary_loss_clip": 0.0647091, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06283993, + "balance_loss_mlp": 0.01256589, + "epoch": 0.32851345257778447, + "flos": 15309498839040.0, + "grad_norm": 1.727782559794916, + "language_loss": 0.70068884, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77812445, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.16064453, + "step": 5464, + "time_per_iteration": 2.519319534301758 + }, + { + "auxiliary_loss_clip": 0.06466094, + "auxiliary_loss_mlp": 0.01275271, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01257938, + "epoch": 0.32857357583045244, + "flos": 23520491988480.0, + "grad_norm": 1.6619431416557902, + "language_loss": 0.72759986, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.80501354, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.17333984, + "step": 5465, + "time_per_iteration": 2.573444366455078 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01265509, + "epoch": 0.3286336990831204, + "flos": 21001379639040.0, + "grad_norm": 1.5232981833560715, + "language_loss": 0.82967317, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.90722907, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16271973, + "step": 5466, + "time_per_iteration": 4.012515306472778 + }, + { + "auxiliary_loss_clip": 0.0647275, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.06285034, + "balance_loss_mlp": 0.01254932, + "epoch": 0.32869382233578837, + "flos": 23665450752000.0, + "grad_norm": 1.6606265994221874, + "language_loss": 0.79192597, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86936402, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5467, + "time_per_iteration": 4.000247955322266 + }, + { + "auxiliary_loss_clip": 0.06467809, + "auxiliary_loss_mlp": 0.01271951, + "balance_loss_clip": 0.06279044, + "balance_loss_mlp": 0.01255333, + "epoch": 0.32875394558845633, + "flos": 25343335635840.0, + "grad_norm": 1.5510134892276737, + "language_loss": 0.74865687, + "learning_rate": 3.134526351787587e-06, + "loss": 0.82605445, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.16601562, + "step": 5468, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.06474267, + "auxiliary_loss_mlp": 0.01276703, + "balance_loss_clip": 0.0628129, + "balance_loss_mlp": 0.01259108, + "epoch": 0.3288140688411243, + "flos": 14908430471040.0, + "grad_norm": 1.672146103500693, + "language_loss": 0.78728724, + "learning_rate": 3.134205594339942e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.17614746, + "step": 5469, + "time_per_iteration": 3.955373525619507 + }, + { + "auxiliary_loss_clip": 0.06466976, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01257224, + "epoch": 0.32887419209379226, + "flos": 18557220366720.0, + "grad_norm": 1.6018901390748483, + "language_loss": 0.82183433, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89923656, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16015625, + "step": 5470, + "time_per_iteration": 2.5481319427490234 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01271427, + "balance_loss_clip": 0.06279681, + "balance_loss_mlp": 0.01254869, + "epoch": 0.3289343153464602, + "flos": 48116560913280.0, + "grad_norm": 1.6166643495117736, + "language_loss": 0.68441176, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76180226, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.16564941, + "step": 5471, + "time_per_iteration": 2.780454158782959 + }, + { + "auxiliary_loss_clip": 0.06479289, + "auxiliary_loss_mlp": 0.012789, + "balance_loss_clip": 0.06285035, + "balance_loss_mlp": 0.01260637, + "epoch": 0.3289944385991282, + "flos": 27607763900160.0, + "grad_norm": 1.5078842371471577, + "language_loss": 0.65564525, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.73322713, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.18273926, + "step": 5472, + "time_per_iteration": 2.580644369125366 + }, + { + "auxiliary_loss_clip": 0.06472386, + "auxiliary_loss_mlp": 0.01277133, + "balance_loss_clip": 0.06281875, + "balance_loss_mlp": 0.01259144, + "epoch": 0.32905456185179616, + "flos": 20126470032000.0, + "grad_norm": 1.614198879205061, + "language_loss": 0.88538003, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96287525, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17993164, + "step": 5473, + "time_per_iteration": 4.021254062652588 + }, + { + "auxiliary_loss_clip": 0.06475069, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285396, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3291146851044642, + "flos": 23186075143680.0, + "grad_norm": 1.7643015597930078, + "language_loss": 0.78719336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86464679, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16552734, + "step": 5474, + "time_per_iteration": 2.5416688919067383 + }, + { + "auxiliary_loss_clip": 0.06379573, + "auxiliary_loss_mlp": 0.0134405, + "balance_loss_clip": 0.06291323, + "balance_loss_mlp": 0.01340224, + "epoch": 0.32917480835713214, + "flos": 67641630664320.0, + "grad_norm": 0.8577160187921843, + "language_loss": 0.60258645, + "learning_rate": 3.132280146886911e-06, + "loss": 0.67982268, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.03820801, + "step": 5475, + "time_per_iteration": 3.1267805099487305 + }, + { + "auxiliary_loss_clip": 0.06479369, + "auxiliary_loss_mlp": 0.01279647, + "balance_loss_clip": 0.06284596, + "balance_loss_mlp": 0.01261599, + "epoch": 0.3292349316098001, + "flos": 27971963671680.0, + "grad_norm": 3.252822648856248, + "language_loss": 0.7712574, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84884757, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.18041992, + "step": 5476, + "time_per_iteration": 2.5819692611694336 + }, + { + "auxiliary_loss_clip": 0.06469015, + "auxiliary_loss_mlp": 0.01275163, + "balance_loss_clip": 0.06282525, + "balance_loss_mlp": 0.01258956, + "epoch": 0.3292950548624681, + "flos": 20269416297600.0, + "grad_norm": 1.7333439092472165, + "language_loss": 0.7556808, + "learning_rate": 3.131637987449997e-06, + "loss": 0.83312255, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.1619873, + "step": 5477, + "time_per_iteration": 2.532106637954712 + }, + { + "auxiliary_loss_clip": 0.06470291, + "auxiliary_loss_mlp": 0.01275718, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01259541, + "epoch": 0.32935517811513604, + "flos": 20819174935680.0, + "grad_norm": 2.104456143380591, + "language_loss": 0.75728148, + "learning_rate": 3.131316843357713e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16174316, + "step": 5478, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.06470281, + "auxiliary_loss_mlp": 0.01278094, + "balance_loss_clip": 0.06287058, + "balance_loss_mlp": 0.01261631, + "epoch": 0.329415301367804, + "flos": 18447704680320.0, + "grad_norm": 2.368560120299576, + "language_loss": 0.80772918, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8852129, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16455078, + "step": 5479, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.06362775, + "auxiliary_loss_mlp": 0.01272199, + "balance_loss_clip": 0.06275004, + "balance_loss_mlp": 0.01268579, + "epoch": 0.32947542462047197, + "flos": 66344967930240.0, + "grad_norm": 0.7366188072531391, + "language_loss": 0.56333017, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.63967991, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.03616333, + "step": 5480, + "time_per_iteration": 3.2369706630706787 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.01278618, + "balance_loss_clip": 0.06290235, + "balance_loss_mlp": 0.01262179, + "epoch": 0.32953554787313993, + "flos": 23228268474240.0, + "grad_norm": 1.631877255513098, + "language_loss": 0.7736274, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.85118574, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16442871, + "step": 5481, + "time_per_iteration": 2.5206968784332275 + }, + { + "auxiliary_loss_clip": 0.06479073, + "auxiliary_loss_mlp": 0.01277292, + "balance_loss_clip": 0.0628771, + "balance_loss_mlp": 0.01260686, + "epoch": 0.3295956711258079, + "flos": 27015686150400.0, + "grad_norm": 1.3752047504599005, + "language_loss": 0.78639877, + "learning_rate": 3.130031838113899e-06, + "loss": 0.86396235, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.16601562, + "step": 5482, + "time_per_iteration": 2.604720115661621 + }, + { + "auxiliary_loss_clip": 0.06475698, + "auxiliary_loss_mlp": 0.01274916, + "balance_loss_clip": 0.06286834, + "balance_loss_mlp": 0.01258274, + "epoch": 0.32965579437847586, + "flos": 19177697450880.0, + "grad_norm": 2.0027782692889358, + "language_loss": 0.74399549, + "learning_rate": 3.129710479645185e-06, + "loss": 0.82150161, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16662598, + "step": 5483, + "time_per_iteration": 2.5124409198760986 + }, + { + "auxiliary_loss_clip": 0.06472629, + "auxiliary_loss_mlp": 0.01273838, + "balance_loss_clip": 0.06286867, + "balance_loss_mlp": 0.01258472, + "epoch": 0.32971591763114383, + "flos": 30490447115520.0, + "grad_norm": 1.7640387903996015, + "language_loss": 0.7588225, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83628714, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15368652, + "step": 5484, + "time_per_iteration": 2.64021635055542 + }, + { + "auxiliary_loss_clip": 0.06469439, + "auxiliary_loss_mlp": 0.01274788, + "balance_loss_clip": 0.06284587, + "balance_loss_mlp": 0.01259232, + "epoch": 0.3297760408838118, + "flos": 16295140016640.0, + "grad_norm": 1.7787654746377481, + "language_loss": 0.72680974, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15563965, + "step": 5485, + "time_per_iteration": 2.516080379486084 + }, + { + "auxiliary_loss_clip": 0.06466281, + "auxiliary_loss_mlp": 0.01274799, + "balance_loss_clip": 0.06281459, + "balance_loss_mlp": 0.0125991, + "epoch": 0.32983616413647976, + "flos": 29538194590080.0, + "grad_norm": 2.336444213272706, + "language_loss": 0.80720758, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8846184, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5486, + "time_per_iteration": 2.633730173110962 + }, + { + "auxiliary_loss_clip": 0.06467714, + "auxiliary_loss_mlp": 0.01276658, + "balance_loss_clip": 0.06283799, + "balance_loss_mlp": 0.01261828, + "epoch": 0.3298962873891478, + "flos": 20637682992000.0, + "grad_norm": 1.9361428819205904, + "language_loss": 0.84726417, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.92470789, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14819336, + "step": 5487, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.06473765, + "auxiliary_loss_mlp": 0.01275689, + "balance_loss_clip": 0.06283425, + "balance_loss_mlp": 0.01258845, + "epoch": 0.32995641064181574, + "flos": 14981329123200.0, + "grad_norm": 2.0510786453666707, + "language_loss": 0.74805683, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.82555139, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.16833496, + "step": 5488, + "time_per_iteration": 2.5195999145507812 + }, + { + "auxiliary_loss_clip": 0.06475645, + "auxiliary_loss_mlp": 0.01276585, + "balance_loss_clip": 0.06288432, + "balance_loss_mlp": 0.012611, + "epoch": 0.3300165338944837, + "flos": 18667448812800.0, + "grad_norm": 2.2567239989743912, + "language_loss": 0.73048651, + "learning_rate": 3.127781429646098e-06, + "loss": 0.80800879, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.15466309, + "step": 5489, + "time_per_iteration": 2.489529609680176 + }, + { + "auxiliary_loss_clip": 0.06468415, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 0.06282636, + "balance_loss_mlp": 0.01260987, + "epoch": 0.3300766571471517, + "flos": 25589215042560.0, + "grad_norm": 2.1838257682132256, + "language_loss": 0.89381063, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97125351, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.14898682, + "step": 5490, + "time_per_iteration": 2.583505153656006 + }, + { + "auxiliary_loss_clip": 0.06470391, + "auxiliary_loss_mlp": 0.01273693, + "balance_loss_clip": 0.06285221, + "balance_loss_mlp": 0.01258339, + "epoch": 0.33013678039981964, + "flos": 11368150012800.0, + "grad_norm": 1.8708534793530802, + "language_loss": 0.82974613, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90718699, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15344238, + "step": 5491, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.06473103, + "auxiliary_loss_mlp": 0.01274646, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.01258589, + "epoch": 0.3301969036524876, + "flos": 24827175285120.0, + "grad_norm": 1.8609460693795263, + "language_loss": 0.77910721, + "learning_rate": 3.126816327146554e-06, + "loss": 0.85658479, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16052246, + "step": 5492, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.06478797, + "auxiliary_loss_mlp": 0.01277822, + "balance_loss_clip": 0.06287751, + "balance_loss_mlp": 0.01261324, + "epoch": 0.33025702690515557, + "flos": 15966634884480.0, + "grad_norm": 2.4722908606070875, + "language_loss": 0.75614154, + "learning_rate": 3.12649454083913e-06, + "loss": 0.83370769, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.16503906, + "step": 5493, + "time_per_iteration": 2.489143133163452 + }, + { + "auxiliary_loss_clip": 0.06366986, + "auxiliary_loss_mlp": 0.01258616, + "balance_loss_clip": 0.06280049, + "balance_loss_mlp": 0.0125515, + "epoch": 0.33031715015782354, + "flos": 59435794540800.0, + "grad_norm": 0.7878547289977352, + "language_loss": 0.54030049, + "learning_rate": 3.12617271181492e-06, + "loss": 0.61655653, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.03475952, + "step": 5494, + "time_per_iteration": 3.0869832038879395 + }, + { + "auxiliary_loss_clip": 0.06482484, + "auxiliary_loss_mlp": 0.01281394, + "balance_loss_clip": 0.0629174, + "balance_loss_mlp": 0.01264753, + "epoch": 0.3303772734104915, + "flos": 23190896753280.0, + "grad_norm": 1.4215593277180028, + "language_loss": 0.87367666, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9513154, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16625977, + "step": 5495, + "time_per_iteration": 2.5188820362091064 + }, + { + "auxiliary_loss_clip": 0.06477214, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01260038, + "epoch": 0.33043739666315947, + "flos": 33080068275840.0, + "grad_norm": 2.0083800771900995, + "language_loss": 0.74168754, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.81923461, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17443848, + "step": 5496, + "time_per_iteration": 2.6151347160339355 + }, + { + "auxiliary_loss_clip": 0.06470463, + "auxiliary_loss_mlp": 0.01272194, + "balance_loss_clip": 0.0628539, + "balance_loss_mlp": 0.01256434, + "epoch": 0.33049751991582743, + "flos": 24901625237760.0, + "grad_norm": 1.9468549986980455, + "language_loss": 0.72676557, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80419219, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15759277, + "step": 5497, + "time_per_iteration": 2.51874041557312 + }, + { + "auxiliary_loss_clip": 0.06472345, + "auxiliary_loss_mlp": 0.0127459, + "balance_loss_clip": 0.06286049, + "balance_loss_mlp": 0.01259343, + "epoch": 0.3305576431684954, + "flos": 29468272757760.0, + "grad_norm": 1.8137955115189202, + "language_loss": 0.80825889, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88572824, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 5498, + "time_per_iteration": 2.6010656356811523 + }, + { + "auxiliary_loss_clip": 0.06476308, + "auxiliary_loss_mlp": 0.0127559, + "balance_loss_clip": 0.0628619, + "balance_loss_mlp": 0.01258281, + "epoch": 0.33061776642116336, + "flos": 22637951660160.0, + "grad_norm": 1.8227647554707032, + "language_loss": 0.76843095, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84594989, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.1730957, + "step": 5499, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06475572, + "auxiliary_loss_mlp": 0.01277032, + "balance_loss_clip": 0.06287447, + "balance_loss_mlp": 0.01260832, + "epoch": 0.3306778896738313, + "flos": 25783536660480.0, + "grad_norm": 1.5377855738322084, + "language_loss": 0.79203349, + "learning_rate": 3.124240841300681e-06, + "loss": 0.86955953, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16186523, + "step": 5500, + "time_per_iteration": 2.5970370769500732 + }, + { + "auxiliary_loss_clip": 0.0648918, + "auxiliary_loss_mlp": 0.01275283, + "balance_loss_clip": 0.06298861, + "balance_loss_mlp": 0.01257544, + "epoch": 0.33073801292649935, + "flos": 36949566625920.0, + "grad_norm": 1.9211086255091194, + "language_loss": 0.66916561, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7468102, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.17724609, + "step": 5501, + "time_per_iteration": 2.687847375869751 + }, + { + "auxiliary_loss_clip": 0.06481969, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06291866, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3307981361791673, + "flos": 12972465411840.0, + "grad_norm": 2.0893698607967957, + "language_loss": 0.77978551, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.85733795, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.17504883, + "step": 5502, + "time_per_iteration": 2.500303268432617 + }, + { + "auxiliary_loss_clip": 0.06481159, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06290131, + "balance_loss_mlp": 0.01256424, + "epoch": 0.3308582594318353, + "flos": 25381420116480.0, + "grad_norm": 1.7450780858535315, + "language_loss": 0.72841054, + "learning_rate": 3.123274330355824e-06, + "loss": 0.80596423, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.17773438, + "step": 5503, + "time_per_iteration": 2.5851874351501465 + }, + { + "auxiliary_loss_clip": 0.06475106, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.06287622, + "balance_loss_mlp": 0.01257769, + "epoch": 0.33091838268450324, + "flos": 26475738439680.0, + "grad_norm": 1.4901464435255347, + "language_loss": 0.7565586, + "learning_rate": 3.12295207483523e-06, + "loss": 0.83405411, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16674805, + "step": 5504, + "time_per_iteration": 2.5670559406280518 + }, + { + "auxiliary_loss_clip": 0.06476955, + "auxiliary_loss_mlp": 0.01276594, + "balance_loss_clip": 0.06289346, + "balance_loss_mlp": 0.01261025, + "epoch": 0.3309785059371712, + "flos": 24977836126080.0, + "grad_norm": 1.5646403370775293, + "language_loss": 0.70214427, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.77967972, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15545654, + "step": 5505, + "time_per_iteration": 2.628267288208008 + }, + { + "auxiliary_loss_clip": 0.06474259, + "auxiliary_loss_mlp": 0.01275018, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01258543, + "epoch": 0.3310386291898392, + "flos": 20452585322880.0, + "grad_norm": 1.7982072656373813, + "language_loss": 0.8240785, + "learning_rate": 3.122307436058899e-06, + "loss": 0.90157127, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.16467285, + "step": 5506, + "time_per_iteration": 4.10949444770813 + }, + { + "auxiliary_loss_clip": 0.06476486, + "auxiliary_loss_mlp": 0.01275135, + "balance_loss_clip": 0.0628888, + "balance_loss_mlp": 0.01258428, + "epoch": 0.33109875244250714, + "flos": 23188926182400.0, + "grad_norm": 1.740251919086934, + "language_loss": 0.79860532, + "learning_rate": 3.121985052827606e-06, + "loss": 0.87612152, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16705322, + "step": 5507, + "time_per_iteration": 4.12217903137207 + }, + { + "auxiliary_loss_clip": 0.06468768, + "auxiliary_loss_mlp": 0.01276749, + "balance_loss_clip": 0.06281893, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3311588756951751, + "flos": 24174902776320.0, + "grad_norm": 1.6433149866128014, + "language_loss": 0.71967649, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.79713166, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.1628418, + "step": 5508, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.06468692, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 0.06284875, + "balance_loss_mlp": 0.01256788, + "epoch": 0.33121899894784307, + "flos": 28152994417920.0, + "grad_norm": 1.6757523088462936, + "language_loss": 0.71588784, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79329687, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15429688, + "step": 5509, + "time_per_iteration": 3.976996660232544 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01279857, + "balance_loss_clip": 0.06286702, + "balance_loss_mlp": 0.01264396, + "epoch": 0.33127912220051103, + "flos": 29574979332480.0, + "grad_norm": 1.5753317257606638, + "language_loss": 0.73806137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.81557631, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15460205, + "step": 5510, + "time_per_iteration": 2.576838731765747 + }, + { + "auxiliary_loss_clip": 0.06473264, + "auxiliary_loss_mlp": 0.01276647, + "balance_loss_clip": 0.06286872, + "balance_loss_mlp": 0.01261019, + "epoch": 0.331339245453179, + "flos": 14434086107520.0, + "grad_norm": 2.529546935928515, + "language_loss": 0.88507652, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96257567, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.15612793, + "step": 5511, + "time_per_iteration": 2.550442695617676 + }, + { + "auxiliary_loss_clip": 0.06464168, + "auxiliary_loss_mlp": 0.01275515, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01260494, + "epoch": 0.33139936870584696, + "flos": 20893499107200.0, + "grad_norm": 1.6341387009287651, + "language_loss": 0.73559558, + "learning_rate": 3.12037249872891e-06, + "loss": 0.81299245, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15026855, + "step": 5512, + "time_per_iteration": 2.5596871376037598 + }, + { + "auxiliary_loss_clip": 0.06468001, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 0.06286225, + "balance_loss_mlp": 0.01262438, + "epoch": 0.33145949195851493, + "flos": 36293352975360.0, + "grad_norm": 1.8738374179289, + "language_loss": 0.72677827, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.80424166, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15905762, + "step": 5513, + "time_per_iteration": 4.148774147033691 + }, + { + "auxiliary_loss_clip": 0.06472933, + "auxiliary_loss_mlp": 0.01275876, + "balance_loss_clip": 0.06284368, + "balance_loss_mlp": 0.0125958, + "epoch": 0.33151961521118295, + "flos": 14284431515520.0, + "grad_norm": 1.8311253656567958, + "language_loss": 0.69026303, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.7677511, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16296387, + "step": 5514, + "time_per_iteration": 2.486818313598633 + }, + { + "auxiliary_loss_clip": 0.06477968, + "auxiliary_loss_mlp": 0.0127816, + "balance_loss_clip": 0.06291951, + "balance_loss_mlp": 0.01261089, + "epoch": 0.3315797384638509, + "flos": 20780126133120.0, + "grad_norm": 1.9656560392088134, + "language_loss": 0.66393441, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.74149573, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.17053223, + "step": 5515, + "time_per_iteration": 2.531658411026001 + }, + { + "auxiliary_loss_clip": 0.06473279, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01258885, + "epoch": 0.3316398617165189, + "flos": 24686115736320.0, + "grad_norm": 3.8914339391091732, + "language_loss": 0.69369388, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.77117789, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16235352, + "step": 5516, + "time_per_iteration": 2.5392425060272217 + }, + { + "auxiliary_loss_clip": 0.06476592, + "auxiliary_loss_mlp": 0.01274968, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.0125959, + "epoch": 0.33169998496918685, + "flos": 18593879328000.0, + "grad_norm": 2.757231582138207, + "language_loss": 0.80914545, + "learning_rate": 3.118758882514359e-06, + "loss": 0.88666099, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.15368652, + "step": 5517, + "time_per_iteration": 2.4851818084716797 + }, + { + "auxiliary_loss_clip": 0.06465174, + "auxiliary_loss_mlp": 0.01279818, + "balance_loss_clip": 0.06284687, + "balance_loss_mlp": 0.01264142, + "epoch": 0.3317601082218548, + "flos": 20199871808640.0, + "grad_norm": 1.6705032998917397, + "language_loss": 0.74656814, + "learning_rate": 3.118436031952143e-06, + "loss": 0.82401806, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15686035, + "step": 5518, + "time_per_iteration": 2.518036127090454 + }, + { + "auxiliary_loss_clip": 0.06372921, + "auxiliary_loss_mlp": 0.01283465, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01279764, + "epoch": 0.3318202314745228, + "flos": 68995119265920.0, + "grad_norm": 0.7149144856696655, + "language_loss": 0.54263318, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.61919701, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03692627, + "step": 5519, + "time_per_iteration": 3.246586322784424 + }, + { + "auxiliary_loss_clip": 0.06472577, + "auxiliary_loss_mlp": 0.01276695, + "balance_loss_clip": 0.06288108, + "balance_loss_mlp": 0.0125966, + "epoch": 0.33188035472719074, + "flos": 21505381148160.0, + "grad_norm": 2.182658812554146, + "language_loss": 0.79452467, + "learning_rate": 3.117790203606336e-06, + "loss": 0.87201744, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.17028809, + "step": 5520, + "time_per_iteration": 2.517853260040283 + }, + { + "auxiliary_loss_clip": 0.06465811, + "auxiliary_loss_mlp": 0.01271287, + "balance_loss_clip": 0.06283027, + "balance_loss_mlp": 0.01256279, + "epoch": 0.3319404779798587, + "flos": 28877033548800.0, + "grad_norm": 1.8300903967069966, + "language_loss": 0.77067709, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.84804809, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15002441, + "step": 5521, + "time_per_iteration": 2.555697441101074 + }, + { + "auxiliary_loss_clip": 0.06478226, + "auxiliary_loss_mlp": 0.01278256, + "balance_loss_clip": 0.06288885, + "balance_loss_mlp": 0.01261542, + "epoch": 0.33200060123252667, + "flos": 23083770908160.0, + "grad_norm": 1.9119948906690396, + "language_loss": 0.70441258, + "learning_rate": 3.117144205713664e-06, + "loss": 0.78197736, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16699219, + "step": 5522, + "time_per_iteration": 2.5673933029174805 + }, + { + "auxiliary_loss_clip": 0.06474358, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06290573, + "balance_loss_mlp": 0.01255255, + "epoch": 0.33206072448519464, + "flos": 21148895952000.0, + "grad_norm": 1.6906348218339255, + "language_loss": 0.74640656, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82386148, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 5523, + "time_per_iteration": 2.516275405883789 + }, + { + "auxiliary_loss_clip": 0.06473421, + "auxiliary_loss_mlp": 0.01271212, + "balance_loss_clip": 0.06292297, + "balance_loss_mlp": 0.01255763, + "epoch": 0.3321208477378626, + "flos": 13084161304320.0, + "grad_norm": 2.1726495268835024, + "language_loss": 0.82172406, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8991704, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15454102, + "step": 5524, + "time_per_iteration": 2.557941198348999 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06289522, + "balance_loss_mlp": 0.01257251, + "epoch": 0.33218097099053057, + "flos": 21221836531200.0, + "grad_norm": 1.6566666481357326, + "language_loss": 0.83100772, + "learning_rate": 3.116174891188636e-06, + "loss": 0.90847051, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15649414, + "step": 5525, + "time_per_iteration": 2.527944564819336 + }, + { + "auxiliary_loss_clip": 0.06379532, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06292765, + "balance_loss_mlp": 0.01261484, + "epoch": 0.33224109424319853, + "flos": 64369954068480.0, + "grad_norm": 0.7407224947932968, + "language_loss": 0.52533764, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.60178727, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.03945923, + "step": 5526, + "time_per_iteration": 3.1679162979125977 + }, + { + "auxiliary_loss_clip": 0.0647909, + "auxiliary_loss_mlp": 0.01274604, + "balance_loss_clip": 0.06291543, + "balance_loss_mlp": 0.01258391, + "epoch": 0.33230121749586655, + "flos": 17351457713280.0, + "grad_norm": 1.970764365513445, + "language_loss": 0.79041827, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86795521, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 5527, + "time_per_iteration": 2.5327274799346924 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01278538, + "balance_loss_clip": 0.06294803, + "balance_loss_mlp": 0.01263458, + "epoch": 0.3323613407485345, + "flos": 21003517918080.0, + "grad_norm": 1.6591522480418575, + "language_loss": 0.72383821, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80139363, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15063477, + "step": 5528, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.06477713, + "auxiliary_loss_mlp": 0.01274869, + "balance_loss_clip": 0.06292165, + "balance_loss_mlp": 0.01259396, + "epoch": 0.3324214640012025, + "flos": 13157688862080.0, + "grad_norm": 1.8543805866880412, + "language_loss": 0.8336091, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.91113496, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.15466309, + "step": 5529, + "time_per_iteration": 2.5001087188720703 + }, + { + "auxiliary_loss_clip": 0.06479646, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254587, + "epoch": 0.33248158725387045, + "flos": 22280124798720.0, + "grad_norm": 1.7380748666321508, + "language_loss": 0.70133483, + "learning_rate": 3.114558520634423e-06, + "loss": 0.77883273, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.15551758, + "step": 5530, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01275357, + "balance_loss_clip": 0.06291899, + "balance_loss_mlp": 0.01258751, + "epoch": 0.3325417105065384, + "flos": 20747324459520.0, + "grad_norm": 2.7342028000668552, + "language_loss": 0.77694213, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.85449082, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16589355, + "step": 5531, + "time_per_iteration": 2.5307323932647705 + }, + { + "auxiliary_loss_clip": 0.06477839, + "auxiliary_loss_mlp": 0.01280766, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01263552, + "epoch": 0.3326018337592064, + "flos": 24797476212480.0, + "grad_norm": 1.9473942094883194, + "language_loss": 0.73779702, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.81538308, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.17211914, + "step": 5532, + "time_per_iteration": 2.5989890098571777 + }, + { + "auxiliary_loss_clip": 0.06472681, + "auxiliary_loss_mlp": 0.01278728, + "balance_loss_clip": 0.06288014, + "balance_loss_mlp": 0.01263147, + "epoch": 0.33266195701187434, + "flos": 14506942832640.0, + "grad_norm": 1.825417572799306, + "language_loss": 0.66042602, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.73794013, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15576172, + "step": 5533, + "time_per_iteration": 2.47566294670105 + }, + { + "auxiliary_loss_clip": 0.06474279, + "auxiliary_loss_mlp": 0.012755, + "balance_loss_clip": 0.06289338, + "balance_loss_mlp": 0.01258954, + "epoch": 0.3327220802645423, + "flos": 15309792328320.0, + "grad_norm": 1.6677538876536442, + "language_loss": 0.71568084, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79317868, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16552734, + "step": 5534, + "time_per_iteration": 2.5140762329101562 + }, + { + "auxiliary_loss_clip": 0.06474573, + "auxiliary_loss_mlp": 0.01273002, + "balance_loss_clip": 0.06290095, + "balance_loss_mlp": 0.01257088, + "epoch": 0.3327822035172103, + "flos": 23484336151680.0, + "grad_norm": 1.635346823223845, + "language_loss": 0.67885029, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.75632608, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15917969, + "step": 5535, + "time_per_iteration": 2.522270917892456 + }, + { + "auxiliary_loss_clip": 0.0647034, + "auxiliary_loss_mlp": 0.01273438, + "balance_loss_clip": 0.06284929, + "balance_loss_mlp": 0.01257547, + "epoch": 0.33284232676987824, + "flos": 25381587824640.0, + "grad_norm": 2.3715726564419155, + "language_loss": 0.72782886, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80526668, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5536, + "time_per_iteration": 2.5831825733184814 + }, + { + "auxiliary_loss_clip": 0.06470598, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01255464, + "epoch": 0.3329024500225462, + "flos": 23700851902080.0, + "grad_norm": 1.6831469867631554, + "language_loss": 0.81958938, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89700401, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.15405273, + "step": 5537, + "time_per_iteration": 2.520211935043335 + }, + { + "auxiliary_loss_clip": 0.06473641, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06284811, + "balance_loss_mlp": 0.01253799, + "epoch": 0.33296257327521417, + "flos": 31731317429760.0, + "grad_norm": 1.8576028267218818, + "language_loss": 0.71933794, + "learning_rate": 3.111970130648789e-06, + "loss": 0.79677737, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16491699, + "step": 5538, + "time_per_iteration": 2.6061229705810547 + }, + { + "auxiliary_loss_clip": 0.06466128, + "auxiliary_loss_mlp": 0.01271828, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01256784, + "epoch": 0.33302269652788213, + "flos": 22750863436800.0, + "grad_norm": 1.8542539639588682, + "language_loss": 0.75063813, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82801771, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.15039062, + "step": 5539, + "time_per_iteration": 2.5176634788513184 + }, + { + "auxiliary_loss_clip": 0.06473792, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06284824, + "balance_loss_mlp": 0.01255739, + "epoch": 0.33308281978055015, + "flos": 11478546167040.0, + "grad_norm": 1.8040392528519402, + "language_loss": 0.71489209, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79235446, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16699219, + "step": 5540, + "time_per_iteration": 2.536752939224243 + }, + { + "auxiliary_loss_clip": 0.06462967, + "auxiliary_loss_mlp": 0.01271775, + "balance_loss_clip": 0.06280267, + "balance_loss_mlp": 0.01256576, + "epoch": 0.3331429430332181, + "flos": 38222274291840.0, + "grad_norm": 3.095851444688792, + "language_loss": 0.60970843, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.68705589, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15197754, + "step": 5541, + "time_per_iteration": 2.6592354774475098 + }, + { + "auxiliary_loss_clip": 0.06472225, + "auxiliary_loss_mlp": 0.01276024, + "balance_loss_clip": 0.06284402, + "balance_loss_mlp": 0.01259872, + "epoch": 0.3332030662858861, + "flos": 22535270081280.0, + "grad_norm": 1.770287690308821, + "language_loss": 0.69711685, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77459931, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16149902, + "step": 5542, + "time_per_iteration": 2.5427184104919434 + }, + { + "auxiliary_loss_clip": 0.06473213, + "auxiliary_loss_mlp": 0.01276881, + "balance_loss_clip": 0.06286451, + "balance_loss_mlp": 0.01261658, + "epoch": 0.33326318953855405, + "flos": 16003293845760.0, + "grad_norm": 1.6729265705607443, + "language_loss": 0.75927889, + "learning_rate": 3.110351016113414e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.15222168, + "step": 5543, + "time_per_iteration": 2.4745616912841797 + }, + { + "auxiliary_loss_clip": 0.06475509, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06287046, + "balance_loss_mlp": 0.01260281, + "epoch": 0.333323312791222, + "flos": 25600661124480.0, + "grad_norm": 1.7242995092969657, + "language_loss": 0.75332278, + "learning_rate": 3.110027066843348e-06, + "loss": 0.83084685, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.16601562, + "step": 5544, + "time_per_iteration": 2.565572738647461 + }, + { + "auxiliary_loss_clip": 0.06467521, + "auxiliary_loss_mlp": 0.01270286, + "balance_loss_clip": 0.06283619, + "balance_loss_mlp": 0.01254848, + "epoch": 0.33338343604389, + "flos": 25126652177280.0, + "grad_norm": 1.4364166263140996, + "language_loss": 0.71556139, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.1541748, + "step": 5545, + "time_per_iteration": 3.9951117038726807 + }, + { + "auxiliary_loss_clip": 0.0646642, + "auxiliary_loss_mlp": 0.01275763, + "balance_loss_clip": 0.0628425, + "balance_loss_mlp": 0.01260063, + "epoch": 0.33344355929655795, + "flos": 16953114602880.0, + "grad_norm": 1.5928525652704049, + "language_loss": 0.69892073, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.77634251, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15722656, + "step": 5546, + "time_per_iteration": 4.069552659988403 + }, + { + "auxiliary_loss_clip": 0.06469481, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 0.06280591, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3335036825492259, + "flos": 27896675178240.0, + "grad_norm": 1.5973320112543803, + "language_loss": 0.65030676, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.72773933, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.16455078, + "step": 5547, + "time_per_iteration": 2.578320026397705 + }, + { + "auxiliary_loss_clip": 0.06468174, + "auxiliary_loss_mlp": 0.01274769, + "balance_loss_clip": 0.06284153, + "balance_loss_mlp": 0.01258736, + "epoch": 0.3335638058018939, + "flos": 16184995424640.0, + "grad_norm": 1.9789366990729325, + "language_loss": 0.85645819, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.9338876, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.16040039, + "step": 5548, + "time_per_iteration": 3.917346477508545 + }, + { + "auxiliary_loss_clip": 0.06473708, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06283803, + "balance_loss_mlp": 0.01259264, + "epoch": 0.33362392905456184, + "flos": 39905651617920.0, + "grad_norm": 1.927393858225298, + "language_loss": 0.74956143, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.82705271, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.16149902, + "step": 5549, + "time_per_iteration": 2.662152051925659 + }, + { + "auxiliary_loss_clip": 0.0647629, + "auxiliary_loss_mlp": 0.01276829, + "balance_loss_clip": 0.06287523, + "balance_loss_mlp": 0.0125946, + "epoch": 0.3336840523072298, + "flos": 44280954339840.0, + "grad_norm": 3.284743863263659, + "language_loss": 0.68874133, + "learning_rate": 3.108082487713921e-06, + "loss": 0.76627254, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.17370605, + "step": 5550, + "time_per_iteration": 2.703099250793457 + }, + { + "auxiliary_loss_clip": 0.06476407, + "auxiliary_loss_mlp": 0.01275354, + "balance_loss_clip": 0.06290508, + "balance_loss_mlp": 0.01259488, + "epoch": 0.33374417555989777, + "flos": 15091054444800.0, + "grad_norm": 2.6465919002896436, + "language_loss": 0.60992151, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6874392, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5551, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.06471356, + "auxiliary_loss_mlp": 0.01275291, + "balance_loss_clip": 0.06287605, + "balance_loss_mlp": 0.01259985, + "epoch": 0.33380429881256574, + "flos": 15854226232320.0, + "grad_norm": 1.6170207033712265, + "language_loss": 0.71155131, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.78901786, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15307617, + "step": 5552, + "time_per_iteration": 4.0786826610565186 + }, + { + "auxiliary_loss_clip": 0.06476602, + "auxiliary_loss_mlp": 0.01270143, + "balance_loss_clip": 0.06291272, + "balance_loss_mlp": 0.01255182, + "epoch": 0.33386442206523376, + "flos": 13485439307520.0, + "grad_norm": 2.244029622012826, + "language_loss": 0.83864999, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91611743, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.1496582, + "step": 5553, + "time_per_iteration": 2.603986978530884 + }, + { + "auxiliary_loss_clip": 0.06474789, + "auxiliary_loss_mlp": 0.0127187, + "balance_loss_clip": 0.06288507, + "balance_loss_mlp": 0.01255562, + "epoch": 0.3339245453179017, + "flos": 16696250311680.0, + "grad_norm": 2.098616423404285, + "language_loss": 0.81424135, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89170802, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16320801, + "step": 5554, + "time_per_iteration": 2.4884121417999268 + }, + { + "auxiliary_loss_clip": 0.06477922, + "auxiliary_loss_mlp": 0.01277907, + "balance_loss_clip": 0.06288742, + "balance_loss_mlp": 0.01261647, + "epoch": 0.3339846685705697, + "flos": 24617954839680.0, + "grad_norm": 1.4369599322997015, + "language_loss": 0.81866252, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89622086, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.16259766, + "step": 5555, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.06478396, + "auxiliary_loss_mlp": 0.01271619, + "balance_loss_clip": 0.06292441, + "balance_loss_mlp": 0.01256325, + "epoch": 0.33404479182323765, + "flos": 30961311534720.0, + "grad_norm": 1.7387044564853729, + "language_loss": 0.74836755, + "learning_rate": 3.106136395915099e-06, + "loss": 0.82586771, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.1529541, + "step": 5556, + "time_per_iteration": 2.5936899185180664 + }, + { + "auxiliary_loss_clip": 0.06476042, + "auxiliary_loss_mlp": 0.01275785, + "balance_loss_clip": 0.06293188, + "balance_loss_mlp": 0.01260562, + "epoch": 0.3341049150759056, + "flos": 23519988864000.0, + "grad_norm": 1.3815052276914728, + "language_loss": 0.82545519, + "learning_rate": 3.105811900403391e-06, + "loss": 0.90297353, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15222168, + "step": 5557, + "time_per_iteration": 2.5862598419189453 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01279505, + "balance_loss_clip": 0.0629133, + "balance_loss_mlp": 0.01264067, + "epoch": 0.3341650383285736, + "flos": 24034052862720.0, + "grad_norm": 2.760917503655681, + "language_loss": 0.80188966, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.87946206, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.15429688, + "step": 5558, + "time_per_iteration": 2.596344232559204 + }, + { + "auxiliary_loss_clip": 0.06475051, + "auxiliary_loss_mlp": 0.01282797, + "balance_loss_clip": 0.06287208, + "balance_loss_mlp": 0.01267646, + "epoch": 0.33422516158124155, + "flos": 24909255959040.0, + "grad_norm": 1.7423955567809428, + "language_loss": 0.81954122, + "learning_rate": 3.105162783594788e-06, + "loss": 0.8971197, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1517334, + "step": 5559, + "time_per_iteration": 2.587005376815796 + }, + { + "auxiliary_loss_clip": 0.06467593, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 0.06286522, + "balance_loss_mlp": 0.01265224, + "epoch": 0.3342852848339095, + "flos": 18339404878080.0, + "grad_norm": 2.1220335034517093, + "language_loss": 0.72058392, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.79805756, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.14550781, + "step": 5560, + "time_per_iteration": 2.536546230316162 + }, + { + "auxiliary_loss_clip": 0.06481705, + "auxiliary_loss_mlp": 0.01285397, + "balance_loss_clip": 0.06292065, + "balance_loss_mlp": 0.01269458, + "epoch": 0.3343454080865775, + "flos": 30054690357120.0, + "grad_norm": 1.596178779859494, + "language_loss": 0.75386882, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.83153981, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.15930176, + "step": 5561, + "time_per_iteration": 2.672700881958008 + }, + { + "auxiliary_loss_clip": 0.06477022, + "auxiliary_loss_mlp": 0.01277798, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01262551, + "epoch": 0.33440553133924544, + "flos": 16404362213760.0, + "grad_norm": 1.6462526862455489, + "language_loss": 0.70108986, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77863806, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15246582, + "step": 5562, + "time_per_iteration": 2.501317024230957 + }, + { + "auxiliary_loss_clip": 0.06472157, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 0.06287345, + "balance_loss_mlp": 0.01265396, + "epoch": 0.3344656545919134, + "flos": 24248723823360.0, + "grad_norm": 1.5361546803562123, + "language_loss": 0.65648419, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.7340101, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15026855, + "step": 5563, + "time_per_iteration": 2.5564165115356445 + }, + { + "auxiliary_loss_clip": 0.06477885, + "auxiliary_loss_mlp": 0.01282181, + "balance_loss_clip": 0.06290222, + "balance_loss_mlp": 0.01264752, + "epoch": 0.3345257778445814, + "flos": 52130431048320.0, + "grad_norm": 1.3531042812140452, + "language_loss": 0.74246049, + "learning_rate": 3.103539258400766e-06, + "loss": 0.82006115, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.17431641, + "step": 5564, + "time_per_iteration": 2.810534715652466 + }, + { + "auxiliary_loss_clip": 0.06356741, + "auxiliary_loss_mlp": 0.01295627, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01291562, + "epoch": 0.33458590109724934, + "flos": 68066528319360.0, + "grad_norm": 0.78222915395806, + "language_loss": 0.55275309, + "learning_rate": 3.103214427773745e-06, + "loss": 0.62927675, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04064941, + "step": 5565, + "time_per_iteration": 3.1279821395874023 + }, + { + "auxiliary_loss_clip": 0.06471252, + "auxiliary_loss_mlp": 0.01279791, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01264163, + "epoch": 0.3346460243499173, + "flos": 37423869062400.0, + "grad_norm": 1.705115292174207, + "language_loss": 0.65565574, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73316622, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15625, + "step": 5566, + "time_per_iteration": 2.712435245513916 + }, + { + "auxiliary_loss_clip": 0.0647177, + "auxiliary_loss_mlp": 0.01282122, + "balance_loss_clip": 0.06289912, + "balance_loss_mlp": 0.01266529, + "epoch": 0.3347061476025853, + "flos": 18703269233280.0, + "grad_norm": 1.6655571733561654, + "language_loss": 0.77372861, + "learning_rate": 3.102564641030016e-06, + "loss": 0.85126758, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.15588379, + "step": 5567, + "time_per_iteration": 2.4871251583099365 + }, + { + "auxiliary_loss_clip": 0.06471208, + "auxiliary_loss_mlp": 0.01275703, + "balance_loss_clip": 0.06285998, + "balance_loss_mlp": 0.01259491, + "epoch": 0.3347662708552533, + "flos": 13922957001600.0, + "grad_norm": 1.6558873666299474, + "language_loss": 0.77099127, + "learning_rate": 3.102239684937949e-06, + "loss": 0.84846038, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16223145, + "step": 5568, + "time_per_iteration": 2.5343427658081055 + }, + { + "auxiliary_loss_clip": 0.06472506, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06286565, + "balance_loss_mlp": 0.01262136, + "epoch": 0.33482639410792125, + "flos": 19755645788160.0, + "grad_norm": 1.9310298365294178, + "language_loss": 0.71334505, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7908479, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15643311, + "step": 5569, + "time_per_iteration": 2.5091118812561035 + }, + { + "auxiliary_loss_clip": 0.06473939, + "auxiliary_loss_mlp": 0.01271857, + "balance_loss_clip": 0.06285448, + "balance_loss_mlp": 0.01256479, + "epoch": 0.3348865173605892, + "flos": 16107820214400.0, + "grad_norm": 1.931700529164995, + "language_loss": 0.90211284, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97957081, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15380859, + "step": 5570, + "time_per_iteration": 2.6067447662353516 + }, + { + "auxiliary_loss_clip": 0.06465288, + "auxiliary_loss_mlp": 0.01272678, + "balance_loss_clip": 0.06282274, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3349466406132572, + "flos": 25015836752640.0, + "grad_norm": 1.5216158426421846, + "language_loss": 0.79890078, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87628049, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.15405273, + "step": 5571, + "time_per_iteration": 2.5423781871795654 + }, + { + "auxiliary_loss_clip": 0.06342317, + "auxiliary_loss_mlp": 0.01254883, + "balance_loss_clip": 0.06257176, + "balance_loss_mlp": 0.01251411, + "epoch": 0.33500676386592515, + "flos": 54340058413440.0, + "grad_norm": 0.8278358272998855, + "language_loss": 0.55695772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.63292974, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.03482056, + "step": 5572, + "time_per_iteration": 3.1027615070343018 + }, + { + "auxiliary_loss_clip": 0.06472763, + "auxiliary_loss_mlp": 0.0127696, + "balance_loss_clip": 0.06287524, + "balance_loss_mlp": 0.01261677, + "epoch": 0.3350668871185931, + "flos": 26804620915200.0, + "grad_norm": 1.9863197052332227, + "language_loss": 0.78856999, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.86606717, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15283203, + "step": 5573, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.06473139, + "auxiliary_loss_mlp": 0.01274748, + "balance_loss_clip": 0.06286675, + "balance_loss_mlp": 0.01257999, + "epoch": 0.3351270103712611, + "flos": 33518885708160.0, + "grad_norm": 2.2174625445936256, + "language_loss": 0.72959399, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.80707288, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16723633, + "step": 5574, + "time_per_iteration": 2.660301923751831 + }, + { + "auxiliary_loss_clip": 0.06465638, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284496, + "balance_loss_mlp": 0.01256042, + "epoch": 0.33518713362392905, + "flos": 26513613285120.0, + "grad_norm": 1.6818935039401424, + "language_loss": 0.88364851, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.96102208, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15661621, + "step": 5575, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.0648465, + "auxiliary_loss_mlp": 0.01276363, + "balance_loss_clip": 0.06290504, + "balance_loss_mlp": 0.01259316, + "epoch": 0.335247256876597, + "flos": 17237078490240.0, + "grad_norm": 1.9893319880263207, + "language_loss": 0.83043218, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.90804225, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17053223, + "step": 5576, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.06478332, + "auxiliary_loss_mlp": 0.01275534, + "balance_loss_clip": 0.06288211, + "balance_loss_mlp": 0.01259095, + "epoch": 0.335307380129265, + "flos": 25636397690880.0, + "grad_norm": 2.0001339744496622, + "language_loss": 0.73279572, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.16442871, + "step": 5577, + "time_per_iteration": 2.575026750564575 + }, + { + "auxiliary_loss_clip": 0.06475031, + "auxiliary_loss_mlp": 0.01274987, + "balance_loss_clip": 0.0628825, + "balance_loss_mlp": 0.01257689, + "epoch": 0.33536750338193294, + "flos": 19685765882880.0, + "grad_norm": 1.6019428598408136, + "language_loss": 0.82233781, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.89983797, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.17297363, + "step": 5578, + "time_per_iteration": 2.544978380203247 + }, + { + "auxiliary_loss_clip": 0.06461956, + "auxiliary_loss_mlp": 0.01278493, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01262907, + "epoch": 0.3354276266346009, + "flos": 18338482483200.0, + "grad_norm": 1.788420802177993, + "language_loss": 0.72050315, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.79790771, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15582275, + "step": 5579, + "time_per_iteration": 2.50080943107605 + }, + { + "auxiliary_loss_clip": 0.06478497, + "auxiliary_loss_mlp": 0.01282646, + "balance_loss_clip": 0.06290549, + "balance_loss_mlp": 0.01266898, + "epoch": 0.3354877498872689, + "flos": 17864389681920.0, + "grad_norm": 2.052679713623706, + "language_loss": 0.81401342, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.89162487, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15734863, + "step": 5580, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.06473458, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 0.06284851, + "balance_loss_mlp": 0.01263691, + "epoch": 0.3355478731399369, + "flos": 24724703341440.0, + "grad_norm": 1.6024353673136869, + "language_loss": 0.78190315, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.85943961, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.16485596, + "step": 5581, + "time_per_iteration": 2.539208173751831 + }, + { + "auxiliary_loss_clip": 0.06482114, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06289735, + "balance_loss_mlp": 0.01259084, + "epoch": 0.33560799639260486, + "flos": 16879628972160.0, + "grad_norm": 2.359779356701633, + "language_loss": 0.74923486, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.8268224, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.17565918, + "step": 5582, + "time_per_iteration": 2.5489563941955566 + }, + { + "auxiliary_loss_clip": 0.06478906, + "auxiliary_loss_mlp": 0.01276582, + "balance_loss_clip": 0.06287926, + "balance_loss_mlp": 0.01260191, + "epoch": 0.3356681196452728, + "flos": 18339530659200.0, + "grad_norm": 1.5985505462491367, + "language_loss": 0.82591236, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90346718, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.16369629, + "step": 5583, + "time_per_iteration": 2.4985439777374268 + }, + { + "auxiliary_loss_clip": 0.06466989, + "auxiliary_loss_mlp": 0.01276424, + "balance_loss_clip": 0.06282677, + "balance_loss_mlp": 0.01260664, + "epoch": 0.3357282428979408, + "flos": 34759127116800.0, + "grad_norm": 1.8261350586664176, + "language_loss": 0.77844834, + "learning_rate": 3.097034711451581e-06, + "loss": 0.85588253, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15771484, + "step": 5584, + "time_per_iteration": 2.649090051651001 + }, + { + "auxiliary_loss_clip": 0.06475179, + "auxiliary_loss_mlp": 0.01274752, + "balance_loss_clip": 0.06285385, + "balance_loss_mlp": 0.01259427, + "epoch": 0.33578836615060875, + "flos": 21586539427200.0, + "grad_norm": 1.6814695059799305, + "language_loss": 0.76339197, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84089124, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.15313721, + "step": 5585, + "time_per_iteration": 5.408076763153076 + }, + { + "auxiliary_loss_clip": 0.06463687, + "auxiliary_loss_mlp": 0.01277288, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.0126054, + "epoch": 0.3358484894032767, + "flos": 24536377290240.0, + "grad_norm": 1.7085225722674646, + "language_loss": 0.78121984, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.85862964, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16760254, + "step": 5586, + "time_per_iteration": 2.5785536766052246 + }, + { + "auxiliary_loss_clip": 0.06482486, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06290784, + "balance_loss_mlp": 0.01254902, + "epoch": 0.3359086126559447, + "flos": 22462161793920.0, + "grad_norm": 1.9607494340110725, + "language_loss": 0.81952178, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.89705908, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.16357422, + "step": 5587, + "time_per_iteration": 3.9456732273101807 + }, + { + "auxiliary_loss_clip": 0.06460288, + "auxiliary_loss_mlp": 0.01274939, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01259692, + "epoch": 0.33596873590861265, + "flos": 16549069415040.0, + "grad_norm": 1.7386991231776667, + "language_loss": 0.67118108, + "learning_rate": 3.095731802118677e-06, + "loss": 0.74853337, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 5588, + "time_per_iteration": 2.6328773498535156 + }, + { + "auxiliary_loss_clip": 0.06471635, + "auxiliary_loss_mlp": 0.01272286, + "balance_loss_clip": 0.0628484, + "balance_loss_mlp": 0.01255215, + "epoch": 0.3360288591612806, + "flos": 31183864778880.0, + "grad_norm": 2.547244730124186, + "language_loss": 0.70319438, + "learning_rate": 3.095405970878919e-06, + "loss": 0.78063357, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17077637, + "step": 5589, + "time_per_iteration": 2.631972074508667 + }, + { + "auxiliary_loss_clip": 0.06473772, + "auxiliary_loss_mlp": 0.01270331, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01255096, + "epoch": 0.3360889824139486, + "flos": 23703828721920.0, + "grad_norm": 1.7722032929069027, + "language_loss": 0.67818141, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.75562239, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15258789, + "step": 5590, + "time_per_iteration": 2.582160711288452 + }, + { + "auxiliary_loss_clip": 0.0646477, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 0.06283349, + "balance_loss_mlp": 0.01257972, + "epoch": 0.33614910566661654, + "flos": 19324207514880.0, + "grad_norm": 1.8733623292805037, + "language_loss": 0.73821473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.81559563, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15344238, + "step": 5591, + "time_per_iteration": 2.5325355529785156 + }, + { + "auxiliary_loss_clip": 0.06462986, + "auxiliary_loss_mlp": 0.01270586, + "balance_loss_clip": 0.06280106, + "balance_loss_mlp": 0.01254945, + "epoch": 0.3362092289192845, + "flos": 16477889771520.0, + "grad_norm": 3.0838875929044036, + "language_loss": 0.70195794, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.77929366, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.15637207, + "step": 5592, + "time_per_iteration": 3.919609546661377 + }, + { + "auxiliary_loss_clip": 0.06466913, + "auxiliary_loss_mlp": 0.01271712, + "balance_loss_clip": 0.06283789, + "balance_loss_mlp": 0.01257014, + "epoch": 0.33626935217195253, + "flos": 24250484759040.0, + "grad_norm": 2.017741256836838, + "language_loss": 0.76621854, + "learning_rate": 3.094102230664423e-06, + "loss": 0.8436048, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14697266, + "step": 5593, + "time_per_iteration": 2.582902431488037 + }, + { + "auxiliary_loss_clip": 0.06476289, + "auxiliary_loss_mlp": 0.01272909, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3363294754246205, + "flos": 19724814685440.0, + "grad_norm": 3.212319882003512, + "language_loss": 0.72710228, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80459422, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.17456055, + "step": 5594, + "time_per_iteration": 2.495196580886841 + }, + { + "auxiliary_loss_clip": 0.06477273, + "auxiliary_loss_mlp": 0.01272377, + "balance_loss_clip": 0.06289684, + "balance_loss_mlp": 0.01256379, + "epoch": 0.33638959867728846, + "flos": 22602005458560.0, + "grad_norm": 1.7565144487218112, + "language_loss": 0.8009572, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.87845373, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16003418, + "step": 5595, + "time_per_iteration": 2.5639891624450684 + }, + { + "auxiliary_loss_clip": 0.06468762, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06285411, + "balance_loss_mlp": 0.01256691, + "epoch": 0.3364497219299564, + "flos": 21000834587520.0, + "grad_norm": 1.6187307873664143, + "language_loss": 0.81718135, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.89458185, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.14587402, + "step": 5596, + "time_per_iteration": 2.579089403152466 + }, + { + "auxiliary_loss_clip": 0.06470582, + "auxiliary_loss_mlp": 0.01270351, + "balance_loss_clip": 0.06285384, + "balance_loss_mlp": 0.01256034, + "epoch": 0.3365098451826244, + "flos": 25235664739200.0, + "grad_norm": 1.5539796133352632, + "language_loss": 0.76225436, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.83966368, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.14318848, + "step": 5597, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.06473622, + "auxiliary_loss_mlp": 0.01271725, + "balance_loss_clip": 0.06290761, + "balance_loss_mlp": 0.01257206, + "epoch": 0.33656996843529235, + "flos": 24578612547840.0, + "grad_norm": 1.67554812607641, + "language_loss": 0.78886169, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86631513, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14520264, + "step": 5598, + "time_per_iteration": 2.54971981048584 + }, + { + "auxiliary_loss_clip": 0.06487022, + "auxiliary_loss_mlp": 0.01275679, + "balance_loss_clip": 0.0629402, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3366300916879603, + "flos": 44101223331840.0, + "grad_norm": 1.966389459711274, + "language_loss": 0.64792764, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.7255547, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.16326904, + "step": 5599, + "time_per_iteration": 2.741544723510742 + }, + { + "auxiliary_loss_clip": 0.06483869, + "auxiliary_loss_mlp": 0.01276046, + "balance_loss_clip": 0.06290758, + "balance_loss_mlp": 0.01259118, + "epoch": 0.3366902149406283, + "flos": 13884746739840.0, + "grad_norm": 2.857086104177812, + "language_loss": 0.82787466, + "learning_rate": 3.091819088459249e-06, + "loss": 0.90547383, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.16906738, + "step": 5600, + "time_per_iteration": 2.4761526584625244 + }, + { + "auxiliary_loss_clip": 0.06480727, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289887, + "balance_loss_mlp": 0.01257255, + "epoch": 0.33675033819329625, + "flos": 16258648763520.0, + "grad_norm": 2.1921833677853853, + "language_loss": 0.83268821, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.91022456, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.15649414, + "step": 5601, + "time_per_iteration": 2.5205788612365723 + }, + { + "auxiliary_loss_clip": 0.06469133, + "auxiliary_loss_mlp": 0.01269312, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01255382, + "epoch": 0.3368104614459642, + "flos": 17061498259200.0, + "grad_norm": 1.6270640398275205, + "language_loss": 0.83791035, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91529477, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.1394043, + "step": 5602, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.06479525, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06294133, + "balance_loss_mlp": 0.01258645, + "epoch": 0.3368705846986322, + "flos": 17864473536000.0, + "grad_norm": 2.666791314538914, + "language_loss": 0.69934028, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77687562, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.15380859, + "step": 5603, + "time_per_iteration": 2.5512561798095703 + }, + { + "auxiliary_loss_clip": 0.0648806, + "auxiliary_loss_mlp": 0.01271029, + "balance_loss_clip": 0.06298037, + "balance_loss_mlp": 0.01255376, + "epoch": 0.33693070795130015, + "flos": 22936086887040.0, + "grad_norm": 1.5393691582180518, + "language_loss": 0.83336604, + "learning_rate": 3.090513524656898e-06, + "loss": 0.91095686, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.15661621, + "step": 5604, + "time_per_iteration": 2.542419910430908 + }, + { + "auxiliary_loss_clip": 0.06487563, + "auxiliary_loss_mlp": 0.01271201, + "balance_loss_clip": 0.06296179, + "balance_loss_mlp": 0.01255, + "epoch": 0.3369908312039681, + "flos": 22023889413120.0, + "grad_norm": 1.7290560496085086, + "language_loss": 0.74166059, + "learning_rate": 3.090187030294409e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.1619873, + "step": 5605, + "time_per_iteration": 2.551250696182251 + }, + { + "auxiliary_loss_clip": 0.0648852, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 0.06295876, + "balance_loss_mlp": 0.01253347, + "epoch": 0.33705095445663613, + "flos": 11806799736960.0, + "grad_norm": 2.683910051705504, + "language_loss": 0.84068418, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91825807, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.15515137, + "step": 5606, + "time_per_iteration": 2.4841489791870117 + }, + { + "auxiliary_loss_clip": 0.0647673, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.06290583, + "balance_loss_mlp": 0.01254721, + "epoch": 0.3371110777093041, + "flos": 25053460035840.0, + "grad_norm": 1.669780314791874, + "language_loss": 0.68210214, + "learning_rate": 3.089533917561809e-06, + "loss": 0.7595638, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.14709473, + "step": 5607, + "time_per_iteration": 2.6018009185791016 + }, + { + "auxiliary_loss_clip": 0.0648887, + "auxiliary_loss_mlp": 0.01274582, + "balance_loss_clip": 0.06295381, + "balance_loss_mlp": 0.01258131, + "epoch": 0.33717120096197206, + "flos": 26586386156160.0, + "grad_norm": 1.643709475435958, + "language_loss": 0.71566343, + "learning_rate": 3.089207299216464e-06, + "loss": 0.79329789, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5608, + "time_per_iteration": 2.5980639457702637 + }, + { + "auxiliary_loss_clip": 0.06479236, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 0.06291037, + "balance_loss_mlp": 0.01258712, + "epoch": 0.33723132421464, + "flos": 15163911169920.0, + "grad_norm": 1.8781248289320855, + "language_loss": 0.79662472, + "learning_rate": 3.088880639568269e-06, + "loss": 0.87416643, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.16223145, + "step": 5609, + "time_per_iteration": 2.6196935176849365 + }, + { + "auxiliary_loss_clip": 0.06480544, + "auxiliary_loss_mlp": 0.01274048, + "balance_loss_clip": 0.06290779, + "balance_loss_mlp": 0.01256262, + "epoch": 0.337291447467308, + "flos": 23442058967040.0, + "grad_norm": 1.7293742366408622, + "language_loss": 0.83075953, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.90830547, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17785645, + "step": 5610, + "time_per_iteration": 2.53485369682312 + }, + { + "auxiliary_loss_clip": 0.06471263, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06288794, + "balance_loss_mlp": 0.01254097, + "epoch": 0.33735157071997596, + "flos": 17243870670720.0, + "grad_norm": 1.916021570377688, + "language_loss": 0.82657987, + "learning_rate": 3.088227196412879e-06, + "loss": 0.90398765, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1541748, + "step": 5611, + "time_per_iteration": 2.5164084434509277 + }, + { + "auxiliary_loss_clip": 0.06478009, + "auxiliary_loss_mlp": 0.01278112, + "balance_loss_clip": 0.0629037, + "balance_loss_mlp": 0.01260005, + "epoch": 0.3374116939726439, + "flos": 28265025726720.0, + "grad_norm": 3.0042840390827106, + "language_loss": 0.79815799, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.87571925, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.18084717, + "step": 5612, + "time_per_iteration": 2.582742929458618 + }, + { + "auxiliary_loss_clip": 0.06476334, + "auxiliary_loss_mlp": 0.0127707, + "balance_loss_clip": 0.06288031, + "balance_loss_mlp": 0.01261597, + "epoch": 0.3374718172253119, + "flos": 35928314663040.0, + "grad_norm": 2.3711016444568003, + "language_loss": 0.69757682, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7751109, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.15466309, + "step": 5613, + "time_per_iteration": 2.6553308963775635 + }, + { + "auxiliary_loss_clip": 0.06477948, + "auxiliary_loss_mlp": 0.01274833, + "balance_loss_clip": 0.06288674, + "balance_loss_mlp": 0.01259181, + "epoch": 0.33753194047797985, + "flos": 18192517470720.0, + "grad_norm": 1.7341744507496721, + "language_loss": 0.80043244, + "learning_rate": 3.087246722218144e-06, + "loss": 0.87796032, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.15673828, + "step": 5614, + "time_per_iteration": 2.5162055492401123 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01274123, + "balance_loss_clip": 0.06289384, + "balance_loss_mlp": 0.01257684, + "epoch": 0.3375920637306478, + "flos": 23155621384320.0, + "grad_norm": 1.8737965791301845, + "language_loss": 0.91138643, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98892087, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16430664, + "step": 5615, + "time_per_iteration": 2.5491819381713867 + }, + { + "auxiliary_loss_clip": 0.0646698, + "auxiliary_loss_mlp": 0.01277747, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01261857, + "epoch": 0.3376521869833158, + "flos": 23118878568960.0, + "grad_norm": 1.8899714235087088, + "language_loss": 0.81227732, + "learning_rate": 3.086592866591809e-06, + "loss": 0.88972461, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.15881348, + "step": 5616, + "time_per_iteration": 2.551891803741455 + }, + { + "auxiliary_loss_clip": 0.0647929, + "auxiliary_loss_mlp": 0.01281624, + "balance_loss_clip": 0.06285349, + "balance_loss_mlp": 0.01263576, + "epoch": 0.33771231023598375, + "flos": 19279498561920.0, + "grad_norm": 1.7280186066143421, + "language_loss": 0.84097004, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91857922, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.18054199, + "step": 5617, + "time_per_iteration": 2.532703161239624 + }, + { + "auxiliary_loss_clip": 0.06466082, + "auxiliary_loss_mlp": 0.01273548, + "balance_loss_clip": 0.06279126, + "balance_loss_mlp": 0.01257073, + "epoch": 0.3377724334886517, + "flos": 18156026217600.0, + "grad_norm": 1.631465963150073, + "language_loss": 0.80857313, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8859694, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.16467285, + "step": 5618, + "time_per_iteration": 2.5592081546783447 + }, + { + "auxiliary_loss_clip": 0.06473768, + "auxiliary_loss_mlp": 0.01275311, + "balance_loss_clip": 0.06286047, + "balance_loss_mlp": 0.01258514, + "epoch": 0.3378325567413197, + "flos": 25783159317120.0, + "grad_norm": 2.0305417192076267, + "language_loss": 0.71181929, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7893101, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.16809082, + "step": 5619, + "time_per_iteration": 2.5726358890533447 + }, + { + "auxiliary_loss_clip": 0.06476114, + "auxiliary_loss_mlp": 0.01271613, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01256688, + "epoch": 0.3378926799939877, + "flos": 21322254049920.0, + "grad_norm": 2.6280659122339496, + "language_loss": 0.70615005, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78362733, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.14929199, + "step": 5620, + "time_per_iteration": 2.604161500930786 + }, + { + "auxiliary_loss_clip": 0.06467394, + "auxiliary_loss_mlp": 0.0127348, + "balance_loss_clip": 0.0628472, + "balance_loss_mlp": 0.01258054, + "epoch": 0.33795280324665566, + "flos": 24906991898880.0, + "grad_norm": 2.3940060195146384, + "language_loss": 0.6847257, + "learning_rate": 3.084957506678058e-06, + "loss": 0.76213443, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.1541748, + "step": 5621, + "time_per_iteration": 2.559730052947998 + }, + { + "auxiliary_loss_clip": 0.06469798, + "auxiliary_loss_mlp": 0.01273981, + "balance_loss_clip": 0.06287812, + "balance_loss_mlp": 0.0125914, + "epoch": 0.33801292649932363, + "flos": 24760859178240.0, + "grad_norm": 1.8671152624425502, + "language_loss": 0.82685888, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.90429658, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.1484375, + "step": 5622, + "time_per_iteration": 2.5722928047180176 + }, + { + "auxiliary_loss_clip": 0.06466316, + "auxiliary_loss_mlp": 0.01274625, + "balance_loss_clip": 0.06284748, + "balance_loss_mlp": 0.01260564, + "epoch": 0.3380730497519916, + "flos": 26731177211520.0, + "grad_norm": 1.4865849557607265, + "language_loss": 0.74114043, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.81854987, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14056396, + "step": 5623, + "time_per_iteration": 2.5830907821655273 + }, + { + "auxiliary_loss_clip": 0.06389539, + "auxiliary_loss_mlp": 0.01273334, + "balance_loss_clip": 0.06299451, + "balance_loss_mlp": 0.01265943, + "epoch": 0.33813317300465956, + "flos": 70056845550720.0, + "grad_norm": 0.7132848624035326, + "language_loss": 0.54856884, + "learning_rate": 3.083975796930215e-06, + "loss": 0.62519753, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.07373047, + "step": 5624, + "time_per_iteration": 4.680114030838013 + }, + { + "auxiliary_loss_clip": 0.06475174, + "auxiliary_loss_mlp": 0.01272775, + "balance_loss_clip": 0.06285602, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3381932962573275, + "flos": 24104142403200.0, + "grad_norm": 3.6042241236842267, + "language_loss": 0.73496938, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81244886, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.16687012, + "step": 5625, + "time_per_iteration": 4.002846956253052 + }, + { + "auxiliary_loss_clip": 0.06480759, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06288841, + "balance_loss_mlp": 0.01257021, + "epoch": 0.3382534195099955, + "flos": 19283775120000.0, + "grad_norm": 1.9831743515273117, + "language_loss": 0.7176404, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.79519677, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.17858887, + "step": 5626, + "time_per_iteration": 2.4999427795410156 + }, + { + "auxiliary_loss_clip": 0.06468458, + "auxiliary_loss_mlp": 0.01272986, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01257739, + "epoch": 0.33831354276266346, + "flos": 25232897554560.0, + "grad_norm": 2.987617225478933, + "language_loss": 0.81275499, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.8901695, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.15246582, + "step": 5627, + "time_per_iteration": 3.951984405517578 + }, + { + "auxiliary_loss_clip": 0.06478465, + "auxiliary_loss_mlp": 0.01272976, + "balance_loss_clip": 0.06288861, + "balance_loss_mlp": 0.0125668, + "epoch": 0.3383736660153314, + "flos": 23118627006720.0, + "grad_norm": 1.844905449272807, + "language_loss": 0.80405974, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.88157415, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16296387, + "step": 5628, + "time_per_iteration": 2.5670697689056396 + }, + { + "auxiliary_loss_clip": 0.06477988, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06288996, + "balance_loss_mlp": 0.0125457, + "epoch": 0.3384337892679994, + "flos": 23483874954240.0, + "grad_norm": 2.662319374226008, + "language_loss": 0.77757806, + "learning_rate": 3.082338792093254e-06, + "loss": 0.85506529, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.16174316, + "step": 5629, + "time_per_iteration": 2.5463128089904785 + }, + { + "auxiliary_loss_clip": 0.06482605, + "auxiliary_loss_mlp": 0.01280413, + "balance_loss_clip": 0.06291752, + "balance_loss_mlp": 0.01262758, + "epoch": 0.33849391252066735, + "flos": 19431626849280.0, + "grad_norm": 1.826421419331283, + "language_loss": 0.85789764, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.9355278, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.17663574, + "step": 5630, + "time_per_iteration": 2.5818262100219727 + }, + { + "auxiliary_loss_clip": 0.06476109, + "auxiliary_loss_mlp": 0.01274878, + "balance_loss_clip": 0.06290477, + "balance_loss_mlp": 0.01260073, + "epoch": 0.3385540357733353, + "flos": 21070462930560.0, + "grad_norm": 2.179516256809373, + "language_loss": 0.72520673, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80271661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.14807129, + "step": 5631, + "time_per_iteration": 3.9340498447418213 + }, + { + "auxiliary_loss_clip": 0.06388511, + "auxiliary_loss_mlp": 0.01280567, + "balance_loss_clip": 0.06298131, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3386141590260033, + "flos": 69224772908160.0, + "grad_norm": 0.8339652565495183, + "language_loss": 0.56105018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.63774097, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.08361816, + "step": 5632, + "time_per_iteration": 3.215395450592041 + }, + { + "auxiliary_loss_clip": 0.06477562, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 0.06290288, + "balance_loss_mlp": 0.01256573, + "epoch": 0.3386742822786713, + "flos": 25526420807040.0, + "grad_norm": 3.459768837753136, + "language_loss": 0.81030583, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.88779831, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15112305, + "step": 5633, + "time_per_iteration": 2.6278936862945557 + }, + { + "auxiliary_loss_clip": 0.06473435, + "auxiliary_loss_mlp": 0.01274796, + "balance_loss_clip": 0.06287597, + "balance_loss_mlp": 0.01258942, + "epoch": 0.33873440553133927, + "flos": 23629881893760.0, + "grad_norm": 2.634738846372382, + "language_loss": 0.59410667, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.67158902, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15856934, + "step": 5634, + "time_per_iteration": 2.565622091293335 + }, + { + "auxiliary_loss_clip": 0.06475686, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.01255216, + "epoch": 0.33879452878400723, + "flos": 17094006443520.0, + "grad_norm": 1.81394172090833, + "language_loss": 0.92877531, + "learning_rate": 3.080373032026589e-06, + "loss": 1.00623596, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.15161133, + "step": 5635, + "time_per_iteration": 2.539051055908203 + }, + { + "auxiliary_loss_clip": 0.06470082, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 0.0629005, + "balance_loss_mlp": 0.01257457, + "epoch": 0.3388546520366752, + "flos": 15747477730560.0, + "grad_norm": 1.8703432540182672, + "language_loss": 0.75823128, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.83566296, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15618896, + "step": 5636, + "time_per_iteration": 2.4998726844787598 + }, + { + "auxiliary_loss_clip": 0.064714, + "auxiliary_loss_mlp": 0.01275037, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.01258848, + "epoch": 0.33891477528934316, + "flos": 22425251270400.0, + "grad_norm": 1.6981405891584176, + "language_loss": 0.83775222, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91521657, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1619873, + "step": 5637, + "time_per_iteration": 2.551074981689453 + }, + { + "auxiliary_loss_clip": 0.06474115, + "auxiliary_loss_mlp": 0.01272331, + "balance_loss_clip": 0.06286962, + "balance_loss_mlp": 0.01254736, + "epoch": 0.3389748985420111, + "flos": 17280571559040.0, + "grad_norm": 1.787045955061502, + "language_loss": 0.70609659, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78356105, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.17590332, + "step": 5638, + "time_per_iteration": 2.5479955673217773 + }, + { + "auxiliary_loss_clip": 0.06478329, + "auxiliary_loss_mlp": 0.01289332, + "balance_loss_clip": 0.06293231, + "balance_loss_mlp": 0.01272404, + "epoch": 0.3390350217946791, + "flos": 27752261466240.0, + "grad_norm": 1.7018866339003167, + "language_loss": 0.81276166, + "learning_rate": 3.079061705792765e-06, + "loss": 0.89043832, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16931152, + "step": 5639, + "time_per_iteration": 2.614819288253784 + }, + { + "auxiliary_loss_clip": 0.06487049, + "auxiliary_loss_mlp": 0.01288743, + "balance_loss_clip": 0.06296147, + "balance_loss_mlp": 0.01270635, + "epoch": 0.33909514504734706, + "flos": 20346088383360.0, + "grad_norm": 6.449374256721531, + "language_loss": 0.68149316, + "learning_rate": 3.078733771907907e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.18103027, + "step": 5640, + "time_per_iteration": 2.496300220489502 + }, + { + "auxiliary_loss_clip": 0.06471096, + "auxiliary_loss_mlp": 0.01277542, + "balance_loss_clip": 0.06286727, + "balance_loss_mlp": 0.0125978, + "epoch": 0.339155268300015, + "flos": 14835322183680.0, + "grad_norm": 1.7549267997867504, + "language_loss": 0.70165765, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77914405, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.1776123, + "step": 5641, + "time_per_iteration": 2.524548053741455 + }, + { + "auxiliary_loss_clip": 0.0647646, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01264611, + "epoch": 0.339215391552683, + "flos": 26075173196160.0, + "grad_norm": 2.2643311920206592, + "language_loss": 0.88204467, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95961982, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16430664, + "step": 5642, + "time_per_iteration": 2.551790237426758 + }, + { + "auxiliary_loss_clip": 0.06466684, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 0.06289211, + "balance_loss_mlp": 0.01258195, + "epoch": 0.33927551480535095, + "flos": 14579967265920.0, + "grad_norm": 2.023061860440481, + "language_loss": 0.84285331, + "learning_rate": 3.077749724868924e-06, + "loss": 0.92024505, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1428833, + "step": 5643, + "time_per_iteration": 2.542921304702759 + }, + { + "auxiliary_loss_clip": 0.06468654, + "auxiliary_loss_mlp": 0.01272873, + "balance_loss_clip": 0.06285787, + "balance_loss_mlp": 0.01256708, + "epoch": 0.3393356380580189, + "flos": 23812380086400.0, + "grad_norm": 6.736940029896959, + "language_loss": 0.77634799, + "learning_rate": 3.077421627435922e-06, + "loss": 0.85376322, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.16162109, + "step": 5644, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.06472027, + "auxiliary_loss_mlp": 0.01274584, + "balance_loss_clip": 0.06288091, + "balance_loss_mlp": 0.0125873, + "epoch": 0.3393957613106869, + "flos": 17353637919360.0, + "grad_norm": 2.9654561398927752, + "language_loss": 0.6324017, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.70986784, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15856934, + "step": 5645, + "time_per_iteration": 2.51273775100708 + }, + { + "auxiliary_loss_clip": 0.06466414, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06285059, + "balance_loss_mlp": 0.01256284, + "epoch": 0.3394558845633549, + "flos": 28440647884800.0, + "grad_norm": 2.089100449350665, + "language_loss": 0.77295536, + "learning_rate": 3.076765310014552e-06, + "loss": 0.8503449, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16247559, + "step": 5646, + "time_per_iteration": 2.5461859703063965 + }, + { + "auxiliary_loss_clip": 0.06477356, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01257568, + "epoch": 0.33951600781602287, + "flos": 22092804996480.0, + "grad_norm": 2.533529984962848, + "language_loss": 0.79702288, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87454283, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.17077637, + "step": 5647, + "time_per_iteration": 2.5699684619903564 + }, + { + "auxiliary_loss_clip": 0.0647471, + "auxiliary_loss_mlp": 0.01272685, + "balance_loss_clip": 0.06288452, + "balance_loss_mlp": 0.01256067, + "epoch": 0.33957613106869083, + "flos": 23885027176320.0, + "grad_norm": 2.1454269075726535, + "language_loss": 0.78001738, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.85749137, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.16625977, + "step": 5648, + "time_per_iteration": 2.5294926166534424 + }, + { + "auxiliary_loss_clip": 0.063921, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0630298, + "balance_loss_mlp": 0.01254759, + "epoch": 0.3396362543213588, + "flos": 71264411066880.0, + "grad_norm": 0.7604552176896413, + "language_loss": 0.56109136, + "learning_rate": 3.075780527680754e-06, + "loss": 0.63763207, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.07196045, + "step": 5649, + "time_per_iteration": 3.2003703117370605 + }, + { + "auxiliary_loss_clip": 0.06473398, + "auxiliary_loss_mlp": 0.01280094, + "balance_loss_clip": 0.06287606, + "balance_loss_mlp": 0.01263274, + "epoch": 0.33969637757402676, + "flos": 25928746986240.0, + "grad_norm": 1.4812234353432667, + "language_loss": 0.85783911, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.93537402, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.16821289, + "step": 5650, + "time_per_iteration": 2.551633834838867 + }, + { + "auxiliary_loss_clip": 0.06475022, + "auxiliary_loss_mlp": 0.01277613, + "balance_loss_clip": 0.06292272, + "balance_loss_mlp": 0.01261841, + "epoch": 0.33975650082669473, + "flos": 35270382003840.0, + "grad_norm": 3.382903843955623, + "language_loss": 0.71404934, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.79157567, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15759277, + "step": 5651, + "time_per_iteration": 2.665083885192871 + }, + { + "auxiliary_loss_clip": 0.06471914, + "auxiliary_loss_mlp": 0.01278706, + "balance_loss_clip": 0.06286959, + "balance_loss_mlp": 0.01261922, + "epoch": 0.3398166240793627, + "flos": 16651373650560.0, + "grad_norm": 4.478617872089092, + "language_loss": 0.81850624, + "learning_rate": 3.074795378203616e-06, + "loss": 0.89601243, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16772461, + "step": 5652, + "time_per_iteration": 2.5136160850524902 + }, + { + "auxiliary_loss_clip": 0.06483054, + "auxiliary_loss_mlp": 0.01281024, + "balance_loss_clip": 0.06293614, + "balance_loss_mlp": 0.0126344, + "epoch": 0.33987674733203066, + "flos": 24069244377600.0, + "grad_norm": 3.0225456344203088, + "language_loss": 0.77707815, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.85471892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.17590332, + "step": 5653, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.06478614, + "auxiliary_loss_mlp": 0.01275428, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01259788, + "epoch": 0.3399368705846986, + "flos": 13253955603840.0, + "grad_norm": 4.6454995512067745, + "language_loss": 0.86809218, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.94563264, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.15625, + "step": 5654, + "time_per_iteration": 2.4661965370178223 + }, + { + "auxiliary_loss_clip": 0.06476527, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 0.06292438, + "balance_loss_mlp": 0.01267584, + "epoch": 0.3399969938373666, + "flos": 27019585365120.0, + "grad_norm": 2.782601809339298, + "language_loss": 0.65974486, + "learning_rate": 3.073809861919351e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16369629, + "step": 5655, + "time_per_iteration": 2.555647611618042 + }, + { + "auxiliary_loss_clip": 0.06478781, + "auxiliary_loss_mlp": 0.01275484, + "balance_loss_clip": 0.06293027, + "balance_loss_mlp": 0.01259558, + "epoch": 0.34005711709003456, + "flos": 28557920073600.0, + "grad_norm": 1.4106761603755547, + "language_loss": 0.76612461, + "learning_rate": 3.073481275036697e-06, + "loss": 0.84366733, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15917969, + "step": 5656, + "time_per_iteration": 2.644866466522217 + }, + { + "auxiliary_loss_clip": 0.06484362, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.06293096, + "balance_loss_mlp": 0.01260436, + "epoch": 0.3401172403427025, + "flos": 21623533804800.0, + "grad_norm": 1.950261924987131, + "language_loss": 0.83422613, + "learning_rate": 3.073152647447525e-06, + "loss": 0.9118408, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.16674805, + "step": 5657, + "time_per_iteration": 2.701688051223755 + }, + { + "auxiliary_loss_clip": 0.06477939, + "auxiliary_loss_mlp": 0.01276671, + "balance_loss_clip": 0.06292981, + "balance_loss_mlp": 0.01259851, + "epoch": 0.3401773635953705, + "flos": 25893010419840.0, + "grad_norm": 5.064784702806917, + "language_loss": 0.86277437, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.94032043, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.16833496, + "step": 5658, + "time_per_iteration": 2.622107744216919 + }, + { + "auxiliary_loss_clip": 0.06400045, + "auxiliary_loss_mlp": 0.01275632, + "balance_loss_clip": 0.06310016, + "balance_loss_mlp": 0.01268671, + "epoch": 0.3402374868480385, + "flos": 65527737459840.0, + "grad_norm": 0.8082747939523138, + "language_loss": 0.59960568, + "learning_rate": 3.072495270199477e-06, + "loss": 0.67636251, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.06970215, + "step": 5659, + "time_per_iteration": 3.1002566814422607 + }, + { + "auxiliary_loss_clip": 0.0647618, + "auxiliary_loss_mlp": 0.01281423, + "balance_loss_clip": 0.06294397, + "balance_loss_mlp": 0.01264591, + "epoch": 0.34029761010070647, + "flos": 24067357660800.0, + "grad_norm": 2.7764582815625514, + "language_loss": 0.68693221, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76450825, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16821289, + "step": 5660, + "time_per_iteration": 2.620135545730591 + }, + { + "auxiliary_loss_clip": 0.06473149, + "auxiliary_loss_mlp": 0.01278369, + "balance_loss_clip": 0.06289428, + "balance_loss_mlp": 0.01262157, + "epoch": 0.34035773335337444, + "flos": 27607093067520.0, + "grad_norm": 2.0682817387265477, + "language_loss": 0.6727913, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16210938, + "step": 5661, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06469939, + "auxiliary_loss_mlp": 0.01280149, + "balance_loss_clip": 0.06289508, + "balance_loss_mlp": 0.01264175, + "epoch": 0.3404178566060424, + "flos": 20818923373440.0, + "grad_norm": 1.802665197928241, + "language_loss": 0.79380333, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87130427, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15966797, + "step": 5662, + "time_per_iteration": 2.552755832672119 + }, + { + "auxiliary_loss_clip": 0.06474, + "auxiliary_loss_mlp": 0.01278156, + "balance_loss_clip": 0.06290844, + "balance_loss_mlp": 0.01260454, + "epoch": 0.34047797985871037, + "flos": 26840818679040.0, + "grad_norm": 2.1558050020889894, + "language_loss": 0.73809367, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.8156153, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.17700195, + "step": 5663, + "time_per_iteration": 2.5490622520446777 + }, + { + "auxiliary_loss_clip": 0.06470126, + "auxiliary_loss_mlp": 0.01281986, + "balance_loss_clip": 0.06290488, + "balance_loss_mlp": 0.01265714, + "epoch": 0.34053810311137833, + "flos": 19688742702720.0, + "grad_norm": 1.852400144955729, + "language_loss": 0.86839676, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.94591784, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16259766, + "step": 5664, + "time_per_iteration": 5.419060707092285 + }, + { + "auxiliary_loss_clip": 0.06483276, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06295361, + "balance_loss_mlp": 0.01257423, + "epoch": 0.3405982263640463, + "flos": 21732169023360.0, + "grad_norm": 1.8640809787797845, + "language_loss": 0.69509971, + "learning_rate": 3.070522162795235e-06, + "loss": 0.77267611, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.16943359, + "step": 5665, + "time_per_iteration": 2.547194719314575 + }, + { + "auxiliary_loss_clip": 0.06482168, + "auxiliary_loss_mlp": 0.01274659, + "balance_loss_clip": 0.0629427, + "balance_loss_mlp": 0.01257648, + "epoch": 0.34065834961671426, + "flos": 18047600634240.0, + "grad_norm": 2.6257214905883237, + "language_loss": 0.73526829, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.81283653, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.17016602, + "step": 5666, + "time_per_iteration": 2.527994155883789 + }, + { + "auxiliary_loss_clip": 0.06482688, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06295212, + "balance_loss_mlp": 0.01255373, + "epoch": 0.3407184728693822, + "flos": 21403705818240.0, + "grad_norm": 1.661941695135435, + "language_loss": 0.74005675, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.81760579, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.1685791, + "step": 5667, + "time_per_iteration": 4.029574155807495 + }, + { + "auxiliary_loss_clip": 0.06378959, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06290369, + "balance_loss_mlp": 0.01260898, + "epoch": 0.3407785961220502, + "flos": 68709352515840.0, + "grad_norm": 0.8062084259911544, + "language_loss": 0.63318539, + "learning_rate": 3.069535060901597e-06, + "loss": 0.70965815, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.07397461, + "step": 5668, + "time_per_iteration": 3.3641560077667236 + }, + { + "auxiliary_loss_clip": 0.06472414, + "auxiliary_loss_mlp": 0.01272754, + "balance_loss_clip": 0.0628752, + "balance_loss_mlp": 0.01256863, + "epoch": 0.34083871937471816, + "flos": 14069634773760.0, + "grad_norm": 2.007810831329869, + "language_loss": 0.73127198, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.80872369, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15893555, + "step": 5669, + "time_per_iteration": 2.4918038845062256 + }, + { + "auxiliary_loss_clip": 0.06479842, + "auxiliary_loss_mlp": 0.0127954, + "balance_loss_clip": 0.06292197, + "balance_loss_mlp": 0.01263423, + "epoch": 0.3408988426273861, + "flos": 17089981447680.0, + "grad_norm": 2.0642744441347287, + "language_loss": 0.80626565, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.88385952, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16125488, + "step": 5670, + "time_per_iteration": 2.5270040035247803 + }, + { + "auxiliary_loss_clip": 0.06481062, + "auxiliary_loss_mlp": 0.01275164, + "balance_loss_clip": 0.06291522, + "balance_loss_mlp": 0.0125838, + "epoch": 0.3409589658800541, + "flos": 24031411459200.0, + "grad_norm": 1.863009265742361, + "language_loss": 0.77916187, + "learning_rate": 3.068547593996078e-06, + "loss": 0.85672414, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.16784668, + "step": 5671, + "time_per_iteration": 4.039815664291382 + }, + { + "auxiliary_loss_clip": 0.06473973, + "auxiliary_loss_mlp": 0.01276984, + "balance_loss_clip": 0.06289308, + "balance_loss_mlp": 0.01260712, + "epoch": 0.34101908913272205, + "flos": 21148350900480.0, + "grad_norm": 1.9142883162018633, + "language_loss": 0.74626315, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.82377267, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.16259766, + "step": 5672, + "time_per_iteration": 2.564887762069702 + }, + { + "auxiliary_loss_clip": 0.06475951, + "auxiliary_loss_mlp": 0.01275656, + "balance_loss_clip": 0.06287946, + "balance_loss_mlp": 0.01259265, + "epoch": 0.3410792123853901, + "flos": 15706835700480.0, + "grad_norm": 1.714309741158987, + "language_loss": 0.73791027, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81542635, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16394043, + "step": 5673, + "time_per_iteration": 2.540194511413574 + }, + { + "auxiliary_loss_clip": 0.06466323, + "auxiliary_loss_mlp": 0.01283225, + "balance_loss_clip": 0.06284231, + "balance_loss_mlp": 0.01266584, + "epoch": 0.34113933563805804, + "flos": 23042122629120.0, + "grad_norm": 1.8379615104267257, + "language_loss": 0.7978701, + "learning_rate": 3.067559762415682e-06, + "loss": 0.87536556, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16638184, + "step": 5674, + "time_per_iteration": 2.5462148189544678 + }, + { + "auxiliary_loss_clip": 0.06364837, + "auxiliary_loss_mlp": 0.01262017, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255442, + "epoch": 0.341199458890726, + "flos": 69631878769920.0, + "grad_norm": 0.7752872762952348, + "language_loss": 0.56147063, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.63773918, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.06585693, + "step": 5675, + "time_per_iteration": 3.370281457901001 + }, + { + "auxiliary_loss_clip": 0.0645988, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06281768, + "balance_loss_mlp": 0.01257398, + "epoch": 0.34125958214339397, + "flos": 22352939596800.0, + "grad_norm": 2.600205708544321, + "language_loss": 0.79689062, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.87422335, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16003418, + "step": 5676, + "time_per_iteration": 2.5312321186065674 + }, + { + "auxiliary_loss_clip": 0.06470488, + "auxiliary_loss_mlp": 0.01271752, + "balance_loss_clip": 0.06286064, + "balance_loss_mlp": 0.01255051, + "epoch": 0.34131970539606193, + "flos": 21878427525120.0, + "grad_norm": 2.203551534393157, + "language_loss": 0.8601976, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.93761992, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.16699219, + "step": 5677, + "time_per_iteration": 2.555037260055542 + }, + { + "auxiliary_loss_clip": 0.06463757, + "auxiliary_loss_mlp": 0.01274207, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3413798286487299, + "flos": 24942560757120.0, + "grad_norm": 2.786164717546535, + "language_loss": 0.80252033, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87989998, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16955566, + "step": 5678, + "time_per_iteration": 2.6321489810943604 + }, + { + "auxiliary_loss_clip": 0.06467854, + "auxiliary_loss_mlp": 0.01270663, + "balance_loss_clip": 0.06282793, + "balance_loss_mlp": 0.01255404, + "epoch": 0.34143995190139786, + "flos": 25381420116480.0, + "grad_norm": 1.8772848902338297, + "language_loss": 0.75927806, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.83666325, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15246582, + "step": 5679, + "time_per_iteration": 2.5981781482696533 + }, + { + "auxiliary_loss_clip": 0.06365222, + "auxiliary_loss_mlp": 0.01260685, + "balance_loss_clip": 0.06278291, + "balance_loss_mlp": 0.01253538, + "epoch": 0.34150007515406583, + "flos": 67804785763200.0, + "grad_norm": 0.7019635675964923, + "language_loss": 0.59521842, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.67147756, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.0713501, + "step": 5680, + "time_per_iteration": 3.2768852710723877 + }, + { + "auxiliary_loss_clip": 0.06464119, + "auxiliary_loss_mlp": 0.01271493, + "balance_loss_clip": 0.06282759, + "balance_loss_mlp": 0.01255602, + "epoch": 0.3415601984067338, + "flos": 20308548954240.0, + "grad_norm": 1.756785442101194, + "language_loss": 0.72804415, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80540025, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15881348, + "step": 5681, + "time_per_iteration": 2.540839195251465 + }, + { + "auxiliary_loss_clip": 0.06462204, + "auxiliary_loss_mlp": 0.01272244, + "balance_loss_clip": 0.06283034, + "balance_loss_mlp": 0.01256806, + "epoch": 0.34162032165940176, + "flos": 26038346526720.0, + "grad_norm": 5.204332383129175, + "language_loss": 0.71220171, + "learning_rate": 3.064923764577233e-06, + "loss": 0.78954625, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15454102, + "step": 5682, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.06466864, + "auxiliary_loss_mlp": 0.0127503, + "balance_loss_clip": 0.06286532, + "balance_loss_mlp": 0.01258711, + "epoch": 0.3416804449120697, + "flos": 28810843223040.0, + "grad_norm": 1.4703350638010875, + "language_loss": 0.83879244, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.91621137, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.16320801, + "step": 5683, + "time_per_iteration": 2.595921277999878 + }, + { + "auxiliary_loss_clip": 0.06468399, + "auxiliary_loss_mlp": 0.01274924, + "balance_loss_clip": 0.06284815, + "balance_loss_mlp": 0.01258354, + "epoch": 0.3417405681647377, + "flos": 22608210660480.0, + "grad_norm": 1.8188343464074745, + "language_loss": 0.71334541, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.79077864, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 5684, + "time_per_iteration": 2.5821194648742676 + }, + { + "auxiliary_loss_clip": 0.06462076, + "auxiliary_loss_mlp": 0.01268234, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01253816, + "epoch": 0.34180069141740566, + "flos": 24722942405760.0, + "grad_norm": 1.4943065575919134, + "language_loss": 0.75352108, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.8308242, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.144104, + "step": 5685, + "time_per_iteration": 2.545419216156006 + }, + { + "auxiliary_loss_clip": 0.06457227, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06281762, + "balance_loss_mlp": 0.0125501, + "epoch": 0.3418608146700737, + "flos": 30526644879360.0, + "grad_norm": 1.8907916568784255, + "language_loss": 0.70833004, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7856074, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.1550293, + "step": 5686, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.06467415, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06284459, + "balance_loss_mlp": 0.01253407, + "epoch": 0.34192093792274164, + "flos": 15127755333120.0, + "grad_norm": 2.1973050683231303, + "language_loss": 0.77864039, + "learning_rate": 3.06327495310661e-06, + "loss": 0.85600907, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.16052246, + "step": 5687, + "time_per_iteration": 2.501957654953003 + }, + { + "auxiliary_loss_clip": 0.06462508, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01257435, + "epoch": 0.3419810611754096, + "flos": 13192754595840.0, + "grad_norm": 1.8198375176693335, + "language_loss": 0.87159389, + "learning_rate": 3.062945069803981e-06, + "loss": 0.94895893, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.16552734, + "step": 5688, + "time_per_iteration": 2.514558792114258 + }, + { + "auxiliary_loss_clip": 0.06470017, + "auxiliary_loss_mlp": 0.01272882, + "balance_loss_clip": 0.06283651, + "balance_loss_mlp": 0.01255025, + "epoch": 0.34204118442807757, + "flos": 19542274565760.0, + "grad_norm": 1.9150705307332732, + "language_loss": 0.80177575, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87920475, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.17858887, + "step": 5689, + "time_per_iteration": 2.4941842555999756 + }, + { + "auxiliary_loss_clip": 0.06471369, + "auxiliary_loss_mlp": 0.01270545, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01254476, + "epoch": 0.34210130768074554, + "flos": 15200192787840.0, + "grad_norm": 1.8413075326603192, + "language_loss": 0.74004579, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81746483, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.1607666, + "step": 5690, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.06470567, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06288044, + "balance_loss_mlp": 0.01254854, + "epoch": 0.3421614309334135, + "flos": 24943147735680.0, + "grad_norm": 2.8439157619722666, + "language_loss": 0.76563686, + "learning_rate": 3.061955178104237e-06, + "loss": 0.84305, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.15893555, + "step": 5691, + "time_per_iteration": 2.5346477031707764 + }, + { + "auxiliary_loss_clip": 0.06465675, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06286939, + "balance_loss_mlp": 0.01254395, + "epoch": 0.34222155418608147, + "flos": 21915170340480.0, + "grad_norm": 1.7269103068173344, + "language_loss": 0.6888957, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1472168, + "step": 5692, + "time_per_iteration": 2.544475793838501 + }, + { + "auxiliary_loss_clip": 0.06469652, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06286649, + "balance_loss_mlp": 0.01259069, + "epoch": 0.34228167743874943, + "flos": 18119954234880.0, + "grad_norm": 2.5543870280075494, + "language_loss": 0.72691154, + "learning_rate": 3.06129504893632e-06, + "loss": 0.80436993, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.17126465, + "step": 5693, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.06469734, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06291726, + "balance_loss_mlp": 0.01253049, + "epoch": 0.3423418006914174, + "flos": 21295070599680.0, + "grad_norm": 1.6526919771326485, + "language_loss": 0.76433146, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.84170949, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15008545, + "step": 5694, + "time_per_iteration": 2.5759999752044678 + }, + { + "auxiliary_loss_clip": 0.06469683, + "auxiliary_loss_mlp": 0.01269733, + "balance_loss_clip": 0.06292015, + "balance_loss_mlp": 0.01254498, + "epoch": 0.34240192394408536, + "flos": 19828754075520.0, + "grad_norm": 1.7073290043069882, + "language_loss": 0.80359411, + "learning_rate": 3.060634758790747e-06, + "loss": 0.88098824, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15222168, + "step": 5695, + "time_per_iteration": 2.53019118309021 + }, + { + "auxiliary_loss_clip": 0.06473886, + "auxiliary_loss_mlp": 0.01274215, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01257335, + "epoch": 0.3424620471967533, + "flos": 24542498638080.0, + "grad_norm": 2.150928833794339, + "language_loss": 0.74189723, + "learning_rate": 3.060304553382635e-06, + "loss": 0.81937826, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16882324, + "step": 5696, + "time_per_iteration": 2.6046504974365234 + }, + { + "auxiliary_loss_clip": 0.06472932, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.062935, + "balance_loss_mlp": 0.0125786, + "epoch": 0.3425221704494213, + "flos": 25856057969280.0, + "grad_norm": 1.9268953245740004, + "language_loss": 0.71419311, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.79166162, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.16052246, + "step": 5697, + "time_per_iteration": 2.565295696258545 + }, + { + "auxiliary_loss_clip": 0.06469944, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06292768, + "balance_loss_mlp": 0.01254293, + "epoch": 0.34258229370208926, + "flos": 21546442448640.0, + "grad_norm": 1.77565898086167, + "language_loss": 0.82456839, + "learning_rate": 3.05964402195837e-06, + "loss": 0.90196872, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15795898, + "step": 5698, + "time_per_iteration": 2.636547327041626 + }, + { + "auxiliary_loss_clip": 0.06476933, + "auxiliary_loss_mlp": 0.01277942, + "balance_loss_clip": 0.06293021, + "balance_loss_mlp": 0.01260573, + "epoch": 0.3426424169547573, + "flos": 23658407009280.0, + "grad_norm": 1.9460205950694964, + "language_loss": 0.69722092, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.77476966, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.17358398, + "step": 5699, + "time_per_iteration": 2.523766040802002 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.0127405, + "balance_loss_clip": 0.06289239, + "balance_loss_mlp": 0.01257719, + "epoch": 0.34270254020742524, + "flos": 24651846616320.0, + "grad_norm": 2.105384484263751, + "language_loss": 0.72511256, + "learning_rate": 3.058983329806877e-06, + "loss": 0.80255234, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 5700, + "time_per_iteration": 2.57511568069458 + }, + { + "auxiliary_loss_clip": 0.06467311, + "auxiliary_loss_mlp": 0.01271093, + "balance_loss_clip": 0.06288276, + "balance_loss_mlp": 0.01254273, + "epoch": 0.3427626634600932, + "flos": 21003182501760.0, + "grad_norm": 2.114283139984186, + "language_loss": 0.82378924, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.90117323, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.16821289, + "step": 5701, + "time_per_iteration": 2.496392250061035 + }, + { + "auxiliary_loss_clip": 0.06469429, + "auxiliary_loss_mlp": 0.0127326, + "balance_loss_clip": 0.06287375, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3428227867127612, + "flos": 21440155144320.0, + "grad_norm": 1.6330699344557849, + "language_loss": 0.71898985, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.79641676, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16821289, + "step": 5702, + "time_per_iteration": 2.566856861114502 + }, + { + "auxiliary_loss_clip": 0.06377172, + "auxiliary_loss_mlp": 0.01259818, + "balance_loss_clip": 0.06290582, + "balance_loss_mlp": 0.01252552, + "epoch": 0.34288290996542914, + "flos": 55750219902720.0, + "grad_norm": 0.7671857510805999, + "language_loss": 0.56708395, + "learning_rate": 3.057991990435309e-06, + "loss": 0.64345384, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.07244873, + "step": 5703, + "time_per_iteration": 4.447732925415039 + }, + { + "auxiliary_loss_clip": 0.06465772, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06283242, + "balance_loss_mlp": 0.01255207, + "epoch": 0.3429430332180971, + "flos": 20162961285120.0, + "grad_norm": 1.88810633796735, + "language_loss": 0.74954486, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82692933, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.17468262, + "step": 5704, + "time_per_iteration": 4.062070608139038 + }, + { + "auxiliary_loss_clip": 0.06463447, + "auxiliary_loss_mlp": 0.01275499, + "balance_loss_clip": 0.06284866, + "balance_loss_mlp": 0.01259716, + "epoch": 0.34300315647076507, + "flos": 17971347818880.0, + "grad_norm": 2.0890845856962565, + "language_loss": 0.73438597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.81177545, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15795898, + "step": 5705, + "time_per_iteration": 2.5125277042388916 + }, + { + "auxiliary_loss_clip": 0.06466857, + "auxiliary_loss_mlp": 0.01271633, + "balance_loss_clip": 0.0628458, + "balance_loss_mlp": 0.01255194, + "epoch": 0.34306327972343303, + "flos": 22092679215360.0, + "grad_norm": 2.3658652894382075, + "language_loss": 0.80144984, + "learning_rate": 3.057000289991289e-06, + "loss": 0.87883472, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16430664, + "step": 5706, + "time_per_iteration": 2.524531364440918 + }, + { + "auxiliary_loss_clip": 0.06468605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06282079, + "balance_loss_mlp": 0.0125493, + "epoch": 0.343123402976101, + "flos": 18448669002240.0, + "grad_norm": 1.9272208577124825, + "language_loss": 0.83210528, + "learning_rate": 3.056669642996787e-06, + "loss": 0.90951264, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17199707, + "step": 5707, + "time_per_iteration": 4.017935514450073 + }, + { + "auxiliary_loss_clip": 0.06464301, + "auxiliary_loss_mlp": 0.01275983, + "balance_loss_clip": 0.06283538, + "balance_loss_mlp": 0.01259544, + "epoch": 0.34318352622876896, + "flos": 17169127228800.0, + "grad_norm": 1.5274992455100316, + "language_loss": 0.74774885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82515168, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16442871, + "step": 5708, + "time_per_iteration": 2.6189568042755127 + }, + { + "auxiliary_loss_clip": 0.06460952, + "auxiliary_loss_mlp": 0.01273078, + "balance_loss_clip": 0.06282704, + "balance_loss_mlp": 0.01256365, + "epoch": 0.34324364948143693, + "flos": 26695482572160.0, + "grad_norm": 1.5717787719434457, + "language_loss": 0.80904007, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88638043, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.16711426, + "step": 5709, + "time_per_iteration": 2.563938617706299 + }, + { + "auxiliary_loss_clip": 0.06471742, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 0.06286193, + "balance_loss_mlp": 0.01260685, + "epoch": 0.3433037727341049, + "flos": 21257950440960.0, + "grad_norm": 2.571520261591023, + "language_loss": 0.79460347, + "learning_rate": 3.055677461649329e-06, + "loss": 0.87212193, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.1940918, + "step": 5710, + "time_per_iteration": 2.5515291690826416 + }, + { + "auxiliary_loss_clip": 0.06468266, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 0.06282788, + "balance_loss_mlp": 0.01254334, + "epoch": 0.34336389598677286, + "flos": 20635377004800.0, + "grad_norm": 1.916674758610419, + "language_loss": 0.70532334, + "learning_rate": 3.055346654453996e-06, + "loss": 0.78272408, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.17468262, + "step": 5711, + "time_per_iteration": 3.958890914916992 + }, + { + "auxiliary_loss_clip": 0.06467056, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 0.0628437, + "balance_loss_mlp": 0.01256909, + "epoch": 0.3434240192394409, + "flos": 14543895283200.0, + "grad_norm": 2.810027228242578, + "language_loss": 0.67786914, + "learning_rate": 3.055015807239812e-06, + "loss": 0.75527865, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16992188, + "step": 5712, + "time_per_iteration": 2.4752726554870605 + }, + { + "auxiliary_loss_clip": 0.06366295, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01254685, + "epoch": 0.34348414249210885, + "flos": 58067799183360.0, + "grad_norm": 0.8383081559544242, + "language_loss": 0.58214718, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.65843868, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.08172607, + "step": 5713, + "time_per_iteration": 3.11580491065979 + }, + { + "auxiliary_loss_clip": 0.06465655, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06281169, + "balance_loss_mlp": 0.01257308, + "epoch": 0.3435442657447768, + "flos": 20710749352320.0, + "grad_norm": 1.8141637433077298, + "language_loss": 0.81045675, + "learning_rate": 3.054353992805076e-06, + "loss": 0.88785917, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.17272949, + "step": 5714, + "time_per_iteration": 2.510929822921753 + }, + { + "auxiliary_loss_clip": 0.0646632, + "auxiliary_loss_mlp": 0.01276019, + "balance_loss_clip": 0.06283875, + "balance_loss_mlp": 0.01260045, + "epoch": 0.3436043889974448, + "flos": 22936967354880.0, + "grad_norm": 2.602776673257047, + "language_loss": 0.72001171, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79743505, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15991211, + "step": 5715, + "time_per_iteration": 2.5680224895477295 + }, + { + "auxiliary_loss_clip": 0.06365244, + "auxiliary_loss_mlp": 0.01259148, + "balance_loss_clip": 0.06280053, + "balance_loss_mlp": 0.01251191, + "epoch": 0.34366451225011274, + "flos": 58423514964480.0, + "grad_norm": 0.8879413605742031, + "language_loss": 0.65628481, + "learning_rate": 3.053692018445505e-06, + "loss": 0.73252875, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.07952881, + "step": 5716, + "time_per_iteration": 3.184952735900879 + }, + { + "auxiliary_loss_clip": 0.06463662, + "auxiliary_loss_mlp": 0.01279768, + "balance_loss_clip": 0.0628469, + "balance_loss_mlp": 0.01264509, + "epoch": 0.3437246355027807, + "flos": 15601722353280.0, + "grad_norm": 1.9800950186090778, + "language_loss": 0.74289393, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15252686, + "step": 5717, + "time_per_iteration": 2.5220494270324707 + }, + { + "auxiliary_loss_clip": 0.06466433, + "auxiliary_loss_mlp": 0.01278824, + "balance_loss_clip": 0.0628383, + "balance_loss_mlp": 0.01262946, + "epoch": 0.34378475875544867, + "flos": 27679572449280.0, + "grad_norm": 1.8348085520910409, + "language_loss": 0.75694019, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.83439279, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.15869141, + "step": 5718, + "time_per_iteration": 2.5983147621154785 + }, + { + "auxiliary_loss_clip": 0.06468937, + "auxiliary_loss_mlp": 0.01273829, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01256829, + "epoch": 0.34384488200811664, + "flos": 31439638967040.0, + "grad_norm": 1.8816683210791167, + "language_loss": 0.6437763, + "learning_rate": 3.052698757266734e-06, + "loss": 0.72120392, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.17004395, + "step": 5719, + "time_per_iteration": 2.7075517177581787 + }, + { + "auxiliary_loss_clip": 0.06472047, + "auxiliary_loss_mlp": 0.0127673, + "balance_loss_clip": 0.06285335, + "balance_loss_mlp": 0.012596, + "epoch": 0.3439050052607846, + "flos": 24906866117760.0, + "grad_norm": 1.6709560385881974, + "language_loss": 0.73730874, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.81479651, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.17150879, + "step": 5720, + "time_per_iteration": 2.5936295986175537 + }, + { + "auxiliary_loss_clip": 0.06469208, + "auxiliary_loss_mlp": 0.01280833, + "balance_loss_clip": 0.06286804, + "balance_loss_mlp": 0.01264072, + "epoch": 0.34396512851345257, + "flos": 18155900436480.0, + "grad_norm": 1.8909667336437188, + "language_loss": 0.74550021, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82300061, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16748047, + "step": 5721, + "time_per_iteration": 2.5109763145446777 + }, + { + "auxiliary_loss_clip": 0.06468637, + "auxiliary_loss_mlp": 0.01276688, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.01260208, + "epoch": 0.34402525176612053, + "flos": 16039994734080.0, + "grad_norm": 3.7669546448597497, + "language_loss": 0.80102623, + "learning_rate": 3.051705136821992e-06, + "loss": 0.87847948, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 5722, + "time_per_iteration": 2.5231471061706543 + }, + { + "auxiliary_loss_clip": 0.06467631, + "auxiliary_loss_mlp": 0.01281232, + "balance_loss_clip": 0.06286201, + "balance_loss_mlp": 0.01265806, + "epoch": 0.3440853750187885, + "flos": 21185009861760.0, + "grad_norm": 1.9591310013999468, + "language_loss": 0.82034022, + "learning_rate": 3.051373850228801e-06, + "loss": 0.89782888, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.1541748, + "step": 5723, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.06471531, + "auxiliary_loss_mlp": 0.01281521, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.0126588, + "epoch": 0.34414549827145646, + "flos": 12682883301120.0, + "grad_norm": 1.867182825140108, + "language_loss": 0.8172524, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8947829, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.15661621, + "step": 5724, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.06476942, + "auxiliary_loss_mlp": 0.01281282, + "balance_loss_clip": 0.06292838, + "balance_loss_mlp": 0.01265237, + "epoch": 0.3442056215241244, + "flos": 31292458070400.0, + "grad_norm": 1.852126712281853, + "language_loss": 0.69186389, + "learning_rate": 3.05071115745038e-06, + "loss": 0.76944625, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.16040039, + "step": 5725, + "time_per_iteration": 2.6253697872161865 + }, + { + "auxiliary_loss_clip": 0.06482734, + "auxiliary_loss_mlp": 0.01284248, + "balance_loss_clip": 0.06293113, + "balance_loss_mlp": 0.01266462, + "epoch": 0.34426574477679245, + "flos": 23373939997440.0, + "grad_norm": 1.5373453518160676, + "language_loss": 0.69532049, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.77299035, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.17785645, + "step": 5726, + "time_per_iteration": 2.5495173931121826 + }, + { + "auxiliary_loss_clip": 0.06477433, + "auxiliary_loss_mlp": 0.01281684, + "balance_loss_clip": 0.06292193, + "balance_loss_mlp": 0.01265948, + "epoch": 0.3443258680294604, + "flos": 24542372856960.0, + "grad_norm": 3.3735616171284453, + "language_loss": 0.73631704, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81390822, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.15734863, + "step": 5727, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.06474276, + "auxiliary_loss_mlp": 0.01274594, + "balance_loss_clip": 0.06292102, + "balance_loss_mlp": 0.01259049, + "epoch": 0.3443859912821284, + "flos": 20236363061760.0, + "grad_norm": 1.756953821036591, + "language_loss": 0.88303459, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.96052337, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15551758, + "step": 5728, + "time_per_iteration": 2.5943620204925537 + }, + { + "auxiliary_loss_clip": 0.06472028, + "auxiliary_loss_mlp": 0.01275786, + "balance_loss_clip": 0.06289984, + "balance_loss_mlp": 0.01259382, + "epoch": 0.34444611453479634, + "flos": 24323425338240.0, + "grad_norm": 1.9801243778486481, + "language_loss": 0.70532095, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.78279907, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.1640625, + "step": 5729, + "time_per_iteration": 2.5122504234313965 + }, + { + "auxiliary_loss_clip": 0.06472413, + "auxiliary_loss_mlp": 0.01278834, + "balance_loss_clip": 0.06293523, + "balance_loss_mlp": 0.01263123, + "epoch": 0.3445062377874643, + "flos": 16989186585600.0, + "grad_norm": 2.065738946159642, + "language_loss": 0.74902749, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.82653993, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.15710449, + "step": 5730, + "time_per_iteration": 2.4971024990081787 + }, + { + "auxiliary_loss_clip": 0.06477457, + "auxiliary_loss_mlp": 0.01272788, + "balance_loss_clip": 0.06295118, + "balance_loss_mlp": 0.01256921, + "epoch": 0.3445663610401323, + "flos": 20308884370560.0, + "grad_norm": 2.25692333978076, + "language_loss": 0.79881716, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87631959, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.15869141, + "step": 5731, + "time_per_iteration": 2.5055606365203857 + }, + { + "auxiliary_loss_clip": 0.0647382, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06289574, + "balance_loss_mlp": 0.01256532, + "epoch": 0.34462648429280024, + "flos": 15893568524160.0, + "grad_norm": 2.0529883798711586, + "language_loss": 0.78536034, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.86281943, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15545654, + "step": 5732, + "time_per_iteration": 2.58428692817688 + }, + { + "auxiliary_loss_clip": 0.06393671, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06309536, + "balance_loss_mlp": 0.01263571, + "epoch": 0.3446866075454682, + "flos": 59330681193600.0, + "grad_norm": 0.7296400398421587, + "language_loss": 0.53166986, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.60830045, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.05813599, + "step": 5733, + "time_per_iteration": 3.1921679973602295 + }, + { + "auxiliary_loss_clip": 0.06473544, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 0.06292105, + "balance_loss_mlp": 0.01259248, + "epoch": 0.34474673079813617, + "flos": 22349962776960.0, + "grad_norm": 1.6143563972241732, + "language_loss": 0.83787543, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91536903, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.16564941, + "step": 5734, + "time_per_iteration": 2.5630810260772705 + }, + { + "auxiliary_loss_clip": 0.06472072, + "auxiliary_loss_mlp": 0.01278915, + "balance_loss_clip": 0.0628967, + "balance_loss_mlp": 0.01262834, + "epoch": 0.34480685405080413, + "flos": 27677098753920.0, + "grad_norm": 1.7144738343554842, + "language_loss": 0.93389094, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.01140082, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.1607666, + "step": 5735, + "time_per_iteration": 2.5621798038482666 + }, + { + "auxiliary_loss_clip": 0.06471383, + "auxiliary_loss_mlp": 0.01276844, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01259273, + "epoch": 0.3448669773034721, + "flos": 22462664918400.0, + "grad_norm": 1.7840822264419087, + "language_loss": 0.77095437, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84843659, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.17578125, + "step": 5736, + "time_per_iteration": 2.5377349853515625 + }, + { + "auxiliary_loss_clip": 0.06471781, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 0.06287266, + "balance_loss_mlp": 0.01255326, + "epoch": 0.34492710055614006, + "flos": 24943105808640.0, + "grad_norm": 1.6287034776462515, + "language_loss": 0.79113513, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.86855936, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15319824, + "step": 5737, + "time_per_iteration": 2.5471904277801514 + }, + { + "auxiliary_loss_clip": 0.06473309, + "auxiliary_loss_mlp": 0.01276485, + "balance_loss_clip": 0.06285917, + "balance_loss_mlp": 0.0125976, + "epoch": 0.34498722380880803, + "flos": 20127057010560.0, + "grad_norm": 2.191814396638409, + "language_loss": 0.72072059, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.79821849, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16723633, + "step": 5738, + "time_per_iteration": 2.540442943572998 + }, + { + "auxiliary_loss_clip": 0.06471272, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 0.06284432, + "balance_loss_mlp": 0.01255821, + "epoch": 0.34504734706147605, + "flos": 28445511421440.0, + "grad_norm": 1.9413212194180998, + "language_loss": 0.82238245, + "learning_rate": 3.046067851209389e-06, + "loss": 0.89982325, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.16992188, + "step": 5739, + "time_per_iteration": 2.57327938079834 + }, + { + "auxiliary_loss_clip": 0.06469989, + "auxiliary_loss_mlp": 0.0127904, + "balance_loss_clip": 0.06284826, + "balance_loss_mlp": 0.01261862, + "epoch": 0.345107470314144, + "flos": 22681067385600.0, + "grad_norm": 1.914547064909644, + "language_loss": 0.83564734, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.91313767, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.171875, + "step": 5740, + "time_per_iteration": 2.5514895915985107 + }, + { + "auxiliary_loss_clip": 0.06466584, + "auxiliary_loss_mlp": 0.01275646, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01258921, + "epoch": 0.345167593566812, + "flos": 20636886378240.0, + "grad_norm": 2.1474795597791734, + "language_loss": 0.76802379, + "learning_rate": 3.045403886269181e-06, + "loss": 0.84544611, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16723633, + "step": 5741, + "time_per_iteration": 2.511997699737549 + }, + { + "auxiliary_loss_clip": 0.06466299, + "auxiliary_loss_mlp": 0.0127053, + "balance_loss_clip": 0.06279384, + "balance_loss_mlp": 0.01254544, + "epoch": 0.34522771681947995, + "flos": 26221683260160.0, + "grad_norm": 1.6006732343467382, + "language_loss": 0.77803171, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85540009, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.15966797, + "step": 5742, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.06464212, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06281982, + "balance_loss_mlp": 0.01256074, + "epoch": 0.3452878400721479, + "flos": 19068349472640.0, + "grad_norm": 2.2544306863162538, + "language_loss": 0.76459014, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.84196126, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.16821289, + "step": 5743, + "time_per_iteration": 3.996267557144165 + }, + { + "auxiliary_loss_clip": 0.06462429, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06281956, + "balance_loss_mlp": 0.01255118, + "epoch": 0.3453479633248159, + "flos": 27937442989440.0, + "grad_norm": 1.578255214465821, + "language_loss": 0.7080915, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.78541422, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.14727783, + "step": 5744, + "time_per_iteration": 2.5594234466552734 + }, + { + "auxiliary_loss_clip": 0.06462625, + "auxiliary_loss_mlp": 0.01272389, + "balance_loss_clip": 0.0628416, + "balance_loss_mlp": 0.01256523, + "epoch": 0.34540808657748384, + "flos": 19611609419520.0, + "grad_norm": 1.8945383960499247, + "language_loss": 0.79877782, + "learning_rate": 3.044075480787665e-06, + "loss": 0.87612802, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15881348, + "step": 5745, + "time_per_iteration": 2.5577902793884277 + }, + { + "auxiliary_loss_clip": 0.0646376, + "auxiliary_loss_mlp": 0.0127446, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01258343, + "epoch": 0.3454682098301518, + "flos": 20417771151360.0, + "grad_norm": 2.2215207406176063, + "language_loss": 0.90027881, + "learning_rate": 3.043743280407182e-06, + "loss": 0.97766101, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16113281, + "step": 5746, + "time_per_iteration": 4.126953840255737 + }, + { + "auxiliary_loss_clip": 0.06469168, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01254648, + "epoch": 0.34552833308281977, + "flos": 21331603779840.0, + "grad_norm": 1.8420175913064167, + "language_loss": 0.65233189, + "learning_rate": 3.043411040447849e-06, + "loss": 0.72973943, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.16931152, + "step": 5747, + "time_per_iteration": 2.6445960998535156 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01274425, + "balance_loss_clip": 0.06279166, + "balance_loss_mlp": 0.01259166, + "epoch": 0.34558845633548774, + "flos": 36251914331520.0, + "grad_norm": 1.6152983170909512, + "language_loss": 0.72912234, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15246582, + "step": 5748, + "time_per_iteration": 2.668628692626953 + }, + { + "auxiliary_loss_clip": 0.0646018, + "auxiliary_loss_mlp": 0.01271906, + "balance_loss_clip": 0.06281725, + "balance_loss_mlp": 0.01257268, + "epoch": 0.3456485795881557, + "flos": 22456292008320.0, + "grad_norm": 2.139365243179929, + "language_loss": 0.75935584, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83667672, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.14648438, + "step": 5749, + "time_per_iteration": 2.533357620239258 + }, + { + "auxiliary_loss_clip": 0.06372777, + "auxiliary_loss_mlp": 0.01259534, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.0125392, + "epoch": 0.34570870284082367, + "flos": 62023277422080.0, + "grad_norm": 0.8741398929973155, + "language_loss": 0.62861037, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.70493352, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.05612183, + "step": 5750, + "time_per_iteration": 4.42021369934082 + }, + { + "auxiliary_loss_clip": 0.06455849, + "auxiliary_loss_mlp": 0.0126761, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01253383, + "epoch": 0.34576882609349163, + "flos": 22788528647040.0, + "grad_norm": 2.5604939014714043, + "language_loss": 0.80745482, + "learning_rate": 3.042081685074012e-06, + "loss": 0.88468945, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14227295, + "step": 5751, + "time_per_iteration": 2.610229730606079 + }, + { + "auxiliary_loss_clip": 0.06461278, + "auxiliary_loss_mlp": 0.01273124, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.01258199, + "epoch": 0.34582894934615965, + "flos": 12353665409280.0, + "grad_norm": 2.333174149642167, + "language_loss": 0.85112172, + "learning_rate": 3.041749247409439e-06, + "loss": 0.92846578, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14904785, + "step": 5752, + "time_per_iteration": 2.49895977973938 + }, + { + "auxiliary_loss_clip": 0.06379203, + "auxiliary_loss_mlp": 0.01260282, + "balance_loss_clip": 0.06296635, + "balance_loss_mlp": 0.01254092, + "epoch": 0.3458890725988276, + "flos": 70186459017600.0, + "grad_norm": 0.7233537791569425, + "language_loss": 0.63163221, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.70802706, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06185913, + "step": 5753, + "time_per_iteration": 3.0605263710021973 + }, + { + "auxiliary_loss_clip": 0.06463367, + "auxiliary_loss_mlp": 0.01274407, + "balance_loss_clip": 0.06282756, + "balance_loss_mlp": 0.01258498, + "epoch": 0.3459491958514956, + "flos": 17098324928640.0, + "grad_norm": 2.0282181813946116, + "language_loss": 0.71483171, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.79220951, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15899658, + "step": 5754, + "time_per_iteration": 2.499213457107544 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01271896, + "balance_loss_clip": 0.06282809, + "balance_loss_mlp": 0.01255898, + "epoch": 0.34600931910416355, + "flos": 16655985624960.0, + "grad_norm": 2.0834630321372534, + "language_loss": 0.7328862, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.81031251, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.15979004, + "step": 5755, + "time_per_iteration": 2.540292263031006 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.01272619, + "balance_loss_clip": 0.06280342, + "balance_loss_mlp": 0.01257801, + "epoch": 0.3460694423568315, + "flos": 38555517179520.0, + "grad_norm": 1.432388080922509, + "language_loss": 0.7255426, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80286932, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14813232, + "step": 5756, + "time_per_iteration": 2.679203510284424 + }, + { + "auxiliary_loss_clip": 0.06371044, + "auxiliary_loss_mlp": 0.01257585, + "balance_loss_clip": 0.06288835, + "balance_loss_mlp": 0.01251058, + "epoch": 0.3461295656094995, + "flos": 72103332545280.0, + "grad_norm": 0.6902951700774806, + "language_loss": 0.62318385, + "learning_rate": 3.040086466790207e-06, + "loss": 0.69947016, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.06536865, + "step": 5757, + "time_per_iteration": 3.209688901901245 + }, + { + "auxiliary_loss_clip": 0.06363717, + "auxiliary_loss_mlp": 0.01259824, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01253244, + "epoch": 0.34618968886216744, + "flos": 65477913408000.0, + "grad_norm": 0.8114970964410039, + "language_loss": 0.59130025, + "learning_rate": 3.039753792295362e-06, + "loss": 0.66753566, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.06591797, + "step": 5758, + "time_per_iteration": 3.139495372772217 + }, + { + "auxiliary_loss_clip": 0.06467785, + "auxiliary_loss_mlp": 0.01274731, + "balance_loss_clip": 0.06288655, + "balance_loss_mlp": 0.01259747, + "epoch": 0.3462498121148354, + "flos": 23478508293120.0, + "grad_norm": 1.7665020183034759, + "language_loss": 0.72321635, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.80064148, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5759, + "time_per_iteration": 2.575479745864868 + }, + { + "auxiliary_loss_clip": 0.06456805, + "auxiliary_loss_mlp": 0.01274415, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01258632, + "epoch": 0.3463099353675034, + "flos": 24177711888000.0, + "grad_norm": 1.8760422141660649, + "language_loss": 0.83568478, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.91299695, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.15771484, + "step": 5760, + "time_per_iteration": 2.5610272884368896 + }, + { + "auxiliary_loss_clip": 0.06358143, + "auxiliary_loss_mlp": 0.01257449, + "balance_loss_clip": 0.06276596, + "balance_loss_mlp": 0.0125125, + "epoch": 0.34637005862017134, + "flos": 63716773893120.0, + "grad_norm": 0.8043642187655193, + "language_loss": 0.56576806, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.64192402, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.06195068, + "step": 5761, + "time_per_iteration": 3.2343695163726807 + }, + { + "auxiliary_loss_clip": 0.06453449, + "auxiliary_loss_mlp": 0.01270941, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01256457, + "epoch": 0.3464301818728393, + "flos": 13149513089280.0, + "grad_norm": 1.936786863895872, + "language_loss": 0.9549523, + "learning_rate": 3.038422700166474e-06, + "loss": 1.03219616, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14477539, + "step": 5762, + "time_per_iteration": 2.496039390563965 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01276759, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01260928, + "epoch": 0.34649030512550727, + "flos": 29322936650880.0, + "grad_norm": 1.870020160295256, + "language_loss": 0.69913763, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77657849, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.15820312, + "step": 5763, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.06466965, + "auxiliary_loss_mlp": 0.01270957, + "balance_loss_clip": 0.06278971, + "balance_loss_mlp": 0.01253922, + "epoch": 0.34655042837817523, + "flos": 23737385082240.0, + "grad_norm": 1.7922805842181977, + "language_loss": 0.83863467, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.9160139, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17028809, + "step": 5764, + "time_per_iteration": 2.634692668914795 + }, + { + "auxiliary_loss_clip": 0.06459094, + "auxiliary_loss_mlp": 0.01274486, + "balance_loss_clip": 0.06279744, + "balance_loss_mlp": 0.01259263, + "epoch": 0.34661055163084326, + "flos": 22060716082560.0, + "grad_norm": 2.9007104109569943, + "language_loss": 0.67647815, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.75381392, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15234375, + "step": 5765, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.06460512, + "auxiliary_loss_mlp": 0.0127692, + "balance_loss_clip": 0.06280708, + "balance_loss_mlp": 0.01262233, + "epoch": 0.3466706748835112, + "flos": 21805738508160.0, + "grad_norm": 3.5961884004183426, + "language_loss": 0.77947313, + "learning_rate": 3.03709097800413e-06, + "loss": 0.85684741, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.14709473, + "step": 5766, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06460432, + "auxiliary_loss_mlp": 0.01274096, + "balance_loss_clip": 0.06278767, + "balance_loss_mlp": 0.01260614, + "epoch": 0.3467307981361792, + "flos": 19467405342720.0, + "grad_norm": 1.5497773141022704, + "language_loss": 0.73886019, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.8162055, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.13500977, + "step": 5767, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.06461183, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06279645, + "balance_loss_mlp": 0.01260107, + "epoch": 0.34679092138884715, + "flos": 24834470590080.0, + "grad_norm": 2.0350854996297696, + "language_loss": 0.78955162, + "learning_rate": 3.036424880912893e-06, + "loss": 0.86692369, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.15917969, + "step": 5768, + "time_per_iteration": 2.5747995376586914 + }, + { + "auxiliary_loss_clip": 0.06369781, + "auxiliary_loss_mlp": 0.01257254, + "balance_loss_clip": 0.06288455, + "balance_loss_mlp": 0.01251723, + "epoch": 0.3468510446415151, + "flos": 63253791757440.0, + "grad_norm": 0.7431238132649503, + "language_loss": 0.57319033, + "learning_rate": 3.036091773408956e-06, + "loss": 0.64946061, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.05535889, + "step": 5769, + "time_per_iteration": 3.176074981689453 + }, + { + "auxiliary_loss_clip": 0.06479758, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 0.06285711, + "balance_loss_mlp": 0.01260212, + "epoch": 0.3469111678941831, + "flos": 12123984568320.0, + "grad_norm": 2.4016361546378158, + "language_loss": 0.85419703, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.93176699, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17028809, + "step": 5770, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.06372644, + "auxiliary_loss_mlp": 0.01258777, + "balance_loss_clip": 0.0629043, + "balance_loss_mlp": 0.01253087, + "epoch": 0.34697129114685105, + "flos": 65951964282240.0, + "grad_norm": 0.7493725348793998, + "language_loss": 0.59862447, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.67493868, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.05685425, + "step": 5771, + "time_per_iteration": 2.938957691192627 + }, + { + "auxiliary_loss_clip": 0.0646434, + "auxiliary_loss_mlp": 0.012787, + "balance_loss_clip": 0.06282143, + "balance_loss_mlp": 0.01263572, + "epoch": 0.347031414399519, + "flos": 34461914284800.0, + "grad_norm": 1.9396999801577832, + "language_loss": 0.72527683, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80270731, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15136719, + "step": 5772, + "time_per_iteration": 2.6529078483581543 + }, + { + "auxiliary_loss_clip": 0.06462972, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06281382, + "balance_loss_mlp": 0.01256246, + "epoch": 0.347091537652187, + "flos": 26951592176640.0, + "grad_norm": 1.5709710398058576, + "language_loss": 0.76695967, + "learning_rate": 3.034758950632507e-06, + "loss": 0.84431112, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.15924072, + "step": 5773, + "time_per_iteration": 2.5785317420959473 + }, + { + "auxiliary_loss_clip": 0.06466497, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.06280655, + "balance_loss_mlp": 0.01255366, + "epoch": 0.34715166090485494, + "flos": 21148602462720.0, + "grad_norm": 2.4326309651076463, + "language_loss": 0.70796078, + "learning_rate": 3.034425646811396e-06, + "loss": 0.78533834, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.15893555, + "step": 5774, + "time_per_iteration": 2.5585873126983643 + }, + { + "auxiliary_loss_clip": 0.06458526, + "auxiliary_loss_mlp": 0.01271942, + "balance_loss_clip": 0.06278332, + "balance_loss_mlp": 0.01256707, + "epoch": 0.3472117841575229, + "flos": 23484881203200.0, + "grad_norm": 2.2084812675777474, + "language_loss": 0.76485682, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.84216148, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.15228271, + "step": 5775, + "time_per_iteration": 2.5899477005004883 + }, + { + "auxiliary_loss_clip": 0.06472419, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281743, + "balance_loss_mlp": 0.01255163, + "epoch": 0.34727190741019087, + "flos": 17498428974720.0, + "grad_norm": 2.2070819655775282, + "language_loss": 0.7869916, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.86442757, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.16009521, + "step": 5776, + "time_per_iteration": 2.5874037742614746 + }, + { + "auxiliary_loss_clip": 0.0636313, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06280468, + "balance_loss_mlp": 0.0125983, + "epoch": 0.34733203066285884, + "flos": 65287350495360.0, + "grad_norm": 0.8333293277096808, + "language_loss": 0.63448966, + "learning_rate": 3.033425500045478e-06, + "loss": 0.710774, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.05477905, + "step": 5777, + "time_per_iteration": 3.168325185775757 + }, + { + "auxiliary_loss_clip": 0.0646584, + "auxiliary_loss_mlp": 0.01270867, + "balance_loss_clip": 0.06279471, + "balance_loss_mlp": 0.01255048, + "epoch": 0.3473921539155268, + "flos": 28666429511040.0, + "grad_norm": 3.258496862714712, + "language_loss": 0.65075529, + "learning_rate": 3.033092039398119e-06, + "loss": 0.72812235, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15808105, + "step": 5778, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.06467149, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 0.06278305, + "balance_loss_mlp": 0.0125633, + "epoch": 0.3474522771681948, + "flos": 40845284104320.0, + "grad_norm": 1.7195764072446118, + "language_loss": 0.722601, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.79998595, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.15008545, + "step": 5779, + "time_per_iteration": 2.6901330947875977 + }, + { + "auxiliary_loss_clip": 0.06474127, + "auxiliary_loss_mlp": 0.01275722, + "balance_loss_clip": 0.06282836, + "balance_loss_mlp": 0.01259092, + "epoch": 0.3475124004208628, + "flos": 24615564998400.0, + "grad_norm": 2.601451729132101, + "language_loss": 0.62399209, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.70149052, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.1663208, + "step": 5780, + "time_per_iteration": 2.5493476390838623 + }, + { + "auxiliary_loss_clip": 0.0647147, + "auxiliary_loss_mlp": 0.01271785, + "balance_loss_clip": 0.06285025, + "balance_loss_mlp": 0.01256264, + "epoch": 0.34757252367353075, + "flos": 22717977909120.0, + "grad_norm": 3.4183593986527043, + "language_loss": 0.72164977, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.79908228, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.15515137, + "step": 5781, + "time_per_iteration": 2.610198974609375 + }, + { + "auxiliary_loss_clip": 0.06471756, + "auxiliary_loss_mlp": 0.01273476, + "balance_loss_clip": 0.06282213, + "balance_loss_mlp": 0.01257228, + "epoch": 0.3476326469261987, + "flos": 19834246517760.0, + "grad_norm": 2.4264406265191325, + "language_loss": 0.77686667, + "learning_rate": 3.031757805185612e-06, + "loss": 0.85431898, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.16235352, + "step": 5782, + "time_per_iteration": 3.918602705001831 + }, + { + "auxiliary_loss_clip": 0.06470296, + "auxiliary_loss_mlp": 0.01277549, + "balance_loss_clip": 0.0628626, + "balance_loss_mlp": 0.01262695, + "epoch": 0.3476927701788667, + "flos": 19944265328640.0, + "grad_norm": 2.639685157679876, + "language_loss": 0.63410383, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7115823, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14855957, + "step": 5783, + "time_per_iteration": 4.021190881729126 + }, + { + "auxiliary_loss_clip": 0.06469369, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 0.06290524, + "balance_loss_mlp": 0.01264121, + "epoch": 0.34775289343153465, + "flos": 20740448424960.0, + "grad_norm": 1.686879732071426, + "language_loss": 0.89054763, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9680202, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 5784, + "time_per_iteration": 2.553847074508667 + }, + { + "auxiliary_loss_clip": 0.06470798, + "auxiliary_loss_mlp": 0.01275566, + "balance_loss_clip": 0.06289466, + "balance_loss_mlp": 0.01260903, + "epoch": 0.3478130166842026, + "flos": 19360992257280.0, + "grad_norm": 1.643062521609265, + "language_loss": 0.82068878, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89815247, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.14672852, + "step": 5785, + "time_per_iteration": 2.5452024936676025 + }, + { + "auxiliary_loss_clip": 0.06472684, + "auxiliary_loss_mlp": 0.01281071, + "balance_loss_clip": 0.06290045, + "balance_loss_mlp": 0.01267004, + "epoch": 0.3478731399368706, + "flos": 22057194211200.0, + "grad_norm": 1.6654216237849466, + "language_loss": 0.80731958, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.88485718, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.14074707, + "step": 5786, + "time_per_iteration": 4.040801286697388 + }, + { + "auxiliary_loss_clip": 0.06468868, + "auxiliary_loss_mlp": 0.01275893, + "balance_loss_clip": 0.06289011, + "balance_loss_mlp": 0.01260515, + "epoch": 0.34793326318953854, + "flos": 18047390999040.0, + "grad_norm": 1.5833193798509506, + "language_loss": 0.75743961, + "learning_rate": 3.030089132216836e-06, + "loss": 0.83488721, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15368652, + "step": 5787, + "time_per_iteration": 2.5231845378875732 + }, + { + "auxiliary_loss_clip": 0.06470607, + "auxiliary_loss_mlp": 0.01273428, + "balance_loss_clip": 0.06287535, + "balance_loss_mlp": 0.01259111, + "epoch": 0.3479933864422065, + "flos": 29322349672320.0, + "grad_norm": 1.5447805606313796, + "language_loss": 0.81661141, + "learning_rate": 3.029755280389203e-06, + "loss": 0.89405167, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.14312744, + "step": 5788, + "time_per_iteration": 2.5828304290771484 + }, + { + "auxiliary_loss_clip": 0.064804, + "auxiliary_loss_mlp": 0.01277805, + "balance_loss_clip": 0.06290662, + "balance_loss_mlp": 0.01261831, + "epoch": 0.3480535096948745, + "flos": 20126931229440.0, + "grad_norm": 1.9688082680528027, + "language_loss": 0.85984367, + "learning_rate": 3.029421389513147e-06, + "loss": 0.93742573, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.15979004, + "step": 5789, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.06479242, + "auxiliary_loss_mlp": 0.0127695, + "balance_loss_clip": 0.06292568, + "balance_loss_mlp": 0.0126178, + "epoch": 0.34811363294754244, + "flos": 18554453182080.0, + "grad_norm": 1.6869236803506542, + "language_loss": 0.84773821, + "learning_rate": 3.029087459601328e-06, + "loss": 0.92530012, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15185547, + "step": 5790, + "time_per_iteration": 3.942929983139038 + }, + { + "auxiliary_loss_clip": 0.06469919, + "auxiliary_loss_mlp": 0.01274378, + "balance_loss_clip": 0.0628828, + "balance_loss_mlp": 0.01259465, + "epoch": 0.3481737562002104, + "flos": 26877603421440.0, + "grad_norm": 1.9257745343225423, + "language_loss": 0.81410027, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.89154327, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14904785, + "step": 5791, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.06478444, + "auxiliary_loss_mlp": 0.01278573, + "balance_loss_clip": 0.0629065, + "balance_loss_mlp": 0.01263356, + "epoch": 0.3482338794528784, + "flos": 28915495372800.0, + "grad_norm": 1.656722788090249, + "language_loss": 0.78119808, + "learning_rate": 3.028419482721056e-06, + "loss": 0.85876822, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.15216064, + "step": 5792, + "time_per_iteration": 2.5784294605255127 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06287935, + "balance_loss_mlp": 0.01255989, + "epoch": 0.3482940027055464, + "flos": 22207393854720.0, + "grad_norm": 1.5928062225109956, + "language_loss": 0.82187879, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89930081, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.14575195, + "step": 5793, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.06472721, + "auxiliary_loss_mlp": 0.01275633, + "balance_loss_clip": 0.06286202, + "balance_loss_mlp": 0.01259438, + "epoch": 0.34835412595821436, + "flos": 20308884370560.0, + "grad_norm": 1.8552979095996294, + "language_loss": 0.7616328, + "learning_rate": 3.027751349849706e-06, + "loss": 0.83911633, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.1618042, + "step": 5794, + "time_per_iteration": 2.548841953277588 + }, + { + "auxiliary_loss_clip": 0.06468202, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 0.06286102, + "balance_loss_mlp": 0.01262271, + "epoch": 0.3484142492108823, + "flos": 20456065267200.0, + "grad_norm": 2.5979910850639336, + "language_loss": 0.57406038, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65151387, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.14868164, + "step": 5795, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.06465806, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 0.06285395, + "balance_loss_mlp": 0.01257469, + "epoch": 0.3484743724635503, + "flos": 24359832737280.0, + "grad_norm": 1.8988060542741243, + "language_loss": 0.83093596, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90830439, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.13580322, + "step": 5796, + "time_per_iteration": 2.5901992321014404 + }, + { + "auxiliary_loss_clip": 0.06459932, + "auxiliary_loss_mlp": 0.01272067, + "balance_loss_clip": 0.06285086, + "balance_loss_mlp": 0.01258692, + "epoch": 0.34853449571621825, + "flos": 24359916591360.0, + "grad_norm": 1.6441838604480552, + "language_loss": 0.83544898, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.91276896, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13378906, + "step": 5797, + "time_per_iteration": 2.5595455169677734 + }, + { + "auxiliary_loss_clip": 0.06466283, + "auxiliary_loss_mlp": 0.01269705, + "balance_loss_clip": 0.06287575, + "balance_loss_mlp": 0.01256055, + "epoch": 0.3485946189688862, + "flos": 27274395231360.0, + "grad_norm": 1.5517160717894904, + "language_loss": 0.73727238, + "learning_rate": 3.026414616539167e-06, + "loss": 0.81463224, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.13647461, + "step": 5798, + "time_per_iteration": 2.716830015182495 + }, + { + "auxiliary_loss_clip": 0.06466942, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.0125618, + "epoch": 0.3486547422215542, + "flos": 20162835504000.0, + "grad_norm": 1.8098383323780278, + "language_loss": 0.76806593, + "learning_rate": 3.026080335875485e-06, + "loss": 0.84544736, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.15014648, + "step": 5799, + "time_per_iteration": 2.550356149673462 + }, + { + "auxiliary_loss_clip": 0.06464861, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06284796, + "balance_loss_mlp": 0.01253735, + "epoch": 0.34871486547422215, + "flos": 20236614624000.0, + "grad_norm": 2.6888551620055363, + "language_loss": 0.75880742, + "learning_rate": 3.025746016302734e-06, + "loss": 0.83612871, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 5800, + "time_per_iteration": 2.559406042098999 + }, + { + "auxiliary_loss_clip": 0.06468332, + "auxiliary_loss_mlp": 0.01272895, + "balance_loss_clip": 0.06284243, + "balance_loss_mlp": 0.01258375, + "epoch": 0.3487749887268901, + "flos": 44063096924160.0, + "grad_norm": 1.6752863637060063, + "language_loss": 0.67620414, + "learning_rate": 3.025411657833591e-06, + "loss": 0.75361645, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.14538574, + "step": 5801, + "time_per_iteration": 2.7286293506622314 + }, + { + "auxiliary_loss_clip": 0.064619, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01253406, + "epoch": 0.3488351119795581, + "flos": 23301921813120.0, + "grad_norm": 1.7427843167651098, + "language_loss": 0.76900619, + "learning_rate": 3.025077260480735e-06, + "loss": 0.84630978, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15075684, + "step": 5802, + "time_per_iteration": 2.5632455348968506 + }, + { + "auxiliary_loss_clip": 0.0645422, + "auxiliary_loss_mlp": 0.01273067, + "balance_loss_clip": 0.06281535, + "balance_loss_mlp": 0.01260109, + "epoch": 0.34889523523222604, + "flos": 19940449968000.0, + "grad_norm": 1.7168444943641856, + "language_loss": 0.79347479, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87074769, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12957764, + "step": 5803, + "time_per_iteration": 2.5202274322509766 + }, + { + "auxiliary_loss_clip": 0.06462935, + "auxiliary_loss_mlp": 0.01269017, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01255212, + "epoch": 0.348955358484894, + "flos": 30454123570560.0, + "grad_norm": 2.672940484210586, + "language_loss": 0.67680007, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.75411958, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.13812256, + "step": 5804, + "time_per_iteration": 2.636371374130249 + }, + { + "auxiliary_loss_clip": 0.06455779, + "auxiliary_loss_mlp": 0.01267233, + "balance_loss_clip": 0.06282568, + "balance_loss_mlp": 0.01253989, + "epoch": 0.349015481737562, + "flos": 18005071887360.0, + "grad_norm": 1.776416664420285, + "language_loss": 0.76608741, + "learning_rate": 3.024073835246702e-06, + "loss": 0.84331751, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 5805, + "time_per_iteration": 2.4746642112731934 + }, + { + "auxiliary_loss_clip": 0.06461459, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06281143, + "balance_loss_mlp": 0.0125568, + "epoch": 0.34907560499023, + "flos": 27205815064320.0, + "grad_norm": 2.094620432718779, + "language_loss": 0.67626035, + "learning_rate": 3.023739282485814e-06, + "loss": 0.7535736, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14178467, + "step": 5806, + "time_per_iteration": 2.6109619140625 + }, + { + "auxiliary_loss_clip": 0.06461781, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254596, + "epoch": 0.34913572824289796, + "flos": 30234714854400.0, + "grad_norm": 1.7462714312606824, + "language_loss": 0.71972066, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7970227, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1383667, + "step": 5807, + "time_per_iteration": 2.6023621559143066 + }, + { + "auxiliary_loss_clip": 0.06464535, + "auxiliary_loss_mlp": 0.01272433, + "balance_loss_clip": 0.06279333, + "balance_loss_mlp": 0.01257425, + "epoch": 0.3491958514955659, + "flos": 29979779207040.0, + "grad_norm": 2.0002365662223727, + "language_loss": 0.74799109, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15002441, + "step": 5808, + "time_per_iteration": 2.661327362060547 + }, + { + "auxiliary_loss_clip": 0.0645329, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06278954, + "balance_loss_mlp": 0.0125828, + "epoch": 0.3492559747482339, + "flos": 22789786458240.0, + "grad_norm": 1.539446612060682, + "language_loss": 0.84555626, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.92281115, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.13922119, + "step": 5809, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.06454454, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.0125755, + "epoch": 0.34931609800090185, + "flos": 26075257050240.0, + "grad_norm": 1.9706347482771516, + "language_loss": 0.80724359, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.88449275, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.12921143, + "step": 5810, + "time_per_iteration": 2.583709955215454 + }, + { + "auxiliary_loss_clip": 0.06460047, + "auxiliary_loss_mlp": 0.01274437, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01260477, + "epoch": 0.3493762212535698, + "flos": 29249744509440.0, + "grad_norm": 1.580057936247994, + "language_loss": 0.75975537, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.83710015, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.1395874, + "step": 5811, + "time_per_iteration": 2.6304807662963867 + }, + { + "auxiliary_loss_clip": 0.06459605, + "auxiliary_loss_mlp": 0.01268711, + "balance_loss_clip": 0.06280548, + "balance_loss_mlp": 0.01254746, + "epoch": 0.3494363445062378, + "flos": 27133461463680.0, + "grad_norm": 1.6291603050336358, + "language_loss": 0.80527401, + "learning_rate": 3.021731151138386e-06, + "loss": 0.88255721, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.1395874, + "step": 5812, + "time_per_iteration": 2.657989025115967 + }, + { + "auxiliary_loss_clip": 0.06462281, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.0628228, + "balance_loss_mlp": 0.01257179, + "epoch": 0.34949646775890575, + "flos": 12281102173440.0, + "grad_norm": 2.0118644405033463, + "language_loss": 0.701132, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7784636, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.137146, + "step": 5813, + "time_per_iteration": 2.47231388092041 + }, + { + "auxiliary_loss_clip": 0.06457584, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06281666, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3495565910115737, + "flos": 17171265507840.0, + "grad_norm": 1.9224367307793844, + "language_loss": 0.76310062, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.8403852, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.13482666, + "step": 5814, + "time_per_iteration": 2.4967095851898193 + }, + { + "auxiliary_loss_clip": 0.06471042, + "auxiliary_loss_mlp": 0.01271787, + "balance_loss_clip": 0.06288652, + "balance_loss_mlp": 0.01257076, + "epoch": 0.3496167142642417, + "flos": 26472342349440.0, + "grad_norm": 1.8186936331307002, + "language_loss": 0.85099685, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92842519, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.1472168, + "step": 5815, + "time_per_iteration": 2.597399950027466 + }, + { + "auxiliary_loss_clip": 0.06466906, + "auxiliary_loss_mlp": 0.01275707, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01261712, + "epoch": 0.34967683751690964, + "flos": 17419618609920.0, + "grad_norm": 2.3640337842934565, + "language_loss": 0.78006089, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85748702, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 5816, + "time_per_iteration": 2.5164036750793457 + }, + { + "auxiliary_loss_clip": 0.0646984, + "auxiliary_loss_mlp": 0.01273456, + "balance_loss_clip": 0.06286636, + "balance_loss_mlp": 0.01258692, + "epoch": 0.3497369607695776, + "flos": 22606365870720.0, + "grad_norm": 1.8515414586733512, + "language_loss": 0.59787703, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6753099, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.14764404, + "step": 5817, + "time_per_iteration": 2.542877674102783 + }, + { + "auxiliary_loss_clip": 0.06358884, + "auxiliary_loss_mlp": 0.01261904, + "balance_loss_clip": 0.06277611, + "balance_loss_mlp": 0.01257669, + "epoch": 0.34979708402224563, + "flos": 68548461477120.0, + "grad_norm": 0.858700346008579, + "language_loss": 0.59824663, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.67445457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04238892, + "step": 5818, + "time_per_iteration": 3.1992976665496826 + }, + { + "auxiliary_loss_clip": 0.06459703, + "auxiliary_loss_mlp": 0.01271152, + "balance_loss_clip": 0.06283519, + "balance_loss_mlp": 0.01257109, + "epoch": 0.3498572072749136, + "flos": 18995660455680.0, + "grad_norm": 1.926998914600137, + "language_loss": 0.83806789, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91537642, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14031982, + "step": 5819, + "time_per_iteration": 2.5241613388061523 + }, + { + "auxiliary_loss_clip": 0.06466879, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.0125493, + "epoch": 0.34991733052758156, + "flos": 27826334075520.0, + "grad_norm": 2.092302610514248, + "language_loss": 0.71273863, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.79009914, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14245605, + "step": 5820, + "time_per_iteration": 2.569838762283325 + }, + { + "auxiliary_loss_clip": 0.06470378, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 0.06288413, + "balance_loss_mlp": 0.01256292, + "epoch": 0.3499774537802495, + "flos": 33592706755200.0, + "grad_norm": 2.4345068466865083, + "language_loss": 0.70581877, + "learning_rate": 3.018716339744759e-06, + "loss": 0.78322828, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.14294434, + "step": 5821, + "time_per_iteration": 2.6535534858703613 + }, + { + "auxiliary_loss_clip": 0.06479154, + "auxiliary_loss_mlp": 0.0127118, + "balance_loss_clip": 0.06291604, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3500375770329175, + "flos": 23483413756800.0, + "grad_norm": 1.9533795991074365, + "language_loss": 0.74227631, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.81977963, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16650391, + "step": 5822, + "time_per_iteration": 5.406672716140747 + }, + { + "auxiliary_loss_clip": 0.06470097, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 0.06285684, + "balance_loss_mlp": 0.01257588, + "epoch": 0.35009770028558546, + "flos": 19032067854720.0, + "grad_norm": 2.646032233627204, + "language_loss": 0.7905609, + "learning_rate": 3.018045956403094e-06, + "loss": 0.86799276, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15496826, + "step": 5823, + "time_per_iteration": 2.5048515796661377 + }, + { + "auxiliary_loss_clip": 0.06353101, + "auxiliary_loss_mlp": 0.01254576, + "balance_loss_clip": 0.06271273, + "balance_loss_mlp": 0.01249748, + "epoch": 0.3501578235382534, + "flos": 68371749216000.0, + "grad_norm": 0.6915411290730273, + "language_loss": 0.58945203, + "learning_rate": 3.017710706819298e-06, + "loss": 0.66552877, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.04821777, + "step": 5824, + "time_per_iteration": 3.209726333618164 + }, + { + "auxiliary_loss_clip": 0.06465952, + "auxiliary_loss_mlp": 0.01274281, + "balance_loss_clip": 0.06284555, + "balance_loss_mlp": 0.01258045, + "epoch": 0.3502179467909214, + "flos": 21257153827200.0, + "grad_norm": 3.0621504018438164, + "language_loss": 0.85168576, + "learning_rate": 3.017375418643811e-06, + "loss": 0.92908812, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16223145, + "step": 5825, + "time_per_iteration": 2.513498067855835 + }, + { + "auxiliary_loss_clip": 0.06462917, + "auxiliary_loss_mlp": 0.01268842, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01254275, + "epoch": 0.35027807004358935, + "flos": 11946978817920.0, + "grad_norm": 2.498923152973308, + "language_loss": 0.83643848, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.91375613, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14556885, + "step": 5826, + "time_per_iteration": 3.9313511848449707 + }, + { + "auxiliary_loss_clip": 0.06470059, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284411, + "balance_loss_mlp": 0.01254956, + "epoch": 0.3503381932962573, + "flos": 21477401084160.0, + "grad_norm": 2.100708343809493, + "language_loss": 0.81216669, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88958883, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.17211914, + "step": 5827, + "time_per_iteration": 2.556704044342041 + }, + { + "auxiliary_loss_clip": 0.06462219, + "auxiliary_loss_mlp": 0.01272255, + "balance_loss_clip": 0.06283772, + "balance_loss_mlp": 0.01257473, + "epoch": 0.3503983165489253, + "flos": 21257405389440.0, + "grad_norm": 2.0166313071454858, + "language_loss": 0.71145403, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.78879881, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.14794922, + "step": 5828, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.06467165, + "auxiliary_loss_mlp": 0.01274622, + "balance_loss_clip": 0.06285597, + "balance_loss_mlp": 0.01257539, + "epoch": 0.35045843980159325, + "flos": 27822644496000.0, + "grad_norm": 1.678964319221545, + "language_loss": 0.79897165, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8763895, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.17077637, + "step": 5829, + "time_per_iteration": 4.086450099945068 + }, + { + "auxiliary_loss_clip": 0.06475446, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 0.06286699, + "balance_loss_mlp": 0.01257988, + "epoch": 0.3505185630542612, + "flos": 25928201934720.0, + "grad_norm": 1.7428196933402165, + "language_loss": 0.72440839, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.80191517, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.17248535, + "step": 5830, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.06461293, + "auxiliary_loss_mlp": 0.01273843, + "balance_loss_clip": 0.06283247, + "balance_loss_mlp": 0.01259633, + "epoch": 0.35057868630692923, + "flos": 20527999597440.0, + "grad_norm": 2.5118715805025884, + "language_loss": 0.88613749, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.96348894, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14221191, + "step": 5831, + "time_per_iteration": 2.577260732650757 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 0.06284641, + "balance_loss_mlp": 0.01258672, + "epoch": 0.3506388095595972, + "flos": 20454849383040.0, + "grad_norm": 2.013142681723478, + "language_loss": 0.78719735, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.86459637, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14868164, + "step": 5832, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.06470136, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06284127, + "balance_loss_mlp": 0.01258536, + "epoch": 0.35069893281226516, + "flos": 23115901749120.0, + "grad_norm": 3.869403317005625, + "language_loss": 0.71628016, + "learning_rate": 3.014691725465008e-06, + "loss": 0.79373109, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.1640625, + "step": 5833, + "time_per_iteration": 2.559213161468506 + }, + { + "auxiliary_loss_clip": 0.06462866, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.01256291, + "epoch": 0.35075905606493313, + "flos": 27279426476160.0, + "grad_norm": 2.081089463640026, + "language_loss": 0.80963689, + "learning_rate": 3.014356090536606e-06, + "loss": 0.88697743, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14892578, + "step": 5834, + "time_per_iteration": 2.6462955474853516 + }, + { + "auxiliary_loss_clip": 0.06469317, + "auxiliary_loss_mlp": 0.0127505, + "balance_loss_clip": 0.06288308, + "balance_loss_mlp": 0.01258634, + "epoch": 0.3508191793176011, + "flos": 19133491622400.0, + "grad_norm": 2.5340357013843566, + "language_loss": 0.84608614, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.92352986, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.1640625, + "step": 5835, + "time_per_iteration": 2.5068061351776123 + }, + { + "auxiliary_loss_clip": 0.06463549, + "auxiliary_loss_mlp": 0.01274357, + "balance_loss_clip": 0.0628426, + "balance_loss_mlp": 0.01259122, + "epoch": 0.35087930257026906, + "flos": 25564798776960.0, + "grad_norm": 1.6798272602016127, + "language_loss": 0.77162683, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.84900588, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15234375, + "step": 5836, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.06462973, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 0.06285001, + "balance_loss_mlp": 0.01268767, + "epoch": 0.350939425822937, + "flos": 18010061205120.0, + "grad_norm": 1.7914903677000888, + "language_loss": 0.7777887, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.85525942, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15325928, + "step": 5837, + "time_per_iteration": 2.4906866550445557 + }, + { + "auxiliary_loss_clip": 0.06464779, + "auxiliary_loss_mlp": 0.0127724, + "balance_loss_clip": 0.0628402, + "balance_loss_mlp": 0.01261575, + "epoch": 0.350999549075605, + "flos": 22279747455360.0, + "grad_norm": 2.3774474075228995, + "language_loss": 0.68712002, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7645402, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.15649414, + "step": 5838, + "time_per_iteration": 2.616330862045288 + }, + { + "auxiliary_loss_clip": 0.06463079, + "auxiliary_loss_mlp": 0.01275242, + "balance_loss_clip": 0.0628327, + "balance_loss_mlp": 0.01259554, + "epoch": 0.35105967232827295, + "flos": 14397511000320.0, + "grad_norm": 2.135026117356547, + "language_loss": 0.83941519, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.91679841, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.15686035, + "step": 5839, + "time_per_iteration": 2.475210428237915 + }, + { + "auxiliary_loss_clip": 0.06472797, + "auxiliary_loss_mlp": 0.01274732, + "balance_loss_clip": 0.06285894, + "balance_loss_mlp": 0.01258376, + "epoch": 0.3511197955809409, + "flos": 25089322383360.0, + "grad_norm": 2.313381638226651, + "language_loss": 0.58970249, + "learning_rate": 3.012341473657572e-06, + "loss": 0.6671778, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.5654497146606445 + }, + { + "auxiliary_loss_clip": 0.06465258, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 0.06280696, + "balance_loss_mlp": 0.0126174, + "epoch": 0.3511799188336089, + "flos": 25891123703040.0, + "grad_norm": 2.5798747861510254, + "language_loss": 0.87567091, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.9531014, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.16040039, + "step": 5841, + "time_per_iteration": 2.5275204181671143 + }, + { + "auxiliary_loss_clip": 0.06473795, + "auxiliary_loss_mlp": 0.01278097, + "balance_loss_clip": 0.0628502, + "balance_loss_mlp": 0.01261038, + "epoch": 0.35124004208627685, + "flos": 20089852997760.0, + "grad_norm": 1.7442007932185601, + "language_loss": 0.7546367, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.83215564, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.17077637, + "step": 5842, + "time_per_iteration": 2.5876784324645996 + }, + { + "auxiliary_loss_clip": 0.06465417, + "auxiliary_loss_mlp": 0.01280375, + "balance_loss_clip": 0.06280544, + "balance_loss_mlp": 0.01265105, + "epoch": 0.3513001653389448, + "flos": 17788891553280.0, + "grad_norm": 2.704982383226077, + "language_loss": 0.68951106, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.76696897, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.15258789, + "step": 5843, + "time_per_iteration": 2.4710304737091064 + }, + { + "auxiliary_loss_clip": 0.06466319, + "auxiliary_loss_mlp": 0.01285229, + "balance_loss_clip": 0.0628369, + "balance_loss_mlp": 0.01268892, + "epoch": 0.3513602885916128, + "flos": 29394745200000.0, + "grad_norm": 2.1140022916881525, + "language_loss": 0.66181982, + "learning_rate": 3.010997627806655e-06, + "loss": 0.7393353, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.16326904, + "step": 5844, + "time_per_iteration": 2.585793972015381 + }, + { + "auxiliary_loss_clip": 0.06472903, + "auxiliary_loss_mlp": 0.01282408, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01265761, + "epoch": 0.3514204118442808, + "flos": 16185372768000.0, + "grad_norm": 2.0590361589883206, + "language_loss": 0.75743866, + "learning_rate": 3.010661570469245e-06, + "loss": 0.83499175, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.1663208, + "step": 5845, + "time_per_iteration": 2.50748348236084 + }, + { + "auxiliary_loss_clip": 0.06463686, + "auxiliary_loss_mlp": 0.01285129, + "balance_loss_clip": 0.06284383, + "balance_loss_mlp": 0.01270102, + "epoch": 0.35148053509694877, + "flos": 23840234369280.0, + "grad_norm": 5.020955850717412, + "language_loss": 0.73988718, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.8173753, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15032959, + "step": 5846, + "time_per_iteration": 2.626898765563965 + }, + { + "auxiliary_loss_clip": 0.06470932, + "auxiliary_loss_mlp": 0.01280544, + "balance_loss_clip": 0.06285631, + "balance_loss_mlp": 0.01265482, + "epoch": 0.35154065834961673, + "flos": 20996809591680.0, + "grad_norm": 1.7410870567887373, + "language_loss": 0.75501883, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.8325336, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1506958, + "step": 5847, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.06472816, + "auxiliary_loss_mlp": 0.01272158, + "balance_loss_clip": 0.06284919, + "balance_loss_mlp": 0.01257316, + "epoch": 0.3516007816022847, + "flos": 33263866206720.0, + "grad_norm": 1.8955744454716683, + "language_loss": 0.72774404, + "learning_rate": 3.009653168561666e-06, + "loss": 0.80519378, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.1484375, + "step": 5848, + "time_per_iteration": 2.6645965576171875 + }, + { + "auxiliary_loss_clip": 0.06467354, + "auxiliary_loss_mlp": 0.01280776, + "balance_loss_clip": 0.06280826, + "balance_loss_mlp": 0.01265124, + "epoch": 0.35166090485495266, + "flos": 11731427389440.0, + "grad_norm": 2.1922530808110983, + "language_loss": 0.90064394, + "learning_rate": 3.009316958003178e-06, + "loss": 0.97812521, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.15655518, + "step": 5849, + "time_per_iteration": 2.4567575454711914 + }, + { + "auxiliary_loss_clip": 0.06464183, + "auxiliary_loss_mlp": 0.01272929, + "balance_loss_clip": 0.06281896, + "balance_loss_mlp": 0.01257461, + "epoch": 0.3517210281076206, + "flos": 22645121184000.0, + "grad_norm": 2.4964624006606946, + "language_loss": 0.75405449, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.83142555, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15472412, + "step": 5850, + "time_per_iteration": 2.5980029106140137 + }, + { + "auxiliary_loss_clip": 0.06463099, + "auxiliary_loss_mlp": 0.01274678, + "balance_loss_clip": 0.06282984, + "balance_loss_mlp": 0.01259842, + "epoch": 0.3517811513602886, + "flos": 21328836595200.0, + "grad_norm": 2.0250770904548303, + "language_loss": 0.76385641, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.14825439, + "step": 5851, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.06463097, + "auxiliary_loss_mlp": 0.01275014, + "balance_loss_clip": 0.06281513, + "balance_loss_mlp": 0.01258933, + "epoch": 0.35184127461295656, + "flos": 21039254484480.0, + "grad_norm": 1.95256002439052, + "language_loss": 0.88133335, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.95871449, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.1607666, + "step": 5852, + "time_per_iteration": 2.571439266204834 + }, + { + "auxiliary_loss_clip": 0.06461711, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01258087, + "epoch": 0.3519013978656245, + "flos": 22461784450560.0, + "grad_norm": 2.1690150127965038, + "language_loss": 0.68480182, + "learning_rate": 3.007971733162737e-06, + "loss": 0.76214981, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.14990234, + "step": 5853, + "time_per_iteration": 2.5121214389801025 + }, + { + "auxiliary_loss_clip": 0.06466305, + "auxiliary_loss_mlp": 0.0127272, + "balance_loss_clip": 0.06282477, + "balance_loss_mlp": 0.01256972, + "epoch": 0.3519615211182925, + "flos": 13120317141120.0, + "grad_norm": 2.1084516189193403, + "language_loss": 0.81284809, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.89023829, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15734863, + "step": 5854, + "time_per_iteration": 2.644672155380249 + }, + { + "auxiliary_loss_clip": 0.06456967, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01253481, + "epoch": 0.35202164437096045, + "flos": 19141122343680.0, + "grad_norm": 1.5283351736697255, + "language_loss": 0.73366165, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.13970947, + "step": 5855, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.06458069, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01256717, + "epoch": 0.3520817676236284, + "flos": 26549475632640.0, + "grad_norm": 1.8023400431296785, + "language_loss": 0.71055883, + "learning_rate": 3.006962413152691e-06, + "loss": 0.78785008, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14337158, + "step": 5856, + "time_per_iteration": 2.5643463134765625 + }, + { + "auxiliary_loss_clip": 0.064651, + "auxiliary_loss_mlp": 0.01271649, + "balance_loss_clip": 0.062787, + "balance_loss_mlp": 0.01255663, + "epoch": 0.3521418908762964, + "flos": 44903653557120.0, + "grad_norm": 1.9243906825553334, + "language_loss": 0.61456323, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.69193071, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.16003418, + "step": 5857, + "time_per_iteration": 2.723026752471924 + }, + { + "auxiliary_loss_clip": 0.06463988, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 0.06281644, + "balance_loss_mlp": 0.01253569, + "epoch": 0.3522020141289644, + "flos": 20192576503680.0, + "grad_norm": 1.9490734994800325, + "language_loss": 0.73682863, + "learning_rate": 3.006289342204152e-06, + "loss": 0.8141619, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.15765381, + "step": 5858, + "time_per_iteration": 2.5245583057403564 + }, + { + "auxiliary_loss_clip": 0.0646653, + "auxiliary_loss_mlp": 0.01270245, + "balance_loss_clip": 0.06283493, + "balance_loss_mlp": 0.01255368, + "epoch": 0.35226213738163237, + "flos": 27571398428160.0, + "grad_norm": 1.5191641480211209, + "language_loss": 0.76385832, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.8412261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.14880371, + "step": 5859, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.06474233, + "auxiliary_loss_mlp": 0.01272168, + "balance_loss_clip": 0.06283402, + "balance_loss_mlp": 0.01256862, + "epoch": 0.35232226063430033, + "flos": 22972955483520.0, + "grad_norm": 2.0210321352313305, + "language_loss": 0.72436023, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.80182427, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.15307617, + "step": 5860, + "time_per_iteration": 2.557419776916504 + }, + { + "auxiliary_loss_clip": 0.06468037, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06280215, + "balance_loss_mlp": 0.01257304, + "epoch": 0.3523823838869683, + "flos": 19173714382080.0, + "grad_norm": 2.1675794505809076, + "language_loss": 0.66646308, + "learning_rate": 3.005279449623811e-06, + "loss": 0.74387354, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.15704346, + "step": 5861, + "time_per_iteration": 5.330287218093872 + }, + { + "auxiliary_loss_clip": 0.06464717, + "auxiliary_loss_mlp": 0.01272322, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01257331, + "epoch": 0.35244250713963626, + "flos": 17936743282560.0, + "grad_norm": 1.8073030876467324, + "language_loss": 0.67339319, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.7507636, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.15002441, + "step": 5862, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.06465253, + "auxiliary_loss_mlp": 0.01277428, + "balance_loss_clip": 0.06279148, + "balance_loss_mlp": 0.01260775, + "epoch": 0.35250263039230423, + "flos": 21438687697920.0, + "grad_norm": 2.06594301339393, + "language_loss": 0.76956195, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.8469888, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16650391, + "step": 5863, + "time_per_iteration": 2.5614800453186035 + }, + { + "auxiliary_loss_clip": 0.06466909, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 0.06283094, + "balance_loss_mlp": 0.01255846, + "epoch": 0.3525627536449722, + "flos": 27424133677440.0, + "grad_norm": 1.7204880099735786, + "language_loss": 0.75455201, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83192563, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.14605713, + "step": 5864, + "time_per_iteration": 2.590428113937378 + }, + { + "auxiliary_loss_clip": 0.06465425, + "auxiliary_loss_mlp": 0.01271849, + "balance_loss_clip": 0.06283714, + "balance_loss_mlp": 0.01257306, + "epoch": 0.35262287689764016, + "flos": 24796637671680.0, + "grad_norm": 2.274548371802061, + "language_loss": 0.79325253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.87062526, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.14550781, + "step": 5865, + "time_per_iteration": 4.090251922607422 + }, + { + "auxiliary_loss_clip": 0.06479216, + "auxiliary_loss_mlp": 0.01273849, + "balance_loss_clip": 0.06290671, + "balance_loss_mlp": 0.01257935, + "epoch": 0.3526830001503081, + "flos": 17827353377280.0, + "grad_norm": 3.6346687905375155, + "language_loss": 0.81561065, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.89314139, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.15917969, + "step": 5866, + "time_per_iteration": 2.5417611598968506 + }, + { + "auxiliary_loss_clip": 0.06481875, + "auxiliary_loss_mlp": 0.01272499, + "balance_loss_clip": 0.06287797, + "balance_loss_mlp": 0.01255226, + "epoch": 0.3527431234029761, + "flos": 18084091887360.0, + "grad_norm": 2.1275369997353692, + "language_loss": 0.84947896, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9270227, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.17272949, + "step": 5867, + "time_per_iteration": 2.487138509750366 + }, + { + "auxiliary_loss_clip": 0.06472977, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 0.06285943, + "balance_loss_mlp": 0.01254431, + "epoch": 0.35280324665564405, + "flos": 19433429712000.0, + "grad_norm": 2.157782607866355, + "language_loss": 0.74828005, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82571352, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.15942383, + "step": 5868, + "time_per_iteration": 2.536522150039673 + }, + { + "auxiliary_loss_clip": 0.06471637, + "auxiliary_loss_mlp": 0.01277122, + "balance_loss_clip": 0.06284134, + "balance_loss_mlp": 0.01260277, + "epoch": 0.352863369908312, + "flos": 21509951195520.0, + "grad_norm": 2.023756469283546, + "language_loss": 0.6153, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.69278765, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.16845703, + "step": 5869, + "time_per_iteration": 3.977250099182129 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01275411, + "balance_loss_clip": 0.06282457, + "balance_loss_mlp": 0.01259985, + "epoch": 0.35292349316098, + "flos": 22316029073280.0, + "grad_norm": 3.8155591266042173, + "language_loss": 0.75253737, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.1541748, + "step": 5870, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.06466261, + "auxiliary_loss_mlp": 0.01271259, + "balance_loss_clip": 0.06282211, + "balance_loss_mlp": 0.01255964, + "epoch": 0.352983616413648, + "flos": 33118152756480.0, + "grad_norm": 1.8217533687724534, + "language_loss": 0.72204906, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79942429, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.1529541, + "step": 5871, + "time_per_iteration": 2.660351037979126 + }, + { + "auxiliary_loss_clip": 0.06463222, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.0628562, + "balance_loss_mlp": 0.01257389, + "epoch": 0.35304373966631597, + "flos": 18702388765440.0, + "grad_norm": 1.8432981727531608, + "language_loss": 0.73899144, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.81633162, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.13415527, + "step": 5872, + "time_per_iteration": 2.501868724822998 + }, + { + "auxiliary_loss_clip": 0.06467956, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255163, + "epoch": 0.35310386291898394, + "flos": 23371214739840.0, + "grad_norm": 1.6596154000518588, + "language_loss": 0.83059716, + "learning_rate": 3.001236451924089e-06, + "loss": 0.90797222, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.14404297, + "step": 5873, + "time_per_iteration": 2.6044130325317383 + }, + { + "auxiliary_loss_clip": 0.06475792, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 0.06285458, + "balance_loss_mlp": 0.0125879, + "epoch": 0.3531639861716519, + "flos": 24468803372160.0, + "grad_norm": 2.6977932070351183, + "language_loss": 0.65726781, + "learning_rate": 3.000899288359104e-06, + "loss": 0.73477674, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.16308594, + "step": 5874, + "time_per_iteration": 2.558915138244629 + }, + { + "auxiliary_loss_clip": 0.06370112, + "auxiliary_loss_mlp": 0.01273024, + "balance_loss_clip": 0.06287491, + "balance_loss_mlp": 0.01268941, + "epoch": 0.35322410942431987, + "flos": 70331040437760.0, + "grad_norm": 0.7490717453474699, + "language_loss": 0.616135, + "learning_rate": 3.000562086839917e-06, + "loss": 0.69256639, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.04083252, + "step": 5875, + "time_per_iteration": 3.1286721229553223 + }, + { + "auxiliary_loss_clip": 0.06475496, + "auxiliary_loss_mlp": 0.01277595, + "balance_loss_clip": 0.06289661, + "balance_loss_mlp": 0.01262086, + "epoch": 0.35328423267698783, + "flos": 19825735328640.0, + "grad_norm": 2.073373185113386, + "language_loss": 0.8042345, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.88176548, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.15509033, + "step": 5876, + "time_per_iteration": 2.5174875259399414 + }, + { + "auxiliary_loss_clip": 0.063563, + "auxiliary_loss_mlp": 0.01261292, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.01257364, + "epoch": 0.3533443559296558, + "flos": 60843398480640.0, + "grad_norm": 0.6578323239794136, + "language_loss": 0.56720114, + "learning_rate": 2.999887569990088e-06, + "loss": 0.64337707, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.03924561, + "step": 5877, + "time_per_iteration": 3.239800214767456 + }, + { + "auxiliary_loss_clip": 0.0647119, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 0.06286252, + "balance_loss_mlp": 0.01259301, + "epoch": 0.35340447918232376, + "flos": 24762997457280.0, + "grad_norm": 1.7728898292153, + "language_loss": 0.72425848, + "learning_rate": 2.999550254685024e-06, + "loss": 0.80172646, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16308594, + "step": 5878, + "time_per_iteration": 2.576354742050171 + }, + { + "auxiliary_loss_clip": 0.06470102, + "auxiliary_loss_mlp": 0.01272441, + "balance_loss_clip": 0.06286008, + "balance_loss_mlp": 0.01256789, + "epoch": 0.3534646024349917, + "flos": 21802342417920.0, + "grad_norm": 2.4353464978664494, + "language_loss": 0.78682542, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.86425084, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.15649414, + "step": 5879, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.06481053, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 0.0628894, + "balance_loss_mlp": 0.01253714, + "epoch": 0.3535247256876597, + "flos": 20018463719040.0, + "grad_norm": 2.0590866059314035, + "language_loss": 0.63551295, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.71304053, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.17980957, + "step": 5880, + "time_per_iteration": 2.5576937198638916 + }, + { + "auxiliary_loss_clip": 0.06481048, + "auxiliary_loss_mlp": 0.01274855, + "balance_loss_clip": 0.06292346, + "balance_loss_mlp": 0.01258035, + "epoch": 0.35358484894032766, + "flos": 18193984917120.0, + "grad_norm": 2.6506562916801273, + "language_loss": 0.66346908, + "learning_rate": 2.998538081402727e-06, + "loss": 0.74102807, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.16821289, + "step": 5881, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06465093, + "auxiliary_loss_mlp": 0.01272514, + "balance_loss_clip": 0.06285467, + "balance_loss_mlp": 0.0125818, + "epoch": 0.3536449721929956, + "flos": 22826990471040.0, + "grad_norm": 1.7415962616346485, + "language_loss": 0.75838578, + "learning_rate": 2.998200614562239e-06, + "loss": 0.8357619, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14337158, + "step": 5882, + "time_per_iteration": 2.546163558959961 + }, + { + "auxiliary_loss_clip": 0.06472618, + "auxiliary_loss_mlp": 0.01271877, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01256189, + "epoch": 0.3537050954456636, + "flos": 26439540675840.0, + "grad_norm": 2.210270342508568, + "language_loss": 0.70790988, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78535485, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.15692139, + "step": 5883, + "time_per_iteration": 2.5813896656036377 + }, + { + "auxiliary_loss_clip": 0.06481725, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 0.0628854, + "balance_loss_mlp": 0.01258105, + "epoch": 0.3537652186983316, + "flos": 17202096610560.0, + "grad_norm": 3.5308447991949348, + "language_loss": 0.7912811, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.86884505, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.16564941, + "step": 5884, + "time_per_iteration": 2.564178228378296 + }, + { + "auxiliary_loss_clip": 0.06469014, + "auxiliary_loss_mlp": 0.01273424, + "balance_loss_clip": 0.06285414, + "balance_loss_mlp": 0.01258142, + "epoch": 0.3538253419509996, + "flos": 19542861544320.0, + "grad_norm": 3.0890260502514173, + "language_loss": 0.76079619, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83822054, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15283203, + "step": 5885, + "time_per_iteration": 2.5860350131988525 + }, + { + "auxiliary_loss_clip": 0.06473316, + "auxiliary_loss_mlp": 0.01274145, + "balance_loss_clip": 0.06285691, + "balance_loss_mlp": 0.01257766, + "epoch": 0.35388546520366754, + "flos": 12133166590080.0, + "grad_norm": 4.983567417880078, + "language_loss": 0.83563066, + "learning_rate": 2.996850368809606e-06, + "loss": 0.91310525, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.16357422, + "step": 5886, + "time_per_iteration": 2.549227714538574 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01274591, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.0125851, + "epoch": 0.3539455884563355, + "flos": 19683501822720.0, + "grad_norm": 3.219387216821374, + "language_loss": 0.78429639, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.86168945, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16088867, + "step": 5887, + "time_per_iteration": 2.523743152618408 + }, + { + "auxiliary_loss_clip": 0.0646676, + "auxiliary_loss_mlp": 0.0127383, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01258631, + "epoch": 0.35400571170900347, + "flos": 18077006217600.0, + "grad_norm": 1.8956957640615841, + "language_loss": 0.66116667, + "learning_rate": 2.996175019078089e-06, + "loss": 0.7385726, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.15197754, + "step": 5888, + "time_per_iteration": 2.5279300212860107 + }, + { + "auxiliary_loss_clip": 0.06467725, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01256185, + "epoch": 0.35406583496167143, + "flos": 26075298977280.0, + "grad_norm": 2.3097601077816443, + "language_loss": 0.76721621, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.84461069, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.15527344, + "step": 5889, + "time_per_iteration": 2.564761161804199 + }, + { + "auxiliary_loss_clip": 0.06465457, + "auxiliary_loss_mlp": 0.01270164, + "balance_loss_clip": 0.06283142, + "balance_loss_mlp": 0.01254357, + "epoch": 0.3541259582143394, + "flos": 19798635732480.0, + "grad_norm": 2.1640548649274116, + "language_loss": 0.81408846, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.89144462, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15808105, + "step": 5890, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.06466024, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06285816, + "balance_loss_mlp": 0.01256094, + "epoch": 0.35418608146700736, + "flos": 24028518493440.0, + "grad_norm": 1.6495661544524922, + "language_loss": 0.80017459, + "learning_rate": 2.99516171119991e-06, + "loss": 0.87753654, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.14074707, + "step": 5891, + "time_per_iteration": 2.553158760070801 + }, + { + "auxiliary_loss_clip": 0.06471643, + "auxiliary_loss_mlp": 0.01282427, + "balance_loss_clip": 0.06289162, + "balance_loss_mlp": 0.01265928, + "epoch": 0.35424620471967533, + "flos": 12390701713920.0, + "grad_norm": 1.7694155250203176, + "language_loss": 0.73450041, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81204116, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.16516113, + "step": 5892, + "time_per_iteration": 2.529136896133423 + }, + { + "auxiliary_loss_clip": 0.06470741, + "auxiliary_loss_mlp": 0.01274401, + "balance_loss_clip": 0.06286078, + "balance_loss_mlp": 0.01259059, + "epoch": 0.3543063279723433, + "flos": 19678219015680.0, + "grad_norm": 3.019670501918518, + "language_loss": 0.67408991, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.75154132, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.15344238, + "step": 5893, + "time_per_iteration": 2.507456064224243 + }, + { + "auxiliary_loss_clip": 0.06469926, + "auxiliary_loss_mlp": 0.01274247, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01257545, + "epoch": 0.35436645122501126, + "flos": 21915841173120.0, + "grad_norm": 1.8801549379271045, + "language_loss": 0.70079887, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.77824062, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.16711426, + "step": 5894, + "time_per_iteration": 2.5596466064453125 + }, + { + "auxiliary_loss_clip": 0.0646911, + "auxiliary_loss_mlp": 0.0127714, + "balance_loss_clip": 0.06291118, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3544265744776792, + "flos": 21724915645440.0, + "grad_norm": 1.8040348457355686, + "language_loss": 0.74516678, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.82262927, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14569092, + "step": 5895, + "time_per_iteration": 2.602884531021118 + }, + { + "auxiliary_loss_clip": 0.06476314, + "auxiliary_loss_mlp": 0.01274747, + "balance_loss_clip": 0.06292941, + "balance_loss_mlp": 0.01259643, + "epoch": 0.3544866977303472, + "flos": 21219278981760.0, + "grad_norm": 1.7647167527567422, + "language_loss": 0.83600783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.91351843, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.15100098, + "step": 5896, + "time_per_iteration": 2.5642035007476807 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01278933, + "balance_loss_clip": 0.06292751, + "balance_loss_mlp": 0.01261576, + "epoch": 0.35454682098301515, + "flos": 29318534311680.0, + "grad_norm": 1.8515152904238923, + "language_loss": 0.70294917, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7804631, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.17346191, + "step": 5897, + "time_per_iteration": 2.613032341003418 + }, + { + "auxiliary_loss_clip": 0.06475735, + "auxiliary_loss_mlp": 0.01274261, + "balance_loss_clip": 0.06293957, + "balance_loss_mlp": 0.01259205, + "epoch": 0.3546069442356832, + "flos": 24323509192320.0, + "grad_norm": 1.6960731630978507, + "language_loss": 0.81964374, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89714372, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.15063477, + "step": 5898, + "time_per_iteration": 2.6033098697662354 + }, + { + "auxiliary_loss_clip": 0.06471986, + "auxiliary_loss_mlp": 0.01279895, + "balance_loss_clip": 0.0629501, + "balance_loss_mlp": 0.01265173, + "epoch": 0.35466706748835114, + "flos": 22863984848640.0, + "grad_norm": 1.4933011631381068, + "language_loss": 0.74405515, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.82157397, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14727783, + "step": 5899, + "time_per_iteration": 2.5492894649505615 + }, + { + "auxiliary_loss_clip": 0.0647797, + "auxiliary_loss_mlp": 0.01272872, + "balance_loss_clip": 0.06294148, + "balance_loss_mlp": 0.01257196, + "epoch": 0.3547271907410191, + "flos": 28337714743680.0, + "grad_norm": 3.4583325446366673, + "language_loss": 0.80211669, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.87962508, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.15698242, + "step": 5900, + "time_per_iteration": 5.435121774673462 + }, + { + "auxiliary_loss_clip": 0.06478105, + "auxiliary_loss_mlp": 0.01279951, + "balance_loss_clip": 0.06296446, + "balance_loss_mlp": 0.01263607, + "epoch": 0.35478731399368707, + "flos": 23520911258880.0, + "grad_norm": 2.0942596894242533, + "language_loss": 0.8216058, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89918637, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.16357422, + "step": 5901, + "time_per_iteration": 2.603769540786743 + }, + { + "auxiliary_loss_clip": 0.06480999, + "auxiliary_loss_mlp": 0.01277169, + "balance_loss_clip": 0.06295676, + "balance_loss_mlp": 0.01261899, + "epoch": 0.35484743724635504, + "flos": 18630202872960.0, + "grad_norm": 2.2545917554681663, + "language_loss": 0.75979805, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.83737969, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.152771, + "step": 5902, + "time_per_iteration": 2.5356359481811523 + }, + { + "auxiliary_loss_clip": 0.06482422, + "auxiliary_loss_mlp": 0.01280542, + "balance_loss_clip": 0.06300852, + "balance_loss_mlp": 0.01265331, + "epoch": 0.354907560499023, + "flos": 17390296880640.0, + "grad_norm": 1.6908684001073404, + "language_loss": 0.70729327, + "learning_rate": 2.991105086850381e-06, + "loss": 0.78492296, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15209961, + "step": 5903, + "time_per_iteration": 2.52494478225708 + }, + { + "auxiliary_loss_clip": 0.06482972, + "auxiliary_loss_mlp": 0.01276075, + "balance_loss_clip": 0.06297173, + "balance_loss_mlp": 0.0125929, + "epoch": 0.35496768375169097, + "flos": 19214607974400.0, + "grad_norm": 2.9744492269587153, + "language_loss": 0.75001359, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.82760406, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.16784668, + "step": 5904, + "time_per_iteration": 2.5316994190216064 + }, + { + "auxiliary_loss_clip": 0.0648163, + "auxiliary_loss_mlp": 0.01277137, + "balance_loss_clip": 0.06297497, + "balance_loss_mlp": 0.01261902, + "epoch": 0.35502780700435893, + "flos": 18338692118400.0, + "grad_norm": 2.2144866791488536, + "language_loss": 0.78981996, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.86740756, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15246582, + "step": 5905, + "time_per_iteration": 3.9867374897003174 + }, + { + "auxiliary_loss_clip": 0.06472038, + "auxiliary_loss_mlp": 0.01276232, + "balance_loss_clip": 0.06301226, + "balance_loss_mlp": 0.01262249, + "epoch": 0.3550879302570269, + "flos": 15453660988800.0, + "grad_norm": 1.8340819850757704, + "language_loss": 0.72531646, + "learning_rate": 2.990090084284356e-06, + "loss": 0.80279917, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13977051, + "step": 5906, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.06491787, + "auxiliary_loss_mlp": 0.01272032, + "balance_loss_clip": 0.06306198, + "balance_loss_mlp": 0.01256046, + "epoch": 0.35514805350969486, + "flos": 21985343735040.0, + "grad_norm": 1.9483914182465616, + "language_loss": 0.75052631, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.82816458, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.15991211, + "step": 5907, + "time_per_iteration": 2.526137113571167 + }, + { + "auxiliary_loss_clip": 0.06486456, + "auxiliary_loss_mlp": 0.01280245, + "balance_loss_clip": 0.06305459, + "balance_loss_mlp": 0.01264271, + "epoch": 0.3552081767623628, + "flos": 29869718469120.0, + "grad_norm": 2.2786495725258424, + "language_loss": 0.76563632, + "learning_rate": 2.989413228164047e-06, + "loss": 0.84330332, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.15966797, + "step": 5908, + "time_per_iteration": 4.063998222351074 + }, + { + "auxiliary_loss_clip": 0.06491728, + "auxiliary_loss_mlp": 0.01276886, + "balance_loss_clip": 0.06310974, + "balance_loss_mlp": 0.0126146, + "epoch": 0.3552683000150308, + "flos": 26439456821760.0, + "grad_norm": 2.352503484530038, + "language_loss": 0.68572766, + "learning_rate": 2.989074743819502e-06, + "loss": 0.76341379, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15429688, + "step": 5909, + "time_per_iteration": 2.6902143955230713 + }, + { + "auxiliary_loss_clip": 0.0648414, + "auxiliary_loss_mlp": 0.01282146, + "balance_loss_clip": 0.06310885, + "balance_loss_mlp": 0.01268061, + "epoch": 0.35532842326769876, + "flos": 19791088865280.0, + "grad_norm": 1.9680680199916993, + "language_loss": 0.79103023, + "learning_rate": 2.988736221969144e-06, + "loss": 0.86869311, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14086914, + "step": 5910, + "time_per_iteration": 2.535050630569458 + }, + { + "auxiliary_loss_clip": 0.06495271, + "auxiliary_loss_mlp": 0.01274944, + "balance_loss_clip": 0.06310071, + "balance_loss_mlp": 0.0125841, + "epoch": 0.3553885465203668, + "flos": 17245170408960.0, + "grad_norm": 1.607302447744311, + "language_loss": 0.7130779, + "learning_rate": 2.98839766262581e-06, + "loss": 0.79078007, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.1652832, + "step": 5911, + "time_per_iteration": 2.572942018508911 + }, + { + "auxiliary_loss_clip": 0.06485709, + "auxiliary_loss_mlp": 0.01272785, + "balance_loss_clip": 0.06309631, + "balance_loss_mlp": 0.01258313, + "epoch": 0.35544866977303474, + "flos": 14938800376320.0, + "grad_norm": 2.1423891041027514, + "language_loss": 0.87973344, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.95731837, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14477539, + "step": 5912, + "time_per_iteration": 2.4826059341430664 + }, + { + "auxiliary_loss_clip": 0.0648666, + "auxiliary_loss_mlp": 0.01278679, + "balance_loss_clip": 0.0630875, + "balance_loss_mlp": 0.0126441, + "epoch": 0.3555087930257027, + "flos": 19762228333440.0, + "grad_norm": 2.0928412919366477, + "language_loss": 0.77506435, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.8527177, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14251709, + "step": 5913, + "time_per_iteration": 2.577362060546875 + }, + { + "auxiliary_loss_clip": 0.06486008, + "auxiliary_loss_mlp": 0.01273445, + "balance_loss_clip": 0.06311025, + "balance_loss_mlp": 0.01258789, + "epoch": 0.3555689162783707, + "flos": 21074445999360.0, + "grad_norm": 5.920108951080063, + "language_loss": 0.82525283, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.90284735, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14660645, + "step": 5914, + "time_per_iteration": 2.521756649017334 + }, + { + "auxiliary_loss_clip": 0.06490604, + "auxiliary_loss_mlp": 0.01268632, + "balance_loss_clip": 0.06310836, + "balance_loss_mlp": 0.01254118, + "epoch": 0.35562903953103864, + "flos": 33077426872320.0, + "grad_norm": 3.2692214801304686, + "language_loss": 0.7113682, + "learning_rate": 2.98704305057949e-06, + "loss": 0.78896052, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14508057, + "step": 5915, + "time_per_iteration": 2.6931562423706055 + }, + { + "auxiliary_loss_clip": 0.06477264, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06297429, + "balance_loss_mlp": 0.01254814, + "epoch": 0.3556891627837066, + "flos": 20564029653120.0, + "grad_norm": 4.458093980019367, + "language_loss": 0.76718718, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84465492, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14697266, + "step": 5916, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.06487325, + "auxiliary_loss_mlp": 0.01272059, + "balance_loss_clip": 0.06307879, + "balance_loss_mlp": 0.01256651, + "epoch": 0.35574928603637457, + "flos": 20709449614080.0, + "grad_norm": 1.674174142445476, + "language_loss": 0.88208687, + "learning_rate": 2.986365519932332e-06, + "loss": 0.95968074, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.1539917, + "step": 5917, + "time_per_iteration": 2.6043195724487305 + }, + { + "auxiliary_loss_clip": 0.0649041, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.0631107, + "balance_loss_mlp": 0.01254289, + "epoch": 0.35580940928904253, + "flos": 15199899298560.0, + "grad_norm": 3.6980401889874086, + "language_loss": 0.75538862, + "learning_rate": 2.98602669849771e-06, + "loss": 0.83299077, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.15515137, + "step": 5918, + "time_per_iteration": 2.5186190605163574 + }, + { + "auxiliary_loss_clip": 0.06461592, + "auxiliary_loss_mlp": 0.01285001, + "balance_loss_clip": 0.06381316, + "balance_loss_mlp": 0.01279086, + "epoch": 0.3558695325417105, + "flos": 58656145426560.0, + "grad_norm": 0.8458689331650495, + "language_loss": 0.63255095, + "learning_rate": 2.985687839672857e-06, + "loss": 0.71001691, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.05911255, + "step": 5919, + "time_per_iteration": 2.9552297592163086 + }, + { + "auxiliary_loss_clip": 0.06485933, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 0.06302524, + "balance_loss_mlp": 0.01255998, + "epoch": 0.35592965579437846, + "flos": 22024811808000.0, + "grad_norm": 2.2679396062128188, + "language_loss": 0.74402696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.82160461, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.1583252, + "step": 5920, + "time_per_iteration": 2.54848313331604 + }, + { + "auxiliary_loss_clip": 0.06483243, + "auxiliary_loss_mlp": 0.01277956, + "balance_loss_clip": 0.06304519, + "balance_loss_mlp": 0.01262638, + "epoch": 0.35598977904704643, + "flos": 23374401194880.0, + "grad_norm": 3.1552684799501733, + "language_loss": 0.77735227, + "learning_rate": 2.985010009903857e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15332031, + "step": 5921, + "time_per_iteration": 2.6517810821533203 + }, + { + "auxiliary_loss_clip": 0.06490617, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 0.06309058, + "balance_loss_mlp": 0.01261329, + "epoch": 0.3560499022997144, + "flos": 17791113686400.0, + "grad_norm": 2.349487021583332, + "language_loss": 0.6770314, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.75470436, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15332031, + "step": 5922, + "time_per_iteration": 2.525566577911377 + }, + { + "auxiliary_loss_clip": 0.06484485, + "auxiliary_loss_mlp": 0.0127389, + "balance_loss_clip": 0.06306913, + "balance_loss_mlp": 0.01258524, + "epoch": 0.35611002555238236, + "flos": 20746695553920.0, + "grad_norm": 2.231194122260979, + "language_loss": 0.79304701, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.87063074, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15380859, + "step": 5923, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.06479051, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 0.06301268, + "balance_loss_mlp": 0.01257579, + "epoch": 0.3561701488050504, + "flos": 19468034248320.0, + "grad_norm": 1.61778925366919, + "language_loss": 0.8543126, + "learning_rate": 2.983992985144908e-06, + "loss": 0.93183035, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15148926, + "step": 5924, + "time_per_iteration": 2.524949312210083 + }, + { + "auxiliary_loss_clip": 0.06478724, + "auxiliary_loss_mlp": 0.01271843, + "balance_loss_clip": 0.06301951, + "balance_loss_mlp": 0.01255797, + "epoch": 0.35623027205771834, + "flos": 30783006046080.0, + "grad_norm": 1.9504196686726267, + "language_loss": 0.77609557, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.85360122, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.16033936, + "step": 5925, + "time_per_iteration": 2.6268069744110107 + }, + { + "auxiliary_loss_clip": 0.06472521, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 0.06292735, + "balance_loss_mlp": 0.01258291, + "epoch": 0.3562903953103863, + "flos": 16986461328000.0, + "grad_norm": 1.8072288436418724, + "language_loss": 0.76488966, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.15441895, + "step": 5926, + "time_per_iteration": 2.492009401321411 + }, + { + "auxiliary_loss_clip": 0.064781, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 0.06293385, + "balance_loss_mlp": 0.01255478, + "epoch": 0.3563505185630543, + "flos": 23846271863040.0, + "grad_norm": 2.038892178711472, + "language_loss": 0.69665909, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.77415526, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.16046143, + "step": 5927, + "time_per_iteration": 2.555192708969116 + }, + { + "auxiliary_loss_clip": 0.06471409, + "auxiliary_loss_mlp": 0.01273845, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.0125889, + "epoch": 0.35641064181572224, + "flos": 22280040944640.0, + "grad_norm": 1.7768317666214009, + "language_loss": 0.79454333, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.87199581, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.1494751, + "step": 5928, + "time_per_iteration": 2.5192928314208984 + }, + { + "auxiliary_loss_clip": 0.06473258, + "auxiliary_loss_mlp": 0.01271381, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01256439, + "epoch": 0.3564707650683902, + "flos": 23007643873920.0, + "grad_norm": 1.230692465633979, + "language_loss": 0.8197661, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89721251, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1494751, + "step": 5929, + "time_per_iteration": 2.6044368743896484 + }, + { + "auxiliary_loss_clip": 0.0646459, + "auxiliary_loss_mlp": 0.01268428, + "balance_loss_clip": 0.06289564, + "balance_loss_mlp": 0.01253765, + "epoch": 0.35653088832105817, + "flos": 14689566806400.0, + "grad_norm": 1.5209281639747478, + "language_loss": 0.70385516, + "learning_rate": 2.981957928520201e-06, + "loss": 0.78118533, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14685059, + "step": 5930, + "time_per_iteration": 2.498253107070923 + }, + { + "auxiliary_loss_clip": 0.06473252, + "auxiliary_loss_mlp": 0.01273096, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01256943, + "epoch": 0.35659101157372614, + "flos": 23483791100160.0, + "grad_norm": 2.174064041384607, + "language_loss": 0.68760598, + "learning_rate": 2.981618622015244e-06, + "loss": 0.76506943, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16162109, + "step": 5931, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.06463969, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06288578, + "balance_loss_mlp": 0.01253788, + "epoch": 0.3566511348263941, + "flos": 26585966885760.0, + "grad_norm": 1.5444695234240167, + "language_loss": 0.68331707, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76064122, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06465189, + "auxiliary_loss_mlp": 0.01272147, + "balance_loss_clip": 0.06290227, + "balance_loss_mlp": 0.01257854, + "epoch": 0.35671125807906207, + "flos": 13119981724800.0, + "grad_norm": 2.4744838507658917, + "language_loss": 0.79635656, + "learning_rate": 2.980939897348969e-06, + "loss": 0.87372994, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14294434, + "step": 5933, + "time_per_iteration": 2.573812961578369 + }, + { + "auxiliary_loss_clip": 0.06470121, + "auxiliary_loss_mlp": 0.01270309, + "balance_loss_clip": 0.06288668, + "balance_loss_mlp": 0.01255372, + "epoch": 0.35677138133173003, + "flos": 33009014413440.0, + "grad_norm": 1.4096936090904761, + "language_loss": 0.69970256, + "learning_rate": 2.980600479213388e-06, + "loss": 0.77710688, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14929199, + "step": 5934, + "time_per_iteration": 2.6381173133850098 + }, + { + "auxiliary_loss_clip": 0.06481285, + "auxiliary_loss_mlp": 0.01277705, + "balance_loss_clip": 0.06294179, + "balance_loss_mlp": 0.01260741, + "epoch": 0.356831504584398, + "flos": 20784234983040.0, + "grad_norm": 2.103415594097178, + "language_loss": 0.72006869, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.79765862, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.16967773, + "step": 5935, + "time_per_iteration": 2.620471954345703 + }, + { + "auxiliary_loss_clip": 0.06467808, + "auxiliary_loss_mlp": 0.01275583, + "balance_loss_clip": 0.06287988, + "balance_loss_mlp": 0.01261004, + "epoch": 0.35689162783706596, + "flos": 12170244821760.0, + "grad_norm": 2.011082803426264, + "language_loss": 0.78423738, + "learning_rate": 2.979921531401692e-06, + "loss": 0.86167133, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14569092, + "step": 5936, + "time_per_iteration": 2.4827091693878174 + }, + { + "auxiliary_loss_clip": 0.06466486, + "auxiliary_loss_mlp": 0.01273239, + "balance_loss_clip": 0.06289199, + "balance_loss_mlp": 0.01258147, + "epoch": 0.356951751089734, + "flos": 23848200506880.0, + "grad_norm": 1.8250890312079233, + "language_loss": 0.64893055, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.72632784, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15100098, + "step": 5937, + "time_per_iteration": 2.5968148708343506 + }, + { + "auxiliary_loss_clip": 0.06470716, + "auxiliary_loss_mlp": 0.01277052, + "balance_loss_clip": 0.06291182, + "balance_loss_mlp": 0.01261644, + "epoch": 0.35701187434240195, + "flos": 11725851093120.0, + "grad_norm": 3.2825373138133633, + "language_loss": 0.79029787, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.86777556, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15429688, + "step": 5938, + "time_per_iteration": 2.4724228382110596 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.01275118, + "balance_loss_clip": 0.06289655, + "balance_loss_mlp": 0.01259835, + "epoch": 0.3570719975950699, + "flos": 24905650233600.0, + "grad_norm": 2.3707612213619624, + "language_loss": 0.80684471, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88429582, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 5939, + "time_per_iteration": 4.067660331726074 + }, + { + "auxiliary_loss_clip": 0.06474897, + "auxiliary_loss_mlp": 0.01278586, + "balance_loss_clip": 0.06288245, + "balance_loss_mlp": 0.01263357, + "epoch": 0.3571321208477379, + "flos": 26002022981760.0, + "grad_norm": 1.7209958005115653, + "language_loss": 0.79509544, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.8726303, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.15228271, + "step": 5940, + "time_per_iteration": 3.961956262588501 + }, + { + "auxiliary_loss_clip": 0.06472583, + "auxiliary_loss_mlp": 0.01274024, + "balance_loss_clip": 0.0628977, + "balance_loss_mlp": 0.01258223, + "epoch": 0.35719224410040584, + "flos": 14506900905600.0, + "grad_norm": 2.455654522420387, + "language_loss": 0.72918689, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.80665296, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15802002, + "step": 5941, + "time_per_iteration": 2.529376745223999 + }, + { + "auxiliary_loss_clip": 0.06469624, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06291723, + "balance_loss_mlp": 0.01258577, + "epoch": 0.3572523673530738, + "flos": 31183445508480.0, + "grad_norm": 1.9522398224767823, + "language_loss": 0.64961332, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.72705185, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15667725, + "step": 5942, + "time_per_iteration": 2.6694955825805664 + }, + { + "auxiliary_loss_clip": 0.06470639, + "auxiliary_loss_mlp": 0.01273062, + "balance_loss_clip": 0.06291504, + "balance_loss_mlp": 0.01257124, + "epoch": 0.3573124906057418, + "flos": 15857496541440.0, + "grad_norm": 1.9232266262089555, + "language_loss": 0.7463761, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.82381314, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.1595459, + "step": 5943, + "time_per_iteration": 2.5988807678222656 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01259877, + "balance_loss_clip": 0.06336363, + "balance_loss_mlp": 0.01254631, + "epoch": 0.35737261385840974, + "flos": 60839163849600.0, + "grad_norm": 0.8122274991603828, + "language_loss": 0.60684133, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.68360829, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.05249023, + "step": 5944, + "time_per_iteration": 3.2639529705047607 + }, + { + "auxiliary_loss_clip": 0.06467592, + "auxiliary_loss_mlp": 0.01274246, + "balance_loss_clip": 0.06290887, + "balance_loss_mlp": 0.01259464, + "epoch": 0.3574327371110777, + "flos": 18849779297280.0, + "grad_norm": 1.8477550360079977, + "language_loss": 0.7280755, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80549395, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14782715, + "step": 5945, + "time_per_iteration": 3.974971294403076 + }, + { + "auxiliary_loss_clip": 0.06464474, + "auxiliary_loss_mlp": 0.01274521, + "balance_loss_clip": 0.06288721, + "balance_loss_mlp": 0.01259619, + "epoch": 0.35749286036374567, + "flos": 23556354336000.0, + "grad_norm": 1.6530257311602492, + "language_loss": 0.8152287, + "learning_rate": 2.976524564880326e-06, + "loss": 0.89261866, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14892578, + "step": 5946, + "time_per_iteration": 2.567702531814575 + }, + { + "auxiliary_loss_clip": 0.06472433, + "auxiliary_loss_mlp": 0.01275229, + "balance_loss_clip": 0.06292298, + "balance_loss_mlp": 0.01260036, + "epoch": 0.35755298361641363, + "flos": 21111817720320.0, + "grad_norm": 1.4004407917222146, + "language_loss": 0.69023073, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76770723, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.15209961, + "step": 5947, + "time_per_iteration": 2.531938076019287 + }, + { + "auxiliary_loss_clip": 0.06458312, + "auxiliary_loss_mlp": 0.01270008, + "balance_loss_clip": 0.06284653, + "balance_loss_mlp": 0.01256109, + "epoch": 0.3576131068690816, + "flos": 19251099227520.0, + "grad_norm": 2.059659188145791, + "language_loss": 0.75891036, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83619356, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13916016, + "step": 5948, + "time_per_iteration": 3.9236361980438232 + }, + { + "auxiliary_loss_clip": 0.06466205, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 0.06287337, + "balance_loss_mlp": 0.01261462, + "epoch": 0.35767323012174956, + "flos": 28661733682560.0, + "grad_norm": 1.6908098548641093, + "language_loss": 0.71228039, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.78970701, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15002441, + "step": 5949, + "time_per_iteration": 2.56809663772583 + }, + { + "auxiliary_loss_clip": 0.06464282, + "auxiliary_loss_mlp": 0.01281848, + "balance_loss_clip": 0.06286816, + "balance_loss_mlp": 0.01266995, + "epoch": 0.35773335337441753, + "flos": 17089897593600.0, + "grad_norm": 1.7763817610233048, + "language_loss": 0.77821207, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.85567343, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1484375, + "step": 5950, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.06465182, + "auxiliary_loss_mlp": 0.01276208, + "balance_loss_clip": 0.0628643, + "balance_loss_mlp": 0.01261331, + "epoch": 0.35779347662708555, + "flos": 15894155502720.0, + "grad_norm": 2.1549260339424725, + "language_loss": 0.73109937, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.80851334, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14892578, + "step": 5951, + "time_per_iteration": 2.5201168060302734 + }, + { + "auxiliary_loss_clip": 0.06470691, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 0.06287189, + "balance_loss_mlp": 0.01262181, + "epoch": 0.3578535998797535, + "flos": 28666555292160.0, + "grad_norm": 1.9784791605149854, + "language_loss": 0.7026071, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.78009284, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15698242, + "step": 5952, + "time_per_iteration": 2.5931434631347656 + }, + { + "auxiliary_loss_clip": 0.0646029, + "auxiliary_loss_mlp": 0.01277333, + "balance_loss_clip": 0.06284408, + "balance_loss_mlp": 0.01263069, + "epoch": 0.3579137231324215, + "flos": 37861554464640.0, + "grad_norm": 1.6267089711440414, + "language_loss": 0.69578886, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77316511, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14276123, + "step": 5953, + "time_per_iteration": 2.668464422225952 + }, + { + "auxiliary_loss_clip": 0.0645823, + "auxiliary_loss_mlp": 0.01275685, + "balance_loss_clip": 0.06282876, + "balance_loss_mlp": 0.01261117, + "epoch": 0.35797384638508944, + "flos": 22353526575360.0, + "grad_norm": 1.5719996722989455, + "language_loss": 0.67333478, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.75067389, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14562988, + "step": 5954, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.06459846, + "auxiliary_loss_mlp": 0.01278708, + "balance_loss_clip": 0.06287006, + "balance_loss_mlp": 0.0126414, + "epoch": 0.3580339696377574, + "flos": 13594829212800.0, + "grad_norm": 1.8066455981447187, + "language_loss": 0.75335681, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.83074236, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14556885, + "step": 5955, + "time_per_iteration": 2.5049943923950195 + }, + { + "auxiliary_loss_clip": 0.06458074, + "auxiliary_loss_mlp": 0.01270596, + "balance_loss_clip": 0.06286005, + "balance_loss_mlp": 0.01256595, + "epoch": 0.3580940928904254, + "flos": 23774882584320.0, + "grad_norm": 1.7018331496498176, + "language_loss": 0.76155579, + "learning_rate": 2.973123895369182e-06, + "loss": 0.83884245, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14007568, + "step": 5956, + "time_per_iteration": 2.565455675125122 + }, + { + "auxiliary_loss_clip": 0.06456999, + "auxiliary_loss_mlp": 0.01278066, + "balance_loss_clip": 0.06286499, + "balance_loss_mlp": 0.01263415, + "epoch": 0.35815421614309334, + "flos": 19469962892160.0, + "grad_norm": 1.5319401259692025, + "language_loss": 0.73558611, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81293678, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14642334, + "step": 5957, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.064648, + "auxiliary_loss_mlp": 0.01274688, + "balance_loss_clip": 0.06291045, + "balance_loss_mlp": 0.01260204, + "epoch": 0.3582143393957613, + "flos": 23374988173440.0, + "grad_norm": 2.1285308943055727, + "language_loss": 0.71748459, + "learning_rate": 2.972443318242726e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14477539, + "step": 5958, + "time_per_iteration": 2.566181182861328 + }, + { + "auxiliary_loss_clip": 0.06459813, + "auxiliary_loss_mlp": 0.01267621, + "balance_loss_clip": 0.06289116, + "balance_loss_mlp": 0.0125415, + "epoch": 0.35827446264842927, + "flos": 26330528113920.0, + "grad_norm": 1.6357791647016078, + "language_loss": 0.88725436, + "learning_rate": 2.972102974360324e-06, + "loss": 0.96452874, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13452148, + "step": 5959, + "time_per_iteration": 2.6218011379241943 + }, + { + "auxiliary_loss_clip": 0.06463417, + "auxiliary_loss_mlp": 0.01271505, + "balance_loss_clip": 0.06288788, + "balance_loss_mlp": 0.0125816, + "epoch": 0.35833458590109724, + "flos": 30454626695040.0, + "grad_norm": 1.5143701220572547, + "language_loss": 0.58769095, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66504014, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.13348389, + "step": 5960, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.06462947, + "auxiliary_loss_mlp": 0.01269103, + "balance_loss_clip": 0.06286879, + "balance_loss_mlp": 0.01253469, + "epoch": 0.3583947091537652, + "flos": 14835154475520.0, + "grad_norm": 2.541265940729937, + "language_loss": 0.76686686, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.84418738, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15631104, + "step": 5961, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.06464821, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06287968, + "balance_loss_mlp": 0.01261324, + "epoch": 0.35845483240643317, + "flos": 34249213895040.0, + "grad_norm": 1.6475679018941416, + "language_loss": 0.70478481, + "learning_rate": 2.971081721591294e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14099121, + "step": 5962, + "time_per_iteration": 2.6199357509613037 + }, + { + "auxiliary_loss_clip": 0.06464063, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06289653, + "balance_loss_mlp": 0.01255207, + "epoch": 0.35851495565910113, + "flos": 20966481613440.0, + "grad_norm": 1.6496872805273144, + "language_loss": 0.75120842, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.82854319, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14221191, + "step": 5963, + "time_per_iteration": 2.5526950359344482 + }, + { + "auxiliary_loss_clip": 0.06467253, + "auxiliary_loss_mlp": 0.01271151, + "balance_loss_clip": 0.06291784, + "balance_loss_mlp": 0.01256322, + "epoch": 0.35857507891176915, + "flos": 22316448343680.0, + "grad_norm": 1.675466861885377, + "language_loss": 0.78945208, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.86683613, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14831543, + "step": 5964, + "time_per_iteration": 2.5257983207702637 + }, + { + "auxiliary_loss_clip": 0.0647264, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06292663, + "balance_loss_mlp": 0.0125726, + "epoch": 0.3586352021644371, + "flos": 23374610830080.0, + "grad_norm": 3.2898914726182684, + "language_loss": 0.667786, + "learning_rate": 2.970060137410626e-06, + "loss": 0.74523282, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.14764404, + "step": 5965, + "time_per_iteration": 2.5664315223693848 + }, + { + "auxiliary_loss_clip": 0.06463271, + "auxiliary_loss_mlp": 0.01271526, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.01256773, + "epoch": 0.3586953254171051, + "flos": 27855655804800.0, + "grad_norm": 1.5935311272675807, + "language_loss": 0.79428947, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87163734, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14746094, + "step": 5966, + "time_per_iteration": 2.576537609100342 + }, + { + "auxiliary_loss_clip": 0.06467331, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01254717, + "epoch": 0.35875544866977305, + "flos": 19506621853440.0, + "grad_norm": 2.077713447457672, + "language_loss": 0.91477883, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.99213958, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14044189, + "step": 5967, + "time_per_iteration": 2.553084135055542 + }, + { + "auxiliary_loss_clip": 0.06466691, + "auxiliary_loss_mlp": 0.01272699, + "balance_loss_clip": 0.06288824, + "balance_loss_mlp": 0.01257261, + "epoch": 0.358815571922441, + "flos": 21477652646400.0, + "grad_norm": 1.8463229992001005, + "language_loss": 0.80835712, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.88575101, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15435791, + "step": 5968, + "time_per_iteration": 2.526298761367798 + }, + { + "auxiliary_loss_clip": 0.06467028, + "auxiliary_loss_mlp": 0.0127428, + "balance_loss_clip": 0.06287041, + "balance_loss_mlp": 0.012587, + "epoch": 0.358875695175109, + "flos": 21841894344960.0, + "grad_norm": 1.8179824378655614, + "language_loss": 0.84621, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.92362314, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15588379, + "step": 5969, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.0646342, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06288599, + "balance_loss_mlp": 0.01258664, + "epoch": 0.35893581842777694, + "flos": 32019264385920.0, + "grad_norm": 1.8505987075691241, + "language_loss": 0.72233456, + "learning_rate": 2.968356761586202e-06, + "loss": 0.79968911, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13366699, + "step": 5970, + "time_per_iteration": 2.581071615219116 + }, + { + "auxiliary_loss_clip": 0.06468321, + "auxiliary_loss_mlp": 0.01272468, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01258056, + "epoch": 0.3589959416804449, + "flos": 20492137249920.0, + "grad_norm": 1.5610077365233734, + "language_loss": 0.79753757, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.87494546, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14422607, + "step": 5971, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.0646906, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06288019, + "balance_loss_mlp": 0.01255006, + "epoch": 0.3590560649331129, + "flos": 16186295162880.0, + "grad_norm": 1.6291573791515084, + "language_loss": 0.78869599, + "learning_rate": 2.967675154124696e-06, + "loss": 0.86608684, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15026855, + "step": 5972, + "time_per_iteration": 2.4778740406036377 + }, + { + "auxiliary_loss_clip": 0.06465904, + "auxiliary_loss_mlp": 0.01274602, + "balance_loss_clip": 0.06286226, + "balance_loss_mlp": 0.01260201, + "epoch": 0.35911618818578084, + "flos": 20381531460480.0, + "grad_norm": 2.0141455740295875, + "language_loss": 0.81742013, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.89482516, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1439209, + "step": 5973, + "time_per_iteration": 2.532027006149292 + }, + { + "auxiliary_loss_clip": 0.06404248, + "auxiliary_loss_mlp": 0.01258065, + "balance_loss_clip": 0.06324309, + "balance_loss_mlp": 0.01254096, + "epoch": 0.3591763114384488, + "flos": 41250991645440.0, + "grad_norm": 0.9082562918021452, + "language_loss": 0.56514442, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.64176756, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03967285, + "step": 5974, + "time_per_iteration": 3.0029375553131104 + }, + { + "auxiliary_loss_clip": 0.06464389, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06286667, + "balance_loss_mlp": 0.01257781, + "epoch": 0.35923643469111677, + "flos": 18701047100160.0, + "grad_norm": 1.9591615340661908, + "language_loss": 0.69342583, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77078998, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.14227295, + "step": 5975, + "time_per_iteration": 2.5330698490142822 + }, + { + "auxiliary_loss_clip": 0.06462636, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06286036, + "balance_loss_mlp": 0.0125325, + "epoch": 0.35929655794378473, + "flos": 25017010709760.0, + "grad_norm": 1.597565036747504, + "language_loss": 0.8049522, + "learning_rate": 2.96631149897303e-06, + "loss": 0.88225687, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14575195, + "step": 5976, + "time_per_iteration": 2.5599968433380127 + }, + { + "auxiliary_loss_clip": 0.0646351, + "auxiliary_loss_mlp": 0.0126845, + "balance_loss_clip": 0.06286681, + "balance_loss_mlp": 0.01253489, + "epoch": 0.35935668119645275, + "flos": 14980825998720.0, + "grad_norm": 1.8019140268476472, + "language_loss": 0.79171205, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.86903155, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1496582, + "step": 5977, + "time_per_iteration": 2.4876949787139893 + }, + { + "auxiliary_loss_clip": 0.06459211, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 0.0628271, + "balance_loss_mlp": 0.0125324, + "epoch": 0.3594168044491207, + "flos": 21184422883200.0, + "grad_norm": 1.897291031169604, + "language_loss": 0.80843097, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13342285, + "step": 5978, + "time_per_iteration": 2.5270771980285645 + }, + { + "auxiliary_loss_clip": 0.06458849, + "auxiliary_loss_mlp": 0.01272545, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01257703, + "epoch": 0.3594769277017887, + "flos": 27679446668160.0, + "grad_norm": 1.6570486295636508, + "language_loss": 0.67797875, + "learning_rate": 2.965288372816436e-06, + "loss": 0.75529265, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14819336, + "step": 5979, + "time_per_iteration": 5.427239179611206 + }, + { + "auxiliary_loss_clip": 0.06460471, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06282781, + "balance_loss_mlp": 0.01256323, + "epoch": 0.35953705095445665, + "flos": 23008901685120.0, + "grad_norm": 2.1534655116077928, + "language_loss": 0.67667198, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.75397921, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.13928223, + "step": 5980, + "time_per_iteration": 2.538149833679199 + }, + { + "auxiliary_loss_clip": 0.0647162, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06285568, + "balance_loss_mlp": 0.01257146, + "epoch": 0.3595971742071246, + "flos": 25520005969920.0, + "grad_norm": 2.2162969460708597, + "language_loss": 0.71122372, + "learning_rate": 2.964606105671327e-06, + "loss": 0.78867209, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.16064453, + "step": 5981, + "time_per_iteration": 2.5711326599121094 + }, + { + "auxiliary_loss_clip": 0.06464566, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 0.06283125, + "balance_loss_mlp": 0.01256709, + "epoch": 0.3596572974597926, + "flos": 29870431228800.0, + "grad_norm": 2.0278025655936958, + "language_loss": 0.71914935, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.7965194, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.1572876, + "step": 5982, + "time_per_iteration": 2.6292126178741455 + }, + { + "auxiliary_loss_clip": 0.06458835, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 0.06286852, + "balance_loss_mlp": 0.0125428, + "epoch": 0.35971742071246054, + "flos": 23119255912320.0, + "grad_norm": 1.6791573126106523, + "language_loss": 0.7649492, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.84221637, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13598633, + "step": 5983, + "time_per_iteration": 2.540801763534546 + }, + { + "auxiliary_loss_clip": 0.06468493, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.06285352, + "balance_loss_mlp": 0.01257206, + "epoch": 0.3597775439651285, + "flos": 16730645212800.0, + "grad_norm": 1.651729152091261, + "language_loss": 0.77260226, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85001981, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16052246, + "step": 5984, + "time_per_iteration": 2.5278737545013428 + }, + { + "auxiliary_loss_clip": 0.06458455, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.0628411, + "balance_loss_mlp": 0.01256155, + "epoch": 0.3598376672177965, + "flos": 19725653226240.0, + "grad_norm": 2.0268922239891163, + "language_loss": 0.87093443, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.94822395, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.14355469, + "step": 5985, + "time_per_iteration": 3.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.06461216, + "auxiliary_loss_mlp": 0.01272807, + "balance_loss_clip": 0.06284203, + "balance_loss_mlp": 0.01258109, + "epoch": 0.35989779047046444, + "flos": 17317314374400.0, + "grad_norm": 1.4939910635791536, + "language_loss": 0.72980917, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80714941, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14709473, + "step": 5986, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.06469383, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06283881, + "balance_loss_mlp": 0.01254761, + "epoch": 0.3599579137231324, + "flos": 22717894055040.0, + "grad_norm": 2.903112824764454, + "language_loss": 0.73792106, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.81531143, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.14892578, + "step": 5987, + "time_per_iteration": 3.961486339569092 + }, + { + "auxiliary_loss_clip": 0.06467381, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06286356, + "balance_loss_mlp": 0.01255347, + "epoch": 0.36001803697580037, + "flos": 20966230051200.0, + "grad_norm": 1.8945086710394061, + "language_loss": 0.69721663, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.77459043, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.14648438, + "step": 5988, + "time_per_iteration": 2.5483100414276123 + }, + { + "auxiliary_loss_clip": 0.0647547, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_clip": 0.06292704, + "balance_loss_mlp": 0.01258209, + "epoch": 0.36007816022846834, + "flos": 20491843760640.0, + "grad_norm": 1.7927951606002523, + "language_loss": 0.7305057, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80799592, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15344238, + "step": 5989, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.06467338, + "auxiliary_loss_mlp": 0.01268061, + "balance_loss_clip": 0.06289014, + "balance_loss_mlp": 0.01254173, + "epoch": 0.36013828348113636, + "flos": 28008706487040.0, + "grad_norm": 1.4999082498201763, + "language_loss": 0.80117184, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87852585, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.13897705, + "step": 5990, + "time_per_iteration": 2.6733410358428955 + }, + { + "auxiliary_loss_clip": 0.06464024, + "auxiliary_loss_mlp": 0.01270971, + "balance_loss_clip": 0.0628631, + "balance_loss_mlp": 0.01255938, + "epoch": 0.3601984067338043, + "flos": 20088050135040.0, + "grad_norm": 1.799909646769202, + "language_loss": 0.84338784, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92073774, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15032959, + "step": 5991, + "time_per_iteration": 2.518554925918579 + }, + { + "auxiliary_loss_clip": 0.06474696, + "auxiliary_loss_mlp": 0.01276578, + "balance_loss_clip": 0.06292041, + "balance_loss_mlp": 0.01261367, + "epoch": 0.3602585299864723, + "flos": 18622362516480.0, + "grad_norm": 1.891276760716041, + "language_loss": 0.76406145, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.84157419, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.1519165, + "step": 5992, + "time_per_iteration": 2.5224106311798096 + }, + { + "auxiliary_loss_clip": 0.06471405, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06294376, + "balance_loss_mlp": 0.01258496, + "epoch": 0.36031865323914025, + "flos": 19579059308160.0, + "grad_norm": 2.086772991356176, + "language_loss": 0.78120929, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8586548, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.14648438, + "step": 5993, + "time_per_iteration": 2.5155129432678223 + }, + { + "auxiliary_loss_clip": 0.06474859, + "auxiliary_loss_mlp": 0.01271898, + "balance_loss_clip": 0.06293729, + "balance_loss_mlp": 0.01257807, + "epoch": 0.3603787764918082, + "flos": 17495871425280.0, + "grad_norm": 1.6487847999674183, + "language_loss": 0.74534261, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.82281017, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.14086914, + "step": 5994, + "time_per_iteration": 2.647794723510742 + }, + { + "auxiliary_loss_clip": 0.06474246, + "auxiliary_loss_mlp": 0.01268785, + "balance_loss_clip": 0.06290799, + "balance_loss_mlp": 0.01254415, + "epoch": 0.3604388997444762, + "flos": 15528823701120.0, + "grad_norm": 1.8873654318884407, + "language_loss": 0.69500113, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.14361572, + "step": 5995, + "time_per_iteration": 2.501981019973755 + }, + { + "auxiliary_loss_clip": 0.06479774, + "auxiliary_loss_mlp": 0.01276345, + "balance_loss_clip": 0.06295834, + "balance_loss_mlp": 0.01261688, + "epoch": 0.36049902299714415, + "flos": 17316559687680.0, + "grad_norm": 1.8201062799427143, + "language_loss": 0.8309989, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.90856004, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.14642334, + "step": 5996, + "time_per_iteration": 2.551095962524414 + }, + { + "auxiliary_loss_clip": 0.06472808, + "auxiliary_loss_mlp": 0.01275418, + "balance_loss_clip": 0.06293936, + "balance_loss_mlp": 0.01260493, + "epoch": 0.3605591462498121, + "flos": 17061749821440.0, + "grad_norm": 2.2503529028172804, + "language_loss": 0.73762429, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81510657, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14910889, + "step": 5997, + "time_per_iteration": 2.493100881576538 + }, + { + "auxiliary_loss_clip": 0.06465439, + "auxiliary_loss_mlp": 0.0127421, + "balance_loss_clip": 0.06288476, + "balance_loss_mlp": 0.0125944, + "epoch": 0.3606192695024801, + "flos": 16842508813440.0, + "grad_norm": 2.0075843423569326, + "language_loss": 0.69582814, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.77322465, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14758301, + "step": 5998, + "time_per_iteration": 2.54227352142334 + }, + { + "auxiliary_loss_clip": 0.06468997, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06292363, + "balance_loss_mlp": 0.01258243, + "epoch": 0.36067939275514804, + "flos": 12134424401280.0, + "grad_norm": 2.607888629955908, + "language_loss": 0.77566224, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.8530767, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 5999, + "time_per_iteration": 2.456887722015381 + }, + { + "auxiliary_loss_clip": 0.06474666, + "auxiliary_loss_mlp": 0.01272087, + "balance_loss_clip": 0.06294585, + "balance_loss_mlp": 0.01257776, + "epoch": 0.360739516007816, + "flos": 18047390999040.0, + "grad_norm": 1.725953097254869, + "language_loss": 0.78777629, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.86524385, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14300537, + "step": 6000, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.06471578, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06294253, + "balance_loss_mlp": 0.01257854, + "epoch": 0.360799639260484, + "flos": 18555417504000.0, + "grad_norm": 1.7389483603698193, + "language_loss": 0.78602117, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86345226, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13671875, + "step": 6001, + "time_per_iteration": 2.4887304306030273 + }, + { + "auxiliary_loss_clip": 0.06462014, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.012549, + "epoch": 0.36085976251315194, + "flos": 19688029943040.0, + "grad_norm": 2.5640130860082206, + "language_loss": 0.83264118, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.90995204, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14178467, + "step": 6002, + "time_per_iteration": 2.523263931274414 + }, + { + "auxiliary_loss_clip": 0.06462792, + "auxiliary_loss_mlp": 0.01274754, + "balance_loss_clip": 0.06293326, + "balance_loss_mlp": 0.01262034, + "epoch": 0.3609198857658199, + "flos": 24204476067840.0, + "grad_norm": 2.058215255218527, + "language_loss": 0.91365647, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.991032, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.12713623, + "step": 6003, + "time_per_iteration": 2.5147922039031982 + }, + { + "auxiliary_loss_clip": 0.06424739, + "auxiliary_loss_mlp": 0.01257394, + "balance_loss_clip": 0.06345953, + "balance_loss_mlp": 0.01254351, + "epoch": 0.3609800090184879, + "flos": 57134288044800.0, + "grad_norm": 0.8495896975763515, + "language_loss": 0.53457719, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.61139846, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03041077, + "step": 6004, + "time_per_iteration": 3.1006038188934326 + }, + { + "auxiliary_loss_clip": 0.06473242, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06291834, + "balance_loss_mlp": 0.0125549, + "epoch": 0.3610401322711559, + "flos": 20817120510720.0, + "grad_norm": 1.7032625156204924, + "language_loss": 0.78291458, + "learning_rate": 2.956407517225883e-06, + "loss": 0.86035228, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15026855, + "step": 6005, + "time_per_iteration": 2.507681369781494 + }, + { + "auxiliary_loss_clip": 0.06466124, + "auxiliary_loss_mlp": 0.01274708, + "balance_loss_clip": 0.06289654, + "balance_loss_mlp": 0.01260373, + "epoch": 0.36110025552382385, + "flos": 13704302972160.0, + "grad_norm": 1.9788670063291258, + "language_loss": 0.79365236, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87106061, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14349365, + "step": 6006, + "time_per_iteration": 2.6221675872802734 + }, + { + "auxiliary_loss_clip": 0.06467897, + "auxiliary_loss_mlp": 0.01276481, + "balance_loss_clip": 0.06290089, + "balance_loss_mlp": 0.01260317, + "epoch": 0.3611603787764918, + "flos": 22461490961280.0, + "grad_norm": 1.8947484153914913, + "language_loss": 0.84532005, + "learning_rate": 2.955723356106876e-06, + "loss": 0.92276382, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.16162109, + "step": 6007, + "time_per_iteration": 2.5697944164276123 + }, + { + "auxiliary_loss_clip": 0.06477423, + "auxiliary_loss_mlp": 0.01275582, + "balance_loss_clip": 0.06289505, + "balance_loss_mlp": 0.0126018, + "epoch": 0.3612205020291598, + "flos": 20892954055680.0, + "grad_norm": 2.2451481952848953, + "language_loss": 0.73192191, + "learning_rate": 2.955381221179198e-06, + "loss": 0.80945194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.1541748, + "step": 6008, + "time_per_iteration": 2.5410661697387695 + }, + { + "auxiliary_loss_clip": 0.06468849, + "auxiliary_loss_mlp": 0.01276747, + "balance_loss_clip": 0.06288531, + "balance_loss_mlp": 0.01262036, + "epoch": 0.36128062528182775, + "flos": 15747393876480.0, + "grad_norm": 2.0636796050179194, + "language_loss": 0.83194089, + "learning_rate": 2.955039050023368e-06, + "loss": 0.90939683, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.1472168, + "step": 6009, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.06467466, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.012553, + "epoch": 0.3613407485344957, + "flos": 16770239066880.0, + "grad_norm": 1.996577445690206, + "language_loss": 0.7613554, + "learning_rate": 2.954696842652362e-06, + "loss": 0.83873594, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15258789, + "step": 6010, + "time_per_iteration": 2.501328468322754 + }, + { + "auxiliary_loss_clip": 0.064712, + "auxiliary_loss_mlp": 0.0127317, + "balance_loss_clip": 0.06292284, + "balance_loss_mlp": 0.01258734, + "epoch": 0.3614008717871637, + "flos": 20376625996800.0, + "grad_norm": 1.7565456089129825, + "language_loss": 0.8353886, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.91283226, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14440918, + "step": 6011, + "time_per_iteration": 2.5080785751342773 + }, + { + "auxiliary_loss_clip": 0.06473367, + "auxiliary_loss_mlp": 0.01273027, + "balance_loss_clip": 0.06288376, + "balance_loss_mlp": 0.0125784, + "epoch": 0.36146099503983165, + "flos": 22782071882880.0, + "grad_norm": 2.5852128775447536, + "language_loss": 0.62982023, + "learning_rate": 2.954012319316727e-06, + "loss": 0.70728415, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.15185547, + "step": 6012, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.06468817, + "auxiliary_loss_mlp": 0.01279391, + "balance_loss_clip": 0.06292222, + "balance_loss_mlp": 0.01264728, + "epoch": 0.3615211182924996, + "flos": 23002277212800.0, + "grad_norm": 2.060645495819417, + "language_loss": 0.83850408, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.91598618, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14648438, + "step": 6013, + "time_per_iteration": 2.511187791824341 + }, + { + "auxiliary_loss_clip": 0.06469796, + "auxiliary_loss_mlp": 0.01276155, + "balance_loss_clip": 0.06291521, + "balance_loss_mlp": 0.01259501, + "epoch": 0.3615812415451676, + "flos": 16652631461760.0, + "grad_norm": 1.9072870373759168, + "language_loss": 0.92107058, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.99853015, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.16638184, + "step": 6014, + "time_per_iteration": 2.498011350631714 + }, + { + "auxiliary_loss_clip": 0.06466013, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01260097, + "epoch": 0.36164136479783554, + "flos": 21325733994240.0, + "grad_norm": 8.045361949377702, + "language_loss": 0.73973721, + "learning_rate": 2.95298526302391e-06, + "loss": 0.81715214, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15393066, + "step": 6015, + "time_per_iteration": 2.5139665603637695 + }, + { + "auxiliary_loss_clip": 0.0646963, + "auxiliary_loss_mlp": 0.01277804, + "balance_loss_clip": 0.06291166, + "balance_loss_mlp": 0.01262151, + "epoch": 0.3617014880505035, + "flos": 24176286368640.0, + "grad_norm": 1.9455925595590893, + "language_loss": 0.65181047, + "learning_rate": 2.9526428386344e-06, + "loss": 0.72928476, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15637207, + "step": 6016, + "time_per_iteration": 2.5485315322875977 + }, + { + "auxiliary_loss_clip": 0.06469464, + "auxiliary_loss_mlp": 0.01276058, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.01259261, + "epoch": 0.3617616113031715, + "flos": 39023278997760.0, + "grad_norm": 1.6846943976812254, + "language_loss": 0.72102833, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.79848349, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16784668, + "step": 6017, + "time_per_iteration": 2.6685996055603027 + }, + { + "auxiliary_loss_clip": 0.06470844, + "auxiliary_loss_mlp": 0.01272479, + "balance_loss_clip": 0.06287402, + "balance_loss_mlp": 0.01256886, + "epoch": 0.3618217345558395, + "flos": 12135807993600.0, + "grad_norm": 2.3155685522099962, + "language_loss": 0.74387789, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.15600586, + "step": 6018, + "time_per_iteration": 3.93249249458313 + }, + { + "auxiliary_loss_clip": 0.06458628, + "auxiliary_loss_mlp": 0.01273986, + "balance_loss_clip": 0.06287278, + "balance_loss_mlp": 0.0125856, + "epoch": 0.36188185780850746, + "flos": 24941722216320.0, + "grad_norm": 2.406612181934337, + "language_loss": 0.69554305, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.77286923, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1541748, + "step": 6019, + "time_per_iteration": 4.000872373580933 + }, + { + "auxiliary_loss_clip": 0.06472806, + "auxiliary_loss_mlp": 0.01271681, + "balance_loss_clip": 0.0628852, + "balance_loss_mlp": 0.01255815, + "epoch": 0.3619419810611754, + "flos": 20965014167040.0, + "grad_norm": 2.953778610066193, + "language_loss": 0.76874363, + "learning_rate": 2.95127277996311e-06, + "loss": 0.84618843, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.15869141, + "step": 6020, + "time_per_iteration": 2.5465614795684814 + }, + { + "auxiliary_loss_clip": 0.06471147, + "auxiliary_loss_mlp": 0.01273965, + "balance_loss_clip": 0.06288891, + "balance_loss_mlp": 0.01257264, + "epoch": 0.3620021043138434, + "flos": 22535521643520.0, + "grad_norm": 2.2311166939070097, + "language_loss": 0.74090236, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81835353, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16687012, + "step": 6021, + "time_per_iteration": 2.57817006111145 + }, + { + "auxiliary_loss_clip": 0.06467178, + "auxiliary_loss_mlp": 0.01270658, + "balance_loss_clip": 0.0628859, + "balance_loss_mlp": 0.01255685, + "epoch": 0.36206222756651135, + "flos": 15602183550720.0, + "grad_norm": 5.238961551513005, + "language_loss": 0.81591839, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.89329672, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.1496582, + "step": 6022, + "time_per_iteration": 2.5385305881500244 + }, + { + "auxiliary_loss_clip": 0.06457289, + "auxiliary_loss_mlp": 0.0127544, + "balance_loss_clip": 0.06285636, + "balance_loss_mlp": 0.01260349, + "epoch": 0.3621223508191793, + "flos": 23594019546240.0, + "grad_norm": 2.318322058767841, + "language_loss": 0.81707698, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89440429, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15075684, + "step": 6023, + "time_per_iteration": 2.604048013687134 + }, + { + "auxiliary_loss_clip": 0.0647051, + "auxiliary_loss_mlp": 0.01276448, + "balance_loss_clip": 0.06288643, + "balance_loss_mlp": 0.01259795, + "epoch": 0.3621824740718473, + "flos": 22316490270720.0, + "grad_norm": 2.4056275848880038, + "language_loss": 0.80008531, + "learning_rate": 2.9499021441341e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.16650391, + "step": 6024, + "time_per_iteration": 3.9998557567596436 + }, + { + "auxiliary_loss_clip": 0.06462081, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06288754, + "balance_loss_mlp": 0.01258599, + "epoch": 0.36224259732451525, + "flos": 16769232817920.0, + "grad_norm": 2.2201652107227354, + "language_loss": 0.75149572, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.82885349, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15112305, + "step": 6025, + "time_per_iteration": 2.5139317512512207 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01274175, + "balance_loss_clip": 0.06290425, + "balance_loss_mlp": 0.01260198, + "epoch": 0.3623027205771832, + "flos": 23156585706240.0, + "grad_norm": 1.704945166995659, + "language_loss": 0.72471905, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.80212557, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13989258, + "step": 6026, + "time_per_iteration": 3.974848985671997 + }, + { + "auxiliary_loss_clip": 0.06476888, + "auxiliary_loss_mlp": 0.01276899, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01260233, + "epoch": 0.3623628438298512, + "flos": 28556829970560.0, + "grad_norm": 1.945563554904942, + "language_loss": 0.79502189, + "learning_rate": 2.948873789002833e-06, + "loss": 0.87255979, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16662598, + "step": 6027, + "time_per_iteration": 2.614713430404663 + }, + { + "auxiliary_loss_clip": 0.06469107, + "auxiliary_loss_mlp": 0.01272818, + "balance_loss_clip": 0.06288799, + "balance_loss_mlp": 0.01256427, + "epoch": 0.36242296708251914, + "flos": 25492193614080.0, + "grad_norm": 4.95803648299326, + "language_loss": 0.68042505, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.75784421, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16381836, + "step": 6028, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.06467344, + "auxiliary_loss_mlp": 0.01275782, + "balance_loss_clip": 0.0629041, + "balance_loss_mlp": 0.01260273, + "epoch": 0.3624830903351871, + "flos": 16296062411520.0, + "grad_norm": 2.2968183263714983, + "language_loss": 0.85463655, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.93206775, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1550293, + "step": 6029, + "time_per_iteration": 2.519960403442383 + }, + { + "auxiliary_loss_clip": 0.06462874, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.0628645, + "balance_loss_mlp": 0.01255107, + "epoch": 0.36254321358785513, + "flos": 18302200865280.0, + "grad_norm": 1.7460468862336926, + "language_loss": 0.72888201, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.80621189, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15008545, + "step": 6030, + "time_per_iteration": 2.480053663253784 + }, + { + "auxiliary_loss_clip": 0.06476077, + "auxiliary_loss_mlp": 0.01274605, + "balance_loss_clip": 0.06291036, + "balance_loss_mlp": 0.01257558, + "epoch": 0.3626033368405231, + "flos": 14870387917440.0, + "grad_norm": 3.30241855147188, + "language_loss": 0.75249928, + "learning_rate": 2.94750214514905e-06, + "loss": 0.83000606, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.17041016, + "step": 6031, + "time_per_iteration": 2.4887540340423584 + }, + { + "auxiliary_loss_clip": 0.06465365, + "auxiliary_loss_mlp": 0.01279599, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01264245, + "epoch": 0.36266346009319106, + "flos": 22312632983040.0, + "grad_norm": 2.377019393957944, + "language_loss": 0.73490477, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81235439, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15344238, + "step": 6032, + "time_per_iteration": 2.5194106101989746 + }, + { + "auxiliary_loss_clip": 0.06471337, + "auxiliary_loss_mlp": 0.01274047, + "balance_loss_clip": 0.06290144, + "balance_loss_mlp": 0.01258776, + "epoch": 0.362723583345859, + "flos": 18228044401920.0, + "grad_norm": 1.8908046818451942, + "language_loss": 0.78089464, + "learning_rate": 2.946816107593884e-06, + "loss": 0.85834849, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15270996, + "step": 6033, + "time_per_iteration": 2.6062612533569336 + }, + { + "auxiliary_loss_clip": 0.06434236, + "auxiliary_loss_mlp": 0.01267532, + "balance_loss_clip": 0.06350702, + "balance_loss_mlp": 0.01264055, + "epoch": 0.362783706598527, + "flos": 68519307456000.0, + "grad_norm": 0.7613876705351186, + "language_loss": 0.64809752, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.72511524, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.03485107, + "step": 6034, + "time_per_iteration": 3.216454267501831 + }, + { + "auxiliary_loss_clip": 0.06466131, + "auxiliary_loss_mlp": 0.01276184, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01260222, + "epoch": 0.36284382985119495, + "flos": 26583535117440.0, + "grad_norm": 2.053623051898619, + "language_loss": 0.89456552, + "learning_rate": 2.946129926425273e-06, + "loss": 0.97198874, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.15966797, + "step": 6035, + "time_per_iteration": 2.5606629848480225 + }, + { + "auxiliary_loss_clip": 0.06479318, + "auxiliary_loss_mlp": 0.01272395, + "balance_loss_clip": 0.06295764, + "balance_loss_mlp": 0.0125592, + "epoch": 0.3629039531038629, + "flos": 20162919358080.0, + "grad_norm": 1.7740824971358589, + "language_loss": 0.73855877, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.81607592, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.16455078, + "step": 6036, + "time_per_iteration": 2.5144500732421875 + }, + { + "auxiliary_loss_clip": 0.06482191, + "auxiliary_loss_mlp": 0.01272832, + "balance_loss_clip": 0.06296846, + "balance_loss_mlp": 0.01256823, + "epoch": 0.3629640763565309, + "flos": 18631838027520.0, + "grad_norm": 1.8050884717083873, + "language_loss": 0.76438695, + "learning_rate": 2.945443601747297e-06, + "loss": 0.84193718, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.16015625, + "step": 6037, + "time_per_iteration": 2.5286643505096436 + }, + { + "auxiliary_loss_clip": 0.06467965, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 0.06292737, + "balance_loss_mlp": 0.01262546, + "epoch": 0.36302419960919885, + "flos": 19577256445440.0, + "grad_norm": 1.633141884703147, + "language_loss": 0.78871524, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86617458, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1541748, + "step": 6038, + "time_per_iteration": 2.5062947273254395 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 0.06318134, + "balance_loss_mlp": 0.01257723, + "epoch": 0.3630843228618668, + "flos": 63817805589120.0, + "grad_norm": 0.8140528620617334, + "language_loss": 0.63225597, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.70887518, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.03512573, + "step": 6039, + "time_per_iteration": 3.269761323928833 + }, + { + "auxiliary_loss_clip": 0.06467007, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06289599, + "balance_loss_mlp": 0.01253932, + "epoch": 0.3631444461145348, + "flos": 21841600855680.0, + "grad_norm": 2.592040544468795, + "language_loss": 0.71409321, + "learning_rate": 2.944413845878002e-06, + "loss": 0.79146034, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15783691, + "step": 6040, + "time_per_iteration": 2.5549709796905518 + }, + { + "auxiliary_loss_clip": 0.06477243, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 0.06293249, + "balance_loss_mlp": 0.01260277, + "epoch": 0.36320456936720275, + "flos": 21727850538240.0, + "grad_norm": 1.6745525965006305, + "language_loss": 0.81387192, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89140832, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.16113281, + "step": 6041, + "time_per_iteration": 2.529555320739746 + }, + { + "auxiliary_loss_clip": 0.06473525, + "auxiliary_loss_mlp": 0.01278326, + "balance_loss_clip": 0.0629223, + "balance_loss_mlp": 0.01261291, + "epoch": 0.3632646926198707, + "flos": 17024713516800.0, + "grad_norm": 3.0330286867158547, + "language_loss": 0.8477391, + "learning_rate": 2.943727162882107e-06, + "loss": 0.92525762, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.17016602, + "step": 6042, + "time_per_iteration": 2.52242112159729 + }, + { + "auxiliary_loss_clip": 0.06469671, + "auxiliary_loss_mlp": 0.01277961, + "balance_loss_clip": 0.06290909, + "balance_loss_mlp": 0.01261892, + "epoch": 0.36332481587253873, + "flos": 23337868014720.0, + "grad_norm": 1.7311470578574424, + "language_loss": 0.78563523, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.86311156, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.16088867, + "step": 6043, + "time_per_iteration": 2.5507187843322754 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.0127573, + "balance_loss_clip": 0.06289753, + "balance_loss_mlp": 0.01258755, + "epoch": 0.3633849391252067, + "flos": 10748134126080.0, + "grad_norm": 2.0752100798218245, + "language_loss": 0.66141021, + "learning_rate": 2.943040336741298e-06, + "loss": 0.73882145, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16967773, + "step": 6044, + "time_per_iteration": 2.5431315898895264 + }, + { + "auxiliary_loss_clip": 0.06470387, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06293066, + "balance_loss_mlp": 0.01258794, + "epoch": 0.36344506237787466, + "flos": 25856351458560.0, + "grad_norm": 1.7019744870222642, + "language_loss": 0.81317604, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89061964, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15185547, + "step": 6045, + "time_per_iteration": 2.578608274459839 + }, + { + "auxiliary_loss_clip": 0.06471765, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 0.06291001, + "balance_loss_mlp": 0.01260977, + "epoch": 0.3635051856305426, + "flos": 30161900056320.0, + "grad_norm": 1.9031490691130954, + "language_loss": 0.64869618, + "learning_rate": 2.942353367559755e-06, + "loss": 0.72618413, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16064453, + "step": 6046, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.06469898, + "auxiliary_loss_mlp": 0.01279877, + "balance_loss_clip": 0.06291277, + "balance_loss_mlp": 0.01264082, + "epoch": 0.3635653088832106, + "flos": 22204626670080.0, + "grad_norm": 1.4883910134219482, + "language_loss": 0.77790976, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.85540754, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15783691, + "step": 6047, + "time_per_iteration": 2.59384822845459 + }, + { + "auxiliary_loss_clip": 0.06482202, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 0.0629375, + "balance_loss_mlp": 0.01259794, + "epoch": 0.36362543213587856, + "flos": 24793409289600.0, + "grad_norm": 2.402065763679051, + "language_loss": 0.79315472, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87075114, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.1763916, + "step": 6048, + "time_per_iteration": 2.586355447769165 + }, + { + "auxiliary_loss_clip": 0.06388409, + "auxiliary_loss_mlp": 0.01275978, + "balance_loss_clip": 0.06308184, + "balance_loss_mlp": 0.01272211, + "epoch": 0.3636855553885465, + "flos": 62547320056320.0, + "grad_norm": 0.756250652706744, + "language_loss": 0.52505761, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.6017015, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.03759766, + "step": 6049, + "time_per_iteration": 3.1991608142852783 + }, + { + "auxiliary_loss_clip": 0.06471006, + "auxiliary_loss_mlp": 0.01281005, + "balance_loss_clip": 0.06289691, + "balance_loss_mlp": 0.01264518, + "epoch": 0.3637456786412145, + "flos": 24067441514880.0, + "grad_norm": 1.9518715754512581, + "language_loss": 0.8677333, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.94525343, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16467285, + "step": 6050, + "time_per_iteration": 2.619880437850952 + }, + { + "auxiliary_loss_clip": 0.06465575, + "auxiliary_loss_mlp": 0.01288294, + "balance_loss_clip": 0.06288004, + "balance_loss_mlp": 0.01271784, + "epoch": 0.36380580189388245, + "flos": 16697214633600.0, + "grad_norm": 2.0514222430242937, + "language_loss": 0.78671187, + "learning_rate": 2.940635319486546e-06, + "loss": 0.86425054, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.16491699, + "step": 6051, + "time_per_iteration": 2.5192694664001465 + }, + { + "auxiliary_loss_clip": 0.064697, + "auxiliary_loss_mlp": 0.0128748, + "balance_loss_clip": 0.06289212, + "balance_loss_mlp": 0.01271315, + "epoch": 0.3638659251465504, + "flos": 25120279267200.0, + "grad_norm": 2.1218426019343943, + "language_loss": 0.82423818, + "learning_rate": 2.940291602812822e-06, + "loss": 0.90180993, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16174316, + "step": 6052, + "time_per_iteration": 2.6190178394317627 + }, + { + "auxiliary_loss_clip": 0.06462704, + "auxiliary_loss_mlp": 0.01293914, + "balance_loss_clip": 0.06289209, + "balance_loss_mlp": 0.0127831, + "epoch": 0.3639260483992184, + "flos": 23009698298880.0, + "grad_norm": 1.6976848198598335, + "language_loss": 0.72702307, + "learning_rate": 2.939947850483145e-06, + "loss": 0.80458927, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.15588379, + "step": 6053, + "time_per_iteration": 2.5632545948028564 + }, + { + "auxiliary_loss_clip": 0.0637124, + "auxiliary_loss_mlp": 0.0126271, + "balance_loss_clip": 0.06291765, + "balance_loss_mlp": 0.01258046, + "epoch": 0.36398617165188635, + "flos": 70735043698560.0, + "grad_norm": 0.7367280535398725, + "language_loss": 0.61109686, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.68743634, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.04656982, + "step": 6054, + "time_per_iteration": 3.1670703887939453 + }, + { + "auxiliary_loss_clip": 0.06468257, + "auxiliary_loss_mlp": 0.01284514, + "balance_loss_clip": 0.06288631, + "balance_loss_mlp": 0.01267062, + "epoch": 0.3640462949045543, + "flos": 22241788755840.0, + "grad_norm": 2.4941401517388795, + "language_loss": 0.76399368, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.84152138, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.17456055, + "step": 6055, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.06463572, + "auxiliary_loss_mlp": 0.0128082, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01264023, + "epoch": 0.3641064181572223, + "flos": 21549964320000.0, + "grad_norm": 1.5003458585655993, + "language_loss": 0.75247842, + "learning_rate": 2.938916379688765e-06, + "loss": 0.82992232, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16784668, + "step": 6056, + "time_per_iteration": 2.548563241958618 + }, + { + "auxiliary_loss_clip": 0.06463505, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 0.06286436, + "balance_loss_mlp": 0.01271805, + "epoch": 0.3641665414098903, + "flos": 22279873236480.0, + "grad_norm": 1.8427248639079936, + "language_loss": 0.80231911, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.87983549, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16320801, + "step": 6057, + "time_per_iteration": 2.590890645980835 + }, + { + "auxiliary_loss_clip": 0.06463237, + "auxiliary_loss_mlp": 0.01288366, + "balance_loss_clip": 0.06286855, + "balance_loss_mlp": 0.01271259, + "epoch": 0.36422666466255826, + "flos": 28337211619200.0, + "grad_norm": 2.0267495677395106, + "language_loss": 0.80895132, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.88646734, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.17114258, + "step": 6058, + "time_per_iteration": 3.9912350177764893 + }, + { + "auxiliary_loss_clip": 0.06462751, + "auxiliary_loss_mlp": 0.01284352, + "balance_loss_clip": 0.06282878, + "balance_loss_mlp": 0.01267794, + "epoch": 0.36428678791522623, + "flos": 24177376471680.0, + "grad_norm": 1.829086801108262, + "language_loss": 0.84467566, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.9221468, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16552734, + "step": 6059, + "time_per_iteration": 3.9484288692474365 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01290231, + "balance_loss_clip": 0.06284287, + "balance_loss_mlp": 0.01274006, + "epoch": 0.3643469111678942, + "flos": 22535018519040.0, + "grad_norm": 1.8662633122766634, + "language_loss": 0.88296366, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96050501, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16223145, + "step": 6060, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.06469811, + "auxiliary_loss_mlp": 0.01278183, + "balance_loss_clip": 0.06287585, + "balance_loss_mlp": 0.01260611, + "epoch": 0.36440703442056216, + "flos": 19432549244160.0, + "grad_norm": 2.050716636944588, + "language_loss": 0.66968513, + "learning_rate": 2.937196549795971e-06, + "loss": 0.74716496, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.17578125, + "step": 6061, + "time_per_iteration": 2.4934303760528564 + }, + { + "auxiliary_loss_clip": 0.06472699, + "auxiliary_loss_mlp": 0.01276187, + "balance_loss_clip": 0.06290831, + "balance_loss_mlp": 0.01259283, + "epoch": 0.3644671576732301, + "flos": 18046300896000.0, + "grad_norm": 2.6099029342135838, + "language_loss": 0.76223081, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.83971971, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.16918945, + "step": 6062, + "time_per_iteration": 2.5342442989349365 + }, + { + "auxiliary_loss_clip": 0.06462175, + "auxiliary_loss_mlp": 0.01277866, + "balance_loss_clip": 0.06284274, + "balance_loss_mlp": 0.01261844, + "epoch": 0.3645272809258981, + "flos": 21549125779200.0, + "grad_norm": 1.679264330509425, + "language_loss": 0.7250427, + "learning_rate": 2.936508368977432e-06, + "loss": 0.80244315, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16027832, + "step": 6063, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.06463223, + "auxiliary_loss_mlp": 0.01278838, + "balance_loss_clip": 0.0628884, + "balance_loss_mlp": 0.0126256, + "epoch": 0.36458740417856605, + "flos": 22753379059200.0, + "grad_norm": 1.9927269992491163, + "language_loss": 0.67982519, + "learning_rate": 2.936164225292901e-06, + "loss": 0.75724578, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.16265869, + "step": 6064, + "time_per_iteration": 4.001475095748901 + }, + { + "auxiliary_loss_clip": 0.06469691, + "auxiliary_loss_mlp": 0.01281677, + "balance_loss_clip": 0.06288914, + "balance_loss_mlp": 0.01265131, + "epoch": 0.364647527431234, + "flos": 26147862213120.0, + "grad_norm": 2.2981357468080725, + "language_loss": 0.75006247, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.82757616, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.16540527, + "step": 6065, + "time_per_iteration": 2.557175397872925 + }, + { + "auxiliary_loss_clip": 0.06475934, + "auxiliary_loss_mlp": 0.01274844, + "balance_loss_clip": 0.06292161, + "balance_loss_mlp": 0.01257487, + "epoch": 0.364707650683902, + "flos": 31037941693440.0, + "grad_norm": 1.8804228270875918, + "language_loss": 0.75913531, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.8366431, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.17370605, + "step": 6066, + "time_per_iteration": 4.028696537017822 + }, + { + "auxiliary_loss_clip": 0.06465262, + "auxiliary_loss_mlp": 0.01275132, + "balance_loss_clip": 0.06290717, + "balance_loss_mlp": 0.01260124, + "epoch": 0.36476777393656995, + "flos": 19578933527040.0, + "grad_norm": 2.1324188585544293, + "language_loss": 0.77645338, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.85385728, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15014648, + "step": 6067, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.06463823, + "auxiliary_loss_mlp": 0.01273764, + "balance_loss_clip": 0.06289702, + "balance_loss_mlp": 0.01258684, + "epoch": 0.3648278971892379, + "flos": 17754622433280.0, + "grad_norm": 1.930394247385299, + "language_loss": 0.71678597, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7941618, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15063477, + "step": 6068, + "time_per_iteration": 2.4845492839813232 + }, + { + "auxiliary_loss_clip": 0.06473656, + "auxiliary_loss_mlp": 0.0127485, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.01258005, + "epoch": 0.3648880204419059, + "flos": 17936952917760.0, + "grad_norm": 1.8532098574136342, + "language_loss": 0.73989958, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.8173846, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.16845703, + "step": 6069, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.06469753, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 0.06287999, + "balance_loss_mlp": 0.01261684, + "epoch": 0.3649481436945739, + "flos": 22644911548800.0, + "grad_norm": 1.9157179359535086, + "language_loss": 0.66736126, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.74483597, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.16027832, + "step": 6070, + "time_per_iteration": 2.516735076904297 + }, + { + "auxiliary_loss_clip": 0.06467332, + "auxiliary_loss_mlp": 0.01274362, + "balance_loss_clip": 0.06291667, + "balance_loss_mlp": 0.01259169, + "epoch": 0.36500826694724187, + "flos": 21586036302720.0, + "grad_norm": 1.8858284323375742, + "language_loss": 0.7453323, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.82274926, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.1519165, + "step": 6071, + "time_per_iteration": 2.566274642944336 + }, + { + "auxiliary_loss_clip": 0.06468312, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.0629068, + "balance_loss_mlp": 0.0125703, + "epoch": 0.36506839019990983, + "flos": 13777746675840.0, + "grad_norm": 1.7184690359068113, + "language_loss": 0.88681865, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.96422982, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.15771484, + "step": 6072, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.06471045, + "auxiliary_loss_mlp": 0.01276068, + "balance_loss_clip": 0.06292107, + "balance_loss_mlp": 0.01260285, + "epoch": 0.3651285134525778, + "flos": 17280739267200.0, + "grad_norm": 2.591250971390436, + "language_loss": 0.72601849, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.80348963, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15771484, + "step": 6073, + "time_per_iteration": 2.5448079109191895 + }, + { + "auxiliary_loss_clip": 0.06476631, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06296042, + "balance_loss_mlp": 0.0125422, + "epoch": 0.36518863670524576, + "flos": 21914415653760.0, + "grad_norm": 2.188049192517554, + "language_loss": 0.66876209, + "learning_rate": 2.932720838132236e-06, + "loss": 0.74623442, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16394043, + "step": 6074, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.06466351, + "auxiliary_loss_mlp": 0.01270864, + "balance_loss_clip": 0.06289779, + "balance_loss_mlp": 0.01255319, + "epoch": 0.3652487599579137, + "flos": 27128933343360.0, + "grad_norm": 1.455377552522792, + "language_loss": 0.73552799, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.81290013, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15551758, + "step": 6075, + "time_per_iteration": 2.5611414909362793 + }, + { + "auxiliary_loss_clip": 0.06476435, + "auxiliary_loss_mlp": 0.01270879, + "balance_loss_clip": 0.06292082, + "balance_loss_mlp": 0.01255107, + "epoch": 0.3653088832105817, + "flos": 19761683281920.0, + "grad_norm": 3.551310730384351, + "language_loss": 0.89872956, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97620273, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 6076, + "time_per_iteration": 2.491070508956909 + }, + { + "auxiliary_loss_clip": 0.06471214, + "auxiliary_loss_mlp": 0.01269524, + "balance_loss_clip": 0.06294619, + "balance_loss_mlp": 0.01253782, + "epoch": 0.36536900646324966, + "flos": 13119981724800.0, + "grad_norm": 1.9522812947590364, + "language_loss": 0.69894624, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7763536, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15740967, + "step": 6077, + "time_per_iteration": 2.5298445224761963 + }, + { + "auxiliary_loss_clip": 0.06367216, + "auxiliary_loss_mlp": 0.01255974, + "balance_loss_clip": 0.06288684, + "balance_loss_mlp": 0.0125196, + "epoch": 0.3654291297159176, + "flos": 71122848393600.0, + "grad_norm": 0.715882721223993, + "language_loss": 0.61670828, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.69294018, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.04013062, + "step": 6078, + "time_per_iteration": 3.245680093765259 + }, + { + "auxiliary_loss_clip": 0.06468864, + "auxiliary_loss_mlp": 0.01269715, + "balance_loss_clip": 0.0628942, + "balance_loss_mlp": 0.01254217, + "epoch": 0.3654892529685856, + "flos": 23623299348480.0, + "grad_norm": 2.6954686860737427, + "language_loss": 0.78565228, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86303806, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.1550293, + "step": 6079, + "time_per_iteration": 2.5243916511535645 + }, + { + "auxiliary_loss_clip": 0.06474455, + "auxiliary_loss_mlp": 0.0127227, + "balance_loss_clip": 0.06293908, + "balance_loss_mlp": 0.01255557, + "epoch": 0.36554937622125355, + "flos": 43480788174720.0, + "grad_norm": 2.827080544182906, + "language_loss": 0.62854588, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.70601308, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16711426, + "step": 6080, + "time_per_iteration": 2.755979299545288 + }, + { + "auxiliary_loss_clip": 0.06473932, + "auxiliary_loss_mlp": 0.01273454, + "balance_loss_clip": 0.06292675, + "balance_loss_mlp": 0.012568, + "epoch": 0.3656094994739215, + "flos": 23301334834560.0, + "grad_norm": 2.0380719718304046, + "language_loss": 0.68215913, + "learning_rate": 2.930308361895352e-06, + "loss": 0.75963295, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16674805, + "step": 6081, + "time_per_iteration": 2.5318713188171387 + }, + { + "auxiliary_loss_clip": 0.06476995, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 0.06289314, + "balance_loss_mlp": 0.01267021, + "epoch": 0.3656696227265895, + "flos": 24578947964160.0, + "grad_norm": 1.6214502004720641, + "language_loss": 0.75242162, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.83002377, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.1619873, + "step": 6082, + "time_per_iteration": 2.614473819732666 + }, + { + "auxiliary_loss_clip": 0.06474194, + "auxiliary_loss_mlp": 0.0127049, + "balance_loss_clip": 0.06295186, + "balance_loss_mlp": 0.01255851, + "epoch": 0.3657297459792575, + "flos": 27935849761920.0, + "grad_norm": 4.519769037138984, + "language_loss": 0.83192384, + "learning_rate": 2.929618765277987e-06, + "loss": 0.90937066, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.14660645, + "step": 6083, + "time_per_iteration": 2.569382429122925 + }, + { + "auxiliary_loss_clip": 0.06373743, + "auxiliary_loss_mlp": 0.01258609, + "balance_loss_clip": 0.06293802, + "balance_loss_mlp": 0.01254855, + "epoch": 0.36578986923192547, + "flos": 67410566231040.0, + "grad_norm": 0.7897440828264927, + "language_loss": 0.59315842, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.66948193, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.03747559, + "step": 6084, + "time_per_iteration": 3.2453150749206543 + }, + { + "auxiliary_loss_clip": 0.0646999, + "auxiliary_loss_mlp": 0.0127453, + "balance_loss_clip": 0.06292025, + "balance_loss_mlp": 0.01258801, + "epoch": 0.36584999248459343, + "flos": 20233302387840.0, + "grad_norm": 1.9605927592145687, + "language_loss": 0.73469806, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81214333, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15734863, + "step": 6085, + "time_per_iteration": 2.5149080753326416 + }, + { + "auxiliary_loss_clip": 0.06475443, + "auxiliary_loss_mlp": 0.01272781, + "balance_loss_clip": 0.06296027, + "balance_loss_mlp": 0.01256974, + "epoch": 0.3659101157372614, + "flos": 19068475253760.0, + "grad_norm": 1.7755618246241633, + "language_loss": 0.78367889, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86116111, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15802002, + "step": 6086, + "time_per_iteration": 2.6959855556488037 + }, + { + "auxiliary_loss_clip": 0.06460601, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.0628686, + "balance_loss_mlp": 0.01262449, + "epoch": 0.36597023898992936, + "flos": 30818658758400.0, + "grad_norm": 2.7333963743808387, + "language_loss": 0.77419388, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.85157609, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15185547, + "step": 6087, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.06470397, + "auxiliary_loss_mlp": 0.01281375, + "balance_loss_clip": 0.06288096, + "balance_loss_mlp": 0.0126543, + "epoch": 0.36603036224259733, + "flos": 20528041524480.0, + "grad_norm": 2.0856395013908005, + "language_loss": 0.70779794, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.78531569, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.15948486, + "step": 6088, + "time_per_iteration": 2.5904111862182617 + }, + { + "auxiliary_loss_clip": 0.064822, + "auxiliary_loss_mlp": 0.0127711, + "balance_loss_clip": 0.06290494, + "balance_loss_mlp": 0.01258835, + "epoch": 0.3660904854952653, + "flos": 38339043356160.0, + "grad_norm": 1.5018444157956148, + "language_loss": 0.8073988, + "learning_rate": 2.92754912981472e-06, + "loss": 0.88499188, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.18273926, + "step": 6089, + "time_per_iteration": 2.695387125015259 + }, + { + "auxiliary_loss_clip": 0.06466638, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06289521, + "balance_loss_mlp": 0.01254065, + "epoch": 0.36615060874793326, + "flos": 21842062053120.0, + "grad_norm": 1.783943984741075, + "language_loss": 0.71745276, + "learning_rate": 2.927204067389884e-06, + "loss": 0.79480195, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14208984, + "step": 6090, + "time_per_iteration": 2.5730583667755127 + }, + { + "auxiliary_loss_clip": 0.06467035, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06292006, + "balance_loss_mlp": 0.01254585, + "epoch": 0.3662107320006012, + "flos": 16587153895680.0, + "grad_norm": 1.8168526275922985, + "language_loss": 0.74269617, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82006675, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.1541748, + "step": 6091, + "time_per_iteration": 2.5094668865203857 + }, + { + "auxiliary_loss_clip": 0.06470925, + "auxiliary_loss_mlp": 0.01271934, + "balance_loss_clip": 0.062924, + "balance_loss_mlp": 0.01256699, + "epoch": 0.3662708552532692, + "flos": 20964469115520.0, + "grad_norm": 2.9410218249320796, + "language_loss": 0.72888803, + "learning_rate": 2.926513837074284e-06, + "loss": 0.80631661, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15234375, + "step": 6092, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.06472248, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06288992, + "balance_loss_mlp": 0.01260833, + "epoch": 0.36633097850593715, + "flos": 21908252378880.0, + "grad_norm": 2.382181592286333, + "language_loss": 0.78829455, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.86578685, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.16174316, + "step": 6093, + "time_per_iteration": 2.519925355911255 + }, + { + "auxiliary_loss_clip": 0.06470528, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.06288898, + "balance_loss_mlp": 0.0125743, + "epoch": 0.3663911017586051, + "flos": 32862462422400.0, + "grad_norm": 1.6789792555665461, + "language_loss": 0.74561131, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82304573, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.15478516, + "step": 6094, + "time_per_iteration": 2.6374077796936035 + }, + { + "auxiliary_loss_clip": 0.06470601, + "auxiliary_loss_mlp": 0.01277645, + "balance_loss_clip": 0.06289363, + "balance_loss_mlp": 0.01261421, + "epoch": 0.3664512250112731, + "flos": 27279132986880.0, + "grad_norm": 1.6273421100585188, + "language_loss": 0.7975142, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87499666, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.16223145, + "step": 6095, + "time_per_iteration": 2.565009117126465 + }, + { + "auxiliary_loss_clip": 0.06480707, + "auxiliary_loss_mlp": 0.01275122, + "balance_loss_clip": 0.06295107, + "balance_loss_mlp": 0.01258552, + "epoch": 0.3665113482639411, + "flos": 17790065510400.0, + "grad_norm": 2.4875649346087725, + "language_loss": 0.73963505, + "learning_rate": 2.925132954945834e-06, + "loss": 0.81719339, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.16577148, + "step": 6096, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06474067, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 0.06288943, + "balance_loss_mlp": 0.01255901, + "epoch": 0.36657147151660907, + "flos": 27861944860800.0, + "grad_norm": 1.9533584433338151, + "language_loss": 0.67592847, + "learning_rate": 2.924787646678155e-06, + "loss": 0.75338453, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.15649414, + "step": 6097, + "time_per_iteration": 4.085919618606567 + }, + { + "auxiliary_loss_clip": 0.06474558, + "auxiliary_loss_mlp": 0.01273059, + "balance_loss_clip": 0.06292384, + "balance_loss_mlp": 0.01257204, + "epoch": 0.36663159476927704, + "flos": 25381000846080.0, + "grad_norm": 1.4284875999183062, + "language_loss": 0.77924675, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85672289, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.15856934, + "step": 6098, + "time_per_iteration": 4.075935363769531 + }, + { + "auxiliary_loss_clip": 0.06469452, + "auxiliary_loss_mlp": 0.01270135, + "balance_loss_clip": 0.06291129, + "balance_loss_mlp": 0.01254751, + "epoch": 0.366691718021945, + "flos": 21362979934080.0, + "grad_norm": 2.6338542151665862, + "language_loss": 0.73907244, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.15386963, + "step": 6099, + "time_per_iteration": 2.5343947410583496 + }, + { + "auxiliary_loss_clip": 0.06462912, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06286579, + "balance_loss_mlp": 0.01256695, + "epoch": 0.36675184127461297, + "flos": 16806017560320.0, + "grad_norm": 1.7024924966611934, + "language_loss": 0.84795189, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.92529464, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14660645, + "step": 6100, + "time_per_iteration": 2.5503897666931152 + }, + { + "auxiliary_loss_clip": 0.06478457, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06293124, + "balance_loss_mlp": 0.0125216, + "epoch": 0.36681196452728093, + "flos": 21912696645120.0, + "grad_norm": 2.268106387872694, + "language_loss": 0.712331, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.78979969, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.16235352, + "step": 6101, + "time_per_iteration": 2.5698294639587402 + }, + { + "auxiliary_loss_clip": 0.06474541, + "auxiliary_loss_mlp": 0.01273553, + "balance_loss_clip": 0.0629383, + "balance_loss_mlp": 0.01257137, + "epoch": 0.3668720877799489, + "flos": 17718215034240.0, + "grad_norm": 2.179497141372214, + "language_loss": 0.76701671, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.84449768, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.16418457, + "step": 6102, + "time_per_iteration": 2.653047561645508 + }, + { + "auxiliary_loss_clip": 0.06477299, + "auxiliary_loss_mlp": 0.01279444, + "balance_loss_clip": 0.06290299, + "balance_loss_mlp": 0.01262099, + "epoch": 0.36693221103261686, + "flos": 47055882804480.0, + "grad_norm": 1.641444039565929, + "language_loss": 0.70188046, + "learning_rate": 2.922715061101625e-06, + "loss": 0.77944791, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.17333984, + "step": 6103, + "time_per_iteration": 2.7502424716949463 + }, + { + "auxiliary_loss_clip": 0.06472746, + "auxiliary_loss_mlp": 0.01272056, + "balance_loss_clip": 0.06290279, + "balance_loss_mlp": 0.01255581, + "epoch": 0.3669923342852848, + "flos": 15966383322240.0, + "grad_norm": 1.6662921664183201, + "language_loss": 0.71920598, + "learning_rate": 2.922369507632716e-06, + "loss": 0.79665399, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.16467285, + "step": 6104, + "time_per_iteration": 3.993805408477783 + }, + { + "auxiliary_loss_clip": 0.0647142, + "auxiliary_loss_mlp": 0.01272456, + "balance_loss_clip": 0.06291486, + "balance_loss_mlp": 0.01256494, + "epoch": 0.3670524575379528, + "flos": 19980630800640.0, + "grad_norm": 1.7978052174853272, + "language_loss": 0.81448174, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.89192045, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15966797, + "step": 6105, + "time_per_iteration": 3.907820463180542 + }, + { + "auxiliary_loss_clip": 0.06477002, + "auxiliary_loss_mlp": 0.01272813, + "balance_loss_clip": 0.06288886, + "balance_loss_mlp": 0.01254896, + "epoch": 0.36711258079062076, + "flos": 25710092956800.0, + "grad_norm": 1.7139492182529468, + "language_loss": 0.81421959, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89171767, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.17919922, + "step": 6106, + "time_per_iteration": 2.5623860359191895 + }, + { + "auxiliary_loss_clip": 0.06422871, + "auxiliary_loss_mlp": 0.01259281, + "balance_loss_clip": 0.06342293, + "balance_loss_mlp": 0.01254903, + "epoch": 0.3671727040432887, + "flos": 60793014648960.0, + "grad_norm": 0.6928078159632836, + "language_loss": 0.59215379, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.66897523, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04385376, + "step": 6107, + "time_per_iteration": 3.2451207637786865 + }, + { + "auxiliary_loss_clip": 0.06468046, + "auxiliary_loss_mlp": 0.01273048, + "balance_loss_clip": 0.06291793, + "balance_loss_mlp": 0.01257396, + "epoch": 0.3672328272959567, + "flos": 18667281104640.0, + "grad_norm": 1.5826982165866754, + "language_loss": 0.74750638, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82491726, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15631104, + "step": 6108, + "time_per_iteration": 2.5317509174346924 + }, + { + "auxiliary_loss_clip": 0.06472465, + "auxiliary_loss_mlp": 0.01275992, + "balance_loss_clip": 0.06289458, + "balance_loss_mlp": 0.01260482, + "epoch": 0.3672929505486247, + "flos": 15054395483520.0, + "grad_norm": 2.0251921146130547, + "language_loss": 0.74524188, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.82272649, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.15490723, + "step": 6109, + "time_per_iteration": 2.530214309692383 + }, + { + "auxiliary_loss_clip": 0.06464404, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 0.06286883, + "balance_loss_mlp": 0.0125503, + "epoch": 0.3673530738012927, + "flos": 20594693047680.0, + "grad_norm": 1.6431777634434088, + "language_loss": 0.53560948, + "learning_rate": 2.920295452774744e-06, + "loss": 0.61295497, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15112305, + "step": 6110, + "time_per_iteration": 2.5247035026550293 + }, + { + "auxiliary_loss_clip": 0.06459565, + "auxiliary_loss_mlp": 0.01275062, + "balance_loss_clip": 0.06284792, + "balance_loss_mlp": 0.01258957, + "epoch": 0.36741319705396064, + "flos": 21696348602880.0, + "grad_norm": 1.814369900920369, + "language_loss": 0.80767608, + "learning_rate": 2.919949654746672e-06, + "loss": 0.8850224, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.16088867, + "step": 6111, + "time_per_iteration": 2.6213719844818115 + }, + { + "auxiliary_loss_clip": 0.06459287, + "auxiliary_loss_mlp": 0.01273038, + "balance_loss_clip": 0.06284556, + "balance_loss_mlp": 0.01256861, + "epoch": 0.3674733203066286, + "flos": 29870011958400.0, + "grad_norm": 1.7131296557309772, + "language_loss": 0.72860467, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80592787, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.16174316, + "step": 6112, + "time_per_iteration": 2.656101703643799 + }, + { + "auxiliary_loss_clip": 0.06459092, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06283998, + "balance_loss_mlp": 0.01257866, + "epoch": 0.36753344355929657, + "flos": 18262439303040.0, + "grad_norm": 1.5099687925303509, + "language_loss": 0.85667342, + "learning_rate": 2.919257954049892e-06, + "loss": 0.93399429, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15124512, + "step": 6113, + "time_per_iteration": 2.5230536460876465 + }, + { + "auxiliary_loss_clip": 0.06460717, + "auxiliary_loss_mlp": 0.01276985, + "balance_loss_clip": 0.06281444, + "balance_loss_mlp": 0.01260439, + "epoch": 0.36759356681196453, + "flos": 25308144120960.0, + "grad_norm": 1.9025835930032806, + "language_loss": 0.78706479, + "learning_rate": 2.918912051407413e-06, + "loss": 0.86444181, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.16540527, + "step": 6114, + "time_per_iteration": 2.6091229915618896 + }, + { + "auxiliary_loss_clip": 0.06466475, + "auxiliary_loss_mlp": 0.01272915, + "balance_loss_clip": 0.0628548, + "balance_loss_mlp": 0.01255725, + "epoch": 0.3676536900646325, + "flos": 21039338338560.0, + "grad_norm": 1.6305517572579116, + "language_loss": 0.67626929, + "learning_rate": 2.918566113919698e-06, + "loss": 0.75366318, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.17199707, + "step": 6115, + "time_per_iteration": 2.5226221084594727 + }, + { + "auxiliary_loss_clip": 0.06454025, + "auxiliary_loss_mlp": 0.01272139, + "balance_loss_clip": 0.06280309, + "balance_loss_mlp": 0.01257077, + "epoch": 0.36771381331730046, + "flos": 16293882205440.0, + "grad_norm": 2.2835896682412105, + "language_loss": 0.76996851, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.84723008, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15063477, + "step": 6116, + "time_per_iteration": 2.504951238632202 + }, + { + "auxiliary_loss_clip": 0.06459618, + "auxiliary_loss_mlp": 0.01274615, + "balance_loss_clip": 0.06282905, + "balance_loss_mlp": 0.01259153, + "epoch": 0.36777393656996843, + "flos": 22316574124800.0, + "grad_norm": 1.8264539284878285, + "language_loss": 0.62890095, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.70624328, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15454102, + "step": 6117, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.06458353, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06285255, + "balance_loss_mlp": 0.01254749, + "epoch": 0.3678340598226364, + "flos": 26841405657600.0, + "grad_norm": 1.7359331247938332, + "language_loss": 0.73532575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.81261057, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6118, + "time_per_iteration": 2.6261374950408936 + }, + { + "auxiliary_loss_clip": 0.06469986, + "auxiliary_loss_mlp": 0.01276003, + "balance_loss_clip": 0.06289329, + "balance_loss_mlp": 0.01259707, + "epoch": 0.36789418307530436, + "flos": 21768073297920.0, + "grad_norm": 1.5781425493049515, + "language_loss": 0.73047614, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.80793607, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.16320801, + "step": 6119, + "time_per_iteration": 2.5320048332214355 + }, + { + "auxiliary_loss_clip": 0.06466002, + "auxiliary_loss_mlp": 0.0127303, + "balance_loss_clip": 0.06290065, + "balance_loss_mlp": 0.0125789, + "epoch": 0.3679543063279723, + "flos": 15929598579840.0, + "grad_norm": 2.0565678381587307, + "language_loss": 0.8018201, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.87921047, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15136719, + "step": 6120, + "time_per_iteration": 2.5085418224334717 + }, + { + "auxiliary_loss_clip": 0.06467941, + "auxiliary_loss_mlp": 0.01276389, + "balance_loss_clip": 0.0629365, + "balance_loss_mlp": 0.01260868, + "epoch": 0.3680144295806403, + "flos": 24281693205120.0, + "grad_norm": 2.0719591239633703, + "language_loss": 0.64803445, + "learning_rate": 2.916489757978126e-06, + "loss": 0.72547781, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15515137, + "step": 6121, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.06466727, + "auxiliary_loss_mlp": 0.01268749, + "balance_loss_clip": 0.06293779, + "balance_loss_mlp": 0.01254527, + "epoch": 0.36807455283330826, + "flos": 26111329032960.0, + "grad_norm": 1.9648479350594452, + "language_loss": 0.71416938, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.79152405, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14221191, + "step": 6122, + "time_per_iteration": 2.5836074352264404 + }, + { + "auxiliary_loss_clip": 0.06461313, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.0125831, + "epoch": 0.3681346760859763, + "flos": 24651972397440.0, + "grad_norm": 1.8972357597085572, + "language_loss": 0.69858962, + "learning_rate": 2.915797361163875e-06, + "loss": 0.77593577, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15002441, + "step": 6123, + "time_per_iteration": 2.5574307441711426 + }, + { + "auxiliary_loss_clip": 0.06474412, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06293641, + "balance_loss_mlp": 0.01256094, + "epoch": 0.36819479933864424, + "flos": 23885152957440.0, + "grad_norm": 2.796866262853862, + "language_loss": 0.74766016, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.8251307, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.16540527, + "step": 6124, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.06470435, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 0.06295419, + "balance_loss_mlp": 0.01258116, + "epoch": 0.3682549225913122, + "flos": 25560606072960.0, + "grad_norm": 3.2532876436035236, + "language_loss": 0.74467599, + "learning_rate": 2.915104825441114e-06, + "loss": 0.82212794, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16625977, + "step": 6125, + "time_per_iteration": 2.5822880268096924 + }, + { + "auxiliary_loss_clip": 0.06476732, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 0.06296605, + "balance_loss_mlp": 0.01253967, + "epoch": 0.36831504584398017, + "flos": 16952317989120.0, + "grad_norm": 1.938795434914092, + "language_loss": 0.7843706, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.86184579, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.16809082, + "step": 6126, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.06483818, + "auxiliary_loss_mlp": 0.01275366, + "balance_loss_clip": 0.06301596, + "balance_loss_mlp": 0.01257413, + "epoch": 0.36837516909664814, + "flos": 19871198968320.0, + "grad_norm": 2.3034543329783173, + "language_loss": 0.66139042, + "learning_rate": 2.914412150914888e-06, + "loss": 0.73898232, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.17980957, + "step": 6127, + "time_per_iteration": 2.5208253860473633 + }, + { + "auxiliary_loss_clip": 0.06475674, + "auxiliary_loss_mlp": 0.01272228, + "balance_loss_clip": 0.06294744, + "balance_loss_mlp": 0.01256409, + "epoch": 0.3684352923493161, + "flos": 37634976224640.0, + "grad_norm": 1.7597572196634643, + "language_loss": 0.70472896, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.78220791, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.15808105, + "step": 6128, + "time_per_iteration": 2.6984474658966064 + }, + { + "auxiliary_loss_clip": 0.06467833, + "auxiliary_loss_mlp": 0.01270944, + "balance_loss_clip": 0.06293194, + "balance_loss_mlp": 0.01255613, + "epoch": 0.36849541560198407, + "flos": 14470786995840.0, + "grad_norm": 1.6868142680460214, + "language_loss": 0.7591843, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.83657211, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15332031, + "step": 6129, + "time_per_iteration": 2.49924898147583 + }, + { + "auxiliary_loss_clip": 0.06473218, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 0.06296876, + "balance_loss_mlp": 0.01255844, + "epoch": 0.36855553885465203, + "flos": 25777037969280.0, + "grad_norm": 1.6502765336301308, + "language_loss": 0.85087365, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.92831397, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.1496582, + "step": 6130, + "time_per_iteration": 2.604851484298706 + }, + { + "auxiliary_loss_clip": 0.06391466, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06313837, + "balance_loss_mlp": 0.01261091, + "epoch": 0.36861566210732, + "flos": 65071715212800.0, + "grad_norm": 0.7916436629428728, + "language_loss": 0.60275888, + "learning_rate": 2.913026385872321e-06, + "loss": 0.67931175, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02740479, + "step": 6131, + "time_per_iteration": 3.228571891784668 + }, + { + "auxiliary_loss_clip": 0.0647023, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06296837, + "balance_loss_mlp": 0.01255332, + "epoch": 0.36867578535998796, + "flos": 30962108148480.0, + "grad_norm": 1.7580055354180455, + "language_loss": 0.73204952, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.8094579, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.152771, + "step": 6132, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.06478602, + "auxiliary_loss_mlp": 0.01273616, + "balance_loss_clip": 0.0629575, + "balance_loss_mlp": 0.0125738, + "epoch": 0.3687359086126559, + "flos": 28845154270080.0, + "grad_norm": 1.8077518075699008, + "language_loss": 0.7455107, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.82303286, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.16235352, + "step": 6133, + "time_per_iteration": 2.6024398803710938 + }, + { + "auxiliary_loss_clip": 0.06463782, + "auxiliary_loss_mlp": 0.0127464, + "balance_loss_clip": 0.06292324, + "balance_loss_mlp": 0.01258618, + "epoch": 0.3687960318653239, + "flos": 21403076912640.0, + "grad_norm": 1.7721182564640174, + "language_loss": 0.7199074, + "learning_rate": 2.911986698512874e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.16027832, + "step": 6134, + "time_per_iteration": 2.646097421646118 + }, + { + "auxiliary_loss_clip": 0.0646476, + "auxiliary_loss_mlp": 0.0126875, + "balance_loss_clip": 0.06289706, + "balance_loss_mlp": 0.01252288, + "epoch": 0.36885615511799186, + "flos": 20272183482240.0, + "grad_norm": 4.124945820193244, + "language_loss": 0.7570188, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83435392, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.16455078, + "step": 6135, + "time_per_iteration": 2.6019539833068848 + }, + { + "auxiliary_loss_clip": 0.06382909, + "auxiliary_loss_mlp": 0.01256883, + "balance_loss_clip": 0.06304377, + "balance_loss_mlp": 0.0125392, + "epoch": 0.3689162783706599, + "flos": 63106317371520.0, + "grad_norm": 0.7816734524389999, + "language_loss": 0.58664352, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.66304147, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.02960205, + "step": 6136, + "time_per_iteration": 3.139789342880249 + }, + { + "auxiliary_loss_clip": 0.06465235, + "auxiliary_loss_mlp": 0.01270986, + "balance_loss_clip": 0.06292487, + "balance_loss_mlp": 0.012563, + "epoch": 0.36897640162332784, + "flos": 10966536593280.0, + "grad_norm": 2.7370945268269806, + "language_loss": 0.79547632, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8728385, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14678955, + "step": 6137, + "time_per_iteration": 3.937328577041626 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271273, + "balance_loss_clip": 0.06297816, + "balance_loss_mlp": 0.01255764, + "epoch": 0.3690365248759958, + "flos": 20710581644160.0, + "grad_norm": 1.9257362559650297, + "language_loss": 0.74479491, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.82222939, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15515137, + "step": 6138, + "time_per_iteration": 4.004723072052002 + }, + { + "auxiliary_loss_clip": 0.06475753, + "auxiliary_loss_mlp": 0.01270871, + "balance_loss_clip": 0.06296947, + "balance_loss_mlp": 0.01255827, + "epoch": 0.3690966481286638, + "flos": 31833495884160.0, + "grad_norm": 1.986271481109943, + "language_loss": 0.65762347, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.73508972, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.1505127, + "step": 6139, + "time_per_iteration": 2.621832847595215 + }, + { + "auxiliary_loss_clip": 0.06460394, + "auxiliary_loss_mlp": 0.01271698, + "balance_loss_clip": 0.06290884, + "balance_loss_mlp": 0.0125626, + "epoch": 0.36915677138133174, + "flos": 13119897870720.0, + "grad_norm": 1.9334180469367421, + "language_loss": 0.72060692, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7979278, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15429688, + "step": 6140, + "time_per_iteration": 2.542410135269165 + }, + { + "auxiliary_loss_clip": 0.06370358, + "auxiliary_loss_mlp": 0.01255246, + "balance_loss_clip": 0.06292184, + "balance_loss_mlp": 0.01252388, + "epoch": 0.3692168946339997, + "flos": 68707926996480.0, + "grad_norm": 0.7297912869343693, + "language_loss": 0.59210759, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.66836369, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02853394, + "step": 6141, + "time_per_iteration": 3.242342710494995 + }, + { + "auxiliary_loss_clip": 0.06465677, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06289662, + "balance_loss_mlp": 0.01256336, + "epoch": 0.36927701788666767, + "flos": 22024392537600.0, + "grad_norm": 1.6449420117919953, + "language_loss": 0.75489783, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.16149902, + "step": 6142, + "time_per_iteration": 2.552541732788086 + }, + { + "auxiliary_loss_clip": 0.06459697, + "auxiliary_loss_mlp": 0.01271426, + "balance_loss_clip": 0.06287819, + "balance_loss_mlp": 0.01256883, + "epoch": 0.36933714113933563, + "flos": 21842103980160.0, + "grad_norm": 2.1834908331499694, + "language_loss": 0.77180201, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84911323, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14544678, + "step": 6143, + "time_per_iteration": 3.990859031677246 + }, + { + "auxiliary_loss_clip": 0.06461622, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06289461, + "balance_loss_mlp": 0.01251565, + "epoch": 0.3693972643920036, + "flos": 23697749301120.0, + "grad_norm": 1.9416354027972629, + "language_loss": 0.82307315, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.9003436, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13867188, + "step": 6144, + "time_per_iteration": 2.5504705905914307 + }, + { + "auxiliary_loss_clip": 0.06462898, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06287374, + "balance_loss_mlp": 0.01255586, + "epoch": 0.36945738764467156, + "flos": 22863355943040.0, + "grad_norm": 2.172105123479451, + "language_loss": 0.78995448, + "learning_rate": 2.908171851365593e-06, + "loss": 0.86728209, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14282227, + "step": 6145, + "time_per_iteration": 3.9733781814575195 + }, + { + "auxiliary_loss_clip": 0.06468924, + "auxiliary_loss_mlp": 0.01271457, + "balance_loss_clip": 0.06291068, + "balance_loss_mlp": 0.01256067, + "epoch": 0.36951751089733953, + "flos": 16621213380480.0, + "grad_norm": 1.6722610276638135, + "language_loss": 0.77129662, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8487004, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15380859, + "step": 6146, + "time_per_iteration": 2.5411174297332764 + }, + { + "auxiliary_loss_clip": 0.06466483, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 0.06289164, + "balance_loss_mlp": 0.01260419, + "epoch": 0.3695776341500075, + "flos": 18920204254080.0, + "grad_norm": 1.6293394058894772, + "language_loss": 0.81346822, + "learning_rate": 2.907477794586761e-06, + "loss": 0.89089251, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1550293, + "step": 6147, + "time_per_iteration": 2.5456924438476562 + }, + { + "auxiliary_loss_clip": 0.06463629, + "auxiliary_loss_mlp": 0.01275917, + "balance_loss_clip": 0.06286413, + "balance_loss_mlp": 0.01261684, + "epoch": 0.36963775740267546, + "flos": 20813892128640.0, + "grad_norm": 1.8090658573318705, + "language_loss": 0.83484954, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14227295, + "step": 6148, + "time_per_iteration": 2.6318178176879883 + }, + { + "auxiliary_loss_clip": 0.06458767, + "auxiliary_loss_mlp": 0.01266964, + "balance_loss_clip": 0.06284354, + "balance_loss_mlp": 0.01252814, + "epoch": 0.3696978806553435, + "flos": 26068087526400.0, + "grad_norm": 2.191330684134815, + "language_loss": 0.74277508, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.82003242, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14147949, + "step": 6149, + "time_per_iteration": 2.6037940979003906 + }, + { + "auxiliary_loss_clip": 0.06464496, + "auxiliary_loss_mlp": 0.01271867, + "balance_loss_clip": 0.06287233, + "balance_loss_mlp": 0.01256203, + "epoch": 0.36975800390801145, + "flos": 26841237949440.0, + "grad_norm": 2.856714094904378, + "language_loss": 0.71066409, + "learning_rate": 2.906436451364054e-06, + "loss": 0.78802776, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15667725, + "step": 6150, + "time_per_iteration": 2.612860918045044 + }, + { + "auxiliary_loss_clip": 0.06457143, + "auxiliary_loss_mlp": 0.01270306, + "balance_loss_clip": 0.06283612, + "balance_loss_mlp": 0.01256341, + "epoch": 0.3698181271606794, + "flos": 21149063660160.0, + "grad_norm": 1.8423166255946122, + "language_loss": 0.81970799, + "learning_rate": 2.906089268194611e-06, + "loss": 0.89698249, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1395874, + "step": 6151, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.0635625, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06277541, + "balance_loss_mlp": 0.01262752, + "epoch": 0.3698782504133474, + "flos": 66761605958400.0, + "grad_norm": 0.7660918799950965, + "language_loss": 0.63089043, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.70711315, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.03274536, + "step": 6152, + "time_per_iteration": 3.27481746673584 + }, + { + "auxiliary_loss_clip": 0.06456928, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289765, + "balance_loss_mlp": 0.01256709, + "epoch": 0.36993837366601534, + "flos": 24317597479680.0, + "grad_norm": 2.4460843976292455, + "language_loss": 0.7067228, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.78398836, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.12921143, + "step": 6153, + "time_per_iteration": 2.561366319656372 + }, + { + "auxiliary_loss_clip": 0.06461591, + "auxiliary_loss_mlp": 0.01272426, + "balance_loss_clip": 0.06285959, + "balance_loss_mlp": 0.0125796, + "epoch": 0.3699984969186833, + "flos": 24355472325120.0, + "grad_norm": 1.7390512131477307, + "language_loss": 0.72820848, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.80554867, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14459229, + "step": 6154, + "time_per_iteration": 2.6359784603118896 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06290819, + "balance_loss_mlp": 0.01256468, + "epoch": 0.37005862017135127, + "flos": 19835378547840.0, + "grad_norm": 1.7720975153034155, + "language_loss": 0.68251342, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.75985944, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1361084, + "step": 6155, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.06462097, + "auxiliary_loss_mlp": 0.01275284, + "balance_loss_clip": 0.06290478, + "balance_loss_mlp": 0.01261551, + "epoch": 0.37011874342401924, + "flos": 19579981703040.0, + "grad_norm": 1.763175663447542, + "language_loss": 0.68228447, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.75965828, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13745117, + "step": 6156, + "time_per_iteration": 2.5805797576904297 + }, + { + "auxiliary_loss_clip": 0.06460856, + "auxiliary_loss_mlp": 0.01276122, + "balance_loss_clip": 0.06292138, + "balance_loss_mlp": 0.01263051, + "epoch": 0.3701788666766872, + "flos": 20380315576320.0, + "grad_norm": 2.4756712581972673, + "language_loss": 0.82280111, + "learning_rate": 2.904005448099916e-06, + "loss": 0.9001708, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13061523, + "step": 6157, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.06472905, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 0.06294029, + "balance_loss_mlp": 0.0126136, + "epoch": 0.37023898992935517, + "flos": 15346325508480.0, + "grad_norm": 2.1879647979069055, + "language_loss": 0.77007514, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.15142822, + "step": 6158, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.06472066, + "auxiliary_loss_mlp": 0.01273585, + "balance_loss_clip": 0.0629342, + "balance_loss_mlp": 0.0125872, + "epoch": 0.37029911318202313, + "flos": 19580149411200.0, + "grad_norm": 1.9796058392103062, + "language_loss": 0.68833315, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.76578963, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14880371, + "step": 6159, + "time_per_iteration": 2.4941582679748535 + }, + { + "auxiliary_loss_clip": 0.06464109, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 0.06292266, + "balance_loss_mlp": 0.01261986, + "epoch": 0.3703592364346911, + "flos": 26220509303040.0, + "grad_norm": 1.9367461088396363, + "language_loss": 0.71322787, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.79061961, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13079834, + "step": 6160, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.06465742, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06295532, + "balance_loss_mlp": 0.0125958, + "epoch": 0.37041935968735906, + "flos": 20054619555840.0, + "grad_norm": 1.6534007301448785, + "language_loss": 0.78978807, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86717302, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1317749, + "step": 6161, + "time_per_iteration": 2.5337588787078857 + }, + { + "auxiliary_loss_clip": 0.06465232, + "auxiliary_loss_mlp": 0.01270423, + "balance_loss_clip": 0.06291839, + "balance_loss_mlp": 0.01255837, + "epoch": 0.3704794829400271, + "flos": 24140633656320.0, + "grad_norm": 1.7631614273732186, + "language_loss": 0.79746109, + "learning_rate": 2.902267988534295e-06, + "loss": 0.87481761, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14587402, + "step": 6162, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.06466715, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 0.06292939, + "balance_loss_mlp": 0.01260717, + "epoch": 0.37053960619269505, + "flos": 14872232707200.0, + "grad_norm": 1.8866019587111915, + "language_loss": 0.80318987, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.88060015, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13580322, + "step": 6163, + "time_per_iteration": 2.501971483230591 + }, + { + "auxiliary_loss_clip": 0.06466764, + "auxiliary_loss_mlp": 0.01273928, + "balance_loss_clip": 0.0629348, + "balance_loss_mlp": 0.01260315, + "epoch": 0.370599729445363, + "flos": 21367969251840.0, + "grad_norm": 1.81392406825425, + "language_loss": 0.68857837, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.76598537, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13598633, + "step": 6164, + "time_per_iteration": 2.557870388031006 + }, + { + "auxiliary_loss_clip": 0.06463528, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 0.06290606, + "balance_loss_mlp": 0.0126064, + "epoch": 0.370659852698031, + "flos": 26835535872000.0, + "grad_norm": 2.3609289004256984, + "language_loss": 0.83364576, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.91103643, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14916992, + "step": 6165, + "time_per_iteration": 2.5597267150878906 + }, + { + "auxiliary_loss_clip": 0.06475651, + "auxiliary_loss_mlp": 0.01276631, + "balance_loss_clip": 0.06294797, + "balance_loss_mlp": 0.01261086, + "epoch": 0.37071997595069894, + "flos": 19105050360960.0, + "grad_norm": 1.8212520052796557, + "language_loss": 0.69703627, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.77455908, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15551758, + "step": 6166, + "time_per_iteration": 2.7443737983703613 + }, + { + "auxiliary_loss_clip": 0.06351966, + "auxiliary_loss_mlp": 0.01259396, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01255936, + "epoch": 0.3707800992033669, + "flos": 52193839461120.0, + "grad_norm": 0.7767712005900987, + "language_loss": 0.55992532, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.6360389, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.03469849, + "step": 6167, + "time_per_iteration": 3.122786045074463 + }, + { + "auxiliary_loss_clip": 0.06470326, + "auxiliary_loss_mlp": 0.01270542, + "balance_loss_clip": 0.06298738, + "balance_loss_mlp": 0.01256553, + "epoch": 0.3708402224560349, + "flos": 19908025637760.0, + "grad_norm": 1.887650816435161, + "language_loss": 0.75851792, + "learning_rate": 2.900181908135584e-06, + "loss": 0.83592659, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13983154, + "step": 6168, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.06462339, + "auxiliary_loss_mlp": 0.01269774, + "balance_loss_clip": 0.0628986, + "balance_loss_mlp": 0.01255833, + "epoch": 0.37090034570870284, + "flos": 20013222839040.0, + "grad_norm": 1.688087532093935, + "language_loss": 0.74697542, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82429659, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13946533, + "step": 6169, + "time_per_iteration": 2.571059226989746 + }, + { + "auxiliary_loss_clip": 0.06462043, + "auxiliary_loss_mlp": 0.01269285, + "balance_loss_clip": 0.06291892, + "balance_loss_mlp": 0.0125526, + "epoch": 0.3709604689613708, + "flos": 24141681832320.0, + "grad_norm": 1.6120375976718775, + "language_loss": 0.79462636, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87193966, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14007568, + "step": 6170, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.06461793, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06289523, + "balance_loss_mlp": 0.01254183, + "epoch": 0.37102059221403877, + "flos": 23882469626880.0, + "grad_norm": 1.7170622011660002, + "language_loss": 0.76363444, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.84094131, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14703369, + "step": 6171, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.06464403, + "auxiliary_loss_mlp": 0.01269741, + "balance_loss_clip": 0.06292279, + "balance_loss_mlp": 0.0125568, + "epoch": 0.37108071546670673, + "flos": 14506439708160.0, + "grad_norm": 2.2434941236901222, + "language_loss": 0.80974334, + "learning_rate": 2.898790504994232e-06, + "loss": 0.88708472, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.140625, + "step": 6172, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.06468061, + "auxiliary_loss_mlp": 0.01272991, + "balance_loss_clip": 0.06291698, + "balance_loss_mlp": 0.01258352, + "epoch": 0.3711408387193747, + "flos": 34570172160000.0, + "grad_norm": 1.701200983183655, + "language_loss": 0.59536189, + "learning_rate": 2.89844256897035e-06, + "loss": 0.67277241, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14648438, + "step": 6173, + "time_per_iteration": 2.68860125541687 + }, + { + "auxiliary_loss_clip": 0.06465948, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06291407, + "balance_loss_mlp": 0.01252825, + "epoch": 0.37120096197204266, + "flos": 17316350052480.0, + "grad_norm": 3.482738270256764, + "language_loss": 0.81161231, + "learning_rate": 2.898094598877435e-06, + "loss": 0.88894391, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1439209, + "step": 6174, + "time_per_iteration": 2.498631238937378 + }, + { + "auxiliary_loss_clip": 0.06459825, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06290745, + "balance_loss_mlp": 0.01253826, + "epoch": 0.37126108522471063, + "flos": 30671855205120.0, + "grad_norm": 1.7762050826086826, + "language_loss": 0.79733562, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.87460476, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13275146, + "step": 6175, + "time_per_iteration": 2.6155989170074463 + }, + { + "auxiliary_loss_clip": 0.06469794, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06296568, + "balance_loss_mlp": 0.01253926, + "epoch": 0.37132120847737865, + "flos": 25162682232960.0, + "grad_norm": 2.183025760433602, + "language_loss": 0.8886646, + "learning_rate": 2.89739855653729e-06, + "loss": 0.96605068, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14880371, + "step": 6176, + "time_per_iteration": 3.9855380058288574 + }, + { + "auxiliary_loss_clip": 0.06463525, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.0629091, + "balance_loss_mlp": 0.01252331, + "epoch": 0.3713813317300466, + "flos": 21219572471040.0, + "grad_norm": 1.8377156327305517, + "language_loss": 0.73693877, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.8142367, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13952637, + "step": 6177, + "time_per_iteration": 2.584007501602173 + }, + { + "auxiliary_loss_clip": 0.06460603, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06288581, + "balance_loss_mlp": 0.01256722, + "epoch": 0.3714414549827146, + "flos": 21623114534400.0, + "grad_norm": 3.348536242845292, + "language_loss": 0.75657964, + "learning_rate": 2.896702378079374e-06, + "loss": 0.83389515, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14227295, + "step": 6178, + "time_per_iteration": 4.047810077667236 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01253796, + "epoch": 0.37150157823538255, + "flos": 19978073251200.0, + "grad_norm": 1.677068577007521, + "language_loss": 0.7243154, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14465332, + "step": 6179, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.06464912, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06289817, + "balance_loss_mlp": 0.01254506, + "epoch": 0.3715617014880505, + "flos": 24867020701440.0, + "grad_norm": 1.5744290711880986, + "language_loss": 0.70164317, + "learning_rate": 2.896006063609283e-06, + "loss": 0.77898097, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14361572, + "step": 6180, + "time_per_iteration": 2.564251661300659 + }, + { + "auxiliary_loss_clip": 0.06459807, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.0628929, + "balance_loss_mlp": 0.01255173, + "epoch": 0.3716218247407185, + "flos": 20455352507520.0, + "grad_norm": 1.6669585833251956, + "language_loss": 0.78357702, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.86087286, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6181, + "time_per_iteration": 2.5857934951782227 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 0.06286183, + "balance_loss_mlp": 0.01254195, + "epoch": 0.37168194799338644, + "flos": 24140256312960.0, + "grad_norm": 1.7806049549646892, + "language_loss": 0.78926349, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.86651719, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14520264, + "step": 6182, + "time_per_iteration": 2.572563409805298 + }, + { + "auxiliary_loss_clip": 0.0637676, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 0.06297279, + "balance_loss_mlp": 0.01253508, + "epoch": 0.3717420712460544, + "flos": 67429601107200.0, + "grad_norm": 0.7782169453066291, + "language_loss": 0.57265592, + "learning_rate": 2.894961337112362e-06, + "loss": 0.64899027, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.03170776, + "step": 6183, + "time_per_iteration": 4.616533279418945 + }, + { + "auxiliary_loss_clip": 0.06460768, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01258059, + "epoch": 0.37180219449872237, + "flos": 22382512888320.0, + "grad_norm": 2.288371354177028, + "language_loss": 0.77116179, + "learning_rate": 2.894613027055066e-06, + "loss": 0.84849966, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.1496582, + "step": 6184, + "time_per_iteration": 2.5182292461395264 + }, + { + "auxiliary_loss_clip": 0.06457444, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.0628842, + "balance_loss_mlp": 0.01255739, + "epoch": 0.37186231775139034, + "flos": 21876037683840.0, + "grad_norm": 2.2342830987852023, + "language_loss": 0.72608167, + "learning_rate": 2.894264683073954e-06, + "loss": 0.80335367, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14007568, + "step": 6185, + "time_per_iteration": 3.928272247314453 + }, + { + "auxiliary_loss_clip": 0.06453837, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06286646, + "balance_loss_mlp": 0.01253075, + "epoch": 0.3719224410040583, + "flos": 22421142420480.0, + "grad_norm": 1.6056881027286982, + "language_loss": 0.77329034, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85050094, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14160156, + "step": 6186, + "time_per_iteration": 2.549499988555908 + }, + { + "auxiliary_loss_clip": 0.0646092, + "auxiliary_loss_mlp": 0.01274226, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01258121, + "epoch": 0.37198256425672627, + "flos": 25157525207040.0, + "grad_norm": 1.8763954627941488, + "language_loss": 0.84227252, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.91962403, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.16101074, + "step": 6187, + "time_per_iteration": 2.542978048324585 + }, + { + "auxiliary_loss_clip": 0.06456143, + "auxiliary_loss_mlp": 0.01269651, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255919, + "epoch": 0.37204268750939423, + "flos": 21144032415360.0, + "grad_norm": 2.100791898470326, + "language_loss": 0.84696567, + "learning_rate": 2.893219447719824e-06, + "loss": 0.9242236, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13726807, + "step": 6188, + "time_per_iteration": 2.626126766204834 + }, + { + "auxiliary_loss_clip": 0.06458837, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06288396, + "balance_loss_mlp": 0.01256232, + "epoch": 0.37210281076206225, + "flos": 21513221504640.0, + "grad_norm": 2.2586863759616564, + "language_loss": 0.66390121, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.74118853, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13653564, + "step": 6189, + "time_per_iteration": 2.5793135166168213 + }, + { + "auxiliary_loss_clip": 0.06460261, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.0628726, + "balance_loss_mlp": 0.01255926, + "epoch": 0.3721629340147302, + "flos": 17353595992320.0, + "grad_norm": 2.971940637043147, + "language_loss": 0.84218514, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.91950166, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15466309, + "step": 6190, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.06464738, + "auxiliary_loss_mlp": 0.01270544, + "balance_loss_clip": 0.06287063, + "balance_loss_mlp": 0.01255905, + "epoch": 0.3722230572673982, + "flos": 16437457376640.0, + "grad_norm": 2.7368484374177076, + "language_loss": 0.89274895, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.97010183, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.14648438, + "step": 6191, + "time_per_iteration": 2.4786319732666016 + }, + { + "auxiliary_loss_clip": 0.06463645, + "auxiliary_loss_mlp": 0.01271285, + "balance_loss_clip": 0.06286322, + "balance_loss_mlp": 0.01254465, + "epoch": 0.37228318052006615, + "flos": 22681360874880.0, + "grad_norm": 2.1321020045013577, + "language_loss": 0.74374199, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82109123, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.16790771, + "step": 6192, + "time_per_iteration": 2.6107547283172607 + }, + { + "auxiliary_loss_clip": 0.06461145, + "auxiliary_loss_mlp": 0.01269074, + "balance_loss_clip": 0.06288278, + "balance_loss_mlp": 0.0125493, + "epoch": 0.3723433037727341, + "flos": 25272617189760.0, + "grad_norm": 2.3785606336548124, + "language_loss": 0.79934001, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.87664223, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14154053, + "step": 6193, + "time_per_iteration": 2.5584514141082764 + }, + { + "auxiliary_loss_clip": 0.06469596, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 0.06293128, + "balance_loss_mlp": 0.01255594, + "epoch": 0.3724034270254021, + "flos": 10529228534400.0, + "grad_norm": 1.7620775512614164, + "language_loss": 0.84889179, + "learning_rate": 2.891128062852194e-06, + "loss": 0.92628884, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14526367, + "step": 6194, + "time_per_iteration": 2.5419061183929443 + }, + { + "auxiliary_loss_clip": 0.06460975, + "auxiliary_loss_mlp": 0.01266847, + "balance_loss_clip": 0.06288271, + "balance_loss_mlp": 0.01253317, + "epoch": 0.37246355027807004, + "flos": 20272393117440.0, + "grad_norm": 2.226391461709797, + "language_loss": 0.78030515, + "learning_rate": 2.890779380359646e-06, + "loss": 0.85758334, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13543701, + "step": 6195, + "time_per_iteration": 2.51361346244812 + }, + { + "auxiliary_loss_clip": 0.06459115, + "auxiliary_loss_mlp": 0.01274112, + "balance_loss_clip": 0.06288831, + "balance_loss_mlp": 0.01258955, + "epoch": 0.372523673530738, + "flos": 19506705707520.0, + "grad_norm": 1.8216220923823887, + "language_loss": 0.79924363, + "learning_rate": 2.890430664088655e-06, + "loss": 0.87657595, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15155029, + "step": 6196, + "time_per_iteration": 2.6005568504333496 + }, + { + "auxiliary_loss_clip": 0.06458211, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01256888, + "epoch": 0.372583796783406, + "flos": 16769945577600.0, + "grad_norm": 2.2795878215352396, + "language_loss": 0.84059894, + "learning_rate": 2.890081914052443e-06, + "loss": 0.91788948, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13952637, + "step": 6197, + "time_per_iteration": 2.538058042526245 + }, + { + "auxiliary_loss_clip": 0.06456813, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06289704, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37264392003607394, + "flos": 22644576132480.0, + "grad_norm": 1.7143100919816474, + "language_loss": 0.64964151, + "learning_rate": 2.889733130264237e-06, + "loss": 0.72691035, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14971924, + "step": 6198, + "time_per_iteration": 2.5891072750091553 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 0.0628581, + "balance_loss_mlp": 0.01258367, + "epoch": 0.3727040432887419, + "flos": 19979037573120.0, + "grad_norm": 1.4303592099178044, + "language_loss": 0.74534631, + "learning_rate": 2.889384312737261e-06, + "loss": 0.82261217, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13977051, + "step": 6199, + "time_per_iteration": 2.5612289905548096 + }, + { + "auxiliary_loss_clip": 0.06453978, + "auxiliary_loss_mlp": 0.01269323, + "balance_loss_clip": 0.06284302, + "balance_loss_mlp": 0.01255095, + "epoch": 0.37276416654140987, + "flos": 63911906853120.0, + "grad_norm": 1.6001689252403943, + "language_loss": 0.81250614, + "learning_rate": 2.889035461484742e-06, + "loss": 0.88973916, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14227295, + "step": 6200, + "time_per_iteration": 2.9802377223968506 + }, + { + "auxiliary_loss_clip": 0.06452343, + "auxiliary_loss_mlp": 0.01273173, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.0125907, + "epoch": 0.37282428979407783, + "flos": 39795381244800.0, + "grad_norm": 2.0282879733455776, + "language_loss": 0.61128068, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68853581, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.6786048412323 + }, + { + "auxiliary_loss_clip": 0.06460309, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.06284842, + "balance_loss_mlp": 0.01257166, + "epoch": 0.37288441304674586, + "flos": 22715336505600.0, + "grad_norm": 1.562126243298772, + "language_loss": 0.73424393, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.81156611, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14746094, + "step": 6202, + "time_per_iteration": 2.5774593353271484 + }, + { + "auxiliary_loss_clip": 0.06450565, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01253697, + "epoch": 0.3729445362994138, + "flos": 18776209812480.0, + "grad_norm": 3.8476229642649895, + "language_loss": 0.73690808, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.81410116, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1505127, + "step": 6203, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.06448745, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01253402, + "epoch": 0.3730046595520818, + "flos": 22462874553600.0, + "grad_norm": 1.6222639611717555, + "language_loss": 0.82113981, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.89829516, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13391113, + "step": 6204, + "time_per_iteration": 2.5474419593811035 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01267649, + "balance_loss_clip": 0.06282973, + "balance_loss_mlp": 0.01253094, + "epoch": 0.37306478280474975, + "flos": 24323257630080.0, + "grad_norm": 1.5013454609640156, + "language_loss": 0.75699729, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14562988, + "step": 6205, + "time_per_iteration": 2.5284838676452637 + }, + { + "auxiliary_loss_clip": 0.06453846, + "auxiliary_loss_mlp": 0.01269403, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254538, + "epoch": 0.3731249060574177, + "flos": 15820627944960.0, + "grad_norm": 2.409990557003708, + "language_loss": 0.78042793, + "learning_rate": 2.886941646474128e-06, + "loss": 0.85766041, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14868164, + "step": 6206, + "time_per_iteration": 2.5130996704101562 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.06284125, + "balance_loss_mlp": 0.01253085, + "epoch": 0.3731850293100857, + "flos": 19834120736640.0, + "grad_norm": 3.8358433201526334, + "language_loss": 0.93966329, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01690984, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15734863, + "step": 6207, + "time_per_iteration": 2.4994020462036133 + }, + { + "auxiliary_loss_clip": 0.06459471, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06283936, + "balance_loss_mlp": 0.01254561, + "epoch": 0.37324515256275365, + "flos": 19068349472640.0, + "grad_norm": 2.1400449567396826, + "language_loss": 0.82643408, + "learning_rate": 2.886243438932759e-06, + "loss": 0.90372002, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14575195, + "step": 6208, + "time_per_iteration": 2.5359628200531006 + }, + { + "auxiliary_loss_clip": 0.06460227, + "auxiliary_loss_mlp": 0.01272188, + "balance_loss_clip": 0.06285752, + "balance_loss_mlp": 0.01255904, + "epoch": 0.3733052758154216, + "flos": 20710623571200.0, + "grad_norm": 2.148305950788212, + "language_loss": 0.73528939, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.81261349, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1628418, + "step": 6209, + "time_per_iteration": 2.499209403991699 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01273959, + "balance_loss_clip": 0.06285547, + "balance_loss_mlp": 0.01258593, + "epoch": 0.3733653990680896, + "flos": 20199704100480.0, + "grad_norm": 2.014449395888949, + "language_loss": 0.71212471, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.78942245, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.15368652, + "step": 6210, + "time_per_iteration": 2.5324270725250244 + }, + { + "auxiliary_loss_clip": 0.06468424, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.06295058, + "balance_loss_mlp": 0.01253631, + "epoch": 0.37342552232075754, + "flos": 20345920675200.0, + "grad_norm": 1.543701660359285, + "language_loss": 0.7823801, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.85975003, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.1494751, + "step": 6211, + "time_per_iteration": 2.5388078689575195 + }, + { + "auxiliary_loss_clip": 0.06464606, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06291494, + "balance_loss_mlp": 0.0125347, + "epoch": 0.3734856455734255, + "flos": 35526701243520.0, + "grad_norm": 1.6765525733287814, + "language_loss": 0.73612988, + "learning_rate": 2.884846620678668e-06, + "loss": 0.81346345, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15283203, + "step": 6212, + "time_per_iteration": 2.663950204849243 + }, + { + "auxiliary_loss_clip": 0.06477734, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06294222, + "balance_loss_mlp": 0.01256345, + "epoch": 0.37354576882609347, + "flos": 21148686316800.0, + "grad_norm": 1.865900947954382, + "language_loss": 0.82430422, + "learning_rate": 2.884497332198356e-06, + "loss": 0.90180945, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.16455078, + "step": 6213, + "time_per_iteration": 2.541431427001953 + }, + { + "auxiliary_loss_clip": 0.06467836, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06295606, + "balance_loss_mlp": 0.01255623, + "epoch": 0.37360589207876144, + "flos": 21513179577600.0, + "grad_norm": 2.345206885791162, + "language_loss": 0.7896657, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.86705506, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15466309, + "step": 6214, + "time_per_iteration": 2.545792579650879 + }, + { + "auxiliary_loss_clip": 0.06466322, + "auxiliary_loss_mlp": 0.01270745, + "balance_loss_clip": 0.06297071, + "balance_loss_mlp": 0.01255981, + "epoch": 0.37366601533142946, + "flos": 38444953317120.0, + "grad_norm": 1.6116656191599898, + "language_loss": 0.85112274, + "learning_rate": 2.883798654630296e-06, + "loss": 0.92849338, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14758301, + "step": 6215, + "time_per_iteration": 2.70700740814209 + }, + { + "auxiliary_loss_clip": 0.06472297, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06296762, + "balance_loss_mlp": 0.01254044, + "epoch": 0.3737261385840974, + "flos": 18446908066560.0, + "grad_norm": 1.6510257786225762, + "language_loss": 0.6833967, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.76082057, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.16040039, + "step": 6216, + "time_per_iteration": 3.941821575164795 + }, + { + "auxiliary_loss_clip": 0.06466141, + "auxiliary_loss_mlp": 0.01276294, + "balance_loss_clip": 0.06293347, + "balance_loss_mlp": 0.01260224, + "epoch": 0.3737862618367654, + "flos": 22936506157440.0, + "grad_norm": 2.1208446300989983, + "language_loss": 0.6621505, + "learning_rate": 2.883099843007303e-06, + "loss": 0.73957485, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.1607666, + "step": 6217, + "time_per_iteration": 4.067852258682251 + }, + { + "auxiliary_loss_clip": 0.06468368, + "auxiliary_loss_mlp": 0.01272371, + "balance_loss_clip": 0.06294458, + "balance_loss_mlp": 0.0125772, + "epoch": 0.37384638508943335, + "flos": 15414360624000.0, + "grad_norm": 1.5564133784357135, + "language_loss": 0.80760753, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.1463623, + "step": 6218, + "time_per_iteration": 2.5253372192382812 + }, + { + "auxiliary_loss_clip": 0.06465785, + "auxiliary_loss_mlp": 0.01272039, + "balance_loss_clip": 0.06298652, + "balance_loss_mlp": 0.01256661, + "epoch": 0.3739065083421013, + "flos": 24287856480000.0, + "grad_norm": 2.4835018506755566, + "language_loss": 0.79185957, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86923778, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.15380859, + "step": 6219, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.06464131, + "auxiliary_loss_mlp": 0.01274727, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260177, + "epoch": 0.3739666315947693, + "flos": 23009488663680.0, + "grad_norm": 2.098390778414135, + "language_loss": 0.77614415, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.85353279, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14538574, + "step": 6220, + "time_per_iteration": 2.5899298191070557 + }, + { + "auxiliary_loss_clip": 0.06466513, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06292208, + "balance_loss_mlp": 0.0125541, + "epoch": 0.37402675484743725, + "flos": 19397231948160.0, + "grad_norm": 1.5821121915867322, + "language_loss": 0.83564717, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91301888, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15240479, + "step": 6221, + "time_per_iteration": 2.540102481842041 + }, + { + "auxiliary_loss_clip": 0.06464627, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06293692, + "balance_loss_mlp": 0.01262647, + "epoch": 0.3740868781001052, + "flos": 17131420091520.0, + "grad_norm": 1.6401420513761291, + "language_loss": 0.76738596, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84480345, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14477539, + "step": 6222, + "time_per_iteration": 4.020254850387573 + }, + { + "auxiliary_loss_clip": 0.06466988, + "auxiliary_loss_mlp": 0.01277801, + "balance_loss_clip": 0.06296736, + "balance_loss_mlp": 0.01263467, + "epoch": 0.3741470013527732, + "flos": 20049001332480.0, + "grad_norm": 1.799306271558528, + "language_loss": 0.70768011, + "learning_rate": 2.881002604868789e-06, + "loss": 0.785128, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14349365, + "step": 6223, + "time_per_iteration": 2.6146726608276367 + }, + { + "auxiliary_loss_clip": 0.0646846, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06299432, + "balance_loss_mlp": 0.01258954, + "epoch": 0.37420712460544114, + "flos": 36905151162240.0, + "grad_norm": 1.9191598081110601, + "language_loss": 0.69292819, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77033412, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1317749, + "step": 6224, + "time_per_iteration": 4.144296407699585 + }, + { + "auxiliary_loss_clip": 0.06463895, + "auxiliary_loss_mlp": 0.01274949, + "balance_loss_clip": 0.06296779, + "balance_loss_mlp": 0.01260126, + "epoch": 0.3742672478581091, + "flos": 22207896979200.0, + "grad_norm": 1.811742579086715, + "language_loss": 0.70166373, + "learning_rate": 2.880303258086228e-06, + "loss": 0.77905214, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14819336, + "step": 6225, + "time_per_iteration": 2.562023162841797 + }, + { + "auxiliary_loss_clip": 0.06462345, + "auxiliary_loss_mlp": 0.0127698, + "balance_loss_clip": 0.06296264, + "balance_loss_mlp": 0.01262257, + "epoch": 0.3743273711107771, + "flos": 24688547504640.0, + "grad_norm": 2.0306145345851614, + "language_loss": 0.79386592, + "learning_rate": 2.879953534616536e-06, + "loss": 0.87125921, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14715576, + "step": 6226, + "time_per_iteration": 2.5372707843780518 + }, + { + "auxiliary_loss_clip": 0.06464548, + "auxiliary_loss_mlp": 0.01273743, + "balance_loss_clip": 0.0629389, + "balance_loss_mlp": 0.01259021, + "epoch": 0.37438749436344504, + "flos": 24466078114560.0, + "grad_norm": 1.6346435650910545, + "language_loss": 0.68240035, + "learning_rate": 2.879603777778917e-06, + "loss": 0.75978327, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14733887, + "step": 6227, + "time_per_iteration": 2.5752079486846924 + }, + { + "auxiliary_loss_clip": 0.06464467, + "auxiliary_loss_mlp": 0.01270066, + "balance_loss_clip": 0.06297411, + "balance_loss_mlp": 0.0125588, + "epoch": 0.374447617616113, + "flos": 21805193456640.0, + "grad_norm": 1.6298548281431393, + "language_loss": 0.83520573, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91255105, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14190674, + "step": 6228, + "time_per_iteration": 2.605607748031616 + }, + { + "auxiliary_loss_clip": 0.06458256, + "auxiliary_loss_mlp": 0.01270458, + "balance_loss_clip": 0.06288552, + "balance_loss_mlp": 0.01256033, + "epoch": 0.374507740868781, + "flos": 17974073076480.0, + "grad_norm": 1.5343038876343353, + "language_loss": 0.75450277, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83178985, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14428711, + "step": 6229, + "time_per_iteration": 2.607506036758423 + }, + { + "auxiliary_loss_clip": 0.06464534, + "auxiliary_loss_mlp": 0.012714, + "balance_loss_clip": 0.06293011, + "balance_loss_mlp": 0.01256249, + "epoch": 0.374567864121449, + "flos": 16111132450560.0, + "grad_norm": 3.0205318355467083, + "language_loss": 0.84065855, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.91801792, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15142822, + "step": 6230, + "time_per_iteration": 2.4964523315429688 + }, + { + "auxiliary_loss_clip": 0.06463904, + "auxiliary_loss_mlp": 0.01275239, + "balance_loss_clip": 0.06291893, + "balance_loss_mlp": 0.01259569, + "epoch": 0.37462798737411696, + "flos": 25779847080960.0, + "grad_norm": 1.7178487844900587, + "language_loss": 0.73793018, + "learning_rate": 2.878204417014456e-06, + "loss": 0.81532168, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15667725, + "step": 6231, + "time_per_iteration": 2.589771270751953 + }, + { + "auxiliary_loss_clip": 0.06465879, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06291361, + "balance_loss_mlp": 0.01255298, + "epoch": 0.3746881106267849, + "flos": 16660136401920.0, + "grad_norm": 1.8762806294571872, + "language_loss": 0.74086344, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.81822443, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14929199, + "step": 6232, + "time_per_iteration": 2.483219861984253 + }, + { + "auxiliary_loss_clip": 0.06463014, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06290261, + "balance_loss_mlp": 0.0125605, + "epoch": 0.3747482338794529, + "flos": 26185317788160.0, + "grad_norm": 1.743409558247901, + "language_loss": 0.77404612, + "learning_rate": 2.877504536769561e-06, + "loss": 0.85138428, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14758301, + "step": 6233, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.06463634, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06292734, + "balance_loss_mlp": 0.01255432, + "epoch": 0.37480835713212085, + "flos": 12025956890880.0, + "grad_norm": 1.7958128584553208, + "language_loss": 0.69650698, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.77383471, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13690186, + "step": 6234, + "time_per_iteration": 2.524226188659668 + }, + { + "auxiliary_loss_clip": 0.06464471, + "auxiliary_loss_mlp": 0.01267248, + "balance_loss_clip": 0.06295948, + "balance_loss_mlp": 0.0125311, + "epoch": 0.3748684803847888, + "flos": 19684801560960.0, + "grad_norm": 2.1537876510353597, + "language_loss": 0.83551729, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91283447, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14135742, + "step": 6235, + "time_per_iteration": 2.5380606651306152 + }, + { + "auxiliary_loss_clip": 0.06462481, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06289958, + "balance_loss_mlp": 0.0125222, + "epoch": 0.3749286036374568, + "flos": 20527328764800.0, + "grad_norm": 1.8434440291752416, + "language_loss": 0.78213942, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8594358, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14941406, + "step": 6236, + "time_per_iteration": 2.507180690765381 + }, + { + "auxiliary_loss_clip": 0.06465082, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 0.06288011, + "balance_loss_mlp": 0.0125616, + "epoch": 0.37498872689012475, + "flos": 20710958987520.0, + "grad_norm": 1.9437086154972172, + "language_loss": 0.73305297, + "learning_rate": 2.876104377085234e-06, + "loss": 0.81043607, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.17077637, + "step": 6237, + "time_per_iteration": 2.5545706748962402 + }, + { + "auxiliary_loss_clip": 0.06460923, + "auxiliary_loss_mlp": 0.01271336, + "balance_loss_clip": 0.0628608, + "balance_loss_mlp": 0.01256548, + "epoch": 0.3750488501427927, + "flos": 21580418079360.0, + "grad_norm": 2.5847168840400787, + "language_loss": 0.93616223, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01348472, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14788818, + "step": 6238, + "time_per_iteration": 2.544524669647217 + }, + { + "auxiliary_loss_clip": 0.06457306, + "auxiliary_loss_mlp": 0.01272243, + "balance_loss_clip": 0.06286643, + "balance_loss_mlp": 0.01257127, + "epoch": 0.3751089733954607, + "flos": 15929221236480.0, + "grad_norm": 2.2437121352489093, + "language_loss": 0.71661341, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79390883, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15106201, + "step": 6239, + "time_per_iteration": 2.519807815551758 + }, + { + "auxiliary_loss_clip": 0.06461261, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 0.06287319, + "balance_loss_mlp": 0.01256485, + "epoch": 0.37516909664812864, + "flos": 36293688391680.0, + "grad_norm": 1.5212724151961043, + "language_loss": 0.65758455, + "learning_rate": 2.875053908444895e-06, + "loss": 0.73491299, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15118408, + "step": 6240, + "time_per_iteration": 2.6838748455047607 + }, + { + "auxiliary_loss_clip": 0.06461462, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06288624, + "balance_loss_mlp": 0.01251258, + "epoch": 0.3752292199007966, + "flos": 13520882384640.0, + "grad_norm": 2.454894337240739, + "language_loss": 0.76209545, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.83936143, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13867188, + "step": 6241, + "time_per_iteration": 2.498286008834839 + }, + { + "auxiliary_loss_clip": 0.06461808, + "auxiliary_loss_mlp": 0.01268507, + "balance_loss_clip": 0.06289176, + "balance_loss_mlp": 0.01253206, + "epoch": 0.3752893431534646, + "flos": 27205353866880.0, + "grad_norm": 2.0832931967812853, + "language_loss": 0.84671998, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92402315, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15313721, + "step": 6242, + "time_per_iteration": 2.6289877891540527 + }, + { + "auxiliary_loss_clip": 0.06457841, + "auxiliary_loss_mlp": 0.01272178, + "balance_loss_clip": 0.06285247, + "balance_loss_mlp": 0.01257379, + "epoch": 0.3753494664061326, + "flos": 30015431919360.0, + "grad_norm": 2.6434313807577112, + "language_loss": 0.68551457, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.76281476, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14813232, + "step": 6243, + "time_per_iteration": 2.7211153507232666 + }, + { + "auxiliary_loss_clip": 0.0645824, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06286814, + "balance_loss_mlp": 0.01254482, + "epoch": 0.37540958965880056, + "flos": 24468803372160.0, + "grad_norm": 1.7478523324296555, + "language_loss": 0.8397631, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.91704839, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15808105, + "step": 6244, + "time_per_iteration": 2.5738887786865234 + }, + { + "auxiliary_loss_clip": 0.0645659, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06290226, + "balance_loss_mlp": 0.01252842, + "epoch": 0.3754697129114685, + "flos": 16513961754240.0, + "grad_norm": 3.8447339818169257, + "language_loss": 0.83823436, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.91546631, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13751221, + "step": 6245, + "time_per_iteration": 2.5320816040039062 + }, + { + "auxiliary_loss_clip": 0.06453504, + "auxiliary_loss_mlp": 0.0127263, + "balance_loss_clip": 0.06282875, + "balance_loss_mlp": 0.01257633, + "epoch": 0.3755298361641365, + "flos": 19396980385920.0, + "grad_norm": 2.4621620681348295, + "language_loss": 0.64685225, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.72411358, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14990234, + "step": 6246, + "time_per_iteration": 2.58577561378479 + }, + { + "auxiliary_loss_clip": 0.06466524, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01262428, + "epoch": 0.37558995941680445, + "flos": 14725638789120.0, + "grad_norm": 2.3474335464279648, + "language_loss": 0.75348055, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.83092844, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.1583252, + "step": 6247, + "time_per_iteration": 2.47930908203125 + }, + { + "auxiliary_loss_clip": 0.06456453, + "auxiliary_loss_mlp": 0.012715, + "balance_loss_clip": 0.06282347, + "balance_loss_mlp": 0.01255503, + "epoch": 0.3756500826694724, + "flos": 21696432456960.0, + "grad_norm": 3.5646784592424017, + "language_loss": 0.55380279, + "learning_rate": 2.872251199697598e-06, + "loss": 0.6310823, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.16003418, + "step": 6248, + "time_per_iteration": 2.5266313552856445 + }, + { + "auxiliary_loss_clip": 0.06453443, + "auxiliary_loss_mlp": 0.01268535, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01253109, + "epoch": 0.3757102059221404, + "flos": 26512942452480.0, + "grad_norm": 1.7302245846967215, + "language_loss": 0.84781861, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92503834, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.15429688, + "step": 6249, + "time_per_iteration": 2.5590078830718994 + }, + { + "auxiliary_loss_clip": 0.06456596, + "auxiliary_loss_mlp": 0.01267858, + "balance_loss_clip": 0.0628508, + "balance_loss_mlp": 0.01253481, + "epoch": 0.37577032917480835, + "flos": 37346526144000.0, + "grad_norm": 1.6299752789251518, + "language_loss": 0.68482721, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.76207179, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14361572, + "step": 6250, + "time_per_iteration": 2.6926450729370117 + }, + { + "auxiliary_loss_clip": 0.06454285, + "auxiliary_loss_mlp": 0.01268088, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01254099, + "epoch": 0.3758304524274763, + "flos": 21915128413440.0, + "grad_norm": 2.0147801854845895, + "language_loss": 0.78550422, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.862728, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13995361, + "step": 6251, + "time_per_iteration": 2.5072193145751953 + }, + { + "auxiliary_loss_clip": 0.06455163, + "auxiliary_loss_mlp": 0.01271265, + "balance_loss_clip": 0.06285167, + "balance_loss_mlp": 0.01257139, + "epoch": 0.3758905756801443, + "flos": 36577233008640.0, + "grad_norm": 2.2428429985343543, + "language_loss": 0.58560276, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.66286701, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14111328, + "step": 6252, + "time_per_iteration": 2.684899091720581 + }, + { + "auxiliary_loss_clip": 0.06456266, + "auxiliary_loss_mlp": 0.01271539, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.01255649, + "epoch": 0.37595069893281224, + "flos": 24534616354560.0, + "grad_norm": 1.5871699178816958, + "language_loss": 0.8998009, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.97707891, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15869141, + "step": 6253, + "time_per_iteration": 2.539088010787964 + }, + { + "auxiliary_loss_clip": 0.0645566, + "auxiliary_loss_mlp": 0.01270173, + "balance_loss_clip": 0.06288448, + "balance_loss_mlp": 0.01255523, + "epoch": 0.3760108221854802, + "flos": 16440518050560.0, + "grad_norm": 2.3821241740713086, + "language_loss": 0.77027023, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.84752858, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.14648438, + "step": 6254, + "time_per_iteration": 2.545330047607422 + }, + { + "auxiliary_loss_clip": 0.06454843, + "auxiliary_loss_mlp": 0.01270718, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01254386, + "epoch": 0.37607094543814823, + "flos": 13776824280960.0, + "grad_norm": 2.2494955117694007, + "language_loss": 0.62504637, + "learning_rate": 2.869797092829169e-06, + "loss": 0.70230198, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.16333008, + "step": 6255, + "time_per_iteration": 3.937791109085083 + }, + { + "auxiliary_loss_clip": 0.06456207, + "auxiliary_loss_mlp": 0.0127009, + "balance_loss_clip": 0.06282066, + "balance_loss_mlp": 0.01253758, + "epoch": 0.3761310686908162, + "flos": 19862855487360.0, + "grad_norm": 2.2501042164391634, + "language_loss": 0.74801397, + "learning_rate": 2.869446374096135e-06, + "loss": 0.82527697, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16345215, + "step": 6256, + "time_per_iteration": 2.52768611907959 + }, + { + "auxiliary_loss_clip": 0.06456085, + "auxiliary_loss_mlp": 0.01270671, + "balance_loss_clip": 0.06281887, + "balance_loss_mlp": 0.01254637, + "epoch": 0.37619119194348416, + "flos": 12755823880320.0, + "grad_norm": 1.8167076240371511, + "language_loss": 0.70818299, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.78545058, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16040039, + "step": 6257, + "time_per_iteration": 4.052328824996948 + }, + { + "auxiliary_loss_clip": 0.06452011, + "auxiliary_loss_mlp": 0.01268418, + "balance_loss_clip": 0.0628053, + "balance_loss_mlp": 0.01253743, + "epoch": 0.3762513151961521, + "flos": 17536387674240.0, + "grad_norm": 1.6926603581335775, + "language_loss": 0.85114312, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92834735, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14672852, + "step": 6258, + "time_per_iteration": 2.50252366065979 + }, + { + "auxiliary_loss_clip": 0.06455131, + "auxiliary_loss_mlp": 0.0127104, + "balance_loss_clip": 0.06282814, + "balance_loss_mlp": 0.01256503, + "epoch": 0.3763114384488201, + "flos": 23623215494400.0, + "grad_norm": 1.3678719492617617, + "language_loss": 0.81156051, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8888222, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14532471, + "step": 6259, + "time_per_iteration": 2.5430314540863037 + }, + { + "auxiliary_loss_clip": 0.06458686, + "auxiliary_loss_mlp": 0.01274293, + "balance_loss_clip": 0.06282908, + "balance_loss_mlp": 0.0125696, + "epoch": 0.37637156170148806, + "flos": 25413383249280.0, + "grad_norm": 1.809326583941318, + "language_loss": 0.71774137, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.79507113, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.17321777, + "step": 6260, + "time_per_iteration": 2.566267490386963 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.0127871, + "balance_loss_clip": 0.06279852, + "balance_loss_mlp": 0.01262128, + "epoch": 0.376431684954156, + "flos": 23447677190400.0, + "grad_norm": 1.8475234283885087, + "language_loss": 0.78925788, + "learning_rate": 2.867692286154594e-06, + "loss": 0.86660182, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.16589355, + "step": 6261, + "time_per_iteration": 2.5848124027252197 + }, + { + "auxiliary_loss_clip": 0.06455033, + "auxiliary_loss_mlp": 0.01273009, + "balance_loss_clip": 0.06278862, + "balance_loss_mlp": 0.01257607, + "epoch": 0.376491808206824, + "flos": 34213099985280.0, + "grad_norm": 2.1653724604475255, + "language_loss": 0.80626601, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88354641, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15405273, + "step": 6262, + "time_per_iteration": 4.146479368209839 + }, + { + "auxiliary_loss_clip": 0.06453078, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 0.06282018, + "balance_loss_mlp": 0.01253799, + "epoch": 0.37655193145949195, + "flos": 35193793772160.0, + "grad_norm": 1.6953841761456194, + "language_loss": 0.81274903, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88996559, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14794922, + "step": 6263, + "time_per_iteration": 2.6529650688171387 + }, + { + "auxiliary_loss_clip": 0.06460523, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 0.06286405, + "balance_loss_mlp": 0.01261172, + "epoch": 0.3766120547121599, + "flos": 16767136465920.0, + "grad_norm": 1.8888627452248796, + "language_loss": 0.79794824, + "learning_rate": 2.866639438447501e-06, + "loss": 0.87531358, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14831543, + "step": 6264, + "time_per_iteration": 3.9715349674224854 + }, + { + "auxiliary_loss_clip": 0.06455237, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06284397, + "balance_loss_mlp": 0.0125396, + "epoch": 0.3766721779648279, + "flos": 23557150949760.0, + "grad_norm": 1.690336708132248, + "language_loss": 0.7363869, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.81363189, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6265, + "time_per_iteration": 2.5544657707214355 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01276088, + "balance_loss_clip": 0.06283864, + "balance_loss_mlp": 0.01262486, + "epoch": 0.37673230121749585, + "flos": 29136329608320.0, + "grad_norm": 1.6256668529315172, + "language_loss": 0.6925773, + "learning_rate": 2.865937375638654e-06, + "loss": 0.76985407, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1361084, + "step": 6266, + "time_per_iteration": 2.5735552310943604 + }, + { + "auxiliary_loss_clip": 0.06456051, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06279004, + "balance_loss_mlp": 0.01258825, + "epoch": 0.3767924244701638, + "flos": 28154210302080.0, + "grad_norm": 2.361518747365002, + "language_loss": 0.63358176, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.7108832, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15270996, + "step": 6267, + "time_per_iteration": 2.6408746242523193 + }, + { + "auxiliary_loss_clip": 0.0637848, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.0630175, + "balance_loss_mlp": 0.01263043, + "epoch": 0.37685254772283183, + "flos": 60815460343680.0, + "grad_norm": 0.7019670976586264, + "language_loss": 0.58932841, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.66576976, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02612305, + "step": 6268, + "time_per_iteration": 3.3041250705718994 + }, + { + "auxiliary_loss_clip": 0.06448595, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277184, + "balance_loss_mlp": 0.01256756, + "epoch": 0.3769126709754998, + "flos": 26039939754240.0, + "grad_norm": 1.4401012750228117, + "language_loss": 0.65166855, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72888005, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15795898, + "step": 6269, + "time_per_iteration": 2.654707670211792 + }, + { + "auxiliary_loss_clip": 0.06454687, + "auxiliary_loss_mlp": 0.01276662, + "balance_loss_clip": 0.06286559, + "balance_loss_mlp": 0.01261296, + "epoch": 0.37697279422816776, + "flos": 23585508357120.0, + "grad_norm": 1.4576669810179597, + "language_loss": 0.71144199, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.78875554, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15362549, + "step": 6270, + "time_per_iteration": 2.5369231700897217 + }, + { + "auxiliary_loss_clip": 0.06374384, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.062971, + "balance_loss_mlp": 0.01266305, + "epoch": 0.3770329174808357, + "flos": 64766242753920.0, + "grad_norm": 0.6950430831807741, + "language_loss": 0.56232381, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.63876635, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03561401, + "step": 6271, + "time_per_iteration": 3.1599924564361572 + }, + { + "auxiliary_loss_clip": 0.06448443, + "auxiliary_loss_mlp": 0.01272708, + "balance_loss_clip": 0.06279441, + "balance_loss_mlp": 0.0125696, + "epoch": 0.3770930407335037, + "flos": 21841768563840.0, + "grad_norm": 1.6801171250404496, + "language_loss": 0.80461442, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.88182592, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1574707, + "step": 6272, + "time_per_iteration": 2.524846076965332 + }, + { + "auxiliary_loss_clip": 0.06450769, + "auxiliary_loss_mlp": 0.01273349, + "balance_loss_clip": 0.06283743, + "balance_loss_mlp": 0.01258329, + "epoch": 0.37715316398617166, + "flos": 22754594943360.0, + "grad_norm": 1.6672783573066894, + "language_loss": 0.74972034, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82696146, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15026855, + "step": 6273, + "time_per_iteration": 2.5571129322052 + }, + { + "auxiliary_loss_clip": 0.06449255, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257148, + "epoch": 0.3772132872388396, + "flos": 18920246181120.0, + "grad_norm": 1.32773283576084, + "language_loss": 0.72241038, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79962015, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14569092, + "step": 6274, + "time_per_iteration": 2.4966516494750977 + }, + { + "auxiliary_loss_clip": 0.06454083, + "auxiliary_loss_mlp": 0.01271444, + "balance_loss_clip": 0.06282286, + "balance_loss_mlp": 0.01257467, + "epoch": 0.3772734104915076, + "flos": 17351709275520.0, + "grad_norm": 1.8983068498635614, + "language_loss": 0.84638643, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.92364168, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13983154, + "step": 6275, + "time_per_iteration": 2.534308910369873 + }, + { + "auxiliary_loss_clip": 0.06448515, + "auxiliary_loss_mlp": 0.01272502, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01258865, + "epoch": 0.37733353374417555, + "flos": 32350452848640.0, + "grad_norm": 1.3669254528099, + "language_loss": 0.75387293, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.83108306, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13641357, + "step": 6276, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.06453335, + "auxiliary_loss_mlp": 0.0127286, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.0125803, + "epoch": 0.3773936569968435, + "flos": 23366225422080.0, + "grad_norm": 1.9054341571687776, + "language_loss": 0.86016738, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93742937, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1484375, + "step": 6277, + "time_per_iteration": 2.6153500080108643 + }, + { + "auxiliary_loss_clip": 0.06448077, + "auxiliary_loss_mlp": 0.01271912, + "balance_loss_clip": 0.0628462, + "balance_loss_mlp": 0.01257488, + "epoch": 0.3774537802495115, + "flos": 21472579474560.0, + "grad_norm": 1.5956300393708251, + "language_loss": 0.78636366, + "learning_rate": 2.861722244253818e-06, + "loss": 0.86356354, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14428711, + "step": 6278, + "time_per_iteration": 2.564234495162964 + }, + { + "auxiliary_loss_clip": 0.06459187, + "auxiliary_loss_mlp": 0.01270608, + "balance_loss_clip": 0.06284142, + "balance_loss_mlp": 0.01255075, + "epoch": 0.37751390350217945, + "flos": 24980812945920.0, + "grad_norm": 1.8067410295121689, + "language_loss": 0.8371948, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.91449273, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.15527344, + "step": 6279, + "time_per_iteration": 2.6134567260742188 + }, + { + "auxiliary_loss_clip": 0.06454675, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.06282948, + "balance_loss_mlp": 0.01257117, + "epoch": 0.3775740267548474, + "flos": 27826585637760.0, + "grad_norm": 1.84994794715845, + "language_loss": 0.74995327, + "learning_rate": 2.861019264262269e-06, + "loss": 0.82721412, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1428833, + "step": 6280, + "time_per_iteration": 2.6029937267303467 + }, + { + "auxiliary_loss_clip": 0.06448464, + "auxiliary_loss_mlp": 0.01272763, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01259156, + "epoch": 0.3776341500075154, + "flos": 22571845188480.0, + "grad_norm": 1.3018494364650444, + "language_loss": 0.76205039, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.83926266, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13592529, + "step": 6281, + "time_per_iteration": 2.524489641189575 + }, + { + "auxiliary_loss_clip": 0.06448536, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.0125718, + "epoch": 0.3776942732601834, + "flos": 23084148251520.0, + "grad_norm": 1.5306913056637732, + "language_loss": 0.84658033, + "learning_rate": 2.860316153670974e-06, + "loss": 0.92377913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14160156, + "step": 6282, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.06449918, + "auxiliary_loss_mlp": 0.01269426, + "balance_loss_clip": 0.06282572, + "balance_loss_mlp": 0.0125555, + "epoch": 0.37775439651285136, + "flos": 21730617722880.0, + "grad_norm": 1.840636786741823, + "language_loss": 0.70143461, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.77862805, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13879395, + "step": 6283, + "time_per_iteration": 2.555816411972046 + }, + { + "auxiliary_loss_clip": 0.06452499, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 0.06285429, + "balance_loss_mlp": 0.01259957, + "epoch": 0.37781451976551933, + "flos": 23994542862720.0, + "grad_norm": 1.743481736886233, + "language_loss": 0.76856482, + "learning_rate": 2.859612912586581e-06, + "loss": 0.8458361, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14648438, + "step": 6284, + "time_per_iteration": 2.560770034790039 + }, + { + "auxiliary_loss_clip": 0.06464045, + "auxiliary_loss_mlp": 0.01271283, + "balance_loss_clip": 0.06286186, + "balance_loss_mlp": 0.01254725, + "epoch": 0.3778746430181873, + "flos": 13731821838720.0, + "grad_norm": 2.746966655353194, + "language_loss": 0.85536617, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.93271947, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.16564941, + "step": 6285, + "time_per_iteration": 2.5006392002105713 + }, + { + "auxiliary_loss_clip": 0.06451872, + "auxiliary_loss_mlp": 0.01271139, + "balance_loss_clip": 0.06279811, + "balance_loss_mlp": 0.01256065, + "epoch": 0.37793476627085526, + "flos": 19466021750400.0, + "grad_norm": 1.7632018529100697, + "language_loss": 0.84913701, + "learning_rate": 2.858909541115758e-06, + "loss": 0.9263671, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1506958, + "step": 6286, + "time_per_iteration": 2.566092014312744 + }, + { + "auxiliary_loss_clip": 0.06452557, + "auxiliary_loss_mlp": 0.01269453, + "balance_loss_clip": 0.06281806, + "balance_loss_mlp": 0.01254182, + "epoch": 0.3779948895235232, + "flos": 10711600945920.0, + "grad_norm": 1.9010574176879877, + "language_loss": 0.823708, + "learning_rate": 2.858557806518775e-06, + "loss": 0.90092808, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15258789, + "step": 6287, + "time_per_iteration": 2.4892444610595703 + }, + { + "auxiliary_loss_clip": 0.06454234, + "auxiliary_loss_mlp": 0.01274095, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01258408, + "epoch": 0.3780550127761912, + "flos": 22316616051840.0, + "grad_norm": 2.1030531862013584, + "language_loss": 0.7330361, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15679932, + "step": 6288, + "time_per_iteration": 2.5415592193603516 + }, + { + "auxiliary_loss_clip": 0.06453485, + "auxiliary_loss_mlp": 0.01269135, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01254359, + "epoch": 0.37811513602885916, + "flos": 28958401463040.0, + "grad_norm": 1.6277535048544236, + "language_loss": 0.75782627, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83505249, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14770508, + "step": 6289, + "time_per_iteration": 2.5579047203063965 + }, + { + "auxiliary_loss_clip": 0.06454412, + "auxiliary_loss_mlp": 0.01273518, + "balance_loss_clip": 0.06284275, + "balance_loss_mlp": 0.01257925, + "epoch": 0.3781752592815271, + "flos": 23119717109760.0, + "grad_norm": 1.945372772068441, + "language_loss": 0.74155736, + "learning_rate": 2.857502407441593e-06, + "loss": 0.81883669, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15588379, + "step": 6290, + "time_per_iteration": 2.5697786808013916 + }, + { + "auxiliary_loss_clip": 0.06458094, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06281058, + "balance_loss_mlp": 0.0125653, + "epoch": 0.3782353825341951, + "flos": 19762102552320.0, + "grad_norm": 2.4066647483264596, + "language_loss": 0.80529308, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.88260764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.16833496, + "step": 6291, + "time_per_iteration": 2.4970998764038086 + }, + { + "auxiliary_loss_clip": 0.06456125, + "auxiliary_loss_mlp": 0.01270776, + "balance_loss_clip": 0.06283687, + "balance_loss_mlp": 0.01254933, + "epoch": 0.37829550578686305, + "flos": 22056774940800.0, + "grad_norm": 1.7419894192909393, + "language_loss": 0.76369846, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.84096742, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1583252, + "step": 6292, + "time_per_iteration": 2.572916030883789 + }, + { + "auxiliary_loss_clip": 0.06452248, + "auxiliary_loss_mlp": 0.0127064, + "balance_loss_clip": 0.06281239, + "balance_loss_mlp": 0.01255631, + "epoch": 0.378355629039531, + "flos": 16475667638400.0, + "grad_norm": 1.682972265329385, + "language_loss": 0.70006013, + "learning_rate": 2.856446715715224e-06, + "loss": 0.77728903, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.15014648, + "step": 6293, + "time_per_iteration": 2.5161240100860596 + }, + { + "auxiliary_loss_clip": 0.06449296, + "auxiliary_loss_mlp": 0.01271246, + "balance_loss_clip": 0.06281447, + "balance_loss_mlp": 0.01255934, + "epoch": 0.378415752292199, + "flos": 19981050071040.0, + "grad_norm": 1.9898859900525039, + "language_loss": 0.7173214, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.79452682, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.15332031, + "step": 6294, + "time_per_iteration": 3.9304022789001465 + }, + { + "auxiliary_loss_clip": 0.06465693, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 0.06285857, + "balance_loss_mlp": 0.01264068, + "epoch": 0.378475875544867, + "flos": 14652614355840.0, + "grad_norm": 2.57033704665896, + "language_loss": 0.83215445, + "learning_rate": 2.855742758826011e-06, + "loss": 0.90960872, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.15655518, + "step": 6295, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.06459963, + "auxiliary_loss_mlp": 0.01268811, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01253255, + "epoch": 0.37853599879753497, + "flos": 26658194705280.0, + "grad_norm": 1.6154959379599871, + "language_loss": 0.71442378, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.79171151, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15563965, + "step": 6296, + "time_per_iteration": 4.0578773021698 + }, + { + "auxiliary_loss_clip": 0.06454356, + "auxiliary_loss_mlp": 0.01274534, + "balance_loss_clip": 0.06287888, + "balance_loss_mlp": 0.01260455, + "epoch": 0.37859612205020293, + "flos": 17317817498880.0, + "grad_norm": 1.7695984237012152, + "language_loss": 0.77514613, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85243499, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14074707, + "step": 6297, + "time_per_iteration": 2.54968523979187 + }, + { + "auxiliary_loss_clip": 0.06462398, + "auxiliary_loss_mlp": 0.01275228, + "balance_loss_clip": 0.0628902, + "balance_loss_mlp": 0.01259481, + "epoch": 0.3786562453028709, + "flos": 18225780341760.0, + "grad_norm": 1.977165612519376, + "language_loss": 0.80132794, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87870419, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1574707, + "step": 6298, + "time_per_iteration": 2.5013349056243896 + }, + { + "auxiliary_loss_clip": 0.06454945, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06285203, + "balance_loss_mlp": 0.01255711, + "epoch": 0.37871636855553886, + "flos": 21221207625600.0, + "grad_norm": 1.480969598733767, + "language_loss": 0.8501091, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92736673, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15087891, + "step": 6299, + "time_per_iteration": 2.5749709606170654 + }, + { + "auxiliary_loss_clip": 0.06460874, + "auxiliary_loss_mlp": 0.01272586, + "balance_loss_clip": 0.06288288, + "balance_loss_mlp": 0.01256844, + "epoch": 0.3787764918082068, + "flos": 20957886570240.0, + "grad_norm": 2.4357425027716895, + "language_loss": 0.77022231, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.84755683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.15740967, + "step": 6300, + "time_per_iteration": 2.521772623062134 + }, + { + "auxiliary_loss_clip": 0.06472084, + "auxiliary_loss_mlp": 0.01275415, + "balance_loss_clip": 0.06293886, + "balance_loss_mlp": 0.01258177, + "epoch": 0.3788366150608748, + "flos": 17313205524480.0, + "grad_norm": 1.8143586204861406, + "language_loss": 0.83141446, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.90888953, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 3.982780933380127 + }, + { + "auxiliary_loss_clip": 0.0646001, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06287184, + "balance_loss_mlp": 0.0125428, + "epoch": 0.37889673831354276, + "flos": 24317094355200.0, + "grad_norm": 1.8203378599779103, + "language_loss": 0.68096328, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.75826812, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.16186523, + "step": 6302, + "time_per_iteration": 2.5983002185821533 + }, + { + "auxiliary_loss_clip": 0.06455475, + "auxiliary_loss_mlp": 0.01270441, + "balance_loss_clip": 0.06284864, + "balance_loss_mlp": 0.01255718, + "epoch": 0.3789568615662107, + "flos": 26690157838080.0, + "grad_norm": 2.521279180058548, + "language_loss": 0.68357861, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76083779, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1472168, + "step": 6303, + "time_per_iteration": 2.5610175132751465 + }, + { + "auxiliary_loss_clip": 0.06458124, + "auxiliary_loss_mlp": 0.01272095, + "balance_loss_clip": 0.06285581, + "balance_loss_mlp": 0.01257265, + "epoch": 0.3790169848188787, + "flos": 23591713559040.0, + "grad_norm": 1.604251878296904, + "language_loss": 0.78095663, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.85825884, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14807129, + "step": 6304, + "time_per_iteration": 3.994072437286377 + }, + { + "auxiliary_loss_clip": 0.06468576, + "auxiliary_loss_mlp": 0.01269708, + "balance_loss_clip": 0.06292479, + "balance_loss_mlp": 0.01253806, + "epoch": 0.37907710807154665, + "flos": 18442547654400.0, + "grad_norm": 1.8924180649319282, + "language_loss": 0.80524492, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.88262779, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15881348, + "step": 6305, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.06370047, + "auxiliary_loss_mlp": 0.01262008, + "balance_loss_clip": 0.06291789, + "balance_loss_mlp": 0.01258527, + "epoch": 0.3791372313242146, + "flos": 50123690887680.0, + "grad_norm": 0.9538902579511545, + "language_loss": 0.64400995, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.72033048, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03491211, + "step": 6306, + "time_per_iteration": 3.106515645980835 + }, + { + "auxiliary_loss_clip": 0.06464424, + "auxiliary_loss_mlp": 0.01273174, + "balance_loss_clip": 0.06292081, + "balance_loss_mlp": 0.01257683, + "epoch": 0.3791973545768826, + "flos": 24323467265280.0, + "grad_norm": 1.5167178412192643, + "language_loss": 0.73534656, + "learning_rate": 2.851516295441817e-06, + "loss": 0.8127225, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15484619, + "step": 6307, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.06462627, + "auxiliary_loss_mlp": 0.01270499, + "balance_loss_clip": 0.06287986, + "balance_loss_mlp": 0.0125505, + "epoch": 0.3792574778295506, + "flos": 21586329792000.0, + "grad_norm": 1.8539993286062635, + "language_loss": 0.78603798, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86336923, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15441895, + "step": 6308, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.06459265, + "auxiliary_loss_mlp": 0.01272841, + "balance_loss_clip": 0.06287025, + "balance_loss_mlp": 0.01257028, + "epoch": 0.37931760108221857, + "flos": 22279202403840.0, + "grad_norm": 4.0253147283534, + "language_loss": 0.73503512, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.81235617, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.15814209, + "step": 6309, + "time_per_iteration": 2.539158344268799 + }, + { + "auxiliary_loss_clip": 0.06457806, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.0125963, + "epoch": 0.37937772433488653, + "flos": 19689161973120.0, + "grad_norm": 1.3654110952225158, + "language_loss": 0.79184294, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.86916614, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14886475, + "step": 6310, + "time_per_iteration": 2.4997847080230713 + }, + { + "auxiliary_loss_clip": 0.06457442, + "auxiliary_loss_mlp": 0.01268809, + "balance_loss_clip": 0.06285986, + "balance_loss_mlp": 0.01253586, + "epoch": 0.3794378475875545, + "flos": 19105469631360.0, + "grad_norm": 1.8573579951480166, + "language_loss": 0.76741791, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.84468043, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15222168, + "step": 6311, + "time_per_iteration": 2.5216546058654785 + }, + { + "auxiliary_loss_clip": 0.06457929, + "auxiliary_loss_mlp": 0.01276784, + "balance_loss_clip": 0.06287444, + "balance_loss_mlp": 0.01261746, + "epoch": 0.37949797084022246, + "flos": 20345920675200.0, + "grad_norm": 1.4012846072012495, + "language_loss": 0.71063423, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78798139, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15032959, + "step": 6312, + "time_per_iteration": 2.4909064769744873 + }, + { + "auxiliary_loss_clip": 0.06361144, + "auxiliary_loss_mlp": 0.01254908, + "balance_loss_clip": 0.06283364, + "balance_loss_mlp": 0.01251185, + "epoch": 0.37955809409289043, + "flos": 63991121760000.0, + "grad_norm": 0.7457914665340521, + "language_loss": 0.55941355, + "learning_rate": 2.849401318669608e-06, + "loss": 0.63557404, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03713989, + "step": 6313, + "time_per_iteration": 3.1312170028686523 + }, + { + "auxiliary_loss_clip": 0.06457204, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06285529, + "balance_loss_mlp": 0.01258211, + "epoch": 0.3796182173455584, + "flos": 31548777310080.0, + "grad_norm": 1.7202421351204062, + "language_loss": 0.71222353, + "learning_rate": 2.849048709730083e-06, + "loss": 0.78952008, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14233398, + "step": 6314, + "time_per_iteration": 2.5876691341400146 + }, + { + "auxiliary_loss_clip": 0.06465393, + "auxiliary_loss_mlp": 0.01270992, + "balance_loss_clip": 0.06290812, + "balance_loss_mlp": 0.01254922, + "epoch": 0.37967834059822636, + "flos": 12135766066560.0, + "grad_norm": 2.8019471516683985, + "language_loss": 0.74203241, + "learning_rate": 2.848696068594545e-06, + "loss": 0.81939626, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.16064453, + "step": 6315, + "time_per_iteration": 2.5312654972076416 + }, + { + "auxiliary_loss_clip": 0.06455735, + "auxiliary_loss_mlp": 0.01269414, + "balance_loss_clip": 0.0628659, + "balance_loss_mlp": 0.01253512, + "epoch": 0.3797384638508943, + "flos": 39357989331840.0, + "grad_norm": 5.544256779510487, + "language_loss": 0.7095021, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.78675354, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15905762, + "step": 6316, + "time_per_iteration": 2.642946481704712 + }, + { + "auxiliary_loss_clip": 0.06458603, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06288237, + "balance_loss_mlp": 0.01255991, + "epoch": 0.3797985871035623, + "flos": 34061852165760.0, + "grad_norm": 2.4477129072331656, + "language_loss": 0.65612113, + "learning_rate": 2.847990689788923e-06, + "loss": 0.7334165, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1496582, + "step": 6317, + "time_per_iteration": 2.634066104888916 + }, + { + "auxiliary_loss_clip": 0.0645286, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06285463, + "balance_loss_mlp": 0.0125702, + "epoch": 0.37985871035623026, + "flos": 23228939306880.0, + "grad_norm": 1.9893651635894969, + "language_loss": 0.86348939, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.94072783, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13964844, + "step": 6318, + "time_per_iteration": 2.50665545463562 + }, + { + "auxiliary_loss_clip": 0.06460046, + "auxiliary_loss_mlp": 0.01273041, + "balance_loss_clip": 0.06287004, + "balance_loss_mlp": 0.01257675, + "epoch": 0.3799188336088982, + "flos": 18121002410880.0, + "grad_norm": 2.356531700065532, + "language_loss": 0.76647675, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.84380764, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15368652, + "step": 6319, + "time_per_iteration": 2.50382137298584 + }, + { + "auxiliary_loss_clip": 0.06453398, + "auxiliary_loss_mlp": 0.01272745, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01258082, + "epoch": 0.3799789568615662, + "flos": 21878385598080.0, + "grad_norm": 6.804259628026359, + "language_loss": 0.6451484, + "learning_rate": 2.846932380444744e-06, + "loss": 0.72240984, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14660645, + "step": 6320, + "time_per_iteration": 2.516150712966919 + }, + { + "auxiliary_loss_clip": 0.06456275, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06285265, + "balance_loss_mlp": 0.01252846, + "epoch": 0.3800390801142342, + "flos": 32971181495040.0, + "grad_norm": 1.7343317020382172, + "language_loss": 0.71855223, + "learning_rate": 2.846579546413992e-06, + "loss": 0.79579961, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.15612793, + "step": 6321, + "time_per_iteration": 2.6204988956451416 + }, + { + "auxiliary_loss_clip": 0.06458073, + "auxiliary_loss_mlp": 0.01268703, + "balance_loss_clip": 0.06285845, + "balance_loss_mlp": 0.01253784, + "epoch": 0.38009920336690217, + "flos": 26914430090880.0, + "grad_norm": 1.8398392312515923, + "language_loss": 0.75578612, + "learning_rate": 2.846226680280859e-06, + "loss": 0.83305389, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14923096, + "step": 6322, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01271033, + "balance_loss_clip": 0.06285781, + "balance_loss_mlp": 0.01256823, + "epoch": 0.38015932661957014, + "flos": 22494963467520.0, + "grad_norm": 1.8201003599281902, + "language_loss": 0.85709381, + "learning_rate": 2.845873782058725e-06, + "loss": 0.93435031, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14215088, + "step": 6323, + "time_per_iteration": 2.4927124977111816 + }, + { + "auxiliary_loss_clip": 0.06458908, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06286593, + "balance_loss_mlp": 0.01254596, + "epoch": 0.3802194498722381, + "flos": 21987440087040.0, + "grad_norm": 2.2452863694907426, + "language_loss": 0.73932886, + "learning_rate": 2.845520851760973e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.16027832, + "step": 6324, + "time_per_iteration": 2.4913861751556396 + }, + { + "auxiliary_loss_clip": 0.06464465, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06288414, + "balance_loss_mlp": 0.01257724, + "epoch": 0.38027957312490607, + "flos": 21331310290560.0, + "grad_norm": 1.7884051563809298, + "language_loss": 0.84122628, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.91860014, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15203857, + "step": 6325, + "time_per_iteration": 2.6119046211242676 + }, + { + "auxiliary_loss_clip": 0.06455745, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06285073, + "balance_loss_mlp": 0.01252712, + "epoch": 0.38033969637757403, + "flos": 16696921144320.0, + "grad_norm": 2.2200302984742915, + "language_loss": 0.79868543, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.87591028, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14019775, + "step": 6326, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.06455691, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06286497, + "balance_loss_mlp": 0.01255242, + "epoch": 0.380399819630242, + "flos": 36219741563520.0, + "grad_norm": 3.3742704435112025, + "language_loss": 0.73389304, + "learning_rate": 2.844461868547842e-06, + "loss": 0.81115204, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14978027, + "step": 6327, + "time_per_iteration": 2.649383783340454 + }, + { + "auxiliary_loss_clip": 0.06459647, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06290785, + "balance_loss_mlp": 0.01255145, + "epoch": 0.38045994288290996, + "flos": 21295364088960.0, + "grad_norm": 1.4936601975654378, + "language_loss": 0.83229524, + "learning_rate": 2.844108810081459e-06, + "loss": 0.90958202, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13867188, + "step": 6328, + "time_per_iteration": 2.527261972427368 + }, + { + "auxiliary_loss_clip": 0.06452741, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06281206, + "balance_loss_mlp": 0.01253755, + "epoch": 0.38052006613557793, + "flos": 20929151819520.0, + "grad_norm": 1.5056942690240434, + "language_loss": 0.61757982, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69479483, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.15008545, + "step": 6329, + "time_per_iteration": 2.54025936126709 + }, + { + "auxiliary_loss_clip": 0.0645529, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06283917, + "balance_loss_mlp": 0.01254037, + "epoch": 0.3805801893882459, + "flos": 20996138759040.0, + "grad_norm": 2.0488191193117316, + "language_loss": 0.56127822, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.63851297, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14160156, + "step": 6330, + "time_per_iteration": 2.4913628101348877 + }, + { + "auxiliary_loss_clip": 0.06449446, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06282543, + "balance_loss_mlp": 0.01255781, + "epoch": 0.38064031264091386, + "flos": 25565972734080.0, + "grad_norm": 1.4483276491856993, + "language_loss": 0.65912807, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73631942, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13903809, + "step": 6331, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.0645493, + "auxiliary_loss_mlp": 0.01269934, + "balance_loss_clip": 0.06284193, + "balance_loss_mlp": 0.01254312, + "epoch": 0.3807004358935818, + "flos": 15091264080000.0, + "grad_norm": 1.528944840420101, + "language_loss": 0.7597304, + "learning_rate": 2.842696256262919e-06, + "loss": 0.83697909, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15618896, + "step": 6332, + "time_per_iteration": 2.4808928966522217 + }, + { + "auxiliary_loss_clip": 0.06456427, + "auxiliary_loss_mlp": 0.01273089, + "balance_loss_clip": 0.06283183, + "balance_loss_mlp": 0.01257943, + "epoch": 0.3807605591462498, + "flos": 16405033046400.0, + "grad_norm": 2.2042220893600226, + "language_loss": 0.82397389, + "learning_rate": 2.842343037886987e-06, + "loss": 0.90126908, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15142822, + "step": 6333, + "time_per_iteration": 2.5033013820648193 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06283775, + "balance_loss_mlp": 0.01254655, + "epoch": 0.3808206823989178, + "flos": 29064353351040.0, + "grad_norm": 1.4831969327294916, + "language_loss": 0.86723578, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.9444741, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.14538574, + "step": 6334, + "time_per_iteration": 4.024240493774414 + }, + { + "auxiliary_loss_clip": 0.06455058, + "auxiliary_loss_mlp": 0.01270467, + "balance_loss_clip": 0.06282362, + "balance_loss_mlp": 0.01255155, + "epoch": 0.3808808056515858, + "flos": 15711321893760.0, + "grad_norm": 2.3448311359770795, + "language_loss": 0.79450226, + "learning_rate": 2.841636505323321e-06, + "loss": 0.87175757, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15301514, + "step": 6335, + "time_per_iteration": 2.4698357582092285 + }, + { + "auxiliary_loss_clip": 0.06453745, + "auxiliary_loss_mlp": 0.0127096, + "balance_loss_clip": 0.06281872, + "balance_loss_mlp": 0.0125517, + "epoch": 0.38094092890425374, + "flos": 20710917060480.0, + "grad_norm": 1.9128487431319638, + "language_loss": 0.72795898, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15802002, + "step": 6336, + "time_per_iteration": 3.9780919551849365 + }, + { + "auxiliary_loss_clip": 0.06449959, + "auxiliary_loss_mlp": 0.01267203, + "balance_loss_clip": 0.06281384, + "balance_loss_mlp": 0.01252826, + "epoch": 0.3810010521569217, + "flos": 20674258099200.0, + "grad_norm": 2.2277206975915362, + "language_loss": 0.69756234, + "learning_rate": 2.840929845099894e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14373779, + "step": 6337, + "time_per_iteration": 2.5475378036499023 + }, + { + "auxiliary_loss_clip": 0.06454941, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06282912, + "balance_loss_mlp": 0.012579, + "epoch": 0.38106117540958967, + "flos": 31834963330560.0, + "grad_norm": 1.987280020069696, + "language_loss": 0.64026022, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71754032, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1517334, + "step": 6338, + "time_per_iteration": 2.5795555114746094 + }, + { + "auxiliary_loss_clip": 0.06456137, + "auxiliary_loss_mlp": 0.01271603, + "balance_loss_clip": 0.06282276, + "balance_loss_mlp": 0.01255772, + "epoch": 0.38112129866225763, + "flos": 16907231692800.0, + "grad_norm": 1.6550535893348008, + "language_loss": 0.69685936, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.77413678, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15856934, + "step": 6339, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06281533, + "balance_loss_mlp": 0.01253913, + "epoch": 0.3811814219149256, + "flos": 20893624888320.0, + "grad_norm": 2.252585455539085, + "language_loss": 0.68345773, + "learning_rate": 2.839869615637177e-06, + "loss": 0.76065207, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13519287, + "step": 6340, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.06456652, + "auxiliary_loss_mlp": 0.01275426, + "balance_loss_clip": 0.06282599, + "balance_loss_mlp": 0.01260083, + "epoch": 0.38124154516759357, + "flos": 16696418019840.0, + "grad_norm": 2.4997436549257754, + "language_loss": 0.89721388, + "learning_rate": 2.839516142102522e-06, + "loss": 0.97453463, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15332031, + "step": 6341, + "time_per_iteration": 4.08266806602478 + }, + { + "auxiliary_loss_clip": 0.06461132, + "auxiliary_loss_mlp": 0.01272557, + "balance_loss_clip": 0.06284279, + "balance_loss_mlp": 0.01255427, + "epoch": 0.38130166842026153, + "flos": 19687946088960.0, + "grad_norm": 1.4891162994718032, + "language_loss": 0.75298452, + "learning_rate": 2.83916263673333e-06, + "loss": 0.83032143, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.17138672, + "step": 6342, + "time_per_iteration": 2.496697425842285 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.0125646, + "epoch": 0.3813617916729295, + "flos": 22204668597120.0, + "grad_norm": 1.7145643847071266, + "language_loss": 0.83785719, + "learning_rate": 2.838809099543007e-06, + "loss": 0.91510159, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14599609, + "step": 6343, + "time_per_iteration": 4.049302339553833 + }, + { + "auxiliary_loss_clip": 0.0645491, + "auxiliary_loss_mlp": 0.01269585, + "balance_loss_clip": 0.06281073, + "balance_loss_mlp": 0.01254905, + "epoch": 0.38142191492559746, + "flos": 19102576665600.0, + "grad_norm": 1.619462393744454, + "language_loss": 0.77529186, + "learning_rate": 2.838455530544959e-06, + "loss": 0.8525368, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14678955, + "step": 6344, + "time_per_iteration": 2.579394817352295 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01271203, + "balance_loss_clip": 0.06285504, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3814820381782654, + "flos": 24104645527680.0, + "grad_norm": 1.8871239884396722, + "language_loss": 0.74166036, + "learning_rate": 2.838101929752593e-06, + "loss": 0.81893921, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14587402, + "step": 6345, + "time_per_iteration": 2.5367093086242676 + }, + { + "auxiliary_loss_clip": 0.06457509, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01257765, + "epoch": 0.3815421614309334, + "flos": 15783927056640.0, + "grad_norm": 1.7118462514914357, + "language_loss": 0.69868183, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7759757, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14111328, + "step": 6346, + "time_per_iteration": 2.5815930366516113 + }, + { + "auxiliary_loss_clip": 0.06466204, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06290555, + "balance_loss_mlp": 0.01257236, + "epoch": 0.38160228468360136, + "flos": 19905593869440.0, + "grad_norm": 1.781545419456976, + "language_loss": 0.7611326, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.83852088, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15374756, + "step": 6347, + "time_per_iteration": 2.5027284622192383 + }, + { + "auxiliary_loss_clip": 0.06456521, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.012553, + "epoch": 0.3816624079362694, + "flos": 19287045429120.0, + "grad_norm": 1.488288802844173, + "language_loss": 0.75192666, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13702393, + "step": 6348, + "time_per_iteration": 2.559131383895874 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01270391, + "balance_loss_clip": 0.06286097, + "balance_loss_mlp": 0.01256599, + "epoch": 0.38172253118893734, + "flos": 21183752050560.0, + "grad_norm": 1.729316797973715, + "language_loss": 0.88237411, + "learning_rate": 2.836687208908142e-06, + "loss": 0.95967764, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6349, + "time_per_iteration": 2.525542974472046 + }, + { + "auxiliary_loss_clip": 0.06453095, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06281723, + "balance_loss_mlp": 0.01255149, + "epoch": 0.3817826544416053, + "flos": 17534836373760.0, + "grad_norm": 1.7576595366031973, + "language_loss": 0.76939785, + "learning_rate": 2.836333449345341e-06, + "loss": 0.84662628, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14593506, + "step": 6350, + "time_per_iteration": 2.532376289367676 + }, + { + "auxiliary_loss_clip": 0.06458531, + "auxiliary_loss_mlp": 0.01273484, + "balance_loss_clip": 0.06286063, + "balance_loss_mlp": 0.01258231, + "epoch": 0.38184277769427327, + "flos": 16332176321280.0, + "grad_norm": 2.21296257119241, + "language_loss": 0.77054518, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.84786528, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15264893, + "step": 6351, + "time_per_iteration": 2.4930031299591064 + }, + { + "auxiliary_loss_clip": 0.06457832, + "auxiliary_loss_mlp": 0.01273263, + "balance_loss_clip": 0.0628476, + "balance_loss_mlp": 0.012577, + "epoch": 0.38190290094694124, + "flos": 30450937115520.0, + "grad_norm": 2.2550067272061254, + "language_loss": 0.74895489, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.82626581, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15563965, + "step": 6352, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.01270341, + "balance_loss_clip": 0.06283389, + "balance_loss_mlp": 0.0125659, + "epoch": 0.3819630241996092, + "flos": 14215138588800.0, + "grad_norm": 2.0554991668998777, + "language_loss": 0.63961715, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.71684647, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6353, + "time_per_iteration": 2.476759433746338 + }, + { + "auxiliary_loss_clip": 0.06456264, + "auxiliary_loss_mlp": 0.01279815, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01266112, + "epoch": 0.38202314745227717, + "flos": 25016717220480.0, + "grad_norm": 1.720129608989886, + "language_loss": 0.83556378, + "learning_rate": 2.834918094089816e-06, + "loss": 0.91292459, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13702393, + "step": 6354, + "time_per_iteration": 2.5726418495178223 + }, + { + "auxiliary_loss_clip": 0.06456912, + "auxiliary_loss_mlp": 0.01271961, + "balance_loss_clip": 0.06290418, + "balance_loss_mlp": 0.0125911, + "epoch": 0.38208327070494513, + "flos": 20820935871360.0, + "grad_norm": 1.6482101436629937, + "language_loss": 0.81480742, + "learning_rate": 2.834564176091943e-06, + "loss": 0.89209616, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.12854004, + "step": 6355, + "time_per_iteration": 2.5225114822387695 + }, + { + "auxiliary_loss_clip": 0.06459523, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06289364, + "balance_loss_mlp": 0.01259179, + "epoch": 0.3821433939576131, + "flos": 22644282643200.0, + "grad_norm": 1.8808367718392982, + "language_loss": 0.75647783, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.83380532, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 6356, + "time_per_iteration": 2.5584537982940674 + }, + { + "auxiliary_loss_clip": 0.0646046, + "auxiliary_loss_mlp": 0.01272045, + "balance_loss_clip": 0.06287301, + "balance_loss_mlp": 0.01257645, + "epoch": 0.38220351721028106, + "flos": 26877100296960.0, + "grad_norm": 1.8976132208861074, + "language_loss": 0.82161039, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89893544, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14398193, + "step": 6357, + "time_per_iteration": 2.546190023422241 + }, + { + "auxiliary_loss_clip": 0.06463508, + "auxiliary_loss_mlp": 0.01275628, + "balance_loss_clip": 0.0629019, + "balance_loss_mlp": 0.01260035, + "epoch": 0.38226364046294903, + "flos": 23374149632640.0, + "grad_norm": 1.7334885634957151, + "language_loss": 0.78531659, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.86270791, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15612793, + "step": 6358, + "time_per_iteration": 2.5330071449279785 + }, + { + "auxiliary_loss_clip": 0.06462916, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06287834, + "balance_loss_mlp": 0.01256086, + "epoch": 0.382323763715617, + "flos": 19652335303680.0, + "grad_norm": 1.9007754709735623, + "language_loss": 0.79191673, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.86925954, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15270996, + "step": 6359, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.06457044, + "auxiliary_loss_mlp": 0.01275796, + "balance_loss_clip": 0.06287733, + "balance_loss_mlp": 0.01261884, + "epoch": 0.38238388696828496, + "flos": 54136527575040.0, + "grad_norm": 1.6591220194179586, + "language_loss": 0.70001733, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.77734572, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13903809, + "step": 6360, + "time_per_iteration": 2.8067054748535156 + }, + { + "auxiliary_loss_clip": 0.06461466, + "auxiliary_loss_mlp": 0.01277777, + "balance_loss_clip": 0.06292595, + "balance_loss_mlp": 0.01262923, + "epoch": 0.382444010220953, + "flos": 24943105808640.0, + "grad_norm": 1.5737902616354833, + "language_loss": 0.79093289, + "learning_rate": 2.83244000399261e-06, + "loss": 0.86832535, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14849854, + "step": 6361, + "time_per_iteration": 2.558579683303833 + }, + { + "auxiliary_loss_clip": 0.0645285, + "auxiliary_loss_mlp": 0.01272146, + "balance_loss_clip": 0.06286099, + "balance_loss_mlp": 0.01257996, + "epoch": 0.38250413347362094, + "flos": 42346750216320.0, + "grad_norm": 1.4645255919949542, + "language_loss": 0.65580732, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73305726, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14154053, + "step": 6362, + "time_per_iteration": 2.709390878677368 + }, + { + "auxiliary_loss_clip": 0.06459438, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06287294, + "balance_loss_mlp": 0.01255415, + "epoch": 0.3825642567262889, + "flos": 16294720746240.0, + "grad_norm": 1.6166481183320216, + "language_loss": 0.8211807, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.89848268, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15332031, + "step": 6363, + "time_per_iteration": 2.468846559524536 + }, + { + "auxiliary_loss_clip": 0.06453779, + "auxiliary_loss_mlp": 0.01274743, + "balance_loss_clip": 0.06286556, + "balance_loss_mlp": 0.01259401, + "epoch": 0.3826243799789569, + "flos": 45664267795200.0, + "grad_norm": 1.6258867054195516, + "language_loss": 0.59107661, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.6683619, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15356445, + "step": 6364, + "time_per_iteration": 2.745589256286621 + }, + { + "auxiliary_loss_clip": 0.06465845, + "auxiliary_loss_mlp": 0.0127531, + "balance_loss_clip": 0.06290866, + "balance_loss_mlp": 0.01261058, + "epoch": 0.38268450323162484, + "flos": 25308647245440.0, + "grad_norm": 2.2940920681906873, + "language_loss": 0.6951021, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.77251363, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14257812, + "step": 6365, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.06461614, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06285347, + "balance_loss_mlp": 0.0125451, + "epoch": 0.3827446264842928, + "flos": 21842607104640.0, + "grad_norm": 2.2040506714686208, + "language_loss": 0.73211187, + "learning_rate": 2.830668992382758e-06, + "loss": 0.8094269, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15374756, + "step": 6366, + "time_per_iteration": 2.527252435684204 + }, + { + "auxiliary_loss_clip": 0.06455328, + "auxiliary_loss_mlp": 0.01270912, + "balance_loss_clip": 0.06284537, + "balance_loss_mlp": 0.0125703, + "epoch": 0.38280474973696077, + "flos": 25740924059520.0, + "grad_norm": 2.537372436592335, + "language_loss": 0.69208872, + "learning_rate": 2.830314695509902e-06, + "loss": 0.76935112, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13885498, + "step": 6367, + "time_per_iteration": 2.563174247741699 + }, + { + "auxiliary_loss_clip": 0.06445135, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06281811, + "balance_loss_mlp": 0.01253482, + "epoch": 0.38286487298962874, + "flos": 24902212216320.0, + "grad_norm": 2.529219827632029, + "language_loss": 0.64519894, + "learning_rate": 2.82996036715143e-06, + "loss": 0.72232389, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13897705, + "step": 6368, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.0644632, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01255111, + "epoch": 0.3829249962422967, + "flos": 28550457060480.0, + "grad_norm": 1.3073196657605344, + "language_loss": 0.68441451, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76156569, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13677979, + "step": 6369, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.06452611, + "auxiliary_loss_mlp": 0.01268713, + "balance_loss_clip": 0.0628352, + "balance_loss_mlp": 0.01254724, + "epoch": 0.38298511949496467, + "flos": 21477736500480.0, + "grad_norm": 1.6896603918496267, + "language_loss": 0.79100078, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86821401, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13995361, + "step": 6370, + "time_per_iteration": 2.5265746116638184 + }, + { + "auxiliary_loss_clip": 0.06451623, + "auxiliary_loss_mlp": 0.0127085, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01256265, + "epoch": 0.38304524274763263, + "flos": 31687027747200.0, + "grad_norm": 2.908092380852583, + "language_loss": 0.651667, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.72889173, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.06459577, + "auxiliary_loss_mlp": 0.01272301, + "balance_loss_clip": 0.06283382, + "balance_loss_mlp": 0.01257543, + "epoch": 0.3831053660003006, + "flos": 25082865619200.0, + "grad_norm": 2.362243450203488, + "language_loss": 0.73142469, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.80874348, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14746094, + "step": 6372, + "time_per_iteration": 2.5150070190429688 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01266707, + "balance_loss_clip": 0.06282556, + "balance_loss_mlp": 0.01252485, + "epoch": 0.38316548925296856, + "flos": 23265849830400.0, + "grad_norm": 1.5439174716844835, + "language_loss": 0.85255867, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92977273, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14221191, + "step": 6373, + "time_per_iteration": 4.056765794754028 + }, + { + "auxiliary_loss_clip": 0.0645606, + "auxiliary_loss_mlp": 0.01272183, + "balance_loss_clip": 0.06281903, + "balance_loss_mlp": 0.01257431, + "epoch": 0.3832256125056366, + "flos": 34432131358080.0, + "grad_norm": 8.29118461423438, + "language_loss": 0.75127506, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.82855743, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14758301, + "step": 6374, + "time_per_iteration": 2.739825963973999 + }, + { + "auxiliary_loss_clip": 0.06457414, + "auxiliary_loss_mlp": 0.01272454, + "balance_loss_clip": 0.0628335, + "balance_loss_mlp": 0.01258042, + "epoch": 0.38328573575830455, + "flos": 21769289182080.0, + "grad_norm": 1.9434329018980874, + "language_loss": 0.76033717, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.83763582, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14416504, + "step": 6375, + "time_per_iteration": 2.521092176437378 + }, + { + "auxiliary_loss_clip": 0.06457017, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06283681, + "balance_loss_mlp": 0.01252541, + "epoch": 0.3833458590109725, + "flos": 17385056000640.0, + "grad_norm": 2.081333613596134, + "language_loss": 0.73067588, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.80791855, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1472168, + "step": 6376, + "time_per_iteration": 3.913828134536743 + }, + { + "auxiliary_loss_clip": 0.06451094, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06282462, + "balance_loss_mlp": 0.01251294, + "epoch": 0.3834059822636405, + "flos": 29432326556160.0, + "grad_norm": 1.6469866452188906, + "language_loss": 0.68444526, + "learning_rate": 2.826769997289796e-06, + "loss": 0.76161826, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14916992, + "step": 6377, + "time_per_iteration": 2.552703857421875 + }, + { + "auxiliary_loss_clip": 0.0646103, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01253413, + "epoch": 0.38346610551630844, + "flos": 21477191448960.0, + "grad_norm": 1.937210921117629, + "language_loss": 0.73608565, + "learning_rate": 2.826415354814344e-06, + "loss": 0.8133859, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15582275, + "step": 6378, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.06455162, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283469, + "balance_loss_mlp": 0.01257661, + "epoch": 0.3835262287689764, + "flos": 27568253900160.0, + "grad_norm": 1.6187724503548255, + "language_loss": 0.69142127, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76869053, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.14099121, + "step": 6379, + "time_per_iteration": 2.540184736251831 + }, + { + "auxiliary_loss_clip": 0.06449591, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01258209, + "epoch": 0.3835863520216444, + "flos": 15529201044480.0, + "grad_norm": 1.7677581121541173, + "language_loss": 0.8420229, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.91923743, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13659668, + "step": 6380, + "time_per_iteration": 3.9425628185272217 + }, + { + "auxiliary_loss_clip": 0.06454644, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06286694, + "balance_loss_mlp": 0.01255786, + "epoch": 0.38364647527431234, + "flos": 21910851855360.0, + "grad_norm": 1.4264464063638025, + "language_loss": 0.81255281, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.88980293, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14569092, + "step": 6381, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.06363897, + "auxiliary_loss_mlp": 0.0126892, + "balance_loss_clip": 0.06286111, + "balance_loss_mlp": 0.01265082, + "epoch": 0.3837065985269803, + "flos": 65553076120320.0, + "grad_norm": 0.8198763586735168, + "language_loss": 0.60085058, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.67717874, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 6382, + "time_per_iteration": 3.1118690967559814 + }, + { + "auxiliary_loss_clip": 0.06458844, + "auxiliary_loss_mlp": 0.0127264, + "balance_loss_clip": 0.06285119, + "balance_loss_mlp": 0.01257375, + "epoch": 0.38376672177964827, + "flos": 28264103331840.0, + "grad_norm": 2.361672223919581, + "language_loss": 0.67004663, + "learning_rate": 2.824641672639794e-06, + "loss": 0.74736154, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.15264893, + "step": 6383, + "time_per_iteration": 3.949587345123291 + }, + { + "auxiliary_loss_clip": 0.06458098, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.06285569, + "balance_loss_mlp": 0.01255919, + "epoch": 0.38382684503231623, + "flos": 20637641064960.0, + "grad_norm": 1.580160930907899, + "language_loss": 0.75169957, + "learning_rate": 2.824286842339587e-06, + "loss": 0.82898319, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14355469, + "step": 6384, + "time_per_iteration": 2.5578341484069824 + }, + { + "auxiliary_loss_clip": 0.0645394, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06286485, + "balance_loss_mlp": 0.01259819, + "epoch": 0.3838869682849842, + "flos": 19611274003200.0, + "grad_norm": 1.4416039952500834, + "language_loss": 0.76348937, + "learning_rate": 2.823931980782341e-06, + "loss": 0.84075809, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6385, + "time_per_iteration": 2.5225770473480225 + }, + { + "auxiliary_loss_clip": 0.06357871, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06280675, + "balance_loss_mlp": 0.01261296, + "epoch": 0.38394709153765216, + "flos": 56572202856960.0, + "grad_norm": 1.1093406194632214, + "language_loss": 0.67841589, + "learning_rate": 2.82357708798151e-06, + "loss": 0.75464916, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.04168701, + "step": 6386, + "time_per_iteration": 3.0481390953063965 + }, + { + "auxiliary_loss_clip": 0.06453113, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06286535, + "balance_loss_mlp": 0.01254777, + "epoch": 0.3840072147903202, + "flos": 15894323210880.0, + "grad_norm": 1.5665063027995272, + "language_loss": 0.72740716, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.80462623, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6387, + "time_per_iteration": 2.514692783355713 + }, + { + "auxiliary_loss_clip": 0.06447147, + "auxiliary_loss_mlp": 0.01275854, + "balance_loss_clip": 0.06283197, + "balance_loss_mlp": 0.0126187, + "epoch": 0.38406733804298815, + "flos": 28225180310400.0, + "grad_norm": 2.2869557055676095, + "language_loss": 0.81707162, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89430165, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13989258, + "step": 6388, + "time_per_iteration": 2.6592257022857666 + }, + { + "auxiliary_loss_clip": 0.06454118, + "auxiliary_loss_mlp": 0.01267752, + "balance_loss_clip": 0.0628527, + "balance_loss_mlp": 0.01253888, + "epoch": 0.3841274612956561, + "flos": 18229511848320.0, + "grad_norm": 1.6912658906890043, + "language_loss": 0.76762819, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84484684, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.13873291, + "step": 6389, + "time_per_iteration": 2.5315403938293457 + }, + { + "auxiliary_loss_clip": 0.06454799, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06281878, + "balance_loss_mlp": 0.01254847, + "epoch": 0.3841875845483241, + "flos": 19799138856960.0, + "grad_norm": 1.6723623276481432, + "language_loss": 0.76991975, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.84717548, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15905762, + "step": 6390, + "time_per_iteration": 2.5315029621124268 + }, + { + "auxiliary_loss_clip": 0.0646126, + "auxiliary_loss_mlp": 0.01271779, + "balance_loss_clip": 0.06286746, + "balance_loss_mlp": 0.01255572, + "epoch": 0.38424770780099204, + "flos": 29906670919680.0, + "grad_norm": 1.876202489708209, + "language_loss": 0.70321602, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.1618042, + "step": 6391, + "time_per_iteration": 2.6110270023345947 + }, + { + "auxiliary_loss_clip": 0.06455616, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.06284156, + "balance_loss_mlp": 0.01258499, + "epoch": 0.38430783105366, + "flos": 20820013476480.0, + "grad_norm": 1.8135855175826887, + "language_loss": 0.83923954, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.91652524, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14459229, + "step": 6392, + "time_per_iteration": 2.5735576152801514 + }, + { + "auxiliary_loss_clip": 0.06461488, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06290185, + "balance_loss_mlp": 0.01255162, + "epoch": 0.384367954306328, + "flos": 11003153627520.0, + "grad_norm": 1.9242234625767662, + "language_loss": 0.61454862, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.69185179, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13677979, + "step": 6393, + "time_per_iteration": 2.4626450538635254 + }, + { + "auxiliary_loss_clip": 0.06467697, + "auxiliary_loss_mlp": 0.01271997, + "balance_loss_clip": 0.06290497, + "balance_loss_mlp": 0.01256071, + "epoch": 0.38442807755899594, + "flos": 25345096571520.0, + "grad_norm": 2.1306446802295325, + "language_loss": 0.71410203, + "learning_rate": 2.820736822421029e-06, + "loss": 0.79149896, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.15905762, + "step": 6394, + "time_per_iteration": 2.5997071266174316 + }, + { + "auxiliary_loss_clip": 0.06463788, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0628664, + "balance_loss_mlp": 0.01254082, + "epoch": 0.3844882008116639, + "flos": 21076206935040.0, + "grad_norm": 1.9216116882295546, + "language_loss": 0.82087183, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.89820337, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1529541, + "step": 6395, + "time_per_iteration": 2.517411470413208 + }, + { + "auxiliary_loss_clip": 0.06460339, + "auxiliary_loss_mlp": 0.01275993, + "balance_loss_clip": 0.06287727, + "balance_loss_mlp": 0.01261831, + "epoch": 0.38454832406433187, + "flos": 17968287144960.0, + "grad_norm": 2.112818402600052, + "language_loss": 0.70801687, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.78538024, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14160156, + "step": 6396, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.06365301, + "auxiliary_loss_mlp": 0.01257609, + "balance_loss_clip": 0.06287754, + "balance_loss_mlp": 0.01253767, + "epoch": 0.38460844731699984, + "flos": 67946641925760.0, + "grad_norm": 0.873922952794391, + "language_loss": 0.59863293, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.67486203, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0383606, + "step": 6397, + "time_per_iteration": 3.206678628921509 + }, + { + "auxiliary_loss_clip": 0.06450997, + "auxiliary_loss_mlp": 0.01276354, + "balance_loss_clip": 0.06284742, + "balance_loss_mlp": 0.0126187, + "epoch": 0.3846685705696678, + "flos": 25856267604480.0, + "grad_norm": 1.772406293141946, + "language_loss": 0.85227352, + "learning_rate": 2.819315942271794e-06, + "loss": 0.92954701, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14477539, + "step": 6398, + "time_per_iteration": 2.5761947631835938 + }, + { + "auxiliary_loss_clip": 0.06453151, + "auxiliary_loss_mlp": 0.01277177, + "balance_loss_clip": 0.06285614, + "balance_loss_mlp": 0.01262467, + "epoch": 0.38472869382233577, + "flos": 16295852776320.0, + "grad_norm": 2.386881726324987, + "language_loss": 0.80489028, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.88219357, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14715576, + "step": 6399, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.06455526, + "auxiliary_loss_mlp": 0.01279196, + "balance_loss_clip": 0.06283697, + "balance_loss_mlp": 0.01263592, + "epoch": 0.38478881707500373, + "flos": 19358979759360.0, + "grad_norm": 1.8772073039605681, + "language_loss": 0.67565721, + "learning_rate": 2.818605315732038e-06, + "loss": 0.75300437, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15588379, + "step": 6400, + "time_per_iteration": 2.5162830352783203 + }, + { + "auxiliary_loss_clip": 0.06460319, + "auxiliary_loss_mlp": 0.01269914, + "balance_loss_clip": 0.06288355, + "balance_loss_mlp": 0.01255454, + "epoch": 0.38484894032767175, + "flos": 24867356117760.0, + "grad_norm": 1.6933093627789975, + "language_loss": 0.7382642, + "learning_rate": 2.81824995589303e-06, + "loss": 0.81556654, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14459229, + "step": 6401, + "time_per_iteration": 2.5274739265441895 + }, + { + "auxiliary_loss_clip": 0.06457724, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 0.06285743, + "balance_loss_mlp": 0.01262296, + "epoch": 0.3849090635803397, + "flos": 14507068613760.0, + "grad_norm": 1.836175131611194, + "language_loss": 0.72368169, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.80103827, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15637207, + "step": 6402, + "time_per_iteration": 2.509624481201172 + }, + { + "auxiliary_loss_clip": 0.06455728, + "auxiliary_loss_mlp": 0.01275333, + "balance_loss_clip": 0.06288305, + "balance_loss_mlp": 0.01261195, + "epoch": 0.3849691868330077, + "flos": 18521903070720.0, + "grad_norm": 1.8063322577059318, + "language_loss": 0.83321881, + "learning_rate": 2.817539143144128e-06, + "loss": 0.91052943, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14147949, + "step": 6403, + "time_per_iteration": 2.469576835632324 + }, + { + "auxiliary_loss_clip": 0.06451748, + "auxiliary_loss_mlp": 0.01274136, + "balance_loss_clip": 0.06283461, + "balance_loss_mlp": 0.01259813, + "epoch": 0.38502931008567565, + "flos": 21622821045120.0, + "grad_norm": 1.901744090638215, + "language_loss": 0.83685166, + "learning_rate": 2.817183690261189e-06, + "loss": 0.91411054, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14331055, + "step": 6404, + "time_per_iteration": 2.53399920463562 + }, + { + "auxiliary_loss_clip": 0.06460617, + "auxiliary_loss_mlp": 0.01279935, + "balance_loss_clip": 0.06287636, + "balance_loss_mlp": 0.01265844, + "epoch": 0.3850894333383436, + "flos": 25423152249600.0, + "grad_norm": 1.4804001380923333, + "language_loss": 0.70053053, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77793604, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14105225, + "step": 6405, + "time_per_iteration": 2.577394485473633 + }, + { + "auxiliary_loss_clip": 0.06446706, + "auxiliary_loss_mlp": 0.01276604, + "balance_loss_clip": 0.06280848, + "balance_loss_mlp": 0.01263628, + "epoch": 0.3851495565910116, + "flos": 20233721658240.0, + "grad_norm": 1.9002503642999313, + "language_loss": 0.7926501, + "learning_rate": 2.816472691545729e-06, + "loss": 0.86988324, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12976074, + "step": 6406, + "time_per_iteration": 2.491785764694214 + }, + { + "auxiliary_loss_clip": 0.06454885, + "auxiliary_loss_mlp": 0.01271692, + "balance_loss_clip": 0.06282916, + "balance_loss_mlp": 0.01256516, + "epoch": 0.38520967984367954, + "flos": 16514045608320.0, + "grad_norm": 2.2453520034380463, + "language_loss": 0.84628403, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.92354977, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1517334, + "step": 6407, + "time_per_iteration": 2.461927890777588 + }, + { + "auxiliary_loss_clip": 0.06351051, + "auxiliary_loss_mlp": 0.01274061, + "balance_loss_clip": 0.06273395, + "balance_loss_mlp": 0.01270625, + "epoch": 0.3852698030963475, + "flos": 61333088140800.0, + "grad_norm": 0.7518927461814024, + "language_loss": 0.64829391, + "learning_rate": 2.815761568987365e-06, + "loss": 0.72454506, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03445435, + "step": 6408, + "time_per_iteration": 3.195535659790039 + }, + { + "auxiliary_loss_clip": 0.06454469, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06283102, + "balance_loss_mlp": 0.01256383, + "epoch": 0.3853299263490155, + "flos": 22899595633920.0, + "grad_norm": 1.3862214198415879, + "language_loss": 0.73785079, + "learning_rate": 2.8154059613008e-06, + "loss": 0.8151083, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14904785, + "step": 6409, + "time_per_iteration": 2.5463829040527344 + }, + { + "auxiliary_loss_clip": 0.06465833, + "auxiliary_loss_mlp": 0.01272782, + "balance_loss_clip": 0.06287792, + "balance_loss_mlp": 0.01257667, + "epoch": 0.38539004960168344, + "flos": 20053655233920.0, + "grad_norm": 2.2638026574615076, + "language_loss": 0.70597708, + "learning_rate": 2.81505032269396e-06, + "loss": 0.78336322, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.15100098, + "step": 6410, + "time_per_iteration": 2.4989383220672607 + }, + { + "auxiliary_loss_clip": 0.06347367, + "auxiliary_loss_mlp": 0.01259072, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01255689, + "epoch": 0.3854501728543514, + "flos": 68752971365760.0, + "grad_norm": 0.6472142759451909, + "language_loss": 0.6009953, + "learning_rate": 2.81469465318033e-06, + "loss": 0.67705965, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03390503, + "step": 6411, + "time_per_iteration": 3.221977472305298 + }, + { + "auxiliary_loss_clip": 0.06456396, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06285078, + "balance_loss_mlp": 0.01257266, + "epoch": 0.38551029610701937, + "flos": 20491214855040.0, + "grad_norm": 1.7976443608036217, + "language_loss": 0.78197634, + "learning_rate": 2.814338952773397e-06, + "loss": 0.85925543, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.14245605, + "step": 6412, + "time_per_iteration": 2.5103437900543213 + }, + { + "auxiliary_loss_clip": 0.06460511, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 0.06287103, + "balance_loss_mlp": 0.01255267, + "epoch": 0.38557041935968733, + "flos": 23477627825280.0, + "grad_norm": 1.8586112834781277, + "language_loss": 0.78031844, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.85764652, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.17041016, + "step": 6413, + "time_per_iteration": 3.933619499206543 + }, + { + "auxiliary_loss_clip": 0.06342902, + "auxiliary_loss_mlp": 0.01258937, + "balance_loss_clip": 0.06265719, + "balance_loss_mlp": 0.01255421, + "epoch": 0.38563054261235535, + "flos": 63984623068800.0, + "grad_norm": 0.7920557210391271, + "language_loss": 0.61310911, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6891275, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.03527832, + "step": 6414, + "time_per_iteration": 3.063016891479492 + }, + { + "auxiliary_loss_clip": 0.06460327, + "auxiliary_loss_mlp": 0.0126994, + "balance_loss_clip": 0.06286235, + "balance_loss_mlp": 0.01255552, + "epoch": 0.3856906658650233, + "flos": 23994584789760.0, + "grad_norm": 1.981122511442252, + "language_loss": 0.78303337, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.86033607, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14379883, + "step": 6415, + "time_per_iteration": 3.915883779525757 + }, + { + "auxiliary_loss_clip": 0.06448652, + "auxiliary_loss_mlp": 0.0126708, + "balance_loss_clip": 0.06285002, + "balance_loss_mlp": 0.01253842, + "epoch": 0.3857507891176913, + "flos": 25014075816960.0, + "grad_norm": 1.7132059772930233, + "language_loss": 0.8030045, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.88016176, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13244629, + "step": 6416, + "time_per_iteration": 2.5699849128723145 + }, + { + "auxiliary_loss_clip": 0.06451176, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01256353, + "epoch": 0.38581091237035925, + "flos": 21542082036480.0, + "grad_norm": 1.7425936217489657, + "language_loss": 0.79650658, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.87372106, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13909912, + "step": 6417, + "time_per_iteration": 2.490114450454712 + }, + { + "auxiliary_loss_clip": 0.06448381, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01252602, + "epoch": 0.3858710356230272, + "flos": 17389584120960.0, + "grad_norm": 1.6880082960892822, + "language_loss": 0.80518526, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.88233447, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13922119, + "step": 6418, + "time_per_iteration": 2.5246312618255615 + }, + { + "auxiliary_loss_clip": 0.06443715, + "auxiliary_loss_mlp": 0.01268216, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01254662, + "epoch": 0.3859311588756952, + "flos": 20345836821120.0, + "grad_norm": 1.685120659988575, + "language_loss": 0.79909503, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.87621439, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13568115, + "step": 6419, + "time_per_iteration": 3.9288835525512695 + }, + { + "auxiliary_loss_clip": 0.06446663, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06280138, + "balance_loss_mlp": 0.01254745, + "epoch": 0.38599128212836314, + "flos": 26328054418560.0, + "grad_norm": 1.9252922162684358, + "language_loss": 0.67831242, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.75548029, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15362549, + "step": 6420, + "time_per_iteration": 2.5568132400512695 + }, + { + "auxiliary_loss_clip": 0.06447464, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06282772, + "balance_loss_mlp": 0.01260267, + "epoch": 0.3860514053810311, + "flos": 13559050719360.0, + "grad_norm": 1.8138727093850848, + "language_loss": 0.81903851, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89625287, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13690186, + "step": 6421, + "time_per_iteration": 2.6095190048217773 + }, + { + "auxiliary_loss_clip": 0.06448883, + "auxiliary_loss_mlp": 0.01270223, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01254654, + "epoch": 0.3861115286336991, + "flos": 20959689432960.0, + "grad_norm": 1.9472147710185277, + "language_loss": 0.72463268, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.80182374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15576172, + "step": 6422, + "time_per_iteration": 3.9032654762268066 + }, + { + "auxiliary_loss_clip": 0.06443937, + "auxiliary_loss_mlp": 0.01268443, + "balance_loss_clip": 0.06280221, + "balance_loss_mlp": 0.01254925, + "epoch": 0.38617165188636704, + "flos": 16368290231040.0, + "grad_norm": 1.6312257254810183, + "language_loss": 0.66935605, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.74647987, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13531494, + "step": 6423, + "time_per_iteration": 2.4858603477478027 + }, + { + "auxiliary_loss_clip": 0.06452656, + "auxiliary_loss_mlp": 0.01269446, + "balance_loss_clip": 0.06281117, + "balance_loss_mlp": 0.01254771, + "epoch": 0.386231775139035, + "flos": 34795828005120.0, + "grad_norm": 1.7836916741722195, + "language_loss": 0.69448572, + "learning_rate": 2.810068143123449e-06, + "loss": 0.77170676, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14685059, + "step": 6424, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.06446116, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 0.0628031, + "balance_loss_mlp": 0.0125616, + "epoch": 0.38629189839170297, + "flos": 21732672147840.0, + "grad_norm": 1.4876753960050375, + "language_loss": 0.72829968, + "learning_rate": 2.809712042331429e-06, + "loss": 0.80545902, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13677979, + "step": 6425, + "time_per_iteration": 2.520872116088867 + }, + { + "auxiliary_loss_clip": 0.06454374, + "auxiliary_loss_mlp": 0.01269159, + "balance_loss_clip": 0.06279134, + "balance_loss_mlp": 0.01254383, + "epoch": 0.38635202164437094, + "flos": 27930315392640.0, + "grad_norm": 3.253764220801107, + "language_loss": 0.8113848, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88862014, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14752197, + "step": 6426, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.06458677, + "auxiliary_loss_mlp": 0.01277199, + "balance_loss_clip": 0.06288534, + "balance_loss_mlp": 0.01261797, + "epoch": 0.38641214489703896, + "flos": 23593390640640.0, + "grad_norm": 1.9966810796758758, + "language_loss": 0.75299263, + "learning_rate": 2.80899974864781e-06, + "loss": 0.83035141, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.15393066, + "step": 6427, + "time_per_iteration": 2.538494825363159 + }, + { + "auxiliary_loss_clip": 0.06449243, + "auxiliary_loss_mlp": 0.01269948, + "balance_loss_clip": 0.0627961, + "balance_loss_mlp": 0.01255512, + "epoch": 0.3864722681497069, + "flos": 12646224339840.0, + "grad_norm": 1.7399599530073546, + "language_loss": 0.70451963, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78171146, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 6428, + "time_per_iteration": 2.501620292663574 + }, + { + "auxiliary_loss_clip": 0.06450263, + "auxiliary_loss_mlp": 0.01273584, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.01259517, + "epoch": 0.3865323914023749, + "flos": 17604003519360.0, + "grad_norm": 1.9791686977360912, + "language_loss": 0.84605539, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.92329377, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14074707, + "step": 6429, + "time_per_iteration": 2.4769797325134277 + }, + { + "auxiliary_loss_clip": 0.06453393, + "auxiliary_loss_mlp": 0.01272687, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01258679, + "epoch": 0.38659251465504285, + "flos": 18484908693120.0, + "grad_norm": 1.8799663311521415, + "language_loss": 0.81149292, + "learning_rate": 2.807931078076015e-06, + "loss": 0.88875371, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13995361, + "step": 6430, + "time_per_iteration": 2.552243232727051 + }, + { + "auxiliary_loss_clip": 0.06342202, + "auxiliary_loss_mlp": 0.0126596, + "balance_loss_clip": 0.06266356, + "balance_loss_mlp": 0.0126256, + "epoch": 0.3866526379077108, + "flos": 64186533480960.0, + "grad_norm": 0.7018569193916078, + "language_loss": 0.58841789, + "learning_rate": 2.807574793260416e-06, + "loss": 0.66449958, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03408813, + "step": 6431, + "time_per_iteration": 3.1865365505218506 + }, + { + "auxiliary_loss_clip": 0.06457522, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06283836, + "balance_loss_mlp": 0.01253464, + "epoch": 0.3867127611603788, + "flos": 14392857098880.0, + "grad_norm": 1.8389423140015868, + "language_loss": 0.79719216, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.87445116, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14910889, + "step": 6432, + "time_per_iteration": 2.5060834884643555 + }, + { + "auxiliary_loss_clip": 0.06456694, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 0.06279335, + "balance_loss_mlp": 0.01259217, + "epoch": 0.38677288441304675, + "flos": 20016870491520.0, + "grad_norm": 2.041684818915054, + "language_loss": 0.80982423, + "learning_rate": 2.806862131772779e-06, + "loss": 0.88713682, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15350342, + "step": 6433, + "time_per_iteration": 2.4978644847869873 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01268045, + "balance_loss_clip": 0.06280582, + "balance_loss_mlp": 0.01251725, + "epoch": 0.3868330076657147, + "flos": 22243465837440.0, + "grad_norm": 1.5518308416482827, + "language_loss": 0.71316475, + "learning_rate": 2.806505755127765e-06, + "loss": 0.79036534, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.16308594, + "step": 6434, + "time_per_iteration": 2.5623676776885986 + }, + { + "auxiliary_loss_clip": 0.06457677, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01254547, + "epoch": 0.3868931309183827, + "flos": 16733076981120.0, + "grad_norm": 1.5292505515468358, + "language_loss": 0.77740347, + "learning_rate": 2.806149347899972e-06, + "loss": 0.85467923, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15350342, + "step": 6435, + "time_per_iteration": 2.4930777549743652 + }, + { + "auxiliary_loss_clip": 0.06446007, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 0.0627854, + "balance_loss_mlp": 0.01257594, + "epoch": 0.38695325417105064, + "flos": 22681360874880.0, + "grad_norm": 2.334489182765127, + "language_loss": 0.79902756, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87621707, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.15362549, + "step": 6436, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.06446151, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628051, + "balance_loss_mlp": 0.01255312, + "epoch": 0.3870133774237186, + "flos": 23118668933760.0, + "grad_norm": 1.736913277816888, + "language_loss": 0.77232099, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.84947503, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13934326, + "step": 6437, + "time_per_iteration": 2.6555299758911133 + }, + { + "auxiliary_loss_clip": 0.064465, + "auxiliary_loss_mlp": 0.01272869, + "balance_loss_clip": 0.06279578, + "balance_loss_mlp": 0.01259422, + "epoch": 0.3870735006763866, + "flos": 17681430291840.0, + "grad_norm": 2.573442514460841, + "language_loss": 0.81961322, + "learning_rate": 2.805079942855074e-06, + "loss": 0.89680696, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13452148, + "step": 6438, + "time_per_iteration": 2.55658221244812 + }, + { + "auxiliary_loss_clip": 0.06449786, + "auxiliary_loss_mlp": 0.01268651, + "balance_loss_clip": 0.06278464, + "balance_loss_mlp": 0.01253869, + "epoch": 0.38713362392905454, + "flos": 23302676499840.0, + "grad_norm": 1.3535213690135137, + "language_loss": 0.75684851, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83403289, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14782715, + "step": 6439, + "time_per_iteration": 2.5023999214172363 + }, + { + "auxiliary_loss_clip": 0.06452194, + "auxiliary_loss_mlp": 0.01275332, + "balance_loss_clip": 0.06287295, + "balance_loss_mlp": 0.0126083, + "epoch": 0.38719374718172256, + "flos": 21037283913600.0, + "grad_norm": 2.8624272787557556, + "language_loss": 0.74227071, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81954598, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1449585, + "step": 6440, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.06454886, + "auxiliary_loss_mlp": 0.01272767, + "balance_loss_clip": 0.06279822, + "balance_loss_mlp": 0.01257401, + "epoch": 0.3872538704343905, + "flos": 19615885977600.0, + "grad_norm": 1.8472167429080706, + "language_loss": 0.82205182, + "learning_rate": 2.804010263051774e-06, + "loss": 0.89932835, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.15368652, + "step": 6441, + "time_per_iteration": 2.4829154014587402 + }, + { + "auxiliary_loss_clip": 0.06449816, + "auxiliary_loss_mlp": 0.01273448, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01258833, + "epoch": 0.3873139936870585, + "flos": 17535800695680.0, + "grad_norm": 2.061540845511299, + "language_loss": 0.80687004, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8841027, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14593506, + "step": 6442, + "time_per_iteration": 2.5348403453826904 + }, + { + "auxiliary_loss_clip": 0.0645024, + "auxiliary_loss_mlp": 0.01274941, + "balance_loss_clip": 0.0628161, + "balance_loss_mlp": 0.01260302, + "epoch": 0.38737411693972645, + "flos": 17792539205760.0, + "grad_norm": 1.5850563005203315, + "language_loss": 0.84242606, + "learning_rate": 2.803296990719624e-06, + "loss": 0.91967785, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14642334, + "step": 6443, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.06346577, + "auxiliary_loss_mlp": 0.01257136, + "balance_loss_clip": 0.06270638, + "balance_loss_mlp": 0.01253804, + "epoch": 0.3874342401923944, + "flos": 58320554624640.0, + "grad_norm": 0.7460963165264183, + "language_loss": 0.5025984, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.57863545, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.03338623, + "step": 6444, + "time_per_iteration": 3.146993398666382 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01267857, + "balance_loss_clip": 0.0627708, + "balance_loss_mlp": 0.01254088, + "epoch": 0.3874943634450624, + "flos": 17717628055680.0, + "grad_norm": 1.4103476418524727, + "language_loss": 0.79081571, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86789179, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 6445, + "time_per_iteration": 2.4769954681396484 + }, + { + "auxiliary_loss_clip": 0.06442489, + "auxiliary_loss_mlp": 0.01275349, + "balance_loss_clip": 0.06277544, + "balance_loss_mlp": 0.01261497, + "epoch": 0.38755448669773035, + "flos": 19250889592320.0, + "grad_norm": 1.890349589911811, + "language_loss": 0.81530821, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.89248657, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13861084, + "step": 6446, + "time_per_iteration": 2.5224525928497314 + }, + { + "auxiliary_loss_clip": 0.06442682, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01262489, + "epoch": 0.3876146099503983, + "flos": 20600437052160.0, + "grad_norm": 2.019397578580159, + "language_loss": 0.77555805, + "learning_rate": 2.801870080630306e-06, + "loss": 0.85275555, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14575195, + "step": 6447, + "time_per_iteration": 2.4808783531188965 + }, + { + "auxiliary_loss_clip": 0.06441282, + "auxiliary_loss_mlp": 0.01273458, + "balance_loss_clip": 0.06277911, + "balance_loss_mlp": 0.01259355, + "epoch": 0.3876747332030663, + "flos": 19287129283200.0, + "grad_norm": 1.5926200346390118, + "language_loss": 0.76299512, + "learning_rate": 2.801513277056671e-06, + "loss": 0.84014249, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.14099121, + "step": 6448, + "time_per_iteration": 2.532101631164551 + }, + { + "auxiliary_loss_clip": 0.06445228, + "auxiliary_loss_mlp": 0.01276025, + "balance_loss_clip": 0.06280892, + "balance_loss_mlp": 0.01262363, + "epoch": 0.38773485645573424, + "flos": 18950699940480.0, + "grad_norm": 1.5288018173805344, + "language_loss": 0.76734072, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.84455323, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13647461, + "step": 6449, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.06448871, + "auxiliary_loss_mlp": 0.01273884, + "balance_loss_clip": 0.0627744, + "balance_loss_mlp": 0.01258673, + "epoch": 0.3877949797084022, + "flos": 23077272216960.0, + "grad_norm": 1.7542495709483765, + "language_loss": 0.78832948, + "learning_rate": 2.800799578742542e-06, + "loss": 0.86555696, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15209961, + "step": 6450, + "time_per_iteration": 2.5662050247192383 + }, + { + "auxiliary_loss_clip": 0.06452119, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 0.06276712, + "balance_loss_mlp": 0.01261317, + "epoch": 0.3878551029610702, + "flos": 29103150591360.0, + "grad_norm": 2.1638461576043095, + "language_loss": 0.78188771, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8591727, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.15063477, + "step": 6451, + "time_per_iteration": 2.5734686851501465 + }, + { + "auxiliary_loss_clip": 0.06442447, + "auxiliary_loss_mlp": 0.01277813, + "balance_loss_clip": 0.06278168, + "balance_loss_mlp": 0.01263967, + "epoch": 0.38791522621373814, + "flos": 21002763231360.0, + "grad_norm": 1.7745661107883532, + "language_loss": 0.76657486, + "learning_rate": 2.800085758962812e-06, + "loss": 0.84377748, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13842773, + "step": 6452, + "time_per_iteration": 4.083965301513672 + }, + { + "auxiliary_loss_clip": 0.06445795, + "auxiliary_loss_mlp": 0.01272941, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01258457, + "epoch": 0.3879753494664061, + "flos": 15492248593920.0, + "grad_norm": 1.5775897118958155, + "language_loss": 0.80075014, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87793756, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14483643, + "step": 6453, + "time_per_iteration": 2.5186924934387207 + }, + { + "auxiliary_loss_clip": 0.06452494, + "auxiliary_loss_mlp": 0.01273182, + "balance_loss_clip": 0.06277925, + "balance_loss_mlp": 0.01258472, + "epoch": 0.3880354727190741, + "flos": 22060422593280.0, + "grad_norm": 1.7271767654368522, + "language_loss": 0.71748114, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.79473794, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14697266, + "step": 6454, + "time_per_iteration": 2.516023635864258 + }, + { + "auxiliary_loss_clip": 0.0645522, + "auxiliary_loss_mlp": 0.01280556, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01263986, + "epoch": 0.3880955959717421, + "flos": 20346675361920.0, + "grad_norm": 2.0562500360548452, + "language_loss": 0.77941358, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.85677135, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.16577148, + "step": 6455, + "time_per_iteration": 3.9251530170440674 + }, + { + "auxiliary_loss_clip": 0.0644723, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 0.062791, + "balance_loss_mlp": 0.01257804, + "epoch": 0.38815571922441006, + "flos": 23082009972480.0, + "grad_norm": 1.5355571660803105, + "language_loss": 0.76081556, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14196777, + "step": 6456, + "time_per_iteration": 2.5377979278564453 + }, + { + "auxiliary_loss_clip": 0.064498, + "auxiliary_loss_mlp": 0.01279611, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01264811, + "epoch": 0.388215842477078, + "flos": 20783186807040.0, + "grad_norm": 2.2521174172947838, + "language_loss": 0.60975528, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.68704933, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14801025, + "step": 6457, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.06447765, + "auxiliary_loss_mlp": 0.01274853, + "balance_loss_clip": 0.06275971, + "balance_loss_mlp": 0.01259308, + "epoch": 0.388275965729746, + "flos": 20454304331520.0, + "grad_norm": 3.4499577756661384, + "language_loss": 0.80527538, + "learning_rate": 2.797943571912841e-06, + "loss": 0.88250154, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.15551758, + "step": 6458, + "time_per_iteration": 2.5349881649017334 + }, + { + "auxiliary_loss_clip": 0.06448271, + "auxiliary_loss_mlp": 0.01274317, + "balance_loss_clip": 0.06278434, + "balance_loss_mlp": 0.0125938, + "epoch": 0.38833608898241395, + "flos": 27899945487360.0, + "grad_norm": 3.532155031934189, + "language_loss": 0.8156774, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89290321, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14941406, + "step": 6459, + "time_per_iteration": 4.015187978744507 + }, + { + "auxiliary_loss_clip": 0.0644253, + "auxiliary_loss_mlp": 0.01277266, + "balance_loss_clip": 0.06278129, + "balance_loss_mlp": 0.01263789, + "epoch": 0.3883962122350819, + "flos": 18082079389440.0, + "grad_norm": 1.6405749509561738, + "language_loss": 0.62564123, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.7028392, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13470459, + "step": 6460, + "time_per_iteration": 2.497053861618042 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01273315, + "balance_loss_clip": 0.06277992, + "balance_loss_mlp": 0.01259374, + "epoch": 0.3884563354877499, + "flos": 23628875644800.0, + "grad_norm": 1.560750838950793, + "language_loss": 0.86785483, + "learning_rate": 2.796872069720717e-06, + "loss": 0.94503951, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1394043, + "step": 6461, + "time_per_iteration": 2.5308427810668945 + }, + { + "auxiliary_loss_clip": 0.06442384, + "auxiliary_loss_mlp": 0.01273139, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01258369, + "epoch": 0.38851645874041785, + "flos": 27460834565760.0, + "grad_norm": 2.5738865735247285, + "language_loss": 0.71770304, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.79485828, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14782715, + "step": 6462, + "time_per_iteration": 3.942819833755493 + }, + { + "auxiliary_loss_clip": 0.06442184, + "auxiliary_loss_mlp": 0.01271045, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256036, + "epoch": 0.3885765819930858, + "flos": 25235035833600.0, + "grad_norm": 2.2250707690072886, + "language_loss": 0.76693827, + "learning_rate": 2.796157583816052e-06, + "loss": 0.84407055, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15014648, + "step": 6463, + "time_per_iteration": 2.577254056930542 + }, + { + "auxiliary_loss_clip": 0.06458563, + "auxiliary_loss_mlp": 0.01275367, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01259441, + "epoch": 0.3886367052457538, + "flos": 16952317989120.0, + "grad_norm": 2.5235079856597196, + "language_loss": 0.70838499, + "learning_rate": 2.795800295571382e-06, + "loss": 0.78572428, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.15930176, + "step": 6464, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.06442419, + "auxiliary_loss_mlp": 0.01270994, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01255699, + "epoch": 0.38869682849842174, + "flos": 27160141789440.0, + "grad_norm": 1.8571499226781363, + "language_loss": 0.69473737, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77187151, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.15301514, + "step": 6465, + "time_per_iteration": 2.6060595512390137 + }, + { + "auxiliary_loss_clip": 0.06446355, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06276145, + "balance_loss_mlp": 0.01257271, + "epoch": 0.3887569517510897, + "flos": 21069037411200.0, + "grad_norm": 2.3078416168388243, + "language_loss": 0.78628361, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.86347771, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.15771484, + "step": 6466, + "time_per_iteration": 2.503218650817871 + }, + { + "auxiliary_loss_clip": 0.06447446, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01255, + "epoch": 0.38881707500375773, + "flos": 29505141354240.0, + "grad_norm": 1.7748655394270907, + "language_loss": 0.695912, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77307892, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1427002, + "step": 6467, + "time_per_iteration": 2.6156952381134033 + }, + { + "auxiliary_loss_clip": 0.0644877, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277345, + "balance_loss_mlp": 0.01255403, + "epoch": 0.3888771982564257, + "flos": 17493146167680.0, + "grad_norm": 2.2278384059050285, + "language_loss": 0.83988351, + "learning_rate": 2.794370840959936e-06, + "loss": 0.91706932, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14404297, + "step": 6468, + "time_per_iteration": 2.446979522705078 + }, + { + "auxiliary_loss_clip": 0.0644114, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06273733, + "balance_loss_mlp": 0.01254628, + "epoch": 0.38893732150909366, + "flos": 21948517065600.0, + "grad_norm": 2.4269891965149837, + "language_loss": 0.84667963, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92377871, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14141846, + "step": 6469, + "time_per_iteration": 2.6123251914978027 + }, + { + "auxiliary_loss_clip": 0.06445388, + "auxiliary_loss_mlp": 0.01267071, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01252575, + "epoch": 0.3889974447617616, + "flos": 24282657527040.0, + "grad_norm": 1.7885497899924685, + "language_loss": 0.75114912, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82827377, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 6470, + "time_per_iteration": 2.5293121337890625 + }, + { + "auxiliary_loss_clip": 0.06447375, + "auxiliary_loss_mlp": 0.01272376, + "balance_loss_clip": 0.06277959, + "balance_loss_mlp": 0.01257785, + "epoch": 0.3890575680144296, + "flos": 25674356390400.0, + "grad_norm": 2.975621998510204, + "language_loss": 0.75126278, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.8284604, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14575195, + "step": 6471, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.0644885, + "auxiliary_loss_mlp": 0.01268799, + "balance_loss_clip": 0.06277963, + "balance_loss_mlp": 0.01254291, + "epoch": 0.38911769126709755, + "flos": 22861636934400.0, + "grad_norm": 1.6871762941495017, + "language_loss": 0.68158531, + "learning_rate": 2.792940904386562e-06, + "loss": 0.75876176, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1451416, + "step": 6472, + "time_per_iteration": 2.5192203521728516 + }, + { + "auxiliary_loss_clip": 0.06449802, + "auxiliary_loss_mlp": 0.01271384, + "balance_loss_clip": 0.06278318, + "balance_loss_mlp": 0.01256739, + "epoch": 0.3891778145197655, + "flos": 25454612257920.0, + "grad_norm": 1.6537492711017865, + "language_loss": 0.76761287, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84482473, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14654541, + "step": 6473, + "time_per_iteration": 2.588179349899292 + }, + { + "auxiliary_loss_clip": 0.06451473, + "auxiliary_loss_mlp": 0.01269072, + "balance_loss_clip": 0.0627984, + "balance_loss_mlp": 0.01254803, + "epoch": 0.3892379377724335, + "flos": 14033227374720.0, + "grad_norm": 1.8453216957475485, + "language_loss": 0.71886337, + "learning_rate": 2.792225755635257e-06, + "loss": 0.79606879, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1427002, + "step": 6474, + "time_per_iteration": 2.5054657459259033 + }, + { + "auxiliary_loss_clip": 0.06452703, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.06280853, + "balance_loss_mlp": 0.01252945, + "epoch": 0.38929806102510145, + "flos": 20163715971840.0, + "grad_norm": 1.4152146042292184, + "language_loss": 0.68943882, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76663172, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1362915, + "step": 6475, + "time_per_iteration": 2.5646328926086426 + }, + { + "auxiliary_loss_clip": 0.06459899, + "auxiliary_loss_mlp": 0.01272247, + "balance_loss_clip": 0.06281739, + "balance_loss_mlp": 0.01257107, + "epoch": 0.3893581842777694, + "flos": 22170525258240.0, + "grad_norm": 1.7897820076570896, + "language_loss": 0.75474584, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15142822, + "step": 6476, + "time_per_iteration": 2.515145778656006 + }, + { + "auxiliary_loss_clip": 0.06356712, + "auxiliary_loss_mlp": 0.01262119, + "balance_loss_clip": 0.06275933, + "balance_loss_mlp": 0.01259353, + "epoch": 0.3894183075304374, + "flos": 67322936459520.0, + "grad_norm": 0.7612569916112396, + "language_loss": 0.58157814, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.65776634, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.0276947, + "step": 6477, + "time_per_iteration": 3.147226572036743 + }, + { + "auxiliary_loss_clip": 0.06461065, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 0.06287047, + "balance_loss_mlp": 0.01258711, + "epoch": 0.38947843078310534, + "flos": 18552734173440.0, + "grad_norm": 2.207057593016708, + "language_loss": 0.77832031, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.85566759, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14953613, + "step": 6478, + "time_per_iteration": 2.5238850116729736 + }, + { + "auxiliary_loss_clip": 0.06450923, + "auxiliary_loss_mlp": 0.01273895, + "balance_loss_clip": 0.06281843, + "balance_loss_mlp": 0.01260162, + "epoch": 0.3895385540357733, + "flos": 14610253317120.0, + "grad_norm": 2.187508322407885, + "language_loss": 0.83306336, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13739014, + "step": 6479, + "time_per_iteration": 2.5355920791625977 + }, + { + "auxiliary_loss_clip": 0.06451993, + "auxiliary_loss_mlp": 0.0126931, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.01254414, + "epoch": 0.38959867728844133, + "flos": 19981469341440.0, + "grad_norm": 1.7759645272954405, + "language_loss": 0.80297941, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8801924, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14892578, + "step": 6480, + "time_per_iteration": 2.51645565032959 + }, + { + "auxiliary_loss_clip": 0.06447603, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06278986, + "balance_loss_mlp": 0.01256924, + "epoch": 0.3896588005411093, + "flos": 22678342128000.0, + "grad_norm": 1.6438066173178132, + "language_loss": 0.83259583, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.90978175, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.140625, + "step": 6481, + "time_per_iteration": 2.542642116546631 + }, + { + "auxiliary_loss_clip": 0.06446713, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.0628217, + "balance_loss_mlp": 0.01257204, + "epoch": 0.38971892379377726, + "flos": 21002343960960.0, + "grad_norm": 1.5951406272778517, + "language_loss": 0.75640547, + "learning_rate": 2.789363960063863e-06, + "loss": 0.83357906, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13458252, + "step": 6482, + "time_per_iteration": 2.5500056743621826 + }, + { + "auxiliary_loss_clip": 0.06452929, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06281099, + "balance_loss_mlp": 0.01254853, + "epoch": 0.3897790470464452, + "flos": 22535060446080.0, + "grad_norm": 1.9197222218969183, + "language_loss": 0.78993875, + "learning_rate": 2.78900610077756e-06, + "loss": 0.86715591, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6483, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.06452915, + "auxiliary_loss_mlp": 0.01271475, + "balance_loss_clip": 0.06281908, + "balance_loss_mlp": 0.01256157, + "epoch": 0.3898391702991132, + "flos": 26216484307200.0, + "grad_norm": 1.4915682478636534, + "language_loss": 0.80430162, + "learning_rate": 2.788648211572067e-06, + "loss": 0.88154554, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.15307617, + "step": 6484, + "time_per_iteration": 2.582933187484741 + }, + { + "auxiliary_loss_clip": 0.06455952, + "auxiliary_loss_mlp": 0.01270999, + "balance_loss_clip": 0.06285131, + "balance_loss_mlp": 0.01255347, + "epoch": 0.38989929355178116, + "flos": 21071301471360.0, + "grad_norm": 1.959559170578303, + "language_loss": 0.7792083, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8564778, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.15637207, + "step": 6485, + "time_per_iteration": 2.532944917678833 + }, + { + "auxiliary_loss_clip": 0.06453831, + "auxiliary_loss_mlp": 0.01268339, + "balance_loss_clip": 0.06280229, + "balance_loss_mlp": 0.01253444, + "epoch": 0.3899594168044491, + "flos": 25491229292160.0, + "grad_norm": 2.289645436499478, + "language_loss": 0.84979439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.92701602, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14898682, + "step": 6486, + "time_per_iteration": 2.5743820667266846 + }, + { + "auxiliary_loss_clip": 0.06453397, + "auxiliary_loss_mlp": 0.01267827, + "balance_loss_clip": 0.06278502, + "balance_loss_mlp": 0.01253141, + "epoch": 0.3900195400571171, + "flos": 31147415452800.0, + "grad_norm": 1.9273192838933928, + "language_loss": 0.85622168, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.93343389, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14672852, + "step": 6487, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.06449067, + "auxiliary_loss_mlp": 0.01273707, + "balance_loss_clip": 0.06279142, + "balance_loss_mlp": 0.01259121, + "epoch": 0.39007966330978505, + "flos": 20236111499520.0, + "grad_norm": 1.468779525903349, + "language_loss": 0.73436427, + "learning_rate": 2.787216355829633e-06, + "loss": 0.81159198, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14569092, + "step": 6488, + "time_per_iteration": 2.54925274848938 + }, + { + "auxiliary_loss_clip": 0.06455337, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06281433, + "balance_loss_mlp": 0.01255072, + "epoch": 0.390139786562453, + "flos": 22535353935360.0, + "grad_norm": 1.7339556546984902, + "language_loss": 0.68455738, + "learning_rate": 2.786858317231779e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 2.529337167739868 + }, + { + "auxiliary_loss_clip": 0.06445001, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01256079, + "epoch": 0.390199909815121, + "flos": 26440211508480.0, + "grad_norm": 1.5752653046558913, + "language_loss": 0.81221771, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88936543, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13690186, + "step": 6490, + "time_per_iteration": 2.580287218093872 + }, + { + "auxiliary_loss_clip": 0.06445351, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255784, + "epoch": 0.39026003306778895, + "flos": 17280278069760.0, + "grad_norm": 1.8612382479767444, + "language_loss": 0.89715946, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.97431856, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14782715, + "step": 6491, + "time_per_iteration": 2.476393461227417 + }, + { + "auxiliary_loss_clip": 0.06446734, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01256325, + "epoch": 0.3903201563204569, + "flos": 24539354110080.0, + "grad_norm": 1.7715634168525083, + "language_loss": 0.78570807, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.86288601, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 3.918022871017456 + }, + { + "auxiliary_loss_clip": 0.06448489, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06278895, + "balance_loss_mlp": 0.01255528, + "epoch": 0.39038027957312493, + "flos": 23774547168000.0, + "grad_norm": 1.9649032306705667, + "language_loss": 0.74995399, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.82713962, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14544678, + "step": 6493, + "time_per_iteration": 2.5337636470794678 + }, + { + "auxiliary_loss_clip": 0.06457585, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06280027, + "balance_loss_mlp": 0.0125341, + "epoch": 0.3904404028257929, + "flos": 14105832537600.0, + "grad_norm": 2.4323863844033498, + "language_loss": 0.76480663, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.84207416, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.15771484, + "step": 6494, + "time_per_iteration": 3.9828202724456787 + }, + { + "auxiliary_loss_clip": 0.06461826, + "auxiliary_loss_mlp": 0.01272307, + "balance_loss_clip": 0.06279928, + "balance_loss_mlp": 0.01255582, + "epoch": 0.39050052607846086, + "flos": 16915742881920.0, + "grad_norm": 1.9306711407360488, + "language_loss": 0.74818373, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.82552505, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.16723633, + "step": 6495, + "time_per_iteration": 2.5104000568389893 + }, + { + "auxiliary_loss_clip": 0.06450078, + "auxiliary_loss_mlp": 0.01273142, + "balance_loss_clip": 0.06281738, + "balance_loss_mlp": 0.01257358, + "epoch": 0.39056064933112883, + "flos": 25921912878720.0, + "grad_norm": 2.748187950361319, + "language_loss": 0.68202364, + "learning_rate": 2.784351212350352e-06, + "loss": 0.75925589, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15783691, + "step": 6496, + "time_per_iteration": 2.550957202911377 + }, + { + "auxiliary_loss_clip": 0.0637021, + "auxiliary_loss_mlp": 0.01254222, + "balance_loss_clip": 0.06292024, + "balance_loss_mlp": 0.01251394, + "epoch": 0.3906207725837968, + "flos": 60046125281280.0, + "grad_norm": 0.6447698339715318, + "language_loss": 0.53706288, + "learning_rate": 2.783992935430775e-06, + "loss": 0.61330724, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02824402, + "step": 6497, + "time_per_iteration": 3.2988505363464355 + }, + { + "auxiliary_loss_clip": 0.06453034, + "auxiliary_loss_mlp": 0.01276113, + "balance_loss_clip": 0.06281406, + "balance_loss_mlp": 0.01261265, + "epoch": 0.39068089583646476, + "flos": 21074949123840.0, + "grad_norm": 2.0090604178847795, + "language_loss": 0.68947327, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.76676476, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.14837646, + "step": 6498, + "time_per_iteration": 3.9722609519958496 + }, + { + "auxiliary_loss_clip": 0.06365327, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01252178, + "epoch": 0.3907410190891327, + "flos": 70468269897600.0, + "grad_norm": 0.719858085665683, + "language_loss": 0.51721394, + "learning_rate": 2.783276292417936e-06, + "loss": 0.59341711, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02807617, + "step": 6499, + "time_per_iteration": 3.209885835647583 + }, + { + "auxiliary_loss_clip": 0.06452541, + "auxiliary_loss_mlp": 0.01273785, + "balance_loss_clip": 0.06277416, + "balance_loss_mlp": 0.0125681, + "epoch": 0.3908011423418007, + "flos": 27969531903360.0, + "grad_norm": 1.5964691032272669, + "language_loss": 0.7347858, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81204903, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.16992188, + "step": 6500, + "time_per_iteration": 2.5915534496307373 + }, + { + "auxiliary_loss_clip": 0.06456988, + "auxiliary_loss_mlp": 0.01269402, + "balance_loss_clip": 0.06284038, + "balance_loss_mlp": 0.01254728, + "epoch": 0.39086126559446865, + "flos": 24468971080320.0, + "grad_norm": 2.170342944486325, + "language_loss": 0.68858671, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7658506, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14691162, + "step": 6501, + "time_per_iteration": 3.948155164718628 + }, + { + "auxiliary_loss_clip": 0.06445958, + "auxiliary_loss_mlp": 0.01271431, + "balance_loss_clip": 0.06277448, + "balance_loss_mlp": 0.01256327, + "epoch": 0.3909213888471366, + "flos": 16946406276480.0, + "grad_norm": 1.631531331045391, + "language_loss": 0.78994954, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86712337, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15100098, + "step": 6502, + "time_per_iteration": 2.505021810531616 + }, + { + "auxiliary_loss_clip": 0.06451446, + "auxiliary_loss_mlp": 0.01272758, + "balance_loss_clip": 0.06288067, + "balance_loss_mlp": 0.01259133, + "epoch": 0.3909815120998046, + "flos": 29286109981440.0, + "grad_norm": 4.8026818588998115, + "language_loss": 0.80286908, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.88011116, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13623047, + "step": 6503, + "time_per_iteration": 2.6041667461395264 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06278107, + "balance_loss_mlp": 0.01253574, + "epoch": 0.39104163535247255, + "flos": 18956947069440.0, + "grad_norm": 1.8714653526076386, + "language_loss": 0.71717298, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.79429626, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14379883, + "step": 6504, + "time_per_iteration": 2.499645471572876 + }, + { + "auxiliary_loss_clip": 0.06449269, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06279607, + "balance_loss_mlp": 0.0125379, + "epoch": 0.3911017586051405, + "flos": 26330611968000.0, + "grad_norm": 1.7094242767760466, + "language_loss": 0.83403468, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.91120219, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.137146, + "step": 6505, + "time_per_iteration": 2.5698060989379883 + }, + { + "auxiliary_loss_clip": 0.06447234, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06281015, + "balance_loss_mlp": 0.01253022, + "epoch": 0.3911618818578085, + "flos": 21842313615360.0, + "grad_norm": 2.3254017668705083, + "language_loss": 0.71427596, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.7914232, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14465332, + "step": 6506, + "time_per_iteration": 2.4988996982574463 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 0.0628104, + "balance_loss_mlp": 0.01258149, + "epoch": 0.3912220051104765, + "flos": 16364768359680.0, + "grad_norm": 2.639532414168514, + "language_loss": 0.75588799, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.83303547, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13348389, + "step": 6507, + "time_per_iteration": 2.506723403930664 + }, + { + "auxiliary_loss_clip": 0.06355534, + "auxiliary_loss_mlp": 0.01255368, + "balance_loss_clip": 0.0627788, + "balance_loss_mlp": 0.01252429, + "epoch": 0.39128212836314447, + "flos": 71071179552000.0, + "grad_norm": 0.751869236178363, + "language_loss": 0.56649405, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.64260316, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02935791, + "step": 6508, + "time_per_iteration": 3.282604455947876 + }, + { + "auxiliary_loss_clip": 0.06448714, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01254294, + "epoch": 0.39134225161581243, + "flos": 20336948288640.0, + "grad_norm": 1.8618605672003898, + "language_loss": 0.76758552, + "learning_rate": 2.779691297413471e-06, + "loss": 0.84475839, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14276123, + "step": 6509, + "time_per_iteration": 2.5330445766448975 + }, + { + "auxiliary_loss_clip": 0.0644654, + "auxiliary_loss_mlp": 0.01272023, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01256073, + "epoch": 0.3914023748684804, + "flos": 17023916903040.0, + "grad_norm": 3.0317271524647427, + "language_loss": 0.83418059, + "learning_rate": 2.779332635075825e-06, + "loss": 0.91136616, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1595459, + "step": 6510, + "time_per_iteration": 2.484149217605591 + }, + { + "auxiliary_loss_clip": 0.06450167, + "auxiliary_loss_mlp": 0.01268149, + "balance_loss_clip": 0.06277542, + "balance_loss_mlp": 0.01254463, + "epoch": 0.39146249812114836, + "flos": 18411045719040.0, + "grad_norm": 1.8343195842354416, + "language_loss": 0.77659726, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.85378045, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13684082, + "step": 6511, + "time_per_iteration": 2.493088722229004 + }, + { + "auxiliary_loss_clip": 0.06343137, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06266295, + "balance_loss_mlp": 0.01258513, + "epoch": 0.3915226213738163, + "flos": 67659659291520.0, + "grad_norm": 0.7080449531762238, + "language_loss": 0.57720256, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.65324628, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02726746, + "step": 6512, + "time_per_iteration": 3.217658042907715 + }, + { + "auxiliary_loss_clip": 0.06445479, + "auxiliary_loss_mlp": 0.01273045, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257452, + "epoch": 0.3915827446264843, + "flos": 26366516242560.0, + "grad_norm": 1.5252758876056967, + "language_loss": 0.69950658, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.77669179, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.15600586, + "step": 6513, + "time_per_iteration": 2.560802936553955 + }, + { + "auxiliary_loss_clip": 0.06451759, + "auxiliary_loss_mlp": 0.01273121, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01258208, + "epoch": 0.39164286787915226, + "flos": 21950236074240.0, + "grad_norm": 2.7587511630204777, + "language_loss": 0.76322639, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.8404752, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14916992, + "step": 6514, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.0644438, + "auxiliary_loss_mlp": 0.01269565, + "balance_loss_clip": 0.06276566, + "balance_loss_mlp": 0.0125619, + "epoch": 0.3917029911318202, + "flos": 16405536170880.0, + "grad_norm": 1.811906351936664, + "language_loss": 0.782359, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.8594985, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13378906, + "step": 6515, + "time_per_iteration": 2.5104947090148926 + }, + { + "auxiliary_loss_clip": 0.06443886, + "auxiliary_loss_mlp": 0.01270163, + "balance_loss_clip": 0.06277545, + "balance_loss_mlp": 0.0125705, + "epoch": 0.3917631143844882, + "flos": 26218580659200.0, + "grad_norm": 1.4298617884300358, + "language_loss": 0.79790455, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.87504506, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13122559, + "step": 6516, + "time_per_iteration": 2.5912764072418213 + }, + { + "auxiliary_loss_clip": 0.06446922, + "auxiliary_loss_mlp": 0.0126951, + "balance_loss_clip": 0.06278265, + "balance_loss_mlp": 0.0125511, + "epoch": 0.39182323763715615, + "flos": 18553740422400.0, + "grad_norm": 1.8457537699229483, + "language_loss": 0.70234001, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7795043, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14404297, + "step": 6517, + "time_per_iteration": 2.630155324935913 + }, + { + "auxiliary_loss_clip": 0.06449963, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06279542, + "balance_loss_mlp": 0.01254905, + "epoch": 0.3918833608898241, + "flos": 34322112547200.0, + "grad_norm": 1.6944592538331644, + "language_loss": 0.72209281, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79928982, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1484375, + "step": 6518, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.06453219, + "auxiliary_loss_mlp": 0.0127268, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.0125751, + "epoch": 0.3919434841424921, + "flos": 36948434595840.0, + "grad_norm": 1.7409198797741048, + "language_loss": 0.62180024, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.69905925, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.15179443, + "step": 6519, + "time_per_iteration": 2.6407580375671387 + }, + { + "auxiliary_loss_clip": 0.06457552, + "auxiliary_loss_mlp": 0.01269986, + "balance_loss_clip": 0.06280086, + "balance_loss_mlp": 0.01253535, + "epoch": 0.3920036073951601, + "flos": 23514915692160.0, + "grad_norm": 2.3243103288051485, + "language_loss": 0.6728406, + "learning_rate": 2.775744388563563e-06, + "loss": 0.75011599, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16442871, + "step": 6520, + "time_per_iteration": 2.557736396789551 + }, + { + "auxiliary_loss_clip": 0.06452015, + "auxiliary_loss_mlp": 0.01272672, + "balance_loss_clip": 0.06281003, + "balance_loss_mlp": 0.0125845, + "epoch": 0.39206373064782807, + "flos": 18412051968000.0, + "grad_norm": 5.792319014223258, + "language_loss": 0.79119205, + "learning_rate": 2.775385401898104e-06, + "loss": 0.86843884, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14233398, + "step": 6521, + "time_per_iteration": 2.487144947052002 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06282392, + "balance_loss_mlp": 0.01255297, + "epoch": 0.39212385390049603, + "flos": 12318012696960.0, + "grad_norm": 2.63137671789129, + "language_loss": 0.70893902, + "learning_rate": 2.775026385829952e-06, + "loss": 0.78623831, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.16473389, + "step": 6522, + "time_per_iteration": 2.501777410507202 + }, + { + "auxiliary_loss_clip": 0.06455532, + "auxiliary_loss_mlp": 0.01272148, + "balance_loss_clip": 0.06282486, + "balance_loss_mlp": 0.01257693, + "epoch": 0.392183977153164, + "flos": 19725275882880.0, + "grad_norm": 2.1277990565539087, + "language_loss": 0.77424598, + "learning_rate": 2.774667340372722e-06, + "loss": 0.8515228, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14453125, + "step": 6523, + "time_per_iteration": 2.494900941848755 + }, + { + "auxiliary_loss_clip": 0.0645543, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06282179, + "balance_loss_mlp": 0.01258769, + "epoch": 0.39224410040583196, + "flos": 33153092709120.0, + "grad_norm": 2.7826558407508855, + "language_loss": 0.62314886, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70043033, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13964844, + "step": 6524, + "time_per_iteration": 2.6380085945129395 + }, + { + "auxiliary_loss_clip": 0.06452876, + "auxiliary_loss_mlp": 0.01268165, + "balance_loss_clip": 0.06281661, + "balance_loss_mlp": 0.01252895, + "epoch": 0.39230422365849993, + "flos": 27789884749440.0, + "grad_norm": 1.7105729654368218, + "language_loss": 0.74638754, + "learning_rate": 2.773949161345489e-06, + "loss": 0.82359803, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.15264893, + "step": 6525, + "time_per_iteration": 2.5430080890655518 + }, + { + "auxiliary_loss_clip": 0.06454577, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06280737, + "balance_loss_mlp": 0.01253863, + "epoch": 0.3923643469111679, + "flos": 17937497969280.0, + "grad_norm": 2.1060109606385673, + "language_loss": 0.8182255, + "learning_rate": 2.773590027802719e-06, + "loss": 0.89545369, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14367676, + "step": 6526, + "time_per_iteration": 2.4994354248046875 + }, + { + "auxiliary_loss_clip": 0.06454204, + "auxiliary_loss_mlp": 0.01269978, + "balance_loss_clip": 0.06281518, + "balance_loss_mlp": 0.01255482, + "epoch": 0.39242447016383586, + "flos": 24066141776640.0, + "grad_norm": 1.5927090967738864, + "language_loss": 0.70157206, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77881384, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14501953, + "step": 6527, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.06452368, + "auxiliary_loss_mlp": 0.01268854, + "balance_loss_clip": 0.06281934, + "balance_loss_mlp": 0.01254245, + "epoch": 0.3924845934165038, + "flos": 10667562825600.0, + "grad_norm": 3.256824520755738, + "language_loss": 0.82039493, + "learning_rate": 2.772871672726965e-06, + "loss": 0.89760715, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14605713, + "step": 6528, + "time_per_iteration": 2.498852014541626 + }, + { + "auxiliary_loss_clip": 0.06450985, + "auxiliary_loss_mlp": 0.0127277, + "balance_loss_clip": 0.06284485, + "balance_loss_mlp": 0.01258048, + "epoch": 0.3925447166691718, + "flos": 31253493121920.0, + "grad_norm": 1.712128770360143, + "language_loss": 0.68666142, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14733887, + "step": 6529, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.06454393, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01252213, + "epoch": 0.39260483992183975, + "flos": 29421215890560.0, + "grad_norm": 2.512935177473184, + "language_loss": 0.80622673, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8834424, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14959717, + "step": 6530, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.06449011, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.0125252, + "epoch": 0.3926649631745077, + "flos": 22864571827200.0, + "grad_norm": 1.8446830755174628, + "language_loss": 0.76176864, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.83893287, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14892578, + "step": 6531, + "time_per_iteration": 3.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.06348795, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01253434, + "epoch": 0.3927250864271757, + "flos": 63911892124800.0, + "grad_norm": 0.7987882767963658, + "language_loss": 0.6030035, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.67905223, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.02648926, + "step": 6532, + "time_per_iteration": 3.023615598678589 + }, + { + "auxiliary_loss_clip": 0.06346735, + "auxiliary_loss_mlp": 0.01258162, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255445, + "epoch": 0.3927852096798437, + "flos": 68931486489600.0, + "grad_norm": 0.7618686105615924, + "language_loss": 0.55496854, + "learning_rate": 2.771075272396981e-06, + "loss": 0.63101745, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02720642, + "step": 6533, + "time_per_iteration": 3.2504148483276367 + }, + { + "auxiliary_loss_clip": 0.06452841, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06277935, + "balance_loss_mlp": 0.01254557, + "epoch": 0.39284533293251167, + "flos": 29723711529600.0, + "grad_norm": 1.823371664681604, + "language_loss": 0.76552856, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.84275657, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.15405273, + "step": 6534, + "time_per_iteration": 4.098775148391724 + }, + { + "auxiliary_loss_clip": 0.06459314, + "auxiliary_loss_mlp": 0.01269352, + "balance_loss_clip": 0.06282811, + "balance_loss_mlp": 0.01253974, + "epoch": 0.39290545618517964, + "flos": 18558016980480.0, + "grad_norm": 2.2164588420846267, + "language_loss": 0.78656316, + "learning_rate": 2.770356507494851e-06, + "loss": 0.86384982, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.15380859, + "step": 6535, + "time_per_iteration": 2.4923341274261475 + }, + { + "auxiliary_loss_clip": 0.06449763, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01253592, + "epoch": 0.3929655794378476, + "flos": 26256581285760.0, + "grad_norm": 2.2738959430224326, + "language_loss": 0.69076276, + "learning_rate": 2.769997081218978e-06, + "loss": 0.76792771, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1315918, + "step": 6536, + "time_per_iteration": 2.5980727672576904 + }, + { + "auxiliary_loss_clip": 0.06448898, + "auxiliary_loss_mlp": 0.0127095, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01257265, + "epoch": 0.39302570269051557, + "flos": 29285564929920.0, + "grad_norm": 1.8741537429596062, + "language_loss": 0.69716197, + "learning_rate": 2.769637625744738e-06, + "loss": 0.77436042, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13684082, + "step": 6537, + "time_per_iteration": 4.096014499664307 + }, + { + "auxiliary_loss_clip": 0.064602, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06288625, + "balance_loss_mlp": 0.01255432, + "epoch": 0.39308582594318353, + "flos": 17353134794880.0, + "grad_norm": 1.7942703591990323, + "language_loss": 0.79606509, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8733629, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14129639, + "step": 6538, + "time_per_iteration": 2.578815221786499 + }, + { + "auxiliary_loss_clip": 0.06359898, + "auxiliary_loss_mlp": 0.01255927, + "balance_loss_clip": 0.06283404, + "balance_loss_mlp": 0.0125297, + "epoch": 0.3931459491958515, + "flos": 61023884175360.0, + "grad_norm": 0.7947880980854773, + "language_loss": 0.61826062, + "learning_rate": 2.768918627255683e-06, + "loss": 0.69441885, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02955627, + "step": 6539, + "time_per_iteration": 2.9553403854370117 + }, + { + "auxiliary_loss_clip": 0.06458268, + "auxiliary_loss_mlp": 0.01272646, + "balance_loss_clip": 0.06289513, + "balance_loss_mlp": 0.01257339, + "epoch": 0.39320607244851946, + "flos": 39024662590080.0, + "grad_norm": 2.4294685123961295, + "language_loss": 0.68263721, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.75994635, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.15307617, + "step": 6540, + "time_per_iteration": 2.732541799545288 + }, + { + "auxiliary_loss_clip": 0.06455955, + "auxiliary_loss_mlp": 0.01271651, + "balance_loss_clip": 0.06287128, + "balance_loss_mlp": 0.0125613, + "epoch": 0.3932661957011874, + "flos": 24686451152640.0, + "grad_norm": 1.7600019176005988, + "language_loss": 0.72681171, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.80408776, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15527344, + "step": 6541, + "time_per_iteration": 4.03834342956543 + }, + { + "auxiliary_loss_clip": 0.06358681, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06282184, + "balance_loss_mlp": 0.01259297, + "epoch": 0.3933263189538554, + "flos": 70115614790400.0, + "grad_norm": 0.7938144397826515, + "language_loss": 0.60408866, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6802969, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02844238, + "step": 6542, + "time_per_iteration": 3.0015151500701904 + }, + { + "auxiliary_loss_clip": 0.06453243, + "auxiliary_loss_mlp": 0.01279318, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.01265305, + "epoch": 0.39338644220652336, + "flos": 22935583762560.0, + "grad_norm": 1.4413337304531033, + "language_loss": 0.82278919, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90011483, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14013672, + "step": 6543, + "time_per_iteration": 2.6289048194885254 + }, + { + "auxiliary_loss_clip": 0.06454003, + "auxiliary_loss_mlp": 0.01270252, + "balance_loss_clip": 0.06284549, + "balance_loss_mlp": 0.01255768, + "epoch": 0.3934465654591913, + "flos": 30856282041600.0, + "grad_norm": 1.7408174737933344, + "language_loss": 0.69224536, + "learning_rate": 2.767120621015908e-06, + "loss": 0.76948798, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14489746, + "step": 6544, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.06466363, + "auxiliary_loss_mlp": 0.01274712, + "balance_loss_clip": 0.06291823, + "balance_loss_mlp": 0.01258524, + "epoch": 0.3935066887118593, + "flos": 29243329672320.0, + "grad_norm": 2.0329338261061887, + "language_loss": 0.75462705, + "learning_rate": 2.76676093244553e-06, + "loss": 0.83203781, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1619873, + "step": 6545, + "time_per_iteration": 2.606234312057495 + }, + { + "auxiliary_loss_clip": 0.06446254, + "auxiliary_loss_mlp": 0.01275344, + "balance_loss_clip": 0.06285709, + "balance_loss_mlp": 0.01262309, + "epoch": 0.3935668119645273, + "flos": 19141290051840.0, + "grad_norm": 1.4467327313094591, + "language_loss": 0.75122333, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82843935, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13043213, + "step": 6546, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.06461848, + "auxiliary_loss_mlp": 0.01270617, + "balance_loss_clip": 0.06285486, + "balance_loss_mlp": 0.01254822, + "epoch": 0.3936269352171953, + "flos": 18522196560000.0, + "grad_norm": 2.187625212538507, + "language_loss": 0.82285661, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90018129, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15783691, + "step": 6547, + "time_per_iteration": 2.536921501159668 + }, + { + "auxiliary_loss_clip": 0.06454909, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 0.06285325, + "balance_loss_mlp": 0.01259685, + "epoch": 0.39368705846986324, + "flos": 15638255533440.0, + "grad_norm": 1.8611217813328955, + "language_loss": 0.84309554, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.92037535, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1340332, + "step": 6548, + "time_per_iteration": 2.586596727371216 + }, + { + "auxiliary_loss_clip": 0.06451154, + "auxiliary_loss_mlp": 0.01275141, + "balance_loss_clip": 0.06285168, + "balance_loss_mlp": 0.01261325, + "epoch": 0.3937471817225312, + "flos": 21332442320640.0, + "grad_norm": 1.5541020214417252, + "language_loss": 0.7306931, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.8079561, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13842773, + "step": 6549, + "time_per_iteration": 2.5176355838775635 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01258398, + "epoch": 0.39380730497519917, + "flos": 20782893317760.0, + "grad_norm": 1.443831260247086, + "language_loss": 0.77958995, + "learning_rate": 2.764962053731699e-06, + "loss": 0.85687554, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.16204834, + "step": 6550, + "time_per_iteration": 2.5665266513824463 + }, + { + "auxiliary_loss_clip": 0.06449334, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.0628082, + "balance_loss_mlp": 0.01254455, + "epoch": 0.39386742822786713, + "flos": 21615106469760.0, + "grad_norm": 1.5479702434138036, + "language_loss": 0.81395853, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.89113748, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14129639, + "step": 6551, + "time_per_iteration": 2.509472370147705 + }, + { + "auxiliary_loss_clip": 0.06452134, + "auxiliary_loss_mlp": 0.01274621, + "balance_loss_clip": 0.06282679, + "balance_loss_mlp": 0.01259791, + "epoch": 0.3939275514805351, + "flos": 12418304434560.0, + "grad_norm": 2.3772322810911892, + "language_loss": 0.80163503, + "learning_rate": 2.764242299098596e-06, + "loss": 0.87890255, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14825439, + "step": 6552, + "time_per_iteration": 2.512632369995117 + }, + { + "auxiliary_loss_clip": 0.06458388, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06285821, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39398767473320306, + "flos": 18558016980480.0, + "grad_norm": 1.9836463121020687, + "language_loss": 0.71468151, + "learning_rate": 2.763882378305003e-06, + "loss": 0.79198349, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14996338, + "step": 6553, + "time_per_iteration": 2.4973459243774414 + }, + { + "auxiliary_loss_clip": 0.06447914, + "auxiliary_loss_mlp": 0.01269169, + "balance_loss_clip": 0.06280744, + "balance_loss_mlp": 0.0125422, + "epoch": 0.39404779798587103, + "flos": 29315599418880.0, + "grad_norm": 1.8230931816174483, + "language_loss": 0.64176017, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71893102, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14941406, + "step": 6554, + "time_per_iteration": 2.6340816020965576 + }, + { + "auxiliary_loss_clip": 0.06448209, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281387, + "balance_loss_mlp": 0.0125561, + "epoch": 0.394107921238539, + "flos": 34905679107840.0, + "grad_norm": 1.8577413865682035, + "language_loss": 0.79801202, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8751896, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13934326, + "step": 6555, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.06451041, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06280783, + "balance_loss_mlp": 0.01252748, + "epoch": 0.39416804449120696, + "flos": 25088232280320.0, + "grad_norm": 1.8326733466575391, + "language_loss": 0.72028196, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.79746938, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1496582, + "step": 6556, + "time_per_iteration": 2.572880744934082 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01268731, + "balance_loss_clip": 0.06281175, + "balance_loss_mlp": 0.01254348, + "epoch": 0.3942281677438749, + "flos": 32314842063360.0, + "grad_norm": 2.2262653228658666, + "language_loss": 0.83903825, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.91621351, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14373779, + "step": 6557, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01272636, + "balance_loss_clip": 0.06281336, + "balance_loss_mlp": 0.01258671, + "epoch": 0.3942882909965429, + "flos": 24943608933120.0, + "grad_norm": 2.1784611950300605, + "language_loss": 0.80248392, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.87968874, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1395874, + "step": 6558, + "time_per_iteration": 2.5902092456817627 + }, + { + "auxiliary_loss_clip": 0.06445447, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01253816, + "epoch": 0.39434841424921085, + "flos": 11879614535040.0, + "grad_norm": 2.1357186014692546, + "language_loss": 0.71689725, + "learning_rate": 2.761722245724792e-06, + "loss": 0.79402852, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13873291, + "step": 6559, + "time_per_iteration": 2.4894917011260986 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01269254, + "balance_loss_clip": 0.0628094, + "balance_loss_mlp": 0.01254622, + "epoch": 0.3944085375018789, + "flos": 16367032419840.0, + "grad_norm": 2.0841749511208705, + "language_loss": 0.81285572, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.89011705, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14630127, + "step": 6560, + "time_per_iteration": 2.522434711456299 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06282307, + "balance_loss_mlp": 0.01254078, + "epoch": 0.39446866075454684, + "flos": 10637821825920.0, + "grad_norm": 3.641985825462619, + "language_loss": 0.83127379, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.90848899, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15386963, + "step": 6561, + "time_per_iteration": 2.4804983139038086 + }, + { + "auxiliary_loss_clip": 0.06450383, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 0.06283262, + "balance_loss_mlp": 0.01257102, + "epoch": 0.3945287840072148, + "flos": 18193481792640.0, + "grad_norm": 2.043086634933395, + "language_loss": 0.80616236, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.88336933, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13208008, + "step": 6562, + "time_per_iteration": 2.5335006713867188 + }, + { + "auxiliary_loss_clip": 0.06448314, + "auxiliary_loss_mlp": 0.01268686, + "balance_loss_clip": 0.06283693, + "balance_loss_mlp": 0.01254476, + "epoch": 0.39458890725988277, + "flos": 23046650749440.0, + "grad_norm": 1.5717146465742573, + "language_loss": 0.81509531, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.89226532, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14215088, + "step": 6563, + "time_per_iteration": 2.5315918922424316 + }, + { + "auxiliary_loss_clip": 0.06453238, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.0628344, + "balance_loss_mlp": 0.0125608, + "epoch": 0.39464903051255074, + "flos": 17163718640640.0, + "grad_norm": 1.8608988788141587, + "language_loss": 0.70080984, + "learning_rate": 2.759921340790127e-06, + "loss": 0.77804577, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14257812, + "step": 6564, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.06449583, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06281252, + "balance_loss_mlp": 0.01254648, + "epoch": 0.3947091537652187, + "flos": 15894616700160.0, + "grad_norm": 2.288586168499947, + "language_loss": 0.83967394, + "learning_rate": 2.759561073299676e-06, + "loss": 0.91686368, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14746094, + "step": 6565, + "time_per_iteration": 2.5438666343688965 + }, + { + "auxiliary_loss_clip": 0.06447474, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06280743, + "balance_loss_mlp": 0.01255229, + "epoch": 0.39476927701788667, + "flos": 18550386259200.0, + "grad_norm": 2.0020652066074285, + "language_loss": 0.83519006, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.91235834, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14129639, + "step": 6566, + "time_per_iteration": 2.550548791885376 + }, + { + "auxiliary_loss_clip": 0.06459671, + "auxiliary_loss_mlp": 0.01271058, + "balance_loss_clip": 0.06282969, + "balance_loss_mlp": 0.01255072, + "epoch": 0.39482940027055463, + "flos": 22282682348160.0, + "grad_norm": 1.770017298907609, + "language_loss": 0.77499187, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.85229909, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.15979004, + "step": 6567, + "time_per_iteration": 2.535980463027954 + }, + { + "auxiliary_loss_clip": 0.0644526, + "auxiliary_loss_mlp": 0.01270792, + "balance_loss_clip": 0.06283294, + "balance_loss_mlp": 0.01257851, + "epoch": 0.3948895235232226, + "flos": 14763010510080.0, + "grad_norm": 1.9280900707618294, + "language_loss": 0.80259991, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87976044, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.12945557, + "step": 6568, + "time_per_iteration": 2.56528639793396 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01272888, + "balance_loss_clip": 0.06283959, + "balance_loss_mlp": 0.01258356, + "epoch": 0.39494964677589056, + "flos": 22572474094080.0, + "grad_norm": 2.8189067544408166, + "language_loss": 0.84836519, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.9256081, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1451416, + "step": 6569, + "time_per_iteration": 2.512678623199463 + }, + { + "auxiliary_loss_clip": 0.06448043, + "auxiliary_loss_mlp": 0.01269688, + "balance_loss_clip": 0.06284526, + "balance_loss_mlp": 0.01255538, + "epoch": 0.3950097700285585, + "flos": 22969307831040.0, + "grad_norm": 1.7602858722639216, + "language_loss": 0.74665594, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.82383323, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.14147949, + "step": 6570, + "time_per_iteration": 2.611072063446045 + }, + { + "auxiliary_loss_clip": 0.06447589, + "auxiliary_loss_mlp": 0.01270515, + "balance_loss_clip": 0.06279834, + "balance_loss_mlp": 0.01256305, + "epoch": 0.3950698932812265, + "flos": 20601569082240.0, + "grad_norm": 1.9769080404363342, + "language_loss": 0.80472994, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88191104, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14196777, + "step": 6571, + "time_per_iteration": 4.037761688232422 + }, + { + "auxiliary_loss_clip": 0.06446905, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01257022, + "epoch": 0.39513001653389446, + "flos": 20381992657920.0, + "grad_norm": 1.599556952476494, + "language_loss": 0.78081018, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8579852, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13574219, + "step": 6572, + "time_per_iteration": 2.542388439178467 + }, + { + "auxiliary_loss_clip": 0.06450671, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06281148, + "balance_loss_mlp": 0.01253991, + "epoch": 0.3951901397865625, + "flos": 26469994435200.0, + "grad_norm": 1.9679034095416588, + "language_loss": 0.74861181, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.8258028, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14434814, + "step": 6573, + "time_per_iteration": 3.9954564571380615 + }, + { + "auxiliary_loss_clip": 0.06447303, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.0125492, + "epoch": 0.39525026303923044, + "flos": 43848845233920.0, + "grad_norm": 1.4348738267970096, + "language_loss": 0.67874503, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.75589502, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.12799072, + "step": 6574, + "time_per_iteration": 2.75056791305542 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01270185, + "balance_loss_clip": 0.06284595, + "balance_loss_mlp": 0.01255832, + "epoch": 0.3953103862918984, + "flos": 18046636312320.0, + "grad_norm": 3.0759560063082736, + "language_loss": 0.72770178, + "learning_rate": 2.755956816505072e-06, + "loss": 0.80492353, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14355469, + "step": 6575, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.06452627, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.0628259, + "balance_loss_mlp": 0.01256015, + "epoch": 0.3953705095445664, + "flos": 16980549615360.0, + "grad_norm": 2.3956956088423382, + "language_loss": 0.73929548, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.816526, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1439209, + "step": 6576, + "time_per_iteration": 2.4877238273620605 + }, + { + "auxiliary_loss_clip": 0.06453596, + "auxiliary_loss_mlp": 0.01269813, + "balance_loss_clip": 0.06286615, + "balance_loss_mlp": 0.0125704, + "epoch": 0.39543063279723434, + "flos": 17415300124800.0, + "grad_norm": 2.3089155525157397, + "language_loss": 0.8424108, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.91964483, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12786865, + "step": 6577, + "time_per_iteration": 3.9026546478271484 + }, + { + "auxiliary_loss_clip": 0.06447916, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06283568, + "balance_loss_mlp": 0.01255788, + "epoch": 0.3954907560499023, + "flos": 22790876561280.0, + "grad_norm": 2.6090797034217603, + "language_loss": 0.90399998, + "learning_rate": 2.75487497985853e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1338501, + "step": 6578, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.06451896, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06281315, + "balance_loss_mlp": 0.01254284, + "epoch": 0.39555087930257027, + "flos": 21950823052800.0, + "grad_norm": 1.8247592517251146, + "language_loss": 0.78543842, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.86265075, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15063477, + "step": 6579, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.06456701, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.0628474, + "balance_loss_mlp": 0.01258492, + "epoch": 0.39561100255523823, + "flos": 20409553451520.0, + "grad_norm": 2.1653293739232753, + "language_loss": 0.68659246, + "learning_rate": 2.754153612280037e-06, + "loss": 0.76389658, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15216064, + "step": 6580, + "time_per_iteration": 4.038321495056152 + }, + { + "auxiliary_loss_clip": 0.06448758, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01256635, + "epoch": 0.3956711258079062, + "flos": 27972005598720.0, + "grad_norm": 1.867170796056586, + "language_loss": 0.58577931, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.6629765, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.14318848, + "step": 6581, + "time_per_iteration": 2.618917942047119 + }, + { + "auxiliary_loss_clip": 0.0645448, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 0.06288571, + "balance_loss_mlp": 0.01256413, + "epoch": 0.39573124906057416, + "flos": 14433457201920.0, + "grad_norm": 2.002939068333409, + "language_loss": 0.69910431, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.77636254, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14929199, + "step": 6582, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.06451949, + "auxiliary_loss_mlp": 0.01273006, + "balance_loss_clip": 0.06283893, + "balance_loss_mlp": 0.01258546, + "epoch": 0.39579137231324213, + "flos": 18739592778240.0, + "grad_norm": 2.2302551557868457, + "language_loss": 0.76587689, + "learning_rate": 2.753071346464642e-06, + "loss": 0.84312642, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14453125, + "step": 6583, + "time_per_iteration": 2.5276317596435547 + }, + { + "auxiliary_loss_clip": 0.0645259, + "auxiliary_loss_mlp": 0.0127002, + "balance_loss_clip": 0.06284047, + "balance_loss_mlp": 0.01256562, + "epoch": 0.3958514955659101, + "flos": 17682268832640.0, + "grad_norm": 1.926047340176765, + "language_loss": 0.66262352, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.73984963, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13458252, + "step": 6584, + "time_per_iteration": 2.501209259033203 + }, + { + "auxiliary_loss_clip": 0.06456675, + "auxiliary_loss_mlp": 0.01270923, + "balance_loss_clip": 0.06285589, + "balance_loss_mlp": 0.01256803, + "epoch": 0.39591161881857806, + "flos": 29315850981120.0, + "grad_norm": 1.992954295318491, + "language_loss": 0.72398281, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.8012588, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 6585, + "time_per_iteration": 2.617694616317749 + }, + { + "auxiliary_loss_clip": 0.06457305, + "auxiliary_loss_mlp": 0.0127182, + "balance_loss_clip": 0.06288064, + "balance_loss_mlp": 0.01257336, + "epoch": 0.3959717420712461, + "flos": 25778295780480.0, + "grad_norm": 1.6889684303793513, + "language_loss": 0.73472714, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.81201839, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14477539, + "step": 6586, + "time_per_iteration": 2.565883159637451 + }, + { + "auxiliary_loss_clip": 0.06454571, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06286268, + "balance_loss_mlp": 0.01252969, + "epoch": 0.39603186532391405, + "flos": 20930199995520.0, + "grad_norm": 1.6150585752618039, + "language_loss": 0.71662915, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79384637, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14160156, + "step": 6587, + "time_per_iteration": 2.5788414478302 + }, + { + "auxiliary_loss_clip": 0.06362241, + "auxiliary_loss_mlp": 0.01254401, + "balance_loss_clip": 0.06286076, + "balance_loss_mlp": 0.01251419, + "epoch": 0.396091988576582, + "flos": 54897336720000.0, + "grad_norm": 0.8108180128275717, + "language_loss": 0.60705078, + "learning_rate": 2.751266999157285e-06, + "loss": 0.68321717, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.02980042, + "step": 6588, + "time_per_iteration": 2.973475217819214 + }, + { + "auxiliary_loss_clip": 0.06457016, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06285909, + "balance_loss_mlp": 0.01251873, + "epoch": 0.39615211182925, + "flos": 20708946489600.0, + "grad_norm": 1.752385405351709, + "language_loss": 0.81335068, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.89058518, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14575195, + "step": 6589, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.06456019, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06286196, + "balance_loss_mlp": 0.01254431, + "epoch": 0.39621223508191794, + "flos": 21000331463040.0, + "grad_norm": 1.8508577793480634, + "language_loss": 0.71167219, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7889303, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15368652, + "step": 6590, + "time_per_iteration": 2.5155017375946045 + }, + { + "auxiliary_loss_clip": 0.06451933, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06285245, + "balance_loss_mlp": 0.01253284, + "epoch": 0.3962723583345859, + "flos": 23375742860160.0, + "grad_norm": 1.6853348593397999, + "language_loss": 0.75984478, + "learning_rate": 2.750184048805956e-06, + "loss": 0.83702791, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13098145, + "step": 6591, + "time_per_iteration": 2.569958448410034 + }, + { + "auxiliary_loss_clip": 0.06454425, + "auxiliary_loss_mlp": 0.01268025, + "balance_loss_clip": 0.06288329, + "balance_loss_mlp": 0.01254215, + "epoch": 0.39633248158725387, + "flos": 25122040202880.0, + "grad_norm": 1.5542594066551045, + "language_loss": 0.78422546, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8614499, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13806152, + "step": 6592, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06448938, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.062861, + "balance_loss_mlp": 0.01256615, + "epoch": 0.39639260483992184, + "flos": 39797309888640.0, + "grad_norm": 1.716432087396327, + "language_loss": 0.69405383, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.77124685, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13751221, + "step": 6593, + "time_per_iteration": 2.742421865463257 + }, + { + "auxiliary_loss_clip": 0.06455009, + "auxiliary_loss_mlp": 0.01268833, + "balance_loss_clip": 0.06285039, + "balance_loss_mlp": 0.01253896, + "epoch": 0.3964527280925898, + "flos": 17352673597440.0, + "grad_norm": 2.6756229463225134, + "language_loss": 0.78082192, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.85806036, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14929199, + "step": 6594, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.06345355, + "auxiliary_loss_mlp": 0.01253278, + "balance_loss_clip": 0.06269702, + "balance_loss_mlp": 0.0125056, + "epoch": 0.39651285134525777, + "flos": 71739845533440.0, + "grad_norm": 0.9367359782969226, + "language_loss": 0.6293599, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.70534623, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.195411205291748 + }, + { + "auxiliary_loss_clip": 0.06455558, + "auxiliary_loss_mlp": 0.01273293, + "balance_loss_clip": 0.0628309, + "balance_loss_mlp": 0.0125714, + "epoch": 0.39657297459792573, + "flos": 25782823900800.0, + "grad_norm": 2.0629727816625656, + "language_loss": 0.63503623, + "learning_rate": 2.748378562795223e-06, + "loss": 0.71232474, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16149902, + "step": 6596, + "time_per_iteration": 2.564436197280884 + }, + { + "auxiliary_loss_clip": 0.06445512, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.0628349, + "balance_loss_mlp": 0.01256086, + "epoch": 0.3966330978505937, + "flos": 20272267336320.0, + "grad_norm": 3.0845696935228646, + "language_loss": 0.79033494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.86749279, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.14202881, + "step": 6597, + "time_per_iteration": 2.5187220573425293 + }, + { + "auxiliary_loss_clip": 0.0645806, + "auxiliary_loss_mlp": 0.01272047, + "balance_loss_clip": 0.06285266, + "balance_loss_mlp": 0.01257259, + "epoch": 0.39669322110326166, + "flos": 20637431429760.0, + "grad_norm": 1.9127598273467419, + "language_loss": 0.67675543, + "learning_rate": 2.747656169644941e-06, + "loss": 0.75405657, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14776611, + "step": 6598, + "time_per_iteration": 2.5287654399871826 + }, + { + "auxiliary_loss_clip": 0.06448894, + "auxiliary_loss_mlp": 0.01270917, + "balance_loss_clip": 0.06280929, + "balance_loss_mlp": 0.01257643, + "epoch": 0.3967533443559297, + "flos": 21732546366720.0, + "grad_norm": 1.6941457063111416, + "language_loss": 0.79130334, + "learning_rate": 2.747294930536157e-06, + "loss": 0.86850142, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13269043, + "step": 6599, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.06447926, + "auxiliary_loss_mlp": 0.01270436, + "balance_loss_clip": 0.06279482, + "balance_loss_mlp": 0.01254289, + "epoch": 0.39681346760859765, + "flos": 25491271219200.0, + "grad_norm": 1.7355689440790156, + "language_loss": 0.72895992, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.80614352, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.16149902, + "step": 6600, + "time_per_iteration": 2.6141197681427 + }, + { + "auxiliary_loss_clip": 0.06448444, + "auxiliary_loss_mlp": 0.01274951, + "balance_loss_clip": 0.06280382, + "balance_loss_mlp": 0.01261045, + "epoch": 0.3968735908612656, + "flos": 20965894634880.0, + "grad_norm": 1.918502465070546, + "language_loss": 0.85902363, + "learning_rate": 2.746572367319791e-06, + "loss": 0.9362576, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13909912, + "step": 6601, + "time_per_iteration": 2.539337396621704 + }, + { + "auxiliary_loss_clip": 0.06455625, + "auxiliary_loss_mlp": 0.01273924, + "balance_loss_clip": 0.06281834, + "balance_loss_mlp": 0.0125773, + "epoch": 0.3969337141139336, + "flos": 10711684800000.0, + "grad_norm": 2.4177834123100412, + "language_loss": 0.70406669, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.78136218, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.16192627, + "step": 6602, + "time_per_iteration": 2.5344958305358887 + }, + { + "auxiliary_loss_clip": 0.06450728, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.01257583, + "epoch": 0.39699383736660154, + "flos": 17597924098560.0, + "grad_norm": 4.3880896635048865, + "language_loss": 0.84332073, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.92054927, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14550781, + "step": 6603, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.06445679, + "auxiliary_loss_mlp": 0.01276756, + "balance_loss_clip": 0.06278397, + "balance_loss_mlp": 0.01263017, + "epoch": 0.3970539606192695, + "flos": 17791826446080.0, + "grad_norm": 1.5258003920697418, + "language_loss": 0.7302916, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13751221, + "step": 6604, + "time_per_iteration": 2.525475025177002 + }, + { + "auxiliary_loss_clip": 0.06437713, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.0125609, + "epoch": 0.3971140838719375, + "flos": 24796260328320.0, + "grad_norm": 1.5312177971095886, + "language_loss": 0.82809514, + "learning_rate": 2.745126901275491e-06, + "loss": 0.90516913, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.13598633, + "step": 6605, + "time_per_iteration": 2.5601069927215576 + }, + { + "auxiliary_loss_clip": 0.06439412, + "auxiliary_loss_mlp": 0.01269635, + "balance_loss_clip": 0.06274941, + "balance_loss_mlp": 0.01256337, + "epoch": 0.39717420712460544, + "flos": 24250484759040.0, + "grad_norm": 1.721474173213711, + "language_loss": 0.74617773, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.82326818, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13293457, + "step": 6606, + "time_per_iteration": 2.570338726043701 + }, + { + "auxiliary_loss_clip": 0.06450282, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06279129, + "balance_loss_mlp": 0.01255343, + "epoch": 0.3972343303772734, + "flos": 25891752608640.0, + "grad_norm": 1.7826498780228273, + "language_loss": 0.74625784, + "learning_rate": 2.744403998666805e-06, + "loss": 0.8234452, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13122559, + "step": 6607, + "time_per_iteration": 2.554779052734375 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.0628166, + "balance_loss_mlp": 0.01257366, + "epoch": 0.39729445362994137, + "flos": 45634107525120.0, + "grad_norm": 2.013518755058626, + "language_loss": 0.68503535, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76226741, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1427002, + "step": 6608, + "time_per_iteration": 2.814741611480713 + }, + { + "auxiliary_loss_clip": 0.06453016, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06280445, + "balance_loss_mlp": 0.01256496, + "epoch": 0.39735457688260933, + "flos": 20200249152000.0, + "grad_norm": 2.238404873213265, + "language_loss": 0.74168068, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.818919, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14318848, + "step": 6609, + "time_per_iteration": 2.549020767211914 + }, + { + "auxiliary_loss_clip": 0.06450722, + "auxiliary_loss_mlp": 0.01268408, + "balance_loss_clip": 0.06281993, + "balance_loss_mlp": 0.0125424, + "epoch": 0.3974147001352773, + "flos": 23337868014720.0, + "grad_norm": 1.4758458837885644, + "language_loss": 0.71468556, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.79187685, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14154053, + "step": 6610, + "time_per_iteration": 3.985957622528076 + }, + { + "auxiliary_loss_clip": 0.06440872, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256559, + "epoch": 0.39747482338794526, + "flos": 21694965010560.0, + "grad_norm": 1.555692262156073, + "language_loss": 0.7854501, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.86256385, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13934326, + "step": 6611, + "time_per_iteration": 2.5972208976745605 + }, + { + "auxiliary_loss_clip": 0.06447503, + "auxiliary_loss_mlp": 0.01268941, + "balance_loss_clip": 0.06280762, + "balance_loss_mlp": 0.01255065, + "epoch": 0.3975349466406133, + "flos": 30995957998080.0, + "grad_norm": 2.19308398220208, + "language_loss": 0.79606485, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.87322932, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13891602, + "step": 6612, + "time_per_iteration": 2.6106274127960205 + }, + { + "auxiliary_loss_clip": 0.0634682, + "auxiliary_loss_mlp": 0.01253265, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01250469, + "epoch": 0.39759506989328125, + "flos": 63703426366080.0, + "grad_norm": 0.8245936024085626, + "language_loss": 0.6463905, + "learning_rate": 2.742234613810459e-06, + "loss": 0.72239137, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02796936, + "step": 6613, + "time_per_iteration": 4.473678112030029 + }, + { + "auxiliary_loss_clip": 0.06450668, + "auxiliary_loss_mlp": 0.01269678, + "balance_loss_clip": 0.06282368, + "balance_loss_mlp": 0.01255367, + "epoch": 0.3976551931459492, + "flos": 23702570910720.0, + "grad_norm": 2.448614415916545, + "language_loss": 0.72596258, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80316603, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14312744, + "step": 6614, + "time_per_iteration": 2.5691444873809814 + }, + { + "auxiliary_loss_clip": 0.06449673, + "auxiliary_loss_mlp": 0.0127007, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256051, + "epoch": 0.3977153163986172, + "flos": 15675166056960.0, + "grad_norm": 2.2284862441621995, + "language_loss": 0.81666011, + "learning_rate": 2.741511260213862e-06, + "loss": 0.89385748, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14013672, + "step": 6615, + "time_per_iteration": 2.55078387260437 + }, + { + "auxiliary_loss_clip": 0.06452717, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01255679, + "epoch": 0.39777543965128515, + "flos": 14070012117120.0, + "grad_norm": 1.96274897748641, + "language_loss": 0.67687142, + "learning_rate": 2.741149541231434e-06, + "loss": 0.75409389, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13842773, + "step": 6616, + "time_per_iteration": 2.533982992172241 + }, + { + "auxiliary_loss_clip": 0.06455097, + "auxiliary_loss_mlp": 0.0126897, + "balance_loss_clip": 0.06281532, + "balance_loss_mlp": 0.01253986, + "epoch": 0.3978355629039531, + "flos": 23374149632640.0, + "grad_norm": 2.1811174101900552, + "language_loss": 0.8396368, + "learning_rate": 2.740787794144541e-06, + "loss": 0.91687751, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14978027, + "step": 6617, + "time_per_iteration": 3.9742090702056885 + }, + { + "auxiliary_loss_clip": 0.06446042, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06283504, + "balance_loss_mlp": 0.01255556, + "epoch": 0.3978956861566211, + "flos": 19068852597120.0, + "grad_norm": 1.7253210008214133, + "language_loss": 0.73000187, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80714333, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12536621, + "step": 6618, + "time_per_iteration": 2.562913179397583 + }, + { + "auxiliary_loss_clip": 0.06454587, + "auxiliary_loss_mlp": 0.01274299, + "balance_loss_clip": 0.06285769, + "balance_loss_mlp": 0.01258576, + "epoch": 0.39795580940928904, + "flos": 30235679176320.0, + "grad_norm": 1.6365941861062427, + "language_loss": 0.65343797, + "learning_rate": 2.740064215712231e-06, + "loss": 0.73072684, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15710449, + "step": 6619, + "time_per_iteration": 2.598667860031128 + }, + { + "auxiliary_loss_clip": 0.06341819, + "auxiliary_loss_mlp": 0.01254465, + "balance_loss_clip": 0.06266081, + "balance_loss_mlp": 0.01251738, + "epoch": 0.398015932661957, + "flos": 69867261688320.0, + "grad_norm": 0.7579483566665592, + "language_loss": 0.582268, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.65823084, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02731323, + "step": 6620, + "time_per_iteration": 4.528149604797363 + }, + { + "auxiliary_loss_clip": 0.06446633, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06280729, + "balance_loss_mlp": 0.01256858, + "epoch": 0.39807605591462497, + "flos": 20164093315200.0, + "grad_norm": 1.5024608902652035, + "language_loss": 0.79499102, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87215811, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13232422, + "step": 6621, + "time_per_iteration": 2.559305191040039 + }, + { + "auxiliary_loss_clip": 0.06445563, + "auxiliary_loss_mlp": 0.01270989, + "balance_loss_clip": 0.06279579, + "balance_loss_mlp": 0.01257435, + "epoch": 0.39813617916729294, + "flos": 21148057411200.0, + "grad_norm": 1.7591122738615637, + "language_loss": 0.78347874, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86064428, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13568115, + "step": 6622, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.06444648, + "auxiliary_loss_mlp": 0.01270694, + "balance_loss_clip": 0.06278688, + "balance_loss_mlp": 0.01255948, + "epoch": 0.3981963024199609, + "flos": 18994318790400.0, + "grad_norm": 9.51473607747463, + "language_loss": 0.75430334, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83145678, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14733887, + "step": 6623, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.06449074, + "auxiliary_loss_mlp": 0.0127457, + "balance_loss_clip": 0.06282511, + "balance_loss_mlp": 0.01260551, + "epoch": 0.39825642567262887, + "flos": 16579648955520.0, + "grad_norm": 1.7143371951380526, + "language_loss": 0.79926246, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.87649894, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14025879, + "step": 6624, + "time_per_iteration": 2.509500026702881 + }, + { + "auxiliary_loss_clip": 0.06454292, + "auxiliary_loss_mlp": 0.01269994, + "balance_loss_clip": 0.06280515, + "balance_loss_mlp": 0.01254234, + "epoch": 0.39831654892529683, + "flos": 22206303751680.0, + "grad_norm": 2.195062259081814, + "language_loss": 0.84314877, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.92039162, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.15759277, + "step": 6625, + "time_per_iteration": 2.5617175102233887 + }, + { + "auxiliary_loss_clip": 0.06446299, + "auxiliary_loss_mlp": 0.01272387, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01258517, + "epoch": 0.39837667217796485, + "flos": 10492485719040.0, + "grad_norm": 1.8250293636172175, + "language_loss": 0.8709324, + "learning_rate": 2.737530807925321e-06, + "loss": 0.94811928, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13867188, + "step": 6626, + "time_per_iteration": 2.72031307220459 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01271086, + "balance_loss_clip": 0.0627908, + "balance_loss_mlp": 0.01256531, + "epoch": 0.3984367954306328, + "flos": 17970676986240.0, + "grad_norm": 2.760632977827581, + "language_loss": 0.84402627, + "learning_rate": 2.737168780548417e-06, + "loss": 0.9212113, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14575195, + "step": 6627, + "time_per_iteration": 2.6228654384613037 + }, + { + "auxiliary_loss_clip": 0.06445234, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.0627917, + "balance_loss_mlp": 0.01255443, + "epoch": 0.3984969186833008, + "flos": 22717684419840.0, + "grad_norm": 3.2429830324928095, + "language_loss": 0.83402491, + "learning_rate": 2.736806725217998e-06, + "loss": 0.91116416, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13250732, + "step": 6628, + "time_per_iteration": 2.6287484169006348 + }, + { + "auxiliary_loss_clip": 0.06449139, + "auxiliary_loss_mlp": 0.01271852, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01256981, + "epoch": 0.39855704193596875, + "flos": 23412779164800.0, + "grad_norm": 1.5731823007903518, + "language_loss": 0.71793973, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.79514968, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14868164, + "step": 6629, + "time_per_iteration": 2.5752875804901123 + }, + { + "auxiliary_loss_clip": 0.06441505, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06280406, + "balance_loss_mlp": 0.01254834, + "epoch": 0.3986171651886367, + "flos": 21258369711360.0, + "grad_norm": 2.035566678796665, + "language_loss": 0.80905473, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.88615453, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1362915, + "step": 6630, + "time_per_iteration": 2.5329513549804688 + }, + { + "auxiliary_loss_clip": 0.06445715, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06276714, + "balance_loss_mlp": 0.01255693, + "epoch": 0.3986772884413047, + "flos": 12463642293120.0, + "grad_norm": 2.1251751047068783, + "language_loss": 0.75146663, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.82862258, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14190674, + "step": 6631, + "time_per_iteration": 2.5500082969665527 + }, + { + "auxiliary_loss_clip": 0.06448178, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279311, + "balance_loss_mlp": 0.0125505, + "epoch": 0.39873741169397264, + "flos": 19652209522560.0, + "grad_norm": 1.6915315525927903, + "language_loss": 0.71496904, + "learning_rate": 2.735358224635783e-06, + "loss": 0.79214191, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.140625, + "step": 6632, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.06444843, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255955, + "epoch": 0.3987975349466406, + "flos": 21690436890240.0, + "grad_norm": 1.8116978167005697, + "language_loss": 0.75623924, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83338219, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13494873, + "step": 6633, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.06449188, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06280442, + "balance_loss_mlp": 0.0125846, + "epoch": 0.3988576581993086, + "flos": 23920721815680.0, + "grad_norm": 1.9002609831735993, + "language_loss": 0.81678545, + "learning_rate": 2.7346338069806e-06, + "loss": 0.89400202, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14001465, + "step": 6634, + "time_per_iteration": 2.539128065109253 + }, + { + "auxiliary_loss_clip": 0.06453361, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06283009, + "balance_loss_mlp": 0.01255449, + "epoch": 0.39891778145197654, + "flos": 18155690801280.0, + "grad_norm": 1.9946050359209588, + "language_loss": 0.7547667, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.83199799, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14306641, + "step": 6635, + "time_per_iteration": 2.5426242351531982 + }, + { + "auxiliary_loss_clip": 0.06468328, + "auxiliary_loss_mlp": 0.01272826, + "balance_loss_clip": 0.06289048, + "balance_loss_mlp": 0.01256053, + "epoch": 0.3989779047046445, + "flos": 22600831501440.0, + "grad_norm": 1.9740114535883675, + "language_loss": 0.66474432, + "learning_rate": 2.733909277895868e-06, + "loss": 0.74215585, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.16760254, + "step": 6636, + "time_per_iteration": 2.5290956497192383 + }, + { + "auxiliary_loss_clip": 0.06452767, + "auxiliary_loss_mlp": 0.01270258, + "balance_loss_clip": 0.06285115, + "balance_loss_mlp": 0.01255012, + "epoch": 0.39903802795731247, + "flos": 18083043711360.0, + "grad_norm": 1.6936131920640751, + "language_loss": 0.82211542, + "learning_rate": 2.733546971601763e-06, + "loss": 0.89934564, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.15246582, + "step": 6637, + "time_per_iteration": 2.516279458999634 + }, + { + "auxiliary_loss_clip": 0.06353697, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 0.06278069, + "balance_loss_mlp": 0.01250418, + "epoch": 0.39909815120998043, + "flos": 70463238652800.0, + "grad_norm": 0.7262189478909644, + "language_loss": 0.531524, + "learning_rate": 2.733184637491484e-06, + "loss": 0.60758889, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.0236969, + "step": 6638, + "time_per_iteration": 3.2179603576660156 + }, + { + "auxiliary_loss_clip": 0.06449973, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 0.06279011, + "balance_loss_mlp": 0.0126304, + "epoch": 0.39915827446264845, + "flos": 18554788598400.0, + "grad_norm": 1.4980640352775056, + "language_loss": 0.75670731, + "learning_rate": 2.732822275578769e-06, + "loss": 0.83398449, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14715576, + "step": 6639, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.06442601, + "auxiliary_loss_mlp": 0.01272751, + "balance_loss_clip": 0.0627881, + "balance_loss_mlp": 0.01258249, + "epoch": 0.3992183977153164, + "flos": 29904826129920.0, + "grad_norm": 2.014095124557279, + "language_loss": 0.76376802, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84092152, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1451416, + "step": 6640, + "time_per_iteration": 2.642223834991455 + }, + { + "auxiliary_loss_clip": 0.06449724, + "auxiliary_loss_mlp": 0.01270265, + "balance_loss_clip": 0.06280393, + "balance_loss_mlp": 0.01255757, + "epoch": 0.3992785209679844, + "flos": 22571677480320.0, + "grad_norm": 2.238528881986372, + "language_loss": 0.8211664, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.89836633, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14501953, + "step": 6641, + "time_per_iteration": 2.530189275741577 + }, + { + "auxiliary_loss_clip": 0.06456075, + "auxiliary_loss_mlp": 0.01270045, + "balance_loss_clip": 0.06284191, + "balance_loss_mlp": 0.01254971, + "epoch": 0.39933864422065235, + "flos": 19688784629760.0, + "grad_norm": 1.8306704082742173, + "language_loss": 0.77208257, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.84934378, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15081787, + "step": 6642, + "time_per_iteration": 2.5495219230651855 + }, + { + "auxiliary_loss_clip": 0.06453043, + "auxiliary_loss_mlp": 0.01270555, + "balance_loss_clip": 0.06281064, + "balance_loss_mlp": 0.01255564, + "epoch": 0.3993987674733203, + "flos": 23045015594880.0, + "grad_norm": 2.242078242091602, + "language_loss": 0.72883618, + "learning_rate": 2.731372550178393e-06, + "loss": 0.80607212, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.14984131, + "step": 6643, + "time_per_iteration": 2.521857500076294 + }, + { + "auxiliary_loss_clip": 0.06456347, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01259317, + "epoch": 0.3994588907259883, + "flos": 19396896531840.0, + "grad_norm": 1.7649027305896348, + "language_loss": 0.66785717, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74516022, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14642334, + "step": 6644, + "time_per_iteration": 2.571690320968628 + }, + { + "auxiliary_loss_clip": 0.06454624, + "auxiliary_loss_mlp": 0.0127806, + "balance_loss_clip": 0.06282313, + "balance_loss_mlp": 0.01263737, + "epoch": 0.39951901397865625, + "flos": 13739326778880.0, + "grad_norm": 1.9095077452421072, + "language_loss": 0.78757256, + "learning_rate": 2.730647521020907e-06, + "loss": 0.86489946, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14312744, + "step": 6645, + "time_per_iteration": 2.499361753463745 + }, + { + "auxiliary_loss_clip": 0.06458238, + "auxiliary_loss_mlp": 0.01274341, + "balance_loss_clip": 0.06283879, + "balance_loss_mlp": 0.01259321, + "epoch": 0.3995791372313242, + "flos": 23593181005440.0, + "grad_norm": 1.5926569767996783, + "language_loss": 0.7044934, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78181922, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.15026855, + "step": 6646, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.06456489, + "auxiliary_loss_mlp": 0.01271766, + "balance_loss_clip": 0.06284152, + "balance_loss_mlp": 0.01257103, + "epoch": 0.3996392604839922, + "flos": 21361428633600.0, + "grad_norm": 2.2667385155288917, + "language_loss": 0.72035694, + "learning_rate": 2.729922381038513e-06, + "loss": 0.79763949, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14660645, + "step": 6647, + "time_per_iteration": 2.58251953125 + }, + { + "auxiliary_loss_clip": 0.06449988, + "auxiliary_loss_mlp": 0.01272061, + "balance_loss_clip": 0.06284988, + "balance_loss_mlp": 0.01257195, + "epoch": 0.39969938373666014, + "flos": 26039604337920.0, + "grad_norm": 1.4692875023338006, + "language_loss": 0.74830031, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.82552081, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14849854, + "step": 6648, + "time_per_iteration": 2.7020201683044434 + }, + { + "auxiliary_loss_clip": 0.06453955, + "auxiliary_loss_mlp": 0.0126884, + "balance_loss_clip": 0.06283584, + "balance_loss_mlp": 0.0125472, + "epoch": 0.3997595069893281, + "flos": 20121858057600.0, + "grad_norm": 2.0106261298514907, + "language_loss": 0.65986454, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.73709244, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14117432, + "step": 6649, + "time_per_iteration": 3.9323928356170654 + }, + { + "auxiliary_loss_clip": 0.06463098, + "auxiliary_loss_mlp": 0.0127713, + "balance_loss_clip": 0.06290667, + "balance_loss_mlp": 0.01260774, + "epoch": 0.39981963024199607, + "flos": 27791016779520.0, + "grad_norm": 1.831691866077207, + "language_loss": 0.75774682, + "learning_rate": 2.728834463508826e-06, + "loss": 0.83514905, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.16357422, + "step": 6650, + "time_per_iteration": 2.6374714374542236 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01272611, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01257782, + "epoch": 0.39987975349466404, + "flos": 21950864979840.0, + "grad_norm": 1.4608995971033776, + "language_loss": 0.7199676, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79724216, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14831543, + "step": 6651, + "time_per_iteration": 2.5789706707000732 + }, + { + "auxiliary_loss_clip": 0.06457064, + "auxiliary_loss_mlp": 0.01269592, + "balance_loss_clip": 0.06283179, + "balance_loss_mlp": 0.01255245, + "epoch": 0.39993987674733206, + "flos": 20710707425280.0, + "grad_norm": 1.930350074981486, + "language_loss": 0.73724478, + "learning_rate": 2.728109046945403e-06, + "loss": 0.8145113, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14331055, + "step": 6652, + "time_per_iteration": 3.9592838287353516 + }, + { + "auxiliary_loss_clip": 0.06347093, + "auxiliary_loss_mlp": 0.01255075, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01252878, + "epoch": 0.4, + "flos": 61543566397440.0, + "grad_norm": 0.8159851457251004, + "language_loss": 0.60542929, + "learning_rate": 2.727746297241862e-06, + "loss": 0.68145096, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.02201843, + "step": 6653, + "time_per_iteration": 3.0700466632843018 + }, + { + "auxiliary_loss_clip": 0.06454087, + "auxiliary_loss_mlp": 0.01272182, + "balance_loss_clip": 0.0629051, + "balance_loss_mlp": 0.01257698, + "epoch": 0.400060123252668, + "flos": 14507655592320.0, + "grad_norm": 1.9278074838902122, + "language_loss": 0.66929328, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.74655592, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14477539, + "step": 6654, + "time_per_iteration": 2.5292413234710693 + }, + { + "auxiliary_loss_clip": 0.06457023, + "auxiliary_loss_mlp": 0.01271182, + "balance_loss_clip": 0.06287654, + "balance_loss_mlp": 0.01257396, + "epoch": 0.40012024650533595, + "flos": 19098383961600.0, + "grad_norm": 1.998304088554008, + "language_loss": 0.90550762, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98278964, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13775635, + "step": 6655, + "time_per_iteration": 2.529496192932129 + }, + { + "auxiliary_loss_clip": 0.06450539, + "auxiliary_loss_mlp": 0.012675, + "balance_loss_clip": 0.06286812, + "balance_loss_mlp": 0.01254899, + "epoch": 0.4001803697580039, + "flos": 29358673217280.0, + "grad_norm": 1.6559902316252946, + "language_loss": 0.73729336, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.81447375, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1260376, + "step": 6656, + "time_per_iteration": 4.062687158584595 + }, + { + "auxiliary_loss_clip": 0.0645894, + "auxiliary_loss_mlp": 0.01271003, + "balance_loss_clip": 0.06288408, + "balance_loss_mlp": 0.01255696, + "epoch": 0.4002404930106719, + "flos": 20925839583360.0, + "grad_norm": 1.4738199157728433, + "language_loss": 0.73207194, + "learning_rate": 2.726295022603144e-06, + "loss": 0.80937135, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15307617, + "step": 6657, + "time_per_iteration": 2.5996904373168945 + }, + { + "auxiliary_loss_clip": 0.06458808, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06288153, + "balance_loss_mlp": 0.01256506, + "epoch": 0.40030061626333985, + "flos": 28413799850880.0, + "grad_norm": 1.489557881553797, + "language_loss": 0.79247761, + "learning_rate": 2.725932135056117e-06, + "loss": 0.86978424, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.15350342, + "step": 6658, + "time_per_iteration": 2.7172279357910156 + }, + { + "auxiliary_loss_clip": 0.06459276, + "auxiliary_loss_mlp": 0.01278121, + "balance_loss_clip": 0.06289512, + "balance_loss_mlp": 0.01264084, + "epoch": 0.4003607395160078, + "flos": 25928746986240.0, + "grad_norm": 2.1209995886317956, + "language_loss": 0.77640641, + "learning_rate": 2.72556921998167e-06, + "loss": 0.85378039, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14050293, + "step": 6659, + "time_per_iteration": 4.3210484981536865 + }, + { + "auxiliary_loss_clip": 0.06450686, + "auxiliary_loss_mlp": 0.01279792, + "balance_loss_clip": 0.06291049, + "balance_loss_mlp": 0.01267442, + "epoch": 0.4004208627686758, + "flos": 20773501660800.0, + "grad_norm": 1.7380110296153854, + "language_loss": 0.73432875, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.81163359, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.12359619, + "step": 6660, + "time_per_iteration": 2.668088436126709 + }, + { + "auxiliary_loss_clip": 0.06457424, + "auxiliary_loss_mlp": 0.01270844, + "balance_loss_clip": 0.06287603, + "balance_loss_mlp": 0.01258077, + "epoch": 0.40048098602134374, + "flos": 24688170161280.0, + "grad_norm": 2.131845423391088, + "language_loss": 0.71318859, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.79047126, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.12786865, + "step": 6661, + "time_per_iteration": 2.5673065185546875 + }, + { + "auxiliary_loss_clip": 0.06462744, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_clip": 0.06291083, + "balance_loss_mlp": 0.01257889, + "epoch": 0.4005411092740117, + "flos": 23192448053760.0, + "grad_norm": 1.7831816831822005, + "language_loss": 0.75751495, + "learning_rate": 2.724480309731437e-06, + "loss": 0.83486485, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14361572, + "step": 6662, + "time_per_iteration": 2.5870559215545654 + }, + { + "auxiliary_loss_clip": 0.06461672, + "auxiliary_loss_mlp": 0.01271183, + "balance_loss_clip": 0.0628756, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4006012325266797, + "flos": 17526786382080.0, + "grad_norm": 2.241735466255753, + "language_loss": 0.66247231, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.73980081, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15014648, + "step": 6663, + "time_per_iteration": 2.5879623889923096 + }, + { + "auxiliary_loss_clip": 0.06461117, + "auxiliary_loss_mlp": 0.01271573, + "balance_loss_clip": 0.06290103, + "balance_loss_mlp": 0.01257316, + "epoch": 0.40066135577934764, + "flos": 19862016946560.0, + "grad_norm": 2.129058070747091, + "language_loss": 0.86377645, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.94110334, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14251709, + "step": 6664, + "time_per_iteration": 2.580240249633789 + }, + { + "auxiliary_loss_clip": 0.06459028, + "auxiliary_loss_mlp": 0.01272821, + "balance_loss_clip": 0.06287652, + "balance_loss_mlp": 0.01259064, + "epoch": 0.40072147903201566, + "flos": 18155816582400.0, + "grad_norm": 1.9805392577959038, + "language_loss": 0.84895325, + "learning_rate": 2.723391152229917e-06, + "loss": 0.92627168, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13751221, + "step": 6665, + "time_per_iteration": 2.50386381149292 + }, + { + "auxiliary_loss_clip": 0.06457423, + "auxiliary_loss_mlp": 0.01268968, + "balance_loss_clip": 0.06286919, + "balance_loss_mlp": 0.0125458, + "epoch": 0.4007816022846836, + "flos": 18667239177600.0, + "grad_norm": 1.826402815553393, + "language_loss": 0.78598213, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.86324608, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14404297, + "step": 6666, + "time_per_iteration": 2.5133461952209473 + }, + { + "auxiliary_loss_clip": 0.06465514, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06295928, + "balance_loss_mlp": 0.01253834, + "epoch": 0.4008417255373516, + "flos": 25710344519040.0, + "grad_norm": 1.8943268651740763, + "language_loss": 0.74139559, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.81873906, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14990234, + "step": 6667, + "time_per_iteration": 2.635195732116699 + }, + { + "auxiliary_loss_clip": 0.06460091, + "auxiliary_loss_mlp": 0.01273802, + "balance_loss_clip": 0.06287248, + "balance_loss_mlp": 0.01258519, + "epoch": 0.40090184879001955, + "flos": 22865536149120.0, + "grad_norm": 1.4912552700664468, + "language_loss": 0.75818384, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.83552277, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15270996, + "step": 6668, + "time_per_iteration": 2.567748546600342 + }, + { + "auxiliary_loss_clip": 0.06454465, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0628936, + "balance_loss_mlp": 0.01258572, + "epoch": 0.4009619720426875, + "flos": 29067581733120.0, + "grad_norm": 1.8066450616757106, + "language_loss": 0.82171971, + "learning_rate": 2.721938558257248e-06, + "loss": 0.89899051, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14050293, + "step": 6669, + "time_per_iteration": 2.614875555038452 + }, + { + "auxiliary_loss_clip": 0.06349576, + "auxiliary_loss_mlp": 0.01259788, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01257549, + "epoch": 0.4010220952953555, + "flos": 66080347136640.0, + "grad_norm": 0.6837113267664942, + "language_loss": 0.53268963, + "learning_rate": 2.721575341289695e-06, + "loss": 0.60878325, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.02243042, + "step": 6670, + "time_per_iteration": 3.2985219955444336 + }, + { + "auxiliary_loss_clip": 0.06453651, + "auxiliary_loss_mlp": 0.01274966, + "balance_loss_clip": 0.06286684, + "balance_loss_mlp": 0.01260405, + "epoch": 0.40108221854802345, + "flos": 29650519388160.0, + "grad_norm": 1.6370315093264123, + "language_loss": 0.88528681, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.96257305, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14556885, + "step": 6671, + "time_per_iteration": 2.6268246173858643 + }, + { + "auxiliary_loss_clip": 0.06460971, + "auxiliary_loss_mlp": 0.01272066, + "balance_loss_clip": 0.06289764, + "balance_loss_mlp": 0.01256861, + "epoch": 0.4011423418006914, + "flos": 19934286693120.0, + "grad_norm": 1.7015153377224497, + "language_loss": 0.78868973, + "learning_rate": 2.720848825281736e-06, + "loss": 0.86602008, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.1519165, + "step": 6672, + "time_per_iteration": 2.4949698448181152 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01271887, + "balance_loss_clip": 0.06290099, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4012024650533594, + "flos": 20090523830400.0, + "grad_norm": 2.076088840896174, + "language_loss": 0.63474464, + "learning_rate": 2.72048552626888e-06, + "loss": 0.71204633, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1418457, + "step": 6673, + "time_per_iteration": 2.644050121307373 + }, + { + "auxiliary_loss_clip": 0.06458048, + "auxiliary_loss_mlp": 0.0127375, + "balance_loss_clip": 0.062879, + "balance_loss_mlp": 0.01259827, + "epoch": 0.40126258830602735, + "flos": 21703224637440.0, + "grad_norm": 1.4478595936596839, + "language_loss": 0.80581552, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.88313353, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.13903809, + "step": 6674, + "time_per_iteration": 2.559034824371338 + }, + { + "auxiliary_loss_clip": 0.0646532, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_clip": 0.06289816, + "balance_loss_mlp": 0.01258269, + "epoch": 0.4013227115586953, + "flos": 12025160277120.0, + "grad_norm": 2.4455561687367195, + "language_loss": 0.82561237, + "learning_rate": 2.719758846294294e-06, + "loss": 0.90299457, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14624023, + "step": 6675, + "time_per_iteration": 2.5448951721191406 + }, + { + "auxiliary_loss_clip": 0.06465134, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06295693, + "balance_loss_mlp": 0.01254106, + "epoch": 0.4013828348113633, + "flos": 25454612257920.0, + "grad_norm": 1.6408733853472015, + "language_loss": 0.93777156, + "learning_rate": 2.71939546536012e-06, + "loss": 1.01511002, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14581299, + "step": 6676, + "time_per_iteration": 2.5721349716186523 + }, + { + "auxiliary_loss_clip": 0.06469207, + "auxiliary_loss_mlp": 0.01274451, + "balance_loss_clip": 0.06291738, + "balance_loss_mlp": 0.01258274, + "epoch": 0.40144295806403124, + "flos": 18588009542400.0, + "grad_norm": 2.5026106137632222, + "language_loss": 0.80060673, + "learning_rate": 2.719032057146399e-06, + "loss": 0.87804335, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16186523, + "step": 6677, + "time_per_iteration": 2.5438191890716553 + }, + { + "auxiliary_loss_clip": 0.06455022, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.0628567, + "balance_loss_mlp": 0.01256934, + "epoch": 0.4015030813166992, + "flos": 22936925427840.0, + "grad_norm": 1.8567640541952835, + "language_loss": 0.83925951, + "learning_rate": 2.71866862166691e-06, + "loss": 0.9165169, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13793945, + "step": 6678, + "time_per_iteration": 2.5458457469940186 + }, + { + "auxiliary_loss_clip": 0.06455562, + "auxiliary_loss_mlp": 0.0127344, + "balance_loss_clip": 0.06287661, + "balance_loss_mlp": 0.01258325, + "epoch": 0.4015632045693672, + "flos": 20601359447040.0, + "grad_norm": 2.2595275456436767, + "language_loss": 0.6400671, + "learning_rate": 2.718305158935434e-06, + "loss": 0.7173571, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.15124512, + "step": 6679, + "time_per_iteration": 2.553312063217163 + }, + { + "auxiliary_loss_clip": 0.0645475, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.06285992, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4016233278220352, + "flos": 23445371203200.0, + "grad_norm": 1.525723625053638, + "language_loss": 0.78686285, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.86411297, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14001465, + "step": 6680, + "time_per_iteration": 2.5376389026641846 + }, + { + "auxiliary_loss_clip": 0.0646753, + "auxiliary_loss_mlp": 0.0127372, + "balance_loss_clip": 0.06289258, + "balance_loss_mlp": 0.01258008, + "epoch": 0.40168345107470316, + "flos": 21436968689280.0, + "grad_norm": 1.5038657697958466, + "language_loss": 0.76059246, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.83800501, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15710449, + "step": 6681, + "time_per_iteration": 2.532668352127075 + }, + { + "auxiliary_loss_clip": 0.06461542, + "auxiliary_loss_mlp": 0.01268459, + "balance_loss_clip": 0.06289437, + "balance_loss_mlp": 0.01254285, + "epoch": 0.4017435743273711, + "flos": 22863900994560.0, + "grad_norm": 2.212326324471445, + "language_loss": 0.6446861, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.72198606, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1416626, + "step": 6682, + "time_per_iteration": 2.585963010787964 + }, + { + "auxiliary_loss_clip": 0.06452938, + "auxiliary_loss_mlp": 0.01271302, + "balance_loss_clip": 0.06279296, + "balance_loss_mlp": 0.01257288, + "epoch": 0.4018036975800391, + "flos": 28630022112000.0, + "grad_norm": 1.839007150843812, + "language_loss": 0.73340857, + "learning_rate": 2.716851035765337e-06, + "loss": 0.81065094, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14013672, + "step": 6683, + "time_per_iteration": 2.5977652072906494 + }, + { + "auxiliary_loss_clip": 0.06452199, + "auxiliary_loss_mlp": 0.01270902, + "balance_loss_clip": 0.0628196, + "balance_loss_mlp": 0.01257252, + "epoch": 0.40186382083270705, + "flos": 26658446267520.0, + "grad_norm": 1.545951486041889, + "language_loss": 0.73326242, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.81049347, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13671875, + "step": 6684, + "time_per_iteration": 2.579061985015869 + }, + { + "auxiliary_loss_clip": 0.06341122, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06265609, + "balance_loss_mlp": 0.01260683, + "epoch": 0.401923944085375, + "flos": 59277167562240.0, + "grad_norm": 0.7966859396902427, + "language_loss": 0.60515714, + "learning_rate": 2.716123811026767e-06, + "loss": 0.68120408, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.02891541, + "step": 6685, + "time_per_iteration": 3.2738587856292725 + }, + { + "auxiliary_loss_clip": 0.06456321, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 0.06278493, + "balance_loss_mlp": 0.01255291, + "epoch": 0.401984067338043, + "flos": 16988473825920.0, + "grad_norm": 1.7615677724791905, + "language_loss": 0.70125616, + "learning_rate": 2.715760157917357e-06, + "loss": 0.77851576, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14343262, + "step": 6686, + "time_per_iteration": 2.565185070037842 + }, + { + "auxiliary_loss_clip": 0.06450202, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125554, + "epoch": 0.40204419059071095, + "flos": 24979387426560.0, + "grad_norm": 1.3440220766592053, + "language_loss": 0.74867636, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.82586932, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13549805, + "step": 6687, + "time_per_iteration": 2.6009433269500732 + }, + { + "auxiliary_loss_clip": 0.06451625, + "auxiliary_loss_mlp": 0.01275028, + "balance_loss_clip": 0.06281097, + "balance_loss_mlp": 0.01261164, + "epoch": 0.4021043138433789, + "flos": 23484252297600.0, + "grad_norm": 1.7565801002117698, + "language_loss": 0.71198428, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.78925073, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13873291, + "step": 6688, + "time_per_iteration": 3.9550609588623047 + }, + { + "auxiliary_loss_clip": 0.06455014, + "auxiliary_loss_mlp": 0.01278979, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01263506, + "epoch": 0.4021644370960469, + "flos": 26003155011840.0, + "grad_norm": 1.6503070586239919, + "language_loss": 0.64854121, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.7258811, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15478516, + "step": 6689, + "time_per_iteration": 2.552058458328247 + }, + { + "auxiliary_loss_clip": 0.06450799, + "auxiliary_loss_mlp": 0.01267992, + "balance_loss_clip": 0.06276366, + "balance_loss_mlp": 0.0125417, + "epoch": 0.40222456034871484, + "flos": 13592816714880.0, + "grad_norm": 1.9543405887805447, + "language_loss": 0.73594153, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.81312943, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13824463, + "step": 6690, + "time_per_iteration": 2.5484251976013184 + }, + { + "auxiliary_loss_clip": 0.06448495, + "auxiliary_loss_mlp": 0.0127057, + "balance_loss_clip": 0.06278096, + "balance_loss_mlp": 0.01256682, + "epoch": 0.4022846836013828, + "flos": 24284586170880.0, + "grad_norm": 1.722227920192768, + "language_loss": 0.74861401, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.82580471, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13903809, + "step": 6691, + "time_per_iteration": 3.9708051681518555 + }, + { + "auxiliary_loss_clip": 0.06451076, + "auxiliary_loss_mlp": 0.01277672, + "balance_loss_clip": 0.0627808, + "balance_loss_mlp": 0.01262151, + "epoch": 0.40234480685405083, + "flos": 20156881864320.0, + "grad_norm": 1.7761891830354823, + "language_loss": 0.72677463, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.80406213, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.15515137, + "step": 6692, + "time_per_iteration": 2.5179357528686523 + }, + { + "auxiliary_loss_clip": 0.06447224, + "auxiliary_loss_mlp": 0.01270814, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.0125664, + "epoch": 0.4024049301067188, + "flos": 22936925427840.0, + "grad_norm": 1.7625804596819372, + "language_loss": 0.8401857, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.91736615, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1418457, + "step": 6693, + "time_per_iteration": 2.707941770553589 + }, + { + "auxiliary_loss_clip": 0.06452498, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06281643, + "balance_loss_mlp": 0.01252865, + "epoch": 0.40246505335938676, + "flos": 36037285297920.0, + "grad_norm": 1.8844808694168769, + "language_loss": 0.70966387, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.78685182, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13433838, + "step": 6694, + "time_per_iteration": 2.637481927871704 + }, + { + "auxiliary_loss_clip": 0.06444509, + "auxiliary_loss_mlp": 0.01272964, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01258552, + "epoch": 0.4025251766120547, + "flos": 20600478979200.0, + "grad_norm": 1.9746374404018712, + "language_loss": 0.68475246, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.76192719, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14428711, + "step": 6695, + "time_per_iteration": 3.9740405082702637 + }, + { + "auxiliary_loss_clip": 0.06446315, + "auxiliary_loss_mlp": 0.01270396, + "balance_loss_clip": 0.06276862, + "balance_loss_mlp": 0.01256484, + "epoch": 0.4025852998647227, + "flos": 64537582890240.0, + "grad_norm": 2.0865884556399363, + "language_loss": 0.79765463, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.87482178, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13897705, + "step": 6696, + "time_per_iteration": 3.0413708686828613 + }, + { + "auxiliary_loss_clip": 0.06454235, + "auxiliary_loss_mlp": 0.01269123, + "balance_loss_clip": 0.06281278, + "balance_loss_mlp": 0.01254473, + "epoch": 0.40264542311739066, + "flos": 20892534785280.0, + "grad_norm": 1.7976365729577468, + "language_loss": 0.71608603, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.79331958, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14660645, + "step": 6697, + "time_per_iteration": 2.5200350284576416 + }, + { + "auxiliary_loss_clip": 0.06445032, + "auxiliary_loss_mlp": 0.01270069, + "balance_loss_clip": 0.06276169, + "balance_loss_mlp": 0.0125658, + "epoch": 0.4027055463700586, + "flos": 26257419826560.0, + "grad_norm": 1.9918981514977272, + "language_loss": 0.61230171, + "learning_rate": 2.711394207496984e-06, + "loss": 0.68945277, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13464355, + "step": 6698, + "time_per_iteration": 2.576472520828247 + }, + { + "auxiliary_loss_clip": 0.06449181, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06276856, + "balance_loss_mlp": 0.01252849, + "epoch": 0.4027656696227266, + "flos": 20637682992000.0, + "grad_norm": 2.0070875825685266, + "language_loss": 0.77479243, + "learning_rate": 2.711030202621491e-06, + "loss": 0.85195273, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14001465, + "step": 6699, + "time_per_iteration": 3.937375545501709 + }, + { + "auxiliary_loss_clip": 0.0644554, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01253977, + "epoch": 0.40282579287539455, + "flos": 22352855742720.0, + "grad_norm": 1.735185416550665, + "language_loss": 0.80698907, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.88412201, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13793945, + "step": 6700, + "time_per_iteration": 2.535510540008545 + }, + { + "auxiliary_loss_clip": 0.06459837, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06282608, + "balance_loss_mlp": 0.01253157, + "epoch": 0.4028859161280625, + "flos": 29282126912640.0, + "grad_norm": 1.7653471156752092, + "language_loss": 0.74938649, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.82666814, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.1517334, + "step": 6701, + "time_per_iteration": 2.6509363651275635 + }, + { + "auxiliary_loss_clip": 0.06451308, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06281418, + "balance_loss_mlp": 0.01257329, + "epoch": 0.4029460393807305, + "flos": 28630022112000.0, + "grad_norm": 1.48917022125432, + "language_loss": 0.66283298, + "learning_rate": 2.709938026276208e-06, + "loss": 0.74005556, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13641357, + "step": 6702, + "time_per_iteration": 2.6183536052703857 + }, + { + "auxiliary_loss_clip": 0.06460792, + "auxiliary_loss_mlp": 0.0127397, + "balance_loss_clip": 0.06286055, + "balance_loss_mlp": 0.01259117, + "epoch": 0.40300616263339845, + "flos": 22608588003840.0, + "grad_norm": 1.5996325972429297, + "language_loss": 0.66632348, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.74367112, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.14849854, + "step": 6703, + "time_per_iteration": 2.583040237426758 + }, + { + "auxiliary_loss_clip": 0.06456298, + "auxiliary_loss_mlp": 0.01273361, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.012584, + "epoch": 0.4030662858860664, + "flos": 25527385128960.0, + "grad_norm": 1.7345540067512994, + "language_loss": 0.82398093, + "learning_rate": 2.709209774085071e-06, + "loss": 0.90127754, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14959717, + "step": 6704, + "time_per_iteration": 2.564052104949951 + }, + { + "auxiliary_loss_clip": 0.06457714, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06283459, + "balance_loss_mlp": 0.01258332, + "epoch": 0.40312640913873443, + "flos": 23593474494720.0, + "grad_norm": 1.6434462448941187, + "language_loss": 0.73919153, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.81649286, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 6705, + "time_per_iteration": 2.609738349914551 + }, + { + "auxiliary_loss_clip": 0.06450006, + "auxiliary_loss_mlp": 0.01270089, + "balance_loss_clip": 0.06282469, + "balance_loss_mlp": 0.01256481, + "epoch": 0.4031865323914024, + "flos": 20017205907840.0, + "grad_norm": 1.6242014521871173, + "language_loss": 0.66795284, + "learning_rate": 2.708481414320713e-06, + "loss": 0.74515378, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1361084, + "step": 6706, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.06452154, + "auxiliary_loss_mlp": 0.01268976, + "balance_loss_clip": 0.06282388, + "balance_loss_mlp": 0.0125513, + "epoch": 0.40324665564407036, + "flos": 21877840546560.0, + "grad_norm": 1.6449246324910813, + "language_loss": 0.71481538, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.79202664, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13842773, + "step": 6707, + "time_per_iteration": 2.5762581825256348 + }, + { + "auxiliary_loss_clip": 0.0644149, + "auxiliary_loss_mlp": 0.01271296, + "balance_loss_clip": 0.06278867, + "balance_loss_mlp": 0.01258379, + "epoch": 0.4033067788967383, + "flos": 23885572227840.0, + "grad_norm": 1.6148090336243837, + "language_loss": 0.80062628, + "learning_rate": 2.707752947093611e-06, + "loss": 0.87775409, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12908936, + "step": 6708, + "time_per_iteration": 2.5509586334228516 + }, + { + "auxiliary_loss_clip": 0.06459241, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.0628079, + "balance_loss_mlp": 0.01256133, + "epoch": 0.4033669021494063, + "flos": 17425530322560.0, + "grad_norm": 2.5431099630067435, + "language_loss": 0.8334195, + "learning_rate": 2.70738867321606e-06, + "loss": 0.91072428, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.15100098, + "step": 6709, + "time_per_iteration": 2.5844790935516357 + }, + { + "auxiliary_loss_clip": 0.06454608, + "auxiliary_loss_mlp": 0.01274744, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01259211, + "epoch": 0.40342702540207426, + "flos": 29607277881600.0, + "grad_norm": 1.5307534200842645, + "language_loss": 0.71642667, + "learning_rate": 2.70702437251426e-06, + "loss": 0.79372019, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15527344, + "step": 6710, + "time_per_iteration": 2.5950214862823486 + }, + { + "auxiliary_loss_clip": 0.06448973, + "auxiliary_loss_mlp": 0.01270551, + "balance_loss_clip": 0.06280518, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4034871486547422, + "flos": 11288249544960.0, + "grad_norm": 5.632076524924719, + "language_loss": 0.85771239, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.93490767, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1451416, + "step": 6711, + "time_per_iteration": 2.530691146850586 + }, + { + "auxiliary_loss_clip": 0.06457499, + "auxiliary_loss_mlp": 0.01273198, + "balance_loss_clip": 0.0628542, + "balance_loss_mlp": 0.01258732, + "epoch": 0.4035472719074102, + "flos": 15557097254400.0, + "grad_norm": 2.360012043566648, + "language_loss": 0.76516247, + "learning_rate": 2.706295690693168e-06, + "loss": 0.84246945, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14471436, + "step": 6712, + "time_per_iteration": 2.485973358154297 + }, + { + "auxiliary_loss_clip": 0.06453355, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06282951, + "balance_loss_mlp": 0.01256249, + "epoch": 0.40360739516007815, + "flos": 24680162096640.0, + "grad_norm": 2.2673991582834803, + "language_loss": 0.80280489, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.88004464, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14379883, + "step": 6713, + "time_per_iteration": 2.604844093322754 + }, + { + "auxiliary_loss_clip": 0.06452335, + "auxiliary_loss_mlp": 0.01272867, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01258824, + "epoch": 0.4036675184127461, + "flos": 17308635477120.0, + "grad_norm": 2.487123438751718, + "language_loss": 0.88458717, + "learning_rate": 2.705566901740865e-06, + "loss": 0.9618392, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14038086, + "step": 6714, + "time_per_iteration": 2.4827568531036377 + }, + { + "auxiliary_loss_clip": 0.06454237, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06281483, + "balance_loss_mlp": 0.01254011, + "epoch": 0.4037276416654141, + "flos": 19869983084160.0, + "grad_norm": 1.5212273970247687, + "language_loss": 0.69752967, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.77475452, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14233398, + "step": 6715, + "time_per_iteration": 2.5602893829345703 + }, + { + "auxiliary_loss_clip": 0.06458366, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.0125541, + "epoch": 0.40378776491808205, + "flos": 18302158938240.0, + "grad_norm": 1.8718399277124913, + "language_loss": 0.78095776, + "learning_rate": 2.704838005767892e-06, + "loss": 0.85823905, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14367676, + "step": 6716, + "time_per_iteration": 2.4911210536956787 + }, + { + "auxiliary_loss_clip": 0.06449929, + "auxiliary_loss_mlp": 0.01275524, + "balance_loss_clip": 0.0628348, + "balance_loss_mlp": 0.01262185, + "epoch": 0.40384788817075, + "flos": 15054772826880.0, + "grad_norm": 1.8985450182353327, + "language_loss": 0.76491797, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.8421725, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 6717, + "time_per_iteration": 2.5457956790924072 + }, + { + "auxiliary_loss_clip": 0.0634857, + "auxiliary_loss_mlp": 0.01256954, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01254165, + "epoch": 0.40390801142341803, + "flos": 61948659761280.0, + "grad_norm": 0.8842261639057883, + "language_loss": 0.60140264, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.67745787, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.02790833, + "step": 6718, + "time_per_iteration": 2.9733822345733643 + }, + { + "auxiliary_loss_clip": 0.06457312, + "auxiliary_loss_mlp": 0.0127584, + "balance_loss_clip": 0.06279647, + "balance_loss_mlp": 0.01260366, + "epoch": 0.403968134676086, + "flos": 22743945475200.0, + "grad_norm": 1.799198719667369, + "language_loss": 0.75286412, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.83019567, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15490723, + "step": 6719, + "time_per_iteration": 2.5417115688323975 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01269672, + "balance_loss_clip": 0.06282561, + "balance_loss_mlp": 0.01254592, + "epoch": 0.40402825792875396, + "flos": 19789244075520.0, + "grad_norm": 2.1951890128687257, + "language_loss": 0.81351668, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.89075512, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.15100098, + "step": 6720, + "time_per_iteration": 2.4906880855560303 + }, + { + "auxiliary_loss_clip": 0.06453006, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01254742, + "epoch": 0.40408838118142193, + "flos": 19615298999040.0, + "grad_norm": 1.8273574705972042, + "language_loss": 0.77227581, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.84950233, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14904785, + "step": 6721, + "time_per_iteration": 2.5645196437835693 + }, + { + "auxiliary_loss_clip": 0.06447627, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06279851, + "balance_loss_mlp": 0.01251931, + "epoch": 0.4041485044340899, + "flos": 24432982951680.0, + "grad_norm": 1.7503779333013576, + "language_loss": 0.72784024, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.80496466, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.12884521, + "step": 6722, + "time_per_iteration": 2.5520758628845215 + }, + { + "auxiliary_loss_clip": 0.06450947, + "auxiliary_loss_mlp": 0.01270139, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01256972, + "epoch": 0.40420862768675786, + "flos": 16765207822080.0, + "grad_norm": 1.6533819858806273, + "language_loss": 0.65986466, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.73707551, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13165283, + "step": 6723, + "time_per_iteration": 2.5385141372680664 + }, + { + "auxiliary_loss_clip": 0.06456833, + "auxiliary_loss_mlp": 0.01276273, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01262039, + "epoch": 0.4042687509394258, + "flos": 22498066068480.0, + "grad_norm": 1.4281101192387737, + "language_loss": 0.74082482, + "learning_rate": 2.701921353880734e-06, + "loss": 0.81815588, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14227295, + "step": 6724, + "time_per_iteration": 2.5705087184906006 + }, + { + "auxiliary_loss_clip": 0.06445859, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06280795, + "balance_loss_mlp": 0.01256226, + "epoch": 0.4043288741920938, + "flos": 30343978978560.0, + "grad_norm": 1.716107680872733, + "language_loss": 0.75255632, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.8297112, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13409424, + "step": 6725, + "time_per_iteration": 2.6433653831481934 + }, + { + "auxiliary_loss_clip": 0.06451583, + "auxiliary_loss_mlp": 0.01271794, + "balance_loss_clip": 0.06282748, + "balance_loss_mlp": 0.01257054, + "epoch": 0.40438899744476176, + "flos": 46357978947840.0, + "grad_norm": 1.593616701788039, + "language_loss": 0.77198207, + "learning_rate": 2.701191924463126e-06, + "loss": 0.84921581, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14733887, + "step": 6726, + "time_per_iteration": 2.8469409942626953 + }, + { + "auxiliary_loss_clip": 0.06452948, + "auxiliary_loss_mlp": 0.0127047, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01256058, + "epoch": 0.4044491206974297, + "flos": 13338468046080.0, + "grad_norm": 2.072990787427281, + "language_loss": 0.82297921, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.90021348, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14404297, + "step": 6727, + "time_per_iteration": 2.5381619930267334 + }, + { + "auxiliary_loss_clip": 0.06453642, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06281026, + "balance_loss_mlp": 0.01252413, + "epoch": 0.4045092439500977, + "flos": 12098603980800.0, + "grad_norm": 2.0199249210029055, + "language_loss": 0.86119437, + "learning_rate": 2.700462388688447e-06, + "loss": 0.93839324, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13830566, + "step": 6728, + "time_per_iteration": 3.903547763824463 + }, + { + "auxiliary_loss_clip": 0.06450571, + "auxiliary_loss_mlp": 0.01275259, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01260567, + "epoch": 0.40456936720276565, + "flos": 21186225745920.0, + "grad_norm": 1.6307737524107195, + "language_loss": 0.82346553, + "learning_rate": 2.700097580951786e-06, + "loss": 0.90072381, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14697266, + "step": 6729, + "time_per_iteration": 2.5673158168792725 + }, + { + "auxiliary_loss_clip": 0.06454299, + "auxiliary_loss_mlp": 0.01268394, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4046294904554336, + "flos": 23922147335040.0, + "grad_norm": 1.7857320211804986, + "language_loss": 0.73840159, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.81562853, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.14630127, + "step": 6730, + "time_per_iteration": 4.11122727394104 + }, + { + "auxiliary_loss_clip": 0.0645189, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06282154, + "balance_loss_mlp": 0.01254767, + "epoch": 0.4046896137081016, + "flos": 38080376202240.0, + "grad_norm": 1.7383158082611918, + "language_loss": 0.67290312, + "learning_rate": 2.699367885848985e-06, + "loss": 0.75010884, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13922119, + "step": 6731, + "time_per_iteration": 2.8046634197235107 + }, + { + "auxiliary_loss_clip": 0.06450266, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.0628126, + "balance_loss_mlp": 0.01256175, + "epoch": 0.4047497369607696, + "flos": 23623047786240.0, + "grad_norm": 1.7716081402001673, + "language_loss": 0.74489558, + "learning_rate": 2.699002998510517e-06, + "loss": 0.8220998, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13977051, + "step": 6732, + "time_per_iteration": 2.608191728591919 + }, + { + "auxiliary_loss_clip": 0.06450449, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.06283008, + "balance_loss_mlp": 0.01255978, + "epoch": 0.40480986021343757, + "flos": 12828596751360.0, + "grad_norm": 1.6538752037468725, + "language_loss": 0.77253687, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.84973502, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13391113, + "step": 6733, + "time_per_iteration": 2.525399923324585 + }, + { + "auxiliary_loss_clip": 0.06457898, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.06280859, + "balance_loss_mlp": 0.01255176, + "epoch": 0.40486998346610553, + "flos": 23775511489920.0, + "grad_norm": 4.637374264151728, + "language_loss": 0.76891112, + "learning_rate": 2.698273144328627e-06, + "loss": 0.84619832, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15661621, + "step": 6734, + "time_per_iteration": 4.040409564971924 + }, + { + "auxiliary_loss_clip": 0.06455547, + "auxiliary_loss_mlp": 0.01267949, + "balance_loss_clip": 0.0627891, + "balance_loss_mlp": 0.0125421, + "epoch": 0.4049301067187735, + "flos": 22863439797120.0, + "grad_norm": 2.24732512167567, + "language_loss": 0.64935613, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.72659111, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.13745117, + "step": 6735, + "time_per_iteration": 2.5326993465423584 + }, + { + "auxiliary_loss_clip": 0.06448689, + "auxiliary_loss_mlp": 0.01271873, + "balance_loss_clip": 0.06279301, + "balance_loss_mlp": 0.01258849, + "epoch": 0.40499022997144146, + "flos": 22790624999040.0, + "grad_norm": 1.962844708798157, + "language_loss": 0.83769405, + "learning_rate": 2.697543184232387e-06, + "loss": 0.91489971, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13024902, + "step": 6736, + "time_per_iteration": 2.5863215923309326 + }, + { + "auxiliary_loss_clip": 0.06454039, + "auxiliary_loss_mlp": 0.01271412, + "balance_loss_clip": 0.06281038, + "balance_loss_mlp": 0.01256832, + "epoch": 0.4050503532241094, + "flos": 23046021843840.0, + "grad_norm": 1.714368942149708, + "language_loss": 0.75428641, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.83154088, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14569092, + "step": 6737, + "time_per_iteration": 2.6163716316223145 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01271121, + "balance_loss_clip": 0.06280237, + "balance_loss_mlp": 0.01257644, + "epoch": 0.4051104764767774, + "flos": 16652254118400.0, + "grad_norm": 4.810644037565116, + "language_loss": 0.72306561, + "learning_rate": 2.696813118332519e-06, + "loss": 0.80026174, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13470459, + "step": 6738, + "time_per_iteration": 4.0618274211883545 + }, + { + "auxiliary_loss_clip": 0.06449332, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.06280854, + "balance_loss_mlp": 0.01257399, + "epoch": 0.40517059972944536, + "flos": 16363929818880.0, + "grad_norm": 1.8147061411614016, + "language_loss": 0.75123262, + "learning_rate": 2.696448045740828e-06, + "loss": 0.82842994, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.13000488, + "step": 6739, + "time_per_iteration": 2.489001512527466 + }, + { + "auxiliary_loss_clip": 0.06454495, + "auxiliary_loss_mlp": 0.0126968, + "balance_loss_clip": 0.06282163, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4052307229821133, + "flos": 28810885150080.0, + "grad_norm": 1.87280601387568, + "language_loss": 0.74278009, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.82002187, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14257812, + "step": 6740, + "time_per_iteration": 2.616560220718384 + }, + { + "auxiliary_loss_clip": 0.0644789, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01257076, + "epoch": 0.4052908462347813, + "flos": 21404334723840.0, + "grad_norm": 1.6527814212000655, + "language_loss": 0.77083528, + "learning_rate": 2.695717821343153e-06, + "loss": 0.84802353, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1385498, + "step": 6741, + "time_per_iteration": 2.5236477851867676 + }, + { + "auxiliary_loss_clip": 0.06449165, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 0.06278783, + "balance_loss_mlp": 0.01259606, + "epoch": 0.40535096948744925, + "flos": 22425628613760.0, + "grad_norm": 1.6285650306233073, + "language_loss": 0.7166388, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.79387373, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1472168, + "step": 6742, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.06454468, + "auxiliary_loss_mlp": 0.01273335, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01258016, + "epoch": 0.4054110927401172, + "flos": 17015028370560.0, + "grad_norm": 2.751799665484638, + "language_loss": 0.73206228, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.80934024, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.15332031, + "step": 6743, + "time_per_iteration": 2.519907236099243 + }, + { + "auxiliary_loss_clip": 0.0645441, + "auxiliary_loss_mlp": 0.01272217, + "balance_loss_clip": 0.06280394, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4054712159927852, + "flos": 21621018182400.0, + "grad_norm": 2.0068914143371623, + "language_loss": 0.7128458, + "learning_rate": 2.694622286918588e-06, + "loss": 0.79011208, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14733887, + "step": 6744, + "time_per_iteration": 2.641242742538452 + }, + { + "auxiliary_loss_clip": 0.06447047, + "auxiliary_loss_mlp": 0.01269556, + "balance_loss_clip": 0.06280165, + "balance_loss_mlp": 0.01255722, + "epoch": 0.4055313392454532, + "flos": 25819734424320.0, + "grad_norm": 1.5431481906112547, + "language_loss": 0.80460721, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.88177323, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13830566, + "step": 6745, + "time_per_iteration": 2.563445806503296 + }, + { + "auxiliary_loss_clip": 0.06450857, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06282623, + "balance_loss_mlp": 0.01255009, + "epoch": 0.40559146249812117, + "flos": 14142323790720.0, + "grad_norm": 1.9690336991849304, + "language_loss": 0.67176485, + "learning_rate": 2.693891798911731e-06, + "loss": 0.74896801, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14465332, + "step": 6746, + "time_per_iteration": 2.532186508178711 + }, + { + "auxiliary_loss_clip": 0.064533, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06283557, + "balance_loss_mlp": 0.01253272, + "epoch": 0.40565158575078913, + "flos": 41365259815680.0, + "grad_norm": 1.4380414737187444, + "language_loss": 0.57222033, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.64941883, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1328125, + "step": 6747, + "time_per_iteration": 2.7487149238586426 + }, + { + "auxiliary_loss_clip": 0.06454123, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06282702, + "balance_loss_mlp": 0.01255319, + "epoch": 0.4057117090034571, + "flos": 28551421382400.0, + "grad_norm": 2.093705794925994, + "language_loss": 0.84795344, + "learning_rate": 2.693161205655089e-06, + "loss": 0.92518532, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.13739014, + "step": 6748, + "time_per_iteration": 2.5967648029327393 + }, + { + "auxiliary_loss_clip": 0.06453951, + "auxiliary_loss_mlp": 0.01269749, + "balance_loss_clip": 0.06281549, + "balance_loss_mlp": 0.01254794, + "epoch": 0.40577183225612506, + "flos": 18009851569920.0, + "grad_norm": 1.9056349360303495, + "language_loss": 0.81943792, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.89667493, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14953613, + "step": 6749, + "time_per_iteration": 2.546419143676758 + }, + { + "auxiliary_loss_clip": 0.06450339, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01256762, + "epoch": 0.40583195550879303, + "flos": 19542819617280.0, + "grad_norm": 1.7354001752331154, + "language_loss": 0.75251377, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.82972294, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13824463, + "step": 6750, + "time_per_iteration": 2.633349895477295 + }, + { + "auxiliary_loss_clip": 0.06461279, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06282868, + "balance_loss_mlp": 0.01256441, + "epoch": 0.405892078761461, + "flos": 22315987146240.0, + "grad_norm": 2.3215315740209026, + "language_loss": 0.73715317, + "learning_rate": 2.692065118669195e-06, + "loss": 0.81447506, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.14459229, + "step": 6751, + "time_per_iteration": 2.579233169555664 + }, + { + "auxiliary_loss_clip": 0.06456044, + "auxiliary_loss_mlp": 0.01276434, + "balance_loss_clip": 0.06282923, + "balance_loss_mlp": 0.01261622, + "epoch": 0.40595220201412896, + "flos": 25491564708480.0, + "grad_norm": 1.5288716905414277, + "language_loss": 0.66520017, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.74252492, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.14788818, + "step": 6752, + "time_per_iteration": 2.5768818855285645 + }, + { + "auxiliary_loss_clip": 0.06457777, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281942, + "balance_loss_mlp": 0.01259025, + "epoch": 0.4060123252667969, + "flos": 49867092887040.0, + "grad_norm": 1.7025851849816316, + "language_loss": 0.71210098, + "learning_rate": 2.691334262772948e-06, + "loss": 0.78942096, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.15197754, + "step": 6753, + "time_per_iteration": 2.807713031768799 + }, + { + "auxiliary_loss_clip": 0.06455305, + "auxiliary_loss_mlp": 0.01268505, + "balance_loss_clip": 0.06281379, + "balance_loss_mlp": 0.01254736, + "epoch": 0.4060724485194649, + "flos": 21140720179200.0, + "grad_norm": 2.0551663576230657, + "language_loss": 0.72102135, + "learning_rate": 2.690968795494699e-06, + "loss": 0.7982595, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13763428, + "step": 6754, + "time_per_iteration": 2.5342867374420166 + }, + { + "auxiliary_loss_clip": 0.0645773, + "auxiliary_loss_mlp": 0.01273848, + "balance_loss_clip": 0.06283537, + "balance_loss_mlp": 0.0125931, + "epoch": 0.40613257177213286, + "flos": 21763796739840.0, + "grad_norm": 1.762365568083109, + "language_loss": 0.83186102, + "learning_rate": 2.690603302014844e-06, + "loss": 0.90917671, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14520264, + "step": 6755, + "time_per_iteration": 2.6024997234344482 + }, + { + "auxiliary_loss_clip": 0.06461492, + "auxiliary_loss_mlp": 0.01268966, + "balance_loss_clip": 0.06283044, + "balance_loss_mlp": 0.01254047, + "epoch": 0.4061926950248008, + "flos": 25561863884160.0, + "grad_norm": 1.6099502444653784, + "language_loss": 0.71436989, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.79167449, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14923096, + "step": 6756, + "time_per_iteration": 2.5427916049957275 + }, + { + "auxiliary_loss_clip": 0.06455702, + "auxiliary_loss_mlp": 0.01272698, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01257726, + "epoch": 0.4062528182774688, + "flos": 23702528983680.0, + "grad_norm": 1.686471122095966, + "language_loss": 0.79134113, + "learning_rate": 2.689872236505755e-06, + "loss": 0.86862516, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14990234, + "step": 6757, + "time_per_iteration": 2.573546886444092 + }, + { + "auxiliary_loss_clip": 0.06451409, + "auxiliary_loss_mlp": 0.01275677, + "balance_loss_clip": 0.0627944, + "balance_loss_mlp": 0.01260561, + "epoch": 0.4063129415301368, + "flos": 21732504439680.0, + "grad_norm": 1.6631673854083442, + "language_loss": 0.78665155, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.86392242, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.15100098, + "step": 6758, + "time_per_iteration": 2.5283167362213135 + }, + { + "auxiliary_loss_clip": 0.06450847, + "auxiliary_loss_mlp": 0.01276876, + "balance_loss_clip": 0.06280972, + "balance_loss_mlp": 0.0126331, + "epoch": 0.40637306478280477, + "flos": 12792650549760.0, + "grad_norm": 2.0123521464099183, + "language_loss": 0.89116049, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.96843767, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13568115, + "step": 6759, + "time_per_iteration": 2.5211679935455322 + }, + { + "auxiliary_loss_clip": 0.06457647, + "auxiliary_loss_mlp": 0.01273439, + "balance_loss_clip": 0.06284226, + "balance_loss_mlp": 0.01259742, + "epoch": 0.40643318803547274, + "flos": 24031327605120.0, + "grad_norm": 2.379594130925159, + "language_loss": 0.64235389, + "learning_rate": 2.688775442076598e-06, + "loss": 0.71966481, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13690186, + "step": 6760, + "time_per_iteration": 2.546807050704956 + }, + { + "auxiliary_loss_clip": 0.0645775, + "auxiliary_loss_mlp": 0.01275543, + "balance_loss_clip": 0.06282319, + "balance_loss_mlp": 0.01260856, + "epoch": 0.4064933112881407, + "flos": 25599361386240.0, + "grad_norm": 1.4617486076979092, + "language_loss": 0.75530171, + "learning_rate": 2.688409791678193e-06, + "loss": 0.83263463, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14666748, + "step": 6761, + "time_per_iteration": 2.635345935821533 + }, + { + "auxiliary_loss_clip": 0.0645279, + "auxiliary_loss_mlp": 0.01275826, + "balance_loss_clip": 0.06285599, + "balance_loss_mlp": 0.01262183, + "epoch": 0.40655343454080867, + "flos": 22060841863680.0, + "grad_norm": 1.3772427401241372, + "language_loss": 0.70268184, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.77996796, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1362915, + "step": 6762, + "time_per_iteration": 2.5381741523742676 + }, + { + "auxiliary_loss_clip": 0.06454535, + "auxiliary_loss_mlp": 0.01269241, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01255532, + "epoch": 0.40661355779347663, + "flos": 26476115783040.0, + "grad_norm": 2.097586218934523, + "language_loss": 0.74072015, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.81795788, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.13708496, + "step": 6763, + "time_per_iteration": 2.6068081855773926 + }, + { + "auxiliary_loss_clip": 0.06460483, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01257946, + "epoch": 0.4066736810461446, + "flos": 13266156372480.0, + "grad_norm": 1.6908157420926835, + "language_loss": 0.69497877, + "learning_rate": 2.687312683911033e-06, + "loss": 0.77231026, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14703369, + "step": 6764, + "time_per_iteration": 2.511901378631592 + }, + { + "auxiliary_loss_clip": 0.06461611, + "auxiliary_loss_mlp": 0.01272386, + "balance_loss_clip": 0.06284289, + "balance_loss_mlp": 0.01255995, + "epoch": 0.40673380429881256, + "flos": 28811178639360.0, + "grad_norm": 2.09874166778498, + "language_loss": 0.91354716, + "learning_rate": 2.686946929177557e-06, + "loss": 0.99088717, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.16381836, + "step": 6765, + "time_per_iteration": 2.614131450653076 + }, + { + "auxiliary_loss_clip": 0.06467324, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_clip": 0.06289016, + "balance_loss_mlp": 0.01256959, + "epoch": 0.4067939275514805, + "flos": 12500301254400.0, + "grad_norm": 2.6861779086384945, + "language_loss": 0.7896508, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.86703956, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14599609, + "step": 6766, + "time_per_iteration": 2.5117299556732178 + }, + { + "auxiliary_loss_clip": 0.06462067, + "auxiliary_loss_mlp": 0.01273332, + "balance_loss_clip": 0.0628517, + "balance_loss_mlp": 0.01258306, + "epoch": 0.4068540508041485, + "flos": 18776461374720.0, + "grad_norm": 40.22612567694579, + "language_loss": 0.77094513, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.84829921, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.15020752, + "step": 6767, + "time_per_iteration": 2.5433967113494873 + }, + { + "auxiliary_loss_clip": 0.06456982, + "auxiliary_loss_mlp": 0.01274714, + "balance_loss_clip": 0.06286283, + "balance_loss_mlp": 0.01260784, + "epoch": 0.40691417405681646, + "flos": 28520506425600.0, + "grad_norm": 1.6477494711234055, + "language_loss": 0.77846849, + "learning_rate": 2.685849508738034e-06, + "loss": 0.85578549, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1394043, + "step": 6768, + "time_per_iteration": 4.049299478530884 + }, + { + "auxiliary_loss_clip": 0.06460279, + "auxiliary_loss_mlp": 0.0127197, + "balance_loss_clip": 0.06286994, + "balance_loss_mlp": 0.01258213, + "epoch": 0.4069742973094844, + "flos": 20820390819840.0, + "grad_norm": 1.9557468193178857, + "language_loss": 0.87631512, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.9536376, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13757324, + "step": 6769, + "time_per_iteration": 2.540104389190674 + }, + { + "auxiliary_loss_clip": 0.06461371, + "auxiliary_loss_mlp": 0.01272921, + "balance_loss_clip": 0.06292167, + "balance_loss_mlp": 0.01259504, + "epoch": 0.4070344205621524, + "flos": 21476646397440.0, + "grad_norm": 2.001246026688969, + "language_loss": 0.80859989, + "learning_rate": 2.685117765051156e-06, + "loss": 0.88594282, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13421631, + "step": 6770, + "time_per_iteration": 3.9851884841918945 + }, + { + "auxiliary_loss_clip": 0.06465216, + "auxiliary_loss_mlp": 0.01270985, + "balance_loss_clip": 0.06288273, + "balance_loss_mlp": 0.01256203, + "epoch": 0.4070945438148204, + "flos": 26836709829120.0, + "grad_norm": 1.8007492597774561, + "language_loss": 0.80221689, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.87957895, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14770508, + "step": 6771, + "time_per_iteration": 2.5747835636138916 + }, + { + "auxiliary_loss_clip": 0.06460344, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06287014, + "balance_loss_mlp": 0.01256926, + "epoch": 0.4071546670674884, + "flos": 26360478748800.0, + "grad_norm": 1.364923552922522, + "language_loss": 0.7623316, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.83964121, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13696289, + "step": 6772, + "time_per_iteration": 2.628304958343506 + }, + { + "auxiliary_loss_clip": 0.06461407, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06287165, + "balance_loss_mlp": 0.01255471, + "epoch": 0.40721479032015634, + "flos": 17901300205440.0, + "grad_norm": 1.7629352970283074, + "language_loss": 0.81345379, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.89077097, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1484375, + "step": 6773, + "time_per_iteration": 2.5225751399993896 + }, + { + "auxiliary_loss_clip": 0.06368425, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06289985, + "balance_loss_mlp": 0.01259653, + "epoch": 0.4072749135728243, + "flos": 49871522424960.0, + "grad_norm": 0.8094154348681942, + "language_loss": 0.64365125, + "learning_rate": 2.683653966031597e-06, + "loss": 0.71997166, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03961182, + "step": 6774, + "time_per_iteration": 4.446218967437744 + }, + { + "auxiliary_loss_clip": 0.06460027, + "auxiliary_loss_mlp": 0.01268161, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01254481, + "epoch": 0.40733503682549227, + "flos": 27571063011840.0, + "grad_norm": 1.7398483222375367, + "language_loss": 0.7269184, + "learning_rate": 2.683287951431446e-06, + "loss": 0.80420029, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13659668, + "step": 6775, + "time_per_iteration": 2.599534511566162 + }, + { + "auxiliary_loss_clip": 0.0645956, + "auxiliary_loss_mlp": 0.01271281, + "balance_loss_clip": 0.06285449, + "balance_loss_mlp": 0.01257328, + "epoch": 0.40739516007816023, + "flos": 22133447026560.0, + "grad_norm": 1.36694346344043, + "language_loss": 0.78053248, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.8578409, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13946533, + "step": 6776, + "time_per_iteration": 2.6111807823181152 + }, + { + "auxiliary_loss_clip": 0.06466034, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06288318, + "balance_loss_mlp": 0.01254358, + "epoch": 0.4074552833308282, + "flos": 23849080974720.0, + "grad_norm": 2.6992343713036933, + "language_loss": 0.79444098, + "learning_rate": 2.682555844513981e-06, + "loss": 0.87178552, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14056396, + "step": 6777, + "time_per_iteration": 2.6968321800231934 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01254556, + "balance_loss_clip": 0.06276868, + "balance_loss_mlp": 0.01251499, + "epoch": 0.40751540658349616, + "flos": 58019847120000.0, + "grad_norm": 0.6740608536307336, + "language_loss": 0.53006828, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.60617012, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.0305481, + "step": 6778, + "time_per_iteration": 4.5793616771698 + }, + { + "auxiliary_loss_clip": 0.0645799, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01257996, + "epoch": 0.40757552983616413, + "flos": 21220956063360.0, + "grad_norm": 2.166644010842874, + "language_loss": 0.8325671, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.90987039, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14349365, + "step": 6779, + "time_per_iteration": 2.5122289657592773 + }, + { + "auxiliary_loss_clip": 0.06459656, + "auxiliary_loss_mlp": 0.01270176, + "balance_loss_clip": 0.06286415, + "balance_loss_mlp": 0.01255752, + "epoch": 0.4076356530888321, + "flos": 26840776752000.0, + "grad_norm": 1.555798351548063, + "language_loss": 0.76392281, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.84122109, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14440918, + "step": 6780, + "time_per_iteration": 2.5635926723480225 + }, + { + "auxiliary_loss_clip": 0.06453321, + "auxiliary_loss_mlp": 0.01268481, + "balance_loss_clip": 0.06285319, + "balance_loss_mlp": 0.01255964, + "epoch": 0.40769577634150006, + "flos": 12207868104960.0, + "grad_norm": 2.3318684771465388, + "language_loss": 0.66762495, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.74484301, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.12512207, + "step": 6781, + "time_per_iteration": 2.4998953342437744 + }, + { + "auxiliary_loss_clip": 0.06457075, + "auxiliary_loss_mlp": 0.01270756, + "balance_loss_clip": 0.06285501, + "balance_loss_mlp": 0.01257005, + "epoch": 0.407755899594168, + "flos": 33663467128320.0, + "grad_norm": 1.4801990709986605, + "language_loss": 0.71833825, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.79561651, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13757324, + "step": 6782, + "time_per_iteration": 2.6407761573791504 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01265619, + "balance_loss_clip": 0.06282325, + "balance_loss_mlp": 0.01252804, + "epoch": 0.407816022846836, + "flos": 20163590190720.0, + "grad_norm": 1.6531823939859909, + "language_loss": 0.82546687, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.90268028, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.12823486, + "step": 6783, + "time_per_iteration": 2.521007776260376 + }, + { + "auxiliary_loss_clip": 0.06456424, + "auxiliary_loss_mlp": 0.01272041, + "balance_loss_clip": 0.06284439, + "balance_loss_mlp": 0.01258504, + "epoch": 0.40787614609950396, + "flos": 21185219496960.0, + "grad_norm": 3.105146861858365, + "language_loss": 0.80980694, + "learning_rate": 2.679992655730283e-06, + "loss": 0.88709158, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13549805, + "step": 6784, + "time_per_iteration": 2.555502414703369 + }, + { + "auxiliary_loss_clip": 0.06462008, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06282149, + "balance_loss_mlp": 0.01254888, + "epoch": 0.407936269352172, + "flos": 20526699859200.0, + "grad_norm": 1.8248584482375538, + "language_loss": 0.65994555, + "learning_rate": 2.679626382651386e-06, + "loss": 0.73727089, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.15661621, + "step": 6785, + "time_per_iteration": 2.5122246742248535 + }, + { + "auxiliary_loss_clip": 0.06453374, + "auxiliary_loss_mlp": 0.01270477, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01256505, + "epoch": 0.40799639260483994, + "flos": 20124709096320.0, + "grad_norm": 2.5052548980669487, + "language_loss": 0.80350053, + "learning_rate": 2.679260083800989e-06, + "loss": 0.88073903, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13970947, + "step": 6786, + "time_per_iteration": 2.554553985595703 + }, + { + "auxiliary_loss_clip": 0.0645851, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286281, + "balance_loss_mlp": 0.01258874, + "epoch": 0.4080565158575079, + "flos": 21003853334400.0, + "grad_norm": 1.5530341827396597, + "language_loss": 0.81621969, + "learning_rate": 2.678893759192982e-06, + "loss": 0.89353013, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13665771, + "step": 6787, + "time_per_iteration": 2.536215305328369 + }, + { + "auxiliary_loss_clip": 0.06458452, + "auxiliary_loss_mlp": 0.01268932, + "balance_loss_clip": 0.0628721, + "balance_loss_mlp": 0.01255623, + "epoch": 0.40811663911017587, + "flos": 19323746317440.0, + "grad_norm": 1.9049170263972377, + "language_loss": 0.6798445, + "learning_rate": 2.678527408841255e-06, + "loss": 0.75711828, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13323975, + "step": 6788, + "time_per_iteration": 2.533457040786743 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272482, + "balance_loss_clip": 0.06284444, + "balance_loss_mlp": 0.01258952, + "epoch": 0.40817676236284384, + "flos": 40634973555840.0, + "grad_norm": 1.8916550457168047, + "language_loss": 0.66478348, + "learning_rate": 2.678161032759701e-06, + "loss": 0.74207389, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.13537598, + "step": 6789, + "time_per_iteration": 2.726292371749878 + }, + { + "auxiliary_loss_clip": 0.06456382, + "auxiliary_loss_mlp": 0.01270282, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01256383, + "epoch": 0.4082368856155118, + "flos": 20528376940800.0, + "grad_norm": 1.5670896359254076, + "language_loss": 0.61192298, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.68918967, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13885498, + "step": 6790, + "time_per_iteration": 2.5437731742858887 + }, + { + "auxiliary_loss_clip": 0.06455828, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.062863, + "balance_loss_mlp": 0.01257928, + "epoch": 0.40829700886817977, + "flos": 11430944248320.0, + "grad_norm": 3.0698605132878076, + "language_loss": 0.69964224, + "learning_rate": 2.677428203462683e-06, + "loss": 0.77691442, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13452148, + "step": 6791, + "time_per_iteration": 2.4941210746765137 + }, + { + "auxiliary_loss_clip": 0.0635563, + "auxiliary_loss_mlp": 0.01262815, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01259486, + "epoch": 0.40835713212084773, + "flos": 67350455326080.0, + "grad_norm": 0.7295736549212738, + "language_loss": 0.59295797, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.66914248, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03335571, + "step": 6792, + "time_per_iteration": 3.153479814529419 + }, + { + "auxiliary_loss_clip": 0.06459208, + "auxiliary_loss_mlp": 0.01270498, + "balance_loss_clip": 0.06285354, + "balance_loss_mlp": 0.01256193, + "epoch": 0.4084172553735157, + "flos": 21768408714240.0, + "grad_norm": 1.6689878199369865, + "language_loss": 0.80186534, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.87916243, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14306641, + "step": 6793, + "time_per_iteration": 2.562311887741089 + }, + { + "auxiliary_loss_clip": 0.06458702, + "auxiliary_loss_mlp": 0.01272476, + "balance_loss_clip": 0.06283591, + "balance_loss_mlp": 0.01258237, + "epoch": 0.40847737862618366, + "flos": 27424594874880.0, + "grad_norm": 3.9059129474249, + "language_loss": 0.85597503, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.93328679, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14227295, + "step": 6794, + "time_per_iteration": 2.558554172515869 + }, + { + "auxiliary_loss_clip": 0.06457786, + "auxiliary_loss_mlp": 0.01274296, + "balance_loss_clip": 0.0628652, + "balance_loss_mlp": 0.01259991, + "epoch": 0.4085375018788516, + "flos": 18593040787200.0, + "grad_norm": 1.7852935587618148, + "language_loss": 0.80216181, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.87948263, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14306641, + "step": 6795, + "time_per_iteration": 2.540349006652832 + }, + { + "auxiliary_loss_clip": 0.06465046, + "auxiliary_loss_mlp": 0.01270762, + "balance_loss_clip": 0.0628596, + "balance_loss_mlp": 0.01255718, + "epoch": 0.4085976251315196, + "flos": 15416834319360.0, + "grad_norm": 2.647671549267762, + "language_loss": 0.70204669, + "learning_rate": 2.675595680920792e-06, + "loss": 0.77940476, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15057373, + "step": 6796, + "time_per_iteration": 2.483670711517334 + }, + { + "auxiliary_loss_clip": 0.06458762, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06285367, + "balance_loss_mlp": 0.01256558, + "epoch": 0.40865774838418756, + "flos": 21258705127680.0, + "grad_norm": 1.5727118215642113, + "language_loss": 0.78255171, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.85983676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.13189697, + "step": 6797, + "time_per_iteration": 2.580595016479492 + }, + { + "auxiliary_loss_clip": 0.06459324, + "auxiliary_loss_mlp": 0.01274053, + "balance_loss_clip": 0.06286809, + "balance_loss_mlp": 0.01260183, + "epoch": 0.4087178716368556, + "flos": 13777411259520.0, + "grad_norm": 1.8045279385790254, + "language_loss": 0.86005986, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.93739361, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13885498, + "step": 6798, + "time_per_iteration": 2.525223970413208 + }, + { + "auxiliary_loss_clip": 0.0645816, + "auxiliary_loss_mlp": 0.01271081, + "balance_loss_clip": 0.06287363, + "balance_loss_mlp": 0.01258308, + "epoch": 0.40877799488952354, + "flos": 23628288666240.0, + "grad_norm": 1.532136532380416, + "language_loss": 0.84202659, + "learning_rate": 2.674495859860601e-06, + "loss": 0.91931903, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.12774658, + "step": 6799, + "time_per_iteration": 2.5898637771606445 + }, + { + "auxiliary_loss_clip": 0.06456885, + "auxiliary_loss_mlp": 0.01270815, + "balance_loss_clip": 0.06284514, + "balance_loss_mlp": 0.01256695, + "epoch": 0.4088381181421915, + "flos": 20924372136960.0, + "grad_norm": 3.2861641598601516, + "language_loss": 0.83725351, + "learning_rate": 2.6741292016681e-06, + "loss": 0.91453052, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14129639, + "step": 6800, + "time_per_iteration": 2.5050573348999023 + }, + { + "auxiliary_loss_clip": 0.06460495, + "auxiliary_loss_mlp": 0.0127488, + "balance_loss_clip": 0.06284706, + "balance_loss_mlp": 0.01260324, + "epoch": 0.4088982413948595, + "flos": 13302605698560.0, + "grad_norm": 2.1402246624759225, + "language_loss": 0.74944514, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.82679886, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.14532471, + "step": 6801, + "time_per_iteration": 2.546226978302002 + }, + { + "auxiliary_loss_clip": 0.0646005, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06286253, + "balance_loss_mlp": 0.01256358, + "epoch": 0.40895836464752744, + "flos": 15273007585920.0, + "grad_norm": 2.8712837575861316, + "language_loss": 0.80348778, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8807894, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13775635, + "step": 6802, + "time_per_iteration": 2.4804327487945557 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06286001, + "balance_loss_mlp": 0.01256813, + "epoch": 0.4090184879001954, + "flos": 14506607416320.0, + "grad_norm": 2.1610413406346147, + "language_loss": 0.7616486, + "learning_rate": 2.673029073767934e-06, + "loss": 0.83899677, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14660645, + "step": 6803, + "time_per_iteration": 2.5792553424835205 + }, + { + "auxiliary_loss_clip": 0.06459032, + "auxiliary_loss_mlp": 0.01268618, + "balance_loss_clip": 0.06286538, + "balance_loss_mlp": 0.01255017, + "epoch": 0.40907861115286337, + "flos": 13886759237760.0, + "grad_norm": 1.7652651103072021, + "language_loss": 0.79160619, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.86888266, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.1361084, + "step": 6804, + "time_per_iteration": 2.489569902420044 + }, + { + "auxiliary_loss_clip": 0.06464031, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.06285653, + "balance_loss_mlp": 0.0125919, + "epoch": 0.40913873440553133, + "flos": 28045071959040.0, + "grad_norm": 1.8644340771163777, + "language_loss": 0.75315928, + "learning_rate": 2.672295527537998e-06, + "loss": 0.83053064, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13909912, + "step": 6805, + "time_per_iteration": 2.6142778396606445 + }, + { + "auxiliary_loss_clip": 0.06465782, + "auxiliary_loss_mlp": 0.01272786, + "balance_loss_clip": 0.06288569, + "balance_loss_mlp": 0.01257957, + "epoch": 0.4091988576581993, + "flos": 21624917397120.0, + "grad_norm": 1.7712960163929097, + "language_loss": 0.7965951, + "learning_rate": 2.671928716175804e-06, + "loss": 0.87398076, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14825439, + "step": 6806, + "time_per_iteration": 2.567579984664917 + }, + { + "auxiliary_loss_clip": 0.06464592, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06287415, + "balance_loss_mlp": 0.01254609, + "epoch": 0.40925898091086726, + "flos": 25230381932160.0, + "grad_norm": 1.8487150493759184, + "language_loss": 0.725999, + "learning_rate": 2.671561879334007e-06, + "loss": 0.80333263, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.14147949, + "step": 6807, + "time_per_iteration": 4.0469160079956055 + }, + { + "auxiliary_loss_clip": 0.06359696, + "auxiliary_loss_mlp": 0.012552, + "balance_loss_clip": 0.06279803, + "balance_loss_mlp": 0.01251397, + "epoch": 0.40931910416353523, + "flos": 68949697553280.0, + "grad_norm": 0.8076862955861985, + "language_loss": 0.5884732, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.66462219, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.03796387, + "step": 6808, + "time_per_iteration": 3.236466407775879 + }, + { + "auxiliary_loss_clip": 0.0646228, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06290961, + "balance_loss_mlp": 0.0125511, + "epoch": 0.4093792274162032, + "flos": 20195092126080.0, + "grad_norm": 2.068974912031903, + "language_loss": 0.54879391, + "learning_rate": 2.670828129267242e-06, + "loss": 0.62610114, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13342285, + "step": 6809, + "time_per_iteration": 4.028552055358887 + }, + { + "auxiliary_loss_clip": 0.06460767, + "auxiliary_loss_mlp": 0.01271891, + "balance_loss_clip": 0.06288341, + "balance_loss_mlp": 0.0125805, + "epoch": 0.40943935066887116, + "flos": 25235832447360.0, + "grad_norm": 1.6877735836202645, + "language_loss": 0.83297133, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.91029787, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13830566, + "step": 6810, + "time_per_iteration": 2.5688657760620117 + }, + { + "auxiliary_loss_clip": 0.06467541, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 0.06291755, + "balance_loss_mlp": 0.01260376, + "epoch": 0.4094994739215392, + "flos": 23261531345280.0, + "grad_norm": 2.1410482965152475, + "language_loss": 0.78002244, + "learning_rate": 2.670094277448999e-06, + "loss": 0.85744703, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14526367, + "step": 6811, + "time_per_iteration": 2.5859668254852295 + }, + { + "auxiliary_loss_clip": 0.06461761, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06286068, + "balance_loss_mlp": 0.01255705, + "epoch": 0.40955959717420715, + "flos": 17387571623040.0, + "grad_norm": 1.532323288412775, + "language_loss": 0.70159924, + "learning_rate": 2.669727313417857e-06, + "loss": 0.77892125, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.1472168, + "step": 6812, + "time_per_iteration": 2.5128583908081055 + }, + { + "auxiliary_loss_clip": 0.06459609, + "auxiliary_loss_mlp": 0.01271673, + "balance_loss_clip": 0.06286342, + "balance_loss_mlp": 0.01257689, + "epoch": 0.4096197204268751, + "flos": 25089406237440.0, + "grad_norm": 1.5016829758663763, + "language_loss": 0.6657182, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13989258, + "step": 6813, + "time_per_iteration": 4.086791515350342 + }, + { + "auxiliary_loss_clip": 0.06457571, + "auxiliary_loss_mlp": 0.01273443, + "balance_loss_clip": 0.06284814, + "balance_loss_mlp": 0.01259186, + "epoch": 0.4096798436795431, + "flos": 30593841454080.0, + "grad_norm": 3.468085127477164, + "language_loss": 0.74528515, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.82259536, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14282227, + "step": 6814, + "time_per_iteration": 2.6079764366149902 + }, + { + "auxiliary_loss_clip": 0.06469103, + "auxiliary_loss_mlp": 0.0126922, + "balance_loss_clip": 0.06290863, + "balance_loss_mlp": 0.01254927, + "epoch": 0.40973996693221104, + "flos": 24140424021120.0, + "grad_norm": 2.1723549744151573, + "language_loss": 0.66418713, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.74157035, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14276123, + "step": 6815, + "time_per_iteration": 2.574538469314575 + }, + { + "auxiliary_loss_clip": 0.06459038, + "auxiliary_loss_mlp": 0.01277533, + "balance_loss_clip": 0.06290913, + "balance_loss_mlp": 0.01264116, + "epoch": 0.409800090184879, + "flos": 23995968382080.0, + "grad_norm": 1.5545179592453178, + "language_loss": 0.76523387, + "learning_rate": 2.668259203471188e-06, + "loss": 0.84259957, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.13433838, + "step": 6816, + "time_per_iteration": 2.5691564083099365 + }, + { + "auxiliary_loss_clip": 0.06462897, + "auxiliary_loss_mlp": 0.01272633, + "balance_loss_clip": 0.06288977, + "balance_loss_mlp": 0.01258834, + "epoch": 0.40986021343754697, + "flos": 16149216931200.0, + "grad_norm": 2.0573498340626957, + "language_loss": 0.82244468, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8998, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.13812256, + "step": 6817, + "time_per_iteration": 3.992452621459961 + }, + { + "auxiliary_loss_clip": 0.06471414, + "auxiliary_loss_mlp": 0.0127126, + "balance_loss_clip": 0.06288736, + "balance_loss_mlp": 0.0125556, + "epoch": 0.40992033669021494, + "flos": 24797811628800.0, + "grad_norm": 1.5933135055943601, + "language_loss": 0.80022383, + "learning_rate": 2.667524996399444e-06, + "loss": 0.87765062, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.15698242, + "step": 6818, + "time_per_iteration": 2.6226916313171387 + }, + { + "auxiliary_loss_clip": 0.06458658, + "auxiliary_loss_mlp": 0.01265615, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01252609, + "epoch": 0.4099804599428829, + "flos": 29649429285120.0, + "grad_norm": 1.5014418509343528, + "language_loss": 0.66358954, + "learning_rate": 2.66715785488769e-06, + "loss": 0.74083227, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.13006592, + "step": 6819, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.06472912, + "auxiliary_loss_mlp": 0.01275099, + "balance_loss_clip": 0.06290931, + "balance_loss_mlp": 0.01259566, + "epoch": 0.41004058319555087, + "flos": 24833464341120.0, + "grad_norm": 1.4779477588129932, + "language_loss": 0.85265613, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.9301362, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.15527344, + "step": 6820, + "time_per_iteration": 2.5997445583343506 + }, + { + "auxiliary_loss_clip": 0.06459977, + "auxiliary_loss_mlp": 0.01274929, + "balance_loss_clip": 0.06289133, + "balance_loss_mlp": 0.01261571, + "epoch": 0.41010070644821883, + "flos": 25744278222720.0, + "grad_norm": 1.6716831778372079, + "language_loss": 0.71520668, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.79255575, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13360596, + "step": 6821, + "time_per_iteration": 2.5686511993408203 + }, + { + "auxiliary_loss_clip": 0.06462038, + "auxiliary_loss_mlp": 0.01275085, + "balance_loss_clip": 0.06288444, + "balance_loss_mlp": 0.01262037, + "epoch": 0.4101608297008868, + "flos": 22352604180480.0, + "grad_norm": 1.920651769082741, + "language_loss": 0.74875939, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.82613057, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13049316, + "step": 6822, + "time_per_iteration": 2.5453121662139893 + }, + { + "auxiliary_loss_clip": 0.0646743, + "auxiliary_loss_mlp": 0.01270606, + "balance_loss_clip": 0.06293608, + "balance_loss_mlp": 0.01256408, + "epoch": 0.41022095295355476, + "flos": 21951619666560.0, + "grad_norm": 2.1329933375936045, + "language_loss": 0.75859648, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.83597684, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14208984, + "step": 6823, + "time_per_iteration": 2.514934539794922 + }, + { + "auxiliary_loss_clip": 0.06469562, + "auxiliary_loss_mlp": 0.01272535, + "balance_loss_clip": 0.06289219, + "balance_loss_mlp": 0.01257276, + "epoch": 0.4102810762062228, + "flos": 27457312694400.0, + "grad_norm": 5.1897859223278004, + "language_loss": 0.74005461, + "learning_rate": 2.665321768127001e-06, + "loss": 0.81747556, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.15258789, + "step": 6824, + "time_per_iteration": 2.645362615585327 + }, + { + "auxiliary_loss_clip": 0.06472579, + "auxiliary_loss_mlp": 0.01268406, + "balance_loss_clip": 0.06292652, + "balance_loss_mlp": 0.01253589, + "epoch": 0.41034119945889075, + "flos": 24506258947200.0, + "grad_norm": 2.0548664701913215, + "language_loss": 0.72348672, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.80089658, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.14788818, + "step": 6825, + "time_per_iteration": 2.5779926776885986 + }, + { + "auxiliary_loss_clip": 0.0646458, + "auxiliary_loss_mlp": 0.01269358, + "balance_loss_clip": 0.06292018, + "balance_loss_mlp": 0.01255822, + "epoch": 0.4104013227115587, + "flos": 24359497320960.0, + "grad_norm": 2.1141131447671, + "language_loss": 0.85571408, + "learning_rate": 2.664587156721768e-06, + "loss": 0.93305349, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13537598, + "step": 6826, + "time_per_iteration": 2.556445598602295 + }, + { + "auxiliary_loss_clip": 0.06462094, + "auxiliary_loss_mlp": 0.01278764, + "balance_loss_clip": 0.0629297, + "balance_loss_mlp": 0.0126468, + "epoch": 0.4104614459642267, + "flos": 23735582219520.0, + "grad_norm": 2.6430290167775037, + "language_loss": 0.6714378, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.74884635, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14080811, + "step": 6827, + "time_per_iteration": 2.55556058883667 + }, + { + "auxiliary_loss_clip": 0.06463977, + "auxiliary_loss_mlp": 0.01267684, + "balance_loss_clip": 0.06292337, + "balance_loss_mlp": 0.01254017, + "epoch": 0.41052156921689464, + "flos": 22134620983680.0, + "grad_norm": 1.346138162541555, + "language_loss": 0.72310138, + "learning_rate": 2.663852444511689e-06, + "loss": 0.80041802, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13665771, + "step": 6828, + "time_per_iteration": 2.6050894260406494 + }, + { + "auxiliary_loss_clip": 0.06477004, + "auxiliary_loss_mlp": 0.01275424, + "balance_loss_clip": 0.06296174, + "balance_loss_mlp": 0.01259855, + "epoch": 0.4105816924695626, + "flos": 20090607684480.0, + "grad_norm": 2.1527229818824196, + "language_loss": 0.84003794, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.91756219, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.15588379, + "step": 6829, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.06466494, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.0629379, + "balance_loss_mlp": 0.01259789, + "epoch": 0.4106418157222306, + "flos": 18082540586880.0, + "grad_norm": 1.474811924806309, + "language_loss": 0.90568459, + "learning_rate": 2.663117631608206e-06, + "loss": 0.98308516, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13781738, + "step": 6830, + "time_per_iteration": 2.5749125480651855 + }, + { + "auxiliary_loss_clip": 0.06471005, + "auxiliary_loss_mlp": 0.01271813, + "balance_loss_clip": 0.06296638, + "balance_loss_mlp": 0.01257729, + "epoch": 0.41070193897489854, + "flos": 21653442512640.0, + "grad_norm": 1.8339460976388509, + "language_loss": 0.6606307, + "learning_rate": 2.662750187431268e-06, + "loss": 0.73805887, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.14080811, + "step": 6831, + "time_per_iteration": 2.5448153018951416 + }, + { + "auxiliary_loss_clip": 0.06473927, + "auxiliary_loss_mlp": 0.01269964, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256613, + "epoch": 0.4107620622275665, + "flos": 26654924396160.0, + "grad_norm": 2.1106075691496766, + "language_loss": 0.69853723, + "learning_rate": 2.662382718122776e-06, + "loss": 0.77597612, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13360596, + "step": 6832, + "time_per_iteration": 2.61200213432312 + }, + { + "auxiliary_loss_clip": 0.06467804, + "auxiliary_loss_mlp": 0.01274675, + "balance_loss_clip": 0.06296351, + "balance_loss_mlp": 0.01261586, + "epoch": 0.41082218548023447, + "flos": 18740305537920.0, + "grad_norm": 3.2749058883058177, + "language_loss": 0.73955101, + "learning_rate": 2.662015223696666e-06, + "loss": 0.81697583, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13092041, + "step": 6833, + "time_per_iteration": 2.5293643474578857 + }, + { + "auxiliary_loss_clip": 0.06477401, + "auxiliary_loss_mlp": 0.01270878, + "balance_loss_clip": 0.06301869, + "balance_loss_mlp": 0.01256334, + "epoch": 0.41088230873290243, + "flos": 22900476101760.0, + "grad_norm": 1.6362019789175348, + "language_loss": 0.72870773, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.80619049, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14532471, + "step": 6834, + "time_per_iteration": 2.5534543991088867 + }, + { + "auxiliary_loss_clip": 0.06479818, + "auxiliary_loss_mlp": 0.01271417, + "balance_loss_clip": 0.0630189, + "balance_loss_mlp": 0.01257601, + "epoch": 0.4109424319855704, + "flos": 24283370286720.0, + "grad_norm": 2.482567827780577, + "language_loss": 0.71274042, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7902528, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.13824463, + "step": 6835, + "time_per_iteration": 2.6012609004974365 + }, + { + "auxiliary_loss_clip": 0.06481166, + "auxiliary_loss_mlp": 0.012697, + "balance_loss_clip": 0.06306168, + "balance_loss_mlp": 0.01255318, + "epoch": 0.41100255523823837, + "flos": 12974100566400.0, + "grad_norm": 1.7690004377507398, + "language_loss": 0.87590879, + "learning_rate": 2.660912589851978e-06, + "loss": 0.95341742, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14373779, + "step": 6836, + "time_per_iteration": 2.5210461616516113 + }, + { + "auxiliary_loss_clip": 0.06475058, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 0.06304475, + "balance_loss_mlp": 0.01259937, + "epoch": 0.4110626784909064, + "flos": 23151806023680.0, + "grad_norm": 1.7062413123689164, + "language_loss": 0.69134921, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.76883554, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.13641357, + "step": 6837, + "time_per_iteration": 2.58320689201355 + }, + { + "auxiliary_loss_clip": 0.06479225, + "auxiliary_loss_mlp": 0.01273179, + "balance_loss_clip": 0.06301909, + "balance_loss_mlp": 0.01258248, + "epoch": 0.41112280174357435, + "flos": 22754007964800.0, + "grad_norm": 1.9797600155486905, + "language_loss": 0.7565136, + "learning_rate": 2.660177375289599e-06, + "loss": 0.83403766, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.1494751, + "step": 6838, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.06478335, + "auxiliary_loss_mlp": 0.01273659, + "balance_loss_clip": 0.06305958, + "balance_loss_mlp": 0.01259318, + "epoch": 0.4111829249962423, + "flos": 21108211994880.0, + "grad_norm": 2.0771476339041635, + "language_loss": 0.82403398, + "learning_rate": 2.659809730450451e-06, + "loss": 0.90155393, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14343262, + "step": 6839, + "time_per_iteration": 2.596498489379883 + }, + { + "auxiliary_loss_clip": 0.06477809, + "auxiliary_loss_mlp": 0.01273131, + "balance_loss_clip": 0.06305793, + "balance_loss_mlp": 0.01259404, + "epoch": 0.4112430482489103, + "flos": 21512005620480.0, + "grad_norm": 1.908617135949294, + "language_loss": 0.8080616, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.885571, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.13726807, + "step": 6840, + "time_per_iteration": 2.575131893157959 + }, + { + "auxiliary_loss_clip": 0.06480156, + "auxiliary_loss_mlp": 0.01275329, + "balance_loss_clip": 0.06307412, + "balance_loss_mlp": 0.01262639, + "epoch": 0.41130317150157825, + "flos": 19575579363840.0, + "grad_norm": 1.874526459917051, + "language_loss": 0.67950094, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.75705582, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.12701416, + "step": 6841, + "time_per_iteration": 2.5642948150634766 + }, + { + "auxiliary_loss_clip": 0.06386833, + "auxiliary_loss_mlp": 0.01258898, + "balance_loss_clip": 0.06308911, + "balance_loss_mlp": 0.01256092, + "epoch": 0.4113632947542462, + "flos": 62404541498880.0, + "grad_norm": 0.7544179812034518, + "language_loss": 0.59557825, + "learning_rate": 2.65870664586847e-06, + "loss": 0.67203557, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 6842, + "time_per_iteration": 3.2257192134857178 + }, + { + "auxiliary_loss_clip": 0.06472278, + "auxiliary_loss_mlp": 0.01271531, + "balance_loss_clip": 0.06304677, + "balance_loss_mlp": 0.01257977, + "epoch": 0.4114234180069142, + "flos": 13923879396480.0, + "grad_norm": 2.0142050293437803, + "language_loss": 0.70280814, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.78024626, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13562012, + "step": 6843, + "time_per_iteration": 2.565969944000244 + }, + { + "auxiliary_loss_clip": 0.06380486, + "auxiliary_loss_mlp": 0.01256868, + "balance_loss_clip": 0.06302112, + "balance_loss_mlp": 0.01253599, + "epoch": 0.41148354125958214, + "flos": 64948866727680.0, + "grad_norm": 0.7130365683812196, + "language_loss": 0.53645009, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.61282361, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.03274536, + "step": 6844, + "time_per_iteration": 3.16054105758667 + }, + { + "auxiliary_loss_clip": 0.06475421, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06304798, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4115436645122501, + "flos": 18733848773760.0, + "grad_norm": 1.6055019254999645, + "language_loss": 0.66105658, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.73847538, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13317871, + "step": 6845, + "time_per_iteration": 2.5785298347473145 + }, + { + "auxiliary_loss_clip": 0.06478415, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06307876, + "balance_loss_mlp": 0.01254176, + "epoch": 0.41160378776491807, + "flos": 16258439128320.0, + "grad_norm": 2.0979946916750594, + "language_loss": 0.70201457, + "learning_rate": 2.657235516795808e-06, + "loss": 0.77947497, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13446045, + "step": 6846, + "time_per_iteration": 2.510215997695923 + }, + { + "auxiliary_loss_clip": 0.06481081, + "auxiliary_loss_mlp": 0.01271315, + "balance_loss_clip": 0.06309364, + "balance_loss_mlp": 0.01257391, + "epoch": 0.41166391101758604, + "flos": 27978378508800.0, + "grad_norm": 1.4002739744354715, + "language_loss": 0.65459704, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.73212105, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13922119, + "step": 6847, + "time_per_iteration": 4.048614025115967 + }, + { + "auxiliary_loss_clip": 0.06476664, + "auxiliary_loss_mlp": 0.01270454, + "balance_loss_clip": 0.06304531, + "balance_loss_mlp": 0.01256459, + "epoch": 0.411724034270254, + "flos": 34139865916800.0, + "grad_norm": 1.3666484547506623, + "language_loss": 0.7086308, + "learning_rate": 2.656499802669069e-06, + "loss": 0.78610194, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13983154, + "step": 6848, + "time_per_iteration": 4.219269037246704 + }, + { + "auxiliary_loss_clip": 0.06375948, + "auxiliary_loss_mlp": 0.01253417, + "balance_loss_clip": 0.06298448, + "balance_loss_mlp": 0.01250777, + "epoch": 0.41178415752292197, + "flos": 67945090625280.0, + "grad_norm": 0.8791919044020794, + "language_loss": 0.56300032, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.63929397, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.02642822, + "step": 6849, + "time_per_iteration": 3.226757287979126 + }, + { + "auxiliary_loss_clip": 0.06472921, + "auxiliary_loss_mlp": 0.0127066, + "balance_loss_clip": 0.06303038, + "balance_loss_mlp": 0.0125707, + "epoch": 0.41184428077558993, + "flos": 34322573744640.0, + "grad_norm": 1.830210581648694, + "language_loss": 0.76533353, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.84276927, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13598633, + "step": 6850, + "time_per_iteration": 2.653665542602539 + }, + { + "auxiliary_loss_clip": 0.06475841, + "auxiliary_loss_mlp": 0.01268752, + "balance_loss_clip": 0.06303935, + "balance_loss_mlp": 0.0125484, + "epoch": 0.41190440402825795, + "flos": 35452796342400.0, + "grad_norm": 1.6037978840830116, + "language_loss": 0.68379039, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.76123631, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.13909912, + "step": 6851, + "time_per_iteration": 2.72273588180542 + }, + { + "auxiliary_loss_clip": 0.06482952, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06306773, + "balance_loss_mlp": 0.01256437, + "epoch": 0.4119645272809259, + "flos": 20856127386240.0, + "grad_norm": 2.4937650031840275, + "language_loss": 0.80344605, + "learning_rate": 2.655028075792743e-06, + "loss": 0.88100129, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.16162109, + "step": 6852, + "time_per_iteration": 2.563422679901123 + }, + { + "auxiliary_loss_clip": 0.06490047, + "auxiliary_loss_mlp": 0.01270823, + "balance_loss_clip": 0.06310906, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4120246505335939, + "flos": 27569218222080.0, + "grad_norm": 2.025784739879877, + "language_loss": 0.77943873, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.8570475, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.14538574, + "step": 6853, + "time_per_iteration": 4.108957290649414 + }, + { + "auxiliary_loss_clip": 0.06493531, + "auxiliary_loss_mlp": 0.0127083, + "balance_loss_clip": 0.06310283, + "balance_loss_mlp": 0.01254618, + "epoch": 0.41208477378626185, + "flos": 37824476232960.0, + "grad_norm": 1.7138113243533049, + "language_loss": 0.66213286, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.73977649, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.16223145, + "step": 6854, + "time_per_iteration": 2.706514596939087 + }, + { + "auxiliary_loss_clip": 0.06481706, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06308492, + "balance_loss_mlp": 0.01256268, + "epoch": 0.4121448970389298, + "flos": 23447509482240.0, + "grad_norm": 1.8819465084993465, + "language_loss": 0.83935457, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.9168666, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.13238525, + "step": 6855, + "time_per_iteration": 2.6131205558776855 + }, + { + "auxiliary_loss_clip": 0.06478727, + "auxiliary_loss_mlp": 0.01272662, + "balance_loss_clip": 0.06308559, + "balance_loss_mlp": 0.01258524, + "epoch": 0.4122050202915978, + "flos": 21331813415040.0, + "grad_norm": 1.6556690578140216, + "language_loss": 0.79642534, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.87393928, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14129639, + "step": 6856, + "time_per_iteration": 2.6186776161193848 + }, + { + "auxiliary_loss_clip": 0.06486623, + "auxiliary_loss_mlp": 0.01271133, + "balance_loss_clip": 0.06312534, + "balance_loss_mlp": 0.01257383, + "epoch": 0.41226514354426574, + "flos": 17311193026560.0, + "grad_norm": 2.5768867092656516, + "language_loss": 0.80543911, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.88301665, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.13739014, + "step": 6857, + "time_per_iteration": 4.0222320556640625 + }, + { + "auxiliary_loss_clip": 0.06484015, + "auxiliary_loss_mlp": 0.01273092, + "balance_loss_clip": 0.06307175, + "balance_loss_mlp": 0.01259168, + "epoch": 0.4123252667969337, + "flos": 17644519768320.0, + "grad_norm": 1.8891533513627916, + "language_loss": 0.71074593, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.78831697, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13934326, + "step": 6858, + "time_per_iteration": 2.598215341567993 + }, + { + "auxiliary_loss_clip": 0.06484012, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06310833, + "balance_loss_mlp": 0.01256109, + "epoch": 0.4123853900496017, + "flos": 46435070304000.0, + "grad_norm": 1.791293678645808, + "language_loss": 0.59712768, + "learning_rate": 2.652451598005391e-06, + "loss": 0.67467248, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14361572, + "step": 6859, + "time_per_iteration": 2.818535804748535 + }, + { + "auxiliary_loss_clip": 0.0648525, + "auxiliary_loss_mlp": 0.01269281, + "balance_loss_clip": 0.06306802, + "balance_loss_mlp": 0.01255423, + "epoch": 0.41244551330226964, + "flos": 17680801386240.0, + "grad_norm": 3.190643468711074, + "language_loss": 0.73818636, + "learning_rate": 2.652083430674264e-06, + "loss": 0.81573164, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.13861084, + "step": 6860, + "time_per_iteration": 2.559460163116455 + }, + { + "auxiliary_loss_clip": 0.06473921, + "auxiliary_loss_mlp": 0.01270813, + "balance_loss_clip": 0.06301314, + "balance_loss_mlp": 0.01257706, + "epoch": 0.4125056365549376, + "flos": 18699034602240.0, + "grad_norm": 1.5713730110506565, + "language_loss": 0.74087375, + "learning_rate": 2.651715238616068e-06, + "loss": 0.81832111, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13110352, + "step": 6861, + "time_per_iteration": 2.563107967376709 + }, + { + "auxiliary_loss_clip": 0.06476536, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06306636, + "balance_loss_mlp": 0.01253425, + "epoch": 0.41256575980760557, + "flos": 17901174424320.0, + "grad_norm": 2.040837827964215, + "language_loss": 0.8021872, + "learning_rate": 2.651347021844765e-06, + "loss": 0.87962043, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13354492, + "step": 6862, + "time_per_iteration": 2.4968619346618652 + }, + { + "auxiliary_loss_clip": 0.06481781, + "auxiliary_loss_mlp": 0.01269578, + "balance_loss_clip": 0.06308153, + "balance_loss_mlp": 0.01255881, + "epoch": 0.41262588306027354, + "flos": 21987817430400.0, + "grad_norm": 2.204342418200638, + "language_loss": 0.767263, + "learning_rate": 2.650978780374318e-06, + "loss": 0.84477663, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13708496, + "step": 6863, + "time_per_iteration": 2.5787971019744873 + }, + { + "auxiliary_loss_clip": 0.06377177, + "auxiliary_loss_mlp": 0.01254592, + "balance_loss_clip": 0.06300335, + "balance_loss_mlp": 0.01252135, + "epoch": 0.41268600631294156, + "flos": 53366339243520.0, + "grad_norm": 0.6821216328900507, + "language_loss": 0.52583742, + "learning_rate": 2.650610514218691e-06, + "loss": 0.60215503, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.02455139, + "step": 6864, + "time_per_iteration": 3.1086013317108154 + }, + { + "auxiliary_loss_clip": 0.06480177, + "auxiliary_loss_mlp": 0.01271204, + "balance_loss_clip": 0.06300756, + "balance_loss_mlp": 0.01256714, + "epoch": 0.4127461295656095, + "flos": 24391586234880.0, + "grad_norm": 1.7134572277425464, + "language_loss": 0.72468507, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.80219889, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14489746, + "step": 6865, + "time_per_iteration": 2.6081020832061768 + }, + { + "auxiliary_loss_clip": 0.06375298, + "auxiliary_loss_mlp": 0.01255641, + "balance_loss_clip": 0.06298722, + "balance_loss_mlp": 0.01252579, + "epoch": 0.4128062528182775, + "flos": 71725129142400.0, + "grad_norm": 0.9099190790692077, + "language_loss": 0.66497219, + "learning_rate": 2.649873907907753e-06, + "loss": 0.74128163, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.03059387, + "step": 6866, + "time_per_iteration": 3.0357213020324707 + }, + { + "auxiliary_loss_clip": 0.06476509, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06301893, + "balance_loss_mlp": 0.01255799, + "epoch": 0.41286637607094545, + "flos": 17853362870400.0, + "grad_norm": 2.1198776843792357, + "language_loss": 0.81617618, + "learning_rate": 2.649505567780375e-06, + "loss": 0.89363438, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.13500977, + "step": 6867, + "time_per_iteration": 2.6095240116119385 + }, + { + "auxiliary_loss_clip": 0.06482062, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06303717, + "balance_loss_mlp": 0.01256657, + "epoch": 0.4129264993236134, + "flos": 25555407120000.0, + "grad_norm": 2.8405529060711006, + "language_loss": 0.78333044, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.86085904, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.14147949, + "step": 6868, + "time_per_iteration": 2.558155059814453 + }, + { + "auxiliary_loss_clip": 0.06374986, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06298015, + "balance_loss_mlp": 0.01251991, + "epoch": 0.4129866225762814, + "flos": 65430730759680.0, + "grad_norm": 0.8212939455862347, + "language_loss": 0.57654673, + "learning_rate": 2.64876881365164e-06, + "loss": 0.65284705, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.03051758, + "step": 6869, + "time_per_iteration": 2.9284112453460693 + }, + { + "auxiliary_loss_clip": 0.06481783, + "auxiliary_loss_mlp": 0.01277222, + "balance_loss_clip": 0.06310707, + "balance_loss_mlp": 0.01263472, + "epoch": 0.41304674582894935, + "flos": 28884622343040.0, + "grad_norm": 2.4401499988028594, + "language_loss": 0.75528967, + "learning_rate": 2.64840039967822e-06, + "loss": 0.83287978, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.13763428, + "step": 6870, + "time_per_iteration": 2.6844911575317383 + }, + { + "auxiliary_loss_clip": 0.0647882, + "auxiliary_loss_mlp": 0.01278278, + "balance_loss_clip": 0.06302784, + "balance_loss_mlp": 0.0126414, + "epoch": 0.4131068690816173, + "flos": 22898379749760.0, + "grad_norm": 1.5575458850844177, + "language_loss": 0.83697838, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.91454935, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14135742, + "step": 6871, + "time_per_iteration": 2.636808156967163 + }, + { + "auxiliary_loss_clip": 0.06479517, + "auxiliary_loss_mlp": 0.0126964, + "balance_loss_clip": 0.06303998, + "balance_loss_mlp": 0.01256033, + "epoch": 0.4131669923342853, + "flos": 26071944814080.0, + "grad_norm": 2.2227773400911732, + "language_loss": 0.69246161, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.76995325, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.1361084, + "step": 6872, + "time_per_iteration": 2.6492373943328857 + }, + { + "auxiliary_loss_clip": 0.06480041, + "auxiliary_loss_mlp": 0.01273197, + "balance_loss_clip": 0.06303592, + "balance_loss_mlp": 0.01259494, + "epoch": 0.41322711558695324, + "flos": 19250554176000.0, + "grad_norm": 1.8563624048188305, + "language_loss": 0.76261687, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.84014916, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.13696289, + "step": 6873, + "time_per_iteration": 2.5294342041015625 + }, + { + "auxiliary_loss_clip": 0.06480598, + "auxiliary_loss_mlp": 0.01273623, + "balance_loss_clip": 0.06302338, + "balance_loss_mlp": 0.0125958, + "epoch": 0.4132872388396212, + "flos": 22681067385600.0, + "grad_norm": 1.8281818605346505, + "language_loss": 0.83432305, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.91186529, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14031982, + "step": 6874, + "time_per_iteration": 2.6135475635528564 + }, + { + "auxiliary_loss_clip": 0.06483124, + "auxiliary_loss_mlp": 0.01273525, + "balance_loss_clip": 0.06306563, + "balance_loss_mlp": 0.01258498, + "epoch": 0.4133473620922892, + "flos": 20155246709760.0, + "grad_norm": 1.7886089381127788, + "language_loss": 0.72210878, + "learning_rate": 2.646557961279436e-06, + "loss": 0.79967523, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15020752, + "step": 6875, + "time_per_iteration": 2.535613536834717 + }, + { + "auxiliary_loss_clip": 0.06467389, + "auxiliary_loss_mlp": 0.01270264, + "balance_loss_clip": 0.06301813, + "balance_loss_mlp": 0.01257151, + "epoch": 0.41340748534495714, + "flos": 24249520437120.0, + "grad_norm": 1.4522680677637643, + "language_loss": 0.82662565, + "learning_rate": 2.646189399991154e-06, + "loss": 0.90400219, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13098145, + "step": 6876, + "time_per_iteration": 2.631683111190796 + }, + { + "auxiliary_loss_clip": 0.06476636, + "auxiliary_loss_mlp": 0.0126976, + "balance_loss_clip": 0.06298597, + "balance_loss_mlp": 0.01255198, + "epoch": 0.41346760859762516, + "flos": 14397385219200.0, + "grad_norm": 2.4272621941749044, + "language_loss": 0.65427208, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.73173606, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14556885, + "step": 6877, + "time_per_iteration": 2.5211727619171143 + }, + { + "auxiliary_loss_clip": 0.06477489, + "auxiliary_loss_mlp": 0.01272334, + "balance_loss_clip": 0.06304673, + "balance_loss_mlp": 0.0125853, + "epoch": 0.4135277318502931, + "flos": 22498569192960.0, + "grad_norm": 1.7887587996629348, + "language_loss": 0.77271414, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.85021234, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13800049, + "step": 6878, + "time_per_iteration": 2.591952085494995 + }, + { + "auxiliary_loss_clip": 0.06478719, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06303747, + "balance_loss_mlp": 0.01258525, + "epoch": 0.4135878551029611, + "flos": 22425251270400.0, + "grad_norm": 1.9381355665838014, + "language_loss": 0.8049022, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.88240814, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13354492, + "step": 6879, + "time_per_iteration": 2.565875291824341 + }, + { + "auxiliary_loss_clip": 0.06476135, + "auxiliary_loss_mlp": 0.0127254, + "balance_loss_clip": 0.06301241, + "balance_loss_mlp": 0.01258688, + "epoch": 0.41364797835562905, + "flos": 27060646665600.0, + "grad_norm": 1.8294611042748399, + "language_loss": 0.8543402, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.93182689, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.13861084, + "step": 6880, + "time_per_iteration": 2.6438286304473877 + }, + { + "auxiliary_loss_clip": 0.06478438, + "auxiliary_loss_mlp": 0.01270379, + "balance_loss_clip": 0.06301369, + "balance_loss_mlp": 0.01256258, + "epoch": 0.413708101608297, + "flos": 22974464856960.0, + "grad_norm": 2.0767525842165413, + "language_loss": 0.70694637, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.78443456, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14129639, + "step": 6881, + "time_per_iteration": 2.57663893699646 + }, + { + "auxiliary_loss_clip": 0.06468567, + "auxiliary_loss_mlp": 0.01269061, + "balance_loss_clip": 0.06300917, + "balance_loss_mlp": 0.01255978, + "epoch": 0.413768224860965, + "flos": 13339013097600.0, + "grad_norm": 1.7206029499163673, + "language_loss": 0.81694102, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.89431733, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13092041, + "step": 6882, + "time_per_iteration": 2.572300672531128 + }, + { + "auxiliary_loss_clip": 0.06484764, + "auxiliary_loss_mlp": 0.01273853, + "balance_loss_clip": 0.06306723, + "balance_loss_mlp": 0.0125776, + "epoch": 0.41382834811363295, + "flos": 20820306965760.0, + "grad_norm": 2.0204096459019176, + "language_loss": 0.69182575, + "learning_rate": 2.643608785656077e-06, + "loss": 0.76941192, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16088867, + "step": 6883, + "time_per_iteration": 2.5611510276794434 + }, + { + "auxiliary_loss_clip": 0.06472149, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 0.06297622, + "balance_loss_mlp": 0.01255061, + "epoch": 0.4138884713663009, + "flos": 20673293777280.0, + "grad_norm": 2.0786241324697, + "language_loss": 0.75945485, + "learning_rate": 2.643240028730663e-06, + "loss": 0.83685786, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13092041, + "step": 6884, + "time_per_iteration": 2.5788567066192627 + }, + { + "auxiliary_loss_clip": 0.06477202, + "auxiliary_loss_mlp": 0.01273717, + "balance_loss_clip": 0.06298974, + "balance_loss_mlp": 0.01260008, + "epoch": 0.4139485946189689, + "flos": 29063808299520.0, + "grad_norm": 3.0401310083666444, + "language_loss": 0.76198518, + "learning_rate": 2.642871247413523e-06, + "loss": 0.83949435, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13720703, + "step": 6885, + "time_per_iteration": 2.5964529514312744 + }, + { + "auxiliary_loss_clip": 0.06475228, + "auxiliary_loss_mlp": 0.01270635, + "balance_loss_clip": 0.06299268, + "balance_loss_mlp": 0.01256187, + "epoch": 0.41400871787163684, + "flos": 24432605608320.0, + "grad_norm": 1.9051304938208142, + "language_loss": 0.70031226, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14447021, + "step": 6886, + "time_per_iteration": 4.101384878158569 + }, + { + "auxiliary_loss_clip": 0.06475122, + "auxiliary_loss_mlp": 0.01275658, + "balance_loss_clip": 0.06297341, + "balance_loss_mlp": 0.01260423, + "epoch": 0.4140688411243048, + "flos": 19470172527360.0, + "grad_norm": 1.459976196778311, + "language_loss": 0.75538456, + "learning_rate": 2.642133611660002e-06, + "loss": 0.83289236, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.15234375, + "step": 6887, + "time_per_iteration": 2.5979294776916504 + }, + { + "auxiliary_loss_clip": 0.06468056, + "auxiliary_loss_mlp": 0.01273257, + "balance_loss_clip": 0.06294202, + "balance_loss_mlp": 0.0125916, + "epoch": 0.4141289643769728, + "flos": 19319008561920.0, + "grad_norm": 2.153365375528394, + "language_loss": 0.70707798, + "learning_rate": 2.641764757251592e-06, + "loss": 0.78449106, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14099121, + "step": 6888, + "time_per_iteration": 4.008386850357056 + }, + { + "auxiliary_loss_clip": 0.06466109, + "auxiliary_loss_mlp": 0.01273102, + "balance_loss_clip": 0.0629206, + "balance_loss_mlp": 0.0125863, + "epoch": 0.41418908762964074, + "flos": 16732448075520.0, + "grad_norm": 2.015209624353795, + "language_loss": 0.76631236, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.84370446, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14477539, + "step": 6889, + "time_per_iteration": 2.5270447731018066 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.012722, + "balance_loss_clip": 0.06294381, + "balance_loss_mlp": 0.01258628, + "epoch": 0.41424921088230876, + "flos": 25303112876160.0, + "grad_norm": 1.5878983493356928, + "language_loss": 0.80245477, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.87983751, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.13568115, + "step": 6890, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.06465066, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.06294424, + "balance_loss_mlp": 0.01258592, + "epoch": 0.4143093341349767, + "flos": 20966984737920.0, + "grad_norm": 1.4631338633868025, + "language_loss": 0.74175858, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.81914544, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.15014648, + "step": 6891, + "time_per_iteration": 2.5313403606414795 + }, + { + "auxiliary_loss_clip": 0.06475316, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.0629719, + "balance_loss_mlp": 0.01253283, + "epoch": 0.4143694573876447, + "flos": 22024182902400.0, + "grad_norm": 2.801103384820577, + "language_loss": 0.84378529, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.92123371, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.16247559, + "step": 6892, + "time_per_iteration": 3.9777607917785645 + }, + { + "auxiliary_loss_clip": 0.06465086, + "auxiliary_loss_mlp": 0.01270368, + "balance_loss_clip": 0.06295982, + "balance_loss_mlp": 0.01257339, + "epoch": 0.41442958064031266, + "flos": 35705761418880.0, + "grad_norm": 1.735816743811137, + "language_loss": 0.70161885, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7789734, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13037109, + "step": 6893, + "time_per_iteration": 2.6909854412078857 + }, + { + "auxiliary_loss_clip": 0.06467048, + "auxiliary_loss_mlp": 0.01267192, + "balance_loss_clip": 0.0629535, + "balance_loss_mlp": 0.01253799, + "epoch": 0.4144897038929806, + "flos": 28301391198720.0, + "grad_norm": 1.3940088969507989, + "language_loss": 0.73223269, + "learning_rate": 2.639551120239279e-06, + "loss": 0.80957508, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.13378906, + "step": 6894, + "time_per_iteration": 2.5950350761413574 + }, + { + "auxiliary_loss_clip": 0.06476665, + "auxiliary_loss_mlp": 0.01273362, + "balance_loss_clip": 0.06300536, + "balance_loss_mlp": 0.0125867, + "epoch": 0.4145498271456486, + "flos": 11651568848640.0, + "grad_norm": 2.440609351676066, + "language_loss": 0.62663507, + "learning_rate": 2.63918209577416e-06, + "loss": 0.7041353, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14697266, + "step": 6895, + "time_per_iteration": 2.471320390701294 + }, + { + "auxiliary_loss_clip": 0.0646576, + "auxiliary_loss_mlp": 0.01272394, + "balance_loss_clip": 0.06296334, + "balance_loss_mlp": 0.01258589, + "epoch": 0.41460995039831655, + "flos": 27243061004160.0, + "grad_norm": 3.24758428503537, + "language_loss": 0.70684588, + "learning_rate": 2.638813047071192e-06, + "loss": 0.78422737, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13806152, + "step": 6896, + "time_per_iteration": 2.5871524810791016 + }, + { + "auxiliary_loss_clip": 0.06475289, + "auxiliary_loss_mlp": 0.01275214, + "balance_loss_clip": 0.06299431, + "balance_loss_mlp": 0.01260164, + "epoch": 0.4146700736509845, + "flos": 25929627454080.0, + "grad_norm": 1.8920871134817128, + "language_loss": 0.73144394, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.80894893, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15057373, + "step": 6897, + "time_per_iteration": 4.0778656005859375 + }, + { + "auxiliary_loss_clip": 0.0646714, + "auxiliary_loss_mlp": 0.01271778, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01257371, + "epoch": 0.4147301969036525, + "flos": 26840441335680.0, + "grad_norm": 6.247593775216772, + "language_loss": 0.84715986, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.92454904, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.14404297, + "step": 6898, + "time_per_iteration": 2.5603139400482178 + }, + { + "auxiliary_loss_clip": 0.06469397, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06293289, + "balance_loss_mlp": 0.01253678, + "epoch": 0.41479032015632045, + "flos": 20303727344640.0, + "grad_norm": 2.0378276609946098, + "language_loss": 0.74898899, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.82635784, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.13812256, + "step": 6899, + "time_per_iteration": 2.53822660446167 + }, + { + "auxiliary_loss_clip": 0.06477535, + "auxiliary_loss_mlp": 0.01273796, + "balance_loss_clip": 0.06297705, + "balance_loss_mlp": 0.01258239, + "epoch": 0.4148504434089884, + "flos": 25272030211200.0, + "grad_norm": 2.0370175779228465, + "language_loss": 0.75786376, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.83537704, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15563965, + "step": 6900, + "time_per_iteration": 2.5547776222229004 + }, + { + "auxiliary_loss_clip": 0.06470095, + "auxiliary_loss_mlp": 0.01275828, + "balance_loss_clip": 0.06298018, + "balance_loss_mlp": 0.01260057, + "epoch": 0.4149105666616564, + "flos": 12827087377920.0, + "grad_norm": 3.426788101109298, + "language_loss": 0.80153453, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.87899375, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15783691, + "step": 6901, + "time_per_iteration": 2.5724570751190186 + }, + { + "auxiliary_loss_clip": 0.06464257, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06292327, + "balance_loss_mlp": 0.01258791, + "epoch": 0.41497068991432434, + "flos": 16769526307200.0, + "grad_norm": 2.2871359145608507, + "language_loss": 0.70271528, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.78009164, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14593506, + "step": 6902, + "time_per_iteration": 2.518018960952759 + }, + { + "auxiliary_loss_clip": 0.06463319, + "auxiliary_loss_mlp": 0.01270625, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01255706, + "epoch": 0.4150308131669923, + "flos": 18006161990400.0, + "grad_norm": 2.0523680752477906, + "language_loss": 0.8405019, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.91784132, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14923096, + "step": 6903, + "time_per_iteration": 2.719252586364746 + }, + { + "auxiliary_loss_clip": 0.06478511, + "auxiliary_loss_mlp": 0.01270948, + "balance_loss_clip": 0.06298795, + "balance_loss_mlp": 0.01254282, + "epoch": 0.41509093641966033, + "flos": 30052635932160.0, + "grad_norm": 2.3513516306772826, + "language_loss": 0.67960835, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.75710285, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.16674805, + "step": 6904, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.06473922, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06295053, + "balance_loss_mlp": 0.01252678, + "epoch": 0.4151510596723283, + "flos": 24286892158080.0, + "grad_norm": 1.8668907258080212, + "language_loss": 0.77697861, + "learning_rate": 2.635490520350643e-06, + "loss": 0.85438967, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.14508057, + "step": 6905, + "time_per_iteration": 2.6073246002197266 + }, + { + "auxiliary_loss_clip": 0.06477012, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06300149, + "balance_loss_mlp": 0.01255391, + "epoch": 0.41521118292499626, + "flos": 23482784851200.0, + "grad_norm": 2.106489831039321, + "language_loss": 0.68546331, + "learning_rate": 2.635121230039025e-06, + "loss": 0.76293135, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1439209, + "step": 6906, + "time_per_iteration": 2.5378260612487793 + }, + { + "auxiliary_loss_clip": 0.06470662, + "auxiliary_loss_mlp": 0.01269025, + "balance_loss_clip": 0.06298003, + "balance_loss_mlp": 0.01254839, + "epoch": 0.4152713061776642, + "flos": 22131728017920.0, + "grad_norm": 2.406599601104124, + "language_loss": 0.68275452, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.76015139, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14196777, + "step": 6907, + "time_per_iteration": 2.548020124435425 + }, + { + "auxiliary_loss_clip": 0.06477083, + "auxiliary_loss_mlp": 0.01270349, + "balance_loss_clip": 0.06301615, + "balance_loss_mlp": 0.01256342, + "epoch": 0.4153314294303322, + "flos": 21257740805760.0, + "grad_norm": 2.5393224991434398, + "language_loss": 0.77004838, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.84752274, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14013672, + "step": 6908, + "time_per_iteration": 2.52205753326416 + }, + { + "auxiliary_loss_clip": 0.0635362, + "auxiliary_loss_mlp": 0.01259834, + "balance_loss_clip": 0.06277395, + "balance_loss_mlp": 0.01256612, + "epoch": 0.41539155268300015, + "flos": 57939443527680.0, + "grad_norm": 0.769240592375345, + "language_loss": 0.64804208, + "learning_rate": 2.634013214657026e-06, + "loss": 0.72417659, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.03225708, + "step": 6909, + "time_per_iteration": 3.109095573425293 + }, + { + "auxiliary_loss_clip": 0.06469519, + "auxiliary_loss_mlp": 0.01271461, + "balance_loss_clip": 0.06297643, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4154516759356681, + "flos": 21909384408960.0, + "grad_norm": 1.4248669333769037, + "language_loss": 0.87550539, + "learning_rate": 2.633643828093996e-06, + "loss": 0.95291519, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.13989258, + "step": 6910, + "time_per_iteration": 2.5253639221191406 + }, + { + "auxiliary_loss_clip": 0.06354217, + "auxiliary_loss_mlp": 0.01257534, + "balance_loss_clip": 0.0627715, + "balance_loss_mlp": 0.01254598, + "epoch": 0.4155117991883361, + "flos": 67852234702080.0, + "grad_norm": 0.8147918233574727, + "language_loss": 0.62098897, + "learning_rate": 2.633274417503128e-06, + "loss": 0.69710648, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02932739, + "step": 6911, + "time_per_iteration": 3.1515297889709473 + }, + { + "auxiliary_loss_clip": 0.06486405, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06302486, + "balance_loss_mlp": 0.01254393, + "epoch": 0.41557192244100405, + "flos": 14287869532800.0, + "grad_norm": 2.853367345352451, + "language_loss": 0.88092077, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.95848417, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.15551758, + "step": 6912, + "time_per_iteration": 2.5334529876708984 + }, + { + "auxiliary_loss_clip": 0.06480967, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06303312, + "balance_loss_mlp": 0.01253451, + "epoch": 0.415632045693672, + "flos": 24468803372160.0, + "grad_norm": 2.9756004279328945, + "language_loss": 0.63331664, + "learning_rate": 2.632535524293914e-06, + "loss": 0.71080673, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14581299, + "step": 6913, + "time_per_iteration": 2.547567129135132 + }, + { + "auxiliary_loss_clip": 0.06471419, + "auxiliary_loss_mlp": 0.01270035, + "balance_loss_clip": 0.06297998, + "balance_loss_mlp": 0.01256249, + "epoch": 0.41569216894634, + "flos": 20120600246400.0, + "grad_norm": 1.832366261637427, + "language_loss": 0.75605875, + "learning_rate": 2.632166041703586e-06, + "loss": 0.83347332, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13787842, + "step": 6914, + "time_per_iteration": 2.5624208450317383 + }, + { + "auxiliary_loss_clip": 0.06479953, + "auxiliary_loss_mlp": 0.01273918, + "balance_loss_clip": 0.06302451, + "balance_loss_mlp": 0.01257897, + "epoch": 0.41575229219900794, + "flos": 23804497802880.0, + "grad_norm": 2.012818087979969, + "language_loss": 0.87586981, + "learning_rate": 2.631796535141458e-06, + "loss": 0.95340854, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16015625, + "step": 6915, + "time_per_iteration": 2.545825481414795 + }, + { + "auxiliary_loss_clip": 0.06478707, + "auxiliary_loss_mlp": 0.01273084, + "balance_loss_clip": 0.06302266, + "balance_loss_mlp": 0.01259273, + "epoch": 0.4158124154516759, + "flos": 23114224667520.0, + "grad_norm": 2.419843437778294, + "language_loss": 0.71605122, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.79356909, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.13818359, + "step": 6916, + "time_per_iteration": 2.59429669380188 + }, + { + "auxiliary_loss_clip": 0.06477056, + "auxiliary_loss_mlp": 0.01267217, + "balance_loss_clip": 0.06298968, + "balance_loss_mlp": 0.01252208, + "epoch": 0.41587253870434393, + "flos": 24249771999360.0, + "grad_norm": 1.4428572529082921, + "language_loss": 0.71931446, + "learning_rate": 2.631057450157852e-06, + "loss": 0.7967571, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.15002441, + "step": 6917, + "time_per_iteration": 2.56001877784729 + }, + { + "auxiliary_loss_clip": 0.06469631, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06294615, + "balance_loss_mlp": 0.01253089, + "epoch": 0.4159326619570119, + "flos": 23888926391040.0, + "grad_norm": 4.142003179261072, + "language_loss": 0.80924189, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.88661504, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14599609, + "step": 6918, + "time_per_iteration": 2.6182031631469727 + }, + { + "auxiliary_loss_clip": 0.06479505, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.06299014, + "balance_loss_mlp": 0.01255, + "epoch": 0.41599278520967986, + "flos": 40636315221120.0, + "grad_norm": 1.446116397311604, + "language_loss": 0.70620072, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.78370392, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.1583252, + "step": 6919, + "time_per_iteration": 2.7974801063537598 + }, + { + "auxiliary_loss_clip": 0.06470604, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.06293205, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4160529084623478, + "flos": 18228757161600.0, + "grad_norm": 1.8139422387612383, + "language_loss": 0.81669927, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.89411485, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15258789, + "step": 6920, + "time_per_iteration": 2.652277708053589 + }, + { + "auxiliary_loss_clip": 0.06476951, + "auxiliary_loss_mlp": 0.01273828, + "balance_loss_clip": 0.06298292, + "balance_loss_mlp": 0.01258724, + "epoch": 0.4161130317150158, + "flos": 13666973178240.0, + "grad_norm": 2.775667367204969, + "language_loss": 0.65528631, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.73279405, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15100098, + "step": 6921, + "time_per_iteration": 2.543761968612671 + }, + { + "auxiliary_loss_clip": 0.0647813, + "auxiliary_loss_mlp": 0.01273522, + "balance_loss_clip": 0.06301805, + "balance_loss_mlp": 0.01258168, + "epoch": 0.41617315496768376, + "flos": 16183779540480.0, + "grad_norm": 2.038581093377189, + "language_loss": 0.80900288, + "learning_rate": 2.629209319173274e-06, + "loss": 0.88651937, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15368652, + "step": 6922, + "time_per_iteration": 2.5606656074523926 + }, + { + "auxiliary_loss_clip": 0.06480581, + "auxiliary_loss_mlp": 0.01270422, + "balance_loss_clip": 0.06301428, + "balance_loss_mlp": 0.01255163, + "epoch": 0.4162332782203517, + "flos": 26220467376000.0, + "grad_norm": 1.63600266107907, + "language_loss": 0.6809119, + "learning_rate": 2.628839621341247e-06, + "loss": 0.7584219, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.15258789, + "step": 6923, + "time_per_iteration": 2.5789952278137207 + }, + { + "auxiliary_loss_clip": 0.06474873, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06299335, + "balance_loss_mlp": 0.01254152, + "epoch": 0.4162934014730197, + "flos": 28191540096000.0, + "grad_norm": 1.91165548300248, + "language_loss": 0.76249051, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.83993888, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15795898, + "step": 6924, + "time_per_iteration": 2.6209194660186768 + }, + { + "auxiliary_loss_clip": 0.06473987, + "auxiliary_loss_mlp": 0.01272207, + "balance_loss_clip": 0.06295989, + "balance_loss_mlp": 0.01257759, + "epoch": 0.41635352472568765, + "flos": 19871492457600.0, + "grad_norm": 1.5667233765254498, + "language_loss": 0.73101473, + "learning_rate": 2.62810015415423e-06, + "loss": 0.80847669, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.14465332, + "step": 6925, + "time_per_iteration": 2.5133748054504395 + }, + { + "auxiliary_loss_clip": 0.0646892, + "auxiliary_loss_mlp": 0.01268263, + "balance_loss_clip": 0.06293461, + "balance_loss_mlp": 0.0125391, + "epoch": 0.4164136479783556, + "flos": 14939974333440.0, + "grad_norm": 2.1337011873068445, + "language_loss": 0.84242827, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.91980004, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.14361572, + "step": 6926, + "time_per_iteration": 3.923924446105957 + }, + { + "auxiliary_loss_clip": 0.06465639, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06292935, + "balance_loss_mlp": 0.01254574, + "epoch": 0.4164737712310236, + "flos": 21763251688320.0, + "grad_norm": 1.56658623429888, + "language_loss": 0.86570489, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.94304395, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.13696289, + "step": 6927, + "time_per_iteration": 3.9643561840057373 + }, + { + "auxiliary_loss_clip": 0.06468353, + "auxiliary_loss_mlp": 0.01275736, + "balance_loss_clip": 0.06293458, + "balance_loss_mlp": 0.01260287, + "epoch": 0.41653389448369155, + "flos": 20746318210560.0, + "grad_norm": 2.3770101780600976, + "language_loss": 0.72583216, + "learning_rate": 2.626990774776604e-06, + "loss": 0.80327296, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15447998, + "step": 6928, + "time_per_iteration": 2.5111186504364014 + }, + { + "auxiliary_loss_clip": 0.06468435, + "auxiliary_loss_mlp": 0.01272442, + "balance_loss_clip": 0.062929, + "balance_loss_mlp": 0.0125735, + "epoch": 0.4165940177363595, + "flos": 24979848624000.0, + "grad_norm": 1.9381497388164433, + "language_loss": 0.78399348, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.86140227, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15087891, + "step": 6929, + "time_per_iteration": 2.6066014766693115 + }, + { + "auxiliary_loss_clip": 0.0646543, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06291193, + "balance_loss_mlp": 0.01253842, + "epoch": 0.41665414098902753, + "flos": 20527957670400.0, + "grad_norm": 1.8432748306405895, + "language_loss": 0.71154583, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.78888059, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14190674, + "step": 6930, + "time_per_iteration": 2.5052478313446045 + }, + { + "auxiliary_loss_clip": 0.06468388, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06291626, + "balance_loss_mlp": 0.01255067, + "epoch": 0.4167142642416955, + "flos": 19689078119040.0, + "grad_norm": 1.7731266468983917, + "language_loss": 0.81487417, + "learning_rate": 2.625881181419007e-06, + "loss": 0.89225209, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14355469, + "step": 6931, + "time_per_iteration": 2.555651903152466 + }, + { + "auxiliary_loss_clip": 0.0646255, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.06289293, + "balance_loss_mlp": 0.01255233, + "epoch": 0.41677438749436346, + "flos": 23769641704320.0, + "grad_norm": 2.211036345176988, + "language_loss": 0.79310054, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.87043214, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.15362549, + "step": 6932, + "time_per_iteration": 4.05314040184021 + }, + { + "auxiliary_loss_clip": 0.06464541, + "auxiliary_loss_mlp": 0.01269463, + "balance_loss_clip": 0.06289106, + "balance_loss_mlp": 0.01254752, + "epoch": 0.41683451074703143, + "flos": 30418051587840.0, + "grad_norm": 2.244908394273299, + "language_loss": 0.82220912, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.89954913, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.14727783, + "step": 6933, + "time_per_iteration": 2.715542793273926 + }, + { + "auxiliary_loss_clip": 0.06467043, + "auxiliary_loss_mlp": 0.01272262, + "balance_loss_clip": 0.06287256, + "balance_loss_mlp": 0.01257963, + "epoch": 0.4168946339996994, + "flos": 21513137650560.0, + "grad_norm": 1.8583396237684835, + "language_loss": 0.76938605, + "learning_rate": 2.624771374460121e-06, + "loss": 0.84677911, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.14300537, + "step": 6934, + "time_per_iteration": 2.630192279815674 + }, + { + "auxiliary_loss_clip": 0.06469443, + "auxiliary_loss_mlp": 0.0126919, + "balance_loss_clip": 0.06293288, + "balance_loss_mlp": 0.01254586, + "epoch": 0.41695475725236736, + "flos": 17644310133120.0, + "grad_norm": 2.110423315639561, + "language_loss": 0.67164314, + "learning_rate": 2.624401391405668e-06, + "loss": 0.74902946, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.14599609, + "step": 6935, + "time_per_iteration": 2.484464168548584 + }, + { + "auxiliary_loss_clip": 0.0646461, + "auxiliary_loss_mlp": 0.01269491, + "balance_loss_clip": 0.06289718, + "balance_loss_mlp": 0.01254458, + "epoch": 0.4170148805050353, + "flos": 15674285589120.0, + "grad_norm": 2.4566205528754033, + "language_loss": 0.7383365, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.81567752, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.15039062, + "step": 6936, + "time_per_iteration": 3.9171254634857178 + }, + { + "auxiliary_loss_clip": 0.06457968, + "auxiliary_loss_mlp": 0.01275405, + "balance_loss_clip": 0.06285361, + "balance_loss_mlp": 0.01262184, + "epoch": 0.4170750037577033, + "flos": 15164623929600.0, + "grad_norm": 4.126334603160969, + "language_loss": 0.74596691, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.8233006, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.13226318, + "step": 6937, + "time_per_iteration": 2.5286996364593506 + }, + { + "auxiliary_loss_clip": 0.06462386, + "auxiliary_loss_mlp": 0.01273752, + "balance_loss_clip": 0.06289354, + "balance_loss_mlp": 0.01259727, + "epoch": 0.41713512701037125, + "flos": 28776029051520.0, + "grad_norm": 1.4497703642581674, + "language_loss": 0.84985441, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.92721575, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.14031982, + "step": 6938, + "time_per_iteration": 2.594024419784546 + }, + { + "auxiliary_loss_clip": 0.06468149, + "auxiliary_loss_mlp": 0.01274736, + "balance_loss_clip": 0.06289169, + "balance_loss_mlp": 0.01259114, + "epoch": 0.4171952502630392, + "flos": 28264564529280.0, + "grad_norm": 1.8332960409763566, + "language_loss": 0.74288213, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.82031095, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.15618896, + "step": 6939, + "time_per_iteration": 2.628620147705078 + }, + { + "auxiliary_loss_clip": 0.06462568, + "auxiliary_loss_mlp": 0.01269134, + "balance_loss_clip": 0.06289193, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4172553735157072, + "flos": 24578612547840.0, + "grad_norm": 1.6044361894616455, + "language_loss": 0.75275123, + "learning_rate": 2.622551121253579e-06, + "loss": 0.83006829, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.14331055, + "step": 6940, + "time_per_iteration": 2.55566143989563 + }, + { + "auxiliary_loss_clip": 0.06464436, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.0628769, + "balance_loss_mlp": 0.01255338, + "epoch": 0.41731549676837515, + "flos": 27051967768320.0, + "grad_norm": 1.7023568307679129, + "language_loss": 0.71513987, + "learning_rate": 2.622180996345424e-06, + "loss": 0.79247934, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.1416626, + "step": 6941, + "time_per_iteration": 2.628779649734497 + }, + { + "auxiliary_loss_clip": 0.06464395, + "auxiliary_loss_mlp": 0.0127035, + "balance_loss_clip": 0.06285797, + "balance_loss_mlp": 0.01255342, + "epoch": 0.4173756200210431, + "flos": 28400173562880.0, + "grad_norm": 3.007655990717308, + "language_loss": 0.73701853, + "learning_rate": 2.621810847844104e-06, + "loss": 0.81436592, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15008545, + "step": 6942, + "time_per_iteration": 2.579085350036621 + }, + { + "auxiliary_loss_clip": 0.06469673, + "auxiliary_loss_mlp": 0.01269256, + "balance_loss_clip": 0.06289446, + "balance_loss_mlp": 0.01254587, + "epoch": 0.41743574327371114, + "flos": 22526968527360.0, + "grad_norm": 2.366625341311562, + "language_loss": 0.73327738, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.81066668, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.14672852, + "step": 6943, + "time_per_iteration": 2.5890767574310303 + }, + { + "auxiliary_loss_clip": 0.06466928, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06290001, + "balance_loss_mlp": 0.01252998, + "epoch": 0.4174958665263791, + "flos": 30120587193600.0, + "grad_norm": 2.3204117950268817, + "language_loss": 0.63901597, + "learning_rate": 2.621070480118111e-06, + "loss": 0.71635759, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14245605, + "step": 6944, + "time_per_iteration": 2.586949586868286 + }, + { + "auxiliary_loss_clip": 0.06466375, + "auxiliary_loss_mlp": 0.01271741, + "balance_loss_clip": 0.0628995, + "balance_loss_mlp": 0.0125684, + "epoch": 0.41755598977904707, + "flos": 25270227348480.0, + "grad_norm": 11.202050930016789, + "language_loss": 0.70295048, + "learning_rate": 2.620700260921513e-06, + "loss": 0.78033161, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14898682, + "step": 6945, + "time_per_iteration": 2.6323587894439697 + }, + { + "auxiliary_loss_clip": 0.06460019, + "auxiliary_loss_mlp": 0.01270496, + "balance_loss_clip": 0.06285217, + "balance_loss_mlp": 0.01255219, + "epoch": 0.41761611303171503, + "flos": 19834707715200.0, + "grad_norm": 1.6201275470111005, + "language_loss": 0.8079865, + "learning_rate": 2.620330018187899e-06, + "loss": 0.88529164, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.152771, + "step": 6946, + "time_per_iteration": 2.5303776264190674 + }, + { + "auxiliary_loss_clip": 0.064612, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06288694, + "balance_loss_mlp": 0.0125569, + "epoch": 0.417676236284383, + "flos": 15528655992960.0, + "grad_norm": 2.2948583781036027, + "language_loss": 0.77726543, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.85457456, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14038086, + "step": 6947, + "time_per_iteration": 2.5844216346740723 + }, + { + "auxiliary_loss_clip": 0.06465282, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06289726, + "balance_loss_mlp": 0.01252844, + "epoch": 0.41773635953705096, + "flos": 32532531770880.0, + "grad_norm": 1.6041388362904736, + "language_loss": 0.71914941, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.79648077, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.15014648, + "step": 6948, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.06460577, + "auxiliary_loss_mlp": 0.01271252, + "balance_loss_clip": 0.06288102, + "balance_loss_mlp": 0.01256303, + "epoch": 0.4177964827897189, + "flos": 23447719117440.0, + "grad_norm": 1.868509756028272, + "language_loss": 0.76914591, + "learning_rate": 2.619219148905362e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14941406, + "step": 6949, + "time_per_iteration": 2.5791566371917725 + }, + { + "auxiliary_loss_clip": 0.06466889, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06288934, + "balance_loss_mlp": 0.01255476, + "epoch": 0.4178566060423869, + "flos": 22755768900480.0, + "grad_norm": 1.6605109484051197, + "language_loss": 0.81921285, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.89658785, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.15148926, + "step": 6950, + "time_per_iteration": 2.550705909729004 + }, + { + "auxiliary_loss_clip": 0.06457172, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.062898, + "balance_loss_mlp": 0.01253319, + "epoch": 0.41791672929505486, + "flos": 26040233243520.0, + "grad_norm": 1.3162845057727355, + "language_loss": 0.76396811, + "learning_rate": 2.618478451956007e-06, + "loss": 0.84120584, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13275146, + "step": 6951, + "time_per_iteration": 2.6047768592834473 + }, + { + "auxiliary_loss_clip": 0.06472172, + "auxiliary_loss_mlp": 0.01271966, + "balance_loss_clip": 0.06291625, + "balance_loss_mlp": 0.01256988, + "epoch": 0.4179768525477228, + "flos": 19574028063360.0, + "grad_norm": 1.8780871701618023, + "language_loss": 0.72956991, + "learning_rate": 2.61810806829516e-06, + "loss": 0.80701125, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.14978027, + "step": 6952, + "time_per_iteration": 2.498915910720825 + }, + { + "auxiliary_loss_clip": 0.06467617, + "auxiliary_loss_mlp": 0.01270698, + "balance_loss_clip": 0.06290505, + "balance_loss_mlp": 0.01256286, + "epoch": 0.4180369758003908, + "flos": 17789352750720.0, + "grad_norm": 3.5208466342014444, + "language_loss": 0.72192442, + "learning_rate": 2.617737661195593e-06, + "loss": 0.79930753, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14428711, + "step": 6953, + "time_per_iteration": 2.5105345249176025 + }, + { + "auxiliary_loss_clip": 0.06460451, + "auxiliary_loss_mlp": 0.01269376, + "balance_loss_clip": 0.0629045, + "balance_loss_mlp": 0.01255143, + "epoch": 0.41809709905305875, + "flos": 20967152446080.0, + "grad_norm": 1.9107321624636409, + "language_loss": 0.76574248, + "learning_rate": 2.617367230671353e-06, + "loss": 0.8430407, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14233398, + "step": 6954, + "time_per_iteration": 2.5424091815948486 + }, + { + "auxiliary_loss_clip": 0.06461184, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.06286837, + "balance_loss_mlp": 0.01255866, + "epoch": 0.4181572223057267, + "flos": 22024099048320.0, + "grad_norm": 2.2757291119189693, + "language_loss": 0.84719867, + "learning_rate": 2.616996776736485e-06, + "loss": 0.92452419, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15490723, + "step": 6955, + "time_per_iteration": 2.5423128604888916 + }, + { + "auxiliary_loss_clip": 0.06460696, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06289047, + "balance_loss_mlp": 0.01255001, + "epoch": 0.4182173455583947, + "flos": 26251969311360.0, + "grad_norm": 1.5480485879739414, + "language_loss": 0.83159053, + "learning_rate": 2.616626299405037e-06, + "loss": 0.90889192, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.14453125, + "step": 6956, + "time_per_iteration": 2.5377910137176514 + }, + { + "auxiliary_loss_clip": 0.06470253, + "auxiliary_loss_mlp": 0.01272951, + "balance_loss_clip": 0.06292067, + "balance_loss_mlp": 0.01258163, + "epoch": 0.4182774688110627, + "flos": 14796566870400.0, + "grad_norm": 2.2161530875987205, + "language_loss": 0.72170293, + "learning_rate": 2.616255798691059e-06, + "loss": 0.79913497, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.14801025, + "step": 6957, + "time_per_iteration": 2.5512890815734863 + }, + { + "auxiliary_loss_clip": 0.06465964, + "auxiliary_loss_mlp": 0.01272907, + "balance_loss_clip": 0.06289618, + "balance_loss_mlp": 0.01258745, + "epoch": 0.41833759206373067, + "flos": 20418190421760.0, + "grad_norm": 1.9534240722910163, + "language_loss": 0.75827634, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.83566499, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14147949, + "step": 6958, + "time_per_iteration": 2.5025634765625 + }, + { + "auxiliary_loss_clip": 0.06461923, + "auxiliary_loss_mlp": 0.01277567, + "balance_loss_clip": 0.06289306, + "balance_loss_mlp": 0.01262505, + "epoch": 0.41839771531639863, + "flos": 23662557786240.0, + "grad_norm": 1.62032760192947, + "language_loss": 0.77450699, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.85190189, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15075684, + "step": 6959, + "time_per_iteration": 2.5644967555999756 + }, + { + "auxiliary_loss_clip": 0.06462178, + "auxiliary_loss_mlp": 0.01275343, + "balance_loss_clip": 0.06288128, + "balance_loss_mlp": 0.01259423, + "epoch": 0.4184578385690666, + "flos": 19760006200320.0, + "grad_norm": 1.8483570445524284, + "language_loss": 0.77022827, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.84760344, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15924072, + "step": 6960, + "time_per_iteration": 2.5269885063171387 + }, + { + "auxiliary_loss_clip": 0.06453702, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06288585, + "balance_loss_mlp": 0.01255552, + "epoch": 0.41851796182173456, + "flos": 20199578319360.0, + "grad_norm": 2.3993036704472717, + "language_loss": 0.75495946, + "learning_rate": 2.614773562290835e-06, + "loss": 0.83218956, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13769531, + "step": 6961, + "time_per_iteration": 2.571563243865967 + }, + { + "auxiliary_loss_clip": 0.06367883, + "auxiliary_loss_mlp": 0.0126221, + "balance_loss_clip": 0.06291385, + "balance_loss_mlp": 0.01259577, + "epoch": 0.41857808507440253, + "flos": 59038331898240.0, + "grad_norm": 0.8546546360875583, + "language_loss": 0.54730451, + "learning_rate": 2.61440294487496e-06, + "loss": 0.62360549, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.02635193, + "step": 6962, + "time_per_iteration": 3.0928165912628174 + }, + { + "auxiliary_loss_clip": 0.06468143, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.06293048, + "balance_loss_mlp": 0.01256423, + "epoch": 0.4186382083270705, + "flos": 18484740984960.0, + "grad_norm": 2.146654503648622, + "language_loss": 0.8523612, + "learning_rate": 2.614032304160864e-06, + "loss": 0.92974788, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.14093018, + "step": 6963, + "time_per_iteration": 2.4891340732574463 + }, + { + "auxiliary_loss_clip": 0.06465001, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 0.06290912, + "balance_loss_mlp": 0.01256453, + "epoch": 0.41869833157973846, + "flos": 21584988126720.0, + "grad_norm": 1.5636714712462336, + "language_loss": 0.70520425, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.78256667, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14788818, + "step": 6964, + "time_per_iteration": 2.6037514209747314 + }, + { + "auxiliary_loss_clip": 0.06460649, + "auxiliary_loss_mlp": 0.01270666, + "balance_loss_clip": 0.06289357, + "balance_loss_mlp": 0.01257034, + "epoch": 0.4187584548324064, + "flos": 35526156192000.0, + "grad_norm": 2.108688626905877, + "language_loss": 0.71782613, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.79513931, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.1362915, + "step": 6965, + "time_per_iteration": 4.077980279922485 + }, + { + "auxiliary_loss_clip": 0.06453691, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06286767, + "balance_loss_mlp": 0.01257173, + "epoch": 0.4188185780850744, + "flos": 18660950121600.0, + "grad_norm": 1.7018758391145836, + "language_loss": 0.72080678, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.79804349, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.12792969, + "step": 6966, + "time_per_iteration": 2.5740551948547363 + }, + { + "auxiliary_loss_clip": 0.06466748, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06288405, + "balance_loss_mlp": 0.0125625, + "epoch": 0.41887870133774235, + "flos": 40342959676800.0, + "grad_norm": 4.506306240026155, + "language_loss": 0.71212667, + "learning_rate": 2.612549508603375e-06, + "loss": 0.78950995, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.15338135, + "step": 6967, + "time_per_iteration": 4.179578065872192 + }, + { + "auxiliary_loss_clip": 0.0636977, + "auxiliary_loss_mlp": 0.01256477, + "balance_loss_clip": 0.06291805, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4189388245904103, + "flos": 61388083946880.0, + "grad_norm": 0.6570416522373307, + "language_loss": 0.45988834, + "learning_rate": 2.612178751609011e-06, + "loss": 0.53615081, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02500916, + "step": 6968, + "time_per_iteration": 3.1288843154907227 + }, + { + "auxiliary_loss_clip": 0.06467855, + "auxiliary_loss_mlp": 0.01273397, + "balance_loss_clip": 0.06290668, + "balance_loss_mlp": 0.01257685, + "epoch": 0.4189989478430783, + "flos": 28222371198720.0, + "grad_norm": 1.7081344299750898, + "language_loss": 0.75350499, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.8309176, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.15710449, + "step": 6969, + "time_per_iteration": 2.5936050415039062 + }, + { + "auxiliary_loss_clip": 0.06460407, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06287546, + "balance_loss_mlp": 0.01258365, + "epoch": 0.4190590710957463, + "flos": 24571820367360.0, + "grad_norm": 1.8003201263588986, + "language_loss": 0.80904478, + "learning_rate": 2.611437167992705e-06, + "loss": 0.88637358, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14099121, + "step": 6970, + "time_per_iteration": 2.5366463661193848 + }, + { + "auxiliary_loss_clip": 0.06461529, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 0.06291033, + "balance_loss_mlp": 0.01257594, + "epoch": 0.41911919434841427, + "flos": 21732504439680.0, + "grad_norm": 2.0427263912189098, + "language_loss": 0.83781362, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.91514409, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13922119, + "step": 6971, + "time_per_iteration": 4.038029909133911 + }, + { + "auxiliary_loss_clip": 0.06459013, + "auxiliary_loss_mlp": 0.01277453, + "balance_loss_clip": 0.06292501, + "balance_loss_mlp": 0.01262766, + "epoch": 0.41917931760108224, + "flos": 17607064193280.0, + "grad_norm": 1.8913036217137231, + "language_loss": 0.74956995, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.82693458, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14685059, + "step": 6972, + "time_per_iteration": 2.5450055599212646 + }, + { + "auxiliary_loss_clip": 0.06463002, + "auxiliary_loss_mlp": 0.01269114, + "balance_loss_clip": 0.06289829, + "balance_loss_mlp": 0.01255405, + "epoch": 0.4192394408537502, + "flos": 37825943679360.0, + "grad_norm": 1.6425528401757075, + "language_loss": 0.73133683, + "learning_rate": 2.610324618710212e-06, + "loss": 0.808658, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13684082, + "step": 6973, + "time_per_iteration": 2.6852450370788574 + }, + { + "auxiliary_loss_clip": 0.06474721, + "auxiliary_loss_mlp": 0.01271721, + "balance_loss_clip": 0.06293075, + "balance_loss_mlp": 0.01257272, + "epoch": 0.41929956410641817, + "flos": 23113637688960.0, + "grad_norm": 1.8862458299453466, + "language_loss": 0.74830127, + "learning_rate": 2.609953722643489e-06, + "loss": 0.82576567, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14453125, + "step": 6974, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06460831, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0628831, + "balance_loss_mlp": 0.01252744, + "epoch": 0.41935968735908613, + "flos": 22530448471680.0, + "grad_norm": 1.902296645802657, + "language_loss": 0.73513019, + "learning_rate": 2.609582803447259e-06, + "loss": 0.81240016, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13421631, + "step": 6975, + "time_per_iteration": 2.4907052516937256 + }, + { + "auxiliary_loss_clip": 0.06461257, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06293045, + "balance_loss_mlp": 0.01256172, + "epoch": 0.4194198106117541, + "flos": 26877771129600.0, + "grad_norm": 1.432926445179704, + "language_loss": 0.80820251, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.8855176, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14086914, + "step": 6976, + "time_per_iteration": 4.015337705612183 + }, + { + "auxiliary_loss_clip": 0.06465544, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06291896, + "balance_loss_mlp": 0.01252174, + "epoch": 0.41947993386442206, + "flos": 19908696470400.0, + "grad_norm": 6.530638917868016, + "language_loss": 0.67613435, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.75344729, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.13592529, + "step": 6977, + "time_per_iteration": 2.5907933712005615 + }, + { + "auxiliary_loss_clip": 0.06466645, + "auxiliary_loss_mlp": 0.012707, + "balance_loss_clip": 0.06291468, + "balance_loss_mlp": 0.01257104, + "epoch": 0.41954005711709, + "flos": 17389584120960.0, + "grad_norm": 2.431968733580352, + "language_loss": 0.8152501, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.89262354, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.13604736, + "step": 6978, + "time_per_iteration": 2.5534939765930176 + }, + { + "auxiliary_loss_clip": 0.06466036, + "auxiliary_loss_mlp": 0.01269917, + "balance_loss_clip": 0.06288658, + "balance_loss_mlp": 0.012561, + "epoch": 0.419600180369758, + "flos": 25009254207360.0, + "grad_norm": 1.7617066668945498, + "language_loss": 0.83044857, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.90780807, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.13824463, + "step": 6979, + "time_per_iteration": 2.5991194248199463 + }, + { + "auxiliary_loss_clip": 0.06464113, + "auxiliary_loss_mlp": 0.01266396, + "balance_loss_clip": 0.0629217, + "balance_loss_mlp": 0.01253313, + "epoch": 0.41966030362242596, + "flos": 17389458339840.0, + "grad_norm": 2.43413237172065, + "language_loss": 0.83727056, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.9145757, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13079834, + "step": 6980, + "time_per_iteration": 2.4868295192718506 + }, + { + "auxiliary_loss_clip": 0.06469644, + "auxiliary_loss_mlp": 0.01274217, + "balance_loss_clip": 0.06293017, + "balance_loss_mlp": 0.01260061, + "epoch": 0.4197204268750939, + "flos": 22161427090560.0, + "grad_norm": 2.953064628504675, + "language_loss": 0.79802233, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.87546098, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.14172363, + "step": 6981, + "time_per_iteration": 2.572671890258789 + }, + { + "auxiliary_loss_clip": 0.06461273, + "auxiliary_loss_mlp": 0.01268979, + "balance_loss_clip": 0.06293882, + "balance_loss_mlp": 0.01256152, + "epoch": 0.4197805501277619, + "flos": 22089534687360.0, + "grad_norm": 1.8874441419731374, + "language_loss": 0.84437835, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.92168081, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.12823486, + "step": 6982, + "time_per_iteration": 2.515719413757324 + }, + { + "auxiliary_loss_clip": 0.06468281, + "auxiliary_loss_mlp": 0.0127262, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.0125844, + "epoch": 0.4198406733804299, + "flos": 26439372967680.0, + "grad_norm": 2.198770889515785, + "language_loss": 0.57229298, + "learning_rate": 2.606614618903214e-06, + "loss": 0.64970195, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1418457, + "step": 6983, + "time_per_iteration": 2.589905023574829 + }, + { + "auxiliary_loss_clip": 0.06459898, + "auxiliary_loss_mlp": 0.01268511, + "balance_loss_clip": 0.0629196, + "balance_loss_mlp": 0.01255922, + "epoch": 0.4199007966330979, + "flos": 12535870112640.0, + "grad_norm": 1.9546340544122036, + "language_loss": 0.82430601, + "learning_rate": 2.606243492174471e-06, + "loss": 0.90159011, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1260376, + "step": 6984, + "time_per_iteration": 2.4837801456451416 + }, + { + "auxiliary_loss_clip": 0.06465998, + "auxiliary_loss_mlp": 0.0127065, + "balance_loss_clip": 0.06293395, + "balance_loss_mlp": 0.01257698, + "epoch": 0.41996091988576584, + "flos": 21769498817280.0, + "grad_norm": 1.6572496297875159, + "language_loss": 0.79565531, + "learning_rate": 2.605872342456914e-06, + "loss": 0.87302184, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.12963867, + "step": 6985, + "time_per_iteration": 2.558382511138916 + }, + { + "auxiliary_loss_clip": 0.06471538, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06292171, + "balance_loss_mlp": 0.01254425, + "epoch": 0.4200210431384338, + "flos": 26549182143360.0, + "grad_norm": 1.7232010674189546, + "language_loss": 0.78413719, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.86154521, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14831543, + "step": 6986, + "time_per_iteration": 2.557201385498047 + }, + { + "auxiliary_loss_clip": 0.06457713, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06290729, + "balance_loss_mlp": 0.0125171, + "epoch": 0.42008116639110177, + "flos": 26802859979520.0, + "grad_norm": 1.5119871943534449, + "language_loss": 0.72772801, + "learning_rate": 2.605129974111655e-06, + "loss": 0.80494547, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.12310791, + "step": 6987, + "time_per_iteration": 2.590758800506592 + }, + { + "auxiliary_loss_clip": 0.06464639, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06291942, + "balance_loss_mlp": 0.01256994, + "epoch": 0.42014128964376973, + "flos": 32095433347200.0, + "grad_norm": 1.493413355723003, + "language_loss": 0.75077468, + "learning_rate": 2.604758755512104e-06, + "loss": 0.82812625, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13519287, + "step": 6988, + "time_per_iteration": 2.6159229278564453 + }, + { + "auxiliary_loss_clip": 0.064705, + "auxiliary_loss_mlp": 0.01272645, + "balance_loss_clip": 0.06293759, + "balance_loss_mlp": 0.01258256, + "epoch": 0.4202014128964377, + "flos": 26474061358080.0, + "grad_norm": 1.4960604967721163, + "language_loss": 0.7416907, + "learning_rate": 2.60438751398004e-06, + "loss": 0.81912208, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14385986, + "step": 6989, + "time_per_iteration": 2.6082265377044678 + }, + { + "auxiliary_loss_clip": 0.06467222, + "auxiliary_loss_mlp": 0.01268972, + "balance_loss_clip": 0.06291176, + "balance_loss_mlp": 0.0125413, + "epoch": 0.42026153614910566, + "flos": 13405287277440.0, + "grad_norm": 2.240751664581705, + "language_loss": 0.70939904, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.78676105, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14831543, + "step": 6990, + "time_per_iteration": 2.5301413536071777 + }, + { + "auxiliary_loss_clip": 0.06372039, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.06294142, + "balance_loss_mlp": 0.01259734, + "epoch": 0.42032165940177363, + "flos": 60268720452480.0, + "grad_norm": 0.7958876139316734, + "language_loss": 0.6024788, + "learning_rate": 2.603644962174685e-06, + "loss": 0.67882204, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02546692, + "step": 6991, + "time_per_iteration": 3.036398410797119 + }, + { + "auxiliary_loss_clip": 0.06468751, + "auxiliary_loss_mlp": 0.0127226, + "balance_loss_clip": 0.06294238, + "balance_loss_mlp": 0.01257251, + "epoch": 0.4203817826544416, + "flos": 24542121294720.0, + "grad_norm": 1.5524019758451273, + "language_loss": 0.83787376, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.91528386, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15014648, + "step": 6992, + "time_per_iteration": 2.5513317584991455 + }, + { + "auxiliary_loss_clip": 0.06374694, + "auxiliary_loss_mlp": 0.01259872, + "balance_loss_clip": 0.06295739, + "balance_loss_mlp": 0.01257284, + "epoch": 0.42044190590710956, + "flos": 58837679297280.0, + "grad_norm": 0.7870388441722128, + "language_loss": 0.65295899, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.72930467, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.02589417, + "step": 6993, + "time_per_iteration": 3.139356851577759 + }, + { + "auxiliary_loss_clip": 0.06475414, + "auxiliary_loss_mlp": 0.01273103, + "balance_loss_clip": 0.06293732, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4205020291597775, + "flos": 16441733934720.0, + "grad_norm": 2.0884817814411307, + "language_loss": 0.83771634, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.91520149, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.15576172, + "step": 6994, + "time_per_iteration": 2.5307908058166504 + }, + { + "auxiliary_loss_clip": 0.06461746, + "auxiliary_loss_mlp": 0.01269563, + "balance_loss_clip": 0.06292755, + "balance_loss_mlp": 0.01255544, + "epoch": 0.4205621524124455, + "flos": 18411548843520.0, + "grad_norm": 1.728991128313806, + "language_loss": 0.79243588, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.86974895, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14013672, + "step": 6995, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.06461824, + "auxiliary_loss_mlp": 0.0126885, + "balance_loss_clip": 0.06293637, + "balance_loss_mlp": 0.01255433, + "epoch": 0.4206222756651135, + "flos": 25527133566720.0, + "grad_norm": 1.491511685078805, + "language_loss": 0.80235636, + "learning_rate": 2.60178818232786e-06, + "loss": 0.87966311, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13409424, + "step": 6996, + "time_per_iteration": 2.6613996028900146 + }, + { + "auxiliary_loss_clip": 0.06466329, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06293097, + "balance_loss_mlp": 0.01254466, + "epoch": 0.4206823989177815, + "flos": 15309708474240.0, + "grad_norm": 2.3637588948298998, + "language_loss": 0.76051879, + "learning_rate": 2.601416757842559e-06, + "loss": 0.83786368, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13690186, + "step": 6997, + "time_per_iteration": 2.484876871109009 + }, + { + "auxiliary_loss_clip": 0.06463061, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06288689, + "balance_loss_mlp": 0.01253789, + "epoch": 0.42074252217044944, + "flos": 15558564700800.0, + "grad_norm": 2.0514206793414345, + "language_loss": 0.76478076, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.84209514, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14599609, + "step": 6998, + "time_per_iteration": 2.5640127658843994 + }, + { + "auxiliary_loss_clip": 0.06466474, + "auxiliary_loss_mlp": 0.01275488, + "balance_loss_clip": 0.06289443, + "balance_loss_mlp": 0.01260587, + "epoch": 0.4208026454231174, + "flos": 26153941633920.0, + "grad_norm": 1.581279992496262, + "language_loss": 0.76102519, + "learning_rate": 2.60067384046869e-06, + "loss": 0.83844483, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.14892578, + "step": 6999, + "time_per_iteration": 2.6406025886535645 + }, + { + "auxiliary_loss_clip": 0.06461642, + "auxiliary_loss_mlp": 0.01267644, + "balance_loss_clip": 0.06291209, + "balance_loss_mlp": 0.01254382, + "epoch": 0.42086276867578537, + "flos": 23556857460480.0, + "grad_norm": 1.988296138175356, + "language_loss": 0.64461291, + "learning_rate": 2.600302347608295e-06, + "loss": 0.72190583, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13244629, + "step": 7000, + "time_per_iteration": 2.6081695556640625 + }, + { + "auxiliary_loss_clip": 0.06469343, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06294516, + "balance_loss_mlp": 0.01256076, + "epoch": 0.42092289192845334, + "flos": 18119199548160.0, + "grad_norm": 1.6363851387704167, + "language_loss": 0.77022576, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.84762329, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14318848, + "step": 7001, + "time_per_iteration": 2.5761475563049316 + }, + { + "auxiliary_loss_clip": 0.06461353, + "auxiliary_loss_mlp": 0.01268364, + "balance_loss_clip": 0.06290751, + "balance_loss_mlp": 0.01254882, + "epoch": 0.4209830151811213, + "flos": 20012006954880.0, + "grad_norm": 1.5030484792833017, + "language_loss": 0.86740428, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.94470143, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.13482666, + "step": 7002, + "time_per_iteration": 2.585397958755493 + }, + { + "auxiliary_loss_clip": 0.06461627, + "auxiliary_loss_mlp": 0.01271644, + "balance_loss_clip": 0.06290498, + "balance_loss_mlp": 0.01258251, + "epoch": 0.42104313843378927, + "flos": 21985050245760.0, + "grad_norm": 2.152971198745627, + "language_loss": 0.68539977, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.76273245, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.1338501, + "step": 7003, + "time_per_iteration": 2.5039963722229004 + }, + { + "auxiliary_loss_clip": 0.06469242, + "auxiliary_loss_mlp": 0.01271214, + "balance_loss_clip": 0.06293743, + "balance_loss_mlp": 0.01255747, + "epoch": 0.42110326168645723, + "flos": 25450461480960.0, + "grad_norm": 1.8015075946869743, + "language_loss": 0.77306843, + "learning_rate": 2.598816148672344e-06, + "loss": 0.85047305, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.15472412, + "step": 7004, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06462541, + "auxiliary_loss_mlp": 0.01273285, + "balance_loss_clip": 0.06294234, + "balance_loss_mlp": 0.0125873, + "epoch": 0.4211633849391252, + "flos": 17828485407360.0, + "grad_norm": 1.7810886301824922, + "language_loss": 0.68804276, + "learning_rate": 2.59844454213521e-06, + "loss": 0.76540101, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14562988, + "step": 7005, + "time_per_iteration": 3.888760566711426 + }, + { + "auxiliary_loss_clip": 0.06465107, + "auxiliary_loss_mlp": 0.01269773, + "balance_loss_clip": 0.0629124, + "balance_loss_mlp": 0.01255593, + "epoch": 0.42122350819179316, + "flos": 16286796535680.0, + "grad_norm": 1.8605985429595449, + "language_loss": 0.72998816, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.80733699, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14178467, + "step": 7006, + "time_per_iteration": 3.991835832595825 + }, + { + "auxiliary_loss_clip": 0.06464688, + "auxiliary_loss_mlp": 0.01266849, + "balance_loss_clip": 0.06289375, + "balance_loss_mlp": 0.01252424, + "epoch": 0.4212836314444611, + "flos": 19651916033280.0, + "grad_norm": 1.623062925912009, + "language_loss": 0.7118417, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.78915709, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.14416504, + "step": 7007, + "time_per_iteration": 2.5425753593444824 + }, + { + "auxiliary_loss_clip": 0.06463595, + "auxiliary_loss_mlp": 0.01271642, + "balance_loss_clip": 0.06289028, + "balance_loss_mlp": 0.01257581, + "epoch": 0.4213437546971291, + "flos": 18374889882240.0, + "grad_norm": 2.097779928402724, + "language_loss": 0.82573175, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.90308416, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.140625, + "step": 7008, + "time_per_iteration": 2.492260456085205 + }, + { + "auxiliary_loss_clip": 0.0646316, + "auxiliary_loss_mlp": 0.01269434, + "balance_loss_clip": 0.06289843, + "balance_loss_mlp": 0.01255129, + "epoch": 0.42140387794979706, + "flos": 27711116311680.0, + "grad_norm": 1.9580680041192111, + "language_loss": 0.72638381, + "learning_rate": 2.596957889196831e-06, + "loss": 0.80370975, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14318848, + "step": 7009, + "time_per_iteration": 2.6216533184051514 + }, + { + "auxiliary_loss_clip": 0.06466616, + "auxiliary_loss_mlp": 0.0126722, + "balance_loss_clip": 0.06289244, + "balance_loss_mlp": 0.01253338, + "epoch": 0.4214640012024651, + "flos": 28154545718400.0, + "grad_norm": 2.5692415195563543, + "language_loss": 0.66926241, + "learning_rate": 2.596586169335243e-06, + "loss": 0.74660075, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.13873291, + "step": 7010, + "time_per_iteration": 2.606501579284668 + }, + { + "auxiliary_loss_clip": 0.06462754, + "auxiliary_loss_mlp": 0.01271396, + "balance_loss_clip": 0.06290238, + "balance_loss_mlp": 0.01256662, + "epoch": 0.42152412445513304, + "flos": 23002989972480.0, + "grad_norm": 1.6839098151972378, + "language_loss": 0.7266804, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.80402195, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14727783, + "step": 7011, + "time_per_iteration": 4.0488903522491455 + }, + { + "auxiliary_loss_clip": 0.06363396, + "auxiliary_loss_mlp": 0.01255682, + "balance_loss_clip": 0.06285673, + "balance_loss_mlp": 0.01253149, + "epoch": 0.421584247707801, + "flos": 63767855756160.0, + "grad_norm": 0.7737758086067837, + "language_loss": 0.54255652, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.61874723, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.02532959, + "step": 7012, + "time_per_iteration": 3.0473456382751465 + }, + { + "auxiliary_loss_clip": 0.06465481, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06289969, + "balance_loss_mlp": 0.01256656, + "epoch": 0.421644370960469, + "flos": 24321203205120.0, + "grad_norm": 1.3531523641491952, + "language_loss": 0.78821653, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.86559272, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.15472412, + "step": 7013, + "time_per_iteration": 2.5436811447143555 + }, + { + "auxiliary_loss_clip": 0.06463543, + "auxiliary_loss_mlp": 0.0127162, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01256516, + "epoch": 0.42170449421313694, + "flos": 23447425628160.0, + "grad_norm": 1.8634561108800796, + "language_loss": 0.81284738, + "learning_rate": 2.595099063803787e-06, + "loss": 0.89019895, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.15100098, + "step": 7014, + "time_per_iteration": 2.6464757919311523 + }, + { + "auxiliary_loss_clip": 0.06460524, + "auxiliary_loss_mlp": 0.01273083, + "balance_loss_clip": 0.06287747, + "balance_loss_mlp": 0.01259225, + "epoch": 0.4217646174658049, + "flos": 23702151640320.0, + "grad_norm": 1.4680948866945018, + "language_loss": 0.77888769, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.85622376, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1385498, + "step": 7015, + "time_per_iteration": 4.043898582458496 + }, + { + "auxiliary_loss_clip": 0.06464352, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06287283, + "balance_loss_mlp": 0.01253394, + "epoch": 0.42182474071847287, + "flos": 24978297323520.0, + "grad_norm": 1.853408702102599, + "language_loss": 0.82096922, + "learning_rate": 2.594355375584368e-06, + "loss": 0.89829755, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15075684, + "step": 7016, + "time_per_iteration": 2.5523900985717773 + }, + { + "auxiliary_loss_clip": 0.06465739, + "auxiliary_loss_mlp": 0.01271643, + "balance_loss_clip": 0.06291386, + "balance_loss_mlp": 0.01256527, + "epoch": 0.42188486397114083, + "flos": 22863230161920.0, + "grad_norm": 2.845700477826224, + "language_loss": 0.6853466, + "learning_rate": 2.593983497660586e-06, + "loss": 0.76272047, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15112305, + "step": 7017, + "time_per_iteration": 2.57027530670166 + }, + { + "auxiliary_loss_clip": 0.0636536, + "auxiliary_loss_mlp": 0.01255401, + "balance_loss_clip": 0.06287346, + "balance_loss_mlp": 0.01252595, + "epoch": 0.4219449872238088, + "flos": 66997072730880.0, + "grad_norm": 0.6666550742113542, + "language_loss": 0.59442866, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.67063624, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.02804565, + "step": 7018, + "time_per_iteration": 3.1860194206237793 + }, + { + "auxiliary_loss_clip": 0.0646835, + "auxiliary_loss_mlp": 0.01271161, + "balance_loss_clip": 0.0628873, + "balance_loss_mlp": 0.0125617, + "epoch": 0.42200511047647676, + "flos": 13120400995200.0, + "grad_norm": 1.8819765217055724, + "language_loss": 0.75926054, + "learning_rate": 2.593239674255382e-06, + "loss": 0.83665562, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.14990234, + "step": 7019, + "time_per_iteration": 2.542468309402466 + }, + { + "auxiliary_loss_clip": 0.06462015, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.06287961, + "balance_loss_mlp": 0.01257864, + "epoch": 0.42206523372914473, + "flos": 13996400705280.0, + "grad_norm": 1.899626408213008, + "language_loss": 0.69618917, + "learning_rate": 2.592867728802166e-06, + "loss": 0.77354079, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.15283203, + "step": 7020, + "time_per_iteration": 2.4884140491485596 + }, + { + "auxiliary_loss_clip": 0.06459437, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.06290746, + "balance_loss_mlp": 0.01258347, + "epoch": 0.4221253569818127, + "flos": 21948391284480.0, + "grad_norm": 1.6760812445081854, + "language_loss": 0.81457055, + "learning_rate": 2.592495760867347e-06, + "loss": 0.89188963, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14135742, + "step": 7021, + "time_per_iteration": 2.60335111618042 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06286098, + "balance_loss_mlp": 0.01253869, + "epoch": 0.42218548023448066, + "flos": 32200001642880.0, + "grad_norm": 1.5750279801473723, + "language_loss": 0.70101392, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.77830255, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.14355469, + "step": 7022, + "time_per_iteration": 2.605795383453369 + }, + { + "auxiliary_loss_clip": 0.06450655, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06284072, + "balance_loss_mlp": 0.01258788, + "epoch": 0.4222456034871487, + "flos": 30127043957760.0, + "grad_norm": 1.5974321201389856, + "language_loss": 0.67428911, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.75152111, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13751221, + "step": 7023, + "time_per_iteration": 2.6615898609161377 + }, + { + "auxiliary_loss_clip": 0.06455819, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06287459, + "balance_loss_mlp": 0.01255508, + "epoch": 0.42230572673981664, + "flos": 22134537129600.0, + "grad_norm": 1.6408413231786074, + "language_loss": 0.69710904, + "learning_rate": 2.591379722314322e-06, + "loss": 0.77437586, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15356445, + "step": 7024, + "time_per_iteration": 2.531874895095825 + }, + { + "auxiliary_loss_clip": 0.06457987, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06283922, + "balance_loss_mlp": 0.01255598, + "epoch": 0.4223658499924846, + "flos": 22061722331520.0, + "grad_norm": 2.1972757713163102, + "language_loss": 0.76880538, + "learning_rate": 2.591007664594147e-06, + "loss": 0.84608328, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14196777, + "step": 7025, + "time_per_iteration": 2.568814754486084 + }, + { + "auxiliary_loss_clip": 0.06457998, + "auxiliary_loss_mlp": 0.01277209, + "balance_loss_clip": 0.06287608, + "balance_loss_mlp": 0.01263017, + "epoch": 0.4224259732451526, + "flos": 20416681048320.0, + "grad_norm": 1.910881237925828, + "language_loss": 0.80124468, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.87859672, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14208984, + "step": 7026, + "time_per_iteration": 2.4988901615142822 + }, + { + "auxiliary_loss_clip": 0.06353324, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01250106, + "epoch": 0.42248609649782054, + "flos": 62866307750400.0, + "grad_norm": 0.7325438580667073, + "language_loss": 0.62037623, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.69643718, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.0266571, + "step": 7027, + "time_per_iteration": 3.230607748031616 + }, + { + "auxiliary_loss_clip": 0.06460012, + "auxiliary_loss_mlp": 0.01272089, + "balance_loss_clip": 0.06290331, + "balance_loss_mlp": 0.01257456, + "epoch": 0.4225462197504885, + "flos": 26257126337280.0, + "grad_norm": 2.572422824646089, + "language_loss": 0.71053827, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.78785932, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14642334, + "step": 7028, + "time_per_iteration": 2.5667781829833984 + }, + { + "auxiliary_loss_clip": 0.06463138, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06289553, + "balance_loss_mlp": 0.01255437, + "epoch": 0.42260634300315647, + "flos": 20528209232640.0, + "grad_norm": 1.948126664005559, + "language_loss": 0.82621461, + "learning_rate": 2.589519209743846e-06, + "loss": 0.90353954, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.13928223, + "step": 7029, + "time_per_iteration": 2.5936038494110107 + }, + { + "auxiliary_loss_clip": 0.06468205, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06289516, + "balance_loss_mlp": 0.01258441, + "epoch": 0.42266646625582444, + "flos": 24323676900480.0, + "grad_norm": 1.8377333901506168, + "language_loss": 0.75193119, + "learning_rate": 2.589147040109424e-06, + "loss": 0.82935727, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.15966797, + "step": 7030, + "time_per_iteration": 2.6162269115448 + }, + { + "auxiliary_loss_clip": 0.06462294, + "auxiliary_loss_mlp": 0.01267502, + "balance_loss_clip": 0.06287964, + "balance_loss_mlp": 0.01251421, + "epoch": 0.4227265895084924, + "flos": 24210555488640.0, + "grad_norm": 1.9734407814648771, + "language_loss": 0.86909479, + "learning_rate": 2.588774848134486e-06, + "loss": 0.94639277, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.1607666, + "step": 7031, + "time_per_iteration": 2.5292763710021973 + }, + { + "auxiliary_loss_clip": 0.06460671, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06286174, + "balance_loss_mlp": 0.01255171, + "epoch": 0.42278671276116037, + "flos": 16915407465600.0, + "grad_norm": 1.893963671956315, + "language_loss": 0.73803562, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.81533462, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.140625, + "step": 7032, + "time_per_iteration": 2.5382707118988037 + }, + { + "auxiliary_loss_clip": 0.06463667, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 0.06286915, + "balance_loss_mlp": 0.01254874, + "epoch": 0.42284683601382833, + "flos": 25418162931840.0, + "grad_norm": 1.9439146678532522, + "language_loss": 0.70438349, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.78171825, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1496582, + "step": 7033, + "time_per_iteration": 2.5798444747924805 + }, + { + "auxiliary_loss_clip": 0.06464536, + "auxiliary_loss_mlp": 0.01270969, + "balance_loss_clip": 0.06288149, + "balance_loss_mlp": 0.01256282, + "epoch": 0.4229069592664963, + "flos": 23047153873920.0, + "grad_norm": 1.8861418032064503, + "language_loss": 0.90879869, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.98615378, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14685059, + "step": 7034, + "time_per_iteration": 2.5370678901672363 + }, + { + "auxiliary_loss_clip": 0.06455763, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06283915, + "balance_loss_mlp": 0.01256676, + "epoch": 0.42296708251916426, + "flos": 26074586217600.0, + "grad_norm": 1.9962240812191803, + "language_loss": 0.77578306, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.85304844, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14080811, + "step": 7035, + "time_per_iteration": 2.542121648788452 + }, + { + "auxiliary_loss_clip": 0.06464495, + "auxiliary_loss_mlp": 0.01274418, + "balance_loss_clip": 0.06287753, + "balance_loss_mlp": 0.01259863, + "epoch": 0.4230272057718323, + "flos": 19463548055040.0, + "grad_norm": 2.323654021784471, + "language_loss": 0.83016878, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.90755796, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.14538574, + "step": 7036, + "time_per_iteration": 2.5446789264678955 + }, + { + "auxiliary_loss_clip": 0.06461224, + "auxiliary_loss_mlp": 0.01270872, + "balance_loss_clip": 0.06292447, + "balance_loss_mlp": 0.01256859, + "epoch": 0.42308732902450025, + "flos": 22389975901440.0, + "grad_norm": 1.9007003646753964, + "language_loss": 0.70561719, + "learning_rate": 2.58654122792447e-06, + "loss": 0.78293824, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.14031982, + "step": 7037, + "time_per_iteration": 2.5331337451934814 + }, + { + "auxiliary_loss_clip": 0.06462964, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06289166, + "balance_loss_mlp": 0.01253923, + "epoch": 0.4231474522771682, + "flos": 21001631201280.0, + "grad_norm": 1.6547666669933128, + "language_loss": 0.77886164, + "learning_rate": 2.586168879961155e-06, + "loss": 0.85618538, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.1550293, + "step": 7038, + "time_per_iteration": 2.547067165374756 + }, + { + "auxiliary_loss_clip": 0.06470759, + "auxiliary_loss_mlp": 0.01270751, + "balance_loss_clip": 0.06292742, + "balance_loss_mlp": 0.01255432, + "epoch": 0.4232075755298362, + "flos": 14981161415040.0, + "grad_norm": 2.6561544689274714, + "language_loss": 0.67851424, + "learning_rate": 2.585796509770259e-06, + "loss": 0.75592935, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15301514, + "step": 7039, + "time_per_iteration": 2.5148706436157227 + }, + { + "auxiliary_loss_clip": 0.06471442, + "auxiliary_loss_mlp": 0.01274269, + "balance_loss_clip": 0.06291762, + "balance_loss_mlp": 0.01258962, + "epoch": 0.42326769878250414, + "flos": 24539144474880.0, + "grad_norm": 1.5526791387199284, + "language_loss": 0.75859225, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.83604932, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.15307617, + "step": 7040, + "time_per_iteration": 2.6170670986175537 + }, + { + "auxiliary_loss_clip": 0.0646336, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06288165, + "balance_loss_mlp": 0.01253199, + "epoch": 0.4233278220351721, + "flos": 26877603421440.0, + "grad_norm": 2.185572961013026, + "language_loss": 0.65619481, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.73350751, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14715576, + "step": 7041, + "time_per_iteration": 2.5701920986175537 + }, + { + "auxiliary_loss_clip": 0.06470653, + "auxiliary_loss_mlp": 0.01271372, + "balance_loss_clip": 0.06294046, + "balance_loss_mlp": 0.01256626, + "epoch": 0.4233879452878401, + "flos": 42824951867520.0, + "grad_norm": 2.182989579985364, + "language_loss": 0.73763824, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.81505847, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.14752197, + "step": 7042, + "time_per_iteration": 2.7377729415893555 + }, + { + "auxiliary_loss_clip": 0.06466709, + "auxiliary_loss_mlp": 0.01270508, + "balance_loss_clip": 0.06294659, + "balance_loss_mlp": 0.01256119, + "epoch": 0.42344806854050804, + "flos": 25236125936640.0, + "grad_norm": 1.357775127981886, + "language_loss": 0.82479644, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.90216863, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.14379883, + "step": 7043, + "time_per_iteration": 2.6002635955810547 + }, + { + "auxiliary_loss_clip": 0.06466006, + "auxiliary_loss_mlp": 0.01268509, + "balance_loss_clip": 0.06294385, + "balance_loss_mlp": 0.01252749, + "epoch": 0.423508191793176, + "flos": 22784587505280.0, + "grad_norm": 2.981661405110402, + "language_loss": 0.65042412, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.1574707, + "step": 7044, + "time_per_iteration": 4.032661437988281 + }, + { + "auxiliary_loss_clip": 0.06473978, + "auxiliary_loss_mlp": 0.01277434, + "balance_loss_clip": 0.06294475, + "balance_loss_mlp": 0.01261793, + "epoch": 0.42356831504584397, + "flos": 34645376799360.0, + "grad_norm": 1.8091896069955142, + "language_loss": 0.74864423, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.82615834, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.15649414, + "step": 7045, + "time_per_iteration": 2.6634554862976074 + }, + { + "auxiliary_loss_clip": 0.06458761, + "auxiliary_loss_mlp": 0.01272071, + "balance_loss_clip": 0.06289783, + "balance_loss_mlp": 0.01258177, + "epoch": 0.42362843829851193, + "flos": 17601487896960.0, + "grad_norm": 2.434331790625752, + "language_loss": 0.8101598, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.88746816, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.13885498, + "step": 7046, + "time_per_iteration": 3.8471035957336426 + }, + { + "auxiliary_loss_clip": 0.06470428, + "auxiliary_loss_mlp": 0.01270077, + "balance_loss_clip": 0.06293224, + "balance_loss_mlp": 0.01255635, + "epoch": 0.4236885615511799, + "flos": 22572390240000.0, + "grad_norm": 1.5654922866483163, + "language_loss": 0.77272886, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.8501339, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.14416504, + "step": 7047, + "time_per_iteration": 2.5323123931884766 + }, + { + "auxiliary_loss_clip": 0.06461948, + "auxiliary_loss_mlp": 0.01271728, + "balance_loss_clip": 0.06291857, + "balance_loss_mlp": 0.01258245, + "epoch": 0.42374868480384786, + "flos": 26476493126400.0, + "grad_norm": 1.7230664508561655, + "language_loss": 0.68109751, + "learning_rate": 2.582444180141098e-06, + "loss": 0.75843424, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13482666, + "step": 7048, + "time_per_iteration": 2.5632970333099365 + }, + { + "auxiliary_loss_clip": 0.06464637, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06289657, + "balance_loss_mlp": 0.01253263, + "epoch": 0.4238088080565159, + "flos": 20375493966720.0, + "grad_norm": 1.6594147848364105, + "language_loss": 0.78005636, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.85738766, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.15234375, + "step": 7049, + "time_per_iteration": 2.5366568565368652 + }, + { + "auxiliary_loss_clip": 0.06468852, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06292627, + "balance_loss_mlp": 0.01256067, + "epoch": 0.42386893130918385, + "flos": 21177379140480.0, + "grad_norm": 1.886460992095426, + "language_loss": 0.83185136, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.90924776, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.1472168, + "step": 7050, + "time_per_iteration": 2.5130441188812256 + }, + { + "auxiliary_loss_clip": 0.06460265, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06286017, + "balance_loss_mlp": 0.01255738, + "epoch": 0.4239290545618518, + "flos": 17681346437760.0, + "grad_norm": 2.0965482043088968, + "language_loss": 0.73218369, + "learning_rate": 2.581326338868687e-06, + "loss": 0.80949646, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.15283203, + "step": 7051, + "time_per_iteration": 3.92645263671875 + }, + { + "auxiliary_loss_clip": 0.06464715, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06291503, + "balance_loss_mlp": 0.01254595, + "epoch": 0.4239891778145198, + "flos": 24321077424000.0, + "grad_norm": 1.57175281695923, + "language_loss": 0.86744994, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.94478583, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.1427002, + "step": 7052, + "time_per_iteration": 2.584425210952759 + }, + { + "auxiliary_loss_clip": 0.06467065, + "auxiliary_loss_mlp": 0.01277353, + "balance_loss_clip": 0.06289236, + "balance_loss_mlp": 0.01262559, + "epoch": 0.42404930106718774, + "flos": 20564700485760.0, + "grad_norm": 1.3965954512003949, + "language_loss": 0.72571224, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.80315644, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.14794922, + "step": 7053, + "time_per_iteration": 2.5454976558685303 + }, + { + "auxiliary_loss_clip": 0.06462884, + "auxiliary_loss_mlp": 0.01267759, + "balance_loss_clip": 0.06288673, + "balance_loss_mlp": 0.01253251, + "epoch": 0.4241094243198557, + "flos": 22314351991680.0, + "grad_norm": 1.5249079777591508, + "language_loss": 0.82902604, + "learning_rate": 2.580208299200704e-06, + "loss": 0.90633249, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.14508057, + "step": 7054, + "time_per_iteration": 4.019419193267822 + }, + { + "auxiliary_loss_clip": 0.06381379, + "auxiliary_loss_mlp": 0.01253973, + "balance_loss_clip": 0.06300146, + "balance_loss_mlp": 0.01250773, + "epoch": 0.4241695475725237, + "flos": 70632445973760.0, + "grad_norm": 0.7904217901105888, + "language_loss": 0.60280955, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.6791631, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03204346, + "step": 7055, + "time_per_iteration": 3.152217388153076 + }, + { + "auxiliary_loss_clip": 0.06467455, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06290264, + "balance_loss_mlp": 0.01252717, + "epoch": 0.42422967082519164, + "flos": 14032640396160.0, + "grad_norm": 2.414100924234879, + "language_loss": 0.77460873, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.85195827, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.14782715, + "step": 7056, + "time_per_iteration": 2.469475746154785 + }, + { + "auxiliary_loss_clip": 0.06476917, + "auxiliary_loss_mlp": 0.01275416, + "balance_loss_clip": 0.06295634, + "balance_loss_mlp": 0.01259013, + "epoch": 0.4242897940778596, + "flos": 22351975274880.0, + "grad_norm": 2.3823515442172187, + "language_loss": 0.84773225, + "learning_rate": 2.579090061518714e-06, + "loss": 0.92525554, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.1640625, + "step": 7057, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.06472223, + "auxiliary_loss_mlp": 0.01277699, + "balance_loss_clip": 0.06293373, + "balance_loss_mlp": 0.01262202, + "epoch": 0.42434991733052757, + "flos": 22601502334080.0, + "grad_norm": 3.5122040291641583, + "language_loss": 0.83485544, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.91235471, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15490723, + "step": 7058, + "time_per_iteration": 2.4998161792755127 + }, + { + "auxiliary_loss_clip": 0.06459209, + "auxiliary_loss_mlp": 0.01270641, + "balance_loss_clip": 0.06288499, + "balance_loss_mlp": 0.01256205, + "epoch": 0.42441004058319554, + "flos": 20017667105280.0, + "grad_norm": 2.0122152391379498, + "language_loss": 0.80975556, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.88705409, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14440918, + "step": 7059, + "time_per_iteration": 2.581310987472534 + }, + { + "auxiliary_loss_clip": 0.06467164, + "auxiliary_loss_mlp": 0.0127411, + "balance_loss_clip": 0.06288522, + "balance_loss_mlp": 0.01258053, + "epoch": 0.4244701638358635, + "flos": 11149663691520.0, + "grad_norm": 2.3594129001130963, + "language_loss": 0.70608068, + "learning_rate": 2.57797162620435e-06, + "loss": 0.7834934, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.16064453, + "step": 7060, + "time_per_iteration": 2.485072612762451 + }, + { + "auxiliary_loss_clip": 0.06469266, + "auxiliary_loss_mlp": 0.01274664, + "balance_loss_clip": 0.06293246, + "balance_loss_mlp": 0.01260317, + "epoch": 0.42453028708853147, + "flos": 23994542862720.0, + "grad_norm": 1.485543893241047, + "language_loss": 0.76297516, + "learning_rate": 2.577598770580562e-06, + "loss": 0.84041446, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.14324951, + "step": 7061, + "time_per_iteration": 2.594430685043335 + }, + { + "auxiliary_loss_clip": 0.06469865, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06291063, + "balance_loss_mlp": 0.01256643, + "epoch": 0.42459041034119943, + "flos": 18412345457280.0, + "grad_norm": 1.9822246970542112, + "language_loss": 0.72630441, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.80371881, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14935303, + "step": 7062, + "time_per_iteration": 2.64372181892395 + }, + { + "auxiliary_loss_clip": 0.06460352, + "auxiliary_loss_mlp": 0.01277188, + "balance_loss_clip": 0.06284757, + "balance_loss_mlp": 0.01262215, + "epoch": 0.42465053359386745, + "flos": 20964049845120.0, + "grad_norm": 2.6818567528078923, + "language_loss": 0.66330427, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.74067968, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.1496582, + "step": 7063, + "time_per_iteration": 2.5413248538970947 + }, + { + "auxiliary_loss_clip": 0.06452604, + "auxiliary_loss_mlp": 0.01267624, + "balance_loss_clip": 0.062814, + "balance_loss_mlp": 0.01254195, + "epoch": 0.4247106568465354, + "flos": 33114001979520.0, + "grad_norm": 1.5147527354116395, + "language_loss": 0.78917265, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.86637491, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13446045, + "step": 7064, + "time_per_iteration": 2.610231876373291 + }, + { + "auxiliary_loss_clip": 0.06469544, + "auxiliary_loss_mlp": 0.01271013, + "balance_loss_clip": 0.06291715, + "balance_loss_mlp": 0.01256267, + "epoch": 0.4247707800992034, + "flos": 20052984401280.0, + "grad_norm": 1.8682780470126852, + "language_loss": 0.75125778, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.82866335, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.14733887, + "step": 7065, + "time_per_iteration": 2.583846092224121 + }, + { + "auxiliary_loss_clip": 0.06463289, + "auxiliary_loss_mlp": 0.01272027, + "balance_loss_clip": 0.06289071, + "balance_loss_mlp": 0.01256971, + "epoch": 0.42483090335187135, + "flos": 22392114180480.0, + "grad_norm": 1.5143179334948575, + "language_loss": 0.72187293, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.79922605, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1505127, + "step": 7066, + "time_per_iteration": 2.5569074153900146 + }, + { + "auxiliary_loss_clip": 0.06467879, + "auxiliary_loss_mlp": 0.01269525, + "balance_loss_clip": 0.06290474, + "balance_loss_mlp": 0.01254231, + "epoch": 0.4248910266045393, + "flos": 21362518736640.0, + "grad_norm": 2.6158792173392484, + "language_loss": 0.79757857, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.87495261, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.15289307, + "step": 7067, + "time_per_iteration": 2.5845797061920166 + }, + { + "auxiliary_loss_clip": 0.06384341, + "auxiliary_loss_mlp": 0.01254549, + "balance_loss_clip": 0.06303053, + "balance_loss_mlp": 0.01250746, + "epoch": 0.4249511498572073, + "flos": 64026942180480.0, + "grad_norm": 1.3506219442036578, + "language_loss": 0.63354319, + "learning_rate": 2.574988168733022e-06, + "loss": 0.70993209, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03796387, + "step": 7068, + "time_per_iteration": 3.082864284515381 + }, + { + "auxiliary_loss_clip": 0.06464778, + "auxiliary_loss_mlp": 0.0127101, + "balance_loss_clip": 0.06287815, + "balance_loss_mlp": 0.01255155, + "epoch": 0.42501127310987524, + "flos": 19612699522560.0, + "grad_norm": 2.0360912712095875, + "language_loss": 0.72778141, + "learning_rate": 2.574615138284361e-06, + "loss": 0.8051393, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15844727, + "step": 7069, + "time_per_iteration": 2.560899257659912 + }, + { + "auxiliary_loss_clip": 0.06466071, + "auxiliary_loss_mlp": 0.01271316, + "balance_loss_clip": 0.06289013, + "balance_loss_mlp": 0.01255378, + "epoch": 0.4250713963625432, + "flos": 19468160029440.0, + "grad_norm": 2.1627827730841074, + "language_loss": 0.79640651, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.87378043, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.15930176, + "step": 7070, + "time_per_iteration": 2.507615327835083 + }, + { + "auxiliary_loss_clip": 0.06461551, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06285524, + "balance_loss_mlp": 0.01255117, + "epoch": 0.4251315196152112, + "flos": 25344719228160.0, + "grad_norm": 1.9437385428250697, + "language_loss": 0.70912981, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7864511, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15454102, + "step": 7071, + "time_per_iteration": 2.5730371475219727 + }, + { + "auxiliary_loss_clip": 0.06465049, + "auxiliary_loss_mlp": 0.01271451, + "balance_loss_clip": 0.06289509, + "balance_loss_mlp": 0.01256896, + "epoch": 0.42519164286787914, + "flos": 26366348534400.0, + "grad_norm": 2.618295142810269, + "language_loss": 0.71212989, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.78949487, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.14544678, + "step": 7072, + "time_per_iteration": 2.5560264587402344 + }, + { + "auxiliary_loss_clip": 0.06469329, + "auxiliary_loss_mlp": 0.01270547, + "balance_loss_clip": 0.06289761, + "balance_loss_mlp": 0.01256182, + "epoch": 0.4252517661205471, + "flos": 26038220745600.0, + "grad_norm": 1.647981639391401, + "language_loss": 0.81448823, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.89188695, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.14385986, + "step": 7073, + "time_per_iteration": 2.5955123901367188 + }, + { + "auxiliary_loss_clip": 0.06462769, + "auxiliary_loss_mlp": 0.01273163, + "balance_loss_clip": 0.06288294, + "balance_loss_mlp": 0.01259204, + "epoch": 0.42531188937321507, + "flos": 12718536013440.0, + "grad_norm": 2.653395632366352, + "language_loss": 0.91860557, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.99596488, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.1395874, + "step": 7074, + "time_per_iteration": 2.4894237518310547 + }, + { + "auxiliary_loss_clip": 0.06467288, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06287881, + "balance_loss_mlp": 0.0125827, + "epoch": 0.42537201262588303, + "flos": 22098339365760.0, + "grad_norm": 1.877755960639547, + "language_loss": 0.64814276, + "learning_rate": 2.572376498508805e-06, + "loss": 0.72554648, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.14807129, + "step": 7075, + "time_per_iteration": 2.598754644393921 + }, + { + "auxiliary_loss_clip": 0.06455241, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06284718, + "balance_loss_mlp": 0.01255246, + "epoch": 0.42543213587855105, + "flos": 23009824080000.0, + "grad_norm": 2.0883967049140666, + "language_loss": 0.74251705, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.81976461, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1427002, + "step": 7076, + "time_per_iteration": 2.537986993789673 + }, + { + "auxiliary_loss_clip": 0.0646292, + "auxiliary_loss_mlp": 0.01270865, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256334, + "epoch": 0.425492259131219, + "flos": 25089448164480.0, + "grad_norm": 3.3689754116422335, + "language_loss": 0.79212517, + "learning_rate": 2.571630111462766e-06, + "loss": 0.86946297, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.14520264, + "step": 7077, + "time_per_iteration": 2.6490280628204346 + }, + { + "auxiliary_loss_clip": 0.06455311, + "auxiliary_loss_mlp": 0.01267846, + "balance_loss_clip": 0.06287791, + "balance_loss_mlp": 0.01254721, + "epoch": 0.425552382383887, + "flos": 22822881621120.0, + "grad_norm": 1.7167135286528112, + "language_loss": 0.7317155, + "learning_rate": 2.571256885418265e-06, + "loss": 0.80894709, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13116455, + "step": 7078, + "time_per_iteration": 2.5729281902313232 + }, + { + "auxiliary_loss_clip": 0.06459501, + "auxiliary_loss_mlp": 0.01269381, + "balance_loss_clip": 0.06290293, + "balance_loss_mlp": 0.01256173, + "epoch": 0.42561250563655495, + "flos": 13558757230080.0, + "grad_norm": 1.6803598980459025, + "language_loss": 0.80183727, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.87912607, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13201904, + "step": 7079, + "time_per_iteration": 2.4937188625335693 + }, + { + "auxiliary_loss_clip": 0.06460771, + "auxiliary_loss_mlp": 0.0127097, + "balance_loss_clip": 0.06287594, + "balance_loss_mlp": 0.01257481, + "epoch": 0.4256726288892229, + "flos": 46989692478720.0, + "grad_norm": 1.4689183555154843, + "language_loss": 0.71987867, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.79719609, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.13500977, + "step": 7080, + "time_per_iteration": 2.774247884750366 + }, + { + "auxiliary_loss_clip": 0.06462272, + "auxiliary_loss_mlp": 0.01269683, + "balance_loss_clip": 0.0628937, + "balance_loss_mlp": 0.01256505, + "epoch": 0.4257327521418909, + "flos": 23593181005440.0, + "grad_norm": 1.9610396393278133, + "language_loss": 0.80520535, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.88252497, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.13165283, + "step": 7081, + "time_per_iteration": 2.53387451171875 + }, + { + "auxiliary_loss_clip": 0.06452817, + "auxiliary_loss_mlp": 0.01271536, + "balance_loss_clip": 0.06286353, + "balance_loss_mlp": 0.01257844, + "epoch": 0.42579287539455885, + "flos": 18996079726080.0, + "grad_norm": 1.496926936820616, + "language_loss": 0.81558168, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.89282513, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13702393, + "step": 7082, + "time_per_iteration": 2.50972580909729 + }, + { + "auxiliary_loss_clip": 0.06462308, + "auxiliary_loss_mlp": 0.01271701, + "balance_loss_clip": 0.06289167, + "balance_loss_mlp": 0.0125745, + "epoch": 0.4258529986472268, + "flos": 25198921923840.0, + "grad_norm": 1.6583429285627758, + "language_loss": 0.70258069, + "learning_rate": 2.569390430547065e-06, + "loss": 0.77992082, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14251709, + "step": 7083, + "time_per_iteration": 2.543390989303589 + }, + { + "auxiliary_loss_clip": 0.06373302, + "auxiliary_loss_mlp": 0.01258345, + "balance_loss_clip": 0.06290752, + "balance_loss_mlp": 0.01254316, + "epoch": 0.4259131218998948, + "flos": 69990277881600.0, + "grad_norm": 0.8555028711944374, + "language_loss": 0.67011017, + "learning_rate": 2.569017074742173e-06, + "loss": 0.74642664, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.0402832, + "step": 7084, + "time_per_iteration": 4.592621803283691 + }, + { + "auxiliary_loss_clip": 0.0645996, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06287397, + "balance_loss_mlp": 0.01259348, + "epoch": 0.42597324515256274, + "flos": 18010899745920.0, + "grad_norm": 6.078178213614668, + "language_loss": 0.78467649, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.86201096, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14135742, + "step": 7085, + "time_per_iteration": 4.053593635559082 + }, + { + "auxiliary_loss_clip": 0.0647409, + "auxiliary_loss_mlp": 0.01277113, + "balance_loss_clip": 0.0629435, + "balance_loss_mlp": 0.01262158, + "epoch": 0.4260333684052307, + "flos": 15164204659200.0, + "grad_norm": 2.149155774842141, + "language_loss": 0.7699095, + "learning_rate": 2.568270298414995e-06, + "loss": 0.84742153, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.1496582, + "step": 7086, + "time_per_iteration": 2.480053424835205 + }, + { + "auxiliary_loss_clip": 0.06458418, + "auxiliary_loss_mlp": 0.01275137, + "balance_loss_clip": 0.06286179, + "balance_loss_mlp": 0.01260129, + "epoch": 0.42609349165789867, + "flos": 14944628234880.0, + "grad_norm": 1.8417550415955477, + "language_loss": 0.80286872, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.88020432, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.15026855, + "step": 7087, + "time_per_iteration": 2.5487940311431885 + }, + { + "auxiliary_loss_clip": 0.06464538, + "auxiliary_loss_mlp": 0.01271303, + "balance_loss_clip": 0.06291935, + "balance_loss_mlp": 0.01257183, + "epoch": 0.42615361491056664, + "flos": 23738642893440.0, + "grad_norm": 2.1069826106325213, + "language_loss": 0.66537511, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.7427336, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.14111328, + "step": 7088, + "time_per_iteration": 2.5807759761810303 + }, + { + "auxiliary_loss_clip": 0.06470972, + "auxiliary_loss_mlp": 0.01274052, + "balance_loss_clip": 0.06293773, + "balance_loss_mlp": 0.01260402, + "epoch": 0.42621373816323466, + "flos": 24943399297920.0, + "grad_norm": 2.133950232933384, + "language_loss": 0.69013214, + "learning_rate": 2.56714997234313e-06, + "loss": 0.76758242, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.13665771, + "step": 7089, + "time_per_iteration": 2.5817432403564453 + }, + { + "auxiliary_loss_clip": 0.06463064, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.0628805, + "balance_loss_mlp": 0.0125598, + "epoch": 0.4262738614159026, + "flos": 13558044470400.0, + "grad_norm": 4.212045379455766, + "language_loss": 0.74597216, + "learning_rate": 2.566776487287525e-06, + "loss": 0.82330406, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14141846, + "step": 7090, + "time_per_iteration": 3.9426205158233643 + }, + { + "auxiliary_loss_clip": 0.06464858, + "auxiliary_loss_mlp": 0.01272944, + "balance_loss_clip": 0.06287836, + "balance_loss_mlp": 0.01259211, + "epoch": 0.4263339846685706, + "flos": 29755926224640.0, + "grad_norm": 2.684790824023287, + "language_loss": 0.75386477, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.8312428, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.13745117, + "step": 7091, + "time_per_iteration": 2.563892126083374 + }, + { + "auxiliary_loss_clip": 0.0645293, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06285767, + "balance_loss_mlp": 0.01257278, + "epoch": 0.42639410792123855, + "flos": 16839406212480.0, + "grad_norm": 1.8445868770478253, + "language_loss": 0.82496071, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.90218395, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.12121582, + "step": 7092, + "time_per_iteration": 2.55583119392395 + }, + { + "auxiliary_loss_clip": 0.06467807, + "auxiliary_loss_mlp": 0.01271484, + "balance_loss_clip": 0.06288138, + "balance_loss_mlp": 0.01257567, + "epoch": 0.4264542311739065, + "flos": 28769991557760.0, + "grad_norm": 1.5226511822280566, + "language_loss": 0.73850381, + "learning_rate": 2.565655903224038e-06, + "loss": 0.81589675, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.13922119, + "step": 7093, + "time_per_iteration": 4.021864414215088 + }, + { + "auxiliary_loss_clip": 0.06460725, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06287876, + "balance_loss_mlp": 0.01254512, + "epoch": 0.4265143544265745, + "flos": 24719881731840.0, + "grad_norm": 2.2430846112789617, + "language_loss": 0.70883787, + "learning_rate": 2.565282332284532e-06, + "loss": 0.78613305, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14300537, + "step": 7094, + "time_per_iteration": 2.5826168060302734 + }, + { + "auxiliary_loss_clip": 0.06461484, + "auxiliary_loss_mlp": 0.01268246, + "balance_loss_clip": 0.06287476, + "balance_loss_mlp": 0.0125381, + "epoch": 0.42657447767924245, + "flos": 21871467636480.0, + "grad_norm": 1.4959257312535472, + "language_loss": 0.81979394, + "learning_rate": 2.564908739909464e-06, + "loss": 0.89709127, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.14428711, + "step": 7095, + "time_per_iteration": 2.5714282989501953 + }, + { + "auxiliary_loss_clip": 0.06464021, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06287175, + "balance_loss_mlp": 0.01255831, + "epoch": 0.4266346009319104, + "flos": 21476604470400.0, + "grad_norm": 2.7630559086257533, + "language_loss": 0.80476701, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.88211161, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1461792, + "step": 7096, + "time_per_iteration": 2.52101731300354 + }, + { + "auxiliary_loss_clip": 0.06471846, + "auxiliary_loss_mlp": 0.0126828, + "balance_loss_clip": 0.06290311, + "balance_loss_mlp": 0.01253946, + "epoch": 0.4266947241845784, + "flos": 25526295025920.0, + "grad_norm": 2.003429077322888, + "language_loss": 0.65857691, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.73597825, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.14331055, + "step": 7097, + "time_per_iteration": 2.6010050773620605 + }, + { + "auxiliary_loss_clip": 0.0645384, + "auxiliary_loss_mlp": 0.01272923, + "balance_loss_clip": 0.06282586, + "balance_loss_mlp": 0.01259601, + "epoch": 0.42675484743724634, + "flos": 26548343602560.0, + "grad_norm": 1.7498935394273216, + "language_loss": 0.75170088, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.82896858, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13317871, + "step": 7098, + "time_per_iteration": 2.5674946308135986 + }, + { + "auxiliary_loss_clip": 0.06458846, + "auxiliary_loss_mlp": 0.01274446, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01260033, + "epoch": 0.4268149706899143, + "flos": 23119465547520.0, + "grad_norm": 1.6850998762786562, + "language_loss": 0.75184697, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.82917988, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.14428711, + "step": 7099, + "time_per_iteration": 2.5784735679626465 + }, + { + "auxiliary_loss_clip": 0.06459826, + "auxiliary_loss_mlp": 0.01273278, + "balance_loss_clip": 0.06283994, + "balance_loss_mlp": 0.01259116, + "epoch": 0.4268750939425823, + "flos": 22712401612800.0, + "grad_norm": 2.0765509228592802, + "language_loss": 0.83059096, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.90792197, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.14172363, + "step": 7100, + "time_per_iteration": 2.520923614501953 + }, + { + "auxiliary_loss_clip": 0.06459752, + "auxiliary_loss_mlp": 0.01269142, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01255839, + "epoch": 0.42693521719525024, + "flos": 25382007095040.0, + "grad_norm": 1.4351436052366604, + "language_loss": 0.82259512, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8998841, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.13311768, + "step": 7101, + "time_per_iteration": 2.595768451690674 + }, + { + "auxiliary_loss_clip": 0.06466523, + "auxiliary_loss_mlp": 0.01273606, + "balance_loss_clip": 0.06287891, + "balance_loss_mlp": 0.01259099, + "epoch": 0.42699534044791826, + "flos": 18156613196160.0, + "grad_norm": 2.266580923573967, + "language_loss": 0.72800845, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.80540979, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.14501953, + "step": 7102, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.06457532, + "auxiliary_loss_mlp": 0.0127168, + "balance_loss_clip": 0.06287985, + "balance_loss_mlp": 0.01257935, + "epoch": 0.4270554637005862, + "flos": 13703422504320.0, + "grad_norm": 2.1781975733094936, + "language_loss": 0.83514953, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.91244167, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.13751221, + "step": 7103, + "time_per_iteration": 2.506204128265381 + }, + { + "auxiliary_loss_clip": 0.06465043, + "auxiliary_loss_mlp": 0.01274672, + "balance_loss_clip": 0.0628773, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4271155869532542, + "flos": 17499351369600.0, + "grad_norm": 2.042502996026563, + "language_loss": 0.73773789, + "learning_rate": 2.561545446271294e-06, + "loss": 0.815135, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14916992, + "step": 7104, + "time_per_iteration": 2.5006070137023926 + }, + { + "auxiliary_loss_clip": 0.06459317, + "auxiliary_loss_mlp": 0.01274322, + "balance_loss_clip": 0.0628491, + "balance_loss_mlp": 0.01260494, + "epoch": 0.42717571020592215, + "flos": 32460471659520.0, + "grad_norm": 3.22189729136274, + "language_loss": 0.75052768, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.82786405, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.13830566, + "step": 7105, + "time_per_iteration": 2.607759475708008 + }, + { + "auxiliary_loss_clip": 0.06461999, + "auxiliary_loss_mlp": 0.01274519, + "balance_loss_clip": 0.06286199, + "balance_loss_mlp": 0.01261168, + "epoch": 0.4272358334585901, + "flos": 16258606836480.0, + "grad_norm": 17.703344591331568, + "language_loss": 0.77349067, + "learning_rate": 2.560797813088819e-06, + "loss": 0.85085583, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.13354492, + "step": 7106, + "time_per_iteration": 2.4834203720092773 + }, + { + "auxiliary_loss_clip": 0.06461152, + "auxiliary_loss_mlp": 0.01276721, + "balance_loss_clip": 0.06287872, + "balance_loss_mlp": 0.01262499, + "epoch": 0.4272959567112581, + "flos": 24205817733120.0, + "grad_norm": 1.9445558892844073, + "language_loss": 0.8013317, + "learning_rate": 2.560423964592229e-06, + "loss": 0.87871039, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14233398, + "step": 7107, + "time_per_iteration": 2.5639657974243164 + }, + { + "auxiliary_loss_clip": 0.06454289, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.06283173, + "balance_loss_mlp": 0.01253424, + "epoch": 0.42735607996392605, + "flos": 27970747787520.0, + "grad_norm": 1.710799907332892, + "language_loss": 0.68469441, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.76191515, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.14349365, + "step": 7108, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.06460684, + "auxiliary_loss_mlp": 0.01273244, + "balance_loss_clip": 0.06285615, + "balance_loss_mlp": 0.01258712, + "epoch": 0.427416203216594, + "flos": 20300582816640.0, + "grad_norm": 2.1700047707431342, + "language_loss": 0.72192961, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.79926884, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.14520264, + "step": 7109, + "time_per_iteration": 2.5418453216552734 + }, + { + "auxiliary_loss_clip": 0.06462875, + "auxiliary_loss_mlp": 0.01279728, + "balance_loss_clip": 0.06288399, + "balance_loss_mlp": 0.01264159, + "epoch": 0.427476326469262, + "flos": 26951382541440.0, + "grad_norm": 2.7192306397859034, + "language_loss": 0.64651388, + "learning_rate": 2.559302291651174e-06, + "loss": 0.7239399, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.15551758, + "step": 7110, + "time_per_iteration": 2.6708264350891113 + }, + { + "auxiliary_loss_clip": 0.06457267, + "auxiliary_loss_mlp": 0.01278945, + "balance_loss_clip": 0.06284395, + "balance_loss_mlp": 0.01264056, + "epoch": 0.42753644972192995, + "flos": 25709967175680.0, + "grad_norm": 2.127603657525877, + "language_loss": 0.76798368, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.84534585, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.14880371, + "step": 7111, + "time_per_iteration": 2.678954601287842 + }, + { + "auxiliary_loss_clip": 0.0646024, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06282812, + "balance_loss_mlp": 0.01255352, + "epoch": 0.4275965729745979, + "flos": 18772855649280.0, + "grad_norm": 1.9451066993795918, + "language_loss": 0.73479104, + "learning_rate": 2.558554403622845e-06, + "loss": 0.81209064, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.1439209, + "step": 7112, + "time_per_iteration": 2.4913687705993652 + }, + { + "auxiliary_loss_clip": 0.06453889, + "auxiliary_loss_mlp": 0.01274214, + "balance_loss_clip": 0.06283249, + "balance_loss_mlp": 0.01260248, + "epoch": 0.4276566962272659, + "flos": 23770438318080.0, + "grad_norm": 1.6965987454612683, + "language_loss": 0.71646041, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.79374146, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.13964844, + "step": 7113, + "time_per_iteration": 2.567722797393799 + }, + { + "auxiliary_loss_clip": 0.06462316, + "auxiliary_loss_mlp": 0.01277106, + "balance_loss_clip": 0.06286302, + "balance_loss_mlp": 0.01262157, + "epoch": 0.42771681947993384, + "flos": 22499156171520.0, + "grad_norm": 1.507728091462329, + "language_loss": 0.61987239, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.69726658, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.14953613, + "step": 7114, + "time_per_iteration": 2.5800352096557617 + }, + { + "auxiliary_loss_clip": 0.06466354, + "auxiliary_loss_mlp": 0.01281834, + "balance_loss_clip": 0.06284335, + "balance_loss_mlp": 0.01264895, + "epoch": 0.42777694273260186, + "flos": 25051489464960.0, + "grad_norm": 1.9424022728130763, + "language_loss": 0.64557558, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.72305751, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.16943359, + "step": 7115, + "time_per_iteration": 2.625234603881836 + }, + { + "auxiliary_loss_clip": 0.06458592, + "auxiliary_loss_mlp": 0.01271806, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01257668, + "epoch": 0.4278370659852698, + "flos": 18667532666880.0, + "grad_norm": 1.4802584121928888, + "language_loss": 0.73841792, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.81572187, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.14141846, + "step": 7116, + "time_per_iteration": 2.517512798309326 + }, + { + "auxiliary_loss_clip": 0.06453552, + "auxiliary_loss_mlp": 0.0127651, + "balance_loss_clip": 0.06284202, + "balance_loss_mlp": 0.01262461, + "epoch": 0.4278971892379378, + "flos": 27315666167040.0, + "grad_norm": 1.6819154869474044, + "language_loss": 0.69691694, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.77421755, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14044189, + "step": 7117, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06455907, + "auxiliary_loss_mlp": 0.01274379, + "balance_loss_clip": 0.06285148, + "balance_loss_mlp": 0.0126008, + "epoch": 0.42795731249060576, + "flos": 12892397235840.0, + "grad_norm": 2.190420439429125, + "language_loss": 0.69763142, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.77493429, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14306641, + "step": 7118, + "time_per_iteration": 2.480435609817505 + }, + { + "auxiliary_loss_clip": 0.06457028, + "auxiliary_loss_mlp": 0.01277321, + "balance_loss_clip": 0.06285428, + "balance_loss_mlp": 0.01262109, + "epoch": 0.4280174357432737, + "flos": 33409873146240.0, + "grad_norm": 2.392758427844577, + "language_loss": 0.74691743, + "learning_rate": 2.55593612908444e-06, + "loss": 0.82426095, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.15197754, + "step": 7119, + "time_per_iteration": 2.633418083190918 + }, + { + "auxiliary_loss_clip": 0.06453852, + "auxiliary_loss_mlp": 0.01276265, + "balance_loss_clip": 0.06282485, + "balance_loss_mlp": 0.0126134, + "epoch": 0.4280775589959417, + "flos": 18264871071360.0, + "grad_norm": 2.26485992413173, + "language_loss": 0.75017536, + "learning_rate": 2.555562005426573e-06, + "loss": 0.8274765, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.14916992, + "step": 7120, + "time_per_iteration": 2.4857230186462402 + }, + { + "auxiliary_loss_clip": 0.06459665, + "auxiliary_loss_mlp": 0.01279872, + "balance_loss_clip": 0.062869, + "balance_loss_mlp": 0.01265883, + "epoch": 0.42813768224860965, + "flos": 21477820354560.0, + "grad_norm": 1.904077899556691, + "language_loss": 0.77223492, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.8496303, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13989258, + "step": 7121, + "time_per_iteration": 2.547011375427246 + }, + { + "auxiliary_loss_clip": 0.06450777, + "auxiliary_loss_mlp": 0.01281298, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01267255, + "epoch": 0.4281978055012776, + "flos": 15674704859520.0, + "grad_norm": 1.7733631777850345, + "language_loss": 0.85767531, + "learning_rate": 2.554813694924126e-06, + "loss": 0.93499613, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.14056396, + "step": 7122, + "time_per_iteration": 2.488633155822754 + }, + { + "auxiliary_loss_clip": 0.06454846, + "auxiliary_loss_mlp": 0.01275392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01261022, + "epoch": 0.4282579287539456, + "flos": 17717711909760.0, + "grad_norm": 2.3186837977879886, + "language_loss": 0.8157897, + "learning_rate": 2.554439508107921e-06, + "loss": 0.89309216, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14355469, + "step": 7123, + "time_per_iteration": 3.969069719314575 + }, + { + "auxiliary_loss_clip": 0.06453736, + "auxiliary_loss_mlp": 0.01276304, + "balance_loss_clip": 0.06284729, + "balance_loss_mlp": 0.01262034, + "epoch": 0.42831805200661355, + "flos": 19287171210240.0, + "grad_norm": 1.594767030772038, + "language_loss": 0.80927598, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.88657635, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14257812, + "step": 7124, + "time_per_iteration": 3.901512861251831 + }, + { + "auxiliary_loss_clip": 0.06454194, + "auxiliary_loss_mlp": 0.01273804, + "balance_loss_clip": 0.06283361, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4283781752592815, + "flos": 19798845367680.0, + "grad_norm": 1.7493536594312618, + "language_loss": 0.81056678, + "learning_rate": 2.553691071416498e-06, + "loss": 0.88784677, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.15484619, + "step": 7125, + "time_per_iteration": 2.561479091644287 + }, + { + "auxiliary_loss_clip": 0.06453275, + "auxiliary_loss_mlp": 0.0127252, + "balance_loss_clip": 0.06283629, + "balance_loss_mlp": 0.01259467, + "epoch": 0.4284382985119495, + "flos": 16513584410880.0, + "grad_norm": 2.012470201752393, + "language_loss": 0.75256401, + "learning_rate": 2.553316821569659e-06, + "loss": 0.829822, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.13037109, + "step": 7126, + "time_per_iteration": 2.550835371017456 + }, + { + "auxiliary_loss_clip": 0.06454661, + "auxiliary_loss_mlp": 0.01269423, + "balance_loss_clip": 0.06280357, + "balance_loss_mlp": 0.01255518, + "epoch": 0.42849842176461744, + "flos": 23337406817280.0, + "grad_norm": 1.7018740006461155, + "language_loss": 0.81619167, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.8934325, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.13916016, + "step": 7127, + "time_per_iteration": 2.512833833694458 + }, + { + "auxiliary_loss_clip": 0.06455937, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06282341, + "balance_loss_mlp": 0.01254659, + "epoch": 0.4285585450172854, + "flos": 17280110361600.0, + "grad_norm": 1.7733778395824964, + "language_loss": 0.76877725, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.84603173, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14837646, + "step": 7128, + "time_per_iteration": 2.54837703704834 + }, + { + "auxiliary_loss_clip": 0.06458156, + "auxiliary_loss_mlp": 0.01271641, + "balance_loss_clip": 0.06282061, + "balance_loss_mlp": 0.01255726, + "epoch": 0.42861866826995343, + "flos": 24286430960640.0, + "grad_norm": 1.8449893243882522, + "language_loss": 0.74647015, + "learning_rate": 2.552193946194937e-06, + "loss": 0.82376814, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.15917969, + "step": 7129, + "time_per_iteration": 2.5513017177581787 + }, + { + "auxiliary_loss_clip": 0.06454159, + "auxiliary_loss_mlp": 0.0127295, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.01258949, + "epoch": 0.4286787915226214, + "flos": 24360042372480.0, + "grad_norm": 1.8999084688655365, + "language_loss": 0.7830866, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.86035764, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14007568, + "step": 7130, + "time_per_iteration": 3.9916892051696777 + }, + { + "auxiliary_loss_clip": 0.06456774, + "auxiliary_loss_mlp": 0.01278579, + "balance_loss_clip": 0.06282126, + "balance_loss_mlp": 0.01263618, + "epoch": 0.42873891477528936, + "flos": 15455338070400.0, + "grad_norm": 2.1626861971351263, + "language_loss": 0.73881406, + "learning_rate": 2.551445257891886e-06, + "loss": 0.81616759, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.1496582, + "step": 7131, + "time_per_iteration": 2.504786252975464 + }, + { + "auxiliary_loss_clip": 0.06455156, + "auxiliary_loss_mlp": 0.01273453, + "balance_loss_clip": 0.06282241, + "balance_loss_mlp": 0.01258183, + "epoch": 0.4287990380279573, + "flos": 17645358309120.0, + "grad_norm": 2.0546861067047533, + "language_loss": 0.77884281, + "learning_rate": 2.551070882366973e-06, + "loss": 0.85612893, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15270996, + "step": 7132, + "time_per_iteration": 2.5048811435699463 + }, + { + "auxiliary_loss_clip": 0.06456134, + "auxiliary_loss_mlp": 0.01270516, + "balance_loss_clip": 0.06281912, + "balance_loss_mlp": 0.01254542, + "epoch": 0.4288591612806253, + "flos": 27169701154560.0, + "grad_norm": 1.7726331897563596, + "language_loss": 0.78733218, + "learning_rate": 2.550696485945397e-06, + "loss": 0.86459869, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.1595459, + "step": 7133, + "time_per_iteration": 4.068531036376953 + }, + { + "auxiliary_loss_clip": 0.06450784, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06277733, + "balance_loss_mlp": 0.01254785, + "epoch": 0.42891928453329325, + "flos": 17168540250240.0, + "grad_norm": 1.7118267088696246, + "language_loss": 0.7483775, + "learning_rate": 2.550322068641355e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14068604, + "step": 7134, + "time_per_iteration": 2.504011631011963 + }, + { + "auxiliary_loss_clip": 0.06450233, + "auxiliary_loss_mlp": 0.01272762, + "balance_loss_clip": 0.06279828, + "balance_loss_mlp": 0.0125882, + "epoch": 0.4289794077859612, + "flos": 18192936741120.0, + "grad_norm": 1.9195667435408965, + "language_loss": 0.84458339, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.92181337, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13946533, + "step": 7135, + "time_per_iteration": 2.4924819469451904 + }, + { + "auxiliary_loss_clip": 0.06447092, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279005, + "balance_loss_mlp": 0.01253949, + "epoch": 0.4290395310386292, + "flos": 28264438748160.0, + "grad_norm": 2.116473983113214, + "language_loss": 0.754601, + "learning_rate": 2.549573171442666e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14099121, + "step": 7136, + "time_per_iteration": 2.579450845718384 + }, + { + "auxiliary_loss_clip": 0.06453092, + "auxiliary_loss_mlp": 0.01272367, + "balance_loss_clip": 0.06277236, + "balance_loss_mlp": 0.01257895, + "epoch": 0.42909965429129715, + "flos": 16221528604800.0, + "grad_norm": 1.8728665886520197, + "language_loss": 0.79211873, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.86937326, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14465332, + "step": 7137, + "time_per_iteration": 2.485880136489868 + }, + { + "auxiliary_loss_clip": 0.06452384, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06279657, + "balance_loss_mlp": 0.01257359, + "epoch": 0.4291597775439651, + "flos": 23119633255680.0, + "grad_norm": 1.8713356259191796, + "language_loss": 0.76152903, + "learning_rate": 2.548824190884499e-06, + "loss": 0.83877248, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.14605713, + "step": 7138, + "time_per_iteration": 2.5630223751068115 + }, + { + "auxiliary_loss_clip": 0.06367285, + "auxiliary_loss_mlp": 0.01254388, + "balance_loss_clip": 0.06288805, + "balance_loss_mlp": 0.01250711, + "epoch": 0.4292199007966331, + "flos": 67565461703040.0, + "grad_norm": 0.7609122933706777, + "language_loss": 0.5608238, + "learning_rate": 2.548449669381113e-06, + "loss": 0.63704056, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.03668213, + "step": 7139, + "time_per_iteration": 3.0345327854156494 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06282055, + "balance_loss_mlp": 0.01256861, + "epoch": 0.42928002404930105, + "flos": 23006008719360.0, + "grad_norm": 1.7405631209015646, + "language_loss": 0.81563902, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.89282477, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13049316, + "step": 7140, + "time_per_iteration": 2.5697882175445557 + }, + { + "auxiliary_loss_clip": 0.06455392, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.01252543, + "epoch": 0.429340147301969, + "flos": 11549432321280.0, + "grad_norm": 1.8011940744465647, + "language_loss": 0.82215559, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.89938176, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.14678955, + "step": 7141, + "time_per_iteration": 2.4844813346862793 + }, + { + "auxiliary_loss_clip": 0.0646215, + "auxiliary_loss_mlp": 0.0128237, + "balance_loss_clip": 0.06283965, + "balance_loss_mlp": 0.01266336, + "epoch": 0.42940027055463703, + "flos": 25272030211200.0, + "grad_norm": 2.0081644747821947, + "language_loss": 0.86468136, + "learning_rate": 2.547325980144166e-06, + "loss": 0.94212657, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.16027832, + "step": 7142, + "time_per_iteration": 2.570967674255371 + }, + { + "auxiliary_loss_clip": 0.0645667, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06288485, + "balance_loss_mlp": 0.01255596, + "epoch": 0.429460393807305, + "flos": 23811709253760.0, + "grad_norm": 2.010483035293097, + "language_loss": 0.78394985, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.86120784, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13549805, + "step": 7143, + "time_per_iteration": 2.5245959758758545 + }, + { + "auxiliary_loss_clip": 0.06458203, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257689, + "epoch": 0.42952051705997296, + "flos": 13923502053120.0, + "grad_norm": 1.8646185905931467, + "language_loss": 0.77133417, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.84863412, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14117432, + "step": 7144, + "time_per_iteration": 2.5442261695861816 + }, + { + "auxiliary_loss_clip": 0.0645657, + "auxiliary_loss_mlp": 0.01274131, + "balance_loss_clip": 0.06283006, + "balance_loss_mlp": 0.0125973, + "epoch": 0.4295806403126409, + "flos": 26767584610560.0, + "grad_norm": 1.5670382727140026, + "language_loss": 0.74293256, + "learning_rate": 2.54620210411532e-06, + "loss": 0.8202396, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14404297, + "step": 7145, + "time_per_iteration": 2.5812947750091553 + }, + { + "auxiliary_loss_clip": 0.06458145, + "auxiliary_loss_mlp": 0.01276391, + "balance_loss_clip": 0.06281675, + "balance_loss_mlp": 0.01261585, + "epoch": 0.4296407635653089, + "flos": 20957760789120.0, + "grad_norm": 2.084760622121642, + "language_loss": 0.79444236, + "learning_rate": 2.545827437329352e-06, + "loss": 0.87178773, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.14807129, + "step": 7146, + "time_per_iteration": 2.5411908626556396 + }, + { + "auxiliary_loss_clip": 0.0645076, + "auxiliary_loss_mlp": 0.01276231, + "balance_loss_clip": 0.06280234, + "balance_loss_mlp": 0.01262373, + "epoch": 0.42970088681797686, + "flos": 15857915811840.0, + "grad_norm": 1.9977945232207481, + "language_loss": 0.83012491, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.90739477, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13867188, + "step": 7147, + "time_per_iteration": 2.4752652645111084 + }, + { + "auxiliary_loss_clip": 0.06456682, + "auxiliary_loss_mlp": 0.01274227, + "balance_loss_clip": 0.06283284, + "balance_loss_mlp": 0.01258622, + "epoch": 0.4297610100706448, + "flos": 22389179287680.0, + "grad_norm": 1.9494252458685553, + "language_loss": 0.87818855, + "learning_rate": 2.545078041678131e-06, + "loss": 0.95549762, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15612793, + "step": 7148, + "time_per_iteration": 2.5504684448242188 + }, + { + "auxiliary_loss_clip": 0.06459592, + "auxiliary_loss_mlp": 0.0127006, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01255689, + "epoch": 0.4298211333233128, + "flos": 27932705233920.0, + "grad_norm": 1.7901480630114543, + "language_loss": 0.78474885, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.86204541, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.14373779, + "step": 7149, + "time_per_iteration": 2.5467026233673096 + }, + { + "auxiliary_loss_clip": 0.06454438, + "auxiliary_loss_mlp": 0.01275691, + "balance_loss_clip": 0.06285315, + "balance_loss_mlp": 0.01261153, + "epoch": 0.42988125657598075, + "flos": 24432479827200.0, + "grad_norm": 1.6909372302648806, + "language_loss": 0.79794931, + "learning_rate": 2.544328563349256e-06, + "loss": 0.87525058, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14550781, + "step": 7150, + "time_per_iteration": 2.5642549991607666 + }, + { + "auxiliary_loss_clip": 0.06463797, + "auxiliary_loss_mlp": 0.01273266, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01256636, + "epoch": 0.4299413798286487, + "flos": 15855400189440.0, + "grad_norm": 1.6104667865383644, + "language_loss": 0.75438166, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.8317523, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.16638184, + "step": 7151, + "time_per_iteration": 2.47206711769104 + }, + { + "auxiliary_loss_clip": 0.06463672, + "auxiliary_loss_mlp": 0.01271158, + "balance_loss_clip": 0.06284998, + "balance_loss_mlp": 0.01256179, + "epoch": 0.4300015030813167, + "flos": 22316029073280.0, + "grad_norm": 1.9504143763164294, + "language_loss": 0.70926738, + "learning_rate": 2.543579002456406e-06, + "loss": 0.78661567, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.14984131, + "step": 7152, + "time_per_iteration": 2.541208267211914 + }, + { + "auxiliary_loss_clip": 0.06452823, + "auxiliary_loss_mlp": 0.01271847, + "balance_loss_clip": 0.06279409, + "balance_loss_mlp": 0.01257482, + "epoch": 0.43006162633398465, + "flos": 34906391867520.0, + "grad_norm": 1.81395768481921, + "language_loss": 0.7223562, + "learning_rate": 2.54320419108402e-06, + "loss": 0.79960287, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.14361572, + "step": 7153, + "time_per_iteration": 2.6242926120758057 + }, + { + "auxiliary_loss_clip": 0.064519, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06279962, + "balance_loss_mlp": 0.01257018, + "epoch": 0.4301217495866526, + "flos": 15967138008960.0, + "grad_norm": 2.006134184464422, + "language_loss": 0.78977376, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8670066, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14367676, + "step": 7154, + "time_per_iteration": 2.5568442344665527 + }, + { + "auxiliary_loss_clip": 0.06457433, + "auxiliary_loss_mlp": 0.01273105, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01258943, + "epoch": 0.43018187283932063, + "flos": 18776293666560.0, + "grad_norm": 1.5037130128548426, + "language_loss": 0.78947407, + "learning_rate": 2.542454506558389e-06, + "loss": 0.86677945, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14172363, + "step": 7155, + "time_per_iteration": 2.5090463161468506 + }, + { + "auxiliary_loss_clip": 0.06448177, + "auxiliary_loss_mlp": 0.01271989, + "balance_loss_clip": 0.06280203, + "balance_loss_mlp": 0.01258613, + "epoch": 0.4302419960919886, + "flos": 20157007645440.0, + "grad_norm": 4.525310176173048, + "language_loss": 0.89197671, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.96917844, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13397217, + "step": 7156, + "time_per_iteration": 2.5620951652526855 + }, + { + "auxiliary_loss_clip": 0.0645663, + "auxiliary_loss_mlp": 0.01274773, + "balance_loss_clip": 0.06281747, + "balance_loss_mlp": 0.01259836, + "epoch": 0.43030211934465656, + "flos": 26440001873280.0, + "grad_norm": 2.4796677358200423, + "language_loss": 0.82988536, + "learning_rate": 2.541704739753042e-06, + "loss": 0.90719938, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.14929199, + "step": 7157, + "time_per_iteration": 2.5528175830841064 + }, + { + "auxiliary_loss_clip": 0.06457967, + "auxiliary_loss_mlp": 0.01275139, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01258974, + "epoch": 0.43036224259732453, + "flos": 24396114355200.0, + "grad_norm": 1.7333061296854189, + "language_loss": 0.71840358, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.79573464, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.16162109, + "step": 7158, + "time_per_iteration": 2.540012836456299 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.01275077, + "balance_loss_clip": 0.06278417, + "balance_loss_mlp": 0.01260355, + "epoch": 0.4304223658499925, + "flos": 17207421344640.0, + "grad_norm": 2.0047997442662684, + "language_loss": 0.82936633, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.9066118, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14709473, + "step": 7159, + "time_per_iteration": 2.550978183746338 + }, + { + "auxiliary_loss_clip": 0.0645431, + "auxiliary_loss_mlp": 0.01270347, + "balance_loss_clip": 0.06281546, + "balance_loss_mlp": 0.01256048, + "epoch": 0.43048248910266046, + "flos": 14908304689920.0, + "grad_norm": 2.57539664943107, + "language_loss": 0.82999021, + "learning_rate": 2.54057993551933e-06, + "loss": 0.90723681, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.1428833, + "step": 7160, + "time_per_iteration": 2.525343894958496 + }, + { + "auxiliary_loss_clip": 0.0645951, + "auxiliary_loss_mlp": 0.01269507, + "balance_loss_clip": 0.06281772, + "balance_loss_mlp": 0.01252675, + "epoch": 0.4305426123553284, + "flos": 21586245937920.0, + "grad_norm": 3.3699216716451046, + "language_loss": 0.77364504, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.85093522, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.16845703, + "step": 7161, + "time_per_iteration": 2.5307719707489014 + }, + { + "auxiliary_loss_clip": 0.06452791, + "auxiliary_loss_mlp": 0.0127042, + "balance_loss_clip": 0.06280292, + "balance_loss_mlp": 0.01256449, + "epoch": 0.4306027356079964, + "flos": 22607833317120.0, + "grad_norm": 2.044056208596942, + "language_loss": 0.73045391, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.80768597, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.13964844, + "step": 7162, + "time_per_iteration": 2.53442645072937 + }, + { + "auxiliary_loss_clip": 0.06358678, + "auxiliary_loss_mlp": 0.01256162, + "balance_loss_clip": 0.06279682, + "balance_loss_mlp": 0.01252738, + "epoch": 0.43066285886066435, + "flos": 70689873548160.0, + "grad_norm": 0.805422068373614, + "language_loss": 0.58694339, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.66309178, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.03433228, + "step": 7163, + "time_per_iteration": 4.420603036880493 + }, + { + "auxiliary_loss_clip": 0.06450315, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06279671, + "balance_loss_mlp": 0.01257298, + "epoch": 0.4307229821133323, + "flos": 26727236069760.0, + "grad_norm": 1.7043821860128514, + "language_loss": 0.79015797, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.86737275, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.13842773, + "step": 7164, + "time_per_iteration": 4.077051162719727 + }, + { + "auxiliary_loss_clip": 0.0645581, + "auxiliary_loss_mlp": 0.01269266, + "balance_loss_clip": 0.06279337, + "balance_loss_mlp": 0.01254222, + "epoch": 0.4307831053660003, + "flos": 26184311539200.0, + "grad_norm": 1.6263476545367235, + "language_loss": 0.68622434, + "learning_rate": 2.538704852009177e-06, + "loss": 0.76347512, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.1505127, + "step": 7165, + "time_per_iteration": 2.5447044372558594 + }, + { + "auxiliary_loss_clip": 0.06454252, + "auxiliary_loss_mlp": 0.01269461, + "balance_loss_clip": 0.06280573, + "balance_loss_mlp": 0.01254733, + "epoch": 0.43084322861866825, + "flos": 18915298790400.0, + "grad_norm": 2.036386887615401, + "language_loss": 0.75601453, + "learning_rate": 2.538329773967034e-06, + "loss": 0.83325171, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14758301, + "step": 7166, + "time_per_iteration": 2.5380423069000244 + }, + { + "auxiliary_loss_clip": 0.06447423, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06278174, + "balance_loss_mlp": 0.0125401, + "epoch": 0.4309033518713362, + "flos": 26440211508480.0, + "grad_norm": 1.6055464610704053, + "language_loss": 0.72472453, + "learning_rate": 2.537954675511372e-06, + "loss": 0.80187303, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.13415527, + "step": 7167, + "time_per_iteration": 2.581911563873291 + }, + { + "auxiliary_loss_clip": 0.06445278, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01253398, + "epoch": 0.43096347512400424, + "flos": 21219362835840.0, + "grad_norm": 1.5535022771303773, + "language_loss": 0.78678393, + "learning_rate": 2.537579556656414e-06, + "loss": 0.86391199, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14135742, + "step": 7168, + "time_per_iteration": 2.5395426750183105 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.0127075, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01257095, + "epoch": 0.4310235983766722, + "flos": 16544918638080.0, + "grad_norm": 2.3704233546720936, + "language_loss": 0.82314277, + "learning_rate": 2.537204417416387e-06, + "loss": 0.90034759, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13647461, + "step": 7169, + "time_per_iteration": 3.8934504985809326 + }, + { + "auxiliary_loss_clip": 0.06353073, + "auxiliary_loss_mlp": 0.01255187, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01251897, + "epoch": 0.43108372162934017, + "flos": 64794893650560.0, + "grad_norm": 0.6586067859139012, + "language_loss": 0.60826671, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6843493, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.03295898, + "step": 7170, + "time_per_iteration": 3.303295612335205 + }, + { + "auxiliary_loss_clip": 0.06446448, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01253841, + "epoch": 0.43114384488200813, + "flos": 13449241543680.0, + "grad_norm": 1.7965809828184895, + "language_loss": 0.76463991, + "learning_rate": 2.536454077838021e-06, + "loss": 0.84178072, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.13787842, + "step": 7171, + "time_per_iteration": 2.4991650581359863 + }, + { + "auxiliary_loss_clip": 0.06446211, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01253592, + "epoch": 0.4312039681346761, + "flos": 26293911079680.0, + "grad_norm": 1.4736819236139371, + "language_loss": 0.77570975, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.8528471, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13934326, + "step": 7172, + "time_per_iteration": 2.540095567703247 + }, + { + "auxiliary_loss_clip": 0.06448045, + "auxiliary_loss_mlp": 0.01271237, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256449, + "epoch": 0.43126409138734406, + "flos": 20383040833920.0, + "grad_norm": 1.8735364024745536, + "language_loss": 0.76837397, + "learning_rate": 2.535703656890086e-06, + "loss": 0.84556675, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14776611, + "step": 7173, + "time_per_iteration": 3.998828887939453 + }, + { + "auxiliary_loss_clip": 0.06449778, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06280752, + "balance_loss_mlp": 0.0125529, + "epoch": 0.431324214640012, + "flos": 22128918906240.0, + "grad_norm": 1.4124937065278635, + "language_loss": 0.76940411, + "learning_rate": 2.5353284159381e-06, + "loss": 0.84659261, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13800049, + "step": 7174, + "time_per_iteration": 2.510742425918579 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 0.06275856, + "balance_loss_mlp": 0.01256477, + "epoch": 0.43138433789268, + "flos": 15236306697600.0, + "grad_norm": 1.9136821796322663, + "language_loss": 0.82178259, + "learning_rate": 2.534953154686407e-06, + "loss": 0.89898002, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.15185547, + "step": 7175, + "time_per_iteration": 2.5317423343658447 + }, + { + "auxiliary_loss_clip": 0.06456869, + "auxiliary_loss_mlp": 0.01274036, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01256935, + "epoch": 0.43144446114534796, + "flos": 18156151998720.0, + "grad_norm": 2.207412358761708, + "language_loss": 0.74869847, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.82600749, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.17095947, + "step": 7176, + "time_per_iteration": 2.4871389865875244 + }, + { + "auxiliary_loss_clip": 0.0645103, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01255565, + "epoch": 0.4315045843980159, + "flos": 22936506157440.0, + "grad_norm": 1.949576719813971, + "language_loss": 0.73992217, + "learning_rate": 2.534202571340819e-06, + "loss": 0.81713092, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14294434, + "step": 7177, + "time_per_iteration": 2.5317373275756836 + }, + { + "auxiliary_loss_clip": 0.06461225, + "auxiliary_loss_mlp": 0.01270022, + "balance_loss_clip": 0.06277613, + "balance_loss_mlp": 0.01253667, + "epoch": 0.4315647076506839, + "flos": 22133321245440.0, + "grad_norm": 1.7707547745548928, + "language_loss": 0.81576592, + "learning_rate": 2.533827249275387e-06, + "loss": 0.89307833, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.16357422, + "step": 7178, + "time_per_iteration": 2.5210797786712646 + }, + { + "auxiliary_loss_clip": 0.06445872, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01257962, + "epoch": 0.43162483090335185, + "flos": 26878567743360.0, + "grad_norm": 1.4959775860860902, + "language_loss": 0.84818423, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.92535609, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13360596, + "step": 7179, + "time_per_iteration": 2.6229355335235596 + }, + { + "auxiliary_loss_clip": 0.06446353, + "auxiliary_loss_mlp": 0.01270616, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01256096, + "epoch": 0.4316849541560198, + "flos": 13917464559360.0, + "grad_norm": 1.6356598233983888, + "language_loss": 0.75595218, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.83312184, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7180, + "time_per_iteration": 2.4882874488830566 + }, + { + "auxiliary_loss_clip": 0.06450133, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01251023, + "epoch": 0.4317450774086878, + "flos": 16440685758720.0, + "grad_norm": 1.8060434620212955, + "language_loss": 0.81820869, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.89537263, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.15252686, + "step": 7181, + "time_per_iteration": 2.534747838973999 + }, + { + "auxiliary_loss_clip": 0.0644898, + "auxiliary_loss_mlp": 0.0127112, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256189, + "epoch": 0.4318052006613558, + "flos": 20560675489920.0, + "grad_norm": 1.632078496987146, + "language_loss": 0.88980561, + "learning_rate": 2.532325758728165e-06, + "loss": 0.96700662, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.14923096, + "step": 7182, + "time_per_iteration": 2.493427038192749 + }, + { + "auxiliary_loss_clip": 0.06446697, + "auxiliary_loss_mlp": 0.01267064, + "balance_loss_clip": 0.06278539, + "balance_loss_mlp": 0.01254052, + "epoch": 0.43186532391402377, + "flos": 22826613127680.0, + "grad_norm": 1.9212724157627075, + "language_loss": 0.75858486, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.83572245, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13012695, + "step": 7183, + "time_per_iteration": 2.552116870880127 + }, + { + "auxiliary_loss_clip": 0.06451686, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06278371, + "balance_loss_mlp": 0.01253923, + "epoch": 0.43192544716669173, + "flos": 25563624819840.0, + "grad_norm": 1.5103875784905794, + "language_loss": 0.77652711, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.85371935, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.13604736, + "step": 7184, + "time_per_iteration": 2.5299277305603027 + }, + { + "auxiliary_loss_clip": 0.06444119, + "auxiliary_loss_mlp": 0.01269203, + "balance_loss_clip": 0.06279948, + "balance_loss_mlp": 0.01255494, + "epoch": 0.4319855704193597, + "flos": 30962317783680.0, + "grad_norm": 1.4924548432613554, + "language_loss": 0.73502755, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.81216079, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.13720703, + "step": 7185, + "time_per_iteration": 2.5939247608184814 + }, + { + "auxiliary_loss_clip": 0.06455707, + "auxiliary_loss_mlp": 0.01271443, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.0125684, + "epoch": 0.43204569367202766, + "flos": 24244824608640.0, + "grad_norm": 2.4112385113933015, + "language_loss": 0.75683951, + "learning_rate": 2.530823945207421e-06, + "loss": 0.83411103, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.14611816, + "step": 7186, + "time_per_iteration": 2.543679714202881 + }, + { + "auxiliary_loss_clip": 0.06451818, + "auxiliary_loss_mlp": 0.01273087, + "balance_loss_clip": 0.06278853, + "balance_loss_mlp": 0.01259068, + "epoch": 0.43210581692469563, + "flos": 18413058216960.0, + "grad_norm": 2.2976206703160065, + "language_loss": 0.76516449, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.84241354, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.14038086, + "step": 7187, + "time_per_iteration": 2.530064105987549 + }, + { + "auxiliary_loss_clip": 0.06368419, + "auxiliary_loss_mlp": 0.01252589, + "balance_loss_clip": 0.06291005, + "balance_loss_mlp": 0.01249776, + "epoch": 0.4321659401773636, + "flos": 49851718133760.0, + "grad_norm": 0.8382360401327144, + "language_loss": 0.68072379, + "learning_rate": 2.530072917616714e-06, + "loss": 0.75693387, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.02812195, + "step": 7188, + "time_per_iteration": 3.1670610904693604 + }, + { + "auxiliary_loss_clip": 0.06446176, + "auxiliary_loss_mlp": 0.01270026, + "balance_loss_clip": 0.06279401, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43222606343003156, + "flos": 17134229203200.0, + "grad_norm": 1.9056972558163987, + "language_loss": 0.7844317, + "learning_rate": 2.529697373663614e-06, + "loss": 0.86159372, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13110352, + "step": 7189, + "time_per_iteration": 2.491743564605713 + }, + { + "auxiliary_loss_clip": 0.06457567, + "auxiliary_loss_mlp": 0.01270927, + "balance_loss_clip": 0.06278813, + "balance_loss_mlp": 0.01255906, + "epoch": 0.4322861866826995, + "flos": 22756984784640.0, + "grad_norm": 1.8601510823080152, + "language_loss": 0.72126836, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.79855329, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.15020752, + "step": 7190, + "time_per_iteration": 2.5745973587036133 + }, + { + "auxiliary_loss_clip": 0.06452946, + "auxiliary_loss_mlp": 0.01274284, + "balance_loss_clip": 0.06282853, + "balance_loss_mlp": 0.0125992, + "epoch": 0.4323463099353675, + "flos": 27899400435840.0, + "grad_norm": 1.5852812804273753, + "language_loss": 0.79949737, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.87676966, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.14355469, + "step": 7191, + "time_per_iteration": 2.5719873905181885 + }, + { + "auxiliary_loss_clip": 0.06448484, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01255694, + "epoch": 0.43240643318803546, + "flos": 21620892401280.0, + "grad_norm": 3.0880415359088467, + "language_loss": 0.75279927, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.82998139, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.14056396, + "step": 7192, + "time_per_iteration": 2.536587715148926 + }, + { + "auxiliary_loss_clip": 0.0644784, + "auxiliary_loss_mlp": 0.01276118, + "balance_loss_clip": 0.06277698, + "balance_loss_mlp": 0.01260883, + "epoch": 0.4324665564407034, + "flos": 17562774510720.0, + "grad_norm": 2.069328799544239, + "language_loss": 0.79199994, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.86923951, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15216064, + "step": 7193, + "time_per_iteration": 2.483978033065796 + }, + { + "auxiliary_loss_clip": 0.06449077, + "auxiliary_loss_mlp": 0.01277624, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.01263212, + "epoch": 0.4325266796933714, + "flos": 18407775409920.0, + "grad_norm": 2.329186427032778, + "language_loss": 0.76053572, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.83780271, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14404297, + "step": 7194, + "time_per_iteration": 2.5057263374328613 + }, + { + "auxiliary_loss_clip": 0.06451394, + "auxiliary_loss_mlp": 0.01275378, + "balance_loss_clip": 0.06279992, + "balance_loss_mlp": 0.01260847, + "epoch": 0.4325868029460394, + "flos": 22571342064000.0, + "grad_norm": 1.9582306658700896, + "language_loss": 0.60073519, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.67800295, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14532471, + "step": 7195, + "time_per_iteration": 2.5116991996765137 + }, + { + "auxiliary_loss_clip": 0.06458029, + "auxiliary_loss_mlp": 0.01275051, + "balance_loss_clip": 0.06281463, + "balance_loss_mlp": 0.01259989, + "epoch": 0.43264692619870737, + "flos": 14609834046720.0, + "grad_norm": 1.968403141706004, + "language_loss": 0.65685856, + "learning_rate": 2.527068004376515e-06, + "loss": 0.73418939, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.1506958, + "step": 7196, + "time_per_iteration": 2.5037827491760254 + }, + { + "auxiliary_loss_clip": 0.06456476, + "auxiliary_loss_mlp": 0.01272338, + "balance_loss_clip": 0.06280259, + "balance_loss_mlp": 0.01257151, + "epoch": 0.43270704945137534, + "flos": 21507184010880.0, + "grad_norm": 2.17558250449299, + "language_loss": 0.72638965, + "learning_rate": 2.526692300132797e-06, + "loss": 0.8036778, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.15197754, + "step": 7197, + "time_per_iteration": 2.4931299686431885 + }, + { + "auxiliary_loss_clip": 0.0645181, + "auxiliary_loss_mlp": 0.01280731, + "balance_loss_clip": 0.06284913, + "balance_loss_mlp": 0.01265627, + "epoch": 0.4327671727040433, + "flos": 25162975722240.0, + "grad_norm": 1.6800922175899422, + "language_loss": 0.72821289, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.8055383, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15100098, + "step": 7198, + "time_per_iteration": 2.574894428253174 + }, + { + "auxiliary_loss_clip": 0.06448364, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01254969, + "epoch": 0.43282729595671127, + "flos": 25454192987520.0, + "grad_norm": 1.3407856907116962, + "language_loss": 0.8128798, + "learning_rate": 2.525940831742934e-06, + "loss": 0.89005339, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14013672, + "step": 7199, + "time_per_iteration": 2.5314407348632812 + }, + { + "auxiliary_loss_clip": 0.06450363, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.06280895, + "balance_loss_mlp": 0.01255918, + "epoch": 0.43288741920937923, + "flos": 24131661269760.0, + "grad_norm": 2.374744791798318, + "language_loss": 0.68757379, + "learning_rate": 2.525565067625286e-06, + "loss": 0.76477665, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14013672, + "step": 7200, + "time_per_iteration": 2.5569095611572266 + }, + { + "auxiliary_loss_clip": 0.06449814, + "auxiliary_loss_mlp": 0.01269719, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01254925, + "epoch": 0.4329475424620472, + "flos": 19210415270400.0, + "grad_norm": 1.7756006077325563, + "language_loss": 0.87039292, + "learning_rate": 2.525189283578157e-06, + "loss": 0.94758821, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14807129, + "step": 7201, + "time_per_iteration": 2.4946835041046143 + }, + { + "auxiliary_loss_clip": 0.06464264, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 0.06283499, + "balance_loss_mlp": 0.01255016, + "epoch": 0.43300766571471516, + "flos": 22645037329920.0, + "grad_norm": 5.903168179153311, + "language_loss": 0.64564252, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.72300375, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.16845703, + "step": 7202, + "time_per_iteration": 2.5667803287506104 + }, + { + "auxiliary_loss_clip": 0.06448028, + "auxiliary_loss_mlp": 0.01268297, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4330677889673831, + "flos": 22126570992000.0, + "grad_norm": 2.072135817395126, + "language_loss": 0.8230809, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.90024418, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13470459, + "step": 7203, + "time_per_iteration": 5.375681161880493 + }, + { + "auxiliary_loss_clip": 0.06458279, + "auxiliary_loss_mlp": 0.01268927, + "balance_loss_clip": 0.06282033, + "balance_loss_mlp": 0.01254169, + "epoch": 0.4331279122200511, + "flos": 23228184620160.0, + "grad_norm": 2.3968905297379024, + "language_loss": 0.81134045, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.88861251, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.14764404, + "step": 7204, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06450962, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06281083, + "balance_loss_mlp": 0.0125691, + "epoch": 0.43318803547271906, + "flos": 18265625758080.0, + "grad_norm": 2.088854485199162, + "language_loss": 0.7413221, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.81853694, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.13604736, + "step": 7205, + "time_per_iteration": 2.519554376602173 + }, + { + "auxiliary_loss_clip": 0.0644919, + "auxiliary_loss_mlp": 0.01273515, + "balance_loss_clip": 0.06284859, + "balance_loss_mlp": 0.01259908, + "epoch": 0.433248158725387, + "flos": 27425936540160.0, + "grad_norm": 1.5872196628882773, + "language_loss": 0.75603741, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.83326447, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13598633, + "step": 7206, + "time_per_iteration": 2.5732641220092773 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01254728, + "epoch": 0.433308281978055, + "flos": 23224075770240.0, + "grad_norm": 1.828436296505125, + "language_loss": 0.78923273, + "learning_rate": 2.522934161574342e-06, + "loss": 0.86642796, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1496582, + "step": 7207, + "time_per_iteration": 2.6846628189086914 + }, + { + "auxiliary_loss_clip": 0.06456017, + "auxiliary_loss_mlp": 0.01270448, + "balance_loss_clip": 0.06279423, + "balance_loss_mlp": 0.0125513, + "epoch": 0.433368405230723, + "flos": 15857999665920.0, + "grad_norm": 2.196810095173743, + "language_loss": 0.81095958, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.8882243, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.15307617, + "step": 7208, + "time_per_iteration": 2.4724419116973877 + }, + { + "auxiliary_loss_clip": 0.0645436, + "auxiliary_loss_mlp": 0.01269383, + "balance_loss_clip": 0.0628323, + "balance_loss_mlp": 0.0125481, + "epoch": 0.433428528483391, + "flos": 19032109781760.0, + "grad_norm": 2.1243132825557107, + "language_loss": 0.71321076, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.79044819, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.14581299, + "step": 7209, + "time_per_iteration": 3.9143481254577637 + }, + { + "auxiliary_loss_clip": 0.06450495, + "auxiliary_loss_mlp": 0.01271038, + "balance_loss_clip": 0.06281973, + "balance_loss_mlp": 0.01255517, + "epoch": 0.43348865173605894, + "flos": 24725290320000.0, + "grad_norm": 1.4388803928851785, + "language_loss": 0.8148647, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.89208007, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15515137, + "step": 7210, + "time_per_iteration": 2.564333915710449 + }, + { + "auxiliary_loss_clip": 0.06451392, + "auxiliary_loss_mlp": 0.01274146, + "balance_loss_clip": 0.06281275, + "balance_loss_mlp": 0.01261045, + "epoch": 0.4335487749887269, + "flos": 22097165408640.0, + "grad_norm": 1.8576931130518815, + "language_loss": 0.82474005, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.90199542, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.13110352, + "step": 7211, + "time_per_iteration": 2.491514205932617 + }, + { + "auxiliary_loss_clip": 0.06452142, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06280628, + "balance_loss_mlp": 0.01258362, + "epoch": 0.43360889824139487, + "flos": 22389556631040.0, + "grad_norm": 12.106558391415842, + "language_loss": 0.7536357, + "learning_rate": 2.521054347790029e-06, + "loss": 0.83087522, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.13452148, + "step": 7212, + "time_per_iteration": 2.551093816757202 + }, + { + "auxiliary_loss_clip": 0.06452519, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06284005, + "balance_loss_mlp": 0.01259517, + "epoch": 0.43366902149406283, + "flos": 17533746270720.0, + "grad_norm": 1.8081714291238689, + "language_loss": 0.77247733, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.84972358, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1260376, + "step": 7213, + "time_per_iteration": 3.8823790550231934 + }, + { + "auxiliary_loss_clip": 0.06452443, + "auxiliary_loss_mlp": 0.01274704, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01261245, + "epoch": 0.4337291447467308, + "flos": 19028126712960.0, + "grad_norm": 1.4293111519880635, + "language_loss": 0.65090191, + "learning_rate": 2.520302283867471e-06, + "loss": 0.72817338, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13446045, + "step": 7214, + "time_per_iteration": 2.512341260910034 + }, + { + "auxiliary_loss_clip": 0.0644484, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06280676, + "balance_loss_mlp": 0.01255319, + "epoch": 0.43378926799939876, + "flos": 27241216214400.0, + "grad_norm": 1.6847650033402397, + "language_loss": 0.7180531, + "learning_rate": 2.519926222304191e-06, + "loss": 0.79518223, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.12750244, + "step": 7215, + "time_per_iteration": 2.5413544178009033 + }, + { + "auxiliary_loss_clip": 0.06451561, + "auxiliary_loss_mlp": 0.01271937, + "balance_loss_clip": 0.06284516, + "balance_loss_mlp": 0.01258365, + "epoch": 0.43384939125206673, + "flos": 15966592957440.0, + "grad_norm": 1.7641597528508168, + "language_loss": 0.75291193, + "learning_rate": 2.519550141025255e-06, + "loss": 0.83014691, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13574219, + "step": 7216, + "time_per_iteration": 2.539677143096924 + }, + { + "auxiliary_loss_clip": 0.06459753, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06280532, + "balance_loss_mlp": 0.01254256, + "epoch": 0.4339095145047347, + "flos": 21798736692480.0, + "grad_norm": 2.367070732862923, + "language_loss": 0.7623983, + "learning_rate": 2.519174040044927e-06, + "loss": 0.8396852, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.14685059, + "step": 7217, + "time_per_iteration": 2.491522789001465 + }, + { + "auxiliary_loss_clip": 0.06451164, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.0628095, + "balance_loss_mlp": 0.01254389, + "epoch": 0.43396963775740266, + "flos": 14215054734720.0, + "grad_norm": 2.758270274773255, + "language_loss": 0.74231893, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.81950986, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13531494, + "step": 7218, + "time_per_iteration": 2.5123910903930664 + }, + { + "auxiliary_loss_clip": 0.06450492, + "auxiliary_loss_mlp": 0.01270563, + "balance_loss_clip": 0.06277994, + "balance_loss_mlp": 0.01256443, + "epoch": 0.4340297610100706, + "flos": 19725150101760.0, + "grad_norm": 1.5975368135070402, + "language_loss": 0.69353253, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.77074307, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14117432, + "step": 7219, + "time_per_iteration": 2.502150297164917 + }, + { + "auxiliary_loss_clip": 0.06450121, + "auxiliary_loss_mlp": 0.01273865, + "balance_loss_clip": 0.06280973, + "balance_loss_mlp": 0.01259482, + "epoch": 0.4340898842627386, + "flos": 18959588472960.0, + "grad_norm": 2.696483499139917, + "language_loss": 0.77797616, + "learning_rate": 2.518045619038202e-06, + "loss": 0.85521603, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1439209, + "step": 7220, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.06449743, + "auxiliary_loss_mlp": 0.01270897, + "balance_loss_clip": 0.06280366, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4341500075154066, + "flos": 22024895662080.0, + "grad_norm": 2.140213938529436, + "language_loss": 0.69858402, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.77579045, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13562012, + "step": 7221, + "time_per_iteration": 2.556913137435913 + }, + { + "auxiliary_loss_clip": 0.06448823, + "auxiliary_loss_mlp": 0.01267968, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01254188, + "epoch": 0.4342101307680746, + "flos": 23588527104000.0, + "grad_norm": 1.6725579163220456, + "language_loss": 0.65062654, + "learning_rate": 2.51729324012157e-06, + "loss": 0.72779441, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13793945, + "step": 7222, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0644563, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06277044, + "balance_loss_mlp": 0.01254912, + "epoch": 0.43427025402074254, + "flos": 17973821514240.0, + "grad_norm": 2.158287657708821, + "language_loss": 0.73335516, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.81050307, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14257812, + "step": 7223, + "time_per_iteration": 2.5124166011810303 + }, + { + "auxiliary_loss_clip": 0.06448437, + "auxiliary_loss_mlp": 0.01270913, + "balance_loss_clip": 0.06275682, + "balance_loss_mlp": 0.0125746, + "epoch": 0.4343303772734105, + "flos": 26293575663360.0, + "grad_norm": 1.9810355285503365, + "language_loss": 0.94283241, + "learning_rate": 2.516540782741694e-06, + "loss": 1.02002597, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13458252, + "step": 7224, + "time_per_iteration": 2.5581512451171875 + }, + { + "auxiliary_loss_clip": 0.06445128, + "auxiliary_loss_mlp": 0.01270275, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01257383, + "epoch": 0.43439050052607847, + "flos": 26841279876480.0, + "grad_norm": 2.0217716161026624, + "language_loss": 0.61832798, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.69548196, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.12890625, + "step": 7225, + "time_per_iteration": 2.5797905921936035 + }, + { + "auxiliary_loss_clip": 0.06447432, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01255895, + "epoch": 0.43445062377874644, + "flos": 21404083161600.0, + "grad_norm": 2.452465231522654, + "language_loss": 0.77966076, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.8568306, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13653564, + "step": 7226, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06444375, + "auxiliary_loss_mlp": 0.01273195, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01260553, + "epoch": 0.4345107470314144, + "flos": 19908151418880.0, + "grad_norm": 1.6845072318289191, + "language_loss": 0.84942114, + "learning_rate": 2.515411949802964e-06, + "loss": 0.92659688, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12652588, + "step": 7227, + "time_per_iteration": 2.525317430496216 + }, + { + "auxiliary_loss_clip": 0.06449986, + "auxiliary_loss_mlp": 0.01270041, + "balance_loss_clip": 0.06281552, + "balance_loss_mlp": 0.0125601, + "epoch": 0.43457087028408237, + "flos": 26439876092160.0, + "grad_norm": 2.0880007397823714, + "language_loss": 0.77098775, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.84818804, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14025879, + "step": 7228, + "time_per_iteration": 2.5491206645965576 + }, + { + "auxiliary_loss_clip": 0.06447831, + "auxiliary_loss_mlp": 0.01269154, + "balance_loss_clip": 0.06281967, + "balance_loss_mlp": 0.0125486, + "epoch": 0.43463099353675033, + "flos": 31876947025920.0, + "grad_norm": 1.527689344505128, + "language_loss": 0.80533445, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.88250422, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14294434, + "step": 7229, + "time_per_iteration": 2.6139633655548096 + }, + { + "auxiliary_loss_clip": 0.06448658, + "auxiliary_loss_mlp": 0.01272316, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01258208, + "epoch": 0.4346911167894183, + "flos": 24578109423360.0, + "grad_norm": 1.897670481755329, + "language_loss": 0.8187139, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.89592373, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14117432, + "step": 7230, + "time_per_iteration": 2.535597085952759 + }, + { + "auxiliary_loss_clip": 0.06454149, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06280425, + "balance_loss_mlp": 0.01258849, + "epoch": 0.43475124004208626, + "flos": 17096102795520.0, + "grad_norm": 2.6326033188165012, + "language_loss": 0.77091682, + "learning_rate": 2.513906565661973e-06, + "loss": 0.84818828, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.14135742, + "step": 7231, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.064488, + "auxiliary_loss_mlp": 0.01274763, + "balance_loss_clip": 0.06282736, + "balance_loss_mlp": 0.01262162, + "epoch": 0.4348113632947542, + "flos": 26111874084480.0, + "grad_norm": 2.1662461953899044, + "language_loss": 0.69288278, + "learning_rate": 2.513530170872575e-06, + "loss": 0.77011836, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1260376, + "step": 7232, + "time_per_iteration": 2.547469139099121 + }, + { + "auxiliary_loss_clip": 0.0645097, + "auxiliary_loss_mlp": 0.01271517, + "balance_loss_clip": 0.06279375, + "balance_loss_mlp": 0.01256431, + "epoch": 0.4348714865474222, + "flos": 34208446083840.0, + "grad_norm": 2.030594980717477, + "language_loss": 0.72046328, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.79768813, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.15075684, + "step": 7233, + "time_per_iteration": 2.633953332901001 + }, + { + "auxiliary_loss_clip": 0.06453332, + "auxiliary_loss_mlp": 0.01271348, + "balance_loss_clip": 0.06279553, + "balance_loss_mlp": 0.01257466, + "epoch": 0.43493160980009016, + "flos": 31545045803520.0, + "grad_norm": 1.5667863682634524, + "language_loss": 0.75517476, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.83242154, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.13885498, + "step": 7234, + "time_per_iteration": 2.592467784881592 + }, + { + "auxiliary_loss_clip": 0.06464201, + "auxiliary_loss_mlp": 0.01272529, + "balance_loss_clip": 0.06286918, + "balance_loss_mlp": 0.01258003, + "epoch": 0.4349917330527582, + "flos": 24068238128640.0, + "grad_norm": 2.6345915143615284, + "language_loss": 0.5890404, + "learning_rate": 2.512400869722782e-06, + "loss": 0.6664077, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.14520264, + "step": 7235, + "time_per_iteration": 2.5652947425842285 + }, + { + "auxiliary_loss_clip": 0.06449015, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01257754, + "epoch": 0.43505185630542614, + "flos": 30527315712000.0, + "grad_norm": 1.3439257210534017, + "language_loss": 0.77555895, + "learning_rate": 2.512024397126566e-06, + "loss": 0.85276687, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14019775, + "step": 7236, + "time_per_iteration": 2.600897789001465 + }, + { + "auxiliary_loss_clip": 0.06450135, + "auxiliary_loss_mlp": 0.01275561, + "balance_loss_clip": 0.06283981, + "balance_loss_mlp": 0.01260833, + "epoch": 0.4351119795580941, + "flos": 15739427738880.0, + "grad_norm": 1.5753739577535406, + "language_loss": 0.81058431, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.88784134, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.14733887, + "step": 7237, + "time_per_iteration": 2.515153169631958 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627768, + "balance_loss_mlp": 0.0125607, + "epoch": 0.4351721028107621, + "flos": 18737328718080.0, + "grad_norm": 1.5657016421471992, + "language_loss": 0.63616467, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.71332717, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14129639, + "step": 7238, + "time_per_iteration": 2.4845099449157715 + }, + { + "auxiliary_loss_clip": 0.06448185, + "auxiliary_loss_mlp": 0.01273501, + "balance_loss_clip": 0.06281941, + "balance_loss_mlp": 0.01260162, + "epoch": 0.43523222606343004, + "flos": 25233652241280.0, + "grad_norm": 1.9152472058436172, + "language_loss": 0.85898602, + "learning_rate": 2.510894862898928e-06, + "loss": 0.93620288, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13342285, + "step": 7239, + "time_per_iteration": 2.579202175140381 + }, + { + "auxiliary_loss_clip": 0.06452584, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06283215, + "balance_loss_mlp": 0.01253987, + "epoch": 0.435292349316098, + "flos": 22715504213760.0, + "grad_norm": 1.439066736410537, + "language_loss": 0.72456282, + "learning_rate": 2.510518312724309e-06, + "loss": 0.80176651, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.13793945, + "step": 7240, + "time_per_iteration": 2.5192179679870605 + }, + { + "auxiliary_loss_clip": 0.06454788, + "auxiliary_loss_mlp": 0.01270866, + "balance_loss_clip": 0.06282151, + "balance_loss_mlp": 0.01256913, + "epoch": 0.43535247256876597, + "flos": 25783033536000.0, + "grad_norm": 2.0220617163145485, + "language_loss": 0.81900156, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.89625818, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.1394043, + "step": 7241, + "time_per_iteration": 2.5792059898376465 + }, + { + "auxiliary_loss_clip": 0.06460294, + "auxiliary_loss_mlp": 0.01275581, + "balance_loss_clip": 0.0628238, + "balance_loss_mlp": 0.01260346, + "epoch": 0.43541259582143393, + "flos": 17533578562560.0, + "grad_norm": 2.581589278543144, + "language_loss": 0.79383838, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8711971, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.15246582, + "step": 7242, + "time_per_iteration": 3.918156623840332 + }, + { + "auxiliary_loss_clip": 0.06452459, + "auxiliary_loss_mlp": 0.01271144, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01257405, + "epoch": 0.4354727190741019, + "flos": 15200612058240.0, + "grad_norm": 2.430343835688426, + "language_loss": 0.69088292, + "learning_rate": 2.509388546104138e-06, + "loss": 0.76811898, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.13745117, + "step": 7243, + "time_per_iteration": 3.900606632232666 + }, + { + "auxiliary_loss_clip": 0.06444837, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258655, + "epoch": 0.43553284232676986, + "flos": 16654015054080.0, + "grad_norm": 1.5901355562967736, + "language_loss": 0.81475091, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.89191759, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1317749, + "step": 7244, + "time_per_iteration": 2.581033229827881 + }, + { + "auxiliary_loss_clip": 0.06446069, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.01255596, + "epoch": 0.43559296557943783, + "flos": 23407035160320.0, + "grad_norm": 1.5978807757182665, + "language_loss": 0.73241115, + "learning_rate": 2.508635271753234e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.12554932, + "step": 7245, + "time_per_iteration": 2.5589826107025146 + }, + { + "auxiliary_loss_clip": 0.06452223, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06282671, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4356530888321058, + "flos": 22425628613760.0, + "grad_norm": 1.6720109050482812, + "language_loss": 0.77539527, + "learning_rate": 2.508258605639389e-06, + "loss": 0.85261637, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7246, + "time_per_iteration": 2.593538999557495 + }, + { + "auxiliary_loss_clip": 0.06448724, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06280839, + "balance_loss_mlp": 0.01254033, + "epoch": 0.43571321208477376, + "flos": 21622527555840.0, + "grad_norm": 3.3071750834647426, + "language_loss": 0.86156344, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.93872631, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.13531494, + "step": 7247, + "time_per_iteration": 2.5369882583618164 + }, + { + "auxiliary_loss_clip": 0.06446265, + "auxiliary_loss_mlp": 0.01269788, + "balance_loss_clip": 0.06277846, + "balance_loss_mlp": 0.01257194, + "epoch": 0.4357733353374418, + "flos": 23994081665280.0, + "grad_norm": 1.7467086672612386, + "language_loss": 0.73132598, + "learning_rate": 2.507505215606333e-06, + "loss": 0.80848658, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.12597656, + "step": 7248, + "time_per_iteration": 3.9830687046051025 + }, + { + "auxiliary_loss_clip": 0.06447548, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06279291, + "balance_loss_mlp": 0.01254022, + "epoch": 0.43583345859010975, + "flos": 25271736721920.0, + "grad_norm": 1.509350817375945, + "language_loss": 0.87227005, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.94941938, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13378906, + "step": 7249, + "time_per_iteration": 2.565516948699951 + }, + { + "auxiliary_loss_clip": 0.06451611, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01254223, + "epoch": 0.4358935818427777, + "flos": 23703115962240.0, + "grad_norm": 1.8925784396827436, + "language_loss": 0.8199448, + "learning_rate": 2.506751748594683e-06, + "loss": 0.89714003, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.13690186, + "step": 7250, + "time_per_iteration": 2.5410354137420654 + }, + { + "auxiliary_loss_clip": 0.06454265, + "auxiliary_loss_mlp": 0.01273165, + "balance_loss_clip": 0.06283678, + "balance_loss_mlp": 0.01258901, + "epoch": 0.4359537050954457, + "flos": 29540416723200.0, + "grad_norm": 2.0613712873147723, + "language_loss": 0.85409963, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.93137395, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14251709, + "step": 7251, + "time_per_iteration": 2.5893919467926025 + }, + { + "auxiliary_loss_clip": 0.06448197, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06280132, + "balance_loss_mlp": 0.01257431, + "epoch": 0.43601382834811364, + "flos": 22717935982080.0, + "grad_norm": 1.9454057009257966, + "language_loss": 0.69792974, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.77511865, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13262939, + "step": 7252, + "time_per_iteration": 2.518423080444336 + }, + { + "auxiliary_loss_clip": 0.06442783, + "auxiliary_loss_mlp": 0.01269502, + "balance_loss_clip": 0.06278728, + "balance_loss_mlp": 0.01256336, + "epoch": 0.4360739516007816, + "flos": 19104714944640.0, + "grad_norm": 1.67696041016681, + "language_loss": 0.83826983, + "learning_rate": 2.505621403992348e-06, + "loss": 0.91539264, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13146973, + "step": 7253, + "time_per_iteration": 3.929287910461426 + }, + { + "auxiliary_loss_clip": 0.06446494, + "auxiliary_loss_mlp": 0.01271781, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01257095, + "epoch": 0.43613407485344957, + "flos": 23411185937280.0, + "grad_norm": 1.865330471105, + "language_loss": 0.7061553, + "learning_rate": 2.505244584092757e-06, + "loss": 0.78333807, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14678955, + "step": 7254, + "time_per_iteration": 2.5348615646362305 + }, + { + "auxiliary_loss_clip": 0.06446688, + "auxiliary_loss_mlp": 0.01270934, + "balance_loss_clip": 0.0628084, + "balance_loss_mlp": 0.01257249, + "epoch": 0.43619419810611754, + "flos": 22644366497280.0, + "grad_norm": 1.8869772682878516, + "language_loss": 0.81010306, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.88727921, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13671875, + "step": 7255, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.06450298, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06279971, + "balance_loss_mlp": 0.01254772, + "epoch": 0.4362543213587855, + "flos": 20054200285440.0, + "grad_norm": 1.8086691858124306, + "language_loss": 0.78106731, + "learning_rate": 2.504490886831089e-06, + "loss": 0.85824955, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13165283, + "step": 7256, + "time_per_iteration": 2.5364508628845215 + }, + { + "auxiliary_loss_clip": 0.06446915, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06280836, + "balance_loss_mlp": 0.01256122, + "epoch": 0.43631444461145347, + "flos": 21367759616640.0, + "grad_norm": 1.5279282177598472, + "language_loss": 0.75952047, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.83668512, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13452148, + "step": 7257, + "time_per_iteration": 2.5156846046447754 + }, + { + "auxiliary_loss_clip": 0.06452259, + "auxiliary_loss_mlp": 0.01269452, + "balance_loss_clip": 0.06281701, + "balance_loss_mlp": 0.01255123, + "epoch": 0.43637456786412143, + "flos": 22424999708160.0, + "grad_norm": 1.7230532534800784, + "language_loss": 0.73248196, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.80969918, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14324951, + "step": 7258, + "time_per_iteration": 2.6132447719573975 + }, + { + "auxiliary_loss_clip": 0.06453618, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06282197, + "balance_loss_mlp": 0.01253725, + "epoch": 0.4364346911167894, + "flos": 28556452627200.0, + "grad_norm": 1.8100021880336497, + "language_loss": 0.77633202, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.85353959, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.13415527, + "step": 7259, + "time_per_iteration": 2.589134931564331 + }, + { + "auxiliary_loss_clip": 0.06393245, + "auxiliary_loss_mlp": 0.01278627, + "balance_loss_clip": 0.0631365, + "balance_loss_mlp": 0.01275647, + "epoch": 0.43649481436945736, + "flos": 62678149407360.0, + "grad_norm": 0.7458705100033151, + "language_loss": 0.56939262, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.64611137, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.02978516, + "step": 7260, + "time_per_iteration": 3.11572265625 + }, + { + "auxiliary_loss_clip": 0.06454421, + "auxiliary_loss_mlp": 0.01272288, + "balance_loss_clip": 0.06285764, + "balance_loss_mlp": 0.01257494, + "epoch": 0.4365549376221254, + "flos": 30600088583040.0, + "grad_norm": 1.806363539403124, + "language_loss": 0.71915948, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.79642659, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14794922, + "step": 7261, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.06453972, + "auxiliary_loss_mlp": 0.01271962, + "balance_loss_clip": 0.06284794, + "balance_loss_mlp": 0.0125836, + "epoch": 0.43661506087479335, + "flos": 17171684778240.0, + "grad_norm": 2.033659544742114, + "language_loss": 0.69274759, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.77000701, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13604736, + "step": 7262, + "time_per_iteration": 2.556318521499634 + }, + { + "auxiliary_loss_clip": 0.0644339, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01253345, + "epoch": 0.4366751841274613, + "flos": 22052875726080.0, + "grad_norm": 1.6437752521732585, + "language_loss": 0.80115777, + "learning_rate": 2.501852344559726e-06, + "loss": 0.87825286, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.12780762, + "step": 7263, + "time_per_iteration": 2.509807825088501 + }, + { + "auxiliary_loss_clip": 0.06448945, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06281485, + "balance_loss_mlp": 0.01254076, + "epoch": 0.4367353073801293, + "flos": 16002748794240.0, + "grad_norm": 1.6772415302555446, + "language_loss": 0.76036841, + "learning_rate": 2.50147533371401e-06, + "loss": 0.83753204, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13354492, + "step": 7264, + "time_per_iteration": 2.523973226547241 + }, + { + "auxiliary_loss_clip": 0.06444526, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01253997, + "epoch": 0.43679543063279724, + "flos": 38226760485120.0, + "grad_norm": 2.1479145935669615, + "language_loss": 0.61845875, + "learning_rate": 2.501098303852298e-06, + "loss": 0.69558173, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13787842, + "step": 7265, + "time_per_iteration": 2.6696202754974365 + }, + { + "auxiliary_loss_clip": 0.06447139, + "auxiliary_loss_mlp": 0.01269097, + "balance_loss_clip": 0.06282498, + "balance_loss_mlp": 0.01256211, + "epoch": 0.4368555538854652, + "flos": 15198306071040.0, + "grad_norm": 1.934873925186605, + "language_loss": 0.73721504, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.81437743, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12896729, + "step": 7266, + "time_per_iteration": 2.5559945106506348 + }, + { + "auxiliary_loss_clip": 0.0644975, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06282988, + "balance_loss_mlp": 0.01260432, + "epoch": 0.4369156771381332, + "flos": 23074630813440.0, + "grad_norm": 2.1253877681457904, + "language_loss": 0.82184762, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.899077, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12762451, + "step": 7267, + "time_per_iteration": 2.534639358520508 + }, + { + "auxiliary_loss_clip": 0.06444408, + "auxiliary_loss_mlp": 0.01269536, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256459, + "epoch": 0.43697580039080114, + "flos": 23447886825600.0, + "grad_norm": 2.09966668439896, + "language_loss": 0.75195235, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.82909179, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13085938, + "step": 7268, + "time_per_iteration": 2.6128745079040527 + }, + { + "auxiliary_loss_clip": 0.06451406, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125451, + "epoch": 0.4370359236434691, + "flos": 18520519478400.0, + "grad_norm": 3.050341004743464, + "language_loss": 0.79660171, + "learning_rate": 2.499589994531454e-06, + "loss": 0.87380207, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14099121, + "step": 7269, + "time_per_iteration": 2.516211986541748 + }, + { + "auxiliary_loss_clip": 0.06446489, + "auxiliary_loss_mlp": 0.01273185, + "balance_loss_clip": 0.06281964, + "balance_loss_mlp": 0.01260174, + "epoch": 0.43709604689613707, + "flos": 23229316650240.0, + "grad_norm": 1.8886828014681587, + "language_loss": 0.75057715, + "learning_rate": 2.499212869804237e-06, + "loss": 0.82777393, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13024902, + "step": 7270, + "time_per_iteration": 2.5755550861358643 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01268284, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01255064, + "epoch": 0.43715617014880503, + "flos": 23810199880320.0, + "grad_norm": 1.808972971243201, + "language_loss": 0.79453981, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.87169278, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13220215, + "step": 7271, + "time_per_iteration": 2.564471960067749 + }, + { + "auxiliary_loss_clip": 0.06369642, + "auxiliary_loss_mlp": 0.01258814, + "balance_loss_clip": 0.0629034, + "balance_loss_mlp": 0.01255858, + "epoch": 0.437216293401473, + "flos": 61961824851840.0, + "grad_norm": 0.6886560925106296, + "language_loss": 0.54733157, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.62361616, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.02954102, + "step": 7272, + "time_per_iteration": 3.208707332611084 + }, + { + "auxiliary_loss_clip": 0.06451584, + "auxiliary_loss_mlp": 0.01270794, + "balance_loss_clip": 0.06281105, + "balance_loss_mlp": 0.01256757, + "epoch": 0.43727641665414096, + "flos": 21988907533440.0, + "grad_norm": 1.571184799437717, + "language_loss": 0.70994467, + "learning_rate": 2.498081382098581e-06, + "loss": 0.78716844, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.14031982, + "step": 7273, + "time_per_iteration": 2.540081262588501 + }, + { + "auxiliary_loss_clip": 0.06448624, + "auxiliary_loss_mlp": 0.0126917, + "balance_loss_clip": 0.06279367, + "balance_loss_mlp": 0.01255271, + "epoch": 0.437336539906809, + "flos": 39540277889280.0, + "grad_norm": 1.8107596290780341, + "language_loss": 0.7551834, + "learning_rate": 2.497704181736367e-06, + "loss": 0.83236134, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13903809, + "step": 7274, + "time_per_iteration": 2.6836495399475098 + }, + { + "auxiliary_loss_clip": 0.06441884, + "auxiliary_loss_mlp": 0.01265059, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01252703, + "epoch": 0.43739666315947695, + "flos": 17462902043520.0, + "grad_norm": 1.9085211858375455, + "language_loss": 0.80314881, + "learning_rate": 2.49732696250116e-06, + "loss": 0.88021827, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.12353516, + "step": 7275, + "time_per_iteration": 2.5408823490142822 + }, + { + "auxiliary_loss_clip": 0.06450746, + "auxiliary_loss_mlp": 0.01272848, + "balance_loss_clip": 0.06284586, + "balance_loss_mlp": 0.01259753, + "epoch": 0.4374567864121449, + "flos": 16363678256640.0, + "grad_norm": 1.98644372860744, + "language_loss": 0.81298435, + "learning_rate": 2.496949724407266e-06, + "loss": 0.89022022, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13092041, + "step": 7276, + "time_per_iteration": 2.4871010780334473 + }, + { + "auxiliary_loss_clip": 0.06454313, + "auxiliary_loss_mlp": 0.01266955, + "balance_loss_clip": 0.06281172, + "balance_loss_mlp": 0.01253013, + "epoch": 0.4375169096648129, + "flos": 30594721921920.0, + "grad_norm": 1.9320579241517422, + "language_loss": 0.73048055, + "learning_rate": 2.496572467468988e-06, + "loss": 0.8076933, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1394043, + "step": 7277, + "time_per_iteration": 2.6151673793792725 + }, + { + "auxiliary_loss_clip": 0.06445154, + "auxiliary_loss_mlp": 0.01272648, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01258939, + "epoch": 0.43757703291748085, + "flos": 30563555402880.0, + "grad_norm": 1.9557335242574223, + "language_loss": 0.72527206, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.80245006, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13696289, + "step": 7278, + "time_per_iteration": 2.583293914794922 + }, + { + "auxiliary_loss_clip": 0.06440841, + "auxiliary_loss_mlp": 0.01270709, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.01258371, + "epoch": 0.4376371561701488, + "flos": 21403747745280.0, + "grad_norm": 1.4778175335443475, + "language_loss": 0.65870327, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.73581874, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.12329102, + "step": 7279, + "time_per_iteration": 2.5419130325317383 + }, + { + "auxiliary_loss_clip": 0.06451775, + "auxiliary_loss_mlp": 0.01270137, + "balance_loss_clip": 0.06279162, + "balance_loss_mlp": 0.01256559, + "epoch": 0.4376972794228168, + "flos": 23411144010240.0, + "grad_norm": 1.7454635588007905, + "language_loss": 0.8264519, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.90367103, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.13568115, + "step": 7280, + "time_per_iteration": 2.5270493030548096 + }, + { + "auxiliary_loss_clip": 0.06438784, + "auxiliary_loss_mlp": 0.01272842, + "balance_loss_clip": 0.06277376, + "balance_loss_mlp": 0.01259848, + "epoch": 0.43775740267548474, + "flos": 22899511779840.0, + "grad_norm": 1.6085189920631162, + "language_loss": 0.7756325, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.85274875, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13000488, + "step": 7281, + "time_per_iteration": 2.614102602005005 + }, + { + "auxiliary_loss_clip": 0.0644282, + "auxiliary_loss_mlp": 0.01275956, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01263028, + "epoch": 0.4378175259281527, + "flos": 23301041345280.0, + "grad_norm": 1.8125010794319167, + "language_loss": 0.7622053, + "learning_rate": 2.494685900612569e-06, + "loss": 0.83939308, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12915039, + "step": 7282, + "time_per_iteration": 3.9149930477142334 + }, + { + "auxiliary_loss_clip": 0.06446523, + "auxiliary_loss_mlp": 0.01267087, + "balance_loss_clip": 0.06279582, + "balance_loss_mlp": 0.01254438, + "epoch": 0.43787764918082067, + "flos": 23883433948800.0, + "grad_norm": 2.0076194716834874, + "language_loss": 0.85396934, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.93110549, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.12652588, + "step": 7283, + "time_per_iteration": 3.9656553268432617 + }, + { + "auxiliary_loss_clip": 0.0644891, + "auxiliary_loss_mlp": 0.01268213, + "balance_loss_clip": 0.06279234, + "balance_loss_mlp": 0.01254999, + "epoch": 0.43793777243348864, + "flos": 23995004060160.0, + "grad_norm": 1.8602515290448327, + "language_loss": 0.8091675, + "learning_rate": 2.49393114246007e-06, + "loss": 0.88633871, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13214111, + "step": 7284, + "time_per_iteration": 2.566521167755127 + }, + { + "auxiliary_loss_clip": 0.06443676, + "auxiliary_loss_mlp": 0.0127107, + "balance_loss_clip": 0.06278057, + "balance_loss_mlp": 0.01258774, + "epoch": 0.4379978956861566, + "flos": 18629909383680.0, + "grad_norm": 1.7731724137458924, + "language_loss": 0.80635571, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8835032, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.12310791, + "step": 7285, + "time_per_iteration": 2.5004618167877197 + }, + { + "auxiliary_loss_clip": 0.0643899, + "auxiliary_loss_mlp": 0.01269665, + "balance_loss_clip": 0.06274976, + "balance_loss_mlp": 0.01256642, + "epoch": 0.43805801893882457, + "flos": 21987901284480.0, + "grad_norm": 1.9005617879541583, + "language_loss": 0.75070119, + "learning_rate": 2.493176309387897e-06, + "loss": 0.82778776, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13031006, + "step": 7286, + "time_per_iteration": 2.5617265701293945 + }, + { + "auxiliary_loss_clip": 0.0644343, + "auxiliary_loss_mlp": 0.01269982, + "balance_loss_clip": 0.06274993, + "balance_loss_mlp": 0.01257239, + "epoch": 0.43811814219149253, + "flos": 26400114529920.0, + "grad_norm": 2.124374396883661, + "language_loss": 0.73769003, + "learning_rate": 2.492798864792712e-06, + "loss": 0.81482422, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.12738037, + "step": 7287, + "time_per_iteration": 2.5709421634674072 + }, + { + "auxiliary_loss_clip": 0.06442735, + "auxiliary_loss_mlp": 0.01272914, + "balance_loss_clip": 0.06276426, + "balance_loss_mlp": 0.01259115, + "epoch": 0.43817826544416055, + "flos": 17499015953280.0, + "grad_norm": 1.6607447345750057, + "language_loss": 0.82538438, + "learning_rate": 2.492421401510545e-06, + "loss": 0.90254092, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13812256, + "step": 7288, + "time_per_iteration": 3.92202091217041 + }, + { + "auxiliary_loss_clip": 0.06447385, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06276591, + "balance_loss_mlp": 0.01254888, + "epoch": 0.4382383886968285, + "flos": 21587629530240.0, + "grad_norm": 1.4460149141548964, + "language_loss": 0.84252048, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.9196828, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.1395874, + "step": 7289, + "time_per_iteration": 2.557433843612671 + }, + { + "auxiliary_loss_clip": 0.06446871, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06274465, + "balance_loss_mlp": 0.01254912, + "epoch": 0.4382985119494965, + "flos": 27930441173760.0, + "grad_norm": 2.36337419111835, + "language_loss": 0.78573066, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.86287904, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.13067627, + "step": 7290, + "time_per_iteration": 2.5970215797424316 + }, + { + "auxiliary_loss_clip": 0.06439934, + "auxiliary_loss_mlp": 0.01272143, + "balance_loss_clip": 0.06275328, + "balance_loss_mlp": 0.0125903, + "epoch": 0.43835863520216445, + "flos": 24943860495360.0, + "grad_norm": 1.8528017599911322, + "language_loss": 0.7800144, + "learning_rate": 2.491288899685288e-06, + "loss": 0.85713518, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13110352, + "step": 7291, + "time_per_iteration": 2.5944950580596924 + }, + { + "auxiliary_loss_clip": 0.06443708, + "auxiliary_loss_mlp": 0.01274453, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0126106, + "epoch": 0.4384187584548324, + "flos": 33518634145920.0, + "grad_norm": 1.8972630881774872, + "language_loss": 0.64874315, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.72592473, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13391113, + "step": 7292, + "time_per_iteration": 2.628173351287842 + }, + { + "auxiliary_loss_clip": 0.06447129, + "auxiliary_loss_mlp": 0.01269671, + "balance_loss_clip": 0.06278794, + "balance_loss_mlp": 0.01256653, + "epoch": 0.4384788817075004, + "flos": 23957800047360.0, + "grad_norm": 1.5925770854238166, + "language_loss": 0.74671286, + "learning_rate": 2.49053380529597e-06, + "loss": 0.82388091, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.13031006, + "step": 7293, + "time_per_iteration": 3.9379074573516846 + }, + { + "auxiliary_loss_clip": 0.06446324, + "auxiliary_loss_mlp": 0.01270789, + "balance_loss_clip": 0.06279649, + "balance_loss_mlp": 0.0125668, + "epoch": 0.43853900496016834, + "flos": 19104463382400.0, + "grad_norm": 4.9627482836353165, + "language_loss": 0.7920171, + "learning_rate": 2.490156230192516e-06, + "loss": 0.86918819, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14099121, + "step": 7294, + "time_per_iteration": 2.4718902111053467 + }, + { + "auxiliary_loss_clip": 0.06450905, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06283231, + "balance_loss_mlp": 0.01256252, + "epoch": 0.4385991282128363, + "flos": 13230503660160.0, + "grad_norm": 1.631074893492929, + "language_loss": 0.73162925, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.80883634, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.13574219, + "step": 7295, + "time_per_iteration": 2.531641721725464 + }, + { + "auxiliary_loss_clip": 0.06452312, + "auxiliary_loss_mlp": 0.01270937, + "balance_loss_clip": 0.06283045, + "balance_loss_mlp": 0.01256298, + "epoch": 0.4386592514655043, + "flos": 14325199326720.0, + "grad_norm": 2.435451861079371, + "language_loss": 0.75030828, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.8275407, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14648438, + "step": 7296, + "time_per_iteration": 2.4799978733062744 + }, + { + "auxiliary_loss_clip": 0.06443385, + "auxiliary_loss_mlp": 0.01270746, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01257598, + "epoch": 0.43871937471817224, + "flos": 22791128123520.0, + "grad_norm": 1.513671798105688, + "language_loss": 0.69379568, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.77093697, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13146973, + "step": 7297, + "time_per_iteration": 2.5378599166870117 + }, + { + "auxiliary_loss_clip": 0.06447895, + "auxiliary_loss_mlp": 0.01268794, + "balance_loss_clip": 0.06281355, + "balance_loss_mlp": 0.01255878, + "epoch": 0.4387794979708402, + "flos": 28079466860160.0, + "grad_norm": 1.3753147611046208, + "language_loss": 0.70496702, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.78213394, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.12902832, + "step": 7298, + "time_per_iteration": 2.5667972564697266 + }, + { + "auxiliary_loss_clip": 0.06442846, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258023, + "epoch": 0.43883962122350817, + "flos": 26256665139840.0, + "grad_norm": 1.5271246100670304, + "language_loss": 0.72762883, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12976074, + "step": 7299, + "time_per_iteration": 2.567258834838867 + }, + { + "auxiliary_loss_clip": 0.06449576, + "auxiliary_loss_mlp": 0.012749, + "balance_loss_clip": 0.06281091, + "balance_loss_mlp": 0.01260142, + "epoch": 0.43889974447617613, + "flos": 25890662505600.0, + "grad_norm": 1.7549107290593968, + "language_loss": 0.76878119, + "learning_rate": 2.487890389750719e-06, + "loss": 0.84602594, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.14758301, + "step": 7300, + "time_per_iteration": 2.541740655899048 + }, + { + "auxiliary_loss_clip": 0.06448291, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254346, + "epoch": 0.43895986772884416, + "flos": 25053711598080.0, + "grad_norm": 2.544712476821277, + "language_loss": 0.71268392, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.78984845, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13824463, + "step": 7301, + "time_per_iteration": 2.547846794128418 + }, + { + "auxiliary_loss_clip": 0.06445279, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06277898, + "balance_loss_mlp": 0.01254434, + "epoch": 0.4390199909815121, + "flos": 26001729492480.0, + "grad_norm": 4.607507625532986, + "language_loss": 0.71274817, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.78989553, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.15026855, + "step": 7302, + "time_per_iteration": 2.531633138656616 + }, + { + "auxiliary_loss_clip": 0.06444067, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06280646, + "balance_loss_mlp": 0.01254618, + "epoch": 0.4390801142341801, + "flos": 29029790741760.0, + "grad_norm": 1.545722029471357, + "language_loss": 0.82388735, + "learning_rate": 2.486757219574983e-06, + "loss": 0.90100312, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12908936, + "step": 7303, + "time_per_iteration": 2.6841824054718018 + }, + { + "auxiliary_loss_clip": 0.06456171, + "auxiliary_loss_mlp": 0.01271253, + "balance_loss_clip": 0.06284264, + "balance_loss_mlp": 0.01256649, + "epoch": 0.43914023748684805, + "flos": 33447077159040.0, + "grad_norm": 2.3091286506484034, + "language_loss": 0.69152826, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.76880252, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.1461792, + "step": 7304, + "time_per_iteration": 2.6893982887268066 + }, + { + "auxiliary_loss_clip": 0.06439492, + "auxiliary_loss_mlp": 0.01269095, + "balance_loss_clip": 0.06278437, + "balance_loss_mlp": 0.01256507, + "epoch": 0.439200360739516, + "flos": 34540347306240.0, + "grad_norm": 1.5007015420493954, + "language_loss": 0.78744507, + "learning_rate": 2.486001680477873e-06, + "loss": 0.86453092, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12573242, + "step": 7305, + "time_per_iteration": 2.6403284072875977 + }, + { + "auxiliary_loss_clip": 0.06446742, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255019, + "epoch": 0.439260483992184, + "flos": 21914247945600.0, + "grad_norm": 1.7423010107893722, + "language_loss": 0.68937683, + "learning_rate": 2.485623883278308e-06, + "loss": 0.76653659, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14221191, + "step": 7306, + "time_per_iteration": 2.5665781497955322 + }, + { + "auxiliary_loss_clip": 0.06446797, + "auxiliary_loss_mlp": 0.01272443, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01258877, + "epoch": 0.43932060724485195, + "flos": 21002805158400.0, + "grad_norm": 1.5749593715316206, + "language_loss": 0.63249755, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.70968997, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13562012, + "step": 7307, + "time_per_iteration": 2.5204410552978516 + }, + { + "auxiliary_loss_clip": 0.06449466, + "auxiliary_loss_mlp": 0.0126805, + "balance_loss_clip": 0.06279462, + "balance_loss_mlp": 0.01254305, + "epoch": 0.4393807304975199, + "flos": 17752526081280.0, + "grad_norm": 1.900088770074622, + "language_loss": 0.72216207, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.79933721, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.13745117, + "step": 7308, + "time_per_iteration": 2.4988410472869873 + }, + { + "auxiliary_loss_clip": 0.06445662, + "auxiliary_loss_mlp": 0.01268116, + "balance_loss_clip": 0.06277111, + "balance_loss_mlp": 0.01254669, + "epoch": 0.4394408537501879, + "flos": 22535102373120.0, + "grad_norm": 2.200318468716899, + "language_loss": 0.76911771, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.84625548, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13458252, + "step": 7309, + "time_per_iteration": 2.521385431289673 + }, + { + "auxiliary_loss_clip": 0.06438792, + "auxiliary_loss_mlp": 0.01270246, + "balance_loss_clip": 0.06277418, + "balance_loss_mlp": 0.01257908, + "epoch": 0.43950097700285584, + "flos": 23447383701120.0, + "grad_norm": 3.092354645663241, + "language_loss": 0.71101463, + "learning_rate": 2.484112510474251e-06, + "loss": 0.78810501, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12335205, + "step": 7310, + "time_per_iteration": 2.609769344329834 + }, + { + "auxiliary_loss_clip": 0.06452246, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.0125624, + "epoch": 0.4395611002555238, + "flos": 23186620195200.0, + "grad_norm": 3.6443795998554744, + "language_loss": 0.76179528, + "learning_rate": 2.483734621343429e-06, + "loss": 0.83900821, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.12817383, + "step": 7311, + "time_per_iteration": 2.5347063541412354 + }, + { + "auxiliary_loss_clip": 0.06451476, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01258043, + "epoch": 0.43962122350819177, + "flos": 22133908224000.0, + "grad_norm": 1.9101034753519561, + "language_loss": 0.81546378, + "learning_rate": 2.483356713869341e-06, + "loss": 0.89268786, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.12890625, + "step": 7312, + "time_per_iteration": 2.5744950771331787 + }, + { + "auxiliary_loss_clip": 0.06441756, + "auxiliary_loss_mlp": 0.01268695, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255713, + "epoch": 0.43968134676085974, + "flos": 17426285009280.0, + "grad_norm": 1.9172183853591918, + "language_loss": 0.86001694, + "learning_rate": 2.482978788066318e-06, + "loss": 0.93712139, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.12982178, + "step": 7313, + "time_per_iteration": 2.536870241165161 + }, + { + "auxiliary_loss_clip": 0.06445049, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01258184, + "epoch": 0.43974147001352776, + "flos": 18958582224000.0, + "grad_norm": 6.24702313006486, + "language_loss": 0.679317, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.75647992, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.13061523, + "step": 7314, + "time_per_iteration": 2.5457370281219482 + }, + { + "auxiliary_loss_clip": 0.06448518, + "auxiliary_loss_mlp": 0.01271322, + "balance_loss_clip": 0.06279253, + "balance_loss_mlp": 0.01258209, + "epoch": 0.4398015932661957, + "flos": 18959588472960.0, + "grad_norm": 1.6336273312910292, + "language_loss": 0.76986659, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.84706497, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.13098145, + "step": 7315, + "time_per_iteration": 2.5225329399108887 + }, + { + "auxiliary_loss_clip": 0.06442133, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06276898, + "balance_loss_mlp": 0.01255447, + "epoch": 0.4398617165188637, + "flos": 24205608097920.0, + "grad_norm": 2.1993234427936637, + "language_loss": 0.74934149, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.8264451, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.12780762, + "step": 7316, + "time_per_iteration": 2.5561742782592773 + }, + { + "auxiliary_loss_clip": 0.06444536, + "auxiliary_loss_mlp": 0.01271979, + "balance_loss_clip": 0.06280385, + "balance_loss_mlp": 0.01259289, + "epoch": 0.43992183977153165, + "flos": 22243214275200.0, + "grad_norm": 2.7598614180807814, + "language_loss": 0.65349543, + "learning_rate": 2.481466901851506e-06, + "loss": 0.73066062, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.12695312, + "step": 7317, + "time_per_iteration": 2.5142266750335693 + }, + { + "auxiliary_loss_clip": 0.06450248, + "auxiliary_loss_mlp": 0.01270442, + "balance_loss_clip": 0.06283192, + "balance_loss_mlp": 0.01256929, + "epoch": 0.4399819630241996, + "flos": 18703395014400.0, + "grad_norm": 1.826408349581849, + "language_loss": 0.80062312, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.87783003, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13519287, + "step": 7318, + "time_per_iteration": 2.519906520843506 + }, + { + "auxiliary_loss_clip": 0.06445621, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01255725, + "epoch": 0.4400420862768676, + "flos": 23886326914560.0, + "grad_norm": 1.6582419144412086, + "language_loss": 0.79880667, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.87595713, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.13708496, + "step": 7319, + "time_per_iteration": 2.593442440032959 + }, + { + "auxiliary_loss_clip": 0.06445733, + "auxiliary_loss_mlp": 0.01274619, + "balance_loss_clip": 0.06279506, + "balance_loss_mlp": 0.01260547, + "epoch": 0.44010220952953555, + "flos": 28045071959040.0, + "grad_norm": 2.6685359162637172, + "language_loss": 0.80292428, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.88012779, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14074707, + "step": 7320, + "time_per_iteration": 2.576824188232422 + }, + { + "auxiliary_loss_clip": 0.06443729, + "auxiliary_loss_mlp": 0.01271309, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01258333, + "epoch": 0.4401623327822035, + "flos": 23775763052160.0, + "grad_norm": 3.573791590582856, + "language_loss": 0.69620574, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.77335614, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.12969971, + "step": 7321, + "time_per_iteration": 4.008130311965942 + }, + { + "auxiliary_loss_clip": 0.06352215, + "auxiliary_loss_mlp": 0.01268902, + "balance_loss_clip": 0.06277325, + "balance_loss_mlp": 0.01265612, + "epoch": 0.4402224560348715, + "flos": 70797320081280.0, + "grad_norm": 0.8902034574652531, + "language_loss": 0.56966496, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.64587617, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03295898, + "step": 7322, + "time_per_iteration": 4.591723680496216 + }, + { + "auxiliary_loss_clip": 0.06443685, + "auxiliary_loss_mlp": 0.01271286, + "balance_loss_clip": 0.06277888, + "balance_loss_mlp": 0.01258787, + "epoch": 0.44028257928753944, + "flos": 22898170114560.0, + "grad_norm": 1.423216656342095, + "language_loss": 0.76491451, + "learning_rate": 2.479198525097822e-06, + "loss": 0.8420642, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.12493896, + "step": 7323, + "time_per_iteration": 2.5367372035980225 + }, + { + "auxiliary_loss_clip": 0.06449594, + "auxiliary_loss_mlp": 0.01277882, + "balance_loss_clip": 0.06282798, + "balance_loss_mlp": 0.01265216, + "epoch": 0.4403427025402074, + "flos": 17901719475840.0, + "grad_norm": 1.6412485345287482, + "language_loss": 0.80679965, + "learning_rate": 2.478820398622511e-06, + "loss": 0.88407433, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.12670898, + "step": 7324, + "time_per_iteration": 2.496735095977783 + }, + { + "auxiliary_loss_clip": 0.0634661, + "auxiliary_loss_mlp": 0.01259308, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01255979, + "epoch": 0.4404028257928754, + "flos": 69583717071360.0, + "grad_norm": 0.6517122364434149, + "language_loss": 0.54482663, + "learning_rate": 2.478442253990283e-06, + "loss": 0.62088585, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.03335571, + "step": 7325, + "time_per_iteration": 3.1927096843719482 + }, + { + "auxiliary_loss_clip": 0.06445315, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281503, + "balance_loss_mlp": 0.01253981, + "epoch": 0.44046294904554334, + "flos": 20930074214400.0, + "grad_norm": 1.5304533021700073, + "language_loss": 0.69945073, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.77656674, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.12298584, + "step": 7326, + "time_per_iteration": 2.5716168880462646 + }, + { + "auxiliary_loss_clip": 0.06441578, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06279023, + "balance_loss_mlp": 0.01254402, + "epoch": 0.44052307229821136, + "flos": 23630301164160.0, + "grad_norm": 1.488040619087652, + "language_loss": 0.76529855, + "learning_rate": 2.477685910312432e-06, + "loss": 0.84238315, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.12481689, + "step": 7327, + "time_per_iteration": 3.997654676437378 + }, + { + "auxiliary_loss_clip": 0.06439877, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277373, + "balance_loss_mlp": 0.01256744, + "epoch": 0.4405831955508793, + "flos": 17602536072960.0, + "grad_norm": 2.6410067735498512, + "language_loss": 0.83833683, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.91543245, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1295166, + "step": 7328, + "time_per_iteration": 2.520899534225464 + }, + { + "auxiliary_loss_clip": 0.06445633, + "auxiliary_loss_mlp": 0.01268864, + "balance_loss_clip": 0.06283547, + "balance_loss_mlp": 0.01255703, + "epoch": 0.4406433188035473, + "flos": 21468596405760.0, + "grad_norm": 3.134642090151518, + "language_loss": 0.77723283, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.85437775, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13165283, + "step": 7329, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01272671, + "balance_loss_clip": 0.06280035, + "balance_loss_mlp": 0.01259176, + "epoch": 0.44070344205621526, + "flos": 22680019209600.0, + "grad_norm": 1.6769566948090702, + "language_loss": 0.74290001, + "learning_rate": 2.476551258977278e-06, + "loss": 0.82010818, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1348877, + "step": 7330, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.06448483, + "auxiliary_loss_mlp": 0.01270882, + "balance_loss_clip": 0.06283589, + "balance_loss_mlp": 0.01258127, + "epoch": 0.4407635653088832, + "flos": 23448012606720.0, + "grad_norm": 1.699983061814717, + "language_loss": 0.74538559, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.82257915, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.12762451, + "step": 7331, + "time_per_iteration": 2.5442659854888916 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06279509, + "balance_loss_mlp": 0.01256667, + "epoch": 0.4408236885615512, + "flos": 24027596098560.0, + "grad_norm": 1.6889636086213913, + "language_loss": 0.76643395, + "learning_rate": 2.475794734375581e-06, + "loss": 0.84356534, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13659668, + "step": 7332, + "time_per_iteration": 2.5714762210845947 + }, + { + "auxiliary_loss_clip": 0.06442308, + "auxiliary_loss_mlp": 0.01271754, + "balance_loss_clip": 0.06277508, + "balance_loss_mlp": 0.01258272, + "epoch": 0.44088381181421915, + "flos": 12681667416960.0, + "grad_norm": 1.845933322464005, + "language_loss": 0.73768836, + "learning_rate": 2.475416445004285e-06, + "loss": 0.81482899, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1348877, + "step": 7333, + "time_per_iteration": 3.9176201820373535 + }, + { + "auxiliary_loss_clip": 0.06439593, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06280486, + "balance_loss_mlp": 0.01253486, + "epoch": 0.4409439350668871, + "flos": 24576474268800.0, + "grad_norm": 1.6297964144317614, + "language_loss": 0.79249531, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.8695479, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12200928, + "step": 7334, + "time_per_iteration": 2.530762195587158 + }, + { + "auxiliary_loss_clip": 0.06456793, + "auxiliary_loss_mlp": 0.01269696, + "balance_loss_clip": 0.06281539, + "balance_loss_mlp": 0.01254747, + "epoch": 0.4410040583195551, + "flos": 22674191351040.0, + "grad_norm": 7.845487214918662, + "language_loss": 0.7603153, + "learning_rate": 2.47465981219252e-06, + "loss": 0.83758014, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.1496582, + "step": 7335, + "time_per_iteration": 2.5146994590759277 + }, + { + "auxiliary_loss_clip": 0.06445056, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06279862, + "balance_loss_mlp": 0.01254942, + "epoch": 0.44106418157222305, + "flos": 10857062833920.0, + "grad_norm": 1.9701535584859973, + "language_loss": 0.72720182, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.80434465, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14263916, + "step": 7336, + "time_per_iteration": 2.470501661300659 + }, + { + "auxiliary_loss_clip": 0.06448875, + "auxiliary_loss_mlp": 0.01272884, + "balance_loss_clip": 0.06281201, + "balance_loss_mlp": 0.01259079, + "epoch": 0.441124304824891, + "flos": 21733301053440.0, + "grad_norm": 2.690720747597236, + "language_loss": 0.62764168, + "learning_rate": 2.473903107384165e-06, + "loss": 0.70485932, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.13812256, + "step": 7337, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06339368, + "auxiliary_loss_mlp": 0.01255392, + "balance_loss_clip": 0.06265444, + "balance_loss_mlp": 0.01252635, + "epoch": 0.441184428077559, + "flos": 63241702041600.0, + "grad_norm": 0.7296971987367982, + "language_loss": 0.52622962, + "learning_rate": 2.473524728017134e-06, + "loss": 0.60217726, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.02761841, + "step": 7338, + "time_per_iteration": 3.1634135246276855 + }, + { + "auxiliary_loss_clip": 0.06451306, + "auxiliary_loss_mlp": 0.0127376, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01259133, + "epoch": 0.44124455133022694, + "flos": 21184213248000.0, + "grad_norm": 2.888450189779477, + "language_loss": 0.71053195, + "learning_rate": 2.473146330693997e-06, + "loss": 0.78778255, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.14611816, + "step": 7339, + "time_per_iteration": 2.526179552078247 + }, + { + "auxiliary_loss_clip": 0.06437125, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.06279349, + "balance_loss_mlp": 0.01252833, + "epoch": 0.4413046745828949, + "flos": 17463740584320.0, + "grad_norm": 1.6365123651784117, + "language_loss": 0.70282859, + "learning_rate": 2.472767915429105e-06, + "loss": 0.77985364, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12554932, + "step": 7340, + "time_per_iteration": 2.4790234565734863 + }, + { + "auxiliary_loss_clip": 0.06342094, + "auxiliary_loss_mlp": 0.01254424, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.01251767, + "epoch": 0.4413647978355629, + "flos": 61602251783040.0, + "grad_norm": 0.8821319445569078, + "language_loss": 0.64009017, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.71605539, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7341, + "time_per_iteration": 2.9593453407287598 + }, + { + "auxiliary_loss_clip": 0.06446001, + "auxiliary_loss_mlp": 0.0127129, + "balance_loss_clip": 0.06280506, + "balance_loss_mlp": 0.01257992, + "epoch": 0.4414249210882309, + "flos": 27534404050560.0, + "grad_norm": 1.9827417031820809, + "language_loss": 0.73812068, + "learning_rate": 2.47201103113145e-06, + "loss": 0.81529361, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13299561, + "step": 7342, + "time_per_iteration": 2.5592381954193115 + }, + { + "auxiliary_loss_clip": 0.06443819, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258497, + "epoch": 0.44148504434089886, + "flos": 23520785477760.0, + "grad_norm": 1.7847903417039304, + "language_loss": 0.80326116, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.88042319, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13885498, + "step": 7343, + "time_per_iteration": 2.567669630050659 + }, + { + "auxiliary_loss_clip": 0.0644604, + "auxiliary_loss_mlp": 0.01268371, + "balance_loss_clip": 0.06281629, + "balance_loss_mlp": 0.01254382, + "epoch": 0.4415451675935668, + "flos": 21587126405760.0, + "grad_norm": 1.6274174275387656, + "language_loss": 0.7678231, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.84496725, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14001465, + "step": 7344, + "time_per_iteration": 2.50498628616333 + }, + { + "auxiliary_loss_clip": 0.06331868, + "auxiliary_loss_mlp": 0.01254509, + "balance_loss_clip": 0.06258254, + "balance_loss_mlp": 0.01251979, + "epoch": 0.4416052908462348, + "flos": 59023825142400.0, + "grad_norm": 0.9594048262741005, + "language_loss": 0.63725042, + "learning_rate": 2.470875570480556e-06, + "loss": 0.71311414, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02529907, + "step": 7345, + "time_per_iteration": 2.9305789470672607 + }, + { + "auxiliary_loss_clip": 0.06448534, + "auxiliary_loss_mlp": 0.01269691, + "balance_loss_clip": 0.06281187, + "balance_loss_mlp": 0.01255386, + "epoch": 0.44166541409890275, + "flos": 26364545671680.0, + "grad_norm": 1.5861169822925434, + "language_loss": 0.86231661, + "learning_rate": 2.470497047866489e-06, + "loss": 0.9394989, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14306641, + "step": 7346, + "time_per_iteration": 2.566326141357422 + }, + { + "auxiliary_loss_clip": 0.06448992, + "auxiliary_loss_mlp": 0.01268131, + "balance_loss_clip": 0.06282933, + "balance_loss_mlp": 0.01253909, + "epoch": 0.4417255373515707, + "flos": 20198739778560.0, + "grad_norm": 1.9006247897038917, + "language_loss": 0.80872411, + "learning_rate": 2.470118507411128e-06, + "loss": 0.88589537, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14221191, + "step": 7347, + "time_per_iteration": 2.4968490600585938 + }, + { + "auxiliary_loss_clip": 0.06445403, + "auxiliary_loss_mlp": 0.01269031, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01254166, + "epoch": 0.4417856606042387, + "flos": 17892537454080.0, + "grad_norm": 1.9280841383218132, + "language_loss": 0.83507645, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.91222078, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14868164, + "step": 7348, + "time_per_iteration": 2.5483500957489014 + }, + { + "auxiliary_loss_clip": 0.06451687, + "auxiliary_loss_mlp": 0.01270301, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.0125571, + "epoch": 0.44184578385690665, + "flos": 27971376693120.0, + "grad_norm": 2.209333058456871, + "language_loss": 0.70229864, + "learning_rate": 2.469361373033938e-06, + "loss": 0.77951854, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.14593506, + "step": 7349, + "time_per_iteration": 2.5552031993865967 + }, + { + "auxiliary_loss_clip": 0.06448848, + "auxiliary_loss_mlp": 0.01269717, + "balance_loss_clip": 0.06281149, + "balance_loss_mlp": 0.01254858, + "epoch": 0.4419059071095746, + "flos": 23374652757120.0, + "grad_norm": 1.8931524120790788, + "language_loss": 0.74732667, + "learning_rate": 2.468982779140819e-06, + "loss": 0.82451236, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.14855957, + "step": 7350, + "time_per_iteration": 2.5428407192230225 + }, + { + "auxiliary_loss_clip": 0.06449752, + "auxiliary_loss_mlp": 0.01269052, + "balance_loss_clip": 0.06283528, + "balance_loss_mlp": 0.01254591, + "epoch": 0.4419660303622426, + "flos": 15017443032960.0, + "grad_norm": 2.6211867622298626, + "language_loss": 0.81412131, + "learning_rate": 2.468604167463827e-06, + "loss": 0.89130938, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.14453125, + "step": 7351, + "time_per_iteration": 2.5310895442962646 + }, + { + "auxiliary_loss_clip": 0.06439559, + "auxiliary_loss_mlp": 0.01271292, + "balance_loss_clip": 0.06278528, + "balance_loss_mlp": 0.01258537, + "epoch": 0.44202615361491054, + "flos": 25378359442560.0, + "grad_norm": 1.998249332467298, + "language_loss": 0.73669267, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.81380117, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12774658, + "step": 7352, + "time_per_iteration": 2.6823537349700928 + }, + { + "auxiliary_loss_clip": 0.06450884, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06284234, + "balance_loss_mlp": 0.01253584, + "epoch": 0.4420862768675785, + "flos": 24688044380160.0, + "grad_norm": 1.9707834429969424, + "language_loss": 0.87580955, + "learning_rate": 2.467846890815649e-06, + "loss": 0.95299494, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14086914, + "step": 7353, + "time_per_iteration": 2.531208038330078 + }, + { + "auxiliary_loss_clip": 0.06445745, + "auxiliary_loss_mlp": 0.01274404, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01260659, + "epoch": 0.44214640012024653, + "flos": 19533134471040.0, + "grad_norm": 2.5061219192509676, + "language_loss": 0.76425511, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.84145659, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13751221, + "step": 7354, + "time_per_iteration": 2.5208046436309814 + }, + { + "auxiliary_loss_clip": 0.06442366, + "auxiliary_loss_mlp": 0.01269638, + "balance_loss_clip": 0.06279345, + "balance_loss_mlp": 0.01256894, + "epoch": 0.4422065233729145, + "flos": 47568143940480.0, + "grad_norm": 2.32689870132585, + "language_loss": 0.65273595, + "learning_rate": 2.467089543204268e-06, + "loss": 0.72985595, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12768555, + "step": 7355, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.06452843, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01257225, + "epoch": 0.44226664662558246, + "flos": 19287045429120.0, + "grad_norm": 1.8090120162092156, + "language_loss": 0.78513968, + "learning_rate": 2.466710842823274e-06, + "loss": 0.86239338, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.15307617, + "step": 7356, + "time_per_iteration": 2.5535836219787598 + }, + { + "auxiliary_loss_clip": 0.0645135, + "auxiliary_loss_mlp": 0.01270574, + "balance_loss_clip": 0.0628085, + "balance_loss_mlp": 0.01255184, + "epoch": 0.4423267698782504, + "flos": 17827604939520.0, + "grad_norm": 1.5923292427452285, + "language_loss": 0.77331412, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.85053337, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1539917, + "step": 7357, + "time_per_iteration": 2.472616195678711 + }, + { + "auxiliary_loss_clip": 0.06444242, + "auxiliary_loss_mlp": 0.0127409, + "balance_loss_clip": 0.06277513, + "balance_loss_mlp": 0.01259112, + "epoch": 0.4423868931309184, + "flos": 29211953518080.0, + "grad_norm": 1.4316006976636513, + "language_loss": 0.73656726, + "learning_rate": 2.465953388982481e-06, + "loss": 0.81375057, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14971924, + "step": 7358, + "time_per_iteration": 2.596794366836548 + }, + { + "auxiliary_loss_clip": 0.06449263, + "auxiliary_loss_mlp": 0.01268513, + "balance_loss_clip": 0.06281863, + "balance_loss_mlp": 0.01255131, + "epoch": 0.44244701638358636, + "flos": 29720399293440.0, + "grad_norm": 1.5482043588344903, + "language_loss": 0.75746959, + "learning_rate": 2.465574635551405e-06, + "loss": 0.83464736, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13378906, + "step": 7359, + "time_per_iteration": 2.565152168273926 + }, + { + "auxiliary_loss_clip": 0.06449427, + "auxiliary_loss_mlp": 0.01273427, + "balance_loss_clip": 0.06282771, + "balance_loss_mlp": 0.01258907, + "epoch": 0.4425071396362543, + "flos": 22936715792640.0, + "grad_norm": 1.7006216058888692, + "language_loss": 0.70234901, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.77957749, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.14526367, + "step": 7360, + "time_per_iteration": 3.9516735076904297 + }, + { + "auxiliary_loss_clip": 0.06450445, + "auxiliary_loss_mlp": 0.01276643, + "balance_loss_clip": 0.06282296, + "balance_loss_mlp": 0.01262028, + "epoch": 0.4425672628889223, + "flos": 19798509951360.0, + "grad_norm": 2.334645337647824, + "language_loss": 0.69802427, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.77529514, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14599609, + "step": 7361, + "time_per_iteration": 3.9590420722961426 + }, + { + "auxiliary_loss_clip": 0.06448395, + "auxiliary_loss_mlp": 0.01271063, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01256287, + "epoch": 0.44262738614159025, + "flos": 13667266667520.0, + "grad_norm": 1.9889994262633817, + "language_loss": 0.82882756, + "learning_rate": 2.464438269387809e-06, + "loss": 0.90602213, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14770508, + "step": 7362, + "time_per_iteration": 2.4627645015716553 + }, + { + "auxiliary_loss_clip": 0.06458044, + "auxiliary_loss_mlp": 0.01274491, + "balance_loss_clip": 0.06284538, + "balance_loss_mlp": 0.01258111, + "epoch": 0.4426875093942582, + "flos": 14215474005120.0, + "grad_norm": 1.7592716332344263, + "language_loss": 0.75051332, + "learning_rate": 2.464059445424366e-06, + "loss": 0.82783866, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.16381836, + "step": 7363, + "time_per_iteration": 2.526925802230835 + }, + { + "auxiliary_loss_clip": 0.0633463, + "auxiliary_loss_mlp": 0.01256608, + "balance_loss_clip": 0.06260501, + "balance_loss_mlp": 0.01253844, + "epoch": 0.4427476326469262, + "flos": 70140100181760.0, + "grad_norm": 0.6687771463902197, + "language_loss": 0.55581295, + "learning_rate": 2.463680603863743e-06, + "loss": 0.63172531, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02767944, + "step": 7364, + "time_per_iteration": 3.2234084606170654 + }, + { + "auxiliary_loss_clip": 0.06445954, + "auxiliary_loss_mlp": 0.01269396, + "balance_loss_clip": 0.06280937, + "balance_loss_mlp": 0.01255479, + "epoch": 0.44280775589959415, + "flos": 25451761219200.0, + "grad_norm": 6.076987981061014, + "language_loss": 0.75066888, + "learning_rate": 2.463301744720305e-06, + "loss": 0.82782239, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13928223, + "step": 7365, + "time_per_iteration": 2.606168746948242 + }, + { + "auxiliary_loss_clip": 0.06448679, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06282686, + "balance_loss_mlp": 0.01253724, + "epoch": 0.4428678791522621, + "flos": 22863900994560.0, + "grad_norm": 1.5120042705282817, + "language_loss": 0.74655497, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.82372636, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1473999, + "step": 7366, + "time_per_iteration": 2.5269834995269775 + }, + { + "auxiliary_loss_clip": 0.06449491, + "auxiliary_loss_mlp": 0.0127034, + "balance_loss_clip": 0.06283636, + "balance_loss_mlp": 0.01255438, + "epoch": 0.44292800240493013, + "flos": 25819608643200.0, + "grad_norm": 2.3253747528787447, + "language_loss": 0.7339704, + "learning_rate": 2.46254397374245e-06, + "loss": 0.81116873, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14904785, + "step": 7367, + "time_per_iteration": 4.017570495605469 + }, + { + "auxiliary_loss_clip": 0.06453082, + "auxiliary_loss_mlp": 0.01276023, + "balance_loss_clip": 0.06286091, + "balance_loss_mlp": 0.01260979, + "epoch": 0.4429881256575981, + "flos": 32425238217600.0, + "grad_norm": 1.584590811661976, + "language_loss": 0.73953557, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.81682664, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.15057373, + "step": 7368, + "time_per_iteration": 2.6219804286956787 + }, + { + "auxiliary_loss_clip": 0.06446074, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06281151, + "balance_loss_mlp": 0.01256007, + "epoch": 0.44304824891026606, + "flos": 22170231768960.0, + "grad_norm": 1.6442785623938219, + "language_loss": 0.79845673, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.8756206, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14306641, + "step": 7369, + "time_per_iteration": 2.5048859119415283 + }, + { + "auxiliary_loss_clip": 0.06445719, + "auxiliary_loss_mlp": 0.01268056, + "balance_loss_clip": 0.0628242, + "balance_loss_mlp": 0.01253524, + "epoch": 0.443108372162934, + "flos": 25345725477120.0, + "grad_norm": 1.8080912741875748, + "language_loss": 0.72226167, + "learning_rate": 2.461407185763737e-06, + "loss": 0.79939938, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.14538574, + "step": 7370, + "time_per_iteration": 2.59167218208313 + }, + { + "auxiliary_loss_clip": 0.06444093, + "auxiliary_loss_mlp": 0.01274154, + "balance_loss_clip": 0.06279977, + "balance_loss_mlp": 0.01259741, + "epoch": 0.443168495415602, + "flos": 23337616452480.0, + "grad_norm": 2.642683672552081, + "language_loss": 0.70957971, + "learning_rate": 2.461028221425126e-06, + "loss": 0.78676224, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14428711, + "step": 7371, + "time_per_iteration": 2.5119266510009766 + }, + { + "auxiliary_loss_clip": 0.0644391, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06280756, + "balance_loss_mlp": 0.01255288, + "epoch": 0.44322861866826996, + "flos": 21877924400640.0, + "grad_norm": 2.5641722247612977, + "language_loss": 0.69211292, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.76923823, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13330078, + "step": 7372, + "time_per_iteration": 2.575803518295288 + }, + { + "auxiliary_loss_clip": 0.06450622, + "auxiliary_loss_mlp": 0.01273627, + "balance_loss_clip": 0.06281562, + "balance_loss_mlp": 0.01257855, + "epoch": 0.4432887419209379, + "flos": 20090649611520.0, + "grad_norm": 1.7339006835744544, + "language_loss": 0.83742619, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.91466868, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.15765381, + "step": 7373, + "time_per_iteration": 4.006488084793091 + }, + { + "auxiliary_loss_clip": 0.06340961, + "auxiliary_loss_mlp": 0.01252329, + "balance_loss_clip": 0.06267951, + "balance_loss_mlp": 0.01249765, + "epoch": 0.4433488651736059, + "flos": 70056593988480.0, + "grad_norm": 0.7566866942124226, + "language_loss": 0.55204445, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.62797731, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02565002, + "step": 7374, + "time_per_iteration": 3.1780457496643066 + }, + { + "auxiliary_loss_clip": 0.06443411, + "auxiliary_loss_mlp": 0.01275671, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01260198, + "epoch": 0.44340898842627385, + "flos": 16286838462720.0, + "grad_norm": 2.3260457628480617, + "language_loss": 0.82868445, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.90587527, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.15478516, + "step": 7375, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.0644948, + "auxiliary_loss_mlp": 0.01269753, + "balance_loss_clip": 0.06282064, + "balance_loss_mlp": 0.01255388, + "epoch": 0.4434691116789418, + "flos": 16616601406080.0, + "grad_norm": 2.217281539940859, + "language_loss": 0.83904636, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.91623867, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1439209, + "step": 7376, + "time_per_iteration": 2.4960668087005615 + }, + { + "auxiliary_loss_clip": 0.06447101, + "auxiliary_loss_mlp": 0.01271986, + "balance_loss_clip": 0.06282647, + "balance_loss_mlp": 0.01257573, + "epoch": 0.4435292349316098, + "flos": 19069397648640.0, + "grad_norm": 1.7110647715019258, + "language_loss": 0.77357483, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.85076571, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.14422607, + "step": 7377, + "time_per_iteration": 2.5489466190338135 + }, + { + "auxiliary_loss_clip": 0.064443, + "auxiliary_loss_mlp": 0.01269165, + "balance_loss_clip": 0.06284986, + "balance_loss_mlp": 0.01255396, + "epoch": 0.44358935818427775, + "flos": 21257656951680.0, + "grad_norm": 1.7746716431943175, + "language_loss": 0.75928617, + "learning_rate": 2.458374982357057e-06, + "loss": 0.83642089, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13763428, + "step": 7378, + "time_per_iteration": 2.498782157897949 + }, + { + "auxiliary_loss_clip": 0.06446375, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06281648, + "balance_loss_mlp": 0.01255106, + "epoch": 0.4436494814369457, + "flos": 12500259327360.0, + "grad_norm": 1.8740687903376234, + "language_loss": 0.69627756, + "learning_rate": 2.457995878562982e-06, + "loss": 0.77344066, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.14825439, + "step": 7379, + "time_per_iteration": 2.5212602615356445 + }, + { + "auxiliary_loss_clip": 0.0645185, + "auxiliary_loss_mlp": 0.01266938, + "balance_loss_clip": 0.0628576, + "balance_loss_mlp": 0.01252556, + "epoch": 0.44370960468961373, + "flos": 23666666636160.0, + "grad_norm": 2.508566876625721, + "language_loss": 0.73565447, + "learning_rate": 2.457616757401656e-06, + "loss": 0.81284231, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1439209, + "step": 7380, + "time_per_iteration": 2.500859260559082 + }, + { + "auxiliary_loss_clip": 0.06449685, + "auxiliary_loss_mlp": 0.01268804, + "balance_loss_clip": 0.06285541, + "balance_loss_mlp": 0.01255452, + "epoch": 0.4437697279422817, + "flos": 32425196290560.0, + "grad_norm": 1.7107220322970214, + "language_loss": 0.65104783, + "learning_rate": 2.457237618887458e-06, + "loss": 0.72823262, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13336182, + "step": 7381, + "time_per_iteration": 2.618229627609253 + }, + { + "auxiliary_loss_clip": 0.06454551, + "auxiliary_loss_mlp": 0.01272971, + "balance_loss_clip": 0.06288015, + "balance_loss_mlp": 0.01258773, + "epoch": 0.44382985119494966, + "flos": 18118570642560.0, + "grad_norm": 2.331874867497661, + "language_loss": 0.80543017, + "learning_rate": 2.456858463034763e-06, + "loss": 0.88270545, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14190674, + "step": 7382, + "time_per_iteration": 2.4738404750823975 + }, + { + "auxiliary_loss_clip": 0.06452931, + "auxiliary_loss_mlp": 0.01272481, + "balance_loss_clip": 0.06287742, + "balance_loss_mlp": 0.01258486, + "epoch": 0.44388997444761763, + "flos": 30782083651200.0, + "grad_norm": 1.5922456749371714, + "language_loss": 0.65226638, + "learning_rate": 2.456479289857949e-06, + "loss": 0.72952044, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13983154, + "step": 7383, + "time_per_iteration": 2.614912986755371 + }, + { + "auxiliary_loss_clip": 0.0645685, + "auxiliary_loss_mlp": 0.01272667, + "balance_loss_clip": 0.0628838, + "balance_loss_mlp": 0.01258088, + "epoch": 0.4439500977002856, + "flos": 20345333696640.0, + "grad_norm": 2.064556949518224, + "language_loss": 0.76699257, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.84428775, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.14587402, + "step": 7384, + "time_per_iteration": 2.4842731952667236 + }, + { + "auxiliary_loss_clip": 0.06456664, + "auxiliary_loss_mlp": 0.012692, + "balance_loss_clip": 0.06288753, + "balance_loss_mlp": 0.01254442, + "epoch": 0.44401022095295356, + "flos": 20376667923840.0, + "grad_norm": 2.2924078267975605, + "language_loss": 0.80810666, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.88536537, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14758301, + "step": 7385, + "time_per_iteration": 2.5268380641937256 + }, + { + "auxiliary_loss_clip": 0.0645503, + "auxiliary_loss_mlp": 0.01272748, + "balance_loss_clip": 0.06290472, + "balance_loss_mlp": 0.01257013, + "epoch": 0.4440703442056215, + "flos": 20236950040320.0, + "grad_norm": 1.6897241264536553, + "language_loss": 0.82179439, + "learning_rate": 2.455341666526582e-06, + "loss": 0.89907217, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.15734863, + "step": 7386, + "time_per_iteration": 2.497891426086426 + }, + { + "auxiliary_loss_clip": 0.06463334, + "auxiliary_loss_mlp": 0.01273049, + "balance_loss_clip": 0.06290253, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4441304674582895, + "flos": 39504163979520.0, + "grad_norm": 2.9557468241194624, + "language_loss": 0.70275033, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.78011411, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.15356445, + "step": 7387, + "time_per_iteration": 2.6782705783843994 + }, + { + "auxiliary_loss_clip": 0.06455649, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06289866, + "balance_loss_mlp": 0.01258206, + "epoch": 0.44419059071095746, + "flos": 14834902913280.0, + "grad_norm": 1.9684531060003607, + "language_loss": 0.72165161, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.79893732, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14715576, + "step": 7388, + "time_per_iteration": 2.5119476318359375 + }, + { + "auxiliary_loss_clip": 0.06464041, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06293739, + "balance_loss_mlp": 0.01255113, + "epoch": 0.4442507139636254, + "flos": 22644408424320.0, + "grad_norm": 1.566920019209845, + "language_loss": 0.69646138, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.77380753, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.15454102, + "step": 7389, + "time_per_iteration": 2.671290874481201 + }, + { + "auxiliary_loss_clip": 0.06455444, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06289597, + "balance_loss_mlp": 0.01255487, + "epoch": 0.4443108372162934, + "flos": 38299994553600.0, + "grad_norm": 1.918848783354648, + "language_loss": 0.74912727, + "learning_rate": 2.453824593752788e-06, + "loss": 0.82637799, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14135742, + "step": 7390, + "time_per_iteration": 2.6656923294067383 + }, + { + "auxiliary_loss_clip": 0.06453501, + "auxiliary_loss_mlp": 0.01269903, + "balance_loss_clip": 0.06290193, + "balance_loss_mlp": 0.0125657, + "epoch": 0.44437096046896135, + "flos": 17754790141440.0, + "grad_norm": 1.7902511429273704, + "language_loss": 0.82203722, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.89927119, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13323975, + "step": 7391, + "time_per_iteration": 2.5425097942352295 + }, + { + "auxiliary_loss_clip": 0.06451984, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06289234, + "balance_loss_mlp": 0.01254547, + "epoch": 0.4444310837216293, + "flos": 13736936937600.0, + "grad_norm": 1.5949305897923123, + "language_loss": 0.73880637, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.81601214, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14044189, + "step": 7392, + "time_per_iteration": 2.509695053100586 + }, + { + "auxiliary_loss_clip": 0.06450866, + "auxiliary_loss_mlp": 0.01269173, + "balance_loss_clip": 0.06287552, + "balance_loss_mlp": 0.01256424, + "epoch": 0.44449120697429734, + "flos": 25017346126080.0, + "grad_norm": 1.7319744549950544, + "language_loss": 0.79953551, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.87673593, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12744141, + "step": 7393, + "time_per_iteration": 2.6058006286621094 + }, + { + "auxiliary_loss_clip": 0.06460646, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06291801, + "balance_loss_mlp": 0.01255276, + "epoch": 0.4445513302269653, + "flos": 32680006156800.0, + "grad_norm": 1.76893741086752, + "language_loss": 0.8113097, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.88862437, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.15551758, + "step": 7394, + "time_per_iteration": 2.6408586502075195 + }, + { + "auxiliary_loss_clip": 0.06448914, + "auxiliary_loss_mlp": 0.01267892, + "balance_loss_clip": 0.06286056, + "balance_loss_mlp": 0.01254796, + "epoch": 0.44461145347963327, + "flos": 11660583162240.0, + "grad_norm": 2.0227503675909646, + "language_loss": 0.79471397, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.87188208, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13092041, + "step": 7395, + "time_per_iteration": 2.482771158218384 + }, + { + "auxiliary_loss_clip": 0.06457528, + "auxiliary_loss_mlp": 0.01269923, + "balance_loss_clip": 0.06293359, + "balance_loss_mlp": 0.01255838, + "epoch": 0.44467157673230123, + "flos": 20893079836800.0, + "grad_norm": 1.8465254869377097, + "language_loss": 0.68925393, + "learning_rate": 2.451548468607584e-06, + "loss": 0.76652849, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.14074707, + "step": 7396, + "time_per_iteration": 2.526031017303467 + }, + { + "auxiliary_loss_clip": 0.06458125, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06290217, + "balance_loss_mlp": 0.0125299, + "epoch": 0.4447316999849692, + "flos": 18551140945920.0, + "grad_norm": 2.1703937468753964, + "language_loss": 0.80956584, + "learning_rate": 2.451169054403126e-06, + "loss": 0.88681042, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.13342285, + "step": 7397, + "time_per_iteration": 2.482004404067993 + }, + { + "auxiliary_loss_clip": 0.06453413, + "auxiliary_loss_mlp": 0.01269867, + "balance_loss_clip": 0.06290947, + "balance_loss_mlp": 0.01256814, + "epoch": 0.44479182323763716, + "flos": 23775846906240.0, + "grad_norm": 2.7975733901761672, + "language_loss": 0.67842102, + "learning_rate": 2.450789623090293e-06, + "loss": 0.75565386, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13067627, + "step": 7398, + "time_per_iteration": 2.579227924346924 + }, + { + "auxiliary_loss_clip": 0.06451767, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06290427, + "balance_loss_mlp": 0.01256097, + "epoch": 0.44485194649030513, + "flos": 16549237123200.0, + "grad_norm": 1.6886298033370946, + "language_loss": 0.70454216, + "learning_rate": 2.450410174683472e-06, + "loss": 0.78174973, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12896729, + "step": 7399, + "time_per_iteration": 2.491422653198242 + }, + { + "auxiliary_loss_clip": 0.06448349, + "auxiliary_loss_mlp": 0.01267519, + "balance_loss_clip": 0.06287403, + "balance_loss_mlp": 0.01254543, + "epoch": 0.4449120697429731, + "flos": 22607455973760.0, + "grad_norm": 1.7365156462421643, + "language_loss": 0.72588718, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.80304587, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.12963867, + "step": 7400, + "time_per_iteration": 3.9914138317108154 + }, + { + "auxiliary_loss_clip": 0.06451382, + "auxiliary_loss_mlp": 0.01270619, + "balance_loss_clip": 0.06288703, + "balance_loss_mlp": 0.0125738, + "epoch": 0.44497219299564106, + "flos": 20009994456960.0, + "grad_norm": 1.5547932465186114, + "language_loss": 0.85223019, + "learning_rate": 2.449651226645422e-06, + "loss": 0.92945021, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13250732, + "step": 7401, + "time_per_iteration": 3.972844123840332 + }, + { + "auxiliary_loss_clip": 0.0644277, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01254099, + "epoch": 0.445032316248309, + "flos": 25601499665280.0, + "grad_norm": 1.7738805367720483, + "language_loss": 0.8345179, + "learning_rate": 2.449271727042973e-06, + "loss": 0.91160637, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.11968994, + "step": 7402, + "time_per_iteration": 2.546557664871216 + }, + { + "auxiliary_loss_clip": 0.06449325, + "auxiliary_loss_mlp": 0.0126916, + "balance_loss_clip": 0.06285563, + "balance_loss_mlp": 0.01255898, + "epoch": 0.445092439500977, + "flos": 21256608775680.0, + "grad_norm": 1.6765614973905527, + "language_loss": 0.77230763, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.84949255, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13275146, + "step": 7403, + "time_per_iteration": 2.540351152420044 + }, + { + "auxiliary_loss_clip": 0.06362203, + "auxiliary_loss_mlp": 0.01255762, + "balance_loss_clip": 0.0628911, + "balance_loss_mlp": 0.01252394, + "epoch": 0.44515256275364495, + "flos": 57781990506240.0, + "grad_norm": 0.751382178532419, + "language_loss": 0.60078514, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.67696476, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.03375244, + "step": 7404, + "time_per_iteration": 3.1188013553619385 + }, + { + "auxiliary_loss_clip": 0.06455964, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06287853, + "balance_loss_mlp": 0.01258462, + "epoch": 0.4452126860063129, + "flos": 15601386936960.0, + "grad_norm": 1.4877710129276585, + "language_loss": 0.82279229, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.90007967, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14312744, + "step": 7405, + "time_per_iteration": 2.5388095378875732 + }, + { + "auxiliary_loss_clip": 0.06447265, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06283686, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4452728092589809, + "flos": 21623995002240.0, + "grad_norm": 1.5786988713847923, + "language_loss": 0.75529754, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.83244896, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.12744141, + "step": 7406, + "time_per_iteration": 2.5249385833740234 + }, + { + "auxiliary_loss_clip": 0.06440533, + "auxiliary_loss_mlp": 0.01271164, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259094, + "epoch": 0.4453329325116489, + "flos": 29505267135360.0, + "grad_norm": 1.6524917293298949, + "language_loss": 0.65847838, + "learning_rate": 2.447373973772129e-06, + "loss": 0.73559535, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12060547, + "step": 7407, + "time_per_iteration": 3.998326063156128 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06284529, + "balance_loss_mlp": 0.01256777, + "epoch": 0.44539305576431687, + "flos": 21367549981440.0, + "grad_norm": 1.547450204556426, + "language_loss": 0.68216872, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.75936574, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13018799, + "step": 7408, + "time_per_iteration": 2.5295586585998535 + }, + { + "auxiliary_loss_clip": 0.06449315, + "auxiliary_loss_mlp": 0.01269644, + "balance_loss_clip": 0.06285807, + "balance_loss_mlp": 0.01256508, + "epoch": 0.44545317901698483, + "flos": 41437278000000.0, + "grad_norm": 2.0427525389439443, + "language_loss": 0.720608, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.79779756, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13134766, + "step": 7409, + "time_per_iteration": 2.678666114807129 + }, + { + "auxiliary_loss_clip": 0.06448312, + "auxiliary_loss_mlp": 0.01270862, + "balance_loss_clip": 0.06284307, + "balance_loss_mlp": 0.01257045, + "epoch": 0.4455133022696528, + "flos": 22061638477440.0, + "grad_norm": 1.7184461657241017, + "language_loss": 0.65940762, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.73659933, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13818359, + "step": 7410, + "time_per_iteration": 2.5486950874328613 + }, + { + "auxiliary_loss_clip": 0.06453686, + "auxiliary_loss_mlp": 0.01268565, + "balance_loss_clip": 0.06284985, + "balance_loss_mlp": 0.0125522, + "epoch": 0.44557342552232077, + "flos": 23483665319040.0, + "grad_norm": 3.696220183147237, + "language_loss": 0.74690163, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.82412422, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.13360596, + "step": 7411, + "time_per_iteration": 2.5290050506591797 + }, + { + "auxiliary_loss_clip": 0.0644176, + "auxiliary_loss_mlp": 0.01268016, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256166, + "epoch": 0.44563354877498873, + "flos": 19140577292160.0, + "grad_norm": 2.065063291172047, + "language_loss": 0.7906481, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.86774588, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11859131, + "step": 7412, + "time_per_iteration": 2.5156190395355225 + }, + { + "auxiliary_loss_clip": 0.0645022, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.06282784, + "balance_loss_mlp": 0.01255792, + "epoch": 0.4456936720276567, + "flos": 13625744169600.0, + "grad_norm": 2.15802472542835, + "language_loss": 0.80199099, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.87918305, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13195801, + "step": 7413, + "time_per_iteration": 3.9694504737854004 + }, + { + "auxiliary_loss_clip": 0.06443125, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0628258, + "balance_loss_mlp": 0.01254037, + "epoch": 0.44575379528032466, + "flos": 14717840359680.0, + "grad_norm": 1.9357576200238034, + "language_loss": 0.76531088, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.8424021, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.11956787, + "step": 7414, + "time_per_iteration": 2.515110731124878 + }, + { + "auxiliary_loss_clip": 0.06447163, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06286051, + "balance_loss_mlp": 0.01257177, + "epoch": 0.4458139185329926, + "flos": 24177586106880.0, + "grad_norm": 1.4166090983539044, + "language_loss": 0.84000552, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.91717345, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12457275, + "step": 7415, + "time_per_iteration": 2.528939723968506 + }, + { + "auxiliary_loss_clip": 0.06442896, + "auxiliary_loss_mlp": 0.01267494, + "balance_loss_clip": 0.06282021, + "balance_loss_mlp": 0.01254733, + "epoch": 0.4458740417856606, + "flos": 21768660276480.0, + "grad_norm": 1.9578275078246672, + "language_loss": 0.84485269, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.92195654, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12756348, + "step": 7416, + "time_per_iteration": 2.57027268409729 + }, + { + "auxiliary_loss_clip": 0.06451635, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06285699, + "balance_loss_mlp": 0.01255298, + "epoch": 0.44593416503832856, + "flos": 21075074904960.0, + "grad_norm": 1.7085615846271827, + "language_loss": 0.81362593, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.89082199, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.12670898, + "step": 7417, + "time_per_iteration": 2.547837734222412 + }, + { + "auxiliary_loss_clip": 0.06448114, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06283562, + "balance_loss_mlp": 0.01255601, + "epoch": 0.4459942882909965, + "flos": 22606910922240.0, + "grad_norm": 1.8801354401717048, + "language_loss": 0.81286234, + "learning_rate": 2.443197426237077e-06, + "loss": 0.89001989, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.12042236, + "step": 7418, + "time_per_iteration": 2.5529236793518066 + }, + { + "auxiliary_loss_clip": 0.06449951, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.06284475, + "balance_loss_mlp": 0.01255652, + "epoch": 0.4460544115436645, + "flos": 26512732817280.0, + "grad_norm": 1.8068813549808598, + "language_loss": 0.77866399, + "learning_rate": 2.442817638972991e-06, + "loss": 0.85584641, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.12646484, + "step": 7419, + "time_per_iteration": 2.637568235397339 + }, + { + "auxiliary_loss_clip": 0.06446308, + "auxiliary_loss_mlp": 0.01271146, + "balance_loss_clip": 0.06283416, + "balance_loss_mlp": 0.01258349, + "epoch": 0.4461145347963325, + "flos": 17609957159040.0, + "grad_norm": 3.5469346323262068, + "language_loss": 0.73053217, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.80770659, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12805176, + "step": 7420, + "time_per_iteration": 2.4839932918548584 + }, + { + "auxiliary_loss_clip": 0.06441851, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06283888, + "balance_loss_mlp": 0.01255176, + "epoch": 0.44617465804900047, + "flos": 27274982209920.0, + "grad_norm": 1.4177043979342248, + "language_loss": 0.75314558, + "learning_rate": 2.442058014084156e-06, + "loss": 0.83024418, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12823486, + "step": 7421, + "time_per_iteration": 2.6001040935516357 + }, + { + "auxiliary_loss_clip": 0.06439819, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06281345, + "balance_loss_mlp": 0.01254073, + "epoch": 0.44623478130166844, + "flos": 17792371497600.0, + "grad_norm": 1.9155365450665858, + "language_loss": 0.75864565, + "learning_rate": 2.44167817648821e-06, + "loss": 0.83570993, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12536621, + "step": 7422, + "time_per_iteration": 2.481241226196289 + }, + { + "auxiliary_loss_clip": 0.06447253, + "auxiliary_loss_mlp": 0.01267362, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01254804, + "epoch": 0.4462949045543364, + "flos": 23009698298880.0, + "grad_norm": 1.7347835392128452, + "language_loss": 0.65679651, + "learning_rate": 2.441298322143784e-06, + "loss": 0.73394263, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.12573242, + "step": 7423, + "time_per_iteration": 2.539268732070923 + }, + { + "auxiliary_loss_clip": 0.06440745, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06283564, + "balance_loss_mlp": 0.01256591, + "epoch": 0.44635502780700437, + "flos": 17825592441600.0, + "grad_norm": 1.4381231336851048, + "language_loss": 0.79473054, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.87182289, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.11901855, + "step": 7424, + "time_per_iteration": 2.488111972808838 + }, + { + "auxiliary_loss_clip": 0.06437074, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01256148, + "epoch": 0.44641515105967233, + "flos": 26695314864000.0, + "grad_norm": 1.3471148592694158, + "language_loss": 0.8055563, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.88260639, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.11791992, + "step": 7425, + "time_per_iteration": 2.598731756210327 + }, + { + "auxiliary_loss_clip": 0.06439465, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06279327, + "balance_loss_mlp": 0.01259536, + "epoch": 0.4464752743123403, + "flos": 18918778734720.0, + "grad_norm": 1.4143607287110962, + "language_loss": 0.77488291, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.85199511, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12207031, + "step": 7426, + "time_per_iteration": 2.494330406188965 + }, + { + "auxiliary_loss_clip": 0.06445856, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06281333, + "balance_loss_mlp": 0.01253773, + "epoch": 0.44653539756500826, + "flos": 29578081933440.0, + "grad_norm": 1.9924998088803147, + "language_loss": 0.64776599, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.72489762, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13513184, + "step": 7427, + "time_per_iteration": 2.611482858657837 + }, + { + "auxiliary_loss_clip": 0.06441574, + "auxiliary_loss_mlp": 0.01275968, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.0126372, + "epoch": 0.44659552081767623, + "flos": 21475137024000.0, + "grad_norm": 1.5780428941103348, + "language_loss": 0.75530696, + "learning_rate": 2.439398799698608e-06, + "loss": 0.8324824, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12268066, + "step": 7428, + "time_per_iteration": 2.505094051361084 + }, + { + "auxiliary_loss_clip": 0.06441561, + "auxiliary_loss_mlp": 0.01271156, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.0125843, + "epoch": 0.4466556440703442, + "flos": 17937791458560.0, + "grad_norm": 1.912744298925221, + "language_loss": 0.78478271, + "learning_rate": 2.439018845165806e-06, + "loss": 0.86190987, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12731934, + "step": 7429, + "time_per_iteration": 2.5107972621917725 + }, + { + "auxiliary_loss_clip": 0.06447433, + "auxiliary_loss_mlp": 0.0127403, + "balance_loss_clip": 0.06283738, + "balance_loss_mlp": 0.01260667, + "epoch": 0.44671576732301216, + "flos": 21114081780480.0, + "grad_norm": 1.7694096542013318, + "language_loss": 0.91354167, + "learning_rate": 2.438638873985366e-06, + "loss": 0.99075633, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13366699, + "step": 7430, + "time_per_iteration": 2.537428140640259 + }, + { + "auxiliary_loss_clip": 0.06451312, + "auxiliary_loss_mlp": 0.01271269, + "balance_loss_clip": 0.06282946, + "balance_loss_mlp": 0.01257792, + "epoch": 0.4467758905756801, + "flos": 23514873765120.0, + "grad_norm": 1.610238873942938, + "language_loss": 0.80143106, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.87865686, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1348877, + "step": 7431, + "time_per_iteration": 2.5611300468444824 + }, + { + "auxiliary_loss_clip": 0.06447126, + "auxiliary_loss_mlp": 0.01271916, + "balance_loss_clip": 0.06282945, + "balance_loss_mlp": 0.01258374, + "epoch": 0.4468360138283481, + "flos": 18739970121600.0, + "grad_norm": 1.9551980798487134, + "language_loss": 0.80273902, + "learning_rate": 2.437878881739204e-06, + "loss": 0.87992942, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13543701, + "step": 7432, + "time_per_iteration": 2.500554084777832 + }, + { + "auxiliary_loss_clip": 0.06450094, + "auxiliary_loss_mlp": 0.01273992, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.0126073, + "epoch": 0.4468961370810161, + "flos": 23483874954240.0, + "grad_norm": 1.835454334349629, + "language_loss": 0.76644909, + "learning_rate": 2.437498860702301e-06, + "loss": 0.84368992, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.13269043, + "step": 7433, + "time_per_iteration": 2.5840916633605957 + }, + { + "auxiliary_loss_clip": 0.06435596, + "auxiliary_loss_mlp": 0.01271551, + "balance_loss_clip": 0.06279343, + "balance_loss_mlp": 0.01260047, + "epoch": 0.4469562603336841, + "flos": 30081873807360.0, + "grad_norm": 1.6012992804544768, + "language_loss": 0.77581275, + "learning_rate": 2.437118823075398e-06, + "loss": 0.85288417, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1151123, + "step": 7434, + "time_per_iteration": 2.579667329788208 + }, + { + "auxiliary_loss_clip": 0.06443198, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278063, + "balance_loss_mlp": 0.01257439, + "epoch": 0.44701638358635204, + "flos": 22463126115840.0, + "grad_norm": 1.683412458990524, + "language_loss": 0.63887638, + "learning_rate": 2.436738768872905e-06, + "loss": 0.71601021, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.12750244, + "step": 7435, + "time_per_iteration": 2.5773611068725586 + }, + { + "auxiliary_loss_clip": 0.06444404, + "auxiliary_loss_mlp": 0.01272477, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01258714, + "epoch": 0.44707650683902, + "flos": 24064171205760.0, + "grad_norm": 1.5617494879233198, + "language_loss": 0.83911443, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.91628319, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13763428, + "step": 7436, + "time_per_iteration": 2.5204451084136963 + }, + { + "auxiliary_loss_clip": 0.0644998, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.0628316, + "balance_loss_mlp": 0.01254226, + "epoch": 0.44713663009168797, + "flos": 23773373210880.0, + "grad_norm": 1.7812959316100008, + "language_loss": 0.79632622, + "learning_rate": 2.435978610798798e-06, + "loss": 0.87351644, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.14819336, + "step": 7437, + "time_per_iteration": 2.564180374145508 + }, + { + "auxiliary_loss_clip": 0.0644551, + "auxiliary_loss_mlp": 0.01269936, + "balance_loss_clip": 0.06279416, + "balance_loss_mlp": 0.01256829, + "epoch": 0.44719675334435594, + "flos": 24506258947200.0, + "grad_norm": 1.814975751419929, + "language_loss": 0.72632974, + "learning_rate": 2.435598506956009e-06, + "loss": 0.8034842, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13116455, + "step": 7438, + "time_per_iteration": 2.601855993270874 + }, + { + "auxiliary_loss_clip": 0.06445266, + "auxiliary_loss_mlp": 0.01270946, + "balance_loss_clip": 0.06279082, + "balance_loss_mlp": 0.01257046, + "epoch": 0.4472568765970239, + "flos": 29788308627840.0, + "grad_norm": 3.3026679320519716, + "language_loss": 0.67660618, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.75376832, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13903809, + "step": 7439, + "time_per_iteration": 2.6503498554229736 + }, + { + "auxiliary_loss_clip": 0.06447087, + "auxiliary_loss_mlp": 0.01272251, + "balance_loss_clip": 0.06280239, + "balance_loss_mlp": 0.01257648, + "epoch": 0.44731699984969187, + "flos": 24649792191360.0, + "grad_norm": 1.6003212894552636, + "language_loss": 0.73896551, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.81615895, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.14605713, + "step": 7440, + "time_per_iteration": 4.026291608810425 + }, + { + "auxiliary_loss_clip": 0.06441355, + "auxiliary_loss_mlp": 0.01270172, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.0125722, + "epoch": 0.44737712310235983, + "flos": 29462570680320.0, + "grad_norm": 1.5530123963175664, + "language_loss": 0.74356592, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.82068115, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12963867, + "step": 7441, + "time_per_iteration": 2.5968191623687744 + }, + { + "auxiliary_loss_clip": 0.06443278, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01254983, + "epoch": 0.4474372463550278, + "flos": 24903260392320.0, + "grad_norm": 2.4580446492601014, + "language_loss": 0.75523049, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.83234674, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13378906, + "step": 7442, + "time_per_iteration": 2.6050899028778076 + }, + { + "auxiliary_loss_clip": 0.0645077, + "auxiliary_loss_mlp": 0.01276603, + "balance_loss_clip": 0.06281515, + "balance_loss_mlp": 0.01262644, + "epoch": 0.44749736960769576, + "flos": 33189835524480.0, + "grad_norm": 1.8304580376547321, + "language_loss": 0.74504036, + "learning_rate": 2.433697740261273e-06, + "loss": 0.82231408, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13952637, + "step": 7443, + "time_per_iteration": 2.590211868286133 + }, + { + "auxiliary_loss_clip": 0.06441949, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06278961, + "balance_loss_mlp": 0.01256605, + "epoch": 0.4475574928603637, + "flos": 21078596776320.0, + "grad_norm": 1.7164366382085705, + "language_loss": 0.78287792, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.86000234, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13891602, + "step": 7444, + "time_per_iteration": 2.554215669631958 + }, + { + "auxiliary_loss_clip": 0.06437638, + "auxiliary_loss_mlp": 0.01276986, + "balance_loss_clip": 0.06276217, + "balance_loss_mlp": 0.01263664, + "epoch": 0.4476176161130317, + "flos": 21867442640640.0, + "grad_norm": 2.3488437532538735, + "language_loss": 0.85014707, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.9272933, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13317871, + "step": 7445, + "time_per_iteration": 2.463123321533203 + }, + { + "auxiliary_loss_clip": 0.06446601, + "auxiliary_loss_mlp": 0.0127394, + "balance_loss_clip": 0.06279677, + "balance_loss_mlp": 0.01258312, + "epoch": 0.4476777393656997, + "flos": 22535270081280.0, + "grad_norm": 2.2137135091267135, + "language_loss": 0.64567178, + "learning_rate": 2.432557082778765e-06, + "loss": 0.72287714, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15631104, + "step": 7446, + "time_per_iteration": 3.9910571575164795 + }, + { + "auxiliary_loss_clip": 0.06349403, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01253975, + "epoch": 0.4477378626183677, + "flos": 49034236101120.0, + "grad_norm": 0.7348354325841562, + "language_loss": 0.49922079, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.57527786, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.0231781, + "step": 7447, + "time_per_iteration": 3.0209667682647705 + }, + { + "auxiliary_loss_clip": 0.06344398, + "auxiliary_loss_mlp": 0.01262514, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01260019, + "epoch": 0.44779798587103564, + "flos": 56562041784960.0, + "grad_norm": 0.8026230684928909, + "language_loss": 0.59334445, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.66941357, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.02493286, + "step": 7448, + "time_per_iteration": 3.2380871772766113 + }, + { + "auxiliary_loss_clip": 0.06443155, + "auxiliary_loss_mlp": 0.01270524, + "balance_loss_clip": 0.06277426, + "balance_loss_mlp": 0.01256994, + "epoch": 0.4478581091237036, + "flos": 46508933278080.0, + "grad_norm": 1.7384627548967189, + "language_loss": 0.59131092, + "learning_rate": 2.431416277672789e-06, + "loss": 0.66844773, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13537598, + "step": 7449, + "time_per_iteration": 2.7783467769622803 + }, + { + "auxiliary_loss_clip": 0.06440828, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_clip": 0.06277853, + "balance_loss_mlp": 0.01258868, + "epoch": 0.4479182323763716, + "flos": 20820768163200.0, + "grad_norm": 1.956040680672474, + "language_loss": 0.81008971, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.88721895, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.13220215, + "step": 7450, + "time_per_iteration": 2.488323450088501 + }, + { + "auxiliary_loss_clip": 0.06442301, + "auxiliary_loss_mlp": 0.01273054, + "balance_loss_clip": 0.06277788, + "balance_loss_mlp": 0.01259172, + "epoch": 0.44797835562903954, + "flos": 14251126717440.0, + "grad_norm": 2.5451576111358136, + "language_loss": 0.79348361, + "learning_rate": 2.430655659114697e-06, + "loss": 0.87063718, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13885498, + "step": 7451, + "time_per_iteration": 2.4923946857452393 + }, + { + "auxiliary_loss_clip": 0.06344576, + "auxiliary_loss_mlp": 0.0125349, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.0125126, + "epoch": 0.4480384788817075, + "flos": 63553436357760.0, + "grad_norm": 0.7850742570611701, + "language_loss": 0.62791413, + "learning_rate": 2.430275325332681e-06, + "loss": 0.70389479, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02233887, + "step": 7452, + "time_per_iteration": 3.2259254455566406 + }, + { + "auxiliary_loss_clip": 0.06441975, + "auxiliary_loss_mlp": 0.01272416, + "balance_loss_clip": 0.06277539, + "balance_loss_mlp": 0.01258874, + "epoch": 0.44809860213437547, + "flos": 21659018808960.0, + "grad_norm": 1.8053672901244522, + "language_loss": 0.62585479, + "learning_rate": 2.429894975234582e-06, + "loss": 0.70299876, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13537598, + "step": 7453, + "time_per_iteration": 3.928234577178955 + }, + { + "auxiliary_loss_clip": 0.06345223, + "auxiliary_loss_mlp": 0.01256622, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254279, + "epoch": 0.44815872538704343, + "flos": 69210586840320.0, + "grad_norm": 0.747363028090033, + "language_loss": 0.5699693, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.64598775, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02339172, + "step": 7454, + "time_per_iteration": 3.0569918155670166 + }, + { + "auxiliary_loss_clip": 0.06447325, + "auxiliary_loss_mlp": 0.01268938, + "balance_loss_clip": 0.06281178, + "balance_loss_mlp": 0.01255705, + "epoch": 0.4482188486397114, + "flos": 12602186219520.0, + "grad_norm": 1.9501180256269237, + "language_loss": 0.75448847, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.83165109, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13220215, + "step": 7455, + "time_per_iteration": 2.4410433769226074 + }, + { + "auxiliary_loss_clip": 0.06442874, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 0.06278916, + "balance_loss_mlp": 0.01254932, + "epoch": 0.44827897189237936, + "flos": 34066715702400.0, + "grad_norm": 1.6532992970231903, + "language_loss": 0.76341856, + "learning_rate": 2.428753827188016e-06, + "loss": 0.84053606, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1394043, + "step": 7456, + "time_per_iteration": 2.6695046424865723 + }, + { + "auxiliary_loss_clip": 0.06443818, + "auxiliary_loss_mlp": 0.01274223, + "balance_loss_clip": 0.06283055, + "balance_loss_mlp": 0.01261087, + "epoch": 0.44833909514504733, + "flos": 25153080940800.0, + "grad_norm": 1.8332154029673087, + "language_loss": 0.7703625, + "learning_rate": 2.428373411969818e-06, + "loss": 0.84754294, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13122559, + "step": 7457, + "time_per_iteration": 2.4982032775878906 + }, + { + "auxiliary_loss_clip": 0.06449621, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06282188, + "balance_loss_mlp": 0.01253269, + "epoch": 0.4483992183977153, + "flos": 16185498549120.0, + "grad_norm": 2.4281328609676254, + "language_loss": 0.68744391, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.7646122, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1394043, + "step": 7458, + "time_per_iteration": 2.4979610443115234 + }, + { + "auxiliary_loss_clip": 0.06448827, + "auxiliary_loss_mlp": 0.01274875, + "balance_loss_clip": 0.06280437, + "balance_loss_mlp": 0.01259592, + "epoch": 0.44845934165038326, + "flos": 17751352124160.0, + "grad_norm": 1.539492966179865, + "language_loss": 0.71756333, + "learning_rate": 2.427612532815961e-06, + "loss": 0.79480034, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.15283203, + "step": 7459, + "time_per_iteration": 2.482675075531006 + }, + { + "auxiliary_loss_clip": 0.06445904, + "auxiliary_loss_mlp": 0.01268873, + "balance_loss_clip": 0.06281781, + "balance_loss_mlp": 0.01255343, + "epoch": 0.4485194649030513, + "flos": 21842481323520.0, + "grad_norm": 1.7620296739852843, + "language_loss": 0.69945031, + "learning_rate": 2.427232068909154e-06, + "loss": 0.7765981, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13525391, + "step": 7460, + "time_per_iteration": 2.548891067504883 + }, + { + "auxiliary_loss_clip": 0.06446661, + "auxiliary_loss_mlp": 0.01267799, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01253744, + "epoch": 0.44857958815571924, + "flos": 20090775392640.0, + "grad_norm": 2.1567039258492637, + "language_loss": 0.77558124, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.85272586, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14068604, + "step": 7461, + "time_per_iteration": 2.488675832748413 + }, + { + "auxiliary_loss_clip": 0.0644468, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.0627977, + "balance_loss_mlp": 0.01252514, + "epoch": 0.4486397114083872, + "flos": 27060982081920.0, + "grad_norm": 1.6449935173844783, + "language_loss": 0.68081152, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.75792718, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.14367676, + "step": 7462, + "time_per_iteration": 2.5873477458953857 + }, + { + "auxiliary_loss_clip": 0.06346884, + "auxiliary_loss_mlp": 0.01259781, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01257521, + "epoch": 0.4486998346610552, + "flos": 67339386587520.0, + "grad_norm": 0.7371865357722727, + "language_loss": 0.54459572, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.62066233, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.0226593, + "step": 7463, + "time_per_iteration": 3.135831594467163 + }, + { + "auxiliary_loss_clip": 0.06446455, + "auxiliary_loss_mlp": 0.01271071, + "balance_loss_clip": 0.06283797, + "balance_loss_mlp": 0.01257344, + "epoch": 0.44875995791372314, + "flos": 27644297080320.0, + "grad_norm": 1.768714620285087, + "language_loss": 0.76698768, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.844163, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13726807, + "step": 7464, + "time_per_iteration": 2.5624353885650635 + }, + { + "auxiliary_loss_clip": 0.06442145, + "auxiliary_loss_mlp": 0.01270123, + "balance_loss_clip": 0.06281784, + "balance_loss_mlp": 0.01257063, + "epoch": 0.4488200811663911, + "flos": 13010969162880.0, + "grad_norm": 1.8955897931068166, + "language_loss": 0.74468267, + "learning_rate": 2.425329506653441e-06, + "loss": 0.82180536, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.13043213, + "step": 7465, + "time_per_iteration": 2.4702823162078857 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272918, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01257391, + "epoch": 0.44888020441905907, + "flos": 27497283891840.0, + "grad_norm": 2.0464026275546314, + "language_loss": 0.80248308, + "learning_rate": 2.424948945758966e-06, + "loss": 0.87977397, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1552124, + "step": 7466, + "time_per_iteration": 2.542721748352051 + }, + { + "auxiliary_loss_clip": 0.06448439, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01255735, + "epoch": 0.44894032767172704, + "flos": 18265541904000.0, + "grad_norm": 2.2890338528416416, + "language_loss": 0.80875736, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.88593197, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.13293457, + "step": 7467, + "time_per_iteration": 2.4503378868103027 + }, + { + "auxiliary_loss_clip": 0.06442044, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.06284908, + "balance_loss_mlp": 0.01256465, + "epoch": 0.449000450924395, + "flos": 21586245937920.0, + "grad_norm": 2.2421166338055762, + "language_loss": 0.75738609, + "learning_rate": 2.424187775642129e-06, + "loss": 0.83449709, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12597656, + "step": 7468, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.06448267, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06286301, + "balance_loss_mlp": 0.01257993, + "epoch": 0.44906057417706297, + "flos": 17973737660160.0, + "grad_norm": 2.1198815882874626, + "language_loss": 0.71292973, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.79011655, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.12414551, + "step": 7469, + "time_per_iteration": 2.4725160598754883 + }, + { + "auxiliary_loss_clip": 0.06450349, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06284628, + "balance_loss_mlp": 0.0125427, + "epoch": 0.44912069742973093, + "flos": 20053487525760.0, + "grad_norm": 1.6969020049584582, + "language_loss": 0.7254343, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.80261958, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.13903809, + "step": 7470, + "time_per_iteration": 2.5212604999542236 + }, + { + "auxiliary_loss_clip": 0.06447989, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.0628368, + "balance_loss_mlp": 0.01255951, + "epoch": 0.4491808206823989, + "flos": 21040009171200.0, + "grad_norm": 2.607168963621531, + "language_loss": 0.77266711, + "learning_rate": 2.423045899863634e-06, + "loss": 0.84984034, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13397217, + "step": 7471, + "time_per_iteration": 2.4833462238311768 + }, + { + "auxiliary_loss_clip": 0.0644739, + "auxiliary_loss_mlp": 0.01274961, + "balance_loss_clip": 0.06286953, + "balance_loss_mlp": 0.01261579, + "epoch": 0.44924094393506686, + "flos": 22973919805440.0, + "grad_norm": 1.613716342828386, + "language_loss": 0.69996417, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.77718765, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1338501, + "step": 7472, + "time_per_iteration": 2.5575385093688965 + }, + { + "auxiliary_loss_clip": 0.06348881, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.0627597, + "balance_loss_mlp": 0.01260363, + "epoch": 0.4493010671877349, + "flos": 59252332026240.0, + "grad_norm": 0.7278471165666979, + "language_loss": 0.61657208, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.69269097, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.02650452, + "step": 7473, + "time_per_iteration": 3.1560816764831543 + }, + { + "auxiliary_loss_clip": 0.06448925, + "auxiliary_loss_mlp": 0.01270251, + "balance_loss_clip": 0.0628556, + "balance_loss_mlp": 0.01256417, + "epoch": 0.44936119044040285, + "flos": 18010815891840.0, + "grad_norm": 2.7240719920550873, + "language_loss": 0.77420998, + "learning_rate": 2.421903879707657e-06, + "loss": 0.85140175, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13830566, + "step": 7474, + "time_per_iteration": 2.4717578887939453 + }, + { + "auxiliary_loss_clip": 0.06442197, + "auxiliary_loss_mlp": 0.01276021, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.0126264, + "epoch": 0.4494213136930708, + "flos": 21258243930240.0, + "grad_norm": 2.650117553560035, + "language_loss": 0.72072601, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.79790819, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7475, + "time_per_iteration": 2.513819456100464 + }, + { + "auxiliary_loss_clip": 0.06442311, + "auxiliary_loss_mlp": 0.01269894, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256954, + "epoch": 0.4494814369457388, + "flos": 27426271956480.0, + "grad_norm": 1.759412456892788, + "language_loss": 0.77338856, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.8505106, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1295166, + "step": 7476, + "time_per_iteration": 2.5318853855133057 + }, + { + "auxiliary_loss_clip": 0.06449737, + "auxiliary_loss_mlp": 0.01271172, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01256754, + "epoch": 0.44954156019840674, + "flos": 22860211415040.0, + "grad_norm": 1.712065897066968, + "language_loss": 0.71606135, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.79327047, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.144104, + "step": 7477, + "time_per_iteration": 2.532437324523926 + }, + { + "auxiliary_loss_clip": 0.06452323, + "auxiliary_loss_mlp": 0.01271774, + "balance_loss_clip": 0.06283113, + "balance_loss_mlp": 0.01257457, + "epoch": 0.4496016834510747, + "flos": 17207253636480.0, + "grad_norm": 8.505711381360525, + "language_loss": 0.68249893, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.75973988, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14331055, + "step": 7478, + "time_per_iteration": 2.4901106357574463 + }, + { + "auxiliary_loss_clip": 0.06438291, + "auxiliary_loss_mlp": 0.01274211, + "balance_loss_clip": 0.06278055, + "balance_loss_mlp": 0.01261676, + "epoch": 0.4496618067037427, + "flos": 18922636022400.0, + "grad_norm": 1.7939017561082606, + "language_loss": 0.89897281, + "learning_rate": 2.420000193000779e-06, + "loss": 0.97609776, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12518311, + "step": 7479, + "time_per_iteration": 3.9324028491973877 + }, + { + "auxiliary_loss_clip": 0.06445809, + "auxiliary_loss_mlp": 0.01275156, + "balance_loss_clip": 0.06282537, + "balance_loss_mlp": 0.01261304, + "epoch": 0.44972192995641064, + "flos": 21037828965120.0, + "grad_norm": 1.5817445570827902, + "language_loss": 0.75620329, + "learning_rate": 2.419619407822302e-06, + "loss": 0.833413, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.13861084, + "step": 7480, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.06450936, + "auxiliary_loss_mlp": 0.01270868, + "balance_loss_clip": 0.06283928, + "balance_loss_mlp": 0.01257033, + "epoch": 0.4497820532090786, + "flos": 20783354515200.0, + "grad_norm": 2.4818923045987233, + "language_loss": 0.79794782, + "learning_rate": 2.419238606731815e-06, + "loss": 0.87516582, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.1385498, + "step": 7481, + "time_per_iteration": 2.511104106903076 + }, + { + "auxiliary_loss_clip": 0.06439544, + "auxiliary_loss_mlp": 0.01274879, + "balance_loss_clip": 0.06280965, + "balance_loss_mlp": 0.01261003, + "epoch": 0.44984217646174657, + "flos": 33811067295360.0, + "grad_norm": 1.5325857273153378, + "language_loss": 0.68501163, + "learning_rate": 2.418857789743758e-06, + "loss": 0.76215583, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13873291, + "step": 7482, + "time_per_iteration": 2.6323177814483643 + }, + { + "auxiliary_loss_clip": 0.06449723, + "auxiliary_loss_mlp": 0.01275016, + "balance_loss_clip": 0.06284413, + "balance_loss_mlp": 0.01261236, + "epoch": 0.44990229971441453, + "flos": 15522953915520.0, + "grad_norm": 2.4692742165129347, + "language_loss": 0.85184467, + "learning_rate": 2.418476956872571e-06, + "loss": 0.92909217, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13775635, + "step": 7483, + "time_per_iteration": 2.5510005950927734 + }, + { + "auxiliary_loss_clip": 0.0644832, + "auxiliary_loss_mlp": 0.01272458, + "balance_loss_clip": 0.06278956, + "balance_loss_mlp": 0.01259017, + "epoch": 0.4499624229670825, + "flos": 29869676542080.0, + "grad_norm": 2.2555510336477362, + "language_loss": 0.81026614, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.88747394, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13439941, + "step": 7484, + "time_per_iteration": 2.5549514293670654 + }, + { + "auxiliary_loss_clip": 0.06454043, + "auxiliary_loss_mlp": 0.01271307, + "balance_loss_clip": 0.06282799, + "balance_loss_mlp": 0.01257133, + "epoch": 0.45002254621975046, + "flos": 18519345521280.0, + "grad_norm": 3.0066277785462296, + "language_loss": 0.75523663, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.83249015, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.14172363, + "step": 7485, + "time_per_iteration": 2.5260515213012695 + }, + { + "auxiliary_loss_clip": 0.06353837, + "auxiliary_loss_mlp": 0.01254878, + "balance_loss_clip": 0.06280266, + "balance_loss_mlp": 0.01252054, + "epoch": 0.4500826694724185, + "flos": 70438753261440.0, + "grad_norm": 0.7710237062022668, + "language_loss": 0.58055162, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.65663874, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02819824, + "step": 7486, + "time_per_iteration": 4.631975173950195 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.0125523, + "epoch": 0.45014279272508645, + "flos": 15784388254080.0, + "grad_norm": 2.313810641491004, + "language_loss": 0.83291382, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.91005504, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13336182, + "step": 7487, + "time_per_iteration": 2.4474549293518066 + }, + { + "auxiliary_loss_clip": 0.06440553, + "auxiliary_loss_mlp": 0.01274868, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01260879, + "epoch": 0.4502029159777544, + "flos": 21806157778560.0, + "grad_norm": 1.8256144522955593, + "language_loss": 0.77817398, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.8553282, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13983154, + "step": 7488, + "time_per_iteration": 2.5497655868530273 + }, + { + "auxiliary_loss_clip": 0.0645895, + "auxiliary_loss_mlp": 0.01273187, + "balance_loss_clip": 0.06284817, + "balance_loss_mlp": 0.01257773, + "epoch": 0.4502630392304224, + "flos": 28775651708160.0, + "grad_norm": 2.1057521417086194, + "language_loss": 0.72464138, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.80196273, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.15405273, + "step": 7489, + "time_per_iteration": 2.536022186279297 + }, + { + "auxiliary_loss_clip": 0.06449728, + "auxiliary_loss_mlp": 0.01273963, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.012597, + "epoch": 0.45032316248309034, + "flos": 15848398373760.0, + "grad_norm": 2.178444480440472, + "language_loss": 0.70506239, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.78229928, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14263916, + "step": 7490, + "time_per_iteration": 2.5048370361328125 + }, + { + "auxiliary_loss_clip": 0.06351414, + "auxiliary_loss_mlp": 0.01254304, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01251552, + "epoch": 0.4503832857357583, + "flos": 57873337056000.0, + "grad_norm": 0.766905441156629, + "language_loss": 0.56608462, + "learning_rate": 2.415429723843495e-06, + "loss": 0.64214182, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02757263, + "step": 7491, + "time_per_iteration": 3.1021111011505127 + }, + { + "auxiliary_loss_clip": 0.06440033, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06278066, + "balance_loss_mlp": 0.01257217, + "epoch": 0.4504434089884263, + "flos": 23884817541120.0, + "grad_norm": 1.940533812141729, + "language_loss": 0.79471588, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.87182283, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13446045, + "step": 7492, + "time_per_iteration": 3.906813144683838 + }, + { + "auxiliary_loss_clip": 0.06454505, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06282404, + "balance_loss_mlp": 0.01257925, + "epoch": 0.45050353224109424, + "flos": 17790820197120.0, + "grad_norm": 2.4926790281130566, + "language_loss": 0.92799652, + "learning_rate": 2.4146677577659573e-06, + "loss": 1.00526834, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.14758301, + "step": 7493, + "time_per_iteration": 2.516523838043213 + }, + { + "auxiliary_loss_clip": 0.06351101, + "auxiliary_loss_mlp": 0.01253906, + "balance_loss_clip": 0.06277501, + "balance_loss_mlp": 0.01251232, + "epoch": 0.4505636554937622, + "flos": 65081960138880.0, + "grad_norm": 0.7917943169613642, + "language_loss": 0.62850708, + "learning_rate": 2.4142867511336e-06, + "loss": 0.70455718, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02676392, + "step": 7494, + "time_per_iteration": 3.200533866882324 + }, + { + "auxiliary_loss_clip": 0.06439039, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275568, + "balance_loss_mlp": 0.01255305, + "epoch": 0.45062377874643017, + "flos": 22206597240960.0, + "grad_norm": 1.3576432808579277, + "language_loss": 0.8187722, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.89584428, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12860107, + "step": 7495, + "time_per_iteration": 2.6740329265594482 + }, + { + "auxiliary_loss_clip": 0.06444755, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279008, + "balance_loss_mlp": 0.01253344, + "epoch": 0.45068390199909814, + "flos": 37679433615360.0, + "grad_norm": 3.4533684270887988, + "language_loss": 0.85559022, + "learning_rate": 2.41352469075395e-06, + "loss": 0.93270886, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13775635, + "step": 7496, + "time_per_iteration": 2.6514453887939453 + }, + { + "auxiliary_loss_clip": 0.06445448, + "auxiliary_loss_mlp": 0.01271465, + "balance_loss_clip": 0.06277982, + "balance_loss_mlp": 0.01258042, + "epoch": 0.4507440252517661, + "flos": 22307853300480.0, + "grad_norm": 2.147795774994512, + "language_loss": 0.76396865, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.84113777, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13427734, + "step": 7497, + "time_per_iteration": 2.5248610973358154 + }, + { + "auxiliary_loss_clip": 0.0644587, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01254189, + "epoch": 0.45080414850443407, + "flos": 13193425428480.0, + "grad_norm": 1.9297018893586142, + "language_loss": 0.75253481, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.82967794, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.14245605, + "step": 7498, + "time_per_iteration": 2.482625722885132 + }, + { + "auxiliary_loss_clip": 0.06447846, + "auxiliary_loss_mlp": 0.01269776, + "balance_loss_clip": 0.06277958, + "balance_loss_mlp": 0.01255697, + "epoch": 0.4508642717571021, + "flos": 21951451958400.0, + "grad_norm": 1.9463705761270829, + "language_loss": 0.70564914, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.78282535, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.14093018, + "step": 7499, + "time_per_iteration": 2.5338642597198486 + }, + { + "auxiliary_loss_clip": 0.06449613, + "auxiliary_loss_mlp": 0.01268145, + "balance_loss_clip": 0.06278396, + "balance_loss_mlp": 0.0125412, + "epoch": 0.45092439500977005, + "flos": 23374149632640.0, + "grad_norm": 2.119825325087625, + "language_loss": 0.77484369, + "learning_rate": 2.412000381939477e-06, + "loss": 0.85202128, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.14025879, + "step": 7500, + "time_per_iteration": 2.5290849208831787 + }, + { + "auxiliary_loss_clip": 0.06441833, + "auxiliary_loss_mlp": 0.01275038, + "balance_loss_clip": 0.06276967, + "balance_loss_mlp": 0.01262211, + "epoch": 0.450984518262438, + "flos": 20778532905600.0, + "grad_norm": 2.0513851791377014, + "language_loss": 0.62714708, + "learning_rate": 2.411619265641992e-06, + "loss": 0.70431578, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.12823486, + "step": 7501, + "time_per_iteration": 2.513014316558838 + }, + { + "auxiliary_loss_clip": 0.06447023, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.0627754, + "balance_loss_mlp": 0.01255251, + "epoch": 0.451044641515106, + "flos": 17712303321600.0, + "grad_norm": 1.7676077358786102, + "language_loss": 0.8475225, + "learning_rate": 2.411238133735863e-06, + "loss": 0.92468631, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14111328, + "step": 7502, + "time_per_iteration": 2.502213954925537 + }, + { + "auxiliary_loss_clip": 0.06440664, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01256967, + "epoch": 0.45110476476777395, + "flos": 20600940176640.0, + "grad_norm": 1.2963550821027272, + "language_loss": 0.79440266, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8715173, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.13824463, + "step": 7503, + "time_per_iteration": 2.539870023727417 + }, + { + "auxiliary_loss_clip": 0.0643944, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06278714, + "balance_loss_mlp": 0.01257213, + "epoch": 0.4511648880204419, + "flos": 16039533536640.0, + "grad_norm": 2.8864102182872746, + "language_loss": 0.80966014, + "learning_rate": 2.410475823155484e-06, + "loss": 0.88676035, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13354492, + "step": 7504, + "time_per_iteration": 2.4834609031677246 + }, + { + "auxiliary_loss_clip": 0.06439783, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06277721, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4512250112731099, + "flos": 23984103029760.0, + "grad_norm": 1.8935476867238503, + "language_loss": 0.63783783, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.71491182, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1350708, + "step": 7505, + "time_per_iteration": 2.5183863639831543 + }, + { + "auxiliary_loss_clip": 0.06338686, + "auxiliary_loss_mlp": 0.0125649, + "balance_loss_clip": 0.06265638, + "balance_loss_mlp": 0.01253881, + "epoch": 0.45128513452577784, + "flos": 71484239053440.0, + "grad_norm": 0.8179087732062593, + "language_loss": 0.58726048, + "learning_rate": 2.409713450313968e-06, + "loss": 0.66321218, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02610779, + "step": 7506, + "time_per_iteration": 3.2057392597198486 + }, + { + "auxiliary_loss_clip": 0.06438521, + "auxiliary_loss_mlp": 0.01269482, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01255987, + "epoch": 0.4513452577784458, + "flos": 22097375043840.0, + "grad_norm": 1.6199933066680872, + "language_loss": 0.79207951, + "learning_rate": 2.40933224058142e-06, + "loss": 0.86915958, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1348877, + "step": 7507, + "time_per_iteration": 2.485177993774414 + }, + { + "auxiliary_loss_clip": 0.0644455, + "auxiliary_loss_mlp": 0.01270991, + "balance_loss_clip": 0.06277668, + "balance_loss_mlp": 0.01256543, + "epoch": 0.4514053810311138, + "flos": 24282699454080.0, + "grad_norm": 1.6041025363642085, + "language_loss": 0.74460357, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.82175899, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.14440918, + "step": 7508, + "time_per_iteration": 2.5957343578338623 + }, + { + "auxiliary_loss_clip": 0.06439587, + "auxiliary_loss_mlp": 0.01271402, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01258552, + "epoch": 0.45146550428378174, + "flos": 17891237715840.0, + "grad_norm": 2.0541508842975946, + "language_loss": 0.79828942, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.87539923, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12841797, + "step": 7509, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01270525, + "balance_loss_clip": 0.06278946, + "balance_loss_mlp": 0.01257746, + "epoch": 0.4515256275364497, + "flos": 24250317050880.0, + "grad_norm": 1.7065874480024321, + "language_loss": 0.73257631, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.80969501, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12774658, + "step": 7510, + "time_per_iteration": 2.5448224544525146 + }, + { + "auxiliary_loss_clip": 0.06438527, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01255707, + "epoch": 0.45158575078911767, + "flos": 20637263721600.0, + "grad_norm": 1.688618785836195, + "language_loss": 0.77059448, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.8476727, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13598633, + "step": 7511, + "time_per_iteration": 2.48913311958313 + }, + { + "auxiliary_loss_clip": 0.06443627, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06277004, + "balance_loss_mlp": 0.0125543, + "epoch": 0.45164587404178563, + "flos": 23333884945920.0, + "grad_norm": 1.5549799825793658, + "language_loss": 0.79259372, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.86973357, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14929199, + "step": 7512, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06447546, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01252219, + "epoch": 0.45170599729445365, + "flos": 23812841283840.0, + "grad_norm": 2.088368619040166, + "language_loss": 0.87660837, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.95375133, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.14538574, + "step": 7513, + "time_per_iteration": 2.50119686126709 + }, + { + "auxiliary_loss_clip": 0.06437154, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01259963, + "epoch": 0.4517661205471216, + "flos": 23519569593600.0, + "grad_norm": 1.9321046654640033, + "language_loss": 0.67692971, + "learning_rate": 2.406663338649419e-06, + "loss": 0.75402474, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1237793, + "step": 7514, + "time_per_iteration": 2.548349618911743 + }, + { + "auxiliary_loss_clip": 0.0644633, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01258017, + "epoch": 0.4518262437997896, + "flos": 23520743550720.0, + "grad_norm": 2.108913826152056, + "language_loss": 0.69738746, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7745769, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14587402, + "step": 7515, + "time_per_iteration": 2.5203166007995605 + }, + { + "auxiliary_loss_clip": 0.06448089, + "auxiliary_loss_mlp": 0.01273292, + "balance_loss_clip": 0.06278358, + "balance_loss_mlp": 0.01258379, + "epoch": 0.45188636705245755, + "flos": 14572210763520.0, + "grad_norm": 2.327142049261069, + "language_loss": 0.81245089, + "learning_rate": 2.405900656236963e-06, + "loss": 0.88966471, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14916992, + "step": 7516, + "time_per_iteration": 2.5070860385894775 + }, + { + "auxiliary_loss_clip": 0.06440821, + "auxiliary_loss_mlp": 0.01272469, + "balance_loss_clip": 0.0627999, + "balance_loss_mlp": 0.01259899, + "epoch": 0.4519464903051255, + "flos": 19907690221440.0, + "grad_norm": 1.8586788547852597, + "language_loss": 0.65825433, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.73538721, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12573242, + "step": 7517, + "time_per_iteration": 2.4824438095092773 + }, + { + "auxiliary_loss_clip": 0.06439231, + "auxiliary_loss_mlp": 0.01270445, + "balance_loss_clip": 0.06279515, + "balance_loss_mlp": 0.01257923, + "epoch": 0.4520066135577935, + "flos": 18850492056960.0, + "grad_norm": 1.7463164288041955, + "language_loss": 0.63218093, + "learning_rate": 2.405137912257333e-06, + "loss": 0.70927775, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12524414, + "step": 7518, + "time_per_iteration": 2.5339365005493164 + }, + { + "auxiliary_loss_clip": 0.0644324, + "auxiliary_loss_mlp": 0.01270416, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.0125713, + "epoch": 0.45206673681046144, + "flos": 48225279985920.0, + "grad_norm": 1.4167266474258036, + "language_loss": 0.59749353, + "learning_rate": 2.404756517215982e-06, + "loss": 0.67463017, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13287354, + "step": 7519, + "time_per_iteration": 4.238602876663208 + }, + { + "auxiliary_loss_clip": 0.06444496, + "auxiliary_loss_mlp": 0.01271755, + "balance_loss_clip": 0.06278859, + "balance_loss_mlp": 0.0125789, + "epoch": 0.4521268600631294, + "flos": 23848997120640.0, + "grad_norm": 1.307309529899749, + "language_loss": 0.72893107, + "learning_rate": 2.404375106826223e-06, + "loss": 0.80609363, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13848877, + "step": 7520, + "time_per_iteration": 2.5295658111572266 + }, + { + "auxiliary_loss_clip": 0.06438812, + "auxiliary_loss_mlp": 0.01272031, + "balance_loss_clip": 0.062758, + "balance_loss_mlp": 0.01257875, + "epoch": 0.4521869833157974, + "flos": 18849611589120.0, + "grad_norm": 1.9694306251575102, + "language_loss": 0.75821477, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.83532321, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14147949, + "step": 7521, + "time_per_iteration": 2.51493763923645 + }, + { + "auxiliary_loss_clip": 0.06448258, + "auxiliary_loss_mlp": 0.01268765, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01255485, + "epoch": 0.45224710656846534, + "flos": 19793520633600.0, + "grad_norm": 2.0145516283749334, + "language_loss": 0.68112928, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.75829947, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1328125, + "step": 7522, + "time_per_iteration": 2.4986941814422607 + }, + { + "auxiliary_loss_clip": 0.06441501, + "auxiliary_loss_mlp": 0.0127253, + "balance_loss_clip": 0.06278691, + "balance_loss_mlp": 0.01258797, + "epoch": 0.4523072298211333, + "flos": 28263558280320.0, + "grad_norm": 1.4118666030005445, + "language_loss": 0.61165464, + "learning_rate": 2.403230783711134e-06, + "loss": 0.68879497, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13739014, + "step": 7523, + "time_per_iteration": 2.5918800830841064 + }, + { + "auxiliary_loss_clip": 0.06446532, + "auxiliary_loss_mlp": 0.01271231, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01256187, + "epoch": 0.45236735307380127, + "flos": 11185651820160.0, + "grad_norm": 1.7682897571754845, + "language_loss": 0.78361082, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.86078846, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.15057373, + "step": 7524, + "time_per_iteration": 2.4915785789489746 + }, + { + "auxiliary_loss_clip": 0.06441181, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06277032, + "balance_loss_mlp": 0.01259527, + "epoch": 0.45242747632646924, + "flos": 22607959098240.0, + "grad_norm": 1.5918865124670334, + "language_loss": 0.63704681, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.71418512, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13122559, + "step": 7525, + "time_per_iteration": 4.0678441524505615 + }, + { + "auxiliary_loss_clip": 0.06439088, + "auxiliary_loss_mlp": 0.01272795, + "balance_loss_clip": 0.06279112, + "balance_loss_mlp": 0.01260153, + "epoch": 0.45248759957913726, + "flos": 18261558835200.0, + "grad_norm": 33.97196740045056, + "language_loss": 0.78961569, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8667345, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12664795, + "step": 7526, + "time_per_iteration": 2.4813144207000732 + }, + { + "auxiliary_loss_clip": 0.06437138, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.01257493, + "epoch": 0.4525477228318052, + "flos": 22455746956800.0, + "grad_norm": 1.6415997795559136, + "language_loss": 0.81301343, + "learning_rate": 2.40170480555747e-06, + "loss": 0.89009607, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13641357, + "step": 7527, + "time_per_iteration": 2.5056183338165283 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280501, + "balance_loss_mlp": 0.01258106, + "epoch": 0.4526078460844732, + "flos": 29652909229440.0, + "grad_norm": 1.731340365534577, + "language_loss": 0.65853465, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.73566198, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12866211, + "step": 7528, + "time_per_iteration": 2.6073391437530518 + }, + { + "auxiliary_loss_clip": 0.06439637, + "auxiliary_loss_mlp": 0.0127116, + "balance_loss_clip": 0.06280227, + "balance_loss_mlp": 0.01257296, + "epoch": 0.45266796933714115, + "flos": 23046483041280.0, + "grad_norm": 1.6874802957215247, + "language_loss": 0.75494301, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.83205104, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13867188, + "step": 7529, + "time_per_iteration": 2.5490171909332275 + }, + { + "auxiliary_loss_clip": 0.06443143, + "auxiliary_loss_mlp": 0.01270284, + "balance_loss_clip": 0.06278682, + "balance_loss_mlp": 0.0125614, + "epoch": 0.4527280925898091, + "flos": 14433582983040.0, + "grad_norm": 5.318026120447717, + "language_loss": 0.73199093, + "learning_rate": 2.400560161948384e-06, + "loss": 0.80912519, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7530, + "time_per_iteration": 2.4709434509277344 + }, + { + "auxiliary_loss_clip": 0.06441925, + "auxiliary_loss_mlp": 0.01267178, + "balance_loss_clip": 0.06279813, + "balance_loss_mlp": 0.01253857, + "epoch": 0.4527882158424771, + "flos": 22931432985600.0, + "grad_norm": 1.7055117614079858, + "language_loss": 0.76767921, + "learning_rate": 2.400178583680834e-06, + "loss": 0.84477019, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13336182, + "step": 7531, + "time_per_iteration": 3.9209694862365723 + }, + { + "auxiliary_loss_clip": 0.06439964, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01253018, + "epoch": 0.45284833909514505, + "flos": 25562157373440.0, + "grad_norm": 1.5452453614533965, + "language_loss": 0.67367595, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.75073636, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1305542, + "step": 7532, + "time_per_iteration": 2.5799813270568848 + }, + { + "auxiliary_loss_clip": 0.06441537, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280663, + "balance_loss_mlp": 0.01257206, + "epoch": 0.452908462347813, + "flos": 18155816582400.0, + "grad_norm": 2.362226158293886, + "language_loss": 0.78750062, + "learning_rate": 2.399415381635768e-06, + "loss": 0.86461282, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12481689, + "step": 7533, + "time_per_iteration": 2.4713315963745117 + }, + { + "auxiliary_loss_clip": 0.06451754, + "auxiliary_loss_mlp": 0.01272809, + "balance_loss_clip": 0.06279968, + "balance_loss_mlp": 0.01257849, + "epoch": 0.452968585600481, + "flos": 19068810670080.0, + "grad_norm": 1.7736608700696739, + "language_loss": 0.83544481, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.9126904, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1494751, + "step": 7534, + "time_per_iteration": 2.632647752761841 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01272735, + "balance_loss_clip": 0.06281491, + "balance_loss_mlp": 0.01258597, + "epoch": 0.45302870885314894, + "flos": 22057823116800.0, + "grad_norm": 1.5477368000033016, + "language_loss": 0.77199811, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.84919739, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14129639, + "step": 7535, + "time_per_iteration": 2.504075765609741 + }, + { + "auxiliary_loss_clip": 0.06444988, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06283444, + "balance_loss_mlp": 0.01254453, + "epoch": 0.4530888321058169, + "flos": 20382495782400.0, + "grad_norm": 1.553658728431748, + "language_loss": 0.80988163, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.88700247, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12640381, + "step": 7536, + "time_per_iteration": 2.5701963901519775 + }, + { + "auxiliary_loss_clip": 0.06448273, + "auxiliary_loss_mlp": 0.01269034, + "balance_loss_clip": 0.06281114, + "balance_loss_mlp": 0.01255617, + "epoch": 0.4531489553584849, + "flos": 14835783381120.0, + "grad_norm": 1.8444336957712972, + "language_loss": 0.76206815, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.83924115, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.13427734, + "step": 7537, + "time_per_iteration": 2.4535741806030273 + }, + { + "auxiliary_loss_clip": 0.06453362, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06287456, + "balance_loss_mlp": 0.0125526, + "epoch": 0.45320907861115284, + "flos": 21951493885440.0, + "grad_norm": 1.8251133101176713, + "language_loss": 0.75698435, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.83420891, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13830566, + "step": 7538, + "time_per_iteration": 2.5437614917755127 + }, + { + "auxiliary_loss_clip": 0.06342177, + "auxiliary_loss_mlp": 0.01255931, + "balance_loss_clip": 0.06267795, + "balance_loss_mlp": 0.01253302, + "epoch": 0.45326920186382086, + "flos": 66273620578560.0, + "grad_norm": 1.09487044177016, + "language_loss": 0.62420493, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.70018601, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02630615, + "step": 7539, + "time_per_iteration": 3.1658005714416504 + }, + { + "auxiliary_loss_clip": 0.06450586, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.06287818, + "balance_loss_mlp": 0.01256404, + "epoch": 0.4533293251164888, + "flos": 14689524879360.0, + "grad_norm": 1.7102983978579578, + "language_loss": 0.65674543, + "learning_rate": 2.396743698142872e-06, + "loss": 0.73395288, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13757324, + "step": 7540, + "time_per_iteration": 2.5642666816711426 + }, + { + "auxiliary_loss_clip": 0.06454974, + "auxiliary_loss_mlp": 0.01269021, + "balance_loss_clip": 0.06285828, + "balance_loss_mlp": 0.01254179, + "epoch": 0.4533894483691568, + "flos": 22607749463040.0, + "grad_norm": 2.019177110810713, + "language_loss": 0.84982491, + "learning_rate": 2.396361968778424e-06, + "loss": 0.92706484, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.1484375, + "step": 7541, + "time_per_iteration": 2.515012741088867 + }, + { + "auxiliary_loss_clip": 0.06444205, + "auxiliary_loss_mlp": 0.01270638, + "balance_loss_clip": 0.06281162, + "balance_loss_mlp": 0.01257853, + "epoch": 0.45344957162182475, + "flos": 34760301073920.0, + "grad_norm": 1.6772641382422697, + "language_loss": 0.77260393, + "learning_rate": 2.395980224383889e-06, + "loss": 0.84975231, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12780762, + "step": 7542, + "time_per_iteration": 2.6276772022247314 + }, + { + "auxiliary_loss_clip": 0.06447195, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06281827, + "balance_loss_mlp": 0.01252398, + "epoch": 0.4535096948744927, + "flos": 23556983241600.0, + "grad_norm": 1.679511772595701, + "language_loss": 0.80522043, + "learning_rate": 2.395598464973746e-06, + "loss": 0.88235873, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14233398, + "step": 7543, + "time_per_iteration": 2.5102038383483887 + }, + { + "auxiliary_loss_clip": 0.06448692, + "auxiliary_loss_mlp": 0.01269791, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01256339, + "epoch": 0.4535698181271607, + "flos": 25564756849920.0, + "grad_norm": 1.5595363191014409, + "language_loss": 0.76234162, + "learning_rate": 2.395216690562469e-06, + "loss": 0.83952641, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13446045, + "step": 7544, + "time_per_iteration": 2.613546371459961 + }, + { + "auxiliary_loss_clip": 0.06450664, + "auxiliary_loss_mlp": 0.0127145, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01257747, + "epoch": 0.45362994137982865, + "flos": 24871171478400.0, + "grad_norm": 1.656067150864753, + "language_loss": 0.75691646, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.83413762, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.137146, + "step": 7545, + "time_per_iteration": 2.5587077140808105 + }, + { + "auxiliary_loss_clip": 0.06444206, + "auxiliary_loss_mlp": 0.01276554, + "balance_loss_clip": 0.06279359, + "balance_loss_mlp": 0.01263161, + "epoch": 0.4536900646324966, + "flos": 30814088711040.0, + "grad_norm": 1.7013764448707542, + "language_loss": 0.72677243, + "learning_rate": 2.394453096794423e-06, + "loss": 0.80397999, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13391113, + "step": 7546, + "time_per_iteration": 2.582507371902466 + }, + { + "auxiliary_loss_clip": 0.06454303, + "auxiliary_loss_mlp": 0.01276587, + "balance_loss_clip": 0.06282242, + "balance_loss_mlp": 0.01261531, + "epoch": 0.4537501878851646, + "flos": 23411060156160.0, + "grad_norm": 1.4140833040204603, + "language_loss": 0.76407051, + "learning_rate": 2.394071277466609e-06, + "loss": 0.8413794, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.1505127, + "step": 7547, + "time_per_iteration": 2.5376148223876953 + }, + { + "auxiliary_loss_clip": 0.06452849, + "auxiliary_loss_mlp": 0.0127245, + "balance_loss_clip": 0.06284454, + "balance_loss_mlp": 0.01258086, + "epoch": 0.45381031113783254, + "flos": 18154978041600.0, + "grad_norm": 1.9572251150113926, + "language_loss": 0.70011902, + "learning_rate": 2.393689443195573e-06, + "loss": 0.777372, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.14367676, + "step": 7548, + "time_per_iteration": 2.519615650177002 + }, + { + "auxiliary_loss_clip": 0.0644725, + "auxiliary_loss_mlp": 0.01271972, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01258638, + "epoch": 0.4538704343905005, + "flos": 25343503344000.0, + "grad_norm": 2.0312160927741933, + "language_loss": 0.72993481, + "learning_rate": 2.393307593995794e-06, + "loss": 0.80712706, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.13342285, + "step": 7549, + "time_per_iteration": 2.57501482963562 + }, + { + "auxiliary_loss_clip": 0.06446082, + "auxiliary_loss_mlp": 0.01269972, + "balance_loss_clip": 0.06283575, + "balance_loss_mlp": 0.01257312, + "epoch": 0.4539305576431685, + "flos": 28739118528000.0, + "grad_norm": 1.441987244253853, + "language_loss": 0.65387678, + "learning_rate": 2.392925729881751e-06, + "loss": 0.73103732, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.12658691, + "step": 7550, + "time_per_iteration": 2.5835819244384766 + }, + { + "auxiliary_loss_clip": 0.06445216, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01258162, + "epoch": 0.45399068089583644, + "flos": 22499030390400.0, + "grad_norm": 1.5764003430967004, + "language_loss": 0.6906575, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.76782334, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.13189697, + "step": 7551, + "time_per_iteration": 2.562033176422119 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01272903, + "balance_loss_clip": 0.06276844, + "balance_loss_mlp": 0.01259504, + "epoch": 0.45405080414850446, + "flos": 12897889678080.0, + "grad_norm": 1.6874134559177159, + "language_loss": 0.79426885, + "learning_rate": 2.392161956968798e-06, + "loss": 0.87142253, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13409424, + "step": 7552, + "time_per_iteration": 2.4449541568756104 + }, + { + "auxiliary_loss_clip": 0.063404, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06265783, + "balance_loss_mlp": 0.01260128, + "epoch": 0.4541109274011724, + "flos": 59783558912640.0, + "grad_norm": 0.8094629177090237, + "language_loss": 0.57832247, + "learning_rate": 2.39178004819885e-06, + "loss": 0.65435266, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.02496338, + "step": 7553, + "time_per_iteration": 3.089684247970581 + }, + { + "auxiliary_loss_clip": 0.06443945, + "auxiliary_loss_mlp": 0.01272453, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01258946, + "epoch": 0.4541710506538404, + "flos": 28519248614400.0, + "grad_norm": 1.8062911390055711, + "language_loss": 0.76727033, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.84443438, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.13494873, + "step": 7554, + "time_per_iteration": 2.541727066040039 + }, + { + "auxiliary_loss_clip": 0.06449907, + "auxiliary_loss_mlp": 0.0126986, + "balance_loss_clip": 0.06284112, + "balance_loss_mlp": 0.0125559, + "epoch": 0.45423117390650836, + "flos": 17681304510720.0, + "grad_norm": 3.221825223389834, + "language_loss": 0.76701951, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.84421712, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1427002, + "step": 7555, + "time_per_iteration": 2.5190746784210205 + }, + { + "auxiliary_loss_clip": 0.06447887, + "auxiliary_loss_mlp": 0.01270234, + "balance_loss_clip": 0.06284074, + "balance_loss_mlp": 0.01256292, + "epoch": 0.4542912971591763, + "flos": 28079760349440.0, + "grad_norm": 1.2938327471401587, + "language_loss": 0.7293222, + "learning_rate": 2.390634232808903e-06, + "loss": 0.80650342, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.13946533, + "step": 7556, + "time_per_iteration": 2.559330940246582 + }, + { + "auxiliary_loss_clip": 0.06452744, + "auxiliary_loss_mlp": 0.0127062, + "balance_loss_clip": 0.06282438, + "balance_loss_mlp": 0.01256351, + "epoch": 0.4543514204118443, + "flos": 22677922857600.0, + "grad_norm": 1.9930550713200077, + "language_loss": 0.63614035, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.71337396, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7557, + "time_per_iteration": 2.555694580078125 + }, + { + "auxiliary_loss_clip": 0.06341553, + "auxiliary_loss_mlp": 0.01256007, + "balance_loss_clip": 0.06267436, + "balance_loss_mlp": 0.01253351, + "epoch": 0.45441154366451225, + "flos": 58236027454080.0, + "grad_norm": 0.6640379644801875, + "language_loss": 0.57562745, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.65160298, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02658081, + "step": 7558, + "time_per_iteration": 5.871712684631348 + }, + { + "auxiliary_loss_clip": 0.06449831, + "auxiliary_loss_mlp": 0.01270129, + "balance_loss_clip": 0.06282432, + "balance_loss_mlp": 0.01255216, + "epoch": 0.4544716669171802, + "flos": 16769987504640.0, + "grad_norm": 2.2880587940678927, + "language_loss": 0.56438738, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.64158702, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.14904785, + "step": 7559, + "time_per_iteration": 2.4660634994506836 + }, + { + "auxiliary_loss_clip": 0.06446083, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06282272, + "balance_loss_mlp": 0.01257728, + "epoch": 0.4545317901698482, + "flos": 15930814464000.0, + "grad_norm": 1.794091833084443, + "language_loss": 0.72316611, + "learning_rate": 2.389106271642792e-06, + "loss": 0.80034077, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.13671875, + "step": 7560, + "time_per_iteration": 2.497083902359009 + }, + { + "auxiliary_loss_clip": 0.06455533, + "auxiliary_loss_mlp": 0.01271449, + "balance_loss_clip": 0.0628465, + "balance_loss_mlp": 0.01257096, + "epoch": 0.45459191342251615, + "flos": 17645567944320.0, + "grad_norm": 2.9678955818231167, + "language_loss": 0.69120479, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.76847458, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.14355469, + "step": 7561, + "time_per_iteration": 2.463433027267456 + }, + { + "auxiliary_loss_clip": 0.06447616, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06286462, + "balance_loss_mlp": 0.01256161, + "epoch": 0.4546520366751841, + "flos": 16181557407360.0, + "grad_norm": 2.3534128933362277, + "language_loss": 0.85417646, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.93134332, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12908936, + "step": 7562, + "time_per_iteration": 2.5475013256073 + }, + { + "auxiliary_loss_clip": 0.06445649, + "auxiliary_loss_mlp": 0.01271177, + "balance_loss_clip": 0.06284063, + "balance_loss_mlp": 0.01257504, + "epoch": 0.4547121599278521, + "flos": 19756861672320.0, + "grad_norm": 1.7772924752060992, + "language_loss": 0.89642298, + "learning_rate": 2.38796014579055e-06, + "loss": 0.97359127, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13684082, + "step": 7563, + "time_per_iteration": 2.489121675491333 + }, + { + "auxiliary_loss_clip": 0.06453149, + "auxiliary_loss_mlp": 0.01274815, + "balance_loss_clip": 0.06286659, + "balance_loss_mlp": 0.01260397, + "epoch": 0.45477228318052004, + "flos": 19943510641920.0, + "grad_norm": 1.9263110789996643, + "language_loss": 0.71668887, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.79396844, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14428711, + "step": 7564, + "time_per_iteration": 2.4964044094085693 + }, + { + "auxiliary_loss_clip": 0.06450239, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01253912, + "epoch": 0.454832406433188, + "flos": 21294735183360.0, + "grad_norm": 2.0561067408009994, + "language_loss": 0.68633133, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7635116, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.13879395, + "step": 7565, + "time_per_iteration": 4.080512762069702 + }, + { + "auxiliary_loss_clip": 0.06446166, + "auxiliary_loss_mlp": 0.01274343, + "balance_loss_clip": 0.06282604, + "balance_loss_mlp": 0.01260247, + "epoch": 0.45489252968585603, + "flos": 24505630041600.0, + "grad_norm": 2.0436514367854413, + "language_loss": 0.802881, + "learning_rate": 2.386813887534922e-06, + "loss": 0.88008606, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14105225, + "step": 7566, + "time_per_iteration": 2.521056890487671 + }, + { + "auxiliary_loss_clip": 0.06452477, + "auxiliary_loss_mlp": 0.01273216, + "balance_loss_clip": 0.06286022, + "balance_loss_mlp": 0.01257558, + "epoch": 0.454952652938524, + "flos": 17098199147520.0, + "grad_norm": 2.208842453595512, + "language_loss": 0.74317467, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.82043159, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.15661621, + "step": 7567, + "time_per_iteration": 2.515658140182495 + }, + { + "auxiliary_loss_clip": 0.06459296, + "auxiliary_loss_mlp": 0.01271605, + "balance_loss_clip": 0.06291091, + "balance_loss_mlp": 0.0125801, + "epoch": 0.45501277619119196, + "flos": 27636792140160.0, + "grad_norm": 1.5215577708435108, + "language_loss": 0.80959934, + "learning_rate": 2.386049642000249e-06, + "loss": 0.88690829, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.13604736, + "step": 7568, + "time_per_iteration": 2.558258533477783 + }, + { + "auxiliary_loss_clip": 0.06466229, + "auxiliary_loss_mlp": 0.01276365, + "balance_loss_clip": 0.06294216, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4550728994438599, + "flos": 19980840435840.0, + "grad_norm": 1.8148678559144198, + "language_loss": 0.80280846, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.88023436, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.16186523, + "step": 7569, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06458277, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.06287743, + "balance_loss_mlp": 0.01254176, + "epoch": 0.4551330226965279, + "flos": 26073915384960.0, + "grad_norm": 1.3474740501928035, + "language_loss": 0.75202894, + "learning_rate": 2.385285337909412e-06, + "loss": 0.82929879, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14538574, + "step": 7570, + "time_per_iteration": 2.543170690536499 + }, + { + "auxiliary_loss_clip": 0.06452256, + "auxiliary_loss_mlp": 0.01273702, + "balance_loss_clip": 0.06289603, + "balance_loss_mlp": 0.01259826, + "epoch": 0.45519314594919585, + "flos": 32789396062080.0, + "grad_norm": 1.7878922954829848, + "language_loss": 0.74832451, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.82558417, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13879395, + "step": 7571, + "time_per_iteration": 4.052931308746338 + }, + { + "auxiliary_loss_clip": 0.06451707, + "auxiliary_loss_mlp": 0.01275937, + "balance_loss_clip": 0.06292738, + "balance_loss_mlp": 0.01261954, + "epoch": 0.4552532692018638, + "flos": 19178829480960.0, + "grad_norm": 1.5879241198756615, + "language_loss": 0.81163442, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.88891089, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13983154, + "step": 7572, + "time_per_iteration": 2.511032819747925 + }, + { + "auxiliary_loss_clip": 0.06461887, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06292465, + "balance_loss_mlp": 0.01254306, + "epoch": 0.4553133924545318, + "flos": 26033650698240.0, + "grad_norm": 2.340526601051543, + "language_loss": 0.72866237, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.80597222, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.14788818, + "step": 7573, + "time_per_iteration": 2.5469906330108643 + }, + { + "auxiliary_loss_clip": 0.06470129, + "auxiliary_loss_mlp": 0.0127089, + "balance_loss_clip": 0.06300491, + "balance_loss_mlp": 0.01255094, + "epoch": 0.45537351570719975, + "flos": 30668920312320.0, + "grad_norm": 1.9189620807456311, + "language_loss": 0.74504352, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.82245368, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.15783691, + "step": 7574, + "time_per_iteration": 2.6484622955322266 + }, + { + "auxiliary_loss_clip": 0.06463373, + "auxiliary_loss_mlp": 0.01271034, + "balance_loss_clip": 0.06294367, + "balance_loss_mlp": 0.0125661, + "epoch": 0.4554336389598677, + "flos": 24360377788800.0, + "grad_norm": 1.669597443611077, + "language_loss": 0.71544576, + "learning_rate": 2.383374322259915e-06, + "loss": 0.79278982, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14428711, + "step": 7575, + "time_per_iteration": 2.544975519180298 + }, + { + "auxiliary_loss_clip": 0.06456485, + "auxiliary_loss_mlp": 0.01268004, + "balance_loss_clip": 0.06290726, + "balance_loss_mlp": 0.01253794, + "epoch": 0.4554937622125357, + "flos": 20564113507200.0, + "grad_norm": 1.7578928676474412, + "language_loss": 0.7370066, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.81425148, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14202881, + "step": 7576, + "time_per_iteration": 2.534135580062866 + }, + { + "auxiliary_loss_clip": 0.06453636, + "auxiliary_loss_mlp": 0.0127588, + "balance_loss_clip": 0.06290971, + "balance_loss_mlp": 0.01261551, + "epoch": 0.45555388546520365, + "flos": 22827451668480.0, + "grad_norm": 2.007695048360481, + "language_loss": 0.66580224, + "learning_rate": 2.382609814135511e-06, + "loss": 0.74309736, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14312744, + "step": 7577, + "time_per_iteration": 2.5095431804656982 + }, + { + "auxiliary_loss_clip": 0.06452672, + "auxiliary_loss_mlp": 0.01272369, + "balance_loss_clip": 0.0628684, + "balance_loss_mlp": 0.01256538, + "epoch": 0.4556140087178716, + "flos": 21732462512640.0, + "grad_norm": 1.904316861437945, + "language_loss": 0.74386835, + "learning_rate": 2.382227538303157e-06, + "loss": 0.82111871, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.15820312, + "step": 7578, + "time_per_iteration": 2.5497546195983887 + }, + { + "auxiliary_loss_clip": 0.06453466, + "auxiliary_loss_mlp": 0.01270181, + "balance_loss_clip": 0.06290053, + "balance_loss_mlp": 0.01256645, + "epoch": 0.45567413197053963, + "flos": 26001645638400.0, + "grad_norm": 1.7724513927111563, + "language_loss": 0.70436674, + "learning_rate": 2.381845247976697e-06, + "loss": 0.78160322, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13531494, + "step": 7579, + "time_per_iteration": 2.5318000316619873 + }, + { + "auxiliary_loss_clip": 0.06449443, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06286655, + "balance_loss_mlp": 0.01257664, + "epoch": 0.4557342552232076, + "flos": 21543046358400.0, + "grad_norm": 1.8462396851301097, + "language_loss": 0.78760922, + "learning_rate": 2.381462943170627e-06, + "loss": 0.86480927, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12902832, + "step": 7580, + "time_per_iteration": 2.5358526706695557 + }, + { + "auxiliary_loss_clip": 0.06450854, + "auxiliary_loss_mlp": 0.0127087, + "balance_loss_clip": 0.06288584, + "balance_loss_mlp": 0.01257822, + "epoch": 0.45579437847587556, + "flos": 40010932673280.0, + "grad_norm": 1.6599136037597217, + "language_loss": 0.68708634, + "learning_rate": 2.381080623899444e-06, + "loss": 0.76430357, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13049316, + "step": 7581, + "time_per_iteration": 2.667543888092041 + }, + { + "auxiliary_loss_clip": 0.06448796, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_clip": 0.06289542, + "balance_loss_mlp": 0.01258678, + "epoch": 0.4558545017285435, + "flos": 31146409203840.0, + "grad_norm": 1.6471906775179725, + "language_loss": 0.7358638, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.81307691, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1383667, + "step": 7582, + "time_per_iteration": 2.6570708751678467 + }, + { + "auxiliary_loss_clip": 0.06455518, + "auxiliary_loss_mlp": 0.01272969, + "balance_loss_clip": 0.06286626, + "balance_loss_mlp": 0.01257818, + "epoch": 0.4559146249812115, + "flos": 21732210950400.0, + "grad_norm": 1.8620959272942483, + "language_loss": 0.73187852, + "learning_rate": 2.380315942019729e-06, + "loss": 0.80916339, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.15148926, + "step": 7583, + "time_per_iteration": 2.510700225830078 + }, + { + "auxiliary_loss_clip": 0.06455322, + "auxiliary_loss_mlp": 0.01272152, + "balance_loss_clip": 0.06287013, + "balance_loss_mlp": 0.01256202, + "epoch": 0.45597474823387946, + "flos": 23812841283840.0, + "grad_norm": 1.81949303768272, + "language_loss": 0.72839421, + "learning_rate": 2.379933579440195e-06, + "loss": 0.80566895, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.1595459, + "step": 7584, + "time_per_iteration": 2.5747973918914795 + }, + { + "auxiliary_loss_clip": 0.06447833, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.0628446, + "balance_loss_mlp": 0.01255357, + "epoch": 0.4560348714865474, + "flos": 31913857549440.0, + "grad_norm": 1.7864940938501939, + "language_loss": 0.67957801, + "learning_rate": 2.379551202453541e-06, + "loss": 0.75673771, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12792969, + "step": 7585, + "time_per_iteration": 2.6153225898742676 + }, + { + "auxiliary_loss_clip": 0.0645072, + "auxiliary_loss_mlp": 0.01268647, + "balance_loss_clip": 0.06284043, + "balance_loss_mlp": 0.01254449, + "epoch": 0.4560949947392154, + "flos": 22054427026560.0, + "grad_norm": 1.7083540410775564, + "language_loss": 0.76353097, + "learning_rate": 2.379168811074267e-06, + "loss": 0.84072465, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.14190674, + "step": 7586, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.06448488, + "auxiliary_loss_mlp": 0.01267379, + "balance_loss_clip": 0.0628647, + "balance_loss_mlp": 0.01254182, + "epoch": 0.45615511799188335, + "flos": 24578738328960.0, + "grad_norm": 1.819670635232321, + "language_loss": 0.78360641, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.86076516, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13189697, + "step": 7587, + "time_per_iteration": 2.5558509826660156 + }, + { + "auxiliary_loss_clip": 0.06459979, + "auxiliary_loss_mlp": 0.01275995, + "balance_loss_clip": 0.06286488, + "balance_loss_mlp": 0.01260152, + "epoch": 0.4562152412445513, + "flos": 18336260350080.0, + "grad_norm": 1.7968748305561377, + "language_loss": 0.69667047, + "learning_rate": 2.378403985195863e-06, + "loss": 0.77403021, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.1583252, + "step": 7588, + "time_per_iteration": 2.5365071296691895 + }, + { + "auxiliary_loss_clip": 0.06447656, + "auxiliary_loss_mlp": 0.01274434, + "balance_loss_clip": 0.06286096, + "balance_loss_mlp": 0.01261422, + "epoch": 0.4562753644972193, + "flos": 13521595144320.0, + "grad_norm": 1.6774091429175193, + "language_loss": 0.79575098, + "learning_rate": 2.378021550725735e-06, + "loss": 0.87297189, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13006592, + "step": 7589, + "time_per_iteration": 2.484713315963745 + }, + { + "auxiliary_loss_clip": 0.06452583, + "auxiliary_loss_mlp": 0.01271771, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.0125774, + "epoch": 0.45633548774988725, + "flos": 29646871735680.0, + "grad_norm": 2.003946782113331, + "language_loss": 0.62696528, + "learning_rate": 2.377639101920992e-06, + "loss": 0.70420885, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14044189, + "step": 7590, + "time_per_iteration": 2.609936475753784 + }, + { + "auxiliary_loss_clip": 0.06445528, + "auxiliary_loss_mlp": 0.01270847, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01257496, + "epoch": 0.4563956110025552, + "flos": 22239398914560.0, + "grad_norm": 1.8300596662255737, + "language_loss": 0.73085624, + "learning_rate": 2.377256638796135e-06, + "loss": 0.80802, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13330078, + "step": 7591, + "time_per_iteration": 2.47824764251709 + }, + { + "auxiliary_loss_clip": 0.06452768, + "auxiliary_loss_mlp": 0.01273962, + "balance_loss_clip": 0.0628728, + "balance_loss_mlp": 0.01260205, + "epoch": 0.45645573425522323, + "flos": 17097696023040.0, + "grad_norm": 1.9979722051509847, + "language_loss": 0.77518493, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.85245228, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13751221, + "step": 7592, + "time_per_iteration": 2.5239169597625732 + }, + { + "auxiliary_loss_clip": 0.06449406, + "auxiliary_loss_mlp": 0.01273175, + "balance_loss_clip": 0.06284081, + "balance_loss_mlp": 0.01259954, + "epoch": 0.4565158575078912, + "flos": 20337367559040.0, + "grad_norm": 2.421698823443505, + "language_loss": 0.6941641, + "learning_rate": 2.376491669644098e-06, + "loss": 0.77138984, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13232422, + "step": 7593, + "time_per_iteration": 2.5688788890838623 + }, + { + "auxiliary_loss_clip": 0.06437326, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06278698, + "balance_loss_mlp": 0.01256034, + "epoch": 0.45657598076055916, + "flos": 23989008493440.0, + "grad_norm": 2.02887277896486, + "language_loss": 0.8417384, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.91879439, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12237549, + "step": 7594, + "time_per_iteration": 2.5792298316955566 + }, + { + "auxiliary_loss_clip": 0.06341574, + "auxiliary_loss_mlp": 0.01258819, + "balance_loss_clip": 0.06267718, + "balance_loss_mlp": 0.0125595, + "epoch": 0.45663610401322713, + "flos": 69382812908160.0, + "grad_norm": 0.7684087429591354, + "language_loss": 0.52710819, + "learning_rate": 2.375726643385654e-06, + "loss": 0.60311204, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.02864075, + "step": 7595, + "time_per_iteration": 3.150902509689331 + }, + { + "auxiliary_loss_clip": 0.06451569, + "auxiliary_loss_mlp": 0.01268714, + "balance_loss_clip": 0.06282795, + "balance_loss_mlp": 0.0125491, + "epoch": 0.4566962272658951, + "flos": 15152884358400.0, + "grad_norm": 2.304862186673624, + "language_loss": 0.8729161, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.95011896, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.13824463, + "step": 7596, + "time_per_iteration": 2.490346908569336 + }, + { + "auxiliary_loss_clip": 0.0644666, + "auxiliary_loss_mlp": 0.01270115, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01257324, + "epoch": 0.45675635051856306, + "flos": 18703395014400.0, + "grad_norm": 1.5857620712679525, + "language_loss": 0.77719533, + "learning_rate": 2.374961560136843e-06, + "loss": 0.85436308, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12792969, + "step": 7597, + "time_per_iteration": 2.5043859481811523 + }, + { + "auxiliary_loss_clip": 0.0644691, + "auxiliary_loss_mlp": 0.01271101, + "balance_loss_clip": 0.06280024, + "balance_loss_mlp": 0.01256587, + "epoch": 0.456816473771231, + "flos": 19104211820160.0, + "grad_norm": 1.619707981694153, + "language_loss": 0.78513646, + "learning_rate": 2.374578997177314e-06, + "loss": 0.86231661, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.14501953, + "step": 7598, + "time_per_iteration": 3.9724912643432617 + }, + { + "auxiliary_loss_clip": 0.06447135, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06284773, + "balance_loss_mlp": 0.01255508, + "epoch": 0.456876597023899, + "flos": 28957730630400.0, + "grad_norm": 2.2287540067942957, + "language_loss": 0.72171777, + "learning_rate": 2.374196420013712e-06, + "loss": 0.79887861, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13458252, + "step": 7599, + "time_per_iteration": 2.594240188598633 + }, + { + "auxiliary_loss_clip": 0.06445186, + "auxiliary_loss_mlp": 0.0126948, + "balance_loss_clip": 0.06281814, + "balance_loss_mlp": 0.01256021, + "epoch": 0.45693672027656695, + "flos": 23295297340800.0, + "grad_norm": 1.7934880288039583, + "language_loss": 0.70205128, + "learning_rate": 2.373813828660544e-06, + "loss": 0.77919793, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13439941, + "step": 7600, + "time_per_iteration": 2.5063295364379883 + }, + { + "auxiliary_loss_clip": 0.06449603, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06284294, + "balance_loss_mlp": 0.01256571, + "epoch": 0.4569968435292349, + "flos": 20564448923520.0, + "grad_norm": 2.031833923402261, + "language_loss": 0.78985888, + "learning_rate": 2.373431223132319e-06, + "loss": 0.86705881, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13824463, + "step": 7601, + "time_per_iteration": 2.559072494506836 + }, + { + "auxiliary_loss_clip": 0.06449661, + "auxiliary_loss_mlp": 0.0127022, + "balance_loss_clip": 0.06283583, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4570569667819029, + "flos": 41292403090560.0, + "grad_norm": 1.9704151582810323, + "language_loss": 0.71676505, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.79396379, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.13134766, + "step": 7602, + "time_per_iteration": 2.6897006034851074 + }, + { + "auxiliary_loss_clip": 0.06446967, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280911, + "balance_loss_mlp": 0.01255843, + "epoch": 0.45711709003457085, + "flos": 26038807724160.0, + "grad_norm": 1.8547506252317059, + "language_loss": 0.73479527, + "learning_rate": 2.372665969608729e-06, + "loss": 0.81197369, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15026855, + "step": 7603, + "time_per_iteration": 2.5908169746398926 + }, + { + "auxiliary_loss_clip": 0.06447335, + "auxiliary_loss_mlp": 0.01269467, + "balance_loss_clip": 0.0628283, + "balance_loss_mlp": 0.01254077, + "epoch": 0.4571772132872388, + "flos": 22163649223680.0, + "grad_norm": 1.7365999934209901, + "language_loss": 0.83048642, + "learning_rate": 2.372283321642383e-06, + "loss": 0.90765446, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.15374756, + "step": 7604, + "time_per_iteration": 2.462653636932373 + }, + { + "auxiliary_loss_clip": 0.0645724, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06285316, + "balance_loss_mlp": 0.01256456, + "epoch": 0.45723733653990684, + "flos": 23885739936000.0, + "grad_norm": 1.8384947858044167, + "language_loss": 0.86237913, + "learning_rate": 2.371900659559016e-06, + "loss": 0.93966818, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.15209961, + "step": 7605, + "time_per_iteration": 3.9711341857910156 + }, + { + "auxiliary_loss_clip": 0.0645397, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06283225, + "balance_loss_mlp": 0.01253686, + "epoch": 0.4572974597925748, + "flos": 16877197203840.0, + "grad_norm": 1.5621441730902494, + "language_loss": 0.73368603, + "learning_rate": 2.371517983373138e-06, + "loss": 0.81090587, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.14343262, + "step": 7606, + "time_per_iteration": 2.53171968460083 + }, + { + "auxiliary_loss_clip": 0.06450876, + "auxiliary_loss_mlp": 0.01272472, + "balance_loss_clip": 0.06281146, + "balance_loss_mlp": 0.01257118, + "epoch": 0.45735758304524277, + "flos": 13776530791680.0, + "grad_norm": 2.9980100906386324, + "language_loss": 0.80445778, + "learning_rate": 2.371135293099262e-06, + "loss": 0.88169128, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.15356445, + "step": 7607, + "time_per_iteration": 2.4730136394500732 + }, + { + "auxiliary_loss_clip": 0.06449468, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06282607, + "balance_loss_mlp": 0.01252216, + "epoch": 0.45741770629791073, + "flos": 21106283351040.0, + "grad_norm": 1.9890456967063905, + "language_loss": 0.80849135, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.88565969, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.15130615, + "step": 7608, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06445852, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01254576, + "epoch": 0.4574778295505787, + "flos": 23119675182720.0, + "grad_norm": 1.6776975313937859, + "language_loss": 0.68550682, + "learning_rate": 2.370369870345559e-06, + "loss": 0.76264954, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1385498, + "step": 7609, + "time_per_iteration": 2.5249829292297363 + }, + { + "auxiliary_loss_clip": 0.06446596, + "auxiliary_loss_mlp": 0.01267793, + "balance_loss_clip": 0.06279876, + "balance_loss_mlp": 0.01253917, + "epoch": 0.45753795280324666, + "flos": 24359832737280.0, + "grad_norm": 4.839518120228961, + "language_loss": 0.81053591, + "learning_rate": 2.369987137894757e-06, + "loss": 0.88767982, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.13879395, + "step": 7610, + "time_per_iteration": 3.9629292488098145 + }, + { + "auxiliary_loss_clip": 0.06456244, + "auxiliary_loss_mlp": 0.01272187, + "balance_loss_clip": 0.06284218, + "balance_loss_mlp": 0.01258359, + "epoch": 0.4575980760559146, + "flos": 16659297861120.0, + "grad_norm": 2.22162560638367, + "language_loss": 0.82538879, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.90267307, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.13848877, + "step": 7611, + "time_per_iteration": 2.483184337615967 + }, + { + "auxiliary_loss_clip": 0.06450104, + "auxiliary_loss_mlp": 0.01268987, + "balance_loss_clip": 0.06284404, + "balance_loss_mlp": 0.01254753, + "epoch": 0.4576581993085826, + "flos": 35919006860160.0, + "grad_norm": 1.7486456420241998, + "language_loss": 0.73840886, + "learning_rate": 2.369221630917819e-06, + "loss": 0.81559974, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.14239502, + "step": 7612, + "time_per_iteration": 2.629122734069824 + }, + { + "auxiliary_loss_clip": 0.06446031, + "auxiliary_loss_mlp": 0.0126785, + "balance_loss_clip": 0.06281702, + "balance_loss_mlp": 0.01253711, + "epoch": 0.45771832256125056, + "flos": 20085995710080.0, + "grad_norm": 1.498537690587119, + "language_loss": 0.85104787, + "learning_rate": 2.368838856420711e-06, + "loss": 0.92818671, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.14160156, + "step": 7613, + "time_per_iteration": 2.4995853900909424 + }, + { + "auxiliary_loss_clip": 0.06450839, + "auxiliary_loss_mlp": 0.01271405, + "balance_loss_clip": 0.062853, + "balance_loss_mlp": 0.01257458, + "epoch": 0.4577784458139185, + "flos": 10749056520960.0, + "grad_norm": 2.317250545042104, + "language_loss": 0.75818133, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.8354038, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13946533, + "step": 7614, + "time_per_iteration": 2.5512688159942627 + }, + { + "auxiliary_loss_clip": 0.06447698, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06284869, + "balance_loss_mlp": 0.01254513, + "epoch": 0.4578385690665865, + "flos": 21913577112960.0, + "grad_norm": 1.7278714332693421, + "language_loss": 0.7495364, + "learning_rate": 2.368073265481791e-06, + "loss": 0.82670438, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.14587402, + "step": 7615, + "time_per_iteration": 2.4959964752197266 + }, + { + "auxiliary_loss_clip": 0.06341572, + "auxiliary_loss_mlp": 0.01260056, + "balance_loss_clip": 0.06266811, + "balance_loss_mlp": 0.01256924, + "epoch": 0.45789869231925445, + "flos": 64774559036160.0, + "grad_norm": 0.7564263714074747, + "language_loss": 0.57682395, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.65284026, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.03129578, + "step": 7616, + "time_per_iteration": 3.1225674152374268 + }, + { + "auxiliary_loss_clip": 0.06451499, + "auxiliary_loss_mlp": 0.01269699, + "balance_loss_clip": 0.06287209, + "balance_loss_mlp": 0.01255299, + "epoch": 0.4579588155719224, + "flos": 16149594274560.0, + "grad_norm": 2.222129623674548, + "language_loss": 0.71319497, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.790407, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.144104, + "step": 7617, + "time_per_iteration": 2.535795211791992 + }, + { + "auxiliary_loss_clip": 0.06453606, + "auxiliary_loss_mlp": 0.01272033, + "balance_loss_clip": 0.06288601, + "balance_loss_mlp": 0.0125749, + "epoch": 0.45801893882459044, + "flos": 21401609466240.0, + "grad_norm": 1.7708953304075432, + "language_loss": 0.7611897, + "learning_rate": 2.36692477442939e-06, + "loss": 0.83844614, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.14538574, + "step": 7618, + "time_per_iteration": 2.486976146697998 + }, + { + "auxiliary_loss_clip": 0.06453368, + "auxiliary_loss_mlp": 0.01269962, + "balance_loss_clip": 0.06288654, + "balance_loss_mlp": 0.01256778, + "epoch": 0.4580790620772584, + "flos": 19542609982080.0, + "grad_norm": 1.989312042597275, + "language_loss": 0.76642346, + "learning_rate": 2.366541916231585e-06, + "loss": 0.84365678, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13195801, + "step": 7619, + "time_per_iteration": 2.5505213737487793 + }, + { + "auxiliary_loss_clip": 0.06448688, + "auxiliary_loss_mlp": 0.01269236, + "balance_loss_clip": 0.06287201, + "balance_loss_mlp": 0.01256242, + "epoch": 0.45813918532992637, + "flos": 16586608844160.0, + "grad_norm": 1.7634638926548802, + "language_loss": 0.72444797, + "learning_rate": 2.366159044134473e-06, + "loss": 0.80162722, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.13018799, + "step": 7620, + "time_per_iteration": 2.5020828247070312 + }, + { + "auxiliary_loss_clip": 0.06448015, + "auxiliary_loss_mlp": 0.0127207, + "balance_loss_clip": 0.06286486, + "balance_loss_mlp": 0.01259243, + "epoch": 0.45819930858259433, + "flos": 42240085568640.0, + "grad_norm": 2.4478513756868168, + "language_loss": 0.77894747, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8561483, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12835693, + "step": 7621, + "time_per_iteration": 2.7115588188171387 + }, + { + "auxiliary_loss_clip": 0.06339111, + "auxiliary_loss_mlp": 0.01257981, + "balance_loss_clip": 0.06264743, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4582594318352623, + "flos": 63733335073920.0, + "grad_norm": 0.7682856550602313, + "language_loss": 0.64809114, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.72406203, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.02659607, + "step": 7622, + "time_per_iteration": 3.13112473487854 + }, + { + "auxiliary_loss_clip": 0.06452725, + "auxiliary_loss_mlp": 0.01272617, + "balance_loss_clip": 0.06286744, + "balance_loss_mlp": 0.01258449, + "epoch": 0.45831955508793026, + "flos": 26877226078080.0, + "grad_norm": 1.7433537302254658, + "language_loss": 0.79958743, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.87684089, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1416626, + "step": 7623, + "time_per_iteration": 2.6407015323638916 + }, + { + "auxiliary_loss_clip": 0.0645254, + "auxiliary_loss_mlp": 0.0127269, + "balance_loss_clip": 0.06285348, + "balance_loss_mlp": 0.01258528, + "epoch": 0.45837967834059823, + "flos": 18739886267520.0, + "grad_norm": 2.305548200028626, + "language_loss": 0.71172595, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.78897822, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14160156, + "step": 7624, + "time_per_iteration": 2.4580042362213135 + }, + { + "auxiliary_loss_clip": 0.06451602, + "auxiliary_loss_mlp": 0.01273069, + "balance_loss_clip": 0.06285381, + "balance_loss_mlp": 0.012593, + "epoch": 0.4584398015932662, + "flos": 21184380956160.0, + "grad_norm": 1.776025787081333, + "language_loss": 0.73132861, + "learning_rate": 2.364244475667491e-06, + "loss": 0.80857527, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.13763428, + "step": 7625, + "time_per_iteration": 2.5352139472961426 + }, + { + "auxiliary_loss_clip": 0.06452388, + "auxiliary_loss_mlp": 0.01273572, + "balance_loss_clip": 0.06287026, + "balance_loss_mlp": 0.01259857, + "epoch": 0.45849992484593416, + "flos": 19795826620800.0, + "grad_norm": 3.130746647878431, + "language_loss": 0.78340298, + "learning_rate": 2.363861520479451e-06, + "loss": 0.86066258, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.137146, + "step": 7626, + "time_per_iteration": 2.4839165210723877 + }, + { + "auxiliary_loss_clip": 0.06454711, + "auxiliary_loss_mlp": 0.01271249, + "balance_loss_clip": 0.06286182, + "balance_loss_mlp": 0.01257284, + "epoch": 0.4585600480986021, + "flos": 18229134504960.0, + "grad_norm": 1.6201293476115848, + "language_loss": 0.85071468, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.92797422, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1394043, + "step": 7627, + "time_per_iteration": 2.5822484493255615 + }, + { + "auxiliary_loss_clip": 0.06454201, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06285322, + "balance_loss_mlp": 0.01255634, + "epoch": 0.4586201713512701, + "flos": 29029748814720.0, + "grad_norm": 1.6524494424678404, + "language_loss": 0.69812655, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.77537024, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14544678, + "step": 7628, + "time_per_iteration": 2.5642716884613037 + }, + { + "auxiliary_loss_clip": 0.06450283, + "auxiliary_loss_mlp": 0.01272737, + "balance_loss_clip": 0.06287684, + "balance_loss_mlp": 0.01258492, + "epoch": 0.45868029460393805, + "flos": 23411395572480.0, + "grad_norm": 1.512396631295222, + "language_loss": 0.78590345, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.86313355, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.14245605, + "step": 7629, + "time_per_iteration": 2.5380680561065674 + }, + { + "auxiliary_loss_clip": 0.0645413, + "auxiliary_loss_mlp": 0.01273786, + "balance_loss_clip": 0.06283213, + "balance_loss_mlp": 0.01258372, + "epoch": 0.458740417856606, + "flos": 18227625131520.0, + "grad_norm": 2.58579854057945, + "language_loss": 0.7964831, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.87376225, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1541748, + "step": 7630, + "time_per_iteration": 2.4736902713775635 + }, + { + "auxiliary_loss_clip": 0.0645593, + "auxiliary_loss_mlp": 0.01273082, + "balance_loss_clip": 0.06288286, + "balance_loss_mlp": 0.01258378, + "epoch": 0.458800541109274, + "flos": 34577341683840.0, + "grad_norm": 2.0263904819558243, + "language_loss": 0.72204614, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.79933631, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14715576, + "step": 7631, + "time_per_iteration": 2.8143060207366943 + }, + { + "auxiliary_loss_clip": 0.06451838, + "auxiliary_loss_mlp": 0.01269985, + "balance_loss_clip": 0.06285281, + "balance_loss_mlp": 0.0125565, + "epoch": 0.458860664361942, + "flos": 17717837690880.0, + "grad_norm": 2.417001672331849, + "language_loss": 0.71850061, + "learning_rate": 2.361563500108531e-06, + "loss": 0.79571879, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14324951, + "step": 7632, + "time_per_iteration": 2.616152048110962 + }, + { + "auxiliary_loss_clip": 0.0645618, + "auxiliary_loss_mlp": 0.01272337, + "balance_loss_clip": 0.06285533, + "balance_loss_mlp": 0.01258055, + "epoch": 0.45892078761460997, + "flos": 18447746607360.0, + "grad_norm": 2.3994338935229784, + "language_loss": 0.69457287, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.7718581, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14294434, + "step": 7633, + "time_per_iteration": 2.544916868209839 + }, + { + "auxiliary_loss_clip": 0.06450637, + "auxiliary_loss_mlp": 0.01269265, + "balance_loss_clip": 0.06284192, + "balance_loss_mlp": 0.01255055, + "epoch": 0.45898091086727794, + "flos": 22679306449920.0, + "grad_norm": 1.6111707393144439, + "language_loss": 0.81188464, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.88908368, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14208984, + "step": 7634, + "time_per_iteration": 2.508498430252075 + }, + { + "auxiliary_loss_clip": 0.06458217, + "auxiliary_loss_mlp": 0.0127198, + "balance_loss_clip": 0.06285305, + "balance_loss_mlp": 0.01256995, + "epoch": 0.4590410341199459, + "flos": 21659396152320.0, + "grad_norm": 1.6788945577423258, + "language_loss": 0.8141619, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.89146382, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.15002441, + "step": 7635, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06450347, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06285377, + "balance_loss_mlp": 0.01258095, + "epoch": 0.45910115737261387, + "flos": 36543676648320.0, + "grad_norm": 1.5202825589824251, + "language_loss": 0.65088654, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.72811085, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.13995361, + "step": 7636, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.06449063, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06286588, + "balance_loss_mlp": 0.0125376, + "epoch": 0.45916128062528183, + "flos": 24425771500800.0, + "grad_norm": 1.3857173948582018, + "language_loss": 0.80552399, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.88268924, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13702393, + "step": 7637, + "time_per_iteration": 4.1112189292907715 + }, + { + "auxiliary_loss_clip": 0.06456389, + "auxiliary_loss_mlp": 0.0127208, + "balance_loss_clip": 0.06286228, + "balance_loss_mlp": 0.01257089, + "epoch": 0.4592214038779498, + "flos": 23228687744640.0, + "grad_norm": 2.823234077565048, + "language_loss": 0.75517625, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.83246088, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14990234, + "step": 7638, + "time_per_iteration": 3.910426616668701 + }, + { + "auxiliary_loss_clip": 0.06446041, + "auxiliary_loss_mlp": 0.01269213, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01254824, + "epoch": 0.45928152713061776, + "flos": 19178200575360.0, + "grad_norm": 1.717868731304971, + "language_loss": 0.74023581, + "learning_rate": 2.358881852733989e-06, + "loss": 0.81738836, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14373779, + "step": 7639, + "time_per_iteration": 2.566300630569458 + }, + { + "auxiliary_loss_clip": 0.06454983, + "auxiliary_loss_mlp": 0.01270543, + "balance_loss_clip": 0.06286465, + "balance_loss_mlp": 0.01255165, + "epoch": 0.4593416503832857, + "flos": 22420513514880.0, + "grad_norm": 1.8698154023651474, + "language_loss": 0.683029, + "learning_rate": 2.358498705700346e-06, + "loss": 0.76028425, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.15380859, + "step": 7640, + "time_per_iteration": 2.5371484756469727 + }, + { + "auxiliary_loss_clip": 0.06455723, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06285085, + "balance_loss_mlp": 0.01256454, + "epoch": 0.4594017736359537, + "flos": 18886228623360.0, + "grad_norm": 1.657871276405927, + "language_loss": 0.76190329, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.83916861, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14367676, + "step": 7641, + "time_per_iteration": 2.633190631866455 + }, + { + "auxiliary_loss_clip": 0.06450865, + "auxiliary_loss_mlp": 0.01271757, + "balance_loss_clip": 0.06281709, + "balance_loss_mlp": 0.01256749, + "epoch": 0.45946189688862166, + "flos": 20524268090880.0, + "grad_norm": 2.1109400166256753, + "language_loss": 0.75088501, + "learning_rate": 2.357732370864668e-06, + "loss": 0.82811123, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.15008545, + "step": 7642, + "time_per_iteration": 2.497342824935913 + }, + { + "auxiliary_loss_clip": 0.06325873, + "auxiliary_loss_mlp": 0.01255986, + "balance_loss_clip": 0.06252096, + "balance_loss_mlp": 0.01253583, + "epoch": 0.4595220201412896, + "flos": 61422436920960.0, + "grad_norm": 0.8082143270085457, + "language_loss": 0.58238232, + "learning_rate": 2.357349183091694e-06, + "loss": 0.65820098, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.02400208, + "step": 7643, + "time_per_iteration": 2.9001851081848145 + }, + { + "auxiliary_loss_clip": 0.06454818, + "auxiliary_loss_mlp": 0.01269178, + "balance_loss_clip": 0.06279951, + "balance_loss_mlp": 0.01254467, + "epoch": 0.4595821433939576, + "flos": 23337616452480.0, + "grad_norm": 1.460564072578963, + "language_loss": 0.93123877, + "learning_rate": 2.3569659817680016e-06, + "loss": 1.00847864, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.14709473, + "step": 7644, + "time_per_iteration": 3.956286668777466 + }, + { + "auxiliary_loss_clip": 0.06453376, + "auxiliary_loss_mlp": 0.01272616, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.01258591, + "epoch": 0.4596422666466256, + "flos": 14287492189440.0, + "grad_norm": 2.5856018073831954, + "language_loss": 0.82780254, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.90506244, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.14031982, + "step": 7645, + "time_per_iteration": 2.5230045318603516 + }, + { + "auxiliary_loss_clip": 0.0632263, + "auxiliary_loss_mlp": 0.0125685, + "balance_loss_clip": 0.06249407, + "balance_loss_mlp": 0.01254095, + "epoch": 0.4597023898992936, + "flos": 65747188103040.0, + "grad_norm": 0.7461836102968291, + "language_loss": 0.59904981, + "learning_rate": 2.356199538526593e-06, + "loss": 0.67484462, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.02758789, + "step": 7646, + "time_per_iteration": 3.0677428245544434 + }, + { + "auxiliary_loss_clip": 0.06451902, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06282644, + "balance_loss_mlp": 0.01257931, + "epoch": 0.45976251315196154, + "flos": 26914430090880.0, + "grad_norm": 1.5401961064627432, + "language_loss": 0.72954202, + "learning_rate": 2.355816296637939e-06, + "loss": 0.80678499, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.14465332, + "step": 7647, + "time_per_iteration": 2.5715911388397217 + }, + { + "auxiliary_loss_clip": 0.06455843, + "auxiliary_loss_mlp": 0.01270403, + "balance_loss_clip": 0.06283608, + "balance_loss_mlp": 0.0125586, + "epoch": 0.4598226364046295, + "flos": 26625854229120.0, + "grad_norm": 1.5262276937698116, + "language_loss": 0.66966379, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.74692625, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.14526367, + "step": 7648, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06453076, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.06283541, + "balance_loss_mlp": 0.01256562, + "epoch": 0.45988275965729747, + "flos": 24394395346560.0, + "grad_norm": 1.3937992948207578, + "language_loss": 0.78837889, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.86561614, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.14093018, + "step": 7649, + "time_per_iteration": 3.961230754852295 + }, + { + "auxiliary_loss_clip": 0.06449774, + "auxiliary_loss_mlp": 0.01273295, + "balance_loss_clip": 0.06282938, + "balance_loss_mlp": 0.01258221, + "epoch": 0.45994288290996543, + "flos": 24542834054400.0, + "grad_norm": 2.427132979105608, + "language_loss": 0.694453, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.77168369, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.15087891, + "step": 7650, + "time_per_iteration": 2.5870516300201416 + }, + { + "auxiliary_loss_clip": 0.06454967, + "auxiliary_loss_mlp": 0.01271386, + "balance_loss_clip": 0.06281558, + "balance_loss_mlp": 0.01255876, + "epoch": 0.4600030061626334, + "flos": 14835573745920.0, + "grad_norm": 2.508823744651641, + "language_loss": 0.84580773, + "learning_rate": 2.354283194302761e-06, + "loss": 0.92307127, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.15515137, + "step": 7651, + "time_per_iteration": 2.4682910442352295 + }, + { + "auxiliary_loss_clip": 0.06447899, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.06282218, + "balance_loss_mlp": 0.01255567, + "epoch": 0.46006312941530136, + "flos": 18119702672640.0, + "grad_norm": 2.0398588051370536, + "language_loss": 0.75204146, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.82921767, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.14160156, + "step": 7652, + "time_per_iteration": 2.533160448074341 + }, + { + "auxiliary_loss_clip": 0.06453463, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06283025, + "balance_loss_mlp": 0.01253803, + "epoch": 0.46012325266796933, + "flos": 21982157280000.0, + "grad_norm": 1.8219910575186118, + "language_loss": 0.76111704, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.83833146, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.14154053, + "step": 7653, + "time_per_iteration": 2.607556104660034 + }, + { + "auxiliary_loss_clip": 0.06466014, + "auxiliary_loss_mlp": 0.01279742, + "balance_loss_clip": 0.06286691, + "balance_loss_mlp": 0.01262618, + "epoch": 0.4601833759206373, + "flos": 15273468783360.0, + "grad_norm": 1.9930521100890286, + "language_loss": 0.66339052, + "learning_rate": 2.353133226438741e-06, + "loss": 0.74084806, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.17132568, + "step": 7654, + "time_per_iteration": 2.5845115184783936 + }, + { + "auxiliary_loss_clip": 0.06450775, + "auxiliary_loss_mlp": 0.01273684, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01260524, + "epoch": 0.46024349917330526, + "flos": 27096299377920.0, + "grad_norm": 1.834954182024095, + "language_loss": 0.79552221, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.87276679, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1315918, + "step": 7655, + "time_per_iteration": 2.5619075298309326 + }, + { + "auxiliary_loss_clip": 0.06446843, + "auxiliary_loss_mlp": 0.01271784, + "balance_loss_clip": 0.06282479, + "balance_loss_mlp": 0.0125795, + "epoch": 0.4603036224259732, + "flos": 24469935402240.0, + "grad_norm": 1.525008853184554, + "language_loss": 0.68020397, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.7573902, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.13824463, + "step": 7656, + "time_per_iteration": 2.534085988998413 + }, + { + "auxiliary_loss_clip": 0.06450829, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254249, + "epoch": 0.4603637456786412, + "flos": 28116545091840.0, + "grad_norm": 1.6883930229899933, + "language_loss": 0.81940675, + "learning_rate": 2.351983138057098e-06, + "loss": 0.89660037, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14300537, + "step": 7657, + "time_per_iteration": 2.6093909740448 + }, + { + "auxiliary_loss_clip": 0.06452166, + "auxiliary_loss_mlp": 0.01272452, + "balance_loss_clip": 0.06283732, + "balance_loss_mlp": 0.01257598, + "epoch": 0.4604238689313092, + "flos": 24355178835840.0, + "grad_norm": 1.9081069655960825, + "language_loss": 0.70684779, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.78409398, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1484375, + "step": 7658, + "time_per_iteration": 2.5257532596588135 + }, + { + "auxiliary_loss_clip": 0.06333129, + "auxiliary_loss_mlp": 0.01254207, + "balance_loss_clip": 0.06259783, + "balance_loss_mlp": 0.01251698, + "epoch": 0.4604839921839772, + "flos": 53622742337280.0, + "grad_norm": 1.3056028191134426, + "language_loss": 0.6180622, + "learning_rate": 2.351216345708928e-06, + "loss": 0.69393557, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02508545, + "step": 7659, + "time_per_iteration": 3.2051191329956055 + }, + { + "auxiliary_loss_clip": 0.06450778, + "auxiliary_loss_mlp": 0.01270415, + "balance_loss_clip": 0.06284198, + "balance_loss_mlp": 0.01254692, + "epoch": 0.46054411543664514, + "flos": 31256428014720.0, + "grad_norm": 1.6821089703035916, + "language_loss": 0.68614, + "learning_rate": 2.350832929550336e-06, + "loss": 0.76335192, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1572876, + "step": 7660, + "time_per_iteration": 2.5768120288848877 + }, + { + "auxiliary_loss_clip": 0.06455722, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06285393, + "balance_loss_mlp": 0.01254843, + "epoch": 0.4606042386893131, + "flos": 24098943450240.0, + "grad_norm": 1.8024702284570222, + "language_loss": 0.76982367, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.84707713, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14782715, + "step": 7661, + "time_per_iteration": 2.5556533336639404 + }, + { + "auxiliary_loss_clip": 0.06448123, + "auxiliary_loss_mlp": 0.01270523, + "balance_loss_clip": 0.06284644, + "balance_loss_mlp": 0.01257511, + "epoch": 0.46066436194198107, + "flos": 26585715323520.0, + "grad_norm": 1.64374674726695, + "language_loss": 0.75330603, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.8304925, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13000488, + "step": 7662, + "time_per_iteration": 2.5430636405944824 + }, + { + "auxiliary_loss_clip": 0.064645, + "auxiliary_loss_mlp": 0.01271435, + "balance_loss_clip": 0.06287506, + "balance_loss_mlp": 0.01255807, + "epoch": 0.46072448519464904, + "flos": 17779751458560.0, + "grad_norm": 2.8997354943734144, + "language_loss": 0.79542935, + "learning_rate": 2.349682601310998e-06, + "loss": 0.87278873, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.15625, + "step": 7663, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.06451327, + "auxiliary_loss_mlp": 0.01270399, + "balance_loss_clip": 0.0628781, + "balance_loss_mlp": 0.01256344, + "epoch": 0.460784608447317, + "flos": 15091557569280.0, + "grad_norm": 1.9500633364095115, + "language_loss": 0.73664737, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.81386459, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.14050293, + "step": 7664, + "time_per_iteration": 2.5058319568634033 + }, + { + "auxiliary_loss_clip": 0.06454196, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06286658, + "balance_loss_mlp": 0.01255403, + "epoch": 0.46084473169998497, + "flos": 18594214744320.0, + "grad_norm": 1.4541358898310397, + "language_loss": 0.72731769, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.80455625, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14257812, + "step": 7665, + "time_per_iteration": 2.5651309490203857 + }, + { + "auxiliary_loss_clip": 0.06452034, + "auxiliary_loss_mlp": 0.01269476, + "balance_loss_clip": 0.06283794, + "balance_loss_mlp": 0.01255016, + "epoch": 0.46090485495265293, + "flos": 19499955454080.0, + "grad_norm": 1.6858212343920378, + "language_loss": 0.78057897, + "learning_rate": 2.348532153731669e-06, + "loss": 0.85779405, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.14459229, + "step": 7666, + "time_per_iteration": 2.4884724617004395 + }, + { + "auxiliary_loss_clip": 0.06454702, + "auxiliary_loss_mlp": 0.01278259, + "balance_loss_clip": 0.06288874, + "balance_loss_mlp": 0.01262982, + "epoch": 0.4609649782053209, + "flos": 33373339966080.0, + "grad_norm": 1.3323556356345916, + "language_loss": 0.7438637, + "learning_rate": 2.348148644753088e-06, + "loss": 0.82119334, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.15270996, + "step": 7667, + "time_per_iteration": 2.6961426734924316 + }, + { + "auxiliary_loss_clip": 0.06450665, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06283414, + "balance_loss_mlp": 0.01253574, + "epoch": 0.46102510145798886, + "flos": 23775972687360.0, + "grad_norm": 1.463924526715157, + "language_loss": 0.76157856, + "learning_rate": 2.347765122572676e-06, + "loss": 0.83875835, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.1373291, + "step": 7668, + "time_per_iteration": 2.517401933670044 + }, + { + "auxiliary_loss_clip": 0.06446877, + "auxiliary_loss_mlp": 0.0126819, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01254982, + "epoch": 0.4610852247106568, + "flos": 23301544469760.0, + "grad_norm": 1.5533292001822034, + "language_loss": 0.78315312, + "learning_rate": 2.347381587204975e-06, + "loss": 0.86030376, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13208008, + "step": 7669, + "time_per_iteration": 2.58445405960083 + }, + { + "auxiliary_loss_clip": 0.06450041, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06282575, + "balance_loss_mlp": 0.01251286, + "epoch": 0.4611453479633248, + "flos": 25454528403840.0, + "grad_norm": 1.739851036429443, + "language_loss": 0.83272684, + "learning_rate": 2.34699803866453e-06, + "loss": 0.90987396, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.13391113, + "step": 7670, + "time_per_iteration": 2.5387001037597656 + }, + { + "auxiliary_loss_clip": 0.06451756, + "auxiliary_loss_mlp": 0.01270534, + "balance_loss_clip": 0.06288445, + "balance_loss_mlp": 0.01257129, + "epoch": 0.4612054712159928, + "flos": 21145541788800.0, + "grad_norm": 1.8274954721629995, + "language_loss": 0.63656652, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.7137894, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1340332, + "step": 7671, + "time_per_iteration": 2.5336413383483887 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251787, + "balance_loss_clip": 0.0626289, + "balance_loss_mlp": 0.01249119, + "epoch": 0.4612655944686608, + "flos": 69979754194560.0, + "grad_norm": 0.792480479203595, + "language_loss": 0.55791217, + "learning_rate": 2.346230902123583e-06, + "loss": 0.63378698, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.02670288, + "step": 7672, + "time_per_iteration": 3.2302184104919434 + }, + { + "auxiliary_loss_clip": 0.06453065, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06283592, + "balance_loss_mlp": 0.01253213, + "epoch": 0.46132571772132874, + "flos": 16842844229760.0, + "grad_norm": 2.026723370874256, + "language_loss": 0.71486014, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.79206014, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.13720703, + "step": 7673, + "time_per_iteration": 2.5307891368865967 + }, + { + "auxiliary_loss_clip": 0.06444372, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06280223, + "balance_loss_mlp": 0.01254014, + "epoch": 0.4613858409739967, + "flos": 35817666946560.0, + "grad_norm": 1.6118988477871892, + "language_loss": 0.70779812, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7849164, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13446045, + "step": 7674, + "time_per_iteration": 2.67787766456604 + }, + { + "auxiliary_loss_clip": 0.06445141, + "auxiliary_loss_mlp": 0.01269162, + "balance_loss_clip": 0.06278897, + "balance_loss_mlp": 0.01255554, + "epoch": 0.4614459642266647, + "flos": 35276251789440.0, + "grad_norm": 1.4817902433092767, + "language_loss": 0.65456873, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.73171175, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1362915, + "step": 7675, + "time_per_iteration": 2.683043956756592 + }, + { + "auxiliary_loss_clip": 0.06330552, + "auxiliary_loss_mlp": 0.01253837, + "balance_loss_clip": 0.06257802, + "balance_loss_mlp": 0.01251083, + "epoch": 0.46150608747933264, + "flos": 66723311842560.0, + "grad_norm": 0.7159632658119685, + "language_loss": 0.58438665, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.66023052, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02758789, + "step": 7676, + "time_per_iteration": 3.2052080631256104 + }, + { + "auxiliary_loss_clip": 0.06331712, + "auxiliary_loss_mlp": 0.01253621, + "balance_loss_clip": 0.06258753, + "balance_loss_mlp": 0.01250806, + "epoch": 0.4615662107320006, + "flos": 55846780133760.0, + "grad_norm": 0.7666580083801284, + "language_loss": 0.62806678, + "learning_rate": 2.344312831266341e-06, + "loss": 0.70392013, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.02810669, + "step": 7677, + "time_per_iteration": 5.753543853759766 + }, + { + "auxiliary_loss_clip": 0.06441256, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06278154, + "balance_loss_mlp": 0.012564, + "epoch": 0.46162633398466857, + "flos": 15488055889920.0, + "grad_norm": 2.0928007642005224, + "language_loss": 0.7694543, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.84655911, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.12817383, + "step": 7678, + "time_per_iteration": 2.5979206562042236 + }, + { + "auxiliary_loss_clip": 0.06447493, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06279032, + "balance_loss_mlp": 0.01253672, + "epoch": 0.46168645723733653, + "flos": 20017667105280.0, + "grad_norm": 1.9130482273301792, + "language_loss": 0.66792345, + "learning_rate": 2.343545511426974e-06, + "loss": 0.74506873, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.13360596, + "step": 7679, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06445532, + "auxiliary_loss_mlp": 0.0127232, + "balance_loss_clip": 0.06279338, + "balance_loss_mlp": 0.01259409, + "epoch": 0.4617465804900045, + "flos": 20304020833920.0, + "grad_norm": 2.6299917180378203, + "language_loss": 0.702595, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.77977353, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.12921143, + "step": 7680, + "time_per_iteration": 2.475419282913208 + }, + { + "auxiliary_loss_clip": 0.06449848, + "auxiliary_loss_mlp": 0.01274843, + "balance_loss_clip": 0.06279959, + "balance_loss_mlp": 0.01260454, + "epoch": 0.46180670374267246, + "flos": 22352897669760.0, + "grad_norm": 1.6539051623213383, + "language_loss": 0.63903129, + "learning_rate": 2.342778139478487e-06, + "loss": 0.7162782, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.14398193, + "step": 7681, + "time_per_iteration": 2.518878698348999 + }, + { + "auxiliary_loss_clip": 0.06438938, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01255566, + "epoch": 0.46186682699534043, + "flos": 19900856113920.0, + "grad_norm": 1.5795449228659066, + "language_loss": 0.67458999, + "learning_rate": 2.342394433999697e-06, + "loss": 0.75165695, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12194824, + "step": 7682, + "time_per_iteration": 2.4734294414520264 + }, + { + "auxiliary_loss_clip": 0.06442823, + "auxiliary_loss_mlp": 0.01267731, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4619269502480084, + "flos": 31511573297280.0, + "grad_norm": 2.0778412213868025, + "language_loss": 0.74573362, + "learning_rate": 2.342010715537275e-06, + "loss": 0.82283914, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1317749, + "step": 7683, + "time_per_iteration": 2.5680744647979736 + }, + { + "auxiliary_loss_clip": 0.0644316, + "auxiliary_loss_mlp": 0.01269615, + "balance_loss_clip": 0.06278165, + "balance_loss_mlp": 0.01255995, + "epoch": 0.46198707350067636, + "flos": 25016465658240.0, + "grad_norm": 2.034673139361796, + "language_loss": 0.77701104, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.85413885, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13604736, + "step": 7684, + "time_per_iteration": 3.9865663051605225 + }, + { + "auxiliary_loss_clip": 0.06455924, + "auxiliary_loss_mlp": 0.01269534, + "balance_loss_clip": 0.06282193, + "balance_loss_mlp": 0.01255074, + "epoch": 0.4620471967533444, + "flos": 18297588890880.0, + "grad_norm": 1.7679070884814239, + "language_loss": 0.79849184, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.87574637, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.14471436, + "step": 7685, + "time_per_iteration": 2.4874165058135986 + }, + { + "auxiliary_loss_clip": 0.06442665, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06282581, + "balance_loss_mlp": 0.01254151, + "epoch": 0.46210732000601235, + "flos": 33993607415040.0, + "grad_norm": 2.697729181890728, + "language_loss": 0.66966581, + "learning_rate": 2.340859482393731e-06, + "loss": 0.74678075, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14678955, + "step": 7686, + "time_per_iteration": 2.673029661178589 + }, + { + "auxiliary_loss_clip": 0.06450719, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06281859, + "balance_loss_mlp": 0.01255929, + "epoch": 0.4621674432586803, + "flos": 25016381804160.0, + "grad_norm": 1.8957956969587364, + "language_loss": 0.7416718, + "learning_rate": 2.340475712142296e-06, + "loss": 0.81888342, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14508057, + "step": 7687, + "time_per_iteration": 2.520526885986328 + }, + { + "auxiliary_loss_clip": 0.06441881, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254943, + "epoch": 0.4622275665113483, + "flos": 22019906344320.0, + "grad_norm": 2.1641165257521098, + "language_loss": 0.75034606, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.82745045, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.13623047, + "step": 7688, + "time_per_iteration": 2.6087183952331543 + }, + { + "auxiliary_loss_clip": 0.06442745, + "auxiliary_loss_mlp": 0.01266791, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.0125375, + "epoch": 0.46228768976401624, + "flos": 24065303235840.0, + "grad_norm": 1.76695871159964, + "language_loss": 0.78822517, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.86532056, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.13043213, + "step": 7689, + "time_per_iteration": 4.008488416671753 + }, + { + "auxiliary_loss_clip": 0.0644816, + "auxiliary_loss_mlp": 0.01269125, + "balance_loss_clip": 0.06278446, + "balance_loss_mlp": 0.01254116, + "epoch": 0.4623478130166842, + "flos": 26658655902720.0, + "grad_norm": 2.4003711776889936, + "language_loss": 0.56824899, + "learning_rate": 2.339324323980964e-06, + "loss": 0.6454218, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.15020752, + "step": 7690, + "time_per_iteration": 2.586726665496826 + }, + { + "auxiliary_loss_clip": 0.0644986, + "auxiliary_loss_mlp": 0.01270548, + "balance_loss_clip": 0.06281572, + "balance_loss_mlp": 0.01256421, + "epoch": 0.46240793626935217, + "flos": 20564700485760.0, + "grad_norm": 2.1153050114919387, + "language_loss": 0.83470464, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.91190875, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.14135742, + "step": 7691, + "time_per_iteration": 2.5688517093658447 + }, + { + "auxiliary_loss_clip": 0.06446303, + "auxiliary_loss_mlp": 0.01268112, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01254528, + "epoch": 0.46246805952202014, + "flos": 22462706845440.0, + "grad_norm": 1.4394066258336355, + "language_loss": 0.75601387, + "learning_rate": 2.338556667513091e-06, + "loss": 0.83315802, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13604736, + "step": 7692, + "time_per_iteration": 2.537447929382324 + }, + { + "auxiliary_loss_clip": 0.06447245, + "auxiliary_loss_mlp": 0.01269367, + "balance_loss_clip": 0.06279314, + "balance_loss_mlp": 0.01255324, + "epoch": 0.4625281827746881, + "flos": 35049673549440.0, + "grad_norm": 1.4816622996820314, + "language_loss": 0.74488908, + "learning_rate": 2.338172820014723e-06, + "loss": 0.82205522, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14038086, + "step": 7693, + "time_per_iteration": 2.655733823776245 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01269271, + "balance_loss_clip": 0.06283827, + "balance_loss_mlp": 0.01255496, + "epoch": 0.46258830602735607, + "flos": 21074907196800.0, + "grad_norm": 1.4111581138712515, + "language_loss": 0.85637844, + "learning_rate": 2.337788959692808e-06, + "loss": 0.93355894, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13781738, + "step": 7694, + "time_per_iteration": 2.5321285724639893 + }, + { + "auxiliary_loss_clip": 0.06447286, + "auxiliary_loss_mlp": 0.01268569, + "balance_loss_clip": 0.06280261, + "balance_loss_mlp": 0.01254979, + "epoch": 0.46264842928002403, + "flos": 26184437320320.0, + "grad_norm": 2.8233556574725744, + "language_loss": 0.79577935, + "learning_rate": 2.337405086561902e-06, + "loss": 0.87293792, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.13586426, + "step": 7695, + "time_per_iteration": 2.569974660873413 + }, + { + "auxiliary_loss_clip": 0.06442414, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.0628098, + "balance_loss_mlp": 0.01258432, + "epoch": 0.462708552532692, + "flos": 16769903650560.0, + "grad_norm": 1.6398131561505984, + "language_loss": 0.72464627, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.80177617, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12133789, + "step": 7696, + "time_per_iteration": 2.49324369430542 + }, + { + "auxiliary_loss_clip": 0.06448425, + "auxiliary_loss_mlp": 0.01269091, + "balance_loss_clip": 0.06281986, + "balance_loss_mlp": 0.01256139, + "epoch": 0.46276867578535996, + "flos": 15565985786880.0, + "grad_norm": 1.5682310460433448, + "language_loss": 0.69151074, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.76868594, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.12945557, + "step": 7697, + "time_per_iteration": 2.5437402725219727 + }, + { + "auxiliary_loss_clip": 0.06445374, + "auxiliary_loss_mlp": 0.01272368, + "balance_loss_clip": 0.06278891, + "balance_loss_mlp": 0.01258903, + "epoch": 0.462828799038028, + "flos": 22421352055680.0, + "grad_norm": 2.477481810490018, + "language_loss": 0.84870285, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.92588031, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.13470459, + "step": 7698, + "time_per_iteration": 2.5088558197021484 + }, + { + "auxiliary_loss_clip": 0.06449191, + "auxiliary_loss_mlp": 0.01269693, + "balance_loss_clip": 0.06284188, + "balance_loss_mlp": 0.01255883, + "epoch": 0.46288892229069595, + "flos": 21075997299840.0, + "grad_norm": 1.5978854439043657, + "language_loss": 0.71711451, + "learning_rate": 2.335869466239502e-06, + "loss": 0.79430336, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.13824463, + "step": 7699, + "time_per_iteration": 2.572908639907837 + }, + { + "auxiliary_loss_clip": 0.06453253, + "auxiliary_loss_mlp": 0.01268472, + "balance_loss_clip": 0.06283245, + "balance_loss_mlp": 0.01253952, + "epoch": 0.4629490455433639, + "flos": 23192448053760.0, + "grad_norm": 3.9296940778908724, + "language_loss": 0.71994227, + "learning_rate": 2.335485529281996e-06, + "loss": 0.79715955, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.1451416, + "step": 7700, + "time_per_iteration": 2.5155210494995117 + }, + { + "auxiliary_loss_clip": 0.06446292, + "auxiliary_loss_mlp": 0.01271375, + "balance_loss_clip": 0.0628306, + "balance_loss_mlp": 0.01258608, + "epoch": 0.4630091687960319, + "flos": 18840178005120.0, + "grad_norm": 2.0219592023308297, + "language_loss": 0.72735655, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.80453324, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.12780762, + "step": 7701, + "time_per_iteration": 2.5208041667938232 + }, + { + "auxiliary_loss_clip": 0.06455772, + "auxiliary_loss_mlp": 0.01272275, + "balance_loss_clip": 0.06285252, + "balance_loss_mlp": 0.01258768, + "epoch": 0.46306929204869984, + "flos": 38915733882240.0, + "grad_norm": 1.8677153728043454, + "language_loss": 0.64857763, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.72585809, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.13519287, + "step": 7702, + "time_per_iteration": 2.6274476051330566 + }, + { + "auxiliary_loss_clip": 0.06443912, + "auxiliary_loss_mlp": 0.01267806, + "balance_loss_clip": 0.06281176, + "balance_loss_mlp": 0.01255181, + "epoch": 0.4631294153013678, + "flos": 19649945462400.0, + "grad_norm": 1.8702283374659314, + "language_loss": 0.73327863, + "learning_rate": 2.33433364213785e-06, + "loss": 0.81039578, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.12640381, + "step": 7703, + "time_per_iteration": 2.505009651184082 + }, + { + "auxiliary_loss_clip": 0.06456561, + "auxiliary_loss_mlp": 0.01272434, + "balance_loss_clip": 0.0628607, + "balance_loss_mlp": 0.0125776, + "epoch": 0.4631895385540358, + "flos": 24615187655040.0, + "grad_norm": 1.7291559958554978, + "language_loss": 0.68770319, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.76499313, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14666748, + "step": 7704, + "time_per_iteration": 2.5337138175964355 + }, + { + "auxiliary_loss_clip": 0.06456052, + "auxiliary_loss_mlp": 0.01269056, + "balance_loss_clip": 0.06286585, + "balance_loss_mlp": 0.01255693, + "epoch": 0.46324966180670374, + "flos": 26326838534400.0, + "grad_norm": 2.021774763699282, + "language_loss": 0.81483209, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.89208323, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.13378906, + "step": 7705, + "time_per_iteration": 2.612663745880127 + }, + { + "auxiliary_loss_clip": 0.06459744, + "auxiliary_loss_mlp": 0.01269987, + "balance_loss_clip": 0.06288762, + "balance_loss_mlp": 0.01256313, + "epoch": 0.4633097850593717, + "flos": 19245816420480.0, + "grad_norm": 1.7146225700720175, + "language_loss": 0.77885628, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.85615361, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.13684082, + "step": 7706, + "time_per_iteration": 2.508925437927246 + }, + { + "auxiliary_loss_clip": 0.06446654, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06285432, + "balance_loss_mlp": 0.01256254, + "epoch": 0.46336990831203967, + "flos": 22789660677120.0, + "grad_norm": 1.8229249281456994, + "language_loss": 0.70008546, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.77725136, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13671875, + "step": 7707, + "time_per_iteration": 2.5517148971557617 + }, + { + "auxiliary_loss_clip": 0.06460145, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06290638, + "balance_loss_mlp": 0.01255716, + "epoch": 0.46343003156470763, + "flos": 38218668566400.0, + "grad_norm": 2.701141573629833, + "language_loss": 0.61044616, + "learning_rate": 2.332413576865791e-06, + "loss": 0.68774569, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.14093018, + "step": 7708, + "time_per_iteration": 2.6566975116729736 + }, + { + "auxiliary_loss_clip": 0.06457859, + "auxiliary_loss_mlp": 0.01269726, + "balance_loss_clip": 0.06291145, + "balance_loss_mlp": 0.01255946, + "epoch": 0.4634901548173756, + "flos": 31946156098560.0, + "grad_norm": 2.0418964495503125, + "language_loss": 0.77915132, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.85642713, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.13781738, + "step": 7709, + "time_per_iteration": 2.6596858501434326 + }, + { + "auxiliary_loss_clip": 0.06459823, + "auxiliary_loss_mlp": 0.01271527, + "balance_loss_clip": 0.06291819, + "balance_loss_mlp": 0.01256756, + "epoch": 0.46355027807004356, + "flos": 20088469405440.0, + "grad_norm": 1.5745013311626586, + "language_loss": 0.77581245, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.85312593, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.14764404, + "step": 7710, + "time_per_iteration": 2.5101842880249023 + }, + { + "auxiliary_loss_clip": 0.06457606, + "auxiliary_loss_mlp": 0.01274408, + "balance_loss_clip": 0.06287406, + "balance_loss_mlp": 0.01260151, + "epoch": 0.4636104013227116, + "flos": 24068280055680.0, + "grad_norm": 2.3601088939338086, + "language_loss": 0.73606086, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.81338096, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.14257812, + "step": 7711, + "time_per_iteration": 2.590855598449707 + }, + { + "auxiliary_loss_clip": 0.06459524, + "auxiliary_loss_mlp": 0.01272046, + "balance_loss_clip": 0.06293879, + "balance_loss_mlp": 0.01257354, + "epoch": 0.46367052457537955, + "flos": 23921392648320.0, + "grad_norm": 1.4235356855228358, + "language_loss": 0.71632046, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7936362, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.14685059, + "step": 7712, + "time_per_iteration": 2.524653434753418 + }, + { + "auxiliary_loss_clip": 0.06464949, + "auxiliary_loss_mlp": 0.01272658, + "balance_loss_clip": 0.06290694, + "balance_loss_mlp": 0.01257709, + "epoch": 0.4637306478280475, + "flos": 26403846036480.0, + "grad_norm": 2.2505033505731493, + "language_loss": 0.73737693, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.81475306, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.14941406, + "step": 7713, + "time_per_iteration": 2.5624618530273438 + }, + { + "auxiliary_loss_clip": 0.06466722, + "auxiliary_loss_mlp": 0.01276857, + "balance_loss_clip": 0.06292763, + "balance_loss_mlp": 0.01261372, + "epoch": 0.4637907710807155, + "flos": 21987104670720.0, + "grad_norm": 1.4954624193011212, + "language_loss": 0.58918363, + "learning_rate": 2.3301090827294e-06, + "loss": 0.66661942, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.15466309, + "step": 7714, + "time_per_iteration": 2.510551929473877 + }, + { + "auxiliary_loss_clip": 0.06456332, + "auxiliary_loss_mlp": 0.01271959, + "balance_loss_clip": 0.06290398, + "balance_loss_mlp": 0.01257427, + "epoch": 0.46385089433338345, + "flos": 12427234894080.0, + "grad_norm": 2.7033660685293186, + "language_loss": 0.70470357, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.78198647, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.14538574, + "step": 7715, + "time_per_iteration": 2.533158779144287 + }, + { + "auxiliary_loss_clip": 0.06470867, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06294338, + "balance_loss_mlp": 0.01255731, + "epoch": 0.4639110175860514, + "flos": 23922692386560.0, + "grad_norm": 1.7790063066577455, + "language_loss": 0.68472731, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.762137, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.14355469, + "step": 7716, + "time_per_iteration": 4.020689249038696 + }, + { + "auxiliary_loss_clip": 0.06459275, + "auxiliary_loss_mlp": 0.01270908, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01255858, + "epoch": 0.4639711408387194, + "flos": 25307263653120.0, + "grad_norm": 1.603260424737227, + "language_loss": 0.81029081, + "learning_rate": 2.328956666474691e-06, + "loss": 0.88759267, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.1505127, + "step": 7717, + "time_per_iteration": 3.932593584060669 + }, + { + "auxiliary_loss_clip": 0.06454346, + "auxiliary_loss_mlp": 0.01273075, + "balance_loss_clip": 0.06284629, + "balance_loss_mlp": 0.01258127, + "epoch": 0.46403126409138734, + "flos": 21217643827200.0, + "grad_norm": 1.6983648240686933, + "language_loss": 0.73560178, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.81287599, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14929199, + "step": 7718, + "time_per_iteration": 2.567814350128174 + }, + { + "auxiliary_loss_clip": 0.06461985, + "auxiliary_loss_mlp": 0.0127191, + "balance_loss_clip": 0.06294554, + "balance_loss_mlp": 0.01257384, + "epoch": 0.4640913873440553, + "flos": 35854325907840.0, + "grad_norm": 1.9528130818693374, + "language_loss": 0.70908272, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.78642172, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.14526367, + "step": 7719, + "time_per_iteration": 2.6412456035614014 + }, + { + "auxiliary_loss_clip": 0.06458225, + "auxiliary_loss_mlp": 0.01272538, + "balance_loss_clip": 0.06287955, + "balance_loss_mlp": 0.01258793, + "epoch": 0.46415151059672327, + "flos": 19171282613760.0, + "grad_norm": 2.2400961683609473, + "language_loss": 0.86823237, + "learning_rate": 2.327804137953357e-06, + "loss": 0.94553995, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13745117, + "step": 7720, + "time_per_iteration": 2.5479180812835693 + }, + { + "auxiliary_loss_clip": 0.06346954, + "auxiliary_loss_mlp": 0.01257869, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.01255387, + "epoch": 0.46421163384939124, + "flos": 58932841207680.0, + "grad_norm": 0.7060507258277461, + "language_loss": 0.54935473, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.62540293, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.02481079, + "step": 7721, + "time_per_iteration": 3.185922861099243 + }, + { + "auxiliary_loss_clip": 0.06453753, + "auxiliary_loss_mlp": 0.01271222, + "balance_loss_clip": 0.0628788, + "balance_loss_mlp": 0.01257227, + "epoch": 0.4642717571020592, + "flos": 20163590190720.0, + "grad_norm": 1.901448408880664, + "language_loss": 0.80108112, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.87833083, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.13995361, + "step": 7722, + "time_per_iteration": 2.524707317352295 + }, + { + "auxiliary_loss_clip": 0.06454173, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06282455, + "balance_loss_mlp": 0.0125627, + "epoch": 0.46433188035472717, + "flos": 25053208473600.0, + "grad_norm": 1.90118065677523, + "language_loss": 0.78278601, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.86003315, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1427002, + "step": 7723, + "time_per_iteration": 3.9820849895477295 + }, + { + "auxiliary_loss_clip": 0.06448075, + "auxiliary_loss_mlp": 0.01267351, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01253046, + "epoch": 0.4643920036073952, + "flos": 28083366074880.0, + "grad_norm": 1.6378874340525207, + "language_loss": 0.68861282, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7657671, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14306641, + "step": 7724, + "time_per_iteration": 2.550832748413086 + }, + { + "auxiliary_loss_clip": 0.06449208, + "auxiliary_loss_mlp": 0.01272875, + "balance_loss_clip": 0.06283656, + "balance_loss_mlp": 0.01259297, + "epoch": 0.46445212686006315, + "flos": 18375267225600.0, + "grad_norm": 2.354559005563411, + "language_loss": 0.67722934, + "learning_rate": 2.325883008671415e-06, + "loss": 0.7544502, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.13592529, + "step": 7725, + "time_per_iteration": 2.534698009490967 + }, + { + "auxiliary_loss_clip": 0.0644237, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01258108, + "epoch": 0.4645122501127311, + "flos": 31729514567040.0, + "grad_norm": 1.5959059771038482, + "language_loss": 0.65303701, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.73016763, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.12585449, + "step": 7726, + "time_per_iteration": 2.6071393489837646 + }, + { + "auxiliary_loss_clip": 0.06452325, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06286149, + "balance_loss_mlp": 0.01255312, + "epoch": 0.4645723733653991, + "flos": 23775553416960.0, + "grad_norm": 2.198219591713496, + "language_loss": 0.75535023, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.83256185, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.13525391, + "step": 7727, + "time_per_iteration": 2.5323383808135986 + }, + { + "auxiliary_loss_clip": 0.06449004, + "auxiliary_loss_mlp": 0.01272292, + "balance_loss_clip": 0.06281407, + "balance_loss_mlp": 0.01258166, + "epoch": 0.46463249661806705, + "flos": 33153805468800.0, + "grad_norm": 1.912145195790545, + "language_loss": 0.78694946, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.86416245, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.14147949, + "step": 7728, + "time_per_iteration": 3.998812437057495 + }, + { + "auxiliary_loss_clip": 0.06450211, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06282613, + "balance_loss_mlp": 0.0125658, + "epoch": 0.464692619870735, + "flos": 18301865448960.0, + "grad_norm": 2.3670866338465295, + "language_loss": 0.76134968, + "learning_rate": 2.324345882723155e-06, + "loss": 0.83855414, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.13659668, + "step": 7729, + "time_per_iteration": 2.459913730621338 + }, + { + "auxiliary_loss_clip": 0.06449223, + "auxiliary_loss_mlp": 0.01270726, + "balance_loss_clip": 0.06283462, + "balance_loss_mlp": 0.01257339, + "epoch": 0.464752743123403, + "flos": 22644659986560.0, + "grad_norm": 1.7402612149106196, + "language_loss": 0.80316758, + "learning_rate": 2.323961570451588e-06, + "loss": 0.88036704, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.13378906, + "step": 7730, + "time_per_iteration": 2.5472798347473145 + }, + { + "auxiliary_loss_clip": 0.06447513, + "auxiliary_loss_mlp": 0.01272657, + "balance_loss_clip": 0.06282953, + "balance_loss_mlp": 0.01258924, + "epoch": 0.46481286637607094, + "flos": 20418316202880.0, + "grad_norm": 1.544685409716396, + "language_loss": 0.77440143, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.85160315, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13726807, + "step": 7731, + "time_per_iteration": 2.539971351623535 + }, + { + "auxiliary_loss_clip": 0.06444095, + "auxiliary_loss_mlp": 0.01267001, + "balance_loss_clip": 0.06280014, + "balance_loss_mlp": 0.01253984, + "epoch": 0.4648729896287389, + "flos": 34283692650240.0, + "grad_norm": 1.8393249998070078, + "language_loss": 0.66022158, + "learning_rate": 2.323192909069061e-06, + "loss": 0.73733258, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13006592, + "step": 7732, + "time_per_iteration": 2.6860389709472656 + }, + { + "auxiliary_loss_clip": 0.0645274, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.0628058, + "balance_loss_mlp": 0.01254474, + "epoch": 0.4649331128814069, + "flos": 21327704565120.0, + "grad_norm": 2.1920635353287157, + "language_loss": 0.73225021, + "learning_rate": 2.32280855998725e-06, + "loss": 0.8094635, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.14123535, + "step": 7733, + "time_per_iteration": 2.4875564575195312 + }, + { + "auxiliary_loss_clip": 0.06338679, + "auxiliary_loss_mlp": 0.01252754, + "balance_loss_clip": 0.0626616, + "balance_loss_mlp": 0.0124981, + "epoch": 0.46499323613407484, + "flos": 58325082744960.0, + "grad_norm": 1.3051386869973822, + "language_loss": 0.52022988, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.5961442, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.02941895, + "step": 7734, + "time_per_iteration": 3.0869898796081543 + }, + { + "auxiliary_loss_clip": 0.0644846, + "auxiliary_loss_mlp": 0.01271308, + "balance_loss_clip": 0.06283916, + "balance_loss_mlp": 0.01257856, + "epoch": 0.4650533593867428, + "flos": 10894308773760.0, + "grad_norm": 2.170877243914886, + "language_loss": 0.75776118, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.83495891, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.13464355, + "step": 7735, + "time_per_iteration": 2.478837490081787 + }, + { + "auxiliary_loss_clip": 0.06441534, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.0627993, + "balance_loss_mlp": 0.01255009, + "epoch": 0.46511348263941077, + "flos": 19980756581760.0, + "grad_norm": 2.0032469234086507, + "language_loss": 0.6994068, + "learning_rate": 2.321655439354519e-06, + "loss": 0.77650702, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13482666, + "step": 7736, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.06442849, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.0628303, + "balance_loss_mlp": 0.01256237, + "epoch": 0.46517360589207873, + "flos": 19683795312000.0, + "grad_norm": 1.6634794649969447, + "language_loss": 0.72674608, + "learning_rate": 2.321271041396427e-06, + "loss": 0.80385697, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.12005615, + "step": 7737, + "time_per_iteration": 2.5038952827453613 + }, + { + "auxiliary_loss_clip": 0.06449911, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06283341, + "balance_loss_mlp": 0.01254603, + "epoch": 0.46523372914474675, + "flos": 16878203452800.0, + "grad_norm": 1.9711860161800356, + "language_loss": 0.84095049, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.91813183, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.1361084, + "step": 7738, + "time_per_iteration": 2.5216240882873535 + }, + { + "auxiliary_loss_clip": 0.06338458, + "auxiliary_loss_mlp": 0.01253722, + "balance_loss_clip": 0.06265976, + "balance_loss_mlp": 0.01250617, + "epoch": 0.4652938523974147, + "flos": 53458188917760.0, + "grad_norm": 0.7399188166866549, + "language_loss": 0.57646966, + "learning_rate": 2.320502208946932e-06, + "loss": 0.65239149, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.03102112, + "step": 7739, + "time_per_iteration": 3.215662717819214 + }, + { + "auxiliary_loss_clip": 0.06450304, + "auxiliary_loss_mlp": 0.01271295, + "balance_loss_clip": 0.06285876, + "balance_loss_mlp": 0.01257299, + "epoch": 0.4653539756500827, + "flos": 15236642113920.0, + "grad_norm": 1.7449085109148506, + "language_loss": 0.85184145, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.92905748, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.14013672, + "step": 7740, + "time_per_iteration": 2.4736168384552 + }, + { + "auxiliary_loss_clip": 0.0644415, + "auxiliary_loss_mlp": 0.01270653, + "balance_loss_clip": 0.06281894, + "balance_loss_mlp": 0.01256706, + "epoch": 0.46541409890275065, + "flos": 23738978309760.0, + "grad_norm": 1.5125636475233326, + "language_loss": 0.76338875, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.84053683, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1394043, + "step": 7741, + "time_per_iteration": 2.56061053276062 + }, + { + "auxiliary_loss_clip": 0.06456167, + "auxiliary_loss_mlp": 0.01268672, + "balance_loss_clip": 0.06284943, + "balance_loss_mlp": 0.01254838, + "epoch": 0.4654742221554186, + "flos": 20853150566400.0, + "grad_norm": 1.6688490987186926, + "language_loss": 0.81291914, + "learning_rate": 2.319348869158064e-06, + "loss": 0.89016759, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.13812256, + "step": 7742, + "time_per_iteration": 2.5372226238250732 + }, + { + "auxiliary_loss_clip": 0.06456183, + "auxiliary_loss_mlp": 0.01268485, + "balance_loss_clip": 0.06287557, + "balance_loss_mlp": 0.01254264, + "epoch": 0.4655343454080866, + "flos": 20711210549760.0, + "grad_norm": 1.6329017257985423, + "language_loss": 0.72620338, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.80345011, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.14227295, + "step": 7743, + "time_per_iteration": 2.561323404312134 + }, + { + "auxiliary_loss_clip": 0.0644543, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06280947, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46559446866075455, + "flos": 18995912017920.0, + "grad_norm": 1.7294678893011792, + "language_loss": 0.71235406, + "learning_rate": 2.318579915392483e-06, + "loss": 0.78950727, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13842773, + "step": 7744, + "time_per_iteration": 2.491428852081299 + }, + { + "auxiliary_loss_clip": 0.06446386, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06285123, + "balance_loss_mlp": 0.01252513, + "epoch": 0.4656545919134225, + "flos": 34505030010240.0, + "grad_norm": 1.6678897715471863, + "language_loss": 0.84893715, + "learning_rate": 2.31819542038153e-06, + "loss": 0.92605066, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12451172, + "step": 7745, + "time_per_iteration": 2.759547233581543 + }, + { + "auxiliary_loss_clip": 0.064444, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06282735, + "balance_loss_mlp": 0.01255824, + "epoch": 0.4657147151660905, + "flos": 24316465449600.0, + "grad_norm": 1.3285756054685907, + "language_loss": 0.73465878, + "learning_rate": 2.317810913304574e-06, + "loss": 0.81178808, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.12701416, + "step": 7746, + "time_per_iteration": 2.5268633365631104 + }, + { + "auxiliary_loss_clip": 0.064431, + "auxiliary_loss_mlp": 0.01272209, + "balance_loss_clip": 0.06282558, + "balance_loss_mlp": 0.0125931, + "epoch": 0.46577483841875844, + "flos": 58807743390720.0, + "grad_norm": 1.6027404056917662, + "language_loss": 0.69721079, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.77436388, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12896729, + "step": 7747, + "time_per_iteration": 2.8772974014282227 + }, + { + "auxiliary_loss_clip": 0.06441785, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06279266, + "balance_loss_mlp": 0.01255631, + "epoch": 0.4658349616714264, + "flos": 31330081353600.0, + "grad_norm": 1.8250767057505617, + "language_loss": 0.68153578, + "learning_rate": 2.317041863010978e-06, + "loss": 0.75864553, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13543701, + "step": 7748, + "time_per_iteration": 2.576828956604004 + }, + { + "auxiliary_loss_clip": 0.06449303, + "auxiliary_loss_mlp": 0.01269068, + "balance_loss_clip": 0.06280029, + "balance_loss_mlp": 0.01254768, + "epoch": 0.46589508492409437, + "flos": 14864601985920.0, + "grad_norm": 2.1691376792383554, + "language_loss": 0.64591479, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.72309858, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.14306641, + "step": 7749, + "time_per_iteration": 2.5408928394317627 + }, + { + "auxiliary_loss_clip": 0.06452534, + "auxiliary_loss_mlp": 0.01273929, + "balance_loss_clip": 0.06283832, + "balance_loss_mlp": 0.01258795, + "epoch": 0.46595520817676234, + "flos": 12900908424960.0, + "grad_norm": 2.0171049134441237, + "language_loss": 0.74442625, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.82169086, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.15142822, + "step": 7750, + "time_per_iteration": 2.4698846340179443 + }, + { + "auxiliary_loss_clip": 0.06444734, + "auxiliary_loss_mlp": 0.01270437, + "balance_loss_clip": 0.06276895, + "balance_loss_mlp": 0.01255811, + "epoch": 0.46601533142943036, + "flos": 32862504349440.0, + "grad_norm": 1.8980956421649817, + "language_loss": 0.7426213, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.81977308, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.14624023, + "step": 7751, + "time_per_iteration": 2.6534221172332764 + }, + { + "auxiliary_loss_clip": 0.06449904, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06280084, + "balance_loss_mlp": 0.01253017, + "epoch": 0.4660754546820983, + "flos": 19972496954880.0, + "grad_norm": 1.7579709538150943, + "language_loss": 0.73910719, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.81627846, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.14202881, + "step": 7752, + "time_per_iteration": 2.474492311477661 + }, + { + "auxiliary_loss_clip": 0.06447943, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06279718, + "balance_loss_mlp": 0.01254578, + "epoch": 0.4661355779347663, + "flos": 26695482572160.0, + "grad_norm": 2.190938043745359, + "language_loss": 0.69726032, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7744258, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.14038086, + "step": 7753, + "time_per_iteration": 2.604612350463867 + }, + { + "auxiliary_loss_clip": 0.06438763, + "auxiliary_loss_mlp": 0.01269724, + "balance_loss_clip": 0.0627787, + "balance_loss_mlp": 0.01256777, + "epoch": 0.46619570118743425, + "flos": 20965726926720.0, + "grad_norm": 1.7706266197381177, + "language_loss": 0.73293746, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.81002235, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7754, + "time_per_iteration": 2.491225242614746 + }, + { + "auxiliary_loss_clip": 0.06444383, + "auxiliary_loss_mlp": 0.01271714, + "balance_loss_clip": 0.06278208, + "balance_loss_mlp": 0.01256855, + "epoch": 0.4662558244401022, + "flos": 24433024878720.0, + "grad_norm": 1.5728879839910523, + "language_loss": 0.79001075, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.8671717, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.14868164, + "step": 7755, + "time_per_iteration": 2.562178134918213 + }, + { + "auxiliary_loss_clip": 0.06436031, + "auxiliary_loss_mlp": 0.01269294, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01256181, + "epoch": 0.4663159476927702, + "flos": 20601820644480.0, + "grad_norm": 1.5633103047544015, + "language_loss": 0.72593671, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.80299002, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13116455, + "step": 7756, + "time_per_iteration": 4.01608943939209 + }, + { + "auxiliary_loss_clip": 0.06436817, + "auxiliary_loss_mlp": 0.01269611, + "balance_loss_clip": 0.06276436, + "balance_loss_mlp": 0.01256897, + "epoch": 0.46637607094543815, + "flos": 25668235042560.0, + "grad_norm": 1.701604485790762, + "language_loss": 0.7836898, + "learning_rate": 2.313580543272274e-06, + "loss": 0.86075413, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12719727, + "step": 7757, + "time_per_iteration": 2.555097818374634 + }, + { + "auxiliary_loss_clip": 0.06441291, + "auxiliary_loss_mlp": 0.01274403, + "balance_loss_clip": 0.06277295, + "balance_loss_mlp": 0.01261123, + "epoch": 0.4664361941981061, + "flos": 24279722634240.0, + "grad_norm": 1.9711907960618857, + "language_loss": 0.66213286, + "learning_rate": 2.313195892540705e-06, + "loss": 0.73928982, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13275146, + "step": 7758, + "time_per_iteration": 2.569962739944458 + }, + { + "auxiliary_loss_clip": 0.06442615, + "auxiliary_loss_mlp": 0.01273146, + "balance_loss_clip": 0.0627957, + "balance_loss_mlp": 0.01260629, + "epoch": 0.4664963174507741, + "flos": 18411800405760.0, + "grad_norm": 1.9738824417509344, + "language_loss": 0.74950838, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.826666, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.12518311, + "step": 7759, + "time_per_iteration": 2.47729229927063 + }, + { + "auxiliary_loss_clip": 0.06440781, + "auxiliary_loss_mlp": 0.01272683, + "balance_loss_clip": 0.06281125, + "balance_loss_mlp": 0.01259827, + "epoch": 0.46655644070344204, + "flos": 22461616742400.0, + "grad_norm": 3.1770723580201103, + "language_loss": 0.77710176, + "learning_rate": 2.312426555462893e-06, + "loss": 0.85423636, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12860107, + "step": 7760, + "time_per_iteration": 2.555143117904663 + }, + { + "auxiliary_loss_clip": 0.06438316, + "auxiliary_loss_mlp": 0.01270754, + "balance_loss_clip": 0.06279285, + "balance_loss_mlp": 0.01256675, + "epoch": 0.46661656395611, + "flos": 13813525169280.0, + "grad_norm": 1.6658245877843647, + "language_loss": 0.7447418, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.82183254, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.14099121, + "step": 7761, + "time_per_iteration": 2.493032217025757 + }, + { + "auxiliary_loss_clip": 0.06446707, + "auxiliary_loss_mlp": 0.01275728, + "balance_loss_clip": 0.06281132, + "balance_loss_mlp": 0.0126094, + "epoch": 0.466676687208778, + "flos": 21658473757440.0, + "grad_norm": 1.6817719059657052, + "language_loss": 0.78770381, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.86492819, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14788818, + "step": 7762, + "time_per_iteration": 2.5613081455230713 + }, + { + "auxiliary_loss_clip": 0.06338885, + "auxiliary_loss_mlp": 0.01268455, + "balance_loss_clip": 0.06268312, + "balance_loss_mlp": 0.01265552, + "epoch": 0.46673681046144594, + "flos": 68554163554560.0, + "grad_norm": 0.7818830178478652, + "language_loss": 0.59643799, + "learning_rate": 2.311272461028297e-06, + "loss": 0.67251134, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.0289917, + "step": 7763, + "time_per_iteration": 4.584456443786621 + }, + { + "auxiliary_loss_clip": 0.06446124, + "auxiliary_loss_mlp": 0.01269966, + "balance_loss_clip": 0.06278878, + "balance_loss_mlp": 0.01255559, + "epoch": 0.46679693371411396, + "flos": 15819789404160.0, + "grad_norm": 1.948864663001373, + "language_loss": 0.79278809, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.86994898, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.14398193, + "step": 7764, + "time_per_iteration": 2.465179920196533 + }, + { + "auxiliary_loss_clip": 0.06441632, + "auxiliary_loss_mlp": 0.01267635, + "balance_loss_clip": 0.06281599, + "balance_loss_mlp": 0.01255035, + "epoch": 0.4668570569667819, + "flos": 18520393697280.0, + "grad_norm": 2.0437394229584123, + "language_loss": 0.72096646, + "learning_rate": 2.310503005696839e-06, + "loss": 0.79805923, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.12597656, + "step": 7765, + "time_per_iteration": 2.5701630115509033 + }, + { + "auxiliary_loss_clip": 0.06443523, + "auxiliary_loss_mlp": 0.01272136, + "balance_loss_clip": 0.06278671, + "balance_loss_mlp": 0.01258141, + "epoch": 0.4669171802194499, + "flos": 19212385841280.0, + "grad_norm": 2.21059711365052, + "language_loss": 0.77947736, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.85663396, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.14001465, + "step": 7766, + "time_per_iteration": 2.481160879135132 + }, + { + "auxiliary_loss_clip": 0.06441876, + "auxiliary_loss_mlp": 0.01272138, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01258489, + "epoch": 0.46697730347211786, + "flos": 12281018319360.0, + "grad_norm": 2.232432946710323, + "language_loss": 0.65461195, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.73175204, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13653564, + "step": 7767, + "time_per_iteration": 2.5368387699127197 + }, + { + "auxiliary_loss_clip": 0.06442834, + "auxiliary_loss_mlp": 0.01272968, + "balance_loss_clip": 0.06280966, + "balance_loss_mlp": 0.01259307, + "epoch": 0.4670374267247858, + "flos": 23593516421760.0, + "grad_norm": 2.313152144280668, + "language_loss": 0.75071919, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.82787716, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13677979, + "step": 7768, + "time_per_iteration": 3.9271702766418457 + }, + { + "auxiliary_loss_clip": 0.06441817, + "auxiliary_loss_mlp": 0.0126721, + "balance_loss_clip": 0.06279824, + "balance_loss_mlp": 0.01253697, + "epoch": 0.4670975499774538, + "flos": 15995495416320.0, + "grad_norm": 1.5695198160982793, + "language_loss": 0.71176434, + "learning_rate": 2.308963953858982e-06, + "loss": 0.7888546, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1350708, + "step": 7769, + "time_per_iteration": 2.5253636837005615 + }, + { + "auxiliary_loss_clip": 0.06441696, + "auxiliary_loss_mlp": 0.01271746, + "balance_loss_clip": 0.06279374, + "balance_loss_mlp": 0.01258305, + "epoch": 0.46715767323012175, + "flos": 15383026396800.0, + "grad_norm": 1.8223238330296296, + "language_loss": 0.81503379, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.89216816, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13446045, + "step": 7770, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.06346406, + "auxiliary_loss_mlp": 0.01251242, + "balance_loss_clip": 0.06275694, + "balance_loss_mlp": 0.01249068, + "epoch": 0.4672177964827897, + "flos": 60270774877440.0, + "grad_norm": 0.8490857527823061, + "language_loss": 0.55591935, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.63189584, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.02177429, + "step": 7771, + "time_per_iteration": 3.1719799041748047 + }, + { + "auxiliary_loss_clip": 0.064445, + "auxiliary_loss_mlp": 0.01269252, + "balance_loss_clip": 0.06282087, + "balance_loss_mlp": 0.01256234, + "epoch": 0.4672779197354577, + "flos": 27643500466560.0, + "grad_norm": 2.2149063838305363, + "language_loss": 0.65989488, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.73703241, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13024902, + "step": 7772, + "time_per_iteration": 2.616668939590454 + }, + { + "auxiliary_loss_clip": 0.06441614, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01257009, + "epoch": 0.46733804298812565, + "flos": 31402267246080.0, + "grad_norm": 2.671628135597842, + "language_loss": 0.64495057, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.72206295, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1260376, + "step": 7773, + "time_per_iteration": 2.5923900604248047 + }, + { + "auxiliary_loss_clip": 0.06442621, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01256457, + "epoch": 0.4673981662407936, + "flos": 19506747634560.0, + "grad_norm": 1.7164237292195044, + "language_loss": 0.80045915, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.87758458, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13464355, + "step": 7774, + "time_per_iteration": 2.577458620071411 + }, + { + "auxiliary_loss_clip": 0.06444994, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06279732, + "balance_loss_mlp": 0.01254583, + "epoch": 0.4674582894934616, + "flos": 20528083451520.0, + "grad_norm": 1.5985457295090966, + "language_loss": 0.78042519, + "learning_rate": 2.306655024915726e-06, + "loss": 0.85755515, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.13439941, + "step": 7775, + "time_per_iteration": 2.5538787841796875 + }, + { + "auxiliary_loss_clip": 0.06442325, + "auxiliary_loss_mlp": 0.0127297, + "balance_loss_clip": 0.06282222, + "balance_loss_mlp": 0.01259988, + "epoch": 0.46751841274612954, + "flos": 22097500824960.0, + "grad_norm": 1.8860444903676625, + "language_loss": 0.69909471, + "learning_rate": 2.306270162640694e-06, + "loss": 0.77624762, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12963867, + "step": 7776, + "time_per_iteration": 2.561692237854004 + }, + { + "auxiliary_loss_clip": 0.0644502, + "auxiliary_loss_mlp": 0.01270071, + "balance_loss_clip": 0.06284119, + "balance_loss_mlp": 0.01257244, + "epoch": 0.46757853599879756, + "flos": 26987454524160.0, + "grad_norm": 1.3861659298765134, + "language_loss": 0.74096608, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.81811702, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1282959, + "step": 7777, + "time_per_iteration": 2.536015510559082 + }, + { + "auxiliary_loss_clip": 0.06447745, + "auxiliary_loss_mlp": 0.01270612, + "balance_loss_clip": 0.06284414, + "balance_loss_mlp": 0.01256921, + "epoch": 0.4676386592514655, + "flos": 24140927145600.0, + "grad_norm": 1.9470179218555579, + "language_loss": 0.69820189, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.77538544, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13690186, + "step": 7778, + "time_per_iteration": 2.548154354095459 + }, + { + "auxiliary_loss_clip": 0.06447626, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01253513, + "epoch": 0.4676987825041335, + "flos": 25490768094720.0, + "grad_norm": 1.4247023457023664, + "language_loss": 0.73440385, + "learning_rate": 2.305115506191206e-06, + "loss": 0.81155688, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.14160156, + "step": 7779, + "time_per_iteration": 2.5291388034820557 + }, + { + "auxiliary_loss_clip": 0.06443821, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06285408, + "balance_loss_mlp": 0.01253379, + "epoch": 0.46775890575680146, + "flos": 21951871228800.0, + "grad_norm": 1.9613896423037807, + "language_loss": 0.72685552, + "learning_rate": 2.304730597548562e-06, + "loss": 0.80395079, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12353516, + "step": 7780, + "time_per_iteration": 2.5508480072021484 + }, + { + "auxiliary_loss_clip": 0.06447856, + "auxiliary_loss_mlp": 0.01269851, + "balance_loss_clip": 0.06280719, + "balance_loss_mlp": 0.01256273, + "epoch": 0.4678190290094694, + "flos": 25235413176960.0, + "grad_norm": 1.8471847442174032, + "language_loss": 0.74638426, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.82356131, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.13586426, + "step": 7781, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.06446712, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.0125528, + "epoch": 0.4678791522621374, + "flos": 32276254458240.0, + "grad_norm": 1.845752858447898, + "language_loss": 0.63050562, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.70766628, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.140625, + "step": 7782, + "time_per_iteration": 2.650505304336548 + }, + { + "auxiliary_loss_clip": 0.06445308, + "auxiliary_loss_mlp": 0.01268795, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.01255306, + "epoch": 0.46793927551480535, + "flos": 27052764382080.0, + "grad_norm": 2.229893941722145, + "language_loss": 0.63585413, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.71299517, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.13494873, + "step": 7783, + "time_per_iteration": 2.5537588596343994 + }, + { + "auxiliary_loss_clip": 0.0645118, + "auxiliary_loss_mlp": 0.01271407, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257245, + "epoch": 0.4679993987674733, + "flos": 17463195532800.0, + "grad_norm": 2.4083561383098004, + "language_loss": 0.68662858, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7638545, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.1418457, + "step": 7784, + "time_per_iteration": 2.560459613800049 + }, + { + "auxiliary_loss_clip": 0.06438549, + "auxiliary_loss_mlp": 0.01266567, + "balance_loss_clip": 0.06278238, + "balance_loss_mlp": 0.01254003, + "epoch": 0.4680595220201413, + "flos": 17170804310400.0, + "grad_norm": 1.9765250646873525, + "language_loss": 0.84616911, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.92322016, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12567139, + "step": 7785, + "time_per_iteration": 2.5567643642425537 + }, + { + "auxiliary_loss_clip": 0.06444662, + "auxiliary_loss_mlp": 0.01268089, + "balance_loss_clip": 0.06281722, + "balance_loss_mlp": 0.01254225, + "epoch": 0.46811964527280925, + "flos": 11332329592320.0, + "grad_norm": 1.9719414675879272, + "language_loss": 0.77991092, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.85703844, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.13867188, + "step": 7786, + "time_per_iteration": 2.507206439971924 + }, + { + "auxiliary_loss_clip": 0.06440122, + "auxiliary_loss_mlp": 0.01265794, + "balance_loss_clip": 0.06281641, + "balance_loss_mlp": 0.01253897, + "epoch": 0.4681797685254772, + "flos": 24285508565760.0, + "grad_norm": 2.2497529795631817, + "language_loss": 0.74387538, + "learning_rate": 2.302035914315856e-06, + "loss": 0.82093459, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.11901855, + "step": 7787, + "time_per_iteration": 2.498021125793457 + }, + { + "auxiliary_loss_clip": 0.06439888, + "auxiliary_loss_mlp": 0.01272631, + "balance_loss_clip": 0.06278901, + "balance_loss_mlp": 0.01258785, + "epoch": 0.4682398917781452, + "flos": 31658544558720.0, + "grad_norm": 1.7533783368280031, + "language_loss": 0.66132212, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.73844731, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1383667, + "step": 7788, + "time_per_iteration": 2.650092363357544 + }, + { + "auxiliary_loss_clip": 0.06441839, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 0.06280681, + "balance_loss_mlp": 0.01256036, + "epoch": 0.46830001503081314, + "flos": 28118264100480.0, + "grad_norm": 1.5278727961877703, + "language_loss": 0.64315766, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.72025621, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.11987305, + "step": 7789, + "time_per_iteration": 2.5806198120117188 + }, + { + "auxiliary_loss_clip": 0.06338993, + "auxiliary_loss_mlp": 0.01252338, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01250063, + "epoch": 0.4683601382834811, + "flos": 57900059308800.0, + "grad_norm": 0.6904155708009142, + "language_loss": 0.61868596, + "learning_rate": 2.300880877982825e-06, + "loss": 0.69459921, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.02276611, + "step": 7790, + "time_per_iteration": 3.2271504402160645 + }, + { + "auxiliary_loss_clip": 0.06442016, + "auxiliary_loss_mlp": 0.01269711, + "balance_loss_clip": 0.06283005, + "balance_loss_mlp": 0.01257111, + "epoch": 0.46842026153614913, + "flos": 21878427525120.0, + "grad_norm": 1.6377280327187325, + "language_loss": 0.79426539, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.87138271, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12597656, + "step": 7791, + "time_per_iteration": 2.490171194076538 + }, + { + "auxiliary_loss_clip": 0.06441824, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06283456, + "balance_loss_mlp": 0.01256899, + "epoch": 0.4684803847888171, + "flos": 24907914293760.0, + "grad_norm": 1.496703208223837, + "language_loss": 0.74930024, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.82641351, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.12573242, + "step": 7792, + "time_per_iteration": 2.5588057041168213 + }, + { + "auxiliary_loss_clip": 0.0643919, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06282478, + "balance_loss_mlp": 0.01255972, + "epoch": 0.46854050804148506, + "flos": 26259138835200.0, + "grad_norm": 1.9488467409065784, + "language_loss": 0.68353844, + "learning_rate": 2.299725738964898e-06, + "loss": 0.76060808, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.11804199, + "step": 7793, + "time_per_iteration": 2.543156147003174 + }, + { + "auxiliary_loss_clip": 0.06441274, + "auxiliary_loss_mlp": 0.01273582, + "balance_loss_clip": 0.0628298, + "balance_loss_mlp": 0.01261387, + "epoch": 0.468600631294153, + "flos": 21586204010880.0, + "grad_norm": 1.8535654365133143, + "language_loss": 0.74367434, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.82082289, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.12194824, + "step": 7794, + "time_per_iteration": 2.6082603931427 + }, + { + "auxiliary_loss_clip": 0.06445156, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06285646, + "balance_loss_mlp": 0.01255343, + "epoch": 0.468660754546821, + "flos": 25892842711680.0, + "grad_norm": 2.128212140250663, + "language_loss": 0.64027059, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.71741104, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13537598, + "step": 7795, + "time_per_iteration": 2.554871082305908 + }, + { + "auxiliary_loss_clip": 0.06440422, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06283793, + "balance_loss_mlp": 0.01253067, + "epoch": 0.46872087779948896, + "flos": 35482746977280.0, + "grad_norm": 1.4934025143707166, + "language_loss": 0.6791029, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7561695, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13171387, + "step": 7796, + "time_per_iteration": 4.070605754852295 + }, + { + "auxiliary_loss_clip": 0.06441301, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06280352, + "balance_loss_mlp": 0.0125435, + "epoch": 0.4687810010521569, + "flos": 26403720255360.0, + "grad_norm": 1.619506492510176, + "language_loss": 0.70710748, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.78419161, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12762451, + "step": 7797, + "time_per_iteration": 2.574291706085205 + }, + { + "auxiliary_loss_clip": 0.06443868, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 0.0628204, + "balance_loss_mlp": 0.01258472, + "epoch": 0.4688411243048249, + "flos": 19978618302720.0, + "grad_norm": 1.9026226114754317, + "language_loss": 0.67159688, + "learning_rate": 2.297800280150454e-06, + "loss": 0.74876028, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.14007568, + "step": 7798, + "time_per_iteration": 2.4703564643859863 + }, + { + "auxiliary_loss_clip": 0.06331287, + "auxiliary_loss_mlp": 0.01256102, + "balance_loss_clip": 0.06261373, + "balance_loss_mlp": 0.01253898, + "epoch": 0.46890124755749285, + "flos": 63996739983360.0, + "grad_norm": 0.926390069403038, + "language_loss": 0.64518279, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.7210567, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.02207947, + "step": 7799, + "time_per_iteration": 3.3128738403320312 + }, + { + "auxiliary_loss_clip": 0.06441961, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 0.06283548, + "balance_loss_mlp": 0.01258429, + "epoch": 0.4689613708101608, + "flos": 23775763052160.0, + "grad_norm": 1.2629628474735628, + "language_loss": 0.72331405, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.80045128, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13330078, + "step": 7800, + "time_per_iteration": 2.5339090824127197 + }, + { + "auxiliary_loss_clip": 0.06436972, + "auxiliary_loss_mlp": 0.01269738, + "balance_loss_clip": 0.06279731, + "balance_loss_mlp": 0.01257406, + "epoch": 0.4690214940628288, + "flos": 24795337933440.0, + "grad_norm": 2.7480307453946726, + "language_loss": 0.72682166, + "learning_rate": 2.296644869233568e-06, + "loss": 0.80388874, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12335205, + "step": 7801, + "time_per_iteration": 2.552154541015625 + }, + { + "auxiliary_loss_clip": 0.06449857, + "auxiliary_loss_mlp": 0.01274232, + "balance_loss_clip": 0.06283514, + "balance_loss_mlp": 0.01260094, + "epoch": 0.46908161731549675, + "flos": 18083169492480.0, + "grad_norm": 1.9453242658612842, + "language_loss": 0.62466741, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.70190829, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.14135742, + "step": 7802, + "time_per_iteration": 3.9707396030426025 + }, + { + "auxiliary_loss_clip": 0.06437971, + "auxiliary_loss_mlp": 0.01270017, + "balance_loss_clip": 0.06277081, + "balance_loss_mlp": 0.01257459, + "epoch": 0.4691417405681647, + "flos": 25710554154240.0, + "grad_norm": 1.8844359624083942, + "language_loss": 0.73532665, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.81240654, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.12554932, + "step": 7803, + "time_per_iteration": 2.554459810256958 + }, + { + "auxiliary_loss_clip": 0.06438211, + "auxiliary_loss_mlp": 0.01272362, + "balance_loss_clip": 0.06278156, + "balance_loss_mlp": 0.01259338, + "epoch": 0.46920186382083273, + "flos": 17462776262400.0, + "grad_norm": 1.58578754852504, + "language_loss": 0.77327907, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.85038471, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13012695, + "step": 7804, + "time_per_iteration": 2.543470621109009 + }, + { + "auxiliary_loss_clip": 0.06432682, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01254298, + "epoch": 0.4692619870735007, + "flos": 20345669112960.0, + "grad_norm": 1.787683586047485, + "language_loss": 0.77375299, + "learning_rate": 2.295104163929305e-06, + "loss": 0.8507452, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12231445, + "step": 7805, + "time_per_iteration": 2.501739740371704 + }, + { + "auxiliary_loss_clip": 0.0644381, + "auxiliary_loss_mlp": 0.01270681, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01257163, + "epoch": 0.46932211032616866, + "flos": 29504177032320.0, + "grad_norm": 1.522976757050157, + "language_loss": 0.83108258, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.90822744, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.13519287, + "step": 7806, + "time_per_iteration": 2.6634225845336914 + }, + { + "auxiliary_loss_clip": 0.06437123, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01253496, + "epoch": 0.4693822335788366, + "flos": 36220202760960.0, + "grad_norm": 1.6923542734381007, + "language_loss": 0.77444482, + "learning_rate": 2.294333744076472e-06, + "loss": 0.8514812, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.13006592, + "step": 7807, + "time_per_iteration": 4.0442986488342285 + }, + { + "auxiliary_loss_clip": 0.06438392, + "auxiliary_loss_mlp": 0.01270643, + "balance_loss_clip": 0.06276641, + "balance_loss_mlp": 0.01257024, + "epoch": 0.4694423568315046, + "flos": 20345124061440.0, + "grad_norm": 1.7839407979100135, + "language_loss": 0.51769608, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.59478641, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13635254, + "step": 7808, + "time_per_iteration": 2.4910712242126465 + }, + { + "auxiliary_loss_clip": 0.06328695, + "auxiliary_loss_mlp": 0.01252926, + "balance_loss_clip": 0.06259091, + "balance_loss_mlp": 0.01250451, + "epoch": 0.46950248008417256, + "flos": 64343540033280.0, + "grad_norm": 0.7688077124363479, + "language_loss": 0.57691324, + "learning_rate": 2.293563279578978e-06, + "loss": 0.65272945, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.0247345, + "step": 7809, + "time_per_iteration": 3.055589199066162 + }, + { + "auxiliary_loss_clip": 0.06439595, + "auxiliary_loss_mlp": 0.01268316, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01254845, + "epoch": 0.4695626033368405, + "flos": 19204755120000.0, + "grad_norm": 2.3576337237105425, + "language_loss": 0.71649069, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7935698, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13470459, + "step": 7810, + "time_per_iteration": 2.5001537799835205 + }, + { + "auxiliary_loss_clip": 0.06435918, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01259113, + "epoch": 0.4696227265895085, + "flos": 23009027466240.0, + "grad_norm": 3.6880824309964617, + "language_loss": 0.81146425, + "learning_rate": 2.29279277055369e-06, + "loss": 0.88855195, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.13726807, + "step": 7811, + "time_per_iteration": 2.5971217155456543 + }, + { + "auxiliary_loss_clip": 0.06437828, + "auxiliary_loss_mlp": 0.01267753, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.0125405, + "epoch": 0.46968284984217645, + "flos": 21877169713920.0, + "grad_norm": 1.5426371434141024, + "language_loss": 0.80606401, + "learning_rate": 2.292407499379644e-06, + "loss": 0.88311982, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13708496, + "step": 7812, + "time_per_iteration": 2.5140600204467773 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01271707, + "balance_loss_clip": 0.06277305, + "balance_loss_mlp": 0.01258445, + "epoch": 0.4697429730948444, + "flos": 19981217779200.0, + "grad_norm": 1.702985157553907, + "language_loss": 0.74653876, + "learning_rate": 2.292022217117477e-06, + "loss": 0.82360852, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13256836, + "step": 7813, + "time_per_iteration": 2.530773401260376 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01270357, + "balance_loss_clip": 0.06279637, + "balance_loss_mlp": 0.01256755, + "epoch": 0.4698030963475124, + "flos": 15161185912320.0, + "grad_norm": 2.103167897479233, + "language_loss": 0.84843278, + "learning_rate": 2.291636923781798e-06, + "loss": 0.92552245, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13604736, + "step": 7814, + "time_per_iteration": 2.550631046295166 + }, + { + "auxiliary_loss_clip": 0.06432581, + "auxiliary_loss_mlp": 0.01265742, + "balance_loss_clip": 0.06276342, + "balance_loss_mlp": 0.01252856, + "epoch": 0.46986321960018035, + "flos": 15155316126720.0, + "grad_norm": 2.71974016097947, + "language_loss": 0.82219559, + "learning_rate": 2.291251619387217e-06, + "loss": 0.89917886, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12896729, + "step": 7815, + "time_per_iteration": 2.508582592010498 + }, + { + "auxiliary_loss_clip": 0.06434117, + "auxiliary_loss_mlp": 0.01273411, + "balance_loss_clip": 0.06275953, + "balance_loss_mlp": 0.01259952, + "epoch": 0.4699233428528483, + "flos": 23115021281280.0, + "grad_norm": 2.356408218131492, + "language_loss": 0.77761489, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.85469019, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13452148, + "step": 7816, + "time_per_iteration": 2.505244493484497 + }, + { + "auxiliary_loss_clip": 0.06334539, + "auxiliary_loss_mlp": 0.01254323, + "balance_loss_clip": 0.06264929, + "balance_loss_mlp": 0.01251993, + "epoch": 0.46998346610551633, + "flos": 68126917985280.0, + "grad_norm": 0.8142436419344395, + "language_loss": 0.58616334, + "learning_rate": 2.290480977479796e-06, + "loss": 0.66205192, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02325439, + "step": 7817, + "time_per_iteration": 3.1171398162841797 + }, + { + "auxiliary_loss_clip": 0.0643587, + "auxiliary_loss_mlp": 0.01268626, + "balance_loss_clip": 0.06280724, + "balance_loss_mlp": 0.01255119, + "epoch": 0.4700435893581843, + "flos": 24135560484480.0, + "grad_norm": 1.6087842481989176, + "language_loss": 0.7922467, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8692916, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13513184, + "step": 7818, + "time_per_iteration": 2.5133657455444336 + }, + { + "auxiliary_loss_clip": 0.06435841, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06278426, + "balance_loss_mlp": 0.01257279, + "epoch": 0.47010371261085226, + "flos": 20155624053120.0, + "grad_norm": 1.9598217577618973, + "language_loss": 0.83629054, + "learning_rate": 2.289710291512104e-06, + "loss": 0.91334999, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12841797, + "step": 7819, + "time_per_iteration": 2.512434482574463 + }, + { + "auxiliary_loss_clip": 0.06440641, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01253519, + "epoch": 0.47016383586352023, + "flos": 15127587624960.0, + "grad_norm": 1.951811924314391, + "language_loss": 0.76718354, + "learning_rate": 2.289324932042186e-06, + "loss": 0.84427238, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1472168, + "step": 7820, + "time_per_iteration": 2.4596121311187744 + }, + { + "auxiliary_loss_clip": 0.06434815, + "auxiliary_loss_mlp": 0.01270743, + "balance_loss_clip": 0.06279559, + "balance_loss_mlp": 0.01257636, + "epoch": 0.4702239591161882, + "flos": 13558044470400.0, + "grad_norm": 1.9648943700675503, + "language_loss": 0.74081844, + "learning_rate": 2.288939561601039e-06, + "loss": 0.81787401, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13116455, + "step": 7821, + "time_per_iteration": 2.4793312549591064 + }, + { + "auxiliary_loss_clip": 0.06431578, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06276228, + "balance_loss_mlp": 0.01256658, + "epoch": 0.47028408236885616, + "flos": 24282825235200.0, + "grad_norm": 1.6413236035832721, + "language_loss": 0.89491117, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.97191548, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12207031, + "step": 7822, + "time_per_iteration": 2.5880398750305176 + }, + { + "auxiliary_loss_clip": 0.06433522, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06277143, + "balance_loss_mlp": 0.01254062, + "epoch": 0.4703442056215241, + "flos": 22863565578240.0, + "grad_norm": 1.438932852866735, + "language_loss": 0.79699898, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.87399733, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12255859, + "step": 7823, + "time_per_iteration": 2.5661919116973877 + }, + { + "auxiliary_loss_clip": 0.06324597, + "auxiliary_loss_mlp": 0.01253174, + "balance_loss_clip": 0.06255165, + "balance_loss_mlp": 0.01250784, + "epoch": 0.4704043288741921, + "flos": 69262381463040.0, + "grad_norm": 0.6854102840454825, + "language_loss": 0.56514406, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.64092177, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02386475, + "step": 7824, + "time_per_iteration": 3.223728656768799 + }, + { + "auxiliary_loss_clip": 0.06442541, + "auxiliary_loss_mlp": 0.01269654, + "balance_loss_clip": 0.06281068, + "balance_loss_mlp": 0.01255837, + "epoch": 0.47046445212686006, + "flos": 18046971728640.0, + "grad_norm": 1.8116047863427858, + "language_loss": 0.81242847, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.88955039, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13818359, + "step": 7825, + "time_per_iteration": 2.4815890789031982 + }, + { + "auxiliary_loss_clip": 0.06441189, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01257443, + "epoch": 0.470524575379528, + "flos": 23958261244800.0, + "grad_norm": 2.19673184020816, + "language_loss": 0.67126369, + "learning_rate": 2.287012545338324e-06, + "loss": 0.74838167, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1317749, + "step": 7826, + "time_per_iteration": 2.5820834636688232 + }, + { + "auxiliary_loss_clip": 0.06443623, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06281798, + "balance_loss_mlp": 0.01254824, + "epoch": 0.470584698632196, + "flos": 18119367256320.0, + "grad_norm": 1.7021383964965269, + "language_loss": 0.8395251, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.91664219, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13250732, + "step": 7827, + "time_per_iteration": 2.4966769218444824 + }, + { + "auxiliary_loss_clip": 0.06333943, + "auxiliary_loss_mlp": 0.01253247, + "balance_loss_clip": 0.06264865, + "balance_loss_mlp": 0.01250913, + "epoch": 0.47064482188486395, + "flos": 57268555413120.0, + "grad_norm": 0.786622619089935, + "language_loss": 0.55656797, + "learning_rate": 2.286241662546122e-06, + "loss": 0.63243991, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02328491, + "step": 7828, + "time_per_iteration": 3.1594009399414062 + }, + { + "auxiliary_loss_clip": 0.06439656, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01254743, + "epoch": 0.4707049451375319, + "flos": 17900922862080.0, + "grad_norm": 1.8377127056601934, + "language_loss": 0.80904895, + "learning_rate": 2.285856204861245e-06, + "loss": 0.88612556, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13256836, + "step": 7829, + "time_per_iteration": 2.485140800476074 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_clip": 0.06279843, + "balance_loss_mlp": 0.0126024, + "epoch": 0.47076506839019994, + "flos": 25240402494720.0, + "grad_norm": 1.2696703606336757, + "language_loss": 0.76018727, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.83728784, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.12359619, + "step": 7830, + "time_per_iteration": 2.6114325523376465 + }, + { + "auxiliary_loss_clip": 0.06438384, + "auxiliary_loss_mlp": 0.01269492, + "balance_loss_clip": 0.06283822, + "balance_loss_mlp": 0.01257016, + "epoch": 0.4708251916428679, + "flos": 13484684620800.0, + "grad_norm": 2.037519777934202, + "language_loss": 0.78570348, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.86278224, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12463379, + "step": 7831, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.06447008, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01255365, + "epoch": 0.47088531489553587, + "flos": 30154646678400.0, + "grad_norm": 1.667499960909574, + "language_loss": 0.7574442, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.83460832, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.140625, + "step": 7832, + "time_per_iteration": 2.6298487186431885 + }, + { + "auxiliary_loss_clip": 0.06434175, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.0627791, + "balance_loss_mlp": 0.01256844, + "epoch": 0.47094543814820383, + "flos": 21804648405120.0, + "grad_norm": 1.2855995862723888, + "language_loss": 0.74791807, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.82493854, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1104126, + "step": 7833, + "time_per_iteration": 2.5464203357696533 + }, + { + "auxiliary_loss_clip": 0.06437977, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06281009, + "balance_loss_mlp": 0.01254118, + "epoch": 0.4710055614008718, + "flos": 23009698298880.0, + "grad_norm": 1.569702279619268, + "language_loss": 0.76145566, + "learning_rate": 2.283928754133762e-06, + "loss": 0.83849978, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12329102, + "step": 7834, + "time_per_iteration": 2.6125214099884033 + }, + { + "auxiliary_loss_clip": 0.06433094, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 0.06278115, + "balance_loss_mlp": 0.01254078, + "epoch": 0.47106568465353976, + "flos": 42751256601600.0, + "grad_norm": 1.4292072421609816, + "language_loss": 0.66957295, + "learning_rate": 2.283543231629972e-06, + "loss": 0.74656606, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12158203, + "step": 7835, + "time_per_iteration": 5.518744707107544 + }, + { + "auxiliary_loss_clip": 0.06330478, + "auxiliary_loss_mlp": 0.01256395, + "balance_loss_clip": 0.06261497, + "balance_loss_mlp": 0.01253791, + "epoch": 0.4711258079062077, + "flos": 68571116807040.0, + "grad_norm": 0.853960187866431, + "language_loss": 0.62259066, + "learning_rate": 2.283157698374194e-06, + "loss": 0.69845939, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.02604675, + "step": 7836, + "time_per_iteration": 3.1000564098358154 + }, + { + "auxiliary_loss_clip": 0.06439401, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254006, + "epoch": 0.4711859311588757, + "flos": 25453522154880.0, + "grad_norm": 1.6974399997165228, + "language_loss": 0.69606686, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.7731331, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13238525, + "step": 7837, + "time_per_iteration": 2.5282108783721924 + }, + { + "auxiliary_loss_clip": 0.06437849, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01255061, + "epoch": 0.47124605441154366, + "flos": 21988488263040.0, + "grad_norm": 1.9658270715858404, + "language_loss": 0.66562694, + "learning_rate": 2.282386599665153e-06, + "loss": 0.74268925, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13311768, + "step": 7838, + "time_per_iteration": 2.5846638679504395 + }, + { + "auxiliary_loss_clip": 0.06440166, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255082, + "epoch": 0.4713061776642116, + "flos": 25420049648640.0, + "grad_norm": 5.850528361960432, + "language_loss": 0.77699667, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.85408199, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1328125, + "step": 7839, + "time_per_iteration": 2.5414958000183105 + }, + { + "auxiliary_loss_clip": 0.06429788, + "auxiliary_loss_mlp": 0.01268311, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.0125592, + "epoch": 0.4713663009168796, + "flos": 26549559486720.0, + "grad_norm": 2.242315176037199, + "language_loss": 0.73086643, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.80784744, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12390137, + "step": 7840, + "time_per_iteration": 2.5519280433654785 + }, + { + "auxiliary_loss_clip": 0.06431505, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01251562, + "epoch": 0.47142642416954755, + "flos": 23630426945280.0, + "grad_norm": 1.566587637557085, + "language_loss": 0.75317335, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.83012575, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1217041, + "step": 7841, + "time_per_iteration": 2.552835702896118 + }, + { + "auxiliary_loss_clip": 0.06436779, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06277694, + "balance_loss_mlp": 0.01252947, + "epoch": 0.4714865474222155, + "flos": 22316783760000.0, + "grad_norm": 1.5550986710562988, + "language_loss": 0.70513815, + "learning_rate": 2.280844273866501e-06, + "loss": 0.78216577, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13049316, + "step": 7842, + "time_per_iteration": 3.933955192565918 + }, + { + "auxiliary_loss_clip": 0.06436103, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255891, + "epoch": 0.4715466706748835, + "flos": 17828317699200.0, + "grad_norm": 1.9804632158033957, + "language_loss": 0.79634649, + "learning_rate": 2.280458665756177e-06, + "loss": 0.87339324, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12677002, + "step": 7843, + "time_per_iteration": 2.4907753467559814 + }, + { + "auxiliary_loss_clip": 0.06434722, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.0125301, + "epoch": 0.4716067939275515, + "flos": 23666289292800.0, + "grad_norm": 1.6302002599700955, + "language_loss": 0.74402809, + "learning_rate": 2.280073047010832e-06, + "loss": 0.82102847, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12298584, + "step": 7844, + "time_per_iteration": 2.5746476650238037 + }, + { + "auxiliary_loss_clip": 0.06436022, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.0627865, + "balance_loss_mlp": 0.01257138, + "epoch": 0.47166691718021947, + "flos": 17935778960640.0, + "grad_norm": 2.158450508091108, + "language_loss": 0.78678179, + "learning_rate": 2.279687417645088e-06, + "loss": 0.86384571, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13238525, + "step": 7845, + "time_per_iteration": 2.4827558994293213 + }, + { + "auxiliary_loss_clip": 0.06430048, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06273912, + "balance_loss_mlp": 0.01254991, + "epoch": 0.47172704043288743, + "flos": 26621787306240.0, + "grad_norm": 1.2653259456946966, + "language_loss": 0.73458219, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.81154698, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.11450195, + "step": 7846, + "time_per_iteration": 2.586641550064087 + }, + { + "auxiliary_loss_clip": 0.06430165, + "auxiliary_loss_mlp": 0.01268985, + "balance_loss_clip": 0.06277196, + "balance_loss_mlp": 0.01256754, + "epoch": 0.4717871636855554, + "flos": 27929225289600.0, + "grad_norm": 1.2918573904220954, + "language_loss": 0.74434412, + "learning_rate": 2.2789161271109e-06, + "loss": 0.82133555, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12243652, + "step": 7847, + "time_per_iteration": 3.984661817550659 + }, + { + "auxiliary_loss_clip": 0.06434786, + "auxiliary_loss_mlp": 0.0126996, + "balance_loss_clip": 0.06276622, + "balance_loss_mlp": 0.01258123, + "epoch": 0.47184728693822336, + "flos": 14507571738240.0, + "grad_norm": 1.68455833448323, + "language_loss": 0.81004, + "learning_rate": 2.278530465971703e-06, + "loss": 0.88708746, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1184082, + "step": 7848, + "time_per_iteration": 2.482759714126587 + }, + { + "auxiliary_loss_clip": 0.06438575, + "auxiliary_loss_mlp": 0.01265775, + "balance_loss_clip": 0.06279046, + "balance_loss_mlp": 0.01252394, + "epoch": 0.47190741019089133, + "flos": 17862041767680.0, + "grad_norm": 1.8089027190058555, + "language_loss": 0.70106918, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.77811265, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1338501, + "step": 7849, + "time_per_iteration": 2.5101277828216553 + }, + { + "auxiliary_loss_clip": 0.06444675, + "auxiliary_loss_mlp": 0.01269385, + "balance_loss_clip": 0.06280467, + "balance_loss_mlp": 0.0125539, + "epoch": 0.4719675334435593, + "flos": 17901384059520.0, + "grad_norm": 1.915736246727948, + "language_loss": 0.69964916, + "learning_rate": 2.277759112022224e-06, + "loss": 0.77678978, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.14001465, + "step": 7850, + "time_per_iteration": 2.46455979347229 + }, + { + "auxiliary_loss_clip": 0.06441706, + "auxiliary_loss_mlp": 0.01269243, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01255951, + "epoch": 0.47202765669622726, + "flos": 20710665498240.0, + "grad_norm": 1.953909301983903, + "language_loss": 0.75806379, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.83517331, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.13305664, + "step": 7851, + "time_per_iteration": 2.5298452377319336 + }, + { + "auxiliary_loss_clip": 0.06440549, + "auxiliary_loss_mlp": 0.01271731, + "balance_loss_clip": 0.06277989, + "balance_loss_mlp": 0.01257534, + "epoch": 0.4720877799488952, + "flos": 16365439192320.0, + "grad_norm": 1.905541371588542, + "language_loss": 0.76767981, + "learning_rate": 2.276987715942132e-06, + "loss": 0.84480262, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.14196777, + "step": 7852, + "time_per_iteration": 2.473349094390869 + }, + { + "auxiliary_loss_clip": 0.06431545, + "auxiliary_loss_mlp": 0.01270384, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01257742, + "epoch": 0.4721479032015632, + "flos": 20674509661440.0, + "grad_norm": 2.394869083314355, + "language_loss": 0.69452804, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.77154732, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12658691, + "step": 7853, + "time_per_iteration": 2.537550210952759 + }, + { + "auxiliary_loss_clip": 0.06333929, + "auxiliary_loss_mlp": 0.01250651, + "balance_loss_clip": 0.06264801, + "balance_loss_mlp": 0.01248457, + "epoch": 0.47220802645423116, + "flos": 67773367681920.0, + "grad_norm": 0.6896509796832918, + "language_loss": 0.50247812, + "learning_rate": 2.276216277848432e-06, + "loss": 0.57832396, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.02197266, + "step": 7854, + "time_per_iteration": 3.2550642490386963 + }, + { + "auxiliary_loss_clip": 0.06436136, + "auxiliary_loss_mlp": 0.0126914, + "balance_loss_clip": 0.06276229, + "balance_loss_mlp": 0.0125583, + "epoch": 0.4722681497068991, + "flos": 20927474737920.0, + "grad_norm": 1.8228483302344913, + "language_loss": 0.63672256, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.71377528, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13317871, + "step": 7855, + "time_per_iteration": 2.5252599716186523 + }, + { + "auxiliary_loss_clip": 0.06439453, + "auxiliary_loss_mlp": 0.01268333, + "balance_loss_clip": 0.06280654, + "balance_loss_mlp": 0.01255715, + "epoch": 0.4723282729595671, + "flos": 28300594584960.0, + "grad_norm": 1.8174966086465816, + "language_loss": 0.76136196, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.83843982, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1262207, + "step": 7856, + "time_per_iteration": 2.560236692428589 + }, + { + "auxiliary_loss_clip": 0.06436295, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06279726, + "balance_loss_mlp": 0.01258284, + "epoch": 0.4723883962122351, + "flos": 27132287506560.0, + "grad_norm": 1.7138943667728106, + "language_loss": 0.750875, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.8279379, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 7857, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06436294, + "auxiliary_loss_mlp": 0.01270819, + "balance_loss_clip": 0.0628143, + "balance_loss_mlp": 0.01258946, + "epoch": 0.47244851946490307, + "flos": 31544794241280.0, + "grad_norm": 1.4694813046790665, + "language_loss": 0.64839488, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.72546607, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11889648, + "step": 7858, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.06431169, + "auxiliary_loss_mlp": 0.01271908, + "balance_loss_clip": 0.06278542, + "balance_loss_mlp": 0.01259719, + "epoch": 0.47250864271757104, + "flos": 20892828274560.0, + "grad_norm": 1.741748713475879, + "language_loss": 0.71104157, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.78807235, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12194824, + "step": 7859, + "time_per_iteration": 2.541404962539673 + }, + { + "auxiliary_loss_clip": 0.06440333, + "auxiliary_loss_mlp": 0.01270209, + "balance_loss_clip": 0.06277637, + "balance_loss_mlp": 0.01257776, + "epoch": 0.472568765970239, + "flos": 20528376940800.0, + "grad_norm": 1.7364161900477437, + "language_loss": 0.62341475, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.70052016, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.12426758, + "step": 7860, + "time_per_iteration": 2.5165910720825195 + }, + { + "auxiliary_loss_clip": 0.06438711, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06280093, + "balance_loss_mlp": 0.01254914, + "epoch": 0.47262888922290697, + "flos": 35813306534400.0, + "grad_norm": 2.092826385669962, + "language_loss": 0.72540921, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.80247205, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.12658691, + "step": 7861, + "time_per_iteration": 2.6575915813446045 + }, + { + "auxiliary_loss_clip": 0.06439754, + "auxiliary_loss_mlp": 0.01268288, + "balance_loss_clip": 0.0628088, + "balance_loss_mlp": 0.01254734, + "epoch": 0.47268901247557493, + "flos": 20674006536960.0, + "grad_norm": 2.2960282018232965, + "language_loss": 0.85134012, + "learning_rate": 2.273130107677896e-06, + "loss": 0.92842054, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.13555908, + "step": 7862, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.06443156, + "auxiliary_loss_mlp": 0.01269094, + "balance_loss_clip": 0.06283151, + "balance_loss_mlp": 0.012566, + "epoch": 0.4727491357282429, + "flos": 19579394724480.0, + "grad_norm": 1.7759944267926648, + "language_loss": 0.84885079, + "learning_rate": 2.272744289645927e-06, + "loss": 0.92597324, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12506104, + "step": 7863, + "time_per_iteration": 2.545445442199707 + }, + { + "auxiliary_loss_clip": 0.06435807, + "auxiliary_loss_mlp": 0.01268812, + "balance_loss_clip": 0.06279373, + "balance_loss_mlp": 0.01256873, + "epoch": 0.47280925898091086, + "flos": 18222090762240.0, + "grad_norm": 1.953539417417106, + "language_loss": 0.6582734, + "learning_rate": 2.272358461271467e-06, + "loss": 0.73531955, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11950684, + "step": 7864, + "time_per_iteration": 2.4730403423309326 + }, + { + "auxiliary_loss_clip": 0.06438613, + "auxiliary_loss_mlp": 0.01269576, + "balance_loss_clip": 0.06280264, + "balance_loss_mlp": 0.01257619, + "epoch": 0.4728693822335788, + "flos": 17827604939520.0, + "grad_norm": 1.945688521953863, + "language_loss": 0.65635985, + "learning_rate": 2.271972622569147e-06, + "loss": 0.73344177, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.11962891, + "step": 7865, + "time_per_iteration": 2.498135805130005 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.01270111, + "balance_loss_clip": 0.06277367, + "balance_loss_mlp": 0.01257671, + "epoch": 0.4729295054862468, + "flos": 20601359447040.0, + "grad_norm": 2.5713138482446234, + "language_loss": 0.73970878, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.81671345, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12445068, + "step": 7866, + "time_per_iteration": 2.495232582092285 + }, + { + "auxiliary_loss_clip": 0.06437797, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254347, + "epoch": 0.47298962873891476, + "flos": 23374862392320.0, + "grad_norm": 2.8570557032751522, + "language_loss": 0.83387589, + "learning_rate": 2.271200914239451e-06, + "loss": 0.91091311, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.11578369, + "step": 7867, + "time_per_iteration": 2.565706968307495 + }, + { + "auxiliary_loss_clip": 0.06430209, + "auxiliary_loss_mlp": 0.01265413, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.01253391, + "epoch": 0.4730497519915827, + "flos": 22058410095360.0, + "grad_norm": 1.6535025871822049, + "language_loss": 0.79521739, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.87217355, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12036133, + "step": 7868, + "time_per_iteration": 2.549220561981201 + }, + { + "auxiliary_loss_clip": 0.06442262, + "auxiliary_loss_mlp": 0.01268103, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01255169, + "epoch": 0.4731098752442507, + "flos": 21076165008000.0, + "grad_norm": 1.8227151972017304, + "language_loss": 0.75178695, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.82889056, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.12945557, + "step": 7869, + "time_per_iteration": 2.5188441276550293 + }, + { + "auxiliary_loss_clip": 0.06441551, + "auxiliary_loss_mlp": 0.01271574, + "balance_loss_clip": 0.06282122, + "balance_loss_mlp": 0.01258014, + "epoch": 0.4731699984969187, + "flos": 22535395862400.0, + "grad_norm": 1.4513841331120019, + "language_loss": 0.73749697, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.81462824, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13555908, + "step": 7870, + "time_per_iteration": 2.520761251449585 + }, + { + "auxiliary_loss_clip": 0.0644481, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06280311, + "balance_loss_mlp": 0.01259231, + "epoch": 0.4732301217495867, + "flos": 24904769765760.0, + "grad_norm": 1.9907019842809281, + "language_loss": 0.81971508, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.89689231, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13684082, + "step": 7871, + "time_per_iteration": 2.7390120029449463 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06278443, + "balance_loss_mlp": 0.01254261, + "epoch": 0.47329024500225464, + "flos": 22791128123520.0, + "grad_norm": 1.7255093919697873, + "language_loss": 0.76232624, + "learning_rate": 2.269271463701879e-06, + "loss": 0.8393662, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13189697, + "step": 7872, + "time_per_iteration": 2.6356093883514404 + }, + { + "auxiliary_loss_clip": 0.06438267, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06279084, + "balance_loss_mlp": 0.01256847, + "epoch": 0.4733503682549226, + "flos": 38705884531200.0, + "grad_norm": 1.877318740282883, + "language_loss": 0.67809367, + "learning_rate": 2.268885542903428e-06, + "loss": 0.75517869, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1338501, + "step": 7873, + "time_per_iteration": 2.7092511653900146 + }, + { + "auxiliary_loss_clip": 0.06434255, + "auxiliary_loss_mlp": 0.01269292, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.0125699, + "epoch": 0.47341049150759057, + "flos": 22973584389120.0, + "grad_norm": 1.442307420398724, + "language_loss": 0.72792107, + "learning_rate": 2.26849961190881e-06, + "loss": 0.80495656, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12298584, + "step": 7874, + "time_per_iteration": 3.9462826251983643 + }, + { + "auxiliary_loss_clip": 0.06440391, + "auxiliary_loss_mlp": 0.01271103, + "balance_loss_clip": 0.06281446, + "balance_loss_mlp": 0.01258431, + "epoch": 0.47347061476025853, + "flos": 14543769502080.0, + "grad_norm": 2.253933500743018, + "language_loss": 0.65938866, + "learning_rate": 2.26811367073266e-06, + "loss": 0.7365036, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.12658691, + "step": 7875, + "time_per_iteration": 4.013593435287476 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01267762, + "balance_loss_clip": 0.06284615, + "balance_loss_mlp": 0.01254571, + "epoch": 0.4735307380129265, + "flos": 30271080326400.0, + "grad_norm": 2.373261357507393, + "language_loss": 0.80868709, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.88579601, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13183594, + "step": 7876, + "time_per_iteration": 2.577624797821045 + }, + { + "auxiliary_loss_clip": 0.06439028, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.0628099, + "balance_loss_mlp": 0.0125583, + "epoch": 0.47359086126559446, + "flos": 19397148094080.0, + "grad_norm": 1.7113236821341018, + "language_loss": 0.792979, + "learning_rate": 2.267341757894304e-06, + "loss": 0.87005162, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12402344, + "step": 7877, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.06431633, + "auxiliary_loss_mlp": 0.01269276, + "balance_loss_clip": 0.0627646, + "balance_loss_mlp": 0.01256938, + "epoch": 0.47365098451826243, + "flos": 21944995194240.0, + "grad_norm": 1.9478135029908927, + "language_loss": 0.70673579, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.78374487, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12335205, + "step": 7878, + "time_per_iteration": 2.5023298263549805 + }, + { + "auxiliary_loss_clip": 0.06432398, + "auxiliary_loss_mlp": 0.01268548, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01256382, + "epoch": 0.4737111077709304, + "flos": 25851571776000.0, + "grad_norm": 1.6314467446120229, + "language_loss": 0.75137293, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.82838243, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1217041, + "step": 7879, + "time_per_iteration": 2.623811960220337 + }, + { + "auxiliary_loss_clip": 0.06320075, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06251323, + "balance_loss_mlp": 0.01262992, + "epoch": 0.47377123102359836, + "flos": 67779461831040.0, + "grad_norm": 0.7167002771941348, + "language_loss": 0.6131798, + "learning_rate": 2.266183812641164e-06, + "loss": 0.68903732, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02690125, + "step": 7880, + "time_per_iteration": 3.159388303756714 + }, + { + "auxiliary_loss_clip": 0.06434937, + "auxiliary_loss_mlp": 0.01268898, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256035, + "epoch": 0.4738313542762663, + "flos": 24322796432640.0, + "grad_norm": 1.5964233369580554, + "language_loss": 0.68369412, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.76073253, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12866211, + "step": 7881, + "time_per_iteration": 4.010294198989868 + }, + { + "auxiliary_loss_clip": 0.06434233, + "auxiliary_loss_mlp": 0.01267509, + "balance_loss_clip": 0.06279774, + "balance_loss_mlp": 0.01255964, + "epoch": 0.4738914775289343, + "flos": 20711797528320.0, + "grad_norm": 1.8204307046333812, + "language_loss": 0.77692872, + "learning_rate": 2.265411798646092e-06, + "loss": 0.85394609, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11541748, + "step": 7882, + "time_per_iteration": 2.5205814838409424 + }, + { + "auxiliary_loss_clip": 0.06437336, + "auxiliary_loss_mlp": 0.01269511, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.01257208, + "epoch": 0.4739516007816023, + "flos": 25453228665600.0, + "grad_norm": 1.3763225621826927, + "language_loss": 0.76357329, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.84064174, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12304688, + "step": 7883, + "time_per_iteration": 2.5500354766845703 + }, + { + "auxiliary_loss_clip": 0.0643235, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06278035, + "balance_loss_mlp": 0.01255101, + "epoch": 0.4740117240342703, + "flos": 19980463092480.0, + "grad_norm": 1.6935272320670107, + "language_loss": 0.72225314, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.79924023, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1126709, + "step": 7884, + "time_per_iteration": 2.5347273349761963 + }, + { + "auxiliary_loss_clip": 0.06443354, + "auxiliary_loss_mlp": 0.01266451, + "balance_loss_clip": 0.06279097, + "balance_loss_mlp": 0.01252944, + "epoch": 0.47407184728693824, + "flos": 15665229348480.0, + "grad_norm": 2.6351569696409314, + "language_loss": 0.82340348, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.90050149, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.13513184, + "step": 7885, + "time_per_iteration": 2.482201099395752 + }, + { + "auxiliary_loss_clip": 0.06433931, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.01259262, + "epoch": 0.4741319705396062, + "flos": 18594843649920.0, + "grad_norm": 1.913533031103811, + "language_loss": 0.7349298, + "learning_rate": 2.263867649999751e-06, + "loss": 0.81198001, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.11816406, + "step": 7886, + "time_per_iteration": 3.95589017868042 + }, + { + "auxiliary_loss_clip": 0.06445764, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.0628106, + "balance_loss_mlp": 0.01256655, + "epoch": 0.47419209379227417, + "flos": 13266114445440.0, + "grad_norm": 1.8957247676006206, + "language_loss": 0.74131465, + "learning_rate": 2.263481587786849e-06, + "loss": 0.81846249, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.12384033, + "step": 7887, + "time_per_iteration": 2.558175563812256 + }, + { + "auxiliary_loss_clip": 0.06431396, + "auxiliary_loss_mlp": 0.01269479, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01257499, + "epoch": 0.47425221704494214, + "flos": 20049630238080.0, + "grad_norm": 2.0468025330010016, + "language_loss": 0.7742272, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.85123587, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11987305, + "step": 7888, + "time_per_iteration": 2.5532913208007812 + }, + { + "auxiliary_loss_clip": 0.06440586, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06282103, + "balance_loss_mlp": 0.01255978, + "epoch": 0.4743123402976101, + "flos": 27279300695040.0, + "grad_norm": 1.7248476258859713, + "language_loss": 0.72833514, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1227417, + "step": 7889, + "time_per_iteration": 2.635697603225708 + }, + { + "auxiliary_loss_clip": 0.06323753, + "auxiliary_loss_mlp": 0.01252671, + "balance_loss_clip": 0.0625556, + "balance_loss_mlp": 0.01250217, + "epoch": 0.47437246355027807, + "flos": 55410771813120.0, + "grad_norm": 0.6980000025852627, + "language_loss": 0.55692458, + "learning_rate": 2.262323341259214e-06, + "loss": 0.63268882, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.02452087, + "step": 7890, + "time_per_iteration": 3.196005344390869 + }, + { + "auxiliary_loss_clip": 0.06440383, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.06280889, + "balance_loss_mlp": 0.01255929, + "epoch": 0.47443258680294603, + "flos": 23885278738560.0, + "grad_norm": 1.7863596191541609, + "language_loss": 0.65755105, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.73464775, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13366699, + "step": 7891, + "time_per_iteration": 2.5535497665405273 + }, + { + "auxiliary_loss_clip": 0.06448144, + "auxiliary_loss_mlp": 0.01270649, + "balance_loss_clip": 0.06284909, + "balance_loss_mlp": 0.01256892, + "epoch": 0.474492710055614, + "flos": 21983666653440.0, + "grad_norm": 2.0785188787991133, + "language_loss": 0.70081401, + "learning_rate": 2.26155112714642e-06, + "loss": 0.77800196, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.13757324, + "step": 7892, + "time_per_iteration": 2.512953519821167 + }, + { + "auxiliary_loss_clip": 0.06322581, + "auxiliary_loss_mlp": 0.01253797, + "balance_loss_clip": 0.06254438, + "balance_loss_mlp": 0.01251454, + "epoch": 0.47455283330828196, + "flos": 62577186837120.0, + "grad_norm": 0.7954751994073583, + "language_loss": 0.58515328, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.66091704, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.02337646, + "step": 7893, + "time_per_iteration": 3.2652807235717773 + }, + { + "auxiliary_loss_clip": 0.06435462, + "auxiliary_loss_mlp": 0.01271377, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01259498, + "epoch": 0.47461295656094993, + "flos": 12098478199680.0, + "grad_norm": 1.6548256161788057, + "language_loss": 0.77515912, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.85222745, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.11871338, + "step": 7894, + "time_per_iteration": 2.4962351322174072 + }, + { + "auxiliary_loss_clip": 0.06436545, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06277076, + "balance_loss_mlp": 0.01254883, + "epoch": 0.4746730798136179, + "flos": 20890522287360.0, + "grad_norm": 1.8932038979458137, + "language_loss": 0.75310624, + "learning_rate": 2.260392731628497e-06, + "loss": 0.83014762, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.1270752, + "step": 7895, + "time_per_iteration": 2.536651611328125 + }, + { + "auxiliary_loss_clip": 0.06438908, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06280944, + "balance_loss_mlp": 0.012559, + "epoch": 0.4747332030662859, + "flos": 19981008144000.0, + "grad_norm": 1.9186877339725528, + "language_loss": 0.824898, + "learning_rate": 2.260006580021429e-06, + "loss": 0.90196961, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12341309, + "step": 7896, + "time_per_iteration": 2.5451180934906006 + }, + { + "auxiliary_loss_clip": 0.06438936, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06281327, + "balance_loss_mlp": 0.01254964, + "epoch": 0.4747933263189539, + "flos": 16039701244800.0, + "grad_norm": 4.910262672985542, + "language_loss": 0.76465023, + "learning_rate": 2.259620418554886e-06, + "loss": 0.84171617, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12689209, + "step": 7897, + "time_per_iteration": 2.529157876968384 + }, + { + "auxiliary_loss_clip": 0.06443989, + "auxiliary_loss_mlp": 0.012709, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257376, + "epoch": 0.47485344957162184, + "flos": 13960370649600.0, + "grad_norm": 1.9701771451271233, + "language_loss": 0.64411497, + "learning_rate": 2.25923424724351e-06, + "loss": 0.72126389, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.13519287, + "step": 7898, + "time_per_iteration": 2.4861059188842773 + }, + { + "auxiliary_loss_clip": 0.06443477, + "auxiliary_loss_mlp": 0.01269988, + "balance_loss_clip": 0.0628337, + "balance_loss_mlp": 0.01256774, + "epoch": 0.4749135728242898, + "flos": 20455352507520.0, + "grad_norm": 2.55946780946792, + "language_loss": 0.70317411, + "learning_rate": 2.258848066101946e-06, + "loss": 0.78030878, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 7899, + "time_per_iteration": 2.5035181045532227 + }, + { + "auxiliary_loss_clip": 0.06438522, + "auxiliary_loss_mlp": 0.0127023, + "balance_loss_clip": 0.06280558, + "balance_loss_mlp": 0.01257701, + "epoch": 0.4749736960769578, + "flos": 28957604849280.0, + "grad_norm": 1.797290129910965, + "language_loss": 0.68821597, + "learning_rate": 2.258461875144837e-06, + "loss": 0.76530349, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12536621, + "step": 7900, + "time_per_iteration": 2.638021469116211 + }, + { + "auxiliary_loss_clip": 0.06435557, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06277159, + "balance_loss_mlp": 0.01254216, + "epoch": 0.47503381932962574, + "flos": 31946407660800.0, + "grad_norm": 2.027602507157595, + "language_loss": 0.70583236, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.78287518, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.14501953, + "step": 7901, + "time_per_iteration": 2.6210362911224365 + }, + { + "auxiliary_loss_clip": 0.06438562, + "auxiliary_loss_mlp": 0.01269369, + "balance_loss_clip": 0.06280936, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4750939425822937, + "flos": 22133782442880.0, + "grad_norm": 1.48556411263083, + "language_loss": 0.73796129, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.81504059, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12542725, + "step": 7902, + "time_per_iteration": 2.5175282955169678 + }, + { + "auxiliary_loss_clip": 0.06431635, + "auxiliary_loss_mlp": 0.01269606, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.0125747, + "epoch": 0.47515406583496167, + "flos": 20856378948480.0, + "grad_norm": 3.332476837285125, + "language_loss": 0.69285202, + "learning_rate": 2.257303243526688e-06, + "loss": 0.76986444, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.12139893, + "step": 7903, + "time_per_iteration": 2.5292611122131348 + }, + { + "auxiliary_loss_clip": 0.06430157, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06276098, + "balance_loss_mlp": 0.01255015, + "epoch": 0.47521418908762963, + "flos": 17529679347840.0, + "grad_norm": 1.464561850634071, + "language_loss": 0.72526675, + "learning_rate": 2.256917013453848e-06, + "loss": 0.80223215, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1137085, + "step": 7904, + "time_per_iteration": 2.491152286529541 + }, + { + "auxiliary_loss_clip": 0.06430416, + "auxiliary_loss_mlp": 0.01265335, + "balance_loss_clip": 0.06276643, + "balance_loss_mlp": 0.01253706, + "epoch": 0.4752743123402976, + "flos": 20565874442880.0, + "grad_norm": 1.4968424405470007, + "language_loss": 0.86079156, + "learning_rate": 2.25653077363869e-06, + "loss": 0.93774903, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.11633301, + "step": 7905, + "time_per_iteration": 2.5502467155456543 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06274827, + "balance_loss_mlp": 0.01256146, + "epoch": 0.47533443559296557, + "flos": 26368025616000.0, + "grad_norm": 2.2485080153720425, + "language_loss": 0.82345891, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.90039825, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11749268, + "step": 7906, + "time_per_iteration": 2.5368199348449707 + }, + { + "auxiliary_loss_clip": 0.06321883, + "auxiliary_loss_mlp": 0.01254668, + "balance_loss_clip": 0.06254389, + "balance_loss_mlp": 0.01251897, + "epoch": 0.47539455884563353, + "flos": 65970118690560.0, + "grad_norm": 0.659791256047387, + "language_loss": 0.5900293, + "learning_rate": 2.255758264840002e-06, + "loss": 0.66579485, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.02775574, + "step": 7907, + "time_per_iteration": 3.279963254928589 + }, + { + "auxiliary_loss_clip": 0.06431986, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06276301, + "balance_loss_mlp": 0.01256721, + "epoch": 0.4754546820983015, + "flos": 17243828743680.0, + "grad_norm": 1.7704403118247245, + "language_loss": 0.81422615, + "learning_rate": 2.255371995885765e-06, + "loss": 0.89124084, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12756348, + "step": 7908, + "time_per_iteration": 2.5366125106811523 + }, + { + "auxiliary_loss_clip": 0.0643681, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01257258, + "epoch": 0.47551480535096946, + "flos": 19831563187200.0, + "grad_norm": 1.6522879253580633, + "language_loss": 0.74338585, + "learning_rate": 2.254985717247797e-06, + "loss": 0.82045496, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12841797, + "step": 7909, + "time_per_iteration": 2.5318603515625 + }, + { + "auxiliary_loss_clip": 0.06431618, + "auxiliary_loss_mlp": 0.01267166, + "balance_loss_clip": 0.0627422, + "balance_loss_mlp": 0.01255192, + "epoch": 0.4755749286036375, + "flos": 22170525258240.0, + "grad_norm": 1.5977935042114109, + "language_loss": 0.75628603, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.83327389, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11987305, + "step": 7910, + "time_per_iteration": 2.5529162883758545 + }, + { + "auxiliary_loss_clip": 0.0643287, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06276555, + "balance_loss_mlp": 0.01253488, + "epoch": 0.47563505185630545, + "flos": 21653945637120.0, + "grad_norm": 1.8732404582916444, + "language_loss": 0.7930491, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.8700273, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11474609, + "step": 7911, + "time_per_iteration": 2.5172598361968994 + }, + { + "auxiliary_loss_clip": 0.0643772, + "auxiliary_loss_mlp": 0.01268087, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.01253854, + "epoch": 0.4756951751089734, + "flos": 20634622318080.0, + "grad_norm": 1.775078995772379, + "language_loss": 0.76487613, + "learning_rate": 2.253826823377983e-06, + "loss": 0.8419342, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.14239502, + "step": 7912, + "time_per_iteration": 2.5627753734588623 + }, + { + "auxiliary_loss_clip": 0.06432701, + "auxiliary_loss_mlp": 0.01273558, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01260797, + "epoch": 0.4757552983616414, + "flos": 25855932188160.0, + "grad_norm": 1.3867905424321492, + "language_loss": 0.74749589, + "learning_rate": 2.253440506151569e-06, + "loss": 0.82455844, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12762451, + "step": 7913, + "time_per_iteration": 2.539555549621582 + }, + { + "auxiliary_loss_clip": 0.06434918, + "auxiliary_loss_mlp": 0.01269661, + "balance_loss_clip": 0.06277134, + "balance_loss_mlp": 0.01257418, + "epoch": 0.47581542161430934, + "flos": 18228841015680.0, + "grad_norm": 1.9858873239790236, + "language_loss": 0.72184181, + "learning_rate": 2.253054179314666e-06, + "loss": 0.79888761, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12249756, + "step": 7914, + "time_per_iteration": 3.9911863803863525 + }, + { + "auxiliary_loss_clip": 0.06440303, + "auxiliary_loss_mlp": 0.01270006, + "balance_loss_clip": 0.06281254, + "balance_loss_mlp": 0.0125737, + "epoch": 0.4758755448669773, + "flos": 21586162083840.0, + "grad_norm": 1.8571830642758371, + "language_loss": 0.65017748, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.72728062, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12628174, + "step": 7915, + "time_per_iteration": 3.94254207611084 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01257027, + "epoch": 0.47593566811964527, + "flos": 15236474405760.0, + "grad_norm": 1.6782618347522322, + "language_loss": 0.77118516, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.84816194, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 7916, + "time_per_iteration": 2.5071310997009277 + }, + { + "auxiliary_loss_clip": 0.0642941, + "auxiliary_loss_mlp": 0.01270125, + "balance_loss_clip": 0.06275692, + "balance_loss_mlp": 0.01258842, + "epoch": 0.47599579137231324, + "flos": 21549628903680.0, + "grad_norm": 2.1020342658546878, + "language_loss": 0.64506871, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.72206402, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.112854, + "step": 7917, + "time_per_iteration": 2.660997152328491 + }, + { + "auxiliary_loss_clip": 0.06322742, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06253887, + "balance_loss_mlp": 0.01264125, + "epoch": 0.4760559146249812, + "flos": 64573388582400.0, + "grad_norm": 0.81764582989578, + "language_loss": 0.65507567, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.73097479, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03042603, + "step": 7918, + "time_per_iteration": 3.185194492340088 + }, + { + "auxiliary_loss_clip": 0.06435688, + "auxiliary_loss_mlp": 0.01270072, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01257781, + "epoch": 0.47611603787764917, + "flos": 22239943966080.0, + "grad_norm": 1.5442115166230013, + "language_loss": 0.69113988, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.76819742, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12286377, + "step": 7919, + "time_per_iteration": 2.5625159740448 + }, + { + "auxiliary_loss_clip": 0.06440815, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06280257, + "balance_loss_mlp": 0.01254966, + "epoch": 0.47617616113031713, + "flos": 22785971097600.0, + "grad_norm": 1.4153562055419862, + "language_loss": 0.75135148, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.82842833, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.11914062, + "step": 7920, + "time_per_iteration": 2.606783866882324 + }, + { + "auxiliary_loss_clip": 0.06442747, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280643, + "balance_loss_mlp": 0.01255391, + "epoch": 0.4762362843829851, + "flos": 24140633656320.0, + "grad_norm": 1.5595930907743143, + "language_loss": 0.77291155, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.85002303, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.13000488, + "step": 7921, + "time_per_iteration": 4.0331573486328125 + }, + { + "auxiliary_loss_clip": 0.06441253, + "auxiliary_loss_mlp": 0.01270198, + "balance_loss_clip": 0.06281719, + "balance_loss_mlp": 0.01256859, + "epoch": 0.47629640763565306, + "flos": 22458052944000.0, + "grad_norm": 1.5318798569312555, + "language_loss": 0.78402638, + "learning_rate": 2.249963220399845e-06, + "loss": 0.86114085, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13342285, + "step": 7922, + "time_per_iteration": 2.615656614303589 + }, + { + "auxiliary_loss_clip": 0.06443102, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06280392, + "balance_loss_mlp": 0.01253426, + "epoch": 0.4763565308883211, + "flos": 11186071090560.0, + "grad_norm": 1.9566034639967664, + "language_loss": 0.72915596, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.80625618, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1350708, + "step": 7923, + "time_per_iteration": 2.495023727416992 + }, + { + "auxiliary_loss_clip": 0.06440397, + "auxiliary_loss_mlp": 0.01267365, + "balance_loss_clip": 0.06280472, + "balance_loss_mlp": 0.01255772, + "epoch": 0.47641665414098905, + "flos": 22388634236160.0, + "grad_norm": 2.175648520453788, + "language_loss": 0.82023257, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.8973102, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.11590576, + "step": 7924, + "time_per_iteration": 2.5592448711395264 + }, + { + "auxiliary_loss_clip": 0.06449094, + "auxiliary_loss_mlp": 0.01271258, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01257191, + "epoch": 0.476476777393657, + "flos": 25053166546560.0, + "grad_norm": 1.6497722763363074, + "language_loss": 0.80566549, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.88286906, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.14074707, + "step": 7925, + "time_per_iteration": 2.5462217330932617 + }, + { + "auxiliary_loss_clip": 0.06433398, + "auxiliary_loss_mlp": 0.01273204, + "balance_loss_clip": 0.06273591, + "balance_loss_mlp": 0.01259984, + "epoch": 0.476536900646325, + "flos": 27276994707840.0, + "grad_norm": 1.5163925310357687, + "language_loss": 0.72183931, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.79890537, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13214111, + "step": 7926, + "time_per_iteration": 4.022697448730469 + }, + { + "auxiliary_loss_clip": 0.06443252, + "auxiliary_loss_mlp": 0.01270757, + "balance_loss_clip": 0.062805, + "balance_loss_mlp": 0.01257304, + "epoch": 0.47659702389899294, + "flos": 25308437610240.0, + "grad_norm": 2.540030120332383, + "language_loss": 0.69248974, + "learning_rate": 2.248031062546432e-06, + "loss": 0.76962984, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.13452148, + "step": 7927, + "time_per_iteration": 2.651005744934082 + }, + { + "auxiliary_loss_clip": 0.06432809, + "auxiliary_loss_mlp": 0.01274998, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01262928, + "epoch": 0.4766571471516609, + "flos": 25999716994560.0, + "grad_norm": 1.8555909912878064, + "language_loss": 0.68153882, + "learning_rate": 2.247644602701045e-06, + "loss": 0.75861686, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12072754, + "step": 7928, + "time_per_iteration": 2.6001169681549072 + }, + { + "auxiliary_loss_clip": 0.06439018, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277569, + "balance_loss_mlp": 0.01254497, + "epoch": 0.4767172704043289, + "flos": 16037395257600.0, + "grad_norm": 2.030081429010121, + "language_loss": 0.79402888, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.87108904, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.12506104, + "step": 7929, + "time_per_iteration": 2.4979782104492188 + }, + { + "auxiliary_loss_clip": 0.06434054, + "auxiliary_loss_mlp": 0.0127525, + "balance_loss_clip": 0.06276359, + "balance_loss_mlp": 0.01263496, + "epoch": 0.47677739365699684, + "flos": 39244113233280.0, + "grad_norm": 1.8073767988538123, + "language_loss": 0.67109072, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.74818379, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.11749268, + "step": 7930, + "time_per_iteration": 2.64865779876709 + }, + { + "auxiliary_loss_clip": 0.06440657, + "auxiliary_loss_mlp": 0.01272697, + "balance_loss_clip": 0.06280986, + "balance_loss_mlp": 0.01260484, + "epoch": 0.4768375169096648, + "flos": 24724745268480.0, + "grad_norm": 1.7506463735046407, + "language_loss": 0.79864836, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.87578189, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.12207031, + "step": 7931, + "time_per_iteration": 2.5824391841888428 + }, + { + "auxiliary_loss_clip": 0.06435428, + "auxiliary_loss_mlp": 0.01273232, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.01260203, + "epoch": 0.47689764016233277, + "flos": 22535270081280.0, + "grad_norm": 2.3707401208689753, + "language_loss": 0.76826382, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13043213, + "step": 7932, + "time_per_iteration": 2.510439157485962 + }, + { + "auxiliary_loss_clip": 0.06434679, + "auxiliary_loss_mlp": 0.01279125, + "balance_loss_clip": 0.06279778, + "balance_loss_mlp": 0.01266101, + "epoch": 0.47695776341500074, + "flos": 15125742835200.0, + "grad_norm": 3.7494408598150946, + "language_loss": 0.79909194, + "learning_rate": 2.245712162906593e-06, + "loss": 0.87623, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.13012695, + "step": 7933, + "time_per_iteration": 2.5868406295776367 + }, + { + "auxiliary_loss_clip": 0.06440616, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 0.06276172, + "balance_loss_mlp": 0.01256889, + "epoch": 0.4770178866676687, + "flos": 14683319677440.0, + "grad_norm": 1.845903856635024, + "language_loss": 0.74363738, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.8207491, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.13677979, + "step": 7934, + "time_per_iteration": 2.467625141143799 + }, + { + "auxiliary_loss_clip": 0.06439498, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278646, + "balance_loss_mlp": 0.01256213, + "epoch": 0.47707800992033667, + "flos": 22572264458880.0, + "grad_norm": 2.1751877197221847, + "language_loss": 0.80426806, + "learning_rate": 2.244939121664211e-06, + "loss": 0.88135481, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.12963867, + "step": 7935, + "time_per_iteration": 2.57150936126709 + }, + { + "auxiliary_loss_clip": 0.06443004, + "auxiliary_loss_mlp": 0.01271494, + "balance_loss_clip": 0.06275547, + "balance_loss_mlp": 0.01257249, + "epoch": 0.4771381331730047, + "flos": 30925868457600.0, + "grad_norm": 1.696374515888555, + "language_loss": 0.71442336, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7915684, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.14245605, + "step": 7936, + "time_per_iteration": 2.577134609222412 + }, + { + "auxiliary_loss_clip": 0.06440726, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06278887, + "balance_loss_mlp": 0.01254593, + "epoch": 0.47719825642567265, + "flos": 25745955304320.0, + "grad_norm": 1.9394747057802306, + "language_loss": 0.68651855, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.76359951, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.12774658, + "step": 7937, + "time_per_iteration": 2.5523571968078613 + }, + { + "auxiliary_loss_clip": 0.06332788, + "auxiliary_loss_mlp": 0.01255518, + "balance_loss_clip": 0.06264147, + "balance_loss_mlp": 0.01252959, + "epoch": 0.4772583796783406, + "flos": 66376344084480.0, + "grad_norm": 0.7063710164794027, + "language_loss": 0.56256598, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.63844901, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02558899, + "step": 7938, + "time_per_iteration": 3.3101401329040527 + }, + { + "auxiliary_loss_clip": 0.06435397, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01252927, + "epoch": 0.4773185029310086, + "flos": 22057068430080.0, + "grad_norm": 1.5498541545702798, + "language_loss": 0.89232612, + "learning_rate": 2.243392927839317e-06, + "loss": 0.96934634, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.13702393, + "step": 7939, + "time_per_iteration": 2.559797525405884 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01268488, + "balance_loss_clip": 0.06277393, + "balance_loss_mlp": 0.01256239, + "epoch": 0.47737862618367655, + "flos": 16733496251520.0, + "grad_norm": 2.4258721196632456, + "language_loss": 0.77298427, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.85001838, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12249756, + "step": 7940, + "time_per_iteration": 2.5268869400024414 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01269812, + "balance_loss_clip": 0.0627719, + "balance_loss_mlp": 0.01257373, + "epoch": 0.4774387494363445, + "flos": 19615508634240.0, + "grad_norm": 1.6559533080399789, + "language_loss": 0.85386801, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.930875, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12432861, + "step": 7941, + "time_per_iteration": 2.547070264816284 + }, + { + "auxiliary_loss_clip": 0.06437483, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06277451, + "balance_loss_mlp": 0.01257965, + "epoch": 0.4774988726890125, + "flos": 16659507496320.0, + "grad_norm": 1.9070361015512296, + "language_loss": 0.76308775, + "learning_rate": 2.24223318550976e-06, + "loss": 0.84016657, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.12420654, + "step": 7942, + "time_per_iteration": 2.4842329025268555 + }, + { + "auxiliary_loss_clip": 0.06440963, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06282113, + "balance_loss_mlp": 0.01253601, + "epoch": 0.47755899594168044, + "flos": 20491843760640.0, + "grad_norm": 1.6294214929971118, + "language_loss": 0.64313745, + "learning_rate": 2.241846586342682e-06, + "loss": 0.72020721, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12402344, + "step": 7943, + "time_per_iteration": 2.5384066104888916 + }, + { + "auxiliary_loss_clip": 0.06444484, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06280033, + "balance_loss_mlp": 0.01253493, + "epoch": 0.4776191191943484, + "flos": 21659228444160.0, + "grad_norm": 1.6943023581153507, + "language_loss": 0.73866045, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.8157779, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.13781738, + "step": 7944, + "time_per_iteration": 2.5201148986816406 + }, + { + "auxiliary_loss_clip": 0.06447009, + "auxiliary_loss_mlp": 0.01271608, + "balance_loss_clip": 0.06287117, + "balance_loss_mlp": 0.01258459, + "epoch": 0.4776792424470164, + "flos": 18776125958400.0, + "grad_norm": 2.2429214657199257, + "language_loss": 0.68437827, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.76156443, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.13153076, + "step": 7945, + "time_per_iteration": 2.5126469135284424 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_clip": 0.06280819, + "balance_loss_mlp": 0.01257577, + "epoch": 0.47773936569968434, + "flos": 29723543821440.0, + "grad_norm": 1.8191434389659598, + "language_loss": 0.75203103, + "learning_rate": 2.240686733875009e-06, + "loss": 0.8291347, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12628174, + "step": 7946, + "time_per_iteration": 2.5952818393707275 + }, + { + "auxiliary_loss_clip": 0.06450987, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06288904, + "balance_loss_mlp": 0.0125368, + "epoch": 0.4777994889523523, + "flos": 24798650169600.0, + "grad_norm": 2.1264871549136566, + "language_loss": 0.79598629, + "learning_rate": 2.240300098112506e-06, + "loss": 0.87316352, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.13043213, + "step": 7947, + "time_per_iteration": 2.561429023742676 + }, + { + "auxiliary_loss_clip": 0.06437, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06282562, + "balance_loss_mlp": 0.01255302, + "epoch": 0.47785961220502027, + "flos": 17863928484480.0, + "grad_norm": 1.6733844414372485, + "language_loss": 0.73571151, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.81276667, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13220215, + "step": 7948, + "time_per_iteration": 2.5309975147247314 + }, + { + "auxiliary_loss_clip": 0.06442553, + "auxiliary_loss_mlp": 0.01267736, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01253848, + "epoch": 0.4779197354576883, + "flos": 20272770460800.0, + "grad_norm": 2.2305312131568256, + "language_loss": 0.78282905, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.85993195, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13891602, + "step": 7949, + "time_per_iteration": 2.5135691165924072 + }, + { + "auxiliary_loss_clip": 0.06441014, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06285359, + "balance_loss_mlp": 0.01251849, + "epoch": 0.47797985871035625, + "flos": 17062420654080.0, + "grad_norm": 2.4211239692864686, + "language_loss": 0.75134766, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.82839787, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12164307, + "step": 7950, + "time_per_iteration": 2.5256588459014893 + }, + { + "auxiliary_loss_clip": 0.06439517, + "auxiliary_loss_mlp": 0.01271424, + "balance_loss_clip": 0.0628176, + "balance_loss_mlp": 0.01258668, + "epoch": 0.4780399819630242, + "flos": 31366530679680.0, + "grad_norm": 1.6557560470716002, + "language_loss": 0.744519, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.82162845, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12756348, + "step": 7951, + "time_per_iteration": 2.6257662773132324 + }, + { + "auxiliary_loss_clip": 0.0644564, + "auxiliary_loss_mlp": 0.0126871, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01255925, + "epoch": 0.4781001052156922, + "flos": 24906488774400.0, + "grad_norm": 2.0941094174335, + "language_loss": 0.80880862, + "learning_rate": 2.238366782910174e-06, + "loss": 0.88595212, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12786865, + "step": 7952, + "time_per_iteration": 2.6039650440216064 + }, + { + "auxiliary_loss_clip": 0.06449462, + "auxiliary_loss_mlp": 0.01273751, + "balance_loss_clip": 0.06286798, + "balance_loss_mlp": 0.01259684, + "epoch": 0.47816022846836015, + "flos": 18703688503680.0, + "grad_norm": 1.7383850677064194, + "language_loss": 0.78965735, + "learning_rate": 2.23798009269438e-06, + "loss": 0.86688948, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14068604, + "step": 7953, + "time_per_iteration": 3.9394986629486084 + }, + { + "auxiliary_loss_clip": 0.0644647, + "auxiliary_loss_mlp": 0.0126971, + "balance_loss_clip": 0.0628321, + "balance_loss_mlp": 0.01256793, + "epoch": 0.4782203517210281, + "flos": 11981289864960.0, + "grad_norm": 2.1105030234958733, + "language_loss": 0.84721971, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.92438149, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.12921143, + "step": 7954, + "time_per_iteration": 3.9196231365203857 + }, + { + "auxiliary_loss_clip": 0.06440185, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06283759, + "balance_loss_mlp": 0.01255282, + "epoch": 0.4782804749736961, + "flos": 20819761914240.0, + "grad_norm": 1.4881886911999394, + "language_loss": 0.70481235, + "learning_rate": 2.237206685204768e-06, + "loss": 0.78189409, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1270752, + "step": 7955, + "time_per_iteration": 2.5434484481811523 + }, + { + "auxiliary_loss_clip": 0.064454, + "auxiliary_loss_mlp": 0.01270242, + "balance_loss_clip": 0.06284527, + "balance_loss_mlp": 0.01257326, + "epoch": 0.47834059822636404, + "flos": 23846816914560.0, + "grad_norm": 1.553979149808007, + "language_loss": 0.823044, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.90020043, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.12902832, + "step": 7956, + "time_per_iteration": 2.545602560043335 + }, + { + "auxiliary_loss_clip": 0.06441168, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 0.06284995, + "balance_loss_mlp": 0.01255627, + "epoch": 0.478400721479032, + "flos": 22639670668800.0, + "grad_norm": 1.9591153371347299, + "language_loss": 0.85127819, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.92838925, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.14300537, + "step": 7957, + "time_per_iteration": 2.548643112182617 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01269143, + "balance_loss_clip": 0.06285611, + "balance_loss_mlp": 0.01257001, + "epoch": 0.4784608447317, + "flos": 19361118038400.0, + "grad_norm": 7.050300940807432, + "language_loss": 0.79869133, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.87579882, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12139893, + "step": 7958, + "time_per_iteration": 2.5078237056732178 + }, + { + "auxiliary_loss_clip": 0.06441762, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06283723, + "balance_loss_mlp": 0.0125534, + "epoch": 0.47852096798436794, + "flos": 24027386463360.0, + "grad_norm": 1.6951891176109464, + "language_loss": 0.82802176, + "learning_rate": 2.235659762404047e-06, + "loss": 0.90512896, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1361084, + "step": 7959, + "time_per_iteration": 2.565302610397339 + }, + { + "auxiliary_loss_clip": 0.06438372, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06285324, + "balance_loss_mlp": 0.01255615, + "epoch": 0.4785810912370359, + "flos": 25673559776640.0, + "grad_norm": 2.330976037710063, + "language_loss": 0.73464501, + "learning_rate": 2.235273009326599e-06, + "loss": 0.81169969, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1149292, + "step": 7960, + "time_per_iteration": 4.027269124984741 + }, + { + "auxiliary_loss_clip": 0.06436551, + "auxiliary_loss_mlp": 0.01270036, + "balance_loss_clip": 0.0628148, + "balance_loss_mlp": 0.01258014, + "epoch": 0.47864121448970387, + "flos": 21438226500480.0, + "grad_norm": 3.172971837567245, + "language_loss": 0.77372915, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.85079503, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12036133, + "step": 7961, + "time_per_iteration": 2.5147969722747803 + }, + { + "auxiliary_loss_clip": 0.06435739, + "auxiliary_loss_mlp": 0.01267875, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01255269, + "epoch": 0.47870133774237184, + "flos": 16149468493440.0, + "grad_norm": 1.5337652867811775, + "language_loss": 0.78017688, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.85721302, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12597656, + "step": 7962, + "time_per_iteration": 2.513148307800293 + }, + { + "auxiliary_loss_clip": 0.06441396, + "auxiliary_loss_mlp": 0.01270097, + "balance_loss_clip": 0.06281849, + "balance_loss_mlp": 0.01257646, + "epoch": 0.47876146099503986, + "flos": 26914094674560.0, + "grad_norm": 1.8277818369463197, + "language_loss": 0.65211046, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.7292254, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12457275, + "step": 7963, + "time_per_iteration": 2.601811647415161 + }, + { + "auxiliary_loss_clip": 0.06439337, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.06280507, + "balance_loss_mlp": 0.01253621, + "epoch": 0.4788215842477078, + "flos": 45342470989440.0, + "grad_norm": 2.309935013710649, + "language_loss": 0.77810884, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.85516727, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12890625, + "step": 7964, + "time_per_iteration": 2.747879981994629 + }, + { + "auxiliary_loss_clip": 0.06446981, + "auxiliary_loss_mlp": 0.01271797, + "balance_loss_clip": 0.06283239, + "balance_loss_mlp": 0.01257218, + "epoch": 0.4788817075003758, + "flos": 22243801253760.0, + "grad_norm": 1.6568781202078557, + "language_loss": 0.76541996, + "learning_rate": 2.233339110409044e-06, + "loss": 0.84260774, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.14587402, + "step": 7965, + "time_per_iteration": 2.562894344329834 + }, + { + "auxiliary_loss_clip": 0.06441608, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06281182, + "balance_loss_mlp": 0.01256434, + "epoch": 0.47894183075304375, + "flos": 16476631960320.0, + "grad_norm": 1.6972134667517975, + "language_loss": 0.74819887, + "learning_rate": 2.232952304022137e-06, + "loss": 0.82530153, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12237549, + "step": 7966, + "time_per_iteration": 4.023793697357178 + }, + { + "auxiliary_loss_clip": 0.06437664, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06279117, + "balance_loss_mlp": 0.01253033, + "epoch": 0.4790019540057117, + "flos": 24290036686080.0, + "grad_norm": 1.5237416858661557, + "language_loss": 0.73335361, + "learning_rate": 2.232565488801655e-06, + "loss": 0.81038582, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12518311, + "step": 7967, + "time_per_iteration": 2.586228847503662 + }, + { + "auxiliary_loss_clip": 0.06429637, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.01254825, + "epoch": 0.4790620772583797, + "flos": 25673601703680.0, + "grad_norm": 2.2388113154567058, + "language_loss": 0.79254079, + "learning_rate": 2.232178664762267e-06, + "loss": 0.86951417, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12896729, + "step": 7968, + "time_per_iteration": 2.569835901260376 + }, + { + "auxiliary_loss_clip": 0.06330545, + "auxiliary_loss_mlp": 0.01255481, + "balance_loss_clip": 0.06260878, + "balance_loss_mlp": 0.01252947, + "epoch": 0.47912220051104765, + "flos": 69451168711680.0, + "grad_norm": 0.7701358383106056, + "language_loss": 0.62163401, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.69749427, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02534485, + "step": 7969, + "time_per_iteration": 3.2898826599121094 + }, + { + "auxiliary_loss_clip": 0.06435778, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06281342, + "balance_loss_mlp": 0.012529, + "epoch": 0.4791823237637156, + "flos": 24175531681920.0, + "grad_norm": 1.7909857243287752, + "language_loss": 0.77847564, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.85549259, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13006592, + "step": 7970, + "time_per_iteration": 2.5170607566833496 + }, + { + "auxiliary_loss_clip": 0.06435491, + "auxiliary_loss_mlp": 0.01267513, + "balance_loss_clip": 0.06276551, + "balance_loss_mlp": 0.0125384, + "epoch": 0.4792424470163836, + "flos": 24757966212480.0, + "grad_norm": 1.6160167990193877, + "language_loss": 0.71182537, + "learning_rate": 2.231018139877349e-06, + "loss": 0.78885543, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13665771, + "step": 7971, + "time_per_iteration": 2.572124719619751 + }, + { + "auxiliary_loss_clip": 0.06436221, + "auxiliary_loss_mlp": 0.01271919, + "balance_loss_clip": 0.06279434, + "balance_loss_mlp": 0.01258836, + "epoch": 0.47930257026905154, + "flos": 23264550092160.0, + "grad_norm": 1.2950674857674533, + "language_loss": 0.80144143, + "learning_rate": 2.230631280709021e-06, + "loss": 0.87852287, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1307373, + "step": 7972, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.06442808, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06281324, + "balance_loss_mlp": 0.01256392, + "epoch": 0.4793626935217195, + "flos": 14069299357440.0, + "grad_norm": 2.062531710859889, + "language_loss": 0.70572007, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7828514, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13934326, + "step": 7973, + "time_per_iteration": 2.5338237285614014 + }, + { + "auxiliary_loss_clip": 0.064371, + "auxiliary_loss_mlp": 0.01270261, + "balance_loss_clip": 0.06283109, + "balance_loss_mlp": 0.0125806, + "epoch": 0.4794228167743875, + "flos": 21805319237760.0, + "grad_norm": 1.7273933233655367, + "language_loss": 0.79198468, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.86905837, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12200928, + "step": 7974, + "time_per_iteration": 2.5069854259490967 + }, + { + "auxiliary_loss_clip": 0.06339005, + "auxiliary_loss_mlp": 0.01258702, + "balance_loss_clip": 0.06269643, + "balance_loss_mlp": 0.01255866, + "epoch": 0.47948294002705544, + "flos": 66989022739200.0, + "grad_norm": 0.7443790840370731, + "language_loss": 0.53920376, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.61518085, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.02832031, + "step": 7975, + "time_per_iteration": 3.2263216972351074 + }, + { + "auxiliary_loss_clip": 0.06450166, + "auxiliary_loss_mlp": 0.01269981, + "balance_loss_clip": 0.06283702, + "balance_loss_mlp": 0.0125465, + "epoch": 0.47954306327972346, + "flos": 12427444529280.0, + "grad_norm": 1.9824704830592612, + "language_loss": 0.90397954, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.98118103, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.15313721, + "step": 7976, + "time_per_iteration": 2.5806965827941895 + }, + { + "auxiliary_loss_clip": 0.06448781, + "auxiliary_loss_mlp": 0.01272852, + "balance_loss_clip": 0.06284519, + "balance_loss_mlp": 0.01257629, + "epoch": 0.4796031865323914, + "flos": 18366630255360.0, + "grad_norm": 3.7288296944586166, + "language_loss": 0.73905623, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.81627262, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.15209961, + "step": 7977, + "time_per_iteration": 2.5562849044799805 + }, + { + "auxiliary_loss_clip": 0.06437217, + "auxiliary_loss_mlp": 0.01268705, + "balance_loss_clip": 0.06283021, + "balance_loss_mlp": 0.01255741, + "epoch": 0.4796633097850594, + "flos": 21841517001600.0, + "grad_norm": 1.607227573724713, + "language_loss": 0.78873986, + "learning_rate": 2.228309942555734e-06, + "loss": 0.86579907, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12976074, + "step": 7978, + "time_per_iteration": 2.558842420578003 + }, + { + "auxiliary_loss_clip": 0.06440634, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01255214, + "epoch": 0.47972343303772735, + "flos": 23443526413440.0, + "grad_norm": 1.9276236664860738, + "language_loss": 0.89800453, + "learning_rate": 2.22792302247656e-06, + "loss": 0.97510386, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.14099121, + "step": 7979, + "time_per_iteration": 2.5952987670898438 + }, + { + "auxiliary_loss_clip": 0.06446249, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06283665, + "balance_loss_mlp": 0.01256378, + "epoch": 0.4797835562903953, + "flos": 24906698409600.0, + "grad_norm": 1.4562164603157606, + "language_loss": 0.7704469, + "learning_rate": 2.227536093754523e-06, + "loss": 0.8476193, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14605713, + "step": 7980, + "time_per_iteration": 2.5736522674560547 + }, + { + "auxiliary_loss_clip": 0.06447264, + "auxiliary_loss_mlp": 0.01273404, + "balance_loss_clip": 0.06281359, + "balance_loss_mlp": 0.01258938, + "epoch": 0.4798436795430633, + "flos": 35051644120320.0, + "grad_norm": 1.875578547391537, + "language_loss": 0.71508431, + "learning_rate": 2.227149156404295e-06, + "loss": 0.79229099, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.14459229, + "step": 7981, + "time_per_iteration": 2.6367290019989014 + }, + { + "auxiliary_loss_clip": 0.06439552, + "auxiliary_loss_mlp": 0.01273941, + "balance_loss_clip": 0.06281938, + "balance_loss_mlp": 0.01258998, + "epoch": 0.47990380279573125, + "flos": 20595699296640.0, + "grad_norm": 1.7763359166784585, + "language_loss": 0.70155972, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.77869463, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.14935303, + "step": 7982, + "time_per_iteration": 2.5258874893188477 + }, + { + "auxiliary_loss_clip": 0.06432236, + "auxiliary_loss_mlp": 0.0126906, + "balance_loss_clip": 0.06278554, + "balance_loss_mlp": 0.01257079, + "epoch": 0.4799639260483992, + "flos": 26366600096640.0, + "grad_norm": 1.7437778110304778, + "language_loss": 0.71608925, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.79310226, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11975098, + "step": 7983, + "time_per_iteration": 2.568826913833618 + }, + { + "auxiliary_loss_clip": 0.06340544, + "auxiliary_loss_mlp": 0.01252804, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01249972, + "epoch": 0.4800240493010672, + "flos": 70999371002880.0, + "grad_norm": 0.765879442061108, + "language_loss": 0.59357727, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.66951072, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.02828979, + "step": 7984, + "time_per_iteration": 3.1084651947021484 + }, + { + "auxiliary_loss_clip": 0.0643955, + "auxiliary_loss_mlp": 0.01275134, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01262152, + "epoch": 0.48008417255373514, + "flos": 17091406967040.0, + "grad_norm": 1.5773823669430012, + "language_loss": 0.67127079, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.74841756, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12988281, + "step": 7985, + "time_per_iteration": 2.4906041622161865 + }, + { + "auxiliary_loss_clip": 0.06439713, + "auxiliary_loss_mlp": 0.01270507, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256458, + "epoch": 0.4801442958064031, + "flos": 15418762963200.0, + "grad_norm": 1.6902399231491212, + "language_loss": 0.70749509, + "learning_rate": 2.225214340743835e-06, + "loss": 0.78459728, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14056396, + "step": 7986, + "time_per_iteration": 2.52093243598938 + }, + { + "auxiliary_loss_clip": 0.06445119, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 0.06282695, + "balance_loss_mlp": 0.0125972, + "epoch": 0.4802044190590711, + "flos": 11478546167040.0, + "grad_norm": 1.9459651571320913, + "language_loss": 0.79178715, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.86897534, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.13983154, + "step": 7987, + "time_per_iteration": 2.498640537261963 + }, + { + "auxiliary_loss_clip": 0.06442459, + "auxiliary_loss_mlp": 0.01274239, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01260029, + "epoch": 0.48026454231173904, + "flos": 20955874072320.0, + "grad_norm": 2.568897435463935, + "language_loss": 0.75366008, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.83082712, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.14215088, + "step": 7988, + "time_per_iteration": 2.516512632369995 + }, + { + "auxiliary_loss_clip": 0.0644449, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06284034, + "balance_loss_mlp": 0.01254651, + "epoch": 0.48032466556440706, + "flos": 20454220477440.0, + "grad_norm": 2.121657383550553, + "language_loss": 0.79781222, + "learning_rate": 2.224053348748365e-06, + "loss": 0.87493527, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13140869, + "step": 7989, + "time_per_iteration": 2.5021252632141113 + }, + { + "auxiliary_loss_clip": 0.06450642, + "auxiliary_loss_mlp": 0.01272628, + "balance_loss_clip": 0.0628516, + "balance_loss_mlp": 0.01259277, + "epoch": 0.480384788817075, + "flos": 37129507269120.0, + "grad_norm": 1.6027553338262992, + "language_loss": 0.73628318, + "learning_rate": 2.223666334404724e-06, + "loss": 0.81351584, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.13360596, + "step": 7990, + "time_per_iteration": 2.678316593170166 + }, + { + "auxiliary_loss_clip": 0.06340674, + "auxiliary_loss_mlp": 0.01254539, + "balance_loss_clip": 0.06272323, + "balance_loss_mlp": 0.01252124, + "epoch": 0.480444912069743, + "flos": 69572103281280.0, + "grad_norm": 0.7463246314152452, + "language_loss": 0.59028065, + "learning_rate": 2.223279311579633e-06, + "loss": 0.66623276, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02412415, + "step": 7991, + "time_per_iteration": 3.2123708724975586 + }, + { + "auxiliary_loss_clip": 0.06440669, + "auxiliary_loss_mlp": 0.0127166, + "balance_loss_clip": 0.06280738, + "balance_loss_mlp": 0.01258493, + "epoch": 0.48050503532241096, + "flos": 29829453782400.0, + "grad_norm": 1.8077991766436714, + "language_loss": 0.67425305, + "learning_rate": 2.222892280287768e-06, + "loss": 0.75137639, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1317749, + "step": 7992, + "time_per_iteration": 4.022457599639893 + }, + { + "auxiliary_loss_clip": 0.06441684, + "auxiliary_loss_mlp": 0.01270903, + "balance_loss_clip": 0.06280079, + "balance_loss_mlp": 0.01257289, + "epoch": 0.4805651585750789, + "flos": 23954865154560.0, + "grad_norm": 1.520335815005364, + "language_loss": 0.76567221, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.84279805, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13616943, + "step": 7993, + "time_per_iteration": 2.5975513458251953 + }, + { + "auxiliary_loss_clip": 0.0643717, + "auxiliary_loss_mlp": 0.012705, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01257101, + "epoch": 0.4806252818277469, + "flos": 25672385819520.0, + "grad_norm": 1.5304271246014225, + "language_loss": 0.78575444, + "learning_rate": 2.222118192362422e-06, + "loss": 0.86283118, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1340332, + "step": 7994, + "time_per_iteration": 3.9770989418029785 + }, + { + "auxiliary_loss_clip": 0.06441342, + "auxiliary_loss_mlp": 0.01268981, + "balance_loss_clip": 0.06282856, + "balance_loss_mlp": 0.01255284, + "epoch": 0.48068540508041485, + "flos": 13157059956480.0, + "grad_norm": 1.7612496141579397, + "language_loss": 0.80023497, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.87733817, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13702393, + "step": 7995, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.06436922, + "auxiliary_loss_mlp": 0.01271915, + "balance_loss_clip": 0.06281693, + "balance_loss_mlp": 0.01259499, + "epoch": 0.4807455283330828, + "flos": 21182787728640.0, + "grad_norm": 1.7014068364920145, + "language_loss": 0.82857656, + "learning_rate": 2.2213440707461e-06, + "loss": 0.90566498, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12408447, + "step": 7996, + "time_per_iteration": 2.5223636627197266 + }, + { + "auxiliary_loss_clip": 0.06437848, + "auxiliary_loss_mlp": 0.01273993, + "balance_loss_clip": 0.06283682, + "balance_loss_mlp": 0.0126104, + "epoch": 0.4808056515857508, + "flos": 12280850611200.0, + "grad_norm": 2.0553444119055095, + "language_loss": 0.81048906, + "learning_rate": 2.220956997340516e-06, + "loss": 0.88760751, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12957764, + "step": 7997, + "time_per_iteration": 2.5387723445892334 + }, + { + "auxiliary_loss_clip": 0.06439243, + "auxiliary_loss_mlp": 0.01272881, + "balance_loss_clip": 0.06278609, + "balance_loss_mlp": 0.01258886, + "epoch": 0.48086577483841875, + "flos": 24832835435520.0, + "grad_norm": 1.673774189345091, + "language_loss": 0.72584945, + "learning_rate": 2.220569915556221e-06, + "loss": 0.80297071, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.13989258, + "step": 7998, + "time_per_iteration": 2.5332131385803223 + }, + { + "auxiliary_loss_clip": 0.06438513, + "auxiliary_loss_mlp": 0.0127211, + "balance_loss_clip": 0.06282588, + "balance_loss_mlp": 0.01258931, + "epoch": 0.4809258980910867, + "flos": 24472786440960.0, + "grad_norm": 1.7584112558628078, + "language_loss": 0.71207035, + "learning_rate": 2.220182825407892e-06, + "loss": 0.78917658, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1317749, + "step": 7999, + "time_per_iteration": 2.5675172805786133 + }, + { + "auxiliary_loss_clip": 0.06447413, + "auxiliary_loss_mlp": 0.01268559, + "balance_loss_clip": 0.06285158, + "balance_loss_mlp": 0.01254581, + "epoch": 0.4809860213437547, + "flos": 21222465436800.0, + "grad_norm": 1.5803850534596136, + "language_loss": 0.71622467, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.79338437, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.13983154, + "step": 8000, + "time_per_iteration": 4.0574305057525635 + }, + { + "auxiliary_loss_clip": 0.06440975, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 0.06282955, + "balance_loss_mlp": 0.01253558, + "epoch": 0.48104614459642264, + "flos": 37640929864320.0, + "grad_norm": 1.3783876991224597, + "language_loss": 0.75060636, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.82768357, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.13189697, + "step": 8001, + "time_per_iteration": 2.6750619411468506 + }, + { + "auxiliary_loss_clip": 0.06444116, + "auxiliary_loss_mlp": 0.01269598, + "balance_loss_clip": 0.06285578, + "balance_loss_mlp": 0.0125667, + "epoch": 0.48110626784909066, + "flos": 18412093895040.0, + "grad_norm": 3.3850625220280066, + "language_loss": 0.81721932, + "learning_rate": 2.219021504925493e-06, + "loss": 0.89435649, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.12921143, + "step": 8002, + "time_per_iteration": 2.537611961364746 + }, + { + "auxiliary_loss_clip": 0.06444092, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06282309, + "balance_loss_mlp": 0.0125232, + "epoch": 0.48116639110175863, + "flos": 28447481992320.0, + "grad_norm": 1.6717054522334394, + "language_loss": 0.71586967, + "learning_rate": 2.218634381467819e-06, + "loss": 0.79297119, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13739014, + "step": 8003, + "time_per_iteration": 2.586836576461792 + }, + { + "auxiliary_loss_clip": 0.06435338, + "auxiliary_loss_mlp": 0.01268946, + "balance_loss_clip": 0.0628237, + "balance_loss_mlp": 0.01256375, + "epoch": 0.4812265143544266, + "flos": 21731582044800.0, + "grad_norm": 1.5740971137450945, + "language_loss": 0.82286322, + "learning_rate": 2.218247249719507e-06, + "loss": 0.89990604, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12561035, + "step": 8004, + "time_per_iteration": 2.5606155395507812 + }, + { + "auxiliary_loss_clip": 0.06454347, + "auxiliary_loss_mlp": 0.01272857, + "balance_loss_clip": 0.06285338, + "balance_loss_mlp": 0.01258004, + "epoch": 0.48128663760709456, + "flos": 13229707046400.0, + "grad_norm": 2.0390359670143465, + "language_loss": 0.77871376, + "learning_rate": 2.217860109695239e-06, + "loss": 0.85598582, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.14837646, + "step": 8005, + "time_per_iteration": 2.47816801071167 + }, + { + "auxiliary_loss_clip": 0.06444031, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06283107, + "balance_loss_mlp": 0.01252902, + "epoch": 0.4813467608597625, + "flos": 24250317050880.0, + "grad_norm": 8.997763816911675, + "language_loss": 0.71145892, + "learning_rate": 2.217472961409692e-06, + "loss": 0.78855699, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.12866211, + "step": 8006, + "time_per_iteration": 3.998465061187744 + }, + { + "auxiliary_loss_clip": 0.06443979, + "auxiliary_loss_mlp": 0.0126724, + "balance_loss_clip": 0.06283164, + "balance_loss_mlp": 0.01253502, + "epoch": 0.4814068841124305, + "flos": 27486131299200.0, + "grad_norm": 1.774717747938, + "language_loss": 0.7057631, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.78287524, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13726807, + "step": 8007, + "time_per_iteration": 2.6010959148406982 + }, + { + "auxiliary_loss_clip": 0.06445048, + "auxiliary_loss_mlp": 0.01270091, + "balance_loss_clip": 0.06283326, + "balance_loss_mlp": 0.01256382, + "epoch": 0.48146700736509845, + "flos": 19578933527040.0, + "grad_norm": 1.7543289086675633, + "language_loss": 0.72215438, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.79930574, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.137146, + "step": 8008, + "time_per_iteration": 2.5119597911834717 + }, + { + "auxiliary_loss_clip": 0.064485, + "auxiliary_loss_mlp": 0.01270116, + "balance_loss_clip": 0.06287649, + "balance_loss_mlp": 0.01256699, + "epoch": 0.4815271306177664, + "flos": 20633448360960.0, + "grad_norm": 2.3493781090087427, + "language_loss": 0.61680824, + "learning_rate": 2.216311467132199e-06, + "loss": 0.6939944, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.13421631, + "step": 8009, + "time_per_iteration": 2.531614303588867 + }, + { + "auxiliary_loss_clip": 0.06337314, + "auxiliary_loss_mlp": 0.01256915, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01254566, + "epoch": 0.4815872538704344, + "flos": 67710168904320.0, + "grad_norm": 0.8824544242806498, + "language_loss": 0.61164761, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.68758988, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.0234375, + "step": 8010, + "time_per_iteration": 3.1565909385681152 + }, + { + "auxiliary_loss_clip": 0.06445675, + "auxiliary_loss_mlp": 0.01270127, + "balance_loss_clip": 0.06287005, + "balance_loss_mlp": 0.01256364, + "epoch": 0.48164737712310235, + "flos": 22827451668480.0, + "grad_norm": 1.6746394307020662, + "language_loss": 0.73637664, + "learning_rate": 2.215537096576639e-06, + "loss": 0.81353462, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1373291, + "step": 8011, + "time_per_iteration": 2.6046555042266846 + }, + { + "auxiliary_loss_clip": 0.0643819, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 0.06284268, + "balance_loss_mlp": 0.01257887, + "epoch": 0.4817075003757703, + "flos": 23740865026560.0, + "grad_norm": 1.8215201759984196, + "language_loss": 0.79494172, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.87202752, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.125, + "step": 8012, + "time_per_iteration": 2.5538861751556396 + }, + { + "auxiliary_loss_clip": 0.06444636, + "auxiliary_loss_mlp": 0.0127321, + "balance_loss_clip": 0.0628611, + "balance_loss_mlp": 0.01259501, + "epoch": 0.4817676236284383, + "flos": 28190282284800.0, + "grad_norm": 1.6047815948624113, + "language_loss": 0.73606604, + "learning_rate": 2.214762693328326e-06, + "loss": 0.81324452, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1373291, + "step": 8013, + "time_per_iteration": 2.6944220066070557 + }, + { + "auxiliary_loss_clip": 0.06441531, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06285915, + "balance_loss_mlp": 0.01253094, + "epoch": 0.48182774688110624, + "flos": 17097360606720.0, + "grad_norm": 1.8755216355849496, + "language_loss": 0.91141838, + "learning_rate": 2.214375479481094e-06, + "loss": 0.98848319, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.11859131, + "step": 8014, + "time_per_iteration": 2.501678466796875 + }, + { + "auxiliary_loss_clip": 0.06448989, + "auxiliary_loss_mlp": 0.0126993, + "balance_loss_clip": 0.06285382, + "balance_loss_mlp": 0.01256149, + "epoch": 0.4818878701337742, + "flos": 12572780636160.0, + "grad_norm": 2.068904383285823, + "language_loss": 0.75191212, + "learning_rate": 2.213988257504722e-06, + "loss": 0.82910132, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.13775635, + "step": 8015, + "time_per_iteration": 2.574915885925293 + }, + { + "auxiliary_loss_clip": 0.06450102, + "auxiliary_loss_mlp": 0.01268556, + "balance_loss_clip": 0.06285062, + "balance_loss_mlp": 0.01254942, + "epoch": 0.48194799338644223, + "flos": 24615481144320.0, + "grad_norm": 2.7940595212226693, + "language_loss": 0.80323374, + "learning_rate": 2.213601027413894e-06, + "loss": 0.88042033, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.13604736, + "step": 8016, + "time_per_iteration": 2.545562744140625 + }, + { + "auxiliary_loss_clip": 0.06441234, + "auxiliary_loss_mlp": 0.01268233, + "balance_loss_clip": 0.06288698, + "balance_loss_mlp": 0.01255996, + "epoch": 0.4820081166391102, + "flos": 21111482304000.0, + "grad_norm": 1.7856263642868424, + "language_loss": 0.77840865, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.85550332, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12237549, + "step": 8017, + "time_per_iteration": 2.548884153366089 + }, + { + "auxiliary_loss_clip": 0.06442289, + "auxiliary_loss_mlp": 0.01274842, + "balance_loss_clip": 0.06287417, + "balance_loss_mlp": 0.01261729, + "epoch": 0.48206823989177816, + "flos": 25271569013760.0, + "grad_norm": 1.8858588216369734, + "language_loss": 0.80356038, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8807317, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13098145, + "step": 8018, + "time_per_iteration": 2.5485877990722656 + }, + { + "auxiliary_loss_clip": 0.06443836, + "auxiliary_loss_mlp": 0.01268171, + "balance_loss_clip": 0.06283845, + "balance_loss_mlp": 0.01255177, + "epoch": 0.4821283631444461, + "flos": 24652056251520.0, + "grad_norm": 1.8013341989070415, + "language_loss": 0.76402384, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.84114391, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.12988281, + "step": 8019, + "time_per_iteration": 2.583380937576294 + }, + { + "auxiliary_loss_clip": 0.06444359, + "auxiliary_loss_mlp": 0.01271658, + "balance_loss_clip": 0.06285813, + "balance_loss_mlp": 0.01258826, + "epoch": 0.4821884863971141, + "flos": 23959015931520.0, + "grad_norm": 1.6800720935629156, + "language_loss": 0.79355383, + "learning_rate": 2.212052026199701e-06, + "loss": 0.87071395, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12841797, + "step": 8020, + "time_per_iteration": 2.531282663345337 + }, + { + "auxiliary_loss_clip": 0.06436829, + "auxiliary_loss_mlp": 0.01270595, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01257655, + "epoch": 0.48224860964978206, + "flos": 17165605357440.0, + "grad_norm": 1.8962985695511603, + "language_loss": 0.70203435, + "learning_rate": 2.211664755756855e-06, + "loss": 0.77910858, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12945557, + "step": 8021, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.06448636, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06284462, + "balance_loss_mlp": 0.01253568, + "epoch": 0.48230873290245, + "flos": 23082513096960.0, + "grad_norm": 1.8444275684859448, + "language_loss": 0.63131356, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.70847559, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.14001465, + "step": 8022, + "time_per_iteration": 2.5153286457061768 + }, + { + "auxiliary_loss_clip": 0.06439438, + "auxiliary_loss_mlp": 0.0127221, + "balance_loss_clip": 0.06284659, + "balance_loss_mlp": 0.01259544, + "epoch": 0.482368856155118, + "flos": 19359440956800.0, + "grad_norm": 2.0552590280374625, + "language_loss": 0.67256629, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.74968272, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8023, + "time_per_iteration": 2.5504207611083984 + }, + { + "auxiliary_loss_clip": 0.06441902, + "auxiliary_loss_mlp": 0.01274331, + "balance_loss_clip": 0.06284256, + "balance_loss_mlp": 0.01261426, + "epoch": 0.48242897940778595, + "flos": 20084318628480.0, + "grad_norm": 1.5610336564699971, + "language_loss": 0.76933229, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.84649462, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.12915039, + "step": 8024, + "time_per_iteration": 2.576347589492798 + }, + { + "auxiliary_loss_clip": 0.06441621, + "auxiliary_loss_mlp": 0.01268624, + "balance_loss_clip": 0.06283119, + "balance_loss_mlp": 0.01255553, + "epoch": 0.4824891026604539, + "flos": 23410682812800.0, + "grad_norm": 1.519749434932375, + "language_loss": 0.75555682, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.83265924, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13067627, + "step": 8025, + "time_per_iteration": 2.559722900390625 + }, + { + "auxiliary_loss_clip": 0.06445173, + "auxiliary_loss_mlp": 0.01270078, + "balance_loss_clip": 0.06286605, + "balance_loss_mlp": 0.01256536, + "epoch": 0.4825492259131219, + "flos": 20373691104000.0, + "grad_norm": 3.210842824131336, + "language_loss": 0.71099132, + "learning_rate": 2.209728283441112e-06, + "loss": 0.78814387, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.13543701, + "step": 8026, + "time_per_iteration": 2.512563943862915 + }, + { + "auxiliary_loss_clip": 0.06450065, + "auxiliary_loss_mlp": 0.0127128, + "balance_loss_clip": 0.06287996, + "balance_loss_mlp": 0.01257094, + "epoch": 0.48260934916578985, + "flos": 14324193077760.0, + "grad_norm": 2.0787728376845385, + "language_loss": 0.74646676, + "learning_rate": 2.209340965060465e-06, + "loss": 0.82368022, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.14190674, + "step": 8027, + "time_per_iteration": 2.523252248764038 + }, + { + "auxiliary_loss_clip": 0.06445143, + "auxiliary_loss_mlp": 0.01269951, + "balance_loss_clip": 0.06285772, + "balance_loss_mlp": 0.01257166, + "epoch": 0.4826694724184578, + "flos": 22126654846080.0, + "grad_norm": 1.6924958309049165, + "language_loss": 0.67414463, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.75129557, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12792969, + "step": 8028, + "time_per_iteration": 2.5118508338928223 + }, + { + "auxiliary_loss_clip": 0.06443746, + "auxiliary_loss_mlp": 0.01268069, + "balance_loss_clip": 0.06285068, + "balance_loss_mlp": 0.01254926, + "epoch": 0.48272959567112583, + "flos": 16186882141440.0, + "grad_norm": 1.4109383431826554, + "language_loss": 0.73031461, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.80743277, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13134766, + "step": 8029, + "time_per_iteration": 2.513986587524414 + }, + { + "auxiliary_loss_clip": 0.06447576, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0628765, + "balance_loss_mlp": 0.01253755, + "epoch": 0.4827897189237938, + "flos": 23186326705920.0, + "grad_norm": 2.2851559020013994, + "language_loss": 0.84759653, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.92474234, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.13256836, + "step": 8030, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.0644383, + "auxiliary_loss_mlp": 0.0126632, + "balance_loss_clip": 0.06286349, + "balance_loss_mlp": 0.01253374, + "epoch": 0.48284984217646176, + "flos": 21659018808960.0, + "grad_norm": 2.6563677126547858, + "language_loss": 0.73703504, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.81413656, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.12945557, + "step": 8031, + "time_per_iteration": 2.523465633392334 + }, + { + "auxiliary_loss_clip": 0.06451262, + "auxiliary_loss_mlp": 0.01268996, + "balance_loss_clip": 0.06285872, + "balance_loss_mlp": 0.01254548, + "epoch": 0.48290996542912973, + "flos": 31475501314560.0, + "grad_norm": 1.5957405541522132, + "language_loss": 0.71345282, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.79065537, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.14434814, + "step": 8032, + "time_per_iteration": 4.084775924682617 + }, + { + "auxiliary_loss_clip": 0.06441716, + "auxiliary_loss_mlp": 0.01271696, + "balance_loss_clip": 0.06285156, + "balance_loss_mlp": 0.01259066, + "epoch": 0.4829700886817977, + "flos": 24468803372160.0, + "grad_norm": 1.3669631944631024, + "language_loss": 0.74361598, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.82075012, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12609863, + "step": 8033, + "time_per_iteration": 2.558655023574829 + }, + { + "auxiliary_loss_clip": 0.06455428, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06290704, + "balance_loss_mlp": 0.01255436, + "epoch": 0.48303021193446566, + "flos": 25709170561920.0, + "grad_norm": 1.5251236339326817, + "language_loss": 0.83579373, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.91304129, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.13909912, + "step": 8034, + "time_per_iteration": 4.034566402435303 + }, + { + "auxiliary_loss_clip": 0.06441804, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06286483, + "balance_loss_mlp": 0.01255162, + "epoch": 0.4830903351871336, + "flos": 20091613933440.0, + "grad_norm": 1.4995747649605073, + "language_loss": 0.80011666, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.87720799, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12176514, + "step": 8035, + "time_per_iteration": 2.560216188430786 + }, + { + "auxiliary_loss_clip": 0.06443267, + "auxiliary_loss_mlp": 0.01272391, + "balance_loss_clip": 0.06284694, + "balance_loss_mlp": 0.01257996, + "epoch": 0.4831504584398016, + "flos": 39460670910720.0, + "grad_norm": 2.4180718513556196, + "language_loss": 0.69735384, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.77451038, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.14385986, + "step": 8036, + "time_per_iteration": 2.676248550415039 + }, + { + "auxiliary_loss_clip": 0.06441773, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06283154, + "balance_loss_mlp": 0.01254638, + "epoch": 0.48321058169246955, + "flos": 20012006954880.0, + "grad_norm": 1.964916404489229, + "language_loss": 0.7269727, + "learning_rate": 2.205467347074847e-06, + "loss": 0.80406225, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.12536621, + "step": 8037, + "time_per_iteration": 2.5361721515655518 + }, + { + "auxiliary_loss_clip": 0.06449978, + "auxiliary_loss_mlp": 0.01267952, + "balance_loss_clip": 0.06284893, + "balance_loss_mlp": 0.01254594, + "epoch": 0.4832707049451375, + "flos": 20747869511040.0, + "grad_norm": 2.294242093364334, + "language_loss": 0.69135344, + "learning_rate": 2.205079942181525e-06, + "loss": 0.76853275, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.13366699, + "step": 8038, + "time_per_iteration": 2.5300488471984863 + }, + { + "auxiliary_loss_clip": 0.06441218, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06284897, + "balance_loss_mlp": 0.01253161, + "epoch": 0.4833308281978055, + "flos": 33153889322880.0, + "grad_norm": 1.5080177559172256, + "language_loss": 0.79238868, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8694644, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13201904, + "step": 8039, + "time_per_iteration": 4.106697082519531 + }, + { + "auxiliary_loss_clip": 0.06443603, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06284612, + "balance_loss_mlp": 0.01257221, + "epoch": 0.48339095145047345, + "flos": 19105301923200.0, + "grad_norm": 2.5245127885531926, + "language_loss": 0.78196943, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.85910785, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.13024902, + "step": 8040, + "time_per_iteration": 2.51356840133667 + }, + { + "auxiliary_loss_clip": 0.06449578, + "auxiliary_loss_mlp": 0.01268689, + "balance_loss_clip": 0.06287356, + "balance_loss_mlp": 0.01254342, + "epoch": 0.4834510747031414, + "flos": 34468035632640.0, + "grad_norm": 1.5686841461958603, + "language_loss": 0.75648201, + "learning_rate": 2.203917680900409e-06, + "loss": 0.83366466, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.14337158, + "step": 8041, + "time_per_iteration": 2.6821110248565674 + }, + { + "auxiliary_loss_clip": 0.06444554, + "auxiliary_loss_mlp": 0.01274011, + "balance_loss_clip": 0.06290209, + "balance_loss_mlp": 0.01261244, + "epoch": 0.48351119795580944, + "flos": 27388187475840.0, + "grad_norm": 1.655786729526556, + "language_loss": 0.66309774, + "learning_rate": 2.203530244988624e-06, + "loss": 0.74028337, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12756348, + "step": 8042, + "time_per_iteration": 2.587979316711426 + }, + { + "auxiliary_loss_clip": 0.0635567, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06287327, + "balance_loss_mlp": 0.012603, + "epoch": 0.4835713212084774, + "flos": 67162967815680.0, + "grad_norm": 0.683297043643475, + "language_loss": 0.58432257, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.66050708, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.02485657, + "step": 8043, + "time_per_iteration": 3.240037441253662 + }, + { + "auxiliary_loss_clip": 0.06448962, + "auxiliary_loss_mlp": 0.01270561, + "balance_loss_clip": 0.06288527, + "balance_loss_mlp": 0.01256548, + "epoch": 0.48363144446114537, + "flos": 17973234535680.0, + "grad_norm": 8.666689726695457, + "language_loss": 0.71932065, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.79651588, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.14013672, + "step": 8044, + "time_per_iteration": 2.557222604751587 + }, + { + "auxiliary_loss_clip": 0.06443186, + "auxiliary_loss_mlp": 0.01271215, + "balance_loss_clip": 0.06287612, + "balance_loss_mlp": 0.01257667, + "epoch": 0.48369156771381333, + "flos": 20599556584320.0, + "grad_norm": 1.2792089170093015, + "language_loss": 0.76084363, + "learning_rate": 2.202367891004714e-06, + "loss": 0.83798766, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13549805, + "step": 8045, + "time_per_iteration": 3.9927117824554443 + }, + { + "auxiliary_loss_clip": 0.06452677, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06291251, + "balance_loss_mlp": 0.01255274, + "epoch": 0.4837516909664813, + "flos": 22681780145280.0, + "grad_norm": 1.8159113209886955, + "language_loss": 0.69591677, + "learning_rate": 2.201980424309533e-06, + "loss": 0.77312469, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.12854004, + "step": 8046, + "time_per_iteration": 2.563061237335205 + }, + { + "auxiliary_loss_clip": 0.06444287, + "auxiliary_loss_mlp": 0.01272531, + "balance_loss_clip": 0.06285235, + "balance_loss_mlp": 0.01259674, + "epoch": 0.48381181421914926, + "flos": 25525414558080.0, + "grad_norm": 1.7918831202662233, + "language_loss": 0.83005214, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.90722024, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.12866211, + "step": 8047, + "time_per_iteration": 2.5624239444732666 + }, + { + "auxiliary_loss_clip": 0.06441472, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06286557, + "balance_loss_mlp": 0.01255522, + "epoch": 0.4838719374718172, + "flos": 24214454703360.0, + "grad_norm": 3.8503425220093273, + "language_loss": 0.8051095, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.88220614, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12664795, + "step": 8048, + "time_per_iteration": 2.5535151958465576 + }, + { + "auxiliary_loss_clip": 0.06450336, + "auxiliary_loss_mlp": 0.01269587, + "balance_loss_clip": 0.06287669, + "balance_loss_mlp": 0.01255091, + "epoch": 0.4839320607244852, + "flos": 26731889971200.0, + "grad_norm": 1.601579819484506, + "language_loss": 0.8118276, + "learning_rate": 2.200817978328054e-06, + "loss": 0.88902682, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.14477539, + "step": 8049, + "time_per_iteration": 2.576237440109253 + }, + { + "auxiliary_loss_clip": 0.0644124, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.0628837, + "balance_loss_mlp": 0.01254392, + "epoch": 0.48399218397715316, + "flos": 20455142872320.0, + "grad_norm": 1.6782620987313854, + "language_loss": 0.7275942, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.8046689, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.1184082, + "step": 8050, + "time_per_iteration": 2.5001842975616455 + }, + { + "auxiliary_loss_clip": 0.06348944, + "auxiliary_loss_mlp": 0.01254327, + "balance_loss_clip": 0.06280461, + "balance_loss_mlp": 0.01252052, + "epoch": 0.4840523072298211, + "flos": 67199626776960.0, + "grad_norm": 0.6876828937687306, + "language_loss": 0.56319511, + "learning_rate": 2.200042976240723e-06, + "loss": 0.63922787, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.02278137, + "step": 8051, + "time_per_iteration": 3.1732234954833984 + }, + { + "auxiliary_loss_clip": 0.06445932, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06285888, + "balance_loss_mlp": 0.01254806, + "epoch": 0.4841124304824891, + "flos": 22416782008320.0, + "grad_norm": 1.9466323687223244, + "language_loss": 0.75329518, + "learning_rate": 2.199655463811236e-06, + "loss": 0.83042824, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.12554932, + "step": 8052, + "time_per_iteration": 2.525742769241333 + }, + { + "auxiliary_loss_clip": 0.06445011, + "auxiliary_loss_mlp": 0.01268398, + "balance_loss_clip": 0.0628748, + "balance_loss_mlp": 0.01255797, + "epoch": 0.48417255373515705, + "flos": 13848926319360.0, + "grad_norm": 9.22847684329053, + "language_loss": 0.65932119, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.73645532, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1260376, + "step": 8053, + "time_per_iteration": 2.508634328842163 + }, + { + "auxiliary_loss_clip": 0.06439514, + "auxiliary_loss_mlp": 0.01270848, + "balance_loss_clip": 0.06286003, + "balance_loss_mlp": 0.01258242, + "epoch": 0.484232676987825, + "flos": 31657747944960.0, + "grad_norm": 1.9001102819500506, + "language_loss": 0.69764733, + "learning_rate": 2.198880416254091e-06, + "loss": 0.77475095, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12609863, + "step": 8054, + "time_per_iteration": 2.6046009063720703 + }, + { + "auxiliary_loss_clip": 0.06439343, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.062842, + "balance_loss_mlp": 0.01253578, + "epoch": 0.48429280024049304, + "flos": 24101878343040.0, + "grad_norm": 1.6288967613161636, + "language_loss": 0.69845426, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.77551031, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12683105, + "step": 8055, + "time_per_iteration": 2.5645036697387695 + }, + { + "auxiliary_loss_clip": 0.06441051, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06283379, + "balance_loss_mlp": 0.01257621, + "epoch": 0.484352923493161, + "flos": 17535842622720.0, + "grad_norm": 2.1100630556312256, + "language_loss": 0.63363564, + "learning_rate": 2.198105338530685e-06, + "loss": 0.71074814, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12597656, + "step": 8056, + "time_per_iteration": 2.4887776374816895 + }, + { + "auxiliary_loss_clip": 0.06441829, + "auxiliary_loss_mlp": 0.01269551, + "balance_loss_clip": 0.06283918, + "balance_loss_mlp": 0.0125639, + "epoch": 0.48441304674582897, + "flos": 29174204453760.0, + "grad_norm": 1.7583270452203597, + "language_loss": 0.67791545, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.75502926, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1315918, + "step": 8057, + "time_per_iteration": 2.6147687435150146 + }, + { + "auxiliary_loss_clip": 0.06438136, + "auxiliary_loss_mlp": 0.01270959, + "balance_loss_clip": 0.06284122, + "balance_loss_mlp": 0.0125933, + "epoch": 0.48447316999849693, + "flos": 15891933369600.0, + "grad_norm": 1.7129310149903716, + "language_loss": 0.81615114, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.89324206, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11627197, + "step": 8058, + "time_per_iteration": 2.499464273452759 + }, + { + "auxiliary_loss_clip": 0.06444308, + "auxiliary_loss_mlp": 0.01272607, + "balance_loss_clip": 0.06283933, + "balance_loss_mlp": 0.01259619, + "epoch": 0.4845332932511649, + "flos": 24386974260480.0, + "grad_norm": 1.694669299967896, + "language_loss": 0.79782939, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.87499857, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.12988281, + "step": 8059, + "time_per_iteration": 2.5456764698028564 + }, + { + "auxiliary_loss_clip": 0.06445169, + "auxiliary_loss_mlp": 0.0126972, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01256065, + "epoch": 0.48459341650383286, + "flos": 37124434097280.0, + "grad_norm": 2.171534570518566, + "language_loss": 0.67115712, + "learning_rate": 2.196555093055352e-06, + "loss": 0.74830604, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13659668, + "step": 8060, + "time_per_iteration": 2.639552593231201 + }, + { + "auxiliary_loss_clip": 0.06448266, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06291284, + "balance_loss_mlp": 0.01255404, + "epoch": 0.48465353975650083, + "flos": 22973500535040.0, + "grad_norm": 1.9145476252385885, + "language_loss": 0.67691833, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.75407994, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12506104, + "step": 8061, + "time_per_iteration": 2.636291265487671 + }, + { + "auxiliary_loss_clip": 0.06440581, + "auxiliary_loss_mlp": 0.012731, + "balance_loss_clip": 0.06285343, + "balance_loss_mlp": 0.01259581, + "epoch": 0.4847136630091688, + "flos": 17712680664960.0, + "grad_norm": 1.8103717294603696, + "language_loss": 0.83217871, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.90931553, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13531494, + "step": 8062, + "time_per_iteration": 2.5335779190063477 + }, + { + "auxiliary_loss_clip": 0.06441268, + "auxiliary_loss_mlp": 0.01271147, + "balance_loss_clip": 0.06286018, + "balance_loss_mlp": 0.01259077, + "epoch": 0.48477378626183676, + "flos": 22024853735040.0, + "grad_norm": 1.4198166357723545, + "language_loss": 0.74425852, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.82138264, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1206665, + "step": 8063, + "time_per_iteration": 2.575752019882202 + }, + { + "auxiliary_loss_clip": 0.06438752, + "auxiliary_loss_mlp": 0.01268531, + "balance_loss_clip": 0.06282612, + "balance_loss_mlp": 0.01256276, + "epoch": 0.4848339095145047, + "flos": 27970118881920.0, + "grad_norm": 1.5830553745787852, + "language_loss": 0.79034185, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.86741465, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12255859, + "step": 8064, + "time_per_iteration": 2.601557731628418 + }, + { + "auxiliary_loss_clip": 0.06441826, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06290108, + "balance_loss_mlp": 0.01254817, + "epoch": 0.4848940327671727, + "flos": 21695090791680.0, + "grad_norm": 1.71958305783472, + "language_loss": 0.795892, + "learning_rate": 2.194617118620173e-06, + "loss": 0.87297779, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1194458, + "step": 8065, + "time_per_iteration": 2.5325217247009277 + }, + { + "auxiliary_loss_clip": 0.06434904, + "auxiliary_loss_mlp": 0.0126868, + "balance_loss_clip": 0.06285697, + "balance_loss_mlp": 0.01256813, + "epoch": 0.48495415601984065, + "flos": 20637892627200.0, + "grad_norm": 1.7068711802888106, + "language_loss": 0.76162863, + "learning_rate": 2.194229501534644e-06, + "loss": 0.83866447, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11865234, + "step": 8066, + "time_per_iteration": 2.506598949432373 + }, + { + "auxiliary_loss_clip": 0.06438506, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06285724, + "balance_loss_mlp": 0.01257375, + "epoch": 0.4850142792725086, + "flos": 25634972171520.0, + "grad_norm": 1.302389197624331, + "language_loss": 0.72176784, + "learning_rate": 2.193841877083912e-06, + "loss": 0.79884112, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11444092, + "step": 8067, + "time_per_iteration": 2.5921640396118164 + }, + { + "auxiliary_loss_clip": 0.06438944, + "auxiliary_loss_mlp": 0.01268187, + "balance_loss_clip": 0.06282091, + "balance_loss_mlp": 0.01255986, + "epoch": 0.4850744025251766, + "flos": 13777075843200.0, + "grad_norm": 2.2825284137915975, + "language_loss": 0.79257572, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.86964703, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12219238, + "step": 8068, + "time_per_iteration": 2.5287444591522217 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01268403, + "balance_loss_clip": 0.06280828, + "balance_loss_mlp": 0.012565, + "epoch": 0.4851345257778446, + "flos": 20266691040000.0, + "grad_norm": 1.4034205816126453, + "language_loss": 0.84740359, + "learning_rate": 2.193066606145638e-06, + "loss": 0.92444146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.11901855, + "step": 8069, + "time_per_iteration": 2.548593044281006 + }, + { + "auxiliary_loss_clip": 0.06435016, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01254763, + "epoch": 0.48519464903051257, + "flos": 27097095991680.0, + "grad_norm": 1.771109080244907, + "language_loss": 0.78544027, + "learning_rate": 2.192678959687493e-06, + "loss": 0.86245352, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.11553955, + "step": 8070, + "time_per_iteration": 2.581026315689087 + }, + { + "auxiliary_loss_clip": 0.06432221, + "auxiliary_loss_mlp": 0.01268982, + "balance_loss_clip": 0.06279641, + "balance_loss_mlp": 0.01256239, + "epoch": 0.48525477228318054, + "flos": 17132677902720.0, + "grad_norm": 3.597843949572919, + "language_loss": 0.77929389, + "learning_rate": 2.192291305922943e-06, + "loss": 0.85630596, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12756348, + "step": 8071, + "time_per_iteration": 3.963555335998535 + }, + { + "auxiliary_loss_clip": 0.06438918, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282261, + "balance_loss_mlp": 0.01256777, + "epoch": 0.4853148955358485, + "flos": 28187263537920.0, + "grad_norm": 2.115731418126265, + "language_loss": 0.72008896, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7971788, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13299561, + "step": 8072, + "time_per_iteration": 2.6861536502838135 + }, + { + "auxiliary_loss_clip": 0.06439583, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.06282715, + "balance_loss_mlp": 0.01253761, + "epoch": 0.48537501878851647, + "flos": 17499015953280.0, + "grad_norm": 1.8999559951356444, + "language_loss": 0.88288134, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.95994508, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8073, + "time_per_iteration": 2.4814834594726562 + }, + { + "auxiliary_loss_clip": 0.06432822, + "auxiliary_loss_mlp": 0.01269151, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01257731, + "epoch": 0.48543514204118443, + "flos": 28592398828800.0, + "grad_norm": 2.458004055687259, + "language_loss": 0.61317194, + "learning_rate": 2.19112830093786e-06, + "loss": 0.69019163, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11413574, + "step": 8074, + "time_per_iteration": 3.984229326248169 + }, + { + "auxiliary_loss_clip": 0.06435922, + "auxiliary_loss_mlp": 0.01265981, + "balance_loss_clip": 0.0627804, + "balance_loss_mlp": 0.01254024, + "epoch": 0.4854952652938524, + "flos": 20966355832320.0, + "grad_norm": 1.641968552330247, + "language_loss": 0.73514569, + "learning_rate": 2.19074061809469e-06, + "loss": 0.81216466, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.11962891, + "step": 8075, + "time_per_iteration": 2.5479941368103027 + }, + { + "auxiliary_loss_clip": 0.06429431, + "auxiliary_loss_mlp": 0.01268393, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01256704, + "epoch": 0.48555538854652036, + "flos": 66543344000640.0, + "grad_norm": 1.7202852105657789, + "language_loss": 0.81976241, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.89674067, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11676025, + "step": 8076, + "time_per_iteration": 2.9675233364105225 + }, + { + "auxiliary_loss_clip": 0.06435271, + "auxiliary_loss_mlp": 0.01273017, + "balance_loss_clip": 0.06280246, + "balance_loss_mlp": 0.01259242, + "epoch": 0.4856155117991883, + "flos": 15930520974720.0, + "grad_norm": 1.9409864090603182, + "language_loss": 0.86392474, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.94100761, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13781738, + "step": 8077, + "time_per_iteration": 2.5062685012817383 + }, + { + "auxiliary_loss_clip": 0.06325787, + "auxiliary_loss_mlp": 0.01252172, + "balance_loss_clip": 0.062584, + "balance_loss_mlp": 0.0125022, + "epoch": 0.4856756350518563, + "flos": 71066986848000.0, + "grad_norm": 0.9289783803731909, + "language_loss": 0.58378243, + "learning_rate": 2.189577526226564e-06, + "loss": 0.65956199, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01950073, + "step": 8078, + "time_per_iteration": 4.502991199493408 + }, + { + "auxiliary_loss_clip": 0.06440585, + "auxiliary_loss_mlp": 0.01268963, + "balance_loss_clip": 0.06280588, + "balance_loss_mlp": 0.01255886, + "epoch": 0.48573575830452426, + "flos": 29833478778240.0, + "grad_norm": 2.317528327629363, + "language_loss": 0.72874224, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.80583775, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1307373, + "step": 8079, + "time_per_iteration": 2.5839955806732178 + }, + { + "auxiliary_loss_clip": 0.06440279, + "auxiliary_loss_mlp": 0.01268912, + "balance_loss_clip": 0.06283288, + "balance_loss_mlp": 0.01256925, + "epoch": 0.4857958815571922, + "flos": 17645274455040.0, + "grad_norm": 2.8950752184508843, + "language_loss": 0.80285943, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.87995136, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.11999512, + "step": 8080, + "time_per_iteration": 2.542607307434082 + }, + { + "auxiliary_loss_clip": 0.06436758, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06281016, + "balance_loss_mlp": 0.01252754, + "epoch": 0.4858560048098602, + "flos": 21111817720320.0, + "grad_norm": 1.934060586134842, + "language_loss": 0.84237295, + "learning_rate": 2.188414369659251e-06, + "loss": 0.9193939, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12579346, + "step": 8081, + "time_per_iteration": 2.523787021636963 + }, + { + "auxiliary_loss_clip": 0.06433021, + "auxiliary_loss_mlp": 0.01268596, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.0125512, + "epoch": 0.4859161280625282, + "flos": 22097375043840.0, + "grad_norm": 1.530246142437005, + "language_loss": 0.83824933, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.91526556, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13464355, + "step": 8082, + "time_per_iteration": 2.562739372253418 + }, + { + "auxiliary_loss_clip": 0.0643435, + "auxiliary_loss_mlp": 0.01268115, + "balance_loss_clip": 0.06283809, + "balance_loss_mlp": 0.01255849, + "epoch": 0.4859762513151962, + "flos": 17499183661440.0, + "grad_norm": 1.9064651850671037, + "language_loss": 0.87366831, + "learning_rate": 2.187638896199746e-06, + "loss": 0.95069289, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12268066, + "step": 8083, + "time_per_iteration": 2.5062954425811768 + }, + { + "auxiliary_loss_clip": 0.064337, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281679, + "balance_loss_mlp": 0.01255356, + "epoch": 0.48603637456786414, + "flos": 18010061205120.0, + "grad_norm": 1.6184381568123027, + "language_loss": 0.81531483, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.89233649, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.13110352, + "step": 8084, + "time_per_iteration": 3.9548635482788086 + }, + { + "auxiliary_loss_clip": 0.06438272, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06283273, + "balance_loss_mlp": 0.01256645, + "epoch": 0.4860964978205321, + "flos": 22498611120000.0, + "grad_norm": 1.8856401579659385, + "language_loss": 0.68814772, + "learning_rate": 2.186863394279098e-06, + "loss": 0.76522183, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12475586, + "step": 8085, + "time_per_iteration": 2.525697708129883 + }, + { + "auxiliary_loss_clip": 0.06434157, + "auxiliary_loss_mlp": 0.01270175, + "balance_loss_clip": 0.0627964, + "balance_loss_mlp": 0.01257158, + "epoch": 0.48615662107320007, + "flos": 23380061345280.0, + "grad_norm": 1.4159205206948002, + "language_loss": 0.77895916, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.85600245, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13024902, + "step": 8086, + "time_per_iteration": 2.5914857387542725 + }, + { + "auxiliary_loss_clip": 0.06433852, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06279776, + "balance_loss_mlp": 0.01253292, + "epoch": 0.48621674432586803, + "flos": 34426722769920.0, + "grad_norm": 1.8125320165569008, + "language_loss": 0.69750226, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.7744993, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12573242, + "step": 8087, + "time_per_iteration": 2.611724615097046 + }, + { + "auxiliary_loss_clip": 0.06440983, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06277409, + "balance_loss_mlp": 0.0125254, + "epoch": 0.486276867578536, + "flos": 33115595207040.0, + "grad_norm": 1.9401027694089865, + "language_loss": 0.73050213, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.80757201, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.13482666, + "step": 8088, + "time_per_iteration": 2.6235716342926025 + }, + { + "auxiliary_loss_clip": 0.06434947, + "auxiliary_loss_mlp": 0.01270457, + "balance_loss_clip": 0.06279397, + "balance_loss_mlp": 0.01257982, + "epoch": 0.48633699083120396, + "flos": 21477149521920.0, + "grad_norm": 1.5117477196191362, + "language_loss": 0.75765258, + "learning_rate": 2.185312305524892e-06, + "loss": 0.83470654, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12481689, + "step": 8089, + "time_per_iteration": 2.522033214569092 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01266623, + "balance_loss_clip": 0.06276575, + "balance_loss_mlp": 0.01254702, + "epoch": 0.48639711408387193, + "flos": 20090565757440.0, + "grad_norm": 2.0719257974800307, + "language_loss": 0.84617764, + "learning_rate": 2.184924515731926e-06, + "loss": 0.92317104, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1192627, + "step": 8090, + "time_per_iteration": 2.6032962799072266 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01267937, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01256362, + "epoch": 0.4864572373365399, + "flos": 20785450867200.0, + "grad_norm": 1.460241002220635, + "language_loss": 0.76103806, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.8380006, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11578369, + "step": 8091, + "time_per_iteration": 2.534083127975464 + }, + { + "auxiliary_loss_clip": 0.06434517, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.0627959, + "balance_loss_mlp": 0.01252651, + "epoch": 0.48651736058920786, + "flos": 26031554346240.0, + "grad_norm": 1.4698762569471817, + "language_loss": 0.8086524, + "learning_rate": 2.184148915123631e-06, + "loss": 0.88564396, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.11987305, + "step": 8092, + "time_per_iteration": 2.5732295513153076 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01268235, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01254711, + "epoch": 0.4865774838418758, + "flos": 20491885687680.0, + "grad_norm": 1.359461965274961, + "language_loss": 0.71901554, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.79604697, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13537598, + "step": 8093, + "time_per_iteration": 2.5315988063812256 + }, + { + "auxiliary_loss_clip": 0.06430057, + "auxiliary_loss_mlp": 0.01268667, + "balance_loss_clip": 0.06278083, + "balance_loss_mlp": 0.01256424, + "epoch": 0.4866376070945438, + "flos": 23554048348800.0, + "grad_norm": 1.746145283456106, + "language_loss": 0.68340707, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.76039433, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12237549, + "step": 8094, + "time_per_iteration": 2.5621020793914795 + }, + { + "auxiliary_loss_clip": 0.06439431, + "auxiliary_loss_mlp": 0.01276508, + "balance_loss_clip": 0.06280254, + "balance_loss_mlp": 0.01263502, + "epoch": 0.4866977303472118, + "flos": 16696166457600.0, + "grad_norm": 2.187009986392795, + "language_loss": 0.66443598, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.74159545, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.13006592, + "step": 8095, + "time_per_iteration": 2.4823923110961914 + }, + { + "auxiliary_loss_clip": 0.06436304, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01254444, + "epoch": 0.4867578535998798, + "flos": 17902012965120.0, + "grad_norm": 1.919238290363099, + "language_loss": 0.79046065, + "learning_rate": 2.182597630229345e-06, + "loss": 0.86749196, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12384033, + "step": 8096, + "time_per_iteration": 2.507293701171875 + }, + { + "auxiliary_loss_clip": 0.06432957, + "auxiliary_loss_mlp": 0.01269945, + "balance_loss_clip": 0.06279905, + "balance_loss_mlp": 0.01257154, + "epoch": 0.48681797685254774, + "flos": 22644366497280.0, + "grad_norm": 2.003337305767246, + "language_loss": 0.68162191, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.75865096, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12799072, + "step": 8097, + "time_per_iteration": 2.5473361015319824 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01272073, + "balance_loss_clip": 0.06279548, + "balance_loss_mlp": 0.01259944, + "epoch": 0.4868781001052157, + "flos": 20892283223040.0, + "grad_norm": 1.4401604045572658, + "language_loss": 0.71418583, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.79123378, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12127686, + "step": 8098, + "time_per_iteration": 2.5543363094329834 + }, + { + "auxiliary_loss_clip": 0.06441437, + "auxiliary_loss_mlp": 0.01269071, + "balance_loss_clip": 0.06279659, + "balance_loss_mlp": 0.01255725, + "epoch": 0.48693822335788367, + "flos": 41984688723840.0, + "grad_norm": 1.4376447542768653, + "language_loss": 0.66435724, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.74146235, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13348389, + "step": 8099, + "time_per_iteration": 2.711822032928467 + }, + { + "auxiliary_loss_clip": 0.0643863, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06281494, + "balance_loss_mlp": 0.01259485, + "epoch": 0.48699834661055164, + "flos": 24250149342720.0, + "grad_norm": 1.5852242434455028, + "language_loss": 0.66993374, + "learning_rate": 2.181046234549138e-06, + "loss": 0.74703825, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12329102, + "step": 8100, + "time_per_iteration": 2.5218353271484375 + }, + { + "auxiliary_loss_clip": 0.0643635, + "auxiliary_loss_mlp": 0.0127283, + "balance_loss_clip": 0.06283123, + "balance_loss_mlp": 0.01260176, + "epoch": 0.4870584698632196, + "flos": 25931388389760.0, + "grad_norm": 1.294146562327305, + "language_loss": 0.76505142, + "learning_rate": 2.180658368429088e-06, + "loss": 0.84214324, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12664795, + "step": 8101, + "time_per_iteration": 2.645095109939575 + }, + { + "auxiliary_loss_clip": 0.06345028, + "auxiliary_loss_mlp": 0.01254744, + "balance_loss_clip": 0.06277841, + "balance_loss_mlp": 0.01252564, + "epoch": 0.48711859311588757, + "flos": 70232006511360.0, + "grad_norm": 0.6692636412141889, + "language_loss": 0.5212009, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.59719861, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02183533, + "step": 8102, + "time_per_iteration": 3.2782585620880127 + }, + { + "auxiliary_loss_clip": 0.06439511, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06284305, + "balance_loss_mlp": 0.01253523, + "epoch": 0.48717871636855553, + "flos": 12346831301760.0, + "grad_norm": 2.023585148758525, + "language_loss": 0.7395249, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.81658924, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13415527, + "step": 8103, + "time_per_iteration": 2.5020487308502197 + }, + { + "auxiliary_loss_clip": 0.06437068, + "auxiliary_loss_mlp": 0.01271054, + "balance_loss_clip": 0.06280553, + "balance_loss_mlp": 0.01257059, + "epoch": 0.4872388396212235, + "flos": 23483874954240.0, + "grad_norm": 1.425095223977108, + "language_loss": 0.6284436, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.70552492, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13989258, + "step": 8104, + "time_per_iteration": 2.5457305908203125 + }, + { + "auxiliary_loss_clip": 0.06436496, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281868, + "balance_loss_mlp": 0.01255759, + "epoch": 0.48729896287389146, + "flos": 31435068919680.0, + "grad_norm": 2.8385892248494575, + "language_loss": 0.69637764, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.77343059, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.13043213, + "step": 8105, + "time_per_iteration": 2.6453042030334473 + }, + { + "auxiliary_loss_clip": 0.0643308, + "auxiliary_loss_mlp": 0.01270898, + "balance_loss_clip": 0.06279837, + "balance_loss_mlp": 0.01258464, + "epoch": 0.4873590861265594, + "flos": 19063192446720.0, + "grad_norm": 1.510355754545757, + "language_loss": 0.73659271, + "learning_rate": 2.178718935364259e-06, + "loss": 0.81363249, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12451172, + "step": 8106, + "time_per_iteration": 2.4909706115722656 + }, + { + "auxiliary_loss_clip": 0.0644394, + "auxiliary_loss_mlp": 0.01272973, + "balance_loss_clip": 0.06283985, + "balance_loss_mlp": 0.01258888, + "epoch": 0.4874192093792274, + "flos": 24354424149120.0, + "grad_norm": 1.669305756095907, + "language_loss": 0.77040148, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.84757066, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14080811, + "step": 8107, + "time_per_iteration": 2.5784239768981934 + }, + { + "auxiliary_loss_clip": 0.06432547, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01254971, + "epoch": 0.4874793326318954, + "flos": 23119339766400.0, + "grad_norm": 3.7362093355788857, + "language_loss": 0.75508547, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.83207899, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.1182251, + "step": 8108, + "time_per_iteration": 2.51676607131958 + }, + { + "auxiliary_loss_clip": 0.06434841, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06281565, + "balance_loss_mlp": 0.01257522, + "epoch": 0.4875394558845634, + "flos": 19032193635840.0, + "grad_norm": 1.6826296910838767, + "language_loss": 0.73853874, + "learning_rate": 2.177555194083212e-06, + "loss": 0.81557322, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11090088, + "step": 8109, + "time_per_iteration": 2.594315767288208 + }, + { + "auxiliary_loss_clip": 0.06429494, + "auxiliary_loss_mlp": 0.01265982, + "balance_loss_clip": 0.0628022, + "balance_loss_mlp": 0.01253853, + "epoch": 0.48759957913723134, + "flos": 21439945509120.0, + "grad_norm": 1.7035668673577407, + "language_loss": 0.78900838, + "learning_rate": 2.177167266837428e-06, + "loss": 0.86596316, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12121582, + "step": 8110, + "time_per_iteration": 2.517711639404297 + }, + { + "auxiliary_loss_clip": 0.06435961, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_clip": 0.06281072, + "balance_loss_mlp": 0.01259265, + "epoch": 0.4876597023898993, + "flos": 17754412798080.0, + "grad_norm": 2.2958034596154238, + "language_loss": 0.72586286, + "learning_rate": 2.176779332873444e-06, + "loss": 0.80293739, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12231445, + "step": 8111, + "time_per_iteration": 3.939528465270996 + }, + { + "auxiliary_loss_clip": 0.06434079, + "auxiliary_loss_mlp": 0.01270804, + "balance_loss_clip": 0.06283166, + "balance_loss_mlp": 0.01257947, + "epoch": 0.4877198256425673, + "flos": 17025384349440.0, + "grad_norm": 1.699620610729742, + "language_loss": 0.76073879, + "learning_rate": 2.17639139220597e-06, + "loss": 0.83778763, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.128479, + "step": 8112, + "time_per_iteration": 2.614734172821045 + }, + { + "auxiliary_loss_clip": 0.06443445, + "auxiliary_loss_mlp": 0.01270845, + "balance_loss_clip": 0.06281452, + "balance_loss_mlp": 0.01257445, + "epoch": 0.48777994889523524, + "flos": 22390898296320.0, + "grad_norm": 1.829058055025175, + "language_loss": 0.756136, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.83327889, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.13397217, + "step": 8113, + "time_per_iteration": 3.978013277053833 + }, + { + "auxiliary_loss_clip": 0.0633374, + "auxiliary_loss_mlp": 0.01252792, + "balance_loss_clip": 0.06267424, + "balance_loss_mlp": 0.0125078, + "epoch": 0.4878400721479032, + "flos": 61261237664640.0, + "grad_norm": 0.785084950627043, + "language_loss": 0.48805469, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.56391996, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02011108, + "step": 8114, + "time_per_iteration": 3.0476014614105225 + }, + { + "auxiliary_loss_clip": 0.06435857, + "auxiliary_loss_mlp": 0.01268853, + "balance_loss_clip": 0.06280373, + "balance_loss_mlp": 0.01255507, + "epoch": 0.48790019540057117, + "flos": 24543756449280.0, + "grad_norm": 1.6081028897323706, + "language_loss": 0.77215505, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.84920216, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13348389, + "step": 8115, + "time_per_iteration": 2.615709066390991 + }, + { + "auxiliary_loss_clip": 0.06438144, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06279679, + "balance_loss_mlp": 0.01256858, + "epoch": 0.48796031865323913, + "flos": 21840175336320.0, + "grad_norm": 1.938320357328723, + "language_loss": 0.72471654, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.80180264, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.13586426, + "step": 8116, + "time_per_iteration": 2.502880573272705 + }, + { + "auxiliary_loss_clip": 0.06428684, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06277922, + "balance_loss_mlp": 0.0125349, + "epoch": 0.4880204419059071, + "flos": 18594969431040.0, + "grad_norm": 1.5984683769851484, + "language_loss": 0.63217908, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.70912814, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12750244, + "step": 8117, + "time_per_iteration": 2.5082454681396484 + }, + { + "auxiliary_loss_clip": 0.06432296, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06278604, + "balance_loss_mlp": 0.0125558, + "epoch": 0.48808056515857506, + "flos": 19178242502400.0, + "grad_norm": 1.8182073979213524, + "language_loss": 0.79733717, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.87434226, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1262207, + "step": 8118, + "time_per_iteration": 3.925899028778076 + }, + { + "auxiliary_loss_clip": 0.06436172, + "auxiliary_loss_mlp": 0.01267812, + "balance_loss_clip": 0.06280739, + "balance_loss_mlp": 0.01254669, + "epoch": 0.48814068841124303, + "flos": 20126679667200.0, + "grad_norm": 1.6934286727955359, + "language_loss": 0.63701898, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.71405882, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.13134766, + "step": 8119, + "time_per_iteration": 2.575894832611084 + }, + { + "auxiliary_loss_clip": 0.06432833, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01254698, + "epoch": 0.488200811663911, + "flos": 22972116942720.0, + "grad_norm": 1.6464989706708673, + "language_loss": 0.72632396, + "learning_rate": 2.173287627305878e-06, + "loss": 0.80332661, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12744141, + "step": 8120, + "time_per_iteration": 2.5209426879882812 + }, + { + "auxiliary_loss_clip": 0.06438597, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06279586, + "balance_loss_mlp": 0.01255297, + "epoch": 0.48826093491657896, + "flos": 33918947827200.0, + "grad_norm": 1.7374615150704595, + "language_loss": 0.63695973, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.71403223, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13336182, + "step": 8121, + "time_per_iteration": 2.619035005569458 + }, + { + "auxiliary_loss_clip": 0.0644285, + "auxiliary_loss_mlp": 0.01267435, + "balance_loss_clip": 0.06282102, + "balance_loss_mlp": 0.01253643, + "epoch": 0.488321058169247, + "flos": 23076056332800.0, + "grad_norm": 1.857577186148328, + "language_loss": 0.82684505, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.90394789, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.13800049, + "step": 8122, + "time_per_iteration": 2.5246660709381104 + }, + { + "auxiliary_loss_clip": 0.06440943, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06282523, + "balance_loss_mlp": 0.01255397, + "epoch": 0.48838118142191494, + "flos": 19323746317440.0, + "grad_norm": 1.8250600769951077, + "language_loss": 0.85500193, + "learning_rate": 2.172123606640866e-06, + "loss": 0.93209612, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.13085938, + "step": 8123, + "time_per_iteration": 2.5317881107330322 + }, + { + "auxiliary_loss_clip": 0.06441107, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06282164, + "balance_loss_mlp": 0.0125934, + "epoch": 0.4884413046745829, + "flos": 25417701734400.0, + "grad_norm": 1.3930130047769251, + "language_loss": 0.85569358, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.93283355, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.13549805, + "step": 8124, + "time_per_iteration": 4.062820196151733 + }, + { + "auxiliary_loss_clip": 0.0644336, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06285739, + "balance_loss_mlp": 0.01253769, + "epoch": 0.4885014279272509, + "flos": 20997103080960.0, + "grad_norm": 2.2053414232015363, + "language_loss": 0.80210352, + "learning_rate": 2.171347560204948e-06, + "loss": 0.87920684, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.13201904, + "step": 8125, + "time_per_iteration": 2.5117287635803223 + }, + { + "auxiliary_loss_clip": 0.06437683, + "auxiliary_loss_mlp": 0.01269334, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01255976, + "epoch": 0.48856155117991884, + "flos": 13776656572800.0, + "grad_norm": 2.5222320452086016, + "language_loss": 0.72852308, + "learning_rate": 2.170959527233356e-06, + "loss": 0.80559325, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13348389, + "step": 8126, + "time_per_iteration": 2.5177037715911865 + }, + { + "auxiliary_loss_clip": 0.06445107, + "auxiliary_loss_mlp": 0.01269465, + "balance_loss_clip": 0.06285033, + "balance_loss_mlp": 0.01256113, + "epoch": 0.4886216744325868, + "flos": 32095936471680.0, + "grad_norm": 1.5739512034612657, + "language_loss": 0.68640763, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.76355338, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.13372803, + "step": 8127, + "time_per_iteration": 2.606557846069336 + }, + { + "auxiliary_loss_clip": 0.06442467, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06283246, + "balance_loss_mlp": 0.01254972, + "epoch": 0.48868179768525477, + "flos": 19616221393920.0, + "grad_norm": 1.6528567440124056, + "language_loss": 0.7688967, + "learning_rate": 2.170183441856481e-06, + "loss": 0.84600174, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13085938, + "step": 8128, + "time_per_iteration": 2.564112901687622 + }, + { + "auxiliary_loss_clip": 0.06448022, + "auxiliary_loss_mlp": 0.01274106, + "balance_loss_clip": 0.06289175, + "balance_loss_mlp": 0.01260653, + "epoch": 0.48874192093792274, + "flos": 21293100028800.0, + "grad_norm": 1.6046032409788031, + "language_loss": 0.76479989, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.84202117, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13452148, + "step": 8129, + "time_per_iteration": 2.5374317169189453 + }, + { + "auxiliary_loss_clip": 0.06444047, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06286857, + "balance_loss_mlp": 0.01252944, + "epoch": 0.4888020441905907, + "flos": 14178647335680.0, + "grad_norm": 2.0974560904884867, + "language_loss": 0.65812773, + "learning_rate": 2.169407330666114e-06, + "loss": 0.735232, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13452148, + "step": 8130, + "time_per_iteration": 2.5409111976623535 + }, + { + "auxiliary_loss_clip": 0.06440154, + "auxiliary_loss_mlp": 0.01269301, + "balance_loss_clip": 0.06286357, + "balance_loss_mlp": 0.01256528, + "epoch": 0.48886216744325867, + "flos": 24104813235840.0, + "grad_norm": 1.7915788803825166, + "language_loss": 0.72896582, + "learning_rate": 2.169019265427658e-06, + "loss": 0.80606037, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12768555, + "step": 8131, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.06451105, + "auxiliary_loss_mlp": 0.01270383, + "balance_loss_clip": 0.06289683, + "balance_loss_mlp": 0.01256811, + "epoch": 0.48892229069592663, + "flos": 38439838218240.0, + "grad_norm": 1.2588039875779695, + "language_loss": 0.69597721, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.77319217, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.13586426, + "step": 8132, + "time_per_iteration": 2.70053768157959 + }, + { + "auxiliary_loss_clip": 0.06438366, + "auxiliary_loss_mlp": 0.01270585, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01257436, + "epoch": 0.4889824139485946, + "flos": 23850338785920.0, + "grad_norm": 2.3033814193981454, + "language_loss": 0.70031691, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.77740639, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13146973, + "step": 8133, + "time_per_iteration": 2.5559158325195312 + }, + { + "auxiliary_loss_clip": 0.06443258, + "auxiliary_loss_mlp": 0.01270512, + "balance_loss_clip": 0.0629006, + "balance_loss_mlp": 0.01257548, + "epoch": 0.48904253720126256, + "flos": 24432731389440.0, + "grad_norm": 1.67073327790382, + "language_loss": 0.71227533, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.78941303, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12957764, + "step": 8134, + "time_per_iteration": 2.5545125007629395 + }, + { + "auxiliary_loss_clip": 0.06444804, + "auxiliary_loss_mlp": 0.01271014, + "balance_loss_clip": 0.06283658, + "balance_loss_mlp": 0.01257055, + "epoch": 0.4891026604539306, + "flos": 24177586106880.0, + "grad_norm": 1.7998075455300961, + "language_loss": 0.80179673, + "learning_rate": 2.167466940528718e-06, + "loss": 0.87895489, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.13977051, + "step": 8135, + "time_per_iteration": 2.54832124710083 + }, + { + "auxiliary_loss_clip": 0.06439205, + "auxiliary_loss_mlp": 0.01267223, + "balance_loss_clip": 0.06284894, + "balance_loss_mlp": 0.01255004, + "epoch": 0.48916278370659855, + "flos": 21477443011200.0, + "grad_norm": 1.5753098834035062, + "language_loss": 0.74565232, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.82271659, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12213135, + "step": 8136, + "time_per_iteration": 2.5225162506103516 + }, + { + "auxiliary_loss_clip": 0.06440099, + "auxiliary_loss_mlp": 0.01265964, + "balance_loss_clip": 0.06286249, + "balance_loss_mlp": 0.01253519, + "epoch": 0.4892229069592665, + "flos": 22316322562560.0, + "grad_norm": 1.5544220345156794, + "language_loss": 0.73698246, + "learning_rate": 2.166690739918204e-06, + "loss": 0.81404305, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12451172, + "step": 8137, + "time_per_iteration": 2.5138792991638184 + }, + { + "auxiliary_loss_clip": 0.06443799, + "auxiliary_loss_mlp": 0.01270566, + "balance_loss_clip": 0.06287944, + "balance_loss_mlp": 0.01257673, + "epoch": 0.4892830302119345, + "flos": 12791812008960.0, + "grad_norm": 2.1813813764641448, + "language_loss": 0.75360358, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.83074719, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12890625, + "step": 8138, + "time_per_iteration": 2.52406644821167 + }, + { + "auxiliary_loss_clip": 0.06443107, + "auxiliary_loss_mlp": 0.01267703, + "balance_loss_clip": 0.06287149, + "balance_loss_mlp": 0.01255192, + "epoch": 0.48934315346460244, + "flos": 20820223111680.0, + "grad_norm": 1.5609881437350468, + "language_loss": 0.74361938, + "learning_rate": 2.165914514023972e-06, + "loss": 0.82072747, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12518311, + "step": 8139, + "time_per_iteration": 2.5139529705047607 + }, + { + "auxiliary_loss_clip": 0.0643822, + "auxiliary_loss_mlp": 0.01266126, + "balance_loss_clip": 0.06281914, + "balance_loss_mlp": 0.01253144, + "epoch": 0.4894032767172704, + "flos": 19761641354880.0, + "grad_norm": 2.1585110635090388, + "language_loss": 0.62118167, + "learning_rate": 2.165526391632255e-06, + "loss": 0.69822514, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8140, + "time_per_iteration": 2.5321638584136963 + }, + { + "auxiliary_loss_clip": 0.06444136, + "auxiliary_loss_mlp": 0.01271459, + "balance_loss_clip": 0.06286128, + "balance_loss_mlp": 0.01257506, + "epoch": 0.4894633999699384, + "flos": 17824292703360.0, + "grad_norm": 1.8580247423308633, + "language_loss": 0.82388717, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.90104312, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13946533, + "step": 8141, + "time_per_iteration": 2.4724786281585693 + }, + { + "auxiliary_loss_clip": 0.06448226, + "auxiliary_loss_mlp": 0.01272495, + "balance_loss_clip": 0.06290399, + "balance_loss_mlp": 0.01258279, + "epoch": 0.48952352322260634, + "flos": 25530781219200.0, + "grad_norm": 1.6913372633538968, + "language_loss": 0.72726512, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.80447233, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.14208984, + "step": 8142, + "time_per_iteration": 2.5858702659606934 + }, + { + "auxiliary_loss_clip": 0.06437673, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.062835, + "balance_loss_mlp": 0.01257624, + "epoch": 0.4895836464752743, + "flos": 29062508561280.0, + "grad_norm": 1.575435552323968, + "language_loss": 0.6727252, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.74980688, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12860107, + "step": 8143, + "time_per_iteration": 2.576084613800049 + }, + { + "auxiliary_loss_clip": 0.06441937, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06288718, + "balance_loss_mlp": 0.01254678, + "epoch": 0.48964376972794227, + "flos": 33555335034240.0, + "grad_norm": 1.550815752793646, + "language_loss": 0.75150239, + "learning_rate": 2.163973839444793e-06, + "loss": 0.82859099, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12243652, + "step": 8144, + "time_per_iteration": 2.641314744949341 + }, + { + "auxiliary_loss_clip": 0.06442292, + "auxiliary_loss_mlp": 0.01272411, + "balance_loss_clip": 0.06287357, + "balance_loss_mlp": 0.01259089, + "epoch": 0.48970389298061023, + "flos": 22060506447360.0, + "grad_norm": 1.55007225141579, + "language_loss": 0.75850821, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.83565521, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13311768, + "step": 8145, + "time_per_iteration": 2.5283498764038086 + }, + { + "auxiliary_loss_clip": 0.0644419, + "auxiliary_loss_mlp": 0.0126844, + "balance_loss_clip": 0.0628912, + "balance_loss_mlp": 0.01254564, + "epoch": 0.4897640162332782, + "flos": 20090523830400.0, + "grad_norm": 1.8073715924768365, + "language_loss": 0.8057586, + "learning_rate": 2.163197525984761e-06, + "loss": 0.88288498, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.13861084, + "step": 8146, + "time_per_iteration": 2.5433614253997803 + }, + { + "auxiliary_loss_clip": 0.06439323, + "auxiliary_loss_mlp": 0.01272664, + "balance_loss_clip": 0.06288785, + "balance_loss_mlp": 0.01260737, + "epoch": 0.48982413948594616, + "flos": 23813134773120.0, + "grad_norm": 1.5096911604618644, + "language_loss": 0.74847698, + "learning_rate": 2.162809359964687e-06, + "loss": 0.82559681, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11920166, + "step": 8147, + "time_per_iteration": 2.5623743534088135 + }, + { + "auxiliary_loss_clip": 0.06440282, + "auxiliary_loss_mlp": 0.01269967, + "balance_loss_clip": 0.06287088, + "balance_loss_mlp": 0.01256615, + "epoch": 0.4898842627386142, + "flos": 17645442163200.0, + "grad_norm": 1.9926710345073115, + "language_loss": 0.82984591, + "learning_rate": 2.162421187770864e-06, + "loss": 0.90694839, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.13360596, + "step": 8148, + "time_per_iteration": 2.5547962188720703 + }, + { + "auxiliary_loss_clip": 0.0644103, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.0629115, + "balance_loss_mlp": 0.01255363, + "epoch": 0.48994438599128215, + "flos": 16623519367680.0, + "grad_norm": 2.084842951303776, + "language_loss": 0.74672109, + "learning_rate": 2.162033009418015e-06, + "loss": 0.82380313, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11810303, + "step": 8149, + "time_per_iteration": 2.533867120742798 + }, + { + "auxiliary_loss_clip": 0.06448293, + "auxiliary_loss_mlp": 0.01270293, + "balance_loss_clip": 0.06289135, + "balance_loss_mlp": 0.01256507, + "epoch": 0.4900045092439501, + "flos": 26622080795520.0, + "grad_norm": 1.692853589800977, + "language_loss": 0.76331913, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.840505, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.13787842, + "step": 8150, + "time_per_iteration": 3.964707374572754 + }, + { + "auxiliary_loss_clip": 0.06450059, + "auxiliary_loss_mlp": 0.01271131, + "balance_loss_clip": 0.06294075, + "balance_loss_mlp": 0.01257833, + "epoch": 0.4900646324966181, + "flos": 19908361054080.0, + "grad_norm": 2.244817701974514, + "language_loss": 0.72999722, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.80720913, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13299561, + "step": 8151, + "time_per_iteration": 2.5549871921539307 + }, + { + "auxiliary_loss_clip": 0.06359711, + "auxiliary_loss_mlp": 0.01259283, + "balance_loss_clip": 0.06292651, + "balance_loss_mlp": 0.01257264, + "epoch": 0.49012475574928605, + "flos": 59207245729920.0, + "grad_norm": 0.8143029783085558, + "language_loss": 0.54076481, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.6169548, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02018738, + "step": 8152, + "time_per_iteration": 3.1047332286834717 + }, + { + "auxiliary_loss_clip": 0.06453663, + "auxiliary_loss_mlp": 0.01270304, + "balance_loss_clip": 0.06293964, + "balance_loss_mlp": 0.01257018, + "epoch": 0.490184879001954, + "flos": 45270285096960.0, + "grad_norm": 1.7665437022978014, + "language_loss": 0.6121304, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.68937004, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.13293457, + "step": 8153, + "time_per_iteration": 4.15813422203064 + }, + { + "auxiliary_loss_clip": 0.06445354, + "auxiliary_loss_mlp": 0.01267264, + "balance_loss_clip": 0.06291656, + "balance_loss_mlp": 0.01254074, + "epoch": 0.490245002254622, + "flos": 28009754663040.0, + "grad_norm": 1.583608688205754, + "language_loss": 0.76979434, + "learning_rate": 2.160092025783549e-06, + "loss": 0.84692061, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.13195801, + "step": 8154, + "time_per_iteration": 2.5994982719421387 + }, + { + "auxiliary_loss_clip": 0.06359019, + "auxiliary_loss_mlp": 0.01255517, + "balance_loss_clip": 0.06291451, + "balance_loss_mlp": 0.01253472, + "epoch": 0.49030512550728994, + "flos": 58971764229120.0, + "grad_norm": 1.0610708177187165, + "language_loss": 0.669397, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.74554235, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.02046204, + "step": 8155, + "time_per_iteration": 3.2433578968048096 + }, + { + "auxiliary_loss_clip": 0.06448951, + "auxiliary_loss_mlp": 0.01269488, + "balance_loss_clip": 0.06294696, + "balance_loss_mlp": 0.0125743, + "epoch": 0.4903652487599579, + "flos": 19797922972800.0, + "grad_norm": 1.7256067083752205, + "language_loss": 0.77014565, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.84733009, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12072754, + "step": 8156, + "time_per_iteration": 2.5398688316345215 + }, + { + "auxiliary_loss_clip": 0.06449247, + "auxiliary_loss_mlp": 0.01273385, + "balance_loss_clip": 0.06294699, + "balance_loss_mlp": 0.01259384, + "epoch": 0.49042537201262587, + "flos": 21768492568320.0, + "grad_norm": 1.9286441434498818, + "language_loss": 0.84019762, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.91742396, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.14007568, + "step": 8157, + "time_per_iteration": 2.5673582553863525 + }, + { + "auxiliary_loss_clip": 0.06449863, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06293592, + "balance_loss_mlp": 0.01253701, + "epoch": 0.49048549526529384, + "flos": 18959043421440.0, + "grad_norm": 1.7147218979138201, + "language_loss": 0.79903084, + "learning_rate": 2.158539129514956e-06, + "loss": 0.87619579, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12927246, + "step": 8158, + "time_per_iteration": 3.982774496078491 + }, + { + "auxiliary_loss_clip": 0.0645184, + "auxiliary_loss_mlp": 0.01273348, + "balance_loss_clip": 0.06292954, + "balance_loss_mlp": 0.01259615, + "epoch": 0.4905456185179618, + "flos": 26913633477120.0, + "grad_norm": 1.6654114756309404, + "language_loss": 0.69551659, + "learning_rate": 2.158150890381454e-06, + "loss": 0.77276844, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1373291, + "step": 8159, + "time_per_iteration": 2.6114954948425293 + }, + { + "auxiliary_loss_clip": 0.06446424, + "auxiliary_loss_mlp": 0.01266602, + "balance_loss_clip": 0.06292199, + "balance_loss_mlp": 0.01253591, + "epoch": 0.49060574177062977, + "flos": 20418567765120.0, + "grad_norm": 1.7624184717579066, + "language_loss": 0.73495585, + "learning_rate": 2.157762645250854e-06, + "loss": 0.81208611, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13006592, + "step": 8160, + "time_per_iteration": 2.5310287475585938 + }, + { + "auxiliary_loss_clip": 0.06446327, + "auxiliary_loss_mlp": 0.01268684, + "balance_loss_clip": 0.06286773, + "balance_loss_mlp": 0.01254718, + "epoch": 0.4906658650232978, + "flos": 17499477150720.0, + "grad_norm": 1.9303786573731354, + "language_loss": 0.71921647, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.79636657, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.13952637, + "step": 8161, + "time_per_iteration": 2.548387050628662 + }, + { + "auxiliary_loss_clip": 0.06438495, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06285487, + "balance_loss_mlp": 0.01257102, + "epoch": 0.49072598827596575, + "flos": 26621619598080.0, + "grad_norm": 1.7423183419157489, + "language_loss": 0.68838918, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.76547247, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12738037, + "step": 8162, + "time_per_iteration": 2.5565345287323 + }, + { + "auxiliary_loss_clip": 0.06445014, + "auxiliary_loss_mlp": 0.01271543, + "balance_loss_clip": 0.06284854, + "balance_loss_mlp": 0.01258048, + "epoch": 0.4907861115286337, + "flos": 20418861254400.0, + "grad_norm": 1.5998221011516633, + "language_loss": 0.6369257, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.7140913, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1350708, + "step": 8163, + "time_per_iteration": 2.545926094055176 + }, + { + "auxiliary_loss_clip": 0.0643242, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 0.06283394, + "balance_loss_mlp": 0.01260471, + "epoch": 0.4908462347813017, + "flos": 14069508992640.0, + "grad_norm": 1.9421890992027433, + "language_loss": 0.77104688, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.84810019, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12438965, + "step": 8164, + "time_per_iteration": 3.93280029296875 + }, + { + "auxiliary_loss_clip": 0.06443131, + "auxiliary_loss_mlp": 0.01271936, + "balance_loss_clip": 0.06285694, + "balance_loss_mlp": 0.01258382, + "epoch": 0.49090635803396965, + "flos": 18741227932800.0, + "grad_norm": 1.56961735096587, + "language_loss": 0.77229172, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.84944236, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.13562012, + "step": 8165, + "time_per_iteration": 2.493861436843872 + }, + { + "auxiliary_loss_clip": 0.06434909, + "auxiliary_loss_mlp": 0.01271922, + "balance_loss_clip": 0.06283913, + "balance_loss_mlp": 0.01258922, + "epoch": 0.4909664812866376, + "flos": 20564784339840.0, + "grad_norm": 2.2518376482371862, + "language_loss": 0.77749753, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.85456586, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.13006592, + "step": 8166, + "time_per_iteration": 2.578685760498047 + }, + { + "auxiliary_loss_clip": 0.06343444, + "auxiliary_loss_mlp": 0.01254597, + "balance_loss_clip": 0.06276363, + "balance_loss_mlp": 0.01252508, + "epoch": 0.4910266045393056, + "flos": 54704006622720.0, + "grad_norm": 0.7970989298383858, + "language_loss": 0.54202092, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.61800134, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02090454, + "step": 8167, + "time_per_iteration": 3.1805777549743652 + }, + { + "auxiliary_loss_clip": 0.06435132, + "auxiliary_loss_mlp": 0.01271015, + "balance_loss_clip": 0.06282446, + "balance_loss_mlp": 0.01257902, + "epoch": 0.49108672779197354, + "flos": 16250892261120.0, + "grad_norm": 1.7548504171286585, + "language_loss": 0.86375958, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.94082105, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13110352, + "step": 8168, + "time_per_iteration": 2.5346431732177734 + }, + { + "auxiliary_loss_clip": 0.06439523, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 0.06287682, + "balance_loss_mlp": 0.01257667, + "epoch": 0.4911468510446415, + "flos": 19831018135680.0, + "grad_norm": 1.6618595444085258, + "language_loss": 0.73708379, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.81418014, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12438965, + "step": 8169, + "time_per_iteration": 2.519845962524414 + }, + { + "auxiliary_loss_clip": 0.06435073, + "auxiliary_loss_mlp": 0.01267032, + "balance_loss_clip": 0.06282359, + "balance_loss_mlp": 0.01254795, + "epoch": 0.4912069742973095, + "flos": 21218650076160.0, + "grad_norm": 1.7105636772686297, + "language_loss": 0.78364748, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.86066854, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12237549, + "step": 8170, + "time_per_iteration": 2.5751500129699707 + }, + { + "auxiliary_loss_clip": 0.06441889, + "auxiliary_loss_mlp": 0.01268553, + "balance_loss_clip": 0.06285594, + "balance_loss_mlp": 0.01255547, + "epoch": 0.49126709754997744, + "flos": 19543280814720.0, + "grad_norm": 2.6389457816540527, + "language_loss": 0.76311809, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.84022248, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12988281, + "step": 8171, + "time_per_iteration": 2.5004677772521973 + }, + { + "auxiliary_loss_clip": 0.06443939, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06285004, + "balance_loss_mlp": 0.01255947, + "epoch": 0.4913272208026454, + "flos": 12244568993280.0, + "grad_norm": 2.2552468133898684, + "language_loss": 0.81709123, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.89421463, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.12457275, + "step": 8172, + "time_per_iteration": 2.5347814559936523 + }, + { + "auxiliary_loss_clip": 0.06338271, + "auxiliary_loss_mlp": 0.01256316, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.0125441, + "epoch": 0.49138734405531337, + "flos": 65484663661440.0, + "grad_norm": 0.6802144154671269, + "language_loss": 0.5333854, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.60933125, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.01902771, + "step": 8173, + "time_per_iteration": 3.1376869678497314 + }, + { + "auxiliary_loss_clip": 0.06444144, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.0628697, + "balance_loss_mlp": 0.01253663, + "epoch": 0.4914474673079814, + "flos": 18444434371200.0, + "grad_norm": 1.9185770389222636, + "language_loss": 0.6246022, + "learning_rate": 2.152326591972107e-06, + "loss": 0.70171648, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1361084, + "step": 8174, + "time_per_iteration": 2.5815811157226562 + }, + { + "auxiliary_loss_clip": 0.06439996, + "auxiliary_loss_mlp": 0.01273325, + "balance_loss_clip": 0.0628511, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49150759056064935, + "flos": 21690772306560.0, + "grad_norm": 2.0568306898238045, + "language_loss": 0.69594127, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.77307451, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1282959, + "step": 8175, + "time_per_iteration": 2.5219566822052 + }, + { + "auxiliary_loss_clip": 0.06442218, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06288453, + "balance_loss_mlp": 0.01255021, + "epoch": 0.4915677138133173, + "flos": 22388969652480.0, + "grad_norm": 1.5433299767806794, + "language_loss": 0.74403, + "learning_rate": 2.151549919570068e-06, + "loss": 0.82113051, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12805176, + "step": 8176, + "time_per_iteration": 2.5598292350769043 + }, + { + "auxiliary_loss_clip": 0.0643885, + "auxiliary_loss_mlp": 0.01272965, + "balance_loss_clip": 0.0628263, + "balance_loss_mlp": 0.01259977, + "epoch": 0.4916278370659853, + "flos": 18408320461440.0, + "grad_norm": 1.8239688366126487, + "language_loss": 0.70529395, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.78241211, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12988281, + "step": 8177, + "time_per_iteration": 2.5329604148864746 + }, + { + "auxiliary_loss_clip": 0.06340313, + "auxiliary_loss_mlp": 0.01256045, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01253889, + "epoch": 0.49168796031865325, + "flos": 66630147701760.0, + "grad_norm": 0.6656640602529083, + "language_loss": 0.46068031, + "learning_rate": 2.150773224180877e-06, + "loss": 0.53664386, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02159119, + "step": 8178, + "time_per_iteration": 3.170982837677002 + }, + { + "auxiliary_loss_clip": 0.06445555, + "auxiliary_loss_mlp": 0.01272894, + "balance_loss_clip": 0.06286988, + "balance_loss_mlp": 0.01259597, + "epoch": 0.4917480835713212, + "flos": 20965601145600.0, + "grad_norm": 2.2617000627187407, + "language_loss": 0.6597743, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.73695886, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13299561, + "step": 8179, + "time_per_iteration": 2.5594394207000732 + }, + { + "auxiliary_loss_clip": 0.06447062, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.06285466, + "balance_loss_mlp": 0.01254088, + "epoch": 0.4918082068239892, + "flos": 15777386438400.0, + "grad_norm": 2.2633588866978442, + "language_loss": 0.70069337, + "learning_rate": 2.149996505922343e-06, + "loss": 0.77783871, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1338501, + "step": 8180, + "time_per_iteration": 2.489649772644043 + }, + { + "auxiliary_loss_clip": 0.0643749, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.06285596, + "balance_loss_mlp": 0.01254406, + "epoch": 0.49186833007665715, + "flos": 24611162659200.0, + "grad_norm": 1.7052643417851399, + "language_loss": 0.84654552, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.92359537, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.13098145, + "step": 8181, + "time_per_iteration": 2.570831298828125 + }, + { + "auxiliary_loss_clip": 0.06432545, + "auxiliary_loss_mlp": 0.0127158, + "balance_loss_clip": 0.06282885, + "balance_loss_mlp": 0.01259843, + "epoch": 0.4919284533293251, + "flos": 22097039627520.0, + "grad_norm": 1.9771399001803804, + "language_loss": 0.73092818, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.80796945, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8182, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.06435409, + "auxiliary_loss_mlp": 0.01272985, + "balance_loss_clip": 0.06280539, + "balance_loss_mlp": 0.01260826, + "epoch": 0.4919885765819931, + "flos": 23374820465280.0, + "grad_norm": 1.9470010509475855, + "language_loss": 0.73167384, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.80875778, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.1217041, + "step": 8183, + "time_per_iteration": 2.5529325008392334 + }, + { + "auxiliary_loss_clip": 0.06441429, + "auxiliary_loss_mlp": 0.01268017, + "balance_loss_clip": 0.06279727, + "balance_loss_mlp": 0.01254523, + "epoch": 0.49204869983466104, + "flos": 21366795294720.0, + "grad_norm": 2.013163662705091, + "language_loss": 0.77443838, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.85153282, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1348877, + "step": 8184, + "time_per_iteration": 2.508230209350586 + }, + { + "auxiliary_loss_clip": 0.06435518, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06281742, + "balance_loss_mlp": 0.01254523, + "epoch": 0.492108823087329, + "flos": 21149147514240.0, + "grad_norm": 2.3088868689892674, + "language_loss": 0.71377504, + "learning_rate": 2.148054610995789e-06, + "loss": 0.79079902, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12347412, + "step": 8185, + "time_per_iteration": 2.545316219329834 + }, + { + "auxiliary_loss_clip": 0.06437825, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06280625, + "balance_loss_mlp": 0.01253074, + "epoch": 0.49216894633999697, + "flos": 25123214160000.0, + "grad_norm": 1.8318004423040046, + "language_loss": 0.75395268, + "learning_rate": 2.147666215108831e-06, + "loss": 0.8309986, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.13684082, + "step": 8186, + "time_per_iteration": 2.5238165855407715 + }, + { + "auxiliary_loss_clip": 0.06435218, + "auxiliary_loss_mlp": 0.01274022, + "balance_loss_clip": 0.06281888, + "balance_loss_mlp": 0.01261124, + "epoch": 0.49222906959266494, + "flos": 22644534205440.0, + "grad_norm": 2.2257308208746975, + "language_loss": 0.68571508, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.76280749, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12908936, + "step": 8187, + "time_per_iteration": 2.561488151550293 + }, + { + "auxiliary_loss_clip": 0.06434098, + "auxiliary_loss_mlp": 0.01272206, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01259653, + "epoch": 0.49228919284533296, + "flos": 20416471413120.0, + "grad_norm": 1.3887162782350388, + "language_loss": 0.67211652, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.7491796, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12567139, + "step": 8188, + "time_per_iteration": 2.5019164085388184 + }, + { + "auxiliary_loss_clip": 0.06437577, + "auxiliary_loss_mlp": 0.01267268, + "balance_loss_clip": 0.06282844, + "balance_loss_mlp": 0.012549, + "epoch": 0.4923493160980009, + "flos": 27129142978560.0, + "grad_norm": 1.6466242872646388, + "language_loss": 0.74921268, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.8262611, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12365723, + "step": 8189, + "time_per_iteration": 2.6160171031951904 + }, + { + "auxiliary_loss_clip": 0.06432211, + "auxiliary_loss_mlp": 0.01271904, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01259363, + "epoch": 0.4924094393506689, + "flos": 35745522981120.0, + "grad_norm": 1.6094215463667148, + "language_loss": 0.64780444, + "learning_rate": 2.146112575713104e-06, + "loss": 0.72484565, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12542725, + "step": 8190, + "time_per_iteration": 4.0641090869903564 + }, + { + "auxiliary_loss_clip": 0.06438321, + "auxiliary_loss_mlp": 0.01273117, + "balance_loss_clip": 0.06285122, + "balance_loss_mlp": 0.01260486, + "epoch": 0.49246956260333685, + "flos": 20418735473280.0, + "grad_norm": 1.8613448606205585, + "language_loss": 0.71446037, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.79157472, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12628174, + "step": 8191, + "time_per_iteration": 2.5388033390045166 + }, + { + "auxiliary_loss_clip": 0.06437817, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282701, + "balance_loss_mlp": 0.01253193, + "epoch": 0.4925296858560048, + "flos": 38985152590080.0, + "grad_norm": 1.8396866027790106, + "language_loss": 0.72404003, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.80107331, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12335205, + "step": 8192, + "time_per_iteration": 2.696115255355835 + }, + { + "auxiliary_loss_clip": 0.06334923, + "auxiliary_loss_mlp": 0.01254622, + "balance_loss_clip": 0.06267789, + "balance_loss_mlp": 0.01252217, + "epoch": 0.4925898091086728, + "flos": 64300367652480.0, + "grad_norm": 0.7283072322766662, + "language_loss": 0.51975358, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.59564906, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02401733, + "step": 8193, + "time_per_iteration": 4.540759086608887 + }, + { + "auxiliary_loss_clip": 0.06434911, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06282961, + "balance_loss_mlp": 0.01259417, + "epoch": 0.49264993236134075, + "flos": 23042541899520.0, + "grad_norm": 1.3982393371006636, + "language_loss": 0.77103728, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.84810621, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12554932, + "step": 8194, + "time_per_iteration": 2.585632085800171 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06280608, + "balance_loss_mlp": 0.01257376, + "epoch": 0.4927100556140087, + "flos": 24725248392960.0, + "grad_norm": 2.1551580003064186, + "language_loss": 0.70539922, + "learning_rate": 2.144170401915341e-06, + "loss": 0.78244197, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12329102, + "step": 8195, + "time_per_iteration": 2.5881664752960205 + }, + { + "auxiliary_loss_clip": 0.06438025, + "auxiliary_loss_mlp": 0.01269625, + "balance_loss_clip": 0.06284925, + "balance_loss_mlp": 0.01257687, + "epoch": 0.4927701788666767, + "flos": 23510932623360.0, + "grad_norm": 2.3036054872688765, + "language_loss": 0.81165189, + "learning_rate": 2.143781950696001e-06, + "loss": 0.88872838, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11932373, + "step": 8196, + "time_per_iteration": 2.5550785064697266 + }, + { + "auxiliary_loss_clip": 0.06437081, + "auxiliary_loss_mlp": 0.01270899, + "balance_loss_clip": 0.06279114, + "balance_loss_mlp": 0.01258311, + "epoch": 0.49283030211934464, + "flos": 22935374127360.0, + "grad_norm": 1.9095456135696567, + "language_loss": 0.70909548, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.78617525, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12597656, + "step": 8197, + "time_per_iteration": 4.003530263900757 + }, + { + "auxiliary_loss_clip": 0.06434973, + "auxiliary_loss_mlp": 0.01271256, + "balance_loss_clip": 0.0628255, + "balance_loss_mlp": 0.01259699, + "epoch": 0.4928904253720126, + "flos": 16878622723200.0, + "grad_norm": 1.745870627956974, + "language_loss": 0.84271383, + "learning_rate": 2.143005031915374e-06, + "loss": 0.91977608, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.11553955, + "step": 8198, + "time_per_iteration": 2.498107671737671 + }, + { + "auxiliary_loss_clip": 0.06443786, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06287393, + "balance_loss_mlp": 0.01254521, + "epoch": 0.4929505486246806, + "flos": 14871855363840.0, + "grad_norm": 1.7338591596570678, + "language_loss": 0.76126587, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.83838832, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13946533, + "step": 8199, + "time_per_iteration": 2.5254313945770264 + }, + { + "auxiliary_loss_clip": 0.06436033, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06279432, + "balance_loss_mlp": 0.01252808, + "epoch": 0.49301067187734854, + "flos": 23849206755840.0, + "grad_norm": 1.3683337876027823, + "language_loss": 0.60070461, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.67772967, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13647461, + "step": 8200, + "time_per_iteration": 2.54241943359375 + }, + { + "auxiliary_loss_clip": 0.06429607, + "auxiliary_loss_mlp": 0.01273188, + "balance_loss_clip": 0.06281705, + "balance_loss_mlp": 0.01261541, + "epoch": 0.49307079513001656, + "flos": 22497730652160.0, + "grad_norm": 1.4845406915411774, + "language_loss": 0.79454738, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.87157536, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11657715, + "step": 8201, + "time_per_iteration": 2.590289831161499 + }, + { + "auxiliary_loss_clip": 0.0644393, + "auxiliary_loss_mlp": 0.01272695, + "balance_loss_clip": 0.06281954, + "balance_loss_mlp": 0.01259171, + "epoch": 0.4931309183826845, + "flos": 15930059777280.0, + "grad_norm": 1.9752291134223394, + "language_loss": 0.66993362, + "learning_rate": 2.141451129398785e-06, + "loss": 0.74709988, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13525391, + "step": 8202, + "time_per_iteration": 2.5706307888031006 + }, + { + "auxiliary_loss_clip": 0.06429332, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06277282, + "balance_loss_mlp": 0.01256055, + "epoch": 0.4931910416353525, + "flos": 27316588561920.0, + "grad_norm": 1.8969992308716948, + "language_loss": 0.75337243, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.83034456, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11834717, + "step": 8203, + "time_per_iteration": 4.0727972984313965 + }, + { + "auxiliary_loss_clip": 0.06434371, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06279419, + "balance_loss_mlp": 0.01253626, + "epoch": 0.49325116488802045, + "flos": 20811166871040.0, + "grad_norm": 2.0494104605673935, + "language_loss": 0.80605292, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.8830539, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12103271, + "step": 8204, + "time_per_iteration": 2.6136350631713867 + }, + { + "auxiliary_loss_clip": 0.0643463, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06280951, + "balance_loss_mlp": 0.01254664, + "epoch": 0.4933112881406884, + "flos": 19872247144320.0, + "grad_norm": 1.7256783924705517, + "language_loss": 0.65881336, + "learning_rate": 2.140285646139455e-06, + "loss": 0.73583329, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12695312, + "step": 8205, + "time_per_iteration": 2.5172812938690186 + }, + { + "auxiliary_loss_clip": 0.06445079, + "auxiliary_loss_mlp": 0.01273568, + "balance_loss_clip": 0.06283986, + "balance_loss_mlp": 0.0125971, + "epoch": 0.4933714113933564, + "flos": 21833215447680.0, + "grad_norm": 1.6546444342030124, + "language_loss": 0.66620767, + "learning_rate": 2.139897141060744e-06, + "loss": 0.74339426, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.13861084, + "step": 8206, + "time_per_iteration": 2.556596040725708 + }, + { + "auxiliary_loss_clip": 0.06434575, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01253539, + "epoch": 0.49343153464602435, + "flos": 27897304083840.0, + "grad_norm": 1.8364733010130068, + "language_loss": 0.77070463, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.84770155, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.11584473, + "step": 8207, + "time_per_iteration": 2.591074228286743 + }, + { + "auxiliary_loss_clip": 0.06430385, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01256174, + "epoch": 0.4934916578986923, + "flos": 24688002453120.0, + "grad_norm": 2.876199477758729, + "language_loss": 0.60526079, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.68224895, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12261963, + "step": 8208, + "time_per_iteration": 2.5641872882843018 + }, + { + "auxiliary_loss_clip": 0.06432977, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279885, + "balance_loss_mlp": 0.01256548, + "epoch": 0.4935517811513603, + "flos": 23412024478080.0, + "grad_norm": 2.3268226049750025, + "language_loss": 0.79136336, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.86838233, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12365723, + "step": 8209, + "time_per_iteration": 2.5345427989959717 + }, + { + "auxiliary_loss_clip": 0.06431048, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06279348, + "balance_loss_mlp": 0.01253917, + "epoch": 0.49361190440402825, + "flos": 21950948833920.0, + "grad_norm": 3.2965997735856423, + "language_loss": 0.79514015, + "learning_rate": 2.138343067844089e-06, + "loss": 0.87211347, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12359619, + "step": 8210, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06438643, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06280634, + "balance_loss_mlp": 0.01256629, + "epoch": 0.4936720276566962, + "flos": 25122124056960.0, + "grad_norm": 2.539502696257949, + "language_loss": 0.81421793, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8912915, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12084961, + "step": 8211, + "time_per_iteration": 2.5667943954467773 + }, + { + "auxiliary_loss_clip": 0.06429391, + "auxiliary_loss_mlp": 0.0126729, + "balance_loss_clip": 0.06274866, + "balance_loss_mlp": 0.01254803, + "epoch": 0.4937321509093642, + "flos": 26366055045120.0, + "grad_norm": 2.1078758653058913, + "language_loss": 0.91783321, + "learning_rate": 2.137565999700933e-06, + "loss": 0.99480009, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12506104, + "step": 8212, + "time_per_iteration": 2.5892627239227295 + }, + { + "auxiliary_loss_clip": 0.06437102, + "auxiliary_loss_mlp": 0.01269581, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01257511, + "epoch": 0.49379227416203214, + "flos": 22967211479040.0, + "grad_norm": 1.9203573298750467, + "language_loss": 0.65474772, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.7318145, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1206665, + "step": 8213, + "time_per_iteration": 2.5766966342926025 + }, + { + "auxiliary_loss_clip": 0.06435272, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06281263, + "balance_loss_mlp": 0.01254957, + "epoch": 0.49385239741470016, + "flos": 32497340256000.0, + "grad_norm": 5.5178519689557435, + "language_loss": 0.76015925, + "learning_rate": 2.136788910691711e-06, + "loss": 0.83718324, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1217041, + "step": 8214, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.06435767, + "auxiliary_loss_mlp": 0.01267382, + "balance_loss_clip": 0.06282468, + "balance_loss_mlp": 0.0125508, + "epoch": 0.4939125206673681, + "flos": 22499575441920.0, + "grad_norm": 1.6727543381074526, + "language_loss": 0.84167933, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.91871083, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12298584, + "step": 8215, + "time_per_iteration": 2.6213715076446533 + }, + { + "auxiliary_loss_clip": 0.06426814, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06280927, + "balance_loss_mlp": 0.0125696, + "epoch": 0.4939726439200361, + "flos": 31184493684480.0, + "grad_norm": 1.9918722360209278, + "language_loss": 0.83712834, + "learning_rate": 2.136011800934292e-06, + "loss": 0.91407919, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11315918, + "step": 8216, + "time_per_iteration": 2.619922637939453 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06283373, + "balance_loss_mlp": 0.01260614, + "epoch": 0.49403276717270406, + "flos": 22680773896320.0, + "grad_norm": 1.6954468061355052, + "language_loss": 0.75099367, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.82805896, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11791992, + "step": 8217, + "time_per_iteration": 2.5473809242248535 + }, + { + "auxiliary_loss_clip": 0.06434639, + "auxiliary_loss_mlp": 0.01271118, + "balance_loss_clip": 0.06285703, + "balance_loss_mlp": 0.01258422, + "epoch": 0.494092890425372, + "flos": 20747408313600.0, + "grad_norm": 1.6176152886760666, + "language_loss": 0.78781378, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.86487138, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12689209, + "step": 8218, + "time_per_iteration": 2.542994976043701 + }, + { + "auxiliary_loss_clip": 0.06433167, + "auxiliary_loss_mlp": 0.01265257, + "balance_loss_clip": 0.06283546, + "balance_loss_mlp": 0.01253628, + "epoch": 0.49415301367804, + "flos": 18374889882240.0, + "grad_norm": 2.39829798701753, + "language_loss": 0.77065396, + "learning_rate": 2.134846097653142e-06, + "loss": 0.84763819, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11639404, + "step": 8219, + "time_per_iteration": 2.5450475215911865 + }, + { + "auxiliary_loss_clip": 0.06439486, + "auxiliary_loss_mlp": 0.01269777, + "balance_loss_clip": 0.06285974, + "balance_loss_mlp": 0.01258321, + "epoch": 0.49421313693070795, + "flos": 17536471528320.0, + "grad_norm": 2.258549541306087, + "language_loss": 0.62705898, + "learning_rate": 2.134457519646357e-06, + "loss": 0.70415157, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11462402, + "step": 8220, + "time_per_iteration": 2.5296928882598877 + }, + { + "auxiliary_loss_clip": 0.06433114, + "auxiliary_loss_mlp": 0.01270633, + "balance_loss_clip": 0.06280304, + "balance_loss_mlp": 0.01259076, + "epoch": 0.4942732601833759, + "flos": 20818210613760.0, + "grad_norm": 1.8931623619102378, + "language_loss": 0.72802091, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.80505836, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11572266, + "step": 8221, + "time_per_iteration": 2.521430253982544 + }, + { + "auxiliary_loss_clip": 0.06441319, + "auxiliary_loss_mlp": 0.01270693, + "balance_loss_clip": 0.06292681, + "balance_loss_mlp": 0.01258761, + "epoch": 0.4943333834360439, + "flos": 15054269702400.0, + "grad_norm": 1.6896047494674526, + "language_loss": 0.79253769, + "learning_rate": 2.133680348351595e-06, + "loss": 0.86965781, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11920166, + "step": 8222, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06434612, + "auxiliary_loss_mlp": 0.01272431, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.0126051, + "epoch": 0.49439350668871185, + "flos": 16075899008640.0, + "grad_norm": 6.490136916654426, + "language_loss": 0.72483402, + "learning_rate": 2.133291755093088e-06, + "loss": 0.80190444, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.1192627, + "step": 8223, + "time_per_iteration": 2.457361936569214 + }, + { + "auxiliary_loss_clip": 0.06444422, + "auxiliary_loss_mlp": 0.01270468, + "balance_loss_clip": 0.06287469, + "balance_loss_mlp": 0.01257367, + "epoch": 0.4944536299413798, + "flos": 20885281407360.0, + "grad_norm": 1.6318042764148617, + "language_loss": 0.75256205, + "learning_rate": 2.132903156780144e-06, + "loss": 0.82971096, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.13122559, + "step": 8224, + "time_per_iteration": 2.5326499938964844 + }, + { + "auxiliary_loss_clip": 0.06441943, + "auxiliary_loss_mlp": 0.01267954, + "balance_loss_clip": 0.06287307, + "balance_loss_mlp": 0.01255646, + "epoch": 0.4945137531940478, + "flos": 26615162833920.0, + "grad_norm": 2.58625148433793, + "language_loss": 0.64002287, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.71712184, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.12322998, + "step": 8225, + "time_per_iteration": 2.555088996887207 + }, + { + "auxiliary_loss_clip": 0.06438252, + "auxiliary_loss_mlp": 0.01269636, + "balance_loss_clip": 0.06283222, + "balance_loss_mlp": 0.01258007, + "epoch": 0.49457387644671574, + "flos": 23995004060160.0, + "grad_norm": 2.0569415863505554, + "language_loss": 0.77084112, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.84792, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11627197, + "step": 8226, + "time_per_iteration": 2.557900905609131 + }, + { + "auxiliary_loss_clip": 0.06436731, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06281079, + "balance_loss_mlp": 0.01256958, + "epoch": 0.49463399969938376, + "flos": 26983387601280.0, + "grad_norm": 1.6446627405679832, + "language_loss": 0.71402973, + "learning_rate": 2.131737331662051e-06, + "loss": 0.79110235, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13568115, + "step": 8227, + "time_per_iteration": 2.533468246459961 + }, + { + "auxiliary_loss_clip": 0.06441461, + "auxiliary_loss_mlp": 0.01270684, + "balance_loss_clip": 0.06282251, + "balance_loss_mlp": 0.01258477, + "epoch": 0.49469412295205173, + "flos": 29689610117760.0, + "grad_norm": 1.6469495440568809, + "language_loss": 0.7179364, + "learning_rate": 2.131348713278718e-06, + "loss": 0.79505783, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.12213135, + "step": 8228, + "time_per_iteration": 2.621777296066284 + }, + { + "auxiliary_loss_clip": 0.06432875, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_clip": 0.06283268, + "balance_loss_mlp": 0.01259768, + "epoch": 0.4947542462047197, + "flos": 24138285742080.0, + "grad_norm": 1.3686875437171686, + "language_loss": 0.84044397, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.91748512, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1149292, + "step": 8229, + "time_per_iteration": 2.620849609375 + }, + { + "auxiliary_loss_clip": 0.06443636, + "auxiliary_loss_mlp": 0.01271474, + "balance_loss_clip": 0.0628624, + "balance_loss_mlp": 0.01258134, + "epoch": 0.49481436945738766, + "flos": 20050804195200.0, + "grad_norm": 2.3211713476829656, + "language_loss": 0.75208747, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.82923853, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.13342285, + "step": 8230, + "time_per_iteration": 3.9126293659210205 + }, + { + "auxiliary_loss_clip": 0.06439002, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06284901, + "balance_loss_mlp": 0.01256432, + "epoch": 0.4948744927100556, + "flos": 15675040275840.0, + "grad_norm": 1.9615207178823395, + "language_loss": 0.80548179, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.88256031, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.1239624, + "step": 8231, + "time_per_iteration": 2.525049924850464 + }, + { + "auxiliary_loss_clip": 0.06329959, + "auxiliary_loss_mlp": 0.01257972, + "balance_loss_clip": 0.06262948, + "balance_loss_mlp": 0.0125556, + "epoch": 0.4949346159627236, + "flos": 68893611644160.0, + "grad_norm": 0.7512177245674743, + "language_loss": 0.60052431, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.67640364, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02409363, + "step": 8232, + "time_per_iteration": 4.674450159072876 + }, + { + "auxiliary_loss_clip": 0.06440374, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.0125631, + "epoch": 0.49499473921539155, + "flos": 24797182723200.0, + "grad_norm": 1.782814520641974, + "language_loss": 0.68933427, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.76643485, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.13366699, + "step": 8233, + "time_per_iteration": 2.574759006500244 + }, + { + "auxiliary_loss_clip": 0.06426412, + "auxiliary_loss_mlp": 0.01270358, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01258508, + "epoch": 0.4950548624680595, + "flos": 32716161993600.0, + "grad_norm": 2.8586701341507355, + "language_loss": 0.6684472, + "learning_rate": 2.129016898898633e-06, + "loss": 0.74541491, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1184082, + "step": 8234, + "time_per_iteration": 2.653381824493408 + }, + { + "auxiliary_loss_clip": 0.06329186, + "auxiliary_loss_mlp": 0.0125637, + "balance_loss_clip": 0.06261852, + "balance_loss_mlp": 0.01254119, + "epoch": 0.4951149857207275, + "flos": 50100616287360.0, + "grad_norm": 0.7779673724008701, + "language_loss": 0.58149666, + "learning_rate": 2.128628245959482e-06, + "loss": 0.65735215, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02255249, + "step": 8235, + "time_per_iteration": 3.0858991146087646 + }, + { + "auxiliary_loss_clip": 0.06437027, + "auxiliary_loss_mlp": 0.01272544, + "balance_loss_clip": 0.06281243, + "balance_loss_mlp": 0.01259401, + "epoch": 0.49517510897339545, + "flos": 22243340056320.0, + "grad_norm": 1.7279160321905627, + "language_loss": 0.77504063, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.85213637, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13134766, + "step": 8236, + "time_per_iteration": 2.5753977298736572 + }, + { + "auxiliary_loss_clip": 0.06428996, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.06278376, + "balance_loss_mlp": 0.01256037, + "epoch": 0.4952352322260634, + "flos": 25381126627200.0, + "grad_norm": 1.6842676088909172, + "language_loss": 0.72880518, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.80577087, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11529541, + "step": 8237, + "time_per_iteration": 4.036882400512695 + }, + { + "auxiliary_loss_clip": 0.06434725, + "auxiliary_loss_mlp": 0.01270554, + "balance_loss_clip": 0.06283747, + "balance_loss_mlp": 0.0125787, + "epoch": 0.4952953554787314, + "flos": 24615732706560.0, + "grad_norm": 2.2000126991913285, + "language_loss": 0.75703216, + "learning_rate": 2.127462257935406e-06, + "loss": 0.83408493, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12664795, + "step": 8238, + "time_per_iteration": 2.549431085586548 + }, + { + "auxiliary_loss_clip": 0.06435382, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06280845, + "balance_loss_mlp": 0.01257081, + "epoch": 0.49535547873139935, + "flos": 17317020885120.0, + "grad_norm": 2.278500195677925, + "language_loss": 0.74391794, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.82096863, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12615967, + "step": 8239, + "time_per_iteration": 2.571343183517456 + }, + { + "auxiliary_loss_clip": 0.06438212, + "auxiliary_loss_mlp": 0.01271609, + "balance_loss_clip": 0.06280148, + "balance_loss_mlp": 0.01257917, + "epoch": 0.4954156019840673, + "flos": 20746527845760.0, + "grad_norm": 2.0000035114581927, + "language_loss": 0.79093564, + "learning_rate": 2.126684908394552e-06, + "loss": 0.86803377, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.13684082, + "step": 8240, + "time_per_iteration": 2.531712532043457 + }, + { + "auxiliary_loss_clip": 0.06430051, + "auxiliary_loss_mlp": 0.01267271, + "balance_loss_clip": 0.06279683, + "balance_loss_mlp": 0.0125594, + "epoch": 0.49547572523673533, + "flos": 12825200661120.0, + "grad_norm": 2.1298693498085592, + "language_loss": 0.86484092, + "learning_rate": 2.126296226410898e-06, + "loss": 0.94181418, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11334229, + "step": 8241, + "time_per_iteration": 2.5414860248565674 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01270719, + "balance_loss_clip": 0.06279866, + "balance_loss_mlp": 0.01260003, + "epoch": 0.4955358484894033, + "flos": 15602602821120.0, + "grad_norm": 1.7100085929309539, + "language_loss": 0.77987742, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.85685694, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10723877, + "step": 8242, + "time_per_iteration": 2.500761032104492 + }, + { + "auxiliary_loss_clip": 0.06436419, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06284536, + "balance_loss_mlp": 0.0125308, + "epoch": 0.49559597174207126, + "flos": 26470832976000.0, + "grad_norm": 1.8102794432235507, + "language_loss": 0.67317849, + "learning_rate": 2.125518848090833e-06, + "loss": 0.75019407, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.1206665, + "step": 8243, + "time_per_iteration": 4.062270641326904 + }, + { + "auxiliary_loss_clip": 0.06430024, + "auxiliary_loss_mlp": 0.01269105, + "balance_loss_clip": 0.06279217, + "balance_loss_mlp": 0.0125722, + "epoch": 0.4956560949947392, + "flos": 23154824770560.0, + "grad_norm": 2.721585758888369, + "language_loss": 0.68786383, + "learning_rate": 2.125130151783901e-06, + "loss": 0.76485521, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11889648, + "step": 8244, + "time_per_iteration": 2.55732798576355 + }, + { + "auxiliary_loss_clip": 0.06434646, + "auxiliary_loss_mlp": 0.01266504, + "balance_loss_clip": 0.06280981, + "balance_loss_mlp": 0.01254541, + "epoch": 0.4957162182474072, + "flos": 20779119884160.0, + "grad_norm": 2.485823072522516, + "language_loss": 0.75575739, + "learning_rate": 2.12474145073202e-06, + "loss": 0.83276892, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.11962891, + "step": 8245, + "time_per_iteration": 2.5086231231689453 + }, + { + "auxiliary_loss_clip": 0.06428742, + "auxiliary_loss_mlp": 0.01268325, + "balance_loss_clip": 0.06280199, + "balance_loss_mlp": 0.01256762, + "epoch": 0.49577634150007516, + "flos": 18740179756800.0, + "grad_norm": 1.8890947976192427, + "language_loss": 0.81602311, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.89299381, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11572266, + "step": 8246, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.06440324, + "auxiliary_loss_mlp": 0.01268715, + "balance_loss_clip": 0.06283663, + "balance_loss_mlp": 0.01256347, + "epoch": 0.4958364647527431, + "flos": 25560815708160.0, + "grad_norm": 1.7539344008969155, + "language_loss": 0.84379256, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.92088294, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12359619, + "step": 8247, + "time_per_iteration": 2.5563809871673584 + }, + { + "auxiliary_loss_clip": 0.06436694, + "auxiliary_loss_mlp": 0.01269797, + "balance_loss_clip": 0.06282616, + "balance_loss_mlp": 0.01257798, + "epoch": 0.4958965880054111, + "flos": 24432144410880.0, + "grad_norm": 2.2837128243369658, + "language_loss": 0.84184051, + "learning_rate": 2.123575319254087e-06, + "loss": 0.91890538, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12011719, + "step": 8248, + "time_per_iteration": 2.566392660140991 + }, + { + "auxiliary_loss_clip": 0.0643697, + "auxiliary_loss_mlp": 0.01268541, + "balance_loss_clip": 0.06282248, + "balance_loss_mlp": 0.01256024, + "epoch": 0.49595671125807905, + "flos": 25090622121600.0, + "grad_norm": 1.727142692455913, + "language_loss": 0.73609596, + "learning_rate": 2.123186599369812e-06, + "loss": 0.813151, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12518311, + "step": 8249, + "time_per_iteration": 2.548520088195801 + }, + { + "auxiliary_loss_clip": 0.06441288, + "auxiliary_loss_mlp": 0.01269234, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256365, + "epoch": 0.496016834510747, + "flos": 16441524299520.0, + "grad_norm": 2.7229998624345115, + "language_loss": 0.76506901, + "learning_rate": 2.122797874814289e-06, + "loss": 0.84217423, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.12860107, + "step": 8250, + "time_per_iteration": 2.524714231491089 + }, + { + "auxiliary_loss_clip": 0.06438759, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06282068, + "balance_loss_mlp": 0.01256551, + "epoch": 0.496076957763415, + "flos": 23444197246080.0, + "grad_norm": 1.6959600873244032, + "language_loss": 0.7021333, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.77921373, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12738037, + "step": 8251, + "time_per_iteration": 2.531841516494751 + }, + { + "auxiliary_loss_clip": 0.06437311, + "auxiliary_loss_mlp": 0.01271839, + "balance_loss_clip": 0.06285296, + "balance_loss_mlp": 0.01259871, + "epoch": 0.49613708101608295, + "flos": 16915113976320.0, + "grad_norm": 1.8201441219473296, + "language_loss": 0.7993809, + "learning_rate": 2.122020411748461e-06, + "loss": 0.87647241, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11975098, + "step": 8252, + "time_per_iteration": 2.5806944370269775 + }, + { + "auxiliary_loss_clip": 0.06434863, + "auxiliary_loss_mlp": 0.01270348, + "balance_loss_clip": 0.06282027, + "balance_loss_mlp": 0.01255905, + "epoch": 0.4961972042687509, + "flos": 16623729002880.0, + "grad_norm": 1.8109031344325417, + "language_loss": 0.81898755, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.89603961, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.14447021, + "step": 8253, + "time_per_iteration": 2.4936153888702393 + }, + { + "auxiliary_loss_clip": 0.0643016, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06279143, + "balance_loss_mlp": 0.01253139, + "epoch": 0.49625732752141893, + "flos": 28965529059840.0, + "grad_norm": 1.4049535238306547, + "language_loss": 0.67659622, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.7535435, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11437988, + "step": 8254, + "time_per_iteration": 2.681328058242798 + }, + { + "auxiliary_loss_clip": 0.06436362, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06281647, + "balance_loss_mlp": 0.01254729, + "epoch": 0.4963174507740869, + "flos": 23119046277120.0, + "grad_norm": 6.04751780380752, + "language_loss": 0.74611968, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.82315457, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12384033, + "step": 8255, + "time_per_iteration": 2.5586442947387695 + }, + { + "auxiliary_loss_clip": 0.06430424, + "auxiliary_loss_mlp": 0.01268774, + "balance_loss_clip": 0.06278734, + "balance_loss_mlp": 0.01256972, + "epoch": 0.49637757402675486, + "flos": 13922998928640.0, + "grad_norm": 1.9051204382469373, + "language_loss": 0.81712639, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.89411843, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11798096, + "step": 8256, + "time_per_iteration": 2.525191307067871 + }, + { + "auxiliary_loss_clip": 0.06430264, + "auxiliary_loss_mlp": 0.01267515, + "balance_loss_clip": 0.06279526, + "balance_loss_mlp": 0.01256035, + "epoch": 0.49643769727942283, + "flos": 22315442094720.0, + "grad_norm": 1.4246388626256767, + "language_loss": 0.81285727, + "learning_rate": 2.120076673368901e-06, + "loss": 0.889835, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8257, + "time_per_iteration": 2.5366289615631104 + }, + { + "auxiliary_loss_clip": 0.06441522, + "auxiliary_loss_mlp": 0.01265551, + "balance_loss_clip": 0.06281207, + "balance_loss_mlp": 0.01253153, + "epoch": 0.4964978205320908, + "flos": 19506328364160.0, + "grad_norm": 1.7556989119603337, + "language_loss": 0.66651785, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.74358857, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1239624, + "step": 8258, + "time_per_iteration": 2.567802667617798 + }, + { + "auxiliary_loss_clip": 0.06427691, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06277505, + "balance_loss_mlp": 0.0125607, + "epoch": 0.49655794378475876, + "flos": 23442562091520.0, + "grad_norm": 1.5238866764667018, + "language_loss": 0.7778039, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.85474873, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.10723877, + "step": 8259, + "time_per_iteration": 2.5521552562713623 + }, + { + "auxiliary_loss_clip": 0.06430545, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06279439, + "balance_loss_mlp": 0.01254954, + "epoch": 0.4966180670374267, + "flos": 26837967640320.0, + "grad_norm": 1.4589343239403403, + "language_loss": 0.78972054, + "learning_rate": 2.1189103755834e-06, + "loss": 0.86669362, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11816406, + "step": 8260, + "time_per_iteration": 2.6012649536132812 + }, + { + "auxiliary_loss_clip": 0.06434717, + "auxiliary_loss_mlp": 0.01267655, + "balance_loss_clip": 0.06279895, + "balance_loss_mlp": 0.01255055, + "epoch": 0.4966781902900947, + "flos": 22014413902080.0, + "grad_norm": 2.8586716221878206, + "language_loss": 0.76515198, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12591553, + "step": 8261, + "time_per_iteration": 2.4737415313720703 + }, + { + "auxiliary_loss_clip": 0.06427643, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.0627794, + "balance_loss_mlp": 0.01255772, + "epoch": 0.49673831354276266, + "flos": 26220509303040.0, + "grad_norm": 1.7291004140234418, + "language_loss": 0.89456958, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.97151601, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11230469, + "step": 8262, + "time_per_iteration": 2.613236665725708 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01268648, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01256817, + "epoch": 0.4967984367954306, + "flos": 23188464984960.0, + "grad_norm": 1.4347791599980126, + "language_loss": 0.73918176, + "learning_rate": 2.11774403721606e-06, + "loss": 0.81618452, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11834717, + "step": 8263, + "time_per_iteration": 2.595635414123535 + }, + { + "auxiliary_loss_clip": 0.06439725, + "auxiliary_loss_mlp": 0.01274389, + "balance_loss_clip": 0.06283052, + "balance_loss_mlp": 0.01260239, + "epoch": 0.4968585600480986, + "flos": 19287506626560.0, + "grad_norm": 2.258936930728745, + "language_loss": 0.69678748, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.77392858, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.14147949, + "step": 8264, + "time_per_iteration": 2.5913755893707275 + }, + { + "auxiliary_loss_clip": 0.06438377, + "auxiliary_loss_mlp": 0.01267325, + "balance_loss_clip": 0.06281792, + "balance_loss_mlp": 0.01255136, + "epoch": 0.49691868330076655, + "flos": 22535312008320.0, + "grad_norm": 1.388736059607974, + "language_loss": 0.65131235, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.72836947, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12188721, + "step": 8265, + "time_per_iteration": 2.528193473815918 + }, + { + "auxiliary_loss_clip": 0.06333993, + "auxiliary_loss_mlp": 0.01255399, + "balance_loss_clip": 0.06266748, + "balance_loss_mlp": 0.01253268, + "epoch": 0.4969788065534345, + "flos": 66598897328640.0, + "grad_norm": 0.8036364801041208, + "language_loss": 0.53402334, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.60991728, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02133179, + "step": 8266, + "time_per_iteration": 3.1838197708129883 + }, + { + "auxiliary_loss_clip": 0.06428756, + "auxiliary_loss_mlp": 0.01272627, + "balance_loss_clip": 0.06282037, + "balance_loss_mlp": 0.01260592, + "epoch": 0.49703892980610254, + "flos": 24066099849600.0, + "grad_norm": 1.4975664699088878, + "language_loss": 0.79899192, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.87600571, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12036133, + "step": 8267, + "time_per_iteration": 2.556995391845703 + }, + { + "auxiliary_loss_clip": 0.06434017, + "auxiliary_loss_mlp": 0.01269443, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.01256295, + "epoch": 0.4970990530587705, + "flos": 29132807736960.0, + "grad_norm": 3.0454644456900155, + "language_loss": 0.75843596, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.83547056, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.13146973, + "step": 8268, + "time_per_iteration": 2.6049721240997314 + }, + { + "auxiliary_loss_clip": 0.06435575, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01257185, + "epoch": 0.49715917631143847, + "flos": 46036811047680.0, + "grad_norm": 1.4862794016102487, + "language_loss": 0.68007714, + "learning_rate": 2.115411240328073e-06, + "loss": 0.75713372, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12902832, + "step": 8269, + "time_per_iteration": 4.128691911697388 + }, + { + "auxiliary_loss_clip": 0.06433591, + "auxiliary_loss_mlp": 0.01270109, + "balance_loss_clip": 0.06283623, + "balance_loss_mlp": 0.01258444, + "epoch": 0.49721929956410643, + "flos": 20197104624000.0, + "grad_norm": 1.5327488108804688, + "language_loss": 0.85668087, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.93371785, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11669922, + "step": 8270, + "time_per_iteration": 2.518367290496826 + }, + { + "auxiliary_loss_clip": 0.06438391, + "auxiliary_loss_mlp": 0.012695, + "balance_loss_clip": 0.06282806, + "balance_loss_mlp": 0.01258443, + "epoch": 0.4972794228167744, + "flos": 21660108912000.0, + "grad_norm": 1.8194061326909323, + "language_loss": 0.71364737, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7907263, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1105957, + "step": 8271, + "time_per_iteration": 2.5573620796203613 + }, + { + "auxiliary_loss_clip": 0.06437098, + "auxiliary_loss_mlp": 0.01269156, + "balance_loss_clip": 0.06284092, + "balance_loss_mlp": 0.0125646, + "epoch": 0.49733954606944236, + "flos": 24286598668800.0, + "grad_norm": 1.3024187792808712, + "language_loss": 0.78511107, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.86217368, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12677002, + "step": 8272, + "time_per_iteration": 4.061326742172241 + }, + { + "auxiliary_loss_clip": 0.06438889, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06285517, + "balance_loss_mlp": 0.01257548, + "epoch": 0.4973996693221103, + "flos": 37861722172800.0, + "grad_norm": 2.25975995369767, + "language_loss": 0.66725254, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.7443465, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12957764, + "step": 8273, + "time_per_iteration": 2.645908832550049 + }, + { + "auxiliary_loss_clip": 0.06436634, + "auxiliary_loss_mlp": 0.01276274, + "balance_loss_clip": 0.06285357, + "balance_loss_mlp": 0.01264109, + "epoch": 0.4974597925747783, + "flos": 21367885397760.0, + "grad_norm": 1.5281958400790516, + "language_loss": 0.78156513, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.8586942, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12182617, + "step": 8274, + "time_per_iteration": 2.535804271697998 + }, + { + "auxiliary_loss_clip": 0.06437881, + "auxiliary_loss_mlp": 0.0127292, + "balance_loss_clip": 0.06281041, + "balance_loss_mlp": 0.01259992, + "epoch": 0.49751991582744626, + "flos": 30746137449600.0, + "grad_norm": 1.6098675264323796, + "language_loss": 0.76012516, + "learning_rate": 2.113078285889493e-06, + "loss": 0.83723313, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.12939453, + "step": 8275, + "time_per_iteration": 2.5787549018859863 + }, + { + "auxiliary_loss_clip": 0.06438003, + "auxiliary_loss_mlp": 0.01271635, + "balance_loss_clip": 0.06282246, + "balance_loss_mlp": 0.01257789, + "epoch": 0.4975800390801142, + "flos": 14105748683520.0, + "grad_norm": 1.8196816586022186, + "language_loss": 0.84079218, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.91788852, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1385498, + "step": 8276, + "time_per_iteration": 2.5156893730163574 + }, + { + "auxiliary_loss_clip": 0.06426419, + "auxiliary_loss_mlp": 0.01277009, + "balance_loss_clip": 0.06279768, + "balance_loss_mlp": 0.01265398, + "epoch": 0.4976401623327822, + "flos": 24214203141120.0, + "grad_norm": 1.3141436658277077, + "language_loss": 0.70087981, + "learning_rate": 2.112300599949172e-06, + "loss": 0.77791417, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.1161499, + "step": 8277, + "time_per_iteration": 3.9860711097717285 + }, + { + "auxiliary_loss_clip": 0.06429198, + "auxiliary_loss_mlp": 0.01270973, + "balance_loss_clip": 0.06280812, + "balance_loss_mlp": 0.01258754, + "epoch": 0.49770028558545015, + "flos": 21142229552640.0, + "grad_norm": 1.8219149953370526, + "language_loss": 0.82141137, + "learning_rate": 2.111911750583964e-06, + "loss": 0.89841306, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12231445, + "step": 8278, + "time_per_iteration": 2.5353100299835205 + }, + { + "auxiliary_loss_clip": 0.06435424, + "auxiliary_loss_mlp": 0.01268936, + "balance_loss_clip": 0.06279474, + "balance_loss_mlp": 0.01256246, + "epoch": 0.4977604088381181, + "flos": 16769568234240.0, + "grad_norm": 1.8298360040603827, + "language_loss": 0.68205428, + "learning_rate": 2.111522896975052e-06, + "loss": 0.75909793, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12695312, + "step": 8279, + "time_per_iteration": 2.538273334503174 + }, + { + "auxiliary_loss_clip": 0.06430422, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01258129, + "epoch": 0.49782053209078614, + "flos": 15708596636160.0, + "grad_norm": 1.929140490148881, + "language_loss": 0.70948005, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.78650236, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13665771, + "step": 8280, + "time_per_iteration": 2.5344486236572266 + }, + { + "auxiliary_loss_clip": 0.06432884, + "auxiliary_loss_mlp": 0.01270682, + "balance_loss_clip": 0.06279922, + "balance_loss_mlp": 0.01257331, + "epoch": 0.4978806553434541, + "flos": 24760565688960.0, + "grad_norm": 1.4498126802552027, + "language_loss": 0.6468308, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.72386646, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13366699, + "step": 8281, + "time_per_iteration": 2.5905003547668457 + }, + { + "auxiliary_loss_clip": 0.06432123, + "auxiliary_loss_mlp": 0.01269379, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01256141, + "epoch": 0.49794077859612207, + "flos": 13120820265600.0, + "grad_norm": 2.543831826961268, + "language_loss": 0.73404002, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.81105494, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13226318, + "step": 8282, + "time_per_iteration": 2.481513023376465 + }, + { + "auxiliary_loss_clip": 0.06433594, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.062822, + "balance_loss_mlp": 0.01260748, + "epoch": 0.49800090184879003, + "flos": 27532223844480.0, + "grad_norm": 1.4555237952962066, + "language_loss": 0.7312296, + "learning_rate": 2.109967440397263e-06, + "loss": 0.80828691, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.1138916, + "step": 8283, + "time_per_iteration": 4.015530824661255 + }, + { + "auxiliary_loss_clip": 0.06430134, + "auxiliary_loss_mlp": 0.01267653, + "balance_loss_clip": 0.06279625, + "balance_loss_mlp": 0.01254791, + "epoch": 0.498061025101458, + "flos": 19798677659520.0, + "grad_norm": 1.429490370630744, + "language_loss": 0.78535879, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8623367, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12860107, + "step": 8284, + "time_per_iteration": 2.4994332790374756 + }, + { + "auxiliary_loss_clip": 0.06437389, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06278685, + "balance_loss_mlp": 0.01255864, + "epoch": 0.49812114835412596, + "flos": 29900926915200.0, + "grad_norm": 1.711585124439885, + "language_loss": 0.7343573, + "learning_rate": 2.109189687029526e-06, + "loss": 0.81143022, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.14044189, + "step": 8285, + "time_per_iteration": 2.566572904586792 + }, + { + "auxiliary_loss_clip": 0.06430154, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 0.0627718, + "balance_loss_mlp": 0.01258404, + "epoch": 0.49818127160679393, + "flos": 23153441178240.0, + "grad_norm": 1.4871294259616603, + "language_loss": 0.74281567, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.81982332, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 8286, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.06434155, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06279751, + "balance_loss_mlp": 0.0125358, + "epoch": 0.4982413948594619, + "flos": 21659228444160.0, + "grad_norm": 1.6982664351725185, + "language_loss": 0.85701174, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.93401492, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12579346, + "step": 8287, + "time_per_iteration": 2.518136501312256 + }, + { + "auxiliary_loss_clip": 0.06432185, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 0.06276216, + "balance_loss_mlp": 0.01256801, + "epoch": 0.49830151811212986, + "flos": 32494866560640.0, + "grad_norm": 1.6945408763753198, + "language_loss": 0.72708082, + "learning_rate": 2.108023025961159e-06, + "loss": 0.80410802, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.13745117, + "step": 8288, + "time_per_iteration": 2.590862512588501 + }, + { + "auxiliary_loss_clip": 0.06436619, + "auxiliary_loss_mlp": 0.01272174, + "balance_loss_clip": 0.0627879, + "balance_loss_mlp": 0.01258972, + "epoch": 0.4983616413647978, + "flos": 18146886122880.0, + "grad_norm": 4.0455531591406855, + "language_loss": 0.81054366, + "learning_rate": 2.10763413072622e-06, + "loss": 0.8876316, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.13201904, + "step": 8289, + "time_per_iteration": 2.504817008972168 + }, + { + "auxiliary_loss_clip": 0.06432903, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06279443, + "balance_loss_mlp": 0.01257074, + "epoch": 0.4984217646174658, + "flos": 19724898539520.0, + "grad_norm": 2.471620750065275, + "language_loss": 0.73847377, + "learning_rate": 2.107245231409784e-06, + "loss": 0.81550646, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13305664, + "step": 8290, + "time_per_iteration": 2.492176055908203 + }, + { + "auxiliary_loss_clip": 0.0643364, + "auxiliary_loss_mlp": 0.01275224, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01261157, + "epoch": 0.49848188787013376, + "flos": 24943525079040.0, + "grad_norm": 1.4456375643187662, + "language_loss": 0.84330356, + "learning_rate": 2.106856328026598e-06, + "loss": 0.92039216, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.140625, + "step": 8291, + "time_per_iteration": 2.5577101707458496 + }, + { + "auxiliary_loss_clip": 0.06438746, + "auxiliary_loss_mlp": 0.01270664, + "balance_loss_clip": 0.06277075, + "balance_loss_mlp": 0.01257379, + "epoch": 0.4985420111228017, + "flos": 22388969652480.0, + "grad_norm": 1.8626179833436056, + "language_loss": 0.67868197, + "learning_rate": 2.106467420591409e-06, + "loss": 0.75577605, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.13275146, + "step": 8292, + "time_per_iteration": 2.5227880477905273 + }, + { + "auxiliary_loss_clip": 0.06428275, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06275518, + "balance_loss_mlp": 0.01256977, + "epoch": 0.4986021343754697, + "flos": 16221989802240.0, + "grad_norm": 1.635019918785358, + "language_loss": 0.67247725, + "learning_rate": 2.106078509118965e-06, + "loss": 0.749448, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11798096, + "step": 8293, + "time_per_iteration": 2.5051913261413574 + }, + { + "auxiliary_loss_clip": 0.0643108, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275735, + "balance_loss_mlp": 0.01258891, + "epoch": 0.4986622576281377, + "flos": 23410221615360.0, + "grad_norm": 1.789605024821123, + "language_loss": 0.82488304, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.90189755, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.11480713, + "step": 8294, + "time_per_iteration": 2.5429139137268066 + }, + { + "auxiliary_loss_clip": 0.06432615, + "auxiliary_loss_mlp": 0.01272563, + "balance_loss_clip": 0.06277893, + "balance_loss_mlp": 0.01260315, + "epoch": 0.49872238088080567, + "flos": 19980714654720.0, + "grad_norm": 2.5766475970916285, + "language_loss": 0.73639232, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.81344408, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12249756, + "step": 8295, + "time_per_iteration": 2.535090923309326 + }, + { + "auxiliary_loss_clip": 0.06427556, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06276329, + "balance_loss_mlp": 0.01259911, + "epoch": 0.49878250413347364, + "flos": 22899595633920.0, + "grad_norm": 1.8257233918976585, + "language_loss": 0.68199098, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.75899148, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12591553, + "step": 8296, + "time_per_iteration": 2.5079848766326904 + }, + { + "auxiliary_loss_clip": 0.06433527, + "auxiliary_loss_mlp": 0.01272036, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.0125878, + "epoch": 0.4988426273861416, + "flos": 32606688234240.0, + "grad_norm": 1.801119189108274, + "language_loss": 0.64925557, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.72631121, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13256836, + "step": 8297, + "time_per_iteration": 2.6275887489318848 + }, + { + "auxiliary_loss_clip": 0.06427586, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06278079, + "balance_loss_mlp": 0.01258845, + "epoch": 0.49890275063880957, + "flos": 20929990360320.0, + "grad_norm": 1.5890674789628483, + "language_loss": 0.69987392, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.77685434, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11621094, + "step": 8298, + "time_per_iteration": 2.527082681655884 + }, + { + "auxiliary_loss_clip": 0.06428695, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01253668, + "epoch": 0.49896287389147753, + "flos": 18630370581120.0, + "grad_norm": 3.032196085375079, + "language_loss": 0.85047698, + "learning_rate": 2.103744956327814e-06, + "loss": 0.92741591, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11517334, + "step": 8299, + "time_per_iteration": 2.531541585922241 + }, + { + "auxiliary_loss_clip": 0.06429411, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06274673, + "balance_loss_mlp": 0.0125412, + "epoch": 0.4990229971441455, + "flos": 24833422414080.0, + "grad_norm": 2.041795476236588, + "language_loss": 0.69284618, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.76981199, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.13061523, + "step": 8300, + "time_per_iteration": 2.562002658843994 + }, + { + "auxiliary_loss_clip": 0.0633271, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06265618, + "balance_loss_mlp": 0.01267531, + "epoch": 0.49908312039681346, + "flos": 71405638323840.0, + "grad_norm": 0.7392878070409407, + "language_loss": 0.51101816, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.58704311, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02253723, + "step": 8301, + "time_per_iteration": 3.3210127353668213 + }, + { + "auxiliary_loss_clip": 0.06423864, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06275457, + "balance_loss_mlp": 0.01258173, + "epoch": 0.4991432436494814, + "flos": 19834791569280.0, + "grad_norm": 2.2486532521822302, + "language_loss": 0.84452468, + "learning_rate": 2.102578126623879e-06, + "loss": 0.921471, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12591553, + "step": 8302, + "time_per_iteration": 2.547562837600708 + }, + { + "auxiliary_loss_clip": 0.06428537, + "auxiliary_loss_mlp": 0.01271397, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.01259607, + "epoch": 0.4992033669021494, + "flos": 15127252208640.0, + "grad_norm": 1.6659174741740037, + "language_loss": 0.69610626, + "learning_rate": 2.102189175590024e-06, + "loss": 0.77310562, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11785889, + "step": 8303, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06429437, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.0627458, + "balance_loss_mlp": 0.01253851, + "epoch": 0.49926349015481736, + "flos": 31215282860160.0, + "grad_norm": 1.7036998151712766, + "language_loss": 0.72999942, + "learning_rate": 2.101800220681144e-06, + "loss": 0.80695617, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.1239624, + "step": 8304, + "time_per_iteration": 2.611502170562744 + }, + { + "auxiliary_loss_clip": 0.0642409, + "auxiliary_loss_mlp": 0.0126995, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01257683, + "epoch": 0.4993236134074853, + "flos": 24907201534080.0, + "grad_norm": 2.0593873642803486, + "language_loss": 0.81677687, + "learning_rate": 2.10141126191199e-06, + "loss": 0.89371729, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1227417, + "step": 8305, + "time_per_iteration": 2.57425594329834 + }, + { + "auxiliary_loss_clip": 0.0632831, + "auxiliary_loss_mlp": 0.01255041, + "balance_loss_clip": 0.06261367, + "balance_loss_mlp": 0.01252826, + "epoch": 0.4993837366601533, + "flos": 70438962896640.0, + "grad_norm": 0.7837813432026206, + "language_loss": 0.56909657, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.64493006, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.02220154, + "step": 8306, + "time_per_iteration": 3.2806143760681152 + }, + { + "auxiliary_loss_clip": 0.06430675, + "auxiliary_loss_mlp": 0.01269703, + "balance_loss_clip": 0.06278585, + "balance_loss_mlp": 0.01255422, + "epoch": 0.4994438599128213, + "flos": 15966718738560.0, + "grad_norm": 1.7475082532303507, + "language_loss": 0.83157074, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.90857446, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.1427002, + "step": 8307, + "time_per_iteration": 2.4851419925689697 + }, + { + "auxiliary_loss_clip": 0.06426803, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 0.06277731, + "balance_loss_mlp": 0.01258458, + "epoch": 0.4995039831654893, + "flos": 27935765907840.0, + "grad_norm": 1.9977557260500436, + "language_loss": 0.61003512, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.68701947, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.13189697, + "step": 8308, + "time_per_iteration": 2.5943245887756348 + }, + { + "auxiliary_loss_clip": 0.06426641, + "auxiliary_loss_mlp": 0.01271422, + "balance_loss_clip": 0.06278297, + "balance_loss_mlp": 0.01259948, + "epoch": 0.49956410641815724, + "flos": 24211310175360.0, + "grad_norm": 1.573691211270805, + "language_loss": 0.74911636, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.82609695, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11468506, + "step": 8309, + "time_per_iteration": 3.9743635654449463 + }, + { + "auxiliary_loss_clip": 0.06430435, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.0125578, + "epoch": 0.4996242296708252, + "flos": 16185666257280.0, + "grad_norm": 2.033466484631739, + "language_loss": 0.80080384, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.87779051, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12438965, + "step": 8310, + "time_per_iteration": 2.475815534591675 + }, + { + "auxiliary_loss_clip": 0.06429116, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01254541, + "epoch": 0.49968435292349317, + "flos": 16879209701760.0, + "grad_norm": 1.5486293297173337, + "language_loss": 0.71370041, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.79066527, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.12823486, + "step": 8311, + "time_per_iteration": 4.01245641708374 + }, + { + "auxiliary_loss_clip": 0.06428856, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06277557, + "balance_loss_mlp": 0.01254636, + "epoch": 0.49974447617616113, + "flos": 14944837870080.0, + "grad_norm": 1.8003339909908787, + "language_loss": 0.77129757, + "learning_rate": 2.098688443679187e-06, + "loss": 0.8482464, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11401367, + "step": 8312, + "time_per_iteration": 2.4761128425598145 + }, + { + "auxiliary_loss_clip": 0.0643132, + "auxiliary_loss_mlp": 0.01266437, + "balance_loss_clip": 0.06279029, + "balance_loss_mlp": 0.01254206, + "epoch": 0.4998045994288291, + "flos": 26658823610880.0, + "grad_norm": 1.6524127143489034, + "language_loss": 0.84981465, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.9267922, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12231445, + "step": 8313, + "time_per_iteration": 2.6057398319244385 + }, + { + "auxiliary_loss_clip": 0.06431891, + "auxiliary_loss_mlp": 0.01267877, + "balance_loss_clip": 0.06279939, + "balance_loss_mlp": 0.01256224, + "epoch": 0.49986472268149706, + "flos": 20959102454400.0, + "grad_norm": 1.6979548607445847, + "language_loss": 0.81193811, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8889358, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11657715, + "step": 8314, + "time_per_iteration": 2.5246880054473877 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01269627, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01256341, + "epoch": 0.49992484593416503, + "flos": 22790499217920.0, + "grad_norm": 1.7217224756504992, + "language_loss": 0.79857439, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.8755725, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.13305664, + "step": 8315, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01267686, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.0125595, + "epoch": 0.499984969186833, + "flos": 46796838307200.0, + "grad_norm": 1.6656557215916168, + "language_loss": 0.74803257, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.82498288, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11749268, + "step": 8316, + "time_per_iteration": 4.178734540939331 + }, + { + "auxiliary_loss_clip": 0.06424455, + "auxiliary_loss_mlp": 0.01269425, + "balance_loss_clip": 0.0627817, + "balance_loss_mlp": 0.01258083, + "epoch": 0.500045092439501, + "flos": 25564086017280.0, + "grad_norm": 1.744541126829246, + "language_loss": 0.81478661, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.89172542, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11346436, + "step": 8317, + "time_per_iteration": 2.537320613861084 + }, + { + "auxiliary_loss_clip": 0.06427011, + "auxiliary_loss_mlp": 0.01270425, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5001052156921689, + "flos": 20711126695680.0, + "grad_norm": 1.5732702518161361, + "language_loss": 0.83390272, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.91087711, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12835693, + "step": 8318, + "time_per_iteration": 2.534135103225708 + }, + { + "auxiliary_loss_clip": 0.06428336, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06277522, + "balance_loss_mlp": 0.01257109, + "epoch": 0.500165338944837, + "flos": 21257405389440.0, + "grad_norm": 1.6807233025456896, + "language_loss": 0.82012349, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.89709824, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12030029, + "step": 8319, + "time_per_iteration": 2.515835762023926 + }, + { + "auxiliary_loss_clip": 0.06428086, + "auxiliary_loss_mlp": 0.01265652, + "balance_loss_clip": 0.0627624, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5002254621975049, + "flos": 27861693298560.0, + "grad_norm": 1.6360150103182107, + "language_loss": 0.72118968, + "learning_rate": 2.095576427171635e-06, + "loss": 0.79812706, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.10955811, + "step": 8320, + "time_per_iteration": 2.5796635150909424 + }, + { + "auxiliary_loss_clip": 0.06441814, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5002855854501729, + "flos": 15556049078400.0, + "grad_norm": 2.4313263695255696, + "language_loss": 0.76678413, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.84387517, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.13439941, + "step": 8321, + "time_per_iteration": 2.4691002368927 + }, + { + "auxiliary_loss_clip": 0.06428922, + "auxiliary_loss_mlp": 0.01268744, + "balance_loss_clip": 0.06276058, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5003457087028408, + "flos": 16112977240320.0, + "grad_norm": 1.7492839336280708, + "language_loss": 0.82910907, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.90608579, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13183594, + "step": 8322, + "time_per_iteration": 2.515460252761841 + }, + { + "auxiliary_loss_clip": 0.06431515, + "auxiliary_loss_mlp": 0.0126974, + "balance_loss_clip": 0.06279334, + "balance_loss_mlp": 0.01256973, + "epoch": 0.5004058319555088, + "flos": 22717055514240.0, + "grad_norm": 3.787468052495824, + "language_loss": 0.74021679, + "learning_rate": 2.094409360775228e-06, + "loss": 0.81722933, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12774658, + "step": 8323, + "time_per_iteration": 3.9577157497406006 + }, + { + "auxiliary_loss_clip": 0.06425107, + "auxiliary_loss_mlp": 0.01267421, + "balance_loss_clip": 0.06273489, + "balance_loss_mlp": 0.01254761, + "epoch": 0.5004659552081767, + "flos": 30125870000640.0, + "grad_norm": 1.569659839153646, + "language_loss": 0.69694078, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.77386606, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.12670898, + "step": 8324, + "time_per_iteration": 2.5927038192749023 + }, + { + "auxiliary_loss_clip": 0.06426285, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.012554, + "epoch": 0.5005260784608447, + "flos": 18630664070400.0, + "grad_norm": 1.9637621432589805, + "language_loss": 0.72455752, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.80149603, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12164307, + "step": 8325, + "time_per_iteration": 2.5748932361602783 + }, + { + "auxiliary_loss_clip": 0.06431422, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06278826, + "balance_loss_mlp": 0.01253069, + "epoch": 0.5005862017135126, + "flos": 24866349868800.0, + "grad_norm": 1.7160687334315328, + "language_loss": 0.73386943, + "learning_rate": 2.093242262158709e-06, + "loss": 0.8108452, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.13085938, + "step": 8326, + "time_per_iteration": 2.5720608234405518 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01267135, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01255763, + "epoch": 0.5006463249661807, + "flos": 18740389392000.0, + "grad_norm": 1.5629486934520718, + "language_loss": 0.78059208, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.85753143, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11364746, + "step": 8327, + "time_per_iteration": 2.5033681392669678 + }, + { + "auxiliary_loss_clip": 0.06429915, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06277432, + "balance_loss_mlp": 0.01254533, + "epoch": 0.5007064482188487, + "flos": 13047124999680.0, + "grad_norm": 2.5584329331081253, + "language_loss": 0.88066995, + "learning_rate": 2.092464178710997e-06, + "loss": 0.95763773, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12329102, + "step": 8328, + "time_per_iteration": 2.469723701477051 + }, + { + "auxiliary_loss_clip": 0.06430298, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01254302, + "epoch": 0.5007665714715166, + "flos": 21295154453760.0, + "grad_norm": 2.120857663767784, + "language_loss": 0.74578768, + "learning_rate": 2.092075131720388e-06, + "loss": 0.82276416, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1305542, + "step": 8329, + "time_per_iteration": 2.527421236038208 + }, + { + "auxiliary_loss_clip": 0.06427623, + "auxiliary_loss_mlp": 0.01269321, + "balance_loss_clip": 0.06278372, + "balance_loss_mlp": 0.01257626, + "epoch": 0.5008266947241846, + "flos": 29762676478080.0, + "grad_norm": 1.5806360237517383, + "language_loss": 0.80007339, + "learning_rate": 2.091686081238281e-06, + "loss": 0.87704277, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11688232, + "step": 8330, + "time_per_iteration": 2.589132785797119 + }, + { + "auxiliary_loss_clip": 0.063256, + "auxiliary_loss_mlp": 0.01256172, + "balance_loss_clip": 0.06259131, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5008868179768525, + "flos": 63574498460160.0, + "grad_norm": 0.7051231310601146, + "language_loss": 0.56005836, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.63587606, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01960754, + "step": 8331, + "time_per_iteration": 2.9798707962036133 + }, + { + "auxiliary_loss_clip": 0.06425481, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256125, + "epoch": 0.5009469412295205, + "flos": 27382108055040.0, + "grad_norm": 1.8793466545943338, + "language_loss": 0.65444684, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.73137867, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11584473, + "step": 8332, + "time_per_iteration": 2.548846483230591 + }, + { + "auxiliary_loss_clip": 0.06424412, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01253578, + "epoch": 0.5010070644821885, + "flos": 27385839561600.0, + "grad_norm": 1.4154143625456153, + "language_loss": 0.75122535, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.82812029, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1149292, + "step": 8333, + "time_per_iteration": 2.600377082824707 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01268641, + "balance_loss_clip": 0.06276083, + "balance_loss_mlp": 0.01256481, + "epoch": 0.5010671877348565, + "flos": 20668178678400.0, + "grad_norm": 1.9411742898612023, + "language_loss": 0.80806357, + "learning_rate": 2.090129844689929e-06, + "loss": 0.88504034, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12158203, + "step": 8334, + "time_per_iteration": 2.490330457687378 + }, + { + "auxiliary_loss_clip": 0.0633373, + "auxiliary_loss_mlp": 0.01254486, + "balance_loss_clip": 0.06267349, + "balance_loss_mlp": 0.01252466, + "epoch": 0.5011273109875244, + "flos": 59148266855040.0, + "grad_norm": 0.880609822046852, + "language_loss": 0.62818438, + "learning_rate": 2.089740776971626e-06, + "loss": 0.70406651, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.02020264, + "step": 8335, + "time_per_iteration": 3.1081318855285645 + }, + { + "auxiliary_loss_clip": 0.06426011, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06278515, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5011874342401924, + "flos": 25343126000640.0, + "grad_norm": 1.3778270209342711, + "language_loss": 0.80092967, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.8778491, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.105896, + "step": 8336, + "time_per_iteration": 2.5390379428863525 + }, + { + "auxiliary_loss_clip": 0.06428748, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06278357, + "balance_loss_mlp": 0.01254923, + "epoch": 0.5012475574928603, + "flos": 20236153426560.0, + "grad_norm": 1.7537768303990948, + "language_loss": 0.81054461, + "learning_rate": 2.088962631340836e-06, + "loss": 0.88749969, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11834717, + "step": 8337, + "time_per_iteration": 2.5480427742004395 + }, + { + "auxiliary_loss_clip": 0.06436703, + "auxiliary_loss_mlp": 0.01267216, + "balance_loss_clip": 0.06279006, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5013076807455283, + "flos": 22716594316800.0, + "grad_norm": 1.7916878418610642, + "language_loss": 0.79506505, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.87210429, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.12255859, + "step": 8338, + "time_per_iteration": 2.5164718627929688 + }, + { + "auxiliary_loss_clip": 0.0643065, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01253176, + "epoch": 0.5013678039981962, + "flos": 24252329548800.0, + "grad_norm": 1.5889596080337545, + "language_loss": 0.85034919, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.9273085, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12127686, + "step": 8339, + "time_per_iteration": 2.5785508155822754 + }, + { + "auxiliary_loss_clip": 0.06426719, + "auxiliary_loss_mlp": 0.01269107, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.0125814, + "epoch": 0.5014279272508643, + "flos": 26183808414720.0, + "grad_norm": 1.5165096284579775, + "language_loss": 0.71162677, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.78858501, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10968018, + "step": 8340, + "time_per_iteration": 2.5929582118988037 + }, + { + "auxiliary_loss_clip": 0.06433477, + "auxiliary_loss_mlp": 0.01270076, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.01256867, + "epoch": 0.5014880505035323, + "flos": 21436255929600.0, + "grad_norm": 2.442832877053188, + "language_loss": 0.7829324, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.85996789, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.13208008, + "step": 8341, + "time_per_iteration": 2.5200908184051514 + }, + { + "auxiliary_loss_clip": 0.06435034, + "auxiliary_loss_mlp": 0.01267489, + "balance_loss_clip": 0.062792, + "balance_loss_mlp": 0.01255407, + "epoch": 0.5015481737562002, + "flos": 15774870816000.0, + "grad_norm": 2.1824930872588917, + "language_loss": 0.89806843, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.97509372, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12091064, + "step": 8342, + "time_per_iteration": 2.502265691757202 + }, + { + "auxiliary_loss_clip": 0.06427857, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06276843, + "balance_loss_mlp": 0.0125275, + "epoch": 0.5016082970088682, + "flos": 26837590296960.0, + "grad_norm": 1.7003073455140034, + "language_loss": 0.76872855, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.84565264, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11804199, + "step": 8343, + "time_per_iteration": 2.5502099990844727 + }, + { + "auxiliary_loss_clip": 0.06426306, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0627844, + "balance_loss_mlp": 0.01256724, + "epoch": 0.5016684202615361, + "flos": 21477023740800.0, + "grad_norm": 3.7325470711422466, + "language_loss": 0.67772466, + "learning_rate": 2.086239016143293e-06, + "loss": 0.75466394, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10900879, + "step": 8344, + "time_per_iteration": 2.5443081855773926 + }, + { + "auxiliary_loss_clip": 0.06429319, + "auxiliary_loss_mlp": 0.01271563, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01259803, + "epoch": 0.5017285435142042, + "flos": 26253478684800.0, + "grad_norm": 2.15637603402593, + "language_loss": 0.75492197, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.83193076, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11767578, + "step": 8345, + "time_per_iteration": 2.5757455825805664 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01267207, + "balance_loss_clip": 0.06275543, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5017886667668721, + "flos": 20783899566720.0, + "grad_norm": 2.131359070350305, + "language_loss": 0.78573453, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.86266983, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12805176, + "step": 8346, + "time_per_iteration": 2.5463459491729736 + }, + { + "auxiliary_loss_clip": 0.06428749, + "auxiliary_loss_mlp": 0.012678, + "balance_loss_clip": 0.0627691, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5018487900195401, + "flos": 20162500087680.0, + "grad_norm": 1.4665059060371557, + "language_loss": 0.69395542, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.77092093, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11608887, + "step": 8347, + "time_per_iteration": 2.5277669429779053 + }, + { + "auxiliary_loss_clip": 0.06433204, + "auxiliary_loss_mlp": 0.0126827, + "balance_loss_clip": 0.06278361, + "balance_loss_mlp": 0.01256236, + "epoch": 0.501908913272208, + "flos": 18156613196160.0, + "grad_norm": 2.582566868470837, + "language_loss": 0.7215631, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.79857785, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12030029, + "step": 8348, + "time_per_iteration": 3.996784210205078 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01269515, + "balance_loss_clip": 0.06277803, + "balance_loss_mlp": 0.01258166, + "epoch": 0.501969036524876, + "flos": 23118962423040.0, + "grad_norm": 1.4308074213434065, + "language_loss": 0.74796462, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.82490146, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11346436, + "step": 8349, + "time_per_iteration": 2.5489115715026855 + }, + { + "auxiliary_loss_clip": 0.06429881, + "auxiliary_loss_mlp": 0.01269935, + "balance_loss_clip": 0.06276442, + "balance_loss_mlp": 0.01257442, + "epoch": 0.5020291597775439, + "flos": 11367814596480.0, + "grad_norm": 1.898459652208493, + "language_loss": 0.63674343, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.71374166, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12493896, + "step": 8350, + "time_per_iteration": 2.487217426300049 + }, + { + "auxiliary_loss_clip": 0.06323833, + "auxiliary_loss_mlp": 0.01259522, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01257642, + "epoch": 0.5020892830302119, + "flos": 64030422124800.0, + "grad_norm": 0.7586308907420236, + "language_loss": 0.59914774, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6749813, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01876831, + "step": 8351, + "time_per_iteration": 4.69463324546814 + }, + { + "auxiliary_loss_clip": 0.06434566, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06280354, + "balance_loss_mlp": 0.01258029, + "epoch": 0.5021494062828799, + "flos": 23739691069440.0, + "grad_norm": 1.6219034526425078, + "language_loss": 0.75496215, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.83200288, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11474609, + "step": 8352, + "time_per_iteration": 2.5164549350738525 + }, + { + "auxiliary_loss_clip": 0.06428628, + "auxiliary_loss_mlp": 0.01267422, + "balance_loss_clip": 0.06277371, + "balance_loss_mlp": 0.01254845, + "epoch": 0.5022095295355479, + "flos": 21582640212480.0, + "grad_norm": 1.8174761726271038, + "language_loss": 0.71818656, + "learning_rate": 2.082736990429464e-06, + "loss": 0.795147, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12573242, + "step": 8353, + "time_per_iteration": 2.51479172706604 + }, + { + "auxiliary_loss_clip": 0.06434356, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06281401, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5022696527882159, + "flos": 21403580037120.0, + "grad_norm": 2.9144841273148154, + "language_loss": 0.74235505, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.81938022, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12060547, + "step": 8354, + "time_per_iteration": 2.5085036754608154 + }, + { + "auxiliary_loss_clip": 0.06431521, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06281638, + "balance_loss_mlp": 0.01256216, + "epoch": 0.5023297760408838, + "flos": 27167814437760.0, + "grad_norm": 1.5801517406711547, + "language_loss": 0.7257005, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.80269539, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11755371, + "step": 8355, + "time_per_iteration": 2.559136152267456 + }, + { + "auxiliary_loss_clip": 0.06435544, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06278937, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5023898992935518, + "flos": 26221054354560.0, + "grad_norm": 1.801551244152151, + "language_loss": 0.8142066, + "learning_rate": 2.081569591520548e-06, + "loss": 0.89124179, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1282959, + "step": 8356, + "time_per_iteration": 3.978407144546509 + }, + { + "auxiliary_loss_clip": 0.06435513, + "auxiliary_loss_mlp": 0.01268474, + "balance_loss_clip": 0.06275411, + "balance_loss_mlp": 0.01255272, + "epoch": 0.5024500225462197, + "flos": 13444839204480.0, + "grad_norm": 2.072167033386685, + "language_loss": 0.7662456, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.84328556, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.13201904, + "step": 8357, + "time_per_iteration": 2.488581657409668 + }, + { + "auxiliary_loss_clip": 0.06431419, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01258758, + "epoch": 0.5025101457988878, + "flos": 21585952448640.0, + "grad_norm": 1.5828459742560037, + "language_loss": 0.76457655, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.84161162, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.13342285, + "step": 8358, + "time_per_iteration": 2.62697434425354 + }, + { + "auxiliary_loss_clip": 0.06429468, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276305, + "balance_loss_mlp": 0.01253877, + "epoch": 0.5025702690515557, + "flos": 24652140105600.0, + "grad_norm": 2.247340947262335, + "language_loss": 0.72276986, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.79972816, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12481689, + "step": 8359, + "time_per_iteration": 2.577232599258423 + }, + { + "auxiliary_loss_clip": 0.0642844, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06277584, + "balance_loss_mlp": 0.01255263, + "epoch": 0.5026303923042237, + "flos": 22096578430080.0, + "grad_norm": 1.7221298639434877, + "language_loss": 0.77017021, + "learning_rate": 2.080013016407077e-06, + "loss": 0.84713173, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12451172, + "step": 8360, + "time_per_iteration": 2.5449211597442627 + }, + { + "auxiliary_loss_clip": 0.0642498, + "auxiliary_loss_mlp": 0.01267029, + "balance_loss_clip": 0.06274442, + "balance_loss_mlp": 0.0125571, + "epoch": 0.5026905155568916, + "flos": 23704164138240.0, + "grad_norm": 3.319216273479951, + "language_loss": 0.76811969, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.84503973, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11322021, + "step": 8361, + "time_per_iteration": 2.5360496044158936 + }, + { + "auxiliary_loss_clip": 0.06433755, + "auxiliary_loss_mlp": 0.01271718, + "balance_loss_clip": 0.06276754, + "balance_loss_mlp": 0.01258641, + "epoch": 0.5027506388095596, + "flos": 25819566716160.0, + "grad_norm": 1.6478894806212292, + "language_loss": 0.85182559, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.92888033, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13067627, + "step": 8362, + "time_per_iteration": 4.023087739944458 + }, + { + "auxiliary_loss_clip": 0.06433062, + "auxiliary_loss_mlp": 0.01266272, + "balance_loss_clip": 0.06277543, + "balance_loss_mlp": 0.01253851, + "epoch": 0.5028107620622275, + "flos": 27533942853120.0, + "grad_norm": 1.6676304720736304, + "language_loss": 0.79210544, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.86909878, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12420654, + "step": 8363, + "time_per_iteration": 2.610635757446289 + }, + { + "auxiliary_loss_clip": 0.0642155, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01255903, + "epoch": 0.5028708853148955, + "flos": 24541031191680.0, + "grad_norm": 2.470464307064636, + "language_loss": 0.76251006, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12493896, + "step": 8364, + "time_per_iteration": 2.510077953338623 + }, + { + "auxiliary_loss_clip": 0.06429755, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627771, + "balance_loss_mlp": 0.0125556, + "epoch": 0.5029310085675635, + "flos": 20819887695360.0, + "grad_norm": 1.5150578704653515, + "language_loss": 0.69785869, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.77482712, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11529541, + "step": 8365, + "time_per_iteration": 2.523810386657715 + }, + { + "auxiliary_loss_clip": 0.064358, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06276847, + "balance_loss_mlp": 0.01254365, + "epoch": 0.5029911318202315, + "flos": 22348411476480.0, + "grad_norm": 1.5746180090110224, + "language_loss": 0.73351806, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.81055391, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.13439941, + "step": 8366, + "time_per_iteration": 2.538522481918335 + }, + { + "auxiliary_loss_clip": 0.06433431, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06282506, + "balance_loss_mlp": 0.01254324, + "epoch": 0.5030512550728995, + "flos": 24359581175040.0, + "grad_norm": 1.43168858878555, + "language_loss": 0.78766662, + "learning_rate": 2.077288893713735e-06, + "loss": 0.86466694, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12268066, + "step": 8367, + "time_per_iteration": 2.58542799949646 + }, + { + "auxiliary_loss_clip": 0.064292, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01255835, + "epoch": 0.5031113783255674, + "flos": 18265835393280.0, + "grad_norm": 1.7642536194953051, + "language_loss": 0.70319581, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.78016406, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11804199, + "step": 8368, + "time_per_iteration": 2.4808216094970703 + }, + { + "auxiliary_loss_clip": 0.06318872, + "auxiliary_loss_mlp": 0.01256661, + "balance_loss_clip": 0.06252527, + "balance_loss_mlp": 0.01254704, + "epoch": 0.5031715015782354, + "flos": 57270022859520.0, + "grad_norm": 0.9058846668072361, + "language_loss": 0.63429594, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.7100513, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01954651, + "step": 8369, + "time_per_iteration": 3.0813984870910645 + }, + { + "auxiliary_loss_clip": 0.06425582, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06275157, + "balance_loss_mlp": 0.01256873, + "epoch": 0.5032316248309033, + "flos": 27534823320960.0, + "grad_norm": 1.9780482072247232, + "language_loss": 0.60450232, + "learning_rate": 2.076121368302263e-06, + "loss": 0.68144017, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11328125, + "step": 8370, + "time_per_iteration": 2.6361827850341797 + }, + { + "auxiliary_loss_clip": 0.06429368, + "auxiliary_loss_mlp": 0.01269199, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01255901, + "epoch": 0.5032917480835714, + "flos": 34504401104640.0, + "grad_norm": 1.6209694165930644, + "language_loss": 0.68475735, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.76174301, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13293457, + "step": 8371, + "time_per_iteration": 2.6757090091705322 + }, + { + "auxiliary_loss_clip": 0.06428707, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01256158, + "epoch": 0.5033518713362393, + "flos": 33665228064000.0, + "grad_norm": 1.992355635042309, + "language_loss": 0.67781597, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.75479841, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13397217, + "step": 8372, + "time_per_iteration": 2.625875234603882 + }, + { + "auxiliary_loss_clip": 0.06429783, + "auxiliary_loss_mlp": 0.0126941, + "balance_loss_clip": 0.06275001, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5034119945889073, + "flos": 28193301031680.0, + "grad_norm": 1.502668832263038, + "language_loss": 0.67200899, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.74900091, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13604736, + "step": 8373, + "time_per_iteration": 2.605649709701538 + }, + { + "auxiliary_loss_clip": 0.06426984, + "auxiliary_loss_mlp": 0.01270724, + "balance_loss_clip": 0.06274835, + "balance_loss_mlp": 0.01258362, + "epoch": 0.5034721178415752, + "flos": 21364698942720.0, + "grad_norm": 1.6635937081301206, + "language_loss": 0.75186062, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.82883763, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.12365723, + "step": 8374, + "time_per_iteration": 2.503739595413208 + }, + { + "auxiliary_loss_clip": 0.06431206, + "auxiliary_loss_mlp": 0.01268819, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01255945, + "epoch": 0.5035322410942432, + "flos": 22681486656000.0, + "grad_norm": 1.5469346618590563, + "language_loss": 0.68547672, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.76247704, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.12878418, + "step": 8375, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.06436669, + "auxiliary_loss_mlp": 0.0127122, + "balance_loss_clip": 0.06277038, + "balance_loss_mlp": 0.01257285, + "epoch": 0.5035923643469111, + "flos": 19834875423360.0, + "grad_norm": 1.6007016499880733, + "language_loss": 0.78976023, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.86683917, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1394043, + "step": 8376, + "time_per_iteration": 2.480931520462036 + }, + { + "auxiliary_loss_clip": 0.06429401, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06272124, + "balance_loss_mlp": 0.01254722, + "epoch": 0.5036524875995791, + "flos": 30521823269760.0, + "grad_norm": 2.1513689232389686, + "language_loss": 0.59716964, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6741339, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.12304688, + "step": 8377, + "time_per_iteration": 2.5793137550354004 + }, + { + "auxiliary_loss_clip": 0.06430321, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5037126108522471, + "flos": 14725848424320.0, + "grad_norm": 1.9178870854351904, + "language_loss": 0.76377517, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.84075749, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.13305664, + "step": 8378, + "time_per_iteration": 2.4622483253479004 + }, + { + "auxiliary_loss_clip": 0.06432158, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06278415, + "balance_loss_mlp": 0.01254815, + "epoch": 0.5037727341049151, + "flos": 25304119125120.0, + "grad_norm": 1.5376418940503571, + "language_loss": 0.746418, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.82341218, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12457275, + "step": 8379, + "time_per_iteration": 2.55764102935791 + }, + { + "auxiliary_loss_clip": 0.06427328, + "auxiliary_loss_mlp": 0.01273275, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01260138, + "epoch": 0.5038328573575831, + "flos": 28548193000320.0, + "grad_norm": 1.8355606211356674, + "language_loss": 0.66636741, + "learning_rate": 2.072229431544548e-06, + "loss": 0.74337339, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13146973, + "step": 8380, + "time_per_iteration": 2.566993474960327 + }, + { + "auxiliary_loss_clip": 0.06426656, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.0627608, + "balance_loss_mlp": 0.01254259, + "epoch": 0.503892980610251, + "flos": 31657957580160.0, + "grad_norm": 1.8901892775526132, + "language_loss": 0.63646573, + "learning_rate": 2.071840222561051e-06, + "loss": 0.71339715, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12213135, + "step": 8381, + "time_per_iteration": 2.5915544033050537 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01268764, + "balance_loss_clip": 0.06275158, + "balance_loss_mlp": 0.01257087, + "epoch": 0.503953103862919, + "flos": 27096718648320.0, + "grad_norm": 1.5372847630358786, + "language_loss": 0.67925096, + "learning_rate": 2.071451010853365e-06, + "loss": 0.756212, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11676025, + "step": 8382, + "time_per_iteration": 2.553654432296753 + }, + { + "auxiliary_loss_clip": 0.06443429, + "auxiliary_loss_mlp": 0.01271028, + "balance_loss_clip": 0.06281322, + "balance_loss_mlp": 0.0125745, + "epoch": 0.5040132271155869, + "flos": 15638423241600.0, + "grad_norm": 1.8104420976136362, + "language_loss": 0.62072217, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.69786668, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.13598633, + "step": 8383, + "time_per_iteration": 2.525148630142212 + }, + { + "auxiliary_loss_clip": 0.06426074, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01255609, + "epoch": 0.504073350368255, + "flos": 13595290410240.0, + "grad_norm": 1.7264517386370961, + "language_loss": 0.6736567, + "learning_rate": 2.070672579324465e-06, + "loss": 0.75059223, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11853027, + "step": 8384, + "time_per_iteration": 2.4712305068969727 + }, + { + "auxiliary_loss_clip": 0.064311, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06277114, + "balance_loss_mlp": 0.01255059, + "epoch": 0.5041334736209229, + "flos": 29065611162240.0, + "grad_norm": 1.6378210813415193, + "language_loss": 0.71431983, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.79130751, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12609863, + "step": 8385, + "time_per_iteration": 2.573953151702881 + }, + { + "auxiliary_loss_clip": 0.06426452, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.0627909, + "balance_loss_mlp": 0.01252916, + "epoch": 0.5041935968735909, + "flos": 24615313436160.0, + "grad_norm": 1.6953325653845304, + "language_loss": 0.83098906, + "learning_rate": 2.069894137075919e-06, + "loss": 0.90790039, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 8386, + "time_per_iteration": 2.5524075031280518 + }, + { + "auxiliary_loss_clip": 0.06431791, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256146, + "epoch": 0.5042537201262588, + "flos": 26294204568960.0, + "grad_norm": 1.4563010196783333, + "language_loss": 0.669891, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.74689829, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12780762, + "step": 8387, + "time_per_iteration": 3.9810335636138916 + }, + { + "auxiliary_loss_clip": 0.064284, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254608, + "epoch": 0.5043138433789268, + "flos": 22023805559040.0, + "grad_norm": 3.745410743833339, + "language_loss": 0.80531698, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.882267, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11999512, + "step": 8388, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.06426677, + "auxiliary_loss_mlp": 0.01268377, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256075, + "epoch": 0.5043739666315947, + "flos": 28774645459200.0, + "grad_norm": 1.9801629056940246, + "language_loss": 0.70134413, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.77829468, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12298584, + "step": 8389, + "time_per_iteration": 2.5604100227355957 + }, + { + "auxiliary_loss_clip": 0.06432408, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01256328, + "epoch": 0.5044340898842627, + "flos": 27606548016000.0, + "grad_norm": 1.4709504779743863, + "language_loss": 0.69360697, + "learning_rate": 2.068337220892191e-06, + "loss": 0.77062166, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12750244, + "step": 8390, + "time_per_iteration": 4.074434041976929 + }, + { + "auxiliary_loss_clip": 0.06327184, + "auxiliary_loss_mlp": 0.01253766, + "balance_loss_clip": 0.06261003, + "balance_loss_mlp": 0.01251581, + "epoch": 0.5044942131369307, + "flos": 67474744058880.0, + "grad_norm": 0.7911094819234682, + "language_loss": 0.52874231, + "learning_rate": 2.067947985330974e-06, + "loss": 0.60455179, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.0218811, + "step": 8391, + "time_per_iteration": 2.939533233642578 + }, + { + "auxiliary_loss_clip": 0.06334387, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06267701, + "balance_loss_mlp": 0.01251732, + "epoch": 0.5045543363895987, + "flos": 58646460280320.0, + "grad_norm": 0.8187125498801333, + "language_loss": 0.60630977, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.68219203, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.02114868, + "step": 8392, + "time_per_iteration": 2.9839742183685303 + }, + { + "auxiliary_loss_clip": 0.06425072, + "auxiliary_loss_mlp": 0.01265494, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01252631, + "epoch": 0.5046144596422667, + "flos": 22532880240000.0, + "grad_norm": 1.6790063296091327, + "language_loss": 0.85000169, + "learning_rate": 2.067169506493517e-06, + "loss": 0.9269073, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12866211, + "step": 8393, + "time_per_iteration": 2.5764622688293457 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01270713, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01258869, + "epoch": 0.5046745828949346, + "flos": 27461673106560.0, + "grad_norm": 1.8013259480756436, + "language_loss": 0.5139519, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.590967, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11834717, + "step": 8394, + "time_per_iteration": 2.5577075481414795 + }, + { + "auxiliary_loss_clip": 0.06430504, + "auxiliary_loss_mlp": 0.012693, + "balance_loss_clip": 0.06275499, + "balance_loss_mlp": 0.01256664, + "epoch": 0.5047347061476026, + "flos": 17280236142720.0, + "grad_norm": 1.62433976950566, + "language_loss": 0.75468862, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.83168674, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12628174, + "step": 8395, + "time_per_iteration": 4.00100040435791 + }, + { + "auxiliary_loss_clip": 0.06430663, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06276973, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5047948294002705, + "flos": 16654308543360.0, + "grad_norm": 3.1739634410128446, + "language_loss": 0.68759549, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.76455134, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.1206665, + "step": 8396, + "time_per_iteration": 2.5608737468719482 + }, + { + "auxiliary_loss_clip": 0.0643612, + "auxiliary_loss_mlp": 0.01265513, + "balance_loss_clip": 0.06282924, + "balance_loss_mlp": 0.01253235, + "epoch": 0.5048549526529386, + "flos": 26872236760320.0, + "grad_norm": 1.7251064316936986, + "language_loss": 0.7921707, + "learning_rate": 2.065612518371792e-06, + "loss": 0.869187, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12268066, + "step": 8397, + "time_per_iteration": 2.5829713344573975 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01271123, + "balance_loss_clip": 0.06278492, + "balance_loss_mlp": 0.01258571, + "epoch": 0.5049150759056065, + "flos": 21840175336320.0, + "grad_norm": 1.4916236371554883, + "language_loss": 0.66563869, + "learning_rate": 2.065223265084376e-06, + "loss": 0.7426517, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12554932, + "step": 8398, + "time_per_iteration": 2.5790011882781982 + }, + { + "auxiliary_loss_clip": 0.06432331, + "auxiliary_loss_mlp": 0.01272223, + "balance_loss_clip": 0.06280147, + "balance_loss_mlp": 0.01259688, + "epoch": 0.5049751991582745, + "flos": 21691652774400.0, + "grad_norm": 1.5799272085735376, + "language_loss": 0.72252852, + "learning_rate": 2.064834009323688e-06, + "loss": 0.79957408, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12524414, + "step": 8399, + "time_per_iteration": 2.5528035163879395 + }, + { + "auxiliary_loss_clip": 0.06433836, + "auxiliary_loss_mlp": 0.01270059, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5050353224109424, + "flos": 21365495556480.0, + "grad_norm": 1.7587629772693838, + "language_loss": 0.81515628, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.89219522, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.12792969, + "step": 8400, + "time_per_iteration": 2.550828456878662 + }, + { + "auxiliary_loss_clip": 0.06428652, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01254852, + "epoch": 0.5050954456636104, + "flos": 22826655054720.0, + "grad_norm": 2.5272013560823403, + "language_loss": 0.79016161, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.86711431, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11779785, + "step": 8401, + "time_per_iteration": 2.525132894515991 + }, + { + "auxiliary_loss_clip": 0.06433861, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01252778, + "epoch": 0.5051555689162783, + "flos": 30456513411840.0, + "grad_norm": 1.509144939938127, + "language_loss": 0.70489848, + "learning_rate": 2.063666227349593e-06, + "loss": 0.7818898, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.125, + "step": 8402, + "time_per_iteration": 4.0306360721588135 + }, + { + "auxiliary_loss_clip": 0.06429238, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06274545, + "balance_loss_mlp": 0.01254915, + "epoch": 0.5052156921689464, + "flos": 21294315912960.0, + "grad_norm": 1.5960111955062717, + "language_loss": 0.6935674, + "learning_rate": 2.063276961843422e-06, + "loss": 0.77053005, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12121582, + "step": 8403, + "time_per_iteration": 2.558231830596924 + }, + { + "auxiliary_loss_clip": 0.06433211, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.01255799, + "epoch": 0.5052758154216143, + "flos": 25088106499200.0, + "grad_norm": 1.463323664554185, + "language_loss": 0.86018717, + "learning_rate": 2.062887693937781e-06, + "loss": 0.93719262, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.11547852, + "step": 8404, + "time_per_iteration": 2.618649959564209 + }, + { + "auxiliary_loss_clip": 0.06428184, + "auxiliary_loss_mlp": 0.01270079, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01258092, + "epoch": 0.5053359386742823, + "flos": 20891612390400.0, + "grad_norm": 1.5475179634828664, + "language_loss": 0.75802314, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.83500576, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11987305, + "step": 8405, + "time_per_iteration": 2.5067524909973145 + }, + { + "auxiliary_loss_clip": 0.0643079, + "auxiliary_loss_mlp": 0.01267126, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01253882, + "epoch": 0.5053960619269503, + "flos": 37752499975680.0, + "grad_norm": 1.6248618607930092, + "language_loss": 0.73678941, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.81376863, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13250732, + "step": 8406, + "time_per_iteration": 2.8841259479522705 + }, + { + "auxiliary_loss_clip": 0.06424634, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06275164, + "balance_loss_mlp": 0.01254662, + "epoch": 0.5054561851796182, + "flos": 23520617769600.0, + "grad_norm": 1.7553784713680058, + "language_loss": 0.77329504, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.85021389, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12579346, + "step": 8407, + "time_per_iteration": 2.5749242305755615 + }, + { + "auxiliary_loss_clip": 0.06430455, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.01255434, + "epoch": 0.5055163084322862, + "flos": 30418261223040.0, + "grad_norm": 1.7587183909270583, + "language_loss": 0.63584411, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.71282065, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.11767578, + "step": 8408, + "time_per_iteration": 2.5872433185577393 + }, + { + "auxiliary_loss_clip": 0.06432275, + "auxiliary_loss_mlp": 0.01267048, + "balance_loss_clip": 0.06279387, + "balance_loss_mlp": 0.01253387, + "epoch": 0.5055764316849541, + "flos": 20264720469120.0, + "grad_norm": 2.4280351300793086, + "language_loss": 0.63813823, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.71513146, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.13659668, + "step": 8409, + "time_per_iteration": 2.5165858268737793 + }, + { + "auxiliary_loss_clip": 0.064235, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273322, + "balance_loss_mlp": 0.01254895, + "epoch": 0.5056365549376222, + "flos": 26078611213440.0, + "grad_norm": 1.3852804971458688, + "language_loss": 0.71039546, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.78729057, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11132812, + "step": 8410, + "time_per_iteration": 2.594809055328369 + }, + { + "auxiliary_loss_clip": 0.0643055, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.0627602, + "balance_loss_mlp": 0.01254437, + "epoch": 0.5056966781902901, + "flos": 19284739441920.0, + "grad_norm": 1.6144456520966346, + "language_loss": 0.79591584, + "learning_rate": 2.060162752653113e-06, + "loss": 0.87289482, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12921143, + "step": 8411, + "time_per_iteration": 2.53426194190979 + }, + { + "auxiliary_loss_clip": 0.06433219, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06276312, + "balance_loss_mlp": 0.01254979, + "epoch": 0.5057568014429581, + "flos": 21329507427840.0, + "grad_norm": 1.7389096144894618, + "language_loss": 0.81907368, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.89609325, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13757324, + "step": 8412, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.06429601, + "auxiliary_loss_mlp": 0.01270568, + "balance_loss_clip": 0.0627761, + "balance_loss_mlp": 0.01258134, + "epoch": 0.505816924695626, + "flos": 17499351369600.0, + "grad_norm": 1.7713461187517285, + "language_loss": 0.80336094, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.88036257, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12438965, + "step": 8413, + "time_per_iteration": 2.524210214614868 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.01274079, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01260328, + "epoch": 0.505877047948294, + "flos": 21148434754560.0, + "grad_norm": 1.7829708596435327, + "language_loss": 0.80812234, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.885144, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1373291, + "step": 8414, + "time_per_iteration": 2.5200514793395996 + }, + { + "auxiliary_loss_clip": 0.06426316, + "auxiliary_loss_mlp": 0.01270081, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01257468, + "epoch": 0.5059371712009619, + "flos": 36357824292480.0, + "grad_norm": 2.3266509400680935, + "language_loss": 0.62741381, + "learning_rate": 2.058605592832528e-06, + "loss": 0.70437777, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12615967, + "step": 8415, + "time_per_iteration": 2.676204204559326 + }, + { + "auxiliary_loss_clip": 0.06428116, + "auxiliary_loss_mlp": 0.01272149, + "balance_loss_clip": 0.06274984, + "balance_loss_mlp": 0.01259882, + "epoch": 0.50599729445363, + "flos": 22679809574400.0, + "grad_norm": 1.4983327127759412, + "language_loss": 0.82398355, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.90098619, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12261963, + "step": 8416, + "time_per_iteration": 2.540487289428711 + }, + { + "auxiliary_loss_clip": 0.06427394, + "auxiliary_loss_mlp": 0.01269018, + "balance_loss_clip": 0.06278178, + "balance_loss_mlp": 0.01257705, + "epoch": 0.5060574177062979, + "flos": 22754553016320.0, + "grad_norm": 1.8321417063208305, + "language_loss": 0.79700905, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.87397313, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11328125, + "step": 8417, + "time_per_iteration": 2.5462777614593506 + }, + { + "auxiliary_loss_clip": 0.06425334, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06277245, + "balance_loss_mlp": 0.01256875, + "epoch": 0.5061175409589659, + "flos": 21659689641600.0, + "grad_norm": 1.7824010317095476, + "language_loss": 0.63313794, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.71007824, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11816406, + "step": 8418, + "time_per_iteration": 2.5203146934509277 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01270126, + "balance_loss_clip": 0.06277534, + "balance_loss_mlp": 0.01257877, + "epoch": 0.5061776642116339, + "flos": 21622653336960.0, + "grad_norm": 1.6210660838966935, + "language_loss": 0.77937323, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.85639775, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12249756, + "step": 8419, + "time_per_iteration": 2.549057722091675 + }, + { + "auxiliary_loss_clip": 0.06433055, + "auxiliary_loss_mlp": 0.01272716, + "balance_loss_clip": 0.06277718, + "balance_loss_mlp": 0.01259955, + "epoch": 0.5062377874643018, + "flos": 24433276440960.0, + "grad_norm": 1.7091767496398438, + "language_loss": 0.77142859, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.8484863, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.12762451, + "step": 8420, + "time_per_iteration": 2.533263921737671 + }, + { + "auxiliary_loss_clip": 0.06430572, + "auxiliary_loss_mlp": 0.0127647, + "balance_loss_clip": 0.06276705, + "balance_loss_mlp": 0.01264311, + "epoch": 0.5062979107169698, + "flos": 22530322690560.0, + "grad_norm": 1.6514243222666503, + "language_loss": 0.77777469, + "learning_rate": 2.056269786726999e-06, + "loss": 0.85484511, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.121521, + "step": 8421, + "time_per_iteration": 2.535022497177124 + }, + { + "auxiliary_loss_clip": 0.06429385, + "auxiliary_loss_mlp": 0.01273249, + "balance_loss_clip": 0.06276778, + "balance_loss_mlp": 0.01261895, + "epoch": 0.5063580339696377, + "flos": 24578947964160.0, + "grad_norm": 1.4350674480860695, + "language_loss": 0.67189109, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.74891746, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11352539, + "step": 8422, + "time_per_iteration": 2.555051803588867 + }, + { + "auxiliary_loss_clip": 0.064266, + "auxiliary_loss_mlp": 0.01271001, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01259241, + "epoch": 0.5064181572223058, + "flos": 22601837750400.0, + "grad_norm": 1.5827559778751017, + "language_loss": 0.81783563, + "learning_rate": 2.05549116746431e-06, + "loss": 0.89481163, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11755371, + "step": 8423, + "time_per_iteration": 2.606844663619995 + }, + { + "auxiliary_loss_clip": 0.06427386, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.01256411, + "epoch": 0.5064782804749737, + "flos": 26002148762880.0, + "grad_norm": 2.1055931359181086, + "language_loss": 0.74535251, + "learning_rate": 2.055101854669237e-06, + "loss": 0.82231486, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12451172, + "step": 8424, + "time_per_iteration": 2.5353689193725586 + }, + { + "auxiliary_loss_clip": 0.06427233, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06278618, + "balance_loss_mlp": 0.0125268, + "epoch": 0.5065384037276417, + "flos": 28561358090880.0, + "grad_norm": 1.333495130602937, + "language_loss": 0.71332014, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.79024142, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12231445, + "step": 8425, + "time_per_iteration": 2.624431610107422 + }, + { + "auxiliary_loss_clip": 0.06429943, + "auxiliary_loss_mlp": 0.01268875, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01257187, + "epoch": 0.5065985269803096, + "flos": 22972620067200.0, + "grad_norm": 1.8777832339890803, + "language_loss": 0.78901541, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.86600357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11700439, + "step": 8426, + "time_per_iteration": 3.936661958694458 + }, + { + "auxiliary_loss_clip": 0.06432042, + "auxiliary_loss_mlp": 0.0127276, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01260768, + "epoch": 0.5066586502329776, + "flos": 21613680950400.0, + "grad_norm": 2.2511428758914325, + "language_loss": 0.7803759, + "learning_rate": 2.053933903806265e-06, + "loss": 0.85742396, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12005615, + "step": 8427, + "time_per_iteration": 2.5481557846069336 + }, + { + "auxiliary_loss_clip": 0.06424822, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01255268, + "epoch": 0.5067187734856455, + "flos": 20346214164480.0, + "grad_norm": 1.5242931798978783, + "language_loss": 0.719284, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.79620224, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11737061, + "step": 8428, + "time_per_iteration": 2.5370116233825684 + }, + { + "auxiliary_loss_clip": 0.06427782, + "auxiliary_loss_mlp": 0.01268707, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5067788967383136, + "flos": 28848801922560.0, + "grad_norm": 1.7598513800416933, + "language_loss": 0.83218622, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.90915114, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.10717773, + "step": 8429, + "time_per_iteration": 2.5739033222198486 + }, + { + "auxiliary_loss_clip": 0.06435312, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.06276707, + "balance_loss_mlp": 0.01254013, + "epoch": 0.5068390199909815, + "flos": 32457997964160.0, + "grad_norm": 4.868596583088969, + "language_loss": 0.7373606, + "learning_rate": 2.052765934536682e-06, + "loss": 0.8143819, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12805176, + "step": 8430, + "time_per_iteration": 4.062525749206543 + }, + { + "auxiliary_loss_clip": 0.06428299, + "auxiliary_loss_mlp": 0.01270046, + "balance_loss_clip": 0.06275186, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5068991432436495, + "flos": 23152896126720.0, + "grad_norm": 1.801463516744859, + "language_loss": 0.76942408, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.84640753, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1171875, + "step": 8431, + "time_per_iteration": 2.535198211669922 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.0125488, + "epoch": 0.5069592664963174, + "flos": 19941917414400.0, + "grad_norm": 1.5385752235820749, + "language_loss": 0.72917402, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.80610371, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11901855, + "step": 8432, + "time_per_iteration": 2.5343048572540283 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06253257, + "balance_loss_mlp": 0.01250496, + "epoch": 0.5070193897489854, + "flos": 65812539888000.0, + "grad_norm": 0.7543358557352665, + "language_loss": 0.63621199, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.71192724, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.01824951, + "step": 8433, + "time_per_iteration": 3.1825270652770996 + }, + { + "auxiliary_loss_clip": 0.06432432, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06279546, + "balance_loss_mlp": 0.01254414, + "epoch": 0.5070795130016534, + "flos": 17281158537600.0, + "grad_norm": 2.2002665512489505, + "language_loss": 0.77719331, + "learning_rate": 2.051208614233681e-06, + "loss": 0.85418689, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12512207, + "step": 8434, + "time_per_iteration": 2.51298451423645 + }, + { + "auxiliary_loss_clip": 0.06435563, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01253047, + "epoch": 0.5071396362543213, + "flos": 21076416570240.0, + "grad_norm": 1.9257186196996396, + "language_loss": 0.7107513, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.78775942, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12207031, + "step": 8435, + "time_per_iteration": 3.9952967166900635 + }, + { + "auxiliary_loss_clip": 0.06431434, + "auxiliary_loss_mlp": 0.01269503, + "balance_loss_clip": 0.06278223, + "balance_loss_mlp": 0.01256646, + "epoch": 0.5071997595069894, + "flos": 23150841701760.0, + "grad_norm": 1.974114732671287, + "language_loss": 0.72623628, + "learning_rate": 2.050429942372112e-06, + "loss": 0.80324566, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.128479, + "step": 8436, + "time_per_iteration": 2.5126936435699463 + }, + { + "auxiliary_loss_clip": 0.06431168, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06278354, + "balance_loss_mlp": 0.01253449, + "epoch": 0.5072598827596573, + "flos": 22753756402560.0, + "grad_norm": 2.390958224451536, + "language_loss": 0.84374195, + "learning_rate": 2.050040603565483e-06, + "loss": 0.92071497, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12701416, + "step": 8437, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.06423598, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06273607, + "balance_loss_mlp": 0.01254128, + "epoch": 0.5073200060123253, + "flos": 22573102999680.0, + "grad_norm": 1.4207198809320167, + "language_loss": 0.80947453, + "learning_rate": 2.049651262861309e-06, + "loss": 0.88636929, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11749268, + "step": 8438, + "time_per_iteration": 2.5992414951324463 + }, + { + "auxiliary_loss_clip": 0.06431951, + "auxiliary_loss_mlp": 0.01267455, + "balance_loss_clip": 0.06277303, + "balance_loss_mlp": 0.0125458, + "epoch": 0.5073801292649932, + "flos": 25812481046400.0, + "grad_norm": 1.639362892711676, + "language_loss": 0.7992267, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.87622082, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12872314, + "step": 8439, + "time_per_iteration": 2.5635995864868164 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06272503, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5074402525176612, + "flos": 25380916992000.0, + "grad_norm": 1.6123120964481592, + "language_loss": 0.71044374, + "learning_rate": 2.048872575819383e-06, + "loss": 0.78732479, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11560059, + "step": 8440, + "time_per_iteration": 2.54082989692688 + }, + { + "auxiliary_loss_clip": 0.0642738, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01254278, + "epoch": 0.5075003757703291, + "flos": 26071064346240.0, + "grad_norm": 1.625029424987906, + "language_loss": 0.71058178, + "learning_rate": 2.048483229511158e-06, + "loss": 0.78751576, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11743164, + "step": 8441, + "time_per_iteration": 2.5597851276397705 + }, + { + "auxiliary_loss_clip": 0.06432067, + "auxiliary_loss_mlp": 0.0126825, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01255608, + "epoch": 0.5075604990229972, + "flos": 21841936272000.0, + "grad_norm": 1.6251927502787415, + "language_loss": 0.64299369, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.71999681, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12634277, + "step": 8442, + "time_per_iteration": 3.9658992290496826 + }, + { + "auxiliary_loss_clip": 0.06421914, + "auxiliary_loss_mlp": 0.01270692, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01259475, + "epoch": 0.5076206222756651, + "flos": 31986923909760.0, + "grad_norm": 1.4468343781265969, + "language_loss": 0.71796834, + "learning_rate": 2.047704531394006e-06, + "loss": 0.7948944, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 8443, + "time_per_iteration": 2.6133296489715576 + }, + { + "auxiliary_loss_clip": 0.06430129, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5076807455283331, + "flos": 36913033445760.0, + "grad_norm": 1.2663152678698668, + "language_loss": 0.62379253, + "learning_rate": 2.047315179614607e-06, + "loss": 0.70077264, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12390137, + "step": 8444, + "time_per_iteration": 2.670844554901123 + }, + { + "auxiliary_loss_clip": 0.06426448, + "auxiliary_loss_mlp": 0.01266149, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01255158, + "epoch": 0.507740868781001, + "flos": 29870263520640.0, + "grad_norm": 1.5635527032998127, + "language_loss": 0.64163882, + "learning_rate": 2.046925826041012e-06, + "loss": 0.71856481, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.10992432, + "step": 8445, + "time_per_iteration": 2.564972162246704 + }, + { + "auxiliary_loss_clip": 0.06326441, + "auxiliary_loss_mlp": 0.01258393, + "balance_loss_clip": 0.06260093, + "balance_loss_mlp": 0.0125657, + "epoch": 0.507800992033669, + "flos": 61935872014080.0, + "grad_norm": 0.8045039829713045, + "language_loss": 0.61588788, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.69173622, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.01817322, + "step": 8446, + "time_per_iteration": 3.1747779846191406 + }, + { + "auxiliary_loss_clip": 0.06424413, + "auxiliary_loss_mlp": 0.01266643, + "balance_loss_clip": 0.06272733, + "balance_loss_mlp": 0.01254394, + "epoch": 0.507861115286337, + "flos": 20706137377920.0, + "grad_norm": 4.618603604158377, + "language_loss": 0.80737472, + "learning_rate": 2.04614711357029e-06, + "loss": 0.88428527, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12243652, + "step": 8447, + "time_per_iteration": 2.510443687438965 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01255775, + "epoch": 0.507921238539005, + "flos": 30854982303360.0, + "grad_norm": 1.2702922663182385, + "language_loss": 0.70493698, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.78183186, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11456299, + "step": 8448, + "time_per_iteration": 2.6021034717559814 + }, + { + "auxiliary_loss_clip": 0.06427675, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.0627776, + "balance_loss_mlp": 0.0125745, + "epoch": 0.507981361791673, + "flos": 35709031728000.0, + "grad_norm": 1.3111664343686333, + "language_loss": 0.72171003, + "learning_rate": 2.045368394099955e-06, + "loss": 0.79867339, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11199951, + "step": 8449, + "time_per_iteration": 2.6752874851226807 + }, + { + "auxiliary_loss_clip": 0.06426987, + "auxiliary_loss_mlp": 0.01268113, + "balance_loss_clip": 0.06274859, + "balance_loss_mlp": 0.0125686, + "epoch": 0.5080414850443409, + "flos": 27168694905600.0, + "grad_norm": 1.3940572087719376, + "language_loss": 0.73039591, + "learning_rate": 2.044979031776844e-06, + "loss": 0.80734688, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11254883, + "step": 8450, + "time_per_iteration": 2.6428375244140625 + }, + { + "auxiliary_loss_clip": 0.06430449, + "auxiliary_loss_mlp": 0.0127298, + "balance_loss_clip": 0.06278583, + "balance_loss_mlp": 0.01261148, + "epoch": 0.5081016082970089, + "flos": 27091855111680.0, + "grad_norm": 1.6054602673211236, + "language_loss": 0.7744205, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.85145479, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1184082, + "step": 8451, + "time_per_iteration": 2.6066558361053467 + }, + { + "auxiliary_loss_clip": 0.06429529, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276423, + "balance_loss_mlp": 0.01254531, + "epoch": 0.5081617315496768, + "flos": 22863104380800.0, + "grad_norm": 1.825930217148951, + "language_loss": 0.85374677, + "learning_rate": 2.044200302028559e-06, + "loss": 0.930709, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12158203, + "step": 8452, + "time_per_iteration": 2.5062003135681152 + }, + { + "auxiliary_loss_clip": 0.06431726, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06276073, + "balance_loss_mlp": 0.01254716, + "epoch": 0.5082218548023448, + "flos": 16286167630080.0, + "grad_norm": 2.3752555926719343, + "language_loss": 0.77806371, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.85505283, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12463379, + "step": 8453, + "time_per_iteration": 2.4981954097747803 + }, + { + "auxiliary_loss_clip": 0.06430794, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.06281981, + "balance_loss_mlp": 0.0125774, + "epoch": 0.5082819780550127, + "flos": 24467419779840.0, + "grad_norm": 1.5957908763151711, + "language_loss": 0.76932752, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.84632009, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1072998, + "step": 8454, + "time_per_iteration": 2.6134133338928223 + }, + { + "auxiliary_loss_clip": 0.06431732, + "auxiliary_loss_mlp": 0.01271277, + "balance_loss_clip": 0.06279022, + "balance_loss_mlp": 0.01259118, + "epoch": 0.5083421013076808, + "flos": 23409844272000.0, + "grad_norm": 1.4822981638740835, + "language_loss": 0.89621413, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.97324431, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.1217041, + "step": 8455, + "time_per_iteration": 2.6085920333862305 + }, + { + "auxiliary_loss_clip": 0.06434034, + "auxiliary_loss_mlp": 0.01274373, + "balance_loss_clip": 0.06275303, + "balance_loss_mlp": 0.01260831, + "epoch": 0.5084022245603487, + "flos": 23878528485120.0, + "grad_norm": 1.6442671341978696, + "language_loss": 0.62785953, + "learning_rate": 2.042642822537149e-06, + "loss": 0.7049436, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.13555908, + "step": 8456, + "time_per_iteration": 2.5377745628356934 + }, + { + "auxiliary_loss_clip": 0.06329988, + "auxiliary_loss_mlp": 0.01255905, + "balance_loss_clip": 0.06263152, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5084623478130167, + "flos": 62891352921600.0, + "grad_norm": 0.8103581861082657, + "language_loss": 0.62548244, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.70134139, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.02194214, + "step": 8457, + "time_per_iteration": 3.0378763675689697 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01271319, + "balance_loss_clip": 0.06280852, + "balance_loss_mlp": 0.01258337, + "epoch": 0.5085224710656846, + "flos": 22352688034560.0, + "grad_norm": 1.5276658426580998, + "language_loss": 0.67559206, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.75267512, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12976074, + "step": 8458, + "time_per_iteration": 2.5329530239105225 + }, + { + "auxiliary_loss_clip": 0.06432781, + "auxiliary_loss_mlp": 0.01272615, + "balance_loss_clip": 0.0627652, + "balance_loss_mlp": 0.01260015, + "epoch": 0.5085825943183526, + "flos": 26073202625280.0, + "grad_norm": 1.618055128351248, + "language_loss": 0.77449083, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.85154486, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1260376, + "step": 8459, + "time_per_iteration": 2.5590224266052246 + }, + { + "auxiliary_loss_clip": 0.06437792, + "auxiliary_loss_mlp": 0.01271084, + "balance_loss_clip": 0.06279328, + "balance_loss_mlp": 0.01258132, + "epoch": 0.5086427175710206, + "flos": 17426494644480.0, + "grad_norm": 2.2202109072156664, + "language_loss": 0.81101096, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.88809973, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.12945557, + "step": 8460, + "time_per_iteration": 2.4797065258026123 + }, + { + "auxiliary_loss_clip": 0.06432672, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01259968, + "epoch": 0.5087028408236886, + "flos": 20638102262400.0, + "grad_norm": 1.6011145053716882, + "language_loss": 0.69150776, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.76856101, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.12677002, + "step": 8461, + "time_per_iteration": 2.5423507690429688 + }, + { + "auxiliary_loss_clip": 0.06423958, + "auxiliary_loss_mlp": 0.01270241, + "balance_loss_clip": 0.06275716, + "balance_loss_mlp": 0.01258052, + "epoch": 0.5087629640763566, + "flos": 25600996540800.0, + "grad_norm": 1.5704547594862186, + "language_loss": 0.76788783, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.84482986, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12200928, + "step": 8462, + "time_per_iteration": 2.5558974742889404 + }, + { + "auxiliary_loss_clip": 0.06431352, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06279621, + "balance_loss_mlp": 0.01255251, + "epoch": 0.5088230873290245, + "flos": 13266743351040.0, + "grad_norm": 1.98943246577739, + "language_loss": 0.81940925, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.89639473, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11938477, + "step": 8463, + "time_per_iteration": 2.5092854499816895 + }, + { + "auxiliary_loss_clip": 0.06429717, + "auxiliary_loss_mlp": 0.01268295, + "balance_loss_clip": 0.06277439, + "balance_loss_mlp": 0.01255974, + "epoch": 0.5088832105816925, + "flos": 20048959405440.0, + "grad_norm": 4.395577464341562, + "language_loss": 0.76639092, + "learning_rate": 2.039527786882341e-06, + "loss": 0.84337103, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12310791, + "step": 8464, + "time_per_iteration": 2.5100886821746826 + }, + { + "auxiliary_loss_clip": 0.06332754, + "auxiliary_loss_mlp": 0.01251908, + "balance_loss_clip": 0.06266724, + "balance_loss_mlp": 0.01250196, + "epoch": 0.5089433338343604, + "flos": 67445072184960.0, + "grad_norm": 0.674227101372006, + "language_loss": 0.59172922, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.66757584, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.01716614, + "step": 8465, + "time_per_iteration": 3.288703441619873 + }, + { + "auxiliary_loss_clip": 0.06429654, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06277246, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5090034570870284, + "flos": 22716845879040.0, + "grad_norm": 1.7766724873518385, + "language_loss": 0.80341208, + "learning_rate": 2.038749012684354e-06, + "loss": 0.88037896, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12005615, + "step": 8466, + "time_per_iteration": 3.9034652709960938 + }, + { + "auxiliary_loss_clip": 0.06428038, + "auxiliary_loss_mlp": 0.01262494, + "balance_loss_clip": 0.06276771, + "balance_loss_mlp": 0.01250603, + "epoch": 0.5090635803396963, + "flos": 20451537146880.0, + "grad_norm": 1.506058765425311, + "language_loss": 0.78925973, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.86616498, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11895752, + "step": 8467, + "time_per_iteration": 2.483701229095459 + }, + { + "auxiliary_loss_clip": 0.06425558, + "auxiliary_loss_mlp": 0.01269027, + "balance_loss_clip": 0.06277174, + "balance_loss_mlp": 0.01257565, + "epoch": 0.5091237035923644, + "flos": 23775637271040.0, + "grad_norm": 1.593164773968791, + "language_loss": 0.74572229, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.82266819, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11468506, + "step": 8468, + "time_per_iteration": 2.550657033920288 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01264118, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01252108, + "epoch": 0.5091838268450323, + "flos": 18332990040960.0, + "grad_norm": 1.7522760366327397, + "language_loss": 0.78574747, + "learning_rate": 2.03758084040404e-06, + "loss": 0.86264038, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12011719, + "step": 8469, + "time_per_iteration": 2.4776134490966797 + }, + { + "auxiliary_loss_clip": 0.06431125, + "auxiliary_loss_mlp": 0.012685, + "balance_loss_clip": 0.0627888, + "balance_loss_mlp": 0.01256526, + "epoch": 0.5092439500977003, + "flos": 29064982256640.0, + "grad_norm": 1.429622552318455, + "language_loss": 0.6959703, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7729665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11968994, + "step": 8470, + "time_per_iteration": 4.06356954574585 + }, + { + "auxiliary_loss_clip": 0.06432179, + "auxiliary_loss_mlp": 0.01268896, + "balance_loss_clip": 0.06276524, + "balance_loss_mlp": 0.01256278, + "epoch": 0.5093040733503682, + "flos": 13559134573440.0, + "grad_norm": 1.739958995441318, + "language_loss": 0.73736298, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.81437373, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.12615967, + "step": 8471, + "time_per_iteration": 2.5252416133880615 + }, + { + "auxiliary_loss_clip": 0.06330768, + "auxiliary_loss_mlp": 0.01255323, + "balance_loss_clip": 0.06264758, + "balance_loss_mlp": 0.01253313, + "epoch": 0.5093641966030362, + "flos": 68927838837120.0, + "grad_norm": 0.738097810584446, + "language_loss": 0.58042324, + "learning_rate": 2.036412655298103e-06, + "loss": 0.65628415, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.02009583, + "step": 8472, + "time_per_iteration": 3.1610372066497803 + }, + { + "auxiliary_loss_clip": 0.06430018, + "auxiliary_loss_mlp": 0.01266308, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5094243198557042, + "flos": 21587545676160.0, + "grad_norm": 1.8344067804800992, + "language_loss": 0.69000626, + "learning_rate": 2.03602325748156e-06, + "loss": 0.76696956, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11358643, + "step": 8473, + "time_per_iteration": 2.5834267139434814 + }, + { + "auxiliary_loss_clip": 0.06430315, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.06279565, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5094844431083722, + "flos": 28848382652160.0, + "grad_norm": 2.5664905714857422, + "language_loss": 0.85103536, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.92801011, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12011719, + "step": 8474, + "time_per_iteration": 2.5577685832977295 + }, + { + "auxiliary_loss_clip": 0.06432322, + "auxiliary_loss_mlp": 0.0126557, + "balance_loss_clip": 0.06278027, + "balance_loss_mlp": 0.01253488, + "epoch": 0.5095445663610402, + "flos": 14981454904320.0, + "grad_norm": 1.910358455820602, + "language_loss": 0.64868319, + "learning_rate": 2.035244457765222e-06, + "loss": 0.72566211, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12091064, + "step": 8475, + "time_per_iteration": 3.9494359493255615 + }, + { + "auxiliary_loss_clip": 0.06435733, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 0.0627934, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5096046896137081, + "flos": 20783354515200.0, + "grad_norm": 2.1677913618760623, + "language_loss": 0.8248105, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.90185243, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.12689209, + "step": 8476, + "time_per_iteration": 2.533986806869507 + }, + { + "auxiliary_loss_clip": 0.06432153, + "auxiliary_loss_mlp": 0.01267228, + "balance_loss_clip": 0.06275326, + "balance_loss_mlp": 0.01254628, + "epoch": 0.5096648128663761, + "flos": 23191735294080.0, + "grad_norm": 2.112211155301917, + "language_loss": 0.81339389, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.89038771, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12609863, + "step": 8477, + "time_per_iteration": 2.614363193511963 + }, + { + "auxiliary_loss_clip": 0.06429507, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.0627466, + "balance_loss_mlp": 0.01254741, + "epoch": 0.509724936119044, + "flos": 22315945219200.0, + "grad_norm": 1.7511302636686703, + "language_loss": 0.61918831, + "learning_rate": 2.034076248204082e-06, + "loss": 0.69616115, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13024902, + "step": 8478, + "time_per_iteration": 2.5054080486297607 + }, + { + "auxiliary_loss_clip": 0.06424017, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06273499, + "balance_loss_mlp": 0.01256136, + "epoch": 0.509785059371712, + "flos": 26294372277120.0, + "grad_norm": 1.8013233320362476, + "language_loss": 0.66670853, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.74362785, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 8479, + "time_per_iteration": 2.5773558616638184 + }, + { + "auxiliary_loss_clip": 0.06427336, + "auxiliary_loss_mlp": 0.01266645, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01254795, + "epoch": 0.50984518262438, + "flos": 22970942985600.0, + "grad_norm": 1.5048945656562989, + "language_loss": 0.69523573, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.77217555, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1184082, + "step": 8480, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.06433358, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06277278, + "balance_loss_mlp": 0.01252908, + "epoch": 0.509905305877048, + "flos": 26220551230080.0, + "grad_norm": 1.695627830792001, + "language_loss": 0.79513025, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.87211168, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.11865234, + "step": 8481, + "time_per_iteration": 3.9862852096557617 + }, + { + "auxiliary_loss_clip": 0.06423856, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06274414, + "balance_loss_mlp": 0.01255186, + "epoch": 0.5099654291297159, + "flos": 20346381872640.0, + "grad_norm": 1.4463685523965593, + "language_loss": 0.83447778, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.91138661, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1184082, + "step": 8482, + "time_per_iteration": 2.539057970046997 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257682, + "epoch": 0.5100255523823839, + "flos": 29061711947520.0, + "grad_norm": 1.7174746607832896, + "language_loss": 0.85923511, + "learning_rate": 2.032129206622238e-06, + "loss": 0.93625677, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12243652, + "step": 8483, + "time_per_iteration": 2.5567803382873535 + }, + { + "auxiliary_loss_clip": 0.06428108, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256352, + "epoch": 0.5100856756350518, + "flos": 22462539137280.0, + "grad_norm": 3.7192784343186367, + "language_loss": 0.83011222, + "learning_rate": 2.031739794591775e-06, + "loss": 0.90707278, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.11590576, + "step": 8484, + "time_per_iteration": 2.50913143157959 + }, + { + "auxiliary_loss_clip": 0.0642792, + "auxiliary_loss_mlp": 0.0126741, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.01254953, + "epoch": 0.5101457988877198, + "flos": 19176942764160.0, + "grad_norm": 1.8545423824290383, + "language_loss": 0.81929463, + "learning_rate": 2.031350381357736e-06, + "loss": 0.89624798, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12463379, + "step": 8485, + "time_per_iteration": 2.479165554046631 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266312, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01254522, + "epoch": 0.5102059221403878, + "flos": 14871645728640.0, + "grad_norm": 1.8580884452241668, + "language_loss": 0.73778898, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.81466365, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11791992, + "step": 8486, + "time_per_iteration": 2.502035140991211 + }, + { + "auxiliary_loss_clip": 0.06432486, + "auxiliary_loss_mlp": 0.01268204, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5102660453930558, + "flos": 22966876062720.0, + "grad_norm": 1.455931130318143, + "language_loss": 0.6993084, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.77631527, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.13092041, + "step": 8487, + "time_per_iteration": 2.5022764205932617 + }, + { + "auxiliary_loss_clip": 0.06425266, + "auxiliary_loss_mlp": 0.01265042, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01252072, + "epoch": 0.5103261686457238, + "flos": 23156082581760.0, + "grad_norm": 2.025146562514191, + "language_loss": 0.72757244, + "learning_rate": 2.030182134581827e-06, + "loss": 0.80447549, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12963867, + "step": 8488, + "time_per_iteration": 2.5181195735931396 + }, + { + "auxiliary_loss_clip": 0.06435129, + "auxiliary_loss_mlp": 0.01271711, + "balance_loss_clip": 0.06278089, + "balance_loss_mlp": 0.01259861, + "epoch": 0.5103862918983917, + "flos": 14324444640000.0, + "grad_norm": 1.9274143081394266, + "language_loss": 0.69714773, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.77421612, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.11846924, + "step": 8489, + "time_per_iteration": 2.491626739501953 + }, + { + "auxiliary_loss_clip": 0.06427855, + "auxiliary_loss_mlp": 0.01262645, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01251231, + "epoch": 0.5104464151510597, + "flos": 25855638698880.0, + "grad_norm": 1.7641928011440773, + "language_loss": 0.73334658, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.81025159, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.11407471, + "step": 8490, + "time_per_iteration": 2.6192476749420166 + }, + { + "auxiliary_loss_clip": 0.06422485, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01256628, + "epoch": 0.5105065384037276, + "flos": 21659354225280.0, + "grad_norm": 1.995020059533993, + "language_loss": 0.8080864, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.8849923, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11474609, + "step": 8491, + "time_per_iteration": 2.5444910526275635 + }, + { + "auxiliary_loss_clip": 0.0642098, + "auxiliary_loss_mlp": 0.01268766, + "balance_loss_clip": 0.06274496, + "balance_loss_mlp": 0.01257089, + "epoch": 0.5105666616563956, + "flos": 22498066068480.0, + "grad_norm": 2.247071959069697, + "language_loss": 0.79263282, + "learning_rate": 2.028624456259728e-06, + "loss": 0.86953026, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11676025, + "step": 8492, + "time_per_iteration": 2.656888008117676 + }, + { + "auxiliary_loss_clip": 0.06433547, + "auxiliary_loss_mlp": 0.01271088, + "balance_loss_clip": 0.06276479, + "balance_loss_mlp": 0.01257838, + "epoch": 0.5106267849090635, + "flos": 22462371429120.0, + "grad_norm": 1.9309641209432507, + "language_loss": 0.77830237, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.85534871, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.13256836, + "step": 8493, + "time_per_iteration": 2.550326347351074 + }, + { + "auxiliary_loss_clip": 0.06427996, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06275648, + "balance_loss_mlp": 0.01252879, + "epoch": 0.5106869081617316, + "flos": 23553335589120.0, + "grad_norm": 1.7342765336142327, + "language_loss": 0.84044284, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.91737616, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12457275, + "step": 8494, + "time_per_iteration": 2.582463026046753 + }, + { + "auxiliary_loss_clip": 0.06430838, + "auxiliary_loss_mlp": 0.0126671, + "balance_loss_clip": 0.0627555, + "balance_loss_mlp": 0.0125492, + "epoch": 0.5107470314143995, + "flos": 26799547743360.0, + "grad_norm": 2.0062643152671877, + "language_loss": 0.79773927, + "learning_rate": 2.027456186069326e-06, + "loss": 0.87471473, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.11798096, + "step": 8495, + "time_per_iteration": 2.5472564697265625 + }, + { + "auxiliary_loss_clip": 0.06425454, + "auxiliary_loss_mlp": 0.01268533, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01256308, + "epoch": 0.5108071546670675, + "flos": 25746877699200.0, + "grad_norm": 1.417654874659872, + "language_loss": 0.78675163, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.86369145, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12231445, + "step": 8496, + "time_per_iteration": 2.5841569900512695 + }, + { + "auxiliary_loss_clip": 0.06422253, + "auxiliary_loss_mlp": 0.01267746, + "balance_loss_clip": 0.06273818, + "balance_loss_mlp": 0.01255998, + "epoch": 0.5108672779197354, + "flos": 18703478868480.0, + "grad_norm": 1.866540646775448, + "language_loss": 0.7912823, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8681823, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11755371, + "step": 8497, + "time_per_iteration": 2.5111966133117676 + }, + { + "auxiliary_loss_clip": 0.06429158, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06277271, + "balance_loss_mlp": 0.01252277, + "epoch": 0.5109274011724034, + "flos": 26695482572160.0, + "grad_norm": 1.6666059931479484, + "language_loss": 0.81941032, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.89634144, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11682129, + "step": 8498, + "time_per_iteration": 2.608631134033203 + }, + { + "auxiliary_loss_clip": 0.06424002, + "auxiliary_loss_mlp": 0.01271992, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01260267, + "epoch": 0.5109875244250714, + "flos": 22790666926080.0, + "grad_norm": 1.6923312462183162, + "language_loss": 0.71301198, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.78997189, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11724854, + "step": 8499, + "time_per_iteration": 2.5150094032287598 + }, + { + "auxiliary_loss_clip": 0.06427284, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255003, + "epoch": 0.5110476476777394, + "flos": 35596958492160.0, + "grad_norm": 1.3954443671639698, + "language_loss": 0.72611153, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.80305135, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11694336, + "step": 8500, + "time_per_iteration": 2.633239269256592 + }, + { + "auxiliary_loss_clip": 0.06435662, + "auxiliary_loss_mlp": 0.01270607, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01256803, + "epoch": 0.5111077709304074, + "flos": 19286751939840.0, + "grad_norm": 2.7349973685574973, + "language_loss": 0.63562721, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.71268988, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.13800049, + "step": 8501, + "time_per_iteration": 2.5091230869293213 + }, + { + "auxiliary_loss_clip": 0.06434844, + "auxiliary_loss_mlp": 0.01273353, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01260848, + "epoch": 0.5111678941830753, + "flos": 20674551588480.0, + "grad_norm": 1.8816899756355796, + "language_loss": 0.88057411, + "learning_rate": 2.024730186540907e-06, + "loss": 0.95765609, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.12506104, + "step": 8502, + "time_per_iteration": 2.517728090286255 + }, + { + "auxiliary_loss_clip": 0.06425811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01253663, + "epoch": 0.5112280174357433, + "flos": 26295336599040.0, + "grad_norm": 1.4524091598864723, + "language_loss": 0.82627225, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.90318477, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11779785, + "step": 8503, + "time_per_iteration": 2.711451768875122 + }, + { + "auxiliary_loss_clip": 0.06333953, + "auxiliary_loss_mlp": 0.01255603, + "balance_loss_clip": 0.06268184, + "balance_loss_mlp": 0.0125384, + "epoch": 0.5112881406884112, + "flos": 59490706492800.0, + "grad_norm": 0.8512772291593351, + "language_loss": 0.63800937, + "learning_rate": 2.023951320871339e-06, + "loss": 0.71390492, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.65966797, + "router_z_loss_mlp": 0.01766968, + "step": 8504, + "time_per_iteration": 3.1690919399261475 + }, + { + "auxiliary_loss_clip": 0.06425914, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06275845, + "balance_loss_mlp": 0.01253576, + "epoch": 0.5113482639410792, + "flos": 26476073856000.0, + "grad_norm": 1.7986544100736102, + "language_loss": 0.84377933, + "learning_rate": 2.023561886666816e-06, + "loss": 0.92069674, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12261963, + "step": 8505, + "time_per_iteration": 2.5755858421325684 + }, + { + "auxiliary_loss_clip": 0.0643035, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06279911, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5114083871937471, + "flos": 29903190975360.0, + "grad_norm": 1.7295208629505698, + "language_loss": 0.75707996, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.83404166, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11590576, + "step": 8506, + "time_per_iteration": 3.947927713394165 + }, + { + "auxiliary_loss_clip": 0.0642788, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.06276722, + "balance_loss_mlp": 0.01255303, + "epoch": 0.5114685104464152, + "flos": 24321161278080.0, + "grad_norm": 1.7165713389532073, + "language_loss": 0.58250427, + "learning_rate": 2.022783015592131e-06, + "loss": 0.65946829, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.13214111, + "step": 8507, + "time_per_iteration": 2.5460915565490723 + }, + { + "auxiliary_loss_clip": 0.06432099, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06281347, + "balance_loss_mlp": 0.01257023, + "epoch": 0.5115286336990831, + "flos": 17024965079040.0, + "grad_norm": 1.7959155859668763, + "language_loss": 0.8588531, + "learning_rate": 2.022393578751503e-06, + "loss": 0.93586934, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12475586, + "step": 8508, + "time_per_iteration": 2.501931667327881 + }, + { + "auxiliary_loss_clip": 0.06430113, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06279224, + "balance_loss_mlp": 0.012544, + "epoch": 0.5115887569517511, + "flos": 23666121584640.0, + "grad_norm": 1.985741338533524, + "language_loss": 0.72740698, + "learning_rate": 2.022004141061709e-06, + "loss": 0.80437851, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12640381, + "step": 8509, + "time_per_iteration": 3.9570322036743164 + }, + { + "auxiliary_loss_clip": 0.06425552, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06277531, + "balance_loss_mlp": 0.01254476, + "epoch": 0.511648880204419, + "flos": 16112725678080.0, + "grad_norm": 1.6522242028614569, + "language_loss": 0.76532018, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.84222525, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.10479736, + "step": 8510, + "time_per_iteration": 2.5000293254852295 + }, + { + "auxiliary_loss_clip": 0.06424148, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.01256402, + "epoch": 0.511709003457087, + "flos": 32643221414400.0, + "grad_norm": 1.8483097722803792, + "language_loss": 0.71295965, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.78987575, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 8511, + "time_per_iteration": 2.5970981121063232 + }, + { + "auxiliary_loss_clip": 0.06426742, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125404, + "epoch": 0.511769126709755, + "flos": 21768492568320.0, + "grad_norm": 1.8966780464465567, + "language_loss": 0.67139721, + "learning_rate": 2.020835823045001e-06, + "loss": 0.74832094, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8512, + "time_per_iteration": 2.5369138717651367 + }, + { + "auxiliary_loss_clip": 0.06426971, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253588, + "epoch": 0.511829249962423, + "flos": 23922231189120.0, + "grad_norm": 1.7695600544803753, + "language_loss": 0.67171764, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.7486496, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12628174, + "step": 8513, + "time_per_iteration": 2.517648220062256 + }, + { + "auxiliary_loss_clip": 0.06423096, + "auxiliary_loss_mlp": 0.01268209, + "balance_loss_clip": 0.06275445, + "balance_loss_mlp": 0.01255948, + "epoch": 0.511889373215091, + "flos": 23732856961920.0, + "grad_norm": 1.8747309224946216, + "language_loss": 0.68931103, + "learning_rate": 2.0200569403921e-06, + "loss": 0.76622409, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1227417, + "step": 8514, + "time_per_iteration": 3.969726085662842 + }, + { + "auxiliary_loss_clip": 0.06422693, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06273951, + "balance_loss_mlp": 0.01254357, + "epoch": 0.5119494964677589, + "flos": 28119144568320.0, + "grad_norm": 1.955376754159203, + "language_loss": 0.66104603, + "learning_rate": 2.019667497917424e-06, + "loss": 0.7379272, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11065674, + "step": 8515, + "time_per_iteration": 2.586984872817993 + }, + { + "auxiliary_loss_clip": 0.06415779, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254754, + "epoch": 0.5120096197204269, + "flos": 24980225967360.0, + "grad_norm": 1.8485741123105555, + "language_loss": 0.76016974, + "learning_rate": 2.019278054696955e-06, + "loss": 0.83698404, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10894775, + "step": 8516, + "time_per_iteration": 2.5933895111083984 + }, + { + "auxiliary_loss_clip": 0.06425153, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.0627657, + "balance_loss_mlp": 0.01254136, + "epoch": 0.5120697429730948, + "flos": 17973863441280.0, + "grad_norm": 1.9611042257937292, + "language_loss": 0.78053069, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.85744041, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 8517, + "time_per_iteration": 2.4962363243103027 + }, + { + "auxiliary_loss_clip": 0.06430522, + "auxiliary_loss_mlp": 0.01271394, + "balance_loss_clip": 0.06276728, + "balance_loss_mlp": 0.01259211, + "epoch": 0.5121298662257628, + "flos": 23298651504000.0, + "grad_norm": 1.7759167489555023, + "language_loss": 0.74719632, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.82421547, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12164307, + "step": 8518, + "time_per_iteration": 2.5037240982055664 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256529, + "epoch": 0.5121899894784308, + "flos": 17316769322880.0, + "grad_norm": 1.687169580100827, + "language_loss": 0.78467947, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.86161083, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11706543, + "step": 8519, + "time_per_iteration": 2.524724006652832 + }, + { + "auxiliary_loss_clip": 0.06422982, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253016, + "epoch": 0.5122501127310988, + "flos": 24935978211840.0, + "grad_norm": 1.6239003664198155, + "language_loss": 0.79446238, + "learning_rate": 2.017720274652497e-06, + "loss": 0.87134135, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11907959, + "step": 8520, + "time_per_iteration": 2.522068500518799 + }, + { + "auxiliary_loss_clip": 0.06431363, + "auxiliary_loss_mlp": 0.01269526, + "balance_loss_clip": 0.06276108, + "balance_loss_mlp": 0.01256151, + "epoch": 0.5123102359837667, + "flos": 18448878637440.0, + "grad_norm": 1.8569595834923718, + "language_loss": 0.81725198, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.89426088, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.13366699, + "step": 8521, + "time_per_iteration": 3.956547498703003 + }, + { + "auxiliary_loss_clip": 0.06422685, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.0627308, + "balance_loss_mlp": 0.01253383, + "epoch": 0.5123703592364347, + "flos": 26691625284480.0, + "grad_norm": 3.145804815574879, + "language_loss": 0.68764591, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.7645213, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11462402, + "step": 8522, + "time_per_iteration": 2.53696608543396 + }, + { + "auxiliary_loss_clip": 0.06430639, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 0.06276414, + "balance_loss_mlp": 0.01256039, + "epoch": 0.5124304824891026, + "flos": 28811555982720.0, + "grad_norm": 1.853417160064295, + "language_loss": 0.622962, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.69996071, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.13201904, + "step": 8523, + "time_per_iteration": 2.589885950088501 + }, + { + "auxiliary_loss_clip": 0.06424818, + "auxiliary_loss_mlp": 0.01265688, + "balance_loss_clip": 0.06274516, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5124906057417706, + "flos": 21768199079040.0, + "grad_norm": 1.9669486922935226, + "language_loss": 0.77939785, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.85630286, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11004639, + "step": 8524, + "time_per_iteration": 2.506647825241089 + }, + { + "auxiliary_loss_clip": 0.06424855, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06275764, + "balance_loss_mlp": 0.01255344, + "epoch": 0.5125507289944387, + "flos": 18886605966720.0, + "grad_norm": 1.985021925330002, + "language_loss": 0.74904448, + "learning_rate": 2.015773034588706e-06, + "loss": 0.82595634, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10986328, + "step": 8525, + "time_per_iteration": 2.509902000427246 + }, + { + "auxiliary_loss_clip": 0.06429298, + "auxiliary_loss_mlp": 0.01270559, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01258412, + "epoch": 0.5126108522471066, + "flos": 35636761981440.0, + "grad_norm": 1.5788283001431092, + "language_loss": 0.74868685, + "learning_rate": 2.015383584722531e-06, + "loss": 0.82568544, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.12127686, + "step": 8526, + "time_per_iteration": 2.640554428100586 + }, + { + "auxiliary_loss_clip": 0.06428048, + "auxiliary_loss_mlp": 0.01267884, + "balance_loss_clip": 0.06275488, + "balance_loss_mlp": 0.01256613, + "epoch": 0.5126709754997746, + "flos": 20196685353600.0, + "grad_norm": 1.5376970768591331, + "language_loss": 0.658445, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.73540437, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.11279297, + "step": 8527, + "time_per_iteration": 2.5079874992370605 + }, + { + "auxiliary_loss_clip": 0.06421998, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01258644, + "epoch": 0.5127310987524425, + "flos": 18594550160640.0, + "grad_norm": 1.4224570841542155, + "language_loss": 0.74258637, + "learning_rate": 2.014604683254908e-06, + "loss": 0.81949556, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10290527, + "step": 8528, + "time_per_iteration": 2.5583620071411133 + }, + { + "auxiliary_loss_clip": 0.06424492, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06275051, + "balance_loss_mlp": 0.01254816, + "epoch": 0.5127912220051105, + "flos": 22461113617920.0, + "grad_norm": 1.747082224822374, + "language_loss": 0.83357608, + "learning_rate": 2.014215231682995e-06, + "loss": 0.91048539, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11621094, + "step": 8529, + "time_per_iteration": 2.5290021896362305 + }, + { + "auxiliary_loss_clip": 0.06427129, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06279376, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5128513452577784, + "flos": 19098845159040.0, + "grad_norm": 1.7753814294124612, + "language_loss": 0.7435441, + "learning_rate": 2.01382577957204e-06, + "loss": 0.82047611, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.10852051, + "step": 8530, + "time_per_iteration": 2.5009660720825195 + }, + { + "auxiliary_loss_clip": 0.06336609, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01263291, + "epoch": 0.5129114685104464, + "flos": 67914553011840.0, + "grad_norm": 0.7560442553547831, + "language_loss": 0.60794806, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.68396354, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.01651001, + "step": 8531, + "time_per_iteration": 3.2641408443450928 + }, + { + "auxiliary_loss_clip": 0.06436025, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.062833, + "balance_loss_mlp": 0.0125722, + "epoch": 0.5129715917631144, + "flos": 20455436361600.0, + "grad_norm": 1.5619116128751078, + "language_loss": 0.76922929, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.84627628, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11444092, + "step": 8532, + "time_per_iteration": 2.54885196685791 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.0627965, + "balance_loss_mlp": 0.0125747, + "epoch": 0.5130317150157824, + "flos": 35124836261760.0, + "grad_norm": 2.143443364581078, + "language_loss": 0.67464834, + "learning_rate": 2.012657420152597e-06, + "loss": 0.75163269, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11706543, + "step": 8533, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.06435291, + "auxiliary_loss_mlp": 0.01270583, + "balance_loss_clip": 0.06282294, + "balance_loss_mlp": 0.01257995, + "epoch": 0.5130918382684503, + "flos": 19797671410560.0, + "grad_norm": 2.0992969405941526, + "language_loss": 0.82022768, + "learning_rate": 2.01226796603315e-06, + "loss": 0.89728636, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12585449, + "step": 8534, + "time_per_iteration": 2.527186632156372 + }, + { + "auxiliary_loss_clip": 0.06432565, + "auxiliary_loss_mlp": 0.01272989, + "balance_loss_clip": 0.06280594, + "balance_loss_mlp": 0.0126077, + "epoch": 0.5131519615211183, + "flos": 26330318478720.0, + "grad_norm": 1.396585887996991, + "language_loss": 0.64072168, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.71777725, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12225342, + "step": 8535, + "time_per_iteration": 2.5608325004577637 + }, + { + "auxiliary_loss_clip": 0.06432404, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06282519, + "balance_loss_mlp": 0.01254036, + "epoch": 0.5132120847737862, + "flos": 19177949013120.0, + "grad_norm": 1.677219086168078, + "language_loss": 0.70047057, + "learning_rate": 2.011489056413418e-06, + "loss": 0.77745175, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11682129, + "step": 8536, + "time_per_iteration": 2.562103509902954 + }, + { + "auxiliary_loss_clip": 0.06443835, + "auxiliary_loss_mlp": 0.01273704, + "balance_loss_clip": 0.06287554, + "balance_loss_mlp": 0.01260359, + "epoch": 0.5132722080264542, + "flos": 20236698478080.0, + "grad_norm": 2.053357085489985, + "language_loss": 0.71648562, + "learning_rate": 2.011099600942669e-06, + "loss": 0.793661, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.13348389, + "step": 8537, + "time_per_iteration": 2.5208451747894287 + }, + { + "auxiliary_loss_clip": 0.06435503, + "auxiliary_loss_mlp": 0.01264426, + "balance_loss_clip": 0.06282058, + "balance_loss_mlp": 0.01252559, + "epoch": 0.5133323312791223, + "flos": 16474619462400.0, + "grad_norm": 2.3096480270315487, + "language_loss": 0.80560482, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.88260412, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.11859131, + "step": 8538, + "time_per_iteration": 2.5136818885803223 + }, + { + "auxiliary_loss_clip": 0.06432489, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06280679, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5133924545317902, + "flos": 26075340904320.0, + "grad_norm": 1.6767929293826078, + "language_loss": 0.78499532, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.86201918, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1171875, + "step": 8539, + "time_per_iteration": 2.5898549556732178 + }, + { + "auxiliary_loss_clip": 0.06434882, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06283914, + "balance_loss_mlp": 0.01255703, + "epoch": 0.5134525777844582, + "flos": 29138467887360.0, + "grad_norm": 1.6389084641418472, + "language_loss": 0.76422769, + "learning_rate": 2.009931232064105e-06, + "loss": 0.84125227, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8540, + "time_per_iteration": 2.695279359817505 + }, + { + "auxiliary_loss_clip": 0.06437706, + "auxiliary_loss_mlp": 0.01272086, + "balance_loss_clip": 0.06283282, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5135127010371261, + "flos": 17460134858880.0, + "grad_norm": 1.735384048528371, + "language_loss": 0.74720204, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.82429993, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.1328125, + "step": 8541, + "time_per_iteration": 2.5028650760650635 + }, + { + "auxiliary_loss_clip": 0.06433722, + "auxiliary_loss_mlp": 0.01268404, + "balance_loss_clip": 0.06282187, + "balance_loss_mlp": 0.01255905, + "epoch": 0.5135728242897941, + "flos": 21951493885440.0, + "grad_norm": 1.7658048645767805, + "language_loss": 0.71345925, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.79048049, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12493896, + "step": 8542, + "time_per_iteration": 2.55663800239563 + }, + { + "auxiliary_loss_clip": 0.06432796, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06282645, + "balance_loss_mlp": 0.01255939, + "epoch": 0.513632947542462, + "flos": 22681528583040.0, + "grad_norm": 1.8429175926110044, + "language_loss": 0.79735661, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.87436259, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11846924, + "step": 8543, + "time_per_iteration": 2.530942440032959 + }, + { + "auxiliary_loss_clip": 0.06431838, + "auxiliary_loss_mlp": 0.01268863, + "balance_loss_clip": 0.06281078, + "balance_loss_mlp": 0.0125693, + "epoch": 0.51369307079513, + "flos": 29464289688960.0, + "grad_norm": 1.9724623685644402, + "language_loss": 0.68434304, + "learning_rate": 2.008373401689299e-06, + "loss": 0.76135004, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11932373, + "step": 8544, + "time_per_iteration": 2.581965684890747 + }, + { + "auxiliary_loss_clip": 0.06435554, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0628157, + "balance_loss_mlp": 0.01257314, + "epoch": 0.513753194047798, + "flos": 18995325039360.0, + "grad_norm": 1.9173308249452852, + "language_loss": 0.73101795, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.80806756, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12103271, + "step": 8545, + "time_per_iteration": 3.9112906455993652 + }, + { + "auxiliary_loss_clip": 0.06434133, + "auxiliary_loss_mlp": 0.01273161, + "balance_loss_clip": 0.06280358, + "balance_loss_mlp": 0.01260745, + "epoch": 0.513813317300466, + "flos": 17827646866560.0, + "grad_norm": 2.3149125381427322, + "language_loss": 0.82387555, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.90094852, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12408447, + "step": 8546, + "time_per_iteration": 2.4859204292297363 + }, + { + "auxiliary_loss_clip": 0.06431763, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01255101, + "epoch": 0.5138734405531339, + "flos": 24068070420480.0, + "grad_norm": 1.656069587269211, + "language_loss": 0.73464745, + "learning_rate": 2.007205025522544e-06, + "loss": 0.81163985, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12384033, + "step": 8547, + "time_per_iteration": 2.5682289600372314 + }, + { + "auxiliary_loss_clip": 0.0643255, + "auxiliary_loss_mlp": 0.01266832, + "balance_loss_clip": 0.06281269, + "balance_loss_mlp": 0.01254697, + "epoch": 0.5139335638058019, + "flos": 26103279041280.0, + "grad_norm": 1.7029090715356687, + "language_loss": 0.7379564, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.81495023, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12121582, + "step": 8548, + "time_per_iteration": 2.534795045852661 + }, + { + "auxiliary_loss_clip": 0.06433449, + "auxiliary_loss_mlp": 0.01270968, + "balance_loss_clip": 0.06279913, + "balance_loss_mlp": 0.01259124, + "epoch": 0.5139936870584698, + "flos": 18923181073920.0, + "grad_norm": 1.5199417717256292, + "language_loss": 0.82597619, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.90302038, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11853027, + "step": 8549, + "time_per_iteration": 3.9844579696655273 + }, + { + "auxiliary_loss_clip": 0.06431821, + "auxiliary_loss_mlp": 0.01268578, + "balance_loss_clip": 0.06283253, + "balance_loss_mlp": 0.01256913, + "epoch": 0.5140538103111378, + "flos": 16149594274560.0, + "grad_norm": 1.7893333067818897, + "language_loss": 0.72460294, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.80160695, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11669922, + "step": 8550, + "time_per_iteration": 2.6143221855163574 + }, + { + "auxiliary_loss_clip": 0.06436016, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5141139335638057, + "flos": 22426886424960.0, + "grad_norm": 1.3843612466681816, + "language_loss": 0.7537846, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.83080363, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.12536621, + "step": 8551, + "time_per_iteration": 2.563551664352417 + }, + { + "auxiliary_loss_clip": 0.06427439, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06281094, + "balance_loss_mlp": 0.01255233, + "epoch": 0.5141740568164738, + "flos": 27097054064640.0, + "grad_norm": 1.547590229430392, + "language_loss": 0.69192576, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.76886189, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10949707, + "step": 8552, + "time_per_iteration": 2.598309278488159 + }, + { + "auxiliary_loss_clip": 0.06434312, + "auxiliary_loss_mlp": 0.01271227, + "balance_loss_clip": 0.06280888, + "balance_loss_mlp": 0.01258972, + "epoch": 0.5142341800691418, + "flos": 24980267894400.0, + "grad_norm": 1.7162445999633908, + "language_loss": 0.75295067, + "learning_rate": 2.004868266210965e-06, + "loss": 0.830006, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12255859, + "step": 8553, + "time_per_iteration": 2.56817364692688 + }, + { + "auxiliary_loss_clip": 0.06427588, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.06277347, + "balance_loss_mlp": 0.01253642, + "epoch": 0.5142943033218097, + "flos": 20710833206400.0, + "grad_norm": 1.5512777085285745, + "language_loss": 0.68091589, + "learning_rate": 2.004478805593435e-06, + "loss": 0.75785089, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.1227417, + "step": 8554, + "time_per_iteration": 4.041098117828369 + }, + { + "auxiliary_loss_clip": 0.06434806, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.0627867, + "balance_loss_mlp": 0.0125514, + "epoch": 0.5143544265744777, + "flos": 22931391058560.0, + "grad_norm": 1.9544744043919176, + "language_loss": 0.73420155, + "learning_rate": 2.004089344806068e-06, + "loss": 0.81124151, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.14050293, + "step": 8555, + "time_per_iteration": 2.560406446456909 + }, + { + "auxiliary_loss_clip": 0.0643023, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06277946, + "balance_loss_mlp": 0.0125305, + "epoch": 0.5144145498271456, + "flos": 15926328270720.0, + "grad_norm": 3.1721710851325478, + "language_loss": 0.74827576, + "learning_rate": 2.003699883863633e-06, + "loss": 0.82522213, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.11346436, + "step": 8556, + "time_per_iteration": 2.510631561279297 + }, + { + "auxiliary_loss_clip": 0.06426013, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01255374, + "epoch": 0.5144746730798136, + "flos": 19687107548160.0, + "grad_norm": 1.7802365486116365, + "language_loss": 0.86600292, + "learning_rate": 2.003310422780898e-06, + "loss": 0.9429279, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 8557, + "time_per_iteration": 2.4897682666778564 + }, + { + "auxiliary_loss_clip": 0.06427194, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06280152, + "balance_loss_mlp": 0.0125372, + "epoch": 0.5145347963324816, + "flos": 23921476502400.0, + "grad_norm": 1.7088292247190593, + "language_loss": 0.89943027, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.97635341, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11407471, + "step": 8558, + "time_per_iteration": 2.552520513534546 + }, + { + "auxiliary_loss_clip": 0.06426296, + "auxiliary_loss_mlp": 0.01270393, + "balance_loss_clip": 0.06281744, + "balance_loss_mlp": 0.01259337, + "epoch": 0.5145949195851496, + "flos": 18265919247360.0, + "grad_norm": 1.814909546317071, + "language_loss": 0.65665084, + "learning_rate": 2.002531500253602e-06, + "loss": 0.73361778, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1105957, + "step": 8559, + "time_per_iteration": 2.5509958267211914 + }, + { + "auxiliary_loss_clip": 0.06428455, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.0628074, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5146550428378175, + "flos": 26220593157120.0, + "grad_norm": 1.5790337478872891, + "language_loss": 0.63388872, + "learning_rate": 2.002142038838577e-06, + "loss": 0.71083951, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11358643, + "step": 8560, + "time_per_iteration": 2.5824177265167236 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279366, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5147151660904855, + "flos": 22680731969280.0, + "grad_norm": 1.6548160663474087, + "language_loss": 0.70604181, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.78299701, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11975098, + "step": 8561, + "time_per_iteration": 4.051865816116333 + }, + { + "auxiliary_loss_clip": 0.06432293, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.0628119, + "balance_loss_mlp": 0.01254937, + "epoch": 0.5147752893431534, + "flos": 24979261645440.0, + "grad_norm": 1.5164557892601689, + "language_loss": 0.67091215, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.7478981, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.1137085, + "step": 8562, + "time_per_iteration": 2.587117910385132 + }, + { + "auxiliary_loss_clip": 0.06434688, + "auxiliary_loss_mlp": 0.0126818, + "balance_loss_clip": 0.06283362, + "balance_loss_mlp": 0.01256945, + "epoch": 0.5148354125958214, + "flos": 22750821509760.0, + "grad_norm": 1.6017474228640745, + "language_loss": 0.77982432, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.85685301, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.11248779, + "step": 8563, + "time_per_iteration": 2.5995922088623047 + }, + { + "auxiliary_loss_clip": 0.06441233, + "auxiliary_loss_mlp": 0.01269901, + "balance_loss_clip": 0.06284129, + "balance_loss_mlp": 0.01257235, + "epoch": 0.5148955358484893, + "flos": 23074253470080.0, + "grad_norm": 2.0871441030394426, + "language_loss": 0.83276081, + "learning_rate": 2.0005841925139e-06, + "loss": 0.90987211, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12658691, + "step": 8564, + "time_per_iteration": 2.5510189533233643 + }, + { + "auxiliary_loss_clip": 0.06436282, + "auxiliary_loss_mlp": 0.01266369, + "balance_loss_clip": 0.06281953, + "balance_loss_mlp": 0.01253918, + "epoch": 0.5149556591011574, + "flos": 20346465726720.0, + "grad_norm": 3.2981963875061915, + "language_loss": 0.73735076, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.81437725, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.12451172, + "step": 8565, + "time_per_iteration": 2.565485715866089 + }, + { + "auxiliary_loss_clip": 0.06439919, + "auxiliary_loss_mlp": 0.01271905, + "balance_loss_clip": 0.06283022, + "balance_loss_mlp": 0.01259478, + "epoch": 0.5150157823538254, + "flos": 22644869621760.0, + "grad_norm": 2.0080537974138424, + "language_loss": 0.6841439, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.76126206, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12438965, + "step": 8566, + "time_per_iteration": 2.540060520172119 + }, + { + "auxiliary_loss_clip": 0.06439756, + "auxiliary_loss_mlp": 0.01270124, + "balance_loss_clip": 0.06282447, + "balance_loss_mlp": 0.0125828, + "epoch": 0.5150759056064933, + "flos": 26074795852800.0, + "grad_norm": 1.7193676063763261, + "language_loss": 0.78763425, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.86473316, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.11834717, + "step": 8567, + "time_per_iteration": 2.610316276550293 + }, + { + "auxiliary_loss_clip": 0.06433998, + "auxiliary_loss_mlp": 0.0126364, + "balance_loss_clip": 0.06282104, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5151360288591613, + "flos": 25958865329280.0, + "grad_norm": 1.8031823951648205, + "language_loss": 0.79058564, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.86756206, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12091064, + "step": 8568, + "time_per_iteration": 2.5746078491210938 + }, + { + "auxiliary_loss_clip": 0.06425972, + "auxiliary_loss_mlp": 0.01263804, + "balance_loss_clip": 0.06277977, + "balance_loss_mlp": 0.01252705, + "epoch": 0.5151961521118292, + "flos": 18511840581120.0, + "grad_norm": 2.107330893228774, + "language_loss": 0.90881652, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.98571432, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11096191, + "step": 8569, + "time_per_iteration": 2.5259969234466553 + }, + { + "auxiliary_loss_clip": 0.06436515, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 0.06282495, + "balance_loss_mlp": 0.01261225, + "epoch": 0.5152562753644973, + "flos": 22239734330880.0, + "grad_norm": 1.7160477900396784, + "language_loss": 0.77020866, + "learning_rate": 1.998247422657674e-06, + "loss": 0.84730774, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12188721, + "step": 8570, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.06435493, + "auxiliary_loss_mlp": 0.01269852, + "balance_loss_clip": 0.06284317, + "balance_loss_mlp": 0.01256817, + "epoch": 0.5153163986171652, + "flos": 38445833784960.0, + "grad_norm": 1.5069722692963965, + "language_loss": 0.73508942, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.81214285, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1305542, + "step": 8571, + "time_per_iteration": 2.6566643714904785 + }, + { + "auxiliary_loss_clip": 0.06335695, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01250073, + "epoch": 0.5153765218698332, + "flos": 66404533783680.0, + "grad_norm": 0.7650204220049751, + "language_loss": 0.52955389, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.60542989, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.64990234, + "router_z_loss_mlp": 0.01826477, + "step": 8572, + "time_per_iteration": 3.231537103652954 + }, + { + "auxiliary_loss_clip": 0.06429811, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06284182, + "balance_loss_mlp": 0.01257622, + "epoch": 0.5154366451225011, + "flos": 24031537240320.0, + "grad_norm": 1.6307698114257092, + "language_loss": 0.76929724, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.84628952, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11791992, + "step": 8573, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.06429262, + "auxiliary_loss_mlp": 0.01267008, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01255099, + "epoch": 0.5154967683751691, + "flos": 23474189808000.0, + "grad_norm": 2.3679054324331967, + "language_loss": 0.77109015, + "learning_rate": 1.996689577219102e-06, + "loss": 0.84805286, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11914062, + "step": 8574, + "time_per_iteration": 2.53300404548645 + }, + { + "auxiliary_loss_clip": 0.06429033, + "auxiliary_loss_mlp": 0.01263951, + "balance_loss_clip": 0.06281316, + "balance_loss_mlp": 0.01252691, + "epoch": 0.515556891627837, + "flos": 23812463940480.0, + "grad_norm": 1.7644957150045186, + "language_loss": 0.85785985, + "learning_rate": 1.996300116136367e-06, + "loss": 0.93478966, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11248779, + "step": 8575, + "time_per_iteration": 2.577409029006958 + }, + { + "auxiliary_loss_clip": 0.06435408, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06283233, + "balance_loss_mlp": 0.01253859, + "epoch": 0.515617014880505, + "flos": 19834665788160.0, + "grad_norm": 1.5082721708333224, + "language_loss": 0.76947051, + "learning_rate": 1.995910655193932e-06, + "loss": 0.84648347, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1204834, + "step": 8576, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.06444222, + "auxiliary_loss_mlp": 0.01270832, + "balance_loss_clip": 0.06283684, + "balance_loss_mlp": 0.01258083, + "epoch": 0.515677138133173, + "flos": 14251042863360.0, + "grad_norm": 2.2995750246066406, + "language_loss": 0.75517124, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.83232176, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.12762451, + "step": 8577, + "time_per_iteration": 2.518495559692383 + }, + { + "auxiliary_loss_clip": 0.06436984, + "auxiliary_loss_mlp": 0.01270084, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01257037, + "epoch": 0.515737261385841, + "flos": 28296653443200.0, + "grad_norm": 4.0524023742876345, + "language_loss": 0.81602645, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.89309716, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.13049316, + "step": 8578, + "time_per_iteration": 2.5854508876800537 + }, + { + "auxiliary_loss_clip": 0.06431551, + "auxiliary_loss_mlp": 0.01266524, + "balance_loss_clip": 0.06281303, + "balance_loss_mlp": 0.01254746, + "epoch": 0.515797384638509, + "flos": 27899400435840.0, + "grad_norm": 1.724028071509101, + "language_loss": 0.7613306, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.83831137, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11785889, + "step": 8579, + "time_per_iteration": 2.5765621662139893 + }, + { + "auxiliary_loss_clip": 0.06434369, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06280281, + "balance_loss_mlp": 0.01253053, + "epoch": 0.5158575078911769, + "flos": 23046860384640.0, + "grad_norm": 1.6181814769530192, + "language_loss": 0.79290402, + "learning_rate": 1.994352813122559e-06, + "loss": 0.86990273, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12457275, + "step": 8580, + "time_per_iteration": 2.5879290103912354 + }, + { + "auxiliary_loss_clip": 0.0643789, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06283616, + "balance_loss_mlp": 0.01254763, + "epoch": 0.5159176311438449, + "flos": 12646350120960.0, + "grad_norm": 1.9944005001089613, + "language_loss": 0.73488963, + "learning_rate": 1.99396335310315e-06, + "loss": 0.81195444, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1383667, + "step": 8581, + "time_per_iteration": 2.500063180923462 + }, + { + "auxiliary_loss_clip": 0.06434488, + "auxiliary_loss_mlp": 0.01266672, + "balance_loss_clip": 0.06284754, + "balance_loss_mlp": 0.01254781, + "epoch": 0.5159777543965128, + "flos": 15563302456320.0, + "grad_norm": 1.882801773214852, + "language_loss": 0.74207276, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.81908435, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11901855, + "step": 8582, + "time_per_iteration": 2.518564462661743 + }, + { + "auxiliary_loss_clip": 0.06429887, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06280613, + "balance_loss_mlp": 0.01254648, + "epoch": 0.5160378776491809, + "flos": 23228352328320.0, + "grad_norm": 1.8807127189493567, + "language_loss": 0.66238904, + "learning_rate": 1.99318443376583e-06, + "loss": 0.73934591, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11157227, + "step": 8583, + "time_per_iteration": 2.542539119720459 + }, + { + "auxiliary_loss_clip": 0.06437095, + "auxiliary_loss_mlp": 0.01269933, + "balance_loss_clip": 0.06283841, + "balance_loss_mlp": 0.01257404, + "epoch": 0.5160980009018488, + "flos": 21951074615040.0, + "grad_norm": 1.3417837681818925, + "language_loss": 0.760252, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.83732229, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12524414, + "step": 8584, + "time_per_iteration": 2.587082624435425 + }, + { + "auxiliary_loss_clip": 0.06437847, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06283042, + "balance_loss_mlp": 0.01253579, + "epoch": 0.5161581241545168, + "flos": 22790708853120.0, + "grad_norm": 1.8159571462416286, + "language_loss": 0.78972226, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.866768, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.13153076, + "step": 8585, + "time_per_iteration": 3.918409824371338 + }, + { + "auxiliary_loss_clip": 0.06432407, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06287332, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5162182474071847, + "flos": 19680273440640.0, + "grad_norm": 1.974004410778628, + "language_loss": 0.81013006, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.88714075, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11578369, + "step": 8586, + "time_per_iteration": 2.4944536685943604 + }, + { + "auxiliary_loss_clip": 0.06434685, + "auxiliary_loss_mlp": 0.01270978, + "balance_loss_clip": 0.06284505, + "balance_loss_mlp": 0.01258473, + "epoch": 0.5162783706598527, + "flos": 20052145860480.0, + "grad_norm": 2.892216813448522, + "language_loss": 0.71914274, + "learning_rate": 1.991626598310701e-06, + "loss": 0.79619938, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.125, + "step": 8587, + "time_per_iteration": 2.500964403152466 + }, + { + "auxiliary_loss_clip": 0.06328937, + "auxiliary_loss_mlp": 0.01260473, + "balance_loss_clip": 0.06264381, + "balance_loss_mlp": 0.01258639, + "epoch": 0.5163384939125206, + "flos": 69980089610880.0, + "grad_norm": 0.7154986672608752, + "language_loss": 0.57844335, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.65433741, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01829529, + "step": 8588, + "time_per_iteration": 4.569206476211548 + }, + { + "auxiliary_loss_clip": 0.06434999, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06281946, + "balance_loss_mlp": 0.01254618, + "epoch": 0.5163986171651886, + "flos": 17422176159360.0, + "grad_norm": 8.344302755834537, + "language_loss": 0.75224382, + "learning_rate": 1.990847682429185e-06, + "loss": 0.82927144, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13134766, + "step": 8589, + "time_per_iteration": 2.551936388015747 + }, + { + "auxiliary_loss_clip": 0.06436837, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0628375, + "balance_loss_mlp": 0.01254607, + "epoch": 0.5164587404178566, + "flos": 21328752741120.0, + "grad_norm": 1.4649655682055334, + "language_loss": 0.67921245, + "learning_rate": 1.990458225001627e-06, + "loss": 0.75623721, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11035156, + "step": 8590, + "time_per_iteration": 2.5104808807373047 + }, + { + "auxiliary_loss_clip": 0.06330067, + "auxiliary_loss_mlp": 0.01255277, + "balance_loss_clip": 0.06265621, + "balance_loss_mlp": 0.01253319, + "epoch": 0.5165188636705246, + "flos": 68076506954880.0, + "grad_norm": 0.7672531816981234, + "language_loss": 0.55843657, + "learning_rate": 1.990068767935895e-06, + "loss": 0.63428998, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.64550781, + "router_z_loss_mlp": 0.01956177, + "step": 8591, + "time_per_iteration": 3.0606987476348877 + }, + { + "auxiliary_loss_clip": 0.06426874, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06283274, + "balance_loss_mlp": 0.01253261, + "epoch": 0.5165789869231926, + "flos": 19390859038080.0, + "grad_norm": 1.5432128891960295, + "language_loss": 0.81508362, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.89199233, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10736084, + "step": 8592, + "time_per_iteration": 2.5063397884368896 + }, + { + "auxiliary_loss_clip": 0.0642782, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06281757, + "balance_loss_mlp": 0.01251626, + "epoch": 0.5166391101758605, + "flos": 20966607394560.0, + "grad_norm": 1.7131386706837877, + "language_loss": 0.83462119, + "learning_rate": 1.989289854948979e-06, + "loss": 0.91152561, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11010742, + "step": 8593, + "time_per_iteration": 3.951284170150757 + }, + { + "auxiliary_loss_clip": 0.06431139, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06281991, + "balance_loss_mlp": 0.01253833, + "epoch": 0.5166992334285285, + "flos": 29470411036800.0, + "grad_norm": 1.8647556534792968, + "language_loss": 0.69381714, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.77078462, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11761475, + "step": 8594, + "time_per_iteration": 2.600724220275879 + }, + { + "auxiliary_loss_clip": 0.06431773, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.06282206, + "balance_loss_mlp": 0.0125441, + "epoch": 0.5167593566811964, + "flos": 20310813014400.0, + "grad_norm": 1.4700297891307748, + "language_loss": 0.77611995, + "learning_rate": 1.988510943586582e-06, + "loss": 0.85309899, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.1171875, + "step": 8595, + "time_per_iteration": 2.5478954315185547 + }, + { + "auxiliary_loss_clip": 0.06431342, + "auxiliary_loss_mlp": 0.01266673, + "balance_loss_clip": 0.06281155, + "balance_loss_mlp": 0.01255563, + "epoch": 0.5168194799338645, + "flos": 14616668154240.0, + "grad_norm": 1.457832438333805, + "language_loss": 0.65828246, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.73526263, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11114502, + "step": 8596, + "time_per_iteration": 2.5720162391662598 + }, + { + "auxiliary_loss_clip": 0.06432624, + "auxiliary_loss_mlp": 0.01271477, + "balance_loss_clip": 0.06281975, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5168796031865324, + "flos": 25013866181760.0, + "grad_norm": 1.4915456509806782, + "language_loss": 0.75734007, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.8343811, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12988281, + "step": 8597, + "time_per_iteration": 2.5495989322662354 + }, + { + "auxiliary_loss_clip": 0.06427812, + "auxiliary_loss_mlp": 0.01266343, + "balance_loss_clip": 0.06278015, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5169397264392004, + "flos": 26946728640000.0, + "grad_norm": 1.7231987845025152, + "language_loss": 0.8152492, + "learning_rate": 1.987342579847403e-06, + "loss": 0.89219069, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11773682, + "step": 8598, + "time_per_iteration": 2.6746177673339844 + }, + { + "auxiliary_loss_clip": 0.06427282, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06279184, + "balance_loss_mlp": 0.0125523, + "epoch": 0.5169998496918683, + "flos": 25414347571200.0, + "grad_norm": 1.537627068096994, + "language_loss": 0.7597698, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.83671683, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12194824, + "step": 8599, + "time_per_iteration": 2.548478841781616 + }, + { + "auxiliary_loss_clip": 0.06428513, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01253291, + "epoch": 0.5170599729445363, + "flos": 24687667036800.0, + "grad_norm": 4.521028695007152, + "language_loss": 0.72775459, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.80468118, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10852051, + "step": 8600, + "time_per_iteration": 3.977342367172241 + }, + { + "auxiliary_loss_clip": 0.06427286, + "auxiliary_loss_mlp": 0.01268182, + "balance_loss_clip": 0.06278619, + "balance_loss_mlp": 0.01256732, + "epoch": 0.5171200961972042, + "flos": 21000499171200.0, + "grad_norm": 1.369345328324843, + "language_loss": 0.74472946, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.82168412, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11444092, + "step": 8601, + "time_per_iteration": 2.5409762859344482 + }, + { + "auxiliary_loss_clip": 0.06429532, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 0.06278992, + "balance_loss_mlp": 0.01258467, + "epoch": 0.5171802194498722, + "flos": 22751953539840.0, + "grad_norm": 1.8713669852223682, + "language_loss": 0.83940291, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.9164089, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12597656, + "step": 8602, + "time_per_iteration": 2.5086002349853516 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01254026, + "epoch": 0.5172403427025402, + "flos": 28183070833920.0, + "grad_norm": 1.835239532551919, + "language_loss": 0.74816436, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.82513469, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1192627, + "step": 8603, + "time_per_iteration": 2.628830909729004 + }, + { + "auxiliary_loss_clip": 0.06434101, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06281082, + "balance_loss_mlp": 0.01255566, + "epoch": 0.5173004659552082, + "flos": 20343782396160.0, + "grad_norm": 2.436721116583926, + "language_loss": 0.73165393, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.80867082, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12017822, + "step": 8604, + "time_per_iteration": 2.521681785583496 + }, + { + "auxiliary_loss_clip": 0.06440152, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06279815, + "balance_loss_mlp": 0.01254469, + "epoch": 0.5173605892078762, + "flos": 19069481502720.0, + "grad_norm": 1.6971244246662016, + "language_loss": 0.85418487, + "learning_rate": 1.984616415277469e-06, + "loss": 0.93127012, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.13922119, + "step": 8605, + "time_per_iteration": 2.5182762145996094 + }, + { + "auxiliary_loss_clip": 0.06430884, + "auxiliary_loss_mlp": 0.01270289, + "balance_loss_clip": 0.06279606, + "balance_loss_mlp": 0.01258893, + "epoch": 0.5174207124605441, + "flos": 28001620817280.0, + "grad_norm": 1.308601391892793, + "language_loss": 0.64964187, + "learning_rate": 1.984226965411294e-06, + "loss": 0.72665358, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.1138916, + "step": 8606, + "time_per_iteration": 2.5762083530426025 + }, + { + "auxiliary_loss_clip": 0.06431288, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.0125362, + "epoch": 0.5174808357132121, + "flos": 19502135660160.0, + "grad_norm": 1.5729301555613031, + "language_loss": 0.78141046, + "learning_rate": 1.983837516143234e-06, + "loss": 0.85837877, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11914062, + "step": 8607, + "time_per_iteration": 2.5321435928344727 + }, + { + "auxiliary_loss_clip": 0.06431965, + "auxiliary_loss_mlp": 0.01271738, + "balance_loss_clip": 0.06280412, + "balance_loss_mlp": 0.01259049, + "epoch": 0.51754095896588, + "flos": 22790834634240.0, + "grad_norm": 1.7409540075434562, + "language_loss": 0.72313815, + "learning_rate": 1.983448067488057e-06, + "loss": 0.80017519, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 8608, + "time_per_iteration": 2.52758526802063 + }, + { + "auxiliary_loss_clip": 0.06435958, + "auxiliary_loss_mlp": 0.01273384, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01261046, + "epoch": 0.5176010822185481, + "flos": 22674987964800.0, + "grad_norm": 1.7194792439439102, + "language_loss": 0.86816031, + "learning_rate": 1.983058619460531e-06, + "loss": 0.94525373, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.12341309, + "step": 8609, + "time_per_iteration": 2.538146495819092 + }, + { + "auxiliary_loss_clip": 0.06431948, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06280786, + "balance_loss_mlp": 0.01252201, + "epoch": 0.517661205471216, + "flos": 23957967755520.0, + "grad_norm": 2.0604849644666943, + "language_loss": 0.73853832, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.81549335, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11352539, + "step": 8610, + "time_per_iteration": 2.5313732624053955 + }, + { + "auxiliary_loss_clip": 0.064363, + "auxiliary_loss_mlp": 0.01270735, + "balance_loss_clip": 0.06279181, + "balance_loss_mlp": 0.01258051, + "epoch": 0.517721328723884, + "flos": 15601470791040.0, + "grad_norm": 2.184245135297296, + "language_loss": 0.67738098, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.75445139, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.12689209, + "step": 8611, + "time_per_iteration": 2.510500431060791 + }, + { + "auxiliary_loss_clip": 0.06427399, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06275965, + "balance_loss_mlp": 0.01253153, + "epoch": 0.5177814519765519, + "flos": 20966607394560.0, + "grad_norm": 1.678614110348905, + "language_loss": 0.77387339, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.85080469, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12573242, + "step": 8612, + "time_per_iteration": 2.5206472873687744 + }, + { + "auxiliary_loss_clip": 0.064338, + "auxiliary_loss_mlp": 0.01269204, + "balance_loss_clip": 0.0628019, + "balance_loss_mlp": 0.01257641, + "epoch": 0.5178415752292199, + "flos": 17973653806080.0, + "grad_norm": 1.9437798274552756, + "language_loss": 0.82318223, + "learning_rate": 1.981500833922294e-06, + "loss": 0.90021223, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.11560059, + "step": 8613, + "time_per_iteration": 2.4999184608459473 + }, + { + "auxiliary_loss_clip": 0.06431679, + "auxiliary_loss_mlp": 0.01268922, + "balance_loss_clip": 0.062784, + "balance_loss_mlp": 0.01255511, + "epoch": 0.5179016984818878, + "flos": 17827227596160.0, + "grad_norm": 2.2958122780571473, + "language_loss": 0.66944718, + "learning_rate": 1.981111389254541e-06, + "loss": 0.74645323, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.1340332, + "step": 8614, + "time_per_iteration": 2.480762004852295 + }, + { + "auxiliary_loss_clip": 0.06432712, + "auxiliary_loss_mlp": 0.0126997, + "balance_loss_clip": 0.06278278, + "balance_loss_mlp": 0.01257465, + "epoch": 0.5179618217345558, + "flos": 17826011712000.0, + "grad_norm": 1.8941766649542733, + "language_loss": 0.87114352, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.94817036, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12493896, + "step": 8615, + "time_per_iteration": 2.500279188156128 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01270372, + "balance_loss_clip": 0.06278686, + "balance_loss_mlp": 0.01258731, + "epoch": 0.5180219449872238, + "flos": 22527639360000.0, + "grad_norm": 1.466896191984659, + "language_loss": 0.80947113, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8864857, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.11639404, + "step": 8616, + "time_per_iteration": 2.523977279663086 + }, + { + "auxiliary_loss_clip": 0.06436383, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.0627937, + "balance_loss_mlp": 0.01257554, + "epoch": 0.5180820682398918, + "flos": 23922356970240.0, + "grad_norm": 2.681335053285678, + "language_loss": 0.75563776, + "learning_rate": 1.9799430596079e-06, + "loss": 0.83270454, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.12744141, + "step": 8617, + "time_per_iteration": 2.5584635734558105 + }, + { + "auxiliary_loss_clip": 0.0643236, + "auxiliary_loss_mlp": 0.01270738, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01258215, + "epoch": 0.5181421914925598, + "flos": 16985119662720.0, + "grad_norm": 2.384459515549961, + "language_loss": 0.70321333, + "learning_rate": 1.979553617893785e-06, + "loss": 0.78024429, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12518311, + "step": 8618, + "time_per_iteration": 2.4864299297332764 + }, + { + "auxiliary_loss_clip": 0.06326556, + "auxiliary_loss_mlp": 0.01258187, + "balance_loss_clip": 0.0626248, + "balance_loss_mlp": 0.01256348, + "epoch": 0.5182023147452277, + "flos": 66080472917760.0, + "grad_norm": 0.9021946533901657, + "language_loss": 0.6731512, + "learning_rate": 1.979164176954999e-06, + "loss": 0.74899864, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01834106, + "step": 8619, + "time_per_iteration": 3.1113593578338623 + }, + { + "auxiliary_loss_clip": 0.06429242, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06279487, + "balance_loss_mlp": 0.01256235, + "epoch": 0.5182624379978957, + "flos": 18193775281920.0, + "grad_norm": 1.7875432352275369, + "language_loss": 0.79252517, + "learning_rate": 1.97877473680631e-06, + "loss": 0.86949891, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11883545, + "step": 8620, + "time_per_iteration": 2.490337371826172 + }, + { + "auxiliary_loss_clip": 0.06426805, + "auxiliary_loss_mlp": 0.01265045, + "balance_loss_clip": 0.06278054, + "balance_loss_mlp": 0.01253815, + "epoch": 0.5183225612505636, + "flos": 14031759928320.0, + "grad_norm": 2.0424555394318347, + "language_loss": 0.82670712, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.90362567, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11236572, + "step": 8621, + "time_per_iteration": 2.5358636379241943 + }, + { + "auxiliary_loss_clip": 0.06430708, + "auxiliary_loss_mlp": 0.01270453, + "balance_loss_clip": 0.06278727, + "balance_loss_mlp": 0.01257787, + "epoch": 0.5183826845032317, + "flos": 23666582782080.0, + "grad_norm": 3.572556492630201, + "language_loss": 0.65903664, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.73604816, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12664795, + "step": 8622, + "time_per_iteration": 2.5054616928100586 + }, + { + "auxiliary_loss_clip": 0.06440182, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06282417, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5184428077558996, + "flos": 15894155502720.0, + "grad_norm": 2.003886693767472, + "language_loss": 0.60810971, + "learning_rate": 1.977606421248497e-06, + "loss": 0.68520582, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.12011719, + "step": 8623, + "time_per_iteration": 2.517026662826538 + }, + { + "auxiliary_loss_clip": 0.06431899, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278786, + "balance_loss_mlp": 0.01256766, + "epoch": 0.5185029310085676, + "flos": 21036864643200.0, + "grad_norm": 1.709310334319468, + "language_loss": 0.76342779, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.84043157, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11712646, + "step": 8624, + "time_per_iteration": 2.5128896236419678 + }, + { + "auxiliary_loss_clip": 0.0643063, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06277324, + "balance_loss_mlp": 0.01251684, + "epoch": 0.5185630542612355, + "flos": 26550062611200.0, + "grad_norm": 2.453361725716909, + "language_loss": 0.71663254, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.79358423, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12854004, + "step": 8625, + "time_per_iteration": 3.9488492012023926 + }, + { + "auxiliary_loss_clip": 0.06427859, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276631, + "balance_loss_mlp": 0.01255378, + "epoch": 0.5186231775139035, + "flos": 20674803150720.0, + "grad_norm": 1.8867804759418334, + "language_loss": 0.68206352, + "learning_rate": 1.976438113333184e-06, + "loss": 0.75901365, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11785889, + "step": 8626, + "time_per_iteration": 2.5555548667907715 + }, + { + "auxiliary_loss_clip": 0.06429964, + "auxiliary_loss_mlp": 0.01270465, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01257459, + "epoch": 0.5186833007665714, + "flos": 20891612390400.0, + "grad_norm": 1.918580922134282, + "language_loss": 0.70565557, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.78265989, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.13006592, + "step": 8627, + "time_per_iteration": 2.481426954269409 + }, + { + "auxiliary_loss_clip": 0.0643362, + "auxiliary_loss_mlp": 0.01266564, + "balance_loss_clip": 0.06277519, + "balance_loss_mlp": 0.01254399, + "epoch": 0.5187434240192395, + "flos": 20893247544960.0, + "grad_norm": 1.7293286755655957, + "language_loss": 0.73529112, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.12164307, + "step": 8628, + "time_per_iteration": 3.9418892860412598 + }, + { + "auxiliary_loss_clip": 0.0642761, + "auxiliary_loss_mlp": 0.01268136, + "balance_loss_clip": 0.06276411, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5188035472719074, + "flos": 19865203401600.0, + "grad_norm": 1.86469754984735, + "language_loss": 0.77606678, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.85302424, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.1229248, + "step": 8629, + "time_per_iteration": 2.536813974380493 + }, + { + "auxiliary_loss_clip": 0.06431592, + "auxiliary_loss_mlp": 0.01268458, + "balance_loss_clip": 0.06276736, + "balance_loss_mlp": 0.01255923, + "epoch": 0.5188636705245754, + "flos": 21144032415360.0, + "grad_norm": 2.295438438275443, + "language_loss": 0.74746907, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.82446957, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12536621, + "step": 8630, + "time_per_iteration": 2.5338122844696045 + }, + { + "auxiliary_loss_clip": 0.06432383, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06276915, + "balance_loss_mlp": 0.01253636, + "epoch": 0.5189237937772434, + "flos": 22426467154560.0, + "grad_norm": 1.6718033524216807, + "language_loss": 0.80433989, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.88134158, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.14141846, + "step": 8631, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.06431842, + "auxiliary_loss_mlp": 0.01268253, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01255187, + "epoch": 0.5189839170299113, + "flos": 25453647936000.0, + "grad_norm": 1.4304618482279687, + "language_loss": 0.74388516, + "learning_rate": 1.974101522024942e-06, + "loss": 0.82088614, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1305542, + "step": 8632, + "time_per_iteration": 2.5850229263305664 + }, + { + "auxiliary_loss_clip": 0.06424779, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06277869, + "balance_loss_mlp": 0.01255865, + "epoch": 0.5190440402825793, + "flos": 18593585838720.0, + "grad_norm": 1.7732237266140687, + "language_loss": 0.79105878, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.86799526, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.13018799, + "step": 8633, + "time_per_iteration": 3.944106340408325 + }, + { + "auxiliary_loss_clip": 0.06433854, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06279819, + "balance_loss_mlp": 0.01253492, + "epoch": 0.5191041635352472, + "flos": 21915170340480.0, + "grad_norm": 1.7747709828095277, + "language_loss": 0.80929339, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.88628888, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12200928, + "step": 8634, + "time_per_iteration": 2.4922289848327637 + }, + { + "auxiliary_loss_clip": 0.0643179, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06280308, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5191642867879153, + "flos": 27535536080640.0, + "grad_norm": 1.4623629686344204, + "language_loss": 0.69064617, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.76765239, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11846924, + "step": 8635, + "time_per_iteration": 2.5806636810302734 + }, + { + "auxiliary_loss_clip": 0.06433641, + "auxiliary_loss_mlp": 0.01269766, + "balance_loss_clip": 0.06278556, + "balance_loss_mlp": 0.01257356, + "epoch": 0.5192244100405832, + "flos": 15711489601920.0, + "grad_norm": 1.5680222184402974, + "language_loss": 0.77829492, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.85532898, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.12414551, + "step": 8636, + "time_per_iteration": 2.5346691608428955 + }, + { + "auxiliary_loss_clip": 0.0643746, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01256122, + "epoch": 0.5192845332932512, + "flos": 12061903092480.0, + "grad_norm": 2.0443106284945016, + "language_loss": 0.72005326, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7971167, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.12768555, + "step": 8637, + "time_per_iteration": 2.5669779777526855 + }, + { + "auxiliary_loss_clip": 0.06428012, + "auxiliary_loss_mlp": 0.01270032, + "balance_loss_clip": 0.06279644, + "balance_loss_mlp": 0.01257724, + "epoch": 0.5193446565459191, + "flos": 18959211129600.0, + "grad_norm": 2.0277263511036625, + "language_loss": 0.76600313, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.8429836, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12298584, + "step": 8638, + "time_per_iteration": 2.4836151599884033 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.012673, + "balance_loss_clip": 0.06276545, + "balance_loss_mlp": 0.0125548, + "epoch": 0.5194047797985871, + "flos": 20381028336000.0, + "grad_norm": 1.8081920937255338, + "language_loss": 0.74863744, + "learning_rate": 1.971375543740272e-06, + "loss": 0.82558322, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11828613, + "step": 8639, + "time_per_iteration": 2.508589029312134 + }, + { + "auxiliary_loss_clip": 0.06432048, + "auxiliary_loss_mlp": 0.01270657, + "balance_loss_clip": 0.06280512, + "balance_loss_mlp": 0.01258045, + "epoch": 0.519464903051255, + "flos": 24359916591360.0, + "grad_norm": 1.679129082437046, + "language_loss": 0.77792585, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.85495287, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12628174, + "step": 8640, + "time_per_iteration": 4.030183553695679 + }, + { + "auxiliary_loss_clip": 0.06430673, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06282452, + "balance_loss_mlp": 0.01256482, + "epoch": 0.519525026303923, + "flos": 14066657953920.0, + "grad_norm": 1.8086687453592558, + "language_loss": 0.66518152, + "learning_rate": 1.97059670234927e-06, + "loss": 0.74217027, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11700439, + "step": 8641, + "time_per_iteration": 2.471047878265381 + }, + { + "auxiliary_loss_clip": 0.06427969, + "auxiliary_loss_mlp": 0.01270672, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01259228, + "epoch": 0.519585149556591, + "flos": 28842722501760.0, + "grad_norm": 1.7536948571823123, + "language_loss": 0.76330602, + "learning_rate": 1.97020728331885e-06, + "loss": 0.84029233, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11456299, + "step": 8642, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.06428998, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06280753, + "balance_loss_mlp": 0.01254374, + "epoch": 0.519645272809259, + "flos": 25379826888960.0, + "grad_norm": 21.827473826572724, + "language_loss": 0.83256245, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.90951395, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11767578, + "step": 8643, + "time_per_iteration": 2.547438621520996 + }, + { + "auxiliary_loss_clip": 0.06436369, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06280598, + "balance_loss_mlp": 0.01255508, + "epoch": 0.519705396061927, + "flos": 25379659180800.0, + "grad_norm": 1.5731350893002956, + "language_loss": 0.70531744, + "learning_rate": 1.969428448662004e-06, + "loss": 0.78236687, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.13067627, + "step": 8644, + "time_per_iteration": 2.5876879692077637 + }, + { + "auxiliary_loss_clip": 0.06430183, + "auxiliary_loss_mlp": 0.01266621, + "balance_loss_clip": 0.0627798, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5197655193145949, + "flos": 28483889391360.0, + "grad_norm": 1.5934186274855324, + "language_loss": 0.80385697, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.88082504, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11804199, + "step": 8645, + "time_per_iteration": 2.574620246887207 + }, + { + "auxiliary_loss_clip": 0.06430401, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01258898, + "epoch": 0.5198256425672629, + "flos": 20014983774720.0, + "grad_norm": 1.690489867798711, + "language_loss": 0.78455305, + "learning_rate": 1.968649618642264e-06, + "loss": 0.86156821, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12207031, + "step": 8646, + "time_per_iteration": 2.6401519775390625 + }, + { + "auxiliary_loss_clip": 0.06429573, + "auxiliary_loss_mlp": 0.01268342, + "balance_loss_clip": 0.06279829, + "balance_loss_mlp": 0.01256243, + "epoch": 0.5198857658199308, + "flos": 19835043131520.0, + "grad_norm": 2.3656488760516132, + "language_loss": 0.66367847, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.74065757, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12091064, + "step": 8647, + "time_per_iteration": 2.599353551864624 + }, + { + "auxiliary_loss_clip": 0.06438218, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.06282619, + "balance_loss_mlp": 0.0125462, + "epoch": 0.5199458890725989, + "flos": 24468761445120.0, + "grad_norm": 1.778197055342432, + "language_loss": 0.71491444, + "learning_rate": 1.967870793377763e-06, + "loss": 0.79197794, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.13513184, + "step": 8648, + "time_per_iteration": 2.572368860244751 + }, + { + "auxiliary_loss_clip": 0.06438164, + "auxiliary_loss_mlp": 0.01268937, + "balance_loss_clip": 0.06285776, + "balance_loss_mlp": 0.01255884, + "epoch": 0.5200060123252668, + "flos": 23411605207680.0, + "grad_norm": 2.1583755088943875, + "language_loss": 0.64699459, + "learning_rate": 1.967481382565642e-06, + "loss": 0.72406554, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13031006, + "step": 8649, + "time_per_iteration": 2.5117433071136475 + }, + { + "auxiliary_loss_clip": 0.06439677, + "auxiliary_loss_mlp": 0.01274224, + "balance_loss_clip": 0.06281672, + "balance_loss_mlp": 0.01260778, + "epoch": 0.5200661355779348, + "flos": 17207002074240.0, + "grad_norm": 5.161359302041442, + "language_loss": 0.70409989, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.78123897, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.13446045, + "step": 8650, + "time_per_iteration": 2.5144400596618652 + }, + { + "auxiliary_loss_clip": 0.06431218, + "auxiliary_loss_mlp": 0.01268732, + "balance_loss_clip": 0.06279574, + "balance_loss_mlp": 0.01256936, + "epoch": 0.5201262588306027, + "flos": 18520980675840.0, + "grad_norm": 1.6145243882323275, + "language_loss": 0.78030795, + "learning_rate": 1.966702564655496e-06, + "loss": 0.85730743, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11791992, + "step": 8651, + "time_per_iteration": 2.467643976211548 + }, + { + "auxiliary_loss_clip": 0.06437017, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06283189, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5201863820832707, + "flos": 18624458868480.0, + "grad_norm": 1.6266187944599841, + "language_loss": 0.79176587, + "learning_rate": 1.966313157587003e-06, + "loss": 0.86880493, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.13171387, + "step": 8652, + "time_per_iteration": 2.5569629669189453 + }, + { + "auxiliary_loss_clip": 0.06434878, + "auxiliary_loss_mlp": 0.01268954, + "balance_loss_clip": 0.0628317, + "balance_loss_mlp": 0.01255919, + "epoch": 0.5202465053359386, + "flos": 22863817140480.0, + "grad_norm": 1.9022927985659936, + "language_loss": 0.70460284, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.78164113, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13049316, + "step": 8653, + "time_per_iteration": 2.5013556480407715 + }, + { + "auxiliary_loss_clip": 0.06435711, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 0.06279919, + "balance_loss_mlp": 0.01257124, + "epoch": 0.5203066285886067, + "flos": 21988068992640.0, + "grad_norm": 1.7386916801416297, + "language_loss": 0.78877962, + "learning_rate": 1.965534347297008e-06, + "loss": 0.86584258, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13452148, + "step": 8654, + "time_per_iteration": 2.5205516815185547 + }, + { + "auxiliary_loss_clip": 0.06439671, + "auxiliary_loss_mlp": 0.01271817, + "balance_loss_clip": 0.06283241, + "balance_loss_mlp": 0.01258763, + "epoch": 0.5203667518412746, + "flos": 20240094568320.0, + "grad_norm": 1.7537160659546802, + "language_loss": 0.84438735, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.92150223, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.13043213, + "step": 8655, + "time_per_iteration": 2.523545026779175 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264722, + "balance_loss_clip": 0.06279121, + "balance_loss_mlp": 0.01253027, + "epoch": 0.5204268750939426, + "flos": 15710860696320.0, + "grad_norm": 2.477748600032862, + "language_loss": 0.66631675, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.74324131, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11688232, + "step": 8656, + "time_per_iteration": 2.504314661026001 + }, + { + "auxiliary_loss_clip": 0.06430535, + "auxiliary_loss_mlp": 0.01266767, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01254203, + "epoch": 0.5204869983466105, + "flos": 27456096810240.0, + "grad_norm": 1.7743424381892883, + "language_loss": 0.73250526, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.80947828, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12567139, + "step": 8657, + "time_per_iteration": 2.547746419906616 + }, + { + "auxiliary_loss_clip": 0.06431027, + "auxiliary_loss_mlp": 0.01268378, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01255611, + "epoch": 0.5205471215992785, + "flos": 20601820644480.0, + "grad_norm": 1.9136699042437477, + "language_loss": 0.71553123, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.79252529, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12756348, + "step": 8658, + "time_per_iteration": 2.523796796798706 + }, + { + "auxiliary_loss_clip": 0.06426262, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06275812, + "balance_loss_mlp": 0.01255669, + "epoch": 0.5206072448519465, + "flos": 22134537129600.0, + "grad_norm": 1.8507369766537312, + "language_loss": 0.83638287, + "learning_rate": 1.963587344701897e-06, + "loss": 0.91332769, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12554932, + "step": 8659, + "time_per_iteration": 2.5169432163238525 + }, + { + "auxiliary_loss_clip": 0.06437267, + "auxiliary_loss_mlp": 0.01269684, + "balance_loss_clip": 0.06277223, + "balance_loss_mlp": 0.01255587, + "epoch": 0.5206673681046144, + "flos": 18335924933760.0, + "grad_norm": 2.050641453841446, + "language_loss": 0.75738013, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.83444965, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.14093018, + "step": 8660, + "time_per_iteration": 2.557415723800659 + }, + { + "auxiliary_loss_clip": 0.06428091, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06277187, + "balance_loss_mlp": 0.01256105, + "epoch": 0.5207274913572825, + "flos": 20236488842880.0, + "grad_norm": 1.6215362458867588, + "language_loss": 0.77692747, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.85389173, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12231445, + "step": 8661, + "time_per_iteration": 2.509428024291992 + }, + { + "auxiliary_loss_clip": 0.06431398, + "auxiliary_loss_mlp": 0.01266033, + "balance_loss_clip": 0.06278183, + "balance_loss_mlp": 0.01254112, + "epoch": 0.5207876146099504, + "flos": 22133530880640.0, + "grad_norm": 1.7321078317719976, + "language_loss": 0.70359308, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.78056741, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.1192627, + "step": 8662, + "time_per_iteration": 2.5810325145721436 + }, + { + "auxiliary_loss_clip": 0.0642472, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06276304, + "balance_loss_mlp": 0.01257169, + "epoch": 0.5208477378626184, + "flos": 23885781863040.0, + "grad_norm": 1.845579934529664, + "language_loss": 0.70074278, + "learning_rate": 1.962029767391098e-06, + "loss": 0.77769035, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12872314, + "step": 8663, + "time_per_iteration": 2.528122901916504 + }, + { + "auxiliary_loss_clip": 0.06433125, + "auxiliary_loss_mlp": 0.01272195, + "balance_loss_clip": 0.06282328, + "balance_loss_mlp": 0.01259619, + "epoch": 0.5209078611152863, + "flos": 20968158695040.0, + "grad_norm": 1.5162641399491859, + "language_loss": 0.77111858, + "learning_rate": 1.961640376626072e-06, + "loss": 0.84817183, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12591553, + "step": 8664, + "time_per_iteration": 3.9675118923187256 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01274545, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01261641, + "epoch": 0.5209679843679543, + "flos": 20674006536960.0, + "grad_norm": 1.9585914111684504, + "language_loss": 0.76477247, + "learning_rate": 1.961250987315646e-06, + "loss": 0.84178591, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12915039, + "step": 8665, + "time_per_iteration": 2.541412830352783 + }, + { + "auxiliary_loss_clip": 0.06427725, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 0.06278466, + "balance_loss_mlp": 0.01260593, + "epoch": 0.5210281076206222, + "flos": 20233050825600.0, + "grad_norm": 1.6923585849410518, + "language_loss": 0.72734976, + "learning_rate": 1.960861599474586e-06, + "loss": 0.80435228, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11920166, + "step": 8666, + "time_per_iteration": 2.4996509552001953 + }, + { + "auxiliary_loss_clip": 0.06442789, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 0.0628055, + "balance_loss_mlp": 0.01256199, + "epoch": 0.5210882308732903, + "flos": 16075395884160.0, + "grad_norm": 2.8085912573953093, + "language_loss": 0.69292629, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.77006412, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.14794922, + "step": 8667, + "time_per_iteration": 3.966068744659424 + }, + { + "auxiliary_loss_clip": 0.06427799, + "auxiliary_loss_mlp": 0.0127319, + "balance_loss_clip": 0.06280097, + "balance_loss_mlp": 0.01261793, + "epoch": 0.5211483541259582, + "flos": 24831954967680.0, + "grad_norm": 1.4529640974986662, + "language_loss": 0.8142345, + "learning_rate": 1.960082828259629e-06, + "loss": 0.89124429, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11401367, + "step": 8668, + "time_per_iteration": 2.531757116317749 + }, + { + "auxiliary_loss_clip": 0.06428734, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06277529, + "balance_loss_mlp": 0.01253485, + "epoch": 0.5212084773786262, + "flos": 20375997091200.0, + "grad_norm": 2.3545461183864793, + "language_loss": 0.6399523, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.71689939, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12493896, + "step": 8669, + "time_per_iteration": 2.582458019256592 + }, + { + "auxiliary_loss_clip": 0.06433244, + "auxiliary_loss_mlp": 0.01270095, + "balance_loss_clip": 0.06281579, + "balance_loss_mlp": 0.01257846, + "epoch": 0.5212686006312941, + "flos": 23151596388480.0, + "grad_norm": 1.5489696479352357, + "language_loss": 0.66586244, + "learning_rate": 1.959304063099325e-06, + "loss": 0.74289578, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12261963, + "step": 8670, + "time_per_iteration": 2.5730559825897217 + }, + { + "auxiliary_loss_clip": 0.0642543, + "auxiliary_loss_mlp": 0.01273699, + "balance_loss_clip": 0.06278989, + "balance_loss_mlp": 0.01262195, + "epoch": 0.5213287238839621, + "flos": 27780073822080.0, + "grad_norm": 2.549693242202028, + "language_loss": 0.76187384, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.83886516, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11505127, + "step": 8671, + "time_per_iteration": 2.5233168601989746 + }, + { + "auxiliary_loss_clip": 0.064327, + "auxiliary_loss_mlp": 0.01274872, + "balance_loss_clip": 0.06278658, + "balance_loss_mlp": 0.01262534, + "epoch": 0.5213888471366301, + "flos": 19943762204160.0, + "grad_norm": 1.8121341163261586, + "language_loss": 0.78893673, + "learning_rate": 1.958525304111796e-06, + "loss": 0.86601251, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.12341309, + "step": 8672, + "time_per_iteration": 3.9492485523223877 + }, + { + "auxiliary_loss_clip": 0.06431769, + "auxiliary_loss_mlp": 0.01269371, + "balance_loss_clip": 0.06282303, + "balance_loss_mlp": 0.01257957, + "epoch": 0.521448970389298, + "flos": 16988389971840.0, + "grad_norm": 2.0794497937850327, + "language_loss": 0.72609621, + "learning_rate": 1.958135926969736e-06, + "loss": 0.80310762, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11425781, + "step": 8673, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.06430827, + "auxiliary_loss_mlp": 0.01267899, + "balance_loss_clip": 0.06280996, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5215090936419661, + "flos": 18995744309760.0, + "grad_norm": 1.6692646430310563, + "language_loss": 0.75224721, + "learning_rate": 1.957746551415166e-06, + "loss": 0.82923448, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11755371, + "step": 8674, + "time_per_iteration": 2.528323173522949 + }, + { + "auxiliary_loss_clip": 0.06432723, + "auxiliary_loss_mlp": 0.01271657, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258812, + "epoch": 0.521569216894634, + "flos": 16148923441920.0, + "grad_norm": 2.0098628900715694, + "language_loss": 0.86161578, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.93865955, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.128479, + "step": 8675, + "time_per_iteration": 2.486656665802002 + }, + { + "auxiliary_loss_clip": 0.06328152, + "auxiliary_loss_mlp": 0.0125317, + "balance_loss_clip": 0.06263625, + "balance_loss_mlp": 0.01251218, + "epoch": 0.521629340147302, + "flos": 57596054296320.0, + "grad_norm": 0.8389911483177593, + "language_loss": 0.62711406, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.70292729, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01950073, + "step": 8676, + "time_per_iteration": 3.09920597076416 + }, + { + "auxiliary_loss_clip": 0.06427533, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06277495, + "balance_loss_mlp": 0.01252839, + "epoch": 0.5216894633999699, + "flos": 26804117790720.0, + "grad_norm": 1.458201451867465, + "language_loss": 0.69111204, + "learning_rate": 1.956578434424046e-06, + "loss": 0.7680313, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11572266, + "step": 8677, + "time_per_iteration": 2.5477073192596436 + }, + { + "auxiliary_loss_clip": 0.06427766, + "auxiliary_loss_mlp": 0.01266893, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01255127, + "epoch": 0.5217495866526379, + "flos": 26365803482880.0, + "grad_norm": 1.7210863244717929, + "language_loss": 0.65549737, + "learning_rate": 1.956189065367086e-06, + "loss": 0.73244393, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11749268, + "step": 8678, + "time_per_iteration": 2.566591739654541 + }, + { + "auxiliary_loss_clip": 0.06434263, + "auxiliary_loss_mlp": 0.01268698, + "balance_loss_clip": 0.06280728, + "balance_loss_mlp": 0.01255531, + "epoch": 0.5218097099053058, + "flos": 23590329966720.0, + "grad_norm": 2.9370978110790507, + "language_loss": 0.68504936, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.762079, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1317749, + "step": 8679, + "time_per_iteration": 2.510748863220215 + }, + { + "auxiliary_loss_clip": 0.06433919, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.06281881, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5218698331579739, + "flos": 18083253346560.0, + "grad_norm": 1.6397075137651071, + "language_loss": 0.67471087, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.7517339, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12036133, + "step": 8680, + "time_per_iteration": 3.9219276905059814 + }, + { + "auxiliary_loss_clip": 0.06433384, + "auxiliary_loss_mlp": 0.01271487, + "balance_loss_clip": 0.06281422, + "balance_loss_mlp": 0.01259595, + "epoch": 0.5219299564106418, + "flos": 19287129283200.0, + "grad_norm": 1.8649470617465917, + "language_loss": 0.83311534, + "learning_rate": 1.955020968223156e-06, + "loss": 0.91016412, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11889648, + "step": 8681, + "time_per_iteration": 2.516465663909912 + }, + { + "auxiliary_loss_clip": 0.06426493, + "auxiliary_loss_mlp": 0.0126523, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01253792, + "epoch": 0.5219900796633098, + "flos": 26658613975680.0, + "grad_norm": 1.6454147062415487, + "language_loss": 0.77514279, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.85205996, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11437988, + "step": 8682, + "time_per_iteration": 2.554325819015503 + }, + { + "auxiliary_loss_clip": 0.06427193, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01254949, + "epoch": 0.5220502029159777, + "flos": 34321148225280.0, + "grad_norm": 1.635540508166305, + "language_loss": 0.693317, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.77025378, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11529541, + "step": 8683, + "time_per_iteration": 2.6571457386016846 + }, + { + "auxiliary_loss_clip": 0.06430393, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 0.06278116, + "balance_loss_mlp": 0.01257629, + "epoch": 0.5221103261686457, + "flos": 22161804433920.0, + "grad_norm": 1.5499745188789709, + "language_loss": 0.76029563, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.83729851, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.12255859, + "step": 8684, + "time_per_iteration": 2.5611672401428223 + }, + { + "auxiliary_loss_clip": 0.06422482, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06276357, + "balance_loss_mlp": 0.01256123, + "epoch": 0.5221704494213137, + "flos": 19214440266240.0, + "grad_norm": 1.9689133598672337, + "language_loss": 0.75993264, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.83683455, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11590576, + "step": 8685, + "time_per_iteration": 2.592336416244507 + }, + { + "auxiliary_loss_clip": 0.06433201, + "auxiliary_loss_mlp": 0.01267661, + "balance_loss_clip": 0.06280906, + "balance_loss_mlp": 0.01255549, + "epoch": 0.5222305726739817, + "flos": 19360069862400.0, + "grad_norm": 1.8592295664699974, + "language_loss": 0.81054503, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.88755369, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12103271, + "step": 8686, + "time_per_iteration": 2.529801845550537 + }, + { + "auxiliary_loss_clip": 0.06419135, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06276063, + "balance_loss_mlp": 0.01255021, + "epoch": 0.5222906959266497, + "flos": 27821554392960.0, + "grad_norm": 1.7724306724007597, + "language_loss": 0.7060039, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.78286076, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11535645, + "step": 8687, + "time_per_iteration": 2.580845594406128 + }, + { + "auxiliary_loss_clip": 0.06421649, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06277607, + "balance_loss_mlp": 0.01256297, + "epoch": 0.5223508191793176, + "flos": 12717781326720.0, + "grad_norm": 2.573153086937961, + "language_loss": 0.82975262, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.90663946, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10736084, + "step": 8688, + "time_per_iteration": 2.479219436645508 + }, + { + "auxiliary_loss_clip": 0.06427407, + "auxiliary_loss_mlp": 0.01268772, + "balance_loss_clip": 0.06280096, + "balance_loss_mlp": 0.01256345, + "epoch": 0.5224109424319856, + "flos": 15637584700800.0, + "grad_norm": 2.221621058495187, + "language_loss": 0.74186772, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.81882954, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12426758, + "step": 8689, + "time_per_iteration": 2.519578456878662 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01264867, + "balance_loss_clip": 0.06277696, + "balance_loss_mlp": 0.01253858, + "epoch": 0.5224710656846535, + "flos": 15747687365760.0, + "grad_norm": 1.8795858532487468, + "language_loss": 0.8292582, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.90614116, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11022949, + "step": 8690, + "time_per_iteration": 2.4795632362365723 + }, + { + "auxiliary_loss_clip": 0.06425175, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.06276759, + "balance_loss_mlp": 0.0125542, + "epoch": 0.5225311889373215, + "flos": 26038136891520.0, + "grad_norm": 1.8859654188369186, + "language_loss": 0.79290485, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.86983275, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12200928, + "step": 8691, + "time_per_iteration": 2.554316520690918 + }, + { + "auxiliary_loss_clip": 0.06425714, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01255044, + "epoch": 0.5225913121899894, + "flos": 18375183371520.0, + "grad_norm": 2.097465391576973, + "language_loss": 0.76909935, + "learning_rate": 1.950738079725646e-06, + "loss": 0.84603524, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12835693, + "step": 8692, + "time_per_iteration": 2.508985757827759 + }, + { + "auxiliary_loss_clip": 0.06422729, + "auxiliary_loss_mlp": 0.01266471, + "balance_loss_clip": 0.06279368, + "balance_loss_mlp": 0.01254872, + "epoch": 0.5226514354426575, + "flos": 29280407904000.0, + "grad_norm": 1.831817200061648, + "language_loss": 0.73045087, + "learning_rate": 1.950348737138691e-06, + "loss": 0.80734289, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11608887, + "step": 8693, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06430539, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.06276198, + "balance_loss_mlp": 0.01252802, + "epoch": 0.5227115586953254, + "flos": 22859330947200.0, + "grad_norm": 2.034375584307348, + "language_loss": 0.8244431, + "learning_rate": 1.949959396434517e-06, + "loss": 0.90140283, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.12640381, + "step": 8694, + "time_per_iteration": 2.511063814163208 + }, + { + "auxiliary_loss_clip": 0.06334698, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06270603, + "balance_loss_mlp": 0.01262187, + "epoch": 0.5227716819479934, + "flos": 57491695635840.0, + "grad_norm": 0.936740482735722, + "language_loss": 0.55577236, + "learning_rate": 1.949570057627888e-06, + "loss": 0.63175929, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01809692, + "step": 8695, + "time_per_iteration": 3.201383113861084 + }, + { + "auxiliary_loss_clip": 0.06426679, + "auxiliary_loss_mlp": 0.01263614, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01252074, + "epoch": 0.5228318052006613, + "flos": 13813357461120.0, + "grad_norm": 1.622631737546212, + "language_loss": 0.73801219, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.81491518, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11547852, + "step": 8696, + "time_per_iteration": 2.542386770248413 + }, + { + "auxiliary_loss_clip": 0.06429457, + "auxiliary_loss_mlp": 0.01266915, + "balance_loss_clip": 0.06279002, + "balance_loss_mlp": 0.01254589, + "epoch": 0.5228919284533293, + "flos": 15601596572160.0, + "grad_norm": 1.5536675741091566, + "language_loss": 0.71410191, + "learning_rate": 1.948791385766319e-06, + "loss": 0.79106563, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12341309, + "step": 8697, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.06423891, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.0627815, + "balance_loss_mlp": 0.0125453, + "epoch": 0.5229520517059973, + "flos": 22497982214400.0, + "grad_norm": 1.650008991843684, + "language_loss": 0.80845451, + "learning_rate": 1.948402052740906e-06, + "loss": 0.88535196, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11328125, + "step": 8698, + "time_per_iteration": 2.5636022090911865 + }, + { + "auxiliary_loss_clip": 0.06426111, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06278659, + "balance_loss_mlp": 0.01254908, + "epoch": 0.5230121749586653, + "flos": 22097416970880.0, + "grad_norm": 3.7708298280456023, + "language_loss": 0.74449289, + "learning_rate": 1.948012721672093e-06, + "loss": 0.82142115, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1182251, + "step": 8699, + "time_per_iteration": 2.531606912612915 + }, + { + "auxiliary_loss_clip": 0.06432469, + "auxiliary_loss_mlp": 0.0126789, + "balance_loss_clip": 0.06277843, + "balance_loss_mlp": 0.01255325, + "epoch": 0.5230722982113333, + "flos": 22133656661760.0, + "grad_norm": 1.5875927962566738, + "language_loss": 0.73680252, + "learning_rate": 1.947623392574642e-06, + "loss": 0.81380606, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12561035, + "step": 8700, + "time_per_iteration": 2.542734146118164 + }, + { + "auxiliary_loss_clip": 0.06429377, + "auxiliary_loss_mlp": 0.01275322, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01263127, + "epoch": 0.5231324214640012, + "flos": 25016214096000.0, + "grad_norm": 1.8967545071734793, + "language_loss": 0.67123276, + "learning_rate": 1.947234065463318e-06, + "loss": 0.74827981, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12207031, + "step": 8701, + "time_per_iteration": 2.543332815170288 + }, + { + "auxiliary_loss_clip": 0.06421816, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06274643, + "balance_loss_mlp": 0.01254696, + "epoch": 0.5231925447166692, + "flos": 25747842021120.0, + "grad_norm": 1.6886589098280236, + "language_loss": 0.66874444, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.74562299, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11340332, + "step": 8702, + "time_per_iteration": 2.5511581897735596 + }, + { + "auxiliary_loss_clip": 0.06426294, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06277906, + "balance_loss_mlp": 0.01255906, + "epoch": 0.5232526679693371, + "flos": 21440322852480.0, + "grad_norm": 3.970152828937024, + "language_loss": 0.76360488, + "learning_rate": 1.946455417258101e-06, + "loss": 0.84055138, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12457275, + "step": 8703, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.06434231, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01255471, + "epoch": 0.5233127912220051, + "flos": 35307082892160.0, + "grad_norm": 2.0695890072195344, + "language_loss": 0.77554905, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.85257214, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1260376, + "step": 8704, + "time_per_iteration": 4.093170642852783 + }, + { + "auxiliary_loss_clip": 0.06425636, + "auxiliary_loss_mlp": 0.01277604, + "balance_loss_clip": 0.06278675, + "balance_loss_mlp": 0.012665, + "epoch": 0.523372914474673, + "flos": 17056257379200.0, + "grad_norm": 1.7488135640398956, + "language_loss": 0.78527272, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.86230516, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11108398, + "step": 8705, + "time_per_iteration": 2.487792730331421 + }, + { + "auxiliary_loss_clip": 0.06433457, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06280416, + "balance_loss_mlp": 0.0125221, + "epoch": 0.5234330377273411, + "flos": 18412303530240.0, + "grad_norm": 1.822089906899261, + "language_loss": 0.69768077, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.77466154, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12408447, + "step": 8706, + "time_per_iteration": 2.52415132522583 + }, + { + "auxiliary_loss_clip": 0.06339821, + "auxiliary_loss_mlp": 0.01262622, + "balance_loss_clip": 0.06275055, + "balance_loss_mlp": 0.01260974, + "epoch": 0.523493160980009, + "flos": 65872426429440.0, + "grad_norm": 0.668265925718786, + "language_loss": 0.52398658, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.60001105, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.01651001, + "step": 8707, + "time_per_iteration": 4.596412658691406 + }, + { + "auxiliary_loss_clip": 0.06431062, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06282815, + "balance_loss_mlp": 0.01255829, + "epoch": 0.523553284232677, + "flos": 21878595233280.0, + "grad_norm": 1.763620445487087, + "language_loss": 0.75447237, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.83145583, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11450195, + "step": 8708, + "time_per_iteration": 2.515388011932373 + }, + { + "auxiliary_loss_clip": 0.06425884, + "auxiliary_loss_mlp": 0.01269189, + "balance_loss_clip": 0.06279897, + "balance_loss_mlp": 0.01258252, + "epoch": 0.5236134074853449, + "flos": 20854156815360.0, + "grad_norm": 1.5562083670602136, + "language_loss": 0.78041285, + "learning_rate": 1.944119521844849e-06, + "loss": 0.85736358, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.109375, + "step": 8709, + "time_per_iteration": 2.569312810897827 + }, + { + "auxiliary_loss_clip": 0.06434496, + "auxiliary_loss_mlp": 0.01269997, + "balance_loss_clip": 0.062785, + "balance_loss_mlp": 0.01256872, + "epoch": 0.5236735307380129, + "flos": 25527510910080.0, + "grad_norm": 1.8691534112354709, + "language_loss": 0.83896649, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.91601145, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.13128662, + "step": 8710, + "time_per_iteration": 2.5364856719970703 + }, + { + "auxiliary_loss_clip": 0.06424439, + "auxiliary_loss_mlp": 0.01271523, + "balance_loss_clip": 0.06278566, + "balance_loss_mlp": 0.01260347, + "epoch": 0.523733653990681, + "flos": 23589281790720.0, + "grad_norm": 1.796806294076298, + "language_loss": 0.69453466, + "learning_rate": 1.943340906834908e-06, + "loss": 0.77149427, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11181641, + "step": 8711, + "time_per_iteration": 2.5488204956054688 + }, + { + "auxiliary_loss_clip": 0.06423855, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06275582, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5237937772433489, + "flos": 21112698188160.0, + "grad_norm": 1.676774757059823, + "language_loss": 0.82997072, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.90688783, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11322021, + "step": 8712, + "time_per_iteration": 4.064100980758667 + }, + { + "auxiliary_loss_clip": 0.0643232, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06279981, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5238539004960169, + "flos": 19179081043200.0, + "grad_norm": 1.8094880941691576, + "language_loss": 0.6993227, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.77635783, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.121521, + "step": 8713, + "time_per_iteration": 2.544586420059204 + }, + { + "auxiliary_loss_clip": 0.06435391, + "auxiliary_loss_mlp": 0.01268239, + "balance_loss_clip": 0.06280154, + "balance_loss_mlp": 0.01254834, + "epoch": 0.5239140237486848, + "flos": 17892914797440.0, + "grad_norm": 2.8365689324721597, + "language_loss": 0.76947498, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.84651124, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.13409424, + "step": 8714, + "time_per_iteration": 2.5225958824157715 + }, + { + "auxiliary_loss_clip": 0.06430446, + "auxiliary_loss_mlp": 0.01267137, + "balance_loss_clip": 0.06279821, + "balance_loss_mlp": 0.01255085, + "epoch": 0.5239741470013528, + "flos": 17936072449920.0, + "grad_norm": 1.8206248729771282, + "language_loss": 0.76218581, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.83916163, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12060547, + "step": 8715, + "time_per_iteration": 2.479482650756836 + }, + { + "auxiliary_loss_clip": 0.06428694, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06280876, + "balance_loss_mlp": 0.01253537, + "epoch": 0.5240342702540207, + "flos": 31001408513280.0, + "grad_norm": 1.518077309755953, + "language_loss": 0.71405065, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.79099017, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1171875, + "step": 8716, + "time_per_iteration": 2.6313345432281494 + }, + { + "auxiliary_loss_clip": 0.06429261, + "auxiliary_loss_mlp": 0.01264727, + "balance_loss_clip": 0.06279399, + "balance_loss_mlp": 0.012541, + "epoch": 0.5240943935066887, + "flos": 25011308632320.0, + "grad_norm": 2.053994478361076, + "language_loss": 0.87371016, + "learning_rate": 1.941005113841926e-06, + "loss": 0.95065004, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10626221, + "step": 8717, + "time_per_iteration": 2.5242137908935547 + }, + { + "auxiliary_loss_clip": 0.06427871, + "auxiliary_loss_mlp": 0.01272314, + "balance_loss_clip": 0.06276905, + "balance_loss_mlp": 0.01260184, + "epoch": 0.5241545167593566, + "flos": 23665786168320.0, + "grad_norm": 1.9379813616750423, + "language_loss": 0.62001824, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.69702005, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.12139893, + "step": 8718, + "time_per_iteration": 2.5543830394744873 + }, + { + "auxiliary_loss_clip": 0.06436223, + "auxiliary_loss_mlp": 0.01271154, + "balance_loss_clip": 0.06282552, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5242146400120247, + "flos": 23406490108800.0, + "grad_norm": 1.965252740565909, + "language_loss": 0.72457337, + "learning_rate": 1.940226533916872e-06, + "loss": 0.80164713, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.12145996, + "step": 8719, + "time_per_iteration": 3.9948794841766357 + }, + { + "auxiliary_loss_clip": 0.06428128, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.0628122, + "balance_loss_mlp": 0.01256983, + "epoch": 0.5242747632646926, + "flos": 17754873995520.0, + "grad_norm": 2.179080036180393, + "language_loss": 0.73360658, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.81056702, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10919189, + "step": 8720, + "time_per_iteration": 2.561491012573242 + }, + { + "auxiliary_loss_clip": 0.06431387, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06281313, + "balance_loss_mlp": 0.01256227, + "epoch": 0.5243348865173606, + "flos": 32605849693440.0, + "grad_norm": 1.7043415367979953, + "language_loss": 0.70633399, + "learning_rate": 1.939447963058281e-06, + "loss": 0.78333569, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12561035, + "step": 8721, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.06427501, + "auxiliary_loss_mlp": 0.01269506, + "balance_loss_clip": 0.06277889, + "balance_loss_mlp": 0.01258008, + "epoch": 0.5243950097700285, + "flos": 25491229292160.0, + "grad_norm": 1.669973954204285, + "language_loss": 0.86888224, + "learning_rate": 1.939058681065813e-06, + "loss": 0.94585228, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.1151123, + "step": 8722, + "time_per_iteration": 2.532735586166382 + }, + { + "auxiliary_loss_clip": 0.06423786, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276488, + "balance_loss_mlp": 0.01259041, + "epoch": 0.5244551330226965, + "flos": 15273846126720.0, + "grad_norm": 1.6547564845342364, + "language_loss": 0.80303264, + "learning_rate": 1.938669401384247e-06, + "loss": 0.87997842, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 8723, + "time_per_iteration": 2.519230842590332 + }, + { + "auxiliary_loss_clip": 0.06433833, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06281124, + "balance_loss_mlp": 0.01256286, + "epoch": 0.5245152562753645, + "flos": 22243717399680.0, + "grad_norm": 1.8110090728616772, + "language_loss": 0.75572187, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.83275086, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12780762, + "step": 8724, + "time_per_iteration": 2.503331422805786 + }, + { + "auxiliary_loss_clip": 0.06439602, + "auxiliary_loss_mlp": 0.01267267, + "balance_loss_clip": 0.06280126, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5245753795280325, + "flos": 29434548689280.0, + "grad_norm": 1.6762764466906133, + "language_loss": 0.70858645, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.78565514, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.12835693, + "step": 8725, + "time_per_iteration": 2.6268577575683594 + }, + { + "auxiliary_loss_clip": 0.06331155, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252628, + "epoch": 0.5246355027807005, + "flos": 58853569645440.0, + "grad_norm": 0.7398874669792804, + "language_loss": 0.55689812, + "learning_rate": 1.937501576352568e-06, + "loss": 0.63275951, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.64111328, + "router_z_loss_mlp": 0.02354431, + "step": 8726, + "time_per_iteration": 3.1253981590270996 + }, + { + "auxiliary_loss_clip": 0.06326637, + "auxiliary_loss_mlp": 0.01254365, + "balance_loss_clip": 0.06262497, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5246956260333684, + "flos": 64546792110720.0, + "grad_norm": 0.7865731844335093, + "language_loss": 0.58442128, + "learning_rate": 1.937112306062219e-06, + "loss": 0.66023123, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.02062988, + "step": 8727, + "time_per_iteration": 3.176279306411743 + }, + { + "auxiliary_loss_clip": 0.06432917, + "auxiliary_loss_mlp": 0.01270503, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.01258118, + "epoch": 0.5247557492860364, + "flos": 24540276504960.0, + "grad_norm": 1.4599497814344178, + "language_loss": 0.70513123, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.78216541, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12390137, + "step": 8728, + "time_per_iteration": 2.635087728500366 + }, + { + "auxiliary_loss_clip": 0.06426623, + "auxiliary_loss_mlp": 0.01271129, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258815, + "epoch": 0.5248158725387043, + "flos": 18811946378880.0, + "grad_norm": 1.5300920869777792, + "language_loss": 0.69649124, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.77346873, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.12310791, + "step": 8729, + "time_per_iteration": 2.5286824703216553 + }, + { + "auxiliary_loss_clip": 0.06429707, + "auxiliary_loss_mlp": 0.01272402, + "balance_loss_clip": 0.06276232, + "balance_loss_mlp": 0.01260112, + "epoch": 0.5248759957913723, + "flos": 20961534222720.0, + "grad_norm": 1.931767440888087, + "language_loss": 0.83841878, + "learning_rate": 1.935944509558464e-06, + "loss": 0.91543984, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12280273, + "step": 8730, + "time_per_iteration": 2.50693678855896 + }, + { + "auxiliary_loss_clip": 0.06424531, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01253301, + "epoch": 0.5249361190440403, + "flos": 18666903761280.0, + "grad_norm": 2.7205788659727634, + "language_loss": 0.79795074, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.87484777, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.11877441, + "step": 8731, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.06421249, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.06275119, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5249962422967083, + "flos": 24870249083520.0, + "grad_norm": 2.282421292997204, + "language_loss": 0.83455729, + "learning_rate": 1.935165990676312e-06, + "loss": 0.91145802, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12182617, + "step": 8732, + "time_per_iteration": 2.5442264080047607 + }, + { + "auxiliary_loss_clip": 0.06426094, + "auxiliary_loss_mlp": 0.01271634, + "balance_loss_clip": 0.06276669, + "balance_loss_mlp": 0.01259654, + "epoch": 0.5250563655493762, + "flos": 15267179727360.0, + "grad_norm": 1.5246135300121169, + "language_loss": 0.77770185, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.85467911, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11975098, + "step": 8733, + "time_per_iteration": 2.5826051235198975 + }, + { + "auxiliary_loss_clip": 0.0643189, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.0627751, + "balance_loss_mlp": 0.01253655, + "epoch": 0.5251164888020442, + "flos": 18631209121920.0, + "grad_norm": 3.9739558224943683, + "language_loss": 0.81671995, + "learning_rate": 1.934387481628208e-06, + "loss": 0.89369977, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.12445068, + "step": 8734, + "time_per_iteration": 2.496502637863159 + }, + { + "auxiliary_loss_clip": 0.0642469, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01253041, + "epoch": 0.5251766120547121, + "flos": 29717632108800.0, + "grad_norm": 1.407036688227265, + "language_loss": 0.77114183, + "learning_rate": 1.933998230828826e-06, + "loss": 0.84803545, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11627197, + "step": 8735, + "time_per_iteration": 2.5745790004730225 + }, + { + "auxiliary_loss_clip": 0.06423082, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01253632, + "epoch": 0.5252367353073801, + "flos": 23446964430720.0, + "grad_norm": 1.5621679512535565, + "language_loss": 0.80604559, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.88292682, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11419678, + "step": 8736, + "time_per_iteration": 2.5257420539855957 + }, + { + "auxiliary_loss_clip": 0.06425665, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06277201, + "balance_loss_mlp": 0.01258334, + "epoch": 0.5252968585600482, + "flos": 30818658758400.0, + "grad_norm": 2.1177707386756697, + "language_loss": 0.70240873, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.77936983, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12097168, + "step": 8737, + "time_per_iteration": 2.5996742248535156 + }, + { + "auxiliary_loss_clip": 0.06423551, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01256564, + "epoch": 0.5253569818127161, + "flos": 20634035339520.0, + "grad_norm": 1.5486622918302246, + "language_loss": 0.7715745, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.84849167, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11608887, + "step": 8738, + "time_per_iteration": 2.5352158546447754 + }, + { + "auxiliary_loss_clip": 0.06323943, + "auxiliary_loss_mlp": 0.01255398, + "balance_loss_clip": 0.06260057, + "balance_loss_mlp": 0.01253626, + "epoch": 0.5254171050653841, + "flos": 63448155302400.0, + "grad_norm": 0.7261228489339219, + "language_loss": 0.54416603, + "learning_rate": 1.932441252806837e-06, + "loss": 0.61995941, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01774597, + "step": 8739, + "time_per_iteration": 3.1277644634246826 + }, + { + "auxiliary_loss_clip": 0.06426128, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276017, + "balance_loss_mlp": 0.01255457, + "epoch": 0.525477228318052, + "flos": 34678136545920.0, + "grad_norm": 1.6647555558701046, + "language_loss": 0.84639645, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.92333221, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11993408, + "step": 8740, + "time_per_iteration": 2.658111572265625 + }, + { + "auxiliary_loss_clip": 0.06423901, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06275214, + "balance_loss_mlp": 0.01251843, + "epoch": 0.52553735157072, + "flos": 17936575574400.0, + "grad_norm": 2.0969213447662156, + "language_loss": 0.69862366, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.77550066, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11938477, + "step": 8741, + "time_per_iteration": 2.4757626056671143 + }, + { + "auxiliary_loss_clip": 0.0642582, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06271701, + "balance_loss_mlp": 0.01254378, + "epoch": 0.5255974748233879, + "flos": 9945326557440.0, + "grad_norm": 2.083494644749303, + "language_loss": 0.66346633, + "learning_rate": 1.931273546137947e-06, + "loss": 0.74039018, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12188721, + "step": 8742, + "time_per_iteration": 2.4912760257720947 + }, + { + "auxiliary_loss_clip": 0.06430671, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254592, + "epoch": 0.5256575980760559, + "flos": 16873256062080.0, + "grad_norm": 2.278792899782439, + "language_loss": 0.62974113, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.7067256, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13195801, + "step": 8743, + "time_per_iteration": 3.8745810985565186 + }, + { + "auxiliary_loss_clip": 0.06328367, + "auxiliary_loss_mlp": 0.01251768, + "balance_loss_clip": 0.06264926, + "balance_loss_mlp": 0.01249956, + "epoch": 0.5257177213287239, + "flos": 62408105297280.0, + "grad_norm": 0.7594186151089873, + "language_loss": 0.54170012, + "learning_rate": 1.930495088031323e-06, + "loss": 0.6175015, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.01808167, + "step": 8744, + "time_per_iteration": 3.2680962085723877 + }, + { + "auxiliary_loss_clip": 0.06434917, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.0627819, + "balance_loss_mlp": 0.01252635, + "epoch": 0.5257778445813919, + "flos": 20783144880000.0, + "grad_norm": 1.988296485781083, + "language_loss": 0.76358819, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.84060007, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.13653564, + "step": 8745, + "time_per_iteration": 2.5416345596313477 + }, + { + "auxiliary_loss_clip": 0.06422935, + "auxiliary_loss_mlp": 0.01269048, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125733, + "epoch": 0.5258379678340598, + "flos": 17024168465280.0, + "grad_norm": 2.2863222877599703, + "language_loss": 0.81917781, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.8960976, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.1171875, + "step": 8746, + "time_per_iteration": 3.8924081325531006 + }, + { + "auxiliary_loss_clip": 0.06420557, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_clip": 0.06274772, + "balance_loss_mlp": 0.01257011, + "epoch": 0.5258980910867278, + "flos": 21075032977920.0, + "grad_norm": 1.8269554832422097, + "language_loss": 0.76250327, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.83939064, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11157227, + "step": 8747, + "time_per_iteration": 2.5338385105133057 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01266781, + "balance_loss_clip": 0.06273648, + "balance_loss_mlp": 0.01254443, + "epoch": 0.5259582143393957, + "flos": 18010312767360.0, + "grad_norm": 1.781184467493656, + "language_loss": 0.82852685, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.90538716, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12353516, + "step": 8748, + "time_per_iteration": 2.4989612102508545 + }, + { + "auxiliary_loss_clip": 0.06428373, + "auxiliary_loss_mlp": 0.0126857, + "balance_loss_clip": 0.06276021, + "balance_loss_mlp": 0.01255803, + "epoch": 0.5260183375920637, + "flos": 22790457290880.0, + "grad_norm": 2.0798716741461862, + "language_loss": 0.81033522, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.88730466, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12756348, + "step": 8749, + "time_per_iteration": 2.541492462158203 + }, + { + "auxiliary_loss_clip": 0.06426647, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.06276764, + "balance_loss_mlp": 0.01257857, + "epoch": 0.5260784608447318, + "flos": 27059682343680.0, + "grad_norm": 1.8461671999009361, + "language_loss": 0.72827047, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.80523431, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11877441, + "step": 8750, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06428036, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06278102, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5261385840973997, + "flos": 20668262532480.0, + "grad_norm": 1.3256906405876772, + "language_loss": 0.76755565, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.8444941, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11883545, + "step": 8751, + "time_per_iteration": 3.989189624786377 + }, + { + "auxiliary_loss_clip": 0.06427495, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.0627936, + "balance_loss_mlp": 0.01255286, + "epoch": 0.5261987073500677, + "flos": 23629336842240.0, + "grad_norm": 1.3401050149591014, + "language_loss": 0.76360512, + "learning_rate": 1.927381362210902e-06, + "loss": 0.84054899, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11608887, + "step": 8752, + "time_per_iteration": 2.6008472442626953 + }, + { + "auxiliary_loss_clip": 0.06432231, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06278201, + "balance_loss_mlp": 0.01253487, + "epoch": 0.5262588306027356, + "flos": 27643626247680.0, + "grad_norm": 1.396446170400335, + "language_loss": 0.68317235, + "learning_rate": 1.926992158720058e-06, + "loss": 0.76016164, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13208008, + "step": 8753, + "time_per_iteration": 2.5851571559906006 + }, + { + "auxiliary_loss_clip": 0.06430234, + "auxiliary_loss_mlp": 0.01269545, + "balance_loss_clip": 0.06281005, + "balance_loss_mlp": 0.01257142, + "epoch": 0.5263189538554036, + "flos": 21765725383680.0, + "grad_norm": 1.5666571832863774, + "language_loss": 0.8392294, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.91622722, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12384033, + "step": 8754, + "time_per_iteration": 2.552424907684326 + }, + { + "auxiliary_loss_clip": 0.06431299, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06278868, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5263790771080715, + "flos": 14280490373760.0, + "grad_norm": 9.005791031911038, + "language_loss": 0.87464845, + "learning_rate": 1.926213760058522e-06, + "loss": 0.95163268, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.12139893, + "step": 8755, + "time_per_iteration": 2.4848403930664062 + }, + { + "auxiliary_loss_clip": 0.06329039, + "auxiliary_loss_mlp": 0.01251879, + "balance_loss_clip": 0.06265183, + "balance_loss_mlp": 0.01250204, + "epoch": 0.5264392003607395, + "flos": 65827298206080.0, + "grad_norm": 0.7019882104343015, + "language_loss": 0.5870319, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.66284108, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.01678467, + "step": 8756, + "time_per_iteration": 3.275596857070923 + }, + { + "auxiliary_loss_clip": 0.06435139, + "auxiliary_loss_mlp": 0.0126978, + "balance_loss_clip": 0.06280214, + "balance_loss_mlp": 0.01257001, + "epoch": 0.5264993236134075, + "flos": 21038709432960.0, + "grad_norm": 1.5391071607522773, + "language_loss": 0.70246553, + "learning_rate": 1.925435372588913e-06, + "loss": 0.77951479, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.12762451, + "step": 8757, + "time_per_iteration": 2.5078463554382324 + }, + { + "auxiliary_loss_clip": 0.06425242, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06274789, + "balance_loss_mlp": 0.01259015, + "epoch": 0.5265594468660755, + "flos": 16623854784000.0, + "grad_norm": 1.5949031044885071, + "language_loss": 0.88366896, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.96063495, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12341309, + "step": 8758, + "time_per_iteration": 2.503643751144409 + }, + { + "auxiliary_loss_clip": 0.06431897, + "auxiliary_loss_mlp": 0.01273559, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01260165, + "epoch": 0.5266195701187434, + "flos": 24141010999680.0, + "grad_norm": 1.3529199811462889, + "language_loss": 0.76677716, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.84383172, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.13391113, + "step": 8759, + "time_per_iteration": 4.0746564865112305 + }, + { + "auxiliary_loss_clip": 0.06426352, + "auxiliary_loss_mlp": 0.01272091, + "balance_loss_clip": 0.06278519, + "balance_loss_mlp": 0.01258603, + "epoch": 0.5266796933714114, + "flos": 15848314519680.0, + "grad_norm": 1.866695897182309, + "language_loss": 0.72062105, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.79760551, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1348877, + "step": 8760, + "time_per_iteration": 2.4678292274475098 + }, + { + "auxiliary_loss_clip": 0.06434111, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253152, + "epoch": 0.5267398166240793, + "flos": 20956377196800.0, + "grad_norm": 2.1261739839163263, + "language_loss": 0.76520377, + "learning_rate": 1.923878631697736e-06, + "loss": 0.84220791, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13140869, + "step": 8761, + "time_per_iteration": 2.5250892639160156 + }, + { + "auxiliary_loss_clip": 0.06431311, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06277812, + "balance_loss_mlp": 0.01256696, + "epoch": 0.5267999398767473, + "flos": 21002763231360.0, + "grad_norm": 1.6289028393625449, + "language_loss": 0.7137605, + "learning_rate": 1.923489453654373e-06, + "loss": 0.79075569, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1151123, + "step": 8762, + "time_per_iteration": 2.50102162361145 + }, + { + "auxiliary_loss_clip": 0.06330161, + "auxiliary_loss_mlp": 0.01253956, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01252303, + "epoch": 0.5268600631294152, + "flos": 66867935189760.0, + "grad_norm": 0.9166133094312116, + "language_loss": 0.65129638, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.72713745, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01655579, + "step": 8763, + "time_per_iteration": 3.076136827468872 + }, + { + "auxiliary_loss_clip": 0.06428451, + "auxiliary_loss_mlp": 0.01268489, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01255918, + "epoch": 0.5269201863820833, + "flos": 17171307434880.0, + "grad_norm": 1.6120731347351738, + "language_loss": 0.71481144, + "learning_rate": 1.922711106286265e-06, + "loss": 0.79178083, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12579346, + "step": 8764, + "time_per_iteration": 2.5250110626220703 + }, + { + "auxiliary_loss_clip": 0.06431142, + "auxiliary_loss_mlp": 0.01269659, + "balance_loss_clip": 0.06278007, + "balance_loss_mlp": 0.01256141, + "epoch": 0.5269803096347513, + "flos": 20528963919360.0, + "grad_norm": 1.6456726211241999, + "language_loss": 0.74125087, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.81825888, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.13531494, + "step": 8765, + "time_per_iteration": 2.552011251449585 + }, + { + "auxiliary_loss_clip": 0.06432463, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01253076, + "epoch": 0.5270404328874192, + "flos": 27237652416000.0, + "grad_norm": 1.4730640837864142, + "language_loss": 0.8564899, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.9334718, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.12640381, + "step": 8766, + "time_per_iteration": 2.5471248626708984 + }, + { + "auxiliary_loss_clip": 0.06432243, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06278689, + "balance_loss_mlp": 0.01257812, + "epoch": 0.5271005561400872, + "flos": 23116866071040.0, + "grad_norm": 1.6309488802468612, + "language_loss": 0.79294145, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8699789, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.13690186, + "step": 8767, + "time_per_iteration": 2.5700509548187256 + }, + { + "auxiliary_loss_clip": 0.06431086, + "auxiliary_loss_mlp": 0.01269174, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01256532, + "epoch": 0.5271606793927551, + "flos": 22571342064000.0, + "grad_norm": 1.7993411408437945, + "language_loss": 0.73931158, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.81631416, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.12646484, + "step": 8768, + "time_per_iteration": 2.5251431465148926 + }, + { + "auxiliary_loss_clip": 0.06428067, + "auxiliary_loss_mlp": 0.01269059, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01257174, + "epoch": 0.5272208026454231, + "flos": 18769166069760.0, + "grad_norm": 1.6856667564577028, + "language_loss": 0.74105024, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.81802148, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11883545, + "step": 8769, + "time_per_iteration": 2.518446683883667 + }, + { + "auxiliary_loss_clip": 0.06431002, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06279421, + "balance_loss_mlp": 0.01255172, + "epoch": 0.5272809258980911, + "flos": 20418358129920.0, + "grad_norm": 1.672714058447801, + "language_loss": 0.74041271, + "learning_rate": 1.920376134993436e-06, + "loss": 0.81739843, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.1239624, + "step": 8770, + "time_per_iteration": 2.5188913345336914 + }, + { + "auxiliary_loss_clip": 0.06428713, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01259085, + "epoch": 0.5273410491507591, + "flos": 28264271040000.0, + "grad_norm": 1.8244918854449486, + "language_loss": 0.68641269, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.76341033, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11987305, + "step": 8771, + "time_per_iteration": 2.5867247581481934 + }, + { + "auxiliary_loss_clip": 0.06424269, + "auxiliary_loss_mlp": 0.01271661, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01259704, + "epoch": 0.527401172403427, + "flos": 22461658669440.0, + "grad_norm": 11.676913645943259, + "language_loss": 0.7669906, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.84394991, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11938477, + "step": 8772, + "time_per_iteration": 2.5199668407440186 + }, + { + "auxiliary_loss_clip": 0.06429616, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.0627689, + "balance_loss_mlp": 0.01255599, + "epoch": 0.527461295656095, + "flos": 21037158132480.0, + "grad_norm": 2.161876297932061, + "language_loss": 0.66294622, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.73992014, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.12176514, + "step": 8773, + "time_per_iteration": 2.5476229190826416 + }, + { + "auxiliary_loss_clip": 0.06430208, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01256643, + "epoch": 0.5275214189087629, + "flos": 26329060667520.0, + "grad_norm": 1.7199176113539936, + "language_loss": 0.86321867, + "learning_rate": 1.91881954765502e-06, + "loss": 0.94019973, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.11254883, + "step": 8774, + "time_per_iteration": 2.545171022415161 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271648, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01259525, + "epoch": 0.5275815421614309, + "flos": 20053110182400.0, + "grad_norm": 1.6744248524719214, + "language_loss": 0.80195713, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.87894905, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12121582, + "step": 8775, + "time_per_iteration": 2.544409990310669 + }, + { + "auxiliary_loss_clip": 0.06422298, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01257968, + "epoch": 0.5276416654140988, + "flos": 21438310354560.0, + "grad_norm": 1.5933640173688606, + "language_loss": 0.83310181, + "learning_rate": 1.918041272397012e-06, + "loss": 0.91002852, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1239624, + "step": 8776, + "time_per_iteration": 2.5175352096557617 + }, + { + "auxiliary_loss_clip": 0.06428739, + "auxiliary_loss_mlp": 0.012708, + "balance_loss_clip": 0.06277907, + "balance_loss_mlp": 0.0125867, + "epoch": 0.5277017886667669, + "flos": 17170762383360.0, + "grad_norm": 1.5849666431846519, + "language_loss": 0.67932826, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.7563237, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.12127686, + "step": 8777, + "time_per_iteration": 2.5778138637542725 + }, + { + "auxiliary_loss_clip": 0.06429909, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06281164, + "balance_loss_mlp": 0.01253935, + "epoch": 0.5277619119194349, + "flos": 20454262404480.0, + "grad_norm": 1.855602906151282, + "language_loss": 0.82547855, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.90243274, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11584473, + "step": 8778, + "time_per_iteration": 2.571700096130371 + }, + { + "auxiliary_loss_clip": 0.06433128, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06280521, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5278220351721028, + "flos": 24067944639360.0, + "grad_norm": 1.9512823836083997, + "language_loss": 0.79944891, + "learning_rate": 1.916873882856013e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.1217041, + "step": 8779, + "time_per_iteration": 2.562757968902588 + }, + { + "auxiliary_loss_clip": 0.06427805, + "auxiliary_loss_mlp": 0.01263718, + "balance_loss_clip": 0.06278832, + "balance_loss_mlp": 0.01252429, + "epoch": 0.5278821584247708, + "flos": 24649540629120.0, + "grad_norm": 2.3350915047762957, + "language_loss": 0.77251387, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.84942913, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11291504, + "step": 8780, + "time_per_iteration": 2.517606258392334 + }, + { + "auxiliary_loss_clip": 0.0643455, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06281555, + "balance_loss_mlp": 0.01254507, + "epoch": 0.5279422816774387, + "flos": 35417017848960.0, + "grad_norm": 1.6574386864631518, + "language_loss": 0.69489729, + "learning_rate": 1.916095638898174e-06, + "loss": 0.77191794, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.13018799, + "step": 8781, + "time_per_iteration": 2.693525791168213 + }, + { + "auxiliary_loss_clip": 0.06421035, + "auxiliary_loss_mlp": 0.01270298, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01259051, + "epoch": 0.5280024049301068, + "flos": 22973794024320.0, + "grad_norm": 1.4417281394316688, + "language_loss": 0.7270093, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.80392265, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11254883, + "step": 8782, + "time_per_iteration": 2.5421454906463623 + }, + { + "auxiliary_loss_clip": 0.06428084, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06279479, + "balance_loss_mlp": 0.01255314, + "epoch": 0.5280625281827747, + "flos": 21514143899520.0, + "grad_norm": 1.839654531053583, + "language_loss": 0.68914783, + "learning_rate": 1.915317407666982e-06, + "loss": 0.76610112, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1192627, + "step": 8783, + "time_per_iteration": 4.037707328796387 + }, + { + "auxiliary_loss_clip": 0.06440329, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.06282043, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5281226514354427, + "flos": 31215534422400.0, + "grad_norm": 1.947626233704344, + "language_loss": 0.69763857, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.77474254, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.13793945, + "step": 8784, + "time_per_iteration": 2.6415882110595703 + }, + { + "auxiliary_loss_clip": 0.06436743, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06277036, + "balance_loss_mlp": 0.01256393, + "epoch": 0.5281827746881106, + "flos": 25084039576320.0, + "grad_norm": 1.9575438568521135, + "language_loss": 0.75138849, + "learning_rate": 1.91453918928048e-06, + "loss": 0.82845432, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.13458252, + "step": 8785, + "time_per_iteration": 2.5360119342803955 + }, + { + "auxiliary_loss_clip": 0.06430692, + "auxiliary_loss_mlp": 0.01270335, + "balance_loss_clip": 0.06279787, + "balance_loss_mlp": 0.01257806, + "epoch": 0.5282428979407786, + "flos": 20637515283840.0, + "grad_norm": 2.81532856062796, + "language_loss": 0.83379281, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.91080302, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12518311, + "step": 8786, + "time_per_iteration": 3.923038959503174 + }, + { + "auxiliary_loss_clip": 0.06426571, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.0628151, + "balance_loss_mlp": 0.01255248, + "epoch": 0.5283030211934465, + "flos": 22426005957120.0, + "grad_norm": 2.0503071903036134, + "language_loss": 0.82639015, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.90331495, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10650635, + "step": 8787, + "time_per_iteration": 2.549422025680542 + }, + { + "auxiliary_loss_clip": 0.06423321, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01259932, + "epoch": 0.5283631444461145, + "flos": 23620951434240.0, + "grad_norm": 1.6336970157139816, + "language_loss": 0.83324271, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.91018784, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11260986, + "step": 8788, + "time_per_iteration": 2.4937057495117188 + }, + { + "auxiliary_loss_clip": 0.06426245, + "auxiliary_loss_mlp": 0.01271299, + "balance_loss_clip": 0.06279786, + "balance_loss_mlp": 0.0125886, + "epoch": 0.5284232676987825, + "flos": 32680341573120.0, + "grad_norm": 1.675322731323109, + "language_loss": 0.75004017, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.82701558, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12451172, + "step": 8789, + "time_per_iteration": 2.6138312816619873 + }, + { + "auxiliary_loss_clip": 0.06430633, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06280988, + "balance_loss_mlp": 0.01254139, + "epoch": 0.5284833909514505, + "flos": 26768213516160.0, + "grad_norm": 1.5707088647426293, + "language_loss": 0.70574284, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.78270793, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11743164, + "step": 8790, + "time_per_iteration": 2.5883655548095703 + }, + { + "auxiliary_loss_clip": 0.06427436, + "auxiliary_loss_mlp": 0.01266819, + "balance_loss_clip": 0.06280458, + "balance_loss_mlp": 0.01255506, + "epoch": 0.5285435142041185, + "flos": 22097207335680.0, + "grad_norm": 1.512627214826232, + "language_loss": 0.79474425, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.87168682, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11309814, + "step": 8791, + "time_per_iteration": 4.033270835876465 + }, + { + "auxiliary_loss_clip": 0.06429024, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06280901, + "balance_loss_mlp": 0.01255205, + "epoch": 0.5286036374567864, + "flos": 20381615314560.0, + "grad_norm": 2.07521505612664, + "language_loss": 0.65493345, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.73189247, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11676025, + "step": 8792, + "time_per_iteration": 2.521308183670044 + }, + { + "auxiliary_loss_clip": 0.06423797, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06276767, + "balance_loss_mlp": 0.01253415, + "epoch": 0.5286637607094544, + "flos": 24358952269440.0, + "grad_norm": 2.076646851589869, + "language_loss": 0.79861224, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.87549216, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10778809, + "step": 8793, + "time_per_iteration": 2.5511038303375244 + }, + { + "auxiliary_loss_clip": 0.06422493, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01256168, + "epoch": 0.5287238839621223, + "flos": 17276295000960.0, + "grad_norm": 2.078436862745294, + "language_loss": 0.85337698, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.93028271, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11901855, + "step": 8794, + "time_per_iteration": 2.4898123741149902 + }, + { + "auxiliary_loss_clip": 0.06434184, + "auxiliary_loss_mlp": 0.01268266, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01255284, + "epoch": 0.5287840072147904, + "flos": 17572711219200.0, + "grad_norm": 2.1545808018265427, + "language_loss": 0.67890751, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.75593209, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.12982178, + "step": 8795, + "time_per_iteration": 2.5213987827301025 + }, + { + "auxiliary_loss_clip": 0.0642955, + "auxiliary_loss_mlp": 0.01269682, + "balance_loss_clip": 0.06279209, + "balance_loss_mlp": 0.01257714, + "epoch": 0.5288441304674583, + "flos": 18558100834560.0, + "grad_norm": 1.7521680482784363, + "language_loss": 0.80681872, + "learning_rate": 1.910259223028374e-06, + "loss": 0.88381112, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11968994, + "step": 8796, + "time_per_iteration": 2.4875407218933105 + }, + { + "auxiliary_loss_clip": 0.06428242, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06279264, + "balance_loss_mlp": 0.01255656, + "epoch": 0.5289042537201263, + "flos": 20820935871360.0, + "grad_norm": 1.952583587455058, + "language_loss": 0.69353104, + "learning_rate": 1.909870155310071e-06, + "loss": 0.770491, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12097168, + "step": 8797, + "time_per_iteration": 2.5311903953552246 + }, + { + "auxiliary_loss_clip": 0.06424771, + "auxiliary_loss_mlp": 0.01268361, + "balance_loss_clip": 0.06280869, + "balance_loss_mlp": 0.01256857, + "epoch": 0.5289643769727942, + "flos": 15739553520000.0, + "grad_norm": 1.4672049002002021, + "language_loss": 0.82371795, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.90064925, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11499023, + "step": 8798, + "time_per_iteration": 3.947748899459839 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01268372, + "balance_loss_clip": 0.06277348, + "balance_loss_mlp": 0.01255181, + "epoch": 0.5290245002254622, + "flos": 19543490449920.0, + "grad_norm": 2.0391495748491133, + "language_loss": 0.71206701, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.78905261, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.13201904, + "step": 8799, + "time_per_iteration": 2.5031862258911133 + }, + { + "auxiliary_loss_clip": 0.06420026, + "auxiliary_loss_mlp": 0.01267776, + "balance_loss_clip": 0.06277078, + "balance_loss_mlp": 0.01256124, + "epoch": 0.5290846234781301, + "flos": 15820586017920.0, + "grad_norm": 1.9322407735459124, + "language_loss": 0.69337815, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.77025622, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11657715, + "step": 8800, + "time_per_iteration": 2.5130701065063477 + }, + { + "auxiliary_loss_clip": 0.06335981, + "auxiliary_loss_mlp": 0.01252268, + "balance_loss_clip": 0.06272759, + "balance_loss_mlp": 0.01250352, + "epoch": 0.5291447467307981, + "flos": 70076272498560.0, + "grad_norm": 0.8722049049478691, + "language_loss": 0.5706265, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.64650893, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01913452, + "step": 8801, + "time_per_iteration": 3.0075480937957764 + }, + { + "auxiliary_loss_clip": 0.06425781, + "auxiliary_loss_mlp": 0.01269363, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257978, + "epoch": 0.529204869983466, + "flos": 28371396885120.0, + "grad_norm": 1.559087936128458, + "language_loss": 0.64462554, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.72157693, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.1138916, + "step": 8802, + "time_per_iteration": 2.568263053894043 + }, + { + "auxiliary_loss_clip": 0.06423493, + "auxiliary_loss_mlp": 0.01268948, + "balance_loss_clip": 0.06277072, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5292649932361341, + "flos": 33766064853120.0, + "grad_norm": 1.9436732858799899, + "language_loss": 0.69115645, + "learning_rate": 1.907535821289003e-06, + "loss": 0.76808089, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.10980225, + "step": 8803, + "time_per_iteration": 2.637096881866455 + }, + { + "auxiliary_loss_clip": 0.06421783, + "auxiliary_loss_mlp": 0.01270558, + "balance_loss_clip": 0.0627604, + "balance_loss_mlp": 0.01258596, + "epoch": 0.5293251164888021, + "flos": 20453717352960.0, + "grad_norm": 1.815171914881367, + "language_loss": 0.75997305, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.83689642, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11962891, + "step": 8804, + "time_per_iteration": 2.5163068771362305 + }, + { + "auxiliary_loss_clip": 0.0632845, + "auxiliary_loss_mlp": 0.01252381, + "balance_loss_clip": 0.06265265, + "balance_loss_mlp": 0.01250461, + "epoch": 0.52938523974147, + "flos": 66567856590720.0, + "grad_norm": 0.7410273965373205, + "language_loss": 0.52945232, + "learning_rate": 1.906757737841291e-06, + "loss": 0.60526061, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01916504, + "step": 8805, + "time_per_iteration": 3.24060320854187 + }, + { + "auxiliary_loss_clip": 0.06328098, + "auxiliary_loss_mlp": 0.01252617, + "balance_loss_clip": 0.06265187, + "balance_loss_mlp": 0.01250968, + "epoch": 0.529445362994138, + "flos": 67172065983360.0, + "grad_norm": 1.018872897712542, + "language_loss": 0.63735455, + "learning_rate": 1.906368701413693e-06, + "loss": 0.71316171, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01652527, + "step": 8806, + "time_per_iteration": 3.1444826126098633 + }, + { + "auxiliary_loss_clip": 0.06429877, + "auxiliary_loss_mlp": 0.01268417, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01256073, + "epoch": 0.5295054862468059, + "flos": 17755167484800.0, + "grad_norm": 1.837636262170248, + "language_loss": 0.7251606, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.80214357, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.12335205, + "step": 8807, + "time_per_iteration": 2.513139247894287 + }, + { + "auxiliary_loss_clip": 0.06424799, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06278958, + "balance_loss_mlp": 0.01257241, + "epoch": 0.529565609499474, + "flos": 11401622519040.0, + "grad_norm": 2.5266289150801295, + "language_loss": 0.69956362, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.77648908, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1050415, + "step": 8808, + "time_per_iteration": 2.472822666168213 + }, + { + "auxiliary_loss_clip": 0.06422195, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06274572, + "balance_loss_mlp": 0.01258861, + "epoch": 0.5296257327521419, + "flos": 17201174215680.0, + "grad_norm": 2.036831994826339, + "language_loss": 0.87141514, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.94833171, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10614014, + "step": 8809, + "time_per_iteration": 2.5245158672332764 + }, + { + "auxiliary_loss_clip": 0.06436493, + "auxiliary_loss_mlp": 0.01270155, + "balance_loss_clip": 0.062795, + "balance_loss_mlp": 0.01257138, + "epoch": 0.5296858560048099, + "flos": 39972806265600.0, + "grad_norm": 1.6505081453472243, + "language_loss": 0.64378583, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.72085232, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.13037109, + "step": 8810, + "time_per_iteration": 2.6857082843780518 + }, + { + "auxiliary_loss_clip": 0.06422746, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06277126, + "balance_loss_mlp": 0.01259012, + "epoch": 0.5297459792574778, + "flos": 20968032913920.0, + "grad_norm": 1.5863211204070509, + "language_loss": 0.68117309, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.75810677, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11608887, + "step": 8811, + "time_per_iteration": 2.5947864055633545 + }, + { + "auxiliary_loss_clip": 0.06326769, + "auxiliary_loss_mlp": 0.01252115, + "balance_loss_clip": 0.06264065, + "balance_loss_mlp": 0.0125052, + "epoch": 0.5298061025101458, + "flos": 66542532658560.0, + "grad_norm": 0.6560344299955198, + "language_loss": 0.53324163, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.60903049, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01597595, + "step": 8812, + "time_per_iteration": 3.2503774166107178 + }, + { + "auxiliary_loss_clip": 0.06327102, + "auxiliary_loss_mlp": 0.01252134, + "balance_loss_clip": 0.06264044, + "balance_loss_mlp": 0.01250548, + "epoch": 0.5298662257628137, + "flos": 67683488578560.0, + "grad_norm": 0.7118690065629296, + "language_loss": 0.56452167, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.64031398, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01586151, + "step": 8813, + "time_per_iteration": 3.211704730987549 + }, + { + "auxiliary_loss_clip": 0.06420116, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01252223, + "epoch": 0.5299263490154817, + "flos": 19652544938880.0, + "grad_norm": 1.6476785970765333, + "language_loss": 0.82062042, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.89745033, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10656738, + "step": 8814, + "time_per_iteration": 2.5407004356384277 + }, + { + "auxiliary_loss_clip": 0.06433088, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.062782, + "balance_loss_mlp": 0.01255646, + "epoch": 0.5299864722681497, + "flos": 22061638477440.0, + "grad_norm": 1.5146312250557674, + "language_loss": 0.85424864, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.93124914, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.11322021, + "step": 8815, + "time_per_iteration": 2.511718273162842 + }, + { + "auxiliary_loss_clip": 0.06421779, + "auxiliary_loss_mlp": 0.01265999, + "balance_loss_clip": 0.0627707, + "balance_loss_mlp": 0.01254573, + "epoch": 0.5300465955208177, + "flos": 21770379285120.0, + "grad_norm": 2.2057457770846947, + "language_loss": 0.67210793, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.74898565, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11431885, + "step": 8816, + "time_per_iteration": 2.564680576324463 + }, + { + "auxiliary_loss_clip": 0.06425485, + "auxiliary_loss_mlp": 0.01269628, + "balance_loss_clip": 0.06278205, + "balance_loss_mlp": 0.01258106, + "epoch": 0.5301067187734857, + "flos": 43006401884160.0, + "grad_norm": 1.5302739112082, + "language_loss": 0.72652006, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.80347115, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1151123, + "step": 8817, + "time_per_iteration": 2.719486951828003 + }, + { + "auxiliary_loss_clip": 0.06425378, + "auxiliary_loss_mlp": 0.0126821, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01256957, + "epoch": 0.5301668420261536, + "flos": 20559878876160.0, + "grad_norm": 1.5998738611170542, + "language_loss": 0.65166581, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.72860169, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11242676, + "step": 8818, + "time_per_iteration": 2.573202610015869 + }, + { + "auxiliary_loss_clip": 0.06425599, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06275538, + "balance_loss_mlp": 0.0125378, + "epoch": 0.5302269652788216, + "flos": 17491259450880.0, + "grad_norm": 1.7883158874481297, + "language_loss": 0.75112927, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.82804549, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12249756, + "step": 8819, + "time_per_iteration": 2.4882779121398926 + }, + { + "auxiliary_loss_clip": 0.06426901, + "auxiliary_loss_mlp": 0.01268351, + "balance_loss_clip": 0.06273513, + "balance_loss_mlp": 0.01255995, + "epoch": 0.5302870885314895, + "flos": 14579380287360.0, + "grad_norm": 2.7239673645734905, + "language_loss": 0.82232261, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.89927506, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.12353516, + "step": 8820, + "time_per_iteration": 2.5082767009735107 + }, + { + "auxiliary_loss_clip": 0.06421572, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01257437, + "epoch": 0.5303472117841576, + "flos": 23444323027200.0, + "grad_norm": 1.7959737859178544, + "language_loss": 0.72743207, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.80432689, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.10479736, + "step": 8821, + "time_per_iteration": 2.5132317543029785 + }, + { + "auxiliary_loss_clip": 0.06418677, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01255643, + "epoch": 0.5304073350368255, + "flos": 22715294578560.0, + "grad_norm": 1.486709371307985, + "language_loss": 0.74618089, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.82303441, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11035156, + "step": 8822, + "time_per_iteration": 2.528388261795044 + }, + { + "auxiliary_loss_clip": 0.06422541, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06275284, + "balance_loss_mlp": 0.01255094, + "epoch": 0.5304674582894935, + "flos": 27936059397120.0, + "grad_norm": 1.8362514047395362, + "language_loss": 0.67618608, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.75307631, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11401367, + "step": 8823, + "time_per_iteration": 3.9042444229125977 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269944, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01257969, + "epoch": 0.5305275815421614, + "flos": 21256860337920.0, + "grad_norm": 1.7650443733670647, + "language_loss": 0.69634396, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.77329719, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.11968994, + "step": 8824, + "time_per_iteration": 2.5146212577819824 + }, + { + "auxiliary_loss_clip": 0.06418572, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06275523, + "balance_loss_mlp": 0.01256292, + "epoch": 0.5305877047948294, + "flos": 17608867056000.0, + "grad_norm": 1.7570108593506664, + "language_loss": 0.76559019, + "learning_rate": 1.898977700702689e-06, + "loss": 0.84244382, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1050415, + "step": 8825, + "time_per_iteration": 2.4815242290496826 + }, + { + "auxiliary_loss_clip": 0.06420843, + "auxiliary_loss_mlp": 0.01268607, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01257335, + "epoch": 0.5306478280474973, + "flos": 15200947474560.0, + "grad_norm": 2.5706419514423526, + "language_loss": 0.85959315, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.93648767, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11279297, + "step": 8826, + "time_per_iteration": 3.921194076538086 + }, + { + "auxiliary_loss_clip": 0.06417906, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.01253759, + "epoch": 0.5307079513001653, + "flos": 15346660924800.0, + "grad_norm": 1.4506860249913964, + "language_loss": 0.64565361, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.72248203, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11181641, + "step": 8827, + "time_per_iteration": 2.4920613765716553 + }, + { + "auxiliary_loss_clip": 0.06420277, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01256961, + "epoch": 0.5307680745528333, + "flos": 43554567294720.0, + "grad_norm": 1.8307336922940562, + "language_loss": 0.59537661, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6722641, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11499023, + "step": 8828, + "time_per_iteration": 2.7917306423187256 + }, + { + "auxiliary_loss_clip": 0.06423927, + "auxiliary_loss_mlp": 0.012663, + "balance_loss_clip": 0.06272669, + "balance_loss_mlp": 0.01254725, + "epoch": 0.5308281978055013, + "flos": 20055332315520.0, + "grad_norm": 1.5709125682754386, + "language_loss": 0.81926584, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.89616817, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11578369, + "step": 8829, + "time_per_iteration": 2.606851100921631 + }, + { + "auxiliary_loss_clip": 0.06417149, + "auxiliary_loss_mlp": 0.01263824, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01253316, + "epoch": 0.5308883210581693, + "flos": 20710162373760.0, + "grad_norm": 1.3864012566435717, + "language_loss": 0.78353059, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.86034036, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1050415, + "step": 8830, + "time_per_iteration": 3.954951286315918 + }, + { + "auxiliary_loss_clip": 0.06420083, + "auxiliary_loss_mlp": 0.01268446, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.01256924, + "epoch": 0.5309484443108372, + "flos": 14360684330880.0, + "grad_norm": 2.11171769837039, + "language_loss": 0.81423479, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.89112008, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 8831, + "time_per_iteration": 2.469822883605957 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01266871, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5310085675635052, + "flos": 20016577002240.0, + "grad_norm": 1.695592927900533, + "language_loss": 0.73638004, + "learning_rate": 1.896255043672186e-06, + "loss": 0.81320393, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11071777, + "step": 8832, + "time_per_iteration": 2.527545213699341 + }, + { + "auxiliary_loss_clip": 0.06424195, + "auxiliary_loss_mlp": 0.01266175, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5310686908161731, + "flos": 22133824369920.0, + "grad_norm": 1.9494235860340738, + "language_loss": 0.75823116, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.83513486, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.12341309, + "step": 8833, + "time_per_iteration": 2.497962236404419 + }, + { + "auxiliary_loss_clip": 0.06426589, + "auxiliary_loss_mlp": 0.01264835, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01252861, + "epoch": 0.5311288140688412, + "flos": 24724871049600.0, + "grad_norm": 1.6156023907192425, + "language_loss": 0.7400462, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.81696039, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11975098, + "step": 8834, + "time_per_iteration": 2.5790417194366455 + }, + { + "auxiliary_loss_clip": 0.06429796, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01254322, + "epoch": 0.5311889373215091, + "flos": 24104603600640.0, + "grad_norm": 1.6077843194652517, + "language_loss": 0.77900589, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.85597509, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.12817383, + "step": 8835, + "time_per_iteration": 2.5299718379974365 + }, + { + "auxiliary_loss_clip": 0.06422241, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5312490605741771, + "flos": 22023386288640.0, + "grad_norm": 1.8854276384026003, + "language_loss": 0.72502893, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.80190396, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12115479, + "step": 8836, + "time_per_iteration": 2.548025131225586 + }, + { + "auxiliary_loss_clip": 0.06424102, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01255815, + "epoch": 0.531309183826845, + "flos": 19396561115520.0, + "grad_norm": 1.819661501339542, + "language_loss": 0.81157684, + "learning_rate": 1.894310406375987e-06, + "loss": 0.88850057, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12463379, + "step": 8837, + "time_per_iteration": 2.484968662261963 + }, + { + "auxiliary_loss_clip": 0.06418987, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06274254, + "balance_loss_mlp": 0.01255778, + "epoch": 0.531369307079513, + "flos": 20195679104640.0, + "grad_norm": 1.8987589865078431, + "language_loss": 0.86269474, + "learning_rate": 1.893921490881035e-06, + "loss": 0.93956232, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11981201, + "step": 8838, + "time_per_iteration": 3.9265315532684326 + }, + { + "auxiliary_loss_clip": 0.06418579, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253584, + "epoch": 0.5314294303321809, + "flos": 18886144769280.0, + "grad_norm": 1.6029216559450563, + "language_loss": 0.73087633, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.8077088, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11077881, + "step": 8839, + "time_per_iteration": 2.595414876937866 + }, + { + "auxiliary_loss_clip": 0.06421834, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01253551, + "epoch": 0.531489553584849, + "flos": 23046818457600.0, + "grad_norm": 1.6603149015146987, + "language_loss": 0.76847923, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.84535015, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11712646, + "step": 8840, + "time_per_iteration": 2.543708086013794 + }, + { + "auxiliary_loss_clip": 0.06426372, + "auxiliary_loss_mlp": 0.01267236, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01255291, + "epoch": 0.5315496768375169, + "flos": 19796329745280.0, + "grad_norm": 3.0684588696132553, + "language_loss": 0.7743901, + "learning_rate": 1.892754768590216e-06, + "loss": 0.85132617, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11932373, + "step": 8841, + "time_per_iteration": 2.5301966667175293 + }, + { + "auxiliary_loss_clip": 0.0631949, + "auxiliary_loss_mlp": 0.01253613, + "balance_loss_clip": 0.06256352, + "balance_loss_mlp": 0.01251976, + "epoch": 0.5316098000901849, + "flos": 71044876569600.0, + "grad_norm": 0.6765052539549429, + "language_loss": 0.56618965, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.64192069, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0164032, + "step": 8842, + "time_per_iteration": 3.2740724086761475 + }, + { + "auxiliary_loss_clip": 0.06425814, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06272734, + "balance_loss_mlp": 0.01252876, + "epoch": 0.5316699233428529, + "flos": 16441146956160.0, + "grad_norm": 1.7388474755658287, + "language_loss": 0.73801279, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.81493276, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.13317871, + "step": 8843, + "time_per_iteration": 2.5188851356506348 + }, + { + "auxiliary_loss_clip": 0.06319, + "auxiliary_loss_mlp": 0.01253092, + "balance_loss_clip": 0.06256077, + "balance_loss_mlp": 0.01251205, + "epoch": 0.5317300465955208, + "flos": 67443478957440.0, + "grad_norm": 0.8484317442594647, + "language_loss": 0.60991502, + "learning_rate": 1.891588082900145e-06, + "loss": 0.68563592, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01882935, + "step": 8844, + "time_per_iteration": 3.1943981647491455 + }, + { + "auxiliary_loss_clip": 0.06316474, + "auxiliary_loss_mlp": 0.01252227, + "balance_loss_clip": 0.06253788, + "balance_loss_mlp": 0.01250519, + "epoch": 0.5317901698481888, + "flos": 59524095144960.0, + "grad_norm": 0.8355266908782794, + "language_loss": 0.62249273, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.69817972, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.62744141, + "router_z_loss_mlp": 0.01712036, + "step": 8845, + "time_per_iteration": 3.149904727935791 + }, + { + "auxiliary_loss_clip": 0.06421602, + "auxiliary_loss_mlp": 0.01271191, + "balance_loss_clip": 0.06273656, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5318502931008567, + "flos": 19134204382080.0, + "grad_norm": 1.8837935046538667, + "language_loss": 0.7569865, + "learning_rate": 1.890810312970474e-06, + "loss": 0.8339144, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12774658, + "step": 8846, + "time_per_iteration": 2.5158872604370117 + }, + { + "auxiliary_loss_clip": 0.0642429, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06273554, + "balance_loss_mlp": 0.01256838, + "epoch": 0.5319104163535248, + "flos": 24687960526080.0, + "grad_norm": 1.6867562646607668, + "language_loss": 0.75546432, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.83238477, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10913086, + "step": 8847, + "time_per_iteration": 2.5634870529174805 + }, + { + "auxiliary_loss_clip": 0.06415805, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269352, + "balance_loss_mlp": 0.01254823, + "epoch": 0.5319705396061927, + "flos": 19390691329920.0, + "grad_norm": 1.5354205561883685, + "language_loss": 0.87653261, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.95335042, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1114502, + "step": 8848, + "time_per_iteration": 2.4771876335144043 + }, + { + "auxiliary_loss_clip": 0.06423473, + "auxiliary_loss_mlp": 0.01274581, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01261564, + "epoch": 0.5320306628588607, + "flos": 18265122633600.0, + "grad_norm": 1.744694135662772, + "language_loss": 0.74510658, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.82208717, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.13012695, + "step": 8849, + "time_per_iteration": 2.5036580562591553 + }, + { + "auxiliary_loss_clip": 0.06429593, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06274542, + "balance_loss_mlp": 0.01253761, + "epoch": 0.5320907861115286, + "flos": 23739062163840.0, + "grad_norm": 1.9586489533772713, + "language_loss": 0.79968703, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.87663901, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11853027, + "step": 8850, + "time_per_iteration": 2.5143027305603027 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06276459, + "balance_loss_mlp": 0.01254086, + "epoch": 0.5321509093641966, + "flos": 34503730272000.0, + "grad_norm": 1.273724424531188, + "language_loss": 0.55058682, + "learning_rate": 1.888865960862821e-06, + "loss": 0.62749517, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.1071167, + "step": 8851, + "time_per_iteration": 2.6221299171447754 + }, + { + "auxiliary_loss_clip": 0.06426491, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255844, + "epoch": 0.5322110326168645, + "flos": 20017080126720.0, + "grad_norm": 1.7230657412679744, + "language_loss": 0.69354177, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.77048028, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11517334, + "step": 8852, + "time_per_iteration": 2.483614206314087 + }, + { + "auxiliary_loss_clip": 0.06316812, + "auxiliary_loss_mlp": 0.01252104, + "balance_loss_clip": 0.06254005, + "balance_loss_mlp": 0.01250446, + "epoch": 0.5322711558695326, + "flos": 64650563792640.0, + "grad_norm": 0.7839220079179184, + "language_loss": 0.62548178, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.70117098, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01661682, + "step": 8853, + "time_per_iteration": 3.085580587387085 + }, + { + "auxiliary_loss_clip": 0.06429263, + "auxiliary_loss_mlp": 0.01267576, + "balance_loss_clip": 0.06274428, + "balance_loss_mlp": 0.01256364, + "epoch": 0.5323312791222005, + "flos": 14944628234880.0, + "grad_norm": 2.314845805246822, + "language_loss": 0.79806542, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.87503386, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.11212158, + "step": 8854, + "time_per_iteration": 2.5530436038970947 + }, + { + "auxiliary_loss_clip": 0.06415577, + "auxiliary_loss_mlp": 0.01266542, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.0125663, + "epoch": 0.5323914023748685, + "flos": 23447593336320.0, + "grad_norm": 2.5938972527955038, + "language_loss": 0.74205482, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.81887597, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.09912109, + "step": 8855, + "time_per_iteration": 2.527981996536255 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.01263629, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5324515256275365, + "flos": 26293324101120.0, + "grad_norm": 4.18366969320272, + "language_loss": 0.64945328, + "learning_rate": 1.886921714110507e-06, + "loss": 0.72628403, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.10266113, + "step": 8856, + "time_per_iteration": 2.5942611694335938 + }, + { + "auxiliary_loss_clip": 0.06428003, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274043, + "balance_loss_mlp": 0.01255177, + "epoch": 0.5325116488802044, + "flos": 26878316181120.0, + "grad_norm": 1.8445625051613121, + "language_loss": 0.77944165, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.85639572, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.12231445, + "step": 8857, + "time_per_iteration": 2.551980972290039 + }, + { + "auxiliary_loss_clip": 0.06420985, + "auxiliary_loss_mlp": 0.01266182, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5325717721328724, + "flos": 25891794535680.0, + "grad_norm": 1.6903303041385833, + "language_loss": 0.71116436, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.78803611, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11749268, + "step": 8858, + "time_per_iteration": 2.564082384109497 + }, + { + "auxiliary_loss_clip": 0.0642374, + "auxiliary_loss_mlp": 0.01268133, + "balance_loss_clip": 0.06274494, + "balance_loss_mlp": 0.01255968, + "epoch": 0.5326318953855403, + "flos": 21805864289280.0, + "grad_norm": 3.8992078644613217, + "language_loss": 0.69476694, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.77168566, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12158203, + "step": 8859, + "time_per_iteration": 2.5558056831359863 + }, + { + "auxiliary_loss_clip": 0.06418291, + "auxiliary_loss_mlp": 0.01266588, + "balance_loss_clip": 0.06275187, + "balance_loss_mlp": 0.0125624, + "epoch": 0.5326920186382084, + "flos": 20929193746560.0, + "grad_norm": 1.4322040270296341, + "language_loss": 0.69681478, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.77366364, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10339355, + "step": 8860, + "time_per_iteration": 2.5150671005249023 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01266208, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01255259, + "epoch": 0.5327521418908763, + "flos": 21439735873920.0, + "grad_norm": 1.9652920134152139, + "language_loss": 0.77936381, + "learning_rate": 1.884977574556683e-06, + "loss": 0.85622478, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10955811, + "step": 8861, + "time_per_iteration": 2.527064561843872 + }, + { + "auxiliary_loss_clip": 0.06428909, + "auxiliary_loss_mlp": 0.01269839, + "balance_loss_clip": 0.06279886, + "balance_loss_mlp": 0.012579, + "epoch": 0.5328122651435443, + "flos": 21766354289280.0, + "grad_norm": 1.487259241409864, + "language_loss": 0.8585394, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.93552685, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11938477, + "step": 8862, + "time_per_iteration": 4.031865358352661 + }, + { + "auxiliary_loss_clip": 0.06431703, + "auxiliary_loss_mlp": 0.01269915, + "balance_loss_clip": 0.06279312, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5328723883962122, + "flos": 18302410500480.0, + "grad_norm": 1.6037650471474167, + "language_loss": 0.61557126, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.69258749, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12866211, + "step": 8863, + "time_per_iteration": 2.499657154083252 + }, + { + "auxiliary_loss_clip": 0.06422713, + "auxiliary_loss_mlp": 0.01268054, + "balance_loss_clip": 0.06278422, + "balance_loss_mlp": 0.01257736, + "epoch": 0.5329325116488802, + "flos": 25382049022080.0, + "grad_norm": 1.8448114340212167, + "language_loss": 0.73693913, + "learning_rate": 1.883811143046377e-06, + "loss": 0.81384677, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10314941, + "step": 8864, + "time_per_iteration": 2.549104928970337 + }, + { + "auxiliary_loss_clip": 0.06424475, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06276639, + "balance_loss_mlp": 0.0125636, + "epoch": 0.5329926349015481, + "flos": 25598984042880.0, + "grad_norm": 1.865165386122464, + "language_loss": 0.64464402, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.72156298, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11065674, + "step": 8865, + "time_per_iteration": 4.099254608154297 + }, + { + "auxiliary_loss_clip": 0.0642702, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06277309, + "balance_loss_mlp": 0.01257874, + "epoch": 0.5330527581542162, + "flos": 22895612565120.0, + "grad_norm": 1.6799514905357744, + "language_loss": 0.78778207, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.86474454, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11346436, + "step": 8866, + "time_per_iteration": 2.505974531173706 + }, + { + "auxiliary_loss_clip": 0.06424611, + "auxiliary_loss_mlp": 0.01266962, + "balance_loss_clip": 0.06276287, + "balance_loss_mlp": 0.01255333, + "epoch": 0.5331128814068841, + "flos": 16031022347520.0, + "grad_norm": 1.850684934112151, + "language_loss": 0.74175781, + "learning_rate": 1.882644751189108e-06, + "loss": 0.81867361, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11633301, + "step": 8867, + "time_per_iteration": 2.5437192916870117 + }, + { + "auxiliary_loss_clip": 0.0642608, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06276974, + "balance_loss_mlp": 0.01254204, + "epoch": 0.5331730046595521, + "flos": 39353461211520.0, + "grad_norm": 1.4678278533937592, + "language_loss": 0.72377831, + "learning_rate": 1.88225596278394e-06, + "loss": 0.80070472, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12353516, + "step": 8868, + "time_per_iteration": 2.6680116653442383 + }, + { + "auxiliary_loss_clip": 0.06425264, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.01258345, + "epoch": 0.5332331279122201, + "flos": 24031201824000.0, + "grad_norm": 1.7262272651388555, + "language_loss": 0.78884375, + "learning_rate": 1.881867178843637e-06, + "loss": 0.86578989, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11016846, + "step": 8869, + "time_per_iteration": 3.9937024116516113 + }, + { + "auxiliary_loss_clip": 0.06438692, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06282986, + "balance_loss_mlp": 0.01255434, + "epoch": 0.533293251164888, + "flos": 17135109671040.0, + "grad_norm": 2.017265080243192, + "language_loss": 0.7622692, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.83933091, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1204834, + "step": 8870, + "time_per_iteration": 2.520585536956787 + }, + { + "auxiliary_loss_clip": 0.06435512, + "auxiliary_loss_mlp": 0.01273068, + "balance_loss_clip": 0.06280903, + "balance_loss_mlp": 0.01260366, + "epoch": 0.533353374417556, + "flos": 22132734266880.0, + "grad_norm": 2.1166188019250316, + "language_loss": 0.76185441, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.83894014, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12713623, + "step": 8871, + "time_per_iteration": 2.5372307300567627 + }, + { + "auxiliary_loss_clip": 0.06427529, + "auxiliary_loss_mlp": 0.01272588, + "balance_loss_clip": 0.06279083, + "balance_loss_mlp": 0.01261383, + "epoch": 0.533413497670224, + "flos": 15016185221760.0, + "grad_norm": 1.8709318225271354, + "language_loss": 0.72608036, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11206055, + "step": 8872, + "time_per_iteration": 2.486344337463379 + }, + { + "auxiliary_loss_clip": 0.06426945, + "auxiliary_loss_mlp": 0.01270876, + "balance_loss_clip": 0.06280041, + "balance_loss_mlp": 0.01258925, + "epoch": 0.533473620922892, + "flos": 19616095612800.0, + "grad_norm": 1.6405410033387824, + "language_loss": 0.65059078, + "learning_rate": 1.880312088025936e-06, + "loss": 0.72756892, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11956787, + "step": 8873, + "time_per_iteration": 2.4989571571350098 + }, + { + "auxiliary_loss_clip": 0.06430013, + "auxiliary_loss_mlp": 0.01270669, + "balance_loss_clip": 0.06281542, + "balance_loss_mlp": 0.01260113, + "epoch": 0.5335337441755599, + "flos": 14287827605760.0, + "grad_norm": 2.154155286859053, + "language_loss": 0.80397201, + "learning_rate": 1.879923326631099e-06, + "loss": 0.88097882, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.10559082, + "step": 8874, + "time_per_iteration": 2.5248029232025146 + }, + { + "auxiliary_loss_clip": 0.06429289, + "auxiliary_loss_mlp": 0.01270488, + "balance_loss_clip": 0.06281012, + "balance_loss_mlp": 0.01259306, + "epoch": 0.5335938674282279, + "flos": 20821313214720.0, + "grad_norm": 1.9252791788754828, + "language_loss": 0.70199001, + "learning_rate": 1.879534569789582e-06, + "loss": 0.77898782, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11181641, + "step": 8875, + "time_per_iteration": 2.514606475830078 + }, + { + "auxiliary_loss_clip": 0.06327371, + "auxiliary_loss_mlp": 0.01252854, + "balance_loss_clip": 0.06264151, + "balance_loss_mlp": 0.01251167, + "epoch": 0.5336539906808958, + "flos": 71419558101120.0, + "grad_norm": 0.7076326652144627, + "language_loss": 0.59621203, + "learning_rate": 1.879145817516126e-06, + "loss": 0.6720143, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01690674, + "step": 8876, + "time_per_iteration": 3.2623958587646484 + }, + { + "auxiliary_loss_clip": 0.06431912, + "auxiliary_loss_mlp": 0.0127027, + "balance_loss_clip": 0.06282675, + "balance_loss_mlp": 0.01259833, + "epoch": 0.5337141139335638, + "flos": 20158517018880.0, + "grad_norm": 1.761940945107411, + "language_loss": 0.75235462, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.8293764, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.10437012, + "step": 8877, + "time_per_iteration": 4.019563674926758 + }, + { + "auxiliary_loss_clip": 0.06329054, + "auxiliary_loss_mlp": 0.01254827, + "balance_loss_clip": 0.06265914, + "balance_loss_mlp": 0.01253019, + "epoch": 0.5337742371862317, + "flos": 67747624479360.0, + "grad_norm": 0.7353643225564799, + "language_loss": 0.57172877, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.64756757, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01803589, + "step": 8878, + "time_per_iteration": 3.0581912994384766 + }, + { + "auxiliary_loss_clip": 0.06440037, + "auxiliary_loss_mlp": 0.0127241, + "balance_loss_clip": 0.06285742, + "balance_loss_mlp": 0.01260573, + "epoch": 0.5338343604388998, + "flos": 25015794825600.0, + "grad_norm": 1.5270572668187339, + "language_loss": 0.7260288, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.80315328, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.11834717, + "step": 8879, + "time_per_iteration": 2.594075918197632 + }, + { + "auxiliary_loss_clip": 0.06432897, + "auxiliary_loss_mlp": 0.01271434, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01259644, + "epoch": 0.5338944836915677, + "flos": 17606728776960.0, + "grad_norm": 2.8683921774089445, + "language_loss": 0.84095323, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.91799653, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11785889, + "step": 8880, + "time_per_iteration": 2.4828426837921143 + }, + { + "auxiliary_loss_clip": 0.06424058, + "auxiliary_loss_mlp": 0.01273011, + "balance_loss_clip": 0.06279065, + "balance_loss_mlp": 0.01262277, + "epoch": 0.5339546069442357, + "flos": 21730282306560.0, + "grad_norm": 1.3465483600758703, + "language_loss": 0.79582727, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.87279797, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1072998, + "step": 8881, + "time_per_iteration": 2.5683958530426025 + }, + { + "auxiliary_loss_clip": 0.06324948, + "auxiliary_loss_mlp": 0.01252734, + "balance_loss_clip": 0.06261811, + "balance_loss_mlp": 0.01251199, + "epoch": 0.5340147301969036, + "flos": 69741226748160.0, + "grad_norm": 0.7871410050477539, + "language_loss": 0.5924378, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.66821468, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01533508, + "step": 8882, + "time_per_iteration": 3.0768346786499023 + }, + { + "auxiliary_loss_clip": 0.06325522, + "auxiliary_loss_mlp": 0.01253695, + "balance_loss_clip": 0.06262392, + "balance_loss_mlp": 0.01252035, + "epoch": 0.5340748534495716, + "flos": 63896504901120.0, + "grad_norm": 0.885852476410532, + "language_loss": 0.63786471, + "learning_rate": 1.876424680745913e-06, + "loss": 0.7136569, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01663208, + "step": 8883, + "time_per_iteration": 2.967287063598633 + }, + { + "auxiliary_loss_clip": 0.06432307, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5341349767022396, + "flos": 28701872588160.0, + "grad_norm": 2.199844959316804, + "language_loss": 0.82043612, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.89743072, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12200928, + "step": 8884, + "time_per_iteration": 2.5675361156463623 + }, + { + "auxiliary_loss_clip": 0.06425676, + "auxiliary_loss_mlp": 0.01268668, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01257873, + "epoch": 0.5341950999549075, + "flos": 16295265797760.0, + "grad_norm": 1.5488539614491517, + "language_loss": 0.72820723, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.80515063, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10784912, + "step": 8885, + "time_per_iteration": 2.5164196491241455 + }, + { + "auxiliary_loss_clip": 0.06432982, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06277923, + "balance_loss_mlp": 0.01254525, + "epoch": 0.5342552232075756, + "flos": 14360852039040.0, + "grad_norm": 1.8494222651114738, + "language_loss": 0.78934276, + "learning_rate": 1.87525854926798e-06, + "loss": 0.86633611, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.11834717, + "step": 8886, + "time_per_iteration": 2.524366855621338 + }, + { + "auxiliary_loss_clip": 0.06429981, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06279354, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5343153464602435, + "flos": 30305517154560.0, + "grad_norm": 1.3913460534471052, + "language_loss": 0.75135863, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.82834035, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12579346, + "step": 8887, + "time_per_iteration": 2.6564323902130127 + }, + { + "auxiliary_loss_clip": 0.06427558, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06278411, + "balance_loss_mlp": 0.0125401, + "epoch": 0.5343754697129115, + "flos": 15601722353280.0, + "grad_norm": 2.357980716065106, + "language_loss": 0.69295096, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.76988232, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11560059, + "step": 8888, + "time_per_iteration": 2.4917025566101074 + }, + { + "auxiliary_loss_clip": 0.06442724, + "auxiliary_loss_mlp": 0.01272933, + "balance_loss_clip": 0.06283408, + "balance_loss_mlp": 0.01260935, + "epoch": 0.5344355929655794, + "flos": 16915239757440.0, + "grad_norm": 1.9387999695924976, + "language_loss": 0.78584576, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.8630023, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.12005615, + "step": 8889, + "time_per_iteration": 2.5028741359710693 + }, + { + "auxiliary_loss_clip": 0.06424284, + "auxiliary_loss_mlp": 0.01268375, + "balance_loss_clip": 0.06276136, + "balance_loss_mlp": 0.01256431, + "epoch": 0.5344957162182474, + "flos": 16803460010880.0, + "grad_norm": 1.9089962398127316, + "language_loss": 0.69733131, + "learning_rate": 1.873703773589102e-06, + "loss": 0.7742579, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 8890, + "time_per_iteration": 2.4705469608306885 + }, + { + "auxiliary_loss_clip": 0.06430273, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01255635, + "epoch": 0.5345558394709153, + "flos": 12709144356480.0, + "grad_norm": 3.2953855429591536, + "language_loss": 0.77688992, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.85387087, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.12182617, + "step": 8891, + "time_per_iteration": 2.500333547592163 + }, + { + "auxiliary_loss_clip": 0.06428199, + "auxiliary_loss_mlp": 0.01268573, + "balance_loss_clip": 0.06281698, + "balance_loss_mlp": 0.01257486, + "epoch": 0.5346159627235834, + "flos": 22461532888320.0, + "grad_norm": 1.516620120390114, + "language_loss": 0.74519014, + "learning_rate": 1.872926414425699e-06, + "loss": 0.82215786, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 8892, + "time_per_iteration": 2.4968128204345703 + }, + { + "auxiliary_loss_clip": 0.06427278, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01253566, + "epoch": 0.5346760859762513, + "flos": 22421771326080.0, + "grad_norm": 1.6631056082688196, + "language_loss": 0.87902844, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.95594442, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.10742188, + "step": 8893, + "time_per_iteration": 2.5580215454101562 + }, + { + "auxiliary_loss_clip": 0.06429157, + "auxiliary_loss_mlp": 0.01263801, + "balance_loss_clip": 0.06281421, + "balance_loss_mlp": 0.01253155, + "epoch": 0.5347362092289193, + "flos": 22822043080320.0, + "grad_norm": 1.612055893952936, + "language_loss": 0.72799695, + "learning_rate": 1.872149074536869e-06, + "loss": 0.80492651, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10650635, + "step": 8894, + "time_per_iteration": 2.54834246635437 + }, + { + "auxiliary_loss_clip": 0.06422012, + "auxiliary_loss_mlp": 0.01266432, + "balance_loss_clip": 0.06275687, + "balance_loss_mlp": 0.01254571, + "epoch": 0.5347963324815872, + "flos": 23225794778880.0, + "grad_norm": 1.4320398201671862, + "language_loss": 0.75047934, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.82736373, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11865234, + "step": 8895, + "time_per_iteration": 2.5309391021728516 + }, + { + "auxiliary_loss_clip": 0.06432986, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06282157, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5348564557342552, + "flos": 22607917171200.0, + "grad_norm": 1.7183644079473714, + "language_loss": 0.77449572, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.8514936, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11181641, + "step": 8896, + "time_per_iteration": 2.5175390243530273 + }, + { + "auxiliary_loss_clip": 0.06424737, + "auxiliary_loss_mlp": 0.01267928, + "balance_loss_clip": 0.06278285, + "balance_loss_mlp": 0.01256639, + "epoch": 0.5349165789869232, + "flos": 18007880999040.0, + "grad_norm": 1.7578614055599853, + "language_loss": 0.79043764, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.86736429, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11297607, + "step": 8897, + "time_per_iteration": 2.5068724155426025 + }, + { + "auxiliary_loss_clip": 0.06429999, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253365, + "epoch": 0.5349767022395912, + "flos": 17164557181440.0, + "grad_norm": 1.7104987912832146, + "language_loss": 0.76011693, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.83706623, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11566162, + "step": 8898, + "time_per_iteration": 2.5468573570251465 + }, + { + "auxiliary_loss_clip": 0.06323466, + "auxiliary_loss_mlp": 0.01262304, + "balance_loss_clip": 0.06260733, + "balance_loss_mlp": 0.01260944, + "epoch": 0.5350368254922592, + "flos": 71014590518400.0, + "grad_norm": 0.8026406428525971, + "language_loss": 0.57916105, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.65501881, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01361847, + "step": 8899, + "time_per_iteration": 3.354367256164551 + }, + { + "auxiliary_loss_clip": 0.06428243, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.06281818, + "balance_loss_mlp": 0.01255857, + "epoch": 0.5350969487449271, + "flos": 27425265707520.0, + "grad_norm": 1.5056303351191316, + "language_loss": 0.70071346, + "learning_rate": 1.869817171696868e-06, + "loss": 0.77766323, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.10882568, + "step": 8900, + "time_per_iteration": 2.596675395965576 + }, + { + "auxiliary_loss_clip": 0.0643241, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06280074, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5351570719975951, + "flos": 19321901527680.0, + "grad_norm": 1.5148336766284718, + "language_loss": 0.71324182, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.79025364, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.11346436, + "step": 8901, + "time_per_iteration": 2.526811122894287 + }, + { + "auxiliary_loss_clip": 0.06432061, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06280375, + "balance_loss_mlp": 0.01257377, + "epoch": 0.535217195250263, + "flos": 19834707715200.0, + "grad_norm": 1.961594084549487, + "language_loss": 0.77373689, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.85075164, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1204834, + "step": 8902, + "time_per_iteration": 3.931328773498535 + }, + { + "auxiliary_loss_clip": 0.06422594, + "auxiliary_loss_mlp": 0.01261364, + "balance_loss_clip": 0.0627951, + "balance_loss_mlp": 0.01250188, + "epoch": 0.535277318502931, + "flos": 22134495202560.0, + "grad_norm": 1.5214881410098744, + "language_loss": 0.7052539, + "learning_rate": 1.868651286721281e-06, + "loss": 0.78209347, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1116333, + "step": 8903, + "time_per_iteration": 2.5344340801239014 + }, + { + "auxiliary_loss_clip": 0.06433277, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.01255426, + "epoch": 0.5353374417555989, + "flos": 25052873057280.0, + "grad_norm": 1.5307499252390009, + "language_loss": 0.72374737, + "learning_rate": 1.86826266833795e-06, + "loss": 0.80075729, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12304688, + "step": 8904, + "time_per_iteration": 3.979325294494629 + }, + { + "auxiliary_loss_clip": 0.06430352, + "auxiliary_loss_mlp": 0.0127012, + "balance_loss_clip": 0.06280231, + "balance_loss_mlp": 0.01257961, + "epoch": 0.535397565008267, + "flos": 19394422836480.0, + "grad_norm": 1.7887132092295748, + "language_loss": 0.73359382, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.81059849, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.121521, + "step": 8905, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.06426303, + "auxiliary_loss_mlp": 0.01263381, + "balance_loss_clip": 0.06282683, + "balance_loss_mlp": 0.01252402, + "epoch": 0.5354576882609349, + "flos": 21477736500480.0, + "grad_norm": 1.458955847450215, + "language_loss": 0.83904094, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.91593778, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10980225, + "step": 8906, + "time_per_iteration": 2.5199477672576904 + }, + { + "auxiliary_loss_clip": 0.06430362, + "auxiliary_loss_mlp": 0.01270808, + "balance_loss_clip": 0.06278186, + "balance_loss_mlp": 0.01258416, + "epoch": 0.5355178115136029, + "flos": 20783857639680.0, + "grad_norm": 1.893504710630849, + "language_loss": 0.74486792, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.82187963, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.1237793, + "step": 8907, + "time_per_iteration": 2.5200021266937256 + }, + { + "auxiliary_loss_clip": 0.06428273, + "auxiliary_loss_mlp": 0.01264992, + "balance_loss_clip": 0.06280483, + "balance_loss_mlp": 0.0125421, + "epoch": 0.5355779347662708, + "flos": 23520827404800.0, + "grad_norm": 1.6955230805298804, + "language_loss": 0.76706243, + "learning_rate": 1.866708244906912e-06, + "loss": 0.84399509, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.10772705, + "step": 8908, + "time_per_iteration": 4.040110349655151 + }, + { + "auxiliary_loss_clip": 0.06432807, + "auxiliary_loss_mlp": 0.01271179, + "balance_loss_clip": 0.06280953, + "balance_loss_mlp": 0.01258835, + "epoch": 0.5356380580189388, + "flos": 20309471349120.0, + "grad_norm": 2.626231250487559, + "language_loss": 0.74318033, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.82022017, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12347412, + "step": 8909, + "time_per_iteration": 2.503324031829834 + }, + { + "auxiliary_loss_clip": 0.06428281, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06279926, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5356981812716068, + "flos": 21368136960000.0, + "grad_norm": 2.2429477917403435, + "language_loss": 0.84013373, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.91709697, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10803223, + "step": 8910, + "time_per_iteration": 2.532768726348877 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01267044, + "balance_loss_clip": 0.06278617, + "balance_loss_mlp": 0.01255152, + "epoch": 0.5357583045242748, + "flos": 23117746538880.0, + "grad_norm": 1.5068539432144845, + "language_loss": 0.82170522, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.89866459, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11895752, + "step": 8911, + "time_per_iteration": 2.530242681503296 + }, + { + "auxiliary_loss_clip": 0.06427851, + "auxiliary_loss_mlp": 0.01268226, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01256794, + "epoch": 0.5358184277769428, + "flos": 21148057411200.0, + "grad_norm": 1.7566097539058134, + "language_loss": 0.6953544, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.7723152, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11425781, + "step": 8912, + "time_per_iteration": 2.52546763420105 + }, + { + "auxiliary_loss_clip": 0.06429117, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06281352, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5358785510296107, + "flos": 16286754608640.0, + "grad_norm": 1.7988140692342254, + "language_loss": 0.71504682, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.79199886, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10858154, + "step": 8913, + "time_per_iteration": 2.4723551273345947 + }, + { + "auxiliary_loss_clip": 0.06437049, + "auxiliary_loss_mlp": 0.01269643, + "balance_loss_clip": 0.06283163, + "balance_loss_mlp": 0.01257883, + "epoch": 0.5359386742822787, + "flos": 16981555864320.0, + "grad_norm": 1.6333944745256754, + "language_loss": 0.72038394, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7974509, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.11761475, + "step": 8914, + "time_per_iteration": 2.5807461738586426 + }, + { + "auxiliary_loss_clip": 0.06438086, + "auxiliary_loss_mlp": 0.01272172, + "balance_loss_clip": 0.06283066, + "balance_loss_mlp": 0.01259327, + "epoch": 0.5359987975349466, + "flos": 20819091081600.0, + "grad_norm": 1.7157890571158112, + "language_loss": 0.706487, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.7835896, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.12841797, + "step": 8915, + "time_per_iteration": 2.542787790298462 + }, + { + "auxiliary_loss_clip": 0.06428587, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06281634, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5360589207876146, + "flos": 22206429532800.0, + "grad_norm": 1.674776865577312, + "language_loss": 0.75600839, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.83298731, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11383057, + "step": 8916, + "time_per_iteration": 2.5621731281280518 + }, + { + "auxiliary_loss_clip": 0.06429151, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01256995, + "epoch": 0.5361190440402825, + "flos": 31402393027200.0, + "grad_norm": 2.5448267428400655, + "language_loss": 0.72810572, + "learning_rate": 1.863211089308289e-06, + "loss": 0.80508238, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.1151123, + "step": 8917, + "time_per_iteration": 4.027824401855469 + }, + { + "auxiliary_loss_clip": 0.06433325, + "auxiliary_loss_mlp": 0.01268717, + "balance_loss_clip": 0.06283134, + "balance_loss_mlp": 0.01257195, + "epoch": 0.5361791672929506, + "flos": 16075270103040.0, + "grad_norm": 1.844905450054995, + "language_loss": 0.71658254, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.793603, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.11529541, + "step": 8918, + "time_per_iteration": 2.5032598972320557 + }, + { + "auxiliary_loss_clip": 0.06431636, + "auxiliary_loss_mlp": 0.01270312, + "balance_loss_clip": 0.06282899, + "balance_loss_mlp": 0.01258933, + "epoch": 0.5362392905456185, + "flos": 20747240605440.0, + "grad_norm": 1.4549229797282903, + "language_loss": 0.75235254, + "learning_rate": 1.862434000299067e-06, + "loss": 0.82937205, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11383057, + "step": 8919, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06430984, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06280042, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5362994137982865, + "flos": 17344539751680.0, + "grad_norm": 10.323313850773834, + "language_loss": 0.71843415, + "learning_rate": 1.862045463611864e-06, + "loss": 0.79540908, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11254883, + "step": 8920, + "time_per_iteration": 2.481144666671753 + }, + { + "auxiliary_loss_clip": 0.06425787, + "auxiliary_loss_mlp": 0.0126502, + "balance_loss_clip": 0.06276651, + "balance_loss_mlp": 0.01253659, + "epoch": 0.5363595370509544, + "flos": 42823819837440.0, + "grad_norm": 1.3389140049198536, + "language_loss": 0.68970168, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.76660967, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11352539, + "step": 8921, + "time_per_iteration": 2.7377495765686035 + }, + { + "auxiliary_loss_clip": 0.06429093, + "auxiliary_loss_mlp": 0.01267258, + "balance_loss_clip": 0.06280531, + "balance_loss_mlp": 0.01255575, + "epoch": 0.5364196603036224, + "flos": 19177990940160.0, + "grad_norm": 2.2769865828018516, + "language_loss": 0.81912661, + "learning_rate": 1.86126840594594e-06, + "loss": 0.89609009, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11676025, + "step": 8922, + "time_per_iteration": 2.491041660308838 + }, + { + "auxiliary_loss_clip": 0.06431051, + "auxiliary_loss_mlp": 0.01267721, + "balance_loss_clip": 0.06279019, + "balance_loss_mlp": 0.01256539, + "epoch": 0.5364797835562904, + "flos": 17936827136640.0, + "grad_norm": 1.913279005224502, + "language_loss": 0.76818264, + "learning_rate": 1.860879884996686e-06, + "loss": 0.84517032, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.11175537, + "step": 8923, + "time_per_iteration": 2.502797842025757 + }, + { + "auxiliary_loss_clip": 0.06430578, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.06277579, + "balance_loss_mlp": 0.01257052, + "epoch": 0.5365399068089584, + "flos": 30236098446720.0, + "grad_norm": 1.4167756526815838, + "language_loss": 0.70506531, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.78205955, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11791992, + "step": 8924, + "time_per_iteration": 2.5783135890960693 + }, + { + "auxiliary_loss_clip": 0.06433783, + "auxiliary_loss_mlp": 0.01269029, + "balance_loss_clip": 0.06280564, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5366000300616264, + "flos": 24897264825600.0, + "grad_norm": 2.5342740284522516, + "language_loss": 0.87064564, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.9476738, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.12231445, + "step": 8925, + "time_per_iteration": 2.555947780609131 + }, + { + "auxiliary_loss_clip": 0.0643315, + "auxiliary_loss_mlp": 0.012686, + "balance_loss_clip": 0.06278683, + "balance_loss_mlp": 0.01256911, + "epoch": 0.5366601533142943, + "flos": 29834610808320.0, + "grad_norm": 1.6615305931190325, + "language_loss": 0.78511882, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.86213624, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.11694336, + "step": 8926, + "time_per_iteration": 2.575540781021118 + }, + { + "auxiliary_loss_clip": 0.06420288, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06276788, + "balance_loss_mlp": 0.0125437, + "epoch": 0.5367202765669623, + "flos": 27206821313280.0, + "grad_norm": 1.3335091711279083, + "language_loss": 0.66572356, + "learning_rate": 1.85932585410148e-06, + "loss": 0.74258018, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11004639, + "step": 8927, + "time_per_iteration": 2.574263572692871 + }, + { + "auxiliary_loss_clip": 0.06429082, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06277999, + "balance_loss_mlp": 0.0125309, + "epoch": 0.5367803998196302, + "flos": 20236153426560.0, + "grad_norm": 1.7727091217622297, + "language_loss": 0.73473167, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.81166756, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.11413574, + "step": 8928, + "time_per_iteration": 2.4792275428771973 + }, + { + "auxiliary_loss_clip": 0.06429128, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06278329, + "balance_loss_mlp": 0.01254609, + "epoch": 0.5368405230722982, + "flos": 32161791381120.0, + "grad_norm": 1.7479222402462038, + "language_loss": 0.62972343, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.70666999, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10919189, + "step": 8929, + "time_per_iteration": 2.622292995452881 + }, + { + "auxiliary_loss_clip": 0.06432647, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.0628202, + "balance_loss_mlp": 0.01254433, + "epoch": 0.5369006463249661, + "flos": 26254778423040.0, + "grad_norm": 1.591710131173975, + "language_loss": 0.66400939, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.74098849, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10845947, + "step": 8930, + "time_per_iteration": 2.543949604034424 + }, + { + "auxiliary_loss_clip": 0.06424774, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06279226, + "balance_loss_mlp": 0.01253299, + "epoch": 0.5369607695776342, + "flos": 26218119461760.0, + "grad_norm": 1.4676781117198738, + "language_loss": 0.67308921, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.74998057, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1105957, + "step": 8931, + "time_per_iteration": 2.5630295276641846 + }, + { + "auxiliary_loss_clip": 0.06432625, + "auxiliary_loss_mlp": 0.01268662, + "balance_loss_clip": 0.0628577, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5370208928303021, + "flos": 25015920606720.0, + "grad_norm": 1.565512656212007, + "language_loss": 0.76494187, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.84195477, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12072754, + "step": 8932, + "time_per_iteration": 2.5423011779785156 + }, + { + "auxiliary_loss_clip": 0.0642775, + "auxiliary_loss_mlp": 0.01267942, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.01255723, + "epoch": 0.5370810160829701, + "flos": 31799646034560.0, + "grad_norm": 1.681669184165067, + "language_loss": 0.66588402, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.74284095, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12219238, + "step": 8933, + "time_per_iteration": 2.6461243629455566 + }, + { + "auxiliary_loss_clip": 0.0642833, + "auxiliary_loss_mlp": 0.01268413, + "balance_loss_clip": 0.06281729, + "balance_loss_mlp": 0.01256515, + "epoch": 0.537141139335638, + "flos": 23849500245120.0, + "grad_norm": 1.5934461108199862, + "language_loss": 0.83294082, + "learning_rate": 1.856606505975565e-06, + "loss": 0.90990818, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 8934, + "time_per_iteration": 2.5241549015045166 + }, + { + "auxiliary_loss_clip": 0.06428687, + "auxiliary_loss_mlp": 0.01267543, + "balance_loss_clip": 0.06283442, + "balance_loss_mlp": 0.01256033, + "epoch": 0.537201262588306, + "flos": 18513685370880.0, + "grad_norm": 1.6222709830765285, + "language_loss": 0.7995823, + "learning_rate": 1.856218049303999e-06, + "loss": 0.87654459, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11517334, + "step": 8935, + "time_per_iteration": 2.5692355632781982 + }, + { + "auxiliary_loss_clip": 0.06432107, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06282724, + "balance_loss_mlp": 0.01253556, + "epoch": 0.537261385840974, + "flos": 25669492853760.0, + "grad_norm": 4.395420873174801, + "language_loss": 0.83744997, + "learning_rate": 1.855829598084659e-06, + "loss": 0.91442859, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.12200928, + "step": 8936, + "time_per_iteration": 2.53723406791687 + }, + { + "auxiliary_loss_clip": 0.06430986, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06284051, + "balance_loss_mlp": 0.01255458, + "epoch": 0.537321509093642, + "flos": 40744656950400.0, + "grad_norm": 1.238966659536207, + "language_loss": 0.73065245, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.8076278, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11096191, + "step": 8937, + "time_per_iteration": 2.7185041904449463 + }, + { + "auxiliary_loss_clip": 0.06432244, + "auxiliary_loss_mlp": 0.01269226, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.01257591, + "epoch": 0.53738163234631, + "flos": 17244248014080.0, + "grad_norm": 2.3423795733880506, + "language_loss": 0.82399505, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.90100974, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11645508, + "step": 8938, + "time_per_iteration": 2.497788906097412 + }, + { + "auxiliary_loss_clip": 0.06440363, + "auxiliary_loss_mlp": 0.01269336, + "balance_loss_clip": 0.06284846, + "balance_loss_mlp": 0.01257505, + "epoch": 0.5374417555989779, + "flos": 12826710034560.0, + "grad_norm": 2.237788663184982, + "language_loss": 0.80566859, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.88276565, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.1184082, + "step": 8939, + "time_per_iteration": 2.506603479385376 + }, + { + "auxiliary_loss_clip": 0.06330699, + "auxiliary_loss_mlp": 0.01256495, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5375018788516459, + "flos": 67275502248960.0, + "grad_norm": 0.6889137998662954, + "language_loss": 0.5233649, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.59923685, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01609802, + "step": 8940, + "time_per_iteration": 3.1455881595611572 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06280527, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5375620021043138, + "flos": 18120080016000.0, + "grad_norm": 1.7572331791906293, + "language_loss": 0.71456778, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.7914663, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1083374, + "step": 8941, + "time_per_iteration": 3.9169673919677734 + }, + { + "auxiliary_loss_clip": 0.06423429, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01256554, + "epoch": 0.5376221253569818, + "flos": 23156166435840.0, + "grad_norm": 1.5985240277338788, + "language_loss": 0.79660439, + "learning_rate": 1.853499006090237e-06, + "loss": 0.87350607, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10174561, + "step": 8942, + "time_per_iteration": 2.5441763401031494 + }, + { + "auxiliary_loss_clip": 0.06433077, + "auxiliary_loss_mlp": 0.01269882, + "balance_loss_clip": 0.06281331, + "balance_loss_mlp": 0.01258229, + "epoch": 0.5376822486096497, + "flos": 29980240404480.0, + "grad_norm": 1.695957968467341, + "language_loss": 0.7061829, + "learning_rate": 1.853110593448911e-06, + "loss": 0.78321248, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.11645508, + "step": 8943, + "time_per_iteration": 2.5876903533935547 + }, + { + "auxiliary_loss_clip": 0.06327454, + "auxiliary_loss_mlp": 0.01255314, + "balance_loss_clip": 0.06264913, + "balance_loss_mlp": 0.0125356, + "epoch": 0.5377423718623178, + "flos": 54188139761280.0, + "grad_norm": 0.7834151101556619, + "language_loss": 0.59688759, + "learning_rate": 1.852722186377645e-06, + "loss": 0.67271525, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01757812, + "step": 8944, + "time_per_iteration": 4.5469114780426025 + }, + { + "auxiliary_loss_clip": 0.06439775, + "auxiliary_loss_mlp": 0.01267766, + "balance_loss_clip": 0.06283297, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5378024951149857, + "flos": 23263585770240.0, + "grad_norm": 2.6705245070619754, + "language_loss": 0.776173, + "learning_rate": 1.852333784891169e-06, + "loss": 0.85324842, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.11706543, + "step": 8945, + "time_per_iteration": 2.61606502532959 + }, + { + "auxiliary_loss_clip": 0.06428292, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06278516, + "balance_loss_mlp": 0.01252883, + "epoch": 0.5378626183676537, + "flos": 24030866407680.0, + "grad_norm": 1.7469475045380867, + "language_loss": 0.68958521, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.76650584, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.10888672, + "step": 8946, + "time_per_iteration": 2.6660590171813965 + }, + { + "auxiliary_loss_clip": 0.06427687, + "auxiliary_loss_mlp": 0.0126763, + "balance_loss_clip": 0.06282603, + "balance_loss_mlp": 0.01256704, + "epoch": 0.5379227416203216, + "flos": 27169072248960.0, + "grad_norm": 1.5118478086705984, + "language_loss": 0.77489585, + "learning_rate": 1.851556998731498e-06, + "loss": 0.85184896, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10925293, + "step": 8947, + "time_per_iteration": 2.618797779083252 + }, + { + "auxiliary_loss_clip": 0.06429853, + "auxiliary_loss_mlp": 0.0126878, + "balance_loss_clip": 0.06282403, + "balance_loss_mlp": 0.01257688, + "epoch": 0.5379828648729896, + "flos": 24688631358720.0, + "grad_norm": 1.962883252611848, + "language_loss": 0.60299599, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.6799823, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11090088, + "step": 8948, + "time_per_iteration": 3.99113392829895 + }, + { + "auxiliary_loss_clip": 0.06430186, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06282011, + "balance_loss_mlp": 0.01254629, + "epoch": 0.5380429881256577, + "flos": 22528981025280.0, + "grad_norm": 1.6036817147437437, + "language_loss": 0.7965849, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.87354112, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.10803223, + "step": 8949, + "time_per_iteration": 2.5306220054626465 + }, + { + "auxiliary_loss_clip": 0.06424635, + "auxiliary_loss_mlp": 0.01267697, + "balance_loss_clip": 0.06281022, + "balance_loss_mlp": 0.01256796, + "epoch": 0.5381031113783256, + "flos": 26986825618560.0, + "grad_norm": 1.5758786571118277, + "language_loss": 0.78447008, + "learning_rate": 1.850391861746111e-06, + "loss": 0.86139345, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10900879, + "step": 8950, + "time_per_iteration": 2.5665290355682373 + }, + { + "auxiliary_loss_clip": 0.0642289, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06281261, + "balance_loss_mlp": 0.01258793, + "epoch": 0.5381632346309936, + "flos": 24761026886400.0, + "grad_norm": 1.6449806756094487, + "language_loss": 0.72907847, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.80599785, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10253906, + "step": 8951, + "time_per_iteration": 2.5389561653137207 + }, + { + "auxiliary_loss_clip": 0.0643057, + "auxiliary_loss_mlp": 0.01265397, + "balance_loss_clip": 0.06280816, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5382233578836615, + "flos": 15565524589440.0, + "grad_norm": 1.8886102084278436, + "language_loss": 0.75767493, + "learning_rate": 1.849615132097085e-06, + "loss": 0.83463454, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10760498, + "step": 8952, + "time_per_iteration": 2.5009233951568604 + }, + { + "auxiliary_loss_clip": 0.06423527, + "auxiliary_loss_mlp": 0.01265834, + "balance_loss_clip": 0.0627749, + "balance_loss_mlp": 0.01254384, + "epoch": 0.5382834811363295, + "flos": 25091838005760.0, + "grad_norm": 1.352822721598185, + "language_loss": 0.79742837, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.87432194, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11456299, + "step": 8953, + "time_per_iteration": 2.5382277965545654 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01253193, + "epoch": 0.5383436043889974, + "flos": 13302983041920.0, + "grad_norm": 1.682075048645487, + "language_loss": 0.80507964, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.88193631, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10473633, + "step": 8954, + "time_per_iteration": 2.5006446838378906 + }, + { + "auxiliary_loss_clip": 0.06425533, + "auxiliary_loss_mlp": 0.01268977, + "balance_loss_clip": 0.06279075, + "balance_loss_mlp": 0.01258123, + "epoch": 0.5384037276416654, + "flos": 23046063770880.0, + "grad_norm": 2.297323300751636, + "language_loss": 0.77060652, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.84755164, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10858154, + "step": 8955, + "time_per_iteration": 2.5469982624053955 + }, + { + "auxiliary_loss_clip": 0.06422862, + "auxiliary_loss_mlp": 0.01268692, + "balance_loss_clip": 0.06278117, + "balance_loss_mlp": 0.01257624, + "epoch": 0.5384638508943334, + "flos": 20637389502720.0, + "grad_norm": 1.4766809485278785, + "language_loss": 0.78634906, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.86326456, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11071777, + "step": 8956, + "time_per_iteration": 3.9486958980560303 + }, + { + "auxiliary_loss_clip": 0.06328554, + "auxiliary_loss_mlp": 0.01254386, + "balance_loss_clip": 0.0626571, + "balance_loss_mlp": 0.01252584, + "epoch": 0.5385239741470014, + "flos": 66755820026880.0, + "grad_norm": 0.8475755828975666, + "language_loss": 0.63483834, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.71066773, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01797485, + "step": 8957, + "time_per_iteration": 3.0589206218719482 + }, + { + "auxiliary_loss_clip": 0.06326501, + "auxiliary_loss_mlp": 0.01256038, + "balance_loss_clip": 0.06263363, + "balance_loss_mlp": 0.01254215, + "epoch": 0.5385840973996693, + "flos": 64737466076160.0, + "grad_norm": 0.6942778211869604, + "language_loss": 0.51190817, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.58773351, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.01818848, + "step": 8958, + "time_per_iteration": 3.1954948902130127 + }, + { + "auxiliary_loss_clip": 0.06433147, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06283388, + "balance_loss_mlp": 0.01255189, + "epoch": 0.5386442206523373, + "flos": 26149161951360.0, + "grad_norm": 1.5085241385719446, + "language_loss": 0.77482343, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.85182357, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11669922, + "step": 8959, + "time_per_iteration": 2.595390558242798 + }, + { + "auxiliary_loss_clip": 0.06429408, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06280766, + "balance_loss_mlp": 0.01255269, + "epoch": 0.5387043439050052, + "flos": 18256401809280.0, + "grad_norm": 2.0832623304514373, + "language_loss": 0.84442693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.92138815, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11437988, + "step": 8960, + "time_per_iteration": 2.459411382675171 + }, + { + "auxiliary_loss_clip": 0.0642896, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06281836, + "balance_loss_mlp": 0.01254495, + "epoch": 0.5387644671576732, + "flos": 29795939349120.0, + "grad_norm": 1.5299241540989073, + "language_loss": 0.78738272, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.86432457, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1072998, + "step": 8961, + "time_per_iteration": 2.6379730701446533 + }, + { + "auxiliary_loss_clip": 0.06425574, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06280299, + "balance_loss_mlp": 0.01254106, + "epoch": 0.5388245904103413, + "flos": 22379661849600.0, + "grad_norm": 1.7063822520278231, + "language_loss": 0.85018182, + "learning_rate": 1.845731828364681e-06, + "loss": 0.92708838, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10980225, + "step": 8962, + "time_per_iteration": 2.495314359664917 + }, + { + "auxiliary_loss_clip": 0.06324032, + "auxiliary_loss_mlp": 0.01253937, + "balance_loss_clip": 0.06261306, + "balance_loss_mlp": 0.01252085, + "epoch": 0.5388847136630092, + "flos": 69827332417920.0, + "grad_norm": 0.7252434381461927, + "language_loss": 0.54196495, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.61774462, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.62792969, + "router_z_loss_mlp": 0.01847839, + "step": 8963, + "time_per_iteration": 3.0685930252075195 + }, + { + "auxiliary_loss_clip": 0.06319527, + "auxiliary_loss_mlp": 0.01253383, + "balance_loss_clip": 0.0625699, + "balance_loss_mlp": 0.01251595, + "epoch": 0.5389448369156772, + "flos": 69844270942080.0, + "grad_norm": 0.7817796987422422, + "language_loss": 0.62972116, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.7054503, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01786804, + "step": 8964, + "time_per_iteration": 3.2163538932800293 + }, + { + "auxiliary_loss_clip": 0.0643357, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06280617, + "balance_loss_mlp": 0.01255462, + "epoch": 0.5390049601683451, + "flos": 31730478888960.0, + "grad_norm": 1.575337207693627, + "language_loss": 0.70121396, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.77821916, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.11499023, + "step": 8965, + "time_per_iteration": 2.6127662658691406 + }, + { + "auxiliary_loss_clip": 0.06431293, + "auxiliary_loss_mlp": 0.01269597, + "balance_loss_clip": 0.06281815, + "balance_loss_mlp": 0.01258546, + "epoch": 0.5390650834210131, + "flos": 18119283402240.0, + "grad_norm": 2.027850604452939, + "language_loss": 0.82445288, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.90146178, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11047363, + "step": 8966, + "time_per_iteration": 2.472459554672241 + }, + { + "auxiliary_loss_clip": 0.06426321, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.06281838, + "balance_loss_mlp": 0.01256326, + "epoch": 0.539125206673681, + "flos": 17421798816000.0, + "grad_norm": 2.5704499610569282, + "language_loss": 0.72936428, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.80630052, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10980225, + "step": 8967, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.06424848, + "auxiliary_loss_mlp": 0.01264578, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01254493, + "epoch": 0.539185329926349, + "flos": 22205255575680.0, + "grad_norm": 1.5589784366040595, + "language_loss": 0.81895125, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.89584547, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10083008, + "step": 8968, + "time_per_iteration": 2.5401480197906494 + }, + { + "auxiliary_loss_clip": 0.06428899, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.0628034, + "balance_loss_mlp": 0.01254118, + "epoch": 0.539245453179017, + "flos": 21440867904000.0, + "grad_norm": 1.4575649765742498, + "language_loss": 0.74243855, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.81938505, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11633301, + "step": 8969, + "time_per_iteration": 2.553879976272583 + }, + { + "auxiliary_loss_clip": 0.06430885, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06278199, + "balance_loss_mlp": 0.01254214, + "epoch": 0.539305576431685, + "flos": 20740322643840.0, + "grad_norm": 2.1595830648072347, + "language_loss": 0.827712, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.90467674, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.1137085, + "step": 8970, + "time_per_iteration": 2.478726863861084 + }, + { + "auxiliary_loss_clip": 0.06422678, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06278254, + "balance_loss_mlp": 0.01253185, + "epoch": 0.5393656996843529, + "flos": 30928467934080.0, + "grad_norm": 1.400352356553148, + "language_loss": 0.75607336, + "learning_rate": 1.842237354749146e-06, + "loss": 0.83293688, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1048584, + "step": 8971, + "time_per_iteration": 2.5901689529418945 + }, + { + "auxiliary_loss_clip": 0.06318198, + "auxiliary_loss_mlp": 0.01253533, + "balance_loss_clip": 0.06255443, + "balance_loss_mlp": 0.0125168, + "epoch": 0.5394258229370209, + "flos": 50332953260160.0, + "grad_norm": 0.8588377208931133, + "language_loss": 0.60451257, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.68022978, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.62841797, + "router_z_loss_mlp": 0.01847839, + "step": 8972, + "time_per_iteration": 3.1413605213165283 + }, + { + "auxiliary_loss_clip": 0.06426257, + "auxiliary_loss_mlp": 0.01269177, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01257918, + "epoch": 0.5394859461896888, + "flos": 25419169180800.0, + "grad_norm": 1.5980875117754325, + "language_loss": 0.787233, + "learning_rate": 1.841460870485045e-06, + "loss": 0.8641873, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.1126709, + "step": 8973, + "time_per_iteration": 2.5336296558380127 + }, + { + "auxiliary_loss_clip": 0.06433228, + "auxiliary_loss_mlp": 0.01267524, + "balance_loss_clip": 0.06279569, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5395460694423568, + "flos": 25484646746880.0, + "grad_norm": 1.7949926655699973, + "language_loss": 0.7381959, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.81520343, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12078857, + "step": 8974, + "time_per_iteration": 2.5483648777008057 + }, + { + "auxiliary_loss_clip": 0.06318444, + "auxiliary_loss_mlp": 0.01253276, + "balance_loss_clip": 0.06255525, + "balance_loss_mlp": 0.01251373, + "epoch": 0.5396061926950249, + "flos": 53267305317120.0, + "grad_norm": 0.7276638901828621, + "language_loss": 0.50946128, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.58517849, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.01899719, + "step": 8975, + "time_per_iteration": 3.125056028366089 + }, + { + "auxiliary_loss_clip": 0.06423691, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06277017, + "balance_loss_mlp": 0.01253215, + "epoch": 0.5396663159476928, + "flos": 26732476949760.0, + "grad_norm": 1.546051077066994, + "language_loss": 0.72722358, + "learning_rate": 1.840296189214344e-06, + "loss": 0.80410993, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11724854, + "step": 8976, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06424834, + "auxiliary_loss_mlp": 0.01268763, + "balance_loss_clip": 0.06278136, + "balance_loss_mlp": 0.01257999, + "epoch": 0.5397264392003608, + "flos": 23259267285120.0, + "grad_norm": 1.9541916066514684, + "language_loss": 0.70649612, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.78343207, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10766602, + "step": 8977, + "time_per_iteration": 2.5443131923675537 + }, + { + "auxiliary_loss_clip": 0.06428454, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256691, + "epoch": 0.5397865624530287, + "flos": 18299727169920.0, + "grad_norm": 1.8457096410810847, + "language_loss": 0.72901827, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.80597985, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11016846, + "step": 8978, + "time_per_iteration": 2.511715888977051 + }, + { + "auxiliary_loss_clip": 0.06434547, + "auxiliary_loss_mlp": 0.01269171, + "balance_loss_clip": 0.0627895, + "balance_loss_mlp": 0.01256821, + "epoch": 0.5398466857056967, + "flos": 15301742336640.0, + "grad_norm": 1.7083695222951265, + "language_loss": 0.74513042, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.82216758, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.12347412, + "step": 8979, + "time_per_iteration": 2.4654295444488525 + }, + { + "auxiliary_loss_clip": 0.06435215, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.062815, + "balance_loss_mlp": 0.0125551, + "epoch": 0.5399068089583646, + "flos": 17827521085440.0, + "grad_norm": 2.1729763122828567, + "language_loss": 0.77298462, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.85001791, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12609863, + "step": 8980, + "time_per_iteration": 2.5131070613861084 + }, + { + "auxiliary_loss_clip": 0.06428653, + "auxiliary_loss_mlp": 0.01266817, + "balance_loss_clip": 0.06278711, + "balance_loss_mlp": 0.01256202, + "epoch": 0.5399669322110326, + "flos": 27389109870720.0, + "grad_norm": 1.7146505379249901, + "language_loss": 0.82213032, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.89908504, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10626221, + "step": 8981, + "time_per_iteration": 4.00026273727417 + }, + { + "auxiliary_loss_clip": 0.06430832, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06279931, + "balance_loss_mlp": 0.01255292, + "epoch": 0.5400270554637006, + "flos": 20455394434560.0, + "grad_norm": 1.8197401655909293, + "language_loss": 0.67626458, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.75323975, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11395264, + "step": 8982, + "time_per_iteration": 2.7018609046936035 + }, + { + "auxiliary_loss_clip": 0.06430931, + "auxiliary_loss_mlp": 0.01272335, + "balance_loss_clip": 0.06282471, + "balance_loss_mlp": 0.0126123, + "epoch": 0.5400871787163686, + "flos": 21696055113600.0, + "grad_norm": 1.5105940902505235, + "language_loss": 0.82925522, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.90628791, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11108398, + "step": 8983, + "time_per_iteration": 4.0147035121917725 + }, + { + "auxiliary_loss_clip": 0.06427681, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06280811, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5401473019690365, + "flos": 19210163708160.0, + "grad_norm": 2.5381589556683752, + "language_loss": 0.70748949, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.78442466, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11608887, + "step": 8984, + "time_per_iteration": 2.485203742980957 + }, + { + "auxiliary_loss_clip": 0.06436664, + "auxiliary_loss_mlp": 0.01270492, + "balance_loss_clip": 0.06283301, + "balance_loss_mlp": 0.01258702, + "epoch": 0.5402074252217045, + "flos": 20632987163520.0, + "grad_norm": 1.6283776116809212, + "language_loss": 0.80336136, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.88043296, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.11798096, + "step": 8985, + "time_per_iteration": 2.5176138877868652 + }, + { + "auxiliary_loss_clip": 0.06421156, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06279361, + "balance_loss_mlp": 0.01255497, + "epoch": 0.5402675484743724, + "flos": 24980519456640.0, + "grad_norm": 1.4261046169392377, + "language_loss": 0.79538441, + "learning_rate": 1.83641431418363e-06, + "loss": 0.87226146, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11053467, + "step": 8986, + "time_per_iteration": 2.528057098388672 + }, + { + "auxiliary_loss_clip": 0.06426872, + "auxiliary_loss_mlp": 0.01269311, + "balance_loss_clip": 0.06277602, + "balance_loss_mlp": 0.01258636, + "epoch": 0.5403276717270404, + "flos": 19464302741760.0, + "grad_norm": 1.7453745991771563, + "language_loss": 0.77310205, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.85006386, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.10681152, + "step": 8987, + "time_per_iteration": 3.9355413913726807 + }, + { + "auxiliary_loss_clip": 0.06426796, + "auxiliary_loss_mlp": 0.01265394, + "balance_loss_clip": 0.06278582, + "balance_loss_mlp": 0.01254147, + "epoch": 0.5403877949797083, + "flos": 18448040096640.0, + "grad_norm": 1.594164869128485, + "language_loss": 0.70988709, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.78680897, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11248779, + "step": 8988, + "time_per_iteration": 2.529665470123291 + }, + { + "auxiliary_loss_clip": 0.06432524, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06283048, + "balance_loss_mlp": 0.0125528, + "epoch": 0.5404479182323764, + "flos": 28300343022720.0, + "grad_norm": 2.353153070088846, + "language_loss": 0.68308997, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.76008058, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11260986, + "step": 8989, + "time_per_iteration": 2.541705846786499 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06277242, + "balance_loss_mlp": 0.01255071, + "epoch": 0.5405080414850444, + "flos": 23373981924480.0, + "grad_norm": 1.5774927452360248, + "language_loss": 0.77866185, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.85559022, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12091064, + "step": 8990, + "time_per_iteration": 2.570016384124756 + }, + { + "auxiliary_loss_clip": 0.06423812, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01254517, + "epoch": 0.5405681647377123, + "flos": 21112907823360.0, + "grad_norm": 1.4794826200904196, + "language_loss": 0.69081038, + "learning_rate": 1.834473608367745e-06, + "loss": 0.76769722, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10357666, + "step": 8991, + "time_per_iteration": 2.491284132003784 + }, + { + "auxiliary_loss_clip": 0.06430428, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06280528, + "balance_loss_mlp": 0.01256598, + "epoch": 0.5406282879903803, + "flos": 20455478288640.0, + "grad_norm": 1.6151673604367662, + "language_loss": 0.76260269, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.83958906, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.11621094, + "step": 8992, + "time_per_iteration": 2.506131649017334 + }, + { + "auxiliary_loss_clip": 0.06429817, + "auxiliary_loss_mlp": 0.01266516, + "balance_loss_clip": 0.06278399, + "balance_loss_mlp": 0.01255871, + "epoch": 0.5406884112430482, + "flos": 14214635464320.0, + "grad_norm": 2.867003800231527, + "language_loss": 0.7616564, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.83861977, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.10644531, + "step": 8993, + "time_per_iteration": 2.5104384422302246 + }, + { + "auxiliary_loss_clip": 0.06425033, + "auxiliary_loss_mlp": 0.01270182, + "balance_loss_clip": 0.06278533, + "balance_loss_mlp": 0.01259024, + "epoch": 0.5407485344957162, + "flos": 23881882648320.0, + "grad_norm": 1.5714876378286171, + "language_loss": 0.70600474, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.78295696, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11151123, + "step": 8994, + "time_per_iteration": 2.557224988937378 + }, + { + "auxiliary_loss_clip": 0.06430587, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06279735, + "balance_loss_mlp": 0.01255397, + "epoch": 0.5408086577483842, + "flos": 23155118259840.0, + "grad_norm": 1.7868138082728735, + "language_loss": 0.7559076, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.83288407, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11657715, + "step": 8995, + "time_per_iteration": 4.038757085800171 + }, + { + "auxiliary_loss_clip": 0.06426084, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06281247, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5408687810010522, + "flos": 18777090280320.0, + "grad_norm": 1.7506118703188027, + "language_loss": 0.73407996, + "learning_rate": 1.832533059471282e-06, + "loss": 0.81100416, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.105896, + "step": 8996, + "time_per_iteration": 2.4787185192108154 + }, + { + "auxiliary_loss_clip": 0.06423852, + "auxiliary_loss_mlp": 0.01266299, + "balance_loss_clip": 0.06280176, + "balance_loss_mlp": 0.01254801, + "epoch": 0.5409289042537201, + "flos": 13886717310720.0, + "grad_norm": 1.8157411884483814, + "language_loss": 0.73422438, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.81112587, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11499023, + "step": 8997, + "time_per_iteration": 2.5067830085754395 + }, + { + "auxiliary_loss_clip": 0.0643085, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06281897, + "balance_loss_mlp": 0.01256802, + "epoch": 0.5409890275063881, + "flos": 14470619287680.0, + "grad_norm": 2.2163933004413625, + "language_loss": 0.72107315, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.79805827, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.10870361, + "step": 8998, + "time_per_iteration": 2.499892234802246 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.0627818, + "balance_loss_mlp": 0.01255281, + "epoch": 0.541049150759056, + "flos": 48987906721920.0, + "grad_norm": 1.4223172525448995, + "language_loss": 0.7060768, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.78298652, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11004639, + "step": 8999, + "time_per_iteration": 2.75883412361145 + }, + { + "auxiliary_loss_clip": 0.06424989, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06280144, + "balance_loss_mlp": 0.01255818, + "epoch": 0.541109274011724, + "flos": 18153007470720.0, + "grad_norm": 3.0241903502045884, + "language_loss": 0.8099103, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.88683468, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11639404, + "step": 9000, + "time_per_iteration": 2.4591987133026123 + }, + { + "auxiliary_loss_clip": 0.06425589, + "auxiliary_loss_mlp": 0.01267626, + "balance_loss_clip": 0.0628029, + "balance_loss_mlp": 0.01256438, + "epoch": 0.541169397264392, + "flos": 20528921992320.0, + "grad_norm": 1.444857324942775, + "language_loss": 0.73542678, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.81235898, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11193848, + "step": 9001, + "time_per_iteration": 2.5392372608184814 + }, + { + "auxiliary_loss_clip": 0.06428811, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.0627747, + "balance_loss_mlp": 0.01256489, + "epoch": 0.54122952051706, + "flos": 20049630238080.0, + "grad_norm": 2.1661909625933675, + "language_loss": 0.85214329, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.92911184, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.11560059, + "step": 9002, + "time_per_iteration": 2.4666826725006104 + }, + { + "auxiliary_loss_clip": 0.06425083, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06280569, + "balance_loss_mlp": 0.01253792, + "epoch": 0.541289643769728, + "flos": 19068223691520.0, + "grad_norm": 1.8644067392145132, + "language_loss": 0.78467226, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.86156201, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10101318, + "step": 9003, + "time_per_iteration": 2.536766767501831 + }, + { + "auxiliary_loss_clip": 0.06424496, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06279116, + "balance_loss_mlp": 0.01253005, + "epoch": 0.5413497670223959, + "flos": 22388801944320.0, + "grad_norm": 1.7504010601062234, + "language_loss": 0.69487125, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.77175444, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1081543, + "step": 9004, + "time_per_iteration": 2.522757053375244 + }, + { + "auxiliary_loss_clip": 0.06323519, + "auxiliary_loss_mlp": 0.01256562, + "balance_loss_clip": 0.0626113, + "balance_loss_mlp": 0.01254622, + "epoch": 0.5414098902750639, + "flos": 70052149722240.0, + "grad_norm": 0.9317133774182984, + "language_loss": 0.58728683, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.66308761, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01937866, + "step": 9005, + "time_per_iteration": 3.227922201156616 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06276372, + "balance_loss_mlp": 0.01254477, + "epoch": 0.5414700135277318, + "flos": 21805445018880.0, + "grad_norm": 2.0206216562473416, + "language_loss": 0.78202778, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.85894328, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.10510254, + "step": 9006, + "time_per_iteration": 2.557199001312256 + }, + { + "auxiliary_loss_clip": 0.06423091, + "auxiliary_loss_mlp": 0.01269943, + "balance_loss_clip": 0.06278808, + "balance_loss_mlp": 0.01259965, + "epoch": 0.5415301367803999, + "flos": 16913269186560.0, + "grad_norm": 3.052189299631263, + "language_loss": 0.8345896, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.91152, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.09979248, + "step": 9007, + "time_per_iteration": 2.5309536457061768 + }, + { + "auxiliary_loss_clip": 0.06427018, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06280112, + "balance_loss_mlp": 0.01254089, + "epoch": 0.5415902600330678, + "flos": 25711518476160.0, + "grad_norm": 1.8242309219870276, + "language_loss": 0.67383778, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.750763, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11401367, + "step": 9008, + "time_per_iteration": 2.5476038455963135 + }, + { + "auxiliary_loss_clip": 0.0643273, + "auxiliary_loss_mlp": 0.01271282, + "balance_loss_clip": 0.06281075, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5416503832857358, + "flos": 19214146776960.0, + "grad_norm": 1.9758514689639541, + "language_loss": 0.7415235, + "learning_rate": 1.827488379924234e-06, + "loss": 0.81856364, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11846924, + "step": 9009, + "time_per_iteration": 2.519923448562622 + }, + { + "auxiliary_loss_clip": 0.06433536, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.0628282, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5417105065384037, + "flos": 12718619867520.0, + "grad_norm": 2.008927815850951, + "language_loss": 0.88025904, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.95727038, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11907959, + "step": 9010, + "time_per_iteration": 2.4986653327941895 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266313, + "balance_loss_clip": 0.06279215, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5417706297910717, + "flos": 30343727416320.0, + "grad_norm": 1.9869037800658418, + "language_loss": 0.64700162, + "learning_rate": 1.826712372694122e-06, + "loss": 0.72391802, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10693359, + "step": 9011, + "time_per_iteration": 2.639526605606079 + }, + { + "auxiliary_loss_clip": 0.06426919, + "auxiliary_loss_mlp": 0.0126718, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5418307530437396, + "flos": 29028323295360.0, + "grad_norm": 2.488283502034593, + "language_loss": 0.79704046, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.87398142, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1081543, + "step": 9012, + "time_per_iteration": 2.546048641204834 + }, + { + "auxiliary_loss_clip": 0.06429458, + "auxiliary_loss_mlp": 0.01265294, + "balance_loss_clip": 0.06280975, + "balance_loss_mlp": 0.01254464, + "epoch": 0.5418908762964076, + "flos": 16879125847680.0, + "grad_norm": 2.3471098958204712, + "language_loss": 0.74353266, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.82048023, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10839844, + "step": 9013, + "time_per_iteration": 2.544989585876465 + }, + { + "auxiliary_loss_clip": 0.06429175, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01254806, + "epoch": 0.5419509995490756, + "flos": 18955144206720.0, + "grad_norm": 2.592240526053277, + "language_loss": 0.72416294, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.80111116, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.10852051, + "step": 9014, + "time_per_iteration": 2.4757673740386963 + }, + { + "auxiliary_loss_clip": 0.06427553, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06280749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.5420111228017436, + "flos": 18083630689920.0, + "grad_norm": 1.4576837239395228, + "language_loss": 0.80686474, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.88381469, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11010742, + "step": 9015, + "time_per_iteration": 2.50618839263916 + }, + { + "auxiliary_loss_clip": 0.06436689, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 0.06286176, + "balance_loss_mlp": 0.01259061, + "epoch": 0.5420712460544116, + "flos": 19067678640000.0, + "grad_norm": 2.2120132338352105, + "language_loss": 0.81892127, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.8959893, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11053467, + "step": 9016, + "time_per_iteration": 2.475426197052002 + }, + { + "auxiliary_loss_clip": 0.06424853, + "auxiliary_loss_mlp": 0.01269653, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01259258, + "epoch": 0.5421313693070795, + "flos": 18193020595200.0, + "grad_norm": 1.7396358642065415, + "language_loss": 0.81981838, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.89676344, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10388184, + "step": 9017, + "time_per_iteration": 2.4966297149658203 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5421914925597475, + "flos": 13010969162880.0, + "grad_norm": 1.7307795983641447, + "language_loss": 0.77940953, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.85629702, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11193848, + "step": 9018, + "time_per_iteration": 2.4861438274383545 + }, + { + "auxiliary_loss_clip": 0.0643111, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.06279995, + "balance_loss_mlp": 0.01253557, + "epoch": 0.5422516158124154, + "flos": 46769654856960.0, + "grad_norm": 1.436078593305458, + "language_loss": 0.66629684, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.7432512, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.10772705, + "step": 9019, + "time_per_iteration": 2.793942928314209 + }, + { + "auxiliary_loss_clip": 0.06420586, + "auxiliary_loss_mlp": 0.01266098, + "balance_loss_clip": 0.06277826, + "balance_loss_mlp": 0.01256627, + "epoch": 0.5423117390650835, + "flos": 31766634725760.0, + "grad_norm": 1.5531318778473993, + "language_loss": 0.69972849, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.77659535, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.0947876, + "step": 9020, + "time_per_iteration": 3.977450132369995 + }, + { + "auxiliary_loss_clip": 0.0642193, + "auxiliary_loss_mlp": 0.01268231, + "balance_loss_clip": 0.0627913, + "balance_loss_mlp": 0.01257586, + "epoch": 0.5423718623177514, + "flos": 27209881987200.0, + "grad_norm": 1.41400284004279, + "language_loss": 0.80270976, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.87961137, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10650635, + "step": 9021, + "time_per_iteration": 2.5875015258789062 + }, + { + "auxiliary_loss_clip": 0.06426784, + "auxiliary_loss_mlp": 0.0126779, + "balance_loss_clip": 0.0628023, + "balance_loss_mlp": 0.01257162, + "epoch": 0.5424319855704194, + "flos": 23552580902400.0, + "grad_norm": 2.7424242746142298, + "language_loss": 0.78868818, + "learning_rate": 1.822444805916788e-06, + "loss": 0.86563396, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10626221, + "step": 9022, + "time_per_iteration": 2.6569435596466064 + }, + { + "auxiliary_loss_clip": 0.06421105, + "auxiliary_loss_mlp": 0.01267956, + "balance_loss_clip": 0.06275026, + "balance_loss_mlp": 0.01257132, + "epoch": 0.5424921088230873, + "flos": 26623003190400.0, + "grad_norm": 2.014349133750916, + "language_loss": 0.82876647, + "learning_rate": 1.822056885403915e-06, + "loss": 0.90565705, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10827637, + "step": 9023, + "time_per_iteration": 4.035135746002197 + }, + { + "auxiliary_loss_clip": 0.06427208, + "auxiliary_loss_mlp": 0.01266773, + "balance_loss_clip": 0.06280831, + "balance_loss_mlp": 0.01256718, + "epoch": 0.5425522320757553, + "flos": 23593600275840.0, + "grad_norm": 1.5793438869499181, + "language_loss": 0.71421236, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.79115218, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10058594, + "step": 9024, + "time_per_iteration": 2.540205717086792 + }, + { + "auxiliary_loss_clip": 0.06424701, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5426123553284232, + "flos": 30600256291200.0, + "grad_norm": 1.6177082091395079, + "language_loss": 0.65074164, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.72763383, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.10601807, + "step": 9025, + "time_per_iteration": 2.6120383739471436 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06278306, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5426724785810912, + "flos": 12500049692160.0, + "grad_norm": 9.095866287209772, + "language_loss": 0.73753297, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.81451309, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.10681152, + "step": 9026, + "time_per_iteration": 2.47986102104187 + }, + { + "auxiliary_loss_clip": 0.06430142, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06282182, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5427326018337592, + "flos": 26071273981440.0, + "grad_norm": 2.23504413576904, + "language_loss": 0.78765059, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.8646462, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12432861, + "step": 9027, + "time_per_iteration": 3.9859650135040283 + }, + { + "auxiliary_loss_clip": 0.06320234, + "auxiliary_loss_mlp": 0.01252608, + "balance_loss_clip": 0.06257887, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5427927250864272, + "flos": 66004974789120.0, + "grad_norm": 0.7416092139326844, + "language_loss": 0.56562424, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.64135265, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01921082, + "step": 9028, + "time_per_iteration": 3.155468702316284 + }, + { + "auxiliary_loss_clip": 0.06432774, + "auxiliary_loss_mlp": 0.01272049, + "balance_loss_clip": 0.06283672, + "balance_loss_mlp": 0.01260158, + "epoch": 0.5428528483390952, + "flos": 19981678976640.0, + "grad_norm": 2.1493249613849015, + "language_loss": 0.78262091, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.85966909, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11883545, + "step": 9029, + "time_per_iteration": 2.59745192527771 + }, + { + "auxiliary_loss_clip": 0.06422626, + "auxiliary_loss_mlp": 0.0127098, + "balance_loss_clip": 0.06278758, + "balance_loss_mlp": 0.01260108, + "epoch": 0.5429129715917631, + "flos": 21838288619520.0, + "grad_norm": 1.5330300742008836, + "language_loss": 0.83522928, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.9121654, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10870361, + "step": 9030, + "time_per_iteration": 2.579742670059204 + }, + { + "auxiliary_loss_clip": 0.06426223, + "auxiliary_loss_mlp": 0.01263686, + "balance_loss_clip": 0.06282306, + "balance_loss_mlp": 0.01252903, + "epoch": 0.5429730948444311, + "flos": 27790178238720.0, + "grad_norm": 1.5430505390577234, + "language_loss": 0.75487745, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.8317765, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10784912, + "step": 9031, + "time_per_iteration": 2.5645737648010254 + }, + { + "auxiliary_loss_clip": 0.06421311, + "auxiliary_loss_mlp": 0.01265953, + "balance_loss_clip": 0.0628026, + "balance_loss_mlp": 0.01256226, + "epoch": 0.543033218097099, + "flos": 26767668464640.0, + "grad_norm": 1.6242541501700514, + "language_loss": 0.85659242, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.933465, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.097229, + "step": 9032, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.06434417, + "auxiliary_loss_mlp": 0.01272349, + "balance_loss_clip": 0.06282632, + "balance_loss_mlp": 0.01260815, + "epoch": 0.5430933413497671, + "flos": 22681989780480.0, + "grad_norm": 1.5840496509982642, + "language_loss": 0.74130201, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.81836969, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11535645, + "step": 9033, + "time_per_iteration": 2.546196937561035 + }, + { + "auxiliary_loss_clip": 0.06426211, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 0.06281157, + "balance_loss_mlp": 0.01256569, + "epoch": 0.543153464602435, + "flos": 24614307187200.0, + "grad_norm": 1.5750334880362715, + "language_loss": 0.76250172, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.83944499, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11535645, + "step": 9034, + "time_per_iteration": 2.5637965202331543 + }, + { + "auxiliary_loss_clip": 0.0642693, + "auxiliary_loss_mlp": 0.0126457, + "balance_loss_clip": 0.06282238, + "balance_loss_mlp": 0.01254002, + "epoch": 0.543213587855103, + "flos": 19031690511360.0, + "grad_norm": 1.6968779523598936, + "language_loss": 0.84307218, + "learning_rate": 1.817402369770655e-06, + "loss": 0.91998708, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10583496, + "step": 9035, + "time_per_iteration": 4.028722524642944 + }, + { + "auxiliary_loss_clip": 0.063224, + "auxiliary_loss_mlp": 0.01251692, + "balance_loss_clip": 0.06260421, + "balance_loss_mlp": 0.01250003, + "epoch": 0.5432737111077709, + "flos": 65705539824000.0, + "grad_norm": 0.6842717349937131, + "language_loss": 0.55272961, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.62847054, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.61816406, + "router_z_loss_mlp": 0.01693726, + "step": 9036, + "time_per_iteration": 3.117825746536255 + }, + { + "auxiliary_loss_clip": 0.06427496, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06278114, + "balance_loss_mlp": 0.0125423, + "epoch": 0.5433338343604389, + "flos": 22098339365760.0, + "grad_norm": 1.6522952339212897, + "language_loss": 0.75599706, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.83293271, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1184082, + "step": 9037, + "time_per_iteration": 2.520371913909912 + }, + { + "auxiliary_loss_clip": 0.06428872, + "auxiliary_loss_mlp": 0.01263373, + "balance_loss_clip": 0.06282881, + "balance_loss_mlp": 0.01252561, + "epoch": 0.5433939576131068, + "flos": 34680316752000.0, + "grad_norm": 1.5920545337485463, + "language_loss": 0.66775727, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.74467969, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1081543, + "step": 9038, + "time_per_iteration": 2.6492366790771484 + }, + { + "auxiliary_loss_clip": 0.06424891, + "auxiliary_loss_mlp": 0.01265017, + "balance_loss_clip": 0.06279261, + "balance_loss_mlp": 0.01254395, + "epoch": 0.5434540808657748, + "flos": 20309639057280.0, + "grad_norm": 2.8075357913922687, + "language_loss": 0.78373635, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.8606354, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10626221, + "step": 9039, + "time_per_iteration": 2.528156280517578 + }, + { + "auxiliary_loss_clip": 0.06428317, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_clip": 0.06281251, + "balance_loss_mlp": 0.01258677, + "epoch": 0.5435142041184428, + "flos": 23119549401600.0, + "grad_norm": 1.7481925172590123, + "language_loss": 0.76885521, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11029053, + "step": 9040, + "time_per_iteration": 2.5517256259918213 + }, + { + "auxiliary_loss_clip": 0.06319717, + "auxiliary_loss_mlp": 0.01257021, + "balance_loss_clip": 0.06257772, + "balance_loss_mlp": 0.01255075, + "epoch": 0.5435743273711108, + "flos": 64032350768640.0, + "grad_norm": 0.6699998863594594, + "language_loss": 0.52323502, + "learning_rate": 1.815075484268074e-06, + "loss": 0.59900236, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.61865234, + "router_z_loss_mlp": 0.0194397, + "step": 9041, + "time_per_iteration": 3.166306972503662 + }, + { + "auxiliary_loss_clip": 0.06428386, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06280383, + "balance_loss_mlp": 0.01254687, + "epoch": 0.5436344506237788, + "flos": 25125897490560.0, + "grad_norm": 1.7575616905304456, + "language_loss": 0.762761, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.83969998, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.10821533, + "step": 9042, + "time_per_iteration": 2.5450282096862793 + }, + { + "auxiliary_loss_clip": 0.0642225, + "auxiliary_loss_mlp": 0.01265245, + "balance_loss_clip": 0.06278253, + "balance_loss_mlp": 0.01254176, + "epoch": 0.5436945738764467, + "flos": 19579017381120.0, + "grad_norm": 2.3576554691894054, + "language_loss": 0.6770978, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.75397277, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11065674, + "step": 9043, + "time_per_iteration": 2.5310070514678955 + }, + { + "auxiliary_loss_clip": 0.06421092, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01256065, + "epoch": 0.5437546971291147, + "flos": 21148937879040.0, + "grad_norm": 1.5176966924106092, + "language_loss": 0.84091616, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.91779459, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10699463, + "step": 9044, + "time_per_iteration": 2.4937691688537598 + }, + { + "auxiliary_loss_clip": 0.06427783, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01256056, + "epoch": 0.5438148203817826, + "flos": 25125645928320.0, + "grad_norm": 1.559720453478778, + "language_loss": 0.62531364, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.70227116, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.11914062, + "step": 9045, + "time_per_iteration": 2.558842182159424 + }, + { + "auxiliary_loss_clip": 0.06425174, + "auxiliary_loss_mlp": 0.01267999, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01257312, + "epoch": 0.5438749436344507, + "flos": 23009614444800.0, + "grad_norm": 1.4475609839642107, + "language_loss": 0.70189548, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.77882719, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.10687256, + "step": 9046, + "time_per_iteration": 2.546400785446167 + }, + { + "auxiliary_loss_clip": 0.06422587, + "auxiliary_loss_mlp": 0.01263416, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01253087, + "epoch": 0.5439350668871186, + "flos": 15492458229120.0, + "grad_norm": 1.7829079763234368, + "language_loss": 0.77310658, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.84996659, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10333252, + "step": 9047, + "time_per_iteration": 2.5223042964935303 + }, + { + "auxiliary_loss_clip": 0.06424624, + "auxiliary_loss_mlp": 0.01269137, + "balance_loss_clip": 0.06278106, + "balance_loss_mlp": 0.01257598, + "epoch": 0.5439951901397866, + "flos": 17244164160000.0, + "grad_norm": 2.1796692597227363, + "language_loss": 0.73181236, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.80874991, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11547852, + "step": 9048, + "time_per_iteration": 2.4901275634765625 + }, + { + "auxiliary_loss_clip": 0.06419719, + "auxiliary_loss_mlp": 0.01268414, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256773, + "epoch": 0.5440553133924545, + "flos": 18666945688320.0, + "grad_norm": 2.2913555210162535, + "language_loss": 0.93342638, + "learning_rate": 1.8119733635055076e-06, + "loss": 1.01030767, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11639404, + "step": 9049, + "time_per_iteration": 2.5185091495513916 + }, + { + "auxiliary_loss_clip": 0.0641875, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01257155, + "epoch": 0.5441154366451225, + "flos": 27129813811200.0, + "grad_norm": 1.6778604645700708, + "language_loss": 0.74161297, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.81847489, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10284424, + "step": 9050, + "time_per_iteration": 2.551227331161499 + }, + { + "auxiliary_loss_clip": 0.06424956, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01257684, + "epoch": 0.5441755598977904, + "flos": 26000890951680.0, + "grad_norm": 1.7704942450323604, + "language_loss": 0.67003465, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.74696958, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10852051, + "step": 9051, + "time_per_iteration": 2.586360454559326 + }, + { + "auxiliary_loss_clip": 0.06422283, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06277864, + "balance_loss_mlp": 0.01253629, + "epoch": 0.5442356831504584, + "flos": 32388327694080.0, + "grad_norm": 1.6805683860476124, + "language_loss": 0.68003166, + "learning_rate": 1.810810185460011e-06, + "loss": 0.75689662, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10583496, + "step": 9052, + "time_per_iteration": 2.595308303833008 + }, + { + "auxiliary_loss_clip": 0.0642236, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.06275343, + "balance_loss_mlp": 0.01255413, + "epoch": 0.5442958064031264, + "flos": 24170123093760.0, + "grad_norm": 1.9713868762163456, + "language_loss": 0.93283188, + "learning_rate": 1.810422473773436e-06, + "loss": 1.0097276, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9053, + "time_per_iteration": 2.5700409412384033 + }, + { + "auxiliary_loss_clip": 0.06427357, + "auxiliary_loss_mlp": 0.0127068, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01258509, + "epoch": 0.5443559296557944, + "flos": 18769669194240.0, + "grad_norm": 1.9808667763978582, + "language_loss": 0.83683395, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.91381431, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1217041, + "step": 9054, + "time_per_iteration": 2.4873886108398438 + }, + { + "auxiliary_loss_clip": 0.06424912, + "auxiliary_loss_mlp": 0.01271948, + "balance_loss_clip": 0.06277627, + "balance_loss_mlp": 0.01260021, + "epoch": 0.5444160529084624, + "flos": 22638245149440.0, + "grad_norm": 1.9496494567304603, + "language_loss": 0.68541598, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.76238453, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11920166, + "step": 9055, + "time_per_iteration": 2.5629093647003174 + }, + { + "auxiliary_loss_clip": 0.06326497, + "auxiliary_loss_mlp": 0.01261063, + "balance_loss_clip": 0.06264114, + "balance_loss_mlp": 0.01259381, + "epoch": 0.5444761761611303, + "flos": 69693106976640.0, + "grad_norm": 0.7193405715621726, + "language_loss": 0.57599837, + "learning_rate": 1.80925938190531e-06, + "loss": 0.65187401, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01686096, + "step": 9056, + "time_per_iteration": 3.1249008178710938 + }, + { + "auxiliary_loss_clip": 0.06428131, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06279279, + "balance_loss_mlp": 0.01255676, + "epoch": 0.5445362994137983, + "flos": 14282922142080.0, + "grad_norm": 1.7879789013056906, + "language_loss": 0.69611216, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.77306819, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11798096, + "step": 9057, + "time_per_iteration": 2.498568296432495 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01267238, + "balance_loss_clip": 0.06281108, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5445964226664662, + "flos": 28993802613120.0, + "grad_norm": 1.9346963255645138, + "language_loss": 0.75279379, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.8297199, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11578369, + "step": 9058, + "time_per_iteration": 2.569481134414673 + }, + { + "auxiliary_loss_clip": 0.06324711, + "auxiliary_loss_mlp": 0.01255513, + "balance_loss_clip": 0.06262248, + "balance_loss_mlp": 0.01253708, + "epoch": 0.5446565459191343, + "flos": 68642323649280.0, + "grad_norm": 0.781118187376451, + "language_loss": 0.62576413, + "learning_rate": 1.808096355133312e-06, + "loss": 0.7015664, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01800537, + "step": 9059, + "time_per_iteration": 4.5610737800598145 + }, + { + "auxiliary_loss_clip": 0.06421264, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06278148, + "balance_loss_mlp": 0.01257993, + "epoch": 0.5447166691718022, + "flos": 16221989802240.0, + "grad_norm": 1.8006783567998876, + "language_loss": 0.79601544, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.87291771, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10961914, + "step": 9060, + "time_per_iteration": 2.511836290359497 + }, + { + "auxiliary_loss_clip": 0.06426552, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06279396, + "balance_loss_mlp": 0.0125454, + "epoch": 0.5447767924244702, + "flos": 25856225677440.0, + "grad_norm": 1.542760917466334, + "language_loss": 0.80138546, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.87831336, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11700439, + "step": 9061, + "time_per_iteration": 2.5398924350738525 + }, + { + "auxiliary_loss_clip": 0.06425673, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06280909, + "balance_loss_mlp": 0.01255221, + "epoch": 0.5448369156771381, + "flos": 19682998698240.0, + "grad_norm": 1.6196021204279303, + "language_loss": 0.87203825, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.94895482, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10772705, + "step": 9062, + "time_per_iteration": 4.0366997718811035 + }, + { + "auxiliary_loss_clip": 0.06433238, + "auxiliary_loss_mlp": 0.01268748, + "balance_loss_clip": 0.0628314, + "balance_loss_mlp": 0.01256392, + "epoch": 0.5448970389298061, + "flos": 19287925896960.0, + "grad_norm": 1.7163800985020743, + "language_loss": 0.82674021, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.90376008, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12359619, + "step": 9063, + "time_per_iteration": 2.5397801399230957 + }, + { + "auxiliary_loss_clip": 0.06429115, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 0.0628127, + "balance_loss_mlp": 0.01257264, + "epoch": 0.544957162182474, + "flos": 20997270789120.0, + "grad_norm": 1.590898869425655, + "language_loss": 0.63855612, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.71554273, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1227417, + "step": 9064, + "time_per_iteration": 2.511350631713867 + }, + { + "auxiliary_loss_clip": 0.06432661, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06282693, + "balance_loss_mlp": 0.01251863, + "epoch": 0.545017285435142, + "flos": 25381671678720.0, + "grad_norm": 1.596100575558465, + "language_loss": 0.80746907, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.88443542, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12115479, + "step": 9065, + "time_per_iteration": 2.589707136154175 + }, + { + "auxiliary_loss_clip": 0.06425799, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06280494, + "balance_loss_mlp": 0.01255916, + "epoch": 0.54507740868781, + "flos": 19140661146240.0, + "grad_norm": 1.9404249818077939, + "language_loss": 0.78152055, + "learning_rate": 1.805382881379827e-06, + "loss": 0.85844183, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10412598, + "step": 9066, + "time_per_iteration": 2.5037317276000977 + }, + { + "auxiliary_loss_clip": 0.06434928, + "auxiliary_loss_mlp": 0.01268701, + "balance_loss_clip": 0.06284117, + "balance_loss_mlp": 0.01256714, + "epoch": 0.545137531940478, + "flos": 26256958629120.0, + "grad_norm": 1.5302055737642422, + "language_loss": 0.76331961, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.84035593, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11993408, + "step": 9067, + "time_per_iteration": 4.019241571426392 + }, + { + "auxiliary_loss_clip": 0.06438933, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06285474, + "balance_loss_mlp": 0.01255685, + "epoch": 0.545197655193146, + "flos": 37563880435200.0, + "grad_norm": 1.8087199149855477, + "language_loss": 0.62992573, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.70699894, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.12701416, + "step": 9068, + "time_per_iteration": 2.6678848266601562 + }, + { + "auxiliary_loss_clip": 0.06424262, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06280495, + "balance_loss_mlp": 0.01256163, + "epoch": 0.5452577784458139, + "flos": 26038430380800.0, + "grad_norm": 1.5391820181686233, + "language_loss": 0.72328687, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.80020058, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10949707, + "step": 9069, + "time_per_iteration": 2.555837631225586 + }, + { + "auxiliary_loss_clip": 0.06424727, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0628207, + "balance_loss_mlp": 0.01256723, + "epoch": 0.5453179016984819, + "flos": 17644729403520.0, + "grad_norm": 1.699483734463513, + "language_loss": 0.74651837, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.82343948, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10662842, + "step": 9070, + "time_per_iteration": 2.493806838989258 + }, + { + "auxiliary_loss_clip": 0.06424981, + "auxiliary_loss_mlp": 0.01264741, + "balance_loss_clip": 0.06277809, + "balance_loss_mlp": 0.01253839, + "epoch": 0.5453780249511498, + "flos": 23222524469760.0, + "grad_norm": 1.8987434929949667, + "language_loss": 0.61238426, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.68928152, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10900879, + "step": 9071, + "time_per_iteration": 2.522620677947998 + }, + { + "auxiliary_loss_clip": 0.06331067, + "auxiliary_loss_mlp": 0.01252658, + "balance_loss_clip": 0.06269144, + "balance_loss_mlp": 0.01250867, + "epoch": 0.5454381482038179, + "flos": 68719163443200.0, + "grad_norm": 0.6892933067721945, + "language_loss": 0.57065922, + "learning_rate": 1.80305733435899e-06, + "loss": 0.64649647, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.01786804, + "step": 9072, + "time_per_iteration": 3.235288381576538 + }, + { + "auxiliary_loss_clip": 0.06422395, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06280763, + "balance_loss_mlp": 0.01257424, + "epoch": 0.5454982714564858, + "flos": 13265569393920.0, + "grad_norm": 1.8411374110080903, + "language_loss": 0.69644904, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.77335626, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10906982, + "step": 9073, + "time_per_iteration": 2.476053237915039 + }, + { + "auxiliary_loss_clip": 0.06421326, + "auxiliary_loss_mlp": 0.01272164, + "balance_loss_clip": 0.06280228, + "balance_loss_mlp": 0.01261477, + "epoch": 0.5455583947091538, + "flos": 21842439396480.0, + "grad_norm": 1.836952800264558, + "language_loss": 0.71413183, + "learning_rate": 1.802282211606627e-06, + "loss": 0.79106677, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10687256, + "step": 9074, + "time_per_iteration": 3.981220006942749 + }, + { + "auxiliary_loss_clip": 0.06424403, + "auxiliary_loss_mlp": 0.01266647, + "balance_loss_clip": 0.06278551, + "balance_loss_mlp": 0.01255364, + "epoch": 0.5456185179618217, + "flos": 17822489840640.0, + "grad_norm": 1.975994190229167, + "language_loss": 0.68697762, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.76388818, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.112854, + "step": 9075, + "time_per_iteration": 2.506155490875244 + }, + { + "auxiliary_loss_clip": 0.06425694, + "auxiliary_loss_mlp": 0.01265713, + "balance_loss_clip": 0.06281726, + "balance_loss_mlp": 0.01254942, + "epoch": 0.5456786412144897, + "flos": 21075787664640.0, + "grad_norm": 1.6135772994791406, + "language_loss": 0.80784404, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.88475811, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10778809, + "step": 9076, + "time_per_iteration": 2.538940906524658 + }, + { + "auxiliary_loss_clip": 0.06430642, + "auxiliary_loss_mlp": 0.01272688, + "balance_loss_clip": 0.06283286, + "balance_loss_mlp": 0.01261005, + "epoch": 0.5457387644671576, + "flos": 23301712177920.0, + "grad_norm": 1.7804219771063188, + "language_loss": 0.80408549, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.88111883, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11682129, + "step": 9077, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.06424201, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06278477, + "balance_loss_mlp": 0.0125698, + "epoch": 0.5457988877198257, + "flos": 21623575731840.0, + "grad_norm": 1.8316897806182997, + "language_loss": 0.67871404, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.75563186, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1060791, + "step": 9078, + "time_per_iteration": 2.5634307861328125 + }, + { + "auxiliary_loss_clip": 0.06428619, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.0628078, + "balance_loss_mlp": 0.01256722, + "epoch": 0.5458590109724936, + "flos": 23768174257920.0, + "grad_norm": 2.0367985655242116, + "language_loss": 0.81582344, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8927964, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9079, + "time_per_iteration": 2.563260078430176 + }, + { + "auxiliary_loss_clip": 0.06434448, + "auxiliary_loss_mlp": 0.01267346, + "balance_loss_clip": 0.06280699, + "balance_loss_mlp": 0.01255341, + "epoch": 0.5459191342251616, + "flos": 24430928526720.0, + "grad_norm": 1.7111364231373303, + "language_loss": 0.76216662, + "learning_rate": 1.799957023759277e-06, + "loss": 0.83918452, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12011719, + "step": 9080, + "time_per_iteration": 2.538072347640991 + }, + { + "auxiliary_loss_clip": 0.06429628, + "auxiliary_loss_mlp": 0.0126983, + "balance_loss_clip": 0.06281854, + "balance_loss_mlp": 0.0125816, + "epoch": 0.5459792574778296, + "flos": 23629756112640.0, + "grad_norm": 1.9762884364861095, + "language_loss": 0.83489871, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.91189325, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11669922, + "step": 9081, + "time_per_iteration": 2.583111047744751 + }, + { + "auxiliary_loss_clip": 0.06430145, + "auxiliary_loss_mlp": 0.01267495, + "balance_loss_clip": 0.0628006, + "balance_loss_mlp": 0.01256552, + "epoch": 0.5460393807304975, + "flos": 19141583541120.0, + "grad_norm": 2.327386206353707, + "language_loss": 0.70079756, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.77777398, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.10943604, + "step": 9082, + "time_per_iteration": 2.5038371086120605 + }, + { + "auxiliary_loss_clip": 0.06421287, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06277952, + "balance_loss_mlp": 0.01253959, + "epoch": 0.5460995039831655, + "flos": 35927308414080.0, + "grad_norm": 1.8952773157154152, + "language_loss": 0.66865891, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.74552357, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11224365, + "step": 9083, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.06418573, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256019, + "epoch": 0.5461596272358334, + "flos": 26766242945280.0, + "grad_norm": 1.5423197483893423, + "language_loss": 0.7895304, + "learning_rate": 1.798407050044766e-06, + "loss": 0.86638033, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10406494, + "step": 9084, + "time_per_iteration": 2.5392911434173584 + }, + { + "auxiliary_loss_clip": 0.06427852, + "auxiliary_loss_mlp": 0.01262899, + "balance_loss_clip": 0.06280479, + "balance_loss_mlp": 0.01252004, + "epoch": 0.5462197504885015, + "flos": 20892870201600.0, + "grad_norm": 1.8818428979315067, + "language_loss": 0.75159836, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.82850587, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.10900879, + "step": 9085, + "time_per_iteration": 2.5238590240478516 + }, + { + "auxiliary_loss_clip": 0.06428534, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01252995, + "epoch": 0.5462798737411694, + "flos": 25810887818880.0, + "grad_norm": 1.69825848629267, + "language_loss": 0.74606055, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.82299185, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1161499, + "step": 9086, + "time_per_iteration": 2.5416669845581055 + }, + { + "auxiliary_loss_clip": 0.06424639, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06277122, + "balance_loss_mlp": 0.01255834, + "epoch": 0.5463399969938374, + "flos": 25782027287040.0, + "grad_norm": 1.4075791244754594, + "language_loss": 0.76979077, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.84671181, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9087, + "time_per_iteration": 2.5764284133911133 + }, + { + "auxiliary_loss_clip": 0.0642488, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_clip": 0.06278133, + "balance_loss_mlp": 0.01258088, + "epoch": 0.5464001202465053, + "flos": 18849234245760.0, + "grad_norm": 1.6014949266825944, + "language_loss": 0.77368462, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.85064179, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12744141, + "step": 9088, + "time_per_iteration": 2.4971888065338135 + }, + { + "auxiliary_loss_clip": 0.06317829, + "auxiliary_loss_mlp": 0.01258554, + "balance_loss_clip": 0.062563, + "balance_loss_mlp": 0.0125685, + "epoch": 0.5464602434991733, + "flos": 69070281978240.0, + "grad_norm": 0.7120973935253039, + "language_loss": 0.57630938, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.6520732, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01708984, + "step": 9089, + "time_per_iteration": 3.251268148422241 + }, + { + "auxiliary_loss_clip": 0.06429952, + "auxiliary_loss_mlp": 0.01270687, + "balance_loss_clip": 0.06279282, + "balance_loss_mlp": 0.01258945, + "epoch": 0.5465203667518412, + "flos": 27566870307840.0, + "grad_norm": 1.7671189132091156, + "language_loss": 0.77121699, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.84822339, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11743164, + "step": 9090, + "time_per_iteration": 2.5513298511505127 + }, + { + "auxiliary_loss_clip": 0.06428426, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 0.06277205, + "balance_loss_mlp": 0.01257268, + "epoch": 0.5465804900045093, + "flos": 21215757110400.0, + "grad_norm": 1.8390444270451474, + "language_loss": 0.73801088, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.81499445, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12670898, + "step": 9091, + "time_per_iteration": 2.5593018531799316 + }, + { + "auxiliary_loss_clip": 0.06426038, + "auxiliary_loss_mlp": 0.01268102, + "balance_loss_clip": 0.06278463, + "balance_loss_mlp": 0.01255948, + "epoch": 0.5466406132571772, + "flos": 22495005394560.0, + "grad_norm": 3.020884161734631, + "language_loss": 0.77827132, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.85521269, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12164307, + "step": 9092, + "time_per_iteration": 2.5000102519989014 + }, + { + "auxiliary_loss_clip": 0.06431385, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06280962, + "balance_loss_mlp": 0.01254583, + "epoch": 0.5467007365098452, + "flos": 17681598000000.0, + "grad_norm": 2.033807673433485, + "language_loss": 0.75258666, + "learning_rate": 1.794920057818476e-06, + "loss": 0.82956254, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11627197, + "step": 9093, + "time_per_iteration": 2.5118560791015625 + }, + { + "auxiliary_loss_clip": 0.06426246, + "auxiliary_loss_mlp": 0.01271687, + "balance_loss_clip": 0.06277527, + "balance_loss_mlp": 0.01258634, + "epoch": 0.5467608597625132, + "flos": 15703146120960.0, + "grad_norm": 3.7072671758327993, + "language_loss": 0.69514894, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.77212822, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.13067627, + "step": 9094, + "time_per_iteration": 2.471296787261963 + }, + { + "auxiliary_loss_clip": 0.06427498, + "auxiliary_loss_mlp": 0.01268457, + "balance_loss_clip": 0.06281194, + "balance_loss_mlp": 0.0125799, + "epoch": 0.5468209830151811, + "flos": 24319106853120.0, + "grad_norm": 3.067574771902978, + "language_loss": 0.68405867, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.76101816, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10467529, + "step": 9095, + "time_per_iteration": 2.559969186782837 + }, + { + "auxiliary_loss_clip": 0.06427877, + "auxiliary_loss_mlp": 0.01266121, + "balance_loss_clip": 0.06280283, + "balance_loss_mlp": 0.01255058, + "epoch": 0.5468811062678491, + "flos": 29173575548160.0, + "grad_norm": 1.4017188918581747, + "language_loss": 0.67021394, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.747154, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11065674, + "step": 9096, + "time_per_iteration": 2.5755646228790283 + }, + { + "auxiliary_loss_clip": 0.06321621, + "auxiliary_loss_mlp": 0.01252605, + "balance_loss_clip": 0.06259765, + "balance_loss_mlp": 0.01250808, + "epoch": 0.546941229520517, + "flos": 67885078302720.0, + "grad_norm": 0.7312259601273227, + "language_loss": 0.57564938, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.65139174, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01792908, + "step": 9097, + "time_per_iteration": 3.239208698272705 + }, + { + "auxiliary_loss_clip": 0.06323195, + "auxiliary_loss_mlp": 0.01252523, + "balance_loss_clip": 0.06261444, + "balance_loss_mlp": 0.01250845, + "epoch": 0.5470013527731851, + "flos": 58286578993920.0, + "grad_norm": 0.8922489191245683, + "language_loss": 0.64733016, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.72308731, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01681519, + "step": 9098, + "time_per_iteration": 4.485429763793945 + }, + { + "auxiliary_loss_clip": 0.06427541, + "auxiliary_loss_mlp": 0.01271404, + "balance_loss_clip": 0.06279691, + "balance_loss_mlp": 0.0125937, + "epoch": 0.547061476025853, + "flos": 22972494286080.0, + "grad_norm": 1.4988253633991158, + "language_loss": 0.73256373, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.80955321, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12042236, + "step": 9099, + "time_per_iteration": 2.5771172046661377 + }, + { + "auxiliary_loss_clip": 0.06428638, + "auxiliary_loss_mlp": 0.01265011, + "balance_loss_clip": 0.06282665, + "balance_loss_mlp": 0.01254712, + "epoch": 0.547121599278521, + "flos": 29975502648960.0, + "grad_norm": 1.9003011025398133, + "language_loss": 0.73232269, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.80925912, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10296631, + "step": 9100, + "time_per_iteration": 2.613353967666626 + }, + { + "auxiliary_loss_clip": 0.06426845, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06282172, + "balance_loss_mlp": 0.01256376, + "epoch": 0.5471817225311889, + "flos": 36543760502400.0, + "grad_norm": 3.16405552040578, + "language_loss": 0.68177283, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.75872165, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11663818, + "step": 9101, + "time_per_iteration": 2.645268440246582 + }, + { + "auxiliary_loss_clip": 0.06429439, + "auxiliary_loss_mlp": 0.01267587, + "balance_loss_clip": 0.06282283, + "balance_loss_mlp": 0.01256482, + "epoch": 0.5472418457838569, + "flos": 25782278849280.0, + "grad_norm": 1.6236525701759785, + "language_loss": 0.78028667, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.85725689, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11114502, + "step": 9102, + "time_per_iteration": 4.018383264541626 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01265935, + "balance_loss_clip": 0.06284064, + "balance_loss_mlp": 0.01255659, + "epoch": 0.5473019690365248, + "flos": 27894453045120.0, + "grad_norm": 1.4050316255430886, + "language_loss": 0.72370696, + "learning_rate": 1.791046361258413e-06, + "loss": 0.80061954, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1027832, + "step": 9103, + "time_per_iteration": 2.613557815551758 + }, + { + "auxiliary_loss_clip": 0.06427938, + "auxiliary_loss_mlp": 0.01268597, + "balance_loss_clip": 0.06282217, + "balance_loss_mlp": 0.01257237, + "epoch": 0.5473620922891929, + "flos": 57644551411200.0, + "grad_norm": 1.2696818989696173, + "language_loss": 0.65471172, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.73167711, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11352539, + "step": 9104, + "time_per_iteration": 2.8648996353149414 + }, + { + "auxiliary_loss_clip": 0.0643408, + "auxiliary_loss_mlp": 0.01271697, + "balance_loss_clip": 0.06284557, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5474222155418608, + "flos": 19360069862400.0, + "grad_norm": 1.73787664165883, + "language_loss": 0.8214826, + "learning_rate": 1.790271716558888e-06, + "loss": 0.89854038, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12536621, + "step": 9105, + "time_per_iteration": 2.5110819339752197 + }, + { + "auxiliary_loss_clip": 0.06424334, + "auxiliary_loss_mlp": 0.01267412, + "balance_loss_clip": 0.06280238, + "balance_loss_mlp": 0.01256474, + "epoch": 0.5474823387945288, + "flos": 25127700353280.0, + "grad_norm": 1.5738849579324676, + "language_loss": 0.80505264, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.88197005, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10943604, + "step": 9106, + "time_per_iteration": 2.545797824859619 + }, + { + "auxiliary_loss_clip": 0.0642664, + "auxiliary_loss_mlp": 0.01267343, + "balance_loss_clip": 0.06280842, + "balance_loss_mlp": 0.01256334, + "epoch": 0.5475424620471967, + "flos": 18009977351040.0, + "grad_norm": 1.8936776188065845, + "language_loss": 0.69983113, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.77677101, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11010742, + "step": 9107, + "time_per_iteration": 3.930511474609375 + }, + { + "auxiliary_loss_clip": 0.06431143, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06281775, + "balance_loss_mlp": 0.01252438, + "epoch": 0.5476025852998647, + "flos": 22315819438080.0, + "grad_norm": 1.6441057037047366, + "language_loss": 0.63668221, + "learning_rate": 1.789109809193197e-06, + "loss": 0.71363103, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11309814, + "step": 9108, + "time_per_iteration": 2.548469305038452 + }, + { + "auxiliary_loss_clip": 0.06427735, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06281575, + "balance_loss_mlp": 0.01254632, + "epoch": 0.5476627085525327, + "flos": 20126679667200.0, + "grad_norm": 1.6544017163405356, + "language_loss": 0.75096864, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.82789409, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10174561, + "step": 9109, + "time_per_iteration": 2.505537748336792 + }, + { + "auxiliary_loss_clip": 0.06426554, + "auxiliary_loss_mlp": 0.01271245, + "balance_loss_clip": 0.06282739, + "balance_loss_mlp": 0.01259235, + "epoch": 0.5477228318052006, + "flos": 17718382742400.0, + "grad_norm": 1.7609925306613563, + "language_loss": 0.78101015, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.85798812, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.12005615, + "step": 9110, + "time_per_iteration": 2.5898001194000244 + }, + { + "auxiliary_loss_clip": 0.0642444, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06281163, + "balance_loss_mlp": 0.01253948, + "epoch": 0.5477829550578687, + "flos": 25856057969280.0, + "grad_norm": 1.4117567478996924, + "language_loss": 0.71281165, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.78970265, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10699463, + "step": 9111, + "time_per_iteration": 2.5514800548553467 + }, + { + "auxiliary_loss_clip": 0.06428348, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06282744, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5478430783105366, + "flos": 23046399187200.0, + "grad_norm": 1.7318252125729088, + "language_loss": 0.71129775, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.7882387, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1171875, + "step": 9112, + "time_per_iteration": 2.5733911991119385 + }, + { + "auxiliary_loss_clip": 0.06428306, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06280322, + "balance_loss_mlp": 0.0125412, + "epoch": 0.5479032015632046, + "flos": 16076821403520.0, + "grad_norm": 1.865243038866792, + "language_loss": 0.88150853, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.95844346, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1105957, + "step": 9113, + "time_per_iteration": 4.03569483757019 + }, + { + "auxiliary_loss_clip": 0.06427854, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.0628054, + "balance_loss_mlp": 0.01254171, + "epoch": 0.5479633248158725, + "flos": 24285382784640.0, + "grad_norm": 1.9056802782338742, + "language_loss": 0.73404038, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.81097698, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11633301, + "step": 9114, + "time_per_iteration": 2.552778959274292 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06279442, + "balance_loss_mlp": 0.0125582, + "epoch": 0.5480234480685405, + "flos": 26365216504320.0, + "grad_norm": 1.4540698273743113, + "language_loss": 0.72457099, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.80148405, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10662842, + "step": 9115, + "time_per_iteration": 2.5838403701782227 + }, + { + "auxiliary_loss_clip": 0.06436512, + "auxiliary_loss_mlp": 0.01267671, + "balance_loss_clip": 0.06284098, + "balance_loss_mlp": 0.01256066, + "epoch": 0.5480835713212084, + "flos": 22061722331520.0, + "grad_norm": 1.7541916767056687, + "language_loss": 0.72373956, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.80078137, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.1161499, + "step": 9116, + "time_per_iteration": 2.5292439460754395 + }, + { + "auxiliary_loss_clip": 0.06426133, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06279518, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5481436945738765, + "flos": 25308018339840.0, + "grad_norm": 1.941043285146296, + "language_loss": 0.76906073, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.84599322, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9117, + "time_per_iteration": 2.5854122638702393 + }, + { + "auxiliary_loss_clip": 0.06421119, + "auxiliary_loss_mlp": 0.01264207, + "balance_loss_clip": 0.06279179, + "balance_loss_mlp": 0.01253532, + "epoch": 0.5482038178265444, + "flos": 33588807540480.0, + "grad_norm": 1.613198613591587, + "language_loss": 0.62954283, + "learning_rate": 1.785237306671674e-06, + "loss": 0.7063961, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10675049, + "step": 9118, + "time_per_iteration": 2.61136531829834 + }, + { + "auxiliary_loss_clip": 0.06429429, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_clip": 0.06280537, + "balance_loss_mlp": 0.0125436, + "epoch": 0.5482639410792124, + "flos": 19032235562880.0, + "grad_norm": 1.6774564392555322, + "language_loss": 0.79138243, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.86833954, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11920166, + "step": 9119, + "time_per_iteration": 2.5309953689575195 + }, + { + "auxiliary_loss_clip": 0.06425598, + "auxiliary_loss_mlp": 0.01271106, + "balance_loss_clip": 0.06281713, + "balance_loss_mlp": 0.0126033, + "epoch": 0.5483240643318803, + "flos": 25417282464000.0, + "grad_norm": 1.5630724809093546, + "language_loss": 0.82719064, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.9041577, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10772705, + "step": 9120, + "time_per_iteration": 2.551790952682495 + }, + { + "auxiliary_loss_clip": 0.06432922, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06284823, + "balance_loss_mlp": 0.01253292, + "epoch": 0.5483841875845483, + "flos": 21472705255680.0, + "grad_norm": 1.7308751336861314, + "language_loss": 0.80248237, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.87946028, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11566162, + "step": 9121, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.06429829, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06280297, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5484443108372163, + "flos": 24753060748800.0, + "grad_norm": 1.8214688446413962, + "language_loss": 0.6171329, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.69410121, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.11352539, + "step": 9122, + "time_per_iteration": 2.536548614501953 + }, + { + "auxiliary_loss_clip": 0.06426375, + "auxiliary_loss_mlp": 0.01268013, + "balance_loss_clip": 0.06283108, + "balance_loss_mlp": 0.0125729, + "epoch": 0.5485044340898843, + "flos": 25382594073600.0, + "grad_norm": 1.6758320366866328, + "language_loss": 0.71812153, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7950654, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1071167, + "step": 9123, + "time_per_iteration": 2.563128709793091 + }, + { + "auxiliary_loss_clip": 0.06422795, + "auxiliary_loss_mlp": 0.01264644, + "balance_loss_clip": 0.06277866, + "balance_loss_mlp": 0.01254839, + "epoch": 0.5485645573425523, + "flos": 12646140485760.0, + "grad_norm": 2.0499300220900367, + "language_loss": 0.83466411, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.91153848, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.09802246, + "step": 9124, + "time_per_iteration": 2.4774932861328125 + }, + { + "auxiliary_loss_clip": 0.06423289, + "auxiliary_loss_mlp": 0.01272789, + "balance_loss_clip": 0.06280372, + "balance_loss_mlp": 0.01262054, + "epoch": 0.5486246805952202, + "flos": 28336918129920.0, + "grad_norm": 1.5704023496451165, + "language_loss": 0.80787551, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.88483626, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10736084, + "step": 9125, + "time_per_iteration": 2.6640827655792236 + }, + { + "auxiliary_loss_clip": 0.06429766, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.0125558, + "epoch": 0.5486848038478882, + "flos": 16805598289920.0, + "grad_norm": 1.778522251586277, + "language_loss": 0.74475932, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.82172436, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1114502, + "step": 9126, + "time_per_iteration": 2.4920494556427 + }, + { + "auxiliary_loss_clip": 0.0643461, + "auxiliary_loss_mlp": 0.01271917, + "balance_loss_clip": 0.06284419, + "balance_loss_mlp": 0.01260383, + "epoch": 0.5487449271005561, + "flos": 17241606610560.0, + "grad_norm": 2.5065680491325217, + "language_loss": 0.66843152, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.74549675, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11535645, + "step": 9127, + "time_per_iteration": 2.498995304107666 + }, + { + "auxiliary_loss_clip": 0.0642729, + "auxiliary_loss_mlp": 0.0126947, + "balance_loss_clip": 0.06281507, + "balance_loss_mlp": 0.01257072, + "epoch": 0.5488050503532241, + "flos": 17345462146560.0, + "grad_norm": 1.8347258108428224, + "language_loss": 0.83430481, + "learning_rate": 1.781365618532181e-06, + "loss": 0.91127241, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1239624, + "step": 9128, + "time_per_iteration": 2.4851553440093994 + }, + { + "auxiliary_loss_clip": 0.06423862, + "auxiliary_loss_mlp": 0.01267411, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01256032, + "epoch": 0.548865173605892, + "flos": 17245044627840.0, + "grad_norm": 1.9721748285442382, + "language_loss": 0.73992771, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.81684041, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1138916, + "step": 9129, + "time_per_iteration": 2.5088050365448 + }, + { + "auxiliary_loss_clip": 0.06436306, + "auxiliary_loss_mlp": 0.0126816, + "balance_loss_clip": 0.0628598, + "balance_loss_mlp": 0.01256108, + "epoch": 0.5489252968585601, + "flos": 17462398919040.0, + "grad_norm": 2.1982698674747745, + "language_loss": 0.63327444, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.7103191, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12054443, + "step": 9130, + "time_per_iteration": 2.4861414432525635 + }, + { + "auxiliary_loss_clip": 0.06431893, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06281481, + "balance_loss_mlp": 0.01255046, + "epoch": 0.548985420111228, + "flos": 26330653895040.0, + "grad_norm": 1.729948569228587, + "language_loss": 0.63358611, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.71057326, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11773682, + "step": 9131, + "time_per_iteration": 2.589580535888672 + }, + { + "auxiliary_loss_clip": 0.0643028, + "auxiliary_loss_mlp": 0.01268323, + "balance_loss_clip": 0.06282265, + "balance_loss_mlp": 0.01255955, + "epoch": 0.549045543363896, + "flos": 18699034602240.0, + "grad_norm": 1.7539544854272515, + "language_loss": 0.75148702, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.82847303, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12353516, + "step": 9132, + "time_per_iteration": 2.461970329284668 + }, + { + "auxiliary_loss_clip": 0.06429279, + "auxiliary_loss_mlp": 0.01266105, + "balance_loss_clip": 0.06280597, + "balance_loss_mlp": 0.01255234, + "epoch": 0.5491056666165639, + "flos": 24724284071040.0, + "grad_norm": 2.6052413777049144, + "language_loss": 0.8162328, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.89318669, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.10870361, + "step": 9133, + "time_per_iteration": 2.5799684524536133 + }, + { + "auxiliary_loss_clip": 0.06426433, + "auxiliary_loss_mlp": 0.01271009, + "balance_loss_clip": 0.06280407, + "balance_loss_mlp": 0.01259691, + "epoch": 0.5491657898692319, + "flos": 21582849847680.0, + "grad_norm": 1.8788464104374898, + "language_loss": 0.70385146, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.78082585, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11328125, + "step": 9134, + "time_per_iteration": 2.5116565227508545 + }, + { + "auxiliary_loss_clip": 0.06431407, + "auxiliary_loss_mlp": 0.01267106, + "balance_loss_clip": 0.06281983, + "balance_loss_mlp": 0.01256062, + "epoch": 0.5492259131219, + "flos": 50487653825280.0, + "grad_norm": 2.3217483044436955, + "language_loss": 0.61379695, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.69078213, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11035156, + "step": 9135, + "time_per_iteration": 2.8019859790802 + }, + { + "auxiliary_loss_clip": 0.06430922, + "auxiliary_loss_mlp": 0.01266434, + "balance_loss_clip": 0.06280293, + "balance_loss_mlp": 0.01254591, + "epoch": 0.5492860363745679, + "flos": 25126316760960.0, + "grad_norm": 1.8569102400294533, + "language_loss": 0.72833902, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.80531251, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.11846924, + "step": 9136, + "time_per_iteration": 2.5313796997070312 + }, + { + "auxiliary_loss_clip": 0.06434008, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_clip": 0.06279632, + "balance_loss_mlp": 0.01255551, + "epoch": 0.5493461596272359, + "flos": 22639670668800.0, + "grad_norm": 2.4335907064216302, + "language_loss": 0.6873585, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.76437736, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.12329102, + "step": 9137, + "time_per_iteration": 2.606400489807129 + }, + { + "auxiliary_loss_clip": 0.06325421, + "auxiliary_loss_mlp": 0.01260391, + "balance_loss_clip": 0.06263588, + "balance_loss_mlp": 0.01258753, + "epoch": 0.5494062828799038, + "flos": 66169486281600.0, + "grad_norm": 0.7309885412732349, + "language_loss": 0.65176189, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.72762001, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.61767578, + "router_z_loss_mlp": 0.0164032, + "step": 9138, + "time_per_iteration": 4.603189945220947 + }, + { + "auxiliary_loss_clip": 0.06431855, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01253803, + "epoch": 0.5494664061325718, + "flos": 21112362771840.0, + "grad_norm": 1.7352131741027665, + "language_loss": 0.75659418, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.83356863, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11785889, + "step": 9139, + "time_per_iteration": 2.5063250064849854 + }, + { + "auxiliary_loss_clip": 0.06427477, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 0.06281833, + "balance_loss_mlp": 0.01257599, + "epoch": 0.5495265293852397, + "flos": 14397846416640.0, + "grad_norm": 2.090947018102217, + "language_loss": 0.71453607, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.79149961, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11273193, + "step": 9140, + "time_per_iteration": 2.516493558883667 + }, + { + "auxiliary_loss_clip": 0.06426564, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06279987, + "balance_loss_mlp": 0.01254623, + "epoch": 0.5495866526379077, + "flos": 25554945922560.0, + "grad_norm": 1.591757169874098, + "language_loss": 0.76439172, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.84131408, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.1105957, + "step": 9141, + "time_per_iteration": 4.032621383666992 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01268222, + "balance_loss_clip": 0.06278077, + "balance_loss_mlp": 0.01257648, + "epoch": 0.5496467758905756, + "flos": 21322421758080.0, + "grad_norm": 1.9135284052459163, + "language_loss": 0.75301933, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.82990575, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10577393, + "step": 9142, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.06433351, + "auxiliary_loss_mlp": 0.01271982, + "balance_loss_clip": 0.06284253, + "balance_loss_mlp": 0.01259895, + "epoch": 0.5497068991432437, + "flos": 22239021571200.0, + "grad_norm": 1.7111366793556597, + "language_loss": 0.77014959, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.84720296, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12091064, + "step": 9143, + "time_per_iteration": 2.516505002975464 + }, + { + "auxiliary_loss_clip": 0.06424481, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06278251, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5497670223959116, + "flos": 18485076401280.0, + "grad_norm": 3.356687572137957, + "language_loss": 0.79973668, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.87666219, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11663818, + "step": 9144, + "time_per_iteration": 2.4832475185394287 + }, + { + "auxiliary_loss_clip": 0.0642961, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06281358, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5498271456485796, + "flos": 29212750131840.0, + "grad_norm": 1.7313830940317911, + "language_loss": 0.7154156, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.79239666, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11346436, + "step": 9145, + "time_per_iteration": 2.6261048316955566 + }, + { + "auxiliary_loss_clip": 0.06426725, + "auxiliary_loss_mlp": 0.01264568, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01254197, + "epoch": 0.5498872689012475, + "flos": 34833032017920.0, + "grad_norm": 1.5682468167397778, + "language_loss": 0.70529747, + "learning_rate": 1.774398678985076e-06, + "loss": 0.78221035, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10369873, + "step": 9146, + "time_per_iteration": 4.087557315826416 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01264014, + "balance_loss_clip": 0.06276917, + "balance_loss_mlp": 0.01253923, + "epoch": 0.5499473921539155, + "flos": 25929124329600.0, + "grad_norm": 2.0128119517228305, + "language_loss": 0.64188051, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.71871173, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10095215, + "step": 9147, + "time_per_iteration": 2.5406603813171387 + }, + { + "auxiliary_loss_clip": 0.06424303, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06279408, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5500075154065835, + "flos": 22280334433920.0, + "grad_norm": 1.893989099652022, + "language_loss": 0.81534255, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.89224386, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1071167, + "step": 9148, + "time_per_iteration": 2.5051376819610596 + }, + { + "auxiliary_loss_clip": 0.06424436, + "auxiliary_loss_mlp": 0.01270935, + "balance_loss_clip": 0.06277981, + "balance_loss_mlp": 0.0125992, + "epoch": 0.5500676386592515, + "flos": 28044946177920.0, + "grad_norm": 1.7460739337347344, + "language_loss": 0.7916007, + "learning_rate": 1.773237789559453e-06, + "loss": 0.86855441, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9149, + "time_per_iteration": 2.5586931705474854 + }, + { + "auxiliary_loss_clip": 0.0642364, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06277739, + "balance_loss_mlp": 0.01253852, + "epoch": 0.5501277619119195, + "flos": 23921602283520.0, + "grad_norm": 2.0079288501902965, + "language_loss": 0.7263124, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.80319625, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10888672, + "step": 9150, + "time_per_iteration": 2.5097196102142334 + }, + { + "auxiliary_loss_clip": 0.06428004, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06278474, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5501878851645874, + "flos": 20930199995520.0, + "grad_norm": 1.7516173490285718, + "language_loss": 0.74991822, + "learning_rate": 1.772463906245477e-06, + "loss": 0.82685369, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12103271, + "step": 9151, + "time_per_iteration": 2.4953532218933105 + }, + { + "auxiliary_loss_clip": 0.06421181, + "auxiliary_loss_mlp": 0.01264237, + "balance_loss_clip": 0.06275992, + "balance_loss_mlp": 0.01253317, + "epoch": 0.5502480084172554, + "flos": 20671155498240.0, + "grad_norm": 1.7180580365194615, + "language_loss": 0.76128006, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.83813429, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10925293, + "step": 9152, + "time_per_iteration": 2.5041630268096924 + }, + { + "auxiliary_loss_clip": 0.06418908, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06276076, + "balance_loss_mlp": 0.01254336, + "epoch": 0.5503081316699233, + "flos": 26439792238080.0, + "grad_norm": 3.86516963702514, + "language_loss": 0.82636946, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.90320837, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10650635, + "step": 9153, + "time_per_iteration": 4.000823259353638 + }, + { + "auxiliary_loss_clip": 0.06419568, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01254682, + "epoch": 0.5503682549225913, + "flos": 30637208741760.0, + "grad_norm": 1.7185020713354737, + "language_loss": 0.7442615, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.82112032, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11633301, + "step": 9154, + "time_per_iteration": 2.619478225708008 + }, + { + "auxiliary_loss_clip": 0.06431979, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01256273, + "epoch": 0.5504283781752592, + "flos": 22572096750720.0, + "grad_norm": 1.5448619232700234, + "language_loss": 0.73359931, + "learning_rate": 1.770916243273199e-06, + "loss": 0.81059402, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.11212158, + "step": 9155, + "time_per_iteration": 2.5512940883636475 + }, + { + "auxiliary_loss_clip": 0.0632084, + "auxiliary_loss_mlp": 0.01252943, + "balance_loss_clip": 0.06258567, + "balance_loss_mlp": 0.01251311, + "epoch": 0.5504885014279273, + "flos": 67918634663040.0, + "grad_norm": 0.7176527357407121, + "language_loss": 0.5550307, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.63076854, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01634216, + "step": 9156, + "time_per_iteration": 3.3401191234588623 + }, + { + "auxiliary_loss_clip": 0.06423487, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06277417, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5505486246805952, + "flos": 22455705029760.0, + "grad_norm": 1.7228062733410818, + "language_loss": 0.82601535, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.90289015, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09881592, + "step": 9157, + "time_per_iteration": 2.5331945419311523 + }, + { + "auxiliary_loss_clip": 0.06433383, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06279938, + "balance_loss_mlp": 0.01255885, + "epoch": 0.5506087479332632, + "flos": 26914220455680.0, + "grad_norm": 2.384583042502796, + "language_loss": 0.7632947, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.84030461, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.11743164, + "step": 9158, + "time_per_iteration": 2.5622854232788086 + }, + { + "auxiliary_loss_clip": 0.06422579, + "auxiliary_loss_mlp": 0.01265094, + "balance_loss_clip": 0.06281133, + "balance_loss_mlp": 0.01255134, + "epoch": 0.5506688711859311, + "flos": 22936967354880.0, + "grad_norm": 1.858566635879154, + "language_loss": 0.70421213, + "learning_rate": 1.769368719290979e-06, + "loss": 0.78108883, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.09967041, + "step": 9159, + "time_per_iteration": 2.5299885272979736 + }, + { + "auxiliary_loss_clip": 0.06426555, + "auxiliary_loss_mlp": 0.01265176, + "balance_loss_clip": 0.06279982, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5507289944385991, + "flos": 29614111989120.0, + "grad_norm": 1.5102709537150474, + "language_loss": 0.68438101, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7612983, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11065674, + "step": 9160, + "time_per_iteration": 2.5797348022460938 + }, + { + "auxiliary_loss_clip": 0.06423666, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06278166, + "balance_loss_mlp": 0.01252774, + "epoch": 0.5507891176912671, + "flos": 15338736714240.0, + "grad_norm": 1.8978617290593418, + "language_loss": 0.7231009, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.79998016, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11474609, + "step": 9161, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.06425308, + "auxiliary_loss_mlp": 0.01270177, + "balance_loss_clip": 0.0627985, + "balance_loss_mlp": 0.01259472, + "epoch": 0.5508492409439351, + "flos": 26585547615360.0, + "grad_norm": 4.143741197260591, + "language_loss": 0.69514179, + "learning_rate": 1.768208168081359e-06, + "loss": 0.77209663, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10717773, + "step": 9162, + "time_per_iteration": 2.601036548614502 + }, + { + "auxiliary_loss_clip": 0.06422161, + "auxiliary_loss_mlp": 0.01271792, + "balance_loss_clip": 0.06278013, + "balance_loss_mlp": 0.01261164, + "epoch": 0.5509093641966031, + "flos": 25449832575360.0, + "grad_norm": 1.6789972101454846, + "language_loss": 0.85959709, + "learning_rate": 1.767821335237733e-06, + "loss": 0.93653667, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10638428, + "step": 9163, + "time_per_iteration": 2.539546489715576 + }, + { + "auxiliary_loss_clip": 0.06425934, + "auxiliary_loss_mlp": 0.0126949, + "balance_loss_clip": 0.06282654, + "balance_loss_mlp": 0.01258856, + "epoch": 0.550969487449271, + "flos": 18704652825600.0, + "grad_norm": 1.572244133846192, + "language_loss": 0.81101871, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.88797295, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10638428, + "step": 9164, + "time_per_iteration": 2.5266709327697754 + }, + { + "auxiliary_loss_clip": 0.06427547, + "auxiliary_loss_mlp": 0.01271715, + "balance_loss_clip": 0.06278498, + "balance_loss_mlp": 0.01260026, + "epoch": 0.551029610701939, + "flos": 22714959162240.0, + "grad_norm": 1.8760540237074659, + "language_loss": 0.73664248, + "learning_rate": 1.767047695977863e-06, + "loss": 0.81363511, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11688232, + "step": 9165, + "time_per_iteration": 2.511892318725586 + }, + { + "auxiliary_loss_clip": 0.06419477, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01258479, + "epoch": 0.5510897339546069, + "flos": 12425138542080.0, + "grad_norm": 2.0479120482719084, + "language_loss": 0.79496598, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.87185252, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10687256, + "step": 9166, + "time_per_iteration": 2.5217325687408447 + }, + { + "auxiliary_loss_clip": 0.06426241, + "auxiliary_loss_mlp": 0.01268783, + "balance_loss_clip": 0.06279847, + "balance_loss_mlp": 0.01257232, + "epoch": 0.5511498572072749, + "flos": 18776545228800.0, + "grad_norm": 2.094065158330193, + "language_loss": 0.77047074, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.84742099, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11560059, + "step": 9167, + "time_per_iteration": 2.5210516452789307 + }, + { + "auxiliary_loss_clip": 0.06422734, + "auxiliary_loss_mlp": 0.01276612, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01264995, + "epoch": 0.5512099804599428, + "flos": 19579436651520.0, + "grad_norm": 1.8110306936777156, + "language_loss": 0.80698925, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.88398266, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11627197, + "step": 9168, + "time_per_iteration": 2.5044801235198975 + }, + { + "auxiliary_loss_clip": 0.06426235, + "auxiliary_loss_mlp": 0.01266078, + "balance_loss_clip": 0.06278569, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5512701037126109, + "flos": 26252053165440.0, + "grad_norm": 1.768039916500128, + "language_loss": 0.6941396, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.77106273, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10876465, + "step": 9169, + "time_per_iteration": 2.5712435245513916 + }, + { + "auxiliary_loss_clip": 0.06426435, + "auxiliary_loss_mlp": 0.01277267, + "balance_loss_clip": 0.06284146, + "balance_loss_mlp": 0.01267092, + "epoch": 0.5513302269652788, + "flos": 21951997009920.0, + "grad_norm": 1.7919633768432253, + "language_loss": 0.85238504, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.92942202, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10174561, + "step": 9170, + "time_per_iteration": 2.6517226696014404 + }, + { + "auxiliary_loss_clip": 0.06339835, + "auxiliary_loss_mlp": 0.01252247, + "balance_loss_clip": 0.06277715, + "balance_loss_mlp": 0.01250597, + "epoch": 0.5513903502179468, + "flos": 68254728589440.0, + "grad_norm": 0.7663699077680228, + "language_loss": 0.59884483, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.67476565, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01652527, + "step": 9171, + "time_per_iteration": 3.190981864929199 + }, + { + "auxiliary_loss_clip": 0.06426144, + "auxiliary_loss_mlp": 0.01271114, + "balance_loss_clip": 0.06280371, + "balance_loss_mlp": 0.01260159, + "epoch": 0.5514504734706147, + "flos": 18740221683840.0, + "grad_norm": 1.5861452481841698, + "language_loss": 0.7047599, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.78173256, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10961914, + "step": 9172, + "time_per_iteration": 2.5032176971435547 + }, + { + "auxiliary_loss_clip": 0.06426188, + "auxiliary_loss_mlp": 0.01273715, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01263218, + "epoch": 0.5515105967232827, + "flos": 22277147978880.0, + "grad_norm": 1.7175476935278873, + "language_loss": 0.76203263, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.8390317, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10498047, + "step": 9173, + "time_per_iteration": 2.577878713607788 + }, + { + "auxiliary_loss_clip": 0.06421756, + "auxiliary_loss_mlp": 0.01264421, + "balance_loss_clip": 0.0627896, + "balance_loss_mlp": 0.01253359, + "epoch": 0.5515707199759508, + "flos": 22563040510080.0, + "grad_norm": 1.5999460100016771, + "language_loss": 0.75182664, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.82868844, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11077881, + "step": 9174, + "time_per_iteration": 2.520578384399414 + }, + { + "auxiliary_loss_clip": 0.06429856, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06282729, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5516308432286187, + "flos": 28298246670720.0, + "grad_norm": 1.7068220971376928, + "language_loss": 0.72958624, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.80653572, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11206055, + "step": 9175, + "time_per_iteration": 2.5991220474243164 + }, + { + "auxiliary_loss_clip": 0.06423448, + "auxiliary_loss_mlp": 0.01272105, + "balance_loss_clip": 0.06278881, + "balance_loss_mlp": 0.01261192, + "epoch": 0.5516909664812867, + "flos": 18769417632000.0, + "grad_norm": 1.996679187528513, + "language_loss": 0.69295454, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.7699101, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10919189, + "step": 9176, + "time_per_iteration": 2.4903998374938965 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.01270885, + "balance_loss_clip": 0.06278497, + "balance_loss_mlp": 0.01260467, + "epoch": 0.5517510897339546, + "flos": 27746852878080.0, + "grad_norm": 1.714802927656724, + "language_loss": 0.71279752, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.78971648, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10418701, + "step": 9177, + "time_per_iteration": 3.9531290531158447 + }, + { + "auxiliary_loss_clip": 0.06428478, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06282966, + "balance_loss_mlp": 0.0125924, + "epoch": 0.5518112129866226, + "flos": 18410165251200.0, + "grad_norm": 1.801915682479776, + "language_loss": 0.80691963, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.8839004, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10369873, + "step": 9178, + "time_per_iteration": 2.5356597900390625 + }, + { + "auxiliary_loss_clip": 0.06432515, + "auxiliary_loss_mlp": 0.01265625, + "balance_loss_clip": 0.06282209, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5518713362392905, + "flos": 25089699726720.0, + "grad_norm": 1.5622133019409348, + "language_loss": 0.7545979, + "learning_rate": 1.761633217089826e-06, + "loss": 0.83157933, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.11395264, + "step": 9179, + "time_per_iteration": 2.598055124282837 + }, + { + "auxiliary_loss_clip": 0.06425376, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01259005, + "epoch": 0.5519314594919585, + "flos": 36547911279360.0, + "grad_norm": 1.6999645614086591, + "language_loss": 0.70073718, + "learning_rate": 1.761246535912924e-06, + "loss": 0.77768701, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1060791, + "step": 9180, + "time_per_iteration": 2.6791419982910156 + }, + { + "auxiliary_loss_clip": 0.06424871, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06279478, + "balance_loss_mlp": 0.01257121, + "epoch": 0.5519915827446265, + "flos": 20454807456000.0, + "grad_norm": 1.7661274413355668, + "language_loss": 0.67505682, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.75199056, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11376953, + "step": 9181, + "time_per_iteration": 4.004978656768799 + }, + { + "auxiliary_loss_clip": 0.06431428, + "auxiliary_loss_mlp": 0.01267631, + "balance_loss_clip": 0.06280805, + "balance_loss_mlp": 0.01256682, + "epoch": 0.5520517059972945, + "flos": 23774672949120.0, + "grad_norm": 1.9095811471330626, + "language_loss": 0.79281217, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.86980277, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.10949707, + "step": 9182, + "time_per_iteration": 2.537867546081543 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.0126956, + "balance_loss_clip": 0.06281601, + "balance_loss_mlp": 0.01258259, + "epoch": 0.5521118292499624, + "flos": 22202362609920.0, + "grad_norm": 1.7640468757897252, + "language_loss": 0.83230162, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.9092862, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11303711, + "step": 9183, + "time_per_iteration": 2.5279808044433594 + }, + { + "auxiliary_loss_clip": 0.0642349, + "auxiliary_loss_mlp": 0.01270068, + "balance_loss_clip": 0.0627853, + "balance_loss_mlp": 0.01259632, + "epoch": 0.5521719525026304, + "flos": 23589491425920.0, + "grad_norm": 1.2800662076099543, + "language_loss": 0.67446053, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.75139618, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10437012, + "step": 9184, + "time_per_iteration": 2.684945821762085 + }, + { + "auxiliary_loss_clip": 0.06425154, + "auxiliary_loss_mlp": 0.01269673, + "balance_loss_clip": 0.06279694, + "balance_loss_mlp": 0.01258652, + "epoch": 0.5522320757552983, + "flos": 26144298414720.0, + "grad_norm": 1.5606033277911597, + "language_loss": 0.76214409, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.83909237, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11016846, + "step": 9185, + "time_per_iteration": 2.654999017715454 + }, + { + "auxiliary_loss_clip": 0.06428938, + "auxiliary_loss_mlp": 0.01270824, + "balance_loss_clip": 0.06280778, + "balance_loss_mlp": 0.01259661, + "epoch": 0.5522921990079663, + "flos": 24682258448640.0, + "grad_norm": 1.714573937603497, + "language_loss": 0.73903292, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.8160305, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1116333, + "step": 9186, + "time_per_iteration": 4.173564672470093 + }, + { + "auxiliary_loss_clip": 0.06430478, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06282008, + "balance_loss_mlp": 0.01260032, + "epoch": 0.5523523222606344, + "flos": 22754888432640.0, + "grad_norm": 1.9890242222634391, + "language_loss": 0.66822404, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.74523699, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10784912, + "step": 9187, + "time_per_iteration": 2.5402488708496094 + }, + { + "auxiliary_loss_clip": 0.06424463, + "auxiliary_loss_mlp": 0.01272464, + "balance_loss_clip": 0.06278258, + "balance_loss_mlp": 0.01261663, + "epoch": 0.5524124455133023, + "flos": 19761976771200.0, + "grad_norm": 1.6249988598177185, + "language_loss": 0.77965587, + "learning_rate": 1.758153413657318e-06, + "loss": 0.85662508, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10803223, + "step": 9188, + "time_per_iteration": 2.4915547370910645 + }, + { + "auxiliary_loss_clip": 0.06426179, + "auxiliary_loss_mlp": 0.01274155, + "balance_loss_clip": 0.06280048, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5524725687659703, + "flos": 23301544469760.0, + "grad_norm": 1.615723789328545, + "language_loss": 0.81586993, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.89287329, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11572266, + "step": 9189, + "time_per_iteration": 2.540083885192871 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 0.06276601, + "balance_loss_mlp": 0.0125776, + "epoch": 0.5525326920186382, + "flos": 24868907418240.0, + "grad_norm": 1.331008644060519, + "language_loss": 0.76847303, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.84535837, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1159668, + "step": 9190, + "time_per_iteration": 2.597717046737671 + }, + { + "auxiliary_loss_clip": 0.0643147, + "auxiliary_loss_mlp": 0.01272383, + "balance_loss_clip": 0.06278718, + "balance_loss_mlp": 0.01260438, + "epoch": 0.5525928152713062, + "flos": 13740710371200.0, + "grad_norm": 2.3910114977567787, + "language_loss": 0.79437977, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.87141836, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.11950684, + "step": 9191, + "time_per_iteration": 2.547445774078369 + }, + { + "auxiliary_loss_clip": 0.06422585, + "auxiliary_loss_mlp": 0.01264097, + "balance_loss_clip": 0.06276913, + "balance_loss_mlp": 0.01253624, + "epoch": 0.5526529385239741, + "flos": 13075398552960.0, + "grad_norm": 2.207227027061606, + "language_loss": 0.6899271, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.76679391, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10473633, + "step": 9192, + "time_per_iteration": 2.4774858951568604 + }, + { + "auxiliary_loss_clip": 0.06421191, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06277353, + "balance_loss_mlp": 0.01257786, + "epoch": 0.5527130617766421, + "flos": 23154992478720.0, + "grad_norm": 1.5351732563488263, + "language_loss": 0.77348876, + "learning_rate": 1.756220509823588e-06, + "loss": 0.85038239, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10400391, + "step": 9193, + "time_per_iteration": 3.9115588665008545 + }, + { + "auxiliary_loss_clip": 0.06421337, + "auxiliary_loss_mlp": 0.01271193, + "balance_loss_clip": 0.06275223, + "balance_loss_mlp": 0.01260357, + "epoch": 0.55277318502931, + "flos": 21291506801280.0, + "grad_norm": 1.5126002389204065, + "language_loss": 0.79036456, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.8672899, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1083374, + "step": 9194, + "time_per_iteration": 2.5319602489471436 + }, + { + "auxiliary_loss_clip": 0.06427231, + "auxiliary_loss_mlp": 0.01269531, + "balance_loss_clip": 0.06274066, + "balance_loss_mlp": 0.01258189, + "epoch": 0.5528333082819781, + "flos": 38333383205760.0, + "grad_norm": 1.8079647356103097, + "language_loss": 0.70506799, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.78203559, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.11340332, + "step": 9195, + "time_per_iteration": 2.6384387016296387 + }, + { + "auxiliary_loss_clip": 0.06436112, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06281462, + "balance_loss_mlp": 0.01253778, + "epoch": 0.552893431534646, + "flos": 13558799157120.0, + "grad_norm": 2.003941554047622, + "language_loss": 0.74570775, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.82273173, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.12506104, + "step": 9196, + "time_per_iteration": 2.5033600330352783 + }, + { + "auxiliary_loss_clip": 0.06429259, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0628302, + "balance_loss_mlp": 0.01252656, + "epoch": 0.552953554787314, + "flos": 21944995194240.0, + "grad_norm": 1.6318385903460113, + "language_loss": 0.77179539, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.8487258, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11126709, + "step": 9197, + "time_per_iteration": 2.500624895095825 + }, + { + "auxiliary_loss_clip": 0.06421226, + "auxiliary_loss_mlp": 0.01269574, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01259316, + "epoch": 0.5530136780399819, + "flos": 43668820736640.0, + "grad_norm": 1.4562548285485233, + "language_loss": 0.76468647, + "learning_rate": 1.754287837093407e-06, + "loss": 0.84159452, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.1026001, + "step": 9198, + "time_per_iteration": 2.7432668209075928 + }, + { + "auxiliary_loss_clip": 0.06427757, + "auxiliary_loss_mlp": 0.0126746, + "balance_loss_clip": 0.06281044, + "balance_loss_mlp": 0.01256994, + "epoch": 0.5530738012926499, + "flos": 25052411859840.0, + "grad_norm": 1.5004430901507595, + "language_loss": 0.79301012, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.86996233, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10461426, + "step": 9199, + "time_per_iteration": 2.547755241394043 + }, + { + "auxiliary_loss_clip": 0.06422742, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06276976, + "balance_loss_mlp": 0.01255962, + "epoch": 0.553133924545318, + "flos": 16477680136320.0, + "grad_norm": 1.9305306774012563, + "language_loss": 0.63492346, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.71181637, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.10577393, + "step": 9200, + "time_per_iteration": 2.5127363204956055 + }, + { + "auxiliary_loss_clip": 0.06431345, + "auxiliary_loss_mlp": 0.01265429, + "balance_loss_clip": 0.06280623, + "balance_loss_mlp": 0.01253866, + "epoch": 0.5531940477979859, + "flos": 24612797813760.0, + "grad_norm": 1.757338852617271, + "language_loss": 0.66817963, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.74514735, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.11560059, + "step": 9201, + "time_per_iteration": 2.5651068687438965 + }, + { + "auxiliary_loss_clip": 0.06425701, + "auxiliary_loss_mlp": 0.01270434, + "balance_loss_clip": 0.06278911, + "balance_loss_mlp": 0.0125871, + "epoch": 0.5532541710506539, + "flos": 22165410159360.0, + "grad_norm": 2.045638683899954, + "language_loss": 0.61266994, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.68963134, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11737061, + "step": 9202, + "time_per_iteration": 2.5841257572174072 + }, + { + "auxiliary_loss_clip": 0.06419975, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06278098, + "balance_loss_mlp": 0.01253493, + "epoch": 0.5533142943033218, + "flos": 21403621964160.0, + "grad_norm": 1.6777411475808515, + "language_loss": 0.64766765, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.72451103, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9203, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01269086, + "balance_loss_clip": 0.06279255, + "balance_loss_mlp": 0.01258065, + "epoch": 0.5533744175559898, + "flos": 23557360584960.0, + "grad_norm": 1.630044734052438, + "language_loss": 0.63918829, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.71613109, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11022949, + "step": 9204, + "time_per_iteration": 2.5487308502197266 + }, + { + "auxiliary_loss_clip": 0.0642142, + "auxiliary_loss_mlp": 0.01264869, + "balance_loss_clip": 0.06278381, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5534345408086577, + "flos": 24068447763840.0, + "grad_norm": 1.4496742073495597, + "language_loss": 0.77449042, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.85135335, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10418701, + "step": 9205, + "time_per_iteration": 2.5445451736450195 + }, + { + "auxiliary_loss_clip": 0.06419459, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06277758, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5534946640613257, + "flos": 33781242441600.0, + "grad_norm": 1.38023808830968, + "language_loss": 0.72729224, + "learning_rate": 1.751196045993537e-06, + "loss": 0.80413151, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1036377, + "step": 9206, + "time_per_iteration": 2.7339117527008057 + }, + { + "auxiliary_loss_clip": 0.06421407, + "auxiliary_loss_mlp": 0.01265704, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01255005, + "epoch": 0.5535547873139937, + "flos": 15164707783680.0, + "grad_norm": 1.9977188658051825, + "language_loss": 0.7547437, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.83161485, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.10699463, + "step": 9207, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06436527, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06285885, + "balance_loss_mlp": 0.01254493, + "epoch": 0.5536149105666617, + "flos": 16986209765760.0, + "grad_norm": 2.498092208232672, + "language_loss": 0.61888683, + "learning_rate": 1.750423192272189e-06, + "loss": 0.69591099, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.1138916, + "step": 9208, + "time_per_iteration": 2.493628740310669 + }, + { + "auxiliary_loss_clip": 0.06428279, + "auxiliary_loss_mlp": 0.01268207, + "balance_loss_clip": 0.06278799, + "balance_loss_mlp": 0.01256543, + "epoch": 0.5536750338193296, + "flos": 18155732728320.0, + "grad_norm": 2.094677241914043, + "language_loss": 0.64708155, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.72404641, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11663818, + "step": 9209, + "time_per_iteration": 2.4616804122924805 + }, + { + "auxiliary_loss_clip": 0.06424735, + "auxiliary_loss_mlp": 0.01272111, + "balance_loss_clip": 0.06279891, + "balance_loss_mlp": 0.01260863, + "epoch": 0.5537351570719976, + "flos": 22754469162240.0, + "grad_norm": 1.8280568303571236, + "language_loss": 0.82967091, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.90663934, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11242676, + "step": 9210, + "time_per_iteration": 2.564713954925537 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265805, + "balance_loss_clip": 0.06275869, + "balance_loss_mlp": 0.01255381, + "epoch": 0.5537952803246655, + "flos": 26362658954880.0, + "grad_norm": 1.71176011345987, + "language_loss": 0.72960317, + "learning_rate": 1.74926398270663e-06, + "loss": 0.80644828, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10430908, + "step": 9211, + "time_per_iteration": 2.5312066078186035 + }, + { + "auxiliary_loss_clip": 0.06431179, + "auxiliary_loss_mlp": 0.01267507, + "balance_loss_clip": 0.06280635, + "balance_loss_mlp": 0.01256045, + "epoch": 0.5538554035773335, + "flos": 18042695170560.0, + "grad_norm": 2.3508559175952803, + "language_loss": 0.67497891, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.75196576, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.11462402, + "step": 9212, + "time_per_iteration": 2.5141408443450928 + }, + { + "auxiliary_loss_clip": 0.06429373, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.0125554, + "epoch": 0.5539155268300014, + "flos": 31694323052160.0, + "grad_norm": 1.4365879651928444, + "language_loss": 0.5225575, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.59953463, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12792969, + "step": 9213, + "time_per_iteration": 2.5764448642730713 + }, + { + "auxiliary_loss_clip": 0.06431083, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06282363, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5539756500826695, + "flos": 15198934976640.0, + "grad_norm": 1.6892906357761146, + "language_loss": 0.85764515, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.93460202, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.11303711, + "step": 9214, + "time_per_iteration": 2.5433578491210938 + }, + { + "auxiliary_loss_clip": 0.06422558, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01252333, + "epoch": 0.5540357733353375, + "flos": 26359262864640.0, + "grad_norm": 1.8961662277212366, + "language_loss": 0.70100081, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.77785456, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.10491943, + "step": 9215, + "time_per_iteration": 2.548687696456909 + }, + { + "auxiliary_loss_clip": 0.06428155, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06279612, + "balance_loss_mlp": 0.01254825, + "epoch": 0.5540958965880054, + "flos": 21329926698240.0, + "grad_norm": 1.6927060371572338, + "language_loss": 0.73713386, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.81407875, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.1151123, + "step": 9216, + "time_per_iteration": 2.541210174560547 + }, + { + "auxiliary_loss_clip": 0.06421469, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06278324, + "balance_loss_mlp": 0.01254471, + "epoch": 0.5541560198406734, + "flos": 25674020974080.0, + "grad_norm": 1.768513313341331, + "language_loss": 0.71651757, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.79338706, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11029053, + "step": 9217, + "time_per_iteration": 4.048692226409912 + }, + { + "auxiliary_loss_clip": 0.0642062, + "auxiliary_loss_mlp": 0.01262573, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01251993, + "epoch": 0.5542161430933413, + "flos": 21945246756480.0, + "grad_norm": 1.641855173543887, + "language_loss": 0.78896093, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.86579281, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10577393, + "step": 9218, + "time_per_iteration": 2.5090229511260986 + }, + { + "auxiliary_loss_clip": 0.06429659, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255023, + "epoch": 0.5542762663460093, + "flos": 19577256445440.0, + "grad_norm": 1.9145093316494244, + "language_loss": 0.72342837, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.80039406, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11889648, + "step": 9219, + "time_per_iteration": 2.6097207069396973 + }, + { + "auxiliary_loss_clip": 0.06423312, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06275792, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5543363895986773, + "flos": 19504944771840.0, + "grad_norm": 1.6265573389583097, + "language_loss": 0.7175796, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.79449117, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11035156, + "step": 9220, + "time_per_iteration": 3.953366756439209 + }, + { + "auxiliary_loss_clip": 0.0641966, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06276264, + "balance_loss_mlp": 0.01256154, + "epoch": 0.5543965128513453, + "flos": 22641808947840.0, + "grad_norm": 1.5837082117197903, + "language_loss": 0.79554594, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.8724097, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9221, + "time_per_iteration": 2.6012284755706787 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268367, + "balance_loss_clip": 0.06276818, + "balance_loss_mlp": 0.0125715, + "epoch": 0.5544566361040132, + "flos": 25996320904320.0, + "grad_norm": 1.7031606951897913, + "language_loss": 0.8378005, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.91468251, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11224365, + "step": 9222, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.06426205, + "auxiliary_loss_mlp": 0.01268401, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5545167593566812, + "flos": 28265235361920.0, + "grad_norm": 1.624171595552914, + "language_loss": 0.75644016, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.83338618, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1217041, + "step": 9223, + "time_per_iteration": 2.6189255714416504 + }, + { + "auxiliary_loss_clip": 0.06421085, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06275317, + "balance_loss_mlp": 0.01255168, + "epoch": 0.5545768826093491, + "flos": 28484266734720.0, + "grad_norm": 1.537609394832996, + "language_loss": 0.81879461, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.89567149, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11425781, + "step": 9224, + "time_per_iteration": 2.5794196128845215 + }, + { + "auxiliary_loss_clip": 0.06424309, + "auxiliary_loss_mlp": 0.01271127, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01259432, + "epoch": 0.5546370058620171, + "flos": 18483860517120.0, + "grad_norm": 1.6794429489770297, + "language_loss": 0.57241935, + "learning_rate": 1.743855475904141e-06, + "loss": 0.64937371, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11688232, + "step": 9225, + "time_per_iteration": 3.9698383808135986 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.01267893, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01257009, + "epoch": 0.554697129114685, + "flos": 22937260844160.0, + "grad_norm": 1.5804786041677554, + "language_loss": 0.6778791, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.75478059, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10870361, + "step": 9226, + "time_per_iteration": 2.5307633876800537 + }, + { + "auxiliary_loss_clip": 0.06423603, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256002, + "epoch": 0.5547572523673531, + "flos": 21803348666880.0, + "grad_norm": 1.2977635143377364, + "language_loss": 0.74954712, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.82645351, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11047363, + "step": 9227, + "time_per_iteration": 2.5083706378936768 + }, + { + "auxiliary_loss_clip": 0.06423934, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06275739, + "balance_loss_mlp": 0.01254768, + "epoch": 0.5548173756200211, + "flos": 22348830746880.0, + "grad_norm": 1.524887798675916, + "language_loss": 0.73794919, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.81485081, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11444092, + "step": 9228, + "time_per_iteration": 2.555020809173584 + }, + { + "auxiliary_loss_clip": 0.06423147, + "auxiliary_loss_mlp": 0.01263866, + "balance_loss_clip": 0.06276013, + "balance_loss_mlp": 0.01253465, + "epoch": 0.554877498872689, + "flos": 17864599317120.0, + "grad_norm": 1.7043498128680434, + "language_loss": 0.76352561, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.84039581, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10400391, + "step": 9229, + "time_per_iteration": 2.4959444999694824 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01266918, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125464, + "epoch": 0.554937622125357, + "flos": 17244080305920.0, + "grad_norm": 1.4897541866361217, + "language_loss": 0.69068646, + "learning_rate": 1.741924325613172e-06, + "loss": 0.76758856, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12280273, + "step": 9230, + "time_per_iteration": 2.5090713500976562 + }, + { + "auxiliary_loss_clip": 0.06427252, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06276985, + "balance_loss_mlp": 0.01254587, + "epoch": 0.5549977453780249, + "flos": 25374082884480.0, + "grad_norm": 2.3665837136773047, + "language_loss": 0.68808627, + "learning_rate": 1.741538124855163e-06, + "loss": 0.76503003, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12554932, + "step": 9231, + "time_per_iteration": 2.5350747108459473 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01269438, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01256885, + "epoch": 0.555057868630693, + "flos": 25085548949760.0, + "grad_norm": 1.6698826084601515, + "language_loss": 0.78408533, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.86107397, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.12542725, + "step": 9232, + "time_per_iteration": 4.055214881896973 + }, + { + "auxiliary_loss_clip": 0.06416719, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01255972, + "epoch": 0.5551179918833609, + "flos": 26111412887040.0, + "grad_norm": 1.627879634610194, + "language_loss": 0.83063745, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.90747154, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10723877, + "step": 9233, + "time_per_iteration": 2.6376969814300537 + }, + { + "auxiliary_loss_clip": 0.06430396, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06277359, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5551781151360289, + "flos": 19389810862080.0, + "grad_norm": 2.483522309942904, + "language_loss": 0.7549684, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.83193588, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.11431885, + "step": 9234, + "time_per_iteration": 2.4859883785247803 + }, + { + "auxiliary_loss_clip": 0.06418739, + "auxiliary_loss_mlp": 0.01265554, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255129, + "epoch": 0.5552382383886968, + "flos": 21732420585600.0, + "grad_norm": 1.8065340969909298, + "language_loss": 0.64963275, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.72647566, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.10418701, + "step": 9235, + "time_per_iteration": 2.523128032684326 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_clip": 0.06272598, + "balance_loss_mlp": 0.0125519, + "epoch": 0.5552983616413648, + "flos": 14361480944640.0, + "grad_norm": 1.6397834212981734, + "language_loss": 0.68087149, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.75775141, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11932373, + "step": 9236, + "time_per_iteration": 2.506023406982422 + }, + { + "auxiliary_loss_clip": 0.06416081, + "auxiliary_loss_mlp": 0.01266517, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01256068, + "epoch": 0.5553584848940327, + "flos": 25484730600960.0, + "grad_norm": 1.5459271274239896, + "language_loss": 0.86436939, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.94119537, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10449219, + "step": 9237, + "time_per_iteration": 2.580103874206543 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 0.06273238, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5554186081467007, + "flos": 22170399477120.0, + "grad_norm": 1.8042242059193758, + "language_loss": 0.73774469, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.81458282, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11517334, + "step": 9238, + "time_per_iteration": 2.5031590461730957 + }, + { + "auxiliary_loss_clip": 0.0642554, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06275032, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5554787313993687, + "flos": 49757744908800.0, + "grad_norm": 1.5320503148177431, + "language_loss": 0.78384852, + "learning_rate": 1.73844887285358e-06, + "loss": 0.86077076, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.10766602, + "step": 9239, + "time_per_iteration": 2.7739756107330322 + }, + { + "auxiliary_loss_clip": 0.06423195, + "auxiliary_loss_mlp": 0.01266863, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0125546, + "epoch": 0.5555388546520367, + "flos": 22133908224000.0, + "grad_norm": 1.4777059666754715, + "language_loss": 0.80562818, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.88252878, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11401367, + "step": 9240, + "time_per_iteration": 2.5036380290985107 + }, + { + "auxiliary_loss_clip": 0.06419357, + "auxiliary_loss_mlp": 0.01266651, + "balance_loss_clip": 0.06273453, + "balance_loss_mlp": 0.01255142, + "epoch": 0.5555989779047047, + "flos": 24689218337280.0, + "grad_norm": 1.7126628457644222, + "language_loss": 0.65465248, + "learning_rate": 1.737676658740786e-06, + "loss": 0.73151255, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1151123, + "step": 9241, + "time_per_iteration": 2.5851833820343018 + }, + { + "auxiliary_loss_clip": 0.06422672, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06276439, + "balance_loss_mlp": 0.01252566, + "epoch": 0.5556591011573726, + "flos": 16111929064320.0, + "grad_norm": 1.8766289396676605, + "language_loss": 0.73123193, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.80809897, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11474609, + "step": 9242, + "time_per_iteration": 2.467933416366577 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01253022, + "epoch": 0.5557192244100406, + "flos": 12938825197440.0, + "grad_norm": 6.974019127266796, + "language_loss": 0.64053857, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.71743226, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12365723, + "step": 9243, + "time_per_iteration": 2.528529167175293 + }, + { + "auxiliary_loss_clip": 0.0642553, + "auxiliary_loss_mlp": 0.01269814, + "balance_loss_clip": 0.06280211, + "balance_loss_mlp": 0.01258614, + "epoch": 0.5557793476627085, + "flos": 23118291590400.0, + "grad_norm": 3.1703508621435095, + "language_loss": 0.75212169, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.82907516, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11199951, + "step": 9244, + "time_per_iteration": 2.5159640312194824 + }, + { + "auxiliary_loss_clip": 0.06417421, + "auxiliary_loss_mlp": 0.01263368, + "balance_loss_clip": 0.06277108, + "balance_loss_mlp": 0.01252938, + "epoch": 0.5558394709153766, + "flos": 21433446817920.0, + "grad_norm": 2.161992759062338, + "language_loss": 0.74536991, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.82217783, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10437012, + "step": 9245, + "time_per_iteration": 2.5320873260498047 + }, + { + "auxiliary_loss_clip": 0.06425805, + "auxiliary_loss_mlp": 0.01268074, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255533, + "epoch": 0.5558995941680445, + "flos": 25084626554880.0, + "grad_norm": 2.1186554191459575, + "language_loss": 0.79345202, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.87039083, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12530518, + "step": 9246, + "time_per_iteration": 2.5617494583129883 + }, + { + "auxiliary_loss_clip": 0.06425521, + "auxiliary_loss_mlp": 0.01270795, + "balance_loss_clip": 0.06276709, + "balance_loss_mlp": 0.01258993, + "epoch": 0.5559597174207125, + "flos": 20017331688960.0, + "grad_norm": 1.8080775090170724, + "language_loss": 0.7423467, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.81930989, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.11810303, + "step": 9247, + "time_per_iteration": 2.5472562313079834 + }, + { + "auxiliary_loss_clip": 0.06421669, + "auxiliary_loss_mlp": 0.01265666, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01254007, + "epoch": 0.5560198406733804, + "flos": 16841125221120.0, + "grad_norm": 2.9360607038713127, + "language_loss": 0.75686443, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.83373785, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11645508, + "step": 9248, + "time_per_iteration": 2.4991230964660645 + }, + { + "auxiliary_loss_clip": 0.06332292, + "auxiliary_loss_mlp": 0.01252325, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01250564, + "epoch": 0.5560799639260484, + "flos": 70719012840960.0, + "grad_norm": 0.8521249277155936, + "language_loss": 0.5948171, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.67066324, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01763916, + "step": 9249, + "time_per_iteration": 3.2450287342071533 + }, + { + "auxiliary_loss_clip": 0.06424973, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06276406, + "balance_loss_mlp": 0.01253943, + "epoch": 0.5561400871787163, + "flos": 23155244040960.0, + "grad_norm": 2.0335955894649036, + "language_loss": 0.79889202, + "learning_rate": 1.734202189316832e-06, + "loss": 0.87578869, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.10748291, + "step": 9250, + "time_per_iteration": 2.5372138023376465 + }, + { + "auxiliary_loss_clip": 0.06427802, + "auxiliary_loss_mlp": 0.0126907, + "balance_loss_clip": 0.06277002, + "balance_loss_mlp": 0.01257471, + "epoch": 0.5562002104313843, + "flos": 17572166167680.0, + "grad_norm": 3.4851408255327856, + "language_loss": 0.69400316, + "learning_rate": 1.733816187358836e-06, + "loss": 0.77097189, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11584473, + "step": 9251, + "time_per_iteration": 2.554487943649292 + }, + { + "auxiliary_loss_clip": 0.06422772, + "auxiliary_loss_mlp": 0.01265424, + "balance_loss_clip": 0.06275512, + "balance_loss_mlp": 0.01253676, + "epoch": 0.5562603336840523, + "flos": 25052328005760.0, + "grad_norm": 1.4438817767967254, + "language_loss": 0.75297302, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.82985497, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11743164, + "step": 9252, + "time_per_iteration": 2.554103374481201 + }, + { + "auxiliary_loss_clip": 0.06427599, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01254115, + "epoch": 0.5563204569367203, + "flos": 29066617411200.0, + "grad_norm": 1.5076691298158018, + "language_loss": 0.72903025, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.80595708, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.10980225, + "step": 9253, + "time_per_iteration": 2.5654473304748535 + }, + { + "auxiliary_loss_clip": 0.06422551, + "auxiliary_loss_mlp": 0.01269621, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.01259161, + "epoch": 0.5563805801893883, + "flos": 22096913846400.0, + "grad_norm": 1.9717474280435598, + "language_loss": 0.83141911, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.90834075, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.10455322, + "step": 9254, + "time_per_iteration": 2.5113630294799805 + }, + { + "auxiliary_loss_clip": 0.06332405, + "auxiliary_loss_mlp": 0.01255231, + "balance_loss_clip": 0.06269685, + "balance_loss_mlp": 0.01253453, + "epoch": 0.5564407034420562, + "flos": 58652623555200.0, + "grad_norm": 0.8548643960281289, + "language_loss": 0.64887053, + "learning_rate": 1.732272280610387e-06, + "loss": 0.72474694, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01777649, + "step": 9255, + "time_per_iteration": 2.980931043624878 + }, + { + "auxiliary_loss_clip": 0.06420524, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06275329, + "balance_loss_mlp": 0.01254175, + "epoch": 0.5565008266947242, + "flos": 23119004350080.0, + "grad_norm": 1.731717948076331, + "language_loss": 0.69607276, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.77292871, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10900879, + "step": 9256, + "time_per_iteration": 3.9532642364501953 + }, + { + "auxiliary_loss_clip": 0.06418847, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06276, + "balance_loss_mlp": 0.01254551, + "epoch": 0.5565609499473921, + "flos": 21584568856320.0, + "grad_norm": 1.4749881970234011, + "language_loss": 0.76680368, + "learning_rate": 1.73150038809119e-06, + "loss": 0.84364206, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10443115, + "step": 9257, + "time_per_iteration": 2.4937705993652344 + }, + { + "auxiliary_loss_clip": 0.06425476, + "auxiliary_loss_mlp": 0.01273625, + "balance_loss_clip": 0.0627654, + "balance_loss_mlp": 0.01262735, + "epoch": 0.5566210732000602, + "flos": 18375602641920.0, + "grad_norm": 2.7130999997532563, + "language_loss": 0.61334699, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.69033802, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.10894775, + "step": 9258, + "time_per_iteration": 2.5560710430145264 + }, + { + "auxiliary_loss_clip": 0.06420255, + "auxiliary_loss_mlp": 0.01266708, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01255431, + "epoch": 0.5566811964527281, + "flos": 25710554154240.0, + "grad_norm": 1.5983859944569927, + "language_loss": 0.79631943, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.87318903, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11279297, + "step": 9259, + "time_per_iteration": 2.582550525665283 + }, + { + "auxiliary_loss_clip": 0.06421982, + "auxiliary_loss_mlp": 0.01267837, + "balance_loss_clip": 0.06275143, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5567413197053961, + "flos": 26951424468480.0, + "grad_norm": 1.7768491917262519, + "language_loss": 0.81632483, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.89322305, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.10821533, + "step": 9260, + "time_per_iteration": 3.994185209274292 + }, + { + "auxiliary_loss_clip": 0.0642475, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257598, + "epoch": 0.556801442958064, + "flos": 20856965927040.0, + "grad_norm": 1.6577209620324271, + "language_loss": 0.69569898, + "learning_rate": 1.729956725348256e-06, + "loss": 0.77264518, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1227417, + "step": 9261, + "time_per_iteration": 2.558511734008789 + }, + { + "auxiliary_loss_clip": 0.06317247, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06255186, + "balance_loss_mlp": 0.01252651, + "epoch": 0.556861566210732, + "flos": 70517395918080.0, + "grad_norm": 0.7170849600938061, + "language_loss": 0.61090672, + "learning_rate": 1.729570835226108e-06, + "loss": 0.68662429, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.01856995, + "step": 9262, + "time_per_iteration": 3.134216070175171 + }, + { + "auxiliary_loss_clip": 0.06422806, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01259214, + "epoch": 0.5569216894633999, + "flos": 25344216103680.0, + "grad_norm": 1.5027402480240113, + "language_loss": 0.64822662, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.72516024, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11340332, + "step": 9263, + "time_per_iteration": 2.5533127784729004 + }, + { + "auxiliary_loss_clip": 0.06420417, + "auxiliary_loss_mlp": 0.01271706, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01260679, + "epoch": 0.556981812716068, + "flos": 22645456600320.0, + "grad_norm": 1.647856593864945, + "language_loss": 0.73077464, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.80769587, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11035156, + "step": 9264, + "time_per_iteration": 2.5055153369903564 + }, + { + "auxiliary_loss_clip": 0.06421056, + "auxiliary_loss_mlp": 0.01267322, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01255765, + "epoch": 0.5570419359687359, + "flos": 11040567275520.0, + "grad_norm": 1.7723772076526776, + "language_loss": 0.7667138, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.84359753, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11553955, + "step": 9265, + "time_per_iteration": 3.964038372039795 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01273186, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01262368, + "epoch": 0.5571020592214039, + "flos": 22830218853120.0, + "grad_norm": 1.7025735740351078, + "language_loss": 0.71389985, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.79079872, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1081543, + "step": 9266, + "time_per_iteration": 2.5572071075439453 + }, + { + "auxiliary_loss_clip": 0.06418756, + "auxiliary_loss_mlp": 0.01270352, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01259701, + "epoch": 0.5571621824740719, + "flos": 22934074389120.0, + "grad_norm": 1.5846567867344512, + "language_loss": 0.68614411, + "learning_rate": 1.727641538728533e-06, + "loss": 0.76303518, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10656738, + "step": 9267, + "time_per_iteration": 2.4949660301208496 + }, + { + "auxiliary_loss_clip": 0.06419186, + "auxiliary_loss_mlp": 0.01266996, + "balance_loss_clip": 0.06277707, + "balance_loss_mlp": 0.01255677, + "epoch": 0.5572223057267398, + "flos": 22973416680960.0, + "grad_norm": 2.0664301257613684, + "language_loss": 0.75132561, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.82818741, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11315918, + "step": 9268, + "time_per_iteration": 2.5834717750549316 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06276375, + "balance_loss_mlp": 0.01252184, + "epoch": 0.5572824289794078, + "flos": 20966439686400.0, + "grad_norm": 2.076388090189787, + "language_loss": 0.75247812, + "learning_rate": 1.726869892322104e-06, + "loss": 0.8293134, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10803223, + "step": 9269, + "time_per_iteration": 2.6340525150299072 + }, + { + "auxiliary_loss_clip": 0.06420279, + "auxiliary_loss_mlp": 0.01268076, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5573425522320757, + "flos": 25048806134400.0, + "grad_norm": 1.9328220368280318, + "language_loss": 0.82704222, + "learning_rate": 1.726484084647256e-06, + "loss": 0.90392578, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.10986328, + "step": 9270, + "time_per_iteration": 2.6455605030059814 + }, + { + "auxiliary_loss_clip": 0.06426194, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01255657, + "epoch": 0.5574026754847438, + "flos": 23666415073920.0, + "grad_norm": 1.8553396052443616, + "language_loss": 0.79884106, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.87577355, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.1138916, + "step": 9271, + "time_per_iteration": 4.060855388641357 + }, + { + "auxiliary_loss_clip": 0.0642622, + "auxiliary_loss_mlp": 0.01265728, + "balance_loss_clip": 0.0627868, + "balance_loss_mlp": 0.01254153, + "epoch": 0.5574627987374117, + "flos": 24787791066240.0, + "grad_norm": 1.7644146130703546, + "language_loss": 0.90646034, + "learning_rate": 1.725712500427442e-06, + "loss": 0.9833799, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11572266, + "step": 9272, + "time_per_iteration": 2.534665107727051 + }, + { + "auxiliary_loss_clip": 0.0641982, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06279024, + "balance_loss_mlp": 0.0125446, + "epoch": 0.5575229219900797, + "flos": 21841349293440.0, + "grad_norm": 1.8989818213493146, + "language_loss": 0.84368634, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.92053914, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10992432, + "step": 9273, + "time_per_iteration": 2.5200788974761963 + }, + { + "auxiliary_loss_clip": 0.06423581, + "auxiliary_loss_mlp": 0.01268606, + "balance_loss_clip": 0.06278996, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5575830452427476, + "flos": 27821973663360.0, + "grad_norm": 1.9193499092419828, + "language_loss": 0.75017828, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.82710016, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12548828, + "step": 9274, + "time_per_iteration": 2.548865795135498 + }, + { + "auxiliary_loss_clip": 0.06435296, + "auxiliary_loss_mlp": 0.01273341, + "balance_loss_clip": 0.06282236, + "balance_loss_mlp": 0.01260806, + "epoch": 0.5576431684954156, + "flos": 17817081252480.0, + "grad_norm": 2.8160029917848397, + "language_loss": 0.78999293, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.86707926, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12530518, + "step": 9275, + "time_per_iteration": 2.503168821334839 + }, + { + "auxiliary_loss_clip": 0.06426495, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06279385, + "balance_loss_mlp": 0.01253372, + "epoch": 0.5577032917480835, + "flos": 15492290520960.0, + "grad_norm": 1.5722489245589244, + "language_loss": 0.75639874, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.83331323, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.11584473, + "step": 9276, + "time_per_iteration": 2.466275215148926 + }, + { + "auxiliary_loss_clip": 0.06423229, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06277048, + "balance_loss_mlp": 0.01256379, + "epoch": 0.5577634150007516, + "flos": 21586162083840.0, + "grad_norm": 1.8200099839217898, + "language_loss": 0.75387412, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.83078039, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11022949, + "step": 9277, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.06420221, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01255535, + "epoch": 0.5578235382534195, + "flos": 21145709496960.0, + "grad_norm": 1.5944068660293211, + "language_loss": 0.7198559, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.79672027, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10681152, + "step": 9278, + "time_per_iteration": 2.4954776763916016 + }, + { + "auxiliary_loss_clip": 0.06425839, + "auxiliary_loss_mlp": 0.01267939, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01255166, + "epoch": 0.5578836615060875, + "flos": 26512397400960.0, + "grad_norm": 1.4623548994871365, + "language_loss": 0.75693482, + "learning_rate": 1.723012284057868e-06, + "loss": 0.83387262, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.12786865, + "step": 9279, + "time_per_iteration": 2.5537941455841064 + }, + { + "auxiliary_loss_clip": 0.06422286, + "auxiliary_loss_mlp": 0.01267149, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255354, + "epoch": 0.5579437847587555, + "flos": 20159439413760.0, + "grad_norm": 1.637545301877737, + "language_loss": 0.67443848, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.75133282, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11791992, + "step": 9280, + "time_per_iteration": 2.489867925643921 + }, + { + "auxiliary_loss_clip": 0.06426547, + "auxiliary_loss_mlp": 0.01266943, + "balance_loss_clip": 0.06276332, + "balance_loss_mlp": 0.01255851, + "epoch": 0.5580039080114234, + "flos": 26109148826880.0, + "grad_norm": 1.5394249927656036, + "language_loss": 0.7336756, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.81061053, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.11090088, + "step": 9281, + "time_per_iteration": 2.693004846572876 + }, + { + "auxiliary_loss_clip": 0.06420805, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06277525, + "balance_loss_mlp": 0.01255244, + "epoch": 0.5580640312640914, + "flos": 13776740426880.0, + "grad_norm": 2.347269898773066, + "language_loss": 0.75313729, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.83000439, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10662842, + "step": 9282, + "time_per_iteration": 2.472775936126709 + }, + { + "auxiliary_loss_clip": 0.06421494, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01258871, + "epoch": 0.5581241545167593, + "flos": 17681765708160.0, + "grad_norm": 1.6208158464679243, + "language_loss": 0.66451746, + "learning_rate": 1.721469534028297e-06, + "loss": 0.74143445, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11334229, + "step": 9283, + "time_per_iteration": 2.495039224624634 + }, + { + "auxiliary_loss_clip": 0.06423882, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01257489, + "epoch": 0.5581842777694274, + "flos": 19574573114880.0, + "grad_norm": 1.8440828180500004, + "language_loss": 0.83265072, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.90957028, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10583496, + "step": 9284, + "time_per_iteration": 2.479743719100952 + }, + { + "auxiliary_loss_clip": 0.06423684, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01255412, + "epoch": 0.5582444010220953, + "flos": 20601485228160.0, + "grad_norm": 2.4189186360573407, + "language_loss": 0.86142218, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.93832451, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11132812, + "step": 9285, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.06422924, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277917, + "balance_loss_mlp": 0.01255818, + "epoch": 0.5583045242747633, + "flos": 19141541614080.0, + "grad_norm": 2.3862114712175013, + "language_loss": 0.74476177, + "learning_rate": 1.720312582354912e-06, + "loss": 0.82165694, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10772705, + "step": 9286, + "time_per_iteration": 2.502807378768921 + }, + { + "auxiliary_loss_clip": 0.06421416, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276793, + "balance_loss_mlp": 0.01256448, + "epoch": 0.5583646475274312, + "flos": 27462050449920.0, + "grad_norm": 1.681368685974995, + "language_loss": 0.74959427, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.82648808, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11529541, + "step": 9287, + "time_per_iteration": 2.5700645446777344 + }, + { + "auxiliary_loss_clip": 0.06430193, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06279745, + "balance_loss_mlp": 0.01254601, + "epoch": 0.5584247707800992, + "flos": 23659580966400.0, + "grad_norm": 1.4753035778898818, + "language_loss": 0.75157738, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.82854563, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12036133, + "step": 9288, + "time_per_iteration": 2.529250383377075 + }, + { + "auxiliary_loss_clip": 0.06424332, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06280167, + "balance_loss_mlp": 0.01254847, + "epoch": 0.5584848940327671, + "flos": 13703967555840.0, + "grad_norm": 2.2558701039351696, + "language_loss": 0.78180242, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.85871768, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12329102, + "step": 9289, + "time_per_iteration": 2.5093841552734375 + }, + { + "auxiliary_loss_clip": 0.06428449, + "auxiliary_loss_mlp": 0.0126783, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01255921, + "epoch": 0.5585450172854352, + "flos": 27023526506880.0, + "grad_norm": 1.7277790144481269, + "language_loss": 0.61688149, + "learning_rate": 1.718770128672817e-06, + "loss": 0.69384426, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.11914062, + "step": 9290, + "time_per_iteration": 2.5534214973449707 + }, + { + "auxiliary_loss_clip": 0.0642647, + "auxiliary_loss_mlp": 0.01268365, + "balance_loss_clip": 0.06277582, + "balance_loss_mlp": 0.01256581, + "epoch": 0.5586051405381031, + "flos": 23192406126720.0, + "grad_norm": 2.1760973422208965, + "language_loss": 0.67914414, + "learning_rate": 1.7183845418764e-06, + "loss": 0.75609255, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11767578, + "step": 9291, + "time_per_iteration": 2.5376763343811035 + }, + { + "auxiliary_loss_clip": 0.0642361, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06277996, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5586652637907711, + "flos": 20781551652480.0, + "grad_norm": 1.760966459417108, + "language_loss": 0.84366935, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.92057884, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11578369, + "step": 9292, + "time_per_iteration": 2.5204405784606934 + }, + { + "auxiliary_loss_clip": 0.06422292, + "auxiliary_loss_mlp": 0.01268661, + "balance_loss_clip": 0.06279489, + "balance_loss_mlp": 0.01257848, + "epoch": 0.5587253870434391, + "flos": 28227360516480.0, + "grad_norm": 1.8754942991534513, + "language_loss": 0.7459076, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.82281709, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10821533, + "step": 9293, + "time_per_iteration": 2.6592154502868652 + }, + { + "auxiliary_loss_clip": 0.06418014, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01254809, + "epoch": 0.558785510296107, + "flos": 26623128971520.0, + "grad_norm": 1.7285534178917525, + "language_loss": 0.72416651, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.80100018, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10546875, + "step": 9294, + "time_per_iteration": 2.538320779800415 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276325, + "balance_loss_mlp": 0.01257208, + "epoch": 0.558845633548775, + "flos": 20162919358080.0, + "grad_norm": 2.7937117268116656, + "language_loss": 0.69210899, + "learning_rate": 1.716842301625806e-06, + "loss": 0.76900959, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.109375, + "step": 9295, + "time_per_iteration": 2.5218520164489746 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.0126519, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01253776, + "epoch": 0.5589057568014429, + "flos": 24357317114880.0, + "grad_norm": 1.5440712557728564, + "language_loss": 0.80893242, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.88577229, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11419678, + "step": 9296, + "time_per_iteration": 3.9467618465423584 + }, + { + "auxiliary_loss_clip": 0.06419219, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01255019, + "epoch": 0.558965880054111, + "flos": 21111440376960.0, + "grad_norm": 1.9869508208087105, + "language_loss": 0.65690488, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.73375666, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9297, + "time_per_iteration": 2.528181791305542 + }, + { + "auxiliary_loss_clip": 0.06424123, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.0125527, + "epoch": 0.5590260033067789, + "flos": 18440954426880.0, + "grad_norm": 1.490575561372924, + "language_loss": 0.75263643, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.82955098, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12054443, + "step": 9298, + "time_per_iteration": 2.5208308696746826 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01252986, + "balance_loss_clip": 0.06249566, + "balance_loss_mlp": 0.01251184, + "epoch": 0.5590861265594469, + "flos": 70597673729280.0, + "grad_norm": 0.6945904868111653, + "language_loss": 0.52248931, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.59813559, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.62158203, + "router_z_loss_mlp": 0.01797485, + "step": 9299, + "time_per_iteration": 4.702880144119263 + }, + { + "auxiliary_loss_clip": 0.06418106, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06276019, + "balance_loss_mlp": 0.01256905, + "epoch": 0.5591462498121148, + "flos": 30672274475520.0, + "grad_norm": 1.7758709427362191, + "language_loss": 0.68987107, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.76672685, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10571289, + "step": 9300, + "time_per_iteration": 2.6169886589050293 + }, + { + "auxiliary_loss_clip": 0.06428309, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.06278549, + "balance_loss_mlp": 0.01254727, + "epoch": 0.5592063730647828, + "flos": 18156319706880.0, + "grad_norm": 3.029569475440017, + "language_loss": 0.81908011, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.89602816, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11761475, + "step": 9301, + "time_per_iteration": 2.4880383014678955 + }, + { + "auxiliary_loss_clip": 0.06421784, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 0.06274376, + "balance_loss_mlp": 0.01256101, + "epoch": 0.5592664963174507, + "flos": 24067148025600.0, + "grad_norm": 2.0495431587104216, + "language_loss": 0.67981839, + "learning_rate": 1.714143795138756e-06, + "loss": 0.75671041, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11315918, + "step": 9302, + "time_per_iteration": 2.5440263748168945 + }, + { + "auxiliary_loss_clip": 0.06427488, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.0627801, + "balance_loss_mlp": 0.01254121, + "epoch": 0.5593266195701188, + "flos": 19833911101440.0, + "grad_norm": 1.543967288464222, + "language_loss": 0.70932961, + "learning_rate": 1.713758337453878e-06, + "loss": 0.78626627, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.12042236, + "step": 9303, + "time_per_iteration": 2.52182674407959 + }, + { + "auxiliary_loss_clip": 0.06417537, + "auxiliary_loss_mlp": 0.01265621, + "balance_loss_clip": 0.06276484, + "balance_loss_mlp": 0.01255453, + "epoch": 0.5593867428227867, + "flos": 25307682923520.0, + "grad_norm": 1.5891501411536748, + "language_loss": 0.73189592, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.8087275, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10168457, + "step": 9304, + "time_per_iteration": 3.999878406524658 + }, + { + "auxiliary_loss_clip": 0.06421353, + "auxiliary_loss_mlp": 0.01266821, + "balance_loss_clip": 0.06276563, + "balance_loss_mlp": 0.01255693, + "epoch": 0.5594468660754547, + "flos": 12938028583680.0, + "grad_norm": 2.1417504305353563, + "language_loss": 0.78262866, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.85951042, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11132812, + "step": 9305, + "time_per_iteration": 2.5058751106262207 + }, + { + "auxiliary_loss_clip": 0.06419225, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06278518, + "balance_loss_mlp": 0.01253341, + "epoch": 0.5595069893281227, + "flos": 19068768743040.0, + "grad_norm": 1.6214418695958237, + "language_loss": 0.69748855, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7743212, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10705566, + "step": 9306, + "time_per_iteration": 2.5216495990753174 + }, + { + "auxiliary_loss_clip": 0.06329086, + "auxiliary_loss_mlp": 0.01251264, + "balance_loss_clip": 0.06266434, + "balance_loss_mlp": 0.01249626, + "epoch": 0.5595671125807906, + "flos": 70291530437760.0, + "grad_norm": 0.8883282828550626, + "language_loss": 0.60321748, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.679021, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.0164032, + "step": 9307, + "time_per_iteration": 3.2440812587738037 + }, + { + "auxiliary_loss_clip": 0.06421244, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06278248, + "balance_loss_mlp": 0.01257013, + "epoch": 0.5596272358334586, + "flos": 20671407060480.0, + "grad_norm": 1.5654652346016935, + "language_loss": 0.7418704, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.81875765, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10467529, + "step": 9308, + "time_per_iteration": 2.527722120285034 + }, + { + "auxiliary_loss_clip": 0.06423165, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06275736, + "balance_loss_mlp": 0.01253371, + "epoch": 0.5596873590861265, + "flos": 25047170979840.0, + "grad_norm": 1.7977154981427412, + "language_loss": 0.70390081, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.78078693, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12072754, + "step": 9309, + "time_per_iteration": 2.5592753887176514 + }, + { + "auxiliary_loss_clip": 0.06425751, + "auxiliary_loss_mlp": 0.01268716, + "balance_loss_clip": 0.06278521, + "balance_loss_mlp": 0.01255889, + "epoch": 0.5597474823387946, + "flos": 25965573655680.0, + "grad_norm": 1.826608872454741, + "language_loss": 0.7546587, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.83160329, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12835693, + "step": 9310, + "time_per_iteration": 2.5775809288024902 + }, + { + "auxiliary_loss_clip": 0.06428897, + "auxiliary_loss_mlp": 0.01266019, + "balance_loss_clip": 0.06280525, + "balance_loss_mlp": 0.0125343, + "epoch": 0.5598076055914625, + "flos": 26184688882560.0, + "grad_norm": 2.287225356977705, + "language_loss": 0.70149207, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.77844125, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12579346, + "step": 9311, + "time_per_iteration": 3.9833383560180664 + }, + { + "auxiliary_loss_clip": 0.06422099, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06277782, + "balance_loss_mlp": 0.01254061, + "epoch": 0.5598677288441305, + "flos": 11660541235200.0, + "grad_norm": 2.2749325214124605, + "language_loss": 0.72917002, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.80604798, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11645508, + "step": 9312, + "time_per_iteration": 2.5323050022125244 + }, + { + "auxiliary_loss_clip": 0.06420854, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06276432, + "balance_loss_mlp": 0.01255772, + "epoch": 0.5599278520967984, + "flos": 22973290899840.0, + "grad_norm": 1.8427769518341257, + "language_loss": 0.89498973, + "learning_rate": 1.709904360003822e-06, + "loss": 0.97187102, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1151123, + "step": 9313, + "time_per_iteration": 2.5141191482543945 + }, + { + "auxiliary_loss_clip": 0.06423395, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 0.06279235, + "balance_loss_mlp": 0.01256804, + "epoch": 0.5599879753494664, + "flos": 21222004239360.0, + "grad_norm": 1.3323867384007686, + "language_loss": 0.7802453, + "learning_rate": 1.709519022520204e-06, + "loss": 0.85715961, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11242676, + "step": 9314, + "time_per_iteration": 2.587451934814453 + }, + { + "auxiliary_loss_clip": 0.06420899, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06276683, + "balance_loss_mlp": 0.01254109, + "epoch": 0.5600480986021343, + "flos": 31911006510720.0, + "grad_norm": 1.5829567025911722, + "language_loss": 0.70587456, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.78273547, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11083984, + "step": 9315, + "time_per_iteration": 2.585667371749878 + }, + { + "auxiliary_loss_clip": 0.06425041, + "auxiliary_loss_mlp": 0.01268206, + "balance_loss_clip": 0.06275864, + "balance_loss_mlp": 0.01256571, + "epoch": 0.5601082218548024, + "flos": 28483679756160.0, + "grad_norm": 1.7585144874491871, + "language_loss": 0.67066777, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.7476002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11645508, + "step": 9316, + "time_per_iteration": 2.5536792278289795 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.06276462, + "balance_loss_mlp": 0.01253324, + "epoch": 0.5601683451074703, + "flos": 24103974695040.0, + "grad_norm": 1.9270955506174936, + "language_loss": 0.87415564, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.95101541, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11236572, + "step": 9317, + "time_per_iteration": 2.6297550201416016 + }, + { + "auxiliary_loss_clip": 0.06425779, + "auxiliary_loss_mlp": 0.01267741, + "balance_loss_clip": 0.06277692, + "balance_loss_mlp": 0.01255122, + "epoch": 0.5602284683601383, + "flos": 26362868590080.0, + "grad_norm": 1.81541721599753, + "language_loss": 0.77282947, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.84976465, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1260376, + "step": 9318, + "time_per_iteration": 2.558359146118164 + }, + { + "auxiliary_loss_clip": 0.06418364, + "auxiliary_loss_mlp": 0.01266654, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.01256301, + "epoch": 0.5602885916128063, + "flos": 24502904784000.0, + "grad_norm": 1.570238706906967, + "language_loss": 0.76465648, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.84150666, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10357666, + "step": 9319, + "time_per_iteration": 2.526543617248535 + }, + { + "auxiliary_loss_clip": 0.06418289, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06273629, + "balance_loss_mlp": 0.01253427, + "epoch": 0.5603487148654742, + "flos": 27352450909440.0, + "grad_norm": 1.3333617188310043, + "language_loss": 0.85846102, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.93529117, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11309814, + "step": 9320, + "time_per_iteration": 2.5673651695251465 + }, + { + "auxiliary_loss_clip": 0.06334086, + "auxiliary_loss_mlp": 0.01252081, + "balance_loss_clip": 0.06272272, + "balance_loss_mlp": 0.01250187, + "epoch": 0.5604088381181422, + "flos": 54105555962880.0, + "grad_norm": 0.7541324814402665, + "language_loss": 0.52607638, + "learning_rate": 1.706821969374996e-06, + "loss": 0.60193801, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01890564, + "step": 9321, + "time_per_iteration": 2.977881908416748 + }, + { + "auxiliary_loss_clip": 0.06418586, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06276635, + "balance_loss_mlp": 0.01254208, + "epoch": 0.5604689613708101, + "flos": 22242878858880.0, + "grad_norm": 1.3667787345793438, + "language_loss": 0.7480129, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.82485151, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1105957, + "step": 9322, + "time_per_iteration": 2.532274007797241 + }, + { + "auxiliary_loss_clip": 0.06422681, + "auxiliary_loss_mlp": 0.01271383, + "balance_loss_clip": 0.06276275, + "balance_loss_mlp": 0.01258842, + "epoch": 0.5605290846234782, + "flos": 35306370132480.0, + "grad_norm": 1.7253794934771503, + "language_loss": 0.73680359, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.81374425, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12542725, + "step": 9323, + "time_per_iteration": 2.6399970054626465 + }, + { + "auxiliary_loss_clip": 0.06425279, + "auxiliary_loss_mlp": 0.01266665, + "balance_loss_clip": 0.06275266, + "balance_loss_mlp": 0.01254505, + "epoch": 0.5605892078761461, + "flos": 20268997027200.0, + "grad_norm": 1.5398366577575928, + "language_loss": 0.62584162, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.70276111, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12164307, + "step": 9324, + "time_per_iteration": 2.5179386138916016 + }, + { + "auxiliary_loss_clip": 0.06420085, + "auxiliary_loss_mlp": 0.01268132, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01255055, + "epoch": 0.5606493311288141, + "flos": 17313582867840.0, + "grad_norm": 2.467078298144656, + "language_loss": 0.88032669, + "learning_rate": 1.705281040409226e-06, + "loss": 0.95720887, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.13092041, + "step": 9325, + "time_per_iteration": 2.5009984970092773 + }, + { + "auxiliary_loss_clip": 0.06425651, + "auxiliary_loss_mlp": 0.01271739, + "balance_loss_clip": 0.0627806, + "balance_loss_mlp": 0.01259454, + "epoch": 0.560709454381482, + "flos": 21659438079360.0, + "grad_norm": 1.5802994463075606, + "language_loss": 0.74048662, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.81746054, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1229248, + "step": 9326, + "time_per_iteration": 2.53534197807312 + }, + { + "auxiliary_loss_clip": 0.06427591, + "auxiliary_loss_mlp": 0.0127498, + "balance_loss_clip": 0.06276761, + "balance_loss_mlp": 0.01262648, + "epoch": 0.56076957763415, + "flos": 20309639057280.0, + "grad_norm": 1.7151684776487535, + "language_loss": 0.79090071, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.86792642, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12329102, + "step": 9327, + "time_per_iteration": 2.505734920501709 + }, + { + "auxiliary_loss_clip": 0.06422938, + "auxiliary_loss_mlp": 0.01268373, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.01256201, + "epoch": 0.5608297008868179, + "flos": 25052873057280.0, + "grad_norm": 1.3540928387883675, + "language_loss": 0.7848016, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.86171472, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.12176514, + "step": 9328, + "time_per_iteration": 2.5479724407196045 + }, + { + "auxiliary_loss_clip": 0.06421052, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06277333, + "balance_loss_mlp": 0.01255023, + "epoch": 0.560889824139486, + "flos": 19873253393280.0, + "grad_norm": 1.4144017329991472, + "language_loss": 0.7383225, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.8151924, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10913086, + "step": 9329, + "time_per_iteration": 2.665193796157837 + }, + { + "auxiliary_loss_clip": 0.06430677, + "auxiliary_loss_mlp": 0.01265446, + "balance_loss_clip": 0.06278004, + "balance_loss_mlp": 0.01253269, + "epoch": 0.5609499473921539, + "flos": 22935961105920.0, + "grad_norm": 1.4811079467360542, + "language_loss": 0.83903289, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.91599417, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.12176514, + "step": 9330, + "time_per_iteration": 2.574812650680542 + }, + { + "auxiliary_loss_clip": 0.06343255, + "auxiliary_loss_mlp": 0.01254504, + "balance_loss_clip": 0.06281585, + "balance_loss_mlp": 0.01252853, + "epoch": 0.5610100706448219, + "flos": 53054479146240.0, + "grad_norm": 0.7010589280292991, + "language_loss": 0.57785869, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.65383625, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.01654053, + "step": 9331, + "time_per_iteration": 3.16204833984375 + }, + { + "auxiliary_loss_clip": 0.06429492, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06280065, + "balance_loss_mlp": 0.01254723, + "epoch": 0.5610701938974898, + "flos": 21841349293440.0, + "grad_norm": 1.62115536838187, + "language_loss": 0.81915009, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.89610904, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11682129, + "step": 9332, + "time_per_iteration": 2.503162145614624 + }, + { + "auxiliary_loss_clip": 0.06436246, + "auxiliary_loss_mlp": 0.01268376, + "balance_loss_clip": 0.06285603, + "balance_loss_mlp": 0.01255936, + "epoch": 0.5611303171501578, + "flos": 17462943970560.0, + "grad_norm": 2.4447262023658314, + "language_loss": 0.8238855, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.90093172, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.12451172, + "step": 9333, + "time_per_iteration": 2.5434911251068115 + }, + { + "auxiliary_loss_clip": 0.06429712, + "auxiliary_loss_mlp": 0.01266007, + "balance_loss_clip": 0.062811, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5611904404028258, + "flos": 22644366497280.0, + "grad_norm": 1.7517485290647843, + "language_loss": 0.73036361, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.80732077, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.11157227, + "step": 9334, + "time_per_iteration": 2.5099892616271973 + }, + { + "auxiliary_loss_clip": 0.06427494, + "auxiliary_loss_mlp": 0.01271173, + "balance_loss_clip": 0.06281948, + "balance_loss_mlp": 0.01259771, + "epoch": 0.5612505636554938, + "flos": 14321048549760.0, + "grad_norm": 1.6258746678295788, + "language_loss": 0.71251893, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.7895056, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11395264, + "step": 9335, + "time_per_iteration": 3.8910462856292725 + }, + { + "auxiliary_loss_clip": 0.06430685, + "auxiliary_loss_mlp": 0.01266094, + "balance_loss_clip": 0.06283418, + "balance_loss_mlp": 0.01254149, + "epoch": 0.5613106869081618, + "flos": 16513835973120.0, + "grad_norm": 1.6562270786725333, + "language_loss": 0.7703501, + "learning_rate": 1.701044410566205e-06, + "loss": 0.84731793, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11932373, + "step": 9336, + "time_per_iteration": 2.5473687648773193 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06282386, + "balance_loss_mlp": 0.0125489, + "epoch": 0.5613708101608297, + "flos": 24065009746560.0, + "grad_norm": 2.1630350478443625, + "language_loss": 0.64571506, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.72262907, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10766602, + "step": 9337, + "time_per_iteration": 2.5193097591400146 + }, + { + "auxiliary_loss_clip": 0.06341661, + "auxiliary_loss_mlp": 0.01252845, + "balance_loss_clip": 0.06279477, + "balance_loss_mlp": 0.01251057, + "epoch": 0.5614309334134977, + "flos": 64922284984320.0, + "grad_norm": 0.883081868959654, + "language_loss": 0.62614578, + "learning_rate": 1.700274261035102e-06, + "loss": 0.7020908, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.62353516, + "router_z_loss_mlp": 0.01785278, + "step": 9338, + "time_per_iteration": 3.115088939666748 + }, + { + "auxiliary_loss_clip": 0.06430536, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0628281, + "balance_loss_mlp": 0.01254428, + "epoch": 0.5614910566661656, + "flos": 32926975666560.0, + "grad_norm": 1.7643724476932883, + "language_loss": 0.66069186, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.73765635, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11474609, + "step": 9339, + "time_per_iteration": 4.156280040740967 + }, + { + "auxiliary_loss_clip": 0.06427112, + "auxiliary_loss_mlp": 0.01266835, + "balance_loss_clip": 0.06283177, + "balance_loss_mlp": 0.01254055, + "epoch": 0.5615511799188336, + "flos": 18594927504000.0, + "grad_norm": 1.6693116386089952, + "language_loss": 0.69893128, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.77587074, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.12774658, + "step": 9340, + "time_per_iteration": 2.4951670169830322 + }, + { + "auxiliary_loss_clip": 0.06425936, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06286716, + "balance_loss_mlp": 0.01257168, + "epoch": 0.5616113031715015, + "flos": 22826571200640.0, + "grad_norm": 1.554264314492227, + "language_loss": 0.77897537, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.85592192, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 9341, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06432091, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06284195, + "balance_loss_mlp": 0.01256776, + "epoch": 0.5616714264241696, + "flos": 22352184910080.0, + "grad_norm": 1.797407374183417, + "language_loss": 0.80132401, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.87833536, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12261963, + "step": 9342, + "time_per_iteration": 2.5441479682922363 + }, + { + "auxiliary_loss_clip": 0.06439396, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06290646, + "balance_loss_mlp": 0.01257325, + "epoch": 0.5617315496768375, + "flos": 18813875022720.0, + "grad_norm": 2.3951377685236346, + "language_loss": 0.75757158, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.83465594, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1171875, + "step": 9343, + "time_per_iteration": 2.552783489227295 + }, + { + "auxiliary_loss_clip": 0.06435137, + "auxiliary_loss_mlp": 0.01268416, + "balance_loss_clip": 0.06290908, + "balance_loss_mlp": 0.0125656, + "epoch": 0.5617916729295055, + "flos": 18375225298560.0, + "grad_norm": 1.7365132961619254, + "language_loss": 0.69429743, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.77133292, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11853027, + "step": 9344, + "time_per_iteration": 3.940319061279297 + }, + { + "auxiliary_loss_clip": 0.06436205, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.06290596, + "balance_loss_mlp": 0.01254048, + "epoch": 0.5618517961821734, + "flos": 28186844267520.0, + "grad_norm": 2.084209166838754, + "language_loss": 0.66667032, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.74368846, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11560059, + "step": 9345, + "time_per_iteration": 2.5695786476135254 + }, + { + "auxiliary_loss_clip": 0.06434141, + "auxiliary_loss_mlp": 0.01269002, + "balance_loss_clip": 0.06287882, + "balance_loss_mlp": 0.01257683, + "epoch": 0.5619119194348414, + "flos": 15492290520960.0, + "grad_norm": 1.7418235878832828, + "language_loss": 0.88078266, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.9578141, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11328125, + "step": 9346, + "time_per_iteration": 2.470212697982788 + }, + { + "auxiliary_loss_clip": 0.06433322, + "auxiliary_loss_mlp": 0.01273387, + "balance_loss_clip": 0.06289656, + "balance_loss_mlp": 0.01261257, + "epoch": 0.5619720426875094, + "flos": 29135700702720.0, + "grad_norm": 2.0124429779516335, + "language_loss": 0.5980221, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.67508924, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.12139893, + "step": 9347, + "time_per_iteration": 2.5825982093811035 + }, + { + "auxiliary_loss_clip": 0.06435403, + "auxiliary_loss_mlp": 0.01270938, + "balance_loss_clip": 0.06288013, + "balance_loss_mlp": 0.01258349, + "epoch": 0.5620321659401774, + "flos": 18009474226560.0, + "grad_norm": 2.2126455504112066, + "language_loss": 0.69822383, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.77528727, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12609863, + "step": 9348, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.0644159, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06289469, + "balance_loss_mlp": 0.01254037, + "epoch": 0.5620922891928454, + "flos": 20600730541440.0, + "grad_norm": 3.445873194626742, + "language_loss": 0.79441649, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.87149316, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.12036133, + "step": 9349, + "time_per_iteration": 2.5519816875457764 + }, + { + "auxiliary_loss_clip": 0.06431362, + "auxiliary_loss_mlp": 0.01269513, + "balance_loss_clip": 0.06285249, + "balance_loss_mlp": 0.01257014, + "epoch": 0.5621524124455133, + "flos": 26294288423040.0, + "grad_norm": 2.015932955485816, + "language_loss": 0.67743355, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.75444239, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.12493896, + "step": 9350, + "time_per_iteration": 4.01330304145813 + }, + { + "auxiliary_loss_clip": 0.06434298, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06285301, + "balance_loss_mlp": 0.01256281, + "epoch": 0.5622125356981813, + "flos": 12755236901760.0, + "grad_norm": 2.011118504157059, + "language_loss": 0.78970456, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.86672854, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11834717, + "step": 9351, + "time_per_iteration": 2.502434015274048 + }, + { + "auxiliary_loss_clip": 0.06430681, + "auxiliary_loss_mlp": 0.01265572, + "balance_loss_clip": 0.06279105, + "balance_loss_mlp": 0.01252894, + "epoch": 0.5622726589508492, + "flos": 23812086597120.0, + "grad_norm": 1.4860121982116354, + "language_loss": 0.59339732, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.67035985, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12677002, + "step": 9352, + "time_per_iteration": 2.5574684143066406 + }, + { + "auxiliary_loss_clip": 0.06420172, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.062802, + "balance_loss_mlp": 0.01255041, + "epoch": 0.5623327822035172, + "flos": 24725248392960.0, + "grad_norm": 2.450009031651053, + "language_loss": 0.72177416, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.7986325, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 9353, + "time_per_iteration": 2.5429112911224365 + }, + { + "auxiliary_loss_clip": 0.06427602, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06280185, + "balance_loss_mlp": 0.01255207, + "epoch": 0.5623929054561851, + "flos": 14023081031040.0, + "grad_norm": 3.091375667054191, + "language_loss": 0.7687071, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.84564734, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11212158, + "step": 9354, + "time_per_iteration": 2.511843204498291 + }, + { + "auxiliary_loss_clip": 0.0643307, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.0628096, + "balance_loss_mlp": 0.01256672, + "epoch": 0.5624530287088532, + "flos": 20710707425280.0, + "grad_norm": 1.9243574999426976, + "language_loss": 0.72663665, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.80364901, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1149292, + "step": 9355, + "time_per_iteration": 2.5472323894500732 + }, + { + "auxiliary_loss_clip": 0.06422609, + "auxiliary_loss_mlp": 0.01264166, + "balance_loss_clip": 0.06276853, + "balance_loss_mlp": 0.01252638, + "epoch": 0.5625131519615211, + "flos": 21477401084160.0, + "grad_norm": 1.4661709593952188, + "language_loss": 0.73949313, + "learning_rate": 1.693344975084274e-06, + "loss": 0.81636083, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11535645, + "step": 9356, + "time_per_iteration": 2.5417375564575195 + }, + { + "auxiliary_loss_clip": 0.06421204, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.0627971, + "balance_loss_mlp": 0.01254043, + "epoch": 0.5625732752141891, + "flos": 18704023920000.0, + "grad_norm": 1.8811670281572186, + "language_loss": 0.83384252, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.9107098, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11480713, + "step": 9357, + "time_per_iteration": 2.4678521156311035 + }, + { + "auxiliary_loss_clip": 0.06426045, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06279635, + "balance_loss_mlp": 0.01255705, + "epoch": 0.562633398466857, + "flos": 16222492926720.0, + "grad_norm": 2.0645024289256293, + "language_loss": 0.7263062, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.80322981, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1060791, + "step": 9358, + "time_per_iteration": 2.5186126232147217 + }, + { + "auxiliary_loss_clip": 0.06416523, + "auxiliary_loss_mlp": 0.01266054, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.0125408, + "epoch": 0.562693521719525, + "flos": 22498485338880.0, + "grad_norm": 1.808809546066597, + "language_loss": 0.78313565, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.85996139, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11981201, + "step": 9359, + "time_per_iteration": 2.4950146675109863 + }, + { + "auxiliary_loss_clip": 0.06422278, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06277263, + "balance_loss_mlp": 0.01254123, + "epoch": 0.562753644972193, + "flos": 25337088506880.0, + "grad_norm": 1.6393117198147682, + "language_loss": 0.70198202, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.77886516, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11920166, + "step": 9360, + "time_per_iteration": 2.5677337646484375 + }, + { + "auxiliary_loss_clip": 0.06333196, + "auxiliary_loss_mlp": 0.01259618, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01258209, + "epoch": 0.562813768224861, + "flos": 67410566231040.0, + "grad_norm": 0.7608015706194778, + "language_loss": 0.55599511, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.63192326, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.0140686, + "step": 9361, + "time_per_iteration": 3.047746419906616 + }, + { + "auxiliary_loss_clip": 0.06421309, + "auxiliary_loss_mlp": 0.01270958, + "balance_loss_clip": 0.06278641, + "balance_loss_mlp": 0.01260271, + "epoch": 0.562873891477529, + "flos": 23337868014720.0, + "grad_norm": 1.4415772957289732, + "language_loss": 0.82031697, + "learning_rate": 1.691036046141018e-06, + "loss": 0.89723963, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10687256, + "step": 9362, + "time_per_iteration": 2.5085341930389404 + }, + { + "auxiliary_loss_clip": 0.06425183, + "auxiliary_loss_mlp": 0.01265052, + "balance_loss_clip": 0.06282046, + "balance_loss_mlp": 0.01254067, + "epoch": 0.5629340147301969, + "flos": 38482073475840.0, + "grad_norm": 1.5514506959778531, + "language_loss": 0.74991751, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.8268199, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10992432, + "step": 9363, + "time_per_iteration": 2.6483652591705322 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01269517, + "balance_loss_clip": 0.06280389, + "balance_loss_mlp": 0.01257573, + "epoch": 0.5629941379828649, + "flos": 29249744509440.0, + "grad_norm": 1.527132274705304, + "language_loss": 0.82966727, + "learning_rate": 1.690266496731839e-06, + "loss": 0.90664279, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11962891, + "step": 9364, + "time_per_iteration": 2.585028648376465 + }, + { + "auxiliary_loss_clip": 0.06420554, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06281281, + "balance_loss_mlp": 0.01253207, + "epoch": 0.5630542612355328, + "flos": 19425882844800.0, + "grad_norm": 1.9441356766600106, + "language_loss": 0.65449685, + "learning_rate": 1.689881739637642e-06, + "loss": 0.7313447, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11022949, + "step": 9365, + "time_per_iteration": 2.5320210456848145 + }, + { + "auxiliary_loss_clip": 0.06432588, + "auxiliary_loss_mlp": 0.01270623, + "balance_loss_clip": 0.06279749, + "balance_loss_mlp": 0.0125841, + "epoch": 0.5631143844882008, + "flos": 22271697463680.0, + "grad_norm": 2.4081978900655114, + "language_loss": 0.81779563, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.89482784, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.12213135, + "step": 9366, + "time_per_iteration": 2.5602293014526367 + }, + { + "auxiliary_loss_clip": 0.06419416, + "auxiliary_loss_mlp": 0.01263434, + "balance_loss_clip": 0.06277686, + "balance_loss_mlp": 0.01253033, + "epoch": 0.5631745077408687, + "flos": 22971781526400.0, + "grad_norm": 1.4555155937951827, + "language_loss": 0.73903221, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.81586075, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10400391, + "step": 9367, + "time_per_iteration": 2.5222184658050537 + }, + { + "auxiliary_loss_clip": 0.0633425, + "auxiliary_loss_mlp": 0.01256933, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01255295, + "epoch": 0.5632346309935368, + "flos": 65101917409920.0, + "grad_norm": 0.6175920076853201, + "language_loss": 0.5334087, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.60932058, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.61669922, + "router_z_loss_mlp": 0.0164032, + "step": 9368, + "time_per_iteration": 3.3093104362487793 + }, + { + "auxiliary_loss_clip": 0.06421301, + "auxiliary_loss_mlp": 0.0127307, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01261757, + "epoch": 0.5632947542462047, + "flos": 23009572517760.0, + "grad_norm": 1.6075197920052449, + "language_loss": 0.69183493, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.76877862, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11315918, + "step": 9369, + "time_per_iteration": 2.5406625270843506 + }, + { + "auxiliary_loss_clip": 0.06420332, + "auxiliary_loss_mlp": 0.01269293, + "balance_loss_clip": 0.06275883, + "balance_loss_mlp": 0.01258105, + "epoch": 0.5633548774988727, + "flos": 30490530969600.0, + "grad_norm": 1.6779781841725052, + "language_loss": 0.76048809, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.83738434, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11175537, + "step": 9370, + "time_per_iteration": 2.591212272644043 + }, + { + "auxiliary_loss_clip": 0.06424968, + "auxiliary_loss_mlp": 0.01266151, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01253908, + "epoch": 0.5634150007515406, + "flos": 18520938748800.0, + "grad_norm": 1.8374331787518619, + "language_loss": 0.76029092, + "learning_rate": 1.687573444537108e-06, + "loss": 0.83720207, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.12237549, + "step": 9371, + "time_per_iteration": 2.5327818393707275 + }, + { + "auxiliary_loss_clip": 0.06421979, + "auxiliary_loss_mlp": 0.01268189, + "balance_loss_clip": 0.06277596, + "balance_loss_mlp": 0.01256739, + "epoch": 0.5634751240042086, + "flos": 19250679957120.0, + "grad_norm": 1.7360135917661768, + "language_loss": 0.762514, + "learning_rate": 1.687188770067285e-06, + "loss": 0.83941567, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11456299, + "step": 9372, + "time_per_iteration": 2.519404411315918 + }, + { + "auxiliary_loss_clip": 0.06422761, + "auxiliary_loss_mlp": 0.01266353, + "balance_loss_clip": 0.06280088, + "balance_loss_mlp": 0.01255016, + "epoch": 0.5635352472568766, + "flos": 12025453766400.0, + "grad_norm": 1.884768041604824, + "language_loss": 0.71853095, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.79542208, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11334229, + "step": 9373, + "time_per_iteration": 2.5053837299346924 + }, + { + "auxiliary_loss_clip": 0.06422034, + "auxiliary_loss_mlp": 0.01268801, + "balance_loss_clip": 0.06277832, + "balance_loss_mlp": 0.01256367, + "epoch": 0.5635953705095446, + "flos": 21878092108800.0, + "grad_norm": 1.841933865019323, + "language_loss": 0.83263683, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.90954518, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.12438965, + "step": 9374, + "time_per_iteration": 3.904900074005127 + }, + { + "auxiliary_loss_clip": 0.06420377, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06277412, + "balance_loss_mlp": 0.01256131, + "epoch": 0.5636554937622126, + "flos": 27133587244800.0, + "grad_norm": 2.5670866003984583, + "language_loss": 0.66696084, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.74383336, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10754395, + "step": 9375, + "time_per_iteration": 2.581921339035034 + }, + { + "auxiliary_loss_clip": 0.06426428, + "auxiliary_loss_mlp": 0.01265809, + "balance_loss_clip": 0.06279501, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5637156170148805, + "flos": 12930314008320.0, + "grad_norm": 12.279905367602915, + "language_loss": 0.81403673, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.89095908, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11474609, + "step": 9376, + "time_per_iteration": 2.5271008014678955 + }, + { + "auxiliary_loss_clip": 0.06430367, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06280433, + "balance_loss_mlp": 0.01253974, + "epoch": 0.5637757402675485, + "flos": 45561460435200.0, + "grad_norm": 1.3765625381603785, + "language_loss": 0.69569075, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.77264911, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1149292, + "step": 9377, + "time_per_iteration": 2.7878713607788086 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06278635, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5638358635202164, + "flos": 20892241296000.0, + "grad_norm": 1.4815499035204616, + "language_loss": 0.75006419, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.82690734, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10668945, + "step": 9378, + "time_per_iteration": 2.5742552280426025 + }, + { + "auxiliary_loss_clip": 0.06432593, + "auxiliary_loss_mlp": 0.01271419, + "balance_loss_clip": 0.06279133, + "balance_loss_mlp": 0.01258837, + "epoch": 0.5638959867728844, + "flos": 18812449503360.0, + "grad_norm": 2.3058329321149555, + "language_loss": 0.81874716, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.8957873, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.12585449, + "step": 9379, + "time_per_iteration": 3.9022350311279297 + }, + { + "auxiliary_loss_clip": 0.06428088, + "auxiliary_loss_mlp": 0.012688, + "balance_loss_clip": 0.06281307, + "balance_loss_mlp": 0.01256933, + "epoch": 0.5639561100255523, + "flos": 27497703162240.0, + "grad_norm": 1.9515300720121755, + "language_loss": 0.71783185, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.79480064, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11859131, + "step": 9380, + "time_per_iteration": 2.6338086128234863 + }, + { + "auxiliary_loss_clip": 0.0642691, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06279925, + "balance_loss_mlp": 0.01254857, + "epoch": 0.5640162332782204, + "flos": 18082289024640.0, + "grad_norm": 2.0751114915079687, + "language_loss": 0.75207865, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.82901412, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11779785, + "step": 9381, + "time_per_iteration": 2.4637959003448486 + }, + { + "auxiliary_loss_clip": 0.06430316, + "auxiliary_loss_mlp": 0.01273879, + "balance_loss_clip": 0.06282466, + "balance_loss_mlp": 0.01262822, + "epoch": 0.5640763565308883, + "flos": 20890857703680.0, + "grad_norm": 2.2840815632275846, + "language_loss": 0.72823429, + "learning_rate": 1.683342680176499e-06, + "loss": 0.80527627, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11053467, + "step": 9382, + "time_per_iteration": 2.6038217544555664 + }, + { + "auxiliary_loss_clip": 0.0632898, + "auxiliary_loss_mlp": 0.01252773, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01251134, + "epoch": 0.5641364797835563, + "flos": 64467143205120.0, + "grad_norm": 0.7593633930380659, + "language_loss": 0.54457784, + "learning_rate": 1.682958136989022e-06, + "loss": 0.62039542, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01641846, + "step": 9383, + "time_per_iteration": 4.702574253082275 + }, + { + "auxiliary_loss_clip": 0.06430694, + "auxiliary_loss_mlp": 0.01271925, + "balance_loss_clip": 0.06278884, + "balance_loss_mlp": 0.01260129, + "epoch": 0.5641966030362242, + "flos": 18666861834240.0, + "grad_norm": 1.6723183303987958, + "language_loss": 0.71441197, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.79143822, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.11798096, + "step": 9384, + "time_per_iteration": 2.4753105640411377 + }, + { + "auxiliary_loss_clip": 0.06421386, + "auxiliary_loss_mlp": 0.01266582, + "balance_loss_clip": 0.0627472, + "balance_loss_mlp": 0.01254626, + "epoch": 0.5642567262888922, + "flos": 22498946536320.0, + "grad_norm": 1.9187169203117838, + "language_loss": 0.76415217, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.84103185, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1194458, + "step": 9385, + "time_per_iteration": 2.5245208740234375 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01255028, + "epoch": 0.5643168495415603, + "flos": 13008663175680.0, + "grad_norm": 1.914249541829808, + "language_loss": 0.82386243, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.90069962, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10748291, + "step": 9386, + "time_per_iteration": 2.4669172763824463 + }, + { + "auxiliary_loss_clip": 0.06427868, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277144, + "balance_loss_mlp": 0.01255014, + "epoch": 0.5643769727942282, + "flos": 18594256671360.0, + "grad_norm": 1.9656567849197715, + "language_loss": 0.70471108, + "learning_rate": 1.681420084607516e-06, + "loss": 0.78165275, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.112854, + "step": 9387, + "time_per_iteration": 2.5076122283935547 + }, + { + "auxiliary_loss_clip": 0.0642679, + "auxiliary_loss_mlp": 0.01267525, + "balance_loss_clip": 0.06276885, + "balance_loss_mlp": 0.01255348, + "epoch": 0.5644370960468962, + "flos": 33815343853440.0, + "grad_norm": 1.4623673546412521, + "language_loss": 0.75064629, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.82758939, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.12176514, + "step": 9388, + "time_per_iteration": 2.651616096496582 + }, + { + "auxiliary_loss_clip": 0.06417996, + "auxiliary_loss_mlp": 0.01267245, + "balance_loss_clip": 0.06276226, + "balance_loss_mlp": 0.01256892, + "epoch": 0.5644972192995641, + "flos": 21221249552640.0, + "grad_norm": 1.4874039445981817, + "language_loss": 0.82212514, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.89897752, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10357666, + "step": 9389, + "time_per_iteration": 2.5609359741210938 + }, + { + "auxiliary_loss_clip": 0.06426319, + "auxiliary_loss_mlp": 0.01270818, + "balance_loss_clip": 0.06278206, + "balance_loss_mlp": 0.01258468, + "epoch": 0.5645573425522321, + "flos": 18593585838720.0, + "grad_norm": 2.1560569688057036, + "language_loss": 0.64486635, + "learning_rate": 1.680266672116467e-06, + "loss": 0.72183776, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12329102, + "step": 9390, + "time_per_iteration": 3.8905534744262695 + }, + { + "auxiliary_loss_clip": 0.06417844, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255334, + "epoch": 0.5646174658049, + "flos": 18119660745600.0, + "grad_norm": 1.743379462466535, + "language_loss": 0.92393249, + "learning_rate": 1.6798822255153192e-06, + "loss": 1.00077093, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10662842, + "step": 9391, + "time_per_iteration": 2.4846012592315674 + }, + { + "auxiliary_loss_clip": 0.06426747, + "auxiliary_loss_mlp": 0.01269795, + "balance_loss_clip": 0.06274952, + "balance_loss_mlp": 0.0125751, + "epoch": 0.564677589057568, + "flos": 28337547035520.0, + "grad_norm": 2.079245602273352, + "language_loss": 0.60616773, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.68313313, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12286377, + "step": 9392, + "time_per_iteration": 2.5709118843078613 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01266956, + "balance_loss_clip": 0.06274032, + "balance_loss_mlp": 0.01255619, + "epoch": 0.564737712310236, + "flos": 22170273696000.0, + "grad_norm": 2.32400153493691, + "language_loss": 0.81762815, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8944844, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11334229, + "step": 9393, + "time_per_iteration": 2.49820613861084 + }, + { + "auxiliary_loss_clip": 0.06420048, + "auxiliary_loss_mlp": 0.01268955, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01257434, + "epoch": 0.564797835562904, + "flos": 20965223802240.0, + "grad_norm": 1.8189771095125196, + "language_loss": 0.87738705, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.95427704, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11523438, + "step": 9394, + "time_per_iteration": 2.5385193824768066 + }, + { + "auxiliary_loss_clip": 0.06421189, + "auxiliary_loss_mlp": 0.01271733, + "balance_loss_clip": 0.06278495, + "balance_loss_mlp": 0.01261135, + "epoch": 0.5648579588155719, + "flos": 17425991520000.0, + "grad_norm": 1.7000053900358165, + "language_loss": 0.84579873, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.92272794, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1060791, + "step": 9395, + "time_per_iteration": 2.470017433166504 + }, + { + "auxiliary_loss_clip": 0.06326792, + "auxiliary_loss_mlp": 0.01253109, + "balance_loss_clip": 0.06265698, + "balance_loss_mlp": 0.01251535, + "epoch": 0.5649180820682399, + "flos": 69951187152000.0, + "grad_norm": 0.7657809500788333, + "language_loss": 0.57918489, + "learning_rate": 1.677960174884597e-06, + "loss": 0.65498388, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.61132812, + "router_z_loss_mlp": 0.01573944, + "step": 9396, + "time_per_iteration": 3.1468727588653564 + }, + { + "auxiliary_loss_clip": 0.06423569, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01256205, + "epoch": 0.5649782053209078, + "flos": 24980058259200.0, + "grad_norm": 1.9294071175656426, + "language_loss": 0.70135093, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.77826023, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11157227, + "step": 9397, + "time_per_iteration": 2.5551769733428955 + }, + { + "auxiliary_loss_clip": 0.06421924, + "auxiliary_loss_mlp": 0.01267113, + "balance_loss_clip": 0.06274733, + "balance_loss_mlp": 0.01256277, + "epoch": 0.5650383285735758, + "flos": 21733175272320.0, + "grad_norm": 3.1535749018048094, + "language_loss": 0.67165595, + "learning_rate": 1.67719144001275e-06, + "loss": 0.74854636, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.10839844, + "step": 9398, + "time_per_iteration": 2.5690701007843018 + }, + { + "auxiliary_loss_clip": 0.06324084, + "auxiliary_loss_mlp": 0.01251867, + "balance_loss_clip": 0.06263297, + "balance_loss_mlp": 0.01250375, + "epoch": 0.5650984518262439, + "flos": 65923481093760.0, + "grad_norm": 0.7518933539640298, + "language_loss": 0.58143103, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.65719062, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01491547, + "step": 9399, + "time_per_iteration": 3.073493719100952 + }, + { + "auxiliary_loss_clip": 0.06425194, + "auxiliary_loss_mlp": 0.01269138, + "balance_loss_clip": 0.06275368, + "balance_loss_mlp": 0.01257158, + "epoch": 0.5651585750789118, + "flos": 21038919068160.0, + "grad_norm": 2.9284187471842213, + "language_loss": 0.73483676, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.8117801, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.11987305, + "step": 9400, + "time_per_iteration": 2.5129287242889404 + }, + { + "auxiliary_loss_clip": 0.06431332, + "auxiliary_loss_mlp": 0.01270587, + "balance_loss_clip": 0.06281202, + "balance_loss_mlp": 0.01258267, + "epoch": 0.5652186983315798, + "flos": 18557891199360.0, + "grad_norm": 1.781312568353633, + "language_loss": 0.61062682, + "learning_rate": 1.676038429548412e-06, + "loss": 0.68764603, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12322998, + "step": 9401, + "time_per_iteration": 2.484562397003174 + }, + { + "auxiliary_loss_clip": 0.06419288, + "auxiliary_loss_mlp": 0.01272594, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01261859, + "epoch": 0.5652788215842477, + "flos": 18484573276800.0, + "grad_norm": 1.8682667341725439, + "language_loss": 0.81175613, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.88867497, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10736084, + "step": 9402, + "time_per_iteration": 2.5402467250823975 + }, + { + "auxiliary_loss_clip": 0.0641814, + "auxiliary_loss_mlp": 0.01269888, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.0125898, + "epoch": 0.5653389448369157, + "flos": 30051797391360.0, + "grad_norm": 1.3435358668606565, + "language_loss": 0.77710259, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.85398287, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10900879, + "step": 9403, + "time_per_iteration": 2.5728204250335693 + }, + { + "auxiliary_loss_clip": 0.06421928, + "auxiliary_loss_mlp": 0.01268633, + "balance_loss_clip": 0.06276687, + "balance_loss_mlp": 0.01257458, + "epoch": 0.5653990680895836, + "flos": 16733202762240.0, + "grad_norm": 1.6255859835861872, + "language_loss": 0.69364876, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.7705543, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11187744, + "step": 9404, + "time_per_iteration": 2.5076894760131836 + }, + { + "auxiliary_loss_clip": 0.06414986, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01258156, + "epoch": 0.5654591913422516, + "flos": 14543517939840.0, + "grad_norm": 1.937007916536723, + "language_loss": 0.6753332, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.75217164, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 9405, + "time_per_iteration": 2.4678986072540283 + }, + { + "auxiliary_loss_clip": 0.06417301, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.0627932, + "balance_loss_mlp": 0.0125891, + "epoch": 0.5655193145949196, + "flos": 26216484307200.0, + "grad_norm": 1.7078210782531607, + "language_loss": 0.74488431, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.82174826, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10180664, + "step": 9406, + "time_per_iteration": 2.5344419479370117 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.01269998, + "balance_loss_clip": 0.06274547, + "balance_loss_mlp": 0.01258101, + "epoch": 0.5655794378475876, + "flos": 25053669671040.0, + "grad_norm": 1.6572482823915473, + "language_loss": 0.80165344, + "learning_rate": 1.673732740698882e-06, + "loss": 0.87858582, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11901855, + "step": 9407, + "time_per_iteration": 2.5318515300750732 + }, + { + "auxiliary_loss_clip": 0.06414818, + "auxiliary_loss_mlp": 0.01281674, + "balance_loss_clip": 0.06276679, + "balance_loss_mlp": 0.01270641, + "epoch": 0.5656395611002555, + "flos": 31041379710720.0, + "grad_norm": 1.3106223538314048, + "language_loss": 0.71445584, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.79142082, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1104126, + "step": 9408, + "time_per_iteration": 2.6315321922302246 + }, + { + "auxiliary_loss_clip": 0.06416275, + "auxiliary_loss_mlp": 0.01273077, + "balance_loss_clip": 0.06275165, + "balance_loss_mlp": 0.01262151, + "epoch": 0.5656996843529235, + "flos": 20235650302080.0, + "grad_norm": 1.8647463769564316, + "language_loss": 0.81496549, + "learning_rate": 1.672964276570308e-06, + "loss": 0.89185899, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.109375, + "step": 9409, + "time_per_iteration": 2.4874367713928223 + }, + { + "auxiliary_loss_clip": 0.06420213, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01261953, + "epoch": 0.5657598076055914, + "flos": 21002595523200.0, + "grad_norm": 1.5982364261864173, + "language_loss": 0.78488803, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.86182165, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11187744, + "step": 9410, + "time_per_iteration": 2.568018913269043 + }, + { + "auxiliary_loss_clip": 0.06420635, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06277133, + "balance_loss_mlp": 0.01258607, + "epoch": 0.5658199308582594, + "flos": 11550690132480.0, + "grad_norm": 1.9303419986806551, + "language_loss": 0.83679706, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.91369963, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11016846, + "step": 9411, + "time_per_iteration": 2.4616551399230957 + }, + { + "auxiliary_loss_clip": 0.06428169, + "auxiliary_loss_mlp": 0.01269272, + "balance_loss_clip": 0.06277955, + "balance_loss_mlp": 0.01256725, + "epoch": 0.5658800541109275, + "flos": 14177137962240.0, + "grad_norm": 2.370687982223235, + "language_loss": 0.67829227, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.75526661, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.12548828, + "step": 9412, + "time_per_iteration": 2.5216641426086426 + }, + { + "auxiliary_loss_clip": 0.06415425, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06277046, + "balance_loss_mlp": 0.01258488, + "epoch": 0.5659401773635954, + "flos": 27311934660480.0, + "grad_norm": 1.581889394574198, + "language_loss": 0.58742762, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.6642642, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09741211, + "step": 9413, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.06415551, + "auxiliary_loss_mlp": 0.01265095, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01254294, + "epoch": 0.5660003006162634, + "flos": 16733957448960.0, + "grad_norm": 2.47913455673049, + "language_loss": 0.69196904, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.76877546, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10791016, + "step": 9414, + "time_per_iteration": 3.924028158187866 + }, + { + "auxiliary_loss_clip": 0.0641676, + "auxiliary_loss_mlp": 0.01269168, + "balance_loss_clip": 0.06275219, + "balance_loss_mlp": 0.01258475, + "epoch": 0.5660604238689313, + "flos": 21659983130880.0, + "grad_norm": 1.6269222060357784, + "language_loss": 0.78177273, + "learning_rate": 1.670659182280247e-06, + "loss": 0.85863203, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10693359, + "step": 9415, + "time_per_iteration": 2.5426433086395264 + }, + { + "auxiliary_loss_clip": 0.06321331, + "auxiliary_loss_mlp": 0.01255911, + "balance_loss_clip": 0.06260875, + "balance_loss_mlp": 0.01254426, + "epoch": 0.5661205471215993, + "flos": 68843619884160.0, + "grad_norm": 0.6697066651048145, + "language_loss": 0.48973382, + "learning_rate": 1.670275043523822e-06, + "loss": 0.56550622, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0148468, + "step": 9416, + "time_per_iteration": 3.2625491619110107 + }, + { + "auxiliary_loss_clip": 0.06421995, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06277312, + "balance_loss_mlp": 0.01256416, + "epoch": 0.5661806703742672, + "flos": 28629393206400.0, + "grad_norm": 1.9136616805420137, + "language_loss": 0.63439846, + "learning_rate": 1.6698909172706e-06, + "loss": 0.7112996, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11706543, + "step": 9417, + "time_per_iteration": 2.5860400199890137 + }, + { + "auxiliary_loss_clip": 0.06423697, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01257419, + "epoch": 0.5662407936269352, + "flos": 21404418577920.0, + "grad_norm": 2.3766145169256485, + "language_loss": 0.6936692, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.77059871, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.1184082, + "step": 9418, + "time_per_iteration": 3.955557346343994 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06276925, + "balance_loss_mlp": 0.01255261, + "epoch": 0.5663009168796032, + "flos": 25666054836480.0, + "grad_norm": 1.7349550199621107, + "language_loss": 0.65210938, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.72899818, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.12219238, + "step": 9419, + "time_per_iteration": 2.5426688194274902 + }, + { + "auxiliary_loss_clip": 0.06328249, + "auxiliary_loss_mlp": 0.01252694, + "balance_loss_clip": 0.06267616, + "balance_loss_mlp": 0.01251344, + "epoch": 0.5663610401322712, + "flos": 67953014835840.0, + "grad_norm": 0.7058455662611458, + "language_loss": 0.59640646, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.67221588, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01351929, + "step": 9420, + "time_per_iteration": 3.2174880504608154 + }, + { + "auxiliary_loss_clip": 0.064177, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255235, + "epoch": 0.5664211633849391, + "flos": 24616487393280.0, + "grad_norm": 1.6106095517088517, + "language_loss": 0.74370563, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.82053804, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10308838, + "step": 9421, + "time_per_iteration": 2.5415146350860596 + }, + { + "auxiliary_loss_clip": 0.06425875, + "auxiliary_loss_mlp": 0.0127111, + "balance_loss_clip": 0.0627939, + "balance_loss_mlp": 0.0125941, + "epoch": 0.5664812866376071, + "flos": 11652407389440.0, + "grad_norm": 1.8136120935488778, + "language_loss": 0.73536521, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.81233501, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11700439, + "step": 9422, + "time_per_iteration": 2.4822769165039062 + }, + { + "auxiliary_loss_clip": 0.06420115, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06278713, + "balance_loss_mlp": 0.01253355, + "epoch": 0.566541409890275, + "flos": 24650798440320.0, + "grad_norm": 1.7038149529307767, + "language_loss": 0.8178972, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.89473832, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10656738, + "step": 9423, + "time_per_iteration": 4.039041519165039 + }, + { + "auxiliary_loss_clip": 0.06420702, + "auxiliary_loss_mlp": 0.01272474, + "balance_loss_clip": 0.06276573, + "balance_loss_mlp": 0.01260392, + "epoch": 0.566601533142943, + "flos": 22276686781440.0, + "grad_norm": 2.1916345423108092, + "language_loss": 0.81182116, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.88875294, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.12091064, + "step": 9424, + "time_per_iteration": 2.6186363697052 + }, + { + "auxiliary_loss_clip": 0.06424181, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06276239, + "balance_loss_mlp": 0.01254788, + "epoch": 0.5666616563956111, + "flos": 29979485717760.0, + "grad_norm": 1.8421028893936136, + "language_loss": 0.79108143, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.86799419, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1229248, + "step": 9425, + "time_per_iteration": 2.6103405952453613 + }, + { + "auxiliary_loss_clip": 0.06423585, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06280398, + "balance_loss_mlp": 0.01254958, + "epoch": 0.566721779648279, + "flos": 17786585566080.0, + "grad_norm": 1.8792171756054583, + "language_loss": 0.59002221, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.66692609, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11853027, + "step": 9426, + "time_per_iteration": 2.5017449855804443 + }, + { + "auxiliary_loss_clip": 0.06425668, + "auxiliary_loss_mlp": 0.01271587, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01259881, + "epoch": 0.566781902900947, + "flos": 21039967244160.0, + "grad_norm": 1.8634987355301997, + "language_loss": 0.82228333, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.89925593, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1171875, + "step": 9427, + "time_per_iteration": 2.565479040145874 + }, + { + "auxiliary_loss_clip": 0.06418218, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257853, + "epoch": 0.5668420261536149, + "flos": 23155244040960.0, + "grad_norm": 1.8170517561621367, + "language_loss": 0.86107284, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.93794018, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10662842, + "step": 9428, + "time_per_iteration": 2.5440726280212402 + }, + { + "auxiliary_loss_clip": 0.06425078, + "auxiliary_loss_mlp": 0.01266256, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01254549, + "epoch": 0.5669021494062829, + "flos": 22608210660480.0, + "grad_norm": 1.979218692390264, + "language_loss": 0.74058932, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.81750262, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11700439, + "step": 9429, + "time_per_iteration": 2.5536460876464844 + }, + { + "auxiliary_loss_clip": 0.06425272, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06277645, + "balance_loss_mlp": 0.01252943, + "epoch": 0.5669622726589508, + "flos": 17386481520000.0, + "grad_norm": 1.7940156011993331, + "language_loss": 0.75663137, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.8335309, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11724854, + "step": 9430, + "time_per_iteration": 3.9432384967803955 + }, + { + "auxiliary_loss_clip": 0.06418042, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01254498, + "epoch": 0.5670223959116188, + "flos": 18767992112640.0, + "grad_norm": 1.7725274526585868, + "language_loss": 0.73046589, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.80729836, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10705566, + "step": 9431, + "time_per_iteration": 2.4891881942749023 + }, + { + "auxiliary_loss_clip": 0.06413169, + "auxiliary_loss_mlp": 0.01269495, + "balance_loss_clip": 0.06278919, + "balance_loss_mlp": 0.0125907, + "epoch": 0.5670825191642868, + "flos": 13558463740800.0, + "grad_norm": 1.5232840780961514, + "language_loss": 0.7352109, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.81203753, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10424805, + "step": 9432, + "time_per_iteration": 2.539503812789917 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.0627542, + "balance_loss_mlp": 0.01254914, + "epoch": 0.5671426424169548, + "flos": 22060506447360.0, + "grad_norm": 1.4799006758092328, + "language_loss": 0.78516906, + "learning_rate": 1.663746609539197e-06, + "loss": 0.86202025, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11169434, + "step": 9433, + "time_per_iteration": 2.5004031658172607 + }, + { + "auxiliary_loss_clip": 0.06427075, + "auxiliary_loss_mlp": 0.01270712, + "balance_loss_clip": 0.06279536, + "balance_loss_mlp": 0.01257569, + "epoch": 0.5672027656696227, + "flos": 21330262114560.0, + "grad_norm": 1.7709414309866778, + "language_loss": 0.63719839, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.71417624, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.13134766, + "step": 9434, + "time_per_iteration": 2.5424575805664062 + }, + { + "auxiliary_loss_clip": 0.06413743, + "auxiliary_loss_mlp": 0.0126735, + "balance_loss_clip": 0.06274401, + "balance_loss_mlp": 0.01257188, + "epoch": 0.5672628889222907, + "flos": 23520869331840.0, + "grad_norm": 1.9335938837076005, + "language_loss": 0.66754067, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.74435163, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10162354, + "step": 9435, + "time_per_iteration": 2.5177414417266846 + }, + { + "auxiliary_loss_clip": 0.06416117, + "auxiliary_loss_mlp": 0.0126839, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.01257333, + "epoch": 0.5673230121749586, + "flos": 27128639854080.0, + "grad_norm": 1.3319121805553942, + "language_loss": 0.71799958, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.79484463, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11053467, + "step": 9436, + "time_per_iteration": 2.6037702560424805 + }, + { + "auxiliary_loss_clip": 0.06424177, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06276658, + "balance_loss_mlp": 0.01254548, + "epoch": 0.5673831354276266, + "flos": 31150476126720.0, + "grad_norm": 1.399584944388347, + "language_loss": 0.7441892, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.82109791, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12145996, + "step": 9437, + "time_per_iteration": 2.5982627868652344 + }, + { + "auxiliary_loss_clip": 0.0642609, + "auxiliary_loss_mlp": 0.01270521, + "balance_loss_clip": 0.06280209, + "balance_loss_mlp": 0.01258719, + "epoch": 0.5674432586802945, + "flos": 27680662552320.0, + "grad_norm": 1.8153515221603815, + "language_loss": 0.61647224, + "learning_rate": 1.661827179985277e-06, + "loss": 0.69343835, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11798096, + "step": 9438, + "time_per_iteration": 2.6188385486602783 + }, + { + "auxiliary_loss_clip": 0.0642384, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06276964, + "balance_loss_mlp": 0.01252935, + "epoch": 0.5675033819329626, + "flos": 26622458138880.0, + "grad_norm": 1.4984637138093548, + "language_loss": 0.75628054, + "learning_rate": 1.661443332486909e-06, + "loss": 0.83315879, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11053467, + "step": 9439, + "time_per_iteration": 2.5383174419403076 + }, + { + "auxiliary_loss_clip": 0.06420992, + "auxiliary_loss_mlp": 0.01270038, + "balance_loss_clip": 0.06280455, + "balance_loss_mlp": 0.0125798, + "epoch": 0.5675635051856306, + "flos": 19104295674240.0, + "grad_norm": 1.7526345830300347, + "language_loss": 0.8402319, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.91714221, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1206665, + "step": 9440, + "time_per_iteration": 2.5894699096679688 + }, + { + "auxiliary_loss_clip": 0.06425986, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06275898, + "balance_loss_mlp": 0.01255393, + "epoch": 0.5676236284382985, + "flos": 17572040386560.0, + "grad_norm": 2.304829714160468, + "language_loss": 0.75825876, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.83519483, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12231445, + "step": 9441, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.0641818, + "auxiliary_loss_mlp": 0.0126441, + "balance_loss_clip": 0.0627504, + "balance_loss_mlp": 0.01253454, + "epoch": 0.5676837516909665, + "flos": 15958375257600.0, + "grad_norm": 1.9240949658540871, + "language_loss": 0.83086008, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.907686, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10955811, + "step": 9442, + "time_per_iteration": 2.53488826751709 + }, + { + "auxiliary_loss_clip": 0.06416862, + "auxiliary_loss_mlp": 0.01269111, + "balance_loss_clip": 0.06279622, + "balance_loss_mlp": 0.01258543, + "epoch": 0.5677438749436344, + "flos": 18301739667840.0, + "grad_norm": 1.8387898612646743, + "language_loss": 0.74695265, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.82381237, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10571289, + "step": 9443, + "time_per_iteration": 2.4844577312469482 + }, + { + "auxiliary_loss_clip": 0.06418682, + "auxiliary_loss_mlp": 0.01270397, + "balance_loss_clip": 0.06275757, + "balance_loss_mlp": 0.01258947, + "epoch": 0.5678039981963025, + "flos": 17937120625920.0, + "grad_norm": 2.224999400227568, + "language_loss": 0.77901411, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.85590482, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11450195, + "step": 9444, + "time_per_iteration": 2.5525596141815186 + }, + { + "auxiliary_loss_clip": 0.06428226, + "auxiliary_loss_mlp": 0.01266607, + "balance_loss_clip": 0.06281613, + "balance_loss_mlp": 0.01255228, + "epoch": 0.5678641214489704, + "flos": 19322153089920.0, + "grad_norm": 1.7258632756557413, + "language_loss": 0.81218302, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.88913137, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11376953, + "step": 9445, + "time_per_iteration": 2.501241683959961 + }, + { + "auxiliary_loss_clip": 0.06419222, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06275924, + "balance_loss_mlp": 0.01255548, + "epoch": 0.5679242447016384, + "flos": 27759389063040.0, + "grad_norm": 1.2498061463372896, + "language_loss": 0.71243447, + "learning_rate": 1.658756760280259e-06, + "loss": 0.78928661, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10443115, + "step": 9446, + "time_per_iteration": 2.6276121139526367 + }, + { + "auxiliary_loss_clip": 0.06425235, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06276199, + "balance_loss_mlp": 0.01257277, + "epoch": 0.5679843679543063, + "flos": 23775888833280.0, + "grad_norm": 1.7407480451238082, + "language_loss": 0.73674792, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.81369138, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11828613, + "step": 9447, + "time_per_iteration": 2.5189285278320312 + }, + { + "auxiliary_loss_clip": 0.06428251, + "auxiliary_loss_mlp": 0.01269652, + "balance_loss_clip": 0.06280248, + "balance_loss_mlp": 0.01257272, + "epoch": 0.5680444912069743, + "flos": 25598732480640.0, + "grad_norm": 1.8734928972182148, + "language_loss": 0.75381124, + "learning_rate": 1.657989284462725e-06, + "loss": 0.83079028, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1239624, + "step": 9448, + "time_per_iteration": 2.5984859466552734 + }, + { + "auxiliary_loss_clip": 0.06428179, + "auxiliary_loss_mlp": 0.01269794, + "balance_loss_clip": 0.0627953, + "balance_loss_mlp": 0.01258415, + "epoch": 0.5681046144596422, + "flos": 23702528983680.0, + "grad_norm": 2.0524228921166556, + "language_loss": 0.76618403, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.84316373, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1137085, + "step": 9449, + "time_per_iteration": 2.515456438064575 + }, + { + "auxiliary_loss_clip": 0.06423233, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01259174, + "epoch": 0.5681647377123102, + "flos": 28008161435520.0, + "grad_norm": 1.4260887566171934, + "language_loss": 0.74914038, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.82607877, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11425781, + "step": 9450, + "time_per_iteration": 2.5997612476348877 + }, + { + "auxiliary_loss_clip": 0.06425043, + "auxiliary_loss_mlp": 0.01263493, + "balance_loss_clip": 0.06278814, + "balance_loss_mlp": 0.01252526, + "epoch": 0.5682248609649782, + "flos": 22754427235200.0, + "grad_norm": 1.6712621343134006, + "language_loss": 0.66650134, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.74338675, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10974121, + "step": 9451, + "time_per_iteration": 2.5041069984436035 + }, + { + "auxiliary_loss_clip": 0.06437647, + "auxiliary_loss_mlp": 0.01268609, + "balance_loss_clip": 0.06282589, + "balance_loss_mlp": 0.01255126, + "epoch": 0.5682849842176462, + "flos": 21295070599680.0, + "grad_norm": 1.8399857372619135, + "language_loss": 0.72354877, + "learning_rate": 1.656454488573026e-06, + "loss": 0.80061138, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1348877, + "step": 9452, + "time_per_iteration": 2.529772996902466 + }, + { + "auxiliary_loss_clip": 0.06419612, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06277338, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5683451074703142, + "flos": 21147973557120.0, + "grad_norm": 1.3918203076927713, + "language_loss": 0.70862073, + "learning_rate": 1.656070822132428e-06, + "loss": 0.78546906, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11419678, + "step": 9453, + "time_per_iteration": 3.975252151489258 + }, + { + "auxiliary_loss_clip": 0.06420393, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255001, + "epoch": 0.5684052307229821, + "flos": 22350759390720.0, + "grad_norm": 1.7444047953592532, + "language_loss": 0.70346195, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.78032023, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10443115, + "step": 9454, + "time_per_iteration": 2.530397415161133 + }, + { + "auxiliary_loss_clip": 0.06417777, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06276377, + "balance_loss_mlp": 0.01255572, + "epoch": 0.5684653539756501, + "flos": 21805067675520.0, + "grad_norm": 2.3221034941278256, + "language_loss": 0.6090889, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.68592238, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10003662, + "step": 9455, + "time_per_iteration": 2.5284998416900635 + }, + { + "auxiliary_loss_clip": 0.06432047, + "auxiliary_loss_mlp": 0.01270821, + "balance_loss_clip": 0.06281373, + "balance_loss_mlp": 0.01259144, + "epoch": 0.568525477228318, + "flos": 23005757157120.0, + "grad_norm": 1.7024948062012655, + "language_loss": 0.73315781, + "learning_rate": 1.6549199011198e-06, + "loss": 0.81018651, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11669922, + "step": 9456, + "time_per_iteration": 2.5266809463500977 + }, + { + "auxiliary_loss_clip": 0.06419168, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_clip": 0.06275652, + "balance_loss_mlp": 0.01254771, + "epoch": 0.568585600480986, + "flos": 21398045667840.0, + "grad_norm": 1.7476092517075434, + "language_loss": 0.77197653, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.84882128, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10534668, + "step": 9457, + "time_per_iteration": 2.6098482608795166 + }, + { + "auxiliary_loss_clip": 0.06424686, + "auxiliary_loss_mlp": 0.01267717, + "balance_loss_clip": 0.06278071, + "balance_loss_mlp": 0.01255969, + "epoch": 0.568645723733654, + "flos": 30015054576000.0, + "grad_norm": 1.8479320449106564, + "language_loss": 0.6697377, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.74666172, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11749268, + "step": 9458, + "time_per_iteration": 4.003401756286621 + }, + { + "auxiliary_loss_clip": 0.06424286, + "auxiliary_loss_mlp": 0.01264614, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01253295, + "epoch": 0.568705846986322, + "flos": 20418945108480.0, + "grad_norm": 2.1992346625709427, + "language_loss": 0.68311954, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.76000857, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11315918, + "step": 9459, + "time_per_iteration": 2.5213470458984375 + }, + { + "auxiliary_loss_clip": 0.06427266, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01253756, + "epoch": 0.5687659702389899, + "flos": 17462440846080.0, + "grad_norm": 2.588089844490271, + "language_loss": 0.77003014, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.84695148, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.11102295, + "step": 9460, + "time_per_iteration": 2.5016860961914062 + }, + { + "auxiliary_loss_clip": 0.06424034, + "auxiliary_loss_mlp": 0.01270464, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258865, + "epoch": 0.5688260934916579, + "flos": 25412335073280.0, + "grad_norm": 1.5686079353810067, + "language_loss": 0.72504562, + "learning_rate": 1.65300196133547e-06, + "loss": 0.80199063, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9461, + "time_per_iteration": 2.652650833129883 + }, + { + "auxiliary_loss_clip": 0.06420281, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_clip": 0.06276302, + "balance_loss_mlp": 0.01254707, + "epoch": 0.5688862167443258, + "flos": 21613052044800.0, + "grad_norm": 1.8456676032626356, + "language_loss": 0.73588586, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.81274414, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10839844, + "step": 9462, + "time_per_iteration": 3.9915239810943604 + }, + { + "auxiliary_loss_clip": 0.06414893, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06275715, + "balance_loss_mlp": 0.01254715, + "epoch": 0.5689463399969938, + "flos": 22425544759680.0, + "grad_norm": 2.0067901163228212, + "language_loss": 0.72924364, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.80604076, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10107422, + "step": 9463, + "time_per_iteration": 2.5026743412017822 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01266249, + "balance_loss_clip": 0.06272251, + "balance_loss_mlp": 0.01255115, + "epoch": 0.5690064632496618, + "flos": 18302787843840.0, + "grad_norm": 1.7796234570298675, + "language_loss": 0.7436375, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.82046998, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11126709, + "step": 9464, + "time_per_iteration": 2.5418522357940674 + }, + { + "auxiliary_loss_clip": 0.06420638, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01253169, + "epoch": 0.5690665865023298, + "flos": 21585575105280.0, + "grad_norm": 1.531985348456469, + "language_loss": 0.84518385, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.92204237, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12060547, + "step": 9465, + "time_per_iteration": 2.501640558242798 + }, + { + "auxiliary_loss_clip": 0.06416291, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01251954, + "epoch": 0.5691267097549978, + "flos": 24427616290560.0, + "grad_norm": 1.5399864144711508, + "language_loss": 0.72636294, + "learning_rate": 1.651084350506125e-06, + "loss": 0.80315161, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10638428, + "step": 9466, + "time_per_iteration": 2.5872812271118164 + }, + { + "auxiliary_loss_clip": 0.06322309, + "auxiliary_loss_mlp": 0.01252779, + "balance_loss_clip": 0.06261392, + "balance_loss_mlp": 0.01251253, + "epoch": 0.5691868330076657, + "flos": 61679915389440.0, + "grad_norm": 0.706168287542021, + "language_loss": 0.55225098, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.62800181, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01525879, + "step": 9467, + "time_per_iteration": 3.1809115409851074 + }, + { + "auxiliary_loss_clip": 0.06421535, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06275938, + "balance_loss_mlp": 0.01253471, + "epoch": 0.5692469562603337, + "flos": 21331687633920.0, + "grad_norm": 1.821723086609738, + "language_loss": 0.64103729, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.717906, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11865234, + "step": 9468, + "time_per_iteration": 2.5419483184814453 + }, + { + "auxiliary_loss_clip": 0.06420718, + "auxiliary_loss_mlp": 0.01268612, + "balance_loss_clip": 0.06276828, + "balance_loss_mlp": 0.01257508, + "epoch": 0.5693070795130016, + "flos": 23374652757120.0, + "grad_norm": 2.0216455322076885, + "language_loss": 0.79510915, + "learning_rate": 1.64993394266317e-06, + "loss": 0.87200236, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11102295, + "step": 9469, + "time_per_iteration": 3.974965810775757 + }, + { + "auxiliary_loss_clip": 0.06424933, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275818, + "balance_loss_mlp": 0.01256133, + "epoch": 0.5693672027656697, + "flos": 18703143452160.0, + "grad_norm": 1.8253898689046395, + "language_loss": 0.69934285, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.77626961, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11608887, + "step": 9470, + "time_per_iteration": 2.490144729614258 + }, + { + "auxiliary_loss_clip": 0.06418116, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254391, + "epoch": 0.5694273260183376, + "flos": 20455478288640.0, + "grad_norm": 2.1472118271494574, + "language_loss": 0.75247335, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.82931614, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11785889, + "step": 9471, + "time_per_iteration": 2.5518500804901123 + }, + { + "auxiliary_loss_clip": 0.06417546, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06276481, + "balance_loss_mlp": 0.01255616, + "epoch": 0.5694874492710056, + "flos": 17608992837120.0, + "grad_norm": 1.6827496814774499, + "language_loss": 0.57877314, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.65561181, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10705566, + "step": 9472, + "time_per_iteration": 2.535846710205078 + }, + { + "auxiliary_loss_clip": 0.06416848, + "auxiliary_loss_mlp": 0.01268789, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.01257411, + "epoch": 0.5695475725236735, + "flos": 13375923621120.0, + "grad_norm": 1.7815747768820038, + "language_loss": 0.73987466, + "learning_rate": 1.648400251450638e-06, + "loss": 0.81673104, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11376953, + "step": 9473, + "time_per_iteration": 2.4858133792877197 + }, + { + "auxiliary_loss_clip": 0.06327727, + "auxiliary_loss_mlp": 0.01252353, + "balance_loss_clip": 0.06266978, + "balance_loss_mlp": 0.01250914, + "epoch": 0.5696076957763415, + "flos": 68195078881920.0, + "grad_norm": 0.6484051468543478, + "language_loss": 0.57388628, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.64968711, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01437378, + "step": 9474, + "time_per_iteration": 3.1554436683654785 + }, + { + "auxiliary_loss_clip": 0.06415011, + "auxiliary_loss_mlp": 0.01264959, + "balance_loss_clip": 0.06274811, + "balance_loss_mlp": 0.01254111, + "epoch": 0.5696678190290094, + "flos": 33846636153600.0, + "grad_norm": 1.6105466561987234, + "language_loss": 0.54358017, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.62037987, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 9475, + "time_per_iteration": 2.6193020343780518 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.01252234, + "epoch": 0.5697279422816774, + "flos": 26363329787520.0, + "grad_norm": 2.008545727860435, + "language_loss": 0.79765999, + "learning_rate": 1.647250122983675e-06, + "loss": 0.87448931, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11383057, + "step": 9476, + "time_per_iteration": 2.543100595474243 + }, + { + "auxiliary_loss_clip": 0.06428041, + "auxiliary_loss_mlp": 0.01271624, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01260209, + "epoch": 0.5697880655343454, + "flos": 22937260844160.0, + "grad_norm": 1.735529425276041, + "language_loss": 0.66121185, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.73820853, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.11407471, + "step": 9477, + "time_per_iteration": 2.5366005897521973 + }, + { + "auxiliary_loss_clip": 0.06423311, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06277082, + "balance_loss_mlp": 0.0125553, + "epoch": 0.5698481887870134, + "flos": 26768674713600.0, + "grad_norm": 1.6190739346076362, + "language_loss": 0.71115196, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.78804839, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.1081543, + "step": 9478, + "time_per_iteration": 2.5513012409210205 + }, + { + "auxiliary_loss_clip": 0.06415288, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06277218, + "balance_loss_mlp": 0.01255718, + "epoch": 0.5699083120396814, + "flos": 15747729292800.0, + "grad_norm": 1.4794360727515914, + "language_loss": 0.69306439, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.76988363, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10913086, + "step": 9479, + "time_per_iteration": 2.5828471183776855 + }, + { + "auxiliary_loss_clip": 0.06413876, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06275865, + "balance_loss_mlp": 0.0125734, + "epoch": 0.5699684352923493, + "flos": 19543448522880.0, + "grad_norm": 1.5013072139655574, + "language_loss": 0.71621788, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.79303229, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10223389, + "step": 9480, + "time_per_iteration": 2.5247299671173096 + }, + { + "auxiliary_loss_clip": 0.06418922, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01255319, + "epoch": 0.5700285585450173, + "flos": 16258942252800.0, + "grad_norm": 4.885605743124815, + "language_loss": 0.72444856, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.80130869, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11779785, + "step": 9481, + "time_per_iteration": 2.508589506149292 + }, + { + "auxiliary_loss_clip": 0.06421519, + "auxiliary_loss_mlp": 0.01270221, + "balance_loss_clip": 0.06279288, + "balance_loss_mlp": 0.01258461, + "epoch": 0.5700886817976852, + "flos": 19871115114240.0, + "grad_norm": 1.897422682992244, + "language_loss": 0.78625083, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.86316824, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11761475, + "step": 9482, + "time_per_iteration": 2.5139269828796387 + }, + { + "auxiliary_loss_clip": 0.06417527, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01253242, + "epoch": 0.5701488050503533, + "flos": 23848452069120.0, + "grad_norm": 2.496783055499815, + "language_loss": 0.78338385, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.86019731, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 9483, + "time_per_iteration": 2.547522783279419 + }, + { + "auxiliary_loss_clip": 0.06420138, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06276282, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5702089283030212, + "flos": 23666457000960.0, + "grad_norm": 1.5289248173251733, + "language_loss": 0.81642497, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.89326739, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.10424805, + "step": 9484, + "time_per_iteration": 2.546597719192505 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06277504, + "balance_loss_mlp": 0.01255924, + "epoch": 0.5702690515556892, + "flos": 27898519968000.0, + "grad_norm": 1.8682928794178455, + "language_loss": 0.61101806, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.68790221, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11560059, + "step": 9485, + "time_per_iteration": 2.5931575298309326 + }, + { + "auxiliary_loss_clip": 0.06421611, + "auxiliary_loss_mlp": 0.01267401, + "balance_loss_clip": 0.06277725, + "balance_loss_mlp": 0.01255748, + "epoch": 0.5703291748083571, + "flos": 24030698699520.0, + "grad_norm": 1.7282499785723824, + "language_loss": 0.65970731, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.73659742, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11645508, + "step": 9486, + "time_per_iteration": 2.546604871749878 + }, + { + "auxiliary_loss_clip": 0.06330933, + "auxiliary_loss_mlp": 0.01257137, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01255769, + "epoch": 0.5703892980610251, + "flos": 57044478067200.0, + "grad_norm": 0.6556389442355417, + "language_loss": 0.47978726, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.55566794, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01370239, + "step": 9487, + "time_per_iteration": 3.216449499130249 + }, + { + "auxiliary_loss_clip": 0.06419921, + "auxiliary_loss_mlp": 0.01266304, + "balance_loss_clip": 0.06275571, + "balance_loss_mlp": 0.01255212, + "epoch": 0.570449421313693, + "flos": 24357610604160.0, + "grad_norm": 1.4009858057112485, + "language_loss": 0.8597424, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.93660462, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11102295, + "step": 9488, + "time_per_iteration": 2.5608506202697754 + }, + { + "auxiliary_loss_clip": 0.06428364, + "auxiliary_loss_mlp": 0.01270308, + "balance_loss_clip": 0.06281118, + "balance_loss_mlp": 0.01259055, + "epoch": 0.570509544566361, + "flos": 24835770328320.0, + "grad_norm": 1.8825828159705935, + "language_loss": 0.79195142, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.86893809, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11260986, + "step": 9489, + "time_per_iteration": 2.553471088409424 + }, + { + "auxiliary_loss_clip": 0.06419341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06276694, + "balance_loss_mlp": 0.01253646, + "epoch": 0.570569667819029, + "flos": 21403663891200.0, + "grad_norm": 1.6360729178743676, + "language_loss": 0.7047472, + "learning_rate": 1.641884454927604e-06, + "loss": 0.78158057, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10345459, + "step": 9490, + "time_per_iteration": 2.5905275344848633 + }, + { + "auxiliary_loss_clip": 0.06421432, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06279342, + "balance_loss_mlp": 0.01257803, + "epoch": 0.570629791071697, + "flos": 23222608323840.0, + "grad_norm": 1.4492809017584538, + "language_loss": 0.76252091, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.83942628, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11291504, + "step": 9491, + "time_per_iteration": 2.523472309112549 + }, + { + "auxiliary_loss_clip": 0.06328943, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06268945, + "balance_loss_mlp": 0.01261694, + "epoch": 0.570689914324365, + "flos": 65303632915200.0, + "grad_norm": 0.7890932915341226, + "language_loss": 0.57371008, + "learning_rate": 1.641118147266011e-06, + "loss": 0.64963466, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01817322, + "step": 9492, + "time_per_iteration": 4.556811571121216 + }, + { + "auxiliary_loss_clip": 0.06420883, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.0627829, + "balance_loss_mlp": 0.01255809, + "epoch": 0.5707500375770329, + "flos": 21148225119360.0, + "grad_norm": 2.4823752626433357, + "language_loss": 0.71714401, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.79402137, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.1104126, + "step": 9493, + "time_per_iteration": 2.5404999256134033 + }, + { + "auxiliary_loss_clip": 0.06425234, + "auxiliary_loss_mlp": 0.01270244, + "balance_loss_clip": 0.06277438, + "balance_loss_mlp": 0.01258812, + "epoch": 0.5708101608297009, + "flos": 20818881446400.0, + "grad_norm": 1.6649189140980358, + "language_loss": 0.77940559, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.85636032, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11431885, + "step": 9494, + "time_per_iteration": 2.5486340522766113 + }, + { + "auxiliary_loss_clip": 0.06427161, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06276955, + "balance_loss_mlp": 0.01255026, + "epoch": 0.5708702840823688, + "flos": 25819482862080.0, + "grad_norm": 2.058789415113096, + "language_loss": 0.80377084, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.88071406, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12127686, + "step": 9495, + "time_per_iteration": 2.5960187911987305 + }, + { + "auxiliary_loss_clip": 0.06429706, + "auxiliary_loss_mlp": 0.01275013, + "balance_loss_clip": 0.06277497, + "balance_loss_mlp": 0.01261567, + "epoch": 0.5709304073350369, + "flos": 23657400760320.0, + "grad_norm": 1.9375866549540641, + "language_loss": 0.66475153, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.74179876, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.13446045, + "step": 9496, + "time_per_iteration": 2.536844253540039 + }, + { + "auxiliary_loss_clip": 0.06424591, + "auxiliary_loss_mlp": 0.0126837, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256485, + "epoch": 0.5709905305877048, + "flos": 16113144948480.0, + "grad_norm": 2.1097086993227068, + "language_loss": 0.70119512, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.77812475, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.11877441, + "step": 9497, + "time_per_iteration": 2.5001566410064697 + }, + { + "auxiliary_loss_clip": 0.06421457, + "auxiliary_loss_mlp": 0.01273203, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01261455, + "epoch": 0.5710506538403728, + "flos": 24757211525760.0, + "grad_norm": 5.203790092819982, + "language_loss": 0.81695306, + "learning_rate": 1.638819551358182e-06, + "loss": 0.89389962, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11755371, + "step": 9498, + "time_per_iteration": 3.979785203933716 + }, + { + "auxiliary_loss_clip": 0.06421061, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.0125707, + "epoch": 0.5711107770930407, + "flos": 21988907533440.0, + "grad_norm": 1.778867640796668, + "language_loss": 0.66763413, + "learning_rate": 1.638436499891469e-06, + "loss": 0.74453306, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11767578, + "step": 9499, + "time_per_iteration": 2.560131788253784 + }, + { + "auxiliary_loss_clip": 0.06422064, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01255432, + "epoch": 0.5711709003457087, + "flos": 19580233265280.0, + "grad_norm": 1.5461706893268885, + "language_loss": 0.71884078, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.79573303, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11743164, + "step": 9500, + "time_per_iteration": 2.51857852935791 + }, + { + "auxiliary_loss_clip": 0.06426705, + "auxiliary_loss_mlp": 0.01270529, + "balance_loss_clip": 0.06277592, + "balance_loss_mlp": 0.01257893, + "epoch": 0.5712310235983766, + "flos": 24249436583040.0, + "grad_norm": 1.9132916799477426, + "language_loss": 0.76773643, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.8447088, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.12640381, + "step": 9501, + "time_per_iteration": 2.585303544998169 + }, + { + "auxiliary_loss_clip": 0.06424866, + "auxiliary_loss_mlp": 0.01265647, + "balance_loss_clip": 0.06278552, + "balance_loss_mlp": 0.01254233, + "epoch": 0.5712911468510447, + "flos": 21002469742080.0, + "grad_norm": 1.6366629976038132, + "language_loss": 0.75004148, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.82694662, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11419678, + "step": 9502, + "time_per_iteration": 3.9893364906311035 + }, + { + "auxiliary_loss_clip": 0.06420161, + "auxiliary_loss_mlp": 0.0126738, + "balance_loss_clip": 0.0627653, + "balance_loss_mlp": 0.01256561, + "epoch": 0.5713512701037126, + "flos": 18923055292800.0, + "grad_norm": 1.7156142062685982, + "language_loss": 0.82350051, + "learning_rate": 1.636904431275105e-06, + "loss": 0.90037596, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10827637, + "step": 9503, + "time_per_iteration": 2.5289459228515625 + }, + { + "auxiliary_loss_clip": 0.06420251, + "auxiliary_loss_mlp": 0.01271521, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5714113933563806, + "flos": 17417983455360.0, + "grad_norm": 2.1350982520901827, + "language_loss": 0.86264861, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.93956631, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1114502, + "step": 9504, + "time_per_iteration": 2.5180015563964844 + }, + { + "auxiliary_loss_clip": 0.06417073, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01255651, + "epoch": 0.5714715166090486, + "flos": 20199536392320.0, + "grad_norm": 2.0316869593340265, + "language_loss": 0.75480437, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.83164144, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10980225, + "step": 9505, + "time_per_iteration": 2.497009754180908 + }, + { + "auxiliary_loss_clip": 0.06418754, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06276823, + "balance_loss_mlp": 0.01256419, + "epoch": 0.5715316398617165, + "flos": 18557597710080.0, + "grad_norm": 1.6474042198541896, + "language_loss": 0.82215714, + "learning_rate": 1.635755524332509e-06, + "loss": 0.89902395, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1151123, + "step": 9506, + "time_per_iteration": 2.5657498836517334 + }, + { + "auxiliary_loss_clip": 0.06418438, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.0627599, + "balance_loss_mlp": 0.01254568, + "epoch": 0.5715917631143845, + "flos": 18484028225280.0, + "grad_norm": 1.482727560680873, + "language_loss": 0.77285796, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.84969354, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10546875, + "step": 9507, + "time_per_iteration": 2.485496997833252 + }, + { + "auxiliary_loss_clip": 0.06422855, + "auxiliary_loss_mlp": 0.01269089, + "balance_loss_clip": 0.06276034, + "balance_loss_mlp": 0.01257091, + "epoch": 0.5716518863670524, + "flos": 24026128652160.0, + "grad_norm": 1.4323391248104125, + "language_loss": 0.68799454, + "learning_rate": 1.63498965540751e-06, + "loss": 0.76491398, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12005615, + "step": 9508, + "time_per_iteration": 2.5643258094787598 + }, + { + "auxiliary_loss_clip": 0.06422228, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.012529, + "epoch": 0.5717120096197205, + "flos": 17824879681920.0, + "grad_norm": 2.05386002816889, + "language_loss": 0.80054557, + "learning_rate": 1.634606741699593e-06, + "loss": 0.87741685, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11987305, + "step": 9509, + "time_per_iteration": 3.8947436809539795 + }, + { + "auxiliary_loss_clip": 0.06415324, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255691, + "epoch": 0.5717721328723884, + "flos": 21871551490560.0, + "grad_norm": 1.798702817725972, + "language_loss": 0.72265553, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.79946876, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9510, + "time_per_iteration": 2.496246099472046 + }, + { + "auxiliary_loss_clip": 0.06419715, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.0627699, + "balance_loss_mlp": 0.01255946, + "epoch": 0.5718322561250564, + "flos": 28444924442880.0, + "grad_norm": 1.3126461366590796, + "language_loss": 0.69652188, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.77338743, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10906982, + "step": 9511, + "time_per_iteration": 2.5713541507720947 + }, + { + "auxiliary_loss_clip": 0.06420782, + "auxiliary_loss_mlp": 0.01268426, + "balance_loss_clip": 0.06277648, + "balance_loss_mlp": 0.01257136, + "epoch": 0.5718923793777243, + "flos": 13556702805120.0, + "grad_norm": 2.0681515910732715, + "language_loss": 0.61827439, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.69516647, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.112854, + "step": 9512, + "time_per_iteration": 2.49580454826355 + }, + { + "auxiliary_loss_clip": 0.06421502, + "auxiliary_loss_mlp": 0.01268423, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01257498, + "epoch": 0.5719525026303923, + "flos": 17827856501760.0, + "grad_norm": 2.3676523534955685, + "language_loss": 0.76396298, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.84086221, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10919189, + "step": 9513, + "time_per_iteration": 2.500870704650879 + }, + { + "auxiliary_loss_clip": 0.06326592, + "auxiliary_loss_mlp": 0.01253708, + "balance_loss_clip": 0.06266873, + "balance_loss_mlp": 0.01252076, + "epoch": 0.5720126258830602, + "flos": 61314724097280.0, + "grad_norm": 0.891161207726192, + "language_loss": 0.66879886, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.74460191, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01634216, + "step": 9514, + "time_per_iteration": 3.1455137729644775 + }, + { + "auxiliary_loss_clip": 0.06430741, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06282684, + "balance_loss_mlp": 0.01258941, + "epoch": 0.5720727491357283, + "flos": 23994878279040.0, + "grad_norm": 2.149685980416527, + "language_loss": 0.81938076, + "learning_rate": 1.63230955093099e-06, + "loss": 0.89639759, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12005615, + "step": 9515, + "time_per_iteration": 2.5996580123901367 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01267881, + "balance_loss_clip": 0.06274894, + "balance_loss_mlp": 0.01257259, + "epoch": 0.5721328723883962, + "flos": 23412359894400.0, + "grad_norm": 1.6126279146943563, + "language_loss": 0.86095083, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.93775928, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10620117, + "step": 9516, + "time_per_iteration": 2.5553810596466064 + }, + { + "auxiliary_loss_clip": 0.06417726, + "auxiliary_loss_mlp": 0.01271814, + "balance_loss_clip": 0.06275768, + "balance_loss_mlp": 0.01260572, + "epoch": 0.5721929956410642, + "flos": 18810520859520.0, + "grad_norm": 2.197571780359881, + "language_loss": 0.87770617, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.95460165, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11242676, + "step": 9517, + "time_per_iteration": 2.5858652591705322 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01265386, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01254114, + "epoch": 0.5722531188937322, + "flos": 27203676785280.0, + "grad_norm": 1.5341934137919409, + "language_loss": 0.85065883, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.92748272, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11273193, + "step": 9518, + "time_per_iteration": 2.5850136280059814 + }, + { + "auxiliary_loss_clip": 0.06417416, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06278273, + "balance_loss_mlp": 0.01256044, + "epoch": 0.5723132421464001, + "flos": 15201157109760.0, + "grad_norm": 1.5672659775495308, + "language_loss": 0.78797317, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.86481655, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 9519, + "time_per_iteration": 2.5459818840026855 + }, + { + "auxiliary_loss_clip": 0.06418845, + "auxiliary_loss_mlp": 0.01271535, + "balance_loss_clip": 0.06277601, + "balance_loss_mlp": 0.01260675, + "epoch": 0.5723733653990681, + "flos": 27606757651200.0, + "grad_norm": 1.4075514987328583, + "language_loss": 0.83134615, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.90824991, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10864258, + "step": 9520, + "time_per_iteration": 2.66892671585083 + }, + { + "auxiliary_loss_clip": 0.06426139, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06279796, + "balance_loss_mlp": 0.01253022, + "epoch": 0.572433488651736, + "flos": 18228673307520.0, + "grad_norm": 1.9996427544433133, + "language_loss": 0.73064411, + "learning_rate": 1.630012862105243e-06, + "loss": 0.80754966, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11383057, + "step": 9521, + "time_per_iteration": 2.5980701446533203 + }, + { + "auxiliary_loss_clip": 0.06419297, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06276461, + "balance_loss_mlp": 0.01259073, + "epoch": 0.5724936119044041, + "flos": 31257224628480.0, + "grad_norm": 1.5867052207792396, + "language_loss": 0.77991247, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.85680634, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11022949, + "step": 9522, + "time_per_iteration": 2.5890755653381348 + }, + { + "auxiliary_loss_clip": 0.06416851, + "auxiliary_loss_mlp": 0.01267889, + "balance_loss_clip": 0.06278282, + "balance_loss_mlp": 0.01257649, + "epoch": 0.572553735157072, + "flos": 19207186888320.0, + "grad_norm": 1.441878230551161, + "language_loss": 0.72110128, + "learning_rate": 1.629247411248102e-06, + "loss": 0.79794878, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10235596, + "step": 9523, + "time_per_iteration": 2.511115789413452 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 0.06277744, + "balance_loss_mlp": 0.01257025, + "epoch": 0.57261385840974, + "flos": 21221249552640.0, + "grad_norm": 1.7953059857975224, + "language_loss": 0.70372975, + "learning_rate": 1.628864706900738e-06, + "loss": 0.78058219, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10339355, + "step": 9524, + "time_per_iteration": 2.507387161254883 + }, + { + "auxiliary_loss_clip": 0.0641823, + "auxiliary_loss_mlp": 0.0127028, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01259188, + "epoch": 0.5726739816624079, + "flos": 33992936582400.0, + "grad_norm": 1.3727338087163001, + "language_loss": 0.6519655, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.7288506, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11096191, + "step": 9525, + "time_per_iteration": 2.6264822483062744 + }, + { + "auxiliary_loss_clip": 0.0641274, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06272839, + "balance_loss_mlp": 0.01255842, + "epoch": 0.5727341049150759, + "flos": 24282196329600.0, + "grad_norm": 1.6388418597669483, + "language_loss": 0.72797775, + "learning_rate": 1.628099340440984e-06, + "loss": 0.80476719, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10351562, + "step": 9526, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.06418388, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06280835, + "balance_loss_mlp": 0.01257897, + "epoch": 0.5727942281677438, + "flos": 28407762357120.0, + "grad_norm": 1.5546981496666945, + "language_loss": 0.80170763, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.87857693, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10650635, + "step": 9527, + "time_per_iteration": 2.6143245697021484 + }, + { + "auxiliary_loss_clip": 0.06413873, + "auxiliary_loss_mlp": 0.01269872, + "balance_loss_clip": 0.06275712, + "balance_loss_mlp": 0.01258983, + "epoch": 0.5728543514204119, + "flos": 19542861544320.0, + "grad_norm": 2.5128112924339585, + "language_loss": 0.72641492, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.8032524, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10894775, + "step": 9528, + "time_per_iteration": 2.4896552562713623 + }, + { + "auxiliary_loss_clip": 0.06418886, + "auxiliary_loss_mlp": 0.01267185, + "balance_loss_clip": 0.06277183, + "balance_loss_mlp": 0.0125577, + "epoch": 0.5729144746730798, + "flos": 21513137650560.0, + "grad_norm": 1.7938485336826149, + "language_loss": 0.85978115, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.93664181, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11407471, + "step": 9529, + "time_per_iteration": 2.539447784423828 + }, + { + "auxiliary_loss_clip": 0.063314, + "auxiliary_loss_mlp": 0.01256121, + "balance_loss_clip": 0.06271826, + "balance_loss_mlp": 0.0125448, + "epoch": 0.5729745979257478, + "flos": 58699638495360.0, + "grad_norm": 0.750499003321047, + "language_loss": 0.55969286, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.63556802, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01643372, + "step": 9530, + "time_per_iteration": 3.007678747177124 + }, + { + "auxiliary_loss_clip": 0.06425051, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06280611, + "balance_loss_mlp": 0.01254276, + "epoch": 0.5730347211784158, + "flos": 18558100834560.0, + "grad_norm": 1.9102815745402744, + "language_loss": 0.66843903, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.74534607, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1137085, + "step": 9531, + "time_per_iteration": 3.9059529304504395 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06274498, + "balance_loss_mlp": 0.01257966, + "epoch": 0.5730948444310837, + "flos": 38040069588480.0, + "grad_norm": 1.9862057863273674, + "language_loss": 0.75881588, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.83567762, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11169434, + "step": 9532, + "time_per_iteration": 2.640389919281006 + }, + { + "auxiliary_loss_clip": 0.06421025, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06278558, + "balance_loss_mlp": 0.01255794, + "epoch": 0.5731549676837517, + "flos": 25233861876480.0, + "grad_norm": 1.2592580925122039, + "language_loss": 0.79252976, + "learning_rate": 1.625421002822686e-06, + "loss": 0.86941075, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11291504, + "step": 9533, + "time_per_iteration": 2.559293508529663 + }, + { + "auxiliary_loss_clip": 0.06417587, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278279, + "balance_loss_mlp": 0.01256067, + "epoch": 0.5732150909364196, + "flos": 23375030100480.0, + "grad_norm": 3.634749275276224, + "language_loss": 0.8597486, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.93658984, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10467529, + "step": 9534, + "time_per_iteration": 2.539487838745117 + }, + { + "auxiliary_loss_clip": 0.06421855, + "auxiliary_loss_mlp": 0.01269069, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.01257625, + "epoch": 0.5732752141890877, + "flos": 23086621946880.0, + "grad_norm": 1.944302626791885, + "language_loss": 0.75668436, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.83359355, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9535, + "time_per_iteration": 2.5488839149475098 + }, + { + "auxiliary_loss_clip": 0.06425361, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06278601, + "balance_loss_mlp": 0.01256288, + "epoch": 0.5733353374417556, + "flos": 24359078050560.0, + "grad_norm": 1.5155376410848522, + "language_loss": 0.71395552, + "learning_rate": 1.624273356614346e-06, + "loss": 0.79089081, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11895752, + "step": 9536, + "time_per_iteration": 2.553239345550537 + }, + { + "auxiliary_loss_clip": 0.06416988, + "auxiliary_loss_mlp": 0.01269432, + "balance_loss_clip": 0.06275923, + "balance_loss_mlp": 0.01258244, + "epoch": 0.5733954606944236, + "flos": 27206234334720.0, + "grad_norm": 1.742372783929404, + "language_loss": 0.70031548, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.77717972, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11187744, + "step": 9537, + "time_per_iteration": 2.5490598678588867 + }, + { + "auxiliary_loss_clip": 0.06419763, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06277005, + "balance_loss_mlp": 0.01257317, + "epoch": 0.5734555839470915, + "flos": 28772339472000.0, + "grad_norm": 2.334146865026381, + "language_loss": 0.63052773, + "learning_rate": 1.623508330355902e-06, + "loss": 0.70740581, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10723877, + "step": 9538, + "time_per_iteration": 4.013959169387817 + }, + { + "auxiliary_loss_clip": 0.0641904, + "auxiliary_loss_mlp": 0.01273663, + "balance_loss_clip": 0.06277157, + "balance_loss_mlp": 0.0126136, + "epoch": 0.5735157071997595, + "flos": 22973542462080.0, + "grad_norm": 1.806157803076428, + "language_loss": 0.82720077, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.90412778, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.12310791, + "step": 9539, + "time_per_iteration": 2.554189682006836 + }, + { + "auxiliary_loss_clip": 0.06422378, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06276339, + "balance_loss_mlp": 0.01253115, + "epoch": 0.5735758304524274, + "flos": 18995450820480.0, + "grad_norm": 2.0055639259958107, + "language_loss": 0.73150325, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.80837095, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11279297, + "step": 9540, + "time_per_iteration": 2.500077486038208 + }, + { + "auxiliary_loss_clip": 0.0641907, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06276584, + "balance_loss_mlp": 0.01255039, + "epoch": 0.5736359537050955, + "flos": 28404701683200.0, + "grad_norm": 2.024476848130698, + "language_loss": 0.80249465, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.87934107, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10528564, + "step": 9541, + "time_per_iteration": 4.051165342330933 + }, + { + "auxiliary_loss_clip": 0.06425047, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.06278428, + "balance_loss_mlp": 0.01253998, + "epoch": 0.5736960769577634, + "flos": 15631714915200.0, + "grad_norm": 2.008860171144918, + "language_loss": 0.64482939, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.72173679, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11700439, + "step": 9542, + "time_per_iteration": 2.5055642127990723 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.0627488, + "balance_loss_mlp": 0.01254691, + "epoch": 0.5737562002104314, + "flos": 18009767715840.0, + "grad_norm": 2.2598183554381146, + "language_loss": 0.83200055, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.90883142, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10638428, + "step": 9543, + "time_per_iteration": 2.4916088581085205 + }, + { + "auxiliary_loss_clip": 0.06426359, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06279086, + "balance_loss_mlp": 0.01254422, + "epoch": 0.5738163234630994, + "flos": 20703454047360.0, + "grad_norm": 1.617850922862876, + "language_loss": 0.74024302, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.81716919, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.1184082, + "step": 9544, + "time_per_iteration": 2.536583662033081 + }, + { + "auxiliary_loss_clip": 0.06424204, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06278355, + "balance_loss_mlp": 0.01256809, + "epoch": 0.5738764467157673, + "flos": 23156082581760.0, + "grad_norm": 3.1974440280178595, + "language_loss": 0.76412272, + "learning_rate": 1.620831188925733e-06, + "loss": 0.84104949, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11676025, + "step": 9545, + "time_per_iteration": 2.5427141189575195 + }, + { + "auxiliary_loss_clip": 0.06423136, + "auxiliary_loss_mlp": 0.01267499, + "balance_loss_clip": 0.06279323, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5739365699684353, + "flos": 29499942401280.0, + "grad_norm": 2.3578945444753447, + "language_loss": 0.56573224, + "learning_rate": 1.620448797546459e-06, + "loss": 0.64263856, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11334229, + "step": 9546, + "time_per_iteration": 2.608128309249878 + }, + { + "auxiliary_loss_clip": 0.06422536, + "auxiliary_loss_mlp": 0.01268737, + "balance_loss_clip": 0.0627693, + "balance_loss_mlp": 0.01257746, + "epoch": 0.5739966932211032, + "flos": 14032388833920.0, + "grad_norm": 2.2022917684402996, + "language_loss": 0.76728261, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.84419537, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10980225, + "step": 9547, + "time_per_iteration": 2.5017452239990234 + }, + { + "auxiliary_loss_clip": 0.06421655, + "auxiliary_loss_mlp": 0.01268546, + "balance_loss_clip": 0.06277436, + "balance_loss_mlp": 0.01257114, + "epoch": 0.5740568164737713, + "flos": 19067972129280.0, + "grad_norm": 1.9505887412268983, + "language_loss": 0.7442795, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.82118154, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11444092, + "step": 9548, + "time_per_iteration": 2.549558639526367 + }, + { + "auxiliary_loss_clip": 0.06418206, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01255064, + "epoch": 0.5741169397264392, + "flos": 22134453275520.0, + "grad_norm": 2.3791642109865228, + "language_loss": 0.69704068, + "learning_rate": 1.619301709822355e-06, + "loss": 0.77388746, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11419678, + "step": 9549, + "time_per_iteration": 3.933781147003174 + }, + { + "auxiliary_loss_clip": 0.06420065, + "auxiliary_loss_mlp": 0.01265483, + "balance_loss_clip": 0.06279664, + "balance_loss_mlp": 0.01254611, + "epoch": 0.5741770629791072, + "flos": 24943860495360.0, + "grad_norm": 1.461228472430463, + "language_loss": 0.79521686, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.87207234, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10870361, + "step": 9550, + "time_per_iteration": 2.577768087387085 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01254832, + "epoch": 0.5742371862317751, + "flos": 18806495863680.0, + "grad_norm": 2.119345289493334, + "language_loss": 0.68877375, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.76562458, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10845947, + "step": 9551, + "time_per_iteration": 2.480468273162842 + }, + { + "auxiliary_loss_clip": 0.06424205, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06276421, + "balance_loss_mlp": 0.0125579, + "epoch": 0.5742973094844431, + "flos": 24467293998720.0, + "grad_norm": 1.5487820488887025, + "language_loss": 0.72033125, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.79724622, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.11505127, + "step": 9552, + "time_per_iteration": 2.5759360790252686 + }, + { + "auxiliary_loss_clip": 0.06417461, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.0125469, + "epoch": 0.574357432737111, + "flos": 21659186517120.0, + "grad_norm": 3.0495771997900163, + "language_loss": 0.79982221, + "learning_rate": 1.617772461696843e-06, + "loss": 0.87665033, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10662842, + "step": 9553, + "time_per_iteration": 2.49290132522583 + }, + { + "auxiliary_loss_clip": 0.06423397, + "auxiliary_loss_mlp": 0.01264041, + "balance_loss_clip": 0.06275378, + "balance_loss_mlp": 0.0125333, + "epoch": 0.5744175559897791, + "flos": 16550285299200.0, + "grad_norm": 2.1324379432349425, + "language_loss": 0.83817756, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.91505194, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.1071167, + "step": 9554, + "time_per_iteration": 2.5118370056152344 + }, + { + "auxiliary_loss_clip": 0.06422277, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.0125575, + "epoch": 0.574477679242447, + "flos": 24214580484480.0, + "grad_norm": 1.3861221814355518, + "language_loss": 0.71406233, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.79095531, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11254883, + "step": 9555, + "time_per_iteration": 2.5466480255126953 + }, + { + "auxiliary_loss_clip": 0.06421511, + "auxiliary_loss_mlp": 0.01268077, + "balance_loss_clip": 0.06277835, + "balance_loss_mlp": 0.01256478, + "epoch": 0.574537802495115, + "flos": 14908304689920.0, + "grad_norm": 2.185347344801511, + "language_loss": 0.73004574, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.80694163, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1159668, + "step": 9556, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.0641879, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01253406, + "epoch": 0.5745979257477829, + "flos": 24941680289280.0, + "grad_norm": 1.5306662340422301, + "language_loss": 0.74479866, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.82163835, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11761475, + "step": 9557, + "time_per_iteration": 2.576296329498291 + }, + { + "auxiliary_loss_clip": 0.06420197, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06277291, + "balance_loss_mlp": 0.01252572, + "epoch": 0.5746580490004509, + "flos": 17241061559040.0, + "grad_norm": 1.5775139248237169, + "language_loss": 0.68007201, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.75691128, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11157227, + "step": 9558, + "time_per_iteration": 2.531812906265259 + }, + { + "auxiliary_loss_clip": 0.06424935, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06275487, + "balance_loss_mlp": 0.01253779, + "epoch": 0.5747181722531189, + "flos": 13192838449920.0, + "grad_norm": 2.425506842460266, + "language_loss": 0.71628273, + "learning_rate": 1.615479024621659e-06, + "loss": 0.79320455, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13482666, + "step": 9559, + "time_per_iteration": 2.473419189453125 + }, + { + "auxiliary_loss_clip": 0.06419484, + "auxiliary_loss_mlp": 0.01266983, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01256921, + "epoch": 0.5747782955057869, + "flos": 22969098195840.0, + "grad_norm": 1.5670628486073652, + "language_loss": 0.79416776, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.87103242, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10064697, + "step": 9560, + "time_per_iteration": 2.532862663269043 + }, + { + "auxiliary_loss_clip": 0.06421925, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06276737, + "balance_loss_mlp": 0.01255581, + "epoch": 0.5748384187584549, + "flos": 23409802344960.0, + "grad_norm": 1.793006683486937, + "language_loss": 0.64777875, + "learning_rate": 1.614714662090588e-06, + "loss": 0.72466803, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11431885, + "step": 9561, + "time_per_iteration": 2.5111758708953857 + }, + { + "auxiliary_loss_clip": 0.06426983, + "auxiliary_loss_mlp": 0.01268046, + "balance_loss_clip": 0.06277155, + "balance_loss_mlp": 0.01256369, + "epoch": 0.5748985420111228, + "flos": 17791323321600.0, + "grad_norm": 1.4966227163397983, + "language_loss": 0.7114228, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.78837311, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11682129, + "step": 9562, + "time_per_iteration": 2.5162081718444824 + }, + { + "auxiliary_loss_clip": 0.06425486, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06280454, + "balance_loss_mlp": 0.01256081, + "epoch": 0.5749586652637908, + "flos": 19872582560640.0, + "grad_norm": 1.4328664867345224, + "language_loss": 0.84269559, + "learning_rate": 1.613950357999751e-06, + "loss": 0.91962022, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10888672, + "step": 9563, + "time_per_iteration": 2.5183188915252686 + }, + { + "auxiliary_loss_clip": 0.06421089, + "auxiliary_loss_mlp": 0.01268857, + "balance_loss_clip": 0.06273992, + "balance_loss_mlp": 0.01256733, + "epoch": 0.5750187885164587, + "flos": 21293477372160.0, + "grad_norm": 2.089685167133714, + "language_loss": 0.57297182, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.64987123, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.12127686, + "step": 9564, + "time_per_iteration": 2.5219571590423584 + }, + { + "auxiliary_loss_clip": 0.06414357, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06276927, + "balance_loss_mlp": 0.0125389, + "epoch": 0.5750789117691267, + "flos": 18810227370240.0, + "grad_norm": 1.5824685354584669, + "language_loss": 0.76484299, + "learning_rate": 1.613186112465078e-06, + "loss": 0.84163225, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10681152, + "step": 9565, + "time_per_iteration": 2.4752280712127686 + }, + { + "auxiliary_loss_clip": 0.06321105, + "auxiliary_loss_mlp": 0.01250694, + "balance_loss_clip": 0.06260607, + "balance_loss_mlp": 0.01249219, + "epoch": 0.5751390350217946, + "flos": 70685624188800.0, + "grad_norm": 0.721103953507815, + "language_loss": 0.6068033, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.68252128, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.01473999, + "step": 9566, + "time_per_iteration": 3.222144603729248 + }, + { + "auxiliary_loss_clip": 0.06420306, + "auxiliary_loss_mlp": 0.01268432, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257673, + "epoch": 0.5751991582744627, + "flos": 14251545987840.0, + "grad_norm": 2.0959328312792467, + "language_loss": 0.75654471, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.83343208, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10760498, + "step": 9567, + "time_per_iteration": 2.4892570972442627 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01255875, + "epoch": 0.5752592815271306, + "flos": 18333283530240.0, + "grad_norm": 1.4488652909067903, + "language_loss": 0.75253701, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.82938665, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11322021, + "step": 9568, + "time_per_iteration": 2.473475217819214 + }, + { + "auxiliary_loss_clip": 0.06419896, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.01254349, + "epoch": 0.5753194047797986, + "flos": 20928984111360.0, + "grad_norm": 1.5107907301615, + "language_loss": 0.71293747, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.78978956, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10968018, + "step": 9569, + "time_per_iteration": 2.6541481018066406 + }, + { + "auxiliary_loss_clip": 0.06420765, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06275727, + "balance_loss_mlp": 0.01255764, + "epoch": 0.5753795280324665, + "flos": 19287925896960.0, + "grad_norm": 2.027519323892087, + "language_loss": 0.56120193, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.63808417, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11694336, + "step": 9570, + "time_per_iteration": 2.5568745136260986 + }, + { + "auxiliary_loss_clip": 0.0641574, + "auxiliary_loss_mlp": 0.01264384, + "balance_loss_clip": 0.06274444, + "balance_loss_mlp": 0.01253715, + "epoch": 0.5754396512851345, + "flos": 21659312298240.0, + "grad_norm": 3.8103947749492355, + "language_loss": 0.64502007, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.72182131, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10675049, + "step": 9571, + "time_per_iteration": 3.9861292839050293 + }, + { + "auxiliary_loss_clip": 0.06417111, + "auxiliary_loss_mlp": 0.01267965, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01257153, + "epoch": 0.5754997745378025, + "flos": 51032674707840.0, + "grad_norm": 1.44401056534108, + "language_loss": 0.67167187, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.74852264, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10821533, + "step": 9572, + "time_per_iteration": 2.775322198867798 + }, + { + "auxiliary_loss_clip": 0.06417632, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01252378, + "epoch": 0.5755598977904705, + "flos": 22863523651200.0, + "grad_norm": 1.9643261986613603, + "language_loss": 0.72534865, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.80216646, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11767578, + "step": 9573, + "time_per_iteration": 2.504248857498169 + }, + { + "auxiliary_loss_clip": 0.06413124, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06276227, + "balance_loss_mlp": 0.0125495, + "epoch": 0.5756200210431385, + "flos": 38482073475840.0, + "grad_norm": 1.6390607800794645, + "language_loss": 0.76527274, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.84205294, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09954834, + "step": 9574, + "time_per_iteration": 2.675445079803467 + }, + { + "auxiliary_loss_clip": 0.06426176, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06277125, + "balance_loss_mlp": 0.01252865, + "epoch": 0.5756801442958064, + "flos": 23915984060160.0, + "grad_norm": 3.486560074307127, + "language_loss": 0.67186499, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.74877405, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11877441, + "step": 9575, + "time_per_iteration": 2.5086028575897217 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06275645, + "balance_loss_mlp": 0.01253899, + "epoch": 0.5757402675484744, + "flos": 21111566158080.0, + "grad_norm": 1.4184952738773886, + "language_loss": 0.80574554, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.88252765, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1005249, + "step": 9576, + "time_per_iteration": 2.502372980117798 + }, + { + "auxiliary_loss_clip": 0.06413178, + "auxiliary_loss_mlp": 0.01266947, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01256439, + "epoch": 0.5758003908011423, + "flos": 20565497099520.0, + "grad_norm": 1.5791511975506907, + "language_loss": 0.69807208, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.77487338, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10510254, + "step": 9577, + "time_per_iteration": 4.000526428222656 + }, + { + "auxiliary_loss_clip": 0.06420817, + "auxiliary_loss_mlp": 0.0126492, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.012544, + "epoch": 0.5758605140538103, + "flos": 16478770239360.0, + "grad_norm": 1.7483336770936004, + "language_loss": 0.66710907, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.74396646, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.10510254, + "step": 9578, + "time_per_iteration": 2.495589256286621 + }, + { + "auxiliary_loss_clip": 0.06417773, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.06276586, + "balance_loss_mlp": 0.01254274, + "epoch": 0.5759206373064782, + "flos": 21293854715520.0, + "grad_norm": 1.4632151435184575, + "language_loss": 0.72808439, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.80490887, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10388184, + "step": 9579, + "time_per_iteration": 2.4900078773498535 + }, + { + "auxiliary_loss_clip": 0.06426738, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06278113, + "balance_loss_mlp": 0.01254451, + "epoch": 0.5759807605591463, + "flos": 26075089342080.0, + "grad_norm": 2.9637416190029597, + "language_loss": 0.64800644, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.72493923, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.12072754, + "step": 9580, + "time_per_iteration": 2.532273292541504 + }, + { + "auxiliary_loss_clip": 0.06420532, + "auxiliary_loss_mlp": 0.01266688, + "balance_loss_clip": 0.06275357, + "balance_loss_mlp": 0.01255554, + "epoch": 0.5760408838118142, + "flos": 18877885142400.0, + "grad_norm": 1.6521602857434026, + "language_loss": 0.85497582, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.93184799, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11132812, + "step": 9581, + "time_per_iteration": 3.9159321784973145 + }, + { + "auxiliary_loss_clip": 0.06425697, + "auxiliary_loss_mlp": 0.01268939, + "balance_loss_clip": 0.06276281, + "balance_loss_mlp": 0.01257483, + "epoch": 0.5761010070644822, + "flos": 15383655302400.0, + "grad_norm": 2.053627577895993, + "language_loss": 0.67847329, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.75541961, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.11450195, + "step": 9582, + "time_per_iteration": 2.468289613723755 + }, + { + "auxiliary_loss_clip": 0.06323063, + "auxiliary_loss_mlp": 0.0125238, + "balance_loss_clip": 0.06262786, + "balance_loss_mlp": 0.01250932, + "epoch": 0.5761611303171501, + "flos": 71495475500160.0, + "grad_norm": 0.6295597289579254, + "language_loss": 0.5722791, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.64803356, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.60253906, + "router_z_loss_mlp": 0.0144577, + "step": 9583, + "time_per_iteration": 3.280832052230835 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01253437, + "epoch": 0.5762212535698181, + "flos": 16250556844800.0, + "grad_norm": 1.895482028357212, + "language_loss": 0.82933408, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.90613544, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10693359, + "step": 9584, + "time_per_iteration": 2.473771333694458 + }, + { + "auxiliary_loss_clip": 0.06325932, + "auxiliary_loss_mlp": 0.01252168, + "balance_loss_clip": 0.06265227, + "balance_loss_mlp": 0.01250696, + "epoch": 0.5762813768224861, + "flos": 70207254829440.0, + "grad_norm": 0.6148723792494001, + "language_loss": 0.49547607, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.57125711, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.0147171, + "step": 9585, + "time_per_iteration": 3.220283031463623 + }, + { + "auxiliary_loss_clip": 0.06417918, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.01256446, + "epoch": 0.5763415000751541, + "flos": 20523639185280.0, + "grad_norm": 1.396891707955096, + "language_loss": 0.84832788, + "learning_rate": 1.605165098835465e-06, + "loss": 0.92518032, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10876465, + "step": 9586, + "time_per_iteration": 2.5044658184051514 + }, + { + "auxiliary_loss_clip": 0.0641425, + "auxiliary_loss_mlp": 0.01268611, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01257584, + "epoch": 0.5764016233278221, + "flos": 15821047215360.0, + "grad_norm": 1.5476594832750246, + "language_loss": 0.80150878, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.87833744, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11035156, + "step": 9587, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06421454, + "auxiliary_loss_mlp": 0.01267229, + "balance_loss_clip": 0.06277972, + "balance_loss_mlp": 0.01256184, + "epoch": 0.57646174658049, + "flos": 20777778218880.0, + "grad_norm": 1.3785070074858572, + "language_loss": 0.6626485, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.73953533, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11047363, + "step": 9588, + "time_per_iteration": 3.990769863128662 + }, + { + "auxiliary_loss_clip": 0.06420319, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 0.0627601, + "balance_loss_mlp": 0.01256491, + "epoch": 0.576521869833158, + "flos": 23556647825280.0, + "grad_norm": 1.8252792275452514, + "language_loss": 0.79050291, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.86739457, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1237793, + "step": 9589, + "time_per_iteration": 2.5151610374450684 + }, + { + "auxiliary_loss_clip": 0.06414266, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06275681, + "balance_loss_mlp": 0.01254652, + "epoch": 0.5765819930858259, + "flos": 20272812387840.0, + "grad_norm": 1.9044444718181142, + "language_loss": 0.79799986, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.87479138, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10229492, + "step": 9590, + "time_per_iteration": 2.502588987350464 + }, + { + "auxiliary_loss_clip": 0.06424554, + "auxiliary_loss_mlp": 0.01266306, + "balance_loss_clip": 0.06279668, + "balance_loss_mlp": 0.01256096, + "epoch": 0.5766421163384939, + "flos": 23155453676160.0, + "grad_norm": 1.9323149052957644, + "language_loss": 0.63195986, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.7088685, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10217285, + "step": 9591, + "time_per_iteration": 2.5217199325561523 + }, + { + "auxiliary_loss_clip": 0.0641837, + "auxiliary_loss_mlp": 0.0126852, + "balance_loss_clip": 0.06274436, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5767022395911618, + "flos": 25856057969280.0, + "grad_norm": 1.7751118346977903, + "language_loss": 0.78161305, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.85848188, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10473633, + "step": 9592, + "time_per_iteration": 2.586398124694824 + }, + { + "auxiliary_loss_clip": 0.06325077, + "auxiliary_loss_mlp": 0.0125376, + "balance_loss_clip": 0.06264462, + "balance_loss_mlp": 0.01252203, + "epoch": 0.5767623628438299, + "flos": 68315579452800.0, + "grad_norm": 0.723864489522512, + "language_loss": 0.59626555, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.67205393, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.60888672, + "router_z_loss_mlp": 0.01555634, + "step": 9593, + "time_per_iteration": 3.245339870452881 + }, + { + "auxiliary_loss_clip": 0.06419121, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272256, + "balance_loss_mlp": 0.01254432, + "epoch": 0.5768224860964978, + "flos": 30195959541120.0, + "grad_norm": 1.4712512924104606, + "language_loss": 0.70970887, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.78656393, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11956787, + "step": 9594, + "time_per_iteration": 2.575716018676758 + }, + { + "auxiliary_loss_clip": 0.06417293, + "auxiliary_loss_mlp": 0.01269346, + "balance_loss_clip": 0.0627408, + "balance_loss_mlp": 0.01259237, + "epoch": 0.5768826093491658, + "flos": 17900880935040.0, + "grad_norm": 1.6705807126416699, + "language_loss": 0.71305418, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.78992057, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10101318, + "step": 9595, + "time_per_iteration": 2.492614269256592 + }, + { + "auxiliary_loss_clip": 0.06416321, + "auxiliary_loss_mlp": 0.01269009, + "balance_loss_clip": 0.06273369, + "balance_loss_mlp": 0.01257481, + "epoch": 0.5769427326018337, + "flos": 17462943970560.0, + "grad_norm": 1.9433978950195214, + "language_loss": 0.69787997, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.77473325, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11529541, + "step": 9596, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.06425576, + "auxiliary_loss_mlp": 0.01267355, + "balance_loss_clip": 0.06275462, + "balance_loss_mlp": 0.01254558, + "epoch": 0.5770028558545017, + "flos": 39431181473280.0, + "grad_norm": 1.7020557646527, + "language_loss": 0.67913234, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.75606167, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.12792969, + "step": 9597, + "time_per_iteration": 2.6754841804504395 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273974, + "balance_loss_mlp": 0.01255322, + "epoch": 0.5770629791071697, + "flos": 21541620839040.0, + "grad_norm": 1.8412029810529236, + "language_loss": 0.82291842, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.89974791, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.105896, + "step": 9598, + "time_per_iteration": 2.510817527770996 + }, + { + "auxiliary_loss_clip": 0.06420396, + "auxiliary_loss_mlp": 0.01268157, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.01256511, + "epoch": 0.5771231023598377, + "flos": 20893121763840.0, + "grad_norm": 1.43847663479929, + "language_loss": 0.73386133, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.81074691, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11639404, + "step": 9599, + "time_per_iteration": 2.492751121520996 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.01254772, + "epoch": 0.5771832256125057, + "flos": 18083043711360.0, + "grad_norm": 1.7867114623476337, + "language_loss": 0.78284144, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.85961294, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10369873, + "step": 9600, + "time_per_iteration": 2.4890565872192383 + }, + { + "auxiliary_loss_clip": 0.06422748, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.01256893, + "epoch": 0.5772433488651736, + "flos": 26366222753280.0, + "grad_norm": 1.8856132517408855, + "language_loss": 0.72472572, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.80163646, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11450195, + "step": 9601, + "time_per_iteration": 2.536994218826294 + }, + { + "auxiliary_loss_clip": 0.06415705, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01253506, + "epoch": 0.5773034721178416, + "flos": 19686814058880.0, + "grad_norm": 1.49916876372247, + "language_loss": 0.68989396, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7666986, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11254883, + "step": 9602, + "time_per_iteration": 2.4855434894561768 + }, + { + "auxiliary_loss_clip": 0.06409699, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06272521, + "balance_loss_mlp": 0.01257287, + "epoch": 0.5773635953705095, + "flos": 25089951288960.0, + "grad_norm": 1.4178586949074146, + "language_loss": 0.73199558, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.80876672, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10125732, + "step": 9603, + "time_per_iteration": 2.5496528148651123 + }, + { + "auxiliary_loss_clip": 0.06418322, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06276152, + "balance_loss_mlp": 0.01256162, + "epoch": 0.5774237186231775, + "flos": 21039380265600.0, + "grad_norm": 1.5159674911644692, + "language_loss": 0.76686621, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.84372133, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11035156, + "step": 9604, + "time_per_iteration": 2.522033452987671 + }, + { + "auxiliary_loss_clip": 0.06420808, + "auxiliary_loss_mlp": 0.01271467, + "balance_loss_clip": 0.06277063, + "balance_loss_mlp": 0.01259373, + "epoch": 0.5774838418758454, + "flos": 15237145238400.0, + "grad_norm": 2.0065352138527808, + "language_loss": 0.83384192, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.91076463, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12097168, + "step": 9605, + "time_per_iteration": 2.4643824100494385 + }, + { + "auxiliary_loss_clip": 0.0643101, + "auxiliary_loss_mlp": 0.01267132, + "balance_loss_clip": 0.06278086, + "balance_loss_mlp": 0.01254913, + "epoch": 0.5775439651285135, + "flos": 23588694812160.0, + "grad_norm": 1.6400067603153077, + "language_loss": 0.78330255, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.86028397, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.12207031, + "step": 9606, + "time_per_iteration": 2.5217928886413574 + }, + { + "auxiliary_loss_clip": 0.06417712, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01252687, + "epoch": 0.5776040883811814, + "flos": 18046300896000.0, + "grad_norm": 1.7192315062710783, + "language_loss": 0.73891246, + "learning_rate": 1.597150687927619e-06, + "loss": 0.81573272, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11633301, + "step": 9607, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.01268528, + "balance_loss_clip": 0.06277244, + "balance_loss_mlp": 0.01256368, + "epoch": 0.5776642116338494, + "flos": 18630580216320.0, + "grad_norm": 1.602339688767026, + "language_loss": 0.69749868, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.121521, + "step": 9608, + "time_per_iteration": 2.5238630771636963 + }, + { + "auxiliary_loss_clip": 0.06419271, + "auxiliary_loss_mlp": 0.01267568, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255814, + "epoch": 0.5777243348865173, + "flos": 28410068344320.0, + "grad_norm": 1.9615645043462706, + "language_loss": 0.76945466, + "learning_rate": 1.596387759940665e-06, + "loss": 0.84632301, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11749268, + "step": 9609, + "time_per_iteration": 2.549933671951294 + }, + { + "auxiliary_loss_clip": 0.0642001, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01255084, + "epoch": 0.5777844581391853, + "flos": 24031579167360.0, + "grad_norm": 1.544459178362984, + "language_loss": 0.77057648, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.84744948, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.12200928, + "step": 9610, + "time_per_iteration": 2.5409657955169678 + }, + { + "auxiliary_loss_clip": 0.06419136, + "auxiliary_loss_mlp": 0.01273329, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01261104, + "epoch": 0.5778445813918534, + "flos": 17781805883520.0, + "grad_norm": 2.0334076468596463, + "language_loss": 0.69377804, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.77070266, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.12231445, + "step": 9611, + "time_per_iteration": 3.8771145343780518 + }, + { + "auxiliary_loss_clip": 0.06415454, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01254795, + "epoch": 0.5779047046445213, + "flos": 22239147352320.0, + "grad_norm": 1.7756554406320284, + "language_loss": 0.84048247, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.91729373, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10876465, + "step": 9612, + "time_per_iteration": 2.4897758960723877 + }, + { + "auxiliary_loss_clip": 0.06417899, + "auxiliary_loss_mlp": 0.01267936, + "balance_loss_clip": 0.06275887, + "balance_loss_mlp": 0.01257046, + "epoch": 0.5779648278971893, + "flos": 21440825976960.0, + "grad_norm": 1.4853190478070708, + "language_loss": 0.80038643, + "learning_rate": 1.594862087742667e-06, + "loss": 0.87724483, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10894775, + "step": 9613, + "time_per_iteration": 2.512202501296997 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01265916, + "balance_loss_clip": 0.06274515, + "balance_loss_mlp": 0.01254996, + "epoch": 0.5780249511498572, + "flos": 19032151708800.0, + "grad_norm": 1.6718641196950235, + "language_loss": 0.7774657, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.85430139, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10925293, + "step": 9614, + "time_per_iteration": 2.4882118701934814 + }, + { + "auxiliary_loss_clip": 0.06421545, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01254543, + "epoch": 0.5780850744025252, + "flos": 12128596542720.0, + "grad_norm": 2.0494146854902175, + "language_loss": 0.82224047, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.89911503, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1137085, + "step": 9615, + "time_per_iteration": 2.472621440887451 + }, + { + "auxiliary_loss_clip": 0.0642141, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06274787, + "balance_loss_mlp": 0.01255552, + "epoch": 0.5781451976551931, + "flos": 25051154048640.0, + "grad_norm": 1.4669220513135932, + "language_loss": 0.67472255, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.75161308, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.12097168, + "step": 9616, + "time_per_iteration": 2.534846782684326 + }, + { + "auxiliary_loss_clip": 0.06417294, + "auxiliary_loss_mlp": 0.01269205, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01257528, + "epoch": 0.5782053209078611, + "flos": 19251770060160.0, + "grad_norm": 1.8155832257801603, + "language_loss": 0.77963018, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.85649514, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11669922, + "step": 9617, + "time_per_iteration": 4.014554977416992 + }, + { + "auxiliary_loss_clip": 0.064207, + "auxiliary_loss_mlp": 0.01269929, + "balance_loss_clip": 0.06277206, + "balance_loss_mlp": 0.012578, + "epoch": 0.578265444160529, + "flos": 26000849024640.0, + "grad_norm": 1.3678407791087424, + "language_loss": 0.75333905, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.83024538, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.12139893, + "step": 9618, + "time_per_iteration": 2.5390572547912598 + }, + { + "auxiliary_loss_clip": 0.06416163, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06273779, + "balance_loss_mlp": 0.01254355, + "epoch": 0.5783255674131971, + "flos": 21805025748480.0, + "grad_norm": 1.6109172194310035, + "language_loss": 0.81657064, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.89339048, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11468506, + "step": 9619, + "time_per_iteration": 2.505831718444824 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01265219, + "balance_loss_clip": 0.06275585, + "balance_loss_mlp": 0.01253972, + "epoch": 0.578385690665865, + "flos": 24796553817600.0, + "grad_norm": 1.540190718879446, + "language_loss": 0.72668874, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.80354631, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11248779, + "step": 9620, + "time_per_iteration": 3.9673268795013428 + }, + { + "auxiliary_loss_clip": 0.06423381, + "auxiliary_loss_mlp": 0.01270714, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01258942, + "epoch": 0.578445813918533, + "flos": 21218859711360.0, + "grad_norm": 1.6605075192862409, + "language_loss": 0.77349472, + "learning_rate": 1.591811481689916e-06, + "loss": 0.85043567, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 9621, + "time_per_iteration": 2.5077648162841797 + }, + { + "auxiliary_loss_clip": 0.06420489, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01252477, + "epoch": 0.5785059371712009, + "flos": 25053921233280.0, + "grad_norm": 1.4404835359445094, + "language_loss": 0.7094593, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.78630757, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.11859131, + "step": 9622, + "time_per_iteration": 2.5468451976776123 + }, + { + "auxiliary_loss_clip": 0.06311069, + "auxiliary_loss_mlp": 0.01252444, + "balance_loss_clip": 0.06251176, + "balance_loss_mlp": 0.01250508, + "epoch": 0.5785660604238689, + "flos": 70865187488640.0, + "grad_norm": 0.7596176351080388, + "language_loss": 0.55852556, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.6341607, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01933289, + "step": 9623, + "time_per_iteration": 3.153353452682495 + }, + { + "auxiliary_loss_clip": 0.06425077, + "auxiliary_loss_mlp": 0.01267172, + "balance_loss_clip": 0.06277281, + "balance_loss_mlp": 0.01255233, + "epoch": 0.578626183676537, + "flos": 31658083361280.0, + "grad_norm": 2.2034040135587936, + "language_loss": 0.71319884, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.79012132, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.1194458, + "step": 9624, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.06420659, + "auxiliary_loss_mlp": 0.01270578, + "balance_loss_clip": 0.06275962, + "balance_loss_mlp": 0.01258222, + "epoch": 0.5786863069292049, + "flos": 21870545241600.0, + "grad_norm": 1.7015470008848133, + "language_loss": 0.82409322, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.90100557, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12353516, + "step": 9625, + "time_per_iteration": 2.5166807174682617 + }, + { + "auxiliary_loss_clip": 0.06417123, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01253647, + "epoch": 0.5787464301818729, + "flos": 23371214739840.0, + "grad_norm": 1.4015207824111633, + "language_loss": 0.70712119, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.78395265, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12384033, + "step": 9626, + "time_per_iteration": 2.5232555866241455 + }, + { + "auxiliary_loss_clip": 0.06417292, + "auxiliary_loss_mlp": 0.01266097, + "balance_loss_clip": 0.06275232, + "balance_loss_mlp": 0.01255278, + "epoch": 0.5788065534345408, + "flos": 30011155361280.0, + "grad_norm": 1.650883867076693, + "language_loss": 0.71934295, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.79617685, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10827637, + "step": 9627, + "time_per_iteration": 2.5862505435943604 + }, + { + "auxiliary_loss_clip": 0.06419323, + "auxiliary_loss_mlp": 0.01268778, + "balance_loss_clip": 0.06276532, + "balance_loss_mlp": 0.01257643, + "epoch": 0.5788666766872088, + "flos": 24533526251520.0, + "grad_norm": 1.6845581870111699, + "language_loss": 0.84154361, + "learning_rate": 1.589143013764458e-06, + "loss": 0.91842461, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9628, + "time_per_iteration": 4.011742830276489 + }, + { + "auxiliary_loss_clip": 0.06420035, + "auxiliary_loss_mlp": 0.01267996, + "balance_loss_clip": 0.06274278, + "balance_loss_mlp": 0.01255443, + "epoch": 0.5789267999398767, + "flos": 23739649142400.0, + "grad_norm": 1.4211285900013286, + "language_loss": 0.72366357, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12548828, + "step": 9629, + "time_per_iteration": 2.535161018371582 + }, + { + "auxiliary_loss_clip": 0.06419079, + "auxiliary_loss_mlp": 0.01266785, + "balance_loss_clip": 0.06275524, + "balance_loss_mlp": 0.01254894, + "epoch": 0.5789869231925447, + "flos": 21140217054720.0, + "grad_norm": 1.8234862135922645, + "language_loss": 0.74396068, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.82081938, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11883545, + "step": 9630, + "time_per_iteration": 2.4906413555145264 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06270717, + "balance_loss_mlp": 0.0125344, + "epoch": 0.5790470464452127, + "flos": 21215086277760.0, + "grad_norm": 1.5521366007555986, + "language_loss": 0.78864127, + "learning_rate": 1.587999618060523e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11395264, + "step": 9631, + "time_per_iteration": 2.500326633453369 + }, + { + "auxiliary_loss_clip": 0.06417775, + "auxiliary_loss_mlp": 0.01264538, + "balance_loss_clip": 0.06272215, + "balance_loss_mlp": 0.01253147, + "epoch": 0.5791071696978807, + "flos": 23411144010240.0, + "grad_norm": 1.6622191818478913, + "language_loss": 0.7546376, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.83146071, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1138916, + "step": 9632, + "time_per_iteration": 2.5060648918151855 + }, + { + "auxiliary_loss_clip": 0.06419455, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06274837, + "balance_loss_mlp": 0.0125562, + "epoch": 0.5791672929505486, + "flos": 24213322673280.0, + "grad_norm": 1.7292582736877316, + "language_loss": 0.79532528, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.8721962, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12011719, + "step": 9633, + "time_per_iteration": 2.516359567642212 + }, + { + "auxiliary_loss_clip": 0.0643272, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_clip": 0.06278707, + "balance_loss_mlp": 0.01256635, + "epoch": 0.5792274162032166, + "flos": 24355094981760.0, + "grad_norm": 1.6340208840931036, + "language_loss": 0.7790345, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.85605538, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1272583, + "step": 9634, + "time_per_iteration": 2.541090488433838 + }, + { + "auxiliary_loss_clip": 0.06422533, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01255406, + "epoch": 0.5792875394558845, + "flos": 20455729850880.0, + "grad_norm": 1.975369322400224, + "language_loss": 0.64063549, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.71754158, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12652588, + "step": 9635, + "time_per_iteration": 2.4916157722473145 + }, + { + "auxiliary_loss_clip": 0.06417014, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01253327, + "epoch": 0.5793476627085525, + "flos": 24067064171520.0, + "grad_norm": 1.4766518541506428, + "language_loss": 0.77494228, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.85176682, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12115479, + "step": 9636, + "time_per_iteration": 2.516622304916382 + }, + { + "auxiliary_loss_clip": 0.06411137, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.01258226, + "epoch": 0.5794077859612206, + "flos": 22060799936640.0, + "grad_norm": 1.6556351940576073, + "language_loss": 0.68772542, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.76452249, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10333252, + "step": 9637, + "time_per_iteration": 2.509833812713623 + }, + { + "auxiliary_loss_clip": 0.06421766, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.01256784, + "epoch": 0.5794679092138885, + "flos": 11439245802240.0, + "grad_norm": 2.540580609640148, + "language_loss": 0.72712755, + "learning_rate": 1.585332242234043e-06, + "loss": 0.80403578, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.12261963, + "step": 9638, + "time_per_iteration": 2.4528071880340576 + }, + { + "auxiliary_loss_clip": 0.06416277, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.0125521, + "epoch": 0.5795280324665565, + "flos": 18886228623360.0, + "grad_norm": 1.607875789180523, + "language_loss": 0.72792935, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.80475545, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11120605, + "step": 9639, + "time_per_iteration": 2.510347604751587 + }, + { + "auxiliary_loss_clip": 0.06418437, + "auxiliary_loss_mlp": 0.01269692, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.01258332, + "epoch": 0.5795881557192244, + "flos": 13010969162880.0, + "grad_norm": 1.751039086833101, + "language_loss": 0.69813907, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.7750203, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11364746, + "step": 9640, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.06430758, + "auxiliary_loss_mlp": 0.01271889, + "balance_loss_clip": 0.0627775, + "balance_loss_mlp": 0.01259509, + "epoch": 0.5796482789718924, + "flos": 19937598929280.0, + "grad_norm": 2.3188274360648298, + "language_loss": 0.78378308, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8608095, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12371826, + "step": 9641, + "time_per_iteration": 2.487333059310913 + }, + { + "auxiliary_loss_clip": 0.06416615, + "auxiliary_loss_mlp": 0.01268516, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.01256685, + "epoch": 0.5797084022245603, + "flos": 21656880529920.0, + "grad_norm": 2.422042135441505, + "language_loss": 0.74201375, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.81886506, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1182251, + "step": 9642, + "time_per_iteration": 2.4917688369750977 + }, + { + "auxiliary_loss_clip": 0.06419542, + "auxiliary_loss_mlp": 0.01264152, + "balance_loss_clip": 0.06275794, + "balance_loss_mlp": 0.01252582, + "epoch": 0.5797685254772283, + "flos": 26038807724160.0, + "grad_norm": 1.4983613319397562, + "language_loss": 0.73538697, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.81222391, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11566162, + "step": 9643, + "time_per_iteration": 2.5357465744018555 + }, + { + "auxiliary_loss_clip": 0.06417159, + "auxiliary_loss_mlp": 0.01264721, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01253175, + "epoch": 0.5798286487298963, + "flos": 22710808385280.0, + "grad_norm": 1.6774180539317567, + "language_loss": 0.67605746, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.75287628, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11547852, + "step": 9644, + "time_per_iteration": 2.485366106033325 + }, + { + "auxiliary_loss_clip": 0.06425455, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06276956, + "balance_loss_mlp": 0.01256078, + "epoch": 0.5798887719825643, + "flos": 23155705238400.0, + "grad_norm": 2.0120452642465865, + "language_loss": 0.85497642, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.93191713, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12542725, + "step": 9645, + "time_per_iteration": 2.505467414855957 + }, + { + "auxiliary_loss_clip": 0.06418729, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06275458, + "balance_loss_mlp": 0.01255774, + "epoch": 0.5799488952352322, + "flos": 24432982951680.0, + "grad_norm": 1.7616171208033915, + "language_loss": 0.75737381, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.83422971, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11102295, + "step": 9646, + "time_per_iteration": 2.527848958969116 + }, + { + "auxiliary_loss_clip": 0.06425247, + "auxiliary_loss_mlp": 0.01268889, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256461, + "epoch": 0.5800090184879002, + "flos": 38404478995200.0, + "grad_norm": 1.7871006843554935, + "language_loss": 0.59099573, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6679371, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12426758, + "step": 9647, + "time_per_iteration": 2.643890142440796 + }, + { + "auxiliary_loss_clip": 0.06425125, + "auxiliary_loss_mlp": 0.01271805, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01259276, + "epoch": 0.5800691417405681, + "flos": 19789747200000.0, + "grad_norm": 1.4917917867847632, + "language_loss": 0.84483784, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.92180717, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12524414, + "step": 9648, + "time_per_iteration": 2.48917818069458 + }, + { + "auxiliary_loss_clip": 0.06311809, + "auxiliary_loss_mlp": 0.01252996, + "balance_loss_clip": 0.06251512, + "balance_loss_mlp": 0.01251245, + "epoch": 0.5801292649932361, + "flos": 70333514133120.0, + "grad_norm": 0.8366168453621474, + "language_loss": 0.63013005, + "learning_rate": 1.581142210256242e-06, + "loss": 0.70577806, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01756287, + "step": 9649, + "time_per_iteration": 3.167630434036255 + }, + { + "auxiliary_loss_clip": 0.064106, + "auxiliary_loss_mlp": 0.01264864, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253903, + "epoch": 0.5801893882459042, + "flos": 18740892516480.0, + "grad_norm": 1.6385207780550837, + "language_loss": 0.82320833, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.89996296, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10968018, + "step": 9650, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.06424958, + "auxiliary_loss_mlp": 0.01267787, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.01256194, + "epoch": 0.5802495114985721, + "flos": 15601973915520.0, + "grad_norm": 2.051158244012986, + "language_loss": 0.77640611, + "learning_rate": 1.580380592177698e-06, + "loss": 0.85333359, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11584473, + "step": 9651, + "time_per_iteration": 3.9003303050994873 + }, + { + "auxiliary_loss_clip": 0.06421195, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 0.0627306, + "balance_loss_mlp": 0.01258627, + "epoch": 0.5803096347512401, + "flos": 18260552586240.0, + "grad_norm": 1.678926948492491, + "language_loss": 0.74017727, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.81709743, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.12207031, + "step": 9652, + "time_per_iteration": 2.5226869583129883 + }, + { + "auxiliary_loss_clip": 0.0642662, + "auxiliary_loss_mlp": 0.01267654, + "balance_loss_clip": 0.06278314, + "balance_loss_mlp": 0.012559, + "epoch": 0.580369758003908, + "flos": 22899763342080.0, + "grad_norm": 1.9284827518212118, + "language_loss": 0.77118474, + "learning_rate": 1.579619037747193e-06, + "loss": 0.84812748, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11749268, + "step": 9653, + "time_per_iteration": 2.5736207962036133 + }, + { + "auxiliary_loss_clip": 0.06425463, + "auxiliary_loss_mlp": 0.01265074, + "balance_loss_clip": 0.06277624, + "balance_loss_mlp": 0.01252789, + "epoch": 0.580429881256576, + "flos": 18703646576640.0, + "grad_norm": 1.9366371532767657, + "language_loss": 0.75627828, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.83318365, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.1229248, + "step": 9654, + "time_per_iteration": 2.667048931121826 + }, + { + "auxiliary_loss_clip": 0.06413651, + "auxiliary_loss_mlp": 0.01265944, + "balance_loss_clip": 0.062739, + "balance_loss_mlp": 0.01254959, + "epoch": 0.5804900045092439, + "flos": 24689050629120.0, + "grad_norm": 1.638178903008904, + "language_loss": 0.70858634, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.78538227, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10992432, + "step": 9655, + "time_per_iteration": 2.5496294498443604 + }, + { + "auxiliary_loss_clip": 0.06424456, + "auxiliary_loss_mlp": 0.01266011, + "balance_loss_clip": 0.06273113, + "balance_loss_mlp": 0.0125378, + "epoch": 0.580550127761912, + "flos": 23119549401600.0, + "grad_norm": 2.0310142592924314, + "language_loss": 0.70043373, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.77733833, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12231445, + "step": 9656, + "time_per_iteration": 4.0007078647613525 + }, + { + "auxiliary_loss_clip": 0.06411725, + "auxiliary_loss_mlp": 0.01265789, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01255203, + "epoch": 0.5806102510145799, + "flos": 18481093332480.0, + "grad_norm": 1.6851014534608593, + "language_loss": 0.71761322, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.79438841, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 9657, + "time_per_iteration": 2.52081298828125 + }, + { + "auxiliary_loss_clip": 0.06426618, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06275696, + "balance_loss_mlp": 0.01252843, + "epoch": 0.5806703742672479, + "flos": 23922566605440.0, + "grad_norm": 1.7911249599131025, + "language_loss": 0.70450497, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.78142452, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12506104, + "step": 9658, + "time_per_iteration": 2.509723424911499 + }, + { + "auxiliary_loss_clip": 0.06307676, + "auxiliary_loss_mlp": 0.01252681, + "balance_loss_clip": 0.06247197, + "balance_loss_mlp": 0.01250939, + "epoch": 0.5807304975199158, + "flos": 66332096328960.0, + "grad_norm": 0.6445385314606554, + "language_loss": 0.53559077, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.61119437, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01747131, + "step": 9659, + "time_per_iteration": 3.164217233657837 + }, + { + "auxiliary_loss_clip": 0.0642177, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01254191, + "epoch": 0.5807906207725838, + "flos": 31730478888960.0, + "grad_norm": 1.678223545722946, + "language_loss": 0.62300181, + "learning_rate": 1.576954100136366e-06, + "loss": 0.69988132, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.11987305, + "step": 9660, + "time_per_iteration": 4.055291175842285 + }, + { + "auxiliary_loss_clip": 0.06418584, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06270796, + "balance_loss_mlp": 0.01256443, + "epoch": 0.5808507440252517, + "flos": 23807223060480.0, + "grad_norm": 1.5142376676823694, + "language_loss": 0.65793735, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.73480284, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11541748, + "step": 9661, + "time_per_iteration": 2.50545334815979 + }, + { + "auxiliary_loss_clip": 0.06409734, + "auxiliary_loss_mlp": 0.01265632, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01255565, + "epoch": 0.5809108672779197, + "flos": 13703464431360.0, + "grad_norm": 1.88238902360882, + "language_loss": 0.74297959, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.81973332, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10070801, + "step": 9662, + "time_per_iteration": 2.4924473762512207 + }, + { + "auxiliary_loss_clip": 0.06306686, + "auxiliary_loss_mlp": 0.01251122, + "balance_loss_clip": 0.06246165, + "balance_loss_mlp": 0.0124951, + "epoch": 0.5809709905305876, + "flos": 69157687386240.0, + "grad_norm": 0.8243605057954629, + "language_loss": 0.58189029, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.65746832, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.0161438, + "step": 9663, + "time_per_iteration": 3.215336799621582 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265807, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01254392, + "epoch": 0.5810311137832557, + "flos": 19833491831040.0, + "grad_norm": 2.48301510503896, + "language_loss": 0.82404405, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.90084743, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11413574, + "step": 9664, + "time_per_iteration": 2.663583278656006 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01263414, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01252423, + "epoch": 0.5810912370359237, + "flos": 29245635659520.0, + "grad_norm": 1.676690255308112, + "language_loss": 0.81861937, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.89544368, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10992432, + "step": 9665, + "time_per_iteration": 2.5936458110809326 + }, + { + "auxiliary_loss_clip": 0.06425443, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.0627546, + "balance_loss_mlp": 0.01257098, + "epoch": 0.5811513602885916, + "flos": 22792469788800.0, + "grad_norm": 1.7928396623098657, + "language_loss": 0.80963171, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.88657987, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12268066, + "step": 9666, + "time_per_iteration": 2.556262969970703 + }, + { + "auxiliary_loss_clip": 0.06412445, + "auxiliary_loss_mlp": 0.01266794, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256059, + "epoch": 0.5812114835412596, + "flos": 18740347464960.0, + "grad_norm": 1.6774912146747003, + "language_loss": 0.79895651, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.87574893, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.1072998, + "step": 9667, + "time_per_iteration": 3.980412483215332 + }, + { + "auxiliary_loss_clip": 0.06427534, + "auxiliary_loss_mlp": 0.01265338, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01252791, + "epoch": 0.5812716067939275, + "flos": 26438324791680.0, + "grad_norm": 1.482922365624984, + "language_loss": 0.79118401, + "learning_rate": 1.573909419957653e-06, + "loss": 0.86811268, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.12536621, + "step": 9668, + "time_per_iteration": 2.565986156463623 + }, + { + "auxiliary_loss_clip": 0.06418585, + "auxiliary_loss_mlp": 0.01270366, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.0125872, + "epoch": 0.5813317300465956, + "flos": 43407847595520.0, + "grad_norm": 1.832859625901051, + "language_loss": 0.64703673, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.72392619, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11657715, + "step": 9669, + "time_per_iteration": 2.804957151412964 + }, + { + "auxiliary_loss_clip": 0.06415828, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.0627243, + "balance_loss_mlp": 0.01254969, + "epoch": 0.5813918532992635, + "flos": 24791564499840.0, + "grad_norm": 1.4489654033865982, + "language_loss": 0.73791713, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.81473929, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11425781, + "step": 9670, + "time_per_iteration": 2.54849910736084 + }, + { + "auxiliary_loss_clip": 0.0641885, + "auxiliary_loss_mlp": 0.01269355, + "balance_loss_clip": 0.06272031, + "balance_loss_mlp": 0.0125822, + "epoch": 0.5814519765519315, + "flos": 22864068702720.0, + "grad_norm": 1.8471376195746119, + "language_loss": 0.79354227, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.87042427, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11138916, + "step": 9671, + "time_per_iteration": 2.553971529006958 + }, + { + "auxiliary_loss_clip": 0.06426669, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06274676, + "balance_loss_mlp": 0.01254685, + "epoch": 0.5815120998045994, + "flos": 24067651150080.0, + "grad_norm": 2.0867956489424495, + "language_loss": 0.61609662, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6930325, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.12219238, + "step": 9672, + "time_per_iteration": 2.5135464668273926 + }, + { + "auxiliary_loss_clip": 0.06413487, + "auxiliary_loss_mlp": 0.01267774, + "balance_loss_clip": 0.06271096, + "balance_loss_mlp": 0.01256735, + "epoch": 0.5815722230572674, + "flos": 24286305179520.0, + "grad_norm": 2.966012751852424, + "language_loss": 0.81724179, + "learning_rate": 1.572007019492342e-06, + "loss": 0.89405441, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1104126, + "step": 9673, + "time_per_iteration": 2.531637668609619 + }, + { + "auxiliary_loss_clip": 0.06422119, + "auxiliary_loss_mlp": 0.01271004, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01258976, + "epoch": 0.5816323463099353, + "flos": 22206932657280.0, + "grad_norm": 1.7930668974507213, + "language_loss": 0.88784432, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.9647755, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.12030029, + "step": 9674, + "time_per_iteration": 2.490135908126831 + }, + { + "auxiliary_loss_clip": 0.06420779, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01253799, + "epoch": 0.5816924695626033, + "flos": 24141388343040.0, + "grad_norm": 1.4439307600636533, + "language_loss": 0.78848791, + "learning_rate": 1.571246172811984e-06, + "loss": 0.86534023, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10656738, + "step": 9675, + "time_per_iteration": 2.570401191711426 + }, + { + "auxiliary_loss_clip": 0.06415851, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01252901, + "epoch": 0.5817525928152713, + "flos": 21330555603840.0, + "grad_norm": 2.1244098418378234, + "language_loss": 0.70489943, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.78169978, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11279297, + "step": 9676, + "time_per_iteration": 2.5234405994415283 + }, + { + "auxiliary_loss_clip": 0.06419084, + "auxiliary_loss_mlp": 0.01273498, + "balance_loss_clip": 0.06272397, + "balance_loss_mlp": 0.01262579, + "epoch": 0.5818127160679393, + "flos": 26940355729920.0, + "grad_norm": 2.3696751764318478, + "language_loss": 0.63762164, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.71454746, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10906982, + "step": 9677, + "time_per_iteration": 2.5408287048339844 + }, + { + "auxiliary_loss_clip": 0.06307964, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06247746, + "balance_loss_mlp": 0.01262844, + "epoch": 0.5818728393206073, + "flos": 63940779855360.0, + "grad_norm": 0.7897947317556949, + "language_loss": 0.54107881, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.61680651, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01963806, + "step": 9678, + "time_per_iteration": 3.1962106227874756 + }, + { + "auxiliary_loss_clip": 0.0631143, + "auxiliary_loss_mlp": 0.0126129, + "balance_loss_clip": 0.06251128, + "balance_loss_mlp": 0.01259724, + "epoch": 0.5819329625732752, + "flos": 64972654087680.0, + "grad_norm": 0.717265543619072, + "language_loss": 0.56126428, + "learning_rate": 1.569724674667319e-06, + "loss": 0.6369915, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01565552, + "step": 9679, + "time_per_iteration": 3.0475993156433105 + }, + { + "auxiliary_loss_clip": 0.06420414, + "auxiliary_loss_mlp": 0.01271497, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01260386, + "epoch": 0.5819930858259432, + "flos": 21221668823040.0, + "grad_norm": 1.5334769221386826, + "language_loss": 0.65937847, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.73629761, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11102295, + "step": 9680, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.06418791, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01255909, + "epoch": 0.5820532090786111, + "flos": 19463715763200.0, + "grad_norm": 1.789175734331282, + "language_loss": 0.84067512, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.91752815, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10601807, + "step": 9681, + "time_per_iteration": 2.4850056171417236 + }, + { + "auxiliary_loss_clip": 0.06416699, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272473, + "balance_loss_mlp": 0.01255908, + "epoch": 0.5821133323312792, + "flos": 17718424669440.0, + "grad_norm": 2.261651210831951, + "language_loss": 0.76110494, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.83794284, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11181641, + "step": 9682, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.06430176, + "auxiliary_loss_mlp": 0.01270705, + "balance_loss_clip": 0.06278756, + "balance_loss_mlp": 0.01258951, + "epoch": 0.5821734555839471, + "flos": 24578738328960.0, + "grad_norm": 2.1342093378293785, + "language_loss": 0.75805819, + "learning_rate": 1.568203437579977e-06, + "loss": 0.83506703, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11749268, + "step": 9683, + "time_per_iteration": 2.5426952838897705 + }, + { + "auxiliary_loss_clip": 0.06429425, + "auxiliary_loss_mlp": 0.01275466, + "balance_loss_clip": 0.06278548, + "balance_loss_mlp": 0.0126283, + "epoch": 0.5822335788366151, + "flos": 22388760017280.0, + "grad_norm": 1.6377653311732083, + "language_loss": 0.74168241, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.81873143, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.12646484, + "step": 9684, + "time_per_iteration": 2.521773338317871 + }, + { + "auxiliary_loss_clip": 0.06424329, + "auxiliary_loss_mlp": 0.01273987, + "balance_loss_clip": 0.06276318, + "balance_loss_mlp": 0.01262114, + "epoch": 0.582293702089283, + "flos": 26729458202880.0, + "grad_norm": 2.7880175036552446, + "language_loss": 0.78406078, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.86104393, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.11871338, + "step": 9685, + "time_per_iteration": 2.53759503364563 + }, + { + "auxiliary_loss_clip": 0.06423293, + "auxiliary_loss_mlp": 0.01274993, + "balance_loss_clip": 0.06276082, + "balance_loss_mlp": 0.0126337, + "epoch": 0.582353825341951, + "flos": 17354560314240.0, + "grad_norm": 1.6209571199936617, + "language_loss": 0.75622851, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.83321142, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11627197, + "step": 9686, + "time_per_iteration": 2.5203354358673096 + }, + { + "auxiliary_loss_clip": 0.06317171, + "auxiliary_loss_mlp": 0.01254478, + "balance_loss_clip": 0.06256813, + "balance_loss_mlp": 0.012529, + "epoch": 0.5824139485946189, + "flos": 55491133478400.0, + "grad_norm": 0.7976004724910164, + "language_loss": 0.57134593, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.64706242, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.01578522, + "step": 9687, + "time_per_iteration": 2.9669835567474365 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01255221, + "epoch": 0.582474071847287, + "flos": 20309261713920.0, + "grad_norm": 1.877177452165203, + "language_loss": 0.70002449, + "learning_rate": 1.566302259738727e-06, + "loss": 0.77692491, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.12109375, + "step": 9688, + "time_per_iteration": 2.506741762161255 + }, + { + "auxiliary_loss_clip": 0.06417575, + "auxiliary_loss_mlp": 0.01265264, + "balance_loss_clip": 0.0627282, + "balance_loss_mlp": 0.01254673, + "epoch": 0.5825341950999549, + "flos": 23884733687040.0, + "grad_norm": 2.896352551150335, + "language_loss": 0.65452719, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.73135561, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10595703, + "step": 9689, + "time_per_iteration": 2.506406784057617 + }, + { + "auxiliary_loss_clip": 0.06415856, + "auxiliary_loss_mlp": 0.01273228, + "balance_loss_clip": 0.06272023, + "balance_loss_mlp": 0.0126126, + "epoch": 0.5825943183526229, + "flos": 23119842890880.0, + "grad_norm": 1.995545981005341, + "language_loss": 0.73637474, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.81326556, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11956787, + "step": 9690, + "time_per_iteration": 3.9373486042022705 + }, + { + "auxiliary_loss_clip": 0.0642629, + "auxiliary_loss_mlp": 0.01267094, + "balance_loss_clip": 0.06275761, + "balance_loss_mlp": 0.01254887, + "epoch": 0.5826544416052909, + "flos": 22864152556800.0, + "grad_norm": 1.6091940048024238, + "language_loss": 0.76358879, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.84052265, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.12207031, + "step": 9691, + "time_per_iteration": 2.5036911964416504 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01270283, + "balance_loss_clip": 0.06274154, + "balance_loss_mlp": 0.0125906, + "epoch": 0.5827145648579588, + "flos": 31509560799360.0, + "grad_norm": 1.692225094183595, + "language_loss": 0.80700606, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.88393039, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11218262, + "step": 9692, + "time_per_iteration": 2.588819980621338 + }, + { + "auxiliary_loss_clip": 0.06307849, + "auxiliary_loss_mlp": 0.01251158, + "balance_loss_clip": 0.06247954, + "balance_loss_mlp": 0.01249412, + "epoch": 0.5827746881106268, + "flos": 69832028246400.0, + "grad_norm": 0.7844854120913538, + "language_loss": 0.5681411, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.64373118, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01751709, + "step": 9693, + "time_per_iteration": 3.1347033977508545 + }, + { + "auxiliary_loss_clip": 0.0641888, + "auxiliary_loss_mlp": 0.01268479, + "balance_loss_clip": 0.06273088, + "balance_loss_mlp": 0.0125815, + "epoch": 0.5828348113632947, + "flos": 23119088204160.0, + "grad_norm": 1.522522739802819, + "language_loss": 0.78923696, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.86611056, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10327148, + "step": 9694, + "time_per_iteration": 2.5068466663360596 + }, + { + "auxiliary_loss_clip": 0.06411383, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06271289, + "balance_loss_mlp": 0.01253302, + "epoch": 0.5828949346159628, + "flos": 21879769190400.0, + "grad_norm": 1.3653324202123376, + "language_loss": 0.76330042, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.84004748, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10021973, + "step": 9695, + "time_per_iteration": 2.556309700012207 + }, + { + "auxiliary_loss_clip": 0.06315481, + "auxiliary_loss_mlp": 0.01251352, + "balance_loss_clip": 0.06255624, + "balance_loss_mlp": 0.01249797, + "epoch": 0.5829550578686307, + "flos": 65985170497920.0, + "grad_norm": 0.7496740614083074, + "language_loss": 0.54866987, + "learning_rate": 1.563261231127095e-06, + "loss": 0.62433827, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01553345, + "step": 9696, + "time_per_iteration": 4.669760704040527 + }, + { + "auxiliary_loss_clip": 0.06418857, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01252893, + "epoch": 0.5830151811212987, + "flos": 16295391578880.0, + "grad_norm": 1.8785254946392194, + "language_loss": 0.76464188, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.84147352, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11413574, + "step": 9697, + "time_per_iteration": 2.5041255950927734 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272788, + "balance_loss_mlp": 0.01256668, + "epoch": 0.5830753043739666, + "flos": 24175447827840.0, + "grad_norm": 1.6024364882265518, + "language_loss": 0.77965522, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.85656625, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12188721, + "step": 9698, + "time_per_iteration": 2.5902624130249023 + }, + { + "auxiliary_loss_clip": 0.06415899, + "auxiliary_loss_mlp": 0.01273709, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01260894, + "epoch": 0.5831354276266346, + "flos": 27067438846080.0, + "grad_norm": 1.5547381527883266, + "language_loss": 0.84016132, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.91705739, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.12817383, + "step": 9699, + "time_per_iteration": 2.6469032764434814 + }, + { + "auxiliary_loss_clip": 0.0642215, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06274705, + "balance_loss_mlp": 0.01254104, + "epoch": 0.5831955508793025, + "flos": 23630301164160.0, + "grad_norm": 1.933998465104238, + "language_loss": 0.65971506, + "learning_rate": 1.561741113828305e-06, + "loss": 0.73659378, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.1161499, + "step": 9700, + "time_per_iteration": 3.9589943885803223 + }, + { + "auxiliary_loss_clip": 0.06417754, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.0627218, + "balance_loss_mlp": 0.01256086, + "epoch": 0.5832556741319705, + "flos": 24980267894400.0, + "grad_norm": 1.7460823027462598, + "language_loss": 0.71739107, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.79424536, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1159668, + "step": 9701, + "time_per_iteration": 2.591634511947632 + }, + { + "auxiliary_loss_clip": 0.06415233, + "auxiliary_loss_mlp": 0.01264901, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01253278, + "epoch": 0.5833157973846385, + "flos": 23228226547200.0, + "grad_norm": 1.7061750612547373, + "language_loss": 0.85686189, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.93366319, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11627197, + "step": 9702, + "time_per_iteration": 2.552055835723877 + }, + { + "auxiliary_loss_clip": 0.0641585, + "auxiliary_loss_mlp": 0.01263882, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01253511, + "epoch": 0.5833759206373065, + "flos": 21983876288640.0, + "grad_norm": 1.4269240656932136, + "language_loss": 0.78200948, + "learning_rate": 1.560601200301392e-06, + "loss": 0.85880685, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10369873, + "step": 9703, + "time_per_iteration": 2.500241279602051 + }, + { + "auxiliary_loss_clip": 0.06420664, + "auxiliary_loss_mlp": 0.01264639, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01252831, + "epoch": 0.5834360438899745, + "flos": 21768869911680.0, + "grad_norm": 1.5504614474031426, + "language_loss": 0.71309936, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.78995246, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11816406, + "step": 9704, + "time_per_iteration": 2.5374741554260254 + }, + { + "auxiliary_loss_clip": 0.06421441, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01256919, + "epoch": 0.5834961671426424, + "flos": 15997214424960.0, + "grad_norm": 1.6199693671180324, + "language_loss": 0.81965989, + "learning_rate": 1.559841341236335e-06, + "loss": 0.89654684, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10339355, + "step": 9705, + "time_per_iteration": 2.5450189113616943 + }, + { + "auxiliary_loss_clip": 0.06418713, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.01253379, + "epoch": 0.5835562903953104, + "flos": 22824600629760.0, + "grad_norm": 1.6206416307327924, + "language_loss": 0.80445373, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.88128448, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10986328, + "step": 9706, + "time_per_iteration": 2.5352673530578613 + }, + { + "auxiliary_loss_clip": 0.06415439, + "auxiliary_loss_mlp": 0.01273281, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261003, + "epoch": 0.5836164136479783, + "flos": 48478664332800.0, + "grad_norm": 1.6746295019388222, + "language_loss": 0.74755418, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.82444143, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1227417, + "step": 9707, + "time_per_iteration": 4.184760808944702 + }, + { + "auxiliary_loss_clip": 0.06414578, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01253463, + "epoch": 0.5836765369006464, + "flos": 26913172279680.0, + "grad_norm": 1.726633366654796, + "language_loss": 0.81783116, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.89461732, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10571289, + "step": 9708, + "time_per_iteration": 2.5494630336761475 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01267312, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01256404, + "epoch": 0.5837366601533143, + "flos": 20090230341120.0, + "grad_norm": 1.3928808196753693, + "language_loss": 0.78363276, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.86046088, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10906982, + "step": 9709, + "time_per_iteration": 2.54146409034729 + }, + { + "auxiliary_loss_clip": 0.06313366, + "auxiliary_loss_mlp": 0.01252195, + "balance_loss_clip": 0.06253533, + "balance_loss_mlp": 0.01250684, + "epoch": 0.5837967834059823, + "flos": 65383910726400.0, + "grad_norm": 0.7481338178050596, + "language_loss": 0.5665468, + "learning_rate": 1.557941985915844e-06, + "loss": 0.64220238, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.0151062, + "step": 9710, + "time_per_iteration": 3.130523443222046 + }, + { + "auxiliary_loss_clip": 0.06414168, + "auxiliary_loss_mlp": 0.01266687, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01256495, + "epoch": 0.5838569066586502, + "flos": 25345809331200.0, + "grad_norm": 1.5024705126599753, + "language_loss": 0.65656877, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.73337734, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10198975, + "step": 9711, + "time_per_iteration": 2.558560609817505 + }, + { + "auxiliary_loss_clip": 0.06425221, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.0125393, + "epoch": 0.5839170299113182, + "flos": 22234535377920.0, + "grad_norm": 1.9299970772651502, + "language_loss": 0.79264128, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.86955917, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12640381, + "step": 9712, + "time_per_iteration": 2.571164131164551 + }, + { + "auxiliary_loss_clip": 0.06417041, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01254021, + "epoch": 0.5839771531639861, + "flos": 22206513386880.0, + "grad_norm": 1.5054581881557743, + "language_loss": 0.73669749, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.81351602, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10784912, + "step": 9713, + "time_per_iteration": 2.5475780963897705 + }, + { + "auxiliary_loss_clip": 0.06424147, + "auxiliary_loss_mlp": 0.01265979, + "balance_loss_clip": 0.06274505, + "balance_loss_mlp": 0.01252932, + "epoch": 0.5840372764166541, + "flos": 22425964030080.0, + "grad_norm": 1.9255335004661567, + "language_loss": 0.70002109, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.77692235, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.13049316, + "step": 9714, + "time_per_iteration": 2.523638963699341 + }, + { + "auxiliary_loss_clip": 0.06419174, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.012543, + "epoch": 0.5840973996693221, + "flos": 19834330371840.0, + "grad_norm": 1.8598920078622099, + "language_loss": 0.80627859, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.88313133, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11798096, + "step": 9715, + "time_per_iteration": 2.5382297039031982 + }, + { + "auxiliary_loss_clip": 0.06417744, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.0627513, + "balance_loss_mlp": 0.01254194, + "epoch": 0.5841575229219901, + "flos": 21149482930560.0, + "grad_norm": 1.9876848107590372, + "language_loss": 0.73826301, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.81509537, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11303711, + "step": 9716, + "time_per_iteration": 2.5080726146698 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01264669, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01254202, + "epoch": 0.5842176461746581, + "flos": 24646521882240.0, + "grad_norm": 2.3723983049620876, + "language_loss": 0.75045407, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.82723433, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10467529, + "step": 9717, + "time_per_iteration": 2.5569300651550293 + }, + { + "auxiliary_loss_clip": 0.06420394, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06276444, + "balance_loss_mlp": 0.01254759, + "epoch": 0.584277769427326, + "flos": 19136468442240.0, + "grad_norm": 2.2457444336667343, + "language_loss": 0.80242944, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.87929225, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11132812, + "step": 9718, + "time_per_iteration": 2.5623273849487305 + }, + { + "auxiliary_loss_clip": 0.06421262, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.0627823, + "balance_loss_mlp": 0.01253117, + "epoch": 0.584337892679994, + "flos": 22681822072320.0, + "grad_norm": 1.5991831303569484, + "language_loss": 0.67348599, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.75034833, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11853027, + "step": 9719, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0641831, + "auxiliary_loss_mlp": 0.01263454, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01252022, + "epoch": 0.5843980159326619, + "flos": 31291954945920.0, + "grad_norm": 1.728104183061379, + "language_loss": 0.75697351, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.83379114, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11425781, + "step": 9720, + "time_per_iteration": 2.6132402420043945 + }, + { + "auxiliary_loss_clip": 0.06421956, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 0.06276225, + "balance_loss_mlp": 0.01255799, + "epoch": 0.58445813918533, + "flos": 22754846505600.0, + "grad_norm": 1.447216358863969, + "language_loss": 0.83020425, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.90709275, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11090088, + "step": 9721, + "time_per_iteration": 2.5127675533294678 + }, + { + "auxiliary_loss_clip": 0.06310159, + "auxiliary_loss_mlp": 0.01253726, + "balance_loss_clip": 0.06250554, + "balance_loss_mlp": 0.01252051, + "epoch": 0.5845182624379979, + "flos": 60704602992000.0, + "grad_norm": 0.9150346622366115, + "language_loss": 0.71186364, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.78750253, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01678467, + "step": 9722, + "time_per_iteration": 3.1494555473327637 + }, + { + "auxiliary_loss_clip": 0.06417061, + "auxiliary_loss_mlp": 0.01268389, + "balance_loss_clip": 0.06274655, + "balance_loss_mlp": 0.01257255, + "epoch": 0.5845783856906659, + "flos": 16367996741760.0, + "grad_norm": 1.9087918582550145, + "language_loss": 0.8944329, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.97128743, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11132812, + "step": 9723, + "time_per_iteration": 2.4576761722564697 + }, + { + "auxiliary_loss_clip": 0.06417491, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01256922, + "epoch": 0.5846385089433338, + "flos": 20089475654400.0, + "grad_norm": 1.3439404505357262, + "language_loss": 0.68925285, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.76610565, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10870361, + "step": 9724, + "time_per_iteration": 2.5088019371032715 + }, + { + "auxiliary_loss_clip": 0.06417604, + "auxiliary_loss_mlp": 0.01265081, + "balance_loss_clip": 0.06271344, + "balance_loss_mlp": 0.01252922, + "epoch": 0.5846986321960018, + "flos": 17316769322880.0, + "grad_norm": 2.3711774156816188, + "language_loss": 0.86716926, + "learning_rate": 1.552246441587197e-06, + "loss": 0.94399607, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.121521, + "step": 9725, + "time_per_iteration": 2.4511706829071045 + }, + { + "auxiliary_loss_clip": 0.06423703, + "auxiliary_loss_mlp": 0.0127082, + "balance_loss_clip": 0.06276515, + "balance_loss_mlp": 0.01258995, + "epoch": 0.5847587554486697, + "flos": 17202977078400.0, + "grad_norm": 1.45457124956925, + "language_loss": 0.8335436, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.91048884, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1182251, + "step": 9726, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.06418396, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01252957, + "epoch": 0.5848188787013378, + "flos": 24534993697920.0, + "grad_norm": 1.7434091697787477, + "language_loss": 0.67301726, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.10675049, + "step": 9727, + "time_per_iteration": 2.5283849239349365 + }, + { + "auxiliary_loss_clip": 0.06419774, + "auxiliary_loss_mlp": 0.01272592, + "balance_loss_clip": 0.06275018, + "balance_loss_mlp": 0.0126054, + "epoch": 0.5848790019540057, + "flos": 20634161120640.0, + "grad_norm": 1.6131340234861964, + "language_loss": 0.82272881, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.89965248, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.12060547, + "step": 9728, + "time_per_iteration": 2.5226187705993652 + }, + { + "auxiliary_loss_clip": 0.06412318, + "auxiliary_loss_mlp": 0.01270439, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01260198, + "epoch": 0.5849391252066737, + "flos": 22425377051520.0, + "grad_norm": 1.6963428440366448, + "language_loss": 0.78290164, + "learning_rate": 1.550728272957027e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10241699, + "step": 9729, + "time_per_iteration": 3.922197103500366 + }, + { + "auxiliary_loss_clip": 0.06418414, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272924, + "balance_loss_mlp": 0.01254228, + "epoch": 0.5849992484593417, + "flos": 25417995223680.0, + "grad_norm": 1.7817091958189777, + "language_loss": 0.71144295, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.78828371, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11419678, + "step": 9730, + "time_per_iteration": 2.5403687953948975 + }, + { + "auxiliary_loss_clip": 0.06422406, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01256188, + "epoch": 0.5850593717120096, + "flos": 21070840273920.0, + "grad_norm": 1.6620919701985222, + "language_loss": 0.78394347, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.86084819, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11883545, + "step": 9731, + "time_per_iteration": 2.5166611671447754 + }, + { + "auxiliary_loss_clip": 0.06415913, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.0627268, + "balance_loss_mlp": 0.01256088, + "epoch": 0.5851194949646776, + "flos": 25308605318400.0, + "grad_norm": 2.100344301849282, + "language_loss": 0.70174819, + "learning_rate": 1.549589825316528e-06, + "loss": 0.77858174, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11352539, + "step": 9732, + "time_per_iteration": 2.538188934326172 + }, + { + "auxiliary_loss_clip": 0.06423078, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06275669, + "balance_loss_mlp": 0.01256707, + "epoch": 0.5851796182173455, + "flos": 23594103400320.0, + "grad_norm": 2.4062469566098685, + "language_loss": 0.53286588, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.60979199, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.12823486, + "step": 9733, + "time_per_iteration": 2.511302947998047 + }, + { + "auxiliary_loss_clip": 0.06417008, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01256657, + "epoch": 0.5852397414700136, + "flos": 24828936220800.0, + "grad_norm": 2.0225140710518184, + "language_loss": 0.87949061, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.95634717, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.12005615, + "step": 9734, + "time_per_iteration": 2.538619041442871 + }, + { + "auxiliary_loss_clip": 0.06415038, + "auxiliary_loss_mlp": 0.01266318, + "balance_loss_clip": 0.06276681, + "balance_loss_mlp": 0.01255667, + "epoch": 0.5852998647226815, + "flos": 19943887985280.0, + "grad_norm": 1.4699537388912873, + "language_loss": 0.72430563, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.80111921, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10656738, + "step": 9735, + "time_per_iteration": 3.9566004276275635 + }, + { + "auxiliary_loss_clip": 0.06418768, + "auxiliary_loss_mlp": 0.01267652, + "balance_loss_clip": 0.06273651, + "balance_loss_mlp": 0.0125563, + "epoch": 0.5853599879753495, + "flos": 16724817354240.0, + "grad_norm": 2.1987965595401135, + "language_loss": 0.7462939, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.82315814, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12017822, + "step": 9736, + "time_per_iteration": 2.4270691871643066 + }, + { + "auxiliary_loss_clip": 0.06417002, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06274146, + "balance_loss_mlp": 0.0125241, + "epoch": 0.5854201112280174, + "flos": 44466848622720.0, + "grad_norm": 1.4975519288318198, + "language_loss": 0.7076987, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.78450084, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10791016, + "step": 9737, + "time_per_iteration": 2.744206190109253 + }, + { + "auxiliary_loss_clip": 0.06416388, + "auxiliary_loss_mlp": 0.01270708, + "balance_loss_clip": 0.06274648, + "balance_loss_mlp": 0.01259556, + "epoch": 0.5854802344806854, + "flos": 20345375623680.0, + "grad_norm": 1.6871127807078519, + "language_loss": 0.82840961, + "learning_rate": 1.547313391573169e-06, + "loss": 0.90528059, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11151123, + "step": 9738, + "time_per_iteration": 2.4849019050598145 + }, + { + "auxiliary_loss_clip": 0.06422549, + "auxiliary_loss_mlp": 0.01269287, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01257431, + "epoch": 0.5855403577333533, + "flos": 20927013540480.0, + "grad_norm": 1.6194676695443784, + "language_loss": 0.69157064, + "learning_rate": 1.546934045946082e-06, + "loss": 0.768489, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11846924, + "step": 9739, + "time_per_iteration": 3.941681146621704 + }, + { + "auxiliary_loss_clip": 0.0641816, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272583, + "balance_loss_mlp": 0.01255796, + "epoch": 0.5856004809860214, + "flos": 20454849383040.0, + "grad_norm": 2.1509507460713038, + "language_loss": 0.59265625, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.66951436, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11859131, + "step": 9740, + "time_per_iteration": 2.5459988117218018 + }, + { + "auxiliary_loss_clip": 0.06417701, + "auxiliary_loss_mlp": 0.01265897, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.0125487, + "epoch": 0.5856606042386893, + "flos": 19645962393600.0, + "grad_norm": 1.6784070122461718, + "language_loss": 0.75433791, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.83117396, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11029053, + "step": 9741, + "time_per_iteration": 2.488905668258667 + }, + { + "auxiliary_loss_clip": 0.06418155, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06272431, + "balance_loss_mlp": 0.01251857, + "epoch": 0.5857207274913573, + "flos": 21692072044800.0, + "grad_norm": 1.4885669249171192, + "language_loss": 0.76157856, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.83839613, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11743164, + "step": 9742, + "time_per_iteration": 2.5480451583862305 + }, + { + "auxiliary_loss_clip": 0.06415333, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.01254737, + "epoch": 0.5857808507440253, + "flos": 23188968109440.0, + "grad_norm": 1.7165353954706328, + "language_loss": 0.75240624, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.82922137, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11444092, + "step": 9743, + "time_per_iteration": 2.503702163696289 + }, + { + "auxiliary_loss_clip": 0.0641541, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01254944, + "epoch": 0.5858409739966932, + "flos": 27242683660800.0, + "grad_norm": 1.53753206771929, + "language_loss": 0.81320727, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.8900184, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10760498, + "step": 9744, + "time_per_iteration": 2.5923476219177246 + }, + { + "auxiliary_loss_clip": 0.06429034, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 0.06278567, + "balance_loss_mlp": 0.01256847, + "epoch": 0.5859010972493612, + "flos": 27862993036800.0, + "grad_norm": 1.7800190043611435, + "language_loss": 0.71494257, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.79192197, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.12072754, + "step": 9745, + "time_per_iteration": 2.5417301654815674 + }, + { + "auxiliary_loss_clip": 0.06318981, + "auxiliary_loss_mlp": 0.01251832, + "balance_loss_clip": 0.06258826, + "balance_loss_mlp": 0.01250336, + "epoch": 0.5859612205020291, + "flos": 70029452465280.0, + "grad_norm": 0.7182748841957548, + "language_loss": 0.53236032, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.60806841, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01495361, + "step": 9746, + "time_per_iteration": 4.6102893352508545 + }, + { + "auxiliary_loss_clip": 0.06421819, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01253907, + "epoch": 0.5860213437546972, + "flos": 24062032926720.0, + "grad_norm": 1.805241505686608, + "language_loss": 0.7322374, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.80910903, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11437988, + "step": 9747, + "time_per_iteration": 2.5299086570739746 + }, + { + "auxiliary_loss_clip": 0.06420729, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275554, + "balance_loss_mlp": 0.01253806, + "epoch": 0.5860814670073651, + "flos": 18952670511360.0, + "grad_norm": 1.7528078306488855, + "language_loss": 0.81229597, + "learning_rate": 1.543520710142051e-06, + "loss": 0.88915294, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.1116333, + "step": 9748, + "time_per_iteration": 2.5070362091064453 + }, + { + "auxiliary_loss_clip": 0.06422453, + "auxiliary_loss_mlp": 0.01268094, + "balance_loss_clip": 0.06275974, + "balance_loss_mlp": 0.01256674, + "epoch": 0.5861415902600331, + "flos": 22567904046720.0, + "grad_norm": 2.1315206911445217, + "language_loss": 0.72122687, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.7981323, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11419678, + "step": 9749, + "time_per_iteration": 2.5568935871124268 + }, + { + "auxiliary_loss_clip": 0.06413895, + "auxiliary_loss_mlp": 0.01265815, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01254765, + "epoch": 0.586201713512701, + "flos": 14397217511040.0, + "grad_norm": 2.3126679183899608, + "language_loss": 0.75373948, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.8305366, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11053467, + "step": 9750, + "time_per_iteration": 2.456709623336792 + }, + { + "auxiliary_loss_clip": 0.06418054, + "auxiliary_loss_mlp": 0.01267589, + "balance_loss_clip": 0.06274709, + "balance_loss_mlp": 0.01256091, + "epoch": 0.586261836765369, + "flos": 19504357793280.0, + "grad_norm": 1.5048801591853769, + "language_loss": 0.70914859, + "learning_rate": 1.542383242598344e-06, + "loss": 0.78600496, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11505127, + "step": 9751, + "time_per_iteration": 2.516965389251709 + }, + { + "auxiliary_loss_clip": 0.06427741, + "auxiliary_loss_mlp": 0.01267026, + "balance_loss_clip": 0.06278099, + "balance_loss_mlp": 0.01254748, + "epoch": 0.5863219600180369, + "flos": 20707688678400.0, + "grad_norm": 2.2695397417566134, + "language_loss": 0.74817115, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.82511884, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12280273, + "step": 9752, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.06419428, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06275827, + "balance_loss_mlp": 0.01255026, + "epoch": 0.586382083270705, + "flos": 19798258389120.0, + "grad_norm": 1.7375633359019997, + "language_loss": 0.77788973, + "learning_rate": 1.541625017642943e-06, + "loss": 0.85475028, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1159668, + "step": 9753, + "time_per_iteration": 2.5376296043395996 + }, + { + "auxiliary_loss_clip": 0.06415142, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06275599, + "balance_loss_mlp": 0.01256478, + "epoch": 0.5864422065233729, + "flos": 16504821659520.0, + "grad_norm": 1.5941521516898884, + "language_loss": 0.71418774, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.79100442, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 9754, + "time_per_iteration": 2.482060670852661 + }, + { + "auxiliary_loss_clip": 0.06418964, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01254706, + "epoch": 0.5865023297760409, + "flos": 20419657868160.0, + "grad_norm": 1.5122611907827943, + "language_loss": 0.72473872, + "learning_rate": 1.540866862214043e-06, + "loss": 0.80158961, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11407471, + "step": 9755, + "time_per_iteration": 2.5370032787323 + }, + { + "auxiliary_loss_clip": 0.06317496, + "auxiliary_loss_mlp": 0.01251101, + "balance_loss_clip": 0.06257688, + "balance_loss_mlp": 0.01249532, + "epoch": 0.5865624530287089, + "flos": 63369386864640.0, + "grad_norm": 0.7287908319651881, + "language_loss": 0.56949997, + "learning_rate": 1.540487810607967e-06, + "loss": 0.64518595, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.60058594, + "router_z_loss_mlp": 0.01570129, + "step": 9756, + "time_per_iteration": 3.10322904586792 + }, + { + "auxiliary_loss_clip": 0.06418074, + "auxiliary_loss_mlp": 0.01268383, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5866225762813768, + "flos": 27023610360960.0, + "grad_norm": 1.7386050489235434, + "language_loss": 0.76836097, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.84522557, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.10333252, + "step": 9757, + "time_per_iteration": 2.5645911693573 + }, + { + "auxiliary_loss_clip": 0.06316153, + "auxiliary_loss_mlp": 0.01253974, + "balance_loss_clip": 0.06255822, + "balance_loss_mlp": 0.01252219, + "epoch": 0.5866826995340448, + "flos": 73007941224960.0, + "grad_norm": 0.8367731636564993, + "language_loss": 0.60245061, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.67815191, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.01760864, + "step": 9758, + "time_per_iteration": 3.129420042037964 + }, + { + "auxiliary_loss_clip": 0.06425761, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06276144, + "balance_loss_mlp": 0.0125824, + "epoch": 0.5867428227867127, + "flos": 21291716436480.0, + "grad_norm": 2.341889353580635, + "language_loss": 0.7231499, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.80010581, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.11602783, + "step": 9759, + "time_per_iteration": 2.5044219493865967 + }, + { + "auxiliary_loss_clip": 0.06416983, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274659, + "balance_loss_mlp": 0.01254924, + "epoch": 0.5868029460393808, + "flos": 33476356961280.0, + "grad_norm": 1.459885556596891, + "language_loss": 0.73556709, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.8123973, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11114502, + "step": 9760, + "time_per_iteration": 2.662318229675293 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06273922, + "balance_loss_mlp": 0.01252944, + "epoch": 0.5868630692920487, + "flos": 17894382243840.0, + "grad_norm": 1.6271911446451897, + "language_loss": 0.7251972, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.80200839, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11352539, + "step": 9761, + "time_per_iteration": 2.635671377182007 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265487, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253274, + "epoch": 0.5869231925447167, + "flos": 21041770106880.0, + "grad_norm": 1.8098960680000724, + "language_loss": 0.74938971, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.8262558, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12200928, + "step": 9762, + "time_per_iteration": 2.511338472366333 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06275184, + "balance_loss_mlp": 0.01255766, + "epoch": 0.5869833157973846, + "flos": 74753288974080.0, + "grad_norm": 1.2323244190692502, + "language_loss": 0.72678411, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.80359328, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10723877, + "step": 9763, + "time_per_iteration": 2.966012716293335 + }, + { + "auxiliary_loss_clip": 0.06416167, + "auxiliary_loss_mlp": 0.01264221, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01253296, + "epoch": 0.5870434390500526, + "flos": 17644687476480.0, + "grad_norm": 1.6070407244149296, + "language_loss": 0.79883134, + "learning_rate": 1.53745602625755e-06, + "loss": 0.87563521, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10931396, + "step": 9764, + "time_per_iteration": 2.5360097885131836 + }, + { + "auxiliary_loss_clip": 0.06420099, + "auxiliary_loss_mlp": 0.01269959, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01258342, + "epoch": 0.5871035623027205, + "flos": 21512424890880.0, + "grad_norm": 2.0596306569779967, + "language_loss": 0.79149717, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.86839771, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1161499, + "step": 9765, + "time_per_iteration": 2.523232936859131 + }, + { + "auxiliary_loss_clip": 0.06413256, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06272542, + "balance_loss_mlp": 0.01254427, + "epoch": 0.5871636855553886, + "flos": 13556744732160.0, + "grad_norm": 1.6377752901078153, + "language_loss": 0.83660257, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.91338348, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10406494, + "step": 9766, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06423902, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01257639, + "epoch": 0.5872238088080565, + "flos": 26220006178560.0, + "grad_norm": 1.5173362705755495, + "language_loss": 0.69876915, + "learning_rate": 1.536319396136257e-06, + "loss": 0.77569771, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.11322021, + "step": 9767, + "time_per_iteration": 2.53935170173645 + }, + { + "auxiliary_loss_clip": 0.06416009, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.0125743, + "epoch": 0.5872839320607245, + "flos": 30673196870400.0, + "grad_norm": 6.458419959703109, + "language_loss": 0.64030594, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.71715188, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11151123, + "step": 9768, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.06324692, + "auxiliary_loss_mlp": 0.01254391, + "balance_loss_clip": 0.06264571, + "balance_loss_mlp": 0.01252818, + "epoch": 0.5873440553133924, + "flos": 60324623925120.0, + "grad_norm": 0.7185710562845293, + "language_loss": 0.53754711, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.61333793, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.01573944, + "step": 9769, + "time_per_iteration": 4.53153133392334 + }, + { + "auxiliary_loss_clip": 0.06416724, + "auxiliary_loss_mlp": 0.01267359, + "balance_loss_clip": 0.0627375, + "balance_loss_mlp": 0.01256409, + "epoch": 0.5874041785660604, + "flos": 21545016929280.0, + "grad_norm": 1.3491952646211745, + "language_loss": 0.70993185, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.78677267, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10949707, + "step": 9770, + "time_per_iteration": 2.5152831077575684 + }, + { + "auxiliary_loss_clip": 0.06416201, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274108, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5874643018187284, + "flos": 24395778938880.0, + "grad_norm": 1.9550841164663295, + "language_loss": 0.67880088, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.75564533, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11444092, + "step": 9771, + "time_per_iteration": 2.518069267272949 + }, + { + "auxiliary_loss_clip": 0.06421787, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.0125531, + "epoch": 0.5875244250713964, + "flos": 28155300405120.0, + "grad_norm": 1.4791048602495522, + "language_loss": 0.66491324, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.74181026, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.1260376, + "step": 9772, + "time_per_iteration": 2.5565338134765625 + }, + { + "auxiliary_loss_clip": 0.0642426, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06276119, + "balance_loss_mlp": 0.01258866, + "epoch": 0.5875845483240644, + "flos": 25819566716160.0, + "grad_norm": 1.5545187987766196, + "language_loss": 0.7466417, + "learning_rate": 1.534046611017519e-06, + "loss": 0.82359904, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.12615967, + "step": 9773, + "time_per_iteration": 2.533243179321289 + }, + { + "auxiliary_loss_clip": 0.06421398, + "auxiliary_loss_mlp": 0.0126674, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01255606, + "epoch": 0.5876446715767323, + "flos": 26913843112320.0, + "grad_norm": 1.8911636717759477, + "language_loss": 0.54071677, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.61759812, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11138916, + "step": 9774, + "time_per_iteration": 2.5565576553344727 + }, + { + "auxiliary_loss_clip": 0.06419463, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06276506, + "balance_loss_mlp": 0.01253192, + "epoch": 0.5877047948294003, + "flos": 36693750510720.0, + "grad_norm": 2.5652883668591886, + "language_loss": 0.65881801, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.73565692, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11242676, + "step": 9775, + "time_per_iteration": 4.102318525314331 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01267575, + "balance_loss_clip": 0.06274112, + "balance_loss_mlp": 0.01256459, + "epoch": 0.5877649180820682, + "flos": 26732057679360.0, + "grad_norm": 1.541611587459476, + "language_loss": 0.73877925, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.81564349, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11114502, + "step": 9776, + "time_per_iteration": 2.534105062484741 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01267161, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.0125586, + "epoch": 0.5878250413347362, + "flos": 21038457870720.0, + "grad_norm": 1.5037279013590201, + "language_loss": 0.7431531, + "learning_rate": 1.532531774126821e-06, + "loss": 0.81998503, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11315918, + "step": 9777, + "time_per_iteration": 2.501791000366211 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01267719, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257407, + "epoch": 0.5878851645874041, + "flos": 25491397000320.0, + "grad_norm": 1.389592011343503, + "language_loss": 0.74136406, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.81816691, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10302734, + "step": 9778, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06416066, + "auxiliary_loss_mlp": 0.01272779, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01261293, + "epoch": 0.5879452878400722, + "flos": 23775930760320.0, + "grad_norm": 1.6684393614308786, + "language_loss": 0.70061487, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.77750337, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.11486816, + "step": 9779, + "time_per_iteration": 3.9999070167541504 + }, + { + "auxiliary_loss_clip": 0.06419669, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06273909, + "balance_loss_mlp": 0.0125331, + "epoch": 0.5880054110927401, + "flos": 17830749467520.0, + "grad_norm": 1.9325071243234666, + "language_loss": 0.67414713, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.75099313, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11633301, + "step": 9780, + "time_per_iteration": 2.525421142578125 + }, + { + "auxiliary_loss_clip": 0.06422442, + "auxiliary_loss_mlp": 0.01271374, + "balance_loss_clip": 0.0627559, + "balance_loss_mlp": 0.0125981, + "epoch": 0.5880655343454081, + "flos": 19469417840640.0, + "grad_norm": 1.9086155780635632, + "language_loss": 0.73100537, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.80794352, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11572266, + "step": 9781, + "time_per_iteration": 2.4647257328033447 + }, + { + "auxiliary_loss_clip": 0.06415875, + "auxiliary_loss_mlp": 0.01269752, + "balance_loss_clip": 0.06273176, + "balance_loss_mlp": 0.01258731, + "epoch": 0.588125657598076, + "flos": 21403999307520.0, + "grad_norm": 1.283507981192047, + "language_loss": 0.7022016, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.77905786, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11016846, + "step": 9782, + "time_per_iteration": 2.531780481338501 + }, + { + "auxiliary_loss_clip": 0.06420694, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274669, + "balance_loss_mlp": 0.01256314, + "epoch": 0.588185780850744, + "flos": 16040246296320.0, + "grad_norm": 2.020771184042221, + "language_loss": 0.71036118, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.78724945, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11804199, + "step": 9783, + "time_per_iteration": 2.452061176300049 + }, + { + "auxiliary_loss_clip": 0.06426281, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06277394, + "balance_loss_mlp": 0.01255538, + "epoch": 0.588245904103412, + "flos": 23734282481280.0, + "grad_norm": 1.861465214251895, + "language_loss": 0.69312334, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.77006149, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12005615, + "step": 9784, + "time_per_iteration": 2.552767515182495 + }, + { + "auxiliary_loss_clip": 0.06421058, + "auxiliary_loss_mlp": 0.01265879, + "balance_loss_clip": 0.06273105, + "balance_loss_mlp": 0.01254596, + "epoch": 0.58830602735608, + "flos": 33810983441280.0, + "grad_norm": 1.7066395827536198, + "language_loss": 0.69576097, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.77263039, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.112854, + "step": 9785, + "time_per_iteration": 3.9847395420074463 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01255352, + "epoch": 0.588366150608748, + "flos": 17096144722560.0, + "grad_norm": 1.8665479354272698, + "language_loss": 0.78022271, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.85703707, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10369873, + "step": 9786, + "time_per_iteration": 2.4842867851257324 + }, + { + "auxiliary_loss_clip": 0.06423976, + "auxiliary_loss_mlp": 0.01266691, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01255128, + "epoch": 0.5884262738614159, + "flos": 22133698588800.0, + "grad_norm": 1.4734886628165487, + "language_loss": 0.78796208, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.86486876, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.11566162, + "step": 9787, + "time_per_iteration": 2.497192144393921 + }, + { + "auxiliary_loss_clip": 0.06418703, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06275064, + "balance_loss_mlp": 0.01254617, + "epoch": 0.5884863971140839, + "flos": 21038038600320.0, + "grad_norm": 1.5088398107909506, + "language_loss": 0.66488671, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.74172926, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10943604, + "step": 9788, + "time_per_iteration": 2.5208425521850586 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01255212, + "epoch": 0.5885465203667518, + "flos": 23811835034880.0, + "grad_norm": 2.124690797246634, + "language_loss": 0.8100794, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.88691187, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11651611, + "step": 9789, + "time_per_iteration": 2.497751235961914 + }, + { + "auxiliary_loss_clip": 0.06413969, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257432, + "epoch": 0.5886066436194198, + "flos": 18886647893760.0, + "grad_norm": 1.5219157367370164, + "language_loss": 0.69998693, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.77680737, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10638428, + "step": 9790, + "time_per_iteration": 2.5238122940063477 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06275025, + "balance_loss_mlp": 0.01254484, + "epoch": 0.5886667668720877, + "flos": 24797015015040.0, + "grad_norm": 1.9547129753533632, + "language_loss": 0.83327186, + "learning_rate": 1.527232084570895e-06, + "loss": 0.91010225, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11651611, + "step": 9791, + "time_per_iteration": 2.518833637237549 + }, + { + "auxiliary_loss_clip": 0.06420578, + "auxiliary_loss_mlp": 0.01270103, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01259297, + "epoch": 0.5887268901247558, + "flos": 21620473130880.0, + "grad_norm": 1.5293641441028467, + "language_loss": 0.76486295, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.84176975, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1081543, + "step": 9792, + "time_per_iteration": 2.5101959705352783 + }, + { + "auxiliary_loss_clip": 0.06421857, + "auxiliary_loss_mlp": 0.01269547, + "balance_loss_clip": 0.06273879, + "balance_loss_mlp": 0.01258424, + "epoch": 0.5887870133774237, + "flos": 20487357567360.0, + "grad_norm": 2.1847202997614477, + "language_loss": 0.69169068, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.76860476, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.11114502, + "step": 9793, + "time_per_iteration": 2.4927995204925537 + }, + { + "auxiliary_loss_clip": 0.06418081, + "auxiliary_loss_mlp": 0.01269605, + "balance_loss_clip": 0.06276278, + "balance_loss_mlp": 0.01258483, + "epoch": 0.5888471366300917, + "flos": 19211966570880.0, + "grad_norm": 1.7416997591947727, + "language_loss": 0.60439771, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.68127453, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11132812, + "step": 9794, + "time_per_iteration": 2.543231248855591 + }, + { + "auxiliary_loss_clip": 0.06420963, + "auxiliary_loss_mlp": 0.01267396, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01256113, + "epoch": 0.5889072598827596, + "flos": 19978786010880.0, + "grad_norm": 1.5723031838894885, + "language_loss": 0.65483499, + "learning_rate": 1.525718531219257e-06, + "loss": 0.73171854, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11291504, + "step": 9795, + "time_per_iteration": 2.502537965774536 + }, + { + "auxiliary_loss_clip": 0.06414207, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06274657, + "balance_loss_mlp": 0.01255197, + "epoch": 0.5889673831354276, + "flos": 20747617948800.0, + "grad_norm": 1.4841948976653832, + "language_loss": 0.74256188, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.81936008, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10418701, + "step": 9796, + "time_per_iteration": 2.496511220932007 + }, + { + "auxiliary_loss_clip": 0.06417978, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01256238, + "epoch": 0.5890275063880956, + "flos": 25307892558720.0, + "grad_norm": 2.3243895650299566, + "language_loss": 0.83142781, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.90828037, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11047363, + "step": 9797, + "time_per_iteration": 2.5991365909576416 + }, + { + "auxiliary_loss_clip": 0.06417, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01250806, + "epoch": 0.5890876296407636, + "flos": 11770182702720.0, + "grad_norm": 1.5626242229143896, + "language_loss": 0.79473782, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.87152421, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1083374, + "step": 9798, + "time_per_iteration": 2.5399045944213867 + }, + { + "auxiliary_loss_clip": 0.06414175, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06274281, + "balance_loss_mlp": 0.01254584, + "epoch": 0.5891477528934316, + "flos": 13594535723520.0, + "grad_norm": 2.254418827792415, + "language_loss": 0.75000322, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.82679403, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10321045, + "step": 9799, + "time_per_iteration": 2.4642131328582764 + }, + { + "auxiliary_loss_clip": 0.06420485, + "auxiliary_loss_mlp": 0.01266976, + "balance_loss_clip": 0.06274568, + "balance_loss_mlp": 0.01254798, + "epoch": 0.5892078761460995, + "flos": 15054563191680.0, + "grad_norm": 1.9320779180150096, + "language_loss": 0.76666486, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.84353948, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12182617, + "step": 9800, + "time_per_iteration": 2.5170304775238037 + }, + { + "auxiliary_loss_clip": 0.06421179, + "auxiliary_loss_mlp": 0.01264846, + "balance_loss_clip": 0.06275316, + "balance_loss_mlp": 0.0125361, + "epoch": 0.5892679993987675, + "flos": 15783591640320.0, + "grad_norm": 1.6350760782373632, + "language_loss": 0.79415876, + "learning_rate": 1.523448741022722e-06, + "loss": 0.87101901, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11242676, + "step": 9801, + "time_per_iteration": 2.4804494380950928 + }, + { + "auxiliary_loss_clip": 0.06421967, + "auxiliary_loss_mlp": 0.01265274, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01253467, + "epoch": 0.5893281226514354, + "flos": 25272281773440.0, + "grad_norm": 1.6257193775599612, + "language_loss": 0.6664654, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.74333781, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11804199, + "step": 9802, + "time_per_iteration": 2.536524534225464 + }, + { + "auxiliary_loss_clip": 0.06417859, + "auxiliary_loss_mlp": 0.01267449, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01256475, + "epoch": 0.5893882459041034, + "flos": 19463380346880.0, + "grad_norm": 2.7221530495776953, + "language_loss": 0.78339422, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.86024731, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10986328, + "step": 9803, + "time_per_iteration": 2.4658396244049072 + }, + { + "auxiliary_loss_clip": 0.06422158, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01255143, + "epoch": 0.5894483691567713, + "flos": 20640785592960.0, + "grad_norm": 1.3509589673333673, + "language_loss": 0.73070806, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.80759096, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.10986328, + "step": 9804, + "time_per_iteration": 2.5561769008636475 + }, + { + "auxiliary_loss_clip": 0.06421436, + "auxiliary_loss_mlp": 0.01267021, + "balance_loss_clip": 0.06279321, + "balance_loss_mlp": 0.01255779, + "epoch": 0.5895084924094394, + "flos": 17782812132480.0, + "grad_norm": 4.893575785915148, + "language_loss": 0.74802667, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.82491124, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11254883, + "step": 9805, + "time_per_iteration": 2.4777255058288574 + }, + { + "auxiliary_loss_clip": 0.06430615, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06278822, + "balance_loss_mlp": 0.01254542, + "epoch": 0.5895686156621073, + "flos": 20127350499840.0, + "grad_norm": 1.9675390106462767, + "language_loss": 0.78339982, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8603704, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11901855, + "step": 9806, + "time_per_iteration": 2.556187868118286 + }, + { + "auxiliary_loss_clip": 0.06426841, + "auxiliary_loss_mlp": 0.01268335, + "balance_loss_clip": 0.06283563, + "balance_loss_mlp": 0.01256813, + "epoch": 0.5896287389147753, + "flos": 20856337021440.0, + "grad_norm": 1.8953677951134942, + "language_loss": 0.77413982, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.85109162, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11523438, + "step": 9807, + "time_per_iteration": 2.519200325012207 + }, + { + "auxiliary_loss_clip": 0.06425367, + "auxiliary_loss_mlp": 0.01268029, + "balance_loss_clip": 0.06276954, + "balance_loss_mlp": 0.01256341, + "epoch": 0.5896888621674432, + "flos": 14543098669440.0, + "grad_norm": 1.5805632295861456, + "language_loss": 0.75183058, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.82876456, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11694336, + "step": 9808, + "time_per_iteration": 3.908586025238037 + }, + { + "auxiliary_loss_clip": 0.06422409, + "auxiliary_loss_mlp": 0.01266023, + "balance_loss_clip": 0.06277257, + "balance_loss_mlp": 0.01253912, + "epoch": 0.5897489854201112, + "flos": 20893079836800.0, + "grad_norm": 1.9290339931200338, + "language_loss": 0.71909666, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.79598099, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12103271, + "step": 9809, + "time_per_iteration": 2.5768144130706787 + }, + { + "auxiliary_loss_clip": 0.06423716, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.0627635, + "balance_loss_mlp": 0.01254272, + "epoch": 0.5898091086727792, + "flos": 20017331688960.0, + "grad_norm": 2.0062119760557473, + "language_loss": 0.82969332, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.90659165, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.1184082, + "step": 9810, + "time_per_iteration": 2.5024096965789795 + }, + { + "auxiliary_loss_clip": 0.06418087, + "auxiliary_loss_mlp": 0.01268409, + "balance_loss_clip": 0.06278655, + "balance_loss_mlp": 0.01257394, + "epoch": 0.5898692319254472, + "flos": 16258816471680.0, + "grad_norm": 2.656719323590735, + "language_loss": 0.81247234, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8893373, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11016846, + "step": 9811, + "time_per_iteration": 2.5079774856567383 + }, + { + "auxiliary_loss_clip": 0.06424809, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06278014, + "balance_loss_mlp": 0.01254442, + "epoch": 0.5899293551781152, + "flos": 20454723601920.0, + "grad_norm": 1.7175276958807264, + "language_loss": 0.7698791, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.84679055, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11883545, + "step": 9812, + "time_per_iteration": 2.4813108444213867 + }, + { + "auxiliary_loss_clip": 0.06419283, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.0627578, + "balance_loss_mlp": 0.01253992, + "epoch": 0.5899894784307831, + "flos": 13886885018880.0, + "grad_norm": 1.6786934004730485, + "language_loss": 0.71137106, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.78820813, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10437012, + "step": 9813, + "time_per_iteration": 2.5212063789367676 + }, + { + "auxiliary_loss_clip": 0.0641876, + "auxiliary_loss_mlp": 0.01270874, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01259394, + "epoch": 0.5900496016834511, + "flos": 20089936851840.0, + "grad_norm": 1.420675326684763, + "language_loss": 0.7244218, + "learning_rate": 1.518533098148494e-06, + "loss": 0.80131817, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11468506, + "step": 9814, + "time_per_iteration": 2.4773387908935547 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06276704, + "balance_loss_mlp": 0.01256768, + "epoch": 0.590109724936119, + "flos": 20264133490560.0, + "grad_norm": 1.7152732807584992, + "language_loss": 0.7885775, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.86546993, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 9815, + "time_per_iteration": 3.939445972442627 + }, + { + "auxiliary_loss_clip": 0.06427211, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01255142, + "epoch": 0.590169848188787, + "flos": 24240548050560.0, + "grad_norm": 1.7218203048390952, + "language_loss": 0.76316988, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.84011579, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.12243652, + "step": 9816, + "time_per_iteration": 2.5245048999786377 + }, + { + "auxiliary_loss_clip": 0.06419881, + "auxiliary_loss_mlp": 0.01267479, + "balance_loss_clip": 0.06277047, + "balance_loss_mlp": 0.01255725, + "epoch": 0.590229971441455, + "flos": 17790400926720.0, + "grad_norm": 1.8371364848215923, + "language_loss": 0.81572855, + "learning_rate": 1.517399156051309e-06, + "loss": 0.89260209, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11743164, + "step": 9817, + "time_per_iteration": 2.4621410369873047 + }, + { + "auxiliary_loss_clip": 0.06418833, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06274004, + "balance_loss_mlp": 0.01257544, + "epoch": 0.590290094694123, + "flos": 22243465837440.0, + "grad_norm": 1.5541077044812335, + "language_loss": 0.76864719, + "learning_rate": 1.517021211933682e-06, + "loss": 0.84551811, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10717773, + "step": 9818, + "time_per_iteration": 2.5125410556793213 + }, + { + "auxiliary_loss_clip": 0.06416667, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255501, + "epoch": 0.5903502179467909, + "flos": 19104589163520.0, + "grad_norm": 1.8321116335564553, + "language_loss": 0.67227435, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.74909973, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10369873, + "step": 9819, + "time_per_iteration": 4.011074066162109 + }, + { + "auxiliary_loss_clip": 0.06420997, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01255819, + "epoch": 0.5904103411994589, + "flos": 24241051175040.0, + "grad_norm": 1.4923193447304384, + "language_loss": 0.7829935, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.85986888, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.10723877, + "step": 9820, + "time_per_iteration": 2.5523388385772705 + }, + { + "auxiliary_loss_clip": 0.06318125, + "auxiliary_loss_mlp": 0.01254512, + "balance_loss_clip": 0.06258737, + "balance_loss_mlp": 0.01253092, + "epoch": 0.5904704644521268, + "flos": 64894388774400.0, + "grad_norm": 0.9340841048050909, + "language_loss": 0.65183949, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.72756588, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01417542, + "step": 9821, + "time_per_iteration": 3.1619784832000732 + }, + { + "auxiliary_loss_clip": 0.06416959, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01251935, + "epoch": 0.5905305877047948, + "flos": 19616137539840.0, + "grad_norm": 2.101599923194391, + "language_loss": 0.6190716, + "learning_rate": 1.515509618752521e-06, + "loss": 0.69586486, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10412598, + "step": 9822, + "time_per_iteration": 2.519482374191284 + }, + { + "auxiliary_loss_clip": 0.06419894, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06275024, + "balance_loss_mlp": 0.01257365, + "epoch": 0.5905907109574628, + "flos": 18995660455680.0, + "grad_norm": 1.8507285157055846, + "language_loss": 0.82910419, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.90599167, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1149292, + "step": 9823, + "time_per_iteration": 2.5134451389312744 + }, + { + "auxiliary_loss_clip": 0.06417045, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01256546, + "epoch": 0.5906508342101308, + "flos": 22206974584320.0, + "grad_norm": 1.8772651852061113, + "language_loss": 0.73388183, + "learning_rate": 1.514753932336165e-06, + "loss": 0.81072783, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11004639, + "step": 9824, + "time_per_iteration": 3.8841147422790527 + }, + { + "auxiliary_loss_clip": 0.064331, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255013, + "epoch": 0.5907109574627988, + "flos": 20892995982720.0, + "grad_norm": 1.9523854086350827, + "language_loss": 0.82938302, + "learning_rate": 1.514376116721693e-06, + "loss": 0.90639031, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.12609863, + "step": 9825, + "time_per_iteration": 2.527808427810669 + }, + { + "auxiliary_loss_clip": 0.06417271, + "auxiliary_loss_mlp": 0.01264281, + "balance_loss_clip": 0.06277614, + "balance_loss_mlp": 0.0125422, + "epoch": 0.5907710807154667, + "flos": 21513011869440.0, + "grad_norm": 1.8272335212588457, + "language_loss": 0.76679188, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.84360743, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10058594, + "step": 9826, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.06416261, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06274769, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5908312039681347, + "flos": 22024979516160.0, + "grad_norm": 1.5050840799955296, + "language_loss": 0.7292102, + "learning_rate": 1.513620540751793e-06, + "loss": 0.80599833, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10131836, + "step": 9827, + "time_per_iteration": 2.5261569023132324 + }, + { + "auxiliary_loss_clip": 0.06419525, + "auxiliary_loss_mlp": 0.01266997, + "balance_loss_clip": 0.0627335, + "balance_loss_mlp": 0.0125588, + "epoch": 0.5908913272208026, + "flos": 18485579525760.0, + "grad_norm": 1.8170415974974599, + "language_loss": 0.80223072, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.87909591, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11120605, + "step": 9828, + "time_per_iteration": 2.4725866317749023 + }, + { + "auxiliary_loss_clip": 0.06421993, + "auxiliary_loss_mlp": 0.01272492, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01260375, + "epoch": 0.5909514504734706, + "flos": 12317006448000.0, + "grad_norm": 1.8455350152663679, + "language_loss": 0.88620806, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.96315295, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12133789, + "step": 9829, + "time_per_iteration": 2.4783804416656494 + }, + { + "auxiliary_loss_clip": 0.06324679, + "auxiliary_loss_mlp": 0.01254341, + "balance_loss_clip": 0.06265787, + "balance_loss_mlp": 0.01252693, + "epoch": 0.5910115737261386, + "flos": 70233557811840.0, + "grad_norm": 0.7549892406299625, + "language_loss": 0.57903004, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.6548202, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01651001, + "step": 9830, + "time_per_iteration": 3.0390307903289795 + }, + { + "auxiliary_loss_clip": 0.0643173, + "auxiliary_loss_mlp": 0.01269908, + "balance_loss_clip": 0.06281478, + "balance_loss_mlp": 0.01257308, + "epoch": 0.5910716969788066, + "flos": 22024266756480.0, + "grad_norm": 2.1560619163105965, + "language_loss": 0.75963652, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.83665287, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.12591553, + "step": 9831, + "time_per_iteration": 2.5367510318756104 + }, + { + "auxiliary_loss_clip": 0.06409759, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01252124, + "epoch": 0.5911318202314745, + "flos": 21258034295040.0, + "grad_norm": 1.5753423885742641, + "language_loss": 0.77885556, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.85558021, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10583496, + "step": 9832, + "time_per_iteration": 2.504584789276123 + }, + { + "auxiliary_loss_clip": 0.06416824, + "auxiliary_loss_mlp": 0.01265662, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01254797, + "epoch": 0.5911919434841425, + "flos": 17827353377280.0, + "grad_norm": 1.6998910709640538, + "language_loss": 0.83265263, + "learning_rate": 1.511354255945847e-06, + "loss": 0.90947747, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10864258, + "step": 9833, + "time_per_iteration": 2.508920192718506 + }, + { + "auxiliary_loss_clip": 0.06420296, + "auxiliary_loss_mlp": 0.01269729, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01259006, + "epoch": 0.5912520667368104, + "flos": 20380818700800.0, + "grad_norm": 1.4145847544307324, + "language_loss": 0.74488783, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.82178807, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10723877, + "step": 9834, + "time_per_iteration": 2.515340566635132 + }, + { + "auxiliary_loss_clip": 0.06420908, + "auxiliary_loss_mlp": 0.0126652, + "balance_loss_clip": 0.06276181, + "balance_loss_mlp": 0.01255308, + "epoch": 0.5913121899894784, + "flos": 17936240158080.0, + "grad_norm": 2.2554155860211296, + "language_loss": 0.78118962, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.85806394, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11212158, + "step": 9835, + "time_per_iteration": 2.516449213027954 + }, + { + "auxiliary_loss_clip": 0.06422424, + "auxiliary_loss_mlp": 0.01268422, + "balance_loss_clip": 0.06274521, + "balance_loss_mlp": 0.0125724, + "epoch": 0.5913723132421465, + "flos": 22133405099520.0, + "grad_norm": 1.7910918924229287, + "language_loss": 0.74562353, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.82253206, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.11187744, + "step": 9836, + "time_per_iteration": 2.4944818019866943 + }, + { + "auxiliary_loss_clip": 0.06421088, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06274953, + "balance_loss_mlp": 0.01252396, + "epoch": 0.5914324364948144, + "flos": 15702056017920.0, + "grad_norm": 1.9466597288818261, + "language_loss": 0.82267582, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.89952636, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11572266, + "step": 9837, + "time_per_iteration": 2.5073657035827637 + }, + { + "auxiliary_loss_clip": 0.06423111, + "auxiliary_loss_mlp": 0.01265723, + "balance_loss_clip": 0.06276567, + "balance_loss_mlp": 0.01253665, + "epoch": 0.5914925597474824, + "flos": 22753924110720.0, + "grad_norm": 1.6146002375859378, + "language_loss": 0.7983368, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.87522513, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1206665, + "step": 9838, + "time_per_iteration": 2.5024936199188232 + }, + { + "auxiliary_loss_clip": 0.06421801, + "auxiliary_loss_mlp": 0.01267887, + "balance_loss_clip": 0.06276052, + "balance_loss_mlp": 0.01256503, + "epoch": 0.5915526830001503, + "flos": 18298092015360.0, + "grad_norm": 1.7930328536333848, + "language_loss": 0.70194936, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.77884626, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11383057, + "step": 9839, + "time_per_iteration": 2.5000133514404297 + }, + { + "auxiliary_loss_clip": 0.06421608, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5916128062528183, + "flos": 17024713516800.0, + "grad_norm": 2.2460586823912254, + "language_loss": 0.65840614, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.73527294, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.10614014, + "step": 9840, + "time_per_iteration": 2.472325325012207 + }, + { + "auxiliary_loss_clip": 0.06421183, + "auxiliary_loss_mlp": 0.01269035, + "balance_loss_clip": 0.06273993, + "balance_loss_mlp": 0.01257019, + "epoch": 0.5916729295054862, + "flos": 24761194594560.0, + "grad_norm": 7.488465580129743, + "language_loss": 0.82013118, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.89703333, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.12023926, + "step": 9841, + "time_per_iteration": 2.539569139480591 + }, + { + "auxiliary_loss_clip": 0.06417108, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06275231, + "balance_loss_mlp": 0.01255782, + "epoch": 0.5917330527581542, + "flos": 15963196867200.0, + "grad_norm": 1.7355438933283587, + "language_loss": 0.69817364, + "learning_rate": 1.507956080444291e-06, + "loss": 0.77500588, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10333252, + "step": 9842, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06423896, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06278209, + "balance_loss_mlp": 0.01256332, + "epoch": 0.5917931760108222, + "flos": 23806719936000.0, + "grad_norm": 2.0642371985300105, + "language_loss": 0.83243513, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.90935493, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11755371, + "step": 9843, + "time_per_iteration": 2.5579354763031006 + }, + { + "auxiliary_loss_clip": 0.06423706, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_clip": 0.06277691, + "balance_loss_mlp": 0.01252419, + "epoch": 0.5918532992634902, + "flos": 23254864945920.0, + "grad_norm": 2.21208381325965, + "language_loss": 0.81869078, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.89556968, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11773682, + "step": 9844, + "time_per_iteration": 2.4732062816619873 + }, + { + "auxiliary_loss_clip": 0.06423113, + "auxiliary_loss_mlp": 0.01264577, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01253496, + "epoch": 0.5919134225161581, + "flos": 19505867166720.0, + "grad_norm": 2.0396261684123966, + "language_loss": 0.74979722, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.8266741, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11077881, + "step": 9845, + "time_per_iteration": 2.5498902797698975 + }, + { + "auxiliary_loss_clip": 0.0642004, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01255215, + "epoch": 0.5919735457688261, + "flos": 38810201264640.0, + "grad_norm": 1.7793580681254029, + "language_loss": 0.64624578, + "learning_rate": 1.506446264718213e-06, + "loss": 0.72311807, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11962891, + "step": 9846, + "time_per_iteration": 2.6562187671661377 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01268591, + "balance_loss_clip": 0.06275991, + "balance_loss_mlp": 0.01258851, + "epoch": 0.592033669021494, + "flos": 22170567185280.0, + "grad_norm": 1.5989871653678733, + "language_loss": 0.76435882, + "learning_rate": 1.506068857539931e-06, + "loss": 0.84116036, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09735107, + "step": 9847, + "time_per_iteration": 2.5877273082733154 + }, + { + "auxiliary_loss_clip": 0.06420001, + "auxiliary_loss_mlp": 0.01267428, + "balance_loss_clip": 0.06274936, + "balance_loss_mlp": 0.01255477, + "epoch": 0.592093792274162, + "flos": 22717600565760.0, + "grad_norm": 1.9085044692476394, + "language_loss": 0.62601185, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.70288616, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11956787, + "step": 9848, + "time_per_iteration": 3.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.06422321, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01252959, + "epoch": 0.59215391552683, + "flos": 22535605497600.0, + "grad_norm": 2.0066393042716855, + "language_loss": 0.76503384, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.84189683, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11022949, + "step": 9849, + "time_per_iteration": 2.5015931129455566 + }, + { + "auxiliary_loss_clip": 0.06421839, + "auxiliary_loss_mlp": 0.01268681, + "balance_loss_clip": 0.06277264, + "balance_loss_mlp": 0.01256671, + "epoch": 0.592214038779498, + "flos": 24505965457920.0, + "grad_norm": 1.745648722955103, + "language_loss": 0.75836027, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.8352654, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.12005615, + "step": 9850, + "time_per_iteration": 2.600179672241211 + }, + { + "auxiliary_loss_clip": 0.06417172, + "auxiliary_loss_mlp": 0.01268411, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01257367, + "epoch": 0.592274162032166, + "flos": 21837156589440.0, + "grad_norm": 1.6508975523953922, + "language_loss": 0.75545883, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.83231473, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1105957, + "step": 9851, + "time_per_iteration": 2.4818735122680664 + }, + { + "auxiliary_loss_clip": 0.06419359, + "auxiliary_loss_mlp": 0.01266702, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01254918, + "epoch": 0.5923342852848339, + "flos": 24615061873920.0, + "grad_norm": 1.7463946887344501, + "language_loss": 0.70506394, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.78192449, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11779785, + "step": 9852, + "time_per_iteration": 2.587822675704956 + }, + { + "auxiliary_loss_clip": 0.06423963, + "auxiliary_loss_mlp": 0.01273382, + "balance_loss_clip": 0.06275325, + "balance_loss_mlp": 0.01261043, + "epoch": 0.5923944085375019, + "flos": 19944307255680.0, + "grad_norm": 1.582534152024796, + "language_loss": 0.80272847, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.87970185, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12347412, + "step": 9853, + "time_per_iteration": 2.4834022521972656 + }, + { + "auxiliary_loss_clip": 0.06412584, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01253985, + "epoch": 0.5924545317901698, + "flos": 28666177948800.0, + "grad_norm": 1.4145056961897013, + "language_loss": 0.67743915, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.75421, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1050415, + "step": 9854, + "time_per_iteration": 3.9716901779174805 + }, + { + "auxiliary_loss_clip": 0.06417395, + "auxiliary_loss_mlp": 0.01268291, + "balance_loss_clip": 0.06274853, + "balance_loss_mlp": 0.01257514, + "epoch": 0.5925146550428378, + "flos": 19870989333120.0, + "grad_norm": 1.7006302713228023, + "language_loss": 0.89085132, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.96770817, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10772705, + "step": 9855, + "time_per_iteration": 2.54018235206604 + }, + { + "auxiliary_loss_clip": 0.06414687, + "auxiliary_loss_mlp": 0.01266215, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.0125585, + "epoch": 0.5925747782955058, + "flos": 15128510019840.0, + "grad_norm": 1.7501100927117066, + "language_loss": 0.86997199, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.94678098, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10369873, + "step": 9856, + "time_per_iteration": 2.5016441345214844 + }, + { + "auxiliary_loss_clip": 0.06422357, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06275797, + "balance_loss_mlp": 0.01254177, + "epoch": 0.5926349015481738, + "flos": 18411297281280.0, + "grad_norm": 1.7487529922228526, + "language_loss": 0.77790916, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.85478473, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11016846, + "step": 9857, + "time_per_iteration": 2.5232088565826416 + }, + { + "auxiliary_loss_clip": 0.06421745, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06278913, + "balance_loss_mlp": 0.01254689, + "epoch": 0.5926950248008417, + "flos": 23117620757760.0, + "grad_norm": 2.3581492349261524, + "language_loss": 0.65045798, + "learning_rate": 1.501918617901419e-06, + "loss": 0.72733665, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11431885, + "step": 9858, + "time_per_iteration": 4.080450773239136 + }, + { + "auxiliary_loss_clip": 0.06418257, + "auxiliary_loss_mlp": 0.01268065, + "balance_loss_clip": 0.06277932, + "balance_loss_mlp": 0.01256662, + "epoch": 0.5927551480535097, + "flos": 28040753473920.0, + "grad_norm": 1.620046821031832, + "language_loss": 0.77013564, + "learning_rate": 1.501541436426501e-06, + "loss": 0.84699887, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 9859, + "time_per_iteration": 2.5496175289154053 + }, + { + "auxiliary_loss_clip": 0.06422819, + "auxiliary_loss_mlp": 0.01272084, + "balance_loss_clip": 0.06277181, + "balance_loss_mlp": 0.01260217, + "epoch": 0.5928152713061776, + "flos": 21805109602560.0, + "grad_norm": 2.0806402016169914, + "language_loss": 0.75381404, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.8307631, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11865234, + "step": 9860, + "time_per_iteration": 2.4913806915283203 + }, + { + "auxiliary_loss_clip": 0.06419, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06277152, + "balance_loss_mlp": 0.01257557, + "epoch": 0.5928753945588456, + "flos": 24323802681600.0, + "grad_norm": 1.5719426663731493, + "language_loss": 0.7657429, + "learning_rate": 1.500787130195763e-06, + "loss": 0.84261084, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10235596, + "step": 9861, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.06416907, + "auxiliary_loss_mlp": 0.01266144, + "balance_loss_clip": 0.0627644, + "balance_loss_mlp": 0.01255355, + "epoch": 0.5929355178115137, + "flos": 26471126465280.0, + "grad_norm": 1.7884263747312634, + "language_loss": 0.70557332, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.78240383, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10797119, + "step": 9862, + "time_per_iteration": 2.5269577503204346 + }, + { + "auxiliary_loss_clip": 0.06422247, + "auxiliary_loss_mlp": 0.01262904, + "balance_loss_clip": 0.06279124, + "balance_loss_mlp": 0.01252455, + "epoch": 0.5929956410641816, + "flos": 24971798632320.0, + "grad_norm": 1.7042567790148921, + "language_loss": 0.7816, + "learning_rate": 1.500032899685832e-06, + "loss": 0.85845149, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10449219, + "step": 9863, + "time_per_iteration": 2.560952663421631 + }, + { + "auxiliary_loss_clip": 0.06423997, + "auxiliary_loss_mlp": 0.01269473, + "balance_loss_clip": 0.06280629, + "balance_loss_mlp": 0.01258917, + "epoch": 0.5930557643168496, + "flos": 26214639517440.0, + "grad_norm": 1.987432864542063, + "language_loss": 0.71297693, + "learning_rate": 1.499655812861921e-06, + "loss": 0.78991163, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10565186, + "step": 9864, + "time_per_iteration": 4.022796869277954 + }, + { + "auxiliary_loss_clip": 0.0642028, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 0.06276219, + "balance_loss_mlp": 0.01256578, + "epoch": 0.5931158875695175, + "flos": 27862322204160.0, + "grad_norm": 2.045271412380321, + "language_loss": 0.67615211, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11816406, + "step": 9865, + "time_per_iteration": 2.542477607727051 + }, + { + "auxiliary_loss_clip": 0.06424178, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06278679, + "balance_loss_mlp": 0.01253597, + "epoch": 0.5931760108221855, + "flos": 15419014525440.0, + "grad_norm": 2.0467341556470906, + "language_loss": 0.78422129, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.86111438, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11535645, + "step": 9866, + "time_per_iteration": 2.5601937770843506 + }, + { + "auxiliary_loss_clip": 0.06417245, + "auxiliary_loss_mlp": 0.01267033, + "balance_loss_clip": 0.06280121, + "balance_loss_mlp": 0.01256114, + "epoch": 0.5932361340748534, + "flos": 30196043395200.0, + "grad_norm": 1.6991427361252174, + "language_loss": 0.72385359, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.80069637, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10919189, + "step": 9867, + "time_per_iteration": 2.582200527191162 + }, + { + "auxiliary_loss_clip": 0.06421208, + "auxiliary_loss_mlp": 0.01268228, + "balance_loss_clip": 0.06280105, + "balance_loss_mlp": 0.0125589, + "epoch": 0.5932962573275214, + "flos": 20163841752960.0, + "grad_norm": 1.4126147288957658, + "language_loss": 0.6694321, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.74632645, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.12335205, + "step": 9868, + "time_per_iteration": 2.515268087387085 + }, + { + "auxiliary_loss_clip": 0.06420252, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06275701, + "balance_loss_mlp": 0.01255046, + "epoch": 0.5933563805801894, + "flos": 25452725541120.0, + "grad_norm": 1.59033500525529, + "language_loss": 0.75624323, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.83311105, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11474609, + "step": 9869, + "time_per_iteration": 2.5264642238616943 + }, + { + "auxiliary_loss_clip": 0.06425707, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06281111, + "balance_loss_mlp": 0.01254779, + "epoch": 0.5934165038328574, + "flos": 60007971674880.0, + "grad_norm": 1.9233451977688907, + "language_loss": 0.74787021, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.82478619, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11114502, + "step": 9870, + "time_per_iteration": 2.8604302406311035 + }, + { + "auxiliary_loss_clip": 0.06422332, + "auxiliary_loss_mlp": 0.01265883, + "balance_loss_clip": 0.0627723, + "balance_loss_mlp": 0.01254719, + "epoch": 0.5934766270855253, + "flos": 24426568114560.0, + "grad_norm": 2.4352017906666226, + "language_loss": 0.72491121, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.80179334, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11169434, + "step": 9871, + "time_per_iteration": 2.504990577697754 + }, + { + "auxiliary_loss_clip": 0.06424776, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.0627915, + "balance_loss_mlp": 0.01254843, + "epoch": 0.5935367503381933, + "flos": 23519821155840.0, + "grad_norm": 2.2688315988077736, + "language_loss": 0.74858117, + "learning_rate": 1.496639802503271e-06, + "loss": 0.82548994, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11254883, + "step": 9872, + "time_per_iteration": 2.5957329273223877 + }, + { + "auxiliary_loss_clip": 0.06431574, + "auxiliary_loss_mlp": 0.01267461, + "balance_loss_clip": 0.06283869, + "balance_loss_mlp": 0.01255517, + "epoch": 0.5935968735908612, + "flos": 18953550979200.0, + "grad_norm": 11.679124704717912, + "language_loss": 0.79073173, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.86772209, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.1194458, + "step": 9873, + "time_per_iteration": 2.4669687747955322 + }, + { + "auxiliary_loss_clip": 0.064208, + "auxiliary_loss_mlp": 0.01267302, + "balance_loss_clip": 0.06276259, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5936569968435292, + "flos": 25490432678400.0, + "grad_norm": 1.6349451241448802, + "language_loss": 0.85223055, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.9291116, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11853027, + "step": 9874, + "time_per_iteration": 2.5542490482330322 + }, + { + "auxiliary_loss_clip": 0.06322969, + "auxiliary_loss_mlp": 0.01256968, + "balance_loss_clip": 0.0626381, + "balance_loss_mlp": 0.01255485, + "epoch": 0.5937171200961973, + "flos": 66397364259840.0, + "grad_norm": 0.7006393782995821, + "language_loss": 0.59778833, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.67358768, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01482391, + "step": 9875, + "time_per_iteration": 3.2118613719940186 + }, + { + "auxiliary_loss_clip": 0.06429566, + "auxiliary_loss_mlp": 0.01269748, + "balance_loss_clip": 0.06278439, + "balance_loss_mlp": 0.01257302, + "epoch": 0.5937772433488652, + "flos": 14908849741440.0, + "grad_norm": 2.56951836872527, + "language_loss": 0.78072035, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.85771352, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.12451172, + "step": 9876, + "time_per_iteration": 2.488849401473999 + }, + { + "auxiliary_loss_clip": 0.06411201, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01253764, + "epoch": 0.5938373666015332, + "flos": 22567484776320.0, + "grad_norm": 1.5512644369371444, + "language_loss": 0.7603606, + "learning_rate": 1.494755415907243e-06, + "loss": 0.83711803, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10772705, + "step": 9877, + "time_per_iteration": 2.5584661960601807 + }, + { + "auxiliary_loss_clip": 0.06419433, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01256801, + "epoch": 0.5938974898542011, + "flos": 18446572650240.0, + "grad_norm": 2.5934425226299243, + "language_loss": 0.81566256, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.8925426, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11779785, + "step": 9878, + "time_per_iteration": 2.498063802719116 + }, + { + "auxiliary_loss_clip": 0.0642112, + "auxiliary_loss_mlp": 0.0126802, + "balance_loss_clip": 0.06274901, + "balance_loss_mlp": 0.01256993, + "epoch": 0.5939576131068691, + "flos": 45597029293440.0, + "grad_norm": 1.6161422600744055, + "language_loss": 0.71359301, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.79048443, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11029053, + "step": 9879, + "time_per_iteration": 2.7588438987731934 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06272938, + "balance_loss_mlp": 0.01254166, + "epoch": 0.594017736359537, + "flos": 23594648451840.0, + "grad_norm": 1.558347600048505, + "language_loss": 0.57834136, + "learning_rate": 1.493625013742401e-06, + "loss": 0.65514064, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11431885, + "step": 9880, + "time_per_iteration": 2.5477280616760254 + }, + { + "auxiliary_loss_clip": 0.0641728, + "auxiliary_loss_mlp": 0.012706, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258751, + "epoch": 0.594077859612205, + "flos": 29464373543040.0, + "grad_norm": 1.9254284711947285, + "language_loss": 0.78115642, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.85803521, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11846924, + "step": 9881, + "time_per_iteration": 2.596902847290039 + }, + { + "auxiliary_loss_clip": 0.06421138, + "auxiliary_loss_mlp": 0.0126373, + "balance_loss_clip": 0.06276222, + "balance_loss_mlp": 0.01252882, + "epoch": 0.594137982864873, + "flos": 16805682144000.0, + "grad_norm": 2.173471904433077, + "language_loss": 0.83138072, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.90822935, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.10839844, + "step": 9882, + "time_per_iteration": 2.483264446258545 + }, + { + "auxiliary_loss_clip": 0.06420217, + "auxiliary_loss_mlp": 0.01271488, + "balance_loss_clip": 0.06276472, + "balance_loss_mlp": 0.01260318, + "epoch": 0.594198106117541, + "flos": 12755194974720.0, + "grad_norm": 2.093124407330454, + "language_loss": 0.79720157, + "learning_rate": 1.492494784393667e-06, + "loss": 0.87411857, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11175537, + "step": 9883, + "time_per_iteration": 2.5007734298706055 + }, + { + "auxiliary_loss_clip": 0.06424005, + "auxiliary_loss_mlp": 0.01269731, + "balance_loss_clip": 0.06275944, + "balance_loss_mlp": 0.01258097, + "epoch": 0.5942582293702089, + "flos": 21002930939520.0, + "grad_norm": 1.7867915832733556, + "language_loss": 0.7479161, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.82485354, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.11645508, + "step": 9884, + "time_per_iteration": 2.5044338703155518 + }, + { + "auxiliary_loss_clip": 0.06419083, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06275263, + "balance_loss_mlp": 0.01253549, + "epoch": 0.5943183526228769, + "flos": 28298665941120.0, + "grad_norm": 2.661403390475952, + "language_loss": 0.6670655, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.7439115, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11975098, + "step": 9885, + "time_per_iteration": 2.592233180999756 + }, + { + "auxiliary_loss_clip": 0.06417437, + "auxiliary_loss_mlp": 0.01268066, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256884, + "epoch": 0.5943784758755448, + "flos": 26621829233280.0, + "grad_norm": 2.23147400779812, + "language_loss": 0.76914746, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.84600246, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11181641, + "step": 9886, + "time_per_iteration": 2.5211451053619385 + }, + { + "auxiliary_loss_clip": 0.06318811, + "auxiliary_loss_mlp": 0.01252302, + "balance_loss_clip": 0.06259875, + "balance_loss_mlp": 0.01250785, + "epoch": 0.5944385991282128, + "flos": 64209859643520.0, + "grad_norm": 0.8085761446732002, + "language_loss": 0.64425516, + "learning_rate": 1.490988081420423e-06, + "loss": 0.71996629, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01515961, + "step": 9887, + "time_per_iteration": 4.4216148853302 + }, + { + "auxiliary_loss_clip": 0.06419201, + "auxiliary_loss_mlp": 0.01265936, + "balance_loss_clip": 0.06275857, + "balance_loss_mlp": 0.01254307, + "epoch": 0.5944987223808808, + "flos": 19577885351040.0, + "grad_norm": 1.7443994329425772, + "language_loss": 0.691764, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.76861531, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11633301, + "step": 9888, + "time_per_iteration": 2.558119058609009 + }, + { + "auxiliary_loss_clip": 0.06419526, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 0.06276903, + "balance_loss_mlp": 0.01258773, + "epoch": 0.5945588456335488, + "flos": 26184856590720.0, + "grad_norm": 1.5028057851776446, + "language_loss": 0.7952224, + "learning_rate": 1.490234845687366e-06, + "loss": 0.87211674, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11138916, + "step": 9889, + "time_per_iteration": 2.556455612182617 + }, + { + "auxiliary_loss_clip": 0.06416804, + "auxiliary_loss_mlp": 0.01267591, + "balance_loss_clip": 0.06273508, + "balance_loss_mlp": 0.01257076, + "epoch": 0.5946189688862168, + "flos": 20452333760640.0, + "grad_norm": 1.5171149074997012, + "language_loss": 0.70987219, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.7867161, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1050415, + "step": 9890, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.06420811, + "auxiliary_loss_mlp": 0.01269509, + "balance_loss_clip": 0.06275058, + "balance_loss_mlp": 0.01258041, + "epoch": 0.5946790921388847, + "flos": 13441568895360.0, + "grad_norm": 1.9815921383050485, + "language_loss": 0.697523, + "learning_rate": 1.489481687275691e-06, + "loss": 0.77442622, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11468506, + "step": 9891, + "time_per_iteration": 2.474308729171753 + }, + { + "auxiliary_loss_clip": 0.06419806, + "auxiliary_loss_mlp": 0.01266103, + "balance_loss_clip": 0.06277567, + "balance_loss_mlp": 0.01255839, + "epoch": 0.5947392153915527, + "flos": 20418483911040.0, + "grad_norm": 1.7485359350265648, + "language_loss": 0.53498697, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.61184609, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10266113, + "step": 9892, + "time_per_iteration": 2.534221649169922 + }, + { + "auxiliary_loss_clip": 0.06313733, + "auxiliary_loss_mlp": 0.01253007, + "balance_loss_clip": 0.06254771, + "balance_loss_mlp": 0.01251455, + "epoch": 0.5947993386442206, + "flos": 65639181790080.0, + "grad_norm": 0.6531062006914405, + "language_loss": 0.54571462, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.621382, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01551056, + "step": 9893, + "time_per_iteration": 3.1853702068328857 + }, + { + "auxiliary_loss_clip": 0.064126, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01254841, + "epoch": 0.5948594618968887, + "flos": 23189429306880.0, + "grad_norm": 1.6806512476713673, + "language_loss": 0.75017619, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.82695538, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10473633, + "step": 9894, + "time_per_iteration": 4.046506643295288 + }, + { + "auxiliary_loss_clip": 0.06415449, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06273435, + "balance_loss_mlp": 0.01253831, + "epoch": 0.5949195851495566, + "flos": 13631991298560.0, + "grad_norm": 1.844376504699444, + "language_loss": 0.77997828, + "learning_rate": 1.487975602873434e-06, + "loss": 0.8567856, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11444092, + "step": 9895, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.06421571, + "auxiliary_loss_mlp": 0.01264682, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252862, + "epoch": 0.5949797084022246, + "flos": 19756358547840.0, + "grad_norm": 2.034072439962686, + "language_loss": 0.79318964, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.8700521, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11816406, + "step": 9896, + "time_per_iteration": 2.496610164642334 + }, + { + "auxiliary_loss_clip": 0.06420637, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06275238, + "balance_loss_mlp": 0.01253709, + "epoch": 0.5950398316548925, + "flos": 25780685621760.0, + "grad_norm": 1.4418973411464253, + "language_loss": 0.8331461, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.91000593, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11645508, + "step": 9897, + "time_per_iteration": 2.6055963039398193 + }, + { + "auxiliary_loss_clip": 0.06422365, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06278124, + "balance_loss_mlp": 0.01254012, + "epoch": 0.5950999549075605, + "flos": 23045644500480.0, + "grad_norm": 2.157917564883112, + "language_loss": 0.71089602, + "learning_rate": 1.486846243389939e-06, + "loss": 0.78776848, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10882568, + "step": 9898, + "time_per_iteration": 3.95219087600708 + }, + { + "auxiliary_loss_clip": 0.06426959, + "auxiliary_loss_mlp": 0.01267336, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254897, + "epoch": 0.5951600781602284, + "flos": 32453553697920.0, + "grad_norm": 2.106705884146929, + "language_loss": 0.63699448, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.71393746, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.12451172, + "step": 9899, + "time_per_iteration": 2.597721576690674 + }, + { + "auxiliary_loss_clip": 0.06419618, + "auxiliary_loss_mlp": 0.01270579, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01259999, + "epoch": 0.5952202014128964, + "flos": 23806887644160.0, + "grad_norm": 1.5164228353921223, + "language_loss": 0.72182071, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.79872268, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.10571289, + "step": 9900, + "time_per_iteration": 2.5579535961151123 + }, + { + "auxiliary_loss_clip": 0.06414567, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01255484, + "epoch": 0.5952803246655644, + "flos": 22498778828160.0, + "grad_norm": 1.774545476213964, + "language_loss": 0.84691358, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.9237293, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11523438, + "step": 9901, + "time_per_iteration": 2.532650947570801 + }, + { + "auxiliary_loss_clip": 0.06311554, + "auxiliary_loss_mlp": 0.01252152, + "balance_loss_clip": 0.06252782, + "balance_loss_mlp": 0.01250599, + "epoch": 0.5953404479182324, + "flos": 51250810884480.0, + "grad_norm": 0.7741789718205083, + "language_loss": 0.58204901, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.65768605, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01550293, + "step": 9902, + "time_per_iteration": 2.995508909225464 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01268252, + "balance_loss_clip": 0.06274737, + "balance_loss_mlp": 0.01256653, + "epoch": 0.5954005711709004, + "flos": 23119423620480.0, + "grad_norm": 1.8631652775155525, + "language_loss": 0.77643347, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.85333747, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11608887, + "step": 9903, + "time_per_iteration": 2.526265859603882 + }, + { + "auxiliary_loss_clip": 0.06419012, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01253691, + "epoch": 0.5954606944235683, + "flos": 35963464250880.0, + "grad_norm": 1.7611381352056217, + "language_loss": 0.78137469, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.85821557, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1138916, + "step": 9904, + "time_per_iteration": 4.04362940788269 + }, + { + "auxiliary_loss_clip": 0.0642558, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06275237, + "balance_loss_mlp": 0.01252619, + "epoch": 0.5955208176762363, + "flos": 30451188677760.0, + "grad_norm": 1.2800711014437993, + "language_loss": 0.72963494, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.80653274, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.11566162, + "step": 9905, + "time_per_iteration": 2.630237340927124 + }, + { + "auxiliary_loss_clip": 0.06417751, + "auxiliary_loss_mlp": 0.01267213, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255942, + "epoch": 0.5955809409289042, + "flos": 17645987214720.0, + "grad_norm": 2.1926975812717524, + "language_loss": 0.70104027, + "learning_rate": 1.483835475336295e-06, + "loss": 0.77788991, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11273193, + "step": 9906, + "time_per_iteration": 2.5136594772338867 + }, + { + "auxiliary_loss_clip": 0.06423035, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.06276789, + "balance_loss_mlp": 0.01254316, + "epoch": 0.5956410641815723, + "flos": 24286766376960.0, + "grad_norm": 1.7055783949352592, + "language_loss": 0.74976909, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.82666361, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.12103271, + "step": 9907, + "time_per_iteration": 2.5186941623687744 + }, + { + "auxiliary_loss_clip": 0.06419441, + "auxiliary_loss_mlp": 0.01268122, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.5957011874342402, + "flos": 35742713869440.0, + "grad_norm": 1.9121613205115942, + "language_loss": 0.67437243, + "learning_rate": 1.483082978767595e-06, + "loss": 0.75124806, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11505127, + "step": 9908, + "time_per_iteration": 2.641977310180664 + }, + { + "auxiliary_loss_clip": 0.06417987, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.0627388, + "balance_loss_mlp": 0.01255459, + "epoch": 0.5957613106869082, + "flos": 21250277792640.0, + "grad_norm": 1.9262426125407, + "language_loss": 0.7637223, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.84056735, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1105957, + "step": 9909, + "time_per_iteration": 2.4708259105682373 + }, + { + "auxiliary_loss_clip": 0.06309633, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06250934, + "balance_loss_mlp": 0.01251702, + "epoch": 0.5958214339395761, + "flos": 65959972346880.0, + "grad_norm": 0.8925366465224025, + "language_loss": 0.73392916, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.80955869, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01618958, + "step": 9910, + "time_per_iteration": 3.2132058143615723 + }, + { + "auxiliary_loss_clip": 0.06420797, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01253838, + "epoch": 0.5958815571922441, + "flos": 23224872384000.0, + "grad_norm": 1.906132958424511, + "language_loss": 0.69966662, + "learning_rate": 1.481954380961799e-06, + "loss": 0.77653486, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12194824, + "step": 9911, + "time_per_iteration": 2.5891547203063965 + }, + { + "auxiliary_loss_clip": 0.06430559, + "auxiliary_loss_mlp": 0.01269185, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01256471, + "epoch": 0.595941680444912, + "flos": 16543157702400.0, + "grad_norm": 1.8117496085568294, + "language_loss": 0.65995622, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.73695368, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.12713623, + "step": 9912, + "time_per_iteration": 2.5106897354125977 + }, + { + "auxiliary_loss_clip": 0.06418723, + "auxiliary_loss_mlp": 0.01269847, + "balance_loss_clip": 0.06273462, + "balance_loss_mlp": 0.01257681, + "epoch": 0.59600180369758, + "flos": 27826334075520.0, + "grad_norm": 1.8937269812557305, + "language_loss": 0.73603946, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.81292516, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.12145996, + "step": 9913, + "time_per_iteration": 2.5845842361450195 + }, + { + "auxiliary_loss_clip": 0.06422256, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254316, + "epoch": 0.596061926950248, + "flos": 29498349173760.0, + "grad_norm": 2.1687664822630692, + "language_loss": 0.79983938, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.87672126, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.1161499, + "step": 9914, + "time_per_iteration": 2.677943229675293 + }, + { + "auxiliary_loss_clip": 0.06418366, + "auxiliary_loss_mlp": 0.01267743, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01256233, + "epoch": 0.596122050202916, + "flos": 16842424959360.0, + "grad_norm": 1.662988077903936, + "language_loss": 0.67750293, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.75436401, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1149292, + "step": 9915, + "time_per_iteration": 2.527804374694824 + }, + { + "auxiliary_loss_clip": 0.06422138, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274668, + "balance_loss_mlp": 0.01254888, + "epoch": 0.596182173455584, + "flos": 21003056720640.0, + "grad_norm": 1.4119869222981658, + "language_loss": 0.7862711, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.86315531, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.11395264, + "step": 9916, + "time_per_iteration": 2.5146098136901855 + }, + { + "auxiliary_loss_clip": 0.06422624, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06273377, + "balance_loss_mlp": 0.01254808, + "epoch": 0.5962422967082519, + "flos": 16070364639360.0, + "grad_norm": 1.8279133386942186, + "language_loss": 0.83302379, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.90991473, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.11657715, + "step": 9917, + "time_per_iteration": 2.5148332118988037 + }, + { + "auxiliary_loss_clip": 0.06418853, + "auxiliary_loss_mlp": 0.0126709, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01256236, + "epoch": 0.5963024199609199, + "flos": 12171879976320.0, + "grad_norm": 1.6879177929284592, + "language_loss": 0.77521312, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.85207248, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10852051, + "step": 9918, + "time_per_iteration": 2.4897613525390625 + }, + { + "auxiliary_loss_clip": 0.06419399, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274245, + "balance_loss_mlp": 0.01256661, + "epoch": 0.5963625432135878, + "flos": 28081772847360.0, + "grad_norm": 1.5296515450402863, + "language_loss": 0.7930398, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.86990869, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10821533, + "step": 9919, + "time_per_iteration": 2.6023364067077637 + }, + { + "auxiliary_loss_clip": 0.06424099, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06277885, + "balance_loss_mlp": 0.01253434, + "epoch": 0.5964226664662559, + "flos": 19865664599040.0, + "grad_norm": 2.0582572283345537, + "language_loss": 0.77598941, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.85288125, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11645508, + "step": 9920, + "time_per_iteration": 2.499610424041748 + }, + { + "auxiliary_loss_clip": 0.06428593, + "auxiliary_loss_mlp": 0.01269926, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01258124, + "epoch": 0.5964827897189238, + "flos": 12937567386240.0, + "grad_norm": 2.9535163377991647, + "language_loss": 0.8317768, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.90876198, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11798096, + "step": 9921, + "time_per_iteration": 2.5134449005126953 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01268083, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01256389, + "epoch": 0.5965429129715918, + "flos": 18156738977280.0, + "grad_norm": 1.8928045831706461, + "language_loss": 0.80601788, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.88286257, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11694336, + "step": 9922, + "time_per_iteration": 2.4813597202301025 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01269772, + "balance_loss_clip": 0.06271716, + "balance_loss_mlp": 0.01258828, + "epoch": 0.5966030362242597, + "flos": 21769834233600.0, + "grad_norm": 3.055273537118157, + "language_loss": 0.7726593, + "learning_rate": 1.477441761580111e-06, + "loss": 0.84952813, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10949707, + "step": 9923, + "time_per_iteration": 2.5638489723205566 + }, + { + "auxiliary_loss_clip": 0.06424043, + "auxiliary_loss_mlp": 0.01268694, + "balance_loss_clip": 0.06273048, + "balance_loss_mlp": 0.01254973, + "epoch": 0.5966631594769277, + "flos": 18813204190080.0, + "grad_norm": 1.8922524994378742, + "language_loss": 0.76095831, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.83788568, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.13720703, + "step": 9924, + "time_per_iteration": 2.4999732971191406 + }, + { + "auxiliary_loss_clip": 0.06413831, + "auxiliary_loss_mlp": 0.01268542, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01256633, + "epoch": 0.5967232827295956, + "flos": 14069383211520.0, + "grad_norm": 1.7112851014893713, + "language_loss": 0.66830564, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.74512935, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11920166, + "step": 9925, + "time_per_iteration": 2.5139551162719727 + }, + { + "auxiliary_loss_clip": 0.06421202, + "auxiliary_loss_mlp": 0.0126999, + "balance_loss_clip": 0.06279947, + "balance_loss_mlp": 0.01258409, + "epoch": 0.5967834059822636, + "flos": 17243954524800.0, + "grad_norm": 1.861204364539265, + "language_loss": 0.72200316, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.79891503, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11584473, + "step": 9926, + "time_per_iteration": 3.9693188667297363 + }, + { + "auxiliary_loss_clip": 0.06422362, + "auxiliary_loss_mlp": 0.01270656, + "balance_loss_clip": 0.06274919, + "balance_loss_mlp": 0.01258556, + "epoch": 0.5968435292349316, + "flos": 42529751533440.0, + "grad_norm": 1.9299553445847866, + "language_loss": 0.70147216, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.77840233, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.12103271, + "step": 9927, + "time_per_iteration": 2.7299752235412598 + }, + { + "auxiliary_loss_clip": 0.06429256, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06277983, + "balance_loss_mlp": 0.0125467, + "epoch": 0.5969036524875996, + "flos": 37639546272000.0, + "grad_norm": 1.5668113041571725, + "language_loss": 0.63611758, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.71307898, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.12213135, + "step": 9928, + "time_per_iteration": 2.7166144847869873 + }, + { + "auxiliary_loss_clip": 0.06418041, + "auxiliary_loss_mlp": 0.01265529, + "balance_loss_clip": 0.06274209, + "balance_loss_mlp": 0.01254454, + "epoch": 0.5969637757402676, + "flos": 23154992478720.0, + "grad_norm": 2.1979213221977596, + "language_loss": 0.69668317, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.77351892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1105957, + "step": 9929, + "time_per_iteration": 2.51379656791687 + }, + { + "auxiliary_loss_clip": 0.0641327, + "auxiliary_loss_mlp": 0.01270831, + "balance_loss_clip": 0.06274718, + "balance_loss_mlp": 0.01259697, + "epoch": 0.5970238989929355, + "flos": 24027176828160.0, + "grad_norm": 1.690473988948275, + "language_loss": 0.7685796, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.8454206, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11138916, + "step": 9930, + "time_per_iteration": 2.590068817138672 + }, + { + "auxiliary_loss_clip": 0.06427103, + "auxiliary_loss_mlp": 0.01271306, + "balance_loss_clip": 0.06277532, + "balance_loss_mlp": 0.01259206, + "epoch": 0.5970840222456035, + "flos": 19432884660480.0, + "grad_norm": 1.4319660868037594, + "language_loss": 0.69073558, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.76771963, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.12097168, + "step": 9931, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.06314774, + "auxiliary_loss_mlp": 0.01252398, + "balance_loss_clip": 0.06255934, + "balance_loss_mlp": 0.01250752, + "epoch": 0.5971441454982714, + "flos": 62993615230080.0, + "grad_norm": 0.8560146868595252, + "language_loss": 0.64260876, + "learning_rate": 1.474059168257065e-06, + "loss": 0.71828043, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.01649475, + "step": 9932, + "time_per_iteration": 3.0806198120117188 + }, + { + "auxiliary_loss_clip": 0.06415366, + "auxiliary_loss_mlp": 0.01270842, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01259976, + "epoch": 0.5972042687509395, + "flos": 20272393117440.0, + "grad_norm": 1.7768464871728415, + "language_loss": 0.74403048, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.82089257, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10864258, + "step": 9933, + "time_per_iteration": 3.9164891242980957 + }, + { + "auxiliary_loss_clip": 0.06316046, + "auxiliary_loss_mlp": 0.01258623, + "balance_loss_clip": 0.06257492, + "balance_loss_mlp": 0.01256835, + "epoch": 0.5972643920036074, + "flos": 71675625778560.0, + "grad_norm": 0.666650666050939, + "language_loss": 0.51957405, + "learning_rate": 1.473307699867203e-06, + "loss": 0.59532076, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01785278, + "step": 9934, + "time_per_iteration": 3.263599157333374 + }, + { + "auxiliary_loss_clip": 0.06320157, + "auxiliary_loss_mlp": 0.01253316, + "balance_loss_clip": 0.06261201, + "balance_loss_mlp": 0.01251523, + "epoch": 0.5973245152562754, + "flos": 56910225427200.0, + "grad_norm": 0.8129555240105609, + "language_loss": 0.54121673, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.61695147, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.0178833, + "step": 9935, + "time_per_iteration": 3.13610577583313 + }, + { + "auxiliary_loss_clip": 0.0641949, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06273362, + "balance_loss_mlp": 0.01254229, + "epoch": 0.5973846385089433, + "flos": 24170206947840.0, + "grad_norm": 1.6283043946182527, + "language_loss": 0.65934885, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.7362048, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11883545, + "step": 9936, + "time_per_iteration": 2.5317225456237793 + }, + { + "auxiliary_loss_clip": 0.06426519, + "auxiliary_loss_mlp": 0.01266905, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01256027, + "epoch": 0.5974447617616113, + "flos": 17675476652160.0, + "grad_norm": 1.977673103112211, + "language_loss": 0.67786443, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.10882568, + "step": 9937, + "time_per_iteration": 2.51056170463562 + }, + { + "auxiliary_loss_clip": 0.0642201, + "auxiliary_loss_mlp": 0.01272578, + "balance_loss_clip": 0.06274251, + "balance_loss_mlp": 0.01260073, + "epoch": 0.5975048850142792, + "flos": 22899008655360.0, + "grad_norm": 2.0510739773646853, + "language_loss": 0.77639204, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.85333794, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.12512207, + "step": 9938, + "time_per_iteration": 3.988826274871826 + }, + { + "auxiliary_loss_clip": 0.06425326, + "auxiliary_loss_mlp": 0.01266797, + "balance_loss_clip": 0.06278642, + "balance_loss_mlp": 0.01255145, + "epoch": 0.5975650082669473, + "flos": 24360042372480.0, + "grad_norm": 1.4729050693859964, + "language_loss": 0.76065636, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.83757758, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11645508, + "step": 9939, + "time_per_iteration": 2.556417226791382 + }, + { + "auxiliary_loss_clip": 0.06427339, + "auxiliary_loss_mlp": 0.01268522, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01255206, + "epoch": 0.5976251315196152, + "flos": 20929696871040.0, + "grad_norm": 2.2639919876209498, + "language_loss": 0.68839771, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7653563, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.13323975, + "step": 9940, + "time_per_iteration": 2.5342793464660645 + }, + { + "auxiliary_loss_clip": 0.06417148, + "auxiliary_loss_mlp": 0.01270575, + "balance_loss_clip": 0.06276263, + "balance_loss_mlp": 0.01259602, + "epoch": 0.5976852547722832, + "flos": 35853193877760.0, + "grad_norm": 1.2345186889810322, + "language_loss": 0.69966424, + "learning_rate": 1.470678190375664e-06, + "loss": 0.77654147, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 9941, + "time_per_iteration": 2.6775453090667725 + }, + { + "auxiliary_loss_clip": 0.06416304, + "auxiliary_loss_mlp": 0.01265548, + "balance_loss_clip": 0.06272396, + "balance_loss_mlp": 0.0125433, + "epoch": 0.5977453780249512, + "flos": 12860266394880.0, + "grad_norm": 1.7893879951427467, + "language_loss": 0.77519101, + "learning_rate": 1.470302626336386e-06, + "loss": 0.85200953, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11224365, + "step": 9942, + "time_per_iteration": 2.5630502700805664 + }, + { + "auxiliary_loss_clip": 0.06422595, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.0125478, + "epoch": 0.5978055012776191, + "flos": 20965391510400.0, + "grad_norm": 1.999196380936964, + "language_loss": 0.76118851, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.83808935, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.12713623, + "step": 9943, + "time_per_iteration": 3.9001221656799316 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01266022, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01255728, + "epoch": 0.5978656245302871, + "flos": 34066506067200.0, + "grad_norm": 1.9908445339246823, + "language_loss": 0.62211335, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.69895315, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10296631, + "step": 9944, + "time_per_iteration": 2.6546871662139893 + }, + { + "auxiliary_loss_clip": 0.06420632, + "auxiliary_loss_mlp": 0.01266771, + "balance_loss_clip": 0.06276795, + "balance_loss_mlp": 0.01255333, + "epoch": 0.597925747782955, + "flos": 37381508023680.0, + "grad_norm": 1.6358533401507223, + "language_loss": 0.72854936, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.80542344, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11450195, + "step": 9945, + "time_per_iteration": 2.631753444671631 + }, + { + "auxiliary_loss_clip": 0.06419382, + "auxiliary_loss_mlp": 0.01270411, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.01258997, + "epoch": 0.5979858710356231, + "flos": 25381923240960.0, + "grad_norm": 1.7624660559370904, + "language_loss": 0.67425656, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.75115454, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9946, + "time_per_iteration": 2.5964295864105225 + }, + { + "auxiliary_loss_clip": 0.06427635, + "auxiliary_loss_mlp": 0.01269885, + "balance_loss_clip": 0.06277838, + "balance_loss_mlp": 0.01257678, + "epoch": 0.598045994288291, + "flos": 13703422504320.0, + "grad_norm": 1.825350503307894, + "language_loss": 0.88689518, + "learning_rate": 1.468425107717461e-06, + "loss": 0.96387035, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.12194824, + "step": 9947, + "time_per_iteration": 2.47194766998291 + }, + { + "auxiliary_loss_clip": 0.06412566, + "auxiliary_loss_mlp": 0.01263948, + "balance_loss_clip": 0.06274778, + "balance_loss_mlp": 0.01253409, + "epoch": 0.598106117540959, + "flos": 21987859357440.0, + "grad_norm": 1.5868690486029033, + "language_loss": 0.71892309, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.79568821, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10540771, + "step": 9948, + "time_per_iteration": 2.519465446472168 + }, + { + "auxiliary_loss_clip": 0.06424625, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06277405, + "balance_loss_mlp": 0.01255015, + "epoch": 0.5981662407936269, + "flos": 20565790588800.0, + "grad_norm": 1.9625714193598658, + "language_loss": 0.89521587, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.97213024, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11791992, + "step": 9949, + "time_per_iteration": 2.512617588043213 + }, + { + "auxiliary_loss_clip": 0.0641937, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06276002, + "balance_loss_mlp": 0.0125524, + "epoch": 0.5982263640462949, + "flos": 14069005868160.0, + "grad_norm": 2.2044341220338484, + "language_loss": 0.70866632, + "learning_rate": 1.467298838320673e-06, + "loss": 0.78552365, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11126709, + "step": 9950, + "time_per_iteration": 2.4983901977539062 + }, + { + "auxiliary_loss_clip": 0.06423427, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276861, + "balance_loss_mlp": 0.01254103, + "epoch": 0.5982864872989628, + "flos": 17712135613440.0, + "grad_norm": 1.7147951868971159, + "language_loss": 0.7865026, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.86338896, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11102295, + "step": 9951, + "time_per_iteration": 2.5179500579833984 + }, + { + "auxiliary_loss_clip": 0.06422336, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256215, + "epoch": 0.5983466105516309, + "flos": 16770574483200.0, + "grad_norm": 2.724642744329358, + "language_loss": 0.73936313, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.81627548, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.12689209, + "step": 9952, + "time_per_iteration": 2.5671274662017822 + }, + { + "auxiliary_loss_clip": 0.06420863, + "auxiliary_loss_mlp": 0.01266742, + "balance_loss_clip": 0.06275067, + "balance_loss_mlp": 0.01254243, + "epoch": 0.5984067338042988, + "flos": 20048078937600.0, + "grad_norm": 1.9086154248374307, + "language_loss": 0.79033399, + "learning_rate": 1.466172750724613e-06, + "loss": 0.86721003, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.12512207, + "step": 9953, + "time_per_iteration": 2.5575039386749268 + }, + { + "auxiliary_loss_clip": 0.06419245, + "auxiliary_loss_mlp": 0.01268437, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01257267, + "epoch": 0.5984668570569668, + "flos": 26326586972160.0, + "grad_norm": 1.3586799739820394, + "language_loss": 0.69871485, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.77559167, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1116333, + "step": 9954, + "time_per_iteration": 2.5664639472961426 + }, + { + "auxiliary_loss_clip": 0.06421678, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01253953, + "epoch": 0.5985269803096348, + "flos": 20599808146560.0, + "grad_norm": 3.504460387705041, + "language_loss": 0.73099947, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.80786395, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.10803223, + "step": 9955, + "time_per_iteration": 2.5450916290283203 + }, + { + "auxiliary_loss_clip": 0.06417805, + "auxiliary_loss_mlp": 0.01264034, + "balance_loss_clip": 0.06273872, + "balance_loss_mlp": 0.01252632, + "epoch": 0.5985871035623027, + "flos": 26871859416960.0, + "grad_norm": 1.7558609344018261, + "language_loss": 0.68993962, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.76675797, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11401367, + "step": 9956, + "time_per_iteration": 2.596081256866455 + }, + { + "auxiliary_loss_clip": 0.06423829, + "auxiliary_loss_mlp": 0.01264045, + "balance_loss_clip": 0.06278121, + "balance_loss_mlp": 0.01253346, + "epoch": 0.5986472268149707, + "flos": 19615802123520.0, + "grad_norm": 2.031153762409854, + "language_loss": 0.74002242, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.81690115, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.10699463, + "step": 9957, + "time_per_iteration": 2.5518100261688232 + }, + { + "auxiliary_loss_clip": 0.06412163, + "auxiliary_loss_mlp": 0.01266872, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01256071, + "epoch": 0.5987073500676386, + "flos": 21800371847040.0, + "grad_norm": 1.7255020808995434, + "language_loss": 0.84429491, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.92108524, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10803223, + "step": 9958, + "time_per_iteration": 2.5053975582122803 + }, + { + "auxiliary_loss_clip": 0.06420925, + "auxiliary_loss_mlp": 0.01267847, + "balance_loss_clip": 0.06275073, + "balance_loss_mlp": 0.01256594, + "epoch": 0.5987674733203067, + "flos": 24320909715840.0, + "grad_norm": 1.676255529467866, + "language_loss": 0.66404957, + "learning_rate": 1.463921122471864e-06, + "loss": 0.74093723, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11248779, + "step": 9959, + "time_per_iteration": 2.577558994293213 + }, + { + "auxiliary_loss_clip": 0.06423216, + "auxiliary_loss_mlp": 0.01263705, + "balance_loss_clip": 0.06278974, + "balance_loss_mlp": 0.01253418, + "epoch": 0.5988275965729746, + "flos": 21325859775360.0, + "grad_norm": 1.5343309289681366, + "language_loss": 0.83860743, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.91547662, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10296631, + "step": 9960, + "time_per_iteration": 2.5171096324920654 + }, + { + "auxiliary_loss_clip": 0.06416292, + "auxiliary_loss_mlp": 0.01266192, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01254927, + "epoch": 0.5988877198256426, + "flos": 25124891241600.0, + "grad_norm": 1.3977520489587403, + "language_loss": 0.79645187, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.87327671, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11260986, + "step": 9961, + "time_per_iteration": 2.5664830207824707 + }, + { + "auxiliary_loss_clip": 0.06418522, + "auxiliary_loss_mlp": 0.01263845, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01253337, + "epoch": 0.5989478430783105, + "flos": 26435767242240.0, + "grad_norm": 1.8145848373023497, + "language_loss": 0.67511421, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.75193793, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10516357, + "step": 9962, + "time_per_iteration": 2.5658552646636963 + }, + { + "auxiliary_loss_clip": 0.06419411, + "auxiliary_loss_mlp": 0.01269677, + "balance_loss_clip": 0.06275185, + "balance_loss_mlp": 0.01258698, + "epoch": 0.5990079663309785, + "flos": 25786010355840.0, + "grad_norm": 1.2715525883777674, + "language_loss": 0.74696618, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.82385707, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10980225, + "step": 9963, + "time_per_iteration": 2.5959842205047607 + }, + { + "auxiliary_loss_clip": 0.06414087, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273367, + "balance_loss_mlp": 0.01255494, + "epoch": 0.5990680895836464, + "flos": 36840889480320.0, + "grad_norm": 1.7000475586235915, + "language_loss": 0.68318057, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.75998235, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10601807, + "step": 9964, + "time_per_iteration": 2.652066230773926 + }, + { + "auxiliary_loss_clip": 0.06415234, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06276559, + "balance_loss_mlp": 0.01256219, + "epoch": 0.5991282128363145, + "flos": 24140340167040.0, + "grad_norm": 1.9446201927807645, + "language_loss": 0.77307773, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.84989786, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10559082, + "step": 9965, + "time_per_iteration": 2.5652666091918945 + }, + { + "auxiliary_loss_clip": 0.0641766, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254076, + "epoch": 0.5991883360889824, + "flos": 10308310444800.0, + "grad_norm": 2.43508720605834, + "language_loss": 0.77253437, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.8493613, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10955811, + "step": 9966, + "time_per_iteration": 3.8983960151672363 + }, + { + "auxiliary_loss_clip": 0.06418956, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06277221, + "balance_loss_mlp": 0.01254462, + "epoch": 0.5992484593416504, + "flos": 23957967755520.0, + "grad_norm": 1.382537362814459, + "language_loss": 0.73829538, + "learning_rate": 1.460920090376422e-06, + "loss": 0.81513047, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10095215, + "step": 9967, + "time_per_iteration": 2.55789852142334 + }, + { + "auxiliary_loss_clip": 0.06430869, + "auxiliary_loss_mlp": 0.01269853, + "balance_loss_clip": 0.06279887, + "balance_loss_mlp": 0.01258177, + "epoch": 0.5993085825943184, + "flos": 11948320483200.0, + "grad_norm": 2.02451624384261, + "language_loss": 0.69043863, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.76744592, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11669922, + "step": 9968, + "time_per_iteration": 2.4782519340515137 + }, + { + "auxiliary_loss_clip": 0.06417669, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06270653, + "balance_loss_mlp": 0.01253926, + "epoch": 0.5993687058469863, + "flos": 19032990249600.0, + "grad_norm": 1.5128271497944086, + "language_loss": 0.79284239, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.86967438, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11608887, + "step": 9969, + "time_per_iteration": 2.5151612758636475 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.0125369, + "epoch": 0.5994288290996543, + "flos": 14288204949120.0, + "grad_norm": 1.5374697799261579, + "language_loss": 0.81015587, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.88697076, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11383057, + "step": 9970, + "time_per_iteration": 2.5037295818328857 + }, + { + "auxiliary_loss_clip": 0.06425726, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06278643, + "balance_loss_mlp": 0.01253136, + "epoch": 0.5994889523523222, + "flos": 19212385841280.0, + "grad_norm": 1.7784771847806544, + "language_loss": 0.6253432, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.70225984, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.12805176, + "step": 9971, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01252121, + "epoch": 0.5995490756049903, + "flos": 28044401126400.0, + "grad_norm": 1.5809560666799003, + "language_loss": 0.79321986, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.86999381, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 9972, + "time_per_iteration": 2.5908236503601074 + }, + { + "auxiliary_loss_clip": 0.06426332, + "auxiliary_loss_mlp": 0.01268265, + "balance_loss_clip": 0.06275315, + "balance_loss_mlp": 0.01256595, + "epoch": 0.5996091988576582, + "flos": 29059531741440.0, + "grad_norm": 2.0347749890566957, + "language_loss": 0.76122165, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.83816767, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.11663818, + "step": 9973, + "time_per_iteration": 4.03744912147522 + }, + { + "auxiliary_loss_clip": 0.06415765, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258048, + "epoch": 0.5996693221103262, + "flos": 20820306965760.0, + "grad_norm": 8.14230844682113, + "language_loss": 0.65456331, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.73141098, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10961914, + "step": 9974, + "time_per_iteration": 2.545727491378784 + }, + { + "auxiliary_loss_clip": 0.06421987, + "auxiliary_loss_mlp": 0.01267073, + "balance_loss_clip": 0.06277154, + "balance_loss_mlp": 0.0125607, + "epoch": 0.5997294453629941, + "flos": 23775679198080.0, + "grad_norm": 1.6348808694128185, + "language_loss": 0.74560261, + "learning_rate": 1.457920366566428e-06, + "loss": 0.8224932, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11010742, + "step": 9975, + "time_per_iteration": 2.515960931777954 + }, + { + "auxiliary_loss_clip": 0.06416074, + "auxiliary_loss_mlp": 0.01267839, + "balance_loss_clip": 0.06272042, + "balance_loss_mlp": 0.01256985, + "epoch": 0.5997895686156621, + "flos": 20966397759360.0, + "grad_norm": 1.627086760059136, + "language_loss": 0.77381539, + "learning_rate": 1.457545493441611e-06, + "loss": 0.85065448, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10864258, + "step": 9976, + "time_per_iteration": 2.5143842697143555 + }, + { + "auxiliary_loss_clip": 0.06419265, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06276691, + "balance_loss_mlp": 0.01255162, + "epoch": 0.59984969186833, + "flos": 28372864331520.0, + "grad_norm": 2.2336999868815837, + "language_loss": 0.75166976, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.82852209, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 9977, + "time_per_iteration": 2.5434179306030273 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01258358, + "epoch": 0.5999098151209981, + "flos": 22572641802240.0, + "grad_norm": 1.5140714638849335, + "language_loss": 0.69135988, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.76823664, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11499023, + "step": 9978, + "time_per_iteration": 3.9952354431152344 + }, + { + "auxiliary_loss_clip": 0.06421594, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06274537, + "balance_loss_mlp": 0.01254977, + "epoch": 0.599969938373666, + "flos": 18774365022720.0, + "grad_norm": 1.8838130799328623, + "language_loss": 0.81737733, + "learning_rate": 1.456420997543594e-06, + "loss": 0.89425546, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11236572, + "step": 9979, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06274675, + "balance_loss_mlp": 0.01257239, + "epoch": 0.600030061626334, + "flos": 11331910321920.0, + "grad_norm": 1.7106471218945785, + "language_loss": 0.70199746, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.77879798, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10314941, + "step": 9980, + "time_per_iteration": 2.4757728576660156 + }, + { + "auxiliary_loss_clip": 0.06423149, + "auxiliary_loss_mlp": 0.01269991, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01258463, + "epoch": 0.600090184879002, + "flos": 16583799732480.0, + "grad_norm": 2.417469697653489, + "language_loss": 0.690139, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.76707041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.11523438, + "step": 9981, + "time_per_iteration": 2.4791438579559326 + }, + { + "auxiliary_loss_clip": 0.0641709, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.0627474, + "balance_loss_mlp": 0.01255342, + "epoch": 0.6001503081316699, + "flos": 23624641013760.0, + "grad_norm": 3.5503488009813275, + "language_loss": 0.78682542, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.86365318, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10345459, + "step": 9982, + "time_per_iteration": 2.517265796661377 + }, + { + "auxiliary_loss_clip": 0.06418465, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06276916, + "balance_loss_mlp": 0.01258852, + "epoch": 0.6002104313843379, + "flos": 20673922682880.0, + "grad_norm": 1.4834511581102687, + "language_loss": 0.72993171, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.80681169, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10681152, + "step": 9983, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06419442, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.0627455, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6002705546370058, + "flos": 22461742523520.0, + "grad_norm": 1.817313812044092, + "language_loss": 0.77973288, + "learning_rate": 1.454547250154447e-06, + "loss": 0.85658008, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10998535, + "step": 9984, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.06414619, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06271429, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6003306778896739, + "flos": 25199005777920.0, + "grad_norm": 1.5215747487142872, + "language_loss": 0.83512825, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.91195202, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.11199951, + "step": 9985, + "time_per_iteration": 2.575650691986084 + }, + { + "auxiliary_loss_clip": 0.06417745, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06274939, + "balance_loss_mlp": 0.01260666, + "epoch": 0.6003908011423418, + "flos": 26694979447680.0, + "grad_norm": 1.7185413261664646, + "language_loss": 0.71617854, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.79306406, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10144043, + "step": 9986, + "time_per_iteration": 2.603126287460327 + }, + { + "auxiliary_loss_clip": 0.06418968, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 0.0627557, + "balance_loss_mlp": 0.01259451, + "epoch": 0.6004509243950098, + "flos": 22571677480320.0, + "grad_norm": 1.4916160282529034, + "language_loss": 0.72118956, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.79808438, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11077881, + "step": 9987, + "time_per_iteration": 2.5536653995513916 + }, + { + "auxiliary_loss_clip": 0.06410448, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01257741, + "epoch": 0.6005110476476777, + "flos": 19725443591040.0, + "grad_norm": 1.6002442710001008, + "language_loss": 0.85169375, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.92847788, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10223389, + "step": 9988, + "time_per_iteration": 2.676584482192993 + }, + { + "auxiliary_loss_clip": 0.06413879, + "auxiliary_loss_mlp": 0.01268869, + "balance_loss_clip": 0.06271169, + "balance_loss_mlp": 0.0125783, + "epoch": 0.6005711709003457, + "flos": 17718340815360.0, + "grad_norm": 1.8176771569563623, + "language_loss": 0.66009402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.73692149, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1104126, + "step": 9989, + "time_per_iteration": 2.486422300338745 + }, + { + "auxiliary_loss_clip": 0.06419196, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06276186, + "balance_loss_mlp": 0.01256288, + "epoch": 0.6006312941530136, + "flos": 18520267916160.0, + "grad_norm": 1.406905965203465, + "language_loss": 0.80891693, + "learning_rate": 1.452299436003257e-06, + "loss": 0.88577515, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10327148, + "step": 9990, + "time_per_iteration": 2.535477876663208 + }, + { + "auxiliary_loss_clip": 0.06421524, + "auxiliary_loss_mlp": 0.01272993, + "balance_loss_clip": 0.06275146, + "balance_loss_mlp": 0.01261829, + "epoch": 0.6006914174056817, + "flos": 21396117024000.0, + "grad_norm": 2.6934120952656557, + "language_loss": 0.82880741, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.9057526, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11157227, + "step": 9991, + "time_per_iteration": 2.518101215362549 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06272052, + "balance_loss_mlp": 0.01256012, + "epoch": 0.6007515406583496, + "flos": 12755488464000.0, + "grad_norm": 1.8815822669797526, + "language_loss": 0.83029675, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.90708888, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11096191, + "step": 9992, + "time_per_iteration": 2.521474599838257 + }, + { + "auxiliary_loss_clip": 0.06415074, + "auxiliary_loss_mlp": 0.01267357, + "balance_loss_clip": 0.06272477, + "balance_loss_mlp": 0.01256098, + "epoch": 0.6008116639110176, + "flos": 19212679330560.0, + "grad_norm": 1.7865103371256597, + "language_loss": 0.66380614, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.74063051, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11260986, + "step": 9993, + "time_per_iteration": 2.4865942001342773 + }, + { + "auxiliary_loss_clip": 0.0641458, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.0125633, + "epoch": 0.6008717871636855, + "flos": 17060953207680.0, + "grad_norm": 2.3852752129116115, + "language_loss": 0.81380951, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.89062685, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1083374, + "step": 9994, + "time_per_iteration": 2.500990390777588 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01267464, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009319104163535, + "flos": 20304188542080.0, + "grad_norm": 1.763050873993328, + "language_loss": 0.72585195, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.8026247, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10229492, + "step": 9995, + "time_per_iteration": 2.482269287109375 + }, + { + "auxiliary_loss_clip": 0.06416491, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06272282, + "balance_loss_mlp": 0.01257242, + "epoch": 0.6009920336690215, + "flos": 21843487572480.0, + "grad_norm": 1.6604568353476683, + "language_loss": 0.81016338, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.88700801, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10736084, + "step": 9996, + "time_per_iteration": 2.5466809272766113 + }, + { + "auxiliary_loss_clip": 0.06416655, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 0.06274925, + "balance_loss_mlp": 0.01259772, + "epoch": 0.6010521569216895, + "flos": 22601795823360.0, + "grad_norm": 1.669746646683285, + "language_loss": 0.79055232, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.86742181, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10522461, + "step": 9997, + "time_per_iteration": 2.489703416824341 + }, + { + "auxiliary_loss_clip": 0.06421417, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.0125496, + "epoch": 0.6011122801743575, + "flos": 19177697450880.0, + "grad_norm": 1.7167006806270684, + "language_loss": 0.72813851, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.80501544, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11315918, + "step": 9998, + "time_per_iteration": 2.5477771759033203 + }, + { + "auxiliary_loss_clip": 0.06413899, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.0627325, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6011724034270254, + "flos": 25017094563840.0, + "grad_norm": 1.4177411729498055, + "language_loss": 0.72547859, + "learning_rate": 1.448929117633027e-06, + "loss": 0.80226737, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10302734, + "step": 9999, + "time_per_iteration": 2.658071517944336 + }, + { + "auxiliary_loss_clip": 0.06419925, + "auxiliary_loss_mlp": 0.0126529, + "balance_loss_clip": 0.06273222, + "balance_loss_mlp": 0.01253948, + "epoch": 0.6012325266796934, + "flos": 21803935645440.0, + "grad_norm": 1.3735035595460474, + "language_loss": 0.78419137, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.86104351, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11346436, + "step": 10000, + "time_per_iteration": 2.6216328144073486 + }, + { + "auxiliary_loss_clip": 0.06423375, + "auxiliary_loss_mlp": 0.0126636, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01253974, + "epoch": 0.6012926499323613, + "flos": 19579059308160.0, + "grad_norm": 2.6942443051056797, + "language_loss": 0.77449071, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.85138798, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1239624, + "step": 10001, + "time_per_iteration": 2.4916481971740723 + }, + { + "auxiliary_loss_clip": 0.06419365, + "auxiliary_loss_mlp": 0.01264494, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01253479, + "epoch": 0.6013527731850293, + "flos": 34869439416960.0, + "grad_norm": 2.005983259780714, + "language_loss": 0.59280682, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.66964543, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.11004639, + "step": 10002, + "time_per_iteration": 2.6645169258117676 + }, + { + "auxiliary_loss_clip": 0.06426313, + "auxiliary_loss_mlp": 0.01266842, + "balance_loss_clip": 0.06280068, + "balance_loss_mlp": 0.01255636, + "epoch": 0.6014128964376972, + "flos": 23298190306560.0, + "grad_norm": 1.4832163301855164, + "language_loss": 0.78208435, + "learning_rate": 1.447431741055314e-06, + "loss": 0.85901594, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11199951, + "step": 10003, + "time_per_iteration": 2.5180611610412598 + }, + { + "auxiliary_loss_clip": 0.0641861, + "auxiliary_loss_mlp": 0.01265947, + "balance_loss_clip": 0.06273924, + "balance_loss_mlp": 0.01254503, + "epoch": 0.6014730196903653, + "flos": 24826839868800.0, + "grad_norm": 2.3891485516500857, + "language_loss": 0.77473211, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.8515777, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11437988, + "step": 10004, + "time_per_iteration": 2.6330173015594482 + }, + { + "auxiliary_loss_clip": 0.06419056, + "auxiliary_loss_mlp": 0.01264798, + "balance_loss_clip": 0.06274185, + "balance_loss_mlp": 0.01253622, + "epoch": 0.6015331429430332, + "flos": 23119046277120.0, + "grad_norm": 1.439097178617253, + "language_loss": 0.72748709, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.80432558, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11187744, + "step": 10005, + "time_per_iteration": 3.9784722328186035 + }, + { + "auxiliary_loss_clip": 0.06408843, + "auxiliary_loss_mlp": 0.01267392, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01257415, + "epoch": 0.6015932661957012, + "flos": 19206222566400.0, + "grad_norm": 2.0810783182593453, + "language_loss": 0.75111496, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.82787728, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10006, + "time_per_iteration": 2.479973793029785 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01266799, + "balance_loss_clip": 0.06270643, + "balance_loss_mlp": 0.01255659, + "epoch": 0.6016533894483691, + "flos": 18119451110400.0, + "grad_norm": 1.7404924752402045, + "language_loss": 0.74258769, + "learning_rate": 1.445934699732685e-06, + "loss": 0.8193953, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1114502, + "step": 10007, + "time_per_iteration": 2.514868974685669 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01265594, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01254161, + "epoch": 0.6017135127010371, + "flos": 16222492926720.0, + "grad_norm": 1.6904603378944318, + "language_loss": 0.70442504, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.78122854, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11425781, + "step": 10008, + "time_per_iteration": 2.491718053817749 + }, + { + "auxiliary_loss_clip": 0.0641681, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06274457, + "balance_loss_mlp": 0.01254291, + "epoch": 0.6017736359537051, + "flos": 23451576405120.0, + "grad_norm": 1.626126690886893, + "language_loss": 0.7634151, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.84022784, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10186768, + "step": 10009, + "time_per_iteration": 2.599497079849243 + }, + { + "auxiliary_loss_clip": 0.06414296, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.01256455, + "epoch": 0.601833759206373, + "flos": 23520869331840.0, + "grad_norm": 2.016447610820272, + "language_loss": 0.73958981, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.8164103, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11279297, + "step": 10010, + "time_per_iteration": 2.542102098464966 + }, + { + "auxiliary_loss_clip": 0.06320257, + "auxiliary_loss_mlp": 0.01264863, + "balance_loss_clip": 0.06261265, + "balance_loss_mlp": 0.01263333, + "epoch": 0.6018938824590411, + "flos": 64013846215680.0, + "grad_norm": 0.9512553520354263, + "language_loss": 0.55134046, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.6271916, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.59130859, + "router_z_loss_mlp": 0.01529694, + "step": 10011, + "time_per_iteration": 3.219438076019287 + }, + { + "auxiliary_loss_clip": 0.064165, + "auxiliary_loss_mlp": 0.01266395, + "balance_loss_clip": 0.06272937, + "balance_loss_mlp": 0.01256233, + "epoch": 0.601954005711709, + "flos": 34648311692160.0, + "grad_norm": 1.3620910382501825, + "language_loss": 0.6241864, + "learning_rate": 1.44406387091556e-06, + "loss": 0.70101535, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1015625, + "step": 10012, + "time_per_iteration": 4.187492609024048 + }, + { + "auxiliary_loss_clip": 0.06412341, + "auxiliary_loss_mlp": 0.01261432, + "balance_loss_clip": 0.06271702, + "balance_loss_mlp": 0.0125155, + "epoch": 0.602014128964377, + "flos": 19433094295680.0, + "grad_norm": 1.6346863878236784, + "language_loss": 0.75188845, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.82862616, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09881592, + "step": 10013, + "time_per_iteration": 2.4897818565368652 + }, + { + "auxiliary_loss_clip": 0.06409096, + "auxiliary_loss_mlp": 0.01262449, + "balance_loss_clip": 0.06273073, + "balance_loss_mlp": 0.01252823, + "epoch": 0.6020742522170449, + "flos": 28336876202880.0, + "grad_norm": 1.4752372512859242, + "language_loss": 0.81565046, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.89236587, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09625244, + "step": 10014, + "time_per_iteration": 2.5903513431549072 + }, + { + "auxiliary_loss_clip": 0.06408108, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01252617, + "epoch": 0.6021343754697129, + "flos": 22753588694400.0, + "grad_norm": 1.6084117246958012, + "language_loss": 0.72432387, + "learning_rate": 1.442941626485624e-06, + "loss": 0.80103159, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10046387, + "step": 10015, + "time_per_iteration": 2.5320956707000732 + }, + { + "auxiliary_loss_clip": 0.06313504, + "auxiliary_loss_mlp": 0.01271116, + "balance_loss_clip": 0.06254423, + "balance_loss_mlp": 0.01269587, + "epoch": 0.6021944987223808, + "flos": 65769885360000.0, + "grad_norm": 0.8212846281484271, + "language_loss": 0.54902303, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.62486923, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.01528168, + "step": 10016, + "time_per_iteration": 3.0691990852355957 + }, + { + "auxiliary_loss_clip": 0.06413935, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06274504, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6022546219750489, + "flos": 16110377763840.0, + "grad_norm": 1.6476177539901398, + "language_loss": 0.82975459, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.90655655, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10498047, + "step": 10017, + "time_per_iteration": 4.000306606292725 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06276649, + "balance_loss_mlp": 0.01257465, + "epoch": 0.6023147452277168, + "flos": 25518328888320.0, + "grad_norm": 1.7212842530240955, + "language_loss": 0.83736604, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.91417325, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10198975, + "step": 10018, + "time_per_iteration": 2.5354957580566406 + }, + { + "auxiliary_loss_clip": 0.06423128, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06276394, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6023748684803848, + "flos": 22642353999360.0, + "grad_norm": 1.5941982193166335, + "language_loss": 0.78464353, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.86153316, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11096191, + "step": 10019, + "time_per_iteration": 2.534315586090088 + }, + { + "auxiliary_loss_clip": 0.06414038, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01253433, + "epoch": 0.6024349917330527, + "flos": 26217113212800.0, + "grad_norm": 1.7295998133508477, + "language_loss": 0.7397396, + "learning_rate": 1.441071641765681e-06, + "loss": 0.81652176, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10020, + "time_per_iteration": 2.5745153427124023 + }, + { + "auxiliary_loss_clip": 0.06419009, + "auxiliary_loss_mlp": 0.01267121, + "balance_loss_clip": 0.06276802, + "balance_loss_mlp": 0.01256875, + "epoch": 0.6024951149857207, + "flos": 21258160076160.0, + "grad_norm": 1.6276524527254101, + "language_loss": 0.64517641, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.72203767, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10247803, + "step": 10021, + "time_per_iteration": 2.5457210540771484 + }, + { + "auxiliary_loss_clip": 0.06415432, + "auxiliary_loss_mlp": 0.01267969, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01256776, + "epoch": 0.6025552382383887, + "flos": 26950795562880.0, + "grad_norm": 1.4058190289621155, + "language_loss": 0.80931878, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.88615286, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11187744, + "step": 10022, + "time_per_iteration": 4.0118248462677 + }, + { + "auxiliary_loss_clip": 0.06419462, + "auxiliary_loss_mlp": 0.0126571, + "balance_loss_clip": 0.06273965, + "balance_loss_mlp": 0.01255089, + "epoch": 0.6026153614910567, + "flos": 31692142846080.0, + "grad_norm": 1.4147504892998892, + "language_loss": 0.66787559, + "learning_rate": 1.439949905155693e-06, + "loss": 0.74472731, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10620117, + "step": 10023, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0642107, + "auxiliary_loss_mlp": 0.01268575, + "balance_loss_clip": 0.06277968, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6026754847437247, + "flos": 29320085612160.0, + "grad_norm": 1.6857710992723132, + "language_loss": 0.73865843, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.81555492, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11029053, + "step": 10024, + "time_per_iteration": 2.5943942070007324 + }, + { + "auxiliary_loss_clip": 0.06414223, + "auxiliary_loss_mlp": 0.01264046, + "balance_loss_clip": 0.06273946, + "balance_loss_mlp": 0.01253454, + "epoch": 0.6027356079963926, + "flos": 23593558348800.0, + "grad_norm": 1.5719504936966129, + "language_loss": 0.72838885, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.80517155, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.105896, + "step": 10025, + "time_per_iteration": 2.5456719398498535 + }, + { + "auxiliary_loss_clip": 0.06421927, + "auxiliary_loss_mlp": 0.0126511, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01253469, + "epoch": 0.6027957312490606, + "flos": 20820055403520.0, + "grad_norm": 2.0657942826528526, + "language_loss": 0.67852134, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.75539172, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.11651611, + "step": 10026, + "time_per_iteration": 2.598649024963379 + }, + { + "auxiliary_loss_clip": 0.06409953, + "auxiliary_loss_mlp": 0.0126467, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01254794, + "epoch": 0.6028558545017285, + "flos": 19941540071040.0, + "grad_norm": 1.6702920817519378, + "language_loss": 0.80409044, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.88083661, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 10027, + "time_per_iteration": 2.4931211471557617 + }, + { + "auxiliary_loss_clip": 0.06421126, + "auxiliary_loss_mlp": 0.01265388, + "balance_loss_clip": 0.06276809, + "balance_loss_mlp": 0.01254516, + "epoch": 0.6029159777543965, + "flos": 22827535522560.0, + "grad_norm": 2.164274421178336, + "language_loss": 0.71328938, + "learning_rate": 1.438080769071171e-06, + "loss": 0.79015452, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.10876465, + "step": 10028, + "time_per_iteration": 2.5468251705169678 + }, + { + "auxiliary_loss_clip": 0.06418602, + "auxiliary_loss_mlp": 0.01267926, + "balance_loss_clip": 0.06276453, + "balance_loss_mlp": 0.01256911, + "epoch": 0.6029761010070644, + "flos": 23594103400320.0, + "grad_norm": 1.6575222347679248, + "language_loss": 0.84050506, + "learning_rate": 1.437707005721669e-06, + "loss": 0.91737038, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11016846, + "step": 10029, + "time_per_iteration": 2.529097557067871 + }, + { + "auxiliary_loss_clip": 0.06414534, + "auxiliary_loss_mlp": 0.01271064, + "balance_loss_clip": 0.06275196, + "balance_loss_mlp": 0.01261146, + "epoch": 0.6030362242597325, + "flos": 13667518229760.0, + "grad_norm": 1.639514659773033, + "language_loss": 0.800816, + "learning_rate": 1.437333263694373e-06, + "loss": 0.8776719, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09918213, + "step": 10030, + "time_per_iteration": 2.527984619140625 + }, + { + "auxiliary_loss_clip": 0.06420292, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277453, + "balance_loss_mlp": 0.01256595, + "epoch": 0.6030963475124004, + "flos": 24429293372160.0, + "grad_norm": 1.55352827539933, + "language_loss": 0.71218026, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.7890541, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.1050415, + "step": 10031, + "time_per_iteration": 2.5585272312164307 + }, + { + "auxiliary_loss_clip": 0.06422323, + "auxiliary_loss_mlp": 0.01265322, + "balance_loss_clip": 0.06275461, + "balance_loss_mlp": 0.01253592, + "epoch": 0.6031564707650684, + "flos": 29651944907520.0, + "grad_norm": 1.5252565411095604, + "language_loss": 0.73936534, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.81624174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11730957, + "step": 10032, + "time_per_iteration": 2.6043312549591064 + }, + { + "auxiliary_loss_clip": 0.06425112, + "auxiliary_loss_mlp": 0.01269372, + "balance_loss_clip": 0.06280036, + "balance_loss_mlp": 0.01258124, + "epoch": 0.6032165940177363, + "flos": 16624525616640.0, + "grad_norm": 1.652390402199518, + "language_loss": 0.68466848, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.76161331, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11242676, + "step": 10033, + "time_per_iteration": 2.4788658618927 + }, + { + "auxiliary_loss_clip": 0.06415801, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06276157, + "balance_loss_mlp": 0.01256676, + "epoch": 0.6032767172704043, + "flos": 17493020386560.0, + "grad_norm": 2.062963272365632, + "language_loss": 0.76036859, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.83720237, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10900879, + "step": 10034, + "time_per_iteration": 2.5080766677856445 + }, + { + "auxiliary_loss_clip": 0.06421614, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06277142, + "balance_loss_mlp": 0.01257668, + "epoch": 0.6033368405230723, + "flos": 26840105919360.0, + "grad_norm": 1.6546972875454138, + "language_loss": 0.74774975, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.82465017, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10754395, + "step": 10035, + "time_per_iteration": 2.563206434249878 + }, + { + "auxiliary_loss_clip": 0.06417766, + "auxiliary_loss_mlp": 0.0126329, + "balance_loss_clip": 0.06278257, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6033969637757403, + "flos": 16915575173760.0, + "grad_norm": 1.5348173305795916, + "language_loss": 0.86666334, + "learning_rate": 1.435091260090536e-06, + "loss": 0.94347388, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10036, + "time_per_iteration": 2.5237104892730713 + }, + { + "auxiliary_loss_clip": 0.06422649, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06279111, + "balance_loss_mlp": 0.01253641, + "epoch": 0.6034570870284083, + "flos": 22936757719680.0, + "grad_norm": 1.8203362960867906, + "language_loss": 0.70372736, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.78060424, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11401367, + "step": 10037, + "time_per_iteration": 2.5395092964172363 + }, + { + "auxiliary_loss_clip": 0.06418501, + "auxiliary_loss_mlp": 0.0126923, + "balance_loss_clip": 0.06279185, + "balance_loss_mlp": 0.01258603, + "epoch": 0.6035172102810762, + "flos": 23372807967360.0, + "grad_norm": 1.59892513624744, + "language_loss": 0.85074937, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.92762661, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10626221, + "step": 10038, + "time_per_iteration": 2.5844480991363525 + }, + { + "auxiliary_loss_clip": 0.06419212, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 0.06275028, + "balance_loss_mlp": 0.01257786, + "epoch": 0.6035773335337442, + "flos": 20893457180160.0, + "grad_norm": 2.8819957775512757, + "language_loss": 0.77070892, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.8475827, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1038208, + "step": 10039, + "time_per_iteration": 2.5122628211975098 + }, + { + "auxiliary_loss_clip": 0.06415309, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01253981, + "epoch": 0.6036374567864121, + "flos": 24943231589760.0, + "grad_norm": 1.5604135097118987, + "language_loss": 0.71224856, + "learning_rate": 1.433597019260301e-06, + "loss": 0.78904456, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10296631, + "step": 10040, + "time_per_iteration": 2.571869373321533 + }, + { + "auxiliary_loss_clip": 0.06419596, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06274244, + "balance_loss_mlp": 0.01256627, + "epoch": 0.6036975800390801, + "flos": 23154866697600.0, + "grad_norm": 1.8943612239225145, + "language_loss": 0.7865687, + "learning_rate": 1.433223512712475e-06, + "loss": 0.86344838, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11749268, + "step": 10041, + "time_per_iteration": 2.4987337589263916 + }, + { + "auxiliary_loss_clip": 0.0641794, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.0627731, + "balance_loss_mlp": 0.01254026, + "epoch": 0.603757703291748, + "flos": 18666610272000.0, + "grad_norm": 4.973303913397253, + "language_loss": 0.75757015, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.83439338, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10357666, + "step": 10042, + "time_per_iteration": 2.5307700634002686 + }, + { + "auxiliary_loss_clip": 0.06414665, + "auxiliary_loss_mlp": 0.0126551, + "balance_loss_clip": 0.06273496, + "balance_loss_mlp": 0.01254477, + "epoch": 0.6038178265444161, + "flos": 19688700775680.0, + "grad_norm": 1.7644311631125091, + "language_loss": 0.84805411, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.92485589, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1104126, + "step": 10043, + "time_per_iteration": 2.483207941055298 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06272442, + "balance_loss_mlp": 0.01256034, + "epoch": 0.603877949797084, + "flos": 22644869621760.0, + "grad_norm": 1.873589684997381, + "language_loss": 0.69873232, + "learning_rate": 1.432103122078974e-06, + "loss": 0.77559316, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.1161499, + "step": 10044, + "time_per_iteration": 3.940486192703247 + }, + { + "auxiliary_loss_clip": 0.0642198, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 0.06277104, + "balance_loss_mlp": 0.01254168, + "epoch": 0.603938073049752, + "flos": 25455031528320.0, + "grad_norm": 2.2351691288080966, + "language_loss": 0.77851117, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.85538936, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.11669922, + "step": 10045, + "time_per_iteration": 2.5411202907562256 + }, + { + "auxiliary_loss_clip": 0.06414884, + "auxiliary_loss_mlp": 0.01268718, + "balance_loss_clip": 0.06274995, + "balance_loss_mlp": 0.01257697, + "epoch": 0.6039981963024199, + "flos": 22345686218880.0, + "grad_norm": 1.7669017569149148, + "language_loss": 0.77354729, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.85038328, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11010742, + "step": 10046, + "time_per_iteration": 2.6118433475494385 + }, + { + "auxiliary_loss_clip": 0.064179, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06273997, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6040583195550879, + "flos": 20709239978880.0, + "grad_norm": 1.4772024450084065, + "language_loss": 0.87242824, + "learning_rate": 1.430982925257827e-06, + "loss": 0.94927108, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.09960938, + "step": 10047, + "time_per_iteration": 2.5964560508728027 + }, + { + "auxiliary_loss_clip": 0.06416798, + "auxiliary_loss_mlp": 0.01263003, + "balance_loss_clip": 0.06279427, + "balance_loss_mlp": 0.01252459, + "epoch": 0.604118442807756, + "flos": 27170623549440.0, + "grad_norm": 1.57099000963109, + "language_loss": 0.76137155, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.83816957, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10540771, + "step": 10048, + "time_per_iteration": 2.619131326675415 + }, + { + "auxiliary_loss_clip": 0.06423929, + "auxiliary_loss_mlp": 0.01267255, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125512, + "epoch": 0.6041785660604239, + "flos": 30889125642240.0, + "grad_norm": 2.0836935767176508, + "language_loss": 0.66702586, + "learning_rate": 1.430236235239386e-06, + "loss": 0.74393767, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.12121582, + "step": 10049, + "time_per_iteration": 2.650125741958618 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01268699, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.0125769, + "epoch": 0.6042386893130919, + "flos": 19944391109760.0, + "grad_norm": 1.425076043351067, + "language_loss": 0.6651637, + "learning_rate": 1.429862922631336e-06, + "loss": 0.74199045, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10050, + "time_per_iteration": 2.523010015487671 + }, + { + "auxiliary_loss_clip": 0.06421351, + "auxiliary_loss_mlp": 0.01263894, + "balance_loss_clip": 0.06279106, + "balance_loss_mlp": 0.01252956, + "epoch": 0.6042988125657598, + "flos": 32424106187520.0, + "grad_norm": 1.5652221823172618, + "language_loss": 0.70055592, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.7774083, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10949707, + "step": 10051, + "time_per_iteration": 2.6328225135803223 + }, + { + "auxiliary_loss_clip": 0.06413503, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.062729, + "balance_loss_mlp": 0.01253167, + "epoch": 0.6043589358184278, + "flos": 17426578498560.0, + "grad_norm": 1.814191650563656, + "language_loss": 0.64989793, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.72668123, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11651611, + "step": 10052, + "time_per_iteration": 4.032447814941406 + }, + { + "auxiliary_loss_clip": 0.06422505, + "auxiliary_loss_mlp": 0.01270462, + "balance_loss_clip": 0.06280071, + "balance_loss_mlp": 0.01259275, + "epoch": 0.6044190590710957, + "flos": 27680243281920.0, + "grad_norm": 1.5013537444726899, + "language_loss": 0.69046491, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.76739454, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11187744, + "step": 10053, + "time_per_iteration": 2.5837066173553467 + }, + { + "auxiliary_loss_clip": 0.06317958, + "auxiliary_loss_mlp": 0.01251886, + "balance_loss_clip": 0.06259381, + "balance_loss_mlp": 0.01250314, + "epoch": 0.6044791823237637, + "flos": 65334422090880.0, + "grad_norm": 0.7098963484594624, + "language_loss": 0.60469133, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.68038976, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01572418, + "step": 10054, + "time_per_iteration": 3.282451868057251 + }, + { + "auxiliary_loss_clip": 0.0641373, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06275851, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6045393055764317, + "flos": 24498208955520.0, + "grad_norm": 1.4963816601479185, + "language_loss": 0.85832298, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.93512046, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10998535, + "step": 10055, + "time_per_iteration": 2.5359747409820557 + }, + { + "auxiliary_loss_clip": 0.06417194, + "auxiliary_loss_mlp": 0.01268307, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6045994288290997, + "flos": 19058999742720.0, + "grad_norm": 2.4042532312332243, + "language_loss": 0.74155682, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.81841183, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11895752, + "step": 10056, + "time_per_iteration": 2.5254933834075928 + }, + { + "auxiliary_loss_clip": 0.06408785, + "auxiliary_loss_mlp": 0.01263059, + "balance_loss_clip": 0.06271578, + "balance_loss_mlp": 0.01252926, + "epoch": 0.6046595520817676, + "flos": 26583660898560.0, + "grad_norm": 1.6233300173420022, + "language_loss": 0.80582207, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.88254052, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10137939, + "step": 10057, + "time_per_iteration": 3.975159168243408 + }, + { + "auxiliary_loss_clip": 0.06411809, + "auxiliary_loss_mlp": 0.01267453, + "balance_loss_clip": 0.06273948, + "balance_loss_mlp": 0.01256557, + "epoch": 0.6047196753344356, + "flos": 13586150315520.0, + "grad_norm": 2.1360006581590727, + "language_loss": 0.751284, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.82807666, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10900879, + "step": 10058, + "time_per_iteration": 2.519793748855591 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01253967, + "epoch": 0.6047797985871035, + "flos": 25527552837120.0, + "grad_norm": 1.8108696315105546, + "language_loss": 0.70813042, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.78491068, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11181641, + "step": 10059, + "time_per_iteration": 2.5327351093292236 + }, + { + "auxiliary_loss_clip": 0.06417379, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.01253538, + "epoch": 0.6048399218397715, + "flos": 20526112880640.0, + "grad_norm": 1.5165980047863354, + "language_loss": 0.76569366, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.84251177, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10888672, + "step": 10060, + "time_per_iteration": 2.5674891471862793 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06271071, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6049000450924396, + "flos": 20414416988160.0, + "grad_norm": 1.961791815817934, + "language_loss": 0.73817396, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.81496191, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10418701, + "step": 10061, + "time_per_iteration": 2.4917149543762207 + }, + { + "auxiliary_loss_clip": 0.06419303, + "auxiliary_loss_mlp": 0.0126307, + "balance_loss_clip": 0.06275985, + "balance_loss_mlp": 0.01252497, + "epoch": 0.6049601683451075, + "flos": 20747743729920.0, + "grad_norm": 1.6943031579927808, + "language_loss": 0.67628121, + "learning_rate": 1.425384861715639e-06, + "loss": 0.75310493, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10565186, + "step": 10062, + "time_per_iteration": 3.9096996784210205 + }, + { + "auxiliary_loss_clip": 0.06412483, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01254246, + "epoch": 0.6050202915977755, + "flos": 20089140238080.0, + "grad_norm": 1.9017616396263957, + "language_loss": 0.71490061, + "learning_rate": 1.425011831266978e-06, + "loss": 0.79168195, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11395264, + "step": 10063, + "time_per_iteration": 2.532278299331665 + }, + { + "auxiliary_loss_clip": 0.06410936, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01253858, + "epoch": 0.6050804148504434, + "flos": 15966257541120.0, + "grad_norm": 1.545014679780644, + "language_loss": 0.84818602, + "learning_rate": 1.424638822621926e-06, + "loss": 0.92493832, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10430908, + "step": 10064, + "time_per_iteration": 2.4977669715881348 + }, + { + "auxiliary_loss_clip": 0.06412817, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 0.06272112, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6051405381031114, + "flos": 17462315064960.0, + "grad_norm": 2.0946043423181293, + "language_loss": 0.801759, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.87853146, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10870361, + "step": 10065, + "time_per_iteration": 2.563521146774292 + }, + { + "auxiliary_loss_clip": 0.06424835, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06278696, + "balance_loss_mlp": 0.0125371, + "epoch": 0.6052006613557793, + "flos": 11404808974080.0, + "grad_norm": 1.8141288170700578, + "language_loss": 0.7897802, + "learning_rate": 1.423892870799226e-06, + "loss": 0.86667973, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11413574, + "step": 10066, + "time_per_iteration": 2.4816365242004395 + }, + { + "auxiliary_loss_clip": 0.0641356, + "auxiliary_loss_mlp": 0.0126889, + "balance_loss_clip": 0.06272712, + "balance_loss_mlp": 0.01257857, + "epoch": 0.6052607846084473, + "flos": 24757421160960.0, + "grad_norm": 1.6017965029602446, + "language_loss": 0.73526549, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.81208998, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1104126, + "step": 10067, + "time_per_iteration": 2.585381269454956 + }, + { + "auxiliary_loss_clip": 0.06416602, + "auxiliary_loss_mlp": 0.01267036, + "balance_loss_clip": 0.06275155, + "balance_loss_mlp": 0.01255646, + "epoch": 0.6053209078611153, + "flos": 20747492167680.0, + "grad_norm": 1.2388364270447627, + "language_loss": 0.68978894, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.76662529, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.1138916, + "step": 10068, + "time_per_iteration": 2.533571243286133 + }, + { + "auxiliary_loss_clip": 0.06416383, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6053810311137833, + "flos": 18959169202560.0, + "grad_norm": 2.164785155160147, + "language_loss": 0.87104344, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.94786203, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.1060791, + "step": 10069, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274873, + "balance_loss_mlp": 0.01255259, + "epoch": 0.6054411543664512, + "flos": 23957883901440.0, + "grad_norm": 1.623757415978513, + "language_loss": 0.83496463, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.91176546, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10662842, + "step": 10070, + "time_per_iteration": 2.528780221939087 + }, + { + "auxiliary_loss_clip": 0.06416136, + "auxiliary_loss_mlp": 0.0126614, + "balance_loss_clip": 0.06271877, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6055012776191192, + "flos": 20600101635840.0, + "grad_norm": 1.4904746237370996, + "language_loss": 0.86489964, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.94172239, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10858154, + "step": 10071, + "time_per_iteration": 2.538874387741089 + }, + { + "auxiliary_loss_clip": 0.06422232, + "auxiliary_loss_mlp": 0.01271365, + "balance_loss_clip": 0.06276511, + "balance_loss_mlp": 0.01259129, + "epoch": 0.6055614008717871, + "flos": 30305768716800.0, + "grad_norm": 1.8258498039752344, + "language_loss": 0.77371645, + "learning_rate": 1.421655540088603e-06, + "loss": 0.85065246, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12231445, + "step": 10072, + "time_per_iteration": 2.5658671855926514 + }, + { + "auxiliary_loss_clip": 0.06419331, + "auxiliary_loss_mlp": 0.01267468, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01255523, + "epoch": 0.6056215241244551, + "flos": 27132245579520.0, + "grad_norm": 1.5250709401817175, + "language_loss": 0.74363017, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.82049823, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11962891, + "step": 10073, + "time_per_iteration": 2.5838263034820557 + }, + { + "auxiliary_loss_clip": 0.06330025, + "auxiliary_loss_mlp": 0.01255009, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6056816473771232, + "flos": 56023073124480.0, + "grad_norm": 0.7392641743542041, + "language_loss": 0.55267042, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.62852079, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.58740234, + "router_z_loss_mlp": 0.01560211, + "step": 10074, + "time_per_iteration": 3.192260503768921 + }, + { + "auxiliary_loss_clip": 0.06416894, + "auxiliary_loss_mlp": 0.01266981, + "balance_loss_clip": 0.0627609, + "balance_loss_mlp": 0.01256353, + "epoch": 0.6057417706297911, + "flos": 23556144700800.0, + "grad_norm": 1.6660379644056391, + "language_loss": 0.81972474, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.89656347, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10626221, + "step": 10075, + "time_per_iteration": 2.514631509780884 + }, + { + "auxiliary_loss_clip": 0.06414524, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06272351, + "balance_loss_mlp": 0.01255526, + "epoch": 0.6058018938824591, + "flos": 27751464852480.0, + "grad_norm": 1.6456827746682687, + "language_loss": 0.78334481, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.86015224, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10699463, + "step": 10076, + "time_per_iteration": 2.5620245933532715 + }, + { + "auxiliary_loss_clip": 0.06419735, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01252994, + "epoch": 0.605862017135127, + "flos": 22789912239360.0, + "grad_norm": 1.939163307933087, + "language_loss": 0.72597015, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.80280852, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11102295, + "step": 10077, + "time_per_iteration": 2.5249850749969482 + }, + { + "auxiliary_loss_clip": 0.06421016, + "auxiliary_loss_mlp": 0.0126711, + "balance_loss_clip": 0.06278025, + "balance_loss_mlp": 0.01256155, + "epoch": 0.605922140387795, + "flos": 21221375333760.0, + "grad_norm": 1.5785416430125656, + "language_loss": 0.55953008, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.63641137, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10961914, + "step": 10078, + "time_per_iteration": 2.5278408527374268 + }, + { + "auxiliary_loss_clip": 0.06424035, + "auxiliary_loss_mlp": 0.01271223, + "balance_loss_clip": 0.06278145, + "balance_loss_mlp": 0.01259911, + "epoch": 0.6059822636404629, + "flos": 27275191845120.0, + "grad_norm": 1.4527216797355516, + "language_loss": 0.70788896, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.78484154, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.11322021, + "step": 10079, + "time_per_iteration": 2.5871152877807617 + }, + { + "auxiliary_loss_clip": 0.06417212, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06276966, + "balance_loss_mlp": 0.01257991, + "epoch": 0.606042386893131, + "flos": 20637599137920.0, + "grad_norm": 1.8315516840845918, + "language_loss": 0.63098562, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.70784402, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10638428, + "step": 10080, + "time_per_iteration": 2.491398334503174 + }, + { + "auxiliary_loss_clip": 0.06417031, + "auxiliary_loss_mlp": 0.01266608, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01255289, + "epoch": 0.6061025101457989, + "flos": 23008859758080.0, + "grad_norm": 1.6961363468706865, + "language_loss": 0.71255064, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.78938705, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11322021, + "step": 10081, + "time_per_iteration": 2.512700080871582 + }, + { + "auxiliary_loss_clip": 0.06420416, + "auxiliary_loss_mlp": 0.01269117, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01258406, + "epoch": 0.6061626333984669, + "flos": 29906796700800.0, + "grad_norm": 1.5910736573937334, + "language_loss": 0.69392467, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.77082002, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10717773, + "step": 10082, + "time_per_iteration": 2.5597543716430664 + }, + { + "auxiliary_loss_clip": 0.06418272, + "auxiliary_loss_mlp": 0.01266999, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01256514, + "epoch": 0.6062227566511348, + "flos": 25016130241920.0, + "grad_norm": 1.2876460924932913, + "language_loss": 0.66258222, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.7394349, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1048584, + "step": 10083, + "time_per_iteration": 4.032879114151001 + }, + { + "auxiliary_loss_clip": 0.06418855, + "auxiliary_loss_mlp": 0.01266697, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01256147, + "epoch": 0.6062828799038028, + "flos": 19470046746240.0, + "grad_norm": 1.984600644426631, + "language_loss": 0.74219275, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.81904829, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10546875, + "step": 10084, + "time_per_iteration": 2.549463987350464 + }, + { + "auxiliary_loss_clip": 0.0641944, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06277829, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6063430031564707, + "flos": 13594661504640.0, + "grad_norm": 2.649456512280636, + "language_loss": 0.72717726, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.80401981, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10516357, + "step": 10085, + "time_per_iteration": 2.569584846496582 + }, + { + "auxiliary_loss_clip": 0.06415457, + "auxiliary_loss_mlp": 0.01267297, + "balance_loss_clip": 0.06275511, + "balance_loss_mlp": 0.01256771, + "epoch": 0.6064031264091387, + "flos": 23261740980480.0, + "grad_norm": 2.0482376544916057, + "language_loss": 0.76309711, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.83992463, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10522461, + "step": 10086, + "time_per_iteration": 2.5559799671173096 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 0.06273226, + "balance_loss_mlp": 0.01255231, + "epoch": 0.6064632496618068, + "flos": 22465515957120.0, + "grad_norm": 1.2564833731282572, + "language_loss": 0.72978222, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.80654591, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10070801, + "step": 10087, + "time_per_iteration": 2.5399293899536133 + }, + { + "auxiliary_loss_clip": 0.06412689, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01259555, + "epoch": 0.6065233729144747, + "flos": 25125604001280.0, + "grad_norm": 1.521602814132933, + "language_loss": 0.83829105, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.91511416, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10058594, + "step": 10088, + "time_per_iteration": 2.5622670650482178 + }, + { + "auxiliary_loss_clip": 0.06411251, + "auxiliary_loss_mlp": 0.01268104, + "balance_loss_clip": 0.06272328, + "balance_loss_mlp": 0.01257709, + "epoch": 0.6065834961671427, + "flos": 23484126516480.0, + "grad_norm": 1.9713789944159437, + "language_loss": 0.71166384, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.78845739, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10400391, + "step": 10089, + "time_per_iteration": 2.516352891921997 + }, + { + "auxiliary_loss_clip": 0.06418794, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 0.06277877, + "balance_loss_mlp": 0.01255835, + "epoch": 0.6066436194198106, + "flos": 17025090860160.0, + "grad_norm": 1.830033701594393, + "language_loss": 0.82651365, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.90336132, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10137939, + "step": 10090, + "time_per_iteration": 2.5144259929656982 + }, + { + "auxiliary_loss_clip": 0.06427157, + "auxiliary_loss_mlp": 0.01267358, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01255848, + "epoch": 0.6067037426724786, + "flos": 18520603332480.0, + "grad_norm": 2.204687443594168, + "language_loss": 0.76034927, + "learning_rate": 1.4145758826341e-06, + "loss": 0.83729446, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.11505127, + "step": 10091, + "time_per_iteration": 2.4818389415740967 + }, + { + "auxiliary_loss_clip": 0.06416716, + "auxiliary_loss_mlp": 0.01268883, + "balance_loss_clip": 0.06278287, + "balance_loss_mlp": 0.01258041, + "epoch": 0.6067638659251465, + "flos": 22352520326400.0, + "grad_norm": 1.3588116701946646, + "language_loss": 0.7976529, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.87450886, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10858154, + "step": 10092, + "time_per_iteration": 4.102951765060425 + }, + { + "auxiliary_loss_clip": 0.06413257, + "auxiliary_loss_mlp": 0.01264393, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01253623, + "epoch": 0.6068239891778145, + "flos": 12454669906560.0, + "grad_norm": 1.7580568445861304, + "language_loss": 0.76897407, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.84575057, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10778809, + "step": 10093, + "time_per_iteration": 2.5497262477874756 + }, + { + "auxiliary_loss_clip": 0.06417312, + "auxiliary_loss_mlp": 0.01264272, + "balance_loss_clip": 0.06280127, + "balance_loss_mlp": 0.01254396, + "epoch": 0.6068841124304825, + "flos": 23192657688960.0, + "grad_norm": 1.756366452209319, + "language_loss": 0.87924957, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.95606542, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09887695, + "step": 10094, + "time_per_iteration": 2.5853447914123535 + }, + { + "auxiliary_loss_clip": 0.06414801, + "auxiliary_loss_mlp": 0.01269704, + "balance_loss_clip": 0.06274891, + "balance_loss_mlp": 0.01258891, + "epoch": 0.6069442356831505, + "flos": 18593795473920.0, + "grad_norm": 1.6037560799373654, + "language_loss": 0.72400463, + "learning_rate": 1.413086446353919e-06, + "loss": 0.80084968, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1081543, + "step": 10095, + "time_per_iteration": 2.522684335708618 + }, + { + "auxiliary_loss_clip": 0.06416344, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06275313, + "balance_loss_mlp": 0.01255202, + "epoch": 0.6070043589358184, + "flos": 20966775102720.0, + "grad_norm": 1.6943237110311855, + "language_loss": 0.76768452, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.8445034, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10333252, + "step": 10096, + "time_per_iteration": 3.974635362625122 + }, + { + "auxiliary_loss_clip": 0.06419063, + "auxiliary_loss_mlp": 0.01267681, + "balance_loss_clip": 0.06276296, + "balance_loss_mlp": 0.01257018, + "epoch": 0.6070644821884864, + "flos": 11697242123520.0, + "grad_norm": 1.6709554759687573, + "language_loss": 0.80418944, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.8810569, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10668945, + "step": 10097, + "time_per_iteration": 2.5277743339538574 + }, + { + "auxiliary_loss_clip": 0.06411067, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.0627345, + "balance_loss_mlp": 0.01256888, + "epoch": 0.6071246054411543, + "flos": 19315402836480.0, + "grad_norm": 1.4624120271510725, + "language_loss": 0.6741221, + "learning_rate": 1.411969602780478e-06, + "loss": 0.75090361, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10198975, + "step": 10098, + "time_per_iteration": 2.476284980773926 + }, + { + "auxiliary_loss_clip": 0.06410795, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06272739, + "balance_loss_mlp": 0.01257695, + "epoch": 0.6071847286938223, + "flos": 17754832068480.0, + "grad_norm": 1.6528826990411218, + "language_loss": 0.80661249, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.8833968, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.0994873, + "step": 10099, + "time_per_iteration": 2.5101730823516846 + }, + { + "auxiliary_loss_clip": 0.06419415, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01256382, + "epoch": 0.6072448519464904, + "flos": 22644031080960.0, + "grad_norm": 1.7660509562429656, + "language_loss": 0.71092284, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.78779513, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11437988, + "step": 10100, + "time_per_iteration": 2.5284388065338135 + }, + { + "auxiliary_loss_clip": 0.06417382, + "auxiliary_loss_mlp": 0.012671, + "balance_loss_clip": 0.06275873, + "balance_loss_mlp": 0.01255072, + "epoch": 0.6073049751991583, + "flos": 19543490449920.0, + "grad_norm": 2.5847426043420807, + "language_loss": 0.71003377, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.78687859, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.12030029, + "step": 10101, + "time_per_iteration": 2.5114076137542725 + }, + { + "auxiliary_loss_clip": 0.06414101, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01254534, + "epoch": 0.6073650984518263, + "flos": 28301936250240.0, + "grad_norm": 1.5889760307817664, + "language_loss": 0.69726598, + "learning_rate": 1.410480790256154e-06, + "loss": 0.77405149, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09924316, + "step": 10102, + "time_per_iteration": 4.067505836486816 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06273274, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6074252217044942, + "flos": 25671211862400.0, + "grad_norm": 1.7072302673605428, + "language_loss": 0.73599881, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.81281507, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10394287, + "step": 10103, + "time_per_iteration": 2.5059690475463867 + }, + { + "auxiliary_loss_clip": 0.06429945, + "auxiliary_loss_mlp": 0.01270767, + "balance_loss_clip": 0.06280673, + "balance_loss_mlp": 0.01259215, + "epoch": 0.6074853449571622, + "flos": 22863775213440.0, + "grad_norm": 2.6623380378388943, + "language_loss": 0.76573825, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.84274542, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.11560059, + "step": 10104, + "time_per_iteration": 2.5691661834716797 + }, + { + "auxiliary_loss_clip": 0.06324141, + "auxiliary_loss_mlp": 0.01255914, + "balance_loss_clip": 0.0626532, + "balance_loss_mlp": 0.01253873, + "epoch": 0.6075454682098301, + "flos": 67131088536960.0, + "grad_norm": 0.6977033795055727, + "language_loss": 0.55382067, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.62962115, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.02041626, + "step": 10105, + "time_per_iteration": 3.1780333518981934 + }, + { + "auxiliary_loss_clip": 0.06325028, + "auxiliary_loss_mlp": 0.0125398, + "balance_loss_clip": 0.06266589, + "balance_loss_mlp": 0.01252049, + "epoch": 0.6076055914624982, + "flos": 70730389797120.0, + "grad_norm": 1.0472762602622778, + "language_loss": 0.5682922, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.64408225, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01928711, + "step": 10106, + "time_per_iteration": 3.1282505989074707 + }, + { + "auxiliary_loss_clip": 0.06414115, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06276634, + "balance_loss_mlp": 0.01256042, + "epoch": 0.6076657147151661, + "flos": 28371816155520.0, + "grad_norm": 1.4629042426300594, + "language_loss": 0.69019145, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.76699257, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09954834, + "step": 10107, + "time_per_iteration": 2.6175951957702637 + }, + { + "auxiliary_loss_clip": 0.0642143, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_clip": 0.06277055, + "balance_loss_mlp": 0.01255297, + "epoch": 0.6077258379678341, + "flos": 15055234024320.0, + "grad_norm": 1.7550359653422893, + "language_loss": 0.80674279, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.88361514, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.1050415, + "step": 10108, + "time_per_iteration": 2.482895851135254 + }, + { + "auxiliary_loss_clip": 0.06424679, + "auxiliary_loss_mlp": 0.01267352, + "balance_loss_clip": 0.06279299, + "balance_loss_mlp": 0.01256223, + "epoch": 0.607785961220502, + "flos": 36174948756480.0, + "grad_norm": 1.6080944832957944, + "language_loss": 0.71795905, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.79487944, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11138916, + "step": 10109, + "time_per_iteration": 2.6855504512786865 + }, + { + "auxiliary_loss_clip": 0.06412528, + "auxiliary_loss_mlp": 0.01268721, + "balance_loss_clip": 0.06276727, + "balance_loss_mlp": 0.01259119, + "epoch": 0.60784608447317, + "flos": 22530113055360.0, + "grad_norm": 1.591486225286121, + "language_loss": 0.80463254, + "learning_rate": 1.407504239132653e-06, + "loss": 0.88144499, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09613037, + "step": 10110, + "time_per_iteration": 2.4970977306365967 + }, + { + "auxiliary_loss_clip": 0.06416238, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258052, + "epoch": 0.6079062077258379, + "flos": 23847823163520.0, + "grad_norm": 17.062743331014456, + "language_loss": 0.7053231, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.78217256, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10656738, + "step": 10111, + "time_per_iteration": 2.5446176528930664 + }, + { + "auxiliary_loss_clip": 0.0641928, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.06276086, + "balance_loss_mlp": 0.01255631, + "epoch": 0.6079663309785059, + "flos": 23373646508160.0, + "grad_norm": 1.767884967540518, + "language_loss": 0.64890563, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.72577429, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11950684, + "step": 10112, + "time_per_iteration": 2.5041110515594482 + }, + { + "auxiliary_loss_clip": 0.06319214, + "auxiliary_loss_mlp": 0.01254153, + "balance_loss_clip": 0.0626073, + "balance_loss_mlp": 0.01252635, + "epoch": 0.6080264542311739, + "flos": 71403709680000.0, + "grad_norm": 0.6188727131541597, + "language_loss": 0.49428421, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.57001793, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01517487, + "step": 10113, + "time_per_iteration": 3.2030844688415527 + }, + { + "auxiliary_loss_clip": 0.06322706, + "auxiliary_loss_mlp": 0.01253815, + "balance_loss_clip": 0.06264073, + "balance_loss_mlp": 0.01252375, + "epoch": 0.6080865774838419, + "flos": 66549786036480.0, + "grad_norm": 0.826261074954681, + "language_loss": 0.57000625, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.64577138, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01438141, + "step": 10114, + "time_per_iteration": 3.0561811923980713 + }, + { + "auxiliary_loss_clip": 0.06416565, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.0627362, + "balance_loss_mlp": 0.01255528, + "epoch": 0.6081467007365099, + "flos": 19213895214720.0, + "grad_norm": 2.9429969583310744, + "language_loss": 0.70665103, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.7834866, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11456299, + "step": 10115, + "time_per_iteration": 2.536123037338257 + }, + { + "auxiliary_loss_clip": 0.06416753, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06275412, + "balance_loss_mlp": 0.01254128, + "epoch": 0.6082068239891778, + "flos": 24174148089600.0, + "grad_norm": 2.2262194131188617, + "language_loss": 0.72516567, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.80198407, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10961914, + "step": 10116, + "time_per_iteration": 2.5744457244873047 + }, + { + "auxiliary_loss_clip": 0.06418931, + "auxiliary_loss_mlp": 0.0126628, + "balance_loss_clip": 0.06275393, + "balance_loss_mlp": 0.01254562, + "epoch": 0.6082669472418458, + "flos": 37422150053760.0, + "grad_norm": 1.8492666967546532, + "language_loss": 0.54224104, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.61909318, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1171875, + "step": 10117, + "time_per_iteration": 2.7010717391967773 + }, + { + "auxiliary_loss_clip": 0.06415669, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06273679, + "balance_loss_mlp": 0.01252431, + "epoch": 0.6083270704945137, + "flos": 15090886736640.0, + "grad_norm": 1.6926126638400165, + "language_loss": 0.70553619, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.78231865, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1015625, + "step": 10118, + "time_per_iteration": 2.5118987560272217 + }, + { + "auxiliary_loss_clip": 0.0641689, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06276117, + "balance_loss_mlp": 0.01255857, + "epoch": 0.6083871937471818, + "flos": 20674845077760.0, + "grad_norm": 1.454621938136119, + "language_loss": 0.75087917, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.82770652, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09979248, + "step": 10119, + "time_per_iteration": 2.5343713760375977 + }, + { + "auxiliary_loss_clip": 0.06418591, + "auxiliary_loss_mlp": 0.01266372, + "balance_loss_clip": 0.06277768, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6084473169998497, + "flos": 21513305358720.0, + "grad_norm": 1.7245965425427678, + "language_loss": 0.67339104, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.75024068, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10784912, + "step": 10120, + "time_per_iteration": 2.4992902278900146 + }, + { + "auxiliary_loss_clip": 0.06424947, + "auxiliary_loss_mlp": 0.01267829, + "balance_loss_clip": 0.06279485, + "balance_loss_mlp": 0.0125673, + "epoch": 0.6085074402525177, + "flos": 26877309932160.0, + "grad_norm": 1.7168671771406325, + "language_loss": 0.74690855, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.82383633, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.11096191, + "step": 10121, + "time_per_iteration": 2.552943468093872 + }, + { + "auxiliary_loss_clip": 0.06415446, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06275289, + "balance_loss_mlp": 0.01254844, + "epoch": 0.6085675635051856, + "flos": 10894518408960.0, + "grad_norm": 1.695682661500106, + "language_loss": 0.80907005, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.88587236, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09936523, + "step": 10122, + "time_per_iteration": 3.890413522720337 + }, + { + "auxiliary_loss_clip": 0.06419112, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 0.06278858, + "balance_loss_mlp": 0.01254483, + "epoch": 0.6086276867578536, + "flos": 34871074571520.0, + "grad_norm": 1.4621063194109842, + "language_loss": 0.55791676, + "learning_rate": 1.402670413578284e-06, + "loss": 0.63475281, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10015869, + "step": 10123, + "time_per_iteration": 2.6325483322143555 + }, + { + "auxiliary_loss_clip": 0.06419839, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06281708, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6086878100105215, + "flos": 20053906796160.0, + "grad_norm": 1.6808318536129285, + "language_loss": 0.74430656, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.82115179, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11114502, + "step": 10124, + "time_per_iteration": 2.5358493328094482 + }, + { + "auxiliary_loss_clip": 0.06421429, + "auxiliary_loss_mlp": 0.01269718, + "balance_loss_clip": 0.06278759, + "balance_loss_mlp": 0.01258393, + "epoch": 0.6087479332631895, + "flos": 18338314775040.0, + "grad_norm": 11.543954575524463, + "language_loss": 0.65884316, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.73575461, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11322021, + "step": 10125, + "time_per_iteration": 2.4864342212677 + }, + { + "auxiliary_loss_clip": 0.06421918, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06282578, + "balance_loss_mlp": 0.01253841, + "epoch": 0.6088080565158575, + "flos": 24499424839680.0, + "grad_norm": 2.2712886028305, + "language_loss": 0.76395416, + "learning_rate": 1.40155545786479e-06, + "loss": 0.84081715, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10552979, + "step": 10126, + "time_per_iteration": 2.5664777755737305 + }, + { + "auxiliary_loss_clip": 0.06427297, + "auxiliary_loss_mlp": 0.01266279, + "balance_loss_clip": 0.06280977, + "balance_loss_mlp": 0.0125524, + "epoch": 0.6088681797685255, + "flos": 10273496273280.0, + "grad_norm": 5.11214091408941, + "language_loss": 0.71820217, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.79513788, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1104126, + "step": 10127, + "time_per_iteration": 2.478034257888794 + }, + { + "auxiliary_loss_clip": 0.06430127, + "auxiliary_loss_mlp": 0.01266951, + "balance_loss_clip": 0.06284942, + "balance_loss_mlp": 0.01255465, + "epoch": 0.6089283030211935, + "flos": 21978928897920.0, + "grad_norm": 2.2629720759221996, + "language_loss": 0.72788715, + "learning_rate": 1.400812267497691e-06, + "loss": 0.80485797, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11486816, + "step": 10128, + "time_per_iteration": 2.553764820098877 + }, + { + "auxiliary_loss_clip": 0.06422316, + "auxiliary_loss_mlp": 0.0126747, + "balance_loss_clip": 0.06282373, + "balance_loss_mlp": 0.01257355, + "epoch": 0.6089884262738614, + "flos": 17790945978240.0, + "grad_norm": 1.9776728101481476, + "language_loss": 0.7314598, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.8083576, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10119629, + "step": 10129, + "time_per_iteration": 2.4939491748809814 + }, + { + "auxiliary_loss_clip": 0.06421769, + "auxiliary_loss_mlp": 0.01271284, + "balance_loss_clip": 0.06280705, + "balance_loss_mlp": 0.01260764, + "epoch": 0.6090485495265294, + "flos": 36920496458880.0, + "grad_norm": 1.3316519758914749, + "language_loss": 0.65839994, + "learning_rate": 1.400069168015626e-06, + "loss": 0.73533046, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 10130, + "time_per_iteration": 2.7194180488586426 + }, + { + "auxiliary_loss_clip": 0.0641261, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06274526, + "balance_loss_mlp": 0.01254926, + "epoch": 0.6091086727791973, + "flos": 19904755328640.0, + "grad_norm": 1.5918133317154841, + "language_loss": 0.77794468, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.85471684, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09680176, + "step": 10131, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.0641945, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06279676, + "balance_loss_mlp": 0.01255071, + "epoch": 0.6091687960318654, + "flos": 22170147914880.0, + "grad_norm": 1.8790929127191944, + "language_loss": 0.77705514, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.85390049, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10003662, + "step": 10132, + "time_per_iteration": 3.9999635219573975 + }, + { + "auxiliary_loss_clip": 0.06414825, + "auxiliary_loss_mlp": 0.01267619, + "balance_loss_clip": 0.06278821, + "balance_loss_mlp": 0.01257618, + "epoch": 0.6092289192845333, + "flos": 21470818538880.0, + "grad_norm": 2.2139477747978136, + "language_loss": 0.75865889, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.83548331, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10003662, + "step": 10133, + "time_per_iteration": 2.545747756958008 + }, + { + "auxiliary_loss_clip": 0.06417366, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01255973, + "epoch": 0.6092890425372013, + "flos": 28702585347840.0, + "grad_norm": 1.8044338362434222, + "language_loss": 0.64228314, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.71912241, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10583496, + "step": 10134, + "time_per_iteration": 2.563861131668091 + }, + { + "auxiliary_loss_clip": 0.06424356, + "auxiliary_loss_mlp": 0.01263619, + "balance_loss_clip": 0.06285493, + "balance_loss_mlp": 0.01253331, + "epoch": 0.6093491657898692, + "flos": 20819384570880.0, + "grad_norm": 1.7758601490441968, + "language_loss": 0.78973985, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.86661959, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10290527, + "step": 10135, + "time_per_iteration": 2.553738832473755 + }, + { + "auxiliary_loss_clip": 0.06420235, + "auxiliary_loss_mlp": 0.01266173, + "balance_loss_clip": 0.06279118, + "balance_loss_mlp": 0.01256416, + "epoch": 0.6094092890425372, + "flos": 25453983352320.0, + "grad_norm": 1.626137919034545, + "language_loss": 0.72278392, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.79964805, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.09753418, + "step": 10136, + "time_per_iteration": 4.003901958465576 + }, + { + "auxiliary_loss_clip": 0.06420286, + "auxiliary_loss_mlp": 0.01265077, + "balance_loss_clip": 0.06279141, + "balance_loss_mlp": 0.0125464, + "epoch": 0.6094694122952051, + "flos": 35629089333120.0, + "grad_norm": 1.6356074117681172, + "language_loss": 0.74919081, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.82604444, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10443115, + "step": 10137, + "time_per_iteration": 2.634158134460449 + }, + { + "auxiliary_loss_clip": 0.06417631, + "auxiliary_loss_mlp": 0.01266963, + "balance_loss_clip": 0.06275456, + "balance_loss_mlp": 0.01256246, + "epoch": 0.6095295355478731, + "flos": 24462975513600.0, + "grad_norm": 2.0845106182551163, + "language_loss": 0.80188054, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.87872648, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10717773, + "step": 10138, + "time_per_iteration": 2.5884156227111816 + }, + { + "auxiliary_loss_clip": 0.06410988, + "auxiliary_loss_mlp": 0.01265559, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255354, + "epoch": 0.6095896588005411, + "flos": 15638716730880.0, + "grad_norm": 1.5018300865324132, + "language_loss": 0.81360239, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.89036787, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10205078, + "step": 10139, + "time_per_iteration": 2.4757158756256104 + }, + { + "auxiliary_loss_clip": 0.06419017, + "auxiliary_loss_mlp": 0.01267763, + "balance_loss_clip": 0.06276064, + "balance_loss_mlp": 0.01255895, + "epoch": 0.6096497820532091, + "flos": 15554455850880.0, + "grad_norm": 1.944047007891517, + "language_loss": 0.83626902, + "learning_rate": 1.396355037825315e-06, + "loss": 0.91313678, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.11865234, + "step": 10140, + "time_per_iteration": 2.5361695289611816 + }, + { + "auxiliary_loss_clip": 0.06419208, + "auxiliary_loss_mlp": 0.0126965, + "balance_loss_clip": 0.06277294, + "balance_loss_mlp": 0.01258718, + "epoch": 0.6097099053058771, + "flos": 24210932832000.0, + "grad_norm": 1.8133263657959964, + "language_loss": 0.75536144, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.83225, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10925293, + "step": 10141, + "time_per_iteration": 3.9623372554779053 + }, + { + "auxiliary_loss_clip": 0.06413428, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 0.06275016, + "balance_loss_mlp": 0.01253358, + "epoch": 0.609770028558545, + "flos": 19575830926080.0, + "grad_norm": 2.621888589140599, + "language_loss": 0.76574522, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.842511, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09790039, + "step": 10142, + "time_per_iteration": 2.5719213485717773 + }, + { + "auxiliary_loss_clip": 0.06415378, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01255979, + "epoch": 0.609830151811213, + "flos": 23955619841280.0, + "grad_norm": 1.612746865863279, + "language_loss": 0.76346582, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.84028077, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10150146, + "step": 10143, + "time_per_iteration": 2.529778242111206 + }, + { + "auxiliary_loss_clip": 0.06417874, + "auxiliary_loss_mlp": 0.01264047, + "balance_loss_clip": 0.06277366, + "balance_loss_mlp": 0.01253467, + "epoch": 0.6098902750638809, + "flos": 16185205059840.0, + "grad_norm": 2.5594432881750104, + "language_loss": 0.7530098, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.82982898, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.105896, + "step": 10144, + "time_per_iteration": 2.526620864868164 + }, + { + "auxiliary_loss_clip": 0.06420074, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 0.06276617, + "balance_loss_mlp": 0.01253634, + "epoch": 0.609950398316549, + "flos": 44536141549440.0, + "grad_norm": 2.1298130564389224, + "language_loss": 0.73869997, + "learning_rate": 1.394498830235383e-06, + "loss": 0.81554472, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.10772705, + "step": 10145, + "time_per_iteration": 2.7241427898406982 + }, + { + "auxiliary_loss_clip": 0.06415195, + "auxiliary_loss_mlp": 0.01263159, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252156, + "epoch": 0.6100105215692169, + "flos": 23228436182400.0, + "grad_norm": 1.5962491809481525, + "language_loss": 0.69665307, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.77343661, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11004639, + "step": 10146, + "time_per_iteration": 2.557990312576294 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06276412, + "balance_loss_mlp": 0.01254865, + "epoch": 0.6100706448218849, + "flos": 15017904230400.0, + "grad_norm": 1.5284940617625797, + "language_loss": 0.76506376, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.84183586, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09936523, + "step": 10147, + "time_per_iteration": 2.5613648891448975 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01262446, + "balance_loss_clip": 0.0627313, + "balance_loss_mlp": 0.01252153, + "epoch": 0.6101307680745528, + "flos": 19645039998720.0, + "grad_norm": 1.6729040728987632, + "language_loss": 0.78694391, + "learning_rate": 1.393385381096786e-06, + "loss": 0.86368936, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10296631, + "step": 10148, + "time_per_iteration": 2.5073816776275635 + }, + { + "auxiliary_loss_clip": 0.06424719, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06278485, + "balance_loss_mlp": 0.01253672, + "epoch": 0.6101908913272208, + "flos": 29943455662080.0, + "grad_norm": 11.644498336945409, + "language_loss": 0.53887326, + "learning_rate": 1.39301427737093e-06, + "loss": 0.61577505, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.11779785, + "step": 10149, + "time_per_iteration": 2.579378843307495 + }, + { + "auxiliary_loss_clip": 0.0641048, + "auxiliary_loss_mlp": 0.01264861, + "balance_loss_clip": 0.06277239, + "balance_loss_mlp": 0.0125511, + "epoch": 0.6102510145798887, + "flos": 21805067675520.0, + "grad_norm": 1.6674264382808133, + "language_loss": 0.80347526, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.8802287, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09753418, + "step": 10150, + "time_per_iteration": 2.542039394378662 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 0.06277014, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6103111378325567, + "flos": 20712719923200.0, + "grad_norm": 1.6063484518637994, + "language_loss": 0.69615412, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.77300549, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11529541, + "step": 10151, + "time_per_iteration": 2.5254616737365723 + }, + { + "auxiliary_loss_clip": 0.06415872, + "auxiliary_loss_mlp": 0.01264029, + "balance_loss_clip": 0.06276833, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6103712610852247, + "flos": 29388330362880.0, + "grad_norm": 1.5395706469140102, + "language_loss": 0.71042097, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.78722, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.0980835, + "step": 10152, + "time_per_iteration": 2.565767288208008 + }, + { + "auxiliary_loss_clip": 0.06416918, + "auxiliary_loss_mlp": 0.01268582, + "balance_loss_clip": 0.06275494, + "balance_loss_mlp": 0.01257883, + "epoch": 0.6104313843378927, + "flos": 20819216862720.0, + "grad_norm": 1.604020409534104, + "language_loss": 0.78784543, + "learning_rate": 1.391530092777811e-06, + "loss": 0.86470044, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10699463, + "step": 10153, + "time_per_iteration": 2.5230531692504883 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06273308, + "balance_loss_mlp": 0.01258873, + "epoch": 0.6104915075905607, + "flos": 26585715323520.0, + "grad_norm": 1.630222855772095, + "language_loss": 0.79992545, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8767544, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09906006, + "step": 10154, + "time_per_iteration": 2.5763237476348877 + }, + { + "auxiliary_loss_clip": 0.06417637, + "auxiliary_loss_mlp": 0.01269392, + "balance_loss_clip": 0.06279704, + "balance_loss_mlp": 0.01258937, + "epoch": 0.6105516308432286, + "flos": 23922734313600.0, + "grad_norm": 1.4598935838539129, + "language_loss": 0.70770371, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.78457403, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10455322, + "step": 10155, + "time_per_iteration": 2.5680413246154785 + }, + { + "auxiliary_loss_clip": 0.06418546, + "auxiliary_loss_mlp": 0.01266443, + "balance_loss_clip": 0.06278499, + "balance_loss_mlp": 0.0125569, + "epoch": 0.6106117540958966, + "flos": 31585520125440.0, + "grad_norm": 1.5387182092943745, + "language_loss": 0.71842468, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.79527456, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10742188, + "step": 10156, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.06412362, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06277083, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6106718773485645, + "flos": 19613999260800.0, + "grad_norm": 1.3880208824071523, + "language_loss": 0.67516112, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.75195158, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11254883, + "step": 10157, + "time_per_iteration": 2.533141613006592 + }, + { + "auxiliary_loss_clip": 0.06414488, + "auxiliary_loss_mlp": 0.01264295, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01253888, + "epoch": 0.6107320006012326, + "flos": 17128778688000.0, + "grad_norm": 1.7065905103759618, + "language_loss": 0.72894049, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.80572832, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10400391, + "step": 10158, + "time_per_iteration": 2.4852585792541504 + }, + { + "auxiliary_loss_clip": 0.06417953, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01257917, + "epoch": 0.6107921238539005, + "flos": 30155107875840.0, + "grad_norm": 1.7026117107079757, + "language_loss": 0.69434297, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7712034, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1015625, + "step": 10159, + "time_per_iteration": 2.6481263637542725 + }, + { + "auxiliary_loss_clip": 0.06416903, + "auxiliary_loss_mlp": 0.01266619, + "balance_loss_clip": 0.06276091, + "balance_loss_mlp": 0.01255747, + "epoch": 0.6108522471065685, + "flos": 18445859890560.0, + "grad_norm": 1.7469967655501557, + "language_loss": 0.79027724, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.86711246, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10876465, + "step": 10160, + "time_per_iteration": 2.5056142807006836 + }, + { + "auxiliary_loss_clip": 0.06325343, + "auxiliary_loss_mlp": 0.01260291, + "balance_loss_clip": 0.06266694, + "balance_loss_mlp": 0.01258597, + "epoch": 0.6109123703592364, + "flos": 64157295605760.0, + "grad_norm": 0.797024648042973, + "language_loss": 0.61520749, + "learning_rate": 1.388562832007295e-06, + "loss": 0.69106382, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01698303, + "step": 10161, + "time_per_iteration": 3.325639486312866 + }, + { + "auxiliary_loss_clip": 0.06418448, + "auxiliary_loss_mlp": 0.01268382, + "balance_loss_clip": 0.06276111, + "balance_loss_mlp": 0.01257099, + "epoch": 0.6109724936119044, + "flos": 20674132318080.0, + "grad_norm": 2.3454759388543316, + "language_loss": 0.76444739, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.84131569, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.112854, + "step": 10162, + "time_per_iteration": 4.040041446685791 + }, + { + "auxiliary_loss_clip": 0.06414326, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253669, + "epoch": 0.6110326168645723, + "flos": 31358899958400.0, + "grad_norm": 1.528039199186958, + "language_loss": 0.71962601, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.79641795, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11218262, + "step": 10163, + "time_per_iteration": 2.5920441150665283 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01267118, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01257903, + "epoch": 0.6110927401172404, + "flos": 25009338061440.0, + "grad_norm": 1.7630876229655692, + "language_loss": 0.60071069, + "learning_rate": 1.387450491396625e-06, + "loss": 0.67747843, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09210205, + "step": 10164, + "time_per_iteration": 2.559441328048706 + }, + { + "auxiliary_loss_clip": 0.06414106, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.0125975, + "epoch": 0.6111528633699083, + "flos": 26254946131200.0, + "grad_norm": 1.466434652755145, + "language_loss": 0.75936824, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.83620799, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10119629, + "step": 10165, + "time_per_iteration": 2.521923542022705 + }, + { + "auxiliary_loss_clip": 0.0641854, + "auxiliary_loss_mlp": 0.01268441, + "balance_loss_clip": 0.06282263, + "balance_loss_mlp": 0.0125807, + "epoch": 0.6112129866225763, + "flos": 22389011579520.0, + "grad_norm": 1.518231620716018, + "language_loss": 0.79607749, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.87294728, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10375977, + "step": 10166, + "time_per_iteration": 2.5410702228546143 + }, + { + "auxiliary_loss_clip": 0.06416941, + "auxiliary_loss_mlp": 0.01265827, + "balance_loss_clip": 0.06276624, + "balance_loss_mlp": 0.01254949, + "epoch": 0.6112731098752443, + "flos": 25234826198400.0, + "grad_norm": 7.9003095632563385, + "language_loss": 0.68483454, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.76166224, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10876465, + "step": 10167, + "time_per_iteration": 2.5295464992523193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06275839, + "balance_loss_mlp": 0.01256586, + "epoch": 0.6113332331279122, + "flos": 22899763342080.0, + "grad_norm": 1.6873056368761516, + "language_loss": 0.7915386, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.86832243, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09381104, + "step": 10168, + "time_per_iteration": 2.53782320022583 + }, + { + "auxiliary_loss_clip": 0.06426514, + "auxiliary_loss_mlp": 0.0126727, + "balance_loss_clip": 0.06277908, + "balance_loss_mlp": 0.01254991, + "epoch": 0.6113933563805802, + "flos": 18625548971520.0, + "grad_norm": 2.2514835469058405, + "language_loss": 0.86128104, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.93821883, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12286377, + "step": 10169, + "time_per_iteration": 2.4681122303009033 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06275853, + "balance_loss_mlp": 0.01254871, + "epoch": 0.6114534796332481, + "flos": 41876137359360.0, + "grad_norm": 1.5861355547500362, + "language_loss": 0.79530609, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.87210482, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09661865, + "step": 10170, + "time_per_iteration": 2.707791566848755 + }, + { + "auxiliary_loss_clip": 0.06423808, + "auxiliary_loss_mlp": 0.01264285, + "balance_loss_clip": 0.06277203, + "balance_loss_mlp": 0.01252359, + "epoch": 0.6115136028859162, + "flos": 21914960705280.0, + "grad_norm": 2.240444553593937, + "language_loss": 0.6873374, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.76421833, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.1192627, + "step": 10171, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.06420024, + "auxiliary_loss_mlp": 0.01266591, + "balance_loss_clip": 0.06277289, + "balance_loss_mlp": 0.01254634, + "epoch": 0.6115737261385841, + "flos": 28812604158720.0, + "grad_norm": 6.231678075331036, + "language_loss": 0.79464412, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.87151027, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11962891, + "step": 10172, + "time_per_iteration": 4.057689666748047 + }, + { + "auxiliary_loss_clip": 0.06425016, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06279068, + "balance_loss_mlp": 0.01257222, + "epoch": 0.6116338493912521, + "flos": 21257824659840.0, + "grad_norm": 1.6337666078989976, + "language_loss": 0.67181307, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.74874651, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11108398, + "step": 10173, + "time_per_iteration": 2.5301437377929688 + }, + { + "auxiliary_loss_clip": 0.06418002, + "auxiliary_loss_mlp": 0.01270854, + "balance_loss_clip": 0.06275578, + "balance_loss_mlp": 0.01259261, + "epoch": 0.61169397264392, + "flos": 17535968403840.0, + "grad_norm": 1.769252328158937, + "language_loss": 0.56344169, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.64033026, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1159668, + "step": 10174, + "time_per_iteration": 2.530437707901001 + }, + { + "auxiliary_loss_clip": 0.064185, + "auxiliary_loss_mlp": 0.01267148, + "balance_loss_clip": 0.06277028, + "balance_loss_mlp": 0.01255931, + "epoch": 0.611754095896588, + "flos": 23958387025920.0, + "grad_norm": 1.6825013036462741, + "language_loss": 0.66233337, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.73918986, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11212158, + "step": 10175, + "time_per_iteration": 4.048693656921387 + }, + { + "auxiliary_loss_clip": 0.06415173, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01253019, + "epoch": 0.6118142191492559, + "flos": 26002064908800.0, + "grad_norm": 1.985962827753808, + "language_loss": 0.82859969, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.90538198, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10040283, + "step": 10176, + "time_per_iteration": 2.5558836460113525 + }, + { + "auxiliary_loss_clip": 0.06419128, + "auxiliary_loss_mlp": 0.01271507, + "balance_loss_clip": 0.06277899, + "balance_loss_mlp": 0.01259491, + "epoch": 0.611874342401924, + "flos": 24609275942400.0, + "grad_norm": 1.5904100346197647, + "language_loss": 0.77812099, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.85502738, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.12011719, + "step": 10177, + "time_per_iteration": 2.5346739292144775 + }, + { + "auxiliary_loss_clip": 0.06416818, + "auxiliary_loss_mlp": 0.01269124, + "balance_loss_clip": 0.06275249, + "balance_loss_mlp": 0.01257924, + "epoch": 0.6119344656545919, + "flos": 15892436494080.0, + "grad_norm": 2.6097925851891755, + "language_loss": 0.75949138, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.8363508, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10178, + "time_per_iteration": 2.524557113647461 + }, + { + "auxiliary_loss_clip": 0.06418636, + "auxiliary_loss_mlp": 0.01267998, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01256435, + "epoch": 0.6119945889072599, + "flos": 21659312298240.0, + "grad_norm": 1.5720284026291744, + "language_loss": 0.67318261, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.75004888, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11553955, + "step": 10179, + "time_per_iteration": 2.5297069549560547 + }, + { + "auxiliary_loss_clip": 0.06419764, + "auxiliary_loss_mlp": 0.01264087, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01253179, + "epoch": 0.6120547121599279, + "flos": 13777746675840.0, + "grad_norm": 1.9709040238374929, + "language_loss": 0.83888078, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.91571933, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10906982, + "step": 10180, + "time_per_iteration": 3.9827919006347656 + }, + { + "auxiliary_loss_clip": 0.06417181, + "auxiliary_loss_mlp": 0.01268448, + "balance_loss_clip": 0.06276719, + "balance_loss_mlp": 0.01256683, + "epoch": 0.6121148354125958, + "flos": 20084528263680.0, + "grad_norm": 1.549982980411044, + "language_loss": 0.77731764, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.8541739, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11761475, + "step": 10181, + "time_per_iteration": 2.4853463172912598 + }, + { + "auxiliary_loss_clip": 0.06420098, + "auxiliary_loss_mlp": 0.01269807, + "balance_loss_clip": 0.06277204, + "balance_loss_mlp": 0.01258565, + "epoch": 0.6121749586652638, + "flos": 13474915620480.0, + "grad_norm": 2.0089243925599973, + "language_loss": 0.8071022, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.88400126, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11248779, + "step": 10182, + "time_per_iteration": 2.4935574531555176 + }, + { + "auxiliary_loss_clip": 0.06411545, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.0627587, + "balance_loss_mlp": 0.0125805, + "epoch": 0.6122350819179317, + "flos": 20126721594240.0, + "grad_norm": 1.501667213386016, + "language_loss": 0.83102655, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.90781319, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09069824, + "step": 10183, + "time_per_iteration": 2.5836997032165527 + }, + { + "auxiliary_loss_clip": 0.06327992, + "auxiliary_loss_mlp": 0.01253825, + "balance_loss_clip": 0.06268366, + "balance_loss_mlp": 0.01252147, + "epoch": 0.6122952051705998, + "flos": 65448004700160.0, + "grad_norm": 0.7149962337899693, + "language_loss": 0.62764937, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.70346749, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.01681519, + "step": 10184, + "time_per_iteration": 3.3003170490264893 + }, + { + "auxiliary_loss_clip": 0.06420484, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06279504, + "balance_loss_mlp": 0.01253857, + "epoch": 0.6123553284232677, + "flos": 20382537709440.0, + "grad_norm": 1.6441224641064962, + "language_loss": 0.82408071, + "learning_rate": 1.379669981812101e-06, + "loss": 0.90092349, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.09936523, + "step": 10185, + "time_per_iteration": 2.5150225162506104 + }, + { + "auxiliary_loss_clip": 0.06425197, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06278922, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6124154516759357, + "flos": 23994417081600.0, + "grad_norm": 1.7366290964606979, + "language_loss": 0.75121021, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.82812846, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.11151123, + "step": 10186, + "time_per_iteration": 2.627387046813965 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.01262607, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252599, + "epoch": 0.6124755749286036, + "flos": 21474927388800.0, + "grad_norm": 1.4642741872217127, + "language_loss": 0.78637451, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.8631596, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10003662, + "step": 10187, + "time_per_iteration": 2.49202561378479 + }, + { + "auxiliary_loss_clip": 0.06414475, + "auxiliary_loss_mlp": 0.01265646, + "balance_loss_clip": 0.06274372, + "balance_loss_mlp": 0.0125472, + "epoch": 0.6125356981812716, + "flos": 23886117279360.0, + "grad_norm": 1.4743912854017487, + "language_loss": 0.83344066, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.91024196, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10925293, + "step": 10188, + "time_per_iteration": 2.555687427520752 + }, + { + "auxiliary_loss_clip": 0.06417944, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01255168, + "epoch": 0.6125958214339395, + "flos": 14430312673920.0, + "grad_norm": 1.6601752905069214, + "language_loss": 0.75527823, + "learning_rate": 1.378189152155896e-06, + "loss": 0.83211589, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10656738, + "step": 10189, + "time_per_iteration": 2.4994595050811768 + }, + { + "auxiliary_loss_clip": 0.06417951, + "auxiliary_loss_mlp": 0.01265327, + "balance_loss_clip": 0.06275356, + "balance_loss_mlp": 0.012543, + "epoch": 0.6126559446866076, + "flos": 23265933684480.0, + "grad_norm": 1.4192081343801892, + "language_loss": 0.74300897, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11016846, + "step": 10190, + "time_per_iteration": 2.6080024242401123 + }, + { + "auxiliary_loss_clip": 0.06418385, + "auxiliary_loss_mlp": 0.01266786, + "balance_loss_clip": 0.0627688, + "balance_loss_mlp": 0.01255044, + "epoch": 0.6127160679392755, + "flos": 26871188584320.0, + "grad_norm": 1.672928736412144, + "language_loss": 0.68484575, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.76169741, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11749268, + "step": 10191, + "time_per_iteration": 2.54805064201355 + }, + { + "auxiliary_loss_clip": 0.06419395, + "auxiliary_loss_mlp": 0.012717, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01259833, + "epoch": 0.6127761911919435, + "flos": 26403720255360.0, + "grad_norm": 1.7824154048725067, + "language_loss": 0.73771405, + "learning_rate": 1.377078777445467e-06, + "loss": 0.81462502, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11859131, + "step": 10192, + "time_per_iteration": 2.556392192840576 + }, + { + "auxiliary_loss_clip": 0.06413901, + "auxiliary_loss_mlp": 0.01263543, + "balance_loss_clip": 0.06275194, + "balance_loss_mlp": 0.01253225, + "epoch": 0.6128363144446115, + "flos": 22640802698880.0, + "grad_norm": 1.814520897334069, + "language_loss": 0.84227109, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.91904557, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10314941, + "step": 10193, + "time_per_iteration": 2.5000216960906982 + }, + { + "auxiliary_loss_clip": 0.06417094, + "auxiliary_loss_mlp": 0.01267497, + "balance_loss_clip": 0.06275633, + "balance_loss_mlp": 0.01256625, + "epoch": 0.6128964376972794, + "flos": 26766033310080.0, + "grad_norm": 2.0280898056271255, + "language_loss": 0.707515, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.78436089, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10870361, + "step": 10194, + "time_per_iteration": 2.5357043743133545 + }, + { + "auxiliary_loss_clip": 0.06330009, + "auxiliary_loss_mlp": 0.01254574, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01252429, + "epoch": 0.6129565609499474, + "flos": 65585500450560.0, + "grad_norm": 0.7963949843311754, + "language_loss": 0.58648682, + "learning_rate": 1.375968615326149e-06, + "loss": 0.66233265, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02146912, + "step": 10195, + "time_per_iteration": 2.935722589492798 + }, + { + "auxiliary_loss_clip": 0.06416507, + "auxiliary_loss_mlp": 0.01269514, + "balance_loss_clip": 0.06275862, + "balance_loss_mlp": 0.01257873, + "epoch": 0.6130166842026153, + "flos": 16367577471360.0, + "grad_norm": 1.8676293874241905, + "language_loss": 0.69944096, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.77630115, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11639404, + "step": 10196, + "time_per_iteration": 2.522855520248413 + }, + { + "auxiliary_loss_clip": 0.06413607, + "auxiliary_loss_mlp": 0.0126591, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01255795, + "epoch": 0.6130768074552834, + "flos": 23658029665920.0, + "grad_norm": 1.6623431982713033, + "language_loss": 0.7114116, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.78820676, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10113525, + "step": 10197, + "time_per_iteration": 2.576241970062256 + }, + { + "auxiliary_loss_clip": 0.06418445, + "auxiliary_loss_mlp": 0.01271491, + "balance_loss_clip": 0.06275209, + "balance_loss_mlp": 0.01260828, + "epoch": 0.6131369307079513, + "flos": 20053613306880.0, + "grad_norm": 1.7635400810353365, + "language_loss": 0.78912157, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.86602092, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10675049, + "step": 10198, + "time_per_iteration": 2.5441195964813232 + }, + { + "auxiliary_loss_clip": 0.06419414, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01255138, + "epoch": 0.6131970539606193, + "flos": 22678384055040.0, + "grad_norm": 1.422407986186852, + "language_loss": 0.74737686, + "learning_rate": 1.374488730519181e-06, + "loss": 0.82423472, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11236572, + "step": 10199, + "time_per_iteration": 2.567636251449585 + }, + { + "auxiliary_loss_clip": 0.06417924, + "auxiliary_loss_mlp": 0.01269269, + "balance_loss_clip": 0.06272729, + "balance_loss_mlp": 0.01257735, + "epoch": 0.6132571772132872, + "flos": 26878316181120.0, + "grad_norm": 1.5670545162327942, + "language_loss": 0.62008464, + "learning_rate": 1.374118818580993e-06, + "loss": 0.69695652, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11541748, + "step": 10200, + "time_per_iteration": 2.561591863632202 + }, + { + "auxiliary_loss_clip": 0.06416481, + "auxiliary_loss_mlp": 0.01270085, + "balance_loss_clip": 0.06275273, + "balance_loss_mlp": 0.0125944, + "epoch": 0.6133173004659552, + "flos": 22899176363520.0, + "grad_norm": 1.7093296118249273, + "language_loss": 0.69054127, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.76740688, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10644531, + "step": 10201, + "time_per_iteration": 3.9431076049804688 + }, + { + "auxiliary_loss_clip": 0.06409751, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01257462, + "epoch": 0.6133774237186231, + "flos": 20491298709120.0, + "grad_norm": 2.3821613548396368, + "language_loss": 0.83898175, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.91576207, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10809326, + "step": 10202, + "time_per_iteration": 2.496201276779175 + }, + { + "auxiliary_loss_clip": 0.06332828, + "auxiliary_loss_mlp": 0.01255453, + "balance_loss_clip": 0.06274157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6134375469712912, + "flos": 69433643208960.0, + "grad_norm": 0.8530026378603166, + "language_loss": 0.66995066, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.74583346, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01924133, + "step": 10203, + "time_per_iteration": 3.1688590049743652 + }, + { + "auxiliary_loss_clip": 0.06417629, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.0125538, + "epoch": 0.6134976702239591, + "flos": 41291145279360.0, + "grad_norm": 1.6901163598507989, + "language_loss": 0.61053431, + "learning_rate": 1.37263940830327e-06, + "loss": 0.68737298, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10845947, + "step": 10204, + "time_per_iteration": 2.7038605213165283 + }, + { + "auxiliary_loss_clip": 0.06412404, + "auxiliary_loss_mlp": 0.01263093, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01252901, + "epoch": 0.6135577934766271, + "flos": 22353233086080.0, + "grad_norm": 1.6787218918093536, + "language_loss": 0.72929007, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.80604506, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10198975, + "step": 10205, + "time_per_iteration": 2.5766189098358154 + }, + { + "auxiliary_loss_clip": 0.06411709, + "auxiliary_loss_mlp": 0.01265007, + "balance_loss_clip": 0.06273441, + "balance_loss_mlp": 0.01253843, + "epoch": 0.6136179167292951, + "flos": 23734198627200.0, + "grad_norm": 1.5218154078879744, + "language_loss": 0.76180834, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.83857548, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1116333, + "step": 10206, + "time_per_iteration": 2.5717761516571045 + }, + { + "auxiliary_loss_clip": 0.0641268, + "auxiliary_loss_mlp": 0.01265782, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254595, + "epoch": 0.613678039981963, + "flos": 26030757732480.0, + "grad_norm": 2.128320629636919, + "language_loss": 0.7591306, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.83591521, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11181641, + "step": 10207, + "time_per_iteration": 2.5353450775146484 + }, + { + "auxiliary_loss_clip": 0.06418657, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06278594, + "balance_loss_mlp": 0.01253362, + "epoch": 0.613738163234631, + "flos": 9863078175360.0, + "grad_norm": 1.9702213064203427, + "language_loss": 0.82853335, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.90536106, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10748291, + "step": 10208, + "time_per_iteration": 2.4810874462127686 + }, + { + "auxiliary_loss_clip": 0.06422867, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255814, + "epoch": 0.613798286487299, + "flos": 33190380576000.0, + "grad_norm": 1.7610608340758167, + "language_loss": 0.72894984, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.8058551, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.1184082, + "step": 10209, + "time_per_iteration": 2.6061112880706787 + }, + { + "auxiliary_loss_clip": 0.06413165, + "auxiliary_loss_mlp": 0.01267749, + "balance_loss_clip": 0.06273563, + "balance_loss_mlp": 0.01257157, + "epoch": 0.613858409739967, + "flos": 25634678682240.0, + "grad_norm": 1.6794559835324834, + "language_loss": 0.74641943, + "learning_rate": 1.37042100685438e-06, + "loss": 0.8232286, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10595703, + "step": 10210, + "time_per_iteration": 2.5699121952056885 + }, + { + "auxiliary_loss_clip": 0.06324588, + "auxiliary_loss_mlp": 0.01253647, + "balance_loss_clip": 0.06266326, + "balance_loss_mlp": 0.01251882, + "epoch": 0.6139185329926349, + "flos": 67213336919040.0, + "grad_norm": 0.8410650121869828, + "language_loss": 0.65019715, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.72597951, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01765442, + "step": 10211, + "time_per_iteration": 3.2996082305908203 + }, + { + "auxiliary_loss_clip": 0.06413533, + "auxiliary_loss_mlp": 0.01270005, + "balance_loss_clip": 0.06274238, + "balance_loss_mlp": 0.01258889, + "epoch": 0.6139786562453029, + "flos": 21550090101120.0, + "grad_norm": 1.5192132224806107, + "language_loss": 0.75830382, + "learning_rate": 1.369681730544801e-06, + "loss": 0.83513916, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11120605, + "step": 10212, + "time_per_iteration": 3.9495487213134766 + }, + { + "auxiliary_loss_clip": 0.06416361, + "auxiliary_loss_mlp": 0.01273486, + "balance_loss_clip": 0.06276919, + "balance_loss_mlp": 0.01262614, + "epoch": 0.6140387794979708, + "flos": 26075802101760.0, + "grad_norm": 1.4991601562707406, + "language_loss": 0.74122798, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.8181265, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10882568, + "step": 10213, + "time_per_iteration": 2.550542116165161 + }, + { + "auxiliary_loss_clip": 0.06420778, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06275892, + "balance_loss_mlp": 0.01253742, + "epoch": 0.6140989027506388, + "flos": 23701145391360.0, + "grad_norm": 1.8705312076501914, + "language_loss": 0.73641956, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.81327969, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.11499023, + "step": 10214, + "time_per_iteration": 2.524115562438965 + }, + { + "auxiliary_loss_clip": 0.06416141, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01253289, + "epoch": 0.6141590260033067, + "flos": 22237428343680.0, + "grad_norm": 1.5033107567748507, + "language_loss": 0.74553859, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.82234401, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11108398, + "step": 10215, + "time_per_iteration": 3.9794795513153076 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01258719, + "epoch": 0.6142191492559748, + "flos": 23877312600960.0, + "grad_norm": 1.5966298517178832, + "language_loss": 0.78681469, + "learning_rate": 1.368203464858542e-06, + "loss": 0.86360973, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10742188, + "step": 10216, + "time_per_iteration": 2.5095551013946533 + }, + { + "auxiliary_loss_clip": 0.06413998, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273836, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6142792725086427, + "flos": 15046764762240.0, + "grad_norm": 2.0499714549796475, + "language_loss": 0.8017531, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.87857044, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10797119, + "step": 10217, + "time_per_iteration": 2.530963897705078 + }, + { + "auxiliary_loss_clip": 0.06415407, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01255789, + "epoch": 0.6143393957613107, + "flos": 23337616452480.0, + "grad_norm": 2.309819184905194, + "language_loss": 0.78097677, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.85779876, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11004639, + "step": 10218, + "time_per_iteration": 2.5020768642425537 + }, + { + "auxiliary_loss_clip": 0.06413251, + "auxiliary_loss_mlp": 0.01268832, + "balance_loss_clip": 0.06275171, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6143995190139786, + "flos": 20122696598400.0, + "grad_norm": 1.7507364905585892, + "language_loss": 0.82176745, + "learning_rate": 1.367095017101569e-06, + "loss": 0.89858824, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10803223, + "step": 10219, + "time_per_iteration": 4.098464250564575 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01271094, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01259602, + "epoch": 0.6144596422666466, + "flos": 42313403491200.0, + "grad_norm": 1.6881627886326696, + "language_loss": 0.66870147, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.74555075, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1149292, + "step": 10220, + "time_per_iteration": 2.724275827407837 + }, + { + "auxiliary_loss_clip": 0.0641406, + "auxiliary_loss_mlp": 0.01269064, + "balance_loss_clip": 0.06274959, + "balance_loss_mlp": 0.012584, + "epoch": 0.6145197655193146, + "flos": 21578992560000.0, + "grad_norm": 2.2248894315314454, + "language_loss": 0.72078216, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.79761338, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10656738, + "step": 10221, + "time_per_iteration": 2.5253100395202637 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01267039, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01256609, + "epoch": 0.6145798887719826, + "flos": 21477610719360.0, + "grad_norm": 1.6538985449457846, + "language_loss": 0.7942664, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.87105858, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10430908, + "step": 10222, + "time_per_iteration": 2.5524139404296875 + }, + { + "auxiliary_loss_clip": 0.06418169, + "auxiliary_loss_mlp": 0.01267247, + "balance_loss_clip": 0.06275628, + "balance_loss_mlp": 0.01256447, + "epoch": 0.6146400120246506, + "flos": 20783270661120.0, + "grad_norm": 1.750623742282724, + "language_loss": 0.76586866, + "learning_rate": 1.365617422821788e-06, + "loss": 0.84272277, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10803223, + "step": 10223, + "time_per_iteration": 2.507918119430542 + }, + { + "auxiliary_loss_clip": 0.06413615, + "auxiliary_loss_mlp": 0.01266598, + "balance_loss_clip": 0.06278135, + "balance_loss_mlp": 0.01255392, + "epoch": 0.6147001352773185, + "flos": 13886423821440.0, + "grad_norm": 2.0249480129984287, + "language_loss": 0.78430009, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.86110222, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11212158, + "step": 10224, + "time_per_iteration": 2.5212504863739014 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.0126517, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01255359, + "epoch": 0.6147602585299865, + "flos": 56653920915840.0, + "grad_norm": 1.2562846499273215, + "language_loss": 0.66504145, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.74179292, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09814453, + "step": 10225, + "time_per_iteration": 2.814272880554199 + }, + { + "auxiliary_loss_clip": 0.06418905, + "auxiliary_loss_mlp": 0.01269548, + "balance_loss_clip": 0.06276867, + "balance_loss_mlp": 0.01258884, + "epoch": 0.6148203817826544, + "flos": 32825468044800.0, + "grad_norm": 1.9241791753141533, + "language_loss": 0.6340794, + "learning_rate": 1.364509479649357e-06, + "loss": 0.71096396, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10675049, + "step": 10226, + "time_per_iteration": 2.629307270050049 + }, + { + "auxiliary_loss_clip": 0.06414378, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.0627353, + "balance_loss_mlp": 0.01255303, + "epoch": 0.6148805050353224, + "flos": 18337811650560.0, + "grad_norm": 1.8500325381447646, + "language_loss": 0.76063347, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.83743972, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10949707, + "step": 10227, + "time_per_iteration": 2.5072264671325684 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01270328, + "balance_loss_clip": 0.06274723, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6149406282879903, + "flos": 14069173576320.0, + "grad_norm": 4.1558900532043, + "language_loss": 0.62490618, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.70178151, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.13568115, + "step": 10228, + "time_per_iteration": 2.625681161880493 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01265474, + "balance_loss_clip": 0.0627415, + "balance_loss_mlp": 0.01254763, + "epoch": 0.6150007515406584, + "flos": 25196909425920.0, + "grad_norm": 1.4129638919460634, + "language_loss": 0.74878526, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.82556051, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1071167, + "step": 10229, + "time_per_iteration": 2.5437581539154053 + }, + { + "auxiliary_loss_clip": 0.06413749, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.0627471, + "balance_loss_mlp": 0.01256876, + "epoch": 0.6150608747933263, + "flos": 21951829301760.0, + "grad_norm": 1.6020000118574074, + "language_loss": 0.78397381, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.86078924, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10906982, + "step": 10230, + "time_per_iteration": 2.530174732208252 + }, + { + "auxiliary_loss_clip": 0.06413004, + "auxiliary_loss_mlp": 0.01266985, + "balance_loss_clip": 0.06270448, + "balance_loss_mlp": 0.01256149, + "epoch": 0.6151209980459943, + "flos": 30125283022080.0, + "grad_norm": 1.40012821108437, + "language_loss": 0.72963595, + "learning_rate": 1.36266338983927e-06, + "loss": 0.80643588, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10827637, + "step": 10231, + "time_per_iteration": 2.5843095779418945 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06271622, + "balance_loss_mlp": 0.01256434, + "epoch": 0.6151811212986622, + "flos": 30016228533120.0, + "grad_norm": 1.7264160083970947, + "language_loss": 0.70266879, + "learning_rate": 1.362294244324858e-06, + "loss": 0.77945286, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10626221, + "step": 10232, + "time_per_iteration": 2.5726914405822754 + }, + { + "auxiliary_loss_clip": 0.06409374, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6152412445513302, + "flos": 18877675507200.0, + "grad_norm": 2.1019570874525484, + "language_loss": 0.92268974, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.99946421, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09960938, + "step": 10233, + "time_per_iteration": 2.475142002105713 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6153013678039982, + "flos": 25710847643520.0, + "grad_norm": 1.7026564571899578, + "language_loss": 0.7220425, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.79882705, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10412598, + "step": 10234, + "time_per_iteration": 2.538825750350952 + }, + { + "auxiliary_loss_clip": 0.06412051, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06270387, + "balance_loss_mlp": 0.01255187, + "epoch": 0.6153614910566662, + "flos": 28517529605760.0, + "grad_norm": 1.8042716232808833, + "language_loss": 0.67118728, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.74796581, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10620117, + "step": 10235, + "time_per_iteration": 2.5539941787719727 + }, + { + "auxiliary_loss_clip": 0.06416909, + "auxiliary_loss_mlp": 0.01269314, + "balance_loss_clip": 0.06272343, + "balance_loss_mlp": 0.01258489, + "epoch": 0.6154216143093342, + "flos": 23556480117120.0, + "grad_norm": 1.5012129447427485, + "language_loss": 0.81535256, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.89221478, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.10821533, + "step": 10236, + "time_per_iteration": 2.538961887359619 + }, + { + "auxiliary_loss_clip": 0.06413287, + "auxiliary_loss_mlp": 0.01269421, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01258281, + "epoch": 0.6154817375620021, + "flos": 22754804578560.0, + "grad_norm": 1.3960361226739142, + "language_loss": 0.8069132, + "learning_rate": 1.360448879760721e-06, + "loss": 0.88374025, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11138916, + "step": 10237, + "time_per_iteration": 2.5317978858947754 + }, + { + "auxiliary_loss_clip": 0.06410801, + "auxiliary_loss_mlp": 0.01271969, + "balance_loss_clip": 0.06272944, + "balance_loss_mlp": 0.01261198, + "epoch": 0.6155418608146701, + "flos": 27170455841280.0, + "grad_norm": 1.5039507372145677, + "language_loss": 0.76442957, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.84125727, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10772705, + "step": 10238, + "time_per_iteration": 2.5912821292877197 + }, + { + "auxiliary_loss_clip": 0.06320563, + "auxiliary_loss_mlp": 0.01256509, + "balance_loss_clip": 0.06262375, + "balance_loss_mlp": 0.01254774, + "epoch": 0.615601984067338, + "flos": 68828610003840.0, + "grad_norm": 1.135422984419524, + "language_loss": 0.57526618, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.65103698, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.01739502, + "step": 10239, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06415902, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 0.06273024, + "balance_loss_mlp": 0.0125323, + "epoch": 0.615662107320006, + "flos": 15521528396160.0, + "grad_norm": 1.8815161483190883, + "language_loss": 0.77940285, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.8561992, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10498047, + "step": 10240, + "time_per_iteration": 2.4900901317596436 + }, + { + "auxiliary_loss_clip": 0.06418262, + "auxiliary_loss_mlp": 0.01272722, + "balance_loss_clip": 0.06275868, + "balance_loss_mlp": 0.01262017, + "epoch": 0.615722230572674, + "flos": 21069121265280.0, + "grad_norm": 2.263045257123095, + "language_loss": 0.72996962, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.80687952, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1071167, + "step": 10241, + "time_per_iteration": 3.901360511779785 + }, + { + "auxiliary_loss_clip": 0.06409363, + "auxiliary_loss_mlp": 0.01269863, + "balance_loss_clip": 0.0627209, + "balance_loss_mlp": 0.01259873, + "epoch": 0.615782353825342, + "flos": 23263250353920.0, + "grad_norm": 1.504543290987149, + "language_loss": 0.72248924, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.79928148, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09997559, + "step": 10242, + "time_per_iteration": 2.5169565677642822 + }, + { + "auxiliary_loss_clip": 0.06411266, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06271993, + "balance_loss_mlp": 0.01255066, + "epoch": 0.6158424770780099, + "flos": 21109972930560.0, + "grad_norm": 2.215067200442713, + "language_loss": 0.7281, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.80486894, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10565186, + "step": 10243, + "time_per_iteration": 2.540512800216675 + }, + { + "auxiliary_loss_clip": 0.06321675, + "auxiliary_loss_mlp": 0.01255828, + "balance_loss_clip": 0.06263578, + "balance_loss_mlp": 0.01254183, + "epoch": 0.6159026003306779, + "flos": 70355358120960.0, + "grad_norm": 0.7449608811837395, + "language_loss": 0.56762981, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.64340484, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.58105469, + "router_z_loss_mlp": 0.01647949, + "step": 10244, + "time_per_iteration": 3.2194366455078125 + }, + { + "auxiliary_loss_clip": 0.06409553, + "auxiliary_loss_mlp": 0.01267536, + "balance_loss_clip": 0.06271067, + "balance_loss_mlp": 0.01256855, + "epoch": 0.6159627235833458, + "flos": 33882624282240.0, + "grad_norm": 1.5482958097169006, + "language_loss": 0.63865972, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.71543062, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10675049, + "step": 10245, + "time_per_iteration": 2.640113353729248 + }, + { + "auxiliary_loss_clip": 0.06409854, + "auxiliary_loss_mlp": 0.01267557, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01257442, + "epoch": 0.6160228468360138, + "flos": 26582193452160.0, + "grad_norm": 1.6235599905950853, + "language_loss": 0.79032344, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.8670975, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10119629, + "step": 10246, + "time_per_iteration": 2.5686607360839844 + }, + { + "auxiliary_loss_clip": 0.0641896, + "auxiliary_loss_mlp": 0.0127079, + "balance_loss_clip": 0.0627369, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6160829700886818, + "flos": 17197568490240.0, + "grad_norm": 2.4844316843996825, + "language_loss": 0.88253343, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.95943093, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1184082, + "step": 10247, + "time_per_iteration": 2.450960397720337 + }, + { + "auxiliary_loss_clip": 0.06417046, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.06275311, + "balance_loss_mlp": 0.01258028, + "epoch": 0.6161430933413498, + "flos": 23630385018240.0, + "grad_norm": 1.598841912113341, + "language_loss": 0.80267406, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.87952548, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10076904, + "step": 10248, + "time_per_iteration": 2.5717732906341553 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01268015, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6162032165940178, + "flos": 23009027466240.0, + "grad_norm": 1.6786182085700423, + "language_loss": 0.87678397, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.95355916, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10284424, + "step": 10249, + "time_per_iteration": 2.5637669563293457 + }, + { + "auxiliary_loss_clip": 0.06414458, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06273694, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6162633398466857, + "flos": 39431474962560.0, + "grad_norm": 2.372002019412244, + "language_loss": 0.70129162, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.7780953, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10668945, + "step": 10250, + "time_per_iteration": 2.700856924057007 + }, + { + "auxiliary_loss_clip": 0.06403701, + "auxiliary_loss_mlp": 0.01263182, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.0125386, + "epoch": 0.6163234630993537, + "flos": 19250679957120.0, + "grad_norm": 1.6751579708994577, + "language_loss": 0.74076283, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.81743157, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09320068, + "step": 10251, + "time_per_iteration": 3.9032137393951416 + }, + { + "auxiliary_loss_clip": 0.06412694, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06272181, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6163835863520216, + "flos": 15967389571200.0, + "grad_norm": 1.9695671027525665, + "language_loss": 0.69094777, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.76772505, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.09753418, + "step": 10252, + "time_per_iteration": 2.546041250228882 + }, + { + "auxiliary_loss_clip": 0.06321114, + "auxiliary_loss_mlp": 0.01253403, + "balance_loss_clip": 0.06262837, + "balance_loss_mlp": 0.01252003, + "epoch": 0.6164437096046896, + "flos": 68124905487360.0, + "grad_norm": 0.8614248496363994, + "language_loss": 0.57690394, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.6526491, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01400757, + "step": 10253, + "time_per_iteration": 3.1977267265319824 + }, + { + "auxiliary_loss_clip": 0.06417613, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06274711, + "balance_loss_mlp": 0.01257783, + "epoch": 0.6165038328573575, + "flos": 21367633835520.0, + "grad_norm": 1.503369483441608, + "language_loss": 0.79960692, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.876468, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1071167, + "step": 10254, + "time_per_iteration": 3.95928692817688 + }, + { + "auxiliary_loss_clip": 0.06419028, + "auxiliary_loss_mlp": 0.01264054, + "balance_loss_clip": 0.06276255, + "balance_loss_mlp": 0.01253128, + "epoch": 0.6165639561100256, + "flos": 21107708870400.0, + "grad_norm": 1.746255949432921, + "language_loss": 0.81143081, + "learning_rate": 1.353810600008846e-06, + "loss": 0.88826168, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10925293, + "step": 10255, + "time_per_iteration": 2.5300750732421875 + }, + { + "auxiliary_loss_clip": 0.06416211, + "auxiliary_loss_mlp": 0.01266666, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01255371, + "epoch": 0.6166240793626935, + "flos": 25345683550080.0, + "grad_norm": 1.880965378472566, + "language_loss": 0.65514123, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.73196995, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11291504, + "step": 10256, + "time_per_iteration": 2.539006233215332 + }, + { + "auxiliary_loss_clip": 0.06415517, + "auxiliary_loss_mlp": 0.01267871, + "balance_loss_clip": 0.06277969, + "balance_loss_mlp": 0.0125806, + "epoch": 0.6166842026153615, + "flos": 19688742702720.0, + "grad_norm": 1.5659047978931129, + "language_loss": 0.72409272, + "learning_rate": 1.353073501949825e-06, + "loss": 0.80092663, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09802246, + "step": 10257, + "time_per_iteration": 2.5153865814208984 + }, + { + "auxiliary_loss_clip": 0.06416216, + "auxiliary_loss_mlp": 0.01264385, + "balance_loss_clip": 0.06275131, + "balance_loss_mlp": 0.01253788, + "epoch": 0.6167443258680294, + "flos": 19324501004160.0, + "grad_norm": 1.6557108650811327, + "language_loss": 0.71972775, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.79653382, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.1060791, + "step": 10258, + "time_per_iteration": 2.480304718017578 + }, + { + "auxiliary_loss_clip": 0.06417316, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275502, + "balance_loss_mlp": 0.01253222, + "epoch": 0.6168044491206974, + "flos": 25272323700480.0, + "grad_norm": 1.9257678582667488, + "language_loss": 0.63553512, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.71234685, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10638428, + "step": 10259, + "time_per_iteration": 4.02075719833374 + }, + { + "auxiliary_loss_clip": 0.06410451, + "auxiliary_loss_mlp": 0.0126865, + "balance_loss_clip": 0.0627453, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6168645723733654, + "flos": 13224130750080.0, + "grad_norm": 1.6228127894065456, + "language_loss": 0.71578032, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.79257131, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10748291, + "step": 10260, + "time_per_iteration": 2.4910624027252197 + }, + { + "auxiliary_loss_clip": 0.06424432, + "auxiliary_loss_mlp": 0.01268478, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01256492, + "epoch": 0.6169246956260334, + "flos": 26659410589440.0, + "grad_norm": 1.7088590339487795, + "language_loss": 0.68640685, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.76333594, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11981201, + "step": 10261, + "time_per_iteration": 2.5747649669647217 + }, + { + "auxiliary_loss_clip": 0.06414127, + "auxiliary_loss_mlp": 0.01264284, + "balance_loss_clip": 0.06274065, + "balance_loss_mlp": 0.01254151, + "epoch": 0.6169848188787014, + "flos": 23155034405760.0, + "grad_norm": 1.7119551141937153, + "language_loss": 0.71845949, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.79524362, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10131836, + "step": 10262, + "time_per_iteration": 2.560232162475586 + }, + { + "auxiliary_loss_clip": 0.06416971, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06277905, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6170449421313693, + "flos": 23338748482560.0, + "grad_norm": 1.8792858261778465, + "language_loss": 0.70386994, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.7806955, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11157227, + "step": 10263, + "time_per_iteration": 2.5188369750976562 + }, + { + "auxiliary_loss_clip": 0.06418619, + "auxiliary_loss_mlp": 0.01266762, + "balance_loss_clip": 0.06274839, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6171050653840373, + "flos": 15857077271040.0, + "grad_norm": 2.3172465393141404, + "language_loss": 0.76572752, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.84258133, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.10150146, + "step": 10264, + "time_per_iteration": 2.525599956512451 + }, + { + "auxiliary_loss_clip": 0.06414546, + "auxiliary_loss_mlp": 0.01266705, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01255726, + "epoch": 0.6171651886367052, + "flos": 20051349246720.0, + "grad_norm": 2.349171582745048, + "language_loss": 0.85150325, + "learning_rate": 1.350126092092247e-06, + "loss": 0.92831576, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10992432, + "step": 10265, + "time_per_iteration": 2.5084152221679688 + }, + { + "auxiliary_loss_clip": 0.06410134, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6172253118893732, + "flos": 26439959946240.0, + "grad_norm": 2.0102817715219112, + "language_loss": 0.64766055, + "learning_rate": 1.349757776608153e-06, + "loss": 0.72441077, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10791016, + "step": 10266, + "time_per_iteration": 2.5796725749969482 + }, + { + "auxiliary_loss_clip": 0.06410654, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06270823, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6172854351420412, + "flos": 22638622492800.0, + "grad_norm": 1.5096082169739153, + "language_loss": 0.76070148, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.83748215, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10150146, + "step": 10267, + "time_per_iteration": 2.5105693340301514 + }, + { + "auxiliary_loss_clip": 0.06419747, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01254066, + "epoch": 0.6173455583947092, + "flos": 21218943565440.0, + "grad_norm": 1.6454778934730863, + "language_loss": 0.7525773, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.82942522, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10974121, + "step": 10268, + "time_per_iteration": 2.587233543395996 + }, + { + "auxiliary_loss_clip": 0.06419453, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06273817, + "balance_loss_mlp": 0.01256396, + "epoch": 0.6174056816473771, + "flos": 19506370291200.0, + "grad_norm": 1.5800856340056704, + "language_loss": 0.75772798, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.83459222, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.10583496, + "step": 10269, + "time_per_iteration": 2.4955811500549316 + }, + { + "auxiliary_loss_clip": 0.06411718, + "auxiliary_loss_mlp": 0.01267212, + "balance_loss_clip": 0.06271979, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6174658049000451, + "flos": 16002790721280.0, + "grad_norm": 2.3324483712409685, + "language_loss": 0.76473081, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.84152013, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10662842, + "step": 10270, + "time_per_iteration": 2.5138041973114014 + }, + { + "auxiliary_loss_clip": 0.0641441, + "auxiliary_loss_mlp": 0.01270386, + "balance_loss_clip": 0.06274129, + "balance_loss_mlp": 0.0125986, + "epoch": 0.617525928152713, + "flos": 21909635971200.0, + "grad_norm": 1.7440039477364133, + "language_loss": 0.82272917, + "learning_rate": 1.347916569325736e-06, + "loss": 0.89957708, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10522461, + "step": 10271, + "time_per_iteration": 2.488560676574707 + }, + { + "auxiliary_loss_clip": 0.06416266, + "auxiliary_loss_mlp": 0.01264784, + "balance_loss_clip": 0.06273527, + "balance_loss_mlp": 0.01254801, + "epoch": 0.617586051405381, + "flos": 21112362771840.0, + "grad_norm": 1.4517106193495921, + "language_loss": 0.77416623, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.85097671, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.09985352, + "step": 10272, + "time_per_iteration": 2.520111560821533 + }, + { + "auxiliary_loss_clip": 0.06312063, + "auxiliary_loss_mlp": 0.01254406, + "balance_loss_clip": 0.06253687, + "balance_loss_mlp": 0.01252749, + "epoch": 0.617646174658049, + "flos": 58629129684480.0, + "grad_norm": 0.7932568322885909, + "language_loss": 0.59031951, + "learning_rate": 1.347180259404513e-06, + "loss": 0.66598421, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01660156, + "step": 10273, + "time_per_iteration": 2.9967992305755615 + }, + { + "auxiliary_loss_clip": 0.0640862, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.01254274, + "epoch": 0.617706297910717, + "flos": 13883363147520.0, + "grad_norm": 2.2785278271278897, + "language_loss": 0.73286194, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.80959731, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10632324, + "step": 10274, + "time_per_iteration": 2.4770405292510986 + }, + { + "auxiliary_loss_clip": 0.06412372, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.01255713, + "epoch": 0.617766421163385, + "flos": 19214482193280.0, + "grad_norm": 1.605129158536194, + "language_loss": 0.77453375, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.85132062, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.1060791, + "step": 10275, + "time_per_iteration": 2.4878437519073486 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01271601, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01261134, + "epoch": 0.6178265444160529, + "flos": 22572725656320.0, + "grad_norm": 1.5524938527976675, + "language_loss": 0.79471135, + "learning_rate": 1.346075980219998e-06, + "loss": 0.87152702, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10461426, + "step": 10276, + "time_per_iteration": 2.644413709640503 + }, + { + "auxiliary_loss_clip": 0.06416178, + "auxiliary_loss_mlp": 0.0126935, + "balance_loss_clip": 0.06274026, + "balance_loss_mlp": 0.01258192, + "epoch": 0.6178866676687209, + "flos": 11989130221440.0, + "grad_norm": 2.611664280498841, + "language_loss": 0.81007028, + "learning_rate": 1.345707936733612e-06, + "loss": 0.88692558, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.1114502, + "step": 10277, + "time_per_iteration": 2.497955799102783 + }, + { + "auxiliary_loss_clip": 0.06418674, + "auxiliary_loss_mlp": 0.01267294, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01256381, + "epoch": 0.6179467909213888, + "flos": 20997061153920.0, + "grad_norm": 1.6653557744536012, + "language_loss": 0.81855345, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.89541304, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10925293, + "step": 10278, + "time_per_iteration": 2.529439687728882 + }, + { + "auxiliary_loss_clip": 0.06410799, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.0125394, + "epoch": 0.6180069141740568, + "flos": 25345180425600.0, + "grad_norm": 1.5510866303043802, + "language_loss": 0.74313521, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.81988013, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09753418, + "step": 10279, + "time_per_iteration": 2.5355474948883057 + }, + { + "auxiliary_loss_clip": 0.06408358, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06268996, + "balance_loss_mlp": 0.0125316, + "epoch": 0.6180670374267248, + "flos": 19651748325120.0, + "grad_norm": 1.3695497899575455, + "language_loss": 0.70764935, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.78436339, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09887695, + "step": 10280, + "time_per_iteration": 3.9792449474334717 + }, + { + "auxiliary_loss_clip": 0.06417054, + "auxiliary_loss_mlp": 0.01267828, + "balance_loss_clip": 0.06274389, + "balance_loss_mlp": 0.01256873, + "epoch": 0.6181271606793928, + "flos": 19471136849280.0, + "grad_norm": 1.3977623720923391, + "language_loss": 0.73107064, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.8079195, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10943604, + "step": 10281, + "time_per_iteration": 2.515800952911377 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256927, + "epoch": 0.6181872839320607, + "flos": 25601541592320.0, + "grad_norm": 1.5934743777966283, + "language_loss": 0.76599932, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.84277344, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09924316, + "step": 10282, + "time_per_iteration": 2.5432822704315186 + }, + { + "auxiliary_loss_clip": 0.06415926, + "auxiliary_loss_mlp": 0.01266703, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01254884, + "epoch": 0.6182474071847287, + "flos": 25558048523520.0, + "grad_norm": 1.5342450755249748, + "language_loss": 0.69123679, + "learning_rate": 1.343500197330931e-06, + "loss": 0.76806307, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1182251, + "step": 10283, + "time_per_iteration": 2.588545322418213 + }, + { + "auxiliary_loss_clip": 0.06422709, + "auxiliary_loss_mlp": 0.0126698, + "balance_loss_clip": 0.06273957, + "balance_loss_mlp": 0.01255607, + "epoch": 0.6183075304373966, + "flos": 22129673592960.0, + "grad_norm": 1.473012438045687, + "language_loss": 0.75165606, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.82855296, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.11364746, + "step": 10284, + "time_per_iteration": 2.4986348152160645 + }, + { + "auxiliary_loss_clip": 0.06405671, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01259034, + "epoch": 0.6183676536900646, + "flos": 22462161793920.0, + "grad_norm": 1.4548798471123576, + "language_loss": 0.75635868, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.83310193, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09619141, + "step": 10285, + "time_per_iteration": 2.585350513458252 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.0126635, + "balance_loss_clip": 0.06269899, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6184277769427327, + "flos": 23370250417920.0, + "grad_norm": 1.3734994412846095, + "language_loss": 0.72883123, + "learning_rate": 1.342396663517503e-06, + "loss": 0.80559498, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10286, + "time_per_iteration": 2.569110870361328 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01268421, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.01257311, + "epoch": 0.6184879001954006, + "flos": 22717684419840.0, + "grad_norm": 1.5486281180664692, + "language_loss": 0.76501298, + "learning_rate": 1.342028868767199e-06, + "loss": 0.84181046, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11108398, + "step": 10287, + "time_per_iteration": 2.5511634349823 + }, + { + "auxiliary_loss_clip": 0.06411948, + "auxiliary_loss_mlp": 0.01264572, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01253587, + "epoch": 0.6185480234480686, + "flos": 23848703631360.0, + "grad_norm": 1.5880408145773481, + "language_loss": 0.73586667, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.81263179, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10986328, + "step": 10288, + "time_per_iteration": 2.507291555404663 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01263119, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.0125264, + "epoch": 0.6186081467007365, + "flos": 45487932877440.0, + "grad_norm": 1.4570853227015406, + "language_loss": 0.73074299, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.80746555, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10473633, + "step": 10289, + "time_per_iteration": 2.7538769245147705 + }, + { + "auxiliary_loss_clip": 0.0641107, + "auxiliary_loss_mlp": 0.01268567, + "balance_loss_clip": 0.06269012, + "balance_loss_mlp": 0.01257468, + "epoch": 0.6186682699534045, + "flos": 23557737928320.0, + "grad_norm": 1.4253961785396534, + "language_loss": 0.79380536, + "learning_rate": 1.340925634274056e-06, + "loss": 0.87060177, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11090088, + "step": 10290, + "time_per_iteration": 2.532860040664673 + }, + { + "auxiliary_loss_clip": 0.06417654, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 0.06273635, + "balance_loss_mlp": 0.01258374, + "epoch": 0.6187283932060724, + "flos": 25781062965120.0, + "grad_norm": 1.5195693495374782, + "language_loss": 0.81756544, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.89443594, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11022949, + "step": 10291, + "time_per_iteration": 3.985360860824585 + }, + { + "auxiliary_loss_clip": 0.06414646, + "auxiliary_loss_mlp": 0.0126579, + "balance_loss_clip": 0.06274836, + "balance_loss_mlp": 0.01255967, + "epoch": 0.6187885164587404, + "flos": 25272281773440.0, + "grad_norm": 5.259543114674327, + "language_loss": 0.78044999, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.85725427, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.09820557, + "step": 10292, + "time_per_iteration": 2.5699048042297363 + }, + { + "auxiliary_loss_clip": 0.06421922, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06274973, + "balance_loss_mlp": 0.01257285, + "epoch": 0.6188486397114084, + "flos": 26258090659200.0, + "grad_norm": 2.757581205213687, + "language_loss": 0.73825526, + "learning_rate": 1.339822624710401e-06, + "loss": 0.81516558, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.11816406, + "step": 10293, + "time_per_iteration": 4.005521774291992 + }, + { + "auxiliary_loss_clip": 0.06414802, + "auxiliary_loss_mlp": 0.01268302, + "balance_loss_clip": 0.06274456, + "balance_loss_mlp": 0.0125721, + "epoch": 0.6189087629640764, + "flos": 20929738798080.0, + "grad_norm": 1.751787926809697, + "language_loss": 0.83461618, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.91144723, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11096191, + "step": 10294, + "time_per_iteration": 2.5416274070739746 + }, + { + "auxiliary_loss_clip": 0.06413339, + "auxiliary_loss_mlp": 0.01271366, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01260434, + "epoch": 0.6189688862167443, + "flos": 14835070621440.0, + "grad_norm": 2.3983238935990525, + "language_loss": 0.70671308, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.7835601, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10931396, + "step": 10295, + "time_per_iteration": 2.474698781967163 + }, + { + "auxiliary_loss_clip": 0.06411821, + "auxiliary_loss_mlp": 0.01272777, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01261494, + "epoch": 0.6190290094694123, + "flos": 24292803870720.0, + "grad_norm": 1.4317659849997142, + "language_loss": 0.69952327, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.77636921, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11291504, + "step": 10296, + "time_per_iteration": 2.618892192840576 + }, + { + "auxiliary_loss_clip": 0.06412887, + "auxiliary_loss_mlp": 0.01267051, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.0125547, + "epoch": 0.6190891327220802, + "flos": 22536192476160.0, + "grad_norm": 1.9563521083429962, + "language_loss": 0.71887541, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.7956748, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11584473, + "step": 10297, + "time_per_iteration": 2.5115151405334473 + }, + { + "auxiliary_loss_clip": 0.0631431, + "auxiliary_loss_mlp": 0.01254184, + "balance_loss_clip": 0.0625589, + "balance_loss_mlp": 0.01252958, + "epoch": 0.6191492559747482, + "flos": 67748756509440.0, + "grad_norm": 0.8712851262632907, + "language_loss": 0.64291644, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.71860135, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.01225281, + "step": 10298, + "time_per_iteration": 3.0254995822906494 + }, + { + "auxiliary_loss_clip": 0.06415632, + "auxiliary_loss_mlp": 0.01266663, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01255517, + "epoch": 0.6192093792274163, + "flos": 22353316940160.0, + "grad_norm": 1.6622389387462033, + "language_loss": 0.73995864, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.81678164, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11157227, + "step": 10299, + "time_per_iteration": 3.9369277954101562 + }, + { + "auxiliary_loss_clip": 0.06421331, + "auxiliary_loss_mlp": 0.01268355, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.01257054, + "epoch": 0.6192695024800842, + "flos": 13559176500480.0, + "grad_norm": 1.5604516058647369, + "language_loss": 0.68912721, + "learning_rate": 1.337249812568732e-06, + "loss": 0.76602411, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11297607, + "step": 10300, + "time_per_iteration": 2.462852716445923 + }, + { + "auxiliary_loss_clip": 0.06414428, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255241, + "epoch": 0.6193296257327522, + "flos": 17420163661440.0, + "grad_norm": 1.6482033452585196, + "language_loss": 0.67021179, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.74702382, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11529541, + "step": 10301, + "time_per_iteration": 2.496779680252075 + }, + { + "auxiliary_loss_clip": 0.06414926, + "auxiliary_loss_mlp": 0.01266961, + "balance_loss_clip": 0.06272815, + "balance_loss_mlp": 0.01256411, + "epoch": 0.6193897489854201, + "flos": 31108869774720.0, + "grad_norm": 1.608536765976836, + "language_loss": 0.72948015, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.80629897, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10546875, + "step": 10302, + "time_per_iteration": 2.5844531059265137 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01269333, + "balance_loss_clip": 0.06273288, + "balance_loss_mlp": 0.01258038, + "epoch": 0.6194498722380881, + "flos": 19139822605440.0, + "grad_norm": 1.7442373384203957, + "language_loss": 0.81269908, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.88953209, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11303711, + "step": 10303, + "time_per_iteration": 2.527067184448242 + }, + { + "auxiliary_loss_clip": 0.06420361, + "auxiliary_loss_mlp": 0.01268221, + "balance_loss_clip": 0.06272827, + "balance_loss_mlp": 0.01255274, + "epoch": 0.619509995490756, + "flos": 21841517001600.0, + "grad_norm": 1.6019319576417599, + "language_loss": 0.76846468, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.8453505, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.12957764, + "step": 10304, + "time_per_iteration": 2.4880640506744385 + }, + { + "auxiliary_loss_clip": 0.06424797, + "auxiliary_loss_mlp": 0.01268109, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.0125617, + "epoch": 0.619570118743424, + "flos": 23813512116480.0, + "grad_norm": 1.7485917713195505, + "language_loss": 0.77554089, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.85246998, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.1194458, + "step": 10305, + "time_per_iteration": 2.5362794399261475 + }, + { + "auxiliary_loss_clip": 0.06418667, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.0125508, + "epoch": 0.619630241996092, + "flos": 21107289600000.0, + "grad_norm": 1.5608682149054525, + "language_loss": 0.79292911, + "learning_rate": 1.335045524968045e-06, + "loss": 0.86978668, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.12005615, + "step": 10306, + "time_per_iteration": 2.5073060989379883 + }, + { + "auxiliary_loss_clip": 0.0640957, + "auxiliary_loss_mlp": 0.01267646, + "balance_loss_clip": 0.06271125, + "balance_loss_mlp": 0.01258067, + "epoch": 0.61969036524876, + "flos": 27315666167040.0, + "grad_norm": 1.5979283875043302, + "language_loss": 0.80772972, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.88450187, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09576416, + "step": 10307, + "time_per_iteration": 2.576525926589966 + }, + { + "auxiliary_loss_clip": 0.06313084, + "auxiliary_loss_mlp": 0.01252494, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.01251256, + "epoch": 0.6197504885014279, + "flos": 51667308403200.0, + "grad_norm": 0.783320902533958, + "language_loss": 0.59562945, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.67128521, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01237488, + "step": 10308, + "time_per_iteration": 3.167433738708496 + }, + { + "auxiliary_loss_clip": 0.06410602, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254191, + "epoch": 0.6198106117540959, + "flos": 30565316338560.0, + "grad_norm": 1.6157907948964547, + "language_loss": 0.68128729, + "learning_rate": 1.333943721384037e-06, + "loss": 0.75804067, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10540771, + "step": 10309, + "time_per_iteration": 2.5872271060943604 + }, + { + "auxiliary_loss_clip": 0.06412695, + "auxiliary_loss_mlp": 0.01268034, + "balance_loss_clip": 0.06273058, + "balance_loss_mlp": 0.01257108, + "epoch": 0.6198707350067638, + "flos": 18914586030720.0, + "grad_norm": 1.6991122803597551, + "language_loss": 0.725124, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.80193126, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10925293, + "step": 10310, + "time_per_iteration": 2.5339155197143555 + }, + { + "auxiliary_loss_clip": 0.0642102, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06275747, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6199308582594318, + "flos": 21440238998400.0, + "grad_norm": 1.796323815916351, + "language_loss": 0.78780711, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.86468887, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.12176514, + "step": 10311, + "time_per_iteration": 2.5148420333862305 + }, + { + "auxiliary_loss_clip": 0.06414344, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125643, + "epoch": 0.6199909815120999, + "flos": 18413561341440.0, + "grad_norm": 2.1642456621818935, + "language_loss": 0.72494328, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.80176294, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11175537, + "step": 10312, + "time_per_iteration": 2.5287880897521973 + }, + { + "auxiliary_loss_clip": 0.0642011, + "auxiliary_loss_mlp": 0.01266003, + "balance_loss_clip": 0.06274375, + "balance_loss_mlp": 0.01254744, + "epoch": 0.6200511047647678, + "flos": 21472663328640.0, + "grad_norm": 5.562964449835012, + "language_loss": 0.72224271, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.1126709, + "step": 10313, + "time_per_iteration": 2.5028812885284424 + }, + { + "auxiliary_loss_clip": 0.06422722, + "auxiliary_loss_mlp": 0.01266585, + "balance_loss_clip": 0.06275584, + "balance_loss_mlp": 0.01254521, + "epoch": 0.6201112280174358, + "flos": 18220539461760.0, + "grad_norm": 1.7747609453089435, + "language_loss": 0.78361583, + "learning_rate": 1.332107887401416e-06, + "loss": 0.86050892, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.12054443, + "step": 10314, + "time_per_iteration": 2.5241122245788574 + }, + { + "auxiliary_loss_clip": 0.06416035, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06273148, + "balance_loss_mlp": 0.01253723, + "epoch": 0.6201713512701037, + "flos": 20017373616000.0, + "grad_norm": 1.7540334225503873, + "language_loss": 0.78008437, + "learning_rate": 1.331740796528812e-06, + "loss": 0.8568911, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10925293, + "step": 10315, + "time_per_iteration": 2.515916585922241 + }, + { + "auxiliary_loss_clip": 0.06417818, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 0.06271202, + "balance_loss_mlp": 0.01257719, + "epoch": 0.6202314745227717, + "flos": 22493537948160.0, + "grad_norm": 2.219101181270965, + "language_loss": 0.76005399, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.83691716, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.10766602, + "step": 10316, + "time_per_iteration": 2.5367636680603027 + }, + { + "auxiliary_loss_clip": 0.06417404, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06271914, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6202915977754396, + "flos": 26835116601600.0, + "grad_norm": 1.8483221587209677, + "language_loss": 0.77761883, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.8544724, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11004639, + "step": 10317, + "time_per_iteration": 2.5396320819854736 + }, + { + "auxiliary_loss_clip": 0.06315257, + "auxiliary_loss_mlp": 0.01256399, + "balance_loss_clip": 0.0625724, + "balance_loss_mlp": 0.01255023, + "epoch": 0.6203517210281076, + "flos": 62763248828160.0, + "grad_norm": 0.6893904060556487, + "language_loss": 0.58856946, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.66428602, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01377869, + "step": 10318, + "time_per_iteration": 3.1691195964813232 + }, + { + "auxiliary_loss_clip": 0.06414767, + "auxiliary_loss_mlp": 0.01270191, + "balance_loss_clip": 0.06272453, + "balance_loss_mlp": 0.01258425, + "epoch": 0.6204118442807756, + "flos": 23411018229120.0, + "grad_norm": 1.7666446205430133, + "language_loss": 0.78163171, + "learning_rate": 1.330272686582143e-06, + "loss": 0.85848129, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11755371, + "step": 10319, + "time_per_iteration": 2.5313587188720703 + }, + { + "auxiliary_loss_clip": 0.06410229, + "auxiliary_loss_mlp": 0.01267722, + "balance_loss_clip": 0.06271461, + "balance_loss_mlp": 0.01257589, + "epoch": 0.6204719675334436, + "flos": 20199871808640.0, + "grad_norm": 1.5707406021720693, + "language_loss": 0.66525, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.74202955, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 10320, + "time_per_iteration": 3.8696272373199463 + }, + { + "auxiliary_loss_clip": 0.06407389, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06270511, + "balance_loss_mlp": 0.01255025, + "epoch": 0.6205320907861115, + "flos": 13193048085120.0, + "grad_norm": 1.6249727148286428, + "language_loss": 0.76339847, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.84012175, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09912109, + "step": 10321, + "time_per_iteration": 2.4867870807647705 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270664, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6205922140387795, + "flos": 20674761223680.0, + "grad_norm": 1.5610091783179405, + "language_loss": 0.74460745, + "learning_rate": 1.329171870732758e-06, + "loss": 0.82137096, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10693359, + "step": 10322, + "time_per_iteration": 2.506465196609497 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272407, + "balance_loss_mlp": 0.01255275, + "epoch": 0.6206523372914474, + "flos": 23884524051840.0, + "grad_norm": 1.6823894915828839, + "language_loss": 0.72711974, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.80387706, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09667969, + "step": 10323, + "time_per_iteration": 2.5490479469299316 + }, + { + "auxiliary_loss_clip": 0.06424229, + "auxiliary_loss_mlp": 0.012682, + "balance_loss_clip": 0.06274472, + "balance_loss_mlp": 0.01257322, + "epoch": 0.6207124605441154, + "flos": 13411576333440.0, + "grad_norm": 31.978129858103646, + "language_loss": 0.59017056, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.66709483, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.10876465, + "step": 10324, + "time_per_iteration": 2.5541300773620605 + }, + { + "auxiliary_loss_clip": 0.0641806, + "auxiliary_loss_mlp": 0.01267454, + "balance_loss_clip": 0.06274732, + "balance_loss_mlp": 0.01255664, + "epoch": 0.6207725837967835, + "flos": 18922300606080.0, + "grad_norm": 1.723600813321157, + "language_loss": 0.76792443, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.84477955, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.11791992, + "step": 10325, + "time_per_iteration": 2.5330686569213867 + }, + { + "auxiliary_loss_clip": 0.06421543, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06275088, + "balance_loss_mlp": 0.01256207, + "epoch": 0.6208327070494514, + "flos": 23985738184320.0, + "grad_norm": 1.8229064209367492, + "language_loss": 0.72747815, + "learning_rate": 1.327704472462003e-06, + "loss": 0.80436671, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.11120605, + "step": 10326, + "time_per_iteration": 2.5343799591064453 + }, + { + "auxiliary_loss_clip": 0.06419887, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06274612, + "balance_loss_mlp": 0.0125687, + "epoch": 0.6208928303021194, + "flos": 22827032398080.0, + "grad_norm": 1.9354170249209526, + "language_loss": 0.73989004, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.81677705, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.11950684, + "step": 10327, + "time_per_iteration": 2.555742025375366 + }, + { + "auxiliary_loss_clip": 0.06417272, + "auxiliary_loss_mlp": 0.0126664, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255261, + "epoch": 0.6209529535547873, + "flos": 17569944034560.0, + "grad_norm": 2.1609251311460493, + "language_loss": 0.80099189, + "learning_rate": 1.326970926232066e-06, + "loss": 0.8778311, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.11364746, + "step": 10328, + "time_per_iteration": 2.4839911460876465 + }, + { + "auxiliary_loss_clip": 0.06413457, + "auxiliary_loss_mlp": 0.0126611, + "balance_loss_clip": 0.06270879, + "balance_loss_mlp": 0.01254791, + "epoch": 0.6210130768074553, + "flos": 22017432648960.0, + "grad_norm": 1.8104585499122046, + "language_loss": 0.78316593, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.85996157, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11322021, + "step": 10329, + "time_per_iteration": 2.551748514175415 + }, + { + "auxiliary_loss_clip": 0.06317136, + "auxiliary_loss_mlp": 0.01252093, + "balance_loss_clip": 0.0625931, + "balance_loss_mlp": 0.0125077, + "epoch": 0.6210732000601232, + "flos": 63695166739200.0, + "grad_norm": 0.8181079803134828, + "language_loss": 0.62296569, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.69865799, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.013237, + "step": 10330, + "time_per_iteration": 4.52486252784729 + }, + { + "auxiliary_loss_clip": 0.06422883, + "auxiliary_loss_mlp": 0.01268071, + "balance_loss_clip": 0.06276384, + "balance_loss_mlp": 0.01256275, + "epoch": 0.6211333233127913, + "flos": 24250233196800.0, + "grad_norm": 2.0105352809521517, + "language_loss": 0.77933174, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.85624135, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.11791992, + "step": 10331, + "time_per_iteration": 2.558311939239502 + }, + { + "auxiliary_loss_clip": 0.06423557, + "auxiliary_loss_mlp": 0.01267101, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6211934465654592, + "flos": 16949047680000.0, + "grad_norm": 2.3537089497540147, + "language_loss": 0.67977309, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.75667971, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11950684, + "step": 10332, + "time_per_iteration": 2.4883179664611816 + }, + { + "auxiliary_loss_clip": 0.06419694, + "auxiliary_loss_mlp": 0.01266096, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.01255677, + "epoch": 0.6212535698181272, + "flos": 15272672169600.0, + "grad_norm": 1.3382118578807503, + "language_loss": 0.76498306, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.84184092, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10418701, + "step": 10333, + "time_per_iteration": 3.9705252647399902 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01267678, + "balance_loss_clip": 0.06275988, + "balance_loss_mlp": 0.012563, + "epoch": 0.6213136930707951, + "flos": 13449073835520.0, + "grad_norm": 2.1789310130446227, + "language_loss": 0.70102298, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.77784514, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11383057, + "step": 10334, + "time_per_iteration": 2.5797176361083984 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.01264307, + "balance_loss_clip": 0.06275611, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6213738163234631, + "flos": 18116641998720.0, + "grad_norm": 1.637338123067712, + "language_loss": 0.70408571, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.78087658, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10137939, + "step": 10335, + "time_per_iteration": 2.482482671737671 + }, + { + "auxiliary_loss_clip": 0.06413939, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 0.0627524, + "balance_loss_mlp": 0.01257185, + "epoch": 0.621433939576131, + "flos": 25344299957760.0, + "grad_norm": 1.5093006351890013, + "language_loss": 0.80123997, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.87805557, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10430908, + "step": 10336, + "time_per_iteration": 2.5523369312286377 + }, + { + "auxiliary_loss_clip": 0.06410298, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06271983, + "balance_loss_mlp": 0.0125848, + "epoch": 0.621494062828799, + "flos": 22572306385920.0, + "grad_norm": 1.6169920799644502, + "language_loss": 0.73330015, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.81009233, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 10337, + "time_per_iteration": 2.4964675903320312 + }, + { + "auxiliary_loss_clip": 0.0642301, + "auxiliary_loss_mlp": 0.01266548, + "balance_loss_clip": 0.06278226, + "balance_loss_mlp": 0.012548, + "epoch": 0.621554186081467, + "flos": 27425433415680.0, + "grad_norm": 1.8853547327091988, + "language_loss": 0.63167447, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.70857, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11755371, + "step": 10338, + "time_per_iteration": 4.016883611679077 + }, + { + "auxiliary_loss_clip": 0.06417143, + "auxiliary_loss_mlp": 0.0126833, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01257787, + "epoch": 0.621614309334135, + "flos": 22353484648320.0, + "grad_norm": 1.7306917238363975, + "language_loss": 0.71876323, + "learning_rate": 1.322938249724991e-06, + "loss": 0.79561794, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10546875, + "step": 10339, + "time_per_iteration": 2.5129294395446777 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266092, + "balance_loss_clip": 0.06274111, + "balance_loss_mlp": 0.0125519, + "epoch": 0.621674432586803, + "flos": 19287255064320.0, + "grad_norm": 1.654477546235719, + "language_loss": 0.69824433, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.77501559, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10906982, + "step": 10340, + "time_per_iteration": 2.491989850997925 + }, + { + "auxiliary_loss_clip": 0.0641477, + "auxiliary_loss_mlp": 0.01265499, + "balance_loss_clip": 0.06276464, + "balance_loss_mlp": 0.01255074, + "epoch": 0.6217345558394709, + "flos": 21614812980480.0, + "grad_norm": 1.760593238290477, + "language_loss": 0.68765497, + "learning_rate": 1.322205369037788e-06, + "loss": 0.76445758, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10430908, + "step": 10341, + "time_per_iteration": 2.6119179725646973 + }, + { + "auxiliary_loss_clip": 0.06421542, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06278797, + "balance_loss_mlp": 0.01256089, + "epoch": 0.6217946790921389, + "flos": 18009893496960.0, + "grad_norm": 2.3031674054515867, + "language_loss": 0.81059158, + "learning_rate": 1.321838967240299e-06, + "loss": 0.88748062, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11273193, + "step": 10342, + "time_per_iteration": 2.4969582557678223 + }, + { + "auxiliary_loss_clip": 0.0631469, + "auxiliary_loss_mlp": 0.0125491, + "balance_loss_clip": 0.0625717, + "balance_loss_mlp": 0.01253292, + "epoch": 0.6218548023448068, + "flos": 61993578349440.0, + "grad_norm": 0.8110464269458239, + "language_loss": 0.5724324, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.64812839, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.01620483, + "step": 10343, + "time_per_iteration": 3.0396130084991455 + }, + { + "auxiliary_loss_clip": 0.06411558, + "auxiliary_loss_mlp": 0.01264969, + "balance_loss_clip": 0.06274949, + "balance_loss_mlp": 0.01254812, + "epoch": 0.6219149255974749, + "flos": 25746248793600.0, + "grad_norm": 1.838833235576279, + "language_loss": 0.73063612, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.80740142, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1015625, + "step": 10344, + "time_per_iteration": 2.5173933506011963 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06273273, + "balance_loss_mlp": 0.01253803, + "epoch": 0.6219750488501428, + "flos": 25418162931840.0, + "grad_norm": 2.137498021001217, + "language_loss": 0.60161531, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.67839766, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10699463, + "step": 10345, + "time_per_iteration": 2.5472302436828613 + }, + { + "auxiliary_loss_clip": 0.06417334, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253357, + "epoch": 0.6220351721028108, + "flos": 20053529452800.0, + "grad_norm": 2.827284227984571, + "language_loss": 0.78566015, + "learning_rate": 1.320373617348614e-06, + "loss": 0.86247778, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11083984, + "step": 10346, + "time_per_iteration": 2.487410068511963 + }, + { + "auxiliary_loss_clip": 0.06418615, + "auxiliary_loss_mlp": 0.01266577, + "balance_loss_clip": 0.06276032, + "balance_loss_mlp": 0.01255419, + "epoch": 0.6220952953554787, + "flos": 27495439102080.0, + "grad_norm": 1.506091245470688, + "language_loss": 0.71672869, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.79358065, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11151123, + "step": 10347, + "time_per_iteration": 2.589825391769409 + }, + { + "auxiliary_loss_clip": 0.06409717, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6221554186081467, + "flos": 19213517871360.0, + "grad_norm": 1.5983272943469429, + "language_loss": 0.7253015, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.80204135, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10144043, + "step": 10348, + "time_per_iteration": 2.497612953186035 + }, + { + "auxiliary_loss_clip": 0.06308477, + "auxiliary_loss_mlp": 0.01254968, + "balance_loss_clip": 0.06251626, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6222155418608146, + "flos": 62969744016000.0, + "grad_norm": 0.7906840461302661, + "language_loss": 0.54113448, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.61676896, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01483154, + "step": 10349, + "time_per_iteration": 3.123992681503296 + }, + { + "auxiliary_loss_clip": 0.06409817, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269394, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6222756651134826, + "flos": 22607623681920.0, + "grad_norm": 1.7328717856317462, + "language_loss": 0.69908136, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.77584934, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10894775, + "step": 10350, + "time_per_iteration": 2.5098471641540527 + }, + { + "auxiliary_loss_clip": 0.0641721, + "auxiliary_loss_mlp": 0.01269342, + "balance_loss_clip": 0.06275678, + "balance_loss_mlp": 0.01257946, + "epoch": 0.6223357883661506, + "flos": 21148602462720.0, + "grad_norm": 1.8273350624055802, + "language_loss": 0.57737762, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.65424317, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11395264, + "step": 10351, + "time_per_iteration": 2.508089780807495 + }, + { + "auxiliary_loss_clip": 0.06308511, + "auxiliary_loss_mlp": 0.01256508, + "balance_loss_clip": 0.06251398, + "balance_loss_mlp": 0.01254946, + "epoch": 0.6223959116188186, + "flos": 63785926310400.0, + "grad_norm": 0.780725998939495, + "language_loss": 0.61087048, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.6865207, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01560974, + "step": 10352, + "time_per_iteration": 3.1217076778411865 + }, + { + "auxiliary_loss_clip": 0.06412127, + "auxiliary_loss_mlp": 0.0126301, + "balance_loss_clip": 0.06274231, + "balance_loss_mlp": 0.01252866, + "epoch": 0.6224560348714866, + "flos": 22572432167040.0, + "grad_norm": 2.017492088511814, + "language_loss": 0.82234097, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.89909232, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10144043, + "step": 10353, + "time_per_iteration": 2.527926445007324 + }, + { + "auxiliary_loss_clip": 0.06406288, + "auxiliary_loss_mlp": 0.01271685, + "balance_loss_clip": 0.06271318, + "balance_loss_mlp": 0.01261474, + "epoch": 0.6225161581241545, + "flos": 24104645527680.0, + "grad_norm": 1.3564318500578532, + "language_loss": 0.75680768, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.83358729, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 10354, + "time_per_iteration": 2.577965021133423 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.0126369, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253659, + "epoch": 0.6225762813768225, + "flos": 20448853816320.0, + "grad_norm": 1.3905640818253433, + "language_loss": 0.7869665, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.8637228, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1003418, + "step": 10355, + "time_per_iteration": 2.520951986312866 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01267836, + "balance_loss_clip": 0.06272769, + "balance_loss_mlp": 0.01257757, + "epoch": 0.6226364046294904, + "flos": 27205395793920.0, + "grad_norm": 1.8039879302815294, + "language_loss": 0.78103602, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.85785455, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10083008, + "step": 10356, + "time_per_iteration": 2.595402956008911 + }, + { + "auxiliary_loss_clip": 0.06422309, + "auxiliary_loss_mlp": 0.01269591, + "balance_loss_clip": 0.06274671, + "balance_loss_mlp": 0.01257307, + "epoch": 0.6226965278821585, + "flos": 20451495219840.0, + "grad_norm": 2.2679706310330037, + "language_loss": 0.67886806, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.75578707, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.1229248, + "step": 10357, + "time_per_iteration": 2.5113070011138916 + }, + { + "auxiliary_loss_clip": 0.06419406, + "auxiliary_loss_mlp": 0.01267785, + "balance_loss_clip": 0.06272604, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6227566511348264, + "flos": 22169099738880.0, + "grad_norm": 2.9791987901041788, + "language_loss": 0.76851863, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.84539044, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.11999512, + "step": 10358, + "time_per_iteration": 2.532348394393921 + }, + { + "auxiliary_loss_clip": 0.06416389, + "auxiliary_loss_mlp": 0.01266377, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01255922, + "epoch": 0.6228167743874944, + "flos": 18046720166400.0, + "grad_norm": 1.8844002351613314, + "language_loss": 0.82833385, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.9051615, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10455322, + "step": 10359, + "time_per_iteration": 3.914476156234741 + }, + { + "auxiliary_loss_clip": 0.06410404, + "auxiliary_loss_mlp": 0.01263862, + "balance_loss_clip": 0.06273699, + "balance_loss_mlp": 0.01253353, + "epoch": 0.6228768976401623, + "flos": 17747620617600.0, + "grad_norm": 2.053797228905972, + "language_loss": 0.73535556, + "learning_rate": 1.315248145768822e-06, + "loss": 0.81209821, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10510254, + "step": 10360, + "time_per_iteration": 2.476815700531006 + }, + { + "auxiliary_loss_clip": 0.06415363, + "auxiliary_loss_mlp": 0.01268466, + "balance_loss_clip": 0.06274994, + "balance_loss_mlp": 0.01257999, + "epoch": 0.6229370208928303, + "flos": 17900755153920.0, + "grad_norm": 2.156230361739645, + "language_loss": 0.77647728, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.85331559, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10461426, + "step": 10361, + "time_per_iteration": 2.4798471927642822 + }, + { + "auxiliary_loss_clip": 0.06413896, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.0627467, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6229971441454982, + "flos": 17353512138240.0, + "grad_norm": 1.5462012893965447, + "language_loss": 0.68078434, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.7575798, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1003418, + "step": 10362, + "time_per_iteration": 2.5225536823272705 + }, + { + "auxiliary_loss_clip": 0.06412376, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06272472, + "balance_loss_mlp": 0.0125466, + "epoch": 0.6230572673981662, + "flos": 29248989822720.0, + "grad_norm": 1.9753113738567412, + "language_loss": 0.67607152, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.75285697, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11505127, + "step": 10363, + "time_per_iteration": 2.5485036373138428 + }, + { + "auxiliary_loss_clip": 0.06417742, + "auxiliary_loss_mlp": 0.01267367, + "balance_loss_clip": 0.06273825, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6231173906508342, + "flos": 16331505488640.0, + "grad_norm": 1.8348569408777065, + "language_loss": 0.86522818, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.94207931, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11071777, + "step": 10364, + "time_per_iteration": 2.510781764984131 + }, + { + "auxiliary_loss_clip": 0.06305057, + "auxiliary_loss_mlp": 0.01252144, + "balance_loss_clip": 0.06248282, + "balance_loss_mlp": 0.01250801, + "epoch": 0.6231775139035022, + "flos": 68719513587840.0, + "grad_norm": 0.8659025027753965, + "language_loss": 0.60801929, + "learning_rate": 1.313418851605015e-06, + "loss": 0.68359125, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01345062, + "step": 10365, + "time_per_iteration": 3.1263084411621094 + }, + { + "auxiliary_loss_clip": 0.06424095, + "auxiliary_loss_mlp": 0.0127084, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01257948, + "epoch": 0.6232376371561702, + "flos": 19825903036800.0, + "grad_norm": 1.776687810821879, + "language_loss": 0.75874949, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.83569884, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.12884521, + "step": 10366, + "time_per_iteration": 2.522902488708496 + }, + { + "auxiliary_loss_clip": 0.06416557, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.01257372, + "epoch": 0.6232977604088381, + "flos": 23264969362560.0, + "grad_norm": 1.9573356945915528, + "language_loss": 0.77186829, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.84871918, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11157227, + "step": 10367, + "time_per_iteration": 2.538060426712036 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01268566, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.0125841, + "epoch": 0.6233578836615061, + "flos": 21112907823360.0, + "grad_norm": 1.357507759578204, + "language_loss": 0.78851044, + "learning_rate": 1.312321587418457e-06, + "loss": 0.86528963, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10162354, + "step": 10368, + "time_per_iteration": 2.525911569595337 + }, + { + "auxiliary_loss_clip": 0.06415667, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06274077, + "balance_loss_mlp": 0.0125693, + "epoch": 0.623418006914174, + "flos": 23776266176640.0, + "grad_norm": 1.7380644464591393, + "language_loss": 0.69022548, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.76706004, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10864258, + "step": 10369, + "time_per_iteration": 3.9844348430633545 + }, + { + "auxiliary_loss_clip": 0.06414494, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272612, + "balance_loss_mlp": 0.01253861, + "epoch": 0.6234781301668421, + "flos": 17895556200960.0, + "grad_norm": 1.8898374142824015, + "language_loss": 0.88083899, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.95763862, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1161499, + "step": 10370, + "time_per_iteration": 2.4602532386779785 + }, + { + "auxiliary_loss_clip": 0.06409945, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06271629, + "balance_loss_mlp": 0.01256217, + "epoch": 0.62353825341951, + "flos": 26182424822400.0, + "grad_norm": 1.435666838781933, + "language_loss": 0.66256654, + "learning_rate": 1.311224557923402e-06, + "loss": 0.73933315, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 10371, + "time_per_iteration": 2.585590124130249 + }, + { + "auxiliary_loss_clip": 0.06403823, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.06271943, + "balance_loss_mlp": 0.01254474, + "epoch": 0.623598376672178, + "flos": 31148044358400.0, + "grad_norm": 3.7034450225790962, + "language_loss": 0.77720612, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.85388303, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09405518, + "step": 10372, + "time_per_iteration": 4.1913182735443115 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01265562, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01254011, + "epoch": 0.6236584999248459, + "flos": 23736588468480.0, + "grad_norm": 1.6658386756111663, + "language_loss": 0.78006816, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.85686696, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11553955, + "step": 10373, + "time_per_iteration": 2.5229697227478027 + }, + { + "auxiliary_loss_clip": 0.06407828, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06271695, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6237186231775139, + "flos": 21769289182080.0, + "grad_norm": 1.5443019053614775, + "language_loss": 0.69842112, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.77516615, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10174561, + "step": 10374, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.06416135, + "auxiliary_loss_mlp": 0.01266815, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256325, + "epoch": 0.6237787464301818, + "flos": 14944795943040.0, + "grad_norm": 1.644641658888945, + "language_loss": 0.77371937, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.85054886, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.10491943, + "step": 10375, + "time_per_iteration": 2.4894163608551025 + }, + { + "auxiliary_loss_clip": 0.06411552, + "auxiliary_loss_mlp": 0.01264147, + "balance_loss_clip": 0.06274613, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6238388696828499, + "flos": 35599054844160.0, + "grad_norm": 1.2901779302370762, + "language_loss": 0.70425236, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.78100938, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09753418, + "step": 10376, + "time_per_iteration": 2.6778111457824707 + }, + { + "auxiliary_loss_clip": 0.06417015, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.0125405, + "epoch": 0.6238989929355178, + "flos": 23630343091200.0, + "grad_norm": 1.5935175737828453, + "language_loss": 0.76607609, + "learning_rate": 1.309031204505301e-06, + "loss": 0.84289968, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11315918, + "step": 10377, + "time_per_iteration": 4.115941524505615 + }, + { + "auxiliary_loss_clip": 0.06413018, + "auxiliary_loss_mlp": 0.01268384, + "balance_loss_clip": 0.06273637, + "balance_loss_mlp": 0.01258442, + "epoch": 0.6239591161881858, + "flos": 22093433902080.0, + "grad_norm": 1.8691726356193223, + "language_loss": 0.67910546, + "learning_rate": 1.308665737227052e-06, + "loss": 0.75591946, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09942627, + "step": 10378, + "time_per_iteration": 2.5460588932037354 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01265408, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01254572, + "epoch": 0.6240192394408538, + "flos": 24542959835520.0, + "grad_norm": 1.7661801800879762, + "language_loss": 0.7668879, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.84367645, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1083374, + "step": 10379, + "time_per_iteration": 2.594383955001831 + }, + { + "auxiliary_loss_clip": 0.06411, + "auxiliary_loss_mlp": 0.01266487, + "balance_loss_clip": 0.0627025, + "balance_loss_mlp": 0.01255723, + "epoch": 0.6240793626935217, + "flos": 27940000538880.0, + "grad_norm": 1.331820718073444, + "language_loss": 0.79390121, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.87067604, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10766602, + "step": 10380, + "time_per_iteration": 2.593872308731079 + }, + { + "auxiliary_loss_clip": 0.06410354, + "auxiliary_loss_mlp": 0.01264738, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6241394859461897, + "flos": 22899008655360.0, + "grad_norm": 1.5236398593874663, + "language_loss": 0.8010897, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.87784058, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1026001, + "step": 10381, + "time_per_iteration": 2.640678882598877 + }, + { + "auxiliary_loss_clip": 0.06414736, + "auxiliary_loss_mlp": 0.01267898, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01257079, + "epoch": 0.6241996091988576, + "flos": 12755781953280.0, + "grad_norm": 1.9060003648467456, + "language_loss": 0.74558902, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.82241541, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10821533, + "step": 10382, + "time_per_iteration": 2.479747772216797 + }, + { + "auxiliary_loss_clip": 0.06410253, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06273764, + "balance_loss_mlp": 0.01258243, + "epoch": 0.6242597324515257, + "flos": 25858867080960.0, + "grad_norm": 1.410036242187738, + "language_loss": 0.78590852, + "learning_rate": 1.306838794344911e-06, + "loss": 0.8626911, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09771729, + "step": 10383, + "time_per_iteration": 2.598404884338379 + }, + { + "auxiliary_loss_clip": 0.06411845, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 0.06273676, + "balance_loss_mlp": 0.01254236, + "epoch": 0.6243198557041936, + "flos": 19943804131200.0, + "grad_norm": 1.7487914543970622, + "language_loss": 0.75636935, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.83312905, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09899902, + "step": 10384, + "time_per_iteration": 2.493638038635254 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06271704, + "balance_loss_mlp": 0.01254353, + "epoch": 0.6243799789568616, + "flos": 18412177749120.0, + "grad_norm": 2.229109392374204, + "language_loss": 0.66725862, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.74403983, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11444092, + "step": 10385, + "time_per_iteration": 2.5185563564300537 + }, + { + "auxiliary_loss_clip": 0.06304897, + "auxiliary_loss_mlp": 0.01254771, + "balance_loss_clip": 0.06248314, + "balance_loss_mlp": 0.01253304, + "epoch": 0.6244401022095295, + "flos": 66048887128320.0, + "grad_norm": 0.7408334865403556, + "language_loss": 0.61911088, + "learning_rate": 1.305742943921692e-06, + "loss": 0.69470763, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01464844, + "step": 10386, + "time_per_iteration": 3.1636085510253906 + }, + { + "auxiliary_loss_clip": 0.06412023, + "auxiliary_loss_mlp": 0.01269919, + "balance_loss_clip": 0.06271843, + "balance_loss_mlp": 0.01258952, + "epoch": 0.6245002254621975, + "flos": 24578109423360.0, + "grad_norm": 2.35418101440168, + "language_loss": 0.71798837, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.79480779, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10974121, + "step": 10387, + "time_per_iteration": 2.5554144382476807 + }, + { + "auxiliary_loss_clip": 0.06417753, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01253399, + "epoch": 0.6245603487148654, + "flos": 29176510440960.0, + "grad_norm": 2.0504228233869886, + "language_loss": 0.65577459, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.73261279, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12664795, + "step": 10388, + "time_per_iteration": 2.5694010257720947 + }, + { + "auxiliary_loss_clip": 0.0641178, + "auxiliary_loss_mlp": 0.0126472, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01255505, + "epoch": 0.6246204719675335, + "flos": 14794805934720.0, + "grad_norm": 1.572723869665335, + "language_loss": 0.79661775, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.87338269, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09216309, + "step": 10389, + "time_per_iteration": 2.497745990753174 + }, + { + "auxiliary_loss_clip": 0.06407995, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06270336, + "balance_loss_mlp": 0.01253928, + "epoch": 0.6246805952202014, + "flos": 12498204902400.0, + "grad_norm": 2.3002980745210384, + "language_loss": 0.60729766, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.68401337, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09643555, + "step": 10390, + "time_per_iteration": 2.47084379196167 + }, + { + "auxiliary_loss_clip": 0.06418662, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06275147, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6247407184728694, + "flos": 12791602373760.0, + "grad_norm": 1.9019889358611486, + "language_loss": 0.77116674, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.84801072, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11181641, + "step": 10391, + "time_per_iteration": 2.5408506393432617 + }, + { + "auxiliary_loss_clip": 0.06416374, + "auxiliary_loss_mlp": 0.0126612, + "balance_loss_clip": 0.06274267, + "balance_loss_mlp": 0.01255165, + "epoch": 0.6248008417255374, + "flos": 40639417822080.0, + "grad_norm": 1.6390307551388046, + "language_loss": 0.64875287, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.72557783, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10943604, + "step": 10392, + "time_per_iteration": 2.7098827362060547 + }, + { + "auxiliary_loss_clip": 0.06416553, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 0.06274416, + "balance_loss_mlp": 0.01255618, + "epoch": 0.6248609649782053, + "flos": 19908235272960.0, + "grad_norm": 1.9113748677122278, + "language_loss": 0.76920122, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.84603459, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11169434, + "step": 10393, + "time_per_iteration": 2.548680543899536 + }, + { + "auxiliary_loss_clip": 0.0641488, + "auxiliary_loss_mlp": 0.01268434, + "balance_loss_clip": 0.06272462, + "balance_loss_mlp": 0.01255971, + "epoch": 0.6249210882308733, + "flos": 19688868483840.0, + "grad_norm": 1.752087282406205, + "language_loss": 0.82699966, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.90383279, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12451172, + "step": 10394, + "time_per_iteration": 2.5310568809509277 + }, + { + "auxiliary_loss_clip": 0.064147, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01254721, + "epoch": 0.6249812114835412, + "flos": 13995855653760.0, + "grad_norm": 1.7190801919243177, + "language_loss": 0.75490797, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.83170998, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10784912, + "step": 10395, + "time_per_iteration": 2.5296716690063477 + }, + { + "auxiliary_loss_clip": 0.06417533, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06273706, + "balance_loss_mlp": 0.01256536, + "epoch": 0.6250413347362093, + "flos": 14533916647680.0, + "grad_norm": 2.451423836023636, + "language_loss": 0.73157996, + "learning_rate": 1.302091822487119e-06, + "loss": 0.80843133, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11065674, + "step": 10396, + "time_per_iteration": 2.5183842182159424 + }, + { + "auxiliary_loss_clip": 0.06411869, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.06272602, + "balance_loss_mlp": 0.01255936, + "epoch": 0.6251014579888772, + "flos": 22969098195840.0, + "grad_norm": 1.6502966804998584, + "language_loss": 0.76563799, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.84241736, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10131836, + "step": 10397, + "time_per_iteration": 2.5712759494781494 + }, + { + "auxiliary_loss_clip": 0.06415206, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.06274014, + "balance_loss_mlp": 0.0125718, + "epoch": 0.6251615812415452, + "flos": 28118809152000.0, + "grad_norm": 1.853529789472771, + "language_loss": 0.75433117, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.83116138, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10638428, + "step": 10398, + "time_per_iteration": 4.095698595046997 + }, + { + "auxiliary_loss_clip": 0.0641809, + "auxiliary_loss_mlp": 0.01267876, + "balance_loss_clip": 0.0627377, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6252217044942131, + "flos": 26731764190080.0, + "grad_norm": 1.615458357588448, + "language_loss": 0.74413693, + "learning_rate": 1.300997001489483e-06, + "loss": 0.82099664, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11419678, + "step": 10399, + "time_per_iteration": 2.5753824710845947 + }, + { + "auxiliary_loss_clip": 0.06412279, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01256287, + "epoch": 0.6252818277468811, + "flos": 20012216590080.0, + "grad_norm": 1.6187380573242784, + "language_loss": 0.74690026, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.82368767, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10180664, + "step": 10400, + "time_per_iteration": 2.5361061096191406 + }, + { + "auxiliary_loss_clip": 0.06307141, + "auxiliary_loss_mlp": 0.01253939, + "balance_loss_clip": 0.06249951, + "balance_loss_mlp": 0.01252542, + "epoch": 0.625341950999549, + "flos": 59298550352640.0, + "grad_norm": 0.8247682302462489, + "language_loss": 0.56403446, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.63964522, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.57373047, + "router_z_loss_mlp": 0.01399231, + "step": 10401, + "time_per_iteration": 3.2024521827697754 + }, + { + "auxiliary_loss_clip": 0.06411454, + "auxiliary_loss_mlp": 0.01264191, + "balance_loss_clip": 0.06270526, + "balance_loss_mlp": 0.01253135, + "epoch": 0.625402074252217, + "flos": 20163296701440.0, + "grad_norm": 1.9270860159318792, + "language_loss": 0.82986021, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.90661669, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1105957, + "step": 10402, + "time_per_iteration": 2.5365302562713623 + }, + { + "auxiliary_loss_clip": 0.06408338, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06270024, + "balance_loss_mlp": 0.0125751, + "epoch": 0.625462197504885, + "flos": 29140228823040.0, + "grad_norm": 1.8928346901761637, + "language_loss": 0.68982589, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.76659, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10565186, + "step": 10403, + "time_per_iteration": 2.582432985305786 + }, + { + "auxiliary_loss_clip": 0.06414935, + "auxiliary_loss_mlp": 0.01268099, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01255791, + "epoch": 0.625522320757553, + "flos": 26111664449280.0, + "grad_norm": 1.458072120324879, + "language_loss": 0.7191205, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.79595077, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.12310791, + "step": 10404, + "time_per_iteration": 2.561168909072876 + }, + { + "auxiliary_loss_clip": 0.06414899, + "auxiliary_loss_mlp": 0.01265432, + "balance_loss_clip": 0.06274525, + "balance_loss_mlp": 0.01254376, + "epoch": 0.625582444010221, + "flos": 20637179867520.0, + "grad_norm": 1.708836006791191, + "language_loss": 0.69769311, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.77449644, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11053467, + "step": 10405, + "time_per_iteration": 2.5165655612945557 + }, + { + "auxiliary_loss_clip": 0.06413669, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06275192, + "balance_loss_mlp": 0.01257332, + "epoch": 0.6256425672628889, + "flos": 20527706108160.0, + "grad_norm": 1.5616382463324912, + "language_loss": 0.79137939, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.86819649, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10693359, + "step": 10406, + "time_per_iteration": 2.526115894317627 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01268249, + "balance_loss_clip": 0.06273726, + "balance_loss_mlp": 0.01257598, + "epoch": 0.6257026905155569, + "flos": 29536182092160.0, + "grad_norm": 1.7875701803121953, + "language_loss": 0.69265002, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.76947975, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10650635, + "step": 10407, + "time_per_iteration": 2.58450984954834 + }, + { + "auxiliary_loss_clip": 0.06403035, + "auxiliary_loss_mlp": 0.01268168, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01257898, + "epoch": 0.6257628137682248, + "flos": 24031788802560.0, + "grad_norm": 1.594681235705685, + "language_loss": 0.85355765, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.93026972, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10272217, + "step": 10408, + "time_per_iteration": 2.5464730262756348 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6258229370208929, + "flos": 20857385197440.0, + "grad_norm": 1.6518363285256767, + "language_loss": 0.7993108, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.87608778, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09997559, + "step": 10409, + "time_per_iteration": 4.006382465362549 + }, + { + "auxiliary_loss_clip": 0.06408045, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.0125475, + "epoch": 0.6258830602735608, + "flos": 22237218708480.0, + "grad_norm": 2.026280584027718, + "language_loss": 0.6951521, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.77188593, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10601807, + "step": 10410, + "time_per_iteration": 2.4960851669311523 + }, + { + "auxiliary_loss_clip": 0.06406428, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01256712, + "epoch": 0.6259431835262288, + "flos": 25082949473280.0, + "grad_norm": 1.7089284959721278, + "language_loss": 0.68380713, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.76053059, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09210205, + "step": 10411, + "time_per_iteration": 2.555173397064209 + }, + { + "auxiliary_loss_clip": 0.06413864, + "auxiliary_loss_mlp": 0.01267605, + "balance_loss_clip": 0.06273196, + "balance_loss_mlp": 0.01256489, + "epoch": 0.6260033067788967, + "flos": 28259072087040.0, + "grad_norm": 1.650436219337463, + "language_loss": 0.70024323, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.77705795, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11114502, + "step": 10412, + "time_per_iteration": 4.113879919052124 + }, + { + "auxiliary_loss_clip": 0.06406923, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01256152, + "epoch": 0.6260634300315647, + "flos": 23374107705600.0, + "grad_norm": 1.4649345950741752, + "language_loss": 0.69805682, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.77478617, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09869385, + "step": 10413, + "time_per_iteration": 2.519340753555298 + }, + { + "auxiliary_loss_clip": 0.06415603, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06269616, + "balance_loss_mlp": 0.01254101, + "epoch": 0.6261235532842326, + "flos": 18040221475200.0, + "grad_norm": 2.973303633857383, + "language_loss": 0.81012505, + "learning_rate": 1.295526482316796e-06, + "loss": 0.88694084, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11877441, + "step": 10414, + "time_per_iteration": 2.5359139442443848 + }, + { + "auxiliary_loss_clip": 0.06411665, + "auxiliary_loss_mlp": 0.01265079, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.012545, + "epoch": 0.6261836765369007, + "flos": 22016677962240.0, + "grad_norm": 1.921958755127535, + "language_loss": 0.74850363, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.82527107, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10577393, + "step": 10415, + "time_per_iteration": 2.529327630996704 + }, + { + "auxiliary_loss_clip": 0.06409019, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06273155, + "balance_loss_mlp": 0.01256993, + "epoch": 0.6262437997895686, + "flos": 24942896173440.0, + "grad_norm": 1.4283741323498855, + "language_loss": 0.74384236, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.82060367, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10119629, + "step": 10416, + "time_per_iteration": 2.626948595046997 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06271897, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6263039230422366, + "flos": 31615680395520.0, + "grad_norm": 1.6046151983772523, + "language_loss": 0.84637046, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.92307079, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09814453, + "step": 10417, + "time_per_iteration": 4.062727689743042 + }, + { + "auxiliary_loss_clip": 0.06414269, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_clip": 0.0627402, + "balance_loss_mlp": 0.01253713, + "epoch": 0.6263640462949046, + "flos": 17645232528000.0, + "grad_norm": 2.126036841621572, + "language_loss": 0.57267582, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.6494593, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10357666, + "step": 10418, + "time_per_iteration": 2.5384292602539062 + }, + { + "auxiliary_loss_clip": 0.06423989, + "auxiliary_loss_mlp": 0.01267395, + "balance_loss_clip": 0.06278068, + "balance_loss_mlp": 0.01255629, + "epoch": 0.6264241695475725, + "flos": 19981175852160.0, + "grad_norm": 2.5601033776039688, + "language_loss": 0.85281551, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.92972934, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11767578, + "step": 10419, + "time_per_iteration": 2.6254498958587646 + }, + { + "auxiliary_loss_clip": 0.0641915, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06276678, + "balance_loss_mlp": 0.01258644, + "epoch": 0.6264842928002405, + "flos": 27351654295680.0, + "grad_norm": 1.7349665783281947, + "language_loss": 0.64790374, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.72479212, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.1104126, + "step": 10420, + "time_per_iteration": 2.6838459968566895 + }, + { + "auxiliary_loss_clip": 0.06413981, + "auxiliary_loss_mlp": 0.01268518, + "balance_loss_clip": 0.0627203, + "balance_loss_mlp": 0.01257056, + "epoch": 0.6265444160529084, + "flos": 23002989972480.0, + "grad_norm": 1.7751280230906503, + "language_loss": 0.85910356, + "learning_rate": 1.292975627485741e-06, + "loss": 0.93592852, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11462402, + "step": 10421, + "time_per_iteration": 2.502638101577759 + }, + { + "auxiliary_loss_clip": 0.06412976, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06274194, + "balance_loss_mlp": 0.01255454, + "epoch": 0.6266045393055765, + "flos": 19944516890880.0, + "grad_norm": 1.9594550321950581, + "language_loss": 0.79719132, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.87397969, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10406494, + "step": 10422, + "time_per_iteration": 2.506927013397217 + }, + { + "auxiliary_loss_clip": 0.06411508, + "auxiliary_loss_mlp": 0.01266347, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255845, + "epoch": 0.6266646625582444, + "flos": 24395946647040.0, + "grad_norm": 1.5344190640547188, + "language_loss": 0.74784446, + "learning_rate": 1.292247052906389e-06, + "loss": 0.82462305, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10510254, + "step": 10423, + "time_per_iteration": 2.5245227813720703 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06277235, + "balance_loss_mlp": 0.01256186, + "epoch": 0.6267247858109124, + "flos": 14689021754880.0, + "grad_norm": 2.220018745384266, + "language_loss": 0.77700025, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.85382849, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10443115, + "step": 10424, + "time_per_iteration": 2.477313756942749 + }, + { + "auxiliary_loss_clip": 0.06416199, + "auxiliary_loss_mlp": 0.0126622, + "balance_loss_clip": 0.06277827, + "balance_loss_mlp": 0.01255139, + "epoch": 0.6267849090635803, + "flos": 24935852430720.0, + "grad_norm": 1.661217463389483, + "language_loss": 0.69195008, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.76877427, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11077881, + "step": 10425, + "time_per_iteration": 2.543240547180176 + }, + { + "auxiliary_loss_clip": 0.06407383, + "auxiliary_loss_mlp": 0.01264995, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.01255232, + "epoch": 0.6268450323162483, + "flos": 25344886936320.0, + "grad_norm": 1.5301783551006911, + "language_loss": 0.74874127, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.82546508, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09765625, + "step": 10426, + "time_per_iteration": 2.541133403778076 + }, + { + "auxiliary_loss_clip": 0.06415579, + "auxiliary_loss_mlp": 0.01266633, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01256268, + "epoch": 0.6269051555689162, + "flos": 26184521174400.0, + "grad_norm": 1.3173967967859561, + "language_loss": 0.80809879, + "learning_rate": 1.290790225914929e-06, + "loss": 0.88492095, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10369873, + "step": 10427, + "time_per_iteration": 2.582977294921875 + }, + { + "auxiliary_loss_clip": 0.06420124, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06276904, + "balance_loss_mlp": 0.01256618, + "epoch": 0.6269652788215843, + "flos": 18262271594880.0, + "grad_norm": 2.288264071636072, + "language_loss": 0.68539417, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.76226991, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.10821533, + "step": 10428, + "time_per_iteration": 2.470303773880005 + }, + { + "auxiliary_loss_clip": 0.06415083, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06275322, + "balance_loss_mlp": 0.01255156, + "epoch": 0.6270254020742522, + "flos": 11770224629760.0, + "grad_norm": 1.7672728863863079, + "language_loss": 0.71438128, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.79118955, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.105896, + "step": 10429, + "time_per_iteration": 2.4885928630828857 + }, + { + "auxiliary_loss_clip": 0.0641719, + "auxiliary_loss_mlp": 0.01266586, + "balance_loss_clip": 0.06274317, + "balance_loss_mlp": 0.01254665, + "epoch": 0.6270855253269202, + "flos": 23482114018560.0, + "grad_norm": 1.4192780160361307, + "language_loss": 0.80064285, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.87748063, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11920166, + "step": 10430, + "time_per_iteration": 2.695157766342163 + }, + { + "auxiliary_loss_clip": 0.06316154, + "auxiliary_loss_mlp": 0.01261761, + "balance_loss_clip": 0.0625899, + "balance_loss_mlp": 0.01260201, + "epoch": 0.6271456485795882, + "flos": 70084322490240.0, + "grad_norm": 0.7576452894497838, + "language_loss": 0.59208155, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.66786075, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01556396, + "step": 10431, + "time_per_iteration": 3.2964041233062744 + }, + { + "auxiliary_loss_clip": 0.06312843, + "auxiliary_loss_mlp": 0.01258809, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01257378, + "epoch": 0.6272057718322561, + "flos": 65178673349760.0, + "grad_norm": 0.9858891279415538, + "language_loss": 0.63665617, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.71237266, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01428223, + "step": 10432, + "time_per_iteration": 3.2280328273773193 + }, + { + "auxiliary_loss_clip": 0.06412185, + "auxiliary_loss_mlp": 0.01266828, + "balance_loss_clip": 0.06274938, + "balance_loss_mlp": 0.01256952, + "epoch": 0.6272658950849241, + "flos": 24396240136320.0, + "grad_norm": 1.6010176873941773, + "language_loss": 0.65241134, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.72920156, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09875488, + "step": 10433, + "time_per_iteration": 2.6001501083374023 + }, + { + "auxiliary_loss_clip": 0.06421921, + "auxiliary_loss_mlp": 0.01264381, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01252985, + "epoch": 0.627326018337592, + "flos": 17971515527040.0, + "grad_norm": 2.0859900141473897, + "language_loss": 0.62490857, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.70177162, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1138916, + "step": 10434, + "time_per_iteration": 2.4881582260131836 + }, + { + "auxiliary_loss_clip": 0.06417267, + "auxiliary_loss_mlp": 0.01265758, + "balance_loss_clip": 0.06275722, + "balance_loss_mlp": 0.01255363, + "epoch": 0.6273861415902601, + "flos": 20236321134720.0, + "grad_norm": 1.4988303322096788, + "language_loss": 0.84577382, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.92260414, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10394287, + "step": 10435, + "time_per_iteration": 2.508821487426758 + }, + { + "auxiliary_loss_clip": 0.06310409, + "auxiliary_loss_mlp": 0.01254017, + "balance_loss_clip": 0.06253147, + "balance_loss_mlp": 0.0125247, + "epoch": 0.627446264842928, + "flos": 64971605911680.0, + "grad_norm": 0.7140995203776986, + "language_loss": 0.6143651, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.69000936, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.01544952, + "step": 10436, + "time_per_iteration": 3.1841728687286377 + }, + { + "auxiliary_loss_clip": 0.06415884, + "auxiliary_loss_mlp": 0.01270936, + "balance_loss_clip": 0.06275365, + "balance_loss_mlp": 0.01259635, + "epoch": 0.627506388095596, + "flos": 23590623456000.0, + "grad_norm": 1.4165717499809394, + "language_loss": 0.77800572, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.8548739, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11303711, + "step": 10437, + "time_per_iteration": 2.5377817153930664 + }, + { + "auxiliary_loss_clip": 0.06309696, + "auxiliary_loss_mlp": 0.01252859, + "balance_loss_clip": 0.06252521, + "balance_loss_mlp": 0.01251612, + "epoch": 0.6275665113482639, + "flos": 67603043059200.0, + "grad_norm": 0.7073778525823976, + "language_loss": 0.54094195, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.61656755, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.57226562, + "router_z_loss_mlp": 0.01247406, + "step": 10438, + "time_per_iteration": 4.560008764266968 + }, + { + "auxiliary_loss_clip": 0.06412268, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06273399, + "balance_loss_mlp": 0.01255569, + "epoch": 0.6276266346009319, + "flos": 27644422861440.0, + "grad_norm": 1.692810124153385, + "language_loss": 0.84027016, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.91705996, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11138916, + "step": 10439, + "time_per_iteration": 2.5736849308013916 + }, + { + "auxiliary_loss_clip": 0.06415922, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6276867578535998, + "flos": 22752540518400.0, + "grad_norm": 2.0302945438571047, + "language_loss": 0.80827779, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.88509905, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 10440, + "time_per_iteration": 2.5353291034698486 + }, + { + "auxiliary_loss_clip": 0.06405526, + "auxiliary_loss_mlp": 0.01265635, + "balance_loss_clip": 0.0627224, + "balance_loss_mlp": 0.01256241, + "epoch": 0.6277468811062679, + "flos": 24651050002560.0, + "grad_norm": 1.4466963642107937, + "language_loss": 0.74692273, + "learning_rate": 1.285694725799337e-06, + "loss": 0.82363433, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09393311, + "step": 10441, + "time_per_iteration": 2.5965688228607178 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01267405, + "balance_loss_clip": 0.06272199, + "balance_loss_mlp": 0.01256932, + "epoch": 0.6278070043589358, + "flos": 19684466144640.0, + "grad_norm": 1.738690700547975, + "language_loss": 0.72243971, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.79921579, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 10442, + "time_per_iteration": 2.5236124992370605 + }, + { + "auxiliary_loss_clip": 0.06413672, + "auxiliary_loss_mlp": 0.01264225, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254443, + "epoch": 0.6278671276116038, + "flos": 22127451459840.0, + "grad_norm": 1.5746919411428797, + "language_loss": 0.71842909, + "learning_rate": 1.284967229712762e-06, + "loss": 0.7952081, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09783936, + "step": 10443, + "time_per_iteration": 2.523799419403076 + }, + { + "auxiliary_loss_clip": 0.06411857, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272158, + "balance_loss_mlp": 0.01255412, + "epoch": 0.6279272508642717, + "flos": 23045099448960.0, + "grad_norm": 2.0032164077839787, + "language_loss": 0.73292875, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.80970454, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10302734, + "step": 10444, + "time_per_iteration": 2.557166337966919 + }, + { + "auxiliary_loss_clip": 0.06410734, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.0627318, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6279873741169397, + "flos": 19829466835200.0, + "grad_norm": 2.156521717901959, + "language_loss": 0.72276205, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.79953271, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 10445, + "time_per_iteration": 2.526127815246582 + }, + { + "auxiliary_loss_clip": 0.06412753, + "auxiliary_loss_mlp": 0.01265639, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01254601, + "epoch": 0.6280474973696077, + "flos": 23922273116160.0, + "grad_norm": 1.5888677783518865, + "language_loss": 0.69281161, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.76959556, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1104126, + "step": 10446, + "time_per_iteration": 2.530104637145996 + }, + { + "auxiliary_loss_clip": 0.06423883, + "auxiliary_loss_mlp": 0.01267771, + "balance_loss_clip": 0.06276697, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6281076206222757, + "flos": 17973821514240.0, + "grad_norm": 1.8539120492479848, + "language_loss": 0.73894954, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.81586611, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.1114502, + "step": 10447, + "time_per_iteration": 2.4985270500183105 + }, + { + "auxiliary_loss_clip": 0.06304939, + "auxiliary_loss_mlp": 0.01257491, + "balance_loss_clip": 0.06248139, + "balance_loss_mlp": 0.01256266, + "epoch": 0.6281677438749437, + "flos": 66797216743680.0, + "grad_norm": 0.6871055611916008, + "language_loss": 0.51990867, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.59553301, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01223755, + "step": 10448, + "time_per_iteration": 4.437039136886597 + }, + { + "auxiliary_loss_clip": 0.0641938, + "auxiliary_loss_mlp": 0.01268052, + "balance_loss_clip": 0.0627671, + "balance_loss_mlp": 0.01256346, + "epoch": 0.6282278671276116, + "flos": 11661002432640.0, + "grad_norm": 1.9501627229016425, + "language_loss": 0.91483194, + "learning_rate": 1.282785392633079e-06, + "loss": 0.99170625, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1171875, + "step": 10449, + "time_per_iteration": 2.5085034370422363 + }, + { + "auxiliary_loss_clip": 0.06415906, + "auxiliary_loss_mlp": 0.01270346, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01260452, + "epoch": 0.6282879903802796, + "flos": 42751550090880.0, + "grad_norm": 1.4186227693043074, + "language_loss": 0.60281998, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.67968249, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.09893799, + "step": 10450, + "time_per_iteration": 2.6810834407806396 + }, + { + "auxiliary_loss_clip": 0.06408551, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06272364, + "balance_loss_mlp": 0.01256269, + "epoch": 0.6283481136329475, + "flos": 20015067628800.0, + "grad_norm": 1.5189772221694435, + "language_loss": 0.77163285, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.8483901, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10906982, + "step": 10451, + "time_per_iteration": 2.5098116397857666 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06274851, + "balance_loss_mlp": 0.01254652, + "epoch": 0.6284082368856155, + "flos": 21910264876800.0, + "grad_norm": 1.4797334153303925, + "language_loss": 0.77516776, + "learning_rate": 1.281694841064566e-06, + "loss": 0.85199118, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11346436, + "step": 10452, + "time_per_iteration": 4.029058933258057 + }, + { + "auxiliary_loss_clip": 0.06413398, + "auxiliary_loss_mlp": 0.01268188, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01257173, + "epoch": 0.6284683601382834, + "flos": 25491313146240.0, + "grad_norm": 1.654591158178899, + "language_loss": 0.72948235, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.8062982, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11029053, + "step": 10453, + "time_per_iteration": 2.542074680328369 + }, + { + "auxiliary_loss_clip": 0.06415626, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.0627359, + "balance_loss_mlp": 0.01253268, + "epoch": 0.6285284833909515, + "flos": 16543241556480.0, + "grad_norm": 1.6231177337896328, + "language_loss": 0.80777168, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.88457304, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11236572, + "step": 10454, + "time_per_iteration": 2.5263936519622803 + }, + { + "auxiliary_loss_clip": 0.06409679, + "auxiliary_loss_mlp": 0.0127067, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01260728, + "epoch": 0.6285886066436194, + "flos": 22827367814400.0, + "grad_norm": 1.7338027562142968, + "language_loss": 0.82249027, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.89929378, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09936523, + "step": 10455, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.06415103, + "auxiliary_loss_mlp": 0.01264745, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254225, + "epoch": 0.6286487298962874, + "flos": 24722355427200.0, + "grad_norm": 1.4932136487879293, + "language_loss": 0.82079554, + "learning_rate": 1.280241153705706e-06, + "loss": 0.89759403, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10516357, + "step": 10456, + "time_per_iteration": 2.561309814453125 + }, + { + "auxiliary_loss_clip": 0.06420746, + "auxiliary_loss_mlp": 0.01268645, + "balance_loss_clip": 0.06275859, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6287088531489553, + "flos": 20747114824320.0, + "grad_norm": 1.4461153744951818, + "language_loss": 0.72119695, + "learning_rate": 1.27987780006486e-06, + "loss": 0.79809082, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.11022949, + "step": 10457, + "time_per_iteration": 3.957395076751709 + }, + { + "auxiliary_loss_clip": 0.06422028, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01253316, + "epoch": 0.6287689764016233, + "flos": 23076433676160.0, + "grad_norm": 1.6277999457875445, + "language_loss": 0.79939413, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.8762598, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.11224365, + "step": 10458, + "time_per_iteration": 2.5144598484039307 + }, + { + "auxiliary_loss_clip": 0.06420826, + "auxiliary_loss_mlp": 0.01272203, + "balance_loss_clip": 0.06276783, + "balance_loss_mlp": 0.01261081, + "epoch": 0.6288290996542913, + "flos": 32241859557120.0, + "grad_norm": 1.5510176438747023, + "language_loss": 0.61428088, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.69121122, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11120605, + "step": 10459, + "time_per_iteration": 2.673271894454956 + }, + { + "auxiliary_loss_clip": 0.06418507, + "auxiliary_loss_mlp": 0.01266867, + "balance_loss_clip": 0.06276773, + "balance_loss_mlp": 0.01256066, + "epoch": 0.6288892229069593, + "flos": 24647695839360.0, + "grad_norm": 1.5279768291149622, + "language_loss": 0.79008341, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.86693716, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10809326, + "step": 10460, + "time_per_iteration": 2.5390427112579346 + }, + { + "auxiliary_loss_clip": 0.06411569, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274751, + "balance_loss_mlp": 0.01256305, + "epoch": 0.6289493461596273, + "flos": 17864138119680.0, + "grad_norm": 1.9201849344746347, + "language_loss": 0.73887581, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.81565541, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10089111, + "step": 10461, + "time_per_iteration": 2.524601459503174 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01263734, + "balance_loss_clip": 0.06275996, + "balance_loss_mlp": 0.01253637, + "epoch": 0.6290094694122952, + "flos": 22351807566720.0, + "grad_norm": 1.8529909730554852, + "language_loss": 0.70305121, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.77980262, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10101318, + "step": 10462, + "time_per_iteration": 2.5161097049713135 + }, + { + "auxiliary_loss_clip": 0.06407323, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06275059, + "balance_loss_mlp": 0.01254224, + "epoch": 0.6290695926649632, + "flos": 28409942563200.0, + "grad_norm": 1.9398923730208482, + "language_loss": 0.72176754, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.79847741, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09442139, + "step": 10463, + "time_per_iteration": 2.579223394393921 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271723, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01261751, + "epoch": 0.6291297159176311, + "flos": 21511628277120.0, + "grad_norm": 1.539324014350412, + "language_loss": 0.7288208, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.80563188, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09967041, + "step": 10464, + "time_per_iteration": 2.494276762008667 + }, + { + "auxiliary_loss_clip": 0.06412283, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06275527, + "balance_loss_mlp": 0.01256969, + "epoch": 0.6291898391702991, + "flos": 12208203521280.0, + "grad_norm": 1.7590102978799784, + "language_loss": 0.69385099, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.77063811, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09466553, + "step": 10465, + "time_per_iteration": 2.5754034519195557 + }, + { + "auxiliary_loss_clip": 0.06303164, + "auxiliary_loss_mlp": 0.01258656, + "balance_loss_clip": 0.06246626, + "balance_loss_mlp": 0.01257341, + "epoch": 0.629249962422967, + "flos": 69319347840000.0, + "grad_norm": 0.6721611616517246, + "language_loss": 0.59656096, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.67217922, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01316833, + "step": 10466, + "time_per_iteration": 3.231010913848877 + }, + { + "auxiliary_loss_clip": 0.0640944, + "auxiliary_loss_mlp": 0.01262544, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.01253305, + "epoch": 0.6293100856756351, + "flos": 40087353196800.0, + "grad_norm": 2.1464377164547916, + "language_loss": 0.64920712, + "learning_rate": 1.276245767820154e-06, + "loss": 0.72592694, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09234619, + "step": 10467, + "time_per_iteration": 2.7820122241973877 + }, + { + "auxiliary_loss_clip": 0.06300975, + "auxiliary_loss_mlp": 0.01258806, + "balance_loss_clip": 0.06244308, + "balance_loss_mlp": 0.01257555, + "epoch": 0.629370208928303, + "flos": 67518907960320.0, + "grad_norm": 0.7784779642706487, + "language_loss": 0.56803113, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.64362895, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01250458, + "step": 10468, + "time_per_iteration": 2.934441089630127 + }, + { + "auxiliary_loss_clip": 0.06299016, + "auxiliary_loss_mlp": 0.0125297, + "balance_loss_clip": 0.06242396, + "balance_loss_mlp": 0.01251782, + "epoch": 0.629430332180971, + "flos": 60680228653440.0, + "grad_norm": 0.7475097067157215, + "language_loss": 0.57685459, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.65237445, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01186371, + "step": 10469, + "time_per_iteration": 3.097425699234009 + }, + { + "auxiliary_loss_clip": 0.06301235, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.0624446, + "balance_loss_mlp": 0.01250373, + "epoch": 0.6294904554336389, + "flos": 66891707821440.0, + "grad_norm": 0.675756451414952, + "language_loss": 0.5208174, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.59634632, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01277924, + "step": 10470, + "time_per_iteration": 3.224271774291992 + }, + { + "auxiliary_loss_clip": 0.06409313, + "auxiliary_loss_mlp": 0.01268407, + "balance_loss_clip": 0.06274216, + "balance_loss_mlp": 0.01258322, + "epoch": 0.6295505786863069, + "flos": 42532728353280.0, + "grad_norm": 1.628220195821946, + "language_loss": 0.75025994, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.8270371, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10076904, + "step": 10471, + "time_per_iteration": 2.7104806900024414 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01263691, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01253689, + "epoch": 0.629610701938975, + "flos": 17389877610240.0, + "grad_norm": 1.7371618192940372, + "language_loss": 0.63321209, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.71001846, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10003662, + "step": 10472, + "time_per_iteration": 2.51810884475708 + }, + { + "auxiliary_loss_clip": 0.06414427, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06273856, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6296708251916429, + "flos": 24249730072320.0, + "grad_norm": 1.5892163482922788, + "language_loss": 0.69503713, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.77184302, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 10473, + "time_per_iteration": 2.5234594345092773 + }, + { + "auxiliary_loss_clip": 0.06411944, + "auxiliary_loss_mlp": 0.0126239, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6297309484443109, + "flos": 19284110536320.0, + "grad_norm": 1.4968676246915393, + "language_loss": 0.74922514, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.8259685, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09899902, + "step": 10474, + "time_per_iteration": 2.581749200820923 + }, + { + "auxiliary_loss_clip": 0.06412183, + "auxiliary_loss_mlp": 0.01264808, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6297910716969788, + "flos": 30670261977600.0, + "grad_norm": 1.6340326591826166, + "language_loss": 0.66562986, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.74239981, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10253906, + "step": 10475, + "time_per_iteration": 2.6167984008789062 + }, + { + "auxiliary_loss_clip": 0.06403632, + "auxiliary_loss_mlp": 0.01261865, + "balance_loss_clip": 0.06270278, + "balance_loss_mlp": 0.01252107, + "epoch": 0.6298511949496468, + "flos": 14427293927040.0, + "grad_norm": 1.8082220709351975, + "language_loss": 0.90615106, + "learning_rate": 1.272979284940101e-06, + "loss": 0.98280615, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09759521, + "step": 10476, + "time_per_iteration": 2.5575828552246094 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01271614, + "balance_loss_clip": 0.06276231, + "balance_loss_mlp": 0.01261285, + "epoch": 0.6299113182023147, + "flos": 23520995112960.0, + "grad_norm": 1.6129960695216716, + "language_loss": 0.75463134, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.83147454, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10321045, + "step": 10477, + "time_per_iteration": 4.080779314041138 + }, + { + "auxiliary_loss_clip": 0.06409407, + "auxiliary_loss_mlp": 0.01263638, + "balance_loss_clip": 0.06271356, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6299714414549827, + "flos": 22681109312640.0, + "grad_norm": 1.9893759064975287, + "language_loss": 0.70635891, + "learning_rate": 1.272253702758138e-06, + "loss": 0.7830894, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10717773, + "step": 10478, + "time_per_iteration": 2.526340961456299 + }, + { + "auxiliary_loss_clip": 0.06415921, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01256419, + "epoch": 0.6300315647076506, + "flos": 14506984759680.0, + "grad_norm": 2.55864896023097, + "language_loss": 0.6816293, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.75846004, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10742188, + "step": 10479, + "time_per_iteration": 2.5156965255737305 + }, + { + "auxiliary_loss_clip": 0.06411125, + "auxiliary_loss_mlp": 0.01264946, + "balance_loss_clip": 0.06273742, + "balance_loss_mlp": 0.01254188, + "epoch": 0.6300916879603187, + "flos": 21878134035840.0, + "grad_norm": 1.462422599280115, + "language_loss": 0.73846787, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.81522858, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10760498, + "step": 10480, + "time_per_iteration": 2.528325319290161 + }, + { + "auxiliary_loss_clip": 0.06412197, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06272143, + "balance_loss_mlp": 0.01253141, + "epoch": 0.6301518112129866, + "flos": 21840301117440.0, + "grad_norm": 1.7175758648379602, + "language_loss": 0.78970373, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.86646283, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10571289, + "step": 10481, + "time_per_iteration": 2.60512638092041 + }, + { + "auxiliary_loss_clip": 0.06303924, + "auxiliary_loss_mlp": 0.01252426, + "balance_loss_clip": 0.06247687, + "balance_loss_mlp": 0.01251297, + "epoch": 0.6302119344656546, + "flos": 44348429675520.0, + "grad_norm": 0.8754005674495109, + "language_loss": 0.61759591, + "learning_rate": 1.2708028696588e-06, + "loss": 0.69315946, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01131439, + "step": 10482, + "time_per_iteration": 2.8790156841278076 + }, + { + "auxiliary_loss_clip": 0.06422234, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_clip": 0.0627502, + "balance_loss_mlp": 0.01259125, + "epoch": 0.6302720577183225, + "flos": 11222604270720.0, + "grad_norm": 1.8532441203732761, + "language_loss": 0.82836294, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.90529174, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.11517334, + "step": 10483, + "time_per_iteration": 2.5396814346313477 + }, + { + "auxiliary_loss_clip": 0.06401882, + "auxiliary_loss_mlp": 0.01265558, + "balance_loss_clip": 0.06271434, + "balance_loss_mlp": 0.01255873, + "epoch": 0.6303321809709905, + "flos": 27972424869120.0, + "grad_norm": 1.7223788623313236, + "language_loss": 0.72617853, + "learning_rate": 1.270077618961487e-06, + "loss": 0.80285299, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09680176, + "step": 10484, + "time_per_iteration": 2.580455780029297 + }, + { + "auxiliary_loss_clip": 0.06412905, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06272406, + "balance_loss_mlp": 0.01254804, + "epoch": 0.6303923042236586, + "flos": 28228366765440.0, + "grad_norm": 1.5965857276488986, + "language_loss": 0.74397701, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.82075489, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10064697, + "step": 10485, + "time_per_iteration": 2.5941050052642822 + }, + { + "auxiliary_loss_clip": 0.06418431, + "auxiliary_loss_mlp": 0.01266454, + "balance_loss_clip": 0.06274744, + "balance_loss_mlp": 0.01255552, + "epoch": 0.6304524274763265, + "flos": 27637546826880.0, + "grad_norm": 2.046844751133349, + "language_loss": 0.81281161, + "learning_rate": 1.269352478979093e-06, + "loss": 0.88966042, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10906982, + "step": 10486, + "time_per_iteration": 2.558913469314575 + }, + { + "auxiliary_loss_clip": 0.06410582, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01254617, + "epoch": 0.6305125507289945, + "flos": 17317062812160.0, + "grad_norm": 2.0599224612771923, + "language_loss": 0.6412251, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.71797758, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1005249, + "step": 10487, + "time_per_iteration": 2.5042107105255127 + }, + { + "auxiliary_loss_clip": 0.06409851, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.01261816, + "epoch": 0.6305726739816624, + "flos": 25814745106560.0, + "grad_norm": 1.4604670858512163, + "language_loss": 0.67510849, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.75192171, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09655762, + "step": 10488, + "time_per_iteration": 4.039014101028442 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01266449, + "balance_loss_clip": 0.06271197, + "balance_loss_mlp": 0.01255827, + "epoch": 0.6306327972343304, + "flos": 21803684083200.0, + "grad_norm": 1.7399651792203026, + "language_loss": 0.67476416, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.75152779, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10620117, + "step": 10489, + "time_per_iteration": 2.522010564804077 + }, + { + "auxiliary_loss_clip": 0.06421866, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.0125256, + "epoch": 0.6306929204869983, + "flos": 20783689931520.0, + "grad_norm": 1.8067939569631877, + "language_loss": 0.69957733, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.77643645, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.11486816, + "step": 10490, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01267822, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01257123, + "epoch": 0.6307530437396663, + "flos": 23660084090880.0, + "grad_norm": 1.7944305121470099, + "language_loss": 0.78453183, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.86131787, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10699463, + "step": 10491, + "time_per_iteration": 3.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.06410797, + "auxiliary_loss_mlp": 0.01264458, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253753, + "epoch": 0.6308131669923343, + "flos": 24726170787840.0, + "grad_norm": 2.4094216465826914, + "language_loss": 0.55782068, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.63457322, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10705566, + "step": 10492, + "time_per_iteration": 2.5553138256073 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06272843, + "balance_loss_mlp": 0.01254772, + "epoch": 0.6308732902450023, + "flos": 22572054823680.0, + "grad_norm": 2.1354270064325935, + "language_loss": 0.64787519, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.72465986, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10620117, + "step": 10493, + "time_per_iteration": 2.532414197921753 + }, + { + "auxiliary_loss_clip": 0.06409045, + "auxiliary_loss_mlp": 0.01266138, + "balance_loss_clip": 0.06271107, + "balance_loss_mlp": 0.01255797, + "epoch": 0.6309334134976702, + "flos": 24651050002560.0, + "grad_norm": 1.3969800101414371, + "language_loss": 0.82710558, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.90385741, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10345459, + "step": 10494, + "time_per_iteration": 2.5479516983032227 + }, + { + "auxiliary_loss_clip": 0.06410792, + "auxiliary_loss_mlp": 0.01270884, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01260691, + "epoch": 0.6309935367503382, + "flos": 41437655343360.0, + "grad_norm": 1.6454448829725794, + "language_loss": 0.79526448, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.87208128, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10192871, + "step": 10495, + "time_per_iteration": 2.705066204071045 + }, + { + "auxiliary_loss_clip": 0.06412271, + "auxiliary_loss_mlp": 0.01267403, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01257019, + "epoch": 0.6310536600030061, + "flos": 15123772264320.0, + "grad_norm": 1.7689443425086426, + "language_loss": 0.70583153, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.78262818, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.1038208, + "step": 10496, + "time_per_iteration": 2.4985408782958984 + }, + { + "auxiliary_loss_clip": 0.06412859, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06273797, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6311137832556741, + "flos": 15237019457280.0, + "grad_norm": 3.784046746171531, + "language_loss": 0.80308318, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.879884, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11016846, + "step": 10497, + "time_per_iteration": 3.934098243713379 + }, + { + "auxiliary_loss_clip": 0.06407946, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06270217, + "balance_loss_mlp": 0.01254495, + "epoch": 0.6311739065083422, + "flos": 22025314932480.0, + "grad_norm": 1.763173694901495, + "language_loss": 0.7404235, + "learning_rate": 1.265003970256247e-06, + "loss": 0.81713974, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09185791, + "step": 10498, + "time_per_iteration": 2.499866485595703 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01257174, + "epoch": 0.6312340297610101, + "flos": 22717349003520.0, + "grad_norm": 2.1933614541595543, + "language_loss": 0.70156991, + "learning_rate": 1.264641775364217e-06, + "loss": 0.77835166, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10308838, + "step": 10499, + "time_per_iteration": 2.52750825881958 + }, + { + "auxiliary_loss_clip": 0.06406461, + "auxiliary_loss_mlp": 0.01267196, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6312941530136781, + "flos": 24287017939200.0, + "grad_norm": 1.829578685045339, + "language_loss": 0.69904381, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.77578032, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 10500, + "time_per_iteration": 2.5188052654266357 + }, + { + "auxiliary_loss_clip": 0.06409658, + "auxiliary_loss_mlp": 0.01264556, + "balance_loss_clip": 0.06272549, + "balance_loss_mlp": 0.01254412, + "epoch": 0.631354276266346, + "flos": 21732420585600.0, + "grad_norm": 1.7241647945677354, + "language_loss": 0.74330127, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.82004339, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10144043, + "step": 10501, + "time_per_iteration": 2.5523152351379395 + }, + { + "auxiliary_loss_clip": 0.06406975, + "auxiliary_loss_mlp": 0.01265441, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254331, + "epoch": 0.631414399519014, + "flos": 24032040364800.0, + "grad_norm": 1.6086243864849348, + "language_loss": 0.75708318, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.83380735, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11102295, + "step": 10502, + "time_per_iteration": 2.531738519668579 + }, + { + "auxiliary_loss_clip": 0.06415547, + "auxiliary_loss_mlp": 0.01269255, + "balance_loss_clip": 0.06271978, + "balance_loss_mlp": 0.01258026, + "epoch": 0.6314745227716819, + "flos": 24322586797440.0, + "grad_norm": 1.857189484196882, + "language_loss": 0.85481834, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.93166631, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11230469, + "step": 10503, + "time_per_iteration": 2.552402973175049 + }, + { + "auxiliary_loss_clip": 0.06410381, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06270783, + "balance_loss_mlp": 0.01254061, + "epoch": 0.6315346460243499, + "flos": 23372891821440.0, + "grad_norm": 1.6307573056927078, + "language_loss": 0.86482477, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.94157171, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10253906, + "step": 10504, + "time_per_iteration": 2.5060269832611084 + }, + { + "auxiliary_loss_clip": 0.064176, + "auxiliary_loss_mlp": 0.01268121, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01257142, + "epoch": 0.6315947692770179, + "flos": 20265517082880.0, + "grad_norm": 1.678620058857516, + "language_loss": 0.76972538, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.84658259, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.10974121, + "step": 10505, + "time_per_iteration": 2.5305702686309814 + }, + { + "auxiliary_loss_clip": 0.06409689, + "auxiliary_loss_mlp": 0.01264983, + "balance_loss_clip": 0.06271394, + "balance_loss_mlp": 0.01254647, + "epoch": 0.6316548925296859, + "flos": 25273036460160.0, + "grad_norm": 1.9130295201566025, + "language_loss": 0.82312322, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.89986992, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10339355, + "step": 10506, + "time_per_iteration": 2.5286946296691895 + }, + { + "auxiliary_loss_clip": 0.06409711, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.01253164, + "epoch": 0.6317150157823538, + "flos": 22937344698240.0, + "grad_norm": 1.904699510430935, + "language_loss": 0.74647379, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.82321376, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11120605, + "step": 10507, + "time_per_iteration": 2.5269975662231445 + }, + { + "auxiliary_loss_clip": 0.06414819, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06273329, + "balance_loss_mlp": 0.01254876, + "epoch": 0.6317751390350218, + "flos": 22533383364480.0, + "grad_norm": 1.9107193302266279, + "language_loss": 0.68296039, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.75977188, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11456299, + "step": 10508, + "time_per_iteration": 2.522627830505371 + }, + { + "auxiliary_loss_clip": 0.06408058, + "auxiliary_loss_mlp": 0.01267063, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.0125662, + "epoch": 0.6318352622876897, + "flos": 23301460615680.0, + "grad_norm": 1.6343142360187424, + "language_loss": 0.70864749, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.78539872, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10449219, + "step": 10509, + "time_per_iteration": 2.542271614074707 + }, + { + "auxiliary_loss_clip": 0.06404234, + "auxiliary_loss_mlp": 0.01267915, + "balance_loss_clip": 0.06269038, + "balance_loss_mlp": 0.01257901, + "epoch": 0.6318953855403577, + "flos": 20710330081920.0, + "grad_norm": 1.5692460316561092, + "language_loss": 0.79883605, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.87555748, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10003662, + "step": 10510, + "time_per_iteration": 2.5088951587677 + }, + { + "auxiliary_loss_clip": 0.06416003, + "auxiliary_loss_mlp": 0.01266499, + "balance_loss_clip": 0.0627503, + "balance_loss_mlp": 0.01255627, + "epoch": 0.6319555087930258, + "flos": 22826613127680.0, + "grad_norm": 1.472787804562701, + "language_loss": 0.71112996, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.78795499, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10870361, + "step": 10511, + "time_per_iteration": 2.5239315032958984 + }, + { + "auxiliary_loss_clip": 0.06404155, + "auxiliary_loss_mlp": 0.01264501, + "balance_loss_clip": 0.06270795, + "balance_loss_mlp": 0.01254863, + "epoch": 0.6320156320456937, + "flos": 19976480023680.0, + "grad_norm": 1.5136926076294552, + "language_loss": 0.80152798, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.87821454, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09631348, + "step": 10512, + "time_per_iteration": 2.4730801582336426 + }, + { + "auxiliary_loss_clip": 0.06412748, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06273789, + "balance_loss_mlp": 0.01258599, + "epoch": 0.6320757552983617, + "flos": 27020256197760.0, + "grad_norm": 1.640445181436539, + "language_loss": 0.71047747, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.7873019, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11090088, + "step": 10513, + "time_per_iteration": 2.554516077041626 + }, + { + "auxiliary_loss_clip": 0.06414016, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06272392, + "balance_loss_mlp": 0.01255527, + "epoch": 0.6321358785510296, + "flos": 23702696691840.0, + "grad_norm": 1.6086341634408383, + "language_loss": 0.67001855, + "learning_rate": 1.259212205855459e-06, + "loss": 0.74681789, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10394287, + "step": 10514, + "time_per_iteration": 2.519026517868042 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06271799, + "balance_loss_mlp": 0.01256491, + "epoch": 0.6321960018036976, + "flos": 26002484179200.0, + "grad_norm": 1.6426182718028832, + "language_loss": 0.74301624, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.81975299, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09851074, + "step": 10515, + "time_per_iteration": 2.6021077632904053 + }, + { + "auxiliary_loss_clip": 0.06406167, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01256054, + "epoch": 0.6322561250563655, + "flos": 22827745157760.0, + "grad_norm": 1.6516346518134952, + "language_loss": 0.90002799, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.9767465, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09631348, + "step": 10516, + "time_per_iteration": 3.9120290279388428 + }, + { + "auxiliary_loss_clip": 0.0641951, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06273714, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6323162483090335, + "flos": 18994234936320.0, + "grad_norm": 1.6653274793264599, + "language_loss": 0.81976604, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.89662409, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.12084961, + "step": 10517, + "time_per_iteration": 2.478886127471924 + }, + { + "auxiliary_loss_clip": 0.06409353, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06273272, + "balance_loss_mlp": 0.01254312, + "epoch": 0.6323763715617015, + "flos": 19871324749440.0, + "grad_norm": 1.77487902385547, + "language_loss": 0.77740157, + "learning_rate": 1.257765386189541e-06, + "loss": 0.8541342, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0960083, + "step": 10518, + "time_per_iteration": 2.529668092727661 + }, + { + "auxiliary_loss_clip": 0.06409025, + "auxiliary_loss_mlp": 0.01262964, + "balance_loss_clip": 0.0627432, + "balance_loss_mlp": 0.01253475, + "epoch": 0.6324364948143695, + "flos": 22789115625600.0, + "grad_norm": 1.399689960822604, + "language_loss": 0.85268837, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.92940825, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0949707, + "step": 10519, + "time_per_iteration": 2.5316224098205566 + }, + { + "auxiliary_loss_clip": 0.06407413, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06274519, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6324966180670374, + "flos": 22242333807360.0, + "grad_norm": 1.7591221317630206, + "language_loss": 0.7227571, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.79947662, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09454346, + "step": 10520, + "time_per_iteration": 2.593050479888916 + }, + { + "auxiliary_loss_clip": 0.06409709, + "auxiliary_loss_mlp": 0.01264525, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6325567413197054, + "flos": 21695593916160.0, + "grad_norm": 1.8135575738100813, + "language_loss": 0.71838474, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.79512703, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10119629, + "step": 10521, + "time_per_iteration": 2.5069823265075684 + }, + { + "auxiliary_loss_clip": 0.06414442, + "auxiliary_loss_mlp": 0.01265675, + "balance_loss_clip": 0.06276147, + "balance_loss_mlp": 0.01255018, + "epoch": 0.6326168645723733, + "flos": 19943133298560.0, + "grad_norm": 1.6828366730110347, + "language_loss": 0.7199434, + "learning_rate": 1.256319016853377e-06, + "loss": 0.79674459, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10662842, + "step": 10522, + "time_per_iteration": 2.6152310371398926 + }, + { + "auxiliary_loss_clip": 0.06406049, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256897, + "epoch": 0.6326769878250413, + "flos": 20236614624000.0, + "grad_norm": 1.7290468863072455, + "language_loss": 0.8156153, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.89234209, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09735107, + "step": 10523, + "time_per_iteration": 2.5101752281188965 + }, + { + "auxiliary_loss_clip": 0.06411799, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 0.06276074, + "balance_loss_mlp": 0.01256669, + "epoch": 0.6327371110777094, + "flos": 20781803214720.0, + "grad_norm": 1.7543720010709223, + "language_loss": 0.73841488, + "learning_rate": 1.255596001333195e-06, + "loss": 0.81519485, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09527588, + "step": 10524, + "time_per_iteration": 2.5357463359832764 + }, + { + "auxiliary_loss_clip": 0.06421272, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01258977, + "epoch": 0.6327972343303773, + "flos": 30344440176000.0, + "grad_norm": 2.100184187405554, + "language_loss": 0.84972739, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.92663497, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.10510254, + "step": 10525, + "time_per_iteration": 2.579566478729248 + }, + { + "auxiliary_loss_clip": 0.06407693, + "auxiliary_loss_mlp": 0.01265026, + "balance_loss_clip": 0.06272401, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6328573575830453, + "flos": 17097947585280.0, + "grad_norm": 1.5662936390284432, + "language_loss": 0.67044812, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.74717528, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09393311, + "step": 10526, + "time_per_iteration": 2.6565749645233154 + }, + { + "auxiliary_loss_clip": 0.06418256, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06276698, + "balance_loss_mlp": 0.01254002, + "epoch": 0.6329174808357132, + "flos": 25054340503680.0, + "grad_norm": 1.744260985628437, + "language_loss": 0.73593014, + "learning_rate": 1.254511689796244e-06, + "loss": 0.81276095, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10821533, + "step": 10527, + "time_per_iteration": 4.000992298126221 + }, + { + "auxiliary_loss_clip": 0.06408013, + "auxiliary_loss_mlp": 0.01264369, + "balance_loss_clip": 0.062744, + "balance_loss_mlp": 0.01255124, + "epoch": 0.6329776040883812, + "flos": 16842466886400.0, + "grad_norm": 2.0238254127026347, + "language_loss": 0.72017205, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.79689586, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 10528, + "time_per_iteration": 2.482356548309326 + }, + { + "auxiliary_loss_clip": 0.06410971, + "auxiliary_loss_mlp": 0.01266595, + "balance_loss_clip": 0.0627386, + "balance_loss_mlp": 0.01256647, + "epoch": 0.6330377273410491, + "flos": 13521804779520.0, + "grad_norm": 2.0709634573058966, + "language_loss": 0.67286944, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.74964511, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.0994873, + "step": 10529, + "time_per_iteration": 2.506375551223755 + }, + { + "auxiliary_loss_clip": 0.06417675, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06276476, + "balance_loss_mlp": 0.0125486, + "epoch": 0.6330978505937171, + "flos": 21544471877760.0, + "grad_norm": 1.8153408645192133, + "language_loss": 0.75284207, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.82967925, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11181641, + "step": 10530, + "time_per_iteration": 4.016285419464111 + }, + { + "auxiliary_loss_clip": 0.06421702, + "auxiliary_loss_mlp": 0.01265839, + "balance_loss_clip": 0.06280397, + "balance_loss_mlp": 0.01255557, + "epoch": 0.6331579738463851, + "flos": 25016465658240.0, + "grad_norm": 1.412209042537855, + "language_loss": 0.74000126, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.81687671, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10284424, + "step": 10531, + "time_per_iteration": 2.5478739738464355 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01257093, + "epoch": 0.6332180970990531, + "flos": 14981329123200.0, + "grad_norm": 4.395160978524889, + "language_loss": 0.80356932, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.88033861, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0980835, + "step": 10532, + "time_per_iteration": 2.4629757404327393 + }, + { + "auxiliary_loss_clip": 0.06411614, + "auxiliary_loss_mlp": 0.01265113, + "balance_loss_clip": 0.06276565, + "balance_loss_mlp": 0.01256017, + "epoch": 0.633278220351721, + "flos": 22712904737280.0, + "grad_norm": 1.6509114242634397, + "language_loss": 0.75345361, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.83022094, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09094238, + "step": 10533, + "time_per_iteration": 2.5486817359924316 + }, + { + "auxiliary_loss_clip": 0.06421439, + "auxiliary_loss_mlp": 0.01266816, + "balance_loss_clip": 0.0627851, + "balance_loss_mlp": 0.01255586, + "epoch": 0.633338343604389, + "flos": 12607594807680.0, + "grad_norm": 2.155852114283844, + "language_loss": 0.7738024, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.850685, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11224365, + "step": 10534, + "time_per_iteration": 2.447556257247925 + }, + { + "auxiliary_loss_clip": 0.06414493, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06277296, + "balance_loss_mlp": 0.01255314, + "epoch": 0.6333984668570569, + "flos": 25967586153600.0, + "grad_norm": 8.614230799549778, + "language_loss": 0.85787749, + "learning_rate": 1.251621437204777e-06, + "loss": 0.93467951, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10400391, + "step": 10535, + "time_per_iteration": 2.564028739929199 + }, + { + "auxiliary_loss_clip": 0.06413931, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06276763, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6334585901097249, + "flos": 23665953876480.0, + "grad_norm": 1.7881941276129079, + "language_loss": 0.76803362, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.84483141, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10235596, + "step": 10536, + "time_per_iteration": 4.017718315124512 + }, + { + "auxiliary_loss_clip": 0.06411674, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06276245, + "balance_loss_mlp": 0.01254848, + "epoch": 0.633518713362393, + "flos": 28766930883840.0, + "grad_norm": 1.5924161290871786, + "language_loss": 0.6050871, + "learning_rate": 1.250899157568855e-06, + "loss": 0.68185055, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09820557, + "step": 10537, + "time_per_iteration": 2.575690746307373 + }, + { + "auxiliary_loss_clip": 0.0632174, + "auxiliary_loss_mlp": 0.01257375, + "balance_loss_clip": 0.06265318, + "balance_loss_mlp": 0.01256043, + "epoch": 0.6335788366150609, + "flos": 70438669407360.0, + "grad_norm": 0.7645314683588974, + "language_loss": 0.5222913, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.59808248, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.56298828, + "router_z_loss_mlp": 0.01334381, + "step": 10538, + "time_per_iteration": 3.254763126373291 + }, + { + "auxiliary_loss_clip": 0.06417011, + "auxiliary_loss_mlp": 0.01268273, + "balance_loss_clip": 0.06275439, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6336389598677289, + "flos": 23738768674560.0, + "grad_norm": 1.8043673999860153, + "language_loss": 0.83927584, + "learning_rate": 1.250176991556848e-06, + "loss": 0.91612864, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10888672, + "step": 10539, + "time_per_iteration": 2.533168315887451 + }, + { + "auxiliary_loss_clip": 0.06413823, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06273531, + "balance_loss_mlp": 0.01254526, + "epoch": 0.6336990831203968, + "flos": 29284097483520.0, + "grad_norm": 1.5633861305622094, + "language_loss": 0.87373441, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.95052767, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10980225, + "step": 10540, + "time_per_iteration": 2.5700464248657227 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01263695, + "balance_loss_clip": 0.06275897, + "balance_loss_mlp": 0.01254671, + "epoch": 0.6337592063730648, + "flos": 29104659964800.0, + "grad_norm": 1.757260374288504, + "language_loss": 0.7308234, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.80756426, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.090271, + "step": 10541, + "time_per_iteration": 2.5605950355529785 + }, + { + "auxiliary_loss_clip": 0.06419826, + "auxiliary_loss_mlp": 0.01267808, + "balance_loss_clip": 0.06276362, + "balance_loss_mlp": 0.01255934, + "epoch": 0.6338193296257327, + "flos": 34713705404160.0, + "grad_norm": 3.0522247844622217, + "language_loss": 0.85394645, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.93082273, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11865234, + "step": 10542, + "time_per_iteration": 2.711641788482666 + }, + { + "auxiliary_loss_clip": 0.06413235, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01253477, + "epoch": 0.6338794528784008, + "flos": 16692644586240.0, + "grad_norm": 1.6414110705076674, + "language_loss": 0.77927899, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.85605824, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11224365, + "step": 10543, + "time_per_iteration": 2.4868364334106445 + }, + { + "auxiliary_loss_clip": 0.06406207, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01254049, + "epoch": 0.6339395761310687, + "flos": 22353233086080.0, + "grad_norm": 1.4561914884468037, + "language_loss": 0.73388422, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.81058121, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09442139, + "step": 10544, + "time_per_iteration": 2.5364322662353516 + }, + { + "auxiliary_loss_clip": 0.06420652, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06277749, + "balance_loss_mlp": 0.0125501, + "epoch": 0.6339996993837367, + "flos": 18557765418240.0, + "grad_norm": 2.1124884217915953, + "language_loss": 0.68196738, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.7588315, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.10736084, + "step": 10545, + "time_per_iteration": 2.498805284500122 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06273287, + "balance_loss_mlp": 0.01254217, + "epoch": 0.6340598226364046, + "flos": 12974519836800.0, + "grad_norm": 1.9119054748089928, + "language_loss": 0.71463943, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.79135519, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09710693, + "step": 10546, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.012635, + "balance_loss_clip": 0.06273207, + "balance_loss_mlp": 0.01254214, + "epoch": 0.6341199458890726, + "flos": 26695272936960.0, + "grad_norm": 1.3275160208019028, + "language_loss": 0.78403944, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.86071861, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09283447, + "step": 10547, + "time_per_iteration": 2.565394639968872 + }, + { + "auxiliary_loss_clip": 0.06415725, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.0627535, + "balance_loss_mlp": 0.01256462, + "epoch": 0.6341800691417405, + "flos": 18740263610880.0, + "grad_norm": 1.5896144863347355, + "language_loss": 0.63801014, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.71483326, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10131836, + "step": 10548, + "time_per_iteration": 2.531881332397461 + }, + { + "auxiliary_loss_clip": 0.06408647, + "auxiliary_loss_mlp": 0.01263438, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01253746, + "epoch": 0.6342401923944085, + "flos": 26256539358720.0, + "grad_norm": 1.5473137822842997, + "language_loss": 0.61999178, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.69671261, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 10549, + "time_per_iteration": 2.590090274810791 + }, + { + "auxiliary_loss_clip": 0.06413013, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06276116, + "balance_loss_mlp": 0.01256047, + "epoch": 0.6343003156470765, + "flos": 24687834744960.0, + "grad_norm": 1.5414529536537591, + "language_loss": 0.74040842, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.81719744, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09844971, + "step": 10550, + "time_per_iteration": 2.575768232345581 + }, + { + "auxiliary_loss_clip": 0.06314379, + "auxiliary_loss_mlp": 0.01254446, + "balance_loss_clip": 0.06258175, + "balance_loss_mlp": 0.01252981, + "epoch": 0.6343604388997445, + "flos": 69824481379200.0, + "grad_norm": 0.6831342981577847, + "language_loss": 0.57712334, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.65281159, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01463318, + "step": 10551, + "time_per_iteration": 3.169085741043091 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06273001, + "balance_loss_mlp": 0.01255257, + "epoch": 0.6344205621524125, + "flos": 21989117168640.0, + "grad_norm": 1.9821146557890166, + "language_loss": 0.67052966, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.74725866, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09454346, + "step": 10552, + "time_per_iteration": 2.51409649848938 + }, + { + "auxiliary_loss_clip": 0.06415403, + "auxiliary_loss_mlp": 0.01263367, + "balance_loss_clip": 0.0627457, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6344806854050804, + "flos": 20455100945280.0, + "grad_norm": 1.6854116098373486, + "language_loss": 0.82256383, + "learning_rate": 1.24512502014147e-06, + "loss": 0.89935154, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 10553, + "time_per_iteration": 2.5263893604278564 + }, + { + "auxiliary_loss_clip": 0.06412624, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06273618, + "balance_loss_mlp": 0.01256021, + "epoch": 0.6345408086577484, + "flos": 40519294594560.0, + "grad_norm": 1.7209630881675668, + "language_loss": 0.55282557, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.629614, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10205078, + "step": 10554, + "time_per_iteration": 2.6742208003997803 + }, + { + "auxiliary_loss_clip": 0.06412828, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06274945, + "balance_loss_mlp": 0.01255701, + "epoch": 0.6346009319104163, + "flos": 21367759616640.0, + "grad_norm": 1.6547697162667994, + "language_loss": 0.7092278, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.78601682, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10369873, + "step": 10555, + "time_per_iteration": 2.5059010982513428 + }, + { + "auxiliary_loss_clip": 0.06308, + "auxiliary_loss_mlp": 0.01256771, + "balance_loss_clip": 0.06252061, + "balance_loss_mlp": 0.01255482, + "epoch": 0.6346610551630844, + "flos": 71383333138560.0, + "grad_norm": 0.7594485734837986, + "language_loss": 0.5526008, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.62824851, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01290131, + "step": 10556, + "time_per_iteration": 4.480233669281006 + }, + { + "auxiliary_loss_clip": 0.0641848, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06276923, + "balance_loss_mlp": 0.01257227, + "epoch": 0.6347211784157523, + "flos": 25418666056320.0, + "grad_norm": 1.720664259353744, + "language_loss": 0.68248415, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.75934947, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10827637, + "step": 10557, + "time_per_iteration": 2.5347533226013184 + }, + { + "auxiliary_loss_clip": 0.06415346, + "auxiliary_loss_mlp": 0.01266286, + "balance_loss_clip": 0.06277986, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6347813016684203, + "flos": 15748274344320.0, + "grad_norm": 1.7185775847351308, + "language_loss": 0.7034533, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.78026962, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10211182, + "step": 10558, + "time_per_iteration": 2.5184271335601807 + }, + { + "auxiliary_loss_clip": 0.06415297, + "auxiliary_loss_mlp": 0.01267927, + "balance_loss_clip": 0.06277342, + "balance_loss_mlp": 0.01257812, + "epoch": 0.6348414249210882, + "flos": 21470231560320.0, + "grad_norm": 1.5690247234550625, + "language_loss": 0.78373873, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.86057091, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10113525, + "step": 10559, + "time_per_iteration": 2.5017571449279785 + }, + { + "auxiliary_loss_clip": 0.06413186, + "auxiliary_loss_mlp": 0.01268043, + "balance_loss_clip": 0.06274431, + "balance_loss_mlp": 0.01257404, + "epoch": 0.6349015481737562, + "flos": 21659521933440.0, + "grad_norm": 1.6584174732731671, + "language_loss": 0.68334514, + "learning_rate": 1.242601136020078e-06, + "loss": 0.76015741, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10638428, + "step": 10560, + "time_per_iteration": 2.536973237991333 + }, + { + "auxiliary_loss_clip": 0.06413247, + "auxiliary_loss_mlp": 0.01267835, + "balance_loss_clip": 0.06275544, + "balance_loss_mlp": 0.01257679, + "epoch": 0.6349616714264241, + "flos": 22200643601280.0, + "grad_norm": 1.5868389258687317, + "language_loss": 0.77125943, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.84807026, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10150146, + "step": 10561, + "time_per_iteration": 2.5515172481536865 + }, + { + "auxiliary_loss_clip": 0.06412898, + "auxiliary_loss_mlp": 0.01265705, + "balance_loss_clip": 0.06273612, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6350217946790921, + "flos": 25417324391040.0, + "grad_norm": 1.8175837603303404, + "language_loss": 0.72219515, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.79898125, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11279297, + "step": 10562, + "time_per_iteration": 2.563812255859375 + }, + { + "auxiliary_loss_clip": 0.06418765, + "auxiliary_loss_mlp": 0.0126928, + "balance_loss_clip": 0.0627933, + "balance_loss_mlp": 0.01258808, + "epoch": 0.63508191793176, + "flos": 19725024320640.0, + "grad_norm": 1.9663518722420297, + "language_loss": 0.81324869, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.89012909, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10473633, + "step": 10563, + "time_per_iteration": 2.618112087249756 + }, + { + "auxiliary_loss_clip": 0.06424198, + "auxiliary_loss_mlp": 0.01272987, + "balance_loss_clip": 0.06281862, + "balance_loss_mlp": 0.01262092, + "epoch": 0.6351420411844281, + "flos": 18192810960000.0, + "grad_norm": 2.213984919304992, + "language_loss": 0.81394589, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.89091778, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10913086, + "step": 10564, + "time_per_iteration": 2.54693341255188 + }, + { + "auxiliary_loss_clip": 0.06417058, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06278841, + "balance_loss_mlp": 0.01256272, + "epoch": 0.6352021644370961, + "flos": 33734437136640.0, + "grad_norm": 2.2491852390349614, + "language_loss": 0.73082668, + "learning_rate": 1.240799222993407e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09796143, + "step": 10565, + "time_per_iteration": 2.6810452938079834 + }, + { + "auxiliary_loss_clip": 0.06416303, + "auxiliary_loss_mlp": 0.01267579, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01256093, + "epoch": 0.635262287689764, + "flos": 20380818700800.0, + "grad_norm": 2.01281164224499, + "language_loss": 0.68792611, + "learning_rate": 1.240438926700324e-06, + "loss": 0.7647649, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.1149292, + "step": 10566, + "time_per_iteration": 2.5485215187072754 + }, + { + "auxiliary_loss_clip": 0.06410012, + "auxiliary_loss_mlp": 0.01265054, + "balance_loss_clip": 0.06277308, + "balance_loss_mlp": 0.01255022, + "epoch": 0.635322410942432, + "flos": 27532559260800.0, + "grad_norm": 1.717445195940493, + "language_loss": 0.69661963, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.77337033, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.1003418, + "step": 10567, + "time_per_iteration": 4.017431974411011 + }, + { + "auxiliary_loss_clip": 0.064077, + "auxiliary_loss_mlp": 0.01265057, + "balance_loss_clip": 0.06274484, + "balance_loss_mlp": 0.01255151, + "epoch": 0.6353825341950999, + "flos": 21550048174080.0, + "grad_norm": 1.9561940375454367, + "language_loss": 0.84912741, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.92585498, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09906006, + "step": 10568, + "time_per_iteration": 2.528050422668457 + }, + { + "auxiliary_loss_clip": 0.06414051, + "auxiliary_loss_mlp": 0.012645, + "balance_loss_clip": 0.06275208, + "balance_loss_mlp": 0.01254427, + "epoch": 0.635442657447768, + "flos": 31767934464000.0, + "grad_norm": 1.8080598645215213, + "language_loss": 0.84412146, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.92090696, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10070801, + "step": 10569, + "time_per_iteration": 2.6543846130371094 + }, + { + "auxiliary_loss_clip": 0.06409843, + "auxiliary_loss_mlp": 0.01268445, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.0125811, + "epoch": 0.6355027807004359, + "flos": 19835001204480.0, + "grad_norm": 1.4845804125044393, + "language_loss": 0.69596767, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.77275056, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10327148, + "step": 10570, + "time_per_iteration": 3.903024435043335 + }, + { + "auxiliary_loss_clip": 0.06413252, + "auxiliary_loss_mlp": 0.01264199, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01253989, + "epoch": 0.6355629039531039, + "flos": 30380176742400.0, + "grad_norm": 1.6479967140904772, + "language_loss": 0.66236866, + "learning_rate": 1.2386378775476e-06, + "loss": 0.73914319, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10211182, + "step": 10571, + "time_per_iteration": 2.571477174758911 + }, + { + "auxiliary_loss_clip": 0.06416899, + "auxiliary_loss_mlp": 0.01266469, + "balance_loss_clip": 0.06277502, + "balance_loss_mlp": 0.01256097, + "epoch": 0.6356230272057718, + "flos": 17938001093760.0, + "grad_norm": 1.5990791790465455, + "language_loss": 0.71629465, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.79312837, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10375977, + "step": 10572, + "time_per_iteration": 2.466371774673462 + }, + { + "auxiliary_loss_clip": 0.06409136, + "auxiliary_loss_mlp": 0.0126563, + "balance_loss_clip": 0.06273179, + "balance_loss_mlp": 0.01255623, + "epoch": 0.6356831504584398, + "flos": 25383139125120.0, + "grad_norm": 1.3707006156469355, + "language_loss": 0.81310254, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.88985026, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10015869, + "step": 10573, + "time_per_iteration": 2.5966269969940186 + }, + { + "auxiliary_loss_clip": 0.06417162, + "auxiliary_loss_mlp": 0.01267057, + "balance_loss_clip": 0.06277572, + "balance_loss_mlp": 0.01255899, + "epoch": 0.6357432737111077, + "flos": 46511029630080.0, + "grad_norm": 1.745983210040395, + "language_loss": 0.68758935, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.76443154, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11151123, + "step": 10574, + "time_per_iteration": 2.7297935485839844 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.01254513, + "epoch": 0.6358033969637757, + "flos": 17280026507520.0, + "grad_norm": 2.032779061466396, + "language_loss": 0.8712132, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.9479835, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10540771, + "step": 10575, + "time_per_iteration": 2.505861520767212 + }, + { + "auxiliary_loss_clip": 0.06410281, + "auxiliary_loss_mlp": 0.01265614, + "balance_loss_clip": 0.06273504, + "balance_loss_mlp": 0.01255946, + "epoch": 0.6358635202164437, + "flos": 27132832558080.0, + "grad_norm": 1.4971132099643523, + "language_loss": 0.72510445, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.80186343, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09667969, + "step": 10576, + "time_per_iteration": 3.991710901260376 + }, + { + "auxiliary_loss_clip": 0.06415755, + "auxiliary_loss_mlp": 0.01267596, + "balance_loss_clip": 0.06276268, + "balance_loss_mlp": 0.01257368, + "epoch": 0.6359236434691117, + "flos": 27532307698560.0, + "grad_norm": 1.4171583307321047, + "language_loss": 0.6902113, + "learning_rate": 1.236477571455085e-06, + "loss": 0.76704478, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10223389, + "step": 10577, + "time_per_iteration": 2.553823947906494 + }, + { + "auxiliary_loss_clip": 0.06410993, + "auxiliary_loss_mlp": 0.01267287, + "balance_loss_clip": 0.06274246, + "balance_loss_mlp": 0.01257613, + "epoch": 0.6359837667217797, + "flos": 39357653915520.0, + "grad_norm": 1.7634862953282429, + "language_loss": 0.72702098, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.8038038, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09674072, + "step": 10578, + "time_per_iteration": 2.7065927982330322 + }, + { + "auxiliary_loss_clip": 0.06310344, + "auxiliary_loss_mlp": 0.01250981, + "balance_loss_clip": 0.06254056, + "balance_loss_mlp": 0.0124968, + "epoch": 0.6360438899744476, + "flos": 56430472475520.0, + "grad_norm": 0.7091193353039391, + "language_loss": 0.54502332, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.62063658, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01301575, + "step": 10579, + "time_per_iteration": 3.198455333709717 + }, + { + "auxiliary_loss_clip": 0.06409089, + "auxiliary_loss_mlp": 0.01264424, + "balance_loss_clip": 0.06272582, + "balance_loss_mlp": 0.01254369, + "epoch": 0.6361040132271156, + "flos": 24980100186240.0, + "grad_norm": 1.5151266119166613, + "language_loss": 0.77508366, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.8518188, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1005249, + "step": 10580, + "time_per_iteration": 2.5561928749084473 + }, + { + "auxiliary_loss_clip": 0.06411447, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06273703, + "balance_loss_mlp": 0.01259043, + "epoch": 0.6361641364797835, + "flos": 23266059465600.0, + "grad_norm": 1.9638125336396983, + "language_loss": 0.66766918, + "learning_rate": 1.235037946268301e-06, + "loss": 0.74447519, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10113525, + "step": 10581, + "time_per_iteration": 2.5164785385131836 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.06273356, + "balance_loss_mlp": 0.01254683, + "epoch": 0.6362242597324516, + "flos": 26001645638400.0, + "grad_norm": 1.4228320252439628, + "language_loss": 0.6843577, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.76110947, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09783936, + "step": 10582, + "time_per_iteration": 2.6015806198120117 + }, + { + "auxiliary_loss_clip": 0.06416672, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06276425, + "balance_loss_mlp": 0.01254203, + "epoch": 0.6362843829851195, + "flos": 25710428373120.0, + "grad_norm": 2.448331234664856, + "language_loss": 0.84422374, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.92103791, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10552979, + "step": 10583, + "time_per_iteration": 2.5657055377960205 + }, + { + "auxiliary_loss_clip": 0.06411825, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06278308, + "balance_loss_mlp": 0.01256086, + "epoch": 0.6363445062377875, + "flos": 20529341262720.0, + "grad_norm": 1.5773260338409785, + "language_loss": 0.75534987, + "learning_rate": 1.233958531908538e-06, + "loss": 0.83213151, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 10584, + "time_per_iteration": 2.527031421661377 + }, + { + "auxiliary_loss_clip": 0.06414576, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.01256139, + "epoch": 0.6364046294904554, + "flos": 19469879038080.0, + "grad_norm": 1.7122506045265105, + "language_loss": 0.73591262, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.81273478, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11505127, + "step": 10585, + "time_per_iteration": 2.4975733757019043 + }, + { + "auxiliary_loss_clip": 0.06413724, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.06275959, + "balance_loss_mlp": 0.01254981, + "epoch": 0.6364647527431234, + "flos": 21002176252800.0, + "grad_norm": 1.805788279769041, + "language_loss": 0.83174026, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.9085263, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09906006, + "step": 10586, + "time_per_iteration": 2.531947612762451 + }, + { + "auxiliary_loss_clip": 0.06412107, + "auxiliary_loss_mlp": 0.0126422, + "balance_loss_clip": 0.06275982, + "balance_loss_mlp": 0.0125435, + "epoch": 0.6365248759957913, + "flos": 25777079896320.0, + "grad_norm": 1.5441547949198797, + "language_loss": 0.72916567, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.80592889, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09863281, + "step": 10587, + "time_per_iteration": 2.589169979095459 + }, + { + "auxiliary_loss_clip": 0.06412084, + "auxiliary_loss_mlp": 0.01264457, + "balance_loss_clip": 0.062725, + "balance_loss_mlp": 0.01254014, + "epoch": 0.6365849992484593, + "flos": 22462161793920.0, + "grad_norm": 2.0110608871651823, + "language_loss": 0.77360207, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.85036743, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10443115, + "step": 10588, + "time_per_iteration": 2.5107719898223877 + }, + { + "auxiliary_loss_clip": 0.06404337, + "auxiliary_loss_mlp": 0.01266834, + "balance_loss_clip": 0.06271751, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6366451225011273, + "flos": 19031648584320.0, + "grad_norm": 1.403923680448765, + "language_loss": 0.79945314, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.87616491, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10266113, + "step": 10589, + "time_per_iteration": 2.5198166370391846 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01266892, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01256616, + "epoch": 0.6367052457537953, + "flos": 25235413176960.0, + "grad_norm": 1.9669131634706534, + "language_loss": 0.67181933, + "learning_rate": 1.231800487863257e-06, + "loss": 0.74856544, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1027832, + "step": 10590, + "time_per_iteration": 2.5376667976379395 + }, + { + "auxiliary_loss_clip": 0.0642258, + "auxiliary_loss_mlp": 0.01266478, + "balance_loss_clip": 0.06278451, + "balance_loss_mlp": 0.01254945, + "epoch": 0.6367653690064633, + "flos": 19214482193280.0, + "grad_norm": 1.635127472973657, + "language_loss": 0.7910291, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.86791968, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11523438, + "step": 10591, + "time_per_iteration": 2.542515993118286 + }, + { + "auxiliary_loss_clip": 0.06405823, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06271368, + "balance_loss_mlp": 0.0125564, + "epoch": 0.6368254922591312, + "flos": 23553000172800.0, + "grad_norm": 1.3721943309197018, + "language_loss": 0.89071333, + "learning_rate": 1.231081372744317e-06, + "loss": 0.96742344, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09552002, + "step": 10592, + "time_per_iteration": 2.51094126701355 + }, + { + "auxiliary_loss_clip": 0.06405515, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06272536, + "balance_loss_mlp": 0.01256906, + "epoch": 0.6368856155117992, + "flos": 26474270993280.0, + "grad_norm": 1.3189503052137, + "language_loss": 0.68928409, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.76600361, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09533691, + "step": 10593, + "time_per_iteration": 2.5533511638641357 + }, + { + "auxiliary_loss_clip": 0.06408849, + "auxiliary_loss_mlp": 0.01266265, + "balance_loss_clip": 0.06273521, + "balance_loss_mlp": 0.01256329, + "epoch": 0.6369457387644671, + "flos": 33700754995200.0, + "grad_norm": 1.6851555086975611, + "language_loss": 0.6369772, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.71372831, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09942627, + "step": 10594, + "time_per_iteration": 2.6149699687957764 + }, + { + "auxiliary_loss_clip": 0.06308158, + "auxiliary_loss_mlp": 0.01250909, + "balance_loss_clip": 0.06252004, + "balance_loss_mlp": 0.01249539, + "epoch": 0.6370058620171352, + "flos": 70929365316480.0, + "grad_norm": 0.7572264790485472, + "language_loss": 0.54663223, + "learning_rate": 1.230002918781022e-06, + "loss": 0.6222229, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01372528, + "step": 10595, + "time_per_iteration": 4.630947589874268 + }, + { + "auxiliary_loss_clip": 0.06416945, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275225, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6370659852698031, + "flos": 21148267046400.0, + "grad_norm": 1.6750235845380184, + "language_loss": 0.66897941, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10858154, + "step": 10596, + "time_per_iteration": 2.550053834915161 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.01266417, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01256022, + "epoch": 0.6371261085224711, + "flos": 20199452538240.0, + "grad_norm": 4.2038058583126405, + "language_loss": 0.79555941, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.87233055, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10400391, + "step": 10597, + "time_per_iteration": 2.5332624912261963 + }, + { + "auxiliary_loss_clip": 0.06414443, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06275137, + "balance_loss_mlp": 0.01255446, + "epoch": 0.637186231775139, + "flos": 19689790878720.0, + "grad_norm": 1.6206633129115742, + "language_loss": 0.7509104, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.82771772, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10852051, + "step": 10598, + "time_per_iteration": 2.5732879638671875 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01263084, + "balance_loss_clip": 0.06272967, + "balance_loss_mlp": 0.01253381, + "epoch": 0.637246355027807, + "flos": 13074937355520.0, + "grad_norm": 1.7290939316313776, + "language_loss": 0.68839526, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.76514107, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.0970459, + "step": 10599, + "time_per_iteration": 2.476140260696411 + }, + { + "auxiliary_loss_clip": 0.06417891, + "auxiliary_loss_mlp": 0.01266352, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.01254664, + "epoch": 0.6373064782804749, + "flos": 18228421745280.0, + "grad_norm": 1.9832548083292807, + "language_loss": 0.80652881, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.88337129, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11700439, + "step": 10600, + "time_per_iteration": 2.496344804763794 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06272307, + "balance_loss_mlp": 0.01257626, + "epoch": 0.637366601533143, + "flos": 24505336552320.0, + "grad_norm": 1.383513371134078, + "language_loss": 0.79706007, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.8738054, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09863281, + "step": 10601, + "time_per_iteration": 2.533555269241333 + }, + { + "auxiliary_loss_clip": 0.06416898, + "auxiliary_loss_mlp": 0.01263888, + "balance_loss_clip": 0.06276521, + "balance_loss_mlp": 0.01253332, + "epoch": 0.6374267247858109, + "flos": 26366180826240.0, + "grad_norm": 2.20794570441013, + "language_loss": 0.67092741, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.74773526, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10559082, + "step": 10602, + "time_per_iteration": 2.5890238285064697 + }, + { + "auxiliary_loss_clip": 0.06413972, + "auxiliary_loss_mlp": 0.01266008, + "balance_loss_clip": 0.06275181, + "balance_loss_mlp": 0.0125578, + "epoch": 0.6374868480384789, + "flos": 20377254902400.0, + "grad_norm": 1.5742012675871089, + "language_loss": 0.79736137, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.87416112, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10223389, + "step": 10603, + "time_per_iteration": 2.4978857040405273 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01255058, + "epoch": 0.6375469712911469, + "flos": 21002595523200.0, + "grad_norm": 2.075723287568445, + "language_loss": 0.76759392, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.84436482, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10626221, + "step": 10604, + "time_per_iteration": 2.5228052139282227 + }, + { + "auxiliary_loss_clip": 0.0641887, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.062751, + "balance_loss_mlp": 0.01253551, + "epoch": 0.6376070945438148, + "flos": 19721292814080.0, + "grad_norm": 2.969254888536146, + "language_loss": 0.77310598, + "learning_rate": 1.226409972197281e-06, + "loss": 0.84994626, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11608887, + "step": 10605, + "time_per_iteration": 2.4766769409179688 + }, + { + "auxiliary_loss_clip": 0.06417184, + "auxiliary_loss_mlp": 0.01265543, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.01254087, + "epoch": 0.6376672177964828, + "flos": 21513137650560.0, + "grad_norm": 1.8415567136743551, + "language_loss": 0.66146404, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.73829126, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.11437988, + "step": 10606, + "time_per_iteration": 3.962454080581665 + }, + { + "auxiliary_loss_clip": 0.06409881, + "auxiliary_loss_mlp": 0.01267672, + "balance_loss_clip": 0.06276855, + "balance_loss_mlp": 0.01257891, + "epoch": 0.6377273410491507, + "flos": 18849905078400.0, + "grad_norm": 1.5392078588294233, + "language_loss": 0.75399411, + "learning_rate": 1.225691734459971e-06, + "loss": 0.8307696, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09777832, + "step": 10607, + "time_per_iteration": 2.481400489807129 + }, + { + "auxiliary_loss_clip": 0.06417431, + "auxiliary_loss_mlp": 0.01270028, + "balance_loss_clip": 0.06278283, + "balance_loss_mlp": 0.01259514, + "epoch": 0.6377874643018188, + "flos": 53073962749440.0, + "grad_norm": 1.6290224643321956, + "language_loss": 0.655065, + "learning_rate": 1.225332659627278e-06, + "loss": 0.73193955, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.1050415, + "step": 10608, + "time_per_iteration": 2.80210018157959 + }, + { + "auxiliary_loss_clip": 0.06314453, + "auxiliary_loss_mlp": 0.01252573, + "balance_loss_clip": 0.0625798, + "balance_loss_mlp": 0.01251221, + "epoch": 0.6378475875544867, + "flos": 65153349417600.0, + "grad_norm": 0.7210390428690479, + "language_loss": 0.5201869, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.59585714, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01354218, + "step": 10609, + "time_per_iteration": 4.542863368988037 + }, + { + "auxiliary_loss_clip": 0.06406713, + "auxiliary_loss_mlp": 0.01266217, + "balance_loss_clip": 0.06272352, + "balance_loss_mlp": 0.01257122, + "epoch": 0.6379077108071547, + "flos": 23009404809600.0, + "grad_norm": 1.4796346735577246, + "language_loss": 0.74981046, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.82653975, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09100342, + "step": 10610, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.06314634, + "auxiliary_loss_mlp": 0.01251771, + "balance_loss_clip": 0.06257996, + "balance_loss_mlp": 0.01250523, + "epoch": 0.6379678340598226, + "flos": 67624425849600.0, + "grad_norm": 0.8350558513372389, + "language_loss": 0.62598002, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.70164406, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.56689453, + "router_z_loss_mlp": 0.01247406, + "step": 10611, + "time_per_iteration": 3.208292245864868 + }, + { + "auxiliary_loss_clip": 0.06416688, + "auxiliary_loss_mlp": 0.01263819, + "balance_loss_clip": 0.06276392, + "balance_loss_mlp": 0.01253513, + "epoch": 0.6380279573124906, + "flos": 29687891109120.0, + "grad_norm": 2.188557109067727, + "language_loss": 0.72870415, + "learning_rate": 1.223896654187282e-06, + "loss": 0.80550921, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10308838, + "step": 10612, + "time_per_iteration": 2.5807394981384277 + }, + { + "auxiliary_loss_clip": 0.06312064, + "auxiliary_loss_mlp": 0.01253142, + "balance_loss_clip": 0.06255382, + "balance_loss_mlp": 0.01251885, + "epoch": 0.6380880805651585, + "flos": 66502435680000.0, + "grad_norm": 0.7266099968525627, + "language_loss": 0.57775903, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.65341103, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.56982422, + "router_z_loss_mlp": 0.01256561, + "step": 10613, + "time_per_iteration": 3.0924766063690186 + }, + { + "auxiliary_loss_clip": 0.06422254, + "auxiliary_loss_mlp": 0.01264432, + "balance_loss_clip": 0.06280632, + "balance_loss_mlp": 0.01253483, + "epoch": 0.6381482038178266, + "flos": 23921811918720.0, + "grad_norm": 1.7742162127346608, + "language_loss": 0.75586814, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.832735, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10949707, + "step": 10614, + "time_per_iteration": 2.5669398307800293 + }, + { + "auxiliary_loss_clip": 0.06413062, + "auxiliary_loss_mlp": 0.01263583, + "balance_loss_clip": 0.0627507, + "balance_loss_mlp": 0.01253277, + "epoch": 0.6382083270704945, + "flos": 24249855853440.0, + "grad_norm": 1.866062102155962, + "language_loss": 0.79879516, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.87556159, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10314941, + "step": 10615, + "time_per_iteration": 3.9333317279815674 + }, + { + "auxiliary_loss_clip": 0.06313558, + "auxiliary_loss_mlp": 0.01251207, + "balance_loss_clip": 0.0625703, + "balance_loss_mlp": 0.01249947, + "epoch": 0.6382684503231625, + "flos": 70798452111360.0, + "grad_norm": 0.6364915071256667, + "language_loss": 0.55039352, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.62604117, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01260376, + "step": 10616, + "time_per_iteration": 3.2114999294281006 + }, + { + "auxiliary_loss_clip": 0.06411368, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272632, + "balance_loss_mlp": 0.01252654, + "epoch": 0.6383285735758305, + "flos": 16550411080320.0, + "grad_norm": 1.6623229086008653, + "language_loss": 0.84516096, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.92190546, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10430908, + "step": 10617, + "time_per_iteration": 2.50490665435791 + }, + { + "auxiliary_loss_clip": 0.06414464, + "auxiliary_loss_mlp": 0.01266034, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.01255037, + "epoch": 0.6383886968284984, + "flos": 14432702515200.0, + "grad_norm": 1.7049012321551236, + "language_loss": 0.86996436, + "learning_rate": 1.221743529196936e-06, + "loss": 0.94676924, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10998535, + "step": 10618, + "time_per_iteration": 2.4782254695892334 + }, + { + "auxiliary_loss_clip": 0.06414133, + "auxiliary_loss_mlp": 0.01263472, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01253536, + "epoch": 0.6384488200811664, + "flos": 17935191982080.0, + "grad_norm": 1.660467856665914, + "language_loss": 0.73454595, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.81132197, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.09936523, + "step": 10619, + "time_per_iteration": 2.5073039531707764 + }, + { + "auxiliary_loss_clip": 0.06421836, + "auxiliary_loss_mlp": 0.01269484, + "balance_loss_clip": 0.06276071, + "balance_loss_mlp": 0.01258475, + "epoch": 0.6385089433338343, + "flos": 18521567654400.0, + "grad_norm": 1.8426309945064288, + "language_loss": 0.7661649, + "learning_rate": 1.221026056814193e-06, + "loss": 0.84307802, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11010742, + "step": 10620, + "time_per_iteration": 2.5297937393188477 + }, + { + "auxiliary_loss_clip": 0.06419566, + "auxiliary_loss_mlp": 0.01267834, + "balance_loss_clip": 0.0628044, + "balance_loss_mlp": 0.01256963, + "epoch": 0.6385690665865024, + "flos": 24760481834880.0, + "grad_norm": 2.368652650522925, + "language_loss": 0.70688897, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.78376299, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10870361, + "step": 10621, + "time_per_iteration": 2.5605804920196533 + }, + { + "auxiliary_loss_clip": 0.0640726, + "auxiliary_loss_mlp": 0.01264019, + "balance_loss_clip": 0.06274956, + "balance_loss_mlp": 0.01254887, + "epoch": 0.6386291898391703, + "flos": 20126763521280.0, + "grad_norm": 1.5541804815340177, + "language_loss": 0.77669823, + "learning_rate": 1.220308702586529e-06, + "loss": 0.85341108, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09130859, + "step": 10622, + "time_per_iteration": 2.495631217956543 + }, + { + "auxiliary_loss_clip": 0.06408195, + "auxiliary_loss_mlp": 0.0126391, + "balance_loss_clip": 0.06273771, + "balance_loss_mlp": 0.01253903, + "epoch": 0.6386893130918383, + "flos": 16871914396800.0, + "grad_norm": 1.737894673487703, + "language_loss": 0.74773431, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.82445532, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10015869, + "step": 10623, + "time_per_iteration": 2.5214576721191406 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01264001, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254512, + "epoch": 0.6387494363445062, + "flos": 22972913556480.0, + "grad_norm": 1.3339080512049293, + "language_loss": 0.77151477, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.84824026, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09490967, + "step": 10624, + "time_per_iteration": 2.5108532905578613 + }, + { + "auxiliary_loss_clip": 0.064144, + "auxiliary_loss_mlp": 0.01262692, + "balance_loss_clip": 0.06276258, + "balance_loss_mlp": 0.01252637, + "epoch": 0.6388095595971742, + "flos": 22864487973120.0, + "grad_norm": 1.5899649446688702, + "language_loss": 0.80630493, + "learning_rate": 1.21923289302382e-06, + "loss": 0.88307583, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 10625, + "time_per_iteration": 2.5426197052001953 + }, + { + "auxiliary_loss_clip": 0.06416376, + "auxiliary_loss_mlp": 0.0126597, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01254842, + "epoch": 0.6388696828498421, + "flos": 17317314374400.0, + "grad_norm": 1.7136519687434957, + "language_loss": 0.72979832, + "learning_rate": 1.218874349031654e-06, + "loss": 0.80662179, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11126709, + "step": 10626, + "time_per_iteration": 2.494306802749634 + }, + { + "auxiliary_loss_clip": 0.06408393, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.06270021, + "balance_loss_mlp": 0.01255015, + "epoch": 0.6389298061025102, + "flos": 17134313057280.0, + "grad_norm": 1.513972649351316, + "language_loss": 0.73141295, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.80815566, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10870361, + "step": 10627, + "time_per_iteration": 2.5244781970977783 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01255391, + "epoch": 0.6389899293551781, + "flos": 27718663178880.0, + "grad_norm": 1.6703880840860492, + "language_loss": 0.66923428, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.74610573, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11773682, + "step": 10628, + "time_per_iteration": 2.575000762939453 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.0627692, + "balance_loss_mlp": 0.01254197, + "epoch": 0.6390500526078461, + "flos": 21222171947520.0, + "grad_norm": 1.956585229435901, + "language_loss": 0.68194425, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.7586931, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10150146, + "step": 10629, + "time_per_iteration": 2.5807948112487793 + }, + { + "auxiliary_loss_clip": 0.06422858, + "auxiliary_loss_mlp": 0.01271827, + "balance_loss_clip": 0.0627673, + "balance_loss_mlp": 0.01258708, + "epoch": 0.6391101758605141, + "flos": 21587671457280.0, + "grad_norm": 1.5207801965767835, + "language_loss": 0.75444686, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.83139372, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.13116455, + "step": 10630, + "time_per_iteration": 2.5017268657684326 + }, + { + "auxiliary_loss_clip": 0.06408527, + "auxiliary_loss_mlp": 0.01264942, + "balance_loss_clip": 0.06272866, + "balance_loss_mlp": 0.01255, + "epoch": 0.639170299113182, + "flos": 19906432410240.0, + "grad_norm": 1.6356950234102068, + "language_loss": 0.70487773, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.78161246, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09942627, + "step": 10631, + "time_per_iteration": 2.55197811126709 + }, + { + "auxiliary_loss_clip": 0.06314358, + "auxiliary_loss_mlp": 0.01251531, + "balance_loss_clip": 0.06258033, + "balance_loss_mlp": 0.01250199, + "epoch": 0.63923042236585, + "flos": 69896625344640.0, + "grad_norm": 0.7602289508759135, + "language_loss": 0.62733555, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.70299447, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01333618, + "step": 10632, + "time_per_iteration": 3.190108060836792 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01266129, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01255948, + "epoch": 0.639290545618518, + "flos": 22681486656000.0, + "grad_norm": 2.160270989856127, + "language_loss": 0.66821963, + "learning_rate": 1.216365371217893e-06, + "loss": 0.74498516, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10174561, + "step": 10633, + "time_per_iteration": 2.552823543548584 + }, + { + "auxiliary_loss_clip": 0.06411168, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01256472, + "epoch": 0.639350668871186, + "flos": 19835420474880.0, + "grad_norm": 2.0078331211958638, + "language_loss": 0.82085246, + "learning_rate": 1.216007064569225e-06, + "loss": 0.89763421, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10540771, + "step": 10634, + "time_per_iteration": 3.9264204502105713 + }, + { + "auxiliary_loss_clip": 0.06411835, + "auxiliary_loss_mlp": 0.01269552, + "balance_loss_clip": 0.06274228, + "balance_loss_mlp": 0.01258585, + "epoch": 0.6394107921238539, + "flos": 20558746846080.0, + "grad_norm": 1.4689992647467067, + "language_loss": 0.75053954, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.82735342, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10968018, + "step": 10635, + "time_per_iteration": 2.4891774654388428 + }, + { + "auxiliary_loss_clip": 0.06409803, + "auxiliary_loss_mlp": 0.01264504, + "balance_loss_clip": 0.06272061, + "balance_loss_mlp": 0.01254878, + "epoch": 0.6394709153765219, + "flos": 25781985360000.0, + "grad_norm": 1.6046642220248264, + "language_loss": 0.71619642, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.79293942, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09637451, + "step": 10636, + "time_per_iteration": 2.5812439918518066 + }, + { + "auxiliary_loss_clip": 0.06415339, + "auxiliary_loss_mlp": 0.01266128, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01255352, + "epoch": 0.6395310386291898, + "flos": 17535926476800.0, + "grad_norm": 2.1920700627694867, + "language_loss": 0.73530567, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.81212032, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10772705, + "step": 10637, + "time_per_iteration": 2.485643148422241 + }, + { + "auxiliary_loss_clip": 0.0641741, + "auxiliary_loss_mlp": 0.01266874, + "balance_loss_clip": 0.06276354, + "balance_loss_mlp": 0.01255871, + "epoch": 0.6395911618818578, + "flos": 18594172817280.0, + "grad_norm": 1.7577292466251317, + "language_loss": 0.78289723, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.85974002, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 10638, + "time_per_iteration": 2.482006549835205 + }, + { + "auxiliary_loss_clip": 0.06409052, + "auxiliary_loss_mlp": 0.01264378, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01253655, + "epoch": 0.6396512851345257, + "flos": 28374164069760.0, + "grad_norm": 1.4288466998721474, + "language_loss": 0.815153, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.89188731, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10717773, + "step": 10639, + "time_per_iteration": 2.553853750228882 + }, + { + "auxiliary_loss_clip": 0.06314266, + "auxiliary_loss_mlp": 0.01251751, + "balance_loss_clip": 0.06258021, + "balance_loss_mlp": 0.01250554, + "epoch": 0.6397114083871938, + "flos": 70744728844800.0, + "grad_norm": 0.7996184433796636, + "language_loss": 0.59009802, + "learning_rate": 1.21385784946359e-06, + "loss": 0.66575813, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01195526, + "step": 10640, + "time_per_iteration": 3.0804762840270996 + }, + { + "auxiliary_loss_clip": 0.0640569, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6397715316398617, + "flos": 18147095758080.0, + "grad_norm": 1.6659836554468106, + "language_loss": 0.78961474, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.8663274, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09619141, + "step": 10641, + "time_per_iteration": 2.470735788345337 + }, + { + "auxiliary_loss_clip": 0.06423657, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06278598, + "balance_loss_mlp": 0.01255422, + "epoch": 0.6398316548925297, + "flos": 25746668064000.0, + "grad_norm": 2.1982581134788672, + "language_loss": 0.63584703, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.712749, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.11120605, + "step": 10642, + "time_per_iteration": 2.572493314743042 + }, + { + "auxiliary_loss_clip": 0.06314563, + "auxiliary_loss_mlp": 0.0125166, + "balance_loss_clip": 0.06258431, + "balance_loss_mlp": 0.01250544, + "epoch": 0.6398917781451977, + "flos": 71231246778240.0, + "grad_norm": 0.888550554325656, + "language_loss": 0.55987263, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.63553476, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01118469, + "step": 10643, + "time_per_iteration": 3.0916545391082764 + }, + { + "auxiliary_loss_clip": 0.06416592, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06274326, + "balance_loss_mlp": 0.01255252, + "epoch": 0.6399519013978656, + "flos": 20528083451520.0, + "grad_norm": 1.8692423093064807, + "language_loss": 0.772012, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.84884077, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.11022949, + "step": 10644, + "time_per_iteration": 2.523844003677368 + }, + { + "auxiliary_loss_clip": 0.06409791, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6400120246505336, + "flos": 24467503633920.0, + "grad_norm": 1.3560803021320431, + "language_loss": 0.82639438, + "learning_rate": 1.212067656542203e-06, + "loss": 0.90314567, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10675049, + "step": 10645, + "time_per_iteration": 2.546128749847412 + }, + { + "auxiliary_loss_clip": 0.06421367, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.06277816, + "balance_loss_mlp": 0.01251997, + "epoch": 0.6400721479032015, + "flos": 28373619018240.0, + "grad_norm": 1.814178451427478, + "language_loss": 0.73952079, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.81637239, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.11798096, + "step": 10646, + "time_per_iteration": 3.966240167617798 + }, + { + "auxiliary_loss_clip": 0.06412562, + "auxiliary_loss_mlp": 0.01268277, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01257167, + "epoch": 0.6401322711558696, + "flos": 17821441664640.0, + "grad_norm": 1.9335985649403467, + "language_loss": 0.80623794, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.88304639, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.11114502, + "step": 10647, + "time_per_iteration": 2.497234582901001 + }, + { + "auxiliary_loss_clip": 0.06410154, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01255094, + "epoch": 0.6401923944085375, + "flos": 26037969183360.0, + "grad_norm": 1.5109233302980645, + "language_loss": 0.75784671, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.83459949, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10028076, + "step": 10648, + "time_per_iteration": 2.5445501804351807 + }, + { + "auxiliary_loss_clip": 0.06407083, + "auxiliary_loss_mlp": 0.01263508, + "balance_loss_clip": 0.06269361, + "balance_loss_mlp": 0.01253525, + "epoch": 0.6402525176612055, + "flos": 23593181005440.0, + "grad_norm": 1.948589206417596, + "language_loss": 0.79203671, + "learning_rate": 1.210636039936138e-06, + "loss": 0.86874264, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 10649, + "time_per_iteration": 3.9821319580078125 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01264939, + "balance_loss_clip": 0.06272741, + "balance_loss_mlp": 0.01254222, + "epoch": 0.6403126409138734, + "flos": 18047349072000.0, + "grad_norm": 2.12746104130849, + "language_loss": 0.75310314, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.82986802, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10650, + "time_per_iteration": 2.488818883895874 + }, + { + "auxiliary_loss_clip": 0.06408805, + "auxiliary_loss_mlp": 0.01268267, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01256513, + "epoch": 0.6403727641665414, + "flos": 21985679151360.0, + "grad_norm": 1.3966136649863612, + "language_loss": 0.70929539, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.78606611, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11743164, + "step": 10651, + "time_per_iteration": 2.5219950675964355 + }, + { + "auxiliary_loss_clip": 0.06413059, + "auxiliary_loss_mlp": 0.01269512, + "balance_loss_clip": 0.06274731, + "balance_loss_mlp": 0.0125824, + "epoch": 0.6404328874192093, + "flos": 24901751018880.0, + "grad_norm": 2.1293665277256624, + "language_loss": 0.64404488, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.72087055, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11273193, + "step": 10652, + "time_per_iteration": 2.5231480598449707 + }, + { + "auxiliary_loss_clip": 0.06411535, + "auxiliary_loss_mlp": 0.01262653, + "balance_loss_clip": 0.06274502, + "balance_loss_mlp": 0.01252509, + "epoch": 0.6404930106718774, + "flos": 17601991021440.0, + "grad_norm": 1.8908665793351147, + "language_loss": 0.79652649, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.87326837, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10150146, + "step": 10653, + "time_per_iteration": 2.5704574584960938 + }, + { + "auxiliary_loss_clip": 0.06425246, + "auxiliary_loss_mlp": 0.0127165, + "balance_loss_clip": 0.06277368, + "balance_loss_mlp": 0.01259973, + "epoch": 0.6405531339245453, + "flos": 20164219096320.0, + "grad_norm": 2.6567000735134463, + "language_loss": 0.70885104, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.78582001, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11682129, + "step": 10654, + "time_per_iteration": 2.534069061279297 + }, + { + "auxiliary_loss_clip": 0.0641733, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06274031, + "balance_loss_mlp": 0.0125574, + "epoch": 0.6406132571772133, + "flos": 21948349357440.0, + "grad_norm": 1.5377239110005414, + "language_loss": 0.72583055, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.80267668, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11553955, + "step": 10655, + "time_per_iteration": 3.9230480194091797 + }, + { + "auxiliary_loss_clip": 0.06412716, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.06272289, + "balance_loss_mlp": 0.01257074, + "epoch": 0.6406733804298813, + "flos": 28775693635200.0, + "grad_norm": 1.9128350177290707, + "language_loss": 0.82931209, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.90612656, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11657715, + "step": 10656, + "time_per_iteration": 2.601238489151001 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01268343, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01257817, + "epoch": 0.6407335036825492, + "flos": 17462943970560.0, + "grad_norm": 3.923220638478792, + "language_loss": 0.72232449, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.79911268, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10516357, + "step": 10657, + "time_per_iteration": 2.478569984436035 + }, + { + "auxiliary_loss_clip": 0.06411502, + "auxiliary_loss_mlp": 0.01272042, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.01261766, + "epoch": 0.6407936269352172, + "flos": 22131476455680.0, + "grad_norm": 1.5017144440006371, + "language_loss": 0.77455044, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.85138589, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10272217, + "step": 10658, + "time_per_iteration": 2.6262331008911133 + }, + { + "auxiliary_loss_clip": 0.06414957, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01256494, + "epoch": 0.6408537501878852, + "flos": 23117033779200.0, + "grad_norm": 1.5568653096914684, + "language_loss": 0.76262242, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.83944625, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10943604, + "step": 10659, + "time_per_iteration": 2.5234532356262207 + }, + { + "auxiliary_loss_clip": 0.06413037, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01253719, + "epoch": 0.6409138734405532, + "flos": 16478099406720.0, + "grad_norm": 1.5970917751630926, + "language_loss": 0.77884215, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.85561204, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10229492, + "step": 10660, + "time_per_iteration": 2.522568941116333 + }, + { + "auxiliary_loss_clip": 0.0642052, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 0.06275806, + "balance_loss_mlp": 0.01253796, + "epoch": 0.6409739966932211, + "flos": 22783539329280.0, + "grad_norm": 1.8503290839739344, + "language_loss": 0.6901319, + "learning_rate": 1.206344067135727e-06, + "loss": 0.7669934, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.11834717, + "step": 10661, + "time_per_iteration": 2.5030124187469482 + }, + { + "auxiliary_loss_clip": 0.06407891, + "auxiliary_loss_mlp": 0.01269221, + "balance_loss_clip": 0.06273415, + "balance_loss_mlp": 0.01259017, + "epoch": 0.6410341199458891, + "flos": 25158489528960.0, + "grad_norm": 1.7100659203746285, + "language_loss": 0.7628997, + "learning_rate": 1.205986598033362e-06, + "loss": 0.83967084, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10205078, + "step": 10662, + "time_per_iteration": 2.5515527725219727 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01265377, + "balance_loss_clip": 0.06272576, + "balance_loss_mlp": 0.01255507, + "epoch": 0.641094243198557, + "flos": 27052428965760.0, + "grad_norm": 1.7631594614441006, + "language_loss": 0.69671446, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.77348244, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09863281, + "step": 10663, + "time_per_iteration": 2.5377395153045654 + }, + { + "auxiliary_loss_clip": 0.06414999, + "auxiliary_loss_mlp": 0.01271226, + "balance_loss_clip": 0.062755, + "balance_loss_mlp": 0.01258876, + "epoch": 0.641154366451225, + "flos": 25381629751680.0, + "grad_norm": 1.9040182096837255, + "language_loss": 0.68253797, + "learning_rate": 1.205271750169389e-06, + "loss": 0.75940025, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.12353516, + "step": 10664, + "time_per_iteration": 2.5686044692993164 + }, + { + "auxiliary_loss_clip": 0.06408753, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271468, + "balance_loss_mlp": 0.01255081, + "epoch": 0.6412144897038929, + "flos": 25159998902400.0, + "grad_norm": 1.8980640494634613, + "language_loss": 0.66647685, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.74322122, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 10665, + "time_per_iteration": 2.5681324005126953 + }, + { + "auxiliary_loss_clip": 0.06406175, + "auxiliary_loss_mlp": 0.01263975, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01254027, + "epoch": 0.641274612956561, + "flos": 23447509482240.0, + "grad_norm": 1.7797122960809293, + "language_loss": 0.64406478, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.72076625, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 10666, + "time_per_iteration": 2.560159921646118 + }, + { + "auxiliary_loss_clip": 0.06411792, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06272641, + "balance_loss_mlp": 0.01256556, + "epoch": 0.6413347362092289, + "flos": 19433597420160.0, + "grad_norm": 1.633933286881918, + "language_loss": 0.70997214, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.78676403, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10839844, + "step": 10667, + "time_per_iteration": 2.478955030441284 + }, + { + "auxiliary_loss_clip": 0.06424954, + "auxiliary_loss_mlp": 0.0127036, + "balance_loss_clip": 0.0627383, + "balance_loss_mlp": 0.01258004, + "epoch": 0.6413948594618969, + "flos": 17201425777920.0, + "grad_norm": 2.6317109326582204, + "language_loss": 0.78275955, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.85971272, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.12359619, + "step": 10668, + "time_per_iteration": 2.5198874473571777 + }, + { + "auxiliary_loss_clip": 0.06411108, + "auxiliary_loss_mlp": 0.01270624, + "balance_loss_clip": 0.06274307, + "balance_loss_mlp": 0.01259913, + "epoch": 0.6414549827145648, + "flos": 22275764386560.0, + "grad_norm": 1.497004648642511, + "language_loss": 0.67674375, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.75356108, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10705566, + "step": 10669, + "time_per_iteration": 2.589388132095337 + }, + { + "auxiliary_loss_clip": 0.06420371, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.0627445, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6415151059672328, + "flos": 19645291560960.0, + "grad_norm": 1.6345904804173623, + "language_loss": 0.7890048, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.86586452, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.11224365, + "step": 10670, + "time_per_iteration": 2.539581537246704 + }, + { + "auxiliary_loss_clip": 0.06415358, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06272778, + "balance_loss_mlp": 0.01252752, + "epoch": 0.6415752292199008, + "flos": 14871016823040.0, + "grad_norm": 2.295733548922842, + "language_loss": 0.88453639, + "learning_rate": 1.20277073264638e-06, + "loss": 0.96132886, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11138916, + "step": 10671, + "time_per_iteration": 2.477959632873535 + }, + { + "auxiliary_loss_clip": 0.06407315, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01253591, + "epoch": 0.6416353524725688, + "flos": 13740710371200.0, + "grad_norm": 1.4227697494992897, + "language_loss": 0.6938256, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.77053344, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09893799, + "step": 10672, + "time_per_iteration": 2.5083000659942627 + }, + { + "auxiliary_loss_clip": 0.06417342, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.0125343, + "epoch": 0.6416954757252368, + "flos": 24541785878400.0, + "grad_norm": 1.8997700971465656, + "language_loss": 0.74453592, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.82137227, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.128479, + "step": 10673, + "time_per_iteration": 3.9653780460357666 + }, + { + "auxiliary_loss_clip": 0.06409254, + "auxiliary_loss_mlp": 0.01267909, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6417555989779047, + "flos": 27717531148800.0, + "grad_norm": 1.5327640795153767, + "language_loss": 0.69868958, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.77546132, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10961914, + "step": 10674, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06417114, + "auxiliary_loss_mlp": 0.01264104, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253376, + "epoch": 0.6418157222305727, + "flos": 20562604133760.0, + "grad_norm": 1.803070032007693, + "language_loss": 0.67809439, + "learning_rate": 1.201342244560338e-06, + "loss": 0.75490659, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.10736084, + "step": 10675, + "time_per_iteration": 2.508819580078125 + }, + { + "auxiliary_loss_clip": 0.06411684, + "auxiliary_loss_mlp": 0.01266305, + "balance_loss_clip": 0.06274499, + "balance_loss_mlp": 0.01255648, + "epoch": 0.6418758454832406, + "flos": 22608126806400.0, + "grad_norm": 1.6761966103099513, + "language_loss": 0.66968966, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.7464695, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10662842, + "step": 10676, + "time_per_iteration": 2.504427909851074 + }, + { + "auxiliary_loss_clip": 0.06413673, + "auxiliary_loss_mlp": 0.01266671, + "balance_loss_clip": 0.06272808, + "balance_loss_mlp": 0.01255078, + "epoch": 0.6419359687359086, + "flos": 27381479149440.0, + "grad_norm": 1.8338510977392408, + "language_loss": 0.75681728, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.83362073, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11590576, + "step": 10677, + "time_per_iteration": 2.5891265869140625 + }, + { + "auxiliary_loss_clip": 0.06311014, + "auxiliary_loss_mlp": 0.01250224, + "balance_loss_clip": 0.06254409, + "balance_loss_mlp": 0.01249042, + "epoch": 0.6419960919885765, + "flos": 67270722566400.0, + "grad_norm": 0.7408362116441561, + "language_loss": 0.60777372, + "learning_rate": 1.200271196442818e-06, + "loss": 0.68338609, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01180267, + "step": 10678, + "time_per_iteration": 3.185296058654785 + }, + { + "auxiliary_loss_clip": 0.06408557, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06272914, + "balance_loss_mlp": 0.01255816, + "epoch": 0.6420562152412446, + "flos": 19908067564800.0, + "grad_norm": 2.4133916332472083, + "language_loss": 0.67507815, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.75182372, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10186768, + "step": 10679, + "time_per_iteration": 2.5243141651153564 + }, + { + "auxiliary_loss_clip": 0.06412959, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.0125395, + "epoch": 0.6421163384939125, + "flos": 24797056942080.0, + "grad_norm": 1.7795780158399093, + "language_loss": 0.73073864, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.8075152, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10742188, + "step": 10680, + "time_per_iteration": 2.5331122875213623 + }, + { + "auxiliary_loss_clip": 0.06414793, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.06276178, + "balance_loss_mlp": 0.01256014, + "epoch": 0.6421764617465805, + "flos": 25599822583680.0, + "grad_norm": 2.391895628783687, + "language_loss": 0.68047994, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.75729114, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10308838, + "step": 10681, + "time_per_iteration": 2.53722882270813 + }, + { + "auxiliary_loss_clip": 0.06410016, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272537, + "balance_loss_mlp": 0.01253318, + "epoch": 0.6422365849992484, + "flos": 14139556606080.0, + "grad_norm": 1.5905545864535235, + "language_loss": 0.74707049, + "learning_rate": 1.198843556910427e-06, + "loss": 0.82380807, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 10682, + "time_per_iteration": 2.472856283187866 + }, + { + "auxiliary_loss_clip": 0.06400837, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06270464, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6422967082519164, + "flos": 22390688661120.0, + "grad_norm": 1.4486797107477571, + "language_loss": 0.79339921, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.87009233, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09985352, + "step": 10683, + "time_per_iteration": 2.5533552169799805 + }, + { + "auxiliary_loss_clip": 0.06413358, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06272833, + "balance_loss_mlp": 0.01254607, + "epoch": 0.6423568315045844, + "flos": 14653243261440.0, + "grad_norm": 1.9282526307042827, + "language_loss": 0.67605591, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.75284898, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11334229, + "step": 10684, + "time_per_iteration": 2.482949733734131 + }, + { + "auxiliary_loss_clip": 0.06413907, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.01255909, + "epoch": 0.6424169547572524, + "flos": 26841237949440.0, + "grad_norm": 1.917462680158283, + "language_loss": 0.71542668, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.79223859, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1137085, + "step": 10685, + "time_per_iteration": 3.9797728061676025 + }, + { + "auxiliary_loss_clip": 0.06408305, + "auxiliary_loss_mlp": 0.01271537, + "balance_loss_clip": 0.06272995, + "balance_loss_mlp": 0.01260451, + "epoch": 0.6424770780099204, + "flos": 22713449788800.0, + "grad_norm": 1.7465950797369785, + "language_loss": 0.75233316, + "learning_rate": 1.197416403456935e-06, + "loss": 0.8291316, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11083984, + "step": 10686, + "time_per_iteration": 2.5496456623077393 + }, + { + "auxiliary_loss_clip": 0.06415822, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06274287, + "balance_loss_mlp": 0.01258501, + "epoch": 0.6425372012625883, + "flos": 28476049034880.0, + "grad_norm": 2.381729998669287, + "language_loss": 0.68881834, + "learning_rate": 1.197059691144867e-06, + "loss": 0.76567948, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11767578, + "step": 10687, + "time_per_iteration": 2.570040464401245 + }, + { + "auxiliary_loss_clip": 0.06416762, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06275085, + "balance_loss_mlp": 0.01254089, + "epoch": 0.6425973245152563, + "flos": 29359469831040.0, + "grad_norm": 1.9635514388954842, + "language_loss": 0.66698802, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.74380684, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11029053, + "step": 10688, + "time_per_iteration": 4.0477213859558105 + }, + { + "auxiliary_loss_clip": 0.06411983, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01255716, + "epoch": 0.6426574477679242, + "flos": 16435109462400.0, + "grad_norm": 1.9153737313813421, + "language_loss": 0.73537695, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.81216139, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10742188, + "step": 10689, + "time_per_iteration": 2.5043931007385254 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01252758, + "epoch": 0.6427175710205922, + "flos": 21842481323520.0, + "grad_norm": 2.0498755252573932, + "language_loss": 0.72094941, + "learning_rate": 1.195989736948226e-06, + "loss": 0.79763424, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10101318, + "step": 10690, + "time_per_iteration": 2.5244081020355225 + }, + { + "auxiliary_loss_clip": 0.06408664, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01256203, + "epoch": 0.6427776942732601, + "flos": 17792623059840.0, + "grad_norm": 2.705995899316003, + "language_loss": 0.78068197, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.85743421, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1036377, + "step": 10691, + "time_per_iteration": 2.530010461807251 + }, + { + "auxiliary_loss_clip": 0.0641586, + "auxiliary_loss_mlp": 0.0126902, + "balance_loss_clip": 0.06274788, + "balance_loss_mlp": 0.0125822, + "epoch": 0.6428378175259282, + "flos": 15091306007040.0, + "grad_norm": 1.6963645960197293, + "language_loss": 0.74278462, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.81963336, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10797119, + "step": 10692, + "time_per_iteration": 2.4988198280334473 + }, + { + "auxiliary_loss_clip": 0.06415784, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.06276909, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6428979407785961, + "flos": 23848535923200.0, + "grad_norm": 1.7731596560048748, + "language_loss": 0.61612236, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.69293106, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1071167, + "step": 10693, + "time_per_iteration": 2.5508644580841064 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06272541, + "balance_loss_mlp": 0.01252258, + "epoch": 0.6429580640312641, + "flos": 32935151439360.0, + "grad_norm": 1.6308651969538634, + "language_loss": 0.59823889, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.67503107, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11669922, + "step": 10694, + "time_per_iteration": 3.998856544494629 + }, + { + "auxiliary_loss_clip": 0.0641511, + "auxiliary_loss_mlp": 0.012666, + "balance_loss_clip": 0.06274424, + "balance_loss_mlp": 0.01255645, + "epoch": 0.643018187283932, + "flos": 21074571780480.0, + "grad_norm": 1.333714526566846, + "language_loss": 0.79901004, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.87582707, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10961914, + "step": 10695, + "time_per_iteration": 2.5433716773986816 + }, + { + "auxiliary_loss_clip": 0.0641124, + "auxiliary_loss_mlp": 0.01265686, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01254904, + "epoch": 0.6430783105366, + "flos": 26731973825280.0, + "grad_norm": 1.5735391795945948, + "language_loss": 0.73628104, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.81305027, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10778809, + "step": 10696, + "time_per_iteration": 2.5438404083251953 + }, + { + "auxiliary_loss_clip": 0.06407514, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01255779, + "epoch": 0.643138433789268, + "flos": 23703744867840.0, + "grad_norm": 1.7384218375133755, + "language_loss": 0.75689638, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.83363152, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10211182, + "step": 10697, + "time_per_iteration": 2.538093090057373 + }, + { + "auxiliary_loss_clip": 0.06406935, + "auxiliary_loss_mlp": 0.01264883, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01255066, + "epoch": 0.643198557041936, + "flos": 34210416654720.0, + "grad_norm": 1.3977759922631694, + "language_loss": 0.65892148, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.73563969, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09820557, + "step": 10698, + "time_per_iteration": 2.598088026046753 + }, + { + "auxiliary_loss_clip": 0.06311838, + "auxiliary_loss_mlp": 0.01254343, + "balance_loss_clip": 0.06254914, + "balance_loss_mlp": 0.01253054, + "epoch": 0.643258680294604, + "flos": 67646955398400.0, + "grad_norm": 0.7781801094870626, + "language_loss": 0.63529652, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.71095836, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.01290131, + "step": 10699, + "time_per_iteration": 3.115173101425171 + }, + { + "auxiliary_loss_clip": 0.06406387, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06272414, + "balance_loss_mlp": 0.01256397, + "epoch": 0.6433188035472719, + "flos": 25192003962240.0, + "grad_norm": 1.4785466380460042, + "language_loss": 0.69763827, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.77436155, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09545898, + "step": 10700, + "time_per_iteration": 2.5910451412200928 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6433789267999399, + "flos": 24980645237760.0, + "grad_norm": 1.528088543997644, + "language_loss": 0.73932713, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.81612635, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10827637, + "step": 10701, + "time_per_iteration": 2.544930934906006 + }, + { + "auxiliary_loss_clip": 0.06417713, + "auxiliary_loss_mlp": 0.01266156, + "balance_loss_clip": 0.06274359, + "balance_loss_mlp": 0.012551, + "epoch": 0.6434390500526078, + "flos": 17571704970240.0, + "grad_norm": 2.0241741030403064, + "language_loss": 0.81973577, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.8965745, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1105957, + "step": 10702, + "time_per_iteration": 2.5270791053771973 + }, + { + "auxiliary_loss_clip": 0.06410103, + "auxiliary_loss_mlp": 0.01270083, + "balance_loss_clip": 0.06273524, + "balance_loss_mlp": 0.01259927, + "epoch": 0.6434991733052758, + "flos": 20848790154240.0, + "grad_norm": 1.961461723280124, + "language_loss": 0.74951881, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.82632065, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1015625, + "step": 10703, + "time_per_iteration": 2.490809917449951 + }, + { + "auxiliary_loss_clip": 0.06306668, + "auxiliary_loss_mlp": 0.01249951, + "balance_loss_clip": 0.06250144, + "balance_loss_mlp": 0.01248577, + "epoch": 0.6435592965579437, + "flos": 66114909745920.0, + "grad_norm": 0.6384717488493646, + "language_loss": 0.54610157, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.62166774, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01377106, + "step": 10704, + "time_per_iteration": 3.160659074783325 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06273799, + "balance_loss_mlp": 0.0125994, + "epoch": 0.6436194198106118, + "flos": 23775595344000.0, + "grad_norm": 1.7759265636720112, + "language_loss": 0.77319264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.85001761, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.09814453, + "step": 10705, + "time_per_iteration": 2.543015718460083 + }, + { + "auxiliary_loss_clip": 0.06412525, + "auxiliary_loss_mlp": 0.01267692, + "balance_loss_clip": 0.06272702, + "balance_loss_mlp": 0.0125694, + "epoch": 0.6436795430632797, + "flos": 20236572696960.0, + "grad_norm": 1.551816271189714, + "language_loss": 0.79286802, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.86967015, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10748291, + "step": 10706, + "time_per_iteration": 2.571018934249878 + }, + { + "auxiliary_loss_clip": 0.06408278, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06270924, + "balance_loss_mlp": 0.01253807, + "epoch": 0.6437396663159477, + "flos": 20307878121600.0, + "grad_norm": 1.8116162091626624, + "language_loss": 0.80532277, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.8820464, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10284424, + "step": 10707, + "time_per_iteration": 2.49252986907959 + }, + { + "auxiliary_loss_clip": 0.06408471, + "auxiliary_loss_mlp": 0.01266248, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01256151, + "epoch": 0.6437997895686156, + "flos": 23885404519680.0, + "grad_norm": 1.5335483275855415, + "language_loss": 0.85439938, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.93114662, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10101318, + "step": 10708, + "time_per_iteration": 2.554351806640625 + }, + { + "auxiliary_loss_clip": 0.0642588, + "auxiliary_loss_mlp": 0.01267773, + "balance_loss_clip": 0.06278181, + "balance_loss_mlp": 0.0125649, + "epoch": 0.6438599128212836, + "flos": 18995241185280.0, + "grad_norm": 2.1632531373454507, + "language_loss": 0.66272986, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.73966646, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.11279297, + "step": 10709, + "time_per_iteration": 2.4882705211639404 + }, + { + "auxiliary_loss_clip": 0.06406571, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06270951, + "balance_loss_mlp": 0.01254793, + "epoch": 0.6439200360739517, + "flos": 24103010373120.0, + "grad_norm": 1.6506823259196688, + "language_loss": 0.80511576, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.88182747, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09802246, + "step": 10710, + "time_per_iteration": 2.56453537940979 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01254676, + "epoch": 0.6439801593266196, + "flos": 31909748699520.0, + "grad_norm": 1.6423775297739596, + "language_loss": 0.66664886, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.74339652, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1015625, + "step": 10711, + "time_per_iteration": 2.5858142375946045 + }, + { + "auxiliary_loss_clip": 0.06416127, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06276122, + "balance_loss_mlp": 0.01257155, + "epoch": 0.6440402825792876, + "flos": 27133251828480.0, + "grad_norm": 1.4850866798945335, + "language_loss": 0.78739464, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.86423248, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10498047, + "step": 10712, + "time_per_iteration": 2.5875256061553955 + }, + { + "auxiliary_loss_clip": 0.06415762, + "auxiliary_loss_mlp": 0.01268856, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.01258264, + "epoch": 0.6441004058319555, + "flos": 20673964609920.0, + "grad_norm": 4.153275753738836, + "language_loss": 0.82697159, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.90381777, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.105896, + "step": 10713, + "time_per_iteration": 3.9446072578430176 + }, + { + "auxiliary_loss_clip": 0.06405178, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125691, + "epoch": 0.6441605290846235, + "flos": 26032309032960.0, + "grad_norm": 1.3361931407869754, + "language_loss": 0.78574234, + "learning_rate": 1.187440012188684e-06, + "loss": 0.86247128, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10809326, + "step": 10714, + "time_per_iteration": 2.530367612838745 + }, + { + "auxiliary_loss_clip": 0.06407861, + "auxiliary_loss_mlp": 0.01264356, + "balance_loss_clip": 0.0627133, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6442206523372914, + "flos": 24906362993280.0, + "grad_norm": 1.4535353305453917, + "language_loss": 0.81736881, + "learning_rate": 1.187084157517583e-06, + "loss": 0.89409101, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09619141, + "step": 10715, + "time_per_iteration": 2.563981294631958 + }, + { + "auxiliary_loss_clip": 0.06417291, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06276529, + "balance_loss_mlp": 0.01255812, + "epoch": 0.6442807755899594, + "flos": 25163478846720.0, + "grad_norm": 2.5611767206234335, + "language_loss": 0.81585336, + "learning_rate": 1.186728333672332e-06, + "loss": 0.89268947, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10498047, + "step": 10716, + "time_per_iteration": 2.54089617729187 + }, + { + "auxiliary_loss_clip": 0.06414896, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254931, + "epoch": 0.6443408988426274, + "flos": 27351863930880.0, + "grad_norm": 1.9349198900461007, + "language_loss": 0.783328, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8601352, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.10894775, + "step": 10717, + "time_per_iteration": 2.726794719696045 + }, + { + "auxiliary_loss_clip": 0.06407352, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 0.06274462, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6444010220952954, + "flos": 27935807834880.0, + "grad_norm": 1.5112707746860563, + "language_loss": 0.68381333, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.76055682, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10058594, + "step": 10718, + "time_per_iteration": 2.610858201980591 + }, + { + "auxiliary_loss_clip": 0.0630646, + "auxiliary_loss_mlp": 0.01253706, + "balance_loss_clip": 0.06250188, + "balance_loss_mlp": 0.01252236, + "epoch": 0.6444611453479633, + "flos": 71232169173120.0, + "grad_norm": 0.7437918033374209, + "language_loss": 0.49586019, + "learning_rate": 1.185661047226603e-06, + "loss": 0.5714618, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01467896, + "step": 10719, + "time_per_iteration": 3.303040027618408 + }, + { + "auxiliary_loss_clip": 0.06416054, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06274591, + "balance_loss_mlp": 0.01253598, + "epoch": 0.6445212686006313, + "flos": 22710766458240.0, + "grad_norm": 1.8616807218185105, + "language_loss": 0.77902591, + "learning_rate": 1.18530534681967e-06, + "loss": 0.8558346, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11212158, + "step": 10720, + "time_per_iteration": 2.4988739490509033 + }, + { + "auxiliary_loss_clip": 0.06409489, + "auxiliary_loss_mlp": 0.01265868, + "balance_loss_clip": 0.06272486, + "balance_loss_mlp": 0.01255556, + "epoch": 0.6445813918532992, + "flos": 21185219496960.0, + "grad_norm": 1.7169707268636247, + "language_loss": 0.77512503, + "learning_rate": 1.18494967730604e-06, + "loss": 0.85187852, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10314941, + "step": 10721, + "time_per_iteration": 2.5300545692443848 + }, + { + "auxiliary_loss_clip": 0.06412297, + "auxiliary_loss_mlp": 0.01265332, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6446415151059672, + "flos": 25198921923840.0, + "grad_norm": 2.0971313720175253, + "language_loss": 0.72901034, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.80578673, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.1060791, + "step": 10722, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.06411985, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06273404, + "balance_loss_mlp": 0.01257149, + "epoch": 0.6447016383586353, + "flos": 25309401932160.0, + "grad_norm": 1.4844277887266815, + "language_loss": 0.78381926, + "learning_rate": 1.184238431012635e-06, + "loss": 0.86060935, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09881592, + "step": 10723, + "time_per_iteration": 2.550785541534424 + }, + { + "auxiliary_loss_clip": 0.06412604, + "auxiliary_loss_mlp": 0.01264685, + "balance_loss_clip": 0.06270273, + "balance_loss_mlp": 0.01253825, + "epoch": 0.6447617616113032, + "flos": 27709523084160.0, + "grad_norm": 1.5774078355025598, + "language_loss": 0.58958089, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.66635382, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10858154, + "step": 10724, + "time_per_iteration": 2.54042387008667 + }, + { + "auxiliary_loss_clip": 0.06404805, + "auxiliary_loss_mlp": 0.012629, + "balance_loss_clip": 0.06271456, + "balance_loss_mlp": 0.0125294, + "epoch": 0.6448218848639712, + "flos": 23045728354560.0, + "grad_norm": 1.8379385823931873, + "language_loss": 0.83613712, + "learning_rate": 1.183527308454271e-06, + "loss": 0.91281414, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 10725, + "time_per_iteration": 3.910567045211792 + }, + { + "auxiliary_loss_clip": 0.06409329, + "auxiliary_loss_mlp": 0.0126531, + "balance_loss_clip": 0.06272514, + "balance_loss_mlp": 0.01255123, + "epoch": 0.6448820081166391, + "flos": 24502569367680.0, + "grad_norm": 1.6966621719955104, + "language_loss": 0.82546258, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.90220898, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10186768, + "step": 10726, + "time_per_iteration": 2.5510244369506836 + }, + { + "auxiliary_loss_clip": 0.06413421, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06271534, + "balance_loss_mlp": 0.01254757, + "epoch": 0.6449421313693071, + "flos": 22425880176000.0, + "grad_norm": 1.8351379370292278, + "language_loss": 0.82230431, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.8990922, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10601807, + "step": 10727, + "time_per_iteration": 4.002009153366089 + }, + { + "auxiliary_loss_clip": 0.0641925, + "auxiliary_loss_mlp": 0.01267298, + "balance_loss_clip": 0.06273851, + "balance_loss_mlp": 0.01255908, + "epoch": 0.645002254621975, + "flos": 20231206035840.0, + "grad_norm": 1.8310574877771004, + "language_loss": 0.79621851, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.87308395, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.1138916, + "step": 10728, + "time_per_iteration": 2.500166416168213 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253767, + "epoch": 0.645062377874643, + "flos": 27862909182720.0, + "grad_norm": 1.7840301112259453, + "language_loss": 0.7434454, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.82021105, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11578369, + "step": 10729, + "time_per_iteration": 2.5444576740264893 + }, + { + "auxiliary_loss_clip": 0.06416906, + "auxiliary_loss_mlp": 0.01269622, + "balance_loss_clip": 0.06276138, + "balance_loss_mlp": 0.01258971, + "epoch": 0.645122501127311, + "flos": 25308563391360.0, + "grad_norm": 1.804382369686425, + "language_loss": 0.66694868, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.74381399, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10662842, + "step": 10730, + "time_per_iteration": 2.557570695877075 + }, + { + "auxiliary_loss_clip": 0.06414691, + "auxiliary_loss_mlp": 0.01268999, + "balance_loss_clip": 0.0627515, + "balance_loss_mlp": 0.01257823, + "epoch": 0.645182624379979, + "flos": 18813371898240.0, + "grad_norm": 1.7610800842195338, + "language_loss": 0.64359826, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.72043514, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11181641, + "step": 10731, + "time_per_iteration": 2.496885299682617 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01253507, + "epoch": 0.6452427476326469, + "flos": 18337979358720.0, + "grad_norm": 1.6539865973631505, + "language_loss": 0.68541694, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.76214296, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10620117, + "step": 10732, + "time_per_iteration": 2.5379278659820557 + }, + { + "auxiliary_loss_clip": 0.06405264, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01257392, + "epoch": 0.6453028708853149, + "flos": 22791505466880.0, + "grad_norm": 1.6003799317808598, + "language_loss": 0.75854611, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.83527917, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10638428, + "step": 10733, + "time_per_iteration": 2.5387895107269287 + }, + { + "auxiliary_loss_clip": 0.06419903, + "auxiliary_loss_mlp": 0.01269065, + "balance_loss_clip": 0.06276383, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6453629941379828, + "flos": 23951888334720.0, + "grad_norm": 1.8221527595961244, + "language_loss": 0.6735214, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.75041103, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1159668, + "step": 10734, + "time_per_iteration": 3.968029260635376 + }, + { + "auxiliary_loss_clip": 0.06404681, + "auxiliary_loss_mlp": 0.01263578, + "balance_loss_clip": 0.06273787, + "balance_loss_mlp": 0.01252694, + "epoch": 0.6454231173906508, + "flos": 17682226905600.0, + "grad_norm": 2.0600495273099377, + "language_loss": 0.7393254, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.81600797, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.10882568, + "step": 10735, + "time_per_iteration": 2.5028645992279053 + }, + { + "auxiliary_loss_clip": 0.06413495, + "auxiliary_loss_mlp": 0.01265876, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01255791, + "epoch": 0.6454832406433189, + "flos": 23299154628480.0, + "grad_norm": 1.713856204545893, + "language_loss": 0.75178444, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.82857811, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10083008, + "step": 10736, + "time_per_iteration": 2.52396821975708 + }, + { + "auxiliary_loss_clip": 0.06414569, + "auxiliary_loss_mlp": 0.01265141, + "balance_loss_clip": 0.06272043, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6455433638959868, + "flos": 20163422482560.0, + "grad_norm": 1.900325282027751, + "language_loss": 0.70704216, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.78383923, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1104126, + "step": 10737, + "time_per_iteration": 2.533444404602051 + }, + { + "auxiliary_loss_clip": 0.06321093, + "auxiliary_loss_mlp": 0.01260403, + "balance_loss_clip": 0.06264752, + "balance_loss_mlp": 0.01258907, + "epoch": 0.6456034871486548, + "flos": 66553391761920.0, + "grad_norm": 0.7654525046837665, + "language_loss": 0.58448923, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.66030419, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01495361, + "step": 10738, + "time_per_iteration": 3.180669069290161 + }, + { + "auxiliary_loss_clip": 0.06409475, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06273897, + "balance_loss_mlp": 0.0125478, + "epoch": 0.6456636104013227, + "flos": 24212819548800.0, + "grad_norm": 2.1666946936849434, + "language_loss": 0.74776822, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.82451332, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1026001, + "step": 10739, + "time_per_iteration": 2.556649923324585 + }, + { + "auxiliary_loss_clip": 0.06415305, + "auxiliary_loss_mlp": 0.01264707, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01254098, + "epoch": 0.6457237336539907, + "flos": 23631013923840.0, + "grad_norm": 1.691973671023819, + "language_loss": 0.71430027, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.79110038, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.1060791, + "step": 10740, + "time_per_iteration": 2.5294902324676514 + }, + { + "auxiliary_loss_clip": 0.06311092, + "auxiliary_loss_mlp": 0.0125644, + "balance_loss_clip": 0.06254861, + "balance_loss_mlp": 0.01255001, + "epoch": 0.6457838569066586, + "flos": 65867437111680.0, + "grad_norm": 1.1432056527915397, + "language_loss": 0.55345345, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.62912881, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01437378, + "step": 10741, + "time_per_iteration": 3.1684045791625977 + }, + { + "auxiliary_loss_clip": 0.06412791, + "auxiliary_loss_mlp": 0.01265658, + "balance_loss_clip": 0.06276751, + "balance_loss_mlp": 0.01255149, + "epoch": 0.6458439801593266, + "flos": 22388424600960.0, + "grad_norm": 1.6129388785112204, + "language_loss": 0.80396634, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.88075083, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1050415, + "step": 10742, + "time_per_iteration": 2.5326621532440186 + }, + { + "auxiliary_loss_clip": 0.06404757, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06273461, + "balance_loss_mlp": 0.01254643, + "epoch": 0.6459041034119946, + "flos": 24795966839040.0, + "grad_norm": 1.5649270887964326, + "language_loss": 0.81750703, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.89420575, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10467529, + "step": 10743, + "time_per_iteration": 2.525972366333008 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06271668, + "balance_loss_mlp": 0.01255377, + "epoch": 0.6459642266646626, + "flos": 18330013221120.0, + "grad_norm": 1.6048937891157424, + "language_loss": 0.71681064, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.79352456, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 10744, + "time_per_iteration": 2.571387767791748 + }, + { + "auxiliary_loss_clip": 0.06408056, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01257753, + "epoch": 0.6460243499173305, + "flos": 43591561672320.0, + "grad_norm": 1.9454844326150766, + "language_loss": 0.67213976, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.74889499, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0970459, + "step": 10745, + "time_per_iteration": 2.6937074661254883 + }, + { + "auxiliary_loss_clip": 0.06406983, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.0627151, + "balance_loss_mlp": 0.01257711, + "epoch": 0.6460844731699985, + "flos": 19249925270400.0, + "grad_norm": 2.096395113743082, + "language_loss": 0.74313092, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.81988549, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10778809, + "step": 10746, + "time_per_iteration": 2.5105156898498535 + }, + { + "auxiliary_loss_clip": 0.06413017, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06273216, + "balance_loss_mlp": 0.01254649, + "epoch": 0.6461445964226664, + "flos": 27460624930560.0, + "grad_norm": 1.4939234449131917, + "language_loss": 0.67274344, + "learning_rate": 1.175713157660413e-06, + "loss": 0.74952662, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10644531, + "step": 10747, + "time_per_iteration": 2.5424420833587646 + }, + { + "auxiliary_loss_clip": 0.0641461, + "auxiliary_loss_mlp": 0.01265405, + "balance_loss_clip": 0.0627532, + "balance_loss_mlp": 0.01255272, + "epoch": 0.6462047196753344, + "flos": 20300457035520.0, + "grad_norm": 1.6454594650819265, + "language_loss": 0.67613244, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.75293255, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10137939, + "step": 10748, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.0641374, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01254764, + "epoch": 0.6462648429280025, + "flos": 22024937589120.0, + "grad_norm": 1.9564061615945416, + "language_loss": 0.76055253, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.83735275, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11517334, + "step": 10749, + "time_per_iteration": 2.5083682537078857 + }, + { + "auxiliary_loss_clip": 0.06411772, + "auxiliary_loss_mlp": 0.01264574, + "balance_loss_clip": 0.0627101, + "balance_loss_mlp": 0.01254208, + "epoch": 0.6463249661806704, + "flos": 27788375376000.0, + "grad_norm": 1.4570564957131642, + "language_loss": 0.77334827, + "learning_rate": 1.17464876058473e-06, + "loss": 0.85011172, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10369873, + "step": 10750, + "time_per_iteration": 2.5812573432922363 + }, + { + "auxiliary_loss_clip": 0.06417309, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258268, + "epoch": 0.6463850894333384, + "flos": 22056481451520.0, + "grad_norm": 2.0670822566581437, + "language_loss": 0.6898241, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.76668882, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10900879, + "step": 10751, + "time_per_iteration": 2.4936625957489014 + }, + { + "auxiliary_loss_clip": 0.06414577, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06273049, + "balance_loss_mlp": 0.0125448, + "epoch": 0.6464452126860063, + "flos": 21112698188160.0, + "grad_norm": 1.7780067956451429, + "language_loss": 0.71182156, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.78861868, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10662842, + "step": 10752, + "time_per_iteration": 3.927877426147461 + }, + { + "auxiliary_loss_clip": 0.06415342, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01253661, + "epoch": 0.6465053359386743, + "flos": 16032531720960.0, + "grad_norm": 1.540910380020274, + "language_loss": 0.77855444, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.85537261, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.12823486, + "step": 10753, + "time_per_iteration": 2.4648597240448 + }, + { + "auxiliary_loss_clip": 0.06412196, + "auxiliary_loss_mlp": 0.01268464, + "balance_loss_clip": 0.0627618, + "balance_loss_mlp": 0.01256871, + "epoch": 0.6465654591913422, + "flos": 23404477610880.0, + "grad_norm": 1.596791967646976, + "language_loss": 0.85541224, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.93221891, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11584473, + "step": 10754, + "time_per_iteration": 2.5978291034698486 + }, + { + "auxiliary_loss_clip": 0.06414384, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275849, + "balance_loss_mlp": 0.01256374, + "epoch": 0.6466255824440102, + "flos": 15382649053440.0, + "grad_norm": 2.138696261718271, + "language_loss": 0.6015234, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.67834014, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10919189, + "step": 10755, + "time_per_iteration": 2.5456504821777344 + }, + { + "auxiliary_loss_clip": 0.06412394, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06275767, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6466857056966782, + "flos": 16258355274240.0, + "grad_norm": 2.6815820423410845, + "language_loss": 0.68557096, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.76238149, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11633301, + "step": 10756, + "time_per_iteration": 2.4882616996765137 + }, + { + "auxiliary_loss_clip": 0.06423604, + "auxiliary_loss_mlp": 0.01266345, + "balance_loss_clip": 0.06278333, + "balance_loss_mlp": 0.01255199, + "epoch": 0.6467458289493462, + "flos": 21184548664320.0, + "grad_norm": 2.427580887606393, + "language_loss": 0.74556214, + "learning_rate": 1.172166263444844e-06, + "loss": 0.82246166, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.1114502, + "step": 10757, + "time_per_iteration": 2.5800364017486572 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.01268605, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01257357, + "epoch": 0.6468059522020141, + "flos": 17974198857600.0, + "grad_norm": 1.6114695233803533, + "language_loss": 0.74794757, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.82467604, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.11248779, + "step": 10758, + "time_per_iteration": 2.537113666534424 + }, + { + "auxiliary_loss_clip": 0.06411805, + "auxiliary_loss_mlp": 0.0127172, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01259715, + "epoch": 0.6468660754546821, + "flos": 17895178857600.0, + "grad_norm": 1.7921091077439633, + "language_loss": 0.6853838, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.76221907, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11999512, + "step": 10759, + "time_per_iteration": 2.5501279830932617 + }, + { + "auxiliary_loss_clip": 0.06419058, + "auxiliary_loss_mlp": 0.01268931, + "balance_loss_clip": 0.0627493, + "balance_loss_mlp": 0.01257666, + "epoch": 0.64692619870735, + "flos": 22607497900800.0, + "grad_norm": 1.5782597023408493, + "language_loss": 0.75492609, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.831806, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11273193, + "step": 10760, + "time_per_iteration": 2.5426504611968994 + }, + { + "auxiliary_loss_clip": 0.06408913, + "auxiliary_loss_mlp": 0.0126904, + "balance_loss_clip": 0.06273125, + "balance_loss_mlp": 0.01258621, + "epoch": 0.646986321960018, + "flos": 49611863750400.0, + "grad_norm": 1.5088139829750542, + "language_loss": 0.65700191, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 10761, + "time_per_iteration": 2.8235716819763184 + }, + { + "auxiliary_loss_clip": 0.0641157, + "auxiliary_loss_mlp": 0.01270239, + "balance_loss_clip": 0.06273905, + "balance_loss_mlp": 0.0125886, + "epoch": 0.6470464452126861, + "flos": 21914960705280.0, + "grad_norm": 4.087602702214583, + "language_loss": 0.70041698, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.77723515, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11376953, + "step": 10762, + "time_per_iteration": 2.4962708950042725 + }, + { + "auxiliary_loss_clip": 0.06415009, + "auxiliary_loss_mlp": 0.01270412, + "balance_loss_clip": 0.06273261, + "balance_loss_mlp": 0.01259039, + "epoch": 0.647106568465354, + "flos": 18110688359040.0, + "grad_norm": 2.044366921559264, + "language_loss": 0.82845706, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.90531123, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11364746, + "step": 10763, + "time_per_iteration": 2.5127148628234863 + }, + { + "auxiliary_loss_clip": 0.06315573, + "auxiliary_loss_mlp": 0.01250562, + "balance_loss_clip": 0.06259283, + "balance_loss_mlp": 0.01249394, + "epoch": 0.647166691718022, + "flos": 69499623899520.0, + "grad_norm": 0.6915624783517184, + "language_loss": 0.5774473, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.65310872, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01165009, + "step": 10764, + "time_per_iteration": 4.764317035675049 + }, + { + "auxiliary_loss_clip": 0.06411065, + "auxiliary_loss_mlp": 0.01264999, + "balance_loss_clip": 0.06273772, + "balance_loss_mlp": 0.01254532, + "epoch": 0.6472268149706899, + "flos": 34103793934080.0, + "grad_norm": 1.637421021891431, + "language_loss": 0.60742128, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.68418187, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10467529, + "step": 10765, + "time_per_iteration": 2.6306469440460205 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258924, + "epoch": 0.6472869382233579, + "flos": 28118809152000.0, + "grad_norm": 2.0826927975642273, + "language_loss": 0.63338971, + "learning_rate": 1.168976742243437e-06, + "loss": 0.71016824, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1027832, + "step": 10766, + "time_per_iteration": 2.608025074005127 + }, + { + "auxiliary_loss_clip": 0.06411771, + "auxiliary_loss_mlp": 0.01268357, + "balance_loss_clip": 0.06273695, + "balance_loss_mlp": 0.01257616, + "epoch": 0.6473470614760258, + "flos": 22498736901120.0, + "grad_norm": 1.6916160768027213, + "language_loss": 0.75775635, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.83455759, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10736084, + "step": 10767, + "time_per_iteration": 3.9129326343536377 + }, + { + "auxiliary_loss_clip": 0.06410106, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01255613, + "epoch": 0.6474071847286939, + "flos": 14544314553600.0, + "grad_norm": 1.8076972632130168, + "language_loss": 0.77841228, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.85518134, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11187744, + "step": 10768, + "time_per_iteration": 2.5130937099456787 + }, + { + "auxiliary_loss_clip": 0.06411847, + "auxiliary_loss_mlp": 0.01266069, + "balance_loss_clip": 0.06274557, + "balance_loss_mlp": 0.01255894, + "epoch": 0.6474673079813618, + "flos": 24105190579200.0, + "grad_norm": 1.6392494709530092, + "language_loss": 0.71794009, + "learning_rate": 1.167914135250663e-06, + "loss": 0.79471928, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10162354, + "step": 10769, + "time_per_iteration": 2.5274879932403564 + }, + { + "auxiliary_loss_clip": 0.06409761, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06276036, + "balance_loss_mlp": 0.01256985, + "epoch": 0.6475274312340298, + "flos": 14981538758400.0, + "grad_norm": 1.8331179769777781, + "language_loss": 0.73102438, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.80779225, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10040283, + "step": 10770, + "time_per_iteration": 2.4902164936065674 + }, + { + "auxiliary_loss_clip": 0.0641522, + "auxiliary_loss_mlp": 0.01267005, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254357, + "epoch": 0.6475875544866977, + "flos": 25052202224640.0, + "grad_norm": 1.6464816515513445, + "language_loss": 0.73554993, + "learning_rate": 1.167205888330325e-06, + "loss": 0.81237221, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12646484, + "step": 10771, + "time_per_iteration": 2.5617709159851074 + }, + { + "auxiliary_loss_clip": 0.06412145, + "auxiliary_loss_mlp": 0.0126638, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01255324, + "epoch": 0.6476476777393657, + "flos": 16477763990400.0, + "grad_norm": 2.394956758167514, + "language_loss": 0.74415565, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.82094085, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1105957, + "step": 10772, + "time_per_iteration": 2.54032826423645 + }, + { + "auxiliary_loss_clip": 0.06408937, + "auxiliary_loss_mlp": 0.01268327, + "balance_loss_clip": 0.06275553, + "balance_loss_mlp": 0.01258987, + "epoch": 0.6477078009920336, + "flos": 25819399008000.0, + "grad_norm": 1.4893197324025274, + "language_loss": 0.82968116, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.90645373, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09338379, + "step": 10773, + "time_per_iteration": 3.9616613388061523 + }, + { + "auxiliary_loss_clip": 0.06405786, + "auxiliary_loss_mlp": 0.01267593, + "balance_loss_clip": 0.06272345, + "balance_loss_mlp": 0.01257776, + "epoch": 0.6477679242447016, + "flos": 17681933416320.0, + "grad_norm": 1.4328505723610274, + "language_loss": 0.78670597, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.8634398, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0982666, + "step": 10774, + "time_per_iteration": 2.471349000930786 + }, + { + "auxiliary_loss_clip": 0.06414998, + "auxiliary_loss_mlp": 0.01270742, + "balance_loss_clip": 0.0627519, + "balance_loss_mlp": 0.01259406, + "epoch": 0.6478280474973696, + "flos": 21038583651840.0, + "grad_norm": 2.0152385899029763, + "language_loss": 0.69592845, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.7727859, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11346436, + "step": 10775, + "time_per_iteration": 2.518340826034546 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01267491, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256792, + "epoch": 0.6478881707500376, + "flos": 21623449950720.0, + "grad_norm": 1.6656343992417288, + "language_loss": 0.65808022, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.73492104, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.10705566, + "step": 10776, + "time_per_iteration": 2.511101722717285 + }, + { + "auxiliary_loss_clip": 0.06413212, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06273791, + "balance_loss_mlp": 0.01257343, + "epoch": 0.6479482940027056, + "flos": 18448543221120.0, + "grad_norm": 2.2928682482209015, + "language_loss": 0.79598206, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.87279832, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11096191, + "step": 10777, + "time_per_iteration": 2.554004669189453 + }, + { + "auxiliary_loss_clip": 0.06412454, + "auxiliary_loss_mlp": 0.01266878, + "balance_loss_clip": 0.06275424, + "balance_loss_mlp": 0.01256311, + "epoch": 0.6480084172553735, + "flos": 22170651039360.0, + "grad_norm": 1.8955877147463427, + "language_loss": 0.74017107, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.81696445, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10571289, + "step": 10778, + "time_per_iteration": 2.5087220668792725 + }, + { + "auxiliary_loss_clip": 0.06407086, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06272884, + "balance_loss_mlp": 0.01253694, + "epoch": 0.6480685405080415, + "flos": 24323089921920.0, + "grad_norm": 1.3775726820823926, + "language_loss": 0.78463447, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.86134601, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10388184, + "step": 10779, + "time_per_iteration": 2.5677905082702637 + }, + { + "auxiliary_loss_clip": 0.06319194, + "auxiliary_loss_mlp": 0.01254794, + "balance_loss_clip": 0.06262461, + "balance_loss_mlp": 0.01253526, + "epoch": 0.6481286637607094, + "flos": 59910348539520.0, + "grad_norm": 0.7063734620210058, + "language_loss": 0.59437895, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.67011881, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01268005, + "step": 10780, + "time_per_iteration": 3.11826229095459 + }, + { + "auxiliary_loss_clip": 0.06409959, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01254974, + "epoch": 0.6481887870133775, + "flos": 25491313146240.0, + "grad_norm": 1.83776143864241, + "language_loss": 0.79705411, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.87380326, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 10781, + "time_per_iteration": 2.5406956672668457 + }, + { + "auxiliary_loss_clip": 0.06418487, + "auxiliary_loss_mlp": 0.0127058, + "balance_loss_clip": 0.06278095, + "balance_loss_mlp": 0.01258636, + "epoch": 0.6482489102660454, + "flos": 19935041379840.0, + "grad_norm": 2.151495176949557, + "language_loss": 0.78676552, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.86365616, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11938477, + "step": 10782, + "time_per_iteration": 2.5015201568603516 + }, + { + "auxiliary_loss_clip": 0.06412151, + "auxiliary_loss_mlp": 0.01268158, + "balance_loss_clip": 0.06272621, + "balance_loss_mlp": 0.01257525, + "epoch": 0.6483090335187134, + "flos": 26986741764480.0, + "grad_norm": 3.0083350466584378, + "language_loss": 0.64055502, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.71735811, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10632324, + "step": 10783, + "time_per_iteration": 2.536803960800171 + }, + { + "auxiliary_loss_clip": 0.06416991, + "auxiliary_loss_mlp": 0.01269846, + "balance_loss_clip": 0.06275127, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6483691567713813, + "flos": 25084207284480.0, + "grad_norm": 1.8907849838824615, + "language_loss": 0.89016545, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.96703386, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11431885, + "step": 10784, + "time_per_iteration": 2.5452053546905518 + }, + { + "auxiliary_loss_clip": 0.0641108, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 0.06276603, + "balance_loss_mlp": 0.01254636, + "epoch": 0.6484292800240493, + "flos": 16111300158720.0, + "grad_norm": 2.486751490302504, + "language_loss": 0.73449266, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.81126773, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.11798096, + "step": 10785, + "time_per_iteration": 2.4847772121429443 + }, + { + "auxiliary_loss_clip": 0.06405519, + "auxiliary_loss_mlp": 0.01266455, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6484894032767172, + "flos": 28848005308800.0, + "grad_norm": 1.4322253483725718, + "language_loss": 0.69456708, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.77128685, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0994873, + "step": 10786, + "time_per_iteration": 2.585789918899536 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06272955, + "balance_loss_mlp": 0.01259122, + "epoch": 0.6485495265293852, + "flos": 30234924489600.0, + "grad_norm": 2.0420211875900285, + "language_loss": 0.71877193, + "learning_rate": 1.161544469455041e-06, + "loss": 0.79556048, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10760498, + "step": 10787, + "time_per_iteration": 2.566206216812134 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01266479, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01255595, + "epoch": 0.6486096497820532, + "flos": 20088050135040.0, + "grad_norm": 1.7621323533283269, + "language_loss": 0.84403133, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.92081404, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10882568, + "step": 10788, + "time_per_iteration": 2.482072353363037 + }, + { + "auxiliary_loss_clip": 0.06410778, + "auxiliary_loss_mlp": 0.01268935, + "balance_loss_clip": 0.06273876, + "balance_loss_mlp": 0.01258111, + "epoch": 0.6486697730347212, + "flos": 17134816181760.0, + "grad_norm": 2.2095301330311643, + "language_loss": 0.77364171, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.85043883, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10821533, + "step": 10789, + "time_per_iteration": 2.5368380546569824 + }, + { + "auxiliary_loss_clip": 0.06408279, + "auxiliary_loss_mlp": 0.01268929, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01258606, + "epoch": 0.6487298962873892, + "flos": 38921477886720.0, + "grad_norm": 1.570352466870208, + "language_loss": 0.76618487, + "learning_rate": 1.160483857897479e-06, + "loss": 0.8429569, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10327148, + "step": 10790, + "time_per_iteration": 2.6590943336486816 + }, + { + "auxiliary_loss_clip": 0.06408708, + "auxiliary_loss_mlp": 0.01266087, + "balance_loss_clip": 0.0627384, + "balance_loss_mlp": 0.01256169, + "epoch": 0.6487900195400571, + "flos": 11952680895360.0, + "grad_norm": 2.134716405653686, + "language_loss": 0.59979677, + "learning_rate": 1.160130384362823e-06, + "loss": 0.67654467, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09924316, + "step": 10791, + "time_per_iteration": 3.963503360748291 + }, + { + "auxiliary_loss_clip": 0.06410848, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06274579, + "balance_loss_mlp": 0.01258646, + "epoch": 0.6488501427927251, + "flos": 22350717463680.0, + "grad_norm": 1.5491724826349689, + "language_loss": 0.8594861, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.93628347, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10253906, + "step": 10792, + "time_per_iteration": 2.555723190307617 + }, + { + "auxiliary_loss_clip": 0.06414551, + "auxiliary_loss_mlp": 0.01268197, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256419, + "epoch": 0.648910266045393, + "flos": 22242753077760.0, + "grad_norm": 1.7314529044761888, + "language_loss": 0.78069973, + "learning_rate": 1.159423532850735e-06, + "loss": 0.85752726, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11773682, + "step": 10793, + "time_per_iteration": 2.5019938945770264 + }, + { + "auxiliary_loss_clip": 0.06413871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06274308, + "balance_loss_mlp": 0.01257367, + "epoch": 0.6489703892980611, + "flos": 25308269902080.0, + "grad_norm": 1.950729669882986, + "language_loss": 0.74567354, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.82249475, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10882568, + "step": 10794, + "time_per_iteration": 2.5795669555664062 + }, + { + "auxiliary_loss_clip": 0.06410497, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.01254655, + "epoch": 0.649030512550729, + "flos": 24578864110080.0, + "grad_norm": 1.8148879038848986, + "language_loss": 0.699453, + "learning_rate": 1.158716808837621e-06, + "loss": 0.77621716, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11273193, + "step": 10795, + "time_per_iteration": 2.538400173187256 + }, + { + "auxiliary_loss_clip": 0.06416844, + "auxiliary_loss_mlp": 0.01273855, + "balance_loss_clip": 0.06276066, + "balance_loss_mlp": 0.01261964, + "epoch": 0.649090635803397, + "flos": 26251004989440.0, + "grad_norm": 1.9678382508243188, + "language_loss": 0.54238826, + "learning_rate": 1.158363494676679e-06, + "loss": 0.61929524, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11895752, + "step": 10796, + "time_per_iteration": 2.6402297019958496 + }, + { + "auxiliary_loss_clip": 0.06412029, + "auxiliary_loss_mlp": 0.01265233, + "balance_loss_clip": 0.06273568, + "balance_loss_mlp": 0.01254767, + "epoch": 0.6491507590560649, + "flos": 24944489400960.0, + "grad_norm": 1.676360773921332, + "language_loss": 0.77936971, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.85614228, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10467529, + "step": 10797, + "time_per_iteration": 2.5467689037323 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01255935, + "epoch": 0.6492108823087329, + "flos": 19505783312640.0, + "grad_norm": 3.2369805565604053, + "language_loss": 0.7037648, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.78047633, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09729004, + "step": 10798, + "time_per_iteration": 2.5187807083129883 + }, + { + "auxiliary_loss_clip": 0.06409095, + "auxiliary_loss_mlp": 0.01266435, + "balance_loss_clip": 0.06272874, + "balance_loss_mlp": 0.01256493, + "epoch": 0.6492710055614008, + "flos": 19725443591040.0, + "grad_norm": 1.928025975497767, + "language_loss": 0.77484357, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.85159886, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09942627, + "step": 10799, + "time_per_iteration": 2.4996323585510254 + }, + { + "auxiliary_loss_clip": 0.06416353, + "auxiliary_loss_mlp": 0.01268958, + "balance_loss_clip": 0.06274813, + "balance_loss_mlp": 0.01257466, + "epoch": 0.6493311288140688, + "flos": 24324012316800.0, + "grad_norm": 1.6859277521525557, + "language_loss": 0.72046328, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.79731631, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11480713, + "step": 10800, + "time_per_iteration": 2.5757715702056885 + }, + { + "auxiliary_loss_clip": 0.06306565, + "auxiliary_loss_mlp": 0.01256479, + "balance_loss_clip": 0.06250083, + "balance_loss_mlp": 0.01255134, + "epoch": 0.6493912520667368, + "flos": 70953655800960.0, + "grad_norm": 0.743676703722325, + "language_loss": 0.60158885, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.67721927, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01346588, + "step": 10801, + "time_per_iteration": 3.246039867401123 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01272232, + "balance_loss_clip": 0.06277107, + "balance_loss_mlp": 0.01260782, + "epoch": 0.6494513753194048, + "flos": 25344803082240.0, + "grad_norm": 1.7594241437691729, + "language_loss": 0.78884411, + "learning_rate": 1.156244280393614e-06, + "loss": 0.86572272, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11456299, + "step": 10802, + "time_per_iteration": 2.563626766204834 + }, + { + "auxiliary_loss_clip": 0.06407687, + "auxiliary_loss_mlp": 0.01265006, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01254385, + "epoch": 0.6495114985720728, + "flos": 24689050629120.0, + "grad_norm": 1.4701116877862836, + "language_loss": 0.7461825, + "learning_rate": 1.155891189918541e-06, + "loss": 0.82290947, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10620117, + "step": 10803, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.06410737, + "auxiliary_loss_mlp": 0.01268913, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01258112, + "epoch": 0.6495716218247407, + "flos": 23656520292480.0, + "grad_norm": 2.024891036997784, + "language_loss": 0.6987229, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.77551937, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10803223, + "step": 10804, + "time_per_iteration": 3.998316526412964 + }, + { + "auxiliary_loss_clip": 0.06410199, + "auxiliary_loss_mlp": 0.01264742, + "balance_loss_clip": 0.0627581, + "balance_loss_mlp": 0.01254019, + "epoch": 0.6496317450774087, + "flos": 22352729961600.0, + "grad_norm": 1.61833096357978, + "language_loss": 0.72940427, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.80615366, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1072998, + "step": 10805, + "time_per_iteration": 2.550152540206909 + }, + { + "auxiliary_loss_clip": 0.06408597, + "auxiliary_loss_mlp": 0.01268433, + "balance_loss_clip": 0.06270424, + "balance_loss_mlp": 0.01258384, + "epoch": 0.6496918683300766, + "flos": 30526519098240.0, + "grad_norm": 1.9854028073217467, + "language_loss": 0.66420656, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.74097693, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 10806, + "time_per_iteration": 4.017642021179199 + }, + { + "auxiliary_loss_clip": 0.06412096, + "auxiliary_loss_mlp": 0.01270405, + "balance_loss_clip": 0.06272469, + "balance_loss_mlp": 0.01259587, + "epoch": 0.6497519915827447, + "flos": 12463977709440.0, + "grad_norm": 2.120421469188937, + "language_loss": 0.79874885, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.87557387, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10821533, + "step": 10807, + "time_per_iteration": 2.47318959236145 + }, + { + "auxiliary_loss_clip": 0.06308749, + "auxiliary_loss_mlp": 0.01254009, + "balance_loss_clip": 0.06252696, + "balance_loss_mlp": 0.01252862, + "epoch": 0.6498121148354126, + "flos": 69115787544960.0, + "grad_norm": 0.7752767775633225, + "language_loss": 0.5892998, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.66492736, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01145935, + "step": 10808, + "time_per_iteration": 3.316317319869995 + }, + { + "auxiliary_loss_clip": 0.06407646, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06275291, + "balance_loss_mlp": 0.01257043, + "epoch": 0.6498722380880806, + "flos": 36904983454080.0, + "grad_norm": 1.693655644054658, + "language_loss": 0.63518184, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.71192998, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10131836, + "step": 10809, + "time_per_iteration": 2.6661953926086426 + }, + { + "auxiliary_loss_clip": 0.06407648, + "auxiliary_loss_mlp": 0.01268298, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01258499, + "epoch": 0.6499323613407485, + "flos": 29024549861760.0, + "grad_norm": 1.455455865849343, + "language_loss": 0.81994486, + "learning_rate": 1.153420453586008e-06, + "loss": 0.89670432, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09796143, + "step": 10810, + "time_per_iteration": 2.582893133163452 + }, + { + "auxiliary_loss_clip": 0.06403928, + "auxiliary_loss_mlp": 0.01273294, + "balance_loss_clip": 0.06272624, + "balance_loss_mlp": 0.01263382, + "epoch": 0.6499924845934165, + "flos": 20125212220800.0, + "grad_norm": 1.5531414073118446, + "language_loss": 0.71929145, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.79606366, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 10811, + "time_per_iteration": 2.5130205154418945 + }, + { + "auxiliary_loss_clip": 0.06403043, + "auxiliary_loss_mlp": 0.01269239, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01259273, + "epoch": 0.6500526078460844, + "flos": 24427490509440.0, + "grad_norm": 1.5864651817553501, + "language_loss": 0.78127778, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.85800058, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09967041, + "step": 10812, + "time_per_iteration": 2.5567028522491455 + }, + { + "auxiliary_loss_clip": 0.06411995, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 0.06273413, + "balance_loss_mlp": 0.01258887, + "epoch": 0.6501127310987524, + "flos": 23337700306560.0, + "grad_norm": 1.8208092909693303, + "language_loss": 0.85530257, + "learning_rate": 1.152362047854413e-06, + "loss": 0.93212128, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10992432, + "step": 10813, + "time_per_iteration": 3.9791102409362793 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01268379, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01257955, + "epoch": 0.6501728543514204, + "flos": 18703814284800.0, + "grad_norm": 1.7861415482224605, + "language_loss": 0.80307227, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.87985992, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10424805, + "step": 10814, + "time_per_iteration": 2.4790940284729004 + }, + { + "auxiliary_loss_clip": 0.06415637, + "auxiliary_loss_mlp": 0.01266919, + "balance_loss_clip": 0.06275604, + "balance_loss_mlp": 0.01256119, + "epoch": 0.6502329776040884, + "flos": 44209858550400.0, + "grad_norm": 1.5485248232594282, + "language_loss": 0.65536499, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.73219061, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10803223, + "step": 10815, + "time_per_iteration": 2.7446234226226807 + }, + { + "auxiliary_loss_clip": 0.06417957, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_clip": 0.06274943, + "balance_loss_mlp": 0.01257667, + "epoch": 0.6502931008567564, + "flos": 14580009192960.0, + "grad_norm": 1.8474906541134053, + "language_loss": 0.75516546, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.83205009, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.12841797, + "step": 10816, + "time_per_iteration": 2.4595513343811035 + }, + { + "auxiliary_loss_clip": 0.06411922, + "auxiliary_loss_mlp": 0.01272269, + "balance_loss_clip": 0.06278138, + "balance_loss_mlp": 0.01261845, + "epoch": 0.6503532241094243, + "flos": 21400980560640.0, + "grad_norm": 1.6906297848786114, + "language_loss": 0.73428237, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.81112432, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 10817, + "time_per_iteration": 2.5484201908111572 + }, + { + "auxiliary_loss_clip": 0.06410678, + "auxiliary_loss_mlp": 0.01266458, + "balance_loss_clip": 0.06273761, + "balance_loss_mlp": 0.01255783, + "epoch": 0.6504133473620923, + "flos": 74756349648000.0, + "grad_norm": 1.454828626029086, + "language_loss": 0.71655715, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.79332852, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10675049, + "step": 10818, + "time_per_iteration": 2.908658504486084 + }, + { + "auxiliary_loss_clip": 0.06415702, + "auxiliary_loss_mlp": 0.01267764, + "balance_loss_clip": 0.06276265, + "balance_loss_mlp": 0.01257261, + "epoch": 0.6504734706147602, + "flos": 19718399848320.0, + "grad_norm": 2.191602402717942, + "language_loss": 0.64758539, + "learning_rate": 1.150246104600249e-06, + "loss": 0.72442001, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10510254, + "step": 10819, + "time_per_iteration": 2.5333735942840576 + }, + { + "auxiliary_loss_clip": 0.06412923, + "auxiliary_loss_mlp": 0.01268465, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01257849, + "epoch": 0.6505335938674283, + "flos": 25563960236160.0, + "grad_norm": 1.7905989506117173, + "language_loss": 0.83637512, + "learning_rate": 1.14989356009286e-06, + "loss": 0.91318899, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10614014, + "step": 10820, + "time_per_iteration": 2.5265371799468994 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06276121, + "balance_loss_mlp": 0.01256278, + "epoch": 0.6505937171200962, + "flos": 17827143742080.0, + "grad_norm": 2.110303525663697, + "language_loss": 0.78078735, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.85763657, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11755371, + "step": 10821, + "time_per_iteration": 2.5157594680786133 + }, + { + "auxiliary_loss_clip": 0.06407174, + "auxiliary_loss_mlp": 0.01267611, + "balance_loss_clip": 0.06274926, + "balance_loss_mlp": 0.01258193, + "epoch": 0.6506538403727642, + "flos": 20674467734400.0, + "grad_norm": 1.345963122833849, + "language_loss": 0.79950106, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.8762489, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09417725, + "step": 10822, + "time_per_iteration": 2.556008815765381 + }, + { + "auxiliary_loss_clip": 0.06409828, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254489, + "epoch": 0.6507139636254321, + "flos": 11724970625280.0, + "grad_norm": 1.7704738467059193, + "language_loss": 0.87903178, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.95578313, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1081543, + "step": 10823, + "time_per_iteration": 2.5153284072875977 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01252885, + "epoch": 0.6507740868781001, + "flos": 26769177838080.0, + "grad_norm": 1.5876907781405154, + "language_loss": 0.66698307, + "learning_rate": 1.148483704558183e-06, + "loss": 0.74372518, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 10824, + "time_per_iteration": 2.5415477752685547 + }, + { + "auxiliary_loss_clip": 0.06414588, + "auxiliary_loss_mlp": 0.01270098, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01259471, + "epoch": 0.650834210130768, + "flos": 16477260865920.0, + "grad_norm": 2.5628817527572365, + "language_loss": 0.88034272, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.95718956, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10632324, + "step": 10825, + "time_per_iteration": 2.5432024002075195 + }, + { + "auxiliary_loss_clip": 0.06415717, + "auxiliary_loss_mlp": 0.01269359, + "balance_loss_clip": 0.0627567, + "balance_loss_mlp": 0.012577, + "epoch": 0.650894333383436, + "flos": 17134354984320.0, + "grad_norm": 2.078178971450375, + "language_loss": 0.73451078, + "learning_rate": 1.147778970474885e-06, + "loss": 0.81136155, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11651611, + "step": 10826, + "time_per_iteration": 2.483405113220215 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06277563, + "balance_loss_mlp": 0.01255057, + "epoch": 0.650954456636104, + "flos": 18740221683840.0, + "grad_norm": 2.050300118391263, + "language_loss": 0.69847488, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.7752744, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10107422, + "step": 10827, + "time_per_iteration": 2.529306650161743 + }, + { + "auxiliary_loss_clip": 0.06416346, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.0627773, + "balance_loss_mlp": 0.01256479, + "epoch": 0.651014579888772, + "flos": 24533987448960.0, + "grad_norm": 2.390068067700356, + "language_loss": 0.77023715, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.84707546, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10828, + "time_per_iteration": 2.5035903453826904 + }, + { + "auxiliary_loss_clip": 0.06409818, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06275382, + "balance_loss_mlp": 0.01252961, + "epoch": 0.65107470314144, + "flos": 24067944639360.0, + "grad_norm": 1.7088923896554455, + "language_loss": 0.89246607, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.96919769, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10388184, + "step": 10829, + "time_per_iteration": 2.51090931892395 + }, + { + "auxiliary_loss_clip": 0.06314664, + "auxiliary_loss_mlp": 0.0125328, + "balance_loss_clip": 0.06258522, + "balance_loss_mlp": 0.01251908, + "epoch": 0.6511348263941079, + "flos": 72502304561280.0, + "grad_norm": 0.6366010219235949, + "language_loss": 0.55376649, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.62944591, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01374817, + "step": 10830, + "time_per_iteration": 3.2892563343048096 + }, + { + "auxiliary_loss_clip": 0.06424817, + "auxiliary_loss_mlp": 0.01266709, + "balance_loss_clip": 0.06282428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6511949496467759, + "flos": 23374401194880.0, + "grad_norm": 2.1202653739592026, + "language_loss": 0.75132632, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.82824159, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11334229, + "step": 10831, + "time_per_iteration": 4.007694482803345 + }, + { + "auxiliary_loss_clip": 0.06315142, + "auxiliary_loss_mlp": 0.01253248, + "balance_loss_clip": 0.06259014, + "balance_loss_mlp": 0.01251801, + "epoch": 0.6512550728994438, + "flos": 67353390218880.0, + "grad_norm": 0.6347055670227107, + "language_loss": 0.51072258, + "learning_rate": 1.145665544243828e-06, + "loss": 0.58640647, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01445007, + "step": 10832, + "time_per_iteration": 3.2983696460723877 + }, + { + "auxiliary_loss_clip": 0.06417792, + "auxiliary_loss_mlp": 0.01264906, + "balance_loss_clip": 0.06276103, + "balance_loss_mlp": 0.01254195, + "epoch": 0.6513151961521119, + "flos": 21147973557120.0, + "grad_norm": 2.2140276605758693, + "language_loss": 0.8367548, + "learning_rate": 1.145313419848316e-06, + "loss": 0.91358173, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10699463, + "step": 10833, + "time_per_iteration": 2.511261463165283 + }, + { + "auxiliary_loss_clip": 0.06416205, + "auxiliary_loss_mlp": 0.01266301, + "balance_loss_clip": 0.06280707, + "balance_loss_mlp": 0.01255471, + "epoch": 0.6513753194047798, + "flos": 15164246586240.0, + "grad_norm": 10.86743731426701, + "language_loss": 0.84111547, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.9179405, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1083374, + "step": 10834, + "time_per_iteration": 2.4789986610412598 + }, + { + "auxiliary_loss_clip": 0.06416395, + "auxiliary_loss_mlp": 0.01267897, + "balance_loss_clip": 0.06278732, + "balance_loss_mlp": 0.01257979, + "epoch": 0.6514354426574478, + "flos": 30234421365120.0, + "grad_norm": 1.7456774308536143, + "language_loss": 0.77525127, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.85209417, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09918213, + "step": 10835, + "time_per_iteration": 2.588974714279175 + }, + { + "auxiliary_loss_clip": 0.06414215, + "auxiliary_loss_mlp": 0.01268341, + "balance_loss_clip": 0.06276295, + "balance_loss_mlp": 0.01257624, + "epoch": 0.6514955659101157, + "flos": 24212232570240.0, + "grad_norm": 5.683759297238724, + "language_loss": 0.77732491, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.85415047, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10717773, + "step": 10836, + "time_per_iteration": 2.5676357746124268 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01256552, + "epoch": 0.6515556891627837, + "flos": 12381351984000.0, + "grad_norm": 1.8169643503490496, + "language_loss": 0.82167637, + "learning_rate": 1.143905246497783e-06, + "loss": 0.8984952, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.1060791, + "step": 10837, + "time_per_iteration": 2.483123779296875 + }, + { + "auxiliary_loss_clip": 0.06414027, + "auxiliary_loss_mlp": 0.01269762, + "balance_loss_clip": 0.06281339, + "balance_loss_mlp": 0.01258753, + "epoch": 0.6516158124154516, + "flos": 49612366874880.0, + "grad_norm": 1.9745505880128194, + "language_loss": 0.59549761, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.67233551, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.11004639, + "step": 10838, + "time_per_iteration": 2.762786865234375 + }, + { + "auxiliary_loss_clip": 0.06317103, + "auxiliary_loss_mlp": 0.01253866, + "balance_loss_clip": 0.06261341, + "balance_loss_mlp": 0.01252529, + "epoch": 0.6516759356681197, + "flos": 59720848531200.0, + "grad_norm": 0.7135395932752281, + "language_loss": 0.60686612, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.68257582, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01338196, + "step": 10839, + "time_per_iteration": 3.223712921142578 + }, + { + "auxiliary_loss_clip": 0.06412867, + "auxiliary_loss_mlp": 0.01261941, + "balance_loss_clip": 0.06278579, + "balance_loss_mlp": 0.01252809, + "epoch": 0.6517360589207876, + "flos": 37459815264000.0, + "grad_norm": 1.5945463275519725, + "language_loss": 0.67963755, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.75638568, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09136963, + "step": 10840, + "time_per_iteration": 2.6288609504699707 + }, + { + "auxiliary_loss_clip": 0.06418526, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06281736, + "balance_loss_mlp": 0.01254264, + "epoch": 0.6517961821734556, + "flos": 25382049022080.0, + "grad_norm": 2.724184034803811, + "language_loss": 0.73645818, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.81329072, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10461426, + "step": 10841, + "time_per_iteration": 2.6020925045013428 + }, + { + "auxiliary_loss_clip": 0.06416935, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0627799, + "balance_loss_mlp": 0.01252632, + "epoch": 0.6518563054261236, + "flos": 28774519678080.0, + "grad_norm": 1.3493483862035613, + "language_loss": 0.6300385, + "learning_rate": 1.142145760331648e-06, + "loss": 0.7068457, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11151123, + "step": 10842, + "time_per_iteration": 2.550992012023926 + }, + { + "auxiliary_loss_clip": 0.06321006, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06265368, + "balance_loss_mlp": 0.01249527, + "epoch": 0.6519164286787915, + "flos": 68942905372800.0, + "grad_norm": 0.8268303815829595, + "language_loss": 0.56121087, + "learning_rate": 1.141793960634807e-06, + "loss": 0.6369288, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01258087, + "step": 10843, + "time_per_iteration": 4.4302709102630615 + }, + { + "auxiliary_loss_clip": 0.06418709, + "auxiliary_loss_mlp": 0.01268693, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01256844, + "epoch": 0.6519765519314595, + "flos": 20447009026560.0, + "grad_norm": 1.9018808017225726, + "language_loss": 0.83082736, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.90770137, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11846924, + "step": 10844, + "time_per_iteration": 2.600843906402588 + }, + { + "auxiliary_loss_clip": 0.06412451, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06274604, + "balance_loss_mlp": 0.01254598, + "epoch": 0.6520366751841274, + "flos": 28410571468800.0, + "grad_norm": 1.712600797448846, + "language_loss": 0.60434437, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.68112737, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11260986, + "step": 10845, + "time_per_iteration": 2.5539886951446533 + }, + { + "auxiliary_loss_clip": 0.0641913, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.0628117, + "balance_loss_mlp": 0.01256964, + "epoch": 0.6520967984367955, + "flos": 22279999017600.0, + "grad_norm": 1.7154837264423382, + "language_loss": 0.79721403, + "learning_rate": 1.140738756857194e-06, + "loss": 0.87408507, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11010742, + "step": 10846, + "time_per_iteration": 3.9483704566955566 + }, + { + "auxiliary_loss_clip": 0.06323321, + "auxiliary_loss_mlp": 0.01252083, + "balance_loss_clip": 0.06267467, + "balance_loss_mlp": 0.01250644, + "epoch": 0.6521569216894634, + "flos": 68940123459840.0, + "grad_norm": 0.9959560363450068, + "language_loss": 0.60117191, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.67692602, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01437378, + "step": 10847, + "time_per_iteration": 3.259263277053833 + }, + { + "auxiliary_loss_clip": 0.06419109, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06278636, + "balance_loss_mlp": 0.01255196, + "epoch": 0.6522170449421314, + "flos": 29137880908800.0, + "grad_norm": 1.6024469489184654, + "language_loss": 0.81200469, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.88886106, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11322021, + "step": 10848, + "time_per_iteration": 2.5693862438201904 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.0127236, + "balance_loss_clip": 0.06276944, + "balance_loss_mlp": 0.01262072, + "epoch": 0.6522771681947993, + "flos": 26659284808320.0, + "grad_norm": 2.0899993216020527, + "language_loss": 0.74621618, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.82307267, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10284424, + "step": 10849, + "time_per_iteration": 2.636046886444092 + }, + { + "auxiliary_loss_clip": 0.06412181, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06278128, + "balance_loss_mlp": 0.01258129, + "epoch": 0.6523372914474673, + "flos": 25746961553280.0, + "grad_norm": 1.4470039882385268, + "language_loss": 0.68371421, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.76052451, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1072998, + "step": 10850, + "time_per_iteration": 2.52057147026062 + }, + { + "auxiliary_loss_clip": 0.0640863, + "auxiliary_loss_mlp": 0.01263783, + "balance_loss_clip": 0.06275396, + "balance_loss_mlp": 0.01253752, + "epoch": 0.6523974147001352, + "flos": 24834344808960.0, + "grad_norm": 1.562549828159254, + "language_loss": 0.67212379, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.7488479, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10046387, + "step": 10851, + "time_per_iteration": 2.5808029174804688 + }, + { + "auxiliary_loss_clip": 0.06416307, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06276499, + "balance_loss_mlp": 0.01254386, + "epoch": 0.6524575379528033, + "flos": 26323945568640.0, + "grad_norm": 2.0070314818502695, + "language_loss": 0.7443608, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.8211745, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10687256, + "step": 10852, + "time_per_iteration": 4.013243675231934 + }, + { + "auxiliary_loss_clip": 0.0641986, + "auxiliary_loss_mlp": 0.01268659, + "balance_loss_clip": 0.06278665, + "balance_loss_mlp": 0.01257191, + "epoch": 0.6525176612054712, + "flos": 19499200767360.0, + "grad_norm": 1.9187417240841533, + "language_loss": 0.67066777, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.74755299, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11468506, + "step": 10853, + "time_per_iteration": 2.506601572036743 + }, + { + "auxiliary_loss_clip": 0.06318477, + "auxiliary_loss_mlp": 0.01256063, + "balance_loss_clip": 0.06262536, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6525777844581392, + "flos": 71727057786240.0, + "grad_norm": 0.715298954462881, + "language_loss": 0.63038433, + "learning_rate": 1.137926314758634e-06, + "loss": 0.70612979, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01324463, + "step": 10854, + "time_per_iteration": 3.2700932025909424 + }, + { + "auxiliary_loss_clip": 0.06413402, + "auxiliary_loss_mlp": 0.01267951, + "balance_loss_clip": 0.06275877, + "balance_loss_mlp": 0.01256549, + "epoch": 0.6526379077108072, + "flos": 26660668400640.0, + "grad_norm": 1.6617688619573214, + "language_loss": 0.77541685, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.85223043, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11407471, + "step": 10855, + "time_per_iteration": 2.5642480850219727 + }, + { + "auxiliary_loss_clip": 0.06405862, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06274554, + "balance_loss_mlp": 0.01255174, + "epoch": 0.6526980309634751, + "flos": 22826990471040.0, + "grad_norm": 1.7631241717885235, + "language_loss": 0.79621822, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.87293208, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10345459, + "step": 10856, + "time_per_iteration": 2.537353992462158 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01267488, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256408, + "epoch": 0.6527581542161431, + "flos": 28372403134080.0, + "grad_norm": 1.6923564955573929, + "language_loss": 0.73936152, + "learning_rate": 1.136872187988815e-06, + "loss": 0.81612456, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11077881, + "step": 10857, + "time_per_iteration": 2.5426032543182373 + }, + { + "auxiliary_loss_clip": 0.06409546, + "auxiliary_loss_mlp": 0.01266483, + "balance_loss_clip": 0.06273436, + "balance_loss_mlp": 0.01256195, + "epoch": 0.652818277468811, + "flos": 18375099517440.0, + "grad_norm": 2.1707425213383136, + "language_loss": 0.63389534, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.71065563, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 10858, + "time_per_iteration": 2.495542049407959 + }, + { + "auxiliary_loss_clip": 0.06408103, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274083, + "balance_loss_mlp": 0.01254784, + "epoch": 0.6528784007214791, + "flos": 18041227724160.0, + "grad_norm": 1.644037371034234, + "language_loss": 0.78852642, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.86525851, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10333252, + "step": 10859, + "time_per_iteration": 2.5497894287109375 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01263542, + "balance_loss_clip": 0.06273727, + "balance_loss_mlp": 0.01252611, + "epoch": 0.652938523974147, + "flos": 22388466528000.0, + "grad_norm": 1.5493254250566866, + "language_loss": 0.67967153, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.75645357, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10931396, + "step": 10860, + "time_per_iteration": 2.5913808345794678 + }, + { + "auxiliary_loss_clip": 0.06418759, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06276677, + "balance_loss_mlp": 0.01257426, + "epoch": 0.652998647226815, + "flos": 16769694015360.0, + "grad_norm": 1.8207811146767594, + "language_loss": 0.67290318, + "learning_rate": 1.135467143909712e-06, + "loss": 0.74977076, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.10583496, + "step": 10861, + "time_per_iteration": 2.50136137008667 + }, + { + "auxiliary_loss_clip": 0.06415796, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06276291, + "balance_loss_mlp": 0.01254886, + "epoch": 0.6530587704794829, + "flos": 35781259547520.0, + "grad_norm": 2.0180062200449744, + "language_loss": 0.65632504, + "learning_rate": 1.135115964814572e-06, + "loss": 0.733145, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11309814, + "step": 10862, + "time_per_iteration": 2.7082483768463135 + }, + { + "auxiliary_loss_clip": 0.06413227, + "auxiliary_loss_mlp": 0.01267111, + "balance_loss_clip": 0.06276508, + "balance_loss_mlp": 0.01256912, + "epoch": 0.6531188937321509, + "flos": 19321901527680.0, + "grad_norm": 1.7523951884589628, + "language_loss": 0.77599865, + "learning_rate": 1.13476481851592e-06, + "loss": 0.85280204, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10198975, + "step": 10863, + "time_per_iteration": 2.525467872619629 + }, + { + "auxiliary_loss_clip": 0.06412541, + "auxiliary_loss_mlp": 0.01266016, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6531790169848188, + "flos": 22900476101760.0, + "grad_norm": 1.5537645301307006, + "language_loss": 0.74952781, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.82631332, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10430908, + "step": 10864, + "time_per_iteration": 2.5613489151000977 + }, + { + "auxiliary_loss_clip": 0.06410347, + "auxiliary_loss_mlp": 0.01267199, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01256929, + "epoch": 0.6532391402374869, + "flos": 29570157722880.0, + "grad_norm": 1.9052418824081008, + "language_loss": 0.86169875, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.93847424, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1026001, + "step": 10865, + "time_per_iteration": 2.5604805946350098 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01269286, + "balance_loss_clip": 0.0627698, + "balance_loss_mlp": 0.01258926, + "epoch": 0.6532992634901548, + "flos": 23110996285440.0, + "grad_norm": 1.6108799527314137, + "language_loss": 0.81515527, + "learning_rate": 1.133711576532051e-06, + "loss": 0.8920275, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10357666, + "step": 10866, + "time_per_iteration": 2.5684125423431396 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06275405, + "balance_loss_mlp": 0.01254382, + "epoch": 0.6533593867428228, + "flos": 26074460436480.0, + "grad_norm": 1.6718467663998162, + "language_loss": 0.82545173, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.90221351, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10614014, + "step": 10867, + "time_per_iteration": 2.5475850105285645 + }, + { + "auxiliary_loss_clip": 0.06413805, + "auxiliary_loss_mlp": 0.01264816, + "balance_loss_clip": 0.06276451, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6534195099954908, + "flos": 21218398513920.0, + "grad_norm": 1.6506076303544417, + "language_loss": 0.81211448, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.88890064, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1050415, + "step": 10868, + "time_per_iteration": 2.5498743057250977 + }, + { + "auxiliary_loss_clip": 0.06418251, + "auxiliary_loss_mlp": 0.01266421, + "balance_loss_clip": 0.06277823, + "balance_loss_mlp": 0.01255579, + "epoch": 0.6534796332481587, + "flos": 19652754574080.0, + "grad_norm": 1.774479415812712, + "language_loss": 0.7959047, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.87275141, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10845947, + "step": 10869, + "time_per_iteration": 2.5166242122650146 + }, + { + "auxiliary_loss_clip": 0.06413683, + "auxiliary_loss_mlp": 0.01266573, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256172, + "epoch": 0.6535397565008267, + "flos": 24028979690880.0, + "grad_norm": 2.0325113837901703, + "language_loss": 0.72014058, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.79694319, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10406494, + "step": 10870, + "time_per_iteration": 2.5486953258514404 + }, + { + "auxiliary_loss_clip": 0.06413276, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275949, + "balance_loss_mlp": 0.01254584, + "epoch": 0.6535998797534947, + "flos": 24608772817920.0, + "grad_norm": 1.9753517025590153, + "language_loss": 0.74408901, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.82087243, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10479736, + "step": 10871, + "time_per_iteration": 4.039932489395142 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01263739, + "balance_loss_clip": 0.06277861, + "balance_loss_mlp": 0.01253791, + "epoch": 0.6536600030061627, + "flos": 23370292344960.0, + "grad_norm": 1.4980578991412412, + "language_loss": 0.56041443, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.6371575, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0994873, + "step": 10872, + "time_per_iteration": 2.502490282058716 + }, + { + "auxiliary_loss_clip": 0.06416132, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06281123, + "balance_loss_mlp": 0.01256918, + "epoch": 0.6537201262588306, + "flos": 23885278738560.0, + "grad_norm": 1.5337992373700162, + "language_loss": 0.75344592, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.8302865, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11022949, + "step": 10873, + "time_per_iteration": 2.5598514080047607 + }, + { + "auxiliary_loss_clip": 0.06410979, + "auxiliary_loss_mlp": 0.012657, + "balance_loss_clip": 0.06274614, + "balance_loss_mlp": 0.01255585, + "epoch": 0.6537802495114986, + "flos": 24361971016320.0, + "grad_norm": 1.420531378230647, + "language_loss": 0.76059687, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.8373636, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10113525, + "step": 10874, + "time_per_iteration": 2.549380302429199 + }, + { + "auxiliary_loss_clip": 0.06415659, + "auxiliary_loss_mlp": 0.01268814, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01258032, + "epoch": 0.6538403727641665, + "flos": 28003633315200.0, + "grad_norm": 1.5256219818178185, + "language_loss": 0.81805712, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.89490187, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10784912, + "step": 10875, + "time_per_iteration": 2.583240270614624 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01266928, + "balance_loss_clip": 0.06273378, + "balance_loss_mlp": 0.01256372, + "epoch": 0.6539004960168345, + "flos": 27571021084800.0, + "grad_norm": 1.6524409835803482, + "language_loss": 0.69961172, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.77639741, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10552979, + "step": 10876, + "time_per_iteration": 2.53607439994812 + }, + { + "auxiliary_loss_clip": 0.0641342, + "auxiliary_loss_mlp": 0.01265066, + "balance_loss_clip": 0.06277761, + "balance_loss_mlp": 0.01254177, + "epoch": 0.6539606192695024, + "flos": 14533958574720.0, + "grad_norm": 1.8504141345372043, + "language_loss": 0.79613322, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.87291813, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 10877, + "time_per_iteration": 2.482450246810913 + }, + { + "auxiliary_loss_clip": 0.0641083, + "auxiliary_loss_mlp": 0.01271317, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01260976, + "epoch": 0.6540207425221705, + "flos": 21622779118080.0, + "grad_norm": 2.1988791511764507, + "language_loss": 0.80130821, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.87812972, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10333252, + "step": 10878, + "time_per_iteration": 2.4935176372528076 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01256185, + "epoch": 0.6540808657748384, + "flos": 17673589935360.0, + "grad_norm": 2.582136269580718, + "language_loss": 0.8441155, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.92088807, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11309814, + "step": 10879, + "time_per_iteration": 2.478309392929077 + }, + { + "auxiliary_loss_clip": 0.06413597, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06273437, + "balance_loss_mlp": 0.01255937, + "epoch": 0.6541409890275064, + "flos": 14543559866880.0, + "grad_norm": 2.245673949677598, + "language_loss": 0.72627622, + "learning_rate": 1.128800362199601e-06, + "loss": 0.80308151, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11004639, + "step": 10880, + "time_per_iteration": 2.448975086212158 + }, + { + "auxiliary_loss_clip": 0.06410271, + "auxiliary_loss_mlp": 0.01269229, + "balance_loss_clip": 0.06275423, + "balance_loss_mlp": 0.01258899, + "epoch": 0.6542011122801744, + "flos": 17171013945600.0, + "grad_norm": 1.8546451564603688, + "language_loss": 0.84333724, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.92013222, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10333252, + "step": 10881, + "time_per_iteration": 2.5005478858947754 + }, + { + "auxiliary_loss_clip": 0.06415182, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01252995, + "epoch": 0.6542612355328423, + "flos": 18192433616640.0, + "grad_norm": 1.7673801500025483, + "language_loss": 0.78099298, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.85779178, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11700439, + "step": 10882, + "time_per_iteration": 2.4750256538391113 + }, + { + "auxiliary_loss_clip": 0.06413694, + "auxiliary_loss_mlp": 0.01268989, + "balance_loss_clip": 0.06275713, + "balance_loss_mlp": 0.01257837, + "epoch": 0.6543213587855103, + "flos": 19798635732480.0, + "grad_norm": 1.55805041018917, + "language_loss": 0.81790304, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.89472985, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 10883, + "time_per_iteration": 3.958979368209839 + }, + { + "auxiliary_loss_clip": 0.06415352, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06277536, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6543814820381783, + "flos": 21111356522880.0, + "grad_norm": 2.318256186808643, + "language_loss": 0.85692853, + "learning_rate": 1.127398345803988e-06, + "loss": 0.93375945, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11322021, + "step": 10884, + "time_per_iteration": 2.4991559982299805 + }, + { + "auxiliary_loss_clip": 0.06414054, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06276435, + "balance_loss_mlp": 0.01252623, + "epoch": 0.6544416052908463, + "flos": 20200333006080.0, + "grad_norm": 2.0262705152465985, + "language_loss": 0.8030138, + "learning_rate": 1.127047924394715e-06, + "loss": 0.87978739, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10687256, + "step": 10885, + "time_per_iteration": 3.945915699005127 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 0.06277589, + "balance_loss_mlp": 0.01259468, + "epoch": 0.6545017285435142, + "flos": 23375072027520.0, + "grad_norm": 1.9399514462864902, + "language_loss": 0.72038162, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.79720581, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10339355, + "step": 10886, + "time_per_iteration": 2.592869520187378 + }, + { + "auxiliary_loss_clip": 0.06412855, + "auxiliary_loss_mlp": 0.0126236, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01252591, + "epoch": 0.6545618517961822, + "flos": 19140619219200.0, + "grad_norm": 1.841753490100957, + "language_loss": 0.78875196, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.86550403, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09765625, + "step": 10887, + "time_per_iteration": 2.4951751232147217 + }, + { + "auxiliary_loss_clip": 0.06415602, + "auxiliary_loss_mlp": 0.0126552, + "balance_loss_clip": 0.06278757, + "balance_loss_mlp": 0.01255346, + "epoch": 0.6546219750488501, + "flos": 14943789694080.0, + "grad_norm": 1.7286309451287045, + "language_loss": 0.791143, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.86795419, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10168457, + "step": 10888, + "time_per_iteration": 2.5363447666168213 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266895, + "balance_loss_clip": 0.06279381, + "balance_loss_mlp": 0.01257025, + "epoch": 0.6546820983015181, + "flos": 36329466885120.0, + "grad_norm": 1.4489059834180797, + "language_loss": 0.66680413, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.7436139, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09869385, + "step": 10889, + "time_per_iteration": 2.631702184677124 + }, + { + "auxiliary_loss_clip": 0.06413323, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06274237, + "balance_loss_mlp": 0.01255359, + "epoch": 0.654742221554186, + "flos": 20417519589120.0, + "grad_norm": 1.4090787224296468, + "language_loss": 0.80175591, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.87856597, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.12322998, + "step": 10890, + "time_per_iteration": 2.553987503051758 + }, + { + "auxiliary_loss_clip": 0.06413622, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06272978, + "balance_loss_mlp": 0.01255963, + "epoch": 0.6548023448068541, + "flos": 24870626426880.0, + "grad_norm": 1.9658735826984712, + "language_loss": 0.66080928, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.73761332, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10827637, + "step": 10891, + "time_per_iteration": 3.981126546859741 + }, + { + "auxiliary_loss_clip": 0.06412101, + "auxiliary_loss_mlp": 0.01268584, + "balance_loss_clip": 0.06276606, + "balance_loss_mlp": 0.01258314, + "epoch": 0.654862468059522, + "flos": 21432901766400.0, + "grad_norm": 1.7619514062333756, + "language_loss": 0.80124283, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.87804967, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1027832, + "step": 10892, + "time_per_iteration": 2.487014055252075 + }, + { + "auxiliary_loss_clip": 0.06417862, + "auxiliary_loss_mlp": 0.01267184, + "balance_loss_clip": 0.06275848, + "balance_loss_mlp": 0.01256502, + "epoch": 0.65492259131219, + "flos": 26585002563840.0, + "grad_norm": 1.8517707324094554, + "language_loss": 0.78348118, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.86033165, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10675049, + "step": 10893, + "time_per_iteration": 2.5751121044158936 + }, + { + "auxiliary_loss_clip": 0.06416593, + "auxiliary_loss_mlp": 0.01265779, + "balance_loss_clip": 0.06276494, + "balance_loss_mlp": 0.01254126, + "epoch": 0.6549827145648579, + "flos": 21506806667520.0, + "grad_norm": 1.5510106151766068, + "language_loss": 0.70386314, + "learning_rate": 1.123895622914766e-06, + "loss": 0.78068686, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11651611, + "step": 10894, + "time_per_iteration": 2.492877721786499 + }, + { + "auxiliary_loss_clip": 0.06416629, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06276509, + "balance_loss_mlp": 0.01252959, + "epoch": 0.6550428378175259, + "flos": 22599657544320.0, + "grad_norm": 2.852975580128828, + "language_loss": 0.62881947, + "learning_rate": 1.123545533127549e-06, + "loss": 0.70563233, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11712646, + "step": 10895, + "time_per_iteration": 2.508265733718872 + }, + { + "auxiliary_loss_clip": 0.06409365, + "auxiliary_loss_mlp": 0.01264591, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.0125487, + "epoch": 0.655102961070194, + "flos": 12828848313600.0, + "grad_norm": 1.7300998551667346, + "language_loss": 0.79205835, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.8687979, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.097229, + "step": 10896, + "time_per_iteration": 2.4711906909942627 + }, + { + "auxiliary_loss_clip": 0.06409965, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 0.06276735, + "balance_loss_mlp": 0.01257417, + "epoch": 0.6551630843228619, + "flos": 24798105118080.0, + "grad_norm": 1.3882264371892772, + "language_loss": 0.70543504, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.78220963, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10076904, + "step": 10897, + "time_per_iteration": 2.6822469234466553 + }, + { + "auxiliary_loss_clip": 0.06417882, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06276685, + "balance_loss_mlp": 0.01257628, + "epoch": 0.6552232075755299, + "flos": 16729597036800.0, + "grad_norm": 1.5280933060289523, + "language_loss": 0.75582546, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.83268768, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10717773, + "step": 10898, + "time_per_iteration": 2.475172519683838 + }, + { + "auxiliary_loss_clip": 0.06413586, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06276682, + "balance_loss_mlp": 0.0125986, + "epoch": 0.6552833308281978, + "flos": 22022757383040.0, + "grad_norm": 2.1698837802172193, + "language_loss": 0.7396723, + "learning_rate": 1.122145506463827e-06, + "loss": 0.81650698, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10028076, + "step": 10899, + "time_per_iteration": 2.5430071353912354 + }, + { + "auxiliary_loss_clip": 0.06414597, + "auxiliary_loss_mlp": 0.0126991, + "balance_loss_clip": 0.06275821, + "balance_loss_mlp": 0.01259229, + "epoch": 0.6553434540808658, + "flos": 24870332937600.0, + "grad_norm": 2.0271227306533346, + "language_loss": 0.56131774, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.63816285, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10693359, + "step": 10900, + "time_per_iteration": 2.5413925647735596 + }, + { + "auxiliary_loss_clip": 0.06419879, + "auxiliary_loss_mlp": 0.01265514, + "balance_loss_clip": 0.06280822, + "balance_loss_mlp": 0.01254511, + "epoch": 0.6554035773335337, + "flos": 23227639568640.0, + "grad_norm": 1.632650390975927, + "language_loss": 0.77087748, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.84773135, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 10901, + "time_per_iteration": 2.5584566593170166 + }, + { + "auxiliary_loss_clip": 0.06417914, + "auxiliary_loss_mlp": 0.01269861, + "balance_loss_clip": 0.06281441, + "balance_loss_mlp": 0.01259484, + "epoch": 0.6554637005862017, + "flos": 22790163801600.0, + "grad_norm": 1.6269884512414954, + "language_loss": 0.73415089, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.81102872, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10375977, + "step": 10902, + "time_per_iteration": 2.5149738788604736 + }, + { + "auxiliary_loss_clip": 0.06411173, + "auxiliary_loss_mlp": 0.01265501, + "balance_loss_clip": 0.0627598, + "balance_loss_mlp": 0.01255118, + "epoch": 0.6555238238388696, + "flos": 21513682702080.0, + "grad_norm": 2.0084891996216254, + "language_loss": 0.68054104, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.75730777, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10388184, + "step": 10903, + "time_per_iteration": 2.5427961349487305 + }, + { + "auxiliary_loss_clip": 0.06420846, + "auxiliary_loss_mlp": 0.01267584, + "balance_loss_clip": 0.06277949, + "balance_loss_mlp": 0.01255926, + "epoch": 0.6555839470915377, + "flos": 30527483420160.0, + "grad_norm": 1.6549904072812014, + "language_loss": 0.67021459, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.74709886, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11645508, + "step": 10904, + "time_per_iteration": 2.5631024837493896 + }, + { + "auxiliary_loss_clip": 0.06421356, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06279784, + "balance_loss_mlp": 0.0125327, + "epoch": 0.6556440703442056, + "flos": 24649582556160.0, + "grad_norm": 1.7705609323248692, + "language_loss": 0.90557879, + "learning_rate": 1.120046465383464e-06, + "loss": 0.98243713, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11212158, + "step": 10905, + "time_per_iteration": 2.551908493041992 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01265256, + "balance_loss_clip": 0.06275466, + "balance_loss_mlp": 0.01255194, + "epoch": 0.6557041935968736, + "flos": 23739229872000.0, + "grad_norm": 1.7103913409482634, + "language_loss": 0.75575101, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.83248651, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10058594, + "step": 10906, + "time_per_iteration": 2.5098323822021484 + }, + { + "auxiliary_loss_clip": 0.06419322, + "auxiliary_loss_mlp": 0.0126702, + "balance_loss_clip": 0.06278144, + "balance_loss_mlp": 0.01256094, + "epoch": 0.6557643168495415, + "flos": 11106464112000.0, + "grad_norm": 2.5310893479547385, + "language_loss": 0.75316978, + "learning_rate": 1.119347051825267e-06, + "loss": 0.83003318, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10925293, + "step": 10907, + "time_per_iteration": 2.5110371112823486 + }, + { + "auxiliary_loss_clip": 0.06413908, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01253585, + "epoch": 0.6558244401022095, + "flos": 30198978288000.0, + "grad_norm": 1.3099733417202022, + "language_loss": 0.7233519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.80013621, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.109375, + "step": 10908, + "time_per_iteration": 2.6000733375549316 + }, + { + "auxiliary_loss_clip": 0.06419864, + "auxiliary_loss_mlp": 0.01265366, + "balance_loss_clip": 0.06280993, + "balance_loss_mlp": 0.01254912, + "epoch": 0.6558845633548775, + "flos": 17936827136640.0, + "grad_norm": 2.2254285972113155, + "language_loss": 0.82226503, + "learning_rate": 1.118647771844861e-06, + "loss": 0.89911729, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10455322, + "step": 10909, + "time_per_iteration": 2.524258613586426 + }, + { + "auxiliary_loss_clip": 0.06420204, + "auxiliary_loss_mlp": 0.01267528, + "balance_loss_clip": 0.0627941, + "balance_loss_mlp": 0.01256567, + "epoch": 0.6559446866075455, + "flos": 21909929460480.0, + "grad_norm": 2.0664641654441334, + "language_loss": 0.64063025, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.71750748, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10968018, + "step": 10910, + "time_per_iteration": 4.0342183113098145 + }, + { + "auxiliary_loss_clip": 0.06428535, + "auxiliary_loss_mlp": 0.012681, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01256501, + "epoch": 0.6560048098602135, + "flos": 14131674322560.0, + "grad_norm": 2.6155993780376408, + "language_loss": 0.76254046, + "learning_rate": 1.117948625548313e-06, + "loss": 0.8395068, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.1159668, + "step": 10911, + "time_per_iteration": 2.447054386138916 + }, + { + "auxiliary_loss_clip": 0.06411637, + "auxiliary_loss_mlp": 0.01268286, + "balance_loss_clip": 0.0627694, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6560649331128814, + "flos": 18813623460480.0, + "grad_norm": 1.5982338886507241, + "language_loss": 0.756971, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.83377028, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10308838, + "step": 10912, + "time_per_iteration": 2.5681815147399902 + }, + { + "auxiliary_loss_clip": 0.06430128, + "auxiliary_loss_mlp": 0.01272614, + "balance_loss_clip": 0.0628223, + "balance_loss_mlp": 0.01260431, + "epoch": 0.6561250563655494, + "flos": 17058940709760.0, + "grad_norm": 1.6202794136024683, + "language_loss": 0.77903795, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.85606527, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.12176514, + "step": 10913, + "time_per_iteration": 2.4939568042755127 + }, + { + "auxiliary_loss_clip": 0.0641174, + "auxiliary_loss_mlp": 0.01263849, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254425, + "epoch": 0.6561851796182173, + "flos": 22644198789120.0, + "grad_norm": 1.7766660084969559, + "language_loss": 0.71619821, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.79295409, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09423828, + "step": 10914, + "time_per_iteration": 2.569068431854248 + }, + { + "auxiliary_loss_clip": 0.06418359, + "auxiliary_loss_mlp": 0.01264819, + "balance_loss_clip": 0.06280423, + "balance_loss_mlp": 0.01254149, + "epoch": 0.6562453028708853, + "flos": 19244307047040.0, + "grad_norm": 1.8135755345317126, + "language_loss": 0.74166334, + "learning_rate": 1.116550734430958e-06, + "loss": 0.81849515, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10675049, + "step": 10915, + "time_per_iteration": 2.487908363342285 + }, + { + "auxiliary_loss_clip": 0.06413562, + "auxiliary_loss_mlp": 0.01266089, + "balance_loss_clip": 0.06277299, + "balance_loss_mlp": 0.01254823, + "epoch": 0.6563054261235532, + "flos": 23807390768640.0, + "grad_norm": 1.4909835290624114, + "language_loss": 0.79751885, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.87431538, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11254883, + "step": 10916, + "time_per_iteration": 2.5246381759643555 + }, + { + "auxiliary_loss_clip": 0.06414592, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06275203, + "balance_loss_mlp": 0.01255727, + "epoch": 0.6563655493762213, + "flos": 19245271368960.0, + "grad_norm": 1.7342152629791572, + "language_loss": 0.76458621, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.84139442, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10498047, + "step": 10917, + "time_per_iteration": 2.468027353286743 + }, + { + "auxiliary_loss_clip": 0.06412656, + "auxiliary_loss_mlp": 0.01267091, + "balance_loss_clip": 0.06277646, + "balance_loss_mlp": 0.0125678, + "epoch": 0.6564256726288892, + "flos": 25563457111680.0, + "grad_norm": 1.7726258593528208, + "language_loss": 0.70893037, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.78572786, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 10918, + "time_per_iteration": 2.5601627826690674 + }, + { + "auxiliary_loss_clip": 0.06410314, + "auxiliary_loss_mlp": 0.01263599, + "balance_loss_clip": 0.06277462, + "balance_loss_mlp": 0.01253806, + "epoch": 0.6564857958815572, + "flos": 22207226146560.0, + "grad_norm": 1.5162098354406723, + "language_loss": 0.76179051, + "learning_rate": 1.115153379321332e-06, + "loss": 0.83852965, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09802246, + "step": 10919, + "time_per_iteration": 2.515432357788086 + }, + { + "auxiliary_loss_clip": 0.06311788, + "auxiliary_loss_mlp": 0.01255206, + "balance_loss_clip": 0.06255645, + "balance_loss_mlp": 0.01254054, + "epoch": 0.6565459191342251, + "flos": 58139188462080.0, + "grad_norm": 0.7048888157954881, + "language_loss": 0.52975726, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.60542721, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01150513, + "step": 10920, + "time_per_iteration": 3.225492238998413 + }, + { + "auxiliary_loss_clip": 0.06409396, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254252, + "epoch": 0.6566060423868931, + "flos": 30817400947200.0, + "grad_norm": 2.612121109527078, + "language_loss": 0.66109598, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.73783767, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10516357, + "step": 10921, + "time_per_iteration": 2.5863046646118164 + }, + { + "auxiliary_loss_clip": 0.06413272, + "auxiliary_loss_mlp": 0.01265745, + "balance_loss_clip": 0.06276343, + "balance_loss_mlp": 0.01254086, + "epoch": 0.6566661656395612, + "flos": 23374107705600.0, + "grad_norm": 1.6764293200295557, + "language_loss": 0.81199658, + "learning_rate": 1.114105715254205e-06, + "loss": 0.88878673, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11657715, + "step": 10922, + "time_per_iteration": 3.958033800125122 + }, + { + "auxiliary_loss_clip": 0.06414749, + "auxiliary_loss_mlp": 0.01268836, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01258131, + "epoch": 0.6567262888922291, + "flos": 25742098016640.0, + "grad_norm": 1.8770672525164127, + "language_loss": 0.71403915, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.79087496, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1071167, + "step": 10923, + "time_per_iteration": 2.6299500465393066 + }, + { + "auxiliary_loss_clip": 0.06414993, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06276433, + "balance_loss_mlp": 0.0125629, + "epoch": 0.6567864121448971, + "flos": 17128569052800.0, + "grad_norm": 1.8445128185559154, + "language_loss": 0.80703431, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.88385069, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10357666, + "step": 10924, + "time_per_iteration": 2.474226713180542 + }, + { + "auxiliary_loss_clip": 0.06413686, + "auxiliary_loss_mlp": 0.01262003, + "balance_loss_clip": 0.06275852, + "balance_loss_mlp": 0.0125187, + "epoch": 0.656846535397565, + "flos": 22425922103040.0, + "grad_norm": 2.0896707953815543, + "language_loss": 0.72634912, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.80310595, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10137939, + "step": 10925, + "time_per_iteration": 4.006798982620239 + }, + { + "auxiliary_loss_clip": 0.0641509, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 0.06276507, + "balance_loss_mlp": 0.01253768, + "epoch": 0.656906658650233, + "flos": 17708991085440.0, + "grad_norm": 2.4212353880000586, + "language_loss": 0.72549468, + "learning_rate": 1.112709300197942e-06, + "loss": 0.80228466, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10926, + "time_per_iteration": 2.470264434814453 + }, + { + "auxiliary_loss_clip": 0.06419797, + "auxiliary_loss_mlp": 0.01265954, + "balance_loss_clip": 0.06277547, + "balance_loss_mlp": 0.01254498, + "epoch": 0.6569667819029009, + "flos": 21180942938880.0, + "grad_norm": 1.9117955392450259, + "language_loss": 0.72684854, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.80370605, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.11468506, + "step": 10927, + "time_per_iteration": 2.5509166717529297 + }, + { + "auxiliary_loss_clip": 0.06310604, + "auxiliary_loss_mlp": 0.01252717, + "balance_loss_clip": 0.06254312, + "balance_loss_mlp": 0.01251483, + "epoch": 0.6570269051555689, + "flos": 68783299344000.0, + "grad_norm": 0.7240640825769642, + "language_loss": 0.64406443, + "learning_rate": 1.112011294493775e-06, + "loss": 0.71969765, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.0123291, + "step": 10928, + "time_per_iteration": 3.1493797302246094 + }, + { + "auxiliary_loss_clip": 0.06413682, + "auxiliary_loss_mlp": 0.01270572, + "balance_loss_clip": 0.06277151, + "balance_loss_mlp": 0.01259354, + "epoch": 0.6570870284082369, + "flos": 26325874212480.0, + "grad_norm": 2.727605777521059, + "language_loss": 0.78076899, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.85761154, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11212158, + "step": 10929, + "time_per_iteration": 2.602822780609131 + }, + { + "auxiliary_loss_clip": 0.06411244, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06275063, + "balance_loss_mlp": 0.01253181, + "epoch": 0.6571471516609049, + "flos": 26181544354560.0, + "grad_norm": 1.645365805026195, + "language_loss": 0.65459454, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.73134756, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10876465, + "step": 10930, + "time_per_iteration": 3.964470863342285 + }, + { + "auxiliary_loss_clip": 0.06414342, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 0.06276581, + "balance_loss_mlp": 0.01257733, + "epoch": 0.6572072749135728, + "flos": 20382537709440.0, + "grad_norm": 1.4804583724978688, + "language_loss": 0.71204734, + "learning_rate": 1.110964538515258e-06, + "loss": 0.78887701, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10888672, + "step": 10931, + "time_per_iteration": 2.4909491539001465 + }, + { + "auxiliary_loss_clip": 0.06417586, + "auxiliary_loss_mlp": 0.0127043, + "balance_loss_clip": 0.06275665, + "balance_loss_mlp": 0.01259784, + "epoch": 0.6572673981662408, + "flos": 17134438838400.0, + "grad_norm": 1.8915521473051504, + "language_loss": 0.68812561, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.76500577, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10644531, + "step": 10932, + "time_per_iteration": 2.5176515579223633 + }, + { + "auxiliary_loss_clip": 0.06412166, + "auxiliary_loss_mlp": 0.01268175, + "balance_loss_clip": 0.06274658, + "balance_loss_mlp": 0.01257952, + "epoch": 0.6573275214189087, + "flos": 41283640339200.0, + "grad_norm": 1.6891496229276404, + "language_loss": 0.80723727, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.88404071, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10229492, + "step": 10933, + "time_per_iteration": 2.6675453186035156 + }, + { + "auxiliary_loss_clip": 0.06419124, + "auxiliary_loss_mlp": 0.01264988, + "balance_loss_clip": 0.06278023, + "balance_loss_mlp": 0.01254432, + "epoch": 0.6573876446715767, + "flos": 22896241470720.0, + "grad_norm": 1.753523075649994, + "language_loss": 0.73957497, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.81641608, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10552979, + "step": 10934, + "time_per_iteration": 2.555539131164551 + }, + { + "auxiliary_loss_clip": 0.0641007, + "auxiliary_loss_mlp": 0.01270037, + "balance_loss_clip": 0.0627473, + "balance_loss_mlp": 0.01259445, + "epoch": 0.6574477679242448, + "flos": 44028240825600.0, + "grad_norm": 1.5029164504422408, + "language_loss": 0.76213276, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.83893389, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10583496, + "step": 10935, + "time_per_iteration": 2.6976189613342285 + }, + { + "auxiliary_loss_clip": 0.06416147, + "auxiliary_loss_mlp": 0.01270518, + "balance_loss_clip": 0.06277473, + "balance_loss_mlp": 0.01258967, + "epoch": 0.6575078911769127, + "flos": 24578402912640.0, + "grad_norm": 1.4839652411177968, + "language_loss": 0.78411627, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.86098289, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11553955, + "step": 10936, + "time_per_iteration": 2.518728494644165 + }, + { + "auxiliary_loss_clip": 0.06411346, + "auxiliary_loss_mlp": 0.01270987, + "balance_loss_clip": 0.06275463, + "balance_loss_mlp": 0.01261301, + "epoch": 0.6575680144295807, + "flos": 20930493484800.0, + "grad_norm": 1.7706689890869223, + "language_loss": 0.68970346, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.76652682, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09692383, + "step": 10937, + "time_per_iteration": 2.5257480144500732 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01266555, + "balance_loss_clip": 0.06273675, + "balance_loss_mlp": 0.01255696, + "epoch": 0.6576281376822486, + "flos": 10930213048320.0, + "grad_norm": 2.6009314091519804, + "language_loss": 0.68779373, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.76456088, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10858154, + "step": 10938, + "time_per_iteration": 2.487494468688965 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 0.06273697, + "balance_loss_mlp": 0.01254659, + "epoch": 0.6576882609349166, + "flos": 19287632407680.0, + "grad_norm": 1.7840896081065163, + "language_loss": 0.71399069, + "learning_rate": 1.108174673550927e-06, + "loss": 0.79076016, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10876465, + "step": 10939, + "time_per_iteration": 2.4861202239990234 + }, + { + "auxiliary_loss_clip": 0.0641602, + "auxiliary_loss_mlp": 0.01267708, + "balance_loss_clip": 0.06275935, + "balance_loss_mlp": 0.01256199, + "epoch": 0.6577483841875845, + "flos": 20225168542080.0, + "grad_norm": 5.914491475263239, + "language_loss": 0.77965903, + "learning_rate": 1.107826092473037e-06, + "loss": 0.85649633, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11505127, + "step": 10940, + "time_per_iteration": 2.491938829421997 + }, + { + "auxiliary_loss_clip": 0.06417249, + "auxiliary_loss_mlp": 0.01271369, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01260253, + "epoch": 0.6578085074402525, + "flos": 34759672168320.0, + "grad_norm": 1.9394980575704135, + "language_loss": 0.69278842, + "learning_rate": 1.107477545226471e-06, + "loss": 0.76967466, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11132812, + "step": 10941, + "time_per_iteration": 2.6296122074127197 + }, + { + "auxiliary_loss_clip": 0.06406929, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01253934, + "epoch": 0.6578686306929205, + "flos": 23476705430400.0, + "grad_norm": 1.8720735918703966, + "language_loss": 0.68617851, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.76288623, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09906006, + "step": 10942, + "time_per_iteration": 2.5199849605560303 + }, + { + "auxiliary_loss_clip": 0.06417514, + "auxiliary_loss_mlp": 0.01265909, + "balance_loss_clip": 0.0627285, + "balance_loss_mlp": 0.0125391, + "epoch": 0.6579287539455885, + "flos": 18082876003200.0, + "grad_norm": 1.8863772080566783, + "language_loss": 0.71839166, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.7952258, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.12005615, + "step": 10943, + "time_per_iteration": 2.4810752868652344 + }, + { + "auxiliary_loss_clip": 0.06409079, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.01257327, + "epoch": 0.6579888771982564, + "flos": 28669532112000.0, + "grad_norm": 1.7035342930552537, + "language_loss": 0.59567684, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.67244786, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10687256, + "step": 10944, + "time_per_iteration": 2.593003273010254 + }, + { + "auxiliary_loss_clip": 0.06423099, + "auxiliary_loss_mlp": 0.01269429, + "balance_loss_clip": 0.06277057, + "balance_loss_mlp": 0.01257555, + "epoch": 0.6580490004509244, + "flos": 25053627744000.0, + "grad_norm": 1.4789836122868327, + "language_loss": 0.72602201, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.80294728, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.11871338, + "step": 10945, + "time_per_iteration": 2.53983998298645 + }, + { + "auxiliary_loss_clip": 0.06410586, + "auxiliary_loss_mlp": 0.01266442, + "balance_loss_clip": 0.06275351, + "balance_loss_mlp": 0.01256321, + "epoch": 0.6581091237035923, + "flos": 43519040363520.0, + "grad_norm": 1.838349836001675, + "language_loss": 0.70316982, + "learning_rate": 1.105735316926046e-06, + "loss": 0.77994007, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10946, + "time_per_iteration": 2.798476219177246 + }, + { + "auxiliary_loss_clip": 0.06410632, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06272982, + "balance_loss_mlp": 0.01255514, + "epoch": 0.6581692469562603, + "flos": 22421352055680.0, + "grad_norm": 1.8876327732241813, + "language_loss": 0.82383513, + "learning_rate": 1.105386972944934e-06, + "loss": 0.90060103, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10443115, + "step": 10947, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06414369, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.0627495, + "balance_loss_mlp": 0.01253447, + "epoch": 0.6582293702089284, + "flos": 24866098306560.0, + "grad_norm": 1.5151980350674914, + "language_loss": 0.77415752, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.85094017, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10449219, + "step": 10948, + "time_per_iteration": 2.543790578842163 + }, + { + "auxiliary_loss_clip": 0.06411085, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06274116, + "balance_loss_mlp": 0.01255288, + "epoch": 0.6582894934615963, + "flos": 23046399187200.0, + "grad_norm": 1.478986900014917, + "language_loss": 0.79121858, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.86798447, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10223389, + "step": 10949, + "time_per_iteration": 2.535895824432373 + }, + { + "auxiliary_loss_clip": 0.06312477, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06256588, + "balance_loss_mlp": 0.01263514, + "epoch": 0.6583496167142643, + "flos": 72573274569600.0, + "grad_norm": 0.7232821189613112, + "language_loss": 0.61788374, + "learning_rate": 1.104342144597323e-06, + "loss": 0.69365644, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01284027, + "step": 10950, + "time_per_iteration": 4.580410957336426 + }, + { + "auxiliary_loss_clip": 0.06408125, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06274961, + "balance_loss_mlp": 0.01258778, + "epoch": 0.6584097399669322, + "flos": 13083867815040.0, + "grad_norm": 2.2244546266186354, + "language_loss": 0.6719563, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.74872345, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09820557, + "step": 10951, + "time_per_iteration": 2.510561466217041 + }, + { + "auxiliary_loss_clip": 0.06409305, + "auxiliary_loss_mlp": 0.01270102, + "balance_loss_clip": 0.06273426, + "balance_loss_mlp": 0.01259921, + "epoch": 0.6584698632196002, + "flos": 28700530922880.0, + "grad_norm": 1.3260041408046892, + "language_loss": 0.76428199, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.84107602, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10180664, + "step": 10952, + "time_per_iteration": 2.5918259620666504 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268051, + "balance_loss_clip": 0.06275044, + "balance_loss_mlp": 0.01257954, + "epoch": 0.6585299864722681, + "flos": 14324486567040.0, + "grad_norm": 1.6835884668716123, + "language_loss": 0.73700249, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.81377816, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10083008, + "step": 10953, + "time_per_iteration": 2.5165388584136963 + }, + { + "auxiliary_loss_clip": 0.06410642, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06274508, + "balance_loss_mlp": 0.01255427, + "epoch": 0.6585901097249361, + "flos": 26805291747840.0, + "grad_norm": 1.6924688741082035, + "language_loss": 0.79007798, + "learning_rate": 1.102949515683546e-06, + "loss": 0.86684537, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10681152, + "step": 10954, + "time_per_iteration": 2.564539909362793 + }, + { + "auxiliary_loss_clip": 0.06413999, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.06276879, + "balance_loss_mlp": 0.01257411, + "epoch": 0.658650232977604, + "flos": 18738921945600.0, + "grad_norm": 3.4725197474545215, + "language_loss": 0.69489324, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.77170783, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10058594, + "step": 10955, + "time_per_iteration": 2.495082139968872 + }, + { + "auxiliary_loss_clip": 0.06405246, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06274106, + "balance_loss_mlp": 0.01256398, + "epoch": 0.6587103562302721, + "flos": 24760272199680.0, + "grad_norm": 2.1168101225513056, + "language_loss": 0.81125724, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.88797009, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09637451, + "step": 10956, + "time_per_iteration": 2.636908531188965 + }, + { + "auxiliary_loss_clip": 0.06413392, + "auxiliary_loss_mlp": 0.0127424, + "balance_loss_clip": 0.06275264, + "balance_loss_mlp": 0.01262808, + "epoch": 0.65877047948294, + "flos": 22352688034560.0, + "grad_norm": 2.1582606979270462, + "language_loss": 0.81753582, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.89441204, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11431885, + "step": 10957, + "time_per_iteration": 2.6302380561828613 + }, + { + "auxiliary_loss_clip": 0.06405203, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01255513, + "epoch": 0.658830602735608, + "flos": 45189965358720.0, + "grad_norm": 1.6069945820528309, + "language_loss": 0.76651394, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.8432132, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09197998, + "step": 10958, + "time_per_iteration": 2.7235934734344482 + }, + { + "auxiliary_loss_clip": 0.06409356, + "auxiliary_loss_mlp": 0.01266973, + "balance_loss_clip": 0.0627449, + "balance_loss_mlp": 0.01256811, + "epoch": 0.6588907259882759, + "flos": 19907774075520.0, + "grad_norm": 1.6704982273704214, + "language_loss": 0.75102574, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.82778907, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10168457, + "step": 10959, + "time_per_iteration": 2.4919495582580566 + }, + { + "auxiliary_loss_clip": 0.06411363, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01253609, + "epoch": 0.6589508492409439, + "flos": 24140591729280.0, + "grad_norm": 1.5345825682480954, + "language_loss": 0.65334243, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.73008978, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09759521, + "step": 10960, + "time_per_iteration": 2.539113998413086 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06274159, + "balance_loss_mlp": 0.01256305, + "epoch": 0.659010972493612, + "flos": 18228715234560.0, + "grad_norm": 1.960089741542263, + "language_loss": 0.81517863, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.89202076, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.1038208, + "step": 10961, + "time_per_iteration": 3.8582499027252197 + }, + { + "auxiliary_loss_clip": 0.0641351, + "auxiliary_loss_mlp": 0.01267598, + "balance_loss_clip": 0.06275603, + "balance_loss_mlp": 0.01257292, + "epoch": 0.6590710957462799, + "flos": 27607428483840.0, + "grad_norm": 1.7237322524813996, + "language_loss": 0.736247, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.81305802, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10302734, + "step": 10962, + "time_per_iteration": 2.542795419692993 + }, + { + "auxiliary_loss_clip": 0.06414889, + "auxiliary_loss_mlp": 0.01268579, + "balance_loss_clip": 0.06274842, + "balance_loss_mlp": 0.01257522, + "epoch": 0.6591312189989479, + "flos": 20309177859840.0, + "grad_norm": 1.8258870034084347, + "language_loss": 0.80250466, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.87933934, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11071777, + "step": 10963, + "time_per_iteration": 2.484524965286255 + }, + { + "auxiliary_loss_clip": 0.06407138, + "auxiliary_loss_mlp": 0.01266706, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.0125696, + "epoch": 0.6591913422516158, + "flos": 12317886915840.0, + "grad_norm": 1.5886018528393113, + "language_loss": 0.78204167, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.85878009, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09753418, + "step": 10964, + "time_per_iteration": 4.032490015029907 + }, + { + "auxiliary_loss_clip": 0.06414784, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6592514655042838, + "flos": 25891626827520.0, + "grad_norm": 1.653857660787362, + "language_loss": 0.7398777, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.81667888, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.09539795, + "step": 10965, + "time_per_iteration": 2.558753490447998 + }, + { + "auxiliary_loss_clip": 0.06415711, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.0627279, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6593115887569517, + "flos": 14068754305920.0, + "grad_norm": 2.292623636057082, + "language_loss": 0.74313521, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.81995344, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1083374, + "step": 10966, + "time_per_iteration": 2.4695546627044678 + }, + { + "auxiliary_loss_clip": 0.06410235, + "auxiliary_loss_mlp": 0.01265948, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.01255273, + "epoch": 0.6593717120096197, + "flos": 24724912976640.0, + "grad_norm": 1.5343869413599147, + "language_loss": 0.77172506, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.8484869, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10662842, + "step": 10967, + "time_per_iteration": 2.554844856262207 + }, + { + "auxiliary_loss_clip": 0.0630592, + "auxiliary_loss_mlp": 0.01258736, + "balance_loss_clip": 0.06250164, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6594318352622877, + "flos": 55577951907840.0, + "grad_norm": 0.6831964979389027, + "language_loss": 0.48237032, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.5580169, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01243591, + "step": 10968, + "time_per_iteration": 3.1279184818267822 + }, + { + "auxiliary_loss_clip": 0.06412826, + "auxiliary_loss_mlp": 0.01261785, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01251569, + "epoch": 0.6594919585149557, + "flos": 17462650481280.0, + "grad_norm": 1.6973549586156937, + "language_loss": 0.79805654, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.87480259, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10211182, + "step": 10969, + "time_per_iteration": 3.929111957550049 + }, + { + "auxiliary_loss_clip": 0.0641497, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.06276352, + "balance_loss_mlp": 0.01255239, + "epoch": 0.6595520817676236, + "flos": 18229092577920.0, + "grad_norm": 1.9822858612354273, + "language_loss": 0.65968251, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.73648757, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10296631, + "step": 10970, + "time_per_iteration": 2.534639835357666 + }, + { + "auxiliary_loss_clip": 0.06411758, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01253368, + "epoch": 0.6596122050202916, + "flos": 22206219897600.0, + "grad_norm": 1.4827049257585125, + "language_loss": 0.76440203, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.84115398, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10064697, + "step": 10971, + "time_per_iteration": 2.518568515777588 + }, + { + "auxiliary_loss_clip": 0.06414073, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 0.06274455, + "balance_loss_mlp": 0.01253101, + "epoch": 0.6596723282729595, + "flos": 14179108533120.0, + "grad_norm": 2.58028286016492, + "language_loss": 0.70073628, + "learning_rate": 1.096689432978629e-06, + "loss": 0.77751178, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10388184, + "step": 10972, + "time_per_iteration": 2.5301804542541504 + }, + { + "auxiliary_loss_clip": 0.06411418, + "auxiliary_loss_mlp": 0.01263284, + "balance_loss_clip": 0.0627436, + "balance_loss_mlp": 0.01252931, + "epoch": 0.6597324515256275, + "flos": 30560746291200.0, + "grad_norm": 1.6494264278825825, + "language_loss": 0.55793309, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.63468015, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10351562, + "step": 10973, + "time_per_iteration": 2.5836968421936035 + }, + { + "auxiliary_loss_clip": 0.06425004, + "auxiliary_loss_mlp": 0.01265958, + "balance_loss_clip": 0.06279783, + "balance_loss_mlp": 0.0125579, + "epoch": 0.6597925747782956, + "flos": 17645693725440.0, + "grad_norm": 2.424477152178303, + "language_loss": 0.78669357, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.86360323, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.10174561, + "step": 10974, + "time_per_iteration": 2.5438265800476074 + }, + { + "auxiliary_loss_clip": 0.06416789, + "auxiliary_loss_mlp": 0.01266385, + "balance_loss_clip": 0.06276938, + "balance_loss_mlp": 0.01255567, + "epoch": 0.6598526980309635, + "flos": 22825523024640.0, + "grad_norm": 2.75247163208804, + "language_loss": 0.69161505, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.7684468, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10821533, + "step": 10975, + "time_per_iteration": 2.517643690109253 + }, + { + "auxiliary_loss_clip": 0.06413519, + "auxiliary_loss_mlp": 0.01263226, + "balance_loss_clip": 0.06275275, + "balance_loss_mlp": 0.01252766, + "epoch": 0.6599128212836315, + "flos": 21074194437120.0, + "grad_norm": 1.6033931639433516, + "language_loss": 0.70794642, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.78471386, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10455322, + "step": 10976, + "time_per_iteration": 2.5318117141723633 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275579, + "balance_loss_mlp": 0.01257065, + "epoch": 0.6599729445362994, + "flos": 22170022133760.0, + "grad_norm": 1.5758270650588126, + "language_loss": 0.67691094, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.75369084, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10668945, + "step": 10977, + "time_per_iteration": 2.485891342163086 + }, + { + "auxiliary_loss_clip": 0.06420588, + "auxiliary_loss_mlp": 0.01267585, + "balance_loss_clip": 0.0627695, + "balance_loss_mlp": 0.01256427, + "epoch": 0.6600330677889674, + "flos": 18155900436480.0, + "grad_norm": 2.2117923844530694, + "language_loss": 0.81200063, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.8888824, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.11157227, + "step": 10978, + "time_per_iteration": 2.5422048568725586 + }, + { + "auxiliary_loss_clip": 0.0641408, + "auxiliary_loss_mlp": 0.01266547, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255461, + "epoch": 0.6600931910416353, + "flos": 18155942363520.0, + "grad_norm": 2.6619753374489767, + "language_loss": 0.67523986, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.75204611, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11096191, + "step": 10979, + "time_per_iteration": 2.5064504146575928 + }, + { + "auxiliary_loss_clip": 0.06413005, + "auxiliary_loss_mlp": 0.01265818, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.0125528, + "epoch": 0.6601533142943034, + "flos": 17426494644480.0, + "grad_norm": 2.8604366894108324, + "language_loss": 0.73473299, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.81152123, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10540771, + "step": 10980, + "time_per_iteration": 2.5004913806915283 + }, + { + "auxiliary_loss_clip": 0.06408733, + "auxiliary_loss_mlp": 0.01271257, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01261441, + "epoch": 0.6602134375469713, + "flos": 28226983173120.0, + "grad_norm": 1.584002725324806, + "language_loss": 0.72518432, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.80198425, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09814453, + "step": 10981, + "time_per_iteration": 2.552730083465576 + }, + { + "auxiliary_loss_clip": 0.0641138, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6602735607996393, + "flos": 29424737761920.0, + "grad_norm": 1.8532747935564327, + "language_loss": 0.69432831, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.77110291, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.09960938, + "step": 10982, + "time_per_iteration": 2.591977834701538 + }, + { + "auxiliary_loss_clip": 0.06413966, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06277901, + "balance_loss_mlp": 0.01254148, + "epoch": 0.6603336840523072, + "flos": 18593963182080.0, + "grad_norm": 1.4024673840301536, + "language_loss": 0.69806457, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.77485329, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10760498, + "step": 10983, + "time_per_iteration": 2.483527660369873 + }, + { + "auxiliary_loss_clip": 0.06413279, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01256409, + "epoch": 0.6603938073049752, + "flos": 33263153447040.0, + "grad_norm": 1.5623815208568963, + "language_loss": 0.70765328, + "learning_rate": 1.092522205413239e-06, + "loss": 0.78446013, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10998535, + "step": 10984, + "time_per_iteration": 2.6334474086761475 + }, + { + "auxiliary_loss_clip": 0.06408207, + "auxiliary_loss_mlp": 0.01266467, + "balance_loss_clip": 0.06274273, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6604539305576431, + "flos": 17390045318400.0, + "grad_norm": 1.8218342593599246, + "language_loss": 0.84316599, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.9199127, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10394287, + "step": 10985, + "time_per_iteration": 2.4621846675872803 + }, + { + "auxiliary_loss_clip": 0.06415112, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06275454, + "balance_loss_mlp": 0.01256779, + "epoch": 0.6605140538103111, + "flos": 21257447316480.0, + "grad_norm": 1.9945336241456124, + "language_loss": 0.74090636, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.81773293, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10778809, + "step": 10986, + "time_per_iteration": 2.5241971015930176 + }, + { + "auxiliary_loss_clip": 0.06414539, + "auxiliary_loss_mlp": 0.01269603, + "balance_loss_clip": 0.06278964, + "balance_loss_mlp": 0.01259673, + "epoch": 0.6605741770629792, + "flos": 13886885018880.0, + "grad_norm": 1.8900199688101529, + "language_loss": 0.79989499, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.8767364, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09936523, + "step": 10987, + "time_per_iteration": 2.467759132385254 + }, + { + "auxiliary_loss_clip": 0.06315437, + "auxiliary_loss_mlp": 0.01250965, + "balance_loss_clip": 0.06259646, + "balance_loss_mlp": 0.0124932, + "epoch": 0.6606343003156471, + "flos": 69338885840640.0, + "grad_norm": 0.958585987636571, + "language_loss": 0.5413903, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.61705434, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.55566406, + "router_z_loss_mlp": 0.01647949, + "step": 10988, + "time_per_iteration": 3.2449100017547607 + }, + { + "auxiliary_loss_clip": 0.0641297, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06277774, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6606944235683151, + "flos": 27279887673600.0, + "grad_norm": 1.4331259688792952, + "language_loss": 0.77265781, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.8494395, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10119629, + "step": 10989, + "time_per_iteration": 2.565397262573242 + }, + { + "auxiliary_loss_clip": 0.06413271, + "auxiliary_loss_mlp": 0.01268256, + "balance_loss_clip": 0.06277858, + "balance_loss_mlp": 0.01257796, + "epoch": 0.660754546820983, + "flos": 13778082092160.0, + "grad_norm": 1.981088082283497, + "language_loss": 0.77234143, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.84915674, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10461426, + "step": 10990, + "time_per_iteration": 3.8957126140594482 + }, + { + "auxiliary_loss_clip": 0.06422216, + "auxiliary_loss_mlp": 0.01267426, + "balance_loss_clip": 0.06283079, + "balance_loss_mlp": 0.0125693, + "epoch": 0.660814670073651, + "flos": 15710567207040.0, + "grad_norm": 2.3076268356000864, + "language_loss": 0.60737276, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.68426919, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10491943, + "step": 10991, + "time_per_iteration": 2.528184175491333 + }, + { + "auxiliary_loss_clip": 0.0641991, + "auxiliary_loss_mlp": 0.01270981, + "balance_loss_clip": 0.06280324, + "balance_loss_mlp": 0.012599, + "epoch": 0.6608747933263189, + "flos": 20856295094400.0, + "grad_norm": 2.771721604026619, + "language_loss": 0.67745811, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.75436699, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11083984, + "step": 10992, + "time_per_iteration": 2.5081818103790283 + }, + { + "auxiliary_loss_clip": 0.06419984, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06279188, + "balance_loss_mlp": 0.01254588, + "epoch": 0.660934916578987, + "flos": 20638521532800.0, + "grad_norm": 1.8747370045388403, + "language_loss": 0.87962919, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.95648551, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.11053467, + "step": 10993, + "time_per_iteration": 2.5521185398101807 + }, + { + "auxiliary_loss_clip": 0.0642574, + "auxiliary_loss_mlp": 0.0126717, + "balance_loss_clip": 0.06281907, + "balance_loss_mlp": 0.01255434, + "epoch": 0.6609950398316549, + "flos": 25119692288640.0, + "grad_norm": 1.7537930651875573, + "language_loss": 0.67272747, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.74965656, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11737061, + "step": 10994, + "time_per_iteration": 2.6144933700561523 + }, + { + "auxiliary_loss_clip": 0.06417718, + "auxiliary_loss_mlp": 0.01264904, + "balance_loss_clip": 0.06279863, + "balance_loss_mlp": 0.01253812, + "epoch": 0.6610551630843229, + "flos": 18667155323520.0, + "grad_norm": 1.5859648112701323, + "language_loss": 0.77035165, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.84717792, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11096191, + "step": 10995, + "time_per_iteration": 2.5111653804779053 + }, + { + "auxiliary_loss_clip": 0.06421737, + "auxiliary_loss_mlp": 0.01276001, + "balance_loss_clip": 0.06282931, + "balance_loss_mlp": 0.01265868, + "epoch": 0.6611152863369908, + "flos": 23264885508480.0, + "grad_norm": 1.7748442712796604, + "language_loss": 0.74969876, + "learning_rate": 1.088359933123053e-06, + "loss": 0.82667613, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10137939, + "step": 10996, + "time_per_iteration": 2.5098516941070557 + }, + { + "auxiliary_loss_clip": 0.06418104, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06280057, + "balance_loss_mlp": 0.0125562, + "epoch": 0.6611754095896588, + "flos": 22165577867520.0, + "grad_norm": 1.6113039426712623, + "language_loss": 0.69186199, + "learning_rate": 1.088013301487126e-06, + "loss": 0.76870203, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10296631, + "step": 10997, + "time_per_iteration": 2.525808095932007 + }, + { + "auxiliary_loss_clip": 0.06421575, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06279309, + "balance_loss_mlp": 0.01254467, + "epoch": 0.6612355328423267, + "flos": 13996442632320.0, + "grad_norm": 1.959031062109239, + "language_loss": 0.68880165, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.76566797, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10601807, + "step": 10998, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.06313896, + "auxiliary_loss_mlp": 0.01252861, + "balance_loss_clip": 0.06257924, + "balance_loss_mlp": 0.01251496, + "epoch": 0.6612956560949947, + "flos": 61472051337600.0, + "grad_norm": 0.641819710963161, + "language_loss": 0.50997436, + "learning_rate": 1.087320141976297e-06, + "loss": 0.58564192, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01367188, + "step": 10999, + "time_per_iteration": 3.1182916164398193 + }, + { + "auxiliary_loss_clip": 0.06424031, + "auxiliary_loss_mlp": 0.01268354, + "balance_loss_clip": 0.06280085, + "balance_loss_mlp": 0.01257554, + "epoch": 0.6613557793476627, + "flos": 21623114534400.0, + "grad_norm": 2.559990275838241, + "language_loss": 0.70366681, + "learning_rate": 1.086973614127679e-06, + "loss": 0.78059065, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.10797119, + "step": 11000, + "time_per_iteration": 3.9581432342529297 + }, + { + "auxiliary_loss_clip": 0.06411293, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06276174, + "balance_loss_mlp": 0.01254523, + "epoch": 0.6614159026003307, + "flos": 34028379659520.0, + "grad_norm": 1.6165930596704574, + "language_loss": 0.65563923, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.73239553, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.0980835, + "step": 11001, + "time_per_iteration": 2.6200945377349854 + }, + { + "auxiliary_loss_clip": 0.06414855, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06279069, + "balance_loss_mlp": 0.01254207, + "epoch": 0.6614760258529987, + "flos": 24104100476160.0, + "grad_norm": 1.733561890110771, + "language_loss": 0.73266578, + "learning_rate": 1.086280662309739e-06, + "loss": 0.80945766, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10137939, + "step": 11002, + "time_per_iteration": 2.5620791912078857 + }, + { + "auxiliary_loss_clip": 0.06415205, + "auxiliary_loss_mlp": 0.01266083, + "balance_loss_clip": 0.06279428, + "balance_loss_mlp": 0.01255372, + "epoch": 0.6615361491056666, + "flos": 14909227084800.0, + "grad_norm": 2.451590701969631, + "language_loss": 0.79098624, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.86779916, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10705566, + "step": 11003, + "time_per_iteration": 2.481431007385254 + }, + { + "auxiliary_loss_clip": 0.06419842, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06279956, + "balance_loss_mlp": 0.01256449, + "epoch": 0.6615962723583346, + "flos": 15310337379840.0, + "grad_norm": 2.101443479539304, + "language_loss": 0.69193184, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.76880944, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11468506, + "step": 11004, + "time_per_iteration": 4.006279945373535 + }, + { + "auxiliary_loss_clip": 0.06422809, + "auxiliary_loss_mlp": 0.0127206, + "balance_loss_clip": 0.06279877, + "balance_loss_mlp": 0.01260741, + "epoch": 0.6616563956110025, + "flos": 18738293040000.0, + "grad_norm": 2.056452219231189, + "language_loss": 0.70325673, + "learning_rate": 1.085241494478132e-06, + "loss": 0.78020537, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11328125, + "step": 11005, + "time_per_iteration": 2.4944448471069336 + }, + { + "auxiliary_loss_clip": 0.06413882, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277984, + "balance_loss_mlp": 0.01256019, + "epoch": 0.6617165188636706, + "flos": 24501353483520.0, + "grad_norm": 1.5254702956902315, + "language_loss": 0.78776741, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.86457157, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10510254, + "step": 11006, + "time_per_iteration": 2.5451557636260986 + }, + { + "auxiliary_loss_clip": 0.06416766, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 0.06280621, + "balance_loss_mlp": 0.01256649, + "epoch": 0.6617766421163385, + "flos": 22385741270400.0, + "grad_norm": 1.834529140929997, + "language_loss": 0.76486355, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.84170276, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1050415, + "step": 11007, + "time_per_iteration": 2.5298049449920654 + }, + { + "auxiliary_loss_clip": 0.0641939, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06281586, + "balance_loss_mlp": 0.01255185, + "epoch": 0.6618367653690065, + "flos": 20856756291840.0, + "grad_norm": 1.4555215695175368, + "language_loss": 0.78606236, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.86291116, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10302734, + "step": 11008, + "time_per_iteration": 4.0146424770355225 + }, + { + "auxiliary_loss_clip": 0.06420049, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06275912, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6618968886216744, + "flos": 17718089253120.0, + "grad_norm": 1.6552311812920846, + "language_loss": 0.82077724, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.89762884, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.11114502, + "step": 11009, + "time_per_iteration": 2.532111883163452 + }, + { + "auxiliary_loss_clip": 0.06314184, + "auxiliary_loss_mlp": 0.01255522, + "balance_loss_clip": 0.06257774, + "balance_loss_mlp": 0.01254004, + "epoch": 0.6619570118743424, + "flos": 67054500305280.0, + "grad_norm": 0.9881156540659067, + "language_loss": 0.67673898, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.75243598, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01517487, + "step": 11010, + "time_per_iteration": 3.0648674964904785 + }, + { + "auxiliary_loss_clip": 0.06415196, + "auxiliary_loss_mlp": 0.01266404, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01254745, + "epoch": 0.6620171351270103, + "flos": 18666819907200.0, + "grad_norm": 1.5625294645604648, + "language_loss": 0.71682811, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.79364407, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11657715, + "step": 11011, + "time_per_iteration": 2.527869939804077 + }, + { + "auxiliary_loss_clip": 0.06418953, + "auxiliary_loss_mlp": 0.0126958, + "balance_loss_clip": 0.06281759, + "balance_loss_mlp": 0.01259548, + "epoch": 0.6620772583796783, + "flos": 24177376471680.0, + "grad_norm": 1.61722758281003, + "language_loss": 0.72627336, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.80315864, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10040283, + "step": 11012, + "time_per_iteration": 2.53691029548645 + }, + { + "auxiliary_loss_clip": 0.0640786, + "auxiliary_loss_mlp": 0.01270166, + "balance_loss_clip": 0.06279317, + "balance_loss_mlp": 0.01260903, + "epoch": 0.6621373816323463, + "flos": 23630385018240.0, + "grad_norm": 1.5542286383883441, + "language_loss": 0.79656094, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8733412, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09265137, + "step": 11013, + "time_per_iteration": 2.5782439708709717 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.062774, + "balance_loss_mlp": 0.01254973, + "epoch": 0.6621975048850143, + "flos": 18448123950720.0, + "grad_norm": 1.9713400088604554, + "language_loss": 0.70423663, + "learning_rate": 1.082125865538971e-06, + "loss": 0.78102177, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10266113, + "step": 11014, + "time_per_iteration": 2.474597454071045 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.01265368, + "balance_loss_clip": 0.06278192, + "balance_loss_mlp": 0.01256475, + "epoch": 0.6622576281376823, + "flos": 14069047795200.0, + "grad_norm": 1.5898800545059366, + "language_loss": 0.77497208, + "learning_rate": 1.081779858400137e-06, + "loss": 0.85174346, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08886719, + "step": 11015, + "time_per_iteration": 2.5123109817504883 + }, + { + "auxiliary_loss_clip": 0.06413803, + "auxiliary_loss_mlp": 0.01267289, + "balance_loss_clip": 0.06278028, + "balance_loss_mlp": 0.01256191, + "epoch": 0.6623177513903502, + "flos": 17024587735680.0, + "grad_norm": 1.7138462778054382, + "language_loss": 0.82368481, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.90049571, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11102295, + "step": 11016, + "time_per_iteration": 2.477137565612793 + }, + { + "auxiliary_loss_clip": 0.06418676, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06277445, + "balance_loss_mlp": 0.01253175, + "epoch": 0.6623778746430182, + "flos": 17276127292800.0, + "grad_norm": 2.159067097867079, + "language_loss": 0.70195687, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.77878135, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10583496, + "step": 11017, + "time_per_iteration": 2.5194361209869385 + }, + { + "auxiliary_loss_clip": 0.06414436, + "auxiliary_loss_mlp": 0.01267466, + "balance_loss_clip": 0.0627765, + "balance_loss_mlp": 0.01257101, + "epoch": 0.6624379978956861, + "flos": 48802725198720.0, + "grad_norm": 1.7089146920832974, + "language_loss": 0.77715868, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.85397768, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1036377, + "step": 11018, + "time_per_iteration": 2.7684452533721924 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.0126262, + "balance_loss_clip": 0.06278235, + "balance_loss_mlp": 0.01252714, + "epoch": 0.6624981211483542, + "flos": 18958330661760.0, + "grad_norm": 1.809730512167174, + "language_loss": 0.83465689, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.91142356, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09899902, + "step": 11019, + "time_per_iteration": 2.5207102298736572 + }, + { + "auxiliary_loss_clip": 0.06410275, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.0627672, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6625582444010221, + "flos": 23262998791680.0, + "grad_norm": 1.565039350749023, + "language_loss": 0.72290635, + "learning_rate": 1.080050345253328e-06, + "loss": 0.79964089, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09747314, + "step": 11020, + "time_per_iteration": 2.52868914604187 + }, + { + "auxiliary_loss_clip": 0.06419435, + "auxiliary_loss_mlp": 0.01268652, + "balance_loss_clip": 0.06276274, + "balance_loss_mlp": 0.01257601, + "epoch": 0.6626183676536901, + "flos": 21400770925440.0, + "grad_norm": 3.661943544447812, + "language_loss": 0.72194296, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.79882383, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11047363, + "step": 11021, + "time_per_iteration": 2.5214977264404297 + }, + { + "auxiliary_loss_clip": 0.06417044, + "auxiliary_loss_mlp": 0.01269377, + "balance_loss_clip": 0.06279403, + "balance_loss_mlp": 0.0125891, + "epoch": 0.662678490906358, + "flos": 14575984197120.0, + "grad_norm": 4.221661740882693, + "language_loss": 0.83307576, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.90993994, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10467529, + "step": 11022, + "time_per_iteration": 2.495877981185913 + }, + { + "auxiliary_loss_clip": 0.0642494, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06277288, + "balance_loss_mlp": 0.0125513, + "epoch": 0.662738614159026, + "flos": 15996962862720.0, + "grad_norm": 2.5511625457855116, + "language_loss": 0.73115802, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.80807984, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.12115479, + "step": 11023, + "time_per_iteration": 2.475238800048828 + }, + { + "auxiliary_loss_clip": 0.06413288, + "auxiliary_loss_mlp": 0.01270086, + "balance_loss_clip": 0.06276564, + "balance_loss_mlp": 0.01259327, + "epoch": 0.6627987374116939, + "flos": 19542358419840.0, + "grad_norm": 1.582084315278466, + "language_loss": 0.75136846, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.82820219, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10748291, + "step": 11024, + "time_per_iteration": 2.5104072093963623 + }, + { + "auxiliary_loss_clip": 0.06414796, + "auxiliary_loss_mlp": 0.01267042, + "balance_loss_clip": 0.06277162, + "balance_loss_mlp": 0.01256635, + "epoch": 0.662858860664362, + "flos": 15707800022400.0, + "grad_norm": 3.5687971531497236, + "language_loss": 0.70028591, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.77710426, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10406494, + "step": 11025, + "time_per_iteration": 2.528007745742798 + }, + { + "auxiliary_loss_clip": 0.06416678, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.06279378, + "balance_loss_mlp": 0.01255513, + "epoch": 0.6629189839170299, + "flos": 20160026392320.0, + "grad_norm": 1.3776452398710215, + "language_loss": 0.78906387, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8658914, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10565186, + "step": 11026, + "time_per_iteration": 2.5116465091705322 + }, + { + "auxiliary_loss_clip": 0.06413042, + "auxiliary_loss_mlp": 0.0126691, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01256759, + "epoch": 0.6629791071696979, + "flos": 20920430995200.0, + "grad_norm": 1.672126176860425, + "language_loss": 0.76636124, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.84316075, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1015625, + "step": 11027, + "time_per_iteration": 2.496917486190796 + }, + { + "auxiliary_loss_clip": 0.06414916, + "auxiliary_loss_mlp": 0.01265895, + "balance_loss_clip": 0.0627641, + "balance_loss_mlp": 0.01254708, + "epoch": 0.6630392304223659, + "flos": 20852647441920.0, + "grad_norm": 2.0836235208298115, + "language_loss": 0.70842957, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.78523767, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11193848, + "step": 11028, + "time_per_iteration": 2.5055668354034424 + }, + { + "auxiliary_loss_clip": 0.06413043, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06275769, + "balance_loss_mlp": 0.01254741, + "epoch": 0.6630993536750338, + "flos": 21002092398720.0, + "grad_norm": 1.9464575885295123, + "language_loss": 0.79627401, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.87305164, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09979248, + "step": 11029, + "time_per_iteration": 4.029799461364746 + }, + { + "auxiliary_loss_clip": 0.06414881, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06275269, + "balance_loss_mlp": 0.01255755, + "epoch": 0.6631594769277018, + "flos": 18264787217280.0, + "grad_norm": 2.0842184585841994, + "language_loss": 0.76459014, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.84141254, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1159668, + "step": 11030, + "time_per_iteration": 2.499678611755371 + }, + { + "auxiliary_loss_clip": 0.06420542, + "auxiliary_loss_mlp": 0.01269601, + "balance_loss_clip": 0.06277149, + "balance_loss_mlp": 0.01258426, + "epoch": 0.6632196001803697, + "flos": 17826053639040.0, + "grad_norm": 2.267864257363868, + "language_loss": 0.75185478, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.82875621, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11169434, + "step": 11031, + "time_per_iteration": 2.470355272293091 + }, + { + "auxiliary_loss_clip": 0.06414694, + "auxiliary_loss_mlp": 0.01264566, + "balance_loss_clip": 0.06273525, + "balance_loss_mlp": 0.0125342, + "epoch": 0.6632797234330378, + "flos": 12673910914560.0, + "grad_norm": 2.431299325405645, + "language_loss": 0.74500775, + "learning_rate": 1.075903075048228e-06, + "loss": 0.82180035, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11151123, + "step": 11032, + "time_per_iteration": 2.485921859741211 + }, + { + "auxiliary_loss_clip": 0.06407184, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01256296, + "epoch": 0.6633398466857057, + "flos": 23591168507520.0, + "grad_norm": 1.735276154326279, + "language_loss": 0.80570471, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.88244164, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10217285, + "step": 11033, + "time_per_iteration": 2.5526669025421143 + }, + { + "auxiliary_loss_clip": 0.0641445, + "auxiliary_loss_mlp": 0.01269108, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257497, + "epoch": 0.6633999699383737, + "flos": 20638018408320.0, + "grad_norm": 1.5867971062319928, + "language_loss": 0.80710161, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.88393718, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11608887, + "step": 11034, + "time_per_iteration": 2.5465288162231445 + }, + { + "auxiliary_loss_clip": 0.06408665, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06272399, + "balance_loss_mlp": 0.01256569, + "epoch": 0.6634600931910416, + "flos": 21803264812800.0, + "grad_norm": 1.6372739814417405, + "language_loss": 0.76400816, + "learning_rate": 1.074867045054166e-06, + "loss": 0.84075904, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09851074, + "step": 11035, + "time_per_iteration": 2.5024783611297607 + }, + { + "auxiliary_loss_clip": 0.06416409, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.01254648, + "epoch": 0.6635202164437096, + "flos": 18738628456320.0, + "grad_norm": 1.632864185122063, + "language_loss": 0.8277241, + "learning_rate": 1.074521771867622e-06, + "loss": 0.90453947, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10491943, + "step": 11036, + "time_per_iteration": 2.5380334854125977 + }, + { + "auxiliary_loss_clip": 0.06308148, + "auxiliary_loss_mlp": 0.01254977, + "balance_loss_clip": 0.06252232, + "balance_loss_mlp": 0.0125369, + "epoch": 0.6635803396963775, + "flos": 60242501324160.0, + "grad_norm": 0.7586749678323187, + "language_loss": 0.5225606, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.59819186, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01287842, + "step": 11037, + "time_per_iteration": 3.1442580223083496 + }, + { + "auxiliary_loss_clip": 0.06412059, + "auxiliary_loss_mlp": 0.01266845, + "balance_loss_clip": 0.06273833, + "balance_loss_mlp": 0.01255443, + "epoch": 0.6636404629490456, + "flos": 29174414088960.0, + "grad_norm": 1.6208815133420311, + "language_loss": 0.79116094, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.86795002, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11407471, + "step": 11038, + "time_per_iteration": 2.5753371715545654 + }, + { + "auxiliary_loss_clip": 0.06411879, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253542, + "epoch": 0.6637005862017135, + "flos": 38916530496000.0, + "grad_norm": 2.008253443704211, + "language_loss": 0.6435625, + "learning_rate": 1.073486162925716e-06, + "loss": 0.72032923, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11248779, + "step": 11039, + "time_per_iteration": 2.6589627265930176 + }, + { + "auxiliary_loss_clip": 0.06414853, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01254613, + "epoch": 0.6637607094543815, + "flos": 22789870312320.0, + "grad_norm": 2.5741405662525856, + "language_loss": 0.64139444, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.71819365, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10455322, + "step": 11040, + "time_per_iteration": 3.924652338027954 + }, + { + "auxiliary_loss_clip": 0.06410997, + "auxiliary_loss_mlp": 0.01267386, + "balance_loss_clip": 0.06275022, + "balance_loss_mlp": 0.01257909, + "epoch": 0.6638208327070495, + "flos": 18119996161920.0, + "grad_norm": 1.923413934429174, + "language_loss": 0.72439963, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.80118346, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 11041, + "time_per_iteration": 2.5356383323669434 + }, + { + "auxiliary_loss_clip": 0.06416036, + "auxiliary_loss_mlp": 0.01265882, + "balance_loss_clip": 0.06278844, + "balance_loss_mlp": 0.01255415, + "epoch": 0.6638809559597174, + "flos": 29432703899520.0, + "grad_norm": 2.049859271676146, + "language_loss": 0.61855423, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.69537336, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10473633, + "step": 11042, + "time_per_iteration": 2.664304256439209 + }, + { + "auxiliary_loss_clip": 0.06417962, + "auxiliary_loss_mlp": 0.012679, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256444, + "epoch": 0.6639410792123854, + "flos": 28079928057600.0, + "grad_norm": 1.8233607330526647, + "language_loss": 0.69058919, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.76744783, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.11462402, + "step": 11043, + "time_per_iteration": 4.0889365673065186 + }, + { + "auxiliary_loss_clip": 0.06404908, + "auxiliary_loss_mlp": 0.01269104, + "balance_loss_clip": 0.06272525, + "balance_loss_mlp": 0.01259818, + "epoch": 0.6640012024650533, + "flos": 25563373257600.0, + "grad_norm": 1.464057970327077, + "language_loss": 0.83693618, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.91367632, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09283447, + "step": 11044, + "time_per_iteration": 2.5765178203582764 + }, + { + "auxiliary_loss_clip": 0.0640911, + "auxiliary_loss_mlp": 0.01263885, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01253234, + "epoch": 0.6640613257177214, + "flos": 14872316561280.0, + "grad_norm": 2.273920138408825, + "language_loss": 0.69855309, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.77528304, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10650635, + "step": 11045, + "time_per_iteration": 2.475839376449585 + }, + { + "auxiliary_loss_clip": 0.06414758, + "auxiliary_loss_mlp": 0.01268834, + "balance_loss_clip": 0.06275514, + "balance_loss_mlp": 0.01258349, + "epoch": 0.6641214489703893, + "flos": 23227681495680.0, + "grad_norm": 1.3157905928087725, + "language_loss": 0.64253563, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.71937156, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10479736, + "step": 11046, + "time_per_iteration": 2.550718307495117 + }, + { + "auxiliary_loss_clip": 0.06412549, + "auxiliary_loss_mlp": 0.01265992, + "balance_loss_clip": 0.06275138, + "balance_loss_mlp": 0.01255859, + "epoch": 0.6641815722230573, + "flos": 37751661434880.0, + "grad_norm": 1.3902156312209348, + "language_loss": 0.71747851, + "learning_rate": 1.070726085914088e-06, + "loss": 0.79426396, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10137939, + "step": 11047, + "time_per_iteration": 2.6542744636535645 + }, + { + "auxiliary_loss_clip": 0.06412829, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01255226, + "epoch": 0.6642416954757252, + "flos": 17936910990720.0, + "grad_norm": 1.7027644321315345, + "language_loss": 0.77464539, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.8514322, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10632324, + "step": 11048, + "time_per_iteration": 3.896479606628418 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01254354, + "balance_loss_clip": 0.06255244, + "balance_loss_mlp": 0.01253094, + "epoch": 0.6643018187283932, + "flos": 52010712362880.0, + "grad_norm": 0.7347657101869507, + "language_loss": 0.55013496, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.62579298, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01259613, + "step": 11049, + "time_per_iteration": 3.139099359512329 + }, + { + "auxiliary_loss_clip": 0.06414302, + "auxiliary_loss_mlp": 0.01268369, + "balance_loss_clip": 0.06277852, + "balance_loss_mlp": 0.01258189, + "epoch": 0.6643619419810611, + "flos": 30234463292160.0, + "grad_norm": 1.5235184894534042, + "language_loss": 0.64387465, + "learning_rate": 1.069691638104648e-06, + "loss": 0.72070134, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10174561, + "step": 11050, + "time_per_iteration": 2.5815443992614746 + }, + { + "auxiliary_loss_clip": 0.06413838, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 0.06278379, + "balance_loss_mlp": 0.01254948, + "epoch": 0.6644220652337292, + "flos": 22972745848320.0, + "grad_norm": 1.9836199726179196, + "language_loss": 0.7914626, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.86825073, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 11051, + "time_per_iteration": 2.554255247116089 + }, + { + "auxiliary_loss_clip": 0.06413689, + "auxiliary_loss_mlp": 0.01267197, + "balance_loss_clip": 0.06275009, + "balance_loss_mlp": 0.01256778, + "epoch": 0.6644821884863971, + "flos": 21148602462720.0, + "grad_norm": 1.572752749022216, + "language_loss": 0.85833442, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.93514335, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10418701, + "step": 11052, + "time_per_iteration": 2.526331663131714 + }, + { + "auxiliary_loss_clip": 0.06422149, + "auxiliary_loss_mlp": 0.01265962, + "balance_loss_clip": 0.06279093, + "balance_loss_mlp": 0.01255108, + "epoch": 0.6645423117390651, + "flos": 20198907486720.0, + "grad_norm": 2.2521915942040134, + "language_loss": 0.75079048, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.82767153, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.10864258, + "step": 11053, + "time_per_iteration": 2.495643377304077 + }, + { + "auxiliary_loss_clip": 0.06411796, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06275838, + "balance_loss_mlp": 0.01257659, + "epoch": 0.6646024349917331, + "flos": 24358700707200.0, + "grad_norm": 1.4285282050820745, + "language_loss": 0.79548883, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.87228477, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 11054, + "time_per_iteration": 2.533238649368286 + }, + { + "auxiliary_loss_clip": 0.06410603, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 0.06275114, + "balance_loss_mlp": 0.01258848, + "epoch": 0.664662558244401, + "flos": 18812617211520.0, + "grad_norm": 1.7645551715374934, + "language_loss": 0.73951137, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.81630468, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09875488, + "step": 11055, + "time_per_iteration": 2.5263750553131104 + }, + { + "auxiliary_loss_clip": 0.0641698, + "auxiliary_loss_mlp": 0.01266606, + "balance_loss_clip": 0.06276543, + "balance_loss_mlp": 0.01255186, + "epoch": 0.664722681497069, + "flos": 18958749932160.0, + "grad_norm": 1.6799288466366076, + "language_loss": 0.72991651, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.80675244, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11431885, + "step": 11056, + "time_per_iteration": 2.4944491386413574 + }, + { + "auxiliary_loss_clip": 0.064121, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06275958, + "balance_loss_mlp": 0.01257508, + "epoch": 0.6647828047497369, + "flos": 19577046810240.0, + "grad_norm": 1.7319313014316244, + "language_loss": 0.69902766, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.77582735, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1036377, + "step": 11057, + "time_per_iteration": 2.5427403450012207 + }, + { + "auxiliary_loss_clip": 0.06416071, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01255298, + "epoch": 0.664842928002405, + "flos": 23156250289920.0, + "grad_norm": 1.6627595883052484, + "language_loss": 0.80624598, + "learning_rate": 1.066934663776291e-06, + "loss": 0.88306141, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10186768, + "step": 11058, + "time_per_iteration": 2.543358325958252 + }, + { + "auxiliary_loss_clip": 0.06310651, + "auxiliary_loss_mlp": 0.01251744, + "balance_loss_clip": 0.06254779, + "balance_loss_mlp": 0.01250295, + "epoch": 0.6649030512550729, + "flos": 65263326301440.0, + "grad_norm": 0.7825270857978761, + "language_loss": 0.6256783, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.70130229, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01447296, + "step": 11059, + "time_per_iteration": 3.081268548965454 + }, + { + "auxiliary_loss_clip": 0.0641288, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01255549, + "epoch": 0.6649631745077409, + "flos": 20201213473920.0, + "grad_norm": 1.6475331375538982, + "language_loss": 0.79008389, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.86687315, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1048584, + "step": 11060, + "time_per_iteration": 2.5021138191223145 + }, + { + "auxiliary_loss_clip": 0.06418125, + "auxiliary_loss_mlp": 0.0126778, + "balance_loss_clip": 0.06280607, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6650232977604088, + "flos": 17244331868160.0, + "grad_norm": 2.2525334751718358, + "language_loss": 0.79225111, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.86911017, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10510254, + "step": 11061, + "time_per_iteration": 2.4997215270996094 + }, + { + "auxiliary_loss_clip": 0.06414805, + "auxiliary_loss_mlp": 0.01266652, + "balance_loss_clip": 0.06278637, + "balance_loss_mlp": 0.01256102, + "epoch": 0.6650834210130768, + "flos": 10010175217920.0, + "grad_norm": 1.965420807772364, + "language_loss": 0.57191408, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.64872867, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10552979, + "step": 11062, + "time_per_iteration": 2.457599401473999 + }, + { + "auxiliary_loss_clip": 0.06419773, + "auxiliary_loss_mlp": 0.01266686, + "balance_loss_clip": 0.06275927, + "balance_loss_mlp": 0.01254533, + "epoch": 0.6651435442657447, + "flos": 10456707225600.0, + "grad_norm": 2.498798138431811, + "language_loss": 0.76121116, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.83807576, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.121521, + "step": 11063, + "time_per_iteration": 2.5354268550872803 + }, + { + "auxiliary_loss_clip": 0.06417998, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.06278798, + "balance_loss_mlp": 0.0125724, + "epoch": 0.6652036675184128, + "flos": 22350465901440.0, + "grad_norm": 2.2315353157370836, + "language_loss": 0.708628, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.78547704, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09655762, + "step": 11064, + "time_per_iteration": 2.601271390914917 + }, + { + "auxiliary_loss_clip": 0.06307759, + "auxiliary_loss_mlp": 0.01252714, + "balance_loss_clip": 0.06251188, + "balance_loss_mlp": 0.01251267, + "epoch": 0.6652637907710807, + "flos": 52925467386240.0, + "grad_norm": 0.8269137521288277, + "language_loss": 0.62977844, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.70538318, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01445007, + "step": 11065, + "time_per_iteration": 3.088651180267334 + }, + { + "auxiliary_loss_clip": 0.06417314, + "auxiliary_loss_mlp": 0.01265582, + "balance_loss_clip": 0.06277956, + "balance_loss_mlp": 0.01255091, + "epoch": 0.6653239140237487, + "flos": 23110031963520.0, + "grad_norm": 1.7770048566161585, + "language_loss": 0.62216848, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.69899738, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10491943, + "step": 11066, + "time_per_iteration": 2.514662981033325 + }, + { + "auxiliary_loss_clip": 0.06416589, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06276087, + "balance_loss_mlp": 0.01256123, + "epoch": 0.6653840372764167, + "flos": 25966747612800.0, + "grad_norm": 1.500590710166923, + "language_loss": 0.70431817, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.78115141, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1060791, + "step": 11067, + "time_per_iteration": 2.629611015319824 + }, + { + "auxiliary_loss_clip": 0.06312129, + "auxiliary_loss_mlp": 0.01250999, + "balance_loss_clip": 0.06255849, + "balance_loss_mlp": 0.01249609, + "epoch": 0.6654441605290846, + "flos": 66059593251840.0, + "grad_norm": 0.8851345245048583, + "language_loss": 0.71944451, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.79507577, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01391602, + "step": 11068, + "time_per_iteration": 3.1097211837768555 + }, + { + "auxiliary_loss_clip": 0.06313328, + "auxiliary_loss_mlp": 0.01253328, + "balance_loss_clip": 0.0625675, + "balance_loss_mlp": 0.01251991, + "epoch": 0.6655042837817526, + "flos": 65218560693120.0, + "grad_norm": 0.7108385158391787, + "language_loss": 0.577793, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.65345955, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01338959, + "step": 11069, + "time_per_iteration": 4.7683820724487305 + }, + { + "auxiliary_loss_clip": 0.0631298, + "auxiliary_loss_mlp": 0.01252294, + "balance_loss_clip": 0.0625658, + "balance_loss_mlp": 0.01250911, + "epoch": 0.6655644070344205, + "flos": 69028759480320.0, + "grad_norm": 0.7328423376388431, + "language_loss": 0.63529485, + "learning_rate": 1.062803450204029e-06, + "loss": 0.71094757, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01386261, + "step": 11070, + "time_per_iteration": 3.218775749206543 + }, + { + "auxiliary_loss_clip": 0.06412843, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06274422, + "balance_loss_mlp": 0.01253668, + "epoch": 0.6656245302870886, + "flos": 36323680953600.0, + "grad_norm": 1.5647890242278204, + "language_loss": 0.58715665, + "learning_rate": 1.062459413096116e-06, + "loss": 0.66392684, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1050415, + "step": 11071, + "time_per_iteration": 2.6759583950042725 + }, + { + "auxiliary_loss_clip": 0.06415486, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.01256544, + "epoch": 0.6656846535397565, + "flos": 21800623409280.0, + "grad_norm": 1.6094882760656495, + "language_loss": 0.7278558, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.80467808, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10192871, + "step": 11072, + "time_per_iteration": 2.506439685821533 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.01266315, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01255729, + "epoch": 0.6657447767924245, + "flos": 37496683860480.0, + "grad_norm": 1.9931671493726393, + "language_loss": 0.70538545, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.78214943, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10583496, + "step": 11073, + "time_per_iteration": 2.687361240386963 + }, + { + "auxiliary_loss_clip": 0.06420862, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06278072, + "balance_loss_mlp": 0.01251353, + "epoch": 0.6658049000450924, + "flos": 16843473135360.0, + "grad_norm": 1.8042269767870909, + "language_loss": 0.5659616, + "learning_rate": 1.061427515134354e-06, + "loss": 0.64279079, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10699463, + "step": 11074, + "time_per_iteration": 2.476226568222046 + }, + { + "auxiliary_loss_clip": 0.06415518, + "auxiliary_loss_mlp": 0.01268741, + "balance_loss_clip": 0.06278802, + "balance_loss_mlp": 0.01258417, + "epoch": 0.6658650232977604, + "flos": 33519430759680.0, + "grad_norm": 1.4700349170865334, + "language_loss": 0.72126347, + "learning_rate": 1.061083620311235e-06, + "loss": 0.79810607, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10321045, + "step": 11075, + "time_per_iteration": 2.655700922012329 + }, + { + "auxiliary_loss_clip": 0.06410009, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.06274687, + "balance_loss_mlp": 0.01254983, + "epoch": 0.6659251465504283, + "flos": 37715379816960.0, + "grad_norm": 1.432398272569416, + "language_loss": 0.66657937, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.7433266, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 11076, + "time_per_iteration": 2.66424822807312 + }, + { + "auxiliary_loss_clip": 0.06411892, + "auxiliary_loss_mlp": 0.01265269, + "balance_loss_clip": 0.06275803, + "balance_loss_mlp": 0.01254433, + "epoch": 0.6659852698030964, + "flos": 24899277323520.0, + "grad_norm": 1.6226979142446254, + "language_loss": 0.75448096, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.83125257, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11077, + "time_per_iteration": 2.5727341175079346 + }, + { + "auxiliary_loss_clip": 0.06412426, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06273601, + "balance_loss_mlp": 0.01252631, + "epoch": 0.6660453930557643, + "flos": 24359706956160.0, + "grad_norm": 1.8442117034793826, + "language_loss": 0.66886055, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.74561661, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10546875, + "step": 11078, + "time_per_iteration": 2.543839931488037 + }, + { + "auxiliary_loss_clip": 0.06421163, + "auxiliary_loss_mlp": 0.01268494, + "balance_loss_clip": 0.06279247, + "balance_loss_mlp": 0.01257533, + "epoch": 0.6661055163084323, + "flos": 10602420675840.0, + "grad_norm": 1.9694934778902873, + "language_loss": 0.69631219, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.77320874, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10961914, + "step": 11079, + "time_per_iteration": 2.541069269180298 + }, + { + "auxiliary_loss_clip": 0.06411281, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06274305, + "balance_loss_mlp": 0.01255067, + "epoch": 0.6661656395611003, + "flos": 24063751935360.0, + "grad_norm": 2.893983796141558, + "language_loss": 0.80461812, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.88138527, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10375977, + "step": 11080, + "time_per_iteration": 4.085668087005615 + }, + { + "auxiliary_loss_clip": 0.06407166, + "auxiliary_loss_mlp": 0.01263859, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01254114, + "epoch": 0.6662257628137682, + "flos": 23042332264320.0, + "grad_norm": 1.7166684069014877, + "language_loss": 0.78285092, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.85956115, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09729004, + "step": 11081, + "time_per_iteration": 2.5193705558776855 + }, + { + "auxiliary_loss_clip": 0.06415745, + "auxiliary_loss_mlp": 0.01265653, + "balance_loss_clip": 0.06274147, + "balance_loss_mlp": 0.01254364, + "epoch": 0.6662858860664362, + "flos": 24761446156800.0, + "grad_norm": 1.6242146726224216, + "language_loss": 0.80530953, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.88212347, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.11291504, + "step": 11082, + "time_per_iteration": 2.569957971572876 + }, + { + "auxiliary_loss_clip": 0.0641424, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06276894, + "balance_loss_mlp": 0.01255757, + "epoch": 0.6663460093191041, + "flos": 20014899920640.0, + "grad_norm": 1.3932549437891448, + "language_loss": 0.83467507, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.91147482, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09979248, + "step": 11083, + "time_per_iteration": 3.9742698669433594 + }, + { + "auxiliary_loss_clip": 0.06423122, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06280323, + "balance_loss_mlp": 0.01254995, + "epoch": 0.6664061325717722, + "flos": 17827101815040.0, + "grad_norm": 2.1194460311014023, + "language_loss": 0.85585803, + "learning_rate": 1.057990170638731e-06, + "loss": 0.93274969, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11053467, + "step": 11084, + "time_per_iteration": 2.4959633350372314 + }, + { + "auxiliary_loss_clip": 0.0642017, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06277794, + "balance_loss_mlp": 0.0125434, + "epoch": 0.6664662558244401, + "flos": 18082666368000.0, + "grad_norm": 2.6259945452160185, + "language_loss": 0.73187411, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.80872643, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.1072998, + "step": 11085, + "time_per_iteration": 2.475743055343628 + }, + { + "auxiliary_loss_clip": 0.06412315, + "auxiliary_loss_mlp": 0.01264882, + "balance_loss_clip": 0.06275545, + "balance_loss_mlp": 0.01253718, + "epoch": 0.6665263790771081, + "flos": 21579663392640.0, + "grad_norm": 1.7551532896089992, + "language_loss": 0.80931759, + "learning_rate": 1.057303129975894e-06, + "loss": 0.88608956, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11157227, + "step": 11086, + "time_per_iteration": 2.537797689437866 + }, + { + "auxiliary_loss_clip": 0.06411488, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01257079, + "epoch": 0.666586502329776, + "flos": 24213448454400.0, + "grad_norm": 1.98835460832662, + "language_loss": 0.7529, + "learning_rate": 1.056959663258702e-06, + "loss": 0.82969105, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11087, + "time_per_iteration": 2.5238702297210693 + }, + { + "auxiliary_loss_clip": 0.06414294, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 0.06277943, + "balance_loss_mlp": 0.01253621, + "epoch": 0.666646625582444, + "flos": 22207100365440.0, + "grad_norm": 1.5295252788179032, + "language_loss": 0.65136206, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.72814775, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10656738, + "step": 11088, + "time_per_iteration": 3.9619038105010986 + }, + { + "auxiliary_loss_clip": 0.06416193, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06277834, + "balance_loss_mlp": 0.01255753, + "epoch": 0.6667067488351119, + "flos": 18265835393280.0, + "grad_norm": 1.9855105228277763, + "language_loss": 0.64599085, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.72281867, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1083374, + "step": 11089, + "time_per_iteration": 2.5900728702545166 + }, + { + "auxiliary_loss_clip": 0.06409112, + "auxiliary_loss_mlp": 0.01265636, + "balance_loss_clip": 0.06274208, + "balance_loss_mlp": 0.01255313, + "epoch": 0.66676687208778, + "flos": 17241983953920.0, + "grad_norm": 2.1106067212474704, + "language_loss": 0.81439161, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.89113915, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10314941, + "step": 11090, + "time_per_iteration": 2.4597456455230713 + }, + { + "auxiliary_loss_clip": 0.06415623, + "auxiliary_loss_mlp": 0.01266415, + "balance_loss_clip": 0.06274828, + "balance_loss_mlp": 0.01255877, + "epoch": 0.6668269953404479, + "flos": 19757742140160.0, + "grad_norm": 1.8443713907824004, + "language_loss": 0.7767818, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.85360217, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10540771, + "step": 11091, + "time_per_iteration": 2.5587215423583984 + }, + { + "auxiliary_loss_clip": 0.06415166, + "auxiliary_loss_mlp": 0.01267323, + "balance_loss_clip": 0.06277118, + "balance_loss_mlp": 0.01256487, + "epoch": 0.6668871185931159, + "flos": 20564700485760.0, + "grad_norm": 3.5971234891656265, + "language_loss": 0.79227078, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.86909568, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10827637, + "step": 11092, + "time_per_iteration": 2.4899661540985107 + }, + { + "auxiliary_loss_clip": 0.06313632, + "auxiliary_loss_mlp": 0.01258221, + "balance_loss_clip": 0.06257559, + "balance_loss_mlp": 0.0125709, + "epoch": 0.6669472418457839, + "flos": 58104458144640.0, + "grad_norm": 0.7522047627769642, + "language_loss": 0.57524383, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.65096241, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01133728, + "step": 11093, + "time_per_iteration": 3.147273540496826 + }, + { + "auxiliary_loss_clip": 0.06411624, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06275775, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6670073650984518, + "flos": 26071860960000.0, + "grad_norm": 1.491694696645918, + "language_loss": 0.76499665, + "learning_rate": 1.054556398252703e-06, + "loss": 0.84176457, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10125732, + "step": 11094, + "time_per_iteration": 2.654946804046631 + }, + { + "auxiliary_loss_clip": 0.06412062, + "auxiliary_loss_mlp": 0.01267472, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01256349, + "epoch": 0.6670674883511198, + "flos": 32425196290560.0, + "grad_norm": 1.786455566216807, + "language_loss": 0.73555851, + "learning_rate": 1.05421321798155e-06, + "loss": 0.81235385, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11132812, + "step": 11095, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.06414741, + "auxiliary_loss_mlp": 0.01270593, + "balance_loss_clip": 0.06277339, + "balance_loss_mlp": 0.01260145, + "epoch": 0.6671276116037878, + "flos": 18043114440960.0, + "grad_norm": 1.9034949183118532, + "language_loss": 0.73389214, + "learning_rate": 1.053870073574727e-06, + "loss": 0.81074548, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10437012, + "step": 11096, + "time_per_iteration": 2.5232880115509033 + }, + { + "auxiliary_loss_clip": 0.06407115, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06273691, + "balance_loss_mlp": 0.01257419, + "epoch": 0.6671877348564558, + "flos": 23773498992000.0, + "grad_norm": 1.8900040408751917, + "language_loss": 0.64173019, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.71847701, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10144043, + "step": 11097, + "time_per_iteration": 2.53245210647583 + }, + { + "auxiliary_loss_clip": 0.06414811, + "auxiliary_loss_mlp": 0.01270626, + "balance_loss_clip": 0.06272861, + "balance_loss_mlp": 0.01259939, + "epoch": 0.6672478581091237, + "flos": 20923869012480.0, + "grad_norm": 1.7889953519105342, + "language_loss": 0.76164997, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.83850437, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.10681152, + "step": 11098, + "time_per_iteration": 2.5418834686279297 + }, + { + "auxiliary_loss_clip": 0.0641548, + "auxiliary_loss_mlp": 0.01271314, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01260675, + "epoch": 0.6673079813617917, + "flos": 27863328453120.0, + "grad_norm": 1.4249693183378689, + "language_loss": 0.74138522, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.81825316, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10632324, + "step": 11099, + "time_per_iteration": 2.6019399166107178 + }, + { + "auxiliary_loss_clip": 0.06409659, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06275995, + "balance_loss_mlp": 0.01254722, + "epoch": 0.6673681046144596, + "flos": 21623366096640.0, + "grad_norm": 1.7662195801139693, + "language_loss": 0.78545117, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.86219656, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11100, + "time_per_iteration": 2.681669235229492 + }, + { + "auxiliary_loss_clip": 0.06412613, + "auxiliary_loss_mlp": 0.01264451, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01254247, + "epoch": 0.6674282278671276, + "flos": 20896727489280.0, + "grad_norm": 1.8459209339693166, + "language_loss": 0.60927689, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.68604755, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10211182, + "step": 11101, + "time_per_iteration": 2.53374981880188 + }, + { + "auxiliary_loss_clip": 0.06421657, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06276177, + "balance_loss_mlp": 0.01254276, + "epoch": 0.6674883511197955, + "flos": 23631139704960.0, + "grad_norm": 1.6188105594216948, + "language_loss": 0.7136634, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.79054427, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.121521, + "step": 11102, + "time_per_iteration": 2.572932481765747 + }, + { + "auxiliary_loss_clip": 0.06414107, + "auxiliary_loss_mlp": 0.01266311, + "balance_loss_clip": 0.06274531, + "balance_loss_mlp": 0.01256041, + "epoch": 0.6675484743724636, + "flos": 19615760196480.0, + "grad_norm": 1.3319232732101594, + "language_loss": 0.84587741, + "learning_rate": 1.051469068021034e-06, + "loss": 0.92268157, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.1027832, + "step": 11103, + "time_per_iteration": 2.5075833797454834 + }, + { + "auxiliary_loss_clip": 0.06411143, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01254482, + "epoch": 0.6676085976251315, + "flos": 14324696202240.0, + "grad_norm": 1.9260757560792952, + "language_loss": 0.78627831, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.86302686, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.09222412, + "step": 11104, + "time_per_iteration": 2.5494680404663086 + }, + { + "auxiliary_loss_clip": 0.06418018, + "auxiliary_loss_mlp": 0.01267231, + "balance_loss_clip": 0.0627483, + "balance_loss_mlp": 0.01256531, + "epoch": 0.6676687208777995, + "flos": 38113219802880.0, + "grad_norm": 1.3963666193820934, + "language_loss": 0.58238858, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.65924108, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.10693359, + "step": 11105, + "time_per_iteration": 2.6544291973114014 + }, + { + "auxiliary_loss_clip": 0.06419846, + "auxiliary_loss_mlp": 0.01267664, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01255921, + "epoch": 0.6677288441304675, + "flos": 23987331411840.0, + "grad_norm": 1.4856417680447878, + "language_loss": 0.72987849, + "learning_rate": 1.0504406049066e-06, + "loss": 0.80675358, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.11749268, + "step": 11106, + "time_per_iteration": 2.591508150100708 + }, + { + "auxiliary_loss_clip": 0.06410738, + "auxiliary_loss_mlp": 0.01269876, + "balance_loss_clip": 0.06272997, + "balance_loss_mlp": 0.01259392, + "epoch": 0.6677889673831354, + "flos": 24177586106880.0, + "grad_norm": 1.6277621549569181, + "language_loss": 0.76611882, + "learning_rate": 1.0500978558659e-06, + "loss": 0.84292495, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.1048584, + "step": 11107, + "time_per_iteration": 2.5117390155792236 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 0.06275569, + "balance_loss_mlp": 0.01262364, + "epoch": 0.6678490906358034, + "flos": 22316196781440.0, + "grad_norm": 2.1688615595462033, + "language_loss": 0.90383065, + "learning_rate": 1.049755142845583e-06, + "loss": 0.98063028, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09765625, + "step": 11108, + "time_per_iteration": 3.940439224243164 + }, + { + "auxiliary_loss_clip": 0.06408696, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01254499, + "epoch": 0.6679092138884714, + "flos": 36906870170880.0, + "grad_norm": 1.379580541372803, + "language_loss": 0.82916903, + "learning_rate": 1.049412465858646e-06, + "loss": 0.90589124, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09020996, + "step": 11109, + "time_per_iteration": 2.6550536155700684 + }, + { + "auxiliary_loss_clip": 0.06415845, + "auxiliary_loss_mlp": 0.01269099, + "balance_loss_clip": 0.06276993, + "balance_loss_mlp": 0.01257869, + "epoch": 0.6679693371411394, + "flos": 18156151998720.0, + "grad_norm": 1.7439527968582467, + "language_loss": 0.69522661, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.77207607, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11236572, + "step": 11110, + "time_per_iteration": 2.505737543106079 + }, + { + "auxiliary_loss_clip": 0.06418422, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06277301, + "balance_loss_mlp": 0.01255886, + "epoch": 0.6680294603938073, + "flos": 27205437720960.0, + "grad_norm": 1.4770947447978742, + "language_loss": 0.73935318, + "learning_rate": 1.04872722003689e-06, + "loss": 0.81621397, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11767578, + "step": 11111, + "time_per_iteration": 2.6036081314086914 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01267643, + "balance_loss_clip": 0.06276079, + "balance_loss_mlp": 0.01257266, + "epoch": 0.6680895836464753, + "flos": 21731665898880.0, + "grad_norm": 1.7721381481924603, + "language_loss": 0.65662813, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.73343134, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10375977, + "step": 11112, + "time_per_iteration": 2.5148162841796875 + }, + { + "auxiliary_loss_clip": 0.06408017, + "auxiliary_loss_mlp": 0.01264862, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6681497068991432, + "flos": 19652628792960.0, + "grad_norm": 2.188254018589407, + "language_loss": 0.63796169, + "learning_rate": 1.048042118504569e-06, + "loss": 0.71469045, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10357666, + "step": 11113, + "time_per_iteration": 2.5091605186462402 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.0126667, + "balance_loss_clip": 0.06274618, + "balance_loss_mlp": 0.01257008, + "epoch": 0.6682098301518112, + "flos": 17424649854720.0, + "grad_norm": 1.7204263321571711, + "language_loss": 0.65997386, + "learning_rate": 1.047699621879422e-06, + "loss": 0.73672217, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 11114, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.06406785, + "auxiliary_loss_mlp": 0.01265665, + "balance_loss_clip": 0.06270755, + "balance_loss_mlp": 0.01255378, + "epoch": 0.6682699534044791, + "flos": 22605191913600.0, + "grad_norm": 1.4259756578870375, + "language_loss": 0.78704619, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.86377072, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10290527, + "step": 11115, + "time_per_iteration": 2.544543504714966 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01266412, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01256703, + "epoch": 0.6683300766571472, + "flos": 24870668353920.0, + "grad_norm": 1.896886529208747, + "language_loss": 0.79640424, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.87317395, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.09716797, + "step": 11116, + "time_per_iteration": 2.5271427631378174 + }, + { + "auxiliary_loss_clip": 0.06415811, + "auxiliary_loss_mlp": 0.01274733, + "balance_loss_clip": 0.06276368, + "balance_loss_mlp": 0.01263891, + "epoch": 0.6683901999098151, + "flos": 27134132296320.0, + "grad_norm": 1.70831438842013, + "language_loss": 0.79465652, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.871562, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10852051, + "step": 11117, + "time_per_iteration": 2.5867950916290283 + }, + { + "auxiliary_loss_clip": 0.06413716, + "auxiliary_loss_mlp": 0.01269769, + "balance_loss_clip": 0.06274913, + "balance_loss_mlp": 0.01258147, + "epoch": 0.6684503231624831, + "flos": 20745018472320.0, + "grad_norm": 1.68089949787921, + "language_loss": 0.65774792, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.73458278, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1161499, + "step": 11118, + "time_per_iteration": 2.5065219402313232 + }, + { + "auxiliary_loss_clip": 0.06409101, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06272688, + "balance_loss_mlp": 0.01256426, + "epoch": 0.668510446415151, + "flos": 21768618349440.0, + "grad_norm": 1.4670277033373609, + "language_loss": 0.69327927, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.77004153, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10705566, + "step": 11119, + "time_per_iteration": 3.9497127532958984 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01253155, + "epoch": 0.668570569667819, + "flos": 30199229850240.0, + "grad_norm": 1.557441143928688, + "language_loss": 0.67133182, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.74807668, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10015869, + "step": 11120, + "time_per_iteration": 2.583557605743408 + }, + { + "auxiliary_loss_clip": 0.06409501, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.0125551, + "epoch": 0.668630692920487, + "flos": 24177544179840.0, + "grad_norm": 1.6997365737566905, + "language_loss": 0.72227985, + "learning_rate": 1.045303157347638e-06, + "loss": 0.79904002, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10998535, + "step": 11121, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.06415744, + "auxiliary_loss_mlp": 0.01268909, + "balance_loss_clip": 0.06275598, + "balance_loss_mlp": 0.01258442, + "epoch": 0.668690816173155, + "flos": 17462902043520.0, + "grad_norm": 2.410576654010779, + "language_loss": 0.70488191, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.78172839, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10467529, + "step": 11122, + "time_per_iteration": 2.480928897857666 + }, + { + "auxiliary_loss_clip": 0.06412323, + "auxiliary_loss_mlp": 0.01265084, + "balance_loss_clip": 0.06273821, + "balance_loss_mlp": 0.01254683, + "epoch": 0.668750939425823, + "flos": 25011350559360.0, + "grad_norm": 1.579363869036545, + "language_loss": 0.71597642, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.79275048, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10406494, + "step": 11123, + "time_per_iteration": 3.993523597717285 + }, + { + "auxiliary_loss_clip": 0.06416023, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.06275098, + "balance_loss_mlp": 0.01256713, + "epoch": 0.6688110626784909, + "flos": 24103513497600.0, + "grad_norm": 1.6918402194537734, + "language_loss": 0.79247653, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.86931467, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11090088, + "step": 11124, + "time_per_iteration": 2.5730183124542236 + }, + { + "auxiliary_loss_clip": 0.06414519, + "auxiliary_loss_mlp": 0.0126539, + "balance_loss_clip": 0.06277663, + "balance_loss_mlp": 0.01255496, + "epoch": 0.6688711859311589, + "flos": 21765515748480.0, + "grad_norm": 1.8258374996153537, + "language_loss": 0.74714315, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.8239423, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09899902, + "step": 11125, + "time_per_iteration": 2.586688995361328 + }, + { + "auxiliary_loss_clip": 0.06414272, + "auxiliary_loss_mlp": 0.01265114, + "balance_loss_clip": 0.06274511, + "balance_loss_mlp": 0.01254802, + "epoch": 0.6689313091838268, + "flos": 22936254595200.0, + "grad_norm": 1.821756692805589, + "language_loss": 0.66474277, + "learning_rate": 1.043592482774116e-06, + "loss": 0.74153662, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10314941, + "step": 11126, + "time_per_iteration": 2.5671706199645996 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01267353, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256774, + "epoch": 0.6689914324364948, + "flos": 20892367077120.0, + "grad_norm": 1.6855233783346146, + "language_loss": 0.71609974, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.79290259, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10577393, + "step": 11127, + "time_per_iteration": 3.9430463314056396 + }, + { + "auxiliary_loss_clip": 0.06417182, + "auxiliary_loss_mlp": 0.01267327, + "balance_loss_clip": 0.06273168, + "balance_loss_mlp": 0.01255841, + "epoch": 0.6690515556891627, + "flos": 22754972286720.0, + "grad_norm": 1.8544786849615413, + "language_loss": 0.80330718, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.88015223, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.11486816, + "step": 11128, + "time_per_iteration": 2.545502185821533 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255582, + "epoch": 0.6691116789418308, + "flos": 23338203431040.0, + "grad_norm": 1.7840790291668756, + "language_loss": 0.81335264, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.89014482, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10742188, + "step": 11129, + "time_per_iteration": 2.5280702114105225 + }, + { + "auxiliary_loss_clip": 0.06404583, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06271389, + "balance_loss_mlp": 0.01254972, + "epoch": 0.6691718021944987, + "flos": 32454308384640.0, + "grad_norm": 1.6197681941265856, + "language_loss": 0.70428884, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.7809816, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.097229, + "step": 11130, + "time_per_iteration": 2.578578233718872 + }, + { + "auxiliary_loss_clip": 0.06406342, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627137, + "balance_loss_mlp": 0.0125301, + "epoch": 0.6692319254471667, + "flos": 23738223623040.0, + "grad_norm": 1.529399392054523, + "language_loss": 0.70701146, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.78370404, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 11131, + "time_per_iteration": 2.537551164627075 + }, + { + "auxiliary_loss_clip": 0.06414618, + "auxiliary_loss_mlp": 0.01266754, + "balance_loss_clip": 0.06274183, + "balance_loss_mlp": 0.01255906, + "epoch": 0.6692920486998346, + "flos": 14432996004480.0, + "grad_norm": 2.3888765741874645, + "language_loss": 0.65664881, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.73346257, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10858154, + "step": 11132, + "time_per_iteration": 2.45595383644104 + }, + { + "auxiliary_loss_clip": 0.06414949, + "auxiliary_loss_mlp": 0.0126617, + "balance_loss_clip": 0.06275167, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6693521719525026, + "flos": 21513976191360.0, + "grad_norm": 1.5662057284927036, + "language_loss": 0.74730015, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.82411134, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11340332, + "step": 11133, + "time_per_iteration": 2.5248849391937256 + }, + { + "auxiliary_loss_clip": 0.06419569, + "auxiliary_loss_mlp": 0.01271511, + "balance_loss_clip": 0.06276593, + "balance_loss_mlp": 0.01259412, + "epoch": 0.6694122952051706, + "flos": 25413341322240.0, + "grad_norm": 3.5912228691538757, + "language_loss": 0.66650522, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.74341607, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.12097168, + "step": 11134, + "time_per_iteration": 2.556043863296509 + }, + { + "auxiliary_loss_clip": 0.06424067, + "auxiliary_loss_mlp": 0.01264606, + "balance_loss_clip": 0.06279507, + "balance_loss_mlp": 0.01253294, + "epoch": 0.6694724184578386, + "flos": 25668067334400.0, + "grad_norm": 1.7597980858171118, + "language_loss": 0.77272904, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.84961575, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.11309814, + "step": 11135, + "time_per_iteration": 2.572221279144287 + }, + { + "auxiliary_loss_clip": 0.06406624, + "auxiliary_loss_mlp": 0.0126679, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.01256079, + "epoch": 0.6695325417105066, + "flos": 17714567381760.0, + "grad_norm": 1.4860361528198607, + "language_loss": 0.74150556, + "learning_rate": 1.040173855277898e-06, + "loss": 0.81823969, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1071167, + "step": 11136, + "time_per_iteration": 2.482616662979126 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06277363, + "balance_loss_mlp": 0.01253814, + "epoch": 0.6695926649631745, + "flos": 24466581239040.0, + "grad_norm": 1.5006390680612098, + "language_loss": 0.622679, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.69954294, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1114502, + "step": 11137, + "time_per_iteration": 2.60404109954834 + }, + { + "auxiliary_loss_clip": 0.06413136, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0627469, + "balance_loss_mlp": 0.01258535, + "epoch": 0.6696527882158425, + "flos": 24287059866240.0, + "grad_norm": 1.73693802973788, + "language_loss": 0.66198957, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.73881459, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1083374, + "step": 11138, + "time_per_iteration": 2.5446555614471436 + }, + { + "auxiliary_loss_clip": 0.06407638, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.01255009, + "epoch": 0.6697129114685104, + "flos": 23009404809600.0, + "grad_norm": 1.563215252926209, + "language_loss": 0.73026919, + "learning_rate": 1.039148976175053e-06, + "loss": 0.80699301, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09741211, + "step": 11139, + "time_per_iteration": 2.5669844150543213 + }, + { + "auxiliary_loss_clip": 0.06403776, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01256326, + "epoch": 0.6697730347211784, + "flos": 22644743840640.0, + "grad_norm": 1.6502373859256334, + "language_loss": 0.70972526, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.78642654, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10015869, + "step": 11140, + "time_per_iteration": 2.524345874786377 + }, + { + "auxiliary_loss_clip": 0.06414337, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01254103, + "epoch": 0.6698331579738463, + "flos": 28884915832320.0, + "grad_norm": 1.9955464769525513, + "language_loss": 0.75788713, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.83467978, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1083374, + "step": 11141, + "time_per_iteration": 2.610853433609009 + }, + { + "auxiliary_loss_clip": 0.06411906, + "auxiliary_loss_mlp": 0.01271137, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01260337, + "epoch": 0.6698932812265144, + "flos": 24213993505920.0, + "grad_norm": 1.7317387192226181, + "language_loss": 0.82309425, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.8999247, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10803223, + "step": 11142, + "time_per_iteration": 2.5797901153564453 + }, + { + "auxiliary_loss_clip": 0.0640756, + "auxiliary_loss_mlp": 0.01269267, + "balance_loss_clip": 0.06271559, + "balance_loss_mlp": 0.01258556, + "epoch": 0.6699534044791823, + "flos": 22096704211200.0, + "grad_norm": 1.4627194343759278, + "language_loss": 0.70282012, + "learning_rate": 1.037782980862959e-06, + "loss": 0.77958834, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1071167, + "step": 11143, + "time_per_iteration": 2.543877601623535 + }, + { + "auxiliary_loss_clip": 0.06405188, + "auxiliary_loss_mlp": 0.01262215, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01252577, + "epoch": 0.6700135277318503, + "flos": 25199466975360.0, + "grad_norm": 1.4915968751654103, + "language_loss": 0.70360661, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.78028065, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09637451, + "step": 11144, + "time_per_iteration": 2.5488550662994385 + }, + { + "auxiliary_loss_clip": 0.06411098, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06275296, + "balance_loss_mlp": 0.01253735, + "epoch": 0.6700736509845182, + "flos": 23446838649600.0, + "grad_norm": 1.6240872047460435, + "language_loss": 0.74927717, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.82603747, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11193848, + "step": 11145, + "time_per_iteration": 2.542711019515991 + }, + { + "auxiliary_loss_clip": 0.06415901, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06274743, + "balance_loss_mlp": 0.01254405, + "epoch": 0.6701337742371862, + "flos": 24396952896000.0, + "grad_norm": 1.5772021074008409, + "language_loss": 0.71292794, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.7897411, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11146, + "time_per_iteration": 2.5397775173187256 + }, + { + "auxiliary_loss_clip": 0.06402436, + "auxiliary_loss_mlp": 0.01264562, + "balance_loss_clip": 0.06271266, + "balance_loss_mlp": 0.0125459, + "epoch": 0.6701938974898543, + "flos": 14798956711680.0, + "grad_norm": 2.075971191875419, + "language_loss": 0.78937066, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.86604059, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09979248, + "step": 11147, + "time_per_iteration": 2.521651029586792 + }, + { + "auxiliary_loss_clip": 0.06408454, + "auxiliary_loss_mlp": 0.01268691, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.0125801, + "epoch": 0.6702540207425222, + "flos": 20159690976000.0, + "grad_norm": 1.9550194289938683, + "language_loss": 0.70223355, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.77900505, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10687256, + "step": 11148, + "time_per_iteration": 4.084912300109863 + }, + { + "auxiliary_loss_clip": 0.06407622, + "auxiliary_loss_mlp": 0.01263909, + "balance_loss_clip": 0.06271225, + "balance_loss_mlp": 0.01253991, + "epoch": 0.6703141439951902, + "flos": 21220369084800.0, + "grad_norm": 1.6593895437552093, + "language_loss": 0.70494747, + "learning_rate": 1.035735082774636e-06, + "loss": 0.78166282, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09918213, + "step": 11149, + "time_per_iteration": 2.532682418823242 + }, + { + "auxiliary_loss_clip": 0.06408584, + "auxiliary_loss_mlp": 0.0126327, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01253245, + "epoch": 0.6703742672478581, + "flos": 23119255912320.0, + "grad_norm": 2.1651783548168124, + "language_loss": 0.73744798, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.81416655, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10028076, + "step": 11150, + "time_per_iteration": 2.591546058654785 + }, + { + "auxiliary_loss_clip": 0.06414528, + "auxiliary_loss_mlp": 0.01264123, + "balance_loss_clip": 0.06276007, + "balance_loss_mlp": 0.01253829, + "epoch": 0.6704343905005261, + "flos": 22535563570560.0, + "grad_norm": 1.9523081475406603, + "language_loss": 0.78322434, + "learning_rate": 1.035052742460671e-06, + "loss": 0.86001086, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10296631, + "step": 11151, + "time_per_iteration": 2.536759853363037 + }, + { + "auxiliary_loss_clip": 0.06307358, + "auxiliary_loss_mlp": 0.01251405, + "balance_loss_clip": 0.06251603, + "balance_loss_mlp": 0.01250013, + "epoch": 0.670494513753194, + "flos": 64815270192000.0, + "grad_norm": 0.7758908798936945, + "language_loss": 0.55567682, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.63126445, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0139389, + "step": 11152, + "time_per_iteration": 3.201535224914551 + }, + { + "auxiliary_loss_clip": 0.06410956, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06271775, + "balance_loss_mlp": 0.01254815, + "epoch": 0.670554637005862, + "flos": 23517892512000.0, + "grad_norm": 1.915770962366586, + "language_loss": 0.81010997, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.88686949, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10186768, + "step": 11153, + "time_per_iteration": 2.537212371826172 + }, + { + "auxiliary_loss_clip": 0.06411768, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.01253822, + "epoch": 0.67061476025853, + "flos": 19469417840640.0, + "grad_norm": 1.508737872634347, + "language_loss": 0.76268411, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.83944541, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10534668, + "step": 11154, + "time_per_iteration": 2.558519124984741 + }, + { + "auxiliary_loss_clip": 0.06415759, + "auxiliary_loss_mlp": 0.01269836, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01259304, + "epoch": 0.670674883511198, + "flos": 20525903245440.0, + "grad_norm": 3.082678767747609, + "language_loss": 0.76461852, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.84147453, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10534668, + "step": 11155, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.06407665, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.0627, + "balance_loss_mlp": 0.01256187, + "epoch": 0.6707350067638659, + "flos": 25491061584000.0, + "grad_norm": 2.1059181531121873, + "language_loss": 0.82157421, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.89830995, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09716797, + "step": 11156, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.06406271, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06271681, + "balance_loss_mlp": 0.01255706, + "epoch": 0.6707951300165339, + "flos": 22280040944640.0, + "grad_norm": 1.7628533784510112, + "language_loss": 0.74903405, + "learning_rate": 1.033006600114165e-06, + "loss": 0.82574838, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09460449, + "step": 11157, + "time_per_iteration": 2.5089879035949707 + }, + { + "auxiliary_loss_clip": 0.06412502, + "auxiliary_loss_mlp": 0.01267451, + "balance_loss_clip": 0.06273752, + "balance_loss_mlp": 0.01256919, + "epoch": 0.6708552532692018, + "flos": 23990853283200.0, + "grad_norm": 1.6697268751930758, + "language_loss": 0.74289936, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.81969893, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10528564, + "step": 11158, + "time_per_iteration": 2.5533461570739746 + }, + { + "auxiliary_loss_clip": 0.06413293, + "auxiliary_loss_mlp": 0.01263254, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01253449, + "epoch": 0.6709153765218698, + "flos": 24944657109120.0, + "grad_norm": 1.5416620862644819, + "language_loss": 0.81707746, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.89384294, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.0980835, + "step": 11159, + "time_per_iteration": 4.040963649749756 + }, + { + "auxiliary_loss_clip": 0.06412386, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 0.06273866, + "balance_loss_mlp": 0.01258986, + "epoch": 0.6709754997745379, + "flos": 17536010330880.0, + "grad_norm": 1.5609798446772174, + "language_loss": 0.7718569, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.84866846, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.09783936, + "step": 11160, + "time_per_iteration": 2.4715282917022705 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01263422, + "balance_loss_clip": 0.06271639, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6710356230272058, + "flos": 22097416970880.0, + "grad_norm": 1.6605543467204091, + "language_loss": 0.73893428, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.81563139, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09020996, + "step": 11161, + "time_per_iteration": 2.5761518478393555 + }, + { + "auxiliary_loss_clip": 0.0641313, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06271637, + "balance_loss_mlp": 0.01257874, + "epoch": 0.6710957462798738, + "flos": 24213238819200.0, + "grad_norm": 1.698475212339427, + "language_loss": 0.68223077, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.75904596, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10522461, + "step": 11162, + "time_per_iteration": 4.0347349643707275 + }, + { + "auxiliary_loss_clip": 0.06406809, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01255367, + "epoch": 0.6711558695325417, + "flos": 19099138648320.0, + "grad_norm": 1.6208038414483141, + "language_loss": 0.70270795, + "learning_rate": 1.030961777833032e-06, + "loss": 0.77943039, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10064697, + "step": 11163, + "time_per_iteration": 2.4880189895629883 + }, + { + "auxiliary_loss_clip": 0.06402589, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.0125383, + "epoch": 0.6712159927852097, + "flos": 25565134193280.0, + "grad_norm": 1.5352927814280746, + "language_loss": 0.75905788, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.8357113, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.08929443, + "step": 11164, + "time_per_iteration": 2.5312371253967285 + }, + { + "auxiliary_loss_clip": 0.06409736, + "auxiliary_loss_mlp": 0.01265492, + "balance_loss_clip": 0.06273673, + "balance_loss_mlp": 0.01254907, + "epoch": 0.6712761160378776, + "flos": 22234032253440.0, + "grad_norm": 2.0741329798372408, + "language_loss": 0.65590626, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10595703, + "step": 11165, + "time_per_iteration": 2.5017032623291016 + }, + { + "auxiliary_loss_clip": 0.06407681, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01254444, + "epoch": 0.6713362392905456, + "flos": 22462077939840.0, + "grad_norm": 1.8809222742523355, + "language_loss": 0.71774828, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.79446959, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 11166, + "time_per_iteration": 2.50738787651062 + }, + { + "auxiliary_loss_clip": 0.06404926, + "auxiliary_loss_mlp": 0.01262643, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.0125282, + "epoch": 0.6713963625432136, + "flos": 25637362012800.0, + "grad_norm": 1.8955119453047675, + "language_loss": 0.77147096, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.84814668, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09814453, + "step": 11167, + "time_per_iteration": 3.929837942123413 + }, + { + "auxiliary_loss_clip": 0.06410499, + "auxiliary_loss_mlp": 0.01266509, + "balance_loss_clip": 0.06272188, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6714564857958816, + "flos": 35015110940160.0, + "grad_norm": 1.8086126039126507, + "language_loss": 0.68893099, + "learning_rate": 1.029258769662629e-06, + "loss": 0.76570106, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10437012, + "step": 11168, + "time_per_iteration": 2.6505095958709717 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.012578, + "epoch": 0.6715166090485495, + "flos": 26286028796160.0, + "grad_norm": 1.7287934282524213, + "language_loss": 0.73465478, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.81148595, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11358643, + "step": 11169, + "time_per_iteration": 2.5538253784179688 + }, + { + "auxiliary_loss_clip": 0.06410573, + "auxiliary_loss_mlp": 0.01265262, + "balance_loss_clip": 0.0627141, + "balance_loss_mlp": 0.01254706, + "epoch": 0.6715767323012175, + "flos": 15929556652800.0, + "grad_norm": 1.9811109571628822, + "language_loss": 0.76329374, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.84005201, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10565186, + "step": 11170, + "time_per_iteration": 2.5357441902160645 + }, + { + "auxiliary_loss_clip": 0.06412025, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272931, + "balance_loss_mlp": 0.01254964, + "epoch": 0.6716368555538854, + "flos": 17496835747200.0, + "grad_norm": 1.8551997359651162, + "language_loss": 0.74972916, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.82650542, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10626221, + "step": 11171, + "time_per_iteration": 2.4740569591522217 + }, + { + "auxiliary_loss_clip": 0.06413123, + "auxiliary_loss_mlp": 0.01262691, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.01252344, + "epoch": 0.6716969788065534, + "flos": 16766759122560.0, + "grad_norm": 1.4543204322223777, + "language_loss": 0.86493564, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.94169378, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10345459, + "step": 11172, + "time_per_iteration": 2.5120010375976562 + }, + { + "auxiliary_loss_clip": 0.06408751, + "auxiliary_loss_mlp": 0.01266926, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256763, + "epoch": 0.6717571020592215, + "flos": 22716216973440.0, + "grad_norm": 2.0454540055069863, + "language_loss": 0.63633478, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.71309155, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10168457, + "step": 11173, + "time_per_iteration": 2.49975848197937 + }, + { + "auxiliary_loss_clip": 0.06422, + "auxiliary_loss_mlp": 0.01271665, + "balance_loss_clip": 0.06275772, + "balance_loss_mlp": 0.01260549, + "epoch": 0.6718172253118894, + "flos": 18740053975680.0, + "grad_norm": 4.441337622220845, + "language_loss": 0.71819955, + "learning_rate": 1.02721637475002e-06, + "loss": 0.79513621, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.11108398, + "step": 11174, + "time_per_iteration": 2.483900547027588 + }, + { + "auxiliary_loss_clip": 0.06401111, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06269203, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6718773485645574, + "flos": 15637920117120.0, + "grad_norm": 1.9560679016643376, + "language_loss": 0.69026506, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.76692557, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09472656, + "step": 11175, + "time_per_iteration": 2.463592767715454 + }, + { + "auxiliary_loss_clip": 0.06406569, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01255341, + "epoch": 0.6719374718172253, + "flos": 19360908403200.0, + "grad_norm": 1.7117830890697936, + "language_loss": 0.74226189, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.8189795, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09844971, + "step": 11176, + "time_per_iteration": 2.5074222087860107 + }, + { + "auxiliary_loss_clip": 0.06410944, + "auxiliary_loss_mlp": 0.0126684, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.0125654, + "epoch": 0.6719975950698933, + "flos": 21987817430400.0, + "grad_norm": 2.8444182697169014, + "language_loss": 0.73030323, + "learning_rate": 1.026195675108182e-06, + "loss": 0.80708104, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10296631, + "step": 11177, + "time_per_iteration": 2.4807181358337402 + }, + { + "auxiliary_loss_clip": 0.06411102, + "auxiliary_loss_mlp": 0.01268926, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6720577183225612, + "flos": 25235035833600.0, + "grad_norm": 2.1466059593233755, + "language_loss": 0.76338404, + "learning_rate": 1.025855515730551e-06, + "loss": 0.84018433, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10772705, + "step": 11178, + "time_per_iteration": 2.5277843475341797 + }, + { + "auxiliary_loss_clip": 0.06410985, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255007, + "epoch": 0.6721178415752292, + "flos": 16951479448320.0, + "grad_norm": 1.7634405951154783, + "language_loss": 0.70127761, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.77803409, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09643555, + "step": 11179, + "time_per_iteration": 2.4638893604278564 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06269027, + "balance_loss_mlp": 0.01256077, + "epoch": 0.6721779648278972, + "flos": 21547448697600.0, + "grad_norm": 1.4326115817211162, + "language_loss": 0.74262661, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.81931782, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09393311, + "step": 11180, + "time_per_iteration": 2.5094285011291504 + }, + { + "auxiliary_loss_clip": 0.0640661, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01252843, + "epoch": 0.6722380880805652, + "flos": 22612696853760.0, + "grad_norm": 1.3575184211837767, + "language_loss": 0.75178289, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.82848167, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10424805, + "step": 11181, + "time_per_iteration": 2.5373446941375732 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.0126461, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01254615, + "epoch": 0.6722982113332331, + "flos": 15930856391040.0, + "grad_norm": 2.2936660091873597, + "language_loss": 0.75133812, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.82810551, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09991455, + "step": 11182, + "time_per_iteration": 2.5146076679229736 + }, + { + "auxiliary_loss_clip": 0.06407333, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01256535, + "epoch": 0.6723583345859011, + "flos": 20602659185280.0, + "grad_norm": 1.7825231183024703, + "language_loss": 0.69884634, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.77558148, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09643555, + "step": 11183, + "time_per_iteration": 2.510972499847412 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.06270228, + "balance_loss_mlp": 0.01258234, + "epoch": 0.672418457838569, + "flos": 21732294804480.0, + "grad_norm": 1.4388499153565433, + "language_loss": 0.78377849, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.8605392, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09985352, + "step": 11184, + "time_per_iteration": 2.5102083683013916 + }, + { + "auxiliary_loss_clip": 0.06418785, + "auxiliary_loss_mlp": 0.01263963, + "balance_loss_clip": 0.06274929, + "balance_loss_mlp": 0.01253305, + "epoch": 0.672478581091237, + "flos": 21476772178560.0, + "grad_norm": 2.087218631508525, + "language_loss": 0.66671652, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.74354398, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.10668945, + "step": 11185, + "time_per_iteration": 2.4922776222229004 + }, + { + "auxiliary_loss_clip": 0.06405509, + "auxiliary_loss_mlp": 0.01264604, + "balance_loss_clip": 0.06269497, + "balance_loss_mlp": 0.01253905, + "epoch": 0.6725387043439051, + "flos": 30854646887040.0, + "grad_norm": 3.8783146360767518, + "language_loss": 0.80847633, + "learning_rate": 1.023135571620345e-06, + "loss": 0.88517749, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 11186, + "time_per_iteration": 2.650069236755371 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01268075, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01258753, + "epoch": 0.672598827596573, + "flos": 24061949072640.0, + "grad_norm": 1.3182024269377546, + "language_loss": 0.807257, + "learning_rate": 1.022795745163813e-06, + "loss": 0.88399297, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09320068, + "step": 11187, + "time_per_iteration": 2.5736026763916016 + }, + { + "auxiliary_loss_clip": 0.06414247, + "auxiliary_loss_mlp": 0.01266802, + "balance_loss_clip": 0.06271032, + "balance_loss_mlp": 0.01255996, + "epoch": 0.672658950849241, + "flos": 21878343671040.0, + "grad_norm": 1.7328673404989177, + "language_loss": 0.71004307, + "learning_rate": 1.022455955762965e-06, + "loss": 0.78685355, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1081543, + "step": 11188, + "time_per_iteration": 3.9358599185943604 + }, + { + "auxiliary_loss_clip": 0.06400838, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06269124, + "balance_loss_mlp": 0.01255364, + "epoch": 0.6727190741019089, + "flos": 23228855452800.0, + "grad_norm": 1.7513555431786316, + "language_loss": 0.75587308, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.83253086, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09576416, + "step": 11189, + "time_per_iteration": 2.558595895767212 + }, + { + "auxiliary_loss_clip": 0.06412518, + "auxiliary_loss_mlp": 0.01264313, + "balance_loss_clip": 0.06271306, + "balance_loss_mlp": 0.01252762, + "epoch": 0.6727791973545769, + "flos": 15784052837760.0, + "grad_norm": 2.0872354058578186, + "language_loss": 0.75281942, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.8295877, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.11547852, + "step": 11190, + "time_per_iteration": 2.465223550796509 + }, + { + "auxiliary_loss_clip": 0.06406397, + "auxiliary_loss_mlp": 0.01267439, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01256788, + "epoch": 0.6728393206072448, + "flos": 21255937943040.0, + "grad_norm": 1.3785573959073936, + "language_loss": 0.76754856, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.84428692, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10650635, + "step": 11191, + "time_per_iteration": 2.519883155822754 + }, + { + "auxiliary_loss_clip": 0.06406602, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01256509, + "epoch": 0.6728994438599128, + "flos": 32131295694720.0, + "grad_norm": 1.5727699537163, + "language_loss": 0.86438018, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.94110769, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09637451, + "step": 11192, + "time_per_iteration": 2.589451789855957 + }, + { + "auxiliary_loss_clip": 0.06414255, + "auxiliary_loss_mlp": 0.0126838, + "balance_loss_clip": 0.06275576, + "balance_loss_mlp": 0.01256948, + "epoch": 0.6729595671125808, + "flos": 23119046277120.0, + "grad_norm": 2.0400596637632997, + "language_loss": 0.76247764, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.83930409, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11437988, + "step": 11193, + "time_per_iteration": 2.569079637527466 + }, + { + "auxiliary_loss_clip": 0.06408816, + "auxiliary_loss_mlp": 0.01268779, + "balance_loss_clip": 0.06272899, + "balance_loss_mlp": 0.0125802, + "epoch": 0.6730196903652488, + "flos": 14616710081280.0, + "grad_norm": 1.7886354434370773, + "language_loss": 0.78477633, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.86155224, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11194, + "time_per_iteration": 2.501262664794922 + }, + { + "auxiliary_loss_clip": 0.06410375, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01253889, + "epoch": 0.6730798136179167, + "flos": 21112320844800.0, + "grad_norm": 1.7894428961307616, + "language_loss": 0.90123671, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.97798121, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10192871, + "step": 11195, + "time_per_iteration": 2.529911994934082 + }, + { + "auxiliary_loss_clip": 0.06404506, + "auxiliary_loss_mlp": 0.01267592, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257119, + "epoch": 0.6731399368705847, + "flos": 28993886467200.0, + "grad_norm": 1.9634861378348352, + "language_loss": 0.72801971, + "learning_rate": 1.019738976106662e-06, + "loss": 0.80474073, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10467529, + "step": 11196, + "time_per_iteration": 2.5403385162353516 + }, + { + "auxiliary_loss_clip": 0.06306562, + "auxiliary_loss_mlp": 0.01254217, + "balance_loss_clip": 0.06250267, + "balance_loss_mlp": 0.01253061, + "epoch": 0.6732000601232526, + "flos": 64763643277440.0, + "grad_norm": 0.755157348431284, + "language_loss": 0.56539071, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.64099848, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01152802, + "step": 11197, + "time_per_iteration": 3.103764295578003 + }, + { + "auxiliary_loss_clip": 0.06400825, + "auxiliary_loss_mlp": 0.01269132, + "balance_loss_clip": 0.06270334, + "balance_loss_mlp": 0.01259316, + "epoch": 0.6732601833759206, + "flos": 17207337490560.0, + "grad_norm": 1.957045035118017, + "language_loss": 0.76133382, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.83803332, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09814453, + "step": 11198, + "time_per_iteration": 2.4750118255615234 + }, + { + "auxiliary_loss_clip": 0.06411158, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01253492, + "epoch": 0.6733203066285887, + "flos": 18664430065920.0, + "grad_norm": 2.5858701419359185, + "language_loss": 0.81900644, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.89576292, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11010742, + "step": 11199, + "time_per_iteration": 3.915224075317383 + }, + { + "auxiliary_loss_clip": 0.06408331, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06268819, + "balance_loss_mlp": 0.01257566, + "epoch": 0.6733804298812566, + "flos": 35818128144000.0, + "grad_norm": 1.7377353958720951, + "language_loss": 0.71924305, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.79600847, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10650635, + "step": 11200, + "time_per_iteration": 2.6547374725341797 + }, + { + "auxiliary_loss_clip": 0.06413474, + "auxiliary_loss_mlp": 0.01270012, + "balance_loss_clip": 0.06276008, + "balance_loss_mlp": 0.01259581, + "epoch": 0.6734405531339246, + "flos": 61651545511680.0, + "grad_norm": 1.525289564934158, + "language_loss": 0.64700097, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.72383583, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10437012, + "step": 11201, + "time_per_iteration": 2.884462356567383 + }, + { + "auxiliary_loss_clip": 0.06414636, + "auxiliary_loss_mlp": 0.01267107, + "balance_loss_clip": 0.06272763, + "balance_loss_mlp": 0.01255466, + "epoch": 0.6735006763865925, + "flos": 20528670430080.0, + "grad_norm": 1.5117322786205176, + "language_loss": 0.63124895, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.7080664, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.11639404, + "step": 11202, + "time_per_iteration": 3.9962854385375977 + }, + { + "auxiliary_loss_clip": 0.06410715, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.06272809, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6735607996392605, + "flos": 13924172885760.0, + "grad_norm": 1.7265240314624624, + "language_loss": 0.75169051, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.82848436, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10473633, + "step": 11203, + "time_per_iteration": 2.4805357456207275 + }, + { + "auxiliary_loss_clip": 0.06417318, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.0627423, + "balance_loss_mlp": 0.01254979, + "epoch": 0.6736209228919284, + "flos": 18813246117120.0, + "grad_norm": 2.5086879815410996, + "language_loss": 0.6739623, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.75080359, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11834717, + "step": 11204, + "time_per_iteration": 2.5092830657958984 + }, + { + "auxiliary_loss_clip": 0.06417938, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06275398, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6736810461445965, + "flos": 20378890056960.0, + "grad_norm": 1.4739361265515354, + "language_loss": 0.74145937, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.81831586, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.11151123, + "step": 11205, + "time_per_iteration": 2.5575578212738037 + }, + { + "auxiliary_loss_clip": 0.06405318, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06271175, + "balance_loss_mlp": 0.0126027, + "epoch": 0.6737411693972644, + "flos": 30015264211200.0, + "grad_norm": 1.4826905039931084, + "language_loss": 0.71781552, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.79457194, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10058594, + "step": 11206, + "time_per_iteration": 4.010627031326294 + }, + { + "auxiliary_loss_clip": 0.0641677, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.0125648, + "epoch": 0.6738012926499324, + "flos": 25454402622720.0, + "grad_norm": 2.885338634405065, + "language_loss": 0.67620468, + "learning_rate": 1.016007014855092e-06, + "loss": 0.75304735, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11016846, + "step": 11207, + "time_per_iteration": 2.5686817169189453 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01268865, + "balance_loss_clip": 0.06272342, + "balance_loss_mlp": 0.01258672, + "epoch": 0.6738614159026003, + "flos": 20783102952960.0, + "grad_norm": 2.0413352600750145, + "language_loss": 0.74134195, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.81807256, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10186768, + "step": 11208, + "time_per_iteration": 2.4913690090179443 + }, + { + "auxiliary_loss_clip": 0.06408808, + "auxiliary_loss_mlp": 0.01269437, + "balance_loss_clip": 0.06270136, + "balance_loss_mlp": 0.01257432, + "epoch": 0.6739215391552683, + "flos": 19571931711360.0, + "grad_norm": 1.741711609442522, + "language_loss": 0.75868964, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.83547217, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11999512, + "step": 11209, + "time_per_iteration": 2.494077444076538 + }, + { + "auxiliary_loss_clip": 0.06402588, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.062707, + "balance_loss_mlp": 0.01255042, + "epoch": 0.6739816624079362, + "flos": 24394898471040.0, + "grad_norm": 1.8799682247559513, + "language_loss": 0.66601419, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.74269128, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10064697, + "step": 11210, + "time_per_iteration": 2.531925916671753 + }, + { + "auxiliary_loss_clip": 0.06400777, + "auxiliary_loss_mlp": 0.0126575, + "balance_loss_clip": 0.06269025, + "balance_loss_mlp": 0.01256297, + "epoch": 0.6740417856606042, + "flos": 22534683102720.0, + "grad_norm": 3.725779709718602, + "language_loss": 0.8045913, + "learning_rate": 1.014651056529377e-06, + "loss": 0.88125658, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09454346, + "step": 11211, + "time_per_iteration": 2.546027898788452 + }, + { + "auxiliary_loss_clip": 0.06403598, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06271007, + "balance_loss_mlp": 0.01256208, + "epoch": 0.6741019089132723, + "flos": 25782530411520.0, + "grad_norm": 1.3057254169112946, + "language_loss": 0.76753151, + "learning_rate": 1.014312160327143e-06, + "loss": 0.84422737, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09777832, + "step": 11212, + "time_per_iteration": 2.542628049850464 + }, + { + "auxiliary_loss_clip": 0.06409732, + "auxiliary_loss_mlp": 0.01268637, + "balance_loss_clip": 0.06270209, + "balance_loss_mlp": 0.01257539, + "epoch": 0.6741620321659402, + "flos": 21112027355520.0, + "grad_norm": 1.7288185495326422, + "language_loss": 0.78622723, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.86301088, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11108398, + "step": 11213, + "time_per_iteration": 2.553414821624756 + }, + { + "auxiliary_loss_clip": 0.06413242, + "auxiliary_loss_mlp": 0.01267804, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01256789, + "epoch": 0.6742221554186082, + "flos": 20746653626880.0, + "grad_norm": 1.7499991393106977, + "language_loss": 0.6779902, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.75480068, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11016846, + "step": 11214, + "time_per_iteration": 2.4924774169921875 + }, + { + "auxiliary_loss_clip": 0.06411138, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06271094, + "balance_loss_mlp": 0.01256907, + "epoch": 0.6742822786712761, + "flos": 37782366756480.0, + "grad_norm": 1.5348832786859372, + "language_loss": 0.73044717, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.8072269, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.0993042, + "step": 11215, + "time_per_iteration": 2.6919710636138916 + }, + { + "auxiliary_loss_clip": 0.06411563, + "auxiliary_loss_mlp": 0.01266913, + "balance_loss_clip": 0.06272543, + "balance_loss_mlp": 0.0125653, + "epoch": 0.6743424019239441, + "flos": 37272118118400.0, + "grad_norm": 1.6783781241391482, + "language_loss": 0.66716719, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.74395192, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1038208, + "step": 11216, + "time_per_iteration": 2.6457085609436035 + }, + { + "auxiliary_loss_clip": 0.06304459, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06248666, + "balance_loss_mlp": 0.012498, + "epoch": 0.674402525176612, + "flos": 66020152377600.0, + "grad_norm": 0.6583920548662452, + "language_loss": 0.56272531, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.63828307, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01516724, + "step": 11217, + "time_per_iteration": 3.2267727851867676 + }, + { + "auxiliary_loss_clip": 0.064037, + "auxiliary_loss_mlp": 0.01266203, + "balance_loss_clip": 0.06268451, + "balance_loss_mlp": 0.01255939, + "epoch": 0.67446264842928, + "flos": 26467143396480.0, + "grad_norm": 1.8797709757007424, + "language_loss": 0.74946856, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.82616764, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1026001, + "step": 11218, + "time_per_iteration": 2.5534565448760986 + }, + { + "auxiliary_loss_clip": 0.06412031, + "auxiliary_loss_mlp": 0.01268347, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01257189, + "epoch": 0.674522771681948, + "flos": 23739146017920.0, + "grad_norm": 1.571619211134611, + "language_loss": 0.6640991, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.74090284, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1116333, + "step": 11219, + "time_per_iteration": 2.5408942699432373 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_clip": 0.062702, + "balance_loss_mlp": 0.01257935, + "epoch": 0.674582894934616, + "flos": 24761320375680.0, + "grad_norm": 1.6133708722293332, + "language_loss": 0.75378865, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.83057231, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11090088, + "step": 11220, + "time_per_iteration": 2.556192398071289 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265502, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01254988, + "epoch": 0.6746430181872839, + "flos": 24833506268160.0, + "grad_norm": 1.5601512803843804, + "language_loss": 0.70583248, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.78254962, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10522461, + "step": 11221, + "time_per_iteration": 2.538742780685425 + }, + { + "auxiliary_loss_clip": 0.06408031, + "auxiliary_loss_mlp": 0.01264539, + "balance_loss_clip": 0.06272538, + "balance_loss_mlp": 0.01255032, + "epoch": 0.6747031414399519, + "flos": 16879167774720.0, + "grad_norm": 2.089456373953198, + "language_loss": 0.58824384, + "learning_rate": 1.010925256180498e-06, + "loss": 0.66496956, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09509277, + "step": 11222, + "time_per_iteration": 2.5625038146972656 + }, + { + "auxiliary_loss_clip": 0.06411393, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.0627331, + "balance_loss_mlp": 0.01255, + "epoch": 0.6747632646926198, + "flos": 22791715102080.0, + "grad_norm": 1.7403006489773343, + "language_loss": 0.76732111, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.84409571, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11065674, + "step": 11223, + "time_per_iteration": 2.499220132827759 + }, + { + "auxiliary_loss_clip": 0.06407724, + "auxiliary_loss_mlp": 0.01267921, + "balance_loss_clip": 0.06270097, + "balance_loss_mlp": 0.01257854, + "epoch": 0.6748233879452878, + "flos": 20052020079360.0, + "grad_norm": 1.8418495567149014, + "language_loss": 0.75473273, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.83148926, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10064697, + "step": 11224, + "time_per_iteration": 2.5515925884246826 + }, + { + "auxiliary_loss_clip": 0.06404493, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06271124, + "balance_loss_mlp": 0.01254289, + "epoch": 0.6748835111979558, + "flos": 23009488663680.0, + "grad_norm": 1.6780430249692133, + "language_loss": 0.63333517, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.7100122, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0892334, + "step": 11225, + "time_per_iteration": 2.5058155059814453 + }, + { + "auxiliary_loss_clip": 0.0639993, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.012541, + "epoch": 0.6749436344506238, + "flos": 12201201705600.0, + "grad_norm": 1.7347966506914976, + "language_loss": 0.64211845, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.71875006, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09130859, + "step": 11226, + "time_per_iteration": 2.5148916244506836 + }, + { + "auxiliary_loss_clip": 0.06412213, + "auxiliary_loss_mlp": 0.01266854, + "balance_loss_clip": 0.06273121, + "balance_loss_mlp": 0.01256256, + "epoch": 0.6750037577032918, + "flos": 11878356723840.0, + "grad_norm": 2.584638628864584, + "language_loss": 0.72339863, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.80018932, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10595703, + "step": 11227, + "time_per_iteration": 2.4601356983184814 + }, + { + "auxiliary_loss_clip": 0.06406709, + "auxiliary_loss_mlp": 0.01263943, + "balance_loss_clip": 0.06272034, + "balance_loss_mlp": 0.01254097, + "epoch": 0.6750638809559597, + "flos": 17025342422400.0, + "grad_norm": 2.4759856374415077, + "language_loss": 0.7107985, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.78750503, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09851074, + "step": 11228, + "time_per_iteration": 3.974013566970825 + }, + { + "auxiliary_loss_clip": 0.0630679, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 0.06250891, + "balance_loss_mlp": 0.01260476, + "epoch": 0.6751240042086277, + "flos": 70972774531200.0, + "grad_norm": 0.7443387383646383, + "language_loss": 0.52992356, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.60561574, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01947021, + "step": 11229, + "time_per_iteration": 3.1949167251586914 + }, + { + "auxiliary_loss_clip": 0.06405008, + "auxiliary_loss_mlp": 0.01265887, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01256249, + "epoch": 0.6751841274612956, + "flos": 22681863999360.0, + "grad_norm": 2.9468842422151673, + "language_loss": 0.80432749, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.88103646, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09625244, + "step": 11230, + "time_per_iteration": 2.5213663578033447 + }, + { + "auxiliary_loss_clip": 0.06402741, + "auxiliary_loss_mlp": 0.01262658, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.0125333, + "epoch": 0.6752442507139637, + "flos": 21295112526720.0, + "grad_norm": 1.434197979050497, + "language_loss": 0.65974534, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.73639941, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09326172, + "step": 11231, + "time_per_iteration": 2.512449026107788 + }, + { + "auxiliary_loss_clip": 0.06417508, + "auxiliary_loss_mlp": 0.01271667, + "balance_loss_clip": 0.06276156, + "balance_loss_mlp": 0.01260396, + "epoch": 0.6753043739666316, + "flos": 28264480675200.0, + "grad_norm": 1.8511033060394846, + "language_loss": 0.66944438, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.7463361, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11260986, + "step": 11232, + "time_per_iteration": 2.5738155841827393 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01266971, + "balance_loss_clip": 0.0627114, + "balance_loss_mlp": 0.01257226, + "epoch": 0.6753644972192996, + "flos": 21366627586560.0, + "grad_norm": 1.674017645319507, + "language_loss": 0.72178799, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.79850119, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09735107, + "step": 11233, + "time_per_iteration": 2.5327250957489014 + }, + { + "auxiliary_loss_clip": 0.0640566, + "auxiliary_loss_mlp": 0.01263187, + "balance_loss_clip": 0.06269811, + "balance_loss_mlp": 0.01253579, + "epoch": 0.6754246204719675, + "flos": 26549224070400.0, + "grad_norm": 1.499022886883579, + "language_loss": 0.7716381, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.84832656, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09606934, + "step": 11234, + "time_per_iteration": 2.607923746109009 + }, + { + "auxiliary_loss_clip": 0.0640721, + "auxiliary_loss_mlp": 0.01266453, + "balance_loss_clip": 0.06272233, + "balance_loss_mlp": 0.01255224, + "epoch": 0.6754847437246355, + "flos": 25563750600960.0, + "grad_norm": 1.4543561341667586, + "language_loss": 0.75457549, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.83131212, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11230469, + "step": 11235, + "time_per_iteration": 2.614145278930664 + }, + { + "auxiliary_loss_clip": 0.06304054, + "auxiliary_loss_mlp": 0.01255487, + "balance_loss_clip": 0.06248432, + "balance_loss_mlp": 0.01253944, + "epoch": 0.6755448669773034, + "flos": 59530216492800.0, + "grad_norm": 0.7576799363115112, + "language_loss": 0.51220065, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.58779609, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.55712891, + "router_z_loss_mlp": 0.01538849, + "step": 11236, + "time_per_iteration": 3.079153060913086 + }, + { + "auxiliary_loss_clip": 0.06406215, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253931, + "epoch": 0.6756049902299714, + "flos": 23301209053440.0, + "grad_norm": 1.9064890293106858, + "language_loss": 0.75501907, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.83173215, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11181641, + "step": 11237, + "time_per_iteration": 2.591219186782837 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06271937, + "balance_loss_mlp": 0.01254441, + "epoch": 0.6756651134826394, + "flos": 31583256065280.0, + "grad_norm": 1.6435273747755843, + "language_loss": 0.77603805, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.85276806, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10876465, + "step": 11238, + "time_per_iteration": 4.004278659820557 + }, + { + "auxiliary_loss_clip": 0.06411187, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01255834, + "epoch": 0.6757252367353074, + "flos": 27279761892480.0, + "grad_norm": 1.8597789781280543, + "language_loss": 0.66815203, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.74492747, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.10510254, + "step": 11239, + "time_per_iteration": 2.5872182846069336 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.01262458, + "balance_loss_clip": 0.06269912, + "balance_loss_mlp": 0.01252927, + "epoch": 0.6757853599879754, + "flos": 16835548924800.0, + "grad_norm": 2.5961823999819074, + "language_loss": 0.8317802, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.90842378, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09533691, + "step": 11240, + "time_per_iteration": 2.4803500175476074 + }, + { + "auxiliary_loss_clip": 0.0641778, + "auxiliary_loss_mlp": 0.01269049, + "balance_loss_clip": 0.06275319, + "balance_loss_mlp": 0.0125677, + "epoch": 0.6758454832406433, + "flos": 23226465611520.0, + "grad_norm": 1.9848396876019143, + "language_loss": 0.7422142, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.8190825, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.12268066, + "step": 11241, + "time_per_iteration": 2.526111602783203 + }, + { + "auxiliary_loss_clip": 0.06407639, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01253833, + "epoch": 0.6759056064933113, + "flos": 16295098089600.0, + "grad_norm": 2.0527933437331343, + "language_loss": 0.80294073, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.87965673, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11242, + "time_per_iteration": 3.933396339416504 + }, + { + "auxiliary_loss_clip": 0.06405968, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06268989, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6759657297459792, + "flos": 25929543600000.0, + "grad_norm": 1.6744190932532899, + "language_loss": 0.72630656, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.80299854, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09490967, + "step": 11243, + "time_per_iteration": 2.514404535293579 + }, + { + "auxiliary_loss_clip": 0.06407295, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06272102, + "balance_loss_mlp": 0.01255306, + "epoch": 0.6760258529986473, + "flos": 23007140749440.0, + "grad_norm": 1.5647847453275578, + "language_loss": 0.72900802, + "learning_rate": 1.003487287162221e-06, + "loss": 0.80573308, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09906006, + "step": 11244, + "time_per_iteration": 2.5581138134002686 + }, + { + "auxiliary_loss_clip": 0.06405992, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06269385, + "balance_loss_mlp": 0.01255887, + "epoch": 0.6760859762513152, + "flos": 20965601145600.0, + "grad_norm": 4.977975302469332, + "language_loss": 0.85911322, + "learning_rate": 1.003149631190393e-06, + "loss": 0.93583632, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10437012, + "step": 11245, + "time_per_iteration": 2.485227584838867 + }, + { + "auxiliary_loss_clip": 0.06410875, + "auxiliary_loss_mlp": 0.01265401, + "balance_loss_clip": 0.06269195, + "balance_loss_mlp": 0.01254743, + "epoch": 0.6761460995039832, + "flos": 23629672258560.0, + "grad_norm": 1.7215460318487352, + "language_loss": 0.74000847, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.81677115, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10656738, + "step": 11246, + "time_per_iteration": 3.958766460418701 + }, + { + "auxiliary_loss_clip": 0.06405219, + "auxiliary_loss_mlp": 0.01262106, + "balance_loss_clip": 0.0626854, + "balance_loss_mlp": 0.01251896, + "epoch": 0.6762062227566511, + "flos": 20776101137280.0, + "grad_norm": 1.7168055925724897, + "language_loss": 0.87943971, + "learning_rate": 1.002474432661539e-06, + "loss": 0.95611298, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10205078, + "step": 11247, + "time_per_iteration": 2.586812973022461 + }, + { + "auxiliary_loss_clip": 0.06307312, + "auxiliary_loss_mlp": 0.01250807, + "balance_loss_clip": 0.06251501, + "balance_loss_mlp": 0.01249509, + "epoch": 0.6762663460093191, + "flos": 52836915219840.0, + "grad_norm": 0.8036403587512043, + "language_loss": 0.53957772, + "learning_rate": 1.002136890130115e-06, + "loss": 0.61515892, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01298523, + "step": 11248, + "time_per_iteration": 3.125509262084961 + }, + { + "auxiliary_loss_clip": 0.06402693, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06271251, + "balance_loss_mlp": 0.0125671, + "epoch": 0.676326469261987, + "flos": 23703115962240.0, + "grad_norm": 1.8151620805455404, + "language_loss": 0.73989308, + "learning_rate": 1.001799385437761e-06, + "loss": 0.81658345, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09625244, + "step": 11249, + "time_per_iteration": 2.6366310119628906 + }, + { + "auxiliary_loss_clip": 0.06411433, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270382, + "balance_loss_mlp": 0.01253372, + "epoch": 0.676386592514655, + "flos": 14068880087040.0, + "grad_norm": 2.152895610647936, + "language_loss": 0.74230921, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.81907284, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11566162, + "step": 11250, + "time_per_iteration": 2.458453416824341 + }, + { + "auxiliary_loss_clip": 0.06409556, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01256082, + "epoch": 0.676446715767323, + "flos": 20418441984000.0, + "grad_norm": 1.8697083640776453, + "language_loss": 0.74947959, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.82623816, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10217285, + "step": 11251, + "time_per_iteration": 2.568087100982666 + }, + { + "auxiliary_loss_clip": 0.06411379, + "auxiliary_loss_mlp": 0.01266225, + "balance_loss_clip": 0.06275384, + "balance_loss_mlp": 0.012553, + "epoch": 0.676506839019991, + "flos": 21294651329280.0, + "grad_norm": 1.5310605534253319, + "language_loss": 0.69863832, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.77541435, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.109375, + "step": 11252, + "time_per_iteration": 2.541651725769043 + }, + { + "auxiliary_loss_clip": 0.06405863, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01254406, + "epoch": 0.676566962272659, + "flos": 29939849936640.0, + "grad_norm": 2.258609602750375, + "language_loss": 0.67108035, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.74778473, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10174561, + "step": 11253, + "time_per_iteration": 2.6143195629119873 + }, + { + "auxiliary_loss_clip": 0.06413913, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06273795, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6766270855253269, + "flos": 17936994844800.0, + "grad_norm": 1.5309002898419535, + "language_loss": 0.77274752, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.84955955, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11560059, + "step": 11254, + "time_per_iteration": 2.4911346435546875 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01255439, + "epoch": 0.6766872087779949, + "flos": 23110283525760.0, + "grad_norm": 2.0449563599790874, + "language_loss": 0.71835911, + "learning_rate": 9.997751526206835e-07, + "loss": 0.79514015, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11340332, + "step": 11255, + "time_per_iteration": 2.5604913234710693 + }, + { + "auxiliary_loss_clip": 0.0641115, + "auxiliary_loss_mlp": 0.0126876, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01257376, + "epoch": 0.6767473320306628, + "flos": 26220257740800.0, + "grad_norm": 1.9457423412026578, + "language_loss": 0.75806832, + "learning_rate": 9.994379131600828e-07, + "loss": 0.83486742, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11383057, + "step": 11256, + "time_per_iteration": 2.5321764945983887 + }, + { + "auxiliary_loss_clip": 0.06411014, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06275011, + "balance_loss_mlp": 0.01255192, + "epoch": 0.6768074552833309, + "flos": 18374554465920.0, + "grad_norm": 2.012218384442974, + "language_loss": 0.65943599, + "learning_rate": 9.991007116408965e-07, + "loss": 0.73620474, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10662842, + "step": 11257, + "time_per_iteration": 2.502154588699341 + }, + { + "auxiliary_loss_clip": 0.06409346, + "auxiliary_loss_mlp": 0.01265352, + "balance_loss_clip": 0.0627479, + "balance_loss_mlp": 0.01255159, + "epoch": 0.6768675785359988, + "flos": 23046692676480.0, + "grad_norm": 1.399276257571999, + "language_loss": 0.75707698, + "learning_rate": 9.987635480759109e-07, + "loss": 0.83382392, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10186768, + "step": 11258, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01264608, + "balance_loss_clip": 0.06270992, + "balance_loss_mlp": 0.01254696, + "epoch": 0.6769277017886668, + "flos": 33044876760960.0, + "grad_norm": 1.5373580485699971, + "language_loss": 0.66955268, + "learning_rate": 9.984264224779127e-07, + "loss": 0.74622083, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09912109, + "step": 11259, + "time_per_iteration": 2.59914231300354 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01264994, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01254218, + "epoch": 0.6769878250413347, + "flos": 20854408377600.0, + "grad_norm": 2.0822099065238397, + "language_loss": 0.85664153, + "learning_rate": 9.980893348596839e-07, + "loss": 0.93340379, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10778809, + "step": 11260, + "time_per_iteration": 2.470489501953125 + }, + { + "auxiliary_loss_clip": 0.06415793, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06273471, + "balance_loss_mlp": 0.01253453, + "epoch": 0.6770479482940027, + "flos": 15601345009920.0, + "grad_norm": 2.2691636202149206, + "language_loss": 0.77703118, + "learning_rate": 9.977522852340081e-07, + "loss": 0.85384524, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.12164307, + "step": 11261, + "time_per_iteration": 2.5071561336517334 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256013, + "epoch": 0.6771080715466706, + "flos": 18626345585280.0, + "grad_norm": 1.5719770677718063, + "language_loss": 0.87847519, + "learning_rate": 9.97415273613666e-07, + "loss": 0.95525038, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1126709, + "step": 11262, + "time_per_iteration": 2.4645345211029053 + }, + { + "auxiliary_loss_clip": 0.06413369, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06273858, + "balance_loss_mlp": 0.01254371, + "epoch": 0.6771681947993387, + "flos": 12500427035520.0, + "grad_norm": 1.7525589115394145, + "language_loss": 0.74310911, + "learning_rate": 9.97078300011439e-07, + "loss": 0.81989402, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10754395, + "step": 11263, + "time_per_iteration": 2.6041438579559326 + }, + { + "auxiliary_loss_clip": 0.06415032, + "auxiliary_loss_mlp": 0.01264304, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01252406, + "epoch": 0.6772283180520066, + "flos": 22243549691520.0, + "grad_norm": 2.1938876589125544, + "language_loss": 0.68432045, + "learning_rate": 9.967413644401016e-07, + "loss": 0.76111376, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11901855, + "step": 11264, + "time_per_iteration": 2.5002152919769287 + }, + { + "auxiliary_loss_clip": 0.0641073, + "auxiliary_loss_mlp": 0.01264807, + "balance_loss_clip": 0.062745, + "balance_loss_mlp": 0.01254006, + "epoch": 0.6772884413046746, + "flos": 16148588025600.0, + "grad_norm": 1.8587455254700258, + "language_loss": 0.73335183, + "learning_rate": 9.964044669124324e-07, + "loss": 0.81010723, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10797119, + "step": 11265, + "time_per_iteration": 2.469163179397583 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01269883, + "balance_loss_clip": 0.06273008, + "balance_loss_mlp": 0.01258969, + "epoch": 0.6773485645573426, + "flos": 19141835103360.0, + "grad_norm": 1.6254501454395083, + "language_loss": 0.61922127, + "learning_rate": 9.96067607441207e-07, + "loss": 0.69599104, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10913086, + "step": 11266, + "time_per_iteration": 2.495842933654785 + }, + { + "auxiliary_loss_clip": 0.06409343, + "auxiliary_loss_mlp": 0.01269206, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01258829, + "epoch": 0.6774086878100105, + "flos": 14142114155520.0, + "grad_norm": 1.8179552610473837, + "language_loss": 0.70953995, + "learning_rate": 9.957307860391976e-07, + "loss": 0.78632545, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.1038208, + "step": 11267, + "time_per_iteration": 2.517019033432007 + }, + { + "auxiliary_loss_clip": 0.06410597, + "auxiliary_loss_mlp": 0.01264315, + "balance_loss_clip": 0.06273153, + "balance_loss_mlp": 0.01254009, + "epoch": 0.6774688110626785, + "flos": 22203075369600.0, + "grad_norm": 4.7399438404850525, + "language_loss": 0.71134216, + "learning_rate": 9.953940027191785e-07, + "loss": 0.7880913, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10314941, + "step": 11268, + "time_per_iteration": 3.937225103378296 + }, + { + "auxiliary_loss_clip": 0.06412301, + "auxiliary_loss_mlp": 0.01268549, + "balance_loss_clip": 0.06274435, + "balance_loss_mlp": 0.0125726, + "epoch": 0.6775289343153464, + "flos": 23046734603520.0, + "grad_norm": 1.4295252958840357, + "language_loss": 0.76893616, + "learning_rate": 9.950572574939194e-07, + "loss": 0.84574473, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11291504, + "step": 11269, + "time_per_iteration": 2.5114824771881104 + }, + { + "auxiliary_loss_clip": 0.06414156, + "auxiliary_loss_mlp": 0.01271853, + "balance_loss_clip": 0.06274021, + "balance_loss_mlp": 0.01259879, + "epoch": 0.6775890575680145, + "flos": 18298930556160.0, + "grad_norm": 1.7033288836702745, + "language_loss": 0.74101746, + "learning_rate": 9.94720550376189e-07, + "loss": 0.81787759, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11968994, + "step": 11270, + "time_per_iteration": 2.4997193813323975 + }, + { + "auxiliary_loss_clip": 0.06411543, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 0.06274433, + "balance_loss_mlp": 0.01254504, + "epoch": 0.6776491808206824, + "flos": 25343251781760.0, + "grad_norm": 1.5419173604084193, + "language_loss": 0.72974074, + "learning_rate": 9.94383881378756e-07, + "loss": 0.80650789, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10668945, + "step": 11271, + "time_per_iteration": 2.5310120582580566 + }, + { + "auxiliary_loss_clip": 0.06411068, + "auxiliary_loss_mlp": 0.01265404, + "balance_loss_clip": 0.06274058, + "balance_loss_mlp": 0.01254902, + "epoch": 0.6777093040733504, + "flos": 26034908509440.0, + "grad_norm": 1.6287619781350626, + "language_loss": 0.6787045, + "learning_rate": 9.94047250514387e-07, + "loss": 0.75546926, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10498047, + "step": 11272, + "time_per_iteration": 2.556326389312744 + }, + { + "auxiliary_loss_clip": 0.06416756, + "auxiliary_loss_mlp": 0.01268859, + "balance_loss_clip": 0.06274517, + "balance_loss_mlp": 0.01256723, + "epoch": 0.6777694273260183, + "flos": 18009306518400.0, + "grad_norm": 2.0957855047238865, + "language_loss": 0.73988581, + "learning_rate": 9.937106577958481e-07, + "loss": 0.81674194, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.121521, + "step": 11273, + "time_per_iteration": 2.4888038635253906 + }, + { + "auxiliary_loss_clip": 0.0640964, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06273794, + "balance_loss_mlp": 0.01256069, + "epoch": 0.6778295505786863, + "flos": 23447886825600.0, + "grad_norm": 1.597740332843532, + "language_loss": 0.70512903, + "learning_rate": 9.933741032359015e-07, + "loss": 0.78189635, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11022949, + "step": 11274, + "time_per_iteration": 2.5328569412231445 + }, + { + "auxiliary_loss_clip": 0.06408958, + "auxiliary_loss_mlp": 0.01270481, + "balance_loss_clip": 0.06268886, + "balance_loss_mlp": 0.01259413, + "epoch": 0.6778896738313542, + "flos": 19104337601280.0, + "grad_norm": 1.549823334564571, + "language_loss": 0.65894532, + "learning_rate": 9.930375868473093e-07, + "loss": 0.73573971, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1105957, + "step": 11275, + "time_per_iteration": 2.511591672897339 + }, + { + "auxiliary_loss_clip": 0.06410493, + "auxiliary_loss_mlp": 0.01266749, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.01256801, + "epoch": 0.6779497970840223, + "flos": 26111077470720.0, + "grad_norm": 1.6541358125051857, + "language_loss": 0.72680271, + "learning_rate": 9.927011086428335e-07, + "loss": 0.80357516, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.0994873, + "step": 11276, + "time_per_iteration": 2.5891473293304443 + }, + { + "auxiliary_loss_clip": 0.06409149, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273319, + "balance_loss_mlp": 0.01255245, + "epoch": 0.6780099203366902, + "flos": 19725359736960.0, + "grad_norm": 1.5650058182326292, + "language_loss": 0.76883596, + "learning_rate": 9.923646686352317e-07, + "loss": 0.84558642, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10650635, + "step": 11277, + "time_per_iteration": 3.915508985519409 + }, + { + "auxiliary_loss_clip": 0.06416161, + "auxiliary_loss_mlp": 0.01266536, + "balance_loss_clip": 0.06275125, + "balance_loss_mlp": 0.01254633, + "epoch": 0.6780700435893582, + "flos": 18218946234240.0, + "grad_norm": 2.711703251949157, + "language_loss": 0.83725727, + "learning_rate": 9.920282668372627e-07, + "loss": 0.91408426, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11907959, + "step": 11278, + "time_per_iteration": 2.4728851318359375 + }, + { + "auxiliary_loss_clip": 0.06408397, + "auxiliary_loss_mlp": 0.01270203, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01259862, + "epoch": 0.6781301668420262, + "flos": 25383600322560.0, + "grad_norm": 1.4808013348463376, + "language_loss": 0.70247126, + "learning_rate": 9.916919032616844e-07, + "loss": 0.77925724, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10339355, + "step": 11279, + "time_per_iteration": 2.5876686573028564 + }, + { + "auxiliary_loss_clip": 0.06411046, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06272636, + "balance_loss_mlp": 0.01254027, + "epoch": 0.6781902900946941, + "flos": 24026589849600.0, + "grad_norm": 1.7835400791989957, + "language_loss": 0.74185818, + "learning_rate": 9.913555779212485e-07, + "loss": 0.81862831, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1194458, + "step": 11280, + "time_per_iteration": 2.558945655822754 + }, + { + "auxiliary_loss_clip": 0.06412832, + "auxiliary_loss_mlp": 0.01263795, + "balance_loss_clip": 0.06270506, + "balance_loss_mlp": 0.01251844, + "epoch": 0.6782504133473621, + "flos": 19652964209280.0, + "grad_norm": 1.818075538813212, + "language_loss": 0.70597506, + "learning_rate": 9.910192908287104e-07, + "loss": 0.78274131, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11950684, + "step": 11281, + "time_per_iteration": 2.5192151069641113 + }, + { + "auxiliary_loss_clip": 0.06408101, + "auxiliary_loss_mlp": 0.01268091, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01257821, + "epoch": 0.67831053660003, + "flos": 24939080812800.0, + "grad_norm": 1.5294707212527767, + "language_loss": 0.63880533, + "learning_rate": 9.906830419968217e-07, + "loss": 0.71556723, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1026001, + "step": 11282, + "time_per_iteration": 4.0389556884765625 + }, + { + "auxiliary_loss_clip": 0.06416775, + "auxiliary_loss_mlp": 0.01269152, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.01257434, + "epoch": 0.6783706598526981, + "flos": 31215785984640.0, + "grad_norm": 1.5661846366283017, + "language_loss": 0.74472761, + "learning_rate": 9.90346831438334e-07, + "loss": 0.82158691, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.11712646, + "step": 11283, + "time_per_iteration": 2.5889575481414795 + }, + { + "auxiliary_loss_clip": 0.06409109, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06271229, + "balance_loss_mlp": 0.01255179, + "epoch": 0.678430783105366, + "flos": 35449526033280.0, + "grad_norm": 1.6303319808688523, + "language_loss": 0.57121617, + "learning_rate": 9.900106591659948e-07, + "loss": 0.64796078, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10180664, + "step": 11284, + "time_per_iteration": 2.622241258621216 + }, + { + "auxiliary_loss_clip": 0.0640896, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01253719, + "epoch": 0.678490906358034, + "flos": 14434044180480.0, + "grad_norm": 1.7585312003136033, + "language_loss": 0.75540352, + "learning_rate": 9.896745251925535e-07, + "loss": 0.83213127, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10095215, + "step": 11285, + "time_per_iteration": 3.914513111114502 + }, + { + "auxiliary_loss_clip": 0.06408092, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06274541, + "balance_loss_mlp": 0.01254355, + "epoch": 0.6785510296107019, + "flos": 24317262063360.0, + "grad_norm": 1.6087593577428982, + "language_loss": 0.66518104, + "learning_rate": 9.893384295307557e-07, + "loss": 0.74190903, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10351562, + "step": 11286, + "time_per_iteration": 2.5443532466888428 + }, + { + "auxiliary_loss_clip": 0.06411726, + "auxiliary_loss_mlp": 0.01266212, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254553, + "epoch": 0.6786111528633699, + "flos": 26984142288000.0, + "grad_norm": 2.2563712255718453, + "language_loss": 0.52888298, + "learning_rate": 9.890023721933447e-07, + "loss": 0.60566235, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.11663818, + "step": 11287, + "time_per_iteration": 2.5215566158294678 + }, + { + "auxiliary_loss_clip": 0.06408818, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01255265, + "epoch": 0.6786712761160378, + "flos": 24324641222400.0, + "grad_norm": 1.4827043233914352, + "language_loss": 0.7744714, + "learning_rate": 9.886663531930655e-07, + "loss": 0.85121405, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10180664, + "step": 11288, + "time_per_iteration": 2.5451719760894775 + }, + { + "auxiliary_loss_clip": 0.06414543, + "auxiliary_loss_mlp": 0.01270807, + "balance_loss_clip": 0.06275427, + "balance_loss_mlp": 0.0125993, + "epoch": 0.6787313993687059, + "flos": 22937176990080.0, + "grad_norm": 1.9021636809125866, + "language_loss": 0.73458755, + "learning_rate": 9.883303725426593e-07, + "loss": 0.81144106, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10882568, + "step": 11289, + "time_per_iteration": 2.524062395095825 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01268655, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.01257795, + "epoch": 0.6787915226213738, + "flos": 26875423215360.0, + "grad_norm": 1.3961935649800772, + "language_loss": 0.80240023, + "learning_rate": 9.879944302548682e-07, + "loss": 0.87918484, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10852051, + "step": 11290, + "time_per_iteration": 2.563781499862671 + }, + { + "auxiliary_loss_clip": 0.06406706, + "auxiliary_loss_mlp": 0.01270194, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01260395, + "epoch": 0.6788516458740418, + "flos": 20014648358400.0, + "grad_norm": 1.3943952846011585, + "language_loss": 0.75320244, + "learning_rate": 9.87658526342428e-07, + "loss": 0.82997143, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09802246, + "step": 11291, + "time_per_iteration": 2.4833710193634033 + }, + { + "auxiliary_loss_clip": 0.06409583, + "auxiliary_loss_mlp": 0.01265199, + "balance_loss_clip": 0.06270351, + "balance_loss_mlp": 0.01254709, + "epoch": 0.6789117691267098, + "flos": 28734045356160.0, + "grad_norm": 1.6032413484745063, + "language_loss": 0.75235522, + "learning_rate": 9.873226608180785e-07, + "loss": 0.82910305, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10491943, + "step": 11292, + "time_per_iteration": 2.5987610816955566 + }, + { + "auxiliary_loss_clip": 0.06407046, + "auxiliary_loss_mlp": 0.01271571, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01261235, + "epoch": 0.6789718923793777, + "flos": 23410053907200.0, + "grad_norm": 1.8128590339737811, + "language_loss": 0.84362906, + "learning_rate": 9.869868336945556e-07, + "loss": 0.92041528, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10345459, + "step": 11293, + "time_per_iteration": 2.6490092277526855 + }, + { + "auxiliary_loss_clip": 0.06418362, + "auxiliary_loss_mlp": 0.01266521, + "balance_loss_clip": 0.06273804, + "balance_loss_mlp": 0.01255661, + "epoch": 0.6790320156320457, + "flos": 20455100945280.0, + "grad_norm": 2.3830710729233937, + "language_loss": 0.79575551, + "learning_rate": 9.866510449845929e-07, + "loss": 0.87260431, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.10852051, + "step": 11294, + "time_per_iteration": 2.540187120437622 + }, + { + "auxiliary_loss_clip": 0.06410551, + "auxiliary_loss_mlp": 0.0126649, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01256507, + "epoch": 0.6790921388847136, + "flos": 24173519184000.0, + "grad_norm": 1.663290513792591, + "language_loss": 0.79323423, + "learning_rate": 9.86315294700924e-07, + "loss": 0.87000465, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09985352, + "step": 11295, + "time_per_iteration": 2.539522171020508 + }, + { + "auxiliary_loss_clip": 0.06403016, + "auxiliary_loss_mlp": 0.01270622, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01261312, + "epoch": 0.6791522621373817, + "flos": 21914541434880.0, + "grad_norm": 1.9398184157871654, + "language_loss": 0.71742594, + "learning_rate": 9.859795828562823e-07, + "loss": 0.79416239, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09313965, + "step": 11296, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.06406362, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01256212, + "epoch": 0.6792123853900496, + "flos": 24833380487040.0, + "grad_norm": 1.7008493408846614, + "language_loss": 0.70970011, + "learning_rate": 9.856439094633949e-07, + "loss": 0.78642553, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09979248, + "step": 11297, + "time_per_iteration": 2.5342774391174316 + }, + { + "auxiliary_loss_clip": 0.06413988, + "auxiliary_loss_mlp": 0.01268754, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01257691, + "epoch": 0.6792725086427176, + "flos": 17571998459520.0, + "grad_norm": 2.072165205112126, + "language_loss": 0.66610634, + "learning_rate": 9.853082745349918e-07, + "loss": 0.74293375, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11071777, + "step": 11298, + "time_per_iteration": 2.5330231189727783 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01265536, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01255767, + "epoch": 0.6793326318953855, + "flos": 26948908846080.0, + "grad_norm": 1.6501656577542423, + "language_loss": 0.71810848, + "learning_rate": 9.84972678083801e-07, + "loss": 0.79485255, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09771729, + "step": 11299, + "time_per_iteration": 2.547666072845459 + }, + { + "auxiliary_loss_clip": 0.06407908, + "auxiliary_loss_mlp": 0.01269253, + "balance_loss_clip": 0.06269622, + "balance_loss_mlp": 0.01258196, + "epoch": 0.6793927551480535, + "flos": 24325479763200.0, + "grad_norm": 1.2577197776351332, + "language_loss": 0.77542967, + "learning_rate": 9.846371201225488e-07, + "loss": 0.85220122, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1105957, + "step": 11300, + "time_per_iteration": 2.568537473678589 + }, + { + "auxiliary_loss_clip": 0.06409447, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01256847, + "epoch": 0.6794528784007214, + "flos": 11441300227200.0, + "grad_norm": 1.9915071500414414, + "language_loss": 0.63348699, + "learning_rate": 9.843016006639577e-07, + "loss": 0.71025515, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10534668, + "step": 11301, + "time_per_iteration": 2.4696924686431885 + }, + { + "auxiliary_loss_clip": 0.06409229, + "auxiliary_loss_mlp": 0.01266875, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01256772, + "epoch": 0.6795130016533895, + "flos": 25236922550400.0, + "grad_norm": 1.7173390721705748, + "language_loss": 0.82948458, + "learning_rate": 9.839661197207525e-07, + "loss": 0.90624553, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10107422, + "step": 11302, + "time_per_iteration": 2.598444938659668 + }, + { + "auxiliary_loss_clip": 0.0641208, + "auxiliary_loss_mlp": 0.01264081, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01254121, + "epoch": 0.6795731249060574, + "flos": 18302326646400.0, + "grad_norm": 1.7779256028698032, + "language_loss": 0.69851995, + "learning_rate": 9.83630677305654e-07, + "loss": 0.77528167, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.09954834, + "step": 11303, + "time_per_iteration": 2.4852330684661865 + }, + { + "auxiliary_loss_clip": 0.06413473, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.0125336, + "epoch": 0.6796332481587254, + "flos": 20306159112960.0, + "grad_norm": 1.8204218049780263, + "language_loss": 0.70597726, + "learning_rate": 9.832952734313813e-07, + "loss": 0.7827546, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10900879, + "step": 11304, + "time_per_iteration": 2.5139074325561523 + }, + { + "auxiliary_loss_clip": 0.0641301, + "auxiliary_loss_mlp": 0.01268726, + "balance_loss_clip": 0.0627501, + "balance_loss_mlp": 0.01257794, + "epoch": 0.6796933714113934, + "flos": 23593642202880.0, + "grad_norm": 2.4376362863510046, + "language_loss": 0.72319949, + "learning_rate": 9.829599081106536e-07, + "loss": 0.80001682, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.109375, + "step": 11305, + "time_per_iteration": 2.522174119949341 + }, + { + "auxiliary_loss_clip": 0.06407507, + "auxiliary_loss_mlp": 0.01264269, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01252986, + "epoch": 0.6797534946640613, + "flos": 27126291939840.0, + "grad_norm": 2.8826024363137535, + "language_loss": 0.66289663, + "learning_rate": 9.826245813561882e-07, + "loss": 0.73961437, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11297607, + "step": 11306, + "time_per_iteration": 2.5523674488067627 + }, + { + "auxiliary_loss_clip": 0.06408583, + "auxiliary_loss_mlp": 0.01265584, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01255547, + "epoch": 0.6798136179167293, + "flos": 22133992078080.0, + "grad_norm": 1.614397517334369, + "language_loss": 0.80464542, + "learning_rate": 9.822892931807021e-07, + "loss": 0.88138705, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 11307, + "time_per_iteration": 3.9510881900787354 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6798737411693972, + "flos": 17493565438080.0, + "grad_norm": 1.503954365849396, + "language_loss": 0.89141631, + "learning_rate": 9.819540435969066e-07, + "loss": 0.96809489, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10638428, + "step": 11308, + "time_per_iteration": 2.454899549484253 + }, + { + "auxiliary_loss_clip": 0.06406927, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06268145, + "balance_loss_mlp": 0.01253792, + "epoch": 0.6799338644220653, + "flos": 22898715166080.0, + "grad_norm": 1.9892982746856287, + "language_loss": 0.71669519, + "learning_rate": 9.816188326175154e-07, + "loss": 0.79341042, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1081543, + "step": 11309, + "time_per_iteration": 2.537949562072754 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01269522, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01259312, + "epoch": 0.6799939876747332, + "flos": 23186284778880.0, + "grad_norm": 2.168983976078807, + "language_loss": 0.84444106, + "learning_rate": 9.812836602552411e-07, + "loss": 0.92120677, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10217285, + "step": 11310, + "time_per_iteration": 2.5093727111816406 + }, + { + "auxiliary_loss_clip": 0.06401814, + "auxiliary_loss_mlp": 0.01262918, + "balance_loss_clip": 0.06269856, + "balance_loss_mlp": 0.0125331, + "epoch": 0.6800541109274012, + "flos": 19505951020800.0, + "grad_norm": 1.936116503903549, + "language_loss": 0.83367699, + "learning_rate": 9.80948526522792e-07, + "loss": 0.91032434, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09613037, + "step": 11311, + "time_per_iteration": 2.5046095848083496 + }, + { + "auxiliary_loss_clip": 0.064105, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 0.06267536, + "balance_loss_mlp": 0.01257491, + "epoch": 0.6801142341800691, + "flos": 22284946408320.0, + "grad_norm": 1.5408548920294685, + "language_loss": 0.7658841, + "learning_rate": 9.806134314328767e-07, + "loss": 0.84268516, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.12133789, + "step": 11312, + "time_per_iteration": 2.5174195766448975 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01252687, + "balance_loss_clip": 0.06255079, + "balance_loss_mlp": 0.01251411, + "epoch": 0.6801743574327371, + "flos": 68734439614080.0, + "grad_norm": 0.6438614608961274, + "language_loss": 0.57270527, + "learning_rate": 9.802783749982038e-07, + "loss": 0.64834106, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01276398, + "step": 11313, + "time_per_iteration": 3.2520179748535156 + }, + { + "auxiliary_loss_clip": 0.06408104, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06268254, + "balance_loss_mlp": 0.0125483, + "epoch": 0.680234480685405, + "flos": 29468146976640.0, + "grad_norm": 1.6190653949052565, + "language_loss": 0.69341791, + "learning_rate": 9.799433572314754e-07, + "loss": 0.77015042, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10327148, + "step": 11314, + "time_per_iteration": 2.5535359382629395 + }, + { + "auxiliary_loss_clip": 0.06404889, + "auxiliary_loss_mlp": 0.01267434, + "balance_loss_clip": 0.06268796, + "balance_loss_mlp": 0.01257731, + "epoch": 0.6802946039380731, + "flos": 15921045463680.0, + "grad_norm": 1.9728888269672866, + "language_loss": 0.81508797, + "learning_rate": 9.796083781453972e-07, + "loss": 0.89181113, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.0970459, + "step": 11315, + "time_per_iteration": 2.5169835090637207 + }, + { + "auxiliary_loss_clip": 0.06405143, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06267972, + "balance_loss_mlp": 0.01253723, + "epoch": 0.680354727190741, + "flos": 22025314932480.0, + "grad_norm": 1.6675934827220065, + "language_loss": 0.70277983, + "learning_rate": 9.792734377526718e-07, + "loss": 0.77947348, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.1048584, + "step": 11316, + "time_per_iteration": 2.4984679222106934 + }, + { + "auxiliary_loss_clip": 0.06405444, + "auxiliary_loss_mlp": 0.01268676, + "balance_loss_clip": 0.06269848, + "balance_loss_mlp": 0.01258478, + "epoch": 0.680414850443409, + "flos": 18447285409920.0, + "grad_norm": 2.1628292849287267, + "language_loss": 0.67277592, + "learning_rate": 9.789385360660003e-07, + "loss": 0.74951708, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11317, + "time_per_iteration": 3.912996292114258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01266936, + "balance_loss_clip": 0.06273043, + "balance_loss_mlp": 0.01256666, + "epoch": 0.680474973696077, + "flos": 26365677701760.0, + "grad_norm": 1.4339432029892007, + "language_loss": 0.74834979, + "learning_rate": 9.78603673098082e-07, + "loss": 0.82514405, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10266113, + "step": 11318, + "time_per_iteration": 2.613416910171509 + }, + { + "auxiliary_loss_clip": 0.06405453, + "auxiliary_loss_mlp": 0.01261508, + "balance_loss_clip": 0.06270547, + "balance_loss_mlp": 0.01252502, + "epoch": 0.6805350969487449, + "flos": 18339069461760.0, + "grad_norm": 1.741381394136802, + "language_loss": 0.6821155, + "learning_rate": 9.782688488616143e-07, + "loss": 0.75878513, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09008789, + "step": 11319, + "time_per_iteration": 2.4735772609710693 + }, + { + "auxiliary_loss_clip": 0.06402999, + "auxiliary_loss_mlp": 0.01267278, + "balance_loss_clip": 0.06269106, + "balance_loss_mlp": 0.01257354, + "epoch": 0.6805952202014129, + "flos": 19943552568960.0, + "grad_norm": 1.589394100312008, + "language_loss": 0.77030569, + "learning_rate": 9.779340633692945e-07, + "loss": 0.84700847, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09924316, + "step": 11320, + "time_per_iteration": 2.5447402000427246 + }, + { + "auxiliary_loss_clip": 0.06406876, + "auxiliary_loss_mlp": 0.01264766, + "balance_loss_clip": 0.06270229, + "balance_loss_mlp": 0.01254341, + "epoch": 0.6806553434540809, + "flos": 25230633494400.0, + "grad_norm": 1.8063346564210203, + "language_loss": 0.75357598, + "learning_rate": 9.77599316633817e-07, + "loss": 0.8302924, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 11321, + "time_per_iteration": 3.959946393966675 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264729, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01254274, + "epoch": 0.6807154667067489, + "flos": 17791407175680.0, + "grad_norm": 2.0443838016403495, + "language_loss": 0.73213184, + "learning_rate": 9.772646086678758e-07, + "loss": 0.80887616, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10461426, + "step": 11322, + "time_per_iteration": 2.508143663406372 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270386, + "balance_loss_mlp": 0.01253517, + "epoch": 0.6807755899594168, + "flos": 22206387605760.0, + "grad_norm": 1.7755779600619086, + "language_loss": 0.78547817, + "learning_rate": 9.769299394841638e-07, + "loss": 0.86222905, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11352539, + "step": 11323, + "time_per_iteration": 2.5345656871795654 + }, + { + "auxiliary_loss_clip": 0.06315179, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_clip": 0.06259721, + "balance_loss_mlp": 0.0125015, + "epoch": 0.6808357132120848, + "flos": 68648878995840.0, + "grad_norm": 0.7384546914137473, + "language_loss": 0.57113785, + "learning_rate": 9.765953090953714e-07, + "loss": 0.64680356, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0124054, + "step": 11324, + "time_per_iteration": 2.9890177249908447 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01254301, + "epoch": 0.6808958364647527, + "flos": 23850380712960.0, + "grad_norm": 1.8768737712077719, + "language_loss": 0.68368208, + "learning_rate": 9.76260717514186e-07, + "loss": 0.76043886, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.11193848, + "step": 11325, + "time_per_iteration": 4.024105072021484 + }, + { + "auxiliary_loss_clip": 0.06410693, + "auxiliary_loss_mlp": 0.0126769, + "balance_loss_clip": 0.06269176, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6809559597174207, + "flos": 17717376493440.0, + "grad_norm": 2.1078464153023924, + "language_loss": 0.70419264, + "learning_rate": 9.759261647532974e-07, + "loss": 0.78097641, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10986328, + "step": 11326, + "time_per_iteration": 2.484449625015259 + }, + { + "auxiliary_loss_clip": 0.06407395, + "auxiliary_loss_mlp": 0.01261696, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01251551, + "epoch": 0.6810160829700886, + "flos": 22498443411840.0, + "grad_norm": 1.638017241748174, + "language_loss": 0.72914612, + "learning_rate": 9.75591650825392e-07, + "loss": 0.80583698, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10150146, + "step": 11327, + "time_per_iteration": 2.502293586730957 + }, + { + "auxiliary_loss_clip": 0.06405802, + "auxiliary_loss_mlp": 0.01266544, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01255839, + "epoch": 0.6810762062227567, + "flos": 16837854912000.0, + "grad_norm": 1.827919270381089, + "language_loss": 0.77294552, + "learning_rate": 9.752571757431526e-07, + "loss": 0.84966898, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10705566, + "step": 11328, + "time_per_iteration": 2.469923734664917 + }, + { + "auxiliary_loss_clip": 0.06412201, + "auxiliary_loss_mlp": 0.01264628, + "balance_loss_clip": 0.0627179, + "balance_loss_mlp": 0.01253941, + "epoch": 0.6811363294754246, + "flos": 12719751897600.0, + "grad_norm": 1.8250307958699987, + "language_loss": 0.64754045, + "learning_rate": 9.74922739519265e-07, + "loss": 0.72430873, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10681152, + "step": 11329, + "time_per_iteration": 2.5292539596557617 + }, + { + "auxiliary_loss_clip": 0.06409349, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01254182, + "epoch": 0.6811964527280926, + "flos": 17717669982720.0, + "grad_norm": 1.8641198647355242, + "language_loss": 0.79316872, + "learning_rate": 9.745883421664096e-07, + "loss": 0.86991036, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10638428, + "step": 11330, + "time_per_iteration": 2.4813790321350098 + }, + { + "auxiliary_loss_clip": 0.0641039, + "auxiliary_loss_mlp": 0.01264709, + "balance_loss_clip": 0.06272174, + "balance_loss_mlp": 0.0125376, + "epoch": 0.6812565759807605, + "flos": 24870416791680.0, + "grad_norm": 2.109092836267495, + "language_loss": 0.64502859, + "learning_rate": 9.742539836972665e-07, + "loss": 0.72177964, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10943604, + "step": 11331, + "time_per_iteration": 2.6124520301818848 + }, + { + "auxiliary_loss_clip": 0.06407228, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06270872, + "balance_loss_mlp": 0.01254666, + "epoch": 0.6813166992334285, + "flos": 17171852486400.0, + "grad_norm": 1.5406157015161637, + "language_loss": 0.72821605, + "learning_rate": 9.739196641245148e-07, + "loss": 0.80493855, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.1036377, + "step": 11332, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.06412952, + "auxiliary_loss_mlp": 0.01267338, + "balance_loss_clip": 0.06272908, + "balance_loss_mlp": 0.01256705, + "epoch": 0.6813768224860965, + "flos": 18849527735040.0, + "grad_norm": 2.149720533461842, + "language_loss": 0.74508882, + "learning_rate": 9.735853834608326e-07, + "loss": 0.82189173, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10638428, + "step": 11333, + "time_per_iteration": 2.5427186489105225 + }, + { + "auxiliary_loss_clip": 0.06414136, + "auxiliary_loss_mlp": 0.01267127, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01256786, + "epoch": 0.6814369457387645, + "flos": 24539228328960.0, + "grad_norm": 1.3823548887580743, + "language_loss": 0.72367668, + "learning_rate": 9.732511417188963e-07, + "loss": 0.80048931, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.10345459, + "step": 11334, + "time_per_iteration": 2.537958860397339 + }, + { + "auxiliary_loss_clip": 0.06405447, + "auxiliary_loss_mlp": 0.01266429, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.0125607, + "epoch": 0.6814970689914325, + "flos": 18228799088640.0, + "grad_norm": 1.6460074116702026, + "language_loss": 0.86505604, + "learning_rate": 9.729169389113791e-07, + "loss": 0.94177485, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10357666, + "step": 11335, + "time_per_iteration": 2.5018861293792725 + }, + { + "auxiliary_loss_clip": 0.06401964, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06271435, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6815571922441004, + "flos": 25235874374400.0, + "grad_norm": 1.6438782420335836, + "language_loss": 0.81760287, + "learning_rate": 9.725827750509542e-07, + "loss": 0.89428031, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10125732, + "step": 11336, + "time_per_iteration": 2.5359947681427 + }, + { + "auxiliary_loss_clip": 0.06403621, + "auxiliary_loss_mlp": 0.01268492, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.0125818, + "epoch": 0.6816173154967684, + "flos": 19460864724480.0, + "grad_norm": 1.9165693219649298, + "language_loss": 0.82064402, + "learning_rate": 9.72248650150294e-07, + "loss": 0.89736515, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10321045, + "step": 11337, + "time_per_iteration": 2.511289119720459 + }, + { + "auxiliary_loss_clip": 0.06404516, + "auxiliary_loss_mlp": 0.01264446, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01254462, + "epoch": 0.6816774387494363, + "flos": 17937288334080.0, + "grad_norm": 1.560533910826156, + "language_loss": 0.73002589, + "learning_rate": 9.719145642220673e-07, + "loss": 0.80671549, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09979248, + "step": 11338, + "time_per_iteration": 2.511681318283081 + }, + { + "auxiliary_loss_clip": 0.06413732, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.06275684, + "balance_loss_mlp": 0.01254337, + "epoch": 0.6817375620021043, + "flos": 22238937717120.0, + "grad_norm": 1.4240412111564371, + "language_loss": 0.77416432, + "learning_rate": 9.715805172789435e-07, + "loss": 0.8509506, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10559082, + "step": 11339, + "time_per_iteration": 2.5428354740142822 + }, + { + "auxiliary_loss_clip": 0.06410687, + "auxiliary_loss_mlp": 0.01264953, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.012542, + "epoch": 0.6817976852547722, + "flos": 25381462043520.0, + "grad_norm": 1.7944902461652392, + "language_loss": 0.71041632, + "learning_rate": 9.712465093335901e-07, + "loss": 0.78717273, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10748291, + "step": 11340, + "time_per_iteration": 2.550901412963867 + }, + { + "auxiliary_loss_clip": 0.06413396, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01256725, + "epoch": 0.6818578085074403, + "flos": 22271068558080.0, + "grad_norm": 2.180704981107058, + "language_loss": 0.84409666, + "learning_rate": 9.709125403986722e-07, + "loss": 0.92090219, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10437012, + "step": 11341, + "time_per_iteration": 2.5165159702301025 + }, + { + "auxiliary_loss_clip": 0.06414375, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06275092, + "balance_loss_mlp": 0.01255831, + "epoch": 0.6819179317601082, + "flos": 19324249441920.0, + "grad_norm": 1.5598647366733476, + "language_loss": 0.68810844, + "learning_rate": 9.705786104868531e-07, + "loss": 0.76491725, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10681152, + "step": 11342, + "time_per_iteration": 2.593763589859009 + }, + { + "auxiliary_loss_clip": 0.06407441, + "auxiliary_loss_mlp": 0.01261474, + "balance_loss_clip": 0.0627171, + "balance_loss_mlp": 0.01251342, + "epoch": 0.6819780550127762, + "flos": 21110224492800.0, + "grad_norm": 1.6656061272859015, + "language_loss": 0.74818993, + "learning_rate": 9.702447196107963e-07, + "loss": 0.82487905, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 11343, + "time_per_iteration": 2.524341344833374 + }, + { + "auxiliary_loss_clip": 0.06415273, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06277119, + "balance_loss_mlp": 0.01256055, + "epoch": 0.6820381782654441, + "flos": 29724214654080.0, + "grad_norm": 1.6102730777044594, + "language_loss": 0.80077457, + "learning_rate": 9.699108677831639e-07, + "loss": 0.87759268, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1048584, + "step": 11344, + "time_per_iteration": 2.559631586074829 + }, + { + "auxiliary_loss_clip": 0.06412022, + "auxiliary_loss_mlp": 0.01263183, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01252747, + "epoch": 0.6820983015181121, + "flos": 29249870290560.0, + "grad_norm": 1.8689488071291331, + "language_loss": 0.66530693, + "learning_rate": 9.695770550166136e-07, + "loss": 0.74205899, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10424805, + "step": 11345, + "time_per_iteration": 2.588878870010376 + }, + { + "auxiliary_loss_clip": 0.06416089, + "auxiliary_loss_mlp": 0.01264993, + "balance_loss_clip": 0.06275414, + "balance_loss_mlp": 0.01254538, + "epoch": 0.6821584247707801, + "flos": 18876375768960.0, + "grad_norm": 2.261790357681116, + "language_loss": 0.65540516, + "learning_rate": 9.692432813238054e-07, + "loss": 0.732216, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10461426, + "step": 11346, + "time_per_iteration": 2.4776885509490967 + }, + { + "auxiliary_loss_clip": 0.06415972, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06274392, + "balance_loss_mlp": 0.01253567, + "epoch": 0.6822185480234481, + "flos": 21330974874240.0, + "grad_norm": 1.434084459819624, + "language_loss": 0.7886349, + "learning_rate": 9.689095467173952e-07, + "loss": 0.86543655, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10632324, + "step": 11347, + "time_per_iteration": 3.919304132461548 + }, + { + "auxiliary_loss_clip": 0.06316185, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_clip": 0.06260848, + "balance_loss_mlp": 0.01254305, + "epoch": 0.6822786712761161, + "flos": 63505540949760.0, + "grad_norm": 0.7177694724545725, + "language_loss": 0.52512419, + "learning_rate": 9.685758512100378e-07, + "loss": 0.60084116, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01203918, + "step": 11348, + "time_per_iteration": 3.14101505279541 + }, + { + "auxiliary_loss_clip": 0.06413009, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06278681, + "balance_loss_mlp": 0.01255209, + "epoch": 0.682338794528784, + "flos": 21075242613120.0, + "grad_norm": 1.7094709865372797, + "language_loss": 0.79881036, + "learning_rate": 9.682421948143873e-07, + "loss": 0.87558699, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09448242, + "step": 11349, + "time_per_iteration": 2.497866630554199 + }, + { + "auxiliary_loss_clip": 0.06425133, + "auxiliary_loss_mlp": 0.01267838, + "balance_loss_clip": 0.06278804, + "balance_loss_mlp": 0.01255595, + "epoch": 0.682398917781452, + "flos": 36292053237120.0, + "grad_norm": 1.5698213232216975, + "language_loss": 0.7393533, + "learning_rate": 9.67908577543096e-07, + "loss": 0.81628305, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.12243652, + "step": 11350, + "time_per_iteration": 2.62261700630188 + }, + { + "auxiliary_loss_clip": 0.06411327, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 0.06275079, + "balance_loss_mlp": 0.01258094, + "epoch": 0.6824590410341199, + "flos": 24865427473920.0, + "grad_norm": 1.5591585279724258, + "language_loss": 0.79965377, + "learning_rate": 9.675749994088161e-07, + "loss": 0.87644625, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09832764, + "step": 11351, + "time_per_iteration": 2.528369665145874 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01262582, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.0125292, + "epoch": 0.6825191642867879, + "flos": 22458430287360.0, + "grad_norm": 1.5623570195172147, + "language_loss": 0.73523104, + "learning_rate": 9.672414604241954e-07, + "loss": 0.81194711, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09661865, + "step": 11352, + "time_per_iteration": 2.522172451019287 + }, + { + "auxiliary_loss_clip": 0.06413847, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253677, + "epoch": 0.6825792875394558, + "flos": 29432116920960.0, + "grad_norm": 1.626079801889606, + "language_loss": 0.804649, + "learning_rate": 9.669079606018814e-07, + "loss": 0.88144076, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11639404, + "step": 11353, + "time_per_iteration": 2.5686585903167725 + }, + { + "auxiliary_loss_clip": 0.06413363, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06276349, + "balance_loss_mlp": 0.01254747, + "epoch": 0.6826394107921239, + "flos": 18777006426240.0, + "grad_norm": 1.604562568600035, + "language_loss": 0.78506744, + "learning_rate": 9.665744999545218e-07, + "loss": 0.86185712, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10864258, + "step": 11354, + "time_per_iteration": 2.5204999446868896 + }, + { + "auxiliary_loss_clip": 0.06408085, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06272091, + "balance_loss_mlp": 0.0125355, + "epoch": 0.6826995340447918, + "flos": 16623142024320.0, + "grad_norm": 2.019321118646576, + "language_loss": 0.62111843, + "learning_rate": 9.662410784947599e-07, + "loss": 0.69783312, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09814453, + "step": 11355, + "time_per_iteration": 2.4766104221343994 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01263892, + "balance_loss_clip": 0.0627443, + "balance_loss_mlp": 0.01254117, + "epoch": 0.6827596572974598, + "flos": 20854282596480.0, + "grad_norm": 1.7897850919384148, + "language_loss": 0.82221437, + "learning_rate": 9.659076962352398e-07, + "loss": 0.89897794, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09771729, + "step": 11356, + "time_per_iteration": 3.9204885959625244 + }, + { + "auxiliary_loss_clip": 0.06415853, + "auxiliary_loss_mlp": 0.01263188, + "balance_loss_clip": 0.06275809, + "balance_loss_mlp": 0.01252561, + "epoch": 0.6828197805501277, + "flos": 22754804578560.0, + "grad_norm": 1.6532324250211312, + "language_loss": 0.78508228, + "learning_rate": 9.655743531886052e-07, + "loss": 0.86187267, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10626221, + "step": 11357, + "time_per_iteration": 2.5153608322143555 + }, + { + "auxiliary_loss_clip": 0.06314074, + "auxiliary_loss_mlp": 0.01254778, + "balance_loss_clip": 0.06258625, + "balance_loss_mlp": 0.01253596, + "epoch": 0.6828799038027957, + "flos": 71668833598080.0, + "grad_norm": 0.7966113468619515, + "language_loss": 0.59682757, + "learning_rate": 9.65241049367493e-07, + "loss": 0.67251611, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01180267, + "step": 11358, + "time_per_iteration": 3.1846532821655273 + }, + { + "auxiliary_loss_clip": 0.06419402, + "auxiliary_loss_mlp": 0.01269456, + "balance_loss_clip": 0.06276588, + "balance_loss_mlp": 0.01257648, + "epoch": 0.6829400270554637, + "flos": 19835378547840.0, + "grad_norm": 1.7044245093067194, + "language_loss": 0.78866333, + "learning_rate": 9.64907784784544e-07, + "loss": 0.86555189, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11816406, + "step": 11359, + "time_per_iteration": 2.5490803718566895 + }, + { + "auxiliary_loss_clip": 0.064127, + "auxiliary_loss_mlp": 0.01264331, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01253734, + "epoch": 0.6830001503081317, + "flos": 21987020816640.0, + "grad_norm": 2.0193369174380664, + "language_loss": 0.82223153, + "learning_rate": 9.645745594523958e-07, + "loss": 0.89900184, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.105896, + "step": 11360, + "time_per_iteration": 3.9807236194610596 + }, + { + "auxiliary_loss_clip": 0.0641343, + "auxiliary_loss_mlp": 0.01265293, + "balance_loss_clip": 0.06274153, + "balance_loss_mlp": 0.01254677, + "epoch": 0.6830602735607997, + "flos": 24323718827520.0, + "grad_norm": 1.651921957497636, + "language_loss": 0.75011313, + "learning_rate": 9.642413733836844e-07, + "loss": 0.82690036, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1060791, + "step": 11361, + "time_per_iteration": 2.535749673843384 + }, + { + "auxiliary_loss_clip": 0.06309322, + "auxiliary_loss_mlp": 0.01254085, + "balance_loss_clip": 0.06253715, + "balance_loss_mlp": 0.01252928, + "epoch": 0.6831203968134676, + "flos": 57706827793920.0, + "grad_norm": 0.8409522652001101, + "language_loss": 0.595146, + "learning_rate": 9.639082265910437e-07, + "loss": 0.67078006, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01154327, + "step": 11362, + "time_per_iteration": 3.249852180480957 + }, + { + "auxiliary_loss_clip": 0.06412338, + "auxiliary_loss_mlp": 0.0126686, + "balance_loss_clip": 0.06271093, + "balance_loss_mlp": 0.01255792, + "epoch": 0.6831805200661356, + "flos": 14393024807040.0, + "grad_norm": 2.0585212828502004, + "language_loss": 0.76010299, + "learning_rate": 9.635751190871074e-07, + "loss": 0.83689499, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11077881, + "step": 11363, + "time_per_iteration": 2.5203006267547607 + }, + { + "auxiliary_loss_clip": 0.06410082, + "auxiliary_loss_mlp": 0.01264688, + "balance_loss_clip": 0.06273843, + "balance_loss_mlp": 0.01253828, + "epoch": 0.6832406433188035, + "flos": 22826906616960.0, + "grad_norm": 2.358731005347766, + "language_loss": 0.89481944, + "learning_rate": 9.632420508845063e-07, + "loss": 0.97156709, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10870361, + "step": 11364, + "time_per_iteration": 2.5663001537323 + }, + { + "auxiliary_loss_clip": 0.06405666, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 0.06269991, + "balance_loss_mlp": 0.01255721, + "epoch": 0.6833007665714715, + "flos": 17566673725440.0, + "grad_norm": 1.8217270673941708, + "language_loss": 0.88218802, + "learning_rate": 9.629090219958697e-07, + "loss": 0.95890021, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09838867, + "step": 11365, + "time_per_iteration": 3.9711902141571045 + }, + { + "auxiliary_loss_clip": 0.06422257, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 0.06279552, + "balance_loss_mlp": 0.01261222, + "epoch": 0.6833608898241395, + "flos": 22450883420160.0, + "grad_norm": 1.95679459658848, + "language_loss": 0.81100428, + "learning_rate": 9.625760324338272e-07, + "loss": 0.88795125, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11230469, + "step": 11366, + "time_per_iteration": 2.496051788330078 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.01263817, + "balance_loss_clip": 0.06271282, + "balance_loss_mlp": 0.01253434, + "epoch": 0.6834210130768075, + "flos": 24541450462080.0, + "grad_norm": 1.3668234382616995, + "language_loss": 0.76664793, + "learning_rate": 9.622430822110062e-07, + "loss": 0.84339321, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.1038208, + "step": 11367, + "time_per_iteration": 2.597698450088501 + }, + { + "auxiliary_loss_clip": 0.06411598, + "auxiliary_loss_mlp": 0.01263902, + "balance_loss_clip": 0.06272662, + "balance_loss_mlp": 0.0125312, + "epoch": 0.6834811363294754, + "flos": 20053235963520.0, + "grad_norm": 1.5010742143698117, + "language_loss": 0.69233596, + "learning_rate": 9.619101713400312e-07, + "loss": 0.76909101, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10791016, + "step": 11368, + "time_per_iteration": 2.520679473876953 + }, + { + "auxiliary_loss_clip": 0.06409574, + "auxiliary_loss_mlp": 0.01266367, + "balance_loss_clip": 0.06272889, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6835412595821434, + "flos": 24797727774720.0, + "grad_norm": 1.604090291521746, + "language_loss": 0.73295021, + "learning_rate": 9.615772998335261e-07, + "loss": 0.80970967, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1083374, + "step": 11369, + "time_per_iteration": 2.5773866176605225 + }, + { + "auxiliary_loss_clip": 0.06409427, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06271335, + "balance_loss_mlp": 0.01254067, + "epoch": 0.6836013828348113, + "flos": 19506454145280.0, + "grad_norm": 1.9399454003386187, + "language_loss": 0.79163188, + "learning_rate": 9.612444677041138e-07, + "loss": 0.86836743, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10064697, + "step": 11370, + "time_per_iteration": 2.4922618865966797 + }, + { + "auxiliary_loss_clip": 0.06306867, + "auxiliary_loss_mlp": 0.01250813, + "balance_loss_clip": 0.06251401, + "balance_loss_mlp": 0.0124961, + "epoch": 0.6836615060874793, + "flos": 58383753402240.0, + "grad_norm": 0.8179842252969125, + "language_loss": 0.59746689, + "learning_rate": 9.609116749644162e-07, + "loss": 0.67304367, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0120163, + "step": 11371, + "time_per_iteration": 3.0478594303131104 + }, + { + "auxiliary_loss_clip": 0.06402698, + "auxiliary_loss_mlp": 0.01263932, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01254175, + "epoch": 0.6837216293401474, + "flos": 12171796122240.0, + "grad_norm": 1.5508500684767301, + "language_loss": 0.63639355, + "learning_rate": 9.605789216270511e-07, + "loss": 0.71305984, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09759521, + "step": 11372, + "time_per_iteration": 2.4811301231384277 + }, + { + "auxiliary_loss_clip": 0.06408484, + "auxiliary_loss_mlp": 0.01265592, + "balance_loss_clip": 0.06272547, + "balance_loss_mlp": 0.01255137, + "epoch": 0.6837817525928153, + "flos": 22134159786240.0, + "grad_norm": 1.4333850518313196, + "language_loss": 0.71846133, + "learning_rate": 9.602462077046375e-07, + "loss": 0.79520208, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10461426, + "step": 11373, + "time_per_iteration": 2.5287580490112305 + }, + { + "auxiliary_loss_clip": 0.06305692, + "auxiliary_loss_mlp": 0.01251081, + "balance_loss_clip": 0.06250165, + "balance_loss_mlp": 0.01249923, + "epoch": 0.6838418758454833, + "flos": 65027048186880.0, + "grad_norm": 1.1033743133145881, + "language_loss": 0.56752723, + "learning_rate": 9.599135332097935e-07, + "loss": 0.6430949, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01155853, + "step": 11374, + "time_per_iteration": 3.302116632461548 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01268892, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257895, + "epoch": 0.6839019990981512, + "flos": 21036864643200.0, + "grad_norm": 1.4837774857580213, + "language_loss": 0.7423023, + "learning_rate": 9.595808981551312e-07, + "loss": 0.81910115, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11001587, + "step": 11375, + "time_per_iteration": 2.5274906158447266 + }, + { + "auxiliary_loss_clip": 0.06406655, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01257684, + "epoch": 0.6839621223508192, + "flos": 24942351121920.0, + "grad_norm": 1.6223536594822023, + "language_loss": 0.7043916, + "learning_rate": 9.592483025532651e-07, + "loss": 0.78113139, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09637451, + "step": 11376, + "time_per_iteration": 2.5494120121002197 + }, + { + "auxiliary_loss_clip": 0.06412984, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06272008, + "balance_loss_mlp": 0.01253161, + "epoch": 0.6840222456034871, + "flos": 26365929264000.0, + "grad_norm": 1.7833627654713686, + "language_loss": 0.74259639, + "learning_rate": 9.58915746416808e-07, + "loss": 0.81936419, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10632324, + "step": 11377, + "time_per_iteration": 2.5434489250183105 + }, + { + "auxiliary_loss_clip": 0.06309253, + "auxiliary_loss_mlp": 0.01251187, + "balance_loss_clip": 0.06253564, + "balance_loss_mlp": 0.01249992, + "epoch": 0.6840823688561551, + "flos": 66009167493120.0, + "grad_norm": 0.7064811243320783, + "language_loss": 0.56814432, + "learning_rate": 9.585832297583707e-07, + "loss": 0.64374876, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01193237, + "step": 11378, + "time_per_iteration": 3.2616686820983887 + }, + { + "auxiliary_loss_clip": 0.06409612, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01254764, + "epoch": 0.684142492108823, + "flos": 21403999307520.0, + "grad_norm": 1.6132418851945567, + "language_loss": 0.78663373, + "learning_rate": 9.58250752590561e-07, + "loss": 0.86338598, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10858154, + "step": 11379, + "time_per_iteration": 2.53483247756958 + }, + { + "auxiliary_loss_clip": 0.06401949, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01254976, + "epoch": 0.6842026153614911, + "flos": 18806453936640.0, + "grad_norm": 2.5056443246249, + "language_loss": 0.68875623, + "learning_rate": 9.57918314925988e-07, + "loss": 0.76541233, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08685303, + "step": 11380, + "time_per_iteration": 2.5189809799194336 + }, + { + "auxiliary_loss_clip": 0.06407002, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01254678, + "epoch": 0.684262738614159, + "flos": 19652544938880.0, + "grad_norm": 1.774794382077768, + "language_loss": 0.78619421, + "learning_rate": 9.575859167772568e-07, + "loss": 0.8629148, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1038208, + "step": 11381, + "time_per_iteration": 2.5038013458251953 + }, + { + "auxiliary_loss_clip": 0.0631157, + "auxiliary_loss_mlp": 0.01250817, + "balance_loss_clip": 0.06255913, + "balance_loss_mlp": 0.01249629, + "epoch": 0.684322861866827, + "flos": 62371041793920.0, + "grad_norm": 0.8443750872588546, + "language_loss": 0.67272472, + "learning_rate": 9.572535581569713e-07, + "loss": 0.74834859, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01186371, + "step": 11382, + "time_per_iteration": 3.022620677947998 + }, + { + "auxiliary_loss_clip": 0.06309118, + "auxiliary_loss_mlp": 0.01252769, + "balance_loss_clip": 0.06253339, + "balance_loss_mlp": 0.01251537, + "epoch": 0.6843829851194949, + "flos": 65825704978560.0, + "grad_norm": 0.8346748203160914, + "language_loss": 0.58115959, + "learning_rate": 9.569212390777356e-07, + "loss": 0.65677845, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01231384, + "step": 11383, + "time_per_iteration": 3.205733060836792 + }, + { + "auxiliary_loss_clip": 0.06403822, + "auxiliary_loss_mlp": 0.01263656, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01253697, + "epoch": 0.6844431083721629, + "flos": 27862573766400.0, + "grad_norm": 1.743965936300629, + "language_loss": 0.79892695, + "learning_rate": 9.565889595521517e-07, + "loss": 0.87560171, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09960938, + "step": 11384, + "time_per_iteration": 2.576397657394409 + }, + { + "auxiliary_loss_clip": 0.0641057, + "auxiliary_loss_mlp": 0.01264349, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01253459, + "epoch": 0.684503231624831, + "flos": 18260091388800.0, + "grad_norm": 1.8125132078887, + "language_loss": 0.77559322, + "learning_rate": 9.562567195928187e-07, + "loss": 0.85234237, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10894775, + "step": 11385, + "time_per_iteration": 2.5222182273864746 + }, + { + "auxiliary_loss_clip": 0.06418984, + "auxiliary_loss_mlp": 0.01266461, + "balance_loss_clip": 0.0627387, + "balance_loss_mlp": 0.01254397, + "epoch": 0.6845633548774989, + "flos": 17645484090240.0, + "grad_norm": 2.2044599558463105, + "language_loss": 0.84624577, + "learning_rate": 9.55924519212335e-07, + "loss": 0.92310023, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.12072754, + "step": 11386, + "time_per_iteration": 3.9474587440490723 + }, + { + "auxiliary_loss_clip": 0.06409421, + "auxiliary_loss_mlp": 0.01262563, + "balance_loss_clip": 0.06272484, + "balance_loss_mlp": 0.01252883, + "epoch": 0.6846234781301669, + "flos": 20812843952640.0, + "grad_norm": 1.925558647056537, + "language_loss": 0.83398205, + "learning_rate": 9.555923584232984e-07, + "loss": 0.91070187, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09680176, + "step": 11387, + "time_per_iteration": 2.5117714405059814 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01254033, + "epoch": 0.6846836013828348, + "flos": 36110016241920.0, + "grad_norm": 1.588804983998274, + "language_loss": 0.72422922, + "learning_rate": 9.552602372383047e-07, + "loss": 0.80092275, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09692383, + "step": 11388, + "time_per_iteration": 2.669675588607788 + }, + { + "auxiliary_loss_clip": 0.0640699, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06272318, + "balance_loss_mlp": 0.01258198, + "epoch": 0.6847437246355028, + "flos": 43152408823680.0, + "grad_norm": 2.116517308354933, + "language_loss": 0.63188899, + "learning_rate": 9.549281556699469e-07, + "loss": 0.70863551, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09460449, + "step": 11389, + "time_per_iteration": 2.775179862976074 + }, + { + "auxiliary_loss_clip": 0.06304318, + "auxiliary_loss_mlp": 0.01252682, + "balance_loss_clip": 0.06248381, + "balance_loss_mlp": 0.01251546, + "epoch": 0.6848038478881707, + "flos": 71682768103680.0, + "grad_norm": 0.7038129025924749, + "language_loss": 0.55774271, + "learning_rate": 9.54596113730818e-07, + "loss": 0.63331264, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.55908203, + "router_z_loss_mlp": 0.01138306, + "step": 11390, + "time_per_iteration": 3.2121734619140625 + }, + { + "auxiliary_loss_clip": 0.06409647, + "auxiliary_loss_mlp": 0.01266416, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01255997, + "epoch": 0.6848639711408387, + "flos": 19943929912320.0, + "grad_norm": 1.8977282247890388, + "language_loss": 0.87613106, + "learning_rate": 9.542641114335109e-07, + "loss": 0.95289165, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10424805, + "step": 11391, + "time_per_iteration": 2.500140428543091 + }, + { + "auxiliary_loss_clip": 0.06412797, + "auxiliary_loss_mlp": 0.01263893, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253343, + "epoch": 0.6849240943935067, + "flos": 26874333112320.0, + "grad_norm": 1.48935328965904, + "language_loss": 0.79339015, + "learning_rate": 9.539321487906117e-07, + "loss": 0.870157, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10552979, + "step": 11392, + "time_per_iteration": 2.557020902633667 + }, + { + "auxiliary_loss_clip": 0.06403191, + "auxiliary_loss_mlp": 0.01264788, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254751, + "epoch": 0.6849842176461747, + "flos": 13740458808960.0, + "grad_norm": 2.0081405471627884, + "language_loss": 0.71175981, + "learning_rate": 9.536002258147104e-07, + "loss": 0.78843963, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10040283, + "step": 11393, + "time_per_iteration": 2.5271036624908447 + }, + { + "auxiliary_loss_clip": 0.06415832, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01255, + "epoch": 0.6850443408988426, + "flos": 24980058259200.0, + "grad_norm": 1.5317798757580128, + "language_loss": 0.64661515, + "learning_rate": 9.532683425183936e-07, + "loss": 0.72342944, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10595703, + "step": 11394, + "time_per_iteration": 2.53812313079834 + }, + { + "auxiliary_loss_clip": 0.06411145, + "auxiliary_loss_mlp": 0.01264493, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.0125439, + "epoch": 0.6851044641515106, + "flos": 27751380998400.0, + "grad_norm": 1.5645262580549901, + "language_loss": 0.80918968, + "learning_rate": 9.529364989142468e-07, + "loss": 0.88594604, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10101318, + "step": 11395, + "time_per_iteration": 2.550346851348877 + }, + { + "auxiliary_loss_clip": 0.06410371, + "auxiliary_loss_mlp": 0.01268735, + "balance_loss_clip": 0.06274814, + "balance_loss_mlp": 0.01258144, + "epoch": 0.6851645874041785, + "flos": 24357652531200.0, + "grad_norm": 1.7469268170163024, + "language_loss": 0.72832096, + "learning_rate": 9.526046950148527e-07, + "loss": 0.80511206, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10595703, + "step": 11396, + "time_per_iteration": 3.9635422229766846 + }, + { + "auxiliary_loss_clip": 0.06410467, + "auxiliary_loss_mlp": 0.01265588, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01255056, + "epoch": 0.6852247106568465, + "flos": 15081914350080.0, + "grad_norm": 2.3772034852800643, + "language_loss": 0.79818743, + "learning_rate": 9.522729308327931e-07, + "loss": 0.87494791, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10528564, + "step": 11397, + "time_per_iteration": 2.481863260269165 + }, + { + "auxiliary_loss_clip": 0.06411494, + "auxiliary_loss_mlp": 0.01267109, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01256828, + "epoch": 0.6852848339095146, + "flos": 18775874396160.0, + "grad_norm": 1.839103323810105, + "language_loss": 0.71941662, + "learning_rate": 9.519412063806493e-07, + "loss": 0.7962026, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10284424, + "step": 11398, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.06403108, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06270117, + "balance_loss_mlp": 0.0125632, + "epoch": 0.6853449571621825, + "flos": 27861651371520.0, + "grad_norm": 1.5188649145265738, + "language_loss": 0.71170795, + "learning_rate": 9.516095216709996e-07, + "loss": 0.78839701, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0947876, + "step": 11399, + "time_per_iteration": 3.972925901412964 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.06273123, + "balance_loss_mlp": 0.01259119, + "epoch": 0.6854050804148505, + "flos": 18156403560960.0, + "grad_norm": 1.6092651373600877, + "language_loss": 0.70567757, + "learning_rate": 9.512778767164217e-07, + "loss": 0.78248316, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10217285, + "step": 11400, + "time_per_iteration": 2.474824905395508 + }, + { + "auxiliary_loss_clip": 0.06426042, + "auxiliary_loss_mlp": 0.01267609, + "balance_loss_clip": 0.06277213, + "balance_loss_mlp": 0.01255163, + "epoch": 0.6854652036675184, + "flos": 16331798977920.0, + "grad_norm": 1.9177955333528751, + "language_loss": 0.77889669, + "learning_rate": 9.509462715294927e-07, + "loss": 0.85583317, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.12463379, + "step": 11401, + "time_per_iteration": 2.5186407566070557 + }, + { + "auxiliary_loss_clip": 0.06405222, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.01256537, + "epoch": 0.6855253269201864, + "flos": 14946347243520.0, + "grad_norm": 2.060399475016654, + "language_loss": 0.75462782, + "learning_rate": 9.50614706122786e-07, + "loss": 0.83134115, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0958252, + "step": 11402, + "time_per_iteration": 2.461958885192871 + }, + { + "auxiliary_loss_clip": 0.06414859, + "auxiliary_loss_mlp": 0.01266931, + "balance_loss_clip": 0.06273296, + "balance_loss_mlp": 0.01255487, + "epoch": 0.6855854501728543, + "flos": 23044135127040.0, + "grad_norm": 1.4779944862214063, + "language_loss": 0.73165995, + "learning_rate": 9.502831805088742e-07, + "loss": 0.80847782, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11444092, + "step": 11403, + "time_per_iteration": 2.5588088035583496 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264829, + "balance_loss_clip": 0.06272316, + "balance_loss_mlp": 0.0125522, + "epoch": 0.6856455734255223, + "flos": 13257393621120.0, + "grad_norm": 3.459862281853561, + "language_loss": 0.81727648, + "learning_rate": 9.499516947003294e-07, + "loss": 0.89400232, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09613037, + "step": 11404, + "time_per_iteration": 3.899538993835449 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06274688, + "balance_loss_mlp": 0.01259381, + "epoch": 0.6857056966781903, + "flos": 23340551345280.0, + "grad_norm": 1.3350169784860642, + "language_loss": 0.7794162, + "learning_rate": 9.496202487097222e-07, + "loss": 0.8561843, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 11405, + "time_per_iteration": 2.618781089782715 + }, + { + "auxiliary_loss_clip": 0.06313835, + "auxiliary_loss_mlp": 0.01251022, + "balance_loss_clip": 0.06257869, + "balance_loss_mlp": 0.01250013, + "epoch": 0.6857658199308583, + "flos": 61870646010240.0, + "grad_norm": 0.7926132752302004, + "language_loss": 0.60793728, + "learning_rate": 9.492888425496199e-07, + "loss": 0.68358588, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01009369, + "step": 11406, + "time_per_iteration": 3.192826986312866 + }, + { + "auxiliary_loss_clip": 0.06409362, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253826, + "epoch": 0.6858259431835262, + "flos": 16660178328960.0, + "grad_norm": 1.6678552032285212, + "language_loss": 0.77383244, + "learning_rate": 9.489574762325907e-07, + "loss": 0.85056722, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10296631, + "step": 11407, + "time_per_iteration": 2.5133752822875977 + }, + { + "auxiliary_loss_clip": 0.06408191, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.0626992, + "balance_loss_mlp": 0.0125455, + "epoch": 0.6858860664361942, + "flos": 21879643409280.0, + "grad_norm": 2.893760051958565, + "language_loss": 0.71341193, + "learning_rate": 9.486261497711991e-07, + "loss": 0.79014993, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11053467, + "step": 11408, + "time_per_iteration": 2.5356616973876953 + }, + { + "auxiliary_loss_clip": 0.06413727, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.0125514, + "epoch": 0.6859461896888621, + "flos": 15272965658880.0, + "grad_norm": 1.731957908279727, + "language_loss": 0.70413965, + "learning_rate": 9.482948631780087e-07, + "loss": 0.78093535, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1071167, + "step": 11409, + "time_per_iteration": 2.52020525932312 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01263971, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01254733, + "epoch": 0.6860063129415301, + "flos": 18625507044480.0, + "grad_norm": 1.590904402895803, + "language_loss": 0.78129441, + "learning_rate": 9.479636164655825e-07, + "loss": 0.85794687, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09240723, + "step": 11410, + "time_per_iteration": 2.546893358230591 + }, + { + "auxiliary_loss_clip": 0.06412078, + "auxiliary_loss_mlp": 0.01266884, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01256078, + "epoch": 0.6860664361941982, + "flos": 23958177390720.0, + "grad_norm": 1.8721880718662787, + "language_loss": 0.7200377, + "learning_rate": 9.476324096464821e-07, + "loss": 0.79682732, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1081543, + "step": 11411, + "time_per_iteration": 2.532982349395752 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01268743, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01258551, + "epoch": 0.6861265594468661, + "flos": 20413243031040.0, + "grad_norm": 1.9740044070304406, + "language_loss": 0.70534211, + "learning_rate": 9.473012427332654e-07, + "loss": 0.78214926, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10192871, + "step": 11412, + "time_per_iteration": 2.5798745155334473 + }, + { + "auxiliary_loss_clip": 0.06410308, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01256324, + "epoch": 0.6861866826995341, + "flos": 11431908570240.0, + "grad_norm": 3.0856036818138692, + "language_loss": 0.71973193, + "learning_rate": 9.469701157384919e-07, + "loss": 0.79650223, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10394287, + "step": 11413, + "time_per_iteration": 2.4693074226379395 + }, + { + "auxiliary_loss_clip": 0.06411856, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06274316, + "balance_loss_mlp": 0.01257518, + "epoch": 0.686246805952202, + "flos": 16003084210560.0, + "grad_norm": 1.8173139685722925, + "language_loss": 0.73670095, + "learning_rate": 9.466390286747164e-07, + "loss": 0.81349689, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10217285, + "step": 11414, + "time_per_iteration": 2.510739803314209 + }, + { + "auxiliary_loss_clip": 0.06415157, + "auxiliary_loss_mlp": 0.01267285, + "balance_loss_clip": 0.06276812, + "balance_loss_mlp": 0.01256425, + "epoch": 0.68630692920487, + "flos": 19832527509120.0, + "grad_norm": 2.474590574257684, + "language_loss": 0.87128049, + "learning_rate": 9.46307981554495e-07, + "loss": 0.94810498, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10852051, + "step": 11415, + "time_per_iteration": 2.4847946166992188 + }, + { + "auxiliary_loss_clip": 0.06415314, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 0.06276202, + "balance_loss_mlp": 0.01254705, + "epoch": 0.6863670524575379, + "flos": 26293366028160.0, + "grad_norm": 9.907368268016192, + "language_loss": 0.67353249, + "learning_rate": 9.459769743903801e-07, + "loss": 0.75034899, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11633301, + "step": 11416, + "time_per_iteration": 2.5904948711395264 + }, + { + "auxiliary_loss_clip": 0.06403923, + "auxiliary_loss_mlp": 0.0126434, + "balance_loss_clip": 0.06269173, + "balance_loss_mlp": 0.01254284, + "epoch": 0.686427175710206, + "flos": 19179374532480.0, + "grad_norm": 1.4750819254499818, + "language_loss": 0.76489693, + "learning_rate": 9.456460071949237e-07, + "loss": 0.84157956, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10058594, + "step": 11417, + "time_per_iteration": 2.487197160720825 + }, + { + "auxiliary_loss_clip": 0.06410322, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.0627322, + "balance_loss_mlp": 0.01258863, + "epoch": 0.6864872989628739, + "flos": 18922636022400.0, + "grad_norm": 1.8452434101813986, + "language_loss": 0.77370739, + "learning_rate": 9.45315079980678e-07, + "loss": 0.85049683, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09759521, + "step": 11418, + "time_per_iteration": 2.510810375213623 + }, + { + "auxiliary_loss_clip": 0.06410821, + "auxiliary_loss_mlp": 0.01265598, + "balance_loss_clip": 0.06272699, + "balance_loss_mlp": 0.01255382, + "epoch": 0.6865474222155419, + "flos": 25963016106240.0, + "grad_norm": 1.6317928435070383, + "language_loss": 0.76463497, + "learning_rate": 9.449841927601887e-07, + "loss": 0.84139907, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10217285, + "step": 11419, + "time_per_iteration": 2.5700454711914062 + }, + { + "auxiliary_loss_clip": 0.06407338, + "auxiliary_loss_mlp": 0.01267938, + "balance_loss_clip": 0.06270772, + "balance_loss_mlp": 0.01258359, + "epoch": 0.6866075454682098, + "flos": 18483902444160.0, + "grad_norm": 1.6443171286333353, + "language_loss": 0.71588171, + "learning_rate": 9.446533455460044e-07, + "loss": 0.79263443, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.0958252, + "step": 11420, + "time_per_iteration": 2.5144495964050293 + }, + { + "auxiliary_loss_clip": 0.06407318, + "auxiliary_loss_mlp": 0.0126343, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01253506, + "epoch": 0.6866676687208778, + "flos": 34248459208320.0, + "grad_norm": 1.3410332761873145, + "language_loss": 0.75059515, + "learning_rate": 9.443225383506712e-07, + "loss": 0.82730258, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09924316, + "step": 11421, + "time_per_iteration": 2.61454176902771 + }, + { + "auxiliary_loss_clip": 0.0640727, + "auxiliary_loss_mlp": 0.01265626, + "balance_loss_clip": 0.06272772, + "balance_loss_mlp": 0.01255982, + "epoch": 0.6867277919735457, + "flos": 21727515121920.0, + "grad_norm": 1.6725729939473468, + "language_loss": 0.77230668, + "learning_rate": 9.439917711867338e-07, + "loss": 0.84903562, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09637451, + "step": 11422, + "time_per_iteration": 2.5174617767333984 + }, + { + "auxiliary_loss_clip": 0.0641562, + "auxiliary_loss_mlp": 0.01272736, + "balance_loss_clip": 0.06279219, + "balance_loss_mlp": 0.01261536, + "epoch": 0.6867879152262137, + "flos": 24104939016960.0, + "grad_norm": 1.647039828063758, + "language_loss": 0.77276117, + "learning_rate": 9.436610440667334e-07, + "loss": 0.84964472, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11206055, + "step": 11423, + "time_per_iteration": 2.5189144611358643 + }, + { + "auxiliary_loss_clip": 0.06414216, + "auxiliary_loss_mlp": 0.01267082, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.01256461, + "epoch": 0.6868480384788818, + "flos": 21622150212480.0, + "grad_norm": 1.4426214659548335, + "language_loss": 0.73124474, + "learning_rate": 9.433303570032129e-07, + "loss": 0.80805779, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10614014, + "step": 11424, + "time_per_iteration": 2.5789601802825928 + }, + { + "auxiliary_loss_clip": 0.06411408, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254839, + "epoch": 0.6869081617315497, + "flos": 26293282174080.0, + "grad_norm": 1.8417753723265369, + "language_loss": 0.65276968, + "learning_rate": 9.429997100087112e-07, + "loss": 0.72953665, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10437012, + "step": 11425, + "time_per_iteration": 2.547678232192993 + }, + { + "auxiliary_loss_clip": 0.06408506, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.06275355, + "balance_loss_mlp": 0.01257381, + "epoch": 0.6869682849842177, + "flos": 21111356522880.0, + "grad_norm": 1.3347714221988014, + "language_loss": 0.71902603, + "learning_rate": 9.426691030957657e-07, + "loss": 0.79578817, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10327148, + "step": 11426, + "time_per_iteration": 4.051712512969971 + }, + { + "auxiliary_loss_clip": 0.06412126, + "auxiliary_loss_mlp": 0.01266408, + "balance_loss_clip": 0.06274693, + "balance_loss_mlp": 0.0125606, + "epoch": 0.6870284082368856, + "flos": 17098408782720.0, + "grad_norm": 2.192498277588843, + "language_loss": 0.85740101, + "learning_rate": 9.423385362769136e-07, + "loss": 0.93418634, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10351562, + "step": 11427, + "time_per_iteration": 2.533590316772461 + }, + { + "auxiliary_loss_clip": 0.06408241, + "auxiliary_loss_mlp": 0.01263719, + "balance_loss_clip": 0.06273334, + "balance_loss_mlp": 0.01253312, + "epoch": 0.6870885314895536, + "flos": 27315456531840.0, + "grad_norm": 1.4340637684485376, + "language_loss": 0.76548541, + "learning_rate": 9.420080095646909e-07, + "loss": 0.84220493, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10412598, + "step": 11428, + "time_per_iteration": 2.579432249069214 + }, + { + "auxiliary_loss_clip": 0.06414707, + "auxiliary_loss_mlp": 0.0127044, + "balance_loss_clip": 0.06273684, + "balance_loss_mlp": 0.01259002, + "epoch": 0.6871486547422215, + "flos": 20820977798400.0, + "grad_norm": 2.1898072552839087, + "language_loss": 0.73509127, + "learning_rate": 9.4167752297163e-07, + "loss": 0.81194276, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11437988, + "step": 11429, + "time_per_iteration": 2.508434772491455 + }, + { + "auxiliary_loss_clip": 0.0641626, + "auxiliary_loss_mlp": 0.01266327, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256474, + "epoch": 0.6872087779948896, + "flos": 30161983910400.0, + "grad_norm": 1.931452469341354, + "language_loss": 0.83630431, + "learning_rate": 9.413470765102643e-07, + "loss": 0.91313016, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.09851074, + "step": 11430, + "time_per_iteration": 2.630755662918091 + }, + { + "auxiliary_loss_clip": 0.06412026, + "auxiliary_loss_mlp": 0.0126587, + "balance_loss_clip": 0.06274621, + "balance_loss_mlp": 0.0125504, + "epoch": 0.6872689012475575, + "flos": 20710917060480.0, + "grad_norm": 2.0596974928309253, + "language_loss": 0.70543802, + "learning_rate": 9.410166701931225e-07, + "loss": 0.78221703, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10827637, + "step": 11431, + "time_per_iteration": 2.491147756576538 + }, + { + "auxiliary_loss_clip": 0.06409967, + "auxiliary_loss_mlp": 0.01264771, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254293, + "epoch": 0.6873290245002255, + "flos": 25528014034560.0, + "grad_norm": 1.7781814059522836, + "language_loss": 0.80397063, + "learning_rate": 9.406863040327355e-07, + "loss": 0.88071799, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1048584, + "step": 11432, + "time_per_iteration": 2.5659162998199463 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01268851, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01259362, + "epoch": 0.6873891477528934, + "flos": 25198418799360.0, + "grad_norm": 2.2741442538336125, + "language_loss": 0.68286675, + "learning_rate": 9.403559780416295e-07, + "loss": 0.75959998, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09490967, + "step": 11433, + "time_per_iteration": 2.6121439933776855 + }, + { + "auxiliary_loss_clip": 0.064156, + "auxiliary_loss_mlp": 0.01269066, + "balance_loss_clip": 0.06278776, + "balance_loss_mlp": 0.01258665, + "epoch": 0.6874492710055614, + "flos": 35161034025600.0, + "grad_norm": 2.030098002823672, + "language_loss": 0.72783715, + "learning_rate": 9.400256922323309e-07, + "loss": 0.8046838, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10400391, + "step": 11434, + "time_per_iteration": 2.6294844150543213 + }, + { + "auxiliary_loss_clip": 0.06410138, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 0.06275442, + "balance_loss_mlp": 0.0125919, + "epoch": 0.6875093942582293, + "flos": 17828066136960.0, + "grad_norm": 1.5552043430175444, + "language_loss": 0.80520236, + "learning_rate": 9.396954466173657e-07, + "loss": 0.88199627, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 11435, + "time_per_iteration": 2.501239061355591 + }, + { + "auxiliary_loss_clip": 0.06411996, + "auxiliary_loss_mlp": 0.01269183, + "balance_loss_clip": 0.06272568, + "balance_loss_mlp": 0.01258227, + "epoch": 0.6875695175108973, + "flos": 20710875133440.0, + "grad_norm": 9.52111477806384, + "language_loss": 0.8158865, + "learning_rate": 9.393652412092538e-07, + "loss": 0.89269829, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10943604, + "step": 11436, + "time_per_iteration": 3.8841755390167236 + }, + { + "auxiliary_loss_clip": 0.064064, + "auxiliary_loss_mlp": 0.01268806, + "balance_loss_clip": 0.0627645, + "balance_loss_mlp": 0.01259806, + "epoch": 0.6876296407635654, + "flos": 25381000846080.0, + "grad_norm": 1.6419248940044093, + "language_loss": 0.81966716, + "learning_rate": 9.390350760205183e-07, + "loss": 0.89641917, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08996582, + "step": 11437, + "time_per_iteration": 2.5980188846588135 + }, + { + "auxiliary_loss_clip": 0.06421375, + "auxiliary_loss_mlp": 0.01270532, + "balance_loss_clip": 0.06274987, + "balance_loss_mlp": 0.01257729, + "epoch": 0.6876897640162333, + "flos": 23229107015040.0, + "grad_norm": 2.1640181952928486, + "language_loss": 0.77725911, + "learning_rate": 9.387049510636793e-07, + "loss": 0.85417819, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.12792969, + "step": 11438, + "time_per_iteration": 2.5095889568328857 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01270285, + "balance_loss_clip": 0.06273987, + "balance_loss_mlp": 0.01260838, + "epoch": 0.6877498872689013, + "flos": 27131448965760.0, + "grad_norm": 1.6644547524403899, + "language_loss": 0.72329235, + "learning_rate": 9.383748663512554e-07, + "loss": 0.80005264, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09448242, + "step": 11439, + "time_per_iteration": 3.9927306175231934 + }, + { + "auxiliary_loss_clip": 0.06406644, + "auxiliary_loss_mlp": 0.01268484, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01258554, + "epoch": 0.6878100105215692, + "flos": 11586217063680.0, + "grad_norm": 1.9676653989850965, + "language_loss": 0.75157619, + "learning_rate": 9.380448218957623e-07, + "loss": 0.82832754, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09936523, + "step": 11440, + "time_per_iteration": 2.4851269721984863 + }, + { + "auxiliary_loss_clip": 0.06404521, + "auxiliary_loss_mlp": 0.01267859, + "balance_loss_clip": 0.06272353, + "balance_loss_mlp": 0.012584, + "epoch": 0.6878701337742372, + "flos": 20309429422080.0, + "grad_norm": 1.4828372396976293, + "language_loss": 0.71795368, + "learning_rate": 9.377148177097167e-07, + "loss": 0.79467738, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09448242, + "step": 11441, + "time_per_iteration": 2.514653444290161 + }, + { + "auxiliary_loss_clip": 0.06418902, + "auxiliary_loss_mlp": 0.01272176, + "balance_loss_clip": 0.06276838, + "balance_loss_mlp": 0.01260893, + "epoch": 0.6879302570269051, + "flos": 13844398199040.0, + "grad_norm": 1.6175108384355714, + "language_loss": 0.66777945, + "learning_rate": 9.373848538056317e-07, + "loss": 0.74469018, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11291504, + "step": 11442, + "time_per_iteration": 2.5146420001983643 + }, + { + "auxiliary_loss_clip": 0.06411453, + "auxiliary_loss_mlp": 0.01266841, + "balance_loss_clip": 0.06274946, + "balance_loss_mlp": 0.01256547, + "epoch": 0.6879903802795732, + "flos": 21331058728320.0, + "grad_norm": 2.38232064736284, + "language_loss": 0.69958794, + "learning_rate": 9.370549301960189e-07, + "loss": 0.77637082, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10290527, + "step": 11443, + "time_per_iteration": 2.493436574935913 + }, + { + "auxiliary_loss_clip": 0.06419516, + "auxiliary_loss_mlp": 0.01266925, + "balance_loss_clip": 0.06279808, + "balance_loss_mlp": 0.01256524, + "epoch": 0.6880505035322411, + "flos": 25158489528960.0, + "grad_norm": 1.390720225309701, + "language_loss": 0.763533, + "learning_rate": 9.367250468933893e-07, + "loss": 0.84039736, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10394287, + "step": 11444, + "time_per_iteration": 3.9500269889831543 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01267311, + "balance_loss_clip": 0.06272952, + "balance_loss_mlp": 0.01257059, + "epoch": 0.6881106267849091, + "flos": 23221182804480.0, + "grad_norm": 1.8756092745031845, + "language_loss": 0.76660252, + "learning_rate": 9.363952039102536e-07, + "loss": 0.84334326, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10253906, + "step": 11445, + "time_per_iteration": 2.488555908203125 + }, + { + "auxiliary_loss_clip": 0.06317502, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06261797, + "balance_loss_mlp": 0.01251243, + "epoch": 0.688170750037577, + "flos": 48497741136000.0, + "grad_norm": 0.8087198242159813, + "language_loss": 0.58278191, + "learning_rate": 9.360654012591183e-07, + "loss": 0.65848243, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01306915, + "step": 11446, + "time_per_iteration": 3.1777503490448 + }, + { + "auxiliary_loss_clip": 0.06413881, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06273392, + "balance_loss_mlp": 0.01254562, + "epoch": 0.688230873290245, + "flos": 22790205728640.0, + "grad_norm": 1.616943103064761, + "language_loss": 0.76008183, + "learning_rate": 9.357356389524886e-07, + "loss": 0.83687443, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10821533, + "step": 11447, + "time_per_iteration": 2.5756897926330566 + }, + { + "auxiliary_loss_clip": 0.06411539, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06274877, + "balance_loss_mlp": 0.01256884, + "epoch": 0.6882909965429129, + "flos": 22462245648000.0, + "grad_norm": 1.9129765382773336, + "language_loss": 0.74044937, + "learning_rate": 9.354059170028705e-07, + "loss": 0.81723368, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10015869, + "step": 11448, + "time_per_iteration": 2.5083351135253906 + }, + { + "auxiliary_loss_clip": 0.06417549, + "auxiliary_loss_mlp": 0.01266481, + "balance_loss_clip": 0.06275415, + "balance_loss_mlp": 0.01255376, + "epoch": 0.688351119795581, + "flos": 26221431697920.0, + "grad_norm": 1.5605900643108004, + "language_loss": 0.74581099, + "learning_rate": 9.350762354227673e-07, + "loss": 0.82265133, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11102295, + "step": 11449, + "time_per_iteration": 2.585969924926758 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266876, + "balance_loss_clip": 0.06273638, + "balance_loss_mlp": 0.01256809, + "epoch": 0.6884112430482489, + "flos": 22571887115520.0, + "grad_norm": 1.6262008407242425, + "language_loss": 0.70027089, + "learning_rate": 9.34746594224679e-07, + "loss": 0.77702844, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1005249, + "step": 11450, + "time_per_iteration": 2.5182437896728516 + }, + { + "auxiliary_loss_clip": 0.06418543, + "auxiliary_loss_mlp": 0.0126869, + "balance_loss_clip": 0.06276023, + "balance_loss_mlp": 0.01257187, + "epoch": 0.6884713663009169, + "flos": 17345671781760.0, + "grad_norm": 1.9477242871289788, + "language_loss": 0.76100504, + "learning_rate": 9.344169934211068e-07, + "loss": 0.83787739, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.1151123, + "step": 11451, + "time_per_iteration": 2.5395891666412354 + }, + { + "auxiliary_loss_clip": 0.06416887, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06276768, + "balance_loss_mlp": 0.01253926, + "epoch": 0.6885314895535849, + "flos": 26478379843200.0, + "grad_norm": 1.2780895399548546, + "language_loss": 0.69393182, + "learning_rate": 9.340874330245505e-07, + "loss": 0.77073896, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09899902, + "step": 11452, + "time_per_iteration": 2.584246873855591 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.0125553, + "epoch": 0.6885916128062528, + "flos": 20527748035200.0, + "grad_norm": 1.553726438653973, + "language_loss": 0.71749568, + "learning_rate": 9.337579130475042e-07, + "loss": 0.79426515, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11932373, + "step": 11453, + "time_per_iteration": 2.5244805812835693 + }, + { + "auxiliary_loss_clip": 0.06314202, + "auxiliary_loss_mlp": 0.01249184, + "balance_loss_clip": 0.06258714, + "balance_loss_mlp": 0.01248031, + "epoch": 0.6886517360589208, + "flos": 70734792136320.0, + "grad_norm": 0.77256871445285, + "language_loss": 0.50623441, + "learning_rate": 9.334284335024644e-07, + "loss": 0.58186829, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01150513, + "step": 11454, + "time_per_iteration": 2.982760190963745 + }, + { + "auxiliary_loss_clip": 0.06402037, + "auxiliary_loss_mlp": 0.01264708, + "balance_loss_clip": 0.06273487, + "balance_loss_mlp": 0.01254998, + "epoch": 0.6887118593115887, + "flos": 17899119999360.0, + "grad_norm": 1.70106225646023, + "language_loss": 0.75493348, + "learning_rate": 9.330989944019263e-07, + "loss": 0.8316009, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09716797, + "step": 11455, + "time_per_iteration": 2.5417535305023193 + }, + { + "auxiliary_loss_clip": 0.0641242, + "auxiliary_loss_mlp": 0.01266873, + "balance_loss_clip": 0.06273204, + "balance_loss_mlp": 0.01255286, + "epoch": 0.6887719825642568, + "flos": 17458080433920.0, + "grad_norm": 2.3349527650336945, + "language_loss": 0.72984523, + "learning_rate": 9.327695957583803e-07, + "loss": 0.80663818, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11578369, + "step": 11456, + "time_per_iteration": 2.452291250228882 + }, + { + "auxiliary_loss_clip": 0.0640955, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06275116, + "balance_loss_mlp": 0.01255621, + "epoch": 0.6888321058169247, + "flos": 23075930551680.0, + "grad_norm": 1.6190505365782226, + "language_loss": 0.81124002, + "learning_rate": 9.32440237584319e-07, + "loss": 0.88799506, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10339355, + "step": 11457, + "time_per_iteration": 2.540853977203369 + }, + { + "auxiliary_loss_clip": 0.06415743, + "auxiliary_loss_mlp": 0.01267797, + "balance_loss_clip": 0.06276038, + "balance_loss_mlp": 0.01257152, + "epoch": 0.6888922290695927, + "flos": 23375742860160.0, + "grad_norm": 1.590427454304544, + "language_loss": 0.7679534, + "learning_rate": 9.321109198922301e-07, + "loss": 0.84478879, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10638428, + "step": 11458, + "time_per_iteration": 2.510422706604004 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01264265, + "balance_loss_clip": 0.0627234, + "balance_loss_mlp": 0.012539, + "epoch": 0.6889523523222606, + "flos": 17636092433280.0, + "grad_norm": 2.414805126891923, + "language_loss": 0.68316978, + "learning_rate": 9.31781642694603e-07, + "loss": 0.75990915, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1036377, + "step": 11459, + "time_per_iteration": 2.5042388439178467 + }, + { + "auxiliary_loss_clip": 0.06414565, + "auxiliary_loss_mlp": 0.01267614, + "balance_loss_clip": 0.06275657, + "balance_loss_mlp": 0.01257976, + "epoch": 0.6890124755749286, + "flos": 25235119687680.0, + "grad_norm": 1.5145065442588617, + "language_loss": 0.68853188, + "learning_rate": 9.314524060039221e-07, + "loss": 0.76535368, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09637451, + "step": 11460, + "time_per_iteration": 2.548172950744629 + }, + { + "auxiliary_loss_clip": 0.06421833, + "auxiliary_loss_mlp": 0.01269493, + "balance_loss_clip": 0.06274051, + "balance_loss_mlp": 0.01257727, + "epoch": 0.6890725988275965, + "flos": 20236488842880.0, + "grad_norm": 1.6636597256364867, + "language_loss": 0.77513885, + "learning_rate": 9.311232098326731e-07, + "loss": 0.85205209, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.11761475, + "step": 11461, + "time_per_iteration": 2.524261474609375 + }, + { + "auxiliary_loss_clip": 0.06409161, + "auxiliary_loss_mlp": 0.01267077, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01256777, + "epoch": 0.6891327220802645, + "flos": 14540079922560.0, + "grad_norm": 2.0638516380212932, + "language_loss": 0.69867802, + "learning_rate": 9.307940541933401e-07, + "loss": 0.77544034, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10302734, + "step": 11462, + "time_per_iteration": 2.470341444015503 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01253864, + "epoch": 0.6891928453329325, + "flos": 21144996737280.0, + "grad_norm": 1.4840489217528152, + "language_loss": 0.87375474, + "learning_rate": 9.304649390984034e-07, + "loss": 0.95049822, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10064697, + "step": 11463, + "time_per_iteration": 2.550734043121338 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 0.06273332, + "balance_loss_mlp": 0.01255656, + "epoch": 0.6892529685856005, + "flos": 17864347754880.0, + "grad_norm": 1.4959389236419984, + "language_loss": 0.68525398, + "learning_rate": 9.301358645603428e-07, + "loss": 0.76196021, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09771729, + "step": 11464, + "time_per_iteration": 3.9007256031036377 + }, + { + "auxiliary_loss_clip": 0.06409206, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01255571, + "epoch": 0.6893130918382685, + "flos": 29942575194240.0, + "grad_norm": 1.7446769813388354, + "language_loss": 0.65578705, + "learning_rate": 9.298068305916373e-07, + "loss": 0.73254144, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10662842, + "step": 11465, + "time_per_iteration": 2.554800271987915 + }, + { + "auxiliary_loss_clip": 0.06418021, + "auxiliary_loss_mlp": 0.01264957, + "balance_loss_clip": 0.06274985, + "balance_loss_mlp": 0.01253388, + "epoch": 0.6893732150909364, + "flos": 24395275814400.0, + "grad_norm": 1.468256683851191, + "language_loss": 0.72699749, + "learning_rate": 9.294778372047649e-07, + "loss": 0.80382729, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.11578369, + "step": 11466, + "time_per_iteration": 2.5593020915985107 + }, + { + "auxiliary_loss_clip": 0.06412645, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01255632, + "epoch": 0.6894333383436044, + "flos": 16988557680000.0, + "grad_norm": 1.6869523120590046, + "language_loss": 0.72136575, + "learning_rate": 9.291488844121995e-07, + "loss": 0.79815149, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10302734, + "step": 11467, + "time_per_iteration": 2.4603004455566406 + }, + { + "auxiliary_loss_clip": 0.06414096, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.0627349, + "balance_loss_mlp": 0.0125462, + "epoch": 0.6894934615962723, + "flos": 18990880773120.0, + "grad_norm": 1.8974823893079618, + "language_loss": 0.80639178, + "learning_rate": 9.288199722264156e-07, + "loss": 0.88319826, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11950684, + "step": 11468, + "time_per_iteration": 2.500204086303711 + }, + { + "auxiliary_loss_clip": 0.06415653, + "auxiliary_loss_mlp": 0.01266091, + "balance_loss_clip": 0.06276623, + "balance_loss_mlp": 0.01255941, + "epoch": 0.6895535848489404, + "flos": 34540137671040.0, + "grad_norm": 1.4230744907421156, + "language_loss": 0.66238683, + "learning_rate": 9.284911006598875e-07, + "loss": 0.73920429, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10137939, + "step": 11469, + "time_per_iteration": 2.6155412197113037 + }, + { + "auxiliary_loss_clip": 0.06315388, + "auxiliary_loss_mlp": 0.01251862, + "balance_loss_clip": 0.06259958, + "balance_loss_mlp": 0.01250618, + "epoch": 0.6896137081016083, + "flos": 50093237128320.0, + "grad_norm": 0.7794555860117556, + "language_loss": 0.54945397, + "learning_rate": 9.281622697250824e-07, + "loss": 0.62512648, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01243591, + "step": 11470, + "time_per_iteration": 3.0223581790924072 + }, + { + "auxiliary_loss_clip": 0.0640993, + "auxiliary_loss_mlp": 0.01264419, + "balance_loss_clip": 0.0627588, + "balance_loss_mlp": 0.01255133, + "epoch": 0.6896738313542763, + "flos": 19944391109760.0, + "grad_norm": 1.6677407290115414, + "language_loss": 0.78484243, + "learning_rate": 9.278334794344715e-07, + "loss": 0.86158597, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09283447, + "step": 11471, + "time_per_iteration": 2.486112594604492 + }, + { + "auxiliary_loss_clip": 0.0641201, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01255369, + "epoch": 0.6897339546069442, + "flos": 21731875534080.0, + "grad_norm": 1.810273606719927, + "language_loss": 0.78542721, + "learning_rate": 9.275047298005232e-07, + "loss": 0.86220813, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10723877, + "step": 11472, + "time_per_iteration": 2.5265328884124756 + }, + { + "auxiliary_loss_clip": 0.06408779, + "auxiliary_loss_mlp": 0.01266157, + "balance_loss_clip": 0.06272413, + "balance_loss_mlp": 0.01256168, + "epoch": 0.6897940778596122, + "flos": 19832275946880.0, + "grad_norm": 1.5025655331144128, + "language_loss": 0.76723063, + "learning_rate": 9.271760208357024e-07, + "loss": 0.84398007, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09985352, + "step": 11473, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.06415299, + "auxiliary_loss_mlp": 0.01264941, + "balance_loss_clip": 0.06274555, + "balance_loss_mlp": 0.01254099, + "epoch": 0.6898542011122801, + "flos": 17315595365760.0, + "grad_norm": 1.762455288405268, + "language_loss": 0.75548446, + "learning_rate": 9.268473525524751e-07, + "loss": 0.83228695, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10839844, + "step": 11474, + "time_per_iteration": 2.527608871459961 + }, + { + "auxiliary_loss_clip": 0.06414007, + "auxiliary_loss_mlp": 0.0127013, + "balance_loss_clip": 0.06276175, + "balance_loss_mlp": 0.01259097, + "epoch": 0.6899143243649482, + "flos": 24760984959360.0, + "grad_norm": 1.5301145681679174, + "language_loss": 0.74686491, + "learning_rate": 9.26518724963303e-07, + "loss": 0.82370627, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.11047363, + "step": 11475, + "time_per_iteration": 2.61885404586792 + }, + { + "auxiliary_loss_clip": 0.06408798, + "auxiliary_loss_mlp": 0.01264551, + "balance_loss_clip": 0.0627286, + "balance_loss_mlp": 0.01254168, + "epoch": 0.6899744476176161, + "flos": 17239636039680.0, + "grad_norm": 1.9758347439707513, + "language_loss": 0.89060938, + "learning_rate": 9.261901380806491e-07, + "loss": 0.96734291, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1038208, + "step": 11476, + "time_per_iteration": 3.9992854595184326 + }, + { + "auxiliary_loss_clip": 0.06409539, + "auxiliary_loss_mlp": 0.01267337, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.01256864, + "epoch": 0.6900345708702841, + "flos": 25417701734400.0, + "grad_norm": 1.3283080082562368, + "language_loss": 0.70312291, + "learning_rate": 9.258615919169724e-07, + "loss": 0.77989161, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11477, + "time_per_iteration": 2.5792300701141357 + }, + { + "auxiliary_loss_clip": 0.06419337, + "auxiliary_loss_mlp": 0.01267418, + "balance_loss_clip": 0.06276701, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6900946941229521, + "flos": 23439836833920.0, + "grad_norm": 2.3323261899860386, + "language_loss": 0.68125427, + "learning_rate": 9.255330864847313e-07, + "loss": 0.75812185, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11737061, + "step": 11478, + "time_per_iteration": 4.033671855926514 + }, + { + "auxiliary_loss_clip": 0.06415287, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06275256, + "balance_loss_mlp": 0.01255469, + "epoch": 0.69015481737562, + "flos": 17825592441600.0, + "grad_norm": 2.187140386680911, + "language_loss": 0.76715493, + "learning_rate": 9.252046217963843e-07, + "loss": 0.84396803, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.10565186, + "step": 11479, + "time_per_iteration": 2.507310390472412 + }, + { + "auxiliary_loss_clip": 0.06417705, + "auxiliary_loss_mlp": 0.0126466, + "balance_loss_clip": 0.06277484, + "balance_loss_mlp": 0.01253084, + "epoch": 0.690214940628288, + "flos": 17462147356800.0, + "grad_norm": 1.7422547235207548, + "language_loss": 0.78936756, + "learning_rate": 9.248761978643856e-07, + "loss": 0.86619121, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11584473, + "step": 11480, + "time_per_iteration": 2.4853224754333496 + }, + { + "auxiliary_loss_clip": 0.06408322, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271941, + "balance_loss_mlp": 0.01256685, + "epoch": 0.6902750638809559, + "flos": 29573847302400.0, + "grad_norm": 1.6397986809458904, + "language_loss": 0.75654733, + "learning_rate": 9.245478147011885e-07, + "loss": 0.83330619, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10870361, + "step": 11481, + "time_per_iteration": 2.557511806488037 + }, + { + "auxiliary_loss_clip": 0.06409919, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_clip": 0.06274407, + "balance_loss_mlp": 0.01257151, + "epoch": 0.690335187133624, + "flos": 25564253725440.0, + "grad_norm": 1.7034098487881468, + "language_loss": 0.69767886, + "learning_rate": 9.24219472319246e-07, + "loss": 0.77445447, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 11482, + "time_per_iteration": 2.52620267868042 + }, + { + "auxiliary_loss_clip": 0.06410135, + "auxiliary_loss_mlp": 0.01265009, + "balance_loss_clip": 0.06271818, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6903953103862919, + "flos": 22494418416000.0, + "grad_norm": 1.3936382068363662, + "language_loss": 0.82645047, + "learning_rate": 9.238911707310096e-07, + "loss": 0.90320188, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10980225, + "step": 11483, + "time_per_iteration": 3.9243674278259277 + }, + { + "auxiliary_loss_clip": 0.06413989, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06273346, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6904554336389599, + "flos": 26107094401920.0, + "grad_norm": 1.7789545949672325, + "language_loss": 0.65774268, + "learning_rate": 9.235629099489273e-07, + "loss": 0.73452371, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09918213, + "step": 11484, + "time_per_iteration": 2.570255994796753 + }, + { + "auxiliary_loss_clip": 0.06407849, + "auxiliary_loss_mlp": 0.01267989, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01257838, + "epoch": 0.6905155568916278, + "flos": 31179127023360.0, + "grad_norm": 1.529832254030816, + "language_loss": 0.73510063, + "learning_rate": 9.232346899854479e-07, + "loss": 0.81185901, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1015625, + "step": 11485, + "time_per_iteration": 2.6148314476013184 + }, + { + "auxiliary_loss_clip": 0.06415319, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.0627619, + "balance_loss_mlp": 0.01255863, + "epoch": 0.6905756801442958, + "flos": 17645484090240.0, + "grad_norm": 1.7447168149804075, + "language_loss": 0.85063231, + "learning_rate": 9.22906510853017e-07, + "loss": 0.92745095, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10687256, + "step": 11486, + "time_per_iteration": 2.5396366119384766 + }, + { + "auxiliary_loss_clip": 0.06414411, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06275952, + "balance_loss_mlp": 0.01254071, + "epoch": 0.6906358033969637, + "flos": 22349836995840.0, + "grad_norm": 1.4442882109961312, + "language_loss": 0.73110938, + "learning_rate": 9.225783725640786e-07, + "loss": 0.8078993, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10510254, + "step": 11487, + "time_per_iteration": 2.5067358016967773 + }, + { + "auxiliary_loss_clip": 0.06322645, + "auxiliary_loss_mlp": 0.01254949, + "balance_loss_clip": 0.06266931, + "balance_loss_mlp": 0.01253606, + "epoch": 0.6906959266496318, + "flos": 69769485573120.0, + "grad_norm": 0.8802440439282012, + "language_loss": 0.66566062, + "learning_rate": 9.222502751310759e-07, + "loss": 0.74143648, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01345062, + "step": 11488, + "time_per_iteration": 3.1760408878326416 + }, + { + "auxiliary_loss_clip": 0.06420241, + "auxiliary_loss_mlp": 0.01268855, + "balance_loss_clip": 0.06275697, + "balance_loss_mlp": 0.01256773, + "epoch": 0.6907560499022997, + "flos": 21440700195840.0, + "grad_norm": 1.9049138044907, + "language_loss": 0.75416613, + "learning_rate": 9.219222185664519e-07, + "loss": 0.83105707, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.12072754, + "step": 11489, + "time_per_iteration": 2.515700578689575 + }, + { + "auxiliary_loss_clip": 0.06413751, + "auxiliary_loss_mlp": 0.01269098, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.01257862, + "epoch": 0.6908161731549677, + "flos": 14397427146240.0, + "grad_norm": 2.0018253870073806, + "language_loss": 0.62274224, + "learning_rate": 9.215942028826445e-07, + "loss": 0.69957072, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11236572, + "step": 11490, + "time_per_iteration": 2.532935857772827 + }, + { + "auxiliary_loss_clip": 0.06417898, + "auxiliary_loss_mlp": 0.01266366, + "balance_loss_clip": 0.06278036, + "balance_loss_mlp": 0.01255911, + "epoch": 0.6908762964076357, + "flos": 20017122053760.0, + "grad_norm": 1.8130615922920168, + "language_loss": 0.73057532, + "learning_rate": 9.212662280920937e-07, + "loss": 0.80741799, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10455322, + "step": 11491, + "time_per_iteration": 2.521466016769409 + }, + { + "auxiliary_loss_clip": 0.0640818, + "auxiliary_loss_mlp": 0.0126409, + "balance_loss_clip": 0.06273587, + "balance_loss_mlp": 0.01253117, + "epoch": 0.6909364196603036, + "flos": 28776951446400.0, + "grad_norm": 1.7336299759284137, + "language_loss": 0.7042138, + "learning_rate": 9.20938294207235e-07, + "loss": 0.78093648, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10968018, + "step": 11492, + "time_per_iteration": 2.585730791091919 + }, + { + "auxiliary_loss_clip": 0.06420228, + "auxiliary_loss_mlp": 0.01266161, + "balance_loss_clip": 0.0627589, + "balance_loss_mlp": 0.01255545, + "epoch": 0.6909965429129716, + "flos": 22534641175680.0, + "grad_norm": 1.7712531915598577, + "language_loss": 0.7470516, + "learning_rate": 9.206104012405049e-07, + "loss": 0.82391548, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1060791, + "step": 11493, + "time_per_iteration": 2.5050244331359863 + }, + { + "auxiliary_loss_clip": 0.06412148, + "auxiliary_loss_mlp": 0.01265374, + "balance_loss_clip": 0.06274831, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6910566661656395, + "flos": 18411884259840.0, + "grad_norm": 1.6258065693735415, + "language_loss": 0.74673963, + "learning_rate": 9.20282549204336e-07, + "loss": 0.82351482, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.1116333, + "step": 11494, + "time_per_iteration": 2.5276567935943604 + }, + { + "auxiliary_loss_clip": 0.06411964, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06274857, + "balance_loss_mlp": 0.01257263, + "epoch": 0.6911167894183076, + "flos": 30781874016000.0, + "grad_norm": 1.529019816420153, + "language_loss": 0.68227768, + "learning_rate": 9.19954738111161e-07, + "loss": 0.75907087, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10101318, + "step": 11495, + "time_per_iteration": 2.5842087268829346 + }, + { + "auxiliary_loss_clip": 0.06411652, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 0.06274678, + "balance_loss_mlp": 0.01256863, + "epoch": 0.6911769126709755, + "flos": 13740878079360.0, + "grad_norm": 1.6566133128888745, + "language_loss": 0.74368346, + "learning_rate": 9.196269679734119e-07, + "loss": 0.82048082, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.11224365, + "step": 11496, + "time_per_iteration": 2.5154151916503906 + }, + { + "auxiliary_loss_clip": 0.06410149, + "auxiliary_loss_mlp": 0.01262738, + "balance_loss_clip": 0.06274073, + "balance_loss_mlp": 0.01252987, + "epoch": 0.6912370359236435, + "flos": 17572669292160.0, + "grad_norm": 1.7205825998793636, + "language_loss": 0.80305141, + "learning_rate": 9.19299238803515e-07, + "loss": 0.87978023, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09753418, + "step": 11497, + "time_per_iteration": 2.4925076961517334 + }, + { + "auxiliary_loss_clip": 0.06416431, + "auxiliary_loss_mlp": 0.01267714, + "balance_loss_clip": 0.06275152, + "balance_loss_mlp": 0.01256061, + "epoch": 0.6912971591763114, + "flos": 22097291189760.0, + "grad_norm": 1.653826561150034, + "language_loss": 0.8077867, + "learning_rate": 9.189715506138993e-07, + "loss": 0.88462818, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11651611, + "step": 11498, + "time_per_iteration": 2.5465574264526367 + }, + { + "auxiliary_loss_clip": 0.06408113, + "auxiliary_loss_mlp": 0.01262525, + "balance_loss_clip": 0.06274167, + "balance_loss_mlp": 0.01251701, + "epoch": 0.6913572824289794, + "flos": 29979276082560.0, + "grad_norm": 2.039776107623003, + "language_loss": 0.85973012, + "learning_rate": 9.186439034169915e-07, + "loss": 0.93643653, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10827637, + "step": 11499, + "time_per_iteration": 2.5665283203125 + }, + { + "auxiliary_loss_clip": 0.06408866, + "auxiliary_loss_mlp": 0.01265419, + "balance_loss_clip": 0.06275891, + "balance_loss_mlp": 0.01255399, + "epoch": 0.6914174056816473, + "flos": 20455184799360.0, + "grad_norm": 1.6118393659485355, + "language_loss": 0.7559222, + "learning_rate": 9.183162972252145e-07, + "loss": 0.83266509, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10021973, + "step": 11500, + "time_per_iteration": 2.503854751586914 + }, + { + "auxiliary_loss_clip": 0.06412221, + "auxiliary_loss_mlp": 0.01266959, + "balance_loss_clip": 0.06274468, + "balance_loss_mlp": 0.0125567, + "epoch": 0.6914775289343154, + "flos": 21287984929920.0, + "grad_norm": 1.8512682937239455, + "language_loss": 0.77863973, + "learning_rate": 9.179887320509921e-07, + "loss": 0.85543144, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.112854, + "step": 11501, + "time_per_iteration": 2.4953453540802 + }, + { + "auxiliary_loss_clip": 0.06417021, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 0.06276537, + "balance_loss_mlp": 0.01256748, + "epoch": 0.6915376521869833, + "flos": 23884859468160.0, + "grad_norm": 1.8723825147208624, + "language_loss": 0.73532307, + "learning_rate": 9.176612079067458e-07, + "loss": 0.81216794, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10717773, + "step": 11502, + "time_per_iteration": 2.5416178703308105 + }, + { + "auxiliary_loss_clip": 0.06414314, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01253993, + "epoch": 0.6915977754396513, + "flos": 11515079347200.0, + "grad_norm": 1.8781803370630783, + "language_loss": 0.73954153, + "learning_rate": 9.173337248048953e-07, + "loss": 0.81633848, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.11395264, + "step": 11503, + "time_per_iteration": 2.499391794204712 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01262533, + "balance_loss_clip": 0.06271478, + "balance_loss_mlp": 0.01252233, + "epoch": 0.6916578986923193, + "flos": 22607833317120.0, + "grad_norm": 1.5988526178616205, + "language_loss": 0.77127218, + "learning_rate": 9.170062827578575e-07, + "loss": 0.84797841, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10302734, + "step": 11504, + "time_per_iteration": 3.9501583576202393 + }, + { + "auxiliary_loss_clip": 0.06413034, + "auxiliary_loss_mlp": 0.01266076, + "balance_loss_clip": 0.06275813, + "balance_loss_mlp": 0.01255472, + "epoch": 0.6917180219449872, + "flos": 23484126516480.0, + "grad_norm": 1.8617681816675509, + "language_loss": 0.73855585, + "learning_rate": 9.166788817780499e-07, + "loss": 0.81534696, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11505, + "time_per_iteration": 2.5829193592071533 + }, + { + "auxiliary_loss_clip": 0.06409241, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 0.06273368, + "balance_loss_mlp": 0.0125723, + "epoch": 0.6917781451976552, + "flos": 23739313726080.0, + "grad_norm": 1.75743437760736, + "language_loss": 0.876764, + "learning_rate": 9.163515218778886e-07, + "loss": 0.95353591, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1072998, + "step": 11506, + "time_per_iteration": 2.5154294967651367 + }, + { + "auxiliary_loss_clip": 0.06412455, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.06276374, + "balance_loss_mlp": 0.01254783, + "epoch": 0.6918382684503231, + "flos": 31474704700800.0, + "grad_norm": 2.0688391280679648, + "language_loss": 0.7024008, + "learning_rate": 9.160242030697856e-07, + "loss": 0.7791791, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.105896, + "step": 11507, + "time_per_iteration": 2.5845768451690674 + }, + { + "auxiliary_loss_clip": 0.06413335, + "auxiliary_loss_mlp": 0.01264122, + "balance_loss_clip": 0.06273569, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6918983917029912, + "flos": 21656503186560.0, + "grad_norm": 1.743467082940077, + "language_loss": 0.77142328, + "learning_rate": 9.156969253661538e-07, + "loss": 0.84819788, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10491943, + "step": 11508, + "time_per_iteration": 2.4946086406707764 + }, + { + "auxiliary_loss_clip": 0.06406476, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273084, + "balance_loss_mlp": 0.01257501, + "epoch": 0.6919585149556591, + "flos": 25556036025600.0, + "grad_norm": 1.485663055998357, + "language_loss": 0.75072491, + "learning_rate": 9.153696887794027e-07, + "loss": 0.82746202, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09735107, + "step": 11509, + "time_per_iteration": 2.591611623764038 + }, + { + "auxiliary_loss_clip": 0.06409086, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06273773, + "balance_loss_mlp": 0.01253344, + "epoch": 0.6920186382083271, + "flos": 23666582782080.0, + "grad_norm": 1.6709622746913153, + "language_loss": 0.64358246, + "learning_rate": 9.150424933219425e-07, + "loss": 0.7203086, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10192871, + "step": 11510, + "time_per_iteration": 2.522277593612671 + }, + { + "auxiliary_loss_clip": 0.06419423, + "auxiliary_loss_mlp": 0.0126943, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.01257938, + "epoch": 0.692078761460995, + "flos": 19067888275200.0, + "grad_norm": 1.58502931536568, + "language_loss": 0.75757432, + "learning_rate": 9.147153390061788e-07, + "loss": 0.83446282, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1149292, + "step": 11511, + "time_per_iteration": 2.5163841247558594 + }, + { + "auxiliary_loss_clip": 0.06410709, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.06275946, + "balance_loss_mlp": 0.01254482, + "epoch": 0.692138884713663, + "flos": 29031006625920.0, + "grad_norm": 1.5915143740912923, + "language_loss": 0.62864697, + "learning_rate": 9.143882258445184e-07, + "loss": 0.70539832, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 11512, + "time_per_iteration": 2.5597567558288574 + }, + { + "auxiliary_loss_clip": 0.06413583, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06275637, + "balance_loss_mlp": 0.01257323, + "epoch": 0.6921990079663309, + "flos": 14763262072320.0, + "grad_norm": 2.1370127100150373, + "language_loss": 0.83359182, + "learning_rate": 9.140611538493666e-07, + "loss": 0.91040647, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10559082, + "step": 11513, + "time_per_iteration": 2.5295650959014893 + }, + { + "auxiliary_loss_clip": 0.06406762, + "auxiliary_loss_mlp": 0.01263079, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253614, + "epoch": 0.692259131218999, + "flos": 23848619777280.0, + "grad_norm": 1.3335195335102994, + "language_loss": 0.78370172, + "learning_rate": 9.137341230331233e-07, + "loss": 0.86040014, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09466553, + "step": 11514, + "time_per_iteration": 2.5325093269348145 + }, + { + "auxiliary_loss_clip": 0.06413436, + "auxiliary_loss_mlp": 0.01264156, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.0125323, + "epoch": 0.6923192544716669, + "flos": 19141038489600.0, + "grad_norm": 1.7641312985276416, + "language_loss": 0.7541517, + "learning_rate": 9.134071334081907e-07, + "loss": 0.83092761, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.10919189, + "step": 11515, + "time_per_iteration": 2.4964303970336914 + }, + { + "auxiliary_loss_clip": 0.06405345, + "auxiliary_loss_mlp": 0.01265608, + "balance_loss_clip": 0.06272751, + "balance_loss_mlp": 0.01255606, + "epoch": 0.6923793777243349, + "flos": 28082192117760.0, + "grad_norm": 1.899911587445346, + "language_loss": 0.53861475, + "learning_rate": 9.130801849869694e-07, + "loss": 0.61532426, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10003662, + "step": 11516, + "time_per_iteration": 3.975773811340332 + }, + { + "auxiliary_loss_clip": 0.06402789, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06273137, + "balance_loss_mlp": 0.01258812, + "epoch": 0.6924395009770029, + "flos": 16586818479360.0, + "grad_norm": 1.754197992941401, + "language_loss": 0.73113155, + "learning_rate": 9.127532777818557e-07, + "loss": 0.80785251, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.1048584, + "step": 11517, + "time_per_iteration": 2.5128793716430664 + }, + { + "auxiliary_loss_clip": 0.06413449, + "auxiliary_loss_mlp": 0.01270737, + "balance_loss_clip": 0.06275631, + "balance_loss_mlp": 0.01260223, + "epoch": 0.6924996242296708, + "flos": 16661058796800.0, + "grad_norm": 1.5645702983922471, + "language_loss": 0.76377338, + "learning_rate": 9.124264118052465e-07, + "loss": 0.84061527, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10510254, + "step": 11518, + "time_per_iteration": 4.030726432800293 + }, + { + "auxiliary_loss_clip": 0.06418861, + "auxiliary_loss_mlp": 0.01271759, + "balance_loss_clip": 0.06276505, + "balance_loss_mlp": 0.01260065, + "epoch": 0.6925597474823388, + "flos": 34763277893760.0, + "grad_norm": 1.2922865476436283, + "language_loss": 0.64748263, + "learning_rate": 9.120995870695376e-07, + "loss": 0.72438884, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11712646, + "step": 11519, + "time_per_iteration": 2.6468279361724854 + }, + { + "auxiliary_loss_clip": 0.06410517, + "auxiliary_loss_mlp": 0.01266916, + "balance_loss_clip": 0.06272532, + "balance_loss_mlp": 0.01255746, + "epoch": 0.6926198707350067, + "flos": 21878175962880.0, + "grad_norm": 1.754829284599123, + "language_loss": 0.62671852, + "learning_rate": 9.117728035871212e-07, + "loss": 0.70349276, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1116333, + "step": 11520, + "time_per_iteration": 2.6443254947662354 + }, + { + "auxiliary_loss_clip": 0.06421007, + "auxiliary_loss_mlp": 0.0127025, + "balance_loss_clip": 0.06274754, + "balance_loss_mlp": 0.01259104, + "epoch": 0.6926799939876748, + "flos": 13011346506240.0, + "grad_norm": 1.8045037459633815, + "language_loss": 0.78247267, + "learning_rate": 9.114460613703887e-07, + "loss": 0.85938519, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.11151123, + "step": 11521, + "time_per_iteration": 2.540693521499634 + }, + { + "auxiliary_loss_clip": 0.0641452, + "auxiliary_loss_mlp": 0.0126495, + "balance_loss_clip": 0.06273233, + "balance_loss_mlp": 0.0125356, + "epoch": 0.6927401172403427, + "flos": 16766423706240.0, + "grad_norm": 1.8333636519131566, + "language_loss": 0.82234508, + "learning_rate": 9.111193604317304e-07, + "loss": 0.89913976, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11383057, + "step": 11522, + "time_per_iteration": 3.9248740673065186 + }, + { + "auxiliary_loss_clip": 0.06410085, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01254013, + "epoch": 0.6928002404930107, + "flos": 25713237484800.0, + "grad_norm": 1.543280654363121, + "language_loss": 0.77247906, + "learning_rate": 9.107927007835361e-07, + "loss": 0.84922481, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10479736, + "step": 11523, + "time_per_iteration": 2.6300647258758545 + }, + { + "auxiliary_loss_clip": 0.0640799, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06273483, + "balance_loss_mlp": 0.01255227, + "epoch": 0.6928603637456786, + "flos": 18594214744320.0, + "grad_norm": 1.7989990955818747, + "language_loss": 0.68682468, + "learning_rate": 9.104660824381915e-07, + "loss": 0.76355332, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09637451, + "step": 11524, + "time_per_iteration": 2.4765005111694336 + }, + { + "auxiliary_loss_clip": 0.06415472, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06274404, + "balance_loss_mlp": 0.0125385, + "epoch": 0.6929204869983466, + "flos": 22207519635840.0, + "grad_norm": 1.775837201090113, + "language_loss": 0.64731717, + "learning_rate": 9.101395054080815e-07, + "loss": 0.72412294, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.1126709, + "step": 11525, + "time_per_iteration": 2.5243499279022217 + }, + { + "auxiliary_loss_clip": 0.06416623, + "auxiliary_loss_mlp": 0.01268916, + "balance_loss_clip": 0.06279063, + "balance_loss_mlp": 0.01258568, + "epoch": 0.6929806102510145, + "flos": 17900545518720.0, + "grad_norm": 2.0930840901881007, + "language_loss": 0.70522892, + "learning_rate": 9.098129697055907e-07, + "loss": 0.78208423, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10351562, + "step": 11526, + "time_per_iteration": 2.4600794315338135 + }, + { + "auxiliary_loss_clip": 0.06409934, + "auxiliary_loss_mlp": 0.01263712, + "balance_loss_clip": 0.06273712, + "balance_loss_mlp": 0.01253186, + "epoch": 0.6930407335036826, + "flos": 19761222084480.0, + "grad_norm": 1.7010928543667516, + "language_loss": 0.76265514, + "learning_rate": 9.094864753431022e-07, + "loss": 0.83939159, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10516357, + "step": 11527, + "time_per_iteration": 2.5164694786071777 + }, + { + "auxiliary_loss_clip": 0.06411794, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06273556, + "balance_loss_mlp": 0.01253149, + "epoch": 0.6931008567563505, + "flos": 21550802860800.0, + "grad_norm": 1.5438747158568011, + "language_loss": 0.79877269, + "learning_rate": 9.091600223329952e-07, + "loss": 0.87552267, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1005249, + "step": 11528, + "time_per_iteration": 2.501044988632202 + }, + { + "auxiliary_loss_clip": 0.06405636, + "auxiliary_loss_mlp": 0.01267062, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01256917, + "epoch": 0.6931609800090185, + "flos": 26257210191360.0, + "grad_norm": 1.3083455635421857, + "language_loss": 0.75950116, + "learning_rate": 9.088336106876491e-07, + "loss": 0.83622813, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10144043, + "step": 11529, + "time_per_iteration": 2.5608596801757812 + }, + { + "auxiliary_loss_clip": 0.06410852, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06276192, + "balance_loss_mlp": 0.01254961, + "epoch": 0.6932211032616865, + "flos": 32351626805760.0, + "grad_norm": 2.07531682890069, + "language_loss": 0.73131585, + "learning_rate": 9.085072404194436e-07, + "loss": 0.80807638, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10241699, + "step": 11530, + "time_per_iteration": 2.5931029319763184 + }, + { + "auxiliary_loss_clip": 0.06423162, + "auxiliary_loss_mlp": 0.01267459, + "balance_loss_clip": 0.06278834, + "balance_loss_mlp": 0.0125598, + "epoch": 0.6932812265143544, + "flos": 22054720515840.0, + "grad_norm": 1.8331163383956572, + "language_loss": 0.78110623, + "learning_rate": 9.081809115407513e-07, + "loss": 0.85801244, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.11474609, + "step": 11531, + "time_per_iteration": 2.537781000137329 + }, + { + "auxiliary_loss_clip": 0.06406952, + "auxiliary_loss_mlp": 0.01266064, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01256092, + "epoch": 0.6933413497670224, + "flos": 26264924766720.0, + "grad_norm": 1.4723585148230005, + "language_loss": 0.69516993, + "learning_rate": 9.078546240639484e-07, + "loss": 0.77190006, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09973145, + "step": 11532, + "time_per_iteration": 2.6068294048309326 + }, + { + "auxiliary_loss_clip": 0.06414198, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.0125403, + "epoch": 0.6934014730196904, + "flos": 19579059308160.0, + "grad_norm": 1.68179431170249, + "language_loss": 0.66939062, + "learning_rate": 9.075283780014082e-07, + "loss": 0.74618644, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11358643, + "step": 11533, + "time_per_iteration": 2.5188937187194824 + }, + { + "auxiliary_loss_clip": 0.06414025, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.06274263, + "balance_loss_mlp": 0.01254892, + "epoch": 0.6934615962723584, + "flos": 22124432712960.0, + "grad_norm": 2.2635878062852384, + "language_loss": 0.59154713, + "learning_rate": 9.072021733655007e-07, + "loss": 0.66835076, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11456299, + "step": 11534, + "time_per_iteration": 2.513169288635254 + }, + { + "auxiliary_loss_clip": 0.06412862, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06276149, + "balance_loss_mlp": 0.01255639, + "epoch": 0.6935217195250263, + "flos": 21367172638080.0, + "grad_norm": 2.468732709113743, + "language_loss": 0.71063632, + "learning_rate": 9.068760101685971e-07, + "loss": 0.78742403, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10266113, + "step": 11535, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.0632171, + "auxiliary_loss_mlp": 0.012535, + "balance_loss_clip": 0.06265885, + "balance_loss_mlp": 0.01252321, + "epoch": 0.6935818427776943, + "flos": 64085864400000.0, + "grad_norm": 0.6899850160451471, + "language_loss": 0.58968407, + "learning_rate": 9.065498884230638e-07, + "loss": 0.66543621, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11536, + "time_per_iteration": 3.2811362743377686 + }, + { + "auxiliary_loss_clip": 0.06415699, + "auxiliary_loss_mlp": 0.01266201, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01255628, + "epoch": 0.6936419660303622, + "flos": 20308716662400.0, + "grad_norm": 1.4806055752543272, + "language_loss": 0.72754341, + "learning_rate": 9.062238081412692e-07, + "loss": 0.80436242, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10571289, + "step": 11537, + "time_per_iteration": 2.521667242050171 + }, + { + "auxiliary_loss_clip": 0.06322287, + "auxiliary_loss_mlp": 0.01253211, + "balance_loss_clip": 0.06266545, + "balance_loss_mlp": 0.01252035, + "epoch": 0.6937020892830302, + "flos": 67201974691200.0, + "grad_norm": 0.7781896456354132, + "language_loss": 0.5562225, + "learning_rate": 9.058977693355767e-07, + "loss": 0.63197744, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.55517578, + "router_z_loss_mlp": 0.01173401, + "step": 11538, + "time_per_iteration": 3.133890390396118 + }, + { + "auxiliary_loss_clip": 0.06402846, + "auxiliary_loss_mlp": 0.01263458, + "balance_loss_clip": 0.0627329, + "balance_loss_mlp": 0.01253844, + "epoch": 0.6937622125356981, + "flos": 23884943322240.0, + "grad_norm": 1.4430233846230829, + "language_loss": 0.7770322, + "learning_rate": 9.055717720183505e-07, + "loss": 0.85369527, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09613037, + "step": 11539, + "time_per_iteration": 2.5152971744537354 + }, + { + "auxiliary_loss_clip": 0.0640855, + "auxiliary_loss_mlp": 0.01262731, + "balance_loss_clip": 0.06274487, + "balance_loss_mlp": 0.01252664, + "epoch": 0.6938223357883662, + "flos": 28738154206080.0, + "grad_norm": 1.7708768043043424, + "language_loss": 0.64184511, + "learning_rate": 9.05245816201953e-07, + "loss": 0.71855795, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10070801, + "step": 11540, + "time_per_iteration": 2.5849952697753906 + }, + { + "auxiliary_loss_clip": 0.06409811, + "auxiliary_loss_mlp": 0.01263592, + "balance_loss_clip": 0.06274833, + "balance_loss_mlp": 0.01254288, + "epoch": 0.6938824590410341, + "flos": 28662111025920.0, + "grad_norm": 1.4340903998261632, + "language_loss": 0.87096, + "learning_rate": 9.049199018987437e-07, + "loss": 0.94769406, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09301758, + "step": 11541, + "time_per_iteration": 2.5415987968444824 + }, + { + "auxiliary_loss_clip": 0.06411604, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06272925, + "balance_loss_mlp": 0.0125474, + "epoch": 0.6939425822937021, + "flos": 18987987807360.0, + "grad_norm": 1.6079825627082245, + "language_loss": 0.84464371, + "learning_rate": 9.04594029121081e-07, + "loss": 0.92141145, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10418701, + "step": 11542, + "time_per_iteration": 2.499424457550049 + }, + { + "auxiliary_loss_clip": 0.06415489, + "auxiliary_loss_mlp": 0.01265068, + "balance_loss_clip": 0.06275496, + "balance_loss_mlp": 0.01254136, + "epoch": 0.6940027055463701, + "flos": 23082513096960.0, + "grad_norm": 1.8518042954467828, + "language_loss": 0.75316143, + "learning_rate": 9.04268197881323e-07, + "loss": 0.82996696, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10931396, + "step": 11543, + "time_per_iteration": 3.9085495471954346 + }, + { + "auxiliary_loss_clip": 0.06410378, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06273862, + "balance_loss_mlp": 0.01255373, + "epoch": 0.694062828799038, + "flos": 18192391689600.0, + "grad_norm": 1.648222513312388, + "language_loss": 0.76331246, + "learning_rate": 9.039424081918241e-07, + "loss": 0.84007609, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10614014, + "step": 11544, + "time_per_iteration": 2.5347986221313477 + }, + { + "auxiliary_loss_clip": 0.06413911, + "auxiliary_loss_mlp": 0.0126496, + "balance_loss_clip": 0.06275374, + "balance_loss_mlp": 0.012541, + "epoch": 0.694122952051706, + "flos": 17827269523200.0, + "grad_norm": 1.8058959765981615, + "language_loss": 0.71283519, + "learning_rate": 9.036166600649388e-07, + "loss": 0.78962398, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10864258, + "step": 11545, + "time_per_iteration": 2.4718210697174072 + }, + { + "auxiliary_loss_clip": 0.06407937, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06275916, + "balance_loss_mlp": 0.01253039, + "epoch": 0.694183075304374, + "flos": 21221710750080.0, + "grad_norm": 1.516472070644587, + "language_loss": 0.79896855, + "learning_rate": 9.0329095351302e-07, + "loss": 0.87567645, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09814453, + "step": 11546, + "time_per_iteration": 2.5148062705993652 + }, + { + "auxiliary_loss_clip": 0.06411743, + "auxiliary_loss_mlp": 0.01267153, + "balance_loss_clip": 0.06275012, + "balance_loss_mlp": 0.01256281, + "epoch": 0.694243198557042, + "flos": 24067273806720.0, + "grad_norm": 1.4558199270771826, + "language_loss": 0.7883184, + "learning_rate": 9.029652885484194e-07, + "loss": 0.8651073, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10870361, + "step": 11547, + "time_per_iteration": 2.5461182594299316 + }, + { + "auxiliary_loss_clip": 0.06409074, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06275046, + "balance_loss_mlp": 0.01256845, + "epoch": 0.6943033218097099, + "flos": 21148183192320.0, + "grad_norm": 2.180775706849967, + "language_loss": 0.80900609, + "learning_rate": 9.026396651834834e-07, + "loss": 0.88576972, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 11548, + "time_per_iteration": 2.499633312225342 + }, + { + "auxiliary_loss_clip": 0.06316315, + "auxiliary_loss_mlp": 0.01251651, + "balance_loss_clip": 0.06260554, + "balance_loss_mlp": 0.01250445, + "epoch": 0.6943634450623779, + "flos": 57830892163200.0, + "grad_norm": 0.8127275261655555, + "language_loss": 0.53539848, + "learning_rate": 9.023140834305613e-07, + "loss": 0.61107814, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01203918, + "step": 11549, + "time_per_iteration": 3.1340725421905518 + }, + { + "auxiliary_loss_clip": 0.06409207, + "auxiliary_loss_mlp": 0.01267856, + "balance_loss_clip": 0.0627339, + "balance_loss_mlp": 0.01256924, + "epoch": 0.6944235683150458, + "flos": 30598411501440.0, + "grad_norm": 1.3218169673539149, + "language_loss": 0.73849893, + "learning_rate": 9.01988543302e-07, + "loss": 0.81526959, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.109375, + "step": 11550, + "time_per_iteration": 2.5708651542663574 + }, + { + "auxiliary_loss_clip": 0.06414837, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06273603, + "balance_loss_mlp": 0.01255836, + "epoch": 0.6944836915677138, + "flos": 19725611299200.0, + "grad_norm": 2.422306593837277, + "language_loss": 0.7436735, + "learning_rate": 9.016630448101425e-07, + "loss": 0.82049412, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11364746, + "step": 11551, + "time_per_iteration": 2.527280807495117 + }, + { + "auxiliary_loss_clip": 0.06412678, + "auxiliary_loss_mlp": 0.01266399, + "balance_loss_clip": 0.06274699, + "balance_loss_mlp": 0.01255592, + "epoch": 0.6945438148203817, + "flos": 24870542572800.0, + "grad_norm": 1.4976139060418592, + "language_loss": 0.84468353, + "learning_rate": 9.01337587967333e-07, + "loss": 0.92147428, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10797119, + "step": 11552, + "time_per_iteration": 2.5304994583129883 + }, + { + "auxiliary_loss_clip": 0.06412995, + "auxiliary_loss_mlp": 0.01266444, + "balance_loss_clip": 0.06275281, + "balance_loss_mlp": 0.01255787, + "epoch": 0.6946039380730498, + "flos": 33334752360960.0, + "grad_norm": 1.8566044703469122, + "language_loss": 0.67553848, + "learning_rate": 9.010121727859117e-07, + "loss": 0.75233287, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10656738, + "step": 11553, + "time_per_iteration": 2.6192421913146973 + }, + { + "auxiliary_loss_clip": 0.064182, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01254314, + "epoch": 0.6946640613257177, + "flos": 20857385197440.0, + "grad_norm": 1.702671495962781, + "language_loss": 0.79674661, + "learning_rate": 9.006867992782195e-07, + "loss": 0.87358326, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11138916, + "step": 11554, + "time_per_iteration": 2.486833095550537 + }, + { + "auxiliary_loss_clip": 0.06411414, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01256064, + "epoch": 0.6947241845783857, + "flos": 19360992257280.0, + "grad_norm": 2.4583328560659825, + "language_loss": 0.72664356, + "learning_rate": 9.003614674565934e-07, + "loss": 0.80342329, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10498047, + "step": 11555, + "time_per_iteration": 4.000531196594238 + }, + { + "auxiliary_loss_clip": 0.0640734, + "auxiliary_loss_mlp": 0.01264698, + "balance_loss_clip": 0.0627168, + "balance_loss_mlp": 0.01254404, + "epoch": 0.6947843078310536, + "flos": 27126669283200.0, + "grad_norm": 1.6806828217534537, + "language_loss": 0.78220618, + "learning_rate": 9.000361773333705e-07, + "loss": 0.85892653, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 11556, + "time_per_iteration": 2.5366411209106445 + }, + { + "auxiliary_loss_clip": 0.06412055, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01254198, + "epoch": 0.6948444310837216, + "flos": 28592692318080.0, + "grad_norm": 2.2663636290746205, + "language_loss": 0.60655725, + "learning_rate": 8.997109289208869e-07, + "loss": 0.68332362, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10394287, + "step": 11557, + "time_per_iteration": 2.5730667114257812 + }, + { + "auxiliary_loss_clip": 0.06406298, + "auxiliary_loss_mlp": 0.0126677, + "balance_loss_clip": 0.06273069, + "balance_loss_mlp": 0.01256923, + "epoch": 0.6949045543363896, + "flos": 15674704859520.0, + "grad_norm": 1.6481144158645147, + "language_loss": 0.85564643, + "learning_rate": 8.993857222314752e-07, + "loss": 0.9323771, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09851074, + "step": 11558, + "time_per_iteration": 3.9160499572753906 + }, + { + "auxiliary_loss_clip": 0.06415498, + "auxiliary_loss_mlp": 0.01268636, + "balance_loss_clip": 0.06274904, + "balance_loss_mlp": 0.01257764, + "epoch": 0.6949646775890576, + "flos": 23266311027840.0, + "grad_norm": 1.591782165805242, + "language_loss": 0.70581871, + "learning_rate": 8.990605572774664e-07, + "loss": 0.78266007, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10876465, + "step": 11559, + "time_per_iteration": 2.527818441390991 + }, + { + "auxiliary_loss_clip": 0.06411439, + "auxiliary_loss_mlp": 0.01267371, + "balance_loss_clip": 0.06274717, + "balance_loss_mlp": 0.01256946, + "epoch": 0.6950248008417256, + "flos": 22389095433600.0, + "grad_norm": 1.4072009263276422, + "language_loss": 0.78738344, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8641715, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 11560, + "time_per_iteration": 2.5627846717834473 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06274506, + "balance_loss_mlp": 0.01255614, + "epoch": 0.6950849240943935, + "flos": 23484126516480.0, + "grad_norm": 1.4947787442240967, + "language_loss": 0.76889873, + "learning_rate": 8.9841035262498e-07, + "loss": 0.84563088, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09307861, + "step": 11561, + "time_per_iteration": 2.4997048377990723 + }, + { + "auxiliary_loss_clip": 0.06411804, + "auxiliary_loss_mlp": 0.01269689, + "balance_loss_clip": 0.06277403, + "balance_loss_mlp": 0.012589, + "epoch": 0.6951450473470615, + "flos": 17426285009280.0, + "grad_norm": 1.734417047783141, + "language_loss": 0.78360051, + "learning_rate": 8.980853129511577e-07, + "loss": 0.86041546, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10784912, + "step": 11562, + "time_per_iteration": 3.868687868118286 + }, + { + "auxiliary_loss_clip": 0.06413691, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06274996, + "balance_loss_mlp": 0.01254509, + "epoch": 0.6952051705997294, + "flos": 20492053395840.0, + "grad_norm": 2.791172268200526, + "language_loss": 0.69210434, + "learning_rate": 8.977603150620515e-07, + "loss": 0.76889294, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10656738, + "step": 11563, + "time_per_iteration": 2.521984338760376 + }, + { + "auxiliary_loss_clip": 0.0640626, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06274064, + "balance_loss_mlp": 0.01255006, + "epoch": 0.6952652938523974, + "flos": 13994472061440.0, + "grad_norm": 2.2938813143699943, + "language_loss": 0.73795921, + "learning_rate": 8.974353589699846e-07, + "loss": 0.81467056, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09869385, + "step": 11564, + "time_per_iteration": 2.454090118408203 + }, + { + "auxiliary_loss_clip": 0.06431751, + "auxiliary_loss_mlp": 0.01272001, + "balance_loss_clip": 0.06280031, + "balance_loss_mlp": 0.01259174, + "epoch": 0.6953254171050653, + "flos": 30961479242880.0, + "grad_norm": 1.9156541387809913, + "language_loss": 0.71630907, + "learning_rate": 8.971104446872785e-07, + "loss": 0.79334664, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.12823486, + "step": 11565, + "time_per_iteration": 2.6339352130889893 + }, + { + "auxiliary_loss_clip": 0.06312925, + "auxiliary_loss_mlp": 0.01254517, + "balance_loss_clip": 0.0625705, + "balance_loss_mlp": 0.01253326, + "epoch": 0.6953855403577334, + "flos": 61688231671680.0, + "grad_norm": 0.9056621867794188, + "language_loss": 0.58358586, + "learning_rate": 8.96785572226255e-07, + "loss": 0.65926027, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01189423, + "step": 11566, + "time_per_iteration": 2.9703423976898193 + }, + { + "auxiliary_loss_clip": 0.0641438, + "auxiliary_loss_mlp": 0.01265896, + "balance_loss_clip": 0.06273914, + "balance_loss_mlp": 0.01254237, + "epoch": 0.6954456636104013, + "flos": 23045644500480.0, + "grad_norm": 1.741502187715767, + "language_loss": 0.74213183, + "learning_rate": 8.964607415992338e-07, + "loss": 0.81893462, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.11663818, + "step": 11567, + "time_per_iteration": 2.5282747745513916 + }, + { + "auxiliary_loss_clip": 0.06409914, + "auxiliary_loss_mlp": 0.01264668, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.0125382, + "epoch": 0.6955057868630693, + "flos": 23925920768640.0, + "grad_norm": 1.2088897193849768, + "language_loss": 0.76795661, + "learning_rate": 8.961359528185313e-07, + "loss": 0.84470242, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10858154, + "step": 11568, + "time_per_iteration": 2.555664300918579 + }, + { + "auxiliary_loss_clip": 0.06409561, + "auxiliary_loss_mlp": 0.01267134, + "balance_loss_clip": 0.06274664, + "balance_loss_mlp": 0.01257567, + "epoch": 0.6955659101157372, + "flos": 22600076814720.0, + "grad_norm": 2.0811162561190444, + "language_loss": 0.72560644, + "learning_rate": 8.958112058964649e-07, + "loss": 0.80237341, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09570312, + "step": 11569, + "time_per_iteration": 2.550203323364258 + }, + { + "auxiliary_loss_clip": 0.06412488, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01256568, + "epoch": 0.6956260333684052, + "flos": 24579576869760.0, + "grad_norm": 1.4598042665233286, + "language_loss": 0.77169657, + "learning_rate": 8.954865008453471e-07, + "loss": 0.84849441, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10736084, + "step": 11570, + "time_per_iteration": 2.5227878093719482 + }, + { + "auxiliary_loss_clip": 0.06413926, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01255436, + "epoch": 0.6956861566210732, + "flos": 25852745733120.0, + "grad_norm": 1.7591175950059927, + "language_loss": 0.7487582, + "learning_rate": 8.95161837677493e-07, + "loss": 0.82555479, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10284424, + "step": 11571, + "time_per_iteration": 2.597681999206543 + }, + { + "auxiliary_loss_clip": 0.06403409, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.062727, + "balance_loss_mlp": 0.01253241, + "epoch": 0.6957462798737412, + "flos": 15306270456960.0, + "grad_norm": 1.6743829197171876, + "language_loss": 0.74611163, + "learning_rate": 8.948372164052118e-07, + "loss": 0.8227759, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09771729, + "step": 11572, + "time_per_iteration": 2.479717254638672 + }, + { + "auxiliary_loss_clip": 0.06411865, + "auxiliary_loss_mlp": 0.01266562, + "balance_loss_clip": 0.06272524, + "balance_loss_mlp": 0.01256036, + "epoch": 0.6958064031264092, + "flos": 36255645838080.0, + "grad_norm": 1.9177386659246018, + "language_loss": 0.70336205, + "learning_rate": 8.94512637040814e-07, + "loss": 0.7801463, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10522461, + "step": 11573, + "time_per_iteration": 2.646585702896118 + }, + { + "auxiliary_loss_clip": 0.064174, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06275034, + "balance_loss_mlp": 0.01254935, + "epoch": 0.6958665263790771, + "flos": 19214817609600.0, + "grad_norm": 1.6543405774844155, + "language_loss": 0.75180942, + "learning_rate": 8.941880995966095e-07, + "loss": 0.82864642, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11364746, + "step": 11574, + "time_per_iteration": 2.5017471313476562 + }, + { + "auxiliary_loss_clip": 0.06413898, + "auxiliary_loss_mlp": 0.0126532, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01254996, + "epoch": 0.6959266496317451, + "flos": 21801797366400.0, + "grad_norm": 1.6788443251259586, + "language_loss": 0.74745572, + "learning_rate": 8.938636040849014e-07, + "loss": 0.8242479, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.10327148, + "step": 11575, + "time_per_iteration": 2.5528361797332764 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01269096, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01258248, + "epoch": 0.695986772884413, + "flos": 20564490850560.0, + "grad_norm": 1.717283083984882, + "language_loss": 0.79060346, + "learning_rate": 8.935391505179966e-07, + "loss": 0.86738789, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10845947, + "step": 11576, + "time_per_iteration": 2.4801833629608154 + }, + { + "auxiliary_loss_clip": 0.06413432, + "auxiliary_loss_mlp": 0.01262741, + "balance_loss_clip": 0.06272326, + "balance_loss_mlp": 0.01252191, + "epoch": 0.696046896137081, + "flos": 14940980582400.0, + "grad_norm": 2.5670489052023404, + "language_loss": 0.57032454, + "learning_rate": 8.932147389081985e-07, + "loss": 0.64708626, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10559082, + "step": 11577, + "time_per_iteration": 2.502033233642578 + }, + { + "auxiliary_loss_clip": 0.06404924, + "auxiliary_loss_mlp": 0.01266503, + "balance_loss_clip": 0.06274053, + "balance_loss_mlp": 0.01257521, + "epoch": 0.696107019389749, + "flos": 30748569217920.0, + "grad_norm": 1.378295678041548, + "language_loss": 0.76719046, + "learning_rate": 8.928903692678081e-07, + "loss": 0.84390473, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08984375, + "step": 11578, + "time_per_iteration": 2.605837821960449 + }, + { + "auxiliary_loss_clip": 0.06414018, + "auxiliary_loss_mlp": 0.01262965, + "balance_loss_clip": 0.0627658, + "balance_loss_mlp": 0.01253249, + "epoch": 0.696167142642417, + "flos": 20782935244800.0, + "grad_norm": 3.119426120413718, + "language_loss": 0.79773849, + "learning_rate": 8.925660416091254e-07, + "loss": 0.87450832, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09716797, + "step": 11579, + "time_per_iteration": 2.5537924766540527 + }, + { + "auxiliary_loss_clip": 0.06405934, + "auxiliary_loss_mlp": 0.01263768, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01253558, + "epoch": 0.6962272658950849, + "flos": 22571761334400.0, + "grad_norm": 1.5861987374843416, + "language_loss": 0.72813702, + "learning_rate": 8.922417559444502e-07, + "loss": 0.80483407, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10205078, + "step": 11580, + "time_per_iteration": 2.5217056274414062 + }, + { + "auxiliary_loss_clip": 0.0641515, + "auxiliary_loss_mlp": 0.01267668, + "balance_loss_clip": 0.06275546, + "balance_loss_mlp": 0.01255896, + "epoch": 0.6962873891477529, + "flos": 22206681095040.0, + "grad_norm": 2.1085212775747975, + "language_loss": 0.66371673, + "learning_rate": 8.919175122860787e-07, + "loss": 0.74054492, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11767578, + "step": 11581, + "time_per_iteration": 2.5470681190490723 + }, + { + "auxiliary_loss_clip": 0.06415606, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06278277, + "balance_loss_mlp": 0.01253726, + "epoch": 0.6963475124004208, + "flos": 12493718709120.0, + "grad_norm": 3.192459541289618, + "language_loss": 0.76738924, + "learning_rate": 8.915933106463056e-07, + "loss": 0.84417772, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09509277, + "step": 11582, + "time_per_iteration": 2.5975067615509033 + }, + { + "auxiliary_loss_clip": 0.06411912, + "auxiliary_loss_mlp": 0.01266649, + "balance_loss_clip": 0.06274536, + "balance_loss_mlp": 0.01256355, + "epoch": 0.6964076356530888, + "flos": 17170762383360.0, + "grad_norm": 2.14882454800848, + "language_loss": 0.70161986, + "learning_rate": 8.91269151037425e-07, + "loss": 0.77840543, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10290527, + "step": 11583, + "time_per_iteration": 3.9500138759613037 + }, + { + "auxiliary_loss_clip": 0.06410628, + "auxiliary_loss_mlp": 0.01268947, + "balance_loss_clip": 0.06274879, + "balance_loss_mlp": 0.01258272, + "epoch": 0.6964677589057569, + "flos": 19943342933760.0, + "grad_norm": 1.7749969250449007, + "language_loss": 0.82683307, + "learning_rate": 8.909450334717301e-07, + "loss": 0.90362883, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10681152, + "step": 11584, + "time_per_iteration": 2.5435311794281006 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01267824, + "balance_loss_clip": 0.06271736, + "balance_loss_mlp": 0.01256565, + "epoch": 0.6965278821584248, + "flos": 22790708853120.0, + "grad_norm": 2.098465309846489, + "language_loss": 0.79802585, + "learning_rate": 8.906209579615107e-07, + "loss": 0.87481719, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.1126709, + "step": 11585, + "time_per_iteration": 2.490299701690674 + }, + { + "auxiliary_loss_clip": 0.06406368, + "auxiliary_loss_mlp": 0.01265153, + "balance_loss_clip": 0.06273674, + "balance_loss_mlp": 0.01255735, + "epoch": 0.6965880054110928, + "flos": 20053739088000.0, + "grad_norm": 1.7604905238703683, + "language_loss": 0.77940738, + "learning_rate": 8.90296924519055e-07, + "loss": 0.85612255, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09411621, + "step": 11586, + "time_per_iteration": 2.5373406410217285 + }, + { + "auxiliary_loss_clip": 0.06404427, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06273477, + "balance_loss_mlp": 0.0125706, + "epoch": 0.6966481286637607, + "flos": 21914709143040.0, + "grad_norm": 1.8539557700987637, + "language_loss": 0.78935838, + "learning_rate": 8.899729331566519e-07, + "loss": 0.86607027, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09698486, + "step": 11587, + "time_per_iteration": 2.4801838397979736 + }, + { + "auxiliary_loss_clip": 0.06406583, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.0627536, + "balance_loss_mlp": 0.01254915, + "epoch": 0.6967082519164287, + "flos": 15638674803840.0, + "grad_norm": 1.9230111566874013, + "language_loss": 0.73017895, + "learning_rate": 8.896489838865857e-07, + "loss": 0.80689335, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09936523, + "step": 11588, + "time_per_iteration": 2.488046646118164 + }, + { + "auxiliary_loss_clip": 0.06411311, + "auxiliary_loss_mlp": 0.01262306, + "balance_loss_clip": 0.06274327, + "balance_loss_mlp": 0.01252507, + "epoch": 0.6967683751690966, + "flos": 24031453386240.0, + "grad_norm": 2.0364063263002885, + "language_loss": 0.74887639, + "learning_rate": 8.893250767211413e-07, + "loss": 0.82561255, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09802246, + "step": 11589, + "time_per_iteration": 2.548539400100708 + }, + { + "auxiliary_loss_clip": 0.06411868, + "auxiliary_loss_mlp": 0.01265329, + "balance_loss_clip": 0.06274883, + "balance_loss_mlp": 0.01254773, + "epoch": 0.6968284984217646, + "flos": 31031862272640.0, + "grad_norm": 4.3993143538672275, + "language_loss": 0.63862813, + "learning_rate": 8.890012116726012e-07, + "loss": 0.71539998, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10552979, + "step": 11590, + "time_per_iteration": 2.6050679683685303 + }, + { + "auxiliary_loss_clip": 0.06316171, + "auxiliary_loss_mlp": 0.01251394, + "balance_loss_clip": 0.06259812, + "balance_loss_mlp": 0.0125019, + "epoch": 0.6968886216744326, + "flos": 67642888475520.0, + "grad_norm": 0.7383814790063842, + "language_loss": 0.6120699, + "learning_rate": 8.88677388753248e-07, + "loss": 0.68774557, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01203156, + "step": 11591, + "time_per_iteration": 3.205728530883789 + }, + { + "auxiliary_loss_clip": 0.06413443, + "auxiliary_loss_mlp": 0.01267566, + "balance_loss_clip": 0.0627727, + "balance_loss_mlp": 0.01256539, + "epoch": 0.6969487449271006, + "flos": 24870668353920.0, + "grad_norm": 1.4802717401382182, + "language_loss": 0.69663697, + "learning_rate": 8.883536079753582e-07, + "loss": 0.77344704, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 11592, + "time_per_iteration": 2.530959367752075 + }, + { + "auxiliary_loss_clip": 0.06411387, + "auxiliary_loss_mlp": 0.01269289, + "balance_loss_clip": 0.06275564, + "balance_loss_mlp": 0.01259132, + "epoch": 0.6970088681797685, + "flos": 28775525927040.0, + "grad_norm": 1.753602003372511, + "language_loss": 0.62838447, + "learning_rate": 8.880298693512109e-07, + "loss": 0.70519125, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10150146, + "step": 11593, + "time_per_iteration": 2.5508384704589844 + }, + { + "auxiliary_loss_clip": 0.06406593, + "auxiliary_loss_mlp": 0.01263771, + "balance_loss_clip": 0.06274071, + "balance_loss_mlp": 0.012547, + "epoch": 0.6970689914324365, + "flos": 27316001583360.0, + "grad_norm": 1.3874621408455479, + "language_loss": 0.54750943, + "learning_rate": 8.877061728930832e-07, + "loss": 0.6242131, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09069824, + "step": 11594, + "time_per_iteration": 2.559556484222412 + }, + { + "auxiliary_loss_clip": 0.06411646, + "auxiliary_loss_mlp": 0.01264287, + "balance_loss_clip": 0.06274341, + "balance_loss_mlp": 0.01254106, + "epoch": 0.6971291146851044, + "flos": 19142422081920.0, + "grad_norm": 1.79939196206485, + "language_loss": 0.77473152, + "learning_rate": 8.87382518613248e-07, + "loss": 0.85149086, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11595, + "time_per_iteration": 3.9267494678497314 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126537, + "balance_loss_clip": 0.06274199, + "balance_loss_mlp": 0.01254611, + "epoch": 0.6971892379377724, + "flos": 14615661905280.0, + "grad_norm": 2.356908454706418, + "language_loss": 0.72375011, + "learning_rate": 8.870589065239793e-07, + "loss": 0.80052996, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10766602, + "step": 11596, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.0641246, + "auxiliary_loss_mlp": 0.01264522, + "balance_loss_clip": 0.06275618, + "balance_loss_mlp": 0.0125368, + "epoch": 0.6972493611904405, + "flos": 22313639232000.0, + "grad_norm": 1.9958593203679207, + "language_loss": 0.76570636, + "learning_rate": 8.867353366375492e-07, + "loss": 0.84247619, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10839844, + "step": 11597, + "time_per_iteration": 3.9746484756469727 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06272379, + "balance_loss_mlp": 0.01257232, + "epoch": 0.6973094844431084, + "flos": 17426075374080.0, + "grad_norm": 1.890364129189079, + "language_loss": 0.74871194, + "learning_rate": 8.864118089662267e-07, + "loss": 0.82547033, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10406494, + "step": 11598, + "time_per_iteration": 2.4967358112335205 + }, + { + "auxiliary_loss_clip": 0.06416015, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06276817, + "balance_loss_mlp": 0.01256111, + "epoch": 0.6973696076957764, + "flos": 27242767514880.0, + "grad_norm": 1.672066699636808, + "language_loss": 0.89636326, + "learning_rate": 8.860883235222791e-07, + "loss": 0.97319448, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10998535, + "step": 11599, + "time_per_iteration": 2.5665690898895264 + }, + { + "auxiliary_loss_clip": 0.06421445, + "auxiliary_loss_mlp": 0.01269073, + "balance_loss_clip": 0.06277397, + "balance_loss_mlp": 0.0125798, + "epoch": 0.6974297309484443, + "flos": 22024644099840.0, + "grad_norm": 1.8416467781869745, + "language_loss": 0.70383334, + "learning_rate": 8.85764880317974e-07, + "loss": 0.78073853, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.11090088, + "step": 11600, + "time_per_iteration": 2.491593360900879 + }, + { + "auxiliary_loss_clip": 0.0641008, + "auxiliary_loss_mlp": 0.0126546, + "balance_loss_clip": 0.06272715, + "balance_loss_mlp": 0.01254958, + "epoch": 0.6974898542011123, + "flos": 28374038288640.0, + "grad_norm": 1.5173038128226022, + "language_loss": 0.76574016, + "learning_rate": 8.854414793655771e-07, + "loss": 0.84249556, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10498047, + "step": 11601, + "time_per_iteration": 4.1049439907073975 + }, + { + "auxiliary_loss_clip": 0.06404468, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.06272994, + "balance_loss_mlp": 0.01255615, + "epoch": 0.6975499774537802, + "flos": 15237522581760.0, + "grad_norm": 1.8655763623744426, + "language_loss": 0.72371268, + "learning_rate": 8.851181206773508e-07, + "loss": 0.80041194, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09851074, + "step": 11602, + "time_per_iteration": 2.5268797874450684 + }, + { + "auxiliary_loss_clip": 0.06410255, + "auxiliary_loss_mlp": 0.01265285, + "balance_loss_clip": 0.06275497, + "balance_loss_mlp": 0.01255343, + "epoch": 0.6976101007064482, + "flos": 22162894536960.0, + "grad_norm": 2.1937279130738365, + "language_loss": 0.77231717, + "learning_rate": 8.847948042655567e-07, + "loss": 0.84907258, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09942627, + "step": 11603, + "time_per_iteration": 2.4806923866271973 + }, + { + "auxiliary_loss_clip": 0.06408552, + "auxiliary_loss_mlp": 0.01263968, + "balance_loss_clip": 0.06273254, + "balance_loss_mlp": 0.01254211, + "epoch": 0.6976702239591162, + "flos": 22280124798720.0, + "grad_norm": 1.4370854048834028, + "language_loss": 0.62313223, + "learning_rate": 8.844715301424557e-07, + "loss": 0.69985747, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09759521, + "step": 11604, + "time_per_iteration": 2.556675910949707 + }, + { + "auxiliary_loss_clip": 0.06411324, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.06273848, + "balance_loss_mlp": 0.01254486, + "epoch": 0.6977303472117842, + "flos": 25855722552960.0, + "grad_norm": 2.158609093070266, + "language_loss": 0.8206296, + "learning_rate": 8.841482983203057e-07, + "loss": 0.89739883, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11120605, + "step": 11605, + "time_per_iteration": 2.5453009605407715 + }, + { + "auxiliary_loss_clip": 0.06408873, + "auxiliary_loss_mlp": 0.01266358, + "balance_loss_clip": 0.0627379, + "balance_loss_mlp": 0.01256637, + "epoch": 0.6977904704644521, + "flos": 20965894634880.0, + "grad_norm": 1.4817287317876005, + "language_loss": 0.7024073, + "learning_rate": 8.838251088113638e-07, + "loss": 0.77915967, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 11606, + "time_per_iteration": 2.524181604385376 + }, + { + "auxiliary_loss_clip": 0.06411228, + "auxiliary_loss_mlp": 0.01265998, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01255221, + "epoch": 0.6978505937171201, + "flos": 22061680404480.0, + "grad_norm": 2.145616317364061, + "language_loss": 0.82643318, + "learning_rate": 8.835019616278856e-07, + "loss": 0.90320545, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10772705, + "step": 11607, + "time_per_iteration": 2.4895663261413574 + }, + { + "auxiliary_loss_clip": 0.06416652, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.06274567, + "balance_loss_mlp": 0.01254201, + "epoch": 0.697910716969788, + "flos": 20049252894720.0, + "grad_norm": 2.008483115639311, + "language_loss": 0.79149514, + "learning_rate": 8.831788567821265e-07, + "loss": 0.86831373, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.11004639, + "step": 11608, + "time_per_iteration": 2.517848014831543 + }, + { + "auxiliary_loss_clip": 0.06411079, + "auxiliary_loss_mlp": 0.01264975, + "balance_loss_clip": 0.06272355, + "balance_loss_mlp": 0.0125461, + "epoch": 0.697970840222456, + "flos": 15893736232320.0, + "grad_norm": 1.856773515642951, + "language_loss": 0.9026711, + "learning_rate": 8.828557942863357e-07, + "loss": 0.97943169, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10357666, + "step": 11609, + "time_per_iteration": 2.464045763015747 + }, + { + "auxiliary_loss_clip": 0.06410901, + "auxiliary_loss_mlp": 0.01262705, + "balance_loss_clip": 0.06270923, + "balance_loss_mlp": 0.01252965, + "epoch": 0.698030963475124, + "flos": 21222088093440.0, + "grad_norm": 1.4134029282176452, + "language_loss": 0.64230514, + "learning_rate": 8.82532774152765e-07, + "loss": 0.71904123, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.09747314, + "step": 11610, + "time_per_iteration": 2.5426440238952637 + }, + { + "auxiliary_loss_clip": 0.06407233, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.0125446, + "epoch": 0.698091086727792, + "flos": 33767113029120.0, + "grad_norm": 1.5536592755713354, + "language_loss": 0.84326196, + "learning_rate": 8.822097963936643e-07, + "loss": 0.91997612, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.097229, + "step": 11611, + "time_per_iteration": 2.6129181385040283 + }, + { + "auxiliary_loss_clip": 0.06411347, + "auxiliary_loss_mlp": 0.01264511, + "balance_loss_clip": 0.06272921, + "balance_loss_mlp": 0.01253752, + "epoch": 0.69815120998046, + "flos": 15893275034880.0, + "grad_norm": 1.864564945323593, + "language_loss": 0.70917654, + "learning_rate": 8.818868610212793e-07, + "loss": 0.78593516, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10754395, + "step": 11612, + "time_per_iteration": 2.4869654178619385 + }, + { + "auxiliary_loss_clip": 0.06406604, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.01254096, + "epoch": 0.6982113332331279, + "flos": 18952041605760.0, + "grad_norm": 1.4951443393996662, + "language_loss": 0.81150031, + "learning_rate": 8.815639680478573e-07, + "loss": 0.88821077, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10345459, + "step": 11613, + "time_per_iteration": 2.4747042655944824 + }, + { + "auxiliary_loss_clip": 0.06409472, + "auxiliary_loss_mlp": 0.01267068, + "balance_loss_clip": 0.06274355, + "balance_loss_mlp": 0.01257335, + "epoch": 0.6982714564857959, + "flos": 24396533625600.0, + "grad_norm": 1.8067810947897194, + "language_loss": 0.75539565, + "learning_rate": 8.812411174856411e-07, + "loss": 0.83216107, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 11614, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.06408294, + "auxiliary_loss_mlp": 0.01268326, + "balance_loss_clip": 0.06272974, + "balance_loss_mlp": 0.0125817, + "epoch": 0.6983315797384638, + "flos": 20089852997760.0, + "grad_norm": 1.9161960736489865, + "language_loss": 0.77505577, + "learning_rate": 8.809183093468746e-07, + "loss": 0.85182202, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10162354, + "step": 11615, + "time_per_iteration": 2.4810245037078857 + }, + { + "auxiliary_loss_clip": 0.06403261, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 0.06272578, + "balance_loss_mlp": 0.01253048, + "epoch": 0.6983917029911318, + "flos": 13516815461760.0, + "grad_norm": 1.8844428750511293, + "language_loss": 0.73254174, + "learning_rate": 8.80595543643797e-07, + "loss": 0.80920184, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09692383, + "step": 11616, + "time_per_iteration": 2.4856157302856445 + }, + { + "auxiliary_loss_clip": 0.06408458, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06277423, + "balance_loss_mlp": 0.01254091, + "epoch": 0.6984518262437998, + "flos": 22025021443200.0, + "grad_norm": 1.4724184586515745, + "language_loss": 0.84294975, + "learning_rate": 8.802728203886487e-07, + "loss": 0.91967505, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09979248, + "step": 11617, + "time_per_iteration": 2.503758668899536 + }, + { + "auxiliary_loss_clip": 0.0641643, + "auxiliary_loss_mlp": 0.0126771, + "balance_loss_clip": 0.0627649, + "balance_loss_mlp": 0.01257035, + "epoch": 0.6985119494964678, + "flos": 18776587155840.0, + "grad_norm": 2.0634899151280623, + "language_loss": 0.59477413, + "learning_rate": 8.799501395936682e-07, + "loss": 0.67161554, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10668945, + "step": 11618, + "time_per_iteration": 2.502458333969116 + }, + { + "auxiliary_loss_clip": 0.06411035, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 0.06276886, + "balance_loss_mlp": 0.0125307, + "epoch": 0.6985720727491357, + "flos": 22389430849920.0, + "grad_norm": 2.158587147069475, + "language_loss": 0.83073372, + "learning_rate": 8.796275012710903e-07, + "loss": 0.9074744, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0994873, + "step": 11619, + "time_per_iteration": 2.4989545345306396 + }, + { + "auxiliary_loss_clip": 0.06409271, + "auxiliary_loss_mlp": 0.01266979, + "balance_loss_clip": 0.06278059, + "balance_loss_mlp": 0.01258152, + "epoch": 0.6986321960018037, + "flos": 39577398048000.0, + "grad_norm": 1.554266189454373, + "language_loss": 0.67337298, + "learning_rate": 8.793049054331494e-07, + "loss": 0.75013542, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08825684, + "step": 11620, + "time_per_iteration": 2.765410900115967 + }, + { + "auxiliary_loss_clip": 0.06411748, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273868, + "balance_loss_mlp": 0.01256621, + "epoch": 0.6986923192544716, + "flos": 17973528024960.0, + "grad_norm": 2.4474211013812432, + "language_loss": 0.73446906, + "learning_rate": 8.789823520920794e-07, + "loss": 0.81125557, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.1027832, + "step": 11621, + "time_per_iteration": 2.4840140342712402 + }, + { + "auxiliary_loss_clip": 0.06412227, + "auxiliary_loss_mlp": 0.01264203, + "balance_loss_clip": 0.06272949, + "balance_loss_mlp": 0.01253737, + "epoch": 0.6987524425071396, + "flos": 25601583519360.0, + "grad_norm": 1.724040192260788, + "language_loss": 0.68410677, + "learning_rate": 8.7865984126011e-07, + "loss": 0.76087105, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10461426, + "step": 11622, + "time_per_iteration": 3.950021743774414 + }, + { + "auxiliary_loss_clip": 0.06409498, + "auxiliary_loss_mlp": 0.01267194, + "balance_loss_clip": 0.0627782, + "balance_loss_mlp": 0.01257383, + "epoch": 0.6988125657598077, + "flos": 17535842622720.0, + "grad_norm": 1.8022622371846757, + "language_loss": 0.62591398, + "learning_rate": 8.783373729494721e-07, + "loss": 0.70268083, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09814453, + "step": 11623, + "time_per_iteration": 2.529270887374878 + }, + { + "auxiliary_loss_clip": 0.06415178, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06272644, + "balance_loss_mlp": 0.01254817, + "epoch": 0.6988726890124756, + "flos": 39175029941760.0, + "grad_norm": 1.7670185249526673, + "language_loss": 0.60458779, + "learning_rate": 8.780149471723932e-07, + "loss": 0.68139207, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.10430908, + "step": 11624, + "time_per_iteration": 2.6375675201416016 + }, + { + "auxiliary_loss_clip": 0.06411561, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06272775, + "balance_loss_mlp": 0.01256564, + "epoch": 0.6989328122651436, + "flos": 20199662173440.0, + "grad_norm": 1.5069469972343055, + "language_loss": 0.78510606, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8619014, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11625, + "time_per_iteration": 2.534061908721924 + }, + { + "auxiliary_loss_clip": 0.06406638, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01257256, + "epoch": 0.6989929355178115, + "flos": 21841265439360.0, + "grad_norm": 1.6759866105601053, + "language_loss": 0.66316259, + "learning_rate": 8.773702232678188e-07, + "loss": 0.73989534, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09381104, + "step": 11626, + "time_per_iteration": 2.4902937412261963 + }, + { + "auxiliary_loss_clip": 0.06411765, + "auxiliary_loss_mlp": 0.0126589, + "balance_loss_clip": 0.06275335, + "balance_loss_mlp": 0.01255733, + "epoch": 0.6990530587704795, + "flos": 26330066916480.0, + "grad_norm": 2.0325683536698205, + "language_loss": 0.70813847, + "learning_rate": 8.770479251647697e-07, + "loss": 0.78491497, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10144043, + "step": 11627, + "time_per_iteration": 2.5748379230499268 + }, + { + "auxiliary_loss_clip": 0.0640467, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06273688, + "balance_loss_mlp": 0.0125508, + "epoch": 0.6991131820231474, + "flos": 19835168912640.0, + "grad_norm": 1.7164277105253158, + "language_loss": 0.62609565, + "learning_rate": 8.767256696441768e-07, + "loss": 0.70278424, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 11628, + "time_per_iteration": 2.4829564094543457 + }, + { + "auxiliary_loss_clip": 0.06410889, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06272821, + "balance_loss_mlp": 0.0125559, + "epoch": 0.6991733052758154, + "flos": 33993271998720.0, + "grad_norm": 1.816957818772296, + "language_loss": 0.68972111, + "learning_rate": 8.764034567182581e-07, + "loss": 0.76648998, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10412598, + "step": 11629, + "time_per_iteration": 2.6509320735931396 + }, + { + "auxiliary_loss_clip": 0.06409748, + "auxiliary_loss_mlp": 0.01265873, + "balance_loss_clip": 0.06276409, + "balance_loss_mlp": 0.0125515, + "epoch": 0.6992334285284834, + "flos": 15638632876800.0, + "grad_norm": 1.5060784407018701, + "language_loss": 0.72445923, + "learning_rate": 8.760812863992337e-07, + "loss": 0.80121547, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1072998, + "step": 11630, + "time_per_iteration": 2.4783284664154053 + }, + { + "auxiliary_loss_clip": 0.0641311, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06278898, + "balance_loss_mlp": 0.01255943, + "epoch": 0.6992935517811514, + "flos": 21732797928960.0, + "grad_norm": 1.7108311606213942, + "language_loss": 0.74144894, + "learning_rate": 8.757591586993196e-07, + "loss": 0.81823862, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09912109, + "step": 11631, + "time_per_iteration": 2.5788233280181885 + }, + { + "auxiliary_loss_clip": 0.06419384, + "auxiliary_loss_mlp": 0.01269329, + "balance_loss_clip": 0.0628057, + "balance_loss_mlp": 0.01258022, + "epoch": 0.6993536750338193, + "flos": 20120558319360.0, + "grad_norm": 2.3602125436995105, + "language_loss": 0.89111435, + "learning_rate": 8.7543707363073e-07, + "loss": 0.96800154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.11303711, + "step": 11632, + "time_per_iteration": 2.473422050476074 + }, + { + "auxiliary_loss_clip": 0.06414177, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06276321, + "balance_loss_mlp": 0.01254864, + "epoch": 0.6994137982864873, + "flos": 22015839421440.0, + "grad_norm": 1.6028389301274413, + "language_loss": 0.79952157, + "learning_rate": 8.751150312056792e-07, + "loss": 0.87631214, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10009766, + "step": 11633, + "time_per_iteration": 2.513282060623169 + }, + { + "auxiliary_loss_clip": 0.06417207, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06276365, + "balance_loss_mlp": 0.01254202, + "epoch": 0.6994739215391552, + "flos": 25525875755520.0, + "grad_norm": 1.8057869627886596, + "language_loss": 0.67083466, + "learning_rate": 8.747930314363794e-07, + "loss": 0.7476657, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.11712646, + "step": 11634, + "time_per_iteration": 3.9409241676330566 + }, + { + "auxiliary_loss_clip": 0.06321115, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_clip": 0.06264269, + "balance_loss_mlp": 0.01250645, + "epoch": 0.6995340447918232, + "flos": 59147931438720.0, + "grad_norm": 0.6717939190194797, + "language_loss": 0.53298014, + "learning_rate": 8.744710743350412e-07, + "loss": 0.6087091, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.0113678, + "step": 11635, + "time_per_iteration": 3.2486236095428467 + }, + { + "auxiliary_loss_clip": 0.06412114, + "auxiliary_loss_mlp": 0.01264348, + "balance_loss_clip": 0.06275758, + "balance_loss_mlp": 0.01253631, + "epoch": 0.6995941680444913, + "flos": 17973653806080.0, + "grad_norm": 1.479923932232007, + "language_loss": 0.8206256, + "learning_rate": 8.741491599138726e-07, + "loss": 0.89739013, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.1071167, + "step": 11636, + "time_per_iteration": 2.516813039779663 + }, + { + "auxiliary_loss_clip": 0.06416257, + "auxiliary_loss_mlp": 0.01266147, + "balance_loss_clip": 0.06278151, + "balance_loss_mlp": 0.01255722, + "epoch": 0.6996542912971592, + "flos": 21986391911040.0, + "grad_norm": 3.1669516008633813, + "language_loss": 0.83141685, + "learning_rate": 8.738272881850801e-07, + "loss": 0.90824091, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10430908, + "step": 11637, + "time_per_iteration": 3.917647123336792 + }, + { + "auxiliary_loss_clip": 0.06409974, + "auxiliary_loss_mlp": 0.0126639, + "balance_loss_clip": 0.06274991, + "balance_loss_mlp": 0.0125584, + "epoch": 0.6997144145498272, + "flos": 11689904891520.0, + "grad_norm": 1.7413253088603204, + "language_loss": 0.68017536, + "learning_rate": 8.735054591608704e-07, + "loss": 0.75693905, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10559082, + "step": 11638, + "time_per_iteration": 2.455333709716797 + }, + { + "auxiliary_loss_clip": 0.06417674, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.06275746, + "balance_loss_mlp": 0.01255244, + "epoch": 0.6997745378024951, + "flos": 29614992456960.0, + "grad_norm": 1.8583897053492529, + "language_loss": 0.77953184, + "learning_rate": 8.731836728534459e-07, + "loss": 0.85638303, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.12200928, + "step": 11639, + "time_per_iteration": 2.5732390880584717 + }, + { + "auxiliary_loss_clip": 0.06415096, + "auxiliary_loss_mlp": 0.01267452, + "balance_loss_clip": 0.06277713, + "balance_loss_mlp": 0.01256842, + "epoch": 0.6998346610551631, + "flos": 20892912128640.0, + "grad_norm": 1.9224229885402988, + "language_loss": 0.83357054, + "learning_rate": 8.728619292750093e-07, + "loss": 0.91039604, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10601807, + "step": 11640, + "time_per_iteration": 2.518707275390625 + }, + { + "auxiliary_loss_clip": 0.06408644, + "auxiliary_loss_mlp": 0.01265078, + "balance_loss_clip": 0.06273933, + "balance_loss_mlp": 0.01255422, + "epoch": 0.699894784307831, + "flos": 27170539695360.0, + "grad_norm": 1.6039437808829469, + "language_loss": 0.75522578, + "learning_rate": 8.725402284377619e-07, + "loss": 0.83196306, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09655762, + "step": 11641, + "time_per_iteration": 4.078887701034546 + }, + { + "auxiliary_loss_clip": 0.06412257, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275941, + "balance_loss_mlp": 0.01256361, + "epoch": 0.699954907560499, + "flos": 20930032287360.0, + "grad_norm": 1.8680055959443465, + "language_loss": 0.77721083, + "learning_rate": 8.722185703539022e-07, + "loss": 0.85399896, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10192871, + "step": 11642, + "time_per_iteration": 2.500046730041504 + }, + { + "auxiliary_loss_clip": 0.0641754, + "auxiliary_loss_mlp": 0.01265471, + "balance_loss_clip": 0.06277227, + "balance_loss_mlp": 0.01253592, + "epoch": 0.700015030813167, + "flos": 28665339408000.0, + "grad_norm": 2.533169755671386, + "language_loss": 0.74393576, + "learning_rate": 8.718969550356266e-07, + "loss": 0.82076585, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.11883545, + "step": 11643, + "time_per_iteration": 2.5775840282440186 + }, + { + "auxiliary_loss_clip": 0.06414674, + "auxiliary_loss_mlp": 0.01264637, + "balance_loss_clip": 0.06276005, + "balance_loss_mlp": 0.01254362, + "epoch": 0.700075154065835, + "flos": 29212959767040.0, + "grad_norm": 1.5245425147272047, + "language_loss": 0.60040998, + "learning_rate": 8.715753824951315e-07, + "loss": 0.67720306, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.1027832, + "step": 11644, + "time_per_iteration": 2.552072286605835 + }, + { + "auxiliary_loss_clip": 0.06407935, + "auxiliary_loss_mlp": 0.01271385, + "balance_loss_clip": 0.06275052, + "balance_loss_mlp": 0.01260579, + "epoch": 0.7001352773185029, + "flos": 23119130131200.0, + "grad_norm": 1.5458952120749485, + "language_loss": 0.82132351, + "learning_rate": 8.712538527446119e-07, + "loss": 0.89811671, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10809326, + "step": 11645, + "time_per_iteration": 2.558337450027466 + }, + { + "auxiliary_loss_clip": 0.06407823, + "auxiliary_loss_mlp": 0.01266733, + "balance_loss_clip": 0.06274226, + "balance_loss_mlp": 0.01256743, + "epoch": 0.7001954005711709, + "flos": 21328962376320.0, + "grad_norm": 2.5779246493483177, + "language_loss": 0.68295795, + "learning_rate": 8.709323657962584e-07, + "loss": 0.75970346, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09985352, + "step": 11646, + "time_per_iteration": 2.5126430988311768 + }, + { + "auxiliary_loss_clip": 0.06410798, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.0125371, + "epoch": 0.7002555238238388, + "flos": 24542834054400.0, + "grad_norm": 1.467898418777351, + "language_loss": 0.71547973, + "learning_rate": 8.706109216622635e-07, + "loss": 0.7922281, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10339355, + "step": 11647, + "time_per_iteration": 2.5304250717163086 + }, + { + "auxiliary_loss_clip": 0.06414019, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01258041, + "epoch": 0.7003156470765068, + "flos": 39065891598720.0, + "grad_norm": 1.749288264158044, + "language_loss": 0.72289455, + "learning_rate": 8.702895203548155e-07, + "loss": 0.79972911, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1138916, + "step": 11648, + "time_per_iteration": 2.678863525390625 + }, + { + "auxiliary_loss_clip": 0.06409213, + "auxiliary_loss_mlp": 0.01267629, + "balance_loss_clip": 0.06275574, + "balance_loss_mlp": 0.01257377, + "epoch": 0.7003757703291749, + "flos": 28811723690880.0, + "grad_norm": 1.4492190580209505, + "language_loss": 0.77860492, + "learning_rate": 8.699681618861014e-07, + "loss": 0.85537332, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10247803, + "step": 11649, + "time_per_iteration": 2.558931589126587 + }, + { + "auxiliary_loss_clip": 0.06409431, + "auxiliary_loss_mlp": 0.01267142, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257421, + "epoch": 0.7004358935818428, + "flos": 15958123695360.0, + "grad_norm": 1.4433792721312992, + "language_loss": 0.78238451, + "learning_rate": 8.69646846268308e-07, + "loss": 0.85915029, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 11650, + "time_per_iteration": 2.461639642715454 + }, + { + "auxiliary_loss_clip": 0.06409653, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06273135, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7004960168345108, + "flos": 20418148494720.0, + "grad_norm": 2.0802744101319406, + "language_loss": 0.78669983, + "learning_rate": 8.693255735136194e-07, + "loss": 0.86345768, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09881592, + "step": 11651, + "time_per_iteration": 2.500000238418579 + }, + { + "auxiliary_loss_clip": 0.06420258, + "auxiliary_loss_mlp": 0.01269045, + "balance_loss_clip": 0.06280224, + "balance_loss_mlp": 0.01258649, + "epoch": 0.7005561400871787, + "flos": 17353260576000.0, + "grad_norm": 1.5099151755448044, + "language_loss": 0.70310026, + "learning_rate": 8.690043436342198e-07, + "loss": 0.7799933, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10388184, + "step": 11652, + "time_per_iteration": 2.4739015102386475 + }, + { + "auxiliary_loss_clip": 0.06413841, + "auxiliary_loss_mlp": 0.01263486, + "balance_loss_clip": 0.06277132, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7006162633398467, + "flos": 25309276151040.0, + "grad_norm": 1.323517960695476, + "language_loss": 0.74456298, + "learning_rate": 8.686831566422874e-07, + "loss": 0.82133621, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.0982666, + "step": 11653, + "time_per_iteration": 2.532655954360962 + }, + { + "auxiliary_loss_clip": 0.06417534, + "auxiliary_loss_mlp": 0.01263141, + "balance_loss_clip": 0.06278478, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7006763865925146, + "flos": 20675473983360.0, + "grad_norm": 2.0288883835732228, + "language_loss": 0.70729959, + "learning_rate": 8.68362012550003e-07, + "loss": 0.78410637, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10430908, + "step": 11654, + "time_per_iteration": 2.519660711288452 + }, + { + "auxiliary_loss_clip": 0.06415437, + "auxiliary_loss_mlp": 0.0126811, + "balance_loss_clip": 0.06277716, + "balance_loss_mlp": 0.01256696, + "epoch": 0.7007365098451827, + "flos": 20052439349760.0, + "grad_norm": 2.2628281377067134, + "language_loss": 0.72993428, + "learning_rate": 8.680409113695453e-07, + "loss": 0.80676985, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11413574, + "step": 11655, + "time_per_iteration": 2.48612117767334 + }, + { + "auxiliary_loss_clip": 0.06424905, + "auxiliary_loss_mlp": 0.01271007, + "balance_loss_clip": 0.06280498, + "balance_loss_mlp": 0.01259062, + "epoch": 0.7007966330978506, + "flos": 20783689931520.0, + "grad_norm": 1.9221196897273614, + "language_loss": 0.70366073, + "learning_rate": 8.677198531130889e-07, + "loss": 0.78061986, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.11950684, + "step": 11656, + "time_per_iteration": 2.4856395721435547 + }, + { + "auxiliary_loss_clip": 0.06408404, + "auxiliary_loss_mlp": 0.01266899, + "balance_loss_clip": 0.06273983, + "balance_loss_mlp": 0.01257123, + "epoch": 0.7008567563505186, + "flos": 29645110800000.0, + "grad_norm": 1.5392970097639627, + "language_loss": 0.78185248, + "learning_rate": 8.673988377928092e-07, + "loss": 0.8586055, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09783936, + "step": 11657, + "time_per_iteration": 2.5812113285064697 + }, + { + "auxiliary_loss_clip": 0.06419835, + "auxiliary_loss_mlp": 0.01268196, + "balance_loss_clip": 0.06277259, + "balance_loss_mlp": 0.01257229, + "epoch": 0.7009168796031865, + "flos": 17097654096000.0, + "grad_norm": 2.227553712273129, + "language_loss": 0.78159571, + "learning_rate": 8.670778654208797e-07, + "loss": 0.85847604, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.10968018, + "step": 11658, + "time_per_iteration": 2.4778008460998535 + }, + { + "auxiliary_loss_clip": 0.0640991, + "auxiliary_loss_mlp": 0.0126385, + "balance_loss_clip": 0.06276852, + "balance_loss_mlp": 0.01254099, + "epoch": 0.7009770028558545, + "flos": 20455226726400.0, + "grad_norm": 1.6635136984807588, + "language_loss": 0.83274609, + "learning_rate": 8.667569360094713e-07, + "loss": 0.90948367, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09747314, + "step": 11659, + "time_per_iteration": 2.4965016841888428 + }, + { + "auxiliary_loss_clip": 0.06406507, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01256296, + "epoch": 0.7010371261085224, + "flos": 19251225008640.0, + "grad_norm": 2.205019124031737, + "language_loss": 0.69561887, + "learning_rate": 8.664360495707526e-07, + "loss": 0.77234095, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09411621, + "step": 11660, + "time_per_iteration": 2.4827144145965576 + }, + { + "auxiliary_loss_clip": 0.06414962, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06275482, + "balance_loss_mlp": 0.01256134, + "epoch": 0.7010972493611904, + "flos": 22134159786240.0, + "grad_norm": 2.0869897578232295, + "language_loss": 0.81401628, + "learning_rate": 8.661152061168924e-07, + "loss": 0.89083141, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10412598, + "step": 11661, + "time_per_iteration": 3.9388158321380615 + }, + { + "auxiliary_loss_clip": 0.06407215, + "auxiliary_loss_mlp": 0.01264683, + "balance_loss_clip": 0.06272362, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7011573726138585, + "flos": 31398619593600.0, + "grad_norm": 1.8643289831680394, + "language_loss": 0.79429448, + "learning_rate": 8.657944056600579e-07, + "loss": 0.87101352, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10137939, + "step": 11662, + "time_per_iteration": 2.6265618801116943 + }, + { + "auxiliary_loss_clip": 0.06416287, + "auxiliary_loss_mlp": 0.01267119, + "balance_loss_clip": 0.06277344, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7012174958665264, + "flos": 18156487415040.0, + "grad_norm": 1.6800388441509395, + "language_loss": 0.83806753, + "learning_rate": 8.654736482124134e-07, + "loss": 0.91490161, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10821533, + "step": 11663, + "time_per_iteration": 2.488739252090454 + }, + { + "auxiliary_loss_clip": 0.06318727, + "auxiliary_loss_mlp": 0.01250759, + "balance_loss_clip": 0.06262303, + "balance_loss_mlp": 0.012494, + "epoch": 0.7012776191191944, + "flos": 60669495331200.0, + "grad_norm": 0.8224381055881935, + "language_loss": 0.5391866, + "learning_rate": 8.651529337861209e-07, + "loss": 0.6148814, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.56591797, + "router_z_loss_mlp": 0.01361084, + "step": 11664, + "time_per_iteration": 3.160693645477295 + }, + { + "auxiliary_loss_clip": 0.06413987, + "auxiliary_loss_mlp": 0.01267114, + "balance_loss_clip": 0.06275371, + "balance_loss_mlp": 0.01256731, + "epoch": 0.7013377423718623, + "flos": 27205940845440.0, + "grad_norm": 1.7370315255440756, + "language_loss": 0.79090619, + "learning_rate": 8.64832262393344e-07, + "loss": 0.86771721, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1038208, + "step": 11665, + "time_per_iteration": 2.5398123264312744 + }, + { + "auxiliary_loss_clip": 0.06412809, + "auxiliary_loss_mlp": 0.01262516, + "balance_loss_clip": 0.06277609, + "balance_loss_mlp": 0.01252563, + "epoch": 0.7013978656245303, + "flos": 16548901706880.0, + "grad_norm": 2.00554211734292, + "language_loss": 0.76867342, + "learning_rate": 8.645116340462404e-07, + "loss": 0.84542668, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09954834, + "step": 11666, + "time_per_iteration": 2.4652414321899414 + }, + { + "auxiliary_loss_clip": 0.0641577, + "auxiliary_loss_mlp": 0.01267108, + "balance_loss_clip": 0.06279963, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7014579888771982, + "flos": 23149625817600.0, + "grad_norm": 1.7866180274258885, + "language_loss": 0.81048751, + "learning_rate": 8.641910487569695e-07, + "loss": 0.88731629, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10388184, + "step": 11667, + "time_per_iteration": 2.5062241554260254 + }, + { + "auxiliary_loss_clip": 0.06409969, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275474, + "balance_loss_mlp": 0.01255917, + "epoch": 0.7015181121298663, + "flos": 25089028894080.0, + "grad_norm": 2.0567499658134087, + "language_loss": 0.65901959, + "learning_rate": 8.638705065376879e-07, + "loss": 0.73578274, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10443115, + "step": 11668, + "time_per_iteration": 2.6001944541931152 + }, + { + "auxiliary_loss_clip": 0.06415643, + "auxiliary_loss_mlp": 0.01266119, + "balance_loss_clip": 0.06275932, + "balance_loss_mlp": 0.01255248, + "epoch": 0.7015782353825342, + "flos": 23334052654080.0, + "grad_norm": 1.636860913695636, + "language_loss": 0.76856339, + "learning_rate": 8.635500074005519e-07, + "loss": 0.84538102, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10870361, + "step": 11669, + "time_per_iteration": 2.580120801925659 + }, + { + "auxiliary_loss_clip": 0.06316374, + "auxiliary_loss_mlp": 0.01249475, + "balance_loss_clip": 0.06259722, + "balance_loss_mlp": 0.01248101, + "epoch": 0.7016383586352022, + "flos": 70417733086080.0, + "grad_norm": 0.683633883002792, + "language_loss": 0.54477966, + "learning_rate": 8.632295513577122e-07, + "loss": 0.62043816, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01376343, + "step": 11670, + "time_per_iteration": 3.239391565322876 + }, + { + "auxiliary_loss_clip": 0.06410887, + "auxiliary_loss_mlp": 0.01266693, + "balance_loss_clip": 0.06276417, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7016984818878701, + "flos": 19798426097280.0, + "grad_norm": 1.5820465602747873, + "language_loss": 0.81851846, + "learning_rate": 8.629091384213218e-07, + "loss": 0.89529431, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10247803, + "step": 11671, + "time_per_iteration": 2.5156307220458984 + }, + { + "auxiliary_loss_clip": 0.06415814, + "auxiliary_loss_mlp": 0.01265108, + "balance_loss_clip": 0.06276827, + "balance_loss_mlp": 0.01254611, + "epoch": 0.7017586051405381, + "flos": 12901998528000.0, + "grad_norm": 1.7162410726978943, + "language_loss": 0.74825186, + "learning_rate": 8.625887686035313e-07, + "loss": 0.82506108, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10498047, + "step": 11672, + "time_per_iteration": 2.4657065868377686 + }, + { + "auxiliary_loss_clip": 0.064162, + "auxiliary_loss_mlp": 0.01267901, + "balance_loss_clip": 0.06278486, + "balance_loss_mlp": 0.01256922, + "epoch": 0.701818728393206, + "flos": 18338734045440.0, + "grad_norm": 1.6561114230567193, + "language_loss": 0.87079096, + "learning_rate": 8.622684419164883e-07, + "loss": 0.94763196, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10980225, + "step": 11673, + "time_per_iteration": 2.51084303855896 + }, + { + "auxiliary_loss_clip": 0.06411691, + "auxiliary_loss_mlp": 0.01268986, + "balance_loss_clip": 0.06277934, + "balance_loss_mlp": 0.01258502, + "epoch": 0.701878851645874, + "flos": 17389961464320.0, + "grad_norm": 1.7599431551764082, + "language_loss": 0.73397923, + "learning_rate": 8.619481583723399e-07, + "loss": 0.81078601, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10491943, + "step": 11674, + "time_per_iteration": 3.8845224380493164 + }, + { + "auxiliary_loss_clip": 0.06408197, + "auxiliary_loss_mlp": 0.01264811, + "balance_loss_clip": 0.06276836, + "balance_loss_mlp": 0.01255173, + "epoch": 0.701938974898542, + "flos": 23922398897280.0, + "grad_norm": 1.5893184098427633, + "language_loss": 0.72403145, + "learning_rate": 8.616279179832329e-07, + "loss": 0.80076146, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09631348, + "step": 11675, + "time_per_iteration": 2.535900115966797 + }, + { + "auxiliary_loss_clip": 0.06414977, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06276758, + "balance_loss_mlp": 0.01257047, + "epoch": 0.70199909815121, + "flos": 21801503877120.0, + "grad_norm": 2.0246464203601278, + "language_loss": 0.51067138, + "learning_rate": 8.613077207613078e-07, + "loss": 0.58750093, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.109375, + "step": 11676, + "time_per_iteration": 2.555906057357788 + }, + { + "auxiliary_loss_clip": 0.06319048, + "auxiliary_loss_mlp": 0.01249904, + "balance_loss_clip": 0.06262474, + "balance_loss_mlp": 0.01248563, + "epoch": 0.702059221403878, + "flos": 71736575224320.0, + "grad_norm": 0.7224738346499476, + "language_loss": 0.59202904, + "learning_rate": 8.609875667187079e-07, + "loss": 0.66771859, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01343536, + "step": 11677, + "time_per_iteration": 4.580153942108154 + }, + { + "auxiliary_loss_clip": 0.06413269, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06275491, + "balance_loss_mlp": 0.0125582, + "epoch": 0.7021193446565459, + "flos": 28118599516800.0, + "grad_norm": 1.944945343813431, + "language_loss": 0.6293093, + "learning_rate": 8.606674558675737e-07, + "loss": 0.70610511, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 11678, + "time_per_iteration": 2.652944803237915 + }, + { + "auxiliary_loss_clip": 0.06410077, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06276654, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7021794679092139, + "flos": 22930720225920.0, + "grad_norm": 1.5864608475530155, + "language_loss": 0.7993412, + "learning_rate": 8.603473882200444e-07, + "loss": 0.87608963, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09991455, + "step": 11679, + "time_per_iteration": 2.517608404159546 + }, + { + "auxiliary_loss_clip": 0.06410368, + "auxiliary_loss_mlp": 0.01263633, + "balance_loss_clip": 0.06277052, + "balance_loss_mlp": 0.01254615, + "epoch": 0.7022395911618818, + "flos": 18083756471040.0, + "grad_norm": 2.1970830940848614, + "language_loss": 0.70462888, + "learning_rate": 8.600273637882567e-07, + "loss": 0.78136891, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09014893, + "step": 11680, + "time_per_iteration": 2.4937846660614014 + }, + { + "auxiliary_loss_clip": 0.06416643, + "auxiliary_loss_mlp": 0.01267202, + "balance_loss_clip": 0.06276958, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7022997144145499, + "flos": 16039827025920.0, + "grad_norm": 1.5993399056299638, + "language_loss": 0.74800062, + "learning_rate": 8.597073825843446e-07, + "loss": 0.82483912, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10791016, + "step": 11681, + "time_per_iteration": 3.912652015686035 + }, + { + "auxiliary_loss_clip": 0.06407465, + "auxiliary_loss_mlp": 0.01264961, + "balance_loss_clip": 0.06273095, + "balance_loss_mlp": 0.01254536, + "epoch": 0.7023598376672178, + "flos": 26475864220800.0, + "grad_norm": 1.529501150189484, + "language_loss": 0.77074146, + "learning_rate": 8.593874446204434e-07, + "loss": 0.84746575, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10424805, + "step": 11682, + "time_per_iteration": 2.5244510173797607 + }, + { + "auxiliary_loss_clip": 0.06414787, + "auxiliary_loss_mlp": 0.01267242, + "balance_loss_clip": 0.06274539, + "balance_loss_mlp": 0.01255625, + "epoch": 0.7024199609198858, + "flos": 17061624040320.0, + "grad_norm": 2.0146711656624947, + "language_loss": 0.73610115, + "learning_rate": 8.590675499086841e-07, + "loss": 0.81292146, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11627197, + "step": 11683, + "time_per_iteration": 2.4807722568511963 + }, + { + "auxiliary_loss_clip": 0.06412771, + "auxiliary_loss_mlp": 0.01265673, + "balance_loss_clip": 0.06278127, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7024800841725537, + "flos": 25856225677440.0, + "grad_norm": 1.8616488886702496, + "language_loss": 0.7201761, + "learning_rate": 8.587476984611976e-07, + "loss": 0.79696059, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.11212158, + "step": 11684, + "time_per_iteration": 2.5248489379882812 + }, + { + "auxiliary_loss_clip": 0.06409675, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257741, + "epoch": 0.7025402074252217, + "flos": 23519653447680.0, + "grad_norm": 2.2560693638667386, + "language_loss": 0.72109079, + "learning_rate": 8.584278902901128e-07, + "loss": 0.79787153, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10668945, + "step": 11685, + "time_per_iteration": 2.5545883178710938 + }, + { + "auxiliary_loss_clip": 0.06411938, + "auxiliary_loss_mlp": 0.01264141, + "balance_loss_clip": 0.06274469, + "balance_loss_mlp": 0.01254021, + "epoch": 0.7026003306778896, + "flos": 20156169104640.0, + "grad_norm": 1.6059462262520903, + "language_loss": 0.8497479, + "learning_rate": 8.581081254075582e-07, + "loss": 0.92650867, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10119629, + "step": 11686, + "time_per_iteration": 2.4869866371154785 + }, + { + "auxiliary_loss_clip": 0.06311645, + "auxiliary_loss_mlp": 0.01250458, + "balance_loss_clip": 0.06255314, + "balance_loss_mlp": 0.01249239, + "epoch": 0.7026604539305576, + "flos": 64791036362880.0, + "grad_norm": 0.9748591985428325, + "language_loss": 0.6989513, + "learning_rate": 8.577884038256566e-07, + "loss": 0.77457231, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01217651, + "step": 11687, + "time_per_iteration": 3.2795140743255615 + }, + { + "auxiliary_loss_clip": 0.06411874, + "auxiliary_loss_mlp": 0.01269631, + "balance_loss_clip": 0.06276284, + "balance_loss_mlp": 0.01259421, + "epoch": 0.7027205771832256, + "flos": 21877882473600.0, + "grad_norm": 2.1687744057978575, + "language_loss": 0.7759158, + "learning_rate": 8.574687255565329e-07, + "loss": 0.85273087, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10205078, + "step": 11688, + "time_per_iteration": 2.506697416305542 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01263217, + "balance_loss_clip": 0.06273778, + "balance_loss_mlp": 0.0125287, + "epoch": 0.7027807004358936, + "flos": 23374526976000.0, + "grad_norm": 2.0500924601059687, + "language_loss": 0.69007778, + "learning_rate": 8.571490906123107e-07, + "loss": 0.76680183, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10339355, + "step": 11689, + "time_per_iteration": 2.526963472366333 + }, + { + "auxiliary_loss_clip": 0.06412712, + "auxiliary_loss_mlp": 0.01267707, + "balance_loss_clip": 0.0627338, + "balance_loss_mlp": 0.01255834, + "epoch": 0.7028408236885616, + "flos": 15309624620160.0, + "grad_norm": 2.4528764604041977, + "language_loss": 0.79761183, + "learning_rate": 8.568294990051086e-07, + "loss": 0.87441605, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11871338, + "step": 11690, + "time_per_iteration": 2.5314319133758545 + }, + { + "auxiliary_loss_clip": 0.06412818, + "auxiliary_loss_mlp": 0.01269418, + "balance_loss_clip": 0.06277384, + "balance_loss_mlp": 0.01258677, + "epoch": 0.7029009469412295, + "flos": 22024769880960.0, + "grad_norm": 1.8333973382314617, + "language_loss": 0.75588238, + "learning_rate": 8.56509950747047e-07, + "loss": 0.83270478, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11691, + "time_per_iteration": 2.5446360111236572 + }, + { + "auxiliary_loss_clip": 0.06412929, + "auxiliary_loss_mlp": 0.01264486, + "balance_loss_clip": 0.06278588, + "balance_loss_mlp": 0.0125449, + "epoch": 0.7029610701938975, + "flos": 21842020126080.0, + "grad_norm": 1.7290780486458988, + "language_loss": 0.81951666, + "learning_rate": 8.561904458502429e-07, + "loss": 0.89629078, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09997559, + "step": 11692, + "time_per_iteration": 2.475939989089966 + }, + { + "auxiliary_loss_clip": 0.06407632, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01253577, + "epoch": 0.7030211934465654, + "flos": 19141709322240.0, + "grad_norm": 1.4786815492141234, + "language_loss": 0.76637983, + "learning_rate": 8.558709843268111e-07, + "loss": 0.84310281, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11090088, + "step": 11693, + "time_per_iteration": 2.523207664489746 + }, + { + "auxiliary_loss_clip": 0.06409247, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 0.06274758, + "balance_loss_mlp": 0.01256959, + "epoch": 0.7030813166992335, + "flos": 38555307544320.0, + "grad_norm": 3.0680910714990945, + "language_loss": 0.685, + "learning_rate": 8.55551566188866e-07, + "loss": 0.76176739, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10522461, + "step": 11694, + "time_per_iteration": 2.6671559810638428 + }, + { + "auxiliary_loss_clip": 0.06413712, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06276645, + "balance_loss_mlp": 0.01255105, + "epoch": 0.7031414399519014, + "flos": 14726225767680.0, + "grad_norm": 2.01117706312431, + "language_loss": 0.75637174, + "learning_rate": 8.552321914485203e-07, + "loss": 0.83316225, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10235596, + "step": 11695, + "time_per_iteration": 2.508373975753784 + }, + { + "auxiliary_loss_clip": 0.0642024, + "auxiliary_loss_mlp": 0.01270249, + "balance_loss_clip": 0.06280233, + "balance_loss_mlp": 0.01258644, + "epoch": 0.7032015632045694, + "flos": 14032388833920.0, + "grad_norm": 1.954001814184471, + "language_loss": 0.74258196, + "learning_rate": 8.549128601178852e-07, + "loss": 0.81948686, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11602783, + "step": 11696, + "time_per_iteration": 2.4646289348602295 + }, + { + "auxiliary_loss_clip": 0.06413354, + "auxiliary_loss_mlp": 0.01266085, + "balance_loss_clip": 0.06275193, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7032616864572373, + "flos": 27644716350720.0, + "grad_norm": 7.188542829701478, + "language_loss": 0.75876927, + "learning_rate": 8.545935722090693e-07, + "loss": 0.83556366, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11108398, + "step": 11697, + "time_per_iteration": 2.564423084259033 + }, + { + "auxiliary_loss_clip": 0.06411704, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 0.06273724, + "balance_loss_mlp": 0.01257508, + "epoch": 0.7033218097099053, + "flos": 17973024900480.0, + "grad_norm": 1.6931225387398507, + "language_loss": 0.80683148, + "learning_rate": 8.542743277341793e-07, + "loss": 0.88363487, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11126709, + "step": 11698, + "time_per_iteration": 2.4535627365112305 + }, + { + "auxiliary_loss_clip": 0.0641105, + "auxiliary_loss_mlp": 0.01266224, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.01255239, + "epoch": 0.7033819329625732, + "flos": 19508047372800.0, + "grad_norm": 1.3566537423348073, + "language_loss": 0.84644032, + "learning_rate": 8.539551267053222e-07, + "loss": 0.92321312, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10974121, + "step": 11699, + "time_per_iteration": 2.5543456077575684 + }, + { + "auxiliary_loss_clip": 0.06408502, + "auxiliary_loss_mlp": 0.01265387, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7034420562152413, + "flos": 23994417081600.0, + "grad_norm": 1.970773248623371, + "language_loss": 0.7962184, + "learning_rate": 8.53635969134601e-07, + "loss": 0.87295729, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10760498, + "step": 11700, + "time_per_iteration": 2.4985594749450684 + }, + { + "auxiliary_loss_clip": 0.06412737, + "auxiliary_loss_mlp": 0.01264767, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01253507, + "epoch": 0.7035021794679092, + "flos": 35052147244800.0, + "grad_norm": 1.812061465534113, + "language_loss": 0.74477667, + "learning_rate": 8.533168550341186e-07, + "loss": 0.82155174, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11254883, + "step": 11701, + "time_per_iteration": 4.042437314987183 + }, + { + "auxiliary_loss_clip": 0.064155, + "auxiliary_loss_mlp": 0.01264422, + "balance_loss_clip": 0.06275072, + "balance_loss_mlp": 0.01253246, + "epoch": 0.7035623027205772, + "flos": 11001811962240.0, + "grad_norm": 2.072031067866928, + "language_loss": 0.83952713, + "learning_rate": 8.529977844159769e-07, + "loss": 0.91632634, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.11175537, + "step": 11702, + "time_per_iteration": 2.5586178302764893 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01264208, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7036224259732452, + "flos": 23630594653440.0, + "grad_norm": 1.6523267572786273, + "language_loss": 0.61088848, + "learning_rate": 8.526787572922738e-07, + "loss": 0.68764472, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1038208, + "step": 11703, + "time_per_iteration": 2.521512985229492 + }, + { + "auxiliary_loss_clip": 0.06413552, + "auxiliary_loss_mlp": 0.01266937, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01255869, + "epoch": 0.7036825492259131, + "flos": 31694239198080.0, + "grad_norm": 1.8799008475861942, + "language_loss": 0.61646456, + "learning_rate": 8.523597736751067e-07, + "loss": 0.69326943, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11065674, + "step": 11704, + "time_per_iteration": 2.637000560760498 + }, + { + "auxiliary_loss_clip": 0.06406493, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 0.06273523, + "balance_loss_mlp": 0.01258109, + "epoch": 0.7037426724785811, + "flos": 30201116567040.0, + "grad_norm": 1.5166852635712837, + "language_loss": 0.70736712, + "learning_rate": 8.520408335765719e-07, + "loss": 0.78410971, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09667969, + "step": 11705, + "time_per_iteration": 2.5815892219543457 + }, + { + "auxiliary_loss_clip": 0.06409339, + "auxiliary_loss_mlp": 0.01265192, + "balance_loss_clip": 0.06274589, + "balance_loss_mlp": 0.01254833, + "epoch": 0.703802795731249, + "flos": 24317597479680.0, + "grad_norm": 1.8692688199911445, + "language_loss": 0.61916155, + "learning_rate": 8.517219370087645e-07, + "loss": 0.69590688, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10351562, + "step": 11706, + "time_per_iteration": 2.537567615509033 + }, + { + "auxiliary_loss_clip": 0.06410844, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06273291, + "balance_loss_mlp": 0.01254061, + "epoch": 0.7038629189839171, + "flos": 22535605497600.0, + "grad_norm": 2.4391424281987506, + "language_loss": 0.68479651, + "learning_rate": 8.514030839837756e-07, + "loss": 0.76156104, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11560059, + "step": 11707, + "time_per_iteration": 2.4984869956970215 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06272735, + "balance_loss_mlp": 0.01257101, + "epoch": 0.703923042236585, + "flos": 26257755242880.0, + "grad_norm": 1.9008341016793249, + "language_loss": 0.76335013, + "learning_rate": 8.510842745136974e-07, + "loss": 0.84009075, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10058594, + "step": 11708, + "time_per_iteration": 2.552219867706299 + }, + { + "auxiliary_loss_clip": 0.06407606, + "auxiliary_loss_mlp": 0.01261422, + "balance_loss_clip": 0.06274488, + "balance_loss_mlp": 0.01251313, + "epoch": 0.703983165489253, + "flos": 19396225699200.0, + "grad_norm": 1.582678176456311, + "language_loss": 0.7205376, + "learning_rate": 8.50765508610619e-07, + "loss": 0.79722786, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10107422, + "step": 11709, + "time_per_iteration": 2.479956865310669 + }, + { + "auxiliary_loss_clip": 0.06409952, + "auxiliary_loss_mlp": 0.01266177, + "balance_loss_clip": 0.06274274, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7040432887419209, + "flos": 16688032611840.0, + "grad_norm": 1.9337929130323093, + "language_loss": 0.79638529, + "learning_rate": 8.504467862866267e-07, + "loss": 0.87314653, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10076904, + "step": 11710, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.06415999, + "auxiliary_loss_mlp": 0.0126626, + "balance_loss_clip": 0.06278241, + "balance_loss_mlp": 0.01255674, + "epoch": 0.7041034119945889, + "flos": 21147638140800.0, + "grad_norm": 1.663598845140954, + "language_loss": 0.77776545, + "learning_rate": 8.501281075538076e-07, + "loss": 0.85458803, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.105896, + "step": 11711, + "time_per_iteration": 2.500640392303467 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01265823, + "balance_loss_clip": 0.06276608, + "balance_loss_mlp": 0.01255237, + "epoch": 0.7041635352472568, + "flos": 16916036371200.0, + "grad_norm": 1.9928632293831094, + "language_loss": 0.7447651, + "learning_rate": 8.498094724242457e-07, + "loss": 0.82153022, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10583496, + "step": 11712, + "time_per_iteration": 2.501585006713867 + }, + { + "auxiliary_loss_clip": 0.06320854, + "auxiliary_loss_mlp": 0.01257118, + "balance_loss_clip": 0.06264362, + "balance_loss_mlp": 0.01255823, + "epoch": 0.7042236584999249, + "flos": 71703186572160.0, + "grad_norm": 0.8590002483868424, + "language_loss": 0.64672804, + "learning_rate": 8.494908809100247e-07, + "loss": 0.72250772, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01295471, + "step": 11713, + "time_per_iteration": 4.5734851360321045 + }, + { + "auxiliary_loss_clip": 0.06410141, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 0.06274079, + "balance_loss_mlp": 0.01252991, + "epoch": 0.7042837817525928, + "flos": 28665800605440.0, + "grad_norm": 1.9680516689018257, + "language_loss": 0.72915512, + "learning_rate": 8.49172333023225e-07, + "loss": 0.80588698, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1005249, + "step": 11714, + "time_per_iteration": 2.5535781383514404 + }, + { + "auxiliary_loss_clip": 0.06411086, + "auxiliary_loss_mlp": 0.01268594, + "balance_loss_clip": 0.06275805, + "balance_loss_mlp": 0.01256757, + "epoch": 0.7043439050052608, + "flos": 19759335367680.0, + "grad_norm": 2.3616586102145805, + "language_loss": 0.80244958, + "learning_rate": 8.488538287759248e-07, + "loss": 0.87924635, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11828613, + "step": 11715, + "time_per_iteration": 2.4991419315338135 + }, + { + "auxiliary_loss_clip": 0.06414278, + "auxiliary_loss_mlp": 0.01267093, + "balance_loss_clip": 0.0627607, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7044040282579288, + "flos": 11541969308160.0, + "grad_norm": 1.9765202948162532, + "language_loss": 0.71383488, + "learning_rate": 8.485353681802037e-07, + "loss": 0.79064858, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10931396, + "step": 11716, + "time_per_iteration": 3.9245705604553223 + }, + { + "auxiliary_loss_clip": 0.06418915, + "auxiliary_loss_mlp": 0.01264541, + "balance_loss_clip": 0.06277251, + "balance_loss_mlp": 0.01253783, + "epoch": 0.7044641515105967, + "flos": 33664473377280.0, + "grad_norm": 1.7730534730356675, + "language_loss": 0.66482782, + "learning_rate": 8.482169512481358e-07, + "loss": 0.74166238, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.10760498, + "step": 11717, + "time_per_iteration": 2.6029398441314697 + }, + { + "auxiliary_loss_clip": 0.06415347, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0627737, + "balance_loss_mlp": 0.01256011, + "epoch": 0.7045242747632647, + "flos": 26731051430400.0, + "grad_norm": 1.5043477958415044, + "language_loss": 0.74609149, + "learning_rate": 8.478985779917967e-07, + "loss": 0.82290918, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 11718, + "time_per_iteration": 2.574075937271118 + }, + { + "auxiliary_loss_clip": 0.06412348, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06277113, + "balance_loss_mlp": 0.01254224, + "epoch": 0.7045843980159326, + "flos": 26804998258560.0, + "grad_norm": 1.5984477962629227, + "language_loss": 0.80229437, + "learning_rate": 8.475802484232606e-07, + "loss": 0.8790642, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10412598, + "step": 11719, + "time_per_iteration": 2.557602643966675 + }, + { + "auxiliary_loss_clip": 0.0641358, + "auxiliary_loss_mlp": 0.01263485, + "balance_loss_clip": 0.06277666, + "balance_loss_mlp": 0.01252524, + "epoch": 0.7046445212686007, + "flos": 41584710458880.0, + "grad_norm": 1.6868566975802164, + "language_loss": 0.65635586, + "learning_rate": 8.472619625545951e-07, + "loss": 0.73312646, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10961914, + "step": 11720, + "time_per_iteration": 4.092779159545898 + }, + { + "auxiliary_loss_clip": 0.06422915, + "auxiliary_loss_mlp": 0.01266179, + "balance_loss_clip": 0.06280062, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7047046445212686, + "flos": 15565650370560.0, + "grad_norm": 2.147768548041585, + "language_loss": 0.8022362, + "learning_rate": 8.46943720397872e-07, + "loss": 0.87912714, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.10986328, + "step": 11721, + "time_per_iteration": 2.4634041786193848 + }, + { + "auxiliary_loss_clip": 0.06318594, + "auxiliary_loss_mlp": 0.01253531, + "balance_loss_clip": 0.06262027, + "balance_loss_mlp": 0.01252384, + "epoch": 0.7047647677739366, + "flos": 70433036455680.0, + "grad_norm": 0.7472916144331851, + "language_loss": 0.64821076, + "learning_rate": 8.466255219651582e-07, + "loss": 0.72393203, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.01146698, + "step": 11722, + "time_per_iteration": 3.2447893619537354 + }, + { + "auxiliary_loss_clip": 0.06410772, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06275559, + "balance_loss_mlp": 0.01253536, + "epoch": 0.7048248910266045, + "flos": 23666876271360.0, + "grad_norm": 2.268842508315268, + "language_loss": 0.66067719, + "learning_rate": 8.463073672685211e-07, + "loss": 0.73742604, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10571289, + "step": 11723, + "time_per_iteration": 2.556645154953003 + }, + { + "auxiliary_loss_clip": 0.06413794, + "auxiliary_loss_mlp": 0.01263861, + "balance_loss_clip": 0.06275541, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7048850142792725, + "flos": 21403496183040.0, + "grad_norm": 1.9667058211108481, + "language_loss": 0.80938751, + "learning_rate": 8.459892563200235e-07, + "loss": 0.88616407, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11151123, + "step": 11724, + "time_per_iteration": 2.521294116973877 + }, + { + "auxiliary_loss_clip": 0.06412652, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01252619, + "epoch": 0.7049451375319404, + "flos": 21653736001920.0, + "grad_norm": 1.878825511688235, + "language_loss": 0.73036087, + "learning_rate": 8.456711891317296e-07, + "loss": 0.80711973, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10620117, + "step": 11725, + "time_per_iteration": 2.491532325744629 + }, + { + "auxiliary_loss_clip": 0.06419054, + "auxiliary_loss_mlp": 0.01266944, + "balance_loss_clip": 0.06278444, + "balance_loss_mlp": 0.01256275, + "epoch": 0.7050052607846085, + "flos": 14872148853120.0, + "grad_norm": 1.93227359409925, + "language_loss": 0.78747177, + "learning_rate": 8.453531657156998e-07, + "loss": 0.86433172, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10668945, + "step": 11726, + "time_per_iteration": 2.625894069671631 + }, + { + "auxiliary_loss_clip": 0.06411958, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 0.06275987, + "balance_loss_mlp": 0.01258273, + "epoch": 0.7050653840372764, + "flos": 19247283866880.0, + "grad_norm": 2.1540780661141374, + "language_loss": 0.70452571, + "learning_rate": 8.450351860839931e-07, + "loss": 0.78133154, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10351562, + "step": 11727, + "time_per_iteration": 2.540519952774048 + }, + { + "auxiliary_loss_clip": 0.06403094, + "auxiliary_loss_mlp": 0.01263675, + "balance_loss_clip": 0.0627404, + "balance_loss_mlp": 0.01254752, + "epoch": 0.7051255072899444, + "flos": 27787536835200.0, + "grad_norm": 1.531115099301347, + "language_loss": 0.69006073, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7667284, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08917236, + "step": 11728, + "time_per_iteration": 2.5793302059173584 + }, + { + "auxiliary_loss_clip": 0.06412704, + "auxiliary_loss_mlp": 0.0126399, + "balance_loss_clip": 0.06276618, + "balance_loss_mlp": 0.01253773, + "epoch": 0.7051856305426124, + "flos": 27899526216960.0, + "grad_norm": 1.8133071590962522, + "language_loss": 0.73397171, + "learning_rate": 8.443993582217803e-07, + "loss": 0.81073868, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10211182, + "step": 11729, + "time_per_iteration": 2.632077693939209 + }, + { + "auxiliary_loss_clip": 0.06421916, + "auxiliary_loss_mlp": 0.01265278, + "balance_loss_clip": 0.06277753, + "balance_loss_mlp": 0.01253775, + "epoch": 0.7052457537952803, + "flos": 25050147799680.0, + "grad_norm": 1.613038649768226, + "language_loss": 0.78167063, + "learning_rate": 8.440815100153862e-07, + "loss": 0.8585425, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1149292, + "step": 11730, + "time_per_iteration": 2.5648131370544434 + }, + { + "auxiliary_loss_clip": 0.06414882, + "auxiliary_loss_mlp": 0.01268388, + "balance_loss_clip": 0.06275609, + "balance_loss_mlp": 0.0125698, + "epoch": 0.7053058770479483, + "flos": 21878175962880.0, + "grad_norm": 2.325298368428052, + "language_loss": 0.62874782, + "learning_rate": 8.437637056415359e-07, + "loss": 0.70558047, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11401367, + "step": 11731, + "time_per_iteration": 2.546156167984009 + }, + { + "auxiliary_loss_clip": 0.06416281, + "auxiliary_loss_mlp": 0.01270278, + "balance_loss_clip": 0.06275978, + "balance_loss_mlp": 0.01258679, + "epoch": 0.7053660003006162, + "flos": 16404236432640.0, + "grad_norm": 1.9339047251972874, + "language_loss": 0.74811733, + "learning_rate": 8.434459451122815e-07, + "loss": 0.82498294, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1159668, + "step": 11732, + "time_per_iteration": 2.4927430152893066 + }, + { + "auxiliary_loss_clip": 0.06408133, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.06274602, + "balance_loss_mlp": 0.01256534, + "epoch": 0.7054261235532843, + "flos": 22718271398400.0, + "grad_norm": 1.4288707050417415, + "language_loss": 0.71580064, + "learning_rate": 8.431282284396735e-07, + "loss": 0.79254997, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1026001, + "step": 11733, + "time_per_iteration": 2.543832540512085 + }, + { + "auxiliary_loss_clip": 0.06411871, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 0.06275688, + "balance_loss_mlp": 0.01258154, + "epoch": 0.7054862468059522, + "flos": 13594829212800.0, + "grad_norm": 1.9266065814345037, + "language_loss": 0.73917806, + "learning_rate": 8.428105556357583e-07, + "loss": 0.81597924, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10095215, + "step": 11734, + "time_per_iteration": 2.496680736541748 + }, + { + "auxiliary_loss_clip": 0.06421253, + "auxiliary_loss_mlp": 0.01273046, + "balance_loss_clip": 0.06277873, + "balance_loss_mlp": 0.0126184, + "epoch": 0.7055463700586202, + "flos": 15884931553920.0, + "grad_norm": 4.995085142451974, + "language_loss": 0.70442164, + "learning_rate": 8.424929267125829e-07, + "loss": 0.78136462, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.11206055, + "step": 11735, + "time_per_iteration": 2.560451030731201 + }, + { + "auxiliary_loss_clip": 0.06413963, + "auxiliary_loss_mlp": 0.01270144, + "balance_loss_clip": 0.06274843, + "balance_loss_mlp": 0.01257955, + "epoch": 0.7056064933112881, + "flos": 23082890440320.0, + "grad_norm": 1.6821797399985068, + "language_loss": 0.72724199, + "learning_rate": 8.421753416821933e-07, + "loss": 0.80408299, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.12182617, + "step": 11736, + "time_per_iteration": 2.5113935470581055 + }, + { + "auxiliary_loss_clip": 0.06410478, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06277382, + "balance_loss_mlp": 0.01254356, + "epoch": 0.7056666165639561, + "flos": 24063374592000.0, + "grad_norm": 1.617495345914111, + "language_loss": 0.69220245, + "learning_rate": 8.41857800556629e-07, + "loss": 0.7689532, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10235596, + "step": 11737, + "time_per_iteration": 2.5327107906341553 + }, + { + "auxiliary_loss_clip": 0.06416027, + "auxiliary_loss_mlp": 0.01265741, + "balance_loss_clip": 0.06277978, + "balance_loss_mlp": 0.01254279, + "epoch": 0.705726739816624, + "flos": 17498932099200.0, + "grad_norm": 1.8698204681752435, + "language_loss": 0.67921227, + "learning_rate": 8.415403033479332e-07, + "loss": 0.75602996, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11474609, + "step": 11738, + "time_per_iteration": 2.458019733428955 + }, + { + "auxiliary_loss_clip": 0.06411514, + "auxiliary_loss_mlp": 0.01264856, + "balance_loss_clip": 0.06274632, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7057868630692921, + "flos": 51361515256320.0, + "grad_norm": 7.975241590020644, + "language_loss": 0.74895537, + "learning_rate": 8.41222850068145e-07, + "loss": 0.82571906, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10784912, + "step": 11739, + "time_per_iteration": 2.7849392890930176 + }, + { + "auxiliary_loss_clip": 0.0641078, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.0627811, + "balance_loss_mlp": 0.01252663, + "epoch": 0.70584698632196, + "flos": 26109945440640.0, + "grad_norm": 1.5818256072351289, + "language_loss": 0.71794957, + "learning_rate": 8.409054407293032e-07, + "loss": 0.79468888, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10479736, + "step": 11740, + "time_per_iteration": 4.018102645874023 + }, + { + "auxiliary_loss_clip": 0.06408996, + "auxiliary_loss_mlp": 0.01270494, + "balance_loss_clip": 0.06274964, + "balance_loss_mlp": 0.01260939, + "epoch": 0.705907109574628, + "flos": 21549503122560.0, + "grad_norm": 1.4620628375932287, + "language_loss": 0.82029426, + "learning_rate": 8.405880753434434e-07, + "loss": 0.89708912, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09558105, + "step": 11741, + "time_per_iteration": 2.5226922035217285 + }, + { + "auxiliary_loss_clip": 0.06412125, + "auxiliary_loss_mlp": 0.0126669, + "balance_loss_clip": 0.06276572, + "balance_loss_mlp": 0.01255389, + "epoch": 0.705967232827296, + "flos": 22717432857600.0, + "grad_norm": 1.792685843416777, + "language_loss": 0.7848987, + "learning_rate": 8.402707539225993e-07, + "loss": 0.86168694, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11297607, + "step": 11742, + "time_per_iteration": 2.4881513118743896 + }, + { + "auxiliary_loss_clip": 0.06420448, + "auxiliary_loss_mlp": 0.01267345, + "balance_loss_clip": 0.06277722, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7060273560799639, + "flos": 28698266862720.0, + "grad_norm": 1.447375520003719, + "language_loss": 0.64323652, + "learning_rate": 8.39953476478805e-07, + "loss": 0.72011447, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.10919189, + "step": 11743, + "time_per_iteration": 2.5737526416778564 + }, + { + "auxiliary_loss_clip": 0.06413458, + "auxiliary_loss_mlp": 0.01269024, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257693, + "epoch": 0.7060874793326319, + "flos": 15711699237120.0, + "grad_norm": 1.7211358867446458, + "language_loss": 0.65871137, + "learning_rate": 8.396362430240902e-07, + "loss": 0.73553622, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11322021, + "step": 11744, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.06408134, + "auxiliary_loss_mlp": 0.01271135, + "balance_loss_clip": 0.06274446, + "balance_loss_mlp": 0.01260728, + "epoch": 0.7061476025852998, + "flos": 21513137650560.0, + "grad_norm": 2.025199572577618, + "language_loss": 0.63794267, + "learning_rate": 8.393190535704857e-07, + "loss": 0.71473539, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10394287, + "step": 11745, + "time_per_iteration": 2.52616810798645 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273259, + "balance_loss_mlp": 0.01253311, + "epoch": 0.7062077258379679, + "flos": 28189024473600.0, + "grad_norm": 1.8444242196367828, + "language_loss": 0.71914798, + "learning_rate": 8.390019081300188e-07, + "loss": 0.79588681, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10369873, + "step": 11746, + "time_per_iteration": 2.5588066577911377 + }, + { + "auxiliary_loss_clip": 0.06411352, + "auxiliary_loss_mlp": 0.01268167, + "balance_loss_clip": 0.06275406, + "balance_loss_mlp": 0.01257653, + "epoch": 0.7062678490906358, + "flos": 27860854757760.0, + "grad_norm": 1.5188195218955072, + "language_loss": 0.79773951, + "learning_rate": 8.386848067147175e-07, + "loss": 0.87453461, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10510254, + "step": 11747, + "time_per_iteration": 2.5661420822143555 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01264357, + "balance_loss_clip": 0.06277459, + "balance_loss_mlp": 0.01254307, + "epoch": 0.7063279723433038, + "flos": 23191483731840.0, + "grad_norm": 1.5251666611578065, + "language_loss": 0.65140951, + "learning_rate": 8.383677493366031e-07, + "loss": 0.72817194, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 11748, + "time_per_iteration": 2.5165350437164307 + }, + { + "auxiliary_loss_clip": 0.06412359, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06276652, + "balance_loss_mlp": 0.0125548, + "epoch": 0.7063880955959717, + "flos": 20194043950080.0, + "grad_norm": 1.8580174500745112, + "language_loss": 0.79421908, + "learning_rate": 8.380507360077003e-07, + "loss": 0.87100631, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10888672, + "step": 11749, + "time_per_iteration": 2.5304911136627197 + }, + { + "auxiliary_loss_clip": 0.06318866, + "auxiliary_loss_mlp": 0.01253368, + "balance_loss_clip": 0.06263049, + "balance_loss_mlp": 0.01252189, + "epoch": 0.7064482188486397, + "flos": 63685020395520.0, + "grad_norm": 0.7869711578789559, + "language_loss": 0.54065382, + "learning_rate": 8.377337667400304e-07, + "loss": 0.61637622, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01176453, + "step": 11750, + "time_per_iteration": 3.118065118789673 + }, + { + "auxiliary_loss_clip": 0.06410946, + "auxiliary_loss_mlp": 0.01265459, + "balance_loss_clip": 0.06275111, + "balance_loss_mlp": 0.01254623, + "epoch": 0.7065083421013076, + "flos": 25198125310080.0, + "grad_norm": 1.6339849961789776, + "language_loss": 0.78829509, + "learning_rate": 8.37416841545612e-07, + "loss": 0.86505914, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 11751, + "time_per_iteration": 2.5452511310577393 + }, + { + "auxiliary_loss_clip": 0.0640781, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06274024, + "balance_loss_mlp": 0.01256842, + "epoch": 0.7065684653539757, + "flos": 22900392247680.0, + "grad_norm": 1.6672445306420212, + "language_loss": 0.68168157, + "learning_rate": 8.370999604364634e-07, + "loss": 0.75842696, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09893799, + "step": 11752, + "time_per_iteration": 3.9393372535705566 + }, + { + "auxiliary_loss_clip": 0.06408882, + "auxiliary_loss_mlp": 0.01267025, + "balance_loss_clip": 0.06275536, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7066285886066436, + "flos": 23557025168640.0, + "grad_norm": 1.8022680768003871, + "language_loss": 0.76729679, + "learning_rate": 8.367831234246025e-07, + "loss": 0.84405589, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10083008, + "step": 11753, + "time_per_iteration": 2.5189971923828125 + }, + { + "auxiliary_loss_clip": 0.06404173, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06273716, + "balance_loss_mlp": 0.01255097, + "epoch": 0.7066887118593116, + "flos": 21075661883520.0, + "grad_norm": 1.4940357111697604, + "language_loss": 0.7128973, + "learning_rate": 8.364663305220405e-07, + "loss": 0.78959066, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10076904, + "step": 11754, + "time_per_iteration": 2.5660195350646973 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01267445, + "balance_loss_clip": 0.0627328, + "balance_loss_mlp": 0.01257491, + "epoch": 0.7067488351119796, + "flos": 21182284604160.0, + "grad_norm": 1.5428805294467156, + "language_loss": 0.89486808, + "learning_rate": 8.361495817407919e-07, + "loss": 0.97163951, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09960938, + "step": 11755, + "time_per_iteration": 2.507603883743286 + }, + { + "auxiliary_loss_clip": 0.06407668, + "auxiliary_loss_mlp": 0.01267402, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01257293, + "epoch": 0.7068089583646475, + "flos": 20455520215680.0, + "grad_norm": 1.4982614193498491, + "language_loss": 0.79735661, + "learning_rate": 8.358328770928678e-07, + "loss": 0.87410736, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10119629, + "step": 11756, + "time_per_iteration": 3.994943618774414 + }, + { + "auxiliary_loss_clip": 0.06321511, + "auxiliary_loss_mlp": 0.0125505, + "balance_loss_clip": 0.06265193, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7068690816173155, + "flos": 59125542399360.0, + "grad_norm": 0.8066454127458581, + "language_loss": 0.6018793, + "learning_rate": 8.355162165902785e-07, + "loss": 0.67764497, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01311493, + "step": 11757, + "time_per_iteration": 2.9342048168182373 + }, + { + "auxiliary_loss_clip": 0.06406799, + "auxiliary_loss_mlp": 0.01267209, + "balance_loss_clip": 0.06273741, + "balance_loss_mlp": 0.01256135, + "epoch": 0.7069292048699835, + "flos": 16256845900800.0, + "grad_norm": 2.1598051545702264, + "language_loss": 0.80614579, + "learning_rate": 8.351996002450307e-07, + "loss": 0.88288587, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11071777, + "step": 11758, + "time_per_iteration": 2.4969773292541504 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266896, + "balance_loss_clip": 0.06273986, + "balance_loss_mlp": 0.01256143, + "epoch": 0.7069893281226515, + "flos": 41182468133760.0, + "grad_norm": 1.7333024967156656, + "language_loss": 0.77613515, + "learning_rate": 8.348830280691304e-07, + "loss": 0.85287464, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10754395, + "step": 11759, + "time_per_iteration": 2.6857149600982666 + }, + { + "auxiliary_loss_clip": 0.06407617, + "auxiliary_loss_mlp": 0.01266387, + "balance_loss_clip": 0.06274342, + "balance_loss_mlp": 0.01254746, + "epoch": 0.7070494513753194, + "flos": 24214203141120.0, + "grad_norm": 1.49498062494056, + "language_loss": 0.68238914, + "learning_rate": 8.34566500074583e-07, + "loss": 0.75912917, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.11639404, + "step": 11760, + "time_per_iteration": 4.106550455093384 + }, + { + "auxiliary_loss_clip": 0.06414315, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06276926, + "balance_loss_mlp": 0.01254354, + "epoch": 0.7071095746279874, + "flos": 20190564005760.0, + "grad_norm": 1.927414071449925, + "language_loss": 0.79955995, + "learning_rate": 8.342500162733899e-07, + "loss": 0.8763513, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10479736, + "step": 11761, + "time_per_iteration": 2.4826464653015137 + }, + { + "auxiliary_loss_clip": 0.0640934, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06273883, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7071696978806553, + "flos": 18188282839680.0, + "grad_norm": 2.2121961398440684, + "language_loss": 0.75218999, + "learning_rate": 8.33933576677553e-07, + "loss": 0.82895583, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11352539, + "step": 11762, + "time_per_iteration": 2.4954895973205566 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01264533, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01254579, + "epoch": 0.7072298211333233, + "flos": 24138201888000.0, + "grad_norm": 1.8799497376122591, + "language_loss": 0.77263492, + "learning_rate": 8.336171812990724e-07, + "loss": 0.84933138, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09954834, + "step": 11763, + "time_per_iteration": 2.53564453125 + }, + { + "auxiliary_loss_clip": 0.06407874, + "auxiliary_loss_mlp": 0.01264442, + "balance_loss_clip": 0.062722, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7072899443859912, + "flos": 27205731210240.0, + "grad_norm": 2.480752014730448, + "language_loss": 0.78787279, + "learning_rate": 8.333008301499453e-07, + "loss": 0.86459595, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10949707, + "step": 11764, + "time_per_iteration": 2.652902841567993 + }, + { + "auxiliary_loss_clip": 0.06416324, + "auxiliary_loss_mlp": 0.01267754, + "balance_loss_clip": 0.06276786, + "balance_loss_mlp": 0.01256852, + "epoch": 0.7073500676386593, + "flos": 16441188883200.0, + "grad_norm": 1.6649904523449048, + "language_loss": 0.79710478, + "learning_rate": 8.32984523242167e-07, + "loss": 0.87394559, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10900879, + "step": 11765, + "time_per_iteration": 2.478731632232666 + }, + { + "auxiliary_loss_clip": 0.0640541, + "auxiliary_loss_mlp": 0.01265613, + "balance_loss_clip": 0.06272826, + "balance_loss_mlp": 0.0125638, + "epoch": 0.7074101908913272, + "flos": 27681291457920.0, + "grad_norm": 1.64401676901429, + "language_loss": 0.69017607, + "learning_rate": 8.326682605877324e-07, + "loss": 0.76688629, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09222412, + "step": 11766, + "time_per_iteration": 2.5636019706726074 + }, + { + "auxiliary_loss_clip": 0.06409839, + "auxiliary_loss_mlp": 0.01267425, + "balance_loss_clip": 0.06272456, + "balance_loss_mlp": 0.01256399, + "epoch": 0.7074703141439952, + "flos": 22244849429760.0, + "grad_norm": 1.7806465184891558, + "language_loss": 0.64121795, + "learning_rate": 8.323520421986352e-07, + "loss": 0.71799058, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11016846, + "step": 11767, + "time_per_iteration": 2.509098529815674 + }, + { + "auxiliary_loss_clip": 0.06408227, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06273193, + "balance_loss_mlp": 0.01253768, + "epoch": 0.7075304373966632, + "flos": 29650980585600.0, + "grad_norm": 1.5320251232109037, + "language_loss": 0.53099549, + "learning_rate": 8.320358680868646e-07, + "loss": 0.60772038, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10491943, + "step": 11768, + "time_per_iteration": 2.5991628170013428 + }, + { + "auxiliary_loss_clip": 0.06404776, + "auxiliary_loss_mlp": 0.01263382, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253565, + "epoch": 0.7075905606493311, + "flos": 19761264011520.0, + "grad_norm": 1.5482480325031622, + "language_loss": 0.75826794, + "learning_rate": 8.317197382644119e-07, + "loss": 0.83494949, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0980835, + "step": 11769, + "time_per_iteration": 2.553248167037964 + }, + { + "auxiliary_loss_clip": 0.063171, + "auxiliary_loss_mlp": 0.01250363, + "balance_loss_clip": 0.06260812, + "balance_loss_mlp": 0.01249205, + "epoch": 0.7076506839019991, + "flos": 65734106866560.0, + "grad_norm": 0.8156037445248981, + "language_loss": 0.6198988, + "learning_rate": 8.314036527432637e-07, + "loss": 0.69557339, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.01155853, + "step": 11770, + "time_per_iteration": 3.0812795162200928 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.06274459, + "balance_loss_mlp": 0.01254516, + "epoch": 0.707710807154667, + "flos": 23771444567040.0, + "grad_norm": 1.6411438931926623, + "language_loss": 0.76769519, + "learning_rate": 8.310876115354055e-07, + "loss": 0.84447432, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10552979, + "step": 11771, + "time_per_iteration": 2.5363407135009766 + }, + { + "auxiliary_loss_clip": 0.06403352, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.06272224, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7077709304073351, + "flos": 21257698878720.0, + "grad_norm": 1.3979456660804543, + "language_loss": 0.71690625, + "learning_rate": 8.307716146528221e-07, + "loss": 0.79359543, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10083008, + "step": 11772, + "time_per_iteration": 2.517993688583374 + }, + { + "auxiliary_loss_clip": 0.06417513, + "auxiliary_loss_mlp": 0.01264872, + "balance_loss_clip": 0.06277703, + "balance_loss_mlp": 0.01253535, + "epoch": 0.707831053660003, + "flos": 20747030970240.0, + "grad_norm": 1.7220446646082324, + "language_loss": 0.69968081, + "learning_rate": 8.30455662107496e-07, + "loss": 0.77650464, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11334229, + "step": 11773, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.06409782, + "auxiliary_loss_mlp": 0.01269179, + "balance_loss_clip": 0.0627438, + "balance_loss_mlp": 0.01259016, + "epoch": 0.707891176912671, + "flos": 21987440087040.0, + "grad_norm": 1.361330798775882, + "language_loss": 0.70201778, + "learning_rate": 8.301397539114095e-07, + "loss": 0.77880728, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10150146, + "step": 11774, + "time_per_iteration": 2.519763231277466 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.06274074, + "balance_loss_mlp": 0.01256316, + "epoch": 0.7079513001653389, + "flos": 21075284540160.0, + "grad_norm": 1.498970377219278, + "language_loss": 0.7492069, + "learning_rate": 8.298238900765407e-07, + "loss": 0.82591969, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09802246, + "step": 11775, + "time_per_iteration": 2.5430877208709717 + }, + { + "auxiliary_loss_clip": 0.06415135, + "auxiliary_loss_mlp": 0.01264314, + "balance_loss_clip": 0.06278447, + "balance_loss_mlp": 0.01254014, + "epoch": 0.7080114234180069, + "flos": 18046468604160.0, + "grad_norm": 1.621138107650678, + "language_loss": 0.87510455, + "learning_rate": 8.295080706148665e-07, + "loss": 0.95189905, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10314941, + "step": 11776, + "time_per_iteration": 2.517082691192627 + }, + { + "auxiliary_loss_clip": 0.06408748, + "auxiliary_loss_mlp": 0.01265871, + "balance_loss_clip": 0.0627363, + "balance_loss_mlp": 0.01256096, + "epoch": 0.7080715466706748, + "flos": 15127671479040.0, + "grad_norm": 1.4637417425019663, + "language_loss": 0.75087041, + "learning_rate": 8.291922955383641e-07, + "loss": 0.82761657, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09777832, + "step": 11777, + "time_per_iteration": 2.5164589881896973 + }, + { + "auxiliary_loss_clip": 0.06418398, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.0627712, + "balance_loss_mlp": 0.01253928, + "epoch": 0.7081316699233429, + "flos": 14427042364800.0, + "grad_norm": 1.984175776722718, + "language_loss": 0.82697594, + "learning_rate": 8.288765648590066e-07, + "loss": 0.903808, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10888672, + "step": 11778, + "time_per_iteration": 2.5013656616210938 + }, + { + "auxiliary_loss_clip": 0.06404569, + "auxiliary_loss_mlp": 0.01264308, + "balance_loss_clip": 0.06274152, + "balance_loss_mlp": 0.01255213, + "epoch": 0.7081917931760108, + "flos": 23229190869120.0, + "grad_norm": 1.4143364906484888, + "language_loss": 0.84851789, + "learning_rate": 8.285608785887673e-07, + "loss": 0.9252066, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09100342, + "step": 11779, + "time_per_iteration": 2.5495359897613525 + }, + { + "auxiliary_loss_clip": 0.06410395, + "auxiliary_loss_mlp": 0.01264448, + "balance_loss_clip": 0.06273511, + "balance_loss_mlp": 0.01254321, + "epoch": 0.7082519164286788, + "flos": 39317221520640.0, + "grad_norm": 1.7515830912849983, + "language_loss": 0.7191208, + "learning_rate": 8.28245236739618e-07, + "loss": 0.79586923, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10125732, + "step": 11780, + "time_per_iteration": 4.163387775421143 + }, + { + "auxiliary_loss_clip": 0.06407901, + "auxiliary_loss_mlp": 0.01267276, + "balance_loss_clip": 0.06274346, + "balance_loss_mlp": 0.01257382, + "epoch": 0.7083120396813467, + "flos": 21657299800320.0, + "grad_norm": 1.349993887717698, + "language_loss": 0.73180461, + "learning_rate": 8.279296393235256e-07, + "loss": 0.80855638, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09887695, + "step": 11781, + "time_per_iteration": 2.523428440093994 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06273166, + "balance_loss_mlp": 0.01254625, + "epoch": 0.7083721629340147, + "flos": 17572878927360.0, + "grad_norm": 2.699338792660173, + "language_loss": 0.77578008, + "learning_rate": 8.276140863524585e-07, + "loss": 0.85250056, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10089111, + "step": 11782, + "time_per_iteration": 2.458449363708496 + }, + { + "auxiliary_loss_clip": 0.06406146, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06272672, + "balance_loss_mlp": 0.01254991, + "epoch": 0.7084322861866827, + "flos": 29358086238720.0, + "grad_norm": 1.4360937815095354, + "language_loss": 0.70182502, + "learning_rate": 8.272985778383828e-07, + "loss": 0.77852821, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09173584, + "step": 11783, + "time_per_iteration": 2.5887033939361572 + }, + { + "auxiliary_loss_clip": 0.06414656, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.0125768, + "epoch": 0.7084924094393507, + "flos": 20200626495360.0, + "grad_norm": 1.5971747704172947, + "language_loss": 0.79307884, + "learning_rate": 8.269831137932632e-07, + "loss": 0.86990702, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1048584, + "step": 11784, + "time_per_iteration": 2.490954637527466 + }, + { + "auxiliary_loss_clip": 0.0640732, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.06272314, + "balance_loss_mlp": 0.01256737, + "epoch": 0.7085525326920187, + "flos": 23483958808320.0, + "grad_norm": 1.617674750849371, + "language_loss": 0.77606887, + "learning_rate": 8.266676942290609e-07, + "loss": 0.85281205, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1026001, + "step": 11785, + "time_per_iteration": 2.521693706512451 + }, + { + "auxiliary_loss_clip": 0.06413119, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06278774, + "balance_loss_mlp": 0.01255934, + "epoch": 0.7086126559446866, + "flos": 25966076780160.0, + "grad_norm": 1.4386102379185288, + "language_loss": 0.78040558, + "learning_rate": 8.26352319157738e-07, + "loss": 0.85719407, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09796143, + "step": 11786, + "time_per_iteration": 2.522735834121704 + }, + { + "auxiliary_loss_clip": 0.06412391, + "auxiliary_loss_mlp": 0.01268502, + "balance_loss_clip": 0.06275783, + "balance_loss_mlp": 0.0125834, + "epoch": 0.7086727791973546, + "flos": 26732141533440.0, + "grad_norm": 1.8351634972642936, + "language_loss": 0.79121733, + "learning_rate": 8.260369885912526e-07, + "loss": 0.86802632, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10162354, + "step": 11787, + "time_per_iteration": 2.5581464767456055 + }, + { + "auxiliary_loss_clip": 0.06412619, + "auxiliary_loss_mlp": 0.01271025, + "balance_loss_clip": 0.06277216, + "balance_loss_mlp": 0.01260475, + "epoch": 0.7087329024500225, + "flos": 21688801735680.0, + "grad_norm": 1.8228289571149952, + "language_loss": 0.76948512, + "learning_rate": 8.257217025415615e-07, + "loss": 0.84632152, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10540771, + "step": 11788, + "time_per_iteration": 2.490006446838379 + }, + { + "auxiliary_loss_clip": 0.06420539, + "auxiliary_loss_mlp": 0.01270333, + "balance_loss_clip": 0.06279223, + "balance_loss_mlp": 0.01259014, + "epoch": 0.7087930257026905, + "flos": 17936827136640.0, + "grad_norm": 2.296634586886211, + "language_loss": 0.67989695, + "learning_rate": 8.254064610206212e-07, + "loss": 0.75680566, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11322021, + "step": 11789, + "time_per_iteration": 2.5101919174194336 + }, + { + "auxiliary_loss_clip": 0.06411231, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01256111, + "epoch": 0.7088531489553584, + "flos": 18916682382720.0, + "grad_norm": 1.5602629922400044, + "language_loss": 0.77709448, + "learning_rate": 8.250912640403858e-07, + "loss": 0.85387033, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10241699, + "step": 11790, + "time_per_iteration": 2.484931468963623 + }, + { + "auxiliary_loss_clip": 0.06419586, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 0.06277139, + "balance_loss_mlp": 0.01255253, + "epoch": 0.7089132722080265, + "flos": 27388229402880.0, + "grad_norm": 1.5308750679240268, + "language_loss": 0.71250129, + "learning_rate": 8.247761116128085e-07, + "loss": 0.78936774, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11798096, + "step": 11791, + "time_per_iteration": 2.583948850631714 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01267551, + "balance_loss_clip": 0.06275617, + "balance_loss_mlp": 0.0125675, + "epoch": 0.7089733954606944, + "flos": 22169309374080.0, + "grad_norm": 1.511652721397476, + "language_loss": 0.82245874, + "learning_rate": 8.244610037498376e-07, + "loss": 0.89924157, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1081543, + "step": 11792, + "time_per_iteration": 3.987499475479126 + }, + { + "auxiliary_loss_clip": 0.06412215, + "auxiliary_loss_mlp": 0.01267904, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01256817, + "epoch": 0.7090335187133624, + "flos": 24432731389440.0, + "grad_norm": 1.9294753325302831, + "language_loss": 0.65135908, + "learning_rate": 8.241459404634232e-07, + "loss": 0.72816032, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.11083984, + "step": 11793, + "time_per_iteration": 2.5396199226379395 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01268973, + "balance_loss_clip": 0.06271678, + "balance_loss_mlp": 0.01258834, + "epoch": 0.7090936419660303, + "flos": 21841684709760.0, + "grad_norm": 1.9925409901798494, + "language_loss": 0.70387089, + "learning_rate": 8.238309217655133e-07, + "loss": 0.78063184, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10144043, + "step": 11794, + "time_per_iteration": 2.5805962085723877 + }, + { + "auxiliary_loss_clip": 0.06410742, + "auxiliary_loss_mlp": 0.01263848, + "balance_loss_clip": 0.0627709, + "balance_loss_mlp": 0.01253828, + "epoch": 0.7091537652186983, + "flos": 20088259770240.0, + "grad_norm": 1.8813846026416328, + "language_loss": 0.76058149, + "learning_rate": 8.23515947668052e-07, + "loss": 0.83732742, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10015869, + "step": 11795, + "time_per_iteration": 3.9482054710388184 + }, + { + "auxiliary_loss_clip": 0.06412329, + "auxiliary_loss_mlp": 0.0126988, + "balance_loss_clip": 0.06275567, + "balance_loss_mlp": 0.01258812, + "epoch": 0.7092138884713663, + "flos": 13156556832000.0, + "grad_norm": 2.0194589674634242, + "language_loss": 0.75623167, + "learning_rate": 8.232010181829838e-07, + "loss": 0.83305377, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11077881, + "step": 11796, + "time_per_iteration": 2.49794340133667 + }, + { + "auxiliary_loss_clip": 0.06421532, + "auxiliary_loss_mlp": 0.01265378, + "balance_loss_clip": 0.06280202, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7092740117240343, + "flos": 21651262306560.0, + "grad_norm": 1.5362456233213855, + "language_loss": 0.74430573, + "learning_rate": 8.228861333222523e-07, + "loss": 0.8211748, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.11352539, + "step": 11797, + "time_per_iteration": 2.5082199573516846 + }, + { + "auxiliary_loss_clip": 0.06411034, + "auxiliary_loss_mlp": 0.01266935, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01256326, + "epoch": 0.7093341349767023, + "flos": 21038835214080.0, + "grad_norm": 1.402262543828535, + "language_loss": 0.79553568, + "learning_rate": 8.225712930977953e-07, + "loss": 0.87231541, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10614014, + "step": 11798, + "time_per_iteration": 2.5451393127441406 + }, + { + "auxiliary_loss_clip": 0.06409004, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01255911, + "epoch": 0.7093942582293702, + "flos": 22024140975360.0, + "grad_norm": 2.0553615011101236, + "language_loss": 0.67001218, + "learning_rate": 8.222564975215529e-07, + "loss": 0.74676633, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10491943, + "step": 11799, + "time_per_iteration": 3.9047088623046875 + }, + { + "auxiliary_loss_clip": 0.06411745, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06276356, + "balance_loss_mlp": 0.01254548, + "epoch": 0.7094543814820382, + "flos": 27243019077120.0, + "grad_norm": 1.5384407371377906, + "language_loss": 0.82004559, + "learning_rate": 8.219417466054622e-07, + "loss": 0.89681768, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10925293, + "step": 11800, + "time_per_iteration": 2.54984188079834 + }, + { + "auxiliary_loss_clip": 0.06408048, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06274039, + "balance_loss_mlp": 0.01255218, + "epoch": 0.7095145047347061, + "flos": 12093237319680.0, + "grad_norm": 1.8049515172262331, + "language_loss": 0.86792338, + "learning_rate": 8.21627040361459e-07, + "loss": 0.94465083, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0947876, + "step": 11801, + "time_per_iteration": 2.472968339920044 + }, + { + "auxiliary_loss_clip": 0.06414308, + "auxiliary_loss_mlp": 0.01268303, + "balance_loss_clip": 0.06278587, + "balance_loss_mlp": 0.01257896, + "epoch": 0.7095746279873741, + "flos": 19388678832000.0, + "grad_norm": 1.9685683260033982, + "language_loss": 0.7659384, + "learning_rate": 8.213123788014758e-07, + "loss": 0.8427645, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10412598, + "step": 11802, + "time_per_iteration": 2.469217300415039 + }, + { + "auxiliary_loss_clip": 0.06413268, + "auxiliary_loss_mlp": 0.01270796, + "balance_loss_clip": 0.06277166, + "balance_loss_mlp": 0.01259948, + "epoch": 0.709634751240042, + "flos": 21366921075840.0, + "grad_norm": 1.7164711115559128, + "language_loss": 0.81734449, + "learning_rate": 8.209977619374462e-07, + "loss": 0.89418513, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10845947, + "step": 11803, + "time_per_iteration": 2.5675346851348877 + }, + { + "auxiliary_loss_clip": 0.06413771, + "auxiliary_loss_mlp": 0.01268086, + "balance_loss_clip": 0.0627571, + "balance_loss_mlp": 0.01256702, + "epoch": 0.7096948744927101, + "flos": 13922034606720.0, + "grad_norm": 2.2508010678544363, + "language_loss": 0.6771282, + "learning_rate": 8.206831897812995e-07, + "loss": 0.75394678, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1137085, + "step": 11804, + "time_per_iteration": 2.4850802421569824 + }, + { + "auxiliary_loss_clip": 0.06406445, + "auxiliary_loss_mlp": 0.01269291, + "balance_loss_clip": 0.06276047, + "balance_loss_mlp": 0.01259694, + "epoch": 0.709754997745378, + "flos": 30305936424960.0, + "grad_norm": 1.836033307049916, + "language_loss": 0.78141153, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8581689, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0960083, + "step": 11805, + "time_per_iteration": 2.5807907581329346 + }, + { + "auxiliary_loss_clip": 0.06411435, + "auxiliary_loss_mlp": 0.01266806, + "balance_loss_clip": 0.06275858, + "balance_loss_mlp": 0.01256202, + "epoch": 0.709815120998046, + "flos": 18521064529920.0, + "grad_norm": 3.360423816262503, + "language_loss": 0.78911841, + "learning_rate": 8.200541796403667e-07, + "loss": 0.86590087, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1060791, + "step": 11806, + "time_per_iteration": 2.4750113487243652 + }, + { + "auxiliary_loss_clip": 0.06409614, + "auxiliary_loss_mlp": 0.01266594, + "balance_loss_clip": 0.06275766, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7098752442507139, + "flos": 22279034695680.0, + "grad_norm": 3.0880614568331883, + "language_loss": 0.56418979, + "learning_rate": 8.197397416794332e-07, + "loss": 0.64095187, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10095215, + "step": 11807, + "time_per_iteration": 2.5265543460845947 + }, + { + "auxiliary_loss_clip": 0.06416228, + "auxiliary_loss_mlp": 0.01269148, + "balance_loss_clip": 0.06274513, + "balance_loss_mlp": 0.01257686, + "epoch": 0.7099353675033819, + "flos": 19280504810880.0, + "grad_norm": 2.07369456244542, + "language_loss": 0.68290567, + "learning_rate": 8.194253484740882e-07, + "loss": 0.75975943, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11462402, + "step": 11808, + "time_per_iteration": 2.472132444381714 + }, + { + "auxiliary_loss_clip": 0.06414328, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06275385, + "balance_loss_mlp": 0.01254512, + "epoch": 0.70999549075605, + "flos": 21915044559360.0, + "grad_norm": 1.9968242899147548, + "language_loss": 0.71669781, + "learning_rate": 8.191110000362513e-07, + "loss": 0.79348707, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10089111, + "step": 11809, + "time_per_iteration": 2.524571180343628 + }, + { + "auxiliary_loss_clip": 0.06322539, + "auxiliary_loss_mlp": 0.01256903, + "balance_loss_clip": 0.06266782, + "balance_loss_mlp": 0.01255681, + "epoch": 0.7100556140087179, + "flos": 70474280192640.0, + "grad_norm": 0.7372364518861584, + "language_loss": 0.59065175, + "learning_rate": 8.187966963778435e-07, + "loss": 0.66644615, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01220703, + "step": 11810, + "time_per_iteration": 3.2093372344970703 + }, + { + "auxiliary_loss_clip": 0.06413063, + "auxiliary_loss_mlp": 0.01263776, + "balance_loss_clip": 0.06277919, + "balance_loss_mlp": 0.01253721, + "epoch": 0.7101157372613859, + "flos": 23046273406080.0, + "grad_norm": 1.545725512324635, + "language_loss": 0.74353242, + "learning_rate": 8.18482437510784e-07, + "loss": 0.82030082, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10064697, + "step": 11811, + "time_per_iteration": 2.5427846908569336 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06272991, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7101758605140538, + "flos": 23192028783360.0, + "grad_norm": 1.7044281012631433, + "language_loss": 0.83467686, + "learning_rate": 8.181682234469882e-07, + "loss": 0.91136628, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09442139, + "step": 11812, + "time_per_iteration": 2.5327343940734863 + }, + { + "auxiliary_loss_clip": 0.0641521, + "auxiliary_loss_mlp": 0.0126703, + "balance_loss_clip": 0.06277661, + "balance_loss_mlp": 0.01256659, + "epoch": 0.7102359837667218, + "flos": 23702906327040.0, + "grad_norm": 1.4051092754707344, + "language_loss": 0.69960868, + "learning_rate": 8.178540541983716e-07, + "loss": 0.77643108, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10375977, + "step": 11813, + "time_per_iteration": 2.6402204036712646 + }, + { + "auxiliary_loss_clip": 0.06402316, + "auxiliary_loss_mlp": 0.01264286, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7102961070193897, + "flos": 19397231948160.0, + "grad_norm": 1.7011399194035903, + "language_loss": 0.82479846, + "learning_rate": 8.175399297768495e-07, + "loss": 0.90146458, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09832764, + "step": 11814, + "time_per_iteration": 2.4825360774993896 + }, + { + "auxiliary_loss_clip": 0.06407954, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255308, + "epoch": 0.7103562302720577, + "flos": 21514018118400.0, + "grad_norm": 1.9900571557306543, + "language_loss": 0.76711023, + "learning_rate": 8.172258501943301e-07, + "loss": 0.84385264, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10974121, + "step": 11815, + "time_per_iteration": 2.5411629676818848 + }, + { + "auxiliary_loss_clip": 0.06407356, + "auxiliary_loss_mlp": 0.01265787, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01256012, + "epoch": 0.7104163535247257, + "flos": 14539786433280.0, + "grad_norm": 2.148014854725882, + "language_loss": 0.78734261, + "learning_rate": 8.16911815462725e-07, + "loss": 0.86407399, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09777832, + "step": 11816, + "time_per_iteration": 2.4732110500335693 + }, + { + "auxiliary_loss_clip": 0.06409387, + "auxiliary_loss_mlp": 0.0126716, + "balance_loss_clip": 0.06273407, + "balance_loss_mlp": 0.01257415, + "epoch": 0.7104764767773937, + "flos": 11405018609280.0, + "grad_norm": 1.710233044928932, + "language_loss": 0.87136269, + "learning_rate": 8.165978255939426e-07, + "loss": 0.9481281, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09747314, + "step": 11817, + "time_per_iteration": 2.4930732250213623 + }, + { + "auxiliary_loss_clip": 0.06405669, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01254358, + "epoch": 0.7105366000300616, + "flos": 11694894209280.0, + "grad_norm": 2.3467290312942906, + "language_loss": 0.84727818, + "learning_rate": 8.162838805998897e-07, + "loss": 0.92397279, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09436035, + "step": 11818, + "time_per_iteration": 2.4601902961730957 + }, + { + "auxiliary_loss_clip": 0.06407452, + "auxiliary_loss_mlp": 0.01265048, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254808, + "epoch": 0.7105967232827296, + "flos": 19360027935360.0, + "grad_norm": 1.943101872130184, + "language_loss": 0.76065433, + "learning_rate": 8.159699804924709e-07, + "loss": 0.83737928, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 11819, + "time_per_iteration": 2.5082414150238037 + }, + { + "auxiliary_loss_clip": 0.06408325, + "auxiliary_loss_mlp": 0.01273169, + "balance_loss_clip": 0.06273748, + "balance_loss_mlp": 0.01262422, + "epoch": 0.7106568465353975, + "flos": 22937135063040.0, + "grad_norm": 1.5613953087486683, + "language_loss": 0.71238112, + "learning_rate": 8.156561252835883e-07, + "loss": 0.78919601, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10748291, + "step": 11820, + "time_per_iteration": 3.9562554359436035 + }, + { + "auxiliary_loss_clip": 0.06406607, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06272983, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7107169697880655, + "flos": 19105805047680.0, + "grad_norm": 1.709009415960719, + "language_loss": 0.75201517, + "learning_rate": 8.153423149851449e-07, + "loss": 0.82874513, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 11821, + "time_per_iteration": 2.4773855209350586 + }, + { + "auxiliary_loss_clip": 0.0631486, + "auxiliary_loss_mlp": 0.0125056, + "balance_loss_clip": 0.06259306, + "balance_loss_mlp": 0.01249267, + "epoch": 0.7107770930407336, + "flos": 63655950228480.0, + "grad_norm": 0.8065746142119063, + "language_loss": 0.55105186, + "learning_rate": 8.150285496090388e-07, + "loss": 0.626706, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01293182, + "step": 11822, + "time_per_iteration": 3.1728925704956055 + }, + { + "auxiliary_loss_clip": 0.06399868, + "auxiliary_loss_mlp": 0.01265617, + "balance_loss_clip": 0.0627214, + "balance_loss_mlp": 0.01256313, + "epoch": 0.7108372162934015, + "flos": 22061009571840.0, + "grad_norm": 1.7664810996184872, + "language_loss": 0.61042011, + "learning_rate": 8.147148291671688e-07, + "loss": 0.68707502, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09301758, + "step": 11823, + "time_per_iteration": 2.685396194458008 + }, + { + "auxiliary_loss_clip": 0.06409906, + "auxiliary_loss_mlp": 0.01263571, + "balance_loss_clip": 0.0627628, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7108973395460695, + "flos": 19141122343680.0, + "grad_norm": 1.95026020169961, + "language_loss": 0.71794426, + "learning_rate": 8.144011536714322e-07, + "loss": 0.79467905, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09375, + "step": 11824, + "time_per_iteration": 2.5620133876800537 + }, + { + "auxiliary_loss_clip": 0.06401232, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06271533, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7109574627987374, + "flos": 17900168175360.0, + "grad_norm": 2.011245948242179, + "language_loss": 0.72948581, + "learning_rate": 8.140875231337223e-07, + "loss": 0.80615819, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09161377, + "step": 11825, + "time_per_iteration": 2.481990098953247 + }, + { + "auxiliary_loss_clip": 0.06409375, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06273198, + "balance_loss_mlp": 0.01254669, + "epoch": 0.7110175860514054, + "flos": 28986129964800.0, + "grad_norm": 1.8577779500908889, + "language_loss": 0.80001605, + "learning_rate": 8.137739375659321e-07, + "loss": 0.87676173, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10540771, + "step": 11826, + "time_per_iteration": 2.5934202671051025 + }, + { + "auxiliary_loss_clip": 0.06401698, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.0125846, + "epoch": 0.7110777093040733, + "flos": 26179867272960.0, + "grad_norm": 1.3769409852595975, + "language_loss": 0.83070964, + "learning_rate": 8.134603969799527e-07, + "loss": 0.90740645, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09521484, + "step": 11827, + "time_per_iteration": 2.5412826538085938 + }, + { + "auxiliary_loss_clip": 0.0640677, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256507, + "epoch": 0.7111378325567413, + "flos": 26877184151040.0, + "grad_norm": 1.489155185626094, + "language_loss": 0.62609684, + "learning_rate": 8.131469013876748e-07, + "loss": 0.70283562, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10601807, + "step": 11828, + "time_per_iteration": 2.549358367919922 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01265747, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01255543, + "epoch": 0.7111979558094093, + "flos": 27279216840960.0, + "grad_norm": 1.3931875657884774, + "language_loss": 0.72552299, + "learning_rate": 8.128334508009846e-07, + "loss": 0.80225229, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10205078, + "step": 11829, + "time_per_iteration": 2.538902997970581 + }, + { + "auxiliary_loss_clip": 0.06404835, + "auxiliary_loss_mlp": 0.01268934, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01259343, + "epoch": 0.7112580790620773, + "flos": 25054088941440.0, + "grad_norm": 1.7068284012281256, + "language_loss": 0.80460179, + "learning_rate": 8.125200452317697e-07, + "loss": 0.88133949, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09594727, + "step": 11830, + "time_per_iteration": 2.527684450149536 + }, + { + "auxiliary_loss_clip": 0.064045, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01255338, + "epoch": 0.7113182023147452, + "flos": 21652016993280.0, + "grad_norm": 1.5791795722004685, + "language_loss": 0.84228051, + "learning_rate": 8.122066846919138e-07, + "loss": 0.91897511, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 11831, + "time_per_iteration": 3.8946433067321777 + }, + { + "auxiliary_loss_clip": 0.06405313, + "auxiliary_loss_mlp": 0.01264799, + "balance_loss_clip": 0.06270519, + "balance_loss_mlp": 0.01255453, + "epoch": 0.7113783255674132, + "flos": 21002637450240.0, + "grad_norm": 1.9181792200519638, + "language_loss": 0.77265865, + "learning_rate": 8.118933691932985e-07, + "loss": 0.84935975, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09344482, + "step": 11832, + "time_per_iteration": 2.517416477203369 + }, + { + "auxiliary_loss_clip": 0.06316236, + "auxiliary_loss_mlp": 0.01252897, + "balance_loss_clip": 0.06260582, + "balance_loss_mlp": 0.01251798, + "epoch": 0.7114384488200811, + "flos": 66788705554560.0, + "grad_norm": 0.7355523312106115, + "language_loss": 0.56510413, + "learning_rate": 8.115800987478059e-07, + "loss": 0.64079541, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01100922, + "step": 11833, + "time_per_iteration": 3.083800792694092 + }, + { + "auxiliary_loss_clip": 0.06404281, + "auxiliary_loss_mlp": 0.01264607, + "balance_loss_clip": 0.06270045, + "balance_loss_mlp": 0.01255255, + "epoch": 0.7114985720727491, + "flos": 25017136490880.0, + "grad_norm": 1.685224360571569, + "language_loss": 0.71167994, + "learning_rate": 8.11266873367315e-07, + "loss": 0.78836882, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09350586, + "step": 11834, + "time_per_iteration": 2.5492658615112305 + }, + { + "auxiliary_loss_clip": 0.06408249, + "auxiliary_loss_mlp": 0.01268558, + "balance_loss_clip": 0.06272918, + "balance_loss_mlp": 0.01257972, + "epoch": 0.7115586953254172, + "flos": 21476478689280.0, + "grad_norm": 1.811757150622914, + "language_loss": 0.79512018, + "learning_rate": 8.10953693063704e-07, + "loss": 0.87188828, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10583496, + "step": 11835, + "time_per_iteration": 3.936241865158081 + }, + { + "auxiliary_loss_clip": 0.06403308, + "auxiliary_loss_mlp": 0.01266062, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.0125646, + "epoch": 0.7116188185780851, + "flos": 28630357528320.0, + "grad_norm": 1.5711246954693516, + "language_loss": 0.76045537, + "learning_rate": 8.10640557848848e-07, + "loss": 0.83714908, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 11836, + "time_per_iteration": 2.5701663494110107 + }, + { + "auxiliary_loss_clip": 0.06406698, + "auxiliary_loss_mlp": 0.01265952, + "balance_loss_clip": 0.06274588, + "balance_loss_mlp": 0.01256653, + "epoch": 0.7116789418307531, + "flos": 25299339442560.0, + "grad_norm": 1.6743206701340672, + "language_loss": 0.69986928, + "learning_rate": 8.103274677346208e-07, + "loss": 0.77659577, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09301758, + "step": 11837, + "time_per_iteration": 2.575038194656372 + }, + { + "auxiliary_loss_clip": 0.0641223, + "auxiliary_loss_mlp": 0.01266229, + "balance_loss_clip": 0.06274512, + "balance_loss_mlp": 0.01255494, + "epoch": 0.711739065083421, + "flos": 25564463360640.0, + "grad_norm": 1.8455270082673318, + "language_loss": 0.61858809, + "learning_rate": 8.100144227328958e-07, + "loss": 0.69537258, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10736084, + "step": 11838, + "time_per_iteration": 2.5805752277374268 + }, + { + "auxiliary_loss_clip": 0.06409779, + "auxiliary_loss_mlp": 0.01267582, + "balance_loss_clip": 0.0627556, + "balance_loss_mlp": 0.01257699, + "epoch": 0.711799188336089, + "flos": 26148239556480.0, + "grad_norm": 2.1939319933932424, + "language_loss": 0.68031204, + "learning_rate": 8.097014228555426e-07, + "loss": 0.75708568, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09875488, + "step": 11839, + "time_per_iteration": 3.951659679412842 + }, + { + "auxiliary_loss_clip": 0.06405699, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06272204, + "balance_loss_mlp": 0.01256349, + "epoch": 0.7118593115887569, + "flos": 21146757672960.0, + "grad_norm": 2.0203738416997226, + "language_loss": 0.8447386, + "learning_rate": 8.093884681144305e-07, + "loss": 0.92145276, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09375, + "step": 11840, + "time_per_iteration": 2.5161664485931396 + }, + { + "auxiliary_loss_clip": 0.0641197, + "auxiliary_loss_mlp": 0.01266296, + "balance_loss_clip": 0.06274749, + "balance_loss_mlp": 0.01256413, + "epoch": 0.711919434841425, + "flos": 14980951779840.0, + "grad_norm": 1.9072315995358804, + "language_loss": 0.77299631, + "learning_rate": 8.090755585214277e-07, + "loss": 0.84977901, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09881592, + "step": 11841, + "time_per_iteration": 2.5373709201812744 + }, + { + "auxiliary_loss_clip": 0.06406824, + "auxiliary_loss_mlp": 0.01265843, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01256348, + "epoch": 0.7119795580940929, + "flos": 16514674513920.0, + "grad_norm": 2.1386907373947186, + "language_loss": 0.75567174, + "learning_rate": 8.087626940883994e-07, + "loss": 0.83239841, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0949707, + "step": 11842, + "time_per_iteration": 2.5253396034240723 + }, + { + "auxiliary_loss_clip": 0.06309856, + "auxiliary_loss_mlp": 0.01250631, + "balance_loss_clip": 0.06254404, + "balance_loss_mlp": 0.01249538, + "epoch": 0.7120396813467609, + "flos": 66591434315520.0, + "grad_norm": 0.7631692514869006, + "language_loss": 0.61363775, + "learning_rate": 8.084498748272082e-07, + "loss": 0.6892426, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 11843, + "time_per_iteration": 3.097399950027466 + }, + { + "auxiliary_loss_clip": 0.06403574, + "auxiliary_loss_mlp": 0.01266422, + "balance_loss_clip": 0.06270365, + "balance_loss_mlp": 0.01256432, + "epoch": 0.7120998045994288, + "flos": 26440001873280.0, + "grad_norm": 3.96385360450405, + "language_loss": 0.80268991, + "learning_rate": 8.081371007497171e-07, + "loss": 0.87938976, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09997559, + "step": 11844, + "time_per_iteration": 2.552259683609009 + }, + { + "auxiliary_loss_clip": 0.06406216, + "auxiliary_loss_mlp": 0.01262016, + "balance_loss_clip": 0.06270443, + "balance_loss_mlp": 0.01252759, + "epoch": 0.7121599278520968, + "flos": 16432300350720.0, + "grad_norm": 2.2064261749206784, + "language_loss": 0.79144967, + "learning_rate": 8.078243718677873e-07, + "loss": 0.868132, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09259033, + "step": 11845, + "time_per_iteration": 2.5421273708343506 + }, + { + "auxiliary_loss_clip": 0.06402468, + "auxiliary_loss_mlp": 0.01265331, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01255532, + "epoch": 0.7122200511047647, + "flos": 28957520995200.0, + "grad_norm": 2.3428288803792485, + "language_loss": 0.77299261, + "learning_rate": 8.075116881932762e-07, + "loss": 0.84967065, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09796143, + "step": 11846, + "time_per_iteration": 2.527745485305786 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.06274035, + "balance_loss_mlp": 0.01256334, + "epoch": 0.7122801743574327, + "flos": 16477428574080.0, + "grad_norm": 1.8749902395969622, + "language_loss": 0.58446372, + "learning_rate": 8.071990497380421e-07, + "loss": 0.66121721, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10314941, + "step": 11847, + "time_per_iteration": 2.4880757331848145 + }, + { + "auxiliary_loss_clip": 0.06397726, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01254081, + "epoch": 0.7123402976101008, + "flos": 20637263721600.0, + "grad_norm": 1.2877189780235179, + "language_loss": 0.71294212, + "learning_rate": 8.068864565139395e-07, + "loss": 0.78955925, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09918213, + "step": 11848, + "time_per_iteration": 2.5513198375701904 + }, + { + "auxiliary_loss_clip": 0.0630827, + "auxiliary_loss_mlp": 0.01254097, + "balance_loss_clip": 0.06252526, + "balance_loss_mlp": 0.01252904, + "epoch": 0.7124004208627687, + "flos": 62343606781440.0, + "grad_norm": 0.847952001487362, + "language_loss": 0.6271292, + "learning_rate": 8.065739085328211e-07, + "loss": 0.70275289, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01190948, + "step": 11849, + "time_per_iteration": 3.1112751960754395 + }, + { + "auxiliary_loss_clip": 0.06405951, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.01254699, + "epoch": 0.7124605441154367, + "flos": 39685278579840.0, + "grad_norm": 1.4089636975562345, + "language_loss": 0.64458466, + "learning_rate": 8.0626140580654e-07, + "loss": 0.72128963, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09844971, + "step": 11850, + "time_per_iteration": 2.632457733154297 + }, + { + "auxiliary_loss_clip": 0.06404182, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125254, + "epoch": 0.7125206673681046, + "flos": 28189066400640.0, + "grad_norm": 1.5452031150775634, + "language_loss": 0.70381355, + "learning_rate": 8.05948948346946e-07, + "loss": 0.78048086, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 11851, + "time_per_iteration": 2.563063144683838 + }, + { + "auxiliary_loss_clip": 0.06402514, + "auxiliary_loss_mlp": 0.01261637, + "balance_loss_clip": 0.06271089, + "balance_loss_mlp": 0.0125275, + "epoch": 0.7125807906207726, + "flos": 26184101904000.0, + "grad_norm": 1.4548821396986709, + "language_loss": 0.83386576, + "learning_rate": 8.056365361658882e-07, + "loss": 0.9105072, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08892822, + "step": 11852, + "time_per_iteration": 2.5185182094573975 + }, + { + "auxiliary_loss_clip": 0.06408215, + "auxiliary_loss_mlp": 0.01266945, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.01256759, + "epoch": 0.7126409138734405, + "flos": 17161706142720.0, + "grad_norm": 2.03558575161385, + "language_loss": 0.72365862, + "learning_rate": 8.053241692752126e-07, + "loss": 0.80041021, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10186768, + "step": 11853, + "time_per_iteration": 2.4712510108947754 + }, + { + "auxiliary_loss_clip": 0.06400356, + "auxiliary_loss_mlp": 0.01265707, + "balance_loss_clip": 0.06273182, + "balance_loss_mlp": 0.01257005, + "epoch": 0.7127010371261085, + "flos": 18775790542080.0, + "grad_norm": 1.725464250509213, + "language_loss": 0.92318237, + "learning_rate": 8.050118476867635e-07, + "loss": 0.999843, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08703613, + "step": 11854, + "time_per_iteration": 2.4725341796875 + }, + { + "auxiliary_loss_clip": 0.06403268, + "auxiliary_loss_mlp": 0.01268625, + "balance_loss_clip": 0.06272953, + "balance_loss_mlp": 0.01260018, + "epoch": 0.7127611603787765, + "flos": 20382747344640.0, + "grad_norm": 1.8133122260210155, + "language_loss": 0.79957211, + "learning_rate": 8.046995714123856e-07, + "loss": 0.8762911, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08612061, + "step": 11855, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01264358, + "balance_loss_clip": 0.06273045, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7128212836314445, + "flos": 20455268653440.0, + "grad_norm": 1.8163189094799566, + "language_loss": 0.73227429, + "learning_rate": 8.043873404639192e-07, + "loss": 0.80895841, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.10119629, + "step": 11856, + "time_per_iteration": 2.489022731781006 + }, + { + "auxiliary_loss_clip": 0.06408788, + "auxiliary_loss_mlp": 0.01268564, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01258634, + "epoch": 0.7128814068841124, + "flos": 23447593336320.0, + "grad_norm": 1.4996097551327818, + "language_loss": 0.69965553, + "learning_rate": 8.040751548532046e-07, + "loss": 0.77642906, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0993042, + "step": 11857, + "time_per_iteration": 2.5889153480529785 + }, + { + "auxiliary_loss_clip": 0.06401453, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01253488, + "epoch": 0.7129415301367804, + "flos": 18228757161600.0, + "grad_norm": 1.9673696792632074, + "language_loss": 0.85894734, + "learning_rate": 8.03763014592081e-07, + "loss": 0.93559623, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09942627, + "step": 11858, + "time_per_iteration": 2.4554738998413086 + }, + { + "auxiliary_loss_clip": 0.0641135, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255697, + "epoch": 0.7130016533894483, + "flos": 15529410679680.0, + "grad_norm": 1.7544523597871677, + "language_loss": 0.80554175, + "learning_rate": 8.034509196923829e-07, + "loss": 0.88231397, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10168457, + "step": 11859, + "time_per_iteration": 3.9745945930480957 + }, + { + "auxiliary_loss_clip": 0.06400725, + "auxiliary_loss_mlp": 0.01264096, + "balance_loss_clip": 0.06269667, + "balance_loss_mlp": 0.0125472, + "epoch": 0.7130617766421163, + "flos": 57127804081920.0, + "grad_norm": 1.1922495989293056, + "language_loss": 0.69005597, + "learning_rate": 8.031388701659456e-07, + "loss": 0.76670408, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09381104, + "step": 11860, + "time_per_iteration": 2.891012668609619 + }, + { + "auxiliary_loss_clip": 0.06406054, + "auxiliary_loss_mlp": 0.01266268, + "balance_loss_clip": 0.06271956, + "balance_loss_mlp": 0.01255575, + "epoch": 0.7131218998947844, + "flos": 19793730268800.0, + "grad_norm": 2.1261081147363097, + "language_loss": 0.64239693, + "learning_rate": 8.028268660246023e-07, + "loss": 0.71912014, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10693359, + "step": 11861, + "time_per_iteration": 2.5796282291412354 + }, + { + "auxiliary_loss_clip": 0.06410623, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_clip": 0.06273146, + "balance_loss_mlp": 0.01254857, + "epoch": 0.7131820231474523, + "flos": 26659242881280.0, + "grad_norm": 3.187443939826819, + "language_loss": 0.67274332, + "learning_rate": 8.025149072801849e-07, + "loss": 0.74950445, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10620117, + "step": 11862, + "time_per_iteration": 2.576899528503418 + }, + { + "auxiliary_loss_clip": 0.064044, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06273039, + "balance_loss_mlp": 0.01255926, + "epoch": 0.7132421464001203, + "flos": 29213337110400.0, + "grad_norm": 2.2144093674445426, + "language_loss": 0.67745155, + "learning_rate": 8.022029939445214e-07, + "loss": 0.75414771, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09283447, + "step": 11863, + "time_per_iteration": 2.563467264175415 + }, + { + "auxiliary_loss_clip": 0.06412646, + "auxiliary_loss_mlp": 0.0126882, + "balance_loss_clip": 0.06272405, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7133022696527882, + "flos": 23079913620480.0, + "grad_norm": 1.7053563824160904, + "language_loss": 0.6612097, + "learning_rate": 8.018911260294414e-07, + "loss": 0.73802435, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.10717773, + "step": 11864, + "time_per_iteration": 2.5226974487304688 + }, + { + "auxiliary_loss_clip": 0.06409131, + "auxiliary_loss_mlp": 0.01265229, + "balance_loss_clip": 0.06273311, + "balance_loss_mlp": 0.01255019, + "epoch": 0.7133623929054562, + "flos": 17462860116480.0, + "grad_norm": 3.439605466883789, + "language_loss": 0.86094218, + "learning_rate": 8.015793035467697e-07, + "loss": 0.93768573, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10217285, + "step": 11865, + "time_per_iteration": 2.441121816635132 + }, + { + "auxiliary_loss_clip": 0.06408411, + "auxiliary_loss_mlp": 0.01263379, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7134225161581241, + "flos": 19542609982080.0, + "grad_norm": 2.0189990892571807, + "language_loss": 0.75141108, + "learning_rate": 8.012675265083304e-07, + "loss": 0.82812905, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10443115, + "step": 11866, + "time_per_iteration": 2.4785237312316895 + }, + { + "auxiliary_loss_clip": 0.06411657, + "auxiliary_loss_mlp": 0.01267167, + "balance_loss_clip": 0.06275963, + "balance_loss_mlp": 0.01256408, + "epoch": 0.7134826394107922, + "flos": 26257294045440.0, + "grad_norm": 3.679418691378197, + "language_loss": 0.70483118, + "learning_rate": 8.009557949259464e-07, + "loss": 0.78161943, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 11867, + "time_per_iteration": 2.518202066421509 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06272841, + "balance_loss_mlp": 0.01256477, + "epoch": 0.7135427626634601, + "flos": 15820795653120.0, + "grad_norm": 4.975034900378342, + "language_loss": 0.71782935, + "learning_rate": 8.006441088114397e-07, + "loss": 0.79452157, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09552002, + "step": 11868, + "time_per_iteration": 2.4938719272613525 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01268858, + "balance_loss_clip": 0.06273223, + "balance_loss_mlp": 0.01257635, + "epoch": 0.7136028859161281, + "flos": 18229302213120.0, + "grad_norm": 1.9405833387691556, + "language_loss": 0.66333723, + "learning_rate": 8.003324681766286e-07, + "loss": 0.7401427, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11236572, + "step": 11869, + "time_per_iteration": 2.4637274742126465 + }, + { + "auxiliary_loss_clip": 0.06408057, + "auxiliary_loss_mlp": 0.01264796, + "balance_loss_clip": 0.06273142, + "balance_loss_mlp": 0.01255003, + "epoch": 0.713663009168796, + "flos": 24321454767360.0, + "grad_norm": 1.4404508285538464, + "language_loss": 0.77963442, + "learning_rate": 8.000208730333298e-07, + "loss": 0.856363, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09790039, + "step": 11870, + "time_per_iteration": 2.545146942138672 + }, + { + "auxiliary_loss_clip": 0.06407803, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06275113, + "balance_loss_mlp": 0.01255248, + "epoch": 0.713723132421464, + "flos": 26545157147520.0, + "grad_norm": 2.250105845614367, + "language_loss": 0.81401408, + "learning_rate": 7.997093233933597e-07, + "loss": 0.89075279, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10821533, + "step": 11871, + "time_per_iteration": 4.061939477920532 + }, + { + "auxiliary_loss_clip": 0.06409386, + "auxiliary_loss_mlp": 0.01271871, + "balance_loss_clip": 0.06272148, + "balance_loss_mlp": 0.01261541, + "epoch": 0.7137832556741319, + "flos": 19871911728000.0, + "grad_norm": 1.5669444552919631, + "language_loss": 0.78963834, + "learning_rate": 7.993978192685331e-07, + "loss": 0.86645091, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10321045, + "step": 11872, + "time_per_iteration": 2.502652645111084 + }, + { + "auxiliary_loss_clip": 0.06413025, + "auxiliary_loss_mlp": 0.01263574, + "balance_loss_clip": 0.06273353, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7138433789267999, + "flos": 21695300426880.0, + "grad_norm": 2.078419347550335, + "language_loss": 0.83881956, + "learning_rate": 7.990863606706606e-07, + "loss": 0.91558552, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10473633, + "step": 11873, + "time_per_iteration": 2.49755859375 + }, + { + "auxiliary_loss_clip": 0.06404479, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06273785, + "balance_loss_mlp": 0.0125491, + "epoch": 0.713903502179468, + "flos": 17608447785600.0, + "grad_norm": 2.139862978747737, + "language_loss": 0.85866129, + "learning_rate": 7.987749476115539e-07, + "loss": 0.93534762, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09240723, + "step": 11874, + "time_per_iteration": 2.446295976638794 + }, + { + "auxiliary_loss_clip": 0.0641006, + "auxiliary_loss_mlp": 0.01266331, + "balance_loss_clip": 0.06275686, + "balance_loss_mlp": 0.01256043, + "epoch": 0.7139636254321359, + "flos": 18046091260800.0, + "grad_norm": 1.75973654551926, + "language_loss": 0.83120143, + "learning_rate": 7.984635801030228e-07, + "loss": 0.90796536, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10284424, + "step": 11875, + "time_per_iteration": 3.8960680961608887 + }, + { + "auxiliary_loss_clip": 0.06414599, + "auxiliary_loss_mlp": 0.01267914, + "balance_loss_clip": 0.06272531, + "balance_loss_mlp": 0.01256136, + "epoch": 0.7140237486848039, + "flos": 23337826087680.0, + "grad_norm": 1.757783447264505, + "language_loss": 0.69900811, + "learning_rate": 7.981522581568721e-07, + "loss": 0.77583325, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11779785, + "step": 11876, + "time_per_iteration": 2.491225481033325 + }, + { + "auxiliary_loss_clip": 0.06411763, + "auxiliary_loss_mlp": 0.01262915, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.01252663, + "epoch": 0.7140838719374718, + "flos": 16842760375680.0, + "grad_norm": 1.8106538192439035, + "language_loss": 0.78886259, + "learning_rate": 7.978409817849079e-07, + "loss": 0.86560941, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10253906, + "step": 11877, + "time_per_iteration": 2.493778705596924 + }, + { + "auxiliary_loss_clip": 0.0640865, + "auxiliary_loss_mlp": 0.01267195, + "balance_loss_clip": 0.06276323, + "balance_loss_mlp": 0.01257611, + "epoch": 0.7141439951901398, + "flos": 21148350900480.0, + "grad_norm": 1.8508532405281077, + "language_loss": 0.70390731, + "learning_rate": 7.97529750998934e-07, + "loss": 0.78066581, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0958252, + "step": 11878, + "time_per_iteration": 3.8979172706604004 + }, + { + "auxiliary_loss_clip": 0.06407811, + "auxiliary_loss_mlp": 0.01264089, + "balance_loss_clip": 0.06277137, + "balance_loss_mlp": 0.01254153, + "epoch": 0.7142041184428077, + "flos": 24724661414400.0, + "grad_norm": 1.94673596086021, + "language_loss": 0.67341477, + "learning_rate": 7.972185658107535e-07, + "loss": 0.75013375, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09936523, + "step": 11879, + "time_per_iteration": 2.5100598335266113 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01262301, + "balance_loss_clip": 0.06275017, + "balance_loss_mlp": 0.01252037, + "epoch": 0.7142642416954758, + "flos": 21914667216000.0, + "grad_norm": 1.6535111085971643, + "language_loss": 0.69445574, + "learning_rate": 7.969074262321646e-07, + "loss": 0.77118039, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10266113, + "step": 11880, + "time_per_iteration": 2.507603406906128 + }, + { + "auxiliary_loss_clip": 0.0641037, + "auxiliary_loss_mlp": 0.01264833, + "balance_loss_clip": 0.06273447, + "balance_loss_mlp": 0.01254772, + "epoch": 0.7143243649481437, + "flos": 20810579892480.0, + "grad_norm": 2.0343383375931894, + "language_loss": 0.80753726, + "learning_rate": 7.965963322749674e-07, + "loss": 0.88428932, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10058594, + "step": 11881, + "time_per_iteration": 2.4606220722198486 + }, + { + "auxiliary_loss_clip": 0.06409037, + "auxiliary_loss_mlp": 0.01264183, + "balance_loss_clip": 0.06274998, + "balance_loss_mlp": 0.01254539, + "epoch": 0.7143844882008117, + "flos": 27242348244480.0, + "grad_norm": 1.58430278316452, + "language_loss": 0.64282894, + "learning_rate": 7.962852839509579e-07, + "loss": 0.71956116, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09643555, + "step": 11882, + "time_per_iteration": 2.56210994720459 + }, + { + "auxiliary_loss_clip": 0.06411886, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06275278, + "balance_loss_mlp": 0.01253473, + "epoch": 0.7144446114534796, + "flos": 17935150055040.0, + "grad_norm": 1.872999181445386, + "language_loss": 0.69193482, + "learning_rate": 7.959742812719304e-07, + "loss": 0.76868939, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10101318, + "step": 11883, + "time_per_iteration": 2.4767167568206787 + }, + { + "auxiliary_loss_clip": 0.06408374, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06277797, + "balance_loss_mlp": 0.01253761, + "epoch": 0.7145047347061476, + "flos": 20747282532480.0, + "grad_norm": 2.264759730138534, + "language_loss": 0.7842024, + "learning_rate": 7.956633242496788e-07, + "loss": 0.86092412, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10040283, + "step": 11884, + "time_per_iteration": 2.5488386154174805 + }, + { + "auxiliary_loss_clip": 0.06414723, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06273861, + "balance_loss_mlp": 0.01255517, + "epoch": 0.7145648579588155, + "flos": 21184967934720.0, + "grad_norm": 5.179157665604164, + "language_loss": 0.74281037, + "learning_rate": 7.953524128959954e-07, + "loss": 0.81962323, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1104126, + "step": 11885, + "time_per_iteration": 2.4918782711029053 + }, + { + "auxiliary_loss_clip": 0.06317447, + "auxiliary_loss_mlp": 0.01252483, + "balance_loss_clip": 0.06261733, + "balance_loss_mlp": 0.0125137, + "epoch": 0.7146249812114835, + "flos": 64805207702400.0, + "grad_norm": 0.9938747796430238, + "language_loss": 0.66419291, + "learning_rate": 7.95041547222669e-07, + "loss": 0.73989218, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01115417, + "step": 11886, + "time_per_iteration": 3.0856966972351074 + }, + { + "auxiliary_loss_clip": 0.06409487, + "auxiliary_loss_mlp": 0.01262772, + "balance_loss_clip": 0.06275956, + "balance_loss_mlp": 0.01253361, + "epoch": 0.7146851044641516, + "flos": 18119744599680.0, + "grad_norm": 1.9726076644282031, + "language_loss": 0.75334477, + "learning_rate": 7.947307272414874e-07, + "loss": 0.8300674, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09411621, + "step": 11887, + "time_per_iteration": 2.457226037979126 + }, + { + "auxiliary_loss_clip": 0.06411713, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06275448, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7147452277168195, + "flos": 19249715635200.0, + "grad_norm": 1.4837579130348453, + "language_loss": 0.71681702, + "learning_rate": 7.944199529642372e-07, + "loss": 0.79358119, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10003662, + "step": 11888, + "time_per_iteration": 2.5040013790130615 + }, + { + "auxiliary_loss_clip": 0.06412415, + "auxiliary_loss_mlp": 0.0126625, + "balance_loss_clip": 0.06273472, + "balance_loss_mlp": 0.01256266, + "epoch": 0.7148053509694875, + "flos": 23770773734400.0, + "grad_norm": 1.770417967060374, + "language_loss": 0.84754878, + "learning_rate": 7.941092244027041e-07, + "loss": 0.92433536, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09991455, + "step": 11889, + "time_per_iteration": 2.498847246170044 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263505, + "balance_loss_clip": 0.06273344, + "balance_loss_mlp": 0.0125401, + "epoch": 0.7148654742221554, + "flos": 22490770763520.0, + "grad_norm": 1.697229185177074, + "language_loss": 0.75894499, + "learning_rate": 7.937985415686695e-07, + "loss": 0.8356626, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0949707, + "step": 11890, + "time_per_iteration": 2.5205180644989014 + }, + { + "auxiliary_loss_clip": 0.06404347, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01255073, + "epoch": 0.7149255974748234, + "flos": 24685822247040.0, + "grad_norm": 1.9172824039571863, + "language_loss": 0.74212694, + "learning_rate": 7.934879044739147e-07, + "loss": 0.81881773, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09667969, + "step": 11891, + "time_per_iteration": 2.515684127807617 + }, + { + "auxiliary_loss_clip": 0.06409282, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01256963, + "epoch": 0.7149857207274913, + "flos": 18411464989440.0, + "grad_norm": 1.8378637994341889, + "language_loss": 0.68246537, + "learning_rate": 7.931773131302211e-07, + "loss": 0.75922883, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10101318, + "step": 11892, + "time_per_iteration": 2.4761176109313965 + }, + { + "auxiliary_loss_clip": 0.06410619, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06271711, + "balance_loss_mlp": 0.01254813, + "epoch": 0.7150458439801594, + "flos": 24975907482240.0, + "grad_norm": 1.712623401245163, + "language_loss": 0.74044412, + "learning_rate": 7.928667675493632e-07, + "loss": 0.81721264, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.11413574, + "step": 11893, + "time_per_iteration": 2.5127475261688232 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01253873, + "epoch": 0.7151059672328273, + "flos": 16696376092800.0, + "grad_norm": 2.7158372012320315, + "language_loss": 0.66545182, + "learning_rate": 7.925562677431185e-07, + "loss": 0.74223733, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11743164, + "step": 11894, + "time_per_iteration": 2.5338070392608643 + }, + { + "auxiliary_loss_clip": 0.06413232, + "auxiliary_loss_mlp": 0.01263618, + "balance_loss_clip": 0.06275386, + "balance_loss_mlp": 0.0125364, + "epoch": 0.7151660904854953, + "flos": 27279216840960.0, + "grad_norm": 7.327232790836601, + "language_loss": 0.77995753, + "learning_rate": 7.922458137232613e-07, + "loss": 0.85672593, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.09979248, + "step": 11895, + "time_per_iteration": 2.545539379119873 + }, + { + "auxiliary_loss_clip": 0.06408492, + "auxiliary_loss_mlp": 0.01262254, + "balance_loss_clip": 0.06271514, + "balance_loss_mlp": 0.01251776, + "epoch": 0.7152262137381632, + "flos": 18338063212800.0, + "grad_norm": 2.1720944859755327, + "language_loss": 0.69649661, + "learning_rate": 7.919354055015643e-07, + "loss": 0.77320409, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1048584, + "step": 11896, + "time_per_iteration": 2.5020852088928223 + }, + { + "auxiliary_loss_clip": 0.06410179, + "auxiliary_loss_mlp": 0.01270904, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01259203, + "epoch": 0.7152863369908312, + "flos": 21805822362240.0, + "grad_norm": 1.8979241109476415, + "language_loss": 0.8686198, + "learning_rate": 7.91625043089798e-07, + "loss": 0.94543064, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.11694336, + "step": 11897, + "time_per_iteration": 2.4981558322906494 + }, + { + "auxiliary_loss_clip": 0.06406087, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06274753, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7153464602434991, + "flos": 22164068494080.0, + "grad_norm": 1.7720635566598981, + "language_loss": 0.78347677, + "learning_rate": 7.913147264997304e-07, + "loss": 0.86017919, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10070801, + "step": 11898, + "time_per_iteration": 2.568208694458008 + }, + { + "auxiliary_loss_clip": 0.06413846, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.0627441, + "balance_loss_mlp": 0.01252868, + "epoch": 0.7154065834961671, + "flos": 24722732770560.0, + "grad_norm": 1.7720575063877593, + "language_loss": 0.73240674, + "learning_rate": 7.910044557431302e-07, + "loss": 0.8091805, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10656738, + "step": 11899, + "time_per_iteration": 3.9873409271240234 + }, + { + "auxiliary_loss_clip": 0.06406702, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06271633, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7154667067488351, + "flos": 22608084879360.0, + "grad_norm": 2.7184837218905216, + "language_loss": 0.75906515, + "learning_rate": 7.906942308317614e-07, + "loss": 0.83579266, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10565186, + "step": 11900, + "time_per_iteration": 2.48612380027771 + }, + { + "auxiliary_loss_clip": 0.06410916, + "auxiliary_loss_mlp": 0.01263744, + "balance_loss_clip": 0.06274971, + "balance_loss_mlp": 0.01254064, + "epoch": 0.7155268300015031, + "flos": 18777216061440.0, + "grad_norm": 1.8830405388899822, + "language_loss": 0.80537415, + "learning_rate": 7.903840517773886e-07, + "loss": 0.88212073, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09680176, + "step": 11901, + "time_per_iteration": 2.538071632385254 + }, + { + "auxiliary_loss_clip": 0.06413621, + "auxiliary_loss_mlp": 0.01265462, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.01254626, + "epoch": 0.7155869532541711, + "flos": 18302242792320.0, + "grad_norm": 1.8091761354011133, + "language_loss": 0.82077742, + "learning_rate": 7.900739185917744e-07, + "loss": 0.89756829, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10839844, + "step": 11902, + "time_per_iteration": 2.4796504974365234 + }, + { + "auxiliary_loss_clip": 0.06407838, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01254306, + "epoch": 0.715647076506839, + "flos": 11985063298560.0, + "grad_norm": 1.8489548968848413, + "language_loss": 0.68603027, + "learning_rate": 7.897638312866785e-07, + "loss": 0.76274538, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09356689, + "step": 11903, + "time_per_iteration": 2.502664566040039 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06273056, + "balance_loss_mlp": 0.0125591, + "epoch": 0.715707199759507, + "flos": 18957408266880.0, + "grad_norm": 1.5823213300778882, + "language_loss": 0.75905824, + "learning_rate": 7.894537898738589e-07, + "loss": 0.83577633, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 11904, + "time_per_iteration": 2.4838523864746094 + }, + { + "auxiliary_loss_clip": 0.06408757, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 0.06273915, + "balance_loss_mlp": 0.01255838, + "epoch": 0.7157673230121749, + "flos": 15309792328320.0, + "grad_norm": 1.6671251370747393, + "language_loss": 0.7200684, + "learning_rate": 7.891437943650727e-07, + "loss": 0.79682887, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.11456299, + "step": 11905, + "time_per_iteration": 2.5194296836853027 + }, + { + "auxiliary_loss_clip": 0.06407201, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 0.06273023, + "balance_loss_mlp": 0.01254377, + "epoch": 0.715827446264843, + "flos": 23228561963520.0, + "grad_norm": 1.7268826203228764, + "language_loss": 0.7871933, + "learning_rate": 7.88833844772076e-07, + "loss": 0.86390674, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09765625, + "step": 11906, + "time_per_iteration": 2.505692720413208 + }, + { + "auxiliary_loss_clip": 0.06311446, + "auxiliary_loss_mlp": 0.01249409, + "balance_loss_clip": 0.06255978, + "balance_loss_mlp": 0.01248228, + "epoch": 0.7158875695175109, + "flos": 60993011145600.0, + "grad_norm": 0.7186868091888179, + "language_loss": 0.55247056, + "learning_rate": 7.885239411066205e-07, + "loss": 0.62807906, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01179504, + "step": 11907, + "time_per_iteration": 3.077824354171753 + }, + { + "auxiliary_loss_clip": 0.06404838, + "auxiliary_loss_mlp": 0.01262889, + "balance_loss_clip": 0.06269851, + "balance_loss_mlp": 0.01252893, + "epoch": 0.7159476927701789, + "flos": 17134480765440.0, + "grad_norm": 1.7650418564568968, + "language_loss": 0.69603425, + "learning_rate": 7.882140833804593e-07, + "loss": 0.77271152, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09985352, + "step": 11908, + "time_per_iteration": 2.4865145683288574 + }, + { + "auxiliary_loss_clip": 0.06412758, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06276048, + "balance_loss_mlp": 0.01253625, + "epoch": 0.7160078160228468, + "flos": 22496934038400.0, + "grad_norm": 1.9817565541714355, + "language_loss": 0.71485305, + "learning_rate": 7.879042716053415e-07, + "loss": 0.79162526, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1083374, + "step": 11909, + "time_per_iteration": 2.5261456966400146 + }, + { + "auxiliary_loss_clip": 0.06411682, + "auxiliary_loss_mlp": 0.01264075, + "balance_loss_clip": 0.06275836, + "balance_loss_mlp": 0.01253316, + "epoch": 0.7160679392755148, + "flos": 30598704990720.0, + "grad_norm": 1.38087645688004, + "language_loss": 0.75330472, + "learning_rate": 7.875945057930144e-07, + "loss": 0.83006227, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10766602, + "step": 11910, + "time_per_iteration": 4.044188022613525 + }, + { + "auxiliary_loss_clip": 0.06406509, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01256098, + "epoch": 0.7161280625281827, + "flos": 21329884771200.0, + "grad_norm": 1.597685322541952, + "language_loss": 0.76519787, + "learning_rate": 7.872847859552251e-07, + "loss": 0.84192502, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10107422, + "step": 11911, + "time_per_iteration": 2.665767192840576 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01265159, + "balance_loss_clip": 0.06274366, + "balance_loss_mlp": 0.01254376, + "epoch": 0.7161881857808508, + "flos": 61873218288000.0, + "grad_norm": 1.667698649027388, + "language_loss": 0.58612812, + "learning_rate": 7.869751121037192e-07, + "loss": 0.66287452, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10791016, + "step": 11912, + "time_per_iteration": 2.9163358211517334 + }, + { + "auxiliary_loss_clip": 0.06408441, + "auxiliary_loss_mlp": 0.01264274, + "balance_loss_clip": 0.06275295, + "balance_loss_mlp": 0.0125398, + "epoch": 0.7162483090335187, + "flos": 20818126759680.0, + "grad_norm": 1.9057750004055583, + "language_loss": 0.78541219, + "learning_rate": 7.866654842502376e-07, + "loss": 0.86213928, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10296631, + "step": 11913, + "time_per_iteration": 2.496882438659668 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01267442, + "balance_loss_clip": 0.06273191, + "balance_loss_mlp": 0.01257864, + "epoch": 0.7163084322861867, + "flos": 24104393965440.0, + "grad_norm": 1.590904649851159, + "language_loss": 0.7420674, + "learning_rate": 7.863559024065234e-07, + "loss": 0.81879842, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0958252, + "step": 11914, + "time_per_iteration": 3.96821665763855 + }, + { + "auxiliary_loss_clip": 0.06403452, + "auxiliary_loss_mlp": 0.01261289, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01251574, + "epoch": 0.7163685555388547, + "flos": 20086540761600.0, + "grad_norm": 1.6632734389842445, + "language_loss": 0.74058056, + "learning_rate": 7.860463665843143e-07, + "loss": 0.81722796, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.097229, + "step": 11915, + "time_per_iteration": 2.4962167739868164 + }, + { + "auxiliary_loss_clip": 0.06405881, + "auxiliary_loss_mlp": 0.01264509, + "balance_loss_clip": 0.06270003, + "balance_loss_mlp": 0.01254323, + "epoch": 0.7164286787915226, + "flos": 17462692408320.0, + "grad_norm": 1.6596246771079706, + "language_loss": 0.81293082, + "learning_rate": 7.85736876795349e-07, + "loss": 0.88963467, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10186768, + "step": 11916, + "time_per_iteration": 2.5293524265289307 + }, + { + "auxiliary_loss_clip": 0.06407885, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272584, + "balance_loss_mlp": 0.01257555, + "epoch": 0.7164888020441906, + "flos": 19724982393600.0, + "grad_norm": 1.9910779108762084, + "language_loss": 0.68661398, + "learning_rate": 7.854274330513626e-07, + "loss": 0.76336563, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09729004, + "step": 11917, + "time_per_iteration": 2.5082740783691406 + }, + { + "auxiliary_loss_clip": 0.0640521, + "auxiliary_loss_mlp": 0.01268808, + "balance_loss_clip": 0.06270327, + "balance_loss_mlp": 0.01258127, + "epoch": 0.7165489252968585, + "flos": 21476939886720.0, + "grad_norm": 1.5888688683522953, + "language_loss": 0.76160645, + "learning_rate": 7.851180353640896e-07, + "loss": 0.8383466, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10687256, + "step": 11918, + "time_per_iteration": 3.8991646766662598 + }, + { + "auxiliary_loss_clip": 0.06316125, + "auxiliary_loss_mlp": 0.01260952, + "balance_loss_clip": 0.06260598, + "balance_loss_mlp": 0.01259661, + "epoch": 0.7166090485495266, + "flos": 69949426216320.0, + "grad_norm": 0.6355552708819127, + "language_loss": 0.53723788, + "learning_rate": 7.848086837452639e-07, + "loss": 0.61300862, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01291656, + "step": 11919, + "time_per_iteration": 3.2083816528320312 + }, + { + "auxiliary_loss_clip": 0.06411423, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06274234, + "balance_loss_mlp": 0.01255948, + "epoch": 0.7166691718021945, + "flos": 27351151171200.0, + "grad_norm": 2.064464674479712, + "language_loss": 0.69286996, + "learning_rate": 7.844993782066132e-07, + "loss": 0.76964575, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10211182, + "step": 11920, + "time_per_iteration": 2.6113531589508057 + }, + { + "auxiliary_loss_clip": 0.064086, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06273469, + "balance_loss_mlp": 0.01255106, + "epoch": 0.7167292950548625, + "flos": 30416667995520.0, + "grad_norm": 1.8345459175809258, + "language_loss": 0.75019145, + "learning_rate": 7.841901187598678e-07, + "loss": 0.82692945, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10101318, + "step": 11921, + "time_per_iteration": 2.5700902938842773 + }, + { + "auxiliary_loss_clip": 0.06416579, + "auxiliary_loss_mlp": 0.01267308, + "balance_loss_clip": 0.06275436, + "balance_loss_mlp": 0.01254177, + "epoch": 0.7167894183075304, + "flos": 14575942270080.0, + "grad_norm": 1.9367359294583022, + "language_loss": 0.75734651, + "learning_rate": 7.83880905416755e-07, + "loss": 0.83418536, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.13128662, + "step": 11922, + "time_per_iteration": 2.465078830718994 + }, + { + "auxiliary_loss_clip": 0.06313948, + "auxiliary_loss_mlp": 0.0125594, + "balance_loss_clip": 0.06258468, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7168495415601984, + "flos": 64128365948160.0, + "grad_norm": 0.7346387486828846, + "language_loss": 0.55178893, + "learning_rate": 7.83571738189001e-07, + "loss": 0.62748784, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01166534, + "step": 11923, + "time_per_iteration": 2.953462839126587 + }, + { + "auxiliary_loss_clip": 0.06408657, + "auxiliary_loss_mlp": 0.01267487, + "balance_loss_clip": 0.06272471, + "balance_loss_mlp": 0.01257062, + "epoch": 0.7169096648128663, + "flos": 24688421723520.0, + "grad_norm": 1.4959305525203388, + "language_loss": 0.77240855, + "learning_rate": 7.832626170883279e-07, + "loss": 0.84916997, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10430908, + "step": 11924, + "time_per_iteration": 2.540371894836426 + }, + { + "auxiliary_loss_clip": 0.06404062, + "auxiliary_loss_mlp": 0.01264587, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7169697880655344, + "flos": 20673754974720.0, + "grad_norm": 1.6022064591556118, + "language_loss": 0.68295527, + "learning_rate": 7.829535421264588e-07, + "loss": 0.75964177, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0980835, + "step": 11925, + "time_per_iteration": 2.517883539199829 + }, + { + "auxiliary_loss_clip": 0.06401929, + "auxiliary_loss_mlp": 0.01264464, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7170299113182023, + "flos": 21039044849280.0, + "grad_norm": 1.4805989114047955, + "language_loss": 0.77453327, + "learning_rate": 7.826445133151133e-07, + "loss": 0.85119712, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09771729, + "step": 11926, + "time_per_iteration": 2.525294065475464 + }, + { + "auxiliary_loss_clip": 0.06412005, + "auxiliary_loss_mlp": 0.01265458, + "balance_loss_clip": 0.06270812, + "balance_loss_mlp": 0.01254652, + "epoch": 0.7170900345708703, + "flos": 22899931050240.0, + "grad_norm": 2.0777865418109798, + "language_loss": 0.77830517, + "learning_rate": 7.823355306660093e-07, + "loss": 0.85507977, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10809326, + "step": 11927, + "time_per_iteration": 2.5361175537109375 + }, + { + "auxiliary_loss_clip": 0.06405352, + "auxiliary_loss_mlp": 0.012651, + "balance_loss_clip": 0.06273961, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7171501578235383, + "flos": 15523331258880.0, + "grad_norm": 1.5750787532555974, + "language_loss": 0.69694316, + "learning_rate": 7.820265941908642e-07, + "loss": 0.77364767, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09960938, + "step": 11928, + "time_per_iteration": 2.5053482055664062 + }, + { + "auxiliary_loss_clip": 0.06404196, + "auxiliary_loss_mlp": 0.01263642, + "balance_loss_clip": 0.06272393, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7172102810762062, + "flos": 26111496741120.0, + "grad_norm": 1.7658790260288333, + "language_loss": 0.65507495, + "learning_rate": 7.817177039013931e-07, + "loss": 0.73175335, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10076904, + "step": 11929, + "time_per_iteration": 2.5298080444335938 + }, + { + "auxiliary_loss_clip": 0.06411615, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06275426, + "balance_loss_mlp": 0.01254455, + "epoch": 0.7172704043288742, + "flos": 21513011869440.0, + "grad_norm": 1.88648366975717, + "language_loss": 0.70105934, + "learning_rate": 7.81408859809308e-07, + "loss": 0.7778219, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10186768, + "step": 11930, + "time_per_iteration": 2.492851972579956 + }, + { + "auxiliary_loss_clip": 0.06407914, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 0.06271791, + "balance_loss_mlp": 0.01255675, + "epoch": 0.7173305275815421, + "flos": 18776964499200.0, + "grad_norm": 1.6767880793565944, + "language_loss": 0.80551809, + "learning_rate": 7.811000619263219e-07, + "loss": 0.88225758, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10351562, + "step": 11931, + "time_per_iteration": 2.5129940509796143 + }, + { + "auxiliary_loss_clip": 0.06405962, + "auxiliary_loss_mlp": 0.01263185, + "balance_loss_clip": 0.06272676, + "balance_loss_mlp": 0.01253398, + "epoch": 0.7173906508342102, + "flos": 16185372768000.0, + "grad_norm": 2.3164344242090245, + "language_loss": 0.78938711, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8660785, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 11932, + "time_per_iteration": 2.458064317703247 + }, + { + "auxiliary_loss_clip": 0.06406456, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01255163, + "epoch": 0.7174507740868781, + "flos": 26620948765440.0, + "grad_norm": 2.941669914403725, + "language_loss": 0.75155187, + "learning_rate": 7.804826048344803e-07, + "loss": 0.82826775, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09960938, + "step": 11933, + "time_per_iteration": 2.5739805698394775 + }, + { + "auxiliary_loss_clip": 0.06418844, + "auxiliary_loss_mlp": 0.01266714, + "balance_loss_clip": 0.06277472, + "balance_loss_mlp": 0.01254858, + "epoch": 0.7175108973395461, + "flos": 18437264847360.0, + "grad_norm": 7.531680164120171, + "language_loss": 0.69827807, + "learning_rate": 7.801739456490388e-07, + "loss": 0.77513361, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11859131, + "step": 11934, + "time_per_iteration": 2.4455020427703857 + }, + { + "auxiliary_loss_clip": 0.06406108, + "auxiliary_loss_mlp": 0.01263916, + "balance_loss_clip": 0.06272999, + "balance_loss_mlp": 0.0125395, + "epoch": 0.717571020592214, + "flos": 23921769991680.0, + "grad_norm": 2.2343261949316013, + "language_loss": 0.86673319, + "learning_rate": 7.798653327195237e-07, + "loss": 0.9434334, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09967041, + "step": 11935, + "time_per_iteration": 2.528456211090088 + }, + { + "auxiliary_loss_clip": 0.06406541, + "auxiliary_loss_mlp": 0.0126352, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01253202, + "epoch": 0.717631143844882, + "flos": 38266647828480.0, + "grad_norm": 1.602642316585254, + "language_loss": 0.73995256, + "learning_rate": 7.795567660576388e-07, + "loss": 0.81665319, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10314941, + "step": 11936, + "time_per_iteration": 2.67246413230896 + }, + { + "auxiliary_loss_clip": 0.06313888, + "auxiliary_loss_mlp": 0.01249886, + "balance_loss_clip": 0.06258012, + "balance_loss_mlp": 0.01248772, + "epoch": 0.7176912670975499, + "flos": 65536961408640.0, + "grad_norm": 0.7536478557805156, + "language_loss": 0.55813849, + "learning_rate": 7.79248245675082e-07, + "loss": 0.63377625, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.0111618, + "step": 11937, + "time_per_iteration": 3.14385724067688 + }, + { + "auxiliary_loss_clip": 0.06410685, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01254042, + "epoch": 0.717751390350218, + "flos": 31288433074560.0, + "grad_norm": 3.0696111718968555, + "language_loss": 0.54891688, + "learning_rate": 7.789397715835542e-07, + "loss": 0.62567306, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10900879, + "step": 11938, + "time_per_iteration": 2.612314462661743 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.01261396, + "balance_loss_clip": 0.06274119, + "balance_loss_mlp": 0.01251811, + "epoch": 0.7178115136028859, + "flos": 19864155225600.0, + "grad_norm": 1.5149026364788483, + "language_loss": 0.77031577, + "learning_rate": 7.786313437947527e-07, + "loss": 0.84697324, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09576416, + "step": 11939, + "time_per_iteration": 3.9376840591430664 + }, + { + "auxiliary_loss_clip": 0.06311642, + "auxiliary_loss_mlp": 0.01253055, + "balance_loss_clip": 0.06255894, + "balance_loss_mlp": 0.01251996, + "epoch": 0.7178716368555539, + "flos": 64369576725120.0, + "grad_norm": 0.7379302398056043, + "language_loss": 0.6123156, + "learning_rate": 7.783229623203738e-07, + "loss": 0.68796259, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01060486, + "step": 11940, + "time_per_iteration": 3.106687545776367 + }, + { + "auxiliary_loss_clip": 0.0640372, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.06272845, + "balance_loss_mlp": 0.01253209, + "epoch": 0.7179317601082219, + "flos": 26770184087040.0, + "grad_norm": 1.6027609306181398, + "language_loss": 0.59101206, + "learning_rate": 7.780146271721097e-07, + "loss": 0.66767597, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09466553, + "step": 11941, + "time_per_iteration": 2.6309211254119873 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06273725, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7179918833608898, + "flos": 23520575842560.0, + "grad_norm": 1.7346427869736905, + "language_loss": 0.79611468, + "learning_rate": 7.777063383616543e-07, + "loss": 0.87280202, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09429932, + "step": 11942, + "time_per_iteration": 2.5131733417510986 + }, + { + "auxiliary_loss_clip": 0.06404739, + "auxiliary_loss_mlp": 0.01268984, + "balance_loss_clip": 0.06271753, + "balance_loss_mlp": 0.01258345, + "epoch": 0.7180520066135578, + "flos": 17171349361920.0, + "grad_norm": 2.144705941723289, + "language_loss": 0.66274554, + "learning_rate": 7.773980959006968e-07, + "loss": 0.73948282, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10638428, + "step": 11943, + "time_per_iteration": 2.5236313343048096 + }, + { + "auxiliary_loss_clip": 0.06407227, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06273103, + "balance_loss_mlp": 0.01257798, + "epoch": 0.7181121298662257, + "flos": 17572417729920.0, + "grad_norm": 1.703985250404805, + "language_loss": 0.78651738, + "learning_rate": 7.770898998009254e-07, + "loss": 0.86327153, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 11944, + "time_per_iteration": 2.489701271057129 + }, + { + "auxiliary_loss_clip": 0.06407581, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7181722531188938, + "flos": 11952471260160.0, + "grad_norm": 2.3927781343480024, + "language_loss": 0.62825882, + "learning_rate": 7.767817500740277e-07, + "loss": 0.70501947, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.12243652, + "step": 11945, + "time_per_iteration": 2.523031711578369 + }, + { + "auxiliary_loss_clip": 0.0631476, + "auxiliary_loss_mlp": 0.0125155, + "balance_loss_clip": 0.06259042, + "balance_loss_mlp": 0.01250277, + "epoch": 0.7182323763715617, + "flos": 65522664288000.0, + "grad_norm": 0.6825637115139678, + "language_loss": 0.5092659, + "learning_rate": 7.76473646731689e-07, + "loss": 0.58492899, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01273346, + "step": 11946, + "time_per_iteration": 3.0530238151550293 + }, + { + "auxiliary_loss_clip": 0.06408353, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01254553, + "epoch": 0.7182924996242297, + "flos": 20637137940480.0, + "grad_norm": 1.6252151206202925, + "language_loss": 0.7525813, + "learning_rate": 7.761655897855925e-07, + "loss": 0.8293165, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 11947, + "time_per_iteration": 2.535158157348633 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01266117, + "balance_loss_clip": 0.06270691, + "balance_loss_mlp": 0.01256556, + "epoch": 0.7183526228768976, + "flos": 16221947875200.0, + "grad_norm": 1.376797817491515, + "language_loss": 0.7316047, + "learning_rate": 7.758575792474187e-07, + "loss": 0.80828649, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09564209, + "step": 11948, + "time_per_iteration": 2.465437173843384 + }, + { + "auxiliary_loss_clip": 0.06408493, + "auxiliary_loss_mlp": 0.01270033, + "balance_loss_clip": 0.06272705, + "balance_loss_mlp": 0.0125959, + "epoch": 0.7184127461295656, + "flos": 22238518446720.0, + "grad_norm": 1.618352037269111, + "language_loss": 0.71604127, + "learning_rate": 7.755496151288483e-07, + "loss": 0.79282653, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10443115, + "step": 11949, + "time_per_iteration": 2.5727827548980713 + }, + { + "auxiliary_loss_clip": 0.06405893, + "auxiliary_loss_mlp": 0.01265064, + "balance_loss_clip": 0.06273241, + "balance_loss_mlp": 0.01255659, + "epoch": 0.7184728693822335, + "flos": 27351863930880.0, + "grad_norm": 2.584174612007466, + "language_loss": 0.76537007, + "learning_rate": 7.752416974415598e-07, + "loss": 0.84207964, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09411621, + "step": 11950, + "time_per_iteration": 4.074851751327515 + }, + { + "auxiliary_loss_clip": 0.0641187, + "auxiliary_loss_mlp": 0.01266048, + "balance_loss_clip": 0.06275279, + "balance_loss_mlp": 0.01254968, + "epoch": 0.7185329926349016, + "flos": 16514129462400.0, + "grad_norm": 2.1607831663839163, + "language_loss": 0.67883182, + "learning_rate": 7.749338261972282e-07, + "loss": 0.75561094, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11071777, + "step": 11951, + "time_per_iteration": 2.4646525382995605 + }, + { + "auxiliary_loss_clip": 0.06409188, + "auxiliary_loss_mlp": 0.0126641, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254549, + "epoch": 0.7185931158875695, + "flos": 23957800047360.0, + "grad_norm": 1.7824491955160577, + "language_loss": 0.78629339, + "learning_rate": 7.746260014075286e-07, + "loss": 0.86304945, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11865234, + "step": 11952, + "time_per_iteration": 2.516615390777588 + }, + { + "auxiliary_loss_clip": 0.06412063, + "auxiliary_loss_mlp": 0.01268038, + "balance_loss_clip": 0.06272954, + "balance_loss_mlp": 0.0125725, + "epoch": 0.7186532391402375, + "flos": 26549265997440.0, + "grad_norm": 1.8155741690117748, + "language_loss": 0.74781901, + "learning_rate": 7.743182230841352e-07, + "loss": 0.82462001, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10803223, + "step": 11953, + "time_per_iteration": 2.527876853942871 + }, + { + "auxiliary_loss_clip": 0.06407471, + "auxiliary_loss_mlp": 0.01266403, + "balance_loss_clip": 0.06272335, + "balance_loss_mlp": 0.01256223, + "epoch": 0.7187133623929055, + "flos": 22389682412160.0, + "grad_norm": 1.6183356638137696, + "language_loss": 0.73045003, + "learning_rate": 7.740104912387164e-07, + "loss": 0.80718875, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10180664, + "step": 11954, + "time_per_iteration": 3.9654276371002197 + }, + { + "auxiliary_loss_clip": 0.06407467, + "auxiliary_loss_mlp": 0.01268821, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01258372, + "epoch": 0.7187734856455734, + "flos": 15785184867840.0, + "grad_norm": 1.5034974225164766, + "language_loss": 0.74558902, + "learning_rate": 7.737028058829425e-07, + "loss": 0.82235181, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10455322, + "step": 11955, + "time_per_iteration": 2.478512763977051 + }, + { + "auxiliary_loss_clip": 0.0640816, + "auxiliary_loss_mlp": 0.01262735, + "balance_loss_clip": 0.06272267, + "balance_loss_mlp": 0.01253032, + "epoch": 0.7188336088982414, + "flos": 31767766755840.0, + "grad_norm": 1.8388372007030418, + "language_loss": 0.73576057, + "learning_rate": 7.733951670284817e-07, + "loss": 0.81246948, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09698486, + "step": 11956, + "time_per_iteration": 2.5664751529693604 + }, + { + "auxiliary_loss_clip": 0.06408941, + "auxiliary_loss_mlp": 0.01266307, + "balance_loss_clip": 0.06270766, + "balance_loss_mlp": 0.01255793, + "epoch": 0.7188937321509093, + "flos": 21470734684800.0, + "grad_norm": 1.7841137783080476, + "language_loss": 0.70991242, + "learning_rate": 7.730875746869987e-07, + "loss": 0.7866649, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1050415, + "step": 11957, + "time_per_iteration": 2.5579633712768555 + }, + { + "auxiliary_loss_clip": 0.0641226, + "auxiliary_loss_mlp": 0.01268285, + "balance_loss_clip": 0.06273985, + "balance_loss_mlp": 0.01256966, + "epoch": 0.7189538554035774, + "flos": 27278839497600.0, + "grad_norm": 1.7957042197859685, + "language_loss": 0.74078369, + "learning_rate": 7.727800288701582e-07, + "loss": 0.81758916, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.11322021, + "step": 11958, + "time_per_iteration": 3.9170804023742676 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.0125484, + "epoch": 0.7190139786562453, + "flos": 21587168332800.0, + "grad_norm": 1.5040650051227977, + "language_loss": 0.84225762, + "learning_rate": 7.724725295896215e-07, + "loss": 0.91893852, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09667969, + "step": 11959, + "time_per_iteration": 2.506953239440918 + }, + { + "auxiliary_loss_clip": 0.06412622, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06274716, + "balance_loss_mlp": 0.01253665, + "epoch": 0.7190741019089133, + "flos": 26727990756480.0, + "grad_norm": 1.629776742462507, + "language_loss": 0.82108045, + "learning_rate": 7.7216507685705e-07, + "loss": 0.89784372, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10046387, + "step": 11960, + "time_per_iteration": 2.5172626972198486 + }, + { + "auxiliary_loss_clip": 0.06408188, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06274426, + "balance_loss_mlp": 0.01256051, + "epoch": 0.7191342251615812, + "flos": 26112041792640.0, + "grad_norm": 2.013110188990865, + "language_loss": 0.7794981, + "learning_rate": 7.718576706841013e-07, + "loss": 0.85624301, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10241699, + "step": 11961, + "time_per_iteration": 2.585214853286743 + }, + { + "auxiliary_loss_clip": 0.06404266, + "auxiliary_loss_mlp": 0.01266808, + "balance_loss_clip": 0.06274937, + "balance_loss_mlp": 0.01257164, + "epoch": 0.7191943484142492, + "flos": 22973794024320.0, + "grad_norm": 1.3445368370245, + "language_loss": 0.75350589, + "learning_rate": 7.715503110824326e-07, + "loss": 0.83021665, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09643555, + "step": 11962, + "time_per_iteration": 2.5126750469207764 + }, + { + "auxiliary_loss_clip": 0.06408066, + "auxiliary_loss_mlp": 0.01264043, + "balance_loss_clip": 0.06272985, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7192544716669171, + "flos": 22571970969600.0, + "grad_norm": 1.8990374225745255, + "language_loss": 0.7543835, + "learning_rate": 7.712429980637001e-07, + "loss": 0.83110464, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10614014, + "step": 11963, + "time_per_iteration": 2.531531572341919 + }, + { + "auxiliary_loss_clip": 0.0641598, + "auxiliary_loss_mlp": 0.01268254, + "balance_loss_clip": 0.06276201, + "balance_loss_mlp": 0.01256888, + "epoch": 0.7193145949195852, + "flos": 18986981558400.0, + "grad_norm": 2.117256305222674, + "language_loss": 0.81201178, + "learning_rate": 7.709357316395564e-07, + "loss": 0.88885415, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11364746, + "step": 11964, + "time_per_iteration": 2.455134630203247 + }, + { + "auxiliary_loss_clip": 0.06404482, + "auxiliary_loss_mlp": 0.01268831, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01258854, + "epoch": 0.7193747181722531, + "flos": 18010061205120.0, + "grad_norm": 1.7059884029893508, + "language_loss": 0.75202858, + "learning_rate": 7.70628511821652e-07, + "loss": 0.8287617, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09979248, + "step": 11965, + "time_per_iteration": 2.49127459526062 + }, + { + "auxiliary_loss_clip": 0.06410991, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7194348414249211, + "flos": 24396323990400.0, + "grad_norm": 1.448883188350496, + "language_loss": 0.77801377, + "learning_rate": 7.703213386216377e-07, + "loss": 0.85478151, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1060791, + "step": 11966, + "time_per_iteration": 2.5172245502471924 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01265324, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01254953, + "epoch": 0.7194949646775891, + "flos": 22169938279680.0, + "grad_norm": 1.704579112714729, + "language_loss": 0.73619503, + "learning_rate": 7.700142120511619e-07, + "loss": 0.81289935, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10375977, + "step": 11967, + "time_per_iteration": 2.5002834796905518 + }, + { + "auxiliary_loss_clip": 0.06399344, + "auxiliary_loss_mlp": 0.01265984, + "balance_loss_clip": 0.06271313, + "balance_loss_mlp": 0.01256679, + "epoch": 0.719555087930257, + "flos": 20272560825600.0, + "grad_norm": 1.5295572568049065, + "language_loss": 0.82314783, + "learning_rate": 7.6970713212187e-07, + "loss": 0.89980114, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09307861, + "step": 11968, + "time_per_iteration": 2.5851659774780273 + }, + { + "auxiliary_loss_clip": 0.06403178, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.0627176, + "balance_loss_mlp": 0.01255262, + "epoch": 0.719615211182925, + "flos": 24723026259840.0, + "grad_norm": 1.755748062324177, + "language_loss": 0.76839387, + "learning_rate": 7.69400098845407e-07, + "loss": 0.84507906, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10070801, + "step": 11969, + "time_per_iteration": 2.52701997756958 + }, + { + "auxiliary_loss_clip": 0.06404562, + "auxiliary_loss_mlp": 0.01266338, + "balance_loss_clip": 0.06269367, + "balance_loss_mlp": 0.01255973, + "epoch": 0.719675334435593, + "flos": 20015570753280.0, + "grad_norm": 1.3860945342705195, + "language_loss": 0.71083385, + "learning_rate": 7.69093112233417e-07, + "loss": 0.78754288, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1036377, + "step": 11970, + "time_per_iteration": 2.4650230407714844 + }, + { + "auxiliary_loss_clip": 0.0631284, + "auxiliary_loss_mlp": 0.01254485, + "balance_loss_clip": 0.06257641, + "balance_loss_mlp": 0.0125341, + "epoch": 0.719735457688261, + "flos": 44215965169920.0, + "grad_norm": 0.888192753215213, + "language_loss": 0.60509741, + "learning_rate": 7.68786172297538e-07, + "loss": 0.68077064, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 11971, + "time_per_iteration": 3.049323558807373 + }, + { + "auxiliary_loss_clip": 0.06412932, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254541, + "epoch": 0.7197955809409289, + "flos": 16808952453120.0, + "grad_norm": 1.9914531833581635, + "language_loss": 0.79825729, + "learning_rate": 7.684792790494105e-07, + "loss": 0.87503314, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10107422, + "step": 11972, + "time_per_iteration": 2.4930012226104736 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01266584, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7198557041935969, + "flos": 24542330929920.0, + "grad_norm": 1.4491238198032386, + "language_loss": 0.76038206, + "learning_rate": 7.681724325006733e-07, + "loss": 0.83711761, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10095215, + "step": 11973, + "time_per_iteration": 2.548208475112915 + }, + { + "auxiliary_loss_clip": 0.06313819, + "auxiliary_loss_mlp": 0.01251276, + "balance_loss_clip": 0.06258664, + "balance_loss_mlp": 0.01250185, + "epoch": 0.7199158274462648, + "flos": 70729006204800.0, + "grad_norm": 0.8373324972209466, + "language_loss": 0.57018536, + "learning_rate": 7.6786563266296e-07, + "loss": 0.64583629, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01093292, + "step": 11974, + "time_per_iteration": 2.9727988243103027 + }, + { + "auxiliary_loss_clip": 0.06406881, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01256082, + "epoch": 0.7199759506989328, + "flos": 29355151345920.0, + "grad_norm": 2.3495582662204164, + "language_loss": 0.61703098, + "learning_rate": 7.675588795479062e-07, + "loss": 0.69376105, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10058594, + "step": 11975, + "time_per_iteration": 2.5667810440063477 + }, + { + "auxiliary_loss_clip": 0.06407548, + "auxiliary_loss_mlp": 0.01266502, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01256465, + "epoch": 0.7200360739516007, + "flos": 24646689590400.0, + "grad_norm": 1.7506172714592478, + "language_loss": 0.6773572, + "learning_rate": 7.672521731671425e-07, + "loss": 0.7540977, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10040283, + "step": 11976, + "time_per_iteration": 2.5304412841796875 + }, + { + "auxiliary_loss_clip": 0.06406543, + "auxiliary_loss_mlp": 0.01261585, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01252024, + "epoch": 0.7200961972042688, + "flos": 20819007227520.0, + "grad_norm": 1.8109272198274133, + "language_loss": 0.6749649, + "learning_rate": 7.669455135323004e-07, + "loss": 0.75164616, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09564209, + "step": 11977, + "time_per_iteration": 2.547656536102295 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.012691, + "balance_loss_clip": 0.06271597, + "balance_loss_mlp": 0.01258336, + "epoch": 0.7201563204569367, + "flos": 31253493121920.0, + "grad_norm": 1.5436676151403905, + "language_loss": 0.754664, + "learning_rate": 7.666389006550074e-07, + "loss": 0.83143568, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10766602, + "step": 11978, + "time_per_iteration": 4.067101240158081 + }, + { + "auxiliary_loss_clip": 0.06403241, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 0.06271459, + "balance_loss_mlp": 0.01254327, + "epoch": 0.7202164437096047, + "flos": 26658655902720.0, + "grad_norm": 1.78319056574555, + "language_loss": 0.78890365, + "learning_rate": 7.663323345468908e-07, + "loss": 0.86557764, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09832764, + "step": 11979, + "time_per_iteration": 2.5176994800567627 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255999, + "epoch": 0.7202765669622727, + "flos": 25966999175040.0, + "grad_norm": 1.5387882255892862, + "language_loss": 0.64881861, + "learning_rate": 7.660258152195767e-07, + "loss": 0.72552878, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10235596, + "step": 11980, + "time_per_iteration": 2.5968124866485596 + }, + { + "auxiliary_loss_clip": 0.06408978, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06272249, + "balance_loss_mlp": 0.01254618, + "epoch": 0.7203366902149406, + "flos": 28519961374080.0, + "grad_norm": 1.8098282466640043, + "language_loss": 0.67242014, + "learning_rate": 7.657193426846871e-07, + "loss": 0.74916333, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10717773, + "step": 11981, + "time_per_iteration": 2.5330793857574463 + }, + { + "auxiliary_loss_clip": 0.0640622, + "auxiliary_loss_mlp": 0.01265599, + "balance_loss_clip": 0.06270846, + "balance_loss_mlp": 0.01255555, + "epoch": 0.7203968134676086, + "flos": 21112446625920.0, + "grad_norm": 1.6958532399278234, + "language_loss": 0.74167675, + "learning_rate": 7.65412916953843e-07, + "loss": 0.81839496, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10040283, + "step": 11982, + "time_per_iteration": 2.510929584503174 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01266184, + "balance_loss_clip": 0.06270775, + "balance_loss_mlp": 0.01256802, + "epoch": 0.7204569367202766, + "flos": 18337937431680.0, + "grad_norm": 1.8860370503158916, + "language_loss": 0.65837574, + "learning_rate": 7.65106538038665e-07, + "loss": 0.73509502, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09387207, + "step": 11983, + "time_per_iteration": 2.4505462646484375 + }, + { + "auxiliary_loss_clip": 0.06406046, + "auxiliary_loss_mlp": 0.01264887, + "balance_loss_clip": 0.06271453, + "balance_loss_mlp": 0.01254445, + "epoch": 0.7205170599729446, + "flos": 23261279783040.0, + "grad_norm": 1.4437514392705604, + "language_loss": 0.66617727, + "learning_rate": 7.648002059507715e-07, + "loss": 0.74288666, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10449219, + "step": 11984, + "time_per_iteration": 2.547555446624756 + }, + { + "auxiliary_loss_clip": 0.06413494, + "auxiliary_loss_mlp": 0.01268675, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01257994, + "epoch": 0.7205771832256125, + "flos": 20127140864640.0, + "grad_norm": 1.765838717363193, + "language_loss": 0.74360126, + "learning_rate": 7.644939207017771e-07, + "loss": 0.82042295, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10687256, + "step": 11985, + "time_per_iteration": 2.4865455627441406 + }, + { + "auxiliary_loss_clip": 0.06406047, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06272492, + "balance_loss_mlp": 0.01255652, + "epoch": 0.7206373064782805, + "flos": 27709648865280.0, + "grad_norm": 1.7467712742919994, + "language_loss": 0.62577748, + "learning_rate": 7.641876823032977e-07, + "loss": 0.70249057, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0960083, + "step": 11986, + "time_per_iteration": 2.5774106979370117 + }, + { + "auxiliary_loss_clip": 0.06410712, + "auxiliary_loss_mlp": 0.0127024, + "balance_loss_clip": 0.06274345, + "balance_loss_mlp": 0.01258951, + "epoch": 0.7206974297309484, + "flos": 17974031149440.0, + "grad_norm": 1.663451860117408, + "language_loss": 0.72484905, + "learning_rate": 7.638814907669455e-07, + "loss": 0.80165857, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.11291504, + "step": 11987, + "time_per_iteration": 2.4724771976470947 + }, + { + "auxiliary_loss_clip": 0.06410339, + "auxiliary_loss_mlp": 0.01263822, + "balance_loss_clip": 0.06273559, + "balance_loss_mlp": 0.01253689, + "epoch": 0.7207575529836164, + "flos": 16988893096320.0, + "grad_norm": 2.5242604109279574, + "language_loss": 0.78976148, + "learning_rate": 7.635753461043301e-07, + "loss": 0.86650312, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10125732, + "step": 11988, + "time_per_iteration": 2.495361566543579 + }, + { + "auxiliary_loss_clip": 0.06404472, + "auxiliary_loss_mlp": 0.01263556, + "balance_loss_clip": 0.06269506, + "balance_loss_mlp": 0.01253489, + "epoch": 0.7208176762362843, + "flos": 18732465181440.0, + "grad_norm": 1.7087764254113869, + "language_loss": 0.79046804, + "learning_rate": 7.632692483270618e-07, + "loss": 0.86714828, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10064697, + "step": 11989, + "time_per_iteration": 2.5043447017669678 + }, + { + "auxiliary_loss_clip": 0.06400688, + "auxiliary_loss_mlp": 0.01267699, + "balance_loss_clip": 0.06270982, + "balance_loss_mlp": 0.01257364, + "epoch": 0.7208777994889524, + "flos": 18740515173120.0, + "grad_norm": 1.790178990562424, + "language_loss": 0.8290503, + "learning_rate": 7.629631974467481e-07, + "loss": 0.90573412, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10345459, + "step": 11990, + "time_per_iteration": 3.926800012588501 + }, + { + "auxiliary_loss_clip": 0.064039, + "auxiliary_loss_mlp": 0.01274305, + "balance_loss_clip": 0.06273188, + "balance_loss_mlp": 0.0126484, + "epoch": 0.7209379227416203, + "flos": 14798705149440.0, + "grad_norm": 2.036094389130557, + "language_loss": 0.7637105, + "learning_rate": 7.626571934749931e-07, + "loss": 0.84049255, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09472656, + "step": 11991, + "time_per_iteration": 2.504420042037964 + }, + { + "auxiliary_loss_clip": 0.06401916, + "auxiliary_loss_mlp": 0.01266823, + "balance_loss_clip": 0.06271645, + "balance_loss_mlp": 0.01256976, + "epoch": 0.7209980459942883, + "flos": 29643559499520.0, + "grad_norm": 1.4029888682461984, + "language_loss": 0.72727466, + "learning_rate": 7.623512364234022e-07, + "loss": 0.80396211, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09844971, + "step": 11992, + "time_per_iteration": 2.5568339824676514 + }, + { + "auxiliary_loss_clip": 0.06410159, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06273486, + "balance_loss_mlp": 0.01252695, + "epoch": 0.7210581692469563, + "flos": 23483916881280.0, + "grad_norm": 1.4497931031993367, + "language_loss": 0.66405648, + "learning_rate": 7.620453263035755e-07, + "loss": 0.74078965, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 11993, + "time_per_iteration": 2.6186561584472656 + }, + { + "auxiliary_loss_clip": 0.06405848, + "auxiliary_loss_mlp": 0.01269619, + "balance_loss_clip": 0.06271709, + "balance_loss_mlp": 0.01259695, + "epoch": 0.7211182924996242, + "flos": 26106297788160.0, + "grad_norm": 1.8933872495895026, + "language_loss": 0.6622234, + "learning_rate": 7.61739463127115e-07, + "loss": 0.73897809, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.0993042, + "step": 11994, + "time_per_iteration": 3.895599126815796 + }, + { + "auxiliary_loss_clip": 0.06404895, + "auxiliary_loss_mlp": 0.01266355, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01255888, + "epoch": 0.7211784157522922, + "flos": 17717795763840.0, + "grad_norm": 1.9331486787733179, + "language_loss": 0.67162377, + "learning_rate": 7.614336469056172e-07, + "loss": 0.7483362, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10473633, + "step": 11995, + "time_per_iteration": 2.4796035289764404 + }, + { + "auxiliary_loss_clip": 0.06403686, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06274262, + "balance_loss_mlp": 0.01254721, + "epoch": 0.7212385390049602, + "flos": 24430173840000.0, + "grad_norm": 1.6348621026253527, + "language_loss": 0.7952925, + "learning_rate": 7.6112787765068e-07, + "loss": 0.87198234, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.10577393, + "step": 11996, + "time_per_iteration": 2.513824939727783 + }, + { + "auxiliary_loss_clip": 0.06409439, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06274767, + "balance_loss_mlp": 0.01255056, + "epoch": 0.7212986622576282, + "flos": 28154755353600.0, + "grad_norm": 3.3591238798386285, + "language_loss": 0.81663775, + "learning_rate": 7.60822155373899e-07, + "loss": 0.89338481, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 11997, + "time_per_iteration": 3.9435391426086426 + }, + { + "auxiliary_loss_clip": 0.06409244, + "auxiliary_loss_mlp": 0.01266354, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255363, + "epoch": 0.7213587855102961, + "flos": 21842313615360.0, + "grad_norm": 1.9166262285811178, + "language_loss": 0.67322028, + "learning_rate": 7.605164800868646e-07, + "loss": 0.74997622, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10992432, + "step": 11998, + "time_per_iteration": 2.496742010116577 + }, + { + "auxiliary_loss_clip": 0.06405417, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06271802, + "balance_loss_mlp": 0.01256777, + "epoch": 0.7214189087629641, + "flos": 14616877789440.0, + "grad_norm": 1.7752534320688365, + "language_loss": 0.72513527, + "learning_rate": 7.602108518011696e-07, + "loss": 0.80184972, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.0925293, + "step": 11999, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.0640653, + "auxiliary_loss_mlp": 0.01266506, + "balance_loss_clip": 0.06272668, + "balance_loss_mlp": 0.01256158, + "epoch": 0.721479032015632, + "flos": 19396938458880.0, + "grad_norm": 2.0883117148535937, + "language_loss": 0.83569586, + "learning_rate": 7.599052705284039e-07, + "loss": 0.91242623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10351562, + "step": 12000, + "time_per_iteration": 2.4941916465759277 + }, + { + "auxiliary_loss_clip": 0.06409671, + "auxiliary_loss_mlp": 0.01262001, + "balance_loss_clip": 0.06275104, + "balance_loss_mlp": 0.01251826, + "epoch": 0.7215391552683, + "flos": 18518423126400.0, + "grad_norm": 1.7464338798301249, + "language_loss": 0.77261817, + "learning_rate": 7.59599736280154e-07, + "loss": 0.8493349, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10174561, + "step": 12001, + "time_per_iteration": 2.4661076068878174 + }, + { + "auxiliary_loss_clip": 0.0640439, + "auxiliary_loss_mlp": 0.01267788, + "balance_loss_clip": 0.06274766, + "balance_loss_mlp": 0.01258323, + "epoch": 0.721599278520968, + "flos": 23265514414080.0, + "grad_norm": 2.52401774728115, + "language_loss": 0.81887865, + "learning_rate": 7.592942490680066e-07, + "loss": 0.89560032, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09454346, + "step": 12002, + "time_per_iteration": 2.5698509216308594 + }, + { + "auxiliary_loss_clip": 0.06409481, + "auxiliary_loss_mlp": 0.01264806, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01254363, + "epoch": 0.721659401773636, + "flos": 39207831615360.0, + "grad_norm": 2.1337554314771117, + "language_loss": 0.62387294, + "learning_rate": 7.589888089035462e-07, + "loss": 0.70061582, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10437012, + "step": 12003, + "time_per_iteration": 2.646667003631592 + }, + { + "auxiliary_loss_clip": 0.06408672, + "auxiliary_loss_mlp": 0.01269946, + "balance_loss_clip": 0.06271918, + "balance_loss_mlp": 0.01258639, + "epoch": 0.7217195250263039, + "flos": 14945299067520.0, + "grad_norm": 3.165928110898167, + "language_loss": 0.69158828, + "learning_rate": 7.586834157983544e-07, + "loss": 0.76837444, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11297607, + "step": 12004, + "time_per_iteration": 2.4904415607452393 + }, + { + "auxiliary_loss_clip": 0.06301466, + "auxiliary_loss_mlp": 0.0124999, + "balance_loss_clip": 0.06246269, + "balance_loss_mlp": 0.01249087, + "epoch": 0.7217796482789719, + "flos": 70889477973120.0, + "grad_norm": 0.8473059140767815, + "language_loss": 0.54124975, + "learning_rate": 7.583780697640112e-07, + "loss": 0.61676431, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00901794, + "step": 12005, + "time_per_iteration": 3.085909366607666 + }, + { + "auxiliary_loss_clip": 0.06406818, + "auxiliary_loss_mlp": 0.0126308, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7218397715316398, + "flos": 37460653804800.0, + "grad_norm": 1.5183383178903638, + "language_loss": 0.63201904, + "learning_rate": 7.580727708120962e-07, + "loss": 0.708718, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09588623, + "step": 12006, + "time_per_iteration": 2.7121994495391846 + }, + { + "auxiliary_loss_clip": 0.06407326, + "auxiliary_loss_mlp": 0.01263158, + "balance_loss_clip": 0.0627062, + "balance_loss_mlp": 0.0125352, + "epoch": 0.7218998947843078, + "flos": 22717223222400.0, + "grad_norm": 1.5926677831370504, + "language_loss": 0.92170072, + "learning_rate": 7.577675189541865e-07, + "loss": 0.99840552, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09643555, + "step": 12007, + "time_per_iteration": 2.534914016723633 + }, + { + "auxiliary_loss_clip": 0.06408784, + "auxiliary_loss_mlp": 0.01266152, + "balance_loss_clip": 0.06272783, + "balance_loss_mlp": 0.01255191, + "epoch": 0.7219600180369758, + "flos": 12172131538560.0, + "grad_norm": 1.6024431968555108, + "language_loss": 0.63807905, + "learning_rate": 7.574623142018568e-07, + "loss": 0.71482843, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10961914, + "step": 12008, + "time_per_iteration": 2.5015389919281006 + }, + { + "auxiliary_loss_clip": 0.0641045, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256144, + "epoch": 0.7220201412896438, + "flos": 22602340874880.0, + "grad_norm": 1.927754748237573, + "language_loss": 0.79281247, + "learning_rate": 7.57157156566681e-07, + "loss": 0.86958218, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1038208, + "step": 12009, + "time_per_iteration": 2.5008604526519775 + }, + { + "auxiliary_loss_clip": 0.06407045, + "auxiliary_loss_mlp": 0.01266982, + "balance_loss_clip": 0.06269218, + "balance_loss_mlp": 0.01255533, + "epoch": 0.7220802645423118, + "flos": 26724972009600.0, + "grad_norm": 2.605024867459915, + "language_loss": 0.6418041, + "learning_rate": 7.568520460602297e-07, + "loss": 0.71854436, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11450195, + "step": 12010, + "time_per_iteration": 2.527949571609497 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01266927, + "balance_loss_clip": 0.06270487, + "balance_loss_mlp": 0.01256854, + "epoch": 0.7221403877949797, + "flos": 24426568114560.0, + "grad_norm": 1.594533265957021, + "language_loss": 0.77320325, + "learning_rate": 7.565469826940742e-07, + "loss": 0.84991425, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10070801, + "step": 12011, + "time_per_iteration": 2.5198636054992676 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01263324, + "balance_loss_clip": 0.06273598, + "balance_loss_mlp": 0.0125368, + "epoch": 0.7222005110476477, + "flos": 23521246675200.0, + "grad_norm": 1.6737582547209497, + "language_loss": 0.79734701, + "learning_rate": 7.56241966479781e-07, + "loss": 0.87406272, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09637451, + "step": 12012, + "time_per_iteration": 2.5218822956085205 + }, + { + "auxiliary_loss_clip": 0.06409319, + "auxiliary_loss_mlp": 0.01264498, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254955, + "epoch": 0.7222606343003156, + "flos": 23119255912320.0, + "grad_norm": 2.6909809043391744, + "language_loss": 0.76237571, + "learning_rate": 7.559369974289171e-07, + "loss": 0.83911389, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09533691, + "step": 12013, + "time_per_iteration": 2.501549005508423 + }, + { + "auxiliary_loss_clip": 0.06401782, + "auxiliary_loss_mlp": 0.01266309, + "balance_loss_clip": 0.06270641, + "balance_loss_mlp": 0.01256456, + "epoch": 0.7223207575529836, + "flos": 24357778312320.0, + "grad_norm": 1.4242237370924462, + "language_loss": 0.76199239, + "learning_rate": 7.556320755530484e-07, + "loss": 0.83867329, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09851074, + "step": 12014, + "time_per_iteration": 2.6219167709350586 + }, + { + "auxiliary_loss_clip": 0.0640952, + "auxiliary_loss_mlp": 0.01262375, + "balance_loss_clip": 0.0627341, + "balance_loss_mlp": 0.01252445, + "epoch": 0.7223808808056515, + "flos": 28337798597760.0, + "grad_norm": 1.6715764427822655, + "language_loss": 0.86861187, + "learning_rate": 7.553272008637346e-07, + "loss": 0.9453308, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09924316, + "step": 12015, + "time_per_iteration": 2.5629379749298096 + }, + { + "auxiliary_loss_clip": 0.0640379, + "auxiliary_loss_mlp": 0.01267259, + "balance_loss_clip": 0.06271358, + "balance_loss_mlp": 0.01257365, + "epoch": 0.7224410040583196, + "flos": 21075829591680.0, + "grad_norm": 2.031854447065517, + "language_loss": 0.78420502, + "learning_rate": 7.55022373372538e-07, + "loss": 0.86091554, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09899902, + "step": 12016, + "time_per_iteration": 2.549696207046509 + }, + { + "auxiliary_loss_clip": 0.06403818, + "auxiliary_loss_mlp": 0.01265816, + "balance_loss_clip": 0.06270836, + "balance_loss_mlp": 0.01255839, + "epoch": 0.7225011273109875, + "flos": 26802398782080.0, + "grad_norm": 1.3727875388559247, + "language_loss": 0.77603066, + "learning_rate": 7.547175930910186e-07, + "loss": 0.85272694, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09979248, + "step": 12017, + "time_per_iteration": 2.5937881469726562 + }, + { + "auxiliary_loss_clip": 0.06402834, + "auxiliary_loss_mlp": 0.01265872, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7225612505636555, + "flos": 23589826842240.0, + "grad_norm": 1.6197156862149726, + "language_loss": 0.74198735, + "learning_rate": 7.54412860030732e-07, + "loss": 0.81867433, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09783936, + "step": 12018, + "time_per_iteration": 3.996819257736206 + }, + { + "auxiliary_loss_clip": 0.06402058, + "auxiliary_loss_mlp": 0.01270158, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01260812, + "epoch": 0.7226213738163234, + "flos": 20783983420800.0, + "grad_norm": 1.7233802894536456, + "language_loss": 0.77552009, + "learning_rate": 7.541081742032347e-07, + "loss": 0.85224223, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09344482, + "step": 12019, + "time_per_iteration": 2.52474308013916 + }, + { + "auxiliary_loss_clip": 0.0640556, + "auxiliary_loss_mlp": 0.01263394, + "balance_loss_clip": 0.06272571, + "balance_loss_mlp": 0.01253363, + "epoch": 0.7226814970689914, + "flos": 32644227663360.0, + "grad_norm": 1.6248881332172511, + "language_loss": 0.73835564, + "learning_rate": 7.53803535620081e-07, + "loss": 0.81504518, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10028076, + "step": 12020, + "time_per_iteration": 2.577397346496582 + }, + { + "auxiliary_loss_clip": 0.06409635, + "auxiliary_loss_mlp": 0.01262192, + "balance_loss_clip": 0.06272969, + "balance_loss_mlp": 0.01252054, + "epoch": 0.7227416203216595, + "flos": 22460736274560.0, + "grad_norm": 1.6075634360932833, + "language_loss": 0.77574962, + "learning_rate": 7.534989442928219e-07, + "loss": 0.85246789, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10137939, + "step": 12021, + "time_per_iteration": 2.530141592025757 + }, + { + "auxiliary_loss_clip": 0.06403421, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.01255267, + "epoch": 0.7228017435743274, + "flos": 21658641465600.0, + "grad_norm": 1.5420069016517286, + "language_loss": 0.68414694, + "learning_rate": 7.531944002330073e-07, + "loss": 0.76083142, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09765625, + "step": 12022, + "time_per_iteration": 2.504757881164551 + }, + { + "auxiliary_loss_clip": 0.06407183, + "auxiliary_loss_mlp": 0.01266298, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.0125613, + "epoch": 0.7228618668269954, + "flos": 29541171409920.0, + "grad_norm": 1.8382982507035688, + "language_loss": 0.69865435, + "learning_rate": 7.528899034521858e-07, + "loss": 0.77538919, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12023, + "time_per_iteration": 2.572157859802246 + }, + { + "auxiliary_loss_clip": 0.06405231, + "auxiliary_loss_mlp": 0.01262251, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01252356, + "epoch": 0.7229219900796633, + "flos": 27461169982080.0, + "grad_norm": 1.6264829845814306, + "language_loss": 0.71353316, + "learning_rate": 7.525854539619052e-07, + "loss": 0.79020798, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09887695, + "step": 12024, + "time_per_iteration": 2.548758029937744 + }, + { + "auxiliary_loss_clip": 0.06407243, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06272963, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7229821133323313, + "flos": 16294888454400.0, + "grad_norm": 2.8784491415688427, + "language_loss": 0.75972795, + "learning_rate": 7.522810517737089e-07, + "loss": 0.83645153, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10168457, + "step": 12025, + "time_per_iteration": 2.4729340076446533 + }, + { + "auxiliary_loss_clip": 0.06403269, + "auxiliary_loss_mlp": 0.01264783, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01255049, + "epoch": 0.7230422365849992, + "flos": 20418567765120.0, + "grad_norm": 1.900331951753324, + "language_loss": 0.76300782, + "learning_rate": 7.519766968991395e-07, + "loss": 0.83968836, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09741211, + "step": 12026, + "time_per_iteration": 2.4887609481811523 + }, + { + "auxiliary_loss_clip": 0.06407255, + "auxiliary_loss_mlp": 0.01263175, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.01253114, + "epoch": 0.7231023598376672, + "flos": 25600619197440.0, + "grad_norm": 1.727853118389861, + "language_loss": 0.67822838, + "learning_rate": 7.516723893497388e-07, + "loss": 0.75493264, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10064697, + "step": 12027, + "time_per_iteration": 2.5328831672668457 + }, + { + "auxiliary_loss_clip": 0.06409849, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 0.06273012, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7231624830903352, + "flos": 25155638490240.0, + "grad_norm": 20.233836516227683, + "language_loss": 0.79796958, + "learning_rate": 7.513681291370469e-07, + "loss": 0.87474453, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11260986, + "step": 12028, + "time_per_iteration": 2.5175299644470215 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01262813, + "balance_loss_clip": 0.06271036, + "balance_loss_mlp": 0.01252722, + "epoch": 0.7232226063430032, + "flos": 21732169023360.0, + "grad_norm": 1.6712799697819898, + "language_loss": 0.8266964, + "learning_rate": 7.510639162726e-07, + "loss": 0.90339005, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10089111, + "step": 12029, + "time_per_iteration": 3.9506967067718506 + }, + { + "auxiliary_loss_clip": 0.06311534, + "auxiliary_loss_mlp": 0.01251495, + "balance_loss_clip": 0.06256342, + "balance_loss_mlp": 0.01250514, + "epoch": 0.7232827295956711, + "flos": 68458693426560.0, + "grad_norm": 0.7790969864555375, + "language_loss": 0.6171549, + "learning_rate": 7.507597507679347e-07, + "loss": 0.6927852, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00980377, + "step": 12030, + "time_per_iteration": 3.187685489654541 + }, + { + "auxiliary_loss_clip": 0.06405394, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01255753, + "epoch": 0.7233428528483391, + "flos": 20198697851520.0, + "grad_norm": 1.6342080054038326, + "language_loss": 0.78514922, + "learning_rate": 7.504556326345859e-07, + "loss": 0.86186063, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09997559, + "step": 12031, + "time_per_iteration": 2.47151255607605 + }, + { + "auxiliary_loss_clip": 0.06411318, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.0627391, + "balance_loss_mlp": 0.01254465, + "epoch": 0.723402976101007, + "flos": 23955955257600.0, + "grad_norm": 1.8287937473952962, + "language_loss": 0.81728959, + "learning_rate": 7.501515618840834e-07, + "loss": 0.894054, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10656738, + "step": 12032, + "time_per_iteration": 2.5481441020965576 + }, + { + "auxiliary_loss_clip": 0.06416769, + "auxiliary_loss_mlp": 0.01265155, + "balance_loss_clip": 0.06275293, + "balance_loss_mlp": 0.01254485, + "epoch": 0.723463099353675, + "flos": 20819636133120.0, + "grad_norm": 1.8204115009796795, + "language_loss": 0.75397038, + "learning_rate": 7.498475385279592e-07, + "loss": 0.83078963, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10662842, + "step": 12033, + "time_per_iteration": 3.957021951675415 + }, + { + "auxiliary_loss_clip": 0.0640196, + "auxiliary_loss_mlp": 0.01261304, + "balance_loss_clip": 0.06271483, + "balance_loss_mlp": 0.01251874, + "epoch": 0.723523222606343, + "flos": 19103876403840.0, + "grad_norm": 1.563188843970664, + "language_loss": 0.75271815, + "learning_rate": 7.495435625777423e-07, + "loss": 0.82935083, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09423828, + "step": 12034, + "time_per_iteration": 2.479860782623291 + }, + { + "auxiliary_loss_clip": 0.0640718, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01252146, + "epoch": 0.723583345859011, + "flos": 26514493752960.0, + "grad_norm": 1.7350921748415202, + "language_loss": 0.80701005, + "learning_rate": 7.492396340449578e-07, + "loss": 0.88370025, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09680176, + "step": 12035, + "time_per_iteration": 2.559680700302124 + }, + { + "auxiliary_loss_clip": 0.06410785, + "auxiliary_loss_mlp": 0.01263828, + "balance_loss_clip": 0.06273998, + "balance_loss_mlp": 0.01253361, + "epoch": 0.723643469111679, + "flos": 16039323901440.0, + "grad_norm": 3.114522084917199, + "language_loss": 0.61466223, + "learning_rate": 7.489357529411326e-07, + "loss": 0.69140834, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10473633, + "step": 12036, + "time_per_iteration": 2.4680371284484863 + }, + { + "auxiliary_loss_clip": 0.06403697, + "auxiliary_loss_mlp": 0.01264009, + "balance_loss_clip": 0.06272744, + "balance_loss_mlp": 0.01254914, + "epoch": 0.7237035923643469, + "flos": 21952164718080.0, + "grad_norm": 1.4930749372643133, + "language_loss": 0.67717707, + "learning_rate": 7.486319192777883e-07, + "loss": 0.75385416, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09094238, + "step": 12037, + "time_per_iteration": 3.957728862762451 + }, + { + "auxiliary_loss_clip": 0.06406017, + "auxiliary_loss_mlp": 0.01265379, + "balance_loss_clip": 0.06273565, + "balance_loss_mlp": 0.01255091, + "epoch": 0.7237637156170149, + "flos": 23589281790720.0, + "grad_norm": 1.7134802369768287, + "language_loss": 0.73071694, + "learning_rate": 7.483281330664479e-07, + "loss": 0.80743086, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10296631, + "step": 12038, + "time_per_iteration": 2.5239899158477783 + }, + { + "auxiliary_loss_clip": 0.06408326, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06274582, + "balance_loss_mlp": 0.0125625, + "epoch": 0.7238238388696828, + "flos": 20600940176640.0, + "grad_norm": 1.583420390669157, + "language_loss": 0.72335035, + "learning_rate": 7.480243943186293e-07, + "loss": 0.80011058, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.11437988, + "step": 12039, + "time_per_iteration": 2.5016210079193115 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01262586, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.0125346, + "epoch": 0.7238839621223508, + "flos": 24213909651840.0, + "grad_norm": 1.553952761498081, + "language_loss": 0.7617048, + "learning_rate": 7.477207030458513e-07, + "loss": 0.83841777, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09130859, + "step": 12040, + "time_per_iteration": 2.4979355335235596 + }, + { + "auxiliary_loss_clip": 0.0640977, + "auxiliary_loss_mlp": 0.01263735, + "balance_loss_clip": 0.06273755, + "balance_loss_mlp": 0.01252898, + "epoch": 0.7239440853750188, + "flos": 14214928953600.0, + "grad_norm": 1.6058378864892022, + "language_loss": 0.77005613, + "learning_rate": 7.474170592596301e-07, + "loss": 0.84679121, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10845947, + "step": 12041, + "time_per_iteration": 2.519228458404541 + }, + { + "auxiliary_loss_clip": 0.06408431, + "auxiliary_loss_mlp": 0.01263027, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01253365, + "epoch": 0.7240042086276868, + "flos": 21620976255360.0, + "grad_norm": 1.9889626365674344, + "language_loss": 0.63348103, + "learning_rate": 7.471134629714797e-07, + "loss": 0.7101956, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09667969, + "step": 12042, + "time_per_iteration": 2.475182294845581 + }, + { + "auxiliary_loss_clip": 0.06410774, + "auxiliary_loss_mlp": 0.01268078, + "balance_loss_clip": 0.06275245, + "balance_loss_mlp": 0.012567, + "epoch": 0.7240643318803547, + "flos": 23338203431040.0, + "grad_norm": 1.8474585554645233, + "language_loss": 0.83173352, + "learning_rate": 7.468099141929116e-07, + "loss": 0.90852207, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.11376953, + "step": 12043, + "time_per_iteration": 2.5139901638031006 + }, + { + "auxiliary_loss_clip": 0.06409861, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06273165, + "balance_loss_mlp": 0.01256354, + "epoch": 0.7241244551330227, + "flos": 24031746875520.0, + "grad_norm": 2.293056245042729, + "language_loss": 0.64671153, + "learning_rate": 7.465064129354379e-07, + "loss": 0.72348469, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11102295, + "step": 12044, + "time_per_iteration": 2.499971866607666 + }, + { + "auxiliary_loss_clip": 0.06411785, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01254781, + "epoch": 0.7241845783856906, + "flos": 18735651636480.0, + "grad_norm": 1.9189721390747507, + "language_loss": 0.81796312, + "learning_rate": 7.462029592105658e-07, + "loss": 0.89473093, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10211182, + "step": 12045, + "time_per_iteration": 2.4791791439056396 + }, + { + "auxiliary_loss_clip": 0.06403655, + "auxiliary_loss_mlp": 0.0126726, + "balance_loss_clip": 0.06274088, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7242447016383586, + "flos": 19504483574400.0, + "grad_norm": 2.888520203836974, + "language_loss": 0.72249848, + "learning_rate": 7.458995530298034e-07, + "loss": 0.79920763, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.1050415, + "step": 12046, + "time_per_iteration": 2.4642648696899414 + }, + { + "auxiliary_loss_clip": 0.064097, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06273885, + "balance_loss_mlp": 0.01254396, + "epoch": 0.7243048248910267, + "flos": 22169980206720.0, + "grad_norm": 1.724287594820583, + "language_loss": 0.71379775, + "learning_rate": 7.455961944046553e-07, + "loss": 0.79054451, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10571289, + "step": 12047, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.06410667, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06274027, + "balance_loss_mlp": 0.01253673, + "epoch": 0.7243649481436946, + "flos": 27680159427840.0, + "grad_norm": 1.6409687158316038, + "language_loss": 0.70148283, + "learning_rate": 7.45292883346627e-07, + "loss": 0.77823687, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11065674, + "step": 12048, + "time_per_iteration": 2.537400007247925 + }, + { + "auxiliary_loss_clip": 0.06309511, + "auxiliary_loss_mlp": 0.01254196, + "balance_loss_clip": 0.06254156, + "balance_loss_mlp": 0.01253124, + "epoch": 0.7244250713963626, + "flos": 63263686538880.0, + "grad_norm": 0.8079275009265211, + "language_loss": 0.53702354, + "learning_rate": 7.449896198672168e-07, + "loss": 0.61266059, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01072693, + "step": 12049, + "time_per_iteration": 3.117490768432617 + }, + { + "auxiliary_loss_clip": 0.06415777, + "auxiliary_loss_mlp": 0.01264713, + "balance_loss_clip": 0.06273454, + "balance_loss_mlp": 0.01252971, + "epoch": 0.7244851946490305, + "flos": 17972815265280.0, + "grad_norm": 2.160877059772018, + "language_loss": 0.60396636, + "learning_rate": 7.446864039779258e-07, + "loss": 0.68077123, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11737061, + "step": 12050, + "time_per_iteration": 2.4579668045043945 + }, + { + "auxiliary_loss_clip": 0.06310994, + "auxiliary_loss_mlp": 0.01250921, + "balance_loss_clip": 0.06255537, + "balance_loss_mlp": 0.01249847, + "epoch": 0.7245453179016985, + "flos": 70964179488000.0, + "grad_norm": 0.6964887094333322, + "language_loss": 0.53128082, + "learning_rate": 7.443832356902528e-07, + "loss": 0.60689998, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01075745, + "step": 12051, + "time_per_iteration": 3.1524975299835205 + }, + { + "auxiliary_loss_clip": 0.06405707, + "auxiliary_loss_mlp": 0.01263012, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01253594, + "epoch": 0.7246054411543664, + "flos": 24574839114240.0, + "grad_norm": 1.4328858557340107, + "language_loss": 0.71919692, + "learning_rate": 7.440801150156927e-07, + "loss": 0.79588413, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09417725, + "step": 12052, + "time_per_iteration": 2.599375009536743 + }, + { + "auxiliary_loss_clip": 0.06409772, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06275947, + "balance_loss_mlp": 0.01255608, + "epoch": 0.7246655644070344, + "flos": 32345715093120.0, + "grad_norm": 1.7264545008228058, + "language_loss": 0.74337375, + "learning_rate": 7.437770419657415e-07, + "loss": 0.8201319, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10430908, + "step": 12053, + "time_per_iteration": 2.572556495666504 + }, + { + "auxiliary_loss_clip": 0.06411305, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06278073, + "balance_loss_mlp": 0.01254952, + "epoch": 0.7247256876597024, + "flos": 21879056430720.0, + "grad_norm": 2.130811806275834, + "language_loss": 0.78439468, + "learning_rate": 7.434740165518898e-07, + "loss": 0.86116385, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10650635, + "step": 12054, + "time_per_iteration": 2.594451427459717 + }, + { + "auxiliary_loss_clip": 0.0641022, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06276123, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7247858109123704, + "flos": 16218048660480.0, + "grad_norm": 2.4211075094396692, + "language_loss": 0.68897808, + "learning_rate": 7.431710387856301e-07, + "loss": 0.76571441, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10406494, + "step": 12055, + "time_per_iteration": 2.490989923477173 + }, + { + "auxiliary_loss_clip": 0.06406957, + "auxiliary_loss_mlp": 0.01264855, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7248459341650383, + "flos": 20857091708160.0, + "grad_norm": 1.6323335153205245, + "language_loss": 0.74211532, + "learning_rate": 7.428681086784496e-07, + "loss": 0.81883347, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09387207, + "step": 12056, + "time_per_iteration": 2.5162346363067627 + }, + { + "auxiliary_loss_clip": 0.06405525, + "auxiliary_loss_mlp": 0.01261212, + "balance_loss_clip": 0.06274804, + "balance_loss_mlp": 0.0125152, + "epoch": 0.7249060574177063, + "flos": 25928956621440.0, + "grad_norm": 1.8158169987002448, + "language_loss": 0.70777828, + "learning_rate": 7.425652262418368e-07, + "loss": 0.78444564, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09680176, + "step": 12057, + "time_per_iteration": 4.079265594482422 + }, + { + "auxiliary_loss_clip": 0.0641495, + "auxiliary_loss_mlp": 0.01269409, + "balance_loss_clip": 0.06275235, + "balance_loss_mlp": 0.01258704, + "epoch": 0.7249661806703742, + "flos": 17350912661760.0, + "grad_norm": 1.9388728601507708, + "language_loss": 0.62604892, + "learning_rate": 7.42262391487277e-07, + "loss": 0.70289254, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1071167, + "step": 12058, + "time_per_iteration": 2.567502737045288 + }, + { + "auxiliary_loss_clip": 0.06412682, + "auxiliary_loss_mlp": 0.01264257, + "balance_loss_clip": 0.06279195, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7250263039230422, + "flos": 19580400973440.0, + "grad_norm": 1.9516605705856642, + "language_loss": 0.75217509, + "learning_rate": 7.419596044262535e-07, + "loss": 0.82894444, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10290527, + "step": 12059, + "time_per_iteration": 2.4943277835845947 + }, + { + "auxiliary_loss_clip": 0.06405184, + "auxiliary_loss_mlp": 0.01262509, + "balance_loss_clip": 0.06274289, + "balance_loss_mlp": 0.01253282, + "epoch": 0.7250864271757103, + "flos": 21982366915200.0, + "grad_norm": 1.7883051719653056, + "language_loss": 0.79778695, + "learning_rate": 7.416568650702472e-07, + "loss": 0.87446392, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09222412, + "step": 12060, + "time_per_iteration": 2.519117593765259 + }, + { + "auxiliary_loss_clip": 0.06412885, + "auxiliary_loss_mlp": 0.01266886, + "balance_loss_clip": 0.06276695, + "balance_loss_mlp": 0.01256449, + "epoch": 0.7251465504283782, + "flos": 25020113310720.0, + "grad_norm": 1.8093299142299697, + "language_loss": 0.76421869, + "learning_rate": 7.413541734307393e-07, + "loss": 0.84101641, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10443115, + "step": 12061, + "time_per_iteration": 2.5503969192504883 + }, + { + "auxiliary_loss_clip": 0.06405508, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06275885, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7252066736810462, + "flos": 16695621406080.0, + "grad_norm": 1.6247315463998022, + "language_loss": 0.81481957, + "learning_rate": 7.410515295192068e-07, + "loss": 0.89151287, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10083008, + "step": 12062, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.06418011, + "auxiliary_loss_mlp": 0.01265854, + "balance_loss_clip": 0.06279325, + "balance_loss_mlp": 0.01255066, + "epoch": 0.7252667969337141, + "flos": 25710176810880.0, + "grad_norm": 2.2019312286273705, + "language_loss": 0.69337016, + "learning_rate": 7.407489333471262e-07, + "loss": 0.77020884, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10778809, + "step": 12063, + "time_per_iteration": 2.5213000774383545 + }, + { + "auxiliary_loss_clip": 0.06404665, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06275742, + "balance_loss_mlp": 0.01255186, + "epoch": 0.7253269201863821, + "flos": 18265835393280.0, + "grad_norm": 1.3337230483147808, + "language_loss": 0.70080262, + "learning_rate": 7.40446384925973e-07, + "loss": 0.77749866, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09759521, + "step": 12064, + "time_per_iteration": 2.4883687496185303 + }, + { + "auxiliary_loss_clip": 0.06412718, + "auxiliary_loss_mlp": 0.01263925, + "balance_loss_clip": 0.06279429, + "balance_loss_mlp": 0.01253846, + "epoch": 0.72538704343905, + "flos": 20417938859520.0, + "grad_norm": 1.6031100014197759, + "language_loss": 0.90715456, + "learning_rate": 7.401438842672192e-07, + "loss": 0.98392093, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10076904, + "step": 12065, + "time_per_iteration": 2.6608688831329346 + }, + { + "auxiliary_loss_clip": 0.06315897, + "auxiliary_loss_mlp": 0.01252262, + "balance_loss_clip": 0.0626056, + "balance_loss_mlp": 0.01251238, + "epoch": 0.725447166691718, + "flos": 70173321125760.0, + "grad_norm": 0.6440962314349006, + "language_loss": 0.56150329, + "learning_rate": 7.398414313823349e-07, + "loss": 0.63718486, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12066, + "time_per_iteration": 3.253070592880249 + }, + { + "auxiliary_loss_clip": 0.064081, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06276315, + "balance_loss_mlp": 0.01257029, + "epoch": 0.725507289944386, + "flos": 27059598489600.0, + "grad_norm": 1.6969511416209166, + "language_loss": 0.76925343, + "learning_rate": 7.395390262827897e-07, + "loss": 0.84600002, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09533691, + "step": 12067, + "time_per_iteration": 2.553955554962158 + }, + { + "auxiliary_loss_clip": 0.0632008, + "auxiliary_loss_mlp": 0.01251739, + "balance_loss_clip": 0.06264634, + "balance_loss_mlp": 0.01250711, + "epoch": 0.725567413197054, + "flos": 62941973587200.0, + "grad_norm": 0.7126407397816765, + "language_loss": 0.56957459, + "learning_rate": 7.392366689800515e-07, + "loss": 0.64529276, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01028442, + "step": 12068, + "time_per_iteration": 3.020040512084961 + }, + { + "auxiliary_loss_clip": 0.06320577, + "auxiliary_loss_mlp": 0.01251119, + "balance_loss_clip": 0.0626526, + "balance_loss_mlp": 0.01250047, + "epoch": 0.7256275364497219, + "flos": 60315735392640.0, + "grad_norm": 0.6491964300681237, + "language_loss": 0.55317146, + "learning_rate": 7.389343594855848e-07, + "loss": 0.62888843, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01074219, + "step": 12069, + "time_per_iteration": 4.627661228179932 + }, + { + "auxiliary_loss_clip": 0.0640723, + "auxiliary_loss_mlp": 0.01261481, + "balance_loss_clip": 0.06277817, + "balance_loss_mlp": 0.01252726, + "epoch": 0.7256876597023899, + "flos": 24505378479360.0, + "grad_norm": 2.803632714871867, + "language_loss": 0.80079329, + "learning_rate": 7.38632097810854e-07, + "loss": 0.87748045, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08770752, + "step": 12070, + "time_per_iteration": 2.5643179416656494 + }, + { + "auxiliary_loss_clip": 0.06405459, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06277329, + "balance_loss_mlp": 0.01252867, + "epoch": 0.7257477829550578, + "flos": 24359623102080.0, + "grad_norm": 1.9027271039299547, + "language_loss": 0.72591138, + "learning_rate": 7.383298839673197e-07, + "loss": 0.80259442, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09979248, + "step": 12071, + "time_per_iteration": 2.527245283126831 + }, + { + "auxiliary_loss_clip": 0.06408995, + "auxiliary_loss_mlp": 0.01268506, + "balance_loss_clip": 0.06277612, + "balance_loss_mlp": 0.01258379, + "epoch": 0.7258079062077258, + "flos": 17208008323200.0, + "grad_norm": 1.784714322475179, + "language_loss": 0.70686817, + "learning_rate": 7.380277179664436e-07, + "loss": 0.78364313, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10113525, + "step": 12072, + "time_per_iteration": 3.9422738552093506 + }, + { + "auxiliary_loss_clip": 0.06411255, + "auxiliary_loss_mlp": 0.01265945, + "balance_loss_clip": 0.06273982, + "balance_loss_mlp": 0.01255264, + "epoch": 0.7258680294603939, + "flos": 21586832916480.0, + "grad_norm": 1.7307594033578553, + "language_loss": 0.79001957, + "learning_rate": 7.377255998196821e-07, + "loss": 0.86679161, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10675049, + "step": 12073, + "time_per_iteration": 2.5204336643218994 + }, + { + "auxiliary_loss_clip": 0.06408107, + "auxiliary_loss_mlp": 0.01262862, + "balance_loss_clip": 0.06276815, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7259281527130618, + "flos": 34863150360960.0, + "grad_norm": 1.4580787781655038, + "language_loss": 0.7035231, + "learning_rate": 7.374235295384923e-07, + "loss": 0.78023279, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09942627, + "step": 12074, + "time_per_iteration": 2.6230850219726562 + }, + { + "auxiliary_loss_clip": 0.06411288, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06275726, + "balance_loss_mlp": 0.01255342, + "epoch": 0.7259882759657298, + "flos": 25410657991680.0, + "grad_norm": 2.2056247097324193, + "language_loss": 0.74623215, + "learning_rate": 7.371215071343302e-07, + "loss": 0.82299727, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09875488, + "step": 12075, + "time_per_iteration": 2.556225538253784 + }, + { + "auxiliary_loss_clip": 0.06410095, + "auxiliary_loss_mlp": 0.01264907, + "balance_loss_clip": 0.06275606, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7260483992183977, + "flos": 62966781924480.0, + "grad_norm": 1.5598815820341405, + "language_loss": 0.64038914, + "learning_rate": 7.368195326186458e-07, + "loss": 0.71713918, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10803223, + "step": 12076, + "time_per_iteration": 4.355054616928101 + }, + { + "auxiliary_loss_clip": 0.064101, + "auxiliary_loss_mlp": 0.01263502, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7261085224710657, + "flos": 26474522555520.0, + "grad_norm": 1.8575056289170144, + "language_loss": 0.7908951, + "learning_rate": 7.365176060028912e-07, + "loss": 0.86763114, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09796143, + "step": 12077, + "time_per_iteration": 2.5509204864501953 + }, + { + "auxiliary_loss_clip": 0.06314351, + "auxiliary_loss_mlp": 0.01251566, + "balance_loss_clip": 0.06259085, + "balance_loss_mlp": 0.01250447, + "epoch": 0.7261686457237336, + "flos": 66790634198400.0, + "grad_norm": 0.8642282673020346, + "language_loss": 0.64994717, + "learning_rate": 7.362157272985163e-07, + "loss": 0.72560632, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01121521, + "step": 12078, + "time_per_iteration": 3.138261556625366 + }, + { + "auxiliary_loss_clip": 0.06315269, + "auxiliary_loss_mlp": 0.0125259, + "balance_loss_clip": 0.06259946, + "balance_loss_mlp": 0.01251419, + "epoch": 0.7262287689764017, + "flos": 70020731640960.0, + "grad_norm": 0.7225013247461266, + "language_loss": 0.59434861, + "learning_rate": 7.359138965169671e-07, + "loss": 0.67002714, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0116806, + "step": 12079, + "time_per_iteration": 3.2418954372406006 + }, + { + "auxiliary_loss_clip": 0.06405665, + "auxiliary_loss_mlp": 0.01266491, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01256495, + "epoch": 0.7262888922290696, + "flos": 23812212378240.0, + "grad_norm": 1.9020587797469353, + "language_loss": 0.64648104, + "learning_rate": 7.356121136696895e-07, + "loss": 0.72320265, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09997559, + "step": 12080, + "time_per_iteration": 2.559204339981079 + }, + { + "auxiliary_loss_clip": 0.06412919, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06278147, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7263490154817376, + "flos": 19506412218240.0, + "grad_norm": 2.774312810040863, + "language_loss": 0.70093364, + "learning_rate": 7.35310378768128e-07, + "loss": 0.77771568, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10644531, + "step": 12081, + "time_per_iteration": 2.4881443977355957 + }, + { + "auxiliary_loss_clip": 0.06414886, + "auxiliary_loss_mlp": 0.01264794, + "balance_loss_clip": 0.06277792, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7264091387344055, + "flos": 16291240801920.0, + "grad_norm": 1.7064307786891335, + "language_loss": 0.81121981, + "learning_rate": 7.350086918237237e-07, + "loss": 0.88801658, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09655762, + "step": 12082, + "time_per_iteration": 2.51804256439209 + }, + { + "auxiliary_loss_clip": 0.06418996, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_clip": 0.06277427, + "balance_loss_mlp": 0.01252474, + "epoch": 0.7264692619870735, + "flos": 24358784561280.0, + "grad_norm": 2.224005114416304, + "language_loss": 0.77144599, + "learning_rate": 7.347070528479158e-07, + "loss": 0.84827775, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.11706543, + "step": 12083, + "time_per_iteration": 2.5199551582336426 + }, + { + "auxiliary_loss_clip": 0.06416926, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06278973, + "balance_loss_mlp": 0.01255441, + "epoch": 0.7265293852397414, + "flos": 25126568323200.0, + "grad_norm": 1.6593932119603014, + "language_loss": 0.72771877, + "learning_rate": 7.344054618521433e-07, + "loss": 0.80454749, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10498047, + "step": 12084, + "time_per_iteration": 2.5542185306549072 + }, + { + "auxiliary_loss_clip": 0.06412492, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06276167, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7265895084924094, + "flos": 22644869621760.0, + "grad_norm": 1.8149106211320094, + "language_loss": 0.78171599, + "learning_rate": 7.34103918847843e-07, + "loss": 0.85848927, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10656738, + "step": 12085, + "time_per_iteration": 2.5213918685913086 + }, + { + "auxiliary_loss_clip": 0.06410021, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06274905, + "balance_loss_mlp": 0.0125473, + "epoch": 0.7266496317450775, + "flos": 23375030100480.0, + "grad_norm": 1.688683771457735, + "language_loss": 0.7278198, + "learning_rate": 7.338024238464493e-07, + "loss": 0.80456126, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09405518, + "step": 12086, + "time_per_iteration": 2.5169167518615723 + }, + { + "auxiliary_loss_clip": 0.06407881, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255123, + "epoch": 0.7267097549977454, + "flos": 28082150190720.0, + "grad_norm": 1.7618222753787933, + "language_loss": 0.69773293, + "learning_rate": 7.335009768593938e-07, + "loss": 0.77446526, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10247803, + "step": 12087, + "time_per_iteration": 2.552579641342163 + }, + { + "auxiliary_loss_clip": 0.06413816, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06276657, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7267698782504134, + "flos": 22201272506880.0, + "grad_norm": 1.8690535814436378, + "language_loss": 0.79212523, + "learning_rate": 7.331995778981088e-07, + "loss": 0.86891758, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.11230469, + "step": 12088, + "time_per_iteration": 2.5224051475524902 + }, + { + "auxiliary_loss_clip": 0.06411967, + "auxiliary_loss_mlp": 0.01267729, + "balance_loss_clip": 0.06275503, + "balance_loss_mlp": 0.01257561, + "epoch": 0.7268300015030813, + "flos": 18520729113600.0, + "grad_norm": 2.081138271531092, + "language_loss": 0.74134862, + "learning_rate": 7.328982269740221e-07, + "loss": 0.81814551, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10168457, + "step": 12089, + "time_per_iteration": 2.4536690711975098 + }, + { + "auxiliary_loss_clip": 0.06410675, + "auxiliary_loss_mlp": 0.01266044, + "balance_loss_clip": 0.06273594, + "balance_loss_mlp": 0.0125606, + "epoch": 0.7268901247557493, + "flos": 23992530364800.0, + "grad_norm": 1.672566959006191, + "language_loss": 0.71264297, + "learning_rate": 7.325969240985616e-07, + "loss": 0.78941011, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09979248, + "step": 12090, + "time_per_iteration": 2.518209457397461 + }, + { + "auxiliary_loss_clip": 0.06411642, + "auxiliary_loss_mlp": 0.01265075, + "balance_loss_clip": 0.06275435, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7269502480084172, + "flos": 32096313815040.0, + "grad_norm": 1.7636278155243394, + "language_loss": 0.774212, + "learning_rate": 7.322956692831528e-07, + "loss": 0.85097921, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10864258, + "step": 12091, + "time_per_iteration": 2.5809051990509033 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01262324, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01251947, + "epoch": 0.7270103712610853, + "flos": 19068852597120.0, + "grad_norm": 1.7821213244340646, + "language_loss": 0.71747637, + "learning_rate": 7.319944625392205e-07, + "loss": 0.79417133, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10375977, + "step": 12092, + "time_per_iteration": 2.5037333965301514 + }, + { + "auxiliary_loss_clip": 0.06409185, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.0627584, + "balance_loss_mlp": 0.01252718, + "epoch": 0.7270704945137532, + "flos": 34541605117440.0, + "grad_norm": 1.8451884643439012, + "language_loss": 0.61625177, + "learning_rate": 7.31693303878184e-07, + "loss": 0.69297278, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10198975, + "step": 12093, + "time_per_iteration": 2.6145272254943848 + }, + { + "auxiliary_loss_clip": 0.06407997, + "auxiliary_loss_mlp": 0.0127204, + "balance_loss_clip": 0.06275733, + "balance_loss_mlp": 0.01261461, + "epoch": 0.7271306177664212, + "flos": 21514101972480.0, + "grad_norm": 1.4518547441748084, + "language_loss": 0.7566582, + "learning_rate": 7.313921933114644e-07, + "loss": 0.83345854, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10583496, + "step": 12094, + "time_per_iteration": 2.5348317623138428 + }, + { + "auxiliary_loss_clip": 0.06402551, + "auxiliary_loss_mlp": 0.01268346, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01258976, + "epoch": 0.7271907410190891, + "flos": 22278866987520.0, + "grad_norm": 1.9666023712862966, + "language_loss": 0.84875292, + "learning_rate": 7.310911308504808e-07, + "loss": 0.92546189, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09375, + "step": 12095, + "time_per_iteration": 2.4921047687530518 + }, + { + "auxiliary_loss_clip": 0.06408881, + "auxiliary_loss_mlp": 0.01266756, + "balance_loss_clip": 0.06273626, + "balance_loss_mlp": 0.01256319, + "epoch": 0.7272508642717571, + "flos": 22899721415040.0, + "grad_norm": 1.6073112969743308, + "language_loss": 0.77431858, + "learning_rate": 7.307901165066479e-07, + "loss": 0.85107493, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10437012, + "step": 12096, + "time_per_iteration": 2.5228958129882812 + }, + { + "auxiliary_loss_clip": 0.06409237, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06274968, + "balance_loss_mlp": 0.01254852, + "epoch": 0.727310987524425, + "flos": 11660667016320.0, + "grad_norm": 1.766744410162751, + "language_loss": 0.72485346, + "learning_rate": 7.30489150291381e-07, + "loss": 0.80159533, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10095215, + "step": 12097, + "time_per_iteration": 3.9472336769104004 + }, + { + "auxiliary_loss_clip": 0.06410161, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 0.06275506, + "balance_loss_mlp": 0.01253111, + "epoch": 0.727371110777093, + "flos": 24542247075840.0, + "grad_norm": 1.6914945832849257, + "language_loss": 0.76620024, + "learning_rate": 7.301882322160935e-07, + "loss": 0.84293687, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10400391, + "step": 12098, + "time_per_iteration": 2.5401840209960938 + }, + { + "auxiliary_loss_clip": 0.06412796, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06274997, + "balance_loss_mlp": 0.01256982, + "epoch": 0.7274312340297611, + "flos": 74755175690880.0, + "grad_norm": 1.647144818498915, + "language_loss": 0.67571467, + "learning_rate": 7.298873622921952e-07, + "loss": 0.75252008, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10766602, + "step": 12099, + "time_per_iteration": 2.933919668197632 + }, + { + "auxiliary_loss_clip": 0.06414318, + "auxiliary_loss_mlp": 0.01268861, + "balance_loss_clip": 0.06274534, + "balance_loss_mlp": 0.0125731, + "epoch": 0.727491357282429, + "flos": 22348872673920.0, + "grad_norm": 1.593136067800256, + "language_loss": 0.72549355, + "learning_rate": 7.29586540531095e-07, + "loss": 0.80232537, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11560059, + "step": 12100, + "time_per_iteration": 2.485959053039551 + }, + { + "auxiliary_loss_clip": 0.06406155, + "auxiliary_loss_mlp": 0.01265862, + "balance_loss_clip": 0.06273396, + "balance_loss_mlp": 0.01256778, + "epoch": 0.727551480535097, + "flos": 23304730924800.0, + "grad_norm": 1.4119889543918884, + "language_loss": 0.75127757, + "learning_rate": 7.292857669442005e-07, + "loss": 0.82799774, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09088135, + "step": 12101, + "time_per_iteration": 2.610421895980835 + }, + { + "auxiliary_loss_clip": 0.06405263, + "auxiliary_loss_mlp": 0.01263956, + "balance_loss_clip": 0.06274393, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7276116037877649, + "flos": 21476981813760.0, + "grad_norm": 1.6630445155880014, + "language_loss": 0.82583451, + "learning_rate": 7.289850415429177e-07, + "loss": 0.90252674, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09295654, + "step": 12102, + "time_per_iteration": 2.5227344036102295 + }, + { + "auxiliary_loss_clip": 0.06406877, + "auxiliary_loss_mlp": 0.01266073, + "balance_loss_clip": 0.06273448, + "balance_loss_mlp": 0.012565, + "epoch": 0.7276717270404329, + "flos": 21469393019520.0, + "grad_norm": 2.031204621507473, + "language_loss": 0.81889427, + "learning_rate": 7.286843643386495e-07, + "loss": 0.89562374, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09570312, + "step": 12103, + "time_per_iteration": 2.4974191188812256 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06276208, + "balance_loss_mlp": 0.01253818, + "epoch": 0.7277318502931008, + "flos": 16842928083840.0, + "grad_norm": 1.574176499871837, + "language_loss": 0.66993153, + "learning_rate": 7.283837353427968e-07, + "loss": 0.74667573, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10351562, + "step": 12104, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.06406664, + "auxiliary_loss_mlp": 0.01268067, + "balance_loss_clip": 0.06276865, + "balance_loss_mlp": 0.01257815, + "epoch": 0.7277919735457689, + "flos": 33408824970240.0, + "grad_norm": 1.70221768283368, + "language_loss": 0.65823901, + "learning_rate": 7.280831545667611e-07, + "loss": 0.73498631, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1026001, + "step": 12105, + "time_per_iteration": 2.6353166103363037 + }, + { + "auxiliary_loss_clip": 0.06408508, + "auxiliary_loss_mlp": 0.01267339, + "balance_loss_clip": 0.06276379, + "balance_loss_mlp": 0.01257599, + "epoch": 0.7278520967984368, + "flos": 19212218133120.0, + "grad_norm": 2.1199426403905197, + "language_loss": 0.75508106, + "learning_rate": 7.27782622021939e-07, + "loss": 0.83183956, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09741211, + "step": 12106, + "time_per_iteration": 2.46575665473938 + }, + { + "auxiliary_loss_clip": 0.06411369, + "auxiliary_loss_mlp": 0.01266618, + "balance_loss_clip": 0.06273164, + "balance_loss_mlp": 0.01255228, + "epoch": 0.7279122200511048, + "flos": 34103206955520.0, + "grad_norm": 1.806710660650235, + "language_loss": 0.70616901, + "learning_rate": 7.274821377197273e-07, + "loss": 0.78294891, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11395264, + "step": 12107, + "time_per_iteration": 2.6280477046966553 + }, + { + "auxiliary_loss_clip": 0.06407417, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06274886, + "balance_loss_mlp": 0.01253885, + "epoch": 0.7279723433037727, + "flos": 54610913865600.0, + "grad_norm": 1.4427675680101948, + "language_loss": 0.75342691, + "learning_rate": 7.271817016715205e-07, + "loss": 0.83013523, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09515381, + "step": 12108, + "time_per_iteration": 4.324532985687256 + }, + { + "auxiliary_loss_clip": 0.0640891, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01255809, + "epoch": 0.7280324665564407, + "flos": 36146297859840.0, + "grad_norm": 1.5700716356881925, + "language_loss": 0.67018294, + "learning_rate": 7.268813138887124e-07, + "loss": 0.74693048, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1003418, + "step": 12109, + "time_per_iteration": 2.615412473678589 + }, + { + "auxiliary_loss_clip": 0.06406409, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01256218, + "epoch": 0.7280925898091086, + "flos": 11623169514240.0, + "grad_norm": 7.186110502128194, + "language_loss": 0.63434047, + "learning_rate": 7.265809743826912e-07, + "loss": 0.71108198, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.11517334, + "step": 12110, + "time_per_iteration": 2.4591712951660156 + }, + { + "auxiliary_loss_clip": 0.06409231, + "auxiliary_loss_mlp": 0.01266788, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01256184, + "epoch": 0.7281527130617766, + "flos": 34285663221120.0, + "grad_norm": 1.770442169865723, + "language_loss": 0.5852263, + "learning_rate": 7.26280683164847e-07, + "loss": 0.66198647, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10595703, + "step": 12111, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.06411764, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.0627564, + "balance_loss_mlp": 0.01254744, + "epoch": 0.7282128363144446, + "flos": 13923208563840.0, + "grad_norm": 2.24560382762785, + "language_loss": 0.74143445, + "learning_rate": 7.259804402465677e-07, + "loss": 0.81820381, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10430908, + "step": 12112, + "time_per_iteration": 3.927354335784912 + }, + { + "auxiliary_loss_clip": 0.0640655, + "auxiliary_loss_mlp": 0.01266322, + "balance_loss_clip": 0.06273867, + "balance_loss_mlp": 0.01256767, + "epoch": 0.7282729595671126, + "flos": 20783983420800.0, + "grad_norm": 2.386616636448106, + "language_loss": 0.66917908, + "learning_rate": 7.25680245639237e-07, + "loss": 0.74590778, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09552002, + "step": 12113, + "time_per_iteration": 2.501143455505371 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01264241, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254311, + "epoch": 0.7283330828197806, + "flos": 16330876583040.0, + "grad_norm": 1.6899344961685594, + "language_loss": 0.73054916, + "learning_rate": 7.253800993542399e-07, + "loss": 0.80725861, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.0993042, + "step": 12114, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.06404929, + "auxiliary_loss_mlp": 0.01265418, + "balance_loss_clip": 0.06272236, + "balance_loss_mlp": 0.01255429, + "epoch": 0.7283932060724485, + "flos": 27497535454080.0, + "grad_norm": 1.7662061899425427, + "language_loss": 0.68715543, + "learning_rate": 7.250800014029564e-07, + "loss": 0.76385891, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09985352, + "step": 12115, + "time_per_iteration": 2.557182788848877 + }, + { + "auxiliary_loss_clip": 0.06409318, + "auxiliary_loss_mlp": 0.01265218, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7284533293251165, + "flos": 18373548216960.0, + "grad_norm": 1.8492705823258373, + "language_loss": 0.60310125, + "learning_rate": 7.247799517967674e-07, + "loss": 0.67984653, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10406494, + "step": 12116, + "time_per_iteration": 3.906881093978882 + }, + { + "auxiliary_loss_clip": 0.06408231, + "auxiliary_loss_mlp": 0.01266827, + "balance_loss_clip": 0.06275375, + "balance_loss_mlp": 0.01256766, + "epoch": 0.7285134525777844, + "flos": 21731917461120.0, + "grad_norm": 1.7320251042844839, + "language_loss": 0.72842097, + "learning_rate": 7.2447995054705e-07, + "loss": 0.80517155, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10058594, + "step": 12117, + "time_per_iteration": 2.522825002670288 + }, + { + "auxiliary_loss_clip": 0.06408626, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 0.06274951, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7285735758304525, + "flos": 20747743729920.0, + "grad_norm": 1.8305634695552309, + "language_loss": 0.69773346, + "learning_rate": 7.241799976651807e-07, + "loss": 0.77447206, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10565186, + "step": 12118, + "time_per_iteration": 2.48207426071167 + }, + { + "auxiliary_loss_clip": 0.06402861, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 0.06275323, + "balance_loss_mlp": 0.01257714, + "epoch": 0.7286336990831204, + "flos": 17316643541760.0, + "grad_norm": 1.7593601335155638, + "language_loss": 0.84603906, + "learning_rate": 7.238800931625346e-07, + "loss": 0.92274088, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0960083, + "step": 12119, + "time_per_iteration": 2.6029109954833984 + }, + { + "auxiliary_loss_clip": 0.0640807, + "auxiliary_loss_mlp": 0.01265759, + "balance_loss_clip": 0.06272867, + "balance_loss_mlp": 0.01255454, + "epoch": 0.7286938223357884, + "flos": 19792724019840.0, + "grad_norm": 1.9939013522780928, + "language_loss": 0.82186806, + "learning_rate": 7.235802370504831e-07, + "loss": 0.89860642, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10308838, + "step": 12120, + "time_per_iteration": 2.4777402877807617 + }, + { + "auxiliary_loss_clip": 0.06409417, + "auxiliary_loss_mlp": 0.01265212, + "balance_loss_clip": 0.06275336, + "balance_loss_mlp": 0.0125496, + "epoch": 0.7287539455884563, + "flos": 15346241654400.0, + "grad_norm": 1.8086433157736466, + "language_loss": 0.7907117, + "learning_rate": 7.232804293403963e-07, + "loss": 0.86745799, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 12121, + "time_per_iteration": 2.493319511413574 + }, + { + "auxiliary_loss_clip": 0.06409892, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.06270927, + "balance_loss_mlp": 0.01255693, + "epoch": 0.7288140688411243, + "flos": 25199592756480.0, + "grad_norm": 1.5783623622806526, + "language_loss": 0.69521451, + "learning_rate": 7.229806700436441e-07, + "loss": 0.77197587, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10559082, + "step": 12122, + "time_per_iteration": 2.524064064025879 + }, + { + "auxiliary_loss_clip": 0.06402311, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06270998, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7288741920937922, + "flos": 23990350158720.0, + "grad_norm": 1.7454149846167522, + "language_loss": 0.87436593, + "learning_rate": 7.226809591715923e-07, + "loss": 0.95102781, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09283447, + "step": 12123, + "time_per_iteration": 2.542051315307617 + }, + { + "auxiliary_loss_clip": 0.06402463, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.01255094, + "epoch": 0.7289343153464602, + "flos": 22751114999040.0, + "grad_norm": 1.6465558507133775, + "language_loss": 0.8315962, + "learning_rate": 7.223812967356065e-07, + "loss": 0.90827358, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10186768, + "step": 12124, + "time_per_iteration": 2.493330955505371 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266287, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01256173, + "epoch": 0.7289944385991282, + "flos": 24906991898880.0, + "grad_norm": 1.5973594077423074, + "language_loss": 0.66998374, + "learning_rate": 7.220816827470499e-07, + "loss": 0.74670422, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10113525, + "step": 12125, + "time_per_iteration": 2.5571157932281494 + }, + { + "auxiliary_loss_clip": 0.06410982, + "auxiliary_loss_mlp": 0.01267293, + "balance_loss_clip": 0.06272356, + "balance_loss_mlp": 0.01255521, + "epoch": 0.7290545618517962, + "flos": 22973835951360.0, + "grad_norm": 1.7735347741305036, + "language_loss": 0.75574493, + "learning_rate": 7.217821172172855e-07, + "loss": 0.83252764, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11773682, + "step": 12126, + "time_per_iteration": 2.4986443519592285 + }, + { + "auxiliary_loss_clip": 0.0631386, + "auxiliary_loss_mlp": 0.01254001, + "balance_loss_clip": 0.06258902, + "balance_loss_mlp": 0.01252942, + "epoch": 0.7291146851044642, + "flos": 61921602092160.0, + "grad_norm": 0.8043212871024376, + "language_loss": 0.58652955, + "learning_rate": 7.2148260015767e-07, + "loss": 0.66220808, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01060486, + "step": 12127, + "time_per_iteration": 3.065887689590454 + }, + { + "auxiliary_loss_clip": 0.06406868, + "auxiliary_loss_mlp": 0.01268134, + "balance_loss_clip": 0.06276388, + "balance_loss_mlp": 0.01259032, + "epoch": 0.7291748083571321, + "flos": 23337616452480.0, + "grad_norm": 2.002154348717822, + "language_loss": 0.68532437, + "learning_rate": 7.21183131579562e-07, + "loss": 0.76207435, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12128, + "time_per_iteration": 2.5636982917785645 + }, + { + "auxiliary_loss_clip": 0.06407112, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06272791, + "balance_loss_mlp": 0.01255493, + "epoch": 0.7292349316098001, + "flos": 28337588962560.0, + "grad_norm": 1.9770234243530824, + "language_loss": 0.65893352, + "learning_rate": 7.20883711494319e-07, + "loss": 0.73566437, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10479736, + "step": 12129, + "time_per_iteration": 2.5952858924865723 + }, + { + "auxiliary_loss_clip": 0.06401228, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06271209, + "balance_loss_mlp": 0.01255878, + "epoch": 0.729295054862468, + "flos": 24138788866560.0, + "grad_norm": 2.8834397381641206, + "language_loss": 0.74323857, + "learning_rate": 7.205843399132927e-07, + "loss": 0.81991053, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10076904, + "step": 12130, + "time_per_iteration": 2.5151498317718506 + }, + { + "auxiliary_loss_clip": 0.06408465, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273751, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7293551781151361, + "flos": 22822168861440.0, + "grad_norm": 1.7601185133573507, + "language_loss": 0.69902027, + "learning_rate": 7.202850168478374e-07, + "loss": 0.77576661, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 12131, + "time_per_iteration": 2.5700907707214355 + }, + { + "auxiliary_loss_clip": 0.06405198, + "auxiliary_loss_mlp": 0.0126315, + "balance_loss_clip": 0.06273468, + "balance_loss_mlp": 0.01253238, + "epoch": 0.729415301367804, + "flos": 22133111610240.0, + "grad_norm": 1.4321727616978588, + "language_loss": 0.77646959, + "learning_rate": 7.199857423093025e-07, + "loss": 0.85315311, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09912109, + "step": 12132, + "time_per_iteration": 2.5047810077667236 + }, + { + "auxiliary_loss_clip": 0.06406032, + "auxiliary_loss_mlp": 0.01268163, + "balance_loss_clip": 0.06274553, + "balance_loss_mlp": 0.01258382, + "epoch": 0.729475424620472, + "flos": 12354587804160.0, + "grad_norm": 2.26553261567321, + "language_loss": 0.79865611, + "learning_rate": 7.196865163090358e-07, + "loss": 0.87539804, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09790039, + "step": 12133, + "time_per_iteration": 2.5156800746917725 + }, + { + "auxiliary_loss_clip": 0.06405626, + "auxiliary_loss_mlp": 0.01262377, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01252555, + "epoch": 0.7295355478731399, + "flos": 22201020944640.0, + "grad_norm": 2.1172065702021228, + "language_loss": 0.72792143, + "learning_rate": 7.193873388583846e-07, + "loss": 0.80460143, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09832764, + "step": 12134, + "time_per_iteration": 2.493656873703003 + }, + { + "auxiliary_loss_clip": 0.06407951, + "auxiliary_loss_mlp": 0.01266233, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7295956711258079, + "flos": 23228771598720.0, + "grad_norm": 1.8016892870366705, + "language_loss": 0.7149846, + "learning_rate": 7.190882099686939e-07, + "loss": 0.79172647, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10424805, + "step": 12135, + "time_per_iteration": 2.5029256343841553 + }, + { + "auxiliary_loss_clip": 0.06412001, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 0.06275906, + "balance_loss_mlp": 0.01256362, + "epoch": 0.7296557943784758, + "flos": 31877282442240.0, + "grad_norm": 2.0055855777259683, + "language_loss": 0.62525374, + "learning_rate": 7.187891296513075e-07, + "loss": 0.70203543, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0980835, + "step": 12136, + "time_per_iteration": 2.6325221061706543 + }, + { + "auxiliary_loss_clip": 0.06405275, + "auxiliary_loss_mlp": 0.01264655, + "balance_loss_clip": 0.06272214, + "balance_loss_mlp": 0.01255184, + "epoch": 0.7297159176311439, + "flos": 26659033246080.0, + "grad_norm": 1.794436841721563, + "language_loss": 0.7470715, + "learning_rate": 7.184900979175654e-07, + "loss": 0.82377088, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09472656, + "step": 12137, + "time_per_iteration": 3.958789825439453 + }, + { + "auxiliary_loss_clip": 0.06406206, + "auxiliary_loss_mlp": 0.0126361, + "balance_loss_clip": 0.0627296, + "balance_loss_mlp": 0.01253466, + "epoch": 0.7297760408838118, + "flos": 24755744079360.0, + "grad_norm": 1.5243930727188364, + "language_loss": 0.74341732, + "learning_rate": 7.181911147788069e-07, + "loss": 0.82011551, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10162354, + "step": 12138, + "time_per_iteration": 2.5344252586364746 + }, + { + "auxiliary_loss_clip": 0.06401816, + "auxiliary_loss_mlp": 0.01265792, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01256434, + "epoch": 0.7298361641364798, + "flos": 18079018715520.0, + "grad_norm": 2.292743835188078, + "language_loss": 0.72074485, + "learning_rate": 7.178921802463702e-07, + "loss": 0.79742092, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09350586, + "step": 12139, + "time_per_iteration": 2.4686436653137207 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06273351, + "balance_loss_mlp": 0.01255015, + "epoch": 0.7298962873891478, + "flos": 29902897486080.0, + "grad_norm": 1.4427366017316514, + "language_loss": 0.73659438, + "learning_rate": 7.175932943315898e-07, + "loss": 0.81325477, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09246826, + "step": 12140, + "time_per_iteration": 2.5841948986053467 + }, + { + "auxiliary_loss_clip": 0.06410205, + "auxiliary_loss_mlp": 0.01265019, + "balance_loss_clip": 0.06274636, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7299564106418157, + "flos": 32273613054720.0, + "grad_norm": 1.4465948977154814, + "language_loss": 0.55615419, + "learning_rate": 7.172944570458003e-07, + "loss": 0.63290644, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10748291, + "step": 12141, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.06406234, + "auxiliary_loss_mlp": 0.01263105, + "balance_loss_clip": 0.06276207, + "balance_loss_mlp": 0.01254277, + "epoch": 0.7300165338944837, + "flos": 22937009281920.0, + "grad_norm": 1.432470794912082, + "language_loss": 0.73197258, + "learning_rate": 7.169956684003342e-07, + "loss": 0.80866599, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0881958, + "step": 12142, + "time_per_iteration": 2.5505692958831787 + }, + { + "auxiliary_loss_clip": 0.0640648, + "auxiliary_loss_mlp": 0.01261695, + "balance_loss_clip": 0.06273788, + "balance_loss_mlp": 0.01252629, + "epoch": 0.7300766571471516, + "flos": 19834959277440.0, + "grad_norm": 1.6768515180809767, + "language_loss": 0.74087632, + "learning_rate": 7.16696928406521e-07, + "loss": 0.81755805, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09063721, + "step": 12143, + "time_per_iteration": 2.490084648132324 + }, + { + "auxiliary_loss_clip": 0.06409657, + "auxiliary_loss_mlp": 0.01263891, + "balance_loss_clip": 0.0627545, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7301367803998197, + "flos": 24353879097600.0, + "grad_norm": 2.204410002817552, + "language_loss": 0.66878092, + "learning_rate": 7.163982370756882e-07, + "loss": 0.74551642, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09973145, + "step": 12144, + "time_per_iteration": 2.54231858253479 + }, + { + "auxiliary_loss_clip": 0.06408693, + "auxiliary_loss_mlp": 0.0126374, + "balance_loss_clip": 0.06274417, + "balance_loss_mlp": 0.01253232, + "epoch": 0.7301969036524876, + "flos": 15309918109440.0, + "grad_norm": 1.5759955689849319, + "language_loss": 0.79171866, + "learning_rate": 7.160995944191627e-07, + "loss": 0.86844301, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10510254, + "step": 12145, + "time_per_iteration": 2.479991912841797 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01266826, + "balance_loss_clip": 0.06275664, + "balance_loss_mlp": 0.01256819, + "epoch": 0.7302570269051556, + "flos": 23512945121280.0, + "grad_norm": 1.601000858309641, + "language_loss": 0.92001355, + "learning_rate": 7.158010004482702e-07, + "loss": 0.99674433, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10003662, + "step": 12146, + "time_per_iteration": 2.536653757095337 + }, + { + "auxiliary_loss_clip": 0.06406654, + "auxiliary_loss_mlp": 0.01262625, + "balance_loss_clip": 0.06276748, + "balance_loss_mlp": 0.01252885, + "epoch": 0.7303171501578235, + "flos": 20529508970880.0, + "grad_norm": 1.778676340204468, + "language_loss": 0.62199593, + "learning_rate": 7.155024551743316e-07, + "loss": 0.69868875, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.097229, + "step": 12147, + "time_per_iteration": 3.9292736053466797 + }, + { + "auxiliary_loss_clip": 0.06418571, + "auxiliary_loss_mlp": 0.01266018, + "balance_loss_clip": 0.06282554, + "balance_loss_mlp": 0.0125579, + "epoch": 0.7303772734104915, + "flos": 18338482483200.0, + "grad_norm": 1.749812940389672, + "language_loss": 0.75328469, + "learning_rate": 7.152039586086693e-07, + "loss": 0.83013058, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10229492, + "step": 12148, + "time_per_iteration": 2.466489791870117 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01255211, + "balance_loss_clip": 0.06256207, + "balance_loss_mlp": 0.01254079, + "epoch": 0.7304373966631594, + "flos": 60673604181120.0, + "grad_norm": 3.1920126472148245, + "language_loss": 0.56622815, + "learning_rate": 7.149055107626017e-07, + "loss": 0.64189649, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01133728, + "step": 12149, + "time_per_iteration": 3.1208536624908447 + }, + { + "auxiliary_loss_clip": 0.06409251, + "auxiliary_loss_mlp": 0.01266086, + "balance_loss_clip": 0.06273203, + "balance_loss_mlp": 0.01256108, + "epoch": 0.7304975199158275, + "flos": 19834120736640.0, + "grad_norm": 2.2110460738796847, + "language_loss": 0.74197543, + "learning_rate": 7.146071116474451e-07, + "loss": 0.8187288, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09979248, + "step": 12150, + "time_per_iteration": 2.563061475753784 + }, + { + "auxiliary_loss_clip": 0.06411943, + "auxiliary_loss_mlp": 0.01268026, + "balance_loss_clip": 0.0627417, + "balance_loss_mlp": 0.01257804, + "epoch": 0.7305576431684954, + "flos": 13228910432640.0, + "grad_norm": 2.0644493545304012, + "language_loss": 0.845092, + "learning_rate": 7.143087612745158e-07, + "loss": 0.92189169, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10229492, + "step": 12151, + "time_per_iteration": 3.9333503246307373 + }, + { + "auxiliary_loss_clip": 0.0641029, + "auxiliary_loss_mlp": 0.01268677, + "balance_loss_clip": 0.06276184, + "balance_loss_mlp": 0.01258395, + "epoch": 0.7306177664211634, + "flos": 24067231879680.0, + "grad_norm": 1.709088154989502, + "language_loss": 0.77853483, + "learning_rate": 7.14010459655127e-07, + "loss": 0.85532451, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10272217, + "step": 12152, + "time_per_iteration": 2.549255132675171 + }, + { + "auxiliary_loss_clip": 0.06408677, + "auxiliary_loss_mlp": 0.01265757, + "balance_loss_clip": 0.06275931, + "balance_loss_mlp": 0.0125588, + "epoch": 0.7306778896738314, + "flos": 27096425159040.0, + "grad_norm": 1.4467429234304112, + "language_loss": 0.79911304, + "learning_rate": 7.137122068005919e-07, + "loss": 0.87585741, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09875488, + "step": 12153, + "time_per_iteration": 2.584221839904785 + }, + { + "auxiliary_loss_clip": 0.06409719, + "auxiliary_loss_mlp": 0.01268444, + "balance_loss_clip": 0.06271548, + "balance_loss_mlp": 0.01258473, + "epoch": 0.7307380129264993, + "flos": 16696250311680.0, + "grad_norm": 1.5292836861635837, + "language_loss": 0.67226088, + "learning_rate": 7.134140027222173e-07, + "loss": 0.74904257, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09967041, + "step": 12154, + "time_per_iteration": 2.482377052307129 + }, + { + "auxiliary_loss_clip": 0.06408456, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01255584, + "epoch": 0.7307981361791673, + "flos": 21732169023360.0, + "grad_norm": 1.735892015555871, + "language_loss": 0.66179639, + "learning_rate": 7.131158474313128e-07, + "loss": 0.73853588, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09912109, + "step": 12155, + "time_per_iteration": 3.920834541320801 + }, + { + "auxiliary_loss_clip": 0.06405047, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 0.06273931, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7308582594318352, + "flos": 18046468604160.0, + "grad_norm": 1.7732442430270934, + "language_loss": 0.82409012, + "learning_rate": 7.128177409391851e-07, + "loss": 0.90078008, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09527588, + "step": 12156, + "time_per_iteration": 2.498297691345215 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.01268424, + "balance_loss_clip": 0.06272586, + "balance_loss_mlp": 0.01259304, + "epoch": 0.7309183826845033, + "flos": 13850100276480.0, + "grad_norm": 2.231479695583903, + "language_loss": 0.75512803, + "learning_rate": 7.125196832571367e-07, + "loss": 0.83185542, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09118652, + "step": 12157, + "time_per_iteration": 2.469118595123291 + }, + { + "auxiliary_loss_clip": 0.06404246, + "auxiliary_loss_mlp": 0.0126719, + "balance_loss_clip": 0.06274454, + "balance_loss_mlp": 0.0125816, + "epoch": 0.7309785059371712, + "flos": 17024881224960.0, + "grad_norm": 1.9988755435472185, + "language_loss": 0.73910487, + "learning_rate": 7.122216743964713e-07, + "loss": 0.81581926, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.090271, + "step": 12158, + "time_per_iteration": 2.498945713043213 + }, + { + "auxiliary_loss_clip": 0.06413946, + "auxiliary_loss_mlp": 0.01263808, + "balance_loss_clip": 0.06278427, + "balance_loss_mlp": 0.0125417, + "epoch": 0.7310386291898392, + "flos": 26509127091840.0, + "grad_norm": 1.5605455050098358, + "language_loss": 0.85817492, + "learning_rate": 7.119237143684896e-07, + "loss": 0.93495244, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09637451, + "step": 12159, + "time_per_iteration": 2.5414113998413086 + }, + { + "auxiliary_loss_clip": 0.06415824, + "auxiliary_loss_mlp": 0.01267306, + "balance_loss_clip": 0.0627675, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7310987524425071, + "flos": 16951521375360.0, + "grad_norm": 1.9612355888194155, + "language_loss": 0.74199778, + "learning_rate": 7.116258031844895e-07, + "loss": 0.81882906, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.1114502, + "step": 12160, + "time_per_iteration": 2.598435163497925 + }, + { + "auxiliary_loss_clip": 0.06413984, + "auxiliary_loss_mlp": 0.01266348, + "balance_loss_clip": 0.06275676, + "balance_loss_mlp": 0.01256304, + "epoch": 0.7311588756951751, + "flos": 13850477619840.0, + "grad_norm": 2.3687706371159023, + "language_loss": 0.72816062, + "learning_rate": 7.113279408557675e-07, + "loss": 0.80496389, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10040283, + "step": 12161, + "time_per_iteration": 2.487931728363037 + }, + { + "auxiliary_loss_clip": 0.06419692, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06277676, + "balance_loss_mlp": 0.01254413, + "epoch": 0.731218998947843, + "flos": 28775567854080.0, + "grad_norm": 1.7390428804054665, + "language_loss": 0.69832623, + "learning_rate": 7.110301273936192e-07, + "loss": 0.77517438, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.10705566, + "step": 12162, + "time_per_iteration": 2.578719139099121 + }, + { + "auxiliary_loss_clip": 0.06409628, + "auxiliary_loss_mlp": 0.01266805, + "balance_loss_clip": 0.0627304, + "balance_loss_mlp": 0.01256785, + "epoch": 0.7312791222005111, + "flos": 27096047815680.0, + "grad_norm": 1.6401378277284773, + "language_loss": 0.67019415, + "learning_rate": 7.107323628093382e-07, + "loss": 0.74695843, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10028076, + "step": 12163, + "time_per_iteration": 2.5393404960632324 + }, + { + "auxiliary_loss_clip": 0.06406513, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.0627192, + "balance_loss_mlp": 0.012566, + "epoch": 0.731339245453179, + "flos": 20930493484800.0, + "grad_norm": 1.6144773935767842, + "language_loss": 0.68972957, + "learning_rate": 7.104346471142153e-07, + "loss": 0.76646197, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10119629, + "step": 12164, + "time_per_iteration": 2.5153493881225586 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.01263865, + "balance_loss_clip": 0.06274466, + "balance_loss_mlp": 0.01254262, + "epoch": 0.731399368705847, + "flos": 23082345388800.0, + "grad_norm": 1.4748874559419136, + "language_loss": 0.73714507, + "learning_rate": 7.101369803195391e-07, + "loss": 0.81382716, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0960083, + "step": 12165, + "time_per_iteration": 2.5240328311920166 + }, + { + "auxiliary_loss_clip": 0.06409434, + "auxiliary_loss_mlp": 0.01264974, + "balance_loss_clip": 0.06273365, + "balance_loss_mlp": 0.01254782, + "epoch": 0.731459491958515, + "flos": 23588778666240.0, + "grad_norm": 1.7494932066214843, + "language_loss": 0.76978707, + "learning_rate": 7.098393624365988e-07, + "loss": 0.84653127, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10192871, + "step": 12166, + "time_per_iteration": 2.535602569580078 + }, + { + "auxiliary_loss_clip": 0.06405294, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06273952, + "balance_loss_mlp": 0.01254574, + "epoch": 0.7315196152111829, + "flos": 22385280072960.0, + "grad_norm": 1.6529519301050002, + "language_loss": 0.79870826, + "learning_rate": 7.095417934766781e-07, + "loss": 0.87540716, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10015869, + "step": 12167, + "time_per_iteration": 2.5016744136810303 + }, + { + "auxiliary_loss_clip": 0.06406464, + "auxiliary_loss_mlp": 0.0126602, + "balance_loss_clip": 0.06274685, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7315797384638509, + "flos": 26184227685120.0, + "grad_norm": 1.5786791569795495, + "language_loss": 0.77113497, + "learning_rate": 7.092442734510622e-07, + "loss": 0.84785974, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09863281, + "step": 12168, + "time_per_iteration": 2.550841808319092 + }, + { + "auxiliary_loss_clip": 0.06411693, + "auxiliary_loss_mlp": 0.01264978, + "balance_loss_clip": 0.06273279, + "balance_loss_mlp": 0.01254011, + "epoch": 0.7316398617165188, + "flos": 21512634526080.0, + "grad_norm": 1.4637772541157787, + "language_loss": 0.82124925, + "learning_rate": 7.089468023710326e-07, + "loss": 0.89801592, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10955811, + "step": 12169, + "time_per_iteration": 2.4971840381622314 + }, + { + "auxiliary_loss_clip": 0.06413089, + "auxiliary_loss_mlp": 0.01269449, + "balance_loss_clip": 0.06276171, + "balance_loss_mlp": 0.01259031, + "epoch": 0.7316999849691869, + "flos": 30490489042560.0, + "grad_norm": 1.5962469016193046, + "language_loss": 0.70136017, + "learning_rate": 7.08649380247871e-07, + "loss": 0.77818549, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10418701, + "step": 12170, + "time_per_iteration": 2.580601692199707 + }, + { + "auxiliary_loss_clip": 0.06408713, + "auxiliary_loss_mlp": 0.01268064, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256655, + "epoch": 0.7317601082218548, + "flos": 21550257809280.0, + "grad_norm": 1.8557087884597323, + "language_loss": 0.69686925, + "learning_rate": 7.083520070928533e-07, + "loss": 0.773637, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11413574, + "step": 12171, + "time_per_iteration": 2.483708143234253 + }, + { + "auxiliary_loss_clip": 0.06406379, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06272611, + "balance_loss_mlp": 0.01258406, + "epoch": 0.7318202314745228, + "flos": 33259338086400.0, + "grad_norm": 1.4958611702028526, + "language_loss": 0.65253127, + "learning_rate": 7.080546829172564e-07, + "loss": 0.72928506, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10595703, + "step": 12172, + "time_per_iteration": 2.6077332496643066 + }, + { + "auxiliary_loss_clip": 0.06410083, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06274219, + "balance_loss_mlp": 0.0125547, + "epoch": 0.7318803547271907, + "flos": 20163254774400.0, + "grad_norm": 2.043922732836794, + "language_loss": 0.61819667, + "learning_rate": 7.077574077323564e-07, + "loss": 0.69495922, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12173, + "time_per_iteration": 2.4937400817871094 + }, + { + "auxiliary_loss_clip": 0.06411927, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.0627674, + "balance_loss_mlp": 0.01256395, + "epoch": 0.7319404779798587, + "flos": 20564826266880.0, + "grad_norm": 1.776213405218001, + "language_loss": 0.74138248, + "learning_rate": 7.074601815494243e-07, + "loss": 0.81816107, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09533691, + "step": 12174, + "time_per_iteration": 2.5296590328216553 + }, + { + "auxiliary_loss_clip": 0.06402949, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.06272517, + "balance_loss_mlp": 0.01254701, + "epoch": 0.7320006012325266, + "flos": 28703130399360.0, + "grad_norm": 1.6525649397268998, + "language_loss": 0.81230605, + "learning_rate": 7.071630043797317e-07, + "loss": 0.88897324, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09069824, + "step": 12175, + "time_per_iteration": 2.5799436569213867 + }, + { + "auxiliary_loss_clip": 0.06408073, + "auxiliary_loss_mlp": 0.01263853, + "balance_loss_clip": 0.06274186, + "balance_loss_mlp": 0.01253846, + "epoch": 0.7320607244851947, + "flos": 16368290231040.0, + "grad_norm": 1.8780371649414138, + "language_loss": 0.76478672, + "learning_rate": 7.068658762345488e-07, + "loss": 0.841506, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12176, + "time_per_iteration": 2.48456072807312 + }, + { + "auxiliary_loss_clip": 0.06404638, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257526, + "epoch": 0.7321208477378626, + "flos": 20960653754880.0, + "grad_norm": 1.8116961288906432, + "language_loss": 0.76882672, + "learning_rate": 7.065687971251399e-07, + "loss": 0.84554708, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09881592, + "step": 12177, + "time_per_iteration": 3.9612483978271484 + }, + { + "auxiliary_loss_clip": 0.06404608, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06272198, + "balance_loss_mlp": 0.01257183, + "epoch": 0.7321809709905306, + "flos": 13850226057600.0, + "grad_norm": 2.0192997733839855, + "language_loss": 0.74703526, + "learning_rate": 7.06271767062772e-07, + "loss": 0.82374752, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09442139, + "step": 12178, + "time_per_iteration": 2.451946973800659 + }, + { + "auxiliary_loss_clip": 0.06407191, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.062708, + "balance_loss_mlp": 0.01253617, + "epoch": 0.7322410942431986, + "flos": 26987286816000.0, + "grad_norm": 1.9092278699703453, + "language_loss": 0.82810688, + "learning_rate": 7.059747860587084e-07, + "loss": 0.90481937, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10449219, + "step": 12179, + "time_per_iteration": 2.5572235584259033 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.0627311, + "balance_loss_mlp": 0.0125573, + "epoch": 0.7323012174958665, + "flos": 17645526017280.0, + "grad_norm": 1.5024024158805138, + "language_loss": 0.7521069, + "learning_rate": 7.056778541242115e-07, + "loss": 0.82877266, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09960938, + "step": 12180, + "time_per_iteration": 2.455678701400757 + }, + { + "auxiliary_loss_clip": 0.06411432, + "auxiliary_loss_mlp": 0.01267053, + "balance_loss_clip": 0.06272306, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7323613407485345, + "flos": 32350914046080.0, + "grad_norm": 1.8054283665304076, + "language_loss": 0.79850274, + "learning_rate": 7.053809712705396e-07, + "loss": 0.87528759, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10552979, + "step": 12181, + "time_per_iteration": 2.595571756362915 + }, + { + "auxiliary_loss_clip": 0.06413537, + "auxiliary_loss_mlp": 0.01272467, + "balance_loss_clip": 0.06274928, + "balance_loss_mlp": 0.01261625, + "epoch": 0.7324214640012024, + "flos": 18367594577280.0, + "grad_norm": 1.7248361460474335, + "language_loss": 0.72176909, + "learning_rate": 7.050841375089506e-07, + "loss": 0.79862905, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10852051, + "step": 12182, + "time_per_iteration": 2.4603164196014404 + }, + { + "auxiliary_loss_clip": 0.06412099, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06276859, + "balance_loss_mlp": 0.01257268, + "epoch": 0.7324815872538705, + "flos": 30820503548160.0, + "grad_norm": 1.5618517746342058, + "language_loss": 0.71680033, + "learning_rate": 7.047873528507015e-07, + "loss": 0.79359412, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10021973, + "step": 12183, + "time_per_iteration": 2.6027462482452393 + }, + { + "auxiliary_loss_clip": 0.0641363, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 0.06275654, + "balance_loss_mlp": 0.01258441, + "epoch": 0.7325417105065384, + "flos": 21511167079680.0, + "grad_norm": 1.8564082179513295, + "language_loss": 0.72663099, + "learning_rate": 7.04490617307045e-07, + "loss": 0.80346817, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11639404, + "step": 12184, + "time_per_iteration": 2.481126070022583 + }, + { + "auxiliary_loss_clip": 0.06312383, + "auxiliary_loss_mlp": 0.01252618, + "balance_loss_clip": 0.06257074, + "balance_loss_mlp": 0.01251615, + "epoch": 0.7326018337592064, + "flos": 67277514746880.0, + "grad_norm": 0.738407632839968, + "language_loss": 0.65071452, + "learning_rate": 7.041939308892344e-07, + "loss": 0.72636449, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01002502, + "step": 12185, + "time_per_iteration": 3.106149196624756 + }, + { + "auxiliary_loss_clip": 0.06409767, + "auxiliary_loss_mlp": 0.01263715, + "balance_loss_clip": 0.06272019, + "balance_loss_mlp": 0.01253278, + "epoch": 0.7326619570118743, + "flos": 22863733286400.0, + "grad_norm": 1.8830306075887209, + "language_loss": 0.8029325, + "learning_rate": 7.038972936085197e-07, + "loss": 0.87966728, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10443115, + "step": 12186, + "time_per_iteration": 3.9164252281188965 + }, + { + "auxiliary_loss_clip": 0.06409957, + "auxiliary_loss_mlp": 0.01267283, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.0125656, + "epoch": 0.7327220802645423, + "flos": 23333591456640.0, + "grad_norm": 3.1049708773187685, + "language_loss": 0.73623288, + "learning_rate": 7.036007054761508e-07, + "loss": 0.81300521, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10717773, + "step": 12187, + "time_per_iteration": 2.534468412399292 + }, + { + "auxiliary_loss_clip": 0.06412861, + "auxiliary_loss_mlp": 0.01267726, + "balance_loss_clip": 0.06277903, + "balance_loss_mlp": 0.01257462, + "epoch": 0.7327822035172102, + "flos": 23186578268160.0, + "grad_norm": 1.736323244132865, + "language_loss": 0.89323306, + "learning_rate": 7.033041665033716e-07, + "loss": 0.97003901, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10272217, + "step": 12188, + "time_per_iteration": 2.6024370193481445 + }, + { + "auxiliary_loss_clip": 0.06405529, + "auxiliary_loss_mlp": 0.01267933, + "balance_loss_clip": 0.06268479, + "balance_loss_mlp": 0.01257449, + "epoch": 0.7328423267698783, + "flos": 21072517355520.0, + "grad_norm": 1.8789204802001953, + "language_loss": 0.75451827, + "learning_rate": 7.030076767014284e-07, + "loss": 0.83125293, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10479736, + "step": 12189, + "time_per_iteration": 2.4941177368164062 + }, + { + "auxiliary_loss_clip": 0.06409896, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01257865, + "epoch": 0.7329024500225462, + "flos": 21696055113600.0, + "grad_norm": 1.5072102792760083, + "language_loss": 0.82332706, + "learning_rate": 7.027112360815648e-07, + "loss": 0.90010929, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10473633, + "step": 12190, + "time_per_iteration": 2.526470184326172 + }, + { + "auxiliary_loss_clip": 0.06406286, + "auxiliary_loss_mlp": 0.01269765, + "balance_loss_clip": 0.06270757, + "balance_loss_mlp": 0.01258995, + "epoch": 0.7329625732752142, + "flos": 24169829604480.0, + "grad_norm": 1.85565696251354, + "language_loss": 0.72012609, + "learning_rate": 7.024148446550204e-07, + "loss": 0.79688656, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10766602, + "step": 12191, + "time_per_iteration": 3.952462673187256 + }, + { + "auxiliary_loss_clip": 0.06405483, + "auxiliary_loss_mlp": 0.01267854, + "balance_loss_clip": 0.06271866, + "balance_loss_mlp": 0.01257793, + "epoch": 0.7330226965278822, + "flos": 30085227970560.0, + "grad_norm": 1.8630604521541774, + "language_loss": 0.69281983, + "learning_rate": 7.021185024330361e-07, + "loss": 0.76955318, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10058594, + "step": 12192, + "time_per_iteration": 2.569606065750122 + }, + { + "auxiliary_loss_clip": 0.06404717, + "auxiliary_loss_mlp": 0.01264705, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01254859, + "epoch": 0.7330828197805501, + "flos": 23375113954560.0, + "grad_norm": 2.149879925519752, + "language_loss": 0.73025858, + "learning_rate": 7.01822209426848e-07, + "loss": 0.80695283, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09844971, + "step": 12193, + "time_per_iteration": 2.5172417163848877 + }, + { + "auxiliary_loss_clip": 0.06408362, + "auxiliary_loss_mlp": 0.01270537, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01260207, + "epoch": 0.7331429430332181, + "flos": 21039170630400.0, + "grad_norm": 1.6561607292660703, + "language_loss": 0.77499682, + "learning_rate": 7.015259656476911e-07, + "loss": 0.85178578, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12194, + "time_per_iteration": 2.479529857635498 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01263406, + "balance_loss_clip": 0.06272283, + "balance_loss_mlp": 0.0125285, + "epoch": 0.733203066285886, + "flos": 14653201334400.0, + "grad_norm": 1.6173563987107382, + "language_loss": 0.70813656, + "learning_rate": 7.012297711067998e-07, + "loss": 0.78482801, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10552979, + "step": 12195, + "time_per_iteration": 3.877392292022705 + }, + { + "auxiliary_loss_clip": 0.06408596, + "auxiliary_loss_mlp": 0.01263504, + "balance_loss_clip": 0.06272919, + "balance_loss_mlp": 0.01253991, + "epoch": 0.7332631895385541, + "flos": 17171013945600.0, + "grad_norm": 1.8915458632347482, + "language_loss": 0.72392344, + "learning_rate": 7.009336258154057e-07, + "loss": 0.80064452, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09515381, + "step": 12196, + "time_per_iteration": 2.475527286529541 + }, + { + "auxiliary_loss_clip": 0.0640474, + "auxiliary_loss_mlp": 0.01267096, + "balance_loss_clip": 0.06272123, + "balance_loss_mlp": 0.01256808, + "epoch": 0.733323312791222, + "flos": 28665758678400.0, + "grad_norm": 1.6827859274042947, + "language_loss": 0.7184931, + "learning_rate": 7.006375297847394e-07, + "loss": 0.79521143, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10290527, + "step": 12197, + "time_per_iteration": 2.535411834716797 + }, + { + "auxiliary_loss_clip": 0.06414885, + "auxiliary_loss_mlp": 0.0127094, + "balance_loss_clip": 0.06273107, + "balance_loss_mlp": 0.0125918, + "epoch": 0.73338343604389, + "flos": 16624106346240.0, + "grad_norm": 1.8099581096795507, + "language_loss": 0.7810899, + "learning_rate": 7.003414830260282e-07, + "loss": 0.85794812, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.11767578, + "step": 12198, + "time_per_iteration": 2.5611343383789062 + }, + { + "auxiliary_loss_clip": 0.06406511, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.0125661, + "epoch": 0.7334435592965579, + "flos": 21148434754560.0, + "grad_norm": 1.7977488720869146, + "language_loss": 0.74877429, + "learning_rate": 7.000454855504974e-07, + "loss": 0.82550371, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0982666, + "step": 12199, + "time_per_iteration": 2.549605369567871 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06272766, + "balance_loss_mlp": 0.01255984, + "epoch": 0.7335036825492259, + "flos": 17130455769600.0, + "grad_norm": 2.1057189118558655, + "language_loss": 0.76952875, + "learning_rate": 6.997495373693729e-07, + "loss": 0.84632576, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.11175537, + "step": 12200, + "time_per_iteration": 2.4664149284362793 + }, + { + "auxiliary_loss_clip": 0.06406954, + "auxiliary_loss_mlp": 0.01269537, + "balance_loss_clip": 0.06272939, + "balance_loss_mlp": 0.01258874, + "epoch": 0.7335638058018938, + "flos": 23738475185280.0, + "grad_norm": 1.6692295634407006, + "language_loss": 0.61729515, + "learning_rate": 6.994536384938754e-07, + "loss": 0.69406003, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10662842, + "step": 12201, + "time_per_iteration": 2.5405964851379395 + }, + { + "auxiliary_loss_clip": 0.0640207, + "auxiliary_loss_mlp": 0.01264063, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01254544, + "epoch": 0.7336239290545619, + "flos": 34941876871680.0, + "grad_norm": 1.7828880391385733, + "language_loss": 0.52268887, + "learning_rate": 6.991577889352264e-07, + "loss": 0.59935021, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09521484, + "step": 12202, + "time_per_iteration": 2.610280990600586 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270868, + "balance_loss_mlp": 0.01255082, + "epoch": 0.7336840523072298, + "flos": 21108966681600.0, + "grad_norm": 3.0029682825255706, + "language_loss": 0.686993, + "learning_rate": 6.98861988704645e-07, + "loss": 0.76368117, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09844971, + "step": 12203, + "time_per_iteration": 2.507932424545288 + }, + { + "auxiliary_loss_clip": 0.06414039, + "auxiliary_loss_mlp": 0.01270628, + "balance_loss_clip": 0.06272701, + "balance_loss_mlp": 0.01259959, + "epoch": 0.7337441755598978, + "flos": 24031243751040.0, + "grad_norm": 2.856553755482537, + "language_loss": 0.66825521, + "learning_rate": 6.985662378133474e-07, + "loss": 0.74510193, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.10668945, + "step": 12204, + "time_per_iteration": 2.514671802520752 + }, + { + "auxiliary_loss_clip": 0.06406862, + "auxiliary_loss_mlp": 0.01263286, + "balance_loss_clip": 0.06273827, + "balance_loss_mlp": 0.01253779, + "epoch": 0.7338042988125658, + "flos": 22717977909120.0, + "grad_norm": 1.8458208661726296, + "language_loss": 0.77401447, + "learning_rate": 6.982705362725479e-07, + "loss": 0.85071599, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09509277, + "step": 12205, + "time_per_iteration": 2.5407674312591553 + }, + { + "auxiliary_loss_clip": 0.06401809, + "auxiliary_loss_mlp": 0.01264175, + "balance_loss_clip": 0.06270801, + "balance_loss_mlp": 0.01255288, + "epoch": 0.7338644220652337, + "flos": 21367382273280.0, + "grad_norm": 2.465584123041792, + "language_loss": 0.80136371, + "learning_rate": 6.979748840934601e-07, + "loss": 0.87802351, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08892822, + "step": 12206, + "time_per_iteration": 2.505405902862549 + }, + { + "auxiliary_loss_clip": 0.06407475, + "auxiliary_loss_mlp": 0.01266198, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7339245453179017, + "flos": 30928216371840.0, + "grad_norm": 1.8649817824814656, + "language_loss": 0.71671152, + "learning_rate": 6.976792812872958e-07, + "loss": 0.79344821, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09747314, + "step": 12207, + "time_per_iteration": 2.5743727684020996 + }, + { + "auxiliary_loss_clip": 0.06311717, + "auxiliary_loss_mlp": 0.01252748, + "balance_loss_clip": 0.06256534, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7339846685705697, + "flos": 67916789873280.0, + "grad_norm": 0.7657187342696471, + "language_loss": 0.54859233, + "learning_rate": 6.97383727865263e-07, + "loss": 0.62423694, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.00988007, + "step": 12208, + "time_per_iteration": 3.215527057647705 + }, + { + "auxiliary_loss_clip": 0.06409256, + "auxiliary_loss_mlp": 0.01263774, + "balance_loss_clip": 0.06273347, + "balance_loss_mlp": 0.01253963, + "epoch": 0.7340447918232377, + "flos": 22243298129280.0, + "grad_norm": 1.295062015849254, + "language_loss": 0.80369568, + "learning_rate": 6.970882238385703e-07, + "loss": 0.88042593, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.0980835, + "step": 12209, + "time_per_iteration": 2.604940414428711 + }, + { + "auxiliary_loss_clip": 0.06402272, + "auxiliary_loss_mlp": 0.01265832, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01256164, + "epoch": 0.7341049150759056, + "flos": 23770857588480.0, + "grad_norm": 1.3756281752304946, + "language_loss": 0.7923339, + "learning_rate": 6.96792769218423e-07, + "loss": 0.86901498, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09661865, + "step": 12210, + "time_per_iteration": 2.586808919906616 + }, + { + "auxiliary_loss_clip": 0.06405463, + "auxiliary_loss_mlp": 0.01263055, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01253142, + "epoch": 0.7341650383285736, + "flos": 17241983953920.0, + "grad_norm": 1.587399394910607, + "language_loss": 0.76868075, + "learning_rate": 6.964973640160236e-07, + "loss": 0.84536588, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09918213, + "step": 12211, + "time_per_iteration": 2.5032119750976562 + }, + { + "auxiliary_loss_clip": 0.06406663, + "auxiliary_loss_mlp": 0.01269483, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01259464, + "epoch": 0.7342251615812415, + "flos": 23410640885760.0, + "grad_norm": 1.8683107617310235, + "language_loss": 0.7257871, + "learning_rate": 6.962020082425748e-07, + "loss": 0.80254853, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12212, + "time_per_iteration": 2.529822826385498 + }, + { + "auxiliary_loss_clip": 0.06408443, + "auxiliary_loss_mlp": 0.01264026, + "balance_loss_clip": 0.06274249, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7342852848339095, + "flos": 22753756402560.0, + "grad_norm": 1.4731208484223037, + "language_loss": 0.69065344, + "learning_rate": 6.959067019092766e-07, + "loss": 0.76737809, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 12213, + "time_per_iteration": 2.5050880908966064 + }, + { + "auxiliary_loss_clip": 0.06311147, + "auxiliary_loss_mlp": 0.01250993, + "balance_loss_clip": 0.06256209, + "balance_loss_mlp": 0.01250006, + "epoch": 0.7343454080865774, + "flos": 53960219856000.0, + "grad_norm": 0.6961582505379801, + "language_loss": 0.54205143, + "learning_rate": 6.956114450273276e-07, + "loss": 0.61767286, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00987244, + "step": 12214, + "time_per_iteration": 3.01758074760437 + }, + { + "auxiliary_loss_clip": 0.06412373, + "auxiliary_loss_mlp": 0.0126565, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01255058, + "epoch": 0.7344055313392455, + "flos": 12171754195200.0, + "grad_norm": 1.9351269551691648, + "language_loss": 0.70493495, + "learning_rate": 6.953162376079233e-07, + "loss": 0.78171515, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.105896, + "step": 12215, + "time_per_iteration": 2.450974941253662 + }, + { + "auxiliary_loss_clip": 0.06400481, + "auxiliary_loss_mlp": 0.0126608, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01256347, + "epoch": 0.7344656545919134, + "flos": 18555710993280.0, + "grad_norm": 1.5126294577685706, + "language_loss": 0.7330094, + "learning_rate": 6.950210796622573e-07, + "loss": 0.80967498, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09741211, + "step": 12216, + "time_per_iteration": 3.8361501693725586 + }, + { + "auxiliary_loss_clip": 0.06417778, + "auxiliary_loss_mlp": 0.01265589, + "balance_loss_clip": 0.06274825, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7345257778445814, + "flos": 23668762988160.0, + "grad_norm": 1.664988120098628, + "language_loss": 0.78114659, + "learning_rate": 6.947259712015236e-07, + "loss": 0.85798025, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.11505127, + "step": 12217, + "time_per_iteration": 2.5286312103271484 + }, + { + "auxiliary_loss_clip": 0.06405286, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06273887, + "balance_loss_mlp": 0.01256056, + "epoch": 0.7345859010972494, + "flos": 13813818658560.0, + "grad_norm": 2.564959401036019, + "language_loss": 0.78167617, + "learning_rate": 6.94430912236911e-07, + "loss": 0.85838252, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09289551, + "step": 12218, + "time_per_iteration": 2.4696590900421143 + }, + { + "auxiliary_loss_clip": 0.06401719, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06270626, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7346460243499173, + "flos": 22279202403840.0, + "grad_norm": 1.5944736181083394, + "language_loss": 0.72325158, + "learning_rate": 6.941359027796092e-07, + "loss": 0.79992205, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09777832, + "step": 12219, + "time_per_iteration": 2.5853631496429443 + }, + { + "auxiliary_loss_clip": 0.06402183, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.06272361, + "balance_loss_mlp": 0.01255531, + "epoch": 0.7347061476025853, + "flos": 23261447491200.0, + "grad_norm": 1.646626241048598, + "language_loss": 0.74960732, + "learning_rate": 6.938409428408061e-07, + "loss": 0.82627851, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09405518, + "step": 12220, + "time_per_iteration": 2.5074381828308105 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01266137, + "balance_loss_clip": 0.06272232, + "balance_loss_mlp": 0.01255384, + "epoch": 0.7347662708552533, + "flos": 15272881804800.0, + "grad_norm": 1.5752596580091636, + "language_loss": 0.65676045, + "learning_rate": 6.93546032431684e-07, + "loss": 0.73353267, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10742188, + "step": 12221, + "time_per_iteration": 2.4807536602020264 + }, + { + "auxiliary_loss_clip": 0.06407331, + "auxiliary_loss_mlp": 0.01266734, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01256809, + "epoch": 0.7348263941079213, + "flos": 24866349868800.0, + "grad_norm": 1.700720501906822, + "language_loss": 0.6957171, + "learning_rate": 6.932511715634273e-07, + "loss": 0.77245772, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09918213, + "step": 12222, + "time_per_iteration": 2.550657272338867 + }, + { + "auxiliary_loss_clip": 0.06405503, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06273381, + "balance_loss_mlp": 0.01257054, + "epoch": 0.7348865173605892, + "flos": 24358868415360.0, + "grad_norm": 1.4474540063064079, + "language_loss": 0.66394234, + "learning_rate": 6.92956360247217e-07, + "loss": 0.74065632, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08843994, + "step": 12223, + "time_per_iteration": 2.5699193477630615 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.0126412, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01254404, + "epoch": 0.7349466406132572, + "flos": 20009700967680.0, + "grad_norm": 2.3059227794211834, + "language_loss": 0.72692394, + "learning_rate": 6.926615984942332e-07, + "loss": 0.80362213, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09716797, + "step": 12224, + "time_per_iteration": 2.470388412475586 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01265671, + "balance_loss_clip": 0.06273518, + "balance_loss_mlp": 0.01254776, + "epoch": 0.7350067638659251, + "flos": 29832766018560.0, + "grad_norm": 1.7299293804881801, + "language_loss": 0.72725701, + "learning_rate": 6.92366886315652e-07, + "loss": 0.80401695, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10900879, + "step": 12225, + "time_per_iteration": 2.596513509750366 + }, + { + "auxiliary_loss_clip": 0.06415132, + "auxiliary_loss_mlp": 0.0126347, + "balance_loss_clip": 0.06274726, + "balance_loss_mlp": 0.01252825, + "epoch": 0.7350668871185931, + "flos": 21871677271680.0, + "grad_norm": 1.7624309121462833, + "language_loss": 0.76816809, + "learning_rate": 6.920722237226501e-07, + "loss": 0.84495413, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10644531, + "step": 12226, + "time_per_iteration": 3.9786300659179688 + }, + { + "auxiliary_loss_clip": 0.06405763, + "auxiliary_loss_mlp": 0.01263929, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01254041, + "epoch": 0.735127010371261, + "flos": 22572893364480.0, + "grad_norm": 1.4073989113743075, + "language_loss": 0.67142195, + "learning_rate": 6.917776107264008e-07, + "loss": 0.74811888, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09893799, + "step": 12227, + "time_per_iteration": 2.5849621295928955 + }, + { + "auxiliary_loss_clip": 0.06410711, + "auxiliary_loss_mlp": 0.012626, + "balance_loss_clip": 0.06274848, + "balance_loss_mlp": 0.0125292, + "epoch": 0.7351871336239291, + "flos": 25891333338240.0, + "grad_norm": 1.4691171153634894, + "language_loss": 0.63763392, + "learning_rate": 6.914830473380749e-07, + "loss": 0.71436703, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09680176, + "step": 12228, + "time_per_iteration": 2.535334587097168 + }, + { + "auxiliary_loss_clip": 0.06409031, + "auxiliary_loss_mlp": 0.01263285, + "balance_loss_clip": 0.06274029, + "balance_loss_mlp": 0.0125404, + "epoch": 0.735247256876597, + "flos": 17938126874880.0, + "grad_norm": 1.6163859960159983, + "language_loss": 0.6387676, + "learning_rate": 6.911885335688427e-07, + "loss": 0.7154907, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09246826, + "step": 12229, + "time_per_iteration": 2.5226519107818604 + }, + { + "auxiliary_loss_clip": 0.06409419, + "auxiliary_loss_mlp": 0.01264039, + "balance_loss_clip": 0.06271814, + "balance_loss_mlp": 0.01253352, + "epoch": 0.735307380129265, + "flos": 28882484064000.0, + "grad_norm": 1.5503109559277863, + "language_loss": 0.734267, + "learning_rate": 6.908940694298726e-07, + "loss": 0.81100154, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10693359, + "step": 12230, + "time_per_iteration": 3.9754912853240967 + }, + { + "auxiliary_loss_clip": 0.06410781, + "auxiliary_loss_mlp": 0.01267625, + "balance_loss_clip": 0.06275117, + "balance_loss_mlp": 0.01257177, + "epoch": 0.7353675033819329, + "flos": 13630691560320.0, + "grad_norm": 2.023268936424561, + "language_loss": 0.72356808, + "learning_rate": 6.90599654932332e-07, + "loss": 0.8003521, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10455322, + "step": 12231, + "time_per_iteration": 2.4864163398742676 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01272917, + "balance_loss_clip": 0.06275348, + "balance_loss_mlp": 0.01262003, + "epoch": 0.7354276266346009, + "flos": 19469040497280.0, + "grad_norm": 2.0034739477169965, + "language_loss": 0.64325827, + "learning_rate": 6.903052900873823e-07, + "loss": 0.72010976, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10906982, + "step": 12232, + "time_per_iteration": 2.5125675201416016 + }, + { + "auxiliary_loss_clip": 0.06407313, + "auxiliary_loss_mlp": 0.01267406, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01256922, + "epoch": 0.735487749887269, + "flos": 15776170554240.0, + "grad_norm": 1.8738456436799267, + "language_loss": 0.75562924, + "learning_rate": 6.900109749061874e-07, + "loss": 0.83237642, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10491943, + "step": 12233, + "time_per_iteration": 2.496495246887207 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268588, + "balance_loss_clip": 0.06273118, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7355478731399369, + "flos": 18266673934080.0, + "grad_norm": 1.8052457003626037, + "language_loss": 0.73313487, + "learning_rate": 6.897167093999079e-07, + "loss": 0.80989963, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10534668, + "step": 12234, + "time_per_iteration": 3.9552576541900635 + }, + { + "auxiliary_loss_clip": 0.064089, + "auxiliary_loss_mlp": 0.01265135, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.01255104, + "epoch": 0.7356079963926049, + "flos": 26549307924480.0, + "grad_norm": 1.8318735304656244, + "language_loss": 0.59923625, + "learning_rate": 6.894224935797017e-07, + "loss": 0.67597657, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10028076, + "step": 12235, + "time_per_iteration": 2.536958932876587 + }, + { + "auxiliary_loss_clip": 0.06406462, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.06273465, + "balance_loss_mlp": 0.01255611, + "epoch": 0.7356681196452728, + "flos": 10782990224640.0, + "grad_norm": 2.1420111841430445, + "language_loss": 0.86364961, + "learning_rate": 6.891283274567259e-07, + "loss": 0.94037515, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10479736, + "step": 12236, + "time_per_iteration": 2.4920454025268555 + }, + { + "auxiliary_loss_clip": 0.0641176, + "auxiliary_loss_mlp": 0.01264567, + "balance_loss_clip": 0.06274892, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7357282428979408, + "flos": 19724730831360.0, + "grad_norm": 1.819458830371115, + "language_loss": 0.69971436, + "learning_rate": 6.888342110421364e-07, + "loss": 0.77647763, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1038208, + "step": 12237, + "time_per_iteration": 2.5083632469177246 + }, + { + "auxiliary_loss_clip": 0.0640821, + "auxiliary_loss_mlp": 0.01262709, + "balance_loss_clip": 0.06271386, + "balance_loss_mlp": 0.01252647, + "epoch": 0.7357883661506087, + "flos": 19470130600320.0, + "grad_norm": 1.6051120472726816, + "language_loss": 0.72315025, + "learning_rate": 6.885401443470839e-07, + "loss": 0.79985946, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10070801, + "step": 12238, + "time_per_iteration": 2.5418028831481934 + }, + { + "auxiliary_loss_clip": 0.06415435, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_clip": 0.06272001, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7358484894032767, + "flos": 27129897665280.0, + "grad_norm": 1.6224977172165573, + "language_loss": 0.73030883, + "learning_rate": 6.882461273827205e-07, + "loss": 0.8071416, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.10809326, + "step": 12239, + "time_per_iteration": 2.57132887840271 + }, + { + "auxiliary_loss_clip": 0.06405096, + "auxiliary_loss_mlp": 0.01263816, + "balance_loss_clip": 0.06275095, + "balance_loss_mlp": 0.01254786, + "epoch": 0.7359086126559446, + "flos": 24509780818560.0, + "grad_norm": 1.236291832045993, + "language_loss": 0.79114598, + "learning_rate": 6.879521601601954e-07, + "loss": 0.8678351, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09033203, + "step": 12240, + "time_per_iteration": 2.574645757675171 + }, + { + "auxiliary_loss_clip": 0.06410246, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.0627753, + "balance_loss_mlp": 0.01256942, + "epoch": 0.7359687359086127, + "flos": 23337993795840.0, + "grad_norm": 1.821182153740144, + "language_loss": 0.83331031, + "learning_rate": 6.876582426906565e-07, + "loss": 0.91007674, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09454346, + "step": 12241, + "time_per_iteration": 2.5325047969818115 + }, + { + "auxiliary_loss_clip": 0.06407616, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06274907, + "balance_loss_mlp": 0.01252909, + "epoch": 0.7360288591612806, + "flos": 20199578319360.0, + "grad_norm": 1.8489352198230395, + "language_loss": 0.78972995, + "learning_rate": 6.873643749852484e-07, + "loss": 0.86643136, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09606934, + "step": 12242, + "time_per_iteration": 2.4817190170288086 + }, + { + "auxiliary_loss_clip": 0.06405145, + "auxiliary_loss_mlp": 0.01268429, + "balance_loss_clip": 0.06273502, + "balance_loss_mlp": 0.01258981, + "epoch": 0.7360889824139486, + "flos": 24979722842880.0, + "grad_norm": 1.7750845941868088, + "language_loss": 0.79797709, + "learning_rate": 6.870705570551145e-07, + "loss": 0.87471282, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09436035, + "step": 12243, + "time_per_iteration": 2.5396323204040527 + }, + { + "auxiliary_loss_clip": 0.06411023, + "auxiliary_loss_mlp": 0.01264312, + "balance_loss_clip": 0.06271946, + "balance_loss_mlp": 0.01253423, + "epoch": 0.7361491056666165, + "flos": 15017610741120.0, + "grad_norm": 2.051473837828663, + "language_loss": 0.74682987, + "learning_rate": 6.867767889113969e-07, + "loss": 0.82358325, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10888672, + "step": 12244, + "time_per_iteration": 2.468791961669922 + }, + { + "auxiliary_loss_clip": 0.06409503, + "auxiliary_loss_mlp": 0.01266285, + "balance_loss_clip": 0.06271558, + "balance_loss_mlp": 0.01256033, + "epoch": 0.7362092289192845, + "flos": 22937135063040.0, + "grad_norm": 1.5646917897943269, + "language_loss": 0.69797492, + "learning_rate": 6.864830705652347e-07, + "loss": 0.77473283, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10253906, + "step": 12245, + "time_per_iteration": 2.6041831970214844 + }, + { + "auxiliary_loss_clip": 0.06401391, + "auxiliary_loss_mlp": 0.01266236, + "balance_loss_clip": 0.06273212, + "balance_loss_mlp": 0.01255787, + "epoch": 0.7362693521719526, + "flos": 20708694927360.0, + "grad_norm": 1.4104590909640493, + "language_loss": 0.73381358, + "learning_rate": 6.861894020277658e-07, + "loss": 0.81048983, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.10449219, + "step": 12246, + "time_per_iteration": 2.5084409713745117 + }, + { + "auxiliary_loss_clip": 0.06402211, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01259833, + "epoch": 0.7363294754246205, + "flos": 13115747093760.0, + "grad_norm": 1.8401513132222869, + "language_loss": 0.73210883, + "learning_rate": 6.858957833101266e-07, + "loss": 0.80881691, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08758545, + "step": 12247, + "time_per_iteration": 2.5997636318206787 + }, + { + "auxiliary_loss_clip": 0.06406122, + "auxiliary_loss_mlp": 0.01262591, + "balance_loss_clip": 0.06276006, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7363895986772885, + "flos": 14032598469120.0, + "grad_norm": 1.520275800225871, + "language_loss": 0.74474341, + "learning_rate": 6.856022144234526e-07, + "loss": 0.8214305, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09899902, + "step": 12248, + "time_per_iteration": 2.4908292293548584 + }, + { + "auxiliary_loss_clip": 0.06410165, + "auxiliary_loss_mlp": 0.01271268, + "balance_loss_clip": 0.06274056, + "balance_loss_mlp": 0.01261022, + "epoch": 0.7364497219299564, + "flos": 19726240204800.0, + "grad_norm": 1.8587136102784652, + "language_loss": 0.73065788, + "learning_rate": 6.853086953788727e-07, + "loss": 0.80747223, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1026001, + "step": 12249, + "time_per_iteration": 2.5477547645568848 + }, + { + "auxiliary_loss_clip": 0.06408364, + "auxiliary_loss_mlp": 0.01269722, + "balance_loss_clip": 0.06275103, + "balance_loss_mlp": 0.0125922, + "epoch": 0.7365098451826244, + "flos": 21367843470720.0, + "grad_norm": 1.7459434910305351, + "language_loss": 0.7680105, + "learning_rate": 6.850152261875189e-07, + "loss": 0.84479141, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.1050415, + "step": 12250, + "time_per_iteration": 2.50736665725708 + }, + { + "auxiliary_loss_clip": 0.06411077, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 0.0627429, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7365699684352923, + "flos": 23375030100480.0, + "grad_norm": 1.6059448981622937, + "language_loss": 0.71334994, + "learning_rate": 6.8472180686052e-07, + "loss": 0.79010946, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10839844, + "step": 12251, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0640523, + "auxiliary_loss_mlp": 0.01263198, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.01253584, + "epoch": 0.7366300916879603, + "flos": 59537610380160.0, + "grad_norm": 1.4529727777201047, + "language_loss": 0.66069037, + "learning_rate": 6.844284374090015e-07, + "loss": 0.73737466, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09606934, + "step": 12252, + "time_per_iteration": 2.884873628616333 + }, + { + "auxiliary_loss_clip": 0.06412438, + "auxiliary_loss_mlp": 0.0126905, + "balance_loss_clip": 0.06274702, + "balance_loss_mlp": 0.01258488, + "epoch": 0.7366902149406283, + "flos": 20929445308800.0, + "grad_norm": 1.6593281267940243, + "language_loss": 0.79292876, + "learning_rate": 6.841351178440884e-07, + "loss": 0.86974359, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10559082, + "step": 12253, + "time_per_iteration": 2.56786847114563 + }, + { + "auxiliary_loss_clip": 0.06405851, + "auxiliary_loss_mlp": 0.01262554, + "balance_loss_clip": 0.06274677, + "balance_loss_mlp": 0.01253739, + "epoch": 0.7367503381932963, + "flos": 17353973335680.0, + "grad_norm": 1.9323805517919423, + "language_loss": 0.76607239, + "learning_rate": 6.83841848176905e-07, + "loss": 0.84275639, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08813477, + "step": 12254, + "time_per_iteration": 2.465092182159424 + }, + { + "auxiliary_loss_clip": 0.06408474, + "auxiliary_loss_mlp": 0.01264148, + "balance_loss_clip": 0.06274708, + "balance_loss_mlp": 0.0125361, + "epoch": 0.7368104614459642, + "flos": 17827017960960.0, + "grad_norm": 3.2694109886339366, + "language_loss": 0.69397593, + "learning_rate": 6.835486284185692e-07, + "loss": 0.77070212, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10534668, + "step": 12255, + "time_per_iteration": 2.5002591609954834 + }, + { + "auxiliary_loss_clip": 0.06412044, + "auxiliary_loss_mlp": 0.01265607, + "balance_loss_clip": 0.06276523, + "balance_loss_mlp": 0.01255117, + "epoch": 0.7368705846986322, + "flos": 24612672032640.0, + "grad_norm": 1.5801315841847023, + "language_loss": 0.75219184, + "learning_rate": 6.832554585802012e-07, + "loss": 0.82896841, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10491943, + "step": 12256, + "time_per_iteration": 4.017148494720459 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.0126377, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01254043, + "epoch": 0.7369307079513001, + "flos": 34978829322240.0, + "grad_norm": 1.5326155216287436, + "language_loss": 0.74032342, + "learning_rate": 6.829623386729182e-07, + "loss": 0.81704414, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09729004, + "step": 12257, + "time_per_iteration": 2.647477388381958 + }, + { + "auxiliary_loss_clip": 0.06406973, + "auxiliary_loss_mlp": 0.01263484, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7369908312039681, + "flos": 21220872209280.0, + "grad_norm": 1.4761434387135868, + "language_loss": 0.78534251, + "learning_rate": 6.826692687078362e-07, + "loss": 0.86204708, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09521484, + "step": 12258, + "time_per_iteration": 2.572261333465576 + }, + { + "auxiliary_loss_clip": 0.06412143, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06274798, + "balance_loss_mlp": 0.01255194, + "epoch": 0.7370509544566362, + "flos": 23630510799360.0, + "grad_norm": 1.4160381635671, + "language_loss": 0.66616917, + "learning_rate": 6.823762486960674e-07, + "loss": 0.74294007, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09759521, + "step": 12259, + "time_per_iteration": 2.507096290588379 + }, + { + "auxiliary_loss_clip": 0.06408918, + "auxiliary_loss_mlp": 0.01264842, + "balance_loss_clip": 0.06274989, + "balance_loss_mlp": 0.01254406, + "epoch": 0.7371110777093041, + "flos": 24834764079360.0, + "grad_norm": 1.6356397611324185, + "language_loss": 0.73572636, + "learning_rate": 6.820832786487225e-07, + "loss": 0.81246388, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10424805, + "step": 12260, + "time_per_iteration": 2.55729341506958 + }, + { + "auxiliary_loss_clip": 0.06410116, + "auxiliary_loss_mlp": 0.0126791, + "balance_loss_clip": 0.06274181, + "balance_loss_mlp": 0.01257217, + "epoch": 0.7371712009619721, + "flos": 23156292216960.0, + "grad_norm": 1.5911507549060615, + "language_loss": 0.7366817, + "learning_rate": 6.817903585769125e-07, + "loss": 0.81346196, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10693359, + "step": 12261, + "time_per_iteration": 2.4976613521575928 + }, + { + "auxiliary_loss_clip": 0.06411919, + "auxiliary_loss_mlp": 0.01266277, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.01254845, + "epoch": 0.73723132421464, + "flos": 23119675182720.0, + "grad_norm": 1.9595701183137586, + "language_loss": 0.67333376, + "learning_rate": 6.814974884917438e-07, + "loss": 0.75011569, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.11425781, + "step": 12262, + "time_per_iteration": 2.5359151363372803 + }, + { + "auxiliary_loss_clip": 0.06410287, + "auxiliary_loss_mlp": 0.01266365, + "balance_loss_clip": 0.06273925, + "balance_loss_mlp": 0.01255881, + "epoch": 0.737291447467308, + "flos": 19278031115520.0, + "grad_norm": 1.8055684860594015, + "language_loss": 0.8872509, + "learning_rate": 6.81204668404322e-07, + "loss": 0.96401745, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10479736, + "step": 12263, + "time_per_iteration": 2.4645025730133057 + }, + { + "auxiliary_loss_clip": 0.06401009, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01253717, + "epoch": 0.7373515707199759, + "flos": 25125142803840.0, + "grad_norm": 1.5128594481302715, + "language_loss": 0.67552602, + "learning_rate": 6.809118983257522e-07, + "loss": 0.75216436, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09100342, + "step": 12264, + "time_per_iteration": 2.569833517074585 + }, + { + "auxiliary_loss_clip": 0.06405195, + "auxiliary_loss_mlp": 0.012641, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.0125442, + "epoch": 0.737411693972644, + "flos": 32415427290240.0, + "grad_norm": 1.6707890497545697, + "language_loss": 0.80282211, + "learning_rate": 6.806191782671356e-07, + "loss": 0.87951505, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09674072, + "step": 12265, + "time_per_iteration": 3.997997283935547 + }, + { + "auxiliary_loss_clip": 0.06415318, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 0.06273959, + "balance_loss_mlp": 0.01253758, + "epoch": 0.7374718172253119, + "flos": 24322586797440.0, + "grad_norm": 1.6052844739789887, + "language_loss": 0.75045347, + "learning_rate": 6.803265082395711e-07, + "loss": 0.82725346, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.10919189, + "step": 12266, + "time_per_iteration": 2.5624334812164307 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01267186, + "balance_loss_clip": 0.06273493, + "balance_loss_mlp": 0.0125697, + "epoch": 0.7375319404779799, + "flos": 27162447776640.0, + "grad_norm": 1.557791078804126, + "language_loss": 0.73471284, + "learning_rate": 6.800338882541576e-07, + "loss": 0.81146955, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10217285, + "step": 12267, + "time_per_iteration": 2.561325788497925 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126606, + "balance_loss_clip": 0.06273606, + "balance_loss_mlp": 0.01256654, + "epoch": 0.7375920637306478, + "flos": 18885977061120.0, + "grad_norm": 1.9471728084971924, + "language_loss": 0.83236742, + "learning_rate": 6.797413183219923e-07, + "loss": 0.90911472, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09411621, + "step": 12268, + "time_per_iteration": 2.515185832977295 + }, + { + "auxiliary_loss_clip": 0.06403858, + "auxiliary_loss_mlp": 0.01268762, + "balance_loss_clip": 0.06272093, + "balance_loss_mlp": 0.01258641, + "epoch": 0.7376521869833158, + "flos": 15675291838080.0, + "grad_norm": 1.7639029349548874, + "language_loss": 0.73450869, + "learning_rate": 6.794487984541677e-07, + "loss": 0.81123489, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10113525, + "step": 12269, + "time_per_iteration": 3.9070801734924316 + }, + { + "auxiliary_loss_clip": 0.06414587, + "auxiliary_loss_mlp": 0.01264636, + "balance_loss_clip": 0.06275409, + "balance_loss_mlp": 0.01253186, + "epoch": 0.7377123102359837, + "flos": 36980146166400.0, + "grad_norm": 1.919355815322485, + "language_loss": 0.70780635, + "learning_rate": 6.791563286617776e-07, + "loss": 0.78459859, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11450195, + "step": 12270, + "time_per_iteration": 2.6150050163269043 + }, + { + "auxiliary_loss_clip": 0.06405621, + "auxiliary_loss_mlp": 0.01267086, + "balance_loss_clip": 0.06273162, + "balance_loss_mlp": 0.01257514, + "epoch": 0.7377724334886517, + "flos": 24502779002880.0, + "grad_norm": 1.650003260672948, + "language_loss": 0.69519281, + "learning_rate": 6.788639089559119e-07, + "loss": 0.77191985, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09564209, + "step": 12271, + "time_per_iteration": 2.545802593231201 + }, + { + "auxiliary_loss_clip": 0.06407182, + "auxiliary_loss_mlp": 0.01265449, + "balance_loss_clip": 0.06271105, + "balance_loss_mlp": 0.01254565, + "epoch": 0.7378325567413198, + "flos": 24397036750080.0, + "grad_norm": 2.0373077116973577, + "language_loss": 0.67736673, + "learning_rate": 6.785715393476586e-07, + "loss": 0.75409299, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10882568, + "step": 12272, + "time_per_iteration": 2.5161080360412598 + }, + { + "auxiliary_loss_clip": 0.064047, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06272876, + "balance_loss_mlp": 0.01255812, + "epoch": 0.7378926799939877, + "flos": 17421421472640.0, + "grad_norm": 1.6693820905355277, + "language_loss": 0.78472829, + "learning_rate": 6.782792198481049e-07, + "loss": 0.86143827, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10479736, + "step": 12273, + "time_per_iteration": 2.527449369430542 + }, + { + "auxiliary_loss_clip": 0.0640404, + "auxiliary_loss_mlp": 0.01265172, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01255111, + "epoch": 0.7379528032466557, + "flos": 18479374323840.0, + "grad_norm": 1.7204820046502844, + "language_loss": 0.83983135, + "learning_rate": 6.779869504683355e-07, + "loss": 0.91652346, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1005249, + "step": 12274, + "time_per_iteration": 3.8728952407836914 + }, + { + "auxiliary_loss_clip": 0.06420162, + "auxiliary_loss_mlp": 0.0126937, + "balance_loss_clip": 0.06277606, + "balance_loss_mlp": 0.01258414, + "epoch": 0.7380129264993236, + "flos": 17827814574720.0, + "grad_norm": 1.7616073867402775, + "language_loss": 0.7422626, + "learning_rate": 6.776947312194341e-07, + "loss": 0.81915796, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.10961914, + "step": 12275, + "time_per_iteration": 2.528137445449829 + }, + { + "auxiliary_loss_clip": 0.06413853, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06274875, + "balance_loss_mlp": 0.01259352, + "epoch": 0.7380730497519916, + "flos": 23003115753600.0, + "grad_norm": 1.6499843647208283, + "language_loss": 0.73819113, + "learning_rate": 6.774025621124813e-07, + "loss": 0.81503022, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10699463, + "step": 12276, + "time_per_iteration": 2.49808931350708 + }, + { + "auxiliary_loss_clip": 0.06408275, + "auxiliary_loss_mlp": 0.01262969, + "balance_loss_clip": 0.062733, + "balance_loss_mlp": 0.0125329, + "epoch": 0.7381331730046595, + "flos": 20272435044480.0, + "grad_norm": 1.938538877021236, + "language_loss": 0.77922094, + "learning_rate": 6.771104431585551e-07, + "loss": 0.85593343, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09680176, + "step": 12277, + "time_per_iteration": 2.5433340072631836 + }, + { + "auxiliary_loss_clip": 0.06408259, + "auxiliary_loss_mlp": 0.01270849, + "balance_loss_clip": 0.06276105, + "balance_loss_mlp": 0.01260495, + "epoch": 0.7381932962573275, + "flos": 19760467397760.0, + "grad_norm": 1.5941630218798921, + "language_loss": 0.79001057, + "learning_rate": 6.768183743687338e-07, + "loss": 0.86680162, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10351562, + "step": 12278, + "time_per_iteration": 2.5074949264526367 + }, + { + "auxiliary_loss_clip": 0.06409795, + "auxiliary_loss_mlp": 0.01264815, + "balance_loss_clip": 0.06271898, + "balance_loss_mlp": 0.01254248, + "epoch": 0.7382534195099955, + "flos": 17310060996480.0, + "grad_norm": 3.5373334504988474, + "language_loss": 0.71857256, + "learning_rate": 6.765263557540921e-07, + "loss": 0.79531866, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10577393, + "step": 12279, + "time_per_iteration": 2.516350269317627 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01266626, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01256243, + "epoch": 0.7383135427626635, + "flos": 18703269233280.0, + "grad_norm": 2.101190205716009, + "language_loss": 0.85982198, + "learning_rate": 6.762343873257034e-07, + "loss": 0.93659103, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10388184, + "step": 12280, + "time_per_iteration": 2.4823272228240967 + }, + { + "auxiliary_loss_clip": 0.06411093, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06273913, + "balance_loss_mlp": 0.01253493, + "epoch": 0.7383736660153314, + "flos": 20886706926720.0, + "grad_norm": 1.8639643742325518, + "language_loss": 0.72394395, + "learning_rate": 6.759424690946408e-07, + "loss": 0.80069995, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11016846, + "step": 12281, + "time_per_iteration": 2.5224528312683105 + }, + { + "auxiliary_loss_clip": 0.06412193, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06275124, + "balance_loss_mlp": 0.01255821, + "epoch": 0.7384337892679994, + "flos": 20668723729920.0, + "grad_norm": 1.7354362664323408, + "language_loss": 0.61005342, + "learning_rate": 6.756506010719711e-07, + "loss": 0.68683791, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10449219, + "step": 12282, + "time_per_iteration": 2.5047874450683594 + }, + { + "auxiliary_loss_clip": 0.06414121, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06274915, + "balance_loss_mlp": 0.01256945, + "epoch": 0.7384939125206673, + "flos": 29177432835840.0, + "grad_norm": 1.7016014462601576, + "language_loss": 0.6800909, + "learning_rate": 6.753587832687632e-07, + "loss": 0.75690794, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10632324, + "step": 12283, + "time_per_iteration": 2.5679969787597656 + }, + { + "auxiliary_loss_clip": 0.06408164, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06274041, + "balance_loss_mlp": 0.01256636, + "epoch": 0.7385540357733353, + "flos": 36320494498560.0, + "grad_norm": 1.58111004650423, + "language_loss": 0.76160252, + "learning_rate": 6.750670156960832e-07, + "loss": 0.83835149, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10095215, + "step": 12284, + "time_per_iteration": 2.6471667289733887 + }, + { + "auxiliary_loss_clip": 0.06415117, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.06277623, + "balance_loss_mlp": 0.01255028, + "epoch": 0.7386141590260034, + "flos": 20308758589440.0, + "grad_norm": 2.367235737464537, + "language_loss": 0.69446218, + "learning_rate": 6.747752983649954e-07, + "loss": 0.77127063, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10705566, + "step": 12285, + "time_per_iteration": 2.473684549331665 + }, + { + "auxiliary_loss_clip": 0.06417808, + "auxiliary_loss_mlp": 0.01266655, + "balance_loss_clip": 0.06276424, + "balance_loss_mlp": 0.0125499, + "epoch": 0.7386742822786713, + "flos": 25490851948800.0, + "grad_norm": 1.8974918118522153, + "language_loss": 0.80231923, + "learning_rate": 6.744836312865602e-07, + "loss": 0.87916386, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11669922, + "step": 12286, + "time_per_iteration": 2.552478313446045 + }, + { + "auxiliary_loss_clip": 0.06409865, + "auxiliary_loss_mlp": 0.01264773, + "balance_loss_clip": 0.06276139, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7387344055313393, + "flos": 13777075843200.0, + "grad_norm": 2.0836319453796452, + "language_loss": 0.65815514, + "learning_rate": 6.741920144718396e-07, + "loss": 0.73490155, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09881592, + "step": 12287, + "time_per_iteration": 2.47298264503479 + }, + { + "auxiliary_loss_clip": 0.0640405, + "auxiliary_loss_mlp": 0.01265177, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255557, + "epoch": 0.7387945287840072, + "flos": 27862615693440.0, + "grad_norm": 1.674403553414071, + "language_loss": 0.76529717, + "learning_rate": 6.739004479318903e-07, + "loss": 0.84198946, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09619141, + "step": 12288, + "time_per_iteration": 2.5699422359466553 + }, + { + "auxiliary_loss_clip": 0.06413888, + "auxiliary_loss_mlp": 0.0126915, + "balance_loss_clip": 0.06274378, + "balance_loss_mlp": 0.01257689, + "epoch": 0.7388546520366752, + "flos": 44242492515840.0, + "grad_norm": 1.8421640794180243, + "language_loss": 0.58466721, + "learning_rate": 6.736089316777684e-07, + "loss": 0.66149765, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.11462402, + "step": 12289, + "time_per_iteration": 2.691962242126465 + }, + { + "auxiliary_loss_clip": 0.06318665, + "auxiliary_loss_mlp": 0.01255253, + "balance_loss_clip": 0.06263465, + "balance_loss_mlp": 0.01254091, + "epoch": 0.7389147752893431, + "flos": 70700145672960.0, + "grad_norm": 0.6181631309216685, + "language_loss": 0.49242556, + "learning_rate": 6.733174657205287e-07, + "loss": 0.56816471, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.55419922, + "router_z_loss_mlp": 0.01159668, + "step": 12290, + "time_per_iteration": 3.2382025718688965 + }, + { + "auxiliary_loss_clip": 0.06410427, + "auxiliary_loss_mlp": 0.01269006, + "balance_loss_clip": 0.0627414, + "balance_loss_mlp": 0.01256811, + "epoch": 0.7389748985420111, + "flos": 26002190689920.0, + "grad_norm": 1.6462515447687802, + "language_loss": 0.67644894, + "learning_rate": 6.730260500712237e-07, + "loss": 0.75324321, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.12182617, + "step": 12291, + "time_per_iteration": 2.5330934524536133 + }, + { + "auxiliary_loss_clip": 0.06323051, + "auxiliary_loss_mlp": 0.01253715, + "balance_loss_clip": 0.06267922, + "balance_loss_mlp": 0.01252465, + "epoch": 0.7390350217946791, + "flos": 54419428558080.0, + "grad_norm": 0.9538265155410941, + "language_loss": 0.60977232, + "learning_rate": 6.727346847409052e-07, + "loss": 0.68553996, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01249695, + "step": 12292, + "time_per_iteration": 2.809068202972412 + }, + { + "auxiliary_loss_clip": 0.06409512, + "auxiliary_loss_mlp": 0.01265193, + "balance_loss_clip": 0.06275129, + "balance_loss_mlp": 0.01255388, + "epoch": 0.7390951450473471, + "flos": 32205116741760.0, + "grad_norm": 2.042192821638958, + "language_loss": 0.67519832, + "learning_rate": 6.724433697406191e-07, + "loss": 0.75194532, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09814453, + "step": 12293, + "time_per_iteration": 2.633490800857544 + }, + { + "auxiliary_loss_clip": 0.06407283, + "auxiliary_loss_mlp": 0.01264321, + "balance_loss_clip": 0.06273873, + "balance_loss_mlp": 0.01253682, + "epoch": 0.739155268300015, + "flos": 16688745371520.0, + "grad_norm": 1.7465858872032636, + "language_loss": 0.84024155, + "learning_rate": 6.721521050814134e-07, + "loss": 0.91695762, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10644531, + "step": 12294, + "time_per_iteration": 2.4902942180633545 + }, + { + "auxiliary_loss_clip": 0.064035, + "auxiliary_loss_mlp": 0.01264966, + "balance_loss_clip": 0.06273185, + "balance_loss_mlp": 0.01254976, + "epoch": 0.739215391552683, + "flos": 31657831799040.0, + "grad_norm": 1.4686013728036598, + "language_loss": 0.72988927, + "learning_rate": 6.718608907743337e-07, + "loss": 0.80657387, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09985352, + "step": 12295, + "time_per_iteration": 4.01623272895813 + }, + { + "auxiliary_loss_clip": 0.06404971, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.01254906, + "epoch": 0.7392755148053509, + "flos": 29726688349440.0, + "grad_norm": 1.6462168088608014, + "language_loss": 0.78829199, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8649857, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09490967, + "step": 12296, + "time_per_iteration": 2.6365103721618652 + }, + { + "auxiliary_loss_clip": 0.06404981, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.0627135, + "balance_loss_mlp": 0.01256008, + "epoch": 0.7393356380580189, + "flos": 37059585436800.0, + "grad_norm": 1.8865876945980686, + "language_loss": 0.67489415, + "learning_rate": 6.712786132607182e-07, + "loss": 0.75161421, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.11010742, + "step": 12297, + "time_per_iteration": 2.6924734115600586 + }, + { + "auxiliary_loss_clip": 0.06407569, + "auxiliary_loss_mlp": 0.01264759, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01254447, + "epoch": 0.739395761310687, + "flos": 19725820934400.0, + "grad_norm": 1.5263040230444953, + "language_loss": 0.68836749, + "learning_rate": 6.709875500762645e-07, + "loss": 0.7650907, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10308838, + "step": 12298, + "time_per_iteration": 2.501797914505005 + }, + { + "auxiliary_loss_clip": 0.06407927, + "auxiliary_loss_mlp": 0.01267097, + "balance_loss_clip": 0.06273854, + "balance_loss_mlp": 0.01256559, + "epoch": 0.7394558845633549, + "flos": 11806254685440.0, + "grad_norm": 2.783354408484115, + "language_loss": 0.74698675, + "learning_rate": 6.706965372880946e-07, + "loss": 0.82373697, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10534668, + "step": 12299, + "time_per_iteration": 2.479194164276123 + }, + { + "auxiliary_loss_clip": 0.06317861, + "auxiliary_loss_mlp": 0.01251014, + "balance_loss_clip": 0.06262733, + "balance_loss_mlp": 0.01249821, + "epoch": 0.7395160078160229, + "flos": 66214782213120.0, + "grad_norm": 0.7124865082748734, + "language_loss": 0.60634726, + "learning_rate": 6.704055749072455e-07, + "loss": 0.68203598, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01190948, + "step": 12300, + "time_per_iteration": 3.154963493347168 + }, + { + "auxiliary_loss_clip": 0.06409278, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06273282, + "balance_loss_mlp": 0.01254451, + "epoch": 0.7395761310686908, + "flos": 21255770234880.0, + "grad_norm": 1.6643476346606387, + "language_loss": 0.80243456, + "learning_rate": 6.7011466294475e-07, + "loss": 0.87917793, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1060791, + "step": 12301, + "time_per_iteration": 2.529728889465332 + }, + { + "auxiliary_loss_clip": 0.064082, + "auxiliary_loss_mlp": 0.01264915, + "balance_loss_clip": 0.06274755, + "balance_loss_mlp": 0.01254508, + "epoch": 0.7396362543213588, + "flos": 25961967930240.0, + "grad_norm": 1.3607409082618038, + "language_loss": 0.72955477, + "learning_rate": 6.698238014116406e-07, + "loss": 0.80628592, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10406494, + "step": 12302, + "time_per_iteration": 2.546940326690674 + }, + { + "auxiliary_loss_clip": 0.06409822, + "auxiliary_loss_mlp": 0.01265837, + "balance_loss_clip": 0.06272913, + "balance_loss_mlp": 0.01255567, + "epoch": 0.7396963775740267, + "flos": 27384791385600.0, + "grad_norm": 1.8966052271775322, + "language_loss": 0.74529129, + "learning_rate": 6.695329903189451e-07, + "loss": 0.82204789, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1027832, + "step": 12303, + "time_per_iteration": 2.5615267753601074 + }, + { + "auxiliary_loss_clip": 0.06403703, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06271822, + "balance_loss_mlp": 0.01255546, + "epoch": 0.7397565008266948, + "flos": 25527175493760.0, + "grad_norm": 1.6634023085525402, + "language_loss": 0.54497898, + "learning_rate": 6.692422296776927e-07, + "loss": 0.62166452, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09307861, + "step": 12304, + "time_per_iteration": 2.5219099521636963 + }, + { + "auxiliary_loss_clip": 0.06408396, + "auxiliary_loss_mlp": 0.01263792, + "balance_loss_clip": 0.06273419, + "balance_loss_mlp": 0.01253808, + "epoch": 0.7398166240793627, + "flos": 23733737429760.0, + "grad_norm": 6.743550792885306, + "language_loss": 0.84620976, + "learning_rate": 6.689515194989084e-07, + "loss": 0.92293161, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09979248, + "step": 12305, + "time_per_iteration": 3.947659969329834 + }, + { + "auxiliary_loss_clip": 0.06311572, + "auxiliary_loss_mlp": 0.01252487, + "balance_loss_clip": 0.06256508, + "balance_loss_mlp": 0.01251203, + "epoch": 0.7398767473320307, + "flos": 67289002755840.0, + "grad_norm": 0.8626934880407965, + "language_loss": 0.57769525, + "learning_rate": 6.68660859793615e-07, + "loss": 0.65333581, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.0128479, + "step": 12306, + "time_per_iteration": 3.1756792068481445 + }, + { + "auxiliary_loss_clip": 0.06411088, + "auxiliary_loss_mlp": 0.01263791, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.0125327, + "epoch": 0.7399368705846986, + "flos": 22025356859520.0, + "grad_norm": 1.7963583951725388, + "language_loss": 0.81658536, + "learning_rate": 6.683702505728355e-07, + "loss": 0.89333415, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10516357, + "step": 12307, + "time_per_iteration": 2.506915330886841 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06274117, + "balance_loss_mlp": 0.01255696, + "epoch": 0.7399969938373666, + "flos": 14179150460160.0, + "grad_norm": 1.6050625884123768, + "language_loss": 0.70237017, + "learning_rate": 6.680796918475893e-07, + "loss": 0.77905583, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09051514, + "step": 12308, + "time_per_iteration": 3.91337513923645 + }, + { + "auxiliary_loss_clip": 0.06401709, + "auxiliary_loss_mlp": 0.01262204, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01252459, + "epoch": 0.7400571170900345, + "flos": 25308521464320.0, + "grad_norm": 1.6982405979686375, + "language_loss": 0.81117153, + "learning_rate": 6.67789183628896e-07, + "loss": 0.88781071, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09747314, + "step": 12309, + "time_per_iteration": 2.5796985626220703 + }, + { + "auxiliary_loss_clip": 0.06409381, + "auxiliary_loss_mlp": 0.01269417, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01258534, + "epoch": 0.7401172403427025, + "flos": 22718019836160.0, + "grad_norm": 5.238582270491251, + "language_loss": 0.73371196, + "learning_rate": 6.674987259277692e-07, + "loss": 0.81049991, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10882568, + "step": 12310, + "time_per_iteration": 2.5165646076202393 + }, + { + "auxiliary_loss_clip": 0.06409644, + "auxiliary_loss_mlp": 0.01269084, + "balance_loss_clip": 0.06274551, + "balance_loss_mlp": 0.01257669, + "epoch": 0.7401773635953706, + "flos": 18071639556480.0, + "grad_norm": 2.7222235322625417, + "language_loss": 0.89223385, + "learning_rate": 6.672083187552239e-07, + "loss": 0.96902108, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11425781, + "step": 12311, + "time_per_iteration": 2.467475652694702 + }, + { + "auxiliary_loss_clip": 0.0640601, + "auxiliary_loss_mlp": 0.01266757, + "balance_loss_clip": 0.06272036, + "balance_loss_mlp": 0.01256934, + "epoch": 0.7402374868480385, + "flos": 22718942231040.0, + "grad_norm": 1.4999851664761075, + "language_loss": 0.8031621, + "learning_rate": 6.669179621222738e-07, + "loss": 0.87988985, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09832764, + "step": 12312, + "time_per_iteration": 2.5331287384033203 + }, + { + "auxiliary_loss_clip": 0.06405149, + "auxiliary_loss_mlp": 0.01264931, + "balance_loss_clip": 0.06272588, + "balance_loss_mlp": 0.01255072, + "epoch": 0.7402976101007065, + "flos": 22863272088960.0, + "grad_norm": 1.7972684240515402, + "language_loss": 0.78719336, + "learning_rate": 6.666276560399273e-07, + "loss": 0.86389416, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09857178, + "step": 12313, + "time_per_iteration": 2.5370211601257324 + }, + { + "auxiliary_loss_clip": 0.06407566, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.0626882, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7403577333533744, + "flos": 12350143537920.0, + "grad_norm": 1.8417739265455044, + "language_loss": 0.79031622, + "learning_rate": 6.663374005191937e-07, + "loss": 0.86704326, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10827637, + "step": 12314, + "time_per_iteration": 3.856675148010254 + }, + { + "auxiliary_loss_clip": 0.06317294, + "auxiliary_loss_mlp": 0.01250351, + "balance_loss_clip": 0.06261952, + "balance_loss_mlp": 0.01249078, + "epoch": 0.7404178566060424, + "flos": 60346189152000.0, + "grad_norm": 0.8038008604712399, + "language_loss": 0.55230701, + "learning_rate": 6.660471955710809e-07, + "loss": 0.62798345, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01273346, + "step": 12315, + "time_per_iteration": 3.094839334487915 + }, + { + "auxiliary_loss_clip": 0.06400545, + "auxiliary_loss_mlp": 0.01269055, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01259298, + "epoch": 0.7404779798587103, + "flos": 32022786257280.0, + "grad_norm": 1.42588959053577, + "language_loss": 0.79849303, + "learning_rate": 6.65757041206591e-07, + "loss": 0.87518907, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09759521, + "step": 12316, + "time_per_iteration": 2.6217541694641113 + }, + { + "auxiliary_loss_clip": 0.06405086, + "auxiliary_loss_mlp": 0.01263693, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01253703, + "epoch": 0.7405381031113784, + "flos": 12893571192960.0, + "grad_norm": 1.9031027598783419, + "language_loss": 0.74949759, + "learning_rate": 6.654669374367275e-07, + "loss": 0.82618535, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09997559, + "step": 12317, + "time_per_iteration": 2.4909305572509766 + }, + { + "auxiliary_loss_clip": 0.06398293, + "auxiliary_loss_mlp": 0.01265661, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01256625, + "epoch": 0.7405982263640463, + "flos": 20235189104640.0, + "grad_norm": 1.7604511064610666, + "language_loss": 0.81780982, + "learning_rate": 6.651768842724917e-07, + "loss": 0.89444935, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09039307, + "step": 12318, + "time_per_iteration": 2.5435891151428223 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01256317, + "epoch": 0.7406583496167143, + "flos": 17573088562560.0, + "grad_norm": 1.866306408499981, + "language_loss": 0.76751161, + "learning_rate": 6.648868817248827e-07, + "loss": 0.84425652, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09887695, + "step": 12319, + "time_per_iteration": 2.4622530937194824 + }, + { + "auxiliary_loss_clip": 0.0640564, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06272121, + "balance_loss_mlp": 0.01253645, + "epoch": 0.7407184728693822, + "flos": 18301530032640.0, + "grad_norm": 2.0432497673800563, + "language_loss": 0.63919193, + "learning_rate": 6.64596929804897e-07, + "loss": 0.71588171, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09698486, + "step": 12320, + "time_per_iteration": 2.491823196411133 + }, + { + "auxiliary_loss_clip": 0.06412543, + "auxiliary_loss_mlp": 0.01263353, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01252761, + "epoch": 0.7407785961220502, + "flos": 16696124530560.0, + "grad_norm": 2.5007986584617767, + "language_loss": 0.82488716, + "learning_rate": 6.643070285235288e-07, + "loss": 0.90164608, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.10583496, + "step": 12321, + "time_per_iteration": 2.472942352294922 + }, + { + "auxiliary_loss_clip": 0.06413056, + "auxiliary_loss_mlp": 0.01275475, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01263488, + "epoch": 0.7408387193747181, + "flos": 22094440151040.0, + "grad_norm": 1.687827757394498, + "language_loss": 0.72481614, + "learning_rate": 6.640171778917727e-07, + "loss": 0.80170149, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11993408, + "step": 12322, + "time_per_iteration": 2.5148372650146484 + }, + { + "auxiliary_loss_clip": 0.06410389, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06275401, + "balance_loss_mlp": 0.01254969, + "epoch": 0.7408988426273861, + "flos": 24242476694400.0, + "grad_norm": 1.7223397407589476, + "language_loss": 0.64227688, + "learning_rate": 6.637273779206183e-07, + "loss": 0.71903044, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09991455, + "step": 12323, + "time_per_iteration": 2.545907735824585 + }, + { + "auxiliary_loss_clip": 0.06410556, + "auxiliary_loss_mlp": 0.01267934, + "balance_loss_clip": 0.06273916, + "balance_loss_mlp": 0.01257348, + "epoch": 0.7409589658800542, + "flos": 29030671209600.0, + "grad_norm": 1.3447635409056256, + "language_loss": 0.76155257, + "learning_rate": 6.634376286210559e-07, + "loss": 0.83833748, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.105896, + "step": 12324, + "time_per_iteration": 2.6743714809417725 + }, + { + "auxiliary_loss_clip": 0.06405617, + "auxiliary_loss_mlp": 0.01264226, + "balance_loss_clip": 0.06272118, + "balance_loss_mlp": 0.01254272, + "epoch": 0.7410190891327221, + "flos": 19356925334400.0, + "grad_norm": 13.963490844682125, + "language_loss": 0.74922419, + "learning_rate": 6.63147930004073e-07, + "loss": 0.82592261, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09960938, + "step": 12325, + "time_per_iteration": 2.471677780151367 + }, + { + "auxiliary_loss_clip": 0.064167, + "auxiliary_loss_mlp": 0.01267104, + "balance_loss_clip": 0.06275749, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7410792123853901, + "flos": 22754301454080.0, + "grad_norm": 1.6510689232341687, + "language_loss": 0.68920004, + "learning_rate": 6.628582820806545e-07, + "loss": 0.76603806, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.10906982, + "step": 12326, + "time_per_iteration": 2.544271469116211 + }, + { + "auxiliary_loss_clip": 0.06406512, + "auxiliary_loss_mlp": 0.01270057, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01259943, + "epoch": 0.741139335638058, + "flos": 25379156056320.0, + "grad_norm": 2.684979070680883, + "language_loss": 0.89408934, + "learning_rate": 6.625686848617835e-07, + "loss": 0.97085506, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10113525, + "step": 12327, + "time_per_iteration": 2.514342784881592 + }, + { + "auxiliary_loss_clip": 0.06405853, + "auxiliary_loss_mlp": 0.01270995, + "balance_loss_clip": 0.0627297, + "balance_loss_mlp": 0.01260326, + "epoch": 0.741199458890726, + "flos": 18591154070400.0, + "grad_norm": 1.616289045038266, + "language_loss": 0.86022431, + "learning_rate": 6.62279138358442e-07, + "loss": 0.93699282, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10668945, + "step": 12328, + "time_per_iteration": 2.546849012374878 + }, + { + "auxiliary_loss_clip": 0.06404015, + "auxiliary_loss_mlp": 0.01266041, + "balance_loss_clip": 0.06273206, + "balance_loss_mlp": 0.01256373, + "epoch": 0.7412595821433939, + "flos": 22133572807680.0, + "grad_norm": 3.0862478099951476, + "language_loss": 0.66898477, + "learning_rate": 6.619896425816103e-07, + "loss": 0.74568534, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09655762, + "step": 12329, + "time_per_iteration": 2.4837799072265625 + }, + { + "auxiliary_loss_clip": 0.06415252, + "auxiliary_loss_mlp": 0.01271747, + "balance_loss_clip": 0.06274865, + "balance_loss_mlp": 0.01261262, + "epoch": 0.741319705396062, + "flos": 29177516689920.0, + "grad_norm": 1.6153996639831127, + "language_loss": 0.67172372, + "learning_rate": 6.617001975422647e-07, + "loss": 0.74859369, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10479736, + "step": 12330, + "time_per_iteration": 2.59244441986084 + }, + { + "auxiliary_loss_clip": 0.06414045, + "auxiliary_loss_mlp": 0.01265631, + "balance_loss_clip": 0.06274007, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7413798286487299, + "flos": 20673713047680.0, + "grad_norm": 1.8418070280678467, + "language_loss": 0.85594726, + "learning_rate": 6.614108032513823e-07, + "loss": 0.93274403, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.11169434, + "step": 12331, + "time_per_iteration": 2.6050429344177246 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01264119, + "balance_loss_clip": 0.06275013, + "balance_loss_mlp": 0.01253837, + "epoch": 0.7414399519013979, + "flos": 16404446067840.0, + "grad_norm": 1.9259075760322277, + "language_loss": 0.69746608, + "learning_rate": 6.611214597199364e-07, + "loss": 0.77421594, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10284424, + "step": 12332, + "time_per_iteration": 2.519845485687256 + }, + { + "auxiliary_loss_clip": 0.06408165, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06273398, + "balance_loss_mlp": 0.01255761, + "epoch": 0.7415000751540658, + "flos": 25637403939840.0, + "grad_norm": 1.899841467346803, + "language_loss": 0.63552696, + "learning_rate": 6.608321669588984e-07, + "loss": 0.71227038, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10418701, + "step": 12333, + "time_per_iteration": 2.5220582485198975 + }, + { + "auxiliary_loss_clip": 0.06403545, + "auxiliary_loss_mlp": 0.0126491, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255391, + "epoch": 0.7415601984067338, + "flos": 24506803998720.0, + "grad_norm": 1.7352435942597948, + "language_loss": 0.7115826, + "learning_rate": 6.605429249792387e-07, + "loss": 0.78826714, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09521484, + "step": 12334, + "time_per_iteration": 3.9428293704986572 + }, + { + "auxiliary_loss_clip": 0.0640265, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06269788, + "balance_loss_mlp": 0.01253628, + "epoch": 0.7416203216594017, + "flos": 20893541034240.0, + "grad_norm": 1.579239832257194, + "language_loss": 0.82769573, + "learning_rate": 6.602537337919257e-07, + "loss": 0.90436113, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10253906, + "step": 12335, + "time_per_iteration": 2.5163700580596924 + }, + { + "auxiliary_loss_clip": 0.06406333, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01257556, + "epoch": 0.7416804449120697, + "flos": 15628276897920.0, + "grad_norm": 2.378220107859676, + "language_loss": 0.75595701, + "learning_rate": 6.599645934079259e-07, + "loss": 0.832699, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10308838, + "step": 12336, + "time_per_iteration": 2.471386432647705 + }, + { + "auxiliary_loss_clip": 0.06412801, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06276821, + "balance_loss_mlp": 0.01255582, + "epoch": 0.7417405681647377, + "flos": 17124795619200.0, + "grad_norm": 1.7670482081057908, + "language_loss": 0.73856127, + "learning_rate": 6.596755038382029e-07, + "loss": 0.8153441, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09899902, + "step": 12337, + "time_per_iteration": 2.466338872909546 + }, + { + "auxiliary_loss_clip": 0.06405115, + "auxiliary_loss_mlp": 0.01266953, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01257428, + "epoch": 0.7418006914174057, + "flos": 18886354404480.0, + "grad_norm": 1.7252215797420232, + "language_loss": 0.76747906, + "learning_rate": 6.593864650937186e-07, + "loss": 0.84419966, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09527588, + "step": 12338, + "time_per_iteration": 2.4993648529052734 + }, + { + "auxiliary_loss_clip": 0.06403196, + "auxiliary_loss_mlp": 0.01266291, + "balance_loss_clip": 0.06271601, + "balance_loss_mlp": 0.01256993, + "epoch": 0.7418608146700737, + "flos": 21587294113920.0, + "grad_norm": 1.629364816328998, + "language_loss": 0.72958922, + "learning_rate": 6.590974771854345e-07, + "loss": 0.80628407, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12339, + "time_per_iteration": 2.4901506900787354 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01263048, + "balance_loss_clip": 0.06271182, + "balance_loss_mlp": 0.01253011, + "epoch": 0.7419209379227416, + "flos": 22346063562240.0, + "grad_norm": 3.4897351250421322, + "language_loss": 0.79916894, + "learning_rate": 6.588085401243077e-07, + "loss": 0.87583876, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10046387, + "step": 12340, + "time_per_iteration": 2.5338644981384277 + }, + { + "auxiliary_loss_clip": 0.0640725, + "auxiliary_loss_mlp": 0.0126408, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01254168, + "epoch": 0.7419810611754096, + "flos": 16767639590400.0, + "grad_norm": 1.374564761122075, + "language_loss": 0.76099288, + "learning_rate": 6.585196539212958e-07, + "loss": 0.83770621, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09912109, + "step": 12341, + "time_per_iteration": 2.495758056640625 + }, + { + "auxiliary_loss_clip": 0.06401518, + "auxiliary_loss_mlp": 0.01269793, + "balance_loss_clip": 0.06276906, + "balance_loss_mlp": 0.01260292, + "epoch": 0.7420411844280775, + "flos": 26220048105600.0, + "grad_norm": 1.417674408189636, + "language_loss": 0.80324268, + "learning_rate": 6.582308185873535e-07, + "loss": 0.87995577, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.09503174, + "step": 12342, + "time_per_iteration": 2.5588223934173584 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01266068, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01256328, + "epoch": 0.7421013076807456, + "flos": 68542354857600.0, + "grad_norm": 1.7864358028362888, + "language_loss": 0.7745598, + "learning_rate": 6.57942034133433e-07, + "loss": 0.85127044, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09741211, + "step": 12343, + "time_per_iteration": 2.893523693084717 + }, + { + "auxiliary_loss_clip": 0.0640204, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 0.06267961, + "balance_loss_mlp": 0.01257482, + "epoch": 0.7421614309334135, + "flos": 24432144410880.0, + "grad_norm": 1.492444453579108, + "language_loss": 0.68024582, + "learning_rate": 6.576533005704843e-07, + "loss": 0.75694287, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10186768, + "step": 12344, + "time_per_iteration": 4.0460686683654785 + }, + { + "auxiliary_loss_clip": 0.0640749, + "auxiliary_loss_mlp": 0.0126471, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.01254178, + "epoch": 0.7422215541860815, + "flos": 12315706709760.0, + "grad_norm": 2.0673948051612983, + "language_loss": 0.81438386, + "learning_rate": 6.573646179094572e-07, + "loss": 0.89110589, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10522461, + "step": 12345, + "time_per_iteration": 2.5168869495391846 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01263643, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01253975, + "epoch": 0.7422816774387494, + "flos": 19651580616960.0, + "grad_norm": 1.781451237104089, + "language_loss": 0.70713991, + "learning_rate": 6.570759861612988e-07, + "loss": 0.7838285, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09661865, + "step": 12346, + "time_per_iteration": 2.481515407562256 + }, + { + "auxiliary_loss_clip": 0.06407449, + "auxiliary_loss_mlp": 0.01266551, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.0125683, + "epoch": 0.7423418006914174, + "flos": 32024337557760.0, + "grad_norm": 1.4530238546108785, + "language_loss": 0.73483253, + "learning_rate": 6.56787405336953e-07, + "loss": 0.81157255, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.097229, + "step": 12347, + "time_per_iteration": 2.6118276119232178 + }, + { + "auxiliary_loss_clip": 0.06410117, + "auxiliary_loss_mlp": 0.01263875, + "balance_loss_clip": 0.06271449, + "balance_loss_mlp": 0.01253355, + "epoch": 0.7424019239440853, + "flos": 18923013365760.0, + "grad_norm": 2.221279445831195, + "language_loss": 0.81336832, + "learning_rate": 6.564988754473642e-07, + "loss": 0.89010823, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10522461, + "step": 12348, + "time_per_iteration": 3.9795804023742676 + }, + { + "auxiliary_loss_clip": 0.06404714, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_clip": 0.06274206, + "balance_loss_mlp": 0.01254827, + "epoch": 0.7424620471967533, + "flos": 35884360396800.0, + "grad_norm": 1.7176907745599117, + "language_loss": 0.72897398, + "learning_rate": 6.562103965034724e-07, + "loss": 0.8056671, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09765625, + "step": 12349, + "time_per_iteration": 2.5986247062683105 + }, + { + "auxiliary_loss_clip": 0.0641204, + "auxiliary_loss_mlp": 0.01266614, + "balance_loss_clip": 0.06272119, + "balance_loss_mlp": 0.01255629, + "epoch": 0.7425221704494213, + "flos": 27023987704320.0, + "grad_norm": 1.8752409058268018, + "language_loss": 0.79401171, + "learning_rate": 6.559219685162165e-07, + "loss": 0.87079823, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10986328, + "step": 12350, + "time_per_iteration": 2.5616562366485596 + }, + { + "auxiliary_loss_clip": 0.06404371, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01253147, + "epoch": 0.7425822937020893, + "flos": 34175602483200.0, + "grad_norm": 3.363091942962461, + "language_loss": 0.75271994, + "learning_rate": 6.556335914965343e-07, + "loss": 0.82939601, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10101318, + "step": 12351, + "time_per_iteration": 2.5991873741149902 + }, + { + "auxiliary_loss_clip": 0.06407189, + "auxiliary_loss_mlp": 0.01264826, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01255033, + "epoch": 0.7426424169547573, + "flos": 21289200814080.0, + "grad_norm": 1.9305253620740155, + "language_loss": 0.81533462, + "learning_rate": 6.553452654553611e-07, + "loss": 0.89205474, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09790039, + "step": 12352, + "time_per_iteration": 2.531691551208496 + }, + { + "auxiliary_loss_clip": 0.06410765, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.06275038, + "balance_loss_mlp": 0.01253386, + "epoch": 0.7427025402074252, + "flos": 22453818312960.0, + "grad_norm": 1.6215241658944841, + "language_loss": 0.71717203, + "learning_rate": 6.550569904036307e-07, + "loss": 0.79391491, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10144043, + "step": 12353, + "time_per_iteration": 4.0272791385650635 + }, + { + "auxiliary_loss_clip": 0.06404988, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.0627149, + "balance_loss_mlp": 0.01255731, + "epoch": 0.7427626634600932, + "flos": 22530532325760.0, + "grad_norm": 2.41683810368099, + "language_loss": 0.72524661, + "learning_rate": 6.547687663522739e-07, + "loss": 0.80194831, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09454346, + "step": 12354, + "time_per_iteration": 2.5672101974487305 + }, + { + "auxiliary_loss_clip": 0.06316006, + "auxiliary_loss_mlp": 0.01252952, + "balance_loss_clip": 0.0626021, + "balance_loss_mlp": 0.01251813, + "epoch": 0.7428227867127611, + "flos": 67227271424640.0, + "grad_norm": 0.6879551946330541, + "language_loss": 0.59384382, + "learning_rate": 6.544805933122199e-07, + "loss": 0.66953337, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01139832, + "step": 12355, + "time_per_iteration": 3.244594097137451 + }, + { + "auxiliary_loss_clip": 0.06405793, + "auxiliary_loss_mlp": 0.01264507, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7428829099654292, + "flos": 14726603111040.0, + "grad_norm": 1.6011597337483758, + "language_loss": 0.67696226, + "learning_rate": 6.541924712943971e-07, + "loss": 0.75366527, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10150146, + "step": 12356, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06269816, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7429430332180971, + "flos": 48656466696960.0, + "grad_norm": 1.5868291550448252, + "language_loss": 0.72533596, + "learning_rate": 6.539044003097301e-07, + "loss": 0.80203569, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10424805, + "step": 12357, + "time_per_iteration": 2.8397207260131836 + }, + { + "auxiliary_loss_clip": 0.06402919, + "auxiliary_loss_mlp": 0.01263418, + "balance_loss_clip": 0.06274128, + "balance_loss_mlp": 0.01254495, + "epoch": 0.7430031564707651, + "flos": 16769735942400.0, + "grad_norm": 1.978658121021226, + "language_loss": 0.65120018, + "learning_rate": 6.53616380369143e-07, + "loss": 0.72786361, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08917236, + "step": 12358, + "time_per_iteration": 2.4834437370300293 + }, + { + "auxiliary_loss_clip": 0.06409361, + "auxiliary_loss_mlp": 0.01267679, + "balance_loss_clip": 0.06271667, + "balance_loss_mlp": 0.01256807, + "epoch": 0.743063279723433, + "flos": 23876054789760.0, + "grad_norm": 1.7508744864963774, + "language_loss": 0.81005955, + "learning_rate": 6.533284114835591e-07, + "loss": 0.88682991, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10864258, + "step": 12359, + "time_per_iteration": 2.5511791706085205 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01269499, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01259491, + "epoch": 0.743123402976101, + "flos": 14396840167680.0, + "grad_norm": 2.4409850901837924, + "language_loss": 0.688115, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7648586, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10009766, + "step": 12360, + "time_per_iteration": 2.454799175262451 + }, + { + "auxiliary_loss_clip": 0.06402747, + "auxiliary_loss_mlp": 0.01266625, + "balance_loss_clip": 0.06271, + "balance_loss_mlp": 0.01256695, + "epoch": 0.7431835262287689, + "flos": 27461756960640.0, + "grad_norm": 1.612303136385371, + "language_loss": 0.73023605, + "learning_rate": 6.527526269210715e-07, + "loss": 0.80692977, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09936523, + "step": 12361, + "time_per_iteration": 2.563950538635254 + }, + { + "auxiliary_loss_clip": 0.06409371, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01253706, + "epoch": 0.743243649481437, + "flos": 20965810780800.0, + "grad_norm": 2.1605200841945345, + "language_loss": 0.56417334, + "learning_rate": 6.524648112660027e-07, + "loss": 0.64090431, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10028076, + "step": 12362, + "time_per_iteration": 2.5222644805908203 + }, + { + "auxiliary_loss_clip": 0.06406482, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.012541, + "epoch": 0.7433037727341049, + "flos": 22789660677120.0, + "grad_norm": 2.4729179704806796, + "language_loss": 0.77661127, + "learning_rate": 6.521770467096039e-07, + "loss": 0.85331571, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09869385, + "step": 12363, + "time_per_iteration": 2.5122897624969482 + }, + { + "auxiliary_loss_clip": 0.06408481, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06273359, + "balance_loss_mlp": 0.01255054, + "epoch": 0.7433638959867729, + "flos": 22202656099200.0, + "grad_norm": 1.616246538203827, + "language_loss": 0.78287363, + "learning_rate": 6.518893332627862e-07, + "loss": 0.85960114, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09222412, + "step": 12364, + "time_per_iteration": 2.492027521133423 + }, + { + "auxiliary_loss_clip": 0.06406204, + "auxiliary_loss_mlp": 0.01264726, + "balance_loss_clip": 0.06272129, + "balance_loss_mlp": 0.01254867, + "epoch": 0.7434240192394409, + "flos": 23303808529920.0, + "grad_norm": 1.801205271942991, + "language_loss": 0.78693449, + "learning_rate": 6.516016709364604e-07, + "loss": 0.86364377, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09851074, + "step": 12365, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.06409302, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06271569, + "balance_loss_mlp": 0.01254884, + "epoch": 0.7434841424921088, + "flos": 54020387416320.0, + "grad_norm": 1.5444951998265788, + "language_loss": 0.77106571, + "learning_rate": 6.513140597415346e-07, + "loss": 0.8478092, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10168457, + "step": 12366, + "time_per_iteration": 2.7708029747009277 + }, + { + "auxiliary_loss_clip": 0.06405418, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06275211, + "balance_loss_mlp": 0.01254588, + "epoch": 0.7435442657447768, + "flos": 21440364779520.0, + "grad_norm": 1.560298463472275, + "language_loss": 0.71305168, + "learning_rate": 6.510264996889141e-07, + "loss": 0.78973687, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08514404, + "step": 12367, + "time_per_iteration": 2.5184154510498047 + }, + { + "auxiliary_loss_clip": 0.06410043, + "auxiliary_loss_mlp": 0.01265202, + "balance_loss_clip": 0.06271939, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7436043889974447, + "flos": 24506426655360.0, + "grad_norm": 1.476887140959893, + "language_loss": 0.75017029, + "learning_rate": 6.507389907895038e-07, + "loss": 0.82692266, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09960938, + "step": 12368, + "time_per_iteration": 2.5212924480438232 + }, + { + "auxiliary_loss_clip": 0.0640331, + "auxiliary_loss_mlp": 0.01266737, + "balance_loss_clip": 0.06271964, + "balance_loss_mlp": 0.01257248, + "epoch": 0.7436645122501128, + "flos": 40707997989120.0, + "grad_norm": 1.6519128138397359, + "language_loss": 0.69042623, + "learning_rate": 6.50451533054207e-07, + "loss": 0.76712668, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09490967, + "step": 12369, + "time_per_iteration": 2.7047884464263916 + }, + { + "auxiliary_loss_clip": 0.06408005, + "auxiliary_loss_mlp": 0.01266433, + "balance_loss_clip": 0.06272747, + "balance_loss_mlp": 0.01256258, + "epoch": 0.7437246355027807, + "flos": 18913537854720.0, + "grad_norm": 1.595861424874944, + "language_loss": 0.75370234, + "learning_rate": 6.501641264939233e-07, + "loss": 0.83044672, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10168457, + "step": 12370, + "time_per_iteration": 2.473238468170166 + }, + { + "auxiliary_loss_clip": 0.06403841, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06273004, + "balance_loss_mlp": 0.01256487, + "epoch": 0.7437847587554487, + "flos": 21550299736320.0, + "grad_norm": 1.5233822709060378, + "language_loss": 0.78544998, + "learning_rate": 6.498767711195503e-07, + "loss": 0.86215037, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09710693, + "step": 12371, + "time_per_iteration": 2.5248806476593018 + }, + { + "auxiliary_loss_clip": 0.06407221, + "auxiliary_loss_mlp": 0.0126359, + "balance_loss_clip": 0.06274284, + "balance_loss_mlp": 0.01253415, + "epoch": 0.7438448820081166, + "flos": 27789926676480.0, + "grad_norm": 1.5517667722387558, + "language_loss": 0.69689578, + "learning_rate": 6.495894669419857e-07, + "loss": 0.77360392, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10168457, + "step": 12372, + "time_per_iteration": 2.552630662918091 + }, + { + "auxiliary_loss_clip": 0.06404461, + "auxiliary_loss_mlp": 0.01263234, + "balance_loss_clip": 0.06271353, + "balance_loss_mlp": 0.01253519, + "epoch": 0.7439050052607846, + "flos": 17973653806080.0, + "grad_norm": 1.7715467949119694, + "language_loss": 0.75746936, + "learning_rate": 6.493022139721245e-07, + "loss": 0.83414626, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09716797, + "step": 12373, + "time_per_iteration": 2.546383857727051 + }, + { + "auxiliary_loss_clip": 0.06406415, + "auxiliary_loss_mlp": 0.01264372, + "balance_loss_clip": 0.06269443, + "balance_loss_mlp": 0.01253643, + "epoch": 0.7439651285134525, + "flos": 22964066951040.0, + "grad_norm": 1.646659393981313, + "language_loss": 0.77668065, + "learning_rate": 6.49015012220858e-07, + "loss": 0.85338849, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10723877, + "step": 12374, + "time_per_iteration": 3.92050838470459 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06273149, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7440252517661206, + "flos": 18812701065600.0, + "grad_norm": 2.0942511176343936, + "language_loss": 0.76647848, + "learning_rate": 6.487278616990774e-07, + "loss": 0.8431896, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10009766, + "step": 12375, + "time_per_iteration": 2.4693682193756104 + }, + { + "auxiliary_loss_clip": 0.06401422, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.0125509, + "epoch": 0.7440853750187885, + "flos": 20272476971520.0, + "grad_norm": 1.9421008713204126, + "language_loss": 0.77613479, + "learning_rate": 6.484407624176733e-07, + "loss": 0.85279274, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09289551, + "step": 12376, + "time_per_iteration": 2.5313687324523926 + }, + { + "auxiliary_loss_clip": 0.06411325, + "auxiliary_loss_mlp": 0.0126521, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7441454982714565, + "flos": 25344216103680.0, + "grad_norm": 1.6879518297233593, + "language_loss": 0.79368329, + "learning_rate": 6.481537143875296e-07, + "loss": 0.87044865, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11010742, + "step": 12377, + "time_per_iteration": 2.5384654998779297 + }, + { + "auxiliary_loss_clip": 0.0640887, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01254025, + "epoch": 0.7442056215241245, + "flos": 64493460915840.0, + "grad_norm": 1.858045271266799, + "language_loss": 0.67843312, + "learning_rate": 6.478667176195322e-07, + "loss": 0.75516731, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10528564, + "step": 12378, + "time_per_iteration": 2.898494005203247 + }, + { + "auxiliary_loss_clip": 0.06408532, + "auxiliary_loss_mlp": 0.0126824, + "balance_loss_clip": 0.06271744, + "balance_loss_mlp": 0.01256784, + "epoch": 0.7442657447767924, + "flos": 31293464319360.0, + "grad_norm": 1.6105987456814335, + "language_loss": 0.71894264, + "learning_rate": 6.475797721245648e-07, + "loss": 0.79571033, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11462402, + "step": 12379, + "time_per_iteration": 2.5628533363342285 + }, + { + "auxiliary_loss_clip": 0.06407094, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06273466, + "balance_loss_mlp": 0.01255292, + "epoch": 0.7443258680294604, + "flos": 20813221296000.0, + "grad_norm": 1.9550409468219483, + "language_loss": 0.65543461, + "learning_rate": 6.472928779135085e-07, + "loss": 0.73216021, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10186768, + "step": 12380, + "time_per_iteration": 2.5494651794433594 + }, + { + "auxiliary_loss_clip": 0.06408666, + "auxiliary_loss_mlp": 0.01266245, + "balance_loss_clip": 0.0627347, + "balance_loss_mlp": 0.01256267, + "epoch": 0.7443859912821283, + "flos": 22206303751680.0, + "grad_norm": 1.8887848682533184, + "language_loss": 0.79213363, + "learning_rate": 6.470060349972411e-07, + "loss": 0.86888278, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09973145, + "step": 12381, + "time_per_iteration": 2.4954755306243896 + }, + { + "auxiliary_loss_clip": 0.06412176, + "auxiliary_loss_mlp": 0.01265606, + "balance_loss_clip": 0.06274785, + "balance_loss_mlp": 0.0125446, + "epoch": 0.7444461145347964, + "flos": 22024350610560.0, + "grad_norm": 1.8902076761628224, + "language_loss": 0.73109865, + "learning_rate": 6.467192433866411e-07, + "loss": 0.80787647, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1114502, + "step": 12382, + "time_per_iteration": 2.534949779510498 + }, + { + "auxiliary_loss_clip": 0.06317867, + "auxiliary_loss_mlp": 0.01256388, + "balance_loss_clip": 0.06262469, + "balance_loss_mlp": 0.01255137, + "epoch": 0.7445062377874643, + "flos": 70582313704320.0, + "grad_norm": 0.6399574084951353, + "language_loss": 0.54684198, + "learning_rate": 6.464325030925831e-07, + "loss": 0.62258446, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01251221, + "step": 12383, + "time_per_iteration": 3.2762465476989746 + }, + { + "auxiliary_loss_clip": 0.06408082, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06273709, + "balance_loss_mlp": 0.01253168, + "epoch": 0.7445663610401323, + "flos": 22171070309760.0, + "grad_norm": 1.8693949570564194, + "language_loss": 0.76230967, + "learning_rate": 6.461458141259395e-07, + "loss": 0.83902138, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09924316, + "step": 12384, + "time_per_iteration": 3.9471797943115234 + }, + { + "auxiliary_loss_clip": 0.0640517, + "auxiliary_loss_mlp": 0.01268527, + "balance_loss_clip": 0.06271986, + "balance_loss_mlp": 0.01258162, + "epoch": 0.7446264842928002, + "flos": 24177082982400.0, + "grad_norm": 2.0160606528555665, + "language_loss": 0.79418957, + "learning_rate": 6.458591764975823e-07, + "loss": 0.87092656, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1036377, + "step": 12385, + "time_per_iteration": 2.548703193664551 + }, + { + "auxiliary_loss_clip": 0.06411269, + "auxiliary_loss_mlp": 0.01267945, + "balance_loss_clip": 0.06273325, + "balance_loss_mlp": 0.0125609, + "epoch": 0.7446866075454682, + "flos": 24141514124160.0, + "grad_norm": 1.683035804247251, + "language_loss": 0.81670487, + "learning_rate": 6.455725902183813e-07, + "loss": 0.89349711, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11859131, + "step": 12386, + "time_per_iteration": 2.5256152153015137 + }, + { + "auxiliary_loss_clip": 0.06404106, + "auxiliary_loss_mlp": 0.01267713, + "balance_loss_clip": 0.06274322, + "balance_loss_mlp": 0.01257598, + "epoch": 0.7447467307981361, + "flos": 23554467619200.0, + "grad_norm": 1.6483993248680413, + "language_loss": 0.71268487, + "learning_rate": 6.452860552992037e-07, + "loss": 0.78940308, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10119629, + "step": 12387, + "time_per_iteration": 3.9517242908477783 + }, + { + "auxiliary_loss_clip": 0.0640709, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06274819, + "balance_loss_mlp": 0.01255464, + "epoch": 0.7448068540508042, + "flos": 19573021814400.0, + "grad_norm": 1.9204384374405874, + "language_loss": 0.70408261, + "learning_rate": 6.449995717509138e-07, + "loss": 0.78080571, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09771729, + "step": 12388, + "time_per_iteration": 2.5048129558563232 + }, + { + "auxiliary_loss_clip": 0.06406976, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06273593, + "balance_loss_mlp": 0.01254727, + "epoch": 0.7448669773034721, + "flos": 21846925589760.0, + "grad_norm": 1.5688285062230494, + "language_loss": 0.85222888, + "learning_rate": 6.447131395843761e-07, + "loss": 0.92894751, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12389, + "time_per_iteration": 2.5551319122314453 + }, + { + "auxiliary_loss_clip": 0.06411929, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06275173, + "balance_loss_mlp": 0.01254388, + "epoch": 0.7449271005561401, + "flos": 25162388743680.0, + "grad_norm": 1.6015967900986, + "language_loss": 0.79076087, + "learning_rate": 6.444267588104526e-07, + "loss": 0.86752725, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10333252, + "step": 12390, + "time_per_iteration": 2.5427069664001465 + }, + { + "auxiliary_loss_clip": 0.06406707, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.06271118, + "balance_loss_mlp": 0.01255414, + "epoch": 0.7449872238088081, + "flos": 22279915163520.0, + "grad_norm": 1.7310702404068883, + "language_loss": 0.84598923, + "learning_rate": 6.441404294400014e-07, + "loss": 0.92271626, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10577393, + "step": 12391, + "time_per_iteration": 2.563535451889038 + }, + { + "auxiliary_loss_clip": 0.0640666, + "auxiliary_loss_mlp": 0.01267143, + "balance_loss_clip": 0.06273681, + "balance_loss_mlp": 0.01257481, + "epoch": 0.745047347061476, + "flos": 20601065957760.0, + "grad_norm": 1.6668133059608343, + "language_loss": 0.74029422, + "learning_rate": 6.438541514838811e-07, + "loss": 0.81703228, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09655762, + "step": 12392, + "time_per_iteration": 2.54951548576355 + }, + { + "auxiliary_loss_clip": 0.06402859, + "auxiliary_loss_mlp": 0.01260815, + "balance_loss_clip": 0.06272476, + "balance_loss_mlp": 0.01251344, + "epoch": 0.745107470314144, + "flos": 22134117859200.0, + "grad_norm": 1.5576525473269558, + "language_loss": 0.76858068, + "learning_rate": 6.435679249529487e-07, + "loss": 0.84521741, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09466553, + "step": 12393, + "time_per_iteration": 3.9006175994873047 + }, + { + "auxiliary_loss_clip": 0.06406154, + "auxiliary_loss_mlp": 0.01264743, + "balance_loss_clip": 0.06273723, + "balance_loss_mlp": 0.01253681, + "epoch": 0.745167593566812, + "flos": 22243004640000.0, + "grad_norm": 1.8129190571327771, + "language_loss": 0.72895974, + "learning_rate": 6.432817498580552e-07, + "loss": 0.80566871, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.11065674, + "step": 12394, + "time_per_iteration": 2.5072154998779297 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.0126662, + "balance_loss_clip": 0.062764, + "balance_loss_mlp": 0.01256386, + "epoch": 0.74522771681948, + "flos": 20672245601280.0, + "grad_norm": 1.907024512464057, + "language_loss": 0.81604195, + "learning_rate": 6.429956262100535e-07, + "loss": 0.89280254, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10229492, + "step": 12395, + "time_per_iteration": 2.558364152908325 + }, + { + "auxiliary_loss_clip": 0.06410865, + "auxiliary_loss_mlp": 0.01263239, + "balance_loss_clip": 0.06272958, + "balance_loss_mlp": 0.0125276, + "epoch": 0.7452878400721479, + "flos": 21113578656000.0, + "grad_norm": 2.0296389774228696, + "language_loss": 0.71353412, + "learning_rate": 6.427095540197937e-07, + "loss": 0.7902751, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10479736, + "step": 12396, + "time_per_iteration": 2.5333800315856934 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01270272, + "balance_loss_clip": 0.0627405, + "balance_loss_mlp": 0.01259817, + "epoch": 0.7453479633248159, + "flos": 26695356791040.0, + "grad_norm": 1.7653498862939656, + "language_loss": 0.68180245, + "learning_rate": 6.424235332981245e-07, + "loss": 0.75860852, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10455322, + "step": 12397, + "time_per_iteration": 2.578571081161499 + }, + { + "auxiliary_loss_clip": 0.06405051, + "auxiliary_loss_mlp": 0.0126851, + "balance_loss_clip": 0.0627315, + "balance_loss_mlp": 0.01258926, + "epoch": 0.7454080865774838, + "flos": 17021191645440.0, + "grad_norm": 1.6817792283863804, + "language_loss": 0.77217615, + "learning_rate": 6.421375640558908e-07, + "loss": 0.84891176, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12398, + "time_per_iteration": 2.512648344039917 + }, + { + "auxiliary_loss_clip": 0.06403591, + "auxiliary_loss_mlp": 0.01261876, + "balance_loss_clip": 0.06272794, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7454682098301518, + "flos": 21330178260480.0, + "grad_norm": 1.5838932633911913, + "language_loss": 0.78415573, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8608104, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09552002, + "step": 12399, + "time_per_iteration": 2.505819320678711 + }, + { + "auxiliary_loss_clip": 0.06400932, + "auxiliary_loss_mlp": 0.01264955, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01255728, + "epoch": 0.7455283330828197, + "flos": 17864138119680.0, + "grad_norm": 1.9696837581168143, + "language_loss": 0.7409634, + "learning_rate": 6.415657800531038e-07, + "loss": 0.81762224, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09222412, + "step": 12400, + "time_per_iteration": 2.5325090885162354 + }, + { + "auxiliary_loss_clip": 0.06404567, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 0.06272677, + "balance_loss_mlp": 0.01254829, + "epoch": 0.7455884563354878, + "flos": 30782209432320.0, + "grad_norm": 1.9542118355306637, + "language_loss": 0.82345331, + "learning_rate": 6.412799653142327e-07, + "loss": 0.90014458, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09735107, + "step": 12401, + "time_per_iteration": 2.577702283859253 + }, + { + "auxiliary_loss_clip": 0.06408406, + "auxiliary_loss_mlp": 0.01262184, + "balance_loss_clip": 0.06275339, + "balance_loss_mlp": 0.01252689, + "epoch": 0.7456485795881557, + "flos": 23192280345600.0, + "grad_norm": 1.6740517505744856, + "language_loss": 0.65013397, + "learning_rate": 6.409942020981611e-07, + "loss": 0.72683978, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.0949707, + "step": 12402, + "time_per_iteration": 2.6253459453582764 + }, + { + "auxiliary_loss_clip": 0.06401449, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01255472, + "epoch": 0.7457087028408237, + "flos": 38736254436480.0, + "grad_norm": 1.537912259359591, + "language_loss": 0.73276114, + "learning_rate": 6.407084904157265e-07, + "loss": 0.8094238, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09350586, + "step": 12403, + "time_per_iteration": 2.700143575668335 + }, + { + "auxiliary_loss_clip": 0.06316997, + "auxiliary_loss_mlp": 0.01251636, + "balance_loss_clip": 0.06261828, + "balance_loss_mlp": 0.0125041, + "epoch": 0.7457688260934917, + "flos": 56059480523520.0, + "grad_norm": 1.1139053392521483, + "language_loss": 0.58594716, + "learning_rate": 6.404228302777621e-07, + "loss": 0.66163349, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.55371094, + "router_z_loss_mlp": 0.01225281, + "step": 12404, + "time_per_iteration": 2.995051145553589 + }, + { + "auxiliary_loss_clip": 0.06405495, + "auxiliary_loss_mlp": 0.01263977, + "balance_loss_clip": 0.06272737, + "balance_loss_mlp": 0.01254256, + "epoch": 0.7458289493461596, + "flos": 20121606495360.0, + "grad_norm": 1.4914507939432748, + "language_loss": 0.77947497, + "learning_rate": 6.401372216950995e-07, + "loss": 0.85616976, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09710693, + "step": 12405, + "time_per_iteration": 2.5471739768981934 + }, + { + "auxiliary_loss_clip": 0.0640135, + "auxiliary_loss_mlp": 0.01269033, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.01259067, + "epoch": 0.7458890725988276, + "flos": 20199200976000.0, + "grad_norm": 1.4963815731193124, + "language_loss": 0.69489747, + "learning_rate": 6.398516646785698e-07, + "loss": 0.77160132, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09960938, + "step": 12406, + "time_per_iteration": 2.5200746059417725 + }, + { + "auxiliary_loss_clip": 0.0641366, + "auxiliary_loss_mlp": 0.012669, + "balance_loss_clip": 0.06274001, + "balance_loss_mlp": 0.01256344, + "epoch": 0.7459491958514956, + "flos": 17024336173440.0, + "grad_norm": 1.8403958635643813, + "language_loss": 0.65422976, + "learning_rate": 6.39566159239002e-07, + "loss": 0.73103529, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10559082, + "step": 12407, + "time_per_iteration": 2.508833408355713 + }, + { + "auxiliary_loss_clip": 0.06406917, + "auxiliary_loss_mlp": 0.01262212, + "balance_loss_clip": 0.06270534, + "balance_loss_mlp": 0.01251775, + "epoch": 0.7460093191041636, + "flos": 25085087752320.0, + "grad_norm": 1.7359295101063332, + "language_loss": 0.721986, + "learning_rate": 6.392807053872212e-07, + "loss": 0.79867733, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10443115, + "step": 12408, + "time_per_iteration": 2.5363566875457764 + }, + { + "auxiliary_loss_clip": 0.06410854, + "auxiliary_loss_mlp": 0.01270325, + "balance_loss_clip": 0.06272398, + "balance_loss_mlp": 0.01258875, + "epoch": 0.7460694423568315, + "flos": 21915044559360.0, + "grad_norm": 1.699572837322079, + "language_loss": 0.72972172, + "learning_rate": 6.38995303134053e-07, + "loss": 0.80653358, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.11444092, + "step": 12409, + "time_per_iteration": 2.546006441116333 + }, + { + "auxiliary_loss_clip": 0.06399277, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06271146, + "balance_loss_mlp": 0.0125671, + "epoch": 0.7461295656094995, + "flos": 21222213874560.0, + "grad_norm": 1.598232986197546, + "language_loss": 0.6626668, + "learning_rate": 6.38709952490319e-07, + "loss": 0.73931849, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09173584, + "step": 12410, + "time_per_iteration": 2.539109468460083 + }, + { + "auxiliary_loss_clip": 0.06399163, + "auxiliary_loss_mlp": 0.01263377, + "balance_loss_clip": 0.06269792, + "balance_loss_mlp": 0.01253912, + "epoch": 0.7461896888621674, + "flos": 22353526575360.0, + "grad_norm": 1.945676042330692, + "language_loss": 0.84313834, + "learning_rate": 6.384246534668396e-07, + "loss": 0.9197638, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09466553, + "step": 12411, + "time_per_iteration": 2.5426361560821533 + }, + { + "auxiliary_loss_clip": 0.06406285, + "auxiliary_loss_mlp": 0.01265139, + "balance_loss_clip": 0.06272309, + "balance_loss_mlp": 0.01255412, + "epoch": 0.7462498121148354, + "flos": 25489845699840.0, + "grad_norm": 1.4027823600738436, + "language_loss": 0.78116751, + "learning_rate": 6.381394060744339e-07, + "loss": 0.85788167, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09729004, + "step": 12412, + "time_per_iteration": 2.533936023712158 + }, + { + "auxiliary_loss_clip": 0.06404398, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254599, + "epoch": 0.7463099353675033, + "flos": 33956319548160.0, + "grad_norm": 1.7620547753312321, + "language_loss": 0.62684309, + "learning_rate": 6.378542103239188e-07, + "loss": 0.70352924, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09625244, + "step": 12413, + "time_per_iteration": 2.6400840282440186 + }, + { + "auxiliary_loss_clip": 0.06308331, + "auxiliary_loss_mlp": 0.01251289, + "balance_loss_clip": 0.06253117, + "balance_loss_mlp": 0.01250132, + "epoch": 0.7463700586201714, + "flos": 62786365355520.0, + "grad_norm": 0.710053456092447, + "language_loss": 0.54915559, + "learning_rate": 6.375690662261082e-07, + "loss": 0.62475181, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0115509, + "step": 12414, + "time_per_iteration": 4.637887954711914 + }, + { + "auxiliary_loss_clip": 0.06405766, + "auxiliary_loss_mlp": 0.01265973, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01255924, + "epoch": 0.7464301818728393, + "flos": 33440201124480.0, + "grad_norm": 1.8480790856179932, + "language_loss": 0.54996049, + "learning_rate": 6.372839737918154e-07, + "loss": 0.62667787, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1005249, + "step": 12415, + "time_per_iteration": 2.615811347961426 + }, + { + "auxiliary_loss_clip": 0.06405137, + "auxiliary_loss_mlp": 0.01263099, + "balance_loss_clip": 0.06273064, + "balance_loss_mlp": 0.01252985, + "epoch": 0.7464903051255073, + "flos": 26877100296960.0, + "grad_norm": 1.5361542558007044, + "language_loss": 0.75346631, + "learning_rate": 6.369989330318506e-07, + "loss": 0.8301487, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10107422, + "step": 12416, + "time_per_iteration": 2.5900840759277344 + }, + { + "auxiliary_loss_clip": 0.06405427, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01254868, + "epoch": 0.7465504283781753, + "flos": 44096359795200.0, + "grad_norm": 1.4549877982075725, + "language_loss": 0.69495994, + "learning_rate": 6.367139439570233e-07, + "loss": 0.77166545, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10247803, + "step": 12417, + "time_per_iteration": 2.7127816677093506 + }, + { + "auxiliary_loss_clip": 0.06411283, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.0627514, + "balance_loss_mlp": 0.01252456, + "epoch": 0.7466105516308432, + "flos": 19681111981440.0, + "grad_norm": 1.698297081844245, + "language_loss": 0.74025893, + "learning_rate": 6.364290065781392e-07, + "loss": 0.81699783, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10162354, + "step": 12418, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.06406084, + "auxiliary_loss_mlp": 0.01266736, + "balance_loss_clip": 0.06273702, + "balance_loss_mlp": 0.01256526, + "epoch": 0.7466706748835112, + "flos": 20526783713280.0, + "grad_norm": 1.5246031666283997, + "language_loss": 0.68934214, + "learning_rate": 6.361441209060039e-07, + "loss": 0.76607031, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10211182, + "step": 12419, + "time_per_iteration": 2.555774211883545 + }, + { + "auxiliary_loss_clip": 0.06398122, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06271016, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7467307981361792, + "flos": 21696851727360.0, + "grad_norm": 1.9457389695389966, + "language_loss": 0.7466985, + "learning_rate": 6.358592869514216e-07, + "loss": 0.82333469, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09197998, + "step": 12420, + "time_per_iteration": 2.570023536682129 + }, + { + "auxiliary_loss_clip": 0.06408262, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01253152, + "epoch": 0.7467909213888472, + "flos": 19579855921920.0, + "grad_norm": 2.0032714530696087, + "language_loss": 0.67321241, + "learning_rate": 6.355745047251904e-07, + "loss": 0.7499361, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10955811, + "step": 12421, + "time_per_iteration": 2.474916696548462 + }, + { + "auxiliary_loss_clip": 0.06408735, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06271867, + "balance_loss_mlp": 0.0125574, + "epoch": 0.7468510446415151, + "flos": 23701858151040.0, + "grad_norm": 1.5609377146869152, + "language_loss": 0.72308791, + "learning_rate": 6.352897742381107e-07, + "loss": 0.79983485, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10223389, + "step": 12422, + "time_per_iteration": 2.5997939109802246 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01265232, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.01255272, + "epoch": 0.7469111678941831, + "flos": 29323649410560.0, + "grad_norm": 1.8474742568559126, + "language_loss": 0.75012529, + "learning_rate": 6.350050955009796e-07, + "loss": 0.82679492, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09960938, + "step": 12423, + "time_per_iteration": 4.05024266242981 + }, + { + "auxiliary_loss_clip": 0.06402838, + "auxiliary_loss_mlp": 0.01263552, + "balance_loss_clip": 0.06272693, + "balance_loss_mlp": 0.01254534, + "epoch": 0.746971291146851, + "flos": 21805067675520.0, + "grad_norm": 1.325189199688027, + "language_loss": 0.67964166, + "learning_rate": 6.347204685245929e-07, + "loss": 0.75630558, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09020996, + "step": 12424, + "time_per_iteration": 2.531129837036133 + }, + { + "auxiliary_loss_clip": 0.06410465, + "auxiliary_loss_mlp": 0.01267373, + "balance_loss_clip": 0.06274019, + "balance_loss_mlp": 0.01257491, + "epoch": 0.747031414399519, + "flos": 36253591413120.0, + "grad_norm": 1.7828664572749888, + "language_loss": 0.74532795, + "learning_rate": 6.344358933197418e-07, + "loss": 0.82210636, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12425, + "time_per_iteration": 2.7197470664978027 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01265684, + "balance_loss_clip": 0.06268051, + "balance_loss_mlp": 0.01254431, + "epoch": 0.7470915376521869, + "flos": 19981133925120.0, + "grad_norm": 2.1292666289385016, + "language_loss": 0.69784462, + "learning_rate": 6.341513698972194e-07, + "loss": 0.77452642, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.1126709, + "step": 12426, + "time_per_iteration": 3.9324328899383545 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01267662, + "balance_loss_clip": 0.06274264, + "balance_loss_mlp": 0.01258269, + "epoch": 0.747151660904855, + "flos": 20090523830400.0, + "grad_norm": 1.610031666552814, + "language_loss": 0.65698165, + "learning_rate": 6.338668982678139e-07, + "loss": 0.73369735, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09399414, + "step": 12427, + "time_per_iteration": 2.544971466064453 + }, + { + "auxiliary_loss_clip": 0.06408876, + "auxiliary_loss_mlp": 0.01263755, + "balance_loss_clip": 0.06273834, + "balance_loss_mlp": 0.01253754, + "epoch": 0.7472117841575229, + "flos": 16296062411520.0, + "grad_norm": 1.5416820216719087, + "language_loss": 0.74925625, + "learning_rate": 6.335824784423118e-07, + "loss": 0.82598257, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09997559, + "step": 12428, + "time_per_iteration": 2.4757473468780518 + }, + { + "auxiliary_loss_clip": 0.06413485, + "auxiliary_loss_mlp": 0.01264592, + "balance_loss_clip": 0.06274045, + "balance_loss_mlp": 0.01253756, + "epoch": 0.7472719074101909, + "flos": 21395068848000.0, + "grad_norm": 2.468151584449191, + "language_loss": 0.58381009, + "learning_rate": 6.33298110431499e-07, + "loss": 0.66059089, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.1083374, + "step": 12429, + "time_per_iteration": 2.5076515674591064 + }, + { + "auxiliary_loss_clip": 0.06411515, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 0.06274679, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7473320306628589, + "flos": 29651064439680.0, + "grad_norm": 1.7643839025540142, + "language_loss": 0.60671711, + "learning_rate": 6.330137942461595e-07, + "loss": 0.6834774, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10113525, + "step": 12430, + "time_per_iteration": 2.580826997756958 + }, + { + "auxiliary_loss_clip": 0.06397452, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06268569, + "balance_loss_mlp": 0.01255339, + "epoch": 0.7473921539155268, + "flos": 24143316986880.0, + "grad_norm": 1.3480044268517646, + "language_loss": 0.7548542, + "learning_rate": 6.327295298970734e-07, + "loss": 0.83147293, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09088135, + "step": 12431, + "time_per_iteration": 2.5767364501953125 + }, + { + "auxiliary_loss_clip": 0.06404008, + "auxiliary_loss_mlp": 0.01264023, + "balance_loss_clip": 0.06270575, + "balance_loss_mlp": 0.01253831, + "epoch": 0.7474522771681948, + "flos": 17492768824320.0, + "grad_norm": 2.003596145191226, + "language_loss": 0.75284076, + "learning_rate": 6.32445317395021e-07, + "loss": 0.82952106, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10198975, + "step": 12432, + "time_per_iteration": 3.9378252029418945 + }, + { + "auxiliary_loss_clip": 0.06408846, + "auxiliary_loss_mlp": 0.01264276, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01253833, + "epoch": 0.7475124004208628, + "flos": 16732909272960.0, + "grad_norm": 2.3826566050681652, + "language_loss": 0.70483506, + "learning_rate": 6.321611567507787e-07, + "loss": 0.78156626, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10437012, + "step": 12433, + "time_per_iteration": 2.4768426418304443 + }, + { + "auxiliary_loss_clip": 0.06408405, + "auxiliary_loss_mlp": 0.01266362, + "balance_loss_clip": 0.06274009, + "balance_loss_mlp": 0.01255782, + "epoch": 0.7475725236735308, + "flos": 19726533694080.0, + "grad_norm": 1.7388304285111835, + "language_loss": 0.67580962, + "learning_rate": 6.318770479751232e-07, + "loss": 0.75255728, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.105896, + "step": 12434, + "time_per_iteration": 2.547088384628296 + }, + { + "auxiliary_loss_clip": 0.06395668, + "auxiliary_loss_mlp": 0.01264935, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01256042, + "epoch": 0.7476326469261987, + "flos": 26293114465920.0, + "grad_norm": 1.4738346539678335, + "language_loss": 0.7966851, + "learning_rate": 6.315929910788263e-07, + "loss": 0.87329113, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08898926, + "step": 12435, + "time_per_iteration": 2.5363943576812744 + }, + { + "auxiliary_loss_clip": 0.06409591, + "auxiliary_loss_mlp": 0.01267417, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01257236, + "epoch": 0.7476927701788667, + "flos": 31839868794240.0, + "grad_norm": 2.1319276645513736, + "language_loss": 0.68030941, + "learning_rate": 6.313089860726604e-07, + "loss": 0.75707954, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10180664, + "step": 12436, + "time_per_iteration": 2.655866861343384 + }, + { + "auxiliary_loss_clip": 0.06408997, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06271006, + "balance_loss_mlp": 0.01252732, + "epoch": 0.7477528934315346, + "flos": 31803545249280.0, + "grad_norm": 1.4428842251570377, + "language_loss": 0.7086063, + "learning_rate": 6.31025032967396e-07, + "loss": 0.78532964, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10601807, + "step": 12437, + "time_per_iteration": 2.5668420791625977 + }, + { + "auxiliary_loss_clip": 0.06400211, + "auxiliary_loss_mlp": 0.01266102, + "balance_loss_clip": 0.06271319, + "balance_loss_mlp": 0.01256929, + "epoch": 0.7478130166842026, + "flos": 20377548391680.0, + "grad_norm": 1.5941584942666511, + "language_loss": 0.6725921, + "learning_rate": 6.307411317737986e-07, + "loss": 0.74925524, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09179688, + "step": 12438, + "time_per_iteration": 2.5391809940338135 + }, + { + "auxiliary_loss_clip": 0.06402425, + "auxiliary_loss_mlp": 0.01269468, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01259878, + "epoch": 0.7478731399368705, + "flos": 18154558771200.0, + "grad_norm": 1.5910882903057735, + "language_loss": 0.81170976, + "learning_rate": 6.304572825026344e-07, + "loss": 0.88842869, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09588623, + "step": 12439, + "time_per_iteration": 2.530305862426758 + }, + { + "auxiliary_loss_clip": 0.06401659, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.0125502, + "epoch": 0.7479332631895386, + "flos": 15273259148160.0, + "grad_norm": 2.0986943273037335, + "language_loss": 0.71237975, + "learning_rate": 6.301734851646674e-07, + "loss": 0.78904307, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09655762, + "step": 12440, + "time_per_iteration": 2.5543224811553955 + }, + { + "auxiliary_loss_clip": 0.06400722, + "auxiliary_loss_mlp": 0.01265179, + "balance_loss_clip": 0.06271139, + "balance_loss_mlp": 0.01255606, + "epoch": 0.7479933864422065, + "flos": 21148937879040.0, + "grad_norm": 1.8969303435383589, + "language_loss": 0.74162072, + "learning_rate": 6.298897397706597e-07, + "loss": 0.81827968, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09570312, + "step": 12441, + "time_per_iteration": 2.4814085960388184 + }, + { + "auxiliary_loss_clip": 0.06407572, + "auxiliary_loss_mlp": 0.01269518, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.01258664, + "epoch": 0.7480535096948745, + "flos": 14397217511040.0, + "grad_norm": 2.1766125237206384, + "language_loss": 0.82771671, + "learning_rate": 6.296060463313698e-07, + "loss": 0.90448761, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10858154, + "step": 12442, + "time_per_iteration": 2.474766969680786 + }, + { + "auxiliary_loss_clip": 0.06407404, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01253551, + "epoch": 0.7481136329475425, + "flos": 27352073566080.0, + "grad_norm": 2.1201863783826087, + "language_loss": 0.63084489, + "learning_rate": 6.293224048575565e-07, + "loss": 0.7075603, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.105896, + "step": 12443, + "time_per_iteration": 2.537418842315674 + }, + { + "auxiliary_loss_clip": 0.06402731, + "auxiliary_loss_mlp": 0.01263567, + "balance_loss_clip": 0.06270343, + "balance_loss_mlp": 0.01254, + "epoch": 0.7481737562002104, + "flos": 19536656342400.0, + "grad_norm": 1.7130617298160193, + "language_loss": 0.71587157, + "learning_rate": 6.29038815359975e-07, + "loss": 0.79253459, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12444, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.06404774, + "auxiliary_loss_mlp": 0.01263681, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01253483, + "epoch": 0.7482338794528784, + "flos": 21766102727040.0, + "grad_norm": 1.3467287331144688, + "language_loss": 0.68781805, + "learning_rate": 6.287552778493786e-07, + "loss": 0.76450258, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10198975, + "step": 12445, + "time_per_iteration": 2.498960018157959 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01264071, + "balance_loss_clip": 0.06269241, + "balance_loss_mlp": 0.01254319, + "epoch": 0.7482940027055464, + "flos": 18703269233280.0, + "grad_norm": 1.5654377266954753, + "language_loss": 0.74401557, + "learning_rate": 6.28471792336519e-07, + "loss": 0.82066035, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09747314, + "step": 12446, + "time_per_iteration": 2.489685535430908 + }, + { + "auxiliary_loss_clip": 0.06408426, + "auxiliary_loss_mlp": 0.01264963, + "balance_loss_clip": 0.06271491, + "balance_loss_mlp": 0.01254467, + "epoch": 0.7483541259582144, + "flos": 16003587335040.0, + "grad_norm": 1.896183227268288, + "language_loss": 0.7341156, + "learning_rate": 6.281883588321475e-07, + "loss": 0.81084955, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10491943, + "step": 12447, + "time_per_iteration": 2.464768648147583 + }, + { + "auxiliary_loss_clip": 0.06403442, + "auxiliary_loss_mlp": 0.01263884, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7484142492108823, + "flos": 25563289403520.0, + "grad_norm": 2.623161293575912, + "language_loss": 0.72332132, + "learning_rate": 6.279049773470109e-07, + "loss": 0.79999459, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09777832, + "step": 12448, + "time_per_iteration": 2.601579427719116 + }, + { + "auxiliary_loss_clip": 0.06408041, + "auxiliary_loss_mlp": 0.01266135, + "balance_loss_clip": 0.06272004, + "balance_loss_mlp": 0.01256145, + "epoch": 0.7484743724635503, + "flos": 22893432359040.0, + "grad_norm": 1.636804246707767, + "language_loss": 0.73365426, + "learning_rate": 6.276216478918543e-07, + "loss": 0.81039608, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09991455, + "step": 12449, + "time_per_iteration": 2.54630184173584 + }, + { + "auxiliary_loss_clip": 0.06411887, + "auxiliary_loss_mlp": 0.012677, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01256548, + "epoch": 0.7485344957162182, + "flos": 25307137872000.0, + "grad_norm": 1.841554129413667, + "language_loss": 0.61420983, + "learning_rate": 6.273383704774225e-07, + "loss": 0.69100565, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11151123, + "step": 12450, + "time_per_iteration": 2.5542476177215576 + }, + { + "auxiliary_loss_clip": 0.06399691, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.0627162, + "balance_loss_mlp": 0.01254156, + "epoch": 0.7485946189688862, + "flos": 27060395103360.0, + "grad_norm": 1.84091608525743, + "language_loss": 0.70658576, + "learning_rate": 6.270551451144577e-07, + "loss": 0.78321427, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08990479, + "step": 12451, + "time_per_iteration": 2.552686929702759 + }, + { + "auxiliary_loss_clip": 0.06414381, + "auxiliary_loss_mlp": 0.01265729, + "balance_loss_clip": 0.06273015, + "balance_loss_mlp": 0.01255143, + "epoch": 0.7486547422215541, + "flos": 26914052747520.0, + "grad_norm": 1.8323009368960723, + "language_loss": 0.80237973, + "learning_rate": 6.267719718136988e-07, + "loss": 0.87918079, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.105896, + "step": 12452, + "time_per_iteration": 2.525906562805176 + }, + { + "auxiliary_loss_clip": 0.06414159, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.06274606, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7487148654742222, + "flos": 22352855742720.0, + "grad_norm": 2.4829537234299184, + "language_loss": 0.72200477, + "learning_rate": 6.264888505858843e-07, + "loss": 0.79879862, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.10333252, + "step": 12453, + "time_per_iteration": 3.899683952331543 + }, + { + "auxiliary_loss_clip": 0.06408122, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06273174, + "balance_loss_mlp": 0.01255544, + "epoch": 0.7487749887268901, + "flos": 23045392938240.0, + "grad_norm": 1.5935388766621728, + "language_loss": 0.74146187, + "learning_rate": 6.262057814417517e-07, + "loss": 0.81819469, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09619141, + "step": 12454, + "time_per_iteration": 2.494929552078247 + }, + { + "auxiliary_loss_clip": 0.06311407, + "auxiliary_loss_mlp": 0.012513, + "balance_loss_clip": 0.06256338, + "balance_loss_mlp": 0.01250216, + "epoch": 0.7488351119795581, + "flos": 71545565842560.0, + "grad_norm": 0.7199296433862132, + "language_loss": 0.59468263, + "learning_rate": 6.259227643920322e-07, + "loss": 0.67030972, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01085663, + "step": 12455, + "time_per_iteration": 3.2877697944641113 + }, + { + "auxiliary_loss_clip": 0.06402359, + "auxiliary_loss_mlp": 0.01260932, + "balance_loss_clip": 0.06271666, + "balance_loss_mlp": 0.01251759, + "epoch": 0.748895235232226, + "flos": 17201048434560.0, + "grad_norm": 1.6203322015377568, + "language_loss": 0.79953825, + "learning_rate": 6.256397994474592e-07, + "loss": 0.87617117, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09179688, + "step": 12456, + "time_per_iteration": 2.4608328342437744 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01250455, + "balance_loss_clip": 0.06255627, + "balance_loss_mlp": 0.01249323, + "epoch": 0.748955358484894, + "flos": 58998276846720.0, + "grad_norm": 0.8208514355444383, + "language_loss": 0.61328387, + "learning_rate": 6.25356886618763e-07, + "loss": 0.68889475, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01134491, + "step": 12457, + "time_per_iteration": 3.048952102661133 + }, + { + "auxiliary_loss_clip": 0.06408623, + "auxiliary_loss_mlp": 0.01266166, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01255867, + "epoch": 0.749015481737562, + "flos": 11364544287360.0, + "grad_norm": 1.9496047447072924, + "language_loss": 0.67320937, + "learning_rate": 6.250740259166711e-07, + "loss": 0.7499572, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10308838, + "step": 12458, + "time_per_iteration": 2.4301834106445312 + }, + { + "auxiliary_loss_clip": 0.06403044, + "auxiliary_loss_mlp": 0.01266185, + "balance_loss_clip": 0.06271131, + "balance_loss_mlp": 0.01256279, + "epoch": 0.74907560499023, + "flos": 21112991677440.0, + "grad_norm": 1.7212914648304267, + "language_loss": 0.80174047, + "learning_rate": 6.247912173519106e-07, + "loss": 0.87843275, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09912109, + "step": 12459, + "time_per_iteration": 2.518477439880371 + }, + { + "auxiliary_loss_clip": 0.06404047, + "auxiliary_loss_mlp": 0.01264599, + "balance_loss_clip": 0.06271756, + "balance_loss_mlp": 0.01254926, + "epoch": 0.749135728242898, + "flos": 22273709961600.0, + "grad_norm": 1.512865855807545, + "language_loss": 0.80564761, + "learning_rate": 6.245084609352043e-07, + "loss": 0.88233417, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09674072, + "step": 12460, + "time_per_iteration": 2.5079431533813477 + }, + { + "auxiliary_loss_clip": 0.06403753, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.0627199, + "balance_loss_mlp": 0.0125477, + "epoch": 0.7491958514955659, + "flos": 24063793862400.0, + "grad_norm": 1.6076689252740726, + "language_loss": 0.86212254, + "learning_rate": 6.242257566772755e-07, + "loss": 0.93881446, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10662842, + "step": 12461, + "time_per_iteration": 2.542217969894409 + }, + { + "auxiliary_loss_clip": 0.06400948, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7492559747482339, + "flos": 24497915466240.0, + "grad_norm": 1.880430722981425, + "language_loss": 0.69432622, + "learning_rate": 6.239431045888435e-07, + "loss": 0.77096915, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09088135, + "step": 12462, + "time_per_iteration": 2.5493383407592773 + }, + { + "auxiliary_loss_clip": 0.06405858, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06273175, + "balance_loss_mlp": 0.01255301, + "epoch": 0.7493160980009018, + "flos": 27752680736640.0, + "grad_norm": 1.8211376167609288, + "language_loss": 0.70671761, + "learning_rate": 6.236605046806267e-07, + "loss": 0.78343821, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10900879, + "step": 12463, + "time_per_iteration": 3.986877918243408 + }, + { + "auxiliary_loss_clip": 0.06407613, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274509, + "balance_loss_mlp": 0.01255664, + "epoch": 0.7493762212535698, + "flos": 30233918240640.0, + "grad_norm": 1.7635457747868553, + "language_loss": 0.77660054, + "learning_rate": 6.233779569633419e-07, + "loss": 0.85333592, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10247803, + "step": 12464, + "time_per_iteration": 2.613281726837158 + }, + { + "auxiliary_loss_clip": 0.06402797, + "auxiliary_loss_mlp": 0.01263814, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7494363445062378, + "flos": 21950906906880.0, + "grad_norm": 1.6126979618339465, + "language_loss": 0.78109074, + "learning_rate": 6.230954614477034e-07, + "loss": 0.85775691, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09521484, + "step": 12465, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.06420696, + "auxiliary_loss_mlp": 0.01267627, + "balance_loss_clip": 0.06278575, + "balance_loss_mlp": 0.01256332, + "epoch": 0.7494964677589058, + "flos": 12494473395840.0, + "grad_norm": 2.5697202625678877, + "language_loss": 0.74354923, + "learning_rate": 6.22813018144422e-07, + "loss": 0.82043248, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.11303711, + "step": 12466, + "time_per_iteration": 3.9045188426971436 + }, + { + "auxiliary_loss_clip": 0.06406893, + "auxiliary_loss_mlp": 0.01262068, + "balance_loss_clip": 0.06270187, + "balance_loss_mlp": 0.01252293, + "epoch": 0.7495565910115737, + "flos": 21659521933440.0, + "grad_norm": 1.9829684209764449, + "language_loss": 0.66688263, + "learning_rate": 6.22530627064209e-07, + "loss": 0.74357224, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09777832, + "step": 12467, + "time_per_iteration": 2.54917049407959 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01263538, + "balance_loss_clip": 0.06273383, + "balance_loss_mlp": 0.01253501, + "epoch": 0.7496167142642417, + "flos": 15274013834880.0, + "grad_norm": 2.0991094746025416, + "language_loss": 0.76436639, + "learning_rate": 6.222482882177735e-07, + "loss": 0.84109402, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1003418, + "step": 12468, + "time_per_iteration": 2.4655251502990723 + }, + { + "auxiliary_loss_clip": 0.0640367, + "auxiliary_loss_mlp": 0.01266554, + "balance_loss_clip": 0.06271279, + "balance_loss_mlp": 0.01256129, + "epoch": 0.7496768375169096, + "flos": 22061554623360.0, + "grad_norm": 1.9736124429451793, + "language_loss": 0.69775021, + "learning_rate": 6.219660016158201e-07, + "loss": 0.77445245, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10430908, + "step": 12469, + "time_per_iteration": 2.533859968185425 + }, + { + "auxiliary_loss_clip": 0.06409403, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06274202, + "balance_loss_mlp": 0.01254726, + "epoch": 0.7497369607695776, + "flos": 19062144270720.0, + "grad_norm": 2.2473454659812107, + "language_loss": 0.6920374, + "learning_rate": 6.216837672690543e-07, + "loss": 0.76877773, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09899902, + "step": 12470, + "time_per_iteration": 2.4770658016204834 + }, + { + "auxiliary_loss_clip": 0.06413378, + "auxiliary_loss_mlp": 0.01268274, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01256329, + "epoch": 0.7497970840222457, + "flos": 21624036929280.0, + "grad_norm": 1.7361312699239924, + "language_loss": 0.75303179, + "learning_rate": 6.214015851881793e-07, + "loss": 0.82984829, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.11950684, + "step": 12471, + "time_per_iteration": 2.5342705249786377 + }, + { + "auxiliary_loss_clip": 0.06412168, + "auxiliary_loss_mlp": 0.01265091, + "balance_loss_clip": 0.06277177, + "balance_loss_mlp": 0.01255, + "epoch": 0.7498572072749136, + "flos": 13740710371200.0, + "grad_norm": 2.1773399303982663, + "language_loss": 0.77400845, + "learning_rate": 6.211194553838929e-07, + "loss": 0.85078096, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10089111, + "step": 12472, + "time_per_iteration": 3.870166540145874 + }, + { + "auxiliary_loss_clip": 0.06403755, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01255039, + "epoch": 0.7499173305275816, + "flos": 22973207045760.0, + "grad_norm": 1.4354078089227125, + "language_loss": 0.84353936, + "learning_rate": 6.208373778668951e-07, + "loss": 0.92022181, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09460449, + "step": 12473, + "time_per_iteration": 2.537057399749756 + }, + { + "auxiliary_loss_clip": 0.06410777, + "auxiliary_loss_mlp": 0.01268473, + "balance_loss_clip": 0.06273849, + "balance_loss_mlp": 0.01257261, + "epoch": 0.7499774537802495, + "flos": 22745916046080.0, + "grad_norm": 1.8524575994010102, + "language_loss": 0.73466665, + "learning_rate": 6.205553526478829e-07, + "loss": 0.81145918, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11212158, + "step": 12474, + "time_per_iteration": 2.4842028617858887 + }, + { + "auxiliary_loss_clip": 0.06415059, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06274258, + "balance_loss_mlp": 0.01255311, + "epoch": 0.7500375770329175, + "flos": 18302494354560.0, + "grad_norm": 1.6095037145271875, + "language_loss": 0.74770164, + "learning_rate": 6.202733797375492e-07, + "loss": 0.82452309, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.11779785, + "step": 12475, + "time_per_iteration": 2.4979960918426514 + }, + { + "auxiliary_loss_clip": 0.06415677, + "auxiliary_loss_mlp": 0.01269527, + "balance_loss_clip": 0.06274221, + "balance_loss_mlp": 0.01257898, + "epoch": 0.7500977002855854, + "flos": 19175684952960.0, + "grad_norm": 2.1095772826483907, + "language_loss": 0.80763221, + "learning_rate": 6.199914591465878e-07, + "loss": 0.88448429, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.11633301, + "step": 12476, + "time_per_iteration": 2.491819381713867 + }, + { + "auxiliary_loss_clip": 0.06407472, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01254999, + "epoch": 0.7501578235382534, + "flos": 22170441404160.0, + "grad_norm": 7.116833282628377, + "language_loss": 0.77544057, + "learning_rate": 6.19709590885688e-07, + "loss": 0.852162, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09674072, + "step": 12477, + "time_per_iteration": 2.5502593517303467 + }, + { + "auxiliary_loss_clip": 0.06310226, + "auxiliary_loss_mlp": 0.01250565, + "balance_loss_clip": 0.06254882, + "balance_loss_mlp": 0.01249338, + "epoch": 0.7502179467909214, + "flos": 64481035783680.0, + "grad_norm": 0.7848730842725032, + "language_loss": 0.54270738, + "learning_rate": 6.194277749655394e-07, + "loss": 0.61831528, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01226044, + "step": 12478, + "time_per_iteration": 3.0923471450805664 + }, + { + "auxiliary_loss_clip": 0.06402513, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06272154, + "balance_loss_mlp": 0.01255747, + "epoch": 0.7502780700435894, + "flos": 20483332571520.0, + "grad_norm": 1.5542360710976224, + "language_loss": 0.80265927, + "learning_rate": 6.191460113968272e-07, + "loss": 0.87934738, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.10559082, + "step": 12479, + "time_per_iteration": 2.503929615020752 + }, + { + "auxiliary_loss_clip": 0.06412464, + "auxiliary_loss_mlp": 0.01265738, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7503381932962573, + "flos": 20450908241280.0, + "grad_norm": 4.66275961009968, + "language_loss": 0.62624717, + "learning_rate": 6.188643001902369e-07, + "loss": 0.70302922, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.11566162, + "step": 12480, + "time_per_iteration": 2.488246202468872 + }, + { + "auxiliary_loss_clip": 0.06401666, + "auxiliary_loss_mlp": 0.0126556, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01256148, + "epoch": 0.7503983165489253, + "flos": 22388382673920.0, + "grad_norm": 1.5669372883229389, + "language_loss": 0.784675, + "learning_rate": 6.185826413564512e-07, + "loss": 0.86134732, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09411621, + "step": 12481, + "time_per_iteration": 2.514516830444336 + }, + { + "auxiliary_loss_clip": 0.06406647, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 0.06270355, + "balance_loss_mlp": 0.01260159, + "epoch": 0.7504584398015932, + "flos": 24906321066240.0, + "grad_norm": 1.6690563670496772, + "language_loss": 0.71560133, + "learning_rate": 6.183010349061501e-07, + "loss": 0.79238129, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.11193848, + "step": 12482, + "time_per_iteration": 2.570258140563965 + }, + { + "auxiliary_loss_clip": 0.06406072, + "auxiliary_loss_mlp": 0.01265844, + "balance_loss_clip": 0.06272655, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7505185630542612, + "flos": 25892381514240.0, + "grad_norm": 1.622739148659245, + "language_loss": 0.70420146, + "learning_rate": 6.180194808500118e-07, + "loss": 0.78092062, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10784912, + "step": 12483, + "time_per_iteration": 2.545875072479248 + }, + { + "auxiliary_loss_clip": 0.06406315, + "auxiliary_loss_mlp": 0.01266459, + "balance_loss_clip": 0.0627225, + "balance_loss_mlp": 0.01257227, + "epoch": 0.7505786863069293, + "flos": 23149709671680.0, + "grad_norm": 1.6112204819340308, + "language_loss": 0.74173069, + "learning_rate": 6.177379791987131e-07, + "loss": 0.81845844, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09240723, + "step": 12484, + "time_per_iteration": 2.50899600982666 + }, + { + "auxiliary_loss_clip": 0.06404275, + "auxiliary_loss_mlp": 0.01267227, + "balance_loss_clip": 0.06272139, + "balance_loss_mlp": 0.01256761, + "epoch": 0.7506388095595972, + "flos": 16989144658560.0, + "grad_norm": 1.988075921906434, + "language_loss": 0.84860504, + "learning_rate": 6.174565299629295e-07, + "loss": 0.92532003, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10473633, + "step": 12485, + "time_per_iteration": 2.5089685916900635 + }, + { + "auxiliary_loss_clip": 0.06403236, + "auxiliary_loss_mlp": 0.01262842, + "balance_loss_clip": 0.06270488, + "balance_loss_mlp": 0.01253121, + "epoch": 0.7506989328122652, + "flos": 22351346369280.0, + "grad_norm": 1.4931669119648077, + "language_loss": 0.78489572, + "learning_rate": 6.171751331533323e-07, + "loss": 0.86155653, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.097229, + "step": 12486, + "time_per_iteration": 2.5051820278167725 + }, + { + "auxiliary_loss_clip": 0.06408528, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253245, + "epoch": 0.7507590560649331, + "flos": 25783243171200.0, + "grad_norm": 1.7753955887486508, + "language_loss": 0.73021758, + "learning_rate": 6.168937887805932e-07, + "loss": 0.80693603, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10064697, + "step": 12487, + "time_per_iteration": 2.547999382019043 + }, + { + "auxiliary_loss_clip": 0.06404672, + "auxiliary_loss_mlp": 0.01263386, + "balance_loss_clip": 0.0626927, + "balance_loss_mlp": 0.01253528, + "epoch": 0.7508191793176011, + "flos": 24286221325440.0, + "grad_norm": 1.9310699455089921, + "language_loss": 0.67608893, + "learning_rate": 6.166124968553801e-07, + "loss": 0.75276947, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09857178, + "step": 12488, + "time_per_iteration": 2.5895445346832275 + }, + { + "auxiliary_loss_clip": 0.0640392, + "auxiliary_loss_mlp": 0.0126508, + "balance_loss_clip": 0.06270676, + "balance_loss_mlp": 0.01254822, + "epoch": 0.750879302570269, + "flos": 19905384234240.0, + "grad_norm": 1.5890652635946048, + "language_loss": 0.77430677, + "learning_rate": 6.163312573883592e-07, + "loss": 0.85099679, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10253906, + "step": 12489, + "time_per_iteration": 2.5337159633636475 + }, + { + "auxiliary_loss_clip": 0.0640057, + "auxiliary_loss_mlp": 0.01265302, + "balance_loss_clip": 0.06270728, + "balance_loss_mlp": 0.01255431, + "epoch": 0.750939425822937, + "flos": 29213420964480.0, + "grad_norm": 1.5668986388800445, + "language_loss": 0.75072443, + "learning_rate": 6.160500703901956e-07, + "loss": 0.8273831, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09875488, + "step": 12490, + "time_per_iteration": 2.5781826972961426 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01266052, + "balance_loss_clip": 0.06274259, + "balance_loss_mlp": 0.0125592, + "epoch": 0.750999549075605, + "flos": 21148686316800.0, + "grad_norm": 1.487741862942094, + "language_loss": 0.7861315, + "learning_rate": 6.157689358715527e-07, + "loss": 0.86284935, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10144043, + "step": 12491, + "time_per_iteration": 2.5030393600463867 + }, + { + "auxiliary_loss_clip": 0.06398296, + "auxiliary_loss_mlp": 0.01269676, + "balance_loss_clip": 0.06269314, + "balance_loss_mlp": 0.01260473, + "epoch": 0.751059672328273, + "flos": 23554090275840.0, + "grad_norm": 1.6435305052483133, + "language_loss": 0.76645952, + "learning_rate": 6.154878538430899e-07, + "loss": 0.84313929, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09210205, + "step": 12492, + "time_per_iteration": 2.5466179847717285 + }, + { + "auxiliary_loss_clip": 0.06403392, + "auxiliary_loss_mlp": 0.01267084, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01257446, + "epoch": 0.7511197955809409, + "flos": 18995786236800.0, + "grad_norm": 1.8268388211945472, + "language_loss": 0.71465898, + "learning_rate": 6.152068243154671e-07, + "loss": 0.79136372, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09637451, + "step": 12493, + "time_per_iteration": 3.923126697540283 + }, + { + "auxiliary_loss_clip": 0.06408728, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06274524, + "balance_loss_mlp": 0.01258603, + "epoch": 0.7511799188336089, + "flos": 22052246820480.0, + "grad_norm": 1.6129417562793205, + "language_loss": 0.80984807, + "learning_rate": 6.149258472993395e-07, + "loss": 0.88662201, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10070801, + "step": 12494, + "time_per_iteration": 2.499166488647461 + }, + { + "auxiliary_loss_clip": 0.06403729, + "auxiliary_loss_mlp": 0.01266628, + "balance_loss_clip": 0.06270036, + "balance_loss_mlp": 0.01256418, + "epoch": 0.7512400420862768, + "flos": 16471894204800.0, + "grad_norm": 1.701536760083375, + "language_loss": 0.79124582, + "learning_rate": 6.146449228053634e-07, + "loss": 0.86794937, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12495, + "time_per_iteration": 2.482259511947632 + }, + { + "auxiliary_loss_clip": 0.06400186, + "auxiliary_loss_mlp": 0.01262526, + "balance_loss_clip": 0.06269289, + "balance_loss_mlp": 0.01253108, + "epoch": 0.7513001653389448, + "flos": 20454472039680.0, + "grad_norm": 1.7104928099780732, + "language_loss": 0.71375751, + "learning_rate": 6.143640508441898e-07, + "loss": 0.79038465, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09417725, + "step": 12496, + "time_per_iteration": 2.513437032699585 + }, + { + "auxiliary_loss_clip": 0.06405301, + "auxiliary_loss_mlp": 0.01263444, + "balance_loss_clip": 0.06272015, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7513602885916129, + "flos": 23483497610880.0, + "grad_norm": 1.6654554654788911, + "language_loss": 0.78218853, + "learning_rate": 6.140832314264705e-07, + "loss": 0.85887605, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09533691, + "step": 12497, + "time_per_iteration": 2.513091564178467 + }, + { + "auxiliary_loss_clip": 0.06402559, + "auxiliary_loss_mlp": 0.01266934, + "balance_loss_clip": 0.06268804, + "balance_loss_mlp": 0.01256867, + "epoch": 0.7514204118442808, + "flos": 26804495134080.0, + "grad_norm": 1.4375816508354362, + "language_loss": 0.77240133, + "learning_rate": 6.13802464562855e-07, + "loss": 0.8490963, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10070801, + "step": 12498, + "time_per_iteration": 2.5410008430480957 + }, + { + "auxiliary_loss_clip": 0.06400871, + "auxiliary_loss_mlp": 0.01263117, + "balance_loss_clip": 0.06272262, + "balance_loss_mlp": 0.01254462, + "epoch": 0.7514805350969488, + "flos": 19871869800960.0, + "grad_norm": 1.7337697309070021, + "language_loss": 0.74015534, + "learning_rate": 6.135217502639878e-07, + "loss": 0.81679523, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08654785, + "step": 12499, + "time_per_iteration": 2.557349443435669 + }, + { + "auxiliary_loss_clip": 0.06399096, + "auxiliary_loss_mlp": 0.01264017, + "balance_loss_clip": 0.06268655, + "balance_loss_mlp": 0.01254737, + "epoch": 0.7515406583496167, + "flos": 24578444839680.0, + "grad_norm": 2.167576832097364, + "language_loss": 0.79499745, + "learning_rate": 6.132410885405148e-07, + "loss": 0.87162852, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 12500, + "time_per_iteration": 2.5547473430633545 + }, + { + "auxiliary_loss_clip": 0.06415384, + "auxiliary_loss_mlp": 0.01265407, + "balance_loss_clip": 0.06272934, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7516007816022847, + "flos": 20126386177920.0, + "grad_norm": 1.9841359152283422, + "language_loss": 0.73215604, + "learning_rate": 6.129604794030794e-07, + "loss": 0.80896389, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.11639404, + "step": 12501, + "time_per_iteration": 2.4737539291381836 + }, + { + "auxiliary_loss_clip": 0.06401603, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06269078, + "balance_loss_mlp": 0.01255764, + "epoch": 0.7516609048549526, + "flos": 22791379685760.0, + "grad_norm": 1.708165440784374, + "language_loss": 0.7856493, + "learning_rate": 6.126799228623207e-07, + "loss": 0.86231852, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09570312, + "step": 12502, + "time_per_iteration": 4.065747499465942 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 0.06270734, + "balance_loss_mlp": 0.01251895, + "epoch": 0.7517210281076206, + "flos": 10638576512640.0, + "grad_norm": 2.198342230636315, + "language_loss": 0.70527124, + "learning_rate": 6.123994189288786e-07, + "loss": 0.78194559, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10406494, + "step": 12503, + "time_per_iteration": 2.4975264072418213 + }, + { + "auxiliary_loss_clip": 0.06308451, + "auxiliary_loss_mlp": 0.01250423, + "balance_loss_clip": 0.06253403, + "balance_loss_mlp": 0.01249304, + "epoch": 0.7517811513602886, + "flos": 66071542458240.0, + "grad_norm": 0.9653674550577583, + "language_loss": 0.63868368, + "learning_rate": 6.121189676133903e-07, + "loss": 0.71427244, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01122284, + "step": 12504, + "time_per_iteration": 3.0423572063446045 + }, + { + "auxiliary_loss_clip": 0.06398649, + "auxiliary_loss_mlp": 0.01267599, + "balance_loss_clip": 0.06269499, + "balance_loss_mlp": 0.01258533, + "epoch": 0.7518412746129566, + "flos": 37277317071360.0, + "grad_norm": 1.461644685561848, + "language_loss": 0.68779212, + "learning_rate": 6.118385689264896e-07, + "loss": 0.7644546, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09069824, + "step": 12505, + "time_per_iteration": 4.1895623207092285 + }, + { + "auxiliary_loss_clip": 0.06309824, + "auxiliary_loss_mlp": 0.01250829, + "balance_loss_clip": 0.06254642, + "balance_loss_mlp": 0.01249779, + "epoch": 0.7519013978656245, + "flos": 60539001396480.0, + "grad_norm": 0.633292190388587, + "language_loss": 0.55014133, + "learning_rate": 6.11558222878809e-07, + "loss": 0.6257478, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.55126953, + "router_z_loss_mlp": 0.01050568, + "step": 12506, + "time_per_iteration": 3.249525785446167 + }, + { + "auxiliary_loss_clip": 0.06407043, + "auxiliary_loss_mlp": 0.01265184, + "balance_loss_clip": 0.0627189, + "balance_loss_mlp": 0.01254831, + "epoch": 0.7519615211182925, + "flos": 18812826846720.0, + "grad_norm": 1.7032377600653197, + "language_loss": 0.78890646, + "learning_rate": 6.112779294809796e-07, + "loss": 0.86562872, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10339355, + "step": 12507, + "time_per_iteration": 2.4874064922332764 + }, + { + "auxiliary_loss_clip": 0.06398805, + "auxiliary_loss_mlp": 0.01267855, + "balance_loss_clip": 0.06269046, + "balance_loss_mlp": 0.0125808, + "epoch": 0.7520216443709604, + "flos": 14580596171520.0, + "grad_norm": 1.7335317284626974, + "language_loss": 0.71662533, + "learning_rate": 6.10997688743631e-07, + "loss": 0.79329199, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09777832, + "step": 12508, + "time_per_iteration": 2.5105843544006348 + }, + { + "auxiliary_loss_clip": 0.06401521, + "auxiliary_loss_mlp": 0.0126325, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254262, + "epoch": 0.7520817676236284, + "flos": 17062420654080.0, + "grad_norm": 1.5570539032807615, + "language_loss": 0.72277093, + "learning_rate": 6.107175006773885e-07, + "loss": 0.79941863, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08984375, + "step": 12509, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.06410283, + "auxiliary_loss_mlp": 0.01268332, + "balance_loss_clip": 0.06271298, + "balance_loss_mlp": 0.01257496, + "epoch": 0.7521418908762965, + "flos": 25673517849600.0, + "grad_norm": 1.5708944313915068, + "language_loss": 0.61849803, + "learning_rate": 6.104373652928785e-07, + "loss": 0.69528419, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10839844, + "step": 12510, + "time_per_iteration": 2.5873842239379883 + }, + { + "auxiliary_loss_clip": 0.0640108, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06272186, + "balance_loss_mlp": 0.01255613, + "epoch": 0.7522020141289644, + "flos": 20893079836800.0, + "grad_norm": 2.376424166314484, + "language_loss": 0.81816781, + "learning_rate": 6.10157282600722e-07, + "loss": 0.89483154, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09674072, + "step": 12511, + "time_per_iteration": 3.9771971702575684 + }, + { + "auxiliary_loss_clip": 0.06408679, + "auxiliary_loss_mlp": 0.01269282, + "balance_loss_clip": 0.06270606, + "balance_loss_mlp": 0.01258571, + "epoch": 0.7522621373816324, + "flos": 12645134236800.0, + "grad_norm": 1.635821418460478, + "language_loss": 0.76383078, + "learning_rate": 6.098772526115412e-07, + "loss": 0.84061033, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1071167, + "step": 12512, + "time_per_iteration": 2.497439384460449 + }, + { + "auxiliary_loss_clip": 0.06396883, + "auxiliary_loss_mlp": 0.01265576, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01256557, + "epoch": 0.7523222606343003, + "flos": 25632624257280.0, + "grad_norm": 1.702992973321348, + "language_loss": 0.82472456, + "learning_rate": 6.095972753359537e-07, + "loss": 0.90134907, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09002686, + "step": 12513, + "time_per_iteration": 2.581941604614258 + }, + { + "auxiliary_loss_clip": 0.06405152, + "auxiliary_loss_mlp": 0.01262838, + "balance_loss_clip": 0.06268971, + "balance_loss_mlp": 0.01252747, + "epoch": 0.7523823838869683, + "flos": 20455142872320.0, + "grad_norm": 1.6682256759648477, + "language_loss": 0.7510156, + "learning_rate": 6.093173507845771e-07, + "loss": 0.82769549, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10089111, + "step": 12514, + "time_per_iteration": 2.4942328929901123 + }, + { + "auxiliary_loss_clip": 0.06397319, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06269006, + "balance_loss_mlp": 0.01256955, + "epoch": 0.7524425071396362, + "flos": 14725890351360.0, + "grad_norm": 1.7883586477571864, + "language_loss": 0.689107, + "learning_rate": 6.090374789680271e-07, + "loss": 0.76573658, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08679199, + "step": 12515, + "time_per_iteration": 2.494940996170044 + }, + { + "auxiliary_loss_clip": 0.06405492, + "auxiliary_loss_mlp": 0.01266715, + "balance_loss_clip": 0.06272881, + "balance_loss_mlp": 0.01257225, + "epoch": 0.7525026303923043, + "flos": 30600004728960.0, + "grad_norm": 2.8396136921883905, + "language_loss": 0.70415783, + "learning_rate": 6.087576598969137e-07, + "loss": 0.78087991, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09490967, + "step": 12516, + "time_per_iteration": 2.584015130996704 + }, + { + "auxiliary_loss_clip": 0.06399474, + "auxiliary_loss_mlp": 0.01267761, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01258325, + "epoch": 0.7525627536449722, + "flos": 24798901731840.0, + "grad_norm": 1.5910108360276343, + "language_loss": 0.89611065, + "learning_rate": 6.084778935818495e-07, + "loss": 0.97278303, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09436035, + "step": 12517, + "time_per_iteration": 2.5272841453552246 + }, + { + "auxiliary_loss_clip": 0.06410724, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01254359, + "epoch": 0.7526228768976402, + "flos": 20786499043200.0, + "grad_norm": 1.4709684896857864, + "language_loss": 0.74636328, + "learning_rate": 6.081981800334437e-07, + "loss": 0.82311571, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10150146, + "step": 12518, + "time_per_iteration": 2.507249116897583 + }, + { + "auxiliary_loss_clip": 0.06313983, + "auxiliary_loss_mlp": 0.01251233, + "balance_loss_clip": 0.06258783, + "balance_loss_mlp": 0.01250141, + "epoch": 0.7526830001503081, + "flos": 66578017662720.0, + "grad_norm": 0.6920212642256274, + "language_loss": 0.55552846, + "learning_rate": 6.079185192623017e-07, + "loss": 0.63118064, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094055, + "step": 12519, + "time_per_iteration": 3.1638381481170654 + }, + { + "auxiliary_loss_clip": 0.06402172, + "auxiliary_loss_mlp": 0.0126505, + "balance_loss_clip": 0.06268954, + "balance_loss_mlp": 0.01255423, + "epoch": 0.7527431234029761, + "flos": 23484755422080.0, + "grad_norm": 1.392327642078427, + "language_loss": 0.77952313, + "learning_rate": 6.07638911279029e-07, + "loss": 0.85619533, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09625244, + "step": 12520, + "time_per_iteration": 2.5008206367492676 + }, + { + "auxiliary_loss_clip": 0.06405456, + "auxiliary_loss_mlp": 0.01265903, + "balance_loss_clip": 0.06273633, + "balance_loss_mlp": 0.01256158, + "epoch": 0.752803246655644, + "flos": 22055265567360.0, + "grad_norm": 8.971083878889642, + "language_loss": 0.74495649, + "learning_rate": 6.07359356094229e-07, + "loss": 0.82167011, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09747314, + "step": 12521, + "time_per_iteration": 2.5451552867889404 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.0126799, + "balance_loss_clip": 0.06272561, + "balance_loss_mlp": 0.01257059, + "epoch": 0.752863369908312, + "flos": 30161606567040.0, + "grad_norm": 1.8189760564155686, + "language_loss": 0.67176616, + "learning_rate": 6.070798537185016e-07, + "loss": 0.74856877, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10925293, + "step": 12522, + "time_per_iteration": 2.556718349456787 + }, + { + "auxiliary_loss_clip": 0.06409014, + "auxiliary_loss_mlp": 0.01271964, + "balance_loss_clip": 0.06271487, + "balance_loss_mlp": 0.01261825, + "epoch": 0.7529234931609801, + "flos": 24573874792320.0, + "grad_norm": 1.5612093736475694, + "language_loss": 0.78733182, + "learning_rate": 6.068004041624453e-07, + "loss": 0.86414158, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10137939, + "step": 12523, + "time_per_iteration": 2.5776190757751465 + }, + { + "auxiliary_loss_clip": 0.0639995, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.06269381, + "balance_loss_mlp": 0.0125683, + "epoch": 0.752983616413648, + "flos": 23119088204160.0, + "grad_norm": 1.791528721862032, + "language_loss": 0.80482811, + "learning_rate": 6.065210074366571e-07, + "loss": 0.88149387, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09796143, + "step": 12524, + "time_per_iteration": 2.500800132751465 + }, + { + "auxiliary_loss_clip": 0.06402539, + "auxiliary_loss_mlp": 0.01269955, + "balance_loss_clip": 0.06271717, + "balance_loss_mlp": 0.01260996, + "epoch": 0.753043739666316, + "flos": 24323928462720.0, + "grad_norm": 1.510186119620748, + "language_loss": 0.74149638, + "learning_rate": 6.062416635517326e-07, + "loss": 0.81822133, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08953857, + "step": 12525, + "time_per_iteration": 2.5363988876342773 + }, + { + "auxiliary_loss_clip": 0.0639966, + "auxiliary_loss_mlp": 0.01264528, + "balance_loss_clip": 0.06270238, + "balance_loss_mlp": 0.01254777, + "epoch": 0.7531038629189839, + "flos": 24250149342720.0, + "grad_norm": 1.8502310757699438, + "language_loss": 0.725272, + "learning_rate": 6.059623725182641e-07, + "loss": 0.80191386, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09753418, + "step": 12526, + "time_per_iteration": 2.5115420818328857 + }, + { + "auxiliary_loss_clip": 0.06402011, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06270412, + "balance_loss_mlp": 0.01254167, + "epoch": 0.7531639861716519, + "flos": 30196378811520.0, + "grad_norm": 1.617761308290089, + "language_loss": 0.72719419, + "learning_rate": 6.056831343468414e-07, + "loss": 0.80385113, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09509277, + "step": 12527, + "time_per_iteration": 2.620079517364502 + }, + { + "auxiliary_loss_clip": 0.06399914, + "auxiliary_loss_mlp": 0.01265035, + "balance_loss_clip": 0.06268723, + "balance_loss_mlp": 0.01255558, + "epoch": 0.7532241094243198, + "flos": 18229050650880.0, + "grad_norm": 1.8406342788129475, + "language_loss": 0.81231797, + "learning_rate": 6.054039490480539e-07, + "loss": 0.88896745, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.0947876, + "step": 12528, + "time_per_iteration": 2.4696736335754395 + }, + { + "auxiliary_loss_clip": 0.06403716, + "auxiliary_loss_mlp": 0.01265532, + "balance_loss_clip": 0.06269462, + "balance_loss_mlp": 0.0125525, + "epoch": 0.7532842326769879, + "flos": 20886413437440.0, + "grad_norm": 2.282089070313471, + "language_loss": 0.85098541, + "learning_rate": 6.051248166324892e-07, + "loss": 0.92767787, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1027832, + "step": 12529, + "time_per_iteration": 2.5071592330932617 + }, + { + "auxiliary_loss_clip": 0.06410262, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01254818, + "epoch": 0.7533443559296558, + "flos": 18084762720000.0, + "grad_norm": 1.902579288696582, + "language_loss": 0.74726146, + "learning_rate": 6.048457371107303e-07, + "loss": 0.82401049, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.09814453, + "step": 12530, + "time_per_iteration": 2.502178192138672 + }, + { + "auxiliary_loss_clip": 0.06308636, + "auxiliary_loss_mlp": 0.01252721, + "balance_loss_clip": 0.06253405, + "balance_loss_mlp": 0.01251678, + "epoch": 0.7534044791823238, + "flos": 50271668398080.0, + "grad_norm": 0.8173638776820421, + "language_loss": 0.63636577, + "learning_rate": 6.045667104933612e-07, + "loss": 0.71197939, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.55273438, + "router_z_loss_mlp": 0.01044464, + "step": 12531, + "time_per_iteration": 2.9869658946990967 + }, + { + "auxiliary_loss_clip": 0.06406563, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06270574, + "balance_loss_mlp": 0.01255437, + "epoch": 0.7534646024349917, + "flos": 20856588583680.0, + "grad_norm": 2.370705934223187, + "language_loss": 0.70650482, + "learning_rate": 6.042877367909633e-07, + "loss": 0.78322434, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0994873, + "step": 12532, + "time_per_iteration": 3.92488169670105 + }, + { + "auxiliary_loss_clip": 0.06397863, + "auxiliary_loss_mlp": 0.01267261, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01257814, + "epoch": 0.7535247256876597, + "flos": 23077775341440.0, + "grad_norm": 1.5088215588647627, + "language_loss": 0.77771306, + "learning_rate": 6.040088160141132e-07, + "loss": 0.85436428, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09442139, + "step": 12533, + "time_per_iteration": 2.489647626876831 + }, + { + "auxiliary_loss_clip": 0.06306736, + "auxiliary_loss_mlp": 0.01251137, + "balance_loss_clip": 0.06251442, + "balance_loss_mlp": 0.01250062, + "epoch": 0.7535848489403276, + "flos": 58643888002560.0, + "grad_norm": 0.7841580581676975, + "language_loss": 0.57404244, + "learning_rate": 6.037299481733886e-07, + "loss": 0.64962119, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01076508, + "step": 12534, + "time_per_iteration": 3.1910510063171387 + }, + { + "auxiliary_loss_clip": 0.06403376, + "auxiliary_loss_mlp": 0.01267552, + "balance_loss_clip": 0.06270553, + "balance_loss_mlp": 0.01257568, + "epoch": 0.7536449721929956, + "flos": 26585044490880.0, + "grad_norm": 1.3288810458432065, + "language_loss": 0.71601486, + "learning_rate": 6.03451133279365e-07, + "loss": 0.79272413, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09991455, + "step": 12535, + "time_per_iteration": 2.5521280765533447 + }, + { + "auxiliary_loss_clip": 0.06405595, + "auxiliary_loss_mlp": 0.0126787, + "balance_loss_clip": 0.06269699, + "balance_loss_mlp": 0.01258024, + "epoch": 0.7537050954456637, + "flos": 25742559214080.0, + "grad_norm": 1.4204428074088968, + "language_loss": 0.80683547, + "learning_rate": 6.031723713426135e-07, + "loss": 0.88357008, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09838867, + "step": 12536, + "time_per_iteration": 2.612800359725952 + }, + { + "auxiliary_loss_clip": 0.06397747, + "auxiliary_loss_mlp": 0.01263423, + "balance_loss_clip": 0.06268154, + "balance_loss_mlp": 0.01254006, + "epoch": 0.7537652186983316, + "flos": 30231863815680.0, + "grad_norm": 2.5926766320548333, + "language_loss": 0.7478568, + "learning_rate": 6.028936623737067e-07, + "loss": 0.82446849, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09423828, + "step": 12537, + "time_per_iteration": 2.6071624755859375 + }, + { + "auxiliary_loss_clip": 0.06407893, + "auxiliary_loss_mlp": 0.01268462, + "balance_loss_clip": 0.06273423, + "balance_loss_mlp": 0.01258771, + "epoch": 0.7538253419509996, + "flos": 12646224339840.0, + "grad_norm": 1.6302297616085528, + "language_loss": 0.74427301, + "learning_rate": 6.026150063832111e-07, + "loss": 0.82103658, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09698486, + "step": 12538, + "time_per_iteration": 2.532360076904297 + }, + { + "auxiliary_loss_clip": 0.06404191, + "auxiliary_loss_mlp": 0.01267125, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01256676, + "epoch": 0.7538854652036675, + "flos": 23192783470080.0, + "grad_norm": 1.9550849129782661, + "language_loss": 0.67649639, + "learning_rate": 6.023364033816956e-07, + "loss": 0.75320947, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10455322, + "step": 12539, + "time_per_iteration": 2.5289549827575684 + }, + { + "auxiliary_loss_clip": 0.06399977, + "auxiliary_loss_mlp": 0.01264844, + "balance_loss_clip": 0.06269806, + "balance_loss_mlp": 0.01255296, + "epoch": 0.7539455884563355, + "flos": 23193076959360.0, + "grad_norm": 1.5765955359694397, + "language_loss": 0.74866569, + "learning_rate": 6.020578533797229e-07, + "loss": 0.82531393, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09552002, + "step": 12540, + "time_per_iteration": 2.519505023956299 + }, + { + "auxiliary_loss_clip": 0.06404985, + "auxiliary_loss_mlp": 0.01264812, + "balance_loss_clip": 0.06269932, + "balance_loss_mlp": 0.01254816, + "epoch": 0.7540057117090034, + "flos": 13184998093440.0, + "grad_norm": 1.8443764292717588, + "language_loss": 0.73148596, + "learning_rate": 6.017793563878566e-07, + "loss": 0.80818391, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10003662, + "step": 12541, + "time_per_iteration": 2.4335999488830566 + }, + { + "auxiliary_loss_clip": 0.06404177, + "auxiliary_loss_mlp": 0.0126394, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01254254, + "epoch": 0.7540658349616715, + "flos": 45488561783040.0, + "grad_norm": 1.5152984414319595, + "language_loss": 0.72388256, + "learning_rate": 6.015009124166576e-07, + "loss": 0.80056369, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09680176, + "step": 12542, + "time_per_iteration": 4.1390299797058105 + }, + { + "auxiliary_loss_clip": 0.06397901, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06268644, + "balance_loss_mlp": 0.01254344, + "epoch": 0.7541259582143394, + "flos": 19935754139520.0, + "grad_norm": 2.884156487358873, + "language_loss": 0.84689027, + "learning_rate": 6.012225214766844e-07, + "loss": 0.92351043, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09771729, + "step": 12543, + "time_per_iteration": 2.503478765487671 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.0126353, + "balance_loss_clip": 0.06271526, + "balance_loss_mlp": 0.01253886, + "epoch": 0.7541860814670074, + "flos": 27205521575040.0, + "grad_norm": 2.0819371266250095, + "language_loss": 0.73893505, + "learning_rate": 6.009441835784927e-07, + "loss": 0.81558251, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09643555, + "step": 12544, + "time_per_iteration": 2.5382394790649414 + }, + { + "auxiliary_loss_clip": 0.06402065, + "auxiliary_loss_mlp": 0.01263786, + "balance_loss_clip": 0.06270371, + "balance_loss_mlp": 0.01254505, + "epoch": 0.7542462047196753, + "flos": 21330471749760.0, + "grad_norm": 1.7394409636932977, + "language_loss": 0.68186235, + "learning_rate": 6.006658987326383e-07, + "loss": 0.7585209, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09277344, + "step": 12545, + "time_per_iteration": 3.9819624423980713 + }, + { + "auxiliary_loss_clip": 0.06407365, + "auxiliary_loss_mlp": 0.01263612, + "balance_loss_clip": 0.06273335, + "balance_loss_mlp": 0.01254326, + "epoch": 0.7543063279723433, + "flos": 11944630903680.0, + "grad_norm": 1.6656335194491443, + "language_loss": 0.69190776, + "learning_rate": 6.003876669496728e-07, + "loss": 0.76861751, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09283447, + "step": 12546, + "time_per_iteration": 2.5855300426483154 + }, + { + "auxiliary_loss_clip": 0.06408285, + "auxiliary_loss_mlp": 0.01269444, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.01258423, + "epoch": 0.7543664512250112, + "flos": 22826529273600.0, + "grad_norm": 2.2583251382821268, + "language_loss": 0.73943269, + "learning_rate": 6.00109488240147e-07, + "loss": 0.81620997, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.11022949, + "step": 12547, + "time_per_iteration": 2.5086138248443604 + }, + { + "auxiliary_loss_clip": 0.0640479, + "auxiliary_loss_mlp": 0.01264266, + "balance_loss_clip": 0.06272002, + "balance_loss_mlp": 0.01253943, + "epoch": 0.7544265744776792, + "flos": 20930283849600.0, + "grad_norm": 1.77678899313766, + "language_loss": 0.68066597, + "learning_rate": 5.998313626146099e-07, + "loss": 0.75735652, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10333252, + "step": 12548, + "time_per_iteration": 2.534188747406006 + }, + { + "auxiliary_loss_clip": 0.0640662, + "auxiliary_loss_mlp": 0.01267563, + "balance_loss_clip": 0.06271043, + "balance_loss_mlp": 0.01257811, + "epoch": 0.7544866977303473, + "flos": 15200947474560.0, + "grad_norm": 1.8925592973514778, + "language_loss": 0.87693512, + "learning_rate": 5.995532900836088e-07, + "loss": 0.95367694, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09747314, + "step": 12549, + "time_per_iteration": 2.508145332336426 + }, + { + "auxiliary_loss_clip": 0.06395473, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06269422, + "balance_loss_mlp": 0.01254213, + "epoch": 0.7545468209830152, + "flos": 27090094176000.0, + "grad_norm": 1.707615461244764, + "language_loss": 0.77432424, + "learning_rate": 5.992752706576865e-07, + "loss": 0.85091901, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.09790039, + "step": 12550, + "time_per_iteration": 3.9424808025360107 + }, + { + "auxiliary_loss_clip": 0.06406951, + "auxiliary_loss_mlp": 0.01264837, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01254967, + "epoch": 0.7546069442356832, + "flos": 26879238576000.0, + "grad_norm": 1.4048272187532633, + "language_loss": 0.6982311, + "learning_rate": 5.98997304347386e-07, + "loss": 0.77494895, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09869385, + "step": 12551, + "time_per_iteration": 2.577078342437744 + }, + { + "auxiliary_loss_clip": 0.06402165, + "auxiliary_loss_mlp": 0.01267501, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.0125766, + "epoch": 0.7546670674883511, + "flos": 15748735541760.0, + "grad_norm": 1.8643367564290814, + "language_loss": 0.86457175, + "learning_rate": 5.987193911632487e-07, + "loss": 0.94126844, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09832764, + "step": 12552, + "time_per_iteration": 2.5127792358398438 + }, + { + "auxiliary_loss_clip": 0.06407504, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 0.0627365, + "balance_loss_mlp": 0.01256393, + "epoch": 0.7547271907410191, + "flos": 23484545786880.0, + "grad_norm": 1.6196877851330536, + "language_loss": 0.78280461, + "learning_rate": 5.98441531115812e-07, + "loss": 0.85953569, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09210205, + "step": 12553, + "time_per_iteration": 2.5273962020874023 + }, + { + "auxiliary_loss_clip": 0.06404902, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01254027, + "epoch": 0.754787313993687, + "flos": 31730898159360.0, + "grad_norm": 2.42415612197757, + "language_loss": 0.63542819, + "learning_rate": 5.981637242156135e-07, + "loss": 0.71211898, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.1015625, + "step": 12554, + "time_per_iteration": 2.5882747173309326 + }, + { + "auxiliary_loss_clip": 0.06402658, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271334, + "balance_loss_mlp": 0.01253983, + "epoch": 0.7548474372463551, + "flos": 27570392179200.0, + "grad_norm": 1.504037054855903, + "language_loss": 0.73400116, + "learning_rate": 5.978859704731864e-07, + "loss": 0.81066149, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09393311, + "step": 12555, + "time_per_iteration": 2.539822578430176 + }, + { + "auxiliary_loss_clip": 0.0640943, + "auxiliary_loss_mlp": 0.01263982, + "balance_loss_clip": 0.06275169, + "balance_loss_mlp": 0.01253599, + "epoch": 0.754907560499023, + "flos": 19324752566400.0, + "grad_norm": 1.737792546565587, + "language_loss": 0.78918052, + "learning_rate": 5.976082698990645e-07, + "loss": 0.86591458, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10388184, + "step": 12556, + "time_per_iteration": 2.520672082901001 + }, + { + "auxiliary_loss_clip": 0.06309493, + "auxiliary_loss_mlp": 0.01252888, + "balance_loss_clip": 0.06254127, + "balance_loss_mlp": 0.01251748, + "epoch": 0.754967683751691, + "flos": 69765795993600.0, + "grad_norm": 0.6939528334291757, + "language_loss": 0.50454944, + "learning_rate": 5.973306225037769e-07, + "loss": 0.58017325, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01139832, + "step": 12557, + "time_per_iteration": 3.1293344497680664 + }, + { + "auxiliary_loss_clip": 0.06408815, + "auxiliary_loss_mlp": 0.01264037, + "balance_loss_clip": 0.06273429, + "balance_loss_mlp": 0.01253857, + "epoch": 0.7550278070043589, + "flos": 24428161342080.0, + "grad_norm": 1.622493392306736, + "language_loss": 0.71709013, + "learning_rate": 5.970530282978525e-07, + "loss": 0.79381871, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10186768, + "step": 12558, + "time_per_iteration": 2.5321953296661377 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.0626944, + "balance_loss_mlp": 0.01257726, + "epoch": 0.7550879302570269, + "flos": 32642802144000.0, + "grad_norm": 1.8637892647127214, + "language_loss": 0.80580068, + "learning_rate": 5.967754872918187e-07, + "loss": 0.88249207, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09063721, + "step": 12559, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.06405831, + "auxiliary_loss_mlp": 0.01265308, + "balance_loss_clip": 0.06270069, + "balance_loss_mlp": 0.01255276, + "epoch": 0.7551480535096948, + "flos": 21801461950080.0, + "grad_norm": 1.6337605293226678, + "language_loss": 0.78857327, + "learning_rate": 5.96497999496199e-07, + "loss": 0.86528468, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10021973, + "step": 12560, + "time_per_iteration": 2.5266849994659424 + }, + { + "auxiliary_loss_clip": 0.06401823, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.06271054, + "balance_loss_mlp": 0.01256752, + "epoch": 0.7552081767623628, + "flos": 18521022602880.0, + "grad_norm": 1.579385743882106, + "language_loss": 0.70900261, + "learning_rate": 5.96220564921515e-07, + "loss": 0.78568202, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09362793, + "step": 12561, + "time_per_iteration": 2.4935779571533203 + }, + { + "auxiliary_loss_clip": 0.06401284, + "auxiliary_loss_mlp": 0.01266321, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01256594, + "epoch": 0.7552683000150308, + "flos": 27641949166080.0, + "grad_norm": 1.5637953071800728, + "language_loss": 0.7579698, + "learning_rate": 5.959431835782889e-07, + "loss": 0.83464587, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09735107, + "step": 12562, + "time_per_iteration": 2.5509040355682373 + }, + { + "auxiliary_loss_clip": 0.06403111, + "auxiliary_loss_mlp": 0.01264985, + "balance_loss_clip": 0.06271905, + "balance_loss_mlp": 0.01255144, + "epoch": 0.7553284232676988, + "flos": 20309135932800.0, + "grad_norm": 1.8403167486550738, + "language_loss": 0.75524759, + "learning_rate": 5.956658554770371e-07, + "loss": 0.83192855, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09838867, + "step": 12563, + "time_per_iteration": 2.513921022415161 + }, + { + "auxiliary_loss_clip": 0.06417328, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06274921, + "balance_loss_mlp": 0.01253454, + "epoch": 0.7553885465203668, + "flos": 33263866206720.0, + "grad_norm": 2.816655574793258, + "language_loss": 0.67061448, + "learning_rate": 5.953885806282768e-07, + "loss": 0.7474376, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.11529541, + "step": 12564, + "time_per_iteration": 2.5836448669433594 + }, + { + "auxiliary_loss_clip": 0.06408054, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06272587, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7554486697730347, + "flos": 21622653336960.0, + "grad_norm": 1.6673790511457676, + "language_loss": 0.68740308, + "learning_rate": 5.951113590425228e-07, + "loss": 0.76413709, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10412598, + "step": 12565, + "time_per_iteration": 2.547016143798828 + }, + { + "auxiliary_loss_clip": 0.06408931, + "auxiliary_loss_mlp": 0.01266223, + "balance_loss_clip": 0.06269513, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7555087930257027, + "flos": 27639810887040.0, + "grad_norm": 1.5709631477548602, + "language_loss": 0.74854088, + "learning_rate": 5.94834190730287e-07, + "loss": 0.82529235, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10357666, + "step": 12566, + "time_per_iteration": 2.5360589027404785 + }, + { + "auxiliary_loss_clip": 0.06412722, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 0.0627517, + "balance_loss_mlp": 0.01257399, + "epoch": 0.7555689162783706, + "flos": 23628162885120.0, + "grad_norm": 2.012452039611991, + "language_loss": 0.74581742, + "learning_rate": 5.945570757020789e-07, + "loss": 0.82262623, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10766602, + "step": 12567, + "time_per_iteration": 2.5815160274505615 + }, + { + "auxiliary_loss_clip": 0.06405583, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 0.06273046, + "balance_loss_mlp": 0.01254155, + "epoch": 0.7556290395310387, + "flos": 24869955594240.0, + "grad_norm": 2.2187055340404216, + "language_loss": 0.62846589, + "learning_rate": 5.942800139684073e-07, + "loss": 0.70515835, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09515381, + "step": 12568, + "time_per_iteration": 2.5301473140716553 + }, + { + "auxiliary_loss_clip": 0.06402, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01255471, + "epoch": 0.7556891627837066, + "flos": 43553770680960.0, + "grad_norm": 1.9192871198198145, + "language_loss": 0.66908652, + "learning_rate": 5.940030055397789e-07, + "loss": 0.7457543, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09301758, + "step": 12569, + "time_per_iteration": 2.707559585571289 + }, + { + "auxiliary_loss_clip": 0.06408378, + "auxiliary_loss_mlp": 0.01264951, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7557492860363746, + "flos": 26658110851200.0, + "grad_norm": 2.041017717148161, + "language_loss": 0.67703956, + "learning_rate": 5.93726050426697e-07, + "loss": 0.75377285, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10424805, + "step": 12570, + "time_per_iteration": 2.5359280109405518 + }, + { + "auxiliary_loss_clip": 0.06407271, + "auxiliary_loss_mlp": 0.0126553, + "balance_loss_clip": 0.0627284, + "balance_loss_mlp": 0.01255868, + "epoch": 0.7558094092890425, + "flos": 55194857769600.0, + "grad_norm": 1.6855740351628876, + "language_loss": 0.71908271, + "learning_rate": 5.934491486396647e-07, + "loss": 0.7958107, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09667969, + "step": 12571, + "time_per_iteration": 2.8340237140655518 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01265226, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01255242, + "epoch": 0.7558695325417105, + "flos": 23995171768320.0, + "grad_norm": 1.5360803868989372, + "language_loss": 0.74071586, + "learning_rate": 5.931723001891811e-07, + "loss": 0.81745565, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09985352, + "step": 12572, + "time_per_iteration": 4.078891754150391 + }, + { + "auxiliary_loss_clip": 0.06408488, + "auxiliary_loss_mlp": 0.01264697, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01254981, + "epoch": 0.7559296557943784, + "flos": 14616542373120.0, + "grad_norm": 2.087893523265595, + "language_loss": 0.77022463, + "learning_rate": 5.928955050857456e-07, + "loss": 0.84695649, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.097229, + "step": 12573, + "time_per_iteration": 2.4667983055114746 + }, + { + "auxiliary_loss_clip": 0.06406313, + "auxiliary_loss_mlp": 0.01264981, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01254032, + "epoch": 0.7559897790470465, + "flos": 18556214117760.0, + "grad_norm": 1.6481386316669568, + "language_loss": 0.69339514, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7701081, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10955811, + "step": 12574, + "time_per_iteration": 2.521108627319336 + }, + { + "auxiliary_loss_clip": 0.06401183, + "auxiliary_loss_mlp": 0.0126439, + "balance_loss_clip": 0.0626963, + "balance_loss_mlp": 0.01254532, + "epoch": 0.7560499022997144, + "flos": 17973695733120.0, + "grad_norm": 2.167691196758321, + "language_loss": 0.71799374, + "learning_rate": 5.923420749619974e-07, + "loss": 0.79464948, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09869385, + "step": 12575, + "time_per_iteration": 2.4676809310913086 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01261787, + "balance_loss_clip": 0.0626974, + "balance_loss_mlp": 0.0125222, + "epoch": 0.7561100255523824, + "flos": 15742530339840.0, + "grad_norm": 1.985003709379718, + "language_loss": 0.7146281, + "learning_rate": 5.92065439962673e-07, + "loss": 0.79127514, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09564209, + "step": 12576, + "time_per_iteration": 2.525620937347412 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01265228, + "balance_loss_clip": 0.06271128, + "balance_loss_mlp": 0.0125497, + "epoch": 0.7561701488050504, + "flos": 15893568524160.0, + "grad_norm": 1.7792307856828309, + "language_loss": 0.67103839, + "learning_rate": 5.917888583523669e-07, + "loss": 0.74771613, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10247803, + "step": 12577, + "time_per_iteration": 2.468843936920166 + }, + { + "auxiliary_loss_clip": 0.06400042, + "auxiliary_loss_mlp": 0.01263628, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253978, + "epoch": 0.7562302720577183, + "flos": 20345333696640.0, + "grad_norm": 1.5059365090765435, + "language_loss": 0.78157711, + "learning_rate": 5.915123301415685e-07, + "loss": 0.85821384, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09649658, + "step": 12578, + "time_per_iteration": 2.530263900756836 + }, + { + "auxiliary_loss_clip": 0.0640607, + "auxiliary_loss_mlp": 0.01262105, + "balance_loss_clip": 0.06271346, + "balance_loss_mlp": 0.01251871, + "epoch": 0.7562903953103863, + "flos": 20818252540800.0, + "grad_norm": 1.5853993549027412, + "language_loss": 0.76139581, + "learning_rate": 5.912358553407641e-07, + "loss": 0.83807755, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10229492, + "step": 12579, + "time_per_iteration": 2.507765054702759 + }, + { + "auxiliary_loss_clip": 0.06411377, + "auxiliary_loss_mlp": 0.01264596, + "balance_loss_clip": 0.06272445, + "balance_loss_mlp": 0.01253599, + "epoch": 0.7563505185630542, + "flos": 37606073765760.0, + "grad_norm": 1.7167109835920158, + "language_loss": 0.62744486, + "learning_rate": 5.90959433960437e-07, + "loss": 0.70420462, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11004639, + "step": 12580, + "time_per_iteration": 2.6855556964874268 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06272098, + "balance_loss_mlp": 0.01256355, + "epoch": 0.7564106418157223, + "flos": 20237369310720.0, + "grad_norm": 3.698052227516868, + "language_loss": 0.75504309, + "learning_rate": 5.906830660110691e-07, + "loss": 0.83175057, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10241699, + "step": 12581, + "time_per_iteration": 3.9208571910858154 + }, + { + "auxiliary_loss_clip": 0.06411214, + "auxiliary_loss_mlp": 0.01264654, + "balance_loss_clip": 0.06274357, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7564707650683902, + "flos": 24761949281280.0, + "grad_norm": 1.712129660168012, + "language_loss": 0.63223112, + "learning_rate": 5.904067515031412e-07, + "loss": 0.70898986, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09967041, + "step": 12582, + "time_per_iteration": 2.5469281673431396 + }, + { + "auxiliary_loss_clip": 0.06310637, + "auxiliary_loss_mlp": 0.01252832, + "balance_loss_clip": 0.06255485, + "balance_loss_mlp": 0.01251842, + "epoch": 0.7565308883210582, + "flos": 48544965711360.0, + "grad_norm": 0.9271563619933442, + "language_loss": 0.60731697, + "learning_rate": 5.901304904471307e-07, + "loss": 0.68295169, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.55322266, + "router_z_loss_mlp": 0.00989532, + "step": 12583, + "time_per_iteration": 2.8734805583953857 + }, + { + "auxiliary_loss_clip": 0.06408859, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06275302, + "balance_loss_mlp": 0.01255792, + "epoch": 0.7565910115737261, + "flos": 12500007765120.0, + "grad_norm": 1.9446553716026287, + "language_loss": 0.7914691, + "learning_rate": 5.898542828535125e-07, + "loss": 0.8682155, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09985352, + "step": 12584, + "time_per_iteration": 2.5946009159088135 + }, + { + "auxiliary_loss_clip": 0.06402295, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06272683, + "balance_loss_mlp": 0.01254559, + "epoch": 0.7566511348263941, + "flos": 21178427316480.0, + "grad_norm": 5.075260482718231, + "language_loss": 0.7806747, + "learning_rate": 5.895781287327612e-07, + "loss": 0.85734189, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09857178, + "step": 12585, + "time_per_iteration": 4.006917953491211 + }, + { + "auxiliary_loss_clip": 0.06406915, + "auxiliary_loss_mlp": 0.01263646, + "balance_loss_clip": 0.06271342, + "balance_loss_mlp": 0.01253609, + "epoch": 0.756711258079062, + "flos": 21760023306240.0, + "grad_norm": 1.5685604080996611, + "language_loss": 0.83183873, + "learning_rate": 5.893020280953493e-07, + "loss": 0.9085443, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1003418, + "step": 12586, + "time_per_iteration": 2.4981296062469482 + }, + { + "auxiliary_loss_clip": 0.06409433, + "auxiliary_loss_mlp": 0.01265703, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01255487, + "epoch": 0.75677138133173, + "flos": 22389514704000.0, + "grad_norm": 2.1588778105399116, + "language_loss": 0.83529806, + "learning_rate": 5.890259809517459e-07, + "loss": 0.91204941, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10223389, + "step": 12587, + "time_per_iteration": 2.5264017581939697 + }, + { + "auxiliary_loss_clip": 0.06405166, + "auxiliary_loss_mlp": 0.01262614, + "balance_loss_clip": 0.06272217, + "balance_loss_mlp": 0.01252356, + "epoch": 0.756831504584398, + "flos": 22715252651520.0, + "grad_norm": 1.5206694910339098, + "language_loss": 0.71336639, + "learning_rate": 5.88749987312418e-07, + "loss": 0.79004425, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1026001, + "step": 12588, + "time_per_iteration": 2.522880792617798 + }, + { + "auxiliary_loss_clip": 0.06410505, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01253777, + "epoch": 0.756891627837066, + "flos": 24105358287360.0, + "grad_norm": 1.8052754527396453, + "language_loss": 0.69118118, + "learning_rate": 5.884740471878327e-07, + "loss": 0.76792806, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10412598, + "step": 12589, + "time_per_iteration": 2.543221950531006 + }, + { + "auxiliary_loss_clip": 0.06404439, + "auxiliary_loss_mlp": 0.01269435, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01259499, + "epoch": 0.756951751089734, + "flos": 19754010633600.0, + "grad_norm": 1.742132882513342, + "language_loss": 0.92203468, + "learning_rate": 5.881981605884522e-07, + "loss": 0.99877346, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09942627, + "step": 12590, + "time_per_iteration": 3.913285732269287 + }, + { + "auxiliary_loss_clip": 0.06402917, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01253092, + "epoch": 0.7570118743424019, + "flos": 35087883811200.0, + "grad_norm": 1.7860803954634257, + "language_loss": 0.65924931, + "learning_rate": 5.879223275247391e-07, + "loss": 0.7359159, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10644531, + "step": 12591, + "time_per_iteration": 2.6003847122192383 + }, + { + "auxiliary_loss_clip": 0.06403872, + "auxiliary_loss_mlp": 0.01263019, + "balance_loss_clip": 0.06273251, + "balance_loss_mlp": 0.01253667, + "epoch": 0.7570719975950699, + "flos": 25601835081600.0, + "grad_norm": 1.452450221530786, + "language_loss": 0.73701084, + "learning_rate": 5.876465480071528e-07, + "loss": 0.81367981, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09356689, + "step": 12592, + "time_per_iteration": 2.5929007530212402 + }, + { + "auxiliary_loss_clip": 0.06405754, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06270353, + "balance_loss_mlp": 0.01257165, + "epoch": 0.7571321208477378, + "flos": 10820781216000.0, + "grad_norm": 2.164551759300356, + "language_loss": 0.71882141, + "learning_rate": 5.873708220461522e-07, + "loss": 0.79554784, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09741211, + "step": 12593, + "time_per_iteration": 2.4659135341644287 + }, + { + "auxiliary_loss_clip": 0.0640605, + "auxiliary_loss_mlp": 0.01263408, + "balance_loss_clip": 0.06271473, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7571922441004059, + "flos": 18266045028480.0, + "grad_norm": 1.7009854752836593, + "language_loss": 0.66789973, + "learning_rate": 5.870951496521903e-07, + "loss": 0.74459434, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10192871, + "step": 12594, + "time_per_iteration": 2.6039915084838867 + }, + { + "auxiliary_loss_clip": 0.06412069, + "auxiliary_loss_mlp": 0.01266946, + "balance_loss_clip": 0.06273807, + "balance_loss_mlp": 0.01256599, + "epoch": 0.7572523673530738, + "flos": 22896660741120.0, + "grad_norm": 1.6054592725551893, + "language_loss": 0.80899853, + "learning_rate": 5.86819530835722e-07, + "loss": 0.88578868, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10339355, + "step": 12595, + "time_per_iteration": 2.571235179901123 + }, + { + "auxiliary_loss_clip": 0.06404546, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06273299, + "balance_loss_mlp": 0.01259166, + "epoch": 0.7573124906057418, + "flos": 21002679377280.0, + "grad_norm": 1.9975391540186431, + "language_loss": 0.71918476, + "learning_rate": 5.865439656071993e-07, + "loss": 0.7959137, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09185791, + "step": 12596, + "time_per_iteration": 2.551135301589966 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01266132, + "balance_loss_clip": 0.06271507, + "balance_loss_mlp": 0.0125737, + "epoch": 0.7573726138584097, + "flos": 20892534785280.0, + "grad_norm": 1.4422973158795673, + "language_loss": 0.80943167, + "learning_rate": 5.862684539770706e-07, + "loss": 0.8861059, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08764648, + "step": 12597, + "time_per_iteration": 2.4924709796905518 + }, + { + "auxiliary_loss_clip": 0.06410646, + "auxiliary_loss_mlp": 0.01265912, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.01255076, + "epoch": 0.7574327371110777, + "flos": 24536628852480.0, + "grad_norm": 1.549330306362407, + "language_loss": 0.83572793, + "learning_rate": 5.859929959557835e-07, + "loss": 0.91249353, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10839844, + "step": 12598, + "time_per_iteration": 2.5620381832122803 + }, + { + "auxiliary_loss_clip": 0.0640049, + "auxiliary_loss_mlp": 0.01265859, + "balance_loss_clip": 0.06269588, + "balance_loss_mlp": 0.01256656, + "epoch": 0.7574928603637456, + "flos": 23370711615360.0, + "grad_norm": 1.5128329006829742, + "language_loss": 0.62814438, + "learning_rate": 5.857175915537845e-07, + "loss": 0.70480788, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 12599, + "time_per_iteration": 2.517794132232666 + }, + { + "auxiliary_loss_clip": 0.06412463, + "auxiliary_loss_mlp": 0.01264733, + "balance_loss_clip": 0.06273595, + "balance_loss_mlp": 0.01253641, + "epoch": 0.7575529836164137, + "flos": 13521301655040.0, + "grad_norm": 2.5096070763269047, + "language_loss": 0.63904691, + "learning_rate": 5.854422407815161e-07, + "loss": 0.71581882, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11096191, + "step": 12600, + "time_per_iteration": 2.4784600734710693 + }, + { + "auxiliary_loss_clip": 0.06401792, + "auxiliary_loss_mlp": 0.01265717, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01255709, + "epoch": 0.7576131068690816, + "flos": 19652754574080.0, + "grad_norm": 1.7462695207740195, + "language_loss": 0.66372097, + "learning_rate": 5.851669436494191e-07, + "loss": 0.74039608, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10003662, + "step": 12601, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06400197, + "auxiliary_loss_mlp": 0.01265733, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.01256429, + "epoch": 0.7576732301217496, + "flos": 20054535701760.0, + "grad_norm": 2.2130741302051904, + "language_loss": 0.68382788, + "learning_rate": 5.848917001679335e-07, + "loss": 0.7604872, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09301758, + "step": 12602, + "time_per_iteration": 2.49818754196167 + }, + { + "auxiliary_loss_clip": 0.0640595, + "auxiliary_loss_mlp": 0.0126578, + "balance_loss_clip": 0.06273246, + "balance_loss_mlp": 0.01255373, + "epoch": 0.7577333533744176, + "flos": 15382439418240.0, + "grad_norm": 1.7531421277811328, + "language_loss": 0.67018741, + "learning_rate": 5.846165103474967e-07, + "loss": 0.74690473, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10412598, + "step": 12603, + "time_per_iteration": 2.4679315090179443 + }, + { + "auxiliary_loss_clip": 0.06399174, + "auxiliary_loss_mlp": 0.0126693, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01257441, + "epoch": 0.7577934766270855, + "flos": 17900671299840.0, + "grad_norm": 2.0091560992358417, + "language_loss": 0.62072337, + "learning_rate": 5.843413741985439e-07, + "loss": 0.69738448, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09484863, + "step": 12604, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.01268866, + "balance_loss_clip": 0.0627261, + "balance_loss_mlp": 0.01258256, + "epoch": 0.7578535998797535, + "flos": 21619760371200.0, + "grad_norm": 1.8724094104834093, + "language_loss": 0.80161738, + "learning_rate": 5.840662917315076e-07, + "loss": 0.87835866, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10614014, + "step": 12605, + "time_per_iteration": 2.4841203689575195 + }, + { + "auxiliary_loss_clip": 0.06405874, + "auxiliary_loss_mlp": 0.01267443, + "balance_loss_clip": 0.06269677, + "balance_loss_mlp": 0.01256863, + "epoch": 0.7579137231324214, + "flos": 18484237860480.0, + "grad_norm": 2.5250222349386866, + "language_loss": 0.80021864, + "learning_rate": 5.837912629568198e-07, + "loss": 0.87695181, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10577393, + "step": 12606, + "time_per_iteration": 2.4846410751342773 + }, + { + "auxiliary_loss_clip": 0.06398265, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01254048, + "epoch": 0.7579738463850895, + "flos": 23261195928960.0, + "grad_norm": 1.3978882073919028, + "language_loss": 0.73257685, + "learning_rate": 5.835162878849087e-07, + "loss": 0.8091861, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08612061, + "step": 12607, + "time_per_iteration": 2.5159242153167725 + }, + { + "auxiliary_loss_clip": 0.06412238, + "auxiliary_loss_mlp": 0.01270562, + "balance_loss_clip": 0.06273781, + "balance_loss_mlp": 0.01260798, + "epoch": 0.7580339696377574, + "flos": 14032137271680.0, + "grad_norm": 1.9743130927740786, + "language_loss": 0.74911094, + "learning_rate": 5.83241366526202e-07, + "loss": 0.82593894, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.09759521, + "step": 12608, + "time_per_iteration": 2.497614622116089 + }, + { + "auxiliary_loss_clip": 0.06404, + "auxiliary_loss_mlp": 0.01265498, + "balance_loss_clip": 0.06272872, + "balance_loss_mlp": 0.01255335, + "epoch": 0.7580940928904254, + "flos": 25089825507840.0, + "grad_norm": 1.4850994343846526, + "language_loss": 0.71440935, + "learning_rate": 5.829664988911245e-07, + "loss": 0.79110432, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10162354, + "step": 12609, + "time_per_iteration": 2.5046613216400146 + }, + { + "auxiliary_loss_clip": 0.06403238, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01255149, + "epoch": 0.7581542161430933, + "flos": 23842288794240.0, + "grad_norm": 1.5362768058581475, + "language_loss": 0.81678033, + "learning_rate": 5.826916849901007e-07, + "loss": 0.89347494, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.11071777, + "step": 12610, + "time_per_iteration": 2.517946243286133 + }, + { + "auxiliary_loss_clip": 0.06408758, + "auxiliary_loss_mlp": 0.01262988, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01252921, + "epoch": 0.7582143393957613, + "flos": 22243591618560.0, + "grad_norm": 1.594141702958548, + "language_loss": 0.70561087, + "learning_rate": 5.824169248335488e-07, + "loss": 0.78232837, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10070801, + "step": 12611, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.06402324, + "auxiliary_loss_mlp": 0.01265013, + "balance_loss_clip": 0.0626975, + "balance_loss_mlp": 0.0125516, + "epoch": 0.7582744626484292, + "flos": 21112865896320.0, + "grad_norm": 1.5348173916293948, + "language_loss": 0.70921582, + "learning_rate": 5.821422184318893e-07, + "loss": 0.78588921, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09857178, + "step": 12612, + "time_per_iteration": 3.989048719406128 + }, + { + "auxiliary_loss_clip": 0.06410398, + "auxiliary_loss_mlp": 0.01264672, + "balance_loss_clip": 0.06273097, + "balance_loss_mlp": 0.01254641, + "epoch": 0.7583345859010973, + "flos": 24611120732160.0, + "grad_norm": 1.3541649077655429, + "language_loss": 0.60250545, + "learning_rate": 5.818675657955397e-07, + "loss": 0.6792562, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10028076, + "step": 12613, + "time_per_iteration": 2.5280654430389404 + }, + { + "auxiliary_loss_clip": 0.06406002, + "auxiliary_loss_mlp": 0.01265434, + "balance_loss_clip": 0.06272647, + "balance_loss_mlp": 0.01255367, + "epoch": 0.7583947091537652, + "flos": 33555167326080.0, + "grad_norm": 1.434876816663814, + "language_loss": 0.60180938, + "learning_rate": 5.815929669349135e-07, + "loss": 0.67852372, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10064697, + "step": 12614, + "time_per_iteration": 2.6500730514526367 + }, + { + "auxiliary_loss_clip": 0.06408043, + "auxiliary_loss_mlp": 0.01264127, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01253976, + "epoch": 0.7584548324064332, + "flos": 20127266645760.0, + "grad_norm": 1.6646286333989884, + "language_loss": 0.73613036, + "learning_rate": 5.813184218604246e-07, + "loss": 0.81285203, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1015625, + "step": 12615, + "time_per_iteration": 2.5028393268585205 + }, + { + "auxiliary_loss_clip": 0.06306437, + "auxiliary_loss_mlp": 0.01253251, + "balance_loss_clip": 0.06250888, + "balance_loss_mlp": 0.01251755, + "epoch": 0.7585149556591012, + "flos": 70424064069120.0, + "grad_norm": 0.8421080448004001, + "language_loss": 0.67521149, + "learning_rate": 5.810439305824828e-07, + "loss": 0.75080836, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01496124, + "step": 12616, + "time_per_iteration": 3.1849849224090576 + }, + { + "auxiliary_loss_clip": 0.06408077, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06270512, + "balance_loss_mlp": 0.01252779, + "epoch": 0.7585750789117691, + "flos": 16149342712320.0, + "grad_norm": 1.7878130457508898, + "language_loss": 0.84241217, + "learning_rate": 5.807694931114979e-07, + "loss": 0.9191215, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10076904, + "step": 12617, + "time_per_iteration": 2.4973013401031494 + }, + { + "auxiliary_loss_clip": 0.06407297, + "auxiliary_loss_mlp": 0.01262597, + "balance_loss_clip": 0.06272709, + "balance_loss_mlp": 0.01253257, + "epoch": 0.7586352021644371, + "flos": 17498848245120.0, + "grad_norm": 2.3587408181523544, + "language_loss": 0.74931777, + "learning_rate": 5.804951094578757e-07, + "loss": 0.82601666, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09338379, + "step": 12618, + "time_per_iteration": 2.494654417037964 + }, + { + "auxiliary_loss_clip": 0.06410335, + "auxiliary_loss_mlp": 0.01262457, + "balance_loss_clip": 0.06271516, + "balance_loss_mlp": 0.01251967, + "epoch": 0.758695325417105, + "flos": 17280990829440.0, + "grad_norm": 2.0665265442485485, + "language_loss": 0.77541107, + "learning_rate": 5.802207796320209e-07, + "loss": 0.852139, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1048584, + "step": 12619, + "time_per_iteration": 2.5350186824798584 + }, + { + "auxiliary_loss_clip": 0.06403962, + "auxiliary_loss_mlp": 0.01265943, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.01255751, + "epoch": 0.7587554486697731, + "flos": 29503128856320.0, + "grad_norm": 1.7154948098726508, + "language_loss": 0.82232845, + "learning_rate": 5.79946503644337e-07, + "loss": 0.89902753, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10180664, + "step": 12620, + "time_per_iteration": 2.5445215702056885 + }, + { + "auxiliary_loss_clip": 0.06409103, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 0.06271064, + "balance_loss_mlp": 0.0125651, + "epoch": 0.758815571922441, + "flos": 16105262664960.0, + "grad_norm": 2.254667976985654, + "language_loss": 0.82809436, + "learning_rate": 5.796722815052242e-07, + "loss": 0.90486276, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11236572, + "step": 12621, + "time_per_iteration": 3.918266534805298 + }, + { + "auxiliary_loss_clip": 0.0640413, + "auxiliary_loss_mlp": 0.01267456, + "balance_loss_clip": 0.06271367, + "balance_loss_mlp": 0.01257717, + "epoch": 0.758875695175109, + "flos": 16149258858240.0, + "grad_norm": 1.986087185770293, + "language_loss": 0.73904622, + "learning_rate": 5.7939811322508e-07, + "loss": 0.81576204, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12622, + "time_per_iteration": 2.4622373580932617 + }, + { + "auxiliary_loss_clip": 0.06310892, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 0.06255639, + "balance_loss_mlp": 0.01252096, + "epoch": 0.7589358184277769, + "flos": 68482019589120.0, + "grad_norm": 0.8176590581901009, + "language_loss": 0.60799408, + "learning_rate": 5.791239988143024e-07, + "loss": 0.68363619, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01221466, + "step": 12623, + "time_per_iteration": 3.143218755722046 + }, + { + "auxiliary_loss_clip": 0.06401753, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 0.06271981, + "balance_loss_mlp": 0.01254349, + "epoch": 0.7589959416804449, + "flos": 20053445598720.0, + "grad_norm": 1.8387445657701582, + "language_loss": 0.67715496, + "learning_rate": 5.788499382832847e-07, + "loss": 0.75380242, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08636475, + "step": 12624, + "time_per_iteration": 3.9293882846832275 + }, + { + "auxiliary_loss_clip": 0.06401351, + "auxiliary_loss_mlp": 0.01266658, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01257038, + "epoch": 0.7590560649331128, + "flos": 18777970748160.0, + "grad_norm": 1.6859497284261105, + "language_loss": 0.76178044, + "learning_rate": 5.785759316424196e-07, + "loss": 0.83846056, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09625244, + "step": 12625, + "time_per_iteration": 2.4780449867248535 + }, + { + "auxiliary_loss_clip": 0.06401481, + "auxiliary_loss_mlp": 0.01264022, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.0125383, + "epoch": 0.7591161881857809, + "flos": 29833017580800.0, + "grad_norm": 1.7327397977395311, + "language_loss": 0.63387203, + "learning_rate": 5.783019789020977e-07, + "loss": 0.71052712, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10198975, + "step": 12626, + "time_per_iteration": 2.5631775856018066 + }, + { + "auxiliary_loss_clip": 0.06407394, + "auxiliary_loss_mlp": 0.01265555, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01255715, + "epoch": 0.7591763114384488, + "flos": 20308884370560.0, + "grad_norm": 1.7841706388815284, + "language_loss": 0.74468005, + "learning_rate": 5.780280800727084e-07, + "loss": 0.82140952, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09844971, + "step": 12627, + "time_per_iteration": 2.469609260559082 + }, + { + "auxiliary_loss_clip": 0.06408302, + "auxiliary_loss_mlp": 0.012668, + "balance_loss_clip": 0.06272177, + "balance_loss_mlp": 0.01257412, + "epoch": 0.7592364346911168, + "flos": 20819887695360.0, + "grad_norm": 2.5677146388224728, + "language_loss": 0.69222355, + "learning_rate": 5.777542351646356e-07, + "loss": 0.76897466, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09387207, + "step": 12628, + "time_per_iteration": 2.520756483078003 + }, + { + "auxiliary_loss_clip": 0.06418896, + "auxiliary_loss_mlp": 0.01268483, + "balance_loss_clip": 0.06277822, + "balance_loss_mlp": 0.01257951, + "epoch": 0.7592965579437848, + "flos": 21257866586880.0, + "grad_norm": 2.617063400341695, + "language_loss": 0.62842494, + "learning_rate": 5.774804441882648e-07, + "loss": 0.70529878, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.10528564, + "step": 12629, + "time_per_iteration": 3.9617972373962402 + }, + { + "auxiliary_loss_clip": 0.06400847, + "auxiliary_loss_mlp": 0.01264471, + "balance_loss_clip": 0.06271888, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7593566811964527, + "flos": 26220802792320.0, + "grad_norm": 1.4187303097446593, + "language_loss": 0.7784214, + "learning_rate": 5.772067071539786e-07, + "loss": 0.85507464, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09844971, + "step": 12630, + "time_per_iteration": 2.5400242805480957 + }, + { + "auxiliary_loss_clip": 0.0631338, + "auxiliary_loss_mlp": 0.01256151, + "balance_loss_clip": 0.06257843, + "balance_loss_mlp": 0.01255109, + "epoch": 0.7594168044491207, + "flos": 71258122010880.0, + "grad_norm": 0.8178625518129599, + "language_loss": 0.61609149, + "learning_rate": 5.769330240721562e-07, + "loss": 0.69178677, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01042175, + "step": 12631, + "time_per_iteration": 3.2121753692626953 + }, + { + "auxiliary_loss_clip": 0.06412616, + "auxiliary_loss_mlp": 0.0126774, + "balance_loss_clip": 0.06273548, + "balance_loss_mlp": 0.01256188, + "epoch": 0.7594769277017887, + "flos": 26620319859840.0, + "grad_norm": 1.723696706430517, + "language_loss": 0.74189103, + "learning_rate": 5.766593949531767e-07, + "loss": 0.81869459, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11547852, + "step": 12632, + "time_per_iteration": 2.633206605911255 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 0.06272458, + "balance_loss_mlp": 0.01252743, + "epoch": 0.7595370509544567, + "flos": 17600523575040.0, + "grad_norm": 1.7631507541187388, + "language_loss": 0.75345957, + "learning_rate": 5.763858198074154e-07, + "loss": 0.83016121, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.1050415, + "step": 12633, + "time_per_iteration": 2.4908735752105713 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06271268, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7595971742071246, + "flos": 18008551831680.0, + "grad_norm": 1.9259614725215357, + "language_loss": 0.73589694, + "learning_rate": 5.76112298645246e-07, + "loss": 0.81258494, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09240723, + "step": 12634, + "time_per_iteration": 2.463972330093384 + }, + { + "auxiliary_loss_clip": 0.06401845, + "auxiliary_loss_mlp": 0.01266383, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01256715, + "epoch": 0.7596572974597926, + "flos": 28847921454720.0, + "grad_norm": 1.6183361542433332, + "language_loss": 0.65202701, + "learning_rate": 5.758388314770408e-07, + "loss": 0.72870934, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09661865, + "step": 12635, + "time_per_iteration": 2.5608267784118652 + }, + { + "auxiliary_loss_clip": 0.06408376, + "auxiliary_loss_mlp": 0.01262438, + "balance_loss_clip": 0.06272096, + "balance_loss_mlp": 0.01252252, + "epoch": 0.7597174207124605, + "flos": 14288037240960.0, + "grad_norm": 1.6247637528825494, + "language_loss": 0.69144988, + "learning_rate": 5.7556541831317e-07, + "loss": 0.76815796, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10186768, + "step": 12636, + "time_per_iteration": 2.4801905155181885 + }, + { + "auxiliary_loss_clip": 0.0640962, + "auxiliary_loss_mlp": 0.01262748, + "balance_loss_clip": 0.06271771, + "balance_loss_mlp": 0.01252103, + "epoch": 0.7597775439651285, + "flos": 21695300426880.0, + "grad_norm": 1.9394255431745338, + "language_loss": 0.81419599, + "learning_rate": 5.752920591640018e-07, + "loss": 0.89091963, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10650635, + "step": 12637, + "time_per_iteration": 2.535862922668457 + }, + { + "auxiliary_loss_clip": 0.06405479, + "auxiliary_loss_mlp": 0.01261246, + "balance_loss_clip": 0.06269705, + "balance_loss_mlp": 0.01251781, + "epoch": 0.7598376672177964, + "flos": 36110100096000.0, + "grad_norm": 1.8287091414841325, + "language_loss": 0.66797674, + "learning_rate": 5.750187540399017e-07, + "loss": 0.74464405, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09460449, + "step": 12638, + "time_per_iteration": 2.620074987411499 + }, + { + "auxiliary_loss_clip": 0.06408533, + "auxiliary_loss_mlp": 0.01265156, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01254135, + "epoch": 0.7598977904704645, + "flos": 18338147066880.0, + "grad_norm": 2.2175642348047746, + "language_loss": 0.65482736, + "learning_rate": 5.747455029512323e-07, + "loss": 0.73156428, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.11022949, + "step": 12639, + "time_per_iteration": 2.495577096939087 + }, + { + "auxiliary_loss_clip": 0.06406114, + "auxiliary_loss_mlp": 0.01266924, + "balance_loss_clip": 0.06273108, + "balance_loss_mlp": 0.0125706, + "epoch": 0.7599579137231324, + "flos": 20198697851520.0, + "grad_norm": 2.4320385733819814, + "language_loss": 0.69979274, + "learning_rate": 5.744723059083572e-07, + "loss": 0.77652305, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09863281, + "step": 12640, + "time_per_iteration": 2.5001392364501953 + }, + { + "auxiliary_loss_clip": 0.06408872, + "auxiliary_loss_mlp": 0.01266047, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01254788, + "epoch": 0.7600180369758004, + "flos": 24031746875520.0, + "grad_norm": 1.6154408738671377, + "language_loss": 0.66895354, + "learning_rate": 5.741991629216343e-07, + "loss": 0.74570274, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11260986, + "step": 12641, + "time_per_iteration": 2.5159339904785156 + }, + { + "auxiliary_loss_clip": 0.064065, + "auxiliary_loss_mlp": 0.01265385, + "balance_loss_clip": 0.06269056, + "balance_loss_mlp": 0.01254865, + "epoch": 0.7600781602284684, + "flos": 18995534674560.0, + "grad_norm": 2.038376474313416, + "language_loss": 0.6667732, + "learning_rate": 5.73926074001422e-07, + "loss": 0.74349207, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10522461, + "step": 12642, + "time_per_iteration": 2.4950852394104004 + }, + { + "auxiliary_loss_clip": 0.06405585, + "auxiliary_loss_mlp": 0.01265846, + "balance_loss_clip": 0.0627634, + "balance_loss_mlp": 0.01256571, + "epoch": 0.7601382834811363, + "flos": 26074670071680.0, + "grad_norm": 1.8779608812077913, + "language_loss": 0.75724566, + "learning_rate": 5.736530391580765e-07, + "loss": 0.83396, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0927124, + "step": 12643, + "time_per_iteration": 2.660304069519043 + }, + { + "auxiliary_loss_clip": 0.06411186, + "auxiliary_loss_mlp": 0.01265406, + "balance_loss_clip": 0.06275575, + "balance_loss_mlp": 0.01254219, + "epoch": 0.7601984067338043, + "flos": 18850324348800.0, + "grad_norm": 1.8216194715113248, + "language_loss": 0.78901958, + "learning_rate": 5.733800584019508e-07, + "loss": 0.86578548, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11187744, + "step": 12644, + "time_per_iteration": 2.513680934906006 + }, + { + "auxiliary_loss_clip": 0.06404514, + "auxiliary_loss_mlp": 0.01261707, + "balance_loss_clip": 0.06268981, + "balance_loss_mlp": 0.01251801, + "epoch": 0.7602585299864723, + "flos": 24653607552000.0, + "grad_norm": 1.4015203810474768, + "language_loss": 0.807042, + "learning_rate": 5.731071317433957e-07, + "loss": 0.88370419, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09912109, + "step": 12645, + "time_per_iteration": 2.7170186042785645 + }, + { + "auxiliary_loss_clip": 0.06406523, + "auxiliary_loss_mlp": 0.01267162, + "balance_loss_clip": 0.06271391, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7603186532391403, + "flos": 23848913266560.0, + "grad_norm": 1.4313892113151905, + "language_loss": 0.7345466, + "learning_rate": 5.728342591927611e-07, + "loss": 0.81128347, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1038208, + "step": 12646, + "time_per_iteration": 2.7041969299316406 + }, + { + "auxiliary_loss_clip": 0.06405969, + "auxiliary_loss_mlp": 0.01267521, + "balance_loss_clip": 0.06275387, + "balance_loss_mlp": 0.0125842, + "epoch": 0.7603787764918082, + "flos": 22206387605760.0, + "grad_norm": 1.8247890758149474, + "language_loss": 0.67541718, + "learning_rate": 5.725614407603949e-07, + "loss": 0.75215209, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09100342, + "step": 12647, + "time_per_iteration": 2.631646156311035 + }, + { + "auxiliary_loss_clip": 0.06309351, + "auxiliary_loss_mlp": 0.01254415, + "balance_loss_clip": 0.06253824, + "balance_loss_mlp": 0.01253126, + "epoch": 0.7604388997444762, + "flos": 54104549713920.0, + "grad_norm": 0.6718107108151633, + "language_loss": 0.48995575, + "learning_rate": 5.722886764566415e-07, + "loss": 0.56559336, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01289368, + "step": 12648, + "time_per_iteration": 3.0884687900543213 + }, + { + "auxiliary_loss_clip": 0.06397881, + "auxiliary_loss_mlp": 0.01264414, + "balance_loss_clip": 0.06268241, + "balance_loss_mlp": 0.01255801, + "epoch": 0.7604990229971441, + "flos": 19687904161920.0, + "grad_norm": 1.3891263247246097, + "language_loss": 0.76770478, + "learning_rate": 5.720159662918451e-07, + "loss": 0.84432769, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08612061, + "step": 12649, + "time_per_iteration": 2.4948225021362305 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01263835, + "balance_loss_clip": 0.06269015, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7605591462498121, + "flos": 25234993906560.0, + "grad_norm": 1.5285209228148775, + "language_loss": 0.6904434, + "learning_rate": 5.717433102763462e-07, + "loss": 0.76708949, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09332275, + "step": 12650, + "time_per_iteration": 2.5328054428100586 + }, + { + "auxiliary_loss_clip": 0.06313049, + "auxiliary_loss_mlp": 0.01254535, + "balance_loss_clip": 0.06257538, + "balance_loss_mlp": 0.01253279, + "epoch": 0.76061926950248, + "flos": 66803505799680.0, + "grad_norm": 0.7352332079053004, + "language_loss": 0.62801003, + "learning_rate": 5.714707084204838e-07, + "loss": 0.70368588, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01255798, + "step": 12651, + "time_per_iteration": 4.553870916366577 + }, + { + "auxiliary_loss_clip": 0.06400903, + "auxiliary_loss_mlp": 0.01266142, + "balance_loss_clip": 0.06269742, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7606793927551481, + "flos": 25345473914880.0, + "grad_norm": 1.3627527735409288, + "language_loss": 0.71875393, + "learning_rate": 5.711981607345951e-07, + "loss": 0.79542446, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09515381, + "step": 12652, + "time_per_iteration": 2.5254390239715576 + }, + { + "auxiliary_loss_clip": 0.06403194, + "auxiliary_loss_mlp": 0.0126807, + "balance_loss_clip": 0.06270062, + "balance_loss_mlp": 0.01258229, + "epoch": 0.760739516007816, + "flos": 18229553775360.0, + "grad_norm": 1.992377129366734, + "language_loss": 0.80116236, + "learning_rate": 5.709256672290152e-07, + "loss": 0.87787497, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09838867, + "step": 12653, + "time_per_iteration": 2.475878953933716 + }, + { + "auxiliary_loss_clip": 0.06406933, + "auxiliary_loss_mlp": 0.01265577, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01255248, + "epoch": 0.760799639260484, + "flos": 22564717591680.0, + "grad_norm": 1.5079651219958228, + "language_loss": 0.80019051, + "learning_rate": 5.706532279140785e-07, + "loss": 0.87691557, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10327148, + "step": 12654, + "time_per_iteration": 2.4968621730804443 + }, + { + "auxiliary_loss_clip": 0.06408094, + "auxiliary_loss_mlp": 0.01268021, + "balance_loss_clip": 0.0627185, + "balance_loss_mlp": 0.01256953, + "epoch": 0.760859762513152, + "flos": 22315819438080.0, + "grad_norm": 2.0930481497067968, + "language_loss": 0.79525441, + "learning_rate": 5.703808428001136e-07, + "loss": 0.87201554, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12655, + "time_per_iteration": 2.5296621322631836 + }, + { + "auxiliary_loss_clip": 0.06400845, + "auxiliary_loss_mlp": 0.01263727, + "balance_loss_clip": 0.06271712, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7609198857658199, + "flos": 24870919916160.0, + "grad_norm": 1.5227214319467992, + "language_loss": 0.68902338, + "learning_rate": 5.701085118974505e-07, + "loss": 0.76566911, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08068848, + "step": 12656, + "time_per_iteration": 2.541064739227295 + }, + { + "auxiliary_loss_clip": 0.06410336, + "auxiliary_loss_mlp": 0.01267534, + "balance_loss_clip": 0.06272005, + "balance_loss_mlp": 0.01256913, + "epoch": 0.760980009018488, + "flos": 16842424959360.0, + "grad_norm": 2.207190684629195, + "language_loss": 0.73558354, + "learning_rate": 5.698362352164164e-07, + "loss": 0.81236219, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10632324, + "step": 12657, + "time_per_iteration": 2.492959499359131 + }, + { + "auxiliary_loss_clip": 0.06312352, + "auxiliary_loss_mlp": 0.01255494, + "balance_loss_clip": 0.06256969, + "balance_loss_mlp": 0.01254303, + "epoch": 0.7610401322711559, + "flos": 61248198355200.0, + "grad_norm": 0.8387316949065597, + "language_loss": 0.65017879, + "learning_rate": 5.695640127673347e-07, + "loss": 0.7258572, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01189423, + "step": 12658, + "time_per_iteration": 3.0756664276123047 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01255691, + "epoch": 0.7611002555238239, + "flos": 19645920466560.0, + "grad_norm": 1.5440041293540654, + "language_loss": 0.7962606, + "learning_rate": 5.692918445605293e-07, + "loss": 0.87289846, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.1036377, + "step": 12659, + "time_per_iteration": 2.5428194999694824 + }, + { + "auxiliary_loss_clip": 0.0640292, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06270297, + "balance_loss_mlp": 0.01255138, + "epoch": 0.7611603787764918, + "flos": 26879825554560.0, + "grad_norm": 1.4756646122445365, + "language_loss": 0.69142807, + "learning_rate": 5.690197306063209e-07, + "loss": 0.76810235, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12660, + "time_per_iteration": 4.065267086029053 + }, + { + "auxiliary_loss_clip": 0.06405179, + "auxiliary_loss_mlp": 0.01264177, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01254759, + "epoch": 0.7612205020291598, + "flos": 27351570441600.0, + "grad_norm": 1.631280435549901, + "language_loss": 0.70831662, + "learning_rate": 5.687476709150281e-07, + "loss": 0.78501016, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09423828, + "step": 12661, + "time_per_iteration": 2.541351079940796 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01265085, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01255447, + "epoch": 0.7612806252818277, + "flos": 29322265818240.0, + "grad_norm": 1.4447529833958312, + "language_loss": 0.84105158, + "learning_rate": 5.68475665496966e-07, + "loss": 0.91775477, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09637451, + "step": 12662, + "time_per_iteration": 2.654850721359253 + }, + { + "auxiliary_loss_clip": 0.06407061, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_clip": 0.06273231, + "balance_loss_mlp": 0.0125974, + "epoch": 0.7613407485344957, + "flos": 19032067854720.0, + "grad_norm": 1.6864772603594633, + "language_loss": 0.69368142, + "learning_rate": 5.682037143624505e-07, + "loss": 0.77044225, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09283447, + "step": 12663, + "time_per_iteration": 3.926262617111206 + }, + { + "auxiliary_loss_clip": 0.06401078, + "auxiliary_loss_mlp": 0.01261863, + "balance_loss_clip": 0.0627175, + "balance_loss_mlp": 0.01253119, + "epoch": 0.7614008717871636, + "flos": 23262369886080.0, + "grad_norm": 1.4557154718503251, + "language_loss": 0.70039129, + "learning_rate": 5.67931817521794e-07, + "loss": 0.77702069, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08746338, + "step": 12664, + "time_per_iteration": 2.5054047107696533 + }, + { + "auxiliary_loss_clip": 0.06409515, + "auxiliary_loss_mlp": 0.01268679, + "balance_loss_clip": 0.06272146, + "balance_loss_mlp": 0.01257724, + "epoch": 0.7614609950398317, + "flos": 21586329792000.0, + "grad_norm": 1.5992794514882698, + "language_loss": 0.79600513, + "learning_rate": 5.676599749853066e-07, + "loss": 0.87278712, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10949707, + "step": 12665, + "time_per_iteration": 2.599689483642578 + }, + { + "auxiliary_loss_clip": 0.06403616, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06274, + "balance_loss_mlp": 0.01258097, + "epoch": 0.7615211182924996, + "flos": 29285523002880.0, + "grad_norm": 1.8706140840131316, + "language_loss": 0.88243985, + "learning_rate": 5.673881867632959e-07, + "loss": 0.95915639, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09936523, + "step": 12666, + "time_per_iteration": 2.5415070056915283 + }, + { + "auxiliary_loss_clip": 0.06408084, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06272887, + "balance_loss_mlp": 0.01256472, + "epoch": 0.7615812415451676, + "flos": 13266156372480.0, + "grad_norm": 2.0248103449736963, + "language_loss": 0.83170617, + "learning_rate": 5.671164528660693e-07, + "loss": 0.90845418, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10253906, + "step": 12667, + "time_per_iteration": 2.4605929851531982 + }, + { + "auxiliary_loss_clip": 0.06401822, + "auxiliary_loss_mlp": 0.01264725, + "balance_loss_clip": 0.06271848, + "balance_loss_mlp": 0.01255266, + "epoch": 0.7616413647978356, + "flos": 18590105894400.0, + "grad_norm": 1.5289232692663373, + "language_loss": 0.78628266, + "learning_rate": 5.668447733039296e-07, + "loss": 0.86294812, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09460449, + "step": 12668, + "time_per_iteration": 3.9720492362976074 + }, + { + "auxiliary_loss_clip": 0.06403045, + "auxiliary_loss_mlp": 0.01263851, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01254469, + "epoch": 0.7617014880505035, + "flos": 18522280414080.0, + "grad_norm": 1.6924413590277445, + "language_loss": 0.64424682, + "learning_rate": 5.6657314808718e-07, + "loss": 0.72091579, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09381104, + "step": 12669, + "time_per_iteration": 2.4817726612091064 + }, + { + "auxiliary_loss_clip": 0.0640804, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 0.06272504, + "balance_loss_mlp": 0.01255403, + "epoch": 0.7617616113031715, + "flos": 24980184040320.0, + "grad_norm": 1.625894991767346, + "language_loss": 0.66114289, + "learning_rate": 5.663015772261202e-07, + "loss": 0.7378875, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.11016846, + "step": 12670, + "time_per_iteration": 2.531942844390869 + }, + { + "auxiliary_loss_clip": 0.06408806, + "auxiliary_loss_mlp": 0.01267085, + "balance_loss_clip": 0.06272422, + "balance_loss_mlp": 0.01256821, + "epoch": 0.7618217345558395, + "flos": 23301796032000.0, + "grad_norm": 1.6261426293442, + "language_loss": 0.72730261, + "learning_rate": 5.660300607310493e-07, + "loss": 0.80406153, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10266113, + "step": 12671, + "time_per_iteration": 2.555997133255005 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01263811, + "balance_loss_clip": 0.06269476, + "balance_loss_mlp": 0.01254686, + "epoch": 0.7618818578085075, + "flos": 25489803772800.0, + "grad_norm": 1.5891051355844041, + "language_loss": 0.73397064, + "learning_rate": 5.657585986122613e-07, + "loss": 0.81062061, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09124756, + "step": 12672, + "time_per_iteration": 2.5291435718536377 + }, + { + "auxiliary_loss_clip": 0.06309396, + "auxiliary_loss_mlp": 0.01251395, + "balance_loss_clip": 0.06254143, + "balance_loss_mlp": 0.01250371, + "epoch": 0.7619419810611754, + "flos": 61168633303680.0, + "grad_norm": 0.7432915400862121, + "language_loss": 0.56722248, + "learning_rate": 5.654871908800506e-07, + "loss": 0.64283037, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01023865, + "step": 12673, + "time_per_iteration": 3.134204864501953 + }, + { + "auxiliary_loss_clip": 0.06401986, + "auxiliary_loss_mlp": 0.01266349, + "balance_loss_clip": 0.06268115, + "balance_loss_mlp": 0.01256371, + "epoch": 0.7620021043138434, + "flos": 23265430560000.0, + "grad_norm": 1.7103416042413309, + "language_loss": 0.74883175, + "learning_rate": 5.652158375447102e-07, + "loss": 0.82551509, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09985352, + "step": 12674, + "time_per_iteration": 2.507917642593384 + }, + { + "auxiliary_loss_clip": 0.06398366, + "auxiliary_loss_mlp": 0.01265734, + "balance_loss_clip": 0.06268415, + "balance_loss_mlp": 0.01257002, + "epoch": 0.7620622275665113, + "flos": 25089490091520.0, + "grad_norm": 2.2685266755673847, + "language_loss": 0.72315985, + "learning_rate": 5.649445386165286e-07, + "loss": 0.79980081, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08728027, + "step": 12675, + "time_per_iteration": 2.5618882179260254 + }, + { + "auxiliary_loss_clip": 0.0640251, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06272566, + "balance_loss_mlp": 0.01254911, + "epoch": 0.7621223508191793, + "flos": 20160864933120.0, + "grad_norm": 1.9392842077457455, + "language_loss": 0.7294848, + "learning_rate": 5.646732941057936e-07, + "loss": 0.80615485, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09588623, + "step": 12676, + "time_per_iteration": 2.4889016151428223 + }, + { + "auxiliary_loss_clip": 0.06412819, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01255125, + "epoch": 0.7621824740718472, + "flos": 18005323449600.0, + "grad_norm": 3.350191420610347, + "language_loss": 0.54523033, + "learning_rate": 5.644021040227927e-07, + "loss": 0.62201345, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10357666, + "step": 12677, + "time_per_iteration": 2.479889392852783 + }, + { + "auxiliary_loss_clip": 0.06403828, + "auxiliary_loss_mlp": 0.01261111, + "balance_loss_clip": 0.06271364, + "balance_loss_mlp": 0.0125102, + "epoch": 0.7622425973245153, + "flos": 21732085169280.0, + "grad_norm": 1.924626512292605, + "language_loss": 0.79229861, + "learning_rate": 5.641309683778064e-07, + "loss": 0.86894798, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10101318, + "step": 12678, + "time_per_iteration": 2.5050454139709473 + }, + { + "auxiliary_loss_clip": 0.0640271, + "auxiliary_loss_mlp": 0.0126229, + "balance_loss_clip": 0.06268604, + "balance_loss_mlp": 0.01252694, + "epoch": 0.7623027205771832, + "flos": 19724563123200.0, + "grad_norm": 2.0630846770322133, + "language_loss": 0.77460301, + "learning_rate": 5.638598871811175e-07, + "loss": 0.85125297, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09588623, + "step": 12679, + "time_per_iteration": 2.5036091804504395 + }, + { + "auxiliary_loss_clip": 0.06405875, + "auxiliary_loss_mlp": 0.01264484, + "balance_loss_clip": 0.06272455, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7623628438298512, + "flos": 23995800673920.0, + "grad_norm": 1.5339500294685882, + "language_loss": 0.79924572, + "learning_rate": 5.635888604430059e-07, + "loss": 0.87594938, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0921936, + "step": 12680, + "time_per_iteration": 2.5672616958618164 + }, + { + "auxiliary_loss_clip": 0.06404954, + "auxiliary_loss_mlp": 0.01265568, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.0125565, + "epoch": 0.7624229670825191, + "flos": 22352184910080.0, + "grad_norm": 1.9657419278541466, + "language_loss": 0.62747079, + "learning_rate": 5.633178881737493e-07, + "loss": 0.70417601, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09918213, + "step": 12681, + "time_per_iteration": 2.5365428924560547 + }, + { + "auxiliary_loss_clip": 0.06399923, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06270124, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7624830903351871, + "flos": 22718522960640.0, + "grad_norm": 2.3247043396178335, + "language_loss": 0.76673269, + "learning_rate": 5.63046970383622e-07, + "loss": 0.84339643, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09649658, + "step": 12682, + "time_per_iteration": 2.5021934509277344 + }, + { + "auxiliary_loss_clip": 0.06400375, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7625432135878552, + "flos": 25600870759680.0, + "grad_norm": 1.6797876321314247, + "language_loss": 0.68138206, + "learning_rate": 5.627761070828974e-07, + "loss": 0.75803858, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08728027, + "step": 12683, + "time_per_iteration": 2.5445661544799805 + }, + { + "auxiliary_loss_clip": 0.06401844, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06269109, + "balance_loss_mlp": 0.01256078, + "epoch": 0.7626033368405231, + "flos": 23994417081600.0, + "grad_norm": 1.9075173015451221, + "language_loss": 0.83300132, + "learning_rate": 5.625052982818472e-07, + "loss": 0.90967631, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09564209, + "step": 12684, + "time_per_iteration": 2.545069932937622 + }, + { + "auxiliary_loss_clip": 0.06406077, + "auxiliary_loss_mlp": 0.0126848, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01258264, + "epoch": 0.7626634600931911, + "flos": 12603150541440.0, + "grad_norm": 1.7483092151310056, + "language_loss": 0.82848525, + "learning_rate": 5.622345439907396e-07, + "loss": 0.90523082, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10211182, + "step": 12685, + "time_per_iteration": 2.5331482887268066 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01266629, + "balance_loss_clip": 0.0627293, + "balance_loss_mlp": 0.0125692, + "epoch": 0.762723583345859, + "flos": 26329731500160.0, + "grad_norm": 1.6739148989024917, + "language_loss": 0.77748114, + "learning_rate": 5.619638442198422e-07, + "loss": 0.85420561, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0970459, + "step": 12686, + "time_per_iteration": 2.529662609100342 + }, + { + "auxiliary_loss_clip": 0.06407499, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06270917, + "balance_loss_mlp": 0.01254204, + "epoch": 0.762783706598527, + "flos": 21913325550720.0, + "grad_norm": 1.6937601944819862, + "language_loss": 0.72154206, + "learning_rate": 5.616931989794198e-07, + "loss": 0.79826409, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1050415, + "step": 12687, + "time_per_iteration": 2.486391544342041 + }, + { + "auxiliary_loss_clip": 0.06404573, + "auxiliary_loss_mlp": 0.01266259, + "balance_loss_clip": 0.062728, + "balance_loss_mlp": 0.01256263, + "epoch": 0.7628438298511949, + "flos": 15344983843200.0, + "grad_norm": 3.1096174425988656, + "language_loss": 0.65146047, + "learning_rate": 5.614226082797369e-07, + "loss": 0.72816885, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09991455, + "step": 12688, + "time_per_iteration": 2.486335515975952 + }, + { + "auxiliary_loss_clip": 0.06397952, + "auxiliary_loss_mlp": 0.01267437, + "balance_loss_clip": 0.062691, + "balance_loss_mlp": 0.01258103, + "epoch": 0.7629039531038629, + "flos": 13011388433280.0, + "grad_norm": 1.9926161434676632, + "language_loss": 0.70924902, + "learning_rate": 5.611520721310515e-07, + "loss": 0.78590292, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 12689, + "time_per_iteration": 2.5037851333618164 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.01264555, + "balance_loss_clip": 0.06273138, + "balance_loss_mlp": 0.01254493, + "epoch": 0.7629640763565309, + "flos": 26177938629120.0, + "grad_norm": 1.870564488725158, + "language_loss": 0.70028657, + "learning_rate": 5.608815905436238e-07, + "loss": 0.77705473, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10058594, + "step": 12690, + "time_per_iteration": 2.533437728881836 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01262782, + "balance_loss_clip": 0.06271788, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7630241996091989, + "flos": 36802553437440.0, + "grad_norm": 1.3861533863354163, + "language_loss": 0.69748205, + "learning_rate": 5.606111635277109e-07, + "loss": 0.77414727, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0960083, + "step": 12691, + "time_per_iteration": 4.015859127044678 + }, + { + "auxiliary_loss_clip": 0.06401307, + "auxiliary_loss_mlp": 0.01260884, + "balance_loss_clip": 0.06269828, + "balance_loss_mlp": 0.01252003, + "epoch": 0.7630843228618668, + "flos": 21841600855680.0, + "grad_norm": 1.5523680121734649, + "language_loss": 0.82087487, + "learning_rate": 5.603407910935662e-07, + "loss": 0.89749676, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08880615, + "step": 12692, + "time_per_iteration": 2.5389950275421143 + }, + { + "auxiliary_loss_clip": 0.06409267, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06275039, + "balance_loss_mlp": 0.01255993, + "epoch": 0.7631444461145348, + "flos": 12645385799040.0, + "grad_norm": 2.3344184890866564, + "language_loss": 0.77300888, + "learning_rate": 5.600704732514438e-07, + "loss": 0.84975493, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09344482, + "step": 12693, + "time_per_iteration": 2.445725917816162 + }, + { + "auxiliary_loss_clip": 0.064025, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06269249, + "balance_loss_mlp": 0.01257643, + "epoch": 0.7632045693672027, + "flos": 16842215324160.0, + "grad_norm": 1.879033723685166, + "language_loss": 0.7319355, + "learning_rate": 5.598002100115933e-07, + "loss": 0.80864131, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10437012, + "step": 12694, + "time_per_iteration": 2.480100154876709 + }, + { + "auxiliary_loss_clip": 0.06401706, + "auxiliary_loss_mlp": 0.01263272, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01253663, + "epoch": 0.7632646926198707, + "flos": 22023763632000.0, + "grad_norm": 1.7362595054615078, + "language_loss": 0.70577729, + "learning_rate": 5.595300013842625e-07, + "loss": 0.78242707, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09619141, + "step": 12695, + "time_per_iteration": 2.484557867050171 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01265272, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01255134, + "epoch": 0.7633248158725388, + "flos": 23120974920960.0, + "grad_norm": 1.5006607242564833, + "language_loss": 0.72539437, + "learning_rate": 5.592598473796985e-07, + "loss": 0.80209941, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10150146, + "step": 12696, + "time_per_iteration": 2.535898208618164 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06268622, + "balance_loss_mlp": 0.01255568, + "epoch": 0.7633849391252067, + "flos": 10894518408960.0, + "grad_norm": 2.5144564572490116, + "language_loss": 0.71505952, + "learning_rate": 5.589897480081453e-07, + "loss": 0.79172248, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09692383, + "step": 12697, + "time_per_iteration": 2.4591684341430664 + }, + { + "auxiliary_loss_clip": 0.06400824, + "auxiliary_loss_mlp": 0.01260764, + "balance_loss_clip": 0.06270981, + "balance_loss_mlp": 0.01251179, + "epoch": 0.7634450623778747, + "flos": 21000163754880.0, + "grad_norm": 1.880904163415611, + "language_loss": 0.67272222, + "learning_rate": 5.587197032798461e-07, + "loss": 0.74933803, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0958252, + "step": 12698, + "time_per_iteration": 2.5230917930603027 + }, + { + "auxiliary_loss_clip": 0.06403317, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01255529, + "epoch": 0.7635051856305426, + "flos": 18888366902400.0, + "grad_norm": 1.5780107163253119, + "language_loss": 0.72484887, + "learning_rate": 5.5844971320504e-07, + "loss": 0.8015368, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0994873, + "step": 12699, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0640247, + "auxiliary_loss_mlp": 0.0126796, + "balance_loss_clip": 0.06273928, + "balance_loss_mlp": 0.01258906, + "epoch": 0.7635653088832106, + "flos": 34795492588800.0, + "grad_norm": 1.9895424194721678, + "language_loss": 0.73307264, + "learning_rate": 5.581797777939648e-07, + "loss": 0.8097769, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09069824, + "step": 12700, + "time_per_iteration": 4.06644868850708 + }, + { + "auxiliary_loss_clip": 0.0640322, + "auxiliary_loss_mlp": 0.01269407, + "balance_loss_clip": 0.06270028, + "balance_loss_mlp": 0.01259608, + "epoch": 0.7636254321358785, + "flos": 23183978791680.0, + "grad_norm": 1.8289500414025046, + "language_loss": 0.69277215, + "learning_rate": 5.579098970568574e-07, + "loss": 0.76949847, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09796143, + "step": 12701, + "time_per_iteration": 2.4977099895477295 + }, + { + "auxiliary_loss_clip": 0.06401876, + "auxiliary_loss_mlp": 0.01262857, + "balance_loss_clip": 0.06269674, + "balance_loss_mlp": 0.01253243, + "epoch": 0.7636855553885465, + "flos": 21331729560960.0, + "grad_norm": 1.5301057508918974, + "language_loss": 0.64290726, + "learning_rate": 5.576400710039508e-07, + "loss": 0.7195546, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09606934, + "step": 12702, + "time_per_iteration": 2.4910881519317627 + }, + { + "auxiliary_loss_clip": 0.06402961, + "auxiliary_loss_mlp": 0.01265669, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01256234, + "epoch": 0.7637456786412145, + "flos": 28665674824320.0, + "grad_norm": 1.963609141873143, + "language_loss": 0.66137874, + "learning_rate": 5.57370299645477e-07, + "loss": 0.738065, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09429932, + "step": 12703, + "time_per_iteration": 3.9583401679992676 + }, + { + "auxiliary_loss_clip": 0.06406517, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273364, + "balance_loss_mlp": 0.01256721, + "epoch": 0.7638058018938825, + "flos": 21913577112960.0, + "grad_norm": 2.0195903258707757, + "language_loss": 0.83478069, + "learning_rate": 5.571005829916668e-07, + "loss": 0.91150421, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 12704, + "time_per_iteration": 2.5038557052612305 + }, + { + "auxiliary_loss_clip": 0.0640365, + "auxiliary_loss_mlp": 0.01268211, + "balance_loss_clip": 0.06271724, + "balance_loss_mlp": 0.01258686, + "epoch": 0.7638659251465504, + "flos": 29651777199360.0, + "grad_norm": 1.4030805409759646, + "language_loss": 0.68150222, + "learning_rate": 5.568309210527469e-07, + "loss": 0.75822091, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09527588, + "step": 12705, + "time_per_iteration": 2.5900156497955322 + }, + { + "auxiliary_loss_clip": 0.06400676, + "auxiliary_loss_mlp": 0.01264845, + "balance_loss_clip": 0.06270821, + "balance_loss_mlp": 0.01255672, + "epoch": 0.7639260483992184, + "flos": 26148449191680.0, + "grad_norm": 1.5410038713701188, + "language_loss": 0.74538386, + "learning_rate": 5.565613138389427e-07, + "loss": 0.82203901, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09173584, + "step": 12706, + "time_per_iteration": 2.559558391571045 + }, + { + "auxiliary_loss_clip": 0.06403012, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 0.0627191, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7639861716518863, + "flos": 20162835504000.0, + "grad_norm": 1.755600712442579, + "language_loss": 0.78974855, + "learning_rate": 5.562917613604781e-07, + "loss": 0.86643398, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09191895, + "step": 12707, + "time_per_iteration": 3.932704210281372 + }, + { + "auxiliary_loss_clip": 0.06401724, + "auxiliary_loss_mlp": 0.01265201, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.01255283, + "epoch": 0.7640462949045543, + "flos": 18588219177600.0, + "grad_norm": 6.1940407959342885, + "language_loss": 0.80090815, + "learning_rate": 5.560222636275751e-07, + "loss": 0.87757736, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0993042, + "step": 12708, + "time_per_iteration": 2.4813318252563477 + }, + { + "auxiliary_loss_clip": 0.06315993, + "auxiliary_loss_mlp": 0.0125198, + "balance_loss_clip": 0.06260599, + "balance_loss_mlp": 0.01250996, + "epoch": 0.7641064181572224, + "flos": 68342972538240.0, + "grad_norm": 0.7968333839429529, + "language_loss": 0.5539844, + "learning_rate": 5.557528206504521e-07, + "loss": 0.62966412, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00983429, + "step": 12709, + "time_per_iteration": 3.1384057998657227 + }, + { + "auxiliary_loss_clip": 0.0640793, + "auxiliary_loss_mlp": 0.01269871, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.0125925, + "epoch": 0.7641665414098903, + "flos": 17974995471360.0, + "grad_norm": 1.6571298349962345, + "language_loss": 0.63628614, + "learning_rate": 5.554834324393271e-07, + "loss": 0.71306419, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10614014, + "step": 12710, + "time_per_iteration": 2.503221273422241 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270481, + "balance_loss_mlp": 0.01255705, + "epoch": 0.7642266646625583, + "flos": 21258537419520.0, + "grad_norm": 2.423165664894835, + "language_loss": 0.64622939, + "learning_rate": 5.552140990044154e-07, + "loss": 0.72294724, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10327148, + "step": 12711, + "time_per_iteration": 2.48382568359375 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.01266675, + "balance_loss_clip": 0.06270531, + "balance_loss_mlp": 0.01257216, + "epoch": 0.7642867879152262, + "flos": 22754469162240.0, + "grad_norm": 1.499831368340144, + "language_loss": 0.73271233, + "learning_rate": 5.549448203559293e-07, + "loss": 0.80940747, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09454346, + "step": 12712, + "time_per_iteration": 2.518559455871582 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01256247, + "epoch": 0.7643469111678942, + "flos": 23339000044800.0, + "grad_norm": 4.100229806424162, + "language_loss": 0.80473924, + "learning_rate": 5.546755965040804e-07, + "loss": 0.88138747, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08837891, + "step": 12713, + "time_per_iteration": 2.495666742324829 + }, + { + "auxiliary_loss_clip": 0.0640631, + "auxiliary_loss_mlp": 0.01266494, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01256237, + "epoch": 0.7644070344205621, + "flos": 19861891165440.0, + "grad_norm": 2.1468665185465396, + "language_loss": 0.84159482, + "learning_rate": 5.544064274590776e-07, + "loss": 0.91832292, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10266113, + "step": 12714, + "time_per_iteration": 2.4871368408203125 + }, + { + "auxiliary_loss_clip": 0.06406413, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01257603, + "epoch": 0.7644671576732301, + "flos": 22097123481600.0, + "grad_norm": 1.4736408355385546, + "language_loss": 0.73087925, + "learning_rate": 5.541373132311287e-07, + "loss": 0.80761683, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09741211, + "step": 12715, + "time_per_iteration": 2.4971745014190674 + }, + { + "auxiliary_loss_clip": 0.06399769, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 0.06267397, + "balance_loss_mlp": 0.01256252, + "epoch": 0.7645272809258981, + "flos": 25488084764160.0, + "grad_norm": 1.606219528134415, + "language_loss": 0.63579881, + "learning_rate": 5.538682538304376e-07, + "loss": 0.71244764, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08868408, + "step": 12716, + "time_per_iteration": 2.5588536262512207 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01264337, + "balance_loss_clip": 0.06273409, + "balance_loss_mlp": 0.01254353, + "epoch": 0.7645874041785661, + "flos": 21548035676160.0, + "grad_norm": 1.605402904200963, + "language_loss": 0.80340159, + "learning_rate": 5.535992492672068e-07, + "loss": 0.88015091, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09991455, + "step": 12717, + "time_per_iteration": 2.4905505180358887 + }, + { + "auxiliary_loss_clip": 0.06401056, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.01255342, + "epoch": 0.764647527431234, + "flos": 20637096013440.0, + "grad_norm": 2.3928982518870474, + "language_loss": 0.669339, + "learning_rate": 5.53330299551638e-07, + "loss": 0.74599743, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09448242, + "step": 12718, + "time_per_iteration": 2.492809772491455 + }, + { + "auxiliary_loss_clip": 0.06399414, + "auxiliary_loss_mlp": 0.01266678, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01257368, + "epoch": 0.764707650683902, + "flos": 21440490560640.0, + "grad_norm": 1.7155178939343805, + "language_loss": 0.77496254, + "learning_rate": 5.530614046939286e-07, + "loss": 0.85162342, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09301758, + "step": 12719, + "time_per_iteration": 2.5259573459625244 + }, + { + "auxiliary_loss_clip": 0.06404945, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7647677739365699, + "flos": 22717852128000.0, + "grad_norm": 1.9590152643999037, + "language_loss": 0.69958895, + "learning_rate": 5.527925647042754e-07, + "loss": 0.77628434, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09564209, + "step": 12720, + "time_per_iteration": 2.539653778076172 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01262819, + "balance_loss_clip": 0.06272593, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7648278971892379, + "flos": 21330429822720.0, + "grad_norm": 1.6704748814369004, + "language_loss": 0.73973656, + "learning_rate": 5.52523779592875e-07, + "loss": 0.81640649, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10107422, + "step": 12721, + "time_per_iteration": 2.501253128051758 + }, + { + "auxiliary_loss_clip": 0.06403898, + "auxiliary_loss_mlp": 0.01264362, + "balance_loss_clip": 0.06270562, + "balance_loss_mlp": 0.01254771, + "epoch": 0.764888020441906, + "flos": 20673545339520.0, + "grad_norm": 1.706168153440744, + "language_loss": 0.73528266, + "learning_rate": 5.522550493699163e-07, + "loss": 0.81196523, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09594727, + "step": 12722, + "time_per_iteration": 2.509871244430542 + }, + { + "auxiliary_loss_clip": 0.06399025, + "auxiliary_loss_mlp": 0.01265445, + "balance_loss_clip": 0.06269681, + "balance_loss_mlp": 0.01256015, + "epoch": 0.7649481436945739, + "flos": 25089532018560.0, + "grad_norm": 1.7286135730297545, + "language_loss": 0.74329245, + "learning_rate": 5.519863740455912e-07, + "loss": 0.81993717, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09423828, + "step": 12723, + "time_per_iteration": 2.510096549987793 + }, + { + "auxiliary_loss_clip": 0.06404193, + "auxiliary_loss_mlp": 0.01262404, + "balance_loss_clip": 0.06269242, + "balance_loss_mlp": 0.01252688, + "epoch": 0.7650082669472419, + "flos": 24907998147840.0, + "grad_norm": 2.2850113448580958, + "language_loss": 0.73361677, + "learning_rate": 5.517177536300881e-07, + "loss": 0.81028277, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09710693, + "step": 12724, + "time_per_iteration": 2.5588150024414062 + }, + { + "auxiliary_loss_clip": 0.06401032, + "auxiliary_loss_mlp": 0.01264201, + "balance_loss_clip": 0.06271203, + "balance_loss_mlp": 0.01254885, + "epoch": 0.7650683901999098, + "flos": 14652614355840.0, + "grad_norm": 1.6932286249415067, + "language_loss": 0.84691983, + "learning_rate": 5.514491881335935e-07, + "loss": 0.92357218, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09320068, + "step": 12725, + "time_per_iteration": 2.4555823802948 + }, + { + "auxiliary_loss_clip": 0.06405662, + "auxiliary_loss_mlp": 0.01270715, + "balance_loss_clip": 0.06275846, + "balance_loss_mlp": 0.01260433, + "epoch": 0.7651285134525778, + "flos": 26358466250880.0, + "grad_norm": 1.7988072143781486, + "language_loss": 0.77533686, + "learning_rate": 5.511806775662901e-07, + "loss": 0.85210061, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.10284424, + "step": 12726, + "time_per_iteration": 2.56742000579834 + }, + { + "auxiliary_loss_clip": 0.06403583, + "auxiliary_loss_mlp": 0.01263268, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7651886367052457, + "flos": 26653373095680.0, + "grad_norm": 1.6652210765488402, + "language_loss": 0.70600379, + "learning_rate": 5.509122219383615e-07, + "loss": 0.78267229, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09240723, + "step": 12727, + "time_per_iteration": 2.5245282649993896 + }, + { + "auxiliary_loss_clip": 0.06395786, + "auxiliary_loss_mlp": 0.01263203, + "balance_loss_clip": 0.06267853, + "balance_loss_mlp": 0.01254024, + "epoch": 0.7652487599579137, + "flos": 25709967175680.0, + "grad_norm": 1.6422371786213563, + "language_loss": 0.80038959, + "learning_rate": 5.506438212599864e-07, + "loss": 0.87697947, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09179688, + "step": 12728, + "time_per_iteration": 2.553881883621216 + }, + { + "auxiliary_loss_clip": 0.064078, + "auxiliary_loss_mlp": 0.01267492, + "balance_loss_clip": 0.0627337, + "balance_loss_mlp": 0.01257395, + "epoch": 0.7653088832105817, + "flos": 28593237369600.0, + "grad_norm": 1.6909382906919501, + "language_loss": 0.55773109, + "learning_rate": 5.503754755413424e-07, + "loss": 0.63448405, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10089111, + "step": 12729, + "time_per_iteration": 2.561567783355713 + }, + { + "auxiliary_loss_clip": 0.06402748, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7653690064632497, + "flos": 23373311091840.0, + "grad_norm": 1.5255211318254533, + "language_loss": 0.77756214, + "learning_rate": 5.501071847926055e-07, + "loss": 0.85425532, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10131836, + "step": 12730, + "time_per_iteration": 3.951883316040039 + }, + { + "auxiliary_loss_clip": 0.0640994, + "auxiliary_loss_mlp": 0.01263677, + "balance_loss_clip": 0.06275389, + "balance_loss_mlp": 0.01253496, + "epoch": 0.7654291297159176, + "flos": 15778560395520.0, + "grad_norm": 1.5538691638081712, + "language_loss": 0.68886495, + "learning_rate": 5.498389490239495e-07, + "loss": 0.7656011, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10180664, + "step": 12731, + "time_per_iteration": 2.496400833129883 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01266172, + "balance_loss_clip": 0.06273277, + "balance_loss_mlp": 0.01255997, + "epoch": 0.7654892529685856, + "flos": 18038460539520.0, + "grad_norm": 1.970235991711743, + "language_loss": 0.70561087, + "learning_rate": 5.495707682455471e-07, + "loss": 0.78233999, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10174561, + "step": 12732, + "time_per_iteration": 2.4463298320770264 + }, + { + "auxiliary_loss_clip": 0.06407348, + "auxiliary_loss_mlp": 0.01267052, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01257009, + "epoch": 0.7655493762212535, + "flos": 27243522201600.0, + "grad_norm": 1.6975746826212326, + "language_loss": 0.7867943, + "learning_rate": 5.493026424675653e-07, + "loss": 0.86353827, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10040283, + "step": 12733, + "time_per_iteration": 2.5465524196624756 + }, + { + "auxiliary_loss_clip": 0.06404738, + "auxiliary_loss_mlp": 0.01264475, + "balance_loss_clip": 0.06275003, + "balance_loss_mlp": 0.01254843, + "epoch": 0.7656094994739215, + "flos": 20779706862720.0, + "grad_norm": 1.7438651719482663, + "language_loss": 0.78086102, + "learning_rate": 5.490345717001726e-07, + "loss": 0.85755318, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09637451, + "step": 12734, + "time_per_iteration": 2.491992235183716 + }, + { + "auxiliary_loss_clip": 0.06409705, + "auxiliary_loss_mlp": 0.01265243, + "balance_loss_clip": 0.06273736, + "balance_loss_mlp": 0.01254628, + "epoch": 0.7656696227265896, + "flos": 23045896062720.0, + "grad_norm": 1.5457458237043498, + "language_loss": 0.73303032, + "learning_rate": 5.48766555953535e-07, + "loss": 0.80977982, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1060791, + "step": 12735, + "time_per_iteration": 2.549952507019043 + }, + { + "auxiliary_loss_clip": 0.06403875, + "auxiliary_loss_mlp": 0.01265362, + "balance_loss_clip": 0.0627028, + "balance_loss_mlp": 0.01255956, + "epoch": 0.7657297459792575, + "flos": 27532810823040.0, + "grad_norm": 1.38702410103644, + "language_loss": 0.72968668, + "learning_rate": 5.484985952378145e-07, + "loss": 0.80637902, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09399414, + "step": 12736, + "time_per_iteration": 2.5478687286376953 + }, + { + "auxiliary_loss_clip": 0.06409203, + "auxiliary_loss_mlp": 0.0126645, + "balance_loss_clip": 0.06272754, + "balance_loss_mlp": 0.01255399, + "epoch": 0.7657898692319255, + "flos": 17134103422080.0, + "grad_norm": 1.7853161990922843, + "language_loss": 0.77847868, + "learning_rate": 5.482306895631728e-07, + "loss": 0.85523522, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.11065674, + "step": 12737, + "time_per_iteration": 2.517828941345215 + }, + { + "auxiliary_loss_clip": 0.06403487, + "auxiliary_loss_mlp": 0.01264987, + "balance_loss_clip": 0.06271316, + "balance_loss_mlp": 0.01254795, + "epoch": 0.7658499924845934, + "flos": 21471363590400.0, + "grad_norm": 1.7993008956393386, + "language_loss": 0.7689963, + "learning_rate": 5.479628389397699e-07, + "loss": 0.84568107, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10186768, + "step": 12738, + "time_per_iteration": 2.4858741760253906 + }, + { + "auxiliary_loss_clip": 0.06409841, + "auxiliary_loss_mlp": 0.01265376, + "balance_loss_clip": 0.06272836, + "balance_loss_mlp": 0.01254748, + "epoch": 0.7659101157372614, + "flos": 29504302813440.0, + "grad_norm": 1.7653019874765563, + "language_loss": 0.6329987, + "learning_rate": 5.476950433777603e-07, + "loss": 0.70975083, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10620117, + "step": 12739, + "time_per_iteration": 3.9952597618103027 + }, + { + "auxiliary_loss_clip": 0.06407788, + "auxiliary_loss_mlp": 0.0126759, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01256718, + "epoch": 0.7659702389899293, + "flos": 18557765418240.0, + "grad_norm": 1.7669010799995182, + "language_loss": 0.7909317, + "learning_rate": 5.474273028873004e-07, + "loss": 0.8676855, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10870361, + "step": 12740, + "time_per_iteration": 2.5115749835968018 + }, + { + "auxiliary_loss_clip": 0.06403244, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06271347, + "balance_loss_mlp": 0.01253853, + "epoch": 0.7660303622425974, + "flos": 23555767357440.0, + "grad_norm": 1.6620793532611546, + "language_loss": 0.65799433, + "learning_rate": 5.471596174785429e-07, + "loss": 0.73466468, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09924316, + "step": 12741, + "time_per_iteration": 2.55269718170166 + }, + { + "auxiliary_loss_clip": 0.06404097, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.06272512, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7660904854952653, + "flos": 18922761803520.0, + "grad_norm": 1.4348808707369967, + "language_loss": 0.76128972, + "learning_rate": 5.468919871616386e-07, + "loss": 0.83799386, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09832764, + "step": 12742, + "time_per_iteration": 3.9655463695526123 + }, + { + "auxiliary_loss_clip": 0.06397024, + "auxiliary_loss_mlp": 0.01262102, + "balance_loss_clip": 0.06269021, + "balance_loss_mlp": 0.01253274, + "epoch": 0.7661506087479333, + "flos": 23153986229760.0, + "grad_norm": 1.3105418877806154, + "language_loss": 0.76677555, + "learning_rate": 5.46624411946736e-07, + "loss": 0.84336686, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08831787, + "step": 12743, + "time_per_iteration": 2.4942922592163086 + }, + { + "auxiliary_loss_clip": 0.064053, + "auxiliary_loss_mlp": 0.01263354, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253918, + "epoch": 0.7662107320006012, + "flos": 17571411480960.0, + "grad_norm": 1.8622912064646877, + "language_loss": 0.75256228, + "learning_rate": 5.463568918439805e-07, + "loss": 0.82924885, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09442139, + "step": 12744, + "time_per_iteration": 2.500877618789673 + }, + { + "auxiliary_loss_clip": 0.06405517, + "auxiliary_loss_mlp": 0.01265062, + "balance_loss_clip": 0.06271944, + "balance_loss_mlp": 0.01255078, + "epoch": 0.7662708552532692, + "flos": 22308524133120.0, + "grad_norm": 3.023764218410669, + "language_loss": 0.70912051, + "learning_rate": 5.460894268635181e-07, + "loss": 0.78582633, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09979248, + "step": 12745, + "time_per_iteration": 2.4632673263549805 + }, + { + "auxiliary_loss_clip": 0.06404217, + "auxiliary_loss_mlp": 0.01263005, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01252938, + "epoch": 0.7663309785059371, + "flos": 15747477730560.0, + "grad_norm": 2.4148009048873975, + "language_loss": 0.77143252, + "learning_rate": 5.458220170154896e-07, + "loss": 0.84810472, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10058594, + "step": 12746, + "time_per_iteration": 2.470808506011963 + }, + { + "auxiliary_loss_clip": 0.06317573, + "auxiliary_loss_mlp": 0.01252549, + "balance_loss_clip": 0.06262261, + "balance_loss_mlp": 0.0125142, + "epoch": 0.7663911017586051, + "flos": 62184503877120.0, + "grad_norm": 0.6541980070594193, + "language_loss": 0.56711543, + "learning_rate": 5.455546623100362e-07, + "loss": 0.6428166, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01132202, + "step": 12747, + "time_per_iteration": 4.652554273605347 + }, + { + "auxiliary_loss_clip": 0.06402487, + "auxiliary_loss_mlp": 0.01263124, + "balance_loss_clip": 0.06272968, + "balance_loss_mlp": 0.01254393, + "epoch": 0.7664512250112732, + "flos": 26513361722880.0, + "grad_norm": 1.4294052686303238, + "language_loss": 0.72911537, + "learning_rate": 5.452873627572956e-07, + "loss": 0.80577153, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08728027, + "step": 12748, + "time_per_iteration": 2.532306432723999 + }, + { + "auxiliary_loss_clip": 0.06404538, + "auxiliary_loss_mlp": 0.01268933, + "balance_loss_clip": 0.0627327, + "balance_loss_mlp": 0.01259348, + "epoch": 0.7665113482639411, + "flos": 16254497986560.0, + "grad_norm": 1.791719003468204, + "language_loss": 0.70015478, + "learning_rate": 5.450201183674052e-07, + "loss": 0.77688944, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0958252, + "step": 12749, + "time_per_iteration": 2.492206573486328 + }, + { + "auxiliary_loss_clip": 0.06405895, + "auxiliary_loss_mlp": 0.01264322, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01254136, + "epoch": 0.7665714715166091, + "flos": 27205102304640.0, + "grad_norm": 1.5075173450833508, + "language_loss": 0.73696417, + "learning_rate": 5.447529291504967e-07, + "loss": 0.81366634, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10180664, + "step": 12750, + "time_per_iteration": 2.6194586753845215 + }, + { + "auxiliary_loss_clip": 0.06403321, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06273864, + "balance_loss_mlp": 0.01255637, + "epoch": 0.766631594769277, + "flos": 21073900947840.0, + "grad_norm": 2.338667432338341, + "language_loss": 0.75889468, + "learning_rate": 5.444857951167026e-07, + "loss": 0.83557701, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09265137, + "step": 12751, + "time_per_iteration": 2.535900354385376 + }, + { + "auxiliary_loss_clip": 0.06405959, + "auxiliary_loss_mlp": 0.01265211, + "balance_loss_clip": 0.06275126, + "balance_loss_mlp": 0.01255442, + "epoch": 0.766691718021945, + "flos": 24104897089920.0, + "grad_norm": 1.8024081309521767, + "language_loss": 0.61214471, + "learning_rate": 5.442187162761537e-07, + "loss": 0.68885642, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09759521, + "step": 12752, + "time_per_iteration": 2.520057439804077 + }, + { + "auxiliary_loss_clip": 0.06407845, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.06274091, + "balance_loss_mlp": 0.01255452, + "epoch": 0.7667518412746129, + "flos": 23447383701120.0, + "grad_norm": 2.502768793247081, + "language_loss": 0.68991947, + "learning_rate": 5.439516926389767e-07, + "loss": 0.76665711, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10467529, + "step": 12753, + "time_per_iteration": 2.5649516582489014 + }, + { + "auxiliary_loss_clip": 0.06405421, + "auxiliary_loss_mlp": 0.01267269, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01257339, + "epoch": 0.766811964527281, + "flos": 18154391063040.0, + "grad_norm": 2.2031278091751103, + "language_loss": 0.62667269, + "learning_rate": 5.436847242152971e-07, + "loss": 0.7033996, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09936523, + "step": 12754, + "time_per_iteration": 2.4367518424987793 + }, + { + "auxiliary_loss_clip": 0.06402913, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06272813, + "balance_loss_mlp": 0.01253426, + "epoch": 0.7668720877799489, + "flos": 19542023003520.0, + "grad_norm": 2.343791341299276, + "language_loss": 0.80305493, + "learning_rate": 5.434178110152401e-07, + "loss": 0.87971884, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10040283, + "step": 12755, + "time_per_iteration": 2.4789938926696777 + }, + { + "auxiliary_loss_clip": 0.06403362, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 0.06272961, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7669322110326169, + "flos": 22680899677440.0, + "grad_norm": 1.9246427907733588, + "language_loss": 0.70196575, + "learning_rate": 5.431509530489242e-07, + "loss": 0.77866018, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09820557, + "step": 12756, + "time_per_iteration": 2.4842453002929688 + }, + { + "auxiliary_loss_clip": 0.06408253, + "auxiliary_loss_mlp": 0.01265925, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01256621, + "epoch": 0.7669923342852848, + "flos": 26476702761600.0, + "grad_norm": 1.4236493885684283, + "language_loss": 0.70190722, + "learning_rate": 5.428841503264706e-07, + "loss": 0.77864897, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09307861, + "step": 12757, + "time_per_iteration": 2.5436339378356934 + }, + { + "auxiliary_loss_clip": 0.06405462, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06275049, + "balance_loss_mlp": 0.01254089, + "epoch": 0.7670524575379528, + "flos": 22862643183360.0, + "grad_norm": 1.8472558815325884, + "language_loss": 0.76448315, + "learning_rate": 5.426174028579955e-07, + "loss": 0.84118211, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.10345459, + "step": 12758, + "time_per_iteration": 2.4789509773254395 + }, + { + "auxiliary_loss_clip": 0.06399853, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01255576, + "epoch": 0.7671125807906207, + "flos": 22458136798080.0, + "grad_norm": 1.6508827422801604, + "language_loss": 0.76464295, + "learning_rate": 5.423507106536156e-07, + "loss": 0.84129202, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0947876, + "step": 12759, + "time_per_iteration": 2.5259945392608643 + }, + { + "auxiliary_loss_clip": 0.0640488, + "auxiliary_loss_mlp": 0.01263564, + "balance_loss_clip": 0.06270535, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7671727040432887, + "flos": 35380275033600.0, + "grad_norm": 1.982345292184502, + "language_loss": 0.68377602, + "learning_rate": 5.420840737234425e-07, + "loss": 0.7604605, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09368896, + "step": 12760, + "time_per_iteration": 2.5982978343963623 + }, + { + "auxiliary_loss_clip": 0.06406338, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.06272851, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7672328272959568, + "flos": 22502007210240.0, + "grad_norm": 1.3719850689198565, + "language_loss": 0.79309064, + "learning_rate": 5.418174920775871e-07, + "loss": 0.86981302, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12761, + "time_per_iteration": 2.5480268001556396 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01267963, + "balance_loss_clip": 0.06276072, + "balance_loss_mlp": 0.01258289, + "epoch": 0.7672929505486247, + "flos": 22821372247680.0, + "grad_norm": 2.021114982719017, + "language_loss": 0.66376638, + "learning_rate": 5.415509657261589e-07, + "loss": 0.74048305, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09674072, + "step": 12762, + "time_per_iteration": 2.487494707107544 + }, + { + "auxiliary_loss_clip": 0.06406671, + "auxiliary_loss_mlp": 0.01262822, + "balance_loss_clip": 0.06272823, + "balance_loss_mlp": 0.01253148, + "epoch": 0.7673530738012927, + "flos": 20344956353280.0, + "grad_norm": 1.669517530242866, + "language_loss": 0.74410594, + "learning_rate": 5.412844946792639e-07, + "loss": 0.82080084, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09667969, + "step": 12763, + "time_per_iteration": 2.50715970993042 + }, + { + "auxiliary_loss_clip": 0.06406026, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06275215, + "balance_loss_mlp": 0.01254836, + "epoch": 0.7674131970539606, + "flos": 34942212288000.0, + "grad_norm": 1.4115021004744182, + "language_loss": 0.70948029, + "learning_rate": 5.410180789470067e-07, + "loss": 0.78618985, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10089111, + "step": 12764, + "time_per_iteration": 2.625321388244629 + }, + { + "auxiliary_loss_clip": 0.06405284, + "auxiliary_loss_mlp": 0.0126607, + "balance_loss_clip": 0.06274922, + "balance_loss_mlp": 0.01256241, + "epoch": 0.7674733203066286, + "flos": 28336247297280.0, + "grad_norm": 1.6715058951392505, + "language_loss": 0.69761688, + "learning_rate": 5.40751718539491e-07, + "loss": 0.77433044, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0982666, + "step": 12765, + "time_per_iteration": 2.6227502822875977 + }, + { + "auxiliary_loss_clip": 0.06399858, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 0.06270436, + "balance_loss_mlp": 0.01252769, + "epoch": 0.7675334435592965, + "flos": 16295307724800.0, + "grad_norm": 1.8004519699404298, + "language_loss": 0.6087966, + "learning_rate": 5.404854134668162e-07, + "loss": 0.6854142, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 12766, + "time_per_iteration": 2.4817140102386475 + }, + { + "auxiliary_loss_clip": 0.06319875, + "auxiliary_loss_mlp": 0.01254158, + "balance_loss_clip": 0.06264514, + "balance_loss_mlp": 0.01252872, + "epoch": 0.7675935668119646, + "flos": 64847778376320.0, + "grad_norm": 0.7247432278410384, + "language_loss": 0.6077764, + "learning_rate": 5.402191637390803e-07, + "loss": 0.68351674, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01286316, + "step": 12767, + "time_per_iteration": 3.2508630752563477 + }, + { + "auxiliary_loss_clip": 0.06402268, + "auxiliary_loss_mlp": 0.01266038, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.0125668, + "epoch": 0.7676536900646325, + "flos": 22682157488640.0, + "grad_norm": 1.91918463694606, + "language_loss": 0.69715631, + "learning_rate": 5.399529693663801e-07, + "loss": 0.77383935, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09356689, + "step": 12768, + "time_per_iteration": 2.502361297607422 + }, + { + "auxiliary_loss_clip": 0.06411647, + "auxiliary_loss_mlp": 0.01267577, + "balance_loss_clip": 0.06273838, + "balance_loss_mlp": 0.01256729, + "epoch": 0.7677138133173005, + "flos": 26946393223680.0, + "grad_norm": 1.5949336757988604, + "language_loss": 0.70845366, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7852459, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10864258, + "step": 12769, + "time_per_iteration": 2.554861068725586 + }, + { + "auxiliary_loss_clip": 0.0641087, + "auxiliary_loss_mlp": 0.01267364, + "balance_loss_clip": 0.06275321, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7677739365699684, + "flos": 23805336343680.0, + "grad_norm": 1.7985045785763099, + "language_loss": 0.80694544, + "learning_rate": 5.394207467264611e-07, + "loss": 0.88372779, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10174561, + "step": 12770, + "time_per_iteration": 3.9488418102264404 + }, + { + "auxiliary_loss_clip": 0.06402189, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06272912, + "balance_loss_mlp": 0.01254363, + "epoch": 0.7678340598226364, + "flos": 34463423658240.0, + "grad_norm": 1.5007452698192065, + "language_loss": 0.78956687, + "learning_rate": 5.391547184794245e-07, + "loss": 0.86622107, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08868408, + "step": 12771, + "time_per_iteration": 2.5934486389160156 + }, + { + "auxiliary_loss_clip": 0.06403628, + "auxiliary_loss_mlp": 0.01263065, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01253487, + "epoch": 0.7678941830753043, + "flos": 23848493996160.0, + "grad_norm": 1.2517341680866723, + "language_loss": 0.68444574, + "learning_rate": 5.388887456277876e-07, + "loss": 0.76111269, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09576416, + "step": 12772, + "time_per_iteration": 2.5651042461395264 + }, + { + "auxiliary_loss_clip": 0.06401607, + "auxiliary_loss_mlp": 0.01265845, + "balance_loss_clip": 0.0627486, + "balance_loss_mlp": 0.01256893, + "epoch": 0.7679543063279723, + "flos": 25417995223680.0, + "grad_norm": 1.427251107853352, + "language_loss": 0.73993248, + "learning_rate": 5.386228281816349e-07, + "loss": 0.816607, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08953857, + "step": 12773, + "time_per_iteration": 2.5750787258148193 + }, + { + "auxiliary_loss_clip": 0.0639642, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01256554, + "epoch": 0.7680144295806404, + "flos": 27969448049280.0, + "grad_norm": 1.5249418922144822, + "language_loss": 0.81278884, + "learning_rate": 5.383569661510512e-07, + "loss": 0.88940001, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0814209, + "step": 12774, + "time_per_iteration": 2.549635648727417 + }, + { + "auxiliary_loss_clip": 0.06401657, + "auxiliary_loss_mlp": 0.01264098, + "balance_loss_clip": 0.06272675, + "balance_loss_mlp": 0.01254757, + "epoch": 0.7680745528333083, + "flos": 20419112816640.0, + "grad_norm": 2.7097792481139122, + "language_loss": 0.69999617, + "learning_rate": 5.380911595461177e-07, + "loss": 0.77665365, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09338379, + "step": 12775, + "time_per_iteration": 2.502872943878174 + }, + { + "auxiliary_loss_clip": 0.06317612, + "auxiliary_loss_mlp": 0.01254016, + "balance_loss_clip": 0.0626227, + "balance_loss_mlp": 0.0125271, + "epoch": 0.7681346760859763, + "flos": 68423124568320.0, + "grad_norm": 0.6822831430052362, + "language_loss": 0.5694207, + "learning_rate": 5.378254083769147e-07, + "loss": 0.64513695, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01306915, + "step": 12776, + "time_per_iteration": 3.1927366256713867 + }, + { + "auxiliary_loss_clip": 0.0640178, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 0.06271428, + "balance_loss_mlp": 0.01255545, + "epoch": 0.7681947993386442, + "flos": 21257824659840.0, + "grad_norm": 1.8462760284119832, + "language_loss": 0.74373579, + "learning_rate": 5.375597126535188e-07, + "loss": 0.8204, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 12777, + "time_per_iteration": 2.5175979137420654 + }, + { + "auxiliary_loss_clip": 0.06408069, + "auxiliary_loss_mlp": 0.0126398, + "balance_loss_clip": 0.06275662, + "balance_loss_mlp": 0.01254837, + "epoch": 0.7682549225913122, + "flos": 21404125088640.0, + "grad_norm": 1.9483232393983472, + "language_loss": 0.70101172, + "learning_rate": 5.372940723860043e-07, + "loss": 0.77773219, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09143066, + "step": 12778, + "time_per_iteration": 2.6068058013916016 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.0126478, + "balance_loss_clip": 0.06274477, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7683150458439801, + "flos": 23045518719360.0, + "grad_norm": 1.8309114800353317, + "language_loss": 0.70335215, + "learning_rate": 5.37028487584446e-07, + "loss": 0.7800526, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09619141, + "step": 12779, + "time_per_iteration": 4.003666639328003 + }, + { + "auxiliary_loss_clip": 0.0640587, + "auxiliary_loss_mlp": 0.01265519, + "balance_loss_clip": 0.062737, + "balance_loss_mlp": 0.01255898, + "epoch": 0.7683751690966482, + "flos": 67346361204480.0, + "grad_norm": 1.5118738364126798, + "language_loss": 0.58973181, + "learning_rate": 5.367629582589133e-07, + "loss": 0.66644573, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09619141, + "step": 12780, + "time_per_iteration": 2.915029525756836 + }, + { + "auxiliary_loss_clip": 0.06409752, + "auxiliary_loss_mlp": 0.01268476, + "balance_loss_clip": 0.06273384, + "balance_loss_mlp": 0.01258587, + "epoch": 0.7684352923493161, + "flos": 21805361164800.0, + "grad_norm": 2.2303773736896373, + "language_loss": 0.68361402, + "learning_rate": 5.364974844194759e-07, + "loss": 0.7603963, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09881592, + "step": 12781, + "time_per_iteration": 4.043205976486206 + }, + { + "auxiliary_loss_clip": 0.06404178, + "auxiliary_loss_mlp": 0.01263917, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.01254428, + "epoch": 0.7684954156019841, + "flos": 25854548595840.0, + "grad_norm": 1.651939170673441, + "language_loss": 0.79629219, + "learning_rate": 5.362320660762016e-07, + "loss": 0.87297314, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0949707, + "step": 12782, + "time_per_iteration": 2.5380043983459473 + }, + { + "auxiliary_loss_clip": 0.06406912, + "auxiliary_loss_mlp": 0.01263775, + "balance_loss_clip": 0.06272779, + "balance_loss_mlp": 0.01253719, + "epoch": 0.768555538854652, + "flos": 25454444549760.0, + "grad_norm": 1.9972993449433587, + "language_loss": 0.66687256, + "learning_rate": 5.35966703239153e-07, + "loss": 0.74357939, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10058594, + "step": 12783, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01262671, + "balance_loss_clip": 0.06273863, + "balance_loss_mlp": 0.01253069, + "epoch": 0.76861566210732, + "flos": 19652503011840.0, + "grad_norm": 1.5789937278772177, + "language_loss": 0.69208997, + "learning_rate": 5.357013959183938e-07, + "loss": 0.7687813, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09606934, + "step": 12784, + "time_per_iteration": 2.5100221633911133 + }, + { + "auxiliary_loss_clip": 0.06402996, + "auxiliary_loss_mlp": 0.01264042, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01255482, + "epoch": 0.7686757853599879, + "flos": 22425586686720.0, + "grad_norm": 2.2747197635366074, + "language_loss": 0.80762935, + "learning_rate": 5.354361441239843e-07, + "loss": 0.88429976, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08551025, + "step": 12785, + "time_per_iteration": 2.4869916439056396 + }, + { + "auxiliary_loss_clip": 0.06404176, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.01255506, + "epoch": 0.768735908612656, + "flos": 47784659690880.0, + "grad_norm": 2.213863326437895, + "language_loss": 0.7748611, + "learning_rate": 5.351709478659836e-07, + "loss": 0.85155928, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10137939, + "step": 12786, + "time_per_iteration": 2.7327218055725098 + }, + { + "auxiliary_loss_clip": 0.06400453, + "auxiliary_loss_mlp": 0.01264363, + "balance_loss_clip": 0.06269495, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7687960318653239, + "flos": 30270996472320.0, + "grad_norm": 1.9359041928849132, + "language_loss": 0.58734947, + "learning_rate": 5.349058071544468e-07, + "loss": 0.66399765, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09515381, + "step": 12787, + "time_per_iteration": 4.117979288101196 + }, + { + "auxiliary_loss_clip": 0.06401558, + "auxiliary_loss_mlp": 0.01264466, + "balance_loss_clip": 0.06272475, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7688561551179919, + "flos": 19579562432640.0, + "grad_norm": 1.5619171139299415, + "language_loss": 0.76386726, + "learning_rate": 5.346407219994292e-07, + "loss": 0.84052753, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0949707, + "step": 12788, + "time_per_iteration": 2.5265915393829346 + }, + { + "auxiliary_loss_clip": 0.06405907, + "auxiliary_loss_mlp": 0.0126463, + "balance_loss_clip": 0.06274015, + "balance_loss_mlp": 0.01254771, + "epoch": 0.7689162783706599, + "flos": 22790373436800.0, + "grad_norm": 1.5307962602577754, + "language_loss": 0.666574, + "learning_rate": 5.343756924109821e-07, + "loss": 0.74327934, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09844971, + "step": 12789, + "time_per_iteration": 2.5482897758483887 + }, + { + "auxiliary_loss_clip": 0.06407897, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01258842, + "epoch": 0.7689764016233278, + "flos": 34212764568960.0, + "grad_norm": 1.7716505240879148, + "language_loss": 0.68803114, + "learning_rate": 5.341107183991553e-07, + "loss": 0.76480138, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10290527, + "step": 12790, + "time_per_iteration": 2.6209323406219482 + }, + { + "auxiliary_loss_clip": 0.06403899, + "auxiliary_loss_mlp": 0.01263088, + "balance_loss_clip": 0.0627263, + "balance_loss_mlp": 0.01253825, + "epoch": 0.7690365248759958, + "flos": 17280152288640.0, + "grad_norm": 1.3993850053379062, + "language_loss": 0.68957317, + "learning_rate": 5.338457999739969e-07, + "loss": 0.76624304, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09265137, + "step": 12791, + "time_per_iteration": 2.5464963912963867 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01264866, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01255418, + "epoch": 0.7690966481286637, + "flos": 18229008723840.0, + "grad_norm": 1.5956237198168277, + "language_loss": 0.79798484, + "learning_rate": 5.335809371455526e-07, + "loss": 0.87464273, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09448242, + "step": 12792, + "time_per_iteration": 2.489346981048584 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06273898, + "balance_loss_mlp": 0.01258999, + "epoch": 0.7691567713813318, + "flos": 21543004431360.0, + "grad_norm": 1.8308011822945844, + "language_loss": 0.73121727, + "learning_rate": 5.333161299238673e-07, + "loss": 0.80801225, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09814453, + "step": 12793, + "time_per_iteration": 2.558523416519165 + }, + { + "auxiliary_loss_clip": 0.06407025, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01254689, + "epoch": 0.7692168946339997, + "flos": 39388568872320.0, + "grad_norm": 1.7835594774438226, + "language_loss": 0.63780582, + "learning_rate": 5.330513783189803e-07, + "loss": 0.7145232, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1003418, + "step": 12794, + "time_per_iteration": 2.6618335247039795 + }, + { + "auxiliary_loss_clip": 0.06408365, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06273225, + "balance_loss_mlp": 0.01256336, + "epoch": 0.7692770178866677, + "flos": 25017010709760.0, + "grad_norm": 1.4664054108250584, + "language_loss": 0.76531231, + "learning_rate": 5.327866823409319e-07, + "loss": 0.84205556, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09619141, + "step": 12795, + "time_per_iteration": 2.5922963619232178 + }, + { + "auxiliary_loss_clip": 0.0640534, + "auxiliary_loss_mlp": 0.01263991, + "balance_loss_clip": 0.0627051, + "balance_loss_mlp": 0.01253453, + "epoch": 0.7693371411393356, + "flos": 24722984332800.0, + "grad_norm": 1.4884281283084904, + "language_loss": 0.72098613, + "learning_rate": 5.325220419997601e-07, + "loss": 0.79767948, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10540771, + "step": 12796, + "time_per_iteration": 2.5227742195129395 + }, + { + "auxiliary_loss_clip": 0.06403993, + "auxiliary_loss_mlp": 0.01265667, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01255994, + "epoch": 0.7693972643920036, + "flos": 15930311339520.0, + "grad_norm": 1.7278751632986438, + "language_loss": 0.64795017, + "learning_rate": 5.32257457305499e-07, + "loss": 0.72464675, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09667969, + "step": 12797, + "time_per_iteration": 2.503452777862549 + }, + { + "auxiliary_loss_clip": 0.06409369, + "auxiliary_loss_mlp": 0.0127561, + "balance_loss_clip": 0.06275479, + "balance_loss_mlp": 0.01264798, + "epoch": 0.7694573876446715, + "flos": 25412125438080.0, + "grad_norm": 1.8485649321852773, + "language_loss": 0.91645068, + "learning_rate": 5.319929282681823e-07, + "loss": 0.9933005, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10809326, + "step": 12798, + "time_per_iteration": 2.5266406536102295 + }, + { + "auxiliary_loss_clip": 0.06401522, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06268082, + "balance_loss_mlp": 0.01256489, + "epoch": 0.7695175108973396, + "flos": 16659800985600.0, + "grad_norm": 1.7639360291305515, + "language_loss": 0.82879943, + "learning_rate": 5.317284548978418e-07, + "loss": 0.90547353, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09387207, + "step": 12799, + "time_per_iteration": 2.4981637001037598 + }, + { + "auxiliary_loss_clip": 0.06404725, + "auxiliary_loss_mlp": 0.01268019, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01257862, + "epoch": 0.7695776341500075, + "flos": 13631697809280.0, + "grad_norm": 2.5788494866617513, + "language_loss": 0.78243637, + "learning_rate": 5.314640372045045e-07, + "loss": 0.85916382, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10162354, + "step": 12800, + "time_per_iteration": 2.472907304763794 + }, + { + "auxiliary_loss_clip": 0.06410202, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270645, + "balance_loss_mlp": 0.01256182, + "epoch": 0.7696377574026755, + "flos": 24283034870400.0, + "grad_norm": 1.8264730167588297, + "language_loss": 0.84045184, + "learning_rate": 5.31199675198198e-07, + "loss": 0.9172219, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10620117, + "step": 12801, + "time_per_iteration": 2.53623366355896 + }, + { + "auxiliary_loss_clip": 0.06406119, + "auxiliary_loss_mlp": 0.01267538, + "balance_loss_clip": 0.06272501, + "balance_loss_mlp": 0.01257495, + "epoch": 0.7696978806553435, + "flos": 20929445308800.0, + "grad_norm": 1.8709548721646438, + "language_loss": 0.73054564, + "learning_rate": 5.30935368888947e-07, + "loss": 0.80728221, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1003418, + "step": 12802, + "time_per_iteration": 2.4759271144866943 + }, + { + "auxiliary_loss_clip": 0.06399865, + "auxiliary_loss_mlp": 0.01265258, + "balance_loss_clip": 0.06271532, + "balance_loss_mlp": 0.01255757, + "epoch": 0.7697580039080114, + "flos": 22936212668160.0, + "grad_norm": 1.8081953162086668, + "language_loss": 0.76470077, + "learning_rate": 5.306711182867747e-07, + "loss": 0.84135199, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.0949707, + "step": 12803, + "time_per_iteration": 2.5474445819854736 + }, + { + "auxiliary_loss_clip": 0.06313179, + "auxiliary_loss_mlp": 0.01253049, + "balance_loss_clip": 0.06258132, + "balance_loss_mlp": 0.01251863, + "epoch": 0.7698181271606794, + "flos": 68737751850240.0, + "grad_norm": 0.742546771949619, + "language_loss": 0.55879092, + "learning_rate": 5.304069234017001e-07, + "loss": 0.63445318, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01184082, + "step": 12804, + "time_per_iteration": 3.1489827632904053 + }, + { + "auxiliary_loss_clip": 0.06316254, + "auxiliary_loss_mlp": 0.0125264, + "balance_loss_clip": 0.0626114, + "balance_loss_mlp": 0.01251505, + "epoch": 0.7698782504133473, + "flos": 67430523502080.0, + "grad_norm": 0.7295540312789194, + "language_loss": 0.53939354, + "learning_rate": 5.301427842437429e-07, + "loss": 0.61508244, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0113678, + "step": 12805, + "time_per_iteration": 3.2659192085266113 + }, + { + "auxiliary_loss_clip": 0.0640514, + "auxiliary_loss_mlp": 0.01270733, + "balance_loss_clip": 0.06272765, + "balance_loss_mlp": 0.01261047, + "epoch": 0.7699383736660154, + "flos": 22494879613440.0, + "grad_norm": 3.06352805467247, + "language_loss": 0.73035467, + "learning_rate": 5.298787008229187e-07, + "loss": 0.80711341, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09686279, + "step": 12806, + "time_per_iteration": 2.4905054569244385 + }, + { + "auxiliary_loss_clip": 0.06401073, + "auxiliary_loss_mlp": 0.01266133, + "balance_loss_clip": 0.06269582, + "balance_loss_mlp": 0.01256704, + "epoch": 0.7699984969186833, + "flos": 21545520053760.0, + "grad_norm": 1.6739965963260217, + "language_loss": 0.75159943, + "learning_rate": 5.296146731492408e-07, + "loss": 0.82827145, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09423828, + "step": 12807, + "time_per_iteration": 2.5074682235717773 + }, + { + "auxiliary_loss_clip": 0.06406098, + "auxiliary_loss_mlp": 0.01264768, + "balance_loss_clip": 0.0626993, + "balance_loss_mlp": 0.01254098, + "epoch": 0.7700586201713513, + "flos": 21724412520960.0, + "grad_norm": 2.037865665188592, + "language_loss": 0.8067742, + "learning_rate": 5.293507012327218e-07, + "loss": 0.88348287, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10681152, + "step": 12808, + "time_per_iteration": 3.8791632652282715 + }, + { + "auxiliary_loss_clip": 0.06407686, + "auxiliary_loss_mlp": 0.01266704, + "balance_loss_clip": 0.06271963, + "balance_loss_mlp": 0.01256595, + "epoch": 0.7701187434240192, + "flos": 27863580015360.0, + "grad_norm": 1.7006184108687237, + "language_loss": 0.7921378, + "learning_rate": 5.290867850833718e-07, + "loss": 0.8688817, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10113525, + "step": 12809, + "time_per_iteration": 2.5961480140686035 + }, + { + "auxiliary_loss_clip": 0.06399591, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.06270431, + "balance_loss_mlp": 0.01254594, + "epoch": 0.7701788666766872, + "flos": 28628848154880.0, + "grad_norm": 1.4421816702879584, + "language_loss": 0.70197344, + "learning_rate": 5.288229247111993e-07, + "loss": 0.77861011, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.0947876, + "step": 12810, + "time_per_iteration": 2.6107945442199707 + }, + { + "auxiliary_loss_clip": 0.06406891, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06271058, + "balance_loss_mlp": 0.01254769, + "epoch": 0.7702389899293551, + "flos": 14251671768960.0, + "grad_norm": 2.2769003713635967, + "language_loss": 0.78979844, + "learning_rate": 5.285591201262079e-07, + "loss": 0.8665303, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.11523438, + "step": 12811, + "time_per_iteration": 2.555101156234741 + }, + { + "auxiliary_loss_clip": 0.06317817, + "auxiliary_loss_mlp": 0.01251839, + "balance_loss_clip": 0.06262816, + "balance_loss_mlp": 0.01250771, + "epoch": 0.7702991131820232, + "flos": 70593816441600.0, + "grad_norm": 0.7969175673938892, + "language_loss": 0.56677693, + "learning_rate": 5.28295371338402e-07, + "loss": 0.64247346, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01069641, + "step": 12812, + "time_per_iteration": 3.1775879859924316 + }, + { + "auxiliary_loss_clip": 0.06404653, + "auxiliary_loss_mlp": 0.01265227, + "balance_loss_clip": 0.0627086, + "balance_loss_mlp": 0.01254898, + "epoch": 0.7703592364346911, + "flos": 25486449609600.0, + "grad_norm": 1.6911953299431426, + "language_loss": 0.72016954, + "learning_rate": 5.280316783577836e-07, + "loss": 0.79686838, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10327148, + "step": 12813, + "time_per_iteration": 2.525716781616211 + }, + { + "auxiliary_loss_clip": 0.06403896, + "auxiliary_loss_mlp": 0.01265029, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01254962, + "epoch": 0.7704193596873591, + "flos": 19286877720960.0, + "grad_norm": 1.5106493285856717, + "language_loss": 0.66542912, + "learning_rate": 5.27768041194351e-07, + "loss": 0.74211836, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10058594, + "step": 12814, + "time_per_iteration": 2.511730432510376 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01267694, + "balance_loss_clip": 0.06271755, + "balance_loss_mlp": 0.01258288, + "epoch": 0.7704794829400271, + "flos": 23665031481600.0, + "grad_norm": 1.765991608700586, + "language_loss": 0.65916228, + "learning_rate": 5.275044598581018e-07, + "loss": 0.73587441, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09399414, + "step": 12815, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.06402738, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06270017, + "balance_loss_mlp": 0.01253324, + "epoch": 0.770539606192695, + "flos": 18995283112320.0, + "grad_norm": 3.1094364137223325, + "language_loss": 0.65588892, + "learning_rate": 5.272409343590322e-07, + "loss": 0.73254538, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0958252, + "step": 12816, + "time_per_iteration": 2.5682597160339355 + }, + { + "auxiliary_loss_clip": 0.06410483, + "auxiliary_loss_mlp": 0.01271453, + "balance_loss_clip": 0.06275068, + "balance_loss_mlp": 0.01261321, + "epoch": 0.770599729445363, + "flos": 11833605843840.0, + "grad_norm": 2.2637093644731685, + "language_loss": 0.72246104, + "learning_rate": 5.26977464707133e-07, + "loss": 0.79928041, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10131836, + "step": 12817, + "time_per_iteration": 2.485805034637451 + }, + { + "auxiliary_loss_clip": 0.06404669, + "auxiliary_loss_mlp": 0.01264386, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.01254677, + "epoch": 0.770659852698031, + "flos": 17828527334400.0, + "grad_norm": 3.0609511184199523, + "language_loss": 0.61409748, + "learning_rate": 5.267140509123957e-07, + "loss": 0.69078803, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0970459, + "step": 12818, + "time_per_iteration": 2.487680673599243 + }, + { + "auxiliary_loss_clip": 0.06399722, + "auxiliary_loss_mlp": 0.01262281, + "balance_loss_clip": 0.062704, + "balance_loss_mlp": 0.01253603, + "epoch": 0.770719975950699, + "flos": 21878469452160.0, + "grad_norm": 1.7396688274909713, + "language_loss": 0.67373377, + "learning_rate": 5.264506929848093e-07, + "loss": 0.75035375, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08676147, + "step": 12819, + "time_per_iteration": 3.9379172325134277 + }, + { + "auxiliary_loss_clip": 0.06406172, + "auxiliary_loss_mlp": 0.01263778, + "balance_loss_clip": 0.06271698, + "balance_loss_mlp": 0.01253848, + "epoch": 0.7707800992033669, + "flos": 21331519925760.0, + "grad_norm": 1.7217491542401215, + "language_loss": 0.57604039, + "learning_rate": 5.261873909343608e-07, + "loss": 0.65273988, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09924316, + "step": 12820, + "time_per_iteration": 2.495925188064575 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01262498, + "balance_loss_clip": 0.06269978, + "balance_loss_mlp": 0.01252735, + "epoch": 0.7708402224560349, + "flos": 28186215361920.0, + "grad_norm": 1.643911762743471, + "language_loss": 0.81179225, + "learning_rate": 5.259241447710343e-07, + "loss": 0.88846403, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09771729, + "step": 12821, + "time_per_iteration": 3.986278772354126 + }, + { + "auxiliary_loss_clip": 0.06404622, + "auxiliary_loss_mlp": 0.012636, + "balance_loss_clip": 0.06271188, + "balance_loss_mlp": 0.01253521, + "epoch": 0.7709003457087028, + "flos": 15382397491200.0, + "grad_norm": 1.8555601189743978, + "language_loss": 0.68379205, + "learning_rate": 5.256609545048114e-07, + "loss": 0.76047421, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10076904, + "step": 12822, + "time_per_iteration": 2.4856462478637695 + }, + { + "auxiliary_loss_clip": 0.06400201, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.0626999, + "balance_loss_mlp": 0.01256786, + "epoch": 0.7709604689613708, + "flos": 30628697552640.0, + "grad_norm": 2.043450133419636, + "language_loss": 0.72353333, + "learning_rate": 5.253978201456733e-07, + "loss": 0.80019963, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09637451, + "step": 12823, + "time_per_iteration": 2.5663697719573975 + }, + { + "auxiliary_loss_clip": 0.06408671, + "auxiliary_loss_mlp": 0.0126507, + "balance_loss_clip": 0.06270947, + "balance_loss_mlp": 0.01254437, + "epoch": 0.7710205922140387, + "flos": 20307207288960.0, + "grad_norm": 1.6756825279286318, + "language_loss": 0.76604235, + "learning_rate": 5.251347417035969e-07, + "loss": 0.84277976, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10632324, + "step": 12824, + "time_per_iteration": 2.5135273933410645 + }, + { + "auxiliary_loss_clip": 0.0640358, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01255332, + "epoch": 0.7710807154667068, + "flos": 19649987389440.0, + "grad_norm": 2.8682033137355605, + "language_loss": 0.72291267, + "learning_rate": 5.248717191885592e-07, + "loss": 0.79959786, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0960083, + "step": 12825, + "time_per_iteration": 2.539870262145996 + }, + { + "auxiliary_loss_clip": 0.06397466, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01257602, + "epoch": 0.7711408387193747, + "flos": 20011713465600.0, + "grad_norm": 1.348856880561093, + "language_loss": 0.73990041, + "learning_rate": 5.246087526105343e-07, + "loss": 0.8165428, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0916748, + "step": 12826, + "time_per_iteration": 3.9455349445343018 + }, + { + "auxiliary_loss_clip": 0.06404951, + "auxiliary_loss_mlp": 0.012643, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253554, + "epoch": 0.7712009619720427, + "flos": 24977794199040.0, + "grad_norm": 1.495331253862981, + "language_loss": 0.81176156, + "learning_rate": 5.243458419794933e-07, + "loss": 0.88845408, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10748291, + "step": 12827, + "time_per_iteration": 2.5489249229431152 + }, + { + "auxiliary_loss_clip": 0.0631479, + "auxiliary_loss_mlp": 0.01256103, + "balance_loss_clip": 0.06259546, + "balance_loss_mlp": 0.01255053, + "epoch": 0.7712610852247107, + "flos": 63269682105600.0, + "grad_norm": 0.8475476558719117, + "language_loss": 0.55242074, + "learning_rate": 5.240829873054051e-07, + "loss": 0.6281296, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01051331, + "step": 12828, + "time_per_iteration": 3.2874319553375244 + }, + { + "auxiliary_loss_clip": 0.06395887, + "auxiliary_loss_mlp": 0.01264145, + "balance_loss_clip": 0.06267989, + "balance_loss_mlp": 0.01255317, + "epoch": 0.7713212084773786, + "flos": 18703856211840.0, + "grad_norm": 1.6628752588878346, + "language_loss": 0.69472146, + "learning_rate": 5.23820188598238e-07, + "loss": 0.77132177, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08825684, + "step": 12829, + "time_per_iteration": 2.5006113052368164 + }, + { + "auxiliary_loss_clip": 0.06407359, + "auxiliary_loss_mlp": 0.01263662, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01253428, + "epoch": 0.7713813317300466, + "flos": 14178563481600.0, + "grad_norm": 2.5004318889819146, + "language_loss": 0.79485464, + "learning_rate": 5.235574458679579e-07, + "loss": 0.87156487, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10235596, + "step": 12830, + "time_per_iteration": 2.455521821975708 + }, + { + "auxiliary_loss_clip": 0.06408571, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 0.06271582, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7714414549827145, + "flos": 25711266913920.0, + "grad_norm": 1.5558349458942582, + "language_loss": 0.78193223, + "learning_rate": 5.232947591245269e-07, + "loss": 0.85867554, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10906982, + "step": 12831, + "time_per_iteration": 2.55888295173645 + }, + { + "auxiliary_loss_clip": 0.06400928, + "auxiliary_loss_mlp": 0.01266262, + "balance_loss_clip": 0.06268953, + "balance_loss_mlp": 0.01256547, + "epoch": 0.7715015782353826, + "flos": 30563219986560.0, + "grad_norm": 1.4404933685883998, + "language_loss": 0.61150742, + "learning_rate": 5.230321283779071e-07, + "loss": 0.68817931, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0970459, + "step": 12832, + "time_per_iteration": 2.5705411434173584 + }, + { + "auxiliary_loss_clip": 0.06408297, + "auxiliary_loss_mlp": 0.01268082, + "balance_loss_clip": 0.06271287, + "balance_loss_mlp": 0.01258271, + "epoch": 0.7715617014880505, + "flos": 20235440666880.0, + "grad_norm": 1.4904530814793735, + "language_loss": 0.79785657, + "learning_rate": 5.227695536380572e-07, + "loss": 0.87462032, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09814453, + "step": 12833, + "time_per_iteration": 2.5475685596466064 + }, + { + "auxiliary_loss_clip": 0.06315958, + "auxiliary_loss_mlp": 0.01251107, + "balance_loss_clip": 0.06260836, + "balance_loss_mlp": 0.01250079, + "epoch": 0.7716218247407185, + "flos": 63681037326720.0, + "grad_norm": 0.8315874052432679, + "language_loss": 0.55088067, + "learning_rate": 5.22507034914933e-07, + "loss": 0.62655127, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01027679, + "step": 12834, + "time_per_iteration": 3.1191012859344482 + }, + { + "auxiliary_loss_clip": 0.0640831, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06273386, + "balance_loss_mlp": 0.01254294, + "epoch": 0.7716819479933864, + "flos": 19797881045760.0, + "grad_norm": 2.410723884633937, + "language_loss": 0.73350394, + "learning_rate": 5.222445722184903e-07, + "loss": 0.81023002, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09997559, + "step": 12835, + "time_per_iteration": 2.5506582260131836 + }, + { + "auxiliary_loss_clip": 0.06406028, + "auxiliary_loss_mlp": 0.01267171, + "balance_loss_clip": 0.06272173, + "balance_loss_mlp": 0.01257884, + "epoch": 0.7717420712460544, + "flos": 18448082023680.0, + "grad_norm": 2.0308771684786113, + "language_loss": 0.70508468, + "learning_rate": 5.219821655586814e-07, + "loss": 0.78181666, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09289551, + "step": 12836, + "time_per_iteration": 2.5232300758361816 + }, + { + "auxiliary_loss_clip": 0.06398998, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06270441, + "balance_loss_mlp": 0.01259222, + "epoch": 0.7718021944987223, + "flos": 35198238038400.0, + "grad_norm": 1.831037228573652, + "language_loss": 0.60367215, + "learning_rate": 5.217198149454575e-07, + "loss": 0.68034947, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09509277, + "step": 12837, + "time_per_iteration": 2.6591076850891113 + }, + { + "auxiliary_loss_clip": 0.06317183, + "auxiliary_loss_mlp": 0.01257562, + "balance_loss_clip": 0.0626177, + "balance_loss_mlp": 0.01256482, + "epoch": 0.7718623177513904, + "flos": 67944503646720.0, + "grad_norm": 0.8462887217652507, + "language_loss": 0.55739456, + "learning_rate": 5.214575203887666e-07, + "loss": 0.63314199, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01081848, + "step": 12838, + "time_per_iteration": 3.0941390991210938 + }, + { + "auxiliary_loss_clip": 0.06402552, + "auxiliary_loss_mlp": 0.01264762, + "balance_loss_clip": 0.06271369, + "balance_loss_mlp": 0.01255345, + "epoch": 0.7719224410040583, + "flos": 18586206679680.0, + "grad_norm": 2.2960724340178156, + "language_loss": 0.69924515, + "learning_rate": 5.211952818985538e-07, + "loss": 0.77591836, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09411621, + "step": 12839, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.06401128, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.0627085, + "balance_loss_mlp": 0.01253893, + "epoch": 0.7719825642567263, + "flos": 23082471169920.0, + "grad_norm": 1.724099382102015, + "language_loss": 0.79996341, + "learning_rate": 5.209330994847647e-07, + "loss": 0.87660646, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09277344, + "step": 12840, + "time_per_iteration": 2.494185447692871 + }, + { + "auxiliary_loss_clip": 0.0640455, + "auxiliary_loss_mlp": 0.01263769, + "balance_loss_clip": 0.06271051, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7720426875093943, + "flos": 20345249842560.0, + "grad_norm": 1.700648368789641, + "language_loss": 0.80246019, + "learning_rate": 5.206709731573402e-07, + "loss": 0.87914336, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09564209, + "step": 12841, + "time_per_iteration": 2.4959654808044434 + }, + { + "auxiliary_loss_clip": 0.06402302, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01254261, + "epoch": 0.7721028107620622, + "flos": 23887878215040.0, + "grad_norm": 1.6460484096163284, + "language_loss": 0.76556861, + "learning_rate": 5.204089029262208e-07, + "loss": 0.84222806, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 12842, + "time_per_iteration": 2.5414130687713623 + }, + { + "auxiliary_loss_clip": 0.06408067, + "auxiliary_loss_mlp": 0.0126426, + "balance_loss_clip": 0.06272548, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7721629340147302, + "flos": 26658865537920.0, + "grad_norm": 1.6198153669730124, + "language_loss": 0.68824613, + "learning_rate": 5.201468888013445e-07, + "loss": 0.76496947, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09735107, + "step": 12843, + "time_per_iteration": 2.555246353149414 + }, + { + "auxiliary_loss_clip": 0.06407151, + "auxiliary_loss_mlp": 0.01263842, + "balance_loss_clip": 0.06270268, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7722230572673981, + "flos": 21185261424000.0, + "grad_norm": 1.9549573678277232, + "language_loss": 0.73833585, + "learning_rate": 5.198849307926465e-07, + "loss": 0.81504577, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09472656, + "step": 12844, + "time_per_iteration": 2.475722312927246 + }, + { + "auxiliary_loss_clip": 0.06400653, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01253327, + "epoch": 0.7722831805200662, + "flos": 27972089452800.0, + "grad_norm": 1.4105737815374062, + "language_loss": 0.71880949, + "learning_rate": 5.196230289100596e-07, + "loss": 0.79544067, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09143066, + "step": 12845, + "time_per_iteration": 2.537477493286133 + }, + { + "auxiliary_loss_clip": 0.06397612, + "auxiliary_loss_mlp": 0.01266239, + "balance_loss_clip": 0.06268354, + "balance_loss_mlp": 0.01257095, + "epoch": 0.7723433037727341, + "flos": 33884049801600.0, + "grad_norm": 1.693366944822723, + "language_loss": 0.64408147, + "learning_rate": 5.193611831635159e-07, + "loss": 0.72071993, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 12846, + "time_per_iteration": 2.5818498134613037 + }, + { + "auxiliary_loss_clip": 0.06312131, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06256564, + "balance_loss_mlp": 0.01252078, + "epoch": 0.7724034270254021, + "flos": 62868194467200.0, + "grad_norm": 0.7376748551210195, + "language_loss": 0.61336023, + "learning_rate": 5.19099393562945e-07, + "loss": 0.68901265, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01026917, + "step": 12847, + "time_per_iteration": 3.0541763305664062 + }, + { + "auxiliary_loss_clip": 0.06401889, + "auxiliary_loss_mlp": 0.01264508, + "balance_loss_clip": 0.06268549, + "balance_loss_mlp": 0.01254983, + "epoch": 0.77246355027807, + "flos": 23302299156480.0, + "grad_norm": 1.5812634929817273, + "language_loss": 0.79369843, + "learning_rate": 5.188376601182732e-07, + "loss": 0.8703624, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09527588, + "step": 12848, + "time_per_iteration": 3.9165518283843994 + }, + { + "auxiliary_loss_clip": 0.06404726, + "auxiliary_loss_mlp": 0.01266909, + "balance_loss_clip": 0.06268495, + "balance_loss_mlp": 0.01257086, + "epoch": 0.772523673530738, + "flos": 20127602062080.0, + "grad_norm": 1.566706530012109, + "language_loss": 0.73342961, + "learning_rate": 5.185759828394261e-07, + "loss": 0.81014597, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.0982666, + "step": 12849, + "time_per_iteration": 2.476515293121338 + }, + { + "auxiliary_loss_clip": 0.06402398, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06268849, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7725837967834059, + "flos": 17825592441600.0, + "grad_norm": 2.2364064713439156, + "language_loss": 0.78424966, + "learning_rate": 5.183143617363261e-07, + "loss": 0.86091167, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09667969, + "step": 12850, + "time_per_iteration": 2.4794983863830566 + }, + { + "auxiliary_loss_clip": 0.0640396, + "auxiliary_loss_mlp": 0.0126685, + "balance_loss_clip": 0.06267555, + "balance_loss_mlp": 0.01256616, + "epoch": 0.772643920036074, + "flos": 27206318188800.0, + "grad_norm": 1.5059914394205691, + "language_loss": 0.80266678, + "learning_rate": 5.180527968188935e-07, + "loss": 0.87937486, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 12851, + "time_per_iteration": 2.5322558879852295 + }, + { + "auxiliary_loss_clip": 0.06400898, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270088, + "balance_loss_mlp": 0.01253193, + "epoch": 0.7727040432887419, + "flos": 21585868594560.0, + "grad_norm": 1.7096231270301345, + "language_loss": 0.73980415, + "learning_rate": 5.177912880970474e-07, + "loss": 0.81644481, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09973145, + "step": 12852, + "time_per_iteration": 2.5234642028808594 + }, + { + "auxiliary_loss_clip": 0.06399091, + "auxiliary_loss_mlp": 0.01264912, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01255685, + "epoch": 0.7727641665414099, + "flos": 22243172348160.0, + "grad_norm": 1.8458923236919589, + "language_loss": 0.82645077, + "learning_rate": 5.17529835580704e-07, + "loss": 0.90309083, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09222412, + "step": 12853, + "time_per_iteration": 2.4855525493621826 + }, + { + "auxiliary_loss_clip": 0.06312872, + "auxiliary_loss_mlp": 0.01252237, + "balance_loss_clip": 0.06257433, + "balance_loss_mlp": 0.01251258, + "epoch": 0.7728242897940779, + "flos": 54852613038720.0, + "grad_norm": 0.7809207037354382, + "language_loss": 0.54245615, + "learning_rate": 5.172684392797786e-07, + "loss": 0.6181072, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00978088, + "step": 12854, + "time_per_iteration": 3.1956636905670166 + }, + { + "auxiliary_loss_clip": 0.06408576, + "auxiliary_loss_mlp": 0.01265841, + "balance_loss_clip": 0.06272317, + "balance_loss_mlp": 0.01255667, + "epoch": 0.7728844130467458, + "flos": 34470970525440.0, + "grad_norm": 1.470895080979425, + "language_loss": 0.7210083, + "learning_rate": 5.170070992041826e-07, + "loss": 0.7977525, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10168457, + "step": 12855, + "time_per_iteration": 2.6422533988952637 + }, + { + "auxiliary_loss_clip": 0.0640472, + "auxiliary_loss_mlp": 0.01265685, + "balance_loss_clip": 0.06271958, + "balance_loss_mlp": 0.01256059, + "epoch": 0.7729445362994138, + "flos": 18922300606080.0, + "grad_norm": 1.643707808983738, + "language_loss": 0.68152243, + "learning_rate": 5.167458153638254e-07, + "loss": 0.75822645, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09619141, + "step": 12856, + "time_per_iteration": 2.581195592880249 + }, + { + "auxiliary_loss_clip": 0.06403085, + "auxiliary_loss_mlp": 0.01263682, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01254241, + "epoch": 0.7730046595520818, + "flos": 22206555313920.0, + "grad_norm": 2.739925215135401, + "language_loss": 0.7896111, + "learning_rate": 5.164845877686162e-07, + "loss": 0.86627877, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09442139, + "step": 12857, + "time_per_iteration": 2.536677360534668 + }, + { + "auxiliary_loss_clip": 0.06400988, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 0.06271593, + "balance_loss_mlp": 0.01256447, + "epoch": 0.7730647828047498, + "flos": 13557289783680.0, + "grad_norm": 1.6864648119346977, + "language_loss": 0.7856096, + "learning_rate": 5.162234164284591e-07, + "loss": 0.86228359, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09954834, + "step": 12858, + "time_per_iteration": 3.9322428703308105 + }, + { + "auxiliary_loss_clip": 0.06406689, + "auxiliary_loss_mlp": 0.01266364, + "balance_loss_clip": 0.06271519, + "balance_loss_mlp": 0.01256392, + "epoch": 0.7731249060574177, + "flos": 21981654155520.0, + "grad_norm": 1.7779455572777159, + "language_loss": 0.77746201, + "learning_rate": 5.159623013532591e-07, + "loss": 0.8541925, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09967041, + "step": 12859, + "time_per_iteration": 2.513849973678589 + }, + { + "auxiliary_loss_clip": 0.06403208, + "auxiliary_loss_mlp": 0.01261712, + "balance_loss_clip": 0.06273893, + "balance_loss_mlp": 0.01253284, + "epoch": 0.7731850293100857, + "flos": 22608462222720.0, + "grad_norm": 1.6555727720253302, + "language_loss": 0.67912078, + "learning_rate": 5.157012425529186e-07, + "loss": 0.75576997, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08428955, + "step": 12860, + "time_per_iteration": 4.005707740783691 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06270751, + "balance_loss_mlp": 0.01255449, + "epoch": 0.7732451525627536, + "flos": 14103274988160.0, + "grad_norm": 2.651215964660107, + "language_loss": 0.75251514, + "learning_rate": 5.154402400373343e-07, + "loss": 0.82924837, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10198975, + "step": 12861, + "time_per_iteration": 2.444032907485962 + }, + { + "auxiliary_loss_clip": 0.06406768, + "auxiliary_loss_mlp": 0.01262473, + "balance_loss_clip": 0.06270678, + "balance_loss_mlp": 0.01252328, + "epoch": 0.7733052758154216, + "flos": 21476352908160.0, + "grad_norm": 3.091257297697316, + "language_loss": 0.75125277, + "learning_rate": 5.15179293816405e-07, + "loss": 0.82794511, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10137939, + "step": 12862, + "time_per_iteration": 2.5575408935546875 + }, + { + "auxiliary_loss_clip": 0.06400394, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06270863, + "balance_loss_mlp": 0.01255552, + "epoch": 0.7733653990680895, + "flos": 21400142019840.0, + "grad_norm": 1.5224536718195483, + "language_loss": 0.83015412, + "learning_rate": 5.149184039000256e-07, + "loss": 0.90680391, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09039307, + "step": 12863, + "time_per_iteration": 2.500004529953003 + }, + { + "auxiliary_loss_clip": 0.06403436, + "auxiliary_loss_mlp": 0.01266726, + "balance_loss_clip": 0.06272671, + "balance_loss_mlp": 0.01257172, + "epoch": 0.7734255223207576, + "flos": 17681849562240.0, + "grad_norm": 1.666044209334627, + "language_loss": 0.73906845, + "learning_rate": 5.146575702980898e-07, + "loss": 0.81577015, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09558105, + "step": 12864, + "time_per_iteration": 2.502202272415161 + }, + { + "auxiliary_loss_clip": 0.06405224, + "auxiliary_loss_mlp": 0.01262028, + "balance_loss_clip": 0.06273071, + "balance_loss_mlp": 0.01253117, + "epoch": 0.7734856455734255, + "flos": 25238264215680.0, + "grad_norm": 1.8553120895059094, + "language_loss": 0.82274187, + "learning_rate": 5.143967930204871e-07, + "loss": 0.89941442, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08911133, + "step": 12865, + "time_per_iteration": 2.5821845531463623 + }, + { + "auxiliary_loss_clip": 0.0640586, + "auxiliary_loss_mlp": 0.0126401, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01253627, + "epoch": 0.7735457688260935, + "flos": 23438579022720.0, + "grad_norm": 2.0985789262446763, + "language_loss": 0.71729589, + "learning_rate": 5.141360720771077e-07, + "loss": 0.79399455, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10375977, + "step": 12866, + "time_per_iteration": 3.9061973094940186 + }, + { + "auxiliary_loss_clip": 0.06406082, + "auxiliary_loss_mlp": 0.01266662, + "balance_loss_clip": 0.06272133, + "balance_loss_mlp": 0.01256309, + "epoch": 0.7736058920787615, + "flos": 18734393825280.0, + "grad_norm": 2.2008061294183046, + "language_loss": 0.64883512, + "learning_rate": 5.138754074778371e-07, + "loss": 0.72556257, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.1036377, + "step": 12867, + "time_per_iteration": 2.438513994216919 + }, + { + "auxiliary_loss_clip": 0.06398055, + "auxiliary_loss_mlp": 0.01264338, + "balance_loss_clip": 0.06268299, + "balance_loss_mlp": 0.01254897, + "epoch": 0.7736660153314294, + "flos": 22899931050240.0, + "grad_norm": 1.3982915625107966, + "language_loss": 0.71222079, + "learning_rate": 5.136147992325595e-07, + "loss": 0.7888447, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09442139, + "step": 12868, + "time_per_iteration": 2.521263599395752 + }, + { + "auxiliary_loss_clip": 0.06407171, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06272081, + "balance_loss_mlp": 0.01253252, + "epoch": 0.7737261385840974, + "flos": 13804762417920.0, + "grad_norm": 1.9680842128147285, + "language_loss": 0.78157473, + "learning_rate": 5.133542473511578e-07, + "loss": 0.85827935, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10046387, + "step": 12869, + "time_per_iteration": 2.4751439094543457 + }, + { + "auxiliary_loss_clip": 0.06399751, + "auxiliary_loss_mlp": 0.01264789, + "balance_loss_clip": 0.06270332, + "balance_loss_mlp": 0.0125536, + "epoch": 0.7737862618367654, + "flos": 28738279987200.0, + "grad_norm": 1.45372997777974, + "language_loss": 0.73862869, + "learning_rate": 5.130937518435124e-07, + "loss": 0.81527412, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09429932, + "step": 12870, + "time_per_iteration": 2.568042278289795 + }, + { + "auxiliary_loss_clip": 0.06404359, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.01257102, + "epoch": 0.7738463850894334, + "flos": 17024126538240.0, + "grad_norm": 1.914928650569768, + "language_loss": 0.75650132, + "learning_rate": 5.12833312719501e-07, + "loss": 0.83321428, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09851074, + "step": 12871, + "time_per_iteration": 2.4711315631866455 + }, + { + "auxiliary_loss_clip": 0.06402566, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06271693, + "balance_loss_mlp": 0.01254416, + "epoch": 0.7739065083421013, + "flos": 20710246227840.0, + "grad_norm": 1.4478463877402143, + "language_loss": 0.69638461, + "learning_rate": 5.12572929988999e-07, + "loss": 0.77304864, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09417725, + "step": 12872, + "time_per_iteration": 2.520254135131836 + }, + { + "auxiliary_loss_clip": 0.06404334, + "auxiliary_loss_mlp": 0.01264657, + "balance_loss_clip": 0.0627078, + "balance_loss_mlp": 0.01254173, + "epoch": 0.7739666315947693, + "flos": 20702322017280.0, + "grad_norm": 2.162643360462714, + "language_loss": 0.8514446, + "learning_rate": 5.123126036618804e-07, + "loss": 0.92813456, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10479736, + "step": 12873, + "time_per_iteration": 2.5746922492980957 + }, + { + "auxiliary_loss_clip": 0.06405018, + "auxiliary_loss_mlp": 0.01265436, + "balance_loss_clip": 0.06272902, + "balance_loss_mlp": 0.0125612, + "epoch": 0.7740267548474372, + "flos": 29578501203840.0, + "grad_norm": 2.074777829849384, + "language_loss": 0.66097724, + "learning_rate": 5.120523337480174e-07, + "loss": 0.73768181, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09313965, + "step": 12874, + "time_per_iteration": 2.5801379680633545 + }, + { + "auxiliary_loss_clip": 0.06399316, + "auxiliary_loss_mlp": 0.01262488, + "balance_loss_clip": 0.06268813, + "balance_loss_mlp": 0.01253166, + "epoch": 0.7740868781001052, + "flos": 23665786168320.0, + "grad_norm": 1.7962266070608972, + "language_loss": 0.62437928, + "learning_rate": 5.117921202572785e-07, + "loss": 0.70099723, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09313965, + "step": 12875, + "time_per_iteration": 2.5030999183654785 + }, + { + "auxiliary_loss_clip": 0.06404817, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.0125264, + "epoch": 0.7741470013527731, + "flos": 24724200216960.0, + "grad_norm": 1.663352661776614, + "language_loss": 0.65509927, + "learning_rate": 5.115319631995318e-07, + "loss": 0.73177719, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10345459, + "step": 12876, + "time_per_iteration": 2.5258145332336426 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01266731, + "balance_loss_clip": 0.06269747, + "balance_loss_mlp": 0.01258005, + "epoch": 0.7742071246054412, + "flos": 21878092108800.0, + "grad_norm": 1.7333890551620577, + "language_loss": 0.71176594, + "learning_rate": 5.112718625846433e-07, + "loss": 0.78843695, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08734131, + "step": 12877, + "time_per_iteration": 2.4929704666137695 + }, + { + "auxiliary_loss_clip": 0.06407753, + "auxiliary_loss_mlp": 0.01264403, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254371, + "epoch": 0.7742672478581091, + "flos": 22680815823360.0, + "grad_norm": 1.9764136329910882, + "language_loss": 0.82948673, + "learning_rate": 5.110118184224736e-07, + "loss": 0.90620828, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1003418, + "step": 12878, + "time_per_iteration": 2.502988338470459 + }, + { + "auxiliary_loss_clip": 0.06402762, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06269325, + "balance_loss_mlp": 0.0125531, + "epoch": 0.7743273711107771, + "flos": 18846425134080.0, + "grad_norm": 1.6763538175981627, + "language_loss": 0.73367083, + "learning_rate": 5.10751830722885e-07, + "loss": 0.81035012, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09857178, + "step": 12879, + "time_per_iteration": 2.4705021381378174 + }, + { + "auxiliary_loss_clip": 0.06397247, + "auxiliary_loss_mlp": 0.01265601, + "balance_loss_clip": 0.06268625, + "balance_loss_mlp": 0.01256219, + "epoch": 0.7743874943634451, + "flos": 28736644832640.0, + "grad_norm": 1.5623883440546136, + "language_loss": 0.79838526, + "learning_rate": 5.104918994957364e-07, + "loss": 0.87501371, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09381104, + "step": 12880, + "time_per_iteration": 2.556452989578247 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06267609, + "balance_loss_mlp": 0.01255899, + "epoch": 0.774447617616113, + "flos": 21916344297600.0, + "grad_norm": 1.366667718096845, + "language_loss": 0.70864272, + "learning_rate": 5.102320247508847e-07, + "loss": 0.78529441, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10375977, + "step": 12881, + "time_per_iteration": 2.521993637084961 + }, + { + "auxiliary_loss_clip": 0.06408711, + "auxiliary_loss_mlp": 0.01270141, + "balance_loss_clip": 0.06270668, + "balance_loss_mlp": 0.01258512, + "epoch": 0.774507740868781, + "flos": 19506789561600.0, + "grad_norm": 2.127818654803154, + "language_loss": 0.84771377, + "learning_rate": 5.099722064981832e-07, + "loss": 0.92450231, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11627197, + "step": 12882, + "time_per_iteration": 2.5355141162872314 + }, + { + "auxiliary_loss_clip": 0.06311849, + "auxiliary_loss_mlp": 0.01254336, + "balance_loss_clip": 0.06256157, + "balance_loss_mlp": 0.01253313, + "epoch": 0.774567864121449, + "flos": 59447240622720.0, + "grad_norm": 0.7584667410578986, + "language_loss": 0.60187125, + "learning_rate": 5.097124447474858e-07, + "loss": 0.67753309, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01023102, + "step": 12883, + "time_per_iteration": 3.124359607696533 + }, + { + "auxiliary_loss_clip": 0.06403667, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01255073, + "epoch": 0.774627987374117, + "flos": 13230461733120.0, + "grad_norm": 1.8439274810077488, + "language_loss": 0.72904599, + "learning_rate": 5.094527395086416e-07, + "loss": 0.80574125, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10778809, + "step": 12884, + "time_per_iteration": 2.4965550899505615 + }, + { + "auxiliary_loss_clip": 0.06399918, + "auxiliary_loss_mlp": 0.01266004, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01257301, + "epoch": 0.7746881106267849, + "flos": 21399848530560.0, + "grad_norm": 1.5524278185982343, + "language_loss": 0.81275487, + "learning_rate": 5.091930907914986e-07, + "loss": 0.88941407, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08703613, + "step": 12885, + "time_per_iteration": 2.557429075241089 + }, + { + "auxiliary_loss_clip": 0.06401367, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01254084, + "epoch": 0.7747482338794529, + "flos": 25636355763840.0, + "grad_norm": 1.6694918727870636, + "language_loss": 0.63739854, + "learning_rate": 5.089334986059029e-07, + "loss": 0.71404386, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09088135, + "step": 12886, + "time_per_iteration": 2.5352628231048584 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01262726, + "balance_loss_clip": 0.06271148, + "balance_loss_mlp": 0.01254221, + "epoch": 0.7748083571321208, + "flos": 11551780235520.0, + "grad_norm": 2.0761314412195335, + "language_loss": 0.69713193, + "learning_rate": 5.086739629616987e-07, + "loss": 0.77381551, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.08508301, + "step": 12887, + "time_per_iteration": 3.896411657333374 + }, + { + "auxiliary_loss_clip": 0.06400104, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.0626978, + "balance_loss_mlp": 0.01256036, + "epoch": 0.7748684803847888, + "flos": 19068433326720.0, + "grad_norm": 1.724718840710913, + "language_loss": 0.70770532, + "learning_rate": 5.084144838687275e-07, + "loss": 0.78436053, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09381104, + "step": 12888, + "time_per_iteration": 2.5054144859313965 + }, + { + "auxiliary_loss_clip": 0.06406914, + "auxiliary_loss_mlp": 0.01266857, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256372, + "epoch": 0.7749286036374567, + "flos": 22279705528320.0, + "grad_norm": 1.6247326651931444, + "language_loss": 0.8212378, + "learning_rate": 5.081550613368279e-07, + "loss": 0.89797544, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1048584, + "step": 12889, + "time_per_iteration": 2.503159999847412 + }, + { + "auxiliary_loss_clip": 0.0640256, + "auxiliary_loss_mlp": 0.01267254, + "balance_loss_clip": 0.0627083, + "balance_loss_mlp": 0.01258122, + "epoch": 0.7749887268901248, + "flos": 20198488216320.0, + "grad_norm": 1.8373652721061162, + "language_loss": 0.79928273, + "learning_rate": 5.07895695375838e-07, + "loss": 0.87598085, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09130859, + "step": 12890, + "time_per_iteration": 2.4615426063537598 + }, + { + "auxiliary_loss_clip": 0.06406836, + "auxiliary_loss_mlp": 0.01270493, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01260206, + "epoch": 0.7750488501427927, + "flos": 20343446979840.0, + "grad_norm": 1.6840660181274105, + "language_loss": 0.66623914, + "learning_rate": 5.076363859955932e-07, + "loss": 0.74301237, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1027832, + "step": 12891, + "time_per_iteration": 2.4890570640563965 + }, + { + "auxiliary_loss_clip": 0.06404784, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01255241, + "epoch": 0.7751089733954607, + "flos": 28371229176960.0, + "grad_norm": 1.3810973475198156, + "language_loss": 0.79341507, + "learning_rate": 5.073771332059257e-07, + "loss": 0.87011403, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09881592, + "step": 12892, + "time_per_iteration": 2.5426137447357178 + }, + { + "auxiliary_loss_clip": 0.06410879, + "auxiliary_loss_mlp": 0.01265811, + "balance_loss_clip": 0.06274527, + "balance_loss_mlp": 0.01255273, + "epoch": 0.7751690966481286, + "flos": 16949047680000.0, + "grad_norm": 1.9398212373821864, + "language_loss": 0.67894936, + "learning_rate": 5.071179370166669e-07, + "loss": 0.75571626, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10522461, + "step": 12893, + "time_per_iteration": 2.469115734100342 + }, + { + "auxiliary_loss_clip": 0.06313038, + "auxiliary_loss_mlp": 0.0125019, + "balance_loss_clip": 0.06257471, + "balance_loss_mlp": 0.01248948, + "epoch": 0.7752292199007966, + "flos": 65690179799040.0, + "grad_norm": 0.7899277487406899, + "language_loss": 0.58551872, + "learning_rate": 5.068587974376468e-07, + "loss": 0.66115099, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01241302, + "step": 12894, + "time_per_iteration": 3.1802139282226562 + }, + { + "auxiliary_loss_clip": 0.06405281, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01254637, + "epoch": 0.7752893431534646, + "flos": 20600898249600.0, + "grad_norm": 2.1408661734068697, + "language_loss": 0.78008652, + "learning_rate": 5.065997144786895e-07, + "loss": 0.85679233, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10662842, + "step": 12895, + "time_per_iteration": 2.517387866973877 + }, + { + "auxiliary_loss_clip": 0.06404513, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06271935, + "balance_loss_mlp": 0.01255124, + "epoch": 0.7753494664061326, + "flos": 20491592198400.0, + "grad_norm": 1.7101210231802921, + "language_loss": 0.67742205, + "learning_rate": 5.063406881496209e-07, + "loss": 0.75411844, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10003662, + "step": 12896, + "time_per_iteration": 2.508040428161621 + }, + { + "auxiliary_loss_clip": 0.06401385, + "auxiliary_loss_mlp": 0.01264283, + "balance_loss_clip": 0.06268774, + "balance_loss_mlp": 0.01254717, + "epoch": 0.7754095896588006, + "flos": 20272015774080.0, + "grad_norm": 1.718290101877412, + "language_loss": 0.68828535, + "learning_rate": 5.060817184602629e-07, + "loss": 0.76494199, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09570312, + "step": 12897, + "time_per_iteration": 3.958052158355713 + }, + { + "auxiliary_loss_clip": 0.06406542, + "auxiliary_loss_mlp": 0.01265206, + "balance_loss_clip": 0.06272966, + "balance_loss_mlp": 0.01255074, + "epoch": 0.7754697129114685, + "flos": 23337784160640.0, + "grad_norm": 1.8777545444749013, + "language_loss": 0.75346845, + "learning_rate": 5.058228054204364e-07, + "loss": 0.83018595, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10131836, + "step": 12898, + "time_per_iteration": 2.548725128173828 + }, + { + "auxiliary_loss_clip": 0.06405295, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01253231, + "epoch": 0.7755298361641365, + "flos": 17353344430080.0, + "grad_norm": 2.11113178190308, + "language_loss": 0.70727742, + "learning_rate": 5.055639490399588e-07, + "loss": 0.78396714, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10443115, + "step": 12899, + "time_per_iteration": 2.4659245014190674 + }, + { + "auxiliary_loss_clip": 0.06405385, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06272905, + "balance_loss_mlp": 0.01254266, + "epoch": 0.7755899594168044, + "flos": 19651916033280.0, + "grad_norm": 2.07260093915493, + "language_loss": 0.74897844, + "learning_rate": 5.053051493286453e-07, + "loss": 0.82567799, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10302734, + "step": 12900, + "time_per_iteration": 4.011428117752075 + }, + { + "auxiliary_loss_clip": 0.06400472, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.01255525, + "epoch": 0.7756500826694724, + "flos": 27421324565760.0, + "grad_norm": 1.5623703239819655, + "language_loss": 0.77776372, + "learning_rate": 5.050464062963113e-07, + "loss": 0.85441595, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09234619, + "step": 12901, + "time_per_iteration": 2.551858425140381 + }, + { + "auxiliary_loss_clip": 0.0639973, + "auxiliary_loss_mlp": 0.0126504, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01255289, + "epoch": 0.7757102059221404, + "flos": 28738028424960.0, + "grad_norm": 1.3485417524175327, + "language_loss": 0.77421844, + "learning_rate": 5.047877199527666e-07, + "loss": 0.8508662, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09747314, + "step": 12902, + "time_per_iteration": 2.5616962909698486 + }, + { + "auxiliary_loss_clip": 0.06401799, + "auxiliary_loss_mlp": 0.01266411, + "balance_loss_clip": 0.06270513, + "balance_loss_mlp": 0.01256898, + "epoch": 0.7757703291748084, + "flos": 22492489772160.0, + "grad_norm": 1.8023361426905782, + "language_loss": 0.73515046, + "learning_rate": 5.045290903078215e-07, + "loss": 0.81183261, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09509277, + "step": 12903, + "time_per_iteration": 2.5368919372558594 + }, + { + "auxiliary_loss_clip": 0.06400372, + "auxiliary_loss_mlp": 0.01263703, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01253851, + "epoch": 0.7758304524274763, + "flos": 21435920513280.0, + "grad_norm": 2.3012880989025946, + "language_loss": 0.75830078, + "learning_rate": 5.042705173712835e-07, + "loss": 0.83494151, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09863281, + "step": 12904, + "time_per_iteration": 2.476417064666748 + }, + { + "auxiliary_loss_clip": 0.06397906, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06269727, + "balance_loss_mlp": 0.01256093, + "epoch": 0.7758905756801443, + "flos": 23665953876480.0, + "grad_norm": 1.8947972098454593, + "language_loss": 0.68449861, + "learning_rate": 5.040120011529576e-07, + "loss": 0.76112515, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08654785, + "step": 12905, + "time_per_iteration": 3.922461748123169 + }, + { + "auxiliary_loss_clip": 0.06398395, + "auxiliary_loss_mlp": 0.01266837, + "balance_loss_clip": 0.06270361, + "balance_loss_mlp": 0.0125736, + "epoch": 0.7759506989328122, + "flos": 28372906258560.0, + "grad_norm": 1.53682543204514, + "language_loss": 0.67685688, + "learning_rate": 5.037535416626459e-07, + "loss": 0.75350916, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0947876, + "step": 12906, + "time_per_iteration": 2.5313022136688232 + }, + { + "auxiliary_loss_clip": 0.06400718, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06268603, + "balance_loss_mlp": 0.01257124, + "epoch": 0.7760108221854802, + "flos": 14908053127680.0, + "grad_norm": 2.1235046530395167, + "language_loss": 0.81742978, + "learning_rate": 5.034951389101498e-07, + "loss": 0.8941071, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09887695, + "step": 12907, + "time_per_iteration": 2.4844870567321777 + }, + { + "auxiliary_loss_clip": 0.06399026, + "auxiliary_loss_mlp": 0.01267683, + "balance_loss_clip": 0.06271745, + "balance_loss_mlp": 0.01258584, + "epoch": 0.7760709454381483, + "flos": 14797615046400.0, + "grad_norm": 2.0283728968783006, + "language_loss": 0.67200708, + "learning_rate": 5.032367929052685e-07, + "loss": 0.74867415, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09103394, + "step": 12908, + "time_per_iteration": 2.489652633666992 + }, + { + "auxiliary_loss_clip": 0.06403653, + "auxiliary_loss_mlp": 0.01267977, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01258017, + "epoch": 0.7761310686908162, + "flos": 17384846365440.0, + "grad_norm": 1.5208070969667713, + "language_loss": 0.70563579, + "learning_rate": 5.029785036577976e-07, + "loss": 0.78235209, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09954834, + "step": 12909, + "time_per_iteration": 2.484180450439453 + }, + { + "auxiliary_loss_clip": 0.06401674, + "auxiliary_loss_mlp": 0.01271334, + "balance_loss_clip": 0.06272651, + "balance_loss_mlp": 0.01262208, + "epoch": 0.7761911919434842, + "flos": 25563582892800.0, + "grad_norm": 1.6528787080895593, + "language_loss": 0.68030262, + "learning_rate": 5.027202711775324e-07, + "loss": 0.75703275, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09130859, + "step": 12910, + "time_per_iteration": 2.5219783782958984 + }, + { + "auxiliary_loss_clip": 0.06401049, + "auxiliary_loss_mlp": 0.01265939, + "balance_loss_clip": 0.06268351, + "balance_loss_mlp": 0.01256193, + "epoch": 0.7762513151961521, + "flos": 23185530092160.0, + "grad_norm": 1.572866205055694, + "language_loss": 0.7175374, + "learning_rate": 5.024620954742646e-07, + "loss": 0.79420727, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09747314, + "step": 12911, + "time_per_iteration": 2.533684730529785 + }, + { + "auxiliary_loss_clip": 0.06403443, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 0.06270085, + "balance_loss_mlp": 0.01254651, + "epoch": 0.7763114384488201, + "flos": 21696097040640.0, + "grad_norm": 3.1287600736894867, + "language_loss": 0.63521278, + "learning_rate": 5.022039765577836e-07, + "loss": 0.71189916, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10546875, + "step": 12912, + "time_per_iteration": 2.4713103771209717 + }, + { + "auxiliary_loss_clip": 0.06310222, + "auxiliary_loss_mlp": 0.01256155, + "balance_loss_clip": 0.06254428, + "balance_loss_mlp": 0.012551, + "epoch": 0.776371561701488, + "flos": 69048381335040.0, + "grad_norm": 0.7692138307274686, + "language_loss": 0.53290647, + "learning_rate": 5.019459144378779e-07, + "loss": 0.60857022, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01056671, + "step": 12913, + "time_per_iteration": 3.1764438152313232 + }, + { + "auxiliary_loss_clip": 0.06402822, + "auxiliary_loss_mlp": 0.01263376, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01254495, + "epoch": 0.776431684954156, + "flos": 22900643809920.0, + "grad_norm": 1.5625942669092794, + "language_loss": 0.6230467, + "learning_rate": 5.016879091243338e-07, + "loss": 0.6997087, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.08880615, + "step": 12914, + "time_per_iteration": 2.534447193145752 + }, + { + "auxiliary_loss_clip": 0.06399079, + "auxiliary_loss_mlp": 0.012627, + "balance_loss_clip": 0.06268825, + "balance_loss_mlp": 0.01253259, + "epoch": 0.776491808206824, + "flos": 20266942602240.0, + "grad_norm": 1.633160981645456, + "language_loss": 0.82489586, + "learning_rate": 5.014299606269339e-07, + "loss": 0.9015137, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09436035, + "step": 12915, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01265286, + "balance_loss_clip": 0.06268285, + "balance_loss_mlp": 0.01255266, + "epoch": 0.776551931459492, + "flos": 26766033310080.0, + "grad_norm": 1.7528109604711235, + "language_loss": 0.74837983, + "learning_rate": 5.011720689554603e-07, + "loss": 0.82507014, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10021973, + "step": 12916, + "time_per_iteration": 2.5818369388580322 + }, + { + "auxiliary_loss_clip": 0.06402493, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06269188, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7766120547121599, + "flos": 52676583960960.0, + "grad_norm": 1.4770261011777261, + "language_loss": 0.65460002, + "learning_rate": 5.009142341196919e-07, + "loss": 0.73126698, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09924316, + "step": 12917, + "time_per_iteration": 2.776418924331665 + }, + { + "auxiliary_loss_clip": 0.06402885, + "auxiliary_loss_mlp": 0.01264757, + "balance_loss_clip": 0.06269239, + "balance_loss_mlp": 0.0125522, + "epoch": 0.7766721779648279, + "flos": 25163353065600.0, + "grad_norm": 1.489121757644636, + "language_loss": 0.6467213, + "learning_rate": 5.006564561294065e-07, + "loss": 0.72339773, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09533691, + "step": 12918, + "time_per_iteration": 2.5809319019317627 + }, + { + "auxiliary_loss_clip": 0.06400011, + "auxiliary_loss_mlp": 0.01265679, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01256792, + "epoch": 0.7767323012174958, + "flos": 23766161760000.0, + "grad_norm": 2.1752593632817425, + "language_loss": 0.73467445, + "learning_rate": 5.003987349943777e-07, + "loss": 0.81133133, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08886719, + "step": 12919, + "time_per_iteration": 2.498762369155884 + }, + { + "auxiliary_loss_clip": 0.06403969, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06270312, + "balance_loss_mlp": 0.0125626, + "epoch": 0.7767924244701638, + "flos": 22092469580160.0, + "grad_norm": 1.6453382869225388, + "language_loss": 0.79804212, + "learning_rate": 5.001410707243792e-07, + "loss": 0.87474561, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10113525, + "step": 12920, + "time_per_iteration": 2.5327045917510986 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06271371, + "balance_loss_mlp": 0.012561, + "epoch": 0.7768525477228319, + "flos": 21988194773760.0, + "grad_norm": 1.540123297700945, + "language_loss": 0.71420145, + "learning_rate": 4.998834633291829e-07, + "loss": 0.79092473, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09490967, + "step": 12921, + "time_per_iteration": 2.493539333343506 + }, + { + "auxiliary_loss_clip": 0.06407829, + "auxiliary_loss_mlp": 0.01268758, + "balance_loss_clip": 0.06272625, + "balance_loss_mlp": 0.01258643, + "epoch": 0.7769126709754998, + "flos": 21800329920000.0, + "grad_norm": 1.5870112514861305, + "language_loss": 0.764503, + "learning_rate": 4.996259128185547e-07, + "loss": 0.8412689, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10113525, + "step": 12922, + "time_per_iteration": 2.664897918701172 + }, + { + "auxiliary_loss_clip": 0.06402089, + "auxiliary_loss_mlp": 0.01264843, + "balance_loss_clip": 0.06270384, + "balance_loss_mlp": 0.01254853, + "epoch": 0.7769727942281678, + "flos": 20054242212480.0, + "grad_norm": 2.0384511748654286, + "language_loss": 0.80950773, + "learning_rate": 4.993684192022625e-07, + "loss": 0.88617706, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09991455, + "step": 12923, + "time_per_iteration": 2.4884073734283447 + }, + { + "auxiliary_loss_clip": 0.06402602, + "auxiliary_loss_mlp": 0.01263266, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7770329174808357, + "flos": 21692784804480.0, + "grad_norm": 1.8529148039982746, + "language_loss": 0.92405283, + "learning_rate": 4.991109824900699e-07, + "loss": 1.00071156, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09356689, + "step": 12924, + "time_per_iteration": 2.52184796333313 + }, + { + "auxiliary_loss_clip": 0.06402275, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01254804, + "epoch": 0.7770930407335037, + "flos": 25856477239680.0, + "grad_norm": 1.997586908265186, + "language_loss": 0.66484189, + "learning_rate": 4.988536026917401e-07, + "loss": 0.74150878, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09606934, + "step": 12925, + "time_per_iteration": 2.528657913208008 + }, + { + "auxiliary_loss_clip": 0.06409155, + "auxiliary_loss_mlp": 0.01270758, + "balance_loss_clip": 0.06273882, + "balance_loss_mlp": 0.01261019, + "epoch": 0.7771531639861716, + "flos": 24353921024640.0, + "grad_norm": 1.7055491864849242, + "language_loss": 0.72285664, + "learning_rate": 4.985962798170314e-07, + "loss": 0.7996558, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09729004, + "step": 12926, + "time_per_iteration": 2.529508352279663 + }, + { + "auxiliary_loss_clip": 0.06404512, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06270072, + "balance_loss_mlp": 0.01253914, + "epoch": 0.7772132872388396, + "flos": 25637068523520.0, + "grad_norm": 1.8006607912850339, + "language_loss": 0.65851128, + "learning_rate": 4.983390138757027e-07, + "loss": 0.73519599, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10046387, + "step": 12927, + "time_per_iteration": 3.9577128887176514 + }, + { + "auxiliary_loss_clip": 0.06403954, + "auxiliary_loss_mlp": 0.01268877, + "balance_loss_clip": 0.06270983, + "balance_loss_mlp": 0.01258607, + "epoch": 0.7772734104915076, + "flos": 26074544290560.0, + "grad_norm": 2.5615945281545147, + "language_loss": 0.72538382, + "learning_rate": 4.980818048775093e-07, + "loss": 0.8021121, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1026001, + "step": 12928, + "time_per_iteration": 2.524092197418213 + }, + { + "auxiliary_loss_clip": 0.06398363, + "auxiliary_loss_mlp": 0.0126847, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.0125935, + "epoch": 0.7773335337441756, + "flos": 22930887934080.0, + "grad_norm": 1.7899805445519197, + "language_loss": 0.74762726, + "learning_rate": 4.978246528322036e-07, + "loss": 0.82429558, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09118652, + "step": 12929, + "time_per_iteration": 2.50419282913208 + }, + { + "auxiliary_loss_clip": 0.06401908, + "auxiliary_loss_mlp": 0.01268664, + "balance_loss_clip": 0.06269601, + "balance_loss_mlp": 0.01258871, + "epoch": 0.7773936569968435, + "flos": 20782977171840.0, + "grad_norm": 1.7754986557966836, + "language_loss": 0.77492833, + "learning_rate": 4.975675577495377e-07, + "loss": 0.85163409, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09796143, + "step": 12930, + "time_per_iteration": 2.5014841556549072 + }, + { + "auxiliary_loss_clip": 0.06403639, + "auxiliary_loss_mlp": 0.01265185, + "balance_loss_clip": 0.06271214, + "balance_loss_mlp": 0.01255291, + "epoch": 0.7774537802495115, + "flos": 20377883808000.0, + "grad_norm": 1.923217497642762, + "language_loss": 0.80022055, + "learning_rate": 4.973105196392613e-07, + "loss": 0.87690878, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09893799, + "step": 12931, + "time_per_iteration": 2.479499340057373 + }, + { + "auxiliary_loss_clip": 0.06306946, + "auxiliary_loss_mlp": 0.0125312, + "balance_loss_clip": 0.06251584, + "balance_loss_mlp": 0.01252035, + "epoch": 0.7775139035021794, + "flos": 53930981980800.0, + "grad_norm": 0.7888811218125162, + "language_loss": 0.59670961, + "learning_rate": 4.970535385111199e-07, + "loss": 0.67231035, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01087189, + "step": 12932, + "time_per_iteration": 3.131812810897827 + }, + { + "auxiliary_loss_clip": 0.06405772, + "auxiliary_loss_mlp": 0.01263803, + "balance_loss_clip": 0.06271382, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7775740267548474, + "flos": 28850437077120.0, + "grad_norm": 1.493641616196245, + "language_loss": 0.76082242, + "learning_rate": 4.967966143748595e-07, + "loss": 0.83751822, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09436035, + "step": 12933, + "time_per_iteration": 2.657081127166748 + }, + { + "auxiliary_loss_clip": 0.06403433, + "auxiliary_loss_mlp": 0.01262442, + "balance_loss_clip": 0.06271302, + "balance_loss_mlp": 0.01252077, + "epoch": 0.7776341500075155, + "flos": 21879056430720.0, + "grad_norm": 1.8678224067901799, + "language_loss": 0.73828089, + "learning_rate": 4.965397472402215e-07, + "loss": 0.81493968, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10369873, + "step": 12934, + "time_per_iteration": 2.514028549194336 + }, + { + "auxiliary_loss_clip": 0.06404053, + "auxiliary_loss_mlp": 0.01265488, + "balance_loss_clip": 0.06270254, + "balance_loss_mlp": 0.01255468, + "epoch": 0.7776942732601834, + "flos": 20236027645440.0, + "grad_norm": 1.899249869710296, + "language_loss": 0.70498896, + "learning_rate": 4.962829371169475e-07, + "loss": 0.78168434, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10009766, + "step": 12935, + "time_per_iteration": 2.5094125270843506 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01256333, + "epoch": 0.7777543965128514, + "flos": 22237554124800.0, + "grad_norm": 1.4942918595564652, + "language_loss": 0.83564198, + "learning_rate": 4.960261840147746e-07, + "loss": 0.91237354, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09661865, + "step": 12936, + "time_per_iteration": 2.4796142578125 + }, + { + "auxiliary_loss_clip": 0.0640949, + "auxiliary_loss_mlp": 0.0126322, + "balance_loss_clip": 0.06271779, + "balance_loss_mlp": 0.01254202, + "epoch": 0.7778145197655193, + "flos": 14507236321920.0, + "grad_norm": 1.7034390365737724, + "language_loss": 0.67389679, + "learning_rate": 4.957694879434397e-07, + "loss": 0.75062388, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09020996, + "step": 12937, + "time_per_iteration": 3.914120674133301 + }, + { + "auxiliary_loss_clip": 0.06402509, + "auxiliary_loss_mlp": 0.01264387, + "balance_loss_clip": 0.06269647, + "balance_loss_mlp": 0.01254928, + "epoch": 0.7778746430181873, + "flos": 21146338402560.0, + "grad_norm": 1.4641946456132704, + "language_loss": 0.87061489, + "learning_rate": 4.955128489126777e-07, + "loss": 0.94728386, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09460449, + "step": 12938, + "time_per_iteration": 2.494309663772583 + }, + { + "auxiliary_loss_clip": 0.06401877, + "auxiliary_loss_mlp": 0.01265878, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01255972, + "epoch": 0.7779347662708552, + "flos": 20272560825600.0, + "grad_norm": 1.9237142576123536, + "language_loss": 0.8554709, + "learning_rate": 4.95256266932218e-07, + "loss": 0.93214846, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09906006, + "step": 12939, + "time_per_iteration": 2.4730064868927 + }, + { + "auxiliary_loss_clip": 0.06398107, + "auxiliary_loss_mlp": 0.01265311, + "balance_loss_clip": 0.0626917, + "balance_loss_mlp": 0.01256084, + "epoch": 0.7779948895235232, + "flos": 19215153025920.0, + "grad_norm": 1.7540702962563577, + "language_loss": 0.69412231, + "learning_rate": 4.949997420117915e-07, + "loss": 0.77075648, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09228516, + "step": 12940, + "time_per_iteration": 3.918668270111084 + }, + { + "auxiliary_loss_clip": 0.064026, + "auxiliary_loss_mlp": 0.01265044, + "balance_loss_clip": 0.06269296, + "balance_loss_mlp": 0.01255627, + "epoch": 0.7780550127761912, + "flos": 23921476502400.0, + "grad_norm": 4.631352047296881, + "language_loss": 0.77788246, + "learning_rate": 4.947432741611255e-07, + "loss": 0.85455894, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09423828, + "step": 12941, + "time_per_iteration": 2.5110888481140137 + }, + { + "auxiliary_loss_clip": 0.06410088, + "auxiliary_loss_mlp": 0.01268786, + "balance_loss_clip": 0.06272246, + "balance_loss_mlp": 0.01257813, + "epoch": 0.7781151360288592, + "flos": 32424148114560.0, + "grad_norm": 2.2460397891674697, + "language_loss": 0.73285127, + "learning_rate": 4.944868633899462e-07, + "loss": 0.80964005, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10974121, + "step": 12942, + "time_per_iteration": 2.5817012786865234 + }, + { + "auxiliary_loss_clip": 0.06399062, + "auxiliary_loss_mlp": 0.01266209, + "balance_loss_clip": 0.06270151, + "balance_loss_mlp": 0.01257239, + "epoch": 0.7781752592815271, + "flos": 22352981523840.0, + "grad_norm": 1.9559350984473978, + "language_loss": 0.68287194, + "learning_rate": 4.942305097079751e-07, + "loss": 0.75952458, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08972168, + "step": 12943, + "time_per_iteration": 2.4933464527130127 + }, + { + "auxiliary_loss_clip": 0.06304064, + "auxiliary_loss_mlp": 0.01250725, + "balance_loss_clip": 0.06248597, + "balance_loss_mlp": 0.01249737, + "epoch": 0.7782353825341951, + "flos": 70479101802240.0, + "grad_norm": 0.7622073777913676, + "language_loss": 0.58524758, + "learning_rate": 4.939742131249347e-07, + "loss": 0.66079545, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00987244, + "step": 12944, + "time_per_iteration": 3.2943570613861084 + }, + { + "auxiliary_loss_clip": 0.0640593, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01255495, + "epoch": 0.778295505786863, + "flos": 19068601034880.0, + "grad_norm": 1.9954002249316443, + "language_loss": 0.68333346, + "learning_rate": 4.937179736505428e-07, + "loss": 0.76005256, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10491943, + "step": 12945, + "time_per_iteration": 3.963608741760254 + }, + { + "auxiliary_loss_clip": 0.06401619, + "auxiliary_loss_mlp": 0.01263231, + "balance_loss_clip": 0.06268932, + "balance_loss_mlp": 0.01253837, + "epoch": 0.778355629039531, + "flos": 21006662446080.0, + "grad_norm": 2.4482608319638404, + "language_loss": 0.69179362, + "learning_rate": 4.93461791294516e-07, + "loss": 0.76844209, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09387207, + "step": 12946, + "time_per_iteration": 2.528555393218994 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06268816, + "balance_loss_mlp": 0.01254328, + "epoch": 0.7784157522921991, + "flos": 21404586286080.0, + "grad_norm": 1.63285369155658, + "language_loss": 0.65319461, + "learning_rate": 4.932056660665689e-07, + "loss": 0.72985911, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09783936, + "step": 12947, + "time_per_iteration": 2.533308744430542 + }, + { + "auxiliary_loss_clip": 0.06402348, + "auxiliary_loss_mlp": 0.01262916, + "balance_loss_clip": 0.06270808, + "balance_loss_mlp": 0.01253499, + "epoch": 0.778475875544867, + "flos": 20820181184640.0, + "grad_norm": 1.87438794738079, + "language_loss": 0.65581381, + "learning_rate": 4.929495979764147e-07, + "loss": 0.73246646, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 12948, + "time_per_iteration": 2.5082039833068848 + }, + { + "auxiliary_loss_clip": 0.0640206, + "auxiliary_loss_mlp": 0.01261972, + "balance_loss_clip": 0.06271663, + "balance_loss_mlp": 0.01252078, + "epoch": 0.778535998797535, + "flos": 14360516622720.0, + "grad_norm": 1.7911059027184133, + "language_loss": 0.75669527, + "learning_rate": 4.926935870337625e-07, + "loss": 0.83333564, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09893799, + "step": 12949, + "time_per_iteration": 2.499680519104004 + }, + { + "auxiliary_loss_clip": 0.06407519, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06271057, + "balance_loss_mlp": 0.01255871, + "epoch": 0.7785961220502029, + "flos": 19215781931520.0, + "grad_norm": 1.2917746110021882, + "language_loss": 0.69081604, + "learning_rate": 4.924376332483202e-07, + "loss": 0.7675575, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10760498, + "step": 12950, + "time_per_iteration": 2.4793641567230225 + }, + { + "auxiliary_loss_clip": 0.06404532, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 0.06268837, + "balance_loss_mlp": 0.01256307, + "epoch": 0.7786562453028709, + "flos": 25745787596160.0, + "grad_norm": 1.5705407772733666, + "language_loss": 0.72314119, + "learning_rate": 4.921817366297938e-07, + "loss": 0.79984468, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09509277, + "step": 12951, + "time_per_iteration": 2.533123731613159 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01262496, + "balance_loss_clip": 0.06272363, + "balance_loss_mlp": 0.01252238, + "epoch": 0.7787163685555388, + "flos": 25746584209920.0, + "grad_norm": 1.6880059510178558, + "language_loss": 0.65866429, + "learning_rate": 4.919258971878877e-07, + "loss": 0.73532021, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1026001, + "step": 12952, + "time_per_iteration": 2.5218706130981445 + }, + { + "auxiliary_loss_clip": 0.06394114, + "auxiliary_loss_mlp": 0.01264734, + "balance_loss_clip": 0.06268984, + "balance_loss_mlp": 0.01256032, + "epoch": 0.7787764918082068, + "flos": 22754385308160.0, + "grad_norm": 2.055033459437186, + "language_loss": 0.81612301, + "learning_rate": 4.916701149323022e-07, + "loss": 0.89271152, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08709717, + "step": 12953, + "time_per_iteration": 2.5306200981140137 + }, + { + "auxiliary_loss_clip": 0.06410024, + "auxiliary_loss_mlp": 0.01264944, + "balance_loss_clip": 0.06273989, + "balance_loss_mlp": 0.01254972, + "epoch": 0.7788366150608748, + "flos": 15195538886400.0, + "grad_norm": 1.8925370756412514, + "language_loss": 0.76971662, + "learning_rate": 4.91414389872737e-07, + "loss": 0.8464663, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09960938, + "step": 12954, + "time_per_iteration": 2.4636683464050293 + }, + { + "auxiliary_loss_clip": 0.0640775, + "auxiliary_loss_mlp": 0.01263138, + "balance_loss_clip": 0.06270479, + "balance_loss_mlp": 0.01253369, + "epoch": 0.7788967383135428, + "flos": 21215799037440.0, + "grad_norm": 1.4850490788267763, + "language_loss": 0.7292642, + "learning_rate": 4.911587220188905e-07, + "loss": 0.80597305, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09765625, + "step": 12955, + "time_per_iteration": 2.4956090450286865 + }, + { + "auxiliary_loss_clip": 0.06403288, + "auxiliary_loss_mlp": 0.01263998, + "balance_loss_clip": 0.06270338, + "balance_loss_mlp": 0.01253973, + "epoch": 0.7789568615662107, + "flos": 21688340538240.0, + "grad_norm": 1.3614080537003919, + "language_loss": 0.68852103, + "learning_rate": 4.909031113804551e-07, + "loss": 0.76519388, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10021973, + "step": 12956, + "time_per_iteration": 2.5246806144714355 + }, + { + "auxiliary_loss_clip": 0.06403255, + "auxiliary_loss_mlp": 0.01262407, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01252864, + "epoch": 0.7790169848188787, + "flos": 26367732126720.0, + "grad_norm": 1.5408189512052117, + "language_loss": 0.7640478, + "learning_rate": 4.906475579671252e-07, + "loss": 0.84070438, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09539795, + "step": 12957, + "time_per_iteration": 2.560433864593506 + }, + { + "auxiliary_loss_clip": 0.06402086, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06269553, + "balance_loss_mlp": 0.01255407, + "epoch": 0.7790771080715466, + "flos": 25522563519360.0, + "grad_norm": 1.6277364892308188, + "language_loss": 0.77872479, + "learning_rate": 4.903920617885917e-07, + "loss": 0.85539794, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0982666, + "step": 12958, + "time_per_iteration": 2.5132603645324707 + }, + { + "auxiliary_loss_clip": 0.06403212, + "auxiliary_loss_mlp": 0.0126808, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01257995, + "epoch": 0.7791372313242146, + "flos": 16039701244800.0, + "grad_norm": 2.1750549436439295, + "language_loss": 0.71726602, + "learning_rate": 4.901366228545418e-07, + "loss": 0.79397893, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10076904, + "step": 12959, + "time_per_iteration": 2.4766464233398438 + }, + { + "auxiliary_loss_clip": 0.06403412, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06269655, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7791973545768827, + "flos": 23849039047680.0, + "grad_norm": 1.6457903967738072, + "language_loss": 0.77779013, + "learning_rate": 4.898812411746632e-07, + "loss": 0.8544842, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09783936, + "step": 12960, + "time_per_iteration": 2.5057005882263184 + }, + { + "auxiliary_loss_clip": 0.06403294, + "auxiliary_loss_mlp": 0.01269347, + "balance_loss_clip": 0.06269927, + "balance_loss_mlp": 0.0125934, + "epoch": 0.7792574778295506, + "flos": 24174902776320.0, + "grad_norm": 1.862849792327091, + "language_loss": 0.75439703, + "learning_rate": 4.896259167586385e-07, + "loss": 0.83112347, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10003662, + "step": 12961, + "time_per_iteration": 2.523517608642578 + }, + { + "auxiliary_loss_clip": 0.06400951, + "auxiliary_loss_mlp": 0.01266276, + "balance_loss_clip": 0.06274296, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7793176010822186, + "flos": 21470399268480.0, + "grad_norm": 1.5483353660342332, + "language_loss": 0.73957908, + "learning_rate": 4.893706496161511e-07, + "loss": 0.81625128, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08679199, + "step": 12962, + "time_per_iteration": 2.498566150665283 + }, + { + "auxiliary_loss_clip": 0.06398464, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06269018, + "balance_loss_mlp": 0.01255012, + "epoch": 0.7793777243348865, + "flos": 20672790652800.0, + "grad_norm": 1.8192572691514057, + "language_loss": 0.70224059, + "learning_rate": 4.891154397568795e-07, + "loss": 0.77886856, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09326172, + "step": 12963, + "time_per_iteration": 2.507917881011963 + }, + { + "auxiliary_loss_clip": 0.06401575, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01254805, + "epoch": 0.7794378475875545, + "flos": 27133126047360.0, + "grad_norm": 1.5815995663676223, + "language_loss": 0.63879544, + "learning_rate": 4.888602871905019e-07, + "loss": 0.71545374, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09460449, + "step": 12964, + "time_per_iteration": 2.52024245262146 + }, + { + "auxiliary_loss_clip": 0.06404367, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01254622, + "epoch": 0.7794979708402224, + "flos": 28081605139200.0, + "grad_norm": 1.6072168370659738, + "language_loss": 0.76559496, + "learning_rate": 4.88605191926694e-07, + "loss": 0.84228694, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10211182, + "step": 12965, + "time_per_iteration": 2.5686237812042236 + }, + { + "auxiliary_loss_clip": 0.06394182, + "auxiliary_loss_mlp": 0.01263131, + "balance_loss_clip": 0.06269042, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7795580940928905, + "flos": 26876722953600.0, + "grad_norm": 1.5862680415926609, + "language_loss": 0.72998363, + "learning_rate": 4.883501539751289e-07, + "loss": 0.80655676, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08703613, + "step": 12966, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.06398065, + "auxiliary_loss_mlp": 0.01262043, + "balance_loss_clip": 0.06270934, + "balance_loss_mlp": 0.01253323, + "epoch": 0.7796182173455584, + "flos": 23841072910080.0, + "grad_norm": 1.47410798363511, + "language_loss": 0.74184883, + "learning_rate": 4.880951733454768e-07, + "loss": 0.81844991, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08721924, + "step": 12967, + "time_per_iteration": 3.9195239543914795 + }, + { + "auxiliary_loss_clip": 0.06406528, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253462, + "epoch": 0.7796783405982264, + "flos": 19798384170240.0, + "grad_norm": 2.482748311118984, + "language_loss": 0.72366989, + "learning_rate": 4.878402500474073e-07, + "loss": 0.80036128, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09155273, + "step": 12968, + "time_per_iteration": 2.5332348346710205 + }, + { + "auxiliary_loss_clip": 0.06398027, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 0.0626802, + "balance_loss_mlp": 0.01259249, + "epoch": 0.7797384638508943, + "flos": 15455589632640.0, + "grad_norm": 1.8161833543427846, + "language_loss": 0.61633801, + "learning_rate": 4.875853840905874e-07, + "loss": 0.69300812, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09735107, + "step": 12969, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.06398109, + "auxiliary_loss_mlp": 0.0126421, + "balance_loss_clip": 0.06271819, + "balance_loss_mlp": 0.01255651, + "epoch": 0.7797985871035623, + "flos": 20928984111360.0, + "grad_norm": 1.617507688823146, + "language_loss": 0.70254469, + "learning_rate": 4.873305754846811e-07, + "loss": 0.77916789, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08563232, + "step": 12970, + "time_per_iteration": 2.510071039199829 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.01266712, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01256901, + "epoch": 0.7798587103562302, + "flos": 36945667411200.0, + "grad_norm": 1.5338115729729769, + "language_loss": 0.72291183, + "learning_rate": 4.870758242393507e-07, + "loss": 0.79961598, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09814453, + "step": 12971, + "time_per_iteration": 2.654513359069824 + }, + { + "auxiliary_loss_clip": 0.06410386, + "auxiliary_loss_mlp": 0.01266468, + "balance_loss_clip": 0.06272395, + "balance_loss_mlp": 0.01256174, + "epoch": 0.7799188336088982, + "flos": 22425880176000.0, + "grad_norm": 1.7218916493252936, + "language_loss": 0.74606651, + "learning_rate": 4.868211303642578e-07, + "loss": 0.82283497, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10290527, + "step": 12972, + "time_per_iteration": 2.517273187637329 + }, + { + "auxiliary_loss_clip": 0.06402341, + "auxiliary_loss_mlp": 0.01263993, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01254146, + "epoch": 0.7799789568615663, + "flos": 18886522112640.0, + "grad_norm": 2.215385328919691, + "language_loss": 0.71494085, + "learning_rate": 4.865664938690584e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12973, + "time_per_iteration": 2.472104549407959 + }, + { + "auxiliary_loss_clip": 0.06400935, + "auxiliary_loss_mlp": 0.01265477, + "balance_loss_clip": 0.0627044, + "balance_loss_mlp": 0.01256435, + "epoch": 0.7800390801142342, + "flos": 20267781143040.0, + "grad_norm": 1.7807969698368138, + "language_loss": 0.78121793, + "learning_rate": 4.863119147634089e-07, + "loss": 0.85788202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09039307, + "step": 12974, + "time_per_iteration": 2.4978132247924805 + }, + { + "auxiliary_loss_clip": 0.06402993, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01255313, + "epoch": 0.7800992033669022, + "flos": 16695831041280.0, + "grad_norm": 1.52512308426482, + "language_loss": 0.6983875, + "learning_rate": 4.86057393056964e-07, + "loss": 0.77506667, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09619141, + "step": 12975, + "time_per_iteration": 2.4792943000793457 + }, + { + "auxiliary_loss_clip": 0.06404307, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06273738, + "balance_loss_mlp": 0.01256703, + "epoch": 0.7801593266195701, + "flos": 18590650945920.0, + "grad_norm": 2.5885152450409654, + "language_loss": 0.82135439, + "learning_rate": 4.858029287593739e-07, + "loss": 0.89805579, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 12976, + "time_per_iteration": 3.9093782901763916 + }, + { + "auxiliary_loss_clip": 0.06403226, + "auxiliary_loss_mlp": 0.01266163, + "balance_loss_clip": 0.06269425, + "balance_loss_mlp": 0.01256299, + "epoch": 0.7802194498722381, + "flos": 25492193614080.0, + "grad_norm": 1.298093609119966, + "language_loss": 0.66121942, + "learning_rate": 4.85548521880289e-07, + "loss": 0.73791331, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09857178, + "step": 12977, + "time_per_iteration": 2.5382373332977295 + }, + { + "auxiliary_loss_clip": 0.06398032, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06268156, + "balance_loss_mlp": 0.01256293, + "epoch": 0.780279573124906, + "flos": 31184451757440.0, + "grad_norm": 1.3843135589513191, + "language_loss": 0.74921417, + "learning_rate": 4.852941724293554e-07, + "loss": 0.82584947, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09204102, + "step": 12978, + "time_per_iteration": 2.5999321937561035 + }, + { + "auxiliary_loss_clip": 0.0640787, + "auxiliary_loss_mlp": 0.01263935, + "balance_loss_clip": 0.06272239, + "balance_loss_mlp": 0.01253529, + "epoch": 0.780339696377574, + "flos": 26951466395520.0, + "grad_norm": 1.7189824497298882, + "language_loss": 0.6233561, + "learning_rate": 4.85039880416219e-07, + "loss": 0.70007408, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10406494, + "step": 12979, + "time_per_iteration": 4.002735137939453 + }, + { + "auxiliary_loss_clip": 0.0640031, + "auxiliary_loss_mlp": 0.01264611, + "balance_loss_clip": 0.06269379, + "balance_loss_mlp": 0.01255163, + "epoch": 0.780399819630242, + "flos": 27963662117760.0, + "grad_norm": 1.7958108111348887, + "language_loss": 0.77048111, + "learning_rate": 4.847856458505217e-07, + "loss": 0.8471303, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09454346, + "step": 12980, + "time_per_iteration": 2.574740171432495 + }, + { + "auxiliary_loss_clip": 0.06404287, + "auxiliary_loss_mlp": 0.0126621, + "balance_loss_clip": 0.06269396, + "balance_loss_mlp": 0.01256941, + "epoch": 0.78045994288291, + "flos": 22492489772160.0, + "grad_norm": 7.38729106022631, + "language_loss": 0.77965951, + "learning_rate": 4.845314687419046e-07, + "loss": 0.85636449, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09259033, + "step": 12981, + "time_per_iteration": 2.6090612411499023 + }, + { + "auxiliary_loss_clip": 0.06406559, + "auxiliary_loss_mlp": 0.01273892, + "balance_loss_clip": 0.0627367, + "balance_loss_mlp": 0.01264642, + "epoch": 0.7805200661355779, + "flos": 20857259416320.0, + "grad_norm": 1.7019427662247137, + "language_loss": 0.72918165, + "learning_rate": 4.842773491000067e-07, + "loss": 0.80598617, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09246826, + "step": 12982, + "time_per_iteration": 2.538454294204712 + }, + { + "auxiliary_loss_clip": 0.06401584, + "auxiliary_loss_mlp": 0.01261641, + "balance_loss_clip": 0.06268401, + "balance_loss_mlp": 0.01251932, + "epoch": 0.7805801893882459, + "flos": 25673014725120.0, + "grad_norm": 1.3557046111100475, + "language_loss": 0.73713994, + "learning_rate": 4.840232869344636e-07, + "loss": 0.8137722, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0970459, + "step": 12983, + "time_per_iteration": 2.55915904045105 + }, + { + "auxiliary_loss_clip": 0.06403306, + "auxiliary_loss_mlp": 0.01265365, + "balance_loss_clip": 0.06270759, + "balance_loss_mlp": 0.0125584, + "epoch": 0.7806403126409138, + "flos": 11332581154560.0, + "grad_norm": 1.8511733827062056, + "language_loss": 0.7564944, + "learning_rate": 4.837692822549086e-07, + "loss": 0.83318114, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09521484, + "step": 12984, + "time_per_iteration": 3.9226207733154297 + }, + { + "auxiliary_loss_clip": 0.06401315, + "auxiliary_loss_mlp": 0.01261166, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01252345, + "epoch": 0.7807004358935818, + "flos": 19579478578560.0, + "grad_norm": 1.6909183647734616, + "language_loss": 0.81444597, + "learning_rate": 4.835153350709746e-07, + "loss": 0.89107084, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08831787, + "step": 12985, + "time_per_iteration": 2.495833396911621 + }, + { + "auxiliary_loss_clip": 0.06404648, + "auxiliary_loss_mlp": 0.01270247, + "balance_loss_clip": 0.06273016, + "balance_loss_mlp": 0.01260007, + "epoch": 0.7807605591462499, + "flos": 19141918957440.0, + "grad_norm": 1.5866346872788593, + "language_loss": 0.7735818, + "learning_rate": 4.832614453922915e-07, + "loss": 0.85033077, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10235596, + "step": 12986, + "time_per_iteration": 2.4942498207092285 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01262193, + "balance_loss_clip": 0.06271936, + "balance_loss_mlp": 0.01252829, + "epoch": 0.7808206823989178, + "flos": 32382038638080.0, + "grad_norm": 1.540132157025115, + "language_loss": 0.74469846, + "learning_rate": 4.830076132284859e-07, + "loss": 0.82136583, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09375, + "step": 12987, + "time_per_iteration": 2.6014459133148193 + }, + { + "auxiliary_loss_clip": 0.06307278, + "auxiliary_loss_mlp": 0.01248897, + "balance_loss_clip": 0.06251733, + "balance_loss_mlp": 0.01247845, + "epoch": 0.7808808056515858, + "flos": 55070512381440.0, + "grad_norm": 0.7358853994181496, + "language_loss": 0.55100733, + "learning_rate": 4.82753838589184e-07, + "loss": 0.62656909, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01052094, + "step": 12988, + "time_per_iteration": 3.1363513469696045 + }, + { + "auxiliary_loss_clip": 0.06395964, + "auxiliary_loss_mlp": 0.01273063, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01264235, + "epoch": 0.7809409289042537, + "flos": 12864375244800.0, + "grad_norm": 2.503136362743708, + "language_loss": 0.80932319, + "learning_rate": 4.82500121484009e-07, + "loss": 0.88601345, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08831787, + "step": 12989, + "time_per_iteration": 2.4550793170928955 + }, + { + "auxiliary_loss_clip": 0.06397895, + "auxiliary_loss_mlp": 0.0126169, + "balance_loss_clip": 0.06268378, + "balance_loss_mlp": 0.0125269, + "epoch": 0.7810010521569217, + "flos": 21693329856000.0, + "grad_norm": 1.5548108351785217, + "language_loss": 0.70569479, + "learning_rate": 4.822464619225806e-07, + "loss": 0.78229064, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09002686, + "step": 12990, + "time_per_iteration": 2.534583330154419 + }, + { + "auxiliary_loss_clip": 0.064027, + "auxiliary_loss_mlp": 0.01265995, + "balance_loss_clip": 0.06270639, + "balance_loss_mlp": 0.01255666, + "epoch": 0.7810611754095896, + "flos": 16761560169600.0, + "grad_norm": 2.151540581159162, + "language_loss": 0.78160757, + "learning_rate": 4.819928599145184e-07, + "loss": 0.85829455, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10327148, + "step": 12991, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.06403095, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01257071, + "epoch": 0.7811212986622577, + "flos": 43517489063040.0, + "grad_norm": 1.4386933089332317, + "language_loss": 0.66202235, + "learning_rate": 4.817393154694398e-07, + "loss": 0.73872924, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10528564, + "step": 12992, + "time_per_iteration": 2.712284564971924 + }, + { + "auxiliary_loss_clip": 0.06407847, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01254373, + "epoch": 0.7811814219149256, + "flos": 21763377469440.0, + "grad_norm": 1.666565007875902, + "language_loss": 0.61892599, + "learning_rate": 4.814858285969578e-07, + "loss": 0.69564325, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09503174, + "step": 12993, + "time_per_iteration": 2.4966509342193604 + }, + { + "auxiliary_loss_clip": 0.06400012, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 0.06270296, + "balance_loss_mlp": 0.01252532, + "epoch": 0.7812415451675936, + "flos": 24068447763840.0, + "grad_norm": 1.3952221037257373, + "language_loss": 0.68836015, + "learning_rate": 4.812323993066862e-07, + "loss": 0.76498109, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09545898, + "step": 12994, + "time_per_iteration": 2.536137819290161 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06273837, + "balance_loss_mlp": 0.01254703, + "epoch": 0.7813016684202615, + "flos": 18995744309760.0, + "grad_norm": 1.7501216946691078, + "language_loss": 0.69363022, + "learning_rate": 4.809790276082335e-07, + "loss": 0.77031708, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09039307, + "step": 12995, + "time_per_iteration": 2.470670700073242 + }, + { + "auxiliary_loss_clip": 0.06396692, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06268929, + "balance_loss_mlp": 0.0125644, + "epoch": 0.7813617916729295, + "flos": 25267124747520.0, + "grad_norm": 1.5705022516303782, + "language_loss": 0.75361514, + "learning_rate": 4.807257135112088e-07, + "loss": 0.83023554, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08905029, + "step": 12996, + "time_per_iteration": 2.548156261444092 + }, + { + "auxiliary_loss_clip": 0.06408437, + "auxiliary_loss_mlp": 0.01266772, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01256055, + "epoch": 0.7814219149255974, + "flos": 17971557454080.0, + "grad_norm": 2.5240024848484284, + "language_loss": 0.68320543, + "learning_rate": 4.804724570252167e-07, + "loss": 0.75995755, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10723877, + "step": 12997, + "time_per_iteration": 2.4495344161987305 + }, + { + "auxiliary_loss_clip": 0.06410494, + "auxiliary_loss_mlp": 0.01266795, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01256018, + "epoch": 0.7814820381782654, + "flos": 25783368952320.0, + "grad_norm": 1.6126365862237693, + "language_loss": 0.82193416, + "learning_rate": 4.802192581598614e-07, + "loss": 0.89870703, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10778809, + "step": 12998, + "time_per_iteration": 2.535696506500244 + }, + { + "auxiliary_loss_clip": 0.06407057, + "auxiliary_loss_mlp": 0.01266001, + "balance_loss_clip": 0.06273869, + "balance_loss_mlp": 0.01256166, + "epoch": 0.7815421614309335, + "flos": 20525442048000.0, + "grad_norm": 1.8946982526297624, + "language_loss": 0.7477777, + "learning_rate": 4.799661169247453e-07, + "loss": 0.82450831, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09844971, + "step": 12999, + "time_per_iteration": 2.4902775287628174 + }, + { + "auxiliary_loss_clip": 0.06407912, + "auxiliary_loss_mlp": 0.01262829, + "balance_loss_clip": 0.06271957, + "balance_loss_mlp": 0.01252517, + "epoch": 0.7816022846836014, + "flos": 21293980496640.0, + "grad_norm": 1.4384947504961985, + "language_loss": 0.84615433, + "learning_rate": 4.797130333294652e-07, + "loss": 0.92286175, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10314941, + "step": 13000, + "time_per_iteration": 2.512596607208252 + }, + { + "auxiliary_loss_clip": 0.0640571, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06273641, + "balance_loss_mlp": 0.01254126, + "epoch": 0.7816624079362694, + "flos": 19214440266240.0, + "grad_norm": 1.8073266601471953, + "language_loss": 0.66751462, + "learning_rate": 4.794600073836192e-07, + "loss": 0.74421835, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10540771, + "step": 13001, + "time_per_iteration": 2.4772894382476807 + }, + { + "auxiliary_loss_clip": 0.06405921, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06271157, + "balance_loss_mlp": 0.01253526, + "epoch": 0.7817225311889373, + "flos": 26111957938560.0, + "grad_norm": 1.5273491192329303, + "language_loss": 0.66959155, + "learning_rate": 4.792070390968027e-07, + "loss": 0.74628222, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09625244, + "step": 13002, + "time_per_iteration": 2.5820791721343994 + }, + { + "auxiliary_loss_clip": 0.06409384, + "auxiliary_loss_mlp": 0.01266696, + "balance_loss_clip": 0.06275305, + "balance_loss_mlp": 0.01256176, + "epoch": 0.7817826544416053, + "flos": 21257195754240.0, + "grad_norm": 2.018800094451087, + "language_loss": 0.73878789, + "learning_rate": 4.78954128478607e-07, + "loss": 0.81554866, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10534668, + "step": 13003, + "time_per_iteration": 2.481661319732666 + }, + { + "auxiliary_loss_clip": 0.06404527, + "auxiliary_loss_mlp": 0.01265727, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.01256208, + "epoch": 0.7818427776942732, + "flos": 19937347367040.0, + "grad_norm": 1.9756660000355053, + "language_loss": 0.62827951, + "learning_rate": 4.787012755386233e-07, + "loss": 0.70498204, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09515381, + "step": 13004, + "time_per_iteration": 2.497821569442749 + }, + { + "auxiliary_loss_clip": 0.0639583, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 0.06268562, + "balance_loss_mlp": 0.01253669, + "epoch": 0.7819029009469413, + "flos": 11368443502080.0, + "grad_norm": 1.7802974888908354, + "language_loss": 0.83142269, + "learning_rate": 4.784484802864403e-07, + "loss": 0.90800571, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08807373, + "step": 13005, + "time_per_iteration": 2.455112934112549 + }, + { + "auxiliary_loss_clip": 0.06402773, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06270364, + "balance_loss_mlp": 0.01255172, + "epoch": 0.7819630241996092, + "flos": 24286053617280.0, + "grad_norm": 1.9304449854635368, + "language_loss": 0.73000956, + "learning_rate": 4.781957427316432e-07, + "loss": 0.80668867, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09973145, + "step": 13006, + "time_per_iteration": 3.923842191696167 + }, + { + "auxiliary_loss_clip": 0.06406109, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06271446, + "balance_loss_mlp": 0.01252891, + "epoch": 0.7820231474522772, + "flos": 22715168797440.0, + "grad_norm": 1.5911839097464888, + "language_loss": 0.72339863, + "learning_rate": 4.779430628838157e-07, + "loss": 0.80009413, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10540771, + "step": 13007, + "time_per_iteration": 2.5166056156158447 + }, + { + "auxiliary_loss_clip": 0.06406694, + "auxiliary_loss_mlp": 0.01267894, + "balance_loss_clip": 0.06271846, + "balance_loss_mlp": 0.0125782, + "epoch": 0.7820832707049451, + "flos": 20053571379840.0, + "grad_norm": 2.020015501308364, + "language_loss": 0.69036144, + "learning_rate": 4.776904407525397e-07, + "loss": 0.76710731, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10070801, + "step": 13008, + "time_per_iteration": 2.495736837387085 + }, + { + "auxiliary_loss_clip": 0.064032, + "auxiliary_loss_mlp": 0.012644, + "balance_loss_clip": 0.06269944, + "balance_loss_mlp": 0.01253457, + "epoch": 0.7821433939576131, + "flos": 27170246206080.0, + "grad_norm": 1.7298477969217696, + "language_loss": 0.69919395, + "learning_rate": 4.774378763473954e-07, + "loss": 0.77586997, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10949707, + "step": 13009, + "time_per_iteration": 2.5899367332458496 + }, + { + "auxiliary_loss_clip": 0.06399304, + "auxiliary_loss_mlp": 0.01262145, + "balance_loss_clip": 0.06269169, + "balance_loss_mlp": 0.01252781, + "epoch": 0.782203517210281, + "flos": 22608755712000.0, + "grad_norm": 1.790636522261297, + "language_loss": 0.81948966, + "learning_rate": 4.771853696779586e-07, + "loss": 0.89610416, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09362793, + "step": 13010, + "time_per_iteration": 2.5066049098968506 + }, + { + "auxiliary_loss_clip": 0.06400339, + "auxiliary_loss_mlp": 0.01262085, + "balance_loss_clip": 0.06270656, + "balance_loss_mlp": 0.01252692, + "epoch": 0.782263640462949, + "flos": 29067539806080.0, + "grad_norm": 1.385682436411659, + "language_loss": 0.62627685, + "learning_rate": 4.76932920753806e-07, + "loss": 0.70290112, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09399414, + "step": 13011, + "time_per_iteration": 2.6026289463043213 + }, + { + "auxiliary_loss_clip": 0.06399235, + "auxiliary_loss_mlp": 0.0126419, + "balance_loss_clip": 0.0626906, + "balance_loss_mlp": 0.01255306, + "epoch": 0.782323763715617, + "flos": 25306215477120.0, + "grad_norm": 1.6427811316724177, + "language_loss": 0.70159376, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.77822804, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08883667, + "step": 13012, + "time_per_iteration": 2.53303861618042 + }, + { + "auxiliary_loss_clip": 0.0630969, + "auxiliary_loss_mlp": 0.01250424, + "balance_loss_clip": 0.06253915, + "balance_loss_mlp": 0.01249417, + "epoch": 0.782383886968285, + "flos": 65216548195200.0, + "grad_norm": 0.6922289036219499, + "language_loss": 0.55011511, + "learning_rate": 4.764281961796395e-07, + "loss": 0.62571621, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100708, + "step": 13013, + "time_per_iteration": 3.228905439376831 + }, + { + "auxiliary_loss_clip": 0.06409347, + "auxiliary_loss_mlp": 0.01264895, + "balance_loss_clip": 0.06273122, + "balance_loss_mlp": 0.01254708, + "epoch": 0.782444010220953, + "flos": 18411297281280.0, + "grad_norm": 1.7267010887219136, + "language_loss": 0.6554383, + "learning_rate": 4.76175920548765e-07, + "loss": 0.73218066, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10186768, + "step": 13014, + "time_per_iteration": 2.4842281341552734 + }, + { + "auxiliary_loss_clip": 0.06309456, + "auxiliary_loss_mlp": 0.01249284, + "balance_loss_clip": 0.06253707, + "balance_loss_mlp": 0.01248232, + "epoch": 0.7825041334736209, + "flos": 63977145327360.0, + "grad_norm": 0.6946375412557042, + "language_loss": 0.58183634, + "learning_rate": 4.759237027014524e-07, + "loss": 0.65742373, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01052094, + "step": 13015, + "time_per_iteration": 4.588924169540405 + }, + { + "auxiliary_loss_clip": 0.06401119, + "auxiliary_loss_mlp": 0.01267469, + "balance_loss_clip": 0.06269481, + "balance_loss_mlp": 0.01258141, + "epoch": 0.7825642567262889, + "flos": 20345585258880.0, + "grad_norm": 1.703957116588016, + "language_loss": 0.75081736, + "learning_rate": 4.756715426472666e-07, + "loss": 0.8275032, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09326172, + "step": 13016, + "time_per_iteration": 2.5329108238220215 + }, + { + "auxiliary_loss_clip": 0.06404392, + "auxiliary_loss_mlp": 0.01262942, + "balance_loss_clip": 0.0627065, + "balance_loss_mlp": 0.01252303, + "epoch": 0.7826243799789568, + "flos": 20268577756800.0, + "grad_norm": 1.8073604316882006, + "language_loss": 0.75204456, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.82871789, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10644531, + "step": 13017, + "time_per_iteration": 2.475156307220459 + }, + { + "auxiliary_loss_clip": 0.06402843, + "auxiliary_loss_mlp": 0.01267244, + "balance_loss_clip": 0.06268843, + "balance_loss_mlp": 0.01256974, + "epoch": 0.7826845032316249, + "flos": 21137743359360.0, + "grad_norm": 2.040801926545799, + "language_loss": 0.76392686, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.84062773, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10272217, + "step": 13018, + "time_per_iteration": 2.487426280975342 + }, + { + "auxiliary_loss_clip": 0.06399854, + "auxiliary_loss_mlp": 0.01266755, + "balance_loss_clip": 0.06267899, + "balance_loss_mlp": 0.01256652, + "epoch": 0.7827446264842928, + "flos": 22498862682240.0, + "grad_norm": 1.372243474464688, + "language_loss": 0.77303207, + "learning_rate": 4.749154093390708e-07, + "loss": 0.84969819, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10101318, + "step": 13019, + "time_per_iteration": 3.9929661750793457 + }, + { + "auxiliary_loss_clip": 0.06402994, + "auxiliary_loss_mlp": 0.01262289, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01252716, + "epoch": 0.7828047497369608, + "flos": 28848298798080.0, + "grad_norm": 1.5302046245116039, + "language_loss": 0.6745941, + "learning_rate": 4.746634805529852e-07, + "loss": 0.75124693, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09570312, + "step": 13020, + "time_per_iteration": 2.564709424972534 + }, + { + "auxiliary_loss_clip": 0.06400368, + "auxiliary_loss_mlp": 0.012665, + "balance_loss_clip": 0.0626877, + "balance_loss_mlp": 0.01256397, + "epoch": 0.7828648729896287, + "flos": 23264298529920.0, + "grad_norm": 2.6855687872649825, + "language_loss": 0.62745917, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.70412791, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10101318, + "step": 13021, + "time_per_iteration": 2.4964163303375244 + }, + { + "auxiliary_loss_clip": 0.06403099, + "auxiliary_loss_mlp": 0.01264616, + "balance_loss_clip": 0.06270363, + "balance_loss_mlp": 0.01255592, + "epoch": 0.7829249962422967, + "flos": 25272826824960.0, + "grad_norm": 1.5874593754725228, + "language_loss": 0.69790453, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.77458167, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.090271, + "step": 13022, + "time_per_iteration": 2.5415072441101074 + }, + { + "auxiliary_loss_clip": 0.06309162, + "auxiliary_loss_mlp": 0.01253506, + "balance_loss_clip": 0.06253611, + "balance_loss_mlp": 0.01252549, + "epoch": 0.7829851194949646, + "flos": 70742087441280.0, + "grad_norm": 0.6386935126948231, + "language_loss": 0.56138313, + "learning_rate": 4.739080412784131e-07, + "loss": 0.6370098, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.009552, + "step": 13023, + "time_per_iteration": 4.637472867965698 + }, + { + "auxiliary_loss_clip": 0.06393711, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.06267409, + "balance_loss_mlp": 0.01256451, + "epoch": 0.7830452427476327, + "flos": 25666977231360.0, + "grad_norm": 1.576482021290812, + "language_loss": 0.67401826, + "learning_rate": 4.736563439132792e-07, + "loss": 0.75061107, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09118652, + "step": 13024, + "time_per_iteration": 2.538425922393799 + }, + { + "auxiliary_loss_clip": 0.06403638, + "auxiliary_loss_mlp": 0.01263953, + "balance_loss_clip": 0.06269067, + "balance_loss_mlp": 0.0125357, + "epoch": 0.7831053660003006, + "flos": 22791002342400.0, + "grad_norm": 1.5665497407988729, + "language_loss": 0.77940929, + "learning_rate": 4.734047044272498e-07, + "loss": 0.85608524, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10369873, + "step": 13025, + "time_per_iteration": 2.5431177616119385 + }, + { + "auxiliary_loss_clip": 0.0640173, + "auxiliary_loss_mlp": 0.01265493, + "balance_loss_clip": 0.06270472, + "balance_loss_mlp": 0.01256302, + "epoch": 0.7831654892529686, + "flos": 25819399008000.0, + "grad_norm": 1.644612426825064, + "language_loss": 0.7874493, + "learning_rate": 4.731531228298673e-07, + "loss": 0.86412156, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09197998, + "step": 13026, + "time_per_iteration": 2.556727647781372 + }, + { + "auxiliary_loss_clip": 0.06404313, + "auxiliary_loss_mlp": 0.01262471, + "balance_loss_clip": 0.06272115, + "balance_loss_mlp": 0.01253006, + "epoch": 0.7832256125056366, + "flos": 20776897751040.0, + "grad_norm": 2.5804756283092334, + "language_loss": 0.75804269, + "learning_rate": 4.729015991306715e-07, + "loss": 0.83471048, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09466553, + "step": 13027, + "time_per_iteration": 2.4878506660461426 + }, + { + "auxiliary_loss_clip": 0.0639909, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06269808, + "balance_loss_mlp": 0.01255978, + "epoch": 0.7832857357583045, + "flos": 21512886088320.0, + "grad_norm": 1.7061440421315746, + "language_loss": 0.70765603, + "learning_rate": 4.726501333391997e-07, + "loss": 0.78430474, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09802246, + "step": 13028, + "time_per_iteration": 2.498478651046753 + }, + { + "auxiliary_loss_clip": 0.06406339, + "auxiliary_loss_mlp": 0.01268084, + "balance_loss_clip": 0.06271327, + "balance_loss_mlp": 0.01257874, + "epoch": 0.7833458590109725, + "flos": 18083714544000.0, + "grad_norm": 1.9644194417750374, + "language_loss": 0.68658125, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.76332551, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10217285, + "step": 13029, + "time_per_iteration": 2.580122470855713 + }, + { + "auxiliary_loss_clip": 0.06403092, + "auxiliary_loss_mlp": 0.01267866, + "balance_loss_clip": 0.0626725, + "balance_loss_mlp": 0.01258001, + "epoch": 0.7834059822636404, + "flos": 28295521413120.0, + "grad_norm": 1.7391755665392523, + "language_loss": 0.81014347, + "learning_rate": 4.721473755175698e-07, + "loss": 0.88685304, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09869385, + "step": 13030, + "time_per_iteration": 2.5314316749572754 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.01261968, + "balance_loss_clip": 0.06269055, + "balance_loss_mlp": 0.01251949, + "epoch": 0.7834661055163085, + "flos": 31694281125120.0, + "grad_norm": 1.5048813517509494, + "language_loss": 0.70804811, + "learning_rate": 4.71896083506476e-07, + "loss": 0.78471744, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10021973, + "step": 13031, + "time_per_iteration": 2.5823378562927246 + }, + { + "auxiliary_loss_clip": 0.06405063, + "auxiliary_loss_mlp": 0.01266526, + "balance_loss_clip": 0.06270566, + "balance_loss_mlp": 0.01257079, + "epoch": 0.7835262287689764, + "flos": 12938238218880.0, + "grad_norm": 2.7115393333323468, + "language_loss": 0.78693461, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.86365044, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09442139, + "step": 13032, + "time_per_iteration": 2.4609038829803467 + }, + { + "auxiliary_loss_clip": 0.06404404, + "auxiliary_loss_mlp": 0.01268456, + "balance_loss_clip": 0.06269069, + "balance_loss_mlp": 0.01258317, + "epoch": 0.7835863520216444, + "flos": 16148671879680.0, + "grad_norm": 1.9002530639505248, + "language_loss": 0.63003838, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.70676696, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10137939, + "step": 13033, + "time_per_iteration": 2.500108242034912 + }, + { + "auxiliary_loss_clip": 0.06404372, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.01253492, + "epoch": 0.7836464752743123, + "flos": 11514660076800.0, + "grad_norm": 1.5173952682400234, + "language_loss": 0.72150695, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.79818583, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10021973, + "step": 13034, + "time_per_iteration": 2.4920992851257324 + }, + { + "auxiliary_loss_clip": 0.06405693, + "auxiliary_loss_mlp": 0.01268729, + "balance_loss_clip": 0.06272385, + "balance_loss_mlp": 0.01258685, + "epoch": 0.7837065985269803, + "flos": 18229637629440.0, + "grad_norm": 1.7491156010672833, + "language_loss": 0.7212472, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.79799139, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1005249, + "step": 13035, + "time_per_iteration": 2.482640027999878 + }, + { + "auxiliary_loss_clip": 0.06404319, + "auxiliary_loss_mlp": 0.01270811, + "balance_loss_clip": 0.06270225, + "balance_loss_mlp": 0.01260541, + "epoch": 0.7837667217796482, + "flos": 24761404229760.0, + "grad_norm": 2.0189753157396373, + "language_loss": 0.66216964, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.73892099, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10266113, + "step": 13036, + "time_per_iteration": 2.5221505165100098 + }, + { + "auxiliary_loss_clip": 0.06407806, + "auxiliary_loss_mlp": 0.01272324, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01260981, + "epoch": 0.7838268450323163, + "flos": 22389766266240.0, + "grad_norm": 2.337708376501524, + "language_loss": 0.73523962, + "learning_rate": 4.703895486362031e-07, + "loss": 0.81204098, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11340332, + "step": 13037, + "time_per_iteration": 2.5027549266815186 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01265411, + "balance_loss_clip": 0.06268933, + "balance_loss_mlp": 0.01255099, + "epoch": 0.7838869682849842, + "flos": 19506370291200.0, + "grad_norm": 2.111880919052157, + "language_loss": 0.60144168, + "learning_rate": 4.701386624460717e-07, + "loss": 0.67811918, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10321045, + "step": 13038, + "time_per_iteration": 2.4813334941864014 + }, + { + "auxiliary_loss_clip": 0.06401114, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 0.06270541, + "balance_loss_mlp": 0.01255484, + "epoch": 0.7839470915376522, + "flos": 32901553152000.0, + "grad_norm": 1.5605584713979823, + "language_loss": 0.68332416, + "learning_rate": 4.698878342684349e-07, + "loss": 0.75998366, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09350586, + "step": 13039, + "time_per_iteration": 2.616943359375 + }, + { + "auxiliary_loss_clip": 0.06395827, + "auxiliary_loss_mlp": 0.01261469, + "balance_loss_clip": 0.06267862, + "balance_loss_mlp": 0.01253244, + "epoch": 0.7840072147903202, + "flos": 29683153353600.0, + "grad_norm": 1.67583580210183, + "language_loss": 0.69978261, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.77635556, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08227539, + "step": 13040, + "time_per_iteration": 2.575289726257324 + }, + { + "auxiliary_loss_clip": 0.06404934, + "auxiliary_loss_mlp": 0.01266779, + "balance_loss_clip": 0.06269483, + "balance_loss_mlp": 0.01256503, + "epoch": 0.7840673380429881, + "flos": 18192601324800.0, + "grad_norm": 1.9496315301470044, + "language_loss": 0.67735672, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.75407386, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10272217, + "step": 13041, + "time_per_iteration": 2.5014941692352295 + }, + { + "auxiliary_loss_clip": 0.06304124, + "auxiliary_loss_mlp": 0.01252304, + "balance_loss_clip": 0.06248714, + "balance_loss_mlp": 0.01251298, + "epoch": 0.7841274612956561, + "flos": 66365694616320.0, + "grad_norm": 0.8059954256946308, + "language_loss": 0.57385874, + "learning_rate": 4.691356979055998e-07, + "loss": 0.649423, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01005554, + "step": 13042, + "time_per_iteration": 3.0931692123413086 + }, + { + "auxiliary_loss_clip": 0.06405251, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.0125564, + "epoch": 0.784187584548324, + "flos": 26655259812480.0, + "grad_norm": 2.4178981590312105, + "language_loss": 0.84631729, + "learning_rate": 4.688851018730369e-07, + "loss": 0.92301869, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09246826, + "step": 13043, + "time_per_iteration": 2.5591118335723877 + }, + { + "auxiliary_loss_clip": 0.0639644, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06267819, + "balance_loss_mlp": 0.01255796, + "epoch": 0.7842477078009921, + "flos": 25747422750720.0, + "grad_norm": 1.364522654088724, + "language_loss": 0.88473415, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.96134579, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08917236, + "step": 13044, + "time_per_iteration": 2.5349628925323486 + }, + { + "auxiliary_loss_clip": 0.06410815, + "auxiliary_loss_mlp": 0.01269176, + "balance_loss_clip": 0.06271672, + "balance_loss_mlp": 0.01259132, + "epoch": 0.78430783105366, + "flos": 21987398160000.0, + "grad_norm": 1.6046981571270753, + "language_loss": 0.79284698, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.86964685, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.10040283, + "step": 13045, + "time_per_iteration": 3.9486923217773438 + }, + { + "auxiliary_loss_clip": 0.06400262, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626996, + "balance_loss_mlp": 0.01252862, + "epoch": 0.784367954306328, + "flos": 23849122901760.0, + "grad_norm": 1.3651332690132787, + "language_loss": 0.72812819, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.80475229, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09277344, + "step": 13046, + "time_per_iteration": 2.5449562072753906 + }, + { + "auxiliary_loss_clip": 0.06399076, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06269773, + "balance_loss_mlp": 0.01253036, + "epoch": 0.7844280775589959, + "flos": 24833548195200.0, + "grad_norm": 1.4113250051922885, + "language_loss": 0.63375705, + "learning_rate": 4.678832984380809e-07, + "loss": 0.71037436, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09619141, + "step": 13047, + "time_per_iteration": 2.555187940597534 + }, + { + "auxiliary_loss_clip": 0.06397624, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01253892, + "epoch": 0.7844882008116639, + "flos": 22462245648000.0, + "grad_norm": 1.5637844175125322, + "language_loss": 0.73288012, + "learning_rate": 4.676329928006515e-07, + "loss": 0.8094908, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09552002, + "step": 13048, + "time_per_iteration": 2.500697374343872 + }, + { + "auxiliary_loss_clip": 0.06406703, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06269943, + "balance_loss_mlp": 0.01254586, + "epoch": 0.7845483240643318, + "flos": 26111203251840.0, + "grad_norm": 1.7122203145326895, + "language_loss": 0.74653435, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.8232491, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10198975, + "step": 13049, + "time_per_iteration": 2.525059700012207 + }, + { + "auxiliary_loss_clip": 0.06406355, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06269609, + "balance_loss_mlp": 0.0125279, + "epoch": 0.7846084473169999, + "flos": 19360363351680.0, + "grad_norm": 1.8695615724941215, + "language_loss": 0.72989309, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.80659556, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.11114502, + "step": 13050, + "time_per_iteration": 2.502976655960083 + }, + { + "auxiliary_loss_clip": 0.0640547, + "auxiliary_loss_mlp": 0.01264968, + "balance_loss_clip": 0.06273313, + "balance_loss_mlp": 0.01255658, + "epoch": 0.7846685705696678, + "flos": 23331620885760.0, + "grad_norm": 1.8649850140502078, + "language_loss": 0.73895067, + "learning_rate": 4.668824245713825e-07, + "loss": 0.81565511, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09320068, + "step": 13051, + "time_per_iteration": 2.5090999603271484 + }, + { + "auxiliary_loss_clip": 0.06407961, + "auxiliary_loss_mlp": 0.01270446, + "balance_loss_clip": 0.06272332, + "balance_loss_mlp": 0.01259622, + "epoch": 0.7847286938223358, + "flos": 35818379706240.0, + "grad_norm": 2.0718578838618527, + "language_loss": 0.73053241, + "learning_rate": 4.666323514209227e-07, + "loss": 0.80731648, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10827637, + "step": 13052, + "time_per_iteration": 2.6086881160736084 + }, + { + "auxiliary_loss_clip": 0.06395121, + "auxiliary_loss_mlp": 0.01262593, + "balance_loss_clip": 0.06268048, + "balance_loss_mlp": 0.01253241, + "epoch": 0.7847888170750038, + "flos": 18483986298240.0, + "grad_norm": 1.9107364869927201, + "language_loss": 0.69673455, + "learning_rate": 4.663823364159183e-07, + "loss": 0.77331167, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09344482, + "step": 13053, + "time_per_iteration": 2.471815586090088 + }, + { + "auxiliary_loss_clip": 0.06401109, + "auxiliary_loss_mlp": 0.01260742, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01251807, + "epoch": 0.7848489403276717, + "flos": 25126190979840.0, + "grad_norm": 1.8867575378742971, + "language_loss": 0.70537353, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.78199208, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08929443, + "step": 13054, + "time_per_iteration": 2.5749151706695557 + }, + { + "auxiliary_loss_clip": 0.06405072, + "auxiliary_loss_mlp": 0.01264324, + "balance_loss_clip": 0.06269364, + "balance_loss_mlp": 0.01253971, + "epoch": 0.7849090635803397, + "flos": 26509169018880.0, + "grad_norm": 1.610774832305801, + "language_loss": 0.76244235, + "learning_rate": 4.658824808801938e-07, + "loss": 0.8391363, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10357666, + "step": 13055, + "time_per_iteration": 3.9623241424560547 + }, + { + "auxiliary_loss_clip": 0.06407758, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01253922, + "epoch": 0.7849691868330076, + "flos": 20965978488960.0, + "grad_norm": 1.9205969834144307, + "language_loss": 0.75488204, + "learning_rate": 4.656326403684283e-07, + "loss": 0.83159614, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09729004, + "step": 13056, + "time_per_iteration": 2.4767720699310303 + }, + { + "auxiliary_loss_clip": 0.06400058, + "auxiliary_loss_mlp": 0.01266253, + "balance_loss_clip": 0.06269453, + "balance_loss_mlp": 0.01256841, + "epoch": 0.7850293100856757, + "flos": 26074628144640.0, + "grad_norm": 1.52924099348992, + "language_loss": 0.70278704, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.77945018, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09423828, + "step": 13057, + "time_per_iteration": 2.5652661323547363 + }, + { + "auxiliary_loss_clip": 0.06407446, + "auxiliary_loss_mlp": 0.01266111, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.01256789, + "epoch": 0.7850894333383436, + "flos": 22498443411840.0, + "grad_norm": 2.33768341300027, + "language_loss": 0.76614606, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.84288156, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09326172, + "step": 13058, + "time_per_iteration": 2.479261875152588 + }, + { + "auxiliary_loss_clip": 0.06401752, + "auxiliary_loss_mlp": 0.0126406, + "balance_loss_clip": 0.06268829, + "balance_loss_mlp": 0.01254244, + "epoch": 0.7851495565910116, + "flos": 20564952048000.0, + "grad_norm": 1.4951701283618941, + "language_loss": 0.71132874, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.78798681, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0982666, + "step": 13059, + "time_per_iteration": 3.9393692016601562 + }, + { + "auxiliary_loss_clip": 0.06412531, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.06272064, + "balance_loss_mlp": 0.01254842, + "epoch": 0.7852096798436795, + "flos": 15930353266560.0, + "grad_norm": 1.897902046144861, + "language_loss": 0.77542412, + "learning_rate": 4.646338602497144e-07, + "loss": 0.85220468, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10687256, + "step": 13060, + "time_per_iteration": 2.4718637466430664 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06269743, + "balance_loss_mlp": 0.01254085, + "epoch": 0.7852698030963475, + "flos": 19068265618560.0, + "grad_norm": 1.8441572725485498, + "language_loss": 0.76857173, + "learning_rate": 4.643843107494654e-07, + "loss": 0.84523541, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.1003418, + "step": 13061, + "time_per_iteration": 2.4667510986328125 + }, + { + "auxiliary_loss_clip": 0.06403807, + "auxiliary_loss_mlp": 0.01266965, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01257738, + "epoch": 0.7853299263490154, + "flos": 24651259637760.0, + "grad_norm": 1.784620382168378, + "language_loss": 0.74518055, + "learning_rate": 4.641348194799164e-07, + "loss": 0.82188833, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09234619, + "step": 13062, + "time_per_iteration": 2.5519487857818604 + }, + { + "auxiliary_loss_clip": 0.06401968, + "auxiliary_loss_mlp": 0.01263435, + "balance_loss_clip": 0.06270862, + "balance_loss_mlp": 0.01254501, + "epoch": 0.7853900496016835, + "flos": 22024518318720.0, + "grad_norm": 1.444565661483555, + "language_loss": 0.6925329, + "learning_rate": 4.638853864505297e-07, + "loss": 0.76918697, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08935547, + "step": 13063, + "time_per_iteration": 3.896639585494995 + }, + { + "auxiliary_loss_clip": 0.064018, + "auxiliary_loss_mlp": 0.01262061, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.01252858, + "epoch": 0.7854501728543514, + "flos": 30235343760000.0, + "grad_norm": 1.975335557654558, + "language_loss": 0.72825849, + "learning_rate": 4.636360116707625e-07, + "loss": 0.80489707, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09210205, + "step": 13064, + "time_per_iteration": 2.567704200744629 + }, + { + "auxiliary_loss_clip": 0.06403325, + "auxiliary_loss_mlp": 0.01265412, + "balance_loss_clip": 0.0626822, + "balance_loss_mlp": 0.01255583, + "epoch": 0.7855102961070194, + "flos": 18849695443200.0, + "grad_norm": 1.5878092382689184, + "language_loss": 0.67936897, + "learning_rate": 4.633866951500718e-07, + "loss": 0.75605631, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09838867, + "step": 13065, + "time_per_iteration": 2.470630168914795 + }, + { + "auxiliary_loss_clip": 0.06404464, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.06273209, + "balance_loss_mlp": 0.01257184, + "epoch": 0.7855704193596874, + "flos": 22316574124800.0, + "grad_norm": 3.292833578537852, + "language_loss": 0.75992739, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.83663952, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09576416, + "step": 13066, + "time_per_iteration": 2.5433592796325684 + }, + { + "auxiliary_loss_clip": 0.06310245, + "auxiliary_loss_mlp": 0.01255234, + "balance_loss_clip": 0.06254524, + "balance_loss_mlp": 0.01254291, + "epoch": 0.7856305426123553, + "flos": 60024224638080.0, + "grad_norm": 0.6974485320329921, + "language_loss": 0.53405064, + "learning_rate": 4.628882369237346e-07, + "loss": 0.60970545, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.55664062, + "router_z_loss_mlp": 0.00940704, + "step": 13067, + "time_per_iteration": 3.3080852031707764 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06269915, + "balance_loss_mlp": 0.012542, + "epoch": 0.7856906658650233, + "flos": 21874528310400.0, + "grad_norm": 1.4327852205336962, + "language_loss": 0.68056738, + "learning_rate": 4.62639095236989e-07, + "loss": 0.75725186, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.0993042, + "step": 13068, + "time_per_iteration": 2.5869228839874268 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7857507891176913, + "flos": 23629672258560.0, + "grad_norm": 1.764601675005712, + "language_loss": 0.68482268, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.76145768, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09112549, + "step": 13069, + "time_per_iteration": 2.5437350273132324 + }, + { + "auxiliary_loss_clip": 0.06404187, + "auxiliary_loss_mlp": 0.01263836, + "balance_loss_clip": 0.06271039, + "balance_loss_mlp": 0.01253984, + "epoch": 0.7858109123703593, + "flos": 25527091639680.0, + "grad_norm": 1.7842031457039946, + "language_loss": 0.76992953, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.84660977, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09857178, + "step": 13070, + "time_per_iteration": 2.5414490699768066 + }, + { + "auxiliary_loss_clip": 0.06396306, + "auxiliary_loss_mlp": 0.01264006, + "balance_loss_clip": 0.06267333, + "balance_loss_mlp": 0.01255298, + "epoch": 0.7858710356230272, + "flos": 17463195532800.0, + "grad_norm": 1.5496724726178355, + "language_loss": 0.6583572, + "learning_rate": 4.618920199958083e-07, + "loss": 0.73496032, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08703613, + "step": 13071, + "time_per_iteration": 2.469886541366577 + }, + { + "auxiliary_loss_clip": 0.06407128, + "auxiliary_loss_mlp": 0.01264805, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01254946, + "epoch": 0.7859311588756952, + "flos": 24686367298560.0, + "grad_norm": 1.6110892083187893, + "language_loss": 0.73717749, + "learning_rate": 4.616431115532442e-07, + "loss": 0.81389678, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09857178, + "step": 13072, + "time_per_iteration": 2.519676923751831 + }, + { + "auxiliary_loss_clip": 0.06403338, + "auxiliary_loss_mlp": 0.01268392, + "balance_loss_clip": 0.06269255, + "balance_loss_mlp": 0.01257288, + "epoch": 0.7859912821283631, + "flos": 21805654654080.0, + "grad_norm": 1.8631403345440603, + "language_loss": 0.71523631, + "learning_rate": 4.613942614453268e-07, + "loss": 0.79195362, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.11108398, + "step": 13073, + "time_per_iteration": 2.5105767250061035 + }, + { + "auxiliary_loss_clip": 0.06404594, + "auxiliary_loss_mlp": 0.01265595, + "balance_loss_clip": 0.06270787, + "balance_loss_mlp": 0.01255295, + "epoch": 0.7860514053810311, + "flos": 20853108639360.0, + "grad_norm": 1.5490527180797131, + "language_loss": 0.76964885, + "learning_rate": 4.611454696814938e-07, + "loss": 0.84635073, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10302734, + "step": 13074, + "time_per_iteration": 2.4855496883392334 + }, + { + "auxiliary_loss_clip": 0.06398475, + "auxiliary_loss_mlp": 0.01266136, + "balance_loss_clip": 0.06269623, + "balance_loss_mlp": 0.01256504, + "epoch": 0.786111528633699, + "flos": 24322461016320.0, + "grad_norm": 1.8530422938464213, + "language_loss": 0.75361305, + "learning_rate": 4.608967362711782e-07, + "loss": 0.8302592, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09637451, + "step": 13075, + "time_per_iteration": 2.5396533012390137 + }, + { + "auxiliary_loss_clip": 0.06403027, + "auxiliary_loss_mlp": 0.01261838, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01252677, + "epoch": 0.7861716518863671, + "flos": 24360126226560.0, + "grad_norm": 1.639337001432503, + "language_loss": 0.68816268, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.7648114, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09161377, + "step": 13076, + "time_per_iteration": 2.507643461227417 + }, + { + "auxiliary_loss_clip": 0.06400099, + "auxiliary_loss_mlp": 0.01267556, + "balance_loss_clip": 0.06270486, + "balance_loss_mlp": 0.01258461, + "epoch": 0.786231775139035, + "flos": 14026728683520.0, + "grad_norm": 2.3148125900767065, + "language_loss": 0.79768962, + "learning_rate": 4.603994445488282e-07, + "loss": 0.87436622, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09088135, + "step": 13077, + "time_per_iteration": 2.470398426055908 + }, + { + "auxiliary_loss_clip": 0.06401075, + "auxiliary_loss_mlp": 0.0126456, + "balance_loss_clip": 0.06269512, + "balance_loss_mlp": 0.01255, + "epoch": 0.786291898391703, + "flos": 33731795733120.0, + "grad_norm": 1.615733156524089, + "language_loss": 0.70986831, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.78652471, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09552002, + "step": 13078, + "time_per_iteration": 2.6685726642608643 + }, + { + "auxiliary_loss_clip": 0.06401184, + "auxiliary_loss_mlp": 0.01265393, + "balance_loss_clip": 0.06270616, + "balance_loss_mlp": 0.01255875, + "epoch": 0.786352021644371, + "flos": 25818476613120.0, + "grad_norm": 1.4651879237887804, + "language_loss": 0.81708902, + "learning_rate": 4.599023863537039e-07, + "loss": 0.89375478, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09509277, + "step": 13079, + "time_per_iteration": 2.5660455226898193 + }, + { + "auxiliary_loss_clip": 0.0639349, + "auxiliary_loss_mlp": 0.01269341, + "balance_loss_clip": 0.0626843, + "balance_loss_mlp": 0.01260209, + "epoch": 0.7864121448970389, + "flos": 28918010995200.0, + "grad_norm": 1.4929435922037373, + "language_loss": 0.68745899, + "learning_rate": 4.596539448524146e-07, + "loss": 0.76408732, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09124756, + "step": 13080, + "time_per_iteration": 2.5500268936157227 + }, + { + "auxiliary_loss_clip": 0.06401475, + "auxiliary_loss_mlp": 0.0126541, + "balance_loss_clip": 0.06269769, + "balance_loss_mlp": 0.012552, + "epoch": 0.7864722681497069, + "flos": 19214943390720.0, + "grad_norm": 1.6425983942021263, + "language_loss": 0.70132333, + "learning_rate": 4.594055617612016e-07, + "loss": 0.77799213, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10211182, + "step": 13081, + "time_per_iteration": 2.508885622024536 + }, + { + "auxiliary_loss_clip": 0.06405645, + "auxiliary_loss_mlp": 0.01264379, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01255021, + "epoch": 0.7865323914023749, + "flos": 21878008254720.0, + "grad_norm": 2.0927961593492737, + "language_loss": 0.68778342, + "learning_rate": 4.591572370894838e-07, + "loss": 0.76448363, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09362793, + "step": 13082, + "time_per_iteration": 2.5268876552581787 + }, + { + "auxiliary_loss_clip": 0.0639787, + "auxiliary_loss_mlp": 0.01264108, + "balance_loss_clip": 0.0626892, + "balance_loss_mlp": 0.01254584, + "epoch": 0.7865925146550429, + "flos": 25527385128960.0, + "grad_norm": 1.5194289662582627, + "language_loss": 0.66099608, + "learning_rate": 4.589089708466789e-07, + "loss": 0.73761588, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09527588, + "step": 13083, + "time_per_iteration": 2.5328421592712402 + }, + { + "auxiliary_loss_clip": 0.06405569, + "auxiliary_loss_mlp": 0.01266332, + "balance_loss_clip": 0.0627001, + "balance_loss_mlp": 0.01255424, + "epoch": 0.7866526379077108, + "flos": 19103121717120.0, + "grad_norm": 2.2309831052205387, + "language_loss": 0.74742764, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.82414663, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10906982, + "step": 13084, + "time_per_iteration": 3.8599534034729004 + }, + { + "auxiliary_loss_clip": 0.06398539, + "auxiliary_loss_mlp": 0.01265, + "balance_loss_clip": 0.0626938, + "balance_loss_mlp": 0.01255678, + "epoch": 0.7867127611603788, + "flos": 16178245171200.0, + "grad_norm": 1.7096991986275847, + "language_loss": 0.7048676, + "learning_rate": 4.584126136854591e-07, + "loss": 0.7815029, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09313965, + "step": 13085, + "time_per_iteration": 2.4548091888427734 + }, + { + "auxiliary_loss_clip": 0.06404947, + "auxiliary_loss_mlp": 0.01266508, + "balance_loss_clip": 0.0626765, + "balance_loss_mlp": 0.01256238, + "epoch": 0.7867728844130467, + "flos": 20779329519360.0, + "grad_norm": 1.9009229295966659, + "language_loss": 0.72873515, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.80544972, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10266113, + "step": 13086, + "time_per_iteration": 2.4679646492004395 + }, + { + "auxiliary_loss_clip": 0.06401749, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.01256132, + "epoch": 0.7868330076657147, + "flos": 21766186581120.0, + "grad_norm": 1.6915622771395795, + "language_loss": 0.75259304, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.82926041, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.08862305, + "step": 13087, + "time_per_iteration": 2.4868595600128174 + }, + { + "auxiliary_loss_clip": 0.06401436, + "auxiliary_loss_mlp": 0.01266533, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.01257431, + "epoch": 0.7868931309183826, + "flos": 25707451553280.0, + "grad_norm": 1.5159741083416707, + "language_loss": 0.71450847, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.79118818, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09088135, + "step": 13088, + "time_per_iteration": 2.5030412673950195 + }, + { + "auxiliary_loss_clip": 0.06311038, + "auxiliary_loss_mlp": 0.01250466, + "balance_loss_clip": 0.06255361, + "balance_loss_mlp": 0.012495, + "epoch": 0.7869532541710507, + "flos": 64666579921920.0, + "grad_norm": 0.663330829427475, + "language_loss": 0.55047309, + "learning_rate": 4.574206009240431e-07, + "loss": 0.62608814, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00964355, + "step": 13089, + "time_per_iteration": 3.1940503120422363 + }, + { + "auxiliary_loss_clip": 0.06311715, + "auxiliary_loss_mlp": 0.01259019, + "balance_loss_clip": 0.0625612, + "balance_loss_mlp": 0.01257986, + "epoch": 0.7870133774237186, + "flos": 67475651725440.0, + "grad_norm": 0.7045101458235505, + "language_loss": 0.49567109, + "learning_rate": 4.571727439470976e-07, + "loss": 0.57137847, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01033783, + "step": 13090, + "time_per_iteration": 3.2323949337005615 + }, + { + "auxiliary_loss_clip": 0.06399588, + "auxiliary_loss_mlp": 0.01264155, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.01255006, + "epoch": 0.7870735006763866, + "flos": 26075592466560.0, + "grad_norm": 1.3918495812457483, + "language_loss": 0.84173477, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.91837221, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0914917, + "step": 13091, + "time_per_iteration": 2.5303354263305664 + }, + { + "auxiliary_loss_clip": 0.06311627, + "auxiliary_loss_mlp": 0.01253337, + "balance_loss_clip": 0.0625616, + "balance_loss_mlp": 0.01252234, + "epoch": 0.7871336239290546, + "flos": 70310439532800.0, + "grad_norm": 0.6984253533928471, + "language_loss": 0.63944566, + "learning_rate": 4.566772055150947e-07, + "loss": 0.71509528, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01104736, + "step": 13092, + "time_per_iteration": 3.186598300933838 + }, + { + "auxiliary_loss_clip": 0.06405234, + "auxiliary_loss_mlp": 0.01264101, + "balance_loss_clip": 0.06272719, + "balance_loss_mlp": 0.01254749, + "epoch": 0.7871937471817225, + "flos": 15784010910720.0, + "grad_norm": 2.677362510314703, + "language_loss": 0.79394525, + "learning_rate": 4.564295240788285e-07, + "loss": 0.87063861, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09350586, + "step": 13093, + "time_per_iteration": 2.4746809005737305 + }, + { + "auxiliary_loss_clip": 0.06399192, + "auxiliary_loss_mlp": 0.01262897, + "balance_loss_clip": 0.06268847, + "balance_loss_mlp": 0.01253747, + "epoch": 0.7872538704343905, + "flos": 20491466417280.0, + "grad_norm": 1.6510022815590566, + "language_loss": 0.75735247, + "learning_rate": 4.561819011749106e-07, + "loss": 0.83397341, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0914917, + "step": 13094, + "time_per_iteration": 4.020095109939575 + }, + { + "auxiliary_loss_clip": 0.06407712, + "auxiliary_loss_mlp": 0.01266386, + "balance_loss_clip": 0.06273055, + "balance_loss_mlp": 0.01256719, + "epoch": 0.7873139936870585, + "flos": 25089699726720.0, + "grad_norm": 1.5509563724400146, + "language_loss": 0.79440391, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.87114489, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09674072, + "step": 13095, + "time_per_iteration": 2.609463930130005 + }, + { + "auxiliary_loss_clip": 0.06408177, + "auxiliary_loss_mlp": 0.01265337, + "balance_loss_clip": 0.06271407, + "balance_loss_mlp": 0.01255425, + "epoch": 0.7873741169397265, + "flos": 30891054286080.0, + "grad_norm": 1.609249488827552, + "language_loss": 0.68118989, + "learning_rate": 4.556868310016715e-07, + "loss": 0.75792503, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09918213, + "step": 13096, + "time_per_iteration": 2.5687479972839355 + }, + { + "auxiliary_loss_clip": 0.0639504, + "auxiliary_loss_mlp": 0.01263751, + "balance_loss_clip": 0.06268235, + "balance_loss_mlp": 0.01255102, + "epoch": 0.7874342401923944, + "flos": 46802666165760.0, + "grad_norm": 1.4338734934522757, + "language_loss": 0.70958376, + "learning_rate": 4.55439383751125e-07, + "loss": 0.78617167, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08648682, + "step": 13097, + "time_per_iteration": 2.739225387573242 + }, + { + "auxiliary_loss_clip": 0.0640981, + "auxiliary_loss_mlp": 0.01270015, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.0125987, + "epoch": 0.7874943634450624, + "flos": 23590958872320.0, + "grad_norm": 4.324515792208533, + "language_loss": 0.8066771, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.8834753, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10144043, + "step": 13098, + "time_per_iteration": 4.011147737503052 + }, + { + "auxiliary_loss_clip": 0.06403133, + "auxiliary_loss_mlp": 0.01264821, + "balance_loss_clip": 0.06272101, + "balance_loss_mlp": 0.0125591, + "epoch": 0.7875544866977303, + "flos": 20196978842880.0, + "grad_norm": 1.6374038368604131, + "language_loss": 0.74357909, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.82025862, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08898926, + "step": 13099, + "time_per_iteration": 2.5371813774108887 + }, + { + "auxiliary_loss_clip": 0.06403521, + "auxiliary_loss_mlp": 0.01264223, + "balance_loss_clip": 0.06272208, + "balance_loss_mlp": 0.01254811, + "epoch": 0.7876146099503983, + "flos": 22609342690560.0, + "grad_norm": 1.4701340709539035, + "language_loss": 0.78340292, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.86008036, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09417725, + "step": 13100, + "time_per_iteration": 2.518275737762451 + }, + { + "auxiliary_loss_clip": 0.06411106, + "auxiliary_loss_mlp": 0.01262468, + "balance_loss_clip": 0.06271806, + "balance_loss_mlp": 0.01251334, + "epoch": 0.7876747332030662, + "flos": 10710217353600.0, + "grad_norm": 2.2988714589951122, + "language_loss": 0.66578412, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.74251986, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.11138916, + "step": 13101, + "time_per_iteration": 2.478010416030884 + }, + { + "auxiliary_loss_clip": 0.06403912, + "auxiliary_loss_mlp": 0.01262729, + "balance_loss_clip": 0.06271445, + "balance_loss_mlp": 0.01253026, + "epoch": 0.7877348564557343, + "flos": 38408462064000.0, + "grad_norm": 1.3711840285849346, + "language_loss": 0.78050315, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.85716951, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09716797, + "step": 13102, + "time_per_iteration": 2.6512677669525146 + }, + { + "auxiliary_loss_clip": 0.06402024, + "auxiliary_loss_mlp": 0.01263165, + "balance_loss_clip": 0.06270896, + "balance_loss_mlp": 0.01253968, + "epoch": 0.7877949797084022, + "flos": 18334876757760.0, + "grad_norm": 3.387524543051336, + "language_loss": 0.82612967, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.90278161, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09197998, + "step": 13103, + "time_per_iteration": 3.8968992233276367 + }, + { + "auxiliary_loss_clip": 0.0640745, + "auxiliary_loss_mlp": 0.01262901, + "balance_loss_clip": 0.0627317, + "balance_loss_mlp": 0.0125349, + "epoch": 0.7878551029610702, + "flos": 25812942243840.0, + "grad_norm": 2.089208992674617, + "language_loss": 0.80857301, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8852765, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09405518, + "step": 13104, + "time_per_iteration": 2.531153917312622 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01264676, + "balance_loss_clip": 0.06272654, + "balance_loss_mlp": 0.0125505, + "epoch": 0.7879152262137382, + "flos": 22348663038720.0, + "grad_norm": 1.6665656648061993, + "language_loss": 0.74192965, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.81862175, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09619141, + "step": 13105, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.06407781, + "auxiliary_loss_mlp": 0.01265901, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01255798, + "epoch": 0.7879753494664061, + "flos": 24791396791680.0, + "grad_norm": 1.540938509232933, + "language_loss": 0.75896162, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.83569837, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10101318, + "step": 13106, + "time_per_iteration": 2.5313045978546143 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06271406, + "balance_loss_mlp": 0.01257592, + "epoch": 0.7880354727190741, + "flos": 16914610851840.0, + "grad_norm": 2.261490692087697, + "language_loss": 0.7317878, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.80850446, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09832764, + "step": 13107, + "time_per_iteration": 2.4657392501831055 + }, + { + "auxiliary_loss_clip": 0.0640149, + "auxiliary_loss_mlp": 0.01266906, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01257083, + "epoch": 0.7880955959717421, + "flos": 22236002824320.0, + "grad_norm": 1.7249934129069375, + "language_loss": 0.73170471, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.80838865, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09820557, + "step": 13108, + "time_per_iteration": 2.522061347961426 + }, + { + "auxiliary_loss_clip": 0.06308442, + "auxiliary_loss_mlp": 0.01251318, + "balance_loss_clip": 0.06252776, + "balance_loss_mlp": 0.01250208, + "epoch": 0.7881557192244101, + "flos": 69201907943040.0, + "grad_norm": 0.865010287169312, + "language_loss": 0.60254252, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.6781401, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.55810547, + "router_z_loss_mlp": 0.01112366, + "step": 13109, + "time_per_iteration": 3.0764577388763428 + }, + { + "auxiliary_loss_clip": 0.06398489, + "auxiliary_loss_mlp": 0.0126099, + "balance_loss_clip": 0.06271066, + "balance_loss_mlp": 0.01252025, + "epoch": 0.788215842477078, + "flos": 24942225340800.0, + "grad_norm": 1.5302071478358445, + "language_loss": 0.72546446, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.80205929, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08959961, + "step": 13110, + "time_per_iteration": 2.5210487842559814 + }, + { + "auxiliary_loss_clip": 0.06396982, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01255453, + "epoch": 0.788275965729746, + "flos": 26114054290560.0, + "grad_norm": 1.2956006250382688, + "language_loss": 0.75373393, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.83034575, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08734131, + "step": 13111, + "time_per_iteration": 2.5650205612182617 + }, + { + "auxiliary_loss_clip": 0.06403745, + "auxiliary_loss_mlp": 0.01269317, + "balance_loss_clip": 0.06272365, + "balance_loss_mlp": 0.01259524, + "epoch": 0.7883360889824139, + "flos": 21221123771520.0, + "grad_norm": 1.7931682275164638, + "language_loss": 0.6193608, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.69609141, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09790039, + "step": 13112, + "time_per_iteration": 2.5178818702697754 + }, + { + "auxiliary_loss_clip": 0.06402722, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01258025, + "epoch": 0.7883962122350819, + "flos": 21148979806080.0, + "grad_norm": 1.7329728491097858, + "language_loss": 0.67358041, + "learning_rate": 4.514881996216644e-07, + "loss": 0.75028789, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10003662, + "step": 13113, + "time_per_iteration": 2.4997618198394775 + }, + { + "auxiliary_loss_clip": 0.06400861, + "auxiliary_loss_mlp": 0.01265802, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01256629, + "epoch": 0.7884563354877498, + "flos": 15308031392640.0, + "grad_norm": 2.191522970823139, + "language_loss": 0.58949661, + "learning_rate": 4.5124174933361e-07, + "loss": 0.66616333, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.0916748, + "step": 13114, + "time_per_iteration": 2.499992609024048 + }, + { + "auxiliary_loss_clip": 0.06405228, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01254063, + "epoch": 0.7885164587404179, + "flos": 24395024252160.0, + "grad_norm": 2.5351098559279452, + "language_loss": 0.67195284, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.74864221, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09649658, + "step": 13115, + "time_per_iteration": 2.6665830612182617 + }, + { + "auxiliary_loss_clip": 0.06404252, + "auxiliary_loss_mlp": 0.0126713, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.0125732, + "epoch": 0.7885765819930858, + "flos": 14390047987200.0, + "grad_norm": 1.969107246296687, + "language_loss": 0.8892082, + "learning_rate": 4.50749024954048e-07, + "loss": 0.965922, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0980835, + "step": 13116, + "time_per_iteration": 2.488569498062134 + }, + { + "auxiliary_loss_clip": 0.06413092, + "auxiliary_loss_mlp": 0.0126853, + "balance_loss_clip": 0.06272166, + "balance_loss_mlp": 0.01257551, + "epoch": 0.7886367052457538, + "flos": 18265835393280.0, + "grad_norm": 2.2399693742143296, + "language_loss": 0.73226219, + "learning_rate": 4.505027508812245e-07, + "loss": 0.80907845, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10986328, + "step": 13117, + "time_per_iteration": 2.4811642169952393 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01262163, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.0125355, + "epoch": 0.7886968284984217, + "flos": 15310588942080.0, + "grad_norm": 1.3858230532181541, + "language_loss": 0.80464065, + "learning_rate": 4.502565355654926e-07, + "loss": 0.88125694, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08612061, + "step": 13118, + "time_per_iteration": 2.486297369003296 + }, + { + "auxiliary_loss_clip": 0.06400422, + "auxiliary_loss_mlp": 0.01266146, + "balance_loss_clip": 0.06270169, + "balance_loss_mlp": 0.01256538, + "epoch": 0.7887569517510897, + "flos": 21221878458240.0, + "grad_norm": 1.766770664669928, + "language_loss": 0.7323485, + "learning_rate": 4.500103790161878e-07, + "loss": 0.80901414, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09613037, + "step": 13119, + "time_per_iteration": 2.4904284477233887 + }, + { + "auxiliary_loss_clip": 0.06406539, + "auxiliary_loss_mlp": 0.01262086, + "balance_loss_clip": 0.06272633, + "balance_loss_mlp": 0.01253146, + "epoch": 0.7888170750037578, + "flos": 22717894055040.0, + "grad_norm": 1.2838410999725969, + "language_loss": 0.7203325, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.79701877, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.0894165, + "step": 13120, + "time_per_iteration": 2.531905174255371 + }, + { + "auxiliary_loss_clip": 0.06402384, + "auxiliary_loss_mlp": 0.01269736, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01259919, + "epoch": 0.7888771982564257, + "flos": 36437976322560.0, + "grad_norm": 1.5849995361084, + "language_loss": 0.79042959, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.86715084, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0980835, + "step": 13121, + "time_per_iteration": 2.6270458698272705 + }, + { + "auxiliary_loss_clip": 0.06399482, + "auxiliary_loss_mlp": 0.01265138, + "balance_loss_clip": 0.06271152, + "balance_loss_mlp": 0.01255524, + "epoch": 0.7889373215090937, + "flos": 27317678664960.0, + "grad_norm": 1.3500924966016437, + "language_loss": 0.80276608, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.87941229, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09613037, + "step": 13122, + "time_per_iteration": 2.5672237873077393 + }, + { + "auxiliary_loss_clip": 0.06403008, + "auxiliary_loss_mlp": 0.01263927, + "balance_loss_clip": 0.06269404, + "balance_loss_mlp": 0.0125526, + "epoch": 0.7889974447617616, + "flos": 19835210839680.0, + "grad_norm": 1.809945605348313, + "language_loss": 0.78323883, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.85990816, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08666992, + "step": 13123, + "time_per_iteration": 2.5139808654785156 + }, + { + "auxiliary_loss_clip": 0.06405288, + "auxiliary_loss_mlp": 0.01262619, + "balance_loss_clip": 0.06270181, + "balance_loss_mlp": 0.01253154, + "epoch": 0.7890575680144296, + "flos": 17276336928000.0, + "grad_norm": 3.407845901525998, + "language_loss": 0.67230475, + "learning_rate": 4.487804780926985e-07, + "loss": 0.7489838, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09466553, + "step": 13124, + "time_per_iteration": 3.877263069152832 + }, + { + "auxiliary_loss_clip": 0.06410992, + "auxiliary_loss_mlp": 0.01265224, + "balance_loss_clip": 0.06275047, + "balance_loss_mlp": 0.01255598, + "epoch": 0.7891176912670975, + "flos": 27607596192000.0, + "grad_norm": 2.1455737597716995, + "language_loss": 0.73154545, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.80830753, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09619141, + "step": 13125, + "time_per_iteration": 2.5944886207580566 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.01261205, + "balance_loss_clip": 0.06267411, + "balance_loss_mlp": 0.01251728, + "epoch": 0.7891778145197655, + "flos": 22718397179520.0, + "grad_norm": 1.8448957307034948, + "language_loss": 0.73224074, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.80887532, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09472656, + "step": 13126, + "time_per_iteration": 2.6197116374969482 + }, + { + "auxiliary_loss_clip": 0.06406458, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.0125604, + "epoch": 0.7892379377724335, + "flos": 17316433906560.0, + "grad_norm": 1.6718073300601826, + "language_loss": 0.77387738, + "learning_rate": 4.480432433327845e-07, + "loss": 0.85059547, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09301758, + "step": 13127, + "time_per_iteration": 2.475583553314209 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.01266293, + "balance_loss_clip": 0.06270358, + "balance_loss_mlp": 0.01256649, + "epoch": 0.7892980610251015, + "flos": 25782781973760.0, + "grad_norm": 1.6570002472061196, + "language_loss": 0.85693359, + "learning_rate": 4.47797616101103e-07, + "loss": 0.93357939, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09643555, + "step": 13128, + "time_per_iteration": 2.506098508834839 + }, + { + "auxiliary_loss_clip": 0.06401196, + "auxiliary_loss_mlp": 0.01265664, + "balance_loss_clip": 0.06271003, + "balance_loss_mlp": 0.01256634, + "epoch": 0.7893581842777694, + "flos": 21586371719040.0, + "grad_norm": 1.9505455740147257, + "language_loss": 0.69738185, + "learning_rate": 4.475520477290904e-07, + "loss": 0.77405041, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09033203, + "step": 13129, + "time_per_iteration": 2.492781400680542 + }, + { + "auxiliary_loss_clip": 0.06314191, + "auxiliary_loss_mlp": 0.01255045, + "balance_loss_clip": 0.06258637, + "balance_loss_mlp": 0.01254005, + "epoch": 0.7894183075304374, + "flos": 69037773793920.0, + "grad_norm": 0.7003894761434999, + "language_loss": 0.61533356, + "learning_rate": 4.473065382260597e-07, + "loss": 0.69102591, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01041412, + "step": 13130, + "time_per_iteration": 3.109016180038452 + }, + { + "auxiliary_loss_clip": 0.06405208, + "auxiliary_loss_mlp": 0.01262252, + "balance_loss_clip": 0.06272055, + "balance_loss_mlp": 0.01252686, + "epoch": 0.7894784307831053, + "flos": 24250107415680.0, + "grad_norm": 1.475922878769178, + "language_loss": 0.74187315, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.81854773, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09564209, + "step": 13131, + "time_per_iteration": 2.526529312133789 + }, + { + "auxiliary_loss_clip": 0.06417171, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 0.06273621, + "balance_loss_mlp": 0.01255297, + "epoch": 0.7895385540357733, + "flos": 20272770460800.0, + "grad_norm": 15.433314794516651, + "language_loss": 0.69895113, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.77578956, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.11376953, + "step": 13132, + "time_per_iteration": 2.5669658184051514 + }, + { + "auxiliary_loss_clip": 0.06403211, + "auxiliary_loss_mlp": 0.01266676, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.01256573, + "epoch": 0.7895986772884414, + "flos": 21002972866560.0, + "grad_norm": 2.4066374074433186, + "language_loss": 0.61959308, + "learning_rate": 4.465703630239468e-07, + "loss": 0.69629192, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10113525, + "step": 13133, + "time_per_iteration": 2.4860470294952393 + }, + { + "auxiliary_loss_clip": 0.06406127, + "auxiliary_loss_mlp": 0.01270355, + "balance_loss_clip": 0.06272439, + "balance_loss_mlp": 0.01259644, + "epoch": 0.7896588005411093, + "flos": 18663423816960.0, + "grad_norm": 2.0571343653676326, + "language_loss": 0.8017205, + "learning_rate": 4.463250890899195e-07, + "loss": 0.87848526, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10717773, + "step": 13134, + "time_per_iteration": 3.9168148040771484 + }, + { + "auxiliary_loss_clip": 0.06404164, + "auxiliary_loss_mlp": 0.01263167, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01254059, + "epoch": 0.7897189237937773, + "flos": 18411842332800.0, + "grad_norm": 2.033133539223884, + "language_loss": 0.80772352, + "learning_rate": 4.460798740713998e-07, + "loss": 0.88439691, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09112549, + "step": 13135, + "time_per_iteration": 2.4654078483581543 + }, + { + "auxiliary_loss_clip": 0.06399068, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06268865, + "balance_loss_mlp": 0.01253089, + "epoch": 0.7897790470464452, + "flos": 23738223623040.0, + "grad_norm": 1.6530850460824498, + "language_loss": 0.72782981, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.80445212, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10076904, + "step": 13136, + "time_per_iteration": 2.5253071784973145 + }, + { + "auxiliary_loss_clip": 0.06410457, + "auxiliary_loss_mlp": 0.01263296, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01252222, + "epoch": 0.7898391702991132, + "flos": 15923477232000.0, + "grad_norm": 2.3537390068214656, + "language_loss": 0.70506489, + "learning_rate": 4.455896208180778e-07, + "loss": 0.78180242, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.11077881, + "step": 13137, + "time_per_iteration": 2.468620777130127 + }, + { + "auxiliary_loss_clip": 0.06401488, + "auxiliary_loss_mlp": 0.01264377, + "balance_loss_clip": 0.06271732, + "balance_loss_mlp": 0.01254506, + "epoch": 0.7898992935517811, + "flos": 19835252766720.0, + "grad_norm": 1.578942697411419, + "language_loss": 0.74176329, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.81842196, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09869385, + "step": 13138, + "time_per_iteration": 3.9565515518188477 + }, + { + "auxiliary_loss_clip": 0.0640148, + "auxiliary_loss_mlp": 0.0126554, + "balance_loss_clip": 0.06271301, + "balance_loss_mlp": 0.01256271, + "epoch": 0.7899594168044491, + "flos": 16221738240000.0, + "grad_norm": 1.9480374334640547, + "language_loss": 0.686391, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.76306117, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09277344, + "step": 13139, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.06313749, + "auxiliary_loss_mlp": 0.01251905, + "balance_loss_clip": 0.06258325, + "balance_loss_mlp": 0.01250762, + "epoch": 0.790019540057117, + "flos": 68353496225280.0, + "grad_norm": 0.8282799229852567, + "language_loss": 0.60166419, + "learning_rate": 4.448546830368003e-07, + "loss": 0.67732072, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01143646, + "step": 13140, + "time_per_iteration": 3.181234359741211 + }, + { + "auxiliary_loss_clip": 0.06408462, + "auxiliary_loss_mlp": 0.01266869, + "balance_loss_clip": 0.06275274, + "balance_loss_mlp": 0.01257619, + "epoch": 0.7900796633097851, + "flos": 30340037836800.0, + "grad_norm": 1.5194345427413907, + "language_loss": 0.76587826, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.84263158, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09259033, + "step": 13141, + "time_per_iteration": 2.5935022830963135 + }, + { + "auxiliary_loss_clip": 0.06406665, + "auxiliary_loss_mlp": 0.01265708, + "balance_loss_clip": 0.06272526, + "balance_loss_mlp": 0.01255421, + "epoch": 0.790139786562453, + "flos": 22133237391360.0, + "grad_norm": 1.706504607669126, + "language_loss": 0.68517488, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.76189852, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10290527, + "step": 13142, + "time_per_iteration": 3.9123146533966064 + }, + { + "auxiliary_loss_clip": 0.06313135, + "auxiliary_loss_mlp": 0.01253569, + "balance_loss_clip": 0.06257692, + "balance_loss_mlp": 0.01252476, + "epoch": 0.790199909815121, + "flos": 58225210277760.0, + "grad_norm": 0.7895590429355487, + "language_loss": 0.59896362, + "learning_rate": 4.441202759969049e-07, + "loss": 0.6746307, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01094818, + "step": 13143, + "time_per_iteration": 2.9545323848724365 + }, + { + "auxiliary_loss_clip": 0.06407971, + "auxiliary_loss_mlp": 0.01265938, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01255495, + "epoch": 0.7902600330677889, + "flos": 34542066314880.0, + "grad_norm": 1.4595073006493966, + "language_loss": 0.74559182, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.82233089, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10443115, + "step": 13144, + "time_per_iteration": 2.6375374794006348 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01260537, + "epoch": 0.7903201563204569, + "flos": 22352981523840.0, + "grad_norm": 1.6890449908385896, + "language_loss": 0.83446616, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.91123205, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10424805, + "step": 13145, + "time_per_iteration": 2.499363660812378 + }, + { + "auxiliary_loss_clip": 0.06395718, + "auxiliary_loss_mlp": 0.01261823, + "balance_loss_clip": 0.06268772, + "balance_loss_mlp": 0.01252971, + "epoch": 0.790380279573125, + "flos": 22059919468800.0, + "grad_norm": 1.6613829846262294, + "language_loss": 0.7342999, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.81087536, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08862305, + "step": 13146, + "time_per_iteration": 2.515782356262207 + }, + { + "auxiliary_loss_clip": 0.06406832, + "auxiliary_loss_mlp": 0.01262426, + "balance_loss_clip": 0.0627181, + "balance_loss_mlp": 0.01252704, + "epoch": 0.7904404028257929, + "flos": 20308758589440.0, + "grad_norm": 1.836231171589266, + "language_loss": 0.76197815, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.83867073, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09716797, + "step": 13147, + "time_per_iteration": 2.4807651042938232 + }, + { + "auxiliary_loss_clip": 0.06400219, + "auxiliary_loss_mlp": 0.01266803, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01256921, + "epoch": 0.7905005260784609, + "flos": 20014732212480.0, + "grad_norm": 1.7419913226116706, + "language_loss": 0.72276485, + "learning_rate": 4.428974443697087e-07, + "loss": 0.79943514, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09881592, + "step": 13148, + "time_per_iteration": 2.506728410720825 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01264165, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01253782, + "epoch": 0.7905606493311288, + "flos": 26913088425600.0, + "grad_norm": 1.5866446208537701, + "language_loss": 0.71421397, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.79092121, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1038208, + "step": 13149, + "time_per_iteration": 2.5299153327941895 + }, + { + "auxiliary_loss_clip": 0.0640769, + "auxiliary_loss_mlp": 0.01263913, + "balance_loss_clip": 0.0627196, + "balance_loss_mlp": 0.01253417, + "epoch": 0.7906207725837968, + "flos": 23703032108160.0, + "grad_norm": 2.1166900358706138, + "language_loss": 0.65887839, + "learning_rate": 4.424087249723225e-07, + "loss": 0.73559439, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10498047, + "step": 13150, + "time_per_iteration": 2.5118424892425537 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01254171, + "epoch": 0.7906808958364647, + "flos": 20854911502080.0, + "grad_norm": 1.5600793718059285, + "language_loss": 0.70213783, + "learning_rate": 4.421644538650231e-07, + "loss": 0.77878249, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09790039, + "step": 13151, + "time_per_iteration": 2.479990243911743 + }, + { + "auxiliary_loss_clip": 0.06407944, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06272637, + "balance_loss_mlp": 0.01254682, + "epoch": 0.7907410190891327, + "flos": 40744866585600.0, + "grad_norm": 1.3436721274508034, + "language_loss": 0.70374179, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.78046679, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09875488, + "step": 13152, + "time_per_iteration": 2.66023850440979 + }, + { + "auxiliary_loss_clip": 0.06400564, + "auxiliary_loss_mlp": 0.01268098, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01258919, + "epoch": 0.7908011423418007, + "flos": 13266198299520.0, + "grad_norm": 1.733827476588534, + "language_loss": 0.72901142, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.8056981, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09179688, + "step": 13153, + "time_per_iteration": 2.4535181522369385 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06268425, + "balance_loss_mlp": 0.01256502, + "epoch": 0.7908612655944687, + "flos": 19760718960000.0, + "grad_norm": 1.4410962438109587, + "language_loss": 0.78749764, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.86417866, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10009766, + "step": 13154, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.06410754, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06270463, + "balance_loss_mlp": 0.01252286, + "epoch": 0.7909213888471366, + "flos": 21294064350720.0, + "grad_norm": 1.8857519871038082, + "language_loss": 0.70335776, + "learning_rate": 4.411879602612185e-07, + "loss": 0.78009582, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.10778809, + "step": 13155, + "time_per_iteration": 2.474088668823242 + }, + { + "auxiliary_loss_clip": 0.06405213, + "auxiliary_loss_mlp": 0.01266856, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01257069, + "epoch": 0.7909815120998046, + "flos": 22535521643520.0, + "grad_norm": 2.510036385951424, + "language_loss": 0.77293575, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.8496564, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09790039, + "step": 13156, + "time_per_iteration": 2.513814926147461 + }, + { + "auxiliary_loss_clip": 0.06403618, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271893, + "balance_loss_mlp": 0.01254185, + "epoch": 0.7910416353524725, + "flos": 26735537623680.0, + "grad_norm": 1.591424288088247, + "language_loss": 0.65432274, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.73099172, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09088135, + "step": 13157, + "time_per_iteration": 2.534609079360962 + }, + { + "auxiliary_loss_clip": 0.0640482, + "auxiliary_loss_mlp": 0.01266464, + "balance_loss_clip": 0.06269716, + "balance_loss_mlp": 0.01256272, + "epoch": 0.7911017586051405, + "flos": 24651804689280.0, + "grad_norm": 2.191693050285661, + "language_loss": 0.7477805, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.82449341, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10192871, + "step": 13158, + "time_per_iteration": 2.5379066467285156 + }, + { + "auxiliary_loss_clip": 0.06396219, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 0.06268845, + "balance_loss_mlp": 0.01255116, + "epoch": 0.7911618818578086, + "flos": 17571076064640.0, + "grad_norm": 1.9112834208400953, + "language_loss": 0.67451692, + "learning_rate": 4.40212412422309e-07, + "loss": 0.75111789, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08764648, + "step": 13159, + "time_per_iteration": 2.464768171310425 + }, + { + "auxiliary_loss_clip": 0.06400043, + "auxiliary_loss_mlp": 0.01266297, + "balance_loss_clip": 0.06269793, + "balance_loss_mlp": 0.0125645, + "epoch": 0.7912220051104765, + "flos": 16726326727680.0, + "grad_norm": 1.6817860395466344, + "language_loss": 0.67496979, + "learning_rate": 4.399686733077206e-07, + "loss": 0.75163317, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09838867, + "step": 13160, + "time_per_iteration": 2.5563478469848633 + }, + { + "auxiliary_loss_clip": 0.0639656, + "auxiliary_loss_mlp": 0.01260248, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01252225, + "epoch": 0.7912821283631445, + "flos": 13703799847680.0, + "grad_norm": 1.7956028234892243, + "language_loss": 0.73223495, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.80880302, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08007812, + "step": 13161, + "time_per_iteration": 2.449843406677246 + }, + { + "auxiliary_loss_clip": 0.0639775, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 0.06270458, + "balance_loss_mlp": 0.01255142, + "epoch": 0.7913422516158124, + "flos": 23775804979200.0, + "grad_norm": 1.579946795431406, + "language_loss": 0.73348385, + "learning_rate": 4.39481372557418e-07, + "loss": 0.81010681, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09399414, + "step": 13162, + "time_per_iteration": 2.538973093032837 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.01265697, + "balance_loss_clip": 0.06272799, + "balance_loss_mlp": 0.01255326, + "epoch": 0.7914023748684804, + "flos": 19944433036800.0, + "grad_norm": 3.1550813809291127, + "language_loss": 0.72027671, + "learning_rate": 4.392378109401811e-07, + "loss": 0.79701531, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10369873, + "step": 13163, + "time_per_iteration": 2.481580972671509 + }, + { + "auxiliary_loss_clip": 0.06402975, + "auxiliary_loss_mlp": 0.01263483, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.01253315, + "epoch": 0.7914624981211483, + "flos": 20601065957760.0, + "grad_norm": 1.7688129227744467, + "language_loss": 0.69559741, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.77226198, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.10168457, + "step": 13164, + "time_per_iteration": 3.9441864490509033 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.0626981, + "balance_loss_mlp": 0.01253058, + "epoch": 0.7915226213738163, + "flos": 21806031997440.0, + "grad_norm": 1.639968913344359, + "language_loss": 0.66723585, + "learning_rate": 4.387508652677177e-07, + "loss": 0.74385864, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09088135, + "step": 13165, + "time_per_iteration": 2.480177164077759 + }, + { + "auxiliary_loss_clip": 0.06395824, + "auxiliary_loss_mlp": 0.01263637, + "balance_loss_clip": 0.06268749, + "balance_loss_mlp": 0.01254887, + "epoch": 0.7915827446264843, + "flos": 16293714497280.0, + "grad_norm": 1.7980788419504534, + "language_loss": 0.72814763, + "learning_rate": 4.385074812309557e-07, + "loss": 0.80474222, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08758545, + "step": 13166, + "time_per_iteration": 2.5405478477478027 + }, + { + "auxiliary_loss_clip": 0.06400768, + "auxiliary_loss_mlp": 0.01267015, + "balance_loss_clip": 0.06271509, + "balance_loss_mlp": 0.01256602, + "epoch": 0.7916428678791523, + "flos": 25709673686400.0, + "grad_norm": 1.5950499739045652, + "language_loss": 0.77752012, + "learning_rate": 4.382641564061462e-07, + "loss": 0.85419798, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.10412598, + "step": 13167, + "time_per_iteration": 2.513096332550049 + }, + { + "auxiliary_loss_clip": 0.06400877, + "auxiliary_loss_mlp": 0.01265571, + "balance_loss_clip": 0.0627252, + "balance_loss_mlp": 0.01256553, + "epoch": 0.7917029911318202, + "flos": 23885320665600.0, + "grad_norm": 1.5971175695751862, + "language_loss": 0.84140885, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.9180733, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09020996, + "step": 13168, + "time_per_iteration": 2.5276131629943848 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06270839, + "balance_loss_mlp": 0.01254939, + "epoch": 0.7917631143844882, + "flos": 21651975066240.0, + "grad_norm": 1.4948037375095564, + "language_loss": 0.72659689, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.8032676, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09820557, + "step": 13169, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.06405612, + "auxiliary_loss_mlp": 0.01262617, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01252794, + "epoch": 0.7918232376371561, + "flos": 38883519187200.0, + "grad_norm": 1.931209408255316, + "language_loss": 0.674968, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.75165027, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09820557, + "step": 13170, + "time_per_iteration": 2.632267951965332 + }, + { + "auxiliary_loss_clip": 0.06402327, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01255108, + "epoch": 0.7918833608898241, + "flos": 20781551652480.0, + "grad_norm": 1.5871676794676228, + "language_loss": 0.70988441, + "learning_rate": 4.372914494109412e-07, + "loss": 0.7865442, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08544922, + "step": 13171, + "time_per_iteration": 2.510680675506592 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01267973, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.0125855, + "epoch": 0.7919434841424922, + "flos": 33918276994560.0, + "grad_norm": 2.589962482835532, + "language_loss": 0.67366862, + "learning_rate": 4.370484207842553e-07, + "loss": 0.75036865, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09417725, + "step": 13172, + "time_per_iteration": 2.6106696128845215 + }, + { + "auxiliary_loss_clip": 0.06403903, + "auxiliary_loss_mlp": 0.0126396, + "balance_loss_clip": 0.06273881, + "balance_loss_mlp": 0.01254209, + "epoch": 0.7920036073951601, + "flos": 21070253295360.0, + "grad_norm": 1.738065699124664, + "language_loss": 0.80093193, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.87761056, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09753418, + "step": 13173, + "time_per_iteration": 3.950551986694336 + }, + { + "auxiliary_loss_clip": 0.06400689, + "auxiliary_loss_mlp": 0.01261307, + "balance_loss_clip": 0.06269704, + "balance_loss_mlp": 0.01252307, + "epoch": 0.7920637306478281, + "flos": 23662138515840.0, + "grad_norm": 1.8426798849917176, + "language_loss": 0.77325201, + "learning_rate": 4.365625413419365e-07, + "loss": 0.84987199, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09002686, + "step": 13174, + "time_per_iteration": 2.591482639312744 + }, + { + "auxiliary_loss_clip": 0.06398596, + "auxiliary_loss_mlp": 0.01261992, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01253219, + "epoch": 0.792123853900496, + "flos": 27202251265920.0, + "grad_norm": 1.5031237737360255, + "language_loss": 0.71669394, + "learning_rate": 4.363196905447297e-07, + "loss": 0.79329979, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08770752, + "step": 13175, + "time_per_iteration": 2.587193489074707 + }, + { + "auxiliary_loss_clip": 0.06401914, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 0.06270476, + "balance_loss_mlp": 0.01254601, + "epoch": 0.792183977153164, + "flos": 19104631090560.0, + "grad_norm": 1.9608803410251472, + "language_loss": 0.59982938, + "learning_rate": 4.360768990424364e-07, + "loss": 0.67648464, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09014893, + "step": 13176, + "time_per_iteration": 2.4545774459838867 + }, + { + "auxiliary_loss_clip": 0.06398389, + "auxiliary_loss_mlp": 0.01268261, + "balance_loss_clip": 0.06270067, + "balance_loss_mlp": 0.01258635, + "epoch": 0.7922441004058319, + "flos": 17134564619520.0, + "grad_norm": 1.8342420107617015, + "language_loss": 0.73352873, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.81019521, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09619141, + "step": 13177, + "time_per_iteration": 3.9278790950775146 + }, + { + "auxiliary_loss_clip": 0.06401221, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01254475, + "epoch": 0.7923042236585, + "flos": 17827395304320.0, + "grad_norm": 1.8523697538025845, + "language_loss": 0.64460981, + "learning_rate": 4.355914939594174e-07, + "loss": 0.72125828, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09155273, + "step": 13178, + "time_per_iteration": 2.464949131011963 + }, + { + "auxiliary_loss_clip": 0.06402718, + "auxiliary_loss_mlp": 0.01261465, + "balance_loss_clip": 0.06270282, + "balance_loss_mlp": 0.01252804, + "epoch": 0.7923643469111679, + "flos": 29943036391680.0, + "grad_norm": 1.8056668444425423, + "language_loss": 0.69007665, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.76671851, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08660889, + "step": 13179, + "time_per_iteration": 2.560208559036255 + }, + { + "auxiliary_loss_clip": 0.06402154, + "auxiliary_loss_mlp": 0.01265495, + "balance_loss_clip": 0.06272629, + "balance_loss_mlp": 0.01256155, + "epoch": 0.7924244701638359, + "flos": 22681360874880.0, + "grad_norm": 2.1905203910288105, + "language_loss": 0.74228048, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.81895697, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09332275, + "step": 13180, + "time_per_iteration": 2.5125856399536133 + }, + { + "auxiliary_loss_clip": 0.06402977, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01254729, + "epoch": 0.7924845934165038, + "flos": 17974031149440.0, + "grad_norm": 2.3420456225908524, + "language_loss": 0.81796247, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.89464545, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10595703, + "step": 13181, + "time_per_iteration": 2.4527087211608887 + }, + { + "auxiliary_loss_clip": 0.06399131, + "auxiliary_loss_mlp": 0.01263297, + "balance_loss_clip": 0.06270739, + "balance_loss_mlp": 0.01253791, + "epoch": 0.7925447166691718, + "flos": 23483665319040.0, + "grad_norm": 1.8219768185370055, + "language_loss": 0.7760042, + "learning_rate": 4.346213957372895e-07, + "loss": 0.85262847, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09509277, + "step": 13182, + "time_per_iteration": 4.028662919998169 + }, + { + "auxiliary_loss_clip": 0.06410173, + "auxiliary_loss_mlp": 0.01265893, + "balance_loss_clip": 0.06274082, + "balance_loss_mlp": 0.01254866, + "epoch": 0.7926048399218397, + "flos": 20453591571840.0, + "grad_norm": 1.6188805399457735, + "language_loss": 0.74277139, + "learning_rate": 4.34379019557056e-07, + "loss": 0.8195321, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.11029053, + "step": 13183, + "time_per_iteration": 2.4738929271698 + }, + { + "auxiliary_loss_clip": 0.06403777, + "auxiliary_loss_mlp": 0.01263216, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7926649631745077, + "flos": 37169184977280.0, + "grad_norm": 1.7084157774544453, + "language_loss": 0.68652374, + "learning_rate": 4.341367027453264e-07, + "loss": 0.76319367, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08978271, + "step": 13184, + "time_per_iteration": 2.6054959297180176 + }, + { + "auxiliary_loss_clip": 0.06404284, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06271448, + "balance_loss_mlp": 0.01254082, + "epoch": 0.7927250864271758, + "flos": 17024168465280.0, + "grad_norm": 1.8074716343378143, + "language_loss": 0.71104252, + "learning_rate": 4.338944453112907e-07, + "loss": 0.78772175, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09558105, + "step": 13185, + "time_per_iteration": 2.457500696182251 + }, + { + "auxiliary_loss_clip": 0.06404824, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.06271466, + "balance_loss_mlp": 0.01254377, + "epoch": 0.7927852096798437, + "flos": 17755041703680.0, + "grad_norm": 2.0425556514381777, + "language_loss": 0.65721595, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.73390174, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 13186, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.06399564, + "auxiliary_loss_mlp": 0.01262578, + "balance_loss_clip": 0.06271927, + "balance_loss_mlp": 0.01253965, + "epoch": 0.7928453329325117, + "flos": 23844636708480.0, + "grad_norm": 1.452369328079203, + "language_loss": 0.77105349, + "learning_rate": 4.334101086130408e-07, + "loss": 0.84767497, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08612061, + "step": 13187, + "time_per_iteration": 2.512676239013672 + }, + { + "auxiliary_loss_clip": 0.06400672, + "auxiliary_loss_mlp": 0.01265003, + "balance_loss_clip": 0.06270963, + "balance_loss_mlp": 0.01255741, + "epoch": 0.7929054561851796, + "flos": 17460302567040.0, + "grad_norm": 1.9206985573704325, + "language_loss": 0.72777045, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.80442715, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09259033, + "step": 13188, + "time_per_iteration": 2.4961729049682617 + }, + { + "auxiliary_loss_clip": 0.06405029, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271419, + "balance_loss_mlp": 0.01254633, + "epoch": 0.7929655794378476, + "flos": 21987775503360.0, + "grad_norm": 2.0256790948802066, + "language_loss": 0.63584489, + "learning_rate": 4.329260095357725e-07, + "loss": 0.71255124, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10980225, + "step": 13189, + "time_per_iteration": 2.481018304824829 + }, + { + "auxiliary_loss_clip": 0.06406255, + "auxiliary_loss_mlp": 0.01267784, + "balance_loss_clip": 0.06275403, + "balance_loss_mlp": 0.01258539, + "epoch": 0.7930257026905155, + "flos": 17279523383040.0, + "grad_norm": 2.1940059966398557, + "language_loss": 0.72796714, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.80470747, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09240723, + "step": 13190, + "time_per_iteration": 2.489017963409424 + }, + { + "auxiliary_loss_clip": 0.06397982, + "auxiliary_loss_mlp": 0.01262706, + "balance_loss_clip": 0.06271739, + "balance_loss_mlp": 0.01254487, + "epoch": 0.7930858259431836, + "flos": 27306693780480.0, + "grad_norm": 2.0481734999626213, + "language_loss": 0.73499632, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.81160319, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08221436, + "step": 13191, + "time_per_iteration": 2.523073196411133 + }, + { + "auxiliary_loss_clip": 0.06402196, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01254915, + "epoch": 0.7931459491958515, + "flos": 19869647667840.0, + "grad_norm": 1.6892778710359044, + "language_loss": 0.69173294, + "learning_rate": 4.322003066198219e-07, + "loss": 0.76839757, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13192, + "time_per_iteration": 2.4932494163513184 + }, + { + "auxiliary_loss_clip": 0.06401037, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01254395, + "epoch": 0.7932060724485195, + "flos": 23153525032320.0, + "grad_norm": 1.5309974551938075, + "language_loss": 0.75287253, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.82951844, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.0914917, + "step": 13193, + "time_per_iteration": 2.4988462924957275 + }, + { + "auxiliary_loss_clip": 0.0639962, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.06269534, + "balance_loss_mlp": 0.01253216, + "epoch": 0.7932661957011874, + "flos": 29942617121280.0, + "grad_norm": 1.4608356167152348, + "language_loss": 0.72191167, + "learning_rate": 4.317168019161741e-07, + "loss": 0.7985428, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.1026001, + "step": 13194, + "time_per_iteration": 2.545863151550293 + }, + { + "auxiliary_loss_clip": 0.06407529, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06271923, + "balance_loss_mlp": 0.0125323, + "epoch": 0.7933263189538554, + "flos": 22564717591680.0, + "grad_norm": 1.9164119447525156, + "language_loss": 0.70693266, + "learning_rate": 4.314751387639517e-07, + "loss": 0.78364033, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10015869, + "step": 13195, + "time_per_iteration": 2.478484869003296 + }, + { + "auxiliary_loss_clip": 0.06403863, + "auxiliary_loss_mlp": 0.0126619, + "balance_loss_clip": 0.06272461, + "balance_loss_mlp": 0.0125679, + "epoch": 0.7933864422065233, + "flos": 25485317579520.0, + "grad_norm": 1.4419483453830304, + "language_loss": 0.77285999, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.8495605, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09411621, + "step": 13196, + "time_per_iteration": 2.5209035873413086 + }, + { + "auxiliary_loss_clip": 0.06408395, + "auxiliary_loss_mlp": 0.01271096, + "balance_loss_clip": 0.06274862, + "balance_loss_mlp": 0.01261196, + "epoch": 0.7934465654591913, + "flos": 33591490871040.0, + "grad_norm": 1.6476530892648569, + "language_loss": 0.6925202, + "learning_rate": 4.309919909045268e-07, + "loss": 0.76931512, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09899902, + "step": 13197, + "time_per_iteration": 2.6008334159851074 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.012638, + "balance_loss_clip": 0.06270218, + "balance_loss_mlp": 0.01254281, + "epoch": 0.7935066887118594, + "flos": 31440854851200.0, + "grad_norm": 1.7257166200150085, + "language_loss": 0.65332729, + "learning_rate": 4.30750506215646e-07, + "loss": 0.72997743, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09521484, + "step": 13198, + "time_per_iteration": 2.5760626792907715 + }, + { + "auxiliary_loss_clip": 0.06407583, + "auxiliary_loss_mlp": 0.01266914, + "balance_loss_clip": 0.06272698, + "balance_loss_mlp": 0.0125696, + "epoch": 0.7935668119645273, + "flos": 14687638162560.0, + "grad_norm": 1.9381240473938566, + "language_loss": 0.72217059, + "learning_rate": 4.30509081032864e-07, + "loss": 0.79891551, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.0994873, + "step": 13199, + "time_per_iteration": 2.4537320137023926 + }, + { + "auxiliary_loss_clip": 0.06404065, + "auxiliary_loss_mlp": 0.01264064, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01254647, + "epoch": 0.7936269352171953, + "flos": 18010061205120.0, + "grad_norm": 1.8593669017855428, + "language_loss": 0.80699968, + "learning_rate": 4.302677153653349e-07, + "loss": 0.88368094, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09411621, + "step": 13200, + "time_per_iteration": 2.4965553283691406 + }, + { + "auxiliary_loss_clip": 0.06395376, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06269375, + "balance_loss_mlp": 0.01254527, + "epoch": 0.7936870584698632, + "flos": 18886228623360.0, + "grad_norm": 1.593396762237453, + "language_loss": 0.77522814, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.85182142, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09423828, + "step": 13201, + "time_per_iteration": 2.497309446334839 + }, + { + "auxiliary_loss_clip": 0.06399371, + "auxiliary_loss_mlp": 0.01265865, + "balance_loss_clip": 0.06270684, + "balance_loss_mlp": 0.01256149, + "epoch": 0.7937471817225312, + "flos": 23373604581120.0, + "grad_norm": 1.5839447213043625, + "language_loss": 0.67329711, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.74994946, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09716797, + "step": 13202, + "time_per_iteration": 2.5105254650115967 + }, + { + "auxiliary_loss_clip": 0.06399509, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06267376, + "balance_loss_mlp": 0.01257501, + "epoch": 0.7938073049751991, + "flos": 22681025458560.0, + "grad_norm": 1.8682622779044114, + "language_loss": 0.75083208, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.82750034, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09820557, + "step": 13203, + "time_per_iteration": 3.8750996589660645 + }, + { + "auxiliary_loss_clip": 0.06400256, + "auxiliary_loss_mlp": 0.01263086, + "balance_loss_clip": 0.06268462, + "balance_loss_mlp": 0.01253907, + "epoch": 0.7938674282278672, + "flos": 22857150741120.0, + "grad_norm": 1.6792002510464108, + "language_loss": 0.66683894, + "learning_rate": 4.293028480307643e-07, + "loss": 0.74347234, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09173584, + "step": 13204, + "time_per_iteration": 2.4866726398468018 + }, + { + "auxiliary_loss_clip": 0.0640104, + "auxiliary_loss_mlp": 0.01260862, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01252249, + "epoch": 0.7939275514805351, + "flos": 27019208021760.0, + "grad_norm": 1.3684183312797948, + "language_loss": 0.79726428, + "learning_rate": 4.290617800767438e-07, + "loss": 0.87388325, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08605957, + "step": 13205, + "time_per_iteration": 2.555922746658325 + }, + { + "auxiliary_loss_clip": 0.06398693, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01254596, + "epoch": 0.7939876747332031, + "flos": 21149315222400.0, + "grad_norm": 1.956372656118469, + "language_loss": 0.77988601, + "learning_rate": 4.28820771692858e-07, + "loss": 0.85651195, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09295654, + "step": 13206, + "time_per_iteration": 2.5223846435546875 + }, + { + "auxiliary_loss_clip": 0.06407081, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06272183, + "balance_loss_mlp": 0.01254638, + "epoch": 0.794047797985871, + "flos": 23294836143360.0, + "grad_norm": 2.5564565777737265, + "language_loss": 0.78640836, + "learning_rate": 4.285798228882456e-07, + "loss": 0.86312377, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.0982666, + "step": 13207, + "time_per_iteration": 2.5289721488952637 + }, + { + "auxiliary_loss_clip": 0.06401804, + "auxiliary_loss_mlp": 0.01266401, + "balance_loss_clip": 0.06270908, + "balance_loss_mlp": 0.01256679, + "epoch": 0.794107921238539, + "flos": 24614978019840.0, + "grad_norm": 1.988476360796287, + "language_loss": 0.84176642, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.91844845, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.097229, + "step": 13208, + "time_per_iteration": 2.5182619094848633 + }, + { + "auxiliary_loss_clip": 0.06307561, + "auxiliary_loss_mlp": 0.01252747, + "balance_loss_clip": 0.06251705, + "balance_loss_mlp": 0.01251759, + "epoch": 0.7941680444912069, + "flos": 64114641077760.0, + "grad_norm": 0.7251481470508581, + "language_loss": 0.58347547, + "learning_rate": 4.280981040533875e-07, + "loss": 0.65907854, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.56005859, + "router_z_loss_mlp": 0.00986481, + "step": 13209, + "time_per_iteration": 3.215669631958008 + }, + { + "auxiliary_loss_clip": 0.06411248, + "auxiliary_loss_mlp": 0.01263694, + "balance_loss_clip": 0.06275053, + "balance_loss_mlp": 0.01253753, + "epoch": 0.794228167743875, + "flos": 24395653157760.0, + "grad_norm": 2.3239436118534544, + "language_loss": 0.63244212, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.70919156, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09936523, + "step": 13210, + "time_per_iteration": 2.509675979614258 + }, + { + "auxiliary_loss_clip": 0.0639855, + "auxiliary_loss_mlp": 0.01264565, + "balance_loss_clip": 0.06268808, + "balance_loss_mlp": 0.01255135, + "epoch": 0.794288290996543, + "flos": 28520129082240.0, + "grad_norm": 1.5283303816318292, + "language_loss": 0.69651222, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.77314341, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09423828, + "step": 13211, + "time_per_iteration": 2.5609560012817383 + }, + { + "auxiliary_loss_clip": 0.0640647, + "auxiliary_loss_mlp": 0.01264423, + "balance_loss_clip": 0.06271889, + "balance_loss_mlp": 0.01253593, + "epoch": 0.7943484142492109, + "flos": 25929333964800.0, + "grad_norm": 1.5675650116890587, + "language_loss": 0.72487032, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.80157924, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10827637, + "step": 13212, + "time_per_iteration": 2.5255634784698486 + }, + { + "auxiliary_loss_clip": 0.06398303, + "auxiliary_loss_mlp": 0.0126368, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01254716, + "epoch": 0.7944085375018789, + "flos": 23922147335040.0, + "grad_norm": 1.6395336684596964, + "language_loss": 0.80590618, + "learning_rate": 4.271353817368246e-07, + "loss": 0.88252604, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08972168, + "step": 13213, + "time_per_iteration": 3.9452641010284424 + }, + { + "auxiliary_loss_clip": 0.06409128, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06274794, + "balance_loss_mlp": 0.0125316, + "epoch": 0.7944686607545468, + "flos": 20236153426560.0, + "grad_norm": 2.1556158344518463, + "language_loss": 0.67980099, + "learning_rate": 4.268948502428327e-07, + "loss": 0.75652432, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10046387, + "step": 13214, + "time_per_iteration": 2.5221662521362305 + }, + { + "auxiliary_loss_clip": 0.06399108, + "auxiliary_loss_mlp": 0.0126568, + "balance_loss_clip": 0.06270888, + "balance_loss_mlp": 0.01256215, + "epoch": 0.7945287840072148, + "flos": 21987440087040.0, + "grad_norm": 1.6557569175319402, + "language_loss": 0.72647429, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.80312216, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09454346, + "step": 13215, + "time_per_iteration": 2.482057809829712 + }, + { + "auxiliary_loss_clip": 0.06396606, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.0125512, + "epoch": 0.7945889072598827, + "flos": 26405229628800.0, + "grad_norm": 1.661805737915831, + "language_loss": 0.79503906, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.87165052, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09429932, + "step": 13216, + "time_per_iteration": 2.5464351177215576 + }, + { + "auxiliary_loss_clip": 0.06404807, + "auxiliary_loss_mlp": 0.01263362, + "balance_loss_clip": 0.06270844, + "balance_loss_mlp": 0.01253772, + "epoch": 0.7946490305125508, + "flos": 25817051093760.0, + "grad_norm": 1.6049687625888907, + "language_loss": 0.73967838, + "learning_rate": 4.261736137111598e-07, + "loss": 0.81636012, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09588623, + "step": 13217, + "time_per_iteration": 3.931478977203369 + }, + { + "auxiliary_loss_clip": 0.06401365, + "auxiliary_loss_mlp": 0.01263665, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.0125408, + "epoch": 0.7947091537652187, + "flos": 15966425249280.0, + "grad_norm": 1.8482353685704531, + "language_loss": 0.74055278, + "learning_rate": 4.259333208810907e-07, + "loss": 0.81720304, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09591675, + "step": 13218, + "time_per_iteration": 2.4553987979888916 + }, + { + "auxiliary_loss_clip": 0.06410147, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06273754, + "balance_loss_mlp": 0.0125424, + "epoch": 0.7947692770178867, + "flos": 18593753546880.0, + "grad_norm": 1.8816401972337626, + "language_loss": 0.83479667, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.91153485, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09442139, + "step": 13219, + "time_per_iteration": 2.44667911529541 + }, + { + "auxiliary_loss_clip": 0.06409134, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06272696, + "balance_loss_mlp": 0.01258064, + "epoch": 0.7948294002705546, + "flos": 20447344442880.0, + "grad_norm": 1.667648831846699, + "language_loss": 0.7587316, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.83550465, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10113525, + "step": 13220, + "time_per_iteration": 2.515125036239624 + }, + { + "auxiliary_loss_clip": 0.06413321, + "auxiliary_loss_mlp": 0.01262935, + "balance_loss_clip": 0.0627633, + "balance_loss_mlp": 0.01253041, + "epoch": 0.7948895235232226, + "flos": 38190436940160.0, + "grad_norm": 1.659539697860105, + "language_loss": 0.72439814, + "learning_rate": 4.252128005599176e-07, + "loss": 0.80116069, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09893799, + "step": 13221, + "time_per_iteration": 4.03423810005188 + }, + { + "auxiliary_loss_clip": 0.06401148, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.0627249, + "balance_loss_mlp": 0.01255052, + "epoch": 0.7949496467758905, + "flos": 15565231100160.0, + "grad_norm": 2.544368910491826, + "language_loss": 0.75068891, + "learning_rate": 4.249727465395634e-07, + "loss": 0.8273443, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09332275, + "step": 13222, + "time_per_iteration": 2.491516590118408 + }, + { + "auxiliary_loss_clip": 0.06308898, + "auxiliary_loss_mlp": 0.01254396, + "balance_loss_clip": 0.06253184, + "balance_loss_mlp": 0.01253385, + "epoch": 0.7950097700285585, + "flos": 70915864809600.0, + "grad_norm": 0.7838771916152429, + "language_loss": 0.66774839, + "learning_rate": 4.247327522443993e-07, + "loss": 0.74338138, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01010132, + "step": 13223, + "time_per_iteration": 3.031728744506836 + }, + { + "auxiliary_loss_clip": 0.06404258, + "auxiliary_loss_mlp": 0.01264069, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01253829, + "epoch": 0.7950698932812266, + "flos": 23958470880000.0, + "grad_norm": 1.6379349696855243, + "language_loss": 0.71398437, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.79066753, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10241699, + "step": 13224, + "time_per_iteration": 2.5175724029541016 + }, + { + "auxiliary_loss_clip": 0.06312153, + "auxiliary_loss_mlp": 0.01251169, + "balance_loss_clip": 0.06256486, + "balance_loss_mlp": 0.01250191, + "epoch": 0.7951300165338945, + "flos": 60300096606720.0, + "grad_norm": 0.6591691135419323, + "language_loss": 0.55062973, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.62626302, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00976562, + "step": 13225, + "time_per_iteration": 3.178450345993042 + }, + { + "auxiliary_loss_clip": 0.06401074, + "auxiliary_loss_mlp": 0.01261342, + "balance_loss_clip": 0.06272745, + "balance_loss_mlp": 0.01252884, + "epoch": 0.7951901397865625, + "flos": 22825397243520.0, + "grad_norm": 2.154430910035814, + "language_loss": 0.65301824, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.72964251, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08459473, + "step": 13226, + "time_per_iteration": 2.5249226093292236 + }, + { + "auxiliary_loss_clip": 0.06407489, + "auxiliary_loss_mlp": 0.0126573, + "balance_loss_clip": 0.06274739, + "balance_loss_mlp": 0.01256062, + "epoch": 0.7952502630392304, + "flos": 35703748920960.0, + "grad_norm": 2.011551916679729, + "language_loss": 0.70672739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.78345954, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09680176, + "step": 13227, + "time_per_iteration": 2.6486446857452393 + }, + { + "auxiliary_loss_clip": 0.06398386, + "auxiliary_loss_mlp": 0.01262858, + "balance_loss_clip": 0.06269887, + "balance_loss_mlp": 0.01254162, + "epoch": 0.7953103862918984, + "flos": 25636942742400.0, + "grad_norm": 1.7944937078069616, + "language_loss": 0.69723666, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.77384907, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08691406, + "step": 13228, + "time_per_iteration": 2.6445536613464355 + }, + { + "auxiliary_loss_clip": 0.06402546, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01255564, + "epoch": 0.7953705095445663, + "flos": 40561487925120.0, + "grad_norm": 1.474530595441345, + "language_loss": 0.70921922, + "learning_rate": 4.232940412119095e-07, + "loss": 0.78589594, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09558105, + "step": 13229, + "time_per_iteration": 2.6637799739837646 + }, + { + "auxiliary_loss_clip": 0.0641102, + "auxiliary_loss_mlp": 0.0126613, + "balance_loss_clip": 0.06274529, + "balance_loss_mlp": 0.01256063, + "epoch": 0.7954306327972344, + "flos": 27644129372160.0, + "grad_norm": 1.7873536766913725, + "language_loss": 0.71492708, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.79169858, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10076904, + "step": 13230, + "time_per_iteration": 2.574101209640503 + }, + { + "auxiliary_loss_clip": 0.06309671, + "auxiliary_loss_mlp": 0.01251481, + "balance_loss_clip": 0.06254265, + "balance_loss_mlp": 0.01250479, + "epoch": 0.7954907560499023, + "flos": 59525505936000.0, + "grad_norm": 0.8781067484442618, + "language_loss": 0.63612801, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.71173954, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0100174, + "step": 13231, + "time_per_iteration": 3.143348217010498 + }, + { + "auxiliary_loss_clip": 0.06401561, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01253615, + "epoch": 0.7955508793025703, + "flos": 20126721594240.0, + "grad_norm": 1.6206459895498453, + "language_loss": 0.69870329, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.77535492, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09991455, + "step": 13232, + "time_per_iteration": 2.534808874130249 + }, + { + "auxiliary_loss_clip": 0.06401277, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01253272, + "epoch": 0.7956110025552382, + "flos": 26512607036160.0, + "grad_norm": 1.7341819887914223, + "language_loss": 0.78396481, + "learning_rate": 4.223360961792952e-07, + "loss": 0.860605, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09466553, + "step": 13233, + "time_per_iteration": 2.5741093158721924 + }, + { + "auxiliary_loss_clip": 0.06403272, + "auxiliary_loss_mlp": 0.01265137, + "balance_loss_clip": 0.06270528, + "balance_loss_mlp": 0.01255803, + "epoch": 0.7956711258079062, + "flos": 22572138677760.0, + "grad_norm": 1.88878875282178, + "language_loss": 0.78960502, + "learning_rate": 4.220967594613769e-07, + "loss": 0.86628914, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09332275, + "step": 13234, + "time_per_iteration": 2.5267715454101562 + }, + { + "auxiliary_loss_clip": 0.064052, + "auxiliary_loss_mlp": 0.01262721, + "balance_loss_clip": 0.06274294, + "balance_loss_mlp": 0.01254102, + "epoch": 0.7957312490605741, + "flos": 17383882043520.0, + "grad_norm": 2.969852188387872, + "language_loss": 0.70354939, + "learning_rate": 4.218574825777077e-07, + "loss": 0.78022861, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08618164, + "step": 13235, + "time_per_iteration": 2.472926616668701 + }, + { + "auxiliary_loss_clip": 0.0640211, + "auxiliary_loss_mlp": 0.0126658, + "balance_loss_clip": 0.06269485, + "balance_loss_mlp": 0.012564, + "epoch": 0.7957913723132422, + "flos": 22497898360320.0, + "grad_norm": 3.326054048453629, + "language_loss": 0.68091619, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.75760305, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10174561, + "step": 13236, + "time_per_iteration": 2.5275604724884033 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01265164, + "balance_loss_clip": 0.06272059, + "balance_loss_mlp": 0.01256295, + "epoch": 0.7958514955659101, + "flos": 22644701913600.0, + "grad_norm": 1.5838694899419836, + "language_loss": 0.75233686, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.82900631, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08874512, + "step": 13237, + "time_per_iteration": 2.5152275562286377 + }, + { + "auxiliary_loss_clip": 0.06404451, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 0.06271912, + "balance_loss_mlp": 0.01255788, + "epoch": 0.7959116188185781, + "flos": 20710497790080.0, + "grad_norm": 1.909101485463629, + "language_loss": 0.71454495, + "learning_rate": 4.211400110229175e-07, + "loss": 0.79124558, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0982666, + "step": 13238, + "time_per_iteration": 2.5149312019348145 + }, + { + "auxiliary_loss_clip": 0.0640163, + "auxiliary_loss_mlp": 0.01263785, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01254844, + "epoch": 0.7959717420712461, + "flos": 19030474627200.0, + "grad_norm": 2.2119566924128584, + "language_loss": 0.74293685, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.81959099, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0894165, + "step": 13239, + "time_per_iteration": 2.4692234992980957 + }, + { + "auxiliary_loss_clip": 0.06405409, + "auxiliary_loss_mlp": 0.01264077, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.01254314, + "epoch": 0.796031865323914, + "flos": 26363371714560.0, + "grad_norm": 4.594953960637003, + "language_loss": 0.69371974, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.77041459, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09765625, + "step": 13240, + "time_per_iteration": 2.5826754570007324 + }, + { + "auxiliary_loss_clip": 0.06308684, + "auxiliary_loss_mlp": 0.01255726, + "balance_loss_clip": 0.06253344, + "balance_loss_mlp": 0.01254768, + "epoch": 0.796091988576582, + "flos": 62087119833600.0, + "grad_norm": 0.8806225517212096, + "language_loss": 0.5847106, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.66035473, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00956726, + "step": 13241, + "time_per_iteration": 2.9126768112182617 + }, + { + "auxiliary_loss_clip": 0.06403052, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 0.06270704, + "balance_loss_mlp": 0.01255864, + "epoch": 0.7961521118292499, + "flos": 39029442272640.0, + "grad_norm": 2.127726994888291, + "language_loss": 0.64769882, + "learning_rate": 4.201842205128772e-07, + "loss": 0.72438073, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09265137, + "step": 13242, + "time_per_iteration": 2.635535717010498 + }, + { + "auxiliary_loss_clip": 0.06402293, + "auxiliary_loss_mlp": 0.01267879, + "balance_loss_clip": 0.06268795, + "balance_loss_mlp": 0.01257795, + "epoch": 0.796212235081918, + "flos": 21769373036160.0, + "grad_norm": 2.0186777582920024, + "language_loss": 0.76239574, + "learning_rate": 4.199454226296526e-07, + "loss": 0.83909744, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10083008, + "step": 13243, + "time_per_iteration": 3.8618268966674805 + }, + { + "auxiliary_loss_clip": 0.06402823, + "auxiliary_loss_mlp": 0.01264428, + "balance_loss_clip": 0.06270328, + "balance_loss_mlp": 0.01254605, + "epoch": 0.7962723583345859, + "flos": 21185261424000.0, + "grad_norm": 1.6364985939961718, + "language_loss": 0.79507935, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8717519, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09832764, + "step": 13244, + "time_per_iteration": 2.51326322555542 + }, + { + "auxiliary_loss_clip": 0.06406613, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06270078, + "balance_loss_mlp": 0.01254308, + "epoch": 0.7963324815872539, + "flos": 17134313057280.0, + "grad_norm": 1.908775351263593, + "language_loss": 0.68666172, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.76337141, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10046387, + "step": 13245, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265244, + "balance_loss_clip": 0.06271861, + "balance_loss_mlp": 0.01255749, + "epoch": 0.7963926048399218, + "flos": 21403873526400.0, + "grad_norm": 1.7297162444203578, + "language_loss": 0.79002523, + "learning_rate": 4.192293885111549e-07, + "loss": 0.86671984, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0949707, + "step": 13246, + "time_per_iteration": 2.4906105995178223 + }, + { + "auxiliary_loss_clip": 0.06404968, + "auxiliary_loss_mlp": 0.012642, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01254073, + "epoch": 0.7964527280925898, + "flos": 25189907610240.0, + "grad_norm": 1.8120227230539676, + "language_loss": 0.66180718, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.73849887, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10125732, + "step": 13247, + "time_per_iteration": 2.534837484359741 + }, + { + "auxiliary_loss_clip": 0.0639786, + "auxiliary_loss_mlp": 0.01263181, + "balance_loss_clip": 0.06269214, + "balance_loss_mlp": 0.0125458, + "epoch": 0.7965128513452577, + "flos": 27023149163520.0, + "grad_norm": 1.7943633437832778, + "language_loss": 0.71878839, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.79539883, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08599854, + "step": 13248, + "time_per_iteration": 2.5318338871002197 + }, + { + "auxiliary_loss_clip": 0.06404188, + "auxiliary_loss_mlp": 0.01265183, + "balance_loss_clip": 0.06268889, + "balance_loss_mlp": 0.01255378, + "epoch": 0.7965729745979258, + "flos": 24425436084480.0, + "grad_norm": 2.290940910554294, + "language_loss": 0.76236963, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.83906335, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.0980835, + "step": 13249, + "time_per_iteration": 2.5285370349884033 + }, + { + "auxiliary_loss_clip": 0.06399461, + "auxiliary_loss_mlp": 0.01262002, + "balance_loss_clip": 0.06270114, + "balance_loss_mlp": 0.01252703, + "epoch": 0.7966330978505937, + "flos": 18845838155520.0, + "grad_norm": 1.9207763897520123, + "language_loss": 0.61375982, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.69037437, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09307861, + "step": 13250, + "time_per_iteration": 2.4775562286376953 + }, + { + "auxiliary_loss_clip": 0.06399567, + "auxiliary_loss_mlp": 0.01263631, + "balance_loss_clip": 0.06269053, + "balance_loss_mlp": 0.01253982, + "epoch": 0.7966932211032617, + "flos": 13157437299840.0, + "grad_norm": 2.289000304094375, + "language_loss": 0.72802746, + "learning_rate": 4.180371972938206e-07, + "loss": 0.80465943, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09655762, + "step": 13251, + "time_per_iteration": 2.5408740043640137 + }, + { + "auxiliary_loss_clip": 0.06409312, + "auxiliary_loss_mlp": 0.01265133, + "balance_loss_clip": 0.06273971, + "balance_loss_mlp": 0.01254654, + "epoch": 0.7967533443559297, + "flos": 23956290673920.0, + "grad_norm": 1.9875673178726758, + "language_loss": 0.73053861, + "learning_rate": 4.177989389787624e-07, + "loss": 0.80728304, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1048584, + "step": 13252, + "time_per_iteration": 3.9433846473693848 + }, + { + "auxiliary_loss_clip": 0.06396703, + "auxiliary_loss_mlp": 0.01266191, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01256886, + "epoch": 0.7968134676085976, + "flos": 30375984038400.0, + "grad_norm": 1.8369149171198353, + "language_loss": 0.66266763, + "learning_rate": 4.175607406609278e-07, + "loss": 0.73929667, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09307861, + "step": 13253, + "time_per_iteration": 2.5753839015960693 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01264505, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01254289, + "epoch": 0.7968735908612656, + "flos": 23081590702080.0, + "grad_norm": 1.5642785207566534, + "language_loss": 0.67620826, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.75287944, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10223389, + "step": 13254, + "time_per_iteration": 2.587885856628418 + }, + { + "auxiliary_loss_clip": 0.0640402, + "auxiliary_loss_mlp": 0.01267658, + "balance_loss_clip": 0.06271625, + "balance_loss_mlp": 0.01258467, + "epoch": 0.7969337141139335, + "flos": 23588275541760.0, + "grad_norm": 2.088422762405943, + "language_loss": 0.69607329, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.77279007, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09191895, + "step": 13255, + "time_per_iteration": 2.5366928577423096 + }, + { + "auxiliary_loss_clip": 0.06399679, + "auxiliary_loss_mlp": 0.01263773, + "balance_loss_clip": 0.0626971, + "balance_loss_mlp": 0.0125463, + "epoch": 0.7969938373666016, + "flos": 19762018698240.0, + "grad_norm": 1.6762095197917861, + "language_loss": 0.79241788, + "learning_rate": 4.168465057810733e-07, + "loss": 0.86905241, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09143066, + "step": 13256, + "time_per_iteration": 3.9199607372283936 + }, + { + "auxiliary_loss_clip": 0.06405733, + "auxiliary_loss_mlp": 0.01263678, + "balance_loss_clip": 0.06272037, + "balance_loss_mlp": 0.01254195, + "epoch": 0.7970539606192695, + "flos": 24140969072640.0, + "grad_norm": 1.817522476863435, + "language_loss": 0.66469562, + "learning_rate": 4.166085475424315e-07, + "loss": 0.74138975, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09484863, + "step": 13257, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.06411573, + "auxiliary_loss_mlp": 0.01262722, + "balance_loss_clip": 0.06272082, + "balance_loss_mlp": 0.01252977, + "epoch": 0.7971140838719375, + "flos": 17974576200960.0, + "grad_norm": 2.293552355321388, + "language_loss": 0.721138, + "learning_rate": 4.163706493461523e-07, + "loss": 0.79788101, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.09753418, + "step": 13258, + "time_per_iteration": 2.466635227203369 + }, + { + "auxiliary_loss_clip": 0.06404628, + "auxiliary_loss_mlp": 0.01268173, + "balance_loss_clip": 0.06270341, + "balance_loss_mlp": 0.01257439, + "epoch": 0.7971742071246054, + "flos": 19175181828480.0, + "grad_norm": 1.7912391212808825, + "language_loss": 0.69168359, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.76841164, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.1072998, + "step": 13259, + "time_per_iteration": 2.5077145099639893 + }, + { + "auxiliary_loss_clip": 0.06399243, + "auxiliary_loss_mlp": 0.01264467, + "balance_loss_clip": 0.06270258, + "balance_loss_mlp": 0.01255467, + "epoch": 0.7972343303772734, + "flos": 27133335682560.0, + "grad_norm": 1.8522631827723854, + "language_loss": 0.73832285, + "learning_rate": 4.158950331167641e-07, + "loss": 0.81495994, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09002686, + "step": 13260, + "time_per_iteration": 2.542802333831787 + }, + { + "auxiliary_loss_clip": 0.0640289, + "auxiliary_loss_mlp": 0.01260989, + "balance_loss_clip": 0.06273317, + "balance_loss_mlp": 0.01251559, + "epoch": 0.7972944536299413, + "flos": 21003056720640.0, + "grad_norm": 1.7849042953427723, + "language_loss": 0.78480017, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.86143899, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09423828, + "step": 13261, + "time_per_iteration": 3.9328079223632812 + }, + { + "auxiliary_loss_clip": 0.06398886, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.06273298, + "balance_loss_mlp": 0.01254673, + "epoch": 0.7973545768826094, + "flos": 21586455573120.0, + "grad_norm": 1.5738375071778383, + "language_loss": 0.76378083, + "learning_rate": 4.154196571650501e-07, + "loss": 0.84039807, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.081604, + "step": 13262, + "time_per_iteration": 2.563962936401367 + }, + { + "auxiliary_loss_clip": 0.06407683, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01254929, + "epoch": 0.7974147001352773, + "flos": 20564826266880.0, + "grad_norm": 2.3741111295907626, + "language_loss": 0.70724112, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7839784, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11126709, + "step": 13263, + "time_per_iteration": 2.4744935035705566 + }, + { + "auxiliary_loss_clip": 0.0641284, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 0.06274222, + "balance_loss_mlp": 0.01253174, + "epoch": 0.7974748233879453, + "flos": 21003224428800.0, + "grad_norm": 1.8041636283725375, + "language_loss": 0.71434695, + "learning_rate": 4.149445215631153e-07, + "loss": 0.79112011, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.11297607, + "step": 13264, + "time_per_iteration": 2.485276460647583 + }, + { + "auxiliary_loss_clip": 0.06398866, + "auxiliary_loss_mlp": 0.0126452, + "balance_loss_clip": 0.06270253, + "balance_loss_mlp": 0.01256187, + "epoch": 0.7975349466406133, + "flos": 22571803261440.0, + "grad_norm": 1.6689770527063423, + "language_loss": 0.77659208, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.85322595, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08331299, + "step": 13265, + "time_per_iteration": 2.50765061378479 + }, + { + "auxiliary_loss_clip": 0.06407373, + "auxiliary_loss_mlp": 0.01269533, + "balance_loss_clip": 0.06273501, + "balance_loss_mlp": 0.01259609, + "epoch": 0.7975950698932812, + "flos": 21696013186560.0, + "grad_norm": 1.8504698542540234, + "language_loss": 0.76059192, + "learning_rate": 4.144696263830285e-07, + "loss": 0.83736098, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0993042, + "step": 13266, + "time_per_iteration": 2.5207157135009766 + }, + { + "auxiliary_loss_clip": 0.06402943, + "auxiliary_loss_mlp": 0.01264296, + "balance_loss_clip": 0.06272074, + "balance_loss_mlp": 0.01255183, + "epoch": 0.7976551931459492, + "flos": 19609806556800.0, + "grad_norm": 1.6112289211308914, + "language_loss": 0.83747739, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.91414976, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09112549, + "step": 13267, + "time_per_iteration": 2.523797035217285 + }, + { + "auxiliary_loss_clip": 0.06403189, + "auxiliary_loss_mlp": 0.01266238, + "balance_loss_clip": 0.06272589, + "balance_loss_mlp": 0.01256725, + "epoch": 0.7977153163986171, + "flos": 21693749126400.0, + "grad_norm": 1.4537624263579578, + "language_loss": 0.76656401, + "learning_rate": 4.139949716968223e-07, + "loss": 0.84325826, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09503174, + "step": 13268, + "time_per_iteration": 2.50384783744812 + }, + { + "auxiliary_loss_clip": 0.06404118, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.06272426, + "balance_loss_mlp": 0.01256574, + "epoch": 0.7977754396512852, + "flos": 23483455683840.0, + "grad_norm": 1.5523298062662978, + "language_loss": 0.78092402, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.85762441, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09344482, + "step": 13269, + "time_per_iteration": 2.544590473175049 + }, + { + "auxiliary_loss_clip": 0.06399094, + "auxiliary_loss_mlp": 0.01261853, + "balance_loss_clip": 0.06270756, + "balance_loss_mlp": 0.01253043, + "epoch": 0.7978355629039531, + "flos": 22388718090240.0, + "grad_norm": 1.6478961708757416, + "language_loss": 0.82291299, + "learning_rate": 4.135205575764922e-07, + "loss": 0.89952242, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08813477, + "step": 13270, + "time_per_iteration": 2.4902870655059814 + }, + { + "auxiliary_loss_clip": 0.06401956, + "auxiliary_loss_mlp": 0.01264701, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.01255331, + "epoch": 0.7978956861566211, + "flos": 20272518898560.0, + "grad_norm": 2.1156464454549297, + "language_loss": 0.59938061, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.67604721, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09362793, + "step": 13271, + "time_per_iteration": 2.5591602325439453 + }, + { + "auxiliary_loss_clip": 0.06410769, + "auxiliary_loss_mlp": 0.01265645, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01256037, + "epoch": 0.797955809409289, + "flos": 28120192744320.0, + "grad_norm": 1.4386088451054988, + "language_loss": 0.73758554, + "learning_rate": 4.130463840939975e-07, + "loss": 0.81434965, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09606934, + "step": 13272, + "time_per_iteration": 2.570200204849243 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06270777, + "balance_loss_mlp": 0.012558, + "epoch": 0.798015932661957, + "flos": 15564979537920.0, + "grad_norm": 2.1482391429317067, + "language_loss": 0.71803975, + "learning_rate": 4.128093876144161e-07, + "loss": 0.79471296, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09625244, + "step": 13273, + "time_per_iteration": 2.4748198986053467 + }, + { + "auxiliary_loss_clip": 0.0640889, + "auxiliary_loss_mlp": 0.01264134, + "balance_loss_clip": 0.06274156, + "balance_loss_mlp": 0.012539, + "epoch": 0.7980760559146249, + "flos": 23957967755520.0, + "grad_norm": 1.5725586223842085, + "language_loss": 0.75832808, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.83505827, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10241699, + "step": 13274, + "time_per_iteration": 2.55397629737854 + }, + { + "auxiliary_loss_clip": 0.06394248, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06268619, + "balance_loss_mlp": 0.01253622, + "epoch": 0.798136179167293, + "flos": 28045617010560.0, + "grad_norm": 1.334626175327206, + "language_loss": 0.77871919, + "learning_rate": 4.12335575223518e-07, + "loss": 0.85528684, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08892822, + "step": 13275, + "time_per_iteration": 2.594181776046753 + }, + { + "auxiliary_loss_clip": 0.0640621, + "auxiliary_loss_mlp": 0.01265971, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01255189, + "epoch": 0.7981963024199609, + "flos": 35992157074560.0, + "grad_norm": 2.855483452086949, + "language_loss": 0.64085776, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.71757954, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10784912, + "step": 13276, + "time_per_iteration": 2.5930356979370117 + }, + { + "auxiliary_loss_clip": 0.06401938, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.06273316, + "balance_loss_mlp": 0.0125805, + "epoch": 0.7982564256726289, + "flos": 25892004170880.0, + "grad_norm": 1.5904474642505515, + "language_loss": 0.61038435, + "learning_rate": 4.118620036501945e-07, + "loss": 0.68708122, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09692383, + "step": 13277, + "time_per_iteration": 2.5839786529541016 + }, + { + "auxiliary_loss_clip": 0.06411898, + "auxiliary_loss_mlp": 0.0126538, + "balance_loss_clip": 0.06276092, + "balance_loss_mlp": 0.012561, + "epoch": 0.7983165489252969, + "flos": 25746248793600.0, + "grad_norm": 1.8327445572983765, + "language_loss": 0.79849744, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.87527025, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09283447, + "step": 13278, + "time_per_iteration": 2.5260982513427734 + }, + { + "auxiliary_loss_clip": 0.06405683, + "auxiliary_loss_mlp": 0.01263371, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01253518, + "epoch": 0.7983766721779648, + "flos": 21914667216000.0, + "grad_norm": 1.9889744564125917, + "language_loss": 0.63581717, + "learning_rate": 4.113886729662768e-07, + "loss": 0.71250772, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09844971, + "step": 13279, + "time_per_iteration": 2.5182244777679443 + }, + { + "auxiliary_loss_clip": 0.06394448, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06270408, + "balance_loss_mlp": 0.01257925, + "epoch": 0.7984367954306328, + "flos": 29354480513280.0, + "grad_norm": 1.5743045282106698, + "language_loss": 0.71176022, + "learning_rate": 4.111520979802825e-07, + "loss": 0.78836685, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08282471, + "step": 13280, + "time_per_iteration": 2.575366258621216 + }, + { + "auxiliary_loss_clip": 0.06409226, + "auxiliary_loss_mlp": 0.01266632, + "balance_loss_clip": 0.06273544, + "balance_loss_mlp": 0.01257149, + "epoch": 0.7984969186833007, + "flos": 31365775992960.0, + "grad_norm": 1.6558048262309357, + "language_loss": 0.62836027, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.70511883, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.0947876, + "step": 13281, + "time_per_iteration": 2.624361276626587 + }, + { + "auxiliary_loss_clip": 0.06407207, + "auxiliary_loss_mlp": 0.01265261, + "balance_loss_clip": 0.06269886, + "balance_loss_mlp": 0.01254807, + "epoch": 0.7985570419359688, + "flos": 24319232634240.0, + "grad_norm": 1.8833916192642874, + "language_loss": 0.79982495, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8765496, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10449219, + "step": 13282, + "time_per_iteration": 2.522733211517334 + }, + { + "auxiliary_loss_clip": 0.06405975, + "auxiliary_loss_mlp": 0.01265316, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01256339, + "epoch": 0.7986171651886367, + "flos": 15747687365760.0, + "grad_norm": 2.26715299858664, + "language_loss": 0.72620189, + "learning_rate": 4.10442734553802e-07, + "loss": 0.8029148, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.08972168, + "step": 13283, + "time_per_iteration": 3.8687400817871094 + }, + { + "auxiliary_loss_clip": 0.06398675, + "auxiliary_loss_mlp": 0.01262054, + "balance_loss_clip": 0.06269114, + "balance_loss_mlp": 0.01253072, + "epoch": 0.7986772884413047, + "flos": 11624175763200.0, + "grad_norm": 2.1421699909472474, + "language_loss": 0.73992294, + "learning_rate": 4.102064006186967e-07, + "loss": 0.81653023, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08984375, + "step": 13284, + "time_per_iteration": 2.464895486831665 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01263764, + "balance_loss_clip": 0.06270264, + "balance_loss_mlp": 0.01254883, + "epoch": 0.7987374116939726, + "flos": 22097626606080.0, + "grad_norm": 1.6639585561146113, + "language_loss": 0.70836139, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.78501368, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08874512, + "step": 13285, + "time_per_iteration": 2.5129339694976807 + }, + { + "auxiliary_loss_clip": 0.06401065, + "auxiliary_loss_mlp": 0.01262275, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.01252982, + "epoch": 0.7987975349466406, + "flos": 17895807763200.0, + "grad_norm": 1.6553012923822499, + "language_loss": 0.73934168, + "learning_rate": 4.097339136128437e-07, + "loss": 0.81597507, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09295654, + "step": 13286, + "time_per_iteration": 2.4993607997894287 + }, + { + "auxiliary_loss_clip": 0.0640146, + "auxiliary_loss_mlp": 0.01262205, + "balance_loss_clip": 0.06270432, + "balance_loss_mlp": 0.01252859, + "epoch": 0.7988576581993085, + "flos": 19725359736960.0, + "grad_norm": 1.5989615606819938, + "language_loss": 0.75195587, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.82859248, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09350586, + "step": 13287, + "time_per_iteration": 2.498539447784424 + }, + { + "auxiliary_loss_clip": 0.0640296, + "auxiliary_loss_mlp": 0.01263938, + "balance_loss_clip": 0.06271001, + "balance_loss_mlp": 0.01254598, + "epoch": 0.7989177814519766, + "flos": 28043604512640.0, + "grad_norm": 1.4032913596903045, + "language_loss": 0.62071377, + "learning_rate": 4.092616678191863e-07, + "loss": 0.69738275, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09338379, + "step": 13288, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.06401485, + "auxiliary_loss_mlp": 0.01264116, + "balance_loss_clip": 0.06273647, + "balance_loss_mlp": 0.01255122, + "epoch": 0.7989779047046445, + "flos": 28877662454400.0, + "grad_norm": 2.6038900989096705, + "language_loss": 0.70626175, + "learning_rate": 4.090256353993169e-07, + "loss": 0.78291774, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08996582, + "step": 13289, + "time_per_iteration": 2.5535638332366943 + }, + { + "auxiliary_loss_clip": 0.06396915, + "auxiliary_loss_mlp": 0.01263033, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01253771, + "epoch": 0.7990380279573125, + "flos": 18192769032960.0, + "grad_norm": 2.213156856555218, + "language_loss": 0.63382244, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.71042198, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09259033, + "step": 13290, + "time_per_iteration": 2.4844484329223633 + }, + { + "auxiliary_loss_clip": 0.06406233, + "auxiliary_loss_mlp": 0.01266627, + "balance_loss_clip": 0.06273846, + "balance_loss_mlp": 0.01256458, + "epoch": 0.7990981512099805, + "flos": 20885113699200.0, + "grad_norm": 1.8461892272796565, + "language_loss": 0.71634483, + "learning_rate": 4.08553751558248e-07, + "loss": 0.79307342, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10168457, + "step": 13291, + "time_per_iteration": 2.526987314224243 + }, + { + "auxiliary_loss_clip": 0.06397383, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06268732, + "balance_loss_mlp": 0.01255107, + "epoch": 0.7991582744626484, + "flos": 26106381642240.0, + "grad_norm": 1.5963617377533177, + "language_loss": 0.63653862, + "learning_rate": 4.083179001549422e-07, + "loss": 0.71315503, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09161377, + "step": 13292, + "time_per_iteration": 3.920006513595581 + }, + { + "auxiliary_loss_clip": 0.06398708, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06267934, + "balance_loss_mlp": 0.01254733, + "epoch": 0.7992183977153164, + "flos": 35304106072320.0, + "grad_norm": 1.797759826858067, + "language_loss": 0.56198502, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.63861531, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0958252, + "step": 13293, + "time_per_iteration": 2.625302314758301 + }, + { + "auxiliary_loss_clip": 0.06404014, + "auxiliary_loss_mlp": 0.01264714, + "balance_loss_clip": 0.06272873, + "balance_loss_mlp": 0.01255284, + "epoch": 0.7992785209679844, + "flos": 51863294632320.0, + "grad_norm": 2.2763572451506944, + "language_loss": 0.71341664, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.79010391, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09429932, + "step": 13294, + "time_per_iteration": 2.76823353767395 + }, + { + "auxiliary_loss_clip": 0.06401891, + "auxiliary_loss_mlp": 0.01262732, + "balance_loss_clip": 0.06269768, + "balance_loss_mlp": 0.01252623, + "epoch": 0.7993386442206524, + "flos": 22571719407360.0, + "grad_norm": 1.8830431252935182, + "language_loss": 0.72672385, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.80337006, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10107422, + "step": 13295, + "time_per_iteration": 3.9486594200134277 + }, + { + "auxiliary_loss_clip": 0.06399785, + "auxiliary_loss_mlp": 0.01270961, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01262367, + "epoch": 0.7993987674733203, + "flos": 18805112271360.0, + "grad_norm": 1.8035732738246322, + "language_loss": 0.76883113, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.84553862, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.0859375, + "step": 13296, + "time_per_iteration": 2.5124893188476562 + }, + { + "auxiliary_loss_clip": 0.06317963, + "auxiliary_loss_mlp": 0.01251058, + "balance_loss_clip": 0.06262526, + "balance_loss_mlp": 0.0125003, + "epoch": 0.7994588907259883, + "flos": 69443747625600.0, + "grad_norm": 0.6778750345647286, + "language_loss": 0.60765332, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.68334353, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13297, + "time_per_iteration": 3.258441209793091 + }, + { + "auxiliary_loss_clip": 0.06401801, + "auxiliary_loss_mlp": 0.01265804, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01256398, + "epoch": 0.7995190139786562, + "flos": 13485439307520.0, + "grad_norm": 2.2443800001049645, + "language_loss": 0.70575351, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.78242958, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09399414, + "step": 13298, + "time_per_iteration": 2.4816195964813232 + }, + { + "auxiliary_loss_clip": 0.06406148, + "auxiliary_loss_mlp": 0.01264059, + "balance_loss_clip": 0.06270477, + "balance_loss_mlp": 0.01253563, + "epoch": 0.7995791372313242, + "flos": 21659270371200.0, + "grad_norm": 1.914137701086928, + "language_loss": 0.76235688, + "learning_rate": 4.066686308212037e-07, + "loss": 0.839059, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10498047, + "step": 13299, + "time_per_iteration": 2.491387128829956 + }, + { + "auxiliary_loss_clip": 0.06396549, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06268974, + "balance_loss_mlp": 0.01256779, + "epoch": 0.7996392604839921, + "flos": 26075382831360.0, + "grad_norm": 1.6376768390824803, + "language_loss": 0.77644742, + "learning_rate": 4.064332625220828e-07, + "loss": 0.85306615, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08544922, + "step": 13300, + "time_per_iteration": 3.941457986831665 + }, + { + "auxiliary_loss_clip": 0.06406416, + "auxiliary_loss_mlp": 0.01264711, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01255473, + "epoch": 0.7996993837366602, + "flos": 24613594427520.0, + "grad_norm": 1.7813390500304356, + "language_loss": 0.64086711, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.71757841, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09228516, + "step": 13301, + "time_per_iteration": 2.5052661895751953 + }, + { + "auxiliary_loss_clip": 0.06398593, + "auxiliary_loss_mlp": 0.01264195, + "balance_loss_clip": 0.06270251, + "balance_loss_mlp": 0.01255285, + "epoch": 0.7997595069893281, + "flos": 20997690059520.0, + "grad_norm": 1.5469395807720157, + "language_loss": 0.71982718, + "learning_rate": 4.059627072173928e-07, + "loss": 0.79645514, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08911133, + "step": 13302, + "time_per_iteration": 2.489457368850708 + }, + { + "auxiliary_loss_clip": 0.06408885, + "auxiliary_loss_mlp": 0.01265444, + "balance_loss_clip": 0.0627289, + "balance_loss_mlp": 0.01255967, + "epoch": 0.7998196302419961, + "flos": 24433528003200.0, + "grad_norm": 1.7910708704236549, + "language_loss": 0.83398485, + "learning_rate": 4.057275202296684e-07, + "loss": 0.91072816, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09484863, + "step": 13303, + "time_per_iteration": 2.5182011127471924 + }, + { + "auxiliary_loss_clip": 0.06399085, + "auxiliary_loss_mlp": 0.01263644, + "balance_loss_clip": 0.06271808, + "balance_loss_mlp": 0.01254429, + "epoch": 0.7998797534946641, + "flos": 30272715480960.0, + "grad_norm": 1.579021550921295, + "language_loss": 0.58929861, + "learning_rate": 4.054923936969166e-07, + "loss": 0.66592586, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09210205, + "step": 13304, + "time_per_iteration": 2.584608316421509 + }, + { + "auxiliary_loss_clip": 0.06406042, + "auxiliary_loss_mlp": 0.01261222, + "balance_loss_clip": 0.06271531, + "balance_loss_mlp": 0.0125202, + "epoch": 0.799939876747332, + "flos": 23520785477760.0, + "grad_norm": 1.5411018505136698, + "language_loss": 0.68989539, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.76656806, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09210205, + "step": 13305, + "time_per_iteration": 2.495842218399048 + }, + { + "auxiliary_loss_clip": 0.06398628, + "auxiliary_loss_mlp": 0.0126253, + "balance_loss_clip": 0.06269637, + "balance_loss_mlp": 0.01254028, + "epoch": 0.8, + "flos": 19324207514880.0, + "grad_norm": 1.5483879862096703, + "language_loss": 0.6919629, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.76857448, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08493042, + "step": 13306, + "time_per_iteration": 2.4815428256988525 + }, + { + "auxiliary_loss_clip": 0.06404909, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06271838, + "balance_loss_mlp": 0.01254813, + "epoch": 0.800060123252668, + "flos": 32420039264640.0, + "grad_norm": 1.3465720910639238, + "language_loss": 0.69548619, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.77217495, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09155273, + "step": 13307, + "time_per_iteration": 2.5902602672576904 + }, + { + "auxiliary_loss_clip": 0.06402986, + "auxiliary_loss_mlp": 0.01264461, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01255151, + "epoch": 0.800120246505336, + "flos": 20016702783360.0, + "grad_norm": 1.932839582685843, + "language_loss": 0.77209872, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.84877324, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09313965, + "step": 13308, + "time_per_iteration": 2.5227887630462646 + }, + { + "auxiliary_loss_clip": 0.06406727, + "auxiliary_loss_mlp": 0.01264112, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.0125395, + "epoch": 0.8001803697580039, + "flos": 31876318120320.0, + "grad_norm": 1.398024400765408, + "language_loss": 0.78861815, + "learning_rate": 4.0431766816972e-07, + "loss": 0.86532652, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10174561, + "step": 13309, + "time_per_iteration": 2.694766044616699 + }, + { + "auxiliary_loss_clip": 0.06317627, + "auxiliary_loss_mlp": 0.01253959, + "balance_loss_clip": 0.06261955, + "balance_loss_mlp": 0.01252847, + "epoch": 0.8002404930106719, + "flos": 63411496341120.0, + "grad_norm": 0.9515368521242993, + "language_loss": 0.64834917, + "learning_rate": 4.040829045539571e-07, + "loss": 0.72406501, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.01114655, + "step": 13310, + "time_per_iteration": 3.0877020359039307 + }, + { + "auxiliary_loss_clip": 0.06409021, + "auxiliary_loss_mlp": 0.01267758, + "balance_loss_clip": 0.06276361, + "balance_loss_mlp": 0.01258257, + "epoch": 0.8003006162633398, + "flos": 27862951109760.0, + "grad_norm": 1.8032558576679762, + "language_loss": 0.83180302, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.90857077, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.0949707, + "step": 13311, + "time_per_iteration": 2.555682897567749 + }, + { + "auxiliary_loss_clip": 0.06402326, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06271294, + "balance_loss_mlp": 0.01255944, + "epoch": 0.8003607395160078, + "flos": 18229218359040.0, + "grad_norm": 1.9156158973382509, + "language_loss": 0.6619851, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.73865891, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09118652, + "step": 13312, + "time_per_iteration": 2.4853975772857666 + }, + { + "auxiliary_loss_clip": 0.06408212, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 0.06272315, + "balance_loss_mlp": 0.01253805, + "epoch": 0.8004208627686757, + "flos": 20893331399040.0, + "grad_norm": 1.7788171673051, + "language_loss": 0.75784224, + "learning_rate": 4.033789768462843e-07, + "loss": 0.83456314, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10070801, + "step": 13313, + "time_per_iteration": 2.5811471939086914 + }, + { + "auxiliary_loss_clip": 0.0640287, + "auxiliary_loss_mlp": 0.01266155, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8004809860213438, + "flos": 26443984942080.0, + "grad_norm": 1.3059892404938946, + "language_loss": 0.75943661, + "learning_rate": 4.031444553532575e-07, + "loss": 0.83612692, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09490967, + "step": 13314, + "time_per_iteration": 2.5711114406585693 + }, + { + "auxiliary_loss_clip": 0.06314123, + "auxiliary_loss_mlp": 0.01251747, + "balance_loss_clip": 0.06258671, + "balance_loss_mlp": 0.0125083, + "epoch": 0.8005411092740117, + "flos": 63668276778240.0, + "grad_norm": 0.7688266609144837, + "language_loss": 0.53789216, + "learning_rate": 4.029099944131522e-07, + "loss": 0.61355084, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00914764, + "step": 13315, + "time_per_iteration": 3.0470640659332275 + }, + { + "auxiliary_loss_clip": 0.06399442, + "auxiliary_loss_mlp": 0.01266642, + "balance_loss_clip": 0.0626928, + "balance_loss_mlp": 0.0125707, + "epoch": 0.8006012325266797, + "flos": 36146968692480.0, + "grad_norm": 1.5921677145384265, + "language_loss": 0.71092463, + "learning_rate": 4.026755940348603e-07, + "loss": 0.78758544, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09576416, + "step": 13316, + "time_per_iteration": 2.688965320587158 + }, + { + "auxiliary_loss_clip": 0.06405424, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01256755, + "epoch": 0.8006613557793477, + "flos": 33847390840320.0, + "grad_norm": 1.7083449929688843, + "language_loss": 0.65030324, + "learning_rate": 4.024412542272706e-07, + "loss": 0.72701693, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09185791, + "step": 13317, + "time_per_iteration": 2.6344261169433594 + }, + { + "auxiliary_loss_clip": 0.06308497, + "auxiliary_loss_mlp": 0.01250396, + "balance_loss_clip": 0.06252623, + "balance_loss_mlp": 0.01249407, + "epoch": 0.8007214790320156, + "flos": 67371041502720.0, + "grad_norm": 0.7463075809766724, + "language_loss": 0.58964193, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.66523087, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.00988007, + "step": 13318, + "time_per_iteration": 3.211217164993286 + }, + { + "auxiliary_loss_clip": 0.06398984, + "auxiliary_loss_mlp": 0.01261211, + "balance_loss_clip": 0.06267591, + "balance_loss_mlp": 0.01252532, + "epoch": 0.8007816022846836, + "flos": 23192406126720.0, + "grad_norm": 3.1434956654413484, + "language_loss": 0.66706848, + "learning_rate": 4.019727563597366e-07, + "loss": 0.74367046, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08673096, + "step": 13319, + "time_per_iteration": 2.5540733337402344 + }, + { + "auxiliary_loss_clip": 0.06403084, + "auxiliary_loss_mlp": 0.01265724, + "balance_loss_clip": 0.06268618, + "balance_loss_mlp": 0.01255699, + "epoch": 0.8008417255373516, + "flos": 21987901284480.0, + "grad_norm": 1.728669041883902, + "language_loss": 0.73937488, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.81606293, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10028076, + "step": 13320, + "time_per_iteration": 2.582298994064331 + }, + { + "auxiliary_loss_clip": 0.06404214, + "auxiliary_loss_mlp": 0.01263523, + "balance_loss_clip": 0.06271582, + "balance_loss_mlp": 0.01253575, + "epoch": 0.8009018487900196, + "flos": 16732951200000.0, + "grad_norm": 2.01191871556705, + "language_loss": 0.8012563, + "learning_rate": 4.015045008816138e-07, + "loss": 0.87793362, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09954834, + "step": 13321, + "time_per_iteration": 2.4715728759765625 + }, + { + "auxiliary_loss_clip": 0.06396499, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.06268975, + "balance_loss_mlp": 0.01253536, + "epoch": 0.8009619720426875, + "flos": 20819887695360.0, + "grad_norm": 1.7373613026127328, + "language_loss": 0.65706664, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.73365676, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08978271, + "step": 13322, + "time_per_iteration": 3.9246838092803955 + }, + { + "auxiliary_loss_clip": 0.0639898, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06267587, + "balance_loss_mlp": 0.01254206, + "epoch": 0.8010220952953555, + "flos": 17936869063680.0, + "grad_norm": 1.6818709041886202, + "language_loss": 0.78149015, + "learning_rate": 4.010364878639265e-07, + "loss": 0.85811198, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09002686, + "step": 13323, + "time_per_iteration": 2.4993720054626465 + }, + { + "auxiliary_loss_clip": 0.06405957, + "auxiliary_loss_mlp": 0.01261855, + "balance_loss_clip": 0.06270777, + "balance_loss_mlp": 0.01251872, + "epoch": 0.8010822185480234, + "flos": 24579241453440.0, + "grad_norm": 2.3981073460441187, + "language_loss": 0.71711612, + "learning_rate": 4.00802572299932e-07, + "loss": 0.79379427, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09979248, + "step": 13324, + "time_per_iteration": 2.6039645671844482 + }, + { + "auxiliary_loss_clip": 0.06404987, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01252456, + "epoch": 0.8011423418006914, + "flos": 21835563361920.0, + "grad_norm": 1.6339854847519542, + "language_loss": 0.76400465, + "learning_rate": 4.005687173776635e-07, + "loss": 0.84067976, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10070801, + "step": 13325, + "time_per_iteration": 2.5225205421447754 + }, + { + "auxiliary_loss_clip": 0.06393359, + "auxiliary_loss_mlp": 0.01264051, + "balance_loss_clip": 0.06268814, + "balance_loss_mlp": 0.01256022, + "epoch": 0.8012024650533593, + "flos": 23922021553920.0, + "grad_norm": 1.571695790316147, + "language_loss": 0.80098516, + "learning_rate": 4.003349231059898e-07, + "loss": 0.87755924, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.08026123, + "step": 13326, + "time_per_iteration": 2.5184433460235596 + }, + { + "auxiliary_loss_clip": 0.06396009, + "auxiliary_loss_mlp": 0.01263378, + "balance_loss_clip": 0.06269439, + "balance_loss_mlp": 0.01254921, + "epoch": 0.8012625883060274, + "flos": 23593893765120.0, + "grad_norm": 2.1709213640524156, + "language_loss": 0.66244531, + "learning_rate": 4.001011894937765e-07, + "loss": 0.73903918, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08453369, + "step": 13327, + "time_per_iteration": 2.5192511081695557 + }, + { + "auxiliary_loss_clip": 0.06397668, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06270249, + "balance_loss_mlp": 0.0125497, + "epoch": 0.8013227115586953, + "flos": 20820265038720.0, + "grad_norm": 1.5237011846909325, + "language_loss": 0.73911273, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.81572825, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08911133, + "step": 13328, + "time_per_iteration": 2.490879535675049 + }, + { + "auxiliary_loss_clip": 0.0640716, + "auxiliary_loss_mlp": 0.01265301, + "balance_loss_clip": 0.06271626, + "balance_loss_mlp": 0.0125493, + "epoch": 0.8013828348113633, + "flos": 15893820086400.0, + "grad_norm": 2.1070162273043938, + "language_loss": 0.74215919, + "learning_rate": 3.996339042831798e-07, + "loss": 0.81888378, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10369873, + "step": 13329, + "time_per_iteration": 2.478027105331421 + }, + { + "auxiliary_loss_clip": 0.06312898, + "auxiliary_loss_mlp": 0.01251725, + "balance_loss_clip": 0.06257395, + "balance_loss_mlp": 0.01250756, + "epoch": 0.8014429580640313, + "flos": 71085183183360.0, + "grad_norm": 0.6797565507978373, + "language_loss": 0.52515209, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.60079831, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00967407, + "step": 13330, + "time_per_iteration": 3.21134614944458 + }, + { + "auxiliary_loss_clip": 0.0640648, + "auxiliary_loss_mlp": 0.01263996, + "balance_loss_clip": 0.06270502, + "balance_loss_mlp": 0.01253518, + "epoch": 0.8015030813166992, + "flos": 23083100075520.0, + "grad_norm": 1.654890173556639, + "language_loss": 0.7351566, + "learning_rate": 3.991668618167519e-07, + "loss": 0.8118614, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10479736, + "step": 13331, + "time_per_iteration": 3.970208168029785 + }, + { + "auxiliary_loss_clip": 0.06399897, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01254037, + "epoch": 0.8015632045693672, + "flos": 21878888722560.0, + "grad_norm": 1.8984062723918875, + "language_loss": 0.77560246, + "learning_rate": 3.989334316347401e-07, + "loss": 0.8522284, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08666992, + "step": 13332, + "time_per_iteration": 2.5455820560455322 + }, + { + "auxiliary_loss_clip": 0.0640306, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06269315, + "balance_loss_mlp": 0.01256402, + "epoch": 0.8016233278220352, + "flos": 23663018983680.0, + "grad_norm": 1.6654900113929851, + "language_loss": 0.83571923, + "learning_rate": 3.987000621653338e-07, + "loss": 0.91241622, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10217285, + "step": 13333, + "time_per_iteration": 2.510481595993042 + }, + { + "auxiliary_loss_clip": 0.06403299, + "auxiliary_loss_mlp": 0.01262076, + "balance_loss_clip": 0.06270902, + "balance_loss_mlp": 0.01252724, + "epoch": 0.8016834510747032, + "flos": 16258732617600.0, + "grad_norm": 1.578647328304289, + "language_loss": 0.73791355, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.81456727, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09350586, + "step": 13334, + "time_per_iteration": 2.5875518321990967 + }, + { + "auxiliary_loss_clip": 0.06397326, + "auxiliary_loss_mlp": 0.01262334, + "balance_loss_clip": 0.06269726, + "balance_loss_mlp": 0.01253292, + "epoch": 0.8017435743273711, + "flos": 12280892538240.0, + "grad_norm": 1.8344549459968347, + "language_loss": 0.74896538, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.82556194, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09042358, + "step": 13335, + "time_per_iteration": 4.024559259414673 + }, + { + "auxiliary_loss_clip": 0.06397076, + "auxiliary_loss_mlp": 0.01263938, + "balance_loss_clip": 0.06266247, + "balance_loss_mlp": 0.01254044, + "epoch": 0.8018036975800391, + "flos": 17200880726400.0, + "grad_norm": 1.7648515567643608, + "language_loss": 0.75561655, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.83222669, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09881592, + "step": 13336, + "time_per_iteration": 2.4966955184936523 + }, + { + "auxiliary_loss_clip": 0.0640955, + "auxiliary_loss_mlp": 0.01264608, + "balance_loss_clip": 0.06270093, + "balance_loss_mlp": 0.01253736, + "epoch": 0.801863820832707, + "flos": 20638228043520.0, + "grad_norm": 1.8494004813437324, + "language_loss": 0.74727678, + "learning_rate": 3.977671915907068e-07, + "loss": 0.82401836, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10870361, + "step": 13337, + "time_per_iteration": 2.493006944656372 + }, + { + "auxiliary_loss_clip": 0.06406038, + "auxiliary_loss_mlp": 0.01263988, + "balance_loss_clip": 0.06269336, + "balance_loss_mlp": 0.01253962, + "epoch": 0.801923944085375, + "flos": 30453410810880.0, + "grad_norm": 1.5897406325584222, + "language_loss": 0.8002277, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.87692797, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1003418, + "step": 13338, + "time_per_iteration": 2.5765812397003174 + }, + { + "auxiliary_loss_clip": 0.06405494, + "auxiliary_loss_mlp": 0.01265523, + "balance_loss_clip": 0.06270125, + "balance_loss_mlp": 0.01254317, + "epoch": 0.801984067338043, + "flos": 20016660856320.0, + "grad_norm": 1.9676799431141796, + "language_loss": 0.74850368, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.82521391, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.11218262, + "step": 13339, + "time_per_iteration": 3.9401278495788574 + }, + { + "auxiliary_loss_clip": 0.06401505, + "auxiliary_loss_mlp": 0.01264432, + "balance_loss_clip": 0.06271123, + "balance_loss_mlp": 0.01255253, + "epoch": 0.802044190590711, + "flos": 22790666926080.0, + "grad_norm": 1.5626805992517288, + "language_loss": 0.7945329, + "learning_rate": 3.970681765754775e-07, + "loss": 0.87119228, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09179688, + "step": 13340, + "time_per_iteration": 2.5232396125793457 + }, + { + "auxiliary_loss_clip": 0.06404866, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06272231, + "balance_loss_mlp": 0.01254831, + "epoch": 0.8021043138433789, + "flos": 27607554264960.0, + "grad_norm": 1.7600307740007948, + "language_loss": 0.68075955, + "learning_rate": 3.968352931252936e-07, + "loss": 0.75744605, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08953857, + "step": 13341, + "time_per_iteration": 2.5519580841064453 + }, + { + "auxiliary_loss_clip": 0.06309702, + "auxiliary_loss_mlp": 0.01251381, + "balance_loss_clip": 0.06254174, + "balance_loss_mlp": 0.01250354, + "epoch": 0.8021644370960469, + "flos": 62080453434240.0, + "grad_norm": 0.7935303767570981, + "language_loss": 0.61211252, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.68772334, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13342, + "time_per_iteration": 3.0668532848358154 + }, + { + "auxiliary_loss_clip": 0.06403046, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.06271387, + "balance_loss_mlp": 0.01253788, + "epoch": 0.8022245603487148, + "flos": 23367525160320.0, + "grad_norm": 1.685983088220024, + "language_loss": 0.63982582, + "learning_rate": 3.963697086102522e-07, + "loss": 0.71649212, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09802246, + "step": 13343, + "time_per_iteration": 2.52908992767334 + }, + { + "auxiliary_loss_clip": 0.06393635, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06267881, + "balance_loss_mlp": 0.01254142, + "epoch": 0.8022846836013828, + "flos": 10858027155840.0, + "grad_norm": 1.7400180605672049, + "language_loss": 0.6898669, + "learning_rate": 3.96137007563051e-07, + "loss": 0.76643062, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.0859375, + "step": 13344, + "time_per_iteration": 2.467531204223633 + }, + { + "auxiliary_loss_clip": 0.06399775, + "auxiliary_loss_mlp": 0.0126374, + "balance_loss_clip": 0.06268416, + "balance_loss_mlp": 0.01254538, + "epoch": 0.8023448068540509, + "flos": 29247899719680.0, + "grad_norm": 1.4831700839828168, + "language_loss": 0.70263791, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.77927303, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09191895, + "step": 13345, + "time_per_iteration": 2.5930464267730713 + }, + { + "auxiliary_loss_clip": 0.06311318, + "auxiliary_loss_mlp": 0.01250528, + "balance_loss_clip": 0.06255944, + "balance_loss_mlp": 0.01249584, + "epoch": 0.8024049301067188, + "flos": 64172362141440.0, + "grad_norm": 0.847535442910353, + "language_loss": 0.62905973, + "learning_rate": 3.956717879334059e-07, + "loss": 0.70467818, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0094223, + "step": 13346, + "time_per_iteration": 3.2076127529144287 + }, + { + "auxiliary_loss_clip": 0.06396353, + "auxiliary_loss_mlp": 0.01263037, + "balance_loss_clip": 0.06268937, + "balance_loss_mlp": 0.01253715, + "epoch": 0.8024650533593868, + "flos": 28592985807360.0, + "grad_norm": 3.633465076952704, + "language_loss": 0.72895849, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.80555242, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09326172, + "step": 13347, + "time_per_iteration": 2.5710387229919434 + }, + { + "auxiliary_loss_clip": 0.06403917, + "auxiliary_loss_mlp": 0.01266411, + "balance_loss_clip": 0.06269814, + "balance_loss_mlp": 0.01256577, + "epoch": 0.8025251766120547, + "flos": 16987844920320.0, + "grad_norm": 2.5900803344062115, + "language_loss": 0.73302913, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.80973244, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09838867, + "step": 13348, + "time_per_iteration": 2.4676120281219482 + }, + { + "auxiliary_loss_clip": 0.06403141, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 0.06271264, + "balance_loss_mlp": 0.01253336, + "epoch": 0.8025852998647227, + "flos": 22170189841920.0, + "grad_norm": 1.6273039125060904, + "language_loss": 0.7625345, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.83919346, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09423828, + "step": 13349, + "time_per_iteration": 2.5136961936950684 + }, + { + "auxiliary_loss_clip": 0.06400917, + "auxiliary_loss_mlp": 0.01264363, + "balance_loss_clip": 0.06269996, + "balance_loss_mlp": 0.01255076, + "epoch": 0.8026454231173906, + "flos": 22023386288640.0, + "grad_norm": 2.7562634008625846, + "language_loss": 0.83666581, + "learning_rate": 3.947420787800755e-07, + "loss": 0.91331869, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09283447, + "step": 13350, + "time_per_iteration": 2.519904851913452 + }, + { + "auxiliary_loss_clip": 0.06399673, + "auxiliary_loss_mlp": 0.01265698, + "balance_loss_clip": 0.0626874, + "balance_loss_mlp": 0.0125665, + "epoch": 0.8027055463700586, + "flos": 22497772579200.0, + "grad_norm": 1.5771958395635441, + "language_loss": 0.71500349, + "learning_rate": 3.945098036485679e-07, + "loss": 0.79165721, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0904541, + "step": 13351, + "time_per_iteration": 2.536276340484619 + }, + { + "auxiliary_loss_clip": 0.06399149, + "auxiliary_loss_mlp": 0.01267076, + "balance_loss_clip": 0.0626966, + "balance_loss_mlp": 0.01257921, + "epoch": 0.8027656696227266, + "flos": 28920442763520.0, + "grad_norm": 1.6393100884614646, + "language_loss": 0.62040806, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.6970703, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09161377, + "step": 13352, + "time_per_iteration": 2.572496175765991 + }, + { + "auxiliary_loss_clip": 0.06404066, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06273441, + "balance_loss_mlp": 0.01255078, + "epoch": 0.8028257928753946, + "flos": 18595514482560.0, + "grad_norm": 1.84085315360638, + "language_loss": 0.77318871, + "learning_rate": 3.940454360354046e-07, + "loss": 0.84987807, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09796143, + "step": 13353, + "time_per_iteration": 2.591125726699829 + }, + { + "auxiliary_loss_clip": 0.0641156, + "auxiliary_loss_mlp": 0.01271346, + "balance_loss_clip": 0.06270623, + "balance_loss_mlp": 0.01260713, + "epoch": 0.8028859161280625, + "flos": 19135126776960.0, + "grad_norm": 2.1440519982160726, + "language_loss": 0.73642856, + "learning_rate": 3.938133435713582e-07, + "loss": 0.81325769, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.10632324, + "step": 13354, + "time_per_iteration": 2.4713294506073 + }, + { + "auxiliary_loss_clip": 0.06405748, + "auxiliary_loss_mlp": 0.01261139, + "balance_loss_clip": 0.06271609, + "balance_loss_mlp": 0.01251835, + "epoch": 0.8029460393807305, + "flos": 20236069572480.0, + "grad_norm": 2.691632863229345, + "language_loss": 0.65962112, + "learning_rate": 3.935813120140714e-07, + "loss": 0.73628998, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09295654, + "step": 13355, + "time_per_iteration": 2.487391710281372 + }, + { + "auxiliary_loss_clip": 0.06404544, + "auxiliary_loss_mlp": 0.01265286, + "balance_loss_clip": 0.06268579, + "balance_loss_mlp": 0.01254724, + "epoch": 0.8030061626333984, + "flos": 49794445797120.0, + "grad_norm": 2.169594763741831, + "language_loss": 0.69115853, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.7678569, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10565186, + "step": 13356, + "time_per_iteration": 2.771540403366089 + }, + { + "auxiliary_loss_clip": 0.06398716, + "auxiliary_loss_mlp": 0.01262043, + "balance_loss_clip": 0.06268562, + "balance_loss_mlp": 0.01253555, + "epoch": 0.8030662858860664, + "flos": 21621479379840.0, + "grad_norm": 1.8816626292041285, + "language_loss": 0.7745564, + "learning_rate": 3.931174316549666e-07, + "loss": 0.85116398, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08483887, + "step": 13357, + "time_per_iteration": 2.4969570636749268 + }, + { + "auxiliary_loss_clip": 0.0640809, + "auxiliary_loss_mlp": 0.01263369, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01253219, + "epoch": 0.8031264091387345, + "flos": 25637194304640.0, + "grad_norm": 1.5133182895220076, + "language_loss": 0.77548575, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.85220027, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10150146, + "step": 13358, + "time_per_iteration": 2.623896837234497 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01261602, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01252408, + "epoch": 0.8031865323914024, + "flos": 19652335303680.0, + "grad_norm": 1.5054224659704207, + "language_loss": 0.84991813, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.92652988, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09194946, + "step": 13359, + "time_per_iteration": 2.5500707626342773 + }, + { + "auxiliary_loss_clip": 0.06401375, + "auxiliary_loss_mlp": 0.01262567, + "balance_loss_clip": 0.0627083, + "balance_loss_mlp": 0.01253435, + "epoch": 0.8032466556440704, + "flos": 26174961809280.0, + "grad_norm": 1.8378585000154632, + "language_loss": 0.7306003, + "learning_rate": 3.924220681368928e-07, + "loss": 0.80723965, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09136963, + "step": 13360, + "time_per_iteration": 2.548150062561035 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01264804, + "balance_loss_clip": 0.06269519, + "balance_loss_mlp": 0.01255423, + "epoch": 0.8033067788967383, + "flos": 25527049712640.0, + "grad_norm": 2.141449143899577, + "language_loss": 0.69812787, + "learning_rate": 3.921904022048512e-07, + "loss": 0.7748009, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09387207, + "step": 13361, + "time_per_iteration": 2.5122880935668945 + }, + { + "auxiliary_loss_clip": 0.06408579, + "auxiliary_loss_mlp": 0.01263892, + "balance_loss_clip": 0.06272987, + "balance_loss_mlp": 0.01253861, + "epoch": 0.8033669021494063, + "flos": 24031076042880.0, + "grad_norm": 1.5411892792753266, + "language_loss": 0.70487249, + "learning_rate": 3.919587972411098e-07, + "loss": 0.7815972, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1003418, + "step": 13362, + "time_per_iteration": 3.9490444660186768 + }, + { + "auxiliary_loss_clip": 0.06412524, + "auxiliary_loss_mlp": 0.01268791, + "balance_loss_clip": 0.06271197, + "balance_loss_mlp": 0.01257299, + "epoch": 0.8034270254020742, + "flos": 13592900568960.0, + "grad_norm": 2.526180707519333, + "language_loss": 0.78481448, + "learning_rate": 3.91727253254452e-07, + "loss": 0.8616277, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.11505127, + "step": 13363, + "time_per_iteration": 2.4621450901031494 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01266513, + "balance_loss_clip": 0.06268764, + "balance_loss_mlp": 0.01256619, + "epoch": 0.8034871486547422, + "flos": 27419228213760.0, + "grad_norm": 2.002665668472871, + "language_loss": 0.7498951, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.82659847, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09893799, + "step": 13364, + "time_per_iteration": 2.5504682064056396 + }, + { + "auxiliary_loss_clip": 0.0640076, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06270374, + "balance_loss_mlp": 0.01255673, + "epoch": 0.8035472719074102, + "flos": 32606855942400.0, + "grad_norm": 1.9519754952718025, + "language_loss": 0.61201763, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.68867314, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09112549, + "step": 13365, + "time_per_iteration": 2.637441396713257 + }, + { + "auxiliary_loss_clip": 0.06407268, + "auxiliary_loss_mlp": 0.01265341, + "balance_loss_clip": 0.06271231, + "balance_loss_mlp": 0.01255304, + "epoch": 0.8036073951600782, + "flos": 21294357840000.0, + "grad_norm": 1.6745258568385837, + "language_loss": 0.6602062, + "learning_rate": 3.910329872447706e-07, + "loss": 0.73693228, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10040283, + "step": 13366, + "time_per_iteration": 2.5265872478485107 + }, + { + "auxiliary_loss_clip": 0.06398745, + "auxiliary_loss_mlp": 0.01261552, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01252308, + "epoch": 0.8036675184127461, + "flos": 18119702672640.0, + "grad_norm": 2.0189500018467146, + "language_loss": 0.75098139, + "learning_rate": 3.908016872542259e-07, + "loss": 0.82758439, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09234619, + "step": 13367, + "time_per_iteration": 2.507988214492798 + }, + { + "auxiliary_loss_clip": 0.06403186, + "auxiliary_loss_mlp": 0.01263311, + "balance_loss_clip": 0.06272097, + "balance_loss_mlp": 0.01254024, + "epoch": 0.8037276416654141, + "flos": 26037298350720.0, + "grad_norm": 1.466952171960805, + "language_loss": 0.74368006, + "learning_rate": 3.905704482846428e-07, + "loss": 0.82034504, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09283447, + "step": 13368, + "time_per_iteration": 2.5691888332366943 + }, + { + "auxiliary_loss_clip": 0.0640569, + "auxiliary_loss_mlp": 0.0126344, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01253671, + "epoch": 0.803787764918082, + "flos": 18807334404480.0, + "grad_norm": 1.851125330609221, + "language_loss": 0.69820118, + "learning_rate": 3.90339270344789e-07, + "loss": 0.77489251, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09771729, + "step": 13369, + "time_per_iteration": 2.5154571533203125 + }, + { + "auxiliary_loss_clip": 0.06399469, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06269622, + "balance_loss_mlp": 0.01253808, + "epoch": 0.80384788817075, + "flos": 20231289889920.0, + "grad_norm": 1.5121727430472034, + "language_loss": 0.73977184, + "learning_rate": 3.901081534434312e-07, + "loss": 0.81639266, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08807373, + "step": 13370, + "time_per_iteration": 2.501655101776123 + }, + { + "auxiliary_loss_clip": 0.06407988, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.012551, + "epoch": 0.8039080114234181, + "flos": 18521232238080.0, + "grad_norm": 2.479350396293282, + "language_loss": 0.87167275, + "learning_rate": 3.898770975893342e-07, + "loss": 0.94839901, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09539795, + "step": 13371, + "time_per_iteration": 3.886564016342163 + }, + { + "auxiliary_loss_clip": 0.06406743, + "auxiliary_loss_mlp": 0.01265329, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01254815, + "epoch": 0.803968134676086, + "flos": 22389053506560.0, + "grad_norm": 1.8483310810057103, + "language_loss": 0.74931836, + "learning_rate": 3.89646102791259e-07, + "loss": 0.82603908, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10522461, + "step": 13372, + "time_per_iteration": 2.505094289779663 + }, + { + "auxiliary_loss_clip": 0.06399661, + "auxiliary_loss_mlp": 0.01264572, + "balance_loss_clip": 0.06268448, + "balance_loss_mlp": 0.01254707, + "epoch": 0.804028257928754, + "flos": 23849458318080.0, + "grad_norm": 2.2445203393539965, + "language_loss": 0.79285675, + "learning_rate": 3.894151690579646e-07, + "loss": 0.86949909, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09863281, + "step": 13373, + "time_per_iteration": 2.537801742553711 + }, + { + "auxiliary_loss_clip": 0.06399599, + "auxiliary_loss_mlp": 0.01263438, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254897, + "epoch": 0.8040883811814219, + "flos": 23557570220160.0, + "grad_norm": 1.4911107147206584, + "language_loss": 0.74763751, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.82426786, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08532715, + "step": 13374, + "time_per_iteration": 3.975172281265259 + }, + { + "auxiliary_loss_clip": 0.06405147, + "auxiliary_loss_mlp": 0.01264438, + "balance_loss_clip": 0.06269235, + "balance_loss_mlp": 0.01254198, + "epoch": 0.8041485044340899, + "flos": 19032319416960.0, + "grad_norm": 2.1627910258731546, + "language_loss": 0.69120371, + "learning_rate": 3.889534848207452e-07, + "loss": 0.76789951, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10247803, + "step": 13375, + "time_per_iteration": 2.5215139389038086 + }, + { + "auxiliary_loss_clip": 0.06307931, + "auxiliary_loss_mlp": 0.0125401, + "balance_loss_clip": 0.06252438, + "balance_loss_mlp": 0.01252982, + "epoch": 0.8042086276867578, + "flos": 70027817310720.0, + "grad_norm": 0.7167965805045454, + "language_loss": 0.55595809, + "learning_rate": 3.887227343343271e-07, + "loss": 0.63157749, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01027679, + "step": 13376, + "time_per_iteration": 3.172804355621338 + }, + { + "auxiliary_loss_clip": 0.06404025, + "auxiliary_loss_mlp": 0.01267218, + "balance_loss_clip": 0.06268938, + "balance_loss_mlp": 0.01257681, + "epoch": 0.8042687509394258, + "flos": 21879014503680.0, + "grad_norm": 1.674981149404826, + "language_loss": 0.73782766, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.81454003, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09539795, + "step": 13377, + "time_per_iteration": 2.503901243209839 + }, + { + "auxiliary_loss_clip": 0.0640146, + "auxiliary_loss_mlp": 0.01263857, + "balance_loss_clip": 0.0626822, + "balance_loss_mlp": 0.01254237, + "epoch": 0.8043288741920938, + "flos": 26622122722560.0, + "grad_norm": 1.6914077439182815, + "language_loss": 0.70630229, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.78295547, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09619141, + "step": 13378, + "time_per_iteration": 2.5762038230895996 + }, + { + "auxiliary_loss_clip": 0.06403045, + "auxiliary_loss_mlp": 0.01262509, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.012533, + "epoch": 0.8043889974447618, + "flos": 33412137206400.0, + "grad_norm": 1.3386362745905136, + "language_loss": 0.69531369, + "learning_rate": 3.880308495088347e-07, + "loss": 0.7719692, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09204102, + "step": 13379, + "time_per_iteration": 4.13545298576355 + }, + { + "auxiliary_loss_clip": 0.06408659, + "auxiliary_loss_mlp": 0.01264563, + "balance_loss_clip": 0.06269853, + "balance_loss_mlp": 0.01253697, + "epoch": 0.8044491206974297, + "flos": 20382202293120.0, + "grad_norm": 1.6780556856140154, + "language_loss": 0.76333177, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.84006405, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10864258, + "step": 13380, + "time_per_iteration": 2.5246059894561768 + }, + { + "auxiliary_loss_clip": 0.06399637, + "auxiliary_loss_mlp": 0.01263232, + "balance_loss_clip": 0.06269045, + "balance_loss_mlp": 0.01254035, + "epoch": 0.8045092439500977, + "flos": 23410473177600.0, + "grad_norm": 5.962253365542073, + "language_loss": 0.69472402, + "learning_rate": 3.875698985740887e-07, + "loss": 0.77135271, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09191895, + "step": 13381, + "time_per_iteration": 2.513369083404541 + }, + { + "auxiliary_loss_clip": 0.06405897, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 0.06273341, + "balance_loss_mlp": 0.01257805, + "epoch": 0.8045693672027656, + "flos": 24104058549120.0, + "grad_norm": 1.8201650419638222, + "language_loss": 0.64036882, + "learning_rate": 3.873395148176135e-07, + "loss": 0.71710145, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09564209, + "step": 13382, + "time_per_iteration": 2.522407054901123 + }, + { + "auxiliary_loss_clip": 0.06400527, + "auxiliary_loss_mlp": 0.01265284, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01256176, + "epoch": 0.8046294904554336, + "flos": 27714218912640.0, + "grad_norm": 2.245463185943566, + "language_loss": 0.76378274, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.84044087, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09112549, + "step": 13383, + "time_per_iteration": 2.5720760822296143 + }, + { + "auxiliary_loss_clip": 0.06401812, + "auxiliary_loss_mlp": 0.01262594, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01253188, + "epoch": 0.8046896137081017, + "flos": 24979974405120.0, + "grad_norm": 2.429847725728327, + "language_loss": 0.69923508, + "learning_rate": 3.868789307701381e-07, + "loss": 0.77587903, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09405518, + "step": 13384, + "time_per_iteration": 2.5203967094421387 + }, + { + "auxiliary_loss_clip": 0.06404511, + "auxiliary_loss_mlp": 0.01262325, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01252258, + "epoch": 0.8047497369607696, + "flos": 17681178729600.0, + "grad_norm": 2.046096721285892, + "language_loss": 0.79958355, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.87625194, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10070801, + "step": 13385, + "time_per_iteration": 2.4725265502929688 + }, + { + "auxiliary_loss_clip": 0.06403039, + "auxiliary_loss_mlp": 0.01267415, + "balance_loss_clip": 0.06269456, + "balance_loss_mlp": 0.01257193, + "epoch": 0.8048098602134376, + "flos": 22388550382080.0, + "grad_norm": 1.837937550839016, + "language_loss": 0.72530949, + "learning_rate": 3.864185914015108e-07, + "loss": 0.80201405, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10223389, + "step": 13386, + "time_per_iteration": 2.486330270767212 + }, + { + "auxiliary_loss_clip": 0.06309167, + "auxiliary_loss_mlp": 0.01254218, + "balance_loss_clip": 0.06253965, + "balance_loss_mlp": 0.01253243, + "epoch": 0.8048699834661055, + "flos": 71221840392960.0, + "grad_norm": 0.6523037243567322, + "language_loss": 0.51220822, + "learning_rate": 3.861885134935865e-07, + "loss": 0.58784211, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00974274, + "step": 13387, + "time_per_iteration": 3.1729602813720703 + }, + { + "auxiliary_loss_clip": 0.06402306, + "auxiliary_loss_mlp": 0.01263122, + "balance_loss_clip": 0.06268468, + "balance_loss_mlp": 0.01253186, + "epoch": 0.8049301067187735, + "flos": 23667211687680.0, + "grad_norm": 1.5827606972372845, + "language_loss": 0.74150264, + "learning_rate": 3.859584967815559e-07, + "loss": 0.8181569, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0993042, + "step": 13388, + "time_per_iteration": 2.521761894226074 + }, + { + "auxiliary_loss_clip": 0.0640045, + "auxiliary_loss_mlp": 0.01264289, + "balance_loss_clip": 0.06270331, + "balance_loss_mlp": 0.012544, + "epoch": 0.8049902299714414, + "flos": 24433318368000.0, + "grad_norm": 1.503353867290701, + "language_loss": 0.71913797, + "learning_rate": 3.857285412741411e-07, + "loss": 0.79578537, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09887695, + "step": 13389, + "time_per_iteration": 2.5576906204223633 + }, + { + "auxiliary_loss_clip": 0.06400909, + "auxiliary_loss_mlp": 0.01263971, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01254273, + "epoch": 0.8050503532241094, + "flos": 17498219339520.0, + "grad_norm": 1.9489558948159147, + "language_loss": 0.83189499, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.90854383, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0970459, + "step": 13390, + "time_per_iteration": 2.4616317749023438 + }, + { + "auxiliary_loss_clip": 0.06308493, + "auxiliary_loss_mlp": 0.01248902, + "balance_loss_clip": 0.06253241, + "balance_loss_mlp": 0.01247792, + "epoch": 0.8051104764767774, + "flos": 57675535493760.0, + "grad_norm": 0.764906547770961, + "language_loss": 0.55567837, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.63125229, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01112366, + "step": 13391, + "time_per_iteration": 3.141718626022339 + }, + { + "auxiliary_loss_clip": 0.06397294, + "auxiliary_loss_mlp": 0.01265249, + "balance_loss_clip": 0.06269481, + "balance_loss_mlp": 0.01256302, + "epoch": 0.8051705997294454, + "flos": 18009138810240.0, + "grad_norm": 1.5129842521720784, + "language_loss": 0.84422779, + "learning_rate": 3.850390420667762e-07, + "loss": 0.92085326, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0894165, + "step": 13392, + "time_per_iteration": 2.507310390472412 + }, + { + "auxiliary_loss_clip": 0.06402355, + "auxiliary_loss_mlp": 0.01266445, + "balance_loss_clip": 0.06268811, + "balance_loss_mlp": 0.01257063, + "epoch": 0.8052307229821133, + "flos": 26405271555840.0, + "grad_norm": 1.5077686390868956, + "language_loss": 0.705845, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.78253293, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09381104, + "step": 13393, + "time_per_iteration": 2.60556960105896 + }, + { + "auxiliary_loss_clip": 0.06403892, + "auxiliary_loss_mlp": 0.0126422, + "balance_loss_clip": 0.06269234, + "balance_loss_mlp": 0.01254701, + "epoch": 0.8052908462347813, + "flos": 21762580855680.0, + "grad_norm": 1.8325597430410605, + "language_loss": 0.77066338, + "learning_rate": 3.84579682111414e-07, + "loss": 0.84734452, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09521484, + "step": 13394, + "time_per_iteration": 2.4934189319610596 + }, + { + "auxiliary_loss_clip": 0.06404327, + "auxiliary_loss_mlp": 0.01264444, + "balance_loss_clip": 0.06272115, + "balance_loss_mlp": 0.0125564, + "epoch": 0.8053509694874492, + "flos": 25448490910080.0, + "grad_norm": 1.6042981916986414, + "language_loss": 0.64741898, + "learning_rate": 3.843500940147304e-07, + "loss": 0.72410667, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08807373, + "step": 13395, + "time_per_iteration": 2.533311128616333 + }, + { + "auxiliary_loss_clip": 0.06312156, + "auxiliary_loss_mlp": 0.01248555, + "balance_loss_clip": 0.06256828, + "balance_loss_mlp": 0.01247604, + "epoch": 0.8054110927401172, + "flos": 57687316992000.0, + "grad_norm": 0.7425366741213568, + "language_loss": 0.57110387, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.64671093, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.00949097, + "step": 13396, + "time_per_iteration": 3.259113073348999 + }, + { + "auxiliary_loss_clip": 0.06404525, + "auxiliary_loss_mlp": 0.01265419, + "balance_loss_clip": 0.06271605, + "balance_loss_mlp": 0.01255137, + "epoch": 0.8054712159927853, + "flos": 19281385278720.0, + "grad_norm": 1.6270130332272381, + "language_loss": 0.77506781, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.85176718, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10284424, + "step": 13397, + "time_per_iteration": 2.559624671936035 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.01266829, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01257025, + "epoch": 0.8055313392454532, + "flos": 17973402243840.0, + "grad_norm": 1.701332340336638, + "language_loss": 0.70611137, + "learning_rate": 3.836616973531266e-07, + "loss": 0.78281415, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09796143, + "step": 13398, + "time_per_iteration": 2.497774600982666 + }, + { + "auxiliary_loss_clip": 0.06399795, + "auxiliary_loss_mlp": 0.01265088, + "balance_loss_clip": 0.06268992, + "balance_loss_mlp": 0.01256565, + "epoch": 0.8055914624981212, + "flos": 13483133320320.0, + "grad_norm": 2.1436610227849693, + "language_loss": 0.69285464, + "learning_rate": 3.834323543710805e-07, + "loss": 0.76950341, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08532715, + "step": 13399, + "time_per_iteration": 2.4626171588897705 + }, + { + "auxiliary_loss_clip": 0.06404109, + "auxiliary_loss_mlp": 0.01267023, + "balance_loss_clip": 0.06272507, + "balance_loss_mlp": 0.01258208, + "epoch": 0.8056515857507891, + "flos": 13229832827520.0, + "grad_norm": 2.1990447378092566, + "language_loss": 0.72496057, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.80167186, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08807373, + "step": 13400, + "time_per_iteration": 2.4746367931365967 + }, + { + "auxiliary_loss_clip": 0.06401473, + "auxiliary_loss_mlp": 0.0126175, + "balance_loss_clip": 0.06270804, + "balance_loss_mlp": 0.012521, + "epoch": 0.8057117090034571, + "flos": 23884943322240.0, + "grad_norm": 1.7063053615868358, + "language_loss": 0.64111948, + "learning_rate": 3.829738523169037e-07, + "loss": 0.71775174, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09655762, + "step": 13401, + "time_per_iteration": 4.023234128952026 + }, + { + "auxiliary_loss_clip": 0.06402341, + "auxiliary_loss_mlp": 0.01263217, + "balance_loss_clip": 0.06269568, + "balance_loss_mlp": 0.01254301, + "epoch": 0.805771832256125, + "flos": 21220536792960.0, + "grad_norm": 2.264659490025675, + "language_loss": 0.84643924, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.92309481, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.08917236, + "step": 13402, + "time_per_iteration": 2.5050251483917236 + }, + { + "auxiliary_loss_clip": 0.06405149, + "auxiliary_loss_mlp": 0.01263056, + "balance_loss_clip": 0.06272706, + "balance_loss_mlp": 0.01253799, + "epoch": 0.805831955508793, + "flos": 17572627365120.0, + "grad_norm": 2.3703538824260035, + "language_loss": 0.68481362, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.76149571, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09246826, + "step": 13403, + "time_per_iteration": 2.4735195636749268 + }, + { + "auxiliary_loss_clip": 0.06400239, + "auxiliary_loss_mlp": 0.01261852, + "balance_loss_clip": 0.06271947, + "balance_loss_mlp": 0.0125287, + "epoch": 0.805892078761461, + "flos": 26914975142400.0, + "grad_norm": 1.5925529869996475, + "language_loss": 0.8470757, + "learning_rate": 3.822865591408084e-07, + "loss": 0.92369658, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08984375, + "step": 13404, + "time_per_iteration": 2.5682694911956787 + }, + { + "auxiliary_loss_clip": 0.06395989, + "auxiliary_loss_mlp": 0.01263837, + "balance_loss_clip": 0.06269123, + "balance_loss_mlp": 0.01255004, + "epoch": 0.805952202014129, + "flos": 31514927460480.0, + "grad_norm": 1.526531849234785, + "language_loss": 0.70693904, + "learning_rate": 3.820575840915743e-07, + "loss": 0.78353727, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08837891, + "step": 13405, + "time_per_iteration": 2.5887579917907715 + }, + { + "auxiliary_loss_clip": 0.06400827, + "auxiliary_loss_mlp": 0.01262326, + "balance_loss_clip": 0.06271822, + "balance_loss_mlp": 0.01253439, + "epoch": 0.8060123252667969, + "flos": 24396952896000.0, + "grad_norm": 2.4387244414721247, + "language_loss": 0.75653315, + "learning_rate": 3.818286703948788e-07, + "loss": 0.83316469, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08874512, + "step": 13406, + "time_per_iteration": 2.5906982421875 + }, + { + "auxiliary_loss_clip": 0.0640468, + "auxiliary_loss_mlp": 0.0126352, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254139, + "epoch": 0.8060724485194649, + "flos": 23487145263360.0, + "grad_norm": 1.4318493035492519, + "language_loss": 0.76315004, + "learning_rate": 3.815998180594018e-07, + "loss": 0.83983201, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09387207, + "step": 13407, + "time_per_iteration": 2.550020456314087 + }, + { + "auxiliary_loss_clip": 0.06398082, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06267268, + "balance_loss_mlp": 0.01257849, + "epoch": 0.8061325717721328, + "flos": 18630412508160.0, + "grad_norm": 1.6703188276302636, + "language_loss": 0.74090451, + "learning_rate": 3.81371027093822e-07, + "loss": 0.81755936, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09558105, + "step": 13408, + "time_per_iteration": 2.470579147338867 + }, + { + "auxiliary_loss_clip": 0.0640013, + "auxiliary_loss_mlp": 0.01265287, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01255363, + "epoch": 0.8061926950248008, + "flos": 23588862520320.0, + "grad_norm": 2.2758390778618227, + "language_loss": 0.70484757, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.78150177, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09924316, + "step": 13409, + "time_per_iteration": 2.5231001377105713 + }, + { + "auxiliary_loss_clip": 0.06405453, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.06271958, + "balance_loss_mlp": 0.01254809, + "epoch": 0.8062528182774689, + "flos": 11147735047680.0, + "grad_norm": 2.081436146875831, + "language_loss": 0.77509671, + "learning_rate": 3.809136293070545e-07, + "loss": 0.85179555, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09625244, + "step": 13410, + "time_per_iteration": 3.973681926727295 + }, + { + "auxiliary_loss_clip": 0.064013, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06271458, + "balance_loss_mlp": 0.01254708, + "epoch": 0.8063129415301368, + "flos": 22353484648320.0, + "grad_norm": 1.8160554729971454, + "language_loss": 0.69222361, + "learning_rate": 3.806850225032117e-07, + "loss": 0.76888156, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09796143, + "step": 13411, + "time_per_iteration": 2.5478432178497314 + }, + { + "auxiliary_loss_clip": 0.0640078, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06270846, + "balance_loss_mlp": 0.01256496, + "epoch": 0.8063730647828048, + "flos": 23995297549440.0, + "grad_norm": 1.6928705363709327, + "language_loss": 0.68386424, + "learning_rate": 3.804564771039551e-07, + "loss": 0.76053417, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0970459, + "step": 13412, + "time_per_iteration": 2.5194411277770996 + }, + { + "auxiliary_loss_clip": 0.06407973, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 0.06271837, + "balance_loss_mlp": 0.0125931, + "epoch": 0.8064331880354727, + "flos": 21327369148800.0, + "grad_norm": 2.7853306409882075, + "language_loss": 0.81920803, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.89598131, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.1005249, + "step": 13413, + "time_per_iteration": 3.956393003463745 + }, + { + "auxiliary_loss_clip": 0.06400369, + "auxiliary_loss_mlp": 0.01263377, + "balance_loss_clip": 0.06269833, + "balance_loss_mlp": 0.01254144, + "epoch": 0.8064933112881407, + "flos": 19689036192000.0, + "grad_norm": 1.9565362890159896, + "language_loss": 0.855667, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.9323045, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09222412, + "step": 13414, + "time_per_iteration": 2.478431463241577 + }, + { + "auxiliary_loss_clip": 0.06395735, + "auxiliary_loss_mlp": 0.01261118, + "balance_loss_clip": 0.06267722, + "balance_loss_mlp": 0.0125241, + "epoch": 0.8065534345408086, + "flos": 19285494128640.0, + "grad_norm": 1.77092386295028, + "language_loss": 0.67096937, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.74753791, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08703613, + "step": 13415, + "time_per_iteration": 2.563744306564331 + }, + { + "auxiliary_loss_clip": 0.06397079, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254336, + "epoch": 0.8066135577934767, + "flos": 19682998698240.0, + "grad_norm": 1.462252167408637, + "language_loss": 0.76685238, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.84345299, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08642578, + "step": 13416, + "time_per_iteration": 2.4804248809814453 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.0627007, + "balance_loss_mlp": 0.0125572, + "epoch": 0.8066736810461446, + "flos": 21150195690240.0, + "grad_norm": 1.5328758960444588, + "language_loss": 0.65077549, + "learning_rate": 3.793146714797086e-07, + "loss": 0.7274878, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.08886719, + "step": 13417, + "time_per_iteration": 2.5191526412963867 + }, + { + "auxiliary_loss_clip": 0.06405359, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06270691, + "balance_loss_mlp": 0.01255316, + "epoch": 0.8067338042988126, + "flos": 22604311445760.0, + "grad_norm": 1.8039686506560615, + "language_loss": 0.80821931, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.88492072, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.0947876, + "step": 13418, + "time_per_iteration": 2.5160207748413086 + }, + { + "auxiliary_loss_clip": 0.06405315, + "auxiliary_loss_mlp": 0.01266074, + "balance_loss_clip": 0.0627213, + "balance_loss_mlp": 0.01256197, + "epoch": 0.8067939275514805, + "flos": 16514003681280.0, + "grad_norm": 1.5721182795151136, + "language_loss": 0.8479256, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.92463952, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09869385, + "step": 13419, + "time_per_iteration": 3.9109416007995605 + }, + { + "auxiliary_loss_clip": 0.06404698, + "auxiliary_loss_mlp": 0.01263141, + "balance_loss_clip": 0.06270822, + "balance_loss_mlp": 0.01253247, + "epoch": 0.8068540508041485, + "flos": 28548276854400.0, + "grad_norm": 3.4687459017553457, + "language_loss": 0.76469827, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.84137666, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09899902, + "step": 13420, + "time_per_iteration": 2.5552561283111572 + }, + { + "auxiliary_loss_clip": 0.06399071, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06269781, + "balance_loss_mlp": 0.01254399, + "epoch": 0.8069141740568164, + "flos": 21658851100800.0, + "grad_norm": 1.688287839835823, + "language_loss": 0.78943896, + "learning_rate": 3.784023331462207e-07, + "loss": 0.86605579, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08215332, + "step": 13421, + "time_per_iteration": 2.4860880374908447 + }, + { + "auxiliary_loss_clip": 0.0640534, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06272358, + "balance_loss_mlp": 0.01255109, + "epoch": 0.8069742973094844, + "flos": 17534962154880.0, + "grad_norm": 1.6579871645529392, + "language_loss": 0.79629791, + "learning_rate": 3.78174402269098e-07, + "loss": 0.8729949, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09246826, + "step": 13422, + "time_per_iteration": 2.4994351863861084 + }, + { + "auxiliary_loss_clip": 0.0640166, + "auxiliary_loss_mlp": 0.01264919, + "balance_loss_clip": 0.06269953, + "balance_loss_mlp": 0.01255406, + "epoch": 0.8070344205621525, + "flos": 23373646508160.0, + "grad_norm": 1.5141862299887854, + "language_loss": 0.68537223, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.76203805, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09509277, + "step": 13423, + "time_per_iteration": 2.5125439167022705 + }, + { + "auxiliary_loss_clip": 0.06405628, + "auxiliary_loss_mlp": 0.01264277, + "balance_loss_clip": 0.06268807, + "balance_loss_mlp": 0.0125393, + "epoch": 0.8070945438148204, + "flos": 22936883500800.0, + "grad_norm": 1.7490687501288111, + "language_loss": 0.80183315, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.87853223, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10351562, + "step": 13424, + "time_per_iteration": 2.525763511657715 + }, + { + "auxiliary_loss_clip": 0.06411269, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.06275322, + "balance_loss_mlp": 0.01256807, + "epoch": 0.8071546670674884, + "flos": 25307599069440.0, + "grad_norm": 1.3989158711688392, + "language_loss": 0.79125178, + "learning_rate": 3.774909786710232e-07, + "loss": 0.86802822, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09570312, + "step": 13425, + "time_per_iteration": 2.56131649017334 + }, + { + "auxiliary_loss_clip": 0.06402414, + "auxiliary_loss_mlp": 0.01263563, + "balance_loss_clip": 0.06271134, + "balance_loss_mlp": 0.01255176, + "epoch": 0.8072147903201563, + "flos": 18119534964480.0, + "grad_norm": 3.747532904590834, + "language_loss": 0.75868148, + "learning_rate": 3.772632938448923e-07, + "loss": 0.83534127, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08392334, + "step": 13426, + "time_per_iteration": 2.5067336559295654 + }, + { + "auxiliary_loss_clip": 0.06402829, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06269912, + "balance_loss_mlp": 0.01255823, + "epoch": 0.8072749135728243, + "flos": 26695482572160.0, + "grad_norm": 1.699020195158221, + "language_loss": 0.7311064, + "learning_rate": 3.770356705530997e-07, + "loss": 0.80778825, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09533691, + "step": 13427, + "time_per_iteration": 2.5475499629974365 + }, + { + "auxiliary_loss_clip": 0.06399049, + "auxiliary_loss_mlp": 0.01264857, + "balance_loss_clip": 0.06268165, + "balance_loss_mlp": 0.01255678, + "epoch": 0.8073350368254922, + "flos": 19245564858240.0, + "grad_norm": 1.5262575334072062, + "language_loss": 0.70244026, + "learning_rate": 3.768081088042774e-07, + "loss": 0.77907926, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09179688, + "step": 13428, + "time_per_iteration": 2.4958949089050293 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01261298, + "balance_loss_clip": 0.06270581, + "balance_loss_mlp": 0.01252501, + "epoch": 0.8073951600781603, + "flos": 13339642003200.0, + "grad_norm": 1.7655256411115205, + "language_loss": 0.74963367, + "learning_rate": 3.765806086070544e-07, + "loss": 0.82628596, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.0880127, + "step": 13429, + "time_per_iteration": 2.4495036602020264 + }, + { + "auxiliary_loss_clip": 0.06396128, + "auxiliary_loss_mlp": 0.01267542, + "balance_loss_clip": 0.06269226, + "balance_loss_mlp": 0.01258655, + "epoch": 0.8074552833308282, + "flos": 22859205166080.0, + "grad_norm": 1.6937365017718335, + "language_loss": 0.67073148, + "learning_rate": 3.763531699700568e-07, + "loss": 0.74736816, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08886719, + "step": 13430, + "time_per_iteration": 2.5136795043945312 + }, + { + "auxiliary_loss_clip": 0.063988, + "auxiliary_loss_mlp": 0.01265178, + "balance_loss_clip": 0.06269097, + "balance_loss_mlp": 0.01255689, + "epoch": 0.8075154065834962, + "flos": 20345627185920.0, + "grad_norm": 1.9845601369160015, + "language_loss": 0.80206978, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.87870961, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09490967, + "step": 13431, + "time_per_iteration": 2.4789979457855225 + }, + { + "auxiliary_loss_clip": 0.06399001, + "auxiliary_loss_mlp": 0.01262918, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01253208, + "epoch": 0.8075755298361641, + "flos": 21914499507840.0, + "grad_norm": 1.684620767458615, + "language_loss": 0.803487, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.88010621, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09716797, + "step": 13432, + "time_per_iteration": 2.5136168003082275 + }, + { + "auxiliary_loss_clip": 0.06406735, + "auxiliary_loss_mlp": 0.01265738, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01255748, + "epoch": 0.8076356530888321, + "flos": 15674746786560.0, + "grad_norm": 1.7687436770793032, + "language_loss": 0.70454299, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.78126764, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09985352, + "step": 13433, + "time_per_iteration": 2.4561402797698975 + }, + { + "auxiliary_loss_clip": 0.06400432, + "auxiliary_loss_mlp": 0.01263944, + "balance_loss_clip": 0.0626875, + "balance_loss_mlp": 0.01254503, + "epoch": 0.8076957763415, + "flos": 37786182117120.0, + "grad_norm": 1.418853459910882, + "language_loss": 0.72760022, + "learning_rate": 3.754440311967828e-07, + "loss": 0.80424392, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09448242, + "step": 13434, + "time_per_iteration": 2.6385674476623535 + }, + { + "auxiliary_loss_clip": 0.0640171, + "auxiliary_loss_mlp": 0.01262368, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01253088, + "epoch": 0.807755899594168, + "flos": 19617059934720.0, + "grad_norm": 1.6864587297815326, + "language_loss": 0.6805675, + "learning_rate": 3.752169004902361e-07, + "loss": 0.75720823, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09277344, + "step": 13435, + "time_per_iteration": 2.4785990715026855 + }, + { + "auxiliary_loss_clip": 0.06405824, + "auxiliary_loss_mlp": 0.01265903, + "balance_loss_clip": 0.06270979, + "balance_loss_mlp": 0.01255419, + "epoch": 0.8078160228468361, + "flos": 23301628323840.0, + "grad_norm": 1.5075228238156948, + "language_loss": 0.75472784, + "learning_rate": 3.749898313956279e-07, + "loss": 0.83144516, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1048584, + "step": 13436, + "time_per_iteration": 2.5161588191986084 + }, + { + "auxiliary_loss_clip": 0.0639594, + "auxiliary_loss_mlp": 0.01264176, + "balance_loss_clip": 0.06268739, + "balance_loss_mlp": 0.01255015, + "epoch": 0.807876146099504, + "flos": 27170078497920.0, + "grad_norm": 2.2394405611791233, + "language_loss": 0.70518959, + "learning_rate": 3.747628239215674e-07, + "loss": 0.78179073, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09161377, + "step": 13437, + "time_per_iteration": 2.544955253601074 + }, + { + "auxiliary_loss_clip": 0.06399636, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01259547, + "epoch": 0.807936269352172, + "flos": 27167017824000.0, + "grad_norm": 1.6660512068527857, + "language_loss": 0.72636318, + "learning_rate": 3.745358780766636e-07, + "loss": 0.80304617, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09118652, + "step": 13438, + "time_per_iteration": 2.557361602783203 + }, + { + "auxiliary_loss_clip": 0.06401914, + "auxiliary_loss_mlp": 0.01263852, + "balance_loss_clip": 0.06271158, + "balance_loss_mlp": 0.01254596, + "epoch": 0.8079963926048399, + "flos": 20746653626880.0, + "grad_norm": 1.7758378703265403, + "language_loss": 0.77106637, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.84772402, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09259033, + "step": 13439, + "time_per_iteration": 2.4744443893432617 + }, + { + "auxiliary_loss_clip": 0.0639698, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 0.0626818, + "balance_loss_mlp": 0.0125867, + "epoch": 0.8080565158575079, + "flos": 25016675293440.0, + "grad_norm": 1.4635512483706237, + "language_loss": 0.78747815, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.86412537, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09075928, + "step": 13440, + "time_per_iteration": 3.9725441932678223 + }, + { + "auxiliary_loss_clip": 0.06403578, + "auxiliary_loss_mlp": 0.01264545, + "balance_loss_clip": 0.06269738, + "balance_loss_mlp": 0.01254824, + "epoch": 0.8081166391101758, + "flos": 18704107774080.0, + "grad_norm": 1.8241112266239554, + "language_loss": 0.59381831, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.67049956, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 13441, + "time_per_iteration": 2.4906275272369385 + }, + { + "auxiliary_loss_clip": 0.06400108, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06268845, + "balance_loss_mlp": 0.01254959, + "epoch": 0.8081767623628439, + "flos": 19834791569280.0, + "grad_norm": 1.7995495906095618, + "language_loss": 0.76109755, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.83775043, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10211182, + "step": 13442, + "time_per_iteration": 2.4604549407958984 + }, + { + "auxiliary_loss_clip": 0.06400936, + "auxiliary_loss_mlp": 0.01265886, + "balance_loss_clip": 0.06269497, + "balance_loss_mlp": 0.01256963, + "epoch": 0.8082368856155118, + "flos": 35781762672000.0, + "grad_norm": 1.6604750720544754, + "language_loss": 0.70819938, + "learning_rate": 3.734020735906169e-07, + "loss": 0.78486764, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0892334, + "step": 13443, + "time_per_iteration": 2.649662733078003 + }, + { + "auxiliary_loss_clip": 0.06397702, + "auxiliary_loss_mlp": 0.01263095, + "balance_loss_clip": 0.06270011, + "balance_loss_mlp": 0.01254083, + "epoch": 0.8082970088681798, + "flos": 17203102859520.0, + "grad_norm": 1.69624931733301, + "language_loss": 0.82922244, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.90583038, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09002686, + "step": 13444, + "time_per_iteration": 2.4664461612701416 + }, + { + "auxiliary_loss_clip": 0.06307255, + "auxiliary_loss_mlp": 0.01255825, + "balance_loss_clip": 0.06252229, + "balance_loss_mlp": 0.0125482, + "epoch": 0.8083571321208477, + "flos": 63571437786240.0, + "grad_norm": 0.8022589405220855, + "language_loss": 0.53542054, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.61105132, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01004791, + "step": 13445, + "time_per_iteration": 2.97573184967041 + }, + { + "auxiliary_loss_clip": 0.0640046, + "auxiliary_loss_mlp": 0.01265553, + "balance_loss_clip": 0.06270578, + "balance_loss_mlp": 0.01255795, + "epoch": 0.8084172553735157, + "flos": 17936407866240.0, + "grad_norm": 1.9107072136167604, + "language_loss": 0.71992731, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.79658741, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09759521, + "step": 13446, + "time_per_iteration": 2.470470428466797 + }, + { + "auxiliary_loss_clip": 0.06404857, + "auxiliary_loss_mlp": 0.01268125, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.0125816, + "epoch": 0.8084773786261836, + "flos": 24104939016960.0, + "grad_norm": 1.745974209686923, + "language_loss": 0.71612984, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.79285973, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09973145, + "step": 13447, + "time_per_iteration": 2.51505184173584 + }, + { + "auxiliary_loss_clip": 0.06409042, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06273446, + "balance_loss_mlp": 0.01252363, + "epoch": 0.8085375018788516, + "flos": 15592288769280.0, + "grad_norm": 2.3228732633180544, + "language_loss": 0.7492891, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.82600772, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10467529, + "step": 13448, + "time_per_iteration": 2.442843437194824 + }, + { + "auxiliary_loss_clip": 0.06307342, + "auxiliary_loss_mlp": 0.01252569, + "balance_loss_clip": 0.06252244, + "balance_loss_mlp": 0.01251538, + "epoch": 0.8085976251315197, + "flos": 67583071059840.0, + "grad_norm": 0.7146391235313417, + "language_loss": 0.6385448, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.71414399, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01032257, + "step": 13449, + "time_per_iteration": 3.196397066116333 + }, + { + "auxiliary_loss_clip": 0.06401651, + "auxiliary_loss_mlp": 0.01263751, + "balance_loss_clip": 0.06271486, + "balance_loss_mlp": 0.01253922, + "epoch": 0.8086577483841876, + "flos": 22567442849280.0, + "grad_norm": 1.604658676228095, + "language_loss": 0.74288607, + "learning_rate": 3.718173381422105e-07, + "loss": 0.81954008, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09838867, + "step": 13450, + "time_per_iteration": 4.000797510147095 + }, + { + "auxiliary_loss_clip": 0.06401928, + "auxiliary_loss_mlp": 0.01263996, + "balance_loss_clip": 0.06270078, + "balance_loss_mlp": 0.01254304, + "epoch": 0.8087178716368556, + "flos": 17973947295360.0, + "grad_norm": 1.6133158920878963, + "language_loss": 0.74275053, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.81940979, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09686279, + "step": 13451, + "time_per_iteration": 2.4525790214538574 + }, + { + "auxiliary_loss_clip": 0.06405082, + "auxiliary_loss_mlp": 0.012649, + "balance_loss_clip": 0.06269129, + "balance_loss_mlp": 0.01254339, + "epoch": 0.8087779948895235, + "flos": 21724915645440.0, + "grad_norm": 1.6921247392748657, + "language_loss": 0.8051089, + "learning_rate": 3.713651121244543e-07, + "loss": 0.88180876, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10552979, + "step": 13452, + "time_per_iteration": 2.516119956970215 + }, + { + "auxiliary_loss_clip": 0.06403025, + "auxiliary_loss_mlp": 0.01262182, + "balance_loss_clip": 0.06269191, + "balance_loss_mlp": 0.0125255, + "epoch": 0.8088381181421915, + "flos": 29100047990400.0, + "grad_norm": 1.6952548496868898, + "language_loss": 0.78266019, + "learning_rate": 3.711390917482875e-07, + "loss": 0.8593123, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09637451, + "step": 13453, + "time_per_iteration": 4.042112827301025 + }, + { + "auxiliary_loss_clip": 0.06403942, + "auxiliary_loss_mlp": 0.01265524, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01255642, + "epoch": 0.8088982413948594, + "flos": 22204668597120.0, + "grad_norm": 2.3407226705929514, + "language_loss": 0.77383858, + "learning_rate": 3.709131331386892e-07, + "loss": 0.85053325, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09875488, + "step": 13454, + "time_per_iteration": 2.554422378540039 + }, + { + "auxiliary_loss_clip": 0.06400093, + "auxiliary_loss_mlp": 0.01268227, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01257391, + "epoch": 0.8089583646475275, + "flos": 28044023783040.0, + "grad_norm": 1.8288081098987639, + "language_loss": 0.76939356, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.84607673, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.1083374, + "step": 13455, + "time_per_iteration": 2.5715341567993164 + }, + { + "auxiliary_loss_clip": 0.06401575, + "auxiliary_loss_mlp": 0.01262608, + "balance_loss_clip": 0.06268826, + "balance_loss_mlp": 0.01253679, + "epoch": 0.8090184879001954, + "flos": 16623309732480.0, + "grad_norm": 1.6907159449842466, + "language_loss": 0.78554362, + "learning_rate": 3.70461401253471e-07, + "loss": 0.86218548, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0892334, + "step": 13456, + "time_per_iteration": 2.508582830429077 + }, + { + "auxiliary_loss_clip": 0.0640007, + "auxiliary_loss_mlp": 0.01264463, + "balance_loss_clip": 0.06270983, + "balance_loss_mlp": 0.01255498, + "epoch": 0.8090786111528634, + "flos": 27347545445760.0, + "grad_norm": 1.776897039919432, + "language_loss": 0.71710402, + "learning_rate": 3.702356279949801e-07, + "loss": 0.79374933, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08966064, + "step": 13457, + "time_per_iteration": 2.5812559127807617 + }, + { + "auxiliary_loss_clip": 0.06398778, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.0626803, + "balance_loss_mlp": 0.01253777, + "epoch": 0.8091387344055313, + "flos": 21112111209600.0, + "grad_norm": 1.6184921643640915, + "language_loss": 0.73064125, + "learning_rate": 3.700099165373176e-07, + "loss": 0.80725813, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09143066, + "step": 13458, + "time_per_iteration": 3.9770147800445557 + }, + { + "auxiliary_loss_clip": 0.06401807, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06270815, + "balance_loss_mlp": 0.01255022, + "epoch": 0.8091988576581993, + "flos": 11659702694400.0, + "grad_norm": 2.4320264643935348, + "language_loss": 0.78925645, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.86591995, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09509277, + "step": 13459, + "time_per_iteration": 2.4999613761901855 + }, + { + "auxiliary_loss_clip": 0.06403743, + "auxiliary_loss_mlp": 0.01264391, + "balance_loss_clip": 0.06267793, + "balance_loss_mlp": 0.01254938, + "epoch": 0.8092589809108672, + "flos": 22969475539200.0, + "grad_norm": 2.9044403495473494, + "language_loss": 0.80189556, + "learning_rate": 3.695586790587113e-07, + "loss": 0.87857693, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09442139, + "step": 13460, + "time_per_iteration": 2.4736809730529785 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.01265447, + "balance_loss_clip": 0.06270553, + "balance_loss_mlp": 0.01255463, + "epoch": 0.8093191041635353, + "flos": 13265988664320.0, + "grad_norm": 1.703012580351455, + "language_loss": 0.8516379, + "learning_rate": 3.693331530548789e-07, + "loss": 0.92833048, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09973145, + "step": 13461, + "time_per_iteration": 2.472332000732422 + }, + { + "auxiliary_loss_clip": 0.06405523, + "auxiliary_loss_mlp": 0.01269044, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01258995, + "epoch": 0.8093792274162032, + "flos": 25522353884160.0, + "grad_norm": 1.7015064491080825, + "language_loss": 0.76382649, + "learning_rate": 3.69107688886096e-07, + "loss": 0.84057218, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.1005249, + "step": 13462, + "time_per_iteration": 2.5191242694854736 + }, + { + "auxiliary_loss_clip": 0.0640429, + "auxiliary_loss_mlp": 0.01263724, + "balance_loss_clip": 0.06271065, + "balance_loss_mlp": 0.01253812, + "epoch": 0.8094393506688712, + "flos": 23552622829440.0, + "grad_norm": 1.6157350617712933, + "language_loss": 0.82945341, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.90613359, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09906006, + "step": 13463, + "time_per_iteration": 2.5403740406036377 + }, + { + "auxiliary_loss_clip": 0.06398586, + "auxiliary_loss_mlp": 0.01262495, + "balance_loss_clip": 0.06268895, + "balance_loss_mlp": 0.0125371, + "epoch": 0.8094994739215392, + "flos": 17061624040320.0, + "grad_norm": 3.129891781410948, + "language_loss": 0.6203239, + "learning_rate": 3.686569460878779e-07, + "loss": 0.69693464, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08789062, + "step": 13464, + "time_per_iteration": 2.5035338401794434 + }, + { + "auxiliary_loss_clip": 0.06398399, + "auxiliary_loss_mlp": 0.01268957, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01260386, + "epoch": 0.8095595971742071, + "flos": 23558157198720.0, + "grad_norm": 1.527244163455927, + "language_loss": 0.61969072, + "learning_rate": 3.684316674755341e-07, + "loss": 0.69636428, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 13465, + "time_per_iteration": 2.511592388153076 + }, + { + "auxiliary_loss_clip": 0.06402411, + "auxiliary_loss_mlp": 0.01268671, + "balance_loss_clip": 0.06272465, + "balance_loss_mlp": 0.01259319, + "epoch": 0.8096197204268751, + "flos": 20378973911040.0, + "grad_norm": 1.9869568826877384, + "language_loss": 0.8212142, + "learning_rate": 3.682064507324256e-07, + "loss": 0.89792502, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09350586, + "step": 13466, + "time_per_iteration": 2.4735896587371826 + }, + { + "auxiliary_loss_clip": 0.06405444, + "auxiliary_loss_mlp": 0.0126549, + "balance_loss_clip": 0.06271167, + "balance_loss_mlp": 0.0125578, + "epoch": 0.809679843679543, + "flos": 27826208294400.0, + "grad_norm": 2.9775086459835225, + "language_loss": 0.76277745, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.83948678, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09710693, + "step": 13467, + "time_per_iteration": 2.541368007659912 + }, + { + "auxiliary_loss_clip": 0.06402574, + "auxiliary_loss_mlp": 0.0126548, + "balance_loss_clip": 0.06270781, + "balance_loss_mlp": 0.01256462, + "epoch": 0.8097399669322111, + "flos": 22019990198400.0, + "grad_norm": 1.5745990150023057, + "language_loss": 0.791363, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.86804354, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09020996, + "step": 13468, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.06396127, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01255879, + "epoch": 0.809800090184879, + "flos": 18994905768960.0, + "grad_norm": 1.625398825677948, + "language_loss": 0.68054199, + "learning_rate": 3.675311718038978e-07, + "loss": 0.75715309, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09100342, + "step": 13469, + "time_per_iteration": 2.484276533126831 + }, + { + "auxiliary_loss_clip": 0.0630585, + "auxiliary_loss_mlp": 0.01249591, + "balance_loss_clip": 0.06250963, + "balance_loss_mlp": 0.01248598, + "epoch": 0.809860213437547, + "flos": 66120653750400.0, + "grad_norm": 0.6770585331201862, + "language_loss": 0.54451334, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.62006778, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00993347, + "step": 13470, + "time_per_iteration": 3.1943366527557373 + }, + { + "auxiliary_loss_clip": 0.06399186, + "auxiliary_loss_mlp": 0.01263139, + "balance_loss_clip": 0.0626805, + "balance_loss_mlp": 0.01253805, + "epoch": 0.8099203366902149, + "flos": 20888090519040.0, + "grad_norm": 1.850656923683804, + "language_loss": 0.69889498, + "learning_rate": 3.670812953542279e-07, + "loss": 0.77551824, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09332275, + "step": 13471, + "time_per_iteration": 2.521888494491577 + }, + { + "auxiliary_loss_clip": 0.06400762, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01254899, + "epoch": 0.8099804599428829, + "flos": 26038053037440.0, + "grad_norm": 2.7576436132891584, + "language_loss": 0.80252707, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.87917507, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09143066, + "step": 13472, + "time_per_iteration": 2.5895776748657227 + }, + { + "auxiliary_loss_clip": 0.06301145, + "auxiliary_loss_mlp": 0.01257277, + "balance_loss_clip": 0.06245954, + "balance_loss_mlp": 0.01256171, + "epoch": 0.8100405831955508, + "flos": 69324127522560.0, + "grad_norm": 0.7337883216097973, + "language_loss": 0.57360721, + "learning_rate": 3.666316665863201e-07, + "loss": 0.64919138, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01108551, + "step": 13473, + "time_per_iteration": 3.0853075981140137 + }, + { + "auxiliary_loss_clip": 0.06402718, + "auxiliary_loss_mlp": 0.01263044, + "balance_loss_clip": 0.06268923, + "balance_loss_mlp": 0.01253585, + "epoch": 0.8101007064482189, + "flos": 15017820376320.0, + "grad_norm": 1.8256752375562084, + "language_loss": 0.74556285, + "learning_rate": 3.664069451043399e-07, + "loss": 0.82222044, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09454346, + "step": 13474, + "time_per_iteration": 2.4723920822143555 + }, + { + "auxiliary_loss_clip": 0.06406249, + "auxiliary_loss_mlp": 0.01269145, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01259698, + "epoch": 0.8101608297008868, + "flos": 21073230115200.0, + "grad_norm": 1.4992308701275703, + "language_loss": 0.78592277, + "learning_rate": 3.661822855683723e-07, + "loss": 0.86267674, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09442139, + "step": 13475, + "time_per_iteration": 2.49446964263916 + }, + { + "auxiliary_loss_clip": 0.06399214, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01255545, + "epoch": 0.8102209529535548, + "flos": 23737846279680.0, + "grad_norm": 2.1011404448378674, + "language_loss": 0.76127887, + "learning_rate": 3.659576879869364e-07, + "loss": 0.83792061, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09405518, + "step": 13476, + "time_per_iteration": 2.623260259628296 + }, + { + "auxiliary_loss_clip": 0.06409746, + "auxiliary_loss_mlp": 0.0126529, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01255199, + "epoch": 0.8102810762062228, + "flos": 10959408996480.0, + "grad_norm": 1.9990272490296594, + "language_loss": 0.73678935, + "learning_rate": 3.657331523685485e-07, + "loss": 0.81353962, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10083008, + "step": 13477, + "time_per_iteration": 2.460721731185913 + }, + { + "auxiliary_loss_clip": 0.06398121, + "auxiliary_loss_mlp": 0.01261498, + "balance_loss_clip": 0.06267326, + "balance_loss_mlp": 0.01252123, + "epoch": 0.8103411994588907, + "flos": 14654291437440.0, + "grad_norm": 1.923341621184723, + "language_loss": 0.6978184, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.7744146, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09375, + "step": 13478, + "time_per_iteration": 2.4879016876220703 + }, + { + "auxiliary_loss_clip": 0.06305265, + "auxiliary_loss_mlp": 0.01250833, + "balance_loss_clip": 0.06250156, + "balance_loss_mlp": 0.01249791, + "epoch": 0.8104013227115587, + "flos": 59170964112000.0, + "grad_norm": 0.6706213336405785, + "language_loss": 0.52182806, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.5973891, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01042175, + "step": 13479, + "time_per_iteration": 3.042278289794922 + }, + { + "auxiliary_loss_clip": 0.06402652, + "auxiliary_loss_mlp": 0.01265309, + "balance_loss_clip": 0.06272212, + "balance_loss_mlp": 0.01256183, + "epoch": 0.8104614459642266, + "flos": 19834833496320.0, + "grad_norm": 1.5781047108750677, + "language_loss": 0.71602416, + "learning_rate": 3.650599173768072e-07, + "loss": 0.79270375, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09124756, + "step": 13480, + "time_per_iteration": 3.9115874767303467 + }, + { + "auxiliary_loss_clip": 0.06400708, + "auxiliary_loss_mlp": 0.0126312, + "balance_loss_clip": 0.06268963, + "balance_loss_mlp": 0.01253983, + "epoch": 0.8105215692168947, + "flos": 25381294335360.0, + "grad_norm": 1.960101511676754, + "language_loss": 0.79864734, + "learning_rate": 3.648356296957327e-07, + "loss": 0.87528563, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09136963, + "step": 13481, + "time_per_iteration": 2.5275304317474365 + }, + { + "auxiliary_loss_clip": 0.06402725, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06271543, + "balance_loss_mlp": 0.01258047, + "epoch": 0.8105816924695626, + "flos": 20487357567360.0, + "grad_norm": 1.7047460645728882, + "language_loss": 0.72716773, + "learning_rate": 3.646114040202548e-07, + "loss": 0.80386472, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08929443, + "step": 13482, + "time_per_iteration": 2.5445470809936523 + }, + { + "auxiliary_loss_clip": 0.06404884, + "auxiliary_loss_mlp": 0.01266536, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01256773, + "epoch": 0.8106418157222306, + "flos": 14544021064320.0, + "grad_norm": 1.9920968678364395, + "language_loss": 0.65563977, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.73235393, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09771729, + "step": 13483, + "time_per_iteration": 2.5054430961608887 + }, + { + "auxiliary_loss_clip": 0.06400222, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.06270905, + "balance_loss_mlp": 0.01257514, + "epoch": 0.8107019389748985, + "flos": 22570964720640.0, + "grad_norm": 1.8159029910366271, + "language_loss": 0.76454484, + "learning_rate": 3.641631387200992e-07, + "loss": 0.84121364, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09155273, + "step": 13484, + "time_per_iteration": 2.5171079635620117 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01267955, + "balance_loss_clip": 0.06272984, + "balance_loss_mlp": 0.01257274, + "epoch": 0.8107620622275665, + "flos": 19615634415360.0, + "grad_norm": 1.4402469557627227, + "language_loss": 0.72541213, + "learning_rate": 3.639390991124183e-07, + "loss": 0.80219758, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10693359, + "step": 13485, + "time_per_iteration": 2.5724942684173584 + }, + { + "auxiliary_loss_clip": 0.06396358, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01256636, + "epoch": 0.8108221854802344, + "flos": 16149007296000.0, + "grad_norm": 1.8147105780341508, + "language_loss": 0.76297033, + "learning_rate": 3.637151215443308e-07, + "loss": 0.83958554, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08520508, + "step": 13486, + "time_per_iteration": 2.4431118965148926 + }, + { + "auxiliary_loss_clip": 0.06407452, + "auxiliary_loss_mlp": 0.01265864, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01256226, + "epoch": 0.8108823087329025, + "flos": 21112656261120.0, + "grad_norm": 1.8644106456764877, + "language_loss": 0.72075516, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.79748833, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09643555, + "step": 13487, + "time_per_iteration": 2.473879337310791 + }, + { + "auxiliary_loss_clip": 0.06400521, + "auxiliary_loss_mlp": 0.01268012, + "balance_loss_clip": 0.06272428, + "balance_loss_mlp": 0.01259018, + "epoch": 0.8109424319855704, + "flos": 29206377221760.0, + "grad_norm": 2.193678189628865, + "language_loss": 0.84388292, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.92056829, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08984375, + "step": 13488, + "time_per_iteration": 2.543301582336426 + }, + { + "auxiliary_loss_clip": 0.06405022, + "auxiliary_loss_mlp": 0.01264443, + "balance_loss_clip": 0.06271579, + "balance_loss_mlp": 0.01255198, + "epoch": 0.8110025552382384, + "flos": 23118459298560.0, + "grad_norm": 2.075195554418006, + "language_loss": 0.74304891, + "learning_rate": 3.630435611625502e-07, + "loss": 0.81974351, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09234619, + "step": 13489, + "time_per_iteration": 3.9371306896209717 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01266118, + "balance_loss_clip": 0.06272238, + "balance_loss_mlp": 0.01257523, + "epoch": 0.8110626784909064, + "flos": 22386076686720.0, + "grad_norm": 1.8053041582092544, + "language_loss": 0.71944815, + "learning_rate": 3.628198318377453e-07, + "loss": 0.79611677, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.0859375, + "step": 13490, + "time_per_iteration": 2.5005099773406982 + }, + { + "auxiliary_loss_clip": 0.0640538, + "auxiliary_loss_mlp": 0.01266534, + "balance_loss_clip": 0.06270941, + "balance_loss_mlp": 0.01256067, + "epoch": 0.8111228017435743, + "flos": 23374820465280.0, + "grad_norm": 2.2367527372378166, + "language_loss": 0.72137296, + "learning_rate": 3.625961645949762e-07, + "loss": 0.79809213, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10461426, + "step": 13491, + "time_per_iteration": 2.5076067447662354 + }, + { + "auxiliary_loss_clip": 0.06401882, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01256115, + "epoch": 0.8111829249962423, + "flos": 21292680758400.0, + "grad_norm": 1.729765137359799, + "language_loss": 0.67871809, + "learning_rate": 3.623725594427245e-07, + "loss": 0.7553919, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09381104, + "step": 13492, + "time_per_iteration": 3.959716320037842 + }, + { + "auxiliary_loss_clip": 0.06405997, + "auxiliary_loss_mlp": 0.01263308, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01253605, + "epoch": 0.8112430482489102, + "flos": 22352017201920.0, + "grad_norm": 1.7889439150881994, + "language_loss": 0.72219712, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.79889023, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09698486, + "step": 13493, + "time_per_iteration": 2.498922348022461 + }, + { + "auxiliary_loss_clip": 0.06403151, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_clip": 0.06270409, + "balance_loss_mlp": 0.01256396, + "epoch": 0.8113031715015783, + "flos": 31146199568640.0, + "grad_norm": 1.5274300154238956, + "language_loss": 0.70765322, + "learning_rate": 3.619255354436885e-07, + "loss": 0.78434944, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10070801, + "step": 13494, + "time_per_iteration": 2.582156181335449 + }, + { + "auxiliary_loss_clip": 0.06407354, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.0126038, + "epoch": 0.8113632947542462, + "flos": 25342077824640.0, + "grad_norm": 1.9696659846261377, + "language_loss": 0.76812732, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.84491146, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10687256, + "step": 13495, + "time_per_iteration": 2.4944467544555664 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.06269392, + "balance_loss_mlp": 0.0125619, + "epoch": 0.8114234180069142, + "flos": 28446727305600.0, + "grad_norm": 1.6848017039498533, + "language_loss": 0.80030304, + "learning_rate": 3.614787599084417e-07, + "loss": 0.87699676, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09375, + "step": 13496, + "time_per_iteration": 2.5573971271514893 + }, + { + "auxiliary_loss_clip": 0.06403383, + "auxiliary_loss_mlp": 0.01264908, + "balance_loss_clip": 0.06270055, + "balance_loss_mlp": 0.01254829, + "epoch": 0.8114835412595821, + "flos": 20344998280320.0, + "grad_norm": 1.6257058100958846, + "language_loss": 0.71732903, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.79401189, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10083008, + "step": 13497, + "time_per_iteration": 3.9020187854766846 + }, + { + "auxiliary_loss_clip": 0.06405488, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.06270734, + "balance_loss_mlp": 0.01255859, + "epoch": 0.8115436645122501, + "flos": 22497269454720.0, + "grad_norm": 1.6450222664154983, + "language_loss": 0.76774496, + "learning_rate": 3.610322329047508e-07, + "loss": 0.84444666, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0881958, + "step": 13498, + "time_per_iteration": 2.53695011138916 + }, + { + "auxiliary_loss_clip": 0.06400445, + "auxiliary_loss_mlp": 0.01265682, + "balance_loss_clip": 0.06268942, + "balance_loss_mlp": 0.01256359, + "epoch": 0.811603787764918, + "flos": 13850477619840.0, + "grad_norm": 1.8314590117714953, + "language_loss": 0.84328604, + "learning_rate": 3.608090626234055e-07, + "loss": 0.91994727, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09313965, + "step": 13499, + "time_per_iteration": 2.4478304386138916 + }, + { + "auxiliary_loss_clip": 0.06405481, + "auxiliary_loss_mlp": 0.01265922, + "balance_loss_clip": 0.06274162, + "balance_loss_mlp": 0.01254311, + "epoch": 0.8116639110175861, + "flos": 21620766620160.0, + "grad_norm": 1.4739026591670814, + "language_loss": 0.76078045, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.83749443, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1161499, + "step": 13500, + "time_per_iteration": 2.5095434188842773 + }, + { + "auxiliary_loss_clip": 0.06305633, + "auxiliary_loss_mlp": 0.01251852, + "balance_loss_clip": 0.06250529, + "balance_loss_mlp": 0.01250827, + "epoch": 0.811724034270254, + "flos": 64481021055360.0, + "grad_norm": 0.7829192652401806, + "language_loss": 0.59720683, + "learning_rate": 3.603629085440303e-07, + "loss": 0.67278171, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01024628, + "step": 13501, + "time_per_iteration": 3.165794610977173 + }, + { + "auxiliary_loss_clip": 0.06395126, + "auxiliary_loss_mlp": 0.01264174, + "balance_loss_clip": 0.06268613, + "balance_loss_mlp": 0.01255257, + "epoch": 0.811784157522922, + "flos": 24761068813440.0, + "grad_norm": 1.4866763661196265, + "language_loss": 0.793163, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.86975598, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08905029, + "step": 13502, + "time_per_iteration": 2.5414836406707764 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.0126301, + "balance_loss_clip": 0.0626989, + "balance_loss_mlp": 0.01254188, + "epoch": 0.81184428077559, + "flos": 12172089611520.0, + "grad_norm": 2.6111507442822086, + "language_loss": 0.71246618, + "learning_rate": 3.599170031654635e-07, + "loss": 0.78908736, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08825684, + "step": 13503, + "time_per_iteration": 2.595961332321167 + }, + { + "auxiliary_loss_clip": 0.06402574, + "auxiliary_loss_mlp": 0.0126551, + "balance_loss_clip": 0.06270054, + "balance_loss_mlp": 0.01255044, + "epoch": 0.8119044040282579, + "flos": 44432621429760.0, + "grad_norm": 1.4625675219914986, + "language_loss": 0.68073899, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.75741982, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10473633, + "step": 13504, + "time_per_iteration": 2.777693271636963 + }, + { + "auxiliary_loss_clip": 0.06402649, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 0.06269131, + "balance_loss_mlp": 0.0125593, + "epoch": 0.8119645272809259, + "flos": 52167131936640.0, + "grad_norm": 1.8871049986927122, + "language_loss": 0.75274026, + "learning_rate": 3.594713465553403e-07, + "loss": 0.82942665, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10070801, + "step": 13505, + "time_per_iteration": 2.7910561561584473 + }, + { + "auxiliary_loss_clip": 0.06404154, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.06272307, + "balance_loss_mlp": 0.01257295, + "epoch": 0.8120246505335939, + "flos": 30241842451200.0, + "grad_norm": 4.0148732645076475, + "language_loss": 0.72911733, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.80583107, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09924316, + "step": 13506, + "time_per_iteration": 2.5993027687072754 + }, + { + "auxiliary_loss_clip": 0.06410645, + "auxiliary_loss_mlp": 0.01265349, + "balance_loss_clip": 0.06271695, + "balance_loss_mlp": 0.01255496, + "epoch": 0.8120847737862619, + "flos": 22134243640320.0, + "grad_norm": 2.0108057093252754, + "language_loss": 0.76670831, + "learning_rate": 3.590259387812593e-07, + "loss": 0.84346819, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09863281, + "step": 13507, + "time_per_iteration": 2.5172982215881348 + }, + { + "auxiliary_loss_clip": 0.06410617, + "auxiliary_loss_mlp": 0.01264037, + "balance_loss_clip": 0.06271885, + "balance_loss_mlp": 0.01253999, + "epoch": 0.8121448970389298, + "flos": 23301963740160.0, + "grad_norm": 1.6354212384469264, + "language_loss": 0.70526397, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.7820105, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10040283, + "step": 13508, + "time_per_iteration": 2.4784016609191895 + }, + { + "auxiliary_loss_clip": 0.06400366, + "auxiliary_loss_mlp": 0.01263654, + "balance_loss_clip": 0.06270534, + "balance_loss_mlp": 0.01254445, + "epoch": 0.8122050202915978, + "flos": 22170734893440.0, + "grad_norm": 1.5714430393800305, + "language_loss": 0.7640515, + "learning_rate": 3.585807799107785e-07, + "loss": 0.84069169, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09204102, + "step": 13509, + "time_per_iteration": 2.489997625350952 + }, + { + "auxiliary_loss_clip": 0.06405313, + "auxiliary_loss_mlp": 0.01263273, + "balance_loss_clip": 0.06270471, + "balance_loss_mlp": 0.01253366, + "epoch": 0.8122651435442657, + "flos": 23265765976320.0, + "grad_norm": 1.7111560106150059, + "language_loss": 0.76858175, + "learning_rate": 3.58358293835491e-07, + "loss": 0.84526753, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09899902, + "step": 13510, + "time_per_iteration": 2.5373711585998535 + }, + { + "auxiliary_loss_clip": 0.06409149, + "auxiliary_loss_mlp": 0.01263873, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01253806, + "epoch": 0.8123252667969337, + "flos": 16144940373120.0, + "grad_norm": 1.6338009615149598, + "language_loss": 0.70005399, + "learning_rate": 3.581358700114212e-07, + "loss": 0.77678418, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10058594, + "step": 13511, + "time_per_iteration": 2.4621431827545166 + }, + { + "auxiliary_loss_clip": 0.06406134, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 0.06270903, + "balance_loss_mlp": 0.01254988, + "epoch": 0.8123853900496016, + "flos": 21250738990080.0, + "grad_norm": 3.4887790010923023, + "language_loss": 0.79486072, + "learning_rate": 3.57913508447004e-07, + "loss": 0.87156576, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09387207, + "step": 13512, + "time_per_iteration": 2.5077874660491943 + }, + { + "auxiliary_loss_clip": 0.06401815, + "auxiliary_loss_mlp": 0.01262813, + "balance_loss_clip": 0.06269997, + "balance_loss_mlp": 0.01253723, + "epoch": 0.8124455133022697, + "flos": 64391156680320.0, + "grad_norm": 1.5302890319846227, + "language_loss": 0.64037752, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.71702385, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09094238, + "step": 13513, + "time_per_iteration": 2.8918113708496094 + }, + { + "auxiliary_loss_clip": 0.06406252, + "auxiliary_loss_mlp": 0.01266377, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01256131, + "epoch": 0.8125056365549376, + "flos": 23849039047680.0, + "grad_norm": 1.8518380601721225, + "language_loss": 0.71717697, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.79390329, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10241699, + "step": 13514, + "time_per_iteration": 2.502861499786377 + }, + { + "auxiliary_loss_clip": 0.06401537, + "auxiliary_loss_mlp": 0.0126663, + "balance_loss_clip": 0.06270736, + "balance_loss_mlp": 0.01257434, + "epoch": 0.8125657598076056, + "flos": 23557109022720.0, + "grad_norm": 1.401731769675591, + "language_loss": 0.63314271, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.70982432, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09197998, + "step": 13515, + "time_per_iteration": 2.5460987091064453 + }, + { + "auxiliary_loss_clip": 0.06395491, + "auxiliary_loss_mlp": 0.01268356, + "balance_loss_clip": 0.06270037, + "balance_loss_mlp": 0.0125932, + "epoch": 0.8126258830602736, + "flos": 20710497790080.0, + "grad_norm": 1.4629712579476926, + "language_loss": 0.75324374, + "learning_rate": 3.570246849544616e-07, + "loss": 0.8298822, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.09033203, + "step": 13516, + "time_per_iteration": 2.4880564212799072 + }, + { + "auxiliary_loss_clip": 0.06403796, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.06268365, + "balance_loss_mlp": 0.01254619, + "epoch": 0.8126860063129415, + "flos": 23624095962240.0, + "grad_norm": 1.3855330172277736, + "language_loss": 0.91489208, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.99157685, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10064697, + "step": 13517, + "time_per_iteration": 2.523481607437134 + }, + { + "auxiliary_loss_clip": 0.06405374, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06272045, + "balance_loss_mlp": 0.01253711, + "epoch": 0.8127461295656095, + "flos": 25013740400640.0, + "grad_norm": 1.3744470429477684, + "language_loss": 0.78856122, + "learning_rate": 3.565806469852244e-07, + "loss": 0.86524576, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09368896, + "step": 13518, + "time_per_iteration": 2.513049602508545 + }, + { + "auxiliary_loss_clip": 0.06401889, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271425, + "balance_loss_mlp": 0.01255799, + "epoch": 0.8128062528182775, + "flos": 27349138673280.0, + "grad_norm": 1.7240881927600378, + "language_loss": 0.79624963, + "learning_rate": 3.56358721474336e-07, + "loss": 0.87291259, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08612061, + "step": 13519, + "time_per_iteration": 3.9774365425109863 + }, + { + "auxiliary_loss_clip": 0.06407484, + "auxiliary_loss_mlp": 0.01262058, + "balance_loss_clip": 0.06272454, + "balance_loss_mlp": 0.01253022, + "epoch": 0.8128663760709455, + "flos": 26513697139200.0, + "grad_norm": 1.5686471804974786, + "language_loss": 0.70565975, + "learning_rate": 3.561368582904905e-07, + "loss": 0.78235519, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09033203, + "step": 13520, + "time_per_iteration": 2.5642969608306885 + }, + { + "auxiliary_loss_clip": 0.06403634, + "auxiliary_loss_mlp": 0.01265169, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01255746, + "epoch": 0.8129264993236134, + "flos": 17937036771840.0, + "grad_norm": 1.3447484311146394, + "language_loss": 0.72752047, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.80420852, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09417725, + "step": 13521, + "time_per_iteration": 2.483443260192871 + }, + { + "auxiliary_loss_clip": 0.0640461, + "auxiliary_loss_mlp": 0.01267618, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01257527, + "epoch": 0.8129866225762814, + "flos": 26184982371840.0, + "grad_norm": 1.5624785217553507, + "language_loss": 0.70352554, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.78024787, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10089111, + "step": 13522, + "time_per_iteration": 2.5880520343780518 + }, + { + "auxiliary_loss_clip": 0.06397097, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06269517, + "balance_loss_mlp": 0.01255464, + "epoch": 0.8130467458289493, + "flos": 21038457870720.0, + "grad_norm": 1.4005848592108407, + "language_loss": 0.70769501, + "learning_rate": 3.554716427853233e-07, + "loss": 0.78431445, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09381104, + "step": 13523, + "time_per_iteration": 2.5072546005249023 + }, + { + "auxiliary_loss_clip": 0.06398432, + "auxiliary_loss_mlp": 0.01262757, + "balance_loss_clip": 0.06268294, + "balance_loss_mlp": 0.01252965, + "epoch": 0.8131068690816173, + "flos": 15492500156160.0, + "grad_norm": 2.331426517879502, + "language_loss": 0.70879388, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.78540576, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09777832, + "step": 13524, + "time_per_iteration": 2.4804911613464355 + }, + { + "auxiliary_loss_clip": 0.06399479, + "auxiliary_loss_mlp": 0.01264007, + "balance_loss_clip": 0.06268516, + "balance_loss_mlp": 0.01254756, + "epoch": 0.8131669923342852, + "flos": 29358924779520.0, + "grad_norm": 1.7261650681481027, + "language_loss": 0.63128257, + "learning_rate": 3.550284775712653e-07, + "loss": 0.70791739, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 13525, + "time_per_iteration": 2.5361483097076416 + }, + { + "auxiliary_loss_clip": 0.06397866, + "auxiliary_loss_mlp": 0.01261329, + "balance_loss_clip": 0.06266545, + "balance_loss_mlp": 0.01251947, + "epoch": 0.8132271155869533, + "flos": 35263883312640.0, + "grad_norm": 1.6825597330397746, + "language_loss": 0.65842247, + "learning_rate": 3.548069885262628e-07, + "loss": 0.73501444, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09381104, + "step": 13526, + "time_per_iteration": 2.6087794303894043 + }, + { + "auxiliary_loss_clip": 0.06400132, + "auxiliary_loss_mlp": 0.01263098, + "balance_loss_clip": 0.06268608, + "balance_loss_mlp": 0.0125408, + "epoch": 0.8132872388396212, + "flos": 27789255843840.0, + "grad_norm": 1.4880547068923822, + "language_loss": 0.75493729, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.83156955, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09020996, + "step": 13527, + "time_per_iteration": 2.539010763168335 + }, + { + "auxiliary_loss_clip": 0.06402984, + "auxiliary_loss_mlp": 0.01266217, + "balance_loss_clip": 0.06270985, + "balance_loss_mlp": 0.01257098, + "epoch": 0.8133473620922892, + "flos": 27827172616320.0, + "grad_norm": 1.9409057063309785, + "language_loss": 0.70657897, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.78327101, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09124756, + "step": 13528, + "time_per_iteration": 2.5862042903900146 + }, + { + "auxiliary_loss_clip": 0.06401546, + "auxiliary_loss_mlp": 0.01260608, + "balance_loss_clip": 0.06268697, + "balance_loss_mlp": 0.01251185, + "epoch": 0.8134074853449572, + "flos": 18995534674560.0, + "grad_norm": 1.667225217482648, + "language_loss": 0.68823183, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.76485336, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09423828, + "step": 13529, + "time_per_iteration": 3.9062068462371826 + }, + { + "auxiliary_loss_clip": 0.06397647, + "auxiliary_loss_mlp": 0.01261144, + "balance_loss_clip": 0.06269309, + "balance_loss_mlp": 0.0125231, + "epoch": 0.8134676085976251, + "flos": 24249646218240.0, + "grad_norm": 1.3410194216884235, + "language_loss": 0.77744162, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.85402954, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08837891, + "step": 13530, + "time_per_iteration": 2.508969306945801 + }, + { + "auxiliary_loss_clip": 0.06397682, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06267507, + "balance_loss_mlp": 0.01255821, + "epoch": 0.8135277318502931, + "flos": 19068391399680.0, + "grad_norm": 1.6036142935304527, + "language_loss": 0.81703323, + "learning_rate": 3.537004792574052e-07, + "loss": 0.89366037, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09216309, + "step": 13531, + "time_per_iteration": 2.465648889541626 + }, + { + "auxiliary_loss_clip": 0.06403959, + "auxiliary_loss_mlp": 0.0126883, + "balance_loss_clip": 0.06269965, + "balance_loss_mlp": 0.01258853, + "epoch": 0.813587855102961, + "flos": 17274617919360.0, + "grad_norm": 2.024023030441739, + "language_loss": 0.72077084, + "learning_rate": 3.534793646536065e-07, + "loss": 0.7974987, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09985352, + "step": 13532, + "time_per_iteration": 3.8888938426971436 + }, + { + "auxiliary_loss_clip": 0.06401416, + "auxiliary_loss_mlp": 0.0126398, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01254896, + "epoch": 0.8136479783556291, + "flos": 20163883680000.0, + "grad_norm": 1.8388062199181954, + "language_loss": 0.77024227, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.84689629, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09082031, + "step": 13533, + "time_per_iteration": 2.495201826095581 + }, + { + "auxiliary_loss_clip": 0.0640831, + "auxiliary_loss_mlp": 0.01262799, + "balance_loss_clip": 0.06271634, + "balance_loss_mlp": 0.0125247, + "epoch": 0.813708101608297, + "flos": 22058535876480.0, + "grad_norm": 1.4349700882895242, + "language_loss": 0.76950037, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.84621155, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10314941, + "step": 13534, + "time_per_iteration": 2.508604049682617 + }, + { + "auxiliary_loss_clip": 0.06400474, + "auxiliary_loss_mlp": 0.01265486, + "balance_loss_clip": 0.06269156, + "balance_loss_mlp": 0.0125685, + "epoch": 0.813768224860965, + "flos": 16177825900800.0, + "grad_norm": 2.1221620950684676, + "language_loss": 0.93678272, + "learning_rate": 3.5281639549310336e-07, + "loss": 1.0134424, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08630371, + "step": 13535, + "time_per_iteration": 2.4925365447998047 + }, + { + "auxiliary_loss_clip": 0.06397314, + "auxiliary_loss_mlp": 0.01265583, + "balance_loss_clip": 0.06270063, + "balance_loss_mlp": 0.01256451, + "epoch": 0.8138283481136329, + "flos": 24359119977600.0, + "grad_norm": 1.584672003718744, + "language_loss": 0.70635736, + "learning_rate": 3.52595530684499e-07, + "loss": 0.78298628, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09130859, + "step": 13536, + "time_per_iteration": 2.5193591117858887 + }, + { + "auxiliary_loss_clip": 0.06398758, + "auxiliary_loss_mlp": 0.01267555, + "balance_loss_clip": 0.06267327, + "balance_loss_mlp": 0.01257744, + "epoch": 0.8138884713663009, + "flos": 25522773154560.0, + "grad_norm": 1.4221719644735906, + "language_loss": 0.75364375, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.83030683, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09814453, + "step": 13537, + "time_per_iteration": 3.939565420150757 + }, + { + "auxiliary_loss_clip": 0.06399003, + "auxiliary_loss_mlp": 0.01263044, + "balance_loss_clip": 0.06270146, + "balance_loss_mlp": 0.01254307, + "epoch": 0.8139485946189688, + "flos": 22460736274560.0, + "grad_norm": 1.471007913892401, + "language_loss": 0.76099801, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.83761841, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08734131, + "step": 13538, + "time_per_iteration": 2.5426995754241943 + }, + { + "auxiliary_loss_clip": 0.06402089, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06269269, + "balance_loss_mlp": 0.01255317, + "epoch": 0.8140087178716369, + "flos": 21256566848640.0, + "grad_norm": 1.5246310927324862, + "language_loss": 0.78052437, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.85719126, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09283447, + "step": 13539, + "time_per_iteration": 2.496209144592285 + }, + { + "auxiliary_loss_clip": 0.06397711, + "auxiliary_loss_mlp": 0.01270691, + "balance_loss_clip": 0.06270097, + "balance_loss_mlp": 0.01261834, + "epoch": 0.8140688411243048, + "flos": 39424179657600.0, + "grad_norm": 2.587253276724192, + "language_loss": 0.66418785, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.74087191, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08856201, + "step": 13540, + "time_per_iteration": 2.634174108505249 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01265102, + "balance_loss_clip": 0.06269908, + "balance_loss_mlp": 0.01256024, + "epoch": 0.8141289643769728, + "flos": 25423781155200.0, + "grad_norm": 1.4733031204112998, + "language_loss": 0.67490125, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.7515741, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09063721, + "step": 13541, + "time_per_iteration": 2.57694149017334 + }, + { + "auxiliary_loss_clip": 0.06399746, + "auxiliary_loss_mlp": 0.01265517, + "balance_loss_clip": 0.06268999, + "balance_loss_mlp": 0.01255527, + "epoch": 0.8141890876296408, + "flos": 12572990271360.0, + "grad_norm": 2.4381124883520404, + "language_loss": 0.69473195, + "learning_rate": 3.512716539904355e-07, + "loss": 0.7713846, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09991455, + "step": 13542, + "time_per_iteration": 2.4687132835388184 + }, + { + "auxiliary_loss_clip": 0.06406985, + "auxiliary_loss_mlp": 0.01266697, + "balance_loss_clip": 0.06269906, + "balance_loss_mlp": 0.0125623, + "epoch": 0.8142492108823087, + "flos": 14971015071360.0, + "grad_norm": 3.9353973875515895, + "language_loss": 0.79934382, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.87608063, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10467529, + "step": 13543, + "time_per_iteration": 2.4158408641815186 + }, + { + "auxiliary_loss_clip": 0.06409101, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01253879, + "epoch": 0.8143093341349767, + "flos": 12426899477760.0, + "grad_norm": 2.3767283525757943, + "language_loss": 0.78172165, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.85845613, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10467529, + "step": 13544, + "time_per_iteration": 2.4718081951141357 + }, + { + "auxiliary_loss_clip": 0.06415416, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06274471, + "balance_loss_mlp": 0.01253859, + "epoch": 0.8143694573876447, + "flos": 11915267247360.0, + "grad_norm": 2.5713851454912557, + "language_loss": 0.74007636, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.81687939, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11022949, + "step": 13545, + "time_per_iteration": 2.464211940765381 + }, + { + "auxiliary_loss_clip": 0.06398509, + "auxiliary_loss_mlp": 0.01265881, + "balance_loss_clip": 0.0627104, + "balance_loss_mlp": 0.01256928, + "epoch": 0.8144295806403127, + "flos": 21218901638400.0, + "grad_norm": 1.5996287062852548, + "language_loss": 0.77183664, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.84848052, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08953857, + "step": 13546, + "time_per_iteration": 2.5111136436462402 + }, + { + "auxiliary_loss_clip": 0.06407703, + "auxiliary_loss_mlp": 0.01264502, + "balance_loss_clip": 0.06273138, + "balance_loss_mlp": 0.01255311, + "epoch": 0.8144897038929806, + "flos": 19871450530560.0, + "grad_norm": 2.7448316541236144, + "language_loss": 0.71193993, + "learning_rate": 3.501701426337178e-07, + "loss": 0.78866196, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09179688, + "step": 13547, + "time_per_iteration": 2.495678186416626 + }, + { + "auxiliary_loss_clip": 0.06408043, + "auxiliary_loss_mlp": 0.01267842, + "balance_loss_clip": 0.06272228, + "balance_loss_mlp": 0.01257775, + "epoch": 0.8145498271456486, + "flos": 24578654474880.0, + "grad_norm": 1.7869845648084397, + "language_loss": 0.71165389, + "learning_rate": 3.49950028014111e-07, + "loss": 0.78841269, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10070801, + "step": 13548, + "time_per_iteration": 2.562206506729126 + }, + { + "auxiliary_loss_clip": 0.06407051, + "auxiliary_loss_mlp": 0.01261806, + "balance_loss_clip": 0.06273579, + "balance_loss_mlp": 0.01251733, + "epoch": 0.8146099503983165, + "flos": 20199159048960.0, + "grad_norm": 1.9522520316462837, + "language_loss": 0.77203232, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.84872091, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10083008, + "step": 13549, + "time_per_iteration": 2.502742290496826 + }, + { + "auxiliary_loss_clip": 0.06405576, + "auxiliary_loss_mlp": 0.0126447, + "balance_loss_clip": 0.06270814, + "balance_loss_mlp": 0.01254707, + "epoch": 0.8146700736509845, + "flos": 19543071179520.0, + "grad_norm": 1.8670916613162452, + "language_loss": 0.71610808, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.79280859, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09753418, + "step": 13550, + "time_per_iteration": 2.6039505004882812 + }, + { + "auxiliary_loss_clip": 0.06397806, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.06269183, + "balance_loss_mlp": 0.01255654, + "epoch": 0.8147301969036524, + "flos": 18047265217920.0, + "grad_norm": 1.6838631897121676, + "language_loss": 0.71859229, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.79520994, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08300781, + "step": 13551, + "time_per_iteration": 2.551734447479248 + }, + { + "auxiliary_loss_clip": 0.06410958, + "auxiliary_loss_mlp": 0.01264146, + "balance_loss_clip": 0.06273584, + "balance_loss_mlp": 0.0125393, + "epoch": 0.8147903201563205, + "flos": 18010606256640.0, + "grad_norm": 1.848963719334946, + "language_loss": 0.69100463, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.76775569, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10217285, + "step": 13552, + "time_per_iteration": 2.5665345191955566 + }, + { + "auxiliary_loss_clip": 0.06406602, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.0627239, + "balance_loss_mlp": 0.01254492, + "epoch": 0.8148504434089884, + "flos": 20264343125760.0, + "grad_norm": 1.7694730597096269, + "language_loss": 0.82387245, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.90058064, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09729004, + "step": 13553, + "time_per_iteration": 2.450315475463867 + }, + { + "auxiliary_loss_clip": 0.06406596, + "auxiliary_loss_mlp": 0.01265423, + "balance_loss_clip": 0.06271842, + "balance_loss_mlp": 0.01255916, + "epoch": 0.8149105666616564, + "flos": 12499588494720.0, + "grad_norm": 1.7106764714025673, + "language_loss": 0.68241465, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.75913489, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09509277, + "step": 13554, + "time_per_iteration": 2.462124824523926 + }, + { + "auxiliary_loss_clip": 0.06406562, + "auxiliary_loss_mlp": 0.01265488, + "balance_loss_clip": 0.06274106, + "balance_loss_mlp": 0.01255624, + "epoch": 0.8149706899143244, + "flos": 32531609376000.0, + "grad_norm": 1.6041901948473798, + "language_loss": 0.6636458, + "learning_rate": 3.484109781056723e-07, + "loss": 0.74036634, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09857178, + "step": 13555, + "time_per_iteration": 2.553244113922119 + }, + { + "auxiliary_loss_clip": 0.0640707, + "auxiliary_loss_mlp": 0.01264187, + "balance_loss_clip": 0.06269799, + "balance_loss_mlp": 0.01254352, + "epoch": 0.8150308131669923, + "flos": 19391362162560.0, + "grad_norm": 1.6698699385134061, + "language_loss": 0.74007624, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.81678879, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.09838867, + "step": 13556, + "time_per_iteration": 2.457014322280884 + }, + { + "auxiliary_loss_clip": 0.064043, + "auxiliary_loss_mlp": 0.01263626, + "balance_loss_clip": 0.06272946, + "balance_loss_mlp": 0.0125465, + "epoch": 0.8150909364196604, + "flos": 17427249331200.0, + "grad_norm": 1.7345154652881483, + "language_loss": 0.8086679, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.88534719, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08978271, + "step": 13557, + "time_per_iteration": 2.460977077484131 + }, + { + "auxiliary_loss_clip": 0.06408045, + "auxiliary_loss_mlp": 0.01267038, + "balance_loss_clip": 0.06272027, + "balance_loss_mlp": 0.01256923, + "epoch": 0.8151510596723283, + "flos": 27170246206080.0, + "grad_norm": 1.581815205811392, + "language_loss": 0.65745318, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.73420399, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10119629, + "step": 13558, + "time_per_iteration": 2.5265209674835205 + }, + { + "auxiliary_loss_clip": 0.06310294, + "auxiliary_loss_mlp": 0.01251766, + "balance_loss_clip": 0.06255711, + "balance_loss_mlp": 0.0125069, + "epoch": 0.8152111829249963, + "flos": 64236581896320.0, + "grad_norm": 1.1628242103674278, + "language_loss": 0.56932402, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.64494467, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01077271, + "step": 13559, + "time_per_iteration": 4.45433497428894 + }, + { + "auxiliary_loss_clip": 0.06308051, + "auxiliary_loss_mlp": 0.01250118, + "balance_loss_clip": 0.06253337, + "balance_loss_mlp": 0.01249046, + "epoch": 0.8152713061776642, + "flos": 67091201193600.0, + "grad_norm": 0.6673892366494375, + "language_loss": 0.55275512, + "learning_rate": 3.473135354283334e-07, + "loss": 0.62833685, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01073456, + "step": 13560, + "time_per_iteration": 2.997331380844116 + }, + { + "auxiliary_loss_clip": 0.06405302, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01257703, + "epoch": 0.8153314294303322, + "flos": 14396169335040.0, + "grad_norm": 1.5961356559953426, + "language_loss": 0.67774737, + "learning_rate": 3.470942348696948e-07, + "loss": 0.7544682, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09082031, + "step": 13561, + "time_per_iteration": 2.578291416168213 + }, + { + "auxiliary_loss_clip": 0.06410162, + "auxiliary_loss_mlp": 0.01264267, + "balance_loss_clip": 0.06272513, + "balance_loss_mlp": 0.0125423, + "epoch": 0.8153915526830001, + "flos": 25629563583360.0, + "grad_norm": 1.4593268747943478, + "language_loss": 0.81970775, + "learning_rate": 3.468749969894085e-07, + "loss": 0.89645207, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10040283, + "step": 13562, + "time_per_iteration": 2.5358498096466064 + }, + { + "auxiliary_loss_clip": 0.06404186, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01255689, + "epoch": 0.8154516759356681, + "flos": 23376120203520.0, + "grad_norm": 1.50215259842858, + "language_loss": 0.71958882, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.79628438, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09680176, + "step": 13563, + "time_per_iteration": 2.5700597763061523 + }, + { + "auxiliary_loss_clip": 0.06403195, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 0.06270723, + "balance_loss_mlp": 0.0125963, + "epoch": 0.815511799188336, + "flos": 28157019413760.0, + "grad_norm": 1.7257040784897213, + "language_loss": 0.70323086, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.77996063, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10144043, + "step": 13564, + "time_per_iteration": 2.579045295715332 + }, + { + "auxiliary_loss_clip": 0.06402478, + "auxiliary_loss_mlp": 0.01261987, + "balance_loss_clip": 0.06269638, + "balance_loss_mlp": 0.01252862, + "epoch": 0.8155719224410041, + "flos": 16989186585600.0, + "grad_norm": 1.70957475209218, + "language_loss": 0.70465791, + "learning_rate": 3.462176595017854e-07, + "loss": 0.78130251, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09130859, + "step": 13565, + "time_per_iteration": 2.482426166534424 + }, + { + "auxiliary_loss_clip": 0.0640111, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06269612, + "balance_loss_mlp": 0.01255346, + "epoch": 0.815632045693672, + "flos": 24688757139840.0, + "grad_norm": 2.037805159050188, + "language_loss": 0.79566395, + "learning_rate": 3.459986724180188e-07, + "loss": 0.87232494, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09655762, + "step": 13566, + "time_per_iteration": 2.51450514793396 + }, + { + "auxiliary_loss_clip": 0.06398387, + "auxiliary_loss_mlp": 0.01263188, + "balance_loss_clip": 0.06270022, + "balance_loss_mlp": 0.01253991, + "epoch": 0.81569216894634, + "flos": 19944516890880.0, + "grad_norm": 1.680610729726936, + "language_loss": 0.8259697, + "learning_rate": 3.457797480541491e-07, + "loss": 0.90258545, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09197998, + "step": 13567, + "time_per_iteration": 2.5065062046051025 + }, + { + "auxiliary_loss_clip": 0.063999, + "auxiliary_loss_mlp": 0.01263286, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01254661, + "epoch": 0.8157522921990079, + "flos": 21805948143360.0, + "grad_norm": 1.901722812011985, + "language_loss": 0.79928589, + "learning_rate": 3.455608864184771e-07, + "loss": 0.87591779, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08624268, + "step": 13568, + "time_per_iteration": 2.482262372970581 + }, + { + "auxiliary_loss_clip": 0.06399144, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270231, + "balance_loss_mlp": 0.01257118, + "epoch": 0.8158124154516759, + "flos": 18513098392320.0, + "grad_norm": 1.6787478080624303, + "language_loss": 0.77251327, + "learning_rate": 3.453420875193016e-07, + "loss": 0.84916508, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.0892334, + "step": 13569, + "time_per_iteration": 3.9400181770324707 + }, + { + "auxiliary_loss_clip": 0.06403175, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06272935, + "balance_loss_mlp": 0.01254067, + "epoch": 0.815872538704344, + "flos": 26837590296960.0, + "grad_norm": 2.286730013168615, + "language_loss": 0.58822525, + "learning_rate": 3.451233513649199e-07, + "loss": 0.66488886, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09118652, + "step": 13570, + "time_per_iteration": 2.524815082550049 + }, + { + "auxiliary_loss_clip": 0.0640761, + "auxiliary_loss_mlp": 0.01263763, + "balance_loss_clip": 0.06271629, + "balance_loss_mlp": 0.01253577, + "epoch": 0.8159326619570119, + "flos": 21732127096320.0, + "grad_norm": 1.6002303397111248, + "language_loss": 0.82693851, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.90365231, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10192871, + "step": 13571, + "time_per_iteration": 2.497116804122925 + }, + { + "auxiliary_loss_clip": 0.06401446, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.0125621, + "epoch": 0.8159927852096799, + "flos": 13845152885760.0, + "grad_norm": 2.3316068768824905, + "language_loss": 0.79288316, + "learning_rate": 3.446860673237142e-07, + "loss": 0.86956525, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10546875, + "step": 13572, + "time_per_iteration": 3.9277310371398926 + }, + { + "auxiliary_loss_clip": 0.06405439, + "auxiliary_loss_mlp": 0.01264472, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01254965, + "epoch": 0.8160529084623478, + "flos": 24506552436480.0, + "grad_norm": 1.8410369456410705, + "language_loss": 0.65139991, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.72809899, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09509277, + "step": 13573, + "time_per_iteration": 2.5813302993774414 + }, + { + "auxiliary_loss_clip": 0.06397152, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_clip": 0.06268078, + "balance_loss_mlp": 0.01257193, + "epoch": 0.8161130317150158, + "flos": 24833170851840.0, + "grad_norm": 3.2728754081568443, + "language_loss": 0.75079989, + "learning_rate": 3.442490343611868e-07, + "loss": 0.8274284, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08508301, + "step": 13574, + "time_per_iteration": 2.520437002182007 + }, + { + "auxiliary_loss_clip": 0.06406549, + "auxiliary_loss_mlp": 0.01263703, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01253737, + "epoch": 0.8161731549676837, + "flos": 30964497989760.0, + "grad_norm": 1.5623209445924822, + "language_loss": 0.5998435, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.67654604, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09973145, + "step": 13575, + "time_per_iteration": 2.5688116550445557 + }, + { + "auxiliary_loss_clip": 0.06401668, + "auxiliary_loss_mlp": 0.01262946, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.01253445, + "epoch": 0.8162332782203517, + "flos": 18557975053440.0, + "grad_norm": 2.3600977728532846, + "language_loss": 0.7450968, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.82174295, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09503174, + "step": 13576, + "time_per_iteration": 2.4884495735168457 + }, + { + "auxiliary_loss_clip": 0.06308542, + "auxiliary_loss_mlp": 0.01250123, + "balance_loss_clip": 0.06253725, + "balance_loss_mlp": 0.01249126, + "epoch": 0.8162934014730197, + "flos": 70405700025600.0, + "grad_norm": 0.8084788466791542, + "language_loss": 0.58613569, + "learning_rate": 3.435939558349155e-07, + "loss": 0.6617223, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00996399, + "step": 13577, + "time_per_iteration": 4.5383522510528564 + }, + { + "auxiliary_loss_clip": 0.06398452, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01255912, + "epoch": 0.8163535247256877, + "flos": 21221165698560.0, + "grad_norm": 1.6710813942162877, + "language_loss": 0.70834422, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.78497839, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09039307, + "step": 13578, + "time_per_iteration": 2.4779903888702393 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01264719, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01255433, + "epoch": 0.8164136479783556, + "flos": 21104061217920.0, + "grad_norm": 1.6470970354914776, + "language_loss": 0.73678112, + "learning_rate": 3.431575508590172e-07, + "loss": 0.81347507, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09283447, + "step": 13579, + "time_per_iteration": 2.509214162826538 + }, + { + "auxiliary_loss_clip": 0.06405793, + "auxiliary_loss_mlp": 0.01262409, + "balance_loss_clip": 0.06271651, + "balance_loss_mlp": 0.01253433, + "epoch": 0.8164737712310236, + "flos": 21726215383680.0, + "grad_norm": 1.6525660309020993, + "language_loss": 0.79023516, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.86691713, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08978271, + "step": 13580, + "time_per_iteration": 2.481717109680176 + }, + { + "auxiliary_loss_clip": 0.06397673, + "auxiliary_loss_mlp": 0.01267208, + "balance_loss_clip": 0.0626862, + "balance_loss_mlp": 0.01258103, + "epoch": 0.8165338944836915, + "flos": 19542903471360.0, + "grad_norm": 1.6359776593640634, + "language_loss": 0.68975896, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.76640779, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0909729, + "step": 13581, + "time_per_iteration": 2.4774811267852783 + }, + { + "auxiliary_loss_clip": 0.06403127, + "auxiliary_loss_mlp": 0.01263284, + "balance_loss_clip": 0.06270953, + "balance_loss_mlp": 0.01254487, + "epoch": 0.8165940177363595, + "flos": 22934996784000.0, + "grad_norm": 1.6436764796534944, + "language_loss": 0.60097897, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.67764312, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08795166, + "step": 13582, + "time_per_iteration": 2.5118255615234375 + }, + { + "auxiliary_loss_clip": 0.06393835, + "auxiliary_loss_mlp": 0.0126456, + "balance_loss_clip": 0.06269538, + "balance_loss_mlp": 0.01256323, + "epoch": 0.8166541409890276, + "flos": 23377545722880.0, + "grad_norm": 1.3287136998810383, + "language_loss": 0.82430953, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.90089345, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.0824585, + "step": 13583, + "time_per_iteration": 2.5597774982452393 + }, + { + "auxiliary_loss_clip": 0.06401314, + "auxiliary_loss_mlp": 0.01261966, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01252942, + "epoch": 0.8167142642416955, + "flos": 18447872388480.0, + "grad_norm": 1.5767524844469751, + "language_loss": 0.74625087, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.82288373, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09020996, + "step": 13584, + "time_per_iteration": 2.4725546836853027 + }, + { + "auxiliary_loss_clip": 0.06409091, + "auxiliary_loss_mlp": 0.01262966, + "balance_loss_clip": 0.06275168, + "balance_loss_mlp": 0.01253178, + "epoch": 0.8167743874943635, + "flos": 21221333406720.0, + "grad_norm": 1.5450053783632358, + "language_loss": 0.74571323, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.82243377, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09790039, + "step": 13585, + "time_per_iteration": 2.5259850025177 + }, + { + "auxiliary_loss_clip": 0.06405304, + "auxiliary_loss_mlp": 0.01265456, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01255598, + "epoch": 0.8168345107470314, + "flos": 18703646576640.0, + "grad_norm": 1.6400360392779838, + "language_loss": 0.70096934, + "learning_rate": 3.416321129478068e-07, + "loss": 0.77767694, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09857178, + "step": 13586, + "time_per_iteration": 2.4900436401367188 + }, + { + "auxiliary_loss_clip": 0.06405935, + "auxiliary_loss_mlp": 0.01267633, + "balance_loss_clip": 0.06273375, + "balance_loss_mlp": 0.01258442, + "epoch": 0.8168946339996994, + "flos": 16258648763520.0, + "grad_norm": 1.5247146211880829, + "language_loss": 0.61139441, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.68813008, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09191895, + "step": 13587, + "time_per_iteration": 2.514977216720581 + }, + { + "auxiliary_loss_clip": 0.0641003, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06272879, + "balance_loss_mlp": 0.01255757, + "epoch": 0.8169547572523673, + "flos": 26948615356800.0, + "grad_norm": 2.301034308927258, + "language_loss": 0.69020987, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.7669667, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09887695, + "step": 13588, + "time_per_iteration": 2.5375754833221436 + }, + { + "auxiliary_loss_clip": 0.06406662, + "auxiliary_loss_mlp": 0.01272493, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.0126242, + "epoch": 0.8170148805050353, + "flos": 18958204880640.0, + "grad_norm": 1.5284621283458033, + "language_loss": 0.73197293, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.80876452, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10070801, + "step": 13589, + "time_per_iteration": 2.517772674560547 + }, + { + "auxiliary_loss_clip": 0.06399844, + "auxiliary_loss_mlp": 0.01267676, + "balance_loss_clip": 0.06269893, + "balance_loss_mlp": 0.0125786, + "epoch": 0.8170750037577033, + "flos": 21841307366400.0, + "grad_norm": 1.7674157156585606, + "language_loss": 0.73466247, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.81133771, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09820557, + "step": 13590, + "time_per_iteration": 2.4888107776641846 + }, + { + "auxiliary_loss_clip": 0.06408446, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01256833, + "epoch": 0.8171351270103713, + "flos": 33514986493440.0, + "grad_norm": 1.8858247117206646, + "language_loss": 0.65332603, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.7300809, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10198975, + "step": 13591, + "time_per_iteration": 2.658235788345337 + }, + { + "auxiliary_loss_clip": 0.06408292, + "auxiliary_loss_mlp": 0.01266694, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01256693, + "epoch": 0.8171952502630392, + "flos": 22714330256640.0, + "grad_norm": 2.6750207052174817, + "language_loss": 0.68109965, + "learning_rate": 3.403270471641373e-07, + "loss": 0.75784951, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10003662, + "step": 13592, + "time_per_iteration": 2.569607973098755 + }, + { + "auxiliary_loss_clip": 0.06402638, + "auxiliary_loss_mlp": 0.01263684, + "balance_loss_clip": 0.06269838, + "balance_loss_mlp": 0.01253897, + "epoch": 0.8172553735157072, + "flos": 26730883722240.0, + "grad_norm": 1.8292699977541562, + "language_loss": 0.66788435, + "learning_rate": 3.401097564244759e-07, + "loss": 0.74454749, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09790039, + "step": 13593, + "time_per_iteration": 2.6358397006988525 + }, + { + "auxiliary_loss_clip": 0.06402188, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.01254118, + "epoch": 0.8173154967683751, + "flos": 15966551030400.0, + "grad_norm": 1.8879994801878386, + "language_loss": 0.69759774, + "learning_rate": 3.398925286280188e-07, + "loss": 0.77424812, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08740234, + "step": 13594, + "time_per_iteration": 2.4728150367736816 + }, + { + "auxiliary_loss_clip": 0.06406444, + "auxiliary_loss_mlp": 0.01262646, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253229, + "epoch": 0.8173756200210431, + "flos": 25992547470720.0, + "grad_norm": 1.7768009841467751, + "language_loss": 0.66399467, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.74068558, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09417725, + "step": 13595, + "time_per_iteration": 2.581303834915161 + }, + { + "auxiliary_loss_clip": 0.0641185, + "auxiliary_loss_mlp": 0.01271254, + "balance_loss_clip": 0.06272434, + "balance_loss_mlp": 0.01261771, + "epoch": 0.8174357432737112, + "flos": 25671211862400.0, + "grad_norm": 1.436739203563198, + "language_loss": 0.78803599, + "learning_rate": 3.394582618976658e-07, + "loss": 0.86486703, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.09490967, + "step": 13596, + "time_per_iteration": 2.5674192905426025 + }, + { + "auxiliary_loss_clip": 0.06401101, + "auxiliary_loss_mlp": 0.01264044, + "balance_loss_clip": 0.06269896, + "balance_loss_mlp": 0.01254245, + "epoch": 0.8174958665263791, + "flos": 21841517001600.0, + "grad_norm": 4.608400276683875, + "language_loss": 0.58776182, + "learning_rate": 3.392412229802362e-07, + "loss": 0.66441321, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09802246, + "step": 13597, + "time_per_iteration": 2.5309157371520996 + }, + { + "auxiliary_loss_clip": 0.06398574, + "auxiliary_loss_mlp": 0.01266127, + "balance_loss_clip": 0.06269415, + "balance_loss_mlp": 0.01257443, + "epoch": 0.8175559897790471, + "flos": 22462077939840.0, + "grad_norm": 1.411078794675908, + "language_loss": 0.82824457, + "learning_rate": 3.390242470389462e-07, + "loss": 0.90489155, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.0869751, + "step": 13598, + "time_per_iteration": 2.494666337966919 + }, + { + "auxiliary_loss_clip": 0.06402759, + "auxiliary_loss_mlp": 0.01265938, + "balance_loss_clip": 0.06267741, + "balance_loss_mlp": 0.01256384, + "epoch": 0.817616113031715, + "flos": 23621328777600.0, + "grad_norm": 1.988288541952237, + "language_loss": 0.82828057, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.90496761, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09552002, + "step": 13599, + "time_per_iteration": 3.9362494945526123 + }, + { + "auxiliary_loss_clip": 0.0639835, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06268303, + "balance_loss_mlp": 0.0125532, + "epoch": 0.817676236284383, + "flos": 27679572449280.0, + "grad_norm": 1.9427559574144415, + "language_loss": 0.84026325, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.91688854, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08850098, + "step": 13600, + "time_per_iteration": 2.575437068939209 + }, + { + "auxiliary_loss_clip": 0.06406076, + "auxiliary_loss_mlp": 0.01265545, + "balance_loss_clip": 0.06270483, + "balance_loss_mlp": 0.01255495, + "epoch": 0.8177363595370509, + "flos": 24687918599040.0, + "grad_norm": 1.5668233698326273, + "language_loss": 0.73828596, + "learning_rate": 3.383736971541766e-07, + "loss": 0.8150022, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10046387, + "step": 13601, + "time_per_iteration": 2.583362579345703 + }, + { + "auxiliary_loss_clip": 0.06410781, + "auxiliary_loss_mlp": 0.01263621, + "balance_loss_clip": 0.06272674, + "balance_loss_mlp": 0.01253757, + "epoch": 0.817796482789719, + "flos": 17351835056640.0, + "grad_norm": 2.6342445376151042, + "language_loss": 0.68994367, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.76668769, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.09863281, + "step": 13602, + "time_per_iteration": 2.4818198680877686 + }, + { + "auxiliary_loss_clip": 0.06398925, + "auxiliary_loss_mlp": 0.01264753, + "balance_loss_clip": 0.06269614, + "balance_loss_mlp": 0.01255664, + "epoch": 0.8178566060423869, + "flos": 17783105621760.0, + "grad_norm": 2.059095926222651, + "language_loss": 0.84576654, + "learning_rate": 3.379403122624718e-07, + "loss": 0.92240334, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09088135, + "step": 13603, + "time_per_iteration": 2.4598805904388428 + }, + { + "auxiliary_loss_clip": 0.06402913, + "auxiliary_loss_mlp": 0.01264877, + "balance_loss_clip": 0.06270468, + "balance_loss_mlp": 0.01255656, + "epoch": 0.8179167292950549, + "flos": 24980267894400.0, + "grad_norm": 1.5537552775340278, + "language_loss": 0.6937784, + "learning_rate": 3.377237143507159e-07, + "loss": 0.77045631, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09216309, + "step": 13604, + "time_per_iteration": 2.5589122772216797 + }, + { + "auxiliary_loss_clip": 0.06399256, + "auxiliary_loss_mlp": 0.01266705, + "balance_loss_clip": 0.06269272, + "balance_loss_mlp": 0.01257561, + "epoch": 0.8179768525477228, + "flos": 22863397870080.0, + "grad_norm": 1.66498006246138, + "language_loss": 0.74241424, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.8190738, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09143066, + "step": 13605, + "time_per_iteration": 2.490889549255371 + }, + { + "auxiliary_loss_clip": 0.06400125, + "auxiliary_loss_mlp": 0.0126796, + "balance_loss_clip": 0.06271368, + "balance_loss_mlp": 0.01257791, + "epoch": 0.8180369758003908, + "flos": 18521064529920.0, + "grad_norm": 1.9159252087251424, + "language_loss": 0.74754506, + "learning_rate": 3.372907076364666e-07, + "loss": 0.8242259, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.10168457, + "step": 13606, + "time_per_iteration": 2.492121696472168 + }, + { + "auxiliary_loss_clip": 0.0640065, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01261566, + "epoch": 0.8180970990530587, + "flos": 33190422503040.0, + "grad_norm": 2.3548971551907916, + "language_loss": 0.65977269, + "learning_rate": 3.370742988503916e-07, + "loss": 0.73648506, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09020996, + "step": 13607, + "time_per_iteration": 2.5886800289154053 + }, + { + "auxiliary_loss_clip": 0.06403854, + "auxiliary_loss_mlp": 0.01264189, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01254449, + "epoch": 0.8181572223057267, + "flos": 25017094563840.0, + "grad_norm": 1.7602024891247647, + "language_loss": 0.70355219, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.78023267, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09741211, + "step": 13608, + "time_per_iteration": 3.953319549560547 + }, + { + "auxiliary_loss_clip": 0.06399265, + "auxiliary_loss_mlp": 0.01266613, + "balance_loss_clip": 0.06268296, + "balance_loss_mlp": 0.01257154, + "epoch": 0.8182173455583948, + "flos": 28556326846080.0, + "grad_norm": 1.587446090270585, + "language_loss": 0.79743207, + "learning_rate": 3.366416704613735e-07, + "loss": 0.87409091, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09454346, + "step": 13609, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.06308096, + "auxiliary_loss_mlp": 0.01250941, + "balance_loss_clip": 0.0625338, + "balance_loss_mlp": 0.01249896, + "epoch": 0.8182774688110627, + "flos": 72047051729280.0, + "grad_norm": 0.7345769255501511, + "language_loss": 0.55927861, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.63486898, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0104599, + "step": 13610, + "time_per_iteration": 3.2368791103363037 + }, + { + "auxiliary_loss_clip": 0.06394055, + "auxiliary_loss_mlp": 0.01265977, + "balance_loss_clip": 0.06268248, + "balance_loss_mlp": 0.01257078, + "epoch": 0.8183375920637307, + "flos": 19761431719680.0, + "grad_norm": 1.6752147679341796, + "language_loss": 0.78055751, + "learning_rate": 3.362092943712107e-07, + "loss": 0.85715789, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08892822, + "step": 13611, + "time_per_iteration": 2.5044984817504883 + }, + { + "auxiliary_loss_clip": 0.06411519, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 0.06271686, + "balance_loss_mlp": 0.01255467, + "epoch": 0.8183977153163986, + "flos": 22347740643840.0, + "grad_norm": 1.936289550368914, + "language_loss": 0.77789629, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.85467196, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10577393, + "step": 13612, + "time_per_iteration": 3.9493825435638428 + }, + { + "auxiliary_loss_clip": 0.06397919, + "auxiliary_loss_mlp": 0.01263793, + "balance_loss_clip": 0.0626799, + "balance_loss_mlp": 0.01254733, + "epoch": 0.8184578385690666, + "flos": 17718256961280.0, + "grad_norm": 1.9954357370848774, + "language_loss": 0.86433131, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.94094843, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09063721, + "step": 13613, + "time_per_iteration": 2.493557929992676 + }, + { + "auxiliary_loss_clip": 0.0640028, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06271352, + "balance_loss_mlp": 0.01254687, + "epoch": 0.8185179618217345, + "flos": 25707996604800.0, + "grad_norm": 1.7004353778600403, + "language_loss": 0.73161137, + "learning_rate": 3.355612034397746e-07, + "loss": 0.8082509, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08990479, + "step": 13614, + "time_per_iteration": 2.5492005348205566 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01266903, + "balance_loss_clip": 0.06267008, + "balance_loss_mlp": 0.01257551, + "epoch": 0.8185780850744026, + "flos": 25967837715840.0, + "grad_norm": 2.1633186140321583, + "language_loss": 0.81232059, + "learning_rate": 3.353452993497479e-07, + "loss": 0.88900155, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09344482, + "step": 13615, + "time_per_iteration": 2.569638967514038 + }, + { + "auxiliary_loss_clip": 0.0640194, + "auxiliary_loss_mlp": 0.01265752, + "balance_loss_clip": 0.06269952, + "balance_loss_mlp": 0.01256049, + "epoch": 0.8186382083270705, + "flos": 25235455104000.0, + "grad_norm": 2.0233105033334158, + "language_loss": 0.75650156, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.83317852, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09698486, + "step": 13616, + "time_per_iteration": 3.942837715148926 + }, + { + "auxiliary_loss_clip": 0.06401451, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06271508, + "balance_loss_mlp": 0.01256361, + "epoch": 0.8186983315797385, + "flos": 22420890858240.0, + "grad_norm": 1.6571547627109076, + "language_loss": 0.75235343, + "learning_rate": 3.349136805494979e-07, + "loss": 0.82902998, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09838867, + "step": 13617, + "time_per_iteration": 2.4979913234710693 + }, + { + "auxiliary_loss_clip": 0.06399617, + "auxiliary_loss_mlp": 0.01267076, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01257968, + "epoch": 0.8187584548324064, + "flos": 22024560245760.0, + "grad_norm": 1.7428000144990041, + "language_loss": 0.68450582, + "learning_rate": 3.346979658556415e-07, + "loss": 0.76117277, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09112549, + "step": 13618, + "time_per_iteration": 2.58243465423584 + }, + { + "auxiliary_loss_clip": 0.06411767, + "auxiliary_loss_mlp": 0.01263534, + "balance_loss_clip": 0.06273052, + "balance_loss_mlp": 0.01253604, + "epoch": 0.8188185780850744, + "flos": 29249325239040.0, + "grad_norm": 1.8955704094009027, + "language_loss": 0.69656849, + "learning_rate": 3.344823143102058e-07, + "loss": 0.77332145, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09936523, + "step": 13619, + "time_per_iteration": 2.552861452102661 + }, + { + "auxiliary_loss_clip": 0.06405166, + "auxiliary_loss_mlp": 0.01267919, + "balance_loss_clip": 0.06271726, + "balance_loss_mlp": 0.01258257, + "epoch": 0.8188787013377423, + "flos": 20701483476480.0, + "grad_norm": 2.032902910475726, + "language_loss": 0.74368906, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.82041991, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09661865, + "step": 13620, + "time_per_iteration": 2.4927451610565186 + }, + { + "auxiliary_loss_clip": 0.06397671, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.06268847, + "balance_loss_mlp": 0.01256515, + "epoch": 0.8189388245904103, + "flos": 23739816850560.0, + "grad_norm": 1.5173921020881993, + "language_loss": 0.76409143, + "learning_rate": 3.340512006973011e-07, + "loss": 0.84072381, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09051514, + "step": 13621, + "time_per_iteration": 2.4968793392181396 + }, + { + "auxiliary_loss_clip": 0.06401004, + "auxiliary_loss_mlp": 0.01262724, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01252425, + "epoch": 0.8189989478430784, + "flos": 28262342396160.0, + "grad_norm": 2.0156762185325934, + "language_loss": 0.66266668, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.73930395, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10302734, + "step": 13622, + "time_per_iteration": 2.556950807571411 + }, + { + "auxiliary_loss_clip": 0.06408148, + "auxiliary_loss_mlp": 0.01264921, + "balance_loss_clip": 0.06274983, + "balance_loss_mlp": 0.01255563, + "epoch": 0.8190590710957463, + "flos": 21404125088640.0, + "grad_norm": 1.7883534032676356, + "language_loss": 0.75312483, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.8298555, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09356689, + "step": 13623, + "time_per_iteration": 2.499490261077881 + }, + { + "auxiliary_loss_clip": 0.06404785, + "auxiliary_loss_mlp": 0.01265588, + "balance_loss_clip": 0.06270933, + "balance_loss_mlp": 0.01256075, + "epoch": 0.8191191943484143, + "flos": 38804960384640.0, + "grad_norm": 1.8675492206945747, + "language_loss": 0.63666874, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.71337247, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09515381, + "step": 13624, + "time_per_iteration": 2.6544313430786133 + }, + { + "auxiliary_loss_clip": 0.06400229, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01256104, + "epoch": 0.8191793176010822, + "flos": 25453438300800.0, + "grad_norm": 1.460649877308724, + "language_loss": 0.78395194, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.86060411, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08880615, + "step": 13625, + "time_per_iteration": 2.5359597206115723 + }, + { + "auxiliary_loss_clip": 0.06410608, + "auxiliary_loss_mlp": 0.01263881, + "balance_loss_clip": 0.06269354, + "balance_loss_mlp": 0.01254035, + "epoch": 0.8192394408537502, + "flos": 25090118997120.0, + "grad_norm": 1.884478371292304, + "language_loss": 0.75783712, + "learning_rate": 3.329745223345244e-07, + "loss": 0.83458203, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.09851074, + "step": 13626, + "time_per_iteration": 2.532646656036377 + }, + { + "auxiliary_loss_clip": 0.06401683, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06270789, + "balance_loss_mlp": 0.01256079, + "epoch": 0.8192995641064181, + "flos": 27681291457920.0, + "grad_norm": 1.4150920843677999, + "language_loss": 0.7395972, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.81626576, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09100342, + "step": 13627, + "time_per_iteration": 2.5652401447296143 + }, + { + "auxiliary_loss_clip": 0.06406218, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 0.06270798, + "balance_loss_mlp": 0.01255522, + "epoch": 0.8193596873590862, + "flos": 21294944818560.0, + "grad_norm": 1.5860896739474306, + "language_loss": 0.68839937, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.76511979, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10308838, + "step": 13628, + "time_per_iteration": 2.5479671955108643 + }, + { + "auxiliary_loss_clip": 0.0641032, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 0.0627242, + "balance_loss_mlp": 0.01254702, + "epoch": 0.8194198106117541, + "flos": 17498219339520.0, + "grad_norm": 1.489340257893301, + "language_loss": 0.85434711, + "learning_rate": 3.323292738168171e-07, + "loss": 0.93110228, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10516357, + "step": 13629, + "time_per_iteration": 2.483988046646118 + }, + { + "auxiliary_loss_clip": 0.06403497, + "auxiliary_loss_mlp": 0.01264453, + "balance_loss_clip": 0.06271183, + "balance_loss_mlp": 0.01255209, + "epoch": 0.8194799338644221, + "flos": 15273301075200.0, + "grad_norm": 2.0412434679276203, + "language_loss": 0.74637675, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.82305628, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09246826, + "step": 13630, + "time_per_iteration": 2.535431146621704 + }, + { + "auxiliary_loss_clip": 0.06406824, + "auxiliary_loss_mlp": 0.0126407, + "balance_loss_clip": 0.06272252, + "balance_loss_mlp": 0.01254468, + "epoch": 0.81954005711709, + "flos": 14723793999360.0, + "grad_norm": 1.6899565751326817, + "language_loss": 0.72566128, + "learning_rate": 3.31899424315957e-07, + "loss": 0.80237019, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09606934, + "step": 13631, + "time_per_iteration": 2.4677011966705322 + }, + { + "auxiliary_loss_clip": 0.06404364, + "auxiliary_loss_mlp": 0.01262964, + "balance_loss_clip": 0.06271352, + "balance_loss_mlp": 0.01253838, + "epoch": 0.819600180369758, + "flos": 23080416744960.0, + "grad_norm": 1.6434442490728178, + "language_loss": 0.76678276, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.84345603, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09124756, + "step": 13632, + "time_per_iteration": 2.567342519760132 + }, + { + "auxiliary_loss_clip": 0.06402865, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06271514, + "balance_loss_mlp": 0.0125597, + "epoch": 0.8196603036224259, + "flos": 27607176921600.0, + "grad_norm": 1.7777195570066433, + "language_loss": 0.66198611, + "learning_rate": 3.314698278332588e-07, + "loss": 0.73866111, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08660889, + "step": 13633, + "time_per_iteration": 2.5596518516540527 + }, + { + "auxiliary_loss_clip": 0.06396772, + "auxiliary_loss_mlp": 0.01261231, + "balance_loss_clip": 0.06268521, + "balance_loss_mlp": 0.0125235, + "epoch": 0.8197204268750939, + "flos": 28589086592640.0, + "grad_norm": 1.7854505067066941, + "language_loss": 0.75938737, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.83596742, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08880615, + "step": 13634, + "time_per_iteration": 2.5733511447906494 + }, + { + "auxiliary_loss_clip": 0.06397436, + "auxiliary_loss_mlp": 0.01268994, + "balance_loss_clip": 0.06269581, + "balance_loss_mlp": 0.01259892, + "epoch": 0.819780550127762, + "flos": 23265011289600.0, + "grad_norm": 2.294761376034913, + "language_loss": 0.81912637, + "learning_rate": 3.310404844338841e-07, + "loss": 0.8957907, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09094238, + "step": 13635, + "time_per_iteration": 2.521009922027588 + }, + { + "auxiliary_loss_clip": 0.06407675, + "auxiliary_loss_mlp": 0.01266496, + "balance_loss_clip": 0.0627351, + "balance_loss_mlp": 0.01256506, + "epoch": 0.8198406733804299, + "flos": 26692086481920.0, + "grad_norm": 1.490251576995218, + "language_loss": 0.75829619, + "learning_rate": 3.308259076607949e-07, + "loss": 0.83503789, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09997559, + "step": 13636, + "time_per_iteration": 2.566101551055908 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01262174, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01252727, + "epoch": 0.8199007966330979, + "flos": 20090272268160.0, + "grad_norm": 1.9556414121680055, + "language_loss": 0.81463081, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.89126313, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09454346, + "step": 13637, + "time_per_iteration": 2.4868228435516357 + }, + { + "auxiliary_loss_clip": 0.06399955, + "auxiliary_loss_mlp": 0.01262595, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01253452, + "epoch": 0.8199609198857658, + "flos": 31910503386240.0, + "grad_norm": 1.913324988944965, + "language_loss": 0.70908749, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.78571296, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09143066, + "step": 13638, + "time_per_iteration": 3.970994472503662 + }, + { + "auxiliary_loss_clip": 0.06407509, + "auxiliary_loss_mlp": 0.01266247, + "balance_loss_clip": 0.06270528, + "balance_loss_mlp": 0.01255232, + "epoch": 0.8200210431384338, + "flos": 26477583229440.0, + "grad_norm": 3.8164285850122854, + "language_loss": 0.80088663, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.87762421, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.11016846, + "step": 13639, + "time_per_iteration": 2.5780816078186035 + }, + { + "auxiliary_loss_clip": 0.06400024, + "auxiliary_loss_mlp": 0.0126402, + "balance_loss_clip": 0.06269088, + "balance_loss_mlp": 0.01255121, + "epoch": 0.8200811663911017, + "flos": 22098087803520.0, + "grad_norm": 2.39997075184638, + "language_loss": 0.79083264, + "learning_rate": 3.299682336022589e-07, + "loss": 0.86747313, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08898926, + "step": 13640, + "time_per_iteration": 2.482212781906128 + }, + { + "auxiliary_loss_clip": 0.06413399, + "auxiliary_loss_mlp": 0.01270919, + "balance_loss_clip": 0.06273437, + "balance_loss_mlp": 0.01261103, + "epoch": 0.8201412896437698, + "flos": 37602174551040.0, + "grad_norm": 1.7066462026776184, + "language_loss": 0.63058311, + "learning_rate": 3.297539733867336e-07, + "loss": 0.70742631, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.09820557, + "step": 13641, + "time_per_iteration": 2.698233127593994 + }, + { + "auxiliary_loss_clip": 0.06402028, + "auxiliary_loss_mlp": 0.01266334, + "balance_loss_clip": 0.06270909, + "balance_loss_mlp": 0.0125638, + "epoch": 0.8202014128964377, + "flos": 19652461084800.0, + "grad_norm": 1.9366215144343786, + "language_loss": 0.73740256, + "learning_rate": 3.295397765071055e-07, + "loss": 0.81408608, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09942627, + "step": 13642, + "time_per_iteration": 2.511960744857788 + }, + { + "auxiliary_loss_clip": 0.06402153, + "auxiliary_loss_mlp": 0.01267253, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.0125796, + "epoch": 0.8202615361491057, + "flos": 31475375533440.0, + "grad_norm": 1.5751213862396989, + "language_loss": 0.70581281, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.78250694, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09295654, + "step": 13643, + "time_per_iteration": 2.6206700801849365 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01268107, + "balance_loss_clip": 0.06272536, + "balance_loss_mlp": 0.01259215, + "epoch": 0.8203216594017736, + "flos": 24722145792000.0, + "grad_norm": 1.7907058552656372, + "language_loss": 0.66027546, + "learning_rate": 3.291115727880256e-07, + "loss": 0.73697996, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08898926, + "step": 13644, + "time_per_iteration": 2.5345609188079834 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01267007, + "balance_loss_clip": 0.06271123, + "balance_loss_mlp": 0.01257149, + "epoch": 0.8203817826544416, + "flos": 26039101213440.0, + "grad_norm": 1.3794487731864136, + "language_loss": 0.70734018, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.78406239, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09863281, + "step": 13645, + "time_per_iteration": 2.554086446762085 + }, + { + "auxiliary_loss_clip": 0.0639934, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06269216, + "balance_loss_mlp": 0.01256069, + "epoch": 0.8204419059071095, + "flos": 25961087462400.0, + "grad_norm": 1.8361710653661691, + "language_loss": 0.7172327, + "learning_rate": 3.286836225099707e-07, + "loss": 0.79387873, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09185791, + "step": 13646, + "time_per_iteration": 2.5273547172546387 + }, + { + "auxiliary_loss_clip": 0.06407963, + "auxiliary_loss_mlp": 0.01268435, + "balance_loss_clip": 0.062723, + "balance_loss_mlp": 0.01258642, + "epoch": 0.8205020291597775, + "flos": 23585717992320.0, + "grad_norm": 2.0092863306251676, + "language_loss": 0.79515278, + "learning_rate": 3.284697424316132e-07, + "loss": 0.87191677, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09796143, + "step": 13647, + "time_per_iteration": 2.521698474884033 + }, + { + "auxiliary_loss_clip": 0.06397481, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8205621524124456, + "flos": 26806759194240.0, + "grad_norm": 1.3474560258501684, + "language_loss": 0.68241918, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.75903839, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09136963, + "step": 13648, + "time_per_iteration": 3.9541409015655518 + }, + { + "auxiliary_loss_clip": 0.06400238, + "auxiliary_loss_mlp": 0.01268028, + "balance_loss_clip": 0.06267244, + "balance_loss_mlp": 0.0125799, + "epoch": 0.8206222756651135, + "flos": 27535410299520.0, + "grad_norm": 1.764747006246769, + "language_loss": 0.80002069, + "learning_rate": 3.28042172436791e-07, + "loss": 0.87670338, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10046387, + "step": 13649, + "time_per_iteration": 2.6316652297973633 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01266937, + "balance_loss_clip": 0.06273945, + "balance_loss_mlp": 0.01256917, + "epoch": 0.8206823989177815, + "flos": 21184967934720.0, + "grad_norm": 1.546894359217093, + "language_loss": 0.69079524, + "learning_rate": 3.278284825365396e-07, + "loss": 0.76753092, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10021973, + "step": 13650, + "time_per_iteration": 2.5335919857025146 + }, + { + "auxiliary_loss_clip": 0.06402709, + "auxiliary_loss_mlp": 0.01267243, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01257176, + "epoch": 0.8207425221704494, + "flos": 11514324660480.0, + "grad_norm": 2.3595864556173614, + "language_loss": 0.61227095, + "learning_rate": 3.276148560452001e-07, + "loss": 0.68897045, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10064697, + "step": 13651, + "time_per_iteration": 2.4735312461853027 + }, + { + "auxiliary_loss_clip": 0.06405269, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.06270625, + "balance_loss_mlp": 0.01254968, + "epoch": 0.8208026454231174, + "flos": 19798090680960.0, + "grad_norm": 2.1521682694916313, + "language_loss": 0.72795534, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.80466217, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10449219, + "step": 13652, + "time_per_iteration": 3.9963738918304443 + }, + { + "auxiliary_loss_clip": 0.06397925, + "auxiliary_loss_mlp": 0.01264227, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01255525, + "epoch": 0.8208627686757853, + "flos": 15672692361600.0, + "grad_norm": 2.2964720489620976, + "language_loss": 0.72892058, + "learning_rate": 3.271877933216558e-07, + "loss": 0.80554199, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08709717, + "step": 13653, + "time_per_iteration": 2.4748480319976807 + }, + { + "auxiliary_loss_clip": 0.06416966, + "auxiliary_loss_mlp": 0.01270598, + "balance_loss_clip": 0.06278365, + "balance_loss_mlp": 0.01260132, + "epoch": 0.8209228919284534, + "flos": 37490897928960.0, + "grad_norm": 1.7768200929387925, + "language_loss": 0.6321249, + "learning_rate": 3.269743571056451e-07, + "loss": 0.70900059, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10473633, + "step": 13654, + "time_per_iteration": 2.6520609855651855 + }, + { + "auxiliary_loss_clip": 0.06403168, + "auxiliary_loss_mlp": 0.01264491, + "balance_loss_clip": 0.06268303, + "balance_loss_mlp": 0.01254651, + "epoch": 0.8209830151811213, + "flos": 23119759036800.0, + "grad_norm": 1.6261113247907222, + "language_loss": 0.7042315, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.78090811, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09832764, + "step": 13655, + "time_per_iteration": 4.040972948074341 + }, + { + "auxiliary_loss_clip": 0.06399737, + "auxiliary_loss_mlp": 0.01264964, + "balance_loss_clip": 0.06270001, + "balance_loss_mlp": 0.01255528, + "epoch": 0.8210431384337893, + "flos": 21294567475200.0, + "grad_norm": 1.966782681323648, + "language_loss": 0.8200593, + "learning_rate": 3.265476750056162e-07, + "loss": 0.89670628, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09429932, + "step": 13656, + "time_per_iteration": 2.5089569091796875 + }, + { + "auxiliary_loss_clip": 0.06398742, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01256898, + "epoch": 0.8211032616864572, + "flos": 11505897325440.0, + "grad_norm": 2.0352847360821196, + "language_loss": 0.73977625, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.81643093, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0982666, + "step": 13657, + "time_per_iteration": 2.521794080734253 + }, + { + "auxiliary_loss_clip": 0.0640122, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.06269388, + "balance_loss_mlp": 0.01254966, + "epoch": 0.8211633849391252, + "flos": 29828573314560.0, + "grad_norm": 1.5761103965210477, + "language_loss": 0.55795848, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.63460934, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08898926, + "step": 13658, + "time_per_iteration": 2.591646432876587 + }, + { + "auxiliary_loss_clip": 0.06403513, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8212235081917931, + "flos": 13120484849280.0, + "grad_norm": 8.99922619161595, + "language_loss": 0.794406, + "learning_rate": 3.259081278068805e-07, + "loss": 0.87109065, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09521484, + "step": 13659, + "time_per_iteration": 2.4667892456054688 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06268722, + "balance_loss_mlp": 0.01255424, + "epoch": 0.8212836314444611, + "flos": 40524828963840.0, + "grad_norm": 1.49148705733067, + "language_loss": 0.59613037, + "learning_rate": 3.256950723599887e-07, + "loss": 0.67274177, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08691406, + "step": 13660, + "time_per_iteration": 2.6636433601379395 + }, + { + "auxiliary_loss_clip": 0.06408002, + "auxiliary_loss_mlp": 0.0126705, + "balance_loss_clip": 0.06273358, + "balance_loss_mlp": 0.01256811, + "epoch": 0.8213437546971292, + "flos": 18776503301760.0, + "grad_norm": 1.9851690167899483, + "language_loss": 0.73083544, + "learning_rate": 3.254820804029075e-07, + "loss": 0.80758601, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10241699, + "step": 13661, + "time_per_iteration": 2.4820919036865234 + }, + { + "auxiliary_loss_clip": 0.06408828, + "auxiliary_loss_mlp": 0.01265721, + "balance_loss_clip": 0.06272434, + "balance_loss_mlp": 0.01255904, + "epoch": 0.8214038779497971, + "flos": 19688323432320.0, + "grad_norm": 1.9325667410517933, + "language_loss": 0.75407529, + "learning_rate": 3.252691519437143e-07, + "loss": 0.8308208, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09814453, + "step": 13662, + "time_per_iteration": 2.473001718521118 + }, + { + "auxiliary_loss_clip": 0.06316656, + "auxiliary_loss_mlp": 0.01256268, + "balance_loss_clip": 0.06261721, + "balance_loss_mlp": 0.01255036, + "epoch": 0.8214640012024651, + "flos": 71624040791040.0, + "grad_norm": 0.7272151584082011, + "language_loss": 0.54061127, + "learning_rate": 3.250562869904825e-07, + "loss": 0.61634052, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01231384, + "step": 13663, + "time_per_iteration": 3.272303342819214 + }, + { + "auxiliary_loss_clip": 0.06399679, + "auxiliary_loss_mlp": 0.01266039, + "balance_loss_clip": 0.06268212, + "balance_loss_mlp": 0.0125643, + "epoch": 0.821524124455133, + "flos": 14762507385600.0, + "grad_norm": 2.215887467335205, + "language_loss": 0.65920115, + "learning_rate": 3.248434855512838e-07, + "loss": 0.73585832, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09613037, + "step": 13664, + "time_per_iteration": 2.477029323577881 + }, + { + "auxiliary_loss_clip": 0.06399576, + "auxiliary_loss_mlp": 0.01261557, + "balance_loss_clip": 0.06270959, + "balance_loss_mlp": 0.0125267, + "epoch": 0.821584247707801, + "flos": 25089238529280.0, + "grad_norm": 1.4192636174003572, + "language_loss": 0.75023228, + "learning_rate": 3.246307476341881e-07, + "loss": 0.82684362, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08892822, + "step": 13665, + "time_per_iteration": 2.5525918006896973 + }, + { + "auxiliary_loss_clip": 0.06401828, + "auxiliary_loss_mlp": 0.01264308, + "balance_loss_clip": 0.06269041, + "balance_loss_mlp": 0.01254962, + "epoch": 0.8216443709604689, + "flos": 36839631669120.0, + "grad_norm": 1.9379151169740247, + "language_loss": 0.6576277, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.73428911, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09350586, + "step": 13666, + "time_per_iteration": 2.607255697250366 + }, + { + "auxiliary_loss_clip": 0.06399558, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_clip": 0.06267319, + "balance_loss_mlp": 0.01257132, + "epoch": 0.821704494213137, + "flos": 25088693477760.0, + "grad_norm": 1.6153303259870018, + "language_loss": 0.76945007, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.84610897, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09185791, + "step": 13667, + "time_per_iteration": 2.5342323780059814 + }, + { + "auxiliary_loss_clip": 0.0640602, + "auxiliary_loss_mlp": 0.0126598, + "balance_loss_clip": 0.06270644, + "balance_loss_mlp": 0.01255948, + "epoch": 0.8217646174658049, + "flos": 14361397090560.0, + "grad_norm": 2.524024827589192, + "language_loss": 0.77698529, + "learning_rate": 3.239929150961773e-07, + "loss": 0.85370529, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10040283, + "step": 13668, + "time_per_iteration": 2.466806411743164 + }, + { + "auxiliary_loss_clip": 0.06399126, + "auxiliary_loss_mlp": 0.01264171, + "balance_loss_clip": 0.06269765, + "balance_loss_mlp": 0.01254933, + "epoch": 0.8218247407184729, + "flos": 22097039627520.0, + "grad_norm": 1.8128637689922475, + "language_loss": 0.73614395, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.81277692, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09240723, + "step": 13669, + "time_per_iteration": 2.5289034843444824 + }, + { + "auxiliary_loss_clip": 0.06400953, + "auxiliary_loss_mlp": 0.01266356, + "balance_loss_clip": 0.06269199, + "balance_loss_mlp": 0.01256819, + "epoch": 0.8218848639711408, + "flos": 16769694015360.0, + "grad_norm": 1.5575474443223831, + "language_loss": 0.79151839, + "learning_rate": 3.235680111625161e-07, + "loss": 0.86819142, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09533691, + "step": 13670, + "time_per_iteration": 2.4716804027557373 + }, + { + "auxiliary_loss_clip": 0.06409052, + "auxiliary_loss_mlp": 0.01266161, + "balance_loss_clip": 0.06273171, + "balance_loss_mlp": 0.01256415, + "epoch": 0.8219449872238088, + "flos": 26001981054720.0, + "grad_norm": 1.6601212313444933, + "language_loss": 0.7576502, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.83440232, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09741211, + "step": 13671, + "time_per_iteration": 2.550118923187256 + }, + { + "auxiliary_loss_clip": 0.06410009, + "auxiliary_loss_mlp": 0.01266966, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01255969, + "epoch": 0.8220051104764767, + "flos": 20784528472320.0, + "grad_norm": 1.5724018090314842, + "language_loss": 0.76455218, + "learning_rate": 3.23143361510728e-07, + "loss": 0.84132195, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.11004639, + "step": 13672, + "time_per_iteration": 2.5448882579803467 + }, + { + "auxiliary_loss_clip": 0.06402946, + "auxiliary_loss_mlp": 0.01263319, + "balance_loss_clip": 0.06269625, + "balance_loss_mlp": 0.01253175, + "epoch": 0.8220652337291448, + "flos": 14580134974080.0, + "grad_norm": 2.095121195508436, + "language_loss": 0.74924457, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.82590723, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10150146, + "step": 13673, + "time_per_iteration": 2.5026438236236572 + }, + { + "auxiliary_loss_clip": 0.06410329, + "auxiliary_loss_mlp": 0.01264871, + "balance_loss_clip": 0.06274365, + "balance_loss_mlp": 0.01254941, + "epoch": 0.8221253569818128, + "flos": 23812715502720.0, + "grad_norm": 1.4999475516036749, + "language_loss": 0.79556978, + "learning_rate": 3.227189662052254e-07, + "loss": 0.87232178, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0993042, + "step": 13674, + "time_per_iteration": 2.5405590534210205 + }, + { + "auxiliary_loss_clip": 0.06404756, + "auxiliary_loss_mlp": 0.01265536, + "balance_loss_clip": 0.06272387, + "balance_loss_mlp": 0.01255398, + "epoch": 0.8221854802344807, + "flos": 21294651329280.0, + "grad_norm": 1.7231814451382148, + "language_loss": 0.70641446, + "learning_rate": 3.225068639524484e-07, + "loss": 0.78311741, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10144043, + "step": 13675, + "time_per_iteration": 2.514972448348999 + }, + { + "auxiliary_loss_clip": 0.06394869, + "auxiliary_loss_mlp": 0.01267052, + "balance_loss_clip": 0.06267343, + "balance_loss_mlp": 0.01257885, + "epoch": 0.8222456034871487, + "flos": 20962624325760.0, + "grad_norm": 1.5221695463620175, + "language_loss": 0.74239552, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.81901473, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0916748, + "step": 13676, + "time_per_iteration": 2.6151413917541504 + }, + { + "auxiliary_loss_clip": 0.06403306, + "auxiliary_loss_mlp": 0.01266386, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01257195, + "epoch": 0.8223057267398166, + "flos": 21403915453440.0, + "grad_norm": 1.5912659161296756, + "language_loss": 0.80806673, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.88476366, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09191895, + "step": 13677, + "time_per_iteration": 2.524010419845581 + }, + { + "auxiliary_loss_clip": 0.06403331, + "auxiliary_loss_mlp": 0.01265658, + "balance_loss_clip": 0.06269956, + "balance_loss_mlp": 0.01256258, + "epoch": 0.8223658499924846, + "flos": 15273636491520.0, + "grad_norm": 1.9046398747416602, + "language_loss": 0.70346785, + "learning_rate": 3.218709388905245e-07, + "loss": 0.78015774, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09405518, + "step": 13678, + "time_per_iteration": 3.918046236038208 + }, + { + "auxiliary_loss_clip": 0.06398967, + "auxiliary_loss_mlp": 0.01266892, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.0125785, + "epoch": 0.8224259732451525, + "flos": 31257727752960.0, + "grad_norm": 1.3904742391636824, + "language_loss": 0.71421492, + "learning_rate": 3.216590911288133e-07, + "loss": 0.79087353, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09039307, + "step": 13679, + "time_per_iteration": 2.5868563652038574 + }, + { + "auxiliary_loss_clip": 0.06397314, + "auxiliary_loss_mlp": 0.0126288, + "balance_loss_clip": 0.06268158, + "balance_loss_mlp": 0.01253748, + "epoch": 0.8224860964978206, + "flos": 21580166517120.0, + "grad_norm": 2.1427210155629797, + "language_loss": 0.70038605, + "learning_rate": 3.214473070099564e-07, + "loss": 0.77698797, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09130859, + "step": 13680, + "time_per_iteration": 2.5864291191101074 + }, + { + "auxiliary_loss_clip": 0.06400996, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.06270762, + "balance_loss_mlp": 0.01253618, + "epoch": 0.8225462197504885, + "flos": 25490181116160.0, + "grad_norm": 1.609067591062343, + "language_loss": 0.60291123, + "learning_rate": 3.21235586541986e-07, + "loss": 0.67954582, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08837891, + "step": 13681, + "time_per_iteration": 2.5397136211395264 + }, + { + "auxiliary_loss_clip": 0.06406465, + "auxiliary_loss_mlp": 0.01264863, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01255148, + "epoch": 0.8226063430031565, + "flos": 39394941782400.0, + "grad_norm": 1.559829133589283, + "language_loss": 0.70002699, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.77674025, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09716797, + "step": 13682, + "time_per_iteration": 2.685002565383911 + }, + { + "auxiliary_loss_clip": 0.06403206, + "auxiliary_loss_mlp": 0.01263586, + "balance_loss_clip": 0.06270599, + "balance_loss_mlp": 0.01253334, + "epoch": 0.8226664662558244, + "flos": 22821036831360.0, + "grad_norm": 1.8759178686827869, + "language_loss": 0.79682559, + "learning_rate": 3.20812336590816e-07, + "loss": 0.87349349, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10253906, + "step": 13683, + "time_per_iteration": 2.519693613052368 + }, + { + "auxiliary_loss_clip": 0.06397998, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06270218, + "balance_loss_mlp": 0.01254293, + "epoch": 0.8227265895084924, + "flos": 25672595454720.0, + "grad_norm": 1.9461522710413164, + "language_loss": 0.87060094, + "learning_rate": 3.206008071236661e-07, + "loss": 0.94721103, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08712769, + "step": 13684, + "time_per_iteration": 2.520162343978882 + }, + { + "auxiliary_loss_clip": 0.06394877, + "auxiliary_loss_mlp": 0.01264494, + "balance_loss_clip": 0.06267917, + "balance_loss_mlp": 0.0125556, + "epoch": 0.8227867127611603, + "flos": 26186827161600.0, + "grad_norm": 1.6760308925685343, + "language_loss": 0.80106431, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.87765801, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08935547, + "step": 13685, + "time_per_iteration": 2.571464776992798 + }, + { + "auxiliary_loss_clip": 0.06403354, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 0.06270622, + "balance_loss_mlp": 0.01256848, + "epoch": 0.8228468360138284, + "flos": 22024602172800.0, + "grad_norm": 1.5711922940184833, + "language_loss": 0.68850559, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.76520115, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09356689, + "step": 13686, + "time_per_iteration": 2.516918182373047 + }, + { + "auxiliary_loss_clip": 0.06405336, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06271816, + "balance_loss_mlp": 0.01256047, + "epoch": 0.8229069592664963, + "flos": 14908723960320.0, + "grad_norm": 2.294675899071434, + "language_loss": 0.78351545, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.86022234, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09307861, + "step": 13687, + "time_per_iteration": 3.9513440132141113 + }, + { + "auxiliary_loss_clip": 0.06402496, + "auxiliary_loss_mlp": 0.01262779, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01253022, + "epoch": 0.8229670825191643, + "flos": 15674956421760.0, + "grad_norm": 1.643594619200351, + "language_loss": 0.72294796, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.79960072, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09759521, + "step": 13688, + "time_per_iteration": 2.4735567569732666 + }, + { + "auxiliary_loss_clip": 0.0640309, + "auxiliary_loss_mlp": 0.01266638, + "balance_loss_clip": 0.0627107, + "balance_loss_mlp": 0.01257375, + "epoch": 0.8230272057718323, + "flos": 23189890504320.0, + "grad_norm": 1.5346344233597629, + "language_loss": 0.73226428, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.80896151, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09265137, + "step": 13689, + "time_per_iteration": 2.5417935848236084 + }, + { + "auxiliary_loss_clip": 0.06407392, + "auxiliary_loss_mlp": 0.01262871, + "balance_loss_clip": 0.06272584, + "balance_loss_mlp": 0.01253758, + "epoch": 0.8230873290245002, + "flos": 21038709432960.0, + "grad_norm": 2.1431822438071744, + "language_loss": 0.69692594, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.77362859, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09106445, + "step": 13690, + "time_per_iteration": 2.5107438564300537 + }, + { + "auxiliary_loss_clip": 0.0640377, + "auxiliary_loss_mlp": 0.01264747, + "balance_loss_clip": 0.06269638, + "balance_loss_mlp": 0.01255389, + "epoch": 0.8231474522771682, + "flos": 21256273359360.0, + "grad_norm": 1.6874962726355067, + "language_loss": 0.85794926, + "learning_rate": 3.191218844260988e-07, + "loss": 0.93463445, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09350586, + "step": 13691, + "time_per_iteration": 4.0233988761901855 + }, + { + "auxiliary_loss_clip": 0.06406488, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.0125637, + "epoch": 0.8232075755298361, + "flos": 23848829412480.0, + "grad_norm": 1.7540371277413798, + "language_loss": 0.76951766, + "learning_rate": 3.189108646472252e-07, + "loss": 0.8462404, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09423828, + "step": 13692, + "time_per_iteration": 2.5032553672790527 + }, + { + "auxiliary_loss_clip": 0.06399276, + "auxiliary_loss_mlp": 0.01263194, + "balance_loss_clip": 0.06268877, + "balance_loss_mlp": 0.01254254, + "epoch": 0.8232676987825042, + "flos": 21660570109440.0, + "grad_norm": 1.5658390187310423, + "language_loss": 0.71956593, + "learning_rate": 3.186999086154205e-07, + "loss": 0.79619062, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08935547, + "step": 13693, + "time_per_iteration": 2.5067594051361084 + }, + { + "auxiliary_loss_clip": 0.06396094, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06269097, + "balance_loss_mlp": 0.01254367, + "epoch": 0.8233278220351721, + "flos": 26329857281280.0, + "grad_norm": 1.2936928608658458, + "language_loss": 0.8396762, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.91627085, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09014893, + "step": 13694, + "time_per_iteration": 2.679731845855713 + }, + { + "auxiliary_loss_clip": 0.06406334, + "auxiliary_loss_mlp": 0.01265409, + "balance_loss_clip": 0.06271011, + "balance_loss_mlp": 0.01255771, + "epoch": 0.8233879452878401, + "flos": 21732252877440.0, + "grad_norm": 1.6355767467742353, + "language_loss": 0.77244568, + "learning_rate": 3.182781878250118e-07, + "loss": 0.84916306, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09637451, + "step": 13695, + "time_per_iteration": 3.963965892791748 + }, + { + "auxiliary_loss_clip": 0.06402577, + "auxiliary_loss_mlp": 0.0126261, + "balance_loss_clip": 0.06271192, + "balance_loss_mlp": 0.0125389, + "epoch": 0.823448068540508, + "flos": 20563903872000.0, + "grad_norm": 1.8210752561146564, + "language_loss": 0.81778234, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.89443427, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.0871582, + "step": 13696, + "time_per_iteration": 2.4970433712005615 + }, + { + "auxiliary_loss_clip": 0.06309157, + "auxiliary_loss_mlp": 0.01254773, + "balance_loss_clip": 0.06254191, + "balance_loss_mlp": 0.01253599, + "epoch": 0.823508191793176, + "flos": 67296130352640.0, + "grad_norm": 0.7182469320351987, + "language_loss": 0.63648844, + "learning_rate": 3.178567221188393e-07, + "loss": 0.7121278, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01171875, + "step": 13697, + "time_per_iteration": 3.223705291748047 + }, + { + "auxiliary_loss_clip": 0.06395958, + "auxiliary_loss_mlp": 0.0126361, + "balance_loss_clip": 0.06268628, + "balance_loss_mlp": 0.01255003, + "epoch": 0.8235683150458439, + "flos": 17933724535680.0, + "grad_norm": 1.4706232042527567, + "language_loss": 0.72879517, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.80539095, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08605957, + "step": 13698, + "time_per_iteration": 2.518505334854126 + }, + { + "auxiliary_loss_clip": 0.06407038, + "auxiliary_loss_mlp": 0.0126933, + "balance_loss_clip": 0.06273619, + "balance_loss_mlp": 0.01259203, + "epoch": 0.823628438298512, + "flos": 18922007116800.0, + "grad_norm": 1.754695390070976, + "language_loss": 0.71798617, + "learning_rate": 3.174355115608305e-07, + "loss": 0.79474986, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10113525, + "step": 13699, + "time_per_iteration": 2.4939382076263428 + }, + { + "auxiliary_loss_clip": 0.06397603, + "auxiliary_loss_mlp": 0.01263248, + "balance_loss_clip": 0.06267754, + "balance_loss_mlp": 0.01253824, + "epoch": 0.8236885615511799, + "flos": 18702221057280.0, + "grad_norm": 1.8849458807724966, + "language_loss": 0.82397747, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.90058601, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09423828, + "step": 13700, + "time_per_iteration": 2.4839980602264404 + }, + { + "auxiliary_loss_clip": 0.06404804, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01254292, + "epoch": 0.8237486848038479, + "flos": 23701606588800.0, + "grad_norm": 1.5465027348479181, + "language_loss": 0.73049653, + "learning_rate": 3.170145562148763e-07, + "loss": 0.80718207, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09466553, + "step": 13701, + "time_per_iteration": 2.5388693809509277 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01265145, + "balance_loss_clip": 0.06270056, + "balance_loss_mlp": 0.01254625, + "epoch": 0.8238088080565159, + "flos": 23448138387840.0, + "grad_norm": 1.7645589694369792, + "language_loss": 0.69761407, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.77430546, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10522461, + "step": 13702, + "time_per_iteration": 2.5435500144958496 + }, + { + "auxiliary_loss_clip": 0.06406841, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 0.06274645, + "balance_loss_mlp": 0.01254882, + "epoch": 0.8238689313091838, + "flos": 22753001715840.0, + "grad_norm": 1.7292259180096456, + "language_loss": 0.74427319, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.82098991, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0994873, + "step": 13703, + "time_per_iteration": 2.5351295471191406 + }, + { + "auxiliary_loss_clip": 0.06409708, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01255868, + "epoch": 0.8239290545618518, + "flos": 25637236231680.0, + "grad_norm": 1.7672638463517, + "language_loss": 0.70240831, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.77916628, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10223389, + "step": 13704, + "time_per_iteration": 2.551124095916748 + }, + { + "auxiliary_loss_clip": 0.06402259, + "auxiliary_loss_mlp": 0.01263375, + "balance_loss_clip": 0.06270658, + "balance_loss_mlp": 0.01254392, + "epoch": 0.8239891778145197, + "flos": 26032854084480.0, + "grad_norm": 2.8793334355033076, + "language_loss": 0.64149827, + "learning_rate": 3.161734114144916e-07, + "loss": 0.71815455, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08978271, + "step": 13705, + "time_per_iteration": 2.5648598670959473 + }, + { + "auxiliary_loss_clip": 0.06407434, + "auxiliary_loss_mlp": 0.01269004, + "balance_loss_clip": 0.06272142, + "balance_loss_mlp": 0.0125933, + "epoch": 0.8240493010671878, + "flos": 21839378722560.0, + "grad_norm": 2.201240453400887, + "language_loss": 0.69536072, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.77212507, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09680176, + "step": 13706, + "time_per_iteration": 2.5266029834747314 + }, + { + "auxiliary_loss_clip": 0.06408302, + "auxiliary_loss_mlp": 0.0126458, + "balance_loss_clip": 0.06274252, + "balance_loss_mlp": 0.01254483, + "epoch": 0.8241094243198557, + "flos": 18557891199360.0, + "grad_norm": 1.7625023749977664, + "language_loss": 0.69611287, + "learning_rate": 3.157532220876475e-07, + "loss": 0.77284169, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10083008, + "step": 13707, + "time_per_iteration": 2.5589535236358643 + }, + { + "auxiliary_loss_clip": 0.06404749, + "auxiliary_loss_mlp": 0.01262733, + "balance_loss_clip": 0.06270427, + "balance_loss_mlp": 0.01252881, + "epoch": 0.8241695475725237, + "flos": 25454192987520.0, + "grad_norm": 1.5789270946690015, + "language_loss": 0.79172903, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.86840385, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09851074, + "step": 13708, + "time_per_iteration": 2.519388437271118 + }, + { + "auxiliary_loss_clip": 0.06402726, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06268608, + "balance_loss_mlp": 0.01253418, + "epoch": 0.8242296708251916, + "flos": 18995702382720.0, + "grad_norm": 3.0867439551253195, + "language_loss": 0.69106972, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.76773179, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10064697, + "step": 13709, + "time_per_iteration": 2.505873441696167 + }, + { + "auxiliary_loss_clip": 0.06406131, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 0.06271987, + "balance_loss_mlp": 0.012558, + "epoch": 0.8242897940778596, + "flos": 22607372119680.0, + "grad_norm": 1.8896201135226782, + "language_loss": 0.83090842, + "learning_rate": 3.151234171183319e-07, + "loss": 0.9076246, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09692383, + "step": 13710, + "time_per_iteration": 2.5083086490631104 + }, + { + "auxiliary_loss_clip": 0.06402289, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01254883, + "epoch": 0.8243499173305275, + "flos": 21474172702080.0, + "grad_norm": 2.010119969171323, + "language_loss": 0.78586245, + "learning_rate": 3.149136098993257e-07, + "loss": 0.8625319, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09780884, + "step": 13711, + "time_per_iteration": 2.504279136657715 + }, + { + "auxiliary_loss_clip": 0.06402823, + "auxiliary_loss_mlp": 0.01266322, + "balance_loss_clip": 0.06270982, + "balance_loss_mlp": 0.01256189, + "epoch": 0.8244100405831956, + "flos": 20016409294080.0, + "grad_norm": 1.7618946203552466, + "language_loss": 0.65925729, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.73594874, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10131836, + "step": 13712, + "time_per_iteration": 2.468043804168701 + }, + { + "auxiliary_loss_clip": 0.06404002, + "auxiliary_loss_mlp": 0.01265304, + "balance_loss_clip": 0.06271501, + "balance_loss_mlp": 0.0125619, + "epoch": 0.8244701638358635, + "flos": 26437612032000.0, + "grad_norm": 1.6609701051981949, + "language_loss": 0.74622256, + "learning_rate": 3.14494187165202e-07, + "loss": 0.82291561, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09106445, + "step": 13713, + "time_per_iteration": 2.551905393600464 + }, + { + "auxiliary_loss_clip": 0.06404902, + "auxiliary_loss_mlp": 0.01268602, + "balance_loss_clip": 0.06270953, + "balance_loss_mlp": 0.012595, + "epoch": 0.8245302870885315, + "flos": 17645861433600.0, + "grad_norm": 1.6587982213804435, + "language_loss": 0.81258547, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.88932049, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09106445, + "step": 13714, + "time_per_iteration": 2.452026844024658 + }, + { + "auxiliary_loss_clip": 0.06404838, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 0.062736, + "balance_loss_mlp": 0.0125666, + "epoch": 0.8245904103411995, + "flos": 26216023109760.0, + "grad_norm": 2.7428711337446736, + "language_loss": 0.66907775, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.74578679, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09405518, + "step": 13715, + "time_per_iteration": 2.5828397274017334 + }, + { + "auxiliary_loss_clip": 0.06407429, + "auxiliary_loss_mlp": 0.01263847, + "balance_loss_clip": 0.06271131, + "balance_loss_mlp": 0.01254358, + "epoch": 0.8246505335938674, + "flos": 24211645591680.0, + "grad_norm": 1.6923917814594924, + "language_loss": 0.75099182, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.82770455, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09484863, + "step": 13716, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.06310038, + "auxiliary_loss_mlp": 0.01249676, + "balance_loss_clip": 0.06255137, + "balance_loss_mlp": 0.01248568, + "epoch": 0.8247106568465354, + "flos": 67114764190080.0, + "grad_norm": 0.7022312920639184, + "language_loss": 0.58953023, + "learning_rate": 3.136561087351175e-07, + "loss": 0.66512734, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.54785156, + "router_z_loss_mlp": 0.01110077, + "step": 13717, + "time_per_iteration": 3.246941328048706 + }, + { + "auxiliary_loss_clip": 0.06403467, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.01255149, + "epoch": 0.8247707800992033, + "flos": 12573199906560.0, + "grad_norm": 1.9324122684588263, + "language_loss": 0.79839373, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.87507385, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09387207, + "step": 13718, + "time_per_iteration": 3.9733448028564453 + }, + { + "auxiliary_loss_clip": 0.06403176, + "auxiliary_loss_mlp": 0.01265559, + "balance_loss_clip": 0.06274208, + "balance_loss_mlp": 0.0125573, + "epoch": 0.8248309033518714, + "flos": 15928927747200.0, + "grad_norm": 1.6030825184413535, + "language_loss": 0.69140959, + "learning_rate": 3.132374531662778e-07, + "loss": 0.76809692, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09832764, + "step": 13719, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.06406642, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.01256348, + "epoch": 0.8248910266045393, + "flos": 17570195596800.0, + "grad_norm": 2.330025020870477, + "language_loss": 0.6986599, + "learning_rate": 3.13028221321197e-07, + "loss": 0.77538854, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09869385, + "step": 13720, + "time_per_iteration": 2.4678380489349365 + }, + { + "auxiliary_loss_clip": 0.06404991, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06269173, + "balance_loss_mlp": 0.01254954, + "epoch": 0.8249511498572073, + "flos": 28626919511040.0, + "grad_norm": 1.5185794987899917, + "language_loss": 0.75965858, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.83635378, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.0958252, + "step": 13721, + "time_per_iteration": 2.6373937129974365 + }, + { + "auxiliary_loss_clip": 0.06402366, + "auxiliary_loss_mlp": 0.01262873, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01253718, + "epoch": 0.8250112731098752, + "flos": 25563624819840.0, + "grad_norm": 1.7041844507677804, + "language_loss": 0.77799296, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.85464543, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0914917, + "step": 13722, + "time_per_iteration": 2.5994813442230225 + }, + { + "auxiliary_loss_clip": 0.06398278, + "auxiliary_loss_mlp": 0.01264674, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01255596, + "epoch": 0.8250713963625432, + "flos": 27753645058560.0, + "grad_norm": 1.6949642691113342, + "language_loss": 0.63508642, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.71171594, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09082031, + "step": 13723, + "time_per_iteration": 2.5635523796081543 + }, + { + "auxiliary_loss_clip": 0.06404909, + "auxiliary_loss_mlp": 0.01267168, + "balance_loss_clip": 0.06271261, + "balance_loss_mlp": 0.01257328, + "epoch": 0.8251315196152111, + "flos": 21616070791680.0, + "grad_norm": 1.4018010369843736, + "language_loss": 0.74626708, + "learning_rate": 3.121919337215666e-07, + "loss": 0.82298779, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09844971, + "step": 13724, + "time_per_iteration": 2.513502836227417 + }, + { + "auxiliary_loss_clip": 0.06404832, + "auxiliary_loss_mlp": 0.0126482, + "balance_loss_clip": 0.06271145, + "balance_loss_mlp": 0.01254508, + "epoch": 0.8251916428678792, + "flos": 28585983991680.0, + "grad_norm": 1.793661817459537, + "language_loss": 0.64819729, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.72489381, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10302734, + "step": 13725, + "time_per_iteration": 2.529151678085327 + }, + { + "auxiliary_loss_clip": 0.06405316, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06274511, + "balance_loss_mlp": 0.01253227, + "epoch": 0.8252517661205471, + "flos": 23081758410240.0, + "grad_norm": 1.672809814905788, + "language_loss": 0.81857646, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.89525616, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09436035, + "step": 13726, + "time_per_iteration": 2.5228326320648193 + }, + { + "auxiliary_loss_clip": 0.06397386, + "auxiliary_loss_mlp": 0.01264386, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01255475, + "epoch": 0.8253118893732151, + "flos": 31767724828800.0, + "grad_norm": 1.6706774467929177, + "language_loss": 0.70475507, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.78137279, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08911133, + "step": 13727, + "time_per_iteration": 3.978147506713867 + }, + { + "auxiliary_loss_clip": 0.0640934, + "auxiliary_loss_mlp": 0.01267735, + "balance_loss_clip": 0.0627425, + "balance_loss_mlp": 0.01256881, + "epoch": 0.8253720126258831, + "flos": 18302326646400.0, + "grad_norm": 1.91309895747183, + "language_loss": 0.63201261, + "learning_rate": 3.113566701515036e-07, + "loss": 0.70878333, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10858154, + "step": 13728, + "time_per_iteration": 2.5155835151672363 + }, + { + "auxiliary_loss_clip": 0.06411063, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.0627272, + "balance_loss_mlp": 0.01255486, + "epoch": 0.825432135878551, + "flos": 26804620915200.0, + "grad_norm": 1.603278449226732, + "language_loss": 0.71536702, + "learning_rate": 3.111480143230092e-07, + "loss": 0.7921375, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10498047, + "step": 13729, + "time_per_iteration": 2.5187203884124756 + }, + { + "auxiliary_loss_clip": 0.06315145, + "auxiliary_loss_mlp": 0.01248813, + "balance_loss_clip": 0.06260362, + "balance_loss_mlp": 0.01247758, + "epoch": 0.825492259131219, + "flos": 54234498597120.0, + "grad_norm": 0.8544615284034055, + "language_loss": 0.62620342, + "learning_rate": 3.109394225359514e-07, + "loss": 0.70184296, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01055908, + "step": 13730, + "time_per_iteration": 2.9303290843963623 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01254744, + "epoch": 0.825552382383887, + "flos": 43765087478400.0, + "grad_norm": 1.7912471248364803, + "language_loss": 0.63930857, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.71599495, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09893799, + "step": 13731, + "time_per_iteration": 4.08091139793396 + }, + { + "auxiliary_loss_clip": 0.06411815, + "auxiliary_loss_mlp": 0.01266713, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.0125645, + "epoch": 0.825612505636555, + "flos": 12607469026560.0, + "grad_norm": 2.0738047653444855, + "language_loss": 0.70323932, + "learning_rate": 3.105224311177812e-07, + "loss": 0.78002459, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10266113, + "step": 13732, + "time_per_iteration": 2.4617788791656494 + }, + { + "auxiliary_loss_clip": 0.06410882, + "auxiliary_loss_mlp": 0.01264735, + "balance_loss_clip": 0.06272789, + "balance_loss_mlp": 0.01254304, + "epoch": 0.8256726288892229, + "flos": 17600146231680.0, + "grad_norm": 2.908441012815726, + "language_loss": 0.71335745, + "learning_rate": 3.103140315024817e-07, + "loss": 0.79011369, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.10437012, + "step": 13733, + "time_per_iteration": 2.4824366569519043 + }, + { + "auxiliary_loss_clip": 0.0639869, + "auxiliary_loss_mlp": 0.01262669, + "balance_loss_clip": 0.0626872, + "balance_loss_mlp": 0.01253597, + "epoch": 0.8257327521418909, + "flos": 23812631648640.0, + "grad_norm": 1.388790191971181, + "language_loss": 0.82709062, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.90370417, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09069824, + "step": 13734, + "time_per_iteration": 4.009546995162964 + }, + { + "auxiliary_loss_clip": 0.06404021, + "auxiliary_loss_mlp": 0.01264839, + "balance_loss_clip": 0.06273267, + "balance_loss_mlp": 0.01255141, + "epoch": 0.8257928753945588, + "flos": 19287129283200.0, + "grad_norm": 1.9103831477956985, + "language_loss": 0.83209223, + "learning_rate": 3.098974244989676e-07, + "loss": 0.90878081, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09698486, + "step": 13735, + "time_per_iteration": 2.5026960372924805 + }, + { + "auxiliary_loss_clip": 0.06407285, + "auxiliary_loss_mlp": 0.01266501, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01256988, + "epoch": 0.8258529986472268, + "flos": 18484782912000.0, + "grad_norm": 1.810689318637808, + "language_loss": 0.70870662, + "learning_rate": 3.096892171265497e-07, + "loss": 0.7854445, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09515381, + "step": 13736, + "time_per_iteration": 2.473515748977661 + }, + { + "auxiliary_loss_clip": 0.06316115, + "auxiliary_loss_mlp": 0.01253987, + "balance_loss_clip": 0.06261094, + "balance_loss_mlp": 0.01252863, + "epoch": 0.8259131218998947, + "flos": 62154903386880.0, + "grad_norm": 1.3034739276824252, + "language_loss": 0.67937154, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.75507253, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01126862, + "step": 13737, + "time_per_iteration": 3.0982251167297363 + }, + { + "auxiliary_loss_clip": 0.06403725, + "auxiliary_loss_mlp": 0.01264242, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.01254538, + "epoch": 0.8259732451525628, + "flos": 22164781253760.0, + "grad_norm": 2.085431266289398, + "language_loss": 0.69943869, + "learning_rate": 3.0927299467987e-07, + "loss": 0.7761184, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09698486, + "step": 13738, + "time_per_iteration": 2.5181643962860107 + }, + { + "auxiliary_loss_clip": 0.06404846, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06271113, + "balance_loss_mlp": 0.0125626, + "epoch": 0.8260333684052307, + "flos": 38370587218560.0, + "grad_norm": 1.709303321450842, + "language_loss": 0.6325919, + "learning_rate": 3.090649796213911e-07, + "loss": 0.70931315, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.11016846, + "step": 13739, + "time_per_iteration": 2.622809886932373 + }, + { + "auxiliary_loss_clip": 0.06316274, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_clip": 0.06261257, + "balance_loss_mlp": 0.01249742, + "epoch": 0.8260934916578987, + "flos": 62204433949440.0, + "grad_norm": 0.8068403235468483, + "language_loss": 0.59232754, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.66799867, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01098633, + "step": 13740, + "time_per_iteration": 3.185506582260132 + }, + { + "auxiliary_loss_clip": 0.06413467, + "auxiliary_loss_mlp": 0.01267061, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01256571, + "epoch": 0.8261536149105667, + "flos": 22572138677760.0, + "grad_norm": 1.9838230010912559, + "language_loss": 0.75877976, + "learning_rate": 3.086491418735959e-07, + "loss": 0.83558506, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.1048584, + "step": 13741, + "time_per_iteration": 2.5053927898406982 + }, + { + "auxiliary_loss_clip": 0.06405714, + "auxiliary_loss_mlp": 0.01264631, + "balance_loss_clip": 0.06272768, + "balance_loss_mlp": 0.01255124, + "epoch": 0.8262137381632346, + "flos": 32533705728000.0, + "grad_norm": 1.822033080058508, + "language_loss": 0.62812448, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.70482796, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09515381, + "step": 13742, + "time_per_iteration": 2.5799756050109863 + }, + { + "auxiliary_loss_clip": 0.06413151, + "auxiliary_loss_mlp": 0.01267602, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01256343, + "epoch": 0.8262738614159026, + "flos": 14141569104000.0, + "grad_norm": 3.472691543240307, + "language_loss": 0.67042887, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.74723649, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.11254883, + "step": 13743, + "time_per_iteration": 2.4885993003845215 + }, + { + "auxiliary_loss_clip": 0.06408446, + "auxiliary_loss_mlp": 0.01266141, + "balance_loss_clip": 0.06274473, + "balance_loss_mlp": 0.01256133, + "epoch": 0.8263339846685706, + "flos": 19830934281600.0, + "grad_norm": 1.9106016851298016, + "language_loss": 0.67223948, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.74898529, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10009766, + "step": 13744, + "time_per_iteration": 2.50083589553833 + }, + { + "auxiliary_loss_clip": 0.06407204, + "auxiliary_loss_mlp": 0.0126747, + "balance_loss_clip": 0.06273009, + "balance_loss_mlp": 0.01257391, + "epoch": 0.8263941079212386, + "flos": 22752330883200.0, + "grad_norm": 1.6353552178667967, + "language_loss": 0.75895423, + "learning_rate": 3.078182360753612e-07, + "loss": 0.83570099, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10083008, + "step": 13745, + "time_per_iteration": 2.5865373611450195 + }, + { + "auxiliary_loss_clip": 0.06400856, + "auxiliary_loss_mlp": 0.01263189, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01254374, + "epoch": 0.8264542311739065, + "flos": 20126847375360.0, + "grad_norm": 1.8085857006682091, + "language_loss": 0.79174644, + "learning_rate": 3.076106700253709e-07, + "loss": 0.86838686, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0881958, + "step": 13746, + "time_per_iteration": 2.5261435508728027 + }, + { + "auxiliary_loss_clip": 0.06416452, + "auxiliary_loss_mlp": 0.01265894, + "balance_loss_clip": 0.06277022, + "balance_loss_mlp": 0.0125544, + "epoch": 0.8265143544265745, + "flos": 16842844229760.0, + "grad_norm": 2.5785036479328354, + "language_loss": 0.68477845, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.76160187, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10461426, + "step": 13747, + "time_per_iteration": 2.5118043422698975 + }, + { + "auxiliary_loss_clip": 0.06406212, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.06271359, + "balance_loss_mlp": 0.01254013, + "epoch": 0.8265744776792424, + "flos": 22025231078400.0, + "grad_norm": 1.914079416513022, + "language_loss": 0.75505137, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.83175695, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10327148, + "step": 13748, + "time_per_iteration": 2.5839946269989014 + }, + { + "auxiliary_loss_clip": 0.06398661, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 0.06269635, + "balance_loss_mlp": 0.01257442, + "epoch": 0.8266346009319104, + "flos": 19250889592320.0, + "grad_norm": 1.8963276954120185, + "language_loss": 0.63934255, + "learning_rate": 3.069883569603102e-07, + "loss": 0.71599388, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09033203, + "step": 13749, + "time_per_iteration": 2.465831995010376 + }, + { + "auxiliary_loss_clip": 0.06401607, + "auxiliary_loss_mlp": 0.01267069, + "balance_loss_clip": 0.06269521, + "balance_loss_mlp": 0.01257806, + "epoch": 0.8266947241845783, + "flos": 24173016059520.0, + "grad_norm": 1.605270256625375, + "language_loss": 0.74094856, + "learning_rate": 3.067810476598132e-07, + "loss": 0.81763524, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09259033, + "step": 13750, + "time_per_iteration": 2.516474723815918 + }, + { + "auxiliary_loss_clip": 0.06407044, + "auxiliary_loss_mlp": 0.01265047, + "balance_loss_clip": 0.0627216, + "balance_loss_mlp": 0.0125489, + "epoch": 0.8267548474372464, + "flos": 21112195063680.0, + "grad_norm": 1.905483524829514, + "language_loss": 0.65982723, + "learning_rate": 3.065738025663496e-07, + "loss": 0.73654807, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.1015625, + "step": 13751, + "time_per_iteration": 2.5073559284210205 + }, + { + "auxiliary_loss_clip": 0.064018, + "auxiliary_loss_mlp": 0.01263322, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8268149706899143, + "flos": 39977711729280.0, + "grad_norm": 1.3811895515091794, + "language_loss": 0.60690141, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.68355262, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0894165, + "step": 13752, + "time_per_iteration": 2.6502721309661865 + }, + { + "auxiliary_loss_clip": 0.06312872, + "auxiliary_loss_mlp": 0.01249988, + "balance_loss_clip": 0.06258056, + "balance_loss_mlp": 0.01248881, + "epoch": 0.8268750939425823, + "flos": 65799290943360.0, + "grad_norm": 1.574540710975994, + "language_loss": 0.57428581, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.64991438, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01109314, + "step": 13753, + "time_per_iteration": 3.17626953125 + }, + { + "auxiliary_loss_clip": 0.06316203, + "auxiliary_loss_mlp": 0.01249962, + "balance_loss_clip": 0.06261422, + "balance_loss_mlp": 0.01248899, + "epoch": 0.8269352171952503, + "flos": 52997108227200.0, + "grad_norm": 0.6861116904276556, + "language_loss": 0.54860449, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.62426615, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01063538, + "step": 13754, + "time_per_iteration": 3.251030921936035 + }, + { + "auxiliary_loss_clip": 0.06401195, + "auxiliary_loss_mlp": 0.01262943, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.0125386, + "epoch": 0.8269953404479182, + "flos": 23082848513280.0, + "grad_norm": 1.821155505252388, + "language_loss": 0.69514215, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.77178347, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09082031, + "step": 13755, + "time_per_iteration": 2.5638794898986816 + }, + { + "auxiliary_loss_clip": 0.06400982, + "auxiliary_loss_mlp": 0.01263943, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01255408, + "epoch": 0.8270554637005862, + "flos": 14215222442880.0, + "grad_norm": 1.9620156908641344, + "language_loss": 0.70154935, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7781986, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08538818, + "step": 13756, + "time_per_iteration": 2.4718971252441406 + }, + { + "auxiliary_loss_clip": 0.06405632, + "auxiliary_loss_mlp": 0.01265207, + "balance_loss_clip": 0.06272529, + "balance_loss_mlp": 0.01254997, + "epoch": 0.8271155869532542, + "flos": 21768450641280.0, + "grad_norm": 2.9125961441146204, + "language_loss": 0.72791404, + "learning_rate": 3.053316807931623e-07, + "loss": 0.80462241, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10217285, + "step": 13757, + "time_per_iteration": 3.9486069679260254 + }, + { + "auxiliary_loss_clip": 0.06411837, + "auxiliary_loss_mlp": 0.01268236, + "balance_loss_clip": 0.06274478, + "balance_loss_mlp": 0.01256374, + "epoch": 0.8271757102059222, + "flos": 15125575127040.0, + "grad_norm": 2.5593838529176467, + "language_loss": 0.69374532, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.77054602, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11871338, + "step": 13758, + "time_per_iteration": 2.4854576587677 + }, + { + "auxiliary_loss_clip": 0.06399594, + "auxiliary_loss_mlp": 0.01266198, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01256984, + "epoch": 0.8272358334585901, + "flos": 24140549802240.0, + "grad_norm": 1.7114391651617498, + "language_loss": 0.70266873, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.77932668, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09210205, + "step": 13759, + "time_per_iteration": 2.517610788345337 + }, + { + "auxiliary_loss_clip": 0.06402884, + "auxiliary_loss_mlp": 0.01263273, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.01253415, + "epoch": 0.8272959567112581, + "flos": 18996918266880.0, + "grad_norm": 1.6139248234121746, + "language_loss": 0.71018773, + "learning_rate": 3.047114873375161e-07, + "loss": 0.78684926, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09863281, + "step": 13760, + "time_per_iteration": 2.5143585205078125 + }, + { + "auxiliary_loss_clip": 0.06399237, + "auxiliary_loss_mlp": 0.01265407, + "balance_loss_clip": 0.06269812, + "balance_loss_mlp": 0.01256162, + "epoch": 0.827356079963926, + "flos": 20637934554240.0, + "grad_norm": 1.8803974399165198, + "language_loss": 0.78203416, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.85868061, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09240723, + "step": 13761, + "time_per_iteration": 2.4832279682159424 + }, + { + "auxiliary_loss_clip": 0.06399886, + "auxiliary_loss_mlp": 0.01266752, + "balance_loss_clip": 0.06270774, + "balance_loss_mlp": 0.01257627, + "epoch": 0.827416203216594, + "flos": 22422777575040.0, + "grad_norm": 2.196661188611125, + "language_loss": 0.69947863, + "learning_rate": 3.042983464482387e-07, + "loss": 0.77614498, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09124756, + "step": 13762, + "time_per_iteration": 2.522721290588379 + }, + { + "auxiliary_loss_clip": 0.06399816, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06268964, + "balance_loss_mlp": 0.01255082, + "epoch": 0.827476326469262, + "flos": 19032235562880.0, + "grad_norm": 1.792228037314928, + "language_loss": 0.7011888, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.77783084, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09313965, + "step": 13763, + "time_per_iteration": 2.486668825149536 + }, + { + "auxiliary_loss_clip": 0.0631156, + "auxiliary_loss_mlp": 0.01249503, + "balance_loss_clip": 0.06256869, + "balance_loss_mlp": 0.01248406, + "epoch": 0.82753644972193, + "flos": 68520942610560.0, + "grad_norm": 0.817208911394718, + "language_loss": 0.65143663, + "learning_rate": 3.038854627636651e-07, + "loss": 0.7270472, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01098633, + "step": 13764, + "time_per_iteration": 3.1860270500183105 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01265175, + "balance_loss_clip": 0.06270835, + "balance_loss_mlp": 0.01255001, + "epoch": 0.8275965729745979, + "flos": 18411255354240.0, + "grad_norm": 2.1408558147856427, + "language_loss": 0.7802, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.8568753, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10174561, + "step": 13765, + "time_per_iteration": 2.4712343215942383 + }, + { + "auxiliary_loss_clip": 0.06409816, + "auxiliary_loss_mlp": 0.01264455, + "balance_loss_clip": 0.06273708, + "balance_loss_mlp": 0.01254626, + "epoch": 0.8276566962272659, + "flos": 28519625957760.0, + "grad_norm": 1.515558220425856, + "language_loss": 0.62899083, + "learning_rate": 3.034728363464214e-07, + "loss": 0.70573354, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0982666, + "step": 13766, + "time_per_iteration": 2.5880696773529053 + }, + { + "auxiliary_loss_clip": 0.06403887, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06270833, + "balance_loss_mlp": 0.01253523, + "epoch": 0.8277168194799339, + "flos": 20236488842880.0, + "grad_norm": 1.5277982558115004, + "language_loss": 0.82747239, + "learning_rate": 3.03266619632609e-07, + "loss": 0.90414429, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09777832, + "step": 13767, + "time_per_iteration": 3.9617438316345215 + }, + { + "auxiliary_loss_clip": 0.06405637, + "auxiliary_loss_mlp": 0.0126823, + "balance_loss_clip": 0.06271689, + "balance_loss_mlp": 0.01258717, + "epoch": 0.8277769427326018, + "flos": 28484350588800.0, + "grad_norm": 1.4875953854555823, + "language_loss": 0.69132233, + "learning_rate": 3.030604672590964e-07, + "loss": 0.76806098, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09509277, + "step": 13768, + "time_per_iteration": 2.60477876663208 + }, + { + "auxiliary_loss_clip": 0.06398913, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.06269988, + "balance_loss_mlp": 0.01255649, + "epoch": 0.8278370659852698, + "flos": 27204808815360.0, + "grad_norm": 1.7806138521409314, + "language_loss": 0.74606562, + "learning_rate": 3.028543792337006e-07, + "loss": 0.82270265, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 13769, + "time_per_iteration": 2.6588950157165527 + }, + { + "auxiliary_loss_clip": 0.06405737, + "auxiliary_loss_mlp": 0.01267282, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01257, + "epoch": 0.8278971892379378, + "flos": 37825272846720.0, + "grad_norm": 1.8746055345971568, + "language_loss": 0.74295783, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.81968796, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10290527, + "step": 13770, + "time_per_iteration": 4.066660165786743 + }, + { + "auxiliary_loss_clip": 0.06405378, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06270339, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8279573124906058, + "flos": 22565933475840.0, + "grad_norm": 1.7096340379903676, + "language_loss": 0.75903618, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.83574641, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10174561, + "step": 13771, + "time_per_iteration": 2.5009427070617676 + }, + { + "auxiliary_loss_clip": 0.06401806, + "auxiliary_loss_mlp": 0.01267922, + "balance_loss_clip": 0.06269084, + "balance_loss_mlp": 0.01258582, + "epoch": 0.8280174357432737, + "flos": 36073441134720.0, + "grad_norm": 1.4307953664451067, + "language_loss": 0.72807586, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.80477321, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09350586, + "step": 13772, + "time_per_iteration": 2.598695993423462 + }, + { + "auxiliary_loss_clip": 0.06398449, + "auxiliary_loss_mlp": 0.01266732, + "balance_loss_clip": 0.06268763, + "balance_loss_mlp": 0.01256885, + "epoch": 0.8280775589959417, + "flos": 22966834135680.0, + "grad_norm": 2.013252985793075, + "language_loss": 0.74714899, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.8238008, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09838867, + "step": 13773, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.06402349, + "auxiliary_loss_mlp": 0.01264809, + "balance_loss_clip": 0.06272508, + "balance_loss_mlp": 0.01255385, + "epoch": 0.8281376822486096, + "flos": 26069722680960.0, + "grad_norm": 1.7704579459247693, + "language_loss": 0.7591548, + "learning_rate": 3.01824904601915e-07, + "loss": 0.8358264, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09429932, + "step": 13774, + "time_per_iteration": 3.989100694656372 + }, + { + "auxiliary_loss_clip": 0.064128, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.0627373, + "balance_loss_mlp": 0.0125432, + "epoch": 0.8281978055012776, + "flos": 20674048464000.0, + "grad_norm": 1.628782431293184, + "language_loss": 0.74902624, + "learning_rate": 3.01619202829249e-07, + "loss": 0.82579559, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09820557, + "step": 13775, + "time_per_iteration": 2.4677510261535645 + }, + { + "auxiliary_loss_clip": 0.0640965, + "auxiliary_loss_mlp": 0.01264724, + "balance_loss_clip": 0.06271163, + "balance_loss_mlp": 0.01253882, + "epoch": 0.8282579287539455, + "flos": 29323062432000.0, + "grad_norm": 2.180106071080934, + "language_loss": 0.74249536, + "learning_rate": 3.01413565459353e-07, + "loss": 0.81923908, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10845947, + "step": 13776, + "time_per_iteration": 2.6236319541931152 + }, + { + "auxiliary_loss_clip": 0.0640358, + "auxiliary_loss_mlp": 0.01264371, + "balance_loss_clip": 0.06269941, + "balance_loss_mlp": 0.01254655, + "epoch": 0.8283180520066136, + "flos": 15711699237120.0, + "grad_norm": 1.9384324289396857, + "language_loss": 0.77343374, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.85011321, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.097229, + "step": 13777, + "time_per_iteration": 2.456892490386963 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 0.06271677, + "balance_loss_mlp": 0.01256558, + "epoch": 0.8283781752592815, + "flos": 24798566315520.0, + "grad_norm": 1.5185722645753612, + "language_loss": 0.82944041, + "learning_rate": 3.010024839590604e-07, + "loss": 0.90611577, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09191895, + "step": 13778, + "time_per_iteration": 2.5368337631225586 + }, + { + "auxiliary_loss_clip": 0.06397066, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06269608, + "balance_loss_mlp": 0.012553, + "epoch": 0.8284382985119495, + "flos": 18987694318080.0, + "grad_norm": 1.7308701020376125, + "language_loss": 0.74615109, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.82276577, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09100342, + "step": 13779, + "time_per_iteration": 2.4684152603149414 + }, + { + "auxiliary_loss_clip": 0.06314863, + "auxiliary_loss_mlp": 0.01250131, + "balance_loss_clip": 0.06260095, + "balance_loss_mlp": 0.01249052, + "epoch": 0.8284984217646175, + "flos": 61055832579840.0, + "grad_norm": 0.7787786070050955, + "language_loss": 0.56615424, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.64180422, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01081085, + "step": 13780, + "time_per_iteration": 3.151190757751465 + }, + { + "auxiliary_loss_clip": 0.06399573, + "auxiliary_loss_mlp": 0.01265439, + "balance_loss_clip": 0.06268763, + "balance_loss_mlp": 0.01256081, + "epoch": 0.8285585450172854, + "flos": 19719993075840.0, + "grad_norm": 1.6749294614493886, + "language_loss": 0.80124277, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.87789285, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09350586, + "step": 13781, + "time_per_iteration": 2.486316442489624 + }, + { + "auxiliary_loss_clip": 0.06404715, + "auxiliary_loss_mlp": 0.01265372, + "balance_loss_clip": 0.06271574, + "balance_loss_mlp": 0.01254632, + "epoch": 0.8286186682699535, + "flos": 21695258499840.0, + "grad_norm": 1.909161291798896, + "language_loss": 0.76221263, + "learning_rate": 3.001810941346543e-07, + "loss": 0.83891356, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10748291, + "step": 13782, + "time_per_iteration": 2.517943859100342 + }, + { + "auxiliary_loss_clip": 0.06404275, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254212, + "epoch": 0.8286787915226214, + "flos": 25782656192640.0, + "grad_norm": 1.4991404242218924, + "language_loss": 0.76445484, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.84113491, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09527588, + "step": 13783, + "time_per_iteration": 2.521440267562866 + }, + { + "auxiliary_loss_clip": 0.06402531, + "auxiliary_loss_mlp": 0.01266148, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01256873, + "epoch": 0.8287389147752894, + "flos": 21294777110400.0, + "grad_norm": 1.7532816495627446, + "language_loss": 0.74151248, + "learning_rate": 2.997707859351304e-07, + "loss": 0.81819928, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09277344, + "step": 13784, + "time_per_iteration": 2.5014326572418213 + }, + { + "auxiliary_loss_clip": 0.06404807, + "auxiliary_loss_mlp": 0.01266618, + "balance_loss_clip": 0.06268123, + "balance_loss_mlp": 0.01255847, + "epoch": 0.8287990380279573, + "flos": 33552903265920.0, + "grad_norm": 3.27470400867833, + "language_loss": 0.69467115, + "learning_rate": 2.99565728540772e-07, + "loss": 0.77138543, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10778809, + "step": 13785, + "time_per_iteration": 2.601536989212036 + }, + { + "auxiliary_loss_clip": 0.0640759, + "auxiliary_loss_mlp": 0.01266942, + "balance_loss_clip": 0.06274858, + "balance_loss_mlp": 0.0125722, + "epoch": 0.8288591612806253, + "flos": 22972997410560.0, + "grad_norm": 1.427433422724433, + "language_loss": 0.68698609, + "learning_rate": 2.993607356270516e-07, + "loss": 0.76373136, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.097229, + "step": 13786, + "time_per_iteration": 2.547952175140381 + }, + { + "auxiliary_loss_clip": 0.06411159, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06272699, + "balance_loss_mlp": 0.01253648, + "epoch": 0.8289192845332932, + "flos": 18595053285120.0, + "grad_norm": 2.0138458745515635, + "language_loss": 0.77133876, + "learning_rate": 2.991558072017426e-07, + "loss": 0.84808755, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10070801, + "step": 13787, + "time_per_iteration": 2.48760986328125 + }, + { + "auxiliary_loss_clip": 0.06400535, + "auxiliary_loss_mlp": 0.01266768, + "balance_loss_clip": 0.06270656, + "balance_loss_mlp": 0.01257053, + "epoch": 0.8289794077859612, + "flos": 15455841194880.0, + "grad_norm": 1.5818802638105176, + "language_loss": 0.80619884, + "learning_rate": 2.989509432726163e-07, + "loss": 0.88287187, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09710693, + "step": 13788, + "time_per_iteration": 2.506680488586426 + }, + { + "auxiliary_loss_clip": 0.0640239, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.062705, + "balance_loss_mlp": 0.01252935, + "epoch": 0.8290395310386292, + "flos": 28885628592000.0, + "grad_norm": 1.4921693552910416, + "language_loss": 0.71268535, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.78933835, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09973145, + "step": 13789, + "time_per_iteration": 2.559659719467163 + }, + { + "auxiliary_loss_clip": 0.06403467, + "auxiliary_loss_mlp": 0.01265989, + "balance_loss_clip": 0.06268575, + "balance_loss_mlp": 0.01255796, + "epoch": 0.8290996542912972, + "flos": 36585324927360.0, + "grad_norm": 1.757152625782574, + "language_loss": 0.68272877, + "learning_rate": 2.985414089339813e-07, + "loss": 0.75942338, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10198975, + "step": 13790, + "time_per_iteration": 2.6251883506774902 + }, + { + "auxiliary_loss_clip": 0.06406529, + "auxiliary_loss_mlp": 0.01270326, + "balance_loss_clip": 0.06272461, + "balance_loss_mlp": 0.01259448, + "epoch": 0.8291597775439651, + "flos": 23629756112640.0, + "grad_norm": 1.6234366506097078, + "language_loss": 0.77228737, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.84905589, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10876465, + "step": 13791, + "time_per_iteration": 2.501948356628418 + }, + { + "auxiliary_loss_clip": 0.0639832, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.06269881, + "balance_loss_mlp": 0.0125507, + "epoch": 0.8292199007966331, + "flos": 21403873526400.0, + "grad_norm": 1.4641764539166646, + "language_loss": 0.7021268, + "learning_rate": 2.981321326732651e-07, + "loss": 0.77875602, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09533691, + "step": 13792, + "time_per_iteration": 2.4955878257751465 + }, + { + "auxiliary_loss_clip": 0.06403746, + "auxiliary_loss_mlp": 0.01262629, + "balance_loss_clip": 0.06269513, + "balance_loss_mlp": 0.01253051, + "epoch": 0.829280024049301, + "flos": 28775232437760.0, + "grad_norm": 1.4298994778553897, + "language_loss": 0.65538836, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.73205209, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09576416, + "step": 13793, + "time_per_iteration": 2.6276164054870605 + }, + { + "auxiliary_loss_clip": 0.06406765, + "auxiliary_loss_mlp": 0.01265372, + "balance_loss_clip": 0.06271418, + "balance_loss_mlp": 0.01254375, + "epoch": 0.829340147301969, + "flos": 19944223401600.0, + "grad_norm": 1.8265320285164077, + "language_loss": 0.66246361, + "learning_rate": 2.977231145525461e-07, + "loss": 0.73918492, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10992432, + "step": 13794, + "time_per_iteration": 2.5835254192352295 + }, + { + "auxiliary_loss_clip": 0.06403525, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06269032, + "balance_loss_mlp": 0.01253224, + "epoch": 0.829400270554637, + "flos": 25235622812160.0, + "grad_norm": 1.749339694321301, + "language_loss": 0.6647079, + "learning_rate": 2.975187023140757e-07, + "loss": 0.74137801, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10253906, + "step": 13795, + "time_per_iteration": 2.550981044769287 + }, + { + "auxiliary_loss_clip": 0.06396833, + "auxiliary_loss_mlp": 0.01263528, + "balance_loss_clip": 0.06271346, + "balance_loss_mlp": 0.01254325, + "epoch": 0.829460393807305, + "flos": 24470690088960.0, + "grad_norm": 1.6723308404898531, + "language_loss": 0.66547108, + "learning_rate": 2.973143546338661e-07, + "loss": 0.74207467, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.09197998, + "step": 13796, + "time_per_iteration": 3.9565439224243164 + }, + { + "auxiliary_loss_clip": 0.06399691, + "auxiliary_loss_mlp": 0.01264289, + "balance_loss_clip": 0.06269552, + "balance_loss_mlp": 0.01254955, + "epoch": 0.829520517059973, + "flos": 15127923041280.0, + "grad_norm": 1.5185455706473978, + "language_loss": 0.7187897, + "learning_rate": 2.971100715196666e-07, + "loss": 0.79542947, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09338379, + "step": 13797, + "time_per_iteration": 2.4948043823242188 + }, + { + "auxiliary_loss_clip": 0.06402339, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 0.06269293, + "balance_loss_mlp": 0.01255086, + "epoch": 0.8295806403126409, + "flos": 21586413646080.0, + "grad_norm": 2.404757591111986, + "language_loss": 0.7246393, + "learning_rate": 2.969058529792243e-07, + "loss": 0.80130923, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09576416, + "step": 13798, + "time_per_iteration": 2.4797022342681885 + }, + { + "auxiliary_loss_clip": 0.06397392, + "auxiliary_loss_mlp": 0.01265773, + "balance_loss_clip": 0.06269975, + "balance_loss_mlp": 0.01256153, + "epoch": 0.8296407635653089, + "flos": 21733133345280.0, + "grad_norm": 1.6550926081962973, + "language_loss": 0.76771939, + "learning_rate": 2.967016990202822e-07, + "loss": 0.84435105, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.09613037, + "step": 13799, + "time_per_iteration": 2.555518865585327 + }, + { + "auxiliary_loss_clip": 0.06404122, + "auxiliary_loss_mlp": 0.01265719, + "balance_loss_clip": 0.0627386, + "balance_loss_mlp": 0.01255861, + "epoch": 0.8297008868179768, + "flos": 11185777601280.0, + "grad_norm": 2.1813399594174707, + "language_loss": 0.67236793, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.74906635, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09844971, + "step": 13800, + "time_per_iteration": 2.4783506393432617 + }, + { + "auxiliary_loss_clip": 0.06410688, + "auxiliary_loss_mlp": 0.01267608, + "balance_loss_clip": 0.06274987, + "balance_loss_mlp": 0.01257475, + "epoch": 0.8297610100706448, + "flos": 20669688051840.0, + "grad_norm": 1.7037177836560289, + "language_loss": 0.74784625, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.82462925, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10125732, + "step": 13801, + "time_per_iteration": 2.5596258640289307 + }, + { + "auxiliary_loss_clip": 0.0640378, + "auxiliary_loss_mlp": 0.01262459, + "balance_loss_clip": 0.06269964, + "balance_loss_mlp": 0.01253578, + "epoch": 0.8298211333233128, + "flos": 20382621563520.0, + "grad_norm": 1.588003382045365, + "language_loss": 0.73570353, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.81236589, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08880615, + "step": 13802, + "time_per_iteration": 2.496119260787964 + }, + { + "auxiliary_loss_clip": 0.06403106, + "auxiliary_loss_mlp": 0.01264947, + "balance_loss_clip": 0.06270137, + "balance_loss_mlp": 0.01255946, + "epoch": 0.8298812565759808, + "flos": 21515401710720.0, + "grad_norm": 2.0519420047620183, + "language_loss": 0.7494061, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.82608664, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09002686, + "step": 13803, + "time_per_iteration": 2.6064913272857666 + }, + { + "auxiliary_loss_clip": 0.06402676, + "auxiliary_loss_mlp": 0.01265284, + "balance_loss_clip": 0.0627119, + "balance_loss_mlp": 0.01255806, + "epoch": 0.8299413798286487, + "flos": 22825019900160.0, + "grad_norm": 1.5242051417957505, + "language_loss": 0.79350966, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.87018931, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09484863, + "step": 13804, + "time_per_iteration": 2.505054235458374 + }, + { + "auxiliary_loss_clip": 0.06401961, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06270176, + "balance_loss_mlp": 0.01258748, + "epoch": 0.8300015030813167, + "flos": 29686884860160.0, + "grad_norm": 1.6213005522916255, + "language_loss": 0.73804402, + "learning_rate": 2.954781319115016e-07, + "loss": 0.81473756, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0864563, + "step": 13805, + "time_per_iteration": 2.5898725986480713 + }, + { + "auxiliary_loss_clip": 0.06408954, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06273445, + "balance_loss_mlp": 0.01256729, + "epoch": 0.8300616263339846, + "flos": 19725653226240.0, + "grad_norm": 2.0487162307072637, + "language_loss": 0.7747584, + "learning_rate": 2.952744302396906e-07, + "loss": 0.85151076, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09558105, + "step": 13806, + "time_per_iteration": 3.945065975189209 + }, + { + "auxiliary_loss_clip": 0.06408199, + "auxiliary_loss_mlp": 0.01269037, + "balance_loss_clip": 0.06272151, + "balance_loss_mlp": 0.01258612, + "epoch": 0.8301217495866526, + "flos": 19908151418880.0, + "grad_norm": 1.6678953757169233, + "language_loss": 0.6362474, + "learning_rate": 2.950707932112444e-07, + "loss": 0.71301973, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10424805, + "step": 13807, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.06403744, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06271553, + "balance_loss_mlp": 0.01254712, + "epoch": 0.8301818728393207, + "flos": 19721334741120.0, + "grad_norm": 1.7549844688218141, + "language_loss": 0.73209536, + "learning_rate": 2.948672208338847e-07, + "loss": 0.80878258, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.1026001, + "step": 13808, + "time_per_iteration": 2.5253429412841797 + }, + { + "auxiliary_loss_clip": 0.06410588, + "auxiliary_loss_mlp": 0.01271132, + "balance_loss_clip": 0.06272304, + "balance_loss_mlp": 0.01259962, + "epoch": 0.8302419960919886, + "flos": 28301265417600.0, + "grad_norm": 1.9399976077342271, + "language_loss": 0.66693079, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.74374801, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.11169434, + "step": 13809, + "time_per_iteration": 2.5805299282073975 + }, + { + "auxiliary_loss_clip": 0.06404272, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.0125543, + "epoch": 0.8303021193446566, + "flos": 18229344140160.0, + "grad_norm": 2.150755697017939, + "language_loss": 0.74353659, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.82022887, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09515381, + "step": 13810, + "time_per_iteration": 4.044435739517212 + }, + { + "auxiliary_loss_clip": 0.06400876, + "auxiliary_loss_mlp": 0.01262766, + "balance_loss_clip": 0.06271921, + "balance_loss_mlp": 0.01253873, + "epoch": 0.8303622425973245, + "flos": 23117956174080.0, + "grad_norm": 1.448926431854177, + "language_loss": 0.80966514, + "learning_rate": 2.94256891685505e-07, + "loss": 0.88630158, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08886719, + "step": 13811, + "time_per_iteration": 2.5290420055389404 + }, + { + "auxiliary_loss_clip": 0.06407966, + "auxiliary_loss_mlp": 0.01264465, + "balance_loss_clip": 0.06273555, + "balance_loss_mlp": 0.0125503, + "epoch": 0.8304223658499925, + "flos": 19578891600000.0, + "grad_norm": 1.6908085329827338, + "language_loss": 0.73443186, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.81115615, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09442139, + "step": 13812, + "time_per_iteration": 2.568941593170166 + }, + { + "auxiliary_loss_clip": 0.06397095, + "auxiliary_loss_mlp": 0.01262647, + "balance_loss_clip": 0.06270333, + "balance_loss_mlp": 0.0125311, + "epoch": 0.8304824891026604, + "flos": 24433066805760.0, + "grad_norm": 1.5937291888664733, + "language_loss": 0.78513122, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.86172867, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09539795, + "step": 13813, + "time_per_iteration": 3.97314715385437 + }, + { + "auxiliary_loss_clip": 0.06403156, + "auxiliary_loss_mlp": 0.01268699, + "balance_loss_clip": 0.06268767, + "balance_loss_mlp": 0.01259037, + "epoch": 0.8305426123553284, + "flos": 22388214965760.0, + "grad_norm": 2.2493046221779154, + "language_loss": 0.70725965, + "learning_rate": 2.93647144674658e-07, + "loss": 0.78397816, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09661865, + "step": 13814, + "time_per_iteration": 2.4843966960906982 + }, + { + "auxiliary_loss_clip": 0.06417993, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06274544, + "balance_loss_mlp": 0.0125395, + "epoch": 0.8306027356079964, + "flos": 14908975522560.0, + "grad_norm": 1.9454896448298435, + "language_loss": 0.68298322, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.75981534, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.112854, + "step": 13815, + "time_per_iteration": 2.46174955368042 + }, + { + "auxiliary_loss_clip": 0.06407799, + "auxiliary_loss_mlp": 0.01266189, + "balance_loss_clip": 0.06275922, + "balance_loss_mlp": 0.01256813, + "epoch": 0.8306628588606644, + "flos": 19650406659840.0, + "grad_norm": 1.8213318920984873, + "language_loss": 0.75822055, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.83496046, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09375, + "step": 13816, + "time_per_iteration": 2.4854626655578613 + }, + { + "auxiliary_loss_clip": 0.0640255, + "auxiliary_loss_mlp": 0.01267592, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01257877, + "epoch": 0.8307229821133323, + "flos": 24396701333760.0, + "grad_norm": 1.6852177652556903, + "language_loss": 0.81272721, + "learning_rate": 2.930379800094371e-07, + "loss": 0.88942862, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09716797, + "step": 13817, + "time_per_iteration": 2.5231449604034424 + }, + { + "auxiliary_loss_clip": 0.06404524, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.0125544, + "epoch": 0.8307831053660003, + "flos": 21003392136960.0, + "grad_norm": 8.152901765268279, + "language_loss": 0.78097743, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.85767841, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10137939, + "step": 13818, + "time_per_iteration": 2.485922336578369 + }, + { + "auxiliary_loss_clip": 0.06407157, + "auxiliary_loss_mlp": 0.01267645, + "balance_loss_clip": 0.06273211, + "balance_loss_mlp": 0.01258055, + "epoch": 0.8308432286186682, + "flos": 21403663891200.0, + "grad_norm": 1.7425405604946866, + "language_loss": 0.81941187, + "learning_rate": 2.926321938606453e-07, + "loss": 0.89615989, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.0958252, + "step": 13819, + "time_per_iteration": 2.502380609512329 + }, + { + "auxiliary_loss_clip": 0.06311036, + "auxiliary_loss_mlp": 0.01249, + "balance_loss_clip": 0.06256118, + "balance_loss_mlp": 0.01247877, + "epoch": 0.8309033518713362, + "flos": 62549724625920.0, + "grad_norm": 0.7595557497085774, + "language_loss": 0.56252456, + "learning_rate": 2.924293978977399e-07, + "loss": 0.63812494, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01125336, + "step": 13820, + "time_per_iteration": 3.130770206451416 + }, + { + "auxiliary_loss_clip": 0.06398563, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 0.06269278, + "balance_loss_mlp": 0.0125549, + "epoch": 0.8309634751240043, + "flos": 16984155340800.0, + "grad_norm": 2.741466528675375, + "language_loss": 0.68642658, + "learning_rate": 2.922266666860831e-07, + "loss": 0.76306164, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09460449, + "step": 13821, + "time_per_iteration": 2.4699923992156982 + }, + { + "auxiliary_loss_clip": 0.06413125, + "auxiliary_loss_mlp": 0.01271837, + "balance_loss_clip": 0.06274682, + "balance_loss_mlp": 0.01261227, + "epoch": 0.8310235983766722, + "flos": 22681067385600.0, + "grad_norm": 2.0343002066143656, + "language_loss": 0.69761801, + "learning_rate": 2.920240002333625e-07, + "loss": 0.77446771, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10614014, + "step": 13822, + "time_per_iteration": 2.5079588890075684 + }, + { + "auxiliary_loss_clip": 0.06400213, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 0.06271067, + "balance_loss_mlp": 0.01256657, + "epoch": 0.8310837216293402, + "flos": 30819539226240.0, + "grad_norm": 1.7328336243228404, + "language_loss": 0.62461919, + "learning_rate": 2.918213985472631e-07, + "loss": 0.70128155, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09362793, + "step": 13823, + "time_per_iteration": 2.5789008140563965 + }, + { + "auxiliary_loss_clip": 0.06309561, + "auxiliary_loss_mlp": 0.01248333, + "balance_loss_clip": 0.06254762, + "balance_loss_mlp": 0.01247223, + "epoch": 0.8311438448820081, + "flos": 71297338521600.0, + "grad_norm": 1.1093680468899019, + "language_loss": 0.61912626, + "learning_rate": 2.916188616354669e-07, + "loss": 0.69470519, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01111603, + "step": 13824, + "time_per_iteration": 3.2229104042053223 + }, + { + "auxiliary_loss_clip": 0.06403864, + "auxiliary_loss_mlp": 0.01264891, + "balance_loss_clip": 0.06271043, + "balance_loss_mlp": 0.01255312, + "epoch": 0.8312039681346761, + "flos": 20893457180160.0, + "grad_norm": 1.4744362315601292, + "language_loss": 0.74351555, + "learning_rate": 2.914163895056552e-07, + "loss": 0.82020307, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09576416, + "step": 13825, + "time_per_iteration": 2.517179250717163 + }, + { + "auxiliary_loss_clip": 0.06408161, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01255022, + "epoch": 0.831264091387344, + "flos": 17022910654080.0, + "grad_norm": 2.2747419309497454, + "language_loss": 0.80132711, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.87806225, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10333252, + "step": 13826, + "time_per_iteration": 2.525205612182617 + }, + { + "auxiliary_loss_clip": 0.06405398, + "auxiliary_loss_mlp": 0.01267526, + "balance_loss_clip": 0.06271683, + "balance_loss_mlp": 0.01257417, + "epoch": 0.831324214640012, + "flos": 24425436084480.0, + "grad_norm": 1.5111655704985965, + "language_loss": 0.68116403, + "learning_rate": 2.910116396226914e-07, + "loss": 0.75789326, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10101318, + "step": 13827, + "time_per_iteration": 2.5607199668884277 + }, + { + "auxiliary_loss_clip": 0.06401044, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06268896, + "balance_loss_mlp": 0.01257164, + "epoch": 0.83138433789268, + "flos": 13549407500160.0, + "grad_norm": 1.7373805058539677, + "language_loss": 0.74242985, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.81909966, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08770752, + "step": 13828, + "time_per_iteration": 2.5458273887634277 + }, + { + "auxiliary_loss_clip": 0.06403871, + "auxiliary_loss_mlp": 0.01267016, + "balance_loss_clip": 0.06269043, + "balance_loss_mlp": 0.01257461, + "epoch": 0.831444461145348, + "flos": 44502543262080.0, + "grad_norm": 1.528950624080937, + "language_loss": 0.67366755, + "learning_rate": 2.906071489597657e-07, + "loss": 0.75037646, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09552002, + "step": 13829, + "time_per_iteration": 2.7112882137298584 + }, + { + "auxiliary_loss_clip": 0.06407791, + "auxiliary_loss_mlp": 0.0126509, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01255685, + "epoch": 0.8315045843980159, + "flos": 22710640677120.0, + "grad_norm": 1.4737259193269003, + "language_loss": 0.83000511, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.90673393, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09411621, + "step": 13830, + "time_per_iteration": 2.552797794342041 + }, + { + "auxiliary_loss_clip": 0.06401931, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.06269692, + "balance_loss_mlp": 0.01257621, + "epoch": 0.8315647076506839, + "flos": 16879167774720.0, + "grad_norm": 2.538750938791545, + "language_loss": 0.74429476, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.8209852, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0949707, + "step": 13831, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.06402907, + "auxiliary_loss_mlp": 0.01264485, + "balance_loss_clip": 0.06268609, + "balance_loss_mlp": 0.01254209, + "epoch": 0.8316248309033518, + "flos": 13813902512640.0, + "grad_norm": 1.6232172408700758, + "language_loss": 0.71379286, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.79046679, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10272217, + "step": 13832, + "time_per_iteration": 2.503822088241577 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01266599, + "balance_loss_clip": 0.06269842, + "balance_loss_mlp": 0.01256532, + "epoch": 0.8316849541560198, + "flos": 23519066469120.0, + "grad_norm": 1.7239960485103385, + "language_loss": 0.84317935, + "learning_rate": 2.897989455393979e-07, + "loss": 0.91986877, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10076904, + "step": 13833, + "time_per_iteration": 2.5225701332092285 + }, + { + "auxiliary_loss_clip": 0.06408376, + "auxiliary_loss_mlp": 0.01269207, + "balance_loss_clip": 0.06272519, + "balance_loss_mlp": 0.01258955, + "epoch": 0.8317450774086879, + "flos": 23778530236800.0, + "grad_norm": 1.4639374420943632, + "language_loss": 0.76301664, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.83979249, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10253906, + "step": 13834, + "time_per_iteration": 2.6075844764709473 + }, + { + "auxiliary_loss_clip": 0.06396806, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06267913, + "balance_loss_mlp": 0.01256846, + "epoch": 0.8318052006613558, + "flos": 16220899699200.0, + "grad_norm": 2.069589955376862, + "language_loss": 0.79849654, + "learning_rate": 2.893952329045459e-07, + "loss": 0.87512666, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09356689, + "step": 13835, + "time_per_iteration": 3.9197564125061035 + }, + { + "auxiliary_loss_clip": 0.06407574, + "auxiliary_loss_mlp": 0.01269404, + "balance_loss_clip": 0.06272114, + "balance_loss_mlp": 0.01258651, + "epoch": 0.8318653239140238, + "flos": 19980714654720.0, + "grad_norm": 1.9805915742571252, + "language_loss": 0.81482506, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.89159477, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10760498, + "step": 13836, + "time_per_iteration": 2.506603240966797 + }, + { + "auxiliary_loss_clip": 0.06401465, + "auxiliary_loss_mlp": 0.01266316, + "balance_loss_clip": 0.06271641, + "balance_loss_mlp": 0.01257041, + "epoch": 0.8319254471666917, + "flos": 17709200720640.0, + "grad_norm": 1.8870445084181289, + "language_loss": 0.77578962, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.85246742, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09277344, + "step": 13837, + "time_per_iteration": 2.454270839691162 + }, + { + "auxiliary_loss_clip": 0.06408006, + "auxiliary_loss_mlp": 0.01264544, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01253749, + "epoch": 0.8319855704193597, + "flos": 19542609982080.0, + "grad_norm": 2.110524167983125, + "language_loss": 0.8394767, + "learning_rate": 2.887901504686685e-07, + "loss": 0.91620213, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10791016, + "step": 13838, + "time_per_iteration": 2.5159199237823486 + }, + { + "auxiliary_loss_clip": 0.06400914, + "auxiliary_loss_mlp": 0.0126734, + "balance_loss_clip": 0.06270094, + "balance_loss_mlp": 0.01257595, + "epoch": 0.8320456936720276, + "flos": 21184339029120.0, + "grad_norm": 1.9156833366254606, + "language_loss": 0.74626046, + "learning_rate": 2.885885860916795e-07, + "loss": 0.82294297, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09753418, + "step": 13839, + "time_per_iteration": 2.491990327835083 + }, + { + "auxiliary_loss_clip": 0.06401457, + "auxiliary_loss_mlp": 0.01267004, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01256871, + "epoch": 0.8321058169246957, + "flos": 33258499545600.0, + "grad_norm": 1.3285467240980675, + "language_loss": 0.6792466, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.7559312, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.10125732, + "step": 13840, + "time_per_iteration": 2.6044259071350098 + }, + { + "auxiliary_loss_clip": 0.06402262, + "auxiliary_loss_mlp": 0.01264716, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01254864, + "epoch": 0.8321659401773636, + "flos": 14213042236800.0, + "grad_norm": 4.021967682846655, + "language_loss": 0.79046482, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.86713463, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09857178, + "step": 13841, + "time_per_iteration": 2.504321813583374 + }, + { + "auxiliary_loss_clip": 0.06404769, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.06273419, + "balance_loss_mlp": 0.01255116, + "epoch": 0.8322260634300316, + "flos": 15163575753600.0, + "grad_norm": 1.790117375766772, + "language_loss": 0.6903125, + "learning_rate": 2.879842823726262e-07, + "loss": 0.7670114, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10003662, + "step": 13842, + "time_per_iteration": 2.604609727859497 + }, + { + "auxiliary_loss_clip": 0.06401818, + "auxiliary_loss_mlp": 0.0126278, + "balance_loss_clip": 0.06271365, + "balance_loss_mlp": 0.0125335, + "epoch": 0.8322861866826995, + "flos": 25307766777600.0, + "grad_norm": 1.5988272572181073, + "language_loss": 0.7293849, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.80603087, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09429932, + "step": 13843, + "time_per_iteration": 2.5483405590057373 + }, + { + "auxiliary_loss_clip": 0.06402604, + "auxiliary_loss_mlp": 0.01265654, + "balance_loss_clip": 0.06272865, + "balance_loss_mlp": 0.012557, + "epoch": 0.8323463099353675, + "flos": 17025048933120.0, + "grad_norm": 1.8400123235458858, + "language_loss": 0.77913845, + "learning_rate": 2.875817378128975e-07, + "loss": 0.85582101, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09960938, + "step": 13844, + "time_per_iteration": 2.4670820236206055 + }, + { + "auxiliary_loss_clip": 0.06305504, + "auxiliary_loss_mlp": 0.01249937, + "balance_loss_clip": 0.06250882, + "balance_loss_mlp": 0.01248978, + "epoch": 0.8324064331880354, + "flos": 55623891473280.0, + "grad_norm": 0.7656518325639754, + "language_loss": 0.55256236, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.62811679, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00958252, + "step": 13845, + "time_per_iteration": 3.0772175788879395 + }, + { + "auxiliary_loss_clip": 0.06407619, + "auxiliary_loss_mlp": 0.01269902, + "balance_loss_clip": 0.06272799, + "balance_loss_mlp": 0.01259436, + "epoch": 0.8324665564407034, + "flos": 26145472371840.0, + "grad_norm": 1.647375417376456, + "language_loss": 0.75653505, + "learning_rate": 2.871794529934555e-07, + "loss": 0.83331025, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10467529, + "step": 13846, + "time_per_iteration": 3.9581072330474854 + }, + { + "auxiliary_loss_clip": 0.06408981, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.0627064, + "balance_loss_mlp": 0.01253064, + "epoch": 0.8325266796933715, + "flos": 22054846296960.0, + "grad_norm": 1.6287665885130769, + "language_loss": 0.79051238, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.86723363, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10083008, + "step": 13847, + "time_per_iteration": 2.5202043056488037 + }, + { + "auxiliary_loss_clip": 0.06398055, + "auxiliary_loss_mlp": 0.01262414, + "balance_loss_clip": 0.06268965, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8325868029460394, + "flos": 22822630058880.0, + "grad_norm": 1.530549975631268, + "language_loss": 0.74613917, + "learning_rate": 2.867774279753175e-07, + "loss": 0.82274389, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08660889, + "step": 13848, + "time_per_iteration": 2.4909098148345947 + }, + { + "auxiliary_loss_clip": 0.06400839, + "auxiliary_loss_mlp": 0.01264258, + "balance_loss_clip": 0.06268533, + "balance_loss_mlp": 0.01254698, + "epoch": 0.8326469261987074, + "flos": 14762800874880.0, + "grad_norm": 1.7394702497172616, + "language_loss": 0.63918781, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.71583879, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09552002, + "step": 13849, + "time_per_iteration": 3.910769462585449 + }, + { + "auxiliary_loss_clip": 0.06406453, + "auxiliary_loss_mlp": 0.01263072, + "balance_loss_clip": 0.06271137, + "balance_loss_mlp": 0.01253517, + "epoch": 0.8327070494513753, + "flos": 22932145745280.0, + "grad_norm": 2.1227901634386503, + "language_loss": 0.80123168, + "learning_rate": 2.863756628194638e-07, + "loss": 0.87792695, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09558105, + "step": 13850, + "time_per_iteration": 2.566984176635742 + }, + { + "auxiliary_loss_clip": 0.06396942, + "auxiliary_loss_mlp": 0.01264144, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01255215, + "epoch": 0.8327671727040433, + "flos": 20671197425280.0, + "grad_norm": 1.4808337562018643, + "language_loss": 0.7818718, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.85848272, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08929443, + "step": 13851, + "time_per_iteration": 2.5099880695343018 + }, + { + "auxiliary_loss_clip": 0.06312843, + "auxiliary_loss_mlp": 0.01249612, + "balance_loss_clip": 0.0625807, + "balance_loss_mlp": 0.01248607, + "epoch": 0.8328272959567112, + "flos": 56079353940480.0, + "grad_norm": 0.7536621450911318, + "language_loss": 0.55871034, + "learning_rate": 2.859741575868344e-07, + "loss": 0.63433486, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01004028, + "step": 13852, + "time_per_iteration": 4.512012481689453 + }, + { + "auxiliary_loss_clip": 0.06398302, + "auxiliary_loss_mlp": 0.01263734, + "balance_loss_clip": 0.06268968, + "balance_loss_mlp": 0.01254489, + "epoch": 0.8328874192093793, + "flos": 32310691286400.0, + "grad_norm": 1.6000652878279704, + "language_loss": 0.67475963, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.75137997, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09234619, + "step": 13853, + "time_per_iteration": 2.668245792388916 + }, + { + "auxiliary_loss_clip": 0.06401832, + "auxiliary_loss_mlp": 0.01265209, + "balance_loss_clip": 0.06269578, + "balance_loss_mlp": 0.01256078, + "epoch": 0.8329475424620472, + "flos": 23519276104320.0, + "grad_norm": 1.4809556144890181, + "language_loss": 0.78642273, + "learning_rate": 2.855729123383286e-07, + "loss": 0.86309314, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09124756, + "step": 13854, + "time_per_iteration": 2.5354175567626953 + }, + { + "auxiliary_loss_clip": 0.06309453, + "auxiliary_loss_mlp": 0.0124937, + "balance_loss_clip": 0.06254782, + "balance_loss_mlp": 0.01248336, + "epoch": 0.8330076657147152, + "flos": 67860410474880.0, + "grad_norm": 4.455324750963288, + "language_loss": 0.58546513, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.66105336, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01035309, + "step": 13855, + "time_per_iteration": 3.0676519870758057 + }, + { + "auxiliary_loss_clip": 0.06402063, + "auxiliary_loss_mlp": 0.01267065, + "balance_loss_clip": 0.06270753, + "balance_loss_mlp": 0.01257623, + "epoch": 0.8330677889673831, + "flos": 22899344071680.0, + "grad_norm": 1.6055070221032506, + "language_loss": 0.72260499, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.79929626, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09442139, + "step": 13856, + "time_per_iteration": 2.50966477394104 + }, + { + "auxiliary_loss_clip": 0.06400804, + "auxiliary_loss_mlp": 0.01263391, + "balance_loss_clip": 0.06269211, + "balance_loss_mlp": 0.01254302, + "epoch": 0.8331279122200511, + "flos": 27352492836480.0, + "grad_norm": 1.6328456842097132, + "language_loss": 0.75703955, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.83368158, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09088135, + "step": 13857, + "time_per_iteration": 2.582209348678589 + }, + { + "auxiliary_loss_clip": 0.06396064, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.06270817, + "balance_loss_mlp": 0.01256412, + "epoch": 0.833188035472719, + "flos": 19944349182720.0, + "grad_norm": 1.4115605365578703, + "language_loss": 0.73776948, + "learning_rate": 2.847712020370958e-07, + "loss": 0.81437761, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08343506, + "step": 13858, + "time_per_iteration": 2.4927241802215576 + }, + { + "auxiliary_loss_clip": 0.06408291, + "auxiliary_loss_mlp": 0.01263657, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01253077, + "epoch": 0.833248158725387, + "flos": 15238193414400.0, + "grad_norm": 1.8777327656699931, + "language_loss": 0.73586631, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.81258577, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10577393, + "step": 13859, + "time_per_iteration": 2.486278772354126 + }, + { + "auxiliary_loss_clip": 0.06396432, + "auxiliary_loss_mlp": 0.01261836, + "balance_loss_clip": 0.06268351, + "balance_loss_mlp": 0.01253068, + "epoch": 0.8333082819780551, + "flos": 24542498638080.0, + "grad_norm": 1.8006360517161475, + "language_loss": 0.79587913, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.8724618, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08758545, + "step": 13860, + "time_per_iteration": 2.5347378253936768 + }, + { + "auxiliary_loss_clip": 0.06397815, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06269313, + "balance_loss_mlp": 0.01253587, + "epoch": 0.833368405230723, + "flos": 31475459387520.0, + "grad_norm": 1.331184598111947, + "language_loss": 0.82059163, + "learning_rate": 2.841706022218644e-07, + "loss": 0.8972038, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09814453, + "step": 13861, + "time_per_iteration": 2.596620798110962 + }, + { + "auxiliary_loss_clip": 0.06403266, + "auxiliary_loss_mlp": 0.01263304, + "balance_loss_clip": 0.06269847, + "balance_loss_mlp": 0.01253612, + "epoch": 0.833428528483391, + "flos": 14907969273600.0, + "grad_norm": 2.0930392556912447, + "language_loss": 0.79152417, + "learning_rate": 2.839705324021806e-07, + "loss": 0.86818981, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09686279, + "step": 13862, + "time_per_iteration": 2.4472010135650635 + }, + { + "auxiliary_loss_clip": 0.06405707, + "auxiliary_loss_mlp": 0.01262183, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.0125279, + "epoch": 0.8334886517360589, + "flos": 22206303751680.0, + "grad_norm": 1.6507722224166845, + "language_loss": 0.74980801, + "learning_rate": 2.83770527654505e-07, + "loss": 0.82648689, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09399414, + "step": 13863, + "time_per_iteration": 2.494450569152832 + }, + { + "auxiliary_loss_clip": 0.06399452, + "auxiliary_loss_mlp": 0.01266138, + "balance_loss_clip": 0.06272257, + "balance_loss_mlp": 0.01256995, + "epoch": 0.8335487749887269, + "flos": 30380386377600.0, + "grad_norm": 5.135787436980748, + "language_loss": 0.74829161, + "learning_rate": 2.835705879864232e-07, + "loss": 0.82494748, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09136963, + "step": 13864, + "time_per_iteration": 2.5583794116973877 + }, + { + "auxiliary_loss_clip": 0.06403541, + "auxiliary_loss_mlp": 0.01261902, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01252132, + "epoch": 0.8336088982413948, + "flos": 24688086307200.0, + "grad_norm": 1.7939101265667057, + "language_loss": 0.69765973, + "learning_rate": 2.833707134055168e-07, + "loss": 0.77431417, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 13865, + "time_per_iteration": 2.534938097000122 + }, + { + "auxiliary_loss_clip": 0.06400782, + "auxiliary_loss_mlp": 0.01264858, + "balance_loss_clip": 0.06268555, + "balance_loss_mlp": 0.01254814, + "epoch": 0.8336690214940629, + "flos": 38185783038720.0, + "grad_norm": 1.4964179575406336, + "language_loss": 0.75587916, + "learning_rate": 2.831709039193653e-07, + "loss": 0.83253551, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10046387, + "step": 13866, + "time_per_iteration": 2.6298201084136963 + }, + { + "auxiliary_loss_clip": 0.06310411, + "auxiliary_loss_mlp": 0.01251665, + "balance_loss_clip": 0.06255429, + "balance_loss_mlp": 0.01250576, + "epoch": 0.8337291447467308, + "flos": 55580062988160.0, + "grad_norm": 0.8509039314990504, + "language_loss": 0.6284281, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.70404887, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01091003, + "step": 13867, + "time_per_iteration": 3.0660109519958496 + }, + { + "auxiliary_loss_clip": 0.06398972, + "auxiliary_loss_mlp": 0.01264557, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01255515, + "epoch": 0.8337892679993988, + "flos": 24140340167040.0, + "grad_norm": 1.8547798231476953, + "language_loss": 0.72195852, + "learning_rate": 2.827714802616301e-07, + "loss": 0.79859376, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09039307, + "step": 13868, + "time_per_iteration": 2.5227153301239014 + }, + { + "auxiliary_loss_clip": 0.06403849, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.06272048, + "balance_loss_mlp": 0.01255406, + "epoch": 0.8338493912520667, + "flos": 28191456241920.0, + "grad_norm": 1.3524554239509516, + "language_loss": 0.8040902, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.88077855, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0958252, + "step": 13869, + "time_per_iteration": 2.564680814743042 + }, + { + "auxiliary_loss_clip": 0.06403009, + "auxiliary_loss_mlp": 0.01267319, + "balance_loss_clip": 0.06271793, + "balance_loss_mlp": 0.01257818, + "epoch": 0.8339095145047347, + "flos": 22163984640000.0, + "grad_norm": 1.4875129545200938, + "language_loss": 0.82728314, + "learning_rate": 2.823723170738028e-07, + "loss": 0.90398633, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09515381, + "step": 13870, + "time_per_iteration": 2.5508410930633545 + }, + { + "auxiliary_loss_clip": 0.06403069, + "auxiliary_loss_mlp": 0.01263716, + "balance_loss_clip": 0.0626779, + "balance_loss_mlp": 0.01253732, + "epoch": 0.8339696377574026, + "flos": 17312157348480.0, + "grad_norm": 2.4426569314724897, + "language_loss": 0.70744705, + "learning_rate": 2.821728331750264e-07, + "loss": 0.78411496, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09985352, + "step": 13871, + "time_per_iteration": 2.4675514698028564 + }, + { + "auxiliary_loss_clip": 0.06398696, + "auxiliary_loss_mlp": 0.01268514, + "balance_loss_clip": 0.06271016, + "balance_loss_mlp": 0.01259239, + "epoch": 0.8340297610100706, + "flos": 20674719296640.0, + "grad_norm": 1.8163865761424458, + "language_loss": 0.69741249, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.77408463, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0927124, + "step": 13872, + "time_per_iteration": 2.491567850112915 + }, + { + "auxiliary_loss_clip": 0.06401074, + "auxiliary_loss_mlp": 0.01264411, + "balance_loss_clip": 0.06269651, + "balance_loss_mlp": 0.01255393, + "epoch": 0.8340898842627387, + "flos": 20520620438400.0, + "grad_norm": 1.9144712990345532, + "language_loss": 0.73314548, + "learning_rate": 2.817740608055712e-07, + "loss": 0.80980027, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09014893, + "step": 13873, + "time_per_iteration": 2.5135762691497803 + }, + { + "auxiliary_loss_clip": 0.0640478, + "auxiliary_loss_mlp": 0.01264886, + "balance_loss_clip": 0.06268793, + "balance_loss_mlp": 0.01253406, + "epoch": 0.8341500075154066, + "flos": 21430889268480.0, + "grad_norm": 1.9289693759151987, + "language_loss": 0.75107884, + "learning_rate": 2.81574772350013e-07, + "loss": 0.82777548, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1149292, + "step": 13874, + "time_per_iteration": 3.931234836578369 + }, + { + "auxiliary_loss_clip": 0.06398948, + "auxiliary_loss_mlp": 0.01263903, + "balance_loss_clip": 0.06270257, + "balance_loss_mlp": 0.0125433, + "epoch": 0.8342101307680746, + "flos": 22097542752000.0, + "grad_norm": 1.82369189329911, + "language_loss": 0.66693133, + "learning_rate": 2.813755490573118e-07, + "loss": 0.74355984, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09570312, + "step": 13875, + "time_per_iteration": 2.5164341926574707 + }, + { + "auxiliary_loss_clip": 0.06399906, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.06269918, + "balance_loss_mlp": 0.01256301, + "epoch": 0.8342702540207425, + "flos": 21877882473600.0, + "grad_norm": 1.7413436247771745, + "language_loss": 0.80487454, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.88152719, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09051514, + "step": 13876, + "time_per_iteration": 2.5353636741638184 + }, + { + "auxiliary_loss_clip": 0.06400505, + "auxiliary_loss_mlp": 0.01261691, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01251999, + "epoch": 0.8343303772734105, + "flos": 22535060446080.0, + "grad_norm": 1.96733671294141, + "language_loss": 0.87216544, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.94878739, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09686279, + "step": 13877, + "time_per_iteration": 2.505556583404541 + }, + { + "auxiliary_loss_clip": 0.0640115, + "auxiliary_loss_mlp": 0.01262747, + "balance_loss_clip": 0.06269793, + "balance_loss_mlp": 0.01253902, + "epoch": 0.8343905005260784, + "flos": 14945131359360.0, + "grad_norm": 1.8494974533553123, + "language_loss": 0.69351619, + "learning_rate": 2.807782702318828e-07, + "loss": 0.77015519, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08843994, + "step": 13878, + "time_per_iteration": 2.5137927532196045 + }, + { + "auxiliary_loss_clip": 0.0640167, + "auxiliary_loss_mlp": 0.01265368, + "balance_loss_clip": 0.06269883, + "balance_loss_mlp": 0.01255897, + "epoch": 0.8344506237787465, + "flos": 15017778449280.0, + "grad_norm": 1.9676517124492925, + "language_loss": 0.79576242, + "learning_rate": 2.805793076661309e-07, + "loss": 0.87243277, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0947876, + "step": 13879, + "time_per_iteration": 2.474787950515747 + }, + { + "auxiliary_loss_clip": 0.06397855, + "auxiliary_loss_mlp": 0.01264416, + "balance_loss_clip": 0.06268258, + "balance_loss_mlp": 0.0125609, + "epoch": 0.8345107470314144, + "flos": 17565122424960.0, + "grad_norm": 1.9102558295245906, + "language_loss": 0.83550584, + "learning_rate": 2.803804103009828e-07, + "loss": 0.91212851, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08331299, + "step": 13880, + "time_per_iteration": 2.5329551696777344 + }, + { + "auxiliary_loss_clip": 0.06401896, + "auxiliary_loss_mlp": 0.01263382, + "balance_loss_clip": 0.0626949, + "balance_loss_mlp": 0.0125366, + "epoch": 0.8345708702840824, + "flos": 25193513335680.0, + "grad_norm": 1.6329117748195123, + "language_loss": 0.78477925, + "learning_rate": 2.80181578143982e-07, + "loss": 0.86143202, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09716797, + "step": 13881, + "time_per_iteration": 2.5319807529449463 + }, + { + "auxiliary_loss_clip": 0.06393664, + "auxiliary_loss_mlp": 0.01268201, + "balance_loss_clip": 0.06268856, + "balance_loss_mlp": 0.01260118, + "epoch": 0.8346309935367503, + "flos": 15088580749440.0, + "grad_norm": 2.3152636189856306, + "language_loss": 0.79627961, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.87289822, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08078003, + "step": 13882, + "time_per_iteration": 2.4848222732543945 + }, + { + "auxiliary_loss_clip": 0.06398013, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06268071, + "balance_loss_mlp": 0.01257041, + "epoch": 0.8346911167894183, + "flos": 22937386625280.0, + "grad_norm": 1.9057723326308558, + "language_loss": 0.81047702, + "learning_rate": 2.79784109484579e-07, + "loss": 0.88712454, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09692383, + "step": 13883, + "time_per_iteration": 2.50827956199646 + }, + { + "auxiliary_loss_clip": 0.06402916, + "auxiliary_loss_mlp": 0.01265732, + "balance_loss_clip": 0.0626992, + "balance_loss_mlp": 0.0125632, + "epoch": 0.8347512400420862, + "flos": 20199159048960.0, + "grad_norm": 2.2082056544036637, + "language_loss": 0.74074388, + "learning_rate": 2.795854729972482e-07, + "loss": 0.81743038, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09417725, + "step": 13884, + "time_per_iteration": 2.507692813873291 + }, + { + "auxiliary_loss_clip": 0.06410012, + "auxiliary_loss_mlp": 0.01263974, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01253382, + "epoch": 0.8348113632947542, + "flos": 25961422878720.0, + "grad_norm": 2.212491110586892, + "language_loss": 0.70608038, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.78282022, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.10595703, + "step": 13885, + "time_per_iteration": 3.943305492401123 + }, + { + "auxiliary_loss_clip": 0.06403215, + "auxiliary_loss_mlp": 0.01261876, + "balance_loss_clip": 0.06270201, + "balance_loss_mlp": 0.01252345, + "epoch": 0.8348714865474223, + "flos": 34213183839360.0, + "grad_norm": 1.4992796639632324, + "language_loss": 0.69971478, + "learning_rate": 2.791883957449912e-07, + "loss": 0.7763657, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09533691, + "step": 13886, + "time_per_iteration": 2.606997013092041 + }, + { + "auxiliary_loss_clip": 0.06399034, + "auxiliary_loss_mlp": 0.01263136, + "balance_loss_clip": 0.06269737, + "balance_loss_mlp": 0.01253892, + "epoch": 0.8349316098000902, + "flos": 24397162531200.0, + "grad_norm": 3.511326627037885, + "language_loss": 0.79448175, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.87110341, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0925293, + "step": 13887, + "time_per_iteration": 2.534818172454834 + }, + { + "auxiliary_loss_clip": 0.06410402, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8349917330527582, + "flos": 23038307268480.0, + "grad_norm": 2.205099646078452, + "language_loss": 0.63997847, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.71674013, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10467529, + "step": 13888, + "time_per_iteration": 2.494823455810547 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01264348, + "balance_loss_clip": 0.06267237, + "balance_loss_mlp": 0.01254453, + "epoch": 0.8350518563054261, + "flos": 13630943122560.0, + "grad_norm": 2.167942528379587, + "language_loss": 0.66939718, + "learning_rate": 2.785932692855244e-07, + "loss": 0.74607974, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09893799, + "step": 13889, + "time_per_iteration": 4.012948513031006 + }, + { + "auxiliary_loss_clip": 0.06399906, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06270322, + "balance_loss_mlp": 0.01255284, + "epoch": 0.8351119795580941, + "flos": 21586204010880.0, + "grad_norm": 1.8598402777124405, + "language_loss": 0.69043732, + "learning_rate": 2.783950243408399e-07, + "loss": 0.76707888, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08978271, + "step": 13890, + "time_per_iteration": 2.538703203201294 + }, + { + "auxiliary_loss_clip": 0.06405049, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06271869, + "balance_loss_mlp": 0.01257546, + "epoch": 0.835172102810762, + "flos": 20042921911680.0, + "grad_norm": 2.381877208795948, + "language_loss": 0.59337091, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.67009991, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10284424, + "step": 13891, + "time_per_iteration": 2.476916551589966 + }, + { + "auxiliary_loss_clip": 0.06403613, + "auxiliary_loss_mlp": 0.01261397, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01252373, + "epoch": 0.8352322260634301, + "flos": 25117344374400.0, + "grad_norm": 1.5227027869920424, + "language_loss": 0.72106713, + "learning_rate": 2.779987303092846e-07, + "loss": 0.79771721, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09020996, + "step": 13892, + "time_per_iteration": 3.963770627975464 + }, + { + "auxiliary_loss_clip": 0.06396841, + "auxiliary_loss_mlp": 0.01265593, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.01256158, + "epoch": 0.835292349316098, + "flos": 24870752208000.0, + "grad_norm": 1.5147233284160702, + "language_loss": 0.65907598, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.73570037, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09436035, + "step": 13893, + "time_per_iteration": 2.5296645164489746 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01262319, + "balance_loss_clip": 0.06268641, + "balance_loss_mlp": 0.01253385, + "epoch": 0.835352472568766, + "flos": 19871785946880.0, + "grad_norm": 2.485102746656012, + "language_loss": 0.78644305, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.86307693, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.0892334, + "step": 13894, + "time_per_iteration": 2.5228044986724854 + }, + { + "auxiliary_loss_clip": 0.06396501, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06271871, + "balance_loss_mlp": 0.01255949, + "epoch": 0.8354125958214339, + "flos": 22061344988160.0, + "grad_norm": 1.604151565001046, + "language_loss": 0.72635913, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.80297017, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08654785, + "step": 13895, + "time_per_iteration": 2.5038208961486816 + }, + { + "auxiliary_loss_clip": 0.06404788, + "auxiliary_loss_mlp": 0.01268276, + "balance_loss_clip": 0.06268989, + "balance_loss_mlp": 0.01257661, + "epoch": 0.8354727190741019, + "flos": 21404250869760.0, + "grad_norm": 1.9382861122392194, + "language_loss": 0.7216146, + "learning_rate": 2.772069258877667e-07, + "loss": 0.79834521, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10601807, + "step": 13896, + "time_per_iteration": 2.5257046222686768 + }, + { + "auxiliary_loss_clip": 0.06398962, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06270335, + "balance_loss_mlp": 0.01259211, + "epoch": 0.8355328423267698, + "flos": 50852230940160.0, + "grad_norm": 2.03682748324138, + "language_loss": 0.58762497, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6642977, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09106445, + "step": 13897, + "time_per_iteration": 2.7454147338867188 + }, + { + "auxiliary_loss_clip": 0.06307182, + "auxiliary_loss_mlp": 0.01250088, + "balance_loss_clip": 0.06252273, + "balance_loss_mlp": 0.0124903, + "epoch": 0.8355929655794379, + "flos": 65571901361280.0, + "grad_norm": 0.7124810299660945, + "language_loss": 0.57679689, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.65236962, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.0105896, + "step": 13898, + "time_per_iteration": 3.2076830863952637 + }, + { + "auxiliary_loss_clip": 0.06407744, + "auxiliary_loss_mlp": 0.01269697, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01258861, + "epoch": 0.8356530888321058, + "flos": 19176313858560.0, + "grad_norm": 1.7011409569690659, + "language_loss": 0.80137974, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.87815416, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10839844, + "step": 13899, + "time_per_iteration": 2.5566153526306152 + }, + { + "auxiliary_loss_clip": 0.06400511, + "auxiliary_loss_mlp": 0.01265988, + "balance_loss_clip": 0.06267878, + "balance_loss_mlp": 0.01256898, + "epoch": 0.8357132120847738, + "flos": 44136624481920.0, + "grad_norm": 1.5711758150102046, + "language_loss": 0.69132239, + "learning_rate": 2.764161667219749e-07, + "loss": 0.76798737, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09088135, + "step": 13900, + "time_per_iteration": 2.7178146839141846 + }, + { + "auxiliary_loss_clip": 0.06403154, + "auxiliary_loss_mlp": 0.01267795, + "balance_loss_clip": 0.06271988, + "balance_loss_mlp": 0.01258658, + "epoch": 0.8357733353374418, + "flos": 24396659406720.0, + "grad_norm": 1.5477695677500147, + "language_loss": 0.71333092, + "learning_rate": 2.762186403079716e-07, + "loss": 0.79004037, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09136963, + "step": 13901, + "time_per_iteration": 2.5601279735565186 + }, + { + "auxiliary_loss_clip": 0.06405643, + "auxiliary_loss_mlp": 0.01266448, + "balance_loss_clip": 0.06268835, + "balance_loss_mlp": 0.01256607, + "epoch": 0.8358334585901097, + "flos": 20921479171200.0, + "grad_norm": 2.4248634759308247, + "language_loss": 0.803698, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.8804189, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09844971, + "step": 13902, + "time_per_iteration": 2.4966886043548584 + }, + { + "auxiliary_loss_clip": 0.06397945, + "auxiliary_loss_mlp": 0.01264374, + "balance_loss_clip": 0.06269498, + "balance_loss_mlp": 0.01254849, + "epoch": 0.8358935818427777, + "flos": 19250092978560.0, + "grad_norm": 1.5337059639078017, + "language_loss": 0.62622327, + "learning_rate": 2.758237835853379e-07, + "loss": 0.70284647, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09527588, + "step": 13903, + "time_per_iteration": 2.487577438354492 + }, + { + "auxiliary_loss_clip": 0.06401621, + "auxiliary_loss_mlp": 0.01268796, + "balance_loss_clip": 0.06271307, + "balance_loss_mlp": 0.01259838, + "epoch": 0.8359537050954456, + "flos": 24140927145600.0, + "grad_norm": 1.6577519293367657, + "language_loss": 0.74130571, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.81800985, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08953857, + "step": 13904, + "time_per_iteration": 2.523071050643921 + }, + { + "auxiliary_loss_clip": 0.06397306, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.0626936, + "balance_loss_mlp": 0.01253226, + "epoch": 0.8360138283481137, + "flos": 16186001673600.0, + "grad_norm": 1.6562816533457836, + "language_loss": 0.72656274, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.80315626, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 13905, + "time_per_iteration": 2.4608371257781982 + }, + { + "auxiliary_loss_clip": 0.06397828, + "auxiliary_loss_mlp": 0.01261404, + "balance_loss_clip": 0.06270939, + "balance_loss_mlp": 0.01253095, + "epoch": 0.8360739516007816, + "flos": 22205213648640.0, + "grad_norm": 1.4212033615941317, + "language_loss": 0.66774136, + "learning_rate": 2.752319888771e-07, + "loss": 0.74433374, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08300781, + "step": 13906, + "time_per_iteration": 2.521660089492798 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.01264056, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01254627, + "epoch": 0.8361340748534496, + "flos": 20929445308800.0, + "grad_norm": 1.3169375476629854, + "language_loss": 0.74235398, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.81900668, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09429932, + "step": 13907, + "time_per_iteration": 2.5083837509155273 + }, + { + "auxiliary_loss_clip": 0.06404978, + "auxiliary_loss_mlp": 0.01264938, + "balance_loss_clip": 0.06269656, + "balance_loss_mlp": 0.01254698, + "epoch": 0.8361941981061175, + "flos": 26180202689280.0, + "grad_norm": 1.7253020139754567, + "language_loss": 0.75386989, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.83056903, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10241699, + "step": 13908, + "time_per_iteration": 2.5349066257476807 + }, + { + "auxiliary_loss_clip": 0.06400359, + "auxiliary_loss_mlp": 0.012632, + "balance_loss_clip": 0.0626875, + "balance_loss_mlp": 0.01253348, + "epoch": 0.8362543213587855, + "flos": 24425184522240.0, + "grad_norm": 2.1793256705024415, + "language_loss": 0.71560019, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.79223579, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09851074, + "step": 13909, + "time_per_iteration": 2.528615951538086 + }, + { + "auxiliary_loss_clip": 0.06405953, + "auxiliary_loss_mlp": 0.01261262, + "balance_loss_clip": 0.06269771, + "balance_loss_mlp": 0.01251177, + "epoch": 0.8363144446114534, + "flos": 17208217958400.0, + "grad_norm": 1.756161355340015, + "language_loss": 0.7331903, + "learning_rate": 2.744438449482338e-07, + "loss": 0.80986243, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10083008, + "step": 13910, + "time_per_iteration": 2.47664213180542 + }, + { + "auxiliary_loss_clip": 0.06405869, + "auxiliary_loss_mlp": 0.01264589, + "balance_loss_clip": 0.062729, + "balance_loss_mlp": 0.01255601, + "epoch": 0.8363745678641215, + "flos": 19285116785280.0, + "grad_norm": 1.9545322554977718, + "language_loss": 0.7355817, + "learning_rate": 2.742469725305001e-07, + "loss": 0.81228626, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.08978271, + "step": 13911, + "time_per_iteration": 2.48702073097229 + }, + { + "auxiliary_loss_clip": 0.06402719, + "auxiliary_loss_mlp": 0.01265291, + "balance_loss_clip": 0.06269197, + "balance_loss_mlp": 0.01255719, + "epoch": 0.8364346911167894, + "flos": 11879698389120.0, + "grad_norm": 1.8881216376034646, + "language_loss": 0.78823018, + "learning_rate": 2.740501655534946e-07, + "loss": 0.86491024, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09564209, + "step": 13912, + "time_per_iteration": 2.4519803524017334 + }, + { + "auxiliary_loss_clip": 0.06396623, + "auxiliary_loss_mlp": 0.01263862, + "balance_loss_clip": 0.06267013, + "balance_loss_mlp": 0.01254766, + "epoch": 0.8364948143694574, + "flos": 20230619057280.0, + "grad_norm": 2.102603712064964, + "language_loss": 0.78802848, + "learning_rate": 2.738534240246797e-07, + "loss": 0.86463332, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09106445, + "step": 13913, + "time_per_iteration": 2.5273799896240234 + }, + { + "auxiliary_loss_clip": 0.06401996, + "auxiliary_loss_mlp": 0.01269109, + "balance_loss_clip": 0.06268221, + "balance_loss_mlp": 0.01258946, + "epoch": 0.8365549376221254, + "flos": 21618754122240.0, + "grad_norm": 2.0629823157258955, + "language_loss": 0.73688346, + "learning_rate": 2.736567479515153e-07, + "loss": 0.81359446, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10162354, + "step": 13914, + "time_per_iteration": 3.949573278427124 + }, + { + "auxiliary_loss_clip": 0.06403138, + "auxiliary_loss_mlp": 0.01263701, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8366150608747933, + "flos": 23300831710080.0, + "grad_norm": 1.6012769275209868, + "language_loss": 0.71500385, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.79167223, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0994873, + "step": 13915, + "time_per_iteration": 2.5438222885131836 + }, + { + "auxiliary_loss_clip": 0.06404576, + "auxiliary_loss_mlp": 0.01266095, + "balance_loss_clip": 0.06271189, + "balance_loss_mlp": 0.01256847, + "epoch": 0.8366751841274613, + "flos": 15273007585920.0, + "grad_norm": 1.8381191598065743, + "language_loss": 0.72815526, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.80486196, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09249878, + "step": 13916, + "time_per_iteration": 2.4785351753234863 + }, + { + "auxiliary_loss_clip": 0.06402785, + "auxiliary_loss_mlp": 0.01263006, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.01253523, + "epoch": 0.8367353073801292, + "flos": 13230000535680.0, + "grad_norm": 1.8555718594178066, + "language_loss": 0.74952316, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.82618105, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09490967, + "step": 13917, + "time_per_iteration": 2.493027687072754 + }, + { + "auxiliary_loss_clip": 0.0639746, + "auxiliary_loss_mlp": 0.01267651, + "balance_loss_clip": 0.06271464, + "balance_loss_mlp": 0.01258747, + "epoch": 0.8367954306327973, + "flos": 24211645591680.0, + "grad_norm": 1.4849909859984562, + "language_loss": 0.79520977, + "learning_rate": 2.728706983644933e-07, + "loss": 0.87186092, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08911133, + "step": 13918, + "time_per_iteration": 2.52976131439209 + }, + { + "auxiliary_loss_clip": 0.06398945, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 0.06267535, + "balance_loss_mlp": 0.01253478, + "epoch": 0.8368555538854652, + "flos": 24541576243200.0, + "grad_norm": 1.6786160238572738, + "language_loss": 0.68168354, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.75830042, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0927124, + "step": 13919, + "time_per_iteration": 2.599942684173584 + }, + { + "auxiliary_loss_clip": 0.06399108, + "auxiliary_loss_mlp": 0.01264149, + "balance_loss_clip": 0.06269257, + "balance_loss_mlp": 0.01255375, + "epoch": 0.8369156771381332, + "flos": 20264385052800.0, + "grad_norm": 1.5840065764282198, + "language_loss": 0.74044919, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.81708181, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08764648, + "step": 13920, + "time_per_iteration": 2.4778757095336914 + }, + { + "auxiliary_loss_clip": 0.06400211, + "auxiliary_loss_mlp": 0.01265161, + "balance_loss_clip": 0.0626861, + "balance_loss_mlp": 0.01255589, + "epoch": 0.8369758003908011, + "flos": 21842062053120.0, + "grad_norm": 1.6917442964392928, + "language_loss": 0.68960786, + "learning_rate": 2.722818488237566e-07, + "loss": 0.76626152, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09576416, + "step": 13921, + "time_per_iteration": 2.536294460296631 + }, + { + "auxiliary_loss_clip": 0.06407334, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 0.06271015, + "balance_loss_mlp": 0.01260887, + "epoch": 0.8370359236434691, + "flos": 21724664083200.0, + "grad_norm": 1.9282887707714638, + "language_loss": 0.85106057, + "learning_rate": 2.720856966640801e-07, + "loss": 0.92783767, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09484863, + "step": 13922, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.06399621, + "auxiliary_loss_mlp": 0.01266109, + "balance_loss_clip": 0.0627037, + "balance_loss_mlp": 0.01256077, + "epoch": 0.837096046896137, + "flos": 23155579457280.0, + "grad_norm": 1.562676302335632, + "language_loss": 0.71699303, + "learning_rate": 2.71889610027088e-07, + "loss": 0.79365033, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.10028076, + "step": 13923, + "time_per_iteration": 2.524562358856201 + }, + { + "auxiliary_loss_clip": 0.06398217, + "auxiliary_loss_mlp": 0.01267054, + "balance_loss_clip": 0.06270584, + "balance_loss_mlp": 0.01257535, + "epoch": 0.8371561701488051, + "flos": 24498795934080.0, + "grad_norm": 1.7133401299934226, + "language_loss": 0.76249665, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.83914936, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09515381, + "step": 13924, + "time_per_iteration": 2.5283994674682617 + }, + { + "auxiliary_loss_clip": 0.06401788, + "auxiliary_loss_mlp": 0.01267733, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.01258297, + "epoch": 0.837216293401473, + "flos": 29214636848640.0, + "grad_norm": 1.4446644492995726, + "language_loss": 0.64699805, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.72369325, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09436035, + "step": 13925, + "time_per_iteration": 4.010638236999512 + }, + { + "auxiliary_loss_clip": 0.06404626, + "auxiliary_loss_mlp": 0.0126583, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.0125534, + "epoch": 0.837276416654141, + "flos": 25272365627520.0, + "grad_norm": 1.5030494095367346, + "language_loss": 0.74794483, + "learning_rate": 2.713017433265543e-07, + "loss": 0.82464945, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1048584, + "step": 13926, + "time_per_iteration": 2.5488619804382324 + }, + { + "auxiliary_loss_clip": 0.0640581, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.0627261, + "balance_loss_mlp": 0.01255258, + "epoch": 0.837336539906809, + "flos": 13887262362240.0, + "grad_norm": 1.6939060787098483, + "language_loss": 0.71716177, + "learning_rate": 2.711059188546274e-07, + "loss": 0.79386938, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09692383, + "step": 13927, + "time_per_iteration": 2.4630274772644043 + }, + { + "auxiliary_loss_clip": 0.06308782, + "auxiliary_loss_mlp": 0.01252714, + "balance_loss_clip": 0.06254104, + "balance_loss_mlp": 0.01251694, + "epoch": 0.8373966631594769, + "flos": 68891892635520.0, + "grad_norm": 0.6934409858082019, + "language_loss": 0.58671498, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.66233003, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0102005, + "step": 13928, + "time_per_iteration": 4.668534994125366 + }, + { + "auxiliary_loss_clip": 0.06404306, + "auxiliary_loss_mlp": 0.01266431, + "balance_loss_clip": 0.06273213, + "balance_loss_mlp": 0.01256292, + "epoch": 0.8374567864121449, + "flos": 20455226726400.0, + "grad_norm": 1.60382683016984, + "language_loss": 0.70053691, + "learning_rate": 2.707144665977068e-07, + "loss": 0.77724433, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10144043, + "step": 13929, + "time_per_iteration": 2.522420644760132 + }, + { + "auxiliary_loss_clip": 0.06407779, + "auxiliary_loss_mlp": 0.01267395, + "balance_loss_clip": 0.06272074, + "balance_loss_mlp": 0.01256827, + "epoch": 0.8375169096648128, + "flos": 41914305694080.0, + "grad_norm": 1.4727423205493513, + "language_loss": 0.67365968, + "learning_rate": 2.705188388275574e-07, + "loss": 0.75041145, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10571289, + "step": 13930, + "time_per_iteration": 2.692265748977661 + }, + { + "auxiliary_loss_clip": 0.06397972, + "auxiliary_loss_mlp": 0.01263394, + "balance_loss_clip": 0.06269804, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8375770329174809, + "flos": 20015235336960.0, + "grad_norm": 1.5773395160364472, + "language_loss": 0.71745491, + "learning_rate": 2.703232766395067e-07, + "loss": 0.79406852, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09008789, + "step": 13931, + "time_per_iteration": 3.92305064201355 + }, + { + "auxiliary_loss_clip": 0.06398615, + "auxiliary_loss_mlp": 0.01261005, + "balance_loss_clip": 0.06268796, + "balance_loss_mlp": 0.01251838, + "epoch": 0.8376371561701488, + "flos": 22790163801600.0, + "grad_norm": 1.728417843969976, + "language_loss": 0.71899205, + "learning_rate": 2.701277800409705e-07, + "loss": 0.79558825, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09173584, + "step": 13932, + "time_per_iteration": 2.5266075134277344 + }, + { + "auxiliary_loss_clip": 0.06401943, + "auxiliary_loss_mlp": 0.01265576, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8376972794228168, + "flos": 23921183013120.0, + "grad_norm": 1.9674813085768024, + "language_loss": 0.67152762, + "learning_rate": 2.699323490393628e-07, + "loss": 0.7482028, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09387207, + "step": 13933, + "time_per_iteration": 2.523416042327881 + }, + { + "auxiliary_loss_clip": 0.06398617, + "auxiliary_loss_mlp": 0.01266669, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01257067, + "epoch": 0.8377574026754847, + "flos": 13739704122240.0, + "grad_norm": 1.8869122869605348, + "language_loss": 0.7637918, + "learning_rate": 2.697369836420933e-07, + "loss": 0.84044468, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.0960083, + "step": 13934, + "time_per_iteration": 2.467869997024536 + }, + { + "auxiliary_loss_clip": 0.06402792, + "auxiliary_loss_mlp": 0.01265545, + "balance_loss_clip": 0.06273244, + "balance_loss_mlp": 0.0125645, + "epoch": 0.8378175259281527, + "flos": 21657509435520.0, + "grad_norm": 2.239259212098959, + "language_loss": 0.77590597, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.85258937, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09100342, + "step": 13935, + "time_per_iteration": 2.5480756759643555 + }, + { + "auxiliary_loss_clip": 0.06400282, + "auxiliary_loss_mlp": 0.01261735, + "balance_loss_clip": 0.06268634, + "balance_loss_mlp": 0.01252419, + "epoch": 0.8378776491808206, + "flos": 15453954478080.0, + "grad_norm": 3.5319600801449886, + "language_loss": 0.57043457, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.64705479, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09313965, + "step": 13936, + "time_per_iteration": 2.4494564533233643 + }, + { + "auxiliary_loss_clip": 0.06400599, + "auxiliary_loss_mlp": 0.01263383, + "balance_loss_clip": 0.06270145, + "balance_loss_mlp": 0.01254562, + "epoch": 0.8379377724334887, + "flos": 14725638789120.0, + "grad_norm": 1.878391680874537, + "language_loss": 0.89756596, + "learning_rate": 2.691512811503882e-07, + "loss": 0.97420573, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0881958, + "step": 13937, + "time_per_iteration": 2.4906821250915527 + }, + { + "auxiliary_loss_clip": 0.06402005, + "auxiliary_loss_mlp": 0.01262073, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01252715, + "epoch": 0.8379978956861566, + "flos": 24542163221760.0, + "grad_norm": 1.6373242147454181, + "language_loss": 0.81663287, + "learning_rate": 2.689561782445313e-07, + "loss": 0.89327371, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09362793, + "step": 13938, + "time_per_iteration": 2.6027586460113525 + }, + { + "auxiliary_loss_clip": 0.06405147, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01254711, + "epoch": 0.8380580189388246, + "flos": 18958540296960.0, + "grad_norm": 1.6258157555571138, + "language_loss": 0.70874858, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.78544629, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09918213, + "step": 13939, + "time_per_iteration": 2.529512882232666 + }, + { + "auxiliary_loss_clip": 0.06405655, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 0.06271434, + "balance_loss_mlp": 0.01260071, + "epoch": 0.8381181421914926, + "flos": 26547253499520.0, + "grad_norm": 1.6173394319792127, + "language_loss": 0.76280761, + "learning_rate": 2.6856616936428e-07, + "loss": 0.83956242, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09759521, + "step": 13940, + "time_per_iteration": 2.539008378982544 + }, + { + "auxiliary_loss_clip": 0.06398639, + "auxiliary_loss_mlp": 0.01263497, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01253984, + "epoch": 0.8381782654441605, + "flos": 23297645255040.0, + "grad_norm": 1.571823062249585, + "language_loss": 0.76635635, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.84297776, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0949707, + "step": 13941, + "time_per_iteration": 2.498035192489624 + }, + { + "auxiliary_loss_clip": 0.06407124, + "auxiliary_loss_mlp": 0.01264368, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01254593, + "epoch": 0.8382383886968285, + "flos": 26765739820800.0, + "grad_norm": 1.8960561722214873, + "language_loss": 0.73615742, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.81287235, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09777832, + "step": 13942, + "time_per_iteration": 2.534268856048584 + }, + { + "auxiliary_loss_clip": 0.06414034, + "auxiliary_loss_mlp": 0.01264269, + "balance_loss_clip": 0.06272998, + "balance_loss_mlp": 0.0125385, + "epoch": 0.8382985119494964, + "flos": 26111790230400.0, + "grad_norm": 1.4953994641731532, + "language_loss": 0.79530114, + "learning_rate": 2.679816484834554e-07, + "loss": 0.87208414, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10424805, + "step": 13943, + "time_per_iteration": 2.548069715499878 + }, + { + "auxiliary_loss_clip": 0.06402889, + "auxiliary_loss_mlp": 0.01262959, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01253482, + "epoch": 0.8383586352021645, + "flos": 16440643831680.0, + "grad_norm": 1.9362990480164113, + "language_loss": 0.85566223, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.93232077, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0947876, + "step": 13944, + "time_per_iteration": 2.4838459491729736 + }, + { + "auxiliary_loss_clip": 0.0631334, + "auxiliary_loss_mlp": 0.01250973, + "balance_loss_clip": 0.06258479, + "balance_loss_mlp": 0.01249939, + "epoch": 0.8384187584548324, + "flos": 64215226304640.0, + "grad_norm": 0.6035369639047439, + "language_loss": 0.50281239, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.57845557, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01034546, + "step": 13945, + "time_per_iteration": 3.2410154342651367 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01263596, + "balance_loss_clip": 0.06270773, + "balance_loss_mlp": 0.01254, + "epoch": 0.8384788817075004, + "flos": 22389514704000.0, + "grad_norm": 1.6641583948208805, + "language_loss": 0.65164709, + "learning_rate": 2.673977187074017e-07, + "loss": 0.72827202, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0960083, + "step": 13946, + "time_per_iteration": 2.6071982383728027 + }, + { + "auxiliary_loss_clip": 0.06405137, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06271759, + "balance_loss_mlp": 0.01254483, + "epoch": 0.8385390049601683, + "flos": 29504512448640.0, + "grad_norm": 1.5353623663640485, + "language_loss": 0.67792797, + "learning_rate": 2.672032068397829e-07, + "loss": 0.75462139, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09729004, + "step": 13947, + "time_per_iteration": 2.63541579246521 + }, + { + "auxiliary_loss_clip": 0.06404837, + "auxiliary_loss_mlp": 0.0126567, + "balance_loss_clip": 0.06270772, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8385991282128363, + "flos": 32716036212480.0, + "grad_norm": 1.4309661771954254, + "language_loss": 0.6985665, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.77527153, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10205078, + "step": 13948, + "time_per_iteration": 2.6386852264404297 + }, + { + "auxiliary_loss_clip": 0.06396742, + "auxiliary_loss_mlp": 0.01262841, + "balance_loss_clip": 0.06268944, + "balance_loss_mlp": 0.01254753, + "epoch": 0.8386592514655042, + "flos": 25447023463680.0, + "grad_norm": 1.6745888315245265, + "language_loss": 0.84810793, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.92470378, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08087158, + "step": 13949, + "time_per_iteration": 2.5083541870117188 + }, + { + "auxiliary_loss_clip": 0.0639628, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06268419, + "balance_loss_mlp": 0.01257713, + "epoch": 0.8387193747181723, + "flos": 22022086550400.0, + "grad_norm": 1.712891634847403, + "language_loss": 0.71039176, + "learning_rate": 2.66620065513385e-07, + "loss": 0.78702009, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08850098, + "step": 13950, + "time_per_iteration": 2.51889967918396 + }, + { + "auxiliary_loss_clip": 0.06399944, + "auxiliary_loss_mlp": 0.01267019, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01257375, + "epoch": 0.8387794979708402, + "flos": 18156068144640.0, + "grad_norm": 1.579038787139869, + "language_loss": 0.64784032, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.72450995, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09643555, + "step": 13951, + "time_per_iteration": 2.4674899578094482 + }, + { + "auxiliary_loss_clip": 0.06404419, + "auxiliary_loss_mlp": 0.0126323, + "balance_loss_clip": 0.06272285, + "balance_loss_mlp": 0.01253396, + "epoch": 0.8388396212235082, + "flos": 25418330640000.0, + "grad_norm": 1.4418962880777595, + "language_loss": 0.70313966, + "learning_rate": 2.662316332665393e-07, + "loss": 0.77981615, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09832764, + "step": 13952, + "time_per_iteration": 2.554793119430542 + }, + { + "auxiliary_loss_clip": 0.06395441, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.06266855, + "balance_loss_mlp": 0.01255159, + "epoch": 0.8388997444761762, + "flos": 22279579747200.0, + "grad_norm": 1.8744107681123892, + "language_loss": 0.73154211, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.80813444, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08624268, + "step": 13953, + "time_per_iteration": 3.9177489280700684 + }, + { + "auxiliary_loss_clip": 0.06400088, + "auxiliary_loss_mlp": 0.01263583, + "balance_loss_clip": 0.06270742, + "balance_loss_mlp": 0.01254296, + "epoch": 0.8389598677288441, + "flos": 19579310870400.0, + "grad_norm": 2.072771641574644, + "language_loss": 0.67898321, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.75561988, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09289551, + "step": 13954, + "time_per_iteration": 2.489935874938965 + }, + { + "auxiliary_loss_clip": 0.06399843, + "auxiliary_loss_mlp": 0.01265295, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01255996, + "epoch": 0.8390199909815121, + "flos": 17390548442880.0, + "grad_norm": 2.000257217636036, + "language_loss": 0.74052519, + "learning_rate": 2.656494779996932e-07, + "loss": 0.81717652, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09301758, + "step": 13955, + "time_per_iteration": 2.5330069065093994 + }, + { + "auxiliary_loss_clip": 0.06402773, + "auxiliary_loss_mlp": 0.01265265, + "balance_loss_clip": 0.06271341, + "balance_loss_mlp": 0.01256152, + "epoch": 0.83908011423418, + "flos": 24645725268480.0, + "grad_norm": 2.260030049088017, + "language_loss": 0.65815377, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.73483419, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09100342, + "step": 13956, + "time_per_iteration": 2.5096991062164307 + }, + { + "auxiliary_loss_clip": 0.06408071, + "auxiliary_loss_mlp": 0.01264206, + "balance_loss_clip": 0.06273458, + "balance_loss_mlp": 0.0125458, + "epoch": 0.8391402374868481, + "flos": 24725416101120.0, + "grad_norm": 1.7667751667109255, + "language_loss": 0.80019176, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.87691456, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 13957, + "time_per_iteration": 2.5260238647460938 + }, + { + "auxiliary_loss_clip": 0.06310308, + "auxiliary_loss_mlp": 0.01251038, + "balance_loss_clip": 0.06255397, + "balance_loss_mlp": 0.01249962, + "epoch": 0.839200360739516, + "flos": 56891804728320.0, + "grad_norm": 0.7372887676076516, + "language_loss": 0.53274184, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.60835534, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01077271, + "step": 13958, + "time_per_iteration": 3.2426984310150146 + }, + { + "auxiliary_loss_clip": 0.0640053, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06271528, + "balance_loss_mlp": 0.01256756, + "epoch": 0.839260483992184, + "flos": 18338692118400.0, + "grad_norm": 1.7639772408046024, + "language_loss": 0.73410964, + "learning_rate": 2.648741917459574e-07, + "loss": 0.81077951, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09698486, + "step": 13959, + "time_per_iteration": 2.5194149017333984 + }, + { + "auxiliary_loss_clip": 0.06397633, + "auxiliary_loss_mlp": 0.01265334, + "balance_loss_clip": 0.06271541, + "balance_loss_mlp": 0.01256209, + "epoch": 0.8393206072448519, + "flos": 27095041566720.0, + "grad_norm": 1.5424729354791942, + "language_loss": 0.56095922, + "learning_rate": 2.646805346545169e-07, + "loss": 0.63758886, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.09130859, + "step": 13960, + "time_per_iteration": 2.537529706954956 + }, + { + "auxiliary_loss_clip": 0.06315308, + "auxiliary_loss_mlp": 0.01251161, + "balance_loss_clip": 0.06260296, + "balance_loss_mlp": 0.01249989, + "epoch": 0.8393807304975199, + "flos": 61538619006720.0, + "grad_norm": 0.753849352995641, + "language_loss": 0.60770983, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.68337452, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01169586, + "step": 13961, + "time_per_iteration": 3.2075889110565186 + }, + { + "auxiliary_loss_clip": 0.06403187, + "auxiliary_loss_mlp": 0.01262351, + "balance_loss_clip": 0.06271735, + "balance_loss_mlp": 0.01252922, + "epoch": 0.8394408537501878, + "flos": 14898787251840.0, + "grad_norm": 2.1675856051835987, + "language_loss": 0.68842262, + "learning_rate": 2.642934178894405e-07, + "loss": 0.76507801, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09429932, + "step": 13962, + "time_per_iteration": 2.4669265747070312 + }, + { + "auxiliary_loss_clip": 0.0640391, + "auxiliary_loss_mlp": 0.01265749, + "balance_loss_clip": 0.06269991, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8395009770028559, + "flos": 17416516008960.0, + "grad_norm": 1.8287277787854637, + "language_loss": 0.73421824, + "learning_rate": 2.640999582304841e-07, + "loss": 0.81091487, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09552002, + "step": 13963, + "time_per_iteration": 2.506747245788574 + }, + { + "auxiliary_loss_clip": 0.0640099, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 0.06270087, + "balance_loss_mlp": 0.01257585, + "epoch": 0.8395611002555238, + "flos": 27931615130880.0, + "grad_norm": 2.0600599297786646, + "language_loss": 0.7623294, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.839001, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08587646, + "step": 13964, + "time_per_iteration": 3.9650909900665283 + }, + { + "auxiliary_loss_clip": 0.06405744, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 0.06269985, + "balance_loss_mlp": 0.01255589, + "epoch": 0.8396212235081918, + "flos": 11104325832960.0, + "grad_norm": 1.8673180285408302, + "language_loss": 0.78343093, + "learning_rate": 2.637132363964161e-07, + "loss": 0.86015296, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10864258, + "step": 13965, + "time_per_iteration": 2.4702308177948 + }, + { + "auxiliary_loss_clip": 0.06399923, + "auxiliary_loss_mlp": 0.01263836, + "balance_loss_clip": 0.0627108, + "balance_loss_mlp": 0.0125483, + "epoch": 0.8396813467608598, + "flos": 35744307096960.0, + "grad_norm": 1.4537191303723818, + "language_loss": 0.65587616, + "learning_rate": 2.635199742359684e-07, + "loss": 0.73251367, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09002686, + "step": 13966, + "time_per_iteration": 2.646740436553955 + }, + { + "auxiliary_loss_clip": 0.06399661, + "auxiliary_loss_mlp": 0.01262484, + "balance_loss_clip": 0.06269723, + "balance_loss_mlp": 0.01253705, + "epoch": 0.8397414700135277, + "flos": 26183850341760.0, + "grad_norm": 1.5561591084545034, + "language_loss": 0.74881774, + "learning_rate": 2.633267779230177e-07, + "loss": 0.82543921, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08782959, + "step": 13967, + "time_per_iteration": 2.5391502380371094 + }, + { + "auxiliary_loss_clip": 0.06401393, + "auxiliary_loss_mlp": 0.01262984, + "balance_loss_clip": 0.06270708, + "balance_loss_mlp": 0.01254157, + "epoch": 0.8398015932661957, + "flos": 18339069461760.0, + "grad_norm": 1.695171973316043, + "language_loss": 0.82986927, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.90651309, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08837891, + "step": 13968, + "time_per_iteration": 3.9714221954345703 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.01265348, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01255877, + "epoch": 0.8398617165188637, + "flos": 17384469022080.0, + "grad_norm": 2.314933377391938, + "language_loss": 0.77319568, + "learning_rate": 2.629405828689075e-07, + "loss": 0.84989589, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09472656, + "step": 13969, + "time_per_iteration": 2.4677093029022217 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01262152, + "balance_loss_clip": 0.06268304, + "balance_loss_mlp": 0.01252192, + "epoch": 0.8399218397715317, + "flos": 22936296522240.0, + "grad_norm": 1.9974672469929322, + "language_loss": 0.77134824, + "learning_rate": 2.627475841423923e-07, + "loss": 0.8480016, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09954834, + "step": 13970, + "time_per_iteration": 3.9030561447143555 + }, + { + "auxiliary_loss_clip": 0.06401689, + "auxiliary_loss_mlp": 0.01266818, + "balance_loss_clip": 0.06269555, + "balance_loss_mlp": 0.01257042, + "epoch": 0.8399819630241996, + "flos": 23156376071040.0, + "grad_norm": 2.053890179437049, + "language_loss": 0.72737366, + "learning_rate": 2.625546512926633e-07, + "loss": 0.80405873, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09771729, + "step": 13971, + "time_per_iteration": 2.514538049697876 + }, + { + "auxiliary_loss_clip": 0.06401571, + "auxiliary_loss_mlp": 0.01263608, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01254059, + "epoch": 0.8400420862768676, + "flos": 16402727059200.0, + "grad_norm": 2.070954045877117, + "language_loss": 0.77785814, + "learning_rate": 2.623617843270358e-07, + "loss": 0.85450995, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09545898, + "step": 13972, + "time_per_iteration": 2.4673666954040527 + }, + { + "auxiliary_loss_clip": 0.06399271, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.01254484, + "epoch": 0.8401022095295355, + "flos": 21293770861440.0, + "grad_norm": 1.3621569173910255, + "language_loss": 0.68392384, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.76055562, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09429932, + "step": 13973, + "time_per_iteration": 2.530261516571045 + }, + { + "auxiliary_loss_clip": 0.06399777, + "auxiliary_loss_mlp": 0.01266286, + "balance_loss_clip": 0.06268927, + "balance_loss_mlp": 0.01256875, + "epoch": 0.8401623327822035, + "flos": 17317062812160.0, + "grad_norm": 1.787125184070989, + "language_loss": 0.78559691, + "learning_rate": 2.619762480773382e-07, + "loss": 0.86225754, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09417725, + "step": 13974, + "time_per_iteration": 2.462040424346924 + }, + { + "auxiliary_loss_clip": 0.0640364, + "auxiliary_loss_mlp": 0.01264498, + "balance_loss_clip": 0.06270675, + "balance_loss_mlp": 0.01255211, + "epoch": 0.8402224560348714, + "flos": 22243214275200.0, + "grad_norm": 1.4562103354507534, + "language_loss": 0.72743988, + "learning_rate": 2.617835788078868e-07, + "loss": 0.80412126, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09289551, + "step": 13975, + "time_per_iteration": 2.555020332336426 + }, + { + "auxiliary_loss_clip": 0.06401096, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_clip": 0.0627125, + "balance_loss_mlp": 0.01256623, + "epoch": 0.8402825792875395, + "flos": 20236153426560.0, + "grad_norm": 1.6874682167845347, + "language_loss": 0.72985578, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.80653155, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09857178, + "step": 13976, + "time_per_iteration": 2.4919087886810303 + }, + { + "auxiliary_loss_clip": 0.06400332, + "auxiliary_loss_mlp": 0.01260889, + "balance_loss_clip": 0.06269455, + "balance_loss_mlp": 0.01252359, + "epoch": 0.8403427025402074, + "flos": 23295884319360.0, + "grad_norm": 1.6877264487051344, + "language_loss": 0.72409099, + "learning_rate": 2.61398438016311e-07, + "loss": 0.80070317, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08526611, + "step": 13977, + "time_per_iteration": 2.5217444896698 + }, + { + "auxiliary_loss_clip": 0.06405861, + "auxiliary_loss_mlp": 0.01264239, + "balance_loss_clip": 0.06272191, + "balance_loss_mlp": 0.01254548, + "epoch": 0.8404028257928754, + "flos": 32684534277120.0, + "grad_norm": 1.3954911875741427, + "language_loss": 0.68609047, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.76279151, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09686279, + "step": 13978, + "time_per_iteration": 2.6191842555999756 + }, + { + "auxiliary_loss_clip": 0.06397029, + "auxiliary_loss_mlp": 0.01262166, + "balance_loss_clip": 0.06270245, + "balance_loss_mlp": 0.01252808, + "epoch": 0.8404629490455434, + "flos": 16186127454720.0, + "grad_norm": 1.5374499175737208, + "language_loss": 0.78201067, + "learning_rate": 2.610135609365145e-07, + "loss": 0.85860264, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09350586, + "step": 13979, + "time_per_iteration": 2.4852335453033447 + }, + { + "auxiliary_loss_clip": 0.06403331, + "auxiliary_loss_mlp": 0.01265246, + "balance_loss_clip": 0.06270297, + "balance_loss_mlp": 0.01255214, + "epoch": 0.8405230722982113, + "flos": 15199731590400.0, + "grad_norm": 1.8725202434622394, + "language_loss": 0.78169626, + "learning_rate": 2.60821221306778e-07, + "loss": 0.85838211, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1003418, + "step": 13980, + "time_per_iteration": 2.4990322589874268 + }, + { + "auxiliary_loss_clip": 0.06397291, + "auxiliary_loss_mlp": 0.0126609, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.01256941, + "epoch": 0.8405831955508793, + "flos": 27818787208320.0, + "grad_norm": 1.5682421159240296, + "language_loss": 0.86943978, + "learning_rate": 2.606289476268757e-07, + "loss": 0.94607365, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09155273, + "step": 13981, + "time_per_iteration": 2.568634271621704 + }, + { + "auxiliary_loss_clip": 0.06401773, + "auxiliary_loss_mlp": 0.01267361, + "balance_loss_clip": 0.0627171, + "balance_loss_mlp": 0.01258027, + "epoch": 0.8406433188035473, + "flos": 23776308103680.0, + "grad_norm": 1.7497238195302791, + "language_loss": 0.67594308, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.75263447, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09332275, + "step": 13982, + "time_per_iteration": 2.507876396179199 + }, + { + "auxiliary_loss_clip": 0.06406415, + "auxiliary_loss_mlp": 0.01263144, + "balance_loss_clip": 0.0627246, + "balance_loss_mlp": 0.01252618, + "epoch": 0.8407034420562153, + "flos": 29213420964480.0, + "grad_norm": 1.5190356335780981, + "language_loss": 0.68256176, + "learning_rate": 2.602445981457324e-07, + "loss": 0.75925732, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10522461, + "step": 13983, + "time_per_iteration": 2.575272560119629 + }, + { + "auxiliary_loss_clip": 0.06401223, + "auxiliary_loss_mlp": 0.01262569, + "balance_loss_clip": 0.06268837, + "balance_loss_mlp": 0.01253116, + "epoch": 0.8407635653088832, + "flos": 26367396710400.0, + "grad_norm": 1.902440913607337, + "language_loss": 0.79216588, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.86880374, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09454346, + "step": 13984, + "time_per_iteration": 2.5355708599090576 + }, + { + "auxiliary_loss_clip": 0.06399589, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06269123, + "balance_loss_mlp": 0.01254478, + "epoch": 0.8408236885615512, + "flos": 21474927388800.0, + "grad_norm": 1.837857036965972, + "language_loss": 0.61041355, + "learning_rate": 2.598605125513842e-07, + "loss": 0.6870482, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09387207, + "step": 13985, + "time_per_iteration": 2.5293657779693604 + }, + { + "auxiliary_loss_clip": 0.06404386, + "auxiliary_loss_mlp": 0.0126397, + "balance_loss_clip": 0.06271429, + "balance_loss_mlp": 0.01254373, + "epoch": 0.8408838118142191, + "flos": 22969936736640.0, + "grad_norm": 1.803818187093242, + "language_loss": 0.82403111, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.90071464, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09588623, + "step": 13986, + "time_per_iteration": 2.5350451469421387 + }, + { + "auxiliary_loss_clip": 0.06401613, + "auxiliary_loss_mlp": 0.0126388, + "balance_loss_clip": 0.06271525, + "balance_loss_mlp": 0.01254892, + "epoch": 0.8409439350668871, + "flos": 26807765443200.0, + "grad_norm": 1.3955353905275312, + "language_loss": 0.66139162, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.73804653, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08984375, + "step": 13987, + "time_per_iteration": 2.6994168758392334 + }, + { + "auxiliary_loss_clip": 0.06401115, + "auxiliary_loss_mlp": 0.01265067, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01256043, + "epoch": 0.841004058319555, + "flos": 26585966885760.0, + "grad_norm": 1.7419411611465583, + "language_loss": 0.67379653, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.75045836, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09020996, + "step": 13988, + "time_per_iteration": 2.535848617553711 + }, + { + "auxiliary_loss_clip": 0.06402878, + "auxiliary_loss_mlp": 0.01266134, + "balance_loss_clip": 0.06269789, + "balance_loss_mlp": 0.01255912, + "epoch": 0.8410641815722231, + "flos": 14507152467840.0, + "grad_norm": 2.335548469753872, + "language_loss": 0.81167138, + "learning_rate": 2.590931332560622e-07, + "loss": 0.88836145, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10217285, + "step": 13989, + "time_per_iteration": 2.4869043827056885 + }, + { + "auxiliary_loss_clip": 0.06406767, + "auxiliary_loss_mlp": 0.01262411, + "balance_loss_clip": 0.06272566, + "balance_loss_mlp": 0.01253161, + "epoch": 0.841124304824891, + "flos": 29173994818560.0, + "grad_norm": 1.7072106379508765, + "language_loss": 0.75771666, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.8344084, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0925293, + "step": 13990, + "time_per_iteration": 2.5450754165649414 + }, + { + "auxiliary_loss_clip": 0.06394493, + "auxiliary_loss_mlp": 0.01266409, + "balance_loss_clip": 0.06268186, + "balance_loss_mlp": 0.01257552, + "epoch": 0.841184428077559, + "flos": 22417410913920.0, + "grad_norm": 1.5743700344824108, + "language_loss": 0.80771601, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.88432503, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08862305, + "step": 13991, + "time_per_iteration": 2.5198276042938232 + }, + { + "auxiliary_loss_clip": 0.06398806, + "auxiliary_loss_mlp": 0.0126106, + "balance_loss_clip": 0.06268385, + "balance_loss_mlp": 0.01252244, + "epoch": 0.841244551330227, + "flos": 22968846633600.0, + "grad_norm": 2.2103689173127767, + "language_loss": 0.70700645, + "learning_rate": 2.585182919204105e-07, + "loss": 0.78360516, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0881958, + "step": 13992, + "time_per_iteration": 4.00426983833313 + }, + { + "auxiliary_loss_clip": 0.06402652, + "auxiliary_loss_mlp": 0.01262158, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.012528, + "epoch": 0.8413046745828949, + "flos": 21039086776320.0, + "grad_norm": 1.5410913015371062, + "language_loss": 0.76244783, + "learning_rate": 2.583268102064959e-07, + "loss": 0.83909595, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09362793, + "step": 13993, + "time_per_iteration": 2.491050958633423 + }, + { + "auxiliary_loss_clip": 0.06408523, + "auxiliary_loss_mlp": 0.01266993, + "balance_loss_clip": 0.06269802, + "balance_loss_mlp": 0.01256377, + "epoch": 0.841364797835563, + "flos": 27059305000320.0, + "grad_norm": 2.1350785829086214, + "language_loss": 0.74388689, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.82064199, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1060791, + "step": 13994, + "time_per_iteration": 2.552985906600952 + }, + { + "auxiliary_loss_clip": 0.06396306, + "auxiliary_loss_mlp": 0.01264636, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.01256059, + "epoch": 0.8414249210882309, + "flos": 17901635621760.0, + "grad_norm": 1.413146140624494, + "language_loss": 0.5934546, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.67006397, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08569336, + "step": 13995, + "time_per_iteration": 2.4642326831817627 + }, + { + "auxiliary_loss_clip": 0.06402554, + "auxiliary_loss_mlp": 0.01262234, + "balance_loss_clip": 0.06271402, + "balance_loss_mlp": 0.0125262, + "epoch": 0.8414850443408989, + "flos": 25447233098880.0, + "grad_norm": 2.2726761612856206, + "language_loss": 0.72156918, + "learning_rate": 2.577527613603163e-07, + "loss": 0.79821706, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09619141, + "step": 13996, + "time_per_iteration": 2.5874221324920654 + }, + { + "auxiliary_loss_clip": 0.0640333, + "auxiliary_loss_mlp": 0.01267475, + "balance_loss_clip": 0.06272834, + "balance_loss_mlp": 0.01258988, + "epoch": 0.8415451675935668, + "flos": 23226465611520.0, + "grad_norm": 1.5819410580498, + "language_loss": 0.64570701, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.72241509, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0848999, + "step": 13997, + "time_per_iteration": 2.4954543113708496 + }, + { + "auxiliary_loss_clip": 0.06407194, + "auxiliary_loss_mlp": 0.01267232, + "balance_loss_clip": 0.062701, + "balance_loss_mlp": 0.01256992, + "epoch": 0.8416052908462348, + "flos": 18551560216320.0, + "grad_norm": 1.9485298310301038, + "language_loss": 0.82216007, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.89890432, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10241699, + "step": 13998, + "time_per_iteration": 2.496969699859619 + }, + { + "auxiliary_loss_clip": 0.06404015, + "auxiliary_loss_mlp": 0.01269356, + "balance_loss_clip": 0.06272015, + "balance_loss_mlp": 0.01258961, + "epoch": 0.8416654140989027, + "flos": 26112544917120.0, + "grad_norm": 1.4808581499635296, + "language_loss": 0.80342889, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.8801626, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10400391, + "step": 13999, + "time_per_iteration": 2.52217173576355 + }, + { + "auxiliary_loss_clip": 0.06409043, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06274166, + "balance_loss_mlp": 0.01255304, + "epoch": 0.8417255373515707, + "flos": 26440630778880.0, + "grad_norm": 4.858767918566699, + "language_loss": 0.66816556, + "learning_rate": 2.569882878592096e-07, + "loss": 0.74490786, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09881592, + "step": 14000, + "time_per_iteration": 2.5723514556884766 + }, + { + "auxiliary_loss_clip": 0.06403996, + "auxiliary_loss_mlp": 0.01267083, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01257093, + "epoch": 0.8417856606042387, + "flos": 24724703341440.0, + "grad_norm": 1.500004932940948, + "language_loss": 0.7974422, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.87415302, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09985352, + "step": 14001, + "time_per_iteration": 2.5384724140167236 + }, + { + "auxiliary_loss_clip": 0.06400739, + "auxiliary_loss_mlp": 0.01266336, + "balance_loss_clip": 0.06269417, + "balance_loss_mlp": 0.01256853, + "epoch": 0.8418457838569067, + "flos": 20857259416320.0, + "grad_norm": 1.7632333528169615, + "language_loss": 0.78508544, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.86175615, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0949707, + "step": 14002, + "time_per_iteration": 2.5468106269836426 + }, + { + "auxiliary_loss_clip": 0.06400124, + "auxiliary_loss_mlp": 0.01266, + "balance_loss_clip": 0.06271224, + "balance_loss_mlp": 0.01256398, + "epoch": 0.8419059071095746, + "flos": 28668651644160.0, + "grad_norm": 1.3302333296141904, + "language_loss": 0.78383386, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.86049509, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0960083, + "step": 14003, + "time_per_iteration": 2.572388172149658 + }, + { + "auxiliary_loss_clip": 0.06398443, + "auxiliary_loss_mlp": 0.01267978, + "balance_loss_clip": 0.06267287, + "balance_loss_mlp": 0.01258757, + "epoch": 0.8419660303622426, + "flos": 21660150839040.0, + "grad_norm": 1.98720953266761, + "language_loss": 0.65639722, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.73306143, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09222412, + "step": 14004, + "time_per_iteration": 3.96457576751709 + }, + { + "auxiliary_loss_clip": 0.06407335, + "auxiliary_loss_mlp": 0.01266305, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01255814, + "epoch": 0.8420261536149106, + "flos": 25308102193920.0, + "grad_norm": 2.270922911539394, + "language_loss": 0.76293629, + "learning_rate": 2.560341831785724e-07, + "loss": 0.83967268, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10498047, + "step": 14005, + "time_per_iteration": 2.5258288383483887 + }, + { + "auxiliary_loss_clip": 0.06406075, + "auxiliary_loss_mlp": 0.01265149, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01255535, + "epoch": 0.8420862768675785, + "flos": 18768159820800.0, + "grad_norm": 1.6456178296251338, + "language_loss": 0.78003979, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.8567521, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09606934, + "step": 14006, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.06400469, + "auxiliary_loss_mlp": 0.01262872, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01253353, + "epoch": 0.8421464001202466, + "flos": 18333157749120.0, + "grad_norm": 1.7948996432963087, + "language_loss": 0.77462882, + "learning_rate": 2.556530041751932e-07, + "loss": 0.85126221, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09515381, + "step": 14007, + "time_per_iteration": 3.9048590660095215 + }, + { + "auxiliary_loss_clip": 0.06404168, + "auxiliary_loss_mlp": 0.01261821, + "balance_loss_clip": 0.06270444, + "balance_loss_mlp": 0.01252267, + "epoch": 0.8422065233729145, + "flos": 31544710387200.0, + "grad_norm": 1.6673756075616437, + "language_loss": 0.66031694, + "learning_rate": 2.554625138886102e-07, + "loss": 0.7369768, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09552002, + "step": 14008, + "time_per_iteration": 2.5647101402282715 + }, + { + "auxiliary_loss_clip": 0.0630706, + "auxiliary_loss_mlp": 0.01249886, + "balance_loss_clip": 0.06252214, + "balance_loss_mlp": 0.01248812, + "epoch": 0.8422666466255825, + "flos": 64316691999360.0, + "grad_norm": 0.7086447716783576, + "language_loss": 0.56921613, + "learning_rate": 2.552720897550631e-07, + "loss": 0.64478564, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01074982, + "step": 14009, + "time_per_iteration": 3.2140049934387207 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01265049, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01256531, + "epoch": 0.8423267698782504, + "flos": 24323676900480.0, + "grad_norm": 1.225341490624907, + "language_loss": 0.7808187, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.85746264, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08520508, + "step": 14010, + "time_per_iteration": 3.96768856048584 + }, + { + "auxiliary_loss_clip": 0.06407313, + "auxiliary_loss_mlp": 0.01265279, + "balance_loss_clip": 0.06273588, + "balance_loss_mlp": 0.01254545, + "epoch": 0.8423868931309184, + "flos": 18301949303040.0, + "grad_norm": 1.7313909892121646, + "language_loss": 0.7269572, + "learning_rate": 2.548914399759592e-07, + "loss": 0.80368304, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10742188, + "step": 14011, + "time_per_iteration": 2.4659523963928223 + }, + { + "auxiliary_loss_clip": 0.06401571, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06270736, + "balance_loss_mlp": 0.01256114, + "epoch": 0.8424470163835863, + "flos": 23556983241600.0, + "grad_norm": 1.7946548405120046, + "language_loss": 0.84176588, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.91843653, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09381104, + "step": 14012, + "time_per_iteration": 2.5705301761627197 + }, + { + "auxiliary_loss_clip": 0.06391717, + "auxiliary_loss_mlp": 0.01266082, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01257529, + "epoch": 0.8425071396362543, + "flos": 23776350030720.0, + "grad_norm": 1.5491953146751778, + "language_loss": 0.67853385, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.75511181, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.08551025, + "step": 14013, + "time_per_iteration": 2.5138120651245117 + }, + { + "auxiliary_loss_clip": 0.0640588, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01254199, + "epoch": 0.8425672628889223, + "flos": 16184240737920.0, + "grad_norm": 2.40464734961883, + "language_loss": 0.78383315, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.86053419, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10028076, + "step": 14014, + "time_per_iteration": 2.499150276184082 + }, + { + "auxiliary_loss_clip": 0.06404585, + "auxiliary_loss_mlp": 0.01263908, + "balance_loss_clip": 0.06272553, + "balance_loss_mlp": 0.01254466, + "epoch": 0.8426273861415903, + "flos": 23155872946560.0, + "grad_norm": 1.6895801007055753, + "language_loss": 0.67373145, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.75041628, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09436035, + "step": 14015, + "time_per_iteration": 2.5022330284118652 + }, + { + "auxiliary_loss_clip": 0.06404251, + "auxiliary_loss_mlp": 0.01263685, + "balance_loss_clip": 0.06272057, + "balance_loss_mlp": 0.01254083, + "epoch": 0.8426875093942582, + "flos": 17463614803200.0, + "grad_norm": 3.5337410606590556, + "language_loss": 0.76054883, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.83722818, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0960083, + "step": 14016, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.06403068, + "auxiliary_loss_mlp": 0.01266667, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.0125626, + "epoch": 0.8427476326469262, + "flos": 19645710831360.0, + "grad_norm": 1.789200385527246, + "language_loss": 0.7966969, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.87339425, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10394287, + "step": 14017, + "time_per_iteration": 2.473740577697754 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01263517, + "balance_loss_clip": 0.06271141, + "balance_loss_mlp": 0.0125438, + "epoch": 0.8428077558995941, + "flos": 11944882465920.0, + "grad_norm": 2.0996679127374276, + "language_loss": 0.63158822, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.70826501, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09136963, + "step": 14018, + "time_per_iteration": 2.4304590225219727 + }, + { + "auxiliary_loss_clip": 0.06403518, + "auxiliary_loss_mlp": 0.01266777, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.0125789, + "epoch": 0.8428678791522621, + "flos": 10456287955200.0, + "grad_norm": 1.7281845201580097, + "language_loss": 0.79151654, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.86821949, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08886719, + "step": 14019, + "time_per_iteration": 2.447352647781372 + }, + { + "auxiliary_loss_clip": 0.06403257, + "auxiliary_loss_mlp": 0.01265283, + "balance_loss_clip": 0.06270546, + "balance_loss_mlp": 0.01255717, + "epoch": 0.8429280024049302, + "flos": 28774813167360.0, + "grad_norm": 1.7232638375614535, + "language_loss": 0.78435445, + "learning_rate": 2.531817924498265e-07, + "loss": 0.86103988, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09564209, + "step": 14020, + "time_per_iteration": 2.551368474960327 + }, + { + "auxiliary_loss_clip": 0.06403369, + "auxiliary_loss_mlp": 0.01264948, + "balance_loss_clip": 0.06271713, + "balance_loss_mlp": 0.01255417, + "epoch": 0.8429881256575981, + "flos": 19543238887680.0, + "grad_norm": 1.7602731882199467, + "language_loss": 0.71348774, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.79017103, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09527588, + "step": 14021, + "time_per_iteration": 2.4735569953918457 + }, + { + "auxiliary_loss_clip": 0.06406254, + "auxiliary_loss_mlp": 0.01266699, + "balance_loss_clip": 0.06271423, + "balance_loss_mlp": 0.01256441, + "epoch": 0.8430482489102661, + "flos": 24797937409920.0, + "grad_norm": 1.5820497244167908, + "language_loss": 0.69932485, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.77605438, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10253906, + "step": 14022, + "time_per_iteration": 2.5423529148101807 + }, + { + "auxiliary_loss_clip": 0.06404831, + "auxiliary_loss_mlp": 0.01265319, + "balance_loss_clip": 0.06270556, + "balance_loss_mlp": 0.0125552, + "epoch": 0.843108372162934, + "flos": 21550802860800.0, + "grad_norm": 1.7046614195484213, + "language_loss": 0.72680509, + "learning_rate": 2.526131019933553e-07, + "loss": 0.80350661, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09796143, + "step": 14023, + "time_per_iteration": 2.484471559524536 + }, + { + "auxiliary_loss_clip": 0.06401069, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.06270259, + "balance_loss_mlp": 0.01255138, + "epoch": 0.843168495415602, + "flos": 24615816560640.0, + "grad_norm": 1.4810889251875472, + "language_loss": 0.67264724, + "learning_rate": 2.524236710204559e-07, + "loss": 0.74931145, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.10211182, + "step": 14024, + "time_per_iteration": 2.5865228176116943 + }, + { + "auxiliary_loss_clip": 0.06397875, + "auxiliary_loss_mlp": 0.01265022, + "balance_loss_clip": 0.06269605, + "balance_loss_mlp": 0.0125511, + "epoch": 0.8432286186682699, + "flos": 15128216530560.0, + "grad_norm": 2.032466655248574, + "language_loss": 0.81405187, + "learning_rate": 2.522343063158261e-07, + "loss": 0.89068085, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09912109, + "step": 14025, + "time_per_iteration": 2.465604782104492 + }, + { + "auxiliary_loss_clip": 0.0639737, + "auxiliary_loss_mlp": 0.01261603, + "balance_loss_clip": 0.06269414, + "balance_loss_mlp": 0.01253104, + "epoch": 0.843288741920938, + "flos": 20307920048640.0, + "grad_norm": 1.4533964551508662, + "language_loss": 0.77700567, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.85359538, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08508301, + "step": 14026, + "time_per_iteration": 2.53076171875 + }, + { + "auxiliary_loss_clip": 0.06398062, + "auxiliary_loss_mlp": 0.01262456, + "balance_loss_clip": 0.06269979, + "balance_loss_mlp": 0.01253087, + "epoch": 0.8433488651736059, + "flos": 23338958117760.0, + "grad_norm": 1.4725617079093607, + "language_loss": 0.82412767, + "learning_rate": 2.518557757400945e-07, + "loss": 0.90073287, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09375, + "step": 14027, + "time_per_iteration": 2.5195744037628174 + }, + { + "auxiliary_loss_clip": 0.06401826, + "auxiliary_loss_mlp": 0.01262756, + "balance_loss_clip": 0.06271818, + "balance_loss_mlp": 0.01253797, + "epoch": 0.8434089884262739, + "flos": 39467546945280.0, + "grad_norm": 1.6367557813703113, + "language_loss": 0.56320584, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.63985163, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08947754, + "step": 14028, + "time_per_iteration": 2.6699862480163574 + }, + { + "auxiliary_loss_clip": 0.06402962, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06272294, + "balance_loss_mlp": 0.01253933, + "epoch": 0.8434691116789418, + "flos": 23775595344000.0, + "grad_norm": 1.7800167865381953, + "language_loss": 0.64169657, + "learning_rate": 2.51477510323578e-07, + "loss": 0.71835524, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08966064, + "step": 14029, + "time_per_iteration": 2.5012686252593994 + }, + { + "auxiliary_loss_clip": 0.06397776, + "auxiliary_loss_mlp": 0.01263425, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01254949, + "epoch": 0.8435292349316098, + "flos": 22677503587200.0, + "grad_norm": 1.6433020027379726, + "language_loss": 0.75232613, + "learning_rate": 2.51288477067956e-07, + "loss": 0.82893813, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.0847168, + "step": 14030, + "time_per_iteration": 2.5419058799743652 + }, + { + "auxiliary_loss_clip": 0.06398299, + "auxiliary_loss_mlp": 0.01267606, + "balance_loss_clip": 0.06269399, + "balance_loss_mlp": 0.01258075, + "epoch": 0.8435893581842777, + "flos": 18849611589120.0, + "grad_norm": 2.1565835327609406, + "language_loss": 0.83877122, + "learning_rate": 2.510995101236502e-07, + "loss": 0.91543025, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09533691, + "step": 14031, + "time_per_iteration": 2.468385696411133 + }, + { + "auxiliary_loss_clip": 0.06401075, + "auxiliary_loss_mlp": 0.01263467, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.01254586, + "epoch": 0.8436494814369457, + "flos": 20710497790080.0, + "grad_norm": 2.151005653825973, + "language_loss": 0.80558878, + "learning_rate": 2.509106094978266e-07, + "loss": 0.88223422, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08886719, + "step": 14032, + "time_per_iteration": 3.9253792762756348 + }, + { + "auxiliary_loss_clip": 0.06401269, + "auxiliary_loss_mlp": 0.01266295, + "balance_loss_clip": 0.06269183, + "balance_loss_mlp": 0.0125593, + "epoch": 0.8437096046896138, + "flos": 22680731969280.0, + "grad_norm": 1.43708237310059, + "language_loss": 0.75761014, + "learning_rate": 2.507217751976478e-07, + "loss": 0.8342858, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10369873, + "step": 14033, + "time_per_iteration": 2.545506238937378 + }, + { + "auxiliary_loss_clip": 0.06403454, + "auxiliary_loss_mlp": 0.01266807, + "balance_loss_clip": 0.06270887, + "balance_loss_mlp": 0.01258021, + "epoch": 0.8437697279422817, + "flos": 16185666257280.0, + "grad_norm": 1.695610228137136, + "language_loss": 0.83268261, + "learning_rate": 2.505330072302743e-07, + "loss": 0.9093852, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08782959, + "step": 14034, + "time_per_iteration": 2.5694990158081055 + }, + { + "auxiliary_loss_clip": 0.06401746, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06269741, + "balance_loss_mlp": 0.01254061, + "epoch": 0.8438298511949497, + "flos": 28773178012800.0, + "grad_norm": 1.4341877550440127, + "language_loss": 0.78500712, + "learning_rate": 2.503443056028656e-07, + "loss": 0.86166364, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09851074, + "step": 14035, + "time_per_iteration": 2.603475332260132 + }, + { + "auxiliary_loss_clip": 0.06401128, + "auxiliary_loss_mlp": 0.01261261, + "balance_loss_clip": 0.06270442, + "balance_loss_mlp": 0.01252035, + "epoch": 0.8438899744476176, + "flos": 33731837660160.0, + "grad_norm": 1.4118924926688545, + "language_loss": 0.72302711, + "learning_rate": 2.501556703225751e-07, + "loss": 0.79965097, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09234619, + "step": 14036, + "time_per_iteration": 2.618654727935791 + }, + { + "auxiliary_loss_clip": 0.06396312, + "auxiliary_loss_mlp": 0.01261207, + "balance_loss_clip": 0.06269594, + "balance_loss_mlp": 0.01252868, + "epoch": 0.8439500977002856, + "flos": 25116421979520.0, + "grad_norm": 1.6362343480396115, + "language_loss": 0.70156783, + "learning_rate": 2.49967101396557e-07, + "loss": 0.77814305, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08331299, + "step": 14037, + "time_per_iteration": 2.5106723308563232 + }, + { + "auxiliary_loss_clip": 0.06399255, + "auxiliary_loss_mlp": 0.01264455, + "balance_loss_clip": 0.0627047, + "balance_loss_mlp": 0.01256098, + "epoch": 0.8440102209529535, + "flos": 32858060083200.0, + "grad_norm": 1.571189244416603, + "language_loss": 0.69434804, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.77098513, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08355713, + "step": 14038, + "time_per_iteration": 2.588937282562256 + }, + { + "auxiliary_loss_clip": 0.0640436, + "auxiliary_loss_mlp": 0.01263875, + "balance_loss_clip": 0.06271221, + "balance_loss_mlp": 0.0125463, + "epoch": 0.8440703442056215, + "flos": 23736588468480.0, + "grad_norm": 1.525634873049396, + "language_loss": 0.76716536, + "learning_rate": 2.49590162635938e-07, + "loss": 0.84384775, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09240723, + "step": 14039, + "time_per_iteration": 2.5490803718566895 + }, + { + "auxiliary_loss_clip": 0.06412183, + "auxiliary_loss_mlp": 0.01262445, + "balance_loss_clip": 0.062753, + "balance_loss_mlp": 0.01252646, + "epoch": 0.8441304674582895, + "flos": 20199955662720.0, + "grad_norm": 1.8775468369698345, + "language_loss": 0.79449338, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.87123966, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09796143, + "step": 14040, + "time_per_iteration": 2.4884471893310547 + }, + { + "auxiliary_loss_clip": 0.0640001, + "auxiliary_loss_mlp": 0.01266389, + "balance_loss_clip": 0.06269734, + "balance_loss_mlp": 0.01256513, + "epoch": 0.8441905907109575, + "flos": 20224413855360.0, + "grad_norm": 1.8433585006655098, + "language_loss": 0.69202292, + "learning_rate": 2.492134893781821e-07, + "loss": 0.76868689, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09866333, + "step": 14041, + "time_per_iteration": 2.4893062114715576 + }, + { + "auxiliary_loss_clip": 0.06408129, + "auxiliary_loss_mlp": 0.01265821, + "balance_loss_clip": 0.06273414, + "balance_loss_mlp": 0.01255491, + "epoch": 0.8442507139636254, + "flos": 13521511290240.0, + "grad_norm": 1.7054295527425734, + "language_loss": 0.68817204, + "learning_rate": 2.490252523307341e-07, + "loss": 0.76491153, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10327148, + "step": 14042, + "time_per_iteration": 2.4641237258911133 + }, + { + "auxiliary_loss_clip": 0.0639908, + "auxiliary_loss_mlp": 0.01266235, + "balance_loss_clip": 0.06270715, + "balance_loss_mlp": 0.01256871, + "epoch": 0.8443108372162934, + "flos": 18225570706560.0, + "grad_norm": 1.5510354554393648, + "language_loss": 0.75078881, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.82744193, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09356689, + "step": 14043, + "time_per_iteration": 3.892390489578247 + }, + { + "auxiliary_loss_clip": 0.06400645, + "auxiliary_loss_mlp": 0.01261977, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8443709604689613, + "flos": 16110293909760.0, + "grad_norm": 2.1613590719043003, + "language_loss": 0.72651005, + "learning_rate": 2.486489774343865e-07, + "loss": 0.80313635, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08831787, + "step": 14044, + "time_per_iteration": 2.4726979732513428 + }, + { + "auxiliary_loss_clip": 0.06397988, + "auxiliary_loss_mlp": 0.01263562, + "balance_loss_clip": 0.06269136, + "balance_loss_mlp": 0.01254932, + "epoch": 0.8444310837216293, + "flos": 18517542658560.0, + "grad_norm": 1.454592931872587, + "language_loss": 0.74902761, + "learning_rate": 2.484609395997559e-07, + "loss": 0.82564312, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08630371, + "step": 14045, + "time_per_iteration": 2.5141093730926514 + }, + { + "auxiliary_loss_clip": 0.06400928, + "auxiliary_loss_mlp": 0.01266482, + "balance_loss_clip": 0.06270893, + "balance_loss_mlp": 0.01257339, + "epoch": 0.8444912069742974, + "flos": 14945215213440.0, + "grad_norm": 1.9915649249395384, + "language_loss": 0.78878438, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.86545849, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09155273, + "step": 14046, + "time_per_iteration": 2.4441287517547607 + }, + { + "auxiliary_loss_clip": 0.06403919, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01255948, + "epoch": 0.8445513302269653, + "flos": 20126470032000.0, + "grad_norm": 1.93814940449734, + "language_loss": 0.78215307, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.85885251, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10076904, + "step": 14047, + "time_per_iteration": 3.947803258895874 + }, + { + "auxiliary_loss_clip": 0.06400177, + "auxiliary_loss_mlp": 0.01267116, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.0125786, + "epoch": 0.8446114534796333, + "flos": 31178162701440.0, + "grad_norm": 1.7964123097724451, + "language_loss": 0.72113055, + "learning_rate": 2.478972246355935e-07, + "loss": 0.7978034, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.0925293, + "step": 14048, + "time_per_iteration": 2.5795657634735107 + }, + { + "auxiliary_loss_clip": 0.06403403, + "auxiliary_loss_mlp": 0.01265353, + "balance_loss_clip": 0.06272613, + "balance_loss_mlp": 0.01255697, + "epoch": 0.8446715767323012, + "flos": 23954613592320.0, + "grad_norm": 1.3616000745091086, + "language_loss": 0.73144412, + "learning_rate": 2.477094525178667e-07, + "loss": 0.80813169, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09661865, + "step": 14049, + "time_per_iteration": 3.9288156032562256 + }, + { + "auxiliary_loss_clip": 0.0630594, + "auxiliary_loss_mlp": 0.0125014, + "balance_loss_clip": 0.06251055, + "balance_loss_mlp": 0.01249117, + "epoch": 0.8447316999849692, + "flos": 68004362989440.0, + "grad_norm": 0.7905781903446938, + "language_loss": 0.60587054, + "learning_rate": 2.475217468471729e-07, + "loss": 0.68143135, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01023102, + "step": 14050, + "time_per_iteration": 3.077780246734619 + }, + { + "auxiliary_loss_clip": 0.06402567, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06271556, + "balance_loss_mlp": 0.01253938, + "epoch": 0.8447918232376371, + "flos": 22425460905600.0, + "grad_norm": 2.519523289840615, + "language_loss": 0.72404873, + "learning_rate": 2.473341076306303e-07, + "loss": 0.80071664, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10284424, + "step": 14051, + "time_per_iteration": 2.556217670440674 + }, + { + "auxiliary_loss_clip": 0.06396311, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06267892, + "balance_loss_mlp": 0.01257243, + "epoch": 0.8448519464903052, + "flos": 23700600339840.0, + "grad_norm": 1.9626022777584542, + "language_loss": 0.74592292, + "learning_rate": 2.471465348753547e-07, + "loss": 0.82254827, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08984375, + "step": 14052, + "time_per_iteration": 2.5140316486358643 + }, + { + "auxiliary_loss_clip": 0.06395899, + "auxiliary_loss_mlp": 0.01266351, + "balance_loss_clip": 0.06272222, + "balance_loss_mlp": 0.01257941, + "epoch": 0.8449120697429731, + "flos": 13741087714560.0, + "grad_norm": 1.5692386664403404, + "language_loss": 0.73870707, + "learning_rate": 2.469590285884575e-07, + "loss": 0.81532955, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.08404541, + "step": 14053, + "time_per_iteration": 2.5562212467193604 + }, + { + "auxiliary_loss_clip": 0.06402231, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06272172, + "balance_loss_mlp": 0.01256235, + "epoch": 0.8449721929956411, + "flos": 20893121763840.0, + "grad_norm": 1.5720536659104367, + "language_loss": 0.74138618, + "learning_rate": 2.467715887770494e-07, + "loss": 0.81806374, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09283447, + "step": 14054, + "time_per_iteration": 2.499558687210083 + }, + { + "auxiliary_loss_clip": 0.06406872, + "auxiliary_loss_mlp": 0.01263984, + "balance_loss_clip": 0.06270154, + "balance_loss_mlp": 0.01253863, + "epoch": 0.845032316248309, + "flos": 33224985112320.0, + "grad_norm": 1.3467293957479496, + "language_loss": 0.78394425, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.86065292, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10119629, + "step": 14055, + "time_per_iteration": 2.6246414184570312 + }, + { + "auxiliary_loss_clip": 0.06395009, + "auxiliary_loss_mlp": 0.01266193, + "balance_loss_clip": 0.06266758, + "balance_loss_mlp": 0.01257604, + "epoch": 0.845092439500977, + "flos": 23591755486080.0, + "grad_norm": 1.544566635839548, + "language_loss": 0.73342294, + "learning_rate": 2.463969086091302e-07, + "loss": 0.81003493, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08587646, + "step": 14056, + "time_per_iteration": 2.508028030395508 + }, + { + "auxiliary_loss_clip": 0.06407695, + "auxiliary_loss_mlp": 0.01264647, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01254777, + "epoch": 0.8451525627536449, + "flos": 13338929243520.0, + "grad_norm": 2.1863869456022647, + "language_loss": 0.68351102, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.76023448, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09881592, + "step": 14057, + "time_per_iteration": 2.548752546310425 + }, + { + "auxiliary_loss_clip": 0.06399477, + "auxiliary_loss_mlp": 0.0126254, + "balance_loss_clip": 0.06268546, + "balance_loss_mlp": 0.0125292, + "epoch": 0.8452126860063129, + "flos": 27825285899520.0, + "grad_norm": 1.5760714164083998, + "language_loss": 0.77413702, + "learning_rate": 2.460224944284284e-07, + "loss": 0.85075724, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09625244, + "step": 14058, + "time_per_iteration": 2.5370140075683594 + }, + { + "auxiliary_loss_clip": 0.06404024, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01254802, + "epoch": 0.845272809258981, + "flos": 27131868236160.0, + "grad_norm": 1.5270727793259906, + "language_loss": 0.69999516, + "learning_rate": 2.45835387101033e-07, + "loss": 0.77667671, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09320068, + "step": 14059, + "time_per_iteration": 2.5480189323425293 + }, + { + "auxiliary_loss_clip": 0.06407374, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 0.0627005, + "balance_loss_mlp": 0.01251961, + "epoch": 0.8453329325116489, + "flos": 18338440556160.0, + "grad_norm": 2.540996267685051, + "language_loss": 0.57944226, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.65613896, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10339355, + "step": 14060, + "time_per_iteration": 2.481928586959839 + }, + { + "auxiliary_loss_clip": 0.0640597, + "auxiliary_loss_mlp": 0.01263749, + "balance_loss_clip": 0.06269474, + "balance_loss_mlp": 0.01252841, + "epoch": 0.8453930557643169, + "flos": 22681989780480.0, + "grad_norm": 1.4782194608801338, + "language_loss": 0.75907153, + "learning_rate": 2.454613720076277e-07, + "loss": 0.8357687, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10900879, + "step": 14061, + "time_per_iteration": 2.488678455352783 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.0126313, + "balance_loss_clip": 0.06268848, + "balance_loss_mlp": 0.01253194, + "epoch": 0.8454531790169848, + "flos": 22493034823680.0, + "grad_norm": 3.6244102921260004, + "language_loss": 0.71058381, + "learning_rate": 2.452744642558013e-07, + "loss": 0.78725052, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09942627, + "step": 14062, + "time_per_iteration": 2.4787416458129883 + }, + { + "auxiliary_loss_clip": 0.06312045, + "auxiliary_loss_mlp": 0.01252111, + "balance_loss_clip": 0.06256789, + "balance_loss_mlp": 0.01251069, + "epoch": 0.8455133022696528, + "flos": 58295383672320.0, + "grad_norm": 0.6264898637302231, + "language_loss": 0.52687728, + "learning_rate": 2.450876230433432e-07, + "loss": 0.60251892, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.01042175, + "step": 14063, + "time_per_iteration": 3.193988800048828 + }, + { + "auxiliary_loss_clip": 0.06397031, + "auxiliary_loss_mlp": 0.01265361, + "balance_loss_clip": 0.06271092, + "balance_loss_mlp": 0.01257398, + "epoch": 0.8455734255223207, + "flos": 21367717689600.0, + "grad_norm": 1.6737838739239328, + "language_loss": 0.82301968, + "learning_rate": 2.449008483773378e-07, + "loss": 0.8996436, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.07965088, + "step": 14064, + "time_per_iteration": 2.4716007709503174 + }, + { + "auxiliary_loss_clip": 0.06409873, + "auxiliary_loss_mlp": 0.0126423, + "balance_loss_clip": 0.06275783, + "balance_loss_mlp": 0.01254872, + "epoch": 0.8456335487749888, + "flos": 20455562142720.0, + "grad_norm": 2.438952619320042, + "language_loss": 0.72705638, + "learning_rate": 2.447141402648685e-07, + "loss": 0.80379742, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09356689, + "step": 14065, + "time_per_iteration": 2.486729383468628 + }, + { + "auxiliary_loss_clip": 0.06397683, + "auxiliary_loss_mlp": 0.0126263, + "balance_loss_clip": 0.06270014, + "balance_loss_mlp": 0.01254196, + "epoch": 0.8456936720276567, + "flos": 28848592287360.0, + "grad_norm": 1.4053294681947734, + "language_loss": 0.77431583, + "learning_rate": 2.445274987130146e-07, + "loss": 0.85091895, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08435059, + "step": 14066, + "time_per_iteration": 2.5918047428131104 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01262305, + "balance_loss_clip": 0.06271371, + "balance_loss_mlp": 0.01252649, + "epoch": 0.8457537952803247, + "flos": 22679222595840.0, + "grad_norm": 1.4389859181784144, + "language_loss": 0.70042717, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.77707636, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09661865, + "step": 14067, + "time_per_iteration": 2.525247097015381 + }, + { + "auxiliary_loss_clip": 0.06396677, + "auxiliary_loss_mlp": 0.01263949, + "balance_loss_clip": 0.06268427, + "balance_loss_mlp": 0.01254717, + "epoch": 0.8458139185329926, + "flos": 33811444638720.0, + "grad_norm": 1.616550126073105, + "language_loss": 0.71155679, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.78816307, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09234619, + "step": 14068, + "time_per_iteration": 2.6183526515960693 + }, + { + "auxiliary_loss_clip": 0.06309339, + "auxiliary_loss_mlp": 0.01250851, + "balance_loss_clip": 0.06254174, + "balance_loss_mlp": 0.01249894, + "epoch": 0.8458740417856606, + "flos": 70317860618880.0, + "grad_norm": 0.9869339045259047, + "language_loss": 0.60466254, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.68026447, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.009552, + "step": 14069, + "time_per_iteration": 3.223912000656128 + }, + { + "auxiliary_loss_clip": 0.06405959, + "auxiliary_loss_mlp": 0.01263164, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01254276, + "epoch": 0.8459341650383285, + "flos": 24177795742080.0, + "grad_norm": 1.5100814720997062, + "language_loss": 0.7470544, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.82374561, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08886719, + "step": 14070, + "time_per_iteration": 2.5450565814971924 + }, + { + "auxiliary_loss_clip": 0.064023, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01256013, + "epoch": 0.8459942882909965, + "flos": 38190395013120.0, + "grad_norm": 1.6691276484821116, + "language_loss": 0.67298388, + "learning_rate": 2.435952896106039e-07, + "loss": 0.74966717, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.10009766, + "step": 14071, + "time_per_iteration": 4.11489462852478 + }, + { + "auxiliary_loss_clip": 0.06311657, + "auxiliary_loss_mlp": 0.01250821, + "balance_loss_clip": 0.06256663, + "balance_loss_mlp": 0.01249876, + "epoch": 0.8460544115436646, + "flos": 64137212553600.0, + "grad_norm": 0.7266466242386742, + "language_loss": 0.61095023, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.686575, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00943756, + "step": 14072, + "time_per_iteration": 2.9876792430877686 + }, + { + "auxiliary_loss_clip": 0.06405033, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06271338, + "balance_loss_mlp": 0.01254531, + "epoch": 0.8461145347963325, + "flos": 24177753815040.0, + "grad_norm": 2.184634062710798, + "language_loss": 0.72637683, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.80307543, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10296631, + "step": 14073, + "time_per_iteration": 2.5138275623321533 + }, + { + "auxiliary_loss_clip": 0.06410398, + "auxiliary_loss_mlp": 0.01263688, + "balance_loss_clip": 0.06270458, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8461746580490005, + "flos": 34901863747200.0, + "grad_norm": 1.7949530900019746, + "language_loss": 0.78191227, + "learning_rate": 2.430367633291155e-07, + "loss": 0.85865319, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.10546875, + "step": 14074, + "time_per_iteration": 2.619873046875 + }, + { + "auxiliary_loss_clip": 0.064037, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06272943, + "balance_loss_mlp": 0.01253759, + "epoch": 0.8462347813016684, + "flos": 25564127944320.0, + "grad_norm": 3.010228780430648, + "language_loss": 0.75585461, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.8325218, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.0925293, + "step": 14075, + "time_per_iteration": 2.5305089950561523 + }, + { + "auxiliary_loss_clip": 0.06402498, + "auxiliary_loss_mlp": 0.0126512, + "balance_loss_clip": 0.06272259, + "balance_loss_mlp": 0.01255554, + "epoch": 0.8462949045543364, + "flos": 21331855342080.0, + "grad_norm": 2.8956830830227607, + "language_loss": 0.72880858, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.80548477, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09564209, + "step": 14076, + "time_per_iteration": 2.5211126804351807 + }, + { + "auxiliary_loss_clip": 0.06409035, + "auxiliary_loss_mlp": 0.01266766, + "balance_loss_clip": 0.06273739, + "balance_loss_mlp": 0.01256705, + "epoch": 0.8463550278070043, + "flos": 22643947226880.0, + "grad_norm": 1.8142263370296956, + "language_loss": 0.77469641, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.85145444, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10058594, + "step": 14077, + "time_per_iteration": 2.4927358627319336 + }, + { + "auxiliary_loss_clip": 0.0640869, + "auxiliary_loss_mlp": 0.01265288, + "balance_loss_clip": 0.06273301, + "balance_loss_mlp": 0.012554, + "epoch": 0.8464151510596724, + "flos": 13010549892480.0, + "grad_norm": 1.9163242247942687, + "language_loss": 0.75092995, + "learning_rate": 2.422929943924643e-07, + "loss": 0.82766974, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09887695, + "step": 14078, + "time_per_iteration": 2.4891517162323 + }, + { + "auxiliary_loss_clip": 0.06398796, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01255316, + "epoch": 0.8464752743123403, + "flos": 15710231790720.0, + "grad_norm": 2.9876674327438026, + "language_loss": 0.85306883, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.92970717, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.097229, + "step": 14079, + "time_per_iteration": 2.46020770072937 + }, + { + "auxiliary_loss_clip": 0.06414415, + "auxiliary_loss_mlp": 0.01265782, + "balance_loss_clip": 0.06273301, + "balance_loss_mlp": 0.01254928, + "epoch": 0.8465353975650083, + "flos": 21660570109440.0, + "grad_norm": 2.4202133336595826, + "language_loss": 0.58745563, + "learning_rate": 2.419215098104965e-07, + "loss": 0.66425759, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.10870361, + "step": 14080, + "time_per_iteration": 2.520763635635376 + }, + { + "auxiliary_loss_clip": 0.06408149, + "auxiliary_loss_mlp": 0.01263994, + "balance_loss_clip": 0.06270742, + "balance_loss_mlp": 0.01253796, + "epoch": 0.8465955208176762, + "flos": 18521651508480.0, + "grad_norm": 1.9002618050268867, + "language_loss": 0.6564846, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.73320603, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10198975, + "step": 14081, + "time_per_iteration": 2.4694995880126953 + }, + { + "auxiliary_loss_clip": 0.06404518, + "auxiliary_loss_mlp": 0.01262511, + "balance_loss_clip": 0.06271017, + "balance_loss_mlp": 0.01253815, + "epoch": 0.8466556440703442, + "flos": 24206362784640.0, + "grad_norm": 1.741929690841942, + "language_loss": 0.73086697, + "learning_rate": 2.41550291894576e-07, + "loss": 0.80753726, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.08691406, + "step": 14082, + "time_per_iteration": 2.5245912075042725 + }, + { + "auxiliary_loss_clip": 0.0640555, + "auxiliary_loss_mlp": 0.01262022, + "balance_loss_clip": 0.06270213, + "balance_loss_mlp": 0.01252503, + "epoch": 0.8467157673230121, + "flos": 20382118439040.0, + "grad_norm": 5.9029687604683945, + "language_loss": 0.76243949, + "learning_rate": 2.413647829539809e-07, + "loss": 0.8391152, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09515381, + "step": 14083, + "time_per_iteration": 3.958021879196167 + }, + { + "auxiliary_loss_clip": 0.06404339, + "auxiliary_loss_mlp": 0.01269365, + "balance_loss_clip": 0.0626808, + "balance_loss_mlp": 0.01259113, + "epoch": 0.8467758905756801, + "flos": 28480870644480.0, + "grad_norm": 1.8273205637866814, + "language_loss": 0.66057962, + "learning_rate": 2.411793407010092e-07, + "loss": 0.73731661, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10247803, + "step": 14084, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.06403982, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06272845, + "balance_loss_mlp": 0.01256835, + "epoch": 0.8468360138283482, + "flos": 11697367904640.0, + "grad_norm": 1.9024447155732727, + "language_loss": 0.70089591, + "learning_rate": 2.409939651426938e-07, + "loss": 0.77760088, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09680176, + "step": 14085, + "time_per_iteration": 2.5116045475006104 + }, + { + "auxiliary_loss_clip": 0.06401011, + "auxiliary_loss_mlp": 0.01263688, + "balance_loss_clip": 0.06269903, + "balance_loss_mlp": 0.01254396, + "epoch": 0.8468961370810161, + "flos": 24614726457600.0, + "grad_norm": 1.582597620873215, + "language_loss": 0.7123071, + "learning_rate": 2.408086562860634e-07, + "loss": 0.78895414, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09295654, + "step": 14086, + "time_per_iteration": 2.5062472820281982 + }, + { + "auxiliary_loss_clip": 0.06402152, + "auxiliary_loss_mlp": 0.01265202, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01255951, + "epoch": 0.8469562603336841, + "flos": 19615927904640.0, + "grad_norm": 2.0212942405255347, + "language_loss": 0.75401855, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.83069211, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09240723, + "step": 14087, + "time_per_iteration": 3.9551570415496826 + }, + { + "auxiliary_loss_clip": 0.06400134, + "auxiliary_loss_mlp": 0.01265984, + "balance_loss_clip": 0.06269534, + "balance_loss_mlp": 0.01256445, + "epoch": 0.847016383586352, + "flos": 22645708162560.0, + "grad_norm": 1.342825997114302, + "language_loss": 0.73916817, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.8158294, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0954895, + "step": 14088, + "time_per_iteration": 2.540492296218872 + }, + { + "auxiliary_loss_clip": 0.0640047, + "auxiliary_loss_mlp": 0.01265277, + "balance_loss_clip": 0.06268281, + "balance_loss_mlp": 0.01255275, + "epoch": 0.84707650683902, + "flos": 20966565467520.0, + "grad_norm": 2.1758547876889405, + "language_loss": 0.72225606, + "learning_rate": 2.402531299965387e-07, + "loss": 0.79891354, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10003662, + "step": 14089, + "time_per_iteration": 3.8723671436309814 + }, + { + "auxiliary_loss_clip": 0.06396633, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.01255677, + "epoch": 0.8471366300916879, + "flos": 24099111158400.0, + "grad_norm": 1.5614948588231485, + "language_loss": 0.79447126, + "learning_rate": 2.400680880168928e-07, + "loss": 0.87108254, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.0881958, + "step": 14090, + "time_per_iteration": 2.5186121463775635 + }, + { + "auxiliary_loss_clip": 0.064051, + "auxiliary_loss_mlp": 0.01271247, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01260507, + "epoch": 0.847196753344356, + "flos": 18338817899520.0, + "grad_norm": 2.1681555308129163, + "language_loss": 0.77695274, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.85371625, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10742188, + "step": 14091, + "time_per_iteration": 2.4776766300201416 + }, + { + "auxiliary_loss_clip": 0.06313801, + "auxiliary_loss_mlp": 0.01249423, + "balance_loss_clip": 0.06258924, + "balance_loss_mlp": 0.01248393, + "epoch": 0.8472568765970239, + "flos": 49585252550400.0, + "grad_norm": 0.8022713224368199, + "language_loss": 0.59404254, + "learning_rate": 2.396982042749982e-07, + "loss": 0.66967475, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01030731, + "step": 14092, + "time_per_iteration": 3.243363380432129 + }, + { + "auxiliary_loss_clip": 0.06401625, + "auxiliary_loss_mlp": 0.01266586, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01256471, + "epoch": 0.8473169998496919, + "flos": 19284739441920.0, + "grad_norm": 1.7230869725009348, + "language_loss": 0.70479727, + "learning_rate": 2.395133625267756e-07, + "loss": 0.78147936, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10119629, + "step": 14093, + "time_per_iteration": 2.554523229598999 + }, + { + "auxiliary_loss_clip": 0.0640064, + "auxiliary_loss_mlp": 0.01262162, + "balance_loss_clip": 0.0627271, + "balance_loss_mlp": 0.01253358, + "epoch": 0.8473771231023598, + "flos": 17681262583680.0, + "grad_norm": 2.1470167593348, + "language_loss": 0.83683729, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.91346526, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0880127, + "step": 14094, + "time_per_iteration": 2.474327564239502 + }, + { + "auxiliary_loss_clip": 0.06397246, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06270628, + "balance_loss_mlp": 0.01257237, + "epoch": 0.8474372463550278, + "flos": 26367019367040.0, + "grad_norm": 1.5654273666716596, + "language_loss": 0.7183401, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.79497892, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09399414, + "step": 14095, + "time_per_iteration": 2.559675693511963 + }, + { + "auxiliary_loss_clip": 0.06399059, + "auxiliary_loss_mlp": 0.01265629, + "balance_loss_clip": 0.06270283, + "balance_loss_mlp": 0.0125589, + "epoch": 0.8474973696076957, + "flos": 23408418752640.0, + "grad_norm": 1.6128422152605608, + "language_loss": 0.80883193, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.88547873, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09735107, + "step": 14096, + "time_per_iteration": 2.4910190105438232 + }, + { + "auxiliary_loss_clip": 0.06410886, + "auxiliary_loss_mlp": 0.01267989, + "balance_loss_clip": 0.06274761, + "balance_loss_mlp": 0.01257279, + "epoch": 0.8475574928603637, + "flos": 25081523953920.0, + "grad_norm": 1.7270068866988848, + "language_loss": 0.77507085, + "learning_rate": 2.387746631822374e-07, + "loss": 0.85185957, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10699463, + "step": 14097, + "time_per_iteration": 2.5406811237335205 + }, + { + "auxiliary_loss_clip": 0.06399789, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01258042, + "epoch": 0.8476176161130318, + "flos": 19971532632960.0, + "grad_norm": 1.8020847692391104, + "language_loss": 0.80530119, + "learning_rate": 2.385901552932048e-07, + "loss": 0.88196719, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08770752, + "step": 14098, + "time_per_iteration": 2.486926794052124 + }, + { + "auxiliary_loss_clip": 0.06402344, + "auxiliary_loss_mlp": 0.01268018, + "balance_loss_clip": 0.06272727, + "balance_loss_mlp": 0.01258267, + "epoch": 0.8476777393656997, + "flos": 21291842217600.0, + "grad_norm": 1.9132060530933808, + "language_loss": 0.72118181, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.79788542, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09753418, + "step": 14099, + "time_per_iteration": 2.5139384269714355 + }, + { + "auxiliary_loss_clip": 0.06402131, + "auxiliary_loss_mlp": 0.01262911, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01252558, + "epoch": 0.8477378626183677, + "flos": 29979276082560.0, + "grad_norm": 1.8485239738364325, + "language_loss": 0.63567179, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.71232224, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10351562, + "step": 14100, + "time_per_iteration": 2.56445574760437 + }, + { + "auxiliary_loss_clip": 0.06405117, + "auxiliary_loss_mlp": 0.01263495, + "balance_loss_clip": 0.06270204, + "balance_loss_mlp": 0.01252962, + "epoch": 0.8477979858710356, + "flos": 24243650651520.0, + "grad_norm": 2.126244455885968, + "language_loss": 0.73909217, + "learning_rate": 2.380370324111085e-07, + "loss": 0.81577832, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10534668, + "step": 14101, + "time_per_iteration": 2.529759645462036 + }, + { + "auxiliary_loss_clip": 0.0640009, + "auxiliary_loss_mlp": 0.01263117, + "balance_loss_clip": 0.06269788, + "balance_loss_mlp": 0.01253777, + "epoch": 0.8478581091237036, + "flos": 25600828832640.0, + "grad_norm": 1.4947198559415165, + "language_loss": 0.71708381, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.79371595, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09344482, + "step": 14102, + "time_per_iteration": 2.539574384689331 + }, + { + "auxiliary_loss_clip": 0.06408991, + "auxiliary_loss_mlp": 0.0126655, + "balance_loss_clip": 0.06274236, + "balance_loss_mlp": 0.01256101, + "epoch": 0.8479182323763715, + "flos": 12061945019520.0, + "grad_norm": 2.418221007739104, + "language_loss": 0.82366699, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.90042239, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10455322, + "step": 14103, + "time_per_iteration": 2.481079339981079 + }, + { + "auxiliary_loss_clip": 0.06399621, + "auxiliary_loss_mlp": 0.01261485, + "balance_loss_clip": 0.06270504, + "balance_loss_mlp": 0.01252049, + "epoch": 0.8479783556290396, + "flos": 21439693946880.0, + "grad_norm": 2.033398222504212, + "language_loss": 0.78817004, + "learning_rate": 2.374845108533079e-07, + "loss": 0.86478114, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09429932, + "step": 14104, + "time_per_iteration": 2.490394353866577 + }, + { + "auxiliary_loss_clip": 0.06407318, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06273159, + "balance_loss_mlp": 0.01255035, + "epoch": 0.8480384788817075, + "flos": 19648142599680.0, + "grad_norm": 2.0490312403076114, + "language_loss": 0.79098284, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.86770785, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10144043, + "step": 14105, + "time_per_iteration": 2.485868215560913 + }, + { + "auxiliary_loss_clip": 0.06410661, + "auxiliary_loss_mlp": 0.01267423, + "balance_loss_clip": 0.06273758, + "balance_loss_mlp": 0.01256569, + "epoch": 0.8480986021343755, + "flos": 22495298883840.0, + "grad_norm": 1.5957177290166866, + "language_loss": 0.50232506, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.57910585, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10858154, + "step": 14106, + "time_per_iteration": 2.5024311542510986 + }, + { + "auxiliary_loss_clip": 0.06401025, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06269896, + "balance_loss_mlp": 0.0125577, + "epoch": 0.8481587253870434, + "flos": 22097039627520.0, + "grad_norm": 1.768185108702469, + "language_loss": 0.7552582, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.83192235, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09619141, + "step": 14107, + "time_per_iteration": 2.514000177383423 + }, + { + "auxiliary_loss_clip": 0.06406081, + "auxiliary_loss_mlp": 0.01266176, + "balance_loss_clip": 0.06274906, + "balance_loss_mlp": 0.01257152, + "epoch": 0.8482188486397114, + "flos": 33590945819520.0, + "grad_norm": 2.8502892293190585, + "language_loss": 0.73806465, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.81478727, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09020996, + "step": 14108, + "time_per_iteration": 2.6308159828186035 + }, + { + "auxiliary_loss_clip": 0.06397291, + "auxiliary_loss_mlp": 0.01266603, + "balance_loss_clip": 0.06270851, + "balance_loss_mlp": 0.01256577, + "epoch": 0.8482789718923793, + "flos": 20925084896640.0, + "grad_norm": 4.392299515879854, + "language_loss": 0.72917706, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.80581594, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.10028076, + "step": 14109, + "time_per_iteration": 2.492094039916992 + }, + { + "auxiliary_loss_clip": 0.063987, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.06269309, + "balance_loss_mlp": 0.01255335, + "epoch": 0.8483390951450474, + "flos": 12901159987200.0, + "grad_norm": 2.2274280206799904, + "language_loss": 0.74444723, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.82108402, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09643555, + "step": 14110, + "time_per_iteration": 2.4547863006591797 + }, + { + "auxiliary_loss_clip": 0.06399868, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06268494, + "balance_loss_mlp": 0.01259289, + "epoch": 0.8483992183977154, + "flos": 25088483842560.0, + "grad_norm": 1.5964367231590322, + "language_loss": 0.76417547, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.84086204, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0949707, + "step": 14111, + "time_per_iteration": 3.9820806980133057 + }, + { + "auxiliary_loss_clip": 0.06399922, + "auxiliary_loss_mlp": 0.01266512, + "balance_loss_clip": 0.06271142, + "balance_loss_mlp": 0.01257541, + "epoch": 0.8484593416503833, + "flos": 25564631068800.0, + "grad_norm": 1.5630772359474336, + "language_loss": 0.67188197, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.7485463, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08966064, + "step": 14112, + "time_per_iteration": 2.5049498081207275 + }, + { + "auxiliary_loss_clip": 0.06402437, + "auxiliary_loss_mlp": 0.0126693, + "balance_loss_clip": 0.06270389, + "balance_loss_mlp": 0.01257644, + "epoch": 0.8485194649030513, + "flos": 27205773137280.0, + "grad_norm": 1.5196653604706423, + "language_loss": 0.7372402, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.81393391, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09289551, + "step": 14113, + "time_per_iteration": 2.5452187061309814 + }, + { + "auxiliary_loss_clip": 0.0639898, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 0.06269895, + "balance_loss_mlp": 0.01259228, + "epoch": 0.8485795881557192, + "flos": 24212609913600.0, + "grad_norm": 1.8417781889365228, + "language_loss": 0.66789317, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.74456829, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09289551, + "step": 14114, + "time_per_iteration": 2.533439874649048 + }, + { + "auxiliary_loss_clip": 0.06404068, + "auxiliary_loss_mlp": 0.0126426, + "balance_loss_clip": 0.06271428, + "balance_loss_mlp": 0.01254616, + "epoch": 0.8486397114083872, + "flos": 21147931630080.0, + "grad_norm": 1.5901930956565895, + "language_loss": 0.7938953, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.87057859, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09643555, + "step": 14115, + "time_per_iteration": 2.4947285652160645 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01264775, + "balance_loss_clip": 0.06271269, + "balance_loss_mlp": 0.0125503, + "epoch": 0.8486998346610551, + "flos": 19980966216960.0, + "grad_norm": 1.8982053522036084, + "language_loss": 0.79270887, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.86939907, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09747314, + "step": 14116, + "time_per_iteration": 2.4848196506500244 + }, + { + "auxiliary_loss_clip": 0.06406476, + "auxiliary_loss_mlp": 0.01264395, + "balance_loss_clip": 0.06271321, + "balance_loss_mlp": 0.01254394, + "epoch": 0.8487599579137232, + "flos": 19798468024320.0, + "grad_norm": 2.1065592476865045, + "language_loss": 0.68344235, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.76015103, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10003662, + "step": 14117, + "time_per_iteration": 2.5438575744628906 + }, + { + "auxiliary_loss_clip": 0.06404263, + "auxiliary_loss_mlp": 0.01263508, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01254144, + "epoch": 0.8488200811663911, + "flos": 26403259057920.0, + "grad_norm": 2.1344254269522653, + "language_loss": 0.649701, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.72637874, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09362793, + "step": 14118, + "time_per_iteration": 2.5292701721191406 + }, + { + "auxiliary_loss_clip": 0.06402715, + "auxiliary_loss_mlp": 0.01263736, + "balance_loss_clip": 0.06272824, + "balance_loss_mlp": 0.012551, + "epoch": 0.8488802044190591, + "flos": 16364307162240.0, + "grad_norm": 1.5920337114960288, + "language_loss": 0.73305792, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.80972242, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08636475, + "step": 14119, + "time_per_iteration": 2.5488085746765137 + }, + { + "auxiliary_loss_clip": 0.06405111, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06273293, + "balance_loss_mlp": 0.01255017, + "epoch": 0.848940327671727, + "flos": 19214985317760.0, + "grad_norm": 1.735285321727865, + "language_loss": 0.78245997, + "learning_rate": 2.345478926864446e-07, + "loss": 0.85916287, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10168457, + "step": 14120, + "time_per_iteration": 2.4802494049072266 + }, + { + "auxiliary_loss_clip": 0.0640521, + "auxiliary_loss_mlp": 0.01261862, + "balance_loss_clip": 0.06270778, + "balance_loss_mlp": 0.01251956, + "epoch": 0.849000450924395, + "flos": 21877547057280.0, + "grad_norm": 1.653660849157392, + "language_loss": 0.75841606, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.83508676, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09906006, + "step": 14121, + "time_per_iteration": 2.5294899940490723 + }, + { + "auxiliary_loss_clip": 0.06311592, + "auxiliary_loss_mlp": 0.01250316, + "balance_loss_clip": 0.06256946, + "balance_loss_mlp": 0.01249346, + "epoch": 0.8490605741770629, + "flos": 71187697054080.0, + "grad_norm": 0.8089399370239767, + "language_loss": 0.60124117, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.67686021, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0096817, + "step": 14122, + "time_per_iteration": 4.59176778793335 + }, + { + "auxiliary_loss_clip": 0.06405739, + "auxiliary_loss_mlp": 0.01266882, + "balance_loss_clip": 0.06273272, + "balance_loss_mlp": 0.01257697, + "epoch": 0.849120697429731, + "flos": 24980393675520.0, + "grad_norm": 1.735217190538918, + "language_loss": 0.79777497, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.87450123, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09191895, + "step": 14123, + "time_per_iteration": 2.5571157932281494 + }, + { + "auxiliary_loss_clip": 0.0639874, + "auxiliary_loss_mlp": 0.01265305, + "balance_loss_clip": 0.06271547, + "balance_loss_mlp": 0.01255899, + "epoch": 0.8491808206823989, + "flos": 23037762216960.0, + "grad_norm": 2.314794878951381, + "language_loss": 0.83767265, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.91431308, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09411621, + "step": 14124, + "time_per_iteration": 2.4972498416900635 + }, + { + "auxiliary_loss_clip": 0.06402995, + "auxiliary_loss_mlp": 0.01266211, + "balance_loss_clip": 0.0627236, + "balance_loss_mlp": 0.01256448, + "epoch": 0.8492409439350669, + "flos": 23885362592640.0, + "grad_norm": 1.83156410249787, + "language_loss": 0.71961606, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.7963081, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09753418, + "step": 14125, + "time_per_iteration": 2.489391326904297 + }, + { + "auxiliary_loss_clip": 0.06413139, + "auxiliary_loss_mlp": 0.01265668, + "balance_loss_clip": 0.06274882, + "balance_loss_mlp": 0.01254987, + "epoch": 0.8493010671877349, + "flos": 22426592935680.0, + "grad_norm": 1.586241425813396, + "language_loss": 0.73891562, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.81570363, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.10687256, + "step": 14126, + "time_per_iteration": 3.983708381652832 + }, + { + "auxiliary_loss_clip": 0.06403054, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06273109, + "balance_loss_mlp": 0.01254707, + "epoch": 0.8493611904404028, + "flos": 17535087936000.0, + "grad_norm": 1.8650592737197151, + "language_loss": 0.67556584, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.75224024, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09686279, + "step": 14127, + "time_per_iteration": 2.487192153930664 + }, + { + "auxiliary_loss_clip": 0.06408098, + "auxiliary_loss_mlp": 0.01264632, + "balance_loss_clip": 0.06272452, + "balance_loss_mlp": 0.01254624, + "epoch": 0.8494213136930708, + "flos": 19468872789120.0, + "grad_norm": 2.079377486336631, + "language_loss": 0.6908232, + "learning_rate": 2.330860086502211e-07, + "loss": 0.76755047, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10003662, + "step": 14128, + "time_per_iteration": 3.9321682453155518 + }, + { + "auxiliary_loss_clip": 0.0640503, + "auxiliary_loss_mlp": 0.01266181, + "balance_loss_clip": 0.06273429, + "balance_loss_mlp": 0.01256203, + "epoch": 0.8494814369457387, + "flos": 18776209812480.0, + "grad_norm": 1.8334204932365141, + "language_loss": 0.77824986, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.85496199, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09973145, + "step": 14129, + "time_per_iteration": 2.477147102355957 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01262796, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01253718, + "epoch": 0.8495415601984068, + "flos": 23338245358080.0, + "grad_norm": 1.6462886650116846, + "language_loss": 0.68294001, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.75959694, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09075928, + "step": 14130, + "time_per_iteration": 2.4999375343322754 + }, + { + "auxiliary_loss_clip": 0.0640253, + "auxiliary_loss_mlp": 0.01261921, + "balance_loss_clip": 0.06270012, + "balance_loss_mlp": 0.01252367, + "epoch": 0.8496016834510747, + "flos": 26619774808320.0, + "grad_norm": 1.5802166891621863, + "language_loss": 0.71646059, + "learning_rate": 2.3253890747186e-07, + "loss": 0.79310513, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09545898, + "step": 14131, + "time_per_iteration": 2.5575854778289795 + }, + { + "auxiliary_loss_clip": 0.06405224, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01255729, + "epoch": 0.8496618067037427, + "flos": 25486868880000.0, + "grad_norm": 1.7695763181681814, + "language_loss": 0.68790936, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.7646122, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09332275, + "step": 14132, + "time_per_iteration": 2.5535638332366943 + }, + { + "auxiliary_loss_clip": 0.06400724, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.012563, + "epoch": 0.8497219299564106, + "flos": 25381671678720.0, + "grad_norm": 1.5183602424718283, + "language_loss": 0.70325232, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.77990711, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08459473, + "step": 14133, + "time_per_iteration": 2.5285003185272217 + }, + { + "auxiliary_loss_clip": 0.06314642, + "auxiliary_loss_mlp": 0.0125198, + "balance_loss_clip": 0.06259762, + "balance_loss_mlp": 0.01250997, + "epoch": 0.8497820532090786, + "flos": 67802102432640.0, + "grad_norm": 0.719733671392506, + "language_loss": 0.57708496, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.65275121, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00982666, + "step": 14134, + "time_per_iteration": 3.2259228229522705 + }, + { + "auxiliary_loss_clip": 0.06407531, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06273041, + "balance_loss_mlp": 0.01254947, + "epoch": 0.8498421764617465, + "flos": 23447257920000.0, + "grad_norm": 1.8337709107177125, + "language_loss": 0.78980213, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.86652142, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09460449, + "step": 14135, + "time_per_iteration": 2.5097665786743164 + }, + { + "auxiliary_loss_clip": 0.06408061, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06273302, + "balance_loss_mlp": 0.01254521, + "epoch": 0.8499022997144146, + "flos": 17718424669440.0, + "grad_norm": 1.803510122803531, + "language_loss": 0.63663286, + "learning_rate": 2.316284127127044e-07, + "loss": 0.71336436, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10559082, + "step": 14136, + "time_per_iteration": 2.4748387336730957 + }, + { + "auxiliary_loss_clip": 0.06406897, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 0.06272756, + "balance_loss_mlp": 0.01255686, + "epoch": 0.8499624229670825, + "flos": 18594508233600.0, + "grad_norm": 1.9108052639568265, + "language_loss": 0.8452841, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.92201281, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10290527, + "step": 14137, + "time_per_iteration": 2.466539144515991 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01266335, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01257281, + "epoch": 0.8500225462197505, + "flos": 24351573110400.0, + "grad_norm": 2.739928375946937, + "language_loss": 0.78818476, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.86484385, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09051514, + "step": 14138, + "time_per_iteration": 2.5530903339385986 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06272259, + "balance_loss_mlp": 0.01255271, + "epoch": 0.8500826694724185, + "flos": 16551207694080.0, + "grad_norm": 1.5096380838746266, + "language_loss": 0.64687216, + "learning_rate": 2.310829204839073e-07, + "loss": 0.72354537, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09521484, + "step": 14139, + "time_per_iteration": 2.4765748977661133 + }, + { + "auxiliary_loss_clip": 0.06402735, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01253451, + "epoch": 0.8501427927250864, + "flos": 16294930381440.0, + "grad_norm": 1.421511629945392, + "language_loss": 0.70614517, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.78279966, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0925293, + "step": 14140, + "time_per_iteration": 2.498777151107788 + }, + { + "auxiliary_loss_clip": 0.06406597, + "auxiliary_loss_mlp": 0.01266518, + "balance_loss_clip": 0.06272027, + "balance_loss_mlp": 0.01256534, + "epoch": 0.8502029159777544, + "flos": 26695189082880.0, + "grad_norm": 1.9493724688595604, + "language_loss": 0.64299488, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.71972603, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09979248, + "step": 14141, + "time_per_iteration": 2.5951173305511475 + }, + { + "auxiliary_loss_clip": 0.06401542, + "auxiliary_loss_mlp": 0.01267222, + "balance_loss_clip": 0.0627162, + "balance_loss_mlp": 0.01257614, + "epoch": 0.8502630392304223, + "flos": 35599599895680.0, + "grad_norm": 1.6642597175452942, + "language_loss": 0.71313, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.78981769, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0960083, + "step": 14142, + "time_per_iteration": 2.644679307937622 + }, + { + "auxiliary_loss_clip": 0.06406039, + "auxiliary_loss_mlp": 0.01263489, + "balance_loss_clip": 0.0627221, + "balance_loss_mlp": 0.01254704, + "epoch": 0.8503231624830904, + "flos": 21655329229440.0, + "grad_norm": 1.5291787912539954, + "language_loss": 0.6560241, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.73271942, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.08782959, + "step": 14143, + "time_per_iteration": 2.550386905670166 + }, + { + "auxiliary_loss_clip": 0.06409223, + "auxiliary_loss_mlp": 0.01267388, + "balance_loss_clip": 0.06273058, + "balance_loss_mlp": 0.01257029, + "epoch": 0.8503832857357583, + "flos": 22423741896960.0, + "grad_norm": 1.9945347024432363, + "language_loss": 0.68129444, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.75806051, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10357666, + "step": 14144, + "time_per_iteration": 2.535437822341919 + }, + { + "auxiliary_loss_clip": 0.06397337, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01255981, + "epoch": 0.8504434089884263, + "flos": 18703981992960.0, + "grad_norm": 2.059497972093478, + "language_loss": 0.6487931, + "learning_rate": 2.299937473050777e-07, + "loss": 0.72541577, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08953857, + "step": 14145, + "time_per_iteration": 2.4910314083099365 + }, + { + "auxiliary_loss_clip": 0.06402655, + "auxiliary_loss_mlp": 0.01262868, + "balance_loss_clip": 0.06271585, + "balance_loss_mlp": 0.01253784, + "epoch": 0.8505035322410942, + "flos": 20013642109440.0, + "grad_norm": 1.595818409331469, + "language_loss": 0.85513884, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.93179405, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09075928, + "step": 14146, + "time_per_iteration": 2.5118772983551025 + }, + { + "auxiliary_loss_clip": 0.06399256, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.06268792, + "balance_loss_mlp": 0.01254543, + "epoch": 0.8505636554937622, + "flos": 20818210613760.0, + "grad_norm": 1.4979672504038752, + "language_loss": 0.84137052, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.91799468, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08612061, + "step": 14147, + "time_per_iteration": 2.491823673248291 + }, + { + "auxiliary_loss_clip": 0.06407596, + "auxiliary_loss_mlp": 0.01264593, + "balance_loss_clip": 0.06270961, + "balance_loss_mlp": 0.01254549, + "epoch": 0.8506237787464301, + "flos": 14179821292800.0, + "grad_norm": 2.3326412330221284, + "language_loss": 0.86542302, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.94214487, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10040283, + "step": 14148, + "time_per_iteration": 2.4511468410491943 + }, + { + "auxiliary_loss_clip": 0.06404074, + "auxiliary_loss_mlp": 0.01267682, + "balance_loss_clip": 0.06273896, + "balance_loss_mlp": 0.01257991, + "epoch": 0.8506839019990982, + "flos": 23265095143680.0, + "grad_norm": 2.691845002956324, + "language_loss": 0.72521651, + "learning_rate": 2.292689741370204e-07, + "loss": 0.801934, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09698486, + "step": 14149, + "time_per_iteration": 2.4899957180023193 + }, + { + "auxiliary_loss_clip": 0.06403546, + "auxiliary_loss_mlp": 0.01262142, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01252504, + "epoch": 0.8507440252517661, + "flos": 23665911949440.0, + "grad_norm": 1.5144720298422676, + "language_loss": 0.76150334, + "learning_rate": 2.290879486935804e-07, + "loss": 0.83816022, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09631348, + "step": 14150, + "time_per_iteration": 3.8928089141845703 + }, + { + "auxiliary_loss_clip": 0.06398553, + "auxiliary_loss_mlp": 0.01263858, + "balance_loss_clip": 0.06269762, + "balance_loss_mlp": 0.01255025, + "epoch": 0.8508041485044341, + "flos": 18667323031680.0, + "grad_norm": 1.6618873770107652, + "language_loss": 0.72802079, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.80464488, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08837891, + "step": 14151, + "time_per_iteration": 2.4714863300323486 + }, + { + "auxiliary_loss_clip": 0.0630898, + "auxiliary_loss_mlp": 0.01253738, + "balance_loss_clip": 0.06254144, + "balance_loss_mlp": 0.01252743, + "epoch": 0.8508642717571021, + "flos": 52527124275840.0, + "grad_norm": 0.8927928049322662, + "language_loss": 0.59571874, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.67134595, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0099411, + "step": 14152, + "time_per_iteration": 2.8655712604522705 + }, + { + "auxiliary_loss_clip": 0.06316353, + "auxiliary_loss_mlp": 0.01254234, + "balance_loss_clip": 0.06261283, + "balance_loss_mlp": 0.0125321, + "epoch": 0.85092439500977, + "flos": 69316622582400.0, + "grad_norm": 0.6838202798086767, + "language_loss": 0.60732996, + "learning_rate": 2.285452753096797e-07, + "loss": 0.68303585, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01024628, + "step": 14153, + "time_per_iteration": 3.1540443897247314 + }, + { + "auxiliary_loss_clip": 0.06401594, + "auxiliary_loss_mlp": 0.01264001, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01254584, + "epoch": 0.850984518262438, + "flos": 24396701333760.0, + "grad_norm": 1.5261009228174292, + "language_loss": 0.80733705, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.88399303, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09411621, + "step": 14154, + "time_per_iteration": 2.509315013885498 + }, + { + "auxiliary_loss_clip": 0.0639661, + "auxiliary_loss_mlp": 0.01264654, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01256023, + "epoch": 0.851044641515106, + "flos": 23301544469760.0, + "grad_norm": 1.6872874413166468, + "language_loss": 0.80040228, + "learning_rate": 2.281838289110165e-07, + "loss": 0.87701488, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08630371, + "step": 14155, + "time_per_iteration": 2.5027365684509277 + }, + { + "auxiliary_loss_clip": 0.06406571, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.0125556, + "epoch": 0.851104764767774, + "flos": 22055894472960.0, + "grad_norm": 1.8573710226657936, + "language_loss": 0.70853728, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.78525728, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09863281, + "step": 14156, + "time_per_iteration": 2.479628324508667 + }, + { + "auxiliary_loss_clip": 0.06399591, + "auxiliary_loss_mlp": 0.01266727, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.0125753, + "epoch": 0.8511648880204419, + "flos": 20711084768640.0, + "grad_norm": 1.7440507839185868, + "language_loss": 0.73986185, + "learning_rate": 2.278226512621386e-07, + "loss": 0.81652504, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09204102, + "step": 14157, + "time_per_iteration": 2.517547845840454 + }, + { + "auxiliary_loss_clip": 0.06396286, + "auxiliary_loss_mlp": 0.01264892, + "balance_loss_clip": 0.06269678, + "balance_loss_mlp": 0.01256214, + "epoch": 0.8512250112731099, + "flos": 24031537240320.0, + "grad_norm": 1.8245812327511397, + "language_loss": 0.79734576, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.87395757, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08673096, + "step": 14158, + "time_per_iteration": 2.4979214668273926 + }, + { + "auxiliary_loss_clip": 0.0640398, + "auxiliary_loss_mlp": 0.01268649, + "balance_loss_clip": 0.06272298, + "balance_loss_mlp": 0.01258855, + "epoch": 0.8512851345257778, + "flos": 22021583425920.0, + "grad_norm": 1.9713413245067732, + "language_loss": 0.79106247, + "learning_rate": 2.27461742417828e-07, + "loss": 0.86778879, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09790039, + "step": 14159, + "time_per_iteration": 2.528264284133911 + }, + { + "auxiliary_loss_clip": 0.06402959, + "auxiliary_loss_mlp": 0.01262793, + "balance_loss_clip": 0.06271561, + "balance_loss_mlp": 0.01252976, + "epoch": 0.8513452577784458, + "flos": 14835531818880.0, + "grad_norm": 1.6436898451229665, + "language_loss": 0.71580386, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.7924614, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09814453, + "step": 14160, + "time_per_iteration": 2.479752779006958 + }, + { + "auxiliary_loss_clip": 0.06410594, + "auxiliary_loss_mlp": 0.01266043, + "balance_loss_clip": 0.06273068, + "balance_loss_mlp": 0.012556, + "epoch": 0.8514053810311137, + "flos": 33043870512000.0, + "grad_norm": 2.103891046025698, + "language_loss": 0.71018016, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.78694654, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10443115, + "step": 14161, + "time_per_iteration": 2.6188247203826904 + }, + { + "auxiliary_loss_clip": 0.06404144, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.062693, + "balance_loss_mlp": 0.01254248, + "epoch": 0.8514655042837818, + "flos": 27572027333760.0, + "grad_norm": 2.0966778505863997, + "language_loss": 0.78282481, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.85950494, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09625244, + "step": 14162, + "time_per_iteration": 4.036656856536865 + }, + { + "auxiliary_loss_clip": 0.06401855, + "auxiliary_loss_mlp": 0.01263883, + "balance_loss_clip": 0.06271641, + "balance_loss_mlp": 0.01254036, + "epoch": 0.8515256275364497, + "flos": 35565163067520.0, + "grad_norm": 1.9877443818476273, + "language_loss": 0.77228487, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.84894228, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09844971, + "step": 14163, + "time_per_iteration": 2.63171648979187 + }, + { + "auxiliary_loss_clip": 0.06310776, + "auxiliary_loss_mlp": 0.01252981, + "balance_loss_clip": 0.06255888, + "balance_loss_mlp": 0.01251983, + "epoch": 0.8515857507891177, + "flos": 70226681777280.0, + "grad_norm": 0.6817221132059864, + "language_loss": 0.54955924, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.62519681, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00997925, + "step": 14164, + "time_per_iteration": 3.1551241874694824 + }, + { + "auxiliary_loss_clip": 0.06406744, + "auxiliary_loss_mlp": 0.01270382, + "balance_loss_clip": 0.06274273, + "balance_loss_mlp": 0.01260524, + "epoch": 0.8516458740417857, + "flos": 22682031707520.0, + "grad_norm": 3.0329072828581816, + "language_loss": 0.73003203, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.80680323, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09857178, + "step": 14165, + "time_per_iteration": 2.5035831928253174 + }, + { + "auxiliary_loss_clip": 0.06397499, + "auxiliary_loss_mlp": 0.01262475, + "balance_loss_clip": 0.06270273, + "balance_loss_mlp": 0.01253231, + "epoch": 0.8517059972944536, + "flos": 22754049891840.0, + "grad_norm": 1.484328472533111, + "language_loss": 0.67534792, + "learning_rate": 2.26200679088697e-07, + "loss": 0.7519477, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09240723, + "step": 14166, + "time_per_iteration": 3.980225086212158 + }, + { + "auxiliary_loss_clip": 0.06407012, + "auxiliary_loss_mlp": 0.01265516, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01256164, + "epoch": 0.8517661205471216, + "flos": 21695551989120.0, + "grad_norm": 1.6606333090542271, + "language_loss": 0.73706573, + "learning_rate": 2.260207961805125e-07, + "loss": 0.81379104, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09350586, + "step": 14167, + "time_per_iteration": 2.5159831047058105 + }, + { + "auxiliary_loss_clip": 0.06402537, + "auxiliary_loss_mlp": 0.0126222, + "balance_loss_clip": 0.06271734, + "balance_loss_mlp": 0.0125341, + "epoch": 0.8518262437997896, + "flos": 25381965168000.0, + "grad_norm": 1.6418130813226552, + "language_loss": 0.80574334, + "learning_rate": 2.258409805417969e-07, + "loss": 0.88239098, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08807373, + "step": 14168, + "time_per_iteration": 3.9127509593963623 + }, + { + "auxiliary_loss_clip": 0.06400729, + "auxiliary_loss_mlp": 0.01263799, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01254233, + "epoch": 0.8518863670524576, + "flos": 27242809441920.0, + "grad_norm": 1.6366824582665955, + "language_loss": 0.76805246, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.84469771, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09570312, + "step": 14169, + "time_per_iteration": 2.564000129699707 + }, + { + "auxiliary_loss_clip": 0.06407769, + "auxiliary_loss_mlp": 0.01265521, + "balance_loss_clip": 0.06273901, + "balance_loss_mlp": 0.0125574, + "epoch": 0.8519464903051255, + "flos": 20965810780800.0, + "grad_norm": 1.524606449707151, + "language_loss": 0.64094317, + "learning_rate": 2.254815511000452e-07, + "loss": 0.71767604, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09777832, + "step": 14170, + "time_per_iteration": 2.4731311798095703 + }, + { + "auxiliary_loss_clip": 0.06401997, + "auxiliary_loss_mlp": 0.01263402, + "balance_loss_clip": 0.06271668, + "balance_loss_mlp": 0.01254348, + "epoch": 0.8520066135577935, + "flos": 18447578899200.0, + "grad_norm": 3.4073612372840003, + "language_loss": 0.86964762, + "learning_rate": 2.253019373106384e-07, + "loss": 0.94630164, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09057617, + "step": 14171, + "time_per_iteration": 2.4719200134277344 + }, + { + "auxiliary_loss_clip": 0.0640336, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_clip": 0.0627137, + "balance_loss_mlp": 0.01254368, + "epoch": 0.8520667368104614, + "flos": 29137545492480.0, + "grad_norm": 1.7662348242463337, + "language_loss": 0.55010748, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.62678176, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0970459, + "step": 14172, + "time_per_iteration": 2.545728921890259 + }, + { + "auxiliary_loss_clip": 0.06399302, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01255232, + "epoch": 0.8521268600631294, + "flos": 16039910880000.0, + "grad_norm": 2.3131255138599287, + "language_loss": 0.69956374, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.77619392, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08477783, + "step": 14173, + "time_per_iteration": 2.505682945251465 + }, + { + "auxiliary_loss_clip": 0.06406021, + "auxiliary_loss_mlp": 0.01266898, + "balance_loss_clip": 0.06273559, + "balance_loss_mlp": 0.0125667, + "epoch": 0.8521869833157973, + "flos": 22461323253120.0, + "grad_norm": 2.130719000445001, + "language_loss": 0.77812624, + "learning_rate": 2.247634997500205e-07, + "loss": 0.85485542, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10241699, + "step": 14174, + "time_per_iteration": 2.487783908843994 + }, + { + "auxiliary_loss_clip": 0.06406736, + "auxiliary_loss_mlp": 0.01263896, + "balance_loss_clip": 0.06273077, + "balance_loss_mlp": 0.01254199, + "epoch": 0.8522471065684654, + "flos": 24978842375040.0, + "grad_norm": 1.55391099663027, + "language_loss": 0.81712008, + "learning_rate": 2.245841551883676e-07, + "loss": 0.89382648, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09698486, + "step": 14175, + "time_per_iteration": 2.524867534637451 + }, + { + "auxiliary_loss_clip": 0.06412444, + "auxiliary_loss_mlp": 0.01264709, + "balance_loss_clip": 0.06276155, + "balance_loss_mlp": 0.01254832, + "epoch": 0.8523072298211333, + "flos": 17716076755200.0, + "grad_norm": 2.276000629543861, + "language_loss": 0.65874249, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.73551399, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09881592, + "step": 14176, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.06401838, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 0.06274813, + "balance_loss_mlp": 0.01255282, + "epoch": 0.8523673530738013, + "flos": 25453060957440.0, + "grad_norm": 1.6627416158004444, + "language_loss": 0.78781587, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.86448085, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09381104, + "step": 14177, + "time_per_iteration": 2.5438950061798096 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.01265433, + "balance_loss_clip": 0.06269382, + "balance_loss_mlp": 0.01255556, + "epoch": 0.8524274763264693, + "flos": 31437416833920.0, + "grad_norm": 1.5992726547756348, + "language_loss": 0.73792171, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.81458819, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09881592, + "step": 14178, + "time_per_iteration": 2.553457736968994 + }, + { + "auxiliary_loss_clip": 0.06405145, + "auxiliary_loss_mlp": 0.0126473, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01254419, + "epoch": 0.8524875995791372, + "flos": 17718466596480.0, + "grad_norm": 1.6207840647423646, + "language_loss": 0.74986088, + "learning_rate": 2.238674502491935e-07, + "loss": 0.82655966, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10302734, + "step": 14179, + "time_per_iteration": 2.4778192043304443 + }, + { + "auxiliary_loss_clip": 0.064025, + "auxiliary_loss_mlp": 0.01264849, + "balance_loss_clip": 0.06273463, + "balance_loss_mlp": 0.01256039, + "epoch": 0.8525477228318052, + "flos": 21693413710080.0, + "grad_norm": 2.2580601470919177, + "language_loss": 0.81900585, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.89567935, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08813477, + "step": 14180, + "time_per_iteration": 2.5831997394561768 + }, + { + "auxiliary_loss_clip": 0.06404898, + "auxiliary_loss_mlp": 0.01264396, + "balance_loss_clip": 0.06273393, + "balance_loss_mlp": 0.0125462, + "epoch": 0.8526078460844732, + "flos": 24834009392640.0, + "grad_norm": 2.3038873670157045, + "language_loss": 0.61954057, + "learning_rate": 2.235095018591815e-07, + "loss": 0.69623345, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09777832, + "step": 14181, + "time_per_iteration": 2.615877628326416 + }, + { + "auxiliary_loss_clip": 0.06400971, + "auxiliary_loss_mlp": 0.01263288, + "balance_loss_clip": 0.06272621, + "balance_loss_mlp": 0.01254091, + "epoch": 0.8526679693371412, + "flos": 13521469363200.0, + "grad_norm": 2.0632362183656046, + "language_loss": 0.7309761, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.80761874, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09197998, + "step": 14182, + "time_per_iteration": 2.454415798187256 + }, + { + "auxiliary_loss_clip": 0.0640147, + "auxiliary_loss_mlp": 0.01264054, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01254709, + "epoch": 0.8527280925898091, + "flos": 23520911258880.0, + "grad_norm": 1.6302774737251082, + "language_loss": 0.71115839, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.78781366, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09338379, + "step": 14183, + "time_per_iteration": 2.5552773475646973 + }, + { + "auxiliary_loss_clip": 0.06401762, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 0.06273358, + "balance_loss_mlp": 0.01253906, + "epoch": 0.8527882158424771, + "flos": 20309261713920.0, + "grad_norm": 1.7421644295315515, + "language_loss": 0.7277168, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.80436695, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09344482, + "step": 14184, + "time_per_iteration": 2.4897632598876953 + }, + { + "auxiliary_loss_clip": 0.06405064, + "auxiliary_loss_mlp": 0.01261188, + "balance_loss_clip": 0.06273878, + "balance_loss_mlp": 0.01251747, + "epoch": 0.852848339095145, + "flos": 17208343739520.0, + "grad_norm": 1.7709255697532287, + "language_loss": 0.77010369, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.84676623, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09436035, + "step": 14185, + "time_per_iteration": 2.5324416160583496 + }, + { + "auxiliary_loss_clip": 0.06408064, + "auxiliary_loss_mlp": 0.01262591, + "balance_loss_clip": 0.06271752, + "balance_loss_mlp": 0.01253096, + "epoch": 0.852908462347813, + "flos": 18374847955200.0, + "grad_norm": 1.8096042183588577, + "language_loss": 0.7986542, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.87536073, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0949707, + "step": 14186, + "time_per_iteration": 2.4652650356292725 + }, + { + "auxiliary_loss_clip": 0.06405443, + "auxiliary_loss_mlp": 0.0126256, + "balance_loss_clip": 0.0627183, + "balance_loss_mlp": 0.01252958, + "epoch": 0.8529685856004809, + "flos": 18630873705600.0, + "grad_norm": 1.5490242087187152, + "language_loss": 0.62591934, + "learning_rate": 2.224372736588449e-07, + "loss": 0.70259941, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0960083, + "step": 14187, + "time_per_iteration": 2.5199503898620605 + }, + { + "auxiliary_loss_clip": 0.06409691, + "auxiliary_loss_mlp": 0.01263092, + "balance_loss_clip": 0.06272909, + "balance_loss_mlp": 0.012529, + "epoch": 0.853028708853149, + "flos": 29615579435520.0, + "grad_norm": 1.578408505037398, + "language_loss": 0.76792014, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.844648, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10192871, + "step": 14188, + "time_per_iteration": 2.559159517288208 + }, + { + "auxiliary_loss_clip": 0.06407709, + "auxiliary_loss_mlp": 0.01261931, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01251941, + "epoch": 0.8530888321058169, + "flos": 26359304791680.0, + "grad_norm": 1.6865481411500645, + "language_loss": 0.78473645, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.86143285, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09985352, + "step": 14189, + "time_per_iteration": 2.5550105571746826 + }, + { + "auxiliary_loss_clip": 0.06405266, + "auxiliary_loss_mlp": 0.01264032, + "balance_loss_clip": 0.06272979, + "balance_loss_mlp": 0.01253619, + "epoch": 0.8531489553584849, + "flos": 20528251159680.0, + "grad_norm": 1.946155460997632, + "language_loss": 0.79894865, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.87564158, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10412598, + "step": 14190, + "time_per_iteration": 3.9609453678131104 + }, + { + "auxiliary_loss_clip": 0.06404427, + "auxiliary_loss_mlp": 0.01268005, + "balance_loss_clip": 0.06273438, + "balance_loss_mlp": 0.01258283, + "epoch": 0.8532090786111529, + "flos": 20710581644160.0, + "grad_norm": 1.8315307088661303, + "language_loss": 0.76509988, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.84182423, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.097229, + "step": 14191, + "time_per_iteration": 2.497880458831787 + }, + { + "auxiliary_loss_clip": 0.06399481, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06269594, + "balance_loss_mlp": 0.01255042, + "epoch": 0.8532692018638208, + "flos": 19835085058560.0, + "grad_norm": 1.8333627441476026, + "language_loss": 0.69020867, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.76685452, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10058594, + "step": 14192, + "time_per_iteration": 2.4836080074310303 + }, + { + "auxiliary_loss_clip": 0.06410177, + "auxiliary_loss_mlp": 0.01267235, + "balance_loss_clip": 0.06271335, + "balance_loss_mlp": 0.01255565, + "epoch": 0.8533293251164888, + "flos": 21003224428800.0, + "grad_norm": 4.054614200028427, + "language_loss": 0.62898421, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.70575833, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.11669922, + "step": 14193, + "time_per_iteration": 2.4602465629577637 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01261393, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01252065, + "epoch": 0.8533894483691568, + "flos": 22426257519360.0, + "grad_norm": 1.772584246462062, + "language_loss": 0.76703686, + "learning_rate": 2.211894078044365e-07, + "loss": 0.8436687, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09326172, + "step": 14194, + "time_per_iteration": 2.486522674560547 + }, + { + "auxiliary_loss_clip": 0.06402128, + "auxiliary_loss_mlp": 0.01261977, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01253096, + "epoch": 0.8534495716218248, + "flos": 21622988753280.0, + "grad_norm": 1.8711254841944578, + "language_loss": 0.6979003, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.77454138, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08880615, + "step": 14195, + "time_per_iteration": 2.4857912063598633 + }, + { + "auxiliary_loss_clip": 0.06402412, + "auxiliary_loss_mlp": 0.01267409, + "balance_loss_clip": 0.06270134, + "balance_loss_mlp": 0.01257729, + "epoch": 0.8535096948744927, + "flos": 22352855742720.0, + "grad_norm": 1.8252311406941406, + "language_loss": 0.85771298, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.93441117, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09680176, + "step": 14196, + "time_per_iteration": 2.495814800262451 + }, + { + "auxiliary_loss_clip": 0.06310438, + "auxiliary_loss_mlp": 0.01249691, + "balance_loss_clip": 0.06255472, + "balance_loss_mlp": 0.01248657, + "epoch": 0.8535698181271607, + "flos": 52778118781440.0, + "grad_norm": 0.7492715698474276, + "language_loss": 0.55104071, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.62664199, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.54833984, + "router_z_loss_mlp": 0.01035309, + "step": 14197, + "time_per_iteration": 3.0517899990081787 + }, + { + "auxiliary_loss_clip": 0.06400962, + "auxiliary_loss_mlp": 0.01262147, + "balance_loss_clip": 0.06272976, + "balance_loss_mlp": 0.01252449, + "epoch": 0.8536299413798286, + "flos": 19068978378240.0, + "grad_norm": 1.6048685300978085, + "language_loss": 0.81422484, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.89085591, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09698486, + "step": 14198, + "time_per_iteration": 2.5209779739379883 + }, + { + "auxiliary_loss_clip": 0.06403227, + "auxiliary_loss_mlp": 0.01263611, + "balance_loss_clip": 0.06271878, + "balance_loss_mlp": 0.01254825, + "epoch": 0.8536900646324966, + "flos": 49355670291840.0, + "grad_norm": 1.3991146351834236, + "language_loss": 0.68443, + "learning_rate": 2.203000984963035e-07, + "loss": 0.76109838, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08789062, + "step": 14199, + "time_per_iteration": 2.732821464538574 + }, + { + "auxiliary_loss_clip": 0.06397039, + "auxiliary_loss_mlp": 0.01264117, + "balance_loss_clip": 0.06270607, + "balance_loss_mlp": 0.01255212, + "epoch": 0.8537501878851645, + "flos": 21768786057600.0, + "grad_norm": 1.5481845643108143, + "language_loss": 0.86597717, + "learning_rate": 2.201224390669072e-07, + "loss": 0.94258881, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08905029, + "step": 14200, + "time_per_iteration": 2.51717209815979 + }, + { + "auxiliary_loss_clip": 0.06402627, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 0.06271648, + "balance_loss_mlp": 0.01254101, + "epoch": 0.8538103111378326, + "flos": 22275051626880.0, + "grad_norm": 1.664748237948193, + "language_loss": 0.78232074, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.85897732, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08929443, + "step": 14201, + "time_per_iteration": 3.9599037170410156 + }, + { + "auxiliary_loss_clip": 0.06402917, + "auxiliary_loss_mlp": 0.01267065, + "balance_loss_clip": 0.06274314, + "balance_loss_mlp": 0.01258846, + "epoch": 0.8538704343905005, + "flos": 20310309889920.0, + "grad_norm": 1.8137924392854496, + "language_loss": 0.68695676, + "learning_rate": 2.19767322694256e-07, + "loss": 0.76365662, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08221436, + "step": 14202, + "time_per_iteration": 2.5016098022460938 + }, + { + "auxiliary_loss_clip": 0.0640841, + "auxiliary_loss_mlp": 0.01265781, + "balance_loss_clip": 0.06275605, + "balance_loss_mlp": 0.01256167, + "epoch": 0.8539305576431685, + "flos": 24762284697600.0, + "grad_norm": 1.435109126468579, + "language_loss": 0.80630964, + "learning_rate": 2.195898657644666e-07, + "loss": 0.88305151, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09613037, + "step": 14203, + "time_per_iteration": 2.5469577312469482 + }, + { + "auxiliary_loss_clip": 0.06407243, + "auxiliary_loss_mlp": 0.01270539, + "balance_loss_clip": 0.06273086, + "balance_loss_mlp": 0.01259566, + "epoch": 0.8539906808958365, + "flos": 26694853666560.0, + "grad_norm": 1.7668265789233564, + "language_loss": 0.6594435, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.73622131, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10974121, + "step": 14204, + "time_per_iteration": 2.5221924781799316 + }, + { + "auxiliary_loss_clip": 0.06407247, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.06272349, + "balance_loss_mlp": 0.01254425, + "epoch": 0.8540508041485044, + "flos": 13369718419200.0, + "grad_norm": 2.1751805975593728, + "language_loss": 0.60867941, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.68539816, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10198975, + "step": 14205, + "time_per_iteration": 2.4865877628326416 + }, + { + "auxiliary_loss_clip": 0.06401113, + "auxiliary_loss_mlp": 0.0126197, + "balance_loss_clip": 0.06271503, + "balance_loss_mlp": 0.01252868, + "epoch": 0.8541109274011724, + "flos": 32789144499840.0, + "grad_norm": 1.841264040666231, + "language_loss": 0.72367227, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.8003031, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09100342, + "step": 14206, + "time_per_iteration": 4.089895963668823 + }, + { + "auxiliary_loss_clip": 0.06406163, + "auxiliary_loss_mlp": 0.01269422, + "balance_loss_clip": 0.06272157, + "balance_loss_mlp": 0.01259789, + "epoch": 0.8541710506538404, + "flos": 17645022892800.0, + "grad_norm": 2.6328069765844226, + "language_loss": 0.76719952, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.8439554, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09631348, + "step": 14207, + "time_per_iteration": 3.9147050380706787 + }, + { + "auxiliary_loss_clip": 0.06406431, + "auxiliary_loss_mlp": 0.01265601, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01255534, + "epoch": 0.8542311739065084, + "flos": 20268703537920.0, + "grad_norm": 4.198730612623469, + "language_loss": 0.85354292, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.93026328, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10064697, + "step": 14208, + "time_per_iteration": 2.4986929893493652 + }, + { + "auxiliary_loss_clip": 0.06401771, + "auxiliary_loss_mlp": 0.01262828, + "balance_loss_clip": 0.06270763, + "balance_loss_mlp": 0.01253667, + "epoch": 0.8542912971591763, + "flos": 17791491029760.0, + "grad_norm": 1.5673944060040763, + "language_loss": 0.66329616, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.73994213, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0916748, + "step": 14209, + "time_per_iteration": 2.463555097579956 + }, + { + "auxiliary_loss_clip": 0.06400887, + "auxiliary_loss_mlp": 0.01264146, + "balance_loss_clip": 0.06271108, + "balance_loss_mlp": 0.01254747, + "epoch": 0.8543514204118443, + "flos": 26986783691520.0, + "grad_norm": 1.7929675763472626, + "language_loss": 0.70580226, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.78245258, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09399414, + "step": 14210, + "time_per_iteration": 2.550405979156494 + }, + { + "auxiliary_loss_clip": 0.06406937, + "auxiliary_loss_mlp": 0.0126768, + "balance_loss_clip": 0.06274385, + "balance_loss_mlp": 0.01258084, + "epoch": 0.8544115436645122, + "flos": 24031453386240.0, + "grad_norm": 1.3681653014571087, + "language_loss": 0.70620722, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.78295344, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09594727, + "step": 14211, + "time_per_iteration": 2.516709804534912 + }, + { + "auxiliary_loss_clip": 0.06405395, + "auxiliary_loss_mlp": 0.01265339, + "balance_loss_clip": 0.06272474, + "balance_loss_mlp": 0.01255469, + "epoch": 0.8544716669171802, + "flos": 16623603221760.0, + "grad_norm": 2.1078451145204156, + "language_loss": 0.81721437, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.89392173, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09863281, + "step": 14212, + "time_per_iteration": 2.479947328567505 + }, + { + "auxiliary_loss_clip": 0.06403465, + "auxiliary_loss_mlp": 0.01263033, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01253127, + "epoch": 0.8545317901698481, + "flos": 40015376939520.0, + "grad_norm": 2.0862751950857135, + "language_loss": 0.66500002, + "learning_rate": 2.178190108088105e-07, + "loss": 0.74166501, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09906006, + "step": 14213, + "time_per_iteration": 2.641176462173462 + }, + { + "auxiliary_loss_clip": 0.06403671, + "auxiliary_loss_mlp": 0.01263607, + "balance_loss_clip": 0.06272917, + "balance_loss_mlp": 0.01253862, + "epoch": 0.8545919134225162, + "flos": 19908822251520.0, + "grad_norm": 1.6101825554065545, + "language_loss": 0.78410029, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.86077309, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09747314, + "step": 14214, + "time_per_iteration": 2.5185306072235107 + }, + { + "auxiliary_loss_clip": 0.06409415, + "auxiliary_loss_mlp": 0.01265369, + "balance_loss_clip": 0.06273215, + "balance_loss_mlp": 0.01254825, + "epoch": 0.8546520366751841, + "flos": 18958959567360.0, + "grad_norm": 2.1746086147260097, + "language_loss": 0.67291975, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.74966758, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10546875, + "step": 14215, + "time_per_iteration": 2.559387445449829 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06272992, + "balance_loss_mlp": 0.01255773, + "epoch": 0.8547121599278521, + "flos": 35629298968320.0, + "grad_norm": 1.7345016463439749, + "language_loss": 0.62729144, + "learning_rate": 2.172890718362279e-07, + "loss": 0.703973, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09197998, + "step": 14216, + "time_per_iteration": 2.631380319595337 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01263952, + "balance_loss_clip": 0.06269723, + "balance_loss_mlp": 0.01254046, + "epoch": 0.8547722831805201, + "flos": 16915742881920.0, + "grad_norm": 1.5812149458388964, + "language_loss": 0.65813535, + "learning_rate": 2.17112560704259e-07, + "loss": 0.73481297, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09912109, + "step": 14217, + "time_per_iteration": 2.457961320877075 + }, + { + "auxiliary_loss_clip": 0.06400003, + "auxiliary_loss_mlp": 0.01265845, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01256827, + "epoch": 0.854832406433188, + "flos": 23009237101440.0, + "grad_norm": 1.6861315946256161, + "language_loss": 0.65233666, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.72899508, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09008789, + "step": 14218, + "time_per_iteration": 2.5305798053741455 + }, + { + "auxiliary_loss_clip": 0.06403703, + "auxiliary_loss_mlp": 0.01266926, + "balance_loss_clip": 0.06268973, + "balance_loss_mlp": 0.01257622, + "epoch": 0.854892529685856, + "flos": 20418861254400.0, + "grad_norm": 1.6553984291407586, + "language_loss": 0.70452309, + "learning_rate": 2.167597412688238e-07, + "loss": 0.78122938, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09301758, + "step": 14219, + "time_per_iteration": 2.5228383541107178 + }, + { + "auxiliary_loss_clip": 0.06408383, + "auxiliary_loss_mlp": 0.01266081, + "balance_loss_clip": 0.06272451, + "balance_loss_mlp": 0.01255507, + "epoch": 0.854952652938524, + "flos": 16404236432640.0, + "grad_norm": 2.1871061782173524, + "language_loss": 0.68056822, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.75731283, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10577393, + "step": 14220, + "time_per_iteration": 2.4769935607910156 + }, + { + "auxiliary_loss_clip": 0.06399038, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.06271215, + "balance_loss_mlp": 0.01252861, + "epoch": 0.855012776191192, + "flos": 21185051788800.0, + "grad_norm": 2.8581673001858015, + "language_loss": 0.72015893, + "learning_rate": 2.164071923159827e-07, + "loss": 0.79676819, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09039307, + "step": 14221, + "time_per_iteration": 2.483891725540161 + }, + { + "auxiliary_loss_clip": 0.06402694, + "auxiliary_loss_mlp": 0.01263341, + "balance_loss_clip": 0.06269461, + "balance_loss_mlp": 0.01253798, + "epoch": 0.8550728994438599, + "flos": 26148239556480.0, + "grad_norm": 2.145984380511565, + "language_loss": 0.60342848, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.68008888, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09539795, + "step": 14222, + "time_per_iteration": 2.549551486968994 + }, + { + "auxiliary_loss_clip": 0.06401211, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.0627152, + "balance_loss_mlp": 0.0125362, + "epoch": 0.8551330226965279, + "flos": 22793895308160.0, + "grad_norm": 1.4434546769022616, + "language_loss": 0.84376544, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.92040431, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09051514, + "step": 14223, + "time_per_iteration": 2.5119271278381348 + }, + { + "auxiliary_loss_clip": 0.0640087, + "auxiliary_loss_mlp": 0.01265091, + "balance_loss_clip": 0.06270584, + "balance_loss_mlp": 0.01255972, + "epoch": 0.8551931459491958, + "flos": 22425586686720.0, + "grad_norm": 1.8195239921480866, + "language_loss": 0.74431682, + "learning_rate": 2.158788761585515e-07, + "loss": 0.82097644, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09118652, + "step": 14224, + "time_per_iteration": 2.520721912384033 + }, + { + "auxiliary_loss_clip": 0.06403351, + "auxiliary_loss_mlp": 0.01264932, + "balance_loss_clip": 0.0627145, + "balance_loss_mlp": 0.01255818, + "epoch": 0.8552532692018638, + "flos": 19579268943360.0, + "grad_norm": 3.311933017994998, + "language_loss": 0.75833428, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.83501709, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09118652, + "step": 14225, + "time_per_iteration": 2.4928267002105713 + }, + { + "auxiliary_loss_clip": 0.06402107, + "auxiliary_loss_mlp": 0.01263352, + "balance_loss_clip": 0.0627172, + "balance_loss_mlp": 0.0125434, + "epoch": 0.8553133924545318, + "flos": 26440043800320.0, + "grad_norm": 1.618794757802268, + "language_loss": 0.7746619, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.85131651, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09008789, + "step": 14226, + "time_per_iteration": 2.5908937454223633 + }, + { + "auxiliary_loss_clip": 0.06408493, + "auxiliary_loss_mlp": 0.01262631, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01252642, + "epoch": 0.8553735157071998, + "flos": 16367996741760.0, + "grad_norm": 2.0827352676299817, + "language_loss": 0.54691792, + "learning_rate": 2.153511688875702e-07, + "loss": 0.62362921, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09991455, + "step": 14227, + "time_per_iteration": 2.4728844165802 + }, + { + "auxiliary_loss_clip": 0.0640135, + "auxiliary_loss_mlp": 0.01265196, + "balance_loss_clip": 0.06272006, + "balance_loss_mlp": 0.01255909, + "epoch": 0.8554336389598677, + "flos": 20893750669440.0, + "grad_norm": 1.8255877057500567, + "language_loss": 0.66183186, + "learning_rate": 2.151754018031442e-07, + "loss": 0.73849732, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09289551, + "step": 14228, + "time_per_iteration": 2.578582525253296 + }, + { + "auxiliary_loss_clip": 0.06404306, + "auxiliary_loss_mlp": 0.01261575, + "balance_loss_clip": 0.06270082, + "balance_loss_mlp": 0.01251704, + "epoch": 0.8554937622125357, + "flos": 21290542479360.0, + "grad_norm": 1.7630288706046695, + "language_loss": 0.73876858, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.8154273, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09875488, + "step": 14229, + "time_per_iteration": 4.011183023452759 + }, + { + "auxiliary_loss_clip": 0.06399019, + "auxiliary_loss_mlp": 0.01263221, + "balance_loss_clip": 0.06270005, + "balance_loss_mlp": 0.01254495, + "epoch": 0.8555538854652037, + "flos": 22418752579200.0, + "grad_norm": 1.642260219354586, + "language_loss": 0.7294243, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.80604661, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08728027, + "step": 14230, + "time_per_iteration": 2.486304521560669 + }, + { + "auxiliary_loss_clip": 0.06403501, + "auxiliary_loss_mlp": 0.0126514, + "balance_loss_clip": 0.06272286, + "balance_loss_mlp": 0.01255639, + "epoch": 0.8556140087178716, + "flos": 20199955662720.0, + "grad_norm": 2.082778168166704, + "language_loss": 0.82605416, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.9027406, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0949707, + "step": 14231, + "time_per_iteration": 2.5565478801727295 + }, + { + "auxiliary_loss_clip": 0.06408692, + "auxiliary_loss_mlp": 0.01265448, + "balance_loss_clip": 0.0627467, + "balance_loss_mlp": 0.01255083, + "epoch": 0.8556741319705397, + "flos": 22644743840640.0, + "grad_norm": 1.7449765739897811, + "language_loss": 0.6803897, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.7571311, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1036377, + "step": 14232, + "time_per_iteration": 2.599693536758423 + }, + { + "auxiliary_loss_clip": 0.06405558, + "auxiliary_loss_mlp": 0.01264791, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01254545, + "epoch": 0.8557342552232076, + "flos": 23555935065600.0, + "grad_norm": 1.433905025036311, + "language_loss": 0.67211032, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.74881387, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10253906, + "step": 14233, + "time_per_iteration": 2.5528273582458496 + }, + { + "auxiliary_loss_clip": 0.06397888, + "auxiliary_loss_mlp": 0.01264971, + "balance_loss_clip": 0.06268627, + "balance_loss_mlp": 0.01255953, + "epoch": 0.8557943784758756, + "flos": 19616011758720.0, + "grad_norm": 1.6206343328834838, + "language_loss": 0.77135193, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.84798056, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09014893, + "step": 14234, + "time_per_iteration": 2.466433525085449 + }, + { + "auxiliary_loss_clip": 0.06311054, + "auxiliary_loss_mlp": 0.01249873, + "balance_loss_clip": 0.06256243, + "balance_loss_mlp": 0.0124884, + "epoch": 0.8558545017285435, + "flos": 70660719527040.0, + "grad_norm": 0.7448880666757703, + "language_loss": 0.58154905, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.65715837, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0103302, + "step": 14235, + "time_per_iteration": 3.1063690185546875 + }, + { + "auxiliary_loss_clip": 0.06307988, + "auxiliary_loss_mlp": 0.0125195, + "balance_loss_clip": 0.06253141, + "balance_loss_mlp": 0.01250894, + "epoch": 0.8559146249812115, + "flos": 56669586900480.0, + "grad_norm": 0.7679206472060363, + "language_loss": 0.56618702, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.64178634, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01057434, + "step": 14236, + "time_per_iteration": 3.0186736583709717 + }, + { + "auxiliary_loss_clip": 0.0640348, + "auxiliary_loss_mlp": 0.01264038, + "balance_loss_clip": 0.06271995, + "balance_loss_mlp": 0.01254371, + "epoch": 0.8559747482338794, + "flos": 22894019337600.0, + "grad_norm": 1.5957292123473101, + "language_loss": 0.70495546, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.78163064, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09661865, + "step": 14237, + "time_per_iteration": 2.509390115737915 + }, + { + "auxiliary_loss_clip": 0.06402485, + "auxiliary_loss_mlp": 0.01262428, + "balance_loss_clip": 0.06271048, + "balance_loss_mlp": 0.0125341, + "epoch": 0.8560348714865474, + "flos": 22608588003840.0, + "grad_norm": 2.298866202248753, + "language_loss": 0.64055443, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.71720362, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09014893, + "step": 14238, + "time_per_iteration": 2.5472559928894043 + }, + { + "auxiliary_loss_clip": 0.06398335, + "auxiliary_loss_mlp": 0.01261025, + "balance_loss_clip": 0.06270797, + "balance_loss_mlp": 0.01252663, + "epoch": 0.8560949947392154, + "flos": 17937288334080.0, + "grad_norm": 1.3930808832059673, + "language_loss": 0.6932922, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.76988578, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08355713, + "step": 14239, + "time_per_iteration": 2.5263397693634033 + }, + { + "auxiliary_loss_clip": 0.06407407, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.06271498, + "balance_loss_mlp": 0.01256021, + "epoch": 0.8561551179918834, + "flos": 31033623208320.0, + "grad_norm": 1.8670368079308202, + "language_loss": 0.66960537, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.74633867, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09912109, + "step": 14240, + "time_per_iteration": 2.556577205657959 + }, + { + "auxiliary_loss_clip": 0.06403075, + "auxiliary_loss_mlp": 0.01264958, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01254581, + "epoch": 0.8562152412445513, + "flos": 30673196870400.0, + "grad_norm": 1.7026908336354338, + "language_loss": 0.6247797, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.70146, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.1038208, + "step": 14241, + "time_per_iteration": 3.953922986984253 + }, + { + "auxiliary_loss_clip": 0.06409171, + "auxiliary_loss_mlp": 0.01264684, + "balance_loss_clip": 0.06270305, + "balance_loss_mlp": 0.01253914, + "epoch": 0.8562753644972193, + "flos": 31584094606080.0, + "grad_norm": 1.7280214562641805, + "language_loss": 0.74751389, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.82425249, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10766602, + "step": 14242, + "time_per_iteration": 2.6225974559783936 + }, + { + "auxiliary_loss_clip": 0.06404752, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.06271575, + "balance_loss_mlp": 0.01255788, + "epoch": 0.8563354877498872, + "flos": 26220844719360.0, + "grad_norm": 2.0910743848690756, + "language_loss": 0.76865256, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.84536231, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10418701, + "step": 14243, + "time_per_iteration": 2.6213650703430176 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01263899, + "balance_loss_clip": 0.06271794, + "balance_loss_mlp": 0.0125472, + "epoch": 0.8563956110025552, + "flos": 24141262561920.0, + "grad_norm": 1.716514705669694, + "language_loss": 0.68232524, + "learning_rate": 2.123723375556974e-07, + "loss": 0.75900519, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09179688, + "step": 14244, + "time_per_iteration": 2.5382473468780518 + }, + { + "auxiliary_loss_clip": 0.06309429, + "auxiliary_loss_mlp": 0.01252704, + "balance_loss_clip": 0.06254511, + "balance_loss_mlp": 0.01251608, + "epoch": 0.8564557342552233, + "flos": 56289329072640.0, + "grad_norm": 0.7489817973332332, + "language_loss": 0.58483648, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.66045779, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.01098633, + "step": 14245, + "time_per_iteration": 4.431305170059204 + }, + { + "auxiliary_loss_clip": 0.06408551, + "auxiliary_loss_mlp": 0.01268725, + "balance_loss_clip": 0.06271117, + "balance_loss_mlp": 0.01258425, + "epoch": 0.8565158575078912, + "flos": 23447341774080.0, + "grad_norm": 1.5238034305670078, + "language_loss": 0.78042555, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.85719824, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10290527, + "step": 14246, + "time_per_iteration": 2.5076048374176025 + }, + { + "auxiliary_loss_clip": 0.06399557, + "auxiliary_loss_mlp": 0.01263061, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01253954, + "epoch": 0.8565759807605592, + "flos": 20382160366080.0, + "grad_norm": 1.895687760539362, + "language_loss": 0.81607592, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.8927021, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09100342, + "step": 14247, + "time_per_iteration": 3.8989782333374023 + }, + { + "auxiliary_loss_clip": 0.06403957, + "auxiliary_loss_mlp": 0.01266682, + "balance_loss_clip": 0.06271452, + "balance_loss_mlp": 0.01256246, + "epoch": 0.8566361040132271, + "flos": 18813078408960.0, + "grad_norm": 1.6009384046261905, + "language_loss": 0.77626634, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.85297275, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.10437012, + "step": 14248, + "time_per_iteration": 2.458406686782837 + }, + { + "auxiliary_loss_clip": 0.06403801, + "auxiliary_loss_mlp": 0.01262882, + "balance_loss_clip": 0.06269226, + "balance_loss_mlp": 0.01253131, + "epoch": 0.8566962272658951, + "flos": 24542289002880.0, + "grad_norm": 1.7603443054940122, + "language_loss": 0.78292143, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.85958827, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09747314, + "step": 14249, + "time_per_iteration": 2.5413098335266113 + }, + { + "auxiliary_loss_clip": 0.06403436, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06273547, + "balance_loss_mlp": 0.01254739, + "epoch": 0.856756350518563, + "flos": 23184062645760.0, + "grad_norm": 1.5958025284963269, + "language_loss": 0.78781301, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.86449027, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09552002, + "step": 14250, + "time_per_iteration": 2.5379374027252197 + }, + { + "auxiliary_loss_clip": 0.06401314, + "auxiliary_loss_mlp": 0.0126559, + "balance_loss_clip": 0.06273337, + "balance_loss_mlp": 0.01256906, + "epoch": 0.856816473771231, + "flos": 20814017909760.0, + "grad_norm": 1.6478543991539485, + "language_loss": 0.80071545, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.87738448, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08685303, + "step": 14251, + "time_per_iteration": 2.5280861854553223 + }, + { + "auxiliary_loss_clip": 0.06403105, + "auxiliary_loss_mlp": 0.01263534, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01254206, + "epoch": 0.856876597023899, + "flos": 20234057074560.0, + "grad_norm": 1.9560781121028739, + "language_loss": 0.61853564, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.69520199, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09320068, + "step": 14252, + "time_per_iteration": 2.5199599266052246 + }, + { + "auxiliary_loss_clip": 0.06406347, + "auxiliary_loss_mlp": 0.01266439, + "balance_loss_clip": 0.06272375, + "balance_loss_mlp": 0.01256324, + "epoch": 0.856936720276567, + "flos": 18301991230080.0, + "grad_norm": 1.7507738475608288, + "language_loss": 0.6978209, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.77454877, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10113525, + "step": 14253, + "time_per_iteration": 2.5490400791168213 + }, + { + "auxiliary_loss_clip": 0.06309576, + "auxiliary_loss_mlp": 0.01251585, + "balance_loss_clip": 0.06254718, + "balance_loss_mlp": 0.01250532, + "epoch": 0.8569968435292349, + "flos": 69897547739520.0, + "grad_norm": 0.7701050589451736, + "language_loss": 0.59286332, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.66847491, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01053619, + "step": 14254, + "time_per_iteration": 3.209148645401001 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.01265165, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01254228, + "epoch": 0.8570569667819029, + "flos": 25855680625920.0, + "grad_norm": 2.258350207103323, + "language_loss": 0.81105256, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.88768154, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.109375, + "step": 14255, + "time_per_iteration": 2.5137405395507812 + }, + { + "auxiliary_loss_clip": 0.06400292, + "auxiliary_loss_mlp": 0.01262539, + "balance_loss_clip": 0.06272858, + "balance_loss_mlp": 0.01253539, + "epoch": 0.8571170900345708, + "flos": 23263627697280.0, + "grad_norm": 1.9053302406900494, + "language_loss": 0.67952186, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.75615019, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09008789, + "step": 14256, + "time_per_iteration": 2.532684326171875 + }, + { + "auxiliary_loss_clip": 0.06404783, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06271411, + "balance_loss_mlp": 0.01256543, + "epoch": 0.8571772132872388, + "flos": 18923851906560.0, + "grad_norm": 1.4788145502824088, + "language_loss": 0.70254731, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.77925813, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09759521, + "step": 14257, + "time_per_iteration": 2.4685792922973633 + }, + { + "auxiliary_loss_clip": 0.06404016, + "auxiliary_loss_mlp": 0.01260827, + "balance_loss_clip": 0.06273229, + "balance_loss_mlp": 0.0125138, + "epoch": 0.8572373365399069, + "flos": 33257619077760.0, + "grad_norm": 5.167351592300506, + "language_loss": 0.77215445, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.84880286, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09442139, + "step": 14258, + "time_per_iteration": 2.5947256088256836 + }, + { + "auxiliary_loss_clip": 0.06404524, + "auxiliary_loss_mlp": 0.01264942, + "balance_loss_clip": 0.06275545, + "balance_loss_mlp": 0.01255441, + "epoch": 0.8572974597925748, + "flos": 23333633383680.0, + "grad_norm": 1.616211280257574, + "language_loss": 0.68083584, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.75753057, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0949707, + "step": 14259, + "time_per_iteration": 2.503953456878662 + }, + { + "auxiliary_loss_clip": 0.06403054, + "auxiliary_loss_mlp": 0.01267159, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01256961, + "epoch": 0.8573575830452428, + "flos": 24542875981440.0, + "grad_norm": 1.7496586618740582, + "language_loss": 0.77195299, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.8486551, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10192871, + "step": 14260, + "time_per_iteration": 2.5407462120056152 + }, + { + "auxiliary_loss_clip": 0.06405485, + "auxiliary_loss_mlp": 0.01263632, + "balance_loss_clip": 0.06271508, + "balance_loss_mlp": 0.01253619, + "epoch": 0.8574177062979107, + "flos": 24171422832000.0, + "grad_norm": 1.7543477262218912, + "language_loss": 0.74165386, + "learning_rate": 2.09413096654806e-07, + "loss": 0.81834501, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10015869, + "step": 14261, + "time_per_iteration": 2.4984147548675537 + }, + { + "auxiliary_loss_clip": 0.06407628, + "auxiliary_loss_mlp": 0.01265927, + "balance_loss_clip": 0.06272539, + "balance_loss_mlp": 0.01255139, + "epoch": 0.8574778295505787, + "flos": 17936449793280.0, + "grad_norm": 2.9359486176790686, + "language_loss": 0.79358846, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.87032402, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10784912, + "step": 14262, + "time_per_iteration": 2.4626708030700684 + }, + { + "auxiliary_loss_clip": 0.06400175, + "auxiliary_loss_mlp": 0.01267289, + "balance_loss_clip": 0.0627176, + "balance_loss_mlp": 0.01258098, + "epoch": 0.8575379528032466, + "flos": 21587252186880.0, + "grad_norm": 1.616838611011757, + "language_loss": 0.6784209, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.75509548, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09185791, + "step": 14263, + "time_per_iteration": 2.4902124404907227 + }, + { + "auxiliary_loss_clip": 0.06405489, + "auxiliary_loss_mlp": 0.01262847, + "balance_loss_clip": 0.06272297, + "balance_loss_mlp": 0.01253471, + "epoch": 0.8575980760559146, + "flos": 21767905589760.0, + "grad_norm": 1.9571137270825887, + "language_loss": 0.79872948, + "learning_rate": 2.088929137266986e-07, + "loss": 0.87541282, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09375, + "step": 14264, + "time_per_iteration": 2.5202577114105225 + }, + { + "auxiliary_loss_clip": 0.06404608, + "auxiliary_loss_mlp": 0.01269305, + "balance_loss_clip": 0.0627332, + "balance_loss_mlp": 0.01259978, + "epoch": 0.8576581993085826, + "flos": 34395011199360.0, + "grad_norm": 2.2143904362028644, + "language_loss": 0.69639301, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.77313221, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09326172, + "step": 14265, + "time_per_iteration": 2.612647771835327 + }, + { + "auxiliary_loss_clip": 0.06399523, + "auxiliary_loss_mlp": 0.012609, + "balance_loss_clip": 0.06272203, + "balance_loss_mlp": 0.01251942, + "epoch": 0.8577183225612506, + "flos": 23229316650240.0, + "grad_norm": 1.6733169528814695, + "language_loss": 0.65993267, + "learning_rate": 2.085464646918027e-07, + "loss": 0.73653686, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08966064, + "step": 14266, + "time_per_iteration": 2.5544586181640625 + }, + { + "auxiliary_loss_clip": 0.06401126, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06271696, + "balance_loss_mlp": 0.01255563, + "epoch": 0.8577784458139185, + "flos": 28811807544960.0, + "grad_norm": 1.5935040876679754, + "language_loss": 0.75452656, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.83118939, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09576416, + "step": 14267, + "time_per_iteration": 2.5590057373046875 + }, + { + "auxiliary_loss_clip": 0.06401159, + "auxiliary_loss_mlp": 0.01264336, + "balance_loss_clip": 0.06272185, + "balance_loss_mlp": 0.01255527, + "epoch": 0.8578385690665865, + "flos": 19761683281920.0, + "grad_norm": 1.584742251328993, + "language_loss": 0.87780321, + "learning_rate": 2.082002873852946e-07, + "loss": 0.95445812, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.0880127, + "step": 14268, + "time_per_iteration": 2.525526523590088 + }, + { + "auxiliary_loss_clip": 0.06411083, + "auxiliary_loss_mlp": 0.01266639, + "balance_loss_clip": 0.06275931, + "balance_loss_mlp": 0.012569, + "epoch": 0.8578986923192544, + "flos": 20710330081920.0, + "grad_norm": 2.0171508570409173, + "language_loss": 0.7276274, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.80440462, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09747314, + "step": 14269, + "time_per_iteration": 3.9116053581237793 + }, + { + "auxiliary_loss_clip": 0.06407435, + "auxiliary_loss_mlp": 0.01264024, + "balance_loss_clip": 0.06273964, + "balance_loss_mlp": 0.0125438, + "epoch": 0.8579588155719224, + "flos": 36110645147520.0, + "grad_norm": 1.865981060297471, + "language_loss": 0.66775644, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.74447107, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09637451, + "step": 14270, + "time_per_iteration": 2.618803024291992 + }, + { + "auxiliary_loss_clip": 0.06402225, + "auxiliary_loss_mlp": 0.01263727, + "balance_loss_clip": 0.06272581, + "balance_loss_mlp": 0.01254542, + "epoch": 0.8580189388245905, + "flos": 22859540582400.0, + "grad_norm": 2.2948861169859525, + "language_loss": 0.73892224, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.8155818, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09179688, + "step": 14271, + "time_per_iteration": 2.497725486755371 + }, + { + "auxiliary_loss_clip": 0.06309859, + "auxiliary_loss_mlp": 0.01251844, + "balance_loss_clip": 0.06254922, + "balance_loss_mlp": 0.01250786, + "epoch": 0.8580790620772584, + "flos": 69664414152960.0, + "grad_norm": 0.7639484057926735, + "language_loss": 0.58678043, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.66239738, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01059723, + "step": 14272, + "time_per_iteration": 3.169260263442993 + }, + { + "auxiliary_loss_clip": 0.06409359, + "auxiliary_loss_mlp": 0.01262454, + "balance_loss_clip": 0.06272221, + "balance_loss_mlp": 0.01252566, + "epoch": 0.8581391853299264, + "flos": 13339306586880.0, + "grad_norm": 1.7586191821345811, + "language_loss": 0.75792611, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.8346442, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09887695, + "step": 14273, + "time_per_iteration": 2.478921890258789 + }, + { + "auxiliary_loss_clip": 0.06403127, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06271982, + "balance_loss_mlp": 0.01256489, + "epoch": 0.8581993085825943, + "flos": 19651664471040.0, + "grad_norm": 1.8547741547168304, + "language_loss": 0.82333291, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.9000203, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09118652, + "step": 14274, + "time_per_iteration": 2.478856086730957 + }, + { + "auxiliary_loss_clip": 0.06313574, + "auxiliary_loss_mlp": 0.01252106, + "balance_loss_clip": 0.06258807, + "balance_loss_mlp": 0.01251031, + "epoch": 0.8582594318352623, + "flos": 55840826494080.0, + "grad_norm": 3.2665895099659745, + "language_loss": 0.60961515, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.68527198, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01076508, + "step": 14275, + "time_per_iteration": 3.197674036026001 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01264154, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01254314, + "epoch": 0.8583195550879302, + "flos": 24286389033600.0, + "grad_norm": 2.04706011240556, + "language_loss": 0.59755701, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.67425668, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09844971, + "step": 14276, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.06402551, + "auxiliary_loss_mlp": 0.01264566, + "balance_loss_clip": 0.06271014, + "balance_loss_mlp": 0.01254559, + "epoch": 0.8583796783405983, + "flos": 13449283470720.0, + "grad_norm": 1.6940890444447256, + "language_loss": 0.76255608, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.8392272, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10009766, + "step": 14277, + "time_per_iteration": 2.541459083557129 + }, + { + "auxiliary_loss_clip": 0.06400612, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.06269176, + "balance_loss_mlp": 0.01256525, + "epoch": 0.8584398015932662, + "flos": 16185833965440.0, + "grad_norm": 1.51585453174595, + "language_loss": 0.84088707, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.91756225, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10369873, + "step": 14278, + "time_per_iteration": 2.4656083583831787 + }, + { + "auxiliary_loss_clip": 0.06406611, + "auxiliary_loss_mlp": 0.01267273, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01256717, + "epoch": 0.8584999248459342, + "flos": 17455061687040.0, + "grad_norm": 1.7809196500006463, + "language_loss": 0.74783373, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.82457256, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10552979, + "step": 14279, + "time_per_iteration": 2.515935182571411 + }, + { + "auxiliary_loss_clip": 0.06400705, + "auxiliary_loss_mlp": 0.0126475, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01256018, + "epoch": 0.8585600480986021, + "flos": 23447802971520.0, + "grad_norm": 2.7435347339803826, + "language_loss": 0.66660666, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.74326128, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08734131, + "step": 14280, + "time_per_iteration": 3.916933059692383 + }, + { + "auxiliary_loss_clip": 0.06399409, + "auxiliary_loss_mlp": 0.01264296, + "balance_loss_clip": 0.06269073, + "balance_loss_mlp": 0.01255195, + "epoch": 0.8586201713512701, + "flos": 19944055693440.0, + "grad_norm": 2.5855570213577588, + "language_loss": 0.62396699, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.70060408, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09106445, + "step": 14281, + "time_per_iteration": 2.505758762359619 + }, + { + "auxiliary_loss_clip": 0.06404914, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06272723, + "balance_loss_mlp": 0.0125488, + "epoch": 0.858680294603938, + "flos": 15310211598720.0, + "grad_norm": 1.5539909401541185, + "language_loss": 0.73079032, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.80748564, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09747314, + "step": 14282, + "time_per_iteration": 2.493986129760742 + }, + { + "auxiliary_loss_clip": 0.06398949, + "auxiliary_loss_mlp": 0.01260814, + "balance_loss_clip": 0.06269239, + "balance_loss_mlp": 0.01252117, + "epoch": 0.858740417856606, + "flos": 22717894055040.0, + "grad_norm": 1.8222049767211217, + "language_loss": 0.75866199, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.83525962, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08691406, + "step": 14283, + "time_per_iteration": 2.5006518363952637 + }, + { + "auxiliary_loss_clip": 0.06400195, + "auxiliary_loss_mlp": 0.01261844, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01252963, + "epoch": 0.8588005411092741, + "flos": 34062187582080.0, + "grad_norm": 1.6694102205368824, + "language_loss": 0.60190046, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.67852092, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08874512, + "step": 14284, + "time_per_iteration": 2.5979769229888916 + }, + { + "auxiliary_loss_clip": 0.06398802, + "auxiliary_loss_mlp": 0.01262388, + "balance_loss_clip": 0.06269779, + "balance_loss_mlp": 0.01253531, + "epoch": 0.858860664361942, + "flos": 28921239377280.0, + "grad_norm": 1.896816667575115, + "language_loss": 0.7606923, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.83730417, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08856201, + "step": 14285, + "time_per_iteration": 3.9742085933685303 + }, + { + "auxiliary_loss_clip": 0.06406308, + "auxiliary_loss_mlp": 0.01264594, + "balance_loss_clip": 0.06271583, + "balance_loss_mlp": 0.01254682, + "epoch": 0.85892078761461, + "flos": 19798719586560.0, + "grad_norm": 1.6271590224898915, + "language_loss": 0.74210882, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.81881779, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09906006, + "step": 14286, + "time_per_iteration": 3.961693048477173 + }, + { + "auxiliary_loss_clip": 0.06312392, + "auxiliary_loss_mlp": 0.01251687, + "balance_loss_clip": 0.06257644, + "balance_loss_mlp": 0.01250683, + "epoch": 0.8589809108672779, + "flos": 67125512240640.0, + "grad_norm": 0.7443163222732918, + "language_loss": 0.49355024, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.56919104, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01003265, + "step": 14287, + "time_per_iteration": 3.081287145614624 + }, + { + "auxiliary_loss_clip": 0.06402116, + "auxiliary_loss_mlp": 0.01263241, + "balance_loss_clip": 0.06269466, + "balance_loss_mlp": 0.01253323, + "epoch": 0.8590410341199459, + "flos": 29724046945920.0, + "grad_norm": 1.7960694275427957, + "language_loss": 0.79450381, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.87115741, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09912109, + "step": 14288, + "time_per_iteration": 2.549579381942749 + }, + { + "auxiliary_loss_clip": 0.06406873, + "auxiliary_loss_mlp": 0.0126658, + "balance_loss_clip": 0.06272471, + "balance_loss_mlp": 0.01256197, + "epoch": 0.8591011573726138, + "flos": 23994165519360.0, + "grad_norm": 1.8099062195023483, + "language_loss": 0.81317496, + "learning_rate": 2.045818444528553e-07, + "loss": 0.88990951, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10375977, + "step": 14289, + "time_per_iteration": 2.532503366470337 + }, + { + "auxiliary_loss_clip": 0.06402125, + "auxiliary_loss_mlp": 0.01263769, + "balance_loss_clip": 0.06271179, + "balance_loss_mlp": 0.01254054, + "epoch": 0.8591612806252819, + "flos": 14433876472320.0, + "grad_norm": 1.6143264802543886, + "language_loss": 0.6542815, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.7309404, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09710693, + "step": 14290, + "time_per_iteration": 2.4673476219177246 + }, + { + "auxiliary_loss_clip": 0.06405544, + "auxiliary_loss_mlp": 0.01262804, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.012526, + "epoch": 0.8592214038779498, + "flos": 31585268563200.0, + "grad_norm": 2.147386540857062, + "language_loss": 0.5574224, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.63410592, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10205078, + "step": 14291, + "time_per_iteration": 2.634934425354004 + }, + { + "auxiliary_loss_clip": 0.06403403, + "auxiliary_loss_mlp": 0.01264218, + "balance_loss_clip": 0.06271186, + "balance_loss_mlp": 0.0125427, + "epoch": 0.8592815271306178, + "flos": 17463069751680.0, + "grad_norm": 2.0257150352321256, + "language_loss": 0.71959877, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.79627502, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09954834, + "step": 14292, + "time_per_iteration": 2.4553961753845215 + }, + { + "auxiliary_loss_clip": 0.06403185, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06270117, + "balance_loss_mlp": 0.01253929, + "epoch": 0.8593416503832857, + "flos": 25418498348160.0, + "grad_norm": 1.3381246650209893, + "language_loss": 0.71274585, + "learning_rate": 2.038960195018542e-07, + "loss": 0.78940934, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09228516, + "step": 14293, + "time_per_iteration": 2.56117844581604 + }, + { + "auxiliary_loss_clip": 0.06400074, + "auxiliary_loss_mlp": 0.01261361, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01252629, + "epoch": 0.8594017736359537, + "flos": 21003056720640.0, + "grad_norm": 3.825132104527405, + "language_loss": 0.68924177, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.76585615, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08721924, + "step": 14294, + "time_per_iteration": 2.4963736534118652 + }, + { + "auxiliary_loss_clip": 0.06396788, + "auxiliary_loss_mlp": 0.01263426, + "balance_loss_clip": 0.06268485, + "balance_loss_mlp": 0.01254218, + "epoch": 0.8594618968886216, + "flos": 22097626606080.0, + "grad_norm": 1.805212015136028, + "language_loss": 0.78444296, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.86104512, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09204102, + "step": 14295, + "time_per_iteration": 2.5134646892547607 + }, + { + "auxiliary_loss_clip": 0.06408249, + "auxiliary_loss_mlp": 0.01265112, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01253948, + "epoch": 0.8595220201412896, + "flos": 11661086286720.0, + "grad_norm": 2.7942491090682213, + "language_loss": 0.69070399, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.76743758, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11151123, + "step": 14296, + "time_per_iteration": 2.475787878036499 + }, + { + "auxiliary_loss_clip": 0.0640314, + "auxiliary_loss_mlp": 0.01263171, + "balance_loss_clip": 0.06271674, + "balance_loss_mlp": 0.01253497, + "epoch": 0.8595821433939577, + "flos": 25046416293120.0, + "grad_norm": 1.9233061484509495, + "language_loss": 0.79669362, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.87335676, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09674072, + "step": 14297, + "time_per_iteration": 2.5401291847229004 + }, + { + "auxiliary_loss_clip": 0.06396289, + "auxiliary_loss_mlp": 0.01268362, + "balance_loss_clip": 0.06267644, + "balance_loss_mlp": 0.01259951, + "epoch": 0.8596422666466256, + "flos": 28518997052160.0, + "grad_norm": 2.2682977179383372, + "language_loss": 0.68144363, + "learning_rate": 2.030402708016954e-07, + "loss": 0.75809014, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08410645, + "step": 14298, + "time_per_iteration": 2.5733871459960938 + }, + { + "auxiliary_loss_clip": 0.06398705, + "auxiliary_loss_mlp": 0.01260865, + "balance_loss_clip": 0.06270595, + "balance_loss_mlp": 0.01251913, + "epoch": 0.8597023898992936, + "flos": 13594158380160.0, + "grad_norm": 1.9854858480921735, + "language_loss": 0.68880069, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.76539636, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08947754, + "step": 14299, + "time_per_iteration": 2.488328456878662 + }, + { + "auxiliary_loss_clip": 0.06404358, + "auxiliary_loss_mlp": 0.01264205, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01254454, + "epoch": 0.8597625131519615, + "flos": 32308049882880.0, + "grad_norm": 2.1252767779815374, + "language_loss": 0.71345496, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.79014063, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09753418, + "step": 14300, + "time_per_iteration": 2.5601115226745605 + }, + { + "auxiliary_loss_clip": 0.06400272, + "auxiliary_loss_mlp": 0.01267131, + "balance_loss_clip": 0.06269163, + "balance_loss_mlp": 0.01258274, + "epoch": 0.8598226364046295, + "flos": 28737860716800.0, + "grad_norm": 1.7436356561716806, + "language_loss": 0.6957137, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.77238768, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08856201, + "step": 14301, + "time_per_iteration": 2.6039092540740967 + }, + { + "auxiliary_loss_clip": 0.06402557, + "auxiliary_loss_mlp": 0.01265888, + "balance_loss_clip": 0.06270393, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8598827596572974, + "flos": 21878301744000.0, + "grad_norm": 1.5832191765924557, + "language_loss": 0.74322796, + "learning_rate": 2.023568983386641e-07, + "loss": 0.81991243, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09228516, + "step": 14302, + "time_per_iteration": 2.4957993030548096 + }, + { + "auxiliary_loss_clip": 0.06400271, + "auxiliary_loss_mlp": 0.01260712, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01251855, + "epoch": 0.8599428829099655, + "flos": 23773792481280.0, + "grad_norm": 1.75128895706435, + "language_loss": 0.83832628, + "learning_rate": 2.02186225623733e-07, + "loss": 0.91493607, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08856201, + "step": 14303, + "time_per_iteration": 2.522888660430908 + }, + { + "auxiliary_loss_clip": 0.06405427, + "auxiliary_loss_mlp": 0.01264688, + "balance_loss_clip": 0.06271775, + "balance_loss_mlp": 0.01254543, + "epoch": 0.8600030061626334, + "flos": 16217671317120.0, + "grad_norm": 2.5248591398182327, + "language_loss": 0.7718581, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.84855914, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10137939, + "step": 14304, + "time_per_iteration": 2.4513118267059326 + }, + { + "auxiliary_loss_clip": 0.06403493, + "auxiliary_loss_mlp": 0.01262423, + "balance_loss_clip": 0.06271586, + "balance_loss_mlp": 0.0125201, + "epoch": 0.8600631294153014, + "flos": 15674830640640.0, + "grad_norm": 2.4458831318070815, + "language_loss": 0.54347569, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.62013483, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10424805, + "step": 14305, + "time_per_iteration": 2.4636104106903076 + }, + { + "auxiliary_loss_clip": 0.06401916, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 0.06271758, + "balance_loss_mlp": 0.01256194, + "epoch": 0.8601232526679693, + "flos": 17498764391040.0, + "grad_norm": 1.7675730532667615, + "language_loss": 0.83626974, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.9129535, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.1027832, + "step": 14306, + "time_per_iteration": 2.438267230987549 + }, + { + "auxiliary_loss_clip": 0.0639829, + "auxiliary_loss_mlp": 0.01261955, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01252764, + "epoch": 0.8601833759206373, + "flos": 26994288631680.0, + "grad_norm": 1.2962192910177055, + "language_loss": 0.71717322, + "learning_rate": 2.01504216561474e-07, + "loss": 0.79377568, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09191895, + "step": 14307, + "time_per_iteration": 2.592008590698242 + }, + { + "auxiliary_loss_clip": 0.06409558, + "auxiliary_loss_mlp": 0.0126879, + "balance_loss_clip": 0.06273729, + "balance_loss_mlp": 0.01258418, + "epoch": 0.8602434991733052, + "flos": 25237006404480.0, + "grad_norm": 1.5952354561078483, + "language_loss": 0.64001, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.71679354, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10369873, + "step": 14308, + "time_per_iteration": 3.9432108402252197 + }, + { + "auxiliary_loss_clip": 0.06312782, + "auxiliary_loss_mlp": 0.01249453, + "balance_loss_clip": 0.06257753, + "balance_loss_mlp": 0.01248612, + "epoch": 0.8603036224259732, + "flos": 71035694547840.0, + "grad_norm": 0.693627555027915, + "language_loss": 0.48403317, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.55965549, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.00842285, + "step": 14309, + "time_per_iteration": 3.236663579940796 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.012639, + "balance_loss_clip": 0.06271836, + "balance_loss_mlp": 0.01253011, + "epoch": 0.8603637456786413, + "flos": 20306452602240.0, + "grad_norm": 5.430428245021858, + "language_loss": 0.6706562, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.74734735, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10888672, + "step": 14310, + "time_per_iteration": 2.5191948413848877 + }, + { + "auxiliary_loss_clip": 0.06399269, + "auxiliary_loss_mlp": 0.01264383, + "balance_loss_clip": 0.06266133, + "balance_loss_mlp": 0.01255657, + "epoch": 0.8604238689313092, + "flos": 21842397469440.0, + "grad_norm": 1.7447011135153685, + "language_loss": 0.78432125, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.8609578, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0871582, + "step": 14311, + "time_per_iteration": 2.5042197704315186 + }, + { + "auxiliary_loss_clip": 0.06401919, + "auxiliary_loss_mlp": 0.01263334, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01253904, + "epoch": 0.8604839921839772, + "flos": 18010019278080.0, + "grad_norm": 1.955815230439552, + "language_loss": 0.71597105, + "learning_rate": 2.006532397626639e-07, + "loss": 0.79262364, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09429932, + "step": 14312, + "time_per_iteration": 2.5219128131866455 + }, + { + "auxiliary_loss_clip": 0.06400298, + "auxiliary_loss_mlp": 0.01265117, + "balance_loss_clip": 0.06270005, + "balance_loss_mlp": 0.01255586, + "epoch": 0.8605441154366451, + "flos": 16257558660480.0, + "grad_norm": 1.7707114111635922, + "language_loss": 0.78253788, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.85919207, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09527588, + "step": 14313, + "time_per_iteration": 2.493755340576172 + }, + { + "auxiliary_loss_clip": 0.06400809, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01255837, + "epoch": 0.8606042386893131, + "flos": 32274745084800.0, + "grad_norm": 1.4922872578644866, + "language_loss": 0.72934496, + "learning_rate": 2.003133266178474e-07, + "loss": 0.80601239, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.10101318, + "step": 14314, + "time_per_iteration": 2.621281862258911 + }, + { + "auxiliary_loss_clip": 0.06400359, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06269641, + "balance_loss_mlp": 0.01253687, + "epoch": 0.860664361941981, + "flos": 20235943791360.0, + "grad_norm": 1.7275534208829755, + "language_loss": 0.69404042, + "learning_rate": 2.001434724086657e-07, + "loss": 0.77067709, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09619141, + "step": 14315, + "time_per_iteration": 2.4812421798706055 + }, + { + "auxiliary_loss_clip": 0.06402497, + "auxiliary_loss_mlp": 0.01266885, + "balance_loss_clip": 0.06271563, + "balance_loss_mlp": 0.01257789, + "epoch": 0.8607244851946491, + "flos": 25198586507520.0, + "grad_norm": 1.8449394172666267, + "language_loss": 0.71876442, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.79545832, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09088135, + "step": 14316, + "time_per_iteration": 2.5461459159851074 + }, + { + "auxiliary_loss_clip": 0.06405434, + "auxiliary_loss_mlp": 0.01266236, + "balance_loss_clip": 0.06272785, + "balance_loss_mlp": 0.0125664, + "epoch": 0.860784608447317, + "flos": 20487776837760.0, + "grad_norm": 1.8785897277558985, + "language_loss": 0.8305161, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.90723282, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09588623, + "step": 14317, + "time_per_iteration": 2.4942269325256348 + }, + { + "auxiliary_loss_clip": 0.06398265, + "auxiliary_loss_mlp": 0.01264212, + "balance_loss_clip": 0.06269276, + "balance_loss_mlp": 0.01255129, + "epoch": 0.860844731699985, + "flos": 50487192627840.0, + "grad_norm": 1.57272546994991, + "language_loss": 0.67247951, + "learning_rate": 1.996343193113108e-07, + "loss": 0.74910432, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09082031, + "step": 14318, + "time_per_iteration": 2.753952980041504 + }, + { + "auxiliary_loss_clip": 0.06399272, + "auxiliary_loss_mlp": 0.01259503, + "balance_loss_clip": 0.0627184, + "balance_loss_mlp": 0.01250891, + "epoch": 0.8609048549526529, + "flos": 41182468133760.0, + "grad_norm": 1.5269464521671718, + "language_loss": 0.71332115, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.78990889, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08612061, + "step": 14319, + "time_per_iteration": 2.6694722175598145 + }, + { + "auxiliary_loss_clip": 0.06406449, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06272025, + "balance_loss_mlp": 0.01253565, + "epoch": 0.8609649782053209, + "flos": 23957967755520.0, + "grad_norm": 1.5943400470171931, + "language_loss": 0.67575115, + "learning_rate": 1.992952252525839e-07, + "loss": 0.75244617, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09484863, + "step": 14320, + "time_per_iteration": 3.9435226917266846 + }, + { + "auxiliary_loss_clip": 0.06404917, + "auxiliary_loss_mlp": 0.01263639, + "balance_loss_clip": 0.06270186, + "balance_loss_mlp": 0.01254036, + "epoch": 0.8610251014579888, + "flos": 23119297839360.0, + "grad_norm": 2.410508268349302, + "language_loss": 0.80603713, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.88272274, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09606934, + "step": 14321, + "time_per_iteration": 2.55540132522583 + }, + { + "auxiliary_loss_clip": 0.06397673, + "auxiliary_loss_mlp": 0.01266501, + "balance_loss_clip": 0.06271833, + "balance_loss_mlp": 0.01257114, + "epoch": 0.8610852247106568, + "flos": 19432800806400.0, + "grad_norm": 1.7626385906380733, + "language_loss": 0.71308374, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.78972548, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.09387207, + "step": 14322, + "time_per_iteration": 2.454256772994995 + }, + { + "auxiliary_loss_clip": 0.06408723, + "auxiliary_loss_mlp": 0.01266883, + "balance_loss_clip": 0.06271061, + "balance_loss_mlp": 0.01256369, + "epoch": 0.8611453479633249, + "flos": 19317163772160.0, + "grad_norm": 1.7944348088812987, + "language_loss": 0.56349087, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.64024693, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10516357, + "step": 14323, + "time_per_iteration": 2.502837896347046 + }, + { + "auxiliary_loss_clip": 0.06400344, + "auxiliary_loss_mlp": 0.01266337, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.0125736, + "epoch": 0.8612054712159928, + "flos": 23259602701440.0, + "grad_norm": 1.6798995165774648, + "language_loss": 0.7580722, + "learning_rate": 1.986178565813801e-07, + "loss": 0.83473897, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08972168, + "step": 14324, + "time_per_iteration": 3.954850912094116 + }, + { + "auxiliary_loss_clip": 0.06402896, + "auxiliary_loss_mlp": 0.01263656, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01253416, + "epoch": 0.8612655944686608, + "flos": 16032992918400.0, + "grad_norm": 2.3205040233098866, + "language_loss": 0.66479814, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.74146366, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10235596, + "step": 14325, + "time_per_iteration": 2.5524306297302246 + }, + { + "auxiliary_loss_clip": 0.06404022, + "auxiliary_loss_mlp": 0.01264163, + "balance_loss_clip": 0.0627131, + "balance_loss_mlp": 0.01254382, + "epoch": 0.8613257177213287, + "flos": 22499407733760.0, + "grad_norm": 2.0036509537419964, + "language_loss": 0.65199041, + "learning_rate": 1.982795820716472e-07, + "loss": 0.72867227, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09765625, + "step": 14326, + "time_per_iteration": 3.9491071701049805 + }, + { + "auxiliary_loss_clip": 0.06404285, + "auxiliary_loss_mlp": 0.01265148, + "balance_loss_clip": 0.06272285, + "balance_loss_mlp": 0.01255719, + "epoch": 0.8613858409739967, + "flos": 17243744889600.0, + "grad_norm": 2.07355797106235, + "language_loss": 0.84871465, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.92540902, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09436035, + "step": 14327, + "time_per_iteration": 2.4626643657684326 + }, + { + "auxiliary_loss_clip": 0.06403395, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01255793, + "epoch": 0.8614459642266646, + "flos": 22827870938880.0, + "grad_norm": 1.9454555931500424, + "language_loss": 0.75197828, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.82866651, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09631348, + "step": 14328, + "time_per_iteration": 2.5205399990081787 + }, + { + "auxiliary_loss_clip": 0.06400015, + "auxiliary_loss_mlp": 0.01264532, + "balance_loss_clip": 0.06271035, + "balance_loss_mlp": 0.01255574, + "epoch": 0.8615060874793327, + "flos": 26511349224960.0, + "grad_norm": 1.7079556862754726, + "language_loss": 0.80290902, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.87955445, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08959961, + "step": 14329, + "time_per_iteration": 2.5383529663085938 + }, + { + "auxiliary_loss_clip": 0.06400427, + "auxiliary_loss_mlp": 0.01268007, + "balance_loss_clip": 0.06269085, + "balance_loss_mlp": 0.01258679, + "epoch": 0.8615662107320006, + "flos": 24067860785280.0, + "grad_norm": 2.856996046278892, + "language_loss": 0.76966333, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.84634769, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09326172, + "step": 14330, + "time_per_iteration": 2.523453950881958 + }, + { + "auxiliary_loss_clip": 0.06401514, + "auxiliary_loss_mlp": 0.01263477, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.01254405, + "epoch": 0.8616263339846686, + "flos": 24171003561600.0, + "grad_norm": 1.7944132766223935, + "language_loss": 0.65172309, + "learning_rate": 1.974350915342702e-07, + "loss": 0.72837293, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09069824, + "step": 14331, + "time_per_iteration": 2.494178533554077 + }, + { + "auxiliary_loss_clip": 0.06397793, + "auxiliary_loss_mlp": 0.01264862, + "balance_loss_clip": 0.06269865, + "balance_loss_mlp": 0.01256314, + "epoch": 0.8616864572373365, + "flos": 21730533868800.0, + "grad_norm": 2.0612116619003933, + "language_loss": 0.76773548, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.84436202, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08544922, + "step": 14332, + "time_per_iteration": 2.5452775955200195 + }, + { + "auxiliary_loss_clip": 0.06406568, + "auxiliary_loss_mlp": 0.01265905, + "balance_loss_clip": 0.0627177, + "balance_loss_mlp": 0.012556, + "epoch": 0.8617465804900045, + "flos": 23773037794560.0, + "grad_norm": 2.589644465810543, + "language_loss": 0.66962624, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.74635088, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10321045, + "step": 14333, + "time_per_iteration": 2.507899761199951 + }, + { + "auxiliary_loss_clip": 0.06408904, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 0.06271466, + "balance_loss_mlp": 0.01255365, + "epoch": 0.8618067037426724, + "flos": 37712612632320.0, + "grad_norm": 2.0727942750697244, + "language_loss": 0.62088275, + "learning_rate": 1.969292174019157e-07, + "loss": 0.69763291, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10736084, + "step": 14334, + "time_per_iteration": 2.727379322052002 + }, + { + "auxiliary_loss_clip": 0.06409249, + "auxiliary_loss_mlp": 0.01266887, + "balance_loss_clip": 0.06273654, + "balance_loss_mlp": 0.01256861, + "epoch": 0.8618668269953405, + "flos": 21477526865280.0, + "grad_norm": 2.2092003237507627, + "language_loss": 0.69843465, + "learning_rate": 1.967607294278577e-07, + "loss": 0.77519608, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10028076, + "step": 14335, + "time_per_iteration": 2.5096664428710938 + }, + { + "auxiliary_loss_clip": 0.0640562, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 0.06273089, + "balance_loss_mlp": 0.01257927, + "epoch": 0.8619269502480085, + "flos": 22238560373760.0, + "grad_norm": 1.539528781413438, + "language_loss": 0.83133399, + "learning_rate": 1.965923098328135e-07, + "loss": 0.90806651, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09710693, + "step": 14336, + "time_per_iteration": 2.534871816635132 + }, + { + "auxiliary_loss_clip": 0.06407534, + "auxiliary_loss_mlp": 0.01266904, + "balance_loss_clip": 0.06270752, + "balance_loss_mlp": 0.01257725, + "epoch": 0.8619870735006764, + "flos": 22717181295360.0, + "grad_norm": 1.7880701547963709, + "language_loss": 0.67198873, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.74873316, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09179688, + "step": 14337, + "time_per_iteration": 2.4912726879119873 + }, + { + "auxiliary_loss_clip": 0.06400966, + "auxiliary_loss_mlp": 0.0126898, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.01259574, + "epoch": 0.8620471967533444, + "flos": 37528730847360.0, + "grad_norm": 1.850620303251151, + "language_loss": 0.67287397, + "learning_rate": 1.962556758053089e-07, + "loss": 0.74957347, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09411621, + "step": 14338, + "time_per_iteration": 2.630582571029663 + }, + { + "auxiliary_loss_clip": 0.06403364, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06270847, + "balance_loss_mlp": 0.01255693, + "epoch": 0.8621073200060123, + "flos": 19688533067520.0, + "grad_norm": 1.6865658257552463, + "language_loss": 0.62323976, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.69991934, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08905029, + "step": 14339, + "time_per_iteration": 2.4896788597106934 + }, + { + "auxiliary_loss_clip": 0.06398258, + "auxiliary_loss_mlp": 0.01261212, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01252009, + "epoch": 0.8621674432586803, + "flos": 14541882785280.0, + "grad_norm": 1.789087653765178, + "language_loss": 0.62707412, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.70366883, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09197998, + "step": 14340, + "time_per_iteration": 2.514129400253296 + }, + { + "auxiliary_loss_clip": 0.06398233, + "auxiliary_loss_mlp": 0.01270527, + "balance_loss_clip": 0.06275177, + "balance_loss_mlp": 0.01261962, + "epoch": 0.8622275665113482, + "flos": 20746276283520.0, + "grad_norm": 1.55822601807664, + "language_loss": 0.79994321, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.87663078, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.08569336, + "step": 14341, + "time_per_iteration": 2.4803621768951416 + }, + { + "auxiliary_loss_clip": 0.06398244, + "auxiliary_loss_mlp": 0.0126392, + "balance_loss_clip": 0.0626985, + "balance_loss_mlp": 0.01254693, + "epoch": 0.8622876897640163, + "flos": 24722565062400.0, + "grad_norm": 1.537556068716055, + "language_loss": 0.75025547, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.82687712, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09234619, + "step": 14342, + "time_per_iteration": 2.530914545059204 + }, + { + "auxiliary_loss_clip": 0.06404229, + "auxiliary_loss_mlp": 0.0126642, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01255841, + "epoch": 0.8623478130166842, + "flos": 17463153605760.0, + "grad_norm": 6.2322370911509815, + "language_loss": 0.68703187, + "learning_rate": 1.95415287816028e-07, + "loss": 0.76373827, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10577393, + "step": 14343, + "time_per_iteration": 2.5865073204040527 + }, + { + "auxiliary_loss_clip": 0.06402855, + "auxiliary_loss_mlp": 0.01268855, + "balance_loss_clip": 0.06271795, + "balance_loss_mlp": 0.0125914, + "epoch": 0.8624079362693522, + "flos": 18114252157440.0, + "grad_norm": 1.6558360016746088, + "language_loss": 0.68030214, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.75701928, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09716797, + "step": 14344, + "time_per_iteration": 2.537827730178833 + }, + { + "auxiliary_loss_clip": 0.06405529, + "auxiliary_loss_mlp": 0.01265965, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01256232, + "epoch": 0.8624680595220201, + "flos": 30674664316800.0, + "grad_norm": 1.3739709083227454, + "language_loss": 0.81833351, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.8950485, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09735107, + "step": 14345, + "time_per_iteration": 2.5870211124420166 + }, + { + "auxiliary_loss_clip": 0.06403511, + "auxiliary_loss_mlp": 0.0126453, + "balance_loss_clip": 0.06269494, + "balance_loss_mlp": 0.01254606, + "epoch": 0.8625281827746881, + "flos": 38007771039360.0, + "grad_norm": 1.9821482587026948, + "language_loss": 0.51161534, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.58829576, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09918213, + "step": 14346, + "time_per_iteration": 2.6315839290618896 + }, + { + "auxiliary_loss_clip": 0.06401588, + "auxiliary_loss_mlp": 0.01266829, + "balance_loss_clip": 0.06270475, + "balance_loss_mlp": 0.01257567, + "epoch": 0.862588306027356, + "flos": 26256874775040.0, + "grad_norm": 1.4328060153446618, + "language_loss": 0.75362718, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.8303113, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 14347, + "time_per_iteration": 2.541715383529663 + }, + { + "auxiliary_loss_clip": 0.06404621, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06272689, + "balance_loss_mlp": 0.0125667, + "epoch": 0.862648429280024, + "flos": 25884876574080.0, + "grad_norm": 1.79527283779547, + "language_loss": 0.80723387, + "learning_rate": 1.945766105774449e-07, + "loss": 0.88394725, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10046387, + "step": 14348, + "time_per_iteration": 3.9310317039489746 + }, + { + "auxiliary_loss_clip": 0.06397234, + "auxiliary_loss_mlp": 0.01262407, + "balance_loss_clip": 0.06269418, + "balance_loss_mlp": 0.01253526, + "epoch": 0.862708552532692, + "flos": 37825608263040.0, + "grad_norm": 1.5979162095683273, + "language_loss": 0.664671, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.74126744, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08886719, + "step": 14349, + "time_per_iteration": 2.667307138442993 + }, + { + "auxiliary_loss_clip": 0.06402015, + "auxiliary_loss_mlp": 0.01266689, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01257206, + "epoch": 0.86276867578536, + "flos": 19096623025920.0, + "grad_norm": 2.3630117925707474, + "language_loss": 0.70285714, + "learning_rate": 1.942416188703573e-07, + "loss": 0.77954423, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09484863, + "step": 14350, + "time_per_iteration": 2.4467716217041016 + }, + { + "auxiliary_loss_clip": 0.06401723, + "auxiliary_loss_mlp": 0.01264798, + "balance_loss_clip": 0.06270139, + "balance_loss_mlp": 0.0125482, + "epoch": 0.862828799038028, + "flos": 22170902601600.0, + "grad_norm": 1.8189959040488348, + "language_loss": 0.77373683, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.85040212, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09979248, + "step": 14351, + "time_per_iteration": 2.5256404876708984 + }, + { + "auxiliary_loss_clip": 0.06401232, + "auxiliary_loss_mlp": 0.01265649, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.0125622, + "epoch": 0.8628889222906959, + "flos": 23151722169600.0, + "grad_norm": 2.066153736176063, + "language_loss": 0.84886032, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.92552912, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09436035, + "step": 14352, + "time_per_iteration": 2.4913690090179443 + }, + { + "auxiliary_loss_clip": 0.06313725, + "auxiliary_loss_mlp": 0.01255388, + "balance_loss_clip": 0.06258518, + "balance_loss_mlp": 0.01254317, + "epoch": 0.8629490455433639, + "flos": 57837600489600.0, + "grad_norm": 0.7926925126054749, + "language_loss": 0.61875582, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.69444704, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.0107193, + "step": 14353, + "time_per_iteration": 3.177020788192749 + }, + { + "auxiliary_loss_clip": 0.06400892, + "auxiliary_loss_mlp": 0.01265779, + "balance_loss_clip": 0.06272262, + "balance_loss_mlp": 0.01257202, + "epoch": 0.8630091687960318, + "flos": 15924315772800.0, + "grad_norm": 1.6311074355718687, + "language_loss": 0.81987357, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.89654028, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 14354, + "time_per_iteration": 2.4930381774902344 + }, + { + "auxiliary_loss_clip": 0.06401116, + "auxiliary_loss_mlp": 0.0126246, + "balance_loss_clip": 0.06269745, + "balance_loss_mlp": 0.01252792, + "epoch": 0.8630692920486999, + "flos": 17966484282240.0, + "grad_norm": 1.7753060969616925, + "language_loss": 0.85697293, + "learning_rate": 1.934053380181031e-07, + "loss": 0.93360865, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09667969, + "step": 14355, + "time_per_iteration": 2.53157901763916 + }, + { + "auxiliary_loss_clip": 0.06404698, + "auxiliary_loss_mlp": 0.01264579, + "balance_loss_clip": 0.0627116, + "balance_loss_mlp": 0.01254684, + "epoch": 0.8631294153013678, + "flos": 22461658669440.0, + "grad_norm": 2.081321104089011, + "language_loss": 0.58636671, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.66305947, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09893799, + "step": 14356, + "time_per_iteration": 2.4832444190979004 + }, + { + "auxiliary_loss_clip": 0.06409314, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06271717, + "balance_loss_mlp": 0.01254502, + "epoch": 0.8631895385540358, + "flos": 16842676521600.0, + "grad_norm": 2.3858514635605945, + "language_loss": 0.7736609, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.85040665, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10754395, + "step": 14357, + "time_per_iteration": 2.5092248916625977 + }, + { + "auxiliary_loss_clip": 0.06403995, + "auxiliary_loss_mlp": 0.01266002, + "balance_loss_clip": 0.06271581, + "balance_loss_mlp": 0.01256227, + "epoch": 0.8632496618067037, + "flos": 18703101525120.0, + "grad_norm": 2.0189776853182906, + "language_loss": 0.7785663, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.85526627, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09765625, + "step": 14358, + "time_per_iteration": 2.4995810985565186 + }, + { + "auxiliary_loss_clip": 0.06403126, + "auxiliary_loss_mlp": 0.0126148, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01252575, + "epoch": 0.8633097850593717, + "flos": 24286808304000.0, + "grad_norm": 1.2976012595497113, + "language_loss": 0.75020969, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.82685572, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08905029, + "step": 14359, + "time_per_iteration": 3.9602229595184326 + }, + { + "auxiliary_loss_clip": 0.06394325, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.06267578, + "balance_loss_mlp": 0.01256204, + "epoch": 0.8633699083120396, + "flos": 21184926007680.0, + "grad_norm": 1.9803616638517643, + "language_loss": 0.70742667, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.78401971, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08776855, + "step": 14360, + "time_per_iteration": 2.5267932415008545 + }, + { + "auxiliary_loss_clip": 0.06407928, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06273921, + "balance_loss_mlp": 0.01254055, + "epoch": 0.8634300315647077, + "flos": 19250931519360.0, + "grad_norm": 1.9077703243953956, + "language_loss": 0.76441604, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.84114009, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10424805, + "step": 14361, + "time_per_iteration": 2.490943193435669 + }, + { + "auxiliary_loss_clip": 0.06311232, + "auxiliary_loss_mlp": 0.0125435, + "balance_loss_clip": 0.062562, + "balance_loss_mlp": 0.01253265, + "epoch": 0.8634901548173756, + "flos": 66214572577920.0, + "grad_norm": 0.8650846774823281, + "language_loss": 0.586797, + "learning_rate": 1.922374222645329e-07, + "loss": 0.66245276, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.01087189, + "step": 14362, + "time_per_iteration": 3.1930222511291504 + }, + { + "auxiliary_loss_clip": 0.06408567, + "auxiliary_loss_mlp": 0.01271559, + "balance_loss_clip": 0.06273866, + "balance_loss_mlp": 0.01261497, + "epoch": 0.8635502780700436, + "flos": 24796302255360.0, + "grad_norm": 1.6117142175408212, + "language_loss": 0.80565244, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.88245368, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10064697, + "step": 14363, + "time_per_iteration": 2.5488052368164062 + }, + { + "auxiliary_loss_clip": 0.06405362, + "auxiliary_loss_mlp": 0.01264899, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01254325, + "epoch": 0.8636104013227116, + "flos": 25196657863680.0, + "grad_norm": 3.2911093464095376, + "language_loss": 0.7295658, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.80626839, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10577393, + "step": 14364, + "time_per_iteration": 3.9698781967163086 + }, + { + "auxiliary_loss_clip": 0.0640732, + "auxiliary_loss_mlp": 0.01263265, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01253978, + "epoch": 0.8636705245753795, + "flos": 23885236811520.0, + "grad_norm": 1.455571022027207, + "language_loss": 0.7167381, + "learning_rate": 1.917379150731755e-07, + "loss": 0.79344392, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09295654, + "step": 14365, + "time_per_iteration": 3.9607086181640625 + }, + { + "auxiliary_loss_clip": 0.06408954, + "auxiliary_loss_mlp": 0.01268552, + "balance_loss_clip": 0.06272392, + "balance_loss_mlp": 0.01257472, + "epoch": 0.8637306478280475, + "flos": 23116824144000.0, + "grad_norm": 1.9610886432207495, + "language_loss": 0.71209329, + "learning_rate": 1.915715498065993e-07, + "loss": 0.78886831, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11090088, + "step": 14366, + "time_per_iteration": 2.502300977706909 + }, + { + "auxiliary_loss_clip": 0.0639744, + "auxiliary_loss_mlp": 0.01266839, + "balance_loss_clip": 0.06268862, + "balance_loss_mlp": 0.01258071, + "epoch": 0.8637907710807154, + "flos": 21913032061440.0, + "grad_norm": 3.9077232982556196, + "language_loss": 0.81972671, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.89636946, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08764648, + "step": 14367, + "time_per_iteration": 2.5068411827087402 + }, + { + "auxiliary_loss_clip": 0.0640227, + "auxiliary_loss_mlp": 0.0126336, + "balance_loss_clip": 0.06269377, + "balance_loss_mlp": 0.01253269, + "epoch": 0.8638508943333835, + "flos": 23586263043840.0, + "grad_norm": 2.019581069105167, + "language_loss": 0.6210227, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.69767898, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10095215, + "step": 14368, + "time_per_iteration": 2.502528429031372 + }, + { + "auxiliary_loss_clip": 0.06402116, + "auxiliary_loss_mlp": 0.01265082, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8639110175860514, + "flos": 25782991608960.0, + "grad_norm": 1.8655713260907347, + "language_loss": 0.76021969, + "learning_rate": 1.91072865486821e-07, + "loss": 0.83689165, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09661865, + "step": 14369, + "time_per_iteration": 2.5583889484405518 + }, + { + "auxiliary_loss_clip": 0.06405649, + "auxiliary_loss_mlp": 0.01268162, + "balance_loss_clip": 0.06269823, + "balance_loss_mlp": 0.01257409, + "epoch": 0.8639711408387194, + "flos": 23376455619840.0, + "grad_norm": 1.8041581348752054, + "language_loss": 0.64473277, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.72147083, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10754395, + "step": 14370, + "time_per_iteration": 2.523294687271118 + }, + { + "auxiliary_loss_clip": 0.06406188, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 0.06274764, + "balance_loss_mlp": 0.01256085, + "epoch": 0.8640312640913873, + "flos": 22133740515840.0, + "grad_norm": 1.5680829975837718, + "language_loss": 0.66822445, + "learning_rate": 1.907407522366209e-07, + "loss": 0.744941, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09381104, + "step": 14371, + "time_per_iteration": 2.529430389404297 + }, + { + "auxiliary_loss_clip": 0.06313685, + "auxiliary_loss_mlp": 0.01251782, + "balance_loss_clip": 0.06259193, + "balance_loss_mlp": 0.012508, + "epoch": 0.8640913873440553, + "flos": 57586998055680.0, + "grad_norm": 0.8486423176680128, + "language_loss": 0.57041156, + "learning_rate": 1.905747985193107e-07, + "loss": 0.64606631, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0098114, + "step": 14372, + "time_per_iteration": 3.0363752841949463 + }, + { + "auxiliary_loss_clip": 0.06399583, + "auxiliary_loss_mlp": 0.01263288, + "balance_loss_clip": 0.06271808, + "balance_loss_mlp": 0.01253811, + "epoch": 0.8641515105967232, + "flos": 23994165519360.0, + "grad_norm": 1.5906200485227884, + "language_loss": 0.79251468, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.86914343, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0947876, + "step": 14373, + "time_per_iteration": 2.5637240409851074 + }, + { + "auxiliary_loss_clip": 0.06403147, + "auxiliary_loss_mlp": 0.01263805, + "balance_loss_clip": 0.06269763, + "balance_loss_mlp": 0.01254328, + "epoch": 0.8642116338493913, + "flos": 19068810670080.0, + "grad_norm": 1.7439997489953305, + "language_loss": 0.63977039, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.7164399, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.0947876, + "step": 14374, + "time_per_iteration": 2.5307912826538086 + }, + { + "auxiliary_loss_clip": 0.06398176, + "auxiliary_loss_mlp": 0.01263527, + "balance_loss_clip": 0.06268865, + "balance_loss_mlp": 0.01254652, + "epoch": 0.8642717571020592, + "flos": 18259085139840.0, + "grad_norm": 2.2572077948028433, + "language_loss": 0.77652001, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.85313702, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08880615, + "step": 14375, + "time_per_iteration": 2.483269453048706 + }, + { + "auxiliary_loss_clip": 0.06401785, + "auxiliary_loss_mlp": 0.01263893, + "balance_loss_clip": 0.06269706, + "balance_loss_mlp": 0.01253892, + "epoch": 0.8643318803547272, + "flos": 57675550222080.0, + "grad_norm": 1.541843891786326, + "language_loss": 0.61128557, + "learning_rate": 1.899116698488117e-07, + "loss": 0.68794239, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10003662, + "step": 14376, + "time_per_iteration": 2.8843209743499756 + }, + { + "auxiliary_loss_clip": 0.06403586, + "auxiliary_loss_mlp": 0.01264991, + "balance_loss_clip": 0.06272595, + "balance_loss_mlp": 0.01254876, + "epoch": 0.8643920036073952, + "flos": 19615592488320.0, + "grad_norm": 1.5018425014580143, + "language_loss": 0.66786122, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.74454701, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10107422, + "step": 14377, + "time_per_iteration": 2.5229828357696533 + }, + { + "auxiliary_loss_clip": 0.06406192, + "auxiliary_loss_mlp": 0.01265361, + "balance_loss_clip": 0.06271313, + "balance_loss_mlp": 0.01255604, + "epoch": 0.8644521268600631, + "flos": 20856672437760.0, + "grad_norm": 1.4771903457051754, + "language_loss": 0.70475584, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.78147137, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09747314, + "step": 14378, + "time_per_iteration": 2.509063720703125 + }, + { + "auxiliary_loss_clip": 0.06313916, + "auxiliary_loss_mlp": 0.01254059, + "balance_loss_clip": 0.06259046, + "balance_loss_mlp": 0.01252975, + "epoch": 0.8645122501127311, + "flos": 66740753491200.0, + "grad_norm": 0.7838907158972782, + "language_loss": 0.60319781, + "learning_rate": 1.894150440305995e-07, + "loss": 0.67887759, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01085663, + "step": 14379, + "time_per_iteration": 3.1320457458496094 + }, + { + "auxiliary_loss_clip": 0.06399889, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.0627097, + "balance_loss_mlp": 0.01254605, + "epoch": 0.864572373365399, + "flos": 21696558238080.0, + "grad_norm": 1.5435489146258106, + "language_loss": 0.74544406, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.82208085, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09179688, + "step": 14380, + "time_per_iteration": 2.533979654312134 + }, + { + "auxiliary_loss_clip": 0.0641063, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 0.06273422, + "balance_loss_mlp": 0.01256018, + "epoch": 0.8646324966180671, + "flos": 20272602752640.0, + "grad_norm": 1.8170227609010927, + "language_loss": 0.75806165, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.83483034, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10235596, + "step": 14381, + "time_per_iteration": 2.497065305709839 + }, + { + "auxiliary_loss_clip": 0.06403077, + "auxiliary_loss_mlp": 0.01262559, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01253457, + "epoch": 0.864692619870735, + "flos": 11950752251520.0, + "grad_norm": 2.2051437875425757, + "language_loss": 0.84932131, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.92597765, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09106445, + "step": 14382, + "time_per_iteration": 2.535344362258911 + }, + { + "auxiliary_loss_clip": 0.06403528, + "auxiliary_loss_mlp": 0.0126157, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01252058, + "epoch": 0.864752743123403, + "flos": 21477149521920.0, + "grad_norm": 1.6567318612766335, + "language_loss": 0.75987065, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.83652163, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09509277, + "step": 14383, + "time_per_iteration": 2.5716378688812256 + }, + { + "auxiliary_loss_clip": 0.06400105, + "auxiliary_loss_mlp": 0.01265637, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256738, + "epoch": 0.8648128663760709, + "flos": 19534979260800.0, + "grad_norm": 1.6589847314556463, + "language_loss": 0.84984505, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.92650247, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08892822, + "step": 14384, + "time_per_iteration": 2.5308241844177246 + }, + { + "auxiliary_loss_clip": 0.06400002, + "auxiliary_loss_mlp": 0.01265, + "balance_loss_clip": 0.06269626, + "balance_loss_mlp": 0.01255344, + "epoch": 0.8648729896287389, + "flos": 21294315912960.0, + "grad_norm": 1.7401611102824495, + "language_loss": 0.81164479, + "learning_rate": 1.884236463176072e-07, + "loss": 0.88829482, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09655762, + "step": 14385, + "time_per_iteration": 2.4921391010284424 + }, + { + "auxiliary_loss_clip": 0.06406556, + "auxiliary_loss_mlp": 0.01267811, + "balance_loss_clip": 0.06271443, + "balance_loss_mlp": 0.01257428, + "epoch": 0.8649331128814068, + "flos": 24610785315840.0, + "grad_norm": 2.091649700881737, + "language_loss": 0.72774786, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.80449152, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1038208, + "step": 14386, + "time_per_iteration": 2.545750379562378 + }, + { + "auxiliary_loss_clip": 0.06401771, + "auxiliary_loss_mlp": 0.01262704, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01253138, + "epoch": 0.8649932361340749, + "flos": 15383277959040.0, + "grad_norm": 2.209665569654056, + "language_loss": 0.82382894, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.90047371, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09570312, + "step": 14387, + "time_per_iteration": 3.9013686180114746 + }, + { + "auxiliary_loss_clip": 0.0640083, + "auxiliary_loss_mlp": 0.01263962, + "balance_loss_clip": 0.06272831, + "balance_loss_mlp": 0.01255004, + "epoch": 0.8650533593867428, + "flos": 19907312878080.0, + "grad_norm": 2.010329116526224, + "language_loss": 0.68742537, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.76407325, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08966064, + "step": 14388, + "time_per_iteration": 2.504244565963745 + }, + { + "auxiliary_loss_clip": 0.06398115, + "auxiliary_loss_mlp": 0.01264198, + "balance_loss_clip": 0.06271598, + "balance_loss_mlp": 0.01255323, + "epoch": 0.8651134826394108, + "flos": 25633546652160.0, + "grad_norm": 1.6117795719153174, + "language_loss": 0.90809613, + "learning_rate": 1.877640883285283e-07, + "loss": 0.98471928, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08874512, + "step": 14389, + "time_per_iteration": 2.5962395668029785 + }, + { + "auxiliary_loss_clip": 0.0639938, + "auxiliary_loss_mlp": 0.01263329, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01253947, + "epoch": 0.8651736058920788, + "flos": 18740557100160.0, + "grad_norm": 1.8613703066049654, + "language_loss": 0.71011788, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.78674495, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09375, + "step": 14390, + "time_per_iteration": 2.495643138885498 + }, + { + "auxiliary_loss_clip": 0.06405844, + "auxiliary_loss_mlp": 0.01268761, + "balance_loss_clip": 0.06271544, + "balance_loss_mlp": 0.01259392, + "epoch": 0.8652337291447467, + "flos": 20782977171840.0, + "grad_norm": 1.5876271483812678, + "language_loss": 0.8251009, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.901847, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09387207, + "step": 14391, + "time_per_iteration": 2.5008716583251953 + }, + { + "auxiliary_loss_clip": 0.06307146, + "auxiliary_loss_mlp": 0.01252466, + "balance_loss_clip": 0.06252509, + "balance_loss_mlp": 0.01251483, + "epoch": 0.8652938523974147, + "flos": 64246895948160.0, + "grad_norm": 0.8370368549242478, + "language_loss": 0.67857373, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.75416982, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00981903, + "step": 14392, + "time_per_iteration": 3.0305261611938477 + }, + { + "auxiliary_loss_clip": 0.06409582, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06272376, + "balance_loss_mlp": 0.01253572, + "epoch": 0.8653539756500827, + "flos": 18046384750080.0, + "grad_norm": 1.905098269493672, + "language_loss": 0.75714177, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.83387649, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10314941, + "step": 14393, + "time_per_iteration": 2.505152940750122 + }, + { + "auxiliary_loss_clip": 0.06406023, + "auxiliary_loss_mlp": 0.0126336, + "balance_loss_clip": 0.06270998, + "balance_loss_mlp": 0.01253173, + "epoch": 0.8654140989027507, + "flos": 17387865112320.0, + "grad_norm": 1.8277719663675551, + "language_loss": 0.74051988, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.81721365, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10192871, + "step": 14394, + "time_per_iteration": 2.482966661453247 + }, + { + "auxiliary_loss_clip": 0.0640289, + "auxiliary_loss_mlp": 0.01264117, + "balance_loss_clip": 0.06269561, + "balance_loss_mlp": 0.01254634, + "epoch": 0.8654742221554186, + "flos": 53296390212480.0, + "grad_norm": 1.8585676526788644, + "language_loss": 0.65939879, + "learning_rate": 1.867768130747036e-07, + "loss": 0.7360689, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09484863, + "step": 14395, + "time_per_iteration": 2.800736904144287 + }, + { + "auxiliary_loss_clip": 0.06404395, + "auxiliary_loss_mlp": 0.01264073, + "balance_loss_clip": 0.06273991, + "balance_loss_mlp": 0.01254239, + "epoch": 0.8655343454080866, + "flos": 23921476502400.0, + "grad_norm": 1.4835131789742315, + "language_loss": 0.68352878, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.76021349, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09832764, + "step": 14396, + "time_per_iteration": 2.5393667221069336 + }, + { + "auxiliary_loss_clip": 0.06409425, + "auxiliary_loss_mlp": 0.01266633, + "balance_loss_clip": 0.06273856, + "balance_loss_mlp": 0.01255994, + "epoch": 0.8655944686607545, + "flos": 24104016622080.0, + "grad_norm": 2.042547019801872, + "language_loss": 0.69834018, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.77510077, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10638428, + "step": 14397, + "time_per_iteration": 2.539818286895752 + }, + { + "auxiliary_loss_clip": 0.06403225, + "auxiliary_loss_mlp": 0.01263446, + "balance_loss_clip": 0.0627091, + "balance_loss_mlp": 0.01253373, + "epoch": 0.8656545919134225, + "flos": 23119465547520.0, + "grad_norm": 1.8495016232222878, + "language_loss": 0.63756424, + "learning_rate": 1.86284103591253e-07, + "loss": 0.71423095, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10070801, + "step": 14398, + "time_per_iteration": 2.538398265838623 + }, + { + "auxiliary_loss_clip": 0.06404422, + "auxiliary_loss_mlp": 0.01268592, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01259454, + "epoch": 0.8657147151660904, + "flos": 21148057411200.0, + "grad_norm": 2.1437443287779594, + "language_loss": 0.76056588, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.83729601, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09136963, + "step": 14399, + "time_per_iteration": 3.908792734146118 + }, + { + "auxiliary_loss_clip": 0.06397003, + "auxiliary_loss_mlp": 0.01262133, + "balance_loss_clip": 0.06268921, + "balance_loss_mlp": 0.01253961, + "epoch": 0.8657748384187585, + "flos": 16294972308480.0, + "grad_norm": 1.9617345996315059, + "language_loss": 0.93996477, + "learning_rate": 1.8595597447334855e-07, + "loss": 1.01655602, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.081604, + "step": 14400, + "time_per_iteration": 2.587644338607788 + }, + { + "auxiliary_loss_clip": 0.06404351, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_clip": 0.06271766, + "balance_loss_mlp": 0.01254314, + "epoch": 0.8658349616714264, + "flos": 30851292723840.0, + "grad_norm": 1.6768484881367147, + "language_loss": 0.67610824, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.75278574, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09094238, + "step": 14401, + "time_per_iteration": 2.5835890769958496 + }, + { + "auxiliary_loss_clip": 0.06406137, + "auxiliary_loss_mlp": 0.0126592, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01256526, + "epoch": 0.8658950849240944, + "flos": 18958833786240.0, + "grad_norm": 2.2258596653957508, + "language_loss": 0.7464267, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.8231473, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09387207, + "step": 14402, + "time_per_iteration": 2.527329206466675 + }, + { + "auxiliary_loss_clip": 0.06397735, + "auxiliary_loss_mlp": 0.01263573, + "balance_loss_clip": 0.06269282, + "balance_loss_mlp": 0.01254907, + "epoch": 0.8659552081767624, + "flos": 23370501980160.0, + "grad_norm": 1.6213852785308416, + "language_loss": 0.752159, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.82877213, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08660889, + "step": 14403, + "time_per_iteration": 2.531348705291748 + }, + { + "auxiliary_loss_clip": 0.06404096, + "auxiliary_loss_mlp": 0.01264956, + "balance_loss_clip": 0.06270886, + "balance_loss_mlp": 0.01255234, + "epoch": 0.8660153314294303, + "flos": 23848787485440.0, + "grad_norm": 1.6613689377775722, + "language_loss": 0.73390162, + "learning_rate": 1.853005417520368e-07, + "loss": 0.81059217, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09716797, + "step": 14404, + "time_per_iteration": 4.003480911254883 + }, + { + "auxiliary_loss_clip": 0.0639967, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270695, + "balance_loss_mlp": 0.01255801, + "epoch": 0.8660754546820983, + "flos": 23119172058240.0, + "grad_norm": 1.6322756861517351, + "language_loss": 0.71098399, + "learning_rate": 1.851368555901447e-07, + "loss": 0.78764105, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.10241699, + "step": 14405, + "time_per_iteration": 4.005644798278809 + }, + { + "auxiliary_loss_clip": 0.06404774, + "auxiliary_loss_mlp": 0.01262757, + "balance_loss_clip": 0.06269382, + "balance_loss_mlp": 0.01252666, + "epoch": 0.8661355779347663, + "flos": 14397175584000.0, + "grad_norm": 1.6421655620173083, + "language_loss": 0.66277993, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.73945522, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10089111, + "step": 14406, + "time_per_iteration": 2.50840425491333 + }, + { + "auxiliary_loss_clip": 0.06401468, + "auxiliary_loss_mlp": 0.01268771, + "balance_loss_clip": 0.06270108, + "balance_loss_mlp": 0.01260015, + "epoch": 0.8661957011874343, + "flos": 21876331173120.0, + "grad_norm": 1.8022015910030553, + "language_loss": 0.83140111, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.90810353, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08752441, + "step": 14407, + "time_per_iteration": 2.523522138595581 + }, + { + "auxiliary_loss_clip": 0.06403568, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06273694, + "balance_loss_mlp": 0.01256366, + "epoch": 0.8662558244401022, + "flos": 21841600855680.0, + "grad_norm": 1.6449965568009912, + "language_loss": 0.70152688, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.77822208, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09588623, + "step": 14408, + "time_per_iteration": 2.5317270755767822 + }, + { + "auxiliary_loss_clip": 0.06397519, + "auxiliary_loss_mlp": 0.01264222, + "balance_loss_clip": 0.06270346, + "balance_loss_mlp": 0.01254959, + "epoch": 0.8663159476927702, + "flos": 17389835683200.0, + "grad_norm": 1.7633081999688927, + "language_loss": 0.77345204, + "learning_rate": 1.844827992025304e-07, + "loss": 0.85006946, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09265137, + "step": 14409, + "time_per_iteration": 2.526059865951538 + }, + { + "auxiliary_loss_clip": 0.06406955, + "auxiliary_loss_mlp": 0.01265018, + "balance_loss_clip": 0.06271859, + "balance_loss_mlp": 0.01254951, + "epoch": 0.8663760709454381, + "flos": 22754385308160.0, + "grad_norm": 1.8171416455536564, + "language_loss": 0.76934552, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.84606528, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10064697, + "step": 14410, + "time_per_iteration": 2.5280380249023438 + }, + { + "auxiliary_loss_clip": 0.06405914, + "auxiliary_loss_mlp": 0.0126676, + "balance_loss_clip": 0.06273735, + "balance_loss_mlp": 0.01256991, + "epoch": 0.8664361941981061, + "flos": 17381366421120.0, + "grad_norm": 1.9715328032802535, + "language_loss": 0.78025001, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.85697675, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09765625, + "step": 14411, + "time_per_iteration": 2.515397071838379 + }, + { + "auxiliary_loss_clip": 0.06397986, + "auxiliary_loss_mlp": 0.01263072, + "balance_loss_clip": 0.06267551, + "balance_loss_mlp": 0.01253774, + "epoch": 0.866496317450774, + "flos": 16039994734080.0, + "grad_norm": 1.7277643330108303, + "language_loss": 0.73680794, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.81341851, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09295654, + "step": 14412, + "time_per_iteration": 2.4690604209899902 + }, + { + "auxiliary_loss_clip": 0.06400064, + "auxiliary_loss_mlp": 0.01267281, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.0125868, + "epoch": 0.8665564407034421, + "flos": 20821313214720.0, + "grad_norm": 1.8053932243271904, + "language_loss": 0.69647372, + "learning_rate": 1.83829844328371e-07, + "loss": 0.77314717, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08605957, + "step": 14413, + "time_per_iteration": 2.514761209487915 + }, + { + "auxiliary_loss_clip": 0.06403694, + "auxiliary_loss_mlp": 0.01265064, + "balance_loss_clip": 0.06270799, + "balance_loss_mlp": 0.01255254, + "epoch": 0.86661656395611, + "flos": 15820627944960.0, + "grad_norm": 2.2244360215137684, + "language_loss": 0.63284969, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.70953727, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0980835, + "step": 14414, + "time_per_iteration": 2.475782871246338 + }, + { + "auxiliary_loss_clip": 0.06403673, + "auxiliary_loss_mlp": 0.0126404, + "balance_loss_clip": 0.06273353, + "balance_loss_mlp": 0.01254652, + "epoch": 0.866676687208778, + "flos": 23043170805120.0, + "grad_norm": 1.623963807084388, + "language_loss": 0.6375469, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.71422398, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09387207, + "step": 14415, + "time_per_iteration": 2.5194180011749268 + }, + { + "auxiliary_loss_clip": 0.06311454, + "auxiliary_loss_mlp": 0.01254301, + "balance_loss_clip": 0.06256884, + "balance_loss_mlp": 0.01253252, + "epoch": 0.866736810461446, + "flos": 63823256104320.0, + "grad_norm": 0.7752064714418949, + "language_loss": 0.60367054, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.67932814, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01049805, + "step": 14416, + "time_per_iteration": 3.1916332244873047 + }, + { + "auxiliary_loss_clip": 0.06406388, + "auxiliary_loss_mlp": 0.01263895, + "balance_loss_clip": 0.06270231, + "balance_loss_mlp": 0.01253596, + "epoch": 0.8667969337141139, + "flos": 20455687923840.0, + "grad_norm": 1.758371928696436, + "language_loss": 0.75081879, + "learning_rate": 1.831779913638285e-07, + "loss": 0.82752162, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10296631, + "step": 14417, + "time_per_iteration": 2.519272565841675 + }, + { + "auxiliary_loss_clip": 0.06401929, + "auxiliary_loss_mlp": 0.01264851, + "balance_loss_clip": 0.06270267, + "balance_loss_mlp": 0.01255493, + "epoch": 0.866857056966782, + "flos": 21660276620160.0, + "grad_norm": 1.4417823685180284, + "language_loss": 0.75500447, + "learning_rate": 1.830152003424319e-07, + "loss": 0.83167231, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09350586, + "step": 14418, + "time_per_iteration": 2.5243372917175293 + }, + { + "auxiliary_loss_clip": 0.06397848, + "auxiliary_loss_mlp": 0.0126541, + "balance_loss_clip": 0.06267963, + "balance_loss_mlp": 0.01255963, + "epoch": 0.8669171802194499, + "flos": 22858785895680.0, + "grad_norm": 1.4538626454884047, + "language_loss": 0.68544036, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.76207292, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09448242, + "step": 14419, + "time_per_iteration": 2.598567247390747 + }, + { + "auxiliary_loss_clip": 0.06402744, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06270118, + "balance_loss_mlp": 0.01254794, + "epoch": 0.8669773034721179, + "flos": 18740137829760.0, + "grad_norm": 1.6269776672151877, + "language_loss": 0.78971106, + "learning_rate": 1.826898250065465e-07, + "loss": 0.86637974, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09326172, + "step": 14420, + "time_per_iteration": 2.5527749061584473 + }, + { + "auxiliary_loss_clip": 0.06402794, + "auxiliary_loss_mlp": 0.01264773, + "balance_loss_clip": 0.0627051, + "balance_loss_mlp": 0.01255153, + "epoch": 0.8670374267247858, + "flos": 18921923262720.0, + "grad_norm": 1.8416843684547823, + "language_loss": 0.83623648, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.91291213, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09625244, + "step": 14421, + "time_per_iteration": 2.53287935256958 + }, + { + "auxiliary_loss_clip": 0.06307293, + "auxiliary_loss_mlp": 0.01252132, + "balance_loss_clip": 0.06252414, + "balance_loss_mlp": 0.01251069, + "epoch": 0.8670975499774538, + "flos": 48834323458560.0, + "grad_norm": 0.6970048505263723, + "language_loss": 0.4877342, + "learning_rate": 1.823647253209941e-07, + "loss": 0.56332839, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.54931641, + "router_z_loss_mlp": 0.01064301, + "step": 14422, + "time_per_iteration": 3.2060294151306152 + }, + { + "auxiliary_loss_clip": 0.06402378, + "auxiliary_loss_mlp": 0.0126638, + "balance_loss_clip": 0.06270766, + "balance_loss_mlp": 0.01257374, + "epoch": 0.8671576732301217, + "flos": 26142579406080.0, + "grad_norm": 1.482284163911286, + "language_loss": 0.73646462, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.81315225, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09008789, + "step": 14423, + "time_per_iteration": 2.5513858795166016 + }, + { + "auxiliary_loss_clip": 0.06397344, + "auxiliary_loss_mlp": 0.01261454, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.0125339, + "epoch": 0.8672177964827897, + "flos": 18373045092480.0, + "grad_norm": 1.5159393869667968, + "language_loss": 0.7694416, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.84602958, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08056641, + "step": 14424, + "time_per_iteration": 2.5115561485290527 + }, + { + "auxiliary_loss_clip": 0.06394623, + "auxiliary_loss_mlp": 0.01261736, + "balance_loss_clip": 0.06269346, + "balance_loss_mlp": 0.01253331, + "epoch": 0.8672779197354576, + "flos": 28552385704320.0, + "grad_norm": 1.5295537919973716, + "language_loss": 0.71849072, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.79505438, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08410645, + "step": 14425, + "time_per_iteration": 2.5568857192993164 + }, + { + "auxiliary_loss_clip": 0.06405246, + "auxiliary_loss_mlp": 0.01264965, + "balance_loss_clip": 0.06270114, + "balance_loss_mlp": 0.01255297, + "epoch": 0.8673380429881257, + "flos": 22389011579520.0, + "grad_norm": 1.4758185818369447, + "language_loss": 0.6852206, + "learning_rate": 1.817153530980926e-07, + "loss": 0.76192278, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.09667969, + "step": 14426, + "time_per_iteration": 2.5231831073760986 + }, + { + "auxiliary_loss_clip": 0.06402829, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 0.06270183, + "balance_loss_mlp": 0.01253419, + "epoch": 0.8673981662407936, + "flos": 21002805158400.0, + "grad_norm": 1.7832105670695808, + "language_loss": 0.70722842, + "learning_rate": 1.815531824008234e-07, + "loss": 0.78389543, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10455322, + "step": 14427, + "time_per_iteration": 4.035536289215088 + }, + { + "auxiliary_loss_clip": 0.0640244, + "auxiliary_loss_mlp": 0.0126232, + "balance_loss_clip": 0.06271227, + "balance_loss_mlp": 0.01252676, + "epoch": 0.8674582894934616, + "flos": 24433863419520.0, + "grad_norm": 1.5804797427940684, + "language_loss": 0.6822958, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.75894332, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09655762, + "step": 14428, + "time_per_iteration": 2.553795099258423 + }, + { + "auxiliary_loss_clip": 0.06399473, + "auxiliary_loss_mlp": 0.01266114, + "balance_loss_clip": 0.0626923, + "balance_loss_mlp": 0.01257334, + "epoch": 0.8675184127461296, + "flos": 20743257536640.0, + "grad_norm": 1.8013326629765731, + "language_loss": 0.71193767, + "learning_rate": 1.812290478794889e-07, + "loss": 0.78859359, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08776855, + "step": 14429, + "time_per_iteration": 2.493234157562256 + }, + { + "auxiliary_loss_clip": 0.06401441, + "auxiliary_loss_mlp": 0.01264101, + "balance_loss_clip": 0.0627252, + "balance_loss_mlp": 0.01254898, + "epoch": 0.8675785359987975, + "flos": 19141709322240.0, + "grad_norm": 1.8609763049402845, + "language_loss": 0.66596407, + "learning_rate": 1.810670840677151e-07, + "loss": 0.74261945, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09204102, + "step": 14430, + "time_per_iteration": 2.4854321479797363 + }, + { + "auxiliary_loss_clip": 0.06403784, + "auxiliary_loss_mlp": 0.012671, + "balance_loss_clip": 0.06269564, + "balance_loss_mlp": 0.01256902, + "epoch": 0.8676386592514655, + "flos": 22717223222400.0, + "grad_norm": 2.523579211603687, + "language_loss": 0.69258201, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.76929086, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10198975, + "step": 14431, + "time_per_iteration": 2.5594279766082764 + }, + { + "auxiliary_loss_clip": 0.06405756, + "auxiliary_loss_mlp": 0.0126775, + "balance_loss_clip": 0.06272927, + "balance_loss_mlp": 0.01258016, + "epoch": 0.8676987825041335, + "flos": 14215054734720.0, + "grad_norm": 2.3061623073742545, + "language_loss": 0.6399014, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.71663648, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09735107, + "step": 14432, + "time_per_iteration": 2.499904155731201 + }, + { + "auxiliary_loss_clip": 0.06403828, + "auxiliary_loss_mlp": 0.01265326, + "balance_loss_clip": 0.06272545, + "balance_loss_mlp": 0.01256111, + "epoch": 0.8677589057568015, + "flos": 13595080775040.0, + "grad_norm": 1.7789604432407644, + "language_loss": 0.78301966, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.85971117, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09222412, + "step": 14433, + "time_per_iteration": 2.4722964763641357 + }, + { + "auxiliary_loss_clip": 0.06308552, + "auxiliary_loss_mlp": 0.01250803, + "balance_loss_clip": 0.06253849, + "balance_loss_mlp": 0.01249807, + "epoch": 0.8678190290094694, + "flos": 68953303278720.0, + "grad_norm": 0.6938824705198252, + "language_loss": 0.58372235, + "learning_rate": 1.804199186231805e-07, + "loss": 0.65931588, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00994873, + "step": 14434, + "time_per_iteration": 3.22125506401062 + }, + { + "auxiliary_loss_clip": 0.06397156, + "auxiliary_loss_mlp": 0.01264803, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01256273, + "epoch": 0.8678791522621374, + "flos": 32565249590400.0, + "grad_norm": 1.644245978236505, + "language_loss": 0.80153918, + "learning_rate": 1.802582997433628e-07, + "loss": 0.87815875, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08526611, + "step": 14435, + "time_per_iteration": 2.623704195022583 + }, + { + "auxiliary_loss_clip": 0.06403121, + "auxiliary_loss_mlp": 0.012653, + "balance_loss_clip": 0.06269317, + "balance_loss_mlp": 0.01254756, + "epoch": 0.8679392755148053, + "flos": 35051224849920.0, + "grad_norm": 1.897215126056039, + "language_loss": 0.62167633, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.69836056, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10546875, + "step": 14436, + "time_per_iteration": 2.632450819015503 + }, + { + "auxiliary_loss_clip": 0.06402992, + "auxiliary_loss_mlp": 0.01262824, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.0125278, + "epoch": 0.8679993987674733, + "flos": 18558562032000.0, + "grad_norm": 1.9896848572147598, + "language_loss": 0.70140958, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.77806765, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10040283, + "step": 14437, + "time_per_iteration": 2.541003704071045 + }, + { + "auxiliary_loss_clip": 0.06404081, + "auxiliary_loss_mlp": 0.0126507, + "balance_loss_clip": 0.06273189, + "balance_loss_mlp": 0.01255152, + "epoch": 0.8680595220201412, + "flos": 27461840814720.0, + "grad_norm": 1.956729698736987, + "language_loss": 0.8056224, + "learning_rate": 1.797738571571381e-07, + "loss": 0.88231391, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09924316, + "step": 14438, + "time_per_iteration": 4.026323556900024 + }, + { + "auxiliary_loss_clip": 0.06396785, + "auxiliary_loss_mlp": 0.012629, + "balance_loss_clip": 0.0627017, + "balance_loss_mlp": 0.01254066, + "epoch": 0.8681196452728093, + "flos": 19214901463680.0, + "grad_norm": 1.7667026459424926, + "language_loss": 0.67657971, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.75317651, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08837891, + "step": 14439, + "time_per_iteration": 2.510300874710083 + }, + { + "auxiliary_loss_clip": 0.06404371, + "auxiliary_loss_mlp": 0.01263942, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01255007, + "epoch": 0.8681797685254772, + "flos": 37569498658560.0, + "grad_norm": 2.0023877249640094, + "language_loss": 0.64283299, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.7195161, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0894165, + "step": 14440, + "time_per_iteration": 2.66216778755188 + }, + { + "auxiliary_loss_clip": 0.06398277, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271653, + "balance_loss_mlp": 0.0125609, + "epoch": 0.8682398917781452, + "flos": 23295842392320.0, + "grad_norm": 1.5322174401444875, + "language_loss": 0.65759438, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.73423183, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09381104, + "step": 14441, + "time_per_iteration": 2.504725456237793 + }, + { + "auxiliary_loss_clip": 0.06396982, + "auxiliary_loss_mlp": 0.01262947, + "balance_loss_clip": 0.06271137, + "balance_loss_mlp": 0.01254125, + "epoch": 0.8683000150308132, + "flos": 21879433774080.0, + "grad_norm": 1.5819575820693645, + "language_loss": 0.66384351, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.74044275, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08813477, + "step": 14442, + "time_per_iteration": 2.515378713607788 + }, + { + "auxiliary_loss_clip": 0.06408555, + "auxiliary_loss_mlp": 0.0126728, + "balance_loss_clip": 0.06272847, + "balance_loss_mlp": 0.01256921, + "epoch": 0.8683601382834811, + "flos": 14652404720640.0, + "grad_norm": 1.734423376729254, + "language_loss": 0.72608215, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.80284047, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10345459, + "step": 14443, + "time_per_iteration": 4.0072619915008545 + }, + { + "auxiliary_loss_clip": 0.06403544, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 0.06272006, + "balance_loss_mlp": 0.01256617, + "epoch": 0.8684202615361492, + "flos": 26367187075200.0, + "grad_norm": 1.686322881132401, + "language_loss": 0.83196855, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.90866739, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.097229, + "step": 14444, + "time_per_iteration": 4.008268594741821 + }, + { + "auxiliary_loss_clip": 0.06403743, + "auxiliary_loss_mlp": 0.01261873, + "balance_loss_clip": 0.06272523, + "balance_loss_mlp": 0.01252628, + "epoch": 0.8684803847888171, + "flos": 20710246227840.0, + "grad_norm": 1.9141617998597704, + "language_loss": 0.76965976, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.84631586, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09246826, + "step": 14445, + "time_per_iteration": 2.5997262001037598 + }, + { + "auxiliary_loss_clip": 0.06402852, + "auxiliary_loss_mlp": 0.01262345, + "balance_loss_clip": 0.06273001, + "balance_loss_mlp": 0.01252194, + "epoch": 0.8685405080414851, + "flos": 22644743840640.0, + "grad_norm": 1.69315828341739, + "language_loss": 0.68069935, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.75735128, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10144043, + "step": 14446, + "time_per_iteration": 2.5424163341522217 + }, + { + "auxiliary_loss_clip": 0.06401268, + "auxiliary_loss_mlp": 0.01264762, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01255422, + "epoch": 0.868600631294153, + "flos": 24828181534080.0, + "grad_norm": 1.616488905601248, + "language_loss": 0.82849407, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.90515447, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09344482, + "step": 14447, + "time_per_iteration": 2.6071512699127197 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.06269099, + "balance_loss_mlp": 0.01255937, + "epoch": 0.868660754546821, + "flos": 25120153486080.0, + "grad_norm": 1.624335416002347, + "language_loss": 0.74320281, + "learning_rate": 1.781635359686515e-07, + "loss": 0.81985313, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08959961, + "step": 14448, + "time_per_iteration": 2.547412633895874 + }, + { + "auxiliary_loss_clip": 0.06402777, + "auxiliary_loss_mlp": 0.01263991, + "balance_loss_clip": 0.06270443, + "balance_loss_mlp": 0.01254299, + "epoch": 0.8687208777994889, + "flos": 12682841374080.0, + "grad_norm": 1.8412426032708813, + "language_loss": 0.80489451, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.88156223, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09686279, + "step": 14449, + "time_per_iteration": 2.4914026260375977 + }, + { + "auxiliary_loss_clip": 0.0631351, + "auxiliary_loss_mlp": 0.01253647, + "balance_loss_clip": 0.06259002, + "balance_loss_mlp": 0.01252613, + "epoch": 0.8687810010521569, + "flos": 65636959656960.0, + "grad_norm": 0.7923178433705474, + "language_loss": 0.60340738, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.67907894, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01034546, + "step": 14450, + "time_per_iteration": 3.0573930740356445 + }, + { + "auxiliary_loss_clip": 0.06410334, + "auxiliary_loss_mlp": 0.01264555, + "balance_loss_clip": 0.06276858, + "balance_loss_mlp": 0.0125512, + "epoch": 0.8688411243048249, + "flos": 24250987883520.0, + "grad_norm": 1.5446429349016553, + "language_loss": 0.76378512, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.84053403, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09436035, + "step": 14451, + "time_per_iteration": 2.5443615913391113 + }, + { + "auxiliary_loss_clip": 0.06400914, + "auxiliary_loss_mlp": 0.01264515, + "balance_loss_clip": 0.0627023, + "balance_loss_mlp": 0.01254943, + "epoch": 0.8689012475574929, + "flos": 18227457423360.0, + "grad_norm": 2.4344123800734487, + "language_loss": 0.72107518, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.79772949, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09564209, + "step": 14452, + "time_per_iteration": 2.4614477157592773 + }, + { + "auxiliary_loss_clip": 0.06404183, + "auxiliary_loss_mlp": 0.0126295, + "balance_loss_clip": 0.06270374, + "balance_loss_mlp": 0.01253014, + "epoch": 0.8689613708101608, + "flos": 19652922282240.0, + "grad_norm": 1.772178254376601, + "language_loss": 0.72880638, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.80547774, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09942627, + "step": 14453, + "time_per_iteration": 2.508371591567993 + }, + { + "auxiliary_loss_clip": 0.06399187, + "auxiliary_loss_mlp": 0.01264806, + "balance_loss_clip": 0.06269897, + "balance_loss_mlp": 0.01255669, + "epoch": 0.8690214940628288, + "flos": 11733523741440.0, + "grad_norm": 1.9522335345310335, + "language_loss": 0.73650515, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.8131451, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 14454, + "time_per_iteration": 2.6009294986724854 + }, + { + "auxiliary_loss_clip": 0.06401433, + "auxiliary_loss_mlp": 0.01266363, + "balance_loss_clip": 0.06271113, + "balance_loss_mlp": 0.01256516, + "epoch": 0.8690816173154968, + "flos": 34945566451200.0, + "grad_norm": 1.7631305246108158, + "language_loss": 0.60118473, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.67786264, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09844971, + "step": 14455, + "time_per_iteration": 2.753415107727051 + }, + { + "auxiliary_loss_clip": 0.06404486, + "auxiliary_loss_mlp": 0.01264704, + "balance_loss_clip": 0.06271438, + "balance_loss_mlp": 0.01255233, + "epoch": 0.8691417405681647, + "flos": 11618809102080.0, + "grad_norm": 2.01471686144797, + "language_loss": 0.80115831, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.87785017, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09472656, + "step": 14456, + "time_per_iteration": 2.5271530151367188 + }, + { + "auxiliary_loss_clip": 0.06409412, + "auxiliary_loss_mlp": 0.01267391, + "balance_loss_clip": 0.06274113, + "balance_loss_mlp": 0.01257247, + "epoch": 0.8692018638208328, + "flos": 24614936092800.0, + "grad_norm": 2.326789924300959, + "language_loss": 0.74536252, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.82213056, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10137939, + "step": 14457, + "time_per_iteration": 2.526219129562378 + }, + { + "auxiliary_loss_clip": 0.06399509, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 0.06271378, + "balance_loss_mlp": 0.01255784, + "epoch": 0.8692619870735007, + "flos": 26002358398080.0, + "grad_norm": 1.4211804467950002, + "language_loss": 0.7873075, + "learning_rate": 1.765601232001328e-07, + "loss": 0.86394978, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.0894165, + "step": 14458, + "time_per_iteration": 2.5216262340545654 + }, + { + "auxiliary_loss_clip": 0.06402966, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 0.06273033, + "balance_loss_mlp": 0.0125663, + "epoch": 0.8693221103261687, + "flos": 18047810269440.0, + "grad_norm": 1.5087935238946328, + "language_loss": 0.71331191, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.79000497, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.097229, + "step": 14459, + "time_per_iteration": 2.4944982528686523 + }, + { + "auxiliary_loss_clip": 0.06394096, + "auxiliary_loss_mlp": 0.01263427, + "balance_loss_clip": 0.06268888, + "balance_loss_mlp": 0.0125485, + "epoch": 0.8693822335788366, + "flos": 27500051076480.0, + "grad_norm": 1.2788295067454918, + "language_loss": 0.74028695, + "learning_rate": 1.762402701923398e-07, + "loss": 0.81686223, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08575439, + "step": 14460, + "time_per_iteration": 2.56471848487854 + }, + { + "auxiliary_loss_clip": 0.06408993, + "auxiliary_loss_mlp": 0.01266866, + "balance_loss_clip": 0.06271887, + "balance_loss_mlp": 0.01257002, + "epoch": 0.8694423568315046, + "flos": 24104603600640.0, + "grad_norm": 2.393839276229543, + "language_loss": 0.65351462, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.73027325, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09857178, + "step": 14461, + "time_per_iteration": 2.5354537963867188 + }, + { + "auxiliary_loss_clip": 0.06403669, + "auxiliary_loss_mlp": 0.01267783, + "balance_loss_clip": 0.0627113, + "balance_loss_mlp": 0.0125793, + "epoch": 0.8695024800841725, + "flos": 18366839890560.0, + "grad_norm": 2.377735407196708, + "language_loss": 0.82366443, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.900379, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09844971, + "step": 14462, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.06403664, + "auxiliary_loss_mlp": 0.01265298, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.0125529, + "epoch": 0.8695626033368405, + "flos": 14032975812480.0, + "grad_norm": 1.782940361632394, + "language_loss": 0.65303802, + "learning_rate": 1.757610093744335e-07, + "loss": 0.72972763, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10009766, + "step": 14463, + "time_per_iteration": 2.519047975540161 + }, + { + "auxiliary_loss_clip": 0.06408842, + "auxiliary_loss_mlp": 0.01268237, + "balance_loss_clip": 0.06271829, + "balance_loss_mlp": 0.01257729, + "epoch": 0.8696227265895085, + "flos": 16842508813440.0, + "grad_norm": 1.8832383618141357, + "language_loss": 0.66826367, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.74503446, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10516357, + "step": 14464, + "time_per_iteration": 2.4889910221099854 + }, + { + "auxiliary_loss_clip": 0.06410474, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 0.06273378, + "balance_loss_mlp": 0.01253166, + "epoch": 0.8696828498421765, + "flos": 21805570800000.0, + "grad_norm": 2.242038874190131, + "language_loss": 0.63238472, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.70912772, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10656738, + "step": 14465, + "time_per_iteration": 2.5462048053741455 + }, + { + "auxiliary_loss_clip": 0.06396791, + "auxiliary_loss_mlp": 0.01265271, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.012567, + "epoch": 0.8697429730948444, + "flos": 22901691985920.0, + "grad_norm": 1.4710912733423256, + "language_loss": 0.84975493, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.92637551, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08569336, + "step": 14466, + "time_per_iteration": 3.9527673721313477 + }, + { + "auxiliary_loss_clip": 0.06408149, + "auxiliary_loss_mlp": 0.01267066, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01256146, + "epoch": 0.8698030963475124, + "flos": 24724367925120.0, + "grad_norm": 2.1885311234894607, + "language_loss": 0.61972004, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.69647217, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10913086, + "step": 14467, + "time_per_iteration": 2.531226634979248 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.0126206, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01253441, + "epoch": 0.8698632196001803, + "flos": 28450291104000.0, + "grad_norm": 1.3163681767260083, + "language_loss": 0.69067562, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.76726645, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08624268, + "step": 14468, + "time_per_iteration": 2.617129325866699 + }, + { + "auxiliary_loss_clip": 0.06402217, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.0125501, + "epoch": 0.8699233428528483, + "flos": 27643877809920.0, + "grad_norm": 1.469874122619276, + "language_loss": 0.71179217, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.78845036, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08587646, + "step": 14469, + "time_per_iteration": 2.5837879180908203 + }, + { + "auxiliary_loss_clip": 0.06397484, + "auxiliary_loss_mlp": 0.01262825, + "balance_loss_clip": 0.06272286, + "balance_loss_mlp": 0.01254326, + "epoch": 0.8699834661055164, + "flos": 20051516954880.0, + "grad_norm": 2.250456431690659, + "language_loss": 0.84240717, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.91901028, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08502197, + "step": 14470, + "time_per_iteration": 2.555173635482788 + }, + { + "auxiliary_loss_clip": 0.06400733, + "auxiliary_loss_mlp": 0.01262712, + "balance_loss_clip": 0.06270544, + "balance_loss_mlp": 0.01253384, + "epoch": 0.8700435893581843, + "flos": 23739607215360.0, + "grad_norm": 1.6759251274970535, + "language_loss": 0.73015386, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.80678833, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09332275, + "step": 14471, + "time_per_iteration": 2.49556827545166 + }, + { + "auxiliary_loss_clip": 0.06401968, + "auxiliary_loss_mlp": 0.0126843, + "balance_loss_clip": 0.06272097, + "balance_loss_mlp": 0.01259001, + "epoch": 0.8701037126108523, + "flos": 23554886889600.0, + "grad_norm": 1.414338662469805, + "language_loss": 0.79126775, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.86797178, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09429932, + "step": 14472, + "time_per_iteration": 2.546039581298828 + }, + { + "auxiliary_loss_clip": 0.06400506, + "auxiliary_loss_mlp": 0.01261454, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01252371, + "epoch": 0.8701638358635202, + "flos": 18849401953920.0, + "grad_norm": 1.7511234862282108, + "language_loss": 0.72525012, + "learning_rate": 1.741679706279644e-07, + "loss": 0.80186975, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09075928, + "step": 14473, + "time_per_iteration": 2.4787282943725586 + }, + { + "auxiliary_loss_clip": 0.06408264, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_clip": 0.06274155, + "balance_loss_mlp": 0.01253232, + "epoch": 0.8702239591161882, + "flos": 27935807834880.0, + "grad_norm": 1.4568573772519522, + "language_loss": 0.72361302, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.80032313, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09521484, + "step": 14474, + "time_per_iteration": 2.580152750015259 + }, + { + "auxiliary_loss_clip": 0.0640256, + "auxiliary_loss_mlp": 0.01268742, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.012588, + "epoch": 0.8702840823688561, + "flos": 17239007134080.0, + "grad_norm": 2.0568894505970836, + "language_loss": 0.67749852, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.75421154, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.0994873, + "step": 14475, + "time_per_iteration": 2.4745309352874756 + }, + { + "auxiliary_loss_clip": 0.06405099, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01257334, + "epoch": 0.8703442056215241, + "flos": 19433681274240.0, + "grad_norm": 1.4998627111504736, + "language_loss": 0.78266013, + "learning_rate": 1.736914088262349e-07, + "loss": 0.85938084, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09637451, + "step": 14476, + "time_per_iteration": 2.5792596340179443 + }, + { + "auxiliary_loss_clip": 0.06402189, + "auxiliary_loss_mlp": 0.01263388, + "balance_loss_clip": 0.06273142, + "balance_loss_mlp": 0.01254185, + "epoch": 0.8704043288741921, + "flos": 22280502142080.0, + "grad_norm": 1.4832205414105002, + "language_loss": 0.72368699, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.8003428, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09191895, + "step": 14477, + "time_per_iteration": 2.523857593536377 + }, + { + "auxiliary_loss_clip": 0.06404123, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06272732, + "balance_loss_mlp": 0.01255906, + "epoch": 0.8704644521268601, + "flos": 16653386148480.0, + "grad_norm": 3.7210066512939064, + "language_loss": 0.59888941, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.67558169, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09191895, + "step": 14478, + "time_per_iteration": 3.9272985458374023 + }, + { + "auxiliary_loss_clip": 0.06400814, + "auxiliary_loss_mlp": 0.01265797, + "balance_loss_clip": 0.06273096, + "balance_loss_mlp": 0.01257178, + "epoch": 0.870524575379528, + "flos": 24287143720320.0, + "grad_norm": 1.561156822868459, + "language_loss": 0.71748471, + "learning_rate": 1.732154703087323e-07, + "loss": 0.79415083, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08624268, + "step": 14479, + "time_per_iteration": 2.690037727355957 + }, + { + "auxiliary_loss_clip": 0.06402399, + "auxiliary_loss_mlp": 0.0126804, + "balance_loss_clip": 0.06271303, + "balance_loss_mlp": 0.01257693, + "epoch": 0.870584698632196, + "flos": 28776490248960.0, + "grad_norm": 1.313083691844494, + "language_loss": 0.7115078, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.78821218, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10351562, + "step": 14480, + "time_per_iteration": 2.5635881423950195 + }, + { + "auxiliary_loss_clip": 0.06405388, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 0.06272168, + "balance_loss_mlp": 0.01256039, + "epoch": 0.8706448218848639, + "flos": 32457369058560.0, + "grad_norm": 1.5315464053111656, + "language_loss": 0.69993174, + "learning_rate": 1.728985243129666e-07, + "loss": 0.77664775, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10174561, + "step": 14481, + "time_per_iteration": 2.6091196537017822 + }, + { + "auxiliary_loss_clip": 0.06400968, + "auxiliary_loss_mlp": 0.01264909, + "balance_loss_clip": 0.06270086, + "balance_loss_mlp": 0.01256403, + "epoch": 0.8707049451375319, + "flos": 22754720724480.0, + "grad_norm": 1.6803042036172529, + "language_loss": 0.77415997, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.85081875, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08496094, + "step": 14482, + "time_per_iteration": 2.505281448364258 + }, + { + "auxiliary_loss_clip": 0.06400886, + "auxiliary_loss_mlp": 0.01271627, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01262359, + "epoch": 0.8707650683902, + "flos": 15857496541440.0, + "grad_norm": 1.7059576346478473, + "language_loss": 0.76927876, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.84600389, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0927124, + "step": 14483, + "time_per_iteration": 3.9307732582092285 + }, + { + "auxiliary_loss_clip": 0.06408566, + "auxiliary_loss_mlp": 0.01267486, + "balance_loss_clip": 0.06271568, + "balance_loss_mlp": 0.01257127, + "epoch": 0.8708251916428679, + "flos": 16473068161920.0, + "grad_norm": 1.8670315835414784, + "language_loss": 0.61994016, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.69670069, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10351562, + "step": 14484, + "time_per_iteration": 3.927198886871338 + }, + { + "auxiliary_loss_clip": 0.06401549, + "auxiliary_loss_mlp": 0.01264874, + "balance_loss_clip": 0.06271225, + "balance_loss_mlp": 0.01255402, + "epoch": 0.8708853148955359, + "flos": 15383319886080.0, + "grad_norm": 1.6982742251902394, + "language_loss": 0.68225974, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.75892395, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09472656, + "step": 14485, + "time_per_iteration": 2.4758594036102295 + }, + { + "auxiliary_loss_clip": 0.06400119, + "auxiliary_loss_mlp": 0.01269297, + "balance_loss_clip": 0.06269044, + "balance_loss_mlp": 0.01259224, + "epoch": 0.8709454381482038, + "flos": 30558566085120.0, + "grad_norm": 1.707582248918332, + "language_loss": 0.63406742, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.71076155, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10076904, + "step": 14486, + "time_per_iteration": 2.600389003753662 + }, + { + "auxiliary_loss_clip": 0.06409895, + "auxiliary_loss_mlp": 0.01270202, + "balance_loss_clip": 0.06275006, + "balance_loss_mlp": 0.01259825, + "epoch": 0.8710055614008718, + "flos": 22608001025280.0, + "grad_norm": 1.9272108546392486, + "language_loss": 0.61984718, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.69664824, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1036377, + "step": 14487, + "time_per_iteration": 2.5196049213409424 + }, + { + "auxiliary_loss_clip": 0.06400737, + "auxiliary_loss_mlp": 0.01266902, + "balance_loss_clip": 0.0627054, + "balance_loss_mlp": 0.01258652, + "epoch": 0.8710656846535397, + "flos": 18449214053760.0, + "grad_norm": 1.8411007600329907, + "language_loss": 0.68481451, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.76149088, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08251953, + "step": 14488, + "time_per_iteration": 2.506927967071533 + }, + { + "auxiliary_loss_clip": 0.06402954, + "auxiliary_loss_mlp": 0.01268264, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01258317, + "epoch": 0.8711258079062077, + "flos": 16508678947200.0, + "grad_norm": 1.8335601369523609, + "language_loss": 0.85487485, + "learning_rate": 1.716335121648338e-07, + "loss": 0.93158698, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09942627, + "step": 14489, + "time_per_iteration": 2.484161376953125 + }, + { + "auxiliary_loss_clip": 0.06410562, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_clip": 0.06272433, + "balance_loss_mlp": 0.01255119, + "epoch": 0.8711859311588757, + "flos": 15667786897920.0, + "grad_norm": 6.139143930949815, + "language_loss": 0.76203996, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.83880675, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.11004639, + "step": 14490, + "time_per_iteration": 2.5018839836120605 + }, + { + "auxiliary_loss_clip": 0.06407736, + "auxiliary_loss_mlp": 0.01268396, + "balance_loss_clip": 0.06273264, + "balance_loss_mlp": 0.01257363, + "epoch": 0.8712460544115437, + "flos": 15562589696640.0, + "grad_norm": 1.9796792508389878, + "language_loss": 0.76653862, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.84329993, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.11022949, + "step": 14491, + "time_per_iteration": 2.4751522541046143 + }, + { + "auxiliary_loss_clip": 0.0640479, + "auxiliary_loss_mlp": 0.01265934, + "balance_loss_clip": 0.06274243, + "balance_loss_mlp": 0.01256922, + "epoch": 0.8713061776642116, + "flos": 16769148963840.0, + "grad_norm": 1.5868092330088945, + "language_loss": 0.6700983, + "learning_rate": 1.711602764198723e-07, + "loss": 0.74680555, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09020996, + "step": 14492, + "time_per_iteration": 2.5103981494903564 + }, + { + "auxiliary_loss_clip": 0.06399809, + "auxiliary_loss_mlp": 0.01261278, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01252665, + "epoch": 0.8713663009168796, + "flos": 24286766376960.0, + "grad_norm": 1.7963898814832777, + "language_loss": 0.69969654, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.77630746, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08618164, + "step": 14493, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.06402645, + "auxiliary_loss_mlp": 0.0126872, + "balance_loss_clip": 0.06271104, + "balance_loss_mlp": 0.01258706, + "epoch": 0.8714264241695475, + "flos": 23800724369280.0, + "grad_norm": 2.714150442096016, + "language_loss": 0.89298224, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.96969593, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10015869, + "step": 14494, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.06397564, + "auxiliary_loss_mlp": 0.01262665, + "balance_loss_clip": 0.06270292, + "balance_loss_mlp": 0.01253927, + "epoch": 0.8714865474222155, + "flos": 38007016352640.0, + "grad_norm": 1.585930512402851, + "language_loss": 0.59490967, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.67151189, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08737183, + "step": 14495, + "time_per_iteration": 2.6341331005096436 + }, + { + "auxiliary_loss_clip": 0.0640444, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06272034, + "balance_loss_mlp": 0.01255495, + "epoch": 0.8715466706748836, + "flos": 22462287575040.0, + "grad_norm": 1.899333408458114, + "language_loss": 0.8036266, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.88032138, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09539795, + "step": 14496, + "time_per_iteration": 2.512383460998535 + }, + { + "auxiliary_loss_clip": 0.06404877, + "auxiliary_loss_mlp": 0.01264441, + "balance_loss_clip": 0.06272918, + "balance_loss_mlp": 0.01254278, + "epoch": 0.8716067939275515, + "flos": 21221501114880.0, + "grad_norm": 1.979531289163737, + "language_loss": 0.79082352, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.86751664, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10168457, + "step": 14497, + "time_per_iteration": 2.5025105476379395 + }, + { + "auxiliary_loss_clip": 0.06405815, + "auxiliary_loss_mlp": 0.01265291, + "balance_loss_clip": 0.06270967, + "balance_loss_mlp": 0.01255128, + "epoch": 0.8716669171802195, + "flos": 23003535024000.0, + "grad_norm": 2.3896985728798827, + "language_loss": 0.67118752, + "learning_rate": 1.70215677535406e-07, + "loss": 0.74789858, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10168457, + "step": 14498, + "time_per_iteration": 2.5077733993530273 + }, + { + "auxiliary_loss_clip": 0.06402379, + "auxiliary_loss_mlp": 0.012634, + "balance_loss_clip": 0.06270681, + "balance_loss_mlp": 0.01254066, + "epoch": 0.8717270404328874, + "flos": 29790991958400.0, + "grad_norm": 2.011348568561811, + "language_loss": 0.5741989, + "learning_rate": 1.700584872028108e-07, + "loss": 0.65085673, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09326172, + "step": 14499, + "time_per_iteration": 2.551210880279541 + }, + { + "auxiliary_loss_clip": 0.06407043, + "auxiliary_loss_mlp": 0.01264588, + "balance_loss_clip": 0.06273316, + "balance_loss_mlp": 0.01254664, + "epoch": 0.8717871636855554, + "flos": 22024686026880.0, + "grad_norm": 1.7042733854363687, + "language_loss": 0.8017959, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.8785122, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09918213, + "step": 14500, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.06403673, + "auxiliary_loss_mlp": 0.01269023, + "balance_loss_clip": 0.06273565, + "balance_loss_mlp": 0.01259856, + "epoch": 0.8718472869382233, + "flos": 16659842912640.0, + "grad_norm": 1.7725346587418325, + "language_loss": 0.73199558, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.8087225, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0916748, + "step": 14501, + "time_per_iteration": 2.4719321727752686 + }, + { + "auxiliary_loss_clip": 0.06410412, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06273587, + "balance_loss_mlp": 0.01253314, + "epoch": 0.8719074101908914, + "flos": 19500584359680.0, + "grad_norm": 1.6060046992779708, + "language_loss": 0.65037239, + "learning_rate": 1.695873325782482e-07, + "loss": 0.7271167, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10699463, + "step": 14502, + "time_per_iteration": 2.5199615955352783 + }, + { + "auxiliary_loss_clip": 0.06404664, + "auxiliary_loss_mlp": 0.0126564, + "balance_loss_clip": 0.06272453, + "balance_loss_mlp": 0.01255925, + "epoch": 0.8719675334435593, + "flos": 33078894318720.0, + "grad_norm": 1.9549594610014964, + "language_loss": 0.69178712, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.76849008, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.097229, + "step": 14503, + "time_per_iteration": 2.585371494293213 + }, + { + "auxiliary_loss_clip": 0.06405653, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.0627344, + "balance_loss_mlp": 0.01254448, + "epoch": 0.8720276566962273, + "flos": 13631404320000.0, + "grad_norm": 2.015312910125128, + "language_loss": 0.69743592, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.7741341, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09729004, + "step": 14504, + "time_per_iteration": 2.4844253063201904 + }, + { + "auxiliary_loss_clip": 0.06401467, + "auxiliary_loss_mlp": 0.01262384, + "balance_loss_clip": 0.06269079, + "balance_loss_mlp": 0.01252734, + "epoch": 0.8720877799488952, + "flos": 23520995112960.0, + "grad_norm": 1.7542452009567429, + "language_loss": 0.70339608, + "learning_rate": 1.691168026385552e-07, + "loss": 0.78003466, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09661865, + "step": 14505, + "time_per_iteration": 2.501800537109375 + }, + { + "auxiliary_loss_clip": 0.06400619, + "auxiliary_loss_mlp": 0.0126351, + "balance_loss_clip": 0.06270672, + "balance_loss_mlp": 0.01255177, + "epoch": 0.8721479032015632, + "flos": 20820516600960.0, + "grad_norm": 1.4504260712656618, + "language_loss": 0.78312892, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.85977018, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08325195, + "step": 14506, + "time_per_iteration": 3.896496534347534 + }, + { + "auxiliary_loss_clip": 0.06404346, + "auxiliary_loss_mlp": 0.0126421, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.0125459, + "epoch": 0.8722080264542311, + "flos": 19469711329920.0, + "grad_norm": 2.2593739015214895, + "language_loss": 0.74364638, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.82033199, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09619141, + "step": 14507, + "time_per_iteration": 2.5149693489074707 + }, + { + "auxiliary_loss_clip": 0.06409867, + "auxiliary_loss_mlp": 0.01267946, + "balance_loss_clip": 0.06273276, + "balance_loss_mlp": 0.01258075, + "epoch": 0.8722681497068991, + "flos": 21768241006080.0, + "grad_norm": 2.684746862543845, + "language_loss": 0.72729445, + "learning_rate": 1.686468975443156e-07, + "loss": 0.80407256, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09869385, + "step": 14508, + "time_per_iteration": 2.480463743209839 + }, + { + "auxiliary_loss_clip": 0.06408631, + "auxiliary_loss_mlp": 0.0126697, + "balance_loss_clip": 0.06272415, + "balance_loss_mlp": 0.0125642, + "epoch": 0.8723282729595672, + "flos": 28884790051200.0, + "grad_norm": 2.2883900025545953, + "language_loss": 0.69032156, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.76707762, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10546875, + "step": 14509, + "time_per_iteration": 2.5842347145080566 + }, + { + "auxiliary_loss_clip": 0.06403151, + "auxiliary_loss_mlp": 0.01266131, + "balance_loss_clip": 0.06271935, + "balance_loss_mlp": 0.01256589, + "epoch": 0.8723883962122351, + "flos": 26476409272320.0, + "grad_norm": 1.5825052329417453, + "language_loss": 0.58807904, + "learning_rate": 1.683339746970558e-07, + "loss": 0.66477191, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09539795, + "step": 14510, + "time_per_iteration": 2.548917293548584 + }, + { + "auxiliary_loss_clip": 0.06413917, + "auxiliary_loss_mlp": 0.01269969, + "balance_loss_clip": 0.06273636, + "balance_loss_mlp": 0.01258794, + "epoch": 0.8724485194649031, + "flos": 20527664181120.0, + "grad_norm": 2.1184884114038556, + "language_loss": 0.67942345, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.75626224, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.11187744, + "step": 14511, + "time_per_iteration": 2.5175976753234863 + }, + { + "auxiliary_loss_clip": 0.0640533, + "auxiliary_loss_mlp": 0.01264234, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01254047, + "epoch": 0.872508642717571, + "flos": 24360335861760.0, + "grad_norm": 1.596141317024249, + "language_loss": 0.81785661, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.89455223, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10180664, + "step": 14512, + "time_per_iteration": 2.542559862136841 + }, + { + "auxiliary_loss_clip": 0.06310365, + "auxiliary_loss_mlp": 0.01250481, + "balance_loss_clip": 0.06255949, + "balance_loss_mlp": 0.01249467, + "epoch": 0.872568765970239, + "flos": 61427132749440.0, + "grad_norm": 0.7791722432142947, + "language_loss": 0.5879969, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.66360533, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01014709, + "step": 14513, + "time_per_iteration": 3.0595717430114746 + }, + { + "auxiliary_loss_clip": 0.06401786, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.0626969, + "balance_loss_mlp": 0.01255917, + "epoch": 0.8726288892229069, + "flos": 22604059883520.0, + "grad_norm": 1.6369159357122527, + "language_loss": 0.76856357, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.84523714, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09661865, + "step": 14514, + "time_per_iteration": 2.505091905593872 + }, + { + "auxiliary_loss_clip": 0.06408387, + "auxiliary_loss_mlp": 0.01264552, + "balance_loss_clip": 0.06272617, + "balance_loss_mlp": 0.0125461, + "epoch": 0.872689012475575, + "flos": 25892339587200.0, + "grad_norm": 1.7178923167711113, + "language_loss": 0.65753925, + "learning_rate": 1.675528831794055e-07, + "loss": 0.73426867, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0993042, + "step": 14515, + "time_per_iteration": 2.5665414333343506 + }, + { + "auxiliary_loss_clip": 0.06405771, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01254934, + "epoch": 0.8727491357282429, + "flos": 21513095723520.0, + "grad_norm": 1.926028752131716, + "language_loss": 0.78788495, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.86458981, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09777832, + "step": 14516, + "time_per_iteration": 2.480694055557251 + }, + { + "auxiliary_loss_clip": 0.0640446, + "auxiliary_loss_mlp": 0.01265123, + "balance_loss_clip": 0.06269546, + "balance_loss_mlp": 0.01254585, + "epoch": 0.8728092589809109, + "flos": 19213392090240.0, + "grad_norm": 2.236925792083213, + "language_loss": 0.72447747, + "learning_rate": 1.672409329369453e-07, + "loss": 0.80117333, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10534668, + "step": 14517, + "time_per_iteration": 2.4733726978302 + }, + { + "auxiliary_loss_clip": 0.06400529, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.01258599, + "epoch": 0.8728693822335788, + "flos": 20601652936320.0, + "grad_norm": 1.738008639362388, + "language_loss": 0.72772276, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.80440235, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08825684, + "step": 14518, + "time_per_iteration": 3.923923969268799 + }, + { + "auxiliary_loss_clip": 0.06398532, + "auxiliary_loss_mlp": 0.01264388, + "balance_loss_clip": 0.06269579, + "balance_loss_mlp": 0.01255269, + "epoch": 0.8729295054862468, + "flos": 21735523186560.0, + "grad_norm": 1.4853642793865207, + "language_loss": 0.74297607, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.81960523, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09124756, + "step": 14519, + "time_per_iteration": 2.583524227142334 + }, + { + "auxiliary_loss_clip": 0.0640825, + "auxiliary_loss_mlp": 0.01267705, + "balance_loss_clip": 0.06272946, + "balance_loss_mlp": 0.01256583, + "epoch": 0.8729896287389147, + "flos": 17678788888320.0, + "grad_norm": 2.5521451847443437, + "language_loss": 0.77261472, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.84937429, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.11126709, + "step": 14520, + "time_per_iteration": 2.4702889919281006 + }, + { + "auxiliary_loss_clip": 0.06407069, + "auxiliary_loss_mlp": 0.01265858, + "balance_loss_clip": 0.06271906, + "balance_loss_mlp": 0.01255934, + "epoch": 0.8730497519915827, + "flos": 24578738328960.0, + "grad_norm": 1.679080927037556, + "language_loss": 0.81987226, + "learning_rate": 1.666178664801816e-07, + "loss": 0.89660144, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09924316, + "step": 14521, + "time_per_iteration": 2.530060052871704 + }, + { + "auxiliary_loss_clip": 0.06406459, + "auxiliary_loss_mlp": 0.01267903, + "balance_loss_clip": 0.06272659, + "balance_loss_mlp": 0.01257777, + "epoch": 0.8731098752442508, + "flos": 13448822273280.0, + "grad_norm": 2.292757707836215, + "language_loss": 0.7680378, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.8447814, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10125732, + "step": 14522, + "time_per_iteration": 4.001532316207886 + }, + { + "auxiliary_loss_clip": 0.06400695, + "auxiliary_loss_mlp": 0.01266384, + "balance_loss_clip": 0.06270634, + "balance_loss_mlp": 0.01257229, + "epoch": 0.8731699984969187, + "flos": 23480730426240.0, + "grad_norm": 1.896353046813896, + "language_loss": 0.75725633, + "learning_rate": 1.66306750360385e-07, + "loss": 0.83392715, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0914917, + "step": 14523, + "time_per_iteration": 2.529074192047119 + }, + { + "auxiliary_loss_clip": 0.06400236, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06271105, + "balance_loss_mlp": 0.01254784, + "epoch": 0.8732301217495867, + "flos": 17718466596480.0, + "grad_norm": 2.1427135823795354, + "language_loss": 0.78751552, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.86415774, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09204102, + "step": 14524, + "time_per_iteration": 3.9017279148101807 + }, + { + "auxiliary_loss_clip": 0.06395754, + "auxiliary_loss_mlp": 0.0126382, + "balance_loss_clip": 0.06269418, + "balance_loss_mlp": 0.01254999, + "epoch": 0.8732902450022546, + "flos": 22060883790720.0, + "grad_norm": 1.8911749247959948, + "language_loss": 0.78280824, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.85940397, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.0881958, + "step": 14525, + "time_per_iteration": 2.5112502574920654 + }, + { + "auxiliary_loss_clip": 0.06405047, + "auxiliary_loss_mlp": 0.01267041, + "balance_loss_clip": 0.06270174, + "balance_loss_mlp": 0.01257069, + "epoch": 0.8733503682549226, + "flos": 22279495893120.0, + "grad_norm": 1.5433520001458627, + "language_loss": 0.69392395, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.77064478, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09967041, + "step": 14526, + "time_per_iteration": 2.5193099975585938 + }, + { + "auxiliary_loss_clip": 0.06409685, + "auxiliary_loss_mlp": 0.01267069, + "balance_loss_clip": 0.06273328, + "balance_loss_mlp": 0.01256382, + "epoch": 0.8734104915075905, + "flos": 23370501980160.0, + "grad_norm": 1.732651268082275, + "language_loss": 0.61444616, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.69121373, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10687256, + "step": 14527, + "time_per_iteration": 2.6036882400512695 + }, + { + "auxiliary_loss_clip": 0.06414483, + "auxiliary_loss_mlp": 0.01268907, + "balance_loss_clip": 0.0627443, + "balance_loss_mlp": 0.01257862, + "epoch": 0.8734706147602586, + "flos": 17718047326080.0, + "grad_norm": 2.000916766827133, + "language_loss": 0.65944868, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.73628259, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1104126, + "step": 14528, + "time_per_iteration": 2.4655213356018066 + }, + { + "auxiliary_loss_clip": 0.06403276, + "auxiliary_loss_mlp": 0.01264092, + "balance_loss_clip": 0.06274714, + "balance_loss_mlp": 0.01254794, + "epoch": 0.8735307380129265, + "flos": 22055055932160.0, + "grad_norm": 2.336985436344426, + "language_loss": 0.90133297, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.9780066, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09307861, + "step": 14529, + "time_per_iteration": 2.514073371887207 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.01266507, + "balance_loss_clip": 0.06271863, + "balance_loss_mlp": 0.01256881, + "epoch": 0.8735908612655945, + "flos": 25345557768960.0, + "grad_norm": 1.7800121585868869, + "language_loss": 0.85022855, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.92688859, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09625244, + "step": 14530, + "time_per_iteration": 2.524286985397339 + }, + { + "auxiliary_loss_clip": 0.06402133, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 0.06269572, + "balance_loss_mlp": 0.01259702, + "epoch": 0.8736509845182624, + "flos": 21546903646080.0, + "grad_norm": 2.029519430173588, + "language_loss": 0.74400681, + "learning_rate": 1.650650677057128e-07, + "loss": 0.82071632, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09112549, + "step": 14531, + "time_per_iteration": 2.537536144256592 + }, + { + "auxiliary_loss_clip": 0.06398211, + "auxiliary_loss_mlp": 0.01266853, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01257811, + "epoch": 0.8737111077709304, + "flos": 22023637850880.0, + "grad_norm": 1.7208212669688667, + "language_loss": 0.6192863, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.69593698, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09039307, + "step": 14532, + "time_per_iteration": 2.5035369396209717 + }, + { + "auxiliary_loss_clip": 0.06313117, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 0.06258602, + "balance_loss_mlp": 0.012528, + "epoch": 0.8737712310235983, + "flos": 70086418842240.0, + "grad_norm": 0.7989490293536622, + "language_loss": 0.58785164, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.66352129, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0104599, + "step": 14533, + "time_per_iteration": 3.2517998218536377 + }, + { + "auxiliary_loss_clip": 0.06401654, + "auxiliary_loss_mlp": 0.01264271, + "balance_loss_clip": 0.06272509, + "balance_loss_mlp": 0.01254949, + "epoch": 0.8738313542762663, + "flos": 28665968313600.0, + "grad_norm": 2.0402838251566644, + "language_loss": 0.76672494, + "learning_rate": 1.646005846335954e-07, + "loss": 0.84338421, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09326172, + "step": 14534, + "time_per_iteration": 2.546053409576416 + }, + { + "auxiliary_loss_clip": 0.06403311, + "auxiliary_loss_mlp": 0.01264005, + "balance_loss_clip": 0.06271609, + "balance_loss_mlp": 0.01254874, + "epoch": 0.8738914775289344, + "flos": 22352981523840.0, + "grad_norm": 1.5823807033231816, + "language_loss": 0.75660425, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.8332774, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09118652, + "step": 14535, + "time_per_iteration": 2.539175510406494 + }, + { + "auxiliary_loss_clip": 0.06402861, + "auxiliary_loss_mlp": 0.01262561, + "balance_loss_clip": 0.0627098, + "balance_loss_mlp": 0.0125303, + "epoch": 0.8739516007816023, + "flos": 31767808682880.0, + "grad_norm": 1.9833489778511422, + "language_loss": 0.74645185, + "learning_rate": 1.64291277235048e-07, + "loss": 0.82310605, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09533691, + "step": 14536, + "time_per_iteration": 2.588463068008423 + }, + { + "auxiliary_loss_clip": 0.06404154, + "auxiliary_loss_mlp": 0.01261289, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.01251794, + "epoch": 0.8740117240342703, + "flos": 21217518046080.0, + "grad_norm": 1.6487681333797766, + "language_loss": 0.64354205, + "learning_rate": 1.641367279482304e-07, + "loss": 0.72019655, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09503174, + "step": 14537, + "time_per_iteration": 2.475311517715454 + }, + { + "auxiliary_loss_clip": 0.06402414, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 0.06272729, + "balance_loss_mlp": 0.01257392, + "epoch": 0.8740718472869382, + "flos": 25192800576000.0, + "grad_norm": 1.6981530320484999, + "language_loss": 0.57779753, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.65449232, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09674072, + "step": 14538, + "time_per_iteration": 2.5233047008514404 + }, + { + "auxiliary_loss_clip": 0.0639964, + "auxiliary_loss_mlp": 0.01263306, + "balance_loss_clip": 0.06272976, + "balance_loss_mlp": 0.01254097, + "epoch": 0.8741319705396062, + "flos": 19507124977920.0, + "grad_norm": 1.743989836533952, + "language_loss": 0.68863463, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.76526415, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.09210205, + "step": 14539, + "time_per_iteration": 2.4944701194763184 + }, + { + "auxiliary_loss_clip": 0.06409974, + "auxiliary_loss_mlp": 0.01265214, + "balance_loss_clip": 0.06271386, + "balance_loss_mlp": 0.01255112, + "epoch": 0.8741920937922741, + "flos": 14106167953920.0, + "grad_norm": 1.8528727857189147, + "language_loss": 0.74751997, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.82427186, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.10101318, + "step": 14540, + "time_per_iteration": 2.4811830520629883 + }, + { + "auxiliary_loss_clip": 0.0640149, + "auxiliary_loss_mlp": 0.01261579, + "balance_loss_clip": 0.06271747, + "balance_loss_mlp": 0.01251792, + "epoch": 0.8742522170449422, + "flos": 27717363440640.0, + "grad_norm": 1.6180222602989935, + "language_loss": 0.79222339, + "learning_rate": 1.635192270207193e-07, + "loss": 0.86885411, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09790039, + "step": 14541, + "time_per_iteration": 2.5740039348602295 + }, + { + "auxiliary_loss_clip": 0.06413158, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06276666, + "balance_loss_mlp": 0.01256864, + "epoch": 0.8743123402976101, + "flos": 21149021733120.0, + "grad_norm": 1.8380973773337208, + "language_loss": 0.66893399, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.74574167, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10748291, + "step": 14542, + "time_per_iteration": 2.5132861137390137 + }, + { + "auxiliary_loss_clip": 0.06308813, + "auxiliary_loss_mlp": 0.01251732, + "balance_loss_clip": 0.06254316, + "balance_loss_mlp": 0.01250717, + "epoch": 0.8743724635502781, + "flos": 60888275141760.0, + "grad_norm": 0.7602513032785679, + "language_loss": 0.54570305, + "learning_rate": 1.632108943707642e-07, + "loss": 0.62130845, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01014709, + "step": 14543, + "time_per_iteration": 2.9452686309814453 + }, + { + "auxiliary_loss_clip": 0.06406276, + "auxiliary_loss_mlp": 0.01263911, + "balance_loss_clip": 0.06272275, + "balance_loss_mlp": 0.01254398, + "epoch": 0.874432586802946, + "flos": 28116545091840.0, + "grad_norm": 1.7912544552975234, + "language_loss": 0.69910216, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.77580404, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09515381, + "step": 14544, + "time_per_iteration": 2.5625085830688477 + }, + { + "auxiliary_loss_clip": 0.06397738, + "auxiliary_loss_mlp": 0.01259982, + "balance_loss_clip": 0.06271628, + "balance_loss_mlp": 0.01251685, + "epoch": 0.874492710055614, + "flos": 23557067095680.0, + "grad_norm": 1.4418848759585507, + "language_loss": 0.75803328, + "learning_rate": 1.62902840325714e-07, + "loss": 0.83461046, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08306885, + "step": 14545, + "time_per_iteration": 3.978076696395874 + }, + { + "auxiliary_loss_clip": 0.06402361, + "auxiliary_loss_mlp": 0.01264774, + "balance_loss_clip": 0.062708, + "balance_loss_mlp": 0.01254129, + "epoch": 0.8745528333082819, + "flos": 40925016864000.0, + "grad_norm": 1.6096623490639794, + "language_loss": 0.66167152, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.73834288, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10644531, + "step": 14546, + "time_per_iteration": 2.6540935039520264 + }, + { + "auxiliary_loss_clip": 0.06403122, + "auxiliary_loss_mlp": 0.0126332, + "balance_loss_clip": 0.06272014, + "balance_loss_mlp": 0.01253467, + "epoch": 0.87461295656095, + "flos": 23629630331520.0, + "grad_norm": 1.581391249306466, + "language_loss": 0.72981352, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.8064779, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09844971, + "step": 14547, + "time_per_iteration": 2.5465586185455322 + }, + { + "auxiliary_loss_clip": 0.06413304, + "auxiliary_loss_mlp": 0.01266861, + "balance_loss_clip": 0.06273919, + "balance_loss_mlp": 0.01256347, + "epoch": 0.874673079813618, + "flos": 38802235127040.0, + "grad_norm": 2.0398162521863608, + "language_loss": 0.69331336, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.77011502, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10516357, + "step": 14548, + "time_per_iteration": 2.6202781200408936 + }, + { + "auxiliary_loss_clip": 0.06407377, + "auxiliary_loss_mlp": 0.01264008, + "balance_loss_clip": 0.06269997, + "balance_loss_mlp": 0.01254174, + "epoch": 0.8747332030662859, + "flos": 23702948254080.0, + "grad_norm": 2.380289597874903, + "language_loss": 0.70875394, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.78546774, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09832764, + "step": 14549, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.06409204, + "auxiliary_loss_mlp": 0.0126558, + "balance_loss_clip": 0.06271277, + "balance_loss_mlp": 0.01255191, + "epoch": 0.8747933263189539, + "flos": 24469390350720.0, + "grad_norm": 2.097604364393195, + "language_loss": 0.83978105, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.91652894, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1038208, + "step": 14550, + "time_per_iteration": 2.5499937534332275 + }, + { + "auxiliary_loss_clip": 0.06409267, + "auxiliary_loss_mlp": 0.01262247, + "balance_loss_clip": 0.06273516, + "balance_loss_mlp": 0.01253003, + "epoch": 0.8748534495716218, + "flos": 13814405637120.0, + "grad_norm": 1.5904524065006318, + "language_loss": 0.72164232, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.79835749, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09246826, + "step": 14551, + "time_per_iteration": 2.4693989753723145 + }, + { + "auxiliary_loss_clip": 0.06401157, + "auxiliary_loss_mlp": 0.01261725, + "balance_loss_clip": 0.06272075, + "balance_loss_mlp": 0.01252582, + "epoch": 0.8749135728242898, + "flos": 29869886177280.0, + "grad_norm": 1.9835642625635446, + "language_loss": 0.64623117, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.72286004, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09143066, + "step": 14552, + "time_per_iteration": 2.558300733566284 + }, + { + "auxiliary_loss_clip": 0.06409608, + "auxiliary_loss_mlp": 0.01267334, + "balance_loss_clip": 0.06274374, + "balance_loss_mlp": 0.01256581, + "epoch": 0.8749736960769577, + "flos": 24140256312960.0, + "grad_norm": 1.5918713414815686, + "language_loss": 0.79966319, + "learning_rate": 1.616734111284479e-07, + "loss": 0.87643266, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10754395, + "step": 14553, + "time_per_iteration": 2.5289547443389893 + }, + { + "auxiliary_loss_clip": 0.06405284, + "auxiliary_loss_mlp": 0.01264107, + "balance_loss_clip": 0.0627055, + "balance_loss_mlp": 0.01254594, + "epoch": 0.8750338193296258, + "flos": 17208385666560.0, + "grad_norm": 1.7861330816455667, + "language_loss": 0.70206106, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.77875495, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09509277, + "step": 14554, + "time_per_iteration": 2.5121958255767822 + }, + { + "auxiliary_loss_clip": 0.06400765, + "auxiliary_loss_mlp": 0.01263457, + "balance_loss_clip": 0.06269407, + "balance_loss_mlp": 0.01254272, + "epoch": 0.8750939425822937, + "flos": 23740110339840.0, + "grad_norm": 1.6171556811070096, + "language_loss": 0.83951151, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.91615379, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09179688, + "step": 14555, + "time_per_iteration": 2.533935546875 + }, + { + "auxiliary_loss_clip": 0.06403114, + "auxiliary_loss_mlp": 0.01267593, + "balance_loss_clip": 0.06271933, + "balance_loss_mlp": 0.01257753, + "epoch": 0.8751540658349617, + "flos": 26548888654080.0, + "grad_norm": 1.6023816965835223, + "language_loss": 0.71021914, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.78692615, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09844971, + "step": 14556, + "time_per_iteration": 2.5914430618286133 + }, + { + "auxiliary_loss_clip": 0.06408825, + "auxiliary_loss_mlp": 0.0126549, + "balance_loss_clip": 0.0627299, + "balance_loss_mlp": 0.01255179, + "epoch": 0.8752141890876296, + "flos": 19392200703360.0, + "grad_norm": 2.2735534947570115, + "language_loss": 0.7708326, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.84757572, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10314941, + "step": 14557, + "time_per_iteration": 3.9466445446014404 + }, + { + "auxiliary_loss_clip": 0.06408848, + "auxiliary_loss_mlp": 0.01267158, + "balance_loss_clip": 0.06275624, + "balance_loss_mlp": 0.01256769, + "epoch": 0.8752743123402976, + "flos": 25381462043520.0, + "grad_norm": 1.69314146192959, + "language_loss": 0.83270669, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.90946674, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1038208, + "step": 14558, + "time_per_iteration": 2.5691773891448975 + }, + { + "auxiliary_loss_clip": 0.06311321, + "auxiliary_loss_mlp": 0.01250089, + "balance_loss_clip": 0.06256986, + "balance_loss_mlp": 0.01249142, + "epoch": 0.8753344355929655, + "flos": 59969578976640.0, + "grad_norm": 0.7810475083105511, + "language_loss": 0.56042981, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.63604391, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.00945282, + "step": 14559, + "time_per_iteration": 3.157846450805664 + }, + { + "auxiliary_loss_clip": 0.06402047, + "auxiliary_loss_mlp": 0.01266495, + "balance_loss_clip": 0.06271435, + "balance_loss_mlp": 0.01257125, + "epoch": 0.8753945588456336, + "flos": 17900419737600.0, + "grad_norm": 1.6963554419042506, + "language_loss": 0.66404682, + "learning_rate": 1.606013202286407e-07, + "loss": 0.74073219, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09375, + "step": 14560, + "time_per_iteration": 2.470168352127075 + }, + { + "auxiliary_loss_clip": 0.06398799, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.06269611, + "balance_loss_mlp": 0.01253471, + "epoch": 0.8754546820983016, + "flos": 30921969242880.0, + "grad_norm": 1.8348910812668497, + "language_loss": 0.78910828, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.8657254, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09442139, + "step": 14561, + "time_per_iteration": 2.5636520385742188 + }, + { + "auxiliary_loss_clip": 0.06408288, + "auxiliary_loss_mlp": 0.01264293, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01253952, + "epoch": 0.8755148053509695, + "flos": 20637305648640.0, + "grad_norm": 1.9358118623790102, + "language_loss": 0.78181839, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.85854423, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10339355, + "step": 14562, + "time_per_iteration": 3.9300997257232666 + }, + { + "auxiliary_loss_clip": 0.06399447, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 0.06274161, + "balance_loss_mlp": 0.01254681, + "epoch": 0.8755749286036375, + "flos": 34978326197760.0, + "grad_norm": 1.6279482889503327, + "language_loss": 0.72014946, + "learning_rate": 1.601428988367981e-07, + "loss": 0.79677868, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08789062, + "step": 14563, + "time_per_iteration": 4.056689977645874 + }, + { + "auxiliary_loss_clip": 0.06408808, + "auxiliary_loss_mlp": 0.01265016, + "balance_loss_clip": 0.06271923, + "balance_loss_mlp": 0.01255283, + "epoch": 0.8756350518563054, + "flos": 18192265908480.0, + "grad_norm": 2.023004884264385, + "language_loss": 0.65937054, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.73610878, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09735107, + "step": 14564, + "time_per_iteration": 2.4699697494506836 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.01266635, + "balance_loss_clip": 0.06271675, + "balance_loss_mlp": 0.01257623, + "epoch": 0.8756951751089734, + "flos": 20090188414080.0, + "grad_norm": 1.696910224626912, + "language_loss": 0.70870125, + "learning_rate": 1.598376334037408e-07, + "loss": 0.78540564, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09008789, + "step": 14565, + "time_per_iteration": 2.49548077583313 + }, + { + "auxiliary_loss_clip": 0.0641157, + "auxiliary_loss_mlp": 0.01264443, + "balance_loss_clip": 0.0627208, + "balance_loss_mlp": 0.01253553, + "epoch": 0.8757552983616413, + "flos": 27532349625600.0, + "grad_norm": 1.4285199436173486, + "language_loss": 0.77859598, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.8553561, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10882568, + "step": 14566, + "time_per_iteration": 2.749091863632202 + }, + { + "auxiliary_loss_clip": 0.06403997, + "auxiliary_loss_mlp": 0.01269996, + "balance_loss_clip": 0.06272083, + "balance_loss_mlp": 0.01260703, + "epoch": 0.8758154216143094, + "flos": 18078138247680.0, + "grad_norm": 1.529339605078132, + "language_loss": 0.71489322, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.79163313, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09283447, + "step": 14567, + "time_per_iteration": 2.5004701614379883 + }, + { + "auxiliary_loss_clip": 0.06402886, + "auxiliary_loss_mlp": 0.01267484, + "balance_loss_clip": 0.0627336, + "balance_loss_mlp": 0.01258305, + "epoch": 0.8758755448669773, + "flos": 25052621495040.0, + "grad_norm": 1.6619530150648376, + "language_loss": 0.74655724, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.8232609, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09179688, + "step": 14568, + "time_per_iteration": 2.5281195640563965 + }, + { + "auxiliary_loss_clip": 0.06398214, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06268168, + "balance_loss_mlp": 0.01256416, + "epoch": 0.8759356681196453, + "flos": 22863439797120.0, + "grad_norm": 1.9978030218595135, + "language_loss": 0.87101042, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.9476462, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0894165, + "step": 14569, + "time_per_iteration": 2.5461788177490234 + }, + { + "auxiliary_loss_clip": 0.06404515, + "auxiliary_loss_mlp": 0.01262364, + "balance_loss_clip": 0.06270414, + "balance_loss_mlp": 0.01253435, + "epoch": 0.8759957913723132, + "flos": 21038835214080.0, + "grad_norm": 1.6138151637367601, + "language_loss": 0.7468214, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.82349014, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0892334, + "step": 14570, + "time_per_iteration": 2.498565196990967 + }, + { + "auxiliary_loss_clip": 0.06409349, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.06273144, + "balance_loss_mlp": 0.01254192, + "epoch": 0.8760559146249812, + "flos": 20016535075200.0, + "grad_norm": 1.5814035636458428, + "language_loss": 0.68048859, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.75722075, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09680176, + "step": 14571, + "time_per_iteration": 2.5363006591796875 + }, + { + "auxiliary_loss_clip": 0.06400727, + "auxiliary_loss_mlp": 0.01262869, + "balance_loss_clip": 0.06271683, + "balance_loss_mlp": 0.01254131, + "epoch": 0.8761160378776491, + "flos": 19980253457280.0, + "grad_norm": 1.8860279623082572, + "language_loss": 0.62593281, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.70256877, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08734131, + "step": 14572, + "time_per_iteration": 2.509617328643799 + }, + { + "auxiliary_loss_clip": 0.06398857, + "auxiliary_loss_mlp": 0.01263429, + "balance_loss_clip": 0.06271888, + "balance_loss_mlp": 0.01254542, + "epoch": 0.8761761611303172, + "flos": 28812101034240.0, + "grad_norm": 1.87554988756501, + "language_loss": 0.74363232, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.82025516, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08886719, + "step": 14573, + "time_per_iteration": 2.5745317935943604 + }, + { + "auxiliary_loss_clip": 0.0639876, + "auxiliary_loss_mlp": 0.0126231, + "balance_loss_clip": 0.06270745, + "balance_loss_mlp": 0.01253763, + "epoch": 0.8762362843829851, + "flos": 18338356702080.0, + "grad_norm": 1.9590289923808466, + "language_loss": 0.73202926, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.80863994, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08544922, + "step": 14574, + "time_per_iteration": 2.465827465057373 + }, + { + "auxiliary_loss_clip": 0.06403725, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06272864, + "balance_loss_mlp": 0.01253361, + "epoch": 0.8762964076356531, + "flos": 15784681743360.0, + "grad_norm": 1.6549061624891563, + "language_loss": 0.76195455, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.83862293, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09747314, + "step": 14575, + "time_per_iteration": 2.5050904750823975 + }, + { + "auxiliary_loss_clip": 0.06398784, + "auxiliary_loss_mlp": 0.01266013, + "balance_loss_clip": 0.06271212, + "balance_loss_mlp": 0.01256667, + "epoch": 0.8763565308883211, + "flos": 33184175374080.0, + "grad_norm": 1.6971430511045047, + "language_loss": 0.66751701, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.74416494, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09344482, + "step": 14576, + "time_per_iteration": 2.707777500152588 + }, + { + "auxiliary_loss_clip": 0.06400728, + "auxiliary_loss_mlp": 0.01264456, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255456, + "epoch": 0.876416654140989, + "flos": 15893568524160.0, + "grad_norm": 5.287288925068646, + "language_loss": 0.67297328, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.74962509, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08996582, + "step": 14577, + "time_per_iteration": 2.516228675842285 + }, + { + "auxiliary_loss_clip": 0.06408198, + "auxiliary_loss_mlp": 0.01264689, + "balance_loss_clip": 0.06274717, + "balance_loss_mlp": 0.01254753, + "epoch": 0.876476777393657, + "flos": 25892381514240.0, + "grad_norm": 2.047552880616012, + "language_loss": 0.71286416, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.78959298, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09936523, + "step": 14578, + "time_per_iteration": 2.5514087677001953 + }, + { + "auxiliary_loss_clip": 0.06409043, + "auxiliary_loss_mlp": 0.01268646, + "balance_loss_clip": 0.06273985, + "balance_loss_mlp": 0.01258501, + "epoch": 0.876536900646325, + "flos": 13594787285760.0, + "grad_norm": 1.8887093995761175, + "language_loss": 0.7153939, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.79217076, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10150146, + "step": 14579, + "time_per_iteration": 2.5152196884155273 + }, + { + "auxiliary_loss_clip": 0.06400099, + "auxiliary_loss_mlp": 0.01261571, + "balance_loss_clip": 0.06272951, + "balance_loss_mlp": 0.01252624, + "epoch": 0.876597023898993, + "flos": 12208245448320.0, + "grad_norm": 3.2232555084556265, + "language_loss": 0.69840139, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.7750181, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0894165, + "step": 14580, + "time_per_iteration": 2.5027308464050293 + }, + { + "auxiliary_loss_clip": 0.06397118, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.01254893, + "epoch": 0.8766571471516609, + "flos": 25343629125120.0, + "grad_norm": 1.6080390513913188, + "language_loss": 0.65369827, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.73031157, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09313965, + "step": 14581, + "time_per_iteration": 2.5610644817352295 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01254884, + "epoch": 0.8767172704043289, + "flos": 30120419485440.0, + "grad_norm": 2.0311405699132368, + "language_loss": 0.73738873, + "learning_rate": 1.572541512164416e-07, + "loss": 0.81402385, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08459473, + "step": 14582, + "time_per_iteration": 2.5676662921905518 + }, + { + "auxiliary_loss_clip": 0.06401975, + "auxiliary_loss_mlp": 0.01266739, + "balance_loss_clip": 0.06271679, + "balance_loss_mlp": 0.01257095, + "epoch": 0.8767773936569968, + "flos": 19287171210240.0, + "grad_norm": 2.1739067295595884, + "language_loss": 0.67125332, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.74794054, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09649658, + "step": 14583, + "time_per_iteration": 2.5512192249298096 + }, + { + "auxiliary_loss_clip": 0.0640585, + "auxiliary_loss_mlp": 0.01261674, + "balance_loss_clip": 0.06272185, + "balance_loss_mlp": 0.01252614, + "epoch": 0.8768375169096648, + "flos": 21252877269120.0, + "grad_norm": 1.532856465266313, + "language_loss": 0.79368246, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.87035769, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09069824, + "step": 14584, + "time_per_iteration": 2.484201192855835 + }, + { + "auxiliary_loss_clip": 0.06405112, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 0.06270323, + "balance_loss_mlp": 0.01256837, + "epoch": 0.8768976401623327, + "flos": 23302383010560.0, + "grad_norm": 1.4894739815416904, + "language_loss": 0.72938401, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.80609715, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09368896, + "step": 14585, + "time_per_iteration": 3.944657802581787 + }, + { + "auxiliary_loss_clip": 0.06401481, + "auxiliary_loss_mlp": 0.01264972, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01255263, + "epoch": 0.8769577634150008, + "flos": 21367675762560.0, + "grad_norm": 1.8689895153618223, + "language_loss": 0.74672264, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.82338715, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0970459, + "step": 14586, + "time_per_iteration": 2.48671555519104 + }, + { + "auxiliary_loss_clip": 0.06402427, + "auxiliary_loss_mlp": 0.0126322, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01253922, + "epoch": 0.8770178866676687, + "flos": 23520869331840.0, + "grad_norm": 1.901621628510341, + "language_loss": 0.78764355, + "learning_rate": 1.564981454895844e-07, + "loss": 0.86430001, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09295654, + "step": 14587, + "time_per_iteration": 2.5289950370788574 + }, + { + "auxiliary_loss_clip": 0.06404516, + "auxiliary_loss_mlp": 0.01268497, + "balance_loss_clip": 0.06273125, + "balance_loss_mlp": 0.01258787, + "epoch": 0.8770780099203367, + "flos": 19725150101760.0, + "grad_norm": 1.5376144495313915, + "language_loss": 0.74347901, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.82020915, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0970459, + "step": 14588, + "time_per_iteration": 2.504408597946167 + }, + { + "auxiliary_loss_clip": 0.06400863, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01254402, + "epoch": 0.8771381331730047, + "flos": 21402028736640.0, + "grad_norm": 2.5853533604834387, + "language_loss": 0.67017472, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.74681687, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08947754, + "step": 14589, + "time_per_iteration": 2.530637264251709 + }, + { + "auxiliary_loss_clip": 0.06401638, + "auxiliary_loss_mlp": 0.01267687, + "balance_loss_clip": 0.06272372, + "balance_loss_mlp": 0.01258383, + "epoch": 0.8771982564256726, + "flos": 20267194164480.0, + "grad_norm": 2.192494295915613, + "language_loss": 0.71027219, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.78696543, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09295654, + "step": 14590, + "time_per_iteration": 2.519289970397949 + }, + { + "auxiliary_loss_clip": 0.0641445, + "auxiliary_loss_mlp": 0.01265546, + "balance_loss_clip": 0.06275117, + "balance_loss_mlp": 0.0125474, + "epoch": 0.8772583796783406, + "flos": 12493341365760.0, + "grad_norm": 2.278892739613534, + "language_loss": 0.75203848, + "learning_rate": 1.558945991776086e-07, + "loss": 0.82883847, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.10803223, + "step": 14591, + "time_per_iteration": 2.480944871902466 + }, + { + "auxiliary_loss_clip": 0.06396542, + "auxiliary_loss_mlp": 0.0126499, + "balance_loss_clip": 0.06272044, + "balance_loss_mlp": 0.01255751, + "epoch": 0.8773185029310085, + "flos": 15925992854400.0, + "grad_norm": 1.7438987564474657, + "language_loss": 0.80089593, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.87751126, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.09240723, + "step": 14592, + "time_per_iteration": 2.4851748943328857 + }, + { + "auxiliary_loss_clip": 0.06397837, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01257858, + "epoch": 0.8773786261836766, + "flos": 21510538174080.0, + "grad_norm": 1.550623060936972, + "language_loss": 0.82925177, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.90589213, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08343506, + "step": 14593, + "time_per_iteration": 2.567701578140259 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.01264113, + "balance_loss_clip": 0.06272095, + "balance_loss_mlp": 0.01255256, + "epoch": 0.8774387494363445, + "flos": 26768884348800.0, + "grad_norm": 1.2807416584393148, + "language_loss": 0.75873339, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.83538544, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08862305, + "step": 14594, + "time_per_iteration": 2.566321849822998 + }, + { + "auxiliary_loss_clip": 0.0640325, + "auxiliary_loss_mlp": 0.0126413, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01255112, + "epoch": 0.8774988726890125, + "flos": 18484782912000.0, + "grad_norm": 1.9693354280798063, + "language_loss": 0.77621579, + "learning_rate": 1.552921717241651e-07, + "loss": 0.8528896, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.090271, + "step": 14595, + "time_per_iteration": 2.4619386196136475 + }, + { + "auxiliary_loss_clip": 0.06402054, + "auxiliary_loss_mlp": 0.01264392, + "balance_loss_clip": 0.06271306, + "balance_loss_mlp": 0.01254921, + "epoch": 0.8775589959416804, + "flos": 24433360295040.0, + "grad_norm": 1.3207424076931227, + "language_loss": 0.70732266, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.78398716, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09472656, + "step": 14596, + "time_per_iteration": 2.526388645172119 + }, + { + "auxiliary_loss_clip": 0.06398661, + "auxiliary_loss_mlp": 0.01265503, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01256628, + "epoch": 0.8776191191943484, + "flos": 23446796722560.0, + "grad_norm": 1.635020983674664, + "language_loss": 0.85904115, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.93568277, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08874512, + "step": 14597, + "time_per_iteration": 3.9261152744293213 + }, + { + "auxiliary_loss_clip": 0.06402812, + "auxiliary_loss_mlp": 0.01265281, + "balance_loss_clip": 0.06272464, + "balance_loss_mlp": 0.01256185, + "epoch": 0.8776792424470163, + "flos": 26837674151040.0, + "grad_norm": 1.9849273814310462, + "language_loss": 0.72925198, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.80593288, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09094238, + "step": 14598, + "time_per_iteration": 2.5652682781219482 + }, + { + "auxiliary_loss_clip": 0.06404451, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06273061, + "balance_loss_mlp": 0.01253788, + "epoch": 0.8777393656996844, + "flos": 15630499031040.0, + "grad_norm": 2.1509248383698782, + "language_loss": 0.77800953, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.85468638, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09442139, + "step": 14599, + "time_per_iteration": 2.4392573833465576 + }, + { + "auxiliary_loss_clip": 0.0640744, + "auxiliary_loss_mlp": 0.01264831, + "balance_loss_clip": 0.06275728, + "balance_loss_mlp": 0.01255491, + "epoch": 0.8777994889523523, + "flos": 18885977061120.0, + "grad_norm": 1.9773713526565397, + "language_loss": 0.6848346, + "learning_rate": 1.545407113589332e-07, + "loss": 0.76155728, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09344482, + "step": 14600, + "time_per_iteration": 2.5783047676086426 + }, + { + "auxiliary_loss_clip": 0.0640178, + "auxiliary_loss_mlp": 0.01263195, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01253658, + "epoch": 0.8778596122050203, + "flos": 48836113850880.0, + "grad_norm": 1.7580584830878268, + "language_loss": 0.69559765, + "learning_rate": 1.543906292031072e-07, + "loss": 0.77224743, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09527588, + "step": 14601, + "time_per_iteration": 4.200392484664917 + }, + { + "auxiliary_loss_clip": 0.06411396, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 0.06274483, + "balance_loss_mlp": 0.0125779, + "epoch": 0.8779197354576883, + "flos": 25666264471680.0, + "grad_norm": 1.7776243951443933, + "language_loss": 0.73434043, + "learning_rate": 1.542406170329733e-07, + "loss": 0.81112897, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.09661865, + "step": 14602, + "time_per_iteration": 2.5296902656555176 + }, + { + "auxiliary_loss_clip": 0.06397757, + "auxiliary_loss_mlp": 0.01264623, + "balance_loss_clip": 0.06269722, + "balance_loss_mlp": 0.01255807, + "epoch": 0.8779798587103562, + "flos": 18849150391680.0, + "grad_norm": 1.6545957796620159, + "language_loss": 0.70951098, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.78613484, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08813477, + "step": 14603, + "time_per_iteration": 3.900700807571411 + }, + { + "auxiliary_loss_clip": 0.06315686, + "auxiliary_loss_mlp": 0.0125067, + "balance_loss_clip": 0.06261384, + "balance_loss_mlp": 0.0124961, + "epoch": 0.8780399819630242, + "flos": 68634022095360.0, + "grad_norm": 0.7420580476925245, + "language_loss": 0.54075485, + "learning_rate": 1.539408026725344e-07, + "loss": 0.61641842, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.54589844, + "router_z_loss_mlp": 0.01062012, + "step": 14604, + "time_per_iteration": 3.145667314529419 + }, + { + "auxiliary_loss_clip": 0.06312891, + "auxiliary_loss_mlp": 0.01249667, + "balance_loss_clip": 0.06258688, + "balance_loss_mlp": 0.01248654, + "epoch": 0.8781001052156922, + "flos": 65755908927360.0, + "grad_norm": 0.6879925918981881, + "language_loss": 0.59306002, + "learning_rate": 1.537910004935976e-07, + "loss": 0.66868562, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01013184, + "step": 14605, + "time_per_iteration": 3.1238157749176025 + }, + { + "auxiliary_loss_clip": 0.06404503, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06271055, + "balance_loss_mlp": 0.01254848, + "epoch": 0.8781602284683602, + "flos": 22055391348480.0, + "grad_norm": 1.7310041158158627, + "language_loss": 0.85172927, + "learning_rate": 1.536412683230912e-07, + "loss": 0.92841685, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09399414, + "step": 14606, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.0640693, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06271999, + "balance_loss_mlp": 0.01253997, + "epoch": 0.8782203517210281, + "flos": 17568099244800.0, + "grad_norm": 2.0459573713019266, + "language_loss": 0.71192271, + "learning_rate": 1.534916061666931e-07, + "loss": 0.78863305, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10113525, + "step": 14607, + "time_per_iteration": 2.476141929626465 + }, + { + "auxiliary_loss_clip": 0.06399085, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 0.0627173, + "balance_loss_mlp": 0.01257238, + "epoch": 0.8782804749736961, + "flos": 25527510910080.0, + "grad_norm": 1.6865812212317128, + "language_loss": 0.72198415, + "learning_rate": 1.533420140300785e-07, + "loss": 0.79863501, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08758545, + "step": 14608, + "time_per_iteration": 2.543273687362671 + }, + { + "auxiliary_loss_clip": 0.06411412, + "auxiliary_loss_mlp": 0.01265109, + "balance_loss_clip": 0.06274945, + "balance_loss_mlp": 0.01255257, + "epoch": 0.878340598226364, + "flos": 21805193456640.0, + "grad_norm": 1.928532327012367, + "language_loss": 0.8771438, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.95390904, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09863281, + "step": 14609, + "time_per_iteration": 2.486294746398926 + }, + { + "auxiliary_loss_clip": 0.0640282, + "auxiliary_loss_mlp": 0.01268196, + "balance_loss_clip": 0.06272058, + "balance_loss_mlp": 0.01258832, + "epoch": 0.878400721479032, + "flos": 21108211994880.0, + "grad_norm": 1.4945868352839566, + "language_loss": 0.7052213, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.78193146, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09368896, + "step": 14610, + "time_per_iteration": 2.513068437576294 + }, + { + "auxiliary_loss_clip": 0.06398328, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 0.06270711, + "balance_loss_mlp": 0.0125533, + "epoch": 0.8784608447316999, + "flos": 20929906506240.0, + "grad_norm": 1.880824719735257, + "language_loss": 0.81051499, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.88714468, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09307861, + "step": 14611, + "time_per_iteration": 2.4752485752105713 + }, + { + "auxiliary_loss_clip": 0.06400166, + "auxiliary_loss_mlp": 0.01262109, + "balance_loss_clip": 0.06268719, + "balance_loss_mlp": 0.01252476, + "epoch": 0.878520967984368, + "flos": 23337281036160.0, + "grad_norm": 1.4827937857578044, + "language_loss": 0.76664627, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.84326899, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09625244, + "step": 14612, + "time_per_iteration": 2.5153868198394775 + }, + { + "auxiliary_loss_clip": 0.06400725, + "auxiliary_loss_mlp": 0.01263329, + "balance_loss_clip": 0.06272018, + "balance_loss_mlp": 0.01254192, + "epoch": 0.8785810912370359, + "flos": 25525833828480.0, + "grad_norm": 1.4386207413508079, + "language_loss": 0.72404128, + "learning_rate": 1.525951038422002e-07, + "loss": 0.80068183, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09143066, + "step": 14613, + "time_per_iteration": 2.5526235103607178 + }, + { + "auxiliary_loss_clip": 0.06313758, + "auxiliary_loss_mlp": 0.01250159, + "balance_loss_clip": 0.06259576, + "balance_loss_mlp": 0.01249207, + "epoch": 0.8786412144897039, + "flos": 61857103576320.0, + "grad_norm": 1.1387954879683988, + "language_loss": 0.64722979, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.72286892, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00950623, + "step": 14614, + "time_per_iteration": 2.897026538848877 + }, + { + "auxiliary_loss_clip": 0.06311168, + "auxiliary_loss_mlp": 0.01251335, + "balance_loss_clip": 0.06256739, + "balance_loss_mlp": 0.01250316, + "epoch": 0.8787013377423719, + "flos": 71011445990400.0, + "grad_norm": 0.70779446038068, + "language_loss": 0.58095002, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.65657508, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01018524, + "step": 14615, + "time_per_iteration": 3.2636308670043945 + }, + { + "auxiliary_loss_clip": 0.06402515, + "auxiliary_loss_mlp": 0.01265364, + "balance_loss_clip": 0.06271381, + "balance_loss_mlp": 0.01256286, + "epoch": 0.8787614609950398, + "flos": 17353092867840.0, + "grad_norm": 1.8779699458458277, + "language_loss": 0.73255086, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.80922961, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09082031, + "step": 14616, + "time_per_iteration": 2.5093941688537598 + }, + { + "auxiliary_loss_clip": 0.06311196, + "auxiliary_loss_mlp": 0.01252507, + "balance_loss_clip": 0.06256916, + "balance_loss_mlp": 0.01251385, + "epoch": 0.8788215842477078, + "flos": 72532003633920.0, + "grad_norm": 0.7819923375628035, + "language_loss": 0.5785529, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.65418988, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.01124573, + "step": 14617, + "time_per_iteration": 3.25418758392334 + }, + { + "auxiliary_loss_clip": 0.06399922, + "auxiliary_loss_mlp": 0.01266444, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01257146, + "epoch": 0.8788817075003758, + "flos": 24834470590080.0, + "grad_norm": 1.7451091411227035, + "language_loss": 0.84037435, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.91703808, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09301758, + "step": 14618, + "time_per_iteration": 2.5523579120635986 + }, + { + "auxiliary_loss_clip": 0.0639818, + "auxiliary_loss_mlp": 0.01263411, + "balance_loss_clip": 0.06273776, + "balance_loss_mlp": 0.0125434, + "epoch": 0.8789418307530438, + "flos": 22645498527360.0, + "grad_norm": 1.6061000948299264, + "language_loss": 0.69441819, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.77103406, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.09069824, + "step": 14619, + "time_per_iteration": 2.54170823097229 + }, + { + "auxiliary_loss_clip": 0.06405766, + "auxiliary_loss_mlp": 0.01264393, + "balance_loss_clip": 0.06271112, + "balance_loss_mlp": 0.0125497, + "epoch": 0.8790019540057117, + "flos": 19790795376000.0, + "grad_norm": 1.783720752563742, + "language_loss": 0.77634114, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.85304272, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09417725, + "step": 14620, + "time_per_iteration": 2.510427474975586 + }, + { + "auxiliary_loss_clip": 0.06402472, + "auxiliary_loss_mlp": 0.01265134, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01255687, + "epoch": 0.8790620772583797, + "flos": 20235943791360.0, + "grad_norm": 1.820776592101537, + "language_loss": 0.79876006, + "learning_rate": 1.514036906317542e-07, + "loss": 0.87543613, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09448242, + "step": 14621, + "time_per_iteration": 2.523426055908203 + }, + { + "auxiliary_loss_clip": 0.06407711, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 0.06271552, + "balance_loss_mlp": 0.01255098, + "epoch": 0.8791222005110476, + "flos": 24137111784960.0, + "grad_norm": 1.602537149946791, + "language_loss": 0.67313725, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.74986005, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09472656, + "step": 14622, + "time_per_iteration": 2.5274059772491455 + }, + { + "auxiliary_loss_clip": 0.06402093, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06273091, + "balance_loss_mlp": 0.01254481, + "epoch": 0.8791823237637156, + "flos": 21620263495680.0, + "grad_norm": 1.855612811571573, + "language_loss": 0.72613978, + "learning_rate": 1.511065382058687e-07, + "loss": 0.80280036, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09490967, + "step": 14623, + "time_per_iteration": 2.510666847229004 + }, + { + "auxiliary_loss_clip": 0.06397058, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06268196, + "balance_loss_mlp": 0.01254821, + "epoch": 0.8792424470163835, + "flos": 24250275123840.0, + "grad_norm": 1.5326349370658456, + "language_loss": 0.79326856, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.86987877, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.0914917, + "step": 14624, + "time_per_iteration": 2.566740036010742 + }, + { + "auxiliary_loss_clip": 0.06401555, + "auxiliary_loss_mlp": 0.01267628, + "balance_loss_clip": 0.06269389, + "balance_loss_mlp": 0.01257429, + "epoch": 0.8793025702690516, + "flos": 24899025761280.0, + "grad_norm": 1.7167241879200805, + "language_loss": 0.80230272, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.87899458, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10205078, + "step": 14625, + "time_per_iteration": 3.918522834777832 + }, + { + "auxiliary_loss_clip": 0.06401938, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06273644, + "balance_loss_mlp": 0.01256707, + "epoch": 0.8793626935217195, + "flos": 25379952670080.0, + "grad_norm": 1.5019930803038062, + "language_loss": 0.73864943, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.81532383, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08795166, + "step": 14626, + "time_per_iteration": 2.562892198562622 + }, + { + "auxiliary_loss_clip": 0.06406923, + "auxiliary_loss_mlp": 0.01265377, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01255787, + "epoch": 0.8794228167743875, + "flos": 34686563880960.0, + "grad_norm": 1.3945734521090933, + "language_loss": 0.71120954, + "learning_rate": 1.505130747218246e-07, + "loss": 0.78793246, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09588623, + "step": 14627, + "time_per_iteration": 2.6167502403259277 + }, + { + "auxiliary_loss_clip": 0.06399681, + "auxiliary_loss_mlp": 0.01263302, + "balance_loss_clip": 0.06269456, + "balance_loss_mlp": 0.01254064, + "epoch": 0.8794829400270555, + "flos": 19470130600320.0, + "grad_norm": 1.7440522993673278, + "language_loss": 0.72579825, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.80242813, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09246826, + "step": 14628, + "time_per_iteration": 2.4789912700653076 + }, + { + "auxiliary_loss_clip": 0.06404158, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06273529, + "balance_loss_mlp": 0.01255114, + "epoch": 0.8795430632797234, + "flos": 15236767895040.0, + "grad_norm": 2.773153659158058, + "language_loss": 0.69403476, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.77072817, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10064697, + "step": 14629, + "time_per_iteration": 2.4813661575317383 + }, + { + "auxiliary_loss_clip": 0.06396346, + "auxiliary_loss_mlp": 0.01262621, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01253895, + "epoch": 0.8796031865323914, + "flos": 27751967976960.0, + "grad_norm": 1.4293653202616396, + "language_loss": 0.68995941, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.76654905, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08728027, + "step": 14630, + "time_per_iteration": 2.5377347469329834 + }, + { + "auxiliary_loss_clip": 0.06396469, + "auxiliary_loss_mlp": 0.01263738, + "balance_loss_clip": 0.06271411, + "balance_loss_mlp": 0.01254208, + "epoch": 0.8796633097850594, + "flos": 31293506246400.0, + "grad_norm": 1.4070035021312453, + "language_loss": 0.7483651, + "learning_rate": 1.499207333613999e-07, + "loss": 0.82496721, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.09533691, + "step": 14631, + "time_per_iteration": 2.5822885036468506 + }, + { + "auxiliary_loss_clip": 0.06393504, + "auxiliary_loss_mlp": 0.01266538, + "balance_loss_clip": 0.06268861, + "balance_loss_mlp": 0.01257067, + "epoch": 0.8797234330377274, + "flos": 24249981634560.0, + "grad_norm": 1.9319771057822412, + "language_loss": 0.68856537, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.76516581, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.0947876, + "step": 14632, + "time_per_iteration": 2.5268332958221436 + }, + { + "auxiliary_loss_clip": 0.06400291, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01253742, + "epoch": 0.8797835562903953, + "flos": 24173770746240.0, + "grad_norm": 1.6895810277497014, + "language_loss": 0.64861834, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.72524273, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08404541, + "step": 14633, + "time_per_iteration": 2.5247573852539062 + }, + { + "auxiliary_loss_clip": 0.06401753, + "auxiliary_loss_mlp": 0.01266986, + "balance_loss_clip": 0.0627309, + "balance_loss_mlp": 0.01258165, + "epoch": 0.8798436795430633, + "flos": 19291280060160.0, + "grad_norm": 1.3977423779566516, + "language_loss": 0.84072506, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.91741252, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08822632, + "step": 14634, + "time_per_iteration": 2.5381462574005127 + }, + { + "auxiliary_loss_clip": 0.06400451, + "auxiliary_loss_mlp": 0.01266972, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01257173, + "epoch": 0.8799038027957312, + "flos": 28186173434880.0, + "grad_norm": 1.4907767475913263, + "language_loss": 0.79870266, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.87537694, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09790039, + "step": 14635, + "time_per_iteration": 2.5396430492401123 + }, + { + "auxiliary_loss_clip": 0.06404407, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_clip": 0.06272666, + "balance_loss_mlp": 0.0125628, + "epoch": 0.8799639260483992, + "flos": 24651636981120.0, + "grad_norm": 1.7695455435420768, + "language_loss": 0.65644789, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.73314989, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09515381, + "step": 14636, + "time_per_iteration": 3.964998960494995 + }, + { + "auxiliary_loss_clip": 0.06402347, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06271206, + "balance_loss_mlp": 0.01257902, + "epoch": 0.8800240493010671, + "flos": 22207058438400.0, + "grad_norm": 1.4677484913942043, + "language_loss": 0.70408964, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.78078711, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0949707, + "step": 14637, + "time_per_iteration": 2.5140292644500732 + }, + { + "auxiliary_loss_clip": 0.06401545, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06271181, + "balance_loss_mlp": 0.01255353, + "epoch": 0.8800841725537352, + "flos": 14251252498560.0, + "grad_norm": 1.8480361398751275, + "language_loss": 0.66556799, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.74222744, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0904541, + "step": 14638, + "time_per_iteration": 2.519340991973877 + }, + { + "auxiliary_loss_clip": 0.06404281, + "auxiliary_loss_mlp": 0.01263496, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01253977, + "epoch": 0.8801442958064031, + "flos": 37425043019520.0, + "grad_norm": 2.054991343187147, + "language_loss": 0.58460569, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.66128349, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09521484, + "step": 14639, + "time_per_iteration": 2.622095823287964 + }, + { + "auxiliary_loss_clip": 0.0640137, + "auxiliary_loss_mlp": 0.01266992, + "balance_loss_clip": 0.06269941, + "balance_loss_mlp": 0.012573, + "epoch": 0.8802044190590711, + "flos": 25054550138880.0, + "grad_norm": 2.0541054396884677, + "language_loss": 0.74650657, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.82319009, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09692383, + "step": 14640, + "time_per_iteration": 3.9488940238952637 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06270456, + "balance_loss_mlp": 0.01253052, + "epoch": 0.8802645423117391, + "flos": 24140717510400.0, + "grad_norm": 1.9319844379203082, + "language_loss": 0.70021105, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.77684665, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09564209, + "step": 14641, + "time_per_iteration": 2.5713586807250977 + }, + { + "auxiliary_loss_clip": 0.06405936, + "auxiliary_loss_mlp": 0.01262892, + "balance_loss_clip": 0.06272167, + "balance_loss_mlp": 0.01252789, + "epoch": 0.880324665564407, + "flos": 17936994844800.0, + "grad_norm": 1.944450035656478, + "language_loss": 0.85435617, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.9310444, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10107422, + "step": 14642, + "time_per_iteration": 3.906127691268921 + }, + { + "auxiliary_loss_clip": 0.06403466, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01253691, + "epoch": 0.880384788817075, + "flos": 21293938569600.0, + "grad_norm": 1.7769951500601024, + "language_loss": 0.78894514, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.86561227, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09558105, + "step": 14643, + "time_per_iteration": 2.519885778427124 + }, + { + "auxiliary_loss_clip": 0.06397131, + "auxiliary_loss_mlp": 0.01262242, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.01253867, + "epoch": 0.880444912069743, + "flos": 12463390730880.0, + "grad_norm": 1.5041267161215206, + "language_loss": 0.73285198, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.80944562, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.0836792, + "step": 14644, + "time_per_iteration": 2.470648765563965 + }, + { + "auxiliary_loss_clip": 0.06408016, + "auxiliary_loss_mlp": 0.01267274, + "balance_loss_clip": 0.06272088, + "balance_loss_mlp": 0.01257129, + "epoch": 0.880505035322411, + "flos": 13631026976640.0, + "grad_norm": 2.3799093865223213, + "language_loss": 0.7972905, + "learning_rate": 1.47856380505911e-07, + "loss": 0.87404341, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10144043, + "step": 14645, + "time_per_iteration": 2.518871545791626 + }, + { + "auxiliary_loss_clip": 0.06397209, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271972, + "balance_loss_mlp": 0.01255334, + "epoch": 0.8805651585750789, + "flos": 23189387379840.0, + "grad_norm": 1.4852789962824886, + "language_loss": 0.64198017, + "learning_rate": 1.477094533001364e-07, + "loss": 0.7185964, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09082031, + "step": 14646, + "time_per_iteration": 2.5021417140960693 + }, + { + "auxiliary_loss_clip": 0.06412499, + "auxiliary_loss_mlp": 0.01263315, + "balance_loss_clip": 0.06275496, + "balance_loss_mlp": 0.01253045, + "epoch": 0.8806252818277469, + "flos": 14908304689920.0, + "grad_norm": 2.619123359403294, + "language_loss": 0.77789688, + "learning_rate": 1.475625963334055e-07, + "loss": 0.85465503, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10272217, + "step": 14647, + "time_per_iteration": 2.539391040802002 + }, + { + "auxiliary_loss_clip": 0.06398942, + "auxiliary_loss_mlp": 0.01263452, + "balance_loss_clip": 0.06270331, + "balance_loss_mlp": 0.01255, + "epoch": 0.8806854050804148, + "flos": 17644897111680.0, + "grad_norm": 2.192652669524439, + "language_loss": 0.75220722, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.8288312, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08453369, + "step": 14648, + "time_per_iteration": 2.4884188175201416 + }, + { + "auxiliary_loss_clip": 0.06403202, + "auxiliary_loss_mlp": 0.01265143, + "balance_loss_clip": 0.06270049, + "balance_loss_mlp": 0.01255952, + "epoch": 0.8807455283330828, + "flos": 25338514026240.0, + "grad_norm": 2.5305554735964573, + "language_loss": 0.65665662, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.73334002, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09191895, + "step": 14649, + "time_per_iteration": 2.5534260272979736 + }, + { + "auxiliary_loss_clip": 0.06403228, + "auxiliary_loss_mlp": 0.01263972, + "balance_loss_clip": 0.06272388, + "balance_loss_mlp": 0.01254495, + "epoch": 0.8808056515857507, + "flos": 25272239846400.0, + "grad_norm": 1.2725171028063786, + "language_loss": 0.62303275, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.69970477, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0947876, + "step": 14650, + "time_per_iteration": 2.5216543674468994 + }, + { + "auxiliary_loss_clip": 0.06398011, + "auxiliary_loss_mlp": 0.01261953, + "balance_loss_clip": 0.06269711, + "balance_loss_mlp": 0.01253018, + "epoch": 0.8808657748384188, + "flos": 26586176520960.0, + "grad_norm": 1.5795337054633014, + "language_loss": 0.72957003, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.80616963, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08929443, + "step": 14651, + "time_per_iteration": 2.5674073696136475 + }, + { + "auxiliary_loss_clip": 0.06404445, + "auxiliary_loss_mlp": 0.01262501, + "balance_loss_clip": 0.0627149, + "balance_loss_mlp": 0.01252935, + "epoch": 0.8809258980910867, + "flos": 18667197250560.0, + "grad_norm": 1.6881514833270383, + "language_loss": 0.72177875, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.7984482, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09576416, + "step": 14652, + "time_per_iteration": 2.4565625190734863 + }, + { + "auxiliary_loss_clip": 0.06400369, + "auxiliary_loss_mlp": 0.01262522, + "balance_loss_clip": 0.062704, + "balance_loss_mlp": 0.01253509, + "epoch": 0.8809860213437547, + "flos": 19798426097280.0, + "grad_norm": 6.259659475652455, + "language_loss": 0.74713862, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.82376754, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09008789, + "step": 14653, + "time_per_iteration": 2.5095698833465576 + }, + { + "auxiliary_loss_clip": 0.06404018, + "auxiliary_loss_mlp": 0.01267393, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01257588, + "epoch": 0.8810461445964227, + "flos": 17900210102400.0, + "grad_norm": 1.7754653756175585, + "language_loss": 0.71624255, + "learning_rate": 1.465365647269421e-07, + "loss": 0.79295671, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0980835, + "step": 14654, + "time_per_iteration": 2.458045244216919 + }, + { + "auxiliary_loss_clip": 0.06403499, + "auxiliary_loss_mlp": 0.01267179, + "balance_loss_clip": 0.06272502, + "balance_loss_mlp": 0.01257529, + "epoch": 0.8811062678490906, + "flos": 29170766436480.0, + "grad_norm": 1.4291557550809124, + "language_loss": 0.71611077, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.79281753, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09655762, + "step": 14655, + "time_per_iteration": 2.5877456665039062 + }, + { + "auxiliary_loss_clip": 0.06398024, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06269453, + "balance_loss_mlp": 0.01256025, + "epoch": 0.8811663911017587, + "flos": 20344956353280.0, + "grad_norm": 1.56260789406541, + "language_loss": 0.81561428, + "learning_rate": 1.462440453077449e-07, + "loss": 0.89224374, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08911133, + "step": 14656, + "time_per_iteration": 2.4939017295837402 + }, + { + "auxiliary_loss_clip": 0.06403321, + "auxiliary_loss_mlp": 0.01265996, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01257258, + "epoch": 0.8812265143544266, + "flos": 25892926565760.0, + "grad_norm": 1.6558958362539187, + "language_loss": 0.68877184, + "learning_rate": 1.460978910372914e-07, + "loss": 0.76546496, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08740234, + "step": 14657, + "time_per_iteration": 2.5605247020721436 + }, + { + "auxiliary_loss_clip": 0.0640131, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06269861, + "balance_loss_mlp": 0.01255804, + "epoch": 0.8812866376070946, + "flos": 27202335120000.0, + "grad_norm": 1.9275241644467438, + "language_loss": 0.83792698, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.91458726, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08911133, + "step": 14658, + "time_per_iteration": 2.539914846420288 + }, + { + "auxiliary_loss_clip": 0.06408009, + "auxiliary_loss_mlp": 0.01267518, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.0125729, + "epoch": 0.8813467608597625, + "flos": 23814266803200.0, + "grad_norm": 1.768545286165811, + "language_loss": 0.77509159, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.85184681, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10235596, + "step": 14659, + "time_per_iteration": 2.574265480041504 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01261562, + "balance_loss_clip": 0.06269409, + "balance_loss_mlp": 0.01252377, + "epoch": 0.8814068841124305, + "flos": 21111775793280.0, + "grad_norm": 1.7845469935654699, + "language_loss": 0.60817045, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.68477958, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09185791, + "step": 14660, + "time_per_iteration": 2.5120184421539307 + }, + { + "auxiliary_loss_clip": 0.06399903, + "auxiliary_loss_mlp": 0.01262177, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.01252509, + "epoch": 0.8814670073650984, + "flos": 24723822873600.0, + "grad_norm": 1.6340648502892121, + "language_loss": 0.78212428, + "learning_rate": 1.455139770123972e-07, + "loss": 0.8587451, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09674072, + "step": 14661, + "time_per_iteration": 2.5731544494628906 + }, + { + "auxiliary_loss_clip": 0.06405543, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06272089, + "balance_loss_mlp": 0.01255294, + "epoch": 0.8815271306177664, + "flos": 22972913556480.0, + "grad_norm": 1.7150336378950353, + "language_loss": 0.76684302, + "learning_rate": 1.45368174298081e-07, + "loss": 0.84355104, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09967041, + "step": 14662, + "time_per_iteration": 2.518737554550171 + }, + { + "auxiliary_loss_clip": 0.06397216, + "auxiliary_loss_mlp": 0.01265956, + "balance_loss_clip": 0.06270915, + "balance_loss_mlp": 0.01257356, + "epoch": 0.8815872538704344, + "flos": 19465518625920.0, + "grad_norm": 1.8360238755805145, + "language_loss": 0.73649955, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.81313121, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08599854, + "step": 14663, + "time_per_iteration": 2.4928483963012695 + }, + { + "auxiliary_loss_clip": 0.06398933, + "auxiliary_loss_mlp": 0.01268829, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.0125987, + "epoch": 0.8816473771231024, + "flos": 32164097368320.0, + "grad_norm": 1.4224599659696884, + "language_loss": 0.70133549, + "learning_rate": 1.450767798584489e-07, + "loss": 0.77801311, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08953857, + "step": 14664, + "time_per_iteration": 4.078710079193115 + }, + { + "auxiliary_loss_clip": 0.06400171, + "auxiliary_loss_mlp": 0.01263779, + "balance_loss_clip": 0.06271797, + "balance_loss_mlp": 0.01254916, + "epoch": 0.8817075003757703, + "flos": 19688323432320.0, + "grad_norm": 1.386701890018287, + "language_loss": 0.81031573, + "learning_rate": 1.449311881441828e-07, + "loss": 0.88695526, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08868408, + "step": 14665, + "time_per_iteration": 2.5095698833465576 + }, + { + "auxiliary_loss_clip": 0.06401434, + "auxiliary_loss_mlp": 0.01260949, + "balance_loss_clip": 0.06272306, + "balance_loss_mlp": 0.01251817, + "epoch": 0.8817676236284383, + "flos": 15673950172800.0, + "grad_norm": 2.3358439244424862, + "language_loss": 0.58787858, + "learning_rate": 1.447856667743117e-07, + "loss": 0.66450244, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09136963, + "step": 14666, + "time_per_iteration": 2.540194034576416 + }, + { + "auxiliary_loss_clip": 0.06400174, + "auxiliary_loss_mlp": 0.01265605, + "balance_loss_clip": 0.06270184, + "balance_loss_mlp": 0.01255102, + "epoch": 0.8818277468811063, + "flos": 17901048643200.0, + "grad_norm": 1.6530225652639872, + "language_loss": 0.83922029, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.91587806, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.10498047, + "step": 14667, + "time_per_iteration": 2.495633125305176 + }, + { + "auxiliary_loss_clip": 0.06404059, + "auxiliary_loss_mlp": 0.01265655, + "balance_loss_clip": 0.06274028, + "balance_loss_mlp": 0.01255999, + "epoch": 0.8818878701337742, + "flos": 18776461374720.0, + "grad_norm": 1.7309788421424104, + "language_loss": 0.62558234, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.70227951, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09649658, + "step": 14668, + "time_per_iteration": 2.4942386150360107 + }, + { + "auxiliary_loss_clip": 0.06397483, + "auxiliary_loss_mlp": 0.01261702, + "balance_loss_clip": 0.06268862, + "balance_loss_mlp": 0.01252898, + "epoch": 0.8819479933864423, + "flos": 17718047326080.0, + "grad_norm": 2.2322444364782577, + "language_loss": 0.5726642, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.64925605, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.0880127, + "step": 14669, + "time_per_iteration": 2.5518670082092285 + }, + { + "auxiliary_loss_clip": 0.0640443, + "auxiliary_loss_mlp": 0.01262805, + "balance_loss_clip": 0.06273519, + "balance_loss_mlp": 0.01253489, + "epoch": 0.8820081166391102, + "flos": 11733523741440.0, + "grad_norm": 1.7260866904493628, + "language_loss": 0.71694434, + "learning_rate": 1.442042848491043e-07, + "loss": 0.79361665, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09320068, + "step": 14670, + "time_per_iteration": 2.469038486480713 + }, + { + "auxiliary_loss_clip": 0.06399909, + "auxiliary_loss_mlp": 0.01267979, + "balance_loss_clip": 0.06269167, + "balance_loss_mlp": 0.01258067, + "epoch": 0.8820682398917782, + "flos": 27497745089280.0, + "grad_norm": 2.206437045380329, + "language_loss": 0.7456339, + "learning_rate": 1.44059115283929e-07, + "loss": 0.82231283, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09924316, + "step": 14671, + "time_per_iteration": 2.5506999492645264 + }, + { + "auxiliary_loss_clip": 0.06403503, + "auxiliary_loss_mlp": 0.01269024, + "balance_loss_clip": 0.06270997, + "balance_loss_mlp": 0.0125882, + "epoch": 0.8821283631444461, + "flos": 16879587045120.0, + "grad_norm": 2.5171122435451245, + "language_loss": 0.85031545, + "learning_rate": 1.43914016096218e-07, + "loss": 0.9270407, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10205078, + "step": 14672, + "time_per_iteration": 2.5202066898345947 + }, + { + "auxiliary_loss_clip": 0.06396373, + "auxiliary_loss_mlp": 0.01268498, + "balance_loss_clip": 0.0626964, + "balance_loss_mlp": 0.01259724, + "epoch": 0.8821884863971141, + "flos": 24288024188160.0, + "grad_norm": 1.6225814735684048, + "language_loss": 0.72806644, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.8047151, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08770752, + "step": 14673, + "time_per_iteration": 2.6579220294952393 + }, + { + "auxiliary_loss_clip": 0.06309947, + "auxiliary_loss_mlp": 0.01255376, + "balance_loss_clip": 0.06255542, + "balance_loss_mlp": 0.01254378, + "epoch": 0.882248609649782, + "flos": 59453990876160.0, + "grad_norm": 0.7872167317420794, + "language_loss": 0.49268723, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.56834042, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00997162, + "step": 14674, + "time_per_iteration": 3.236130475997925 + }, + { + "auxiliary_loss_clip": 0.06401759, + "auxiliary_loss_mlp": 0.01265651, + "balance_loss_clip": 0.06269863, + "balance_loss_mlp": 0.01255608, + "epoch": 0.88230873290245, + "flos": 19943887985280.0, + "grad_norm": 1.8856716394845916, + "language_loss": 0.76288593, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.83956003, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10046387, + "step": 14675, + "time_per_iteration": 2.5227322578430176 + }, + { + "auxiliary_loss_clip": 0.06397566, + "auxiliary_loss_mlp": 0.01264151, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255306, + "epoch": 0.882368856155118, + "flos": 16368374085120.0, + "grad_norm": 1.6123928744840947, + "language_loss": 0.79259509, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.86921227, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08843994, + "step": 14676, + "time_per_iteration": 3.8567166328430176 + }, + { + "auxiliary_loss_clip": 0.06307142, + "auxiliary_loss_mlp": 0.01253674, + "balance_loss_clip": 0.06252797, + "balance_loss_mlp": 0.01252705, + "epoch": 0.882428979407786, + "flos": 70617672927360.0, + "grad_norm": 0.6822788139152429, + "language_loss": 0.54586005, + "learning_rate": 1.431895760121109e-07, + "loss": 0.62146819, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00967407, + "step": 14677, + "time_per_iteration": 3.2512588500976562 + }, + { + "auxiliary_loss_clip": 0.06399799, + "auxiliary_loss_mlp": 0.01263106, + "balance_loss_clip": 0.06268829, + "balance_loss_mlp": 0.01253545, + "epoch": 0.8824891026604539, + "flos": 18156151998720.0, + "grad_norm": 2.014632299610882, + "language_loss": 0.65062732, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.72725636, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09558105, + "step": 14678, + "time_per_iteration": 2.472111225128174 + }, + { + "auxiliary_loss_clip": 0.06405297, + "auxiliary_loss_mlp": 0.01268562, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.01259496, + "epoch": 0.8825492259131219, + "flos": 27239664913920.0, + "grad_norm": 1.7861369915928562, + "language_loss": 0.71231997, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.78905857, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09063721, + "step": 14679, + "time_per_iteration": 2.6039962768554688 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01265754, + "balance_loss_clip": 0.06271735, + "balance_loss_mlp": 0.01257768, + "epoch": 0.8826093491657898, + "flos": 22281172974720.0, + "grad_norm": 1.5959410569258197, + "language_loss": 0.63950992, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.71617675, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.07983398, + "step": 14680, + "time_per_iteration": 3.9252450466156006 + }, + { + "auxiliary_loss_clip": 0.06397928, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06270419, + "balance_loss_mlp": 0.0125605, + "epoch": 0.8826694724184578, + "flos": 14209101095040.0, + "grad_norm": 2.809563443192349, + "language_loss": 0.77776754, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.85439312, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08575439, + "step": 14681, + "time_per_iteration": 2.4502193927764893 + }, + { + "auxiliary_loss_clip": 0.06403942, + "auxiliary_loss_mlp": 0.01262466, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01252685, + "epoch": 0.8827295956711259, + "flos": 20638018408320.0, + "grad_norm": 1.5469151752981896, + "language_loss": 0.72931725, + "learning_rate": 1.424668961888047e-07, + "loss": 0.80598128, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09783936, + "step": 14682, + "time_per_iteration": 3.962366819381714 + }, + { + "auxiliary_loss_clip": 0.06409137, + "auxiliary_loss_mlp": 0.01270395, + "balance_loss_clip": 0.06273471, + "balance_loss_mlp": 0.01259595, + "epoch": 0.8827897189237938, + "flos": 18518632761600.0, + "grad_norm": 1.6628923088438647, + "language_loss": 0.75193185, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.82872719, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10791016, + "step": 14683, + "time_per_iteration": 2.5152933597564697 + }, + { + "auxiliary_loss_clip": 0.06403377, + "auxiliary_loss_mlp": 0.01262559, + "balance_loss_clip": 0.06272641, + "balance_loss_mlp": 0.01252677, + "epoch": 0.8828498421764618, + "flos": 22754007964800.0, + "grad_norm": 2.015952811438403, + "language_loss": 0.66169786, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.73835725, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09875488, + "step": 14684, + "time_per_iteration": 2.557262420654297 + }, + { + "auxiliary_loss_clip": 0.06398778, + "auxiliary_loss_mlp": 0.01263689, + "balance_loss_clip": 0.06268162, + "balance_loss_mlp": 0.01254563, + "epoch": 0.8829099654291297, + "flos": 15017694595200.0, + "grad_norm": 1.8477413865365486, + "language_loss": 0.69428438, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.77090901, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 14685, + "time_per_iteration": 2.5324926376342773 + }, + { + "auxiliary_loss_clip": 0.0640468, + "auxiliary_loss_mlp": 0.01262589, + "balance_loss_clip": 0.06270929, + "balance_loss_mlp": 0.01252623, + "epoch": 0.8829700886817977, + "flos": 16725026989440.0, + "grad_norm": 1.8057502590812853, + "language_loss": 0.7455259, + "learning_rate": 1.418900201783806e-07, + "loss": 0.82219857, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09954834, + "step": 14686, + "time_per_iteration": 2.4790773391723633 + }, + { + "auxiliary_loss_clip": 0.06394429, + "auxiliary_loss_mlp": 0.01265012, + "balance_loss_clip": 0.06266899, + "balance_loss_mlp": 0.01255941, + "epoch": 0.8830302119344656, + "flos": 15267850560000.0, + "grad_norm": 1.7158951019726476, + "language_loss": 0.63215464, + "learning_rate": 1.417459773114007e-07, + "loss": 0.70874906, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09069824, + "step": 14687, + "time_per_iteration": 2.5241615772247314 + }, + { + "auxiliary_loss_clip": 0.06404291, + "auxiliary_loss_mlp": 0.01262922, + "balance_loss_clip": 0.06270834, + "balance_loss_mlp": 0.01252903, + "epoch": 0.8830903351871336, + "flos": 28624697377920.0, + "grad_norm": 2.595517619251839, + "language_loss": 0.69500947, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.77168155, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10028076, + "step": 14688, + "time_per_iteration": 2.540933609008789 + }, + { + "auxiliary_loss_clip": 0.06396133, + "auxiliary_loss_mlp": 0.01267955, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01259652, + "epoch": 0.8831504584398016, + "flos": 28009167684480.0, + "grad_norm": 1.5638574685604314, + "language_loss": 0.66877151, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.74541235, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.08294678, + "step": 14689, + "time_per_iteration": 2.5956904888153076 + }, + { + "auxiliary_loss_clip": 0.06402047, + "auxiliary_loss_mlp": 0.01265309, + "balance_loss_clip": 0.06273194, + "balance_loss_mlp": 0.01256839, + "epoch": 0.8832105816924696, + "flos": 26587098915840.0, + "grad_norm": 1.2720232823843813, + "language_loss": 0.74491525, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.82158875, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08465576, + "step": 14690, + "time_per_iteration": 2.550379753112793 + }, + { + "auxiliary_loss_clip": 0.0640257, + "auxiliary_loss_mlp": 0.01265347, + "balance_loss_clip": 0.06271975, + "balance_loss_mlp": 0.01255065, + "epoch": 0.8832707049451375, + "flos": 24905524452480.0, + "grad_norm": 1.3286070309663014, + "language_loss": 0.7308588, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.80753797, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10272217, + "step": 14691, + "time_per_iteration": 2.618356466293335 + }, + { + "auxiliary_loss_clip": 0.06406618, + "auxiliary_loss_mlp": 0.01263553, + "balance_loss_clip": 0.06271677, + "balance_loss_mlp": 0.01254058, + "epoch": 0.8833308281978055, + "flos": 15456679735680.0, + "grad_norm": 1.9431819438637523, + "language_loss": 0.52190626, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.5986079, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09490967, + "step": 14692, + "time_per_iteration": 2.4854116439819336 + }, + { + "auxiliary_loss_clip": 0.06404817, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.06273092, + "balance_loss_mlp": 0.0125397, + "epoch": 0.8833909514504734, + "flos": 20307500778240.0, + "grad_norm": 2.1854307452735884, + "language_loss": 0.61036348, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.6870504, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09906006, + "step": 14693, + "time_per_iteration": 2.5374739170074463 + }, + { + "auxiliary_loss_clip": 0.06395325, + "auxiliary_loss_mlp": 0.01263199, + "balance_loss_clip": 0.06270225, + "balance_loss_mlp": 0.01254532, + "epoch": 0.8834510747031414, + "flos": 20379938232960.0, + "grad_norm": 1.4784746764410908, + "language_loss": 0.75460541, + "learning_rate": 1.407396505730898e-07, + "loss": 0.83119071, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08666992, + "step": 14694, + "time_per_iteration": 2.543729066848755 + }, + { + "auxiliary_loss_clip": 0.06403571, + "auxiliary_loss_mlp": 0.01265299, + "balance_loss_clip": 0.06269252, + "balance_loss_mlp": 0.01256531, + "epoch": 0.8835111979558095, + "flos": 29759699658240.0, + "grad_norm": 1.9605899347359843, + "language_loss": 0.72491586, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.80160457, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.08770752, + "step": 14695, + "time_per_iteration": 2.5731723308563232 + }, + { + "auxiliary_loss_clip": 0.06393148, + "auxiliary_loss_mlp": 0.0126203, + "balance_loss_clip": 0.06268685, + "balance_loss_mlp": 0.01253942, + "epoch": 0.8835713212084774, + "flos": 24141514124160.0, + "grad_norm": 1.602709205439156, + "language_loss": 0.8027606, + "learning_rate": 1.404527630961998e-07, + "loss": 0.8793124, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08093262, + "step": 14696, + "time_per_iteration": 2.534120798110962 + }, + { + "auxiliary_loss_clip": 0.06403233, + "auxiliary_loss_mlp": 0.01265612, + "balance_loss_clip": 0.06271463, + "balance_loss_mlp": 0.01256665, + "epoch": 0.8836314444611454, + "flos": 27679656303360.0, + "grad_norm": 2.0173732379548905, + "language_loss": 0.74990559, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.82659405, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08953857, + "step": 14697, + "time_per_iteration": 2.592552900314331 + }, + { + "auxiliary_loss_clip": 0.06399925, + "auxiliary_loss_mlp": 0.01266921, + "balance_loss_clip": 0.06270725, + "balance_loss_mlp": 0.01257844, + "epoch": 0.8836915677138133, + "flos": 16842634594560.0, + "grad_norm": 1.9895118296401026, + "language_loss": 0.72394419, + "learning_rate": 1.401661576761779e-07, + "loss": 0.80061269, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09075928, + "step": 14698, + "time_per_iteration": 2.4627113342285156 + }, + { + "auxiliary_loss_clip": 0.06305031, + "auxiliary_loss_mlp": 0.0125323, + "balance_loss_clip": 0.06250586, + "balance_loss_mlp": 0.01252178, + "epoch": 0.8837516909664813, + "flos": 69332261368320.0, + "grad_norm": 0.7740855543002164, + "language_loss": 0.5369336, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.61251622, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01052856, + "step": 14699, + "time_per_iteration": 3.17140793800354 + }, + { + "auxiliary_loss_clip": 0.06403652, + "auxiliary_loss_mlp": 0.01264634, + "balance_loss_clip": 0.06268007, + "balance_loss_mlp": 0.01254931, + "epoch": 0.8838118142191492, + "flos": 21331142582400.0, + "grad_norm": 1.5418918526110506, + "language_loss": 0.76658535, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.84326828, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09698486, + "step": 14700, + "time_per_iteration": 2.5061445236206055 + }, + { + "auxiliary_loss_clip": 0.06398652, + "auxiliary_loss_mlp": 0.01261483, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01252536, + "epoch": 0.8838719374718172, + "flos": 21476981813760.0, + "grad_norm": 1.816100763964491, + "language_loss": 0.73857808, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.81517947, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08947754, + "step": 14701, + "time_per_iteration": 2.5440568923950195 + }, + { + "auxiliary_loss_clip": 0.06405409, + "auxiliary_loss_mlp": 0.0126287, + "balance_loss_clip": 0.0626961, + "balance_loss_mlp": 0.01253297, + "epoch": 0.8839320607244852, + "flos": 26476157710080.0, + "grad_norm": 1.7347205509220878, + "language_loss": 0.71765238, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.79433513, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09570312, + "step": 14702, + "time_per_iteration": 2.5365030765533447 + }, + { + "auxiliary_loss_clip": 0.0640773, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.012583, + "epoch": 0.8839921839771532, + "flos": 45232577959680.0, + "grad_norm": 1.4693799837877743, + "language_loss": 0.72042251, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.79717582, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09307861, + "step": 14703, + "time_per_iteration": 2.7185418605804443 + }, + { + "auxiliary_loss_clip": 0.06394663, + "auxiliary_loss_mlp": 0.01263802, + "balance_loss_clip": 0.0626796, + "balance_loss_mlp": 0.01254761, + "epoch": 0.8840523072298211, + "flos": 20012342371200.0, + "grad_norm": 1.7405173343909983, + "language_loss": 0.6674304, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.74401504, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.0904541, + "step": 14704, + "time_per_iteration": 3.921534776687622 + }, + { + "auxiliary_loss_clip": 0.06395476, + "auxiliary_loss_mlp": 0.01264102, + "balance_loss_clip": 0.06270425, + "balance_loss_mlp": 0.01255644, + "epoch": 0.8841124304824891, + "flos": 24432941024640.0, + "grad_norm": 1.519427157327818, + "language_loss": 0.70908153, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.78567731, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08459473, + "step": 14705, + "time_per_iteration": 2.6113686561584473 + }, + { + "auxiliary_loss_clip": 0.06397911, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06269821, + "balance_loss_mlp": 0.0125566, + "epoch": 0.884172553735157, + "flos": 31292583851520.0, + "grad_norm": 1.3762163602676374, + "language_loss": 0.70915127, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.78577089, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08392334, + "step": 14706, + "time_per_iteration": 2.665069580078125 + }, + { + "auxiliary_loss_clip": 0.06399087, + "auxiliary_loss_mlp": 0.01265819, + "balance_loss_clip": 0.06269109, + "balance_loss_mlp": 0.01256843, + "epoch": 0.884232676987825, + "flos": 21396494367360.0, + "grad_norm": 1.5565027115335555, + "language_loss": 0.74541593, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.822065, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08978271, + "step": 14707, + "time_per_iteration": 2.669102430343628 + }, + { + "auxiliary_loss_clip": 0.06309316, + "auxiliary_loss_mlp": 0.01249357, + "balance_loss_clip": 0.06254923, + "balance_loss_mlp": 0.01248359, + "epoch": 0.8842928002404931, + "flos": 57928668278400.0, + "grad_norm": 0.8267722296709221, + "language_loss": 0.60377383, + "learning_rate": 1.387373629491173e-07, + "loss": 0.67936051, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00997162, + "step": 14708, + "time_per_iteration": 2.9923834800720215 + }, + { + "auxiliary_loss_clip": 0.06393933, + "auxiliary_loss_mlp": 0.01265055, + "balance_loss_clip": 0.06269866, + "balance_loss_mlp": 0.01257062, + "epoch": 0.884352923493161, + "flos": 41473517690880.0, + "grad_norm": 1.6630393907624046, + "language_loss": 0.67774945, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.75433934, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.07989502, + "step": 14709, + "time_per_iteration": 2.713012933731079 + }, + { + "auxiliary_loss_clip": 0.06405933, + "auxiliary_loss_mlp": 0.01267155, + "balance_loss_clip": 0.06270263, + "balance_loss_mlp": 0.0125667, + "epoch": 0.884413046745829, + "flos": 46552677909120.0, + "grad_norm": 1.5766892978129978, + "language_loss": 0.62479722, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.70152819, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10479736, + "step": 14710, + "time_per_iteration": 2.767439603805542 + }, + { + "auxiliary_loss_clip": 0.06396196, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06270634, + "balance_loss_mlp": 0.01255924, + "epoch": 0.8844731699984969, + "flos": 19141331978880.0, + "grad_norm": 3.098385376741182, + "language_loss": 0.63903069, + "learning_rate": 1.38310100580431e-07, + "loss": 0.7156347, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08282471, + "step": 14711, + "time_per_iteration": 2.5306129455566406 + }, + { + "auxiliary_loss_clip": 0.06406876, + "auxiliary_loss_mlp": 0.01265093, + "balance_loss_clip": 0.06271248, + "balance_loss_mlp": 0.01255872, + "epoch": 0.8845332932511649, + "flos": 23267736547200.0, + "grad_norm": 1.7593747867980092, + "language_loss": 0.76430249, + "learning_rate": 1.38167820974606e-07, + "loss": 0.84102213, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09222412, + "step": 14712, + "time_per_iteration": 2.5903677940368652 + }, + { + "auxiliary_loss_clip": 0.06404536, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.06273165, + "balance_loss_mlp": 0.01258246, + "epoch": 0.8845934165038328, + "flos": 17570027888640.0, + "grad_norm": 2.1477538781818777, + "language_loss": 0.81665063, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.89336956, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09112549, + "step": 14713, + "time_per_iteration": 2.4900383949279785 + }, + { + "auxiliary_loss_clip": 0.06397398, + "auxiliary_loss_mlp": 0.01261797, + "balance_loss_clip": 0.06267774, + "balance_loss_mlp": 0.01252535, + "epoch": 0.8846535397565009, + "flos": 27492336501120.0, + "grad_norm": 1.69166035128251, + "language_loss": 0.55999333, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.63658524, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0927124, + "step": 14714, + "time_per_iteration": 2.534978151321411 + }, + { + "auxiliary_loss_clip": 0.06400881, + "auxiliary_loss_mlp": 0.01262206, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01253587, + "epoch": 0.8847136630091688, + "flos": 28768020986880.0, + "grad_norm": 1.6242716538465463, + "language_loss": 0.73918736, + "learning_rate": 1.377414057838755e-07, + "loss": 0.81581825, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08612061, + "step": 14715, + "time_per_iteration": 3.9610276222229004 + }, + { + "auxiliary_loss_clip": 0.06403157, + "auxiliary_loss_mlp": 0.0126659, + "balance_loss_clip": 0.06271261, + "balance_loss_mlp": 0.01257387, + "epoch": 0.8847737862618368, + "flos": 23483623392000.0, + "grad_norm": 1.4848157988551902, + "language_loss": 0.75333452, + "learning_rate": 1.375994086138461e-07, + "loss": 0.83003205, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09204102, + "step": 14716, + "time_per_iteration": 2.5149214267730713 + }, + { + "auxiliary_loss_clip": 0.06399931, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06271353, + "balance_loss_mlp": 0.01252676, + "epoch": 0.8848339095145047, + "flos": 18666777980160.0, + "grad_norm": 1.9564063786190344, + "language_loss": 0.7096256, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.78624487, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09320068, + "step": 14717, + "time_per_iteration": 2.5454225540161133 + }, + { + "auxiliary_loss_clip": 0.06393513, + "auxiliary_loss_mlp": 0.01261753, + "balance_loss_clip": 0.06269147, + "balance_loss_mlp": 0.01253003, + "epoch": 0.8848940327671727, + "flos": 32278518518400.0, + "grad_norm": 3.4354664808670607, + "language_loss": 0.74253142, + "learning_rate": 1.373156261464208e-07, + "loss": 0.81908405, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.08740234, + "step": 14718, + "time_per_iteration": 2.567211627960205 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01261671, + "balance_loss_clip": 0.06267846, + "balance_loss_mlp": 0.01252033, + "epoch": 0.8849541560198406, + "flos": 24028225004160.0, + "grad_norm": 1.4551817490086836, + "language_loss": 0.78617239, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.86279714, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09643555, + "step": 14719, + "time_per_iteration": 4.014564514160156 + }, + { + "auxiliary_loss_clip": 0.06404986, + "auxiliary_loss_mlp": 0.01265664, + "balance_loss_clip": 0.06272528, + "balance_loss_mlp": 0.01257254, + "epoch": 0.8850142792725086, + "flos": 16878664650240.0, + "grad_norm": 2.2822989614167515, + "language_loss": 0.72013068, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.79683721, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08410645, + "step": 14720, + "time_per_iteration": 2.5508828163146973 + }, + { + "auxiliary_loss_clip": 0.06405028, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.0125441, + "epoch": 0.8850744025251767, + "flos": 24030824480640.0, + "grad_norm": 1.7235256005815422, + "language_loss": 0.8247689, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.90145624, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09295654, + "step": 14721, + "time_per_iteration": 3.969202995300293 + }, + { + "auxiliary_loss_clip": 0.06402031, + "auxiliary_loss_mlp": 0.01265058, + "balance_loss_clip": 0.06270001, + "balance_loss_mlp": 0.01255521, + "epoch": 0.8851345257778446, + "flos": 47965816218240.0, + "grad_norm": 1.964786564262649, + "language_loss": 0.62954146, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.70621228, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09545898, + "step": 14722, + "time_per_iteration": 2.7305383682250977 + }, + { + "auxiliary_loss_clip": 0.06398532, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06266725, + "balance_loss_mlp": 0.01255242, + "epoch": 0.8851946490305126, + "flos": 36619761755520.0, + "grad_norm": 1.7414583880111092, + "language_loss": 0.68572694, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.76236361, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09881592, + "step": 14723, + "time_per_iteration": 2.6492748260498047 + }, + { + "auxiliary_loss_clip": 0.06399927, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06269683, + "balance_loss_mlp": 0.01254834, + "epoch": 0.8852547722831805, + "flos": 21550802860800.0, + "grad_norm": 1.6351451905657401, + "language_loss": 0.77568376, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.85232049, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08917236, + "step": 14724, + "time_per_iteration": 2.5171244144439697 + }, + { + "auxiliary_loss_clip": 0.06308331, + "auxiliary_loss_mlp": 0.01249732, + "balance_loss_clip": 0.06254104, + "balance_loss_mlp": 0.01248703, + "epoch": 0.8853148955358485, + "flos": 63077876110080.0, + "grad_norm": 0.783597517732296, + "language_loss": 0.58947587, + "learning_rate": 1.363246127376143e-07, + "loss": 0.66505647, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01029205, + "step": 14725, + "time_per_iteration": 3.0300509929656982 + }, + { + "auxiliary_loss_clip": 0.06410657, + "auxiliary_loss_mlp": 0.01267993, + "balance_loss_clip": 0.06271988, + "balance_loss_mlp": 0.01257962, + "epoch": 0.8853750187885164, + "flos": 18155606947200.0, + "grad_norm": 1.866018411089085, + "language_loss": 0.68803233, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.76481885, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1003418, + "step": 14726, + "time_per_iteration": 2.4636669158935547 + }, + { + "auxiliary_loss_clip": 0.06399886, + "auxiliary_loss_mlp": 0.01265553, + "balance_loss_clip": 0.06270148, + "balance_loss_mlp": 0.01257036, + "epoch": 0.8854351420411845, + "flos": 39580500648960.0, + "grad_norm": 1.2347060660537659, + "language_loss": 0.6949172, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.77157164, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08508301, + "step": 14727, + "time_per_iteration": 2.736482858657837 + }, + { + "auxiliary_loss_clip": 0.06401646, + "auxiliary_loss_mlp": 0.01263244, + "balance_loss_clip": 0.06271316, + "balance_loss_mlp": 0.01254184, + "epoch": 0.8854952652938524, + "flos": 23776140395520.0, + "grad_norm": 1.6133668439229503, + "language_loss": 0.70217514, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.77882403, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09063721, + "step": 14728, + "time_per_iteration": 2.5058109760284424 + }, + { + "auxiliary_loss_clip": 0.0640386, + "auxiliary_loss_mlp": 0.01261995, + "balance_loss_clip": 0.06270647, + "balance_loss_mlp": 0.01252434, + "epoch": 0.8855553885465204, + "flos": 18295199049600.0, + "grad_norm": 2.1275999023059673, + "language_loss": 0.66818655, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.74484515, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09558105, + "step": 14729, + "time_per_iteration": 2.521054267883301 + }, + { + "auxiliary_loss_clip": 0.06401055, + "auxiliary_loss_mlp": 0.0126072, + "balance_loss_clip": 0.06272933, + "balance_loss_mlp": 0.01252513, + "epoch": 0.8856155117991883, + "flos": 36876374484480.0, + "grad_norm": 1.8562662991246879, + "language_loss": 0.6310026, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.70762038, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08209229, + "step": 14730, + "time_per_iteration": 2.6651859283447266 + }, + { + "auxiliary_loss_clip": 0.06397253, + "auxiliary_loss_mlp": 0.01262249, + "balance_loss_clip": 0.0627026, + "balance_loss_mlp": 0.01253553, + "epoch": 0.8856756350518563, + "flos": 22170441404160.0, + "grad_norm": 1.6656970883539435, + "language_loss": 0.79226112, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.86885613, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.0869751, + "step": 14731, + "time_per_iteration": 2.5325546264648438 + }, + { + "auxiliary_loss_clip": 0.06402338, + "auxiliary_loss_mlp": 0.0126849, + "balance_loss_clip": 0.06271227, + "balance_loss_mlp": 0.01259746, + "epoch": 0.8857357583045242, + "flos": 20747282532480.0, + "grad_norm": 1.5228493349215588, + "language_loss": 0.83495152, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.91165972, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08752441, + "step": 14732, + "time_per_iteration": 2.4797542095184326 + }, + { + "auxiliary_loss_clip": 0.06308968, + "auxiliary_loss_mlp": 0.01249256, + "balance_loss_clip": 0.06254347, + "balance_loss_mlp": 0.01248295, + "epoch": 0.8857958815571922, + "flos": 69913815431040.0, + "grad_norm": 0.8972181039902289, + "language_loss": 0.59697849, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.67256069, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00959015, + "step": 14733, + "time_per_iteration": 3.1617019176483154 + }, + { + "auxiliary_loss_clip": 0.06401418, + "auxiliary_loss_mlp": 0.01263495, + "balance_loss_clip": 0.06271139, + "balance_loss_mlp": 0.01253863, + "epoch": 0.8858560048098603, + "flos": 15127294135680.0, + "grad_norm": 1.8039314213733861, + "language_loss": 0.6699304, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.74657953, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09625244, + "step": 14734, + "time_per_iteration": 2.464336395263672 + }, + { + "auxiliary_loss_clip": 0.06398517, + "auxiliary_loss_mlp": 0.0126497, + "balance_loss_clip": 0.06271675, + "balance_loss_mlp": 0.01255905, + "epoch": 0.8859161280625282, + "flos": 16615469376000.0, + "grad_norm": 1.96139376058703, + "language_loss": 0.75832766, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.83496255, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09057617, + "step": 14735, + "time_per_iteration": 2.499420166015625 + }, + { + "auxiliary_loss_clip": 0.06406797, + "auxiliary_loss_mlp": 0.01263237, + "balance_loss_clip": 0.06273414, + "balance_loss_mlp": 0.0125404, + "epoch": 0.8859762513151962, + "flos": 18699915070080.0, + "grad_norm": 2.4052129022673507, + "language_loss": 0.70763892, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.78433919, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09197998, + "step": 14736, + "time_per_iteration": 2.4729537963867188 + }, + { + "auxiliary_loss_clip": 0.06403352, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06272499, + "balance_loss_mlp": 0.0125423, + "epoch": 0.8860363745678641, + "flos": 19542987325440.0, + "grad_norm": 1.711220105447237, + "language_loss": 0.8489334, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.92560041, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09124756, + "step": 14737, + "time_per_iteration": 2.5370328426361084 + }, + { + "auxiliary_loss_clip": 0.06409991, + "auxiliary_loss_mlp": 0.01267221, + "balance_loss_clip": 0.06270722, + "balance_loss_mlp": 0.01256623, + "epoch": 0.8860964978205321, + "flos": 35963673886080.0, + "grad_norm": 1.905285473109681, + "language_loss": 0.67920482, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.75597692, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.10601807, + "step": 14738, + "time_per_iteration": 2.6281023025512695 + }, + { + "auxiliary_loss_clip": 0.06409208, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06271783, + "balance_loss_mlp": 0.01253588, + "epoch": 0.8861566210732, + "flos": 21218524295040.0, + "grad_norm": 1.6152938283716511, + "language_loss": 0.75368971, + "learning_rate": 1.343529763547222e-07, + "loss": 0.83041853, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10083008, + "step": 14739, + "time_per_iteration": 2.5536062717437744 + }, + { + "auxiliary_loss_clip": 0.06398404, + "auxiliary_loss_mlp": 0.01263694, + "balance_loss_clip": 0.06269807, + "balance_loss_mlp": 0.01255462, + "epoch": 0.886216744325868, + "flos": 14613984823680.0, + "grad_norm": 1.863446316101088, + "language_loss": 0.87359273, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.95021367, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08227539, + "step": 14740, + "time_per_iteration": 2.4583516120910645 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06271682, + "balance_loss_mlp": 0.01254206, + "epoch": 0.886276867578536, + "flos": 26658949392000.0, + "grad_norm": 1.8967743887192066, + "language_loss": 0.63574475, + "learning_rate": 1.34072445601471e-07, + "loss": 0.71238875, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09301758, + "step": 14741, + "time_per_iteration": 2.5750632286071777 + }, + { + "auxiliary_loss_clip": 0.06400025, + "auxiliary_loss_mlp": 0.01268656, + "balance_loss_clip": 0.06270176, + "balance_loss_mlp": 0.01259149, + "epoch": 0.886336990831204, + "flos": 16769735942400.0, + "grad_norm": 1.8023239022858395, + "language_loss": 0.7326563, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.8093431, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09509277, + "step": 14742, + "time_per_iteration": 2.4618430137634277 + }, + { + "auxiliary_loss_clip": 0.06399601, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01254147, + "epoch": 0.8863971140838719, + "flos": 25272365627520.0, + "grad_norm": 1.947275844906342, + "language_loss": 0.59373927, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.67037159, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09484863, + "step": 14743, + "time_per_iteration": 2.5227532386779785 + }, + { + "auxiliary_loss_clip": 0.0640617, + "auxiliary_loss_mlp": 0.01269532, + "balance_loss_clip": 0.06273371, + "balance_loss_mlp": 0.01258392, + "epoch": 0.8864572373365399, + "flos": 23411060156160.0, + "grad_norm": 1.6050209562169269, + "language_loss": 0.60046476, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.67722178, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.11138916, + "step": 14744, + "time_per_iteration": 3.923879861831665 + }, + { + "auxiliary_loss_clip": 0.06401066, + "auxiliary_loss_mlp": 0.01263442, + "balance_loss_clip": 0.06270179, + "balance_loss_mlp": 0.01254001, + "epoch": 0.8865173605892078, + "flos": 18554201619840.0, + "grad_norm": 1.5756497333321051, + "language_loss": 0.76668805, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.84333313, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09436035, + "step": 14745, + "time_per_iteration": 2.4856021404266357 + }, + { + "auxiliary_loss_clip": 0.06399768, + "auxiliary_loss_mlp": 0.01264389, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01255251, + "epoch": 0.8865774838418758, + "flos": 19031858219520.0, + "grad_norm": 1.8923480144182985, + "language_loss": 0.77594328, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.85258484, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09136963, + "step": 14746, + "time_per_iteration": 2.5016493797302246 + }, + { + "auxiliary_loss_clip": 0.06402637, + "auxiliary_loss_mlp": 0.0126552, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.0125618, + "epoch": 0.8866376070945439, + "flos": 22169602863360.0, + "grad_norm": 1.8379446681951996, + "language_loss": 0.77303553, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.84971702, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09338379, + "step": 14747, + "time_per_iteration": 2.4861974716186523 + }, + { + "auxiliary_loss_clip": 0.06395779, + "auxiliary_loss_mlp": 0.01263313, + "balance_loss_clip": 0.06268896, + "balance_loss_mlp": 0.01254748, + "epoch": 0.8866977303472118, + "flos": 20710539717120.0, + "grad_norm": 1.530148448203103, + "language_loss": 0.82762802, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.90421903, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08551025, + "step": 14748, + "time_per_iteration": 2.502021551132202 + }, + { + "auxiliary_loss_clip": 0.06401731, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01257162, + "epoch": 0.8867578535998798, + "flos": 48804779623680.0, + "grad_norm": 4.373040844685136, + "language_loss": 0.77577972, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.8524648, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09613037, + "step": 14749, + "time_per_iteration": 2.727158546447754 + }, + { + "auxiliary_loss_clip": 0.06406604, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01255617, + "epoch": 0.8868179768525477, + "flos": 21111608085120.0, + "grad_norm": 2.390428852813455, + "language_loss": 0.7003032, + "learning_rate": 1.328135602550451e-07, + "loss": 0.77702081, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09539795, + "step": 14750, + "time_per_iteration": 2.5537924766540527 + }, + { + "auxiliary_loss_clip": 0.06399231, + "auxiliary_loss_mlp": 0.01264164, + "balance_loss_clip": 0.06269191, + "balance_loss_mlp": 0.01255325, + "epoch": 0.8868781001052157, + "flos": 21836653464960.0, + "grad_norm": 1.669612343662207, + "language_loss": 0.59316975, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.66980374, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08837891, + "step": 14751, + "time_per_iteration": 2.4961390495300293 + }, + { + "auxiliary_loss_clip": 0.06401397, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06271613, + "balance_loss_mlp": 0.01252966, + "epoch": 0.8869382233578836, + "flos": 13521469363200.0, + "grad_norm": 2.201193429076569, + "language_loss": 0.81327409, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.88991326, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09545898, + "step": 14752, + "time_per_iteration": 2.527376651763916 + }, + { + "auxiliary_loss_clip": 0.06405862, + "auxiliary_loss_mlp": 0.01265552, + "balance_loss_clip": 0.0626955, + "balance_loss_mlp": 0.01255365, + "epoch": 0.8869983466105517, + "flos": 22710598750080.0, + "grad_norm": 1.7397771398756352, + "language_loss": 0.80421031, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.8809244, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10186768, + "step": 14753, + "time_per_iteration": 2.496861457824707 + }, + { + "auxiliary_loss_clip": 0.0639924, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06270298, + "balance_loss_mlp": 0.01256569, + "epoch": 0.8870584698632196, + "flos": 15346115873280.0, + "grad_norm": 1.9358713626182904, + "language_loss": 0.65389812, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.73054528, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08911133, + "step": 14754, + "time_per_iteration": 2.481266736984253 + }, + { + "auxiliary_loss_clip": 0.06402417, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06271257, + "balance_loss_mlp": 0.0125654, + "epoch": 0.8871185931158876, + "flos": 26623003190400.0, + "grad_norm": 1.899322495177458, + "language_loss": 0.7491895, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.82586813, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08911133, + "step": 14755, + "time_per_iteration": 3.9851739406585693 + }, + { + "auxiliary_loss_clip": 0.06400773, + "auxiliary_loss_mlp": 0.01269007, + "balance_loss_clip": 0.06269758, + "balance_loss_mlp": 0.01259745, + "epoch": 0.8871787163685555, + "flos": 21805528872960.0, + "grad_norm": 1.4050001258190605, + "language_loss": 0.78016531, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.85686314, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09259033, + "step": 14756, + "time_per_iteration": 2.4884493350982666 + }, + { + "auxiliary_loss_clip": 0.06399755, + "auxiliary_loss_mlp": 0.01263375, + "balance_loss_clip": 0.06268262, + "balance_loss_mlp": 0.01253719, + "epoch": 0.8872388396212235, + "flos": 14908262762880.0, + "grad_norm": 2.804203292047771, + "language_loss": 0.77138597, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.84801722, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09655762, + "step": 14757, + "time_per_iteration": 2.458031177520752 + }, + { + "auxiliary_loss_clip": 0.06396057, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06269957, + "balance_loss_mlp": 0.0125711, + "epoch": 0.8872989628738914, + "flos": 26439331040640.0, + "grad_norm": 1.7403499564680318, + "language_loss": 0.68120039, + "learning_rate": 1.316993656021632e-07, + "loss": 0.75782031, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.0881958, + "step": 14758, + "time_per_iteration": 2.5202882289886475 + }, + { + "auxiliary_loss_clip": 0.0639921, + "auxiliary_loss_mlp": 0.01265437, + "balance_loss_clip": 0.06269047, + "balance_loss_mlp": 0.01256473, + "epoch": 0.8873590861265594, + "flos": 48153597217920.0, + "grad_norm": 1.6386846273703985, + "language_loss": 0.68983102, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.76647747, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08972168, + "step": 14759, + "time_per_iteration": 4.159254550933838 + }, + { + "auxiliary_loss_clip": 0.06398255, + "auxiliary_loss_mlp": 0.01263758, + "balance_loss_clip": 0.0626884, + "balance_loss_mlp": 0.01254418, + "epoch": 0.8874192093792275, + "flos": 18338901753600.0, + "grad_norm": 2.3604242969885707, + "language_loss": 0.74442339, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.82104361, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09350586, + "step": 14760, + "time_per_iteration": 2.4698567390441895 + }, + { + "auxiliary_loss_clip": 0.06404065, + "auxiliary_loss_mlp": 0.01273255, + "balance_loss_clip": 0.06270099, + "balance_loss_mlp": 0.01263736, + "epoch": 0.8874793326318954, + "flos": 17899916613120.0, + "grad_norm": 2.2735692439153237, + "language_loss": 0.7632544, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.84002757, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09521484, + "step": 14761, + "time_per_iteration": 3.89682936668396 + }, + { + "auxiliary_loss_clip": 0.06400929, + "auxiliary_loss_mlp": 0.01263207, + "balance_loss_clip": 0.06268443, + "balance_loss_mlp": 0.01254052, + "epoch": 0.8875394558845634, + "flos": 31110169512960.0, + "grad_norm": 1.636429643501416, + "language_loss": 0.61458367, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.69122505, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09155273, + "step": 14762, + "time_per_iteration": 2.632906436920166 + }, + { + "auxiliary_loss_clip": 0.06397983, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06268691, + "balance_loss_mlp": 0.01254406, + "epoch": 0.8875995791372313, + "flos": 21148392827520.0, + "grad_norm": 1.751322601119736, + "language_loss": 0.64450324, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.72112966, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.10253906, + "step": 14763, + "time_per_iteration": 2.6457977294921875 + }, + { + "auxiliary_loss_clip": 0.06402642, + "auxiliary_loss_mlp": 0.01261912, + "balance_loss_clip": 0.0627153, + "balance_loss_mlp": 0.01252452, + "epoch": 0.8876597023898993, + "flos": 17460554129280.0, + "grad_norm": 2.4705520367844924, + "language_loss": 0.70655769, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.78320324, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09460449, + "step": 14764, + "time_per_iteration": 2.5632097721099854 + }, + { + "auxiliary_loss_clip": 0.06406358, + "auxiliary_loss_mlp": 0.01262549, + "balance_loss_clip": 0.06270573, + "balance_loss_mlp": 0.0125315, + "epoch": 0.8877198256425672, + "flos": 22714036767360.0, + "grad_norm": 2.109687309094666, + "language_loss": 0.65986574, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.73655486, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09399414, + "step": 14765, + "time_per_iteration": 2.5792641639709473 + }, + { + "auxiliary_loss_clip": 0.06397182, + "auxiliary_loss_mlp": 0.01265927, + "balance_loss_clip": 0.06269948, + "balance_loss_mlp": 0.01257532, + "epoch": 0.8877799488952353, + "flos": 24541995513600.0, + "grad_norm": 1.5120500891311812, + "language_loss": 0.76344001, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.84007108, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08395386, + "step": 14766, + "time_per_iteration": 2.6661949157714844 + }, + { + "auxiliary_loss_clip": 0.06396287, + "auxiliary_loss_mlp": 0.01267323, + "balance_loss_clip": 0.06269039, + "balance_loss_mlp": 0.01258448, + "epoch": 0.8878400721479032, + "flos": 20965433437440.0, + "grad_norm": 1.7405820386467394, + "language_loss": 0.73762059, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.81425673, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08880615, + "step": 14767, + "time_per_iteration": 2.5033586025238037 + }, + { + "auxiliary_loss_clip": 0.06396404, + "auxiliary_loss_mlp": 0.01261133, + "balance_loss_clip": 0.06271426, + "balance_loss_mlp": 0.01252556, + "epoch": 0.8879001954005712, + "flos": 25301268086400.0, + "grad_norm": 3.2803844975125003, + "language_loss": 0.71396875, + "learning_rate": 1.303129987538778e-07, + "loss": 0.79054409, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08569336, + "step": 14768, + "time_per_iteration": 2.6661486625671387 + }, + { + "auxiliary_loss_clip": 0.06398378, + "auxiliary_loss_mlp": 0.01263834, + "balance_loss_clip": 0.06268355, + "balance_loss_mlp": 0.01255001, + "epoch": 0.8879603186532391, + "flos": 23192028783360.0, + "grad_norm": 1.6618639759125788, + "language_loss": 0.70540762, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.78202975, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08831787, + "step": 14769, + "time_per_iteration": 2.512924909591675 + }, + { + "auxiliary_loss_clip": 0.06403679, + "auxiliary_loss_mlp": 0.01261408, + "balance_loss_clip": 0.06273782, + "balance_loss_mlp": 0.01252354, + "epoch": 0.8880204419059071, + "flos": 13659342456960.0, + "grad_norm": 2.4814123968549127, + "language_loss": 0.67167079, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.74832165, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09051514, + "step": 14770, + "time_per_iteration": 2.490354061126709 + }, + { + "auxiliary_loss_clip": 0.06397928, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 0.0627326, + "balance_loss_mlp": 0.01257055, + "epoch": 0.888080565158575, + "flos": 20638228043520.0, + "grad_norm": 2.5502983496635467, + "language_loss": 0.65957916, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.73621869, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08972168, + "step": 14771, + "time_per_iteration": 2.528031349182129 + }, + { + "auxiliary_loss_clip": 0.06400346, + "auxiliary_loss_mlp": 0.01261846, + "balance_loss_clip": 0.06270881, + "balance_loss_mlp": 0.01253269, + "epoch": 0.888140688411243, + "flos": 28627338781440.0, + "grad_norm": 1.5852554919043456, + "language_loss": 0.82730216, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.90392411, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08569336, + "step": 14772, + "time_per_iteration": 2.571366786956787 + }, + { + "auxiliary_loss_clip": 0.06395004, + "auxiliary_loss_mlp": 0.01263606, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01255226, + "epoch": 0.8882008116639111, + "flos": 25527301274880.0, + "grad_norm": 1.5194647492720985, + "language_loss": 0.76408058, + "learning_rate": 1.296224737033258e-07, + "loss": 0.84066665, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08374023, + "step": 14773, + "time_per_iteration": 2.5512452125549316 + }, + { + "auxiliary_loss_clip": 0.06396265, + "auxiliary_loss_mlp": 0.01264026, + "balance_loss_clip": 0.0626926, + "balance_loss_mlp": 0.01255253, + "epoch": 0.888260934916579, + "flos": 27681249530880.0, + "grad_norm": 1.7554405652029053, + "language_loss": 0.75057411, + "learning_rate": 1.294845814469907e-07, + "loss": 0.82717705, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08782959, + "step": 14774, + "time_per_iteration": 2.580103635787964 + }, + { + "auxiliary_loss_clip": 0.0640349, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 0.06272057, + "balance_loss_mlp": 0.01256089, + "epoch": 0.888321058169247, + "flos": 21616615843200.0, + "grad_norm": 2.5677131374215945, + "language_loss": 0.72789186, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.80458438, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09686279, + "step": 14775, + "time_per_iteration": 2.4722659587860107 + }, + { + "auxiliary_loss_clip": 0.06401627, + "auxiliary_loss_mlp": 0.0126518, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01256311, + "epoch": 0.8883811814219149, + "flos": 18154768406400.0, + "grad_norm": 1.7615915737374597, + "language_loss": 0.80541307, + "learning_rate": 1.292090097299432e-07, + "loss": 0.88208115, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08862305, + "step": 14776, + "time_per_iteration": 2.488631010055542 + }, + { + "auxiliary_loss_clip": 0.06408714, + "auxiliary_loss_mlp": 0.01262464, + "balance_loss_clip": 0.0627206, + "balance_loss_mlp": 0.01252826, + "epoch": 0.8884413046745829, + "flos": 28331341833600.0, + "grad_norm": 1.8936331280996206, + "language_loss": 0.6894474, + "learning_rate": 1.290713302796802e-07, + "loss": 0.76615912, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.09637451, + "step": 14777, + "time_per_iteration": 2.5410220623016357 + }, + { + "auxiliary_loss_clip": 0.06399784, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01255213, + "epoch": 0.8885014279272508, + "flos": 15164162732160.0, + "grad_norm": 1.7667313656152588, + "language_loss": 0.71248996, + "learning_rate": 1.2893372177522e-07, + "loss": 0.78913081, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09094238, + "step": 14778, + "time_per_iteration": 2.4593677520751953 + }, + { + "auxiliary_loss_clip": 0.06401107, + "auxiliary_loss_mlp": 0.01262965, + "balance_loss_clip": 0.0627052, + "balance_loss_mlp": 0.01254, + "epoch": 0.8885615511799189, + "flos": 19105721193600.0, + "grad_norm": 1.5500678278821722, + "language_loss": 0.77619112, + "learning_rate": 1.287961842217804e-07, + "loss": 0.85283184, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08966064, + "step": 14779, + "time_per_iteration": 2.467658519744873 + }, + { + "auxiliary_loss_clip": 0.06312528, + "auxiliary_loss_mlp": 0.01252679, + "balance_loss_clip": 0.06258145, + "balance_loss_mlp": 0.01251605, + "epoch": 0.8886216744325868, + "flos": 51200735270400.0, + "grad_norm": 0.84904602104289, + "language_loss": 0.56864655, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.64429867, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01075745, + "step": 14780, + "time_per_iteration": 2.908271074295044 + }, + { + "auxiliary_loss_clip": 0.06315291, + "auxiliary_loss_mlp": 0.01249856, + "balance_loss_clip": 0.0626081, + "balance_loss_mlp": 0.01249003, + "epoch": 0.8886817976852548, + "flos": 61633571281920.0, + "grad_norm": 0.7676462046556519, + "language_loss": 0.62468183, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.7003333, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00855255, + "step": 14781, + "time_per_iteration": 3.2015137672424316 + }, + { + "auxiliary_loss_clip": 0.06310038, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_clip": 0.06255679, + "balance_loss_mlp": 0.01248576, + "epoch": 0.8887419209379227, + "flos": 60664464086400.0, + "grad_norm": 0.7663905748294921, + "language_loss": 0.58062631, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.65622199, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00952911, + "step": 14782, + "time_per_iteration": 2.9721531867980957 + }, + { + "auxiliary_loss_clip": 0.06399249, + "auxiliary_loss_mlp": 0.0126328, + "balance_loss_clip": 0.06271558, + "balance_loss_mlp": 0.01255472, + "epoch": 0.8888020441905907, + "flos": 29213630599680.0, + "grad_norm": 1.6461458074975241, + "language_loss": 0.65778244, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.73440778, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.07806396, + "step": 14783, + "time_per_iteration": 3.9794864654541016 + }, + { + "auxiliary_loss_clip": 0.0640447, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01252867, + "epoch": 0.8888621674432586, + "flos": 22169057811840.0, + "grad_norm": 1.463778407652058, + "language_loss": 0.77528048, + "learning_rate": 1.281095609023415e-07, + "loss": 0.8519516, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09765625, + "step": 14784, + "time_per_iteration": 2.5277795791625977 + }, + { + "auxiliary_loss_clip": 0.06403342, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 0.06270555, + "balance_loss_mlp": 0.01262554, + "epoch": 0.8889222906959267, + "flos": 27680243281920.0, + "grad_norm": 3.057965191651345, + "language_loss": 0.61165977, + "learning_rate": 1.279724491644565e-07, + "loss": 0.68841451, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09576416, + "step": 14785, + "time_per_iteration": 2.580399990081787 + }, + { + "auxiliary_loss_clip": 0.06400205, + "auxiliary_loss_mlp": 0.01265322, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01256251, + "epoch": 0.8889824139485947, + "flos": 14173029112320.0, + "grad_norm": 1.975478801188687, + "language_loss": 0.65172708, + "learning_rate": 1.278354084140445e-07, + "loss": 0.72838235, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09069824, + "step": 14786, + "time_per_iteration": 2.4636151790618896 + }, + { + "auxiliary_loss_clip": 0.06406666, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 0.06271188, + "balance_loss_mlp": 0.01256082, + "epoch": 0.8890425372012626, + "flos": 12856828377600.0, + "grad_norm": 7.700688456498016, + "language_loss": 0.85678732, + "learning_rate": 1.276984386563009e-07, + "loss": 0.93352735, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11260986, + "step": 14787, + "time_per_iteration": 2.4787025451660156 + }, + { + "auxiliary_loss_clip": 0.06403594, + "auxiliary_loss_mlp": 0.012634, + "balance_loss_clip": 0.0627303, + "balance_loss_mlp": 0.01254645, + "epoch": 0.8891026604539306, + "flos": 21695719697280.0, + "grad_norm": 2.351201834821054, + "language_loss": 0.70638961, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.7830596, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08758545, + "step": 14788, + "time_per_iteration": 2.504624128341675 + }, + { + "auxiliary_loss_clip": 0.06397562, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 0.06271622, + "balance_loss_mlp": 0.01254226, + "epoch": 0.8891627837065985, + "flos": 21877840546560.0, + "grad_norm": 2.261908173801477, + "language_loss": 0.70175314, + "learning_rate": 1.274247121395935e-07, + "loss": 0.77835929, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08825684, + "step": 14789, + "time_per_iteration": 2.513617992401123 + }, + { + "auxiliary_loss_clip": 0.06400102, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01257505, + "epoch": 0.8892229069592665, + "flos": 21586707135360.0, + "grad_norm": 1.4895103847506954, + "language_loss": 0.70829117, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.78495526, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08807373, + "step": 14790, + "time_per_iteration": 2.5025522708892822 + }, + { + "auxiliary_loss_clip": 0.06399814, + "auxiliary_loss_mlp": 0.01263203, + "balance_loss_clip": 0.0627079, + "balance_loss_mlp": 0.01254333, + "epoch": 0.8892830302119344, + "flos": 23082680805120.0, + "grad_norm": 1.5934920580532534, + "language_loss": 0.7301842, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.80681431, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08874512, + "step": 14791, + "time_per_iteration": 2.5074832439422607 + }, + { + "auxiliary_loss_clip": 0.06399459, + "auxiliary_loss_mlp": 0.01265691, + "balance_loss_clip": 0.06273172, + "balance_loss_mlp": 0.01256661, + "epoch": 0.8893431534646025, + "flos": 23078194611840.0, + "grad_norm": 1.4776865540614907, + "language_loss": 0.74067426, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.81732577, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09033203, + "step": 14792, + "time_per_iteration": 2.5036797523498535 + }, + { + "auxiliary_loss_clip": 0.06406777, + "auxiliary_loss_mlp": 0.01265351, + "balance_loss_clip": 0.06272233, + "balance_loss_mlp": 0.01255612, + "epoch": 0.8894032767172704, + "flos": 22461449034240.0, + "grad_norm": 1.9207360943675909, + "language_loss": 0.66585648, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.74257779, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09729004, + "step": 14793, + "time_per_iteration": 2.4976742267608643 + }, + { + "auxiliary_loss_clip": 0.06402475, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06269317, + "balance_loss_mlp": 0.01254653, + "epoch": 0.8894633999699384, + "flos": 25345348133760.0, + "grad_norm": 1.568161072745724, + "language_loss": 0.72041291, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.79707754, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09332275, + "step": 14794, + "time_per_iteration": 2.522052764892578 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01265763, + "balance_loss_clip": 0.06268515, + "balance_loss_mlp": 0.01255856, + "epoch": 0.8895235232226063, + "flos": 21000079900800.0, + "grad_norm": 1.4570169942784024, + "language_loss": 0.75557005, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.83227766, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09899902, + "step": 14795, + "time_per_iteration": 3.9239513874053955 + }, + { + "auxiliary_loss_clip": 0.06307139, + "auxiliary_loss_mlp": 0.01249152, + "balance_loss_clip": 0.06252786, + "balance_loss_mlp": 0.01248141, + "epoch": 0.8895836464752743, + "flos": 69752169705600.0, + "grad_norm": 0.7593022455176621, + "language_loss": 0.56138074, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.6369437, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01010132, + "step": 14796, + "time_per_iteration": 3.0498147010803223 + }, + { + "auxiliary_loss_clip": 0.06403103, + "auxiliary_loss_mlp": 0.01262207, + "balance_loss_clip": 0.0627062, + "balance_loss_mlp": 0.01251717, + "epoch": 0.8896437697279422, + "flos": 23228520036480.0, + "grad_norm": 1.6659870416154836, + "language_loss": 0.70651698, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7831701, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.1048584, + "step": 14797, + "time_per_iteration": 2.498295783996582 + }, + { + "auxiliary_loss_clip": 0.06308188, + "auxiliary_loss_mlp": 0.01248559, + "balance_loss_clip": 0.06253885, + "balance_loss_mlp": 0.01247547, + "epoch": 0.8897038929806103, + "flos": 70771786513920.0, + "grad_norm": 0.7861850314361323, + "language_loss": 0.5798108, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.65537828, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.54394531, + "router_z_loss_mlp": 0.01012421, + "step": 14798, + "time_per_iteration": 4.576344728469849 + }, + { + "auxiliary_loss_clip": 0.06401603, + "auxiliary_loss_mlp": 0.01263713, + "balance_loss_clip": 0.06271328, + "balance_loss_mlp": 0.01254802, + "epoch": 0.8897640162332782, + "flos": 19251183081600.0, + "grad_norm": 1.5301138927285134, + "language_loss": 0.79772937, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.8743825, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14799, + "time_per_iteration": 2.51359224319458 + }, + { + "auxiliary_loss_clip": 0.06312159, + "auxiliary_loss_mlp": 0.01250026, + "balance_loss_clip": 0.06257726, + "balance_loss_mlp": 0.01249046, + "epoch": 0.8898241394859462, + "flos": 41372288830080.0, + "grad_norm": 0.862554760801988, + "language_loss": 0.58133441, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.65695632, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00979614, + "step": 14800, + "time_per_iteration": 4.55234169960022 + }, + { + "auxiliary_loss_clip": 0.06398645, + "auxiliary_loss_mlp": 0.01264119, + "balance_loss_clip": 0.06269025, + "balance_loss_mlp": 0.01255149, + "epoch": 0.8898842627386142, + "flos": 18991761240960.0, + "grad_norm": 1.4041707387256148, + "language_loss": 0.66151714, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.73814476, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08978271, + "step": 14801, + "time_per_iteration": 2.4694650173187256 + }, + { + "auxiliary_loss_clip": 0.06408633, + "auxiliary_loss_mlp": 0.01265457, + "balance_loss_clip": 0.06275365, + "balance_loss_mlp": 0.01255354, + "epoch": 0.8899443859912821, + "flos": 13220944295040.0, + "grad_norm": 2.4705918248485266, + "language_loss": 0.75189161, + "learning_rate": 1.256524149358682e-07, + "loss": 0.82863259, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10101318, + "step": 14802, + "time_per_iteration": 2.5068447589874268 + }, + { + "auxiliary_loss_clip": 0.06400315, + "auxiliary_loss_mlp": 0.01262993, + "balance_loss_clip": 0.06273411, + "balance_loss_mlp": 0.01253856, + "epoch": 0.8900045092439501, + "flos": 22681318947840.0, + "grad_norm": 1.6381133195062223, + "language_loss": 0.73893923, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.81557232, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09136963, + "step": 14803, + "time_per_iteration": 2.501056432723999 + }, + { + "auxiliary_loss_clip": 0.06399588, + "auxiliary_loss_mlp": 0.01262871, + "balance_loss_clip": 0.06269965, + "balance_loss_mlp": 0.01253299, + "epoch": 0.890064632496618, + "flos": 21147889703040.0, + "grad_norm": 1.7012691749350357, + "language_loss": 0.71806979, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.79469442, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09576416, + "step": 14804, + "time_per_iteration": 2.4941203594207764 + }, + { + "auxiliary_loss_clip": 0.06400431, + "auxiliary_loss_mlp": 0.01263674, + "balance_loss_clip": 0.0626931, + "balance_loss_mlp": 0.0125425, + "epoch": 0.8901247557492861, + "flos": 23402590894080.0, + "grad_norm": 1.58173973410221, + "language_loss": 0.81494653, + "learning_rate": 1.252451286713123e-07, + "loss": 0.89158762, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09417725, + "step": 14805, + "time_per_iteration": 2.4995498657226562 + }, + { + "auxiliary_loss_clip": 0.06405678, + "auxiliary_loss_mlp": 0.01263308, + "balance_loss_clip": 0.0627286, + "balance_loss_mlp": 0.012537, + "epoch": 0.890184879001954, + "flos": 29177390908800.0, + "grad_norm": 1.7463753983517807, + "language_loss": 0.67048252, + "learning_rate": 1.251095087580505e-07, + "loss": 0.74717236, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09606934, + "step": 14806, + "time_per_iteration": 2.5823683738708496 + }, + { + "auxiliary_loss_clip": 0.06400896, + "auxiliary_loss_mlp": 0.01263841, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.01254429, + "epoch": 0.890245002254622, + "flos": 14432912150400.0, + "grad_norm": 1.7922455060213383, + "language_loss": 0.67830801, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.75495535, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09417725, + "step": 14807, + "time_per_iteration": 2.5916707515716553 + }, + { + "auxiliary_loss_clip": 0.06399317, + "auxiliary_loss_mlp": 0.01263711, + "balance_loss_clip": 0.06270466, + "balance_loss_mlp": 0.01254734, + "epoch": 0.8903051255072899, + "flos": 22388676163200.0, + "grad_norm": 1.744680374078912, + "language_loss": 0.75182492, + "learning_rate": 1.248384822247732e-07, + "loss": 0.82845521, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08972168, + "step": 14808, + "time_per_iteration": 2.5085625648498535 + }, + { + "auxiliary_loss_clip": 0.06401837, + "auxiliary_loss_mlp": 0.0126289, + "balance_loss_clip": 0.06269499, + "balance_loss_mlp": 0.01254408, + "epoch": 0.8903652487599579, + "flos": 20783689931520.0, + "grad_norm": 2.005761137516875, + "language_loss": 0.81256378, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.88921106, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08477783, + "step": 14809, + "time_per_iteration": 2.476633310317993 + }, + { + "auxiliary_loss_clip": 0.06402358, + "auxiliary_loss_mlp": 0.01265966, + "balance_loss_clip": 0.06272776, + "balance_loss_mlp": 0.01256847, + "epoch": 0.8904253720126258, + "flos": 24431180088960.0, + "grad_norm": 1.7755328357455793, + "language_loss": 0.68591714, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.7626003, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09112549, + "step": 14810, + "time_per_iteration": 2.529508590698242 + }, + { + "auxiliary_loss_clip": 0.06403522, + "auxiliary_loss_mlp": 0.01264868, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01255879, + "epoch": 0.8904854952652939, + "flos": 19469962892160.0, + "grad_norm": 1.8706593590776184, + "language_loss": 0.7023586, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.77904254, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08990479, + "step": 14811, + "time_per_iteration": 2.465928792953491 + }, + { + "auxiliary_loss_clip": 0.06403497, + "auxiliary_loss_mlp": 0.01262283, + "balance_loss_clip": 0.06270523, + "balance_loss_mlp": 0.01253414, + "epoch": 0.8905456185179618, + "flos": 50811337347840.0, + "grad_norm": 2.2469275425064743, + "language_loss": 0.65642589, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.73308372, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.08874512, + "step": 14812, + "time_per_iteration": 2.7694013118743896 + }, + { + "auxiliary_loss_clip": 0.06398641, + "auxiliary_loss_mlp": 0.01263031, + "balance_loss_clip": 0.06269665, + "balance_loss_mlp": 0.0125458, + "epoch": 0.8906057417706298, + "flos": 17790568634880.0, + "grad_norm": 1.8555379059256571, + "language_loss": 0.68591535, + "learning_rate": 1.24162160341861e-07, + "loss": 0.76253206, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08453369, + "step": 14813, + "time_per_iteration": 2.463127851486206 + }, + { + "auxiliary_loss_clip": 0.06410507, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 0.06271763, + "balance_loss_mlp": 0.01254455, + "epoch": 0.8906658650232978, + "flos": 21951368104320.0, + "grad_norm": 2.3980423530949944, + "language_loss": 0.76035082, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.83710635, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10583496, + "step": 14814, + "time_per_iteration": 2.528144121170044 + }, + { + "auxiliary_loss_clip": 0.0640672, + "auxiliary_loss_mlp": 0.01263567, + "balance_loss_clip": 0.06271608, + "balance_loss_mlp": 0.01253721, + "epoch": 0.8907259882759657, + "flos": 21294148204800.0, + "grad_norm": 1.8020653483786722, + "language_loss": 0.74471802, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.82142091, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09838867, + "step": 14815, + "time_per_iteration": 2.489715814590454 + }, + { + "auxiliary_loss_clip": 0.06397778, + "auxiliary_loss_mlp": 0.01265289, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256545, + "epoch": 0.8907861115286337, + "flos": 20126595813120.0, + "grad_norm": 1.8276250566401673, + "language_loss": 0.75265664, + "learning_rate": 1.237572207545914e-07, + "loss": 0.82928729, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08740234, + "step": 14816, + "time_per_iteration": 2.5541696548461914 + }, + { + "auxiliary_loss_clip": 0.06403603, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01255386, + "epoch": 0.8908462347813016, + "flos": 20090356122240.0, + "grad_norm": 1.6893324557452318, + "language_loss": 0.77627748, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.85295802, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09063721, + "step": 14817, + "time_per_iteration": 2.551323652267456 + }, + { + "auxiliary_loss_clip": 0.06309056, + "auxiliary_loss_mlp": 0.01250369, + "balance_loss_clip": 0.06254645, + "balance_loss_mlp": 0.01249346, + "epoch": 0.8909063580339697, + "flos": 65522664288000.0, + "grad_norm": 0.7605080836630386, + "language_loss": 0.56617504, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.64176929, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.01023865, + "step": 14818, + "time_per_iteration": 3.1869611740112305 + }, + { + "auxiliary_loss_clip": 0.06404532, + "auxiliary_loss_mlp": 0.01268345, + "balance_loss_clip": 0.06274045, + "balance_loss_mlp": 0.01258939, + "epoch": 0.8909664812866376, + "flos": 29871018207360.0, + "grad_norm": 1.912148882510469, + "language_loss": 0.64619452, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.72292328, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09405518, + "step": 14819, + "time_per_iteration": 2.5426406860351562 + }, + { + "auxiliary_loss_clip": 0.06405222, + "auxiliary_loss_mlp": 0.01263411, + "balance_loss_clip": 0.06273527, + "balance_loss_mlp": 0.01253749, + "epoch": 0.8910266045393056, + "flos": 25454151060480.0, + "grad_norm": 2.0396984257073743, + "language_loss": 0.78438711, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.86107349, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09661865, + "step": 14820, + "time_per_iteration": 2.5258073806762695 + }, + { + "auxiliary_loss_clip": 0.06400749, + "auxiliary_loss_mlp": 0.01264328, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01254493, + "epoch": 0.8910867277919735, + "flos": 24506091239040.0, + "grad_norm": 1.6484795223719642, + "language_loss": 0.76428401, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.84093475, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0982666, + "step": 14821, + "time_per_iteration": 2.5047779083251953 + }, + { + "auxiliary_loss_clip": 0.06309538, + "auxiliary_loss_mlp": 0.01252341, + "balance_loss_clip": 0.06255338, + "balance_loss_mlp": 0.01251348, + "epoch": 0.8911468510446415, + "flos": 60706447781760.0, + "grad_norm": 0.7814778898249498, + "language_loss": 0.59336329, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.66898209, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00993347, + "step": 14822, + "time_per_iteration": 3.009101390838623 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.0126257, + "balance_loss_clip": 0.06270057, + "balance_loss_mlp": 0.01252986, + "epoch": 0.8912069742973094, + "flos": 25344467665920.0, + "grad_norm": 1.812720827369598, + "language_loss": 0.69541264, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.77205515, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09588623, + "step": 14823, + "time_per_iteration": 3.9813392162323 + }, + { + "auxiliary_loss_clip": 0.06397749, + "auxiliary_loss_mlp": 0.0126276, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01254034, + "epoch": 0.8912670975499775, + "flos": 18229427994240.0, + "grad_norm": 1.5767380343948896, + "language_loss": 0.69303524, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.76964033, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08721924, + "step": 14824, + "time_per_iteration": 2.4433348178863525 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01263499, + "balance_loss_clip": 0.06270469, + "balance_loss_mlp": 0.01253694, + "epoch": 0.8913272208026454, + "flos": 26511558860160.0, + "grad_norm": 1.8101659396972392, + "language_loss": 0.70682526, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.78351235, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.0980835, + "step": 14825, + "time_per_iteration": 2.5183238983154297 + }, + { + "auxiliary_loss_clip": 0.06400351, + "auxiliary_loss_mlp": 0.01266596, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01257065, + "epoch": 0.8913873440553134, + "flos": 18807502112640.0, + "grad_norm": 2.0913429177611467, + "language_loss": 0.70963371, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.78630316, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09527588, + "step": 14826, + "time_per_iteration": 2.4554646015167236 + }, + { + "auxiliary_loss_clip": 0.06399363, + "auxiliary_loss_mlp": 0.01263058, + "balance_loss_clip": 0.06270689, + "balance_loss_mlp": 0.01254099, + "epoch": 0.8914474673079814, + "flos": 20890899630720.0, + "grad_norm": 1.9654155681394898, + "language_loss": 0.75443125, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.83105552, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08959961, + "step": 14827, + "time_per_iteration": 2.562636137008667 + }, + { + "auxiliary_loss_clip": 0.06403911, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 0.06272019, + "balance_loss_mlp": 0.01253513, + "epoch": 0.8915075905606493, + "flos": 20957551153920.0, + "grad_norm": 1.5895570130516543, + "language_loss": 0.78462636, + "learning_rate": 1.221438670423336e-07, + "loss": 0.86129922, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09851074, + "step": 14828, + "time_per_iteration": 2.4832942485809326 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01255367, + "epoch": 0.8915677138133173, + "flos": 23083058148480.0, + "grad_norm": 1.576500276860786, + "language_loss": 0.75334942, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.83000845, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09075928, + "step": 14829, + "time_per_iteration": 2.5500776767730713 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01266587, + "balance_loss_clip": 0.06269491, + "balance_loss_mlp": 0.01257479, + "epoch": 0.8916278370659853, + "flos": 23446922503680.0, + "grad_norm": 1.4673976438274965, + "language_loss": 0.84542692, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.92210025, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09112549, + "step": 14830, + "time_per_iteration": 2.5105338096618652 + }, + { + "auxiliary_loss_clip": 0.06398022, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01255996, + "epoch": 0.8916879603186533, + "flos": 25168342383360.0, + "grad_norm": 1.3751500735649531, + "language_loss": 0.75201428, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.82863945, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08496094, + "step": 14831, + "time_per_iteration": 2.5866332054138184 + }, + { + "auxiliary_loss_clip": 0.06403229, + "auxiliary_loss_mlp": 0.01264299, + "balance_loss_clip": 0.06268588, + "balance_loss_mlp": 0.01254548, + "epoch": 0.8917480835713212, + "flos": 20236027645440.0, + "grad_norm": 1.8114871234332395, + "language_loss": 0.73160887, + "learning_rate": 1.216083607088847e-07, + "loss": 0.8082841, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09759521, + "step": 14832, + "time_per_iteration": 2.5055291652679443 + }, + { + "auxiliary_loss_clip": 0.06403124, + "auxiliary_loss_mlp": 0.01264791, + "balance_loss_clip": 0.06270224, + "balance_loss_mlp": 0.01255153, + "epoch": 0.8918082068239892, + "flos": 26108729556480.0, + "grad_norm": 1.7973281023337047, + "language_loss": 0.67450631, + "learning_rate": 1.214746621848355e-07, + "loss": 0.75118548, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09631348, + "step": 14833, + "time_per_iteration": 2.5191965103149414 + }, + { + "auxiliary_loss_clip": 0.06404808, + "auxiliary_loss_mlp": 0.01264902, + "balance_loss_clip": 0.06271433, + "balance_loss_mlp": 0.01254257, + "epoch": 0.8918683300766571, + "flos": 24839124491520.0, + "grad_norm": 1.6520503263058561, + "language_loss": 0.74187469, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.81857181, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10638428, + "step": 14834, + "time_per_iteration": 4.013251781463623 + }, + { + "auxiliary_loss_clip": 0.06404478, + "auxiliary_loss_mlp": 0.01263789, + "balance_loss_clip": 0.06273828, + "balance_loss_mlp": 0.01254288, + "epoch": 0.8919284533293251, + "flos": 22310955901440.0, + "grad_norm": 1.863798974093549, + "language_loss": 0.79164231, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.868325, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0949707, + "step": 14835, + "time_per_iteration": 2.4751789569854736 + }, + { + "auxiliary_loss_clip": 0.063965, + "auxiliary_loss_mlp": 0.01263728, + "balance_loss_clip": 0.06268743, + "balance_loss_mlp": 0.01254698, + "epoch": 0.891988576581993, + "flos": 30381518407680.0, + "grad_norm": 1.4245369026634545, + "language_loss": 0.73941118, + "learning_rate": 1.210739940361689e-07, + "loss": 0.81601346, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.09033203, + "step": 14836, + "time_per_iteration": 2.561229944229126 + }, + { + "auxiliary_loss_clip": 0.06401372, + "auxiliary_loss_mlp": 0.01266792, + "balance_loss_clip": 0.06270787, + "balance_loss_mlp": 0.01257524, + "epoch": 0.8920486998346611, + "flos": 15557223035520.0, + "grad_norm": 2.352945147165247, + "language_loss": 0.689592, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.76627362, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09259033, + "step": 14837, + "time_per_iteration": 2.4373927116394043 + }, + { + "auxiliary_loss_clip": 0.0640661, + "auxiliary_loss_mlp": 0.01265317, + "balance_loss_clip": 0.06270414, + "balance_loss_mlp": 0.01255369, + "epoch": 0.892108823087329, + "flos": 21221333406720.0, + "grad_norm": 1.7967074516272619, + "language_loss": 0.67696273, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.75368202, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09942627, + "step": 14838, + "time_per_iteration": 3.965111255645752 + }, + { + "auxiliary_loss_clip": 0.06404169, + "auxiliary_loss_mlp": 0.01266814, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01257391, + "epoch": 0.892168946339997, + "flos": 21985092172800.0, + "grad_norm": 1.9689260594947426, + "language_loss": 0.76717424, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.84388411, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09423828, + "step": 14839, + "time_per_iteration": 3.895935297012329 + }, + { + "auxiliary_loss_clip": 0.06311233, + "auxiliary_loss_mlp": 0.01249533, + "balance_loss_clip": 0.06256986, + "balance_loss_mlp": 0.01248568, + "epoch": 0.892229069592665, + "flos": 67494869038080.0, + "grad_norm": 0.6601341927430973, + "language_loss": 0.49224526, + "learning_rate": 1.205407673483978e-07, + "loss": 0.56785291, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00964355, + "step": 14840, + "time_per_iteration": 3.0776662826538086 + }, + { + "auxiliary_loss_clip": 0.06408979, + "auxiliary_loss_mlp": 0.01264539, + "balance_loss_clip": 0.06271542, + "balance_loss_mlp": 0.01253709, + "epoch": 0.8922891928453329, + "flos": 19464931647360.0, + "grad_norm": 2.036775192434288, + "language_loss": 0.64259487, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.71933007, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10827637, + "step": 14841, + "time_per_iteration": 2.5317835807800293 + }, + { + "auxiliary_loss_clip": 0.06397078, + "auxiliary_loss_mlp": 0.01266801, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01258367, + "epoch": 0.8923493160980009, + "flos": 23374065778560.0, + "grad_norm": 1.5067524723122596, + "language_loss": 0.68637419, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.76301301, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08428955, + "step": 14842, + "time_per_iteration": 2.5464539527893066 + }, + { + "auxiliary_loss_clip": 0.06398538, + "auxiliary_loss_mlp": 0.01266525, + "balance_loss_clip": 0.06271973, + "balance_loss_mlp": 0.01257984, + "epoch": 0.8924094393506689, + "flos": 26184227685120.0, + "grad_norm": 1.9083387280935236, + "language_loss": 0.80568957, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.88234019, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08538818, + "step": 14843, + "time_per_iteration": 2.6366734504699707 + }, + { + "auxiliary_loss_clip": 0.06403741, + "auxiliary_loss_mlp": 0.01261264, + "balance_loss_clip": 0.06268854, + "balance_loss_mlp": 0.01251781, + "epoch": 0.8924695626033369, + "flos": 22025147224320.0, + "grad_norm": 2.3820566119919597, + "language_loss": 0.68648458, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.76313466, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09484863, + "step": 14844, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.06401572, + "auxiliary_loss_mlp": 0.01266646, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257646, + "epoch": 0.8925296858560048, + "flos": 14799292128000.0, + "grad_norm": 2.2923996449190236, + "language_loss": 0.91698718, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.99366921, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08996582, + "step": 14845, + "time_per_iteration": 2.4725682735443115 + }, + { + "auxiliary_loss_clip": 0.06397889, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06271143, + "balance_loss_mlp": 0.012575, + "epoch": 0.8925898091086728, + "flos": 22353275013120.0, + "grad_norm": 1.8851582934669056, + "language_loss": 0.72789091, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.8045361, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09130859, + "step": 14846, + "time_per_iteration": 2.466618299484253 + }, + { + "auxiliary_loss_clip": 0.06400344, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06268599, + "balance_loss_mlp": 0.01254645, + "epoch": 0.8926499323613407, + "flos": 45816773425920.0, + "grad_norm": 2.2290508938220657, + "language_loss": 0.57516384, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.65180945, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.0958252, + "step": 14847, + "time_per_iteration": 2.6878631114959717 + }, + { + "auxiliary_loss_clip": 0.06405343, + "auxiliary_loss_mlp": 0.0126293, + "balance_loss_clip": 0.06273352, + "balance_loss_mlp": 0.01254026, + "epoch": 0.8927100556140087, + "flos": 22133530880640.0, + "grad_norm": 1.7052460383606831, + "language_loss": 0.76622617, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.84290886, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08911133, + "step": 14848, + "time_per_iteration": 2.4818036556243896 + }, + { + "auxiliary_loss_clip": 0.06397684, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06270074, + "balance_loss_mlp": 0.01255648, + "epoch": 0.8927701788666766, + "flos": 28337756670720.0, + "grad_norm": 1.7160281168375413, + "language_loss": 0.69265717, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.7692802, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08978271, + "step": 14849, + "time_per_iteration": 2.5345237255096436 + }, + { + "auxiliary_loss_clip": 0.0640296, + "auxiliary_loss_mlp": 0.01266234, + "balance_loss_clip": 0.06271099, + "balance_loss_mlp": 0.01257139, + "epoch": 0.8928303021193447, + "flos": 25300932670080.0, + "grad_norm": 9.377316945949495, + "language_loss": 0.80831003, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.88500196, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09094238, + "step": 14850, + "time_per_iteration": 2.5199098587036133 + }, + { + "auxiliary_loss_clip": 0.06401064, + "auxiliary_loss_mlp": 0.01268179, + "balance_loss_clip": 0.06273234, + "balance_loss_mlp": 0.01258905, + "epoch": 0.8928904253720126, + "flos": 22243256202240.0, + "grad_norm": 1.5485045372929462, + "language_loss": 0.75078595, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.82747841, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09259033, + "step": 14851, + "time_per_iteration": 2.520653247833252 + }, + { + "auxiliary_loss_clip": 0.06399436, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06269779, + "balance_loss_mlp": 0.01253873, + "epoch": 0.8929505486246806, + "flos": 27100240519680.0, + "grad_norm": 1.5234739913675641, + "language_loss": 0.78729236, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.86392307, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09759521, + "step": 14852, + "time_per_iteration": 2.5750808715820312 + }, + { + "auxiliary_loss_clip": 0.06400271, + "auxiliary_loss_mlp": 0.01263228, + "balance_loss_clip": 0.06272772, + "balance_loss_mlp": 0.01254544, + "epoch": 0.8930106718773486, + "flos": 23046021843840.0, + "grad_norm": 1.3447156606133301, + "language_loss": 0.69361079, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.77024567, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08685303, + "step": 14853, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.0639962, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06269603, + "balance_loss_mlp": 0.01254469, + "epoch": 0.8930707951300165, + "flos": 35635378389120.0, + "grad_norm": 1.7867498059610383, + "language_loss": 0.67108899, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.74771899, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14854, + "time_per_iteration": 2.6050684452056885 + }, + { + "auxiliary_loss_clip": 0.06395994, + "auxiliary_loss_mlp": 0.01262577, + "balance_loss_clip": 0.06270514, + "balance_loss_mlp": 0.01253732, + "epoch": 0.8931309183826845, + "flos": 23046650749440.0, + "grad_norm": 1.3515219492538217, + "language_loss": 0.74918699, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.8257727, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08850098, + "step": 14855, + "time_per_iteration": 2.530815362930298 + }, + { + "auxiliary_loss_clip": 0.06400646, + "auxiliary_loss_mlp": 0.01264231, + "balance_loss_clip": 0.06271029, + "balance_loss_mlp": 0.01255427, + "epoch": 0.8931910416353525, + "flos": 26511726568320.0, + "grad_norm": 1.9239485722311656, + "language_loss": 0.64665866, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.72330737, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0880127, + "step": 14856, + "time_per_iteration": 2.61660099029541 + }, + { + "auxiliary_loss_clip": 0.06401564, + "auxiliary_loss_mlp": 0.01267596, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.01259007, + "epoch": 0.8932511648880205, + "flos": 24980687164800.0, + "grad_norm": 1.6913640508608676, + "language_loss": 0.66606605, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.74275768, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08587646, + "step": 14857, + "time_per_iteration": 2.5284576416015625 + }, + { + "auxiliary_loss_clip": 0.06404722, + "auxiliary_loss_mlp": 0.01265153, + "balance_loss_clip": 0.0627214, + "balance_loss_mlp": 0.01255331, + "epoch": 0.8933112881406884, + "flos": 24467377852800.0, + "grad_norm": 2.3222836752374305, + "language_loss": 0.75424057, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.83093929, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09814453, + "step": 14858, + "time_per_iteration": 2.5318548679351807 + }, + { + "auxiliary_loss_clip": 0.06400517, + "auxiliary_loss_mlp": 0.01269115, + "balance_loss_clip": 0.06269918, + "balance_loss_mlp": 0.01259859, + "epoch": 0.8933714113933564, + "flos": 28300594584960.0, + "grad_norm": 1.5598958760491506, + "language_loss": 0.69930089, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.77599716, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09246826, + "step": 14859, + "time_per_iteration": 2.5497772693634033 + }, + { + "auxiliary_loss_clip": 0.06392509, + "auxiliary_loss_mlp": 0.01262646, + "balance_loss_clip": 0.06268515, + "balance_loss_mlp": 0.01254284, + "epoch": 0.8934315346460243, + "flos": 21441412955520.0, + "grad_norm": 1.8624217934039429, + "language_loss": 0.75625086, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.83280241, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08355713, + "step": 14860, + "time_per_iteration": 2.5246856212615967 + }, + { + "auxiliary_loss_clip": 0.06401479, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 0.06269905, + "balance_loss_mlp": 0.01255092, + "epoch": 0.8934916578986923, + "flos": 23776475811840.0, + "grad_norm": 1.7913164258614247, + "language_loss": 0.5788613, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.65552485, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09777832, + "step": 14861, + "time_per_iteration": 2.513174533843994 + }, + { + "auxiliary_loss_clip": 0.06395803, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06268242, + "balance_loss_mlp": 0.01257407, + "epoch": 0.8935517811513602, + "flos": 18922090970880.0, + "grad_norm": 2.8229402142894924, + "language_loss": 0.63289392, + "learning_rate": 1.176284122190685e-07, + "loss": 0.70951402, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0880127, + "step": 14862, + "time_per_iteration": 2.4601802825927734 + }, + { + "auxiliary_loss_clip": 0.06398255, + "auxiliary_loss_mlp": 0.01264936, + "balance_loss_clip": 0.06269197, + "balance_loss_mlp": 0.01255883, + "epoch": 0.8936119044040283, + "flos": 24068280055680.0, + "grad_norm": 1.5659074836236766, + "language_loss": 0.78562599, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.8622579, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09057617, + "step": 14863, + "time_per_iteration": 3.9400691986083984 + }, + { + "auxiliary_loss_clip": 0.06396215, + "auxiliary_loss_mlp": 0.01263795, + "balance_loss_clip": 0.06269459, + "balance_loss_mlp": 0.01255361, + "epoch": 0.8936720276566962, + "flos": 21330387895680.0, + "grad_norm": 1.807940322380626, + "language_loss": 0.70814526, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.78474534, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08435059, + "step": 14864, + "time_per_iteration": 2.477184295654297 + }, + { + "auxiliary_loss_clip": 0.06409479, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 0.06271873, + "balance_loss_mlp": 0.01255158, + "epoch": 0.8937321509093642, + "flos": 18412093895040.0, + "grad_norm": 1.8448979724824994, + "language_loss": 0.76666725, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.84341156, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09777832, + "step": 14865, + "time_per_iteration": 2.50520658493042 + }, + { + "auxiliary_loss_clip": 0.06396964, + "auxiliary_loss_mlp": 0.01263849, + "balance_loss_clip": 0.06270568, + "balance_loss_mlp": 0.01254801, + "epoch": 0.8937922741620322, + "flos": 22061344988160.0, + "grad_norm": 1.686573948545257, + "language_loss": 0.71847916, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.79508728, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09051514, + "step": 14866, + "time_per_iteration": 2.5232789516448975 + }, + { + "auxiliary_loss_clip": 0.06405518, + "auxiliary_loss_mlp": 0.01261975, + "balance_loss_clip": 0.06270327, + "balance_loss_mlp": 0.01251765, + "epoch": 0.8938523974147001, + "flos": 25671169935360.0, + "grad_norm": 1.5088143817745128, + "language_loss": 0.84316403, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.91983891, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10205078, + "step": 14867, + "time_per_iteration": 2.504709243774414 + }, + { + "auxiliary_loss_clip": 0.06400672, + "auxiliary_loss_mlp": 0.01263794, + "balance_loss_clip": 0.06268955, + "balance_loss_mlp": 0.01255223, + "epoch": 0.8939125206673681, + "flos": 25750567278720.0, + "grad_norm": 1.4933944812080338, + "language_loss": 0.80616713, + "learning_rate": 1.168401272009567e-07, + "loss": 0.88281178, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08575439, + "step": 14868, + "time_per_iteration": 2.5456981658935547 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06269291, + "balance_loss_mlp": 0.01254264, + "epoch": 0.8939726439200361, + "flos": 27351863930880.0, + "grad_norm": 1.6782026554135205, + "language_loss": 0.77551532, + "learning_rate": 1.167089962692056e-07, + "loss": 0.85216701, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.0994873, + "step": 14869, + "time_per_iteration": 2.5171701908111572 + }, + { + "auxiliary_loss_clip": 0.06400751, + "auxiliary_loss_mlp": 0.01262574, + "balance_loss_clip": 0.06272287, + "balance_loss_mlp": 0.01253323, + "epoch": 0.8940327671727041, + "flos": 20344956353280.0, + "grad_norm": 1.3418834615415587, + "language_loss": 0.65861583, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.73524916, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09240723, + "step": 14870, + "time_per_iteration": 2.4959447383880615 + }, + { + "auxiliary_loss_clip": 0.06304982, + "auxiliary_loss_mlp": 0.01250431, + "balance_loss_clip": 0.06250464, + "balance_loss_mlp": 0.01249474, + "epoch": 0.894092890425372, + "flos": 58425919534080.0, + "grad_norm": 0.7802103203986496, + "language_loss": 0.55975109, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.63530517, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.009552, + "step": 14871, + "time_per_iteration": 3.156993865966797 + }, + { + "auxiliary_loss_clip": 0.06400608, + "auxiliary_loss_mlp": 0.0126467, + "balance_loss_clip": 0.06272507, + "balance_loss_mlp": 0.01255956, + "epoch": 0.89415301367804, + "flos": 19835965526400.0, + "grad_norm": 2.0336418069128515, + "language_loss": 0.76816511, + "learning_rate": 1.16316031981331e-07, + "loss": 0.84481788, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.0871582, + "step": 14872, + "time_per_iteration": 2.485140323638916 + }, + { + "auxiliary_loss_clip": 0.0639937, + "auxiliary_loss_mlp": 0.0126479, + "balance_loss_clip": 0.06272227, + "balance_loss_mlp": 0.01256624, + "epoch": 0.8942131369307079, + "flos": 25782907754880.0, + "grad_norm": 1.6493247020685964, + "language_loss": 0.67278552, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.7494272, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08166504, + "step": 14873, + "time_per_iteration": 2.5375049114227295 + }, + { + "auxiliary_loss_clip": 0.06396008, + "auxiliary_loss_mlp": 0.01265889, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256799, + "epoch": 0.8942732601833759, + "flos": 23155747165440.0, + "grad_norm": 1.7348612988581609, + "language_loss": 0.59519863, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.6718176, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09094238, + "step": 14874, + "time_per_iteration": 4.009182691574097 + }, + { + "auxiliary_loss_clip": 0.0640433, + "auxiliary_loss_mlp": 0.01262347, + "balance_loss_clip": 0.06272757, + "balance_loss_mlp": 0.01252548, + "epoch": 0.8943333834360438, + "flos": 27863034963840.0, + "grad_norm": 3.262059606823023, + "language_loss": 0.75661355, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.83328027, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09796143, + "step": 14875, + "time_per_iteration": 2.582097291946411 + }, + { + "auxiliary_loss_clip": 0.06409485, + "auxiliary_loss_mlp": 0.0126629, + "balance_loss_clip": 0.06272477, + "balance_loss_mlp": 0.01255901, + "epoch": 0.8943935066887119, + "flos": 22170525258240.0, + "grad_norm": 2.233676801641688, + "language_loss": 0.7754097, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.85216737, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.1038208, + "step": 14876, + "time_per_iteration": 2.49131178855896 + }, + { + "auxiliary_loss_clip": 0.06400561, + "auxiliary_loss_mlp": 0.01262912, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.01253781, + "epoch": 0.8944536299413798, + "flos": 21476394835200.0, + "grad_norm": 1.6532963820803077, + "language_loss": 0.78540194, + "learning_rate": 1.156625201573287e-07, + "loss": 0.86203676, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09124756, + "step": 14877, + "time_per_iteration": 3.9302589893341064 + }, + { + "auxiliary_loss_clip": 0.06400222, + "auxiliary_loss_mlp": 0.01262535, + "balance_loss_clip": 0.06270761, + "balance_loss_mlp": 0.01253051, + "epoch": 0.8945137531940478, + "flos": 17754538579200.0, + "grad_norm": 2.0502806010232453, + "language_loss": 0.75457507, + "learning_rate": 1.155320321355151e-07, + "loss": 0.83120263, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09484863, + "step": 14878, + "time_per_iteration": 2.479912519454956 + }, + { + "auxiliary_loss_clip": 0.06404997, + "auxiliary_loss_mlp": 0.01266971, + "balance_loss_clip": 0.06271661, + "balance_loss_mlp": 0.0125744, + "epoch": 0.8945738764467158, + "flos": 21148644389760.0, + "grad_norm": 1.5435004393122365, + "language_loss": 0.75714976, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.83386946, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09527588, + "step": 14879, + "time_per_iteration": 3.89373779296875 + }, + { + "auxiliary_loss_clip": 0.06402966, + "auxiliary_loss_mlp": 0.01268024, + "balance_loss_clip": 0.06272627, + "balance_loss_mlp": 0.01258737, + "epoch": 0.8946339996993837, + "flos": 14908304689920.0, + "grad_norm": 1.7286319960162253, + "language_loss": 0.74827355, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.82498348, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09283447, + "step": 14880, + "time_per_iteration": 2.485443592071533 + }, + { + "auxiliary_loss_clip": 0.06400868, + "auxiliary_loss_mlp": 0.01262655, + "balance_loss_clip": 0.0626995, + "balance_loss_mlp": 0.01253786, + "epoch": 0.8946941229520518, + "flos": 27389738776320.0, + "grad_norm": 1.6052503239792235, + "language_loss": 0.83234966, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.9089849, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08862305, + "step": 14881, + "time_per_iteration": 2.6131069660186768 + }, + { + "auxiliary_loss_clip": 0.064004, + "auxiliary_loss_mlp": 0.01262592, + "balance_loss_clip": 0.06272516, + "balance_loss_mlp": 0.01253902, + "epoch": 0.8947542462047197, + "flos": 31804467644160.0, + "grad_norm": 1.6227908564694626, + "language_loss": 0.67742473, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.75405467, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08685303, + "step": 14882, + "time_per_iteration": 2.624990701675415 + }, + { + "auxiliary_loss_clip": 0.06410404, + "auxiliary_loss_mlp": 0.01265699, + "balance_loss_clip": 0.06273839, + "balance_loss_mlp": 0.01255286, + "epoch": 0.8948143694573877, + "flos": 20889390257280.0, + "grad_norm": 2.2592268261234794, + "language_loss": 0.76093864, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.83769971, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10412598, + "step": 14883, + "time_per_iteration": 2.615189552307129 + }, + { + "auxiliary_loss_clip": 0.06395276, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06268727, + "balance_loss_mlp": 0.01256176, + "epoch": 0.8948744927100556, + "flos": 28222287344640.0, + "grad_norm": 1.4620287749625491, + "language_loss": 0.72531396, + "learning_rate": 1.147506048211253e-07, + "loss": 0.80192173, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09320068, + "step": 14884, + "time_per_iteration": 2.686645269393921 + }, + { + "auxiliary_loss_clip": 0.06399888, + "auxiliary_loss_mlp": 0.01266732, + "balance_loss_clip": 0.06271544, + "balance_loss_mlp": 0.01258155, + "epoch": 0.8949346159627236, + "flos": 21908210451840.0, + "grad_norm": 1.538214913987674, + "language_loss": 0.75908208, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.8357482, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08575439, + "step": 14885, + "time_per_iteration": 2.5954906940460205 + }, + { + "auxiliary_loss_clip": 0.06404841, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06268379, + "balance_loss_mlp": 0.01256138, + "epoch": 0.8949947392153915, + "flos": 21365202067200.0, + "grad_norm": 1.822897035526379, + "language_loss": 0.82082385, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.89753222, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.09857178, + "step": 14886, + "time_per_iteration": 2.547220468521118 + }, + { + "auxiliary_loss_clip": 0.06404007, + "auxiliary_loss_mlp": 0.01264491, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01255241, + "epoch": 0.8950548624680595, + "flos": 52456672120320.0, + "grad_norm": 1.403766118863264, + "language_loss": 0.63836366, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.71504867, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09246826, + "step": 14887, + "time_per_iteration": 2.840047836303711 + }, + { + "auxiliary_loss_clip": 0.06404397, + "auxiliary_loss_mlp": 0.01264814, + "balance_loss_clip": 0.06270733, + "balance_loss_mlp": 0.01255379, + "epoch": 0.8951149857207275, + "flos": 20127643989120.0, + "grad_norm": 2.05922037970012, + "language_loss": 0.61333579, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.69002795, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09423828, + "step": 14888, + "time_per_iteration": 2.4822325706481934 + }, + { + "auxiliary_loss_clip": 0.0640295, + "auxiliary_loss_mlp": 0.01263259, + "balance_loss_clip": 0.06270017, + "balance_loss_mlp": 0.01253997, + "epoch": 0.8951751089733955, + "flos": 29870515082880.0, + "grad_norm": 1.7326619011020001, + "language_loss": 0.70190442, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.77856648, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09259033, + "step": 14889, + "time_per_iteration": 2.5692856311798096 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01267234, + "balance_loss_clip": 0.0627138, + "balance_loss_mlp": 0.01257364, + "epoch": 0.8952352322260634, + "flos": 15267305508480.0, + "grad_norm": 2.187718839417261, + "language_loss": 0.70865494, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.78536266, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09863281, + "step": 14890, + "time_per_iteration": 2.4263176918029785 + }, + { + "auxiliary_loss_clip": 0.06400955, + "auxiliary_loss_mlp": 0.0126212, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01253036, + "epoch": 0.8952953554787314, + "flos": 26805794872320.0, + "grad_norm": 1.6509945503945358, + "language_loss": 0.75869304, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.83532381, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09069824, + "step": 14891, + "time_per_iteration": 2.5538480281829834 + }, + { + "auxiliary_loss_clip": 0.06407404, + "auxiliary_loss_mlp": 0.01265746, + "balance_loss_clip": 0.06271844, + "balance_loss_mlp": 0.01256031, + "epoch": 0.8953554787313994, + "flos": 14142449571840.0, + "grad_norm": 1.8156588804398968, + "language_loss": 0.77074498, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.84747648, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.097229, + "step": 14892, + "time_per_iteration": 2.4513299465179443 + }, + { + "auxiliary_loss_clip": 0.06402381, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271234, + "balance_loss_mlp": 0.01256132, + "epoch": 0.8954156019840673, + "flos": 25710512227200.0, + "grad_norm": 1.8223353261207547, + "language_loss": 0.81747323, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.89415169, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09338379, + "step": 14893, + "time_per_iteration": 2.5349771976470947 + }, + { + "auxiliary_loss_clip": 0.06401483, + "auxiliary_loss_mlp": 0.01263898, + "balance_loss_clip": 0.06273355, + "balance_loss_mlp": 0.01255554, + "epoch": 0.8954757252367354, + "flos": 21914415653760.0, + "grad_norm": 2.626592017949994, + "language_loss": 0.75162917, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.82828295, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08343506, + "step": 14894, + "time_per_iteration": 2.4701755046844482 + }, + { + "auxiliary_loss_clip": 0.06405791, + "auxiliary_loss_mlp": 0.01264109, + "balance_loss_clip": 0.06272551, + "balance_loss_mlp": 0.01254006, + "epoch": 0.8955358484894033, + "flos": 12975568012800.0, + "grad_norm": 1.5890644812826222, + "language_loss": 0.66464567, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.74134463, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10101318, + "step": 14895, + "time_per_iteration": 2.5035977363586426 + }, + { + "auxiliary_loss_clip": 0.06404694, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01255444, + "epoch": 0.8955959717420713, + "flos": 17279565310080.0, + "grad_norm": 1.87791036453397, + "language_loss": 0.67284429, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.74954802, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10229492, + "step": 14896, + "time_per_iteration": 2.4724619388580322 + }, + { + "auxiliary_loss_clip": 0.06403284, + "auxiliary_loss_mlp": 0.0126487, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01255667, + "epoch": 0.8956560949947392, + "flos": 14799208273920.0, + "grad_norm": 1.6470443719838597, + "language_loss": 0.76069427, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.83737576, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09210205, + "step": 14897, + "time_per_iteration": 2.4986391067504883 + }, + { + "auxiliary_loss_clip": 0.06308123, + "auxiliary_loss_mlp": 0.01251594, + "balance_loss_clip": 0.06253865, + "balance_loss_mlp": 0.01250616, + "epoch": 0.8957162182474072, + "flos": 63626754280320.0, + "grad_norm": 0.7334774931329842, + "language_loss": 0.55192471, + "learning_rate": 1.129372846953931e-07, + "loss": 0.62752187, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00977325, + "step": 14898, + "time_per_iteration": 3.1359360218048096 + }, + { + "auxiliary_loss_clip": 0.0640052, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06268872, + "balance_loss_mlp": 0.01255681, + "epoch": 0.8957763415000751, + "flos": 25016884928640.0, + "grad_norm": 1.570472066859937, + "language_loss": 0.70246518, + "learning_rate": 1.12808298352008e-07, + "loss": 0.77912164, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09442139, + "step": 14899, + "time_per_iteration": 2.5486810207366943 + }, + { + "auxiliary_loss_clip": 0.06403163, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.06270869, + "balance_loss_mlp": 0.01253061, + "epoch": 0.8958364647527431, + "flos": 19834749642240.0, + "grad_norm": 1.8245090514725772, + "language_loss": 0.73847759, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.81514513, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10534668, + "step": 14900, + "time_per_iteration": 2.4651243686676025 + }, + { + "auxiliary_loss_clip": 0.06306873, + "auxiliary_loss_mlp": 0.01248856, + "balance_loss_clip": 0.0625267, + "balance_loss_mlp": 0.0124786, + "epoch": 0.895896588005411, + "flos": 65555717523840.0, + "grad_norm": 0.7552312872825258, + "language_loss": 0.6180774, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.69363469, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 14901, + "time_per_iteration": 3.1450839042663574 + }, + { + "auxiliary_loss_clip": 0.06405282, + "auxiliary_loss_mlp": 0.01269268, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.01259702, + "epoch": 0.8959567112580791, + "flos": 25597726231680.0, + "grad_norm": 1.4924543934723433, + "language_loss": 0.71050578, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.78725129, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09570312, + "step": 14902, + "time_per_iteration": 4.060534477233887 + }, + { + "auxiliary_loss_clip": 0.0639545, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.0626891, + "balance_loss_mlp": 0.01256037, + "epoch": 0.896016834510747, + "flos": 24207788304000.0, + "grad_norm": 1.5516479623413435, + "language_loss": 0.78019071, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.85679603, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.09033203, + "step": 14903, + "time_per_iteration": 2.530996084213257 + }, + { + "auxiliary_loss_clip": 0.06406369, + "auxiliary_loss_mlp": 0.0126234, + "balance_loss_clip": 0.06271996, + "balance_loss_mlp": 0.01252774, + "epoch": 0.896076957763415, + "flos": 23082638878080.0, + "grad_norm": 1.8242518649454527, + "language_loss": 0.73055351, + "learning_rate": 1.121644401702877e-07, + "loss": 0.80724061, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09570312, + "step": 14904, + "time_per_iteration": 2.6296870708465576 + }, + { + "auxiliary_loss_clip": 0.06401238, + "auxiliary_loss_mlp": 0.01262525, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01252637, + "epoch": 0.8961370810160829, + "flos": 22243130421120.0, + "grad_norm": 1.862824182986126, + "language_loss": 0.75347674, + "learning_rate": 1.12035883275166e-07, + "loss": 0.83011442, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09887695, + "step": 14905, + "time_per_iteration": 2.5133965015411377 + }, + { + "auxiliary_loss_clip": 0.06398168, + "auxiliary_loss_mlp": 0.01264344, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01255487, + "epoch": 0.8961972042687509, + "flos": 23078404247040.0, + "grad_norm": 1.5622217047945155, + "language_loss": 0.76437497, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.84100008, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08862305, + "step": 14906, + "time_per_iteration": 2.5375421047210693 + }, + { + "auxiliary_loss_clip": 0.06407402, + "auxiliary_loss_mlp": 0.01264174, + "balance_loss_clip": 0.06274619, + "balance_loss_mlp": 0.01254822, + "epoch": 0.896257327521419, + "flos": 18191595075840.0, + "grad_norm": 1.6265989394728257, + "language_loss": 0.745776, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.82249177, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09356689, + "step": 14907, + "time_per_iteration": 2.4702959060668945 + }, + { + "auxiliary_loss_clip": 0.06397235, + "auxiliary_loss_mlp": 0.01269593, + "balance_loss_clip": 0.06267928, + "balance_loss_mlp": 0.01260152, + "epoch": 0.8963174507740869, + "flos": 17901384059520.0, + "grad_norm": 1.964029322424203, + "language_loss": 0.8312695, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.90793782, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09436035, + "step": 14908, + "time_per_iteration": 2.5215442180633545 + }, + { + "auxiliary_loss_clip": 0.06403542, + "auxiliary_loss_mlp": 0.01266202, + "balance_loss_clip": 0.06269868, + "balance_loss_mlp": 0.01256129, + "epoch": 0.8963775740267549, + "flos": 21038541724800.0, + "grad_norm": 1.7328216295609387, + "language_loss": 0.70987892, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.78657633, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10070801, + "step": 14909, + "time_per_iteration": 2.50961971282959 + }, + { + "auxiliary_loss_clip": 0.06402007, + "auxiliary_loss_mlp": 0.01265795, + "balance_loss_clip": 0.06269825, + "balance_loss_mlp": 0.01256103, + "epoch": 0.8964376972794228, + "flos": 23185362384000.0, + "grad_norm": 1.6724963003182998, + "language_loss": 0.72410321, + "learning_rate": 1.113941727737877e-07, + "loss": 0.80078113, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09698486, + "step": 14910, + "time_per_iteration": 2.5077359676361084 + }, + { + "auxiliary_loss_clip": 0.06399799, + "auxiliary_loss_mlp": 0.01265379, + "balance_loss_clip": 0.06270814, + "balance_loss_mlp": 0.0125682, + "epoch": 0.8964978205320908, + "flos": 24979974405120.0, + "grad_norm": 1.770153875298599, + "language_loss": 0.63518411, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.71183586, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08563232, + "step": 14911, + "time_per_iteration": 2.4959042072296143 + }, + { + "auxiliary_loss_clip": 0.06406086, + "auxiliary_loss_mlp": 0.01267252, + "balance_loss_clip": 0.06273033, + "balance_loss_mlp": 0.01257596, + "epoch": 0.8965579437847587, + "flos": 19178074794240.0, + "grad_norm": 1.6726693619697703, + "language_loss": 0.75323474, + "learning_rate": 1.111379898520437e-07, + "loss": 0.82996809, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09649658, + "step": 14912, + "time_per_iteration": 2.511392593383789 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 0.06268585, + "balance_loss_mlp": 0.01255028, + "epoch": 0.8966180670374267, + "flos": 24283034870400.0, + "grad_norm": 1.7988610159945775, + "language_loss": 0.82114106, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.89777815, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09570312, + "step": 14913, + "time_per_iteration": 4.064217805862427 + }, + { + "auxiliary_loss_clip": 0.06404127, + "auxiliary_loss_mlp": 0.01271416, + "balance_loss_clip": 0.06272069, + "balance_loss_mlp": 0.01261235, + "epoch": 0.8966781902900947, + "flos": 13558296032640.0, + "grad_norm": 1.9987077999566127, + "language_loss": 0.61253613, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.6892916, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10186768, + "step": 14914, + "time_per_iteration": 2.503157377243042 + }, + { + "auxiliary_loss_clip": 0.06308897, + "auxiliary_loss_mlp": 0.01250037, + "balance_loss_clip": 0.06254553, + "balance_loss_mlp": 0.01249038, + "epoch": 0.8967383135427627, + "flos": 65085104666880.0, + "grad_norm": 0.7199686075509744, + "language_loss": 0.54956484, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.6251542, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.00998688, + "step": 14915, + "time_per_iteration": 3.121408462524414 + }, + { + "auxiliary_loss_clip": 0.0639778, + "auxiliary_loss_mlp": 0.01262669, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.01254021, + "epoch": 0.8967984367954306, + "flos": 29720273512320.0, + "grad_norm": 1.453709134846792, + "language_loss": 0.71710205, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.79370654, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08648682, + "step": 14916, + "time_per_iteration": 2.5399439334869385 + }, + { + "auxiliary_loss_clip": 0.06399646, + "auxiliary_loss_mlp": 0.01262869, + "balance_loss_clip": 0.06269349, + "balance_loss_mlp": 0.01253386, + "epoch": 0.8968585600480986, + "flos": 25709547905280.0, + "grad_norm": 1.6511135445596639, + "language_loss": 0.77996731, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.85659248, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0947876, + "step": 14917, + "time_per_iteration": 3.9514448642730713 + }, + { + "auxiliary_loss_clip": 0.06407967, + "auxiliary_loss_mlp": 0.01265951, + "balance_loss_clip": 0.06272604, + "balance_loss_mlp": 0.01255419, + "epoch": 0.8969186833007665, + "flos": 30052552078080.0, + "grad_norm": 2.288875312823381, + "language_loss": 0.6860131, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.76275229, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10522461, + "step": 14918, + "time_per_iteration": 3.9689831733703613 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01262687, + "balance_loss_clip": 0.06271897, + "balance_loss_mlp": 0.0125324, + "epoch": 0.8969788065534345, + "flos": 22824390994560.0, + "grad_norm": 2.299615610412693, + "language_loss": 0.83668256, + "learning_rate": 1.102436060943881e-07, + "loss": 0.91335803, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09442139, + "step": 14919, + "time_per_iteration": 2.5401570796966553 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.0126842, + "balance_loss_clip": 0.06270535, + "balance_loss_mlp": 0.01258698, + "epoch": 0.8970389298061026, + "flos": 13266575642880.0, + "grad_norm": 2.5633891144705134, + "language_loss": 0.73092914, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.80765748, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.097229, + "step": 14920, + "time_per_iteration": 2.546627998352051 + }, + { + "auxiliary_loss_clip": 0.06404185, + "auxiliary_loss_mlp": 0.01262662, + "balance_loss_clip": 0.06273196, + "balance_loss_mlp": 0.0125266, + "epoch": 0.8970990530587705, + "flos": 10270058256000.0, + "grad_norm": 2.486381848845646, + "language_loss": 0.90980357, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.98647201, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.10009766, + "step": 14921, + "time_per_iteration": 2.452223777770996 + }, + { + "auxiliary_loss_clip": 0.06404401, + "auxiliary_loss_mlp": 0.01265926, + "balance_loss_clip": 0.06269224, + "balance_loss_mlp": 0.01255906, + "epoch": 0.8971591763114385, + "flos": 20308884370560.0, + "grad_norm": 1.739666810440783, + "language_loss": 0.74017936, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.81688261, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10028076, + "step": 14922, + "time_per_iteration": 2.546560525894165 + }, + { + "auxiliary_loss_clip": 0.06400974, + "auxiliary_loss_mlp": 0.01261981, + "balance_loss_clip": 0.0626979, + "balance_loss_mlp": 0.0125257, + "epoch": 0.8972192995641064, + "flos": 23263543843200.0, + "grad_norm": 1.7043702833178804, + "language_loss": 0.7044152, + "learning_rate": 1.097341060694219e-07, + "loss": 0.78104472, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09411621, + "step": 14923, + "time_per_iteration": 2.4887611865997314 + }, + { + "auxiliary_loss_clip": 0.06407218, + "auxiliary_loss_mlp": 0.01265187, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.0125518, + "epoch": 0.8972794228167744, + "flos": 18375560714880.0, + "grad_norm": 1.9781381885926022, + "language_loss": 0.71156216, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.78828621, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10003662, + "step": 14924, + "time_per_iteration": 2.507704257965088 + }, + { + "auxiliary_loss_clip": 0.06404379, + "auxiliary_loss_mlp": 0.01261706, + "balance_loss_clip": 0.06271243, + "balance_loss_mlp": 0.01253141, + "epoch": 0.8973395460694423, + "flos": 23958974004480.0, + "grad_norm": 1.3820942229672155, + "language_loss": 0.72463107, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.80129194, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.08566284, + "step": 14925, + "time_per_iteration": 2.5084264278411865 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01263424, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01254007, + "epoch": 0.8973996693221103, + "flos": 24977458782720.0, + "grad_norm": 1.5685683957200127, + "language_loss": 0.82635689, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.90303278, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09417725, + "step": 14926, + "time_per_iteration": 2.557849884033203 + }, + { + "auxiliary_loss_clip": 0.06403212, + "auxiliary_loss_mlp": 0.01263645, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254275, + "epoch": 0.8974597925747783, + "flos": 25745997231360.0, + "grad_norm": 1.4572864051065582, + "language_loss": 0.79279351, + "learning_rate": 1.092257529095555e-07, + "loss": 0.86946213, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09375, + "step": 14927, + "time_per_iteration": 2.5682642459869385 + }, + { + "auxiliary_loss_clip": 0.06400602, + "auxiliary_loss_mlp": 0.01264213, + "balance_loss_clip": 0.06270526, + "balance_loss_mlp": 0.01255308, + "epoch": 0.8975199158274463, + "flos": 38081172816000.0, + "grad_norm": 1.7102877126425073, + "language_loss": 0.66823071, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.74487889, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08905029, + "step": 14928, + "time_per_iteration": 2.6806201934814453 + }, + { + "auxiliary_loss_clip": 0.06407198, + "auxiliary_loss_mlp": 0.01262321, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01251282, + "epoch": 0.8975800390801142, + "flos": 25418875691520.0, + "grad_norm": 2.175076083160526, + "language_loss": 0.71158016, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.78827536, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.11029053, + "step": 14929, + "time_per_iteration": 2.518965005874634 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06272244, + "balance_loss_mlp": 0.0125683, + "epoch": 0.8976401623327822, + "flos": 21765599602560.0, + "grad_norm": 1.5816996603880829, + "language_loss": 0.68028259, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.75698406, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09106445, + "step": 14930, + "time_per_iteration": 2.5001468658447266 + }, + { + "auxiliary_loss_clip": 0.06397609, + "auxiliary_loss_mlp": 0.01265308, + "balance_loss_clip": 0.06267622, + "balance_loss_mlp": 0.01256058, + "epoch": 0.8977002855854501, + "flos": 13850519546880.0, + "grad_norm": 3.507650532962027, + "language_loss": 0.74712485, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.82375401, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0925293, + "step": 14931, + "time_per_iteration": 2.468661069869995 + }, + { + "auxiliary_loss_clip": 0.06398958, + "auxiliary_loss_mlp": 0.01267981, + "balance_loss_clip": 0.06269293, + "balance_loss_mlp": 0.01258897, + "epoch": 0.8977604088381181, + "flos": 19433639347200.0, + "grad_norm": 1.6811603420532344, + "language_loss": 0.63567096, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.71234035, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09082031, + "step": 14932, + "time_per_iteration": 2.500734567642212 + }, + { + "auxiliary_loss_clip": 0.06395967, + "auxiliary_loss_mlp": 0.01263865, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01255425, + "epoch": 0.8978205320907862, + "flos": 22747802762880.0, + "grad_norm": 1.5617576374717, + "language_loss": 0.71711791, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.79371631, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08441162, + "step": 14933, + "time_per_iteration": 2.4802329540252686 + }, + { + "auxiliary_loss_clip": 0.06406559, + "auxiliary_loss_mlp": 0.01268041, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01257753, + "epoch": 0.8978806553434541, + "flos": 21366837221760.0, + "grad_norm": 1.528884069249085, + "language_loss": 0.74636477, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.82311076, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10284424, + "step": 14934, + "time_per_iteration": 2.5407958030700684 + }, + { + "auxiliary_loss_clip": 0.06400236, + "auxiliary_loss_mlp": 0.01266178, + "balance_loss_clip": 0.06271216, + "balance_loss_mlp": 0.01257029, + "epoch": 0.8979407785961221, + "flos": 20930661192960.0, + "grad_norm": 1.684910765856414, + "language_loss": 0.60720909, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.68387318, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09155273, + "step": 14935, + "time_per_iteration": 2.494798183441162 + }, + { + "auxiliary_loss_clip": 0.06396089, + "auxiliary_loss_mlp": 0.01262066, + "balance_loss_clip": 0.06269929, + "balance_loss_mlp": 0.0125303, + "epoch": 0.89800090184879, + "flos": 25236042082560.0, + "grad_norm": 1.7255902732774182, + "language_loss": 0.76495326, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.84153479, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09033203, + "step": 14936, + "time_per_iteration": 2.548093318939209 + }, + { + "auxiliary_loss_clip": 0.06402925, + "auxiliary_loss_mlp": 0.01262388, + "balance_loss_clip": 0.0627269, + "balance_loss_mlp": 0.01252905, + "epoch": 0.898061025101458, + "flos": 22568868368640.0, + "grad_norm": 1.5780818295841181, + "language_loss": 0.74487138, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.82152456, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09484863, + "step": 14937, + "time_per_iteration": 2.4871113300323486 + }, + { + "auxiliary_loss_clip": 0.06305996, + "auxiliary_loss_mlp": 0.01251202, + "balance_loss_clip": 0.06251696, + "balance_loss_mlp": 0.01250233, + "epoch": 0.8981211483541259, + "flos": 56208799699200.0, + "grad_norm": 0.829573035126938, + "language_loss": 0.63498247, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.71055448, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.54492188, + "router_z_loss_mlp": 0.00967407, + "step": 14938, + "time_per_iteration": 2.991299629211426 + }, + { + "auxiliary_loss_clip": 0.06397615, + "auxiliary_loss_mlp": 0.01263328, + "balance_loss_clip": 0.06271379, + "balance_loss_mlp": 0.01254179, + "epoch": 0.898181271606794, + "flos": 16397234616960.0, + "grad_norm": 3.7900138603468894, + "language_loss": 0.80554181, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.88215125, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.09143066, + "step": 14939, + "time_per_iteration": 2.461031675338745 + }, + { + "auxiliary_loss_clip": 0.06305988, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06251763, + "balance_loss_mlp": 0.01252118, + "epoch": 0.8982413948594619, + "flos": 63461655809280.0, + "grad_norm": 0.7334559404863827, + "language_loss": 0.52954245, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.60513341, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00986481, + "step": 14940, + "time_per_iteration": 3.248729705810547 + }, + { + "auxiliary_loss_clip": 0.06402014, + "auxiliary_loss_mlp": 0.01267397, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01257294, + "epoch": 0.8983015181121299, + "flos": 21841810490880.0, + "grad_norm": 1.7036314435960453, + "language_loss": 0.77842438, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.85511851, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10107422, + "step": 14941, + "time_per_iteration": 4.003239870071411 + }, + { + "auxiliary_loss_clip": 0.06402576, + "auxiliary_loss_mlp": 0.01265073, + "balance_loss_clip": 0.06269994, + "balance_loss_mlp": 0.01255167, + "epoch": 0.8983616413647978, + "flos": 28957604849280.0, + "grad_norm": 1.9606451344783369, + "language_loss": 0.73512655, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.81180304, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09899902, + "step": 14942, + "time_per_iteration": 2.569955587387085 + }, + { + "auxiliary_loss_clip": 0.06404367, + "auxiliary_loss_mlp": 0.01264132, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01254571, + "epoch": 0.8984217646174658, + "flos": 17790820197120.0, + "grad_norm": 2.002654681143642, + "language_loss": 0.80248809, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.87917316, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09558105, + "step": 14943, + "time_per_iteration": 2.488431930541992 + }, + { + "auxiliary_loss_clip": 0.06405204, + "auxiliary_loss_mlp": 0.01269191, + "balance_loss_clip": 0.06271496, + "balance_loss_mlp": 0.01258945, + "epoch": 0.8984818878701337, + "flos": 23411311718400.0, + "grad_norm": 1.5597743070922876, + "language_loss": 0.71681154, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.7935555, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10247803, + "step": 14944, + "time_per_iteration": 2.5067203044891357 + }, + { + "auxiliary_loss_clip": 0.06407298, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06270649, + "balance_loss_mlp": 0.01254809, + "epoch": 0.8985420111228017, + "flos": 22352604180480.0, + "grad_norm": 2.1061094543474184, + "language_loss": 0.76275969, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.83948195, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10125732, + "step": 14945, + "time_per_iteration": 2.500641107559204 + }, + { + "auxiliary_loss_clip": 0.06413375, + "auxiliary_loss_mlp": 0.01269223, + "balance_loss_clip": 0.06273663, + "balance_loss_mlp": 0.01258917, + "epoch": 0.8986021343754698, + "flos": 21398381084160.0, + "grad_norm": 2.9283306664334128, + "language_loss": 0.73861766, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.81544363, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.10314941, + "step": 14946, + "time_per_iteration": 2.491835832595825 + }, + { + "auxiliary_loss_clip": 0.06401925, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.0125358, + "epoch": 0.8986622576281377, + "flos": 21331897269120.0, + "grad_norm": 2.0231368146788813, + "language_loss": 0.64790112, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.72455472, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09851074, + "step": 14947, + "time_per_iteration": 2.4926280975341797 + }, + { + "auxiliary_loss_clip": 0.06400159, + "auxiliary_loss_mlp": 0.01262141, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01253343, + "epoch": 0.8987223808808057, + "flos": 23995171768320.0, + "grad_norm": 1.824984607909439, + "language_loss": 0.70089561, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.77751863, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08795166, + "step": 14948, + "time_per_iteration": 2.546989679336548 + }, + { + "auxiliary_loss_clip": 0.06400745, + "auxiliary_loss_mlp": 0.01263069, + "balance_loss_clip": 0.06270957, + "balance_loss_mlp": 0.01253985, + "epoch": 0.8987825041334736, + "flos": 41510679776640.0, + "grad_norm": 1.7578036541733197, + "language_loss": 0.74855787, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.82519603, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09082031, + "step": 14949, + "time_per_iteration": 2.698312997817993 + }, + { + "auxiliary_loss_clip": 0.06405021, + "auxiliary_loss_mlp": 0.01265803, + "balance_loss_clip": 0.06272099, + "balance_loss_mlp": 0.01254871, + "epoch": 0.8988426273861416, + "flos": 27571817698560.0, + "grad_norm": 1.6083544850300273, + "language_loss": 0.75579, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.83249831, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10925293, + "step": 14950, + "time_per_iteration": 2.553276777267456 + }, + { + "auxiliary_loss_clip": 0.06403638, + "auxiliary_loss_mlp": 0.01263025, + "balance_loss_clip": 0.06273642, + "balance_loss_mlp": 0.01254541, + "epoch": 0.8989027506388095, + "flos": 17098408782720.0, + "grad_norm": 1.6063948284230318, + "language_loss": 0.66535282, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.74201941, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08486938, + "step": 14951, + "time_per_iteration": 2.487602472305298 + }, + { + "auxiliary_loss_clip": 0.06404173, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06266937, + "balance_loss_mlp": 0.01253956, + "epoch": 0.8989628738914776, + "flos": 20560843198080.0, + "grad_norm": 1.8566559318875047, + "language_loss": 0.74081647, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.81749177, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.09393311, + "step": 14952, + "time_per_iteration": 4.035311937332153 + }, + { + "auxiliary_loss_clip": 0.06401406, + "auxiliary_loss_mlp": 0.01264061, + "balance_loss_clip": 0.06271611, + "balance_loss_mlp": 0.0125493, + "epoch": 0.8990229971441455, + "flos": 16256300849280.0, + "grad_norm": 2.420734028106449, + "language_loss": 0.56859446, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.64524913, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09136963, + "step": 14953, + "time_per_iteration": 2.4741392135620117 + }, + { + "auxiliary_loss_clip": 0.06400678, + "auxiliary_loss_mlp": 0.01262102, + "balance_loss_clip": 0.06271634, + "balance_loss_mlp": 0.0125247, + "epoch": 0.8990831203968135, + "flos": 21987817430400.0, + "grad_norm": 1.7963505164231723, + "language_loss": 0.82287514, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.89950299, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09637451, + "step": 14954, + "time_per_iteration": 2.5389609336853027 + }, + { + "auxiliary_loss_clip": 0.06401017, + "auxiliary_loss_mlp": 0.01266641, + "balance_loss_clip": 0.06273876, + "balance_loss_mlp": 0.0125748, + "epoch": 0.8991432436494814, + "flos": 27453413479680.0, + "grad_norm": 1.9061442567744085, + "language_loss": 0.60138369, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.67806023, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09155273, + "step": 14955, + "time_per_iteration": 4.0195207595825195 + }, + { + "auxiliary_loss_clip": 0.06399333, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.0627117, + "balance_loss_mlp": 0.01253636, + "epoch": 0.8992033669021494, + "flos": 21586245937920.0, + "grad_norm": 1.9656216250623941, + "language_loss": 0.55445802, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.63107967, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09191895, + "step": 14956, + "time_per_iteration": 2.485630989074707 + }, + { + "auxiliary_loss_clip": 0.06401742, + "auxiliary_loss_mlp": 0.01259934, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01250648, + "epoch": 0.8992634901548173, + "flos": 28591644142080.0, + "grad_norm": 1.9186757999102584, + "language_loss": 0.80292857, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.87954533, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09283447, + "step": 14957, + "time_per_iteration": 2.549023151397705 + }, + { + "auxiliary_loss_clip": 0.06405012, + "auxiliary_loss_mlp": 0.01264553, + "balance_loss_clip": 0.06270377, + "balance_loss_mlp": 0.01254486, + "epoch": 0.8993236134074853, + "flos": 19873966152960.0, + "grad_norm": 1.606617914343127, + "language_loss": 0.79137737, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.86807305, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10070801, + "step": 14958, + "time_per_iteration": 3.8852593898773193 + }, + { + "auxiliary_loss_clip": 0.06400818, + "auxiliary_loss_mlp": 0.0126224, + "balance_loss_clip": 0.06271718, + "balance_loss_mlp": 0.01253407, + "epoch": 0.8993837366601534, + "flos": 19396686896640.0, + "grad_norm": 2.106043903727993, + "language_loss": 0.74878645, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.82541704, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08837891, + "step": 14959, + "time_per_iteration": 2.491607427597046 + }, + { + "auxiliary_loss_clip": 0.06397241, + "auxiliary_loss_mlp": 0.01264655, + "balance_loss_clip": 0.06269586, + "balance_loss_mlp": 0.01255464, + "epoch": 0.8994438599128213, + "flos": 18557681564160.0, + "grad_norm": 1.7970677871166365, + "language_loss": 0.68824446, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.76486343, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09185791, + "step": 14960, + "time_per_iteration": 2.473184823989868 + }, + { + "auxiliary_loss_clip": 0.06401291, + "auxiliary_loss_mlp": 0.01261175, + "balance_loss_clip": 0.06271642, + "balance_loss_mlp": 0.01252157, + "epoch": 0.8995039831654893, + "flos": 24434785814400.0, + "grad_norm": 1.656362738673528, + "language_loss": 0.66098744, + "learning_rate": 1.049510991294591e-07, + "loss": 0.73761213, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09014893, + "step": 14961, + "time_per_iteration": 2.66253662109375 + }, + { + "auxiliary_loss_clip": 0.06398708, + "auxiliary_loss_mlp": 0.01265611, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01257105, + "epoch": 0.8995641064181572, + "flos": 21257656951680.0, + "grad_norm": 1.4284268544780132, + "language_loss": 0.83220261, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.90884578, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08505249, + "step": 14962, + "time_per_iteration": 2.490577220916748 + }, + { + "auxiliary_loss_clip": 0.06411661, + "auxiliary_loss_mlp": 0.0126439, + "balance_loss_clip": 0.06276189, + "balance_loss_mlp": 0.01253924, + "epoch": 0.8996242296708252, + "flos": 23520408134400.0, + "grad_norm": 1.7909885664561782, + "language_loss": 0.76536137, + "learning_rate": 1.047022340612298e-07, + "loss": 0.84212184, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10473633, + "step": 14963, + "time_per_iteration": 2.548292398452759 + }, + { + "auxiliary_loss_clip": 0.06311448, + "auxiliary_loss_mlp": 0.01255845, + "balance_loss_clip": 0.06257099, + "balance_loss_mlp": 0.01254884, + "epoch": 0.8996843529234931, + "flos": 62421872094720.0, + "grad_norm": 0.7636131914060387, + "language_loss": 0.57454842, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.65022135, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00959778, + "step": 14964, + "time_per_iteration": 2.9614195823669434 + }, + { + "auxiliary_loss_clip": 0.06410883, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 0.06273533, + "balance_loss_mlp": 0.01254307, + "epoch": 0.8997444761761612, + "flos": 24242602475520.0, + "grad_norm": 2.33036033552358, + "language_loss": 0.68011808, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.75687134, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10140991, + "step": 14965, + "time_per_iteration": 2.4943199157714844 + }, + { + "auxiliary_loss_clip": 0.06403812, + "auxiliary_loss_mlp": 0.01265866, + "balance_loss_clip": 0.06271215, + "balance_loss_mlp": 0.01256216, + "epoch": 0.8998045994288291, + "flos": 21367508054400.0, + "grad_norm": 2.8566612226019354, + "language_loss": 0.72390759, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.8006044, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09649658, + "step": 14966, + "time_per_iteration": 2.480290412902832 + }, + { + "auxiliary_loss_clip": 0.06401855, + "auxiliary_loss_mlp": 0.01271387, + "balance_loss_clip": 0.06271008, + "balance_loss_mlp": 0.01261618, + "epoch": 0.8998647226814971, + "flos": 28993760686080.0, + "grad_norm": 1.6461811578416619, + "language_loss": 0.7351234, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.81185579, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09765625, + "step": 14967, + "time_per_iteration": 2.5578274726867676 + }, + { + "auxiliary_loss_clip": 0.06400469, + "auxiliary_loss_mlp": 0.01260803, + "balance_loss_clip": 0.06269619, + "balance_loss_mlp": 0.01252011, + "epoch": 0.899924845934165, + "flos": 13630985049600.0, + "grad_norm": 1.7161192874601998, + "language_loss": 0.72534561, + "learning_rate": 1.040813291960323e-07, + "loss": 0.80195838, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08789062, + "step": 14968, + "time_per_iteration": 2.4440808296203613 + }, + { + "auxiliary_loss_clip": 0.06403413, + "auxiliary_loss_mlp": 0.01266071, + "balance_loss_clip": 0.06271084, + "balance_loss_mlp": 0.01256904, + "epoch": 0.899984969186833, + "flos": 20888258227200.0, + "grad_norm": 1.7973658286855019, + "language_loss": 0.71199846, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.78869331, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09173584, + "step": 14969, + "time_per_iteration": 2.4951353073120117 + }, + { + "auxiliary_loss_clip": 0.06404494, + "auxiliary_loss_mlp": 0.01261784, + "balance_loss_clip": 0.06270813, + "balance_loss_mlp": 0.01252337, + "epoch": 0.9000450924395009, + "flos": 20927894008320.0, + "grad_norm": 1.857601731037714, + "language_loss": 0.76268947, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.83935225, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09448242, + "step": 14970, + "time_per_iteration": 2.480330467224121 + }, + { + "auxiliary_loss_clip": 0.06401761, + "auxiliary_loss_mlp": 0.01264551, + "balance_loss_clip": 0.06269549, + "balance_loss_mlp": 0.01255348, + "epoch": 0.900105215692169, + "flos": 17170720456320.0, + "grad_norm": 1.5818903114690037, + "language_loss": 0.73086268, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.80752581, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09204102, + "step": 14971, + "time_per_iteration": 2.512716293334961 + }, + { + "auxiliary_loss_clip": 0.06404724, + "auxiliary_loss_mlp": 0.01264534, + "balance_loss_clip": 0.06273608, + "balance_loss_mlp": 0.01254175, + "epoch": 0.900165338944837, + "flos": 19937053877760.0, + "grad_norm": 1.9745289708509002, + "language_loss": 0.82069004, + "learning_rate": 1.035858993572476e-07, + "loss": 0.89738262, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10357666, + "step": 14972, + "time_per_iteration": 2.4601757526397705 + }, + { + "auxiliary_loss_clip": 0.06408463, + "auxiliary_loss_mlp": 0.01264926, + "balance_loss_clip": 0.06272122, + "balance_loss_mlp": 0.01255592, + "epoch": 0.9002254621975049, + "flos": 16112599896960.0, + "grad_norm": 1.8818540963205237, + "language_loss": 0.81552333, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.89225721, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09332275, + "step": 14973, + "time_per_iteration": 2.475221633911133 + }, + { + "auxiliary_loss_clip": 0.06398419, + "auxiliary_loss_mlp": 0.01266711, + "balance_loss_clip": 0.06268209, + "balance_loss_mlp": 0.0125693, + "epoch": 0.9002855854501729, + "flos": 28483763610240.0, + "grad_norm": 1.7760523165463304, + "language_loss": 0.58510089, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.66175216, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09783936, + "step": 14974, + "time_per_iteration": 2.5352773666381836 + }, + { + "auxiliary_loss_clip": 0.06406291, + "auxiliary_loss_mlp": 0.01265924, + "balance_loss_clip": 0.0627301, + "balance_loss_mlp": 0.01256643, + "epoch": 0.9003457087028408, + "flos": 25637487793920.0, + "grad_norm": 1.8033115500772146, + "language_loss": 0.63577545, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.71249753, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09283447, + "step": 14975, + "time_per_iteration": 2.5415873527526855 + }, + { + "auxiliary_loss_clip": 0.06403071, + "auxiliary_loss_mlp": 0.01265131, + "balance_loss_clip": 0.06269182, + "balance_loss_mlp": 0.01255445, + "epoch": 0.9004058319555088, + "flos": 24396323990400.0, + "grad_norm": 1.51522570202554, + "language_loss": 0.72969091, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.80637288, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09686279, + "step": 14976, + "time_per_iteration": 2.5184712409973145 + }, + { + "auxiliary_loss_clip": 0.06402969, + "auxiliary_loss_mlp": 0.01264535, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01255719, + "epoch": 0.9004659552081767, + "flos": 29066994754560.0, + "grad_norm": 1.8113879200430405, + "language_loss": 0.69898343, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.77565849, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08813477, + "step": 14977, + "time_per_iteration": 2.5655102729797363 + }, + { + "auxiliary_loss_clip": 0.06403376, + "auxiliary_loss_mlp": 0.0126245, + "balance_loss_clip": 0.06269954, + "balance_loss_mlp": 0.0125227, + "epoch": 0.9005260784608448, + "flos": 16769484380160.0, + "grad_norm": 4.2700223305485485, + "language_loss": 0.65910697, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.73576528, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10174561, + "step": 14978, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.06405294, + "auxiliary_loss_mlp": 0.01266515, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01256216, + "epoch": 0.9005862017135127, + "flos": 20382244220160.0, + "grad_norm": 1.7055654083923508, + "language_loss": 0.79123801, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.86795604, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10284424, + "step": 14979, + "time_per_iteration": 2.4874277114868164 + }, + { + "auxiliary_loss_clip": 0.06307672, + "auxiliary_loss_mlp": 0.012518, + "balance_loss_clip": 0.06253401, + "balance_loss_mlp": 0.0125079, + "epoch": 0.9006463249661807, + "flos": 67599101917440.0, + "grad_norm": 0.7137395392285222, + "language_loss": 0.52951163, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.60510641, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01009369, + "step": 14980, + "time_per_iteration": 3.1869754791259766 + }, + { + "auxiliary_loss_clip": 0.06407195, + "auxiliary_loss_mlp": 0.0126926, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01259426, + "epoch": 0.9007064482188486, + "flos": 28300888074240.0, + "grad_norm": 1.5679808464329743, + "language_loss": 0.82694447, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.90370905, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.0982666, + "step": 14981, + "time_per_iteration": 3.9160542488098145 + }, + { + "auxiliary_loss_clip": 0.06398074, + "auxiliary_loss_mlp": 0.01265032, + "balance_loss_clip": 0.06271156, + "balance_loss_mlp": 0.01255639, + "epoch": 0.9007665714715166, + "flos": 21622737191040.0, + "grad_norm": 1.400293048529382, + "language_loss": 0.81589913, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.8925302, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09387207, + "step": 14982, + "time_per_iteration": 2.5058610439300537 + }, + { + "auxiliary_loss_clip": 0.06395832, + "auxiliary_loss_mlp": 0.01263704, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.01255037, + "epoch": 0.9008266947241845, + "flos": 26549098289280.0, + "grad_norm": 1.7768075203157598, + "language_loss": 0.7178492, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.79444456, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.08660889, + "step": 14983, + "time_per_iteration": 2.5296106338500977 + }, + { + "auxiliary_loss_clip": 0.06400231, + "auxiliary_loss_mlp": 0.01265711, + "balance_loss_clip": 0.06272098, + "balance_loss_mlp": 0.01257039, + "epoch": 0.9008868179768525, + "flos": 23116907998080.0, + "grad_norm": 1.269960431360642, + "language_loss": 0.75048274, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.82714218, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08660889, + "step": 14984, + "time_per_iteration": 2.544950246810913 + }, + { + "auxiliary_loss_clip": 0.06398641, + "auxiliary_loss_mlp": 0.01266345, + "balance_loss_clip": 0.06270674, + "balance_loss_mlp": 0.01256802, + "epoch": 0.9009469412295206, + "flos": 19066546609920.0, + "grad_norm": 1.3816348199344486, + "language_loss": 0.70344037, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.78009021, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09552002, + "step": 14985, + "time_per_iteration": 2.4892797470092773 + }, + { + "auxiliary_loss_clip": 0.06402488, + "auxiliary_loss_mlp": 0.01263035, + "balance_loss_clip": 0.06268957, + "balance_loss_mlp": 0.01253617, + "epoch": 0.9010070644821885, + "flos": 23229065088000.0, + "grad_norm": 1.882791144388424, + "language_loss": 0.70384359, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7804988, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09411621, + "step": 14986, + "time_per_iteration": 2.504513740539551 + }, + { + "auxiliary_loss_clip": 0.06405906, + "auxiliary_loss_mlp": 0.01263679, + "balance_loss_clip": 0.06271657, + "balance_loss_mlp": 0.01254661, + "epoch": 0.9010671877348565, + "flos": 17390674224000.0, + "grad_norm": 1.5819824224389398, + "language_loss": 0.76687872, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.84357452, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09020996, + "step": 14987, + "time_per_iteration": 2.469608783721924 + }, + { + "auxiliary_loss_clip": 0.06404476, + "auxiliary_loss_mlp": 0.01265663, + "balance_loss_clip": 0.06274141, + "balance_loss_mlp": 0.01256537, + "epoch": 0.9011273109875244, + "flos": 21914625288960.0, + "grad_norm": 1.7711059610657074, + "language_loss": 0.74044967, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.81715107, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09118652, + "step": 14988, + "time_per_iteration": 2.5563955307006836 + }, + { + "auxiliary_loss_clip": 0.06406365, + "auxiliary_loss_mlp": 0.01266135, + "balance_loss_clip": 0.06271102, + "balance_loss_mlp": 0.01255574, + "epoch": 0.9011874342401924, + "flos": 24067651150080.0, + "grad_norm": 1.7919012597313317, + "language_loss": 0.6937961, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.7705211, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10565186, + "step": 14989, + "time_per_iteration": 2.521286725997925 + }, + { + "auxiliary_loss_clip": 0.06404412, + "auxiliary_loss_mlp": 0.01264705, + "balance_loss_clip": 0.06270802, + "balance_loss_mlp": 0.01255889, + "epoch": 0.9012475574928603, + "flos": 16763572667520.0, + "grad_norm": 1.9829784311923562, + "language_loss": 0.80470562, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.88139677, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.08825684, + "step": 14990, + "time_per_iteration": 2.47330379486084 + }, + { + "auxiliary_loss_clip": 0.06409752, + "auxiliary_loss_mlp": 0.01264204, + "balance_loss_clip": 0.06274055, + "balance_loss_mlp": 0.01254489, + "epoch": 0.9013076807455284, + "flos": 19976689658880.0, + "grad_norm": 1.6858389926968038, + "language_loss": 0.78232729, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.85906684, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09710693, + "step": 14991, + "time_per_iteration": 2.4976749420166016 + }, + { + "auxiliary_loss_clip": 0.06310493, + "auxiliary_loss_mlp": 0.01250757, + "balance_loss_clip": 0.06256165, + "balance_loss_mlp": 0.01249734, + "epoch": 0.9013678039981963, + "flos": 65200070868480.0, + "grad_norm": 0.771418761968222, + "language_loss": 0.59844536, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.67405784, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01023102, + "step": 14992, + "time_per_iteration": 4.404303073883057 + }, + { + "auxiliary_loss_clip": 0.06398614, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06269006, + "balance_loss_mlp": 0.01256423, + "epoch": 0.9014279272508643, + "flos": 20527370691840.0, + "grad_norm": 2.0885867633446833, + "language_loss": 0.83284277, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.90949053, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09741211, + "step": 14993, + "time_per_iteration": 2.501352071762085 + }, + { + "auxiliary_loss_clip": 0.06403168, + "auxiliary_loss_mlp": 0.01264173, + "balance_loss_clip": 0.06271326, + "balance_loss_mlp": 0.0125457, + "epoch": 0.9014880505035322, + "flos": 17314421408640.0, + "grad_norm": 1.9120593810256001, + "language_loss": 0.73393512, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.81060851, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09606934, + "step": 14994, + "time_per_iteration": 2.463254928588867 + }, + { + "auxiliary_loss_clip": 0.06396592, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 0.06269167, + "balance_loss_mlp": 0.01255076, + "epoch": 0.9015481737562002, + "flos": 28410445687680.0, + "grad_norm": 15.899977864830745, + "language_loss": 0.64903772, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.72564423, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08978271, + "step": 14995, + "time_per_iteration": 3.9784598350524902 + }, + { + "auxiliary_loss_clip": 0.06406161, + "auxiliary_loss_mlp": 0.01265735, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01256371, + "epoch": 0.9016082970088681, + "flos": 29760454344960.0, + "grad_norm": 1.9295019510354385, + "language_loss": 0.67002177, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.7467407, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09368896, + "step": 14996, + "time_per_iteration": 2.565322160720825 + }, + { + "auxiliary_loss_clip": 0.06399288, + "auxiliary_loss_mlp": 0.01261496, + "balance_loss_clip": 0.06269487, + "balance_loss_mlp": 0.01252764, + "epoch": 0.9016684202615362, + "flos": 23519905009920.0, + "grad_norm": 1.9107555524376416, + "language_loss": 0.66491365, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.74152148, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08743286, + "step": 14997, + "time_per_iteration": 3.9145309925079346 + }, + { + "auxiliary_loss_clip": 0.06399675, + "auxiliary_loss_mlp": 0.01265504, + "balance_loss_clip": 0.06270206, + "balance_loss_mlp": 0.01255658, + "epoch": 0.9017285435142042, + "flos": 16984323048960.0, + "grad_norm": 1.6649709431983433, + "language_loss": 0.77622521, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.85287696, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09851074, + "step": 14998, + "time_per_iteration": 2.529517412185669 + }, + { + "auxiliary_loss_clip": 0.06404671, + "auxiliary_loss_mlp": 0.01262218, + "balance_loss_clip": 0.06271236, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9017886667668721, + "flos": 21399051916800.0, + "grad_norm": 1.59161018782867, + "language_loss": 0.75096691, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.82763588, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09387207, + "step": 14999, + "time_per_iteration": 2.490741014480591 + }, + { + "auxiliary_loss_clip": 0.06399871, + "auxiliary_loss_mlp": 0.01266503, + "balance_loss_clip": 0.06271258, + "balance_loss_mlp": 0.01256942, + "epoch": 0.9018487900195401, + "flos": 21002972866560.0, + "grad_norm": 2.103504102903878, + "language_loss": 0.75620949, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.83287323, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09564209, + "step": 15000, + "time_per_iteration": 2.588860511779785 + }, + { + "auxiliary_loss_clip": 0.0639964, + "auxiliary_loss_mlp": 0.01266266, + "balance_loss_clip": 0.06270966, + "balance_loss_mlp": 0.01257296, + "epoch": 0.901908913272208, + "flos": 53370085478400.0, + "grad_norm": 2.339615199248997, + "language_loss": 0.81363082, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.8902899, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08966064, + "step": 15001, + "time_per_iteration": 2.7768962383270264 + }, + { + "auxiliary_loss_clip": 0.06399134, + "auxiliary_loss_mlp": 0.01263715, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01254906, + "epoch": 0.901969036524876, + "flos": 22096201086720.0, + "grad_norm": 1.7575723088457134, + "language_loss": 0.78756481, + "learning_rate": 9.990687143794407e-08, + "loss": 0.86419332, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0880127, + "step": 15002, + "time_per_iteration": 2.5058481693267822 + }, + { + "auxiliary_loss_clip": 0.0640651, + "auxiliary_loss_mlp": 0.01263017, + "balance_loss_clip": 0.06274793, + "balance_loss_mlp": 0.0125295, + "epoch": 0.9020291597775439, + "flos": 23840653639680.0, + "grad_norm": 1.9143661946542763, + "language_loss": 0.68313885, + "learning_rate": 9.978535328195347e-08, + "loss": 0.75983411, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10076904, + "step": 15003, + "time_per_iteration": 2.474975824356079 + }, + { + "auxiliary_loss_clip": 0.064068, + "auxiliary_loss_mlp": 0.01263994, + "balance_loss_clip": 0.06272157, + "balance_loss_mlp": 0.01254171, + "epoch": 0.902089283030212, + "flos": 18330767907840.0, + "grad_norm": 1.6505314719382027, + "language_loss": 0.86296797, + "learning_rate": 9.9663907182292e-08, + "loss": 0.93967593, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09820557, + "step": 15004, + "time_per_iteration": 2.4478914737701416 + }, + { + "auxiliary_loss_clip": 0.06403968, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 0.06270397, + "balance_loss_mlp": 0.01255612, + "epoch": 0.9021494062828799, + "flos": 24177208763520.0, + "grad_norm": 2.1879472494001546, + "language_loss": 0.72795928, + "learning_rate": 9.954253314356575e-08, + "loss": 0.80465186, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09686279, + "step": 15005, + "time_per_iteration": 2.4971089363098145 + }, + { + "auxiliary_loss_clip": 0.06404206, + "auxiliary_loss_mlp": 0.01265568, + "balance_loss_clip": 0.06268016, + "balance_loss_mlp": 0.01255602, + "epoch": 0.9022095295355479, + "flos": 21623366096640.0, + "grad_norm": 1.793458776106301, + "language_loss": 0.71351212, + "learning_rate": 9.942123117037748e-08, + "loss": 0.79020989, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.09967041, + "step": 15006, + "time_per_iteration": 2.4973998069763184 + }, + { + "auxiliary_loss_clip": 0.06405459, + "auxiliary_loss_mlp": 0.01263428, + "balance_loss_clip": 0.06272218, + "balance_loss_mlp": 0.01254947, + "epoch": 0.9022696527882158, + "flos": 18730871953920.0, + "grad_norm": 1.8715422678325178, + "language_loss": 0.84960949, + "learning_rate": 9.930000126732618e-08, + "loss": 0.92629838, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.08477783, + "step": 15007, + "time_per_iteration": 2.4507057666778564 + }, + { + "auxiliary_loss_clip": 0.06399123, + "auxiliary_loss_mlp": 0.01264283, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9023297760408838, + "flos": 26768548932480.0, + "grad_norm": 1.4952724913749835, + "language_loss": 0.78544199, + "learning_rate": 9.917884343900928e-08, + "loss": 0.8620761, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08966064, + "step": 15008, + "time_per_iteration": 2.5391016006469727 + }, + { + "auxiliary_loss_clip": 0.06395697, + "auxiliary_loss_mlp": 0.01263912, + "balance_loss_clip": 0.06271064, + "balance_loss_mlp": 0.01255159, + "epoch": 0.9023898992935517, + "flos": 20528921992320.0, + "grad_norm": 2.089305963207464, + "language_loss": 0.73686892, + "learning_rate": 9.905775769002156e-08, + "loss": 0.813465, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08743286, + "step": 15009, + "time_per_iteration": 2.4703476428985596 + }, + { + "auxiliary_loss_clip": 0.06399488, + "auxiliary_loss_mlp": 0.01262587, + "balance_loss_clip": 0.06270318, + "balance_loss_mlp": 0.01252937, + "epoch": 0.9024500225462198, + "flos": 17462315064960.0, + "grad_norm": 1.6513544611324535, + "language_loss": 0.73667175, + "learning_rate": 9.893674402495399e-08, + "loss": 0.8132925, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09649658, + "step": 15010, + "time_per_iteration": 2.5722885131835938 + }, + { + "auxiliary_loss_clip": 0.06401055, + "auxiliary_loss_mlp": 0.01263209, + "balance_loss_clip": 0.06269281, + "balance_loss_mlp": 0.01253685, + "epoch": 0.9025101457988878, + "flos": 20819887695360.0, + "grad_norm": 1.8237598528390848, + "language_loss": 0.74242365, + "learning_rate": 9.881580244839538e-08, + "loss": 0.81906629, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09521484, + "step": 15011, + "time_per_iteration": 2.4827427864074707 + }, + { + "auxiliary_loss_clip": 0.06407499, + "auxiliary_loss_mlp": 0.01263501, + "balance_loss_clip": 0.0627194, + "balance_loss_mlp": 0.01253529, + "epoch": 0.9025702690515557, + "flos": 19032445198080.0, + "grad_norm": 1.7995959341286187, + "language_loss": 0.73437095, + "learning_rate": 9.869493296493204e-08, + "loss": 0.81108093, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.09979248, + "step": 15012, + "time_per_iteration": 2.4940521717071533 + }, + { + "auxiliary_loss_clip": 0.06397925, + "auxiliary_loss_mlp": 0.01264675, + "balance_loss_clip": 0.06269205, + "balance_loss_mlp": 0.01255406, + "epoch": 0.9026303923042237, + "flos": 19688952337920.0, + "grad_norm": 1.48602837314537, + "language_loss": 0.69452763, + "learning_rate": 9.857413557914763e-08, + "loss": 0.77115357, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09259033, + "step": 15013, + "time_per_iteration": 2.4835736751556396 + }, + { + "auxiliary_loss_clip": 0.06398869, + "auxiliary_loss_mlp": 0.01260522, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01251594, + "epoch": 0.9026905155568916, + "flos": 24615019946880.0, + "grad_norm": 1.7650439718162378, + "language_loss": 0.73028564, + "learning_rate": 9.845341029562249e-08, + "loss": 0.80687964, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0892334, + "step": 15014, + "time_per_iteration": 2.545559883117676 + }, + { + "auxiliary_loss_clip": 0.06403096, + "auxiliary_loss_mlp": 0.01264563, + "balance_loss_clip": 0.06270044, + "balance_loss_mlp": 0.01254896, + "epoch": 0.9027506388095596, + "flos": 20528041524480.0, + "grad_norm": 1.7507431286300652, + "language_loss": 0.72524196, + "learning_rate": 9.833275711893474e-08, + "loss": 0.80191857, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09661865, + "step": 15015, + "time_per_iteration": 2.4903807640075684 + }, + { + "auxiliary_loss_clip": 0.06400931, + "auxiliary_loss_mlp": 0.01265325, + "balance_loss_clip": 0.06269611, + "balance_loss_mlp": 0.01256021, + "epoch": 0.9028107620622275, + "flos": 22791211977600.0, + "grad_norm": 2.296107301723219, + "language_loss": 0.69238591, + "learning_rate": 9.821217605365895e-08, + "loss": 0.76904845, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09295654, + "step": 15016, + "time_per_iteration": 2.504646062850952 + }, + { + "auxiliary_loss_clip": 0.06400882, + "auxiliary_loss_mlp": 0.01265162, + "balance_loss_clip": 0.06271025, + "balance_loss_mlp": 0.01256323, + "epoch": 0.9028708853148956, + "flos": 25417534026240.0, + "grad_norm": 1.7870514242976832, + "language_loss": 0.70508265, + "learning_rate": 9.809166710436855e-08, + "loss": 0.78174311, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08837891, + "step": 15017, + "time_per_iteration": 2.5365939140319824 + }, + { + "auxiliary_loss_clip": 0.06402348, + "auxiliary_loss_mlp": 0.01266381, + "balance_loss_clip": 0.06272686, + "balance_loss_mlp": 0.01256856, + "epoch": 0.9029310085675635, + "flos": 21877714765440.0, + "grad_norm": 1.5325047994601255, + "language_loss": 0.69792432, + "learning_rate": 9.797123027563237e-08, + "loss": 0.77461159, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09527588, + "step": 15018, + "time_per_iteration": 2.566941738128662 + }, + { + "auxiliary_loss_clip": 0.06402241, + "auxiliary_loss_mlp": 0.01263584, + "balance_loss_clip": 0.06271377, + "balance_loss_mlp": 0.01254047, + "epoch": 0.9029911318202315, + "flos": 26221725187200.0, + "grad_norm": 1.7617066238132792, + "language_loss": 0.69269657, + "learning_rate": 9.785086557201782e-08, + "loss": 0.76935482, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09545898, + "step": 15019, + "time_per_iteration": 2.5253076553344727 + }, + { + "auxiliary_loss_clip": 0.06397457, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01256363, + "epoch": 0.9030512550728994, + "flos": 15966886446720.0, + "grad_norm": 1.889114929079113, + "language_loss": 0.7230109, + "learning_rate": 9.773057299808951e-08, + "loss": 0.79963273, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08374023, + "step": 15020, + "time_per_iteration": 2.468628406524658 + }, + { + "auxiliary_loss_clip": 0.06404897, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 0.06270586, + "balance_loss_mlp": 0.01258916, + "epoch": 0.9031113783255674, + "flos": 23994375154560.0, + "grad_norm": 1.4194454202400997, + "language_loss": 0.74583924, + "learning_rate": 9.7610352558408e-08, + "loss": 0.82257438, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09698486, + "step": 15021, + "time_per_iteration": 3.985873222351074 + }, + { + "auxiliary_loss_clip": 0.06407882, + "auxiliary_loss_mlp": 0.01264222, + "balance_loss_clip": 0.06272886, + "balance_loss_mlp": 0.01254, + "epoch": 0.9031715015782353, + "flos": 22243843180800.0, + "grad_norm": 2.5045903448395137, + "language_loss": 0.73161501, + "learning_rate": 9.749020425753251e-08, + "loss": 0.80833614, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10217285, + "step": 15022, + "time_per_iteration": 2.5113275051116943 + }, + { + "auxiliary_loss_clip": 0.06393677, + "auxiliary_loss_mlp": 0.01267404, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01257975, + "epoch": 0.9032316248309034, + "flos": 26330402332800.0, + "grad_norm": 3.967318803725848, + "language_loss": 0.72854298, + "learning_rate": 9.737012810001943e-08, + "loss": 0.80515379, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.09423828, + "step": 15023, + "time_per_iteration": 2.5420546531677246 + }, + { + "auxiliary_loss_clip": 0.06403374, + "auxiliary_loss_mlp": 0.01262483, + "balance_loss_clip": 0.06272282, + "balance_loss_mlp": 0.01253543, + "epoch": 0.9032917480835713, + "flos": 22643066759040.0, + "grad_norm": 1.6550162923878977, + "language_loss": 0.83047354, + "learning_rate": 9.725012409042155e-08, + "loss": 0.90713215, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08929443, + "step": 15024, + "time_per_iteration": 2.4915647506713867 + }, + { + "auxiliary_loss_clip": 0.06401648, + "auxiliary_loss_mlp": 0.0126249, + "balance_loss_clip": 0.06268153, + "balance_loss_mlp": 0.01253245, + "epoch": 0.9033518713362393, + "flos": 23885614154880.0, + "grad_norm": 1.4118760042972751, + "language_loss": 0.69764483, + "learning_rate": 9.713019223328966e-08, + "loss": 0.77428621, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09246826, + "step": 15025, + "time_per_iteration": 2.5418436527252197 + }, + { + "auxiliary_loss_clip": 0.0639978, + "auxiliary_loss_mlp": 0.01265465, + "balance_loss_clip": 0.06270677, + "balance_loss_mlp": 0.01256614, + "epoch": 0.9034119945889073, + "flos": 26912333738880.0, + "grad_norm": 1.6472456604256864, + "language_loss": 0.77497172, + "learning_rate": 9.70103325331717e-08, + "loss": 0.85162413, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08856201, + "step": 15026, + "time_per_iteration": 2.542853355407715 + }, + { + "auxiliary_loss_clip": 0.0640185, + "auxiliary_loss_mlp": 0.01264911, + "balance_loss_clip": 0.06272145, + "balance_loss_mlp": 0.01255636, + "epoch": 0.9034721178415752, + "flos": 20856462802560.0, + "grad_norm": 1.7153056741233828, + "language_loss": 0.69028974, + "learning_rate": 9.68905449946129e-08, + "loss": 0.76695728, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09265137, + "step": 15027, + "time_per_iteration": 2.541903018951416 + }, + { + "auxiliary_loss_clip": 0.06398702, + "auxiliary_loss_mlp": 0.01262434, + "balance_loss_clip": 0.06273375, + "balance_loss_mlp": 0.01253147, + "epoch": 0.9035322410942432, + "flos": 22240447090560.0, + "grad_norm": 1.5068481483988292, + "language_loss": 0.75781077, + "learning_rate": 9.677082962215477e-08, + "loss": 0.83442211, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.09283447, + "step": 15028, + "time_per_iteration": 2.5198581218719482 + }, + { + "auxiliary_loss_clip": 0.06401777, + "auxiliary_loss_mlp": 0.01264092, + "balance_loss_clip": 0.06272782, + "balance_loss_mlp": 0.01254365, + "epoch": 0.9035923643469111, + "flos": 25930843338240.0, + "grad_norm": 1.6223052048522015, + "language_loss": 0.69506884, + "learning_rate": 9.665118642033765e-08, + "loss": 0.77172744, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09735107, + "step": 15029, + "time_per_iteration": 2.587470531463623 + }, + { + "auxiliary_loss_clip": 0.06409352, + "auxiliary_loss_mlp": 0.01263235, + "balance_loss_clip": 0.06274136, + "balance_loss_mlp": 0.01253246, + "epoch": 0.9036524875995792, + "flos": 20346088383360.0, + "grad_norm": 1.9631111792955274, + "language_loss": 0.74286699, + "learning_rate": 9.653161539369858e-08, + "loss": 0.81959289, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09991455, + "step": 15030, + "time_per_iteration": 2.503896951675415 + }, + { + "auxiliary_loss_clip": 0.06404515, + "auxiliary_loss_mlp": 0.01261624, + "balance_loss_clip": 0.0626976, + "balance_loss_mlp": 0.01251652, + "epoch": 0.9037126108522471, + "flos": 40124137939200.0, + "grad_norm": 1.6436403874655139, + "language_loss": 0.6833986, + "learning_rate": 9.641211654677151e-08, + "loss": 0.76006001, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09973145, + "step": 15031, + "time_per_iteration": 4.073733329772949 + }, + { + "auxiliary_loss_clip": 0.06398544, + "auxiliary_loss_mlp": 0.01262429, + "balance_loss_clip": 0.06269525, + "balance_loss_mlp": 0.01253322, + "epoch": 0.9037727341049151, + "flos": 23338874263680.0, + "grad_norm": 1.465363790750211, + "language_loss": 0.7664578, + "learning_rate": 9.629268988408723e-08, + "loss": 0.84306753, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09112549, + "step": 15032, + "time_per_iteration": 2.532316207885742 + }, + { + "auxiliary_loss_clip": 0.06404598, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06271706, + "balance_loss_mlp": 0.01256142, + "epoch": 0.903832857357583, + "flos": 12827506648320.0, + "grad_norm": 1.7777263252161932, + "language_loss": 0.75482416, + "learning_rate": 9.617333541017502e-08, + "loss": 0.83152729, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09564209, + "step": 15033, + "time_per_iteration": 2.4739763736724854 + }, + { + "auxiliary_loss_clip": 0.0640469, + "auxiliary_loss_mlp": 0.01261941, + "balance_loss_clip": 0.06270737, + "balance_loss_mlp": 0.01252571, + "epoch": 0.903892980610251, + "flos": 25710176810880.0, + "grad_norm": 1.6001227374225993, + "language_loss": 0.73648345, + "learning_rate": 9.605405312956105e-08, + "loss": 0.81314975, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09375, + "step": 15034, + "time_per_iteration": 2.6218338012695312 + }, + { + "auxiliary_loss_clip": 0.06400965, + "auxiliary_loss_mlp": 0.01267772, + "balance_loss_clip": 0.06270188, + "balance_loss_mlp": 0.01258414, + "epoch": 0.9039531038629189, + "flos": 14689357171200.0, + "grad_norm": 1.6406698929246424, + "language_loss": 0.63630551, + "learning_rate": 9.593484304676791e-08, + "loss": 0.71299291, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09356689, + "step": 15035, + "time_per_iteration": 3.9817230701446533 + }, + { + "auxiliary_loss_clip": 0.06408051, + "auxiliary_loss_mlp": 0.01264822, + "balance_loss_clip": 0.0627642, + "balance_loss_mlp": 0.01254773, + "epoch": 0.904013227115587, + "flos": 24031830729600.0, + "grad_norm": 2.2548052275485717, + "language_loss": 0.61979508, + "learning_rate": 9.581570516631643e-08, + "loss": 0.69652379, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.1005249, + "step": 15036, + "time_per_iteration": 2.5301129817962646 + }, + { + "auxiliary_loss_clip": 0.06398427, + "auxiliary_loss_mlp": 0.01266003, + "balance_loss_clip": 0.06272119, + "balance_loss_mlp": 0.01257683, + "epoch": 0.9040733503682549, + "flos": 22863020526720.0, + "grad_norm": 1.5445550025492283, + "language_loss": 0.8279326, + "learning_rate": 9.569663949272455e-08, + "loss": 0.90457696, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08312988, + "step": 15037, + "time_per_iteration": 3.9757161140441895 + }, + { + "auxiliary_loss_clip": 0.0640467, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.0627031, + "balance_loss_mlp": 0.01252652, + "epoch": 0.9041334736209229, + "flos": 19981175852160.0, + "grad_norm": 3.8362695019003703, + "language_loss": 0.6746912, + "learning_rate": 9.557764603050667e-08, + "loss": 0.75135684, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09240723, + "step": 15038, + "time_per_iteration": 2.483499765396118 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01264492, + "balance_loss_clip": 0.06270482, + "balance_loss_mlp": 0.01255128, + "epoch": 0.9041935968735909, + "flos": 17536387674240.0, + "grad_norm": 1.9515146557246112, + "language_loss": 0.75760317, + "learning_rate": 9.545872478417494e-08, + "loss": 0.83424991, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09356689, + "step": 15039, + "time_per_iteration": 2.4685962200164795 + }, + { + "auxiliary_loss_clip": 0.06397585, + "auxiliary_loss_mlp": 0.01264821, + "balance_loss_clip": 0.06270954, + "balance_loss_mlp": 0.0125575, + "epoch": 0.9042537201262588, + "flos": 22786138805760.0, + "grad_norm": 1.4938055012181715, + "language_loss": 0.70288754, + "learning_rate": 9.533987575823977e-08, + "loss": 0.77951157, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09069824, + "step": 15040, + "time_per_iteration": 2.491750717163086 + }, + { + "auxiliary_loss_clip": 0.0639802, + "auxiliary_loss_mlp": 0.01262156, + "balance_loss_clip": 0.06270084, + "balance_loss_mlp": 0.01252995, + "epoch": 0.9043138433789268, + "flos": 20601778717440.0, + "grad_norm": 1.6249589859719578, + "language_loss": 0.67891502, + "learning_rate": 9.522109895720709e-08, + "loss": 0.75551683, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09155273, + "step": 15041, + "time_per_iteration": 2.4903454780578613 + }, + { + "auxiliary_loss_clip": 0.06401966, + "auxiliary_loss_mlp": 0.01265404, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01255808, + "epoch": 0.9043739666315948, + "flos": 32971223422080.0, + "grad_norm": 1.8083812356166467, + "language_loss": 0.5776667, + "learning_rate": 9.510239438558155e-08, + "loss": 0.65434039, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09594727, + "step": 15042, + "time_per_iteration": 2.6052052974700928 + }, + { + "auxiliary_loss_clip": 0.06309783, + "auxiliary_loss_mlp": 0.01252944, + "balance_loss_clip": 0.0625516, + "balance_loss_mlp": 0.01251936, + "epoch": 0.9044340898842628, + "flos": 67316563549440.0, + "grad_norm": 0.7739673625252199, + "language_loss": 0.56937176, + "learning_rate": 9.498376204786351e-08, + "loss": 0.64499903, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01007843, + "step": 15043, + "time_per_iteration": 3.1082680225372314 + }, + { + "auxiliary_loss_clip": 0.06401354, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 0.06270433, + "balance_loss_mlp": 0.01252353, + "epoch": 0.9044942131369307, + "flos": 17719053575040.0, + "grad_norm": 1.5454963743123358, + "language_loss": 0.70180726, + "learning_rate": 9.486520194855274e-08, + "loss": 0.77844226, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09802246, + "step": 15044, + "time_per_iteration": 2.512294054031372 + }, + { + "auxiliary_loss_clip": 0.06407118, + "auxiliary_loss_mlp": 0.01268666, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01258509, + "epoch": 0.9045543363895987, + "flos": 17826137493120.0, + "grad_norm": 2.078656560936693, + "language_loss": 0.6995939, + "learning_rate": 9.474671409214407e-08, + "loss": 0.77635169, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10162354, + "step": 15045, + "time_per_iteration": 2.4667201042175293 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01266009, + "balance_loss_clip": 0.06270969, + "balance_loss_mlp": 0.01255948, + "epoch": 0.9046144596422666, + "flos": 21879349920000.0, + "grad_norm": 6.184482867221641, + "language_loss": 0.66192079, + "learning_rate": 9.462829848313081e-08, + "loss": 0.73862171, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10064697, + "step": 15046, + "time_per_iteration": 2.486665964126587 + }, + { + "auxiliary_loss_clip": 0.06403056, + "auxiliary_loss_mlp": 0.0126387, + "balance_loss_clip": 0.0626939, + "balance_loss_mlp": 0.0125382, + "epoch": 0.9046745828949346, + "flos": 17677866493440.0, + "grad_norm": 1.9702778577435238, + "language_loss": 0.6221115, + "learning_rate": 9.450995512600379e-08, + "loss": 0.69878078, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10058594, + "step": 15047, + "time_per_iteration": 2.4436275959014893 + }, + { + "auxiliary_loss_clip": 0.06400335, + "auxiliary_loss_mlp": 0.01266598, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01257502, + "epoch": 0.9047347061476025, + "flos": 25709631759360.0, + "grad_norm": 1.433089504689409, + "language_loss": 0.71434736, + "learning_rate": 9.439168402525032e-08, + "loss": 0.7910167, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09094238, + "step": 15048, + "time_per_iteration": 2.529222249984741 + }, + { + "auxiliary_loss_clip": 0.06401604, + "auxiliary_loss_mlp": 0.01265479, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01255853, + "epoch": 0.9047948294002706, + "flos": 15163449972480.0, + "grad_norm": 1.9513151131510529, + "language_loss": 0.75001335, + "learning_rate": 9.427348518535483e-08, + "loss": 0.82668418, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09631348, + "step": 15049, + "time_per_iteration": 2.481271743774414 + }, + { + "auxiliary_loss_clip": 0.06397744, + "auxiliary_loss_mlp": 0.01262639, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01253204, + "epoch": 0.9048549526529385, + "flos": 21878846795520.0, + "grad_norm": 2.2351800902186243, + "language_loss": 0.75558716, + "learning_rate": 9.415535861079993e-08, + "loss": 0.83219099, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09436035, + "step": 15050, + "time_per_iteration": 2.6334476470947266 + }, + { + "auxiliary_loss_clip": 0.06403841, + "auxiliary_loss_mlp": 0.01262044, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.01252353, + "epoch": 0.9049150759056065, + "flos": 23552790537600.0, + "grad_norm": 1.7362546421895346, + "language_loss": 0.82079089, + "learning_rate": 9.403730430606472e-08, + "loss": 0.89744979, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09692383, + "step": 15051, + "time_per_iteration": 2.523456573486328 + }, + { + "auxiliary_loss_clip": 0.06402219, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06270407, + "balance_loss_mlp": 0.01254336, + "epoch": 0.9049751991582745, + "flos": 19651957960320.0, + "grad_norm": 1.966519944539865, + "language_loss": 0.89343834, + "learning_rate": 9.391932227562582e-08, + "loss": 0.97009277, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08898926, + "step": 15052, + "time_per_iteration": 2.478151798248291 + }, + { + "auxiliary_loss_clip": 0.06406327, + "auxiliary_loss_mlp": 0.012654, + "balance_loss_clip": 0.06270624, + "balance_loss_mlp": 0.01255613, + "epoch": 0.9050353224109424, + "flos": 15601638499200.0, + "grad_norm": 2.0979073593011495, + "language_loss": 0.77037603, + "learning_rate": 9.380141252395724e-08, + "loss": 0.84709334, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09777832, + "step": 15053, + "time_per_iteration": 2.4709739685058594 + }, + { + "auxiliary_loss_clip": 0.06399354, + "auxiliary_loss_mlp": 0.0126397, + "balance_loss_clip": 0.06270497, + "balance_loss_mlp": 0.0125497, + "epoch": 0.9050954456636104, + "flos": 28191078898560.0, + "grad_norm": 1.875148681506397, + "language_loss": 0.73177737, + "learning_rate": 9.368357505553049e-08, + "loss": 0.80841064, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09002686, + "step": 15054, + "time_per_iteration": 2.5475215911865234 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01264626, + "balance_loss_clip": 0.06272063, + "balance_loss_mlp": 0.01255804, + "epoch": 0.9051555689162784, + "flos": 25737444115200.0, + "grad_norm": 1.5847730284358719, + "language_loss": 0.83485198, + "learning_rate": 9.356580987481333e-08, + "loss": 0.91151857, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0881958, + "step": 15055, + "time_per_iteration": 2.538119077682495 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01262787, + "balance_loss_clip": 0.06271436, + "balance_loss_mlp": 0.01253405, + "epoch": 0.9052156921689464, + "flos": 23263795405440.0, + "grad_norm": 1.5354699500322193, + "language_loss": 0.85279965, + "learning_rate": 9.344811698627176e-08, + "loss": 0.92942894, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09387207, + "step": 15056, + "time_per_iteration": 2.523686408996582 + }, + { + "auxiliary_loss_clip": 0.06402357, + "auxiliary_loss_mlp": 0.01267292, + "balance_loss_clip": 0.06270941, + "balance_loss_mlp": 0.01258047, + "epoch": 0.9052758154216143, + "flos": 29571038190720.0, + "grad_norm": 1.8112643765194574, + "language_loss": 0.72546428, + "learning_rate": 9.333049639436863e-08, + "loss": 0.80216074, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09246826, + "step": 15057, + "time_per_iteration": 2.587482213973999 + }, + { + "auxiliary_loss_clip": 0.06398334, + "auxiliary_loss_mlp": 0.01263054, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.0125434, + "epoch": 0.9053359386742823, + "flos": 22134285567360.0, + "grad_norm": 4.2714331701731885, + "language_loss": 0.81114525, + "learning_rate": 9.321294810356418e-08, + "loss": 0.88775909, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0871582, + "step": 15058, + "time_per_iteration": 2.5192415714263916 + }, + { + "auxiliary_loss_clip": 0.06307732, + "auxiliary_loss_mlp": 0.01250617, + "balance_loss_clip": 0.06253529, + "balance_loss_mlp": 0.01249746, + "epoch": 0.9053960619269502, + "flos": 67112332421760.0, + "grad_norm": 0.6586954372577108, + "language_loss": 0.51446468, + "learning_rate": 9.309547211831592e-08, + "loss": 0.59004819, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00873566, + "step": 15059, + "time_per_iteration": 3.2848002910614014 + }, + { + "auxiliary_loss_clip": 0.06403908, + "auxiliary_loss_mlp": 0.01265364, + "balance_loss_clip": 0.06271765, + "balance_loss_mlp": 0.01256018, + "epoch": 0.9054561851796182, + "flos": 15820921434240.0, + "grad_norm": 3.296870649078698, + "language_loss": 0.67341602, + "learning_rate": 9.297806844307831e-08, + "loss": 0.75010878, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09344482, + "step": 15060, + "time_per_iteration": 2.514012098312378 + }, + { + "auxiliary_loss_clip": 0.06402002, + "auxiliary_loss_mlp": 0.01265399, + "balance_loss_clip": 0.06269133, + "balance_loss_mlp": 0.01255397, + "epoch": 0.9055163084322861, + "flos": 17572837000320.0, + "grad_norm": 1.9490761162977135, + "language_loss": 0.64140469, + "learning_rate": 9.286073708230357e-08, + "loss": 0.71807867, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.10003662, + "step": 15061, + "time_per_iteration": 4.011102676391602 + }, + { + "auxiliary_loss_clip": 0.06401615, + "auxiliary_loss_mlp": 0.01264256, + "balance_loss_clip": 0.0627028, + "balance_loss_mlp": 0.01254558, + "epoch": 0.9055764316849542, + "flos": 17645358309120.0, + "grad_norm": 1.760466857694858, + "language_loss": 0.71594036, + "learning_rate": 9.274347804044058e-08, + "loss": 0.79259902, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09698486, + "step": 15062, + "time_per_iteration": 2.4741172790527344 + }, + { + "auxiliary_loss_clip": 0.06401698, + "auxiliary_loss_mlp": 0.01266798, + "balance_loss_clip": 0.0627198, + "balance_loss_mlp": 0.01257488, + "epoch": 0.9056365549376221, + "flos": 20127098937600.0, + "grad_norm": 1.6172347718122244, + "language_loss": 0.70928562, + "learning_rate": 9.2626291321936e-08, + "loss": 0.78597057, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09307861, + "step": 15063, + "time_per_iteration": 2.4766180515289307 + }, + { + "auxiliary_loss_clip": 0.06397741, + "auxiliary_loss_mlp": 0.01264342, + "balance_loss_clip": 0.0627069, + "balance_loss_mlp": 0.01255396, + "epoch": 0.9056966781902901, + "flos": 27606002964480.0, + "grad_norm": 1.5248390937922436, + "language_loss": 0.72296852, + "learning_rate": 9.250917693123406e-08, + "loss": 0.79958934, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08947754, + "step": 15064, + "time_per_iteration": 2.5452868938446045 + }, + { + "auxiliary_loss_clip": 0.06402265, + "auxiliary_loss_mlp": 0.01263796, + "balance_loss_clip": 0.06268708, + "balance_loss_mlp": 0.01255106, + "epoch": 0.9057568014429581, + "flos": 25926986050560.0, + "grad_norm": 1.7435921110411652, + "language_loss": 0.70200551, + "learning_rate": 9.23921348727752e-08, + "loss": 0.77866608, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08691406, + "step": 15065, + "time_per_iteration": 2.5181055068969727 + }, + { + "auxiliary_loss_clip": 0.06401214, + "auxiliary_loss_mlp": 0.01264477, + "balance_loss_clip": 0.06270632, + "balance_loss_mlp": 0.01254851, + "epoch": 0.905816924695626, + "flos": 22937093136000.0, + "grad_norm": 1.533976766894516, + "language_loss": 0.63432038, + "learning_rate": 9.227516515099743e-08, + "loss": 0.71097726, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09625244, + "step": 15066, + "time_per_iteration": 2.5122158527374268 + }, + { + "auxiliary_loss_clip": 0.06410006, + "auxiliary_loss_mlp": 0.01263218, + "balance_loss_clip": 0.06271099, + "balance_loss_mlp": 0.01252441, + "epoch": 0.905877047948294, + "flos": 22162894536960.0, + "grad_norm": 1.777219964068019, + "language_loss": 0.80306625, + "learning_rate": 9.215826777033675e-08, + "loss": 0.87979841, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10766602, + "step": 15067, + "time_per_iteration": 2.4910852909088135 + }, + { + "auxiliary_loss_clip": 0.06400168, + "auxiliary_loss_mlp": 0.01266388, + "balance_loss_clip": 0.06269554, + "balance_loss_mlp": 0.01256393, + "epoch": 0.905937171200962, + "flos": 15310253525760.0, + "grad_norm": 1.552097033204445, + "language_loss": 0.69955444, + "learning_rate": 9.204144273522563e-08, + "loss": 0.77621996, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09985352, + "step": 15068, + "time_per_iteration": 2.5215139389038086 + }, + { + "auxiliary_loss_clip": 0.06396197, + "auxiliary_loss_mlp": 0.01265147, + "balance_loss_clip": 0.06269008, + "balance_loss_mlp": 0.0125663, + "epoch": 0.90599729445363, + "flos": 19468914716160.0, + "grad_norm": 1.805239207493818, + "language_loss": 0.85927349, + "learning_rate": 9.19246900500943e-08, + "loss": 0.93588692, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08514404, + "step": 15069, + "time_per_iteration": 2.4659931659698486 + }, + { + "auxiliary_loss_clip": 0.06407644, + "auxiliary_loss_mlp": 0.01265898, + "balance_loss_clip": 0.0627166, + "balance_loss_mlp": 0.01255926, + "epoch": 0.9060574177062979, + "flos": 23739816850560.0, + "grad_norm": 1.8280166423907744, + "language_loss": 0.5974074, + "learning_rate": 9.180800971936987e-08, + "loss": 0.67414284, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09967041, + "step": 15070, + "time_per_iteration": 3.9546656608581543 + }, + { + "auxiliary_loss_clip": 0.0640643, + "auxiliary_loss_mlp": 0.01265113, + "balance_loss_clip": 0.06271288, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9061175409589659, + "flos": 17316853176960.0, + "grad_norm": 1.9844350397935704, + "language_loss": 0.81391585, + "learning_rate": 9.169140174747724e-08, + "loss": 0.89063132, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09356689, + "step": 15071, + "time_per_iteration": 2.4680888652801514 + }, + { + "auxiliary_loss_clip": 0.06404223, + "auxiliary_loss_mlp": 0.01267825, + "balance_loss_clip": 0.06269695, + "balance_loss_mlp": 0.01257705, + "epoch": 0.9061776642116338, + "flos": 17783063694720.0, + "grad_norm": 1.8768433932169004, + "language_loss": 0.61904967, + "learning_rate": 9.157486613883758e-08, + "loss": 0.6957702, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10125732, + "step": 15072, + "time_per_iteration": 2.4591763019561768 + }, + { + "auxiliary_loss_clip": 0.06402346, + "auxiliary_loss_mlp": 0.0126459, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01253814, + "epoch": 0.9062377874643018, + "flos": 42787580146560.0, + "grad_norm": 1.9902101584979952, + "language_loss": 0.72696972, + "learning_rate": 9.145840289787021e-08, + "loss": 0.80363911, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10778809, + "step": 15073, + "time_per_iteration": 2.7119879722595215 + }, + { + "auxiliary_loss_clip": 0.06397014, + "auxiliary_loss_mlp": 0.01263309, + "balance_loss_clip": 0.06270237, + "balance_loss_mlp": 0.01254624, + "epoch": 0.9062979107169697, + "flos": 16367032419840.0, + "grad_norm": 1.764665765355135, + "language_loss": 0.81274933, + "learning_rate": 9.134201202899161e-08, + "loss": 0.88935256, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08685303, + "step": 15074, + "time_per_iteration": 2.4704678058624268 + }, + { + "auxiliary_loss_clip": 0.06309762, + "auxiliary_loss_mlp": 0.01249224, + "balance_loss_clip": 0.06255601, + "balance_loss_mlp": 0.01248231, + "epoch": 0.9063580339696378, + "flos": 69336286364160.0, + "grad_norm": 0.7296001006592828, + "language_loss": 0.52386355, + "learning_rate": 9.122569353661513e-08, + "loss": 0.59945345, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00992584, + "step": 15075, + "time_per_iteration": 4.617650508880615 + }, + { + "auxiliary_loss_clip": 0.06307253, + "auxiliary_loss_mlp": 0.01248452, + "balance_loss_clip": 0.06252947, + "balance_loss_mlp": 0.01247452, + "epoch": 0.9064181572223057, + "flos": 58813388812800.0, + "grad_norm": 0.7084404872191936, + "language_loss": 0.62037706, + "learning_rate": 9.11094474251517e-08, + "loss": 0.69593406, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00999451, + "step": 15076, + "time_per_iteration": 3.049726724624634 + }, + { + "auxiliary_loss_clip": 0.06398588, + "auxiliary_loss_mlp": 0.01263843, + "balance_loss_clip": 0.06269225, + "balance_loss_mlp": 0.01254611, + "epoch": 0.9064782804749737, + "flos": 21769205328000.0, + "grad_norm": 1.6263272411743717, + "language_loss": 0.82236755, + "learning_rate": 9.09932736990091e-08, + "loss": 0.89899194, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09234619, + "step": 15077, + "time_per_iteration": 3.9159936904907227 + }, + { + "auxiliary_loss_clip": 0.06396757, + "auxiliary_loss_mlp": 0.01267967, + "balance_loss_clip": 0.06269564, + "balance_loss_mlp": 0.01259337, + "epoch": 0.9065384037276417, + "flos": 21403747745280.0, + "grad_norm": 1.4172221106724106, + "language_loss": 0.84297204, + "learning_rate": 9.08771723625934e-08, + "loss": 0.91961926, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08630371, + "step": 15078, + "time_per_iteration": 2.5148606300354004 + }, + { + "auxiliary_loss_clip": 0.06395961, + "auxiliary_loss_mlp": 0.01261788, + "balance_loss_clip": 0.06270163, + "balance_loss_mlp": 0.01253188, + "epoch": 0.9065985269803096, + "flos": 38291734926720.0, + "grad_norm": 1.4055545219540846, + "language_loss": 0.6550107, + "learning_rate": 9.076114342030617e-08, + "loss": 0.73158824, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08599854, + "step": 15079, + "time_per_iteration": 2.6431503295898438 + }, + { + "auxiliary_loss_clip": 0.06400599, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.06269769, + "balance_loss_mlp": 0.01254889, + "epoch": 0.9066586502329776, + "flos": 44828406990720.0, + "grad_norm": 1.5673413930371245, + "language_loss": 0.70924938, + "learning_rate": 9.064518687654765e-08, + "loss": 0.78589708, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.0927124, + "step": 15080, + "time_per_iteration": 2.7151243686676025 + }, + { + "auxiliary_loss_clip": 0.06409639, + "auxiliary_loss_mlp": 0.01261513, + "balance_loss_clip": 0.0627256, + "balance_loss_mlp": 0.01251368, + "epoch": 0.9067187734856456, + "flos": 18629825529600.0, + "grad_norm": 2.407695406101915, + "language_loss": 0.7148692, + "learning_rate": 9.052930273571547e-08, + "loss": 0.79158074, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10144043, + "step": 15081, + "time_per_iteration": 2.5449743270874023 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.01261877, + "balance_loss_clip": 0.06271397, + "balance_loss_mlp": 0.01251965, + "epoch": 0.9067788967383136, + "flos": 22754217600000.0, + "grad_norm": 2.1469946393929935, + "language_loss": 0.74491692, + "learning_rate": 9.04134910022032e-08, + "loss": 0.82154077, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09912109, + "step": 15082, + "time_per_iteration": 2.513711929321289 + }, + { + "auxiliary_loss_clip": 0.06398562, + "auxiliary_loss_mlp": 0.01265255, + "balance_loss_clip": 0.06270002, + "balance_loss_mlp": 0.01256463, + "epoch": 0.9068390199909815, + "flos": 27677853440640.0, + "grad_norm": 1.8132990718715725, + "language_loss": 0.78194749, + "learning_rate": 9.029775168040266e-08, + "loss": 0.85858572, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08782959, + "step": 15083, + "time_per_iteration": 2.5405113697052 + }, + { + "auxiliary_loss_clip": 0.06396039, + "auxiliary_loss_mlp": 0.01261891, + "balance_loss_clip": 0.06269726, + "balance_loss_mlp": 0.01253183, + "epoch": 0.9068991432436495, + "flos": 24250987883520.0, + "grad_norm": 1.5606180532346916, + "language_loss": 0.69092917, + "learning_rate": 9.01820847747028e-08, + "loss": 0.76750851, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08703613, + "step": 15084, + "time_per_iteration": 2.514923095703125 + }, + { + "auxiliary_loss_clip": 0.06400265, + "auxiliary_loss_mlp": 0.01266118, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01256784, + "epoch": 0.9069592664963174, + "flos": 28040040714240.0, + "grad_norm": 2.1153010193521946, + "language_loss": 0.67261243, + "learning_rate": 9.006649028948965e-08, + "loss": 0.74927622, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09332275, + "step": 15085, + "time_per_iteration": 2.54697322845459 + }, + { + "auxiliary_loss_clip": 0.06311613, + "auxiliary_loss_mlp": 0.01250731, + "balance_loss_clip": 0.06257414, + "balance_loss_mlp": 0.01249732, + "epoch": 0.9070193897489854, + "flos": 68796479162880.0, + "grad_norm": 0.7613186514195954, + "language_loss": 0.61280566, + "learning_rate": 8.995096822914638e-08, + "loss": 0.68842912, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00998688, + "step": 15086, + "time_per_iteration": 3.126314163208008 + }, + { + "auxiliary_loss_clip": 0.06399283, + "auxiliary_loss_mlp": 0.01268957, + "balance_loss_clip": 0.06270003, + "balance_loss_mlp": 0.01259372, + "epoch": 0.9070795130016533, + "flos": 23448515731200.0, + "grad_norm": 1.464283060306305, + "language_loss": 0.72384381, + "learning_rate": 8.983551859805416e-08, + "loss": 0.8005262, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.0958252, + "step": 15087, + "time_per_iteration": 2.5283164978027344 + }, + { + "auxiliary_loss_clip": 0.06401356, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01253422, + "epoch": 0.9071396362543214, + "flos": 18922384460160.0, + "grad_norm": 2.001227665639937, + "language_loss": 0.76600784, + "learning_rate": 8.972014140059058e-08, + "loss": 0.84265184, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09625244, + "step": 15088, + "time_per_iteration": 2.4616496562957764 + }, + { + "auxiliary_loss_clip": 0.06397097, + "auxiliary_loss_mlp": 0.01263128, + "balance_loss_clip": 0.06272545, + "balance_loss_mlp": 0.01254426, + "epoch": 0.9071997595069893, + "flos": 25235706666240.0, + "grad_norm": 1.9506446362411543, + "language_loss": 0.73176634, + "learning_rate": 8.960483664113038e-08, + "loss": 0.80836862, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08703613, + "step": 15089, + "time_per_iteration": 2.5427422523498535 + }, + { + "auxiliary_loss_clip": 0.06397973, + "auxiliary_loss_mlp": 0.01264219, + "balance_loss_clip": 0.06272795, + "balance_loss_mlp": 0.01256298, + "epoch": 0.9072598827596573, + "flos": 24352453578240.0, + "grad_norm": 1.8246434429888692, + "language_loss": 0.75705659, + "learning_rate": 8.948960432404628e-08, + "loss": 0.83367848, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.07922363, + "step": 15090, + "time_per_iteration": 2.5452728271484375 + }, + { + "auxiliary_loss_clip": 0.06400724, + "auxiliary_loss_mlp": 0.01267571, + "balance_loss_clip": 0.0626859, + "balance_loss_mlp": 0.01257468, + "epoch": 0.9073200060123253, + "flos": 22681654364160.0, + "grad_norm": 2.143089382853149, + "language_loss": 0.77423573, + "learning_rate": 8.93744444537079e-08, + "loss": 0.85091865, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10107422, + "step": 15091, + "time_per_iteration": 2.4868202209472656 + }, + { + "auxiliary_loss_clip": 0.06397654, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 0.0627251, + "balance_loss_mlp": 0.01256113, + "epoch": 0.9073801292649932, + "flos": 23702151640320.0, + "grad_norm": 1.4693758835684605, + "language_loss": 0.86293435, + "learning_rate": 8.925935703448217e-08, + "loss": 0.93955046, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.07843018, + "step": 15092, + "time_per_iteration": 2.5014595985412598 + }, + { + "auxiliary_loss_clip": 0.06402805, + "auxiliary_loss_mlp": 0.01262805, + "balance_loss_clip": 0.0627242, + "balance_loss_mlp": 0.0125365, + "epoch": 0.9074402525176612, + "flos": 25382636000640.0, + "grad_norm": 1.507029531138036, + "language_loss": 0.78888041, + "learning_rate": 8.914434207073296e-08, + "loss": 0.86553651, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09155273, + "step": 15093, + "time_per_iteration": 2.583144426345825 + }, + { + "auxiliary_loss_clip": 0.06309871, + "auxiliary_loss_mlp": 0.01252503, + "balance_loss_clip": 0.06255481, + "balance_loss_mlp": 0.01251333, + "epoch": 0.9075003757703292, + "flos": 67667178960000.0, + "grad_norm": 0.7248238804514167, + "language_loss": 0.5692569, + "learning_rate": 8.902939956682188e-08, + "loss": 0.64488065, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01167297, + "step": 15094, + "time_per_iteration": 3.065505266189575 + }, + { + "auxiliary_loss_clip": 0.06404001, + "auxiliary_loss_mlp": 0.01262814, + "balance_loss_clip": 0.06270079, + "balance_loss_mlp": 0.01253093, + "epoch": 0.9075604990229972, + "flos": 22459897733760.0, + "grad_norm": 2.3026997740502297, + "language_loss": 0.71735692, + "learning_rate": 8.891452952710742e-08, + "loss": 0.79402506, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.097229, + "step": 15095, + "time_per_iteration": 2.5325124263763428 + }, + { + "auxiliary_loss_clip": 0.06400533, + "auxiliary_loss_mlp": 0.01265643, + "balance_loss_clip": 0.06269962, + "balance_loss_mlp": 0.01256262, + "epoch": 0.9076206222756651, + "flos": 19542735763200.0, + "grad_norm": 1.6933352125689685, + "language_loss": 0.74221349, + "learning_rate": 8.879973195594526e-08, + "loss": 0.81887525, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09387207, + "step": 15096, + "time_per_iteration": 2.4719395637512207 + }, + { + "auxiliary_loss_clip": 0.06403936, + "auxiliary_loss_mlp": 0.01263226, + "balance_loss_clip": 0.06269987, + "balance_loss_mlp": 0.01252587, + "epoch": 0.9076807455283331, + "flos": 30124654116480.0, + "grad_norm": 1.8580529883394223, + "language_loss": 0.58028173, + "learning_rate": 8.868500685768898e-08, + "loss": 0.65695339, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10644531, + "step": 15097, + "time_per_iteration": 2.554093837738037 + }, + { + "auxiliary_loss_clip": 0.06394961, + "auxiliary_loss_mlp": 0.01262336, + "balance_loss_clip": 0.06267217, + "balance_loss_mlp": 0.01253639, + "epoch": 0.907740868781001, + "flos": 18703478868480.0, + "grad_norm": 1.5527007642230701, + "language_loss": 0.79784089, + "learning_rate": 8.857035423668935e-08, + "loss": 0.87441391, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0869751, + "step": 15098, + "time_per_iteration": 2.5422494411468506 + }, + { + "auxiliary_loss_clip": 0.06405206, + "auxiliary_loss_mlp": 0.01263684, + "balance_loss_clip": 0.06270834, + "balance_loss_mlp": 0.01254458, + "epoch": 0.907800992033669, + "flos": 22645540454400.0, + "grad_norm": 1.6203953780141742, + "language_loss": 0.66362941, + "learning_rate": 8.845577409729266e-08, + "loss": 0.74031836, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09216309, + "step": 15099, + "time_per_iteration": 2.53924822807312 + }, + { + "auxiliary_loss_clip": 0.06402986, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 0.06270178, + "balance_loss_mlp": 0.01253925, + "epoch": 0.907861115286337, + "flos": 21293980496640.0, + "grad_norm": 2.113947678970701, + "language_loss": 0.70936823, + "learning_rate": 8.834126644384477e-08, + "loss": 0.78604084, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10357666, + "step": 15100, + "time_per_iteration": 2.500608444213867 + }, + { + "auxiliary_loss_clip": 0.06306085, + "auxiliary_loss_mlp": 0.0124919, + "balance_loss_clip": 0.06251926, + "balance_loss_mlp": 0.01248136, + "epoch": 0.907921238539005, + "flos": 69759800426880.0, + "grad_norm": 0.6247804404635554, + "language_loss": 0.5343653, + "learning_rate": 8.822683128068775e-08, + "loss": 0.609918, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01055145, + "step": 15101, + "time_per_iteration": 4.569448232650757 + }, + { + "auxiliary_loss_clip": 0.06403472, + "auxiliary_loss_mlp": 0.01263011, + "balance_loss_clip": 0.06273133, + "balance_loss_mlp": 0.01253654, + "epoch": 0.9079813617916729, + "flos": 23484168443520.0, + "grad_norm": 2.4551114582819764, + "language_loss": 0.68570346, + "learning_rate": 8.811246861216081e-08, + "loss": 0.76236832, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09362793, + "step": 15102, + "time_per_iteration": 2.543745517730713 + }, + { + "auxiliary_loss_clip": 0.06400967, + "auxiliary_loss_mlp": 0.01264699, + "balance_loss_clip": 0.06271027, + "balance_loss_mlp": 0.01255114, + "epoch": 0.9080414850443409, + "flos": 22936590011520.0, + "grad_norm": 1.8212057779957778, + "language_loss": 0.7951529, + "learning_rate": 8.799817844260049e-08, + "loss": 0.8718096, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09588623, + "step": 15103, + "time_per_iteration": 2.4846246242523193 + }, + { + "auxiliary_loss_clip": 0.06402376, + "auxiliary_loss_mlp": 0.01267473, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01258401, + "epoch": 0.9081016082970089, + "flos": 26184269612160.0, + "grad_norm": 1.995512307901863, + "language_loss": 0.71880859, + "learning_rate": 8.78839607763413e-08, + "loss": 0.79550713, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09069824, + "step": 15104, + "time_per_iteration": 2.5300004482269287 + }, + { + "auxiliary_loss_clip": 0.06399778, + "auxiliary_loss_mlp": 0.01263283, + "balance_loss_clip": 0.06271459, + "balance_loss_mlp": 0.01254467, + "epoch": 0.9081617315496768, + "flos": 24469054934400.0, + "grad_norm": 1.6559231689282168, + "language_loss": 0.78008848, + "learning_rate": 8.77698156177138e-08, + "loss": 0.85671914, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08813477, + "step": 15105, + "time_per_iteration": 2.520692825317383 + }, + { + "auxiliary_loss_clip": 0.06401225, + "auxiliary_loss_mlp": 0.01265497, + "balance_loss_clip": 0.06269834, + "balance_loss_mlp": 0.01256401, + "epoch": 0.9082218548023449, + "flos": 24752599551360.0, + "grad_norm": 1.7549028809217568, + "language_loss": 0.73971152, + "learning_rate": 8.765574297104628e-08, + "loss": 0.81637871, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09088135, + "step": 15106, + "time_per_iteration": 2.5180251598358154 + }, + { + "auxiliary_loss_clip": 0.06404307, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 0.06271388, + "balance_loss_mlp": 0.01254249, + "epoch": 0.9082819780550128, + "flos": 24427448582400.0, + "grad_norm": 1.5903230958882113, + "language_loss": 0.80446184, + "learning_rate": 8.754174284066462e-08, + "loss": 0.8811484, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10101318, + "step": 15107, + "time_per_iteration": 2.560788154602051 + }, + { + "auxiliary_loss_clip": 0.06312685, + "auxiliary_loss_mlp": 0.01250294, + "balance_loss_clip": 0.0625825, + "balance_loss_mlp": 0.01249236, + "epoch": 0.9083421013076808, + "flos": 59630535429120.0, + "grad_norm": 0.8314070940246863, + "language_loss": 0.59992969, + "learning_rate": 8.742781523089205e-08, + "loss": 0.67555946, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.0105896, + "step": 15108, + "time_per_iteration": 3.0896317958831787 + }, + { + "auxiliary_loss_clip": 0.06400774, + "auxiliary_loss_mlp": 0.01261142, + "balance_loss_clip": 0.06267995, + "balance_loss_mlp": 0.01252034, + "epoch": 0.9084022245603487, + "flos": 33628652956800.0, + "grad_norm": 2.03070687094374, + "language_loss": 0.74325216, + "learning_rate": 8.73139601460482e-08, + "loss": 0.81987131, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09112549, + "step": 15109, + "time_per_iteration": 2.618248462677002 + }, + { + "auxiliary_loss_clip": 0.06398752, + "auxiliary_loss_mlp": 0.01262631, + "balance_loss_clip": 0.06270365, + "balance_loss_mlp": 0.01253815, + "epoch": 0.9084623478130167, + "flos": 24978465031680.0, + "grad_norm": 2.0096064178066273, + "language_loss": 0.71743369, + "learning_rate": 8.720017759045073e-08, + "loss": 0.79404759, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08807373, + "step": 15110, + "time_per_iteration": 3.9737777709960938 + }, + { + "auxiliary_loss_clip": 0.06398316, + "auxiliary_loss_mlp": 0.01263963, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01254802, + "epoch": 0.9085224710656846, + "flos": 31468918769280.0, + "grad_norm": 2.1410515920625364, + "language_loss": 0.68859386, + "learning_rate": 8.708646756841421e-08, + "loss": 0.76521665, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09161377, + "step": 15111, + "time_per_iteration": 2.568233013153076 + }, + { + "auxiliary_loss_clip": 0.06308745, + "auxiliary_loss_mlp": 0.01249082, + "balance_loss_clip": 0.06254536, + "balance_loss_mlp": 0.01248148, + "epoch": 0.9085825943183526, + "flos": 64935450074880.0, + "grad_norm": 0.6818975607395432, + "language_loss": 0.51562428, + "learning_rate": 8.697283008425026e-08, + "loss": 0.5912025, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.00931549, + "step": 15112, + "time_per_iteration": 3.218057632446289 + }, + { + "auxiliary_loss_clip": 0.06401073, + "auxiliary_loss_mlp": 0.01265191, + "balance_loss_clip": 0.06268831, + "balance_loss_mlp": 0.01256173, + "epoch": 0.9086427175710206, + "flos": 18959253056640.0, + "grad_norm": 1.927505414115429, + "language_loss": 0.70069271, + "learning_rate": 8.685926514226837e-08, + "loss": 0.77735531, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09014893, + "step": 15113, + "time_per_iteration": 2.580868721008301 + }, + { + "auxiliary_loss_clip": 0.06401566, + "auxiliary_loss_mlp": 0.01267122, + "balance_loss_clip": 0.06270175, + "balance_loss_mlp": 0.01257699, + "epoch": 0.9087028408236886, + "flos": 34022258311680.0, + "grad_norm": 2.107615186119017, + "language_loss": 0.79321289, + "learning_rate": 8.674577274677508e-08, + "loss": 0.86989981, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09429932, + "step": 15114, + "time_per_iteration": 4.032289028167725 + }, + { + "auxiliary_loss_clip": 0.06410873, + "auxiliary_loss_mlp": 0.01266597, + "balance_loss_clip": 0.06274423, + "balance_loss_mlp": 0.01256035, + "epoch": 0.9087629640763565, + "flos": 21951032688000.0, + "grad_norm": 1.9480884837439871, + "language_loss": 0.70168352, + "learning_rate": 8.663235290207405e-08, + "loss": 0.77845824, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10552979, + "step": 15115, + "time_per_iteration": 2.5174953937530518 + }, + { + "auxiliary_loss_clip": 0.06407836, + "auxiliary_loss_mlp": 0.01262941, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.01252754, + "epoch": 0.9088230873290245, + "flos": 21769456890240.0, + "grad_norm": 1.407962111970601, + "language_loss": 0.65673447, + "learning_rate": 8.651900561246561e-08, + "loss": 0.73344225, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10180664, + "step": 15116, + "time_per_iteration": 3.929631471633911 + }, + { + "auxiliary_loss_clip": 0.06397676, + "auxiliary_loss_mlp": 0.01267156, + "balance_loss_clip": 0.06271522, + "balance_loss_mlp": 0.01257566, + "epoch": 0.9088832105816925, + "flos": 21547322916480.0, + "grad_norm": 1.4695312079859524, + "language_loss": 0.69494951, + "learning_rate": 8.640573088224812e-08, + "loss": 0.77159774, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09588623, + "step": 15117, + "time_per_iteration": 2.5169076919555664 + }, + { + "auxiliary_loss_clip": 0.06400181, + "auxiliary_loss_mlp": 0.01267852, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01258715, + "epoch": 0.9089433338343604, + "flos": 26004203187840.0, + "grad_norm": 1.358588776880828, + "language_loss": 0.74719739, + "learning_rate": 8.629252871571745e-08, + "loss": 0.82387769, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09143066, + "step": 15118, + "time_per_iteration": 2.5945725440979004 + }, + { + "auxiliary_loss_clip": 0.06408937, + "auxiliary_loss_mlp": 0.0126524, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01254183, + "epoch": 0.9090034570870285, + "flos": 21184758299520.0, + "grad_norm": 2.0413531147204345, + "language_loss": 0.72784328, + "learning_rate": 8.617939911716554e-08, + "loss": 0.8045851, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1104126, + "step": 15119, + "time_per_iteration": 2.5365755558013916 + }, + { + "auxiliary_loss_clip": 0.06409705, + "auxiliary_loss_mlp": 0.01263579, + "balance_loss_clip": 0.0627287, + "balance_loss_mlp": 0.0125263, + "epoch": 0.9090635803396964, + "flos": 16147036725120.0, + "grad_norm": 2.3146876326826233, + "language_loss": 0.71590072, + "learning_rate": 8.60663420908827e-08, + "loss": 0.79263353, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10943604, + "step": 15120, + "time_per_iteration": 2.4872450828552246 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01262323, + "balance_loss_clip": 0.06268568, + "balance_loss_mlp": 0.01252894, + "epoch": 0.9091237035923644, + "flos": 20597250597120.0, + "grad_norm": 1.9625105264787481, + "language_loss": 0.66382855, + "learning_rate": 8.595335764115596e-08, + "loss": 0.74046856, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09429932, + "step": 15121, + "time_per_iteration": 2.5439295768737793 + }, + { + "auxiliary_loss_clip": 0.06402369, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 0.06271164, + "balance_loss_mlp": 0.01259179, + "epoch": 0.9091838268450323, + "flos": 52239275902080.0, + "grad_norm": 1.9522631564696673, + "language_loss": 0.70143443, + "learning_rate": 8.58404457722699e-08, + "loss": 0.77815294, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10302734, + "step": 15122, + "time_per_iteration": 2.7782716751098633 + }, + { + "auxiliary_loss_clip": 0.06399389, + "auxiliary_loss_mlp": 0.01262307, + "balance_loss_clip": 0.06270258, + "balance_loss_mlp": 0.01253009, + "epoch": 0.9092439500977003, + "flos": 20566084078080.0, + "grad_norm": 1.2228012273882412, + "language_loss": 0.74737382, + "learning_rate": 8.572760648850575e-08, + "loss": 0.8239907, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09295654, + "step": 15123, + "time_per_iteration": 2.548868417739868 + }, + { + "auxiliary_loss_clip": 0.06397559, + "auxiliary_loss_mlp": 0.01264083, + "balance_loss_clip": 0.06270659, + "balance_loss_mlp": 0.01255303, + "epoch": 0.9093040733503682, + "flos": 28624823159040.0, + "grad_norm": 1.786331644949096, + "language_loss": 0.75845641, + "learning_rate": 8.561483979414253e-08, + "loss": 0.83507288, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08782959, + "step": 15124, + "time_per_iteration": 2.561037302017212 + }, + { + "auxiliary_loss_clip": 0.06398606, + "auxiliary_loss_mlp": 0.01266988, + "balance_loss_clip": 0.06268884, + "balance_loss_mlp": 0.0125766, + "epoch": 0.9093641966030362, + "flos": 23446838649600.0, + "grad_norm": 1.8436669176844096, + "language_loss": 0.72484279, + "learning_rate": 8.55021456934566e-08, + "loss": 0.80149877, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09326172, + "step": 15125, + "time_per_iteration": 2.519473075866699 + }, + { + "auxiliary_loss_clip": 0.06397496, + "auxiliary_loss_mlp": 0.01263813, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.0125501, + "epoch": 0.9094243198557042, + "flos": 16805807925120.0, + "grad_norm": 1.5501227828920265, + "language_loss": 0.79221696, + "learning_rate": 8.538952419072143e-08, + "loss": 0.86883008, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08795166, + "step": 15126, + "time_per_iteration": 2.4721574783325195 + }, + { + "auxiliary_loss_clip": 0.0640032, + "auxiliary_loss_mlp": 0.01267544, + "balance_loss_clip": 0.06272551, + "balance_loss_mlp": 0.012588, + "epoch": 0.9094844431083722, + "flos": 24279051801600.0, + "grad_norm": 1.446842251564929, + "language_loss": 0.75611615, + "learning_rate": 8.527697529020694e-08, + "loss": 0.83279485, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08743286, + "step": 15127, + "time_per_iteration": 2.519174337387085 + }, + { + "auxiliary_loss_clip": 0.06402364, + "auxiliary_loss_mlp": 0.01263756, + "balance_loss_clip": 0.06269338, + "balance_loss_mlp": 0.01254607, + "epoch": 0.9095445663610401, + "flos": 21951116542080.0, + "grad_norm": 2.994024762493421, + "language_loss": 0.62593842, + "learning_rate": 8.516449899618173e-08, + "loss": 0.70259964, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.0914917, + "step": 15128, + "time_per_iteration": 2.492807388305664 + }, + { + "auxiliary_loss_clip": 0.06399337, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06269845, + "balance_loss_mlp": 0.01253616, + "epoch": 0.9096046896137081, + "flos": 19799096929920.0, + "grad_norm": 2.004480478019155, + "language_loss": 0.76882553, + "learning_rate": 8.505209531291013e-08, + "loss": 0.84544969, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09466553, + "step": 15129, + "time_per_iteration": 2.5134694576263428 + }, + { + "auxiliary_loss_clip": 0.06405028, + "auxiliary_loss_mlp": 0.01262605, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.0125302, + "epoch": 0.909664812866376, + "flos": 22644701913600.0, + "grad_norm": 1.922995524134768, + "language_loss": 0.84000599, + "learning_rate": 8.49397642446552e-08, + "loss": 0.91668236, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09576416, + "step": 15130, + "time_per_iteration": 2.49751353263855 + }, + { + "auxiliary_loss_clip": 0.06402621, + "auxiliary_loss_mlp": 0.01262359, + "balance_loss_clip": 0.06272228, + "balance_loss_mlp": 0.01252691, + "epoch": 0.909724936119044, + "flos": 39860439540480.0, + "grad_norm": 1.684451923385225, + "language_loss": 0.75303972, + "learning_rate": 8.482750579567644e-08, + "loss": 0.8296895, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09661865, + "step": 15131, + "time_per_iteration": 2.6618237495422363 + }, + { + "auxiliary_loss_clip": 0.06401692, + "auxiliary_loss_mlp": 0.0126297, + "balance_loss_clip": 0.06270601, + "balance_loss_mlp": 0.01253773, + "epoch": 0.9097850593717121, + "flos": 35078953351680.0, + "grad_norm": 1.8018216027233815, + "language_loss": 0.59644824, + "learning_rate": 8.471531997023085e-08, + "loss": 0.67309487, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09204102, + "step": 15132, + "time_per_iteration": 2.5843985080718994 + }, + { + "auxiliary_loss_clip": 0.06398638, + "auxiliary_loss_mlp": 0.0126275, + "balance_loss_clip": 0.06269633, + "balance_loss_mlp": 0.01254317, + "epoch": 0.90984518262438, + "flos": 23374149632640.0, + "grad_norm": 1.413260935585949, + "language_loss": 0.83113134, + "learning_rate": 8.460320677257193e-08, + "loss": 0.90774524, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08435059, + "step": 15133, + "time_per_iteration": 2.537156581878662 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01262679, + "balance_loss_clip": 0.0626839, + "balance_loss_mlp": 0.01253434, + "epoch": 0.909905305877048, + "flos": 27530085565440.0, + "grad_norm": 1.6843084731476905, + "language_loss": 0.73938394, + "learning_rate": 8.449116620695118e-08, + "loss": 0.81600529, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09240723, + "step": 15134, + "time_per_iteration": 2.5576279163360596 + }, + { + "auxiliary_loss_clip": 0.06413636, + "auxiliary_loss_mlp": 0.01264703, + "balance_loss_clip": 0.062745, + "balance_loss_mlp": 0.01255179, + "epoch": 0.9099654291297159, + "flos": 24353921024640.0, + "grad_norm": 1.4339167033731788, + "language_loss": 0.73107815, + "learning_rate": 8.437919827761786e-08, + "loss": 0.80786151, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.09521484, + "step": 15135, + "time_per_iteration": 2.600571870803833 + }, + { + "auxiliary_loss_clip": 0.06398353, + "auxiliary_loss_mlp": 0.01262496, + "balance_loss_clip": 0.06270214, + "balance_loss_mlp": 0.01253162, + "epoch": 0.9100255523823839, + "flos": 21221626896000.0, + "grad_norm": 1.7085160018816423, + "language_loss": 0.70284522, + "learning_rate": 8.426730298881702e-08, + "loss": 0.77945369, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09332275, + "step": 15136, + "time_per_iteration": 2.4926037788391113 + }, + { + "auxiliary_loss_clip": 0.06304874, + "auxiliary_loss_mlp": 0.01251653, + "balance_loss_clip": 0.06250328, + "balance_loss_mlp": 0.01250625, + "epoch": 0.9100856756350518, + "flos": 46067292005760.0, + "grad_norm": 0.80453023989808, + "language_loss": 0.59098959, + "learning_rate": 8.415548034479214e-08, + "loss": 0.66655481, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01027679, + "step": 15137, + "time_per_iteration": 2.8737428188323975 + }, + { + "auxiliary_loss_clip": 0.06404972, + "auxiliary_loss_mlp": 0.01264173, + "balance_loss_clip": 0.06272592, + "balance_loss_mlp": 0.01255208, + "epoch": 0.9101457988877198, + "flos": 20236111499520.0, + "grad_norm": 1.4827649946447703, + "language_loss": 0.82628894, + "learning_rate": 8.40437303497834e-08, + "loss": 0.90298033, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08966064, + "step": 15138, + "time_per_iteration": 2.4917473793029785 + }, + { + "auxiliary_loss_clip": 0.06394204, + "auxiliary_loss_mlp": 0.01261553, + "balance_loss_clip": 0.06268851, + "balance_loss_mlp": 0.01252928, + "epoch": 0.9102059221403878, + "flos": 26622458138880.0, + "grad_norm": 2.0023017385136392, + "language_loss": 0.81339759, + "learning_rate": 8.39320530080283e-08, + "loss": 0.8899551, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08636475, + "step": 15139, + "time_per_iteration": 2.5509281158447266 + }, + { + "auxiliary_loss_clip": 0.0640308, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.06273255, + "balance_loss_mlp": 0.01254026, + "epoch": 0.9102660453930558, + "flos": 21915086486400.0, + "grad_norm": 1.5474154648257277, + "language_loss": 0.77706277, + "learning_rate": 8.382044832376167e-08, + "loss": 0.85372829, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09454346, + "step": 15140, + "time_per_iteration": 3.9355709552764893 + }, + { + "auxiliary_loss_clip": 0.06400235, + "auxiliary_loss_mlp": 0.01260713, + "balance_loss_clip": 0.06271002, + "balance_loss_mlp": 0.01252071, + "epoch": 0.9103261686457237, + "flos": 36185933640960.0, + "grad_norm": 1.8719337735504868, + "language_loss": 0.66449845, + "learning_rate": 8.370891630121569e-08, + "loss": 0.74110788, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08648682, + "step": 15141, + "time_per_iteration": 2.647343873977661 + }, + { + "auxiliary_loss_clip": 0.06405683, + "auxiliary_loss_mlp": 0.01266424, + "balance_loss_clip": 0.06270161, + "balance_loss_mlp": 0.01256976, + "epoch": 0.9103862918983917, + "flos": 23885362592640.0, + "grad_norm": 6.054142486418284, + "language_loss": 0.75214803, + "learning_rate": 8.359745694462005e-08, + "loss": 0.8288691, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09448242, + "step": 15142, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.06397744, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06268731, + "balance_loss_mlp": 0.01254508, + "epoch": 0.9104464151510596, + "flos": 14944837870080.0, + "grad_norm": 1.6281016166898625, + "language_loss": 0.64735144, + "learning_rate": 8.348607025820076e-08, + "loss": 0.7239635, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08959961, + "step": 15143, + "time_per_iteration": 2.478365182876587 + }, + { + "auxiliary_loss_clip": 0.06402953, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06269629, + "balance_loss_mlp": 0.0125302, + "epoch": 0.9105065384037276, + "flos": 33664096033920.0, + "grad_norm": 1.8192012493861849, + "language_loss": 0.61270368, + "learning_rate": 8.337475624618152e-08, + "loss": 0.68936229, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09875488, + "step": 15144, + "time_per_iteration": 2.612241506576538 + }, + { + "auxiliary_loss_clip": 0.06393068, + "auxiliary_loss_mlp": 0.01265463, + "balance_loss_clip": 0.06268917, + "balance_loss_mlp": 0.0125663, + "epoch": 0.9105666616563957, + "flos": 24323634973440.0, + "grad_norm": 1.7059892216742707, + "language_loss": 0.71336597, + "learning_rate": 8.326351491278382e-08, + "loss": 0.78995132, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.08837891, + "step": 15145, + "time_per_iteration": 2.5258352756500244 + }, + { + "auxiliary_loss_clip": 0.06395367, + "auxiliary_loss_mlp": 0.01263535, + "balance_loss_clip": 0.06269669, + "balance_loss_mlp": 0.01254458, + "epoch": 0.9106267849090636, + "flos": 29979527644800.0, + "grad_norm": 1.5087408228781412, + "language_loss": 0.7090916, + "learning_rate": 8.315234626222545e-08, + "loss": 0.78568059, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.09069824, + "step": 15146, + "time_per_iteration": 2.7402799129486084 + }, + { + "auxiliary_loss_clip": 0.06400052, + "auxiliary_loss_mlp": 0.01262786, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01254155, + "epoch": 0.9106869081617316, + "flos": 25344761155200.0, + "grad_norm": 1.7237443516781754, + "language_loss": 0.73024035, + "learning_rate": 8.304125029872233e-08, + "loss": 0.80686873, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08624268, + "step": 15147, + "time_per_iteration": 2.5613772869110107 + }, + { + "auxiliary_loss_clip": 0.06405227, + "auxiliary_loss_mlp": 0.01263577, + "balance_loss_clip": 0.06269574, + "balance_loss_mlp": 0.01254267, + "epoch": 0.9107470314143995, + "flos": 18192936741120.0, + "grad_norm": 1.8228865120504234, + "language_loss": 0.80208182, + "learning_rate": 8.293022702648711e-08, + "loss": 0.87876976, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09307861, + "step": 15148, + "time_per_iteration": 2.4916961193084717 + }, + { + "auxiliary_loss_clip": 0.06404668, + "auxiliary_loss_mlp": 0.0126411, + "balance_loss_clip": 0.06271308, + "balance_loss_mlp": 0.01254412, + "epoch": 0.9108071546670675, + "flos": 23557696001280.0, + "grad_norm": 1.6542822970415358, + "language_loss": 0.68148386, + "learning_rate": 8.281927644972996e-08, + "loss": 0.75817162, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09692383, + "step": 15149, + "time_per_iteration": 3.9529452323913574 + }, + { + "auxiliary_loss_clip": 0.06406561, + "auxiliary_loss_mlp": 0.01265217, + "balance_loss_clip": 0.06273574, + "balance_loss_mlp": 0.01256035, + "epoch": 0.9108672779197354, + "flos": 25637487793920.0, + "grad_norm": 1.477688710921721, + "language_loss": 0.63625479, + "learning_rate": 8.270839857265776e-08, + "loss": 0.71297264, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09179688, + "step": 15150, + "time_per_iteration": 2.53456449508667 + }, + { + "auxiliary_loss_clip": 0.06401673, + "auxiliary_loss_mlp": 0.0126291, + "balance_loss_clip": 0.0627019, + "balance_loss_mlp": 0.01253874, + "epoch": 0.9109274011724035, + "flos": 22344470334720.0, + "grad_norm": 1.7663276861657815, + "language_loss": 0.73236012, + "learning_rate": 8.259759339947514e-08, + "loss": 0.80900592, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09039307, + "step": 15151, + "time_per_iteration": 2.515439510345459 + }, + { + "auxiliary_loss_clip": 0.06399186, + "auxiliary_loss_mlp": 0.01265861, + "balance_loss_clip": 0.06269082, + "balance_loss_mlp": 0.01256437, + "epoch": 0.9109875244250714, + "flos": 26695524499200.0, + "grad_norm": 1.4955695387299417, + "language_loss": 0.64540172, + "learning_rate": 8.248686093438429e-08, + "loss": 0.72205222, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09417725, + "step": 15152, + "time_per_iteration": 2.547096014022827 + }, + { + "auxiliary_loss_clip": 0.06403639, + "auxiliary_loss_mlp": 0.01266075, + "balance_loss_clip": 0.06273131, + "balance_loss_mlp": 0.01256735, + "epoch": 0.9110476476777394, + "flos": 22936799646720.0, + "grad_norm": 1.8229658483887674, + "language_loss": 0.73700202, + "learning_rate": 8.23762011815834e-08, + "loss": 0.81369913, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09338379, + "step": 15153, + "time_per_iteration": 2.515530824661255 + }, + { + "auxiliary_loss_clip": 0.06403325, + "auxiliary_loss_mlp": 0.01264561, + "balance_loss_clip": 0.06271794, + "balance_loss_mlp": 0.01254756, + "epoch": 0.9111077709304073, + "flos": 13476718483200.0, + "grad_norm": 1.9576939804869533, + "language_loss": 0.7254191, + "learning_rate": 8.226561414526956e-08, + "loss": 0.80209798, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09802246, + "step": 15154, + "time_per_iteration": 4.011706590652466 + }, + { + "auxiliary_loss_clip": 0.06400883, + "auxiliary_loss_mlp": 0.01264225, + "balance_loss_clip": 0.06272145, + "balance_loss_mlp": 0.01254599, + "epoch": 0.9111678941830753, + "flos": 20856924000000.0, + "grad_norm": 1.6070045592329703, + "language_loss": 0.82313609, + "learning_rate": 8.215509982963564e-08, + "loss": 0.89978719, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09631348, + "step": 15155, + "time_per_iteration": 2.5135505199432373 + }, + { + "auxiliary_loss_clip": 0.06403641, + "auxiliary_loss_mlp": 0.012676, + "balance_loss_clip": 0.06273505, + "balance_loss_mlp": 0.01258659, + "epoch": 0.9112280174357432, + "flos": 19688281505280.0, + "grad_norm": 1.4380707223539104, + "language_loss": 0.59939194, + "learning_rate": 8.204465823887252e-08, + "loss": 0.67610437, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0894165, + "step": 15156, + "time_per_iteration": 3.994004487991333 + }, + { + "auxiliary_loss_clip": 0.06406192, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01254643, + "epoch": 0.9112881406884112, + "flos": 25454192987520.0, + "grad_norm": 1.7593571365414977, + "language_loss": 0.74333876, + "learning_rate": 8.193428937716796e-08, + "loss": 0.82005632, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10919189, + "step": 15157, + "time_per_iteration": 2.54280686378479 + }, + { + "auxiliary_loss_clip": 0.0640168, + "auxiliary_loss_mlp": 0.01261948, + "balance_loss_clip": 0.06268957, + "balance_loss_mlp": 0.01253401, + "epoch": 0.9113482639410793, + "flos": 33074324271360.0, + "grad_norm": 1.6469178321530784, + "language_loss": 0.59426653, + "learning_rate": 8.182399324870747e-08, + "loss": 0.67090285, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08551025, + "step": 15158, + "time_per_iteration": 2.6101877689361572 + }, + { + "auxiliary_loss_clip": 0.0639876, + "auxiliary_loss_mlp": 0.01263604, + "balance_loss_clip": 0.0626954, + "balance_loss_mlp": 0.01254717, + "epoch": 0.9114083871937472, + "flos": 21842103980160.0, + "grad_norm": 1.7579172043530233, + "language_loss": 0.6775853, + "learning_rate": 8.171376985767375e-08, + "loss": 0.75420892, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08886719, + "step": 15159, + "time_per_iteration": 2.4980640411376953 + }, + { + "auxiliary_loss_clip": 0.06402466, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01253777, + "epoch": 0.9114685104464152, + "flos": 27096299377920.0, + "grad_norm": 1.9611572487780382, + "language_loss": 0.78373706, + "learning_rate": 8.160361920824588e-08, + "loss": 0.86038601, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08654785, + "step": 15160, + "time_per_iteration": 2.5919408798217773 + }, + { + "auxiliary_loss_clip": 0.06406088, + "auxiliary_loss_mlp": 0.01266258, + "balance_loss_clip": 0.06273904, + "balance_loss_mlp": 0.01256048, + "epoch": 0.9115286336990831, + "flos": 17972731411200.0, + "grad_norm": 1.807136826815418, + "language_loss": 0.69505328, + "learning_rate": 8.149354130460073e-08, + "loss": 0.77177674, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10205078, + "step": 15161, + "time_per_iteration": 2.484355926513672 + }, + { + "auxiliary_loss_clip": 0.06401908, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06269228, + "balance_loss_mlp": 0.01255099, + "epoch": 0.9115887569517511, + "flos": 22936506157440.0, + "grad_norm": 1.654027416988286, + "language_loss": 0.75972486, + "learning_rate": 8.138353615091321e-08, + "loss": 0.8363955, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10058594, + "step": 15162, + "time_per_iteration": 2.5151309967041016 + }, + { + "auxiliary_loss_clip": 0.06398072, + "auxiliary_loss_mlp": 0.01262761, + "balance_loss_clip": 0.06267852, + "balance_loss_mlp": 0.01253481, + "epoch": 0.911648880204419, + "flos": 23995339476480.0, + "grad_norm": 1.734863559014141, + "language_loss": 0.66808069, + "learning_rate": 8.127360375135395e-08, + "loss": 0.74468899, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09283447, + "step": 15163, + "time_per_iteration": 2.5094223022460938 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01262997, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01253347, + "epoch": 0.911709003457087, + "flos": 17060911280640.0, + "grad_norm": 2.5549807341049893, + "language_loss": 0.7104494, + "learning_rate": 8.116374411009186e-08, + "loss": 0.78718263, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09661865, + "step": 15164, + "time_per_iteration": 2.524186849594116 + }, + { + "auxiliary_loss_clip": 0.06397158, + "auxiliary_loss_mlp": 0.01264303, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01255928, + "epoch": 0.911769126709755, + "flos": 21659857349760.0, + "grad_norm": 1.5173262591042511, + "language_loss": 0.76362646, + "learning_rate": 8.105395723129315e-08, + "loss": 0.84024107, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08374023, + "step": 15165, + "time_per_iteration": 2.5094478130340576 + }, + { + "auxiliary_loss_clip": 0.06401199, + "auxiliary_loss_mlp": 0.01263972, + "balance_loss_clip": 0.06269228, + "balance_loss_mlp": 0.01254036, + "epoch": 0.911829249962423, + "flos": 24797224650240.0, + "grad_norm": 2.5732167401800026, + "language_loss": 0.72387528, + "learning_rate": 8.094424311912074e-08, + "loss": 0.80052704, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09942627, + "step": 15166, + "time_per_iteration": 2.552232265472412 + }, + { + "auxiliary_loss_clip": 0.06402378, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06268582, + "balance_loss_mlp": 0.01254472, + "epoch": 0.9118893732150909, + "flos": 20965684999680.0, + "grad_norm": 1.9072835391866958, + "language_loss": 0.7338112, + "learning_rate": 8.083460177773482e-08, + "loss": 0.81048274, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10314941, + "step": 15167, + "time_per_iteration": 2.5074968338012695 + }, + { + "auxiliary_loss_clip": 0.06309468, + "auxiliary_loss_mlp": 0.01249426, + "balance_loss_clip": 0.06255375, + "balance_loss_mlp": 0.01248414, + "epoch": 0.9119494964677589, + "flos": 67937753393280.0, + "grad_norm": 0.7591368082582344, + "language_loss": 0.65499896, + "learning_rate": 8.072503321129298e-08, + "loss": 0.73058796, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.01011658, + "step": 15168, + "time_per_iteration": 3.1166579723358154 + }, + { + "auxiliary_loss_clip": 0.06396022, + "auxiliary_loss_mlp": 0.01262898, + "balance_loss_clip": 0.06267242, + "balance_loss_mlp": 0.01254395, + "epoch": 0.9120096197204268, + "flos": 18557430001920.0, + "grad_norm": 2.4249937166543587, + "language_loss": 0.78455007, + "learning_rate": 8.061553742395033e-08, + "loss": 0.8611393, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08499146, + "step": 15169, + "time_per_iteration": 2.4771196842193604 + }, + { + "auxiliary_loss_clip": 0.06401431, + "auxiliary_loss_mlp": 0.0126634, + "balance_loss_clip": 0.06269872, + "balance_loss_mlp": 0.01256595, + "epoch": 0.9120697429730948, + "flos": 19031690511360.0, + "grad_norm": 1.9684543700960608, + "language_loss": 0.82421303, + "learning_rate": 8.05061144198591e-08, + "loss": 0.90089071, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09741211, + "step": 15170, + "time_per_iteration": 2.4824554920196533 + }, + { + "auxiliary_loss_clip": 0.06403299, + "auxiliary_loss_mlp": 0.01265146, + "balance_loss_clip": 0.06272299, + "balance_loss_mlp": 0.01255424, + "epoch": 0.9121298662257629, + "flos": 17169127228800.0, + "grad_norm": 1.9931452501477718, + "language_loss": 0.77126348, + "learning_rate": 8.039676420316799e-08, + "loss": 0.84794796, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09729004, + "step": 15171, + "time_per_iteration": 2.4650163650512695 + }, + { + "auxiliary_loss_clip": 0.06395893, + "auxiliary_loss_mlp": 0.0126533, + "balance_loss_clip": 0.06268039, + "balance_loss_mlp": 0.01255865, + "epoch": 0.9121899894784308, + "flos": 19688826556800.0, + "grad_norm": 1.334235978901617, + "language_loss": 0.6716094, + "learning_rate": 8.02874867780241e-08, + "loss": 0.74822164, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09466553, + "step": 15172, + "time_per_iteration": 2.513577461242676 + }, + { + "auxiliary_loss_clip": 0.06402537, + "auxiliary_loss_mlp": 0.01266519, + "balance_loss_clip": 0.0627134, + "balance_loss_mlp": 0.01256833, + "epoch": 0.9122501127310988, + "flos": 22242124172160.0, + "grad_norm": 1.593741100302707, + "language_loss": 0.75324094, + "learning_rate": 8.017828214857103e-08, + "loss": 0.82993144, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09686279, + "step": 15173, + "time_per_iteration": 2.5007779598236084 + }, + { + "auxiliary_loss_clip": 0.06409647, + "auxiliary_loss_mlp": 0.01263462, + "balance_loss_clip": 0.06272635, + "balance_loss_mlp": 0.01253574, + "epoch": 0.9123102359837667, + "flos": 15961939056000.0, + "grad_norm": 2.6696952213402607, + "language_loss": 0.65961421, + "learning_rate": 8.00691503189499e-08, + "loss": 0.73634529, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09893799, + "step": 15174, + "time_per_iteration": 2.4853627681732178 + }, + { + "auxiliary_loss_clip": 0.06404449, + "auxiliary_loss_mlp": 0.01266595, + "balance_loss_clip": 0.06270468, + "balance_loss_mlp": 0.01256862, + "epoch": 0.9123703592364347, + "flos": 25162849941120.0, + "grad_norm": 1.5703785543649638, + "language_loss": 0.75523746, + "learning_rate": 7.996009129329894e-08, + "loss": 0.83194792, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09735107, + "step": 15175, + "time_per_iteration": 2.555255174636841 + }, + { + "auxiliary_loss_clip": 0.06308284, + "auxiliary_loss_mlp": 0.01250077, + "balance_loss_clip": 0.0625402, + "balance_loss_mlp": 0.01249143, + "epoch": 0.9124304824891026, + "flos": 60820659296640.0, + "grad_norm": 0.9596461602053525, + "language_loss": 0.58555514, + "learning_rate": 7.985110507575421e-08, + "loss": 0.66113877, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00931549, + "step": 15176, + "time_per_iteration": 3.1778509616851807 + }, + { + "auxiliary_loss_clip": 0.06401191, + "auxiliary_loss_mlp": 0.01265992, + "balance_loss_clip": 0.06269446, + "balance_loss_mlp": 0.01256944, + "epoch": 0.9124906057417707, + "flos": 18156906685440.0, + "grad_norm": 1.7664992295670066, + "language_loss": 0.65369797, + "learning_rate": 7.97421916704475e-08, + "loss": 0.73036981, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09039307, + "step": 15177, + "time_per_iteration": 2.4894156455993652 + }, + { + "auxiliary_loss_clip": 0.06400608, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06271262, + "balance_loss_mlp": 0.01255127, + "epoch": 0.9125507289944386, + "flos": 11690617651200.0, + "grad_norm": 2.0964544968020746, + "language_loss": 0.81507087, + "learning_rate": 7.963335108150926e-08, + "loss": 0.89172012, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09191895, + "step": 15178, + "time_per_iteration": 2.4541144371032715 + }, + { + "auxiliary_loss_clip": 0.06400141, + "auxiliary_loss_mlp": 0.01263487, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01254373, + "epoch": 0.9126108522471066, + "flos": 17754580506240.0, + "grad_norm": 1.9718139410424265, + "language_loss": 0.7923696, + "learning_rate": 7.952458331306711e-08, + "loss": 0.86900592, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09112549, + "step": 15179, + "time_per_iteration": 2.4755301475524902 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06269644, + "balance_loss_mlp": 0.01254519, + "epoch": 0.9126709754997745, + "flos": 27643039269120.0, + "grad_norm": 1.5444069929332227, + "language_loss": 0.68083477, + "learning_rate": 7.941588836924507e-08, + "loss": 0.75745833, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08935547, + "step": 15180, + "time_per_iteration": 3.9795782566070557 + }, + { + "auxiliary_loss_clip": 0.06395189, + "auxiliary_loss_mlp": 0.01265637, + "balance_loss_clip": 0.0626757, + "balance_loss_mlp": 0.01257203, + "epoch": 0.9127310987524425, + "flos": 15930520974720.0, + "grad_norm": 1.7977625815153482, + "language_loss": 0.75159156, + "learning_rate": 7.930726625416495e-08, + "loss": 0.82819986, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08428955, + "step": 15181, + "time_per_iteration": 2.493853807449341 + }, + { + "auxiliary_loss_clip": 0.06406903, + "auxiliary_loss_mlp": 0.0126666, + "balance_loss_clip": 0.06270269, + "balance_loss_mlp": 0.01257296, + "epoch": 0.9127912220051104, + "flos": 21542207817600.0, + "grad_norm": 1.6739957519158306, + "language_loss": 0.7473678, + "learning_rate": 7.919871697194614e-08, + "loss": 0.82410347, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.09375, + "step": 15182, + "time_per_iteration": 2.5281310081481934 + }, + { + "auxiliary_loss_clip": 0.06404476, + "auxiliary_loss_mlp": 0.01262375, + "balance_loss_clip": 0.06270052, + "balance_loss_mlp": 0.01252439, + "epoch": 0.9128513452577784, + "flos": 24070837605120.0, + "grad_norm": 1.3928021431516506, + "language_loss": 0.76586825, + "learning_rate": 7.909024052670421e-08, + "loss": 0.84253675, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09936523, + "step": 15183, + "time_per_iteration": 2.549593448638916 + }, + { + "auxiliary_loss_clip": 0.06403659, + "auxiliary_loss_mlp": 0.01266055, + "balance_loss_clip": 0.06268917, + "balance_loss_mlp": 0.0125628, + "epoch": 0.9129114685104465, + "flos": 16221989802240.0, + "grad_norm": 2.1106683874916925, + "language_loss": 0.76460809, + "learning_rate": 7.898183692255256e-08, + "loss": 0.8413052, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09765625, + "step": 15184, + "time_per_iteration": 2.4702370166778564 + }, + { + "auxiliary_loss_clip": 0.06401117, + "auxiliary_loss_mlp": 0.01265712, + "balance_loss_clip": 0.06270198, + "balance_loss_mlp": 0.01256283, + "epoch": 0.9129715917631144, + "flos": 19389349664640.0, + "grad_norm": 1.6484733671686076, + "language_loss": 0.74769634, + "learning_rate": 7.887350616360233e-08, + "loss": 0.8243646, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09423828, + "step": 15185, + "time_per_iteration": 2.492485761642456 + }, + { + "auxiliary_loss_clip": 0.06400957, + "auxiliary_loss_mlp": 0.01265918, + "balance_loss_clip": 0.0627048, + "balance_loss_mlp": 0.01256166, + "epoch": 0.9130317150157824, + "flos": 20595992785920.0, + "grad_norm": 2.006751528269808, + "language_loss": 0.68653584, + "learning_rate": 7.876524825396158e-08, + "loss": 0.76320457, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09741211, + "step": 15186, + "time_per_iteration": 2.485649347305298 + }, + { + "auxiliary_loss_clip": 0.06410342, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 0.06271516, + "balance_loss_mlp": 0.01253096, + "epoch": 0.9130918382684503, + "flos": 20194714782720.0, + "grad_norm": 1.795742988224212, + "language_loss": 0.77302891, + "learning_rate": 7.865706319773502e-08, + "loss": 0.84976149, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.09820557, + "step": 15187, + "time_per_iteration": 2.4841318130493164 + }, + { + "auxiliary_loss_clip": 0.06398897, + "auxiliary_loss_mlp": 0.01263601, + "balance_loss_clip": 0.06267929, + "balance_loss_mlp": 0.01254571, + "epoch": 0.9131519615211183, + "flos": 25563960236160.0, + "grad_norm": 2.105861883241293, + "language_loss": 0.66391146, + "learning_rate": 7.854895099902515e-08, + "loss": 0.74053645, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.090271, + "step": 15188, + "time_per_iteration": 4.009814023971558 + }, + { + "auxiliary_loss_clip": 0.06398279, + "auxiliary_loss_mlp": 0.01266124, + "balance_loss_clip": 0.06269646, + "balance_loss_mlp": 0.01256492, + "epoch": 0.9132120847737862, + "flos": 17937414115200.0, + "grad_norm": 1.9445407212493928, + "language_loss": 0.76366603, + "learning_rate": 7.844091166193157e-08, + "loss": 0.84031004, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09631348, + "step": 15189, + "time_per_iteration": 2.485355854034424 + }, + { + "auxiliary_loss_clip": 0.06399502, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.0125573, + "epoch": 0.9132722080264543, + "flos": 20053822942080.0, + "grad_norm": 1.763084249703843, + "language_loss": 0.76183271, + "learning_rate": 7.8332945190551e-08, + "loss": 0.83847034, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08532715, + "step": 15190, + "time_per_iteration": 2.51477313041687 + }, + { + "auxiliary_loss_clip": 0.06304602, + "auxiliary_loss_mlp": 0.01248492, + "balance_loss_clip": 0.06250489, + "balance_loss_mlp": 0.01247529, + "epoch": 0.9133323312791222, + "flos": 70461603498240.0, + "grad_norm": 0.69994498946902, + "language_loss": 0.57092154, + "learning_rate": 7.822505158897797e-08, + "loss": 0.64645249, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00962067, + "step": 15191, + "time_per_iteration": 3.1387722492218018 + }, + { + "auxiliary_loss_clip": 0.06404773, + "auxiliary_loss_mlp": 0.01266029, + "balance_loss_clip": 0.06270269, + "balance_loss_mlp": 0.01256611, + "epoch": 0.9133924545317902, + "flos": 25490851948800.0, + "grad_norm": 1.8893008015714516, + "language_loss": 0.74291134, + "learning_rate": 7.81172308613034e-08, + "loss": 0.8196193, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09417725, + "step": 15192, + "time_per_iteration": 2.5557541847229004 + }, + { + "auxiliary_loss_clip": 0.06398205, + "auxiliary_loss_mlp": 0.01265254, + "balance_loss_clip": 0.06269048, + "balance_loss_mlp": 0.01255693, + "epoch": 0.9134525777844581, + "flos": 39939920737920.0, + "grad_norm": 1.5014180075629815, + "language_loss": 0.6911993, + "learning_rate": 7.800948301161647e-08, + "loss": 0.76783395, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09558105, + "step": 15193, + "time_per_iteration": 4.1078901290893555 + }, + { + "auxiliary_loss_clip": 0.0639585, + "auxiliary_loss_mlp": 0.01260777, + "balance_loss_clip": 0.06267818, + "balance_loss_mlp": 0.01251891, + "epoch": 0.9135127010371261, + "flos": 20893037909760.0, + "grad_norm": 1.7245818478003463, + "language_loss": 0.73219973, + "learning_rate": 7.790180804400215e-08, + "loss": 0.80876601, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08880615, + "step": 15194, + "time_per_iteration": 2.547111988067627 + }, + { + "auxiliary_loss_clip": 0.06405854, + "auxiliary_loss_mlp": 0.01268882, + "balance_loss_clip": 0.06268877, + "balance_loss_mlp": 0.01257956, + "epoch": 0.913572824289794, + "flos": 20819468424960.0, + "grad_norm": 1.8031874353131485, + "language_loss": 0.62096351, + "learning_rate": 7.779420596254383e-08, + "loss": 0.69771087, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.10931396, + "step": 15195, + "time_per_iteration": 3.9417948722839355 + }, + { + "auxiliary_loss_clip": 0.06398496, + "auxiliary_loss_mlp": 0.01264512, + "balance_loss_clip": 0.06267463, + "balance_loss_mlp": 0.01255285, + "epoch": 0.913632947542462, + "flos": 25710470300160.0, + "grad_norm": 1.4279035452599953, + "language_loss": 0.7193073, + "learning_rate": 7.768667677132201e-08, + "loss": 0.79593736, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09222412, + "step": 15196, + "time_per_iteration": 2.551023483276367 + }, + { + "auxiliary_loss_clip": 0.06397398, + "auxiliary_loss_mlp": 0.01267249, + "balance_loss_clip": 0.06269406, + "balance_loss_mlp": 0.01258421, + "epoch": 0.9136930707951301, + "flos": 26293366028160.0, + "grad_norm": 1.6867538606308004, + "language_loss": 0.71241689, + "learning_rate": 7.757922047441411e-08, + "loss": 0.78906339, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 15197, + "time_per_iteration": 2.5550129413604736 + }, + { + "auxiliary_loss_clip": 0.06408559, + "auxiliary_loss_mlp": 0.01262566, + "balance_loss_clip": 0.06272875, + "balance_loss_mlp": 0.01252784, + "epoch": 0.913753194047798, + "flos": 22098590928000.0, + "grad_norm": 1.8710706746015826, + "language_loss": 0.78052139, + "learning_rate": 7.747183707589489e-08, + "loss": 0.85723269, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09783936, + "step": 15198, + "time_per_iteration": 2.5072240829467773 + }, + { + "auxiliary_loss_clip": 0.06394757, + "auxiliary_loss_mlp": 0.01263949, + "balance_loss_clip": 0.06267546, + "balance_loss_mlp": 0.01255193, + "epoch": 0.913813317300466, + "flos": 23594061473280.0, + "grad_norm": 1.2968049238366115, + "language_loss": 0.67974442, + "learning_rate": 7.736452657983616e-08, + "loss": 0.75633144, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08758545, + "step": 15199, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0640765, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 0.06274316, + "balance_loss_mlp": 0.01255145, + "epoch": 0.9138734405531339, + "flos": 28883993437440.0, + "grad_norm": 1.4924819881457518, + "language_loss": 0.676305, + "learning_rate": 7.725728899030714e-08, + "loss": 0.75302815, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09515381, + "step": 15200, + "time_per_iteration": 2.5669631958007812 + }, + { + "auxiliary_loss_clip": 0.06398766, + "auxiliary_loss_mlp": 0.01266384, + "balance_loss_clip": 0.0627182, + "balance_loss_mlp": 0.01257456, + "epoch": 0.9139335638058019, + "flos": 22827829011840.0, + "grad_norm": 1.5812565319228622, + "language_loss": 0.7186532, + "learning_rate": 7.715012431137435e-08, + "loss": 0.79530466, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.0892334, + "step": 15201, + "time_per_iteration": 2.5404951572418213 + }, + { + "auxiliary_loss_clip": 0.06400613, + "auxiliary_loss_mlp": 0.01260801, + "balance_loss_clip": 0.06268527, + "balance_loss_mlp": 0.01251843, + "epoch": 0.9139936870584698, + "flos": 18009977351040.0, + "grad_norm": 1.789758567160699, + "language_loss": 0.70331693, + "learning_rate": 7.704303254710165e-08, + "loss": 0.77993107, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08959961, + "step": 15202, + "time_per_iteration": 2.6106953620910645 + }, + { + "auxiliary_loss_clip": 0.06399814, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.01255016, + "epoch": 0.9140538103111379, + "flos": 15818992790400.0, + "grad_norm": 1.8688438464961967, + "language_loss": 0.6666283, + "learning_rate": 7.693601370155001e-08, + "loss": 0.74327433, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09777832, + "step": 15203, + "time_per_iteration": 2.5028200149536133 + }, + { + "auxiliary_loss_clip": 0.06404755, + "auxiliary_loss_mlp": 0.01267578, + "balance_loss_clip": 0.06273845, + "balance_loss_mlp": 0.01258315, + "epoch": 0.9141139335638058, + "flos": 23993704321920.0, + "grad_norm": 1.615125656411442, + "language_loss": 0.69094318, + "learning_rate": 7.682906777877751e-08, + "loss": 0.76766646, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0927124, + "step": 15204, + "time_per_iteration": 2.5456814765930176 + }, + { + "auxiliary_loss_clip": 0.0640422, + "auxiliary_loss_mlp": 0.01265902, + "balance_loss_clip": 0.06271097, + "balance_loss_mlp": 0.01256174, + "epoch": 0.9141740568164738, + "flos": 24031243751040.0, + "grad_norm": 1.933761420354856, + "language_loss": 0.60122651, + "learning_rate": 7.672219478283915e-08, + "loss": 0.67792773, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09729004, + "step": 15205, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.06395389, + "auxiliary_loss_mlp": 0.01264629, + "balance_loss_clip": 0.06268188, + "balance_loss_mlp": 0.01255611, + "epoch": 0.9142341800691417, + "flos": 27025958275200.0, + "grad_norm": 1.7761946490024947, + "language_loss": 0.81234074, + "learning_rate": 7.661539471778811e-08, + "loss": 0.88894093, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09014893, + "step": 15206, + "time_per_iteration": 2.5894620418548584 + }, + { + "auxiliary_loss_clip": 0.06404903, + "auxiliary_loss_mlp": 0.0126205, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01253056, + "epoch": 0.9142943033218097, + "flos": 20418735473280.0, + "grad_norm": 2.8299467191418333, + "language_loss": 0.74824673, + "learning_rate": 7.650866758767382e-08, + "loss": 0.8249163, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.08996582, + "step": 15207, + "time_per_iteration": 2.5086050033569336 + }, + { + "auxiliary_loss_clip": 0.06402467, + "auxiliary_loss_mlp": 0.01264601, + "balance_loss_clip": 0.06271173, + "balance_loss_mlp": 0.01254892, + "epoch": 0.9143544265744776, + "flos": 19761389792640.0, + "grad_norm": 1.4655535636017647, + "language_loss": 0.72923332, + "learning_rate": 7.640201339654373e-08, + "loss": 0.80590397, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09710693, + "step": 15208, + "time_per_iteration": 2.5494110584259033 + }, + { + "auxiliary_loss_clip": 0.06401486, + "auxiliary_loss_mlp": 0.01262111, + "balance_loss_clip": 0.06272633, + "balance_loss_mlp": 0.01253522, + "epoch": 0.9144145498271457, + "flos": 17171181653760.0, + "grad_norm": 2.2763772203960086, + "language_loss": 0.86370367, + "learning_rate": 7.629543214844237e-08, + "loss": 0.94033957, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0859375, + "step": 15209, + "time_per_iteration": 2.4788320064544678 + }, + { + "auxiliary_loss_clip": 0.06401129, + "auxiliary_loss_mlp": 0.01266162, + "balance_loss_clip": 0.06271241, + "balance_loss_mlp": 0.01257269, + "epoch": 0.9144746730798137, + "flos": 23731766858880.0, + "grad_norm": 1.579155450029156, + "language_loss": 0.75406897, + "learning_rate": 7.618892384741093e-08, + "loss": 0.83074194, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08886719, + "step": 15210, + "time_per_iteration": 2.5567657947540283 + }, + { + "auxiliary_loss_clip": 0.06400596, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06268501, + "balance_loss_mlp": 0.01255122, + "epoch": 0.9145347963324816, + "flos": 25854842085120.0, + "grad_norm": 1.979200231812929, + "language_loss": 0.77927828, + "learning_rate": 7.6082488497488e-08, + "loss": 0.85592532, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08984375, + "step": 15211, + "time_per_iteration": 2.552198648452759 + }, + { + "auxiliary_loss_clip": 0.0640268, + "auxiliary_loss_mlp": 0.01261832, + "balance_loss_clip": 0.06270398, + "balance_loss_mlp": 0.01252629, + "epoch": 0.9145949195851496, + "flos": 19248457824000.0, + "grad_norm": 2.10166098094478, + "language_loss": 0.82732511, + "learning_rate": 7.597612610270986e-08, + "loss": 0.90397024, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09204102, + "step": 15212, + "time_per_iteration": 2.513986110687256 + }, + { + "auxiliary_loss_clip": 0.06398089, + "auxiliary_loss_mlp": 0.01264444, + "balance_loss_clip": 0.06269515, + "balance_loss_mlp": 0.01255665, + "epoch": 0.9146550428378175, + "flos": 18302284719360.0, + "grad_norm": 1.652995444238016, + "language_loss": 0.84054744, + "learning_rate": 7.586983666711022e-08, + "loss": 0.91717279, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08776855, + "step": 15213, + "time_per_iteration": 2.4883370399475098 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01264518, + "balance_loss_clip": 0.06270234, + "balance_loss_mlp": 0.01255261, + "epoch": 0.9147151660904855, + "flos": 20090481903360.0, + "grad_norm": 1.824328091244105, + "language_loss": 0.71026123, + "learning_rate": 7.576362019471894e-08, + "loss": 0.78692377, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09259033, + "step": 15214, + "time_per_iteration": 2.646428346633911 + }, + { + "auxiliary_loss_clip": 0.06405354, + "auxiliary_loss_mlp": 0.0126419, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.01254623, + "epoch": 0.9147752893431534, + "flos": 24395988574080.0, + "grad_norm": 1.704762447240634, + "language_loss": 0.63240612, + "learning_rate": 7.565747668956413e-08, + "loss": 0.70910156, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09564209, + "step": 15215, + "time_per_iteration": 2.53265643119812 + }, + { + "auxiliary_loss_clip": 0.06403671, + "auxiliary_loss_mlp": 0.01263526, + "balance_loss_clip": 0.0626839, + "balance_loss_mlp": 0.01253512, + "epoch": 0.9148354125958215, + "flos": 18156277779840.0, + "grad_norm": 2.2416131553032983, + "language_loss": 0.76165468, + "learning_rate": 7.555140615567058e-08, + "loss": 0.83832663, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.10009766, + "step": 15216, + "time_per_iteration": 2.4794795513153076 + }, + { + "auxiliary_loss_clip": 0.0640347, + "auxiliary_loss_mlp": 0.01269309, + "balance_loss_clip": 0.06272964, + "balance_loss_mlp": 0.0125951, + "epoch": 0.9148955358484894, + "flos": 23374233486720.0, + "grad_norm": 2.196642746611264, + "language_loss": 0.68317431, + "learning_rate": 7.544540859706062e-08, + "loss": 0.75990212, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.0980835, + "step": 15217, + "time_per_iteration": 2.5035665035247803 + }, + { + "auxiliary_loss_clip": 0.06397339, + "auxiliary_loss_mlp": 0.01263699, + "balance_loss_clip": 0.06268431, + "balance_loss_mlp": 0.01254029, + "epoch": 0.9149556591011574, + "flos": 18082205170560.0, + "grad_norm": 1.8248039597500896, + "language_loss": 0.80576724, + "learning_rate": 7.533948401775347e-08, + "loss": 0.88237762, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09667969, + "step": 15218, + "time_per_iteration": 2.4810121059417725 + }, + { + "auxiliary_loss_clip": 0.06306933, + "auxiliary_loss_mlp": 0.01255386, + "balance_loss_clip": 0.06252693, + "balance_loss_mlp": 0.01254361, + "epoch": 0.9150157823538253, + "flos": 54602220240000.0, + "grad_norm": 0.8181156143430024, + "language_loss": 0.58716023, + "learning_rate": 7.523363242176595e-08, + "loss": 0.6627835, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01025391, + "step": 15219, + "time_per_iteration": 4.510970592498779 + }, + { + "auxiliary_loss_clip": 0.0639798, + "auxiliary_loss_mlp": 0.01263707, + "balance_loss_clip": 0.06269677, + "balance_loss_mlp": 0.01254683, + "epoch": 0.9150759056064933, + "flos": 17898616874880.0, + "grad_norm": 2.651595808916399, + "language_loss": 0.78293604, + "learning_rate": 7.512785381311216e-08, + "loss": 0.85955286, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.090271, + "step": 15220, + "time_per_iteration": 2.4863898754119873 + }, + { + "auxiliary_loss_clip": 0.06403407, + "auxiliary_loss_mlp": 0.01264138, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01254214, + "epoch": 0.9151360288591612, + "flos": 18078725226240.0, + "grad_norm": 1.7108553042471706, + "language_loss": 0.65879726, + "learning_rate": 7.50221481958031e-08, + "loss": 0.73547268, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09924316, + "step": 15221, + "time_per_iteration": 2.4642598628997803 + }, + { + "auxiliary_loss_clip": 0.06398383, + "auxiliary_loss_mlp": 0.0126148, + "balance_loss_clip": 0.0626786, + "balance_loss_mlp": 0.01252003, + "epoch": 0.9151961521118293, + "flos": 19360614913920.0, + "grad_norm": 1.718973391494924, + "language_loss": 0.84501803, + "learning_rate": 7.491651557384692e-08, + "loss": 0.92161667, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.0947876, + "step": 15222, + "time_per_iteration": 2.471740245819092 + }, + { + "auxiliary_loss_clip": 0.06308072, + "auxiliary_loss_mlp": 0.0125194, + "balance_loss_clip": 0.06253721, + "balance_loss_mlp": 0.01250893, + "epoch": 0.9152562753644973, + "flos": 72167174956800.0, + "grad_norm": 0.7092684015563987, + "language_loss": 0.49536896, + "learning_rate": 7.481095595124953e-08, + "loss": 0.5709691, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.01048279, + "step": 15223, + "time_per_iteration": 3.159543752670288 + }, + { + "auxiliary_loss_clip": 0.0640175, + "auxiliary_loss_mlp": 0.0126443, + "balance_loss_clip": 0.0627071, + "balance_loss_mlp": 0.01254488, + "epoch": 0.9153163986171652, + "flos": 20783270661120.0, + "grad_norm": 1.6312984984407164, + "language_loss": 0.72100401, + "learning_rate": 7.470546933201349e-08, + "loss": 0.79766577, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09942627, + "step": 15224, + "time_per_iteration": 2.497352361679077 + }, + { + "auxiliary_loss_clip": 0.06398828, + "auxiliary_loss_mlp": 0.01261128, + "balance_loss_clip": 0.06269911, + "balance_loss_mlp": 0.01252211, + "epoch": 0.9153765218698332, + "flos": 23046902311680.0, + "grad_norm": 1.8848265932846708, + "language_loss": 0.81092465, + "learning_rate": 7.460005572013895e-08, + "loss": 0.88752425, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0892334, + "step": 15225, + "time_per_iteration": 2.5618300437927246 + }, + { + "auxiliary_loss_clip": 0.06398889, + "auxiliary_loss_mlp": 0.0126229, + "balance_loss_clip": 0.06268218, + "balance_loss_mlp": 0.0125317, + "epoch": 0.9154366451225011, + "flos": 28999295055360.0, + "grad_norm": 1.3043395747962432, + "language_loss": 0.71588331, + "learning_rate": 7.44947151196238e-08, + "loss": 0.79249507, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09124756, + "step": 15226, + "time_per_iteration": 2.6222610473632812 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 0.06268212, + "balance_loss_mlp": 0.01254456, + "epoch": 0.9154967683751691, + "flos": 22316029073280.0, + "grad_norm": 1.870267091222323, + "language_loss": 0.7535274, + "learning_rate": 7.43894475344613e-08, + "loss": 0.83017313, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09365845, + "step": 15227, + "time_per_iteration": 3.9587011337280273 + }, + { + "auxiliary_loss_clip": 0.06399345, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.0627011, + "balance_loss_mlp": 0.01255276, + "epoch": 0.915556891627837, + "flos": 24578360985600.0, + "grad_norm": 1.5200774095907186, + "language_loss": 0.74375439, + "learning_rate": 7.428425296864404e-08, + "loss": 0.82038689, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08630371, + "step": 15228, + "time_per_iteration": 2.5360701084136963 + }, + { + "auxiliary_loss_clip": 0.06402157, + "auxiliary_loss_mlp": 0.01265448, + "balance_loss_clip": 0.06272555, + "balance_loss_mlp": 0.01256287, + "epoch": 0.9156170148805051, + "flos": 22171363799040.0, + "grad_norm": 1.5117785921082858, + "language_loss": 0.72036177, + "learning_rate": 7.417913142616106e-08, + "loss": 0.79703784, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0916748, + "step": 15229, + "time_per_iteration": 2.5301578044891357 + }, + { + "auxiliary_loss_clip": 0.06400885, + "auxiliary_loss_mlp": 0.01266333, + "balance_loss_clip": 0.06270942, + "balance_loss_mlp": 0.01256397, + "epoch": 0.915677138133173, + "flos": 20926552343040.0, + "grad_norm": 1.5178465460863502, + "language_loss": 0.83324695, + "learning_rate": 7.407408291099848e-08, + "loss": 0.90991908, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09936523, + "step": 15230, + "time_per_iteration": 2.4890830516815186 + }, + { + "auxiliary_loss_clip": 0.06398893, + "auxiliary_loss_mlp": 0.01261014, + "balance_loss_clip": 0.06271241, + "balance_loss_mlp": 0.01251907, + "epoch": 0.915737261385841, + "flos": 24350734569600.0, + "grad_norm": 2.140708224764665, + "language_loss": 0.83798474, + "learning_rate": 7.396910742713957e-08, + "loss": 0.9145838, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09112549, + "step": 15231, + "time_per_iteration": 2.5503671169281006 + }, + { + "auxiliary_loss_clip": 0.06395644, + "auxiliary_loss_mlp": 0.01262039, + "balance_loss_clip": 0.06266124, + "balance_loss_mlp": 0.01253051, + "epoch": 0.9157973846385089, + "flos": 26768758567680.0, + "grad_norm": 1.412460383804666, + "language_loss": 0.72348028, + "learning_rate": 7.386420497856516e-08, + "loss": 0.80005717, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08978271, + "step": 15232, + "time_per_iteration": 2.536257266998291 + }, + { + "auxiliary_loss_clip": 0.06403804, + "auxiliary_loss_mlp": 0.01263561, + "balance_loss_clip": 0.06271422, + "balance_loss_mlp": 0.01254436, + "epoch": 0.9158575078911769, + "flos": 18484657130880.0, + "grad_norm": 2.3550676100990775, + "language_loss": 0.6826663, + "learning_rate": 7.375937556925338e-08, + "loss": 0.75933993, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09124756, + "step": 15233, + "time_per_iteration": 4.011778831481934 + }, + { + "auxiliary_loss_clip": 0.064054, + "auxiliary_loss_mlp": 0.01265597, + "balance_loss_clip": 0.06272289, + "balance_loss_mlp": 0.01255769, + "epoch": 0.9159176311438448, + "flos": 21805403091840.0, + "grad_norm": 1.916334328828353, + "language_loss": 0.69990098, + "learning_rate": 7.365461920317861e-08, + "loss": 0.77661097, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09832764, + "step": 15234, + "time_per_iteration": 2.5241239070892334 + }, + { + "auxiliary_loss_clip": 0.06404121, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 0.06271881, + "balance_loss_mlp": 0.01253964, + "epoch": 0.9159777543965129, + "flos": 24789552001920.0, + "grad_norm": 1.6575192392751135, + "language_loss": 0.8802951, + "learning_rate": 7.354993588431391e-08, + "loss": 0.95697153, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09558105, + "step": 15235, + "time_per_iteration": 3.9579367637634277 + }, + { + "auxiliary_loss_clip": 0.06400644, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06269062, + "balance_loss_mlp": 0.01256987, + "epoch": 0.9160378776491809, + "flos": 26875800558720.0, + "grad_norm": 1.690257425906499, + "language_loss": 0.77583575, + "learning_rate": 7.344532561662853e-08, + "loss": 0.85250783, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09576416, + "step": 15236, + "time_per_iteration": 2.5500221252441406 + }, + { + "auxiliary_loss_clip": 0.06309772, + "auxiliary_loss_mlp": 0.01251276, + "balance_loss_clip": 0.06255564, + "balance_loss_mlp": 0.01250298, + "epoch": 0.9160980009018488, + "flos": 70598596124160.0, + "grad_norm": 0.6553616821648679, + "language_loss": 0.6221928, + "learning_rate": 7.334078840409019e-08, + "loss": 0.69780326, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00977325, + "step": 15237, + "time_per_iteration": 3.084401845932007 + }, + { + "auxiliary_loss_clip": 0.0640253, + "auxiliary_loss_mlp": 0.0126268, + "balance_loss_clip": 0.06270298, + "balance_loss_mlp": 0.01252827, + "epoch": 0.9161581241545168, + "flos": 16294846527360.0, + "grad_norm": 1.9192593491707206, + "language_loss": 0.75049806, + "learning_rate": 7.323632425066151e-08, + "loss": 0.82715011, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09863281, + "step": 15238, + "time_per_iteration": 2.4591023921966553 + }, + { + "auxiliary_loss_clip": 0.06403898, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.06271527, + "balance_loss_mlp": 0.01253672, + "epoch": 0.9162182474071847, + "flos": 18443386195200.0, + "grad_norm": 1.5712034366167735, + "language_loss": 0.74555534, + "learning_rate": 7.313193316030464e-08, + "loss": 0.82222801, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.0970459, + "step": 15239, + "time_per_iteration": 2.5155394077301025 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_clip": 0.06271224, + "balance_loss_mlp": 0.01259258, + "epoch": 0.9162783706598527, + "flos": 19172498497920.0, + "grad_norm": 1.883459603997045, + "language_loss": 0.63822246, + "learning_rate": 7.302761513697819e-08, + "loss": 0.71495509, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09765625, + "step": 15240, + "time_per_iteration": 2.5100462436676025 + }, + { + "auxiliary_loss_clip": 0.06401497, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06272344, + "balance_loss_mlp": 0.0125438, + "epoch": 0.9163384939125206, + "flos": 20419322451840.0, + "grad_norm": 1.7171261992686273, + "language_loss": 0.76934052, + "learning_rate": 7.292337018463746e-08, + "loss": 0.84598958, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.090271, + "step": 15241, + "time_per_iteration": 2.515197992324829 + }, + { + "auxiliary_loss_clip": 0.06415688, + "auxiliary_loss_mlp": 0.01267436, + "balance_loss_clip": 0.06273037, + "balance_loss_mlp": 0.01256236, + "epoch": 0.9163986171651887, + "flos": 19651957960320.0, + "grad_norm": 2.7440161948074984, + "language_loss": 0.68086845, + "learning_rate": 7.281919830723549e-08, + "loss": 0.75769967, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.11193848, + "step": 15242, + "time_per_iteration": 2.5829575061798096 + }, + { + "auxiliary_loss_clip": 0.06399854, + "auxiliary_loss_mlp": 0.01263264, + "balance_loss_clip": 0.06268453, + "balance_loss_mlp": 0.01254204, + "epoch": 0.9164587404178566, + "flos": 12827967845760.0, + "grad_norm": 1.757331084176624, + "language_loss": 0.81106311, + "learning_rate": 7.271509950872334e-08, + "loss": 0.88769436, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09069824, + "step": 15243, + "time_per_iteration": 2.5732226371765137 + }, + { + "auxiliary_loss_clip": 0.06405694, + "auxiliary_loss_mlp": 0.01266735, + "balance_loss_clip": 0.0627118, + "balance_loss_mlp": 0.01256996, + "epoch": 0.9165188636705246, + "flos": 22315903292160.0, + "grad_norm": 3.9147017718887205, + "language_loss": 0.82610697, + "learning_rate": 7.261107379304721e-08, + "loss": 0.90283132, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09735107, + "step": 15244, + "time_per_iteration": 2.501309871673584 + }, + { + "auxiliary_loss_clip": 0.06406015, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06269778, + "balance_loss_mlp": 0.01255153, + "epoch": 0.9165789869231925, + "flos": 18229218359040.0, + "grad_norm": 2.4095610629063176, + "language_loss": 0.72487861, + "learning_rate": 7.250712116415214e-08, + "loss": 0.80158961, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.0993042, + "step": 15245, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.064008, + "auxiliary_loss_mlp": 0.01263885, + "balance_loss_clip": 0.06269535, + "balance_loss_mlp": 0.01254885, + "epoch": 0.9166391101758605, + "flos": 13695414439680.0, + "grad_norm": 1.5418326168026033, + "language_loss": 0.74834359, + "learning_rate": 7.240324162598033e-08, + "loss": 0.82499039, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08990479, + "step": 15246, + "time_per_iteration": 2.4759280681610107 + }, + { + "auxiliary_loss_clip": 0.06401987, + "auxiliary_loss_mlp": 0.01264745, + "balance_loss_clip": 0.06271073, + "balance_loss_mlp": 0.01255065, + "epoch": 0.9166992334285284, + "flos": 17352380108160.0, + "grad_norm": 2.6245151033033802, + "language_loss": 0.75630188, + "learning_rate": 7.229943518247106e-08, + "loss": 0.83296925, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09680176, + "step": 15247, + "time_per_iteration": 2.50736927986145 + }, + { + "auxiliary_loss_clip": 0.06403103, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 0.06269411, + "balance_loss_mlp": 0.01254096, + "epoch": 0.9167593566811965, + "flos": 23737678571520.0, + "grad_norm": 1.734119816640847, + "language_loss": 0.76551712, + "learning_rate": 7.219570183756052e-08, + "loss": 0.84218323, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09405518, + "step": 15248, + "time_per_iteration": 2.5225977897644043 + }, + { + "auxiliary_loss_clip": 0.06402726, + "auxiliary_loss_mlp": 0.01267049, + "balance_loss_clip": 0.06269974, + "balance_loss_mlp": 0.01256589, + "epoch": 0.9168194799338644, + "flos": 27825537461760.0, + "grad_norm": 2.0530525588042634, + "language_loss": 0.739088, + "learning_rate": 7.209204159518178e-08, + "loss": 0.81578577, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10467529, + "step": 15249, + "time_per_iteration": 2.5675055980682373 + }, + { + "auxiliary_loss_clip": 0.06401356, + "auxiliary_loss_mlp": 0.01265318, + "balance_loss_clip": 0.06270999, + "balance_loss_mlp": 0.01256509, + "epoch": 0.9168796031865324, + "flos": 21722609658240.0, + "grad_norm": 1.9372290328284216, + "language_loss": 0.76030535, + "learning_rate": 7.198845445926616e-08, + "loss": 0.83697212, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08813477, + "step": 15250, + "time_per_iteration": 2.4735028743743896 + }, + { + "auxiliary_loss_clip": 0.06397949, + "auxiliary_loss_mlp": 0.01265748, + "balance_loss_clip": 0.06268395, + "balance_loss_mlp": 0.01256193, + "epoch": 0.9169397264392004, + "flos": 23411185937280.0, + "grad_norm": 1.8107623073184385, + "language_loss": 0.76076829, + "learning_rate": 7.188494043374138e-08, + "loss": 0.83740526, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09545898, + "step": 15251, + "time_per_iteration": 2.6144092082977295 + }, + { + "auxiliary_loss_clip": 0.0640536, + "auxiliary_loss_mlp": 0.01263626, + "balance_loss_clip": 0.06273532, + "balance_loss_mlp": 0.01253958, + "epoch": 0.9169998496918683, + "flos": 23957716193280.0, + "grad_norm": 2.127162243234926, + "language_loss": 0.80199194, + "learning_rate": 7.178149952253298e-08, + "loss": 0.87868178, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09667969, + "step": 15252, + "time_per_iteration": 2.5656697750091553 + }, + { + "auxiliary_loss_clip": 0.0640313, + "auxiliary_loss_mlp": 0.0126583, + "balance_loss_clip": 0.06271911, + "balance_loss_mlp": 0.01256287, + "epoch": 0.9170599729445363, + "flos": 18338314775040.0, + "grad_norm": 1.430147384395712, + "language_loss": 0.77667707, + "learning_rate": 7.167813172956316e-08, + "loss": 0.85336667, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09539795, + "step": 15253, + "time_per_iteration": 2.5039689540863037 + }, + { + "auxiliary_loss_clip": 0.06402656, + "auxiliary_loss_mlp": 0.01263281, + "balance_loss_clip": 0.0627113, + "balance_loss_mlp": 0.01254513, + "epoch": 0.9171200961972042, + "flos": 22681528583040.0, + "grad_norm": 1.727297082986554, + "language_loss": 0.72871399, + "learning_rate": 7.157483705875256e-08, + "loss": 0.80537337, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08776855, + "step": 15254, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.06395872, + "auxiliary_loss_mlp": 0.01264189, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01254825, + "epoch": 0.9171802194498723, + "flos": 26725726696320.0, + "grad_norm": 1.4812567402844228, + "language_loss": 0.79206324, + "learning_rate": 7.14716155140167e-08, + "loss": 0.86866391, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.09356689, + "step": 15255, + "time_per_iteration": 2.5765507221221924 + }, + { + "auxiliary_loss_clip": 0.064024, + "auxiliary_loss_mlp": 0.01269302, + "balance_loss_clip": 0.06268662, + "balance_loss_mlp": 0.01260224, + "epoch": 0.9172403427025402, + "flos": 37898423061120.0, + "grad_norm": 1.872101049589666, + "language_loss": 0.68329966, + "learning_rate": 7.136846709927047e-08, + "loss": 0.76001668, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09082031, + "step": 15256, + "time_per_iteration": 2.6418230533599854 + }, + { + "auxiliary_loss_clip": 0.06400028, + "auxiliary_loss_mlp": 0.01263708, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01254976, + "epoch": 0.9173004659552082, + "flos": 17060743572480.0, + "grad_norm": 1.8283973623759848, + "language_loss": 0.84006357, + "learning_rate": 7.126539181842561e-08, + "loss": 0.91670096, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08734131, + "step": 15257, + "time_per_iteration": 2.5305137634277344 + }, + { + "auxiliary_loss_clip": 0.0639857, + "auxiliary_loss_mlp": 0.01263291, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01254678, + "epoch": 0.9173605892078761, + "flos": 22208358176640.0, + "grad_norm": 1.5204666136912515, + "language_loss": 0.77536505, + "learning_rate": 7.116238967539012e-08, + "loss": 0.85198367, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08618164, + "step": 15258, + "time_per_iteration": 2.5125315189361572 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01265531, + "balance_loss_clip": 0.06273498, + "balance_loss_mlp": 0.01256227, + "epoch": 0.9174207124605441, + "flos": 16513248994560.0, + "grad_norm": 1.9960678800991773, + "language_loss": 0.78876376, + "learning_rate": 7.105946067406999e-08, + "loss": 0.86544091, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09307861, + "step": 15259, + "time_per_iteration": 3.941746950149536 + }, + { + "auxiliary_loss_clip": 0.06399495, + "auxiliary_loss_mlp": 0.01264365, + "balance_loss_clip": 0.06270274, + "balance_loss_mlp": 0.01255651, + "epoch": 0.917480835713212, + "flos": 24542582492160.0, + "grad_norm": 1.4851816549824022, + "language_loss": 0.76305032, + "learning_rate": 7.095660481836895e-08, + "loss": 0.8396889, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.0871582, + "step": 15260, + "time_per_iteration": 2.54323148727417 + }, + { + "auxiliary_loss_clip": 0.06400856, + "auxiliary_loss_mlp": 0.01262484, + "balance_loss_clip": 0.06270303, + "balance_loss_mlp": 0.01253311, + "epoch": 0.9175409589658801, + "flos": 20886036094080.0, + "grad_norm": 1.4569612276520922, + "language_loss": 0.61439729, + "learning_rate": 7.085382211218637e-08, + "loss": 0.69103068, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09179688, + "step": 15261, + "time_per_iteration": 2.519350290298462 + }, + { + "auxiliary_loss_clip": 0.063967, + "auxiliary_loss_mlp": 0.01261936, + "balance_loss_clip": 0.06268273, + "balance_loss_mlp": 0.01252865, + "epoch": 0.917601082218548, + "flos": 14280113030400.0, + "grad_norm": 1.8017934646848675, + "language_loss": 0.74208277, + "learning_rate": 7.075111255942002e-08, + "loss": 0.81866914, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09063721, + "step": 15262, + "time_per_iteration": 2.4770686626434326 + }, + { + "auxiliary_loss_clip": 0.06404866, + "auxiliary_loss_mlp": 0.01263429, + "balance_loss_clip": 0.06268941, + "balance_loss_mlp": 0.0125425, + "epoch": 0.917661205471216, + "flos": 19105301923200.0, + "grad_norm": 1.713441901458641, + "language_loss": 0.77938473, + "learning_rate": 7.064847616396496e-08, + "loss": 0.85606766, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09179688, + "step": 15263, + "time_per_iteration": 2.4721927642822266 + }, + { + "auxiliary_loss_clip": 0.06405415, + "auxiliary_loss_mlp": 0.01265853, + "balance_loss_clip": 0.06269035, + "balance_loss_mlp": 0.01256025, + "epoch": 0.917721328723884, + "flos": 21113075531520.0, + "grad_norm": 2.2981718419830894, + "language_loss": 0.75979543, + "learning_rate": 7.054591292971324e-08, + "loss": 0.83650815, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.09832764, + "step": 15264, + "time_per_iteration": 2.5106306076049805 + }, + { + "auxiliary_loss_clip": 0.06397746, + "auxiliary_loss_mlp": 0.01263995, + "balance_loss_clip": 0.06265679, + "balance_loss_mlp": 0.01254398, + "epoch": 0.9177814519765519, + "flos": 21949439460480.0, + "grad_norm": 1.607338475004671, + "language_loss": 0.83605742, + "learning_rate": 7.044342286055394e-08, + "loss": 0.91267478, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09594727, + "step": 15265, + "time_per_iteration": 2.500438928604126 + }, + { + "auxiliary_loss_clip": 0.06404482, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06270517, + "balance_loss_mlp": 0.01256759, + "epoch": 0.9178415752292199, + "flos": 24212693767680.0, + "grad_norm": 1.4811768769102642, + "language_loss": 0.73341453, + "learning_rate": 7.034100596037306e-08, + "loss": 0.81013238, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10552979, + "step": 15266, + "time_per_iteration": 3.9415042400360107 + }, + { + "auxiliary_loss_clip": 0.06403729, + "auxiliary_loss_mlp": 0.01265804, + "balance_loss_clip": 0.06271026, + "balance_loss_mlp": 0.01256506, + "epoch": 0.9179016984818879, + "flos": 20047324250880.0, + "grad_norm": 1.5268706819082398, + "language_loss": 0.77726352, + "learning_rate": 7.023866223305486e-08, + "loss": 0.85395879, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09289551, + "step": 15267, + "time_per_iteration": 2.5025975704193115 + }, + { + "auxiliary_loss_clip": 0.06306774, + "auxiliary_loss_mlp": 0.0124874, + "balance_loss_clip": 0.06252508, + "balance_loss_mlp": 0.01247798, + "epoch": 0.9179618217345559, + "flos": 65577561511680.0, + "grad_norm": 0.7361853308076762, + "language_loss": 0.55530179, + "learning_rate": 7.013639168247975e-08, + "loss": 0.63085693, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00940704, + "step": 15268, + "time_per_iteration": 3.1551411151885986 + }, + { + "auxiliary_loss_clip": 0.0640334, + "auxiliary_loss_mlp": 0.0126454, + "balance_loss_clip": 0.06272043, + "balance_loss_mlp": 0.01255224, + "epoch": 0.9180219449872238, + "flos": 21331016801280.0, + "grad_norm": 1.7178999838576712, + "language_loss": 0.76744187, + "learning_rate": 7.0034194312526e-08, + "loss": 0.84412068, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09313965, + "step": 15269, + "time_per_iteration": 2.5461537837982178 + }, + { + "auxiliary_loss_clip": 0.06400153, + "auxiliary_loss_mlp": 0.01268007, + "balance_loss_clip": 0.06269392, + "balance_loss_mlp": 0.01257689, + "epoch": 0.9180820682398918, + "flos": 41069137086720.0, + "grad_norm": 1.706172460230681, + "language_loss": 0.72979438, + "learning_rate": 6.993207012706936e-08, + "loss": 0.806476, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10321045, + "step": 15270, + "time_per_iteration": 2.6807196140289307 + }, + { + "auxiliary_loss_clip": 0.06395302, + "auxiliary_loss_mlp": 0.01268583, + "balance_loss_clip": 0.06266629, + "balance_loss_mlp": 0.01259571, + "epoch": 0.9181421914925597, + "flos": 28080179619840.0, + "grad_norm": 1.4631420859140687, + "language_loss": 0.79966378, + "learning_rate": 6.98300191299821e-08, + "loss": 0.8763026, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09008789, + "step": 15271, + "time_per_iteration": 2.6022467613220215 + }, + { + "auxiliary_loss_clip": 0.0640102, + "auxiliary_loss_mlp": 0.01263986, + "balance_loss_clip": 0.06268465, + "balance_loss_mlp": 0.01254706, + "epoch": 0.9182023147452277, + "flos": 29177181273600.0, + "grad_norm": 1.8997922177263993, + "language_loss": 0.72772801, + "learning_rate": 6.972804132513355e-08, + "loss": 0.80437815, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09277344, + "step": 15272, + "time_per_iteration": 2.5741183757781982 + }, + { + "auxiliary_loss_clip": 0.06399629, + "auxiliary_loss_mlp": 0.01266695, + "balance_loss_clip": 0.06269245, + "balance_loss_mlp": 0.01257784, + "epoch": 0.9182624379978956, + "flos": 24067651150080.0, + "grad_norm": 1.823337092754064, + "language_loss": 0.72748905, + "learning_rate": 6.962613671639105e-08, + "loss": 0.80415225, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08911133, + "step": 15273, + "time_per_iteration": 3.989461898803711 + }, + { + "auxiliary_loss_clip": 0.06397839, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06272411, + "balance_loss_mlp": 0.01258454, + "epoch": 0.9183225612505637, + "flos": 23300035096320.0, + "grad_norm": 1.4793794409400558, + "language_loss": 0.74706221, + "learning_rate": 6.952430530761933e-08, + "loss": 0.82370985, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08477783, + "step": 15274, + "time_per_iteration": 2.520556688308716 + }, + { + "auxiliary_loss_clip": 0.06403947, + "auxiliary_loss_mlp": 0.0126299, + "balance_loss_clip": 0.06271337, + "balance_loss_mlp": 0.012539, + "epoch": 0.9183826845032316, + "flos": 19615257072000.0, + "grad_norm": 1.5221375197874305, + "language_loss": 0.69075209, + "learning_rate": 6.942254710267902e-08, + "loss": 0.76742148, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09088135, + "step": 15275, + "time_per_iteration": 3.905719041824341 + }, + { + "auxiliary_loss_clip": 0.06398068, + "auxiliary_loss_mlp": 0.01264874, + "balance_loss_clip": 0.06269246, + "balance_loss_mlp": 0.01255296, + "epoch": 0.9184428077558996, + "flos": 18485034474240.0, + "grad_norm": 1.8827436840113005, + "language_loss": 0.72488761, + "learning_rate": 6.932086210542953e-08, + "loss": 0.80151707, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09576416, + "step": 15276, + "time_per_iteration": 2.485471248626709 + }, + { + "auxiliary_loss_clip": 0.06402228, + "auxiliary_loss_mlp": 0.01261956, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01253277, + "epoch": 0.9185029310085676, + "flos": 20747366386560.0, + "grad_norm": 1.7691227354314663, + "language_loss": 0.73457116, + "learning_rate": 6.921925031972642e-08, + "loss": 0.81121302, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08679199, + "step": 15277, + "time_per_iteration": 2.512688159942627 + }, + { + "auxiliary_loss_clip": 0.06307656, + "auxiliary_loss_mlp": 0.01251054, + "balance_loss_clip": 0.06253561, + "balance_loss_mlp": 0.01250129, + "epoch": 0.9185630542612355, + "flos": 68229641491200.0, + "grad_norm": 0.706284622540633, + "language_loss": 0.59206891, + "learning_rate": 6.91177117494226e-08, + "loss": 0.66765606, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00922394, + "step": 15278, + "time_per_iteration": 3.2377090454101562 + }, + { + "auxiliary_loss_clip": 0.06395192, + "auxiliary_loss_mlp": 0.01259779, + "balance_loss_clip": 0.06267422, + "balance_loss_mlp": 0.01251649, + "epoch": 0.9186231775139035, + "flos": 12244317431040.0, + "grad_norm": 1.7835726733368307, + "language_loss": 0.64503765, + "learning_rate": 6.901624639836879e-08, + "loss": 0.7215873, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08123779, + "step": 15279, + "time_per_iteration": 2.475576877593994 + }, + { + "auxiliary_loss_clip": 0.0631056, + "auxiliary_loss_mlp": 0.01249529, + "balance_loss_clip": 0.0625634, + "balance_loss_mlp": 0.01248621, + "epoch": 0.9186833007665715, + "flos": 63958739356800.0, + "grad_norm": 0.8219128410312971, + "language_loss": 0.60080945, + "learning_rate": 6.891485427041211e-08, + "loss": 0.67641032, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00904846, + "step": 15280, + "time_per_iteration": 3.119189977645874 + }, + { + "auxiliary_loss_clip": 0.06403612, + "auxiliary_loss_mlp": 0.01263581, + "balance_loss_clip": 0.06269744, + "balance_loss_mlp": 0.01253639, + "epoch": 0.9187434240192395, + "flos": 19980882362880.0, + "grad_norm": 1.890303690282995, + "language_loss": 0.70436323, + "learning_rate": 6.881353536939815e-08, + "loss": 0.78103518, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09942627, + "step": 15281, + "time_per_iteration": 2.531141996383667 + }, + { + "auxiliary_loss_clip": 0.06401566, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 0.06269042, + "balance_loss_mlp": 0.01253209, + "epoch": 0.9188035472719074, + "flos": 25234742344320.0, + "grad_norm": 1.9786800170515064, + "language_loss": 0.84562802, + "learning_rate": 6.871228969916831e-08, + "loss": 0.92227417, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09838867, + "step": 15282, + "time_per_iteration": 2.5332024097442627 + }, + { + "auxiliary_loss_clip": 0.06399123, + "auxiliary_loss_mlp": 0.01271317, + "balance_loss_clip": 0.06269504, + "balance_loss_mlp": 0.01261411, + "epoch": 0.9188636705245754, + "flos": 18411423062400.0, + "grad_norm": 2.0072759179217052, + "language_loss": 0.60563141, + "learning_rate": 6.861111726356194e-08, + "loss": 0.68233585, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09906006, + "step": 15283, + "time_per_iteration": 2.5034496784210205 + }, + { + "auxiliary_loss_clip": 0.06406576, + "auxiliary_loss_mlp": 0.0126769, + "balance_loss_clip": 0.0626885, + "balance_loss_mlp": 0.01257879, + "epoch": 0.9189237937772433, + "flos": 23775930760320.0, + "grad_norm": 1.7836030599883965, + "language_loss": 0.65816599, + "learning_rate": 6.851001806641554e-08, + "loss": 0.73490864, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.09820557, + "step": 15284, + "time_per_iteration": 2.5270888805389404 + }, + { + "auxiliary_loss_clip": 0.06401928, + "auxiliary_loss_mlp": 0.01261516, + "balance_loss_clip": 0.06270975, + "balance_loss_mlp": 0.01252217, + "epoch": 0.9189839170299113, + "flos": 21220914136320.0, + "grad_norm": 1.9502901912071402, + "language_loss": 0.73604786, + "learning_rate": 6.840899211156292e-08, + "loss": 0.81268227, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09295654, + "step": 15285, + "time_per_iteration": 2.5270345211029053 + }, + { + "auxiliary_loss_clip": 0.063976, + "auxiliary_loss_mlp": 0.01263018, + "balance_loss_clip": 0.06268349, + "balance_loss_mlp": 0.01253726, + "epoch": 0.9190440402825792, + "flos": 16732993127040.0, + "grad_norm": 1.9982888502982128, + "language_loss": 0.72159714, + "learning_rate": 6.830803940283458e-08, + "loss": 0.79820335, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09295654, + "step": 15286, + "time_per_iteration": 2.4716579914093018 + }, + { + "auxiliary_loss_clip": 0.06399448, + "auxiliary_loss_mlp": 0.01263944, + "balance_loss_clip": 0.06268711, + "balance_loss_mlp": 0.01254229, + "epoch": 0.9191041635352473, + "flos": 23448012606720.0, + "grad_norm": 1.604320036693306, + "language_loss": 0.7369895, + "learning_rate": 6.820715994405945e-08, + "loss": 0.81362337, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.0970459, + "step": 15287, + "time_per_iteration": 2.541874885559082 + }, + { + "auxiliary_loss_clip": 0.06403069, + "auxiliary_loss_mlp": 0.01265047, + "balance_loss_clip": 0.06271331, + "balance_loss_mlp": 0.01254968, + "epoch": 0.9191642867879152, + "flos": 18813581533440.0, + "grad_norm": 1.9153203073753247, + "language_loss": 0.65538442, + "learning_rate": 6.810635373906226e-08, + "loss": 0.73206556, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10070801, + "step": 15288, + "time_per_iteration": 2.48822021484375 + }, + { + "auxiliary_loss_clip": 0.06402881, + "auxiliary_loss_mlp": 0.01264206, + "balance_loss_clip": 0.06272922, + "balance_loss_mlp": 0.01254985, + "epoch": 0.9192244100405832, + "flos": 32169170540160.0, + "grad_norm": 2.0269640241218303, + "language_loss": 0.71110076, + "learning_rate": 6.800562079166549e-08, + "loss": 0.78777158, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09228516, + "step": 15289, + "time_per_iteration": 2.617255926132202 + }, + { + "auxiliary_loss_clip": 0.06402991, + "auxiliary_loss_mlp": 0.01265573, + "balance_loss_clip": 0.06271317, + "balance_loss_mlp": 0.01255768, + "epoch": 0.9192845332932512, + "flos": 16362420445440.0, + "grad_norm": 1.8310281360833698, + "language_loss": 0.74637043, + "learning_rate": 6.790496110568921e-08, + "loss": 0.82305604, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09802246, + "step": 15290, + "time_per_iteration": 2.478506088256836 + }, + { + "auxiliary_loss_clip": 0.06398199, + "auxiliary_loss_mlp": 0.01262641, + "balance_loss_clip": 0.06270142, + "balance_loss_mlp": 0.01253968, + "epoch": 0.9193446565459191, + "flos": 26621661525120.0, + "grad_norm": 1.9398963623899734, + "language_loss": 0.719679, + "learning_rate": 6.78043746849506e-08, + "loss": 0.79628742, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08666992, + "step": 15291, + "time_per_iteration": 2.5524001121520996 + }, + { + "auxiliary_loss_clip": 0.06399632, + "auxiliary_loss_mlp": 0.01267484, + "balance_loss_clip": 0.06270288, + "balance_loss_mlp": 0.01258168, + "epoch": 0.9194047797985871, + "flos": 22498778828160.0, + "grad_norm": 1.543404805290079, + "language_loss": 0.71005565, + "learning_rate": 6.770386153326346e-08, + "loss": 0.78672683, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09313965, + "step": 15292, + "time_per_iteration": 2.6065571308135986 + }, + { + "auxiliary_loss_clip": 0.06402849, + "auxiliary_loss_mlp": 0.01263278, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01253068, + "epoch": 0.9194649030512551, + "flos": 25085171606400.0, + "grad_norm": 1.8067565930105831, + "language_loss": 0.73275411, + "learning_rate": 6.760342165443988e-08, + "loss": 0.80941534, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10205078, + "step": 15293, + "time_per_iteration": 2.61039662361145 + }, + { + "auxiliary_loss_clip": 0.06400138, + "auxiliary_loss_mlp": 0.01265567, + "balance_loss_clip": 0.0627121, + "balance_loss_mlp": 0.01256453, + "epoch": 0.9195250263039231, + "flos": 11915938080000.0, + "grad_norm": 1.8020463710370824, + "language_loss": 0.78330243, + "learning_rate": 6.750305505228837e-08, + "loss": 0.85995948, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09100342, + "step": 15294, + "time_per_iteration": 2.493028163909912 + }, + { + "auxiliary_loss_clip": 0.0640836, + "auxiliary_loss_mlp": 0.01268598, + "balance_loss_clip": 0.06273803, + "balance_loss_mlp": 0.01257929, + "epoch": 0.919585149556591, + "flos": 21840426898560.0, + "grad_norm": 1.44776982902165, + "language_loss": 0.77154565, + "learning_rate": 6.74027617306141e-08, + "loss": 0.84831524, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10662842, + "step": 15295, + "time_per_iteration": 2.553980588912964 + }, + { + "auxiliary_loss_clip": 0.06398003, + "auxiliary_loss_mlp": 0.01267619, + "balance_loss_clip": 0.062723, + "balance_loss_mlp": 0.01259066, + "epoch": 0.919645272809259, + "flos": 28191623950080.0, + "grad_norm": 3.7930778156513245, + "language_loss": 0.71295464, + "learning_rate": 6.730254169322114e-08, + "loss": 0.78961086, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08563232, + "step": 15296, + "time_per_iteration": 2.5601587295532227 + }, + { + "auxiliary_loss_clip": 0.06399599, + "auxiliary_loss_mlp": 0.0126506, + "balance_loss_clip": 0.06269962, + "balance_loss_mlp": 0.01255178, + "epoch": 0.9197053960619269, + "flos": 18338734045440.0, + "grad_norm": 1.87886497767534, + "language_loss": 0.75809079, + "learning_rate": 6.720239494390912e-08, + "loss": 0.83473742, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09881592, + "step": 15297, + "time_per_iteration": 2.5021512508392334 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01268368, + "balance_loss_clip": 0.06269927, + "balance_loss_mlp": 0.01259064, + "epoch": 0.9197655193145949, + "flos": 28190911190400.0, + "grad_norm": 1.8177051823695647, + "language_loss": 0.73887002, + "learning_rate": 6.710232148647676e-08, + "loss": 0.81556177, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09307861, + "step": 15298, + "time_per_iteration": 3.9610788822174072 + }, + { + "auxiliary_loss_clip": 0.06405745, + "auxiliary_loss_mlp": 0.01265367, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.0125527, + "epoch": 0.9198256425672628, + "flos": 17311234953600.0, + "grad_norm": 1.9682637509338687, + "language_loss": 0.79818356, + "learning_rate": 6.70023213247175e-08, + "loss": 0.87489468, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10089111, + "step": 15299, + "time_per_iteration": 2.5930144786834717 + }, + { + "auxiliary_loss_clip": 0.06398566, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.06269201, + "balance_loss_mlp": 0.01253994, + "epoch": 0.9198857658199309, + "flos": 17864347754880.0, + "grad_norm": 2.0170678317240185, + "language_loss": 0.63947648, + "learning_rate": 6.690239446242385e-08, + "loss": 0.71609354, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.0914917, + "step": 15300, + "time_per_iteration": 2.491405487060547 + }, + { + "auxiliary_loss_clip": 0.06394336, + "auxiliary_loss_mlp": 0.01263452, + "balance_loss_clip": 0.06269766, + "balance_loss_mlp": 0.01255459, + "epoch": 0.9199458890725988, + "flos": 22134117859200.0, + "grad_norm": 1.6376619653433249, + "language_loss": 0.69386828, + "learning_rate": 6.680254090338545e-08, + "loss": 0.77044618, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.07989502, + "step": 15301, + "time_per_iteration": 2.517106056213379 + }, + { + "auxiliary_loss_clip": 0.06403923, + "auxiliary_loss_mlp": 0.01263436, + "balance_loss_clip": 0.06270855, + "balance_loss_mlp": 0.01253088, + "epoch": 0.9200060123252668, + "flos": 16039533536640.0, + "grad_norm": 1.8442828866072565, + "language_loss": 0.71317738, + "learning_rate": 6.670276065138814e-08, + "loss": 0.78985095, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10351562, + "step": 15302, + "time_per_iteration": 2.4811885356903076 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.0627024, + "balance_loss_mlp": 0.0125375, + "epoch": 0.9200661355779348, + "flos": 26870853168000.0, + "grad_norm": 1.597458857738985, + "language_loss": 0.76678693, + "learning_rate": 6.660305371021579e-08, + "loss": 0.84345514, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09613037, + "step": 15303, + "time_per_iteration": 2.548341989517212 + }, + { + "auxiliary_loss_clip": 0.06402119, + "auxiliary_loss_mlp": 0.01266402, + "balance_loss_clip": 0.06271823, + "balance_loss_mlp": 0.01257068, + "epoch": 0.9201262588306027, + "flos": 12791686227840.0, + "grad_norm": 2.0394625643099435, + "language_loss": 0.87783742, + "learning_rate": 6.650342008365006e-08, + "loss": 0.95452261, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09332275, + "step": 15304, + "time_per_iteration": 2.4814488887786865 + }, + { + "auxiliary_loss_clip": 0.0641056, + "auxiliary_loss_mlp": 0.01268156, + "balance_loss_clip": 0.06273954, + "balance_loss_mlp": 0.01256724, + "epoch": 0.9201863820832707, + "flos": 20637934554240.0, + "grad_norm": 1.7672455563097456, + "language_loss": 0.77882159, + "learning_rate": 6.64038597754677e-08, + "loss": 0.85560876, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.11413574, + "step": 15305, + "time_per_iteration": 2.5235755443573 + }, + { + "auxiliary_loss_clip": 0.06401099, + "auxiliary_loss_mlp": 0.0126516, + "balance_loss_clip": 0.06268904, + "balance_loss_mlp": 0.01255975, + "epoch": 0.9202465053359387, + "flos": 26403007495680.0, + "grad_norm": 2.2842473577556497, + "language_loss": 0.81661773, + "learning_rate": 6.630437278944501e-08, + "loss": 0.89328027, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09179688, + "step": 15306, + "time_per_iteration": 3.9354968070983887 + }, + { + "auxiliary_loss_clip": 0.06398699, + "auxiliary_loss_mlp": 0.01265097, + "balance_loss_clip": 0.06270522, + "balance_loss_mlp": 0.01256305, + "epoch": 0.9203066285886067, + "flos": 10492737281280.0, + "grad_norm": 1.8746939053209624, + "language_loss": 0.72304678, + "learning_rate": 6.62049591293541e-08, + "loss": 0.79968476, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08789062, + "step": 15307, + "time_per_iteration": 2.4903953075408936 + }, + { + "auxiliary_loss_clip": 0.06403868, + "auxiliary_loss_mlp": 0.01262191, + "balance_loss_clip": 0.06270027, + "balance_loss_mlp": 0.01252726, + "epoch": 0.9203667518412746, + "flos": 19396770750720.0, + "grad_norm": 1.8214262025870762, + "language_loss": 0.786762, + "learning_rate": 6.610561879896526e-08, + "loss": 0.86342263, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09460449, + "step": 15308, + "time_per_iteration": 2.4916763305664062 + }, + { + "auxiliary_loss_clip": 0.06398311, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_clip": 0.06267833, + "balance_loss_mlp": 0.01252425, + "epoch": 0.9204268750939426, + "flos": 15930520974720.0, + "grad_norm": 1.810335481306463, + "language_loss": 0.77935588, + "learning_rate": 6.600635180204484e-08, + "loss": 0.85595536, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09210205, + "step": 15309, + "time_per_iteration": 2.4542508125305176 + }, + { + "auxiliary_loss_clip": 0.06400943, + "auxiliary_loss_mlp": 0.01261859, + "balance_loss_clip": 0.0626944, + "balance_loss_mlp": 0.01252847, + "epoch": 0.9204869983466105, + "flos": 16477302792960.0, + "grad_norm": 1.7421035243048335, + "language_loss": 0.66452754, + "learning_rate": 6.590715814235781e-08, + "loss": 0.74115556, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09011841, + "step": 15310, + "time_per_iteration": 2.4991562366485596 + }, + { + "auxiliary_loss_clip": 0.06399545, + "auxiliary_loss_mlp": 0.01263459, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01253803, + "epoch": 0.9205471215992785, + "flos": 21544933075200.0, + "grad_norm": 1.6637113509144883, + "language_loss": 0.66279554, + "learning_rate": 6.580803782366495e-08, + "loss": 0.73942566, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09649658, + "step": 15311, + "time_per_iteration": 2.4965457916259766 + }, + { + "auxiliary_loss_clip": 0.0639765, + "auxiliary_loss_mlp": 0.01265166, + "balance_loss_clip": 0.06265511, + "balance_loss_mlp": 0.01255432, + "epoch": 0.9206072448519464, + "flos": 25012272954240.0, + "grad_norm": 1.8269618240158574, + "language_loss": 0.76250952, + "learning_rate": 6.570899084972503e-08, + "loss": 0.83913767, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09735107, + "step": 15312, + "time_per_iteration": 3.9788658618927 + }, + { + "auxiliary_loss_clip": 0.06397028, + "auxiliary_loss_mlp": 0.01268151, + "balance_loss_clip": 0.06270073, + "balance_loss_mlp": 0.01259199, + "epoch": 0.9206673681046145, + "flos": 20529047773440.0, + "grad_norm": 1.6388491370190603, + "language_loss": 0.79423517, + "learning_rate": 6.561001722429394e-08, + "loss": 0.87088692, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08959961, + "step": 15313, + "time_per_iteration": 2.4897162914276123 + }, + { + "auxiliary_loss_clip": 0.06402104, + "auxiliary_loss_mlp": 0.01262155, + "balance_loss_clip": 0.06269892, + "balance_loss_mlp": 0.01252368, + "epoch": 0.9207274913572824, + "flos": 20889222549120.0, + "grad_norm": 1.670329128161987, + "language_loss": 0.78675765, + "learning_rate": 6.55111169511251e-08, + "loss": 0.86340022, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09790039, + "step": 15314, + "time_per_iteration": 3.9323294162750244 + }, + { + "auxiliary_loss_clip": 0.06409357, + "auxiliary_loss_mlp": 0.01266101, + "balance_loss_clip": 0.06271656, + "balance_loss_mlp": 0.01255509, + "epoch": 0.9207876146099504, + "flos": 22714414110720.0, + "grad_norm": 1.7791309268152706, + "language_loss": 0.79277146, + "learning_rate": 6.541229003396864e-08, + "loss": 0.86952603, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10601807, + "step": 15315, + "time_per_iteration": 2.5845134258270264 + }, + { + "auxiliary_loss_clip": 0.06407665, + "auxiliary_loss_mlp": 0.01267885, + "balance_loss_clip": 0.06270912, + "balance_loss_mlp": 0.01257508, + "epoch": 0.9208477378626184, + "flos": 18511966362240.0, + "grad_norm": 1.9500495947335497, + "language_loss": 0.76453424, + "learning_rate": 6.531353647657156e-08, + "loss": 0.84128976, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.1038208, + "step": 15316, + "time_per_iteration": 2.47459077835083 + }, + { + "auxiliary_loss_clip": 0.0640117, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06267554, + "balance_loss_mlp": 0.01254216, + "epoch": 0.9209078611152863, + "flos": 23005757157120.0, + "grad_norm": 1.5768988455786053, + "language_loss": 0.69479769, + "learning_rate": 6.521485628267931e-08, + "loss": 0.77144837, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09661865, + "step": 15317, + "time_per_iteration": 2.527420997619629 + }, + { + "auxiliary_loss_clip": 0.06401445, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 0.062697, + "balance_loss_mlp": 0.01255824, + "epoch": 0.9209679843679544, + "flos": 24068447763840.0, + "grad_norm": 1.5969618693252037, + "language_loss": 0.8386265, + "learning_rate": 6.511624945603378e-08, + "loss": 0.91529447, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09533691, + "step": 15318, + "time_per_iteration": 2.5386664867401123 + }, + { + "auxiliary_loss_clip": 0.06403956, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06273109, + "balance_loss_mlp": 0.01254422, + "epoch": 0.9210281076206223, + "flos": 13558505667840.0, + "grad_norm": 1.7973020316666544, + "language_loss": 0.85918063, + "learning_rate": 6.501771600037354e-08, + "loss": 0.93585461, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09020996, + "step": 15319, + "time_per_iteration": 2.5289907455444336 + }, + { + "auxiliary_loss_clip": 0.06306411, + "auxiliary_loss_mlp": 0.01248044, + "balance_loss_clip": 0.06252417, + "balance_loss_mlp": 0.01247074, + "epoch": 0.9210882308732903, + "flos": 71448292851840.0, + "grad_norm": 0.7592752330183857, + "language_loss": 0.56235629, + "learning_rate": 6.491925591943559e-08, + "loss": 0.63790083, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00968933, + "step": 15320, + "time_per_iteration": 3.1707842350006104 + }, + { + "auxiliary_loss_clip": 0.06406188, + "auxiliary_loss_mlp": 0.0126704, + "balance_loss_clip": 0.06270667, + "balance_loss_mlp": 0.0125655, + "epoch": 0.9211483541259582, + "flos": 18514020787200.0, + "grad_norm": 2.407910490278205, + "language_loss": 0.6486662, + "learning_rate": 6.482086921695384e-08, + "loss": 0.72539854, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10491943, + "step": 15321, + "time_per_iteration": 2.503638505935669 + }, + { + "auxiliary_loss_clip": 0.06396494, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 0.06272007, + "balance_loss_mlp": 0.01255263, + "epoch": 0.9212084773786262, + "flos": 23264927435520.0, + "grad_norm": 1.5551004297855493, + "language_loss": 0.71829319, + "learning_rate": 6.47225558966582e-08, + "loss": 0.79489976, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08901978, + "step": 15322, + "time_per_iteration": 2.5333313941955566 + }, + { + "auxiliary_loss_clip": 0.06396886, + "auxiliary_loss_mlp": 0.01266738, + "balance_loss_clip": 0.06267932, + "balance_loss_mlp": 0.01257655, + "epoch": 0.9212686006312941, + "flos": 16295056162560.0, + "grad_norm": 1.6480851550140987, + "language_loss": 0.69842833, + "learning_rate": 6.462431596227725e-08, + "loss": 0.77506459, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09088135, + "step": 15323, + "time_per_iteration": 2.4778027534484863 + }, + { + "auxiliary_loss_clip": 0.06403235, + "auxiliary_loss_mlp": 0.01267314, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01256948, + "epoch": 0.9213287238839621, + "flos": 19790837303040.0, + "grad_norm": 1.9637834340414146, + "language_loss": 0.74995911, + "learning_rate": 6.452614941753597e-08, + "loss": 0.82666463, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.1036377, + "step": 15324, + "time_per_iteration": 2.488264322280884 + }, + { + "auxiliary_loss_clip": 0.06405512, + "auxiliary_loss_mlp": 0.01267457, + "balance_loss_clip": 0.06274214, + "balance_loss_mlp": 0.01257915, + "epoch": 0.92138884713663, + "flos": 21036361518720.0, + "grad_norm": 2.1445778052802327, + "language_loss": 0.71659297, + "learning_rate": 6.442805626615744e-08, + "loss": 0.79332268, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09539795, + "step": 15325, + "time_per_iteration": 2.496718406677246 + }, + { + "auxiliary_loss_clip": 0.06398599, + "auxiliary_loss_mlp": 0.01262694, + "balance_loss_clip": 0.06267601, + "balance_loss_mlp": 0.0125333, + "epoch": 0.9214489703892981, + "flos": 28595207940480.0, + "grad_norm": 1.4431088490493214, + "language_loss": 0.78559232, + "learning_rate": 6.433003651186109e-08, + "loss": 0.86220527, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09356689, + "step": 15326, + "time_per_iteration": 2.569300889968872 + }, + { + "auxiliary_loss_clip": 0.06409511, + "auxiliary_loss_mlp": 0.01267744, + "balance_loss_clip": 0.06275136, + "balance_loss_mlp": 0.01257391, + "epoch": 0.921509093641966, + "flos": 16366864711680.0, + "grad_norm": 3.0252741922568465, + "language_loss": 0.71586484, + "learning_rate": 6.42320901583635e-08, + "loss": 0.79263741, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10345459, + "step": 15327, + "time_per_iteration": 2.4783525466918945 + }, + { + "auxiliary_loss_clip": 0.0640553, + "auxiliary_loss_mlp": 0.01265754, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01255627, + "epoch": 0.921569216894634, + "flos": 26837632224000.0, + "grad_norm": 1.6779125016260046, + "language_loss": 0.78150362, + "learning_rate": 6.413421720937906e-08, + "loss": 0.85821646, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10131836, + "step": 15328, + "time_per_iteration": 2.540372371673584 + }, + { + "auxiliary_loss_clip": 0.06400491, + "auxiliary_loss_mlp": 0.01261732, + "balance_loss_clip": 0.06271934, + "balance_loss_mlp": 0.01253054, + "epoch": 0.921629340147302, + "flos": 24652140105600.0, + "grad_norm": 2.2635066688956957, + "language_loss": 0.71408528, + "learning_rate": 6.4036417668619e-08, + "loss": 0.79070753, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08679199, + "step": 15329, + "time_per_iteration": 2.533205986022949 + }, + { + "auxiliary_loss_clip": 0.06399167, + "auxiliary_loss_mlp": 0.01261949, + "balance_loss_clip": 0.06268907, + "balance_loss_mlp": 0.01253318, + "epoch": 0.9216894633999699, + "flos": 15092018766720.0, + "grad_norm": 1.8806450993945985, + "language_loss": 0.86950338, + "learning_rate": 6.393869153979192e-08, + "loss": 0.94611454, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08630371, + "step": 15330, + "time_per_iteration": 2.4801652431488037 + }, + { + "auxiliary_loss_clip": 0.06404316, + "auxiliary_loss_mlp": 0.01264793, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.0125512, + "epoch": 0.921749586652638, + "flos": 19209912145920.0, + "grad_norm": 2.21823378133338, + "language_loss": 0.76192427, + "learning_rate": 6.384103882660397e-08, + "loss": 0.83861536, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09674072, + "step": 15331, + "time_per_iteration": 2.484335422515869 + }, + { + "auxiliary_loss_clip": 0.0640348, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01254621, + "epoch": 0.9218097099053059, + "flos": 20528796211200.0, + "grad_norm": 1.4680320475819244, + "language_loss": 0.75768459, + "learning_rate": 6.374345953275794e-08, + "loss": 0.83436108, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09552002, + "step": 15332, + "time_per_iteration": 2.5472254753112793 + }, + { + "auxiliary_loss_clip": 0.06399745, + "auxiliary_loss_mlp": 0.012679, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01259242, + "epoch": 0.9218698331579739, + "flos": 17354518387200.0, + "grad_norm": 1.6404932332375755, + "language_loss": 0.7481606, + "learning_rate": 6.364595366195358e-08, + "loss": 0.82483709, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08657837, + "step": 15333, + "time_per_iteration": 2.5107102394104004 + }, + { + "auxiliary_loss_clip": 0.06310606, + "auxiliary_loss_mlp": 0.01248711, + "balance_loss_clip": 0.0625622, + "balance_loss_mlp": 0.01247726, + "epoch": 0.9219299564106418, + "flos": 61975717430400.0, + "grad_norm": 0.7759353424239996, + "language_loss": 0.52887559, + "learning_rate": 6.354852121788879e-08, + "loss": 0.60446876, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00983429, + "step": 15334, + "time_per_iteration": 3.109227180480957 + }, + { + "auxiliary_loss_clip": 0.06396239, + "auxiliary_loss_mlp": 0.01262699, + "balance_loss_clip": 0.06269791, + "balance_loss_mlp": 0.01254223, + "epoch": 0.9219900796633098, + "flos": 15706542211200.0, + "grad_norm": 1.7785905559381385, + "language_loss": 0.62691534, + "learning_rate": 6.345116220425839e-08, + "loss": 0.70350474, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.0847168, + "step": 15335, + "time_per_iteration": 2.5022456645965576 + }, + { + "auxiliary_loss_clip": 0.06401903, + "auxiliary_loss_mlp": 0.01266885, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.01257539, + "epoch": 0.9220502029159777, + "flos": 24938996958720.0, + "grad_norm": 1.5764942536870223, + "language_loss": 0.71558487, + "learning_rate": 6.335387662475366e-08, + "loss": 0.79227275, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09338379, + "step": 15336, + "time_per_iteration": 2.5597825050354004 + }, + { + "auxiliary_loss_clip": 0.06400605, + "auxiliary_loss_mlp": 0.01263441, + "balance_loss_clip": 0.06271902, + "balance_loss_mlp": 0.01254894, + "epoch": 0.9221103261686457, + "flos": 15672315018240.0, + "grad_norm": 1.803852700991986, + "language_loss": 0.72009486, + "learning_rate": 6.325666448306433e-08, + "loss": 0.79673529, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08544922, + "step": 15337, + "time_per_iteration": 3.9219424724578857 + }, + { + "auxiliary_loss_clip": 0.06308219, + "auxiliary_loss_mlp": 0.01248795, + "balance_loss_clip": 0.06254087, + "balance_loss_mlp": 0.0124781, + "epoch": 0.9221704494213137, + "flos": 67536643098240.0, + "grad_norm": 0.8647733027794, + "language_loss": 0.65245771, + "learning_rate": 6.31595257828763e-08, + "loss": 0.72802794, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00984955, + "step": 15338, + "time_per_iteration": 3.142150640487671 + }, + { + "auxiliary_loss_clip": 0.06404249, + "auxiliary_loss_mlp": 0.01264427, + "balance_loss_clip": 0.06273044, + "balance_loss_mlp": 0.01255236, + "epoch": 0.9222305726739817, + "flos": 30234798708480.0, + "grad_norm": 1.6484364978205361, + "language_loss": 0.67409325, + "learning_rate": 6.306246052787289e-08, + "loss": 0.75077999, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09191895, + "step": 15339, + "time_per_iteration": 2.593411684036255 + }, + { + "auxiliary_loss_clip": 0.06400622, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06269693, + "balance_loss_mlp": 0.01254399, + "epoch": 0.9222906959266496, + "flos": 25344132249600.0, + "grad_norm": 1.7385628862396276, + "language_loss": 0.71863818, + "learning_rate": 6.296546872173513e-08, + "loss": 0.79527897, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09051514, + "step": 15340, + "time_per_iteration": 2.5827271938323975 + }, + { + "auxiliary_loss_clip": 0.064013, + "auxiliary_loss_mlp": 0.01266685, + "balance_loss_clip": 0.06271731, + "balance_loss_mlp": 0.01257506, + "epoch": 0.9223508191793176, + "flos": 27607260775680.0, + "grad_norm": 1.4559470665197816, + "language_loss": 0.70787621, + "learning_rate": 6.286855036814098e-08, + "loss": 0.78455609, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09179688, + "step": 15341, + "time_per_iteration": 2.69647479057312 + }, + { + "auxiliary_loss_clip": 0.06392725, + "auxiliary_loss_mlp": 0.01263032, + "balance_loss_clip": 0.06267273, + "balance_loss_mlp": 0.01254956, + "epoch": 0.9224109424319856, + "flos": 27314869553280.0, + "grad_norm": 1.5381458649062534, + "language_loss": 0.67303658, + "learning_rate": 6.277170547076571e-08, + "loss": 0.74959409, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08068848, + "step": 15342, + "time_per_iteration": 2.588177442550659 + }, + { + "auxiliary_loss_clip": 0.06401019, + "auxiliary_loss_mlp": 0.01262683, + "balance_loss_clip": 0.06269694, + "balance_loss_mlp": 0.01253474, + "epoch": 0.9224710656846535, + "flos": 48218152389120.0, + "grad_norm": 2.052024165680001, + "language_loss": 0.69629633, + "learning_rate": 6.26749340332815e-08, + "loss": 0.7729333, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09210205, + "step": 15343, + "time_per_iteration": 2.7706665992736816 + }, + { + "auxiliary_loss_clip": 0.063094, + "auxiliary_loss_mlp": 0.01249689, + "balance_loss_clip": 0.06255051, + "balance_loss_mlp": 0.01248708, + "epoch": 0.9225311889373216, + "flos": 66743814165120.0, + "grad_norm": 0.8019643704800373, + "language_loss": 0.51885521, + "learning_rate": 6.257823605935786e-08, + "loss": 0.59444606, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.00980377, + "step": 15344, + "time_per_iteration": 3.30328631401062 + }, + { + "auxiliary_loss_clip": 0.06392275, + "auxiliary_loss_mlp": 0.01264218, + "balance_loss_clip": 0.06267268, + "balance_loss_mlp": 0.01255981, + "epoch": 0.9225913121899895, + "flos": 22277525322240.0, + "grad_norm": 1.572478644220583, + "language_loss": 0.70385808, + "learning_rate": 6.248161155266162e-08, + "loss": 0.78042299, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.0824585, + "step": 15345, + "time_per_iteration": 3.937687397003174 + }, + { + "auxiliary_loss_clip": 0.06402814, + "auxiliary_loss_mlp": 0.01267303, + "balance_loss_clip": 0.06271317, + "balance_loss_mlp": 0.01257075, + "epoch": 0.9226514354426575, + "flos": 20088679040640.0, + "grad_norm": 2.342779825818367, + "language_loss": 0.77456373, + "learning_rate": 6.238506051685677e-08, + "loss": 0.85126495, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10223389, + "step": 15346, + "time_per_iteration": 2.569237232208252 + }, + { + "auxiliary_loss_clip": 0.0640711, + "auxiliary_loss_mlp": 0.01270235, + "balance_loss_clip": 0.06270187, + "balance_loss_mlp": 0.01259381, + "epoch": 0.9227115586953254, + "flos": 16076402133120.0, + "grad_norm": 6.129129283291578, + "language_loss": 0.76381576, + "learning_rate": 6.228858295560457e-08, + "loss": 0.84058923, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10845947, + "step": 15347, + "time_per_iteration": 2.54581618309021 + }, + { + "auxiliary_loss_clip": 0.06394706, + "auxiliary_loss_mlp": 0.01264003, + "balance_loss_clip": 0.06268799, + "balance_loss_mlp": 0.01255444, + "epoch": 0.9227716819479934, + "flos": 20451788709120.0, + "grad_norm": 1.6400744592090153, + "language_loss": 0.76745045, + "learning_rate": 6.219217887256367e-08, + "loss": 0.84403753, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08563232, + "step": 15348, + "time_per_iteration": 2.590552806854248 + }, + { + "auxiliary_loss_clip": 0.0640482, + "auxiliary_loss_mlp": 0.01263248, + "balance_loss_clip": 0.06270683, + "balance_loss_mlp": 0.01253097, + "epoch": 0.9228318052006613, + "flos": 25014033889920.0, + "grad_norm": 1.7903050543327186, + "language_loss": 0.68388069, + "learning_rate": 6.209584827138959e-08, + "loss": 0.76056135, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10150146, + "step": 15349, + "time_per_iteration": 2.5478007793426514 + }, + { + "auxiliary_loss_clip": 0.0640128, + "auxiliary_loss_mlp": 0.01262275, + "balance_loss_clip": 0.06269504, + "balance_loss_mlp": 0.01253227, + "epoch": 0.9228919284533293, + "flos": 12682170541440.0, + "grad_norm": 2.1989132821719948, + "language_loss": 0.87228858, + "learning_rate": 6.199959115573495e-08, + "loss": 0.94892418, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09051514, + "step": 15350, + "time_per_iteration": 2.5094597339630127 + }, + { + "auxiliary_loss_clip": 0.06312097, + "auxiliary_loss_mlp": 0.01249183, + "balance_loss_clip": 0.06257882, + "balance_loss_mlp": 0.01248158, + "epoch": 0.9229520517059973, + "flos": 70005050928000.0, + "grad_norm": 0.7490449092962135, + "language_loss": 0.60287833, + "learning_rate": 6.190340752924994e-08, + "loss": 0.67849118, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01025391, + "step": 15351, + "time_per_iteration": 3.092261791229248 + }, + { + "auxiliary_loss_clip": 0.06403425, + "auxiliary_loss_mlp": 0.01263289, + "balance_loss_clip": 0.06269398, + "balance_loss_mlp": 0.01253901, + "epoch": 0.9230121749586653, + "flos": 14799166346880.0, + "grad_norm": 2.2204736454747493, + "language_loss": 0.77420902, + "learning_rate": 6.180729739558233e-08, + "loss": 0.85087621, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09393311, + "step": 15352, + "time_per_iteration": 3.931007146835327 + }, + { + "auxiliary_loss_clip": 0.064078, + "auxiliary_loss_mlp": 0.01262825, + "balance_loss_clip": 0.0627102, + "balance_loss_mlp": 0.01252758, + "epoch": 0.9230722982113332, + "flos": 22974003659520.0, + "grad_norm": 2.303163162043219, + "language_loss": 0.5970825, + "learning_rate": 6.171126075837585e-08, + "loss": 0.67378873, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10070801, + "step": 15353, + "time_per_iteration": 2.5389790534973145 + }, + { + "auxiliary_loss_clip": 0.06398928, + "auxiliary_loss_mlp": 0.01262823, + "balance_loss_clip": 0.06270197, + "balance_loss_mlp": 0.01253939, + "epoch": 0.9231324214640012, + "flos": 18557346147840.0, + "grad_norm": 1.5949625436453003, + "language_loss": 0.74683791, + "learning_rate": 6.161529762127293e-08, + "loss": 0.82345545, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.08889771, + "step": 15354, + "time_per_iteration": 4.007373571395874 + }, + { + "auxiliary_loss_clip": 0.06408745, + "auxiliary_loss_mlp": 0.01265787, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.0125532, + "epoch": 0.9231925447166691, + "flos": 22087899532800.0, + "grad_norm": 1.884882701150637, + "language_loss": 0.65271533, + "learning_rate": 6.1519407987912e-08, + "loss": 0.72946066, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10473633, + "step": 15355, + "time_per_iteration": 2.5409066677093506 + }, + { + "auxiliary_loss_clip": 0.06397177, + "auxiliary_loss_mlp": 0.01263201, + "balance_loss_clip": 0.06271057, + "balance_loss_mlp": 0.01254028, + "epoch": 0.9232526679693371, + "flos": 26548259748480.0, + "grad_norm": 1.546790587862242, + "language_loss": 0.74723232, + "learning_rate": 6.142359186192947e-08, + "loss": 0.82383615, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09179688, + "step": 15356, + "time_per_iteration": 2.5545573234558105 + }, + { + "auxiliary_loss_clip": 0.06402813, + "auxiliary_loss_mlp": 0.01264241, + "balance_loss_clip": 0.0627003, + "balance_loss_mlp": 0.01254603, + "epoch": 0.9233127912220052, + "flos": 14761878480000.0, + "grad_norm": 1.6173539213907528, + "language_loss": 0.60903341, + "learning_rate": 6.132784924695844e-08, + "loss": 0.68570393, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09637451, + "step": 15357, + "time_per_iteration": 2.479755163192749 + }, + { + "auxiliary_loss_clip": 0.06403501, + "auxiliary_loss_mlp": 0.01264325, + "balance_loss_clip": 0.0626936, + "balance_loss_mlp": 0.01254848, + "epoch": 0.9233729144746731, + "flos": 25268298704640.0, + "grad_norm": 1.3619838972501352, + "language_loss": 0.70080173, + "learning_rate": 6.123218014662956e-08, + "loss": 0.77747995, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0947876, + "step": 15358, + "time_per_iteration": 2.5597140789031982 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.01262902, + "balance_loss_clip": 0.06269094, + "balance_loss_mlp": 0.01254063, + "epoch": 0.9234330377273411, + "flos": 27856368564480.0, + "grad_norm": 1.796399091870678, + "language_loss": 0.73676997, + "learning_rate": 6.113658456457104e-08, + "loss": 0.81339008, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08837891, + "step": 15359, + "time_per_iteration": 2.582848072052002 + }, + { + "auxiliary_loss_clip": 0.06400593, + "auxiliary_loss_mlp": 0.01263199, + "balance_loss_clip": 0.06269514, + "balance_loss_mlp": 0.01253847, + "epoch": 0.923493160980009, + "flos": 24615313436160.0, + "grad_norm": 1.8173722037046873, + "language_loss": 0.65140021, + "learning_rate": 6.104106250440732e-08, + "loss": 0.72803813, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09356689, + "step": 15360, + "time_per_iteration": 2.5897343158721924 + }, + { + "auxiliary_loss_clip": 0.06310426, + "auxiliary_loss_mlp": 0.01250329, + "balance_loss_clip": 0.06256235, + "balance_loss_mlp": 0.0124932, + "epoch": 0.923553284232677, + "flos": 67721656913280.0, + "grad_norm": 0.7579229937332289, + "language_loss": 0.5489769, + "learning_rate": 6.094561396976083e-08, + "loss": 0.62458444, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01008606, + "step": 15361, + "time_per_iteration": 3.076972723007202 + }, + { + "auxiliary_loss_clip": 0.0640441, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06269506, + "balance_loss_mlp": 0.01252404, + "epoch": 0.9236134074853449, + "flos": 18813246117120.0, + "grad_norm": 1.9671802371462084, + "language_loss": 0.70403993, + "learning_rate": 6.085023896425112e-08, + "loss": 0.78071576, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10760498, + "step": 15362, + "time_per_iteration": 2.5362637042999268 + }, + { + "auxiliary_loss_clip": 0.06406496, + "auxiliary_loss_mlp": 0.0126344, + "balance_loss_clip": 0.06270804, + "balance_loss_mlp": 0.01253278, + "epoch": 0.923673530738013, + "flos": 27789800895360.0, + "grad_norm": 1.3407454971691222, + "language_loss": 0.75910234, + "learning_rate": 6.075493749149463e-08, + "loss": 0.83580172, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10162354, + "step": 15363, + "time_per_iteration": 2.552292585372925 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06272069, + "balance_loss_mlp": 0.01256027, + "epoch": 0.9237336539906809, + "flos": 26804369352960.0, + "grad_norm": 1.950831388344252, + "language_loss": 0.83409828, + "learning_rate": 6.065970955510514e-08, + "loss": 0.91079104, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09423828, + "step": 15364, + "time_per_iteration": 2.556971549987793 + }, + { + "auxiliary_loss_clip": 0.06398296, + "auxiliary_loss_mlp": 0.01265053, + "balance_loss_clip": 0.06268522, + "balance_loss_mlp": 0.01256631, + "epoch": 0.9237937772433489, + "flos": 23594648451840.0, + "grad_norm": 1.5023507773294924, + "language_loss": 0.68472719, + "learning_rate": 6.056455515869419e-08, + "loss": 0.7613607, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08422852, + "step": 15365, + "time_per_iteration": 2.525970935821533 + }, + { + "auxiliary_loss_clip": 0.06400183, + "auxiliary_loss_mlp": 0.01265116, + "balance_loss_clip": 0.06269205, + "balance_loss_mlp": 0.0125546, + "epoch": 0.9238539004960168, + "flos": 26147736432000.0, + "grad_norm": 2.741191058954429, + "language_loss": 0.62701088, + "learning_rate": 6.046947430586913e-08, + "loss": 0.70366389, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09661865, + "step": 15366, + "time_per_iteration": 2.571578025817871 + }, + { + "auxiliary_loss_clip": 0.06403293, + "auxiliary_loss_mlp": 0.01261168, + "balance_loss_clip": 0.06273372, + "balance_loss_mlp": 0.01251261, + "epoch": 0.9239140237486848, + "flos": 21074152510080.0, + "grad_norm": 1.7815327579173699, + "language_loss": 0.74507236, + "learning_rate": 6.037446700023619e-08, + "loss": 0.82171696, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09899902, + "step": 15367, + "time_per_iteration": 2.5045971870422363 + }, + { + "auxiliary_loss_clip": 0.06390847, + "auxiliary_loss_mlp": 0.01264934, + "balance_loss_clip": 0.06267439, + "balance_loss_mlp": 0.01255922, + "epoch": 0.9239741470013527, + "flos": 24614810311680.0, + "grad_norm": 1.8519512729741396, + "language_loss": 0.64742005, + "learning_rate": 6.027953324539759e-08, + "loss": 0.72397792, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.08996582, + "step": 15368, + "time_per_iteration": 2.544147491455078 + }, + { + "auxiliary_loss_clip": 0.06404577, + "auxiliary_loss_mlp": 0.01267709, + "balance_loss_clip": 0.06269414, + "balance_loss_mlp": 0.0125754, + "epoch": 0.9240342702540207, + "flos": 24725290320000.0, + "grad_norm": 1.790282394600615, + "language_loss": 0.74812615, + "learning_rate": 6.018467304495401e-08, + "loss": 0.82484901, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10168457, + "step": 15369, + "time_per_iteration": 2.524303913116455 + }, + { + "auxiliary_loss_clip": 0.06408086, + "auxiliary_loss_mlp": 0.01264607, + "balance_loss_clip": 0.06271143, + "balance_loss_mlp": 0.01253383, + "epoch": 0.9240943935066888, + "flos": 20856253167360.0, + "grad_norm": 1.8071530163307696, + "language_loss": 0.77047461, + "learning_rate": 6.008988640250145e-08, + "loss": 0.84720153, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.11230469, + "step": 15370, + "time_per_iteration": 2.513298988342285 + }, + { + "auxiliary_loss_clip": 0.06402336, + "auxiliary_loss_mlp": 0.01261917, + "balance_loss_clip": 0.0627064, + "balance_loss_mlp": 0.01252923, + "epoch": 0.9241545167593567, + "flos": 24469222642560.0, + "grad_norm": 2.0099399345355575, + "language_loss": 0.67316246, + "learning_rate": 5.999517332163528e-08, + "loss": 0.74980497, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08996582, + "step": 15371, + "time_per_iteration": 2.520193576812744 + }, + { + "auxiliary_loss_clip": 0.06306948, + "auxiliary_loss_mlp": 0.01251246, + "balance_loss_clip": 0.062529, + "balance_loss_mlp": 0.01250195, + "epoch": 0.9242146400120247, + "flos": 61847110212480.0, + "grad_norm": 0.7120628094396801, + "language_loss": 0.5773133, + "learning_rate": 5.99005338059464e-08, + "loss": 0.65289533, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01052094, + "step": 15372, + "time_per_iteration": 3.0978200435638428 + }, + { + "auxiliary_loss_clip": 0.06395283, + "auxiliary_loss_mlp": 0.01266989, + "balance_loss_clip": 0.06267901, + "balance_loss_mlp": 0.01258782, + "epoch": 0.9242747632646926, + "flos": 22053923902080.0, + "grad_norm": 1.7652087955090183, + "language_loss": 0.70249438, + "learning_rate": 5.98059678590237e-08, + "loss": 0.77911711, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08209229, + "step": 15373, + "time_per_iteration": 2.4996917247772217 + }, + { + "auxiliary_loss_clip": 0.06402817, + "auxiliary_loss_mlp": 0.01269313, + "balance_loss_clip": 0.06271312, + "balance_loss_mlp": 0.01259436, + "epoch": 0.9243348865173606, + "flos": 18484195933440.0, + "grad_norm": 2.6606321172292424, + "language_loss": 0.75800008, + "learning_rate": 5.971147548445299e-08, + "loss": 0.83472145, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09881592, + "step": 15374, + "time_per_iteration": 2.4819071292877197 + }, + { + "auxiliary_loss_clip": 0.06398623, + "auxiliary_loss_mlp": 0.01262613, + "balance_loss_clip": 0.06267889, + "balance_loss_mlp": 0.01253738, + "epoch": 0.9243950097700285, + "flos": 23265556341120.0, + "grad_norm": 1.5989491973910335, + "language_loss": 0.6470179, + "learning_rate": 5.961705668581784e-08, + "loss": 0.72363025, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08874512, + "step": 15375, + "time_per_iteration": 2.511228561401367 + }, + { + "auxiliary_loss_clip": 0.06398005, + "auxiliary_loss_mlp": 0.01260851, + "balance_loss_clip": 0.06269285, + "balance_loss_mlp": 0.01251189, + "epoch": 0.9244551330226966, + "flos": 29756261640960.0, + "grad_norm": 2.2942145440392028, + "language_loss": 0.66584778, + "learning_rate": 5.952271146669829e-08, + "loss": 0.74243629, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09655762, + "step": 15376, + "time_per_iteration": 2.5877747535705566 + }, + { + "auxiliary_loss_clip": 0.06310389, + "auxiliary_loss_mlp": 0.01248316, + "balance_loss_clip": 0.06256086, + "balance_loss_mlp": 0.01247218, + "epoch": 0.9245152562753645, + "flos": 68885310090240.0, + "grad_norm": 0.6448904976403038, + "language_loss": 0.61183542, + "learning_rate": 5.94284398306717e-08, + "loss": 0.68742251, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01100159, + "step": 15377, + "time_per_iteration": 4.591358184814453 + }, + { + "auxiliary_loss_clip": 0.06397624, + "auxiliary_loss_mlp": 0.01264112, + "balance_loss_clip": 0.06267756, + "balance_loss_mlp": 0.01254254, + "epoch": 0.9245753795280325, + "flos": 21585575105280.0, + "grad_norm": 1.6098683920154133, + "language_loss": 0.74425101, + "learning_rate": 5.933424178131341e-08, + "loss": 0.82086837, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09851074, + "step": 15378, + "time_per_iteration": 2.498936653137207 + }, + { + "auxiliary_loss_clip": 0.06402528, + "auxiliary_loss_mlp": 0.01263964, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01254314, + "epoch": 0.9246355027807004, + "flos": 34504694593920.0, + "grad_norm": 1.8895065800436894, + "language_loss": 0.62142766, + "learning_rate": 5.924011732219503e-08, + "loss": 0.69809258, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09655762, + "step": 15379, + "time_per_iteration": 2.6365721225738525 + }, + { + "auxiliary_loss_clip": 0.06397697, + "auxiliary_loss_mlp": 0.01264574, + "balance_loss_clip": 0.06270209, + "balance_loss_mlp": 0.01255764, + "epoch": 0.9246956260333684, + "flos": 15958123695360.0, + "grad_norm": 1.9053224282223191, + "language_loss": 0.83903706, + "learning_rate": 5.914606645688591e-08, + "loss": 0.91565973, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08813477, + "step": 15380, + "time_per_iteration": 2.4695920944213867 + }, + { + "auxiliary_loss_clip": 0.06402585, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.0626857, + "balance_loss_mlp": 0.01254715, + "epoch": 0.9247557492860363, + "flos": 23375197808640.0, + "grad_norm": 1.384509137636546, + "language_loss": 0.7339139, + "learning_rate": 5.905208918895233e-08, + "loss": 0.81059313, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10614014, + "step": 15381, + "time_per_iteration": 2.534614086151123 + }, + { + "auxiliary_loss_clip": 0.0640035, + "auxiliary_loss_mlp": 0.01262661, + "balance_loss_clip": 0.06271455, + "balance_loss_mlp": 0.01253595, + "epoch": 0.9248158725387043, + "flos": 23046608822400.0, + "grad_norm": 1.680142462272489, + "language_loss": 0.78818119, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.86481124, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09057617, + "step": 15382, + "time_per_iteration": 2.5061895847320557 + }, + { + "auxiliary_loss_clip": 0.06401303, + "auxiliary_loss_mlp": 0.01264469, + "balance_loss_clip": 0.06268425, + "balance_loss_mlp": 0.01254354, + "epoch": 0.9248759957913724, + "flos": 22527974776320.0, + "grad_norm": 1.7961295169638432, + "language_loss": 0.74988508, + "learning_rate": 5.886435545946455e-08, + "loss": 0.82654279, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10107422, + "step": 15383, + "time_per_iteration": 2.581434488296509 + }, + { + "auxiliary_loss_clip": 0.06396997, + "auxiliary_loss_mlp": 0.01261141, + "balance_loss_clip": 0.06268598, + "balance_loss_mlp": 0.01252499, + "epoch": 0.9249361190440403, + "flos": 25454318768640.0, + "grad_norm": 1.566333672745091, + "language_loss": 0.75798136, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.83456272, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08642578, + "step": 15384, + "time_per_iteration": 2.55129075050354 + }, + { + "auxiliary_loss_clip": 0.0639509, + "auxiliary_loss_mlp": 0.01261598, + "balance_loss_clip": 0.06269214, + "balance_loss_mlp": 0.01252306, + "epoch": 0.9249962422967083, + "flos": 12382358232960.0, + "grad_norm": 2.143877935574221, + "language_loss": 0.66191006, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.73847699, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.09289551, + "step": 15385, + "time_per_iteration": 3.918323516845703 + }, + { + "auxiliary_loss_clip": 0.0639942, + "auxiliary_loss_mlp": 0.01268229, + "balance_loss_clip": 0.06270313, + "balance_loss_mlp": 0.01259204, + "epoch": 0.9250563655493762, + "flos": 22936003032960.0, + "grad_norm": 1.8903454338190138, + "language_loss": 0.80601746, + "learning_rate": 5.85833069345496e-08, + "loss": 0.88269401, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.090271, + "step": 15386, + "time_per_iteration": 2.52738094329834 + }, + { + "auxiliary_loss_clip": 0.06399529, + "auxiliary_loss_mlp": 0.01263013, + "balance_loss_clip": 0.0627138, + "balance_loss_mlp": 0.01253727, + "epoch": 0.9251164888020442, + "flos": 18484573276800.0, + "grad_norm": 1.9057906513931537, + "language_loss": 0.75911927, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.83574468, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09283447, + "step": 15387, + "time_per_iteration": 2.479053020477295 + }, + { + "auxiliary_loss_clip": 0.06396089, + "auxiliary_loss_mlp": 0.01263451, + "balance_loss_clip": 0.06269023, + "balance_loss_mlp": 0.01254779, + "epoch": 0.9251766120547121, + "flos": 33045505666560.0, + "grad_norm": 1.2958399719477445, + "language_loss": 0.70158648, + "learning_rate": 5.839630933893014e-08, + "loss": 0.77818191, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08660889, + "step": 15388, + "time_per_iteration": 2.7240984439849854 + }, + { + "auxiliary_loss_clip": 0.06403159, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06270151, + "balance_loss_mlp": 0.01256702, + "epoch": 0.9252367353073802, + "flos": 24394563054720.0, + "grad_norm": 1.6728291040294425, + "language_loss": 0.81795633, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.89464867, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09368896, + "step": 15389, + "time_per_iteration": 2.5441529750823975 + }, + { + "auxiliary_loss_clip": 0.06410511, + "auxiliary_loss_mlp": 0.01266804, + "balance_loss_clip": 0.06269868, + "balance_loss_mlp": 0.01256213, + "epoch": 0.9252968585600481, + "flos": 18922887584640.0, + "grad_norm": 1.643054722636028, + "language_loss": 0.79540706, + "learning_rate": 5.820960624653381e-08, + "loss": 0.87218022, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.10577393, + "step": 15390, + "time_per_iteration": 3.9439857006073 + }, + { + "auxiliary_loss_clip": 0.06405532, + "auxiliary_loss_mlp": 0.01265289, + "balance_loss_clip": 0.06270221, + "balance_loss_mlp": 0.01255448, + "epoch": 0.9253569818127161, + "flos": 21731707825920.0, + "grad_norm": 1.8343388341488236, + "language_loss": 0.75466919, + "learning_rate": 5.811636514789597e-08, + "loss": 0.83137739, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09838867, + "step": 15391, + "time_per_iteration": 2.5190751552581787 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.0126542, + "balance_loss_clip": 0.06268418, + "balance_loss_mlp": 0.01255937, + "epoch": 0.925417105065384, + "flos": 34248878478720.0, + "grad_norm": 2.6134750174735615, + "language_loss": 0.52719831, + "learning_rate": 5.80231976856802e-08, + "loss": 0.60386336, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09490967, + "step": 15392, + "time_per_iteration": 2.618853807449341 + }, + { + "auxiliary_loss_clip": 0.06401975, + "auxiliary_loss_mlp": 0.01263312, + "balance_loss_clip": 0.06268699, + "balance_loss_mlp": 0.01254097, + "epoch": 0.925477228318052, + "flos": 25966915320960.0, + "grad_norm": 1.5816032710587289, + "language_loss": 0.7732839, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.84993678, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09210205, + "step": 15393, + "time_per_iteration": 2.537705659866333 + }, + { + "auxiliary_loss_clip": 0.06396216, + "auxiliary_loss_mlp": 0.01266005, + "balance_loss_clip": 0.06267455, + "balance_loss_mlp": 0.01256475, + "epoch": 0.9255373515707199, + "flos": 11843039427840.0, + "grad_norm": 1.996154441217668, + "language_loss": 0.69555247, + "learning_rate": 5.783708368464357e-08, + "loss": 0.77217472, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09539795, + "step": 15394, + "time_per_iteration": 3.9489758014678955 + }, + { + "auxiliary_loss_clip": 0.06405875, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06272207, + "balance_loss_mlp": 0.01254965, + "epoch": 0.925597474823388, + "flos": 21440784049920.0, + "grad_norm": 1.656460677506419, + "language_loss": 0.73046553, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.80717206, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0980835, + "step": 15395, + "time_per_iteration": 2.6102516651153564 + }, + { + "auxiliary_loss_clip": 0.06395631, + "auxiliary_loss_mlp": 0.01263537, + "balance_loss_clip": 0.06268463, + "balance_loss_mlp": 0.01254513, + "epoch": 0.925657598076056, + "flos": 22864320264960.0, + "grad_norm": 1.8614827496346085, + "language_loss": 0.71563172, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.79222345, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09020996, + "step": 15396, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.06398199, + "auxiliary_loss_mlp": 0.01264777, + "balance_loss_clip": 0.06268467, + "balance_loss_mlp": 0.01254763, + "epoch": 0.9257177213287239, + "flos": 25711350768000.0, + "grad_norm": 1.848315648403689, + "language_loss": 0.87198037, + "learning_rate": 5.755846504448603e-08, + "loss": 0.94861013, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.10009766, + "step": 15397, + "time_per_iteration": 2.54464054107666 + }, + { + "auxiliary_loss_clip": 0.06308962, + "auxiliary_loss_mlp": 0.01250606, + "balance_loss_clip": 0.06255, + "balance_loss_mlp": 0.01249661, + "epoch": 0.9257778445813919, + "flos": 59610955501440.0, + "grad_norm": 0.7882354200342199, + "language_loss": 0.55162835, + "learning_rate": 5.746573947489586e-08, + "loss": 0.62722397, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00942993, + "step": 15398, + "time_per_iteration": 2.9914557933807373 + }, + { + "auxiliary_loss_clip": 0.06410329, + "auxiliary_loss_mlp": 0.01264914, + "balance_loss_clip": 0.06272009, + "balance_loss_mlp": 0.01254346, + "epoch": 0.9258379678340598, + "flos": 27716860316160.0, + "grad_norm": 1.6589961349835687, + "language_loss": 0.76505327, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.8418057, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10571289, + "step": 15399, + "time_per_iteration": 2.5598769187927246 + }, + { + "auxiliary_loss_clip": 0.06392607, + "auxiliary_loss_mlp": 0.01261184, + "balance_loss_clip": 0.06267655, + "balance_loss_mlp": 0.01252952, + "epoch": 0.9258980910867278, + "flos": 24870500645760.0, + "grad_norm": 1.4116581037404592, + "language_loss": 0.78297949, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.85951746, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08227539, + "step": 15400, + "time_per_iteration": 2.5266971588134766 + }, + { + "auxiliary_loss_clip": 0.06312054, + "auxiliary_loss_mlp": 0.01255899, + "balance_loss_clip": 0.06257905, + "balance_loss_mlp": 0.01254794, + "epoch": 0.9259582143393957, + "flos": 63153625800960.0, + "grad_norm": 0.7063959708054426, + "language_loss": 0.51333666, + "learning_rate": 5.718800474673946e-08, + "loss": 0.5890162, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01106262, + "step": 15401, + "time_per_iteration": 3.093920946121216 + }, + { + "auxiliary_loss_clip": 0.0639642, + "auxiliary_loss_mlp": 0.01264437, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01255997, + "epoch": 0.9260183375920638, + "flos": 24132835226880.0, + "grad_norm": 1.5775889664181235, + "language_loss": 0.82458878, + "learning_rate": 5.709557384259378e-08, + "loss": 0.90119737, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.08447266, + "step": 15402, + "time_per_iteration": 2.5282785892486572 + }, + { + "auxiliary_loss_clip": 0.06307814, + "auxiliary_loss_mlp": 0.01254092, + "balance_loss_clip": 0.06253652, + "balance_loss_mlp": 0.01252993, + "epoch": 0.9260784608447317, + "flos": 63064863999360.0, + "grad_norm": 0.7161646458588573, + "language_loss": 0.51258361, + "learning_rate": 5.700321661357876e-08, + "loss": 0.58820271, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01101685, + "step": 15403, + "time_per_iteration": 3.221836566925049 + }, + { + "auxiliary_loss_clip": 0.0631336, + "auxiliary_loss_mlp": 0.01254044, + "balance_loss_clip": 0.06259177, + "balance_loss_mlp": 0.01253067, + "epoch": 0.9261385840973997, + "flos": 70607652364800.0, + "grad_norm": 0.6694714734059207, + "language_loss": 0.58772385, + "learning_rate": 5.69109330631965e-08, + "loss": 0.66339797, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00976562, + "step": 15404, + "time_per_iteration": 3.1927330493927 + }, + { + "auxiliary_loss_clip": 0.06401071, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01254673, + "epoch": 0.9261987073500676, + "flos": 20236111499520.0, + "grad_norm": 1.9818455249680897, + "language_loss": 0.71835959, + "learning_rate": 5.681872319494596e-08, + "loss": 0.79501259, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09558105, + "step": 15405, + "time_per_iteration": 2.499476432800293 + }, + { + "auxiliary_loss_clip": 0.06404161, + "auxiliary_loss_mlp": 0.01268691, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01259065, + "epoch": 0.9262588306027356, + "flos": 20959563651840.0, + "grad_norm": 1.7250744191621226, + "language_loss": 0.69170922, + "learning_rate": 5.672658701232458e-08, + "loss": 0.76843774, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09625244, + "step": 15406, + "time_per_iteration": 2.5540614128112793 + }, + { + "auxiliary_loss_clip": 0.0640143, + "auxiliary_loss_mlp": 0.01263229, + "balance_loss_clip": 0.06268954, + "balance_loss_mlp": 0.01253126, + "epoch": 0.9263189538554035, + "flos": 22164361983360.0, + "grad_norm": 2.1174818175534242, + "language_loss": 0.76692176, + "learning_rate": 5.663452451882555e-08, + "loss": 0.84356833, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10101318, + "step": 15407, + "time_per_iteration": 2.5082249641418457 + }, + { + "auxiliary_loss_clip": 0.06410325, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06271269, + "balance_loss_mlp": 0.0125613, + "epoch": 0.9263790771080715, + "flos": 18193146376320.0, + "grad_norm": 1.7688340349597225, + "language_loss": 0.72253478, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.79930449, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10516357, + "step": 15408, + "time_per_iteration": 2.5314793586730957 + }, + { + "auxiliary_loss_clip": 0.06398048, + "auxiliary_loss_mlp": 0.0126162, + "balance_loss_clip": 0.06270379, + "balance_loss_mlp": 0.01253442, + "epoch": 0.9264392003607396, + "flos": 48189501492480.0, + "grad_norm": 5.21505973276934, + "language_loss": 0.68691289, + "learning_rate": 5.645062061315675e-08, + "loss": 0.76350951, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08184814, + "step": 15409, + "time_per_iteration": 2.755697011947632 + }, + { + "auxiliary_loss_clip": 0.06404391, + "auxiliary_loss_mlp": 0.01267031, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.0125663, + "epoch": 0.9264993236134075, + "flos": 26395586409600.0, + "grad_norm": 1.7559130928965878, + "language_loss": 0.75985503, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.83656931, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10394287, + "step": 15410, + "time_per_iteration": 2.5520312786102295 + }, + { + "auxiliary_loss_clip": 0.06401296, + "auxiliary_loss_mlp": 0.01263143, + "balance_loss_clip": 0.06268159, + "balance_loss_mlp": 0.01253797, + "epoch": 0.9265594468660755, + "flos": 20925881510400.0, + "grad_norm": 1.552254697633523, + "language_loss": 0.82113504, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.89777941, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09344482, + "step": 15411, + "time_per_iteration": 2.5069782733917236 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01262961, + "balance_loss_clip": 0.06274098, + "balance_loss_mlp": 0.01253573, + "epoch": 0.9266195701187434, + "flos": 17529930910080.0, + "grad_norm": 1.7428936214869757, + "language_loss": 0.75701684, + "learning_rate": 5.617531751025728e-08, + "loss": 0.83371818, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09387207, + "step": 15412, + "time_per_iteration": 2.5123889446258545 + }, + { + "auxiliary_loss_clip": 0.06398541, + "auxiliary_loss_mlp": 0.0126566, + "balance_loss_clip": 0.06267709, + "balance_loss_mlp": 0.01256439, + "epoch": 0.9266796933714114, + "flos": 33696436510080.0, + "grad_norm": 1.5679043837553974, + "language_loss": 0.67275411, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.74939615, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09222412, + "step": 15413, + "time_per_iteration": 2.605947732925415 + }, + { + "auxiliary_loss_clip": 0.06403206, + "auxiliary_loss_mlp": 0.01262992, + "balance_loss_clip": 0.06269696, + "balance_loss_mlp": 0.01253169, + "epoch": 0.9267398166240793, + "flos": 18922510241280.0, + "grad_norm": 1.6198376571408515, + "language_loss": 0.7588625, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.83552444, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09820557, + "step": 15414, + "time_per_iteration": 2.4856977462768555 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01264925, + "balance_loss_clip": 0.06270388, + "balance_loss_mlp": 0.01256246, + "epoch": 0.9267999398767474, + "flos": 20484129185280.0, + "grad_norm": 2.030820880788606, + "language_loss": 0.81923372, + "learning_rate": 5.59006777975819e-08, + "loss": 0.89587802, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08679199, + "step": 15415, + "time_per_iteration": 2.4929685592651367 + }, + { + "auxiliary_loss_clip": 0.06406386, + "auxiliary_loss_mlp": 0.01265515, + "balance_loss_clip": 0.06271857, + "balance_loss_mlp": 0.01255394, + "epoch": 0.9268600631294153, + "flos": 24796092620160.0, + "grad_norm": 1.5707213378789486, + "language_loss": 0.5453577, + "learning_rate": 5.580927866294671e-08, + "loss": 0.62207669, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10131836, + "step": 15416, + "time_per_iteration": 4.090368747711182 + }, + { + "auxiliary_loss_clip": 0.06395909, + "auxiliary_loss_mlp": 0.01263225, + "balance_loss_clip": 0.06268269, + "balance_loss_mlp": 0.01254302, + "epoch": 0.9269201863820833, + "flos": 18703059598080.0, + "grad_norm": 1.4326729115430334, + "language_loss": 0.72303391, + "learning_rate": 5.571795325221807e-08, + "loss": 0.79962528, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08917236, + "step": 15417, + "time_per_iteration": 2.492025136947632 + }, + { + "auxiliary_loss_clip": 0.06399834, + "auxiliary_loss_mlp": 0.012626, + "balance_loss_clip": 0.06270199, + "balance_loss_mlp": 0.01253331, + "epoch": 0.9269803096347512, + "flos": 20930451557760.0, + "grad_norm": 1.915992557586703, + "language_loss": 0.75794625, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.83457053, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09265137, + "step": 15418, + "time_per_iteration": 2.500960111618042 + }, + { + "auxiliary_loss_clip": 0.06400837, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06271046, + "balance_loss_mlp": 0.01256485, + "epoch": 0.9270404328874192, + "flos": 28010425495680.0, + "grad_norm": 1.4576581953985273, + "language_loss": 0.76502192, + "learning_rate": 5.553552361633174e-08, + "loss": 0.84168857, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09344482, + "step": 15419, + "time_per_iteration": 2.5978782176971436 + }, + { + "auxiliary_loss_clip": 0.06393886, + "auxiliary_loss_mlp": 0.01261694, + "balance_loss_clip": 0.0626778, + "balance_loss_mlp": 0.01253612, + "epoch": 0.9271005561400871, + "flos": 25897790102400.0, + "grad_norm": 1.5679935415739816, + "language_loss": 0.7624113, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.83896708, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08081055, + "step": 15420, + "time_per_iteration": 2.547791004180908 + }, + { + "auxiliary_loss_clip": 0.06403813, + "auxiliary_loss_mlp": 0.01264966, + "balance_loss_clip": 0.06269305, + "balance_loss_mlp": 0.01254708, + "epoch": 0.9271606793927551, + "flos": 27061443279360.0, + "grad_norm": 1.3671087136068567, + "language_loss": 0.76732445, + "learning_rate": 5.535338891759389e-08, + "loss": 0.84401226, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10266113, + "step": 15421, + "time_per_iteration": 2.579566717147827 + }, + { + "auxiliary_loss_clip": 0.06401263, + "auxiliary_loss_mlp": 0.01264215, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254958, + "epoch": 0.9272208026454232, + "flos": 26216442380160.0, + "grad_norm": 2.015466462348958, + "language_loss": 0.72872943, + "learning_rate": 5.526243217829041e-08, + "loss": 0.80538422, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.0925293, + "step": 15422, + "time_per_iteration": 2.556781530380249 + }, + { + "auxiliary_loss_clip": 0.06401004, + "auxiliary_loss_mlp": 0.01263615, + "balance_loss_clip": 0.06268564, + "balance_loss_mlp": 0.01254305, + "epoch": 0.9272809258980911, + "flos": 12463348803840.0, + "grad_norm": 1.9568135682925627, + "language_loss": 0.77870274, + "learning_rate": 5.517154918363065e-08, + "loss": 0.85534894, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09307861, + "step": 15423, + "time_per_iteration": 2.54386568069458 + }, + { + "auxiliary_loss_clip": 0.06402774, + "auxiliary_loss_mlp": 0.01262642, + "balance_loss_clip": 0.06267941, + "balance_loss_mlp": 0.01252688, + "epoch": 0.9273410491507591, + "flos": 22863523651200.0, + "grad_norm": 1.9393896166418776, + "language_loss": 0.75592458, + "learning_rate": 5.508073993706053e-08, + "loss": 0.83257878, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09954834, + "step": 15424, + "time_per_iteration": 3.950807571411133 + }, + { + "auxiliary_loss_clip": 0.06308335, + "auxiliary_loss_mlp": 0.01251168, + "balance_loss_clip": 0.06254116, + "balance_loss_mlp": 0.01250244, + "epoch": 0.927401172403427, + "flos": 47681963383680.0, + "grad_norm": 0.7629522595192675, + "language_loss": 0.60162652, + "learning_rate": 5.499000444202351e-08, + "loss": 0.67722148, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00921631, + "step": 15425, + "time_per_iteration": 2.9016902446746826 + }, + { + "auxiliary_loss_clip": 0.06402518, + "auxiliary_loss_mlp": 0.01261675, + "balance_loss_clip": 0.06271154, + "balance_loss_mlp": 0.01252585, + "epoch": 0.927461295656095, + "flos": 29980324258560.0, + "grad_norm": 1.366559565689854, + "language_loss": 0.71148986, + "learning_rate": 5.489934270196106e-08, + "loss": 0.78813183, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09088135, + "step": 15426, + "time_per_iteration": 2.61396861076355 + }, + { + "auxiliary_loss_clip": 0.06402343, + "auxiliary_loss_mlp": 0.01262233, + "balance_loss_clip": 0.0627178, + "balance_loss_mlp": 0.01253388, + "epoch": 0.9275214189087629, + "flos": 20381573387520.0, + "grad_norm": 1.8238747923679495, + "language_loss": 0.83321905, + "learning_rate": 5.480875472030977e-08, + "loss": 0.9098649, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08843994, + "step": 15427, + "time_per_iteration": 2.533583641052246 + }, + { + "auxiliary_loss_clip": 0.06399953, + "auxiliary_loss_mlp": 0.01264957, + "balance_loss_clip": 0.0626848, + "balance_loss_mlp": 0.01255158, + "epoch": 0.927581542161431, + "flos": 22389850120320.0, + "grad_norm": 1.562228354740854, + "language_loss": 0.77034312, + "learning_rate": 5.471824050050555e-08, + "loss": 0.84699225, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09802246, + "step": 15428, + "time_per_iteration": 2.5238113403320312 + }, + { + "auxiliary_loss_clip": 0.0640026, + "auxiliary_loss_mlp": 0.01264868, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.01255528, + "epoch": 0.9276416654140989, + "flos": 23959435201920.0, + "grad_norm": 1.7264807975252925, + "language_loss": 0.7457782, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.82242942, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09338379, + "step": 15429, + "time_per_iteration": 2.5195233821868896 + }, + { + "auxiliary_loss_clip": 0.06396069, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 0.06268522, + "balance_loss_mlp": 0.01255723, + "epoch": 0.9277017886667669, + "flos": 13922831220480.0, + "grad_norm": 1.7362302718251208, + "language_loss": 0.75345081, + "learning_rate": 5.45374333601647e-08, + "loss": 0.83005834, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08959961, + "step": 15430, + "time_per_iteration": 3.9254066944122314 + }, + { + "auxiliary_loss_clip": 0.06402864, + "auxiliary_loss_mlp": 0.01262331, + "balance_loss_clip": 0.06269671, + "balance_loss_mlp": 0.01252478, + "epoch": 0.9277619119194348, + "flos": 35675768856960.0, + "grad_norm": 1.3448855002348141, + "language_loss": 0.76524234, + "learning_rate": 5.444714044648391e-08, + "loss": 0.84189427, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09851074, + "step": 15431, + "time_per_iteration": 2.6647591590881348 + }, + { + "auxiliary_loss_clip": 0.06399286, + "auxiliary_loss_mlp": 0.01265502, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.0125649, + "epoch": 0.9278220351721028, + "flos": 23847907017600.0, + "grad_norm": 1.598032669675074, + "language_loss": 0.70804644, + "learning_rate": 5.4356921308363e-08, + "loss": 0.78469431, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09014893, + "step": 15432, + "time_per_iteration": 2.521979808807373 + }, + { + "auxiliary_loss_clip": 0.06401653, + "auxiliary_loss_mlp": 0.01268277, + "balance_loss_clip": 0.06268461, + "balance_loss_mlp": 0.01258746, + "epoch": 0.9278821584247707, + "flos": 15232952534400.0, + "grad_norm": 3.130753679955256, + "language_loss": 0.83228093, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.90898025, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09533691, + "step": 15433, + "time_per_iteration": 3.868227481842041 + }, + { + "auxiliary_loss_clip": 0.06392471, + "auxiliary_loss_mlp": 0.01264408, + "balance_loss_clip": 0.06267262, + "balance_loss_mlp": 0.01256147, + "epoch": 0.9279422816774388, + "flos": 24688379796480.0, + "grad_norm": 1.704558942323815, + "language_loss": 0.67013133, + "learning_rate": 5.417670437248056e-08, + "loss": 0.74670017, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08258057, + "step": 15434, + "time_per_iteration": 2.5150067806243896 + }, + { + "auxiliary_loss_clip": 0.06390243, + "auxiliary_loss_mlp": 0.01261235, + "balance_loss_clip": 0.06267539, + "balance_loss_mlp": 0.01252938, + "epoch": 0.9280024049301068, + "flos": 19174762558080.0, + "grad_norm": 1.6939832412088915, + "language_loss": 0.68807113, + "learning_rate": 5.40867065815529e-08, + "loss": 0.76458597, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.08300781, + "step": 15435, + "time_per_iteration": 2.5746238231658936 + }, + { + "auxiliary_loss_clip": 0.06400537, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06268658, + "balance_loss_mlp": 0.01254757, + "epoch": 0.9280625281827747, + "flos": 11397304033920.0, + "grad_norm": 1.8675874882503214, + "language_loss": 0.72116661, + "learning_rate": 5.399678257985263e-08, + "loss": 0.79781473, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09521484, + "step": 15436, + "time_per_iteration": 2.4609224796295166 + }, + { + "auxiliary_loss_clip": 0.06404845, + "auxiliary_loss_mlp": 0.01266496, + "balance_loss_clip": 0.0627347, + "balance_loss_mlp": 0.01257287, + "epoch": 0.9281226514354427, + "flos": 24791732208000.0, + "grad_norm": 1.898604382401611, + "language_loss": 0.67076588, + "learning_rate": 5.390693237078925e-08, + "loss": 0.74747938, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09204102, + "step": 15437, + "time_per_iteration": 2.574120044708252 + }, + { + "auxiliary_loss_clip": 0.06404506, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 0.06270991, + "balance_loss_mlp": 0.01254728, + "epoch": 0.9281827746881106, + "flos": 15088077624960.0, + "grad_norm": 1.8931845608351296, + "language_loss": 0.71641231, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.7931124, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10778809, + "step": 15438, + "time_per_iteration": 2.5342071056365967 + }, + { + "auxiliary_loss_clip": 0.06401535, + "auxiliary_loss_mlp": 0.01263761, + "balance_loss_clip": 0.06268774, + "balance_loss_mlp": 0.01255089, + "epoch": 0.9282428979407786, + "flos": 24142101102720.0, + "grad_norm": 1.7631471978480706, + "language_loss": 0.64994079, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.72659373, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08679199, + "step": 15439, + "time_per_iteration": 2.5256354808807373 + }, + { + "auxiliary_loss_clip": 0.0639973, + "auxiliary_loss_mlp": 0.01264033, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01255349, + "epoch": 0.9283030211934465, + "flos": 24829523199360.0, + "grad_norm": 1.6821997919344165, + "language_loss": 0.70312607, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7797637, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.0869751, + "step": 15440, + "time_per_iteration": 2.5232927799224854 + }, + { + "auxiliary_loss_clip": 0.06404891, + "auxiliary_loss_mlp": 0.01265965, + "balance_loss_clip": 0.06268373, + "balance_loss_mlp": 0.01255523, + "epoch": 0.9283631444461146, + "flos": 23986702506240.0, + "grad_norm": 1.5413519977968317, + "language_loss": 0.77124566, + "learning_rate": 5.354826952900682e-08, + "loss": 0.84795421, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10430908, + "step": 15441, + "time_per_iteration": 2.516756772994995 + }, + { + "auxiliary_loss_clip": 0.06398309, + "auxiliary_loss_mlp": 0.01262603, + "balance_loss_clip": 0.06272468, + "balance_loss_mlp": 0.01254735, + "epoch": 0.9284232676987825, + "flos": 22791253904640.0, + "grad_norm": 1.5445819988173333, + "language_loss": 0.64162666, + "learning_rate": 5.345878833417949e-08, + "loss": 0.71823585, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.07873535, + "step": 15442, + "time_per_iteration": 2.505448341369629 + }, + { + "auxiliary_loss_clip": 0.06404665, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01258314, + "epoch": 0.9284833909514505, + "flos": 19506621853440.0, + "grad_norm": 1.7431674890191913, + "language_loss": 0.80909652, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8858242, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09790039, + "step": 15443, + "time_per_iteration": 2.562551498413086 + }, + { + "auxiliary_loss_clip": 0.06403337, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01255629, + "epoch": 0.9285435142041184, + "flos": 23192783470080.0, + "grad_norm": 1.9512114579199797, + "language_loss": 0.65079677, + "learning_rate": 5.328004738702896e-08, + "loss": 0.72747898, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0925293, + "step": 15444, + "time_per_iteration": 2.5125370025634766 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01263334, + "balance_loss_clip": 0.06270593, + "balance_loss_mlp": 0.0125425, + "epoch": 0.9286036374567864, + "flos": 17681220656640.0, + "grad_norm": 2.010684849546823, + "language_loss": 0.73854786, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.81520319, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09082031, + "step": 15445, + "time_per_iteration": 2.5049667358398438 + }, + { + "auxiliary_loss_clip": 0.064026, + "auxiliary_loss_mlp": 0.01262565, + "balance_loss_clip": 0.06271416, + "balance_loss_mlp": 0.01253165, + "epoch": 0.9286637607094543, + "flos": 20892995982720.0, + "grad_norm": 1.5998111247681204, + "language_loss": 0.71395653, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.79060817, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09399414, + "step": 15446, + "time_per_iteration": 2.502922296524048 + }, + { + "auxiliary_loss_clip": 0.06408063, + "auxiliary_loss_mlp": 0.01262626, + "balance_loss_clip": 0.06270014, + "balance_loss_mlp": 0.01252642, + "epoch": 0.9287238839621224, + "flos": 19032025927680.0, + "grad_norm": 1.8680884802805782, + "language_loss": 0.69709033, + "learning_rate": 5.301248962337523e-08, + "loss": 0.77379727, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.09979248, + "step": 15447, + "time_per_iteration": 2.498037815093994 + }, + { + "auxiliary_loss_clip": 0.06395551, + "auxiliary_loss_mlp": 0.0126141, + "balance_loss_clip": 0.06271149, + "balance_loss_mlp": 0.01252809, + "epoch": 0.9287840072147904, + "flos": 20563065331200.0, + "grad_norm": 1.463542829558656, + "language_loss": 0.72163129, + "learning_rate": 5.292345135757403e-08, + "loss": 0.79820085, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08605957, + "step": 15448, + "time_per_iteration": 2.5169200897216797 + }, + { + "auxiliary_loss_clip": 0.06399667, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06270666, + "balance_loss_mlp": 0.01253128, + "epoch": 0.9288441304674583, + "flos": 21257069973120.0, + "grad_norm": 1.631031069367745, + "language_loss": 0.74867898, + "learning_rate": 5.283448692511072e-08, + "loss": 0.82530475, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09790039, + "step": 15449, + "time_per_iteration": 2.5181782245635986 + }, + { + "auxiliary_loss_clip": 0.06401692, + "auxiliary_loss_mlp": 0.01260945, + "balance_loss_clip": 0.06271457, + "balance_loss_mlp": 0.01251426, + "epoch": 0.9289042537201263, + "flos": 27676763337600.0, + "grad_norm": 2.2115875222336716, + "language_loss": 0.67882347, + "learning_rate": 5.27455963293586e-08, + "loss": 0.75544983, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09503174, + "step": 15450, + "time_per_iteration": 2.588937759399414 + }, + { + "auxiliary_loss_clip": 0.06401034, + "auxiliary_loss_mlp": 0.01262114, + "balance_loss_clip": 0.06269682, + "balance_loss_mlp": 0.01253311, + "epoch": 0.9289643769727942, + "flos": 19323788244480.0, + "grad_norm": 1.901357650419004, + "language_loss": 0.71771216, + "learning_rate": 5.265677957368875e-08, + "loss": 0.79434371, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08795166, + "step": 15451, + "time_per_iteration": 2.5311567783355713 + }, + { + "auxiliary_loss_clip": 0.06402208, + "auxiliary_loss_mlp": 0.01262611, + "balance_loss_clip": 0.06270938, + "balance_loss_mlp": 0.01253527, + "epoch": 0.9290245002254622, + "flos": 14062255614720.0, + "grad_norm": 2.063265286417505, + "language_loss": 0.73937112, + "learning_rate": 5.25680366614687e-08, + "loss": 0.8160193, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09075928, + "step": 15452, + "time_per_iteration": 2.533107042312622 + }, + { + "auxiliary_loss_clip": 0.06399271, + "auxiliary_loss_mlp": 0.0126503, + "balance_loss_clip": 0.06270489, + "balance_loss_mlp": 0.01255851, + "epoch": 0.9290846234781301, + "flos": 20053235963520.0, + "grad_norm": 2.3098184994717785, + "language_loss": 0.74543643, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.82207942, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09173584, + "step": 15453, + "time_per_iteration": 2.505582094192505 + }, + { + "auxiliary_loss_clip": 0.06307368, + "auxiliary_loss_mlp": 0.01250217, + "balance_loss_clip": 0.062534, + "balance_loss_mlp": 0.0124918, + "epoch": 0.9291447467307982, + "flos": 61244592629760.0, + "grad_norm": 0.8123240258072839, + "language_loss": 0.60719591, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.6827718, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01038361, + "step": 15454, + "time_per_iteration": 3.0330328941345215 + }, + { + "auxiliary_loss_clip": 0.06403492, + "auxiliary_loss_mlp": 0.01267869, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.0125835, + "epoch": 0.9292048699834661, + "flos": 20558746846080.0, + "grad_norm": 1.4616904844748926, + "language_loss": 0.69075823, + "learning_rate": 5.230225101914709e-08, + "loss": 0.76747185, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09521484, + "step": 15455, + "time_per_iteration": 3.9310483932495117 + }, + { + "auxiliary_loss_clip": 0.06399804, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 0.06269494, + "balance_loss_mlp": 0.01253366, + "epoch": 0.9292649932361341, + "flos": 23630510799360.0, + "grad_norm": 1.5254212820753648, + "language_loss": 0.65071934, + "learning_rate": 5.22138035143509e-08, + "loss": 0.72733819, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08709717, + "step": 15456, + "time_per_iteration": 2.5281927585601807 + }, + { + "auxiliary_loss_clip": 0.06399552, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 0.06271125, + "balance_loss_mlp": 0.01257141, + "epoch": 0.929325116488802, + "flos": 15014843556480.0, + "grad_norm": 1.6452448643687836, + "language_loss": 0.68623769, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.76290047, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.0958252, + "step": 15457, + "time_per_iteration": 2.4656875133514404 + }, + { + "auxiliary_loss_clip": 0.0640226, + "auxiliary_loss_mlp": 0.01262411, + "balance_loss_clip": 0.06269163, + "balance_loss_mlp": 0.01252749, + "epoch": 0.92938523974147, + "flos": 17973108754560.0, + "grad_norm": 2.0948470161883717, + "language_loss": 0.81135142, + "learning_rate": 5.203713008885291e-08, + "loss": 0.8879981, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09655762, + "step": 15458, + "time_per_iteration": 2.483344316482544 + }, + { + "auxiliary_loss_clip": 0.06399539, + "auxiliary_loss_mlp": 0.01264005, + "balance_loss_clip": 0.06268502, + "balance_loss_mlp": 0.01254754, + "epoch": 0.9294453629941379, + "flos": 23009740225920.0, + "grad_norm": 1.5981022484787952, + "language_loss": 0.72647446, + "learning_rate": 5.194890417485065e-08, + "loss": 0.80310988, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.0925293, + "step": 15459, + "time_per_iteration": 2.5095856189727783 + }, + { + "auxiliary_loss_clip": 0.06403077, + "auxiliary_loss_mlp": 0.01264372, + "balance_loss_clip": 0.0627103, + "balance_loss_mlp": 0.01255223, + "epoch": 0.929505486246806, + "flos": 17060827426560.0, + "grad_norm": 2.205290237596035, + "language_loss": 0.59509528, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.67176986, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09143066, + "step": 15460, + "time_per_iteration": 2.4897260665893555 + }, + { + "auxiliary_loss_clip": 0.06407061, + "auxiliary_loss_mlp": 0.01267368, + "balance_loss_clip": 0.06273078, + "balance_loss_mlp": 0.01257909, + "epoch": 0.9295656094994739, + "flos": 27347084248320.0, + "grad_norm": 1.858696453479836, + "language_loss": 0.81050324, + "learning_rate": 5.177267396106733e-08, + "loss": 0.88724756, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09466553, + "step": 15461, + "time_per_iteration": 2.5442938804626465 + }, + { + "auxiliary_loss_clip": 0.06401002, + "auxiliary_loss_mlp": 0.01264519, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01255275, + "epoch": 0.9296257327521419, + "flos": 21477443011200.0, + "grad_norm": 1.6125510363493594, + "language_loss": 0.78114223, + "learning_rate": 5.168466966796869e-08, + "loss": 0.85779738, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09246826, + "step": 15462, + "time_per_iteration": 2.5683822631835938 + }, + { + "auxiliary_loss_clip": 0.06399049, + "auxiliary_loss_mlp": 0.01262981, + "balance_loss_clip": 0.06268325, + "balance_loss_mlp": 0.01254207, + "epoch": 0.9296858560048099, + "flos": 16368248304000.0, + "grad_norm": 1.8573692546143064, + "language_loss": 0.63046449, + "learning_rate": 5.159673925518282e-08, + "loss": 0.70708477, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08764648, + "step": 15463, + "time_per_iteration": 2.4773969650268555 + }, + { + "auxiliary_loss_clip": 0.06398252, + "auxiliary_loss_mlp": 0.01262228, + "balance_loss_clip": 0.06268728, + "balance_loss_mlp": 0.01253466, + "epoch": 0.9297459792574778, + "flos": 29865819254400.0, + "grad_norm": 1.4275812835029746, + "language_loss": 0.71507215, + "learning_rate": 5.15088827260437e-08, + "loss": 0.79167688, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08758545, + "step": 15464, + "time_per_iteration": 3.9611384868621826 + }, + { + "auxiliary_loss_clip": 0.06404192, + "auxiliary_loss_mlp": 0.01259513, + "balance_loss_clip": 0.06270679, + "balance_loss_mlp": 0.01250835, + "epoch": 0.9298061025101458, + "flos": 15930353266560.0, + "grad_norm": 1.866301443113407, + "language_loss": 0.78163409, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.85827112, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.08679199, + "step": 15465, + "time_per_iteration": 2.5284931659698486 + }, + { + "auxiliary_loss_clip": 0.0631011, + "auxiliary_loss_mlp": 0.01253376, + "balance_loss_clip": 0.06255974, + "balance_loss_mlp": 0.0125237, + "epoch": 0.9298662257628137, + "flos": 64118498365440.0, + "grad_norm": 0.6980012483793121, + "language_loss": 0.56405276, + "learning_rate": 5.133339133202952e-08, + "loss": 0.6396876, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01006317, + "step": 15466, + "time_per_iteration": 3.244619846343994 + }, + { + "auxiliary_loss_clip": 0.06403805, + "auxiliary_loss_mlp": 0.0126834, + "balance_loss_clip": 0.06270371, + "balance_loss_mlp": 0.01258588, + "epoch": 0.9299263490154818, + "flos": 24287143720320.0, + "grad_norm": 1.3940934660028805, + "language_loss": 0.73205161, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.80877304, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09747314, + "step": 15467, + "time_per_iteration": 2.5676679611206055 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06271035, + "balance_loss_mlp": 0.01256458, + "epoch": 0.9299864722681497, + "flos": 23300999418240.0, + "grad_norm": 1.6752251187046447, + "language_loss": 0.72396517, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.80065751, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09240723, + "step": 15468, + "time_per_iteration": 2.59311580657959 + }, + { + "auxiliary_loss_clip": 0.06405564, + "auxiliary_loss_mlp": 0.01262883, + "balance_loss_clip": 0.0627134, + "balance_loss_mlp": 0.01253424, + "epoch": 0.9300465955208177, + "flos": 21402112590720.0, + "grad_norm": 1.6073000412647687, + "language_loss": 0.75552547, + "learning_rate": 5.107070845155737e-08, + "loss": 0.83220994, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09466553, + "step": 15469, + "time_per_iteration": 2.5530714988708496 + }, + { + "auxiliary_loss_clip": 0.06402186, + "auxiliary_loss_mlp": 0.01267177, + "balance_loss_clip": 0.06269206, + "balance_loss_mlp": 0.01257629, + "epoch": 0.9301067187734856, + "flos": 24578319058560.0, + "grad_norm": 1.7588900587413723, + "language_loss": 0.76161134, + "learning_rate": 5.098329529416379e-08, + "loss": 0.838305, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09545898, + "step": 15470, + "time_per_iteration": 4.046792984008789 + }, + { + "auxiliary_loss_clip": 0.063991, + "auxiliary_loss_mlp": 0.01265604, + "balance_loss_clip": 0.06269463, + "balance_loss_mlp": 0.01256431, + "epoch": 0.9301668420261536, + "flos": 22202949588480.0, + "grad_norm": 1.50853778846898, + "language_loss": 0.74989831, + "learning_rate": 5.089595604367902e-08, + "loss": 0.82654536, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09179688, + "step": 15471, + "time_per_iteration": 2.523951530456543 + }, + { + "auxiliary_loss_clip": 0.06401512, + "auxiliary_loss_mlp": 0.01264888, + "balance_loss_clip": 0.06271497, + "balance_loss_mlp": 0.01255661, + "epoch": 0.9302269652788215, + "flos": 17753196913920.0, + "grad_norm": 2.3784631998670203, + "language_loss": 0.69654554, + "learning_rate": 5.080869070341487e-08, + "loss": 0.77320957, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09234619, + "step": 15472, + "time_per_iteration": 2.523432493209839 + }, + { + "auxiliary_loss_clip": 0.06395452, + "auxiliary_loss_mlp": 0.01263156, + "balance_loss_clip": 0.06270222, + "balance_loss_mlp": 0.01254281, + "epoch": 0.9302870885314896, + "flos": 19396854604800.0, + "grad_norm": 1.6143670274863005, + "language_loss": 0.88837874, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.96496475, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08880615, + "step": 15473, + "time_per_iteration": 4.012357473373413 + }, + { + "auxiliary_loss_clip": 0.06405994, + "auxiliary_loss_mlp": 0.01265627, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01255274, + "epoch": 0.9303472117841575, + "flos": 21766396216320.0, + "grad_norm": 2.419925900963914, + "language_loss": 0.64569032, + "learning_rate": 5.063438176678203e-08, + "loss": 0.72240651, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10357666, + "step": 15474, + "time_per_iteration": 2.5024755001068115 + }, + { + "auxiliary_loss_clip": 0.06400555, + "auxiliary_loss_mlp": 0.01264178, + "balance_loss_clip": 0.06268995, + "balance_loss_mlp": 0.01254409, + "epoch": 0.9304073350368255, + "flos": 19615844050560.0, + "grad_norm": 1.7539760136561613, + "language_loss": 0.74913669, + "learning_rate": 5.054733817702339e-08, + "loss": 0.82578397, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09765625, + "step": 15475, + "time_per_iteration": 2.4802138805389404 + }, + { + "auxiliary_loss_clip": 0.06402318, + "auxiliary_loss_mlp": 0.01267821, + "balance_loss_clip": 0.06271594, + "balance_loss_mlp": 0.01258761, + "epoch": 0.9304674582894935, + "flos": 30448756909440.0, + "grad_norm": 2.5253856676415296, + "language_loss": 0.67179549, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.74849689, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09057617, + "step": 15476, + "time_per_iteration": 2.5887856483459473 + }, + { + "auxiliary_loss_clip": 0.06402615, + "auxiliary_loss_mlp": 0.01265997, + "balance_loss_clip": 0.06271078, + "balance_loss_mlp": 0.01256782, + "epoch": 0.9305275815421614, + "flos": 17791532956800.0, + "grad_norm": 2.2931059467330814, + "language_loss": 0.69080395, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.76749009, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09204102, + "step": 15477, + "time_per_iteration": 2.46964430809021 + }, + { + "auxiliary_loss_clip": 0.06398468, + "auxiliary_loss_mlp": 0.01266148, + "balance_loss_clip": 0.06270145, + "balance_loss_mlp": 0.01257142, + "epoch": 0.9305877047948294, + "flos": 25304999592960.0, + "grad_norm": 2.133764472350911, + "language_loss": 0.58989286, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.66653895, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09008789, + "step": 15478, + "time_per_iteration": 2.5351498126983643 + }, + { + "auxiliary_loss_clip": 0.06409034, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 0.06270212, + "balance_loss_mlp": 0.01254585, + "epoch": 0.9306478280474973, + "flos": 16981975134720.0, + "grad_norm": 1.7786919360630835, + "language_loss": 0.79033351, + "learning_rate": 5.01999030853566e-08, + "loss": 0.86707628, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.10662842, + "step": 15479, + "time_per_iteration": 2.4584336280822754 + }, + { + "auxiliary_loss_clip": 0.06400747, + "auxiliary_loss_mlp": 0.0126376, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254379, + "epoch": 0.9307079513001654, + "flos": 35672121204480.0, + "grad_norm": 1.6572796741868023, + "language_loss": 0.68828124, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.76492631, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09393311, + "step": 15480, + "time_per_iteration": 2.6261415481567383 + }, + { + "auxiliary_loss_clip": 0.06401486, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01254984, + "epoch": 0.9307680745528333, + "flos": 19214146776960.0, + "grad_norm": 1.7175902100711526, + "language_loss": 0.68017375, + "learning_rate": 5.002662914604583e-08, + "loss": 0.75683153, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09307861, + "step": 15481, + "time_per_iteration": 2.481839179992676 + }, + { + "auxiliary_loss_clip": 0.06399475, + "auxiliary_loss_mlp": 0.01263901, + "balance_loss_clip": 0.06270431, + "balance_loss_mlp": 0.01255145, + "epoch": 0.9308281978055013, + "flos": 19068684888960.0, + "grad_norm": 1.7362782888725026, + "language_loss": 0.74914646, + "learning_rate": 4.994010308952701e-08, + "loss": 0.82578027, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08746338, + "step": 15482, + "time_per_iteration": 2.521629810333252 + }, + { + "auxiliary_loss_clip": 0.06396139, + "auxiliary_loss_mlp": 0.01263596, + "balance_loss_clip": 0.06269595, + "balance_loss_mlp": 0.01254542, + "epoch": 0.9308883210581692, + "flos": 20527748035200.0, + "grad_norm": 1.865123226027677, + "language_loss": 0.80490708, + "learning_rate": 4.985365097947469e-08, + "loss": 0.88150442, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.0904541, + "step": 15483, + "time_per_iteration": 2.533062696456909 + }, + { + "auxiliary_loss_clip": 0.06400363, + "auxiliary_loss_mlp": 0.01264643, + "balance_loss_clip": 0.06269716, + "balance_loss_mlp": 0.01255118, + "epoch": 0.9309484443108372, + "flos": 13005686355840.0, + "grad_norm": 1.8891510591308605, + "language_loss": 0.74612212, + "learning_rate": 4.976727281916782e-08, + "loss": 0.82277215, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09527588, + "step": 15484, + "time_per_iteration": 2.5859484672546387 + }, + { + "auxiliary_loss_clip": 0.06404746, + "auxiliary_loss_mlp": 0.01264949, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.01255776, + "epoch": 0.9310085675635051, + "flos": 12572654855040.0, + "grad_norm": 2.023027681276139, + "language_loss": 0.76634532, + "learning_rate": 4.968096861188087e-08, + "loss": 0.84304231, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09173584, + "step": 15485, + "time_per_iteration": 2.5508246421813965 + }, + { + "auxiliary_loss_clip": 0.0640571, + "auxiliary_loss_mlp": 0.01266589, + "balance_loss_clip": 0.06270456, + "balance_loss_mlp": 0.01256862, + "epoch": 0.9310686908161732, + "flos": 23484378078720.0, + "grad_norm": 1.7812037755211436, + "language_loss": 0.78332233, + "learning_rate": 4.959473836088723e-08, + "loss": 0.86004531, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.09729004, + "step": 15486, + "time_per_iteration": 2.535637140274048 + }, + { + "auxiliary_loss_clip": 0.06408517, + "auxiliary_loss_mlp": 0.01266169, + "balance_loss_clip": 0.06274606, + "balance_loss_mlp": 0.01256144, + "epoch": 0.9311288140688411, + "flos": 24177124909440.0, + "grad_norm": 2.198527808951168, + "language_loss": 0.77455759, + "learning_rate": 4.950858206945674e-08, + "loss": 0.85130453, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10021973, + "step": 15487, + "time_per_iteration": 2.5223898887634277 + }, + { + "auxiliary_loss_clip": 0.06398556, + "auxiliary_loss_mlp": 0.01260459, + "balance_loss_clip": 0.06268291, + "balance_loss_mlp": 0.01251006, + "epoch": 0.9311889373215091, + "flos": 35598929063040.0, + "grad_norm": 1.8567185005188602, + "language_loss": 0.67377645, + "learning_rate": 4.942249974085633e-08, + "loss": 0.75036657, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09460449, + "step": 15488, + "time_per_iteration": 2.619208335876465 + }, + { + "auxiliary_loss_clip": 0.06397253, + "auxiliary_loss_mlp": 0.01265084, + "balance_loss_clip": 0.06270263, + "balance_loss_mlp": 0.01256089, + "epoch": 0.9312490605741771, + "flos": 20236824259200.0, + "grad_norm": 1.7224807859602875, + "language_loss": 0.75432515, + "learning_rate": 4.933649137834983e-08, + "loss": 0.83094847, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08996582, + "step": 15489, + "time_per_iteration": 2.5090341567993164 + }, + { + "auxiliary_loss_clip": 0.06405045, + "auxiliary_loss_mlp": 0.01263019, + "balance_loss_clip": 0.0627015, + "balance_loss_mlp": 0.01253292, + "epoch": 0.931309183826845, + "flos": 13955087842560.0, + "grad_norm": 2.0628027282737396, + "language_loss": 0.80944282, + "learning_rate": 4.925055698519931e-08, + "loss": 0.88612348, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.097229, + "step": 15490, + "time_per_iteration": 2.4866514205932617 + }, + { + "auxiliary_loss_clip": 0.06403734, + "auxiliary_loss_mlp": 0.01266039, + "balance_loss_clip": 0.06270062, + "balance_loss_mlp": 0.01255554, + "epoch": 0.931369307079513, + "flos": 20162877431040.0, + "grad_norm": 1.8170541366291355, + "language_loss": 0.72400761, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.80070531, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.10479736, + "step": 15491, + "time_per_iteration": 2.538468837738037 + }, + { + "auxiliary_loss_clip": 0.06393816, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 0.06267494, + "balance_loss_mlp": 0.0125638, + "epoch": 0.931429430332181, + "flos": 25345725477120.0, + "grad_norm": 1.8003153236272884, + "language_loss": 0.74667656, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.82326305, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08459473, + "step": 15492, + "time_per_iteration": 2.6507134437561035 + }, + { + "auxiliary_loss_clip": 0.06310092, + "auxiliary_loss_mlp": 0.01250657, + "balance_loss_clip": 0.06255943, + "balance_loss_mlp": 0.01249686, + "epoch": 0.931489553584849, + "flos": 71245208482560.0, + "grad_norm": 0.6897832124619488, + "language_loss": 0.53372693, + "learning_rate": 4.899319765445442e-08, + "loss": 0.60933441, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00970459, + "step": 15493, + "time_per_iteration": 3.021958112716675 + }, + { + "auxiliary_loss_clip": 0.06401284, + "auxiliary_loss_mlp": 0.01264813, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01256242, + "epoch": 0.9315496768375169, + "flos": 14648253943680.0, + "grad_norm": 1.768280806379928, + "language_loss": 0.70375299, + "learning_rate": 4.890755917128531e-08, + "loss": 0.78041399, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08575439, + "step": 15494, + "time_per_iteration": 2.4740707874298096 + }, + { + "auxiliary_loss_clip": 0.06405485, + "auxiliary_loss_mlp": 0.01265527, + "balance_loss_clip": 0.06271463, + "balance_loss_mlp": 0.01255812, + "epoch": 0.9316098000901849, + "flos": 28337505108480.0, + "grad_norm": 1.5961909410807655, + "language_loss": 0.68592763, + "learning_rate": 4.882199467373671e-08, + "loss": 0.76263779, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09716797, + "step": 15495, + "time_per_iteration": 4.0202531814575195 + }, + { + "auxiliary_loss_clip": 0.06397967, + "auxiliary_loss_mlp": 0.01263218, + "balance_loss_clip": 0.06270482, + "balance_loss_mlp": 0.01254111, + "epoch": 0.9316699233428528, + "flos": 28520338717440.0, + "grad_norm": 1.810348188530725, + "language_loss": 0.62453389, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.70114577, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09106445, + "step": 15496, + "time_per_iteration": 2.5821802616119385 + }, + { + "auxiliary_loss_clip": 0.06402013, + "auxiliary_loss_mlp": 0.01265862, + "balance_loss_clip": 0.06270453, + "balance_loss_mlp": 0.01256599, + "epoch": 0.9317300465955208, + "flos": 33701887025280.0, + "grad_norm": 1.570853840724038, + "language_loss": 0.76926303, + "learning_rate": 4.865108764847825e-08, + "loss": 0.84594178, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0925293, + "step": 15497, + "time_per_iteration": 2.6000030040740967 + }, + { + "auxiliary_loss_clip": 0.06406631, + "auxiliary_loss_mlp": 0.01266459, + "balance_loss_clip": 0.06270823, + "balance_loss_mlp": 0.0125576, + "epoch": 0.9317901698481887, + "flos": 23664779919360.0, + "grad_norm": 1.6175776581744283, + "language_loss": 0.662678, + "learning_rate": 4.856574512724898e-08, + "loss": 0.73940897, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10693359, + "step": 15498, + "time_per_iteration": 2.5351293087005615 + }, + { + "auxiliary_loss_clip": 0.06401354, + "auxiliary_loss_mlp": 0.01266939, + "balance_loss_clip": 0.06269923, + "balance_loss_mlp": 0.01256401, + "epoch": 0.9318502931008568, + "flos": 20966397759360.0, + "grad_norm": 1.5626366594075778, + "language_loss": 0.79703665, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.87371957, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.10534668, + "step": 15499, + "time_per_iteration": 2.5085513591766357 + }, + { + "auxiliary_loss_clip": 0.0639477, + "auxiliary_loss_mlp": 0.01268461, + "balance_loss_clip": 0.06268457, + "balance_loss_mlp": 0.01258978, + "epoch": 0.9319104163535247, + "flos": 23447844898560.0, + "grad_norm": 1.6015703430685497, + "language_loss": 0.76808083, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.84471321, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09484863, + "step": 15500, + "time_per_iteration": 2.538321018218994 + }, + { + "auxiliary_loss_clip": 0.06396381, + "auxiliary_loss_mlp": 0.01262529, + "balance_loss_clip": 0.06267996, + "balance_loss_mlp": 0.01253427, + "epoch": 0.9319705396061927, + "flos": 22354197408000.0, + "grad_norm": 1.7160437702231266, + "language_loss": 0.72390819, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.80049717, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09106445, + "step": 15501, + "time_per_iteration": 2.506465196609497 + }, + { + "auxiliary_loss_clip": 0.06406382, + "auxiliary_loss_mlp": 0.01262539, + "balance_loss_clip": 0.06271172, + "balance_loss_mlp": 0.0125327, + "epoch": 0.9320306628588607, + "flos": 20999450995200.0, + "grad_norm": 1.6621036286836153, + "language_loss": 0.6654309, + "learning_rate": 4.822511506047666e-08, + "loss": 0.74212009, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.0927124, + "step": 15502, + "time_per_iteration": 2.4847748279571533 + }, + { + "auxiliary_loss_clip": 0.06403543, + "auxiliary_loss_mlp": 0.01263989, + "balance_loss_clip": 0.06269436, + "balance_loss_mlp": 0.01255096, + "epoch": 0.9320907861115286, + "flos": 24545727020160.0, + "grad_norm": 1.48735457149782, + "language_loss": 0.65586728, + "learning_rate": 4.814014256446586e-08, + "loss": 0.73254263, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08892822, + "step": 15503, + "time_per_iteration": 4.052160024642944 + }, + { + "auxiliary_loss_clip": 0.06403034, + "auxiliary_loss_mlp": 0.01265098, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01254745, + "epoch": 0.9321509093641966, + "flos": 19790418032640.0, + "grad_norm": 1.480948638802982, + "language_loss": 0.75340253, + "learning_rate": 4.805524408317652e-08, + "loss": 0.83008385, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.10345459, + "step": 15504, + "time_per_iteration": 2.5183193683624268 + }, + { + "auxiliary_loss_clip": 0.06404573, + "auxiliary_loss_mlp": 0.01265438, + "balance_loss_clip": 0.06273066, + "balance_loss_mlp": 0.01255645, + "epoch": 0.9322110326168646, + "flos": 24979597061760.0, + "grad_norm": 2.1014126245091735, + "language_loss": 0.71645415, + "learning_rate": 4.797041961982762e-08, + "loss": 0.79315424, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09790039, + "step": 15505, + "time_per_iteration": 2.5211434364318848 + }, + { + "auxiliary_loss_clip": 0.06400719, + "auxiliary_loss_mlp": 0.01261551, + "balance_loss_clip": 0.06268628, + "balance_loss_mlp": 0.01252402, + "epoch": 0.9322711558695326, + "flos": 16149175004160.0, + "grad_norm": 1.7427121022281884, + "language_loss": 0.75388575, + "learning_rate": 4.788566917763614e-08, + "loss": 0.83050847, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09143066, + "step": 15506, + "time_per_iteration": 2.4648678302764893 + }, + { + "auxiliary_loss_clip": 0.06394555, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06267924, + "balance_loss_mlp": 0.0125484, + "epoch": 0.9323312791222005, + "flos": 23739187944960.0, + "grad_norm": 1.7165726591251698, + "language_loss": 0.83231521, + "learning_rate": 4.780099275981597e-08, + "loss": 0.90890092, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09173584, + "step": 15507, + "time_per_iteration": 2.5396206378936768 + }, + { + "auxiliary_loss_clip": 0.06407491, + "auxiliary_loss_mlp": 0.01263332, + "balance_loss_clip": 0.06273882, + "balance_loss_mlp": 0.01253896, + "epoch": 0.9323914023748685, + "flos": 20784318837120.0, + "grad_norm": 1.6429448873571484, + "language_loss": 0.67592001, + "learning_rate": 4.771639036957742e-08, + "loss": 0.75262833, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09436035, + "step": 15508, + "time_per_iteration": 2.501565933227539 + }, + { + "auxiliary_loss_clip": 0.06400056, + "auxiliary_loss_mlp": 0.01266031, + "balance_loss_clip": 0.06271896, + "balance_loss_mlp": 0.0125709, + "epoch": 0.9324515256275364, + "flos": 23922021553920.0, + "grad_norm": 1.6443982436727373, + "language_loss": 0.72509021, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.80175108, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0894165, + "step": 15509, + "time_per_iteration": 3.96291184425354 + }, + { + "auxiliary_loss_clip": 0.06400399, + "auxiliary_loss_mlp": 0.01263532, + "balance_loss_clip": 0.06270161, + "balance_loss_mlp": 0.01254091, + "epoch": 0.9325116488802044, + "flos": 18011193235200.0, + "grad_norm": 1.8032935257192066, + "language_loss": 0.74504322, + "learning_rate": 4.754740768467624e-08, + "loss": 0.82168245, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09442139, + "step": 15510, + "time_per_iteration": 2.4776346683502197 + }, + { + "auxiliary_loss_clip": 0.06406374, + "auxiliary_loss_mlp": 0.01261789, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01252175, + "epoch": 0.9325717721328723, + "flos": 29029036055040.0, + "grad_norm": 1.6506300537711536, + "language_loss": 0.70206726, + "learning_rate": 4.746302739642161e-08, + "loss": 0.77874887, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09619141, + "step": 15511, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.06401817, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 0.06271385, + "balance_loss_mlp": 0.01257262, + "epoch": 0.9326318953855404, + "flos": 21651681576960.0, + "grad_norm": 1.9988751237601965, + "language_loss": 0.78188848, + "learning_rate": 4.737872114856412e-08, + "loss": 0.85857439, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09503174, + "step": 15512, + "time_per_iteration": 2.494394540786743 + }, + { + "auxiliary_loss_clip": 0.06400086, + "auxiliary_loss_mlp": 0.01261219, + "balance_loss_clip": 0.06269924, + "balance_loss_mlp": 0.01251474, + "epoch": 0.9326920186382083, + "flos": 26072573719680.0, + "grad_norm": 1.5043776839825136, + "language_loss": 0.80977184, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.8863849, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09741211, + "step": 15513, + "time_per_iteration": 4.013933181762695 + }, + { + "auxiliary_loss_clip": 0.06411409, + "auxiliary_loss_mlp": 0.01267814, + "balance_loss_clip": 0.0627336, + "balance_loss_mlp": 0.01256966, + "epoch": 0.9327521418908763, + "flos": 12061945019520.0, + "grad_norm": 1.9976650496804842, + "language_loss": 0.80668688, + "learning_rate": 4.721033078682768e-08, + "loss": 0.88347912, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10858154, + "step": 15514, + "time_per_iteration": 2.4943747520446777 + }, + { + "auxiliary_loss_clip": 0.06396277, + "auxiliary_loss_mlp": 0.01265518, + "balance_loss_clip": 0.06269284, + "balance_loss_mlp": 0.01256565, + "epoch": 0.9328122651435443, + "flos": 43844233259520.0, + "grad_norm": 1.850634533570311, + "language_loss": 0.71329403, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.78991199, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08953857, + "step": 15515, + "time_per_iteration": 2.72308611869812 + }, + { + "auxiliary_loss_clip": 0.0640793, + "auxiliary_loss_mlp": 0.01262767, + "balance_loss_clip": 0.06270353, + "balance_loss_mlp": 0.01252647, + "epoch": 0.9328723883962122, + "flos": 15200318568960.0, + "grad_norm": 2.4044334079280882, + "language_loss": 0.81314027, + "learning_rate": 4.704223662500806e-08, + "loss": 0.88984722, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10125732, + "step": 15516, + "time_per_iteration": 2.5302047729492188 + }, + { + "auxiliary_loss_clip": 0.06406114, + "auxiliary_loss_mlp": 0.01265832, + "balance_loss_clip": 0.06271726, + "balance_loss_mlp": 0.01255943, + "epoch": 0.9329325116488802, + "flos": 20267194164480.0, + "grad_norm": 1.6559287001330782, + "language_loss": 0.80651397, + "learning_rate": 4.695830062703643e-08, + "loss": 0.88323343, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09893799, + "step": 15517, + "time_per_iteration": 2.5221047401428223 + }, + { + "auxiliary_loss_clip": 0.06403969, + "auxiliary_loss_mlp": 0.01265946, + "balance_loss_clip": 0.06271014, + "balance_loss_mlp": 0.01256051, + "epoch": 0.9329926349015482, + "flos": 13119981724800.0, + "grad_norm": 2.3377447085938563, + "language_loss": 0.74920237, + "learning_rate": 4.687443868860219e-08, + "loss": 0.82590151, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09899902, + "step": 15518, + "time_per_iteration": 2.527200222015381 + }, + { + "auxiliary_loss_clip": 0.06399631, + "auxiliary_loss_mlp": 0.0126335, + "balance_loss_clip": 0.06269, + "balance_loss_mlp": 0.01254559, + "epoch": 0.9330527581542162, + "flos": 23047070019840.0, + "grad_norm": 1.9434422747125724, + "language_loss": 0.75886834, + "learning_rate": 4.679065081288458e-08, + "loss": 0.83549809, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08789062, + "step": 15519, + "time_per_iteration": 2.49600887298584 + }, + { + "auxiliary_loss_clip": 0.06401511, + "auxiliary_loss_mlp": 0.01266494, + "balance_loss_clip": 0.06272543, + "balance_loss_mlp": 0.01256451, + "epoch": 0.9331128814068841, + "flos": 15565021464960.0, + "grad_norm": 1.9352369672878387, + "language_loss": 0.83285367, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.90953374, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.10040283, + "step": 15520, + "time_per_iteration": 2.496610641479492 + }, + { + "auxiliary_loss_clip": 0.06397337, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 0.06269252, + "balance_loss_mlp": 0.01258064, + "epoch": 0.9331730046595521, + "flos": 22278070373760.0, + "grad_norm": 1.5332547398860534, + "language_loss": 0.76337314, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.84001684, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08959961, + "step": 15521, + "time_per_iteration": 2.493025779724121 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.0126359, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01254667, + "epoch": 0.93323312791222, + "flos": 15782920807680.0, + "grad_norm": 4.746978619733777, + "language_loss": 0.77783549, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.85446644, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08935547, + "step": 15522, + "time_per_iteration": 2.483830213546753 + }, + { + "auxiliary_loss_clip": 0.06401372, + "auxiliary_loss_mlp": 0.01263805, + "balance_loss_clip": 0.06269466, + "balance_loss_mlp": 0.01253565, + "epoch": 0.933293251164888, + "flos": 22016342545920.0, + "grad_norm": 1.8324231152169705, + "language_loss": 0.6271559, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.70380771, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10247803, + "step": 15523, + "time_per_iteration": 2.464359760284424 + }, + { + "auxiliary_loss_clip": 0.06401858, + "auxiliary_loss_mlp": 0.01268604, + "balance_loss_clip": 0.06272347, + "balance_loss_mlp": 0.01259008, + "epoch": 0.933353374417556, + "flos": 26038556161920.0, + "grad_norm": 1.6351277834664266, + "language_loss": 0.68286568, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7595703, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0960083, + "step": 15524, + "time_per_iteration": 2.5472025871276855 + }, + { + "auxiliary_loss_clip": 0.06402338, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01254788, + "epoch": 0.933413497670224, + "flos": 24907075752960.0, + "grad_norm": 1.6411454444510272, + "language_loss": 0.73814523, + "learning_rate": 4.628947905336589e-08, + "loss": 0.81481624, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09973145, + "step": 15525, + "time_per_iteration": 2.5322306156158447 + }, + { + "auxiliary_loss_clip": 0.06398012, + "auxiliary_loss_mlp": 0.01262306, + "balance_loss_clip": 0.06270038, + "balance_loss_mlp": 0.01253449, + "epoch": 0.9334736209228919, + "flos": 23694227429760.0, + "grad_norm": 2.041587925291887, + "language_loss": 0.84483254, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.92143565, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08862305, + "step": 15526, + "time_per_iteration": 2.519195556640625 + }, + { + "auxiliary_loss_clip": 0.06403422, + "auxiliary_loss_mlp": 0.01265587, + "balance_loss_clip": 0.06271212, + "balance_loss_mlp": 0.0125633, + "epoch": 0.9335337441755599, + "flos": 15382732907520.0, + "grad_norm": 1.8383298430053767, + "language_loss": 0.6973694, + "learning_rate": 4.61230144456366e-08, + "loss": 0.77405953, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0927124, + "step": 15527, + "time_per_iteration": 2.456176996231079 + }, + { + "auxiliary_loss_clip": 0.06408224, + "auxiliary_loss_mlp": 0.01262635, + "balance_loss_clip": 0.06273658, + "balance_loss_mlp": 0.01252043, + "epoch": 0.9335938674282279, + "flos": 16112180626560.0, + "grad_norm": 2.2241549741395574, + "language_loss": 0.65134645, + "learning_rate": 4.603989327701141e-08, + "loss": 0.728055, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10595703, + "step": 15528, + "time_per_iteration": 2.4924302101135254 + }, + { + "auxiliary_loss_clip": 0.06401557, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06268039, + "balance_loss_mlp": 0.01254357, + "epoch": 0.9336539906808958, + "flos": 18958875713280.0, + "grad_norm": 1.7399334221654377, + "language_loss": 0.74828267, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.82493973, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09790039, + "step": 15529, + "time_per_iteration": 2.482252597808838 + }, + { + "auxiliary_loss_clip": 0.06400265, + "auxiliary_loss_mlp": 0.01263909, + "balance_loss_clip": 0.06269649, + "balance_loss_mlp": 0.01255407, + "epoch": 0.9337141139335638, + "flos": 18114168303360.0, + "grad_norm": 1.7494873639650692, + "language_loss": 0.63001961, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.70666134, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08505249, + "step": 15530, + "time_per_iteration": 2.564744234085083 + }, + { + "auxiliary_loss_clip": 0.06398335, + "auxiliary_loss_mlp": 0.01263036, + "balance_loss_clip": 0.06270778, + "balance_loss_mlp": 0.01253749, + "epoch": 0.9337742371862318, + "flos": 17351122296960.0, + "grad_norm": 1.7194228505060978, + "language_loss": 0.73030329, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.80691695, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09283447, + "step": 15531, + "time_per_iteration": 2.522684097290039 + }, + { + "auxiliary_loss_clip": 0.06400237, + "auxiliary_loss_mlp": 0.01264634, + "balance_loss_clip": 0.06271127, + "balance_loss_mlp": 0.01254772, + "epoch": 0.9338343604388998, + "flos": 29066575484160.0, + "grad_norm": 1.6454676066397984, + "language_loss": 0.71094078, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.78758943, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.09866333, + "step": 15532, + "time_per_iteration": 2.5740439891815186 + }, + { + "auxiliary_loss_clip": 0.0640241, + "auxiliary_loss_mlp": 0.01263663, + "balance_loss_clip": 0.06269048, + "balance_loss_mlp": 0.01254288, + "epoch": 0.9338944836915677, + "flos": 18666819907200.0, + "grad_norm": 1.5232167653668405, + "language_loss": 0.73042238, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.80708313, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09381104, + "step": 15533, + "time_per_iteration": 2.587557554244995 + }, + { + "auxiliary_loss_clip": 0.06396709, + "auxiliary_loss_mlp": 0.01263683, + "balance_loss_clip": 0.06268157, + "balance_loss_mlp": 0.01254963, + "epoch": 0.9339546069442357, + "flos": 16623309732480.0, + "grad_norm": 1.6925618891662986, + "language_loss": 0.79914582, + "learning_rate": 4.554272235700507e-08, + "loss": 0.87574971, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.0871582, + "step": 15534, + "time_per_iteration": 2.499203681945801 + }, + { + "auxiliary_loss_clip": 0.06394495, + "auxiliary_loss_mlp": 0.01265151, + "balance_loss_clip": 0.06272149, + "balance_loss_mlp": 0.01256836, + "epoch": 0.9340147301969036, + "flos": 23699384455680.0, + "grad_norm": 1.5632032653776713, + "language_loss": 0.74868226, + "learning_rate": 4.546011991495513e-08, + "loss": 0.8252787, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.08319092, + "step": 15535, + "time_per_iteration": 3.9140188694000244 + }, + { + "auxiliary_loss_clip": 0.06405044, + "auxiliary_loss_mlp": 0.0126181, + "balance_loss_clip": 0.06272762, + "balance_loss_mlp": 0.01253042, + "epoch": 0.9340748534495716, + "flos": 28661440193280.0, + "grad_norm": 1.9268953260365462, + "language_loss": 0.78152293, + "learning_rate": 4.537759158925292e-08, + "loss": 0.85819149, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.08770752, + "step": 15536, + "time_per_iteration": 2.5641329288482666 + }, + { + "auxiliary_loss_clip": 0.06401525, + "auxiliary_loss_mlp": 0.01264478, + "balance_loss_clip": 0.06270687, + "balance_loss_mlp": 0.01255269, + "epoch": 0.9341349767022396, + "flos": 24906530701440.0, + "grad_norm": 1.4301567901014753, + "language_loss": 0.80895746, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.88561743, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09210205, + "step": 15537, + "time_per_iteration": 2.5457959175109863 + }, + { + "auxiliary_loss_clip": 0.06404231, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06270302, + "balance_loss_mlp": 0.01254697, + "epoch": 0.9341950999549076, + "flos": 29067204389760.0, + "grad_norm": 1.8933325679633086, + "language_loss": 0.77954888, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.85623199, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09387207, + "step": 15538, + "time_per_iteration": 2.5645008087158203 + }, + { + "auxiliary_loss_clip": 0.06401729, + "auxiliary_loss_mlp": 0.01265221, + "balance_loss_clip": 0.06271592, + "balance_loss_mlp": 0.01255911, + "epoch": 0.9342552232075755, + "flos": 23593893765120.0, + "grad_norm": 1.685361007162288, + "language_loss": 0.7330637, + "learning_rate": 4.513045134151672e-08, + "loss": 0.80973315, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09301758, + "step": 15539, + "time_per_iteration": 2.5273890495300293 + }, + { + "auxiliary_loss_clip": 0.06399798, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01256314, + "epoch": 0.9343153464602435, + "flos": 36730325617920.0, + "grad_norm": 1.4282033939406924, + "language_loss": 0.65054214, + "learning_rate": 4.504821951247373e-08, + "loss": 0.72718728, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08392334, + "step": 15540, + "time_per_iteration": 2.713907241821289 + }, + { + "auxiliary_loss_clip": 0.06400084, + "auxiliary_loss_mlp": 0.01264105, + "balance_loss_clip": 0.06270008, + "balance_loss_mlp": 0.0125449, + "epoch": 0.9343754697129115, + "flos": 22243004640000.0, + "grad_norm": 1.9745672183993257, + "language_loss": 0.76623344, + "learning_rate": 4.496606181539864e-08, + "loss": 0.8428753, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09625244, + "step": 15541, + "time_per_iteration": 2.537337064743042 + }, + { + "auxiliary_loss_clip": 0.06400786, + "auxiliary_loss_mlp": 0.0126562, + "balance_loss_clip": 0.06271005, + "balance_loss_mlp": 0.01255827, + "epoch": 0.9344355929655794, + "flos": 29717128984320.0, + "grad_norm": 1.7902468110763983, + "language_loss": 0.675026, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.75169003, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09790039, + "step": 15542, + "time_per_iteration": 2.589301347732544 + }, + { + "auxiliary_loss_clip": 0.06398873, + "auxiliary_loss_mlp": 0.01264541, + "balance_loss_clip": 0.06269249, + "balance_loss_mlp": 0.01254998, + "epoch": 0.9344957162182475, + "flos": 18886438258560.0, + "grad_norm": 2.562374344000717, + "language_loss": 0.69583577, + "learning_rate": 4.480196882960907e-08, + "loss": 0.77246988, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09545898, + "step": 15543, + "time_per_iteration": 3.915339708328247 + }, + { + "auxiliary_loss_clip": 0.06405383, + "auxiliary_loss_mlp": 0.01263667, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01253564, + "epoch": 0.9345558394709154, + "flos": 27425181853440.0, + "grad_norm": 2.8974325946656303, + "language_loss": 0.70212889, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.77881944, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10101318, + "step": 15544, + "time_per_iteration": 2.552852153778076 + }, + { + "auxiliary_loss_clip": 0.06405076, + "auxiliary_loss_mlp": 0.01266145, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01256483, + "epoch": 0.9346159627235834, + "flos": 20747659875840.0, + "grad_norm": 1.548835129494503, + "language_loss": 0.77488774, + "learning_rate": 4.463817240903789e-08, + "loss": 0.85159993, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09661865, + "step": 15545, + "time_per_iteration": 2.561870813369751 + }, + { + "auxiliary_loss_clip": 0.0640424, + "auxiliary_loss_mlp": 0.01264324, + "balance_loss_clip": 0.0626965, + "balance_loss_mlp": 0.01254752, + "epoch": 0.9346760859762513, + "flos": 21075578029440.0, + "grad_norm": 1.423420905987788, + "language_loss": 0.69126034, + "learning_rate": 4.455638541847495e-08, + "loss": 0.767946, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.09576416, + "step": 15546, + "time_per_iteration": 2.504326581954956 + }, + { + "auxiliary_loss_clip": 0.06395329, + "auxiliary_loss_mlp": 0.0126154, + "balance_loss_clip": 0.0626837, + "balance_loss_mlp": 0.01253082, + "epoch": 0.9347362092289193, + "flos": 29212540496640.0, + "grad_norm": 1.728698051619845, + "language_loss": 0.82426834, + "learning_rate": 4.447467257852966e-08, + "loss": 0.90083706, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08459473, + "step": 15547, + "time_per_iteration": 2.564218044281006 + }, + { + "auxiliary_loss_clip": 0.06397106, + "auxiliary_loss_mlp": 0.01264609, + "balance_loss_clip": 0.06268612, + "balance_loss_mlp": 0.01256104, + "epoch": 0.9347963324815872, + "flos": 19433429712000.0, + "grad_norm": 1.7812542299870269, + "language_loss": 0.83993661, + "learning_rate": 4.439303389230087e-08, + "loss": 0.91655374, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08508301, + "step": 15548, + "time_per_iteration": 2.4733710289001465 + }, + { + "auxiliary_loss_clip": 0.06411811, + "auxiliary_loss_mlp": 0.01266367, + "balance_loss_clip": 0.06275804, + "balance_loss_mlp": 0.01255292, + "epoch": 0.9348564557342552, + "flos": 36910475896320.0, + "grad_norm": 1.6747966040501179, + "language_loss": 0.65960097, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.73638272, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.11065674, + "step": 15549, + "time_per_iteration": 4.084869623184204 + }, + { + "auxiliary_loss_clip": 0.06403033, + "auxiliary_loss_mlp": 0.01262193, + "balance_loss_clip": 0.06271501, + "balance_loss_mlp": 0.01252752, + "epoch": 0.9349165789869232, + "flos": 21696684019200.0, + "grad_norm": 1.700964891054384, + "language_loss": 0.8061015, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.88275379, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09436035, + "step": 15550, + "time_per_iteration": 2.499324083328247 + }, + { + "auxiliary_loss_clip": 0.06405445, + "auxiliary_loss_mlp": 0.01265455, + "balance_loss_clip": 0.06276134, + "balance_loss_mlp": 0.01255978, + "epoch": 0.9349767022395912, + "flos": 18850114713600.0, + "grad_norm": 1.7308471893198725, + "language_loss": 0.7611016, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.83781064, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09472656, + "step": 15551, + "time_per_iteration": 2.489948272705078 + }, + { + "auxiliary_loss_clip": 0.06396884, + "auxiliary_loss_mlp": 0.01262996, + "balance_loss_clip": 0.06270495, + "balance_loss_mlp": 0.01255111, + "epoch": 0.9350368254922591, + "flos": 24980477529600.0, + "grad_norm": 1.5471348014278214, + "language_loss": 0.73827606, + "learning_rate": 4.406722074642255e-08, + "loss": 0.81487489, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.07891846, + "step": 15552, + "time_per_iteration": 2.5838027000427246 + }, + { + "auxiliary_loss_clip": 0.06398878, + "auxiliary_loss_mlp": 0.01266903, + "balance_loss_clip": 0.06268165, + "balance_loss_mlp": 0.01257813, + "epoch": 0.9350969487449271, + "flos": 23076391749120.0, + "grad_norm": 1.569356822541186, + "language_loss": 0.77291447, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.8495723, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09088135, + "step": 15553, + "time_per_iteration": 3.920443534851074 + }, + { + "auxiliary_loss_clip": 0.06403033, + "auxiliary_loss_mlp": 0.0126984, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01259314, + "epoch": 0.9351570719975951, + "flos": 18631209121920.0, + "grad_norm": 1.5901890244896573, + "language_loss": 0.78230381, + "learning_rate": 4.390475917613723e-08, + "loss": 0.85903263, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10522461, + "step": 15554, + "time_per_iteration": 2.5145413875579834 + }, + { + "auxiliary_loss_clip": 0.06394763, + "auxiliary_loss_mlp": 0.01263815, + "balance_loss_clip": 0.06269139, + "balance_loss_mlp": 0.01255966, + "epoch": 0.935217195250263, + "flos": 15893862013440.0, + "grad_norm": 1.5004203898596764, + "language_loss": 0.68972766, + "learning_rate": 4.382363965244695e-08, + "loss": 0.76631343, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.07843018, + "step": 15555, + "time_per_iteration": 2.478994369506836 + }, + { + "auxiliary_loss_clip": 0.0639784, + "auxiliary_loss_mlp": 0.01264208, + "balance_loss_clip": 0.06269526, + "balance_loss_mlp": 0.01254689, + "epoch": 0.935277318502931, + "flos": 24397372166400.0, + "grad_norm": 1.6233160508843345, + "language_loss": 0.75448465, + "learning_rate": 4.374259430715965e-08, + "loss": 0.83110511, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09515381, + "step": 15556, + "time_per_iteration": 2.5654189586639404 + }, + { + "auxiliary_loss_clip": 0.06400485, + "auxiliary_loss_mlp": 0.01265083, + "balance_loss_clip": 0.06270866, + "balance_loss_mlp": 0.01256327, + "epoch": 0.935337441755599, + "flos": 27607721973120.0, + "grad_norm": 1.4937701005093391, + "language_loss": 0.72718519, + "learning_rate": 4.366162314334953e-08, + "loss": 0.80384088, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08746338, + "step": 15557, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.06403461, + "auxiliary_loss_mlp": 0.01266447, + "balance_loss_clip": 0.0627244, + "balance_loss_mlp": 0.01256844, + "epoch": 0.935397565008267, + "flos": 20488699232640.0, + "grad_norm": 1.479053055288317, + "language_loss": 0.63463771, + "learning_rate": 4.358072616408681e-08, + "loss": 0.71133679, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09594727, + "step": 15558, + "time_per_iteration": 2.4923977851867676 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.0126766, + "balance_loss_clip": 0.06272481, + "balance_loss_mlp": 0.01257456, + "epoch": 0.9354576882609349, + "flos": 23660293726080.0, + "grad_norm": 1.7353882784834274, + "language_loss": 0.73151875, + "learning_rate": 4.34999033724388e-08, + "loss": 0.80822068, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10198975, + "step": 15559, + "time_per_iteration": 2.5124833583831787 + }, + { + "auxiliary_loss_clip": 0.06400333, + "auxiliary_loss_mlp": 0.01260437, + "balance_loss_clip": 0.0627114, + "balance_loss_mlp": 0.01252236, + "epoch": 0.9355178115136029, + "flos": 36693834364800.0, + "grad_norm": 1.5332066334129346, + "language_loss": 0.64076531, + "learning_rate": 4.341915477147062e-08, + "loss": 0.71737301, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08197021, + "step": 15560, + "time_per_iteration": 2.6758434772491455 + }, + { + "auxiliary_loss_clip": 0.06415723, + "auxiliary_loss_mlp": 0.01267002, + "balance_loss_clip": 0.06274995, + "balance_loss_mlp": 0.01255785, + "epoch": 0.9355779347662708, + "flos": 14464833356160.0, + "grad_norm": 2.091115456103633, + "language_loss": 0.64280677, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.71963406, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.11218262, + "step": 15561, + "time_per_iteration": 2.5375335216522217 + }, + { + "auxiliary_loss_clip": 0.06398933, + "auxiliary_loss_mlp": 0.01266271, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.0125668, + "epoch": 0.9356380580189388, + "flos": 23192783470080.0, + "grad_norm": 2.046301744114267, + "language_loss": 0.7559768, + "learning_rate": 4.325788015381859e-08, + "loss": 0.83262885, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09594727, + "step": 15562, + "time_per_iteration": 2.5097131729125977 + }, + { + "auxiliary_loss_clip": 0.06310297, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_clip": 0.06256372, + "balance_loss_mlp": 0.01248288, + "epoch": 0.9356981812716068, + "flos": 67490592480000.0, + "grad_norm": 0.9299181656084027, + "language_loss": 0.62328547, + "learning_rate": 4.31773541432503e-08, + "loss": 0.69888246, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01116943, + "step": 15563, + "time_per_iteration": 2.9744601249694824 + }, + { + "auxiliary_loss_clip": 0.06396849, + "auxiliary_loss_mlp": 0.0126558, + "balance_loss_clip": 0.06269947, + "balance_loss_mlp": 0.01256756, + "epoch": 0.9357583045242748, + "flos": 24688631358720.0, + "grad_norm": 1.421638923084558, + "language_loss": 0.78548312, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.8621074, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08816528, + "step": 15564, + "time_per_iteration": 2.528512954711914 + }, + { + "auxiliary_loss_clip": 0.06406452, + "auxiliary_loss_mlp": 0.01264834, + "balance_loss_clip": 0.06271413, + "balance_loss_mlp": 0.01255346, + "epoch": 0.9358184277769427, + "flos": 19469795184000.0, + "grad_norm": 2.174716619334903, + "language_loss": 0.78390223, + "learning_rate": 4.301652473389694e-08, + "loss": 0.86061513, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09490967, + "step": 15565, + "time_per_iteration": 2.4927587509155273 + }, + { + "auxiliary_loss_clip": 0.06398039, + "auxiliary_loss_mlp": 0.01262656, + "balance_loss_clip": 0.06270307, + "balance_loss_mlp": 0.01254275, + "epoch": 0.9358785510296107, + "flos": 18923055292800.0, + "grad_norm": 2.3604474699248086, + "language_loss": 0.72209811, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.7987051, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08380127, + "step": 15566, + "time_per_iteration": 2.469862222671509 + }, + { + "auxiliary_loss_clip": 0.06401025, + "auxiliary_loss_mlp": 0.01264516, + "balance_loss_clip": 0.06269009, + "balance_loss_mlp": 0.01255063, + "epoch": 0.9359386742822787, + "flos": 23448096460800.0, + "grad_norm": 1.7674222051319097, + "language_loss": 0.68101299, + "learning_rate": 4.285599216057889e-08, + "loss": 0.75766838, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09448242, + "step": 15567, + "time_per_iteration": 2.5092694759368896 + }, + { + "auxiliary_loss_clip": 0.06399126, + "auxiliary_loss_mlp": 0.01265065, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01255159, + "epoch": 0.9359987975349466, + "flos": 32752275903360.0, + "grad_norm": 1.9133350433830412, + "language_loss": 0.62613881, + "learning_rate": 4.277583719504418e-08, + "loss": 0.70278078, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09893799, + "step": 15568, + "time_per_iteration": 2.590184211730957 + }, + { + "auxiliary_loss_clip": 0.06399098, + "auxiliary_loss_mlp": 0.01262364, + "balance_loss_clip": 0.06269547, + "balance_loss_mlp": 0.01253251, + "epoch": 0.9360589207876147, + "flos": 22826151930240.0, + "grad_norm": 1.5331346258977052, + "language_loss": 0.79038656, + "learning_rate": 4.269575644764556e-08, + "loss": 0.86700118, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09118652, + "step": 15569, + "time_per_iteration": 2.547078847885132 + }, + { + "auxiliary_loss_clip": 0.06405905, + "auxiliary_loss_mlp": 0.01266784, + "balance_loss_clip": 0.06274277, + "balance_loss_mlp": 0.01257414, + "epoch": 0.9361190440402826, + "flos": 20891318901120.0, + "grad_norm": 2.1386136697606517, + "language_loss": 0.70064366, + "learning_rate": 4.261574992142014e-08, + "loss": 0.77737057, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09368896, + "step": 15570, + "time_per_iteration": 2.511286973953247 + }, + { + "auxiliary_loss_clip": 0.06400268, + "auxiliary_loss_mlp": 0.0126654, + "balance_loss_clip": 0.06268404, + "balance_loss_mlp": 0.01256568, + "epoch": 0.9361791672929506, + "flos": 19323872098560.0, + "grad_norm": 1.9820727131819575, + "language_loss": 0.79175496, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.86842304, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09973145, + "step": 15571, + "time_per_iteration": 2.5219452381134033 + }, + { + "auxiliary_loss_clip": 0.06402998, + "auxiliary_loss_mlp": 0.0126496, + "balance_loss_clip": 0.06271084, + "balance_loss_mlp": 0.01255965, + "epoch": 0.9362392905456185, + "flos": 15163491899520.0, + "grad_norm": 1.9713117286932247, + "language_loss": 0.77583826, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.85251784, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08996582, + "step": 15572, + "time_per_iteration": 2.4837546348571777 + }, + { + "auxiliary_loss_clip": 0.06396253, + "auxiliary_loss_mlp": 0.01264793, + "balance_loss_clip": 0.0626861, + "balance_loss_mlp": 0.01255775, + "epoch": 0.9362994137982865, + "flos": 22091589112320.0, + "grad_norm": 1.592764345612902, + "language_loss": 0.78254807, + "learning_rate": 4.237617570010688e-08, + "loss": 0.85915852, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.09014893, + "step": 15573, + "time_per_iteration": 2.538482427597046 + }, + { + "auxiliary_loss_clip": 0.06395616, + "auxiliary_loss_mlp": 0.01265839, + "balance_loss_clip": 0.06269381, + "balance_loss_mlp": 0.01257023, + "epoch": 0.9363595370509544, + "flos": 23518772979840.0, + "grad_norm": 1.4505342920053566, + "language_loss": 0.74485445, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.82146895, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08813477, + "step": 15574, + "time_per_iteration": 3.9295005798339844 + }, + { + "auxiliary_loss_clip": 0.06395365, + "auxiliary_loss_mlp": 0.01261285, + "balance_loss_clip": 0.06269053, + "balance_loss_mlp": 0.01252112, + "epoch": 0.9364196603036224, + "flos": 27130442716800.0, + "grad_norm": 2.361043736430351, + "language_loss": 0.68279696, + "learning_rate": 4.221683071397564e-08, + "loss": 0.75936341, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09173584, + "step": 15575, + "time_per_iteration": 2.606562852859497 + }, + { + "auxiliary_loss_clip": 0.06395829, + "auxiliary_loss_mlp": 0.01265677, + "balance_loss_clip": 0.06269231, + "balance_loss_mlp": 0.01256021, + "epoch": 0.9364797835562904, + "flos": 18485034474240.0, + "grad_norm": 1.6188828089297882, + "language_loss": 0.65445733, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.73107237, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.09655762, + "step": 15576, + "time_per_iteration": 2.5505363941192627 + }, + { + "auxiliary_loss_clip": 0.06399767, + "auxiliary_loss_mlp": 0.01265648, + "balance_loss_clip": 0.06267945, + "balance_loss_mlp": 0.01255402, + "epoch": 0.9365399068089584, + "flos": 13010507965440.0, + "grad_norm": 2.0026006343601725, + "language_loss": 0.76252437, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.8391785, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10247803, + "step": 15577, + "time_per_iteration": 2.5032527446746826 + }, + { + "auxiliary_loss_clip": 0.06399457, + "auxiliary_loss_mlp": 0.01266406, + "balance_loss_clip": 0.06267272, + "balance_loss_mlp": 0.01256828, + "epoch": 0.9366000300616263, + "flos": 25673559776640.0, + "grad_norm": 2.037972462404189, + "language_loss": 0.52709925, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.60375792, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.0958252, + "step": 15578, + "time_per_iteration": 2.575258731842041 + }, + { + "auxiliary_loss_clip": 0.0639855, + "auxiliary_loss_mlp": 0.01265718, + "balance_loss_clip": 0.06271734, + "balance_loss_mlp": 0.01256956, + "epoch": 0.9366601533142943, + "flos": 21439652019840.0, + "grad_norm": 1.6897117136078763, + "language_loss": 0.70794189, + "learning_rate": 4.189903163783692e-08, + "loss": 0.78458452, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08746338, + "step": 15579, + "time_per_iteration": 2.5197277069091797 + }, + { + "auxiliary_loss_clip": 0.06398612, + "auxiliary_loss_mlp": 0.01261657, + "balance_loss_clip": 0.06269911, + "balance_loss_mlp": 0.01252639, + "epoch": 0.9367202765669622, + "flos": 24099362720640.0, + "grad_norm": 1.789359287334025, + "language_loss": 0.76805091, + "learning_rate": 4.181976748973959e-08, + "loss": 0.84465355, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09014893, + "step": 15580, + "time_per_iteration": 2.5272631645202637 + }, + { + "auxiliary_loss_clip": 0.0640737, + "auxiliary_loss_mlp": 0.01263031, + "balance_loss_clip": 0.06271207, + "balance_loss_mlp": 0.01252285, + "epoch": 0.9367803998196302, + "flos": 20895511605120.0, + "grad_norm": 4.066229369441099, + "language_loss": 0.66627061, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.74297458, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.10748291, + "step": 15581, + "time_per_iteration": 2.5539963245391846 + }, + { + "auxiliary_loss_clip": 0.06398203, + "auxiliary_loss_mlp": 0.01266342, + "balance_loss_clip": 0.0626883, + "balance_loss_mlp": 0.01256978, + "epoch": 0.9368405230722983, + "flos": 22570838939520.0, + "grad_norm": 1.4665280133275418, + "language_loss": 0.76610607, + "learning_rate": 4.166146195972042e-08, + "loss": 0.8427515, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09362793, + "step": 15582, + "time_per_iteration": 4.0959906578063965 + }, + { + "auxiliary_loss_clip": 0.06399594, + "auxiliary_loss_mlp": 0.01263756, + "balance_loss_clip": 0.06270076, + "balance_loss_mlp": 0.01254195, + "epoch": 0.9369006463249662, + "flos": 18886228623360.0, + "grad_norm": 1.845169870254204, + "language_loss": 0.74022168, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.81685519, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09558105, + "step": 15583, + "time_per_iteration": 2.6164638996124268 + }, + { + "auxiliary_loss_clip": 0.06408083, + "auxiliary_loss_mlp": 0.01269119, + "balance_loss_clip": 0.06271075, + "balance_loss_mlp": 0.01259439, + "epoch": 0.9369607695776342, + "flos": 26439750311040.0, + "grad_norm": 2.094527372320434, + "language_loss": 0.84255081, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.91932285, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09680176, + "step": 15584, + "time_per_iteration": 2.5555663108825684 + }, + { + "auxiliary_loss_clip": 0.0641142, + "auxiliary_loss_mlp": 0.01267981, + "balance_loss_clip": 0.06274232, + "balance_loss_mlp": 0.01257527, + "epoch": 0.9370208928303021, + "flos": 39576769142400.0, + "grad_norm": 1.4458449658506118, + "language_loss": 0.72188222, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.79867625, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10455322, + "step": 15585, + "time_per_iteration": 2.6535887718200684 + }, + { + "auxiliary_loss_clip": 0.06396037, + "auxiliary_loss_mlp": 0.01263316, + "balance_loss_clip": 0.06268879, + "balance_loss_mlp": 0.01254334, + "epoch": 0.9370810160829701, + "flos": 22969223976960.0, + "grad_norm": 2.3040990220175535, + "language_loss": 0.80541742, + "learning_rate": 4.134574204836316e-08, + "loss": 0.88201094, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08978271, + "step": 15586, + "time_per_iteration": 2.5093491077423096 + }, + { + "auxiliary_loss_clip": 0.06403472, + "auxiliary_loss_mlp": 0.01269686, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01260226, + "epoch": 0.937141139335638, + "flos": 23081590702080.0, + "grad_norm": 1.4938915537331265, + "language_loss": 0.76188564, + "learning_rate": 4.126699774396258e-08, + "loss": 0.83861721, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09460449, + "step": 15587, + "time_per_iteration": 2.5432510375976562 + }, + { + "auxiliary_loss_clip": 0.06406046, + "auxiliary_loss_mlp": 0.01262902, + "balance_loss_clip": 0.06271333, + "balance_loss_mlp": 0.01252721, + "epoch": 0.937201262588306, + "flos": 16361246488320.0, + "grad_norm": 2.7151633052231774, + "language_loss": 0.87725753, + "learning_rate": 4.118832771491387e-08, + "loss": 0.95394701, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10186768, + "step": 15588, + "time_per_iteration": 2.479767322540283 + }, + { + "auxiliary_loss_clip": 0.06396212, + "auxiliary_loss_mlp": 0.01263659, + "balance_loss_clip": 0.06270182, + "balance_loss_mlp": 0.01255195, + "epoch": 0.937261385840974, + "flos": 20200374933120.0, + "grad_norm": 1.6537690665751095, + "language_loss": 0.78271496, + "learning_rate": 4.11097319642002e-08, + "loss": 0.85931367, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08465576, + "step": 15589, + "time_per_iteration": 4.0159831047058105 + }, + { + "auxiliary_loss_clip": 0.06398676, + "auxiliary_loss_mlp": 0.01262823, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01253787, + "epoch": 0.937321509093642, + "flos": 18301781594880.0, + "grad_norm": 1.6602653892740842, + "language_loss": 0.778988, + "learning_rate": 4.103121049480163e-08, + "loss": 0.85560298, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09033203, + "step": 15590, + "time_per_iteration": 2.473738193511963 + }, + { + "auxiliary_loss_clip": 0.06412096, + "auxiliary_loss_mlp": 0.01267354, + "balance_loss_clip": 0.0627647, + "balance_loss_mlp": 0.01257591, + "epoch": 0.9373816323463099, + "flos": 25891710681600.0, + "grad_norm": 1.9786385015228094, + "language_loss": 0.71866137, + "learning_rate": 4.095276330969577e-08, + "loss": 0.79545587, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09765625, + "step": 15591, + "time_per_iteration": 2.570101022720337 + }, + { + "auxiliary_loss_clip": 0.06408433, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06272327, + "balance_loss_mlp": 0.01257058, + "epoch": 0.9374417555989779, + "flos": 27206234334720.0, + "grad_norm": 1.7598443823568033, + "language_loss": 0.53974843, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.61651254, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10906982, + "step": 15592, + "time_per_iteration": 4.081035137176514 + }, + { + "auxiliary_loss_clip": 0.06399275, + "auxiliary_loss_mlp": 0.01262346, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01253209, + "epoch": 0.9375018788516458, + "flos": 23627701687680.0, + "grad_norm": 1.5745760731175873, + "language_loss": 0.67514831, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.75176454, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09130859, + "step": 15593, + "time_per_iteration": 2.508760452270508 + }, + { + "auxiliary_loss_clip": 0.06400297, + "auxiliary_loss_mlp": 0.01263636, + "balance_loss_clip": 0.06268944, + "balance_loss_mlp": 0.01254385, + "epoch": 0.9375620021043138, + "flos": 22686098630400.0, + "grad_norm": 1.4850027564581405, + "language_loss": 0.74354887, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.82018816, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09259033, + "step": 15594, + "time_per_iteration": 2.5463995933532715 + }, + { + "auxiliary_loss_clip": 0.06398121, + "auxiliary_loss_mlp": 0.01263132, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01254197, + "epoch": 0.9376221253569819, + "flos": 27567121870080.0, + "grad_norm": 1.5092053336620472, + "language_loss": 0.73907506, + "learning_rate": 4.063971747165351e-08, + "loss": 0.8156876, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.0892334, + "step": 15595, + "time_per_iteration": 2.5729222297668457 + }, + { + "auxiliary_loss_clip": 0.06404946, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 0.06270959, + "balance_loss_mlp": 0.01256352, + "epoch": 0.9376822486096498, + "flos": 24136063608960.0, + "grad_norm": 1.892076191551823, + "language_loss": 0.7632336, + "learning_rate": 4.056164175257626e-08, + "loss": 0.83994108, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09454346, + "step": 15596, + "time_per_iteration": 2.5364673137664795 + }, + { + "auxiliary_loss_clip": 0.06401411, + "auxiliary_loss_mlp": 0.01268101, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01258492, + "epoch": 0.9377423718623178, + "flos": 22790666926080.0, + "grad_norm": 1.6042943416913158, + "language_loss": 0.78836501, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.86506015, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09606934, + "step": 15597, + "time_per_iteration": 2.5183331966400146 + }, + { + "auxiliary_loss_clip": 0.06409448, + "auxiliary_loss_mlp": 0.01263004, + "balance_loss_clip": 0.06272915, + "balance_loss_mlp": 0.0125299, + "epoch": 0.9378024951149857, + "flos": 19174427141760.0, + "grad_norm": 1.5171680951862323, + "language_loss": 0.812361, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.88908553, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.10009766, + "step": 15598, + "time_per_iteration": 2.547635078430176 + }, + { + "auxiliary_loss_clip": 0.06408492, + "auxiliary_loss_mlp": 0.01265418, + "balance_loss_clip": 0.06269473, + "balance_loss_mlp": 0.01255118, + "epoch": 0.9378626183676537, + "flos": 23510890696320.0, + "grad_norm": 3.988859299196599, + "language_loss": 0.62941587, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.70615494, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.10302734, + "step": 15599, + "time_per_iteration": 2.5061824321746826 + }, + { + "auxiliary_loss_clip": 0.06402044, + "auxiliary_loss_mlp": 0.01264248, + "balance_loss_clip": 0.06269282, + "balance_loss_mlp": 0.01255039, + "epoch": 0.9379227416203216, + "flos": 18411548843520.0, + "grad_norm": 1.7602701335437743, + "language_loss": 0.73915505, + "learning_rate": 4.0250081926821e-08, + "loss": 0.81581795, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09216309, + "step": 15600, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.06400068, + "auxiliary_loss_mlp": 0.01264599, + "balance_loss_clip": 0.06271948, + "balance_loss_mlp": 0.01255873, + "epoch": 0.9379828648729897, + "flos": 17827646866560.0, + "grad_norm": 1.946057242530572, + "language_loss": 0.70149601, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.77814269, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0871582, + "step": 15601, + "time_per_iteration": 2.4916884899139404 + }, + { + "auxiliary_loss_clip": 0.06305277, + "auxiliary_loss_mlp": 0.01251346, + "balance_loss_clip": 0.06251266, + "balance_loss_mlp": 0.01250292, + "epoch": 0.9380429881256576, + "flos": 68044376113920.0, + "grad_norm": 0.7389611059273472, + "language_loss": 0.5819695, + "learning_rate": 4.009474788561573e-08, + "loss": 0.65753579, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01054382, + "step": 15602, + "time_per_iteration": 3.2857046127319336 + }, + { + "auxiliary_loss_clip": 0.0640846, + "auxiliary_loss_mlp": 0.01264932, + "balance_loss_clip": 0.06275323, + "balance_loss_mlp": 0.01255675, + "epoch": 0.9381031113783256, + "flos": 20783228734080.0, + "grad_norm": 1.8320878544992856, + "language_loss": 0.71827531, + "learning_rate": 4.001719234324663e-08, + "loss": 0.79500926, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0925293, + "step": 15603, + "time_per_iteration": 2.523958921432495 + }, + { + "auxiliary_loss_clip": 0.0639255, + "auxiliary_loss_mlp": 0.01269196, + "balance_loss_clip": 0.0627018, + "balance_loss_mlp": 0.01260988, + "epoch": 0.9381632346309935, + "flos": 19030935824640.0, + "grad_norm": 1.5522803660196332, + "language_loss": 0.76325035, + "learning_rate": 3.993971112362171e-08, + "loss": 0.83986783, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.08203125, + "step": 15604, + "time_per_iteration": 2.572173595428467 + }, + { + "auxiliary_loss_clip": 0.06403452, + "auxiliary_loss_mlp": 0.01265098, + "balance_loss_clip": 0.06271511, + "balance_loss_mlp": 0.01255019, + "epoch": 0.9382233578836615, + "flos": 23520617769600.0, + "grad_norm": 1.891479976745369, + "language_loss": 0.65401727, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.73070276, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10070801, + "step": 15605, + "time_per_iteration": 2.520111083984375 + }, + { + "auxiliary_loss_clip": 0.06409709, + "auxiliary_loss_mlp": 0.01267027, + "balance_loss_clip": 0.06272443, + "balance_loss_mlp": 0.01256972, + "epoch": 0.9382834811363294, + "flos": 43077539600640.0, + "grad_norm": 1.699505727802155, + "language_loss": 0.67158365, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.7483511, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.1005249, + "step": 15606, + "time_per_iteration": 2.7113072872161865 + }, + { + "auxiliary_loss_clip": 0.06395191, + "auxiliary_loss_mlp": 0.01266636, + "balance_loss_clip": 0.06269544, + "balance_loss_mlp": 0.01258374, + "epoch": 0.9383436043889974, + "flos": 16441943569920.0, + "grad_norm": 1.6594414641865307, + "language_loss": 0.77971619, + "learning_rate": 3.970771343058166e-08, + "loss": 0.85633445, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.0826416, + "step": 15607, + "time_per_iteration": 2.479999303817749 + }, + { + "auxiliary_loss_clip": 0.06402883, + "auxiliary_loss_mlp": 0.01262038, + "balance_loss_clip": 0.06271037, + "balance_loss_mlp": 0.01252769, + "epoch": 0.9384037276416655, + "flos": 20746863262080.0, + "grad_norm": 1.7801054619230159, + "language_loss": 0.83052444, + "learning_rate": 3.963052953128776e-08, + "loss": 0.90717363, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0927124, + "step": 15608, + "time_per_iteration": 2.5650830268859863 + }, + { + "auxiliary_loss_clip": 0.06400616, + "auxiliary_loss_mlp": 0.01267206, + "balance_loss_clip": 0.06271856, + "balance_loss_mlp": 0.01257401, + "epoch": 0.9384638508943334, + "flos": 19068726816000.0, + "grad_norm": 1.6719463976708178, + "language_loss": 0.69115657, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.76783478, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09802246, + "step": 15609, + "time_per_iteration": 2.498767614364624 + }, + { + "auxiliary_loss_clip": 0.06405382, + "auxiliary_loss_mlp": 0.01263162, + "balance_loss_clip": 0.06270407, + "balance_loss_mlp": 0.01252899, + "epoch": 0.9385239741470014, + "flos": 23411730988800.0, + "grad_norm": 1.946583052250983, + "language_loss": 0.75374961, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.83043504, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1026001, + "step": 15610, + "time_per_iteration": 2.5711920261383057 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 0.06272253, + "balance_loss_mlp": 0.01253609, + "epoch": 0.9385840973996693, + "flos": 12829938416640.0, + "grad_norm": 1.6994334075613569, + "language_loss": 0.75466156, + "learning_rate": 3.939942386953987e-08, + "loss": 0.83132434, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08862305, + "step": 15611, + "time_per_iteration": 2.483549118041992 + }, + { + "auxiliary_loss_clip": 0.06401208, + "auxiliary_loss_mlp": 0.01265334, + "balance_loss_clip": 0.06270809, + "balance_loss_mlp": 0.01256107, + "epoch": 0.9386442206523373, + "flos": 15492416302080.0, + "grad_norm": 1.9065211631243921, + "language_loss": 0.66030884, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.73697424, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09222412, + "step": 15612, + "time_per_iteration": 2.4864282608032227 + }, + { + "auxiliary_loss_clip": 0.06397071, + "auxiliary_loss_mlp": 0.01262431, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01253508, + "epoch": 0.9387043439050052, + "flos": 21185219496960.0, + "grad_norm": 1.9313570682062124, + "language_loss": 0.5721032, + "learning_rate": 3.924572515435742e-08, + "loss": 0.64869821, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0892334, + "step": 15613, + "time_per_iteration": 2.4872400760650635 + }, + { + "auxiliary_loss_clip": 0.06404439, + "auxiliary_loss_mlp": 0.01269408, + "balance_loss_clip": 0.0627088, + "balance_loss_mlp": 0.01259853, + "epoch": 0.9387644671576733, + "flos": 27674918547840.0, + "grad_norm": 1.9388248320141126, + "language_loss": 0.70511746, + "learning_rate": 3.916898732330764e-08, + "loss": 0.78185594, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09545898, + "step": 15614, + "time_per_iteration": 3.943666934967041 + }, + { + "auxiliary_loss_clip": 0.06404942, + "auxiliary_loss_mlp": 0.01266663, + "balance_loss_clip": 0.06271072, + "balance_loss_mlp": 0.01256745, + "epoch": 0.9388245904103412, + "flos": 18841100400000.0, + "grad_norm": 1.8251928384631169, + "language_loss": 0.81327057, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.88998669, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09918213, + "step": 15615, + "time_per_iteration": 2.487769603729248 + }, + { + "auxiliary_loss_clip": 0.06396815, + "auxiliary_loss_mlp": 0.01261659, + "balance_loss_clip": 0.06269054, + "balance_loss_mlp": 0.01252486, + "epoch": 0.9388847136630092, + "flos": 25490893875840.0, + "grad_norm": 1.5481190981940673, + "language_loss": 0.71929944, + "learning_rate": 3.901573472884134e-08, + "loss": 0.79588419, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09173584, + "step": 15616, + "time_per_iteration": 2.5378410816192627 + }, + { + "auxiliary_loss_clip": 0.06402715, + "auxiliary_loss_mlp": 0.01264302, + "balance_loss_clip": 0.06272252, + "balance_loss_mlp": 0.01254909, + "epoch": 0.9389448369156771, + "flos": 18741102151680.0, + "grad_norm": 1.7803352890368735, + "language_loss": 0.66485155, + "learning_rate": 3.89392199712355e-08, + "loss": 0.74152172, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09387207, + "step": 15617, + "time_per_iteration": 2.5118300914764404 + }, + { + "auxiliary_loss_clip": 0.06406648, + "auxiliary_loss_mlp": 0.01264715, + "balance_loss_clip": 0.06271216, + "balance_loss_mlp": 0.01254273, + "epoch": 0.9390049601683451, + "flos": 21722945074560.0, + "grad_norm": 1.945035864880724, + "language_loss": 0.73652196, + "learning_rate": 3.886277957725092e-08, + "loss": 0.81323552, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10443115, + "step": 15618, + "time_per_iteration": 2.489013671875 + }, + { + "auxiliary_loss_clip": 0.06410211, + "auxiliary_loss_mlp": 0.01265609, + "balance_loss_clip": 0.06271806, + "balance_loss_mlp": 0.01255518, + "epoch": 0.939065083421013, + "flos": 19397357729280.0, + "grad_norm": 1.8849612887114653, + "language_loss": 0.70230412, + "learning_rate": 3.878641354978662e-08, + "loss": 0.77906239, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.10089111, + "step": 15619, + "time_per_iteration": 2.50981068611145 + }, + { + "auxiliary_loss_clip": 0.06404421, + "auxiliary_loss_mlp": 0.01265416, + "balance_loss_clip": 0.06272148, + "balance_loss_mlp": 0.01255199, + "epoch": 0.939125206673681, + "flos": 24688505577600.0, + "grad_norm": 1.5841389932494754, + "language_loss": 0.77946162, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.85615999, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10217285, + "step": 15620, + "time_per_iteration": 2.541767120361328 + }, + { + "auxiliary_loss_clip": 0.06399068, + "auxiliary_loss_mlp": 0.01262232, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01253005, + "epoch": 0.9391853299263491, + "flos": 16331505488640.0, + "grad_norm": 2.1364779923575026, + "language_loss": 0.73495758, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.81157064, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09222412, + "step": 15621, + "time_per_iteration": 2.5422234535217285 + }, + { + "auxiliary_loss_clip": 0.06412639, + "auxiliary_loss_mlp": 0.01269163, + "balance_loss_clip": 0.06276237, + "balance_loss_mlp": 0.01258935, + "epoch": 0.939245453179017, + "flos": 11660541235200.0, + "grad_norm": 2.3036117116482524, + "language_loss": 0.67062247, + "learning_rate": 3.855776169545688e-08, + "loss": 0.74744046, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10235596, + "step": 15622, + "time_per_iteration": 3.9106016159057617 + }, + { + "auxiliary_loss_clip": 0.06401062, + "auxiliary_loss_mlp": 0.01266555, + "balance_loss_clip": 0.06272039, + "balance_loss_mlp": 0.01257781, + "epoch": 0.939305576431685, + "flos": 23155369822080.0, + "grad_norm": 1.6184026237616547, + "language_loss": 0.71614575, + "learning_rate": 3.848169316300209e-08, + "loss": 0.79282188, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08770752, + "step": 15623, + "time_per_iteration": 2.535764694213867 + }, + { + "auxiliary_loss_clip": 0.06404904, + "auxiliary_loss_mlp": 0.01267971, + "balance_loss_clip": 0.06273766, + "balance_loss_mlp": 0.01258458, + "epoch": 0.9393656996843529, + "flos": 33295493923200.0, + "grad_norm": 1.8929766893988849, + "language_loss": 0.72837877, + "learning_rate": 3.84056990115178e-08, + "loss": 0.80510753, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09515381, + "step": 15624, + "time_per_iteration": 2.6262624263763428 + }, + { + "auxiliary_loss_clip": 0.06399508, + "auxiliary_loss_mlp": 0.01269514, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01260263, + "epoch": 0.9394258229370209, + "flos": 21695887405440.0, + "grad_norm": 1.7875404465361746, + "language_loss": 0.89779687, + "learning_rate": 3.832977924388614e-08, + "loss": 0.97448707, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09240723, + "step": 15625, + "time_per_iteration": 2.531123161315918 + }, + { + "auxiliary_loss_clip": 0.06399558, + "auxiliary_loss_mlp": 0.01262032, + "balance_loss_clip": 0.06269208, + "balance_loss_mlp": 0.01252478, + "epoch": 0.9394859461896888, + "flos": 23880289420800.0, + "grad_norm": 4.111605423444732, + "language_loss": 0.83748984, + "learning_rate": 3.825393386298592e-08, + "loss": 0.91410571, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09545898, + "step": 15626, + "time_per_iteration": 2.5196032524108887 + }, + { + "auxiliary_loss_clip": 0.06308495, + "auxiliary_loss_mlp": 0.01251926, + "balance_loss_clip": 0.06254559, + "balance_loss_mlp": 0.0125083, + "epoch": 0.9395460694423569, + "flos": 61584963114240.0, + "grad_norm": 0.7637423536356234, + "language_loss": 0.56075698, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.63636124, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0109787, + "step": 15627, + "time_per_iteration": 3.1151974201202393 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.06269808, + "balance_loss_mlp": 0.01254522, + "epoch": 0.9396061926950248, + "flos": 21001966617600.0, + "grad_norm": 2.0549661543440725, + "language_loss": 0.70356309, + "learning_rate": 3.810246627288105e-08, + "loss": 0.78018951, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08380127, + "step": 15628, + "time_per_iteration": 3.9435250759124756 + }, + { + "auxiliary_loss_clip": 0.06402381, + "auxiliary_loss_mlp": 0.01264022, + "balance_loss_clip": 0.06272991, + "balance_loss_mlp": 0.01255188, + "epoch": 0.9396663159476928, + "flos": 27494726342400.0, + "grad_norm": 1.608508127182665, + "language_loss": 0.7580415, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.83470553, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08837891, + "step": 15629, + "time_per_iteration": 2.6277477741241455 + }, + { + "auxiliary_loss_clip": 0.06394442, + "auxiliary_loss_mlp": 0.01261005, + "balance_loss_clip": 0.06268346, + "balance_loss_mlp": 0.01252273, + "epoch": 0.9397264392003607, + "flos": 19433555493120.0, + "grad_norm": 1.7353515662757615, + "language_loss": 0.74587202, + "learning_rate": 3.795129626417748e-08, + "loss": 0.8224265, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08734131, + "step": 15630, + "time_per_iteration": 2.5997049808502197 + }, + { + "auxiliary_loss_clip": 0.06399633, + "auxiliary_loss_mlp": 0.01262857, + "balance_loss_clip": 0.06272737, + "balance_loss_mlp": 0.01254709, + "epoch": 0.9397865624530287, + "flos": 18010732037760.0, + "grad_norm": 1.9830238944989997, + "language_loss": 0.69652402, + "learning_rate": 3.787582286001845e-08, + "loss": 0.77314889, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08154297, + "step": 15631, + "time_per_iteration": 2.516963481903076 + }, + { + "auxiliary_loss_clip": 0.06397713, + "auxiliary_loss_mlp": 0.01265126, + "balance_loss_clip": 0.06269372, + "balance_loss_mlp": 0.01255822, + "epoch": 0.9398466857056966, + "flos": 22571132428800.0, + "grad_norm": 1.4686823843430021, + "language_loss": 0.75433683, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.83096522, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09307861, + "step": 15632, + "time_per_iteration": 3.9681499004364014 + }, + { + "auxiliary_loss_clip": 0.06407969, + "auxiliary_loss_mlp": 0.01263576, + "balance_loss_clip": 0.06271549, + "balance_loss_mlp": 0.01253222, + "epoch": 0.9399068089583646, + "flos": 24542666346240.0, + "grad_norm": 2.093894657159583, + "language_loss": 0.7490074, + "learning_rate": 3.772509926639622e-08, + "loss": 0.82572281, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10351562, + "step": 15633, + "time_per_iteration": 2.5607848167419434 + }, + { + "auxiliary_loss_clip": 0.06401691, + "auxiliary_loss_mlp": 0.01266106, + "balance_loss_clip": 0.06268854, + "balance_loss_mlp": 0.01255801, + "epoch": 0.9399669322110327, + "flos": 25637529720960.0, + "grad_norm": 1.842729170438083, + "language_loss": 0.72873878, + "learning_rate": 3.764984908264823e-08, + "loss": 0.8054167, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10296631, + "step": 15634, + "time_per_iteration": 2.5304877758026123 + }, + { + "auxiliary_loss_clip": 0.06408176, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06273288, + "balance_loss_mlp": 0.01254405, + "epoch": 0.9400270554637006, + "flos": 17094593422080.0, + "grad_norm": 1.5847517594895608, + "language_loss": 0.69334674, + "learning_rate": 3.75746733114144e-08, + "loss": 0.77007252, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09997559, + "step": 15635, + "time_per_iteration": 2.5305612087249756 + }, + { + "auxiliary_loss_clip": 0.06394704, + "auxiliary_loss_mlp": 0.01261499, + "balance_loss_clip": 0.06268582, + "balance_loss_mlp": 0.01252845, + "epoch": 0.9400871787163686, + "flos": 22061764258560.0, + "grad_norm": 1.5394095539238604, + "language_loss": 0.74523485, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.82179689, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08654785, + "step": 15636, + "time_per_iteration": 2.5429651737213135 + }, + { + "auxiliary_loss_clip": 0.06401463, + "auxiliary_loss_mlp": 0.01262977, + "balance_loss_clip": 0.06270332, + "balance_loss_mlp": 0.01253553, + "epoch": 0.9401473019690365, + "flos": 16988431898880.0, + "grad_norm": 1.907903865743405, + "language_loss": 0.83414614, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.91079056, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09423828, + "step": 15637, + "time_per_iteration": 2.5974667072296143 + }, + { + "auxiliary_loss_clip": 0.06402609, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269225, + "balance_loss_mlp": 0.01256391, + "epoch": 0.9402074252217045, + "flos": 19687946088960.0, + "grad_norm": 2.20612132803902, + "language_loss": 0.69127619, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.76796037, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09417725, + "step": 15638, + "time_per_iteration": 2.5368309020996094 + }, + { + "auxiliary_loss_clip": 0.06396491, + "auxiliary_loss_mlp": 0.01264929, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01256448, + "epoch": 0.9402675484743724, + "flos": 24761278448640.0, + "grad_norm": 1.57830953149631, + "language_loss": 0.848472, + "learning_rate": 3.727471440859498e-08, + "loss": 0.92508614, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08477783, + "step": 15639, + "time_per_iteration": 2.528841257095337 + }, + { + "auxiliary_loss_clip": 0.06401523, + "auxiliary_loss_mlp": 0.01262071, + "balance_loss_clip": 0.0626966, + "balance_loss_mlp": 0.01253255, + "epoch": 0.9403276717270405, + "flos": 25566014661120.0, + "grad_norm": 1.4451560995387316, + "language_loss": 0.784464, + "learning_rate": 3.719991074263662e-08, + "loss": 0.86109996, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.08813477, + "step": 15640, + "time_per_iteration": 2.539466619491577 + }, + { + "auxiliary_loss_clip": 0.06403446, + "auxiliary_loss_mlp": 0.01264478, + "balance_loss_clip": 0.06269012, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9403877949797084, + "flos": 26697453143040.0, + "grad_norm": 1.431088063022994, + "language_loss": 0.74448258, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.82116181, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09063721, + "step": 15641, + "time_per_iteration": 2.5854341983795166 + }, + { + "auxiliary_loss_clip": 0.06407844, + "auxiliary_loss_mlp": 0.01268272, + "balance_loss_clip": 0.06270669, + "balance_loss_mlp": 0.01257466, + "epoch": 0.9404479182323764, + "flos": 15016856054400.0, + "grad_norm": 1.9684805464288027, + "language_loss": 0.82889009, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.90565127, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10803223, + "step": 15642, + "time_per_iteration": 2.4660263061523438 + }, + { + "auxiliary_loss_clip": 0.06396569, + "auxiliary_loss_mlp": 0.01267, + "balance_loss_clip": 0.0626855, + "balance_loss_mlp": 0.01258441, + "epoch": 0.9405080414850443, + "flos": 24980645237760.0, + "grad_norm": 1.7977314668470241, + "language_loss": 0.68295997, + "learning_rate": 3.697594633355084e-08, + "loss": 0.75959563, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08557129, + "step": 15643, + "time_per_iteration": 2.5947160720825195 + }, + { + "auxiliary_loss_clip": 0.06406666, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 0.06273131, + "balance_loss_mlp": 0.0125535, + "epoch": 0.9405681647377123, + "flos": 20850131819520.0, + "grad_norm": 1.9653990343363072, + "language_loss": 0.76734209, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.84405702, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.0947876, + "step": 15644, + "time_per_iteration": 2.5140726566314697 + }, + { + "auxiliary_loss_clip": 0.06398097, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 0.06269826, + "balance_loss_mlp": 0.0125598, + "epoch": 0.9406282879903802, + "flos": 23812380086400.0, + "grad_norm": 1.5018434731522488, + "language_loss": 0.6776011, + "learning_rate": 3.682700891311974e-08, + "loss": 0.75422859, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08685303, + "step": 15645, + "time_per_iteration": 2.5149364471435547 + }, + { + "auxiliary_loss_clip": 0.06395334, + "auxiliary_loss_mlp": 0.01266219, + "balance_loss_clip": 0.06269147, + "balance_loss_mlp": 0.01257261, + "epoch": 0.9406884112430483, + "flos": 27682716977280.0, + "grad_norm": 1.3496847114989412, + "language_loss": 0.70362568, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.78024125, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08953857, + "step": 15646, + "time_per_iteration": 2.565032958984375 + }, + { + "auxiliary_loss_clip": 0.06398815, + "auxiliary_loss_mlp": 0.01261727, + "balance_loss_clip": 0.06269072, + "balance_loss_mlp": 0.01253018, + "epoch": 0.9407485344957163, + "flos": 23081590702080.0, + "grad_norm": 1.5299768389325743, + "language_loss": 0.74550891, + "learning_rate": 3.667836926755208e-08, + "loss": 0.82211429, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08709717, + "step": 15647, + "time_per_iteration": 2.52329158782959 + }, + { + "auxiliary_loss_clip": 0.06308979, + "auxiliary_loss_mlp": 0.01247889, + "balance_loss_clip": 0.06254758, + "balance_loss_mlp": 0.01246815, + "epoch": 0.9408086577483842, + "flos": 71034143247360.0, + "grad_norm": 0.8645069850890814, + "language_loss": 0.63526928, + "learning_rate": 3.660416111738907e-08, + "loss": 0.71083796, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01075745, + "step": 15648, + "time_per_iteration": 3.233332872390747 + }, + { + "auxiliary_loss_clip": 0.06401809, + "auxiliary_loss_mlp": 0.01261765, + "balance_loss_clip": 0.06273667, + "balance_loss_mlp": 0.01253027, + "epoch": 0.9408687810010522, + "flos": 23737468936320.0, + "grad_norm": 1.3199036053586422, + "language_loss": 0.66599685, + "learning_rate": 3.653002741939337e-08, + "loss": 0.74263257, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08740234, + "step": 15649, + "time_per_iteration": 2.5568881034851074 + }, + { + "auxiliary_loss_clip": 0.06399603, + "auxiliary_loss_mlp": 0.01263735, + "balance_loss_clip": 0.06268597, + "balance_loss_mlp": 0.01254967, + "epoch": 0.9409289042537201, + "flos": 18375225298560.0, + "grad_norm": 2.070554549702626, + "language_loss": 0.77568704, + "learning_rate": 3.645596817637586e-08, + "loss": 0.85232043, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08770752, + "step": 15650, + "time_per_iteration": 2.4954206943511963 + }, + { + "auxiliary_loss_clip": 0.06402092, + "auxiliary_loss_mlp": 0.01263238, + "balance_loss_clip": 0.06272111, + "balance_loss_mlp": 0.01254596, + "epoch": 0.9409890275063881, + "flos": 23885111030400.0, + "grad_norm": 1.6619608167936917, + "language_loss": 0.74290323, + "learning_rate": 3.638198339114451e-08, + "loss": 0.81955653, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08630371, + "step": 15651, + "time_per_iteration": 2.5114126205444336 + }, + { + "auxiliary_loss_clip": 0.06400727, + "auxiliary_loss_mlp": 0.01262851, + "balance_loss_clip": 0.06271733, + "balance_loss_mlp": 0.01253607, + "epoch": 0.941049150759056, + "flos": 16550704569600.0, + "grad_norm": 1.9371023578664908, + "language_loss": 0.72369295, + "learning_rate": 3.630807306650507e-08, + "loss": 0.80032873, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.0925293, + "step": 15652, + "time_per_iteration": 2.490548849105835 + }, + { + "auxiliary_loss_clip": 0.06408902, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 0.06270728, + "balance_loss_mlp": 0.01254592, + "epoch": 0.9411092740117241, + "flos": 25125310512000.0, + "grad_norm": 1.5890222954041313, + "language_loss": 0.66336501, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.7401008, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10076904, + "step": 15653, + "time_per_iteration": 3.934385061264038 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01264013, + "balance_loss_clip": 0.06269339, + "balance_loss_mlp": 0.01253767, + "epoch": 0.941169397264392, + "flos": 21148644389760.0, + "grad_norm": 1.8935835038310136, + "language_loss": 0.78094435, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.85761338, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10247803, + "step": 15654, + "time_per_iteration": 2.5380873680114746 + }, + { + "auxiliary_loss_clip": 0.06412641, + "auxiliary_loss_mlp": 0.01263841, + "balance_loss_clip": 0.06273723, + "balance_loss_mlp": 0.01254489, + "epoch": 0.94122952051706, + "flos": 38518103531520.0, + "grad_norm": 2.6656013558269, + "language_loss": 0.70125389, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.77801865, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09350586, + "step": 15655, + "time_per_iteration": 2.662172317504883 + }, + { + "auxiliary_loss_clip": 0.06398033, + "auxiliary_loss_mlp": 0.01262956, + "balance_loss_clip": 0.0626789, + "balance_loss_mlp": 0.01253169, + "epoch": 0.9412896437697279, + "flos": 18375099517440.0, + "grad_norm": 1.6963116521742299, + "language_loss": 0.7260558, + "learning_rate": 3.601317642987944e-08, + "loss": 0.80266565, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09783936, + "step": 15656, + "time_per_iteration": 2.478374481201172 + }, + { + "auxiliary_loss_clip": 0.06401219, + "auxiliary_loss_mlp": 0.01263672, + "balance_loss_clip": 0.06271031, + "balance_loss_mlp": 0.01254182, + "epoch": 0.9413497670223959, + "flos": 25892046097920.0, + "grad_norm": 1.89156015011812, + "language_loss": 0.78345996, + "learning_rate": 3.593963845018377e-08, + "loss": 0.86010885, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09490967, + "step": 15657, + "time_per_iteration": 2.5166099071502686 + }, + { + "auxiliary_loss_clip": 0.06401125, + "auxiliary_loss_mlp": 0.01265038, + "balance_loss_clip": 0.06268708, + "balance_loss_mlp": 0.01255758, + "epoch": 0.9414098902750638, + "flos": 16623980565120.0, + "grad_norm": 3.3736293450967505, + "language_loss": 0.84897089, + "learning_rate": 3.586617494785371e-08, + "loss": 0.92563248, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09289551, + "step": 15658, + "time_per_iteration": 2.4719231128692627 + }, + { + "auxiliary_loss_clip": 0.06407331, + "auxiliary_loss_mlp": 0.01266897, + "balance_loss_clip": 0.06271299, + "balance_loss_mlp": 0.01256049, + "epoch": 0.9414700135277319, + "flos": 18631041413760.0, + "grad_norm": 2.0197764771126936, + "language_loss": 0.71193194, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.78867424, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10848999, + "step": 15659, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0640014, + "auxiliary_loss_mlp": 0.01267204, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01258019, + "epoch": 0.9415301367803999, + "flos": 26286280358400.0, + "grad_norm": 1.643546636264258, + "language_loss": 0.79811406, + "learning_rate": 3.571947138643172e-08, + "loss": 0.87478751, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09191895, + "step": 15660, + "time_per_iteration": 2.5389978885650635 + }, + { + "auxiliary_loss_clip": 0.06393769, + "auxiliary_loss_mlp": 0.01262754, + "balance_loss_clip": 0.06267805, + "balance_loss_mlp": 0.01253766, + "epoch": 0.9415902600330678, + "flos": 23268617015040.0, + "grad_norm": 1.3569546875428349, + "language_loss": 0.68124604, + "learning_rate": 3.564623133290201e-08, + "loss": 0.75781125, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08984375, + "step": 15661, + "time_per_iteration": 3.984410285949707 + }, + { + "auxiliary_loss_clip": 0.06403223, + "auxiliary_loss_mlp": 0.01268302, + "balance_loss_clip": 0.06272446, + "balance_loss_mlp": 0.01258783, + "epoch": 0.9416503832857358, + "flos": 14724171342720.0, + "grad_norm": 1.8492726006521825, + "language_loss": 0.6645698, + "learning_rate": 3.557306576786434e-08, + "loss": 0.74128503, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09521484, + "step": 15662, + "time_per_iteration": 2.4829232692718506 + }, + { + "auxiliary_loss_clip": 0.06309918, + "auxiliary_loss_mlp": 0.01248909, + "balance_loss_clip": 0.06255955, + "balance_loss_mlp": 0.0124794, + "epoch": 0.9417105065384037, + "flos": 70331333927040.0, + "grad_norm": 0.7645309383813702, + "language_loss": 0.59303248, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.66862071, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00967407, + "step": 15663, + "time_per_iteration": 3.204615354537964 + }, + { + "auxiliary_loss_clip": 0.06405449, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269373, + "balance_loss_mlp": 0.01255437, + "epoch": 0.9417706297910717, + "flos": 34066380286080.0, + "grad_norm": 1.7732726183519205, + "language_loss": 0.66930187, + "learning_rate": 3.542695811435914e-08, + "loss": 0.74601436, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1036377, + "step": 15664, + "time_per_iteration": 2.6080029010772705 + }, + { + "auxiliary_loss_clip": 0.06399654, + "auxiliary_loss_mlp": 0.0126386, + "balance_loss_clip": 0.06270635, + "balance_loss_mlp": 0.01254973, + "epoch": 0.9418307530437396, + "flos": 16477135084800.0, + "grad_norm": 2.310935997550932, + "language_loss": 0.74091578, + "learning_rate": 3.535401603143207e-08, + "loss": 0.81755096, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08880615, + "step": 15665, + "time_per_iteration": 2.483211040496826 + }, + { + "auxiliary_loss_clip": 0.06396838, + "auxiliary_loss_mlp": 0.01264954, + "balance_loss_clip": 0.06268667, + "balance_loss_mlp": 0.01256026, + "epoch": 0.9418908762964077, + "flos": 11258089274880.0, + "grad_norm": 2.6110981514445366, + "language_loss": 0.6352722, + "learning_rate": 3.528114844807773e-08, + "loss": 0.7118901, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08929443, + "step": 15666, + "time_per_iteration": 2.5411856174468994 + }, + { + "auxiliary_loss_clip": 0.06402782, + "auxiliary_loss_mlp": 0.01263561, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01254063, + "epoch": 0.9419509995490756, + "flos": 18444182808960.0, + "grad_norm": 1.991838709857188, + "language_loss": 0.78680706, + "learning_rate": 3.520835536705902e-08, + "loss": 0.86347044, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09500122, + "step": 15667, + "time_per_iteration": 3.9452993869781494 + }, + { + "auxiliary_loss_clip": 0.06400198, + "auxiliary_loss_mlp": 0.01262756, + "balance_loss_clip": 0.06271772, + "balance_loss_mlp": 0.01254638, + "epoch": 0.9420111228017436, + "flos": 20743760661120.0, + "grad_norm": 1.6944923844867426, + "language_loss": 0.75551254, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.83214211, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08123779, + "step": 15668, + "time_per_iteration": 2.524935007095337 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01269735, + "balance_loss_clip": 0.06270599, + "balance_loss_mlp": 0.01260168, + "epoch": 0.9420712460544115, + "flos": 21148267046400.0, + "grad_norm": 2.4528189170116774, + "language_loss": 0.59678006, + "learning_rate": 3.506299272306723e-08, + "loss": 0.67351627, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09564209, + "step": 15669, + "time_per_iteration": 2.4999589920043945 + }, + { + "auxiliary_loss_clip": 0.06396198, + "auxiliary_loss_mlp": 0.0126024, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01251812, + "epoch": 0.9421313693070795, + "flos": 15857244979200.0, + "grad_norm": 1.4484921317506239, + "language_loss": 0.77208281, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.84864712, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08422852, + "step": 15670, + "time_per_iteration": 2.4799532890319824 + }, + { + "auxiliary_loss_clip": 0.06402656, + "auxiliary_loss_mlp": 0.01264404, + "balance_loss_clip": 0.06273, + "balance_loss_mlp": 0.01254748, + "epoch": 0.9421914925597474, + "flos": 32424106187520.0, + "grad_norm": 2.128403859031794, + "language_loss": 0.65426135, + "learning_rate": 3.491792812150574e-08, + "loss": 0.73093194, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09667969, + "step": 15671, + "time_per_iteration": 3.975170850753784 + }, + { + "auxiliary_loss_clip": 0.06401955, + "auxiliary_loss_mlp": 0.01267564, + "balance_loss_clip": 0.06270818, + "balance_loss_mlp": 0.01257986, + "epoch": 0.9422516158124155, + "flos": 19724521196160.0, + "grad_norm": 1.5351118428964867, + "language_loss": 0.79441094, + "learning_rate": 3.48455075935139e-08, + "loss": 0.87110615, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09576416, + "step": 15672, + "time_per_iteration": 2.4977033138275146 + }, + { + "auxiliary_loss_clip": 0.06408137, + "auxiliary_loss_mlp": 0.01264621, + "balance_loss_clip": 0.06270933, + "balance_loss_mlp": 0.01254214, + "epoch": 0.9423117390650835, + "flos": 16258858398720.0, + "grad_norm": 1.991030547608086, + "language_loss": 0.74059123, + "learning_rate": 3.47731615843776e-08, + "loss": 0.8173188, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10412598, + "step": 15673, + "time_per_iteration": 2.448622226715088 + }, + { + "auxiliary_loss_clip": 0.0639824, + "auxiliary_loss_mlp": 0.01263085, + "balance_loss_clip": 0.06268054, + "balance_loss_mlp": 0.01253715, + "epoch": 0.9423718623177514, + "flos": 31804803060480.0, + "grad_norm": 1.4075068342748132, + "language_loss": 0.70376456, + "learning_rate": 3.470089009683974e-08, + "loss": 0.78037775, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09368896, + "step": 15674, + "time_per_iteration": 2.5917158126831055 + }, + { + "auxiliary_loss_clip": 0.06401472, + "auxiliary_loss_mlp": 0.01264344, + "balance_loss_clip": 0.06269686, + "balance_loss_mlp": 0.01255684, + "epoch": 0.9424319855704194, + "flos": 23338622701440.0, + "grad_norm": 1.6840645348051175, + "language_loss": 0.81582546, + "learning_rate": 3.462869313364125e-08, + "loss": 0.89248359, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.08660889, + "step": 15675, + "time_per_iteration": 2.5051825046539307 + }, + { + "auxiliary_loss_clip": 0.06400142, + "auxiliary_loss_mlp": 0.01265582, + "balance_loss_clip": 0.0627027, + "balance_loss_mlp": 0.01257076, + "epoch": 0.9424921088230873, + "flos": 20783983420800.0, + "grad_norm": 1.5494780490790538, + "language_loss": 0.63124716, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.7079044, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08508301, + "step": 15676, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.06400351, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.06268977, + "balance_loss_mlp": 0.01254709, + "epoch": 0.9425522320757553, + "flos": 19032780614400.0, + "grad_norm": 1.7622357826868196, + "language_loss": 0.67433226, + "learning_rate": 3.448452279120984e-08, + "loss": 0.7509855, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1026001, + "step": 15677, + "time_per_iteration": 2.5142791271209717 + }, + { + "auxiliary_loss_clip": 0.06405545, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01253458, + "epoch": 0.9426123553284232, + "flos": 25162346816640.0, + "grad_norm": 1.7717990036864524, + "language_loss": 0.64982033, + "learning_rate": 3.441254941744387e-08, + "loss": 0.72651047, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10003662, + "step": 15678, + "time_per_iteration": 2.5930380821228027 + }, + { + "auxiliary_loss_clip": 0.06398059, + "auxiliary_loss_mlp": 0.01267241, + "balance_loss_clip": 0.06267848, + "balance_loss_mlp": 0.01258092, + "epoch": 0.9426724785810913, + "flos": 21185848402560.0, + "grad_norm": 1.4818609891623467, + "language_loss": 0.74543768, + "learning_rate": 3.434065057895097e-08, + "loss": 0.82209063, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09155273, + "step": 15679, + "time_per_iteration": 2.4969890117645264 + }, + { + "auxiliary_loss_clip": 0.06406982, + "auxiliary_loss_mlp": 0.01267063, + "balance_loss_clip": 0.062732, + "balance_loss_mlp": 0.01257223, + "epoch": 0.9427326018337592, + "flos": 14762171969280.0, + "grad_norm": 2.028620141533925, + "language_loss": 0.77248597, + "learning_rate": 3.426882627845762e-08, + "loss": 0.84922642, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09832764, + "step": 15680, + "time_per_iteration": 2.4729225635528564 + }, + { + "auxiliary_loss_clip": 0.06401733, + "auxiliary_loss_mlp": 0.01269172, + "balance_loss_clip": 0.06271403, + "balance_loss_mlp": 0.01259439, + "epoch": 0.9427927250864272, + "flos": 20930032287360.0, + "grad_norm": 1.7948180035587007, + "language_loss": 0.75664496, + "learning_rate": 3.419707651868742e-08, + "loss": 0.833354, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09735107, + "step": 15681, + "time_per_iteration": 2.5000479221343994 + }, + { + "auxiliary_loss_clip": 0.06404436, + "auxiliary_loss_mlp": 0.01266864, + "balance_loss_clip": 0.06271823, + "balance_loss_mlp": 0.0125725, + "epoch": 0.9428528483390951, + "flos": 19758119483520.0, + "grad_norm": 1.682204296334067, + "language_loss": 0.65451252, + "learning_rate": 3.412540130236086e-08, + "loss": 0.73122549, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09613037, + "step": 15682, + "time_per_iteration": 2.5290274620056152 + }, + { + "auxiliary_loss_clip": 0.06400858, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06269148, + "balance_loss_mlp": 0.01253365, + "epoch": 0.9429129715917631, + "flos": 24541869732480.0, + "grad_norm": 1.6078440758053596, + "language_loss": 0.76264083, + "learning_rate": 3.405380063219665e-08, + "loss": 0.83928025, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09716797, + "step": 15683, + "time_per_iteration": 2.5387845039367676 + }, + { + "auxiliary_loss_clip": 0.06404649, + "auxiliary_loss_mlp": 0.01266852, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01256398, + "epoch": 0.942973094844431, + "flos": 17964304076160.0, + "grad_norm": 2.5267719992452076, + "language_loss": 0.75809973, + "learning_rate": 3.398227451090885e-08, + "loss": 0.83481473, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10461426, + "step": 15684, + "time_per_iteration": 2.483170747756958 + }, + { + "auxiliary_loss_clip": 0.06397957, + "auxiliary_loss_mlp": 0.01264368, + "balance_loss_clip": 0.06269065, + "balance_loss_mlp": 0.01255523, + "epoch": 0.9430332180970991, + "flos": 26144382268800.0, + "grad_norm": 1.5399234901397196, + "language_loss": 0.77343988, + "learning_rate": 3.391082294121017e-08, + "loss": 0.85006315, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08843994, + "step": 15685, + "time_per_iteration": 2.5491085052490234 + }, + { + "auxiliary_loss_clip": 0.06397514, + "auxiliary_loss_mlp": 0.01261396, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01252807, + "epoch": 0.943093341349767, + "flos": 23958177390720.0, + "grad_norm": 1.7162540789171723, + "language_loss": 0.76184905, + "learning_rate": 3.383944592581023e-08, + "loss": 0.83843815, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0859375, + "step": 15686, + "time_per_iteration": 2.588693857192993 + }, + { + "auxiliary_loss_clip": 0.06403645, + "auxiliary_loss_mlp": 0.01264923, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01255572, + "epoch": 0.943153464602435, + "flos": 17974324638720.0, + "grad_norm": 1.6255235883785641, + "language_loss": 0.80987608, + "learning_rate": 3.376814346741575e-08, + "loss": 0.88656175, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09350586, + "step": 15687, + "time_per_iteration": 2.4934589862823486 + }, + { + "auxiliary_loss_clip": 0.06407475, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 0.06271624, + "balance_loss_mlp": 0.0125503, + "epoch": 0.943213587855103, + "flos": 14506733197440.0, + "grad_norm": 2.2198187889767516, + "language_loss": 0.7578727, + "learning_rate": 3.369691556873011e-08, + "loss": 0.83460218, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10443115, + "step": 15688, + "time_per_iteration": 2.497774600982666 + }, + { + "auxiliary_loss_clip": 0.0639424, + "auxiliary_loss_mlp": 0.01264677, + "balance_loss_clip": 0.06269427, + "balance_loss_mlp": 0.01255188, + "epoch": 0.9432737111077709, + "flos": 28994054175360.0, + "grad_norm": 1.6545855096259856, + "language_loss": 0.68633425, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.76292336, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.0947876, + "step": 15689, + "time_per_iteration": 2.6034674644470215 + }, + { + "auxiliary_loss_clip": 0.06400025, + "auxiliary_loss_mlp": 0.01265711, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01257444, + "epoch": 0.9433338343604389, + "flos": 21614267928960.0, + "grad_norm": 1.6339942455994367, + "language_loss": 0.80775511, + "learning_rate": 3.35546834612872e-08, + "loss": 0.88441241, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0826416, + "step": 15690, + "time_per_iteration": 2.523336410522461 + }, + { + "auxiliary_loss_clip": 0.06400111, + "auxiliary_loss_mlp": 0.01261797, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.01252052, + "epoch": 0.9433939576131068, + "flos": 33190632138240.0, + "grad_norm": 1.7354077420100367, + "language_loss": 0.60600984, + "learning_rate": 3.348367925792317e-08, + "loss": 0.68262887, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.09735107, + "step": 15691, + "time_per_iteration": 2.606536626815796 + }, + { + "auxiliary_loss_clip": 0.06404334, + "auxiliary_loss_mlp": 0.01266204, + "balance_loss_clip": 0.06272846, + "balance_loss_mlp": 0.01256769, + "epoch": 0.9434540808657749, + "flos": 20492808082560.0, + "grad_norm": 1.4492750689861678, + "language_loss": 0.6661129, + "learning_rate": 3.341274962505514e-08, + "loss": 0.74281824, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09436035, + "step": 15692, + "time_per_iteration": 2.498673439025879 + }, + { + "auxiliary_loss_clip": 0.06399876, + "auxiliary_loss_mlp": 0.01265516, + "balance_loss_clip": 0.06269374, + "balance_loss_mlp": 0.01255997, + "epoch": 0.9435142041184428, + "flos": 21549293487360.0, + "grad_norm": 2.3030634231510545, + "language_loss": 0.74972957, + "learning_rate": 3.334189456537251e-08, + "loss": 0.82638347, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09515381, + "step": 15693, + "time_per_iteration": 3.9923908710479736 + }, + { + "auxiliary_loss_clip": 0.06400185, + "auxiliary_loss_mlp": 0.01262209, + "balance_loss_clip": 0.06271058, + "balance_loss_mlp": 0.01252881, + "epoch": 0.9435743273711108, + "flos": 25016004460800.0, + "grad_norm": 1.5946007545759409, + "language_loss": 0.73723388, + "learning_rate": 3.327111408156291e-08, + "loss": 0.81385785, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09338379, + "step": 15694, + "time_per_iteration": 2.516932487487793 + }, + { + "auxiliary_loss_clip": 0.06313274, + "auxiliary_loss_mlp": 0.01251927, + "balance_loss_clip": 0.06259228, + "balance_loss_mlp": 0.01250888, + "epoch": 0.9436344506237787, + "flos": 60179916723840.0, + "grad_norm": 0.6942834206013441, + "language_loss": 0.50500864, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.5806607, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039886, + "step": 15695, + "time_per_iteration": 3.178891181945801 + }, + { + "auxiliary_loss_clip": 0.06396429, + "auxiliary_loss_mlp": 0.0126797, + "balance_loss_clip": 0.06271218, + "balance_loss_mlp": 0.01259691, + "epoch": 0.9436945738764467, + "flos": 22243885107840.0, + "grad_norm": 1.5773322030260613, + "language_loss": 0.65293247, + "learning_rate": 3.312977685229335e-08, + "loss": 0.72957647, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08276367, + "step": 15696, + "time_per_iteration": 2.501094341278076 + }, + { + "auxiliary_loss_clip": 0.06403381, + "auxiliary_loss_mlp": 0.01261862, + "balance_loss_clip": 0.0627207, + "balance_loss_mlp": 0.01252683, + "epoch": 0.9437546971291146, + "flos": 25052034516480.0, + "grad_norm": 1.6284029505922766, + "language_loss": 0.66615683, + "learning_rate": 3.305922011219353e-08, + "loss": 0.7428093, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09179688, + "step": 15697, + "time_per_iteration": 2.541961431503296 + }, + { + "auxiliary_loss_clip": 0.06310762, + "auxiliary_loss_mlp": 0.01253679, + "balance_loss_clip": 0.06256643, + "balance_loss_mlp": 0.01252642, + "epoch": 0.9438148203817827, + "flos": 56809556346240.0, + "grad_norm": 0.844263571757514, + "language_loss": 0.63148797, + "learning_rate": 3.298873795868506e-08, + "loss": 0.70713234, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01036835, + "step": 15698, + "time_per_iteration": 3.051950216293335 + }, + { + "auxiliary_loss_clip": 0.06405546, + "auxiliary_loss_mlp": 0.0126485, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01255081, + "epoch": 0.9438749436344506, + "flos": 22352981523840.0, + "grad_norm": 1.8322973887510348, + "language_loss": 0.69760531, + "learning_rate": 3.291833039444092e-08, + "loss": 0.77430928, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09759521, + "step": 15699, + "time_per_iteration": 2.504598379135132 + }, + { + "auxiliary_loss_clip": 0.06397957, + "auxiliary_loss_mlp": 0.01264465, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01255441, + "epoch": 0.9439350668871186, + "flos": 13375881694080.0, + "grad_norm": 2.165048866443223, + "language_loss": 0.74769372, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.82431793, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.090271, + "step": 15700, + "time_per_iteration": 2.4962573051452637 + }, + { + "auxiliary_loss_clip": 0.06398397, + "auxiliary_loss_mlp": 0.01263164, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254033, + "epoch": 0.9439951901397866, + "flos": 17791113686400.0, + "grad_norm": 1.531110206414724, + "language_loss": 0.7072165, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.78383207, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09136963, + "step": 15701, + "time_per_iteration": 3.9256973266601562 + }, + { + "auxiliary_loss_clip": 0.06410138, + "auxiliary_loss_mlp": 0.01263745, + "balance_loss_clip": 0.06272127, + "balance_loss_mlp": 0.01254095, + "epoch": 0.9440553133924545, + "flos": 18885473936640.0, + "grad_norm": 1.6976214240404868, + "language_loss": 0.78259611, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.85933489, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.09649658, + "step": 15702, + "time_per_iteration": 2.5262832641601562 + }, + { + "auxiliary_loss_clip": 0.06404, + "auxiliary_loss_mlp": 0.0126415, + "balance_loss_clip": 0.06271169, + "balance_loss_mlp": 0.01254548, + "epoch": 0.9441154366451225, + "flos": 19579017381120.0, + "grad_norm": 2.6087828966167326, + "language_loss": 0.66408789, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.74076939, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0960083, + "step": 15703, + "time_per_iteration": 2.4908831119537354 + }, + { + "auxiliary_loss_clip": 0.06404126, + "auxiliary_loss_mlp": 0.01264876, + "balance_loss_clip": 0.06271374, + "balance_loss_mlp": 0.01255685, + "epoch": 0.9441755598977905, + "flos": 30302037210240.0, + "grad_norm": 1.5526862694072474, + "language_loss": 0.73514414, + "learning_rate": 3.256741150552833e-08, + "loss": 0.81183422, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09191895, + "step": 15704, + "time_per_iteration": 2.578453302383423 + }, + { + "auxiliary_loss_clip": 0.06397037, + "auxiliary_loss_mlp": 0.01265014, + "balance_loss_clip": 0.06270902, + "balance_loss_mlp": 0.01255686, + "epoch": 0.9442356831504585, + "flos": 20674174245120.0, + "grad_norm": 1.9988169073450903, + "language_loss": 0.74711281, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.82373333, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.09332275, + "step": 15705, + "time_per_iteration": 2.50264835357666 + }, + { + "auxiliary_loss_clip": 0.06400542, + "auxiliary_loss_mlp": 0.0126436, + "balance_loss_clip": 0.06270608, + "balance_loss_mlp": 0.01255139, + "epoch": 0.9442958064031264, + "flos": 16112809532160.0, + "grad_norm": 1.6809193926837838, + "language_loss": 0.77485085, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.8514998, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09222412, + "step": 15706, + "time_per_iteration": 2.4911396503448486 + }, + { + "auxiliary_loss_clip": 0.06395966, + "auxiliary_loss_mlp": 0.01261484, + "balance_loss_clip": 0.06269921, + "balance_loss_mlp": 0.01252537, + "epoch": 0.9443559296557944, + "flos": 20453381936640.0, + "grad_norm": 2.6863035412051612, + "language_loss": 0.69485629, + "learning_rate": 3.23577554137866e-08, + "loss": 0.77143085, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08953857, + "step": 15707, + "time_per_iteration": 3.932788133621216 + }, + { + "auxiliary_loss_clip": 0.06392172, + "auxiliary_loss_mlp": 0.01261239, + "balance_loss_clip": 0.06267689, + "balance_loss_mlp": 0.0125284, + "epoch": 0.9444160529084623, + "flos": 21616406208000.0, + "grad_norm": 1.7233425168990235, + "language_loss": 0.69313765, + "learning_rate": 3.22880192727244e-08, + "loss": 0.76967174, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08398438, + "step": 15708, + "time_per_iteration": 2.488739490509033 + }, + { + "auxiliary_loss_clip": 0.06398219, + "auxiliary_loss_mlp": 0.01263278, + "balance_loss_clip": 0.06269833, + "balance_loss_mlp": 0.01254599, + "epoch": 0.9444761761611303, + "flos": 18447620826240.0, + "grad_norm": 2.4449285040700905, + "language_loss": 0.7077049, + "learning_rate": 3.221835774749748e-08, + "loss": 0.78431988, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08679199, + "step": 15709, + "time_per_iteration": 2.486844539642334 + }, + { + "auxiliary_loss_clip": 0.06396931, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06268953, + "balance_loss_mlp": 0.01255675, + "epoch": 0.9445362994137982, + "flos": 20963043596160.0, + "grad_norm": 1.9344210100070667, + "language_loss": 0.85356987, + "learning_rate": 3.214877084074774e-08, + "loss": 0.93018436, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08837891, + "step": 15710, + "time_per_iteration": 2.477931261062622 + }, + { + "auxiliary_loss_clip": 0.06406383, + "auxiliary_loss_mlp": 0.0126325, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253284, + "epoch": 0.9445964226664663, + "flos": 20309555203200.0, + "grad_norm": 1.6267551376340164, + "language_loss": 0.71685177, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.79354811, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09967041, + "step": 15711, + "time_per_iteration": 3.9364025592803955 + }, + { + "auxiliary_loss_clip": 0.06402559, + "auxiliary_loss_mlp": 0.01262817, + "balance_loss_clip": 0.06272049, + "balance_loss_mlp": 0.01254096, + "epoch": 0.9446565459191342, + "flos": 26403259057920.0, + "grad_norm": 2.3323613984996707, + "language_loss": 0.69751537, + "learning_rate": 3.200982089323179e-08, + "loss": 0.77416909, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.0871582, + "step": 15712, + "time_per_iteration": 2.556997060775757 + }, + { + "auxiliary_loss_clip": 0.0640899, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 0.06272276, + "balance_loss_mlp": 0.01255212, + "epoch": 0.9447166691718022, + "flos": 16550327226240.0, + "grad_norm": 2.2946300657355976, + "language_loss": 0.70720011, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.78394854, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10650635, + "step": 15713, + "time_per_iteration": 2.5120773315429688 + }, + { + "auxiliary_loss_clip": 0.0639579, + "auxiliary_loss_mlp": 0.01266227, + "balance_loss_clip": 0.06270416, + "balance_loss_mlp": 0.01256828, + "epoch": 0.9447767924244702, + "flos": 29171604977280.0, + "grad_norm": 1.4532838118975553, + "language_loss": 0.76606899, + "learning_rate": 3.187116945125212e-08, + "loss": 0.84268916, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.09399414, + "step": 15714, + "time_per_iteration": 2.5846641063690186 + }, + { + "auxiliary_loss_clip": 0.06404714, + "auxiliary_loss_mlp": 0.01265239, + "balance_loss_clip": 0.06270965, + "balance_loss_mlp": 0.01255577, + "epoch": 0.9448369156771381, + "flos": 19279875905280.0, + "grad_norm": 1.7877405259726427, + "language_loss": 0.68124247, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.75794196, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09661865, + "step": 15715, + "time_per_iteration": 2.517007350921631 + }, + { + "auxiliary_loss_clip": 0.06405981, + "auxiliary_loss_mlp": 0.01265021, + "balance_loss_clip": 0.06272849, + "balance_loss_mlp": 0.01254763, + "epoch": 0.9448970389298061, + "flos": 23847823163520.0, + "grad_norm": 1.7071461081986556, + "language_loss": 0.74850857, + "learning_rate": 3.173281653583948e-08, + "loss": 0.82521862, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.1026001, + "step": 15716, + "time_per_iteration": 2.5198490619659424 + }, + { + "auxiliary_loss_clip": 0.06407739, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06275283, + "balance_loss_mlp": 0.01255078, + "epoch": 0.944957162182474, + "flos": 22388760017280.0, + "grad_norm": 1.6811142354543167, + "language_loss": 0.62509549, + "learning_rate": 3.166375203215565e-08, + "loss": 0.70181704, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09338379, + "step": 15717, + "time_per_iteration": 2.5217764377593994 + }, + { + "auxiliary_loss_clip": 0.06400305, + "auxiliary_loss_mlp": 0.0126476, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01255444, + "epoch": 0.9450172854351421, + "flos": 17389584120960.0, + "grad_norm": 1.5696006706759635, + "language_loss": 0.7965737, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.87322432, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09313965, + "step": 15718, + "time_per_iteration": 2.4564990997314453 + }, + { + "auxiliary_loss_clip": 0.06306401, + "auxiliary_loss_mlp": 0.01249456, + "balance_loss_clip": 0.06252193, + "balance_loss_mlp": 0.01248478, + "epoch": 0.94507740868781, + "flos": 68487092760960.0, + "grad_norm": 0.6920512223947758, + "language_loss": 0.57755935, + "learning_rate": 3.152584694592719e-08, + "loss": 0.65311795, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00977325, + "step": 15719, + "time_per_iteration": 3.150592565536499 + }, + { + "auxiliary_loss_clip": 0.06405877, + "auxiliary_loss_mlp": 0.01267549, + "balance_loss_clip": 0.06272814, + "balance_loss_mlp": 0.0125797, + "epoch": 0.945137531940478, + "flos": 21148895952000.0, + "grad_norm": 1.5595416281624737, + "language_loss": 0.75960934, + "learning_rate": 3.145700636861193e-08, + "loss": 0.83634359, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09570312, + "step": 15720, + "time_per_iteration": 2.521066427230835 + }, + { + "auxiliary_loss_clip": 0.06395644, + "auxiliary_loss_mlp": 0.01265072, + "balance_loss_clip": 0.0626734, + "balance_loss_mlp": 0.01256763, + "epoch": 0.9451976551931459, + "flos": 24540611921280.0, + "grad_norm": 1.9699653920542373, + "language_loss": 0.73071945, + "learning_rate": 3.138824043864452e-08, + "loss": 0.80732661, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08300781, + "step": 15721, + "time_per_iteration": 2.525794267654419 + }, + { + "auxiliary_loss_clip": 0.06402142, + "auxiliary_loss_mlp": 0.01262673, + "balance_loss_clip": 0.06270024, + "balance_loss_mlp": 0.01253369, + "epoch": 0.9452577784458139, + "flos": 23447299847040.0, + "grad_norm": 1.718614090375189, + "language_loss": 0.85034347, + "learning_rate": 3.131954915863244e-08, + "loss": 0.92699158, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09301758, + "step": 15722, + "time_per_iteration": 2.536926746368408 + }, + { + "auxiliary_loss_clip": 0.06309976, + "auxiliary_loss_mlp": 0.01254115, + "balance_loss_clip": 0.06255897, + "balance_loss_mlp": 0.01253094, + "epoch": 0.9453179016984818, + "flos": 52036749054720.0, + "grad_norm": 0.884744124121599, + "language_loss": 0.64469177, + "learning_rate": 3.125093253118005e-08, + "loss": 0.72033274, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01020813, + "step": 15723, + "time_per_iteration": 3.1003150939941406 + }, + { + "auxiliary_loss_clip": 0.06405857, + "auxiliary_loss_mlp": 0.01265921, + "balance_loss_clip": 0.0627241, + "balance_loss_mlp": 0.01255646, + "epoch": 0.9453780249511499, + "flos": 13476886191360.0, + "grad_norm": 1.970769174235418, + "language_loss": 0.7331022, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.80982006, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10266113, + "step": 15724, + "time_per_iteration": 2.4845023155212402 + }, + { + "auxiliary_loss_clip": 0.06401257, + "auxiliary_loss_mlp": 0.01266566, + "balance_loss_clip": 0.0627144, + "balance_loss_mlp": 0.01257625, + "epoch": 0.9454381482038178, + "flos": 23265262851840.0, + "grad_norm": 1.854039175790055, + "language_loss": 0.84987056, + "learning_rate": 3.111392324436024e-08, + "loss": 0.92654884, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08947754, + "step": 15725, + "time_per_iteration": 2.5003042221069336 + }, + { + "auxiliary_loss_clip": 0.06403221, + "auxiliary_loss_mlp": 0.01262907, + "balance_loss_clip": 0.06270561, + "balance_loss_mlp": 0.01253627, + "epoch": 0.9454982714564858, + "flos": 19502093733120.0, + "grad_norm": 1.8779217955872736, + "language_loss": 0.71166205, + "learning_rate": 3.104553059018822e-08, + "loss": 0.78832328, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09277344, + "step": 15726, + "time_per_iteration": 2.5910589694976807 + }, + { + "auxiliary_loss_clip": 0.06402659, + "auxiliary_loss_mlp": 0.0126494, + "balance_loss_clip": 0.062715, + "balance_loss_mlp": 0.0125532, + "epoch": 0.9455583947091538, + "flos": 23264801654400.0, + "grad_norm": 1.8879911426467153, + "language_loss": 0.61094165, + "learning_rate": 3.097721259896735e-08, + "loss": 0.68761766, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09619141, + "step": 15727, + "time_per_iteration": 2.636110782623291 + }, + { + "auxiliary_loss_clip": 0.06398436, + "auxiliary_loss_mlp": 0.01268185, + "balance_loss_clip": 0.06271493, + "balance_loss_mlp": 0.01259614, + "epoch": 0.9456185179618217, + "flos": 17678327690880.0, + "grad_norm": 1.7197111625111396, + "language_loss": 0.82013702, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.8968032, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08569336, + "step": 15728, + "time_per_iteration": 2.5550687313079834 + }, + { + "auxiliary_loss_clip": 0.06308329, + "auxiliary_loss_mlp": 0.01249812, + "balance_loss_clip": 0.0625433, + "balance_loss_mlp": 0.01248773, + "epoch": 0.9456786412144897, + "flos": 61433002535040.0, + "grad_norm": 0.7391636345974608, + "language_loss": 0.58712065, + "learning_rate": 3.08408006157368e-08, + "loss": 0.66270202, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039124, + "step": 15729, + "time_per_iteration": 3.104180335998535 + }, + { + "auxiliary_loss_clip": 0.06399846, + "auxiliary_loss_mlp": 0.01264543, + "balance_loss_clip": 0.06271389, + "balance_loss_mlp": 0.01255465, + "epoch": 0.9457387644671577, + "flos": 18594340525440.0, + "grad_norm": 2.1443897362387814, + "language_loss": 0.77353084, + "learning_rate": 3.077270662890052e-08, + "loss": 0.85017467, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09082031, + "step": 15730, + "time_per_iteration": 2.5131759643554688 + }, + { + "auxiliary_loss_clip": 0.06399836, + "auxiliary_loss_mlp": 0.01267427, + "balance_loss_clip": 0.06267837, + "balance_loss_mlp": 0.01257688, + "epoch": 0.9457988877198257, + "flos": 21115381518720.0, + "grad_norm": 1.6416517192605633, + "language_loss": 0.63005936, + "learning_rate": 3.070468731536047e-08, + "loss": 0.70673198, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09747314, + "step": 15731, + "time_per_iteration": 2.530729293823242 + }, + { + "auxiliary_loss_clip": 0.06402969, + "auxiliary_loss_mlp": 0.01262855, + "balance_loss_clip": 0.06271915, + "balance_loss_mlp": 0.01252955, + "epoch": 0.9458590109724936, + "flos": 26695734134400.0, + "grad_norm": 1.9292294773012948, + "language_loss": 0.6470663, + "learning_rate": 3.063674267769589e-08, + "loss": 0.7237246, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09899902, + "step": 15732, + "time_per_iteration": 3.9439215660095215 + }, + { + "auxiliary_loss_clip": 0.06409542, + "auxiliary_loss_mlp": 0.01262122, + "balance_loss_clip": 0.06273539, + "balance_loss_mlp": 0.01252383, + "epoch": 0.9459191342251616, + "flos": 18667616520960.0, + "grad_norm": 1.677687050760564, + "language_loss": 0.84323162, + "learning_rate": 3.056887271848363e-08, + "loss": 0.91994834, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09741211, + "step": 15733, + "time_per_iteration": 2.488312005996704 + }, + { + "auxiliary_loss_clip": 0.06393486, + "auxiliary_loss_mlp": 0.01264252, + "balance_loss_clip": 0.06267101, + "balance_loss_mlp": 0.01255633, + "epoch": 0.9459792574778295, + "flos": 23404226048640.0, + "grad_norm": 1.452807558700151, + "language_loss": 0.72373539, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.80031276, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08612061, + "step": 15734, + "time_per_iteration": 2.6829605102539062 + }, + { + "auxiliary_loss_clip": 0.06394021, + "auxiliary_loss_mlp": 0.01264276, + "balance_loss_clip": 0.06269066, + "balance_loss_mlp": 0.01256527, + "epoch": 0.9460393807304975, + "flos": 24400474767360.0, + "grad_norm": 1.566131852204227, + "language_loss": 0.86707246, + "learning_rate": 3.043335684570692e-08, + "loss": 0.94365543, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07739258, + "step": 15735, + "time_per_iteration": 2.549342632293701 + }, + { + "auxiliary_loss_clip": 0.06399663, + "auxiliary_loss_mlp": 0.01263854, + "balance_loss_clip": 0.06269069, + "balance_loss_mlp": 0.01254887, + "epoch": 0.9460995039831654, + "flos": 21944995194240.0, + "grad_norm": 1.6903865141289935, + "language_loss": 0.67260051, + "learning_rate": 3.036571093728102e-08, + "loss": 0.74923569, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.08969116, + "step": 15736, + "time_per_iteration": 2.4905238151550293 + }, + { + "auxiliary_loss_clip": 0.06303936, + "auxiliary_loss_mlp": 0.01249824, + "balance_loss_clip": 0.06249891, + "balance_loss_mlp": 0.01248861, + "epoch": 0.9461596272358335, + "flos": 70342738081920.0, + "grad_norm": 0.8456385965936714, + "language_loss": 0.65439987, + "learning_rate": 3.029813971758499e-08, + "loss": 0.72993743, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00961304, + "step": 15737, + "time_per_iteration": 3.1456351280212402 + }, + { + "auxiliary_loss_clip": 0.06310228, + "auxiliary_loss_mlp": 0.01250707, + "balance_loss_clip": 0.06256226, + "balance_loss_mlp": 0.01249746, + "epoch": 0.9462197504885014, + "flos": 58612427994240.0, + "grad_norm": 0.7768588148943026, + "language_loss": 0.58685583, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.66246521, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00959778, + "step": 15738, + "time_per_iteration": 3.1362509727478027 + }, + { + "auxiliary_loss_clip": 0.06394856, + "auxiliary_loss_mlp": 0.01264418, + "balance_loss_clip": 0.06267979, + "balance_loss_mlp": 0.01256121, + "epoch": 0.9462798737411694, + "flos": 23439333709440.0, + "grad_norm": 1.8516554697337375, + "language_loss": 0.71715391, + "learning_rate": 3.016322135462834e-08, + "loss": 0.79374659, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08294678, + "step": 15739, + "time_per_iteration": 2.5040197372436523 + }, + { + "auxiliary_loss_clip": 0.06402469, + "auxiliary_loss_mlp": 0.01265002, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01255227, + "epoch": 0.9463399969938374, + "flos": 25053082692480.0, + "grad_norm": 2.1300906946077953, + "language_loss": 0.6520685, + "learning_rate": 3.009587421648363e-08, + "loss": 0.7287432, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09765625, + "step": 15740, + "time_per_iteration": 3.9453022480010986 + }, + { + "auxiliary_loss_clip": 0.06396136, + "auxiliary_loss_mlp": 0.01269325, + "balance_loss_clip": 0.06268455, + "balance_loss_mlp": 0.01260164, + "epoch": 0.9464001202465053, + "flos": 24359455393920.0, + "grad_norm": 1.573667052728098, + "language_loss": 0.66363811, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.74029279, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09155273, + "step": 15741, + "time_per_iteration": 2.5351650714874268 + }, + { + "auxiliary_loss_clip": 0.06402055, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.06270935, + "balance_loss_mlp": 0.01256018, + "epoch": 0.9464602434991733, + "flos": 17171181653760.0, + "grad_norm": 1.8202599223323925, + "language_loss": 0.76282263, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.83949423, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09094238, + "step": 15742, + "time_per_iteration": 2.4993362426757812 + }, + { + "auxiliary_loss_clip": 0.06400387, + "auxiliary_loss_mlp": 0.01264176, + "balance_loss_clip": 0.06272254, + "balance_loss_mlp": 0.01255152, + "epoch": 0.9465203667518413, + "flos": 19944265328640.0, + "grad_norm": 1.8835810090915717, + "language_loss": 0.72201908, + "learning_rate": 2.989428100602187e-08, + "loss": 0.79866475, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.090271, + "step": 15743, + "time_per_iteration": 2.5028302669525146 + }, + { + "auxiliary_loss_clip": 0.06402981, + "auxiliary_loss_mlp": 0.01265574, + "balance_loss_clip": 0.06269473, + "balance_loss_mlp": 0.01255585, + "epoch": 0.9465804900045093, + "flos": 20126470032000.0, + "grad_norm": 1.615168658581885, + "language_loss": 0.80039352, + "learning_rate": 2.982723267901943e-08, + "loss": 0.87707901, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09991455, + "step": 15744, + "time_per_iteration": 2.5396833419799805 + }, + { + "auxiliary_loss_clip": 0.06402554, + "auxiliary_loss_mlp": 0.01267498, + "balance_loss_clip": 0.06269826, + "balance_loss_mlp": 0.01257502, + "epoch": 0.9466406132571772, + "flos": 23917870776960.0, + "grad_norm": 1.6501908259993738, + "language_loss": 0.78493166, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.86163217, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09991455, + "step": 15745, + "time_per_iteration": 2.5577425956726074 + }, + { + "auxiliary_loss_clip": 0.06403811, + "auxiliary_loss_mlp": 0.01268431, + "balance_loss_clip": 0.06269467, + "balance_loss_mlp": 0.01258113, + "epoch": 0.9467007365098452, + "flos": 19938563251200.0, + "grad_norm": 1.513557471901544, + "language_loss": 0.70127267, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.77799511, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.10308838, + "step": 15746, + "time_per_iteration": 2.5116147994995117 + }, + { + "auxiliary_loss_clip": 0.06400457, + "auxiliary_loss_mlp": 0.01264802, + "balance_loss_clip": 0.06270906, + "balance_loss_mlp": 0.01255092, + "epoch": 0.9467608597625131, + "flos": 19315318982400.0, + "grad_norm": 2.038079128612824, + "language_loss": 0.56620514, + "learning_rate": 2.962653596305964e-08, + "loss": 0.64285767, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09710693, + "step": 15747, + "time_per_iteration": 4.008328914642334 + }, + { + "auxiliary_loss_clip": 0.06305839, + "auxiliary_loss_mlp": 0.01248436, + "balance_loss_clip": 0.06251822, + "balance_loss_mlp": 0.01247403, + "epoch": 0.9468209830151811, + "flos": 69650578229760.0, + "grad_norm": 0.6388680889443452, + "language_loss": 0.53260732, + "learning_rate": 2.955978648787871e-08, + "loss": 0.60815012, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01033783, + "step": 15748, + "time_per_iteration": 3.302865743637085 + }, + { + "auxiliary_loss_clip": 0.06403889, + "auxiliary_loss_mlp": 0.01263785, + "balance_loss_clip": 0.06272125, + "balance_loss_mlp": 0.0125432, + "epoch": 0.946881106267849, + "flos": 27024029631360.0, + "grad_norm": 1.6131180095460511, + "language_loss": 0.66900456, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.74568129, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09460449, + "step": 15749, + "time_per_iteration": 2.5552892684936523 + }, + { + "auxiliary_loss_clip": 0.06402941, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06269799, + "balance_loss_mlp": 0.01253371, + "epoch": 0.9469412295205171, + "flos": 20195721031680.0, + "grad_norm": 1.9171819700733619, + "language_loss": 0.76360601, + "learning_rate": 2.942651169791621e-08, + "loss": 0.84027529, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.10620117, + "step": 15750, + "time_per_iteration": 3.9931576251983643 + }, + { + "auxiliary_loss_clip": 0.06399237, + "auxiliary_loss_mlp": 0.01263463, + "balance_loss_clip": 0.06271112, + "balance_loss_mlp": 0.01254403, + "epoch": 0.947001352773185, + "flos": 21331352217600.0, + "grad_norm": 6.300306404866139, + "language_loss": 0.6824044, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.75903136, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09057617, + "step": 15751, + "time_per_iteration": 2.5015761852264404 + }, + { + "auxiliary_loss_clip": 0.06403518, + "auxiliary_loss_mlp": 0.01264898, + "balance_loss_clip": 0.062707, + "balance_loss_mlp": 0.01255296, + "epoch": 0.947061476025853, + "flos": 21950403782400.0, + "grad_norm": 1.5258403559147693, + "language_loss": 0.65762782, + "learning_rate": 2.929353580532723e-08, + "loss": 0.734312, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.0960083, + "step": 15752, + "time_per_iteration": 2.5320088863372803 + }, + { + "auxiliary_loss_clip": 0.0640187, + "auxiliary_loss_mlp": 0.01265282, + "balance_loss_clip": 0.06272066, + "balance_loss_mlp": 0.01256121, + "epoch": 0.947121599278521, + "flos": 21400645144320.0, + "grad_norm": 1.5250116712794441, + "language_loss": 0.71658498, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.79325652, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09161377, + "step": 15753, + "time_per_iteration": 2.5358986854553223 + }, + { + "auxiliary_loss_clip": 0.06404962, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 0.06269598, + "balance_loss_mlp": 0.0125484, + "epoch": 0.9471817225311889, + "flos": 23082387315840.0, + "grad_norm": 2.318871000803308, + "language_loss": 0.70373905, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.78043866, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10168457, + "step": 15754, + "time_per_iteration": 2.5675079822540283 + }, + { + "auxiliary_loss_clip": 0.06402844, + "auxiliary_loss_mlp": 0.01262674, + "balance_loss_clip": 0.06269033, + "balance_loss_mlp": 0.01253113, + "epoch": 0.947241845783857, + "flos": 11915476882560.0, + "grad_norm": 2.1288030858444107, + "language_loss": 0.79356575, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.8702209, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09552002, + "step": 15755, + "time_per_iteration": 2.4721009731292725 + }, + { + "auxiliary_loss_clip": 0.06409688, + "auxiliary_loss_mlp": 0.01266846, + "balance_loss_clip": 0.06272167, + "balance_loss_mlp": 0.01255771, + "epoch": 0.9473019690365249, + "flos": 20746947116160.0, + "grad_norm": 2.0378371913661333, + "language_loss": 0.75405908, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.83082443, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.11071777, + "step": 15756, + "time_per_iteration": 2.5512049198150635 + }, + { + "auxiliary_loss_clip": 0.06406745, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 0.06272939, + "balance_loss_mlp": 0.01253458, + "epoch": 0.9473620922891929, + "flos": 17645735652480.0, + "grad_norm": 2.0156510018018317, + "language_loss": 0.74623597, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.82293516, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09710693, + "step": 15757, + "time_per_iteration": 2.500520706176758 + }, + { + "auxiliary_loss_clip": 0.06404679, + "auxiliary_loss_mlp": 0.01263311, + "balance_loss_clip": 0.06270847, + "balance_loss_mlp": 0.01253744, + "epoch": 0.9474222155418608, + "flos": 23556731679360.0, + "grad_norm": 2.0076349731074843, + "language_loss": 0.79710162, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8737815, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09558105, + "step": 15758, + "time_per_iteration": 2.536018133163452 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.0126548, + "balance_loss_clip": 0.06272912, + "balance_loss_mlp": 0.01256027, + "epoch": 0.9474823387945288, + "flos": 27097179845760.0, + "grad_norm": 1.7167664956687578, + "language_loss": 0.7194469, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.79611474, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09454346, + "step": 15759, + "time_per_iteration": 2.537297487258911 + }, + { + "auxiliary_loss_clip": 0.06395267, + "auxiliary_loss_mlp": 0.01263013, + "balance_loss_clip": 0.06270848, + "balance_loss_mlp": 0.01255175, + "epoch": 0.9475424620471967, + "flos": 22973207045760.0, + "grad_norm": 1.518890611164647, + "language_loss": 0.75593793, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.83252072, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.07830811, + "step": 15760, + "time_per_iteration": 2.506772518157959 + }, + { + "auxiliary_loss_clip": 0.0640036, + "auxiliary_loss_mlp": 0.01262958, + "balance_loss_clip": 0.06270038, + "balance_loss_mlp": 0.01254065, + "epoch": 0.9476025852998647, + "flos": 20053864869120.0, + "grad_norm": 1.6429269418431312, + "language_loss": 0.72826153, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.80489469, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.08880615, + "step": 15761, + "time_per_iteration": 2.461029291152954 + }, + { + "auxiliary_loss_clip": 0.06400488, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.0627092, + "balance_loss_mlp": 0.01254511, + "epoch": 0.9476627085525327, + "flos": 14980700217600.0, + "grad_norm": 2.0388938661384066, + "language_loss": 0.72076392, + "learning_rate": 2.863314050734722e-08, + "loss": 0.7974003, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08642578, + "step": 15762, + "time_per_iteration": 2.4437167644500732 + }, + { + "auxiliary_loss_clip": 0.06409766, + "auxiliary_loss_mlp": 0.01263153, + "balance_loss_clip": 0.0627232, + "balance_loss_mlp": 0.01253235, + "epoch": 0.9477228318052007, + "flos": 18703772357760.0, + "grad_norm": 1.9307223538038316, + "language_loss": 0.67410612, + "learning_rate": 2.856751208570518e-08, + "loss": 0.7508353, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.09912109, + "step": 15763, + "time_per_iteration": 2.4636471271514893 + }, + { + "auxiliary_loss_clip": 0.06403434, + "auxiliary_loss_mlp": 0.01263146, + "balance_loss_clip": 0.06269363, + "balance_loss_mlp": 0.01254378, + "epoch": 0.9477829550578686, + "flos": 23881295669760.0, + "grad_norm": 1.6268798558288402, + "language_loss": 0.70511979, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.78178561, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.08764648, + "step": 15764, + "time_per_iteration": 2.507150888442993 + }, + { + "auxiliary_loss_clip": 0.06395758, + "auxiliary_loss_mlp": 0.0126393, + "balance_loss_clip": 0.06272718, + "balance_loss_mlp": 0.01255937, + "epoch": 0.9478430783105366, + "flos": 22569119930880.0, + "grad_norm": 1.638940250411441, + "language_loss": 0.71428376, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.79088062, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.07983398, + "step": 15765, + "time_per_iteration": 2.507747173309326 + }, + { + "auxiliary_loss_clip": 0.06314638, + "auxiliary_loss_mlp": 0.01249169, + "balance_loss_clip": 0.06260315, + "balance_loss_mlp": 0.01248279, + "epoch": 0.9479032015632046, + "flos": 60874103802240.0, + "grad_norm": 0.7940134593806808, + "language_loss": 0.58885753, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.66449559, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00889587, + "step": 15766, + "time_per_iteration": 2.895747184753418 + }, + { + "auxiliary_loss_clip": 0.0640133, + "auxiliary_loss_mlp": 0.01264695, + "balance_loss_clip": 0.06271998, + "balance_loss_mlp": 0.01255641, + "epoch": 0.9479633248158725, + "flos": 14689105608960.0, + "grad_norm": 2.0710351232242337, + "language_loss": 0.74133766, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.81799787, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09057617, + "step": 15767, + "time_per_iteration": 2.4537556171417236 + }, + { + "auxiliary_loss_clip": 0.06410235, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06274644, + "balance_loss_mlp": 0.01254445, + "epoch": 0.9480234480685406, + "flos": 20339170421760.0, + "grad_norm": 2.3853256310763684, + "language_loss": 0.73483276, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.81157696, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09735107, + "step": 15768, + "time_per_iteration": 2.5297107696533203 + }, + { + "auxiliary_loss_clip": 0.06308576, + "auxiliary_loss_mlp": 0.01250161, + "balance_loss_clip": 0.06254381, + "balance_loss_mlp": 0.01249126, + "epoch": 0.9480835713212085, + "flos": 70314548382720.0, + "grad_norm": 0.7196411504801323, + "language_loss": 0.55233341, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.62792081, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01035309, + "step": 15769, + "time_per_iteration": 3.135312557220459 + }, + { + "auxiliary_loss_clip": 0.06404492, + "auxiliary_loss_mlp": 0.01262015, + "balance_loss_clip": 0.06270418, + "balance_loss_mlp": 0.01253131, + "epoch": 0.9481436945738765, + "flos": 25457211734400.0, + "grad_norm": 1.291960686791139, + "language_loss": 0.77551377, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.85217881, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.08895874, + "step": 15770, + "time_per_iteration": 2.549916982650757 + }, + { + "auxiliary_loss_clip": 0.06405759, + "auxiliary_loss_mlp": 0.01265581, + "balance_loss_clip": 0.06275308, + "balance_loss_mlp": 0.01255514, + "epoch": 0.9482038178265444, + "flos": 26987244888960.0, + "grad_norm": 1.726653277690328, + "language_loss": 0.80475664, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.88147008, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.10058594, + "step": 15771, + "time_per_iteration": 2.5613114833831787 + }, + { + "auxiliary_loss_clip": 0.06398853, + "auxiliary_loss_mlp": 0.01263649, + "balance_loss_clip": 0.06269822, + "balance_loss_mlp": 0.01254696, + "epoch": 0.9482639410792124, + "flos": 17791239467520.0, + "grad_norm": 1.9987146967466614, + "language_loss": 0.70112687, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.77775192, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08959961, + "step": 15772, + "time_per_iteration": 3.8485605716705322 + }, + { + "auxiliary_loss_clip": 0.06399487, + "auxiliary_loss_mlp": 0.01261828, + "balance_loss_clip": 0.06269841, + "balance_loss_mlp": 0.01252583, + "epoch": 0.9483240643318803, + "flos": 21003098647680.0, + "grad_norm": 1.447529838975947, + "language_loss": 0.74107957, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.8176927, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09240723, + "step": 15773, + "time_per_iteration": 2.500173807144165 + }, + { + "auxiliary_loss_clip": 0.06403539, + "auxiliary_loss_mlp": 0.01263968, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.01253555, + "epoch": 0.9483841875845483, + "flos": 20089349873280.0, + "grad_norm": 2.1086250224803806, + "language_loss": 0.63228577, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.70896089, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10412598, + "step": 15774, + "time_per_iteration": 2.585265636444092 + }, + { + "auxiliary_loss_clip": 0.0640205, + "auxiliary_loss_mlp": 0.01263784, + "balance_loss_clip": 0.06270356, + "balance_loss_mlp": 0.01254056, + "epoch": 0.9484443108372163, + "flos": 20819929622400.0, + "grad_norm": 1.7035210571527313, + "language_loss": 0.59463555, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.67129385, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09735107, + "step": 15775, + "time_per_iteration": 2.563870906829834 + }, + { + "auxiliary_loss_clip": 0.06404445, + "auxiliary_loss_mlp": 0.01266804, + "balance_loss_clip": 0.06271166, + "balance_loss_mlp": 0.01257125, + "epoch": 0.9485044340898843, + "flos": 36438018249600.0, + "grad_norm": 1.4654814011520536, + "language_loss": 0.61937261, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6960851, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09674072, + "step": 15776, + "time_per_iteration": 2.62610125541687 + }, + { + "auxiliary_loss_clip": 0.06404588, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06271777, + "balance_loss_mlp": 0.01255132, + "epoch": 0.9485645573425522, + "flos": 22609300763520.0, + "grad_norm": 1.9419474034086324, + "language_loss": 0.73911107, + "learning_rate": 2.765656478622458e-08, + "loss": 0.81580293, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09466553, + "step": 15777, + "time_per_iteration": 2.5099053382873535 + }, + { + "auxiliary_loss_clip": 0.06414537, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06272923, + "balance_loss_mlp": 0.01255365, + "epoch": 0.9486246805952202, + "flos": 22024266756480.0, + "grad_norm": 2.4570684024376885, + "language_loss": 0.71977055, + "learning_rate": 2.759205797806441e-08, + "loss": 0.79657233, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.10272217, + "step": 15778, + "time_per_iteration": 2.4870026111602783 + }, + { + "auxiliary_loss_clip": 0.06396791, + "auxiliary_loss_mlp": 0.01265306, + "balance_loss_clip": 0.06273665, + "balance_loss_mlp": 0.01257277, + "epoch": 0.9486848038478882, + "flos": 16514297170560.0, + "grad_norm": 1.9713243247520542, + "language_loss": 0.69818199, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.77480304, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.08032227, + "step": 15779, + "time_per_iteration": 2.5228939056396484 + }, + { + "auxiliary_loss_clip": 0.06406988, + "auxiliary_loss_mlp": 0.01263384, + "balance_loss_clip": 0.06274127, + "balance_loss_mlp": 0.01253204, + "epoch": 0.9487449271005561, + "flos": 19250344540800.0, + "grad_norm": 2.158437031271148, + "language_loss": 0.7843678, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.86107153, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10174561, + "step": 15780, + "time_per_iteration": 4.039035081863403 + }, + { + "auxiliary_loss_clip": 0.06399371, + "auxiliary_loss_mlp": 0.01268072, + "balance_loss_clip": 0.06269532, + "balance_loss_mlp": 0.01258738, + "epoch": 0.9488050503532242, + "flos": 21769205328000.0, + "grad_norm": 1.652828919215293, + "language_loss": 0.66618556, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.74285996, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09326172, + "step": 15781, + "time_per_iteration": 2.5114023685455322 + }, + { + "auxiliary_loss_clip": 0.06399278, + "auxiliary_loss_mlp": 0.01269723, + "balance_loss_clip": 0.062714, + "balance_loss_mlp": 0.01260156, + "epoch": 0.9488651736058921, + "flos": 18374764101120.0, + "grad_norm": 1.8456931190486248, + "language_loss": 0.80244529, + "learning_rate": 2.733477870890999e-08, + "loss": 0.87913531, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09564209, + "step": 15782, + "time_per_iteration": 2.523489236831665 + }, + { + "auxiliary_loss_clip": 0.0630802, + "auxiliary_loss_mlp": 0.01249376, + "balance_loss_clip": 0.06253742, + "balance_loss_mlp": 0.01248354, + "epoch": 0.9489252968585601, + "flos": 70107130800000.0, + "grad_norm": 0.7092659629806969, + "language_loss": 0.59900188, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.67457592, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01021576, + "step": 15783, + "time_per_iteration": 3.2024121284484863 + }, + { + "auxiliary_loss_clip": 0.06402528, + "auxiliary_loss_mlp": 0.01266593, + "balance_loss_clip": 0.06270333, + "balance_loss_mlp": 0.01256651, + "epoch": 0.948985420111228, + "flos": 27862909182720.0, + "grad_norm": 1.627858945896465, + "language_loss": 0.74303591, + "learning_rate": 2.720658788656105e-08, + "loss": 0.81972712, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09936523, + "step": 15784, + "time_per_iteration": 2.545043468475342 + }, + { + "auxiliary_loss_clip": 0.06405601, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.01258413, + "epoch": 0.949045543363896, + "flos": 24322880286720.0, + "grad_norm": 1.7686500585497513, + "language_loss": 0.69748747, + "learning_rate": 2.714260468695806e-08, + "loss": 0.77422357, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.0958252, + "step": 15785, + "time_per_iteration": 2.505894184112549 + }, + { + "auxiliary_loss_clip": 0.06406058, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06270994, + "balance_loss_mlp": 0.0125712, + "epoch": 0.9491056666165639, + "flos": 24248262625920.0, + "grad_norm": 1.5913923023691325, + "language_loss": 0.7625891, + "learning_rate": 2.707869629830495e-08, + "loss": 0.83931398, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09313965, + "step": 15786, + "time_per_iteration": 3.9345221519470215 + }, + { + "auxiliary_loss_clip": 0.06399442, + "auxiliary_loss_mlp": 0.01264758, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01256109, + "epoch": 0.949165789869232, + "flos": 24537509320320.0, + "grad_norm": 2.558063223282522, + "language_loss": 0.79310948, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.86975145, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08642578, + "step": 15787, + "time_per_iteration": 2.5140228271484375 + }, + { + "auxiliary_loss_clip": 0.06398906, + "auxiliary_loss_mlp": 0.01263863, + "balance_loss_clip": 0.06272651, + "balance_loss_mlp": 0.01255003, + "epoch": 0.9492259131218999, + "flos": 22241662974720.0, + "grad_norm": 1.482913828210554, + "language_loss": 0.76110846, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.83773613, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.08859253, + "step": 15788, + "time_per_iteration": 2.49965763092041 + }, + { + "auxiliary_loss_clip": 0.06405517, + "auxiliary_loss_mlp": 0.01266294, + "balance_loss_clip": 0.06271803, + "balance_loss_mlp": 0.0125696, + "epoch": 0.9492860363745679, + "flos": 22972955483520.0, + "grad_norm": 1.5889024657895832, + "language_loss": 0.72189152, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.79860961, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09332275, + "step": 15789, + "time_per_iteration": 2.556658983230591 + }, + { + "auxiliary_loss_clip": 0.06401318, + "auxiliary_loss_mlp": 0.01266331, + "balance_loss_clip": 0.06272426, + "balance_loss_mlp": 0.0125649, + "epoch": 0.9493461596272358, + "flos": 18376357328640.0, + "grad_norm": 1.820508624210969, + "language_loss": 0.73197401, + "learning_rate": 2.682381090161989e-08, + "loss": 0.80865049, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09851074, + "step": 15790, + "time_per_iteration": 3.926544189453125 + }, + { + "auxiliary_loss_clip": 0.06403148, + "auxiliary_loss_mlp": 0.01263876, + "balance_loss_clip": 0.06268154, + "balance_loss_mlp": 0.01254185, + "epoch": 0.9494062828799038, + "flos": 20018002521600.0, + "grad_norm": 1.8246160541331784, + "language_loss": 0.77819729, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.85486752, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09698486, + "step": 15791, + "time_per_iteration": 2.4806320667266846 + }, + { + "auxiliary_loss_clip": 0.0640974, + "auxiliary_loss_mlp": 0.01265101, + "balance_loss_clip": 0.0627186, + "balance_loss_mlp": 0.01254939, + "epoch": 0.9494664061325718, + "flos": 27234843304320.0, + "grad_norm": 1.8993527124962928, + "language_loss": 0.74267161, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.81942004, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.10162354, + "step": 15792, + "time_per_iteration": 2.5601704120635986 + }, + { + "auxiliary_loss_clip": 0.06402216, + "auxiliary_loss_mlp": 0.01262243, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9495265293852397, + "flos": 18375812277120.0, + "grad_norm": 1.7327549003896519, + "language_loss": 0.78444892, + "learning_rate": 2.663343248754679e-08, + "loss": 0.86109352, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09405518, + "step": 15793, + "time_per_iteration": 2.4936344623565674 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01263265, + "balance_loss_clip": 0.06267807, + "balance_loss_mlp": 0.01253889, + "epoch": 0.9495866526379078, + "flos": 23082429242880.0, + "grad_norm": 1.6722001726685662, + "language_loss": 0.77888709, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.85551322, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09375, + "step": 15794, + "time_per_iteration": 2.497514247894287 + }, + { + "auxiliary_loss_clip": 0.06406198, + "auxiliary_loss_mlp": 0.0126488, + "balance_loss_clip": 0.06271206, + "balance_loss_mlp": 0.01254897, + "epoch": 0.9496467758905757, + "flos": 17535632987520.0, + "grad_norm": 1.9049729517954086, + "language_loss": 0.61179888, + "learning_rate": 2.650688769211107e-08, + "loss": 0.6885097, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09979248, + "step": 15795, + "time_per_iteration": 2.5063045024871826 + }, + { + "auxiliary_loss_clip": 0.06395505, + "auxiliary_loss_mlp": 0.01265243, + "balance_loss_clip": 0.06269419, + "balance_loss_mlp": 0.01255897, + "epoch": 0.9497068991432437, + "flos": 24140759437440.0, + "grad_norm": 1.51218594053535, + "language_loss": 0.79580635, + "learning_rate": 2.644372754577895e-08, + "loss": 0.87241381, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.09338379, + "step": 15796, + "time_per_iteration": 2.5217463970184326 + }, + { + "auxiliary_loss_clip": 0.06400493, + "auxiliary_loss_mlp": 0.01265932, + "balance_loss_clip": 0.06268636, + "balance_loss_mlp": 0.01255793, + "epoch": 0.9497670223959116, + "flos": 20309597130240.0, + "grad_norm": 1.9588104868661271, + "language_loss": 0.75637573, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.83303994, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10137939, + "step": 15797, + "time_per_iteration": 2.510477066040039 + }, + { + "auxiliary_loss_clip": 0.06401858, + "auxiliary_loss_mlp": 0.01262483, + "balance_loss_clip": 0.06269763, + "balance_loss_mlp": 0.01253071, + "epoch": 0.9498271456485796, + "flos": 13704009482880.0, + "grad_norm": 2.197071076360675, + "language_loss": 0.66319734, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.73984075, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09417725, + "step": 15798, + "time_per_iteration": 2.466979503631592 + }, + { + "auxiliary_loss_clip": 0.06409442, + "auxiliary_loss_mlp": 0.01265644, + "balance_loss_clip": 0.06273577, + "balance_loss_mlp": 0.01255994, + "epoch": 0.9498872689012475, + "flos": 20820348892800.0, + "grad_norm": 1.7595466908543556, + "language_loss": 0.77202052, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.84877139, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09646606, + "step": 15799, + "time_per_iteration": 2.502589225769043 + }, + { + "auxiliary_loss_clip": 0.06398167, + "auxiliary_loss_mlp": 0.01263962, + "balance_loss_clip": 0.06270022, + "balance_loss_mlp": 0.01254795, + "epoch": 0.9499473921539155, + "flos": 21039044849280.0, + "grad_norm": 1.7937393457780948, + "language_loss": 0.71204829, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.78866959, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.0916748, + "step": 15800, + "time_per_iteration": 2.496887683868408 + }, + { + "auxiliary_loss_clip": 0.06400058, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 0.06269508, + "balance_loss_mlp": 0.0125657, + "epoch": 0.9500075154065835, + "flos": 21005446561920.0, + "grad_norm": 1.7439251794642465, + "language_loss": 0.71854639, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.79520386, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09118652, + "step": 15801, + "time_per_iteration": 2.600644588470459 + }, + { + "auxiliary_loss_clip": 0.06403385, + "auxiliary_loss_mlp": 0.01263835, + "balance_loss_clip": 0.06271951, + "balance_loss_mlp": 0.01254578, + "epoch": 0.9500676386592515, + "flos": 25129461288960.0, + "grad_norm": 1.5167901940299169, + "language_loss": 0.81219077, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.88886297, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.0925293, + "step": 15802, + "time_per_iteration": 2.5559613704681396 + }, + { + "auxiliary_loss_clip": 0.06404102, + "auxiliary_loss_mlp": 0.01264645, + "balance_loss_clip": 0.06270744, + "balance_loss_mlp": 0.01254799, + "epoch": 0.9501277619119194, + "flos": 27530462908800.0, + "grad_norm": 1.5351955934289538, + "language_loss": 0.67835546, + "learning_rate": 2.60037021038646e-08, + "loss": 0.75504291, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09838867, + "step": 15803, + "time_per_iteration": 2.5468993186950684 + }, + { + "auxiliary_loss_clip": 0.06397918, + "auxiliary_loss_mlp": 0.01264702, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01254885, + "epoch": 0.9501878851645874, + "flos": 20820306965760.0, + "grad_norm": 1.6488350985874107, + "language_loss": 0.76223731, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.83886349, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.0982666, + "step": 15804, + "time_per_iteration": 2.482729434967041 + }, + { + "auxiliary_loss_clip": 0.06402758, + "auxiliary_loss_mlp": 0.01265776, + "balance_loss_clip": 0.06268962, + "balance_loss_mlp": 0.0125618, + "epoch": 0.9502480084172553, + "flos": 18375309152640.0, + "grad_norm": 3.607340173427983, + "language_loss": 0.73302132, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.80970663, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09606934, + "step": 15805, + "time_per_iteration": 2.4692134857177734 + }, + { + "auxiliary_loss_clip": 0.06401005, + "auxiliary_loss_mlp": 0.01268215, + "balance_loss_clip": 0.06270203, + "balance_loss_mlp": 0.01258422, + "epoch": 0.9503081316699233, + "flos": 23556270481920.0, + "grad_norm": 1.4624608104842494, + "language_loss": 0.80504966, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.88174188, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09790039, + "step": 15806, + "time_per_iteration": 2.5002782344818115 + }, + { + "auxiliary_loss_clip": 0.06403825, + "auxiliary_loss_mlp": 0.01266066, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.0125706, + "epoch": 0.9503682549225914, + "flos": 18046217041920.0, + "grad_norm": 1.909262236411516, + "language_loss": 0.82481933, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.90151823, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09008789, + "step": 15807, + "time_per_iteration": 2.4873461723327637 + }, + { + "auxiliary_loss_clip": 0.06400104, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.0627058, + "balance_loss_mlp": 0.01256564, + "epoch": 0.9504283781752593, + "flos": 25893429690240.0, + "grad_norm": 1.5955782807041765, + "language_loss": 0.7199322, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.79659086, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09204102, + "step": 15808, + "time_per_iteration": 2.540447473526001 + }, + { + "auxiliary_loss_clip": 0.06396027, + "auxiliary_loss_mlp": 0.01265833, + "balance_loss_clip": 0.06267738, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9504885014279273, + "flos": 22130009009280.0, + "grad_norm": 1.3816783547504883, + "language_loss": 0.69870842, + "learning_rate": 2.562945671948058e-08, + "loss": 0.77532703, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09216309, + "step": 15809, + "time_per_iteration": 2.4813284873962402 + }, + { + "auxiliary_loss_clip": 0.06396701, + "auxiliary_loss_mlp": 0.01261651, + "balance_loss_clip": 0.06267259, + "balance_loss_mlp": 0.01253027, + "epoch": 0.9505486246805952, + "flos": 21622317920640.0, + "grad_norm": 1.4773684576527446, + "language_loss": 0.75935221, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.83593571, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08630371, + "step": 15810, + "time_per_iteration": 2.4988956451416016 + }, + { + "auxiliary_loss_clip": 0.06400326, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 0.06269518, + "balance_loss_mlp": 0.01257865, + "epoch": 0.9506087479332632, + "flos": 22534766956800.0, + "grad_norm": 1.3339331298451294, + "language_loss": 0.80074775, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.87742716, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09753418, + "step": 15811, + "time_per_iteration": 3.8706562519073486 + }, + { + "auxiliary_loss_clip": 0.06399944, + "auxiliary_loss_mlp": 0.01264686, + "balance_loss_clip": 0.0626929, + "balance_loss_mlp": 0.01255168, + "epoch": 0.9506688711859311, + "flos": 27534823320960.0, + "grad_norm": 2.3021025111119133, + "language_loss": 0.70557272, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.78221905, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09509277, + "step": 15812, + "time_per_iteration": 2.5505876541137695 + }, + { + "auxiliary_loss_clip": 0.06405829, + "auxiliary_loss_mlp": 0.01262148, + "balance_loss_clip": 0.06271984, + "balance_loss_mlp": 0.01252826, + "epoch": 0.9507289944385992, + "flos": 19872037509120.0, + "grad_norm": 1.526419629738536, + "language_loss": 0.656178, + "learning_rate": 2.538145713158446e-08, + "loss": 0.73285776, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09320068, + "step": 15813, + "time_per_iteration": 2.504990816116333 + }, + { + "auxiliary_loss_clip": 0.06402929, + "auxiliary_loss_mlp": 0.01264397, + "balance_loss_clip": 0.06271286, + "balance_loss_mlp": 0.01254694, + "epoch": 0.9507891176912671, + "flos": 25200515151360.0, + "grad_norm": 1.3164663911360832, + "language_loss": 0.70462513, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.7812984, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.0970459, + "step": 15814, + "time_per_iteration": 2.5385372638702393 + }, + { + "auxiliary_loss_clip": 0.06397622, + "auxiliary_loss_mlp": 0.01262752, + "balance_loss_clip": 0.06269576, + "balance_loss_mlp": 0.01254253, + "epoch": 0.9508492409439351, + "flos": 24906446847360.0, + "grad_norm": 1.8852174609712755, + "language_loss": 0.63183349, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.7084372, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08496094, + "step": 15815, + "time_per_iteration": 2.53188419342041 + }, + { + "auxiliary_loss_clip": 0.06401452, + "auxiliary_loss_mlp": 0.01264924, + "balance_loss_clip": 0.06271219, + "balance_loss_mlp": 0.01255883, + "epoch": 0.950909364196603, + "flos": 29791033885440.0, + "grad_norm": 1.7527785707750094, + "language_loss": 0.59055346, + "learning_rate": 2.519624364862061e-08, + "loss": 0.66721725, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.0904541, + "step": 15816, + "time_per_iteration": 2.5678937435150146 + }, + { + "auxiliary_loss_clip": 0.06401551, + "auxiliary_loss_mlp": 0.01261651, + "balance_loss_clip": 0.0627007, + "balance_loss_mlp": 0.01252478, + "epoch": 0.950969487449271, + "flos": 24724745268480.0, + "grad_norm": 1.574607991311696, + "language_loss": 0.73901993, + "learning_rate": 2.513465558735994e-08, + "loss": 0.81565189, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09173584, + "step": 15817, + "time_per_iteration": 2.529062271118164 + }, + { + "auxiliary_loss_clip": 0.0640544, + "auxiliary_loss_mlp": 0.01266326, + "balance_loss_clip": 0.0627112, + "balance_loss_mlp": 0.0125611, + "epoch": 0.9510296107019389, + "flos": 13704302972160.0, + "grad_norm": 1.658723255681471, + "language_loss": 0.60563654, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.68235421, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10217285, + "step": 15818, + "time_per_iteration": 2.4677538871765137 + }, + { + "auxiliary_loss_clip": 0.06399883, + "auxiliary_loss_mlp": 0.01263447, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.01253767, + "epoch": 0.9510897339546069, + "flos": 17317691717760.0, + "grad_norm": 1.6828133029068784, + "language_loss": 0.69863963, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.77527297, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09680176, + "step": 15819, + "time_per_iteration": 3.972642421722412 + }, + { + "auxiliary_loss_clip": 0.0640963, + "auxiliary_loss_mlp": 0.0126202, + "balance_loss_clip": 0.06275742, + "balance_loss_mlp": 0.01253055, + "epoch": 0.951149857207275, + "flos": 14799292128000.0, + "grad_norm": 1.637089994669383, + "language_loss": 0.74310344, + "learning_rate": 2.49503407354561e-08, + "loss": 0.81981993, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.08966064, + "step": 15820, + "time_per_iteration": 2.514216184616089 + }, + { + "auxiliary_loss_clip": 0.06404862, + "auxiliary_loss_mlp": 0.0126351, + "balance_loss_clip": 0.06269901, + "balance_loss_mlp": 0.01253491, + "epoch": 0.9512099804599429, + "flos": 19397273875200.0, + "grad_norm": 1.8501796910784354, + "language_loss": 0.78652138, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.86320508, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10028076, + "step": 15821, + "time_per_iteration": 2.4915084838867188 + }, + { + "auxiliary_loss_clip": 0.06399101, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.0626865, + "balance_loss_mlp": 0.01252993, + "epoch": 0.9512701037126109, + "flos": 36766816871040.0, + "grad_norm": 1.4188367342021355, + "language_loss": 0.71510702, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.79172319, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09533691, + "step": 15822, + "time_per_iteration": 2.641385078430176 + }, + { + "auxiliary_loss_clip": 0.06399742, + "auxiliary_loss_mlp": 0.01263886, + "balance_loss_clip": 0.06270236, + "balance_loss_mlp": 0.01255202, + "epoch": 0.9513302269652788, + "flos": 22644911548800.0, + "grad_norm": 1.5230172306663716, + "language_loss": 0.6589359, + "learning_rate": 2.47666999302647e-08, + "loss": 0.73557216, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08691406, + "step": 15823, + "time_per_iteration": 2.6643285751342773 + }, + { + "auxiliary_loss_clip": 0.0639899, + "auxiliary_loss_mlp": 0.01264, + "balance_loss_clip": 0.06269787, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9513903502179468, + "flos": 22899847196160.0, + "grad_norm": 1.6545118844209308, + "language_loss": 0.77469099, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.85132086, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08959961, + "step": 15824, + "time_per_iteration": 2.560600757598877 + }, + { + "auxiliary_loss_clip": 0.06404689, + "auxiliary_loss_mlp": 0.01262938, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.0125274, + "epoch": 0.9514504734706147, + "flos": 27936143251200.0, + "grad_norm": 1.8708540735128236, + "language_loss": 0.74260736, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.8192836, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10192871, + "step": 15825, + "time_per_iteration": 2.566944122314453 + }, + { + "auxiliary_loss_clip": 0.06308633, + "auxiliary_loss_mlp": 0.01249825, + "balance_loss_clip": 0.06254488, + "balance_loss_mlp": 0.01248835, + "epoch": 0.9515105967232828, + "flos": 67386485381760.0, + "grad_norm": 0.8171627417310032, + "language_loss": 0.53219813, + "learning_rate": 2.458373323445806e-08, + "loss": 0.60778272, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00989532, + "step": 15826, + "time_per_iteration": 4.5212695598602295 + }, + { + "auxiliary_loss_clip": 0.06403694, + "auxiliary_loss_mlp": 0.01263494, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01253779, + "epoch": 0.9515707199759507, + "flos": 25853290784640.0, + "grad_norm": 1.7303662165905656, + "language_loss": 0.73298597, + "learning_rate": 2.452289414874076e-08, + "loss": 0.80965781, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09716797, + "step": 15827, + "time_per_iteration": 2.5447840690612793 + }, + { + "auxiliary_loss_clip": 0.06404355, + "auxiliary_loss_mlp": 0.01266063, + "balance_loss_clip": 0.06272575, + "balance_loss_mlp": 0.0125593, + "epoch": 0.9516308432286187, + "flos": 21834389404800.0, + "grad_norm": 1.8023851639179382, + "language_loss": 0.74833316, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.82503736, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.10131836, + "step": 15828, + "time_per_iteration": 2.4686501026153564 + }, + { + "auxiliary_loss_clip": 0.06403244, + "auxiliary_loss_mlp": 0.01265275, + "balance_loss_clip": 0.06274635, + "balance_loss_mlp": 0.01256406, + "epoch": 0.9516909664812866, + "flos": 27276617364480.0, + "grad_norm": 1.5487816970397665, + "language_loss": 0.73187357, + "learning_rate": 2.440144071047978e-08, + "loss": 0.80855876, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08874512, + "step": 15829, + "time_per_iteration": 2.542429208755493 + }, + { + "auxiliary_loss_clip": 0.06404226, + "auxiliary_loss_mlp": 0.01266859, + "balance_loss_clip": 0.06272517, + "balance_loss_mlp": 0.01258043, + "epoch": 0.9517510897339546, + "flos": 21221752677120.0, + "grad_norm": 1.837415216575745, + "language_loss": 0.61719525, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.69390613, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.08807373, + "step": 15830, + "time_per_iteration": 3.9777581691741943 + }, + { + "auxiliary_loss_clip": 0.06406231, + "auxiliary_loss_mlp": 0.01270469, + "balance_loss_clip": 0.06272319, + "balance_loss_mlp": 0.0126011, + "epoch": 0.9518112129866225, + "flos": 18739928194560.0, + "grad_norm": 2.4162096913039286, + "language_loss": 0.73349452, + "learning_rate": 2.428028693179729e-08, + "loss": 0.81026161, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10357666, + "step": 15831, + "time_per_iteration": 2.5067529678344727 + }, + { + "auxiliary_loss_clip": 0.0640035, + "auxiliary_loss_mlp": 0.01262917, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01253935, + "epoch": 0.9518713362392905, + "flos": 16769274744960.0, + "grad_norm": 1.653127425404805, + "language_loss": 0.65777677, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.73440945, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08984375, + "step": 15832, + "time_per_iteration": 2.4970624446868896 + }, + { + "auxiliary_loss_clip": 0.06398977, + "auxiliary_loss_mlp": 0.01268006, + "balance_loss_clip": 0.06273643, + "balance_loss_mlp": 0.01258475, + "epoch": 0.9519314594919586, + "flos": 15235887427200.0, + "grad_norm": 1.7378729185986037, + "language_loss": 0.7819438, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.85861361, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.09527588, + "step": 15833, + "time_per_iteration": 2.460865020751953 + }, + { + "auxiliary_loss_clip": 0.06399127, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.0627199, + "balance_loss_mlp": 0.01253261, + "epoch": 0.9519915827446265, + "flos": 19358770124160.0, + "grad_norm": 2.181001598505818, + "language_loss": 0.7522788, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.8288905, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.08782959, + "step": 15834, + "time_per_iteration": 2.5118231773376465 + }, + { + "auxiliary_loss_clip": 0.06410512, + "auxiliary_loss_mlp": 0.01265298, + "balance_loss_clip": 0.0627307, + "balance_loss_mlp": 0.01255403, + "epoch": 0.9520517059972945, + "flos": 22271697463680.0, + "grad_norm": 1.899995669990022, + "language_loss": 0.76650679, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.84326494, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09899902, + "step": 15835, + "time_per_iteration": 2.519073486328125 + }, + { + "auxiliary_loss_clip": 0.06403553, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06271343, + "balance_loss_mlp": 0.01255876, + "epoch": 0.9521118292499624, + "flos": 14866907973120.0, + "grad_norm": 2.201120374190252, + "language_loss": 0.66960144, + "learning_rate": 2.397871361623238e-08, + "loss": 0.74628842, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09265137, + "step": 15836, + "time_per_iteration": 2.5229427814483643 + }, + { + "auxiliary_loss_clip": 0.06397817, + "auxiliary_loss_mlp": 0.01262274, + "balance_loss_clip": 0.06269939, + "balance_loss_mlp": 0.01253011, + "epoch": 0.9521719525026304, + "flos": 23514747984000.0, + "grad_norm": 3.4643899323136553, + "language_loss": 0.70896757, + "learning_rate": 2.391862373676057e-08, + "loss": 0.78556848, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.0927124, + "step": 15837, + "time_per_iteration": 2.5268142223358154 + }, + { + "auxiliary_loss_clip": 0.06405401, + "auxiliary_loss_mlp": 0.01263888, + "balance_loss_clip": 0.06271522, + "balance_loss_mlp": 0.01253648, + "epoch": 0.9522320757552983, + "flos": 19720328492160.0, + "grad_norm": 2.1007938575310847, + "language_loss": 0.73421597, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.81090885, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10241699, + "step": 15838, + "time_per_iteration": 2.5051467418670654 + }, + { + "auxiliary_loss_clip": 0.06400177, + "auxiliary_loss_mlp": 0.0126335, + "balance_loss_clip": 0.06268861, + "balance_loss_mlp": 0.01254606, + "epoch": 0.9522921990079664, + "flos": 25928369642880.0, + "grad_norm": 1.8172457888979467, + "language_loss": 0.78152144, + "learning_rate": 2.379866877970449e-08, + "loss": 0.85815668, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08740234, + "step": 15839, + "time_per_iteration": 2.5397469997406006 + }, + { + "auxiliary_loss_clip": 0.06404764, + "auxiliary_loss_mlp": 0.01264586, + "balance_loss_clip": 0.06270763, + "balance_loss_mlp": 0.01255839, + "epoch": 0.9523523222606343, + "flos": 19214104849920.0, + "grad_norm": 1.5224815877407776, + "language_loss": 0.80422169, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.88091516, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.0874939, + "step": 15840, + "time_per_iteration": 2.50748872756958 + }, + { + "auxiliary_loss_clip": 0.06395362, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.01255753, + "epoch": 0.9524124455133023, + "flos": 20927265102720.0, + "grad_norm": 1.8175470123467525, + "language_loss": 0.73156214, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.80815464, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08129883, + "step": 15841, + "time_per_iteration": 2.4872241020202637 + }, + { + "auxiliary_loss_clip": 0.06395878, + "auxiliary_loss_mlp": 0.0126485, + "balance_loss_clip": 0.0627192, + "balance_loss_mlp": 0.01256249, + "epoch": 0.9524725687659702, + "flos": 18849527735040.0, + "grad_norm": 1.7833255311576237, + "language_loss": 0.79193342, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.86854064, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.0859375, + "step": 15842, + "time_per_iteration": 2.486121654510498 + }, + { + "auxiliary_loss_clip": 0.06402968, + "auxiliary_loss_mlp": 0.01264831, + "balance_loss_clip": 0.06273231, + "balance_loss_mlp": 0.01255526, + "epoch": 0.9525326920186382, + "flos": 22681318947840.0, + "grad_norm": 1.6085240870156023, + "language_loss": 0.72762179, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.80429983, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09307861, + "step": 15843, + "time_per_iteration": 2.523148775100708 + }, + { + "auxiliary_loss_clip": 0.06404278, + "auxiliary_loss_mlp": 0.01264952, + "balance_loss_clip": 0.06271654, + "balance_loss_mlp": 0.01255058, + "epoch": 0.9525928152713061, + "flos": 22092469580160.0, + "grad_norm": 1.502432873794168, + "language_loss": 0.78351128, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.86020356, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09899902, + "step": 15844, + "time_per_iteration": 2.524716377258301 + }, + { + "auxiliary_loss_clip": 0.06406938, + "auxiliary_loss_mlp": 0.01266712, + "balance_loss_clip": 0.06270063, + "balance_loss_mlp": 0.01255578, + "epoch": 0.9526529385239741, + "flos": 20711084768640.0, + "grad_norm": 2.0263100699563488, + "language_loss": 0.70321971, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.77995622, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.11138916, + "step": 15845, + "time_per_iteration": 2.500941038131714 + }, + { + "auxiliary_loss_clip": 0.06405, + "auxiliary_loss_mlp": 0.01265696, + "balance_loss_clip": 0.06269988, + "balance_loss_mlp": 0.01256338, + "epoch": 0.9527130617766422, + "flos": 23374820465280.0, + "grad_norm": 1.3991687644307798, + "language_loss": 0.75763822, + "learning_rate": 2.338118708818282e-08, + "loss": 0.83434522, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09356689, + "step": 15846, + "time_per_iteration": 2.5281105041503906 + }, + { + "auxiliary_loss_clip": 0.06399485, + "auxiliary_loss_mlp": 0.01262481, + "balance_loss_clip": 0.06267849, + "balance_loss_mlp": 0.01253689, + "epoch": 0.9527731850293101, + "flos": 18991341970560.0, + "grad_norm": 1.6178897673715225, + "language_loss": 0.78373063, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.86035025, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.0880127, + "step": 15847, + "time_per_iteration": 2.481491804122925 + }, + { + "auxiliary_loss_clip": 0.06396569, + "auxiliary_loss_mlp": 0.01264523, + "balance_loss_clip": 0.06267966, + "balance_loss_mlp": 0.01255624, + "epoch": 0.9528333082819781, + "flos": 19324123660800.0, + "grad_norm": 1.5274665589358507, + "language_loss": 0.77939975, + "learning_rate": 2.326258115328672e-08, + "loss": 0.85601068, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08905029, + "step": 15848, + "time_per_iteration": 2.5218746662139893 + }, + { + "auxiliary_loss_clip": 0.06409523, + "auxiliary_loss_mlp": 0.01266597, + "balance_loss_clip": 0.06273653, + "balance_loss_mlp": 0.01256178, + "epoch": 0.952893431534646, + "flos": 23958135463680.0, + "grad_norm": 1.5417221900752704, + "language_loss": 0.72081304, + "learning_rate": 2.320339062183674e-08, + "loss": 0.79757422, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10424805, + "step": 15849, + "time_per_iteration": 2.5317416191101074 + }, + { + "auxiliary_loss_clip": 0.06408659, + "auxiliary_loss_mlp": 0.01266373, + "balance_loss_clip": 0.0627094, + "balance_loss_mlp": 0.01255829, + "epoch": 0.952953554787314, + "flos": 21036529226880.0, + "grad_norm": 1.660132090953839, + "language_loss": 0.75134432, + "learning_rate": 2.314427505071226e-08, + "loss": 0.8280946, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.10546875, + "step": 15850, + "time_per_iteration": 3.9556667804718018 + }, + { + "auxiliary_loss_clip": 0.06401952, + "auxiliary_loss_mlp": 0.01264257, + "balance_loss_clip": 0.06270756, + "balance_loss_mlp": 0.01255198, + "epoch": 0.9530136780399819, + "flos": 22389472776960.0, + "grad_norm": 2.0028001807866973, + "language_loss": 0.72165865, + "learning_rate": 2.308523444215482e-08, + "loss": 0.79832071, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09063721, + "step": 15851, + "time_per_iteration": 2.546628952026367 + }, + { + "auxiliary_loss_clip": 0.06401463, + "auxiliary_loss_mlp": 0.01264181, + "balance_loss_clip": 0.06271876, + "balance_loss_mlp": 0.01255515, + "epoch": 0.95307380129265, + "flos": 22165452086400.0, + "grad_norm": 2.8670815366039606, + "language_loss": 0.79601598, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.87267244, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08666992, + "step": 15852, + "time_per_iteration": 2.514052152633667 + }, + { + "auxiliary_loss_clip": 0.06401996, + "auxiliary_loss_mlp": 0.01266531, + "balance_loss_clip": 0.06269959, + "balance_loss_mlp": 0.01257191, + "epoch": 0.9531339245453179, + "flos": 44033607486720.0, + "grad_norm": 2.022438359351555, + "language_loss": 0.59703016, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.67371547, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09344482, + "step": 15853, + "time_per_iteration": 2.73101806640625 + }, + { + "auxiliary_loss_clip": 0.06398737, + "auxiliary_loss_mlp": 0.01263116, + "balance_loss_clip": 0.06272894, + "balance_loss_mlp": 0.01254556, + "epoch": 0.9531940477979859, + "flos": 20272938168960.0, + "grad_norm": 1.6568412443068294, + "language_loss": 0.72921371, + "learning_rate": 2.290856241425998e-08, + "loss": 0.80583227, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08557129, + "step": 15854, + "time_per_iteration": 2.475628137588501 + }, + { + "auxiliary_loss_clip": 0.06404815, + "auxiliary_loss_mlp": 0.01262782, + "balance_loss_clip": 0.06271343, + "balance_loss_mlp": 0.01253573, + "epoch": 0.9532541710506538, + "flos": 25342413240960.0, + "grad_norm": 2.097114010753005, + "language_loss": 0.67732322, + "learning_rate": 2.284982167833127e-08, + "loss": 0.75399917, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.09216309, + "step": 15855, + "time_per_iteration": 2.5460567474365234 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.01267791, + "balance_loss_clip": 0.06270517, + "balance_loss_mlp": 0.01258147, + "epoch": 0.9533142943033218, + "flos": 26476576980480.0, + "grad_norm": 1.5411782595098198, + "language_loss": 0.76690978, + "learning_rate": 2.279115591613556e-08, + "loss": 0.84360075, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09631348, + "step": 15856, + "time_per_iteration": 2.5271217823028564 + }, + { + "auxiliary_loss_clip": 0.06399896, + "auxiliary_loss_mlp": 0.0126262, + "balance_loss_clip": 0.06270279, + "balance_loss_mlp": 0.01253203, + "epoch": 0.9533744175559897, + "flos": 23663270545920.0, + "grad_norm": 1.480276533024058, + "language_loss": 0.77887392, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.85549903, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09423828, + "step": 15857, + "time_per_iteration": 2.526076555252075 + }, + { + "auxiliary_loss_clip": 0.06311148, + "auxiliary_loss_mlp": 0.01248159, + "balance_loss_clip": 0.06257018, + "balance_loss_mlp": 0.01247038, + "epoch": 0.9534345408086577, + "flos": 61070270209920.0, + "grad_norm": 0.6905807509758151, + "language_loss": 0.62508583, + "learning_rate": 2.267404932183803e-08, + "loss": 0.70067894, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01124573, + "step": 15858, + "time_per_iteration": 4.573625564575195 + }, + { + "auxiliary_loss_clip": 0.06398419, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06267797, + "balance_loss_mlp": 0.01254243, + "epoch": 0.9534946640613258, + "flos": 18957450193920.0, + "grad_norm": 1.4570268848956331, + "language_loss": 0.57324982, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.64986312, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08666992, + "step": 15859, + "time_per_iteration": 2.5377213954925537 + }, + { + "auxiliary_loss_clip": 0.06395449, + "auxiliary_loss_mlp": 0.01262921, + "balance_loss_clip": 0.06269926, + "balance_loss_mlp": 0.01254517, + "epoch": 0.9535547873139937, + "flos": 16659884839680.0, + "grad_norm": 1.9960585900313483, + "language_loss": 0.81999767, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.89658141, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.08398438, + "step": 15860, + "time_per_iteration": 2.499480962753296 + }, + { + "auxiliary_loss_clip": 0.0640311, + "auxiliary_loss_mlp": 0.01263902, + "balance_loss_clip": 0.06270963, + "balance_loss_mlp": 0.01254842, + "epoch": 0.9536149105666617, + "flos": 20674048464000.0, + "grad_norm": 1.6914081967904189, + "language_loss": 0.67099893, + "learning_rate": 2.249895178891159e-08, + "loss": 0.74766904, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09057617, + "step": 15861, + "time_per_iteration": 2.5145528316497803 + }, + { + "auxiliary_loss_clip": 0.06404839, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 0.06272271, + "balance_loss_mlp": 0.0125743, + "epoch": 0.9536750338193296, + "flos": 30708304531200.0, + "grad_norm": 1.7038056043376955, + "language_loss": 0.65918678, + "learning_rate": 2.244073591573037e-08, + "loss": 0.73590457, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09527588, + "step": 15862, + "time_per_iteration": 2.603203535079956 + }, + { + "auxiliary_loss_clip": 0.06399581, + "auxiliary_loss_mlp": 0.01268128, + "balance_loss_clip": 0.06274375, + "balance_loss_mlp": 0.01259566, + "epoch": 0.9537351570719976, + "flos": 20410559700480.0, + "grad_norm": 1.9688037838707206, + "language_loss": 0.67976749, + "learning_rate": 2.238259503179485e-08, + "loss": 0.75644457, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08566284, + "step": 15863, + "time_per_iteration": 2.4922752380371094 + }, + { + "auxiliary_loss_clip": 0.0639983, + "auxiliary_loss_mlp": 0.01266275, + "balance_loss_clip": 0.06269602, + "balance_loss_mlp": 0.01257543, + "epoch": 0.9537952803246655, + "flos": 29936076503040.0, + "grad_norm": 1.911779704928809, + "language_loss": 0.78732878, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.86398983, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08728027, + "step": 15864, + "time_per_iteration": 2.5733559131622314 + }, + { + "auxiliary_loss_clip": 0.06401516, + "auxiliary_loss_mlp": 0.01262552, + "balance_loss_clip": 0.06272466, + "balance_loss_mlp": 0.01253886, + "epoch": 0.9538554035773336, + "flos": 20527580327040.0, + "grad_norm": 1.9107480949648576, + "language_loss": 0.59663749, + "learning_rate": 2.226653824047586e-08, + "loss": 0.67327815, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08660889, + "step": 15865, + "time_per_iteration": 2.497642993927002 + }, + { + "auxiliary_loss_clip": 0.06402111, + "auxiliary_loss_mlp": 0.01268229, + "balance_loss_clip": 0.06271199, + "balance_loss_mlp": 0.01259008, + "epoch": 0.9539155268300015, + "flos": 18412555092480.0, + "grad_norm": 1.675507337482719, + "language_loss": 0.69925714, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.77596056, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09222412, + "step": 15866, + "time_per_iteration": 4.007173299789429 + }, + { + "auxiliary_loss_clip": 0.06402818, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 0.06272386, + "balance_loss_mlp": 0.01257846, + "epoch": 0.9539756500826695, + "flos": 26220425448960.0, + "grad_norm": 3.175329411462929, + "language_loss": 0.85554373, + "learning_rate": 2.215078143255855e-08, + "loss": 0.93224895, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09863281, + "step": 15867, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.06306315, + "auxiliary_loss_mlp": 0.01249791, + "balance_loss_clip": 0.06252342, + "balance_loss_mlp": 0.01248795, + "epoch": 0.9540357733353374, + "flos": 68310673989120.0, + "grad_norm": 0.7435794957212412, + "language_loss": 0.61859345, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.6941545, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00995636, + "step": 15868, + "time_per_iteration": 3.1682119369506836 + }, + { + "auxiliary_loss_clip": 0.06400545, + "auxiliary_loss_mlp": 0.01265566, + "balance_loss_clip": 0.06270404, + "balance_loss_mlp": 0.01256417, + "epoch": 0.9540958965880054, + "flos": 21294693256320.0, + "grad_norm": 1.9119613617330347, + "language_loss": 0.60321581, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.67987692, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.0914917, + "step": 15869, + "time_per_iteration": 3.9212167263031006 + }, + { + "auxiliary_loss_clip": 0.06401898, + "auxiliary_loss_mlp": 0.01263992, + "balance_loss_clip": 0.06271598, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9541560198406733, + "flos": 19756819745280.0, + "grad_norm": 1.512796436338129, + "language_loss": 0.71245605, + "learning_rate": 2.197770872795579e-08, + "loss": 0.78911495, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08239746, + "step": 15870, + "time_per_iteration": 2.5445003509521484 + }, + { + "auxiliary_loss_clip": 0.06398092, + "auxiliary_loss_mlp": 0.01262742, + "balance_loss_clip": 0.06267514, + "balance_loss_mlp": 0.01253176, + "epoch": 0.9542161430933414, + "flos": 24722229646080.0, + "grad_norm": 2.7015684448513255, + "language_loss": 0.77101582, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.84762418, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09564209, + "step": 15871, + "time_per_iteration": 2.5978353023529053 + }, + { + "auxiliary_loss_clip": 0.06402687, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.062712, + "balance_loss_mlp": 0.01256083, + "epoch": 0.9542762663460094, + "flos": 31073762113920.0, + "grad_norm": 1.836271204712955, + "language_loss": 0.58700699, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.663697, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10229492, + "step": 15872, + "time_per_iteration": 2.569619655609131 + }, + { + "auxiliary_loss_clip": 0.06405389, + "auxiliary_loss_mlp": 0.0126534, + "balance_loss_clip": 0.06270651, + "balance_loss_mlp": 0.01255404, + "epoch": 0.9543363895986773, + "flos": 20782935244800.0, + "grad_norm": 1.4552113328660328, + "language_loss": 0.75296628, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.82967359, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.09936523, + "step": 15873, + "time_per_iteration": 2.5089752674102783 + }, + { + "auxiliary_loss_clip": 0.06401756, + "auxiliary_loss_mlp": 0.01263022, + "balance_loss_clip": 0.06270768, + "balance_loss_mlp": 0.01253205, + "epoch": 0.9543965128513453, + "flos": 24469725767040.0, + "grad_norm": 1.8570446909627079, + "language_loss": 0.62738776, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.70403558, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0980835, + "step": 15874, + "time_per_iteration": 2.529031276702881 + }, + { + "auxiliary_loss_clip": 0.06400266, + "auxiliary_loss_mlp": 0.01264719, + "balance_loss_clip": 0.06271619, + "balance_loss_mlp": 0.01255719, + "epoch": 0.9544566361040132, + "flos": 15265838062080.0, + "grad_norm": 2.078620235439226, + "language_loss": 0.89995027, + "learning_rate": 2.169075438538104e-08, + "loss": 0.97660017, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08990479, + "step": 15875, + "time_per_iteration": 2.4796183109283447 + }, + { + "auxiliary_loss_clip": 0.06407903, + "auxiliary_loss_mlp": 0.01264624, + "balance_loss_clip": 0.06271803, + "balance_loss_mlp": 0.01254408, + "epoch": 0.9545167593566812, + "flos": 25925434750080.0, + "grad_norm": 1.6683219273292442, + "language_loss": 0.67765808, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.75438333, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10211182, + "step": 15876, + "time_per_iteration": 2.5184824466705322 + }, + { + "auxiliary_loss_clip": 0.06402661, + "auxiliary_loss_mlp": 0.01266519, + "balance_loss_clip": 0.06270103, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9545768826093491, + "flos": 25635014098560.0, + "grad_norm": 1.790004894907314, + "language_loss": 0.69065815, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.76734996, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09967041, + "step": 15877, + "time_per_iteration": 2.5234038829803467 + }, + { + "auxiliary_loss_clip": 0.06403767, + "auxiliary_loss_mlp": 0.01263566, + "balance_loss_clip": 0.06269973, + "balance_loss_mlp": 0.01253815, + "epoch": 0.9546370058620172, + "flos": 22497982214400.0, + "grad_norm": 2.586759661224603, + "language_loss": 0.70764804, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.78432131, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09741211, + "step": 15878, + "time_per_iteration": 2.5088722705841064 + }, + { + "auxiliary_loss_clip": 0.06397127, + "auxiliary_loss_mlp": 0.01262444, + "balance_loss_clip": 0.06268129, + "balance_loss_mlp": 0.01253634, + "epoch": 0.9546971291146851, + "flos": 24616738955520.0, + "grad_norm": 1.3740237570513218, + "language_loss": 0.68291056, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.75950634, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08813477, + "step": 15879, + "time_per_iteration": 2.5728976726531982 + }, + { + "auxiliary_loss_clip": 0.06399859, + "auxiliary_loss_mlp": 0.01263148, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.01253951, + "epoch": 0.9547572523673531, + "flos": 28665297480960.0, + "grad_norm": 1.9338134404565663, + "language_loss": 0.85166907, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.92829913, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09210205, + "step": 15880, + "time_per_iteration": 2.682302713394165 + }, + { + "auxiliary_loss_clip": 0.06399159, + "auxiliary_loss_mlp": 0.0126469, + "balance_loss_clip": 0.06267743, + "balance_loss_mlp": 0.01255326, + "epoch": 0.954817375620021, + "flos": 33811067295360.0, + "grad_norm": 1.7252221713052975, + "language_loss": 0.72050363, + "learning_rate": 2.134888478151753e-08, + "loss": 0.79714215, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09362793, + "step": 15881, + "time_per_iteration": 2.6338717937469482 + }, + { + "auxiliary_loss_clip": 0.06399329, + "auxiliary_loss_mlp": 0.01264091, + "balance_loss_clip": 0.06269658, + "balance_loss_mlp": 0.01254596, + "epoch": 0.954877498872689, + "flos": 14433373347840.0, + "grad_norm": 2.028539816265887, + "language_loss": 0.72078586, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.79742002, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09490967, + "step": 15882, + "time_per_iteration": 2.4859602451324463 + }, + { + "auxiliary_loss_clip": 0.06404308, + "auxiliary_loss_mlp": 0.01264748, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01255748, + "epoch": 0.9549376221253569, + "flos": 59282129681280.0, + "grad_norm": 1.6753503027814232, + "language_loss": 0.66631484, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.74300539, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09002686, + "step": 15883, + "time_per_iteration": 2.858281373977661 + }, + { + "auxiliary_loss_clip": 0.06403692, + "auxiliary_loss_mlp": 0.01264383, + "balance_loss_clip": 0.06270359, + "balance_loss_mlp": 0.01254167, + "epoch": 0.954997745378025, + "flos": 17280068434560.0, + "grad_norm": 2.018487507978806, + "language_loss": 0.77985692, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.85653764, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10223389, + "step": 15884, + "time_per_iteration": 2.4717769622802734 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01263008, + "balance_loss_clip": 0.06270074, + "balance_loss_mlp": 0.01253949, + "epoch": 0.955057868630693, + "flos": 13011472287360.0, + "grad_norm": 1.7174754271027919, + "language_loss": 0.7789489, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.85560703, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09057617, + "step": 15885, + "time_per_iteration": 2.47308087348938 + }, + { + "auxiliary_loss_clip": 0.06403592, + "auxiliary_loss_mlp": 0.01264253, + "balance_loss_clip": 0.062719, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9551179918833609, + "flos": 22644240716160.0, + "grad_norm": 1.6851003761813457, + "language_loss": 0.70151675, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.77819514, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.08929443, + "step": 15886, + "time_per_iteration": 2.5218918323516846 + }, + { + "auxiliary_loss_clip": 0.0640685, + "auxiliary_loss_mlp": 0.01264104, + "balance_loss_clip": 0.0627156, + "balance_loss_mlp": 0.01253261, + "epoch": 0.9551781151360289, + "flos": 21549125779200.0, + "grad_norm": 1.64294120083005, + "language_loss": 0.72599673, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.8027063, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10852051, + "step": 15887, + "time_per_iteration": 2.5182785987854004 + }, + { + "auxiliary_loss_clip": 0.06395856, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 0.06269971, + "balance_loss_mlp": 0.01254534, + "epoch": 0.9552382383886968, + "flos": 20708191802880.0, + "grad_norm": 1.9888087849985687, + "language_loss": 0.57053173, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.64712757, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.09197998, + "step": 15888, + "time_per_iteration": 2.4768011569976807 + }, + { + "auxiliary_loss_clip": 0.06306279, + "auxiliary_loss_mlp": 0.01249003, + "balance_loss_clip": 0.06252466, + "balance_loss_mlp": 0.01247993, + "epoch": 0.9552983616413648, + "flos": 67789859736960.0, + "grad_norm": 0.6952192032198299, + "language_loss": 0.57792616, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.65347898, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01010132, + "step": 15889, + "time_per_iteration": 3.172846794128418 + }, + { + "auxiliary_loss_clip": 0.06403498, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 0.06267909, + "balance_loss_mlp": 0.01254278, + "epoch": 0.9553584848940327, + "flos": 21586413646080.0, + "grad_norm": 1.3421643090083555, + "language_loss": 0.66883469, + "learning_rate": 2.084114508877466e-08, + "loss": 0.74551147, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.09906006, + "step": 15890, + "time_per_iteration": 3.8940742015838623 + }, + { + "auxiliary_loss_clip": 0.06402219, + "auxiliary_loss_mlp": 0.01263198, + "balance_loss_clip": 0.06271324, + "balance_loss_mlp": 0.01254263, + "epoch": 0.9554186081467008, + "flos": 24215251317120.0, + "grad_norm": 1.4384385434971376, + "language_loss": 0.74144399, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.81809819, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08929443, + "step": 15891, + "time_per_iteration": 2.506944179534912 + }, + { + "auxiliary_loss_clip": 0.06399399, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06272076, + "balance_loss_mlp": 0.01254945, + "epoch": 0.9554787313993687, + "flos": 16256845900800.0, + "grad_norm": 1.9258127915032677, + "language_loss": 0.78508484, + "learning_rate": 2.072913954011435e-08, + "loss": 0.86171877, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09039307, + "step": 15892, + "time_per_iteration": 2.4849460124969482 + }, + { + "auxiliary_loss_clip": 0.06401937, + "auxiliary_loss_mlp": 0.01264788, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9555388546520367, + "flos": 23410850520960.0, + "grad_norm": 4.041459820212515, + "language_loss": 0.69976628, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.77643347, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.09741211, + "step": 15893, + "time_per_iteration": 2.5242111682891846 + }, + { + "auxiliary_loss_clip": 0.06400245, + "auxiliary_loss_mlp": 0.01265117, + "balance_loss_clip": 0.06271841, + "balance_loss_mlp": 0.0125474, + "epoch": 0.9555989779047046, + "flos": 14799417909120.0, + "grad_norm": 1.7085696744264771, + "language_loss": 0.66091406, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.73756772, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.10375977, + "step": 15894, + "time_per_iteration": 2.4788177013397217 + }, + { + "auxiliary_loss_clip": 0.06402315, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06269212, + "balance_loss_mlp": 0.01255175, + "epoch": 0.9556591011573726, + "flos": 22243298129280.0, + "grad_norm": 1.7483661442382448, + "language_loss": 0.82017207, + "learning_rate": 2.056169412853581e-08, + "loss": 0.89684647, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09954834, + "step": 15895, + "time_per_iteration": 2.498887777328491 + }, + { + "auxiliary_loss_clip": 0.06403477, + "auxiliary_loss_mlp": 0.0126659, + "balance_loss_clip": 0.06272532, + "balance_loss_mlp": 0.01257476, + "epoch": 0.9557192244100405, + "flos": 27862741474560.0, + "grad_norm": 1.5507506491352763, + "language_loss": 0.72899592, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.80569655, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09118652, + "step": 15896, + "time_per_iteration": 2.564551830291748 + }, + { + "auxiliary_loss_clip": 0.06398298, + "auxiliary_loss_mlp": 0.01264488, + "balance_loss_clip": 0.06269994, + "balance_loss_mlp": 0.0125488, + "epoch": 0.9557793476627086, + "flos": 17608531639680.0, + "grad_norm": 1.9204607289870128, + "language_loss": 0.79759163, + "learning_rate": 2.045043915311706e-08, + "loss": 0.87421948, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09613037, + "step": 15897, + "time_per_iteration": 3.9504964351654053 + }, + { + "auxiliary_loss_clip": 0.06402426, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06270665, + "balance_loss_mlp": 0.01253601, + "epoch": 0.9558394709153766, + "flos": 23881798794240.0, + "grad_norm": 1.5071236590809027, + "language_loss": 0.72668207, + "learning_rate": 2.03949242614303e-08, + "loss": 0.80334115, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09884644, + "step": 15898, + "time_per_iteration": 2.5182039737701416 + }, + { + "auxiliary_loss_clip": 0.06307501, + "auxiliary_loss_mlp": 0.01250726, + "balance_loss_clip": 0.06253572, + "balance_loss_mlp": 0.01249622, + "epoch": 0.9558995941680445, + "flos": 53698995152640.0, + "grad_norm": 0.8900999457262652, + "language_loss": 0.52336156, + "learning_rate": 2.033948443656652e-08, + "loss": 0.59894383, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01106262, + "step": 15899, + "time_per_iteration": 3.0710113048553467 + }, + { + "auxiliary_loss_clip": 0.06409016, + "auxiliary_loss_mlp": 0.01266314, + "balance_loss_clip": 0.06271899, + "balance_loss_mlp": 0.0125565, + "epoch": 0.9559597174207125, + "flos": 13768355018880.0, + "grad_norm": 3.1416892180470533, + "language_loss": 0.69164026, + "learning_rate": 2.028411968062782e-08, + "loss": 0.76839364, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.10662842, + "step": 15900, + "time_per_iteration": 2.4697251319885254 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01264197, + "balance_loss_clip": 0.06269322, + "balance_loss_mlp": 0.01254553, + "epoch": 0.9560198406733804, + "flos": 19942210903680.0, + "grad_norm": 1.799845968546889, + "language_loss": 0.83136785, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.9080174, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09637451, + "step": 15901, + "time_per_iteration": 2.4810070991516113 + }, + { + "auxiliary_loss_clip": 0.06309229, + "auxiliary_loss_mlp": 0.0125074, + "balance_loss_clip": 0.06255125, + "balance_loss_mlp": 0.01249663, + "epoch": 0.9560799639260484, + "flos": 57306388331520.0, + "grad_norm": 0.8422702355549128, + "language_loss": 0.54080284, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.61640251, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.54199219, + "router_z_loss_mlp": 0.01078033, + "step": 15902, + "time_per_iteration": 3.218306303024292 + }, + { + "auxiliary_loss_clip": 0.06393287, + "auxiliary_loss_mlp": 0.012633, + "balance_loss_clip": 0.06269377, + "balance_loss_mlp": 0.01255715, + "epoch": 0.9561400871787163, + "flos": 18923264928000.0, + "grad_norm": 1.5636157887301885, + "language_loss": 0.85598201, + "learning_rate": 2.01184758473425e-08, + "loss": 0.93254787, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.07583618, + "step": 15903, + "time_per_iteration": 2.540703773498535 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 0.0626982, + "balance_loss_mlp": 0.01256205, + "epoch": 0.9562002104313844, + "flos": 18044036835840.0, + "grad_norm": 2.1727192495909162, + "language_loss": 0.80775261, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.88440645, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.08758545, + "step": 15904, + "time_per_iteration": 2.5238215923309326 + }, + { + "auxiliary_loss_clip": 0.06405573, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 0.06272165, + "balance_loss_mlp": 0.01253791, + "epoch": 0.9562603336840523, + "flos": 24724619487360.0, + "grad_norm": 2.309004230193841, + "language_loss": 0.60495961, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.68165493, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10180664, + "step": 15905, + "time_per_iteration": 2.5484659671783447 + }, + { + "auxiliary_loss_clip": 0.0639789, + "auxiliary_loss_mlp": 0.0126401, + "balance_loss_clip": 0.06268601, + "balance_loss_mlp": 0.01255028, + "epoch": 0.9563204569367203, + "flos": 21183332780160.0, + "grad_norm": 1.726875839834982, + "language_loss": 0.70595205, + "learning_rate": 1.995350770979254e-08, + "loss": 0.78257102, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08990479, + "step": 15906, + "time_per_iteration": 3.9245364665985107 + }, + { + "auxiliary_loss_clip": 0.06408137, + "auxiliary_loss_mlp": 0.01263171, + "balance_loss_clip": 0.06272523, + "balance_loss_mlp": 0.0125271, + "epoch": 0.9563805801893882, + "flos": 20235901864320.0, + "grad_norm": 1.7588326158627845, + "language_loss": 0.70970643, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.78641951, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10455322, + "step": 15907, + "time_per_iteration": 2.4972972869873047 + }, + { + "auxiliary_loss_clip": 0.06399702, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 0.06271538, + "balance_loss_mlp": 0.01257567, + "epoch": 0.9564407034420562, + "flos": 25418079077760.0, + "grad_norm": 2.2506849509040543, + "language_loss": 0.70946819, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.78612697, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08605957, + "step": 15908, + "time_per_iteration": 2.5539722442626953 + }, + { + "auxiliary_loss_clip": 0.06401962, + "auxiliary_loss_mlp": 0.01264879, + "balance_loss_clip": 0.06271769, + "balance_loss_mlp": 0.01256022, + "epoch": 0.9565008266947241, + "flos": 18629699748480.0, + "grad_norm": 2.0637627701483607, + "language_loss": 0.82866412, + "learning_rate": 1.978921532427802e-08, + "loss": 0.90533257, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08856201, + "step": 15909, + "time_per_iteration": 3.9678423404693604 + }, + { + "auxiliary_loss_clip": 0.06401636, + "auxiliary_loss_mlp": 0.01262877, + "balance_loss_clip": 0.06272514, + "balance_loss_mlp": 0.01253639, + "epoch": 0.9565609499473922, + "flos": 24868865491200.0, + "grad_norm": 1.7859019883624712, + "language_loss": 0.67964911, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.75629425, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09234619, + "step": 15910, + "time_per_iteration": 2.5445590019226074 + }, + { + "auxiliary_loss_clip": 0.06406734, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06272303, + "balance_loss_mlp": 0.01253886, + "epoch": 0.9566210732000601, + "flos": 21804858040320.0, + "grad_norm": 2.0219141580296256, + "language_loss": 0.74345183, + "learning_rate": 1.968006251276444e-08, + "loss": 0.82015193, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09393311, + "step": 15911, + "time_per_iteration": 2.5246856212615967 + }, + { + "auxiliary_loss_clip": 0.06402273, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 0.06270364, + "balance_loss_mlp": 0.01257348, + "epoch": 0.9566811964527281, + "flos": 18703562722560.0, + "grad_norm": 1.7881819879076843, + "language_loss": 0.6983766, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.77506196, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.08911133, + "step": 15912, + "time_per_iteration": 2.4712233543395996 + }, + { + "auxiliary_loss_clip": 0.06402682, + "auxiliary_loss_mlp": 0.012665, + "balance_loss_clip": 0.06271908, + "balance_loss_mlp": 0.01257058, + "epoch": 0.9567413197053961, + "flos": 13004763960960.0, + "grad_norm": 3.3702578825008147, + "language_loss": 0.72631347, + "learning_rate": 1.95712100769696e-08, + "loss": 0.80300522, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09442139, + "step": 15913, + "time_per_iteration": 2.491908311843872 + }, + { + "auxiliary_loss_clip": 0.06399457, + "auxiliary_loss_mlp": 0.01266561, + "balance_loss_clip": 0.06270806, + "balance_loss_mlp": 0.01257335, + "epoch": 0.956801442958064, + "flos": 19725401664000.0, + "grad_norm": 2.223834124894749, + "language_loss": 0.73728657, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.81394672, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09222412, + "step": 15914, + "time_per_iteration": 2.4964563846588135 + }, + { + "auxiliary_loss_clip": 0.06398837, + "auxiliary_loss_mlp": 0.01263467, + "balance_loss_clip": 0.06268872, + "balance_loss_mlp": 0.01253978, + "epoch": 0.956861566210732, + "flos": 18228631380480.0, + "grad_norm": 1.552289311371977, + "language_loss": 0.67290843, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.74953151, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09484863, + "step": 15915, + "time_per_iteration": 2.5190324783325195 + }, + { + "auxiliary_loss_clip": 0.06394604, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 0.06268029, + "balance_loss_mlp": 0.01255014, + "epoch": 0.9569216894634, + "flos": 22202949588480.0, + "grad_norm": 1.6620877394499343, + "language_loss": 0.64458013, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.72116756, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.09124756, + "step": 15916, + "time_per_iteration": 2.491138219833374 + }, + { + "auxiliary_loss_clip": 0.06395577, + "auxiliary_loss_mlp": 0.01263704, + "balance_loss_clip": 0.0627159, + "balance_loss_mlp": 0.01255464, + "epoch": 0.956981812716068, + "flos": 21695719697280.0, + "grad_norm": 1.8254745953624876, + "language_loss": 0.80804276, + "learning_rate": 1.935440639853536e-08, + "loss": 0.88463557, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.08242798, + "step": 15917, + "time_per_iteration": 2.5050711631774902 + }, + { + "auxiliary_loss_clip": 0.06400816, + "auxiliary_loss_mlp": 0.01268269, + "balance_loss_clip": 0.06271309, + "balance_loss_mlp": 0.01258321, + "epoch": 0.9570419359687359, + "flos": 13996065288960.0, + "grad_norm": 1.542027352693381, + "language_loss": 0.73089451, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.80758536, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09960938, + "step": 15918, + "time_per_iteration": 2.4727392196655273 + }, + { + "auxiliary_loss_clip": 0.06310041, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06256156, + "balance_loss_mlp": 0.01249783, + "epoch": 0.9571020592214039, + "flos": 65219525015040.0, + "grad_norm": 0.6115592062767367, + "language_loss": 0.53111994, + "learning_rate": 1.924645518878032e-08, + "loss": 0.6067282, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0100174, + "step": 15919, + "time_per_iteration": 3.272456645965576 + }, + { + "auxiliary_loss_clip": 0.064109, + "auxiliary_loss_mlp": 0.01269147, + "balance_loss_clip": 0.06275045, + "balance_loss_mlp": 0.01258651, + "epoch": 0.9571621824740718, + "flos": 17389793756160.0, + "grad_norm": 2.6495483249351137, + "language_loss": 0.76336288, + "learning_rate": 1.919259224843972e-08, + "loss": 0.84016335, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1050415, + "step": 15920, + "time_per_iteration": 2.536787509918213 + }, + { + "auxiliary_loss_clip": 0.0640638, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 0.06273204, + "balance_loss_mlp": 0.0125712, + "epoch": 0.9572223057267398, + "flos": 14543434085760.0, + "grad_norm": 1.7185782559349, + "language_loss": 0.79365337, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.87039036, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10198975, + "step": 15921, + "time_per_iteration": 2.5111634731292725 + }, + { + "auxiliary_loss_clip": 0.06408585, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01254396, + "epoch": 0.9572824289794077, + "flos": 33956151840000.0, + "grad_norm": 1.7702021043483893, + "language_loss": 0.5147, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.59143382, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10406494, + "step": 15922, + "time_per_iteration": 2.5946807861328125 + }, + { + "auxiliary_loss_clip": 0.06403722, + "auxiliary_loss_mlp": 0.01265384, + "balance_loss_clip": 0.06269565, + "balance_loss_mlp": 0.01255764, + "epoch": 0.9573425522320758, + "flos": 18700418194560.0, + "grad_norm": 1.9436710836250617, + "language_loss": 0.84095252, + "learning_rate": 1.903145411006557e-08, + "loss": 0.91764355, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09625244, + "step": 15923, + "time_per_iteration": 2.478198289871216 + }, + { + "auxiliary_loss_clip": 0.06399676, + "auxiliary_loss_mlp": 0.0126405, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01255187, + "epoch": 0.9574026754847437, + "flos": 28517571532800.0, + "grad_norm": 1.5492156766676946, + "language_loss": 0.7513963, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.82803351, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08862305, + "step": 15924, + "time_per_iteration": 2.5611090660095215 + }, + { + "auxiliary_loss_clip": 0.06402448, + "auxiliary_loss_mlp": 0.01262647, + "balance_loss_clip": 0.06271331, + "balance_loss_mlp": 0.01253594, + "epoch": 0.9574627987374117, + "flos": 24359203831680.0, + "grad_norm": 2.195724562368793, + "language_loss": 0.86041164, + "learning_rate": 1.892440427371711e-08, + "loss": 0.93706262, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09051514, + "step": 15925, + "time_per_iteration": 2.5580694675445557 + }, + { + "auxiliary_loss_clip": 0.06405063, + "auxiliary_loss_mlp": 0.01265178, + "balance_loss_clip": 0.06269714, + "balance_loss_mlp": 0.01255004, + "epoch": 0.9575229219900797, + "flos": 23516928190080.0, + "grad_norm": 1.83782139466113, + "language_loss": 0.76031494, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.8370173, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.10174561, + "step": 15926, + "time_per_iteration": 2.6703908443450928 + }, + { + "auxiliary_loss_clip": 0.06407382, + "auxiliary_loss_mlp": 0.0126202, + "balance_loss_clip": 0.06274736, + "balance_loss_mlp": 0.01253759, + "epoch": 0.9575830452427476, + "flos": 22681486656000.0, + "grad_norm": 1.5772300841265903, + "language_loss": 0.78243768, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.85913169, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.08276367, + "step": 15927, + "time_per_iteration": 2.569844961166382 + }, + { + "auxiliary_loss_clip": 0.06403775, + "auxiliary_loss_mlp": 0.01266085, + "balance_loss_clip": 0.06269503, + "balance_loss_mlp": 0.01255643, + "epoch": 0.9576431684954156, + "flos": 30493633570560.0, + "grad_norm": 1.749047653525374, + "language_loss": 0.68875557, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.76545417, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10449219, + "step": 15928, + "time_per_iteration": 2.6400134563446045 + }, + { + "auxiliary_loss_clip": 0.0640448, + "auxiliary_loss_mlp": 0.01264922, + "balance_loss_clip": 0.06272465, + "balance_loss_mlp": 0.01255296, + "epoch": 0.9577032917480836, + "flos": 21693497564160.0, + "grad_norm": 1.7657767995871196, + "language_loss": 0.82337755, + "learning_rate": 1.871120608822485e-08, + "loss": 0.90007156, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09631348, + "step": 15929, + "time_per_iteration": 2.537607431411743 + }, + { + "auxiliary_loss_clip": 0.06409724, + "auxiliary_loss_mlp": 0.01267462, + "balance_loss_clip": 0.06272496, + "balance_loss_mlp": 0.01257663, + "epoch": 0.9577634150007516, + "flos": 29030838917760.0, + "grad_norm": 1.3603689969387036, + "language_loss": 0.72440124, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.80117309, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.09802246, + "step": 15930, + "time_per_iteration": 4.076937198638916 + }, + { + "auxiliary_loss_clip": 0.06400728, + "auxiliary_loss_mlp": 0.01262169, + "balance_loss_clip": 0.0627092, + "balance_loss_mlp": 0.01253586, + "epoch": 0.9578235382534195, + "flos": 19288429021440.0, + "grad_norm": 1.7666162202134825, + "language_loss": 0.62475115, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.70138013, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08587646, + "step": 15931, + "time_per_iteration": 2.4878103733062744 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06270614, + "balance_loss_mlp": 0.01256822, + "epoch": 0.9578836615060875, + "flos": 13704428753280.0, + "grad_norm": 3.5194186637129548, + "language_loss": 0.69838828, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.77502394, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08453369, + "step": 15932, + "time_per_iteration": 2.5196003913879395 + }, + { + "auxiliary_loss_clip": 0.06407235, + "auxiliary_loss_mlp": 0.01272005, + "balance_loss_clip": 0.06270652, + "balance_loss_mlp": 0.0126152, + "epoch": 0.9579437847587554, + "flos": 17059988885760.0, + "grad_norm": 1.7465631161736164, + "language_loss": 0.75582886, + "learning_rate": 1.849920999338961e-08, + "loss": 0.83262122, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.1048584, + "step": 15933, + "time_per_iteration": 2.5064492225646973 + }, + { + "auxiliary_loss_clip": 0.06308126, + "auxiliary_loss_mlp": 0.01248499, + "balance_loss_clip": 0.06254178, + "balance_loss_mlp": 0.01247536, + "epoch": 0.9580039080114234, + "flos": 60587875854720.0, + "grad_norm": 0.7159109651995939, + "language_loss": 0.57357532, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.64914161, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00962067, + "step": 15934, + "time_per_iteration": 3.2443442344665527 + }, + { + "auxiliary_loss_clip": 0.06307364, + "auxiliary_loss_mlp": 0.01247753, + "balance_loss_clip": 0.06253395, + "balance_loss_mlp": 0.01246772, + "epoch": 0.9580640312640913, + "flos": 66254837264640.0, + "grad_norm": 0.9651737078828977, + "language_loss": 0.65949249, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.7350437, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00979614, + "step": 15935, + "time_per_iteration": 3.065608501434326 + }, + { + "auxiliary_loss_clip": 0.0630898, + "auxiliary_loss_mlp": 0.01251402, + "balance_loss_clip": 0.06254997, + "balance_loss_mlp": 0.01250436, + "epoch": 0.9581241545167594, + "flos": 62236145520000.0, + "grad_norm": 0.7631981636188135, + "language_loss": 0.56839162, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.64399546, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00964355, + "step": 15936, + "time_per_iteration": 3.1163625717163086 + }, + { + "auxiliary_loss_clip": 0.06401271, + "auxiliary_loss_mlp": 0.01267128, + "balance_loss_clip": 0.06268574, + "balance_loss_mlp": 0.01257687, + "epoch": 0.9581842777694273, + "flos": 23774714876160.0, + "grad_norm": 1.5239589067044021, + "language_loss": 0.78735429, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.86403823, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09448242, + "step": 15937, + "time_per_iteration": 4.0462646484375 + }, + { + "auxiliary_loss_clip": 0.06402034, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06271479, + "balance_loss_mlp": 0.01254339, + "epoch": 0.9582444010220953, + "flos": 21219111273600.0, + "grad_norm": 1.677321670215532, + "language_loss": 0.68562138, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.76228583, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.10064697, + "step": 15938, + "time_per_iteration": 2.497121572494507 + }, + { + "auxiliary_loss_clip": 0.06402011, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 0.06271237, + "balance_loss_mlp": 0.01254625, + "epoch": 0.9583045242747633, + "flos": 23811876961920.0, + "grad_norm": 2.306411620688474, + "language_loss": 0.66241562, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.73907411, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09210205, + "step": 15939, + "time_per_iteration": 2.526710033416748 + }, + { + "auxiliary_loss_clip": 0.06401028, + "auxiliary_loss_mlp": 0.01261972, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01253108, + "epoch": 0.9583646475274312, + "flos": 24137908398720.0, + "grad_norm": 1.9566475767780982, + "language_loss": 0.73915648, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.81578648, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08868408, + "step": 15940, + "time_per_iteration": 2.561065673828125 + }, + { + "auxiliary_loss_clip": 0.06404864, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 0.06271879, + "balance_loss_mlp": 0.0125512, + "epoch": 0.9584247707800992, + "flos": 20892954055680.0, + "grad_norm": 1.7935762593019313, + "language_loss": 0.73054647, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.80724633, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09997559, + "step": 15941, + "time_per_iteration": 2.5311267375946045 + }, + { + "auxiliary_loss_clip": 0.06403222, + "auxiliary_loss_mlp": 0.0126599, + "balance_loss_clip": 0.06272411, + "balance_loss_mlp": 0.01256561, + "epoch": 0.9584848940327672, + "flos": 26074753925760.0, + "grad_norm": 2.6796518959373086, + "language_loss": 0.7163468, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.79303896, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09429932, + "step": 15942, + "time_per_iteration": 2.5477967262268066 + }, + { + "auxiliary_loss_clip": 0.06404561, + "auxiliary_loss_mlp": 0.0126497, + "balance_loss_clip": 0.06271345, + "balance_loss_mlp": 0.01254951, + "epoch": 0.9585450172854352, + "flos": 34501088868480.0, + "grad_norm": 1.640983954823699, + "language_loss": 0.72097212, + "learning_rate": 1.797447974521571e-08, + "loss": 0.79766738, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10021973, + "step": 15943, + "time_per_iteration": 2.6213395595550537 + }, + { + "auxiliary_loss_clip": 0.06406368, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06272337, + "balance_loss_mlp": 0.01256744, + "epoch": 0.9586051405381031, + "flos": 23117159560320.0, + "grad_norm": 2.4382664366899873, + "language_loss": 0.68584573, + "learning_rate": 1.792242006001965e-08, + "loss": 0.76257586, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09906006, + "step": 15944, + "time_per_iteration": 2.527688503265381 + }, + { + "auxiliary_loss_clip": 0.06400511, + "auxiliary_loss_mlp": 0.01265871, + "balance_loss_clip": 0.06268411, + "balance_loss_mlp": 0.01255964, + "epoch": 0.9586652637907711, + "flos": 19609135724160.0, + "grad_norm": 1.9938870353448976, + "language_loss": 0.66536617, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.74202991, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09912109, + "step": 15945, + "time_per_iteration": 3.923600673675537 + }, + { + "auxiliary_loss_clip": 0.06310786, + "auxiliary_loss_mlp": 0.01252735, + "balance_loss_clip": 0.06256623, + "balance_loss_mlp": 0.01251638, + "epoch": 0.958725387043439, + "flos": 72093815107200.0, + "grad_norm": 0.7394875290848417, + "language_loss": 0.61828369, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.69391894, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01098633, + "step": 15946, + "time_per_iteration": 3.237788438796997 + }, + { + "auxiliary_loss_clip": 0.06400455, + "auxiliary_loss_mlp": 0.01263062, + "balance_loss_clip": 0.06272161, + "balance_loss_mlp": 0.01254151, + "epoch": 0.958785510296107, + "flos": 28919310733440.0, + "grad_norm": 1.520574817813325, + "language_loss": 0.75433493, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.83097005, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.08917236, + "step": 15947, + "time_per_iteration": 2.5902247428894043 + }, + { + "auxiliary_loss_clip": 0.06399107, + "auxiliary_loss_mlp": 0.0126601, + "balance_loss_clip": 0.06268102, + "balance_loss_mlp": 0.01256682, + "epoch": 0.958845633548775, + "flos": 18482854268160.0, + "grad_norm": 2.0981305077445676, + "language_loss": 0.70112932, + "learning_rate": 1.771493294473747e-08, + "loss": 0.77778053, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09320068, + "step": 15948, + "time_per_iteration": 3.957618236541748 + }, + { + "auxiliary_loss_clip": 0.06398233, + "auxiliary_loss_mlp": 0.01262024, + "balance_loss_clip": 0.06268825, + "balance_loss_mlp": 0.01252362, + "epoch": 0.958905756801443, + "flos": 24213783870720.0, + "grad_norm": 7.304958158083634, + "language_loss": 0.7873342, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.86393678, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09661865, + "step": 15949, + "time_per_iteration": 2.538614511489868 + }, + { + "auxiliary_loss_clip": 0.06403197, + "auxiliary_loss_mlp": 0.01262903, + "balance_loss_clip": 0.0627002, + "balance_loss_mlp": 0.01253629, + "epoch": 0.9589658800541109, + "flos": 25014662795520.0, + "grad_norm": 1.8198938167398784, + "language_loss": 0.69052678, + "learning_rate": 1.761164038992602e-08, + "loss": 0.76718783, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09283447, + "step": 15950, + "time_per_iteration": 2.5288169384002686 + }, + { + "auxiliary_loss_clip": 0.06401816, + "auxiliary_loss_mlp": 0.01268119, + "balance_loss_clip": 0.06269851, + "balance_loss_mlp": 0.01259273, + "epoch": 0.9590260033067789, + "flos": 23521456310400.0, + "grad_norm": 1.6945586951033367, + "language_loss": 0.86529648, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.94199586, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08843994, + "step": 15951, + "time_per_iteration": 2.5392637252807617 + }, + { + "auxiliary_loss_clip": 0.06405854, + "auxiliary_loss_mlp": 0.01264128, + "balance_loss_clip": 0.06270808, + "balance_loss_mlp": 0.01253703, + "epoch": 0.9590861265594469, + "flos": 25527427056000.0, + "grad_norm": 4.282815391208873, + "language_loss": 0.8056556, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.88235545, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10412598, + "step": 15952, + "time_per_iteration": 2.5456416606903076 + }, + { + "auxiliary_loss_clip": 0.06403787, + "auxiliary_loss_mlp": 0.01266317, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01256077, + "epoch": 0.9591462498121148, + "flos": 21185806475520.0, + "grad_norm": 1.9010894377049286, + "language_loss": 0.6990664, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.77576745, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10235596, + "step": 15953, + "time_per_iteration": 2.5646610260009766 + }, + { + "auxiliary_loss_clip": 0.06404779, + "auxiliary_loss_mlp": 0.01267328, + "balance_loss_clip": 0.06272354, + "balance_loss_mlp": 0.01257177, + "epoch": 0.9592063730647828, + "flos": 21729024495360.0, + "grad_norm": 3.7840506918954557, + "language_loss": 0.58236861, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.65908968, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.10150146, + "step": 15954, + "time_per_iteration": 2.501776933670044 + }, + { + "auxiliary_loss_clip": 0.06404117, + "auxiliary_loss_mlp": 0.01263181, + "balance_loss_clip": 0.06270336, + "balance_loss_mlp": 0.01253537, + "epoch": 0.9592664963174508, + "flos": 29897992022400.0, + "grad_norm": 2.9078911705966095, + "language_loss": 0.74191898, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.81859195, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09649658, + "step": 15955, + "time_per_iteration": 2.5696985721588135 + }, + { + "auxiliary_loss_clip": 0.0640043, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06268075, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9593266195701188, + "flos": 18004652616960.0, + "grad_norm": 1.8259803400807233, + "language_loss": 0.62581319, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.70248532, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10235596, + "step": 15956, + "time_per_iteration": 2.4732725620269775 + }, + { + "auxiliary_loss_clip": 0.06403741, + "auxiliary_loss_mlp": 0.01265541, + "balance_loss_clip": 0.06271461, + "balance_loss_mlp": 0.01256088, + "epoch": 0.9593867428227867, + "flos": 18843364460160.0, + "grad_norm": 1.9369477994253566, + "language_loss": 0.60280073, + "learning_rate": 1.725248447997507e-08, + "loss": 0.67949355, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09454346, + "step": 15957, + "time_per_iteration": 2.504669427871704 + }, + { + "auxiliary_loss_clip": 0.0640239, + "auxiliary_loss_mlp": 0.01266865, + "balance_loss_clip": 0.06269728, + "balance_loss_mlp": 0.01255743, + "epoch": 0.9594468660754547, + "flos": 29574266572800.0, + "grad_norm": 2.0245547455705264, + "language_loss": 0.74410594, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.82079852, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.11120605, + "step": 15958, + "time_per_iteration": 2.5677356719970703 + }, + { + "auxiliary_loss_clip": 0.06397437, + "auxiliary_loss_mlp": 0.01265983, + "balance_loss_clip": 0.06268004, + "balance_loss_mlp": 0.01256553, + "epoch": 0.9595069893281226, + "flos": 20709365760000.0, + "grad_norm": 1.539498065951829, + "language_loss": 0.74628884, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.822923, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09417725, + "step": 15959, + "time_per_iteration": 2.536829710006714 + }, + { + "auxiliary_loss_clip": 0.06405512, + "auxiliary_loss_mlp": 0.01265512, + "balance_loss_clip": 0.06271296, + "balance_loss_mlp": 0.01255862, + "epoch": 0.9595671125807906, + "flos": 22459855806720.0, + "grad_norm": 1.9304133607099632, + "language_loss": 0.64810073, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.72481102, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09643555, + "step": 15960, + "time_per_iteration": 2.511936664581299 + }, + { + "auxiliary_loss_clip": 0.06398654, + "auxiliary_loss_mlp": 0.01263314, + "balance_loss_clip": 0.06271854, + "balance_loss_mlp": 0.01254159, + "epoch": 0.9596272358334585, + "flos": 23922063480960.0, + "grad_norm": 1.6378255149464493, + "language_loss": 0.78098899, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.85760868, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09161377, + "step": 15961, + "time_per_iteration": 2.532150983810425 + }, + { + "auxiliary_loss_clip": 0.06397168, + "auxiliary_loss_mlp": 0.01268616, + "balance_loss_clip": 0.06268074, + "balance_loss_mlp": 0.0125961, + "epoch": 0.9596873590861266, + "flos": 17677740712320.0, + "grad_norm": 1.8261694186593203, + "language_loss": 0.76113975, + "learning_rate": 1.699820008484698e-08, + "loss": 0.83779764, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09008789, + "step": 15962, + "time_per_iteration": 2.462209939956665 + }, + { + "auxiliary_loss_clip": 0.06404586, + "auxiliary_loss_mlp": 0.01265561, + "balance_loss_clip": 0.06270142, + "balance_loss_mlp": 0.01255422, + "epoch": 0.9597474823387945, + "flos": 25815038595840.0, + "grad_norm": 2.1500884319333466, + "language_loss": 0.71985179, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.79655325, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10150146, + "step": 15963, + "time_per_iteration": 2.535642385482788 + }, + { + "auxiliary_loss_clip": 0.06392812, + "auxiliary_loss_mlp": 0.01264787, + "balance_loss_clip": 0.06268126, + "balance_loss_mlp": 0.01256192, + "epoch": 0.9598076055914625, + "flos": 23775218000640.0, + "grad_norm": 1.3971515613610286, + "language_loss": 0.74030179, + "learning_rate": 1.689701268270527e-08, + "loss": 0.81687784, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.0859375, + "step": 15964, + "time_per_iteration": 2.52500581741333 + }, + { + "auxiliary_loss_clip": 0.06307586, + "auxiliary_loss_mlp": 0.0124987, + "balance_loss_clip": 0.06253596, + "balance_loss_mlp": 0.01248861, + "epoch": 0.9598677288441305, + "flos": 56531435045760.0, + "grad_norm": 0.8705968118534945, + "language_loss": 0.57773823, + "learning_rate": 1.684653177987161e-08, + "loss": 0.6533128, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01008606, + "step": 15965, + "time_per_iteration": 3.1062443256378174 + }, + { + "auxiliary_loss_clip": 0.06403217, + "auxiliary_loss_mlp": 0.01265006, + "balance_loss_clip": 0.06270359, + "balance_loss_mlp": 0.01255487, + "epoch": 0.9599278520967984, + "flos": 23003241534720.0, + "grad_norm": 1.6069333020666432, + "language_loss": 0.78958309, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.8662653, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09521484, + "step": 15966, + "time_per_iteration": 2.4983363151550293 + }, + { + "auxiliary_loss_clip": 0.06399991, + "auxiliary_loss_mlp": 0.01263589, + "balance_loss_clip": 0.06269903, + "balance_loss_mlp": 0.01254809, + "epoch": 0.9599879753494664, + "flos": 23046399187200.0, + "grad_norm": 1.7301576567619177, + "language_loss": 0.79460174, + "learning_rate": 1.674579558025102e-08, + "loss": 0.87123752, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08782959, + "step": 15967, + "time_per_iteration": 2.5906291007995605 + }, + { + "auxiliary_loss_clip": 0.06405335, + "auxiliary_loss_mlp": 0.01264036, + "balance_loss_clip": 0.06271484, + "balance_loss_mlp": 0.01253546, + "epoch": 0.9600480986021344, + "flos": 16396731492480.0, + "grad_norm": 1.8178242289336397, + "language_loss": 0.80317146, + "learning_rate": 1.669554028728348e-08, + "loss": 0.87986517, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10491943, + "step": 15968, + "time_per_iteration": 2.5032947063446045 + }, + { + "auxiliary_loss_clip": 0.06406718, + "auxiliary_loss_mlp": 0.01266637, + "balance_loss_clip": 0.06270508, + "balance_loss_mlp": 0.01256469, + "epoch": 0.9601082218548024, + "flos": 24282741381120.0, + "grad_norm": 2.288236761604915, + "language_loss": 0.67642689, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.75316042, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10162354, + "step": 15969, + "time_per_iteration": 3.9625113010406494 + }, + { + "auxiliary_loss_clip": 0.06400546, + "auxiliary_loss_mlp": 0.01265001, + "balance_loss_clip": 0.06270244, + "balance_loss_mlp": 0.01255893, + "epoch": 0.9601683451074703, + "flos": 19616137539840.0, + "grad_norm": 2.845353279559271, + "language_loss": 0.79347444, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.87012994, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09112549, + "step": 15970, + "time_per_iteration": 2.5543136596679688 + }, + { + "auxiliary_loss_clip": 0.06397574, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01252437, + "epoch": 0.9602284683601383, + "flos": 26658320486400.0, + "grad_norm": 1.6064611852721693, + "language_loss": 0.77587306, + "learning_rate": 1.654522565861316e-08, + "loss": 0.85247523, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.10198975, + "step": 15971, + "time_per_iteration": 2.5803046226501465 + }, + { + "auxiliary_loss_clip": 0.0640654, + "auxiliary_loss_mlp": 0.0127055, + "balance_loss_clip": 0.06269947, + "balance_loss_mlp": 0.01260501, + "epoch": 0.9602885916128062, + "flos": 15558564700800.0, + "grad_norm": 1.7619680373804267, + "language_loss": 0.67380464, + "learning_rate": 1.64952712054669e-08, + "loss": 0.75057554, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10046387, + "step": 15972, + "time_per_iteration": 2.498838186264038 + }, + { + "auxiliary_loss_clip": 0.06402527, + "auxiliary_loss_mlp": 0.01266197, + "balance_loss_clip": 0.06271423, + "balance_loss_mlp": 0.0125734, + "epoch": 0.9603487148654742, + "flos": 16506918011520.0, + "grad_norm": 2.00764116027108, + "language_loss": 0.76161063, + "learning_rate": 1.644539196701844e-08, + "loss": 0.83829796, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08862305, + "step": 15973, + "time_per_iteration": 2.4790399074554443 + }, + { + "auxiliary_loss_clip": 0.06398208, + "auxiliary_loss_mlp": 0.0126264, + "balance_loss_clip": 0.06269785, + "balance_loss_mlp": 0.01253265, + "epoch": 0.9604088381181421, + "flos": 20850844579200.0, + "grad_norm": 1.5560491123984277, + "language_loss": 0.6949749, + "learning_rate": 1.639558794515983e-08, + "loss": 0.77158332, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09368896, + "step": 15974, + "time_per_iteration": 2.5170116424560547 + }, + { + "auxiliary_loss_clip": 0.06401684, + "auxiliary_loss_mlp": 0.01263757, + "balance_loss_clip": 0.06267555, + "balance_loss_mlp": 0.01254149, + "epoch": 0.9604689613708102, + "flos": 19689287754240.0, + "grad_norm": 1.9711138139103617, + "language_loss": 0.6806975, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.75735193, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.0960083, + "step": 15975, + "time_per_iteration": 2.5701375007629395 + }, + { + "auxiliary_loss_clip": 0.06393464, + "auxiliary_loss_mlp": 0.0126123, + "balance_loss_clip": 0.06268396, + "balance_loss_mlp": 0.01252844, + "epoch": 0.9605290846234781, + "flos": 24104435892480.0, + "grad_norm": 1.8738118251682123, + "language_loss": 0.55862868, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.63517565, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08392334, + "step": 15976, + "time_per_iteration": 4.048995494842529 + }, + { + "auxiliary_loss_clip": 0.06395699, + "auxiliary_loss_mlp": 0.01269742, + "balance_loss_clip": 0.06268542, + "balance_loss_mlp": 0.0126107, + "epoch": 0.9605892078761461, + "flos": 27129394540800.0, + "grad_norm": 1.8917776879450527, + "language_loss": 0.6844517, + "learning_rate": 1.624662719799219e-08, + "loss": 0.76110613, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08679199, + "step": 15977, + "time_per_iteration": 2.530975103378296 + }, + { + "auxiliary_loss_clip": 0.06400748, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01254839, + "epoch": 0.9606493311288141, + "flos": 14142114155520.0, + "grad_norm": 1.6662861951181476, + "language_loss": 0.82018828, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.89683688, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0927124, + "step": 15978, + "time_per_iteration": 2.44873046875 + }, + { + "auxiliary_loss_clip": 0.06404868, + "auxiliary_loss_mlp": 0.01262098, + "balance_loss_clip": 0.06270764, + "balance_loss_mlp": 0.01251995, + "epoch": 0.960709454381482, + "flos": 15818489665920.0, + "grad_norm": 2.0740905644965864, + "language_loss": 0.83917105, + "learning_rate": 1.614769615070921e-08, + "loss": 0.91584074, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10107422, + "step": 15979, + "time_per_iteration": 2.4589617252349854 + }, + { + "auxiliary_loss_clip": 0.06404734, + "auxiliary_loss_mlp": 0.01263469, + "balance_loss_clip": 0.062713, + "balance_loss_mlp": 0.01254731, + "epoch": 0.96076957763415, + "flos": 22572054823680.0, + "grad_norm": 1.4954834953684717, + "language_loss": 0.79959273, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.87627476, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.08734131, + "step": 15980, + "time_per_iteration": 2.511533498764038 + }, + { + "auxiliary_loss_clip": 0.0640362, + "auxiliary_loss_mlp": 0.01263144, + "balance_loss_clip": 0.06268869, + "balance_loss_mlp": 0.01253697, + "epoch": 0.960829700886818, + "flos": 24688212088320.0, + "grad_norm": 1.903020531997726, + "language_loss": 0.68203151, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.75869906, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.09442139, + "step": 15981, + "time_per_iteration": 2.520338535308838 + }, + { + "auxiliary_loss_clip": 0.06401807, + "auxiliary_loss_mlp": 0.01266037, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.01256608, + "epoch": 0.960889824139486, + "flos": 26549517559680.0, + "grad_norm": 1.448278163725355, + "language_loss": 0.70106196, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.77774036, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09429932, + "step": 15982, + "time_per_iteration": 2.5638973712921143 + }, + { + "auxiliary_loss_clip": 0.06308071, + "auxiliary_loss_mlp": 0.01250914, + "balance_loss_clip": 0.06253908, + "balance_loss_mlp": 0.01249987, + "epoch": 0.9609499473921539, + "flos": 71133638371200.0, + "grad_norm": 0.6588987615366447, + "language_loss": 0.53301847, + "learning_rate": 1.595073680563286e-08, + "loss": 0.60860837, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00924683, + "step": 15983, + "time_per_iteration": 3.2202537059783936 + }, + { + "auxiliary_loss_clip": 0.06403141, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06271559, + "balance_loss_mlp": 0.01255255, + "epoch": 0.9610100706448219, + "flos": 20557740597120.0, + "grad_norm": 2.132875740331415, + "language_loss": 0.67696095, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.75363725, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09243774, + "step": 15984, + "time_per_iteration": 2.4828972816467285 + }, + { + "auxiliary_loss_clip": 0.06395225, + "auxiliary_loss_mlp": 0.01264558, + "balance_loss_clip": 0.06270009, + "balance_loss_mlp": 0.01255748, + "epoch": 0.9610701938974898, + "flos": 14069425138560.0, + "grad_norm": 1.4813244917974475, + "language_loss": 0.6780051, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.75460297, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.0880127, + "step": 15985, + "time_per_iteration": 3.950624704360962 + }, + { + "auxiliary_loss_clip": 0.06401645, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 0.06269781, + "balance_loss_mlp": 0.01256938, + "epoch": 0.9611303171501578, + "flos": 20236195353600.0, + "grad_norm": 1.7938469650350048, + "language_loss": 0.7897535, + "learning_rate": 1.580380726142283e-08, + "loss": 0.86643624, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09686279, + "step": 15986, + "time_per_iteration": 2.4934823513031006 + }, + { + "auxiliary_loss_clip": 0.06401192, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06271122, + "balance_loss_mlp": 0.012566, + "epoch": 0.9611904404028258, + "flos": 20955957926400.0, + "grad_norm": 2.0809357131228423, + "language_loss": 0.63982856, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.71650976, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.10333252, + "step": 15987, + "time_per_iteration": 2.4966821670532227 + }, + { + "auxiliary_loss_clip": 0.06399138, + "auxiliary_loss_mlp": 0.01263515, + "balance_loss_clip": 0.06273428, + "balance_loss_mlp": 0.01255147, + "epoch": 0.9612505636554938, + "flos": 24834806006400.0, + "grad_norm": 1.5786304249382652, + "language_loss": 0.67162049, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.74824703, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08361816, + "step": 15988, + "time_per_iteration": 4.050363540649414 + }, + { + "auxiliary_loss_clip": 0.06401965, + "auxiliary_loss_mlp": 0.01266454, + "balance_loss_clip": 0.06269932, + "balance_loss_mlp": 0.01257531, + "epoch": 0.9613106869081617, + "flos": 17170636602240.0, + "grad_norm": 1.9067338568780405, + "language_loss": 0.7483418, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.82502604, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08929443, + "step": 15989, + "time_per_iteration": 2.4900639057159424 + }, + { + "auxiliary_loss_clip": 0.06311363, + "auxiliary_loss_mlp": 0.01250371, + "balance_loss_clip": 0.06257341, + "balance_loss_mlp": 0.01249386, + "epoch": 0.9613708101608297, + "flos": 61582279783680.0, + "grad_norm": 0.7995098975386216, + "language_loss": 0.63284862, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.70846593, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00983429, + "step": 15990, + "time_per_iteration": 3.012808322906494 + }, + { + "auxiliary_loss_clip": 0.06398995, + "auxiliary_loss_mlp": 0.01262542, + "balance_loss_clip": 0.06267406, + "balance_loss_mlp": 0.01253172, + "epoch": 0.9614309334134977, + "flos": 27425349561600.0, + "grad_norm": 1.9733105896619667, + "language_loss": 0.78653449, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.86314988, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09375, + "step": 15991, + "time_per_iteration": 2.575064182281494 + }, + { + "auxiliary_loss_clip": 0.06408799, + "auxiliary_loss_mlp": 0.01267574, + "balance_loss_clip": 0.06271989, + "balance_loss_mlp": 0.01257107, + "epoch": 0.9614910566661656, + "flos": 22825564951680.0, + "grad_norm": 3.560030551697313, + "language_loss": 0.85130018, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.92806387, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10467529, + "step": 15992, + "time_per_iteration": 2.5204951763153076 + }, + { + "auxiliary_loss_clip": 0.06402579, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 0.06270155, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9615511799188337, + "flos": 20674090391040.0, + "grad_norm": 1.9027763344253423, + "language_loss": 0.73045832, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.80715036, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10021973, + "step": 15993, + "time_per_iteration": 2.528385877609253 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01263253, + "balance_loss_clip": 0.06269389, + "balance_loss_mlp": 0.0125368, + "epoch": 0.9616113031715016, + "flos": 33158123953920.0, + "grad_norm": 1.5426026145316933, + "language_loss": 0.68642288, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.76307219, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09564209, + "step": 15994, + "time_per_iteration": 2.6216301918029785 + }, + { + "auxiliary_loss_clip": 0.06400389, + "auxiliary_loss_mlp": 0.01263471, + "balance_loss_clip": 0.06268929, + "balance_loss_mlp": 0.01254566, + "epoch": 0.9616714264241696, + "flos": 25016843001600.0, + "grad_norm": 1.8467550508155814, + "language_loss": 0.84644687, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.92308545, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.08892822, + "step": 15995, + "time_per_iteration": 2.563554525375366 + }, + { + "auxiliary_loss_clip": 0.06411675, + "auxiliary_loss_mlp": 0.01267604, + "balance_loss_clip": 0.06273677, + "balance_loss_mlp": 0.01257221, + "epoch": 0.9617315496768375, + "flos": 13551629633280.0, + "grad_norm": 2.057298976603726, + "language_loss": 0.76097316, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.83776593, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.10394287, + "step": 15996, + "time_per_iteration": 2.478343963623047 + }, + { + "auxiliary_loss_clip": 0.06398165, + "auxiliary_loss_mlp": 0.01265357, + "balance_loss_clip": 0.06267761, + "balance_loss_mlp": 0.01255939, + "epoch": 0.9617916729295055, + "flos": 11259221304960.0, + "grad_norm": 1.8482484197146472, + "language_loss": 0.77136171, + "learning_rate": 1.52708595287494e-08, + "loss": 0.84799695, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09423828, + "step": 15997, + "time_per_iteration": 2.5085597038269043 + }, + { + "auxiliary_loss_clip": 0.06397088, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06270058, + "balance_loss_mlp": 0.01255344, + "epoch": 0.9618517961821734, + "flos": 22826235784320.0, + "grad_norm": 1.5938896462134406, + "language_loss": 0.67285407, + "learning_rate": 1.522286126505001e-08, + "loss": 0.7494663, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08789062, + "step": 15998, + "time_per_iteration": 2.5118253231048584 + }, + { + "auxiliary_loss_clip": 0.06399897, + "auxiliary_loss_mlp": 0.01264603, + "balance_loss_clip": 0.06270373, + "balance_loss_mlp": 0.01255072, + "epoch": 0.9619119194348414, + "flos": 16622848535040.0, + "grad_norm": 1.496371845917081, + "language_loss": 0.72930491, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.80594993, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09527588, + "step": 15999, + "time_per_iteration": 2.506864547729492 + }, + { + "auxiliary_loss_clip": 0.06395978, + "auxiliary_loss_mlp": 0.01265996, + "balance_loss_clip": 0.06269437, + "balance_loss_mlp": 0.01257574, + "epoch": 0.9619720426875094, + "flos": 24542037440640.0, + "grad_norm": 1.6415628522989876, + "language_loss": 0.65517807, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.73179787, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08422852, + "step": 16000, + "time_per_iteration": 2.5260074138641357 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01263738, + "balance_loss_clip": 0.06270752, + "balance_loss_mlp": 0.01253295, + "epoch": 0.9620321659401774, + "flos": 20638647313920.0, + "grad_norm": 3.855036180657502, + "language_loss": 0.75523168, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.83189702, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.10430908, + "step": 16001, + "time_per_iteration": 2.5201330184936523 + }, + { + "auxiliary_loss_clip": 0.06402686, + "auxiliary_loss_mlp": 0.01266273, + "balance_loss_clip": 0.06271547, + "balance_loss_mlp": 0.01256653, + "epoch": 0.9620922891928453, + "flos": 18521232238080.0, + "grad_norm": 1.6547442520201165, + "language_loss": 0.68397254, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.76066214, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09619141, + "step": 16002, + "time_per_iteration": 2.483081817626953 + }, + { + "auxiliary_loss_clip": 0.06401039, + "auxiliary_loss_mlp": 0.01265908, + "balance_loss_clip": 0.06271882, + "balance_loss_mlp": 0.01256532, + "epoch": 0.9621524124455133, + "flos": 28774980875520.0, + "grad_norm": 1.195438695245258, + "language_loss": 0.64683259, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.72350204, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09375, + "step": 16003, + "time_per_iteration": 2.586229085922241 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.0126463, + "balance_loss_clip": 0.06271525, + "balance_loss_mlp": 0.01255141, + "epoch": 0.9622125356981813, + "flos": 19104882652800.0, + "grad_norm": 1.7948469305878696, + "language_loss": 0.7638011, + "learning_rate": 1.493645226826512e-08, + "loss": 0.8404758, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09490967, + "step": 16004, + "time_per_iteration": 2.493025541305542 + }, + { + "auxiliary_loss_clip": 0.06399065, + "auxiliary_loss_mlp": 0.01264795, + "balance_loss_clip": 0.06270385, + "balance_loss_mlp": 0.01255223, + "epoch": 0.9622726589508492, + "flos": 20309010151680.0, + "grad_norm": 1.9981031350559504, + "language_loss": 0.79513681, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.87177539, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09570312, + "step": 16005, + "time_per_iteration": 2.5306947231292725 + }, + { + "auxiliary_loss_clip": 0.06400214, + "auxiliary_loss_mlp": 0.01262513, + "balance_loss_clip": 0.06271853, + "balance_loss_mlp": 0.01253435, + "epoch": 0.9623327822035173, + "flos": 54942060401280.0, + "grad_norm": 1.8020406678297956, + "language_loss": 0.68003178, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.75665909, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09075928, + "step": 16006, + "time_per_iteration": 2.816959857940674 + }, + { + "auxiliary_loss_clip": 0.06397587, + "auxiliary_loss_mlp": 0.012623, + "balance_loss_clip": 0.06273156, + "balance_loss_mlp": 0.01253276, + "epoch": 0.9623929054561852, + "flos": 21764928769920.0, + "grad_norm": 1.5013515092363827, + "language_loss": 0.78550291, + "learning_rate": 1.479426394188521e-08, + "loss": 0.86210179, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.09020996, + "step": 16007, + "time_per_iteration": 2.5247249603271484 + }, + { + "auxiliary_loss_clip": 0.0640254, + "auxiliary_loss_mlp": 0.01264076, + "balance_loss_clip": 0.06270196, + "balance_loss_mlp": 0.01254373, + "epoch": 0.9624530287088532, + "flos": 17937414115200.0, + "grad_norm": 1.816767417350666, + "language_loss": 0.67981744, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.75648361, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09698486, + "step": 16008, + "time_per_iteration": 2.4601643085479736 + }, + { + "auxiliary_loss_clip": 0.06404279, + "auxiliary_loss_mlp": 0.01265584, + "balance_loss_clip": 0.06271291, + "balance_loss_mlp": 0.01255248, + "epoch": 0.9625131519615211, + "flos": 23259686555520.0, + "grad_norm": 2.1142432172822456, + "language_loss": 0.73074311, + "learning_rate": 1.469984811730529e-08, + "loss": 0.80744171, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10345459, + "step": 16009, + "time_per_iteration": 3.9339191913604736 + }, + { + "auxiliary_loss_clip": 0.06400783, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06271462, + "balance_loss_mlp": 0.01254382, + "epoch": 0.9625732752141891, + "flos": 18922636022400.0, + "grad_norm": 2.192710915297057, + "language_loss": 0.7549693, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.83160961, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08856201, + "step": 16010, + "time_per_iteration": 2.5013561248779297 + }, + { + "auxiliary_loss_clip": 0.06405942, + "auxiliary_loss_mlp": 0.0126574, + "balance_loss_clip": 0.0627039, + "balance_loss_mlp": 0.01254319, + "epoch": 0.962633398466857, + "flos": 16258439128320.0, + "grad_norm": 1.712569944229846, + "language_loss": 0.69567752, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.77239436, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.11419678, + "step": 16011, + "time_per_iteration": 2.4701602458953857 + }, + { + "auxiliary_loss_clip": 0.06400087, + "auxiliary_loss_mlp": 0.01263956, + "balance_loss_clip": 0.06270588, + "balance_loss_mlp": 0.01255069, + "epoch": 0.962693521719525, + "flos": 54209174664960.0, + "grad_norm": 1.8665711044506685, + "language_loss": 0.68777549, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.76441598, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08886719, + "step": 16012, + "time_per_iteration": 2.7930734157562256 + }, + { + "auxiliary_loss_clip": 0.06411394, + "auxiliary_loss_mlp": 0.01265092, + "balance_loss_clip": 0.06273941, + "balance_loss_mlp": 0.01254423, + "epoch": 0.962753644972193, + "flos": 33113540782080.0, + "grad_norm": 2.007287931479522, + "language_loss": 0.72470278, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.8014676, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10668945, + "step": 16013, + "time_per_iteration": 2.607010841369629 + }, + { + "auxiliary_loss_clip": 0.06398678, + "auxiliary_loss_mlp": 0.01263775, + "balance_loss_clip": 0.06269355, + "balance_loss_mlp": 0.01253821, + "epoch": 0.962813768224861, + "flos": 42240504839040.0, + "grad_norm": 2.1001634109531433, + "language_loss": 0.63370138, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.71032596, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09954834, + "step": 16014, + "time_per_iteration": 2.7006850242614746 + }, + { + "auxiliary_loss_clip": 0.06395663, + "auxiliary_loss_mlp": 0.01264971, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01256478, + "epoch": 0.9628738914775289, + "flos": 43954671340800.0, + "grad_norm": 1.4423438502368708, + "language_loss": 0.72028565, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.79689205, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08483887, + "step": 16015, + "time_per_iteration": 2.7649402618408203 + }, + { + "auxiliary_loss_clip": 0.06401665, + "auxiliary_loss_mlp": 0.01265296, + "balance_loss_clip": 0.06270321, + "balance_loss_mlp": 0.01255414, + "epoch": 0.9629340147301969, + "flos": 15601596572160.0, + "grad_norm": 1.9682425360643256, + "language_loss": 0.77071536, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.84738493, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09881592, + "step": 16016, + "time_per_iteration": 4.025376796722412 + }, + { + "auxiliary_loss_clip": 0.06310678, + "auxiliary_loss_mlp": 0.01250101, + "balance_loss_clip": 0.06256417, + "balance_loss_mlp": 0.01249044, + "epoch": 0.9629941379828649, + "flos": 62969827870080.0, + "grad_norm": 0.8541107533621018, + "language_loss": 0.63163006, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.70723784, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01058197, + "step": 16017, + "time_per_iteration": 3.1209259033203125 + }, + { + "auxiliary_loss_clip": 0.06404077, + "auxiliary_loss_mlp": 0.01264759, + "balance_loss_clip": 0.0627223, + "balance_loss_mlp": 0.01255103, + "epoch": 0.9630542612355328, + "flos": 29907006336000.0, + "grad_norm": 1.7708678376407427, + "language_loss": 0.67122102, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.74790937, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09649658, + "step": 16018, + "time_per_iteration": 2.6136341094970703 + }, + { + "auxiliary_loss_clip": 0.06403263, + "auxiliary_loss_mlp": 0.01262583, + "balance_loss_clip": 0.06271482, + "balance_loss_mlp": 0.01253011, + "epoch": 0.9631143844882009, + "flos": 17900335883520.0, + "grad_norm": 1.6914005371501741, + "language_loss": 0.79650891, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.8731674, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.0958252, + "step": 16019, + "time_per_iteration": 2.6144886016845703 + }, + { + "auxiliary_loss_clip": 0.06397004, + "auxiliary_loss_mlp": 0.01263057, + "balance_loss_clip": 0.06269085, + "balance_loss_mlp": 0.01254641, + "epoch": 0.9631745077408688, + "flos": 26146101277440.0, + "grad_norm": 1.351412513525788, + "language_loss": 0.71868813, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.79528868, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08410645, + "step": 16020, + "time_per_iteration": 2.537116765975952 + }, + { + "auxiliary_loss_clip": 0.06400692, + "auxiliary_loss_mlp": 0.01266716, + "balance_loss_clip": 0.06269675, + "balance_loss_mlp": 0.01257555, + "epoch": 0.9632346309935368, + "flos": 24980729091840.0, + "grad_norm": 1.6112903009597273, + "language_loss": 0.76956975, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.84624374, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09161377, + "step": 16021, + "time_per_iteration": 2.5795507431030273 + }, + { + "auxiliary_loss_clip": 0.06410046, + "auxiliary_loss_mlp": 0.0126309, + "balance_loss_clip": 0.06272537, + "balance_loss_mlp": 0.01253065, + "epoch": 0.9632947542462047, + "flos": 23623005859200.0, + "grad_norm": 2.052482591151295, + "language_loss": 0.65333438, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.73006582, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.10028076, + "step": 16022, + "time_per_iteration": 2.4925694465637207 + }, + { + "auxiliary_loss_clip": 0.06400712, + "auxiliary_loss_mlp": 0.0126106, + "balance_loss_clip": 0.06271265, + "balance_loss_mlp": 0.01251887, + "epoch": 0.9633548774988727, + "flos": 26402755933440.0, + "grad_norm": 2.041932123027993, + "language_loss": 0.73429894, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.81091666, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09179688, + "step": 16023, + "time_per_iteration": 2.6148433685302734 + }, + { + "auxiliary_loss_clip": 0.06398109, + "auxiliary_loss_mlp": 0.012652, + "balance_loss_clip": 0.06269968, + "balance_loss_mlp": 0.01256456, + "epoch": 0.9634150007515406, + "flos": 23774295605760.0, + "grad_norm": 1.3771901625449594, + "language_loss": 0.8138119, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.89044499, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08746338, + "step": 16024, + "time_per_iteration": 2.521254539489746 + }, + { + "auxiliary_loss_clip": 0.06408462, + "auxiliary_loss_mlp": 0.01262523, + "balance_loss_clip": 0.06271751, + "balance_loss_mlp": 0.01252671, + "epoch": 0.9634751240042086, + "flos": 24142436519040.0, + "grad_norm": 1.3519204413028436, + "language_loss": 0.81720084, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.89391065, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.09857178, + "step": 16025, + "time_per_iteration": 4.056759595870972 + }, + { + "auxiliary_loss_clip": 0.06406177, + "auxiliary_loss_mlp": 0.0126363, + "balance_loss_clip": 0.06270571, + "balance_loss_mlp": 0.01254421, + "epoch": 0.9635352472568766, + "flos": 24355346544000.0, + "grad_norm": 1.6633226224806905, + "language_loss": 0.76957327, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.8462714, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09210205, + "step": 16026, + "time_per_iteration": 2.5177974700927734 + }, + { + "auxiliary_loss_clip": 0.0640067, + "auxiliary_loss_mlp": 0.01270768, + "balance_loss_clip": 0.06268805, + "balance_loss_mlp": 0.01260993, + "epoch": 0.9635953705095446, + "flos": 23991062918400.0, + "grad_norm": 1.6668938865230072, + "language_loss": 0.6339668, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.7106812, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09777832, + "step": 16027, + "time_per_iteration": 2.5560450553894043 + }, + { + "auxiliary_loss_clip": 0.06407472, + "auxiliary_loss_mlp": 0.01267154, + "balance_loss_clip": 0.06273127, + "balance_loss_mlp": 0.01257462, + "epoch": 0.9636554937622125, + "flos": 19834540007040.0, + "grad_norm": 1.6733275013477416, + "language_loss": 0.87025476, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.94700098, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09692383, + "step": 16028, + "time_per_iteration": 3.9252398014068604 + }, + { + "auxiliary_loss_clip": 0.0631086, + "auxiliary_loss_mlp": 0.01252273, + "balance_loss_clip": 0.06256698, + "balance_loss_mlp": 0.01251267, + "epoch": 0.9637156170148805, + "flos": 67454520497280.0, + "grad_norm": 0.6687418840467081, + "language_loss": 0.53127611, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.60690737, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01006317, + "step": 16029, + "time_per_iteration": 3.0885190963745117 + }, + { + "auxiliary_loss_clip": 0.06406175, + "auxiliary_loss_mlp": 0.01263355, + "balance_loss_clip": 0.06271643, + "balance_loss_mlp": 0.01253562, + "epoch": 0.9637757402675484, + "flos": 20306788018560.0, + "grad_norm": 1.5288285449125392, + "language_loss": 0.74157113, + "learning_rate": 1.372666546129797e-08, + "loss": 0.81826651, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.09796143, + "step": 16030, + "time_per_iteration": 2.5154569149017334 + }, + { + "auxiliary_loss_clip": 0.0639952, + "auxiliary_loss_mlp": 0.0126644, + "balance_loss_clip": 0.06270611, + "balance_loss_mlp": 0.01257249, + "epoch": 0.9638358635202164, + "flos": 27241803192960.0, + "grad_norm": 1.8304305412759827, + "language_loss": 0.65878218, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.7354418, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09191895, + "step": 16031, + "time_per_iteration": 2.5666158199310303 + }, + { + "auxiliary_loss_clip": 0.06310733, + "auxiliary_loss_mlp": 0.01250007, + "balance_loss_clip": 0.06256757, + "balance_loss_mlp": 0.0124902, + "epoch": 0.9638959867728845, + "flos": 70309768700160.0, + "grad_norm": 0.855502378370066, + "language_loss": 0.60727084, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.68287826, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00986481, + "step": 16032, + "time_per_iteration": 3.1735146045684814 + }, + { + "auxiliary_loss_clip": 0.06392821, + "auxiliary_loss_mlp": 0.01264604, + "balance_loss_clip": 0.06269621, + "balance_loss_mlp": 0.01256319, + "epoch": 0.9639561100255524, + "flos": 25414012154880.0, + "grad_norm": 1.8239636455461146, + "language_loss": 0.66663599, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.74321026, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.08282471, + "step": 16033, + "time_per_iteration": 2.56622314453125 + }, + { + "auxiliary_loss_clip": 0.06402284, + "auxiliary_loss_mlp": 0.01261476, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01252363, + "epoch": 0.9640162332782204, + "flos": 18119744599680.0, + "grad_norm": 1.7952029192998942, + "language_loss": 0.65676892, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.73340648, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09112549, + "step": 16034, + "time_per_iteration": 2.4902241230010986 + }, + { + "auxiliary_loss_clip": 0.06397773, + "auxiliary_loss_mlp": 0.01263891, + "balance_loss_clip": 0.06269251, + "balance_loss_mlp": 0.01254539, + "epoch": 0.9640763565308883, + "flos": 23446964430720.0, + "grad_norm": 2.2171721620826665, + "language_loss": 0.74419838, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.82081503, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09356689, + "step": 16035, + "time_per_iteration": 2.5261058807373047 + }, + { + "auxiliary_loss_clip": 0.06402203, + "auxiliary_loss_mlp": 0.01260621, + "balance_loss_clip": 0.06272241, + "balance_loss_mlp": 0.01252038, + "epoch": 0.9641364797835563, + "flos": 22425964030080.0, + "grad_norm": 1.85699593571715, + "language_loss": 0.82645416, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.90308243, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08575439, + "step": 16036, + "time_per_iteration": 2.5058321952819824 + }, + { + "auxiliary_loss_clip": 0.06399503, + "auxiliary_loss_mlp": 0.01262795, + "balance_loss_clip": 0.0626888, + "balance_loss_mlp": 0.01253478, + "epoch": 0.9641966030362242, + "flos": 30629284531200.0, + "grad_norm": 1.8406886485490508, + "language_loss": 0.70395046, + "learning_rate": 1.340965177371789e-08, + "loss": 0.78057343, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09320068, + "step": 16037, + "time_per_iteration": 2.5934836864471436 + }, + { + "auxiliary_loss_clip": 0.06400578, + "auxiliary_loss_mlp": 0.01265146, + "balance_loss_clip": 0.06268116, + "balance_loss_mlp": 0.0125602, + "epoch": 0.9642567262888923, + "flos": 20958347767680.0, + "grad_norm": 2.2949598508589024, + "language_loss": 0.63063121, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.7072885, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09124756, + "step": 16038, + "time_per_iteration": 2.528991460800171 + }, + { + "auxiliary_loss_clip": 0.06402931, + "auxiliary_loss_mlp": 0.01266559, + "balance_loss_clip": 0.06269977, + "balance_loss_mlp": 0.01256867, + "epoch": 0.9643168495415602, + "flos": 22646253214080.0, + "grad_norm": 2.710323469198111, + "language_loss": 0.70936692, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.78606176, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09692383, + "step": 16039, + "time_per_iteration": 2.559022903442383 + }, + { + "auxiliary_loss_clip": 0.06404687, + "auxiliary_loss_mlp": 0.0126847, + "balance_loss_clip": 0.06272136, + "balance_loss_mlp": 0.01258567, + "epoch": 0.9643769727942282, + "flos": 20272435044480.0, + "grad_norm": 1.9949930425544389, + "language_loss": 0.73979366, + "learning_rate": 1.327491870605657e-08, + "loss": 0.81652522, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09902954, + "step": 16040, + "time_per_iteration": 2.500765562057495 + }, + { + "auxiliary_loss_clip": 0.06403273, + "auxiliary_loss_mlp": 0.01263933, + "balance_loss_clip": 0.06270061, + "balance_loss_mlp": 0.01254777, + "epoch": 0.9644370960468961, + "flos": 13887052727040.0, + "grad_norm": 1.8870655198248234, + "language_loss": 0.72925007, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.80592215, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09155273, + "step": 16041, + "time_per_iteration": 2.4811394214630127 + }, + { + "auxiliary_loss_clip": 0.06392974, + "auxiliary_loss_mlp": 0.01262963, + "balance_loss_clip": 0.06269765, + "balance_loss_mlp": 0.01254207, + "epoch": 0.9644972192995641, + "flos": 17243912597760.0, + "grad_norm": 1.6302297136942336, + "language_loss": 0.72166139, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.79822075, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.08764648, + "step": 16042, + "time_per_iteration": 2.464141368865967 + }, + { + "auxiliary_loss_clip": 0.06405792, + "auxiliary_loss_mlp": 0.01262034, + "balance_loss_clip": 0.06271081, + "balance_loss_mlp": 0.01253404, + "epoch": 0.964557342552232, + "flos": 23846858841600.0, + "grad_norm": 1.7036888779753476, + "language_loss": 0.81625164, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.89292991, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.08636475, + "step": 16043, + "time_per_iteration": 2.5336403846740723 + }, + { + "auxiliary_loss_clip": 0.0640493, + "auxiliary_loss_mlp": 0.012649, + "balance_loss_clip": 0.06274771, + "balance_loss_mlp": 0.0125607, + "epoch": 0.9646174658049, + "flos": 21659605787520.0, + "grad_norm": 1.4250533671062502, + "language_loss": 0.71966612, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.79636443, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08831787, + "step": 16044, + "time_per_iteration": 2.4873435497283936 + }, + { + "auxiliary_loss_clip": 0.06398635, + "auxiliary_loss_mlp": 0.01260999, + "balance_loss_clip": 0.06268857, + "balance_loss_mlp": 0.01251611, + "epoch": 0.9646775890575681, + "flos": 17135403160320.0, + "grad_norm": 1.6962750102757636, + "language_loss": 0.70311677, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.77971309, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09387207, + "step": 16045, + "time_per_iteration": 2.5005507469177246 + }, + { + "auxiliary_loss_clip": 0.06401645, + "auxiliary_loss_mlp": 0.01264346, + "balance_loss_clip": 0.06270438, + "balance_loss_mlp": 0.01255405, + "epoch": 0.964737712310236, + "flos": 13010717600640.0, + "grad_norm": 1.7958263286958636, + "language_loss": 0.75115418, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.82781404, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.0894165, + "step": 16046, + "time_per_iteration": 2.4769492149353027 + }, + { + "auxiliary_loss_clip": 0.06402702, + "auxiliary_loss_mlp": 0.0126378, + "balance_loss_clip": 0.06269902, + "balance_loss_mlp": 0.0125391, + "epoch": 0.964797835562904, + "flos": 24286011690240.0, + "grad_norm": 1.5485557136808419, + "language_loss": 0.62918746, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.70585227, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09863281, + "step": 16047, + "time_per_iteration": 2.5291333198547363 + }, + { + "auxiliary_loss_clip": 0.06404603, + "auxiliary_loss_mlp": 0.01265766, + "balance_loss_clip": 0.06273589, + "balance_loss_mlp": 0.01256468, + "epoch": 0.9648579588155719, + "flos": 20529089700480.0, + "grad_norm": 1.7369231208534281, + "language_loss": 0.69178629, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.76849002, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09295654, + "step": 16048, + "time_per_iteration": 3.8918800354003906 + }, + { + "auxiliary_loss_clip": 0.06404486, + "auxiliary_loss_mlp": 0.01267225, + "balance_loss_clip": 0.0627009, + "balance_loss_mlp": 0.01257027, + "epoch": 0.9649180820682399, + "flos": 32162042943360.0, + "grad_norm": 1.8321763154478243, + "language_loss": 0.63903487, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.71575201, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10186768, + "step": 16049, + "time_per_iteration": 2.570418119430542 + }, + { + "auxiliary_loss_clip": 0.06402539, + "auxiliary_loss_mlp": 0.01261874, + "balance_loss_clip": 0.06269829, + "balance_loss_mlp": 0.01252886, + "epoch": 0.9649782053209078, + "flos": 20528963919360.0, + "grad_norm": 1.7027576373675015, + "language_loss": 0.71291816, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.78956234, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08990479, + "step": 16050, + "time_per_iteration": 2.5361132621765137 + }, + { + "auxiliary_loss_clip": 0.06408322, + "auxiliary_loss_mlp": 0.01263097, + "balance_loss_clip": 0.06272562, + "balance_loss_mlp": 0.01252988, + "epoch": 0.9650383285735759, + "flos": 43077623454720.0, + "grad_norm": 1.7842069676990906, + "language_loss": 0.70066154, + "learning_rate": 1.278669873970606e-08, + "loss": 0.77737582, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.10107422, + "step": 16051, + "time_per_iteration": 2.677975654602051 + }, + { + "auxiliary_loss_clip": 0.06307529, + "auxiliary_loss_mlp": 0.01252916, + "balance_loss_clip": 0.06253548, + "balance_loss_mlp": 0.01251916, + "epoch": 0.9650984518262438, + "flos": 61767083963520.0, + "grad_norm": 0.8182337392431096, + "language_loss": 0.59232974, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.66793418, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00999451, + "step": 16052, + "time_per_iteration": 3.1726770401000977 + }, + { + "auxiliary_loss_clip": 0.063959, + "auxiliary_loss_mlp": 0.01262271, + "balance_loss_clip": 0.06268722, + "balance_loss_mlp": 0.01253068, + "epoch": 0.9651585750789118, + "flos": 29797155233280.0, + "grad_norm": 1.566863639244542, + "language_loss": 0.74622291, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.82280469, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09204102, + "step": 16053, + "time_per_iteration": 2.576726198196411 + }, + { + "auxiliary_loss_clip": 0.06400575, + "auxiliary_loss_mlp": 0.01262414, + "balance_loss_clip": 0.06268197, + "balance_loss_mlp": 0.01252883, + "epoch": 0.9652186983315797, + "flos": 16878664650240.0, + "grad_norm": 1.8893492919268848, + "language_loss": 0.68987983, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.76650977, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09527588, + "step": 16054, + "time_per_iteration": 2.479691505432129 + }, + { + "auxiliary_loss_clip": 0.064014, + "auxiliary_loss_mlp": 0.01263325, + "balance_loss_clip": 0.06271985, + "balance_loss_mlp": 0.01254968, + "epoch": 0.9652788215842477, + "flos": 31657831799040.0, + "grad_norm": 1.4060273362324986, + "language_loss": 0.62068862, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.69733584, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08361816, + "step": 16055, + "time_per_iteration": 4.018486499786377 + }, + { + "auxiliary_loss_clip": 0.06399205, + "auxiliary_loss_mlp": 0.01264446, + "balance_loss_clip": 0.06270594, + "balance_loss_mlp": 0.01255433, + "epoch": 0.9653389448369156, + "flos": 24761236521600.0, + "grad_norm": 1.998915754260937, + "language_loss": 0.76546788, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.84210438, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09008789, + "step": 16056, + "time_per_iteration": 2.551823854446411 + }, + { + "auxiliary_loss_clip": 0.06400546, + "auxiliary_loss_mlp": 0.01267414, + "balance_loss_clip": 0.06270006, + "balance_loss_mlp": 0.01258729, + "epoch": 0.9653990680895836, + "flos": 20302511460480.0, + "grad_norm": 1.692625022004946, + "language_loss": 0.72081912, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.79749864, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08679199, + "step": 16057, + "time_per_iteration": 2.5352344512939453 + }, + { + "auxiliary_loss_clip": 0.06398775, + "auxiliary_loss_mlp": 0.01263303, + "balance_loss_clip": 0.06269361, + "balance_loss_mlp": 0.01254189, + "epoch": 0.9654591913422517, + "flos": 22535395862400.0, + "grad_norm": 1.7376745718681348, + "language_loss": 0.71854722, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.79516792, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09118652, + "step": 16058, + "time_per_iteration": 2.5178182125091553 + }, + { + "auxiliary_loss_clip": 0.06398124, + "auxiliary_loss_mlp": 0.01263491, + "balance_loss_clip": 0.06270248, + "balance_loss_mlp": 0.0125408, + "epoch": 0.9655193145949196, + "flos": 26770645284480.0, + "grad_norm": 1.546516279721211, + "language_loss": 0.74440265, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.82101882, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09405518, + "step": 16059, + "time_per_iteration": 2.562965154647827 + }, + { + "auxiliary_loss_clip": 0.0640713, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 0.06271156, + "balance_loss_mlp": 0.01256746, + "epoch": 0.9655794378475876, + "flos": 41979741333120.0, + "grad_norm": 1.699778030433775, + "language_loss": 0.73402834, + "learning_rate": 1.239402791721722e-08, + "loss": 0.8107639, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09680176, + "step": 16060, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.06393793, + "auxiliary_loss_mlp": 0.01261439, + "balance_loss_clip": 0.06268264, + "balance_loss_mlp": 0.01252915, + "epoch": 0.9656395611002555, + "flos": 27716860316160.0, + "grad_norm": 1.5503214965387115, + "language_loss": 0.7667194, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.84327173, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08526611, + "step": 16061, + "time_per_iteration": 2.5771090984344482 + }, + { + "auxiliary_loss_clip": 0.0630983, + "auxiliary_loss_mlp": 0.01248501, + "balance_loss_clip": 0.06255753, + "balance_loss_mlp": 0.01247496, + "epoch": 0.9656996843529235, + "flos": 68987949742080.0, + "grad_norm": 0.780210844217019, + "language_loss": 0.64234674, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.71793002, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01005554, + "step": 16062, + "time_per_iteration": 3.2013790607452393 + }, + { + "auxiliary_loss_clip": 0.06394887, + "auxiliary_loss_mlp": 0.01266088, + "balance_loss_clip": 0.06270029, + "balance_loss_mlp": 0.01257683, + "epoch": 0.9657598076055914, + "flos": 20637599137920.0, + "grad_norm": 2.251945173497628, + "language_loss": 0.934484, + "learning_rate": 1.226449424760867e-08, + "loss": 1.01109374, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.08413696, + "step": 16063, + "time_per_iteration": 2.5948007106781006 + }, + { + "auxiliary_loss_clip": 0.06403172, + "auxiliary_loss_mlp": 0.01266989, + "balance_loss_clip": 0.06272347, + "balance_loss_mlp": 0.01257953, + "epoch": 0.9658199308582595, + "flos": 20454765528960.0, + "grad_norm": 1.8938965740794855, + "language_loss": 0.81982899, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.89653063, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09039307, + "step": 16064, + "time_per_iteration": 3.946958303451538 + }, + { + "auxiliary_loss_clip": 0.06401001, + "auxiliary_loss_mlp": 0.01261606, + "balance_loss_clip": 0.0627339, + "balance_loss_mlp": 0.01252772, + "epoch": 0.9658800541109274, + "flos": 24725038757760.0, + "grad_norm": 1.5779341158882096, + "language_loss": 0.84311408, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.91974014, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08837891, + "step": 16065, + "time_per_iteration": 2.647083282470703 + }, + { + "auxiliary_loss_clip": 0.06402124, + "auxiliary_loss_mlp": 0.0126448, + "balance_loss_clip": 0.06270837, + "balance_loss_mlp": 0.01255623, + "epoch": 0.9659401773635954, + "flos": 21615399959040.0, + "grad_norm": 2.432738378484276, + "language_loss": 0.67548525, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.75215131, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.08850098, + "step": 16066, + "time_per_iteration": 2.568356990814209 + }, + { + "auxiliary_loss_clip": 0.06398377, + "auxiliary_loss_mlp": 0.0126593, + "balance_loss_clip": 0.06268573, + "balance_loss_mlp": 0.01256897, + "epoch": 0.9660003006162633, + "flos": 20307123434880.0, + "grad_norm": 1.7910369908094568, + "language_loss": 0.82607502, + "learning_rate": 1.209283794752558e-08, + "loss": 0.90271813, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09036255, + "step": 16067, + "time_per_iteration": 2.5698952674865723 + }, + { + "auxiliary_loss_clip": 0.06401904, + "auxiliary_loss_mlp": 0.01264106, + "balance_loss_clip": 0.06271727, + "balance_loss_mlp": 0.01254325, + "epoch": 0.9660604238689313, + "flos": 24468803372160.0, + "grad_norm": 1.671137077977421, + "language_loss": 0.69428784, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.77094793, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09783936, + "step": 16068, + "time_per_iteration": 4.0155861377716064 + }, + { + "auxiliary_loss_clip": 0.0639426, + "auxiliary_loss_mlp": 0.01267688, + "balance_loss_clip": 0.06271375, + "balance_loss_mlp": 0.01259755, + "epoch": 0.9661205471215992, + "flos": 19869983084160.0, + "grad_norm": 1.7705169776652172, + "language_loss": 0.68107969, + "learning_rate": 1.20074620808146e-08, + "loss": 0.75769919, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.07928467, + "step": 16069, + "time_per_iteration": 2.496572256088257 + }, + { + "auxiliary_loss_clip": 0.06400932, + "auxiliary_loss_mlp": 0.01262822, + "balance_loss_clip": 0.06271296, + "balance_loss_mlp": 0.01253595, + "epoch": 0.9661806703742672, + "flos": 20564071580160.0, + "grad_norm": 1.710702523196639, + "language_loss": 0.89453393, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.9711715, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09228516, + "step": 16070, + "time_per_iteration": 2.58213472366333 + }, + { + "auxiliary_loss_clip": 0.06401291, + "auxiliary_loss_mlp": 0.01266178, + "balance_loss_clip": 0.06269821, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9662407936269353, + "flos": 21436842908160.0, + "grad_norm": 2.0425263157777604, + "language_loss": 0.77503681, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.85171151, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10290527, + "step": 16071, + "time_per_iteration": 2.5824472904205322 + }, + { + "auxiliary_loss_clip": 0.06395756, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06267762, + "balance_loss_mlp": 0.01253059, + "epoch": 0.9663009168796032, + "flos": 14908178908800.0, + "grad_norm": 1.734770632308268, + "language_loss": 0.66013038, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.73671985, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.10113525, + "step": 16072, + "time_per_iteration": 2.5067734718322754 + }, + { + "auxiliary_loss_clip": 0.06405023, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 0.06272658, + "balance_loss_mlp": 0.01253936, + "epoch": 0.9663610401322712, + "flos": 24316842792960.0, + "grad_norm": 1.5748682596234602, + "language_loss": 0.78113818, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.85781991, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09216309, + "step": 16073, + "time_per_iteration": 2.608790636062622 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01262702, + "balance_loss_clip": 0.06271721, + "balance_loss_mlp": 0.01252706, + "epoch": 0.9664211633849391, + "flos": 17643345811200.0, + "grad_norm": 1.9546716126874173, + "language_loss": 0.75967658, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.83637702, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09991455, + "step": 16074, + "time_per_iteration": 2.613964557647705 + }, + { + "auxiliary_loss_clip": 0.06400394, + "auxiliary_loss_mlp": 0.01265189, + "balance_loss_clip": 0.06268935, + "balance_loss_mlp": 0.01255223, + "epoch": 0.9664812866376071, + "flos": 29797239087360.0, + "grad_norm": 1.3986555912156662, + "language_loss": 0.75712979, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.83378559, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09960938, + "step": 16075, + "time_per_iteration": 2.7057979106903076 + }, + { + "auxiliary_loss_clip": 0.06403182, + "auxiliary_loss_mlp": 0.01265451, + "balance_loss_clip": 0.06269626, + "balance_loss_mlp": 0.01256255, + "epoch": 0.966541409890275, + "flos": 14287450262400.0, + "grad_norm": 1.7774078486578757, + "language_loss": 0.78800076, + "learning_rate": 1.171102125547696e-08, + "loss": 0.86468703, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09204102, + "step": 16076, + "time_per_iteration": 2.5402417182922363 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01265198, + "balance_loss_clip": 0.06270036, + "balance_loss_mlp": 0.01255322, + "epoch": 0.9666015331429431, + "flos": 19865790380160.0, + "grad_norm": 1.5934368657490992, + "language_loss": 0.72737241, + "learning_rate": 1.166897413780532e-08, + "loss": 0.80403632, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09875488, + "step": 16077, + "time_per_iteration": 2.535360097885132 + }, + { + "auxiliary_loss_clip": 0.06399302, + "auxiliary_loss_mlp": 0.01263469, + "balance_loss_clip": 0.06269421, + "balance_loss_mlp": 0.01254129, + "epoch": 0.966661656395611, + "flos": 27133335682560.0, + "grad_norm": 1.8761219493118404, + "language_loss": 0.59630072, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.67292845, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09344482, + "step": 16078, + "time_per_iteration": 2.561671018600464 + }, + { + "auxiliary_loss_clip": 0.06405197, + "auxiliary_loss_mlp": 0.01262495, + "balance_loss_clip": 0.06270532, + "balance_loss_mlp": 0.01252428, + "epoch": 0.966721779648279, + "flos": 21514856659200.0, + "grad_norm": 1.7688121157900791, + "language_loss": 0.72058022, + "learning_rate": 1.158510609718899e-08, + "loss": 0.79725718, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.10064697, + "step": 16079, + "time_per_iteration": 2.529829263687134 + }, + { + "auxiliary_loss_clip": 0.06397161, + "auxiliary_loss_mlp": 0.01264122, + "balance_loss_clip": 0.06270564, + "balance_loss_mlp": 0.01255199, + "epoch": 0.9667819029009469, + "flos": 23884859468160.0, + "grad_norm": 1.5564630804369735, + "language_loss": 0.72879219, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.80540496, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08917236, + "step": 16080, + "time_per_iteration": 2.5244600772857666 + }, + { + "auxiliary_loss_clip": 0.06398826, + "auxiliary_loss_mlp": 0.01264318, + "balance_loss_clip": 0.06269746, + "balance_loss_mlp": 0.01255324, + "epoch": 0.9668420261536149, + "flos": 21513682702080.0, + "grad_norm": 1.8020849522821436, + "language_loss": 0.74110532, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.81773674, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08990479, + "step": 16081, + "time_per_iteration": 2.4842236042022705 + }, + { + "auxiliary_loss_clip": 0.06397434, + "auxiliary_loss_mlp": 0.01264751, + "balance_loss_clip": 0.062673, + "balance_loss_mlp": 0.01255578, + "epoch": 0.9669021494062828, + "flos": 26694434396160.0, + "grad_norm": 1.538464840175787, + "language_loss": 0.67664808, + "learning_rate": 1.145986954691236e-08, + "loss": 0.75326991, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09173584, + "step": 16082, + "time_per_iteration": 2.5313684940338135 + }, + { + "auxiliary_loss_clip": 0.06400424, + "auxiliary_loss_mlp": 0.01264878, + "balance_loss_clip": 0.06270989, + "balance_loss_mlp": 0.01255556, + "epoch": 0.9669622726589508, + "flos": 29832724091520.0, + "grad_norm": 1.4347274539872106, + "language_loss": 0.7732228, + "learning_rate": 1.141827483932789e-08, + "loss": 0.84987581, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09320068, + "step": 16083, + "time_per_iteration": 2.6201815605163574 + }, + { + "auxiliary_loss_clip": 0.06402251, + "auxiliary_loss_mlp": 0.01264976, + "balance_loss_clip": 0.06270413, + "balance_loss_mlp": 0.01255546, + "epoch": 0.9670223959116189, + "flos": 22927911114240.0, + "grad_norm": 1.9457609743548718, + "language_loss": 0.79789531, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.87456757, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09436035, + "step": 16084, + "time_per_iteration": 2.51789927482605 + }, + { + "auxiliary_loss_clip": 0.06404713, + "auxiliary_loss_mlp": 0.01263186, + "balance_loss_clip": 0.06271202, + "balance_loss_mlp": 0.01252964, + "epoch": 0.9670825191642868, + "flos": 18630412508160.0, + "grad_norm": 2.4606761386831133, + "language_loss": 0.68396688, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.76064587, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.10223389, + "step": 16085, + "time_per_iteration": 2.474874258041382 + }, + { + "auxiliary_loss_clip": 0.06406981, + "auxiliary_loss_mlp": 0.01264663, + "balance_loss_clip": 0.06271201, + "balance_loss_mlp": 0.0125419, + "epoch": 0.9671426424169548, + "flos": 24504707646720.0, + "grad_norm": 2.075044751177439, + "language_loss": 0.68617862, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.76289505, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10473633, + "step": 16086, + "time_per_iteration": 2.534994125366211 + }, + { + "auxiliary_loss_clip": 0.06402737, + "auxiliary_loss_mlp": 0.01265826, + "balance_loss_clip": 0.06271712, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9672027656696227, + "flos": 20376625996800.0, + "grad_norm": 1.3946644640700947, + "language_loss": 0.7882064, + "learning_rate": 1.125265009690235e-08, + "loss": 0.86489207, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09942627, + "step": 16087, + "time_per_iteration": 2.4735782146453857 + }, + { + "auxiliary_loss_clip": 0.06399234, + "auxiliary_loss_mlp": 0.01261819, + "balance_loss_clip": 0.06268933, + "balance_loss_mlp": 0.01252837, + "epoch": 0.9672628889222907, + "flos": 18886186696320.0, + "grad_norm": 1.8117496525637224, + "language_loss": 0.71433723, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.79094768, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08990479, + "step": 16088, + "time_per_iteration": 3.8949713706970215 + }, + { + "auxiliary_loss_clip": 0.06397194, + "auxiliary_loss_mlp": 0.01262715, + "balance_loss_clip": 0.06270102, + "balance_loss_mlp": 0.01253715, + "epoch": 0.9673230121749586, + "flos": 28702962691200.0, + "grad_norm": 1.577967984656714, + "language_loss": 0.70956695, + "learning_rate": 1.117029020040916e-08, + "loss": 0.78616601, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09002686, + "step": 16089, + "time_per_iteration": 2.5853075981140137 + }, + { + "auxiliary_loss_clip": 0.06403333, + "auxiliary_loss_mlp": 0.01264796, + "balance_loss_clip": 0.06271292, + "balance_loss_mlp": 0.01255235, + "epoch": 0.9673831354276267, + "flos": 20490544022400.0, + "grad_norm": 1.9844262982420549, + "language_loss": 0.75145471, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.82813597, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09558105, + "step": 16090, + "time_per_iteration": 2.469186544418335 + }, + { + "auxiliary_loss_clip": 0.06409271, + "auxiliary_loss_mlp": 0.01264002, + "balance_loss_clip": 0.06270892, + "balance_loss_mlp": 0.01253899, + "epoch": 0.9674432586802946, + "flos": 26804872477440.0, + "grad_norm": 1.5950063142097652, + "language_loss": 0.68768305, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.76441574, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10101318, + "step": 16091, + "time_per_iteration": 2.563216209411621 + }, + { + "auxiliary_loss_clip": 0.06398912, + "auxiliary_loss_mlp": 0.01263687, + "balance_loss_clip": 0.06270887, + "balance_loss_mlp": 0.012539, + "epoch": 0.9675033819329626, + "flos": 22317706154880.0, + "grad_norm": 1.8144338488923422, + "language_loss": 0.77032447, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.84695041, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09783936, + "step": 16092, + "time_per_iteration": 2.528087854385376 + }, + { + "auxiliary_loss_clip": 0.06399173, + "auxiliary_loss_mlp": 0.01263601, + "balance_loss_clip": 0.06269817, + "balance_loss_mlp": 0.01255101, + "epoch": 0.9675635051856305, + "flos": 12680367678720.0, + "grad_norm": 1.8169609266887585, + "language_loss": 0.7681576, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.84478533, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08496094, + "step": 16093, + "time_per_iteration": 2.4785947799682617 + }, + { + "auxiliary_loss_clip": 0.06403705, + "auxiliary_loss_mlp": 0.0126625, + "balance_loss_clip": 0.06271917, + "balance_loss_mlp": 0.01256416, + "epoch": 0.9676236284382985, + "flos": 24615439217280.0, + "grad_norm": 1.550034543506878, + "language_loss": 0.69245452, + "learning_rate": 1.096571027726112e-08, + "loss": 0.76915407, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09832764, + "step": 16094, + "time_per_iteration": 2.531022310256958 + }, + { + "auxiliary_loss_clip": 0.06406316, + "auxiliary_loss_mlp": 0.01266357, + "balance_loss_clip": 0.06270891, + "balance_loss_mlp": 0.01257136, + "epoch": 0.9676837516909664, + "flos": 23373772289280.0, + "grad_norm": 2.0284619015774745, + "language_loss": 0.75801766, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.83474445, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09222412, + "step": 16095, + "time_per_iteration": 3.973430633544922 + }, + { + "auxiliary_loss_clip": 0.06406826, + "auxiliary_loss_mlp": 0.01263981, + "balance_loss_clip": 0.06270942, + "balance_loss_mlp": 0.01254546, + "epoch": 0.9677438749436345, + "flos": 20493395061120.0, + "grad_norm": 2.546128984208035, + "language_loss": 0.70797509, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.78468317, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09436035, + "step": 16096, + "time_per_iteration": 2.556928873062134 + }, + { + "auxiliary_loss_clip": 0.0640536, + "auxiliary_loss_mlp": 0.01263747, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01254085, + "epoch": 0.9678039981963025, + "flos": 47566341077760.0, + "grad_norm": 1.6741629416522243, + "language_loss": 0.71720374, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.79389483, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09661865, + "step": 16097, + "time_per_iteration": 2.7817232608795166 + }, + { + "auxiliary_loss_clip": 0.06399585, + "auxiliary_loss_mlp": 0.01266789, + "balance_loss_clip": 0.06271115, + "balance_loss_mlp": 0.01257449, + "epoch": 0.9678641214489704, + "flos": 25046542074240.0, + "grad_norm": 1.8782821270100718, + "language_loss": 0.78498095, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.86164474, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09338379, + "step": 16098, + "time_per_iteration": 2.5505471229553223 + }, + { + "auxiliary_loss_clip": 0.06397729, + "auxiliary_loss_mlp": 0.01262535, + "balance_loss_clip": 0.06268919, + "balance_loss_mlp": 0.01253708, + "epoch": 0.9679242447016384, + "flos": 19246319544960.0, + "grad_norm": 1.724025286564301, + "language_loss": 0.90831089, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.98491359, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08831787, + "step": 16099, + "time_per_iteration": 2.484697103500366 + }, + { + "auxiliary_loss_clip": 0.0640512, + "auxiliary_loss_mlp": 0.01264422, + "balance_loss_clip": 0.06269465, + "balance_loss_mlp": 0.01254641, + "epoch": 0.9679843679543063, + "flos": 33262943811840.0, + "grad_norm": 1.8533709433525063, + "language_loss": 0.66165268, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.73834813, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09777832, + "step": 16100, + "time_per_iteration": 2.6391396522521973 + }, + { + "auxiliary_loss_clip": 0.06401994, + "auxiliary_loss_mlp": 0.01264329, + "balance_loss_clip": 0.06270385, + "balance_loss_mlp": 0.01254805, + "epoch": 0.9680444912069743, + "flos": 22790205728640.0, + "grad_norm": 1.4418574001305366, + "language_loss": 0.73443776, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.81110096, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09533691, + "step": 16101, + "time_per_iteration": 2.4959254264831543 + }, + { + "auxiliary_loss_clip": 0.06401779, + "auxiliary_loss_mlp": 0.01263958, + "balance_loss_clip": 0.06270876, + "balance_loss_mlp": 0.012546, + "epoch": 0.9681046144596422, + "flos": 24030866407680.0, + "grad_norm": 1.476677590253325, + "language_loss": 0.73699975, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.8136571, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09356689, + "step": 16102, + "time_per_iteration": 2.527163505554199 + }, + { + "auxiliary_loss_clip": 0.06401537, + "auxiliary_loss_mlp": 0.01268193, + "balance_loss_clip": 0.06270529, + "balance_loss_mlp": 0.01257899, + "epoch": 0.9681647377123103, + "flos": 23447802971520.0, + "grad_norm": 1.896455412966277, + "language_loss": 0.77483177, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.85152906, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10296631, + "step": 16103, + "time_per_iteration": 3.9508111476898193 + }, + { + "auxiliary_loss_clip": 0.06402817, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06272209, + "balance_loss_mlp": 0.01254427, + "epoch": 0.9682248609649782, + "flos": 22681780145280.0, + "grad_norm": 1.7473063951215217, + "language_loss": 0.80425286, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.88091195, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08654785, + "step": 16104, + "time_per_iteration": 2.5212666988372803 + }, + { + "auxiliary_loss_clip": 0.06398646, + "auxiliary_loss_mlp": 0.01262819, + "balance_loss_clip": 0.06269979, + "balance_loss_mlp": 0.01254284, + "epoch": 0.9682849842176462, + "flos": 24435750136320.0, + "grad_norm": 1.553672505568153, + "language_loss": 0.77860147, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.85521615, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08532715, + "step": 16105, + "time_per_iteration": 2.526402711868286 + }, + { + "auxiliary_loss_clip": 0.06307848, + "auxiliary_loss_mlp": 0.01250922, + "balance_loss_clip": 0.06253837, + "balance_loss_mlp": 0.01249911, + "epoch": 0.9683451074703141, + "flos": 60013365534720.0, + "grad_norm": 0.8157679586212945, + "language_loss": 0.56714195, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.64272964, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01011658, + "step": 16106, + "time_per_iteration": 3.1907763481140137 + }, + { + "auxiliary_loss_clip": 0.06308085, + "auxiliary_loss_mlp": 0.01250817, + "balance_loss_clip": 0.06253918, + "balance_loss_mlp": 0.01249806, + "epoch": 0.9684052307229821, + "flos": 52712850850560.0, + "grad_norm": 0.8588329806048718, + "language_loss": 0.61471093, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.69029999, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01010895, + "step": 16107, + "time_per_iteration": 3.0176451206207275 + }, + { + "auxiliary_loss_clip": 0.06401956, + "auxiliary_loss_mlp": 0.0126273, + "balance_loss_clip": 0.06270234, + "balance_loss_mlp": 0.01252859, + "epoch": 0.96846535397565, + "flos": 22790457290880.0, + "grad_norm": 2.0284139673557635, + "language_loss": 0.74127901, + "learning_rate": 1.040291854638875e-08, + "loss": 0.81792581, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09875488, + "step": 16108, + "time_per_iteration": 3.937136650085449 + }, + { + "auxiliary_loss_clip": 0.06403004, + "auxiliary_loss_mlp": 0.01261473, + "balance_loss_clip": 0.06271024, + "balance_loss_mlp": 0.01252359, + "epoch": 0.968525477228318, + "flos": 23329482606720.0, + "grad_norm": 2.048945101246752, + "language_loss": 0.57015377, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.64679849, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09112549, + "step": 16109, + "time_per_iteration": 2.5268101692199707 + }, + { + "auxiliary_loss_clip": 0.06305698, + "auxiliary_loss_mlp": 0.01249198, + "balance_loss_clip": 0.06251822, + "balance_loss_mlp": 0.01248142, + "epoch": 0.9685856004809861, + "flos": 67903651981440.0, + "grad_norm": 0.6567864126752433, + "language_loss": 0.54225814, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.61780703, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01056671, + "step": 16110, + "time_per_iteration": 3.203383207321167 + }, + { + "auxiliary_loss_clip": 0.0640424, + "auxiliary_loss_mlp": 0.01263261, + "balance_loss_clip": 0.06270111, + "balance_loss_mlp": 0.01253194, + "epoch": 0.968645723733654, + "flos": 33956277621120.0, + "grad_norm": 1.395578578385916, + "language_loss": 0.62541378, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.70208883, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.10058594, + "step": 16111, + "time_per_iteration": 2.640765428543091 + }, + { + "auxiliary_loss_clip": 0.06400396, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06270713, + "balance_loss_mlp": 0.0125463, + "epoch": 0.968705846986322, + "flos": 18557388074880.0, + "grad_norm": 1.8951473791498206, + "language_loss": 0.74788642, + "learning_rate": 1.024483677309118e-08, + "loss": 0.82452452, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08776855, + "step": 16112, + "time_per_iteration": 2.5014288425445557 + }, + { + "auxiliary_loss_clip": 0.06395774, + "auxiliary_loss_mlp": 0.01264106, + "balance_loss_clip": 0.06268512, + "balance_loss_mlp": 0.01255517, + "epoch": 0.9687659702389899, + "flos": 17426704279680.0, + "grad_norm": 2.0013501762386072, + "language_loss": 0.67307127, + "learning_rate": 1.020550495531558e-08, + "loss": 0.74967003, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0859375, + "step": 16113, + "time_per_iteration": 2.479163646697998 + }, + { + "auxiliary_loss_clip": 0.06308687, + "auxiliary_loss_mlp": 0.01250527, + "balance_loss_clip": 0.06254673, + "balance_loss_mlp": 0.01249524, + "epoch": 0.9688260934916579, + "flos": 62067231688320.0, + "grad_norm": 0.785383139879687, + "language_loss": 0.56577516, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.64136732, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01003265, + "step": 16114, + "time_per_iteration": 3.1394646167755127 + }, + { + "auxiliary_loss_clip": 0.06402376, + "auxiliary_loss_mlp": 0.01268137, + "balance_loss_clip": 0.06271395, + "balance_loss_mlp": 0.01258618, + "epoch": 0.9688862167443258, + "flos": 15080363049600.0, + "grad_norm": 1.8572842989291634, + "language_loss": 0.82534641, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.90205157, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09521484, + "step": 16115, + "time_per_iteration": 2.4412038326263428 + }, + { + "auxiliary_loss_clip": 0.06396943, + "auxiliary_loss_mlp": 0.01261817, + "balance_loss_clip": 0.06272493, + "balance_loss_mlp": 0.01253377, + "epoch": 0.9689463399969939, + "flos": 19944391109760.0, + "grad_norm": 1.4226958516999226, + "language_loss": 0.72081476, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.79740238, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.08435059, + "step": 16116, + "time_per_iteration": 2.5535671710968018 + }, + { + "auxiliary_loss_clip": 0.06405754, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 0.06272267, + "balance_loss_mlp": 0.01251587, + "epoch": 0.9690064632496618, + "flos": 19579101235200.0, + "grad_norm": 2.2415643926520614, + "language_loss": 0.75798059, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.83465052, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09661865, + "step": 16117, + "time_per_iteration": 2.483868360519409 + }, + { + "auxiliary_loss_clip": 0.06406679, + "auxiliary_loss_mlp": 0.0126341, + "balance_loss_clip": 0.06273545, + "balance_loss_mlp": 0.01254004, + "epoch": 0.9690665865023298, + "flos": 21878846795520.0, + "grad_norm": 2.007341004668209, + "language_loss": 0.77854973, + "learning_rate": 1.000997769426548e-08, + "loss": 0.8552506, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09399414, + "step": 16118, + "time_per_iteration": 2.5185434818267822 + }, + { + "auxiliary_loss_clip": 0.06402394, + "auxiliary_loss_mlp": 0.01264527, + "balance_loss_clip": 0.06269039, + "balance_loss_mlp": 0.01254758, + "epoch": 0.9691267097549977, + "flos": 21000541098240.0, + "grad_norm": 1.7099772377431646, + "language_loss": 0.78459924, + "learning_rate": 9.971098618001272e-09, + "loss": 0.86126846, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09771729, + "step": 16119, + "time_per_iteration": 2.612290859222412 + }, + { + "auxiliary_loss_clip": 0.06396645, + "auxiliary_loss_mlp": 0.01263245, + "balance_loss_clip": 0.06271589, + "balance_loss_mlp": 0.01254609, + "epoch": 0.9691868330076657, + "flos": 24285885909120.0, + "grad_norm": 1.3978893166659911, + "language_loss": 0.75944752, + "learning_rate": 9.932295003832747e-09, + "loss": 0.83604646, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08636475, + "step": 16120, + "time_per_iteration": 2.5401206016540527 + }, + { + "auxiliary_loss_clip": 0.0640015, + "auxiliary_loss_mlp": 0.01262274, + "balance_loss_clip": 0.06269264, + "balance_loss_mlp": 0.01252923, + "epoch": 0.9692469562603336, + "flos": 17681430291840.0, + "grad_norm": 1.946020897677594, + "language_loss": 0.69889534, + "learning_rate": 9.89356685323095e-09, + "loss": 0.77551961, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09344482, + "step": 16121, + "time_per_iteration": 2.4932589530944824 + }, + { + "auxiliary_loss_clip": 0.06398712, + "auxiliary_loss_mlp": 0.01261825, + "balance_loss_clip": 0.06269211, + "balance_loss_mlp": 0.01252211, + "epoch": 0.9693070795130017, + "flos": 26841783000960.0, + "grad_norm": 2.6697458666208007, + "language_loss": 0.6931926, + "learning_rate": 9.854914167664486e-09, + "loss": 0.76979792, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09613037, + "step": 16122, + "time_per_iteration": 2.5934178829193115 + }, + { + "auxiliary_loss_clip": 0.0640236, + "auxiliary_loss_mlp": 0.01261205, + "balance_loss_clip": 0.06270461, + "balance_loss_mlp": 0.01252849, + "epoch": 0.9693672027656697, + "flos": 18083127565440.0, + "grad_norm": 1.887635490879254, + "language_loss": 0.75718206, + "learning_rate": 9.81633694859907e-09, + "loss": 0.83381778, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.08355713, + "step": 16123, + "time_per_iteration": 2.526440382003784 + }, + { + "auxiliary_loss_clip": 0.0640337, + "auxiliary_loss_mlp": 0.01262305, + "balance_loss_clip": 0.06270259, + "balance_loss_mlp": 0.01252536, + "epoch": 0.9694273260183376, + "flos": 21769582671360.0, + "grad_norm": 1.3729033080387363, + "language_loss": 0.74643373, + "learning_rate": 9.777835197497753e-09, + "loss": 0.82309043, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09771729, + "step": 16124, + "time_per_iteration": 2.551767349243164 + }, + { + "auxiliary_loss_clip": 0.06402075, + "auxiliary_loss_mlp": 0.01262872, + "balance_loss_clip": 0.06270434, + "balance_loss_mlp": 0.01253716, + "epoch": 0.9694874492710056, + "flos": 24433066805760.0, + "grad_norm": 1.95841723109516, + "language_loss": 0.74200714, + "learning_rate": 9.739408915820258e-09, + "loss": 0.81865656, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09155273, + "step": 16125, + "time_per_iteration": 2.55316424369812 + }, + { + "auxiliary_loss_clip": 0.06305213, + "auxiliary_loss_mlp": 0.0125144, + "balance_loss_clip": 0.06251328, + "balance_loss_mlp": 0.01250412, + "epoch": 0.9695475725236735, + "flos": 67669191457920.0, + "grad_norm": 0.8771800111615311, + "language_loss": 0.61598706, + "learning_rate": 9.70105810502364e-09, + "loss": 0.69155359, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01027679, + "step": 16126, + "time_per_iteration": 3.0755326747894287 + }, + { + "auxiliary_loss_clip": 0.06398349, + "auxiliary_loss_mlp": 0.01264514, + "balance_loss_clip": 0.06271584, + "balance_loss_mlp": 0.01255293, + "epoch": 0.9696076957763415, + "flos": 19134330163200.0, + "grad_norm": 1.5311536279147961, + "language_loss": 0.75146884, + "learning_rate": 9.662782766562738e-09, + "loss": 0.82809746, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09222412, + "step": 16127, + "time_per_iteration": 3.9446663856506348 + }, + { + "auxiliary_loss_clip": 0.06405523, + "auxiliary_loss_mlp": 0.01262243, + "balance_loss_clip": 0.06270227, + "balance_loss_mlp": 0.01252146, + "epoch": 0.9696678190290094, + "flos": 15492248593920.0, + "grad_norm": 1.533562341751804, + "language_loss": 0.69545048, + "learning_rate": 9.62458290188839e-09, + "loss": 0.77212816, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.10101318, + "step": 16128, + "time_per_iteration": 2.4981887340545654 + }, + { + "auxiliary_loss_clip": 0.06400339, + "auxiliary_loss_mlp": 0.01266042, + "balance_loss_clip": 0.06270851, + "balance_loss_mlp": 0.01256761, + "epoch": 0.9697279422816775, + "flos": 36217225941120.0, + "grad_norm": 1.5209597540885744, + "language_loss": 0.65483963, + "learning_rate": 9.586458512449213e-09, + "loss": 0.73150343, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09277344, + "step": 16129, + "time_per_iteration": 2.6251938343048096 + }, + { + "auxiliary_loss_clip": 0.06407736, + "auxiliary_loss_mlp": 0.0126171, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.01252024, + "epoch": 0.9697880655343454, + "flos": 25491103511040.0, + "grad_norm": 1.8080137782892927, + "language_loss": 0.63748336, + "learning_rate": 9.548409599691166e-09, + "loss": 0.71417773, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.09692383, + "step": 16130, + "time_per_iteration": 2.534078359603882 + }, + { + "auxiliary_loss_clip": 0.06406684, + "auxiliary_loss_mlp": 0.01266248, + "balance_loss_clip": 0.06270098, + "balance_loss_mlp": 0.01256336, + "epoch": 0.9698481887870134, + "flos": 15337688538240.0, + "grad_norm": 2.3021960280258718, + "language_loss": 0.70279443, + "learning_rate": 9.510436165056867e-09, + "loss": 0.77952373, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09912109, + "step": 16131, + "time_per_iteration": 2.4530463218688965 + }, + { + "auxiliary_loss_clip": 0.06404746, + "auxiliary_loss_mlp": 0.01267276, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01257173, + "epoch": 0.9699083120396813, + "flos": 21988907533440.0, + "grad_norm": 2.025844934607916, + "language_loss": 0.76757103, + "learning_rate": 9.472538209986058e-09, + "loss": 0.84429133, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.10107422, + "step": 16132, + "time_per_iteration": 2.5047919750213623 + }, + { + "auxiliary_loss_clip": 0.0640052, + "auxiliary_loss_mlp": 0.01265217, + "balance_loss_clip": 0.06269385, + "balance_loss_mlp": 0.01255042, + "epoch": 0.9699684352923493, + "flos": 15668625438720.0, + "grad_norm": 2.7063973551454263, + "language_loss": 0.79410255, + "learning_rate": 9.434715735916477e-09, + "loss": 0.8707599, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10168457, + "step": 16133, + "time_per_iteration": 2.4512226581573486 + }, + { + "auxiliary_loss_clip": 0.06397133, + "auxiliary_loss_mlp": 0.01267095, + "balance_loss_clip": 0.06269794, + "balance_loss_mlp": 0.01258476, + "epoch": 0.9700285585450172, + "flos": 21914876851200.0, + "grad_norm": 1.5611198022203323, + "language_loss": 0.64911574, + "learning_rate": 9.396968744281863e-09, + "loss": 0.72575808, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.08618164, + "step": 16134, + "time_per_iteration": 2.500866413116455 + }, + { + "auxiliary_loss_clip": 0.06402618, + "auxiliary_loss_mlp": 0.01262072, + "balance_loss_clip": 0.0627054, + "balance_loss_mlp": 0.01252786, + "epoch": 0.9700886817976853, + "flos": 23921686137600.0, + "grad_norm": 1.8077102580122415, + "language_loss": 0.80706894, + "learning_rate": 9.359297236513519e-09, + "loss": 0.88371587, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09289551, + "step": 16135, + "time_per_iteration": 3.9117238521575928 + }, + { + "auxiliary_loss_clip": 0.06405312, + "auxiliary_loss_mlp": 0.01263739, + "balance_loss_clip": 0.06270383, + "balance_loss_mlp": 0.01253267, + "epoch": 0.9701488050503532, + "flos": 25454989601280.0, + "grad_norm": 1.6284393285017646, + "language_loss": 0.73501408, + "learning_rate": 9.321701214040079e-09, + "loss": 0.81170464, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10473633, + "step": 16136, + "time_per_iteration": 2.5779073238372803 + }, + { + "auxiliary_loss_clip": 0.06398432, + "auxiliary_loss_mlp": 0.01267079, + "balance_loss_clip": 0.06269141, + "balance_loss_mlp": 0.01257644, + "epoch": 0.9702089283030212, + "flos": 20596453983360.0, + "grad_norm": 1.492877171392222, + "language_loss": 0.76563627, + "learning_rate": 9.28418067828729e-09, + "loss": 0.84229136, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09442139, + "step": 16137, + "time_per_iteration": 2.538085460662842 + }, + { + "auxiliary_loss_clip": 0.06306077, + "auxiliary_loss_mlp": 0.01249847, + "balance_loss_clip": 0.06252094, + "balance_loss_mlp": 0.01248849, + "epoch": 0.9702690515556892, + "flos": 70671955973760.0, + "grad_norm": 0.821661417803752, + "language_loss": 0.5493418, + "learning_rate": 9.246735630678015e-09, + "loss": 0.62490106, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00997925, + "step": 16138, + "time_per_iteration": 3.2206809520721436 + }, + { + "auxiliary_loss_clip": 0.06400603, + "auxiliary_loss_mlp": 0.01266479, + "balance_loss_clip": 0.06268343, + "balance_loss_mlp": 0.01257031, + "epoch": 0.9703291748083571, + "flos": 35890104401280.0, + "grad_norm": 1.6919399068394998, + "language_loss": 0.70817888, + "learning_rate": 9.209366072632007e-09, + "loss": 0.78484976, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09448242, + "step": 16139, + "time_per_iteration": 2.685359239578247 + }, + { + "auxiliary_loss_clip": 0.06405871, + "auxiliary_loss_mlp": 0.01265937, + "balance_loss_clip": 0.06271709, + "balance_loss_mlp": 0.01255846, + "epoch": 0.9703892980610251, + "flos": 24323383411200.0, + "grad_norm": 1.4852004067198157, + "language_loss": 0.72197908, + "learning_rate": 9.172072005566134e-09, + "loss": 0.79869711, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.10089111, + "step": 16140, + "time_per_iteration": 2.558115005493164 + }, + { + "auxiliary_loss_clip": 0.06405499, + "auxiliary_loss_mlp": 0.01266107, + "balance_loss_clip": 0.06272194, + "balance_loss_mlp": 0.01256433, + "epoch": 0.970449421313693, + "flos": 18009474226560.0, + "grad_norm": 3.19764117051917, + "language_loss": 0.69224846, + "learning_rate": 9.13485343089504e-09, + "loss": 0.76896447, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09674072, + "step": 16141, + "time_per_iteration": 2.482884168624878 + }, + { + "auxiliary_loss_clip": 0.06398399, + "auxiliary_loss_mlp": 0.01262865, + "balance_loss_clip": 0.06271194, + "balance_loss_mlp": 0.01253692, + "epoch": 0.9705095445663611, + "flos": 25345054644480.0, + "grad_norm": 1.7252528313404465, + "language_loss": 0.68293542, + "learning_rate": 9.097710350029597e-09, + "loss": 0.75954807, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.0916748, + "step": 16142, + "time_per_iteration": 4.058878183364868 + }, + { + "auxiliary_loss_clip": 0.06401761, + "auxiliary_loss_mlp": 0.01262507, + "balance_loss_clip": 0.0626963, + "balance_loss_mlp": 0.01253132, + "epoch": 0.970569667819029, + "flos": 26840860606080.0, + "grad_norm": 1.8571958847472876, + "language_loss": 0.55470061, + "learning_rate": 9.060642764378457e-09, + "loss": 0.63134331, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09375, + "step": 16143, + "time_per_iteration": 2.5692148208618164 + }, + { + "auxiliary_loss_clip": 0.06405912, + "auxiliary_loss_mlp": 0.01267612, + "balance_loss_clip": 0.06272087, + "balance_loss_mlp": 0.01258742, + "epoch": 0.970629791071697, + "flos": 25855764480000.0, + "grad_norm": 2.158347081633599, + "language_loss": 0.67963922, + "learning_rate": 9.023650675347382e-09, + "loss": 0.75637448, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.08874512, + "step": 16144, + "time_per_iteration": 2.5477588176727295 + }, + { + "auxiliary_loss_clip": 0.06398851, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06270637, + "balance_loss_mlp": 0.0125611, + "epoch": 0.9706899143243649, + "flos": 36549294871680.0, + "grad_norm": 1.7214087229077903, + "language_loss": 0.72277164, + "learning_rate": 8.986734084339253e-09, + "loss": 0.79941171, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.0904541, + "step": 16145, + "time_per_iteration": 2.6755943298339844 + }, + { + "auxiliary_loss_clip": 0.0640352, + "auxiliary_loss_mlp": 0.01263869, + "balance_loss_clip": 0.06269689, + "balance_loss_mlp": 0.01253522, + "epoch": 0.9707500375770329, + "flos": 12271794370560.0, + "grad_norm": 2.6855467217537488, + "language_loss": 0.80483818, + "learning_rate": 8.949892992753395e-09, + "loss": 0.88151205, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10351562, + "step": 16146, + "time_per_iteration": 2.4482696056365967 + }, + { + "auxiliary_loss_clip": 0.063061, + "auxiliary_loss_mlp": 0.01250418, + "balance_loss_clip": 0.06252153, + "balance_loss_mlp": 0.01249364, + "epoch": 0.9708101608297008, + "flos": 60874550271360.0, + "grad_norm": 0.8926605376395859, + "language_loss": 0.546646, + "learning_rate": 8.91312740198713e-09, + "loss": 0.62221122, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01055145, + "step": 16147, + "time_per_iteration": 3.1784896850585938 + }, + { + "auxiliary_loss_clip": 0.06404494, + "auxiliary_loss_mlp": 0.01265443, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01255663, + "epoch": 0.9708702840823689, + "flos": 27131952090240.0, + "grad_norm": 4.377042255553633, + "language_loss": 0.61389154, + "learning_rate": 8.876437313434682e-09, + "loss": 0.69059098, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.09783936, + "step": 16148, + "time_per_iteration": 3.9833836555480957 + }, + { + "auxiliary_loss_clip": 0.06399462, + "auxiliary_loss_mlp": 0.01263798, + "balance_loss_clip": 0.06271495, + "balance_loss_mlp": 0.01254553, + "epoch": 0.9709304073350368, + "flos": 20784067274880.0, + "grad_norm": 1.8003493724827047, + "language_loss": 0.73550653, + "learning_rate": 8.839822728487155e-09, + "loss": 0.81213915, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09246826, + "step": 16149, + "time_per_iteration": 2.518012046813965 + }, + { + "auxiliary_loss_clip": 0.06402236, + "auxiliary_loss_mlp": 0.01263658, + "balance_loss_clip": 0.06271193, + "balance_loss_mlp": 0.01254408, + "epoch": 0.9709905305877048, + "flos": 41943627423360.0, + "grad_norm": 1.959430214101398, + "language_loss": 0.75053811, + "learning_rate": 8.803283648533222e-09, + "loss": 0.82719702, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09246826, + "step": 16150, + "time_per_iteration": 2.6981914043426514 + }, + { + "auxiliary_loss_clip": 0.0641038, + "auxiliary_loss_mlp": 0.01268959, + "balance_loss_clip": 0.06272288, + "balance_loss_mlp": 0.01257408, + "epoch": 0.9710506538403728, + "flos": 17171349361920.0, + "grad_norm": 2.6505663185230803, + "language_loss": 0.73947191, + "learning_rate": 8.766820074958214e-09, + "loss": 0.81626534, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.11547852, + "step": 16151, + "time_per_iteration": 2.4698150157928467 + }, + { + "auxiliary_loss_clip": 0.0639576, + "auxiliary_loss_mlp": 0.01262487, + "balance_loss_clip": 0.06268339, + "balance_loss_mlp": 0.01253153, + "epoch": 0.9711107770930407, + "flos": 21178972368000.0, + "grad_norm": 1.7020963339660558, + "language_loss": 0.74932683, + "learning_rate": 8.730432009145027e-09, + "loss": 0.82590926, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09332275, + "step": 16152, + "time_per_iteration": 2.5061516761779785 + }, + { + "auxiliary_loss_clip": 0.06401396, + "auxiliary_loss_mlp": 0.01263582, + "balance_loss_clip": 0.06271546, + "balance_loss_mlp": 0.0125398, + "epoch": 0.9711709003457087, + "flos": 22243675472640.0, + "grad_norm": 1.8409380245762448, + "language_loss": 0.67063367, + "learning_rate": 8.694119452473448e-09, + "loss": 0.74728346, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0960083, + "step": 16153, + "time_per_iteration": 2.5174050331115723 + }, + { + "auxiliary_loss_clip": 0.06401861, + "auxiliary_loss_mlp": 0.01268174, + "balance_loss_clip": 0.06270944, + "balance_loss_mlp": 0.01259204, + "epoch": 0.9712310235983767, + "flos": 26221096281600.0, + "grad_norm": 1.5163475252585155, + "language_loss": 0.70737278, + "learning_rate": 8.65788240632037e-09, + "loss": 0.78407311, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.08978271, + "step": 16154, + "time_per_iteration": 2.55505633354187 + }, + { + "auxiliary_loss_clip": 0.06405511, + "auxiliary_loss_mlp": 0.01265417, + "balance_loss_clip": 0.06270428, + "balance_loss_mlp": 0.01255082, + "epoch": 0.9712911468510447, + "flos": 20674509661440.0, + "grad_norm": 1.7710831738309059, + "language_loss": 0.81191093, + "learning_rate": 8.621720872059812e-09, + "loss": 0.88862026, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.10327148, + "step": 16155, + "time_per_iteration": 2.4887568950653076 + }, + { + "auxiliary_loss_clip": 0.06409426, + "auxiliary_loss_mlp": 0.01266787, + "balance_loss_clip": 0.06273002, + "balance_loss_mlp": 0.01256958, + "epoch": 0.9713512701037126, + "flos": 13557960616320.0, + "grad_norm": 1.9435807645982621, + "language_loss": 0.67513001, + "learning_rate": 8.58563485106334e-09, + "loss": 0.75189221, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.0982666, + "step": 16156, + "time_per_iteration": 2.4993584156036377 + }, + { + "auxiliary_loss_clip": 0.06404352, + "auxiliary_loss_mlp": 0.01263135, + "balance_loss_clip": 0.06270174, + "balance_loss_mlp": 0.01254081, + "epoch": 0.9714113933563806, + "flos": 25855890261120.0, + "grad_norm": 2.458858040967428, + "language_loss": 0.91195989, + "learning_rate": 8.54962434469919e-09, + "loss": 0.98863471, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.0904541, + "step": 16157, + "time_per_iteration": 2.5206339359283447 + }, + { + "auxiliary_loss_clip": 0.06405168, + "auxiliary_loss_mlp": 0.01261509, + "balance_loss_clip": 0.06270272, + "balance_loss_mlp": 0.01252938, + "epoch": 0.9714715166090485, + "flos": 12746809566720.0, + "grad_norm": 2.148569057457713, + "language_loss": 0.72731894, + "learning_rate": 8.513689354332721e-09, + "loss": 0.80398571, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.08575439, + "step": 16158, + "time_per_iteration": 2.4993045330047607 + }, + { + "auxiliary_loss_clip": 0.06398468, + "auxiliary_loss_mlp": 0.01263592, + "balance_loss_clip": 0.0626895, + "balance_loss_mlp": 0.0125443, + "epoch": 0.9715316398617165, + "flos": 18411423062400.0, + "grad_norm": 2.253671983046757, + "language_loss": 0.6065799, + "learning_rate": 8.477829881326836e-09, + "loss": 0.68320048, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0916748, + "step": 16159, + "time_per_iteration": 2.5027124881744385 + }, + { + "auxiliary_loss_clip": 0.0639558, + "auxiliary_loss_mlp": 0.01264017, + "balance_loss_clip": 0.06269194, + "balance_loss_mlp": 0.01255434, + "epoch": 0.9715917631143844, + "flos": 28921490939520.0, + "grad_norm": 1.595247357103686, + "language_loss": 0.78944242, + "learning_rate": 8.44204592704112e-09, + "loss": 0.86603844, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08575439, + "step": 16160, + "time_per_iteration": 2.5898780822753906 + }, + { + "auxiliary_loss_clip": 0.06308243, + "auxiliary_loss_mlp": 0.01251149, + "balance_loss_clip": 0.06254422, + "balance_loss_mlp": 0.01250153, + "epoch": 0.9716518863670525, + "flos": 65958504900480.0, + "grad_norm": 0.7522955925244894, + "language_loss": 0.54286468, + "learning_rate": 8.406337492832704e-09, + "loss": 0.61845851, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00994873, + "step": 16161, + "time_per_iteration": 3.1553361415863037 + }, + { + "auxiliary_loss_clip": 0.06398444, + "auxiliary_loss_mlp": 0.01263413, + "balance_loss_clip": 0.06270605, + "balance_loss_mlp": 0.01254282, + "epoch": 0.9717120096197204, + "flos": 17718592377600.0, + "grad_norm": 1.69794740323834, + "language_loss": 0.71924436, + "learning_rate": 8.3707045800554e-09, + "loss": 0.79586291, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09130859, + "step": 16162, + "time_per_iteration": 2.529026746749878 + }, + { + "auxiliary_loss_clip": 0.063986, + "auxiliary_loss_mlp": 0.01265674, + "balance_loss_clip": 0.06268875, + "balance_loss_mlp": 0.0125622, + "epoch": 0.9717721328723884, + "flos": 24470522380800.0, + "grad_norm": 1.5641682606376985, + "language_loss": 0.78791863, + "learning_rate": 8.335147190060787e-09, + "loss": 0.86456132, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.09454346, + "step": 16163, + "time_per_iteration": 2.5521621704101562 + }, + { + "auxiliary_loss_clip": 0.06400799, + "auxiliary_loss_mlp": 0.01263838, + "balance_loss_clip": 0.0627103, + "balance_loss_mlp": 0.01254832, + "epoch": 0.9718322561250564, + "flos": 20782641755520.0, + "grad_norm": 1.9434386776023218, + "language_loss": 0.73329967, + "learning_rate": 8.299665324196903e-09, + "loss": 0.80994606, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09008789, + "step": 16164, + "time_per_iteration": 2.5233001708984375 + }, + { + "auxiliary_loss_clip": 0.06404097, + "auxiliary_loss_mlp": 0.0126725, + "balance_loss_clip": 0.0627019, + "balance_loss_mlp": 0.01257225, + "epoch": 0.9718923793777243, + "flos": 19031900146560.0, + "grad_norm": 2.0895359758091194, + "language_loss": 0.84477919, + "learning_rate": 8.264258983809114e-09, + "loss": 0.92149264, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10015869, + "step": 16165, + "time_per_iteration": 2.4885025024414062 + }, + { + "auxiliary_loss_clip": 0.06401068, + "auxiliary_loss_mlp": 0.01261942, + "balance_loss_clip": 0.06270339, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9719525026303923, + "flos": 21878175962880.0, + "grad_norm": 1.4925569897983804, + "language_loss": 0.79246068, + "learning_rate": 8.228928170240345e-09, + "loss": 0.8690908, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.07977295, + "step": 16166, + "time_per_iteration": 2.505911111831665 + }, + { + "auxiliary_loss_clip": 0.0639866, + "auxiliary_loss_mlp": 0.01263924, + "balance_loss_clip": 0.06269057, + "balance_loss_mlp": 0.01254107, + "epoch": 0.9720126258830603, + "flos": 14434631159040.0, + "grad_norm": 1.7340216606889713, + "language_loss": 0.71028543, + "learning_rate": 8.193672884830195e-09, + "loss": 0.78691125, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.0982666, + "step": 16167, + "time_per_iteration": 4.039773941040039 + }, + { + "auxiliary_loss_clip": 0.06401287, + "auxiliary_loss_mlp": 0.01263666, + "balance_loss_clip": 0.06272106, + "balance_loss_mlp": 0.0125432, + "epoch": 0.9720727491357283, + "flos": 26258551856640.0, + "grad_norm": 1.4905836885557386, + "language_loss": 0.76212865, + "learning_rate": 8.158493128915812e-09, + "loss": 0.83877814, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09350586, + "step": 16168, + "time_per_iteration": 2.571298837661743 + }, + { + "auxiliary_loss_clip": 0.06404977, + "auxiliary_loss_mlp": 0.01264172, + "balance_loss_clip": 0.06272201, + "balance_loss_mlp": 0.01254564, + "epoch": 0.9721328723883962, + "flos": 22680648115200.0, + "grad_norm": 2.0966560068036073, + "language_loss": 0.72333491, + "learning_rate": 8.123388903830797e-09, + "loss": 0.80002642, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.0960083, + "step": 16169, + "time_per_iteration": 2.514556646347046 + }, + { + "auxiliary_loss_clip": 0.06403787, + "auxiliary_loss_mlp": 0.01263177, + "balance_loss_clip": 0.06268648, + "balance_loss_mlp": 0.01253354, + "epoch": 0.9721929956410642, + "flos": 28081647066240.0, + "grad_norm": 1.657160830557666, + "language_loss": 0.57263756, + "learning_rate": 8.088360210906309e-09, + "loss": 0.64930725, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.09820557, + "step": 16170, + "time_per_iteration": 2.5566329956054688 + }, + { + "auxiliary_loss_clip": 0.06402764, + "auxiliary_loss_mlp": 0.01265099, + "balance_loss_clip": 0.06270877, + "balance_loss_mlp": 0.01255258, + "epoch": 0.9722531188937321, + "flos": 21002595523200.0, + "grad_norm": 1.5645513876953863, + "language_loss": 0.71513534, + "learning_rate": 8.053407051471062e-09, + "loss": 0.79181397, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09832764, + "step": 16171, + "time_per_iteration": 2.521963119506836 + }, + { + "auxiliary_loss_clip": 0.06400986, + "auxiliary_loss_mlp": 0.01265808, + "balance_loss_clip": 0.06269605, + "balance_loss_mlp": 0.01256069, + "epoch": 0.9723132421464001, + "flos": 16076108643840.0, + "grad_norm": 1.684444185792019, + "language_loss": 0.68665528, + "learning_rate": 8.018529426850218e-09, + "loss": 0.76332319, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09747314, + "step": 16172, + "time_per_iteration": 2.4726855754852295 + }, + { + "auxiliary_loss_clip": 0.06400435, + "auxiliary_loss_mlp": 0.01263752, + "balance_loss_clip": 0.06272088, + "balance_loss_mlp": 0.01255044, + "epoch": 0.972373365399068, + "flos": 27753183861120.0, + "grad_norm": 1.7449556340792685, + "language_loss": 0.86100602, + "learning_rate": 7.983727338366274e-09, + "loss": 0.93764794, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08703613, + "step": 16173, + "time_per_iteration": 2.5892083644866943 + }, + { + "auxiliary_loss_clip": 0.06409517, + "auxiliary_loss_mlp": 0.01266374, + "balance_loss_clip": 0.06271982, + "balance_loss_mlp": 0.0125527, + "epoch": 0.9724334886517361, + "flos": 23009614444800.0, + "grad_norm": 1.7640837556867108, + "language_loss": 0.64575619, + "learning_rate": 7.949000787339289e-09, + "loss": 0.72251511, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.11090088, + "step": 16174, + "time_per_iteration": 3.989103317260742 + }, + { + "auxiliary_loss_clip": 0.06399212, + "auxiliary_loss_mlp": 0.01266929, + "balance_loss_clip": 0.06270289, + "balance_loss_mlp": 0.01258275, + "epoch": 0.972493611904404, + "flos": 25454067206400.0, + "grad_norm": 1.712366988133228, + "language_loss": 0.78392601, + "learning_rate": 7.914349775085538e-09, + "loss": 0.86058748, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08654785, + "step": 16175, + "time_per_iteration": 2.548630475997925 + }, + { + "auxiliary_loss_clip": 0.06401244, + "auxiliary_loss_mlp": 0.01266243, + "balance_loss_clip": 0.06271894, + "balance_loss_mlp": 0.01256337, + "epoch": 0.972553735157072, + "flos": 16988767315200.0, + "grad_norm": 2.017456752421388, + "language_loss": 0.57784498, + "learning_rate": 7.879774302919307e-09, + "loss": 0.65451986, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.09906006, + "step": 16176, + "time_per_iteration": 2.4894320964813232 + }, + { + "auxiliary_loss_clip": 0.06400141, + "auxiliary_loss_mlp": 0.01263307, + "balance_loss_clip": 0.06271263, + "balance_loss_mlp": 0.01254569, + "epoch": 0.97261385840974, + "flos": 26111916011520.0, + "grad_norm": 2.620974908086474, + "language_loss": 0.72649771, + "learning_rate": 7.845274372151545e-09, + "loss": 0.80313218, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08740234, + "step": 16177, + "time_per_iteration": 2.536285400390625 + }, + { + "auxiliary_loss_clip": 0.06406036, + "auxiliary_loss_mlp": 0.01265412, + "balance_loss_clip": 0.06271951, + "balance_loss_mlp": 0.0125618, + "epoch": 0.9726739816624079, + "flos": 25455031528320.0, + "grad_norm": 1.6608985876914684, + "language_loss": 0.68600643, + "learning_rate": 7.810849984090984e-09, + "loss": 0.76272094, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09228516, + "step": 16178, + "time_per_iteration": 2.54495906829834 + }, + { + "auxiliary_loss_clip": 0.06405666, + "auxiliary_loss_mlp": 0.01264633, + "balance_loss_clip": 0.06270958, + "balance_loss_mlp": 0.01254405, + "epoch": 0.972734104915076, + "flos": 29021237625600.0, + "grad_norm": 2.148587612037516, + "language_loss": 0.6748485, + "learning_rate": 7.776501140042358e-09, + "loss": 0.75155145, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10229492, + "step": 16179, + "time_per_iteration": 2.5600404739379883 + }, + { + "auxiliary_loss_clip": 0.06396864, + "auxiliary_loss_mlp": 0.0126201, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01253212, + "epoch": 0.9727942281677439, + "flos": 23443861829760.0, + "grad_norm": 1.8043958106995313, + "language_loss": 0.77263665, + "learning_rate": 7.742227841308624e-09, + "loss": 0.8492254, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.0880127, + "step": 16180, + "time_per_iteration": 2.521084785461426 + }, + { + "auxiliary_loss_clip": 0.06407119, + "auxiliary_loss_mlp": 0.0126681, + "balance_loss_clip": 0.06269898, + "balance_loss_mlp": 0.01256618, + "epoch": 0.9728543514204119, + "flos": 31732994511360.0, + "grad_norm": 1.4950380620703005, + "language_loss": 0.76710343, + "learning_rate": 7.708030089189188e-09, + "loss": 0.84384269, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.10180664, + "step": 16181, + "time_per_iteration": 2.5928866863250732 + }, + { + "auxiliary_loss_clip": 0.06401683, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01254003, + "epoch": 0.9729144746730798, + "flos": 16294888454400.0, + "grad_norm": 1.3587136174189807, + "language_loss": 0.6363312, + "learning_rate": 7.67390788498079e-09, + "loss": 0.71297884, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.09075928, + "step": 16182, + "time_per_iteration": 3.9371418952941895 + }, + { + "auxiliary_loss_clip": 0.06401983, + "auxiliary_loss_mlp": 0.01265037, + "balance_loss_clip": 0.06269817, + "balance_loss_mlp": 0.01255512, + "epoch": 0.9729745979257478, + "flos": 25047632177280.0, + "grad_norm": 1.6902434550844887, + "language_loss": 0.62347919, + "learning_rate": 7.639861229977507e-09, + "loss": 0.70014942, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09527588, + "step": 16183, + "time_per_iteration": 2.5505123138427734 + }, + { + "auxiliary_loss_clip": 0.06394369, + "auxiliary_loss_mlp": 0.01265951, + "balance_loss_clip": 0.06267164, + "balance_loss_mlp": 0.01256623, + "epoch": 0.9730347211784157, + "flos": 22645456600320.0, + "grad_norm": 2.073017408654554, + "language_loss": 0.77957594, + "learning_rate": 7.605890125470527e-09, + "loss": 0.85617918, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.09326172, + "step": 16184, + "time_per_iteration": 2.5804505348205566 + }, + { + "auxiliary_loss_clip": 0.06400636, + "auxiliary_loss_mlp": 0.01264673, + "balance_loss_clip": 0.06270216, + "balance_loss_mlp": 0.01255625, + "epoch": 0.9730948444310837, + "flos": 11003195554560.0, + "grad_norm": 2.1007472833639764, + "language_loss": 0.79576832, + "learning_rate": 7.571994572747709e-09, + "loss": 0.87242138, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09051514, + "step": 16185, + "time_per_iteration": 2.4700310230255127 + }, + { + "auxiliary_loss_clip": 0.06404022, + "auxiliary_loss_mlp": 0.0126446, + "balance_loss_clip": 0.06270284, + "balance_loss_mlp": 0.01255167, + "epoch": 0.9731549676837516, + "flos": 16804969384320.0, + "grad_norm": 1.7281880541829828, + "language_loss": 0.77737701, + "learning_rate": 7.538174573094469e-09, + "loss": 0.85406184, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09289551, + "step": 16186, + "time_per_iteration": 2.495136022567749 + }, + { + "auxiliary_loss_clip": 0.06399482, + "auxiliary_loss_mlp": 0.01261887, + "balance_loss_clip": 0.06269419, + "balance_loss_mlp": 0.01252344, + "epoch": 0.9732150909364197, + "flos": 21148057411200.0, + "grad_norm": 1.5535957867301606, + "language_loss": 0.65284431, + "learning_rate": 7.504430127793337e-09, + "loss": 0.72945803, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09539795, + "step": 16187, + "time_per_iteration": 2.518338680267334 + }, + { + "auxiliary_loss_clip": 0.06399734, + "auxiliary_loss_mlp": 0.01264685, + "balance_loss_clip": 0.06269566, + "balance_loss_mlp": 0.01255297, + "epoch": 0.9732752141890876, + "flos": 33735401458560.0, + "grad_norm": 1.82910578171191, + "language_loss": 0.80486286, + "learning_rate": 7.47076123812418e-09, + "loss": 0.88150704, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09387207, + "step": 16188, + "time_per_iteration": 4.078651666641235 + }, + { + "auxiliary_loss_clip": 0.0639957, + "auxiliary_loss_mlp": 0.01265825, + "balance_loss_clip": 0.06272013, + "balance_loss_mlp": 0.01257331, + "epoch": 0.9733353374417556, + "flos": 23411144010240.0, + "grad_norm": 1.9709286631587892, + "language_loss": 0.79032779, + "learning_rate": 7.437167905363084e-09, + "loss": 0.86698174, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.0848999, + "step": 16189, + "time_per_iteration": 2.5257105827331543 + }, + { + "auxiliary_loss_clip": 0.06399654, + "auxiliary_loss_mlp": 0.01264485, + "balance_loss_clip": 0.06269268, + "balance_loss_mlp": 0.01254859, + "epoch": 0.9733954606944236, + "flos": 39175113795840.0, + "grad_norm": 1.7501353346003765, + "language_loss": 0.5154829, + "learning_rate": 7.403650130784367e-09, + "loss": 0.59212422, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09619141, + "step": 16190, + "time_per_iteration": 2.6552765369415283 + }, + { + "auxiliary_loss_clip": 0.06401493, + "auxiliary_loss_mlp": 0.0126365, + "balance_loss_clip": 0.06270113, + "balance_loss_mlp": 0.01254399, + "epoch": 0.9734555839470915, + "flos": 21988404408960.0, + "grad_norm": 1.6917761337688713, + "language_loss": 0.80587709, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.88252854, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09240723, + "step": 16191, + "time_per_iteration": 2.544218063354492 + }, + { + "auxiliary_loss_clip": 0.06401005, + "auxiliary_loss_mlp": 0.01263985, + "balance_loss_clip": 0.06270884, + "balance_loss_mlp": 0.01255462, + "epoch": 0.9735157071997596, + "flos": 16580152080000.0, + "grad_norm": 1.6445033626278693, + "language_loss": 0.8313902, + "learning_rate": 7.336841261255111e-09, + "loss": 0.90804017, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.08526611, + "step": 16192, + "time_per_iteration": 2.4879636764526367 + }, + { + "auxiliary_loss_clip": 0.0640302, + "auxiliary_loss_mlp": 0.01265061, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01255596, + "epoch": 0.9735758304524275, + "flos": 20228313070080.0, + "grad_norm": 1.7244487674572468, + "language_loss": 0.75065506, + "learning_rate": 7.303550168837658e-09, + "loss": 0.82733583, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09472656, + "step": 16193, + "time_per_iteration": 2.4931979179382324 + }, + { + "auxiliary_loss_clip": 0.06397454, + "auxiliary_loss_mlp": 0.01262104, + "balance_loss_clip": 0.06270149, + "balance_loss_mlp": 0.01253688, + "epoch": 0.9736359537050955, + "flos": 23659077841920.0, + "grad_norm": 1.629712416735138, + "language_loss": 0.85322011, + "learning_rate": 7.270334639669417e-09, + "loss": 0.92981565, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.08416748, + "step": 16194, + "time_per_iteration": 2.505967140197754 + }, + { + "auxiliary_loss_clip": 0.06396167, + "auxiliary_loss_mlp": 0.01264562, + "balance_loss_clip": 0.06270817, + "balance_loss_mlp": 0.01255919, + "epoch": 0.9736960769577634, + "flos": 15565692297600.0, + "grad_norm": 1.4618204477527796, + "language_loss": 0.76054919, + "learning_rate": 7.237194675009828e-09, + "loss": 0.83715641, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08648682, + "step": 16195, + "time_per_iteration": 2.4902737140655518 + }, + { + "auxiliary_loss_clip": 0.0630816, + "auxiliary_loss_mlp": 0.01249959, + "balance_loss_clip": 0.06254224, + "balance_loss_mlp": 0.01248933, + "epoch": 0.9737562002104314, + "flos": 65369781313920.0, + "grad_norm": 0.7068967034804419, + "language_loss": 0.52516842, + "learning_rate": 7.204130276115439e-09, + "loss": 0.60074961, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01026154, + "step": 16196, + "time_per_iteration": 3.0891356468200684 + }, + { + "auxiliary_loss_clip": 0.06402862, + "auxiliary_loss_mlp": 0.01264517, + "balance_loss_clip": 0.06270996, + "balance_loss_mlp": 0.01255028, + "epoch": 0.9738163234630993, + "flos": 27203760639360.0, + "grad_norm": 1.5067079067640303, + "language_loss": 0.76304662, + "learning_rate": 7.171141444240136e-09, + "loss": 0.83972049, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09484863, + "step": 16197, + "time_per_iteration": 2.5539703369140625 + }, + { + "auxiliary_loss_clip": 0.06407809, + "auxiliary_loss_mlp": 0.01266448, + "balance_loss_clip": 0.06270401, + "balance_loss_mlp": 0.01256124, + "epoch": 0.9738764467157673, + "flos": 21075745737600.0, + "grad_norm": 1.7086384340605625, + "language_loss": 0.67975712, + "learning_rate": 7.13822818063492e-09, + "loss": 0.75649977, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.10327148, + "step": 16198, + "time_per_iteration": 2.503563165664673 + }, + { + "auxiliary_loss_clip": 0.06400761, + "auxiliary_loss_mlp": 0.01264048, + "balance_loss_clip": 0.06268206, + "balance_loss_mlp": 0.01254678, + "epoch": 0.9739365699684353, + "flos": 21367633835520.0, + "grad_norm": 1.6722273103700527, + "language_loss": 0.77999789, + "learning_rate": 7.10539048654768e-09, + "loss": 0.85664594, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09362793, + "step": 16199, + "time_per_iteration": 2.5150656700134277 + }, + { + "auxiliary_loss_clip": 0.06402802, + "auxiliary_loss_mlp": 0.01264046, + "balance_loss_clip": 0.06271003, + "balance_loss_mlp": 0.0125411, + "epoch": 0.9739966932211033, + "flos": 21907497692160.0, + "grad_norm": 1.5607608988910977, + "language_loss": 0.79645491, + "learning_rate": 7.072628363223865e-09, + "loss": 0.87312341, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09936523, + "step": 16200, + "time_per_iteration": 2.5212936401367188 + }, + { + "auxiliary_loss_clip": 0.06407085, + "auxiliary_loss_mlp": 0.01263577, + "balance_loss_clip": 0.06268042, + "balance_loss_mlp": 0.01253474, + "epoch": 0.9740568164737712, + "flos": 24834344808960.0, + "grad_norm": 2.2264646235457937, + "language_loss": 0.69207859, + "learning_rate": 7.039941811905592e-09, + "loss": 0.76878524, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.10113525, + "step": 16201, + "time_per_iteration": 2.5361874103546143 + }, + { + "auxiliary_loss_clip": 0.06404103, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06272092, + "balance_loss_mlp": 0.01256105, + "epoch": 0.9741169397264392, + "flos": 23630426945280.0, + "grad_norm": 1.5091663740328265, + "language_loss": 0.72960538, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.80629796, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09051514, + "step": 16202, + "time_per_iteration": 2.53006649017334 + }, + { + "auxiliary_loss_clip": 0.064046, + "auxiliary_loss_mlp": 0.01264665, + "balance_loss_clip": 0.06270882, + "balance_loss_mlp": 0.01255236, + "epoch": 0.9741770629791072, + "flos": 18846718623360.0, + "grad_norm": 1.822554423323346, + "language_loss": 0.72919339, + "learning_rate": 6.974795430241265e-09, + "loss": 0.80588603, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09423828, + "step": 16203, + "time_per_iteration": 2.504948616027832 + }, + { + "auxiliary_loss_clip": 0.06402065, + "auxiliary_loss_mlp": 0.01262649, + "balance_loss_clip": 0.06270267, + "balance_loss_mlp": 0.0125347, + "epoch": 0.9742371862317751, + "flos": 22352813815680.0, + "grad_norm": 2.4570819002926303, + "language_loss": 0.77505815, + "learning_rate": 6.942335602365235e-09, + "loss": 0.85170531, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09179688, + "step": 16204, + "time_per_iteration": 2.499577760696411 + }, + { + "auxiliary_loss_clip": 0.06406648, + "auxiliary_loss_mlp": 0.01266659, + "balance_loss_clip": 0.06274957, + "balance_loss_mlp": 0.01257093, + "epoch": 0.9742973094844432, + "flos": 21769289182080.0, + "grad_norm": 1.965411642233907, + "language_loss": 0.79419672, + "learning_rate": 6.909951351435905e-09, + "loss": 0.87092984, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09564209, + "step": 16205, + "time_per_iteration": 2.4995784759521484 + }, + { + "auxiliary_loss_clip": 0.06399336, + "auxiliary_loss_mlp": 0.01265129, + "balance_loss_clip": 0.06269155, + "balance_loss_mlp": 0.01256147, + "epoch": 0.9743574327371111, + "flos": 26255700817920.0, + "grad_norm": 1.508831100662547, + "language_loss": 0.7445184, + "learning_rate": 6.87764267868074e-09, + "loss": 0.82116306, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.08984375, + "step": 16206, + "time_per_iteration": 4.032231092453003 + }, + { + "auxiliary_loss_clip": 0.06402233, + "auxiliary_loss_mlp": 0.01262179, + "balance_loss_clip": 0.06268986, + "balance_loss_mlp": 0.01252487, + "epoch": 0.9744175559897791, + "flos": 12354252387840.0, + "grad_norm": 2.280007782311689, + "language_loss": 0.84424287, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.92088699, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09686279, + "step": 16207, + "time_per_iteration": 2.6139605045318604 + }, + { + "auxiliary_loss_clip": 0.06399205, + "auxiliary_loss_mlp": 0.01262873, + "balance_loss_clip": 0.06271303, + "balance_loss_mlp": 0.01254231, + "epoch": 0.974477679242447, + "flos": 28404575902080.0, + "grad_norm": 1.4963528987347634, + "language_loss": 0.71026999, + "learning_rate": 6.813252072591425e-09, + "loss": 0.78689075, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.08648682, + "step": 16208, + "time_per_iteration": 2.599848747253418 + }, + { + "auxiliary_loss_clip": 0.06394268, + "auxiliary_loss_mlp": 0.01262607, + "balance_loss_clip": 0.06270576, + "balance_loss_mlp": 0.0125409, + "epoch": 0.974537802495115, + "flos": 17791155613440.0, + "grad_norm": 1.6815172078173168, + "language_loss": 0.77535599, + "learning_rate": 6.781170141698878e-09, + "loss": 0.85192478, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.08526611, + "step": 16209, + "time_per_iteration": 2.4785659313201904 + }, + { + "auxiliary_loss_clip": 0.06402382, + "auxiliary_loss_mlp": 0.01263455, + "balance_loss_clip": 0.06268477, + "balance_loss_mlp": 0.0125365, + "epoch": 0.9745979257477829, + "flos": 23849164828800.0, + "grad_norm": 1.5681531369172674, + "language_loss": 0.79805732, + "learning_rate": 6.749163793864144e-09, + "loss": 0.87471569, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.09802246, + "step": 16210, + "time_per_iteration": 2.525526285171509 + }, + { + "auxiliary_loss_clip": 0.06400919, + "auxiliary_loss_mlp": 0.01262256, + "balance_loss_clip": 0.06269119, + "balance_loss_mlp": 0.0125294, + "epoch": 0.9746580490004509, + "flos": 27023484579840.0, + "grad_norm": 2.075547249109443, + "language_loss": 0.78150928, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.85814106, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09307861, + "step": 16211, + "time_per_iteration": 2.5355217456817627 + }, + { + "auxiliary_loss_clip": 0.06411395, + "auxiliary_loss_mlp": 0.01265327, + "balance_loss_clip": 0.06274585, + "balance_loss_mlp": 0.01255027, + "epoch": 0.9747181722531189, + "flos": 19798132608000.0, + "grad_norm": 1.9008085045696146, + "language_loss": 0.7795732, + "learning_rate": 6.685377852219787e-09, + "loss": 0.85634041, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.10296631, + "step": 16212, + "time_per_iteration": 2.506300926208496 + }, + { + "auxiliary_loss_clip": 0.06398016, + "auxiliary_loss_mlp": 0.0126477, + "balance_loss_clip": 0.06269819, + "balance_loss_mlp": 0.01256008, + "epoch": 0.9747782955057869, + "flos": 31438590791040.0, + "grad_norm": 1.3851280595823252, + "language_loss": 0.80251986, + "learning_rate": 6.653598260829118e-09, + "loss": 0.87914777, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.08764648, + "step": 16213, + "time_per_iteration": 2.5735127925872803 + }, + { + "auxiliary_loss_clip": 0.0640009, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 0.06269902, + "balance_loss_mlp": 0.01254558, + "epoch": 0.9748384187584548, + "flos": 15966802592640.0, + "grad_norm": 1.8081777723616046, + "language_loss": 0.66367626, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.7403146, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09185791, + "step": 16214, + "time_per_iteration": 4.007796049118042 + }, + { + "auxiliary_loss_clip": 0.06407678, + "auxiliary_loss_mlp": 0.01270943, + "balance_loss_clip": 0.06271762, + "balance_loss_mlp": 0.01261251, + "epoch": 0.9748985420111228, + "flos": 20565035902080.0, + "grad_norm": 3.5974058234157056, + "language_loss": 0.74614125, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.82292747, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09692383, + "step": 16215, + "time_per_iteration": 2.621452808380127 + }, + { + "auxiliary_loss_clip": 0.06399758, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06270334, + "balance_loss_mlp": 0.01254696, + "epoch": 0.9749586652637908, + "flos": 36730577180160.0, + "grad_norm": 1.6258391416497984, + "language_loss": 0.66849625, + "learning_rate": 6.558713018834483e-09, + "loss": 0.74513459, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09381104, + "step": 16216, + "time_per_iteration": 2.61350417137146 + }, + { + "auxiliary_loss_clip": 0.06405714, + "auxiliary_loss_mlp": 0.01264792, + "balance_loss_clip": 0.062713, + "balance_loss_mlp": 0.01255393, + "epoch": 0.9750187885164587, + "flos": 11003908314240.0, + "grad_norm": 1.786638757254164, + "language_loss": 0.72343373, + "learning_rate": 6.527235786226937e-09, + "loss": 0.80013883, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09399414, + "step": 16217, + "time_per_iteration": 2.466787576675415 + }, + { + "auxiliary_loss_clip": 0.06400132, + "auxiliary_loss_mlp": 0.01262208, + "balance_loss_clip": 0.06270897, + "balance_loss_mlp": 0.01253667, + "epoch": 0.9750789117691268, + "flos": 25746668064000.0, + "grad_norm": 1.4800942983039718, + "language_loss": 0.78881538, + "learning_rate": 6.495834146306167e-09, + "loss": 0.86543876, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08532715, + "step": 16218, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.06398283, + "auxiliary_loss_mlp": 0.01261833, + "balance_loss_clip": 0.06271155, + "balance_loss_mlp": 0.01252458, + "epoch": 0.9751390350217947, + "flos": 13338971170560.0, + "grad_norm": 1.8880651410649392, + "language_loss": 0.77665508, + "learning_rate": 6.464508100263222e-09, + "loss": 0.85325623, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.09362793, + "step": 16219, + "time_per_iteration": 2.511852741241455 + }, + { + "auxiliary_loss_clip": 0.06405136, + "auxiliary_loss_mlp": 0.01262829, + "balance_loss_clip": 0.06272408, + "balance_loss_mlp": 0.0125393, + "epoch": 0.9751991582744627, + "flos": 22827283960320.0, + "grad_norm": 1.5654377531659194, + "language_loss": 0.81504959, + "learning_rate": 6.433257649285817e-09, + "loss": 0.89172924, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08905029, + "step": 16220, + "time_per_iteration": 2.478729009628296 + }, + { + "auxiliary_loss_clip": 0.06398819, + "auxiliary_loss_mlp": 0.01262589, + "balance_loss_clip": 0.06270699, + "balance_loss_mlp": 0.01253696, + "epoch": 0.9752592815271306, + "flos": 19652293376640.0, + "grad_norm": 1.7313417854694155, + "language_loss": 0.75431448, + "learning_rate": 6.402082794559227e-09, + "loss": 0.83092856, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.08892822, + "step": 16221, + "time_per_iteration": 2.4944918155670166 + }, + { + "auxiliary_loss_clip": 0.06398918, + "auxiliary_loss_mlp": 0.01265498, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.0125623, + "epoch": 0.9753194047797986, + "flos": 26698165902720.0, + "grad_norm": 1.457397211257543, + "language_loss": 0.66733098, + "learning_rate": 6.370983537265395e-09, + "loss": 0.74397516, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09265137, + "step": 16222, + "time_per_iteration": 3.96037220954895 + }, + { + "auxiliary_loss_clip": 0.06399057, + "auxiliary_loss_mlp": 0.01263788, + "balance_loss_clip": 0.06270253, + "balance_loss_mlp": 0.01254787, + "epoch": 0.9753795280324665, + "flos": 23228478109440.0, + "grad_norm": 1.713022931639831, + "language_loss": 0.88554835, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.9621768, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08996582, + "step": 16223, + "time_per_iteration": 2.514981269836426 + }, + { + "auxiliary_loss_clip": 0.06396091, + "auxiliary_loss_mlp": 0.01265957, + "balance_loss_clip": 0.06269166, + "balance_loss_mlp": 0.01257177, + "epoch": 0.9754396512851345, + "flos": 19469920965120.0, + "grad_norm": 1.6965637319333444, + "language_loss": 0.74798816, + "learning_rate": 6.309011819690457e-09, + "loss": 0.82460868, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08764648, + "step": 16224, + "time_per_iteration": 2.4790241718292236 + }, + { + "auxiliary_loss_clip": 0.06309325, + "auxiliary_loss_mlp": 0.01249123, + "balance_loss_clip": 0.06255152, + "balance_loss_mlp": 0.012481, + "epoch": 0.9754997745378025, + "flos": 68478875061120.0, + "grad_norm": 0.7927113550551911, + "language_loss": 0.59015584, + "learning_rate": 6.278139361759249e-09, + "loss": 0.66574037, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01023102, + "step": 16225, + "time_per_iteration": 3.09687876701355 + }, + { + "auxiliary_loss_clip": 0.06404333, + "auxiliary_loss_mlp": 0.01270773, + "balance_loss_clip": 0.06274021, + "balance_loss_mlp": 0.01261505, + "epoch": 0.9755598977904705, + "flos": 26402252808960.0, + "grad_norm": 1.669263937257646, + "language_loss": 0.68925965, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7660107, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09265137, + "step": 16226, + "time_per_iteration": 2.5773234367370605 + }, + { + "auxiliary_loss_clip": 0.06400628, + "auxiliary_loss_mlp": 0.01261945, + "balance_loss_clip": 0.0626903, + "balance_loss_mlp": 0.01252522, + "epoch": 0.9756200210431384, + "flos": 16623225878400.0, + "grad_norm": 1.6660576711306636, + "language_loss": 0.83624262, + "learning_rate": 6.216621253462894e-09, + "loss": 0.91286826, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09417725, + "step": 16227, + "time_per_iteration": 2.486311435699463 + }, + { + "auxiliary_loss_clip": 0.06398968, + "auxiliary_loss_mlp": 0.01264262, + "balance_loss_clip": 0.06270081, + "balance_loss_mlp": 0.01255321, + "epoch": 0.9756801442958064, + "flos": 23629798039680.0, + "grad_norm": 1.652694974526233, + "language_loss": 0.77968043, + "learning_rate": 6.185975605430549e-09, + "loss": 0.85631275, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0894165, + "step": 16228, + "time_per_iteration": 3.98093843460083 + }, + { + "auxiliary_loss_clip": 0.06308308, + "auxiliary_loss_mlp": 0.01248433, + "balance_loss_clip": 0.06254362, + "balance_loss_mlp": 0.01247415, + "epoch": 0.9757402675484744, + "flos": 61642432615680.0, + "grad_norm": 0.8144485911431966, + "language_loss": 0.55775505, + "learning_rate": 6.155405563025962e-09, + "loss": 0.63332248, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01017761, + "step": 16229, + "time_per_iteration": 3.1203420162200928 + }, + { + "auxiliary_loss_clip": 0.06401952, + "auxiliary_loss_mlp": 0.01267662, + "balance_loss_clip": 0.06270453, + "balance_loss_mlp": 0.01258298, + "epoch": 0.9758003908011423, + "flos": 24065470944000.0, + "grad_norm": 1.894418364311992, + "language_loss": 0.7524991, + "learning_rate": 6.124911127407984e-09, + "loss": 0.82919526, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09362793, + "step": 16230, + "time_per_iteration": 2.5575931072235107 + }, + { + "auxiliary_loss_clip": 0.06396811, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 0.06271554, + "balance_loss_mlp": 0.01255773, + "epoch": 0.9758605140538104, + "flos": 17498764391040.0, + "grad_norm": 1.8422769218162587, + "language_loss": 0.71889436, + "learning_rate": 6.094492299733245e-09, + "loss": 0.79550505, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.08483887, + "step": 16231, + "time_per_iteration": 2.5055992603302 + }, + { + "auxiliary_loss_clip": 0.06407274, + "auxiliary_loss_mlp": 0.01266757, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.01256463, + "epoch": 0.9759206373064783, + "flos": 24833883611520.0, + "grad_norm": 1.7197145025092386, + "language_loss": 0.76920104, + "learning_rate": 6.064149081155267e-09, + "loss": 0.84594142, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10296631, + "step": 16232, + "time_per_iteration": 2.5294857025146484 + }, + { + "auxiliary_loss_clip": 0.0630935, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_clip": 0.06255519, + "balance_loss_mlp": 0.01248793, + "epoch": 0.9759807605591463, + "flos": 68179649731200.0, + "grad_norm": 0.719631552631875, + "language_loss": 0.53778744, + "learning_rate": 6.033881472824465e-09, + "loss": 0.61337841, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00958252, + "step": 16233, + "time_per_iteration": 3.017638683319092 + }, + { + "auxiliary_loss_clip": 0.0640213, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 0.06271942, + "balance_loss_mlp": 0.01256853, + "epoch": 0.9760408838118142, + "flos": 18995199258240.0, + "grad_norm": 1.757221153024699, + "language_loss": 0.71420014, + "learning_rate": 6.003689475888807e-09, + "loss": 0.79088712, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.0970459, + "step": 16234, + "time_per_iteration": 2.493136167526245 + }, + { + "auxiliary_loss_clip": 0.06408353, + "auxiliary_loss_mlp": 0.01266546, + "balance_loss_clip": 0.06272238, + "balance_loss_mlp": 0.01257104, + "epoch": 0.9761010070644822, + "flos": 17131210456320.0, + "grad_norm": 2.3283739707112354, + "language_loss": 0.79285693, + "learning_rate": 5.973573091493156e-09, + "loss": 0.86960596, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09448242, + "step": 16235, + "time_per_iteration": 2.48677921295166 + }, + { + "auxiliary_loss_clip": 0.06400178, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 0.06271134, + "balance_loss_mlp": 0.01255166, + "epoch": 0.9761611303171501, + "flos": 22058829365760.0, + "grad_norm": 1.763069638375242, + "language_loss": 0.77298689, + "learning_rate": 5.943532320779265e-09, + "loss": 0.84964001, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09954834, + "step": 16236, + "time_per_iteration": 2.5670228004455566 + }, + { + "auxiliary_loss_clip": 0.06401862, + "auxiliary_loss_mlp": 0.01265343, + "balance_loss_clip": 0.06270871, + "balance_loss_mlp": 0.01256682, + "epoch": 0.9762212535698181, + "flos": 21763167834240.0, + "grad_norm": 1.9679872991470369, + "language_loss": 0.75770509, + "learning_rate": 5.913567164886446e-09, + "loss": 0.83437711, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.08654785, + "step": 16237, + "time_per_iteration": 2.51847505569458 + }, + { + "auxiliary_loss_clip": 0.06401821, + "auxiliary_loss_mlp": 0.01266592, + "balance_loss_clip": 0.06269572, + "balance_loss_mlp": 0.01255786, + "epoch": 0.9762813768224861, + "flos": 25928746986240.0, + "grad_norm": 1.5570589919233344, + "language_loss": 0.73076248, + "learning_rate": 5.8836776249509e-09, + "loss": 0.8074466, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.10803223, + "step": 16238, + "time_per_iteration": 2.527402877807617 + }, + { + "auxiliary_loss_clip": 0.06403423, + "auxiliary_loss_mlp": 0.01265456, + "balance_loss_clip": 0.06271146, + "balance_loss_mlp": 0.01256283, + "epoch": 0.9763415000751541, + "flos": 24057169390080.0, + "grad_norm": 2.218643213238991, + "language_loss": 0.84103715, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.91772586, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09173584, + "step": 16239, + "time_per_iteration": 2.5126121044158936 + }, + { + "auxiliary_loss_clip": 0.06400665, + "auxiliary_loss_mlp": 0.01266419, + "balance_loss_clip": 0.06270189, + "balance_loss_mlp": 0.01257031, + "epoch": 0.976401623327822, + "flos": 17024252319360.0, + "grad_norm": 2.8876025020508265, + "language_loss": 0.60672832, + "learning_rate": 5.824125397483115e-09, + "loss": 0.6833992, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09387207, + "step": 16240, + "time_per_iteration": 2.463484287261963 + }, + { + "auxiliary_loss_clip": 0.06397688, + "auxiliary_loss_mlp": 0.01265751, + "balance_loss_clip": 0.06269402, + "balance_loss_mlp": 0.01256286, + "epoch": 0.97646174658049, + "flos": 16112432188800.0, + "grad_norm": 2.071519660187613, + "language_loss": 0.82556367, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.90219802, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09460449, + "step": 16241, + "time_per_iteration": 2.5261969566345215 + }, + { + "auxiliary_loss_clip": 0.06403396, + "auxiliary_loss_mlp": 0.01264797, + "balance_loss_clip": 0.06272305, + "balance_loss_mlp": 0.01255433, + "epoch": 0.9765218698331579, + "flos": 21259292106240.0, + "grad_norm": 1.6838503485732548, + "language_loss": 0.83407527, + "learning_rate": 5.764875647408463e-09, + "loss": 0.91075718, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09362793, + "step": 16242, + "time_per_iteration": 2.504279136657715 + }, + { + "auxiliary_loss_clip": 0.06404735, + "auxiliary_loss_mlp": 0.0126526, + "balance_loss_clip": 0.06273401, + "balance_loss_mlp": 0.01255939, + "epoch": 0.9765819930858259, + "flos": 18593963182080.0, + "grad_norm": 1.7428652843510748, + "language_loss": 0.75944352, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.83614349, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09320068, + "step": 16243, + "time_per_iteration": 2.5364439487457275 + }, + { + "auxiliary_loss_clip": 0.0640022, + "auxiliary_loss_mlp": 0.01267394, + "balance_loss_clip": 0.06269416, + "balance_loss_mlp": 0.01257899, + "epoch": 0.976642116338494, + "flos": 20273105877120.0, + "grad_norm": 1.6450832165857223, + "language_loss": 0.6998055, + "learning_rate": 5.705928383713754e-09, + "loss": 0.77648169, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09503174, + "step": 16244, + "time_per_iteration": 2.503443717956543 + }, + { + "auxiliary_loss_clip": 0.06406413, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 0.06273812, + "balance_loss_mlp": 0.01259357, + "epoch": 0.9767022395911619, + "flos": 25556497223040.0, + "grad_norm": 1.7598332350638926, + "language_loss": 0.84103727, + "learning_rate": 5.676568187055197e-09, + "loss": 0.91779447, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09942627, + "step": 16245, + "time_per_iteration": 2.5492780208587646 + }, + { + "auxiliary_loss_clip": 0.06397044, + "auxiliary_loss_mlp": 0.01262033, + "balance_loss_clip": 0.06267294, + "balance_loss_mlp": 0.01252812, + "epoch": 0.9767623628438299, + "flos": 21769163400960.0, + "grad_norm": 1.4065715679155657, + "language_loss": 0.78878963, + "learning_rate": 5.647283615340726e-09, + "loss": 0.86538041, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.09222412, + "step": 16246, + "time_per_iteration": 4.017332315444946 + }, + { + "auxiliary_loss_clip": 0.06389856, + "auxiliary_loss_mlp": 0.01259694, + "balance_loss_clip": 0.06268258, + "balance_loss_mlp": 0.01251588, + "epoch": 0.9768224860964978, + "flos": 15856490292480.0, + "grad_norm": 1.4347284082361575, + "language_loss": 0.74287903, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.81937456, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.08105469, + "step": 16247, + "time_per_iteration": 2.4851341247558594 + }, + { + "auxiliary_loss_clip": 0.0640361, + "auxiliary_loss_mlp": 0.01263997, + "balance_loss_clip": 0.06272487, + "balance_loss_mlp": 0.0125505, + "epoch": 0.9768826093491658, + "flos": 25157441352960.0, + "grad_norm": 1.4750714987336841, + "language_loss": 0.80060053, + "learning_rate": 5.58894135118404e-09, + "loss": 0.8772766, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.08947754, + "step": 16248, + "time_per_iteration": 2.538630485534668 + }, + { + "auxiliary_loss_clip": 0.06412353, + "auxiliary_loss_mlp": 0.01267958, + "balance_loss_clip": 0.06277192, + "balance_loss_mlp": 0.01257765, + "epoch": 0.9769427326018337, + "flos": 22973794024320.0, + "grad_norm": 1.7015435546437248, + "language_loss": 0.79519981, + "learning_rate": 5.559883660954278e-09, + "loss": 0.87200296, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.10180664, + "step": 16249, + "time_per_iteration": 2.5262768268585205 + }, + { + "auxiliary_loss_clip": 0.06397509, + "auxiliary_loss_mlp": 0.01267019, + "balance_loss_clip": 0.06270598, + "balance_loss_mlp": 0.01257393, + "epoch": 0.9770028558545018, + "flos": 15269066444160.0, + "grad_norm": 1.8482758647978654, + "language_loss": 0.66747582, + "learning_rate": 5.530901600093507e-09, + "loss": 0.74412113, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09613037, + "step": 16250, + "time_per_iteration": 2.475498914718628 + }, + { + "auxiliary_loss_clip": 0.0631108, + "auxiliary_loss_mlp": 0.01248906, + "balance_loss_clip": 0.06257159, + "balance_loss_mlp": 0.01247916, + "epoch": 0.9770629791071697, + "flos": 71470277349120.0, + "grad_norm": 0.766535928446672, + "language_loss": 0.59739006, + "learning_rate": 5.501995169700846e-09, + "loss": 0.6729899, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.0098877, + "step": 16251, + "time_per_iteration": 3.171131134033203 + }, + { + "auxiliary_loss_clip": 0.06401361, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 0.06270295, + "balance_loss_mlp": 0.01256032, + "epoch": 0.9771231023598377, + "flos": 22418375235840.0, + "grad_norm": 1.6976848540118503, + "language_loss": 0.78588271, + "learning_rate": 5.473164370872307e-09, + "loss": 0.86254865, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.09197998, + "step": 16252, + "time_per_iteration": 2.5451128482818604 + }, + { + "auxiliary_loss_clip": 0.06400634, + "auxiliary_loss_mlp": 0.01263344, + "balance_loss_clip": 0.06269819, + "balance_loss_mlp": 0.01253623, + "epoch": 0.9771832256125056, + "flos": 19031942073600.0, + "grad_norm": 3.8752836290944774, + "language_loss": 0.65360057, + "learning_rate": 5.444409204701461e-09, + "loss": 0.73024035, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09729004, + "step": 16253, + "time_per_iteration": 3.9955294132232666 + }, + { + "auxiliary_loss_clip": 0.06406756, + "auxiliary_loss_mlp": 0.01265874, + "balance_loss_clip": 0.06273551, + "balance_loss_mlp": 0.01255592, + "epoch": 0.9772433488651736, + "flos": 17827982282880.0, + "grad_norm": 2.0997041921444652, + "language_loss": 0.77016485, + "learning_rate": 5.415729672278324e-09, + "loss": 0.84689116, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.10272217, + "step": 16254, + "time_per_iteration": 2.4991238117218018 + }, + { + "auxiliary_loss_clip": 0.06405216, + "auxiliary_loss_mlp": 0.01266948, + "balance_loss_clip": 0.06271631, + "balance_loss_mlp": 0.01256881, + "epoch": 0.9773034721178415, + "flos": 37638246533760.0, + "grad_norm": 2.3865763339015618, + "language_loss": 0.64227772, + "learning_rate": 5.387125774690471e-09, + "loss": 0.71899939, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.10064697, + "step": 16255, + "time_per_iteration": 2.8432374000549316 + }, + { + "auxiliary_loss_clip": 0.06406088, + "auxiliary_loss_mlp": 0.01265056, + "balance_loss_clip": 0.06270261, + "balance_loss_mlp": 0.01254858, + "epoch": 0.9773635953705095, + "flos": 20308590881280.0, + "grad_norm": 1.9209330151147832, + "language_loss": 0.7554915, + "learning_rate": 5.358597513023033e-09, + "loss": 0.83220291, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.10192871, + "step": 16256, + "time_per_iteration": 2.539581537246704 + }, + { + "auxiliary_loss_clip": 0.06402241, + "auxiliary_loss_mlp": 0.01267896, + "balance_loss_clip": 0.06274899, + "balance_loss_mlp": 0.01258186, + "epoch": 0.9774237186231776, + "flos": 22315735584000.0, + "grad_norm": 2.134374282243183, + "language_loss": 0.78430331, + "learning_rate": 5.330144888357369e-09, + "loss": 0.86100471, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.0970459, + "step": 16257, + "time_per_iteration": 2.521059513092041 + }, + { + "auxiliary_loss_clip": 0.06404999, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06275123, + "balance_loss_mlp": 0.01255106, + "epoch": 0.9774838418758455, + "flos": 24211435956480.0, + "grad_norm": 2.254901577298529, + "language_loss": 0.75327086, + "learning_rate": 5.301767901772391e-09, + "loss": 0.82996702, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09509277, + "step": 16258, + "time_per_iteration": 2.5600156784057617 + }, + { + "auxiliary_loss_clip": 0.06308343, + "auxiliary_loss_mlp": 0.01249899, + "balance_loss_clip": 0.06254452, + "balance_loss_mlp": 0.01248971, + "epoch": 0.9775439651285135, + "flos": 66378691998720.0, + "grad_norm": 0.6729555007121276, + "language_loss": 0.59753788, + "learning_rate": 5.273466554344353e-09, + "loss": 0.67312038, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00926208, + "step": 16259, + "time_per_iteration": 3.2042317390441895 + }, + { + "auxiliary_loss_clip": 0.06408554, + "auxiliary_loss_mlp": 0.01265358, + "balance_loss_clip": 0.06274059, + "balance_loss_mlp": 0.01255547, + "epoch": 0.9776040883811814, + "flos": 22608168733440.0, + "grad_norm": 1.5933188792012458, + "language_loss": 0.7377913, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.81453043, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.0980835, + "step": 16260, + "time_per_iteration": 2.520371675491333 + }, + { + "auxiliary_loss_clip": 0.06402797, + "auxiliary_loss_mlp": 0.0126442, + "balance_loss_clip": 0.06271645, + "balance_loss_mlp": 0.01254412, + "epoch": 0.9776642116338494, + "flos": 18448082023680.0, + "grad_norm": 1.9636602337481959, + "language_loss": 0.80066824, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.87734044, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.10009766, + "step": 16261, + "time_per_iteration": 3.925679922103882 + }, + { + "auxiliary_loss_clip": 0.06401169, + "auxiliary_loss_mlp": 0.01262925, + "balance_loss_clip": 0.06267936, + "balance_loss_mlp": 0.01253269, + "epoch": 0.9777243348865173, + "flos": 22645121184000.0, + "grad_norm": 2.276585327345628, + "language_loss": 0.74144262, + "learning_rate": 5.189016357718845e-09, + "loss": 0.81808358, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09649658, + "step": 16262, + "time_per_iteration": 2.5040698051452637 + }, + { + "auxiliary_loss_clip": 0.06405801, + "auxiliary_loss_mlp": 0.0126505, + "balance_loss_clip": 0.06272787, + "balance_loss_mlp": 0.01254345, + "epoch": 0.9777844581391854, + "flos": 31329410520960.0, + "grad_norm": 2.244508140891946, + "language_loss": 0.7062791, + "learning_rate": 5.16101757762133e-09, + "loss": 0.78298759, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1071167, + "step": 16263, + "time_per_iteration": 2.6393070220947266 + }, + { + "auxiliary_loss_clip": 0.06403024, + "auxiliary_loss_mlp": 0.0126253, + "balance_loss_clip": 0.06270716, + "balance_loss_mlp": 0.01253053, + "epoch": 0.9778445813918533, + "flos": 23045728354560.0, + "grad_norm": 1.6974232351495746, + "language_loss": 0.66375017, + "learning_rate": 5.133094442018038e-09, + "loss": 0.74040568, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09484863, + "step": 16264, + "time_per_iteration": 2.505544900894165 + }, + { + "auxiliary_loss_clip": 0.06409594, + "auxiliary_loss_mlp": 0.01265425, + "balance_loss_clip": 0.06271692, + "balance_loss_mlp": 0.01255602, + "epoch": 0.9779047046445213, + "flos": 17572082313600.0, + "grad_norm": 2.0688603545679585, + "language_loss": 0.73281831, + "learning_rate": 5.105246951967679e-09, + "loss": 0.80956852, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.09820557, + "step": 16265, + "time_per_iteration": 2.477476119995117 + }, + { + "auxiliary_loss_clip": 0.06397505, + "auxiliary_loss_mlp": 0.01262251, + "balance_loss_clip": 0.06270322, + "balance_loss_mlp": 0.01253298, + "epoch": 0.9779648278971892, + "flos": 20747492167680.0, + "grad_norm": 1.8532261221017665, + "language_loss": 0.68805051, + "learning_rate": 5.077475108526297e-09, + "loss": 0.76464808, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08953857, + "step": 16266, + "time_per_iteration": 2.505934238433838 + }, + { + "auxiliary_loss_clip": 0.06398112, + "auxiliary_loss_mlp": 0.01264596, + "balance_loss_clip": 0.06272861, + "balance_loss_mlp": 0.01255992, + "epoch": 0.9780249511498572, + "flos": 21032336522880.0, + "grad_norm": 2.6886691630763884, + "language_loss": 0.8669281, + "learning_rate": 5.049778912747049e-09, + "loss": 0.94355524, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.08602905, + "step": 16267, + "time_per_iteration": 2.510568141937256 + }, + { + "auxiliary_loss_clip": 0.06402868, + "auxiliary_loss_mlp": 0.01263569, + "balance_loss_clip": 0.0627014, + "balance_loss_mlp": 0.01253591, + "epoch": 0.9780850744025251, + "flos": 30782167505280.0, + "grad_norm": 1.6577621473420363, + "language_loss": 0.70518297, + "learning_rate": 5.022158365679985e-09, + "loss": 0.78184736, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09967041, + "step": 16268, + "time_per_iteration": 4.031549453735352 + }, + { + "auxiliary_loss_clip": 0.06402364, + "auxiliary_loss_mlp": 0.0126831, + "balance_loss_clip": 0.06270558, + "balance_loss_mlp": 0.01258832, + "epoch": 0.9781451976551931, + "flos": 20309219786880.0, + "grad_norm": 1.5149963209120108, + "language_loss": 0.74065733, + "learning_rate": 4.994613468372711e-09, + "loss": 0.8173641, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09472656, + "step": 16269, + "time_per_iteration": 2.4883556365966797 + }, + { + "auxiliary_loss_clip": 0.06404697, + "auxiliary_loss_mlp": 0.0126611, + "balance_loss_clip": 0.06272128, + "balance_loss_mlp": 0.01256036, + "epoch": 0.9782053209078612, + "flos": 24323383411200.0, + "grad_norm": 2.11112255910788, + "language_loss": 0.70834357, + "learning_rate": 4.967144221869501e-09, + "loss": 0.78505164, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10076904, + "step": 16270, + "time_per_iteration": 2.5375027656555176 + }, + { + "auxiliary_loss_clip": 0.06403029, + "auxiliary_loss_mlp": 0.01263166, + "balance_loss_clip": 0.0627147, + "balance_loss_mlp": 0.01253874, + "epoch": 0.9782654441605291, + "flos": 32497717599360.0, + "grad_norm": 3.0838275536528705, + "language_loss": 0.64377117, + "learning_rate": 4.939750627212191e-09, + "loss": 0.72043312, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09301758, + "step": 16271, + "time_per_iteration": 2.5905959606170654 + }, + { + "auxiliary_loss_clip": 0.06396818, + "auxiliary_loss_mlp": 0.01263415, + "balance_loss_clip": 0.06269811, + "balance_loss_mlp": 0.01253783, + "epoch": 0.9783255674131971, + "flos": 26986280567040.0, + "grad_norm": 1.9658813772061734, + "language_loss": 0.70980221, + "learning_rate": 4.912432685439505e-09, + "loss": 0.78640461, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.09631348, + "step": 16272, + "time_per_iteration": 2.5623769760131836 + }, + { + "auxiliary_loss_clip": 0.06402478, + "auxiliary_loss_mlp": 0.01267088, + "balance_loss_clip": 0.06270878, + "balance_loss_mlp": 0.01257736, + "epoch": 0.978385690665865, + "flos": 23118920496000.0, + "grad_norm": 3.4786188165318648, + "language_loss": 0.67056668, + "learning_rate": 4.88519039758728e-09, + "loss": 0.74726236, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09356689, + "step": 16273, + "time_per_iteration": 2.516294002532959 + }, + { + "auxiliary_loss_clip": 0.06402078, + "auxiliary_loss_mlp": 0.01264409, + "balance_loss_clip": 0.06269372, + "balance_loss_mlp": 0.01255099, + "epoch": 0.978445813918533, + "flos": 25416527777280.0, + "grad_norm": 1.5271544085655164, + "language_loss": 0.74288815, + "learning_rate": 4.85802376468869e-09, + "loss": 0.81955302, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09313965, + "step": 16274, + "time_per_iteration": 2.5984392166137695 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01265008, + "balance_loss_clip": 0.0627111, + "balance_loss_mlp": 0.01255793, + "epoch": 0.9785059371712009, + "flos": 23556983241600.0, + "grad_norm": 1.707553695357098, + "language_loss": 0.7783469, + "learning_rate": 4.830932787773579e-09, + "loss": 0.85501283, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09210205, + "step": 16275, + "time_per_iteration": 2.5418179035186768 + }, + { + "auxiliary_loss_clip": 0.06406128, + "auxiliary_loss_mlp": 0.01262648, + "balance_loss_clip": 0.06272225, + "balance_loss_mlp": 0.01253469, + "epoch": 0.978566060423869, + "flos": 34359945465600.0, + "grad_norm": 1.5276794434381622, + "language_loss": 0.71135265, + "learning_rate": 4.803917467869567e-09, + "loss": 0.7880404, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09179688, + "step": 16276, + "time_per_iteration": 2.622631311416626 + }, + { + "auxiliary_loss_clip": 0.06400249, + "auxiliary_loss_mlp": 0.01263911, + "balance_loss_clip": 0.06272748, + "balance_loss_mlp": 0.01255346, + "epoch": 0.9786261836765369, + "flos": 11623546857600.0, + "grad_norm": 3.17667163989501, + "language_loss": 0.85745251, + "learning_rate": 4.776977806000726e-09, + "loss": 0.93409419, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.08563232, + "step": 16277, + "time_per_iteration": 2.4804911613464355 + }, + { + "auxiliary_loss_clip": 0.06398945, + "auxiliary_loss_mlp": 0.01262406, + "balance_loss_clip": 0.06271117, + "balance_loss_mlp": 0.01253317, + "epoch": 0.9786863069292049, + "flos": 17426746206720.0, + "grad_norm": 1.7095033174168577, + "language_loss": 0.71152186, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.78813535, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09088135, + "step": 16278, + "time_per_iteration": 2.488579750061035 + }, + { + "auxiliary_loss_clip": 0.06398286, + "auxiliary_loss_mlp": 0.01261989, + "balance_loss_clip": 0.06268737, + "balance_loss_mlp": 0.01252339, + "epoch": 0.9787464301818728, + "flos": 20850341454720.0, + "grad_norm": 1.68580975011962, + "language_loss": 0.84887272, + "learning_rate": 4.723325460453065e-09, + "loss": 0.92547548, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09649658, + "step": 16279, + "time_per_iteration": 2.543829917907715 + }, + { + "auxiliary_loss_clip": 0.06398898, + "auxiliary_loss_mlp": 0.0126355, + "balance_loss_clip": 0.06267536, + "balance_loss_mlp": 0.01254275, + "epoch": 0.9788065534345408, + "flos": 18228757161600.0, + "grad_norm": 1.79953849939751, + "language_loss": 0.7903899, + "learning_rate": 4.696612778808395e-09, + "loss": 0.86701441, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09283447, + "step": 16280, + "time_per_iteration": 2.5347559452056885 + }, + { + "auxiliary_loss_clip": 0.06397119, + "auxiliary_loss_mlp": 0.01265633, + "balance_loss_clip": 0.06271647, + "balance_loss_mlp": 0.01256973, + "epoch": 0.9788666766872087, + "flos": 21584359221120.0, + "grad_norm": 1.5249645071415179, + "language_loss": 0.79882574, + "learning_rate": 4.669975759268085e-09, + "loss": 0.87545323, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08666992, + "step": 16281, + "time_per_iteration": 2.5423452854156494 + }, + { + "auxiliary_loss_clip": 0.06401223, + "auxiliary_loss_mlp": 0.01266758, + "balance_loss_clip": 0.06269599, + "balance_loss_mlp": 0.01256965, + "epoch": 0.9789267999398767, + "flos": 24907536950400.0, + "grad_norm": 1.5917480809235194, + "language_loss": 0.80182159, + "learning_rate": 4.643414402842216e-09, + "loss": 0.87850136, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09796143, + "step": 16282, + "time_per_iteration": 2.5288219451904297 + }, + { + "auxiliary_loss_clip": 0.06399183, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 0.06268679, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9789869231925448, + "flos": 19579185089280.0, + "grad_norm": 2.068232253290479, + "language_loss": 0.8363508, + "learning_rate": 4.616928710538204e-09, + "loss": 0.91299808, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.10137939, + "step": 16283, + "time_per_iteration": 2.475937604904175 + }, + { + "auxiliary_loss_clip": 0.06399857, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 0.06268431, + "balance_loss_mlp": 0.01254245, + "epoch": 0.9790470464452127, + "flos": 16801657148160.0, + "grad_norm": 1.6072240404976843, + "language_loss": 0.72103167, + "learning_rate": 4.590518683360134e-09, + "loss": 0.79766691, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09411621, + "step": 16284, + "time_per_iteration": 2.473494529724121 + }, + { + "auxiliary_loss_clip": 0.06401023, + "auxiliary_loss_mlp": 0.01266157, + "balance_loss_clip": 0.06273106, + "balance_loss_mlp": 0.01257723, + "epoch": 0.9791071696978807, + "flos": 18375267225600.0, + "grad_norm": 1.7801771621665499, + "language_loss": 0.64641076, + "learning_rate": 4.56418432230965e-09, + "loss": 0.72308254, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08435059, + "step": 16285, + "time_per_iteration": 4.014649391174316 + }, + { + "auxiliary_loss_clip": 0.06402356, + "auxiliary_loss_mlp": 0.01266814, + "balance_loss_clip": 0.06273103, + "balance_loss_mlp": 0.01257664, + "epoch": 0.9791672929505486, + "flos": 24177166836480.0, + "grad_norm": 1.4206006238516855, + "language_loss": 0.70657575, + "learning_rate": 4.537925628385286e-09, + "loss": 0.78326744, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09143066, + "step": 16286, + "time_per_iteration": 2.5511789321899414 + }, + { + "auxiliary_loss_clip": 0.06395744, + "auxiliary_loss_mlp": 0.01265186, + "balance_loss_clip": 0.06267752, + "balance_loss_mlp": 0.01255583, + "epoch": 0.9792274162032166, + "flos": 24361216329600.0, + "grad_norm": 1.3312898540617772, + "language_loss": 0.58715498, + "learning_rate": 4.511742602582691e-09, + "loss": 0.66376424, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.09613037, + "step": 16287, + "time_per_iteration": 2.5384435653686523 + }, + { + "auxiliary_loss_clip": 0.06399453, + "auxiliary_loss_mlp": 0.01262835, + "balance_loss_clip": 0.06270657, + "balance_loss_mlp": 0.01253811, + "epoch": 0.9792875394558845, + "flos": 26402965568640.0, + "grad_norm": 1.63050988384962, + "language_loss": 0.81943876, + "learning_rate": 4.485635245894626e-09, + "loss": 0.89606166, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09020996, + "step": 16288, + "time_per_iteration": 2.5366978645324707 + }, + { + "auxiliary_loss_clip": 0.06400405, + "auxiliary_loss_mlp": 0.01265614, + "balance_loss_clip": 0.06270171, + "balance_loss_mlp": 0.01255815, + "epoch": 0.9793476627085526, + "flos": 28155635821440.0, + "grad_norm": 1.4168880450273769, + "language_loss": 0.71902084, + "learning_rate": 4.459603559311631e-09, + "loss": 0.795681, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09796143, + "step": 16289, + "time_per_iteration": 2.5798122882843018 + }, + { + "auxiliary_loss_clip": 0.063995, + "auxiliary_loss_mlp": 0.01262223, + "balance_loss_clip": 0.06270827, + "balance_loss_mlp": 0.01253199, + "epoch": 0.9794077859612205, + "flos": 16769568234240.0, + "grad_norm": 4.451777244467701, + "language_loss": 0.75933874, + "learning_rate": 4.43364754382003e-09, + "loss": 0.83595598, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09020996, + "step": 16290, + "time_per_iteration": 2.4823756217956543 + }, + { + "auxiliary_loss_clip": 0.06403105, + "auxiliary_loss_mlp": 0.01263116, + "balance_loss_clip": 0.06269795, + "balance_loss_mlp": 0.01253108, + "epoch": 0.9794679092138885, + "flos": 19286793866880.0, + "grad_norm": 1.4953530744469736, + "language_loss": 0.67339337, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.75005561, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.10015869, + "step": 16291, + "time_per_iteration": 2.5128190517425537 + }, + { + "auxiliary_loss_clip": 0.06406611, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.06271151, + "balance_loss_mlp": 0.01254866, + "epoch": 0.9795280324665564, + "flos": 32164139295360.0, + "grad_norm": 1.837132230394904, + "language_loss": 0.62766051, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.70437813, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10284424, + "step": 16292, + "time_per_iteration": 2.6618642807006836 + }, + { + "auxiliary_loss_clip": 0.06400578, + "auxiliary_loss_mlp": 0.01265969, + "balance_loss_clip": 0.06269545, + "balance_loss_mlp": 0.01256063, + "epoch": 0.9795881557192244, + "flos": 19066714318080.0, + "grad_norm": 1.6297309965936324, + "language_loss": 0.73538578, + "learning_rate": 4.356233533724829e-09, + "loss": 0.8120513, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09912109, + "step": 16293, + "time_per_iteration": 3.9606332778930664 + }, + { + "auxiliary_loss_clip": 0.06403802, + "auxiliary_loss_mlp": 0.01262473, + "balance_loss_clip": 0.06269664, + "balance_loss_mlp": 0.01252799, + "epoch": 0.9796482789718923, + "flos": 28337505108480.0, + "grad_norm": 1.6383548808431236, + "language_loss": 0.84130985, + "learning_rate": 4.330580212414503e-09, + "loss": 0.91797256, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09674072, + "step": 16294, + "time_per_iteration": 2.6089725494384766 + }, + { + "auxiliary_loss_clip": 0.06393368, + "auxiliary_loss_mlp": 0.01267559, + "balance_loss_clip": 0.06268513, + "balance_loss_mlp": 0.01259262, + "epoch": 0.9797084022245603, + "flos": 17973821514240.0, + "grad_norm": 1.8121690447623178, + "language_loss": 0.71849918, + "learning_rate": 4.305002567088767e-09, + "loss": 0.79510844, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08294678, + "step": 16295, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.06407996, + "auxiliary_loss_mlp": 0.01266646, + "balance_loss_clip": 0.06274095, + "balance_loss_mlp": 0.01256567, + "epoch": 0.9797685254772284, + "flos": 20272980096000.0, + "grad_norm": 1.689190760934112, + "language_loss": 0.80868363, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.8854301, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.10083008, + "step": 16296, + "time_per_iteration": 2.5353195667266846 + }, + { + "auxiliary_loss_clip": 0.06396893, + "auxiliary_loss_mlp": 0.01263523, + "balance_loss_clip": 0.06268729, + "balance_loss_mlp": 0.01254499, + "epoch": 0.9798286487298963, + "flos": 26914513944960.0, + "grad_norm": 1.8507340964976773, + "language_loss": 0.75629425, + "learning_rate": 4.254074308266853e-09, + "loss": 0.83289838, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09020996, + "step": 16297, + "time_per_iteration": 2.566253185272217 + }, + { + "auxiliary_loss_clip": 0.06409726, + "auxiliary_loss_mlp": 0.01265641, + "balance_loss_clip": 0.06272483, + "balance_loss_mlp": 0.01256253, + "epoch": 0.9798887719825643, + "flos": 27168233708160.0, + "grad_norm": 1.5228355519225918, + "language_loss": 0.78694081, + "learning_rate": 4.228723696702019e-09, + "loss": 0.86369449, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.09399414, + "step": 16298, + "time_per_iteration": 2.635408639907837 + }, + { + "auxiliary_loss_clip": 0.06396599, + "auxiliary_loss_mlp": 0.0126188, + "balance_loss_clip": 0.06269842, + "balance_loss_mlp": 0.01252785, + "epoch": 0.9799488952352322, + "flos": 20674803150720.0, + "grad_norm": 1.6048617132975538, + "language_loss": 0.73147827, + "learning_rate": 4.203448764984019e-09, + "loss": 0.80806303, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.09088135, + "step": 16299, + "time_per_iteration": 2.5388383865356445 + }, + { + "auxiliary_loss_clip": 0.06401886, + "auxiliary_loss_mlp": 0.0126338, + "balance_loss_clip": 0.06268089, + "balance_loss_mlp": 0.01254105, + "epoch": 0.9800090184879002, + "flos": 21987691649280.0, + "grad_norm": 2.0565505689040795, + "language_loss": 0.89151061, + "learning_rate": 4.178249514071419e-09, + "loss": 0.96816331, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.0927124, + "step": 16300, + "time_per_iteration": 2.510451316833496 + }, + { + "auxiliary_loss_clip": 0.06408317, + "auxiliary_loss_mlp": 0.01265306, + "balance_loss_clip": 0.06273539, + "balance_loss_mlp": 0.01255155, + "epoch": 0.9800691417405681, + "flos": 21294860964480.0, + "grad_norm": 1.950668796450147, + "language_loss": 0.78290796, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.85964411, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10150146, + "step": 16301, + "time_per_iteration": 3.9293153285980225 + }, + { + "auxiliary_loss_clip": 0.06402375, + "auxiliary_loss_mlp": 0.01266486, + "balance_loss_clip": 0.06270753, + "balance_loss_mlp": 0.01256502, + "epoch": 0.9801292649932362, + "flos": 18445398693120.0, + "grad_norm": 2.786273844322341, + "language_loss": 0.75642586, + "learning_rate": 4.128078058480921e-09, + "loss": 0.83311445, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09985352, + "step": 16302, + "time_per_iteration": 2.5075223445892334 + }, + { + "auxiliary_loss_clip": 0.06404446, + "auxiliary_loss_mlp": 0.0126291, + "balance_loss_clip": 0.06273034, + "balance_loss_mlp": 0.01253045, + "epoch": 0.9801893882459041, + "flos": 25053418108800.0, + "grad_norm": 1.6673066496570457, + "language_loss": 0.79480714, + "learning_rate": 4.103105855705724e-09, + "loss": 0.8714807, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09851074, + "step": 16303, + "time_per_iteration": 2.628279209136963 + }, + { + "auxiliary_loss_clip": 0.06405927, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06270191, + "balance_loss_mlp": 0.01253062, + "epoch": 0.9802495114985721, + "flos": 18516787971840.0, + "grad_norm": 1.8702096510195432, + "language_loss": 0.83911574, + "learning_rate": 4.078209337540883e-09, + "loss": 0.91580522, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.09967041, + "step": 16304, + "time_per_iteration": 2.5042169094085693 + }, + { + "auxiliary_loss_clip": 0.06394262, + "auxiliary_loss_mlp": 0.01262398, + "balance_loss_clip": 0.06268616, + "balance_loss_mlp": 0.01253875, + "epoch": 0.98030963475124, + "flos": 21476143272960.0, + "grad_norm": 1.8927432348814315, + "language_loss": 0.70325917, + "learning_rate": 4.053388504930089e-09, + "loss": 0.77982581, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08526611, + "step": 16305, + "time_per_iteration": 2.5113353729248047 + }, + { + "auxiliary_loss_clip": 0.06407525, + "auxiliary_loss_mlp": 0.01264496, + "balance_loss_clip": 0.06273907, + "balance_loss_mlp": 0.01254578, + "epoch": 0.980369758003908, + "flos": 20418483911040.0, + "grad_norm": 1.8068750092854737, + "language_loss": 0.72213495, + "learning_rate": 4.028643358815032e-09, + "loss": 0.79885519, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09918213, + "step": 16306, + "time_per_iteration": 2.5188653469085693 + }, + { + "auxiliary_loss_clip": 0.06395418, + "auxiliary_loss_mlp": 0.0126193, + "balance_loss_clip": 0.06268764, + "balance_loss_mlp": 0.01253502, + "epoch": 0.9804298812565759, + "flos": 23405064589440.0, + "grad_norm": 1.5213209869306519, + "language_loss": 0.73565251, + "learning_rate": 4.00397390013385e-09, + "loss": 0.81222594, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08422852, + "step": 16307, + "time_per_iteration": 3.985133171081543 + }, + { + "auxiliary_loss_clip": 0.06392866, + "auxiliary_loss_mlp": 0.01262041, + "balance_loss_clip": 0.06268162, + "balance_loss_mlp": 0.01253899, + "epoch": 0.980490004509244, + "flos": 23299028847360.0, + "grad_norm": 1.3292657797175953, + "language_loss": 0.7521047, + "learning_rate": 3.979380129822018e-09, + "loss": 0.82865375, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08135986, + "step": 16308, + "time_per_iteration": 2.545912265777588 + }, + { + "auxiliary_loss_clip": 0.06303553, + "auxiliary_loss_mlp": 0.01251644, + "balance_loss_clip": 0.06249667, + "balance_loss_mlp": 0.01250615, + "epoch": 0.980550127761912, + "flos": 56067991712640.0, + "grad_norm": 0.8050036087826854, + "language_loss": 0.57682216, + "learning_rate": 3.954862048811902e-09, + "loss": 0.65237415, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01029968, + "step": 16309, + "time_per_iteration": 2.9991166591644287 + }, + { + "auxiliary_loss_clip": 0.06399623, + "auxiliary_loss_mlp": 0.01265391, + "balance_loss_clip": 0.06267844, + "balance_loss_mlp": 0.01256194, + "epoch": 0.9806102510145799, + "flos": 25339562202240.0, + "grad_norm": 1.6272757671722682, + "language_loss": 0.66520619, + "learning_rate": 3.930419658033646e-09, + "loss": 0.7418564, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09191895, + "step": 16310, + "time_per_iteration": 2.5256764888763428 + }, + { + "auxiliary_loss_clip": 0.06307549, + "auxiliary_loss_mlp": 0.01249123, + "balance_loss_clip": 0.06253639, + "balance_loss_mlp": 0.01248124, + "epoch": 0.9806703742672479, + "flos": 67297472017920.0, + "grad_norm": 0.8343331868495012, + "language_loss": 0.54504246, + "learning_rate": 3.906052958413841e-09, + "loss": 0.62060916, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00998688, + "step": 16311, + "time_per_iteration": 3.235471248626709 + }, + { + "auxiliary_loss_clip": 0.06400076, + "auxiliary_loss_mlp": 0.01262219, + "balance_loss_clip": 0.06269625, + "balance_loss_mlp": 0.01253004, + "epoch": 0.9807304975199158, + "flos": 25236084009600.0, + "grad_norm": 1.569735113945606, + "language_loss": 0.79639947, + "learning_rate": 3.881761950876638e-09, + "loss": 0.87302244, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09216309, + "step": 16312, + "time_per_iteration": 2.5837817192077637 + }, + { + "auxiliary_loss_clip": 0.06399613, + "auxiliary_loss_mlp": 0.01263333, + "balance_loss_clip": 0.06270465, + "balance_loss_mlp": 0.01255012, + "epoch": 0.9807906207725838, + "flos": 17462021575680.0, + "grad_norm": 1.8369595786658577, + "language_loss": 0.6327976, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.70942706, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08325195, + "step": 16313, + "time_per_iteration": 2.485778570175171 + }, + { + "auxiliary_loss_clip": 0.06398313, + "auxiliary_loss_mlp": 0.01263511, + "balance_loss_clip": 0.06268698, + "balance_loss_mlp": 0.0125407, + "epoch": 0.9808507440252517, + "flos": 21038709432960.0, + "grad_norm": 1.8257284943536598, + "language_loss": 0.72914076, + "learning_rate": 3.833407015731316e-09, + "loss": 0.80575901, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.09442139, + "step": 16314, + "time_per_iteration": 2.522977590560913 + }, + { + "auxiliary_loss_clip": 0.06308355, + "auxiliary_loss_mlp": 0.01248498, + "balance_loss_clip": 0.06254214, + "balance_loss_mlp": 0.01247535, + "epoch": 0.9809108672779198, + "flos": 64063307652480.0, + "grad_norm": 0.6727311068794228, + "language_loss": 0.51654154, + "learning_rate": 3.80934308995684e-09, + "loss": 0.5921101, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00961304, + "step": 16315, + "time_per_iteration": 3.1521832942962646 + }, + { + "auxiliary_loss_clip": 0.06402422, + "auxiliary_loss_mlp": 0.01263871, + "balance_loss_clip": 0.06269836, + "balance_loss_mlp": 0.01255019, + "epoch": 0.9809709905305877, + "flos": 22786683857280.0, + "grad_norm": 1.2900137630224915, + "language_loss": 0.69874811, + "learning_rate": 3.785354859932033e-09, + "loss": 0.77541101, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.08843994, + "step": 16316, + "time_per_iteration": 2.5589540004730225 + }, + { + "auxiliary_loss_clip": 0.06403589, + "auxiliary_loss_mlp": 0.01263012, + "balance_loss_clip": 0.06271669, + "balance_loss_mlp": 0.01254274, + "epoch": 0.9810311137832557, + "flos": 37022423351040.0, + "grad_norm": 1.7188947170249258, + "language_loss": 0.55401117, + "learning_rate": 3.76144232656661e-09, + "loss": 0.6306771, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08734131, + "step": 16317, + "time_per_iteration": 2.6282479763031006 + }, + { + "auxiliary_loss_clip": 0.06401145, + "auxiliary_loss_mlp": 0.01270262, + "balance_loss_clip": 0.06272769, + "balance_loss_mlp": 0.01260905, + "epoch": 0.9810912370359236, + "flos": 18922258679040.0, + "grad_norm": 1.5547999119596, + "language_loss": 0.73396158, + "learning_rate": 3.737605490767404e-09, + "loss": 0.81067568, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09350586, + "step": 16318, + "time_per_iteration": 2.5051674842834473 + }, + { + "auxiliary_loss_clip": 0.06401484, + "auxiliary_loss_mlp": 0.01265731, + "balance_loss_clip": 0.06274587, + "balance_loss_mlp": 0.01256397, + "epoch": 0.9811513602885916, + "flos": 18447411191040.0, + "grad_norm": 1.9997801159399626, + "language_loss": 0.82393742, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.90060955, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.09332275, + "step": 16319, + "time_per_iteration": 2.4961817264556885 + }, + { + "auxiliary_loss_clip": 0.06305759, + "auxiliary_loss_mlp": 0.01249631, + "balance_loss_clip": 0.06251506, + "balance_loss_mlp": 0.01248486, + "epoch": 0.9812114835412595, + "flos": 68078603306880.0, + "grad_norm": 0.7284136479958665, + "language_loss": 0.53509539, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.61064935, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01144409, + "step": 16320, + "time_per_iteration": 3.0633654594421387 + }, + { + "auxiliary_loss_clip": 0.06399468, + "auxiliary_loss_mlp": 0.01265154, + "balance_loss_clip": 0.0626857, + "balance_loss_mlp": 0.01255969, + "epoch": 0.9812716067939276, + "flos": 25379826888960.0, + "grad_norm": 1.768638435944613, + "language_loss": 0.73735636, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.81400257, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09185791, + "step": 16321, + "time_per_iteration": 2.566575527191162 + }, + { + "auxiliary_loss_clip": 0.06396846, + "auxiliary_loss_mlp": 0.01263617, + "balance_loss_clip": 0.06269559, + "balance_loss_mlp": 0.01254444, + "epoch": 0.9813317300465956, + "flos": 22863439797120.0, + "grad_norm": 1.5226563597520282, + "language_loss": 0.79048485, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.86708951, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09173584, + "step": 16322, + "time_per_iteration": 2.549025535583496 + }, + { + "auxiliary_loss_clip": 0.06402303, + "auxiliary_loss_mlp": 0.01266539, + "balance_loss_clip": 0.06273006, + "balance_loss_mlp": 0.01257437, + "epoch": 0.9813918532992635, + "flos": 23593767984000.0, + "grad_norm": 1.4992713181150594, + "language_loss": 0.80698186, + "learning_rate": 3.619556806799595e-09, + "loss": 0.88367027, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09100342, + "step": 16323, + "time_per_iteration": 2.542644739151001 + }, + { + "auxiliary_loss_clip": 0.06404383, + "auxiliary_loss_mlp": 0.01265912, + "balance_loss_clip": 0.06270544, + "balance_loss_mlp": 0.0125637, + "epoch": 0.9814519765519315, + "flos": 19611860981760.0, + "grad_norm": 1.9971610080835347, + "language_loss": 0.84855533, + "learning_rate": 3.596174175278799e-09, + "loss": 0.92525834, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09552002, + "step": 16324, + "time_per_iteration": 3.8597731590270996 + }, + { + "auxiliary_loss_clip": 0.06401306, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06270885, + "balance_loss_mlp": 0.01254921, + "epoch": 0.9815120998045994, + "flos": 33954390904320.0, + "grad_norm": 1.403861865593316, + "language_loss": 0.74789631, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.82455075, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09216309, + "step": 16325, + "time_per_iteration": 2.6116855144500732 + }, + { + "auxiliary_loss_clip": 0.06395521, + "auxiliary_loss_mlp": 0.01263546, + "balance_loss_clip": 0.0626988, + "balance_loss_mlp": 0.01254653, + "epoch": 0.9815722230572674, + "flos": 20856295094400.0, + "grad_norm": 1.6689629805802655, + "language_loss": 0.76699644, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.8435871, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08892822, + "step": 16326, + "time_per_iteration": 2.4666171073913574 + }, + { + "auxiliary_loss_clip": 0.06404449, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_clip": 0.06272893, + "balance_loss_mlp": 0.01254698, + "epoch": 0.9816323463099353, + "flos": 22901356569600.0, + "grad_norm": 1.675938455983492, + "language_loss": 0.67456639, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.75125796, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10003662, + "step": 16327, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.06408646, + "auxiliary_loss_mlp": 0.01265428, + "balance_loss_clip": 0.06271295, + "balance_loss_mlp": 0.01255361, + "epoch": 0.9816924695626034, + "flos": 31547351790720.0, + "grad_norm": 1.412267833838127, + "language_loss": 0.73885894, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.81559968, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.10070801, + "step": 16328, + "time_per_iteration": 2.5715060234069824 + }, + { + "auxiliary_loss_clip": 0.06412687, + "auxiliary_loss_mlp": 0.01268223, + "balance_loss_clip": 0.0627219, + "balance_loss_mlp": 0.01257988, + "epoch": 0.9817525928152713, + "flos": 21513305358720.0, + "grad_norm": 1.842030012246621, + "language_loss": 0.81347609, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.89028519, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.10235596, + "step": 16329, + "time_per_iteration": 2.5269720554351807 + }, + { + "auxiliary_loss_clip": 0.06407592, + "auxiliary_loss_mlp": 0.01266637, + "balance_loss_clip": 0.06271036, + "balance_loss_mlp": 0.0125626, + "epoch": 0.9818127160679393, + "flos": 25556539150080.0, + "grad_norm": 1.7679142745259233, + "language_loss": 0.76537704, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.8421194, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10375977, + "step": 16330, + "time_per_iteration": 2.5097715854644775 + }, + { + "auxiliary_loss_clip": 0.06417432, + "auxiliary_loss_mlp": 0.01265777, + "balance_loss_clip": 0.06274364, + "balance_loss_mlp": 0.01254118, + "epoch": 0.9818728393206072, + "flos": 28811220566400.0, + "grad_norm": 3.7665228770401518, + "language_loss": 0.66577238, + "learning_rate": 3.434615511252126e-09, + "loss": 0.74260449, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.11645508, + "step": 16331, + "time_per_iteration": 2.594588041305542 + }, + { + "auxiliary_loss_clip": 0.0640003, + "auxiliary_loss_mlp": 0.01264275, + "balance_loss_clip": 0.06271006, + "balance_loss_mlp": 0.01255412, + "epoch": 0.9819329625732752, + "flos": 23229023160960.0, + "grad_norm": 1.8265104369584833, + "language_loss": 0.73624349, + "learning_rate": 3.411838534981948e-09, + "loss": 0.81288654, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.08862305, + "step": 16332, + "time_per_iteration": 3.9030022621154785 + }, + { + "auxiliary_loss_clip": 0.06402284, + "auxiliary_loss_mlp": 0.01265638, + "balance_loss_clip": 0.06271557, + "balance_loss_mlp": 0.01256882, + "epoch": 0.9819930858259431, + "flos": 17536261893120.0, + "grad_norm": 1.7201228746182549, + "language_loss": 0.76839882, + "learning_rate": 3.389137269534936e-09, + "loss": 0.84507805, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08752441, + "step": 16333, + "time_per_iteration": 2.5092711448669434 + }, + { + "auxiliary_loss_clip": 0.06401891, + "auxiliary_loss_mlp": 0.01263466, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01254305, + "epoch": 0.9820532090786112, + "flos": 12534570374400.0, + "grad_norm": 1.9033760890389273, + "language_loss": 0.73437434, + "learning_rate": 3.366511715771958e-09, + "loss": 0.81102788, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09161377, + "step": 16334, + "time_per_iteration": 2.4836056232452393 + }, + { + "auxiliary_loss_clip": 0.06403394, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 0.06271391, + "balance_loss_mlp": 0.01255435, + "epoch": 0.9821133323312792, + "flos": 18845586593280.0, + "grad_norm": 1.7621498824539008, + "language_loss": 0.78639001, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.86307669, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09838867, + "step": 16335, + "time_per_iteration": 2.494976043701172 + }, + { + "auxiliary_loss_clip": 0.06405871, + "auxiliary_loss_mlp": 0.01267908, + "balance_loss_clip": 0.0627166, + "balance_loss_mlp": 0.01257406, + "epoch": 0.9821734555839471, + "flos": 34832612747520.0, + "grad_norm": 2.0448132834813593, + "language_loss": 0.6420033, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.71874112, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1050415, + "step": 16336, + "time_per_iteration": 2.605154514312744 + }, + { + "auxiliary_loss_clip": 0.06409524, + "auxiliary_loss_mlp": 0.01263456, + "balance_loss_clip": 0.0627144, + "balance_loss_mlp": 0.0125327, + "epoch": 0.9822335788366151, + "flos": 17133768005760.0, + "grad_norm": 2.4121534627506183, + "language_loss": 0.73113394, + "learning_rate": 3.299089333152372e-09, + "loss": 0.80786371, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.10186768, + "step": 16337, + "time_per_iteration": 2.492018222808838 + }, + { + "auxiliary_loss_clip": 0.06404828, + "auxiliary_loss_mlp": 0.01266072, + "balance_loss_clip": 0.06271564, + "balance_loss_mlp": 0.01256196, + "epoch": 0.982293702089283, + "flos": 20819468424960.0, + "grad_norm": 1.6227440209505577, + "language_loss": 0.73327523, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.80998421, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09881592, + "step": 16338, + "time_per_iteration": 2.511469602584839 + }, + { + "auxiliary_loss_clip": 0.06401011, + "auxiliary_loss_mlp": 0.01264448, + "balance_loss_clip": 0.06268863, + "balance_loss_mlp": 0.01255096, + "epoch": 0.982353825341951, + "flos": 24687708963840.0, + "grad_norm": 1.5242122575774386, + "language_loss": 0.81808567, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.89474022, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09350586, + "step": 16339, + "time_per_iteration": 2.5525383949279785 + }, + { + "auxiliary_loss_clip": 0.06396718, + "auxiliary_loss_mlp": 0.01263936, + "balance_loss_clip": 0.06269798, + "balance_loss_mlp": 0.01254888, + "epoch": 0.982413948594619, + "flos": 20856840145920.0, + "grad_norm": 7.946616687424166, + "language_loss": 0.63168478, + "learning_rate": 3.232348386403405e-09, + "loss": 0.70829129, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.09051514, + "step": 16340, + "time_per_iteration": 3.950870990753174 + }, + { + "auxiliary_loss_clip": 0.06404588, + "auxiliary_loss_mlp": 0.01262603, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.0125318, + "epoch": 0.982474071847287, + "flos": 15382774834560.0, + "grad_norm": 2.1427722252854626, + "language_loss": 0.85878891, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.93546081, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09417725, + "step": 16341, + "time_per_iteration": 2.4532127380371094 + }, + { + "auxiliary_loss_clip": 0.06396417, + "auxiliary_loss_mlp": 0.0126733, + "balance_loss_clip": 0.06270136, + "balance_loss_mlp": 0.01258371, + "epoch": 0.9825341950999549, + "flos": 23782471378560.0, + "grad_norm": 1.3471902958727353, + "language_loss": 0.67131615, + "learning_rate": 3.188233008645014e-09, + "loss": 0.74795365, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.08959961, + "step": 16342, + "time_per_iteration": 2.611873149871826 + }, + { + "auxiliary_loss_clip": 0.06402282, + "auxiliary_loss_mlp": 0.01265067, + "balance_loss_clip": 0.0626906, + "balance_loss_mlp": 0.01256055, + "epoch": 0.9825943183526229, + "flos": 22752708226560.0, + "grad_norm": 1.4818959973540065, + "language_loss": 0.77696526, + "learning_rate": 3.16628889830195e-09, + "loss": 0.85363877, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09008789, + "step": 16343, + "time_per_iteration": 2.5588226318359375 + }, + { + "auxiliary_loss_clip": 0.06398541, + "auxiliary_loss_mlp": 0.01262034, + "balance_loss_clip": 0.06269187, + "balance_loss_mlp": 0.01253642, + "epoch": 0.9826544416052908, + "flos": 27717489221760.0, + "grad_norm": 1.4769865198658847, + "language_loss": 0.75333172, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.82993752, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08392334, + "step": 16344, + "time_per_iteration": 2.5765645503997803 + }, + { + "auxiliary_loss_clip": 0.06400666, + "auxiliary_loss_mlp": 0.01263473, + "balance_loss_clip": 0.0626943, + "balance_loss_mlp": 0.01253376, + "epoch": 0.9827145648579588, + "flos": 26948699210880.0, + "grad_norm": 1.922930318885977, + "language_loss": 0.67135489, + "learning_rate": 3.122627838848313e-09, + "loss": 0.74799621, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.10107422, + "step": 16345, + "time_per_iteration": 2.533918857574463 + }, + { + "auxiliary_loss_clip": 0.06396809, + "auxiliary_loss_mlp": 0.01261827, + "balance_loss_clip": 0.0627033, + "balance_loss_mlp": 0.01253537, + "epoch": 0.9827746881106267, + "flos": 21872138469120.0, + "grad_norm": 1.3537926665164286, + "language_loss": 0.79563165, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.87221801, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.08288574, + "step": 16346, + "time_per_iteration": 3.958854913711548 + }, + { + "auxiliary_loss_clip": 0.06413849, + "auxiliary_loss_mlp": 0.0126616, + "balance_loss_clip": 0.06275063, + "balance_loss_mlp": 0.01256176, + "epoch": 0.9828348113632948, + "flos": 20857175562240.0, + "grad_norm": 1.985745822904642, + "language_loss": 0.75521713, + "learning_rate": 3.079269666552031e-09, + "loss": 0.83201724, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.09991455, + "step": 16347, + "time_per_iteration": 2.535245656967163 + }, + { + "auxiliary_loss_clip": 0.06396177, + "auxiliary_loss_mlp": 0.01263212, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01254695, + "epoch": 0.9828949346159628, + "flos": 34577886735360.0, + "grad_norm": 2.740882984240411, + "language_loss": 0.6695146, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.74610847, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.08514404, + "step": 16348, + "time_per_iteration": 2.6063122749328613 + }, + { + "auxiliary_loss_clip": 0.06400393, + "auxiliary_loss_mlp": 0.01264818, + "balance_loss_clip": 0.06270978, + "balance_loss_mlp": 0.01254721, + "epoch": 0.9829550578686307, + "flos": 24463562492160.0, + "grad_norm": 1.859593683804768, + "language_loss": 0.69546545, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.7721175, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.10095215, + "step": 16349, + "time_per_iteration": 2.538785934448242 + }, + { + "auxiliary_loss_clip": 0.06395674, + "auxiliary_loss_mlp": 0.01264209, + "balance_loss_clip": 0.06270944, + "balance_loss_mlp": 0.01255292, + "epoch": 0.9830151811212987, + "flos": 16915784808960.0, + "grad_norm": 1.6752687286579624, + "language_loss": 0.75853312, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.835132, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.08905029, + "step": 16350, + "time_per_iteration": 2.46547269821167 + }, + { + "auxiliary_loss_clip": 0.06401215, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 0.06269281, + "balance_loss_mlp": 0.01256854, + "epoch": 0.9830753043739666, + "flos": 21294735183360.0, + "grad_norm": 1.9963831633917941, + "language_loss": 0.84258103, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.91925597, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09429932, + "step": 16351, + "time_per_iteration": 2.572175979614258 + }, + { + "auxiliary_loss_clip": 0.06398397, + "auxiliary_loss_mlp": 0.01260592, + "balance_loss_clip": 0.06267038, + "balance_loss_mlp": 0.01251526, + "epoch": 0.9831354276266346, + "flos": 31731736700160.0, + "grad_norm": 1.4942934246036372, + "language_loss": 0.6857751, + "learning_rate": 2.972199410170795e-09, + "loss": 0.76236498, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09069824, + "step": 16352, + "time_per_iteration": 2.5960402488708496 + }, + { + "auxiliary_loss_clip": 0.06403258, + "auxiliary_loss_mlp": 0.01261007, + "balance_loss_clip": 0.06273116, + "balance_loss_mlp": 0.01252054, + "epoch": 0.9831955508793025, + "flos": 21625923646080.0, + "grad_norm": 1.3954339265151765, + "language_loss": 0.6703254, + "learning_rate": 2.951012538143782e-09, + "loss": 0.74696803, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08944702, + "step": 16353, + "time_per_iteration": 2.5353140830993652 + }, + { + "auxiliary_loss_clip": 0.06395429, + "auxiliary_loss_mlp": 0.01264292, + "balance_loss_clip": 0.06268495, + "balance_loss_mlp": 0.0125559, + "epoch": 0.9832556741319706, + "flos": 22975177616640.0, + "grad_norm": 1.6379749253440405, + "language_loss": 0.74751425, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.82411146, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08703613, + "step": 16354, + "time_per_iteration": 2.508065700531006 + }, + { + "auxiliary_loss_clip": 0.06398819, + "auxiliary_loss_mlp": 0.01263889, + "balance_loss_clip": 0.06268892, + "balance_loss_mlp": 0.0125468, + "epoch": 0.9833157973846385, + "flos": 21330178260480.0, + "grad_norm": 2.1076687660597644, + "language_loss": 0.77908456, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.85571158, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.09216309, + "step": 16355, + "time_per_iteration": 2.508748769760132 + }, + { + "auxiliary_loss_clip": 0.06398673, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 0.06269018, + "balance_loss_mlp": 0.01255743, + "epoch": 0.9833759206373065, + "flos": 21074991050880.0, + "grad_norm": 1.7510865399500044, + "language_loss": 0.73771065, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.81434226, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.08746338, + "step": 16356, + "time_per_iteration": 2.4964609146118164 + }, + { + "auxiliary_loss_clip": 0.06397355, + "auxiliary_loss_mlp": 0.01264905, + "balance_loss_clip": 0.06268449, + "balance_loss_mlp": 0.01256, + "epoch": 0.9834360438899744, + "flos": 18703227306240.0, + "grad_norm": 1.7922829245383989, + "language_loss": 0.76294625, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.83956885, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08911133, + "step": 16357, + "time_per_iteration": 2.508512258529663 + }, + { + "auxiliary_loss_clip": 0.0640227, + "auxiliary_loss_mlp": 0.01263006, + "balance_loss_clip": 0.06272359, + "balance_loss_mlp": 0.01253743, + "epoch": 0.9834961671426424, + "flos": 21111524231040.0, + "grad_norm": 2.0027677805382953, + "language_loss": 0.80176306, + "learning_rate": 2.846214118442436e-09, + "loss": 0.87841582, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09259033, + "step": 16358, + "time_per_iteration": 2.4842851161956787 + }, + { + "auxiliary_loss_clip": 0.06400406, + "auxiliary_loss_mlp": 0.01262987, + "balance_loss_clip": 0.06269883, + "balance_loss_mlp": 0.01254094, + "epoch": 0.9835562903953103, + "flos": 26694853666560.0, + "grad_norm": 2.106405637853541, + "language_loss": 0.67995811, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.75659204, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08886719, + "step": 16359, + "time_per_iteration": 2.543684720993042 + }, + { + "auxiliary_loss_clip": 0.06396379, + "auxiliary_loss_mlp": 0.01260995, + "balance_loss_clip": 0.06268568, + "balance_loss_mlp": 0.01252264, + "epoch": 0.9836164136479784, + "flos": 22096578430080.0, + "grad_norm": 1.5676577636482238, + "language_loss": 0.69622505, + "learning_rate": 2.804824870920264e-09, + "loss": 0.77279884, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08728027, + "step": 16360, + "time_per_iteration": 2.5693228244781494 + }, + { + "auxiliary_loss_clip": 0.06402056, + "auxiliary_loss_mlp": 0.01263576, + "balance_loss_clip": 0.06269471, + "balance_loss_mlp": 0.0125389, + "epoch": 0.9836765369006463, + "flos": 23885194884480.0, + "grad_norm": 1.682194458725563, + "language_loss": 0.8439554, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.92061168, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09674072, + "step": 16361, + "time_per_iteration": 2.560330390930176 + }, + { + "auxiliary_loss_clip": 0.06402538, + "auxiliary_loss_mlp": 0.01263822, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01255251, + "epoch": 0.9837366601533143, + "flos": 25851529848960.0, + "grad_norm": 1.6385001954034184, + "language_loss": 0.7628051, + "learning_rate": 2.76373855876022e-09, + "loss": 0.83946872, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.08575439, + "step": 16362, + "time_per_iteration": 2.5176074504852295 + }, + { + "auxiliary_loss_clip": 0.06398503, + "auxiliary_loss_mlp": 0.01266553, + "balance_loss_clip": 0.06268647, + "balance_loss_mlp": 0.0125685, + "epoch": 0.9837967834059823, + "flos": 21363902328960.0, + "grad_norm": 1.5985135435768925, + "language_loss": 0.71467978, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.79133034, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.0970459, + "step": 16363, + "time_per_iteration": 3.905139684677124 + }, + { + "auxiliary_loss_clip": 0.06394857, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 0.06269969, + "balance_loss_mlp": 0.01256219, + "epoch": 0.9838569066586502, + "flos": 18521819216640.0, + "grad_norm": 1.6859812607317168, + "language_loss": 0.63076383, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.70735937, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.08477783, + "step": 16364, + "time_per_iteration": 2.5008041858673096 + }, + { + "auxiliary_loss_clip": 0.06401549, + "auxiliary_loss_mlp": 0.01262269, + "balance_loss_clip": 0.0627073, + "balance_loss_mlp": 0.01253793, + "epoch": 0.9839170299113182, + "flos": 22458430287360.0, + "grad_norm": 1.9025940336475926, + "language_loss": 0.75345969, + "learning_rate": 2.702677107943252e-09, + "loss": 0.83009791, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.0847168, + "step": 16365, + "time_per_iteration": 2.552847146987915 + }, + { + "auxiliary_loss_clip": 0.06399475, + "auxiliary_loss_mlp": 0.01264327, + "balance_loss_clip": 0.0627087, + "balance_loss_mlp": 0.01255554, + "epoch": 0.9839771531639862, + "flos": 27899861633280.0, + "grad_norm": 1.6224580462196883, + "language_loss": 0.76744139, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.84407938, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08776855, + "step": 16366, + "time_per_iteration": 2.5814366340637207 + }, + { + "auxiliary_loss_clip": 0.06397621, + "auxiliary_loss_mlp": 0.0126483, + "balance_loss_clip": 0.06269, + "balance_loss_mlp": 0.01255752, + "epoch": 0.9840372764166542, + "flos": 28221071460480.0, + "grad_norm": 1.5360929282556393, + "language_loss": 0.77089232, + "learning_rate": 2.662348161352357e-09, + "loss": 0.84751683, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.09075928, + "step": 16367, + "time_per_iteration": 2.548718214035034 + }, + { + "auxiliary_loss_clip": 0.06398439, + "auxiliary_loss_mlp": 0.01263987, + "balance_loss_clip": 0.06268852, + "balance_loss_mlp": 0.01254933, + "epoch": 0.9840973996693221, + "flos": 23410682812800.0, + "grad_norm": 1.451840758159792, + "language_loss": 0.61724389, + "learning_rate": 2.642297296540974e-09, + "loss": 0.69386816, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09057617, + "step": 16368, + "time_per_iteration": 2.532034158706665 + }, + { + "auxiliary_loss_clip": 0.06396456, + "auxiliary_loss_mlp": 0.01267037, + "balance_loss_clip": 0.06270956, + "balance_loss_mlp": 0.01258698, + "epoch": 0.9841575229219901, + "flos": 21401986809600.0, + "grad_norm": 1.5041768156140347, + "language_loss": 0.6552428, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.73187768, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08343506, + "step": 16369, + "time_per_iteration": 2.520037889480591 + }, + { + "auxiliary_loss_clip": 0.06402774, + "auxiliary_loss_mlp": 0.0126442, + "balance_loss_clip": 0.06270762, + "balance_loss_mlp": 0.01254126, + "epoch": 0.984217646174658, + "flos": 24471277067520.0, + "grad_norm": 1.753415617022144, + "language_loss": 0.68846416, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.76513612, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10296631, + "step": 16370, + "time_per_iteration": 2.540699005126953 + }, + { + "auxiliary_loss_clip": 0.0640409, + "auxiliary_loss_mlp": 0.01266605, + "balance_loss_clip": 0.06269194, + "balance_loss_mlp": 0.01256246, + "epoch": 0.984277769427326, + "flos": 16440559977600.0, + "grad_norm": 2.157249724896927, + "language_loss": 0.73935145, + "learning_rate": 2.582599145159792e-09, + "loss": 0.8160584, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10357666, + "step": 16371, + "time_per_iteration": 2.454529047012329 + }, + { + "auxiliary_loss_clip": 0.06309754, + "auxiliary_loss_mlp": 0.01249704, + "balance_loss_clip": 0.06255664, + "balance_loss_mlp": 0.01248747, + "epoch": 0.9843378926799939, + "flos": 64551487939200.0, + "grad_norm": 0.7685676506536336, + "language_loss": 0.64979422, + "learning_rate": 2.562851244898745e-09, + "loss": 0.72538882, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00955963, + "step": 16372, + "time_per_iteration": 4.500819206237793 + }, + { + "auxiliary_loss_clip": 0.0639531, + "auxiliary_loss_mlp": 0.0126257, + "balance_loss_clip": 0.0626704, + "balance_loss_mlp": 0.01253796, + "epoch": 0.984398015932662, + "flos": 17388326309760.0, + "grad_norm": 1.6860490980606475, + "language_loss": 0.71169502, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.78827381, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.087677, + "step": 16373, + "time_per_iteration": 2.457385301589966 + }, + { + "auxiliary_loss_clip": 0.06398892, + "auxiliary_loss_mlp": 0.01265678, + "balance_loss_clip": 0.06270701, + "balance_loss_mlp": 0.01256857, + "epoch": 0.9844581391853299, + "flos": 23885991498240.0, + "grad_norm": 1.5447539198468738, + "language_loss": 0.81465459, + "learning_rate": 2.523582674173186e-09, + "loss": 0.89130032, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08825684, + "step": 16374, + "time_per_iteration": 2.5521185398101807 + }, + { + "auxiliary_loss_clip": 0.06403422, + "auxiliary_loss_mlp": 0.01265136, + "balance_loss_clip": 0.06271537, + "balance_loss_mlp": 0.01256016, + "epoch": 0.9845182624379979, + "flos": 19871534384640.0, + "grad_norm": 1.7001768463921554, + "language_loss": 0.69477171, + "learning_rate": 2.504062005197927e-09, + "loss": 0.77145725, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09112549, + "step": 16375, + "time_per_iteration": 2.4824092388153076 + }, + { + "auxiliary_loss_clip": 0.06405924, + "auxiliary_loss_mlp": 0.01263771, + "balance_loss_clip": 0.06271198, + "balance_loss_mlp": 0.01254246, + "epoch": 0.9845783856906659, + "flos": 28261839271680.0, + "grad_norm": 1.9798268500878542, + "language_loss": 0.80762142, + "learning_rate": 2.484617081468521e-09, + "loss": 0.88431835, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09515381, + "step": 16376, + "time_per_iteration": 2.564424753189087 + }, + { + "auxiliary_loss_clip": 0.06399219, + "auxiliary_loss_mlp": 0.01263402, + "balance_loss_clip": 0.06270926, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9846385089433338, + "flos": 28335702245760.0, + "grad_norm": 1.4082081602945489, + "language_loss": 0.62552863, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.70215487, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.09429932, + "step": 16377, + "time_per_iteration": 2.58390212059021 + }, + { + "auxiliary_loss_clip": 0.06403971, + "auxiliary_loss_mlp": 0.01265767, + "balance_loss_clip": 0.06272383, + "balance_loss_mlp": 0.01256588, + "epoch": 0.9846986321960018, + "flos": 24323718827520.0, + "grad_norm": 1.541732057428472, + "language_loss": 0.73141658, + "learning_rate": 2.445954472695133e-09, + "loss": 0.80811405, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.09173584, + "step": 16378, + "time_per_iteration": 2.5272939205169678 + }, + { + "auxiliary_loss_clip": 0.06401136, + "auxiliary_loss_mlp": 0.01265891, + "balance_loss_clip": 0.06269161, + "balance_loss_mlp": 0.01256461, + "epoch": 0.9847587554486698, + "flos": 27279426476160.0, + "grad_norm": 1.6453729293875299, + "language_loss": 0.71287769, + "learning_rate": 2.426736789116868e-09, + "loss": 0.78954792, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09429932, + "step": 16379, + "time_per_iteration": 3.9946951866149902 + }, + { + "auxiliary_loss_clip": 0.06402892, + "auxiliary_loss_mlp": 0.01264316, + "balance_loss_clip": 0.06270932, + "balance_loss_mlp": 0.01254589, + "epoch": 0.9848188787013378, + "flos": 16547937384960.0, + "grad_norm": 1.675981927204607, + "language_loss": 0.68351865, + "learning_rate": 2.407594853716999e-09, + "loss": 0.76019073, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09719849, + "step": 16380, + "time_per_iteration": 2.525541305541992 + }, + { + "auxiliary_loss_clip": 0.06406681, + "auxiliary_loss_mlp": 0.01265103, + "balance_loss_clip": 0.06270894, + "balance_loss_mlp": 0.01255358, + "epoch": 0.9848790019540057, + "flos": 20199871808640.0, + "grad_norm": 1.8917613358360588, + "language_loss": 0.78484976, + "learning_rate": 2.38852866722139e-09, + "loss": 0.86156762, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.09741211, + "step": 16381, + "time_per_iteration": 2.5284276008605957 + }, + { + "auxiliary_loss_clip": 0.06401529, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 0.06269079, + "balance_loss_mlp": 0.01254101, + "epoch": 0.9849391252066737, + "flos": 28267750984320.0, + "grad_norm": 1.3772384607089387, + "language_loss": 0.82476425, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.90141863, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.0980835, + "step": 16382, + "time_per_iteration": 2.5919766426086426 + }, + { + "auxiliary_loss_clip": 0.06408627, + "auxiliary_loss_mlp": 0.01264361, + "balance_loss_clip": 0.06271975, + "balance_loss_mlp": 0.0125492, + "epoch": 0.9849992484593416, + "flos": 22461407107200.0, + "grad_norm": 1.6817529475209232, + "language_loss": 0.74892008, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.82564998, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09436035, + "step": 16383, + "time_per_iteration": 2.514427900314331 + }, + { + "auxiliary_loss_clip": 0.06402783, + "auxiliary_loss_mlp": 0.01265978, + "balance_loss_clip": 0.06272022, + "balance_loss_mlp": 0.01256644, + "epoch": 0.9850593717120096, + "flos": 34505994332160.0, + "grad_norm": 1.417993131097162, + "language_loss": 0.66312635, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.73981392, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09332275, + "step": 16384, + "time_per_iteration": 2.6268670558929443 + }, + { + "auxiliary_loss_clip": 0.06408171, + "auxiliary_loss_mlp": 0.01266699, + "balance_loss_clip": 0.0627324, + "balance_loss_mlp": 0.01256679, + "epoch": 0.9851194949646775, + "flos": 38846524809600.0, + "grad_norm": 2.0407585132753474, + "language_loss": 0.70862484, + "learning_rate": 2.313021424697359e-09, + "loss": 0.78537351, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.10021973, + "step": 16385, + "time_per_iteration": 2.644968032836914 + }, + { + "auxiliary_loss_clip": 0.06406495, + "auxiliary_loss_mlp": 0.01267976, + "balance_loss_clip": 0.06273443, + "balance_loss_mlp": 0.01258511, + "epoch": 0.9851796182173456, + "flos": 17718215034240.0, + "grad_norm": 1.8403638705762766, + "language_loss": 0.81630373, + "learning_rate": 2.294333993509978e-09, + "loss": 0.89304841, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.09460449, + "step": 16386, + "time_per_iteration": 3.917997360229492 + }, + { + "auxiliary_loss_clip": 0.06405159, + "auxiliary_loss_mlp": 0.01265158, + "balance_loss_clip": 0.06271283, + "balance_loss_mlp": 0.01255127, + "epoch": 0.9852397414700135, + "flos": 27461756960640.0, + "grad_norm": 1.9733443741817431, + "language_loss": 0.67915004, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.75585318, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.10040283, + "step": 16387, + "time_per_iteration": 2.5964622497558594 + }, + { + "auxiliary_loss_clip": 0.06393988, + "auxiliary_loss_mlp": 0.01264067, + "balance_loss_clip": 0.06269428, + "balance_loss_mlp": 0.01255752, + "epoch": 0.9852998647226815, + "flos": 18302662062720.0, + "grad_norm": 1.6277320463659288, + "language_loss": 0.74601555, + "learning_rate": 2.257186391438237e-09, + "loss": 0.82259607, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.08312988, + "step": 16388, + "time_per_iteration": 2.5200042724609375 + }, + { + "auxiliary_loss_clip": 0.06399764, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 0.06269439, + "balance_loss_mlp": 0.01257291, + "epoch": 0.9853599879753495, + "flos": 19648058745600.0, + "grad_norm": 1.5789948007972028, + "language_loss": 0.82318109, + "learning_rate": 2.238726221962528e-09, + "loss": 0.89984477, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09313965, + "step": 16389, + "time_per_iteration": 2.5028319358825684 + }, + { + "auxiliary_loss_clip": 0.06399673, + "auxiliary_loss_mlp": 0.012661, + "balance_loss_clip": 0.06269571, + "balance_loss_mlp": 0.01257118, + "epoch": 0.9854201112280174, + "flos": 23848745558400.0, + "grad_norm": 1.9542914856542009, + "language_loss": 0.67416507, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.75082278, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08978271, + "step": 16390, + "time_per_iteration": 2.524301052093506 + }, + { + "auxiliary_loss_clip": 0.06404354, + "auxiliary_loss_mlp": 0.01263587, + "balance_loss_clip": 0.0627258, + "balance_loss_mlp": 0.01253514, + "epoch": 0.9854802344806854, + "flos": 30088330571520.0, + "grad_norm": 1.5456986712452574, + "language_loss": 0.77386737, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.85054678, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.10076904, + "step": 16391, + "time_per_iteration": 2.5559659004211426 + }, + { + "auxiliary_loss_clip": 0.06395002, + "auxiliary_loss_mlp": 0.01266032, + "balance_loss_clip": 0.06271442, + "balance_loss_mlp": 0.01257074, + "epoch": 0.9855403577333534, + "flos": 21913744821120.0, + "grad_norm": 1.673902135646454, + "language_loss": 0.68136293, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.75797331, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.08966064, + "step": 16392, + "time_per_iteration": 2.5317015647888184 + }, + { + "auxiliary_loss_clip": 0.06408426, + "auxiliary_loss_mlp": 0.01263266, + "balance_loss_clip": 0.06270889, + "balance_loss_mlp": 0.01252859, + "epoch": 0.9856004809860214, + "flos": 15419182233600.0, + "grad_norm": 1.7111314079552304, + "language_loss": 0.56011736, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.63683426, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.10400391, + "step": 16393, + "time_per_iteration": 2.476515293121338 + }, + { + "auxiliary_loss_clip": 0.06411494, + "auxiliary_loss_mlp": 0.01265344, + "balance_loss_clip": 0.06273687, + "balance_loss_mlp": 0.01255861, + "epoch": 0.9856606042386893, + "flos": 13656742980480.0, + "grad_norm": 3.1315340219077794, + "language_loss": 0.79706287, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.87383127, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.09484863, + "step": 16394, + "time_per_iteration": 2.4626893997192383 + }, + { + "auxiliary_loss_clip": 0.06404269, + "auxiliary_loss_mlp": 0.01262883, + "balance_loss_clip": 0.06270118, + "balance_loss_mlp": 0.01253221, + "epoch": 0.9857207274913573, + "flos": 23486222868480.0, + "grad_norm": 1.467210916610673, + "language_loss": 0.76540744, + "learning_rate": 2.129556090869178e-09, + "loss": 0.84207892, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.09667969, + "step": 16395, + "time_per_iteration": 2.525965452194214 + }, + { + "auxiliary_loss_clip": 0.06400509, + "auxiliary_loss_mlp": 0.0126337, + "balance_loss_clip": 0.06270809, + "balance_loss_mlp": 0.01254554, + "epoch": 0.9857808507440252, + "flos": 21071217617280.0, + "grad_norm": 1.8150456310357506, + "language_loss": 0.75683588, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.83347464, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08813477, + "step": 16396, + "time_per_iteration": 2.567934513092041 + }, + { + "auxiliary_loss_clip": 0.06400032, + "auxiliary_loss_mlp": 0.01263122, + "balance_loss_clip": 0.06269535, + "balance_loss_mlp": 0.01254121, + "epoch": 0.9858409739966932, + "flos": 25308395683200.0, + "grad_norm": 1.3662476334903952, + "language_loss": 0.71217585, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.78880739, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.08996582, + "step": 16397, + "time_per_iteration": 2.5428028106689453 + }, + { + "auxiliary_loss_clip": 0.06395601, + "auxiliary_loss_mlp": 0.01262092, + "balance_loss_clip": 0.06271599, + "balance_loss_mlp": 0.01253616, + "epoch": 0.9859010972493611, + "flos": 20565077829120.0, + "grad_norm": 1.5361350556521405, + "language_loss": 0.71496713, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.79154408, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.08477783, + "step": 16398, + "time_per_iteration": 2.5399317741394043 + }, + { + "auxiliary_loss_clip": 0.06398591, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 0.06270011, + "balance_loss_mlp": 0.01253434, + "epoch": 0.9859612205020292, + "flos": 24762075062400.0, + "grad_norm": 1.3521426462373807, + "language_loss": 0.74462658, + "learning_rate": 2.058291183208771e-09, + "loss": 0.82123244, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.08563232, + "step": 16399, + "time_per_iteration": 2.5816903114318848 + }, + { + "auxiliary_loss_clip": 0.06400129, + "auxiliary_loss_mlp": 0.01264452, + "balance_loss_clip": 0.06268509, + "balance_loss_mlp": 0.0125535, + "epoch": 0.9860213437546971, + "flos": 21112236990720.0, + "grad_norm": 3.4281947603629495, + "language_loss": 0.57744968, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.65409541, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09106445, + "step": 16400, + "time_per_iteration": 2.5620059967041016 + }, + { + "auxiliary_loss_clip": 0.06412265, + "auxiliary_loss_mlp": 0.0126489, + "balance_loss_clip": 0.0627275, + "balance_loss_mlp": 0.01254585, + "epoch": 0.9860814670073651, + "flos": 19142212446720.0, + "grad_norm": 1.6519096165686342, + "language_loss": 0.81009173, + "learning_rate": 2.023113299582491e-09, + "loss": 0.88686335, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.10308838, + "step": 16401, + "time_per_iteration": 2.6548011302948 + }, + { + "auxiliary_loss_clip": 0.06398042, + "auxiliary_loss_mlp": 0.01263271, + "balance_loss_clip": 0.06269659, + "balance_loss_mlp": 0.01253371, + "epoch": 0.9861415902600331, + "flos": 17242570932480.0, + "grad_norm": 1.9964613223358685, + "language_loss": 0.78200734, + "learning_rate": 2.005638002662069e-09, + "loss": 0.85862046, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.09899902, + "step": 16402, + "time_per_iteration": 2.594348430633545 + }, + { + "auxiliary_loss_clip": 0.06402256, + "auxiliary_loss_mlp": 0.01262163, + "balance_loss_clip": 0.06270587, + "balance_loss_mlp": 0.01252978, + "epoch": 0.986201713512701, + "flos": 27790052457600.0, + "grad_norm": 1.7160674070535198, + "language_loss": 0.70323497, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.77987915, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09185791, + "step": 16403, + "time_per_iteration": 4.039694547653198 + }, + { + "auxiliary_loss_clip": 0.06401487, + "auxiliary_loss_mlp": 0.01264425, + "balance_loss_clip": 0.06270707, + "balance_loss_mlp": 0.01255705, + "epoch": 0.986261836765369, + "flos": 28737902643840.0, + "grad_norm": 1.7955118608228118, + "language_loss": 0.74658298, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.82324219, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08721924, + "step": 16404, + "time_per_iteration": 2.539034366607666 + }, + { + "auxiliary_loss_clip": 0.06399557, + "auxiliary_loss_mlp": 0.0126528, + "balance_loss_clip": 0.06267157, + "balance_loss_mlp": 0.01255934, + "epoch": 0.986321960018037, + "flos": 34322028693120.0, + "grad_norm": 1.643480275660223, + "language_loss": 0.70353627, + "learning_rate": 1.953666699415768e-09, + "loss": 0.78018463, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09344482, + "step": 16405, + "time_per_iteration": 2.6273982524871826 + }, + { + "auxiliary_loss_clip": 0.06396019, + "auxiliary_loss_mlp": 0.01263846, + "balance_loss_clip": 0.06269442, + "balance_loss_mlp": 0.01255764, + "epoch": 0.986382083270705, + "flos": 25196406301440.0, + "grad_norm": 1.6118450408666642, + "language_loss": 0.69949228, + "learning_rate": 1.93649446302846e-09, + "loss": 0.77609086, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08087158, + "step": 16406, + "time_per_iteration": 2.5140862464904785 + }, + { + "auxiliary_loss_clip": 0.06398158, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 0.06270266, + "balance_loss_mlp": 0.01258655, + "epoch": 0.9864422065233729, + "flos": 11028953485440.0, + "grad_norm": 2.6977989926594597, + "language_loss": 0.75664067, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.83330011, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09130859, + "step": 16407, + "time_per_iteration": 2.4719793796539307 + }, + { + "auxiliary_loss_clip": 0.06401893, + "auxiliary_loss_mlp": 0.0126585, + "balance_loss_clip": 0.06271636, + "balance_loss_mlp": 0.01256402, + "epoch": 0.9865023297760409, + "flos": 16551291548160.0, + "grad_norm": 1.7687262607764567, + "language_loss": 0.78086448, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.85754192, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09448242, + "step": 16408, + "time_per_iteration": 2.484081983566284 + }, + { + "auxiliary_loss_clip": 0.06408665, + "auxiliary_loss_mlp": 0.01263338, + "balance_loss_clip": 0.06272249, + "balance_loss_mlp": 0.0125243, + "epoch": 0.9865624530287088, + "flos": 18886186696320.0, + "grad_norm": 1.6749403374040852, + "language_loss": 0.68618417, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.76290423, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.10900879, + "step": 16409, + "time_per_iteration": 2.57738995552063 + }, + { + "auxiliary_loss_clip": 0.06307763, + "auxiliary_loss_mlp": 0.01249973, + "balance_loss_clip": 0.06253904, + "balance_loss_mlp": 0.01248934, + "epoch": 0.9866225762813768, + "flos": 68905869068160.0, + "grad_norm": 0.7792419194004762, + "language_loss": 0.60673237, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.68230969, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01039124, + "step": 16410, + "time_per_iteration": 3.1789920330047607 + }, + { + "auxiliary_loss_clip": 0.06402837, + "auxiliary_loss_mlp": 0.0126605, + "balance_loss_clip": 0.06270561, + "balance_loss_mlp": 0.01256507, + "epoch": 0.9866826995340447, + "flos": 29030796990720.0, + "grad_norm": 1.9710910309404892, + "language_loss": 0.66693377, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.74362266, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09552002, + "step": 16411, + "time_per_iteration": 2.593170166015625 + }, + { + "auxiliary_loss_clip": 0.06309229, + "auxiliary_loss_mlp": 0.012507, + "balance_loss_clip": 0.06255028, + "balance_loss_mlp": 0.01249633, + "epoch": 0.9867428227867128, + "flos": 65399004460800.0, + "grad_norm": 0.7100176404553015, + "language_loss": 0.56223959, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.63783884, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01068115, + "step": 16412, + "time_per_iteration": 4.65021014213562 + }, + { + "auxiliary_loss_clip": 0.06410616, + "auxiliary_loss_mlp": 0.01264203, + "balance_loss_clip": 0.06273398, + "balance_loss_mlp": 0.01253856, + "epoch": 0.9868029460393807, + "flos": 26513697139200.0, + "grad_norm": 1.4737285322847526, + "language_loss": 0.73170412, + "learning_rate": 1.818410313934926e-09, + "loss": 0.80845225, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.10345459, + "step": 16413, + "time_per_iteration": 2.5816121101379395 + }, + { + "auxiliary_loss_clip": 0.0640188, + "auxiliary_loss_mlp": 0.01265077, + "balance_loss_clip": 0.06269288, + "balance_loss_mlp": 0.01255695, + "epoch": 0.9868630692920487, + "flos": 22974087513600.0, + "grad_norm": 1.3119989471392648, + "language_loss": 0.71715784, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.79382741, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09381104, + "step": 16414, + "time_per_iteration": 2.621397018432617 + }, + { + "auxiliary_loss_clip": 0.0639587, + "auxiliary_loss_mlp": 0.01265336, + "balance_loss_clip": 0.06269387, + "balance_loss_mlp": 0.01256461, + "epoch": 0.9869231925447167, + "flos": 19834833496320.0, + "grad_norm": 1.641333270842883, + "language_loss": 0.70467007, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.78128219, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08880615, + "step": 16415, + "time_per_iteration": 2.472790241241455 + }, + { + "auxiliary_loss_clip": 0.06392305, + "auxiliary_loss_mlp": 0.01261183, + "balance_loss_clip": 0.06268395, + "balance_loss_mlp": 0.01252636, + "epoch": 0.9869833157973846, + "flos": 20201716598400.0, + "grad_norm": 1.4440519411439314, + "language_loss": 0.75557512, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.83210999, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.08551025, + "step": 16416, + "time_per_iteration": 2.5069968700408936 + }, + { + "auxiliary_loss_clip": 0.06400134, + "auxiliary_loss_mlp": 0.01266213, + "balance_loss_clip": 0.062718, + "balance_loss_mlp": 0.01256843, + "epoch": 0.9870434390500527, + "flos": 16103753291520.0, + "grad_norm": 1.8261025745727175, + "language_loss": 0.70445406, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.78111756, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09368896, + "step": 16417, + "time_per_iteration": 2.4517784118652344 + }, + { + "auxiliary_loss_clip": 0.06412635, + "auxiliary_loss_mlp": 0.01265538, + "balance_loss_clip": 0.06275108, + "balance_loss_mlp": 0.01255513, + "epoch": 0.9871035623027206, + "flos": 21766941267840.0, + "grad_norm": 1.4145230092930503, + "language_loss": 0.70816773, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.78494942, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.10021973, + "step": 16418, + "time_per_iteration": 2.5196893215179443 + }, + { + "auxiliary_loss_clip": 0.06306736, + "auxiliary_loss_mlp": 0.01250685, + "balance_loss_clip": 0.06252833, + "balance_loss_mlp": 0.0124971, + "epoch": 0.9871636855553886, + "flos": 70240936970880.0, + "grad_norm": 0.6409677987917212, + "language_loss": 0.53744692, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.61302114, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.00975037, + "step": 16419, + "time_per_iteration": 4.717554330825806 + }, + { + "auxiliary_loss_clip": 0.06404417, + "auxiliary_loss_mlp": 0.01264412, + "balance_loss_clip": 0.06268717, + "balance_loss_mlp": 0.01254464, + "epoch": 0.9872238088080565, + "flos": 25052789203200.0, + "grad_norm": 2.3110174767600635, + "language_loss": 0.78357494, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.86026323, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.0994873, + "step": 16420, + "time_per_iteration": 2.522343635559082 + }, + { + "auxiliary_loss_clip": 0.06399032, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 0.06271401, + "balance_loss_mlp": 0.01256126, + "epoch": 0.9872839320607245, + "flos": 19472268879360.0, + "grad_norm": 1.4612255153298364, + "language_loss": 0.7113086, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.7879504, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09020996, + "step": 16421, + "time_per_iteration": 2.5539984703063965 + }, + { + "auxiliary_loss_clip": 0.06407337, + "auxiliary_loss_mlp": 0.01266413, + "balance_loss_clip": 0.06271101, + "balance_loss_mlp": 0.0125659, + "epoch": 0.9873440553133924, + "flos": 26950166657280.0, + "grad_norm": 1.8769291751528816, + "language_loss": 0.82184935, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.89858687, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09820557, + "step": 16422, + "time_per_iteration": 2.5518367290496826 + }, + { + "auxiliary_loss_clip": 0.06399193, + "auxiliary_loss_mlp": 0.01263419, + "balance_loss_clip": 0.0627217, + "balance_loss_mlp": 0.01254884, + "epoch": 0.9874041785660604, + "flos": 19068181764480.0, + "grad_norm": 1.559911203458106, + "language_loss": 0.85809267, + "learning_rate": 1.656159280223779e-09, + "loss": 0.93471885, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.08538818, + "step": 16423, + "time_per_iteration": 2.511932849884033 + }, + { + "auxiliary_loss_clip": 0.06401457, + "auxiliary_loss_mlp": 0.01264252, + "balance_loss_clip": 0.06268983, + "balance_loss_mlp": 0.01255144, + "epoch": 0.9874643018187284, + "flos": 21112195063680.0, + "grad_norm": 1.7455614219935738, + "language_loss": 0.70705903, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.78371602, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09106445, + "step": 16424, + "time_per_iteration": 2.5012552738189697 + }, + { + "auxiliary_loss_clip": 0.0640029, + "auxiliary_loss_mlp": 0.01266657, + "balance_loss_clip": 0.062686, + "balance_loss_mlp": 0.01257197, + "epoch": 0.9875244250713964, + "flos": 24432982951680.0, + "grad_norm": 1.9007162164582931, + "language_loss": 0.81031597, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.88698548, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09466553, + "step": 16425, + "time_per_iteration": 2.5602309703826904 + }, + { + "auxiliary_loss_clip": 0.06403489, + "auxiliary_loss_mlp": 0.01265933, + "balance_loss_clip": 0.06270744, + "balance_loss_mlp": 0.01255884, + "epoch": 0.9875845483240643, + "flos": 25124388117120.0, + "grad_norm": 1.972243539520393, + "language_loss": 0.80218101, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.87887526, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.10046387, + "step": 16426, + "time_per_iteration": 4.0034801959991455 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01268645, + "balance_loss_clip": 0.06273003, + "balance_loss_mlp": 0.01258763, + "epoch": 0.9876446715767323, + "flos": 16587447384960.0, + "grad_norm": 1.730891223738535, + "language_loss": 0.84535158, + "learning_rate": 1.593380599750338e-09, + "loss": 0.92205, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.09887695, + "step": 16427, + "time_per_iteration": 2.5224578380584717 + }, + { + "auxiliary_loss_clip": 0.06397066, + "auxiliary_loss_mlp": 0.01263748, + "balance_loss_clip": 0.06267832, + "balance_loss_mlp": 0.01254218, + "epoch": 0.9877047948294003, + "flos": 21622527555840.0, + "grad_norm": 1.6150790821834389, + "language_loss": 0.70599663, + "learning_rate": 1.577875377599458e-09, + "loss": 0.78260475, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.09527588, + "step": 16428, + "time_per_iteration": 2.530439615249634 + }, + { + "auxiliary_loss_clip": 0.06398337, + "auxiliary_loss_mlp": 0.01265208, + "balance_loss_clip": 0.06270449, + "balance_loss_mlp": 0.01256386, + "epoch": 0.9877649180820682, + "flos": 21184842153600.0, + "grad_norm": 1.9151863241472484, + "language_loss": 0.80755043, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.88418591, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.0881958, + "step": 16429, + "time_per_iteration": 2.5248844623565674 + }, + { + "auxiliary_loss_clip": 0.06398588, + "auxiliary_loss_mlp": 0.01266267, + "balance_loss_clip": 0.06267557, + "balance_loss_mlp": 0.01256724, + "epoch": 0.9878250413347363, + "flos": 39758596502400.0, + "grad_norm": 1.529650874257726, + "language_loss": 0.62086964, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.69751823, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09545898, + "step": 16430, + "time_per_iteration": 2.6865828037261963 + }, + { + "auxiliary_loss_clip": 0.06405398, + "auxiliary_loss_mlp": 0.01268313, + "balance_loss_clip": 0.06273668, + "balance_loss_mlp": 0.01259044, + "epoch": 0.9878851645874042, + "flos": 29433584367360.0, + "grad_norm": 1.294361870195289, + "language_loss": 0.73193979, + "learning_rate": 1.531814395687725e-09, + "loss": 0.8086769, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09277344, + "step": 16431, + "time_per_iteration": 2.584623336791992 + }, + { + "auxiliary_loss_clip": 0.06408115, + "auxiliary_loss_mlp": 0.01268719, + "balance_loss_clip": 0.06277065, + "balance_loss_mlp": 0.01259021, + "epoch": 0.9879452878400722, + "flos": 15810230039040.0, + "grad_norm": 2.003563247379043, + "language_loss": 0.80578899, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.88255733, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.09698486, + "step": 16432, + "time_per_iteration": 2.4977803230285645 + }, + { + "auxiliary_loss_clip": 0.0639713, + "auxiliary_loss_mlp": 0.01263453, + "balance_loss_clip": 0.06268157, + "balance_loss_mlp": 0.01255121, + "epoch": 0.9880054110927401, + "flos": 22239985893120.0, + "grad_norm": 1.8780022898088136, + "language_loss": 0.80855387, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.88515973, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08331299, + "step": 16433, + "time_per_iteration": 2.559974431991577 + }, + { + "auxiliary_loss_clip": 0.06398477, + "auxiliary_loss_mlp": 0.01263192, + "balance_loss_clip": 0.06270765, + "balance_loss_mlp": 0.01254067, + "epoch": 0.9880655343454081, + "flos": 28770830098560.0, + "grad_norm": 3.071762614653828, + "language_loss": 0.65055972, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.72717643, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09130859, + "step": 16434, + "time_per_iteration": 2.5954465866088867 + }, + { + "auxiliary_loss_clip": 0.06404148, + "auxiliary_loss_mlp": 0.01263004, + "balance_loss_clip": 0.06270909, + "balance_loss_mlp": 0.01253104, + "epoch": 0.988125657598076, + "flos": 32861581954560.0, + "grad_norm": 1.5114449517285122, + "language_loss": 0.69690335, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.77357489, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.09899902, + "step": 16435, + "time_per_iteration": 2.6048479080200195 + }, + { + "auxiliary_loss_clip": 0.06401417, + "auxiliary_loss_mlp": 0.01263505, + "balance_loss_clip": 0.0627191, + "balance_loss_mlp": 0.01253915, + "epoch": 0.988185780850744, + "flos": 19396728823680.0, + "grad_norm": 1.5491204598191355, + "language_loss": 0.75873798, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.83538723, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09594727, + "step": 16436, + "time_per_iteration": 2.5019142627716064 + }, + { + "auxiliary_loss_clip": 0.06399369, + "auxiliary_loss_mlp": 0.01264401, + "balance_loss_clip": 0.06268755, + "balance_loss_mlp": 0.012549, + "epoch": 0.988245904103412, + "flos": 22534976592000.0, + "grad_norm": 2.2922063156337216, + "language_loss": 0.74628437, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.82292199, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.09503174, + "step": 16437, + "time_per_iteration": 2.4945950508117676 + }, + { + "auxiliary_loss_clip": 0.06395677, + "auxiliary_loss_mlp": 0.01265076, + "balance_loss_clip": 0.06269895, + "balance_loss_mlp": 0.0125623, + "epoch": 0.98830602735608, + "flos": 28666974562560.0, + "grad_norm": 1.3644693930192495, + "language_loss": 0.60571569, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.68232322, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.08850098, + "step": 16438, + "time_per_iteration": 2.5724501609802246 + }, + { + "auxiliary_loss_clip": 0.06400715, + "auxiliary_loss_mlp": 0.01265196, + "balance_loss_clip": 0.06271615, + "balance_loss_mlp": 0.01255546, + "epoch": 0.9883661506087479, + "flos": 21002343960960.0, + "grad_norm": 1.7275853151179177, + "language_loss": 0.71979439, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.79645348, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09649658, + "step": 16439, + "time_per_iteration": 2.4901344776153564 + }, + { + "auxiliary_loss_clip": 0.06396712, + "auxiliary_loss_mlp": 0.01265241, + "balance_loss_clip": 0.06268465, + "balance_loss_mlp": 0.01256145, + "epoch": 0.9884262738614159, + "flos": 32714065641600.0, + "grad_norm": 1.7534107682801081, + "language_loss": 0.60016227, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.67678177, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.09100342, + "step": 16440, + "time_per_iteration": 2.6009092330932617 + }, + { + "auxiliary_loss_clip": 0.06403635, + "auxiliary_loss_mlp": 0.01262738, + "balance_loss_clip": 0.06269899, + "balance_loss_mlp": 0.01253243, + "epoch": 0.9884863971140839, + "flos": 17570153669760.0, + "grad_norm": 2.2528495077342634, + "language_loss": 0.76208878, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.83875251, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.09503174, + "step": 16441, + "time_per_iteration": 2.520890712738037 + }, + { + "auxiliary_loss_clip": 0.06403451, + "auxiliary_loss_mlp": 0.01265503, + "balance_loss_clip": 0.06271541, + "balance_loss_mlp": 0.01256348, + "epoch": 0.9885465203667518, + "flos": 40562116830720.0, + "grad_norm": 1.8751614088289563, + "language_loss": 0.6817615, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.75845104, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09155273, + "step": 16442, + "time_per_iteration": 2.7019503116607666 + }, + { + "auxiliary_loss_clip": 0.06398676, + "auxiliary_loss_mlp": 0.01266035, + "balance_loss_clip": 0.06270617, + "balance_loss_mlp": 0.01257142, + "epoch": 0.9886066436194199, + "flos": 13813022044800.0, + "grad_norm": 2.4056325615728693, + "language_loss": 0.74363172, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.82027876, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.08892822, + "step": 16443, + "time_per_iteration": 3.9015562534332275 + }, + { + "auxiliary_loss_clip": 0.06400269, + "auxiliary_loss_mlp": 0.01268202, + "balance_loss_clip": 0.0626837, + "balance_loss_mlp": 0.01258326, + "epoch": 0.9886667668720878, + "flos": 23330824272000.0, + "grad_norm": 1.903669663592203, + "language_loss": 0.7392866, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.81597131, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.09875488, + "step": 16444, + "time_per_iteration": 2.555422067642212 + }, + { + "auxiliary_loss_clip": 0.06397615, + "auxiliary_loss_mlp": 0.01264619, + "balance_loss_clip": 0.06270696, + "balance_loss_mlp": 0.01256238, + "epoch": 0.9887268901247558, + "flos": 22711814634240.0, + "grad_norm": 2.125613653372287, + "language_loss": 0.69637549, + "learning_rate": 1.325881465858547e-09, + "loss": 0.77299786, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.08380127, + "step": 16445, + "time_per_iteration": 2.561236619949341 + }, + { + "auxiliary_loss_clip": 0.06407273, + "auxiliary_loss_mlp": 0.01262681, + "balance_loss_clip": 0.06277097, + "balance_loss_mlp": 0.01253269, + "epoch": 0.9887870133774237, + "flos": 13046118750720.0, + "grad_norm": 3.18173440901386, + "language_loss": 0.60854781, + "learning_rate": 1.311740377491155e-09, + "loss": 0.68524736, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09411621, + "step": 16446, + "time_per_iteration": 2.4627370834350586 + }, + { + "auxiliary_loss_clip": 0.06401445, + "auxiliary_loss_mlp": 0.0126252, + "balance_loss_clip": 0.06271827, + "balance_loss_mlp": 0.01253967, + "epoch": 0.9888471366300917, + "flos": 15164288513280.0, + "grad_norm": 2.1657095582443797, + "language_loss": 0.71381092, + "learning_rate": 1.297675079582783e-09, + "loss": 0.79045057, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08544922, + "step": 16447, + "time_per_iteration": 2.516580104827881 + }, + { + "auxiliary_loss_clip": 0.06397137, + "auxiliary_loss_mlp": 0.01264224, + "balance_loss_clip": 0.06267823, + "balance_loss_mlp": 0.01255445, + "epoch": 0.9889072598827596, + "flos": 25125771709440.0, + "grad_norm": 1.6408411032004997, + "language_loss": 0.83849478, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.91510838, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.08776855, + "step": 16448, + "time_per_iteration": 2.6334075927734375 + }, + { + "auxiliary_loss_clip": 0.06398049, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 0.0627088, + "balance_loss_mlp": 0.01255378, + "epoch": 0.9889673831354276, + "flos": 16734502500480.0, + "grad_norm": 2.1555382523852766, + "language_loss": 0.70484287, + "learning_rate": 1.26977185727406e-09, + "loss": 0.78146207, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.0848999, + "step": 16449, + "time_per_iteration": 2.533296823501587 + }, + { + "auxiliary_loss_clip": 0.06404455, + "auxiliary_loss_mlp": 0.01263914, + "balance_loss_clip": 0.06269993, + "balance_loss_mlp": 0.01254764, + "epoch": 0.9890275063880956, + "flos": 35593059277440.0, + "grad_norm": 2.393318173005223, + "language_loss": 0.74060148, + "learning_rate": 1.25593393393153e-09, + "loss": 0.81728518, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09143066, + "step": 16450, + "time_per_iteration": 2.622335195541382 + }, + { + "auxiliary_loss_clip": 0.06403831, + "auxiliary_loss_mlp": 0.01265203, + "balance_loss_clip": 0.06269386, + "balance_loss_mlp": 0.01255755, + "epoch": 0.9890876296407636, + "flos": 18958246807680.0, + "grad_norm": 1.56920034871992, + "language_loss": 0.79678428, + "learning_rate": 1.242171803164549e-09, + "loss": 0.8734746, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09454346, + "step": 16451, + "time_per_iteration": 2.5179364681243896 + }, + { + "auxiliary_loss_clip": 0.06404501, + "auxiliary_loss_mlp": 0.01263638, + "balance_loss_clip": 0.06270505, + "balance_loss_mlp": 0.01254179, + "epoch": 0.9891477528934315, + "flos": 23776140395520.0, + "grad_norm": 2.1825746418947283, + "language_loss": 0.70112723, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.77780861, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.09460449, + "step": 16452, + "time_per_iteration": 3.9754366874694824 + }, + { + "auxiliary_loss_clip": 0.06395538, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 0.06269531, + "balance_loss_mlp": 0.01254999, + "epoch": 0.9892078761460995, + "flos": 20778490978560.0, + "grad_norm": 1.550723942339921, + "language_loss": 0.74353349, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.82011908, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08013916, + "step": 16453, + "time_per_iteration": 2.5287036895751953 + }, + { + "auxiliary_loss_clip": 0.06401984, + "auxiliary_loss_mlp": 0.01266017, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01256003, + "epoch": 0.9892679993987675, + "flos": 23374568903040.0, + "grad_norm": 2.164886509887776, + "language_loss": 0.70232868, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.77900863, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.10009766, + "step": 16454, + "time_per_iteration": 2.5466010570526123 + }, + { + "auxiliary_loss_clip": 0.06396247, + "auxiliary_loss_mlp": 0.01268105, + "balance_loss_clip": 0.06270434, + "balance_loss_mlp": 0.01259206, + "epoch": 0.9893281226514354, + "flos": 22711101874560.0, + "grad_norm": 1.889448765409953, + "language_loss": 0.75790614, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.83454967, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.08898926, + "step": 16455, + "time_per_iteration": 2.5108723640441895 + }, + { + "auxiliary_loss_clip": 0.06395634, + "auxiliary_loss_mlp": 0.01266751, + "balance_loss_clip": 0.0626895, + "balance_loss_mlp": 0.0125793, + "epoch": 0.9893882459041035, + "flos": 21802761688320.0, + "grad_norm": 1.755990040999191, + "language_loss": 0.65666765, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.73329145, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.0881958, + "step": 16456, + "time_per_iteration": 2.6398427486419678 + }, + { + "auxiliary_loss_clip": 0.06405662, + "auxiliary_loss_mlp": 0.0126407, + "balance_loss_clip": 0.06272131, + "balance_loss_mlp": 0.01254074, + "epoch": 0.9894483691567714, + "flos": 18119618818560.0, + "grad_norm": 1.8518732955546615, + "language_loss": 0.74572182, + "learning_rate": 1.161190691666203e-09, + "loss": 0.82241917, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09997559, + "step": 16457, + "time_per_iteration": 2.546666383743286 + }, + { + "auxiliary_loss_clip": 0.06405069, + "auxiliary_loss_mlp": 0.01261015, + "balance_loss_clip": 0.06272469, + "balance_loss_mlp": 0.01251633, + "epoch": 0.9895084924094394, + "flos": 31219559418240.0, + "grad_norm": 4.713405572654526, + "language_loss": 0.69061947, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.76728028, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09387207, + "step": 16458, + "time_per_iteration": 4.058138847351074 + }, + { + "auxiliary_loss_clip": 0.06397022, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 0.0626925, + "balance_loss_mlp": 0.01255836, + "epoch": 0.9895686156621073, + "flos": 19683376041600.0, + "grad_norm": 1.6011210328127727, + "language_loss": 0.79420429, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.87082392, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.09106445, + "step": 16459, + "time_per_iteration": 2.4854841232299805 + }, + { + "auxiliary_loss_clip": 0.06404197, + "auxiliary_loss_mlp": 0.01262909, + "balance_loss_clip": 0.06271499, + "balance_loss_mlp": 0.01253587, + "epoch": 0.9896287389147753, + "flos": 23587604709120.0, + "grad_norm": 1.7868867036072664, + "language_loss": 0.71253073, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.78920174, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.09320068, + "step": 16460, + "time_per_iteration": 2.5770578384399414 + }, + { + "auxiliary_loss_clip": 0.06403832, + "auxiliary_loss_mlp": 0.01263939, + "balance_loss_clip": 0.06271418, + "balance_loss_mlp": 0.01254564, + "epoch": 0.9896888621674432, + "flos": 29612854177920.0, + "grad_norm": 1.4614514408304804, + "language_loss": 0.8714518, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.94812953, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09381104, + "step": 16461, + "time_per_iteration": 2.5683257579803467 + }, + { + "auxiliary_loss_clip": 0.06397907, + "auxiliary_loss_mlp": 0.01267148, + "balance_loss_clip": 0.06267931, + "balance_loss_mlp": 0.01256991, + "epoch": 0.9897489854201112, + "flos": 23701648515840.0, + "grad_norm": 1.9559550168181632, + "language_loss": 0.63296109, + "learning_rate": 1.09579082189315e-09, + "loss": 0.70961165, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.10162354, + "step": 16462, + "time_per_iteration": 2.5388832092285156 + }, + { + "auxiliary_loss_clip": 0.06400032, + "auxiliary_loss_mlp": 0.01263191, + "balance_loss_clip": 0.06270525, + "balance_loss_mlp": 0.01254602, + "epoch": 0.9898091086727792, + "flos": 13230252097920.0, + "grad_norm": 1.655754614262544, + "language_loss": 0.73308957, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.80972171, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.08581543, + "step": 16463, + "time_per_iteration": 2.4658756256103516 + }, + { + "auxiliary_loss_clip": 0.06401134, + "auxiliary_loss_mlp": 0.01265976, + "balance_loss_clip": 0.06269235, + "balance_loss_mlp": 0.01256022, + "epoch": 0.9898692319254472, + "flos": 22937135063040.0, + "grad_norm": 7.43106423326432, + "language_loss": 0.7060079, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.78267902, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09960938, + "step": 16464, + "time_per_iteration": 2.5187556743621826 + }, + { + "auxiliary_loss_clip": 0.06401595, + "auxiliary_loss_mlp": 0.01263658, + "balance_loss_clip": 0.06268543, + "balance_loss_mlp": 0.01254116, + "epoch": 0.9899293551781151, + "flos": 12463223022720.0, + "grad_norm": 1.840444252233611, + "language_loss": 0.73403418, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.81068671, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09545898, + "step": 16465, + "time_per_iteration": 3.9173574447631836 + }, + { + "auxiliary_loss_clip": 0.06399283, + "auxiliary_loss_mlp": 0.0126429, + "balance_loss_clip": 0.06270085, + "balance_loss_mlp": 0.01255648, + "epoch": 0.9899894784307831, + "flos": 26878567743360.0, + "grad_norm": 1.5836016872401681, + "language_loss": 0.86692631, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.94356197, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.08642578, + "step": 16466, + "time_per_iteration": 2.5749897956848145 + }, + { + "auxiliary_loss_clip": 0.06399287, + "auxiliary_loss_mlp": 0.01264079, + "balance_loss_clip": 0.06269732, + "balance_loss_mlp": 0.01254775, + "epoch": 0.990049601683451, + "flos": 21548287238400.0, + "grad_norm": 1.6828167464492951, + "language_loss": 0.7183401, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.79497385, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09301758, + "step": 16467, + "time_per_iteration": 2.5681228637695312 + }, + { + "auxiliary_loss_clip": 0.06401198, + "auxiliary_loss_mlp": 0.01265476, + "balance_loss_clip": 0.06271577, + "balance_loss_mlp": 0.01256428, + "epoch": 0.990109724936119, + "flos": 28780137901440.0, + "grad_norm": 1.2060811454546625, + "language_loss": 0.65264559, + "learning_rate": 1.019812338686643e-09, + "loss": 0.7293123, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09039307, + "step": 16468, + "time_per_iteration": 2.5990076065063477 + }, + { + "auxiliary_loss_clip": 0.06405121, + "auxiliary_loss_mlp": 0.01263189, + "balance_loss_clip": 0.06269127, + "balance_loss_mlp": 0.0125342, + "epoch": 0.9901698481887871, + "flos": 29281288371840.0, + "grad_norm": 1.6622205655344582, + "language_loss": 0.62186044, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.69854355, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09765625, + "step": 16469, + "time_per_iteration": 2.6046206951141357 + }, + { + "auxiliary_loss_clip": 0.06401995, + "auxiliary_loss_mlp": 0.01264459, + "balance_loss_clip": 0.06269933, + "balance_loss_mlp": 0.01255072, + "epoch": 0.990229971441455, + "flos": 15964161189120.0, + "grad_norm": 2.6347119694953007, + "language_loss": 0.70456368, + "learning_rate": 9.950925847685976e-10, + "loss": 0.78122824, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09393311, + "step": 16470, + "time_per_iteration": 2.4915683269500732 + }, + { + "auxiliary_loss_clip": 0.06312285, + "auxiliary_loss_mlp": 0.01249711, + "balance_loss_clip": 0.06258221, + "balance_loss_mlp": 0.01248712, + "epoch": 0.990290094694123, + "flos": 69801322924800.0, + "grad_norm": 0.6553477289574845, + "language_loss": 0.55503154, + "learning_rate": 9.828464112755509e-10, + "loss": 0.63065147, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.54003906, + "router_z_loss_mlp": 0.00998688, + "step": 16471, + "time_per_iteration": 3.337892770767212 + }, + { + "auxiliary_loss_clip": 0.06401256, + "auxiliary_loss_mlp": 0.0126515, + "balance_loss_clip": 0.06271065, + "balance_loss_mlp": 0.0125612, + "epoch": 0.9903502179467909, + "flos": 16257894076800.0, + "grad_norm": 1.8227264770016582, + "language_loss": 0.84216011, + "learning_rate": 9.706760407131032e-10, + "loss": 0.9188242, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09033203, + "step": 16472, + "time_per_iteration": 2.5153591632843018 + }, + { + "auxiliary_loss_clip": 0.06403252, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 0.06271196, + "balance_loss_mlp": 0.01255965, + "epoch": 0.9904103411994589, + "flos": 21694671521280.0, + "grad_norm": 1.7363396784721263, + "language_loss": 0.86251837, + "learning_rate": 9.585814735431075e-10, + "loss": 0.93920588, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09527588, + "step": 16473, + "time_per_iteration": 2.514474630355835 + }, + { + "auxiliary_loss_clip": 0.0639959, + "auxiliary_loss_mlp": 0.01267324, + "balance_loss_clip": 0.0626872, + "balance_loss_mlp": 0.01258724, + "epoch": 0.9904704644521268, + "flos": 25746584209920.0, + "grad_norm": 1.9148437433101497, + "language_loss": 0.84488249, + "learning_rate": 9.465627102240859e-10, + "loss": 0.9215517, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.08605957, + "step": 16474, + "time_per_iteration": 2.561305284500122 + }, + { + "auxiliary_loss_clip": 0.06397276, + "auxiliary_loss_mlp": 0.01263163, + "balance_loss_clip": 0.06266478, + "balance_loss_mlp": 0.01254049, + "epoch": 0.9905305877047949, + "flos": 21914834924160.0, + "grad_norm": 2.553445622723368, + "language_loss": 0.76806021, + "learning_rate": 9.346197512116738e-10, + "loss": 0.84466457, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09112549, + "step": 16475, + "time_per_iteration": 2.5100929737091064 + }, + { + "auxiliary_loss_clip": 0.06403254, + "auxiliary_loss_mlp": 0.01262449, + "balance_loss_clip": 0.06270232, + "balance_loss_mlp": 0.01252871, + "epoch": 0.9905907109574628, + "flos": 21397961813760.0, + "grad_norm": 1.4250465308129456, + "language_loss": 0.7599352, + "learning_rate": 9.227525969588423e-10, + "loss": 0.8365922, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.09576416, + "step": 16476, + "time_per_iteration": 2.5377602577209473 + }, + { + "auxiliary_loss_clip": 0.06410898, + "auxiliary_loss_mlp": 0.0126517, + "balance_loss_clip": 0.0627154, + "balance_loss_mlp": 0.01254298, + "epoch": 0.9906508342101308, + "flos": 20527831889280.0, + "grad_norm": 2.5556456243776684, + "language_loss": 0.67784524, + "learning_rate": 9.109612479154538e-10, + "loss": 0.75460589, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.10876465, + "step": 16477, + "time_per_iteration": 2.500948667526245 + }, + { + "auxiliary_loss_clip": 0.06406661, + "auxiliary_loss_mlp": 0.01267782, + "balance_loss_clip": 0.06271023, + "balance_loss_mlp": 0.01257113, + "epoch": 0.9907109574627987, + "flos": 21367633835520.0, + "grad_norm": 1.8026145726768161, + "language_loss": 0.71967936, + "learning_rate": 8.992457045289282e-10, + "loss": 0.79642379, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.10656738, + "step": 16478, + "time_per_iteration": 2.5416836738586426 + }, + { + "auxiliary_loss_clip": 0.0640207, + "auxiliary_loss_mlp": 0.01265345, + "balance_loss_clip": 0.06270047, + "balance_loss_mlp": 0.01255707, + "epoch": 0.9907710807154667, + "flos": 17342820743040.0, + "grad_norm": 2.2389355543560874, + "language_loss": 0.81408846, + "learning_rate": 8.876059672433545e-10, + "loss": 0.89076257, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09631348, + "step": 16479, + "time_per_iteration": 2.4918854236602783 + }, + { + "auxiliary_loss_clip": 0.06405993, + "auxiliary_loss_mlp": 0.01266846, + "balance_loss_clip": 0.06272171, + "balance_loss_mlp": 0.01257417, + "epoch": 0.9908312039681346, + "flos": 28629518987520.0, + "grad_norm": 1.7024929779820783, + "language_loss": 0.6656878, + "learning_rate": 8.760420364999355e-10, + "loss": 0.7424162, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.09429932, + "step": 16480, + "time_per_iteration": 2.5911026000976562 + }, + { + "auxiliary_loss_clip": 0.06397465, + "auxiliary_loss_mlp": 0.01264495, + "balance_loss_clip": 0.06268594, + "balance_loss_mlp": 0.01255424, + "epoch": 0.9908913272208026, + "flos": 35779079341440.0, + "grad_norm": 1.9992383349547551, + "language_loss": 0.72380996, + "learning_rate": 8.645539127374313e-10, + "loss": 0.80042958, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.09069824, + "step": 16481, + "time_per_iteration": 2.6130805015563965 + }, + { + "auxiliary_loss_clip": 0.06397593, + "auxiliary_loss_mlp": 0.0126149, + "balance_loss_clip": 0.06269701, + "balance_loss_mlp": 0.01252591, + "epoch": 0.9909514504734707, + "flos": 19908444908160.0, + "grad_norm": 2.088225556047704, + "language_loss": 0.77833641, + "learning_rate": 8.531415963912713e-10, + "loss": 0.8549273, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.08886719, + "step": 16482, + "time_per_iteration": 2.500314235687256 + }, + { + "auxiliary_loss_clip": 0.06400485, + "auxiliary_loss_mlp": 0.01263677, + "balance_loss_clip": 0.06268109, + "balance_loss_mlp": 0.01254331, + "epoch": 0.9910115737261386, + "flos": 20009910602880.0, + "grad_norm": 1.7779031696268206, + "language_loss": 0.75710553, + "learning_rate": 8.418050878944427e-10, + "loss": 0.83374715, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09344482, + "step": 16483, + "time_per_iteration": 3.906383752822876 + }, + { + "auxiliary_loss_clip": 0.06312563, + "auxiliary_loss_mlp": 0.01253629, + "balance_loss_clip": 0.0625831, + "balance_loss_mlp": 0.01252545, + "epoch": 0.9910716969788066, + "flos": 70708950351360.0, + "grad_norm": 0.6833139744850949, + "language_loss": 0.53665406, + "learning_rate": 8.305443876768237e-10, + "loss": 0.61231595, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.010849, + "step": 16484, + "time_per_iteration": 3.2748491764068604 + }, + { + "auxiliary_loss_clip": 0.06397069, + "auxiliary_loss_mlp": 0.01263274, + "balance_loss_clip": 0.06271018, + "balance_loss_mlp": 0.01254608, + "epoch": 0.9911318202314745, + "flos": 21440448633600.0, + "grad_norm": 1.584141486996251, + "language_loss": 0.8189832, + "learning_rate": 8.19359496165184e-10, + "loss": 0.89558661, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.08666992, + "step": 16485, + "time_per_iteration": 2.522608757019043 + }, + { + "auxiliary_loss_clip": 0.0639887, + "auxiliary_loss_mlp": 0.0126673, + "balance_loss_clip": 0.06270351, + "balance_loss_mlp": 0.01257402, + "epoch": 0.9911919434841425, + "flos": 19832653290240.0, + "grad_norm": 1.523507059973884, + "language_loss": 0.81901872, + "learning_rate": 8.082504137836288e-10, + "loss": 0.89567471, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09332275, + "step": 16486, + "time_per_iteration": 2.5003557205200195 + }, + { + "auxiliary_loss_clip": 0.06405136, + "auxiliary_loss_mlp": 0.01263298, + "balance_loss_clip": 0.06271749, + "balance_loss_mlp": 0.01253887, + "epoch": 0.9912520667368104, + "flos": 41729040316800.0, + "grad_norm": 1.374674132460458, + "language_loss": 0.66326475, + "learning_rate": 7.972171409538209e-10, + "loss": 0.73994911, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.09417725, + "step": 16487, + "time_per_iteration": 2.6989879608154297 + }, + { + "auxiliary_loss_clip": 0.06396048, + "auxiliary_loss_mlp": 0.01260581, + "balance_loss_clip": 0.06269118, + "balance_loss_mlp": 0.01252671, + "epoch": 0.9913121899894785, + "flos": 23776559665920.0, + "grad_norm": 1.5105370838435217, + "language_loss": 0.77039683, + "learning_rate": 7.862596780936481e-10, + "loss": 0.84696317, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.07913208, + "step": 16488, + "time_per_iteration": 2.5589473247528076 + }, + { + "auxiliary_loss_clip": 0.06408311, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 0.06270079, + "balance_loss_mlp": 0.01251982, + "epoch": 0.9913723132421464, + "flos": 23776559665920.0, + "grad_norm": 4.081303895397492, + "language_loss": 0.68999302, + "learning_rate": 7.753780256190001e-10, + "loss": 0.76669514, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.09912109, + "step": 16489, + "time_per_iteration": 2.5339298248291016 + }, + { + "auxiliary_loss_clip": 0.06312118, + "auxiliary_loss_mlp": 0.0125074, + "balance_loss_clip": 0.06257981, + "balance_loss_mlp": 0.01249744, + "epoch": 0.9914324364948144, + "flos": 71287234104960.0, + "grad_norm": 0.5966014121504264, + "language_loss": 0.52483445, + "learning_rate": 7.645721839424357e-10, + "loss": 0.60046303, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 16490, + "time_per_iteration": 3.2971177101135254 + }, + { + "auxiliary_loss_clip": 0.06410297, + "auxiliary_loss_mlp": 0.01269129, + "balance_loss_clip": 0.06273858, + "balance_loss_mlp": 0.012588, + "epoch": 0.9914925597474823, + "flos": 23702109713280.0, + "grad_norm": 1.5769147749467787, + "language_loss": 0.75964558, + "learning_rate": 7.538421534734052e-10, + "loss": 0.83643979, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10327148, + "step": 16491, + "time_per_iteration": 2.5335545539855957 + }, + { + "auxiliary_loss_clip": 0.06408622, + "auxiliary_loss_mlp": 0.01265093, + "balance_loss_clip": 0.06274007, + "balance_loss_mlp": 0.01254478, + "epoch": 0.9915526830001503, + "flos": 13437250410240.0, + "grad_norm": 2.0325070946840644, + "language_loss": 0.70255387, + "learning_rate": 7.431879346191383e-10, + "loss": 0.77929103, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10626221, + "step": 16492, + "time_per_iteration": 3.9161179065704346 + }, + { + "auxiliary_loss_clip": 0.06401629, + "auxiliary_loss_mlp": 0.01263978, + "balance_loss_clip": 0.06271149, + "balance_loss_mlp": 0.01254191, + "epoch": 0.9916128062528182, + "flos": 20747282532480.0, + "grad_norm": 1.742564772152948, + "language_loss": 0.68796587, + "learning_rate": 7.326095277837563e-10, + "loss": 0.76462197, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09783936, + "step": 16493, + "time_per_iteration": 2.5178070068359375 + }, + { + "auxiliary_loss_clip": 0.06404144, + "auxiliary_loss_mlp": 0.01264334, + "balance_loss_clip": 0.06268735, + "balance_loss_mlp": 0.0125478, + "epoch": 0.9916729295054862, + "flos": 22492825188480.0, + "grad_norm": 1.6130531837005415, + "language_loss": 0.71639037, + "learning_rate": 7.221069333678276e-10, + "loss": 0.79307514, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.09552002, + "step": 16494, + "time_per_iteration": 2.538949728012085 + }, + { + "auxiliary_loss_clip": 0.06406216, + "auxiliary_loss_mlp": 0.0126424, + "balance_loss_clip": 0.06271614, + "balance_loss_mlp": 0.01253755, + "epoch": 0.9917330527581543, + "flos": 14797573119360.0, + "grad_norm": 1.963098186344062, + "language_loss": 0.68285948, + "learning_rate": 7.116801517701443e-10, + "loss": 0.75956404, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.10491943, + "step": 16495, + "time_per_iteration": 2.4931821823120117 + }, + { + "auxiliary_loss_clip": 0.06310745, + "auxiliary_loss_mlp": 0.01252706, + "balance_loss_clip": 0.06256633, + "balance_loss_mlp": 0.01251622, + "epoch": 0.9917931760108222, + "flos": 59209551717120.0, + "grad_norm": 0.6971695961276645, + "language_loss": 0.5343821, + "learning_rate": 7.013291833859458e-10, + "loss": 0.61001664, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01085663, + "step": 16496, + "time_per_iteration": 3.313877820968628 + }, + { + "auxiliary_loss_clip": 0.0640336, + "auxiliary_loss_mlp": 0.01264656, + "balance_loss_clip": 0.06270392, + "balance_loss_mlp": 0.01255054, + "epoch": 0.9918532992634902, + "flos": 26769052056960.0, + "grad_norm": 1.686792956138552, + "language_loss": 0.71729428, + "learning_rate": 6.91054028607585e-10, + "loss": 0.7939744, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.09613037, + "step": 16497, + "time_per_iteration": 4.06347918510437 + }, + { + "auxiliary_loss_clip": 0.06407828, + "auxiliary_loss_mlp": 0.01265363, + "balance_loss_clip": 0.06272013, + "balance_loss_mlp": 0.01255547, + "epoch": 0.9919134225161581, + "flos": 14980993706880.0, + "grad_norm": 2.091155080212875, + "language_loss": 0.82478547, + "learning_rate": 6.808546878249721e-10, + "loss": 0.90151739, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.0982666, + "step": 16498, + "time_per_iteration": 2.5037145614624023 + }, + { + "auxiliary_loss_clip": 0.06402234, + "auxiliary_loss_mlp": 0.01266692, + "balance_loss_clip": 0.06271948, + "balance_loss_mlp": 0.01257448, + "epoch": 0.9919735457688261, + "flos": 27825537461760.0, + "grad_norm": 3.5794951967468447, + "language_loss": 0.68476105, + "learning_rate": 6.707311614246869e-10, + "loss": 0.76145029, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09246826, + "step": 16499, + "time_per_iteration": 2.5629689693450928 + }, + { + "auxiliary_loss_clip": 0.06405471, + "auxiliary_loss_mlp": 0.01263161, + "balance_loss_clip": 0.06270543, + "balance_loss_mlp": 0.01253792, + "epoch": 0.992033669021494, + "flos": 22568994149760.0, + "grad_norm": 2.4469510189518684, + "language_loss": 0.82463717, + "learning_rate": 6.606834497904223e-10, + "loss": 0.90132344, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.09368896, + "step": 16500, + "time_per_iteration": 2.5607094764709473 + }, + { + "auxiliary_loss_clip": 0.06403733, + "auxiliary_loss_mlp": 0.0126595, + "balance_loss_clip": 0.06271171, + "balance_loss_mlp": 0.01256121, + "epoch": 0.9920937922741621, + "flos": 25381671678720.0, + "grad_norm": 5.293314511420753, + "language_loss": 0.82256448, + "learning_rate": 6.507115533036511e-10, + "loss": 0.89926136, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09832764, + "step": 16501, + "time_per_iteration": 2.547940731048584 + }, + { + "auxiliary_loss_clip": 0.06401893, + "auxiliary_loss_mlp": 0.01267237, + "balance_loss_clip": 0.06269954, + "balance_loss_mlp": 0.01257897, + "epoch": 0.99215391552683, + "flos": 22061009571840.0, + "grad_norm": 2.044596215484759, + "language_loss": 0.7750001, + "learning_rate": 6.408154723420711e-10, + "loss": 0.85169148, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.09338379, + "step": 16502, + "time_per_iteration": 2.52785325050354 + }, + { + "auxiliary_loss_clip": 0.06407385, + "auxiliary_loss_mlp": 0.0126289, + "balance_loss_clip": 0.06270929, + "balance_loss_mlp": 0.01252549, + "epoch": 0.992214038779498, + "flos": 15419349941760.0, + "grad_norm": 2.2650147973319337, + "language_loss": 0.71174729, + "learning_rate": 6.309952072811597e-10, + "loss": 0.78845006, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.10333252, + "step": 16503, + "time_per_iteration": 2.479231595993042 + }, + { + "auxiliary_loss_clip": 0.06309342, + "auxiliary_loss_mlp": 0.01248757, + "balance_loss_clip": 0.06255215, + "balance_loss_mlp": 0.01247744, + "epoch": 0.9922741620321659, + "flos": 62035184701440.0, + "grad_norm": 0.6268759345910434, + "language_loss": 0.55145812, + "learning_rate": 6.212507584932858e-10, + "loss": 0.62703907, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.54248047, + "router_z_loss_mlp": 0.01013184, + "step": 16504, + "time_per_iteration": 3.2505059242248535 + }, + { + "auxiliary_loss_clip": 0.06399435, + "auxiliary_loss_mlp": 0.01265661, + "balance_loss_clip": 0.06268956, + "balance_loss_mlp": 0.01257209, + "epoch": 0.9923342852848339, + "flos": 17171223580800.0, + "grad_norm": 1.6208802676549345, + "language_loss": 0.69611251, + "learning_rate": 6.115821263481536e-10, + "loss": 0.77276349, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.08441162, + "step": 16505, + "time_per_iteration": 3.9143412113189697 + }, + { + "auxiliary_loss_clip": 0.06404525, + "auxiliary_loss_mlp": 0.01263876, + "balance_loss_clip": 0.06269157, + "balance_loss_mlp": 0.01253356, + "epoch": 0.9923944085375018, + "flos": 23189555088000.0, + "grad_norm": 1.923670918802994, + "language_loss": 0.66283721, + "learning_rate": 6.019893112119146e-10, + "loss": 0.73952121, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.10522461, + "step": 16506, + "time_per_iteration": 2.4962158203125 + }, + { + "auxiliary_loss_clip": 0.0640049, + "auxiliary_loss_mlp": 0.01263896, + "balance_loss_clip": 0.06270368, + "balance_loss_mlp": 0.01254461, + "epoch": 0.9924545317901698, + "flos": 20820181184640.0, + "grad_norm": 2.195088142816573, + "language_loss": 0.63749093, + "learning_rate": 5.924723134487219e-10, + "loss": 0.71413481, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09442139, + "step": 16507, + "time_per_iteration": 2.4816720485687256 + }, + { + "auxiliary_loss_clip": 0.06400108, + "auxiliary_loss_mlp": 0.01262795, + "balance_loss_clip": 0.06268136, + "balance_loss_mlp": 0.01253461, + "epoch": 0.9925146550428379, + "flos": 20089517581440.0, + "grad_norm": 2.0367572587682714, + "language_loss": 0.72877479, + "learning_rate": 5.830311334193983e-10, + "loss": 0.80540383, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09332275, + "step": 16508, + "time_per_iteration": 2.481667995452881 + }, + { + "auxiliary_loss_clip": 0.06402674, + "auxiliary_loss_mlp": 0.01264359, + "balance_loss_clip": 0.06270348, + "balance_loss_mlp": 0.01254245, + "epoch": 0.9925747782955058, + "flos": 24980812945920.0, + "grad_norm": 1.4154056439024716, + "language_loss": 0.70592123, + "learning_rate": 5.736657714818793e-10, + "loss": 0.78259158, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.10113525, + "step": 16509, + "time_per_iteration": 2.5196590423583984 + }, + { + "auxiliary_loss_clip": 0.06405061, + "auxiliary_loss_mlp": 0.01265677, + "balance_loss_clip": 0.0627228, + "balance_loss_mlp": 0.01255789, + "epoch": 0.9926349015481738, + "flos": 60485250931200.0, + "grad_norm": 1.8295494813147601, + "language_loss": 0.68665648, + "learning_rate": 5.643762279912146e-10, + "loss": 0.76336384, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09893799, + "step": 16510, + "time_per_iteration": 2.8475050926208496 + }, + { + "auxiliary_loss_clip": 0.06405565, + "auxiliary_loss_mlp": 0.01264078, + "balance_loss_clip": 0.06273197, + "balance_loss_mlp": 0.01254261, + "epoch": 0.9926950248008417, + "flos": 20748163000320.0, + "grad_norm": 2.178338500168841, + "language_loss": 0.81844068, + "learning_rate": 5.551625032997886e-10, + "loss": 0.89513707, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.09814453, + "step": 16511, + "time_per_iteration": 2.5016791820526123 + }, + { + "auxiliary_loss_clip": 0.06398685, + "auxiliary_loss_mlp": 0.01262596, + "balance_loss_clip": 0.06270064, + "balance_loss_mlp": 0.01254126, + "epoch": 0.9927551480535097, + "flos": 24359874664320.0, + "grad_norm": 1.862945910053827, + "language_loss": 0.91819113, + "learning_rate": 5.460245977570998e-10, + "loss": 0.99480402, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08459473, + "step": 16512, + "time_per_iteration": 2.534518003463745 + }, + { + "auxiliary_loss_clip": 0.06313323, + "auxiliary_loss_mlp": 0.01262737, + "balance_loss_clip": 0.06259029, + "balance_loss_mlp": 0.01261737, + "epoch": 0.9928152713061776, + "flos": 71296751543040.0, + "grad_norm": 0.6913965440802265, + "language_loss": 0.55126524, + "learning_rate": 5.369625117095378e-10, + "loss": 0.62702584, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.54345703, + "router_z_loss_mlp": 0.00999451, + "step": 16513, + "time_per_iteration": 3.224245071411133 + }, + { + "auxiliary_loss_clip": 0.06400467, + "auxiliary_loss_mlp": 0.01264294, + "balance_loss_clip": 0.06269906, + "balance_loss_mlp": 0.0125437, + "epoch": 0.9928753945588457, + "flos": 57821850650880.0, + "grad_norm": 1.4693700782931527, + "language_loss": 0.6477679, + "learning_rate": 5.279762455006054e-10, + "loss": 0.72441554, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.0993042, + "step": 16514, + "time_per_iteration": 2.8541526794433594 + }, + { + "auxiliary_loss_clip": 0.06402757, + "auxiliary_loss_mlp": 0.01267583, + "balance_loss_clip": 0.06270185, + "balance_loss_mlp": 0.01257296, + "epoch": 0.9929355178115136, + "flos": 19574363479680.0, + "grad_norm": 1.8661171060371329, + "language_loss": 0.73515117, + "learning_rate": 5.190657994713632e-10, + "loss": 0.81185454, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.10296631, + "step": 16515, + "time_per_iteration": 2.4777932167053223 + }, + { + "auxiliary_loss_clip": 0.06404781, + "auxiliary_loss_mlp": 0.01266644, + "balance_loss_clip": 0.06273709, + "balance_loss_mlp": 0.01256732, + "epoch": 0.9929956410641816, + "flos": 22971026839680.0, + "grad_norm": 1.348064631886549, + "language_loss": 0.77389991, + "learning_rate": 5.102311739593191e-10, + "loss": 0.85061419, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.09912109, + "step": 16516, + "time_per_iteration": 2.5608432292938232 + }, + { + "auxiliary_loss_clip": 0.06398392, + "auxiliary_loss_mlp": 0.01266123, + "balance_loss_clip": 0.06268544, + "balance_loss_mlp": 0.01256968, + "epoch": 0.9930557643168495, + "flos": 22573228780800.0, + "grad_norm": 1.58329129583989, + "language_loss": 0.78152323, + "learning_rate": 5.014723692997602e-10, + "loss": 0.85816836, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.09161377, + "step": 16517, + "time_per_iteration": 2.528740167617798 + }, + { + "auxiliary_loss_clip": 0.06407169, + "auxiliary_loss_mlp": 0.01267301, + "balance_loss_clip": 0.06271128, + "balance_loss_mlp": 0.01257741, + "epoch": 0.9931158875695175, + "flos": 17206624730880.0, + "grad_norm": 1.9618850991719492, + "language_loss": 0.67701828, + "learning_rate": 4.927893858248655e-10, + "loss": 0.75376302, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.09564209, + "step": 16518, + "time_per_iteration": 2.548466205596924 + }, + { + "auxiliary_loss_clip": 0.06309474, + "auxiliary_loss_mlp": 0.01253105, + "balance_loss_clip": 0.06255181, + "balance_loss_mlp": 0.01252109, + "epoch": 0.9931760108221854, + "flos": 63729142369920.0, + "grad_norm": 0.7167826797108764, + "language_loss": 0.53387469, + "learning_rate": 4.84182223863483e-10, + "loss": 0.60950041, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.00995636, + "step": 16519, + "time_per_iteration": 3.033399820327759 + }, + { + "auxiliary_loss_clip": 0.06400052, + "auxiliary_loss_mlp": 0.01264927, + "balance_loss_clip": 0.06270394, + "balance_loss_mlp": 0.01256076, + "epoch": 0.9932361340748534, + "flos": 15310253525760.0, + "grad_norm": 1.8743335975768634, + "language_loss": 0.60528129, + "learning_rate": 4.756508837426842e-10, + "loss": 0.68193108, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.08850098, + "step": 16520, + "time_per_iteration": 2.517338514328003 + }, + { + "auxiliary_loss_clip": 0.06401677, + "auxiliary_loss_mlp": 0.01264223, + "balance_loss_clip": 0.06270006, + "balance_loss_mlp": 0.01254859, + "epoch": 0.9932962573275215, + "flos": 36073776551040.0, + "grad_norm": 1.8911026139940599, + "language_loss": 0.62225491, + "learning_rate": 4.671953657853223e-10, + "loss": 0.69891393, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09356689, + "step": 16521, + "time_per_iteration": 2.6538894176483154 + }, + { + "auxiliary_loss_clip": 0.06403658, + "auxiliary_loss_mlp": 0.01264234, + "balance_loss_clip": 0.0626916, + "balance_loss_mlp": 0.01254268, + "epoch": 0.9933563805801894, + "flos": 21476939886720.0, + "grad_norm": 1.7541359047868907, + "language_loss": 0.74881208, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.82549095, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.09973145, + "step": 16522, + "time_per_iteration": 3.9640562534332275 + }, + { + "auxiliary_loss_clip": 0.06399375, + "auxiliary_loss_mlp": 0.01265263, + "balance_loss_clip": 0.06270382, + "balance_loss_mlp": 0.01255673, + "epoch": 0.9934165038328574, + "flos": 23993117343360.0, + "grad_norm": 1.410603102343642, + "language_loss": 0.73254204, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.80918843, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.0960083, + "step": 16523, + "time_per_iteration": 2.5403733253479004 + }, + { + "auxiliary_loss_clip": 0.06399913, + "auxiliary_loss_mlp": 0.01262714, + "balance_loss_clip": 0.06268643, + "balance_loss_mlp": 0.0125332, + "epoch": 0.9934766270855253, + "flos": 21914206018560.0, + "grad_norm": 1.45594715483847, + "language_loss": 0.71754086, + "learning_rate": 4.422837480875241e-10, + "loss": 0.79416716, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09399414, + "step": 16524, + "time_per_iteration": 2.49554181098938 + }, + { + "auxiliary_loss_clip": 0.06401279, + "auxiliary_loss_mlp": 0.01261178, + "balance_loss_clip": 0.06269774, + "balance_loss_mlp": 0.01252035, + "epoch": 0.9935367503381933, + "flos": 17134900035840.0, + "grad_norm": 1.9388609047910152, + "language_loss": 0.79724878, + "learning_rate": 4.341315219624775e-10, + "loss": 0.87387335, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09143066, + "step": 16525, + "time_per_iteration": 2.4689719676971436 + }, + { + "auxiliary_loss_clip": 0.06404391, + "auxiliary_loss_mlp": 0.0126308, + "balance_loss_clip": 0.06273483, + "balance_loss_mlp": 0.01253841, + "epoch": 0.9935968735908612, + "flos": 22352813815680.0, + "grad_norm": 2.8533353027739246, + "language_loss": 0.74970055, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.82637525, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.09240723, + "step": 16526, + "time_per_iteration": 2.5054593086242676 + }, + { + "auxiliary_loss_clip": 0.06396805, + "auxiliary_loss_mlp": 0.01266824, + "balance_loss_clip": 0.06269647, + "balance_loss_mlp": 0.0125765, + "epoch": 0.9936569968435293, + "flos": 29468230830720.0, + "grad_norm": 1.4052771435638536, + "language_loss": 0.72989619, + "learning_rate": 4.180545412333369e-10, + "loss": 0.8065325, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.0916748, + "step": 16527, + "time_per_iteration": 2.5682404041290283 + }, + { + "auxiliary_loss_clip": 0.06403709, + "auxiliary_loss_mlp": 0.01263759, + "balance_loss_clip": 0.06269115, + "balance_loss_mlp": 0.0125427, + "epoch": 0.9937171200961972, + "flos": 16549488685440.0, + "grad_norm": 2.193816392359614, + "language_loss": 0.7689482, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.8456229, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.09490967, + "step": 16528, + "time_per_iteration": 2.4579873085021973 + }, + { + "auxiliary_loss_clip": 0.06405492, + "auxiliary_loss_mlp": 0.01264731, + "balance_loss_clip": 0.06270828, + "balance_loss_mlp": 0.01253919, + "epoch": 0.9937772433488652, + "flos": 24397330239360.0, + "grad_norm": 2.1223276204344494, + "language_loss": 0.68164897, + "learning_rate": 4.022808578922898e-10, + "loss": 0.75835121, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.10809326, + "step": 16529, + "time_per_iteration": 2.5395190715789795 + }, + { + "auxiliary_loss_clip": 0.06410487, + "auxiliary_loss_mlp": 0.01266372, + "balance_loss_clip": 0.062738, + "balance_loss_mlp": 0.01255357, + "epoch": 0.9938373666015331, + "flos": 15675459546240.0, + "grad_norm": 1.9586531091846018, + "language_loss": 0.65134317, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.7281118, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.11016846, + "step": 16530, + "time_per_iteration": 2.4729955196380615 + }, + { + "auxiliary_loss_clip": 0.06402886, + "auxiliary_loss_mlp": 0.0126345, + "balance_loss_clip": 0.06270776, + "balance_loss_mlp": 0.01254307, + "epoch": 0.9938974898542011, + "flos": 19501590608640.0, + "grad_norm": 1.9185750704175901, + "language_loss": 0.71495968, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.79162306, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.0914917, + "step": 16531, + "time_per_iteration": 2.512540340423584 + }, + { + "auxiliary_loss_clip": 0.06404379, + "auxiliary_loss_mlp": 0.01270128, + "balance_loss_clip": 0.06272028, + "balance_loss_mlp": 0.01260485, + "epoch": 0.993957613106869, + "flos": 26914220455680.0, + "grad_norm": 1.3658354956475158, + "language_loss": 0.74276423, + "learning_rate": 3.791890207045512e-10, + "loss": 0.81950933, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09643555, + "step": 16532, + "time_per_iteration": 4.052224397659302 + }, + { + "auxiliary_loss_clip": 0.06394548, + "auxiliary_loss_mlp": 0.01260602, + "balance_loss_clip": 0.06271665, + "balance_loss_mlp": 0.01252836, + "epoch": 0.994017736359537, + "flos": 14944921724160.0, + "grad_norm": 1.627443205614894, + "language_loss": 0.70665741, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.78320897, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.07769775, + "step": 16533, + "time_per_iteration": 2.5302624702453613 + }, + { + "auxiliary_loss_clip": 0.0640518, + "auxiliary_loss_mlp": 0.01263424, + "balance_loss_clip": 0.06270548, + "balance_loss_mlp": 0.01253088, + "epoch": 0.9940778596122051, + "flos": 15383361813120.0, + "grad_norm": 2.0547818206893362, + "language_loss": 0.84855843, + "learning_rate": 3.641735912007782e-10, + "loss": 0.92524445, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.10321045, + "step": 16534, + "time_per_iteration": 2.528353452682495 + }, + { + "auxiliary_loss_clip": 0.06395555, + "auxiliary_loss_mlp": 0.01264136, + "balance_loss_clip": 0.06271446, + "balance_loss_mlp": 0.01255118, + "epoch": 0.994137982864873, + "flos": 25235077760640.0, + "grad_norm": 1.3590448936998143, + "language_loss": 0.66083765, + "learning_rate": 3.567796158934211e-10, + "loss": 0.73743457, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.09020996, + "step": 16535, + "time_per_iteration": 2.5445032119750977 + }, + { + "auxiliary_loss_clip": 0.06400403, + "auxiliary_loss_mlp": 0.01261695, + "balance_loss_clip": 0.06271672, + "balance_loss_mlp": 0.01253261, + "epoch": 0.994198106117541, + "flos": 18448040096640.0, + "grad_norm": 1.492382097158509, + "language_loss": 0.64826763, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.72488862, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.08435059, + "step": 16536, + "time_per_iteration": 2.4790172576904297 + }, + { + "auxiliary_loss_clip": 0.063953, + "auxiliary_loss_mlp": 0.01266284, + "balance_loss_clip": 0.06267138, + "balance_loss_mlp": 0.01257325, + "epoch": 0.9942582293702089, + "flos": 16659675204480.0, + "grad_norm": 1.6373933785699804, + "language_loss": 0.79013014, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.86674595, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.08959961, + "step": 16537, + "time_per_iteration": 3.9477853775024414 + }, + { + "auxiliary_loss_clip": 0.06408946, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 0.06270771, + "balance_loss_mlp": 0.01255249, + "epoch": 0.9943183526228769, + "flos": 21951032688000.0, + "grad_norm": 1.5863603537176718, + "language_loss": 0.68719506, + "learning_rate": 3.35052651107004e-10, + "loss": 0.76394421, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.10717773, + "step": 16538, + "time_per_iteration": 2.560777187347412 + }, + { + "auxiliary_loss_clip": 0.06395986, + "auxiliary_loss_mlp": 0.01264597, + "balance_loss_clip": 0.06270739, + "balance_loss_mlp": 0.01255543, + "epoch": 0.9943784758755448, + "flos": 23849458318080.0, + "grad_norm": 1.9320392025007822, + "language_loss": 0.75314772, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.82975346, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.0904541, + "step": 16539, + "time_per_iteration": 2.5456387996673584 + }, + { + "auxiliary_loss_clip": 0.06405414, + "auxiliary_loss_mlp": 0.01263516, + "balance_loss_clip": 0.06272653, + "balance_loss_mlp": 0.01254414, + "epoch": 0.9944385991282129, + "flos": 21476310981120.0, + "grad_norm": 2.0585320600581287, + "language_loss": 0.70989531, + "learning_rate": 3.209471449341361e-10, + "loss": 0.78658462, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.09100342, + "step": 16540, + "time_per_iteration": 2.56339955329895 + }, + { + "auxiliary_loss_clip": 0.06397563, + "auxiliary_loss_mlp": 0.01263176, + "balance_loss_clip": 0.06268452, + "balance_loss_mlp": 0.01254253, + "epoch": 0.9944987223808808, + "flos": 22933193921280.0, + "grad_norm": 1.9177356075251677, + "language_loss": 0.75796914, + "learning_rate": 3.140081337600353e-10, + "loss": 0.83457649, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.08935547, + "step": 16541, + "time_per_iteration": 2.5349810123443604 + }, + { + "auxiliary_loss_clip": 0.06401087, + "auxiliary_loss_mlp": 0.01264778, + "balance_loss_clip": 0.06270842, + "balance_loss_mlp": 0.01254931, + "epoch": 0.9945588456335488, + "flos": 22389640485120.0, + "grad_norm": 1.8943263701308943, + "language_loss": 0.76886356, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.84552217, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.09851074, + "step": 16542, + "time_per_iteration": 2.5378565788269043 + }, + { + "auxiliary_loss_clip": 0.06405424, + "auxiliary_loss_mlp": 0.01264121, + "balance_loss_clip": 0.06272386, + "balance_loss_mlp": 0.01253923, + "epoch": 0.9946189688862167, + "flos": 21403915453440.0, + "grad_norm": 1.9598697762283788, + "language_loss": 0.75353408, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.83022952, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.10205078, + "step": 16543, + "time_per_iteration": 2.5946569442749023 + }, + { + "auxiliary_loss_clip": 0.06407975, + "auxiliary_loss_mlp": 0.01263482, + "balance_loss_clip": 0.06271738, + "balance_loss_mlp": 0.01253195, + "epoch": 0.9946790921388847, + "flos": 12420526567680.0, + "grad_norm": 3.189833149975994, + "language_loss": 0.81971997, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.89643455, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.10296631, + "step": 16544, + "time_per_iteration": 2.595747470855713 + }, + { + "auxiliary_loss_clip": 0.06400429, + "auxiliary_loss_mlp": 0.01262665, + "balance_loss_clip": 0.06269884, + "balance_loss_mlp": 0.01253426, + "epoch": 0.9947392153915526, + "flos": 19063611717120.0, + "grad_norm": 1.7015698654881692, + "language_loss": 0.79186726, + "learning_rate": 2.870103745831187e-10, + "loss": 0.86849821, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09234619, + "step": 16545, + "time_per_iteration": 3.9479551315307617 + }, + { + "auxiliary_loss_clip": 0.06405969, + "auxiliary_loss_mlp": 0.01262518, + "balance_loss_clip": 0.06272364, + "balance_loss_mlp": 0.01253288, + "epoch": 0.9947993386442207, + "flos": 27316295072640.0, + "grad_norm": 1.803677846508674, + "language_loss": 0.72430396, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.80098879, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.09225464, + "step": 16546, + "time_per_iteration": 2.5391626358032227 + }, + { + "auxiliary_loss_clip": 0.064002, + "auxiliary_loss_mlp": 0.01262515, + "balance_loss_clip": 0.06271345, + "balance_loss_mlp": 0.01253336, + "epoch": 0.9948594618968887, + "flos": 20811586141440.0, + "grad_norm": 1.9161103078847286, + "language_loss": 0.77849805, + "learning_rate": 2.739664698798716e-10, + "loss": 0.85512525, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.09185791, + "step": 16547, + "time_per_iteration": 2.5525412559509277 + }, + { + "auxiliary_loss_clip": 0.06404825, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 0.06271931, + "balance_loss_mlp": 0.01255348, + "epoch": 0.9949195851495566, + "flos": 23299364263680.0, + "grad_norm": 2.386588561491637, + "language_loss": 0.70458543, + "learning_rate": 2.67558262122769e-10, + "loss": 0.78127539, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08825684, + "step": 16548, + "time_per_iteration": 2.487410306930542 + }, + { + "auxiliary_loss_clip": 0.06400785, + "auxiliary_loss_mlp": 0.01262532, + "balance_loss_clip": 0.06270401, + "balance_loss_mlp": 0.01253395, + "epoch": 0.9949797084022246, + "flos": 18521441873280.0, + "grad_norm": 1.807359351493948, + "language_loss": 0.75424373, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.83087683, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09136963, + "step": 16549, + "time_per_iteration": 2.479980230331421 + }, + { + "auxiliary_loss_clip": 0.06405661, + "auxiliary_loss_mlp": 0.01265691, + "balance_loss_clip": 0.06271679, + "balance_loss_mlp": 0.01255302, + "epoch": 0.9950398316548925, + "flos": 30415326330240.0, + "grad_norm": 3.759297696668105, + "language_loss": 0.74710596, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.82381952, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1038208, + "step": 16550, + "time_per_iteration": 2.5653908252716064 + }, + { + "auxiliary_loss_clip": 0.06400557, + "auxiliary_loss_mlp": 0.01263204, + "balance_loss_clip": 0.06270449, + "balance_loss_mlp": 0.01254406, + "epoch": 0.9950999549075605, + "flos": 19906893607680.0, + "grad_norm": 1.505447061940754, + "language_loss": 0.78210282, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.85874045, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.08795166, + "step": 16551, + "time_per_iteration": 2.502298593521118 + }, + { + "auxiliary_loss_clip": 0.06394917, + "auxiliary_loss_mlp": 0.01266008, + "balance_loss_clip": 0.06272274, + "balance_loss_mlp": 0.01258039, + "epoch": 0.9951600781602284, + "flos": 17609412107520.0, + "grad_norm": 1.3321877426988011, + "language_loss": 0.66736692, + "learning_rate": 2.426837340270271e-10, + "loss": 0.74397612, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.07971191, + "step": 16552, + "time_per_iteration": 2.5603482723236084 + }, + { + "auxiliary_loss_clip": 0.06401337, + "auxiliary_loss_mlp": 0.01263383, + "balance_loss_clip": 0.06268856, + "balance_loss_mlp": 0.01254255, + "epoch": 0.9952202014128965, + "flos": 28958485317120.0, + "grad_norm": 1.2833907558121627, + "language_loss": 0.81770164, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.89434886, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.09127808, + "step": 16553, + "time_per_iteration": 2.576486825942993 + }, + { + "auxiliary_loss_clip": 0.06314038, + "auxiliary_loss_mlp": 0.01256617, + "balance_loss_clip": 0.06259907, + "balance_loss_mlp": 0.01255608, + "epoch": 0.9952803246655644, + "flos": 70833014720640.0, + "grad_norm": 0.732372532890913, + "language_loss": 0.57316744, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.64887393, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.54199219, + "router_z_loss_mlp": 0.01009369, + "step": 16554, + "time_per_iteration": 3.206148147583008 + }, + { + "auxiliary_loss_clip": 0.0640728, + "auxiliary_loss_mlp": 0.0126751, + "balance_loss_clip": 0.06274211, + "balance_loss_mlp": 0.01258307, + "epoch": 0.9953404479182324, + "flos": 21805570800000.0, + "grad_norm": 1.762448547669116, + "language_loss": 0.77524269, + "learning_rate": 2.24824062597051e-10, + "loss": 0.85199058, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09197998, + "step": 16555, + "time_per_iteration": 2.519479274749756 + }, + { + "auxiliary_loss_clip": 0.06400803, + "auxiliary_loss_mlp": 0.01267755, + "balance_loss_clip": 0.06269628, + "balance_loss_mlp": 0.01258355, + "epoch": 0.9954005711709003, + "flos": 21942647280000.0, + "grad_norm": 2.0814748850322156, + "language_loss": 0.86322951, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.93991506, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09399414, + "step": 16556, + "time_per_iteration": 2.5571491718292236 + }, + { + "auxiliary_loss_clip": 0.06396689, + "auxiliary_loss_mlp": 0.0126468, + "balance_loss_clip": 0.06268832, + "balance_loss_mlp": 0.01255734, + "epoch": 0.9954606944235683, + "flos": 19360656840960.0, + "grad_norm": 1.6249222072461627, + "language_loss": 0.72927034, + "learning_rate": 2.132967729762125e-10, + "loss": 0.805884, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.08947754, + "step": 16557, + "time_per_iteration": 2.5323092937469482 + }, + { + "auxiliary_loss_clip": 0.06396444, + "auxiliary_loss_mlp": 0.0126417, + "balance_loss_clip": 0.06270406, + "balance_loss_mlp": 0.01255772, + "epoch": 0.9955208176762362, + "flos": 30526477171200.0, + "grad_norm": 1.7597019969018155, + "language_loss": 0.7678116, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.84441775, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.08395386, + "step": 16558, + "time_per_iteration": 2.5865776538848877 + }, + { + "auxiliary_loss_clip": 0.06402529, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 0.06270144, + "balance_loss_mlp": 0.0125397, + "epoch": 0.9955809409289043, + "flos": 30016102752000.0, + "grad_norm": 1.795429364473874, + "language_loss": 0.63227272, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.70893526, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09759521, + "step": 16559, + "time_per_iteration": 2.5742897987365723 + }, + { + "auxiliary_loss_clip": 0.06398577, + "auxiliary_loss_mlp": 0.01266428, + "balance_loss_clip": 0.06270035, + "balance_loss_mlp": 0.01257189, + "epoch": 0.9956410641815723, + "flos": 21549670830720.0, + "grad_norm": 1.7103757872781167, + "language_loss": 0.7445935, + "learning_rate": 1.965745799148433e-10, + "loss": 0.82124352, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.09240723, + "step": 16560, + "time_per_iteration": 2.4979653358459473 + }, + { + "auxiliary_loss_clip": 0.06398787, + "auxiliary_loss_mlp": 0.01262797, + "balance_loss_clip": 0.06268584, + "balance_loss_mlp": 0.01253695, + "epoch": 0.9957011874342402, + "flos": 21695929332480.0, + "grad_norm": 1.6604206822913, + "language_loss": 0.79359847, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.87021428, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09100342, + "step": 16561, + "time_per_iteration": 2.5015368461608887 + }, + { + "auxiliary_loss_clip": 0.06396727, + "auxiliary_loss_mlp": 0.01263664, + "balance_loss_clip": 0.06269149, + "balance_loss_mlp": 0.01255087, + "epoch": 0.9957613106869082, + "flos": 17706810879360.0, + "grad_norm": 2.6002951438446718, + "language_loss": 0.65660673, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.73321062, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.08569336, + "step": 16562, + "time_per_iteration": 3.870602607727051 + }, + { + "auxiliary_loss_clip": 0.06403091, + "auxiliary_loss_mlp": 0.01267098, + "balance_loss_clip": 0.06268853, + "balance_loss_mlp": 0.0125743, + "epoch": 0.9958214339395761, + "flos": 30564016600320.0, + "grad_norm": 1.8613517918936233, + "language_loss": 0.64495075, + "learning_rate": 1.805348815528962e-10, + "loss": 0.72165263, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.09655762, + "step": 16563, + "time_per_iteration": 2.5799973011016846 + }, + { + "auxiliary_loss_clip": 0.06400756, + "auxiliary_loss_mlp": 0.01266447, + "balance_loss_clip": 0.06270421, + "balance_loss_mlp": 0.01257149, + "epoch": 0.9958815571922441, + "flos": 24175825171200.0, + "grad_norm": 1.467683459705596, + "language_loss": 0.65106744, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.72773945, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.09301758, + "step": 16564, + "time_per_iteration": 2.638465404510498 + }, + { + "auxiliary_loss_clip": 0.06400171, + "auxiliary_loss_mlp": 0.01265945, + "balance_loss_clip": 0.06270692, + "balance_loss_mlp": 0.01256485, + "epoch": 0.995941680444912, + "flos": 15492458229120.0, + "grad_norm": 1.759207175120335, + "language_loss": 0.74907964, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.82574081, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09454346, + "step": 16565, + "time_per_iteration": 2.513090133666992 + }, + { + "auxiliary_loss_clip": 0.06399176, + "auxiliary_loss_mlp": 0.0126437, + "balance_loss_clip": 0.06268928, + "balance_loss_mlp": 0.01255316, + "epoch": 0.9960018036975801, + "flos": 18626597147520.0, + "grad_norm": 1.999585355447059, + "language_loss": 0.79579604, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.87243158, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.0904541, + "step": 16566, + "time_per_iteration": 2.4897356033325195 + }, + { + "auxiliary_loss_clip": 0.0639536, + "auxiliary_loss_mlp": 0.01264477, + "balance_loss_clip": 0.0626775, + "balance_loss_mlp": 0.0125579, + "epoch": 0.996061926950248, + "flos": 20090314195200.0, + "grad_norm": 1.6960254260383738, + "language_loss": 0.71283329, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.78943169, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.08688354, + "step": 16567, + "time_per_iteration": 2.4799892902374268 + }, + { + "auxiliary_loss_clip": 0.06404319, + "auxiliary_loss_mlp": 0.01268033, + "balance_loss_clip": 0.0627034, + "balance_loss_mlp": 0.01257883, + "epoch": 0.996122050202916, + "flos": 24353879097600.0, + "grad_norm": 2.5978628938543085, + "language_loss": 0.78895438, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.86567795, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.10150146, + "step": 16568, + "time_per_iteration": 2.606168270111084 + }, + { + "auxiliary_loss_clip": 0.06396884, + "auxiliary_loss_mlp": 0.01265522, + "balance_loss_clip": 0.06271239, + "balance_loss_mlp": 0.01257404, + "epoch": 0.9961821734555839, + "flos": 24204895338240.0, + "grad_norm": 1.7013100229361442, + "language_loss": 0.82422203, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.90084612, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.08123779, + "step": 16569, + "time_per_iteration": 2.7738900184631348 + }, + { + "auxiliary_loss_clip": 0.06398392, + "auxiliary_loss_mlp": 0.01264989, + "balance_loss_clip": 0.06272103, + "balance_loss_mlp": 0.01256334, + "epoch": 0.9962422967082519, + "flos": 22639628741760.0, + "grad_norm": 1.7744118698102032, + "language_loss": 0.70764375, + "learning_rate": 1.457630950747468e-10, + "loss": 0.78427756, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.08654785, + "step": 16570, + "time_per_iteration": 2.5719547271728516 + }, + { + "auxiliary_loss_clip": 0.06403951, + "auxiliary_loss_mlp": 0.01267572, + "balance_loss_clip": 0.06273632, + "balance_loss_mlp": 0.01257731, + "epoch": 0.9963024199609198, + "flos": 26403259057920.0, + "grad_norm": 1.528477322587173, + "language_loss": 0.7513268, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.82804203, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.09838867, + "step": 16571, + "time_per_iteration": 2.541731595993042 + }, + { + "auxiliary_loss_clip": 0.06398408, + "auxiliary_loss_mlp": 0.01263982, + "balance_loss_clip": 0.06269142, + "balance_loss_mlp": 0.0125497, + "epoch": 0.9963625432135879, + "flos": 16587153895680.0, + "grad_norm": 2.215286054451634, + "language_loss": 0.79922211, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.87584603, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09014893, + "step": 16572, + "time_per_iteration": 3.920409917831421 + }, + { + "auxiliary_loss_clip": 0.06398214, + "auxiliary_loss_mlp": 0.01264598, + "balance_loss_clip": 0.06270143, + "balance_loss_mlp": 0.01255115, + "epoch": 0.9964226664662559, + "flos": 26475696512640.0, + "grad_norm": 1.736067517515339, + "language_loss": 0.70695126, + "learning_rate": 1.3199841727074e-10, + "loss": 0.78357947, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.09484863, + "step": 16573, + "time_per_iteration": 2.518183469772339 + }, + { + "auxiliary_loss_clip": 0.0640582, + "auxiliary_loss_mlp": 0.01268788, + "balance_loss_clip": 0.06269335, + "balance_loss_mlp": 0.01258667, + "epoch": 0.9964827897189238, + "flos": 27454755144960.0, + "grad_norm": 1.5539558414743522, + "language_loss": 0.63445759, + "learning_rate": 1.275618614968721e-10, + "loss": 0.71120363, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.10119629, + "step": 16574, + "time_per_iteration": 2.5316076278686523 + }, + { + "auxiliary_loss_clip": 0.06409708, + "auxiliary_loss_mlp": 0.01264743, + "balance_loss_clip": 0.06273416, + "balance_loss_mlp": 0.01254437, + "epoch": 0.9965429129715918, + "flos": 11725138333440.0, + "grad_norm": 2.458562193325811, + "language_loss": 0.76547927, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.84222376, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10308838, + "step": 16575, + "time_per_iteration": 2.4830782413482666 + }, + { + "auxiliary_loss_clip": 0.06404927, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 0.06273346, + "balance_loss_mlp": 0.0125569, + "epoch": 0.9966030362242597, + "flos": 19762186406400.0, + "grad_norm": 1.68476172893604, + "language_loss": 0.70171261, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.77841902, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.10021973, + "step": 16576, + "time_per_iteration": 2.4806809425354004 + }, + { + "auxiliary_loss_clip": 0.06396693, + "auxiliary_loss_mlp": 0.0126643, + "balance_loss_clip": 0.06271367, + "balance_loss_mlp": 0.01257466, + "epoch": 0.9966631594769277, + "flos": 23922021553920.0, + "grad_norm": 1.514407622643374, + "language_loss": 0.72368443, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.80031562, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.08966064, + "step": 16577, + "time_per_iteration": 3.9655070304870605 + }, + { + "auxiliary_loss_clip": 0.06402753, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_clip": 0.06271574, + "balance_loss_mlp": 0.01258356, + "epoch": 0.9967232827295956, + "flos": 15564979537920.0, + "grad_norm": 1.8728768870401162, + "language_loss": 0.79020208, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.86690605, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09295654, + "step": 16578, + "time_per_iteration": 2.465836524963379 + }, + { + "auxiliary_loss_clip": 0.06401658, + "auxiliary_loss_mlp": 0.01263584, + "balance_loss_clip": 0.06269468, + "balance_loss_mlp": 0.01254334, + "epoch": 0.9967834059822637, + "flos": 20819216862720.0, + "grad_norm": 1.5291705366711337, + "language_loss": 0.7613309, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.83798331, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.09246826, + "step": 16579, + "time_per_iteration": 2.563567876815796 + }, + { + "auxiliary_loss_clip": 0.06405777, + "auxiliary_loss_mlp": 0.01267794, + "balance_loss_clip": 0.0627277, + "balance_loss_mlp": 0.01257476, + "epoch": 0.9968435292349316, + "flos": 36727809995520.0, + "grad_norm": 2.038574869304339, + "language_loss": 0.69993865, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.77667433, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.10321045, + "step": 16580, + "time_per_iteration": 2.6372199058532715 + }, + { + "auxiliary_loss_clip": 0.06400767, + "auxiliary_loss_mlp": 0.01267201, + "balance_loss_clip": 0.06268618, + "balance_loss_mlp": 0.01257908, + "epoch": 0.9969036524875996, + "flos": 26768213516160.0, + "grad_norm": 1.8631774429365007, + "language_loss": 0.80034542, + "learning_rate": 9.862937031113184e-11, + "loss": 0.87702513, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.09295654, + "step": 16581, + "time_per_iteration": 2.586275815963745 + }, + { + "auxiliary_loss_clip": 0.06398311, + "auxiliary_loss_mlp": 0.01263701, + "balance_loss_clip": 0.06269346, + "balance_loss_mlp": 0.01254796, + "epoch": 0.9969637757402675, + "flos": 24834219027840.0, + "grad_norm": 1.5224635632541534, + "language_loss": 0.80819917, + "learning_rate": 9.479950191249031e-11, + "loss": 0.88481927, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08898926, + "step": 16582, + "time_per_iteration": 2.5423171520233154 + }, + { + "auxiliary_loss_clip": 0.06398583, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_clip": 0.06271505, + "balance_loss_mlp": 0.01253851, + "epoch": 0.9970238989929355, + "flos": 23045309084160.0, + "grad_norm": 1.6463581574005606, + "language_loss": 0.60997719, + "learning_rate": 9.104547011951069e-11, + "loss": 0.68659008, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.08856201, + "step": 16583, + "time_per_iteration": 2.5455894470214844 + }, + { + "auxiliary_loss_clip": 0.06403083, + "auxiliary_loss_mlp": 0.01263359, + "balance_loss_clip": 0.06270447, + "balance_loss_mlp": 0.01254263, + "epoch": 0.9970840222456034, + "flos": 25305418863360.0, + "grad_norm": 1.6487266342882827, + "language_loss": 0.78016913, + "learning_rate": 8.736727507452357e-11, + "loss": 0.85683358, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.09094238, + "step": 16584, + "time_per_iteration": 3.9929842948913574 + }, + { + "auxiliary_loss_clip": 0.06401587, + "auxiliary_loss_mlp": 0.01265911, + "balance_loss_clip": 0.06273124, + "balance_loss_mlp": 0.01257322, + "epoch": 0.9971441454982715, + "flos": 21621898650240.0, + "grad_norm": 1.4117034682008287, + "language_loss": 0.69645995, + "learning_rate": 8.376491691697297e-11, + "loss": 0.77313489, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.0859375, + "step": 16585, + "time_per_iteration": 2.5167651176452637 + }, + { + "auxiliary_loss_clip": 0.06399348, + "auxiliary_loss_mlp": 0.01263258, + "balance_loss_clip": 0.06271613, + "balance_loss_mlp": 0.01253739, + "epoch": 0.9972042687509394, + "flos": 14980867925760.0, + "grad_norm": 2.0755557682308963, + "language_loss": 0.81635392, + "learning_rate": 8.023839578363834e-11, + "loss": 0.89298004, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.09515381, + "step": 16586, + "time_per_iteration": 2.5256056785583496 + }, + { + "auxiliary_loss_clip": 0.06401335, + "auxiliary_loss_mlp": 0.01262833, + "balance_loss_clip": 0.06269982, + "balance_loss_mlp": 0.01253904, + "epoch": 0.9972643920036074, + "flos": 25812858389760.0, + "grad_norm": 1.749230535961165, + "language_loss": 0.78177583, + "learning_rate": 7.678771180796851e-11, + "loss": 0.85841757, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.08929443, + "step": 16587, + "time_per_iteration": 2.542367696762085 + }, + { + "auxiliary_loss_clip": 0.06403758, + "auxiliary_loss_mlp": 0.01269466, + "balance_loss_clip": 0.06272189, + "balance_loss_mlp": 0.01260162, + "epoch": 0.9973245152562754, + "flos": 23332124010240.0, + "grad_norm": 1.844090752894055, + "language_loss": 0.72692442, + "learning_rate": 7.341286512074773e-11, + "loss": 0.8036567, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09307861, + "step": 16588, + "time_per_iteration": 2.5101404190063477 + }, + { + "auxiliary_loss_clip": 0.06406671, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 0.0626927, + "balance_loss_mlp": 0.01255702, + "epoch": 0.9973846385089433, + "flos": 12170999508480.0, + "grad_norm": 2.447200723458138, + "language_loss": 0.82740468, + "learning_rate": 7.011385585031781e-11, + "loss": 0.90412831, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09991455, + "step": 16589, + "time_per_iteration": 2.452146053314209 + }, + { + "auxiliary_loss_clip": 0.06408519, + "auxiliary_loss_mlp": 0.01264547, + "balance_loss_clip": 0.06271757, + "balance_loss_mlp": 0.01253317, + "epoch": 0.9974447617616113, + "flos": 20050929976320.0, + "grad_norm": 2.015417296795279, + "language_loss": 0.70627606, + "learning_rate": 6.689068412168986e-11, + "loss": 0.78300673, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.11236572, + "step": 16590, + "time_per_iteration": 2.5470008850097656 + }, + { + "auxiliary_loss_clip": 0.06405114, + "auxiliary_loss_mlp": 0.01263511, + "balance_loss_clip": 0.06271853, + "balance_loss_mlp": 0.01253945, + "epoch": 0.9975048850142793, + "flos": 32022744330240.0, + "grad_norm": 1.7156925678226993, + "language_loss": 0.63968062, + "learning_rate": 6.374335005676634e-11, + "loss": 0.71636689, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.09570312, + "step": 16591, + "time_per_iteration": 2.5790483951568604 + }, + { + "auxiliary_loss_clip": 0.06401144, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 0.06268914, + "balance_loss_mlp": 0.01258552, + "epoch": 0.9975650082669473, + "flos": 36941600488320.0, + "grad_norm": 1.6209737833273146, + "language_loss": 0.7318058, + "learning_rate": 6.067185377522933e-11, + "loss": 0.80849206, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.08929443, + "step": 16592, + "time_per_iteration": 2.6874001026153564 + }, + { + "auxiliary_loss_clip": 0.06400564, + "auxiliary_loss_mlp": 0.01264326, + "balance_loss_clip": 0.06268974, + "balance_loss_mlp": 0.01254747, + "epoch": 0.9976251315196152, + "flos": 16477722063360.0, + "grad_norm": 1.4238943744939072, + "language_loss": 0.8514542, + "learning_rate": 5.767619539343016e-11, + "loss": 0.92810309, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.0958252, + "step": 16593, + "time_per_iteration": 2.500425338745117 + }, + { + "auxiliary_loss_clip": 0.06400873, + "auxiliary_loss_mlp": 0.01266904, + "balance_loss_clip": 0.06271567, + "balance_loss_mlp": 0.01258059, + "epoch": 0.9976852547722832, + "flos": 19653048063360.0, + "grad_norm": 1.552542866202301, + "language_loss": 0.69804668, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.77472448, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.08837891, + "step": 16594, + "time_per_iteration": 2.499431610107422 + }, + { + "auxiliary_loss_clip": 0.06403884, + "auxiliary_loss_mlp": 0.01267202, + "balance_loss_clip": 0.06269734, + "balance_loss_mlp": 0.01257862, + "epoch": 0.9977453780249511, + "flos": 20454597820800.0, + "grad_norm": 1.975113527631894, + "language_loss": 0.73193353, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.80864441, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09338379, + "step": 16595, + "time_per_iteration": 2.4891738891601562 + }, + { + "auxiliary_loss_clip": 0.06308937, + "auxiliary_loss_mlp": 0.01250785, + "balance_loss_clip": 0.06254812, + "balance_loss_mlp": 0.01249741, + "epoch": 0.9978055012776191, + "flos": 65472085549440.0, + "grad_norm": 0.766579678458714, + "language_loss": 0.60467255, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.68026978, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01044464, + "step": 16596, + "time_per_iteration": 3.0071284770965576 + }, + { + "auxiliary_loss_clip": 0.06399873, + "auxiliary_loss_mlp": 0.01265753, + "balance_loss_clip": 0.06268875, + "balance_loss_mlp": 0.01256431, + "epoch": 0.997865624530287, + "flos": 20637808773120.0, + "grad_norm": 1.6512537501923108, + "language_loss": 0.77633482, + "learning_rate": 4.645194309227385e-11, + "loss": 0.8529911, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.09320068, + "step": 16597, + "time_per_iteration": 2.4939262866973877 + }, + { + "auxiliary_loss_clip": 0.06402931, + "auxiliary_loss_mlp": 0.0126284, + "balance_loss_clip": 0.06270836, + "balance_loss_mlp": 0.01253089, + "epoch": 0.9979257477829551, + "flos": 29394703272960.0, + "grad_norm": 1.6822966575262215, + "language_loss": 0.82273138, + "learning_rate": 4.383547585562475e-11, + "loss": 0.89938903, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09753418, + "step": 16598, + "time_per_iteration": 2.57281494140625 + }, + { + "auxiliary_loss_clip": 0.06406281, + "auxiliary_loss_mlp": 0.01268438, + "balance_loss_clip": 0.06270172, + "balance_loss_mlp": 0.01257631, + "epoch": 0.997985871035623, + "flos": 22641180042240.0, + "grad_norm": 1.9442107163563487, + "language_loss": 0.65055943, + "learning_rate": 4.129484715709175e-11, + "loss": 0.7273066, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.10803223, + "step": 16599, + "time_per_iteration": 2.5110907554626465 + }, + { + "auxiliary_loss_clip": 0.06311208, + "auxiliary_loss_mlp": 0.01254339, + "balance_loss_clip": 0.06257115, + "balance_loss_mlp": 0.0125329, + "epoch": 0.998045994288291, + "flos": 61823421434880.0, + "grad_norm": 0.8427819693945304, + "language_loss": 0.62358809, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.69924355, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.01049805, + "step": 16600, + "time_per_iteration": 3.0938379764556885 + }, + { + "auxiliary_loss_clip": 0.06400381, + "auxiliary_loss_mlp": 0.01262242, + "balance_loss_clip": 0.0626972, + "balance_loss_mlp": 0.01253212, + "epoch": 0.998106117540959, + "flos": 19251686206080.0, + "grad_norm": 1.6103653898018497, + "language_loss": 0.78675622, + "learning_rate": 3.644110575717896e-11, + "loss": 0.86338246, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.09033203, + "step": 16601, + "time_per_iteration": 2.6078760623931885 + }, + { + "auxiliary_loss_clip": 0.06409572, + "auxiliary_loss_mlp": 0.01264878, + "balance_loss_clip": 0.06273425, + "balance_loss_mlp": 0.01255484, + "epoch": 0.9981662407936269, + "flos": 21112656261120.0, + "grad_norm": 1.8622477211411699, + "language_loss": 0.82537067, + "learning_rate": 3.412799323987414e-11, + "loss": 0.90211511, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.09393311, + "step": 16602, + "time_per_iteration": 3.946537494659424 + }, + { + "auxiliary_loss_clip": 0.06402224, + "auxiliary_loss_mlp": 0.01264728, + "balance_loss_clip": 0.06271221, + "balance_loss_mlp": 0.01255478, + "epoch": 0.998226364046295, + "flos": 24323802681600.0, + "grad_norm": 2.0035158293659663, + "language_loss": 0.62724072, + "learning_rate": 3.189071962883538e-11, + "loss": 0.70391023, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.0925293, + "step": 16603, + "time_per_iteration": 2.5347180366516113 + }, + { + "auxiliary_loss_clip": 0.06403463, + "auxiliary_loss_mlp": 0.0126348, + "balance_loss_clip": 0.06271768, + "balance_loss_mlp": 0.01253836, + "epoch": 0.9982864872989629, + "flos": 23842246867200.0, + "grad_norm": 1.8205508857856618, + "language_loss": 0.71622694, + "learning_rate": 2.972928500866168e-11, + "loss": 0.79289639, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09637451, + "step": 16604, + "time_per_iteration": 2.5248515605926514 + }, + { + "auxiliary_loss_clip": 0.06401166, + "auxiliary_loss_mlp": 0.01260416, + "balance_loss_clip": 0.06269663, + "balance_loss_mlp": 0.0125101, + "epoch": 0.9983466105516309, + "flos": 18339069461760.0, + "grad_norm": 1.486707520198961, + "language_loss": 0.64735997, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.72397572, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09405518, + "step": 16605, + "time_per_iteration": 2.5203909873962402 + }, + { + "auxiliary_loss_clip": 0.06399968, + "auxiliary_loss_mlp": 0.01262227, + "balance_loss_clip": 0.06270541, + "balance_loss_mlp": 0.01252654, + "epoch": 0.9984067338042988, + "flos": 17242235516160.0, + "grad_norm": 1.6138823205609316, + "language_loss": 0.71377051, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.7903924, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.09570312, + "step": 16606, + "time_per_iteration": 2.472602128982544 + }, + { + "auxiliary_loss_clip": 0.06399357, + "auxiliary_loss_mlp": 0.01262803, + "balance_loss_clip": 0.06269458, + "balance_loss_mlp": 0.01253666, + "epoch": 0.9984668570569668, + "flos": 20674174245120.0, + "grad_norm": 1.8806503380972919, + "language_loss": 0.82498664, + "learning_rate": 2.370001590090709e-11, + "loss": 0.90160817, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.0914917, + "step": 16607, + "time_per_iteration": 2.511127471923828 + }, + { + "auxiliary_loss_clip": 0.06407319, + "auxiliary_loss_mlp": 0.01264489, + "balance_loss_clip": 0.06272326, + "balance_loss_mlp": 0.01254362, + "epoch": 0.9985269803096347, + "flos": 30270241785600.0, + "grad_norm": 1.8869176334897872, + "language_loss": 0.66939551, + "learning_rate": 2.184193803622669e-11, + "loss": 0.7461136, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.10125732, + "step": 16608, + "time_per_iteration": 2.5830514430999756 + }, + { + "auxiliary_loss_clip": 0.06404091, + "auxiliary_loss_mlp": 0.01264125, + "balance_loss_clip": 0.06271875, + "balance_loss_mlp": 0.01254887, + "epoch": 0.9985871035623027, + "flos": 10565510152320.0, + "grad_norm": 2.1287331538283936, + "language_loss": 0.80895412, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.88563633, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.09240723, + "step": 16609, + "time_per_iteration": 2.485151529312134 + }, + { + "auxiliary_loss_clip": 0.06403465, + "auxiliary_loss_mlp": 0.01265215, + "balance_loss_clip": 0.06271623, + "balance_loss_mlp": 0.01255684, + "epoch": 0.9986472268149706, + "flos": 16879125847680.0, + "grad_norm": 1.3951775563827955, + "language_loss": 0.62941349, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.70610029, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09527588, + "step": 16610, + "time_per_iteration": 2.5005035400390625 + }, + { + "auxiliary_loss_clip": 0.06397352, + "auxiliary_loss_mlp": 0.01264817, + "balance_loss_clip": 0.06267397, + "balance_loss_mlp": 0.0125596, + "epoch": 0.9987073500676387, + "flos": 22061093425920.0, + "grad_norm": 1.999034741081423, + "language_loss": 0.67834997, + "learning_rate": 1.672274094288717e-11, + "loss": 0.75497168, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.08862305, + "step": 16611, + "time_per_iteration": 2.514544725418091 + }, + { + "auxiliary_loss_clip": 0.06401592, + "auxiliary_loss_mlp": 0.01263398, + "balance_loss_clip": 0.06270025, + "balance_loss_mlp": 0.0125382, + "epoch": 0.9987674733203066, + "flos": 30490866385920.0, + "grad_norm": 2.8198538577186265, + "language_loss": 0.70121431, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.77786428, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.09570312, + "step": 16612, + "time_per_iteration": 4.045984268188477 + }, + { + "auxiliary_loss_clip": 0.06396015, + "auxiliary_loss_mlp": 0.01264259, + "balance_loss_clip": 0.06269395, + "balance_loss_mlp": 0.01255772, + "epoch": 0.9988275965729746, + "flos": 27752554955520.0, + "grad_norm": 1.4374108761182864, + "language_loss": 0.74011898, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.81672174, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.08483887, + "step": 16613, + "time_per_iteration": 2.565751314163208 + }, + { + "auxiliary_loss_clip": 0.06403059, + "auxiliary_loss_mlp": 0.01267726, + "balance_loss_clip": 0.06270773, + "balance_loss_mlp": 0.01257641, + "epoch": 0.9988877198256426, + "flos": 17528966588160.0, + "grad_norm": 2.032430631725315, + "language_loss": 0.73772359, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.81443143, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.10083008, + "step": 16614, + "time_per_iteration": 2.455465078353882 + }, + { + "auxiliary_loss_clip": 0.06402258, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 0.06270853, + "balance_loss_mlp": 0.012546, + "epoch": 0.9989478430783105, + "flos": 21002889012480.0, + "grad_norm": 1.9940582429405083, + "language_loss": 0.73076797, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.80743194, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.09539795, + "step": 16615, + "time_per_iteration": 2.5054819583892822 + }, + { + "auxiliary_loss_clip": 0.06406209, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_clip": 0.06272269, + "balance_loss_mlp": 0.01254479, + "epoch": 0.9990079663309785, + "flos": 13375672058880.0, + "grad_norm": 3.418620590990309, + "language_loss": 0.77891582, + "learning_rate": 9.70753783247069e-12, + "loss": 0.8556174, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.09466553, + "step": 16616, + "time_per_iteration": 3.982736825942993 + }, + { + "auxiliary_loss_clip": 0.06401001, + "auxiliary_loss_mlp": 0.01269245, + "balance_loss_clip": 0.06271502, + "balance_loss_mlp": 0.01260153, + "epoch": 0.9990680895836465, + "flos": 17315805000960.0, + "grad_norm": 1.7493662985892016, + "language_loss": 0.83197755, + "learning_rate": 8.532016508855378e-12, + "loss": 0.90867996, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.0909729, + "step": 16617, + "time_per_iteration": 2.481229782104492 + }, + { + "auxiliary_loss_clip": 0.06399592, + "auxiliary_loss_mlp": 0.0126193, + "balance_loss_clip": 0.06269813, + "balance_loss_mlp": 0.01253162, + "epoch": 0.9991282128363145, + "flos": 24215041681920.0, + "grad_norm": 1.5472149441524297, + "language_loss": 0.78848952, + "learning_rate": 7.43233506206309e-12, + "loss": 0.86510473, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.08764648, + "step": 16618, + "time_per_iteration": 2.5298168659210205 + }, + { + "auxiliary_loss_clip": 0.06397195, + "auxiliary_loss_mlp": 0.01262323, + "balance_loss_clip": 0.06267681, + "balance_loss_mlp": 0.01252799, + "epoch": 0.9991883360889824, + "flos": 21181110647040.0, + "grad_norm": 1.6455695651366786, + "language_loss": 0.7489872, + "learning_rate": 6.408493534060255e-12, + "loss": 0.82558239, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.09527588, + "step": 16619, + "time_per_iteration": 2.5116331577301025 + }, + { + "auxiliary_loss_clip": 0.06394048, + "auxiliary_loss_mlp": 0.01264154, + "balance_loss_clip": 0.06267348, + "balance_loss_mlp": 0.01255571, + "epoch": 0.9992484593416504, + "flos": 19907229024000.0, + "grad_norm": 1.8478849238967225, + "language_loss": 0.86866474, + "learning_rate": 5.460491963260594e-12, + "loss": 0.9452467, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08581543, + "step": 16620, + "time_per_iteration": 2.4666316509246826 + }, + { + "auxiliary_loss_clip": 0.06398039, + "auxiliary_loss_mlp": 0.01263946, + "balance_loss_clip": 0.06269631, + "balance_loss_mlp": 0.01255381, + "epoch": 0.9993085825943183, + "flos": 24863834246400.0, + "grad_norm": 2.0773440241084855, + "language_loss": 0.7270844, + "learning_rate": 4.58833038607942e-12, + "loss": 0.80370426, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.08569336, + "step": 16621, + "time_per_iteration": 2.542825698852539 + }, + { + "auxiliary_loss_clip": 0.06309964, + "auxiliary_loss_mlp": 0.0125154, + "balance_loss_clip": 0.06255855, + "balance_loss_mlp": 0.01250484, + "epoch": 0.9993687058469863, + "flos": 71307149448960.0, + "grad_norm": 0.7280436002919584, + "language_loss": 0.56537503, + "learning_rate": 3.79200883515729e-12, + "loss": 0.64099008, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.01057434, + "step": 16622, + "time_per_iteration": 3.3803882598876953 + }, + { + "auxiliary_loss_clip": 0.06399865, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 0.06268437, + "balance_loss_mlp": 0.01253949, + "epoch": 0.9994288290996542, + "flos": 12203843109120.0, + "grad_norm": 1.9127246932088902, + "language_loss": 0.71968305, + "learning_rate": 3.071527340914315e-12, + "loss": 0.79631841, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.097229, + "step": 16623, + "time_per_iteration": 2.5517661571502686 + }, + { + "auxiliary_loss_clip": 0.06400504, + "auxiliary_loss_mlp": 0.01265963, + "balance_loss_clip": 0.06271066, + "balance_loss_mlp": 0.01255801, + "epoch": 0.9994889523523223, + "flos": 17894927295360.0, + "grad_norm": 1.794645940520366, + "language_loss": 0.74947834, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.82614297, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.10168457, + "step": 16624, + "time_per_iteration": 3.9927942752838135 + }, + { + "auxiliary_loss_clip": 0.06401871, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 0.0626986, + "balance_loss_mlp": 0.01255384, + "epoch": 0.9995490756049902, + "flos": 26586218448000.0, + "grad_norm": 2.153450022332739, + "language_loss": 0.73763341, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.81429684, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.09094238, + "step": 16625, + "time_per_iteration": 2.5666937828063965 + }, + { + "auxiliary_loss_clip": 0.06395371, + "auxiliary_loss_mlp": 0.01264271, + "balance_loss_clip": 0.06267975, + "balance_loss_mlp": 0.01254293, + "epoch": 0.9996091988576582, + "flos": 22206555313920.0, + "grad_norm": 1.9802842228291273, + "language_loss": 0.78101254, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.85760903, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.09979248, + "step": 16626, + "time_per_iteration": 2.5068161487579346 + }, + { + "auxiliary_loss_clip": 0.06398835, + "auxiliary_loss_mlp": 0.01267262, + "balance_loss_clip": 0.0627013, + "balance_loss_mlp": 0.01257898, + "epoch": 0.9996693221103262, + "flos": 27379257016320.0, + "grad_norm": 1.598569345061047, + "language_loss": 0.82122838, + "learning_rate": 9.480024334429515e-13, + "loss": 0.89788932, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.09356689, + "step": 16627, + "time_per_iteration": 2.522557258605957 + }, + { + "auxiliary_loss_clip": 0.06405565, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 0.06270921, + "balance_loss_mlp": 0.01255958, + "epoch": 0.9997294453629941, + "flos": 26877729202560.0, + "grad_norm": 1.7858605797788545, + "language_loss": 0.70790946, + "learning_rate": 6.067215747584952e-13, + "loss": 0.78462732, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.1026001, + "step": 16628, + "time_per_iteration": 2.5772440433502197 + }, + { + "auxiliary_loss_clip": 0.06401081, + "auxiliary_loss_mlp": 0.01266476, + "balance_loss_clip": 0.0626926, + "balance_loss_mlp": 0.01257023, + "epoch": 0.9997895686156621, + "flos": 23483707246080.0, + "grad_norm": 1.3163404239979697, + "language_loss": 0.75694299, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.83361858, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.09448242, + "step": 16629, + "time_per_iteration": 2.538621187210083 + }, + { + "auxiliary_loss_clip": 0.06406366, + "auxiliary_loss_mlp": 0.01265595, + "balance_loss_clip": 0.06269718, + "balance_loss_mlp": 0.0125557, + "epoch": 0.9998496918683301, + "flos": 20230325568000.0, + "grad_norm": 1.543408158505846, + "language_loss": 0.6084404, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.68516004, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.10021973, + "step": 16630, + "time_per_iteration": 2.5081424713134766 + }, + { + "auxiliary_loss_clip": 0.06402503, + "auxiliary_loss_mlp": 0.01264058, + "balance_loss_clip": 0.06271052, + "balance_loss_mlp": 0.01254819, + "epoch": 0.9999098151209981, + "flos": 21659354225280.0, + "grad_norm": 1.8304152411760084, + "language_loss": 0.60664153, + "learning_rate": 3.792010017100722e-14, + "loss": 0.68330717, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.09246826, + "step": 16631, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.0639651, + "auxiliary_loss_mlp": 0.01262988, + "balance_loss_clip": 0.06269827, + "balance_loss_mlp": 0.01254816, + "epoch": 0.999969938373666, + "flos": 11549054977920.0, + "grad_norm": 1.7303148261606152, + "language_loss": 0.73035192, + "learning_rate": 0.0, + "loss": 0.80694693, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.08172607, + "step": 16632, + "time_per_iteration": 2.4620893001556396 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.399648566653223e+18, + "train_loss": 0.8986138688644978, + "train_runtime": 46835.7738, + "train_samples_per_second": 14.205, + "train_steps_per_second": 0.355 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.399648566653223e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/revise_Full_smoe_tcmoe/training_args.bin b/sft/revise_Full_smoe_tcmoe/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c752df28a864c1e1da329f5474435eefe7778b --- /dev/null +++ b/sft/revise_Full_smoe_tcmoe/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda08a1e9d46ee3a47070dfbfdde239474b3b39c0e298dedbf0b0dd9cdd3c27e +size 7992